Diffstat (limited to 'lib')
-rw-r--r--lib/Analysis/AliasAnalysis.cpp205
-rw-r--r--lib/Analysis/AliasAnalysisEvaluator.cpp174
-rw-r--r--lib/Analysis/AliasAnalysisSummary.cpp105
-rw-r--r--lib/Analysis/AliasAnalysisSummary.h211
-rw-r--r--lib/Analysis/AliasSetTracker.cpp101
-rw-r--r--lib/Analysis/Analysis.cpp25
-rw-r--r--lib/Analysis/AssumptionCache.cpp4
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp425
-rw-r--r--lib/Analysis/BlockFrequencyInfo.cpp144
-rw-r--r--lib/Analysis/BlockFrequencyInfoImpl.cpp38
-rw-r--r--lib/Analysis/BranchProbabilityInfo.cpp88
-rw-r--r--lib/Analysis/CFG.cpp2
-rw-r--r--lib/Analysis/CFLAliasAnalysis.cpp1119
-rw-r--r--lib/Analysis/CFLAndersAliasAnalysis.cpp584
-rw-r--r--lib/Analysis/CFLGraph.h544
-rw-r--r--lib/Analysis/CFLSteensAliasAnalysis.cpp442
-rw-r--r--lib/Analysis/CGSCCPassManager.cpp68
-rw-r--r--lib/Analysis/CMakeLists.txt12
-rw-r--r--lib/Analysis/CallGraph.cpp55
-rw-r--r--lib/Analysis/CallGraphSCCPass.cpp21
-rw-r--r--lib/Analysis/CallPrinter.cpp14
-rw-r--r--lib/Analysis/CaptureTracking.cpp52
-rw-r--r--lib/Analysis/CodeMetrics.cpp28
-rw-r--r--lib/Analysis/ConstantFolding.cpp736
-rw-r--r--lib/Analysis/CostModel.cpp15
-rw-r--r--lib/Analysis/Delinearization.cpp3
-rw-r--r--lib/Analysis/DemandedBits.cpp66
-rw-r--r--lib/Analysis/DependenceAnalysis.cpp493
-rw-r--r--lib/Analysis/DivergenceAnalysis.cpp24
-rw-r--r--lib/Analysis/DomPrinter.cpp48
-rw-r--r--lib/Analysis/DominanceFrontier.cpp50
-rw-r--r--lib/Analysis/EHPersonalities.cpp5
-rw-r--r--lib/Analysis/GlobalsModRef.cpp54
-rw-r--r--lib/Analysis/IVUsers.cpp113
-rw-r--r--lib/Analysis/IndirectCallPromotionAnalysis.cpp109
-rw-r--r--lib/Analysis/InlineCost.cpp312
-rw-r--r--lib/Analysis/InstructionSimplify.cpp425
-rw-r--r--lib/Analysis/Interval.cpp15
-rw-r--r--lib/Analysis/IntervalPartition.cpp5
-rw-r--r--lib/Analysis/IteratedDominanceFrontier.cpp19
-rw-r--r--lib/Analysis/LLVMBuild.txt2
-rw-r--r--lib/Analysis/LazyBlockFrequencyInfo.cpp68
-rw-r--r--lib/Analysis/LazyCallGraph.cpp1699
-rw-r--r--lib/Analysis/LazyValueInfo.cpp707
-rw-r--r--lib/Analysis/Lint.cpp5
-rw-r--r--lib/Analysis/Loads.cpp191
-rw-r--r--lib/Analysis/LoopAccessAnalysis.cpp578
-rw-r--r--lib/Analysis/LoopInfo.cpp285
-rw-r--r--lib/Analysis/LoopPass.cpp25
-rw-r--r--lib/Analysis/LoopPassManager.cpp39
-rw-r--r--lib/Analysis/LoopUnrollAnalyzer.cpp210
-rw-r--r--lib/Analysis/Makefile15
-rw-r--r--lib/Analysis/MemDepPrinter.cpp22
-rw-r--r--lib/Analysis/MemDerefPrinter.cpp2
-rw-r--r--lib/Analysis/MemoryBuiltins.cpp246
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp756
-rw-r--r--lib/Analysis/MemoryLocation.cpp22
-rw-r--r--lib/Analysis/ModuleSummaryAnalysis.cpp249
-rw-r--r--lib/Analysis/ObjCARCAliasAnalysis.cpp18
-rw-r--r--lib/Analysis/ObjCARCInstKind.cpp14
-rw-r--r--lib/Analysis/OptimizationDiagnosticInfo.cpp88
-rw-r--r--lib/Analysis/PHITransAddr.cpp5
-rw-r--r--lib/Analysis/PostDominators.cpp35
-rw-r--r--lib/Analysis/ProfileSummaryInfo.cpp166
-rw-r--r--lib/Analysis/RegionInfo.cpp51
-rw-r--r--lib/Analysis/RegionPrinter.cpp4
-rw-r--r--lib/Analysis/ScalarEvolution.cpp2502
-rw-r--r--lib/Analysis/ScalarEvolutionAliasAnalysis.cpp15
-rw-r--r--lib/Analysis/ScalarEvolutionExpander.cpp381
-rw-r--r--lib/Analysis/ScalarEvolutionNormalization.cpp2
-rw-r--r--lib/Analysis/ScopedNoAliasAA.cpp38
-rw-r--r--lib/Analysis/SparsePropagation.cpp4
-rw-r--r--lib/Analysis/StratifiedSets.h388
-rw-r--r--lib/Analysis/TargetLibraryInfo.cpp552
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp76
-rw-r--r--lib/Analysis/Trace.cpp2
-rw-r--r--lib/Analysis/TypeBasedAliasAnalysis.cpp20
-rw-r--r--lib/Analysis/TypeMetadataUtils.cpp118
-rw-r--r--lib/Analysis/ValueTracking.cpp1898
-rw-r--r--lib/Analysis/VectorUtils.cpp209
-rw-r--r--lib/AsmParser/LLLexer.cpp28
-rw-r--r--lib/AsmParser/LLParser.cpp577
-rw-r--r--lib/AsmParser/LLParser.h37
-rw-r--r--lib/AsmParser/LLToken.h554
-rw-r--r--lib/AsmParser/Makefile14
-rw-r--r--lib/AsmParser/Parser.cpp29
-rw-r--r--lib/AsmParser/module.modulemap1
-rw-r--r--lib/Bitcode/Makefile14
-rw-r--r--lib/Bitcode/Reader/BitReader.cpp5
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp1786
-rw-r--r--lib/Bitcode/Reader/BitstreamReader.cpp68
-rw-r--r--lib/Bitcode/Reader/Makefile15
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp2225
-rw-r--r--lib/Bitcode/Writer/BitcodeWriterPass.cpp44
-rw-r--r--lib/Bitcode/Writer/LLVMBuild.txt2
-rw-r--r--lib/Bitcode/Writer/Makefile15
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.cpp321
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.h121
-rw-r--r--lib/Bitcode/module.modulemap1
-rw-r--r--lib/CMakeLists.txt1
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.cpp159
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.h12
-rw-r--r--lib/CodeGen/Analysis.cpp58
-rw-r--r--lib/CodeGen/AntiDepBreaker.h14
-rw-r--r--lib/CodeGen/AsmPrinter/ARMException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/AddressPool.h2
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp348
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp6
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterHandler.h7
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/ByteStreamer.h1
-rw-r--r--lib/CodeGen/AsmPrinter/CMakeLists.txt3
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.cpp2076
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.h310
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.cpp47
-rw-r--r--lib/CodeGen/AsmPrinter/DIEHash.cpp3
-rw-r--r--lib/CodeGen/AsmPrinter/DIEHash.h2
-rw-r--r--lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp72
-rw-r--r--lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h3
-rw-r--r--lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp230
-rw-r--r--lib/CodeGen/AsmPrinter/DebugHandlerBase.h109
-rw-r--r--lib/CodeGen/AsmPrinter/DebugLocEntry.h16
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCFIException.cpp49
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp161
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.h30
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.cpp559
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h148
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.h8
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfExpression.cpp63
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfExpression.h46
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfFile.cpp39
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfFile.h18
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfUnit.cpp186
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfUnit.h24
-rw-r--r--lib/CodeGen/AsmPrinter/EHStreamer.h1
-rw-r--r--lib/CodeGen/AsmPrinter/LLVMBuild.txt2
-rw-r--r--lib/CodeGen/AsmPrinter/Makefile13
-rw-r--r--lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp411
-rw-r--r--lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h138
-rw-r--r--lib/CodeGen/AsmPrinter/WinException.cpp60
-rw-r--r--lib/CodeGen/AtomicExpandPass.cpp1168
-rw-r--r--lib/CodeGen/BranchFolding.cpp240
-rw-r--r--lib/CodeGen/BranchFolding.h32
-rw-r--r--lib/CodeGen/BuiltinGCs.cpp139
-rw-r--r--lib/CodeGen/CMakeLists.txt62
-rw-r--r--lib/CodeGen/CalcSpillWeights.cpp17
-rw-r--r--lib/CodeGen/CallingConvLower.cpp41
-rw-r--r--lib/CodeGen/CodeGen.cpp10
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp265
-rw-r--r--lib/CodeGen/CoreCLRGC.cpp54
-rw-r--r--lib/CodeGen/CriticalAntiDepBreaker.cpp65
-rw-r--r--lib/CodeGen/CriticalAntiDepBreaker.h10
-rw-r--r--lib/CodeGen/DFAPacketizer.cpp183
-rw-r--r--lib/CodeGen/DeadMachineInstructionElim.cpp7
-rw-r--r--lib/CodeGen/DetectDeadLanes.cpp602
-rw-r--r--lib/CodeGen/EarlyIfConversion.cpp19
-rw-r--r--lib/CodeGen/ErlangGC.cpp46
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp25
-rw-r--r--lib/CodeGen/ExpandISelPseudos.cpp7
-rw-r--r--lib/CodeGen/ExpandPostRAPseudos.cpp30
-rw-r--r--lib/CodeGen/FuncletLayout.cpp4
-rw-r--r--lib/CodeGen/GCRootLowering.cpp7
-rw-r--r--lib/CodeGen/GlobalISel/CMakeLists.txt27
-rw-r--r--lib/CodeGen/GlobalISel/GlobalISel.cpp30
-rw-r--r--lib/CodeGen/GlobalISel/IRTranslator.cpp164
-rw-r--r--lib/CodeGen/GlobalISel/LLVMBuild.txt22
-rw-r--r--lib/CodeGen/GlobalISel/MachineIRBuilder.cpp104
-rw-r--r--lib/CodeGen/GlobalISel/RegBankSelect.cpp897
-rw-r--r--lib/CodeGen/GlobalISel/RegisterBank.cpp107
-rw-r--r--lib/CodeGen/GlobalISel/RegisterBankInfo.cpp663
-rw-r--r--lib/CodeGen/GlobalMerge.cpp28
-rw-r--r--lib/CodeGen/IfConversion.cpp254
-rw-r--r--lib/CodeGen/ImplicitNullChecks.cpp264
-rw-r--r--lib/CodeGen/InlineSpiller.cpp1140
-rw-r--r--lib/CodeGen/InterleavedAccessPass.cpp132
-rw-r--r--lib/CodeGen/LLVMBuild.txt4
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp66
-rw-r--r--lib/CodeGen/LexicalScopes.cpp13
-rw-r--r--lib/CodeGen/LiveDebugValues.cpp420
-rw-r--r--lib/CodeGen/LiveDebugVariables.cpp69
-rw-r--r--lib/CodeGen/LiveDebugVariables.h2
-rw-r--r--lib/CodeGen/LiveInterval.cpp340
-rw-r--r--lib/CodeGen/LiveIntervalAnalysis.cpp552
-rw-r--r--lib/CodeGen/LivePhysRegs.cpp65
-rw-r--r--lib/CodeGen/LiveRangeCalc.cpp30
-rw-r--r--lib/CodeGen/LiveRangeCalc.h5
-rw-r--r--lib/CodeGen/LiveRangeEdit.cpp99
-rw-r--r--lib/CodeGen/LiveRangeUtils.h62
-rw-r--r--lib/CodeGen/LiveStackAnalysis.cpp2
-rw-r--r--lib/CodeGen/LiveVariables.cpp65
-rw-r--r--lib/CodeGen/LocalStackSlotAllocation.cpp64
-rw-r--r--lib/CodeGen/LowerEmuTLS.cpp162
-rw-r--r--lib/CodeGen/MIRParser/MILexer.cpp68
-rw-r--r--lib/CodeGen/MIRParser/MILexer.h5
-rw-r--r--lib/CodeGen/MIRParser/MIParser.cpp263
-rw-r--r--lib/CodeGen/MIRParser/MIParser.h60
-rw-r--r--lib/CodeGen/MIRParser/MIRParser.cpp248
-rw-r--r--lib/CodeGen/MIRParser/Makefile13
-rw-r--r--lib/CodeGen/MIRPrinter.cpp52
-rw-r--r--lib/CodeGen/MachineBasicBlock.cpp273
-rw-r--r--lib/CodeGen/MachineBlockFrequencyInfo.cpp119
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp901
-rw-r--r--lib/CodeGen/MachineBranchProbabilityInfo.cpp24
-rw-r--r--lib/CodeGen/MachineCSE.cpp15
-rw-r--r--lib/CodeGen/MachineCombiner.cpp25
-rw-r--r--lib/CodeGen/MachineCopyPropagation.cpp369
-rw-r--r--lib/CodeGen/MachineDominators.cpp30
-rw-r--r--lib/CodeGen/MachineFunction.cpp124
-rw-r--r--lib/CodeGen/MachineFunctionPass.cpp29
-rw-r--r--lib/CodeGen/MachineInstr.cpp261
-rw-r--r--lib/CodeGen/MachineInstrBundle.cpp15
-rw-r--r--lib/CodeGen/MachineLICM.cpp41
-rw-r--r--lib/CodeGen/MachineLoopInfo.cpp9
-rw-r--r--lib/CodeGen/MachineModuleInfo.cpp3
-rw-r--r--lib/CodeGen/MachineRegionInfo.cpp6
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp41
-rw-r--r--lib/CodeGen/MachineSSAUpdater.cpp1
-rw-r--r--lib/CodeGen/MachineScheduler.cpp736
-rw-r--r--lib/CodeGen/MachineSink.cpp160
-rw-r--r--lib/CodeGen/MachineTraceMetrics.cpp111
-rw-r--r--lib/CodeGen/MachineVerifier.cpp341
-rw-r--r--lib/CodeGen/Makefile22
-rw-r--r--lib/CodeGen/OcamlGC.cpp36
-rw-r--r--lib/CodeGen/OptimizePHIs.cpp2
-rw-r--r--lib/CodeGen/PHIElimination.cpp39
-rw-r--r--lib/CodeGen/ParallelCG.cpp105
-rw-r--r--lib/CodeGen/PatchableFunction.cpp88
-rw-r--r--lib/CodeGen/PeepholeOptimizer.cpp32
-rw-r--r--lib/CodeGen/PostRAHazardRecognizer.cpp98
-rw-r--r--lib/CodeGen/PostRASchedulerList.cpp63
-rw-r--r--lib/CodeGen/PreISelIntrinsicLowering.cpp94
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp524
-rw-r--r--lib/CodeGen/PseudoSourceValue.cpp5
-rw-r--r--lib/CodeGen/RegAllocBase.cpp12
-rw-r--r--lib/CodeGen/RegAllocBase.h10
-rw-r--r--lib/CodeGen/RegAllocBasic.cpp5
-rw-r--r--lib/CodeGen/RegAllocFast.cpp81
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp67
-rw-r--r--lib/CodeGen/RegAllocPBQP.cpp23
-rw-r--r--lib/CodeGen/RegUsageInfoCollector.cpp142
-rw-r--r--lib/CodeGen/RegUsageInfoPropagate.cpp131
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp259
-rw-r--r--lib/CodeGen/RegisterPressure.cpp695
-rw-r--r--lib/CodeGen/RegisterScavenging.cpp148
-rw-r--r--lib/CodeGen/RegisterUsageInfo.cpp93
-rw-r--r--lib/CodeGen/RenameIndependentSubregs.cpp388
-rw-r--r--lib/CodeGen/SafeStack.cpp (renamed from lib/Transforms/Instrumentation/SafeStack.cpp)255
-rw-r--r--lib/CodeGen/SafeStackColoring.cpp289
-rw-r--r--lib/CodeGen/SafeStackColoring.h149
-rw-r--r--lib/CodeGen/SafeStackLayout.cpp138
-rw-r--r--lib/CodeGen/SafeStackLayout.h68
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.cpp993
-rw-r--r--lib/CodeGen/ScheduleDAGPrinter.cpp1
-rw-r--r--lib/CodeGen/ScoreboardHazardRecognizer.cpp23
-rw-r--r--lib/CodeGen/SelectionDAG/CMakeLists.txt6
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp1673
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp65
-rw-r--r--lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp80
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.cpp8
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.h6
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp757
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp67
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp174
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp101
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h179
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp32
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp196
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp461
-rw-r--r--lib/CodeGen/SelectionDAG/Makefile13
-rw-r--r--lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp16
-rw-r--r--lib/CodeGen/SelectionDAG/SDNodeDbgValue.h11
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp288
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp4
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp1584
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp1163
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h180
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp9
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp563
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp (renamed from lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp)10
-rw-r--r--lib/CodeGen/SelectionDAG/StatepointLowering.cpp634
-rw-r--r--lib/CodeGen/SelectionDAG/StatepointLowering.h34
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp695
-rw-r--r--lib/CodeGen/ShadowStackGC.cpp55
-rw-r--r--lib/CodeGen/ShadowStackGCLowering.cpp6
-rw-r--r--lib/CodeGen/SjLjEHPrepare.cpp145
-rw-r--r--lib/CodeGen/SlotIndexes.cpp43
-rw-r--r--lib/CodeGen/SpillPlacement.cpp91
-rw-r--r--lib/CodeGen/SpillPlacement.h6
-rw-r--r--lib/CodeGen/Spiller.h4
-rw-r--r--lib/CodeGen/SplitKit.cpp244
-rw-r--r--lib/CodeGen/SplitKit.h84
-rw-r--r--lib/CodeGen/StackColoring.cpp619
-rw-r--r--lib/CodeGen/StackMapLivenessAnalysis.cpp8
-rw-r--r--lib/CodeGen/StackMaps.cpp4
-rw-r--r--lib/CodeGen/StackProtector.cpp204
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp47
-rw-r--r--lib/CodeGen/StatepointExampleGC.cpp55
-rw-r--r--lib/CodeGen/TailDuplication.cpp963
-rw-r--r--lib/CodeGen/TailDuplicator.cpp932
-rw-r--r--lib/CodeGen/TargetFrameLoweringImpl.cpp18
-rw-r--r--lib/CodeGen/TargetInstrInfo.cpp383
-rw-r--r--lib/CodeGen/TargetLoweringBase.cpp237
-rw-r--r--lib/CodeGen/TargetLoweringObjectFileImpl.cpp144
-rw-r--r--lib/CodeGen/TargetPassConfig.cpp (renamed from lib/CodeGen/Passes.cpp)119
-rw-r--r--lib/CodeGen/TargetRegisterInfo.cpp26
-rw-r--r--lib/CodeGen/TargetSchedule.cpp27
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp175
-rw-r--r--lib/CodeGen/UnreachableBlockElim.cpp66
-rw-r--r--lib/CodeGen/VirtRegMap.cpp49
-rw-r--r--lib/CodeGen/WinEHPrepare.cpp23
-rw-r--r--lib/CodeGen/XRayInstrumentation.cpp96
-rw-r--r--lib/CodeGen/module.modulemap1
-rw-r--r--lib/DebugInfo/CodeView/ByteStream.cpp79
-rw-r--r--lib/DebugInfo/CodeView/CMakeLists.txt13
-rw-r--r--lib/DebugInfo/CodeView/CVTypeVisitor.cpp123
-rw-r--r--lib/DebugInfo/CodeView/CodeViewError.cpp67
-rw-r--r--lib/DebugInfo/CodeView/EnumTables.cpp375
-rw-r--r--lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp141
-rw-r--r--lib/DebugInfo/CodeView/ListRecordBuilder.cpp87
-rw-r--r--lib/DebugInfo/CodeView/Makefile14
-rw-r--r--lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp27
-rw-r--r--lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp2
-rw-r--r--lib/DebugInfo/CodeView/ModuleSubstream.cpp42
-rw-r--r--lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp104
-rw-r--r--lib/DebugInfo/CodeView/RecordSerialization.cpp171
-rw-r--r--lib/DebugInfo/CodeView/StreamReader.cpp93
-rw-r--r--lib/DebugInfo/CodeView/StreamWriter.cpp77
-rw-r--r--lib/DebugInfo/CodeView/SymbolDumper.cpp642
-rw-r--r--lib/DebugInfo/CodeView/TypeDumper.cpp696
-rw-r--r--lib/DebugInfo/CodeView/TypeRecord.cpp572
-rw-r--r--lib/DebugInfo/CodeView/TypeRecordBuilder.cpp36
-rw-r--r--lib/DebugInfo/CodeView/TypeStreamMerger.cpp149
-rw-r--r--lib/DebugInfo/CodeView/TypeTableBuilder.cpp224
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp112
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugFrame.cpp209
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugLine.cpp77
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugLoc.cpp1
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugMacro.cpp3
-rw-r--r--lib/DebugInfo/DWARF/DWARFUnit.cpp37
-rw-r--r--lib/DebugInfo/DWARF/Makefile14
-rw-r--r--lib/DebugInfo/DWARF/module.modulemap1
-rw-r--r--lib/DebugInfo/Makefile15
-rw-r--r--lib/DebugInfo/PDB/CMakeLists.txt28
-rw-r--r--lib/DebugInfo/PDB/DIA/DIADataStream.cpp2
-rw-r--r--lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp1
-rw-r--r--lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp1
-rw-r--r--lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp1
-rw-r--r--lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp1
-rw-r--r--lib/DebugInfo/PDB/DIA/DIAError.cpp59
-rw-r--r--lib/DebugInfo/PDB/DIA/DIALineNumber.cpp1
-rw-r--r--lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp51
-rw-r--r--lib/DebugInfo/PDB/DIA/DIASession.cpp204
-rw-r--r--lib/DebugInfo/PDB/DIA/DIASourceFile.cpp13
-rw-r--r--lib/DebugInfo/PDB/GenericError.cpp67
-rw-r--r--lib/DebugInfo/PDB/IPDBSourceFile.cpp1
-rw-r--r--lib/DebugInfo/PDB/LLVMBuild.txt2
-rw-r--r--lib/DebugInfo/PDB/Makefile14
-rw-r--r--lib/DebugInfo/PDB/PDB.cpp26
-rw-r--r--lib/DebugInfo/PDB/PDBContext.cpp30
-rw-r--r--lib/DebugInfo/PDB/PDBExtras.cpp252
-rw-r--r--lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymDumper.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbol.cpp2
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolBlock.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolCompiland.cpp20
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp7
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolCustom.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolData.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolExe.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolFunc.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolLabel.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolThunk.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolUnknown.cpp1
-rw-r--r--lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp1
-rw-r--r--lib/DebugInfo/PDB/Raw/DbiStream.cpp462
-rw-r--r--lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp81
-rw-r--r--lib/DebugInfo/PDB/Raw/EnumTables.cpp38
-rw-r--r--lib/DebugInfo/PDB/Raw/Hash.cpp131
-rw-r--r--lib/DebugInfo/PDB/Raw/IndexedStreamData.cpp25
-rw-r--r--lib/DebugInfo/PDB/Raw/InfoStream.cpp90
-rw-r--r--lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp67
-rw-r--r--lib/DebugInfo/PDB/Raw/MappedBlockStream.cpp310
-rw-r--r--lib/DebugInfo/PDB/Raw/ModInfo.cpp127
-rw-r--r--lib/DebugInfo/PDB/Raw/ModStream.cpp82
-rw-r--r--lib/DebugInfo/PDB/Raw/MsfBuilder.cpp279
-rw-r--r--lib/DebugInfo/PDB/Raw/MsfCommon.cpp48
-rw-r--r--lib/DebugInfo/PDB/Raw/NameHashTable.cpp104
-rw-r--r--lib/DebugInfo/PDB/Raw/NameMap.cpp213
-rw-r--r--lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp50
-rw-r--r--lib/DebugInfo/PDB/Raw/PDBFile.cpp365
-rw-r--r--lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp102
-rw-r--r--lib/DebugInfo/PDB/Raw/PublicsStream.cpp173
-rw-r--r--lib/DebugInfo/PDB/Raw/RawError.cpp67
-rw-r--r--lib/DebugInfo/PDB/Raw/RawSession.cpp146
-rw-r--r--lib/DebugInfo/PDB/Raw/SymbolStream.cpp46
-rw-r--r--lib/DebugInfo/PDB/Raw/TpiStream.cpp273
-rw-r--r--lib/DebugInfo/Symbolize/DIPrinter.cpp20
-rw-r--r--lib/DebugInfo/Symbolize/Makefile15
-rw-r--r--lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp18
-rw-r--r--lib/DebugInfo/Symbolize/Symbolize.cpp185
-rw-r--r--lib/ExecutionEngine/ExecutionEngine.cpp17
-rw-r--r--lib/ExecutionEngine/ExecutionEngineBindings.cpp3
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt6
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp23
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/Makefile18
-rw-r--r--lib/ExecutionEngine/Interpreter/Makefile13
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.cpp41
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.h1
-rw-r--r--lib/ExecutionEngine/MCJIT/Makefile13
-rw-r--r--lib/ExecutionEngine/Makefile24
-rw-r--r--lib/ExecutionEngine/OProfileJIT/Makefile18
-rw-r--r--lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp1
-rw-r--r--lib/ExecutionEngine/Orc/CMakeLists.txt3
-rw-r--r--lib/ExecutionEngine/Orc/IndirectionUtils.cpp57
-rw-r--r--lib/ExecutionEngine/Orc/Makefile13
-rw-r--r--lib/ExecutionEngine/Orc/OrcABISupport.cpp542
-rw-r--r--lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp171
-rw-r--r--lib/ExecutionEngine/Orc/OrcCBindings.cpp34
-rw-r--r--lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp43
-rw-r--r--lib/ExecutionEngine/Orc/OrcCBindingsStack.h140
-rw-r--r--lib/ExecutionEngine/Orc/OrcError.cpp10
-rw-r--r--lib/ExecutionEngine/Orc/OrcMCJITReplacement.h50
-rw-r--r--lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp96
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Makefile13
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp34
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp249
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp16
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp14
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp147
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h21
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h52
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp90
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h27
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h63
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h291
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h38
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h73
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h65
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h77
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h99
-rw-r--r--lib/ExecutionEngine/TargetSelect.cpp3
-rw-r--r--lib/Fuzzer/CMakeLists.txt23
-rw-r--r--lib/Fuzzer/FuzzerDriver.cpp214
-rw-r--r--lib/Fuzzer/FuzzerExtFunctions.def46
-rw-r--r--lib/Fuzzer/FuzzerExtFunctions.h33
-rw-r--r--lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp49
-rw-r--r--lib/Fuzzer/FuzzerExtFunctionsWeak.cpp50
-rw-r--r--lib/Fuzzer/FuzzerFlags.def53
-rw-r--r--lib/Fuzzer/FuzzerFnAdapter.h187
-rw-r--r--lib/Fuzzer/FuzzerIO.cpp80
-rw-r--r--lib/Fuzzer/FuzzerInterface.cpp30
-rw-r--r--lib/Fuzzer/FuzzerInterface.h233
-rw-r--r--lib/Fuzzer/FuzzerInternal.h441
-rw-r--r--lib/Fuzzer/FuzzerLoop.cpp747
-rw-r--r--lib/Fuzzer/FuzzerMain.cpp6
-rw-r--r--lib/Fuzzer/FuzzerMutate.cpp242
-rw-r--r--lib/Fuzzer/FuzzerSanitizerOptions.cpp19
-rw-r--r--lib/Fuzzer/FuzzerTracePC.cpp71
-rw-r--r--lib/Fuzzer/FuzzerTracePC.h37
-rw-r--r--lib/Fuzzer/FuzzerTraceState.cpp210
-rw-r--r--lib/Fuzzer/FuzzerUtil.cpp131
-rw-r--r--lib/Fuzzer/afl/afl_driver.cpp287
-rwxr-xr-xlib/Fuzzer/pull_and_push_fuzz_corpus.sh17
-rw-r--r--lib/Fuzzer/test/AFLDriverTest.cpp22
-rw-r--r--lib/Fuzzer/test/AccumulateAllocationsTest.cpp17
-rw-r--r--lib/Fuzzer/test/BufferOverflowOnInput.cpp23
-rw-r--r--lib/Fuzzer/test/CMakeLists.txt205
-rw-r--r--lib/Fuzzer/test/CallerCalleeTest.cpp3
-rw-r--r--lib/Fuzzer/test/CounterTest.cpp3
-rw-r--r--lib/Fuzzer/test/CustomCrossOverTest.cpp57
-rw-r--r--lib/Fuzzer/test/CustomMutatorTest.cpp38
-rw-r--r--lib/Fuzzer/test/EmptyTest.cpp11
-rw-r--r--lib/Fuzzer/test/FourIndependentBranchesTest.cpp3
-rw-r--r--lib/Fuzzer/test/FullCoverageSetTest.cpp3
-rw-r--r--lib/Fuzzer/test/FuzzerFnAdapterUnittest.cpp110
-rw-r--r--lib/Fuzzer/test/FuzzerUnittest.cpp103
-rw-r--r--lib/Fuzzer/test/InitializeTest.cpp26
-rw-r--r--lib/Fuzzer/test/LeakTest.cpp17
-rw-r--r--lib/Fuzzer/test/LeakTimeoutTest.cpp17
-rw-r--r--lib/Fuzzer/test/MemcmpTest.cpp5
-rw-r--r--lib/Fuzzer/test/NthRunCrashTest.cpp18
-rw-r--r--lib/Fuzzer/test/NullDerefOnEmptyTest.cpp19
-rw-r--r--lib/Fuzzer/test/NullDerefTest.cpp3
-rw-r--r--lib/Fuzzer/test/OneHugeAllocTest.cpp29
-rw-r--r--lib/Fuzzer/test/OutOfMemoryTest.cpp31
-rw-r--r--lib/Fuzzer/test/RepeatedMemcmp.cpp22
-rw-r--r--lib/Fuzzer/test/SignedIntOverflowTest.cpp28
-rw-r--r--lib/Fuzzer/test/SimpleCmpTest.cpp3
-rw-r--r--lib/Fuzzer/test/SimpleDictionaryTest.cpp3
-rw-r--r--lib/Fuzzer/test/SimpleFnAdapterTest.cpp24
-rw-r--r--lib/Fuzzer/test/SimpleHashTest.cpp3
-rw-r--r--lib/Fuzzer/test/SimpleTest.cpp3
-rw-r--r--lib/Fuzzer/test/SimpleThreadedTest.cpp25
-rw-r--r--lib/Fuzzer/test/SpamyTest.cpp21
-rw-r--r--lib/Fuzzer/test/StrcmpTest.cpp3
-rw-r--r--lib/Fuzzer/test/StrncmpTest.cpp3
-rw-r--r--lib/Fuzzer/test/StrstrTest.cpp18
-rw-r--r--lib/Fuzzer/test/SwitchTest.cpp3
-rw-r--r--lib/Fuzzer/test/ThreadedLeakTest.cpp18
-rw-r--r--lib/Fuzzer/test/ThreadedTest.cpp3
-rw-r--r--lib/Fuzzer/test/TimeoutTest.cpp3
-rw-r--r--lib/Fuzzer/test/UninstrumentedTest.cpp3
-rw-r--r--lib/Fuzzer/test/UserSuppliedFuzzerTest.cpp51
-rw-r--r--lib/Fuzzer/test/afl-driver-extra-stats.test28
-rw-r--r--lib/Fuzzer/test/afl-driver-stderr.test10
-rw-r--r--lib/Fuzzer/test/dfsan/CMakeLists.txt21
-rw-r--r--lib/Fuzzer/test/fuzzer-customcrossover.test10
-rw-r--r--lib/Fuzzer/test/fuzzer-custommutator.test4
-rw-r--r--lib/Fuzzer/test/fuzzer-dfsan.test5
-rw-r--r--lib/Fuzzer/test/fuzzer-dirs.test12
-rw-r--r--lib/Fuzzer/test/fuzzer-fdmask.test30
-rw-r--r--lib/Fuzzer/test/fuzzer-finalstats.test11
-rw-r--r--lib/Fuzzer/test/fuzzer-flags.test8
-rw-r--r--lib/Fuzzer/test/fuzzer-fn-adapter.test3
-rw-r--r--lib/Fuzzer/test/fuzzer-leak.test31
-rw-r--r--lib/Fuzzer/test/fuzzer-oom-with-profile.test6
-rw-r--r--lib/Fuzzer/test/fuzzer-oom.test4
-rw-r--r--lib/Fuzzer/test/fuzzer-printcovpcs.test5
-rw-r--r--lib/Fuzzer/test/fuzzer-prunecorpus.test13
-rw-r--r--lib/Fuzzer/test/fuzzer-runs.test8
-rw-r--r--lib/Fuzzer/test/fuzzer-seed.test3
-rw-r--r--lib/Fuzzer/test/fuzzer-segv.test5
-rw-r--r--lib/Fuzzer/test/fuzzer-singleinputs.test13
-rw-r--r--lib/Fuzzer/test/fuzzer-timeout.test5
-rw-r--r--lib/Fuzzer/test/fuzzer-trace-pc.test7
-rw-r--r--lib/Fuzzer/test/fuzzer-traces-hooks.test25
-rw-r--r--lib/Fuzzer/test/fuzzer-traces.test19
-rw-r--r--lib/Fuzzer/test/fuzzer-trunc.test10
-rw-r--r--lib/Fuzzer/test/fuzzer-ubsan.test4
-rw-r--r--lib/Fuzzer/test/fuzzer.test27
-rw-r--r--lib/Fuzzer/test/lit.cfg20
-rw-r--r--lib/Fuzzer/test/lit.site.cfg.in2
-rw-r--r--lib/Fuzzer/test/merge.test13
-rw-r--r--lib/Fuzzer/test/no-coverage/CMakeLists.txt16
-rw-r--r--lib/Fuzzer/test/trace-bb/CMakeLists.txt15
-rw-r--r--lib/Fuzzer/test/trace-pc/CMakeLists.txt16
-rw-r--r--lib/Fuzzer/test/ubsan/CMakeLists.txt15
-rw-r--r--lib/Fuzzer/test/uninstrumented/CMakeLists.txt20
-rw-r--r--lib/IR/AsmWriter.cpp319
-rw-r--r--lib/IR/AttributeImpl.h80
-rw-r--r--lib/IR/Attributes.cpp300
-rw-r--r--lib/IR/AutoUpgrade.cpp1134
-rw-r--r--lib/IR/BasicBlock.cpp25
-rw-r--r--lib/IR/CMakeLists.txt12
-rw-r--r--lib/IR/Comdat.cpp3
-rw-r--r--lib/IR/ConstantFold.cpp390
-rw-r--r--lib/IR/ConstantFold.h9
-rw-r--r--lib/IR/ConstantRange.cpp219
-rw-r--r--lib/IR/Constants.cpp342
-rw-r--r--lib/IR/ConstantsContext.h71
-rw-r--r--lib/IR/Core.cpp446
-rw-r--r--lib/IR/DIBuilder.cpp311
-rw-r--r--lib/IR/DataLayout.cpp44
-rw-r--r--lib/IR/DebugInfo.cpp158
-rw-r--r--lib/IR/DebugInfoMetadata.cpp237
-rw-r--r--lib/IR/DebugLoc.cpp3
-rw-r--r--lib/IR/DiagnosticInfo.cpp37
-rw-r--r--lib/IR/Dominators.cpp15
-rw-r--r--lib/IR/Function.cpp385
-rw-r--r--lib/IR/FunctionInfo.cpp67
-rw-r--r--lib/IR/GCOV.cpp32
-rw-r--r--lib/IR/Globals.cpp112
-rw-r--r--lib/IR/IRBuilder.cpp109
-rw-r--r--lib/IR/IRPrintingPasses.cpp11
-rw-r--r--lib/IR/InlineAsm.cpp8
-rw-r--r--lib/IR/Instruction.cpp94
-rw-r--r--lib/IR/Instructions.cpp347
-rw-r--r--lib/IR/IntrinsicInst.cpp74
-rw-r--r--lib/IR/LLVMContext.cpp99
-rw-r--r--lib/IR/LLVMContextImpl.cpp71
-rw-r--r--lib/IR/LLVMContextImpl.h402
-rw-r--r--lib/IR/LegacyPassManager.cpp54
-rw-r--r--lib/IR/MDBuilder.cpp2
-rw-r--r--lib/IR/Makefile61
-rw-r--r--lib/IR/Mangler.cpp2
-rw-r--r--lib/IR/Metadata.cpp345
-rw-r--r--lib/IR/Module.cpp70
-rw-r--r--lib/IR/ModuleSummaryIndex.cpp107
-rw-r--r--lib/IR/Operator.cpp6
-rw-r--r--lib/IR/OptBisect.cpp120
-rw-r--r--lib/IR/Pass.cpp25
-rw-r--r--lib/IR/PassManager.cpp37
-rw-r--r--lib/IR/PassRegistry.cpp4
-rw-r--r--lib/IR/ProfileSummary.cpp191
-rw-r--r--lib/IR/Statepoint.cpp79
-rw-r--r--lib/IR/Type.cpp25
-rw-r--r--lib/IR/TypeFinder.cpp18
-rw-r--r--lib/IR/Value.cpp128
-rw-r--r--lib/IR/ValueSymbolTable.cpp12
-rw-r--r--lib/IR/ValueTypes.cpp5
-rw-r--r--lib/IR/Verifier.cpp1461
-rw-r--r--lib/IR/module.modulemap1
-rw-r--r--lib/IRReader/Makefile14
-rw-r--r--lib/LLVMBuild.txt1
-rw-r--r--lib/LTO/CMakeLists.txt55
-rw-r--r--lib/LTO/LLVMBuild.txt1
-rw-r--r--lib/LTO/LTO.cpp119
-rw-r--r--lib/LTO/LTOCodeGenerator.cpp359
-rw-r--r--lib/LTO/LTOModule.cpp98
-rw-r--r--lib/LTO/Makefile15
-rw-r--r--lib/LTO/ThinLTOCodeGenerator.cpp835
-rw-r--r--lib/LTO/UpdateCompilerUsed.cpp146
-rw-r--r--lib/LibDriver/LibDriver.cpp20
-rw-r--r--lib/LibDriver/Makefile20
-rw-r--r--lib/LineEditor/LineEditor.cpp16
-rw-r--r--lib/LineEditor/Makefile15
-rw-r--r--lib/Linker/IRMover.cpp828
-rw-r--r--lib/Linker/LinkModules.cpp607
-rw-r--r--lib/Linker/Makefile15
-rw-r--r--lib/MC/CMakeLists.txt4
-rw-r--r--lib/MC/ConstantPools.cpp17
-rw-r--r--lib/MC/ELFObjectWriter.cpp395
-rw-r--r--lib/MC/MCAsmBackend.cpp6
-rw-r--r--lib/MC/MCAsmInfo.cpp4
-rw-r--r--lib/MC/MCAsmInfoDarwin.cpp2
-rw-r--r--lib/MC/MCAsmInfoELF.cpp3
-rw-r--r--lib/MC/MCAsmStreamer.cpp274
-rw-r--r--lib/MC/MCAssembler.cpp87
-rw-r--r--lib/MC/MCCodeGenInfo.cpp23
-rw-r--r--lib/MC/MCCodeView.cpp464
-rw-r--r--lib/MC/MCContext.cpp112
-rw-r--r--lib/MC/MCDisassembler/CMakeLists.txt5
-rw-r--r--lib/MC/MCDisassembler/Disassembler.cpp6
-rw-r--r--lib/MC/MCDisassembler/Disassembler.h30
-rw-r--r--lib/MC/MCDisassembler/MCDisassembler.cpp4
-rw-r--r--lib/MC/MCDisassembler/MCExternalSymbolizer.cpp2
-rw-r--r--lib/MC/MCDisassembler/MCRelocationInfo.cpp8
-rw-r--r--lib/MC/MCDisassembler/MCSymbolizer.cpp (renamed from lib/MC/MCSymbolizer.cpp)2
-rw-r--r--lib/MC/MCDisassembler/Makefile14
-rw-r--r--lib/MC/MCDwarf.cpp69
-rw-r--r--lib/MC/MCELFStreamer.cpp12
-rw-r--r--lib/MC/MCExpr.cpp64
-rw-r--r--lib/MC/MCFragment.cpp36
-rw-r--r--lib/MC/MCInst.cpp4
-rw-r--r--lib/MC/MCLabel.cpp2
-rw-r--r--lib/MC/MCLinkerOptimizationHint.cpp31
-rw-r--r--lib/MC/MCMachOStreamer.cpp26
-rw-r--r--lib/MC/MCObjectFileInfo.cpp108
-rw-r--r--lib/MC/MCObjectStreamer.cpp115
-rw-r--r--lib/MC/MCParser/AsmLexer.cpp206
-rw-r--r--lib/MC/MCParser/AsmParser.cpp1206
-rw-r--r--lib/MC/MCParser/COFFAsmParser.cpp2
-rw-r--r--lib/MC/MCParser/DarwinAsmParser.cpp35
-rw-r--r--lib/MC/MCParser/ELFAsmParser.cpp30
-rw-r--r--lib/MC/MCParser/MCAsmLexer.cpp2
-rw-r--r--lib/MC/MCParser/MCAsmParser.cpp4
-rw-r--r--lib/MC/MCParser/MCTargetAsmParser.cpp2
-rw-r--r--lib/MC/MCParser/Makefile15
-rw-r--r--lib/MC/MCRegisterInfo.cpp9
-rw-r--r--lib/MC/MCSection.cpp2
-rw-r--r--lib/MC/MCStreamer.cpp101
-rw-r--r--lib/MC/MCSymbol.cpp2
-rw-r--r--lib/MC/MCValue.cpp2
-rw-r--r--lib/MC/MCWin64EH.cpp30
-rw-r--r--lib/MC/MCWinEH.cpp55
-rw-r--r--lib/MC/MachObjectWriter.cpp16
-rw-r--r--lib/MC/Makefile16
-rw-r--r--lib/MC/StringTableBuilder.cpp77
-rw-r--r--lib/MC/SubtargetFeature.cpp3
-rw-r--r--lib/MC/WinCOFFObjectWriter.cpp171
-rw-r--r--lib/MC/WinCOFFStreamer.cpp34
-rw-r--r--lib/Makefile17
-rw-r--r--lib/Object/Archive.cpp189
-rw-r--r--lib/Object/ArchiveWriter.cpp208
-rw-r--r--lib/Object/Binary.cpp14
-rw-r--r--lib/Object/CMakeLists.txt4
-rw-r--r--lib/Object/COFFObjectFile.cpp158
-rw-r--r--lib/Object/ELF.cpp20
-rw-r--r--lib/Object/ELFObjectFile.cpp67
-rw-r--r--lib/Object/Error.cpp43
-rw-r--r--lib/Object/FunctionIndexObjectFile.cpp143
-rw-r--r--lib/Object/IRObjectFile.cpp41
-rw-r--r--lib/Object/MachOObjectFile.cpp404
-rw-r--r--lib/Object/MachOUniversal.cpp120
-rw-r--r--lib/Object/Makefile14
-rw-r--r--lib/Object/ModuleSummaryIndexObjectFile.cpp126
-rw-r--r--lib/Object/Object.cpp40
-rw-r--r--lib/Object/ObjectFile.cpp38
-rw-r--r--lib/Object/RecordStreamer.cpp16
-rw-r--r--lib/Object/RecordStreamer.h4
-rw-r--r--lib/Object/SymbolicFile.cpp14
-rw-r--r--lib/ObjectYAML/CMakeLists.txt7
-rw-r--r--lib/ObjectYAML/COFFYAML.cpp (renamed from lib/Object/COFFYAML.cpp)7
-rw-r--r--lib/ObjectYAML/ELFYAML.cpp (renamed from lib/Object/ELFYAML.cpp)34
-rw-r--r--lib/ObjectYAML/LLVMBuild.txt14
-rw-r--r--lib/ObjectYAML/MachOYAML.cpp546
-rw-r--r--lib/ObjectYAML/ObjectYAML.cpp57
-rw-r--r--lib/ObjectYAML/YAML.cpp (renamed from lib/MC/YAML.cpp)10
-rw-r--r--lib/Option/Makefile14
-rw-r--r--lib/Option/OptTable.cpp2
-rw-r--r--lib/Option/Option.cpp16
-rw-r--r--lib/Passes/LLVMBuild.txt2
-rw-r--r--lib/Passes/Makefile14
-rw-r--r--lib/Passes/PassBuilder.cpp423
-rw-r--r--lib/Passes/PassRegistry.def123
-rw-r--r--lib/ProfileData/CMakeLists.txt6
-rw-r--r--lib/ProfileData/Coverage/CMakeLists.txt11
-rw-r--r--lib/ProfileData/Coverage/CoverageMapping.cpp (renamed from lib/ProfileData/CoverageMapping.cpp)260
-rw-r--r--lib/ProfileData/Coverage/CoverageMappingReader.cpp (renamed from lib/ProfileData/CoverageMappingReader.cpp)434
-rw-r--r--lib/ProfileData/Coverage/CoverageMappingWriter.cpp (renamed from lib/ProfileData/CoverageMappingWriter.cpp)2
-rw-r--r--lib/ProfileData/Coverage/LLVMBuild.txt (renamed from lib/Target/CppBackend/LLVMBuild.txt)18
-rw-r--r--lib/ProfileData/InstrProf.cpp471
-rw-r--r--lib/ProfileData/InstrProfReader.cpp289
-rw-r--r--lib/ProfileData/InstrProfWriter.cpp223
-rw-r--r--lib/ProfileData/LLVMBuild.txt5
-rw-r--r--lib/ProfileData/Makefile14
-rw-r--r--lib/ProfileData/ProfileSummaryBuilder.cpp116
-rw-r--r--lib/ProfileData/SampleProf.cpp24
-rw-r--r--lib/ProfileData/SampleProfReader.cpp97
-rw-r--r--lib/ProfileData/SampleProfWriter.cpp60
-rw-r--r--lib/Support/APFloat.cpp75
-rw-r--r--lib/Support/APInt.cpp150
-rw-r--r--lib/Support/APSInt.cpp1
-rw-r--r--lib/Support/ARMBuildAttrs.cpp1
-rw-r--r--lib/Support/Atomic.cpp59
-rw-r--r--lib/Support/BranchProbability.cpp2
-rw-r--r--lib/Support/CMakeLists.txt54
-rw-r--r--lib/Support/CachePruning.cpp159
-rw-r--r--lib/Support/CommandLine.cpp545
-rw-r--r--lib/Support/ConvertUTFWrapper.cpp83
-rw-r--r--lib/Support/CrashRecoveryContext.cpp8
-rw-r--r--lib/Support/Dwarf.cpp37
-rw-r--r--lib/Support/Error.cpp113
-rw-r--r--lib/Support/ErrorHandling.cpp1
-rw-r--r--lib/Support/FileUtilities.cpp6
-rw-r--r--lib/Support/FoldingSet.cpp25
-rw-r--r--lib/Support/Host.cpp1346
-rw-r--r--lib/Support/IntEqClasses.cpp13
-rw-r--r--lib/Support/JamCRC.cpp1
-rw-r--r--lib/Support/Locale.cpp3
-rw-r--r--lib/Support/LockFileManager.cpp37
-rw-r--r--lib/Support/Makefile23
-rw-r--r--lib/Support/ManagedStatic.cpp31
-rw-r--r--lib/Support/MemoryBuffer.cpp10
-rw-r--r--lib/Support/Path.cpp46
-rw-r--r--lib/Support/PrettyStackTrace.cpp47
-rw-r--r--lib/Support/Process.cpp7
-rw-r--r--lib/Support/SHA1.cpp170
-rw-r--r--lib/Support/ScaledNumber.cpp1
-rw-r--r--lib/Support/ScopedPrinter.cpp72
-rw-r--r--lib/Support/Signals.cpp34
-rw-r--r--lib/Support/SmallPtrSet.cpp191
-rw-r--r--lib/Support/SpecialCaseList.cpp2
-rw-r--r--lib/Support/Statistic.cpp74
-rw-r--r--lib/Support/StreamingMemoryObject.cpp6
-rw-r--r--lib/Support/StringMap.cpp16
-rw-r--r--lib/Support/StringRef.cpp4
-rw-r--r--lib/Support/TargetParser.cpp232
-rw-r--r--lib/Support/TargetRegistry.cpp3
-rw-r--r--lib/Support/ThreadPool.cpp5
-rw-r--r--lib/Support/Threading.cpp6
-rw-r--r--lib/Support/Timer.cpp44
-rw-r--r--lib/Support/Triple.cpp248
-rw-r--r--lib/Support/Twine.cpp2
-rw-r--r--lib/Support/Unix/Memory.inc4
-rw-r--r--lib/Support/Unix/Path.inc137
-rw-r--r--lib/Support/Unix/Process.inc4
-rw-r--r--lib/Support/Unix/Signals.inc146
-rw-r--r--lib/Support/Windows/DynamicLibrary.inc2
-rw-r--r--lib/Support/Windows/Path.inc150
-rw-r--r--lib/Support/Windows/Process.inc38
-rw-r--r--lib/Support/Windows/Signals.inc228
-rw-r--r--lib/Support/Windows/WindowsSupport.h28
-rw-r--r--lib/Support/YAMLParser.cpp2
-rw-r--r--lib/Support/YAMLTraits.cpp23
-rw-r--r--lib/Support/raw_ostream.cpp29
-rw-r--r--lib/TableGen/Makefile14
-rw-r--r--lib/TableGen/Record.cpp269
-rw-r--r--lib/TableGen/SetTheory.cpp8
-rw-r--r--lib/TableGen/TGParser.cpp23
-rw-r--r--lib/TableGen/module.modulemap1
-rw-r--r--lib/Target/AArch64/AArch64.h3
-rw-r--r--lib/Target/AArch64/AArch64.td187
-rw-r--r--lib/Target/AArch64/AArch64A53Fix835769.cpp10
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp40
-rw-r--r--lib/Target/AArch64/AArch64AddressTypePromotion.cpp14
-rw-r--r--lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp128
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp67
-rw-r--r--lib/Target/AArch64/AArch64BranchRelaxation.cpp30
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.cpp104
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.h36
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td18
-rw-r--r--lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp25
-rw-r--r--lib/Target/AArch64/AArch64CollectLOH.cpp23
-rw-r--r--lib/Target/AArch64/AArch64ConditionOptimizer.cpp26
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp24
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp18
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp215
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp81
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp895
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h15
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp2400
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp963
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h60
-rw-r--r--lib/Target/AArch64/AArch64InstrAtomics.td51
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td25
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp1743
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h107
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td193
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp1050
-rw-r--r--lib/Target/AArch64/AArch64MachineFunctionInfo.h38
-rw-r--r--lib/Target/AArch64/AArch64PBQPRegAlloc.cpp2
-rw-r--r--lib/Target/AArch64/AArch64PromoteConstant.cpp327
-rw-r--r--lib/Target/AArch64/AArch64RedundantCopyElimination.cpp182
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.cpp168
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.h69
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp26
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedA53.td4
-rw-r--r--lib/Target/AArch64/AArch64SchedA57.td3
-rw-r--r--lib/Target/AArch64/AArch64SchedCyclone.td14
-rw-r--r--lib/Target/AArch64/AArch64SchedKryo.td133
-rw-r--r--lib/Target/AArch64/AArch64SchedKryoDetails.td2358
-rw-r--r--lib/Target/AArch64/AArch64SchedM1.td29
-rw-r--r--lib/Target/AArch64/AArch64SchedVulcan.td855
-rw-r--r--lib/Target/AArch64/AArch64Schedule.td8
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.cpp10
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.h14
-rw-r--r--lib/Target/AArch64/AArch64StorePairSuppress.cpp9
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp109
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h122
-rw-r--r--lib/Target/AArch64/AArch64SystemOperands.td1018
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp138
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.h12
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp77
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h11
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp214
-rw-r--r--lib/Target/AArch64/AsmParser/Makefile15
-rw-r--r--lib/Target/AArch64/CMakeLists.txt19
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp15
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.h2
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp8
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h2
-rw-r--r--lib/Target/AArch64/Disassembler/Makefile16
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp149
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h4
-rw-r--r--lib/Target/AArch64/InstPrinter/Makefile15
-rw-r--r--lib/Target/AArch64/LLVMBuild.txt2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h43
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp230
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp63
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp53
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp22
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h1
-rw-r--r--lib/Target/AArch64/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/AArch64/Makefile25
-rw-r--r--lib/Target/AArch64/TargetInfo/Makefile15
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp943
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h1003
-rw-r--r--lib/Target/AArch64/Utils/Makefile16
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h57
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td417
-rw-r--r--lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp122
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp21
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp469
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h28
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.cpp42
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h36
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td8
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp82
-rw-r--r--lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp26
-rw-r--r--lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h48
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.cpp32
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp793
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp1484
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h78
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.cpp307
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.h143
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td27
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td263
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp38
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h6
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsics.td75
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp57
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp17
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.h34
-rw-r--r--lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp667
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.cpp19
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h21
-rw-r--r--lib/Target/AMDGPU/AMDGPURuntimeMetadata.h138
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp227
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h389
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp370
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h61
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp56
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.h18
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp163
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h49
-rw-r--r--lib/Target/AMDGPU/AMDILCFGStructurizer.cpp201
-rw-r--r--lib/Target/AMDGPU/AMDKernelCodeT.h31
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp2505
-rw-r--r--lib/Target/AMDGPU/AsmParser/CMakeLists.txt2
-rw-r--r--lib/Target/AMDGPU/AsmParser/Makefile15
-rw-r--r--lib/Target/AMDGPU/CIInstructions.td148
-rw-r--r--lib/Target/AMDGPU/CMakeLists.txt25
-rw-r--r--lib/Target/AMDGPU/CaymanInstructions.td47
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp437
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h93
-rw-r--r--lib/Target/AMDGPU/Disassembler/CMakeLists.txt7
-rw-r--r--lib/Target/AMDGPU/Disassembler/LLVMBuild.txt23
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td67
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.cpp264
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.h62
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp385
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h38
-rw-r--r--lib/Target/AMDGPU/InstPrinter/CMakeLists.txt2
-rw-r--r--lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt2
-rw-r--r--lib/Target/AMDGPU/InstPrinter/Makefile15
-rw-r--r--lib/Target/AMDGPU/LLVMBuild.txt7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp48
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp55
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp5
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h3
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp5
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h4
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h4
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp11
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp4
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h8
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp26
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp42
-rw-r--r--lib/Target/AMDGPU/Makefile23
-rw-r--r--lib/Target/AMDGPU/Processors.td23
-rw-r--r--lib/Target/AMDGPU/R600ClauseMergePass.cpp121
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp158
-rw-r--r--lib/Target/AMDGPU/R600Defines.h4
-rw-r--r--lib/Target/AMDGPU/R600EmitClauseMarkers.cpp65
-rw-r--r--lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp30
-rw-r--r--lib/Target/AMDGPU/R600FrameLowering.cpp15
-rw-r--r--lib/Target/AMDGPU/R600FrameLowering.h30
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp1006
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h53
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp509
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.h199
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td124
-rw-r--r--lib/Target/AMDGPU/R600Intrinsics.td114
-rw-r--r--lib/Target/AMDGPU/R600MachineFunctionInfo.h7
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.cpp136
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.h11
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp40
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp103
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.cpp11
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.h15
-rw-r--r--lib/Target/AMDGPU/R600Schedule.td2
-rw-r--r--lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp303
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp88
-rw-r--r--lib/Target/AMDGPU/SIDebuggerInsertNops.cpp96
-rw-r--r--lib/Target/AMDGPU/SIDefines.h148
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp9
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp219
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp28
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp144
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h6
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp1725
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h74
-rw-r--r--lib/Target/AMDGPU/SIInsertWaits.cpp198
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td126
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp1736
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h254
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td1852
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td1298
-rw-r--r--lib/Target/AMDGPU/SIIntrinsics.td32
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp50
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp645
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp17
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp104
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h129
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.cpp168
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.h10
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp604
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h69
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td122
-rw-r--r--lib/Target/AMDGPU/SISchedule.td63
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp152
-rw-r--r--lib/Target/AMDGPU/SITypeRewriter.cpp2
-rw-r--r--lib/Target/AMDGPU/SIWholeQuadMode.cpp509
-rw-r--r--lib/Target/AMDGPU/TargetInfo/Makefile15
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp69
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h31
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp28
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h7
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h165
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp166
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h39
-rw-r--r--lib/Target/AMDGPU/Utils/CMakeLists.txt2
-rw-r--r--lib/Target/AMDGPU/Utils/Makefile16
-rw-r--r--lib/Target/AMDGPU/VIInstrFormats.td143
-rw-r--r--lib/Target/AMDGPU/VIInstructions.td50
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp74
-rw-r--r--lib/Target/ARM/ARM.h4
-rw-r--r--lib/Target/ARM/ARM.td218
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp134
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h1
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp1309
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h131
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp41
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h12
-rw-r--r--lib/Target/ARM/ARMCallingConv.h2
-rw-r--r--lib/Target/ARM/ARMCallingConv.td52
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp160
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp25
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h11
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp283
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp143
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp162
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp3
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp1043
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp1484
-rw-r--r--lib/Target/ARM/ARMISelLowering.h99
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td212
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp22
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h3
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td355
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td157
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td346
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td430
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp265
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp43
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h1
-rw-r--r--lib/Target/ARM/ARMOptimizeBarriersPass.cpp8
-rw-r--r--lib/Target/ARM/ARMSchedule.td14
-rw-r--r--lib/Target/ARM/ARMScheduleA8.td2
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td4
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td2
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp58
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h42
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp157
-rw-r--r--lib/Target/ARM/ARMSubtarget.h273
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp54
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h31
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.h8
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp42
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h17
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp309
-rw-r--r--lib/Target/ARM/AsmParser/Makefile15
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp108
-rw-r--r--lib/Target/ARM/Disassembler/Makefile16
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp120
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h9
-rw-r--r--lib/Target/ARM/InstPrinter/Makefile15
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h51
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp83
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h11
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h1
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h14
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp19
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp6
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp120
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp16
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp9
-rw-r--r--lib/Target/ARM/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp6
-rw-r--r--lib/Target/ARM/Makefile24
-rw-r--r--lib/Target/ARM/README.txt18
-rw-r--r--lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp1
-rw-r--r--lib/Target/ARM/TargetInfo/Makefile15
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp56
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.h2
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp19
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.h8
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp11
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp55
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.h12
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp68
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp120
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.h5
-rw-r--r--lib/Target/AVR/AVR.h4
-rw-r--r--lib/Target/AVR/AVR.td6
-rw-r--r--lib/Target/AVR/AVRConfig.h15
-rw-r--r--lib/Target/AVR/AVRFrameLowering.h46
-rw-r--r--lib/Target/AVR/AVRISelLowering.h152
-rw-r--r--lib/Target/AVR/AVRInstrFormats.td577
-rw-r--r--lib/Target/AVR/AVRInstrInfo.cpp466
-rw-r--r--lib/Target/AVR/AVRInstrInfo.h110
-rw-r--r--lib/Target/AVR/AVRInstrInfo.td1981
-rw-r--r--lib/Target/AVR/AVRMachineFunctionInfo.h6
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.cpp256
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.h56
-rw-r--r--lib/Target/AVR/AVRSelectionDAGInfo.h11
-rw-r--r--lib/Target/AVR/AVRSubtarget.cpp47
-rw-r--r--lib/Target/AVR/AVRSubtarget.h119
-rw-r--r--lib/Target/AVR/AVRTargetMachine.cpp101
-rw-r--r--lib/Target/AVR/AVRTargetMachine.h51
-rw-r--r--lib/Target/AVR/AVRTargetObjectFile.h5
-rw-r--r--lib/Target/AVR/CMakeLists.txt12
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp66
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h29
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp28
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h31
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h57
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp24
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h32
-rw-r--r--lib/Target/AVR/MCTargetDesc/CMakeLists.txt8
-rw-r--r--lib/Target/AVR/MCTargetDesc/LLVMBuild.txt23
-rw-r--r--lib/Target/AVR/Makefile19
-rw-r--r--lib/Target/AVR/TODO.md7
-rw-r--r--lib/Target/AVR/TargetInfo/CMakeLists.txt2
-rw-r--r--lib/Target/AVR/TargetInfo/Makefile16
-rw-r--r--lib/Target/BPF/BPFAsmPrinter.cpp28
-rw-r--r--lib/Target/BPF/BPFFrameLowering.h4
-rw-r--r--lib/Target/BPF/BPFISelDAGToDAG.cpp37
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp148
-rw-r--r--lib/Target/BPF/BPFISelLowering.h13
-rw-r--r--lib/Target/BPF/BPFInstrInfo.cpp12
-rw-r--r--lib/Target/BPF/BPFInstrInfo.h6
-rw-r--r--lib/Target/BPF/BPFMCInstLower.cpp3
-rw-r--r--lib/Target/BPF/BPFSubtarget.h6
-rw-r--r--lib/Target/BPF/BPFTargetMachine.cpp15
-rw-r--r--lib/Target/BPF/BPFTargetMachine.h5
-rw-r--r--lib/Target/BPF/InstPrinter/Makefile16
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp3
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp8
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h4
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp1
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp12
-rw-r--r--lib/Target/BPF/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/BPF/Makefile21
-rw-r--r--lib/Target/BPF/TargetInfo/Makefile16
-rw-r--r--lib/Target/CppBackend/CMakeLists.txt5
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp2143
-rw-r--r--lib/Target/CppBackend/CPPTargetMachine.h44
-rw-r--r--lib/Target/CppBackend/Makefile16
-rw-r--r--lib/Target/CppBackend/TargetInfo/CMakeLists.txt3
-rw-r--r--lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp29
-rw-r--r--lib/Target/CppBackend/TargetInfo/Makefile15
-rw-r--r--lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp408
-rw-r--r--lib/Target/Hexagon/AsmParser/Makefile15
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp220
-rw-r--r--lib/Target/Hexagon/BitTracker.h10
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt6
-rw-r--r--lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp7
-rw-r--r--lib/Target/Hexagon/Disassembler/Makefile16
-rw-r--r--lib/Target/Hexagon/Hexagon.td27
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp51
-rw-r--r--lib/Target/Hexagon/HexagonBitSimplify.cpp50
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp75
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.h8
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.cpp483
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.h239
-rw-r--r--lib/Target/Hexagon/HexagonBranchRelaxation.cpp211
-rw-r--r--lib/Target/Hexagon/HexagonCFGOptimizer.cpp48
-rw-r--r--lib/Target/Hexagon/HexagonCommonGEP.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp278
-rw-r--r--lib/Target/Hexagon/HexagonEarlyIfConv.cpp19
-rw-r--r--lib/Target/Hexagon/HexagonExpandCondsets.cpp1062
-rw-r--r--lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp357
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp20
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp1269
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h61
-rw-r--r--lib/Target/Hexagon/HexagonGenExtract.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp13
-rw-r--r--lib/Target/Hexagon/HexagonGenMux.cpp21
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp28
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp34
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp1237
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp458
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h54
-rw-r--r--lib/Target/Hexagon/HexagonInstrAlias.td192
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td14
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV4.td7
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp1067
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h62
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.td249
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV3.td25
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV4.td707
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV60.td116
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoVector.td55
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsics.td511
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV4.td193
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV5.td24
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV60.td4
-rw-r--r--lib/Target/Hexagon/HexagonMCInstLower.cpp66
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.h19
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp406
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h20
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp150
-rw-r--r--lib/Target/Hexagon/HexagonOperands.td72
-rw-r--r--lib/Target/Hexagon/HexagonOptAddrMode.cpp663
-rw-r--r--lib/Target/Hexagon/HexagonOptimizeSZextends.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp101
-rw-r--r--lib/Target/Hexagon/HexagonRDF.h4
-rw-r--r--lib/Target/Hexagon/HexagonRDFOpt.cpp88
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp108
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h18
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td51
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV4.td13
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV55.td186
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV60.td11
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp13
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.h16
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp63
-rw-r--r--lib/Target/Hexagon/HexagonSplitDouble.cpp24
-rw-r--r--lib/Target/Hexagon/HexagonStoreWidening.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp244
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h29
-rw-r--r--lib/Target/Hexagon/HexagonSystemInst.td21
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp54
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.h2
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp393
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.h37
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp172
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.h13
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp638
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h22
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp60
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp12
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp6
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h5
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp451
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp6
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp19
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp55
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h18
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp182
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h14
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp70
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/Hexagon/Makefile26
-rw-r--r--lib/Target/Hexagon/RDFCopy.cpp217
-rw-r--r--lib/Target/Hexagon/RDFCopy.h12
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.cpp50
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.h12
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp55
-rw-r--r--lib/Target/Hexagon/RDFGraph.h33
-rw-r--r--lib/Target/Hexagon/RDFLiveness.cpp100
-rw-r--r--lib/Target/Hexagon/RDFLiveness.h8
-rw-r--r--lib/Target/Hexagon/TargetInfo/Makefile15
-rw-r--r--lib/Target/LLVMBuild.txt2
-rw-r--r--lib/Target/Lanai/AsmParser/CMakeLists.txt7
-rw-r--r--lib/Target/Lanai/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp1213
-rw-r--r--lib/Target/Lanai/CMakeLists.txt35
-rw-r--r--lib/Target/Lanai/Disassembler/CMakeLists.txt3
-rw-r--r--lib/Target/Lanai/Disassembler/LLVMBuild.txt23
-rw-r--r--lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp240
-rw-r--r--lib/Target/Lanai/Disassembler/LanaiDisassembler.h41
-rw-r--r--lib/Target/Lanai/InstPrinter/CMakeLists.txt3
-rw-r--r--lib/Target/Lanai/InstPrinter/LLVMBuild.txt23
-rw-r--r--lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp305
-rw-r--r--lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h65
-rw-r--r--lib/Target/Lanai/LLVMBuild.txt45
-rw-r--r--lib/Target/Lanai/Lanai.h51
-rw-r--r--lib/Target/Lanai/Lanai.td47
-rw-r--r--lib/Target/Lanai/LanaiAluCode.h148
-rw-r--r--lib/Target/Lanai/LanaiAsmPrinter.cpp243
-rw-r--r--lib/Target/Lanai/LanaiCallingConv.td50
-rw-r--r--lib/Target/Lanai/LanaiCondCode.h100
-rw-r--r--lib/Target/Lanai/LanaiDelaySlotFiller.cpp263
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.cpp220
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.h57
-rw-r--r--lib/Target/Lanai/LanaiISelDAGToDAG.cpp317
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.cpp1437
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.h148
-rw-r--r--lib/Target/Lanai/LanaiInstrFormats.td561
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.cpp803
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.h184
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.td892
-rw-r--r--lib/Target/Lanai/LanaiMCInstLower.cpp140
-rw-r--r--lib/Target/Lanai/LanaiMCInstLower.h48
-rw-r--r--lib/Target/Lanai/LanaiMachineFunctionInfo.cpp23
-rw-r--r--lib/Target/Lanai/LanaiMachineFunctionInfo.h58
-rw-r--r--lib/Target/Lanai/LanaiMemAluCombiner.cpp422
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.cpp287
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.h63
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.td64
-rw-r--r--lib/Target/Lanai/LanaiSchedule.td70
-rw-r--r--lib/Target/Lanai/LanaiSelectionDAGInfo.cpp35
-rw-r--r--lib/Target/Lanai/LanaiSelectionDAGInfo.h36
-rw-r--r--lib/Target/Lanai/LanaiSubtarget.cpp47
-rw-r--r--lib/Target/Lanai/LanaiSubtarget.h76
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.cpp112
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.h55
-rw-r--r--lib/Target/Lanai/LanaiTargetObjectFile.cpp123
-rw-r--r--lib/Target/Lanai/LanaiTargetObjectFile.h46
-rw-r--r--lib/Target/Lanai/LanaiTargetTransformInfo.h86
-rw-r--r--lib/Target/Lanai/MCTargetDesc/CMakeLists.txt8
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt23
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp172
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h119
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp95
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h43
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp43
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h31
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp326
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp60
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h56
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp149
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h59
-rw-r--r--lib/Target/Lanai/TargetInfo/CMakeLists.txt3
-rw-r--r--lib/Target/Lanai/TargetInfo/LLVMBuild.txt (renamed from lib/Target/CppBackend/TargetInfo/LLVMBuild.txt)8
-rw-r--r--lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp20
-rw-r--r--lib/Target/MSP430/InstPrinter/Makefile15
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp14
-rw-r--r--lib/Target/MSP430/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/MSP430/MSP430BranchSelector.cpp25
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.cpp40
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h6
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp134
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp126
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h31
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp45
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h16
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.h6
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp12
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.h2
-rw-r--r--lib/Target/MSP430/Makefile23
-rw-r--r--lib/Target/MSP430/TargetInfo/Makefile15
-rw-r--r--lib/Target/Makefile20
-rw-r--r--lib/Target/Mips/AsmParser/Makefile15
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp1724
-rw-r--r--lib/Target/Mips/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/Disassembler/Makefile16
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp376
-rw-r--r--lib/Target/Mips/InstPrinter/Makefile16
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp96
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.h4
-rw-r--r--lib/Target/Mips/MCTargetDesc/Makefile17
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp22
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h17
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp47
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.h9
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp77
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h5
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp614
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h19
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp5
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp415
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h27
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp295
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.h66
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp55
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp378
-rw-r--r--lib/Target/Mips/Makefile25
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrFormats.td238
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrInfo.td717
-rw-r--r--lib/Target/Mips/MicroMips64r6InstrFormats.td141
-rw-r--r--lib/Target/Mips/MicroMips64r6InstrInfo.td404
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrFormats.td58
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrInfo.td79
-rw-r--r--lib/Target/Mips/MicroMipsInstrFPU.td39
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td9
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td360
-rw-r--r--lib/Target/Mips/Mips.h3
-rw-r--r--lib/Target/Mips/Mips.td5
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp1
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp28
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp182
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.h26
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp137
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.h28
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp25
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h16
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td85
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp3
-rw-r--r--lib/Target/Mips/Mips32r6InstrFormats.td2
-rw-r--r--lib/Target/Mips/Mips32r6InstrInfo.td343
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td475
-rw-r--r--lib/Target/Mips/Mips64r6InstrInfo.td98
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp65
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.h2
-rw-r--r--lib/Target/Mips/MipsCCState.cpp4
-rw-r--r--lib/Target/Mips/MipsCallingConv.td48
-rw-r--r--lib/Target/Mips/MipsCondMov.td12
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp114
-rw-r--r--lib/Target/Mips/MipsDSPInstrFormats.td4
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td69
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp144
-rw-r--r--lib/Target/Mips/MipsEVAInstrInfo.td73
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp39
-rw-r--r--lib/Target/Mips/MipsFrameLowering.cpp16
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h2
-rw-r--r--lib/Target/Mips/MipsHazardSchedule.cpp147
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp53
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h25
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp419
-rw-r--r--lib/Target/Mips/MipsISelLowering.h96
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td130
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td7
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp236
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h26
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td1036
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp72
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp155
-rw-r--r--lib/Target/Mips/MipsMCInstLower.h8
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td220
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp16
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h15
-rw-r--r--lib/Target/Mips/MipsOs16.cpp26
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp20
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h22
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td13
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp12
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp141
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h34
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp256
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.h24
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp94
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h13
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp54
-rw-r--r--lib/Target/Mips/MipsSchedule.td226
-rw-r--r--lib/Target/Mips/MipsScheduleP5600.td2
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp9
-rw-r--r--lib/Target/Mips/MipsSubtarget.h14
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp34
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h13
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp16
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.h3
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h77
-rw-r--r--lib/Target/Mips/TargetInfo/Makefile15
-rw-r--r--lib/Target/NVPTX/CMakeLists.txt2
-rw-r--r--lib/Target/NVPTX/InstPrinter/Makefile15
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp7
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp16
-rw-r--r--lib/Target/NVPTX/Makefile23
-rw-r--r--lib/Target/NVPTX/NVPTX.h8
-rw-r--r--lib/Target/NVPTX/NVPTX.td14
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp52
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h6
-rw-r--r--lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.h2
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp16
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp528
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.h42
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp126
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h20
-rw-r--r--lib/Target/NVPTX/NVPTXImageOptimizer.cpp3
-rw-r--r--lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp586
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp61
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h19
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td3025
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td349
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAlloca.cpp3
-rw-r--r--lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXMCExpr.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXMCExpr.h9
-rw-r--r--lib/Target/NVPTX/NVPTXPeephole.cpp3
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp9
-rw-r--r--lib/Target/NVPTX/NVPTXSection.h1
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h6
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp129
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h11
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h4
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.h4
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.h5
-rw-r--r--lib/Target/NVPTX/NVVMIntrRange.cpp148
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp178
-rw-r--r--lib/Target/NVPTX/TargetInfo/Makefile15
-rw-r--r--lib/Target/PowerPC/AsmParser/Makefile15
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp66
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt3
-rw-r--r--lib/Target/PowerPC/Disassembler/Makefile16
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp17
-rw-r--r--lib/Target/PowerPC/InstPrinter/Makefile16
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp25
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp18
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp23
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp18
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp2
-rw-r--r--lib/Target/PowerPC/Makefile24
-rw-r--r--lib/Target/PowerPC/PPC.h22
-rw-r--r--lib/Target/PowerPC/PPC.td71
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp237
-rw-r--r--lib/Target/PowerPC/PPCBoolRetToInt.cpp13
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp15
-rw-r--r--lib/Target/PowerPC/PPCCCState.cpp36
-rw-r--r--lib/Target/PowerPC/PPCCCState.h42
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp39
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td25
-rw-r--r--lib/Target/PowerPC/PPCEarlyReturn.cpp20
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp71
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp131
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h13
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp7
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp452
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp2122
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h273
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td61
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td184
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td258
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp410
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h65
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td112
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td516
-rw-r--r--lib/Target/PowerPC/PPCLoopPreIncPrep.cpp3
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp42
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp2
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.h10
-rw-r--r--lib/Target/PowerPC/PPCQPXLoadSplat.cpp166
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp38
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h1
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td7
-rw-r--r--lib/Target/PowerPC/PPCSchedule440.td3
-rw-r--r--lib/Target/PowerPC/PPCScheduleA2.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td15
-rw-r--r--lib/Target/PowerPC/PPCScheduleE5500.td19
-rw-r--r--lib/Target/PowerPC/PPCScheduleG5.td17
-rw-r--r--lib/Target/PowerPC/PPCScheduleP7.td3
-rw-r--r--lib/Target/PowerPC/PPCScheduleP8.td3
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp34
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h29
-rw-r--r--lib/Target/PowerPC/PPCTLSDynamicCall.cpp5
-rw-r--r--lib/Target/PowerPC/PPCTOCRegDeps.cpp5
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp63
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h15
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.cpp2
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp30
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h2
-rw-r--r--lib/Target/PowerPC/PPCVSXCopy.cpp5
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp68
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp43
-rw-r--r--lib/Target/PowerPC/README.txt11
-rw-r--r--lib/Target/PowerPC/README_P9.txt605
-rw-r--r--lib/Target/PowerPC/TargetInfo/Makefile15
-rw-r--r--lib/Target/PowerPC/p9-instrs.txt442
-rw-r--r--lib/Target/README.txt2
-rw-r--r--lib/Target/Sparc/AsmParser/Makefile15
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp99
-rw-r--r--lib/Target/Sparc/CMakeLists.txt1
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp39
-rw-r--r--lib/Target/Sparc/Disassembler/Makefile16
-rw-r--r--lib/Target/Sparc/Disassembler/SparcDisassembler.cpp121
-rw-r--r--lib/Target/Sparc/InstPrinter/Makefile16
-rw-r--r--lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp23
-rwxr-xr-xlib/Target/Sparc/LeonFeatures.td91
-rwxr-xr-xlib/Target/Sparc/LeonPasses.cpp933
-rwxr-xr-xlib/Target/Sparc/LeonPasses.h199
-rw-r--r--lib/Target/Sparc/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp3
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp1
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp35
-rw-r--r--lib/Target/Sparc/Makefile24
-rw-r--r--lib/Target/Sparc/README.txt1
-rw-r--r--lib/Target/Sparc/Sparc.h35
-rw-r--r--lib/Target/Sparc/Sparc.td140
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp8
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp8
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h2
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp36
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp935
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h107
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td43
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td117
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td122
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp107
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h18
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td509
-rw-r--r--lib/Target/Sparc/SparcMCInstLower.cpp1
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp10
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h3
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td78
-rwxr-xr-xlib/Target/Sparc/SparcSchedule.td124
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp25
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h50
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp104
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.h21
-rw-r--r--lib/Target/Sparc/TargetInfo/Makefile15
-rw-r--r--lib/Target/SystemZ/AsmParser/Makefile16
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp5
-rw-r--r--lib/Target/SystemZ/CMakeLists.txt1
-rw-r--r--lib/Target/SystemZ/Disassembler/Makefile16
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp57
-rw-r--r--lib/Target/SystemZ/InstPrinter/Makefile16
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp3
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp22
-rw-r--r--lib/Target/SystemZ/Makefile28
-rw-r--r--lib/Target/SystemZ/README.txt30
-rw-r--r--lib/Target/SystemZ/SystemZ.h41
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp200
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.cpp4
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.h50
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.td15
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp214
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp26
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h7
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp266
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp1043
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h97
-rw-r--r--lib/Target/SystemZ/SystemZInstrBuilder.h2
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td11
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td206
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp607
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h85
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td356
-rw-r--r--lib/Target/SystemZ/SystemZLDCleanup.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp19
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h9
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td8
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td29
-rw-r--r--lib/Target/SystemZ/SystemZProcessors.td8
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp16
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h9
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp102
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.h36
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp34
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp14
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h9
-rw-r--r--lib/Target/SystemZ/SystemZTDC.cpp382
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp20
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h2
-rw-r--r--lib/Target/SystemZ/TargetInfo/Makefile15
-rw-r--r--lib/Target/Target.cpp22
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp23
-rw-r--r--lib/Target/TargetMachine.cpp117
-rw-r--r--lib/Target/TargetMachineC.cpp23
-rw-r--r--lib/Target/TargetRecip.cpp10
-rw-r--r--lib/Target/TargetSubtargetInfo.cpp2
-rw-r--r--lib/Target/WebAssembly/CMakeLists.txt7
-rw-r--r--lib/Target/WebAssembly/Disassembler/Makefile16
-rw-r--r--lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp6
-rw-r--r--lib/Target/WebAssembly/InstPrinter/Makefile16
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp52
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h5
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp9
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp7
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp4
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp13
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp19
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h56
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp12
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h9
-rw-r--r--lib/Target/WebAssembly/Makefile26
-rw-r--r--lib/Target/WebAssembly/README.txt95
-rw-r--r--lib/Target/WebAssembly/Relooper.cpp984
-rw-r--r--lib/Target/WebAssembly/Relooper.h186
-rw-r--r--lib/Target/WebAssembly/TargetInfo/Makefile15
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp14
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp86
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp338
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp1108
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp296
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp206
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h21
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISD.def2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp18
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp361
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h25
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td34
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td16
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp64
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td41
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td35
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td608
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp21
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h40
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp105
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPEI.cpp1066
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp135
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp136
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp32
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp722
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp73
-rw-r--r--lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp97
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h6
-rw-r--r--lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp114
-rw-r--r--lib/Target/WebAssembly/WebAssemblyStoreResults.cpp158
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.cpp12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp104
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.h4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp56
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h10
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt248
-rw-r--r--lib/Target/X86/AsmParser/Makefile15
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp4
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp366
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParserCommon.h2
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h61
-rw-r--r--lib/Target/X86/CMakeLists.txt7
-rw-r--r--lib/Target/X86/Disassembler/Makefile18
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp108
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.h112
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp24
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h7
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h10
-rw-r--r--lib/Target/X86/InstPrinter/Makefile15
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp23
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp837
-rw-r--r--lib/Target/X86/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/X86/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp114
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h31
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp75
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp141
-rw-r--r--lib/Target/X86/MCTargetDesc/X86FixupKinds.h6
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp618
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp101
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h10
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp119
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp8
-rw-r--r--lib/Target/X86/Makefile23
-rw-r--r--lib/Target/X86/README-X86-64.txt2
-rw-r--r--lib/Target/X86/README.txt6
-rw-r--r--lib/Target/X86/TargetInfo/Makefile16
-rw-r--r--lib/Target/X86/Utils/Makefile15
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp162
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h94
-rw-r--r--lib/Target/X86/X86.h15
-rw-r--r--lib/Target/X86/X86.td395
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp81
-rw-r--r--lib/Target/X86/X86AsmPrinter.h59
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp190
-rw-r--r--lib/Target/X86/X86CallingConv.td60
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp82
-rw-r--r--lib/Target/X86/X86FastISel.cpp332
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp371
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp124
-rw-r--r--lib/Target/X86/X86FixupSetCC.cpp186
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp410
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp529
-rw-r--r--lib/Target/X86/X86FrameLowering.h53
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp444
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp12158
-rw-r--r--lib/Target/X86/X86ISelLowering.h306
-rw-r--r--lib/Target/X86/X86InstrAVX512.td3106
-rw-r--r--lib/Target/X86/X86InstrBuilder.h30
-rw-r--r--lib/Target/X86/X86InstrCompiler.td216
-rw-r--r--lib/Target/X86/X86InstrControl.td12
-rw-r--r--lib/Target/X86/X86InstrFPStack.td10
-rw-r--r--lib/Target/X86/X86InstrFormats.td2
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td244
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp2229
-rw-r--r--lib/Target/X86/X86InstrInfo.h268
-rw-r--r--lib/Target/X86/X86InstrInfo.td367
-rw-r--r--lib/Target/X86/X86InstrMMX.td2
-rw-r--r--lib/Target/X86/X86InstrMPX.td4
-rw-r--r--lib/Target/X86/X86InstrSSE.td1473
-rw-r--r--lib/Target/X86/X86InstrSystem.td40
-rw-r--r--lib/Target/X86/X86InstrVMX.td6
-rw-r--r--lib/Target/X86/X86InstrXOP.td214
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h928
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp635
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h13
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp480
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp14
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp52
-rw-r--r--lib/Target/X86/X86RegisterInfo.td63
-rw-r--r--lib/Target/X86/X86Schedule.td15
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td1
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp17
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h26
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.cpp218
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.h15
-rw-r--r--lib/Target/X86/X86Subtarget.cpp175
-rw-r--r--lib/Target/X86/X86Subtarget.h97
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp94
-rw-r--r--lib/Target/X86/X86TargetMachine.h5
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp75
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h13
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp405
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h5
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp88
-rw-r--r--lib/Target/X86/X86WinAllocaExpander.cpp294
-rw-r--r--lib/Target/X86/X86WinEHState.cpp464
-rw-r--r--lib/Target/XCore/Disassembler/Makefile16
-rw-r--r--lib/Target/XCore/Disassembler/XCoreDisassembler.cpp2
-rw-r--r--lib/Target/XCore/InstPrinter/Makefile16
-rw-r--r--lib/Target/XCore/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp18
-rw-r--r--lib/Target/XCore/Makefile23
-rw-r--r--lib/Target/XCore/TargetInfo/Makefile16
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp3
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp37
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.h4
-rw-r--r--lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp5
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp77
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp242
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h39
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp68
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h13
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp12
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.cpp13
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.h21
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp14
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h2
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.cpp8
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.h3
-rw-r--r--lib/Transforms/Hello/CMakeLists.txt2
-rw-r--r--lib/Transforms/Hello/Makefile24
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp170
-rw-r--r--lib/Transforms/IPO/CMakeLists.txt3
-rw-r--r--lib/Transforms/IPO/ConstantMerge.cpp76
-rw-r--r--lib/Transforms/IPO/CrossDSOCFI.cpp118
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp207
-rw-r--r--lib/Transforms/IPO/ElimAvailExtern.cpp62
-rw-r--r--lib/Transforms/IPO/ExtractGV.cpp17
-rw-r--r--lib/Transforms/IPO/ForceFunctionAttrs.cpp3
-rw-r--r--lib/Transforms/IPO/FunctionAttrs.cpp351
-rw-r--r--lib/Transforms/IPO/FunctionImport.cpp871
-rw-r--r--lib/Transforms/IPO/GlobalDCE.cpp108
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp969
-rw-r--r--lib/Transforms/IPO/IPConstantPropagation.cpp75
-rw-r--r--lib/Transforms/IPO/IPO.cpp32
-rw-r--r--lib/Transforms/IPO/InferFunctionAttrs.cpp928
-rw-r--r--lib/Transforms/IPO/InlineAlways.cpp11
-rw-r--r--lib/Transforms/IPO/InlineSimple.cpp29
-rw-r--r--lib/Transforms/IPO/Inliner.cpp253
-rw-r--r--lib/Transforms/IPO/Internalize.cpp225
-rw-r--r--lib/Transforms/IPO/LLVMBuild.txt2
-rw-r--r--lib/Transforms/IPO/LoopExtractor.cpp29
-rw-r--r--lib/Transforms/IPO/LowerTypeTests.cpp (renamed from lib/Transforms/IPO/LowerBitSets.cpp)419
-rw-r--r--lib/Transforms/IPO/Makefile15
-rw-r--r--lib/Transforms/IPO/MergeFunctions.cpp184
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp74
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp430
-rw-r--r--lib/Transforms/IPO/PruneEH.cpp51
-rw-r--r--lib/Transforms/IPO/SampleProfile.cpp289
-rw-r--r--lib/Transforms/IPO/StripDeadPrototypes.cpp6
-rw-r--r--lib/Transforms/IPO/StripSymbols.cpp46
-rw-r--r--lib/Transforms/IPO/WholeProgramDevirt.cpp843
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp93
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp791
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp1515
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp46
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp983
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h32
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp283
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp165
-rw-r--r--lib/Transforms/InstCombine/InstCombinePHI.cpp101
-rw-r--r--lib/Transforms/InstCombine/InstCombineSelect.cpp379
-rw-r--r--lib/Transforms/InstCombine/InstCombineShifts.cpp151
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp146
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp84
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp381
-rw-r--r--lib/Transforms/InstCombine/Makefile15
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp384
-rw-r--r--lib/Transforms/Instrumentation/BoundsChecking.cpp7
-rw-r--r--lib/Transforms/Instrumentation/CFGMST.h1
-rw-r--r--lib/Transforms/Instrumentation/CMakeLists.txt3
-rw-r--r--lib/Transforms/Instrumentation/DataFlowSanitizer.cpp34
-rw-r--r--lib/Transforms/Instrumentation/EfficiencySanitizer.cpp901
-rw-r--r--lib/Transforms/Instrumentation/GCOVProfiling.cpp286
-rw-r--r--lib/Transforms/Instrumentation/IndirectCallPromotion.cpp661
-rw-r--r--lib/Transforms/Instrumentation/InstrProfiling.cpp407
-rw-r--r--lib/Transforms/Instrumentation/Instrumentation.cpp11
-rw-r--r--lib/Transforms/Instrumentation/Makefile15
-rw-r--r--lib/Transforms/Instrumentation/MemorySanitizer.cpp405
-rw-r--r--lib/Transforms/Instrumentation/PGOInstrumentation.cpp409
-rw-r--r--lib/Transforms/Instrumentation/SanitizerCoverage.cpp253
-rw-r--r--lib/Transforms/Instrumentation/ThreadSanitizer.cpp119
-rw-r--r--lib/Transforms/Makefile20
-rw-r--r--lib/Transforms/ObjCARC/BlotMapVector.h2
-rw-r--r--lib/Transforms/ObjCARC/DependencyAnalysis.h1
-rw-r--r--lib/Transforms/ObjCARC/Makefile15
-rw-r--r--lib/Transforms/ObjCARC/ObjCARC.cpp1
-rw-r--r--lib/Transforms/ObjCARC/ObjCARC.h1
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCAPElim.cpp5
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCContract.cpp39
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCExpand.cpp1
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCOpts.cpp21
-rw-r--r--lib/Transforms/Scalar/ADCE.cpp98
-rw-r--r--lib/Transforms/Scalar/AlignmentFromAssumptions.cpp60
-rw-r--r--lib/Transforms/Scalar/BDCE.cpp76
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt7
-rw-r--r--lib/Transforms/Scalar/ConstantHoisting.cpp302
-rw-r--r--lib/Transforms/Scalar/ConstantProp.cpp9
-rw-r--r--lib/Transforms/Scalar/CorrelatedValuePropagation.cpp175
-rw-r--r--lib/Transforms/Scalar/DCE.cpp70
-rw-r--r--lib/Transforms/Scalar/DeadStoreElimination.cpp840
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp140
-rw-r--r--lib/Transforms/Scalar/Float2Int.cpp90
-rw-r--r--lib/Transforms/Scalar/GVN.cpp1285
-rw-r--r--lib/Transforms/Scalar/GVNHoist.cpp825
-rw-r--r--lib/Transforms/Scalar/GuardWidening.cpp691
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp313
-rw-r--r--lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp267
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp273
-rw-r--r--lib/Transforms/Scalar/LICM.cpp773
-rw-r--r--lib/Transforms/Scalar/LoadCombine.cpp73
-rw-r--r--lib/Transforms/Scalar/LoopDataPrefetch.cpp (renamed from lib/Target/PowerPC/PPCLoopDataPrefetch.cpp)135
-rw-r--r--lib/Transforms/Scalar/LoopDeletion.cpp181
-rw-r--r--lib/Transforms/Scalar/LoopDistribute.cpp398
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp428
-rw-r--r--lib/Transforms/Scalar/LoopInstSimplify.cpp167
-rw-r--r--lib/Transforms/Scalar/LoopInterchange.cpp75
-rw-r--r--lib/Transforms/Scalar/LoopLoadElimination.cpp75
-rw-r--r--lib/Transforms/Scalar/LoopRerollPass.cpp407
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp229
-rw-r--r--lib/Transforms/Scalar/LoopSimplifyCFG.cpp114
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp98
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp992
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp125
-rw-r--r--lib/Transforms/Scalar/LoopVersioningLICM.cpp571
-rw-r--r--lib/Transforms/Scalar/LowerAtomic.cpp96
-rw-r--r--lib/Transforms/Scalar/LowerExpectIntrinsic.cpp27
-rw-r--r--lib/Transforms/Scalar/LowerGuardIntrinsic.cpp123
-rw-r--r--lib/Transforms/Scalar/Makefile15
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp301
-rw-r--r--lib/Transforms/Scalar/MergedLoadStoreMotion.cpp292
-rw-r--r--lib/Transforms/Scalar/NaryReassociate.cpp49
-rw-r--r--lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp179
-rw-r--r--lib/Transforms/Scalar/PlaceSafepoints.cpp471
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp370
-rw-r--r--lib/Transforms/Scalar/Reg2Mem.cpp26
-rw-r--r--lib/Transforms/Scalar/RewriteStatepointsForGC.cpp1035
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp601
-rw-r--r--lib/Transforms/Scalar/SROA.cpp123
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp68
-rw-r--r--lib/Transforms/Scalar/ScalarReplAggregates.cpp2630
-rw-r--r--lib/Transforms/Scalar/Scalarizer.cpp30
-rw-r--r--lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp24
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp40
-rw-r--r--lib/Transforms/Scalar/Sink.cpp250
-rw-r--r--lib/Transforms/Scalar/SpeculativeExecution.cpp52
-rw-r--r--lib/Transforms/Scalar/StraightLineStrengthReduce.cpp79
-rw-r--r--lib/Transforms/Scalar/StructurizeCFG.cpp120
-rw-r--r--lib/Transforms/Scalar/TailRecursionElimination.cpp301
-rw-r--r--lib/Transforms/Utils/ASanStackFrameLayout.cpp2
-rw-r--r--lib/Transforms/Utils/AddDiscriminators.cpp80
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp169
-rw-r--r--lib/Transforms/Utils/BreakCriticalEdges.cpp31
-rw-r--r--lib/Transforms/Utils/BuildLibCalls.cpp964
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt5
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp126
-rw-r--r--lib/Transforms/Utils/CloneModule.cpp52
-rw-r--r--lib/Transforms/Utils/CodeExtractor.cpp113
-rw-r--r--lib/Transforms/Utils/Evaluator.cpp596
-rw-r--r--lib/Transforms/Utils/FunctionImportUtils.cpp243
-rw-r--r--lib/Transforms/Utils/GlobalStatus.cpp14
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp190
-rw-r--r--lib/Transforms/Utils/InstructionNamer.cpp14
-rw-r--r--lib/Transforms/Utils/IntegerDivision.cpp9
-rw-r--r--lib/Transforms/Utils/LCSSA.cpp320
-rw-r--r--lib/Transforms/Utils/Local.cpp368
-rw-r--r--lib/Transforms/Utils/LoopSimplify.cpp88
-rw-r--r--lib/Transforms/Utils/LoopUnroll.cpp229
-rw-r--r--lib/Transforms/Utils/LoopUnrollRuntime.cpp497
-rw-r--r--lib/Transforms/Utils/LoopUtils.cpp310
-rw-r--r--lib/Transforms/Utils/LoopVersioning.cpp202
-rw-r--r--lib/Transforms/Utils/LowerInvoke.cpp11
-rw-r--r--lib/Transforms/Utils/LowerSwitch.cpp10
-rw-r--r--lib/Transforms/Utils/Makefile15
-rw-r--r--lib/Transforms/Utils/Mem2Reg.cpp108
-rw-r--r--lib/Transforms/Utils/MemorySSA.cpp1361
-rw-r--r--lib/Transforms/Utils/ModuleUtils.cpp57
-rw-r--r--lib/Transforms/Utils/NameAnonFunctions.cpp102
-rw-r--r--lib/Transforms/Utils/PromoteMemoryToRegister.cpp5
-rw-r--r--lib/Transforms/Utils/SanitizerStats.cpp108
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp1741
-rw-r--r--lib/Transforms/Utils/SimplifyIndVar.cpp230
-rw-r--r--lib/Transforms/Utils/SimplifyInstructions.cpp109
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp799
-rw-r--r--lib/Transforms/Utils/SplitModule.cpp202
-rw-r--r--lib/Transforms/Utils/SymbolRewriter.cpp1
-rw-r--r--lib/Transforms/Utils/UnifyFunctionExitNodes.cpp9
-rw-r--r--lib/Transforms/Utils/Utils.cpp9
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp1223
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp30
-rw-r--r--lib/Transforms/Vectorize/CMakeLists.txt3
-rw-r--r--lib/Transforms/Vectorize/LoadStoreVectorizer.cpp999
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp3791
-rw-r--r--lib/Transforms/Vectorize/Makefile15
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp1364
-rw-r--r--lib/Transforms/Vectorize/Vectorize.cpp1
2009 files changed, 209152 insertions, 104859 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 35f2e97622fa..f931b6fc6523 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -27,7 +27,8 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CFLAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
@@ -52,18 +53,11 @@ using namespace llvm;
static cl::opt<bool> DisableBasicAA("disable-basicaa", cl::Hidden,
cl::init(false));
-AAResults::AAResults(AAResults &&Arg) : AAs(std::move(Arg.AAs)) {
+AAResults::AAResults(AAResults &&Arg) : TLI(Arg.TLI), AAs(std::move(Arg.AAs)) {
for (auto &AA : AAs)
AA->setAAResults(this);
}
-AAResults &AAResults::operator=(AAResults &&Arg) {
- AAs = std::move(Arg.AAs);
- for (auto &AA : AAs)
- AA->setAAResults(this);
- return *this;
-}
-
AAResults::~AAResults() {
// FIXME: It would be nice to at least clear out the pointers back to this
// aggregation here, but we end up with non-nesting lifetimes in the legacy
@@ -116,7 +110,10 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) {
// We may have two calls
if (auto CS = ImmutableCallSite(I)) {
// Check if the two calls modify the same memory
- return getModRefInfo(Call, CS);
+ return getModRefInfo(CS, Call);
+ } else if (I->isFenceLike()) {
+ // If this is a fence, just return MRI_ModRef.
+ return MRI_ModRef;
} else {
// Otherwise, check if the call modifies or references the
// location this memory access defines. The best we can say
@@ -141,6 +138,46 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
return Result;
}
+ // Try to refine the mod-ref info further using other API entry points to the
+ // aggregate set of AA results.
+ auto MRB = getModRefBehavior(CS);
+ if (MRB == FMRB_DoesNotAccessMemory)
+ return MRI_NoModRef;
+
+ if (onlyReadsMemory(MRB))
+ Result = ModRefInfo(Result & MRI_Ref);
+ else if (doesNotReadMemory(MRB))
+ Result = ModRefInfo(Result & MRI_Mod);
+
+ if (onlyAccessesArgPointees(MRB)) {
+ bool DoesAlias = false;
+ ModRefInfo AllArgsMask = MRI_NoModRef;
+ if (doesAccessArgPointees(MRB)) {
+ for (auto AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) {
+ const Value *Arg = *AI;
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned ArgIdx = std::distance(CS.arg_begin(), AI);
+ MemoryLocation ArgLoc = MemoryLocation::getForArgument(CS, ArgIdx, TLI);
+ AliasResult ArgAlias = alias(ArgLoc, Loc);
+ if (ArgAlias != NoAlias) {
+ ModRefInfo ArgMask = getArgModRefInfo(CS, ArgIdx);
+ DoesAlias = true;
+ AllArgsMask = ModRefInfo(AllArgsMask | ArgMask);
+ }
+ }
+ }
+ if (!DoesAlias)
+ return MRI_NoModRef;
+ Result = ModRefInfo(Result & AllArgsMask);
+ }
+
+ // If Loc is a constant memory location, the call definitely could not
+ // modify the memory location.
+ if ((Result & MRI_Mod) &&
+ pointsToConstantMemory(Loc, /*OrLocal*/ false))
+ Result = ModRefInfo(Result & ~MRI_Mod);
+
return Result;
}
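The loop added above narrows the aggregate answer when the callee is known to access memory only through its pointer arguments: the result is intersected with the union of the per-argument masks of every argument location that may alias Loc, and collapses to MRI_NoModRef when no argument aliases Loc at all. The following is a minimal, illustrative sketch of how a client consumes that aggregated answer; it is not part of the patch, and it only reuses names (AAResults, ModRefInfo, the MRI_* enumerators) that appear in the hunks above.

#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

// Sketch only: query the aggregated mod/ref answer for a call site against a
// memory location and branch on the three interesting outcomes.
void queryExample(AAResults &AA, ImmutableCallSite CS, const MemoryLocation &Loc) {
  ModRefInfo MRI = AA.getModRefInfo(CS, Loc);
  if (MRI == MRI_NoModRef) {
    // The call provably neither reads nor writes Loc, so a load from Loc
    // may be reordered across it.
  } else if (MRI == MRI_Ref) {
    // The call may read Loc but is known not to write it.
  } else {
    // MRI_Mod or MRI_ModRef: treat the call as a potential clobber of Loc.
  }
}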
@@ -156,6 +193,90 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
return Result;
}
+ // Try to refine the mod-ref info further using other API entry points to the
+ // aggregate set of AA results.
+
+ // If CS1 or CS2 are readnone, they don't interact.
+ auto CS1B = getModRefBehavior(CS1);
+ if (CS1B == FMRB_DoesNotAccessMemory)
+ return MRI_NoModRef;
+
+ auto CS2B = getModRefBehavior(CS2);
+ if (CS2B == FMRB_DoesNotAccessMemory)
+ return MRI_NoModRef;
+
+ // If they both only read from memory, there is no dependence.
+ if (onlyReadsMemory(CS1B) && onlyReadsMemory(CS2B))
+ return MRI_NoModRef;
+
+ // If CS1 only reads memory, the only dependence on CS2 can be
+ // from CS1 reading memory written by CS2.
+ if (onlyReadsMemory(CS1B))
+ Result = ModRefInfo(Result & MRI_Ref);
+ else if (doesNotReadMemory(CS1B))
+ Result = ModRefInfo(Result & MRI_Mod);
+
+ // If CS2 only access memory through arguments, accumulate the mod/ref
+ // information from CS1's references to the memory referenced by
+ // CS2's arguments.
+ if (onlyAccessesArgPointees(CS2B)) {
+ ModRefInfo R = MRI_NoModRef;
+ if (doesAccessArgPointees(CS2B)) {
+ for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
+ const Value *Arg = *I;
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I);
+ auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, TLI);
+
+ // ArgMask indicates what CS2 might do to CS2ArgLoc, and the dependence
+ // of CS1 on that location is the inverse.
+ ModRefInfo ArgMask = getArgModRefInfo(CS2, CS2ArgIdx);
+ if (ArgMask == MRI_Mod)
+ ArgMask = MRI_ModRef;
+ else if (ArgMask == MRI_Ref)
+ ArgMask = MRI_Mod;
+
+ ArgMask = ModRefInfo(ArgMask & getModRefInfo(CS1, CS2ArgLoc));
+
+ R = ModRefInfo((R | ArgMask) & Result);
+ if (R == Result)
+ break;
+ }
+ }
+ return R;
+ }
+
+ // If CS1 only accesses memory through arguments, check if CS2 references
+ // any of the memory referenced by CS1's arguments. If not, return NoModRef.
+ if (onlyAccessesArgPointees(CS1B)) {
+ ModRefInfo R = MRI_NoModRef;
+ if (doesAccessArgPointees(CS1B)) {
+ for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
+ const Value *Arg = *I;
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I);
+ auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, TLI);
+
+ // ArgMask indicates what CS1 might do to CS1ArgLoc; if CS1 might Mod
+ // CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If CS1
+ // might Ref, then we care only about a Mod by CS2.
+ ModRefInfo ArgMask = getArgModRefInfo(CS1, CS1ArgIdx);
+ ModRefInfo ArgR = getModRefInfo(CS2, CS1ArgLoc);
+ if (((ArgMask & MRI_Mod) != MRI_NoModRef &&
+ (ArgR & MRI_ModRef) != MRI_NoModRef) ||
+ ((ArgMask & MRI_Ref) != MRI_NoModRef &&
+ (ArgR & MRI_Mod) != MRI_NoModRef))
+ R = ModRefInfo((R | ArgMask) & Result);
+
+ if (R == Result)
+ break;
+ }
+ }
+ return R;
+ }
+
return Result;
}
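The two loops added above are symmetric: each restricts the result to what can actually flow through the argument memory of whichever call site is known to be argument-only. The mask inversion at the top of the first loop is the subtle step, so here it is restated as a free-standing helper. This is an expository sketch, not code from the patch; it only reuses the MRI_* enumerators declared in AliasAnalysis.h.

#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

// Given what CS2 may do to one of its argument locations, compute the ways
// CS1 could depend on CS2 through that location (the inversion used in
// AAResults::getModRefInfo(CS1, CS2) above).
static ModRefInfo invertForCallerDependence(ModRefInfo WhatCS2Does) {
  if (WhatCS2Does == MRI_Mod)
    return MRI_ModRef; // CS1 may read or write a location CS2 writes.
  if (WhatCS2Does == MRI_Ref)
    return MRI_Mod;    // CS1 must write a location CS2 only reads.
  return WhatCS2Does;  // MRI_NoModRef and MRI_ModRef pass through unchanged.
}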
@@ -276,7 +397,7 @@ ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet,
ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,
const MemoryLocation &Loc) {
// Acquire/Release cmpxchg has properties that matter for arbitrary addresses.
- if (CX->getSuccessOrdering() > Monotonic)
+ if (isStrongerThanMonotonic(CX->getSuccessOrdering()))
return MRI_ModRef;
// If the cmpxchg address does not alias the location, it does not access it.
@@ -289,7 +410,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,
ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,
const MemoryLocation &Loc) {
// Acquire/Release atomicrmw has properties that matter for arbitrary addresses.
- if (RMW->getOrdering() > Monotonic)
+ if (isStrongerThanMonotonic(RMW->getOrdering()))
return MRI_ModRef;
// If the atomicrmw address does not alias the location, it does not access it.
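Both atomic hunks replace a raw comparison against the Monotonic enumerator with the isStrongerThanMonotonic predicate, so the check no longer depends on the numeric layout of the AtomicOrdering enumeration. The sketch below spells out which orderings the predicate accepts; it is an explanatory restatement, not the library implementation.

#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Orderings stronger than Monotonic are exactly the ones that constrain
// surrounding memory operations at arbitrary addresses.
static bool strongerThanMonotonicSketch(AtomicOrdering AO) {
  switch (AO) {
  case AtomicOrdering::Acquire:
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return true;
  default: // NotAtomic, Unordered, Monotonic
    return false;
  }
}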
@@ -332,7 +453,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
unsigned ArgNo = 0;
ModRefInfo R = MRI_NoModRef;
- for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+ for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
CI != CE; ++CI, ++ArgNo) {
// Only look at the no-capture or byval pointer arguments. If this
// pointer were passed to arguments that were neither of these, then it
@@ -390,6 +511,9 @@ bool AAResults::canInstructionRangeModRef(const Instruction &I1,
// Provide a definition for the root virtual destructor.
AAResults::Concept::~Concept() {}
+// Provide a definition for the static object used to identify passes.
+char AAManager::PassID;
+
namespace {
/// A wrapper pass for external alias analyses. This just squirrels away the
/// callback used to run any analyses and register their results.
@@ -432,7 +556,8 @@ char AAResultsWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(AAResultsWrapperPass, "aa",
"Function Alias Analysis Results", false, true)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(CFLAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CFLAndersAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CFLSteensAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ExternalAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
@@ -461,7 +586,8 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) {
// unregistering themselves with them. We need to carefully tear down the
// previous object first, in this case replacing it with an empty one, before
// registering new results.
- AAR.reset(new AAResults());
+ AAR.reset(
+ new AAResults(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()));
// BasicAA is always available for function analyses. Also, we add it first
// so that it can trump TBAA results when it proves MustAlias.
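With this change the aggregation object can no longer be default-constructed: a TargetLibraryInfo must be supplied so that getModRefInfo can build argument MemoryLocations for recognized library calls. Below is a minimal sketch of standalone construction under that new requirement; every variable name in it is hypothetical and not taken from the patch.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;

void buildExample(BasicAAResult &MyBasicAA) {
  TargetLibraryInfoImpl TLII;  // hosts the library-function database
  TargetLibraryInfo TLI(TLII); // view consumed by the analyses
  AAResults AAR(TLI);          // TLI is now a required constructor argument
  AAR.addAAResult(MyBasicAA);  // register individual AA results as before
}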
@@ -482,7 +608,9 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) {
AAR->addAAResult(WrapperPass->getResult());
if (auto *WrapperPass = getAnalysisIfAvailable<SCEVAAWrapperPass>())
AAR->addAAResult(WrapperPass->getResult());
- if (auto *WrapperPass = getAnalysisIfAvailable<CFLAAWrapperPass>())
+ if (auto *WrapperPass = getAnalysisIfAvailable<CFLAndersAAWrapperPass>())
+ AAR->addAAResult(WrapperPass->getResult());
+ if (auto *WrapperPass = getAnalysisIfAvailable<CFLSteensAAWrapperPass>())
AAR->addAAResult(WrapperPass->getResult());
// If available, run an external AA providing callback over the results as
@@ -498,6 +626,7 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) {
void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<BasicAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
// We also need to mark all the alias analysis passes we will potentially
// probe in runOnFunction as used here to ensure the legacy pass manager
@@ -508,12 +637,13 @@ void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addUsedIfAvailable<objcarc::ObjCARCAAWrapperPass>();
AU.addUsedIfAvailable<GlobalsAAWrapperPass>();
AU.addUsedIfAvailable<SCEVAAWrapperPass>();
- AU.addUsedIfAvailable<CFLAAWrapperPass>();
+ AU.addUsedIfAvailable<CFLAndersAAWrapperPass>();
+ AU.addUsedIfAvailable<CFLSteensAAWrapperPass>();
}
AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F,
BasicAAResult &BAR) {
- AAResults AAR;
+ AAResults AAR(P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI());
// Add in our explicitly constructed BasicAA results.
if (!DisableBasicAA)
@@ -530,38 +660,26 @@ AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F,
AAR.addAAResult(WrapperPass->getResult());
if (auto *WrapperPass = P.getAnalysisIfAvailable<GlobalsAAWrapperPass>())
AAR.addAAResult(WrapperPass->getResult());
- if (auto *WrapperPass = P.getAnalysisIfAvailable<SCEVAAWrapperPass>())
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<CFLAndersAAWrapperPass>())
AAR.addAAResult(WrapperPass->getResult());
- if (auto *WrapperPass = P.getAnalysisIfAvailable<CFLAAWrapperPass>())
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<CFLSteensAAWrapperPass>())
AAR.addAAResult(WrapperPass->getResult());
return AAR;
}
-/// isNoAliasCall - Return true if this pointer is returned by a noalias
-/// function.
bool llvm::isNoAliasCall(const Value *V) {
if (auto CS = ImmutableCallSite(V))
return CS.paramHasAttr(0, Attribute::NoAlias);
return false;
}
-/// isNoAliasArgument - Return true if this is an argument with the noalias
-/// attribute.
-bool llvm::isNoAliasArgument(const Value *V)
-{
+bool llvm::isNoAliasArgument(const Value *V) {
if (const Argument *A = dyn_cast<Argument>(V))
return A->hasNoAliasAttr();
return false;
}
-/// isIdentifiedObject - Return true if this pointer refers to a distinct and
-/// identifiable object. This returns true for:
-/// Global Variables and Functions (but not Global Aliases)
-/// Allocas and Mallocs
-/// ByVal and NoAlias Arguments
-/// NoAlias returns
-///
bool llvm::isIdentifiedObject(const Value *V) {
if (isa<AllocaInst>(V))
return true;
@@ -574,12 +692,19 @@ bool llvm::isIdentifiedObject(const Value *V) {
return false;
}
-/// isIdentifiedFunctionLocal - Return true if V is unambiguously identified
-/// at the function-level. Different IdentifiedFunctionLocals can't alias.
-/// Further, an IdentifiedFunctionLocal can not alias with any function
-/// arguments other than itself, which is not necessarily true for
-/// IdentifiedObjects.
-bool llvm::isIdentifiedFunctionLocal(const Value *V)
-{
+bool llvm::isIdentifiedFunctionLocal(const Value *V) {
return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
}
+
+void llvm::getAAResultsAnalysisUsage(AnalysisUsage &AU) {
+ // This function needs to be in sync with llvm::createLegacyPMAAResults -- if
+ // more alias analyses are added to llvm::createLegacyPMAAResults, they need
+ // to be added here also.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addUsedIfAvailable<ScopedNoAliasAAWrapperPass>();
+ AU.addUsedIfAvailable<TypeBasedAAWrapperPass>();
+ AU.addUsedIfAvailable<objcarc::ObjCARCAAWrapperPass>();
+ AU.addUsedIfAvailable<GlobalsAAWrapperPass>();
+ AU.addUsedIfAvailable<CFLAndersAAWrapperPass>();
+ AU.addUsedIfAvailable<CFLSteensAAWrapperPass>();
+}
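For context, a minimal sketch (not part of this patch; the pass name and body are illustrative) of how a legacy pass would pair the two helpers above: getAAResultsAnalysisUsage() declares the required/used analyses, and createLegacyPMAAResults() assembles the aggregated AAResults from whatever wrappers the pass manager actually scheduled.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct AAClientExample : public FunctionPass { // hypothetical client pass
  static char ID;
  AAClientExample() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<BasicAAWrapperPass>();
    // Marks TLI as required and the optional AA wrappers as used, in sync
    // with what createLegacyPMAAResults probes below.
    getAAResultsAnalysisUsage(AU);
  }

  bool runOnFunction(Function &F) override {
    BasicAAResult &BAR = getAnalysis<BasicAAWrapperPass>().getResult();
    AAResults AAR = createLegacyPMAAResults(*this, F, BAR);
    // AAR can now answer alias() and getModRefInfo() queries for F.
    return false;
  }
};
char AAClientExample::ID = 0;
}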
diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
index 12917b650e5e..baf8f3f881db 100644
--- a/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -6,18 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file implements a simple N^2 alias analysis accuracy evaluator.
-// Basically, for each function in the program, it simply queries to see how the
-// alias analysis implementation answers alias queries between each pair of
-// pointers in the function.
-//
-// This is inspired and adapted from code by: Naveen Neelakantam, Francesco
-// Spadini, and Wojciech Stryjewski.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysisEvaluator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Constants.h"
@@ -47,51 +37,9 @@ static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden);
static cl::opt<bool> EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden);
-namespace {
- class AAEval : public FunctionPass {
- unsigned NoAliasCount, MayAliasCount, PartialAliasCount, MustAliasCount;
- unsigned NoModRefCount, ModCount, RefCount, ModRefCount;
-
- public:
- static char ID; // Pass identification, replacement for typeid
- AAEval() : FunctionPass(ID) {
- initializeAAEvalPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- AU.setPreservesAll();
- }
-
- bool doInitialization(Module &M) override {
- NoAliasCount = MayAliasCount = PartialAliasCount = MustAliasCount = 0;
- NoModRefCount = ModCount = RefCount = ModRefCount = 0;
-
- if (PrintAll) {
- PrintNoAlias = PrintMayAlias = true;
- PrintPartialAlias = PrintMustAlias = true;
- PrintNoModRef = PrintMod = PrintRef = PrintModRef = true;
- }
- return false;
- }
-
- bool runOnFunction(Function &F) override;
- bool doFinalization(Module &M) override;
- };
-}
-
-char AAEval::ID = 0;
-INITIALIZE_PASS_BEGIN(AAEval, "aa-eval",
- "Exhaustive Alias Analysis Precision Evaluator", false, true)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(AAEval, "aa-eval",
- "Exhaustive Alias Analysis Precision Evaluator", false, true)
-
-FunctionPass *llvm::createAAEvalPass() { return new AAEval(); }
-
static void PrintResults(const char *Msg, bool P, const Value *V1,
const Value *V2, const Module *M) {
- if (P) {
+ if (PrintAll || P) {
std::string o1, o2;
{
raw_string_ostream os1(o1), os2(o2);
@@ -110,7 +58,7 @@ static void PrintResults(const char *Msg, bool P, const Value *V1,
static inline void
PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
Module *M) {
- if (P) {
+ if (PrintAll || P) {
errs() << " " << Msg << ": Ptr: ";
Ptr->printAsOperand(errs(), true, M);
errs() << "\t<->" << *I << '\n';
@@ -120,7 +68,7 @@ PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
static inline void
PrintModRefResults(const char *Msg, bool P, CallSite CSA, CallSite CSB,
Module *M) {
- if (P) {
+ if (PrintAll || P) {
errs() << " " << Msg << ": " << *CSA.getInstruction()
<< " <-> " << *CSB.getInstruction() << '\n';
}
@@ -129,7 +77,7 @@ PrintModRefResults(const char *Msg, bool P, CallSite CSA, CallSite CSB,
static inline void
PrintLoadStoreResults(const char *Msg, bool P, const Value *V1,
const Value *V2, const Module *M) {
- if (P) {
+ if (PrintAll || P) {
errs() << " " << Msg << ": " << *V1
<< " <-> " << *V2 << '\n';
}
@@ -140,9 +88,15 @@ static inline bool isInterestingPointer(Value *V) {
&& !isa<ConstantPointerNull>(V);
}
-bool AAEval::runOnFunction(Function &F) {
+PreservedAnalyses AAEvaluator::run(Function &F, AnalysisManager<Function> &AM) {
+ runInternal(F, AM.getResult<AAManager>(F));
+ return PreservedAnalyses::all();
+}
+
+void AAEvaluator::runInternal(Function &F, AAResults &AA) {
const DataLayout &DL = F.getParent()->getDataLayout();
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ ++FunctionCount;
SetVector<Value *> Pointers;
SmallSetVector<CallSite, 16> CallSites;
@@ -180,8 +134,8 @@ bool AAEval::runOnFunction(Function &F) {
}
}
- if (PrintNoAlias || PrintMayAlias || PrintPartialAlias || PrintMustAlias ||
- PrintNoModRef || PrintMod || PrintRef || PrintModRef)
+ if (PrintAll || PrintNoAlias || PrintMayAlias || PrintPartialAlias ||
+ PrintMustAlias || PrintNoModRef || PrintMod || PrintRef || PrintModRef)
errs() << "Function: " << F.getName() << ": " << Pointers.size()
<< " pointers, " << CallSites.size() << " call sites\n";
@@ -221,29 +175,27 @@ bool AAEval::runOnFunction(Function &F) {
if (EvalAAMD) {
// iterate over all pairs of load, store
- for (SetVector<Value *>::iterator I1 = Loads.begin(), E = Loads.end();
- I1 != E; ++I1) {
- for (SetVector<Value *>::iterator I2 = Stores.begin(), E2 = Stores.end();
- I2 != E2; ++I2) {
- switch (AA.alias(MemoryLocation::get(cast<LoadInst>(*I1)),
- MemoryLocation::get(cast<StoreInst>(*I2)))) {
+ for (Value *Load : Loads) {
+ for (Value *Store : Stores) {
+ switch (AA.alias(MemoryLocation::get(cast<LoadInst>(Load)),
+ MemoryLocation::get(cast<StoreInst>(Store)))) {
case NoAlias:
- PrintLoadStoreResults("NoAlias", PrintNoAlias, *I1, *I2,
+ PrintLoadStoreResults("NoAlias", PrintNoAlias, Load, Store,
F.getParent());
++NoAliasCount;
break;
case MayAlias:
- PrintLoadStoreResults("MayAlias", PrintMayAlias, *I1, *I2,
+ PrintLoadStoreResults("MayAlias", PrintMayAlias, Load, Store,
F.getParent());
++MayAliasCount;
break;
case PartialAlias:
- PrintLoadStoreResults("PartialAlias", PrintPartialAlias, *I1, *I2,
+ PrintLoadStoreResults("PartialAlias", PrintPartialAlias, Load, Store,
F.getParent());
++PartialAliasCount;
break;
case MustAlias:
- PrintLoadStoreResults("MustAlias", PrintMustAlias, *I1, *I2,
+ PrintLoadStoreResults("MustAlias", PrintMustAlias, Load, Store,
F.getParent());
++MustAliasCount;
break;
@@ -283,30 +235,31 @@ bool AAEval::runOnFunction(Function &F) {
}
// Mod/ref alias analysis: compare all pairs of calls and values
- for (auto C = CallSites.begin(), Ce = CallSites.end(); C != Ce; ++C) {
- Instruction *I = C->getInstruction();
+ for (CallSite C : CallSites) {
+ Instruction *I = C.getInstruction();
- for (SetVector<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end();
- V != Ve; ++V) {
+ for (auto Pointer : Pointers) {
uint64_t Size = MemoryLocation::UnknownSize;
- Type *ElTy = cast<PointerType>((*V)->getType())->getElementType();
+ Type *ElTy = cast<PointerType>(Pointer->getType())->getElementType();
if (ElTy->isSized()) Size = DL.getTypeStoreSize(ElTy);
- switch (AA.getModRefInfo(*C, *V, Size)) {
+ switch (AA.getModRefInfo(C, Pointer, Size)) {
case MRI_NoModRef:
- PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent());
+ PrintModRefResults("NoModRef", PrintNoModRef, I, Pointer,
+ F.getParent());
++NoModRefCount;
break;
case MRI_Mod:
- PrintModRefResults("Just Mod", PrintMod, I, *V, F.getParent());
+ PrintModRefResults("Just Mod", PrintMod, I, Pointer, F.getParent());
++ModCount;
break;
case MRI_Ref:
- PrintModRefResults("Just Ref", PrintRef, I, *V, F.getParent());
+ PrintModRefResults("Just Ref", PrintRef, I, Pointer, F.getParent());
++RefCount;
break;
case MRI_ModRef:
- PrintModRefResults("Both ModRef", PrintModRef, I, *V, F.getParent());
+ PrintModRefResults("Both ModRef", PrintModRef, I, Pointer,
+ F.getParent());
++ModRefCount;
break;
}
@@ -338,17 +291,18 @@ bool AAEval::runOnFunction(Function &F) {
}
}
}
-
- return false;
}
-static void PrintPercent(unsigned Num, unsigned Sum) {
- errs() << "(" << Num*100ULL/Sum << "."
- << ((Num*1000ULL/Sum) % 10) << "%)\n";
+static void PrintPercent(int64_t Num, int64_t Sum) {
+ errs() << "(" << Num * 100LL / Sum << "." << ((Num * 1000LL / Sum) % 10)
+ << "%)\n";
}
-bool AAEval::doFinalization(Module &M) {
- unsigned AliasSum =
+AAEvaluator::~AAEvaluator() {
+ if (FunctionCount == 0)
+ return;
+
+ int64_t AliasSum =
NoAliasCount + MayAliasCount + PartialAliasCount + MustAliasCount;
errs() << "===== Alias Analysis Evaluator Report =====\n";
if (AliasSum == 0) {
@@ -371,7 +325,7 @@ bool AAEval::doFinalization(Module &M) {
}
// Display the summary for mod/ref analysis
- unsigned ModRefSum = NoModRefCount + ModCount + RefCount + ModRefCount;
+ int64_t ModRefSum = NoModRefCount + ModCount + RefCount + ModRefCount;
if (ModRefSum == 0) {
errs() << " Alias Analysis Mod/Ref Evaluator Summary: no "
"mod/ref!\n";
@@ -390,6 +344,46 @@ bool AAEval::doFinalization(Module &M) {
<< ModCount * 100 / ModRefSum << "%/" << RefCount * 100 / ModRefSum
<< "%/" << ModRefCount * 100 / ModRefSum << "%\n";
}
+}
- return false;
+namespace llvm {
+class AAEvalLegacyPass : public FunctionPass {
+ std::unique_ptr<AAEvaluator> P;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ AAEvalLegacyPass() : FunctionPass(ID) {
+ initializeAAEvalLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M) override {
+ P.reset(new AAEvaluator());
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ P->runInternal(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
+ return false;
+ }
+ bool doFinalization(Module &M) override {
+ P.reset();
+ return false;
+ }
+};
}
+
+char AAEvalLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AAEvalLegacyPass, "aa-eval",
+ "Exhaustive Alias Analysis Precision Evaluator", false,
+ true)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(AAEvalLegacyPass, "aa-eval",
+ "Exhaustive Alias Analysis Precision Evaluator", false,
+ true)
+
+FunctionPass *llvm::createAAEvalPass() { return new AAEvalLegacyPass(); }
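A minimal usage sketch (illustrative, not part of the patch; the header that declares createAAEvalPass() is assumed): the evaluator can still be driven through the legacy pass manager, and the summary report is printed when the wrapped AAEvaluator is destroyed during doFinalization.

#include "llvm/Analysis/AliasAnalysisEvaluator.h" // assumed to declare createAAEvalPass()
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void evaluateAliasAnalysis(Module &M) {
  legacy::PassManager PM;
  PM.add(createAAEvalPass()); // wraps AAEvaluator; pulls in AAResultsWrapperPass
  PM.run(M);                  // per-function stats accumulate in the evaluator
  // The "Alias Analysis Evaluator Report" is emitted from ~AAEvaluator().
}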
diff --git a/lib/Analysis/AliasAnalysisSummary.cpp b/lib/Analysis/AliasAnalysisSummary.cpp
new file mode 100644
index 000000000000..f3f13df283db
--- /dev/null
+++ b/lib/Analysis/AliasAnalysisSummary.cpp
@@ -0,0 +1,105 @@
+#include "AliasAnalysisSummary.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+namespace cflaa {
+
+namespace {
+LLVM_CONSTEXPR unsigned AttrEscapedIndex = 0;
+LLVM_CONSTEXPR unsigned AttrUnknownIndex = 1;
+LLVM_CONSTEXPR unsigned AttrGlobalIndex = 2;
+LLVM_CONSTEXPR unsigned AttrCallerIndex = 3;
+LLVM_CONSTEXPR unsigned AttrFirstArgIndex = 4;
+LLVM_CONSTEXPR unsigned AttrLastArgIndex = NumAliasAttrs;
+LLVM_CONSTEXPR unsigned AttrMaxNumArgs = AttrLastArgIndex - AttrFirstArgIndex;
+
+// NOTE: These aren't AliasAttrs because bitsets don't have a constexpr
+// ctor for some versions of MSVC that we support. We could maybe refactor,
+// but...
+using AliasAttr = unsigned;
+LLVM_CONSTEXPR AliasAttr AttrNone = 0;
+LLVM_CONSTEXPR AliasAttr AttrEscaped = 1 << AttrEscapedIndex;
+LLVM_CONSTEXPR AliasAttr AttrUnknown = 1 << AttrUnknownIndex;
+LLVM_CONSTEXPR AliasAttr AttrGlobal = 1 << AttrGlobalIndex;
+LLVM_CONSTEXPR AliasAttr AttrCaller = 1 << AttrCallerIndex;
+LLVM_CONSTEXPR AliasAttr ExternalAttrMask =
+ AttrEscaped | AttrUnknown | AttrGlobal;
+}
+
+AliasAttrs getAttrNone() { return AttrNone; }
+
+AliasAttrs getAttrUnknown() { return AttrUnknown; }
+bool hasUnknownAttr(AliasAttrs Attr) { return Attr.test(AttrUnknownIndex); }
+
+AliasAttrs getAttrCaller() { return AttrCaller; }
+bool hasCallerAttr(AliasAttrs Attr) { return Attr.test(AttrCallerIndex); }
+bool hasUnknownOrCallerAttr(AliasAttrs Attr) {
+ return Attr.test(AttrUnknownIndex) || Attr.test(AttrCallerIndex);
+}
+
+AliasAttrs getAttrEscaped() { return AttrEscaped; }
+bool hasEscapedAttr(AliasAttrs Attr) { return Attr.test(AttrEscapedIndex); }
+
+static AliasAttr argNumberToAttr(unsigned ArgNum) {
+ if (ArgNum >= AttrMaxNumArgs)
+ return AttrUnknown;
+ // N.B. MSVC complains if we use `1U` here, since AliasAttr's ctor takes
+ // an unsigned long long.
+ return AliasAttr(1ULL << (ArgNum + AttrFirstArgIndex));
+}
+
+AliasAttrs getGlobalOrArgAttrFromValue(const Value &Val) {
+ if (isa<GlobalValue>(Val))
+ return AttrGlobal;
+
+ if (auto *Arg = dyn_cast<Argument>(&Val))
+ // Only pointer arguments should have the argument attribute,
+ // because things can't escape through scalars without us seeing a
+ // cast, and thus, interaction with them doesn't matter.
+ if (!Arg->hasNoAliasAttr() && Arg->getType()->isPointerTy())
+ return argNumberToAttr(Arg->getArgNo());
+ return AttrNone;
+}
+
+bool isGlobalOrArgAttr(AliasAttrs Attr) {
+ return Attr.reset(AttrEscapedIndex)
+ .reset(AttrUnknownIndex)
+ .reset(AttrCallerIndex)
+ .any();
+}
+
+AliasAttrs getExternallyVisibleAttrs(AliasAttrs Attr) {
+ return Attr & AliasAttrs(ExternalAttrMask);
+}
+
+Optional<InstantiatedValue> instantiateInterfaceValue(InterfaceValue IValue,
+ CallSite CS) {
+ auto Index = IValue.Index;
+ auto Value = (Index == 0) ? CS.getInstruction() : CS.getArgument(Index - 1);
+ if (Value->getType()->isPointerTy())
+ return InstantiatedValue{Value, IValue.DerefLevel};
+ return None;
+}
+
+Optional<InstantiatedRelation>
+instantiateExternalRelation(ExternalRelation ERelation, CallSite CS) {
+ auto From = instantiateInterfaceValue(ERelation.From, CS);
+ if (!From)
+ return None;
+ auto To = instantiateInterfaceValue(ERelation.To, CS);
+ if (!To)
+ return None;
+ return InstantiatedRelation{*From, *To};
+}
+
+Optional<InstantiatedAttr> instantiateExternalAttribute(ExternalAttribute EAttr,
+ CallSite CS) {
+ auto Value = instantiateInterfaceValue(EAttr.IValue, CS);
+ if (!Value)
+ return None;
+ return InstantiatedAttr{*Value, EAttr.Attr};
+}
+}
+}
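As a small illustration (assumed client code, not part of the patch) of how these attribute helpers compose, a summary-based analysis could seed a value with its global/argument bits and then OR in escape information as it is discovered:

#include "AliasAnalysisSummary.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::cflaa;

// Classify a value the way a summary-based analysis might seed its nodes.
static AliasAttrs classifyValue(const Value &V, bool EscapesToOpaqueCall) {
  AliasAttrs Attrs = getGlobalOrArgAttrFromValue(V); // global or arg bit, if any
  if (EscapesToOpaqueCall)
    Attrs |= getAttrEscaped();
  return Attrs;
}

static bool mayAliasUnseenMemory(AliasAttrs Attrs) {
  // Escaped or unknown pointers may alias memory the analysis never saw.
  return hasUnknownAttr(Attrs) || hasEscapedAttr(Attrs);
}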
diff --git a/lib/Analysis/AliasAnalysisSummary.h b/lib/Analysis/AliasAnalysisSummary.h
new file mode 100644
index 000000000000..43c0d4cb14f9
--- /dev/null
+++ b/lib/Analysis/AliasAnalysisSummary.h
@@ -0,0 +1,211 @@
+//=====- AliasAnalysisSummary.h - Summary-based AA support -------------=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines various utility types and functions useful to
+/// summary-based alias analysis.
+///
+/// Summary-based analysis, also known as bottom-up analysis, is a style of
+/// interprocedural static analysis that tries to analyze the callees before the
+/// callers get analyzed. The key idea of summary-based analysis is to first
+/// process each function independently, outline its behavior in a condensed
+/// summary, and then instantiate the summary at the callsite when the said
+/// function is called elsewhere. This is often in contrast to another style
+/// called top-down analysis, in which callers are always analyzed first before
+/// the callees.
+///
+/// In a summary-based analysis, functions must be examined independently and
+/// out-of-context. We have no information on the state of the memory, the
+/// arguments, the global values, and anything else external to the function. To
+/// carry out the analysis conservative assumptions have to be made about those
+/// external states. In exchange for the potential loss of precision, the
+/// summary we obtain this way is highly reusable, which makes the analysis
+/// easier to scale to large programs even if carried out context-sensitively.
+///
+/// Currently, all CFL-based alias analyses adopt the summary-based approach
+/// and therefore heavily rely on this header.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_ALIASANALYSISSUMMARY_H
+#define LLVM_ANALYSIS_ALIASANALYSISSUMMARY_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CallSite.h"
+#include <bitset>
+
+namespace llvm {
+namespace cflaa {
+
+//===----------------------------------------------------------------------===//
+// AliasAttr related stuffs
+//===----------------------------------------------------------------------===//
+
+/// The number of attributes that AliasAttr should contain. Attributes are
+/// described below, and 32 was an arbitrary choice because it fits nicely in 32
+/// bits (because we use a bitset for AliasAttr).
+static const unsigned NumAliasAttrs = 32;
+
+/// These are attributes that an alias analysis can use to mark certain special
+/// properties of a given pointer. Refer to the related functions below to see
+/// what kinds of attributes are currently defined.
+typedef std::bitset<NumAliasAttrs> AliasAttrs;
+
+/// AttrNone represents the empty attribute set: the said pointer has none of
+/// the special properties described below.
+AliasAttrs getAttrNone();
+
+/// AttrUnknown represents whether the said pointer comes from a source not known
+/// to alias analyses (such as opaque memory or an integer cast).
+AliasAttrs getAttrUnknown();
+bool hasUnknownAttr(AliasAttrs);
+
+/// AttrCaller represents whether the said pointer comes from a source not known
+/// to the current function but known to the caller. Values pointed to by the
+/// arguments of the current function have this attribute set.
+AliasAttrs getAttrCaller();
+bool hasCallerAttr(AliasAttrs);
+bool hasUnknownOrCallerAttr(AliasAttrs);
+
+/// AttrEscaped represents whether the said pointer comes from a known source but
+/// escapes to the unknown world (e.g. cast to an integer, or passed as an
+/// argument to an opaque function). Unlike non-escaped pointers, escaped ones may
+/// alias pointers coming from unknown sources.
+AliasAttrs getAttrEscaped();
+bool hasEscapedAttr(AliasAttrs);
+
+/// AttrGlobal represents whether the said pointer is a global value.
+/// AttrArg represents whether the said pointer is an argument, and if so, what
+/// index the argument has.
+AliasAttrs getGlobalOrArgAttrFromValue(const Value &);
+bool isGlobalOrArgAttr(AliasAttrs);
+
+/// Given an AliasAttrs, return a new AliasAttrs that only contains attributes
+/// meaningful to the caller. This function is primarily used for
+/// interprocedural analysis.
+/// Currently, externally visible AliasAttrs include AttrUnknown, AttrGlobal,
+/// and AttrEscaped.
+AliasAttrs getExternallyVisibleAttrs(AliasAttrs);
+
+//===----------------------------------------------------------------------===//
+// Function summary related stuffs
+//===----------------------------------------------------------------------===//
+
+/// The maximum number of arguments we can put into a summary.
+LLVM_CONSTEXPR static unsigned MaxSupportedArgsInSummary = 50;
+
+/// We use InterfaceValue to describe parameters/return value, as well as
+/// potential memory locations that are pointed to by parameters/return value,
+/// of a function.
+/// Index is an integer which represents a single parameter or a return value.
+/// When the index is 0, it refers to the return value. Non-zero index i refers
+/// to the i-th parameter.
+/// DerefLevel indicates the number of dereferences one must perform on the
+/// parameter/return value to get this InterfaceValue.
+struct InterfaceValue {
+ unsigned Index;
+ unsigned DerefLevel;
+};
+
+inline bool operator==(InterfaceValue LHS, InterfaceValue RHS) {
+ return LHS.Index == RHS.Index && LHS.DerefLevel == RHS.DerefLevel;
+}
+inline bool operator!=(InterfaceValue LHS, InterfaceValue RHS) {
+ return !(LHS == RHS);
+}
+
+/// We use ExternalRelation to describe an externally visible aliasing relation
+/// between the parameters/return value of a function.
+struct ExternalRelation {
+ InterfaceValue From, To;
+};
+
+/// We use ExternalAttribute to describe an externally visible AliasAttrs
+/// for parameters/return value.
+struct ExternalAttribute {
+ InterfaceValue IValue;
+ AliasAttrs Attr;
+};
+
+/// AliasSummary is just a collection of ExternalRelation and ExternalAttribute
+struct AliasSummary {
+ // RetParamRelations is a collection of ExternalRelations.
+ SmallVector<ExternalRelation, 8> RetParamRelations;
+
+ // RetParamAttributes is a collection of ExternalAttributes.
+ SmallVector<ExternalAttribute, 8> RetParamAttributes;
+};
+
+/// This is the result of instantiating InterfaceValue at a particular callsite
+struct InstantiatedValue {
+ Value *Val;
+ unsigned DerefLevel;
+};
+Optional<InstantiatedValue> instantiateInterfaceValue(InterfaceValue, CallSite);
+
+inline bool operator==(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return LHS.Val == RHS.Val && LHS.DerefLevel == RHS.DerefLevel;
+}
+inline bool operator!=(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return !(LHS == RHS);
+}
+inline bool operator<(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return std::less<Value *>()(LHS.Val, RHS.Val) ||
+ (LHS.Val == RHS.Val && LHS.DerefLevel < RHS.DerefLevel);
+}
+inline bool operator>(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return RHS < LHS;
+}
+inline bool operator<=(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return !(RHS < LHS);
+}
+inline bool operator>=(InstantiatedValue LHS, InstantiatedValue RHS) {
+ return !(LHS < RHS);
+}
+
+/// This is the result of instantiating ExternalRelation at a particular
+/// callsite
+struct InstantiatedRelation {
+ InstantiatedValue From, To;
+};
+Optional<InstantiatedRelation> instantiateExternalRelation(ExternalRelation,
+ CallSite);
+
+/// This is the result of instantiating ExternalAttribute at a particular
+/// callsite
+struct InstantiatedAttr {
+ InstantiatedValue IValue;
+ AliasAttrs Attr;
+};
+Optional<InstantiatedAttr> instantiateExternalAttribute(ExternalAttribute,
+ CallSite);
+}
+
+template <> struct DenseMapInfo<cflaa::InstantiatedValue> {
+ static inline cflaa::InstantiatedValue getEmptyKey() {
+ return cflaa::InstantiatedValue{DenseMapInfo<Value *>::getEmptyKey(),
+ DenseMapInfo<unsigned>::getEmptyKey()};
+ }
+ static inline cflaa::InstantiatedValue getTombstoneKey() {
+ return cflaa::InstantiatedValue{DenseMapInfo<Value *>::getTombstoneKey(),
+ DenseMapInfo<unsigned>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const cflaa::InstantiatedValue &IV) {
+ return DenseMapInfo<std::pair<Value *, unsigned>>::getHashValue(
+ std::make_pair(IV.Val, IV.DerefLevel));
+ }
+ static bool isEqual(const cflaa::InstantiatedValue &LHS,
+ const cflaa::InstantiatedValue &RHS) {
+ return LHS.Val == RHS.Val && LHS.DerefLevel == RHS.DerefLevel;
+ }
+};
+}
+
+#endif
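To make the Index/DerefLevel encoding concrete, a short sketch (illustrative only; the summary edge is made up): index 0 denotes the return value and index i > 0 the i-th parameter, so instantiating the relation {1,1} -> {0,0} at a call site says that something reachable through one dereference of the first argument may flow into the returned pointer.

#include "AliasAnalysisSummary.h"
using namespace llvm;
using namespace llvm::cflaa;

// Summary edge: one dereference of parameter #1 flows into the return value.
static const ExternalRelation ArgMemToRet = {/*From=*/{1, 1}, /*To=*/{0, 0}};

// At a concrete call site, map the interface values back to IR values.
static void applySummaryAt(CallSite CS) {
  if (auto Rel = instantiateExternalRelation(ArgMemToRet, CS)) {
    // Rel->From.Val is CS.getArgument(0) at DerefLevel 1;
    // Rel->To.Val is the call instruction itself at DerefLevel 0.
    (void)Rel->From.Val;
    (void)Rel->To.Val;
  }
}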
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 3094049b3cc3..d349ac51a9b9 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -208,13 +208,12 @@ void AliasSetTracker::clear() {
}
-/// findAliasSetForPointer - Given a pointer, find the one alias set to put the
-/// instruction referring to the pointer into. If there are multiple alias sets
-/// that may alias the pointer, merge them together and return the unified set.
-///
-AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr,
- uint64_t Size,
- const AAMDNodes &AAInfo) {
+/// mergeAliasSetsForPointer - Given a pointer, merge all alias sets that may
+/// alias the pointer. Return the unified set, or nullptr if no set that aliases
+/// the pointer was found.
+AliasSet *AliasSetTracker::mergeAliasSetsForPointer(const Value *Ptr,
+ uint64_t Size,
+ const AAMDNodes &AAInfo) {
AliasSet *FoundSet = nullptr;
for (iterator I = begin(), E = end(); I != E;) {
iterator Cur = I++;
@@ -235,15 +234,15 @@ AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr,
/// alias sets.
bool AliasSetTracker::containsPointer(const Value *Ptr, uint64_t Size,
const AAMDNodes &AAInfo) const {
- for (const_iterator I = begin(), E = end(); I != E; ++I)
- if (!I->Forward && I->aliasesPointer(Ptr, Size, AAInfo, AA))
+ for (const AliasSet &AS : *this)
+ if (!AS.Forward && AS.aliasesPointer(Ptr, Size, AAInfo, AA))
return true;
return false;
}
bool AliasSetTracker::containsUnknown(const Instruction *Inst) const {
- for (const_iterator I = begin(), E = end(); I != E; ++I)
- if (!I->Forward && I->aliasesUnknownInst(Inst, AA))
+ for (const AliasSet &AS : *this)
+ if (!AS.Forward && AS.aliasesUnknownInst(Inst, AA))
return true;
return false;
}
@@ -274,12 +273,18 @@ AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, uint64_t Size,
// Check to see if the pointer is already known.
if (Entry.hasAliasSet()) {
- Entry.updateSizeAndAAInfo(Size, AAInfo);
+ // If the size changed, we may need to merge several alias sets.
+ // Note that we can *not* return the result of mergeAliasSetsForPointer
+ // due to a quirk of alias analysis behavior. Since alias(undef, undef)
+ // is NoAlias, mergeAliasSetsForPointer(undef, ...) will not find the
+ // the right set for undef, even if it exists.
+ if (Entry.updateSizeAndAAInfo(Size, AAInfo))
+ mergeAliasSetsForPointer(Pointer, Size, AAInfo);
// Return the set!
return *Entry.getAliasSet(*this)->getForwardedTarget(*this);
}
- if (AliasSet *AS = findAliasSetForPointer(Pointer, Size, AAInfo)) {
+ if (AliasSet *AS = mergeAliasSetsForPointer(Pointer, Size, AAInfo)) {
// Add it to the alias set it aliases.
AS->addPointer(*this, Entry, Size, AAInfo);
return *AS;
@@ -300,7 +305,7 @@ bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) {
bool AliasSetTracker::add(LoadInst *LI) {
- if (LI->getOrdering() > Monotonic) return addUnknown(LI);
+ if (isStrongerThanMonotonic(LI->getOrdering())) return addUnknown(LI);
AAMDNodes AAInfo;
LI->getAAMetadata(AAInfo);
@@ -316,7 +321,7 @@ bool AliasSetTracker::add(LoadInst *LI) {
}
bool AliasSetTracker::add(StoreInst *SI) {
- if (SI->getOrdering() > Monotonic) return addUnknown(SI);
+ if (isStrongerThanMonotonic(SI->getOrdering())) return addUnknown(SI);
AAMDNodes AAInfo;
SI->getAAMetadata(AAInfo);
@@ -342,6 +347,24 @@ bool AliasSetTracker::add(VAArgInst *VAAI) {
return NewPtr;
}
+bool AliasSetTracker::add(MemSetInst *MSI) {
+ AAMDNodes AAInfo;
+ MSI->getAAMetadata(AAInfo);
+
+ bool NewPtr;
+ uint64_t Len;
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(MSI->getLength()))
+ Len = C->getZExtValue();
+ else
+ Len = MemoryLocation::UnknownSize;
+
+ AliasSet &AS =
+ addPointer(MSI->getRawDest(), Len, AAInfo, AliasSet::ModAccess, NewPtr);
+ if (MSI->isVolatile())
+ AS.setVolatile();
+ return NewPtr;
+}
bool AliasSetTracker::addUnknown(Instruction *Inst) {
if (isa<DbgInfoIntrinsic>(Inst))
@@ -368,7 +391,10 @@ bool AliasSetTracker::add(Instruction *I) {
return add(SI);
if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
return add(VAAI);
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(I))
+ return add(MSI);
return addUnknown(I);
+ // FIXME: add support for memcpy and memmove.
}
void AliasSetTracker::add(BasicBlock &BB) {
@@ -383,10 +409,9 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
// Loop over all of the alias sets in AST, adding the pointers contained
// therein into the current alias sets. This can cause alias sets to be
// merged together in the current AST.
- for (const_iterator I = AST.begin(), E = AST.end(); I != E; ++I) {
- if (I->Forward) continue; // Ignore forwarding alias sets
-
- AliasSet &AS = const_cast<AliasSet&>(*I);
+ for (const AliasSet &AS : AST) {
+ if (AS.Forward)
+ continue; // Ignore forwarding alias sets
// If there are any call sites in the alias set, add them to this AST.
for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
@@ -436,7 +461,7 @@ void AliasSetTracker::remove(AliasSet &AS) {
bool
AliasSetTracker::remove(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) {
- AliasSet *AS = findAliasSetForPointer(Ptr, Size, AAInfo);
+ AliasSet *AS = mergeAliasSetsForPointer(Ptr, Size, AAInfo);
if (!AS) return false;
remove(*AS);
return true;
@@ -449,7 +474,7 @@ bool AliasSetTracker::remove(LoadInst *LI) {
AAMDNodes AAInfo;
LI->getAAMetadata(AAInfo);
- AliasSet *AS = findAliasSetForPointer(LI->getOperand(0), Size, AAInfo);
+ AliasSet *AS = mergeAliasSetsForPointer(LI->getOperand(0), Size, AAInfo);
if (!AS) return false;
remove(*AS);
return true;
@@ -462,7 +487,7 @@ bool AliasSetTracker::remove(StoreInst *SI) {
AAMDNodes AAInfo;
SI->getAAMetadata(AAInfo);
- AliasSet *AS = findAliasSetForPointer(SI->getOperand(1), Size, AAInfo);
+ AliasSet *AS = mergeAliasSetsForPointer(SI->getOperand(1), Size, AAInfo);
if (!AS) return false;
remove(*AS);
return true;
@@ -472,13 +497,30 @@ bool AliasSetTracker::remove(VAArgInst *VAAI) {
AAMDNodes AAInfo;
VAAI->getAAMetadata(AAInfo);
- AliasSet *AS = findAliasSetForPointer(VAAI->getOperand(0),
- MemoryLocation::UnknownSize, AAInfo);
+ AliasSet *AS = mergeAliasSetsForPointer(VAAI->getOperand(0),
+ MemoryLocation::UnknownSize, AAInfo);
if (!AS) return false;
remove(*AS);
return true;
}
+bool AliasSetTracker::remove(MemSetInst *MSI) {
+ AAMDNodes AAInfo;
+ MSI->getAAMetadata(AAInfo);
+ uint64_t Len;
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(MSI->getLength()))
+ Len = C->getZExtValue();
+ else
+ Len = MemoryLocation::UnknownSize;
+
+ AliasSet *AS = mergeAliasSetsForPointer(MSI->getRawDest(), Len, AAInfo);
+ if (!AS)
+ return false;
+ remove(*AS);
+ return true;
+}
+
bool AliasSetTracker::removeUnknown(Instruction *I) {
if (!I->mayReadOrWriteMemory())
return false; // doesn't alias anything
@@ -497,7 +539,10 @@ bool AliasSetTracker::remove(Instruction *I) {
return remove(SI);
if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
return remove(VAAI);
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(I))
+ return remove(MSI);
return removeUnknown(I);
+ // FIXME: add support for memcpy and memmove.
}
@@ -602,14 +647,14 @@ void AliasSet::print(raw_ostream &OS) const {
void AliasSetTracker::print(raw_ostream &OS) const {
OS << "Alias Set Tracker: " << AliasSets.size() << " alias sets for "
<< PointerMap.size() << " pointer values.\n";
- for (const_iterator I = begin(), E = end(); I != E; ++I)
- I->print(OS);
+ for (const AliasSet &AS : *this)
+ AS.print(OS);
OS << "\n";
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void AliasSet::dump() const { print(dbgs()); }
-void AliasSetTracker::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void AliasSet::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void AliasSetTracker::dump() const { print(dbgs()); }
#endif
//===----------------------------------------------------------------------===//
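A minimal sketch (assuming an already constructed AAResults, a.k.a. AliasAnalysis, is at hand) of the new behaviour: memset intrinsics now contribute their destination pointer to the tracker instead of being treated as opaque unknown instructions.

#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void dumpAliasSets(Function &F, AAResults &AA) {
  AliasSetTracker AST(AA);
  for (BasicBlock &BB : F)
    for (Instruction &I : BB)
      AST.add(&I); // a MemSetInst is now dispatched to add(MemSetInst *)
  // The memset destination appears as a Mod pointer in its alias set.
  AST.print(errs());
}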
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 9c1ac000be2c..c04447ca58c9 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -20,25 +20,27 @@ using namespace llvm;
/// initializeAnalysis - Initialize all passes linked into the Analysis library.
void llvm::initializeAnalysis(PassRegistry &Registry) {
- initializeAAEvalPass(Registry);
+ initializeAAEvalLegacyPassPass(Registry);
initializeAliasSetPrinterPass(Registry);
initializeBasicAAWrapperPassPass(Registry);
initializeBlockFrequencyInfoWrapperPassPass(Registry);
initializeBranchProbabilityInfoWrapperPassPass(Registry);
initializeCallGraphWrapperPassPass(Registry);
- initializeCallGraphPrinterPass(Registry);
+ initializeCallGraphDOTPrinterPass(Registry);
+ initializeCallGraphPrinterLegacyPassPass(Registry);
initializeCallGraphViewerPass(Registry);
initializeCostModelAnalysisPass(Registry);
initializeCFGViewerPass(Registry);
initializeCFGPrinterPass(Registry);
initializeCFGOnlyViewerPass(Registry);
initializeCFGOnlyPrinterPass(Registry);
- initializeCFLAAWrapperPassPass(Registry);
- initializeDependenceAnalysisPass(Registry);
+ initializeCFLAndersAAWrapperPassPass(Registry);
+ initializeCFLSteensAAWrapperPassPass(Registry);
+ initializeDependenceAnalysisWrapperPassPass(Registry);
initializeDelinearizationPass(Registry);
- initializeDemandedBitsPass(Registry);
+ initializeDemandedBitsWrapperPassPass(Registry);
initializeDivergenceAnalysisPass(Registry);
- initializeDominanceFrontierPass(Registry);
+ initializeDominanceFrontierWrapperPassPass(Registry);
initializeDomViewerPass(Registry);
initializeDomPrinterPass(Registry);
initializeDomOnlyViewerPass(Registry);
@@ -49,18 +51,21 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializePostDomOnlyPrinterPass(Registry);
initializeAAResultsWrapperPassPass(Registry);
initializeGlobalsAAWrapperPassPass(Registry);
- initializeIVUsersPass(Registry);
+ initializeIVUsersWrapperPassPass(Registry);
initializeInstCountPass(Registry);
initializeIntervalPartitionPass(Registry);
- initializeLazyValueInfoPass(Registry);
+ initializeLazyBlockFrequencyInfoPassPass(Registry);
+ initializeLazyValueInfoWrapperPassPass(Registry);
initializeLintPass(Registry);
initializeLoopInfoWrapperPassPass(Registry);
initializeMemDepPrinterPass(Registry);
initializeMemDerefPrinterPass(Registry);
- initializeMemoryDependenceAnalysisPass(Registry);
+ initializeMemoryDependenceWrapperPassPass(Registry);
initializeModuleDebugInfoPrinterPass(Registry);
+ initializeModuleSummaryIndexWrapperPassPass(Registry);
initializeObjCARCAAWrapperPassPass(Registry);
- initializePostDominatorTreePass(Registry);
+ initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
+ initializePostDominatorTreeWrapperPassPass(Registry);
initializeRegionInfoPassPass(Registry);
initializeRegionViewerPass(Registry);
initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp
index f468a43ef0b8..ca71644757f0 100644
--- a/lib/Analysis/AssumptionCache.cpp
+++ b/lib/Analysis/AssumptionCache.cpp
@@ -77,8 +77,8 @@ void AssumptionCache::registerAssumption(CallInst *CI) {
char AssumptionAnalysis::PassID;
PreservedAnalyses AssumptionPrinterPass::run(Function &F,
- AnalysisManager<Function> *AM) {
- AssumptionCache &AC = AM->getResult<AssumptionAnalysis>(F);
+ AnalysisManager<Function> &AM) {
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
OS << "Cached assumptions for function: " << F.getName() << "\n";
for (auto &VH : AC.assumptions())
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index c3d280350b90..43d5c3ccf907 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -37,22 +37,23 @@
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
+
+#define DEBUG_TYPE "basicaa"
+
using namespace llvm;
/// Enable analysis of recursive PHI nodes.
static cl::opt<bool> EnableRecPhiAnalysis("basicaa-recphi", cl::Hidden,
cl::init(false));
-
/// SearchLimitReached / SearchTimes shows how often the limit to
/// decompose GEPs is reached. It will affect the precision
/// of basic alias analysis.
-#define DEBUG_TYPE "basicaa"
STATISTIC(SearchLimitReached, "Number of times the limit to "
"decompose GEPs is reached");
STATISTIC(SearchTimes, "Number of times a GEP is decomposed");
/// Cutoff after which to stop analysing a set of phi nodes potentially involved
-/// in a cycle. Because we are analysing 'through' phi nodes we need to be
+/// in a cycle. Because we are analysing 'through' phi nodes, we need to be
/// careful with value equivalence. We use reachability to make sure a value
/// cannot be involved in a cycle.
const unsigned MaxNumPhiBBsValueReachabilityCheck = 20;
@@ -83,7 +84,7 @@ static bool isNonEscapingLocalObject(const Value *V) {
// inside the function.
if (const Argument *A = dyn_cast<Argument>(V))
if (A->hasByValAttr() || A->hasNoAliasAttr())
- // Note even if the argument is marked nocapture we still need to check
+ // Note even if the argument is marked nocapture, we still need to check
// for copies made inside the function. The nocapture attribute only
// specifies that there are no copies made that outlive the function.
return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
@@ -106,7 +107,7 @@ static bool isEscapeSource(const Value *V) {
return false;
}
-/// Returns the size of the object specified by V, or UnknownSize if unknown.
+/// Returns the size of the object specified by V or UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool RoundToAlign = false) {
@@ -173,7 +174,7 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
///
/// Returns the scale and offset values as APInts and return V as a Value*, and
/// return whether we looked through any sign or zero extends. The incoming
-/// Value is known to have IntegerType and it may already be sign or zero
+/// Value is known to have IntegerType, and it may already be sign or zero
/// extended.
///
/// Note that this looks through extends, so the high bits may not be
@@ -192,8 +193,8 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
}
if (const ConstantInt *Const = dyn_cast<ConstantInt>(V)) {
- // if it's a constant, just convert it to an offset and remove the variable.
- // If we've been called recursively the Offset bit width will be greater
+ // If it's a constant, just convert it to an offset and remove the variable.
+ // If we've been called recursively, the Offset bit width will be greater
// than the constant's (the Offset's always as wide as the outermost call),
// so we'll zext here and process any extension in the isa<SExtInst> &
// isa<ZExtInst> cases below.
@@ -205,8 +206,8 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
- // If we've been called recursively then Offset and Scale will be wider
- // that the BOp operands. We'll always zext it here as we'll process sign
+ // If we've been called recursively, then Offset and Scale will be wider
+ // than the BOp operands. We'll always zext it here as we'll process sign
// extensions below (see the isa<SExtInst> / isa<ZExtInst> cases).
APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth());
@@ -319,6 +320,16 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
return V;
}
+/// Ensure a pointer offset fits in an integer of size PointerSize
+/// (in bits) when that size is smaller than 64. This is an issue in
+/// particular for 32b programs with negative indices that rely on two's
+/// complement wrap-arounds for precise alias information.
+static int64_t adjustToPointerSize(int64_t Offset, unsigned PointerSize) {
+ assert(PointerSize <= 64 && "Invalid PointerSize!");
+ unsigned ShiftBits = 64 - PointerSize;
+ return (int64_t)((uint64_t)Offset << ShiftBits) >> ShiftBits;
+}
+
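A standalone check (illustrative only; it mirrors the helper rather than calling it) of the truncate-then-sign-extend behaviour adjustToPointerSize implements, e.g. for a 32-bit address space:

#include <cassert>
#include <cstdint>

// Mirrors adjustToPointerSize above for demonstration purposes.
static int64_t adjust(int64_t Offset, unsigned PointerSize) {
  unsigned ShiftBits = 64 - PointerSize;
  return (int64_t)((uint64_t)Offset << ShiftBits) >> ShiftBits;
}

int main() {
  // 0xFFFFFFFF reinterpreted in 32 bits is -1: a "negative" index that
  // wrapped around in a 32-bit address space keeps its intended meaning.
  assert(adjust(0xFFFFFFFFLL, 32) == -1);
  // Offsets that already fit are unchanged.
  assert(adjust(12345, 32) == 12345);
  assert(adjust(-8, 64) == -8);
  return 0;
}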
/// If V is a symbolic pointer expression, decompose it into a base pointer
/// with a constant offset and a number of scaled symbolic offsets.
///
@@ -332,28 +343,29 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
/// GetUnderlyingObject and DecomposeGEPExpression must use the same search
/// depth (MaxLookupSearchDepth). When DataLayout not is around, it just looks
/// through pointer casts.
-/*static*/ const Value *BasicAAResult::DecomposeGEPExpression(
- const Value *V, int64_t &BaseOffs,
- SmallVectorImpl<VariableGEPIndex> &VarIndices, bool &MaxLookupReached,
- const DataLayout &DL, AssumptionCache *AC, DominatorTree *DT) {
+bool BasicAAResult::DecomposeGEPExpression(const Value *V,
+ DecomposedGEP &Decomposed, const DataLayout &DL, AssumptionCache *AC,
+ DominatorTree *DT) {
// Limit recursion depth to limit compile time in crazy cases.
unsigned MaxLookup = MaxLookupSearchDepth;
- MaxLookupReached = false;
SearchTimes++;
- BaseOffs = 0;
+ Decomposed.StructOffset = 0;
+ Decomposed.OtherOffset = 0;
+ Decomposed.VarIndices.clear();
do {
// See if this is a bitcast or GEP.
const Operator *Op = dyn_cast<Operator>(V);
if (!Op) {
// The only non-operator case we can handle are GlobalAliases.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (!GA->mayBeOverridden()) {
+ if (!GA->isInterposable()) {
V = GA->getAliasee();
continue;
}
}
- return V;
+ Decomposed.Base = V;
+ return false;
}
if (Op->getOpcode() == Instruction::BitCast ||
@@ -364,6 +376,12 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
+ if (auto CS = ImmutableCallSite(V))
+ if (const Value *RV = CS.getReturnedArgOperand()) {
+ V = RV;
+ continue;
+ }
+
// If it's not a GEP, hand it off to SimplifyInstruction to see if it
// can come up with something. This matches what GetUnderlyingObject does.
if (const Instruction *I = dyn_cast<Instruction>(V))
@@ -377,16 +395,20 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
continue;
}
- return V;
+ Decomposed.Base = V;
+ return false;
}
// Don't attempt to analyze GEPs over unsized objects.
- if (!GEPOp->getOperand(0)->getType()->getPointerElementType()->isSized())
- return V;
+ if (!GEPOp->getSourceElementType()->isSized()) {
+ Decomposed.Base = V;
+ return false;
+ }
unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
+ unsigned PointerSize = DL.getPointerSizeInBits(AS);
for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end();
I != E; ++I) {
const Value *Index = *I;
@@ -397,7 +419,8 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
if (FieldNo == 0)
continue;
- BaseOffs += DL.getStructLayout(STy)->getElementOffset(FieldNo);
+ Decomposed.StructOffset +=
+ DL.getStructLayout(STy)->getElementOffset(FieldNo);
continue;
}
@@ -405,7 +428,8 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
if (CIdx->isZero())
continue;
- BaseOffs += DL.getTypeAllocSize(*GTI) * CIdx->getSExtValue();
+ Decomposed.OtherOffset +=
+ DL.getTypeAllocSize(*GTI) * CIdx->getSExtValue();
continue;
}
@@ -415,7 +439,6 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
// If the integer type is smaller than the pointer size, it is implicitly
// sign extended to pointer size.
unsigned Width = Index->getType()->getIntegerBitWidth();
- unsigned PointerSize = DL.getPointerSizeInBits(AS);
if (PointerSize > Width)
SExtBits += PointerSize - Width;
@@ -427,44 +450,48 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
- BaseOffs += IndexOffset.getSExtValue() * Scale;
+ Decomposed.OtherOffset += IndexOffset.getSExtValue() * Scale;
Scale *= IndexScale.getSExtValue();
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
- for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) {
- if (VarIndices[i].V == Index && VarIndices[i].ZExtBits == ZExtBits &&
- VarIndices[i].SExtBits == SExtBits) {
- Scale += VarIndices[i].Scale;
- VarIndices.erase(VarIndices.begin() + i);
+ for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) {
+ if (Decomposed.VarIndices[i].V == Index &&
+ Decomposed.VarIndices[i].ZExtBits == ZExtBits &&
+ Decomposed.VarIndices[i].SExtBits == SExtBits) {
+ Scale += Decomposed.VarIndices[i].Scale;
+ Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i);
break;
}
}
// Make sure that we have a scale that makes sense for this target's
// pointer size.
- if (unsigned ShiftBits = 64 - PointerSize) {
- Scale <<= ShiftBits;
- Scale = (int64_t)Scale >> ShiftBits;
- }
+ Scale = adjustToPointerSize(Scale, PointerSize);
if (Scale) {
VariableGEPIndex Entry = {Index, ZExtBits, SExtBits,
static_cast<int64_t>(Scale)};
- VarIndices.push_back(Entry);
+ Decomposed.VarIndices.push_back(Entry);
}
}
+ // Take care of wrap-arounds
+ Decomposed.StructOffset =
+ adjustToPointerSize(Decomposed.StructOffset, PointerSize);
+ Decomposed.OtherOffset =
+ adjustToPointerSize(Decomposed.OtherOffset, PointerSize);
+
// Analyze the base pointer next.
V = GEPOp->getOperand(0);
} while (--MaxLookup);
// If the chain of expressions is too deep, just return early.
- MaxLookupReached = true;
+ Decomposed.Base = V;
SearchLimitReached++;
- return V;
+ return true;
}
/// Returns whether the given pointer value points to memory that is local to
@@ -530,22 +557,6 @@ bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
return Worklist.empty();
}
-// FIXME: This code is duplicated with MemoryLocation and should be hoisted to
-// some common utility location.
-static bool isMemsetPattern16(const Function *MS,
- const TargetLibraryInfo &TLI) {
- if (TLI.has(LibFunc::memset_pattern16) &&
- MS->getName() == "memset_pattern16") {
- FunctionType *MemsetType = MS->getFunctionType();
- if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
- isa<PointerType>(MemsetType->getParamType(0)) &&
- isa<PointerType>(MemsetType->getParamType(1)) &&
- isa<IntegerType>(MemsetType->getParamType(2)))
- return true;
- }
- return false;
-}
-
/// Returns the behavior when calling the given call site.
FunctionModRefBehavior BasicAAResult::getModRefBehavior(ImmutableCallSite CS) {
if (CS.doesNotAccessMemory())
@@ -558,12 +569,21 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(ImmutableCallSite CS) {
// than that.
if (CS.onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
+ else if (CS.doesNotReadMemory())
+ Min = FMRB_DoesNotReadMemory;
if (CS.onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
- // The AAResultBase base class has some smarts, lets use them.
- return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min);
+ // If CS has operand bundles then aliasing attributes from the function it
+ // calls do not directly apply to the CallSite. This can be made more
+ // precise in the future.
+ if (!CS.hasOperandBundles())
+ if (const Function *F = CS.getCalledFunction())
+ Min =
+ FunctionModRefBehavior(Min & getBestAAResults().getModRefBehavior(F));
+
+ return Min;
}
/// Returns the behavior when calling the given function. For use when the call
@@ -578,41 +598,30 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
// If the function declares it only reads memory, go with that.
if (F->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
+ else if (F->doesNotReadMemory())
+ Min = FMRB_DoesNotReadMemory;
if (F->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
- // Otherwise be conservative.
- return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min);
+ return Min;
}
-/// Returns true if this is a writeonly (i.e Mod only) parameter. Currently,
-/// we don't have a writeonly attribute, so this only knows about builtin
-/// intrinsics and target library functions. We could consider adding a
-/// writeonly attribute in the future and moving all of these facts to either
-/// Intrinsics.td or InferFunctionAttr.cpp
+/// Returns true if this is a writeonly (i.e. Mod only) parameter.
static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction()))
- switch (II->getIntrinsicID()) {
- default:
- break;
- case Intrinsic::memset:
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- // We don't currently have a writeonly attribute. All other properties
- // of these intrinsics are nicely described via attributes in
- // Intrinsics.td and handled generically.
- if (ArgIdx == 0)
- return true;
- }
+ if (CS.paramHasAttr(ArgIdx + 1, Attribute::WriteOnly))
+ return true;
// We can bound the aliasing properties of memset_pattern16 just as we can
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
- // whenever possible. Note that all but the missing writeonly attribute are
- // handled via InferFunctionAttr.
- if (CS.getCalledFunction() && isMemsetPattern16(CS.getCalledFunction(), TLI))
+ // whenever possible.
+ // FIXME Consider handling this in InferFunctionAttr.cpp together with other
+ // attributes.
+ LibFunc::Func F;
+ if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
+ F == LibFunc::memset_pattern16 && TLI.has(F))
if (ArgIdx == 0)
return true;
@@ -626,8 +635,7 @@ static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
unsigned ArgIdx) {
- // Emulate the missing writeonly attribute by checking for known builtin
- // intrinsics and target library functions.
+ // Check for known builtin intrinsics and target library functions.
if (isWriteOnlyParam(CS, ArgIdx, TLI))
return MRI_Mod;
@@ -640,9 +648,9 @@ ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
return AAResultBase::getArgModRefInfo(CS, ArgIdx);
}
-static bool isAssumeIntrinsic(ImmutableCallSite CS) {
+static bool isIntrinsicCall(ImmutableCallSite CS, Intrinsic::ID IID) {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
- return II && II->getIntrinsicID() == Intrinsic::assume;
+ return II && II->getIntrinsicID() == IID;
}
#ifndef NDEBUG
@@ -717,14 +725,14 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
isNonEscapingLocalObject(Object)) {
bool PassedAsArg = false;
- unsigned ArgNo = 0;
- for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
- CI != CE; ++CI, ++ArgNo) {
+ unsigned OperandNo = 0;
+ for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
+ CI != CE; ++CI, ++OperandNo) {
// Only look at the no-capture or byval pointer arguments. If this
// pointer were passed to arguments that were neither of these, then it
// couldn't be no-capture.
if (!(*CI)->getType()->isPointerTy() ||
- (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
+ (!CS.doesNotCapture(OperandNo) && !CS.isByValArgument(OperandNo)))
continue;
// If this is a no-capture pointer argument, see if we can tell that it
@@ -743,12 +751,36 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
return MRI_NoModRef;
}
+ // If the CallSite is to malloc or calloc, we can assume that it doesn't
+ // modify any IR visible value. This is only valid because we assume these
+ // routines do not read values visible in the IR. TODO: Consider special
+ // casing realloc and strdup routines which access only their arguments as
+ // well. Or alternatively, replace all of this with inaccessiblememonly once
+ // that's implemented fully.
+ auto *Inst = CS.getInstruction();
+ if (isMallocLikeFn(Inst, &TLI) || isCallocLikeFn(Inst, &TLI)) {
+ // Be conservative if the accessed pointer may alias the allocation -
+ // fall back to the generic handling below.
+ if (getBestAAResults().alias(MemoryLocation(Inst), Loc) == NoAlias)
+ return MRI_NoModRef;
+ }
+
// While the assume intrinsic is marked as arbitrarily writing so that
// proper control dependencies will be maintained, it never aliases any
// particular memory location.
- if (isAssumeIntrinsic(CS))
+ if (isIntrinsicCall(CS, Intrinsic::assume))
return MRI_NoModRef;
+ // Like assumes, guard intrinsics are also marked as arbitrarily writing so
+ // that proper control dependencies are maintained but they never mod any
+ // particular memory location.
+ //
+ // *Unlike* assumes, guard intrinsics are modeled as reading memory since the
+ // heap state at the point the guard is issued needs to be consistent in case
+ // the guard invokes the "deopt" continuation.
+ if (isIntrinsicCall(CS, Intrinsic::experimental_guard))
+ return MRI_Ref;
+
// The AAResultBase base class has some smarts, lets use them.
return AAResultBase::getModRefInfo(CS, Loc);
}
@@ -758,9 +790,27 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1,
// While the assume intrinsic is marked as arbitrarily writing so that
// proper control dependencies will be maintained, it never aliases any
// particular memory location.
- if (isAssumeIntrinsic(CS1) || isAssumeIntrinsic(CS2))
+ if (isIntrinsicCall(CS1, Intrinsic::assume) ||
+ isIntrinsicCall(CS2, Intrinsic::assume))
return MRI_NoModRef;
+ // Like assumes, guard intrinsics are also marked as arbitrarily writing so
+ // that proper control dependencies are maintained but they never mod any
+ // particular memory location.
+ //
+ // *Unlike* assumes, guard intrinsics are modeled as reading memory since the
+ // heap state at the point the guard is issued needs to be consistent in case
+ // the guard invokes the "deopt" continuation.
+
+ // NB! This function is *not* commutative, so we special case two
+ // possibilities for guard intrinsics.
+
+ if (isIntrinsicCall(CS1, Intrinsic::experimental_guard))
+ return getModRefBehavior(CS2) & MRI_Mod ? MRI_Ref : MRI_NoModRef;
+
+ if (isIntrinsicCall(CS2, Intrinsic::experimental_guard))
+ return getModRefBehavior(CS1) & MRI_Mod ? MRI_Mod : MRI_NoModRef;
+
// The AAResultBase base class has some smarts, lets use them.
return AAResultBase::getModRefInfo(CS1, CS2);
}
@@ -773,7 +823,10 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
uint64_t V2Size,
const DataLayout &DL) {
- assert(GEP1->getPointerOperand() == GEP2->getPointerOperand() &&
+ assert(GEP1->getPointerOperand()->stripPointerCasts() ==
+ GEP2->getPointerOperand()->stripPointerCasts() &&
+ GEP1->getPointerOperand()->getType() ==
+ GEP2->getPointerOperand()->getType() &&
"Expected GEPs with the same pointer operand");
// Try to determine whether GEP1 and GEP2 index through arrays, into structs,
@@ -796,7 +849,7 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
// If the last (struct) indices are constants and are equal, the other indices
// might be also be dynamically equal, so the GEPs can alias.
- if (C1 && C2 && C1 == C2)
+ if (C1 && C2 && C1->getSExtValue() == C2->getSExtValue())
return MayAlias;
// Find the last-indexed type of the GEP, i.e., the type you'd get if
@@ -895,6 +948,67 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
return MayAlias;
}
+// If we have (a) a GEP and (b) a pointer based on an alloca, and the
+// beginning of the object the GEP points to would have a negative offset with
+// respect to the alloca, that means the GEP cannot alias pointer (b).
+// Note that the pointer based on the alloca may not be a GEP. For
+// example, it may be the alloca itself.
+// The same applies if (b) is based on a GlobalVariable. Note that just being
+// based on isIdentifiedObject() is not enough - we need an identified object
+// that does not permit access to negative offsets. For example, a negative
+// offset from a noalias argument or call can be inbounds w.r.t the actual
+// underlying object.
+//
+// For example, consider:
+//
+// struct foo { int f0; int f1; ... };
+// foo alloca;
+// foo* random = bar(alloca);
+// int *f0 = &alloca.f0;
+// int *f1 = &random->f1;
+//
+// Which is lowered, approximately, to:
+//
+// %alloca = alloca %struct.foo
+// %random = call %struct.foo* @random(%struct.foo* %alloca)
+// %f0 = getelementptr inbounds %struct.foo, %struct.foo* %alloca, i32 0, i32 0
+// %f1 = getelementptr inbounds %struct.foo, %struct.foo* %random, i32 0, i32 1
+//
+// Assume %f1 and %f0 alias. Then %f1 would point into the object allocated
+// by %alloca. Since the %f1 GEP is inbounds, that means %random must also
+// point into the same object. But since %f0 points to the beginning of %alloca,
+// the highest %f1 can be is (%alloca + 3). This means %random can not be higher
+// than (%alloca - 1), and so is not inbounds, a contradiction.
+bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
+ const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
+ uint64_t ObjectAccessSize) {
+ // If the object access size is unknown, or the GEP isn't inbounds, bail.
+ if (ObjectAccessSize == MemoryLocation::UnknownSize || !GEPOp->isInBounds())
+ return false;
+
+ // We need the object to be an alloca or a GlobalVariable, and want to know
+ // the offset of the pointer from the object precisely, so no variable
+ // indices are allowed.
+ if (!(isa<AllocaInst>(DecompObject.Base) ||
+ isa<GlobalVariable>(DecompObject.Base)) ||
+ !DecompObject.VarIndices.empty())
+ return false;
+
+ int64_t ObjectBaseOffset = DecompObject.StructOffset +
+ DecompObject.OtherOffset;
+
+ // If the GEP has no variable indices, we know the precise offset
+ // from the base, so we use it. If the GEP has variable indices, we're in
+ // a bit more trouble: we can't count on the constant offsets that come
+ // from non-struct sources, since these can be "rewound" by a negative
+ // variable offset. So use only offsets that came from structs.
+ int64_t GEPBaseOffset = DecompGEP.StructOffset;
+ if (DecompGEP.VarIndices.empty())
+ GEPBaseOffset += DecompGEP.OtherOffset;
+
+ return (GEPBaseOffset >= ObjectBaseOffset + (int64_t)ObjectAccessSize);
+}
+
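A hedged numeric restatement of the final comparison, plugging in the struct.foo example from the comment (assuming 4-byte ints and no padding): %f0 is the alloca-based pointer at offset 0 accessed with 4 bytes, and %f1 sits at struct offset 4 from its own base.

    #include <cstdint>

    // Mirrors the return expression above: if this holds and the two pointers
    // overlapped, the GEP's base object would have to start before the
    // alloca/global, which an inbounds GEP cannot do.
    static bool gepBaseWouldBeNegative(int64_t GEPStructOffset,
                                       int64_t ObjectBaseOffset,
                                       uint64_t ObjectAccessSize) {
      return GEPStructOffset >= ObjectBaseOffset + (int64_t)ObjectAccessSize;
    }

    // gepBaseWouldBeNegative(/*%f1*/ 4, /*%f0*/ 0, /*i32 access*/ 4) == true,
    // so aliasGEP below can answer NoAlias for %f1 vs. %f0.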
/// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against
/// another pointer.
///
@@ -906,14 +1020,34 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
uint64_t V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderlyingV1,
const Value *UnderlyingV2) {
- int64_t GEP1BaseOffset;
- bool GEP1MaxLookupReached;
- SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
-
+ DecomposedGEP DecompGEP1, DecompGEP2;
+ bool GEP1MaxLookupReached =
+ DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
+ bool GEP2MaxLookupReached =
+ DecomposeGEPExpression(V2, DecompGEP2, DL, &AC, DT);
+
+ int64_t GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset;
+ int64_t GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset;
+
+ assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 &&
+ "DecomposeGEPExpression returned a result different from "
+ "GetUnderlyingObject");
+
+ // If the GEP's offset relative to its base is such that the base would
+ // fall below the start of the object underlying V2, then the GEP and V2
+ // cannot alias.
+ if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
+ isGEPBaseAtNegativeOffset(GEP1, DecompGEP1, DecompGEP2, V2Size))
+ return NoAlias;
// If we have two gep instructions with must-alias or not-alias'ing base
// pointers, figure out if the indexes to the GEP tell us anything about the
// derived pointer.
if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
+ // Check for the GEP base being at a negative offset, this time in the other
+ // direction.
+ if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
+ isGEPBaseAtNegativeOffset(GEP2, DecompGEP2, DecompGEP1, V1Size))
+ return NoAlias;
// Do the base pointers alias?
AliasResult BaseAlias =
aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(),
@@ -928,31 +1062,14 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
if (PreciseBaseAlias == NoAlias) {
// See if the computed offset from the common pointer tells us about the
// relation of the resulting pointer.
- int64_t GEP2BaseOffset;
- bool GEP2MaxLookupReached;
- SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
- const Value *GEP2BasePtr =
- DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
- GEP2MaxLookupReached, DL, &AC, DT);
- const Value *GEP1BasePtr =
- DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
- GEP1MaxLookupReached, DL, &AC, DT);
- // DecomposeGEPExpression and GetUnderlyingObject should return the
- // same result except when DecomposeGEPExpression has no DataLayout.
- // FIXME: They always have a DataLayout so this should become an
- // assert.
- if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
- return MayAlias;
- }
// If the max search depth is reached the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Same offsets.
if (GEP1BaseOffset == GEP2BaseOffset &&
- GEP1VariableIndices == GEP2VariableIndices)
+ DecompGEP1.VarIndices == DecompGEP2.VarIndices)
return NoAlias;
- GEP1VariableIndices.clear();
}
}
@@ -964,42 +1081,27 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// Otherwise, we have a MustAlias. Since the base pointers alias each other
// exactly, see if the computed offset from the common pointer tells us
// about the relation of the resulting pointer.
- const Value *GEP1BasePtr =
- DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
- GEP1MaxLookupReached, DL, &AC, DT);
-
- int64_t GEP2BaseOffset;
- bool GEP2MaxLookupReached;
- SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
- const Value *GEP2BasePtr =
- DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
- GEP2MaxLookupReached, DL, &AC, DT);
-
- // DecomposeGEPExpression and GetUnderlyingObject should return the
- // same result except when DecomposeGEPExpression has no DataLayout.
- // FIXME: They always have a DataLayout so this should become an assert.
- if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
- return MayAlias;
- }
-
// If we know the two GEPs are based off of the exact same pointer (and not
// just the same underlying object), see if that tells us anything about
// the resulting pointers.
- if (GEP1->getPointerOperand() == GEP2->getPointerOperand()) {
+ if (GEP1->getPointerOperand()->stripPointerCasts() ==
+ GEP2->getPointerOperand()->stripPointerCasts() &&
+ GEP1->getPointerOperand()->getType() ==
+ GEP2->getPointerOperand()->getType()) {
AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL);
// If we couldn't find anything interesting, don't abandon just yet.
if (R != MayAlias)
return R;
}
- // If the max search depth is reached the result is undefined
+ // If the max search depth is reached, the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Subtract the GEP2 pointer from the GEP1 pointer to find out their
// symbolic difference.
GEP1BaseOffset -= GEP2BaseOffset;
- GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices);
+ GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices);
} else {
// Check to see if these two pointers are related by the getelementptr
@@ -1021,16 +1123,6 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// with the first operand of the getelementptr".
return R;
- const Value *GEP1BasePtr =
- DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
- GEP1MaxLookupReached, DL, &AC, DT);
-
- // DecomposeGEPExpression and GetUnderlyingObject should return the
- // same result except when DecomposeGEPExpression has no DataLayout.
- // FIXME: They always have a DataLayout so this should become an assert.
- if (GEP1BasePtr != UnderlyingV1) {
- return MayAlias;
- }
// If the max search depth is reached the result is undefined
if (GEP1MaxLookupReached)
return MayAlias;
@@ -1038,18 +1130,18 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// In the two GEP Case, if there is no difference in the offsets of the
// computed pointers, the resultant pointers are a must alias. This
- // hapens when we have two lexically identical GEP's (for example).
+ // happens when we have two lexically identical GEP's (for example).
//
// In the other case, if we have getelementptr <ptr>, 0, 0, 0, 0, ... and V2
// must aliases the GEP, the end result is a must alias also.
- if (GEP1BaseOffset == 0 && GEP1VariableIndices.empty())
+ if (GEP1BaseOffset == 0 && DecompGEP1.VarIndices.empty())
return MustAlias;
// If there is a constant difference between the pointers, but the difference
// is less than the size of the associated memory object, then we know
// that the objects are partially overlapping. If the difference is
// greater, we know they do not overlap.
- if (GEP1BaseOffset != 0 && GEP1VariableIndices.empty()) {
+ if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) {
if (GEP1BaseOffset >= 0) {
if (V2Size != MemoryLocation::UnknownSize) {
if ((uint64_t)GEP1BaseOffset < V2Size)
@@ -1074,22 +1166,22 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
}
}
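A hedged sketch of this constant-difference case (the negative branch is not shown in the hunk above; it is assumed here to mirror this one, comparing against V1Size): once the variable indices cancel, GEP1 is exactly V2 plus a constant, and overlap reduces to an interval check.

    #include <cstdint>

    enum Result { NoAliasR, PartialAliasR, MayAliasR };

    // Stand-in for MemoryLocation::UnknownSize.
    static const uint64_t UnknownSize = ~UINT64_C(0);

    static Result constantDifference(int64_t Off, uint64_t V1Size, uint64_t V2Size) {
      if (Off >= 0) {
        if (V2Size == UnknownSize)
          return MayAliasR;
        return (uint64_t)Off < V2Size ? PartialAliasR : NoAliasR;  // GEP1 starts inside V2?
      }
      if (V1Size == UnknownSize)
        return MayAliasR;
      return (-(uint64_t)Off) < V1Size ? PartialAliasR : NoAliasR; // V2 starts inside GEP1?
    }

    // constantDifference(4, 4, 8)  -> PartialAliasR: GEP1 begins 4 bytes into V2's 8.
    // constantDifference(12, 4, 8) -> NoAliasR: GEP1 begins past the end of V2's access.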
- if (!GEP1VariableIndices.empty()) {
+ if (!DecompGEP1.VarIndices.empty()) {
uint64_t Modulo = 0;
bool AllPositive = true;
- for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) {
+ for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) {
// Try to distinguish something like &A[i][1] against &A[42][0].
// Grab the least significant bit set in any of the scales. We
// don't need std::abs here (even if the scale's negative) as we'll
// be ^'ing Modulo with itself later.
- Modulo |= (uint64_t)GEP1VariableIndices[i].Scale;
+ Modulo |= (uint64_t)DecompGEP1.VarIndices[i].Scale;
if (AllPositive) {
// If the Value could change between cycles, then any reasoning about
// the Value this cycle may not hold in the next cycle. We'll just
// give up if we can't determine conditions that hold for every cycle:
- const Value *V = GEP1VariableIndices[i].V;
+ const Value *V = DecompGEP1.VarIndices[i].V;
bool SignKnownZero, SignKnownOne;
ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
@@ -1097,14 +1189,14 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
- bool IsZExt = GEP1VariableIndices[i].ZExtBits > 0 || isa<ZExtInst>(V);
+ bool IsZExt = DecompGEP1.VarIndices[i].ZExtBits > 0 || isa<ZExtInst>(V);
SignKnownZero |= IsZExt;
SignKnownOne &= !IsZExt;
// If the variable begins with a zero then we know it's
// positive, regardless of whether the value is signed or
// unsigned.
- int64_t Scale = GEP1VariableIndices[i].Scale;
+ int64_t Scale = DecompGEP1.VarIndices[i].Scale;
AllPositive =
(SignKnownZero && Scale >= 0) || (SignKnownOne && Scale < 0);
}
@@ -1127,7 +1219,7 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t)GEP1BaseOffset)
return NoAlias;
- if (constantOffsetHeuristic(GEP1VariableIndices, V1Size, V2Size,
+ if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size,
GEP1BaseOffset, &AC, DT))
return NoAlias;
}
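The "least significant bit" comment refers to unchanged code that falls between the hunks shown: Modulo is reduced to its lowest set bit and used as a guaranteed stride of the variable part of the offset. A hedged sketch, with the &A[i][1] vs. &A[42][0] example worked out assuming 4-byte ints in 8-byte rows:

    #include <cstdint>

    static bool distinctModuloStride(uint64_t OredScales, int64_t BaseOff,
                                     uint64_t V1Size, uint64_t V2Size) {
      // Keep only the lowest set bit of the OR of all scales: every value the
      // variable part can contribute is a multiple of this power of two.
      uint64_t Modulo = OredScales ^ (OredScales & (OredScales - 1));
      uint64_t ModOffset = (uint64_t)BaseOff & (Modulo - 1);
      // The two accesses cannot collide if their residues modulo the stride
      // occupy disjoint intervals.
      return ModOffset >= V2Size && V1Size <= Modulo - ModOffset;
    }

    // &A[i][1] vs. &A[42][0]: scale 8, constant offset difference 4 - 336, so
    // Modulo = 8 and ModOffset = 4; distinctModuloStride(8, 4 - 336, 4, 4) is
    // true, and the two 4-byte accesses are NoAlias.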
@@ -1312,7 +1404,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
return NoAlias;
// Are we checking for alias of the same value?
- // Because we look 'through' phi nodes we could look at "Value" pointers from
+ // Because we look 'through' phi nodes, we could look at "Value" pointers from
// different iterations. We must therefore make sure that this is not the
// case. The function isValueEqualInPotentialCycles ensures that this cannot
// happen by looking at the visited phi nodes and making sure they cannot
@@ -1337,7 +1429,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
return NoAlias;
if (O1 != O2) {
- // If V1/V2 point to two different objects we know that we have no alias.
+ // If V1/V2 point to two different objects, we know that we have no alias.
if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
return NoAlias;
@@ -1430,8 +1522,7 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
}
// If both pointers are pointing into the same object and one of them
- // accesses is accessing the entire object, then the accesses must
- // overlap in some way.
+ // accesses the entire object, then the accesses must overlap in some way.
if (O1 == O2)
if ((V1Size != MemoryLocation::UnknownSize &&
isObjectSize(O1, V1Size, DL, TLI)) ||
@@ -1544,7 +1635,8 @@ bool BasicAAResult::constantOffsetHeuristic(
unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0;
const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits,
V0SExtBits, DL, 0, AC, DT, NSW, NUW);
- NSW = true, NUW = true;
+ NSW = true;
+ NUW = true;
const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits,
V1SExtBits, DL, 0, AC, DT, NSW, NUW);
@@ -1577,12 +1669,12 @@ bool BasicAAResult::constantOffsetHeuristic(
char BasicAA::PassID;
-BasicAAResult BasicAA::run(Function &F, AnalysisManager<Function> *AM) {
+BasicAAResult BasicAA::run(Function &F, AnalysisManager<Function> &AM) {
return BasicAAResult(F.getParent()->getDataLayout(),
- AM->getResult<TargetLibraryAnalysis>(F),
- AM->getResult<AssumptionAnalysis>(F),
- AM->getCachedResult<DominatorTreeAnalysis>(F),
- AM->getCachedResult<LoopAnalysis>(F));
+ AM.getResult<TargetLibraryAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F),
+ &AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getCachedResult<LoopAnalysis>(F));
}
BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) {
@@ -1595,6 +1687,7 @@ void BasicAAWrapperPass::anchor() {}
INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa",
"Basic Alias Analysis (stateless AA impl)", true, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa",
"Basic Alias Analysis (stateless AA impl)", true, true)
@@ -1606,12 +1699,11 @@ FunctionPass *llvm::createBasicAAWrapperPass() {
bool BasicAAWrapperPass::runOnFunction(Function &F) {
auto &ACT = getAnalysis<AssumptionCacheTracker>();
auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), TLIWP.getTLI(),
- ACT.getAssumptionCache(F),
- DTWP ? &DTWP->getDomTree() : nullptr,
+ ACT.getAssumptionCache(F), &DTWP.getDomTree(),
LIWP ? &LIWP->getLoopInfo() : nullptr));
return false;
@@ -1620,6 +1712,7 @@ bool BasicAAWrapperPass::runOnFunction(Function &F) {
void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 90b7a339a0fe..1dd8f4fdfcfe 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -27,24 +27,34 @@ using namespace llvm;
#define DEBUG_TYPE "block-freq"
#ifndef NDEBUG
-enum GVDAGType {
- GVDT_None,
- GVDT_Fraction,
- GVDT_Integer
-};
-
-static cl::opt<GVDAGType>
-ViewBlockFreqPropagationDAG("view-block-freq-propagation-dags", cl::Hidden,
- cl::desc("Pop up a window to show a dag displaying how block "
- "frequencies propagation through the CFG."),
- cl::values(
- clEnumValN(GVDT_None, "none",
- "do not display graphs."),
- clEnumValN(GVDT_Fraction, "fraction", "display a graph using the "
- "fractional block frequency representation."),
- clEnumValN(GVDT_Integer, "integer", "display a graph using the raw "
- "integer fractional block frequency representation."),
- clEnumValEnd));
+static cl::opt<GVDAGType> ViewBlockFreqPropagationDAG(
+ "view-block-freq-propagation-dags", cl::Hidden,
+ cl::desc("Pop up a window to show a dag displaying how block "
+ "frequencies propagation through the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count", "display a graph using the real "
+ "profile count if available."),
+ clEnumValEnd));
+
+cl::opt<std::string>
+ ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden,
+ cl::desc("The option to specify "
+ "the name of the function "
+ "whose CFG will be displayed."));
+
+cl::opt<unsigned>
+ ViewHotFreqPercent("view-hot-freq-percent", cl::init(10), cl::Hidden,
+ cl::desc("An integer in percent used to specify "
+ "the hot blocks/edges to be displayed "
+ "in red: a block or edge whose frequency "
+ "is no less than the max frequency of the "
+ "function multiplied by this percent."));
namespace llvm {
@@ -71,34 +81,31 @@ struct GraphTraits<BlockFrequencyInfo *> {
}
};
-template<>
-struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits {
- explicit DOTGraphTraits(bool isSimple=false) :
- DefaultDOTGraphTraits(isSimple) {}
+typedef BFIDOTGraphTraitsBase<BlockFrequencyInfo, BranchProbabilityInfo>
+ BFIDOTGTraitsBase;
- static std::string getGraphName(const BlockFrequencyInfo *G) {
- return G->getFunction()->getName();
- }
+template <>
+struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : BFIDOTGTraitsBase(isSimple) {}
std::string getNodeLabel(const BasicBlock *Node,
const BlockFrequencyInfo *Graph) {
- std::string Result;
- raw_string_ostream OS(Result);
-
- OS << Node->getName() << ":";
- switch (ViewBlockFreqPropagationDAG) {
- case GVDT_Fraction:
- Graph->printBlockFreq(OS, Node);
- break;
- case GVDT_Integer:
- OS << Graph->getBlockFreq(Node).getFrequency();
- break;
- case GVDT_None:
- llvm_unreachable("If we are not supposed to render a graph we should "
- "never reach this point.");
- }
-
- return Result;
+
+ return BFIDOTGTraitsBase::getNodeLabel(Node, Graph,
+ ViewBlockFreqPropagationDAG);
+ }
+
+ std::string getNodeAttributes(const BasicBlock *Node,
+ const BlockFrequencyInfo *Graph) {
+ return BFIDOTGTraitsBase::getNodeAttributes(Node, Graph,
+ ViewHotFreqPercent);
+ }
+
+ std::string getEdgeAttributes(const BasicBlock *Node, EdgeIter EI,
+ const BlockFrequencyInfo *BFI) {
+ return BFIDOTGTraitsBase::getEdgeAttributes(Node, EI, BFI, BFI->getBPI(),
+ ViewHotFreqPercent);
}
};
@@ -113,6 +120,21 @@ BlockFrequencyInfo::BlockFrequencyInfo(const Function &F,
calculate(F, BPI, LI);
}
+BlockFrequencyInfo::BlockFrequencyInfo(BlockFrequencyInfo &&Arg)
+ : BFI(std::move(Arg.BFI)) {}
+
+BlockFrequencyInfo &BlockFrequencyInfo::operator=(BlockFrequencyInfo &&RHS) {
+ releaseMemory();
+ BFI = std::move(RHS.BFI);
+ return *this;
+}
+
+// Explicitly define the default destructor; otherwise it would be implicitly
+// defined at the first ODR-use which is the BFI member in the
+// LazyBlockFrequencyInfo header. The dtor needs the BlockFrequencyInfoImpl
+// template instantiated which is not available in the header.
+BlockFrequencyInfo::~BlockFrequencyInfo() {}
+
void BlockFrequencyInfo::calculate(const Function &F,
const BranchProbabilityInfo &BPI,
const LoopInfo &LI) {
@@ -120,8 +142,11 @@ void BlockFrequencyInfo::calculate(const Function &F,
BFI.reset(new ImplType);
BFI->calculate(F, BPI, LI);
#ifndef NDEBUG
- if (ViewBlockFreqPropagationDAG != GVDT_None)
+ if (ViewBlockFreqPropagationDAG != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
view();
+ }
#endif
}
@@ -129,8 +154,15 @@ BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
return BFI ? BFI->getBlockFreq(BB) : 0;
}
-void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB,
- uint64_t Freq) {
+Optional<uint64_t>
+BlockFrequencyInfo::getBlockProfileCount(const BasicBlock *BB) const {
+ if (!BFI)
+ return None;
+
+ return BFI->getBlockProfileCount(*getFunction(), BB);
+}
+
+void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
assert(BFI && "Expected analysis to be available");
BFI->setBlockFreq(BB, Freq);
}
@@ -151,6 +183,10 @@ const Function *BlockFrequencyInfo::getFunction() const {
return BFI ? BFI->getFunction() : nullptr;
}
+const BranchProbabilityInfo *BlockFrequencyInfo::getBPI() const {
+ return BFI ? &BFI->getBPI() : nullptr;
+}
+
raw_ostream &BlockFrequencyInfo::
printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const {
return BFI ? BFI->printBlockFreq(OS, Freq) : OS;
@@ -211,3 +247,21 @@ bool BlockFrequencyInfoWrapperPass::runOnFunction(Function &F) {
BFI.calculate(F, BPI, LI);
return false;
}
+
+char BlockFrequencyAnalysis::PassID;
+BlockFrequencyInfo BlockFrequencyAnalysis::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ BlockFrequencyInfo BFI;
+ BFI.calculate(F, AM.getResult<BranchProbabilityAnalysis>(F),
+ AM.getResult<LoopAnalysis>(F));
+ return BFI;
+}
+
+PreservedAnalyses
+BlockFrequencyPrinterPass::run(Function &F, AnalysisManager<Function> &AM) {
+ OS << "Printing analysis results of BFI for function "
+ << "'" << F.getName() << "':"
+ << "\n";
+ AM.getResult<BlockFrequencyAnalysis>(F).print(OS);
+ return PreservedAnalyses::all();
+}
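For context on how these new-pass-manager pieces are meant to be consumed, here is a hedged usage sketch. MyPass is a hypothetical pass; only the interfaces visible in this patch (getResult, getBlockFreq, getBlockProfileCount) plus the standard PassInfoMixin boilerplate are assumed.

    #include "llvm/ADT/Optional.h"
    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    struct MyPass : PassInfoMixin<MyPass> {
      PreservedAnalyses run(Function &F, AnalysisManager<Function> &AM) {
        BlockFrequencyInfo &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
        for (BasicBlock &BB : F) {
          // Relative frequency is always available once BFI is computed; a real
          // count exists only when the function carries a profile entry count.
          uint64_t Freq = BFI.getBlockFreq(&BB).getFrequency();
          if (Optional<uint64_t> Count = BFI.getBlockProfileCount(&BB))
            (void)*Count;
          (void)Freq;
        }
        return PreservedAnalyses::all();
      }
    };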
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 48e23af2690a..90bc249bcb39 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -13,6 +13,7 @@
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
#include <numeric>
@@ -27,7 +28,7 @@ ScaledNumber<uint64_t> BlockMass::toScaled() const {
return ScaledNumber<uint64_t>(getMass() + 1, -64);
}
-void BlockMass::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
static char getHexDigit(int N) {
assert(N < 16);
@@ -35,6 +36,7 @@ static char getHexDigit(int N) {
return '0' + N;
return 'a' + N - 10;
}
+
raw_ostream &BlockMass::print(raw_ostream &OS) const {
for (int Digits = 0; Digits < 16; ++Digits)
OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
@@ -78,7 +80,7 @@ struct DitheringDistributer {
BlockMass takeMass(uint32_t Weight);
};
-} // end namespace
+} // end anonymous namespace
DitheringDistributer::DitheringDistributer(Distribution &Dist,
const BlockMass &Mass) {
@@ -130,6 +132,7 @@ static void combineWeight(Weight &W, const Weight &OtherW) {
else
W.Amount += OtherW.Amount;
}
+
static void combineWeightsBySorting(WeightList &Weights) {
// Sort so edges to the same node are adjacent.
std::sort(Weights.begin(), Weights.end(),
@@ -149,8 +152,8 @@ static void combineWeightsBySorting(WeightList &Weights) {
// Erase extra entries.
Weights.erase(O, Weights.end());
- return;
}
+
static void combineWeightsByHashing(WeightList &Weights) {
// Collect weights into a DenseMap.
typedef DenseMap<BlockNode::IndexType, Weight> HashTable;
@@ -168,6 +171,7 @@ static void combineWeightsByHashing(WeightList &Weights) {
for (const auto &I : Combined)
Weights.push_back(I.second);
}
+
static void combineWeights(WeightList &Weights) {
// Use a hash table for many successors to keep this linear.
if (Weights.size() > 128) {
@@ -177,6 +181,7 @@ static void combineWeights(WeightList &Weights) {
combineWeightsBySorting(Weights);
}
+
static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
assert(Shift >= 0);
assert(Shift < 64);
@@ -184,6 +189,7 @@ static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
return N;
return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
}
+
void Distribution::normalize() {
// Early exit for termination nodes.
if (Weights.empty())
@@ -345,7 +351,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
// replaced to be the maximum of all computed scales plus 1. This would
// appropriately describe the loop as having a large scale, without skewing
// the final frequency computation.
- const Scaled64 InifiniteLoopScale(1, 12);
+ const Scaled64 InfiniteLoopScale(1, 12);
// LoopScale == 1 / ExitMass
// ExitMass == HeadMass - BackedgeMass
@@ -358,7 +364,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
// its exit mass will be zero. In this case, use an arbitrary scale for the
// loop scale.
Loop.Scale =
- ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
+ ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse();
DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
<< " - " << TotalBackedgeMass << ")\n"
@@ -523,6 +529,22 @@ BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
return 0;
return Freqs[Node.Index].Integer;
}
+
+Optional<uint64_t>
+BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F,
+ const BlockNode &Node) const {
+ auto EntryCount = F.getEntryCount();
+ if (!EntryCount)
+ return None;
+ // Use 128 bit APInt to do the arithmetic to avoid overflow.
+ APInt BlockCount(128, EntryCount.getValue());
+ APInt BlockFreq(128, getBlockFreq(Node).getFrequency());
+ APInt EntryFreq(128, getEntryFreq());
+ BlockCount *= BlockFreq;
+ BlockCount = BlockCount.udiv(EntryFreq);
+ return BlockCount.getLimitedValue();
+}
+
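The count computed above is EntryCount * BlockFreq / EntryFreq, done in 128 bits so the 64-bit product cannot overflow and then clamped back to 64 bits by getLimitedValue(). A hedged standalone sketch of the same arithmetic using the __int128 compiler extension instead of APInt:

    #include <cstdint>

    static uint64_t blockProfileCount(uint64_t EntryCount, uint64_t BlockFreq,
                                      uint64_t EntryFreq) {
      // Widen before multiplying: EntryCount * BlockFreq may exceed 64 bits.
      unsigned __int128 Count = (unsigned __int128)EntryCount * BlockFreq;
      Count /= EntryFreq; // nonzero for any computed BFI (the entry block has mass)
      // Mirror APInt::getLimitedValue(): clamp to the largest uint64_t.
      return Count > UINT64_MAX ? UINT64_MAX : (uint64_t)Count;
    }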
Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
if (!Node.isValid())
@@ -541,6 +563,7 @@ std::string
BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
return std::string();
}
+
std::string
BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*");
@@ -568,6 +591,7 @@ void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
addNode(N);
indexNodes();
}
+
void IrreducibleGraph::addNodesInFunction() {
Start = 0;
for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
@@ -575,10 +599,12 @@ void IrreducibleGraph::addNodesInFunction() {
addNode(Index);
indexNodes();
}
+
void IrreducibleGraph::indexNodes() {
for (auto &I : Nodes)
Lookup[I.Node.Index] = &I;
}
+
void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ,
const BFIBase::LoopData *OuterLoop) {
if (OuterLoop && OuterLoop->isHeader(Succ))
@@ -605,7 +631,7 @@ template <> struct GraphTraits<IrreducibleGraph> {
static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); }
static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); }
};
-}
+} // end namespace llvm
/// \brief Find extra irreducible headers.
///
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index cf0cc8da6ef8..d802552d4e29 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -112,10 +112,15 @@ static const uint32_t IH_NONTAKEN_WEIGHT = 1;
///
/// Predict that a successor which leads necessarily to an
/// unreachable-terminated block as extremely unlikely.
-bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
- TerminatorInst *TI = BB->getTerminator();
+bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 0) {
- if (isa<UnreachableInst>(TI))
+ if (isa<UnreachableInst>(TI) ||
+ // If this block is terminated by a call to
+ // @llvm.experimental.deoptimize then treat it like an unreachable since
+ // the @llvm.experimental.deoptimize call is expected to practically
+ // never execute.
+ BB->getTerminatingDeoptimizeCall())
PostDominatedByUnreachable.insert(BB);
return false;
}
@@ -123,7 +128,7 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
SmallVector<unsigned, 4> UnreachableEdges;
SmallVector<unsigned, 4> ReachableEdges;
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (PostDominatedByUnreachable.count(*I))
UnreachableEdges.push_back(I.getSuccessorIndex());
else
@@ -174,8 +179,8 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
// Propagate existing explicit probabilities from either profile data or
// 'expect' intrinsic processing.
-bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
- TerminatorInst *TI = BB->getTerminator();
+bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 1)
return false;
if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
@@ -244,15 +249,15 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
///
/// Return true if we could compute the weights for cold edges.
/// Return false, otherwise.
-bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) {
- TerminatorInst *TI = BB->getTerminator();
+bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 0)
return false;
// Determine which successors are post-dominated by a cold block.
SmallVector<unsigned, 4> ColdEdges;
SmallVector<unsigned, 4> NormalEdges;
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
if (PostDominatedByColdCall.count(*I))
ColdEdges.push_back(I.getSuccessorIndex());
else
@@ -266,8 +271,8 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) {
// Otherwise, if the block itself contains a cold function, add it to the
// set of blocks postdominated by a cold call.
assert(!PostDominatedByColdCall.count(BB));
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (CallInst *CI = dyn_cast<CallInst>(I))
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
if (CI->hasFnAttr(Attribute::Cold)) {
PostDominatedByColdCall.insert(BB);
break;
@@ -302,8 +307,8 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) {
// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
// between two pointer or pointer and NULL will fail.
-bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) {
- BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) {
+ const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -337,7 +342,7 @@ bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) {
// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
// as taken, exiting edges as not-taken.
-bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB,
+bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
const LoopInfo &LI) {
Loop *L = LI.getLoopFor(BB);
if (!L)
@@ -347,7 +352,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB,
SmallVector<unsigned, 8> ExitingEdges;
SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (!L->contains(*I))
ExitingEdges.push_back(I.getSuccessorIndex());
else if (L->getHeader() == *I)
@@ -361,7 +366,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB,
// Collect the sum of probabilities of back-edges/in-edges/exiting-edges, and
// normalize them so that they sum up to one.
- SmallVector<BranchProbability, 4> Probs(3, BranchProbability::getZero());
+ BranchProbability Probs[] = {BranchProbability::getZero(),
+ BranchProbability::getZero(),
+ BranchProbability::getZero()};
unsigned Denom = (BackEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) +
(InEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) +
(ExitingEdges.empty() ? 0 : LBH_NONTAKEN_WEIGHT);
@@ -393,8 +400,8 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB,
return true;
}
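A hedged worked example of the normalization this hunk feeds, assuming the file's LBH_TAKEN_WEIGHT and LBH_NONTAKEN_WEIGHT constants keep their usual values of 124 and 4 (they are defined above the excerpt shown here): a latch with one back edge to the header and one exiting edge gets Denom = 128, i.e. roughly 97% stay-in-loop and 3% exit.

    #include "llvm/Support/BranchProbability.h"
    using namespace llvm;

    void latchExample() {
      const unsigned TakenWeight = 124, NonTakenWeight = 4; // assumed LBH_* values
      unsigned Denom = TakenWeight /*back edges present*/ +
                       NonTakenWeight /*exiting edges present*/;
      BranchProbability BackEdge(TakenWeight, Denom); // ~96.9%
      BranchProbability Exit(NonTakenWeight, Denom);  // ~3.1%
      (void)BackEdge; (void)Exit;
    }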
-bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
- BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB) {
+ const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -476,8 +483,8 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
return true;
}
-bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) {
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) {
+ const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -513,8 +520,8 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) {
return true;
}
-bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) {
- InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator());
+bool BranchProbabilityInfo::calcInvokeHeuristics(const BasicBlock *BB) {
+ const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator());
if (!II)
return false;
@@ -549,12 +556,13 @@ isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const {
return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
}
-BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
+const BasicBlock *
+BranchProbabilityInfo::getHotSucc(const BasicBlock *BB) const {
auto MaxProb = BranchProbability::getZero();
- BasicBlock *MaxSucc = nullptr;
+ const BasicBlock *MaxSucc = nullptr;
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
- BasicBlock *Succ = *I;
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ const BasicBlock *Succ = *I;
auto Prob = getEdgeProbability(BB, Succ);
if (Prob > MaxProb) {
MaxProb = Prob;
@@ -616,6 +624,7 @@ void BranchProbabilityInfo::setEdgeProbability(const BasicBlock *Src,
unsigned IndexInSuccessors,
BranchProbability Prob) {
Probs[std::make_pair(Src, IndexInSuccessors)] = Prob;
+ Handles.insert(BasicBlockCallbackVH(Src, this));
DEBUG(dbgs() << "set edge " << Src->getName() << " -> " << IndexInSuccessors
<< " successor probability to " << Prob << "\n");
}
@@ -633,7 +642,15 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
return OS;
}
-void BranchProbabilityInfo::calculate(Function &F, const LoopInfo& LI) {
+void BranchProbabilityInfo::eraseBlock(const BasicBlock *BB) {
+ for (auto I = Probs.begin(), E = Probs.end(); I != E; ++I) {
+ auto Key = I->first;
+ if (Key.first == BB)
+ Probs.erase(Key);
+ }
+}
+
+void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
<< " ----\n\n");
LastF = &F; // Store the last function we ran on for printing.
@@ -683,3 +700,20 @@ void BranchProbabilityInfoWrapperPass::print(raw_ostream &OS,
const Module *) const {
BPI.print(OS);
}
+
+char BranchProbabilityAnalysis::PassID;
+BranchProbabilityInfo
+BranchProbabilityAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+ BranchProbabilityInfo BPI;
+ BPI.calculate(F, AM.getResult<LoopAnalysis>(F));
+ return BPI;
+}
+
+PreservedAnalyses
+BranchProbabilityPrinterPass::run(Function &F, AnalysisManager<Function> &AM) {
+ OS << "Printing analysis results of BPI for function "
+ << "'" << F.getName() << "':"
+ << "\n";
+ AM.getResult<BranchProbabilityAnalysis>(F).print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index 0dfd57d3cb6b..a319be8092f9 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -138,7 +138,7 @@ bool llvm::isPotentiallyReachableFromMany(
// Limit the number of blocks we visit. The goal is to avoid run-away compile
// times on large CFGs without hampering sensible code. Arbitrarily chosen.
unsigned Limit = 32;
- SmallSet<const BasicBlock*, 64> Visited;
+ SmallPtrSet<const BasicBlock*, 32> Visited;
do {
BasicBlock *BB = Worklist.pop_back_val();
if (!Visited.insert(BB).second)
diff --git a/lib/Analysis/CFLAliasAnalysis.cpp b/lib/Analysis/CFLAliasAnalysis.cpp
deleted file mode 100644
index 4843ed6587a8..000000000000
--- a/lib/Analysis/CFLAliasAnalysis.cpp
+++ /dev/null
@@ -1,1119 +0,0 @@
-//===- CFLAliasAnalysis.cpp - CFL-Based Alias Analysis Implementation ------==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a CFL-based context-insensitive alias analysis
-// algorithm. It does not depend on types. The algorithm is a mixture of the one
-// described in "Demand-driven alias analysis for C" by Xin Zheng and Radu
-// Rugina, and "Fast algorithms for Dyck-CFL-reachability with applications to
-// Alias Analysis" by Zhang Q, Lyu M R, Yuan H, and Su Z. -- to summarize the
-// papers, we build a graph of the uses of a variable, where each node is a
-// memory location, and each edge is an action that happened on that memory
-// location. The "actions" can be one of Dereference, Reference, or Assign.
-//
-// Two variables are considered as aliasing iff you can reach one value's node
-// from the other value's node and the language formed by concatenating all of
-// the edge labels (actions) conforms to a context-free grammar.
-//
-// Because this algorithm requires a graph search on each query, we execute the
-// algorithm outlined in "Fast algorithms..." (mentioned above)
-// in order to transform the graph into sets of variables that may alias in
-// ~nlogn time (n = number of variables.), which makes queries take constant
-// time.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/CFLAliasAnalysis.h"
-#include "StratifiedSets.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <memory>
-#include <tuple>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "cfl-aa"
-
-CFLAAResult::CFLAAResult(const TargetLibraryInfo &TLI) : AAResultBase(TLI) {}
-CFLAAResult::CFLAAResult(CFLAAResult &&Arg) : AAResultBase(std::move(Arg)) {}
-
-// \brief Information we have about a function and would like to keep around
-struct CFLAAResult::FunctionInfo {
- StratifiedSets<Value *> Sets;
- // Lots of functions have < 4 returns. Adjust as necessary.
- SmallVector<Value *, 4> ReturnedValues;
-
- FunctionInfo(StratifiedSets<Value *> &&S, SmallVector<Value *, 4> &&RV)
- : Sets(std::move(S)), ReturnedValues(std::move(RV)) {}
-};
-
-// Try to go from a Value* to a Function*. Never returns nullptr.
-static Optional<Function *> parentFunctionOfValue(Value *);
-
-// Returns possible functions called by the Inst* into the given
-// SmallVectorImpl. Returns true if targets found, false otherwise.
-// This is templated because InvokeInst/CallInst give us the same
-// set of functions that we care about, and I don't like repeating
-// myself.
-template <typename Inst>
-static bool getPossibleTargets(Inst *, SmallVectorImpl<Function *> &);
-
-// Some instructions need to have their users tracked. Instructions like
-// `add` require you to get the users of the Instruction* itself, other
-// instructions like `store` require you to get the users of the first
-// operand. This function gets the "proper" value to track for each
-// type of instruction we support.
-static Optional<Value *> getTargetValue(Instruction *);
-
-// There are certain instructions (i.e. FenceInst, etc.) that we ignore.
-// This notes that we should ignore those.
-static bool hasUsefulEdges(Instruction *);
-
-const StratifiedIndex StratifiedLink::SetSentinel =
- std::numeric_limits<StratifiedIndex>::max();
-
-namespace {
-// StratifiedInfo Attribute things.
-typedef unsigned StratifiedAttr;
-LLVM_CONSTEXPR unsigned MaxStratifiedAttrIndex = NumStratifiedAttrs;
-LLVM_CONSTEXPR unsigned AttrAllIndex = 0;
-LLVM_CONSTEXPR unsigned AttrGlobalIndex = 1;
-LLVM_CONSTEXPR unsigned AttrUnknownIndex = 2;
-LLVM_CONSTEXPR unsigned AttrFirstArgIndex = 3;
-LLVM_CONSTEXPR unsigned AttrLastArgIndex = MaxStratifiedAttrIndex;
-LLVM_CONSTEXPR unsigned AttrMaxNumArgs = AttrLastArgIndex - AttrFirstArgIndex;
-
-LLVM_CONSTEXPR StratifiedAttr AttrNone = 0;
-LLVM_CONSTEXPR StratifiedAttr AttrUnknown = 1 << AttrUnknownIndex;
-LLVM_CONSTEXPR StratifiedAttr AttrAll = ~AttrNone;
-
-// \brief StratifiedSets call for knowledge of "direction", so this is how we
-// represent that locally.
-enum class Level { Same, Above, Below };
-
-// \brief Edges can be one of four "weights" -- each weight must have an inverse
-// weight (Assign has Assign; Reference has Dereference).
-enum class EdgeType {
- // The weight assigned when assigning from or to a value. For example, in:
- // %b = getelementptr %a, 0
- // ...The relationships are %b assign %a, and %a assign %b. This used to be
- // two edges, but having a distinction bought us nothing.
- Assign,
-
- // The edge used when we have an edge going from some handle to a Value.
- // Examples of this include:
- // %b = load %a (%b Dereference %a)
- // %b = extractelement %a, 0 (%a Dereference %b)
- Dereference,
-
- // The edge used when our edge goes from a value to a handle that may have
- // contained it at some point. Examples:
- // %b = load %a (%a Reference %b)
- // %b = extractelement %a, 0 (%b Reference %a)
- Reference
-};
-
-// \brief Encodes the notion of a "use"
-struct Edge {
- // \brief Which value the edge is coming from
- Value *From;
-
- // \brief Which value the edge is pointing to
- Value *To;
-
- // \brief Edge weight
- EdgeType Weight;
-
- // \brief Whether we aliased any external values along the way that may be
- // invisible to the analysis (i.e. landingpad for exceptions, calls for
- // interprocedural analysis, etc.)
- StratifiedAttrs AdditionalAttrs;
-
- Edge(Value *From, Value *To, EdgeType W, StratifiedAttrs A)
- : From(From), To(To), Weight(W), AdditionalAttrs(A) {}
-};
-
-// \brief Gets the edges our graph should have, based on an Instruction*
-class GetEdgesVisitor : public InstVisitor<GetEdgesVisitor, void> {
- CFLAAResult &AA;
- SmallVectorImpl<Edge> &Output;
-
-public:
- GetEdgesVisitor(CFLAAResult &AA, SmallVectorImpl<Edge> &Output)
- : AA(AA), Output(Output) {}
-
- void visitInstruction(Instruction &) {
- llvm_unreachable("Unsupported instruction encountered");
- }
-
- void visitPtrToIntInst(PtrToIntInst &Inst) {
- auto *Ptr = Inst.getOperand(0);
- Output.push_back(Edge(Ptr, Ptr, EdgeType::Assign, AttrUnknown));
- }
-
- void visitIntToPtrInst(IntToPtrInst &Inst) {
- auto *Ptr = &Inst;
- Output.push_back(Edge(Ptr, Ptr, EdgeType::Assign, AttrUnknown));
- }
-
- void visitCastInst(CastInst &Inst) {
- Output.push_back(
- Edge(&Inst, Inst.getOperand(0), EdgeType::Assign, AttrNone));
- }
-
- void visitBinaryOperator(BinaryOperator &Inst) {
- auto *Op1 = Inst.getOperand(0);
- auto *Op2 = Inst.getOperand(1);
- Output.push_back(Edge(&Inst, Op1, EdgeType::Assign, AttrNone));
- Output.push_back(Edge(&Inst, Op2, EdgeType::Assign, AttrNone));
- }
-
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &Inst) {
- auto *Ptr = Inst.getPointerOperand();
- auto *Val = Inst.getNewValOperand();
- Output.push_back(Edge(Ptr, Val, EdgeType::Dereference, AttrNone));
- }
-
- void visitAtomicRMWInst(AtomicRMWInst &Inst) {
- auto *Ptr = Inst.getPointerOperand();
- auto *Val = Inst.getValOperand();
- Output.push_back(Edge(Ptr, Val, EdgeType::Dereference, AttrNone));
- }
-
- void visitPHINode(PHINode &Inst) {
- for (Value *Val : Inst.incoming_values()) {
- Output.push_back(Edge(&Inst, Val, EdgeType::Assign, AttrNone));
- }
- }
-
- void visitGetElementPtrInst(GetElementPtrInst &Inst) {
- auto *Op = Inst.getPointerOperand();
- Output.push_back(Edge(&Inst, Op, EdgeType::Assign, AttrNone));
- for (auto I = Inst.idx_begin(), E = Inst.idx_end(); I != E; ++I)
- Output.push_back(Edge(&Inst, *I, EdgeType::Assign, AttrNone));
- }
-
- void visitSelectInst(SelectInst &Inst) {
- // Condition is not processed here (The actual statement producing
- // the condition result is processed elsewhere). For select, the
- // condition is evaluated, but not loaded, stored, or assigned
- // simply as a result of being the condition of a select.
-
- auto *TrueVal = Inst.getTrueValue();
- Output.push_back(Edge(&Inst, TrueVal, EdgeType::Assign, AttrNone));
- auto *FalseVal = Inst.getFalseValue();
- Output.push_back(Edge(&Inst, FalseVal, EdgeType::Assign, AttrNone));
- }
-
- void visitAllocaInst(AllocaInst &) {}
-
- void visitLoadInst(LoadInst &Inst) {
- auto *Ptr = Inst.getPointerOperand();
- auto *Val = &Inst;
- Output.push_back(Edge(Val, Ptr, EdgeType::Reference, AttrNone));
- }
-
- void visitStoreInst(StoreInst &Inst) {
- auto *Ptr = Inst.getPointerOperand();
- auto *Val = Inst.getValueOperand();
- Output.push_back(Edge(Ptr, Val, EdgeType::Dereference, AttrNone));
- }
-
- void visitVAArgInst(VAArgInst &Inst) {
- // We can't fully model va_arg here. For *Ptr = Inst.getOperand(0), it does
- // two things:
- // 1. Loads a value from *((T*)*Ptr).
- // 2. Increments (stores to) *Ptr by some target-specific amount.
- // For now, we'll handle this like a landingpad instruction (by placing the
- // result in its own group, and having that group alias externals).
- auto *Val = &Inst;
- Output.push_back(Edge(Val, Val, EdgeType::Assign, AttrAll));
- }
-
- static bool isFunctionExternal(Function *Fn) {
- return Fn->isDeclaration() || !Fn->hasLocalLinkage();
- }
-
- // Gets whether the sets at Index1 above, below, or equal to the sets at
- // Index2. Returns None if they are not in the same set chain.
- static Optional<Level> getIndexRelation(const StratifiedSets<Value *> &Sets,
- StratifiedIndex Index1,
- StratifiedIndex Index2) {
- if (Index1 == Index2)
- return Level::Same;
-
- const auto *Current = &Sets.getLink(Index1);
- while (Current->hasBelow()) {
- if (Current->Below == Index2)
- return Level::Below;
- Current = &Sets.getLink(Current->Below);
- }
-
- Current = &Sets.getLink(Index1);
- while (Current->hasAbove()) {
- if (Current->Above == Index2)
- return Level::Above;
- Current = &Sets.getLink(Current->Above);
- }
-
- return NoneType();
- }
-
- bool
- tryInterproceduralAnalysis(const SmallVectorImpl<Function *> &Fns,
- Value *FuncValue,
- const iterator_range<User::op_iterator> &Args) {
- const unsigned ExpectedMaxArgs = 8;
- const unsigned MaxSupportedArgs = 50;
- assert(Fns.size() > 0);
-
- // I put this here to give us an upper bound on time taken by IPA. Is it
- // really (realistically) needed? Keep in mind that we do have an n^2 algo.
- if (std::distance(Args.begin(), Args.end()) > (int)MaxSupportedArgs)
- return false;
-
- // Exit early if we'll fail anyway
- for (auto *Fn : Fns) {
- if (isFunctionExternal(Fn) || Fn->isVarArg())
- return false;
- auto &MaybeInfo = AA.ensureCached(Fn);
- if (!MaybeInfo.hasValue())
- return false;
- }
-
- SmallVector<Value *, ExpectedMaxArgs> Arguments(Args.begin(), Args.end());
- SmallVector<StratifiedInfo, ExpectedMaxArgs> Parameters;
- for (auto *Fn : Fns) {
- auto &Info = *AA.ensureCached(Fn);
- auto &Sets = Info.Sets;
- auto &RetVals = Info.ReturnedValues;
-
- Parameters.clear();
- for (auto &Param : Fn->args()) {
- auto MaybeInfo = Sets.find(&Param);
- // Did a new parameter somehow get added to the function/slip by?
- if (!MaybeInfo.hasValue())
- return false;
- Parameters.push_back(*MaybeInfo);
- }
-
- // Adding an edge from argument -> return value for each parameter that
- // may alias the return value
- for (unsigned I = 0, E = Parameters.size(); I != E; ++I) {
- auto &ParamInfo = Parameters[I];
- auto &ArgVal = Arguments[I];
- bool AddEdge = false;
- StratifiedAttrs Externals;
- for (unsigned X = 0, XE = RetVals.size(); X != XE; ++X) {
- auto MaybeInfo = Sets.find(RetVals[X]);
- if (!MaybeInfo.hasValue())
- return false;
-
- auto &RetInfo = *MaybeInfo;
- auto RetAttrs = Sets.getLink(RetInfo.Index).Attrs;
- auto ParamAttrs = Sets.getLink(ParamInfo.Index).Attrs;
- auto MaybeRelation =
- getIndexRelation(Sets, ParamInfo.Index, RetInfo.Index);
- if (MaybeRelation.hasValue()) {
- AddEdge = true;
- Externals |= RetAttrs | ParamAttrs;
- }
- }
- if (AddEdge)
- Output.push_back(Edge(FuncValue, ArgVal, EdgeType::Assign,
- StratifiedAttrs().flip()));
- }
-
- if (Parameters.size() != Arguments.size())
- return false;
-
- // Adding edges between arguments for arguments that may end up aliasing
- // each other. This is necessary for functions such as
- // void foo(int** a, int** b) { *a = *b; }
- // (Technically, the proper sets for this would be those below
- // Arguments[I] and Arguments[X], but our algorithm will produce
- // extremely similar, and equally correct, results either way)
- for (unsigned I = 0, E = Arguments.size(); I != E; ++I) {
- auto &MainVal = Arguments[I];
- auto &MainInfo = Parameters[I];
- auto &MainAttrs = Sets.getLink(MainInfo.Index).Attrs;
- for (unsigned X = I + 1; X != E; ++X) {
- auto &SubInfo = Parameters[X];
- auto &SubVal = Arguments[X];
- auto &SubAttrs = Sets.getLink(SubInfo.Index).Attrs;
- auto MaybeRelation =
- getIndexRelation(Sets, MainInfo.Index, SubInfo.Index);
-
- if (!MaybeRelation.hasValue())
- continue;
-
- auto NewAttrs = SubAttrs | MainAttrs;
- Output.push_back(Edge(MainVal, SubVal, EdgeType::Assign, NewAttrs));
- }
- }
- }
- return true;
- }
-
- template <typename InstT> void visitCallLikeInst(InstT &Inst) {
- // TODO: Add support for noalias args/all the other fun function attributes
- // that we can tack on.
- SmallVector<Function *, 4> Targets;
- if (getPossibleTargets(&Inst, Targets)) {
- if (tryInterproceduralAnalysis(Targets, &Inst, Inst.arg_operands()))
- return;
- // Cleanup from interprocedural analysis
- Output.clear();
- }
-
- // Because the function is opaque, we need to note that anything
- // could have happened to the arguments, and that the result could alias
- // just about anything, too.
- // The goal of the loop is in part to unify many Values into one set, so we
- // don't care if the function is void there.
- for (Value *V : Inst.arg_operands())
- Output.push_back(Edge(&Inst, V, EdgeType::Assign, AttrAll));
- if (Inst.getNumArgOperands() == 0 &&
- Inst.getType() != Type::getVoidTy(Inst.getContext()))
- Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll));
- }
-
- void visitCallInst(CallInst &Inst) { visitCallLikeInst(Inst); }
-
- void visitInvokeInst(InvokeInst &Inst) { visitCallLikeInst(Inst); }
-
- // Because vectors/aggregates are immutable and unaddressable,
- // there's nothing we can do to coax a value out of them, other
- // than calling Extract{Element,Value}. We can effectively treat
- // them as pointers to arbitrary memory locations we can store in
- // and load from.
- void visitExtractElementInst(ExtractElementInst &Inst) {
- auto *Ptr = Inst.getVectorOperand();
- auto *Val = &Inst;
- Output.push_back(Edge(Val, Ptr, EdgeType::Reference, AttrNone));
- }
-
- void visitInsertElementInst(InsertElementInst &Inst) {
- auto *Vec = Inst.getOperand(0);
- auto *Val = Inst.getOperand(1);
- Output.push_back(Edge(&Inst, Vec, EdgeType::Assign, AttrNone));
- Output.push_back(Edge(&Inst, Val, EdgeType::Dereference, AttrNone));
- }
-
- void visitLandingPadInst(LandingPadInst &Inst) {
- // Exceptions come from "nowhere", from our analysis' perspective.
- // So we place the instruction its own group, noting that said group may
- // alias externals
- Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll));
- }
-
- void visitInsertValueInst(InsertValueInst &Inst) {
- auto *Agg = Inst.getOperand(0);
- auto *Val = Inst.getOperand(1);
- Output.push_back(Edge(&Inst, Agg, EdgeType::Assign, AttrNone));
- Output.push_back(Edge(&Inst, Val, EdgeType::Dereference, AttrNone));
- }
-
- void visitExtractValueInst(ExtractValueInst &Inst) {
- auto *Ptr = Inst.getAggregateOperand();
- Output.push_back(Edge(&Inst, Ptr, EdgeType::Reference, AttrNone));
- }
-
- void visitShuffleVectorInst(ShuffleVectorInst &Inst) {
- auto *From1 = Inst.getOperand(0);
- auto *From2 = Inst.getOperand(1);
- Output.push_back(Edge(&Inst, From1, EdgeType::Assign, AttrNone));
- Output.push_back(Edge(&Inst, From2, EdgeType::Assign, AttrNone));
- }
-
- void visitConstantExpr(ConstantExpr *CE) {
- switch (CE->getOpcode()) {
- default:
- llvm_unreachable("Unknown instruction type encountered!");
-// Build the switch statement using the Instruction.def file.
-#define HANDLE_INST(NUM, OPCODE, CLASS) \
- case Instruction::OPCODE: \
- visit##OPCODE(*(CLASS *)CE); \
- break;
-#include "llvm/IR/Instruction.def"
- }
- }
-};
-
-// For a given instruction, we need to know which Value* to get the
-// users of in order to build our graph. In some cases (i.e. add),
-// we simply need the Instruction*. In other cases (i.e. store),
-// finding the users of the Instruction* is useless; we need to find
-// the users of the first operand. This handles determining which
-// value to follow for us.
-//
-// Note: we *need* to keep this in sync with GetEdgesVisitor. Add
-// something to GetEdgesVisitor, add it here -- remove something from
-// GetEdgesVisitor, remove it here.
-class GetTargetValueVisitor
- : public InstVisitor<GetTargetValueVisitor, Value *> {
-public:
- Value *visitInstruction(Instruction &Inst) { return &Inst; }
-
- Value *visitStoreInst(StoreInst &Inst) { return Inst.getPointerOperand(); }
-
- Value *visitAtomicCmpXchgInst(AtomicCmpXchgInst &Inst) {
- return Inst.getPointerOperand();
- }
-
- Value *visitAtomicRMWInst(AtomicRMWInst &Inst) {
- return Inst.getPointerOperand();
- }
-
- Value *visitInsertElementInst(InsertElementInst &Inst) {
- return Inst.getOperand(0);
- }
-
- Value *visitInsertValueInst(InsertValueInst &Inst) {
- return Inst.getAggregateOperand();
- }
-};
-
-// Set building requires a weighted bidirectional graph.
-template <typename EdgeTypeT> class WeightedBidirectionalGraph {
-public:
- typedef std::size_t Node;
-
-private:
- const static Node StartNode = Node(0);
-
- struct Edge {
- EdgeTypeT Weight;
- Node Other;
-
- Edge(const EdgeTypeT &W, const Node &N) : Weight(W), Other(N) {}
-
- bool operator==(const Edge &E) const {
- return Weight == E.Weight && Other == E.Other;
- }
-
- bool operator!=(const Edge &E) const { return !operator==(E); }
- };
-
- struct NodeImpl {
- std::vector<Edge> Edges;
- };
-
- std::vector<NodeImpl> NodeImpls;
-
- bool inbounds(Node NodeIndex) const { return NodeIndex < NodeImpls.size(); }
-
- const NodeImpl &getNode(Node N) const { return NodeImpls[N]; }
- NodeImpl &getNode(Node N) { return NodeImpls[N]; }
-
-public:
- // ----- Various Edge iterators for the graph ----- //
-
- // \brief Iterator for edges. Because this graph is bidirected, we don't
- // allow modification of the edges using this iterator. Additionally, the
- // iterator becomes invalid if you add edges to or from the node you're
- // getting the edges of.
- struct EdgeIterator : public std::iterator<std::forward_iterator_tag,
- std::tuple<EdgeTypeT, Node *>> {
- EdgeIterator(const typename std::vector<Edge>::const_iterator &Iter)
- : Current(Iter) {}
-
- EdgeIterator(NodeImpl &Impl) : Current(Impl.begin()) {}
-
- EdgeIterator &operator++() {
- ++Current;
- return *this;
- }
-
- EdgeIterator operator++(int) {
- EdgeIterator Copy(Current);
- operator++();
- return Copy;
- }
-
- std::tuple<EdgeTypeT, Node> &operator*() {
- Store = std::make_tuple(Current->Weight, Current->Other);
- return Store;
- }
-
- bool operator==(const EdgeIterator &Other) const {
- return Current == Other.Current;
- }
-
- bool operator!=(const EdgeIterator &Other) const {
- return !operator==(Other);
- }
-
- private:
- typename std::vector<Edge>::const_iterator Current;
- std::tuple<EdgeTypeT, Node> Store;
- };
-
- // Wrapper for EdgeIterator with begin()/end() calls.
- struct EdgeIterable {
- EdgeIterable(const std::vector<Edge> &Edges)
- : BeginIter(Edges.begin()), EndIter(Edges.end()) {}
-
- EdgeIterator begin() { return EdgeIterator(BeginIter); }
-
- EdgeIterator end() { return EdgeIterator(EndIter); }
-
- private:
- typename std::vector<Edge>::const_iterator BeginIter;
- typename std::vector<Edge>::const_iterator EndIter;
- };
-
- // ----- Actual graph-related things ----- //
-
- WeightedBidirectionalGraph() {}
-
- WeightedBidirectionalGraph(WeightedBidirectionalGraph<EdgeTypeT> &&Other)
- : NodeImpls(std::move(Other.NodeImpls)) {}
-
- WeightedBidirectionalGraph<EdgeTypeT> &
- operator=(WeightedBidirectionalGraph<EdgeTypeT> &&Other) {
- NodeImpls = std::move(Other.NodeImpls);
- return *this;
- }
-
- Node addNode() {
- auto Index = NodeImpls.size();
- auto NewNode = Node(Index);
- NodeImpls.push_back(NodeImpl());
- return NewNode;
- }
-
- void addEdge(Node From, Node To, const EdgeTypeT &Weight,
- const EdgeTypeT &ReverseWeight) {
- assert(inbounds(From));
- assert(inbounds(To));
- auto &FromNode = getNode(From);
- auto &ToNode = getNode(To);
- FromNode.Edges.push_back(Edge(Weight, To));
- ToNode.Edges.push_back(Edge(ReverseWeight, From));
- }
-
- EdgeIterable edgesFor(const Node &N) const {
- const auto &Node = getNode(N);
- return EdgeIterable(Node.Edges);
- }
-
- bool empty() const { return NodeImpls.empty(); }
- std::size_t size() const { return NodeImpls.size(); }
-
- // \brief Gets an arbitrary node in the graph as a starting point for
- // traversal.
- Node getEntryNode() {
- assert(inbounds(StartNode));
- return StartNode;
- }
-};
-
-typedef WeightedBidirectionalGraph<std::pair<EdgeType, StratifiedAttrs>> GraphT;
-typedef DenseMap<Value *, GraphT::Node> NodeMapT;
-}
-
-//===----------------------------------------------------------------------===//
-// Function declarations that require types defined in the namespace above
-//===----------------------------------------------------------------------===//
-
-// Given an argument number, returns the appropriate Attr index to set.
-static StratifiedAttr argNumberToAttrIndex(StratifiedAttr);
-
-// Given a Value, potentially return which AttrIndex it maps to.
-static Optional<StratifiedAttr> valueToAttrIndex(Value *Val);
-
-// Gets the inverse of a given EdgeType.
-static EdgeType flipWeight(EdgeType);
-
-// Gets edges of the given Instruction*, writing them to the SmallVector*.
-static void argsToEdges(CFLAAResult &, Instruction *, SmallVectorImpl<Edge> &);
-
-// Gets edges of the given ConstantExpr*, writing them to the SmallVector*.
-static void argsToEdges(CFLAAResult &, ConstantExpr *, SmallVectorImpl<Edge> &);
-
-// Gets the "Level" that one should travel in StratifiedSets
-// given an EdgeType.
-static Level directionOfEdgeType(EdgeType);
-
-// Builds the graph needed for constructing the StratifiedSets for the
-// given function
-static void buildGraphFrom(CFLAAResult &, Function *,
- SmallVectorImpl<Value *> &, NodeMapT &, GraphT &);
-
-// Gets the edges of a ConstantExpr as if it was an Instruction. This
-// function also acts on any nested ConstantExprs, adding the edges
-// of those to the given SmallVector as well.
-static void constexprToEdges(CFLAAResult &, ConstantExpr &,
- SmallVectorImpl<Edge> &);
-
-// Given an Instruction, this will add it to the graph, along with any
-// Instructions that are potentially only available from said Instruction
-// For example, given the following line:
-// %0 = load i16* getelementptr ([1 x i16]* @a, 0, 0), align 2
-// addInstructionToGraph would add both the `load` and `getelementptr`
-// instructions to the graph appropriately.
-static void addInstructionToGraph(CFLAAResult &, Instruction &,
- SmallVectorImpl<Value *> &, NodeMapT &,
- GraphT &);
-
-// Notes whether it would be pointless to add the given Value to our sets.
-static bool canSkipAddingToSets(Value *Val);
-
-static Optional<Function *> parentFunctionOfValue(Value *Val) {
- if (auto *Inst = dyn_cast<Instruction>(Val)) {
- auto *Bb = Inst->getParent();
- return Bb->getParent();
- }
-
- if (auto *Arg = dyn_cast<Argument>(Val))
- return Arg->getParent();
- return NoneType();
-}
-
-template <typename Inst>
-static bool getPossibleTargets(Inst *Call,
- SmallVectorImpl<Function *> &Output) {
- if (auto *Fn = Call->getCalledFunction()) {
- Output.push_back(Fn);
- return true;
- }
-
- // TODO: If the call is indirect, we might be able to enumerate all potential
- // targets of the call and return them, rather than just failing.
- return false;
-}
-
-static Optional<Value *> getTargetValue(Instruction *Inst) {
- GetTargetValueVisitor V;
- return V.visit(Inst);
-}
-
-static bool hasUsefulEdges(Instruction *Inst) {
- bool IsNonInvokeTerminator =
- isa<TerminatorInst>(Inst) && !isa<InvokeInst>(Inst);
- return !isa<CmpInst>(Inst) && !isa<FenceInst>(Inst) && !IsNonInvokeTerminator;
-}
-
-static bool hasUsefulEdges(ConstantExpr *CE) {
- // ConstantExpr doesn't have terminators, invokes, or fences, so only needs
- // to check for compares.
- return CE->getOpcode() != Instruction::ICmp &&
- CE->getOpcode() != Instruction::FCmp;
-}
-
-static Optional<StratifiedAttr> valueToAttrIndex(Value *Val) {
- if (isa<GlobalValue>(Val))
- return AttrGlobalIndex;
-
- if (auto *Arg = dyn_cast<Argument>(Val))
- // Only pointer arguments should have the argument attribute,
- // because things can't escape through scalars without us seeing a
- // cast, and thus, interaction with them doesn't matter.
- if (!Arg->hasNoAliasAttr() && Arg->getType()->isPointerTy())
- return argNumberToAttrIndex(Arg->getArgNo());
- return NoneType();
-}
-
-static StratifiedAttr argNumberToAttrIndex(unsigned ArgNum) {
- if (ArgNum >= AttrMaxNumArgs)
- return AttrAllIndex;
- return ArgNum + AttrFirstArgIndex;
-}
-
-static EdgeType flipWeight(EdgeType Initial) {
- switch (Initial) {
- case EdgeType::Assign:
- return EdgeType::Assign;
- case EdgeType::Dereference:
- return EdgeType::Reference;
- case EdgeType::Reference:
- return EdgeType::Dereference;
- }
- llvm_unreachable("Incomplete coverage of EdgeType enum");
-}
-
-static void argsToEdges(CFLAAResult &Analysis, Instruction *Inst,
- SmallVectorImpl<Edge> &Output) {
- assert(hasUsefulEdges(Inst) &&
- "Expected instructions to have 'useful' edges");
- GetEdgesVisitor v(Analysis, Output);
- v.visit(Inst);
-}
-
-static void argsToEdges(CFLAAResult &Analysis, ConstantExpr *CE,
- SmallVectorImpl<Edge> &Output) {
- assert(hasUsefulEdges(CE) && "Expected constant expr to have 'useful' edges");
- GetEdgesVisitor v(Analysis, Output);
- v.visitConstantExpr(CE);
-}
-
-static Level directionOfEdgeType(EdgeType Weight) {
- switch (Weight) {
- case EdgeType::Reference:
- return Level::Above;
- case EdgeType::Dereference:
- return Level::Below;
- case EdgeType::Assign:
- return Level::Same;
- }
- llvm_unreachable("Incomplete switch coverage");
-}
-
-static void constexprToEdges(CFLAAResult &Analysis,
- ConstantExpr &CExprToCollapse,
- SmallVectorImpl<Edge> &Results) {
- SmallVector<ConstantExpr *, 4> Worklist;
- Worklist.push_back(&CExprToCollapse);
-
- SmallVector<Edge, 8> ConstexprEdges;
- SmallPtrSet<ConstantExpr *, 4> Visited;
- while (!Worklist.empty()) {
- auto *CExpr = Worklist.pop_back_val();
-
- if (!hasUsefulEdges(CExpr))
- continue;
-
- ConstexprEdges.clear();
- argsToEdges(Analysis, CExpr, ConstexprEdges);
- for (auto &Edge : ConstexprEdges) {
- if (auto *Nested = dyn_cast<ConstantExpr>(Edge.From))
- if (Visited.insert(Nested).second)
- Worklist.push_back(Nested);
-
- if (auto *Nested = dyn_cast<ConstantExpr>(Edge.To))
- if (Visited.insert(Nested).second)
- Worklist.push_back(Nested);
- }
-
- Results.append(ConstexprEdges.begin(), ConstexprEdges.end());
- }
-}
-
-static void addInstructionToGraph(CFLAAResult &Analysis, Instruction &Inst,
- SmallVectorImpl<Value *> &ReturnedValues,
- NodeMapT &Map, GraphT &Graph) {
- const auto findOrInsertNode = [&Map, &Graph](Value *Val) {
- auto Pair = Map.insert(std::make_pair(Val, GraphT::Node()));
- auto &Iter = Pair.first;
- if (Pair.second) {
- auto NewNode = Graph.addNode();
- Iter->second = NewNode;
- }
- return Iter->second;
- };
-
- // We don't want the edges of most "return" instructions, but we *do* want
- // to know what can be returned.
- if (isa<ReturnInst>(&Inst))
- ReturnedValues.push_back(&Inst);
-
- if (!hasUsefulEdges(&Inst))
- return;
-
- SmallVector<Edge, 8> Edges;
- argsToEdges(Analysis, &Inst, Edges);
-
- // In the case of an unused alloca (or similar), edges may be empty. Note
- // that it exists so we can potentially answer NoAlias.
- if (Edges.empty()) {
- auto MaybeVal = getTargetValue(&Inst);
- assert(MaybeVal.hasValue());
- auto *Target = *MaybeVal;
- findOrInsertNode(Target);
- return;
- }
-
- const auto addEdgeToGraph = [&Graph, &findOrInsertNode](const Edge &E) {
- auto To = findOrInsertNode(E.To);
- auto From = findOrInsertNode(E.From);
- auto FlippedWeight = flipWeight(E.Weight);
- auto Attrs = E.AdditionalAttrs;
- Graph.addEdge(From, To, std::make_pair(E.Weight, Attrs),
- std::make_pair(FlippedWeight, Attrs));
- };
-
- SmallVector<ConstantExpr *, 4> ConstantExprs;
- for (const Edge &E : Edges) {
- addEdgeToGraph(E);
- if (auto *Constexpr = dyn_cast<ConstantExpr>(E.To))
- ConstantExprs.push_back(Constexpr);
- if (auto *Constexpr = dyn_cast<ConstantExpr>(E.From))
- ConstantExprs.push_back(Constexpr);
- }
-
- for (ConstantExpr *CE : ConstantExprs) {
- Edges.clear();
- constexprToEdges(Analysis, *CE, Edges);
- std::for_each(Edges.begin(), Edges.end(), addEdgeToGraph);
- }
-}
-
-// Aside: We may remove graph construction entirely, because it doesn't really
-// buy us much that we don't already have. I'd like to add interprocedural
-// analysis prior to this however, in case that somehow requires the graph
-// produced by this for efficient execution
-static void buildGraphFrom(CFLAAResult &Analysis, Function *Fn,
- SmallVectorImpl<Value *> &ReturnedValues,
- NodeMapT &Map, GraphT &Graph) {
- for (auto &Bb : Fn->getBasicBlockList())
- for (auto &Inst : Bb.getInstList())
- addInstructionToGraph(Analysis, Inst, ReturnedValues, Map, Graph);
-}
-
-static bool canSkipAddingToSets(Value *Val) {
- // Constants can share instances, which may falsely unify multiple
- // sets, e.g. in
- // store i32* null, i32** %ptr1
- // store i32* null, i32** %ptr2
- // clearly ptr1 and ptr2 should not be unified into the same set, so
- // we should filter out the (potentially shared) instance to
- // i32* null.
- if (isa<Constant>(Val)) {
- bool Container = isa<ConstantVector>(Val) || isa<ConstantArray>(Val) ||
- isa<ConstantStruct>(Val);
- // TODO: Because all of these things are constant, we can determine whether
- // the data is *actually* mutable at graph building time. This will probably
- // come for free/cheap with offset awareness.
- bool CanStoreMutableData =
- isa<GlobalValue>(Val) || isa<ConstantExpr>(Val) || Container;
- return !CanStoreMutableData;
- }
-
- return false;
-}
-
-// Builds the graph + StratifiedSets for a function.
-CFLAAResult::FunctionInfo CFLAAResult::buildSetsFrom(Function *Fn) {
- NodeMapT Map;
- GraphT Graph;
- SmallVector<Value *, 4> ReturnedValues;
-
- buildGraphFrom(*this, Fn, ReturnedValues, Map, Graph);
-
- DenseMap<GraphT::Node, Value *> NodeValueMap;
- NodeValueMap.resize(Map.size());
- for (const auto &Pair : Map)
- NodeValueMap.insert(std::make_pair(Pair.second, Pair.first));
-
- const auto findValueOrDie = [&NodeValueMap](GraphT::Node Node) {
- auto ValIter = NodeValueMap.find(Node);
- assert(ValIter != NodeValueMap.end());
- return ValIter->second;
- };
-
- StratifiedSetsBuilder<Value *> Builder;
-
- SmallVector<GraphT::Node, 16> Worklist;
- for (auto &Pair : Map) {
- Worklist.clear();
-
- auto *Value = Pair.first;
- Builder.add(Value);
- auto InitialNode = Pair.second;
- Worklist.push_back(InitialNode);
- while (!Worklist.empty()) {
- auto Node = Worklist.pop_back_val();
- auto *CurValue = findValueOrDie(Node);
- if (canSkipAddingToSets(CurValue))
- continue;
-
- for (const auto &EdgeTuple : Graph.edgesFor(Node)) {
- auto Weight = std::get<0>(EdgeTuple);
- auto Label = Weight.first;
- auto &OtherNode = std::get<1>(EdgeTuple);
- auto *OtherValue = findValueOrDie(OtherNode);
-
- if (canSkipAddingToSets(OtherValue))
- continue;
-
- bool Added;
- switch (directionOfEdgeType(Label)) {
- case Level::Above:
- Added = Builder.addAbove(CurValue, OtherValue);
- break;
- case Level::Below:
- Added = Builder.addBelow(CurValue, OtherValue);
- break;
- case Level::Same:
- Added = Builder.addWith(CurValue, OtherValue);
- break;
- }
-
- auto Aliasing = Weight.second;
- if (auto MaybeCurIndex = valueToAttrIndex(CurValue))
- Aliasing.set(*MaybeCurIndex);
- if (auto MaybeOtherIndex = valueToAttrIndex(OtherValue))
- Aliasing.set(*MaybeOtherIndex);
- Builder.noteAttributes(CurValue, Aliasing);
- Builder.noteAttributes(OtherValue, Aliasing);
-
- if (Added)
- Worklist.push_back(OtherNode);
- }
- }
- }
-
- // There are times when we end up with parameters not in our graph (i.e. if
- // it's only used as the condition of a branch). Other bits of code depend on
- // things that were present during construction being present in the graph.
- // So, we add all present arguments here.
- for (auto &Arg : Fn->args()) {
- if (!Builder.add(&Arg))
- continue;
-
- auto Attrs = valueToAttrIndex(&Arg);
- if (Attrs.hasValue())
- Builder.noteAttributes(&Arg, *Attrs);
- }
-
- return FunctionInfo(Builder.build(), std::move(ReturnedValues));
-}
-
-void CFLAAResult::scan(Function *Fn) {
- auto InsertPair = Cache.insert(std::make_pair(Fn, Optional<FunctionInfo>()));
- (void)InsertPair;
- assert(InsertPair.second &&
- "Trying to scan a function that has already been cached");
-
- FunctionInfo Info(buildSetsFrom(Fn));
- Cache[Fn] = std::move(Info);
- Handles.push_front(FunctionHandle(Fn, this));
-}
-
-void CFLAAResult::evict(Function *Fn) { Cache.erase(Fn); }
-
-/// \brief Ensures that the given function is available in the cache.
-/// Returns the appropriate entry from the cache.
-const Optional<CFLAAResult::FunctionInfo> &
-CFLAAResult::ensureCached(Function *Fn) {
- auto Iter = Cache.find(Fn);
- if (Iter == Cache.end()) {
- scan(Fn);
- Iter = Cache.find(Fn);
- assert(Iter != Cache.end());
- assert(Iter->second.hasValue());
- }
- return Iter->second;
-}
-
-AliasResult CFLAAResult::query(const MemoryLocation &LocA,
- const MemoryLocation &LocB) {
- auto *ValA = const_cast<Value *>(LocA.Ptr);
- auto *ValB = const_cast<Value *>(LocB.Ptr);
-
- Function *Fn = nullptr;
- auto MaybeFnA = parentFunctionOfValue(ValA);
- auto MaybeFnB = parentFunctionOfValue(ValB);
- if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) {
- // The only times this is known to happen are when globals + InlineAsm
- // are involved
- DEBUG(dbgs() << "CFLAA: could not extract parent function information.\n");
- return MayAlias;
- }
-
- if (MaybeFnA.hasValue()) {
- Fn = *MaybeFnA;
- assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) &&
- "Interprocedural queries not supported");
- } else {
- Fn = *MaybeFnB;
- }
-
- assert(Fn != nullptr);
- auto &MaybeInfo = ensureCached(Fn);
- assert(MaybeInfo.hasValue());
-
- auto &Sets = MaybeInfo->Sets;
- auto MaybeA = Sets.find(ValA);
- if (!MaybeA.hasValue())
- return MayAlias;
-
- auto MaybeB = Sets.find(ValB);
- if (!MaybeB.hasValue())
- return MayAlias;
-
- auto SetA = *MaybeA;
- auto SetB = *MaybeB;
- auto AttrsA = Sets.getLink(SetA.Index).Attrs;
- auto AttrsB = Sets.getLink(SetB.Index).Attrs;
-
- // Stratified set attributes are used as markers to signify whether a member
- // of a StratifiedSet (or a member of a set above the current set) has
- // interacted with either arguments or globals. "Interacted with" meaning
- // its value may be different depending on the value of an argument or
- // global. The thought behind this is that, because arguments and globals
- // may alias each other, if AttrsA and AttrsB have touched args/globals,
- // we must conservatively say that they alias. However, if at least one of
- // the sets has no values that could legally be altered by changing the value
- // of an argument or global, then we don't have to be as conservative.
- if (AttrsA.any() && AttrsB.any())
- return MayAlias;
-
- // We currently unify things even if the accesses to them may not be in
- // bounds, so we can't return partial alias here: we don't know whether the
- // pointer is really within the object or not. E.g. given an out-of-bounds
- // GEP and an alloca'd pointer, we may unify the two, and we do not
- // currently track enough information to differentiate the two cases, so
- // the best we can do when the set indices match is MayAlias.
-
- if (SetA.Index == SetB.Index)
- return MayAlias;
-
- return NoAlias;
-}
-
-CFLAAResult CFLAA::run(Function &F, AnalysisManager<Function> *AM) {
- return CFLAAResult(AM->getResult<TargetLibraryAnalysis>(F));
-}
-
-char CFLAA::PassID;
-
-char CFLAAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis",
- false, true)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis",
- false, true)
-
-ImmutablePass *llvm::createCFLAAWrapperPass() { return new CFLAAWrapperPass(); }
-
-CFLAAWrapperPass::CFLAAWrapperPass() : ImmutablePass(ID) {
- initializeCFLAAWrapperPassPass(*PassRegistry::getPassRegistry());
-}
-
-bool CFLAAWrapperPass::doInitialization(Module &M) {
- Result.reset(
- new CFLAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()));
- return false;
-}
-
-bool CFLAAWrapperPass::doFinalization(Module &M) {
- Result.reset();
- return false;
-}
-
-void CFLAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
-}
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp
new file mode 100644
index 000000000000..7d5bd94133a7
--- /dev/null
+++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -0,0 +1,584 @@
+//- CFLAndersAliasAnalysis.cpp - Inclusion-based Alias Analysis ----*- C++-*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a CFL-based, summary-based alias analysis algorithm. It
+// differs from CFLSteensAliasAnalysis in that it is inclusion-based, while
+// CFLSteensAliasAnalysis is unification-based. This pass is more expensive than
+// CFLSteensAliasAnalysis (the worst-case complexity of CFLAndersAliasAnalysis
+// is cubic, while that of CFLSteensAliasAnalysis is almost linear), but it
+// yields more precise results. The precision of this analysis is roughly the
+// same as that of a one-level context-sensitive Andersen's algorithm.
+//
+// The algorithm used here is based on the recursive state machine matching
+// scheme proposed in "Demand-driven alias analysis for C" by Xin Zheng and Radu
+// Rugina. The general idea is to extend the traditional transitive closure
+// algorithm to perform CFL matching along the way: instead of recording
+// "whether X is reachable from Y", we keep track of "whether X is reachable
+// from Y at state Z", where the "state" field indicates where we are in the CFL
+// matching process. To understand the matching better, it is advisable to have
+// the state machine shown in Figure 3 of the paper available when reading the
+// code: all we do here is selectively expand the transitive closure by
+// discarding edges that are not recognized by the state machine.
+//
+// There is one difference between our current implementation and the one
+// described in the paper: our algorithm eagerly computes all alias pairs after
+// the CFLGraph is built, while in the paper the authors did the computation in
+// a demand-driven fashion. We did not implement the demand-driven algorithm due
+// to the additional coding complexity and higher memory profile, but we may
+// switch to it eventually if that proves necessary.
+//
+//===----------------------------------------------------------------------===//
+
+// N.B. AliasAnalysis as a whole is phrased as a FunctionPass at the moment, and
+// CFLAndersAA is interprocedural. This is *technically* A Bad Thing, because
+// FunctionPasses are only allowed to inspect the Function that they're being
+// run on. Realistically, this likely isn't a problem until we allow
+// FunctionPasses to run concurrently.
+
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "CFLGraph.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+using namespace llvm::cflaa;
+
+#define DEBUG_TYPE "cfl-anders-aa"
+
+CFLAndersAAResult::CFLAndersAAResult(const TargetLibraryInfo &TLI) : TLI(TLI) {}
+CFLAndersAAResult::CFLAndersAAResult(CFLAndersAAResult &&RHS)
+ : AAResultBase(std::move(RHS)), TLI(RHS.TLI) {}
+CFLAndersAAResult::~CFLAndersAAResult() {}
+
+static const Function *parentFunctionOfValue(const Value *Val) {
+ if (auto *Inst = dyn_cast<Instruction>(Val)) {
+ auto *Bb = Inst->getParent();
+ return Bb->getParent();
+ }
+
+ if (auto *Arg = dyn_cast<Argument>(Val))
+ return Arg->getParent();
+ return nullptr;
+}
+
+namespace {
+
+enum class MatchState : uint8_t {
+ FlowFrom = 0, // S1 in the paper
+ FlowFromMemAlias, // S2 in the paper
+ FlowTo, // S3 in the paper
+ FlowToMemAlias // S4 in the paper
+};
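+
+// A rough transition summary, derived from processWorkListItem() below (see
+// Figure 3 of the paper for the authoritative state machine):
+// - FlowFrom: reverse-assign edges stay in FlowFrom, assign edges move to
+// FlowTo, and known memory aliases move to FlowFromMemAlias.
+// - FlowFromMemAlias: reverse-assign edges go to FlowFrom, assign edges go to
+// FlowTo.
+// - FlowTo: assign edges stay in FlowTo, known memory aliases move to
+// FlowToMemAlias.
+// - FlowToMemAlias: assign edges go to FlowTo.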
+
+// We use ReachabilitySet to keep track of value aliases (the nonterminal "V"
+// in the paper) during the analysis.
+class ReachabilitySet {
+ typedef std::bitset<4> StateSet;
+ typedef DenseMap<InstantiatedValue, StateSet> ValueStateMap;
+ typedef DenseMap<InstantiatedValue, ValueStateMap> ValueReachMap;
+ ValueReachMap ReachMap;
+
+public:
+ typedef ValueStateMap::const_iterator const_valuestate_iterator;
+ typedef ValueReachMap::const_iterator const_value_iterator;
+
+ // Insert edge 'From->To' at state 'State'
+ bool insert(InstantiatedValue From, InstantiatedValue To, MatchState State) {
+ auto &States = ReachMap[To][From];
+ auto Idx = static_cast<size_t>(State);
+ if (!States.test(Idx)) {
+ States.set(Idx);
+ return true;
+ }
+ return false;
+ }
+
+ // Return the set of all ('From', 'State') pairs for a given node 'To'
+ iterator_range<const_valuestate_iterator>
+ reachableValueAliases(InstantiatedValue V) const {
+ auto Itr = ReachMap.find(V);
+ if (Itr == ReachMap.end())
+ return make_range<const_valuestate_iterator>(const_valuestate_iterator(),
+ const_valuestate_iterator());
+ return make_range<const_valuestate_iterator>(Itr->second.begin(),
+ Itr->second.end());
+ }
+
+ iterator_range<const_value_iterator> value_mappings() const {
+ return make_range<const_value_iterator>(ReachMap.begin(), ReachMap.end());
+ }
+};
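+
+// A minimal usage sketch (hypothetical values V1/V2, for illustration only):
+//
+//   ReachabilitySet RS;
+//   InstantiatedValue X{V1, 0}, Y{V2, 0};
+//   bool Inserted = RS.insert(X, Y, MatchState::FlowTo); // true: new state bit
+//   Inserted = RS.insert(X, Y, MatchState::FlowTo); // false: bit already set
+//   for (const auto &Mapping : RS.reachableValueAliases(Y))
+//     (void)Mapping; // Mapping.first == X, Mapping.second has FlowTo set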
+
+// We use AliasMemSet to keep track of all memory aliases (the nonterminal "M"
+// in the paper) during the analysis.
+class AliasMemSet {
+ typedef DenseSet<InstantiatedValue> MemSet;
+ typedef DenseMap<InstantiatedValue, MemSet> MemMapType;
+ MemMapType MemMap;
+
+public:
+ typedef MemSet::const_iterator const_mem_iterator;
+
+ bool insert(InstantiatedValue LHS, InstantiatedValue RHS) {
+ // Top-level values can never be memory aliases because one cannot take
+ // their addresses
+ assert(LHS.DerefLevel > 0 && RHS.DerefLevel > 0);
+ return MemMap[LHS].insert(RHS).second;
+ }
+
+ const MemSet *getMemoryAliases(InstantiatedValue V) const {
+ auto Itr = MemMap.find(V);
+ if (Itr == MemMap.end())
+ return nullptr;
+ return &Itr->second;
+ }
+};
+
+// We use AliasAttrMap to keep track of the AliasAttr of each node.
+class AliasAttrMap {
+ typedef DenseMap<InstantiatedValue, AliasAttrs> MapType;
+ MapType AttrMap;
+
+public:
+ typedef MapType::const_iterator const_iterator;
+
+ bool add(InstantiatedValue V, AliasAttrs Attr) {
+ if (Attr.none())
+ return false;
+ auto &OldAttr = AttrMap[V];
+ auto NewAttr = OldAttr | Attr;
+ if (OldAttr == NewAttr)
+ return false;
+ OldAttr = NewAttr;
+ return true;
+ }
+
+ AliasAttrs getAttrs(InstantiatedValue V) const {
+ AliasAttrs Attr;
+ auto Itr = AttrMap.find(V);
+ if (Itr != AttrMap.end())
+ Attr = Itr->second;
+ return Attr;
+ }
+
+ iterator_range<const_iterator> mappings() const {
+ return make_range<const_iterator>(AttrMap.begin(), AttrMap.end());
+ }
+};
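+
+// Note that add() reports a change only when the attribute set actually grows;
+// buildAttrMap() below relies on this to know when its worklist has reached a
+// fixed point.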
+
+struct WorkListItem {
+ InstantiatedValue From;
+ InstantiatedValue To;
+ MatchState State;
+};
+}
+
+class CFLAndersAAResult::FunctionInfo {
+ /// Map a value to other values that may alias it
+ /// Since the alias relation is symmetric, to save some space we assume values
+ /// are properly ordered: if a and b alias each other, and a < b, then b is in
+ /// AliasMap[a] but not vice versa.
+ DenseMap<const Value *, std::vector<const Value *>> AliasMap;
+
+ /// Map a value to its corresponding AliasAttrs
+ DenseMap<const Value *, AliasAttrs> AttrMap;
+
+ /// Summary of externally visible effects.
+ AliasSummary Summary;
+
+ AliasAttrs getAttrs(const Value *) const;
+
+public:
+ FunctionInfo(const ReachabilitySet &, AliasAttrMap);
+
+ bool mayAlias(const Value *LHS, const Value *RHS) const;
+ const AliasSummary &getAliasSummary() const { return Summary; }
+};
+
+CFLAndersAAResult::FunctionInfo::FunctionInfo(const ReachabilitySet &ReachSet,
+ AliasAttrMap AMap) {
+ // Populate AttrMap
+ for (const auto &Mapping : AMap.mappings()) {
+ auto IVal = Mapping.first;
+
+ // AttrMap only cares about top-level values
+ if (IVal.DerefLevel == 0)
+ AttrMap[IVal.Val] = Mapping.second;
+ }
+
+ // Populate AliasMap
+ for (const auto &OuterMapping : ReachSet.value_mappings()) {
+ // AliasMap only cares about top-level values
+ if (OuterMapping.first.DerefLevel > 0)
+ continue;
+
+ auto Val = OuterMapping.first.Val;
+ auto &AliasList = AliasMap[Val];
+ for (const auto &InnerMapping : OuterMapping.second) {
+ // Again, AliasMap only cares about top-level values
+ if (InnerMapping.first.DerefLevel == 0)
+ AliasList.push_back(InnerMapping.first.Val);
+ }
+
+ // Sort AliasList for faster lookup
+ std::sort(AliasList.begin(), AliasList.end(), std::less<const Value *>());
+ }
+
+ // TODO: Populate function summary here
+}
+
+AliasAttrs CFLAndersAAResult::FunctionInfo::getAttrs(const Value *V) const {
+ assert(V != nullptr);
+
+ AliasAttrs Attr;
+ auto Itr = AttrMap.find(V);
+ if (Itr != AttrMap.end())
+ Attr = Itr->second;
+ return Attr;
+}
+
+bool CFLAndersAAResult::FunctionInfo::mayAlias(const Value *LHS,
+ const Value *RHS) const {
+ assert(LHS && RHS);
+
+ auto Itr = AliasMap.find(LHS);
+ if (Itr != AliasMap.end()) {
+ if (std::binary_search(Itr->second.begin(), Itr->second.end(), RHS,
+ std::less<const Value *>()))
+ return true;
+ }
+
+ // Even if LHS and RHS are not reachable, they may still alias due to their
+ // AliasAttrs
+ auto AttrsA = getAttrs(LHS);
+ auto AttrsB = getAttrs(RHS);
+
+ if (AttrsA.none() || AttrsB.none())
+ return false;
+ if (hasUnknownOrCallerAttr(AttrsA) || hasUnknownOrCallerAttr(AttrsB))
+ return true;
+ if (isGlobalOrArgAttr(AttrsA) && isGlobalOrArgAttr(AttrsB))
+ return true;
+ return false;
+}
+
+static void propagate(InstantiatedValue From, InstantiatedValue To,
+ MatchState State, ReachabilitySet &ReachSet,
+ std::vector<WorkListItem> &WorkList) {
+ if (From == To)
+ return;
+ if (ReachSet.insert(From, To, State))
+ WorkList.push_back(WorkListItem{From, To, State});
+}
+
+static void initializeWorkList(std::vector<WorkListItem> &WorkList,
+ ReachabilitySet &ReachSet,
+ const CFLGraph &Graph) {
+ for (const auto &Mapping : Graph.value_mappings()) {
+ auto Val = Mapping.first;
+ auto &ValueInfo = Mapping.second;
+ assert(ValueInfo.getNumLevels() > 0);
+
+ // Insert all immediate assignment neighbors to the worklist
+ for (unsigned I = 0, E = ValueInfo.getNumLevels(); I < E; ++I) {
+ auto Src = InstantiatedValue{Val, I};
+ // If there's an assignment edge from X to Y, it means Y is reachable from
+ // X at S2 and X is reachable from Y at S1
+ for (auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges) {
+ propagate(Edge.Other, Src, MatchState::FlowFrom, ReachSet, WorkList);
+ propagate(Src, Edge.Other, MatchState::FlowTo, ReachSet, WorkList);
+ }
+ }
+ }
+}
+
+static Optional<InstantiatedValue> getNodeBelow(const CFLGraph &Graph,
+ InstantiatedValue V) {
+ auto NodeBelow = InstantiatedValue{V.Val, V.DerefLevel + 1};
+ if (Graph.getNode(NodeBelow))
+ return NodeBelow;
+ return None;
+}
+
+static void processWorkListItem(const WorkListItem &Item, const CFLGraph &Graph,
+ ReachabilitySet &ReachSet, AliasMemSet &MemSet,
+ std::vector<WorkListItem> &WorkList) {
+ auto FromNode = Item.From;
+ auto ToNode = Item.To;
+
+ auto NodeInfo = Graph.getNode(ToNode);
+ assert(NodeInfo != nullptr);
+
+ // TODO: propagate field offsets
+
+ // FIXME: Here is a neat trick we can do: since both ReachSet and MemSet hold
+ // relations that are symmetric, we could actually cut the storage in half by
+ // sorting FromNode and ToNode before insertion happens.
+
+ // The newly added value alias pair may potentially generate more memory
+ // alias pairs. Check for them here.
+ auto FromNodeBelow = getNodeBelow(Graph, FromNode);
+ auto ToNodeBelow = getNodeBelow(Graph, ToNode);
+ if (FromNodeBelow && ToNodeBelow &&
+ MemSet.insert(*FromNodeBelow, *ToNodeBelow)) {
+ propagate(*FromNodeBelow, *ToNodeBelow, MatchState::FlowFromMemAlias,
+ ReachSet, WorkList);
+ for (const auto &Mapping : ReachSet.reachableValueAliases(*FromNodeBelow)) {
+ auto Src = Mapping.first;
+ if (Mapping.second.test(static_cast<size_t>(MatchState::FlowFrom)))
+ propagate(Src, *ToNodeBelow, MatchState::FlowFromMemAlias, ReachSet,
+ WorkList);
+ if (Mapping.second.test(static_cast<size_t>(MatchState::FlowTo)))
+ propagate(Src, *ToNodeBelow, MatchState::FlowToMemAlias, ReachSet,
+ WorkList);
+ }
+ }
+
+ // This is the core of the state-machine walking algorithm. We expand ReachSet
+ // based on which state we are at (which in turn dictates what edges we
+ // should examine).
+ // From a high-level point of view, the state machine here guarantees two
+ // properties:
+ // - If *X and *Y are memory aliases, then X and Y are value aliases.
+ // - If Y is an alias of X, then reverse assignment edges (if there are any)
+ // should precede any assignment edges on the path from X to Y.
+ switch (Item.State) {
+ case MatchState::FlowFrom: {
+ for (const auto &RevAssignEdge : NodeInfo->ReverseEdges)
+ propagate(FromNode, RevAssignEdge.Other, MatchState::FlowFrom, ReachSet,
+ WorkList);
+ for (const auto &AssignEdge : NodeInfo->Edges)
+ propagate(FromNode, AssignEdge.Other, MatchState::FlowTo, ReachSet,
+ WorkList);
+ if (auto AliasSet = MemSet.getMemoryAliases(ToNode)) {
+ for (const auto &MemAlias : *AliasSet)
+ propagate(FromNode, MemAlias, MatchState::FlowFromMemAlias, ReachSet,
+ WorkList);
+ }
+ break;
+ }
+ case MatchState::FlowFromMemAlias: {
+ for (const auto &RevAssignEdge : NodeInfo->ReverseEdges)
+ propagate(FromNode, RevAssignEdge.Other, MatchState::FlowFrom, ReachSet,
+ WorkList);
+ for (const auto &AssignEdge : NodeInfo->Edges)
+ propagate(FromNode, AssignEdge.Other, MatchState::FlowTo, ReachSet,
+ WorkList);
+ break;
+ }
+ case MatchState::FlowTo: {
+ for (const auto &AssignEdge : NodeInfo->Edges)
+ propagate(FromNode, AssignEdge.Other, MatchState::FlowTo, ReachSet,
+ WorkList);
+ if (auto AliasSet = MemSet.getMemoryAliases(ToNode)) {
+ for (const auto &MemAlias : *AliasSet)
+ propagate(FromNode, MemAlias, MatchState::FlowToMemAlias, ReachSet,
+ WorkList);
+ }
+ break;
+ }
+ case MatchState::FlowToMemAlias: {
+ for (const auto &AssignEdge : NodeInfo->Edges)
+ propagate(FromNode, AssignEdge.Other, MatchState::FlowTo, ReachSet,
+ WorkList);
+ break;
+ }
+ }
+}
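+
+// Since propagate() only enqueues a (From, To, State) triple the first time it
+// is inserted into ReachSet, and the number of possible triples is finite, the
+// fixed-point loop in buildInfoFrom() below is guaranteed to terminate.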
+
+static AliasAttrMap buildAttrMap(const CFLGraph &Graph,
+ const ReachabilitySet &ReachSet) {
+ AliasAttrMap AttrMap;
+ std::vector<InstantiatedValue> WorkList, NextList;
+
+ // Initialize each node with its original AliasAttrs in CFLGraph
+ for (const auto &Mapping : Graph.value_mappings()) {
+ auto Val = Mapping.first;
+ auto &ValueInfo = Mapping.second;
+ for (unsigned I = 0, E = ValueInfo.getNumLevels(); I < E; ++I) {
+ auto Node = InstantiatedValue{Val, I};
+ AttrMap.add(Node, ValueInfo.getNodeInfoAtLevel(I).Attr);
+ WorkList.push_back(Node);
+ }
+ }
+
+ while (!WorkList.empty()) {
+ for (const auto &Dst : WorkList) {
+ auto DstAttr = AttrMap.getAttrs(Dst);
+ if (DstAttr.none())
+ continue;
+
+ // Propagate attr on the same level
+ for (const auto &Mapping : ReachSet.reachableValueAliases(Dst)) {
+ auto Src = Mapping.first;
+ if (AttrMap.add(Src, DstAttr))
+ NextList.push_back(Src);
+ }
+
+ // Propagate attr to the levels below
+ auto DstBelow = getNodeBelow(Graph, Dst);
+ while (DstBelow) {
+ if (AttrMap.add(*DstBelow, DstAttr)) {
+ NextList.push_back(*DstBelow);
+ break;
+ }
+ DstBelow = getNodeBelow(Graph, *DstBelow);
+ }
+ }
+ WorkList.swap(NextList);
+ NextList.clear();
+ }
+
+ return AttrMap;
+}
+
+CFLAndersAAResult::FunctionInfo
+CFLAndersAAResult::buildInfoFrom(const Function &Fn) {
+ CFLGraphBuilder<CFLAndersAAResult> GraphBuilder(
+ *this, TLI,
+ // Cast away the constness here due to GraphBuilder's API requirement
+ const_cast<Function &>(Fn));
+ auto &Graph = GraphBuilder.getCFLGraph();
+
+ ReachabilitySet ReachSet;
+ AliasMemSet MemSet;
+
+ std::vector<WorkListItem> WorkList, NextList;
+ initializeWorkList(WorkList, ReachSet, Graph);
+ // TODO: make sure we don't stop before the fix point is reached
+ while (!WorkList.empty()) {
+ for (const auto &Item : WorkList)
+ processWorkListItem(Item, Graph, ReachSet, MemSet, NextList);
+
+ NextList.swap(WorkList);
+ NextList.clear();
+ }
+
+ // Now that we have all the reachability info, propagate AliasAttrs according
+ // to it
+ auto IValueAttrMap = buildAttrMap(Graph, ReachSet);
+
+ return FunctionInfo(ReachSet, std::move(IValueAttrMap));
+}
+
+void CFLAndersAAResult::scan(const Function &Fn) {
+ auto InsertPair = Cache.insert(std::make_pair(&Fn, Optional<FunctionInfo>()));
+ (void)InsertPair;
+ assert(InsertPair.second &&
+ "Trying to scan a function that has already been cached");
+
+ // Note that we can't do Cache[&Fn] = buildInfoFrom(Fn) here: the function call
+ // may get evaluated after operator[], potentially triggering a DenseMap
+ // resize and invalidating the reference returned by operator[]
+ auto FunInfo = buildInfoFrom(Fn);
+ Cache[&Fn] = std::move(FunInfo);
+ Handles.push_front(FunctionHandle(const_cast<Function *>(&Fn), this));
+}
+
+void CFLAndersAAResult::evict(const Function &Fn) { Cache.erase(&Fn); }
+
+const Optional<CFLAndersAAResult::FunctionInfo> &
+CFLAndersAAResult::ensureCached(const Function &Fn) {
+ auto Iter = Cache.find(&Fn);
+ if (Iter == Cache.end()) {
+ scan(Fn);
+ Iter = Cache.find(&Fn);
+ assert(Iter != Cache.end());
+ assert(Iter->second.hasValue());
+ }
+ return Iter->second;
+}
+
+const AliasSummary *CFLAndersAAResult::getAliasSummary(const Function &Fn) {
+ auto &FunInfo = ensureCached(Fn);
+ if (FunInfo.hasValue())
+ return &FunInfo->getAliasSummary();
+ else
+ return nullptr;
+}
+
+AliasResult CFLAndersAAResult::query(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ auto *ValA = LocA.Ptr;
+ auto *ValB = LocB.Ptr;
+
+ if (!ValA->getType()->isPointerTy() || !ValB->getType()->isPointerTy())
+ return NoAlias;
+
+ auto *Fn = parentFunctionOfValue(ValA);
+ if (!Fn) {
+ Fn = parentFunctionOfValue(ValB);
+ if (!Fn) {
+ // The only times this is known to happen are when globals + InlineAsm are
+ // involved
+ DEBUG(dbgs()
+ << "CFLAndersAA: could not extract parent function information.\n");
+ return MayAlias;
+ }
+ } else {
+ assert(!parentFunctionOfValue(ValB) || parentFunctionOfValue(ValB) == Fn);
+ }
+
+ assert(Fn != nullptr);
+ auto &FunInfo = ensureCached(*Fn);
+
+ // AliasMap lookup
+ if (FunInfo->mayAlias(ValA, ValB))
+ return MayAlias;
+ return NoAlias;
+}
+
+AliasResult CFLAndersAAResult::alias(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ if (LocA.Ptr == LocB.Ptr)
+ return LocA.Size == LocB.Size ? MustAlias : PartialAlias;
+
+ // Comparisons between global variables and other constants should be
+ // handled by BasicAA.
+ // CFLAndersAA may report NoAlias when comparing a GlobalValue and
+ // ConstantExpr, but every query needs to have at least one Value tied to a
+ // Function, and neither GlobalValues nor ConstantExprs are.
+ if (isa<Constant>(LocA.Ptr) && isa<Constant>(LocB.Ptr))
+ return AAResultBase::alias(LocA, LocB);
+
+ AliasResult QueryResult = query(LocA, LocB);
+ if (QueryResult == MayAlias)
+ return AAResultBase::alias(LocA, LocB);
+
+ return QueryResult;
+}
+
+char CFLAndersAA::PassID;
+
+CFLAndersAAResult CFLAndersAA::run(Function &F, AnalysisManager<Function> &AM) {
+ return CFLAndersAAResult(AM.getResult<TargetLibraryAnalysis>(F));
+}
+
+char CFLAndersAAWrapperPass::ID = 0;
+INITIALIZE_PASS(CFLAndersAAWrapperPass, "cfl-anders-aa",
+ "Inclusion-Based CFL Alias Analysis", false, true)
+
+ImmutablePass *llvm::createCFLAndersAAWrapperPass() {
+ return new CFLAndersAAWrapperPass();
+}
+
+CFLAndersAAWrapperPass::CFLAndersAAWrapperPass() : ImmutablePass(ID) {
+ initializeCFLAndersAAWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void CFLAndersAAWrapperPass::initializePass() {
+ auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
+ Result.reset(new CFLAndersAAResult(TLIWP.getTLI()));
+}
+
+void CFLAndersAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
diff --git a/lib/Analysis/CFLGraph.h b/lib/Analysis/CFLGraph.h
new file mode 100644
index 000000000000..bc6e794d0b2a
--- /dev/null
+++ b/lib/Analysis/CFLGraph.h
@@ -0,0 +1,544 @@
+//======- CFLGraph.h - The graph used by CFL-based alias analysis ------======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines CFLGraph, an auxiliary data structure used by CFL-based
+/// alias analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_CFLGRAPH_H
+#define LLVM_ANALYSIS_CFLGRAPH_H
+
+#include "AliasAnalysisSummary.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+namespace cflaa {
+
+/// \brief The Program Expression Graph (PEG) of CFL analysis
+/// CFLGraph is an auxiliary data structure used by CFL-based alias analysis to
+/// describe flow-insensitive pointer-related behaviors. Given an LLVM function,
+/// the main purpose of this graph is to abstract away unrelated facts and
+/// translate the rest into a form that can be easily digested by CFL analyses.
+/// Each Node in the graph is an InstantiatedValue, and each edge represents a
+/// pointer assignment between InstantiatedValues. Pointer
+/// references/dereferences are not explicitly stored in the graph: we
+/// implicitly assume that each node (X, I) has a dereference edge to (X, I+1)
+/// and a reference edge to (X, I-1).
+class CFLGraph {
+public:
+ typedef InstantiatedValue Node;
+
+ struct Edge {
+ Node Other;
+ };
+
+ typedef std::vector<Edge> EdgeList;
+
+ struct NodeInfo {
+ EdgeList Edges, ReverseEdges;
+ AliasAttrs Attr;
+ };
+
+ class ValueInfo {
+ std::vector<NodeInfo> Levels;
+
+ public:
+ bool addNodeToLevel(unsigned Level) {
+ auto NumLevels = Levels.size();
+ if (NumLevels > Level)
+ return false;
+ Levels.resize(Level + 1);
+ return true;
+ }
+
+ NodeInfo &getNodeInfoAtLevel(unsigned Level) {
+ assert(Level < Levels.size());
+ return Levels[Level];
+ }
+ const NodeInfo &getNodeInfoAtLevel(unsigned Level) const {
+ assert(Level < Levels.size());
+ return Levels[Level];
+ }
+
+ unsigned getNumLevels() const { return Levels.size(); }
+ };
+
+private:
+ typedef DenseMap<Value *, ValueInfo> ValueMap;
+ ValueMap ValueImpls;
+
+ NodeInfo *getNode(Node N) {
+ auto Itr = ValueImpls.find(N.Val);
+ if (Itr == ValueImpls.end() || Itr->second.getNumLevels() <= N.DerefLevel)
+ return nullptr;
+ return &Itr->second.getNodeInfoAtLevel(N.DerefLevel);
+ }
+
+public:
+ typedef ValueMap::const_iterator const_value_iterator;
+
+ bool addNode(Node N, AliasAttrs Attr = AliasAttrs()) {
+ assert(N.Val != nullptr);
+ auto &ValInfo = ValueImpls[N.Val];
+ auto Changed = ValInfo.addNodeToLevel(N.DerefLevel);
+ ValInfo.getNodeInfoAtLevel(N.DerefLevel).Attr |= Attr;
+ return Changed;
+ }
+
+ void addAttr(Node N, AliasAttrs Attr) {
+ auto *Info = getNode(N);
+ assert(Info != nullptr);
+ Info->Attr |= Attr;
+ }
+
+ void addEdge(Node From, Node To, int64_t Offset = 0) {
+ auto *FromInfo = getNode(From);
+ assert(FromInfo != nullptr);
+ auto *ToInfo = getNode(To);
+ assert(ToInfo != nullptr);
+
+ FromInfo->Edges.push_back(Edge{To});
+ ToInfo->ReverseEdges.push_back(Edge{From});
+ }
+
+ const NodeInfo *getNode(Node N) const {
+ auto Itr = ValueImpls.find(N.Val);
+ if (Itr == ValueImpls.end() || Itr->second.getNumLevels() <= N.DerefLevel)
+ return nullptr;
+ return &Itr->second.getNodeInfoAtLevel(N.DerefLevel);
+ }
+
+ AliasAttrs attrFor(Node N) const {
+ auto *Info = getNode(N);
+ assert(Info != nullptr);
+ return Info->Attr;
+ }
+
+ iterator_range<const_value_iterator> value_mappings() const {
+ return make_range<const_value_iterator>(ValueImpls.begin(),
+ ValueImpls.end());
+ }
+};
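+
+// For illustration (a hypothetical example, mirroring what CFLGraphBuilder
+// below produces): given the IR
+//   %p = alloca i32*
+//   store i32* %q, i32** %p
+// the graph contains nodes (%p, 0), (%p, 1) and (%q, 0), plus an assignment
+// edge from (%q, 0) to (%p, 1), i.e. "%q flows into the memory %p points to".
+// The dereference/reference edges between (%p, 0) and (%p, 1) remain implicit.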
+
+///\brief A builder class used to create a CFLGraph instance from a given
+/// function.
+/// The CFL-AA that uses this builder must provide its own type as a template
+/// argument. This is necessary for interprocedural processing: CFLGraphBuilder
+/// needs a way of obtaining the summaries of other functions when call
+/// instructions are encountered.
+/// As a result, we expect the said CFL-AA to expose a getAliasSummary() public
+/// member function that takes a Function& and returns the corresponding summary
+/// as a const AliasSummary*.
+template <typename CFLAA> class CFLGraphBuilder {
+ // Input of the builder
+ CFLAA &Analysis;
+ const TargetLibraryInfo &TLI;
+
+ // Output of the builder
+ CFLGraph Graph;
+ SmallVector<Value *, 4> ReturnedValues;
+
+ // Helper class
+ /// Gets the edges our graph should have, based on an Instruction*
+ class GetEdgesVisitor : public InstVisitor<GetEdgesVisitor, void> {
+ CFLAA &AA;
+ const TargetLibraryInfo &TLI;
+
+ CFLGraph &Graph;
+ SmallVectorImpl<Value *> &ReturnValues;
+
+ static bool hasUsefulEdges(ConstantExpr *CE) {
+ // ConstantExpr doesn't have terminators, invokes, or fences, so we only
+ // need to check for compares.
+ return CE->getOpcode() != Instruction::ICmp &&
+ CE->getOpcode() != Instruction::FCmp;
+ }
+
+ // Appends the possible functions called by CS to the given SmallVectorImpl.
+ // Returns true if targets were found, false otherwise.
+ static bool getPossibleTargets(CallSite CS,
+ SmallVectorImpl<Function *> &Output) {
+ if (auto *Fn = CS.getCalledFunction()) {
+ Output.push_back(Fn);
+ return true;
+ }
+
+ // TODO: If the call is indirect, we might be able to enumerate all
+ // potential targets of the call and return them, rather than just failing.
+ return false;
+ }
+
+ void addNode(Value *Val, AliasAttrs Attr = AliasAttrs()) {
+ assert(Val != nullptr && Val->getType()->isPointerTy());
+ if (auto GVal = dyn_cast<GlobalValue>(Val)) {
+ if (Graph.addNode(InstantiatedValue{GVal, 0},
+ getGlobalOrArgAttrFromValue(*GVal)))
+ Graph.addNode(InstantiatedValue{GVal, 1}, getAttrUnknown());
+ } else if (auto CExpr = dyn_cast<ConstantExpr>(Val)) {
+ if (hasUsefulEdges(CExpr)) {
+ if (Graph.addNode(InstantiatedValue{CExpr, 0}))
+ visitConstantExpr(CExpr);
+ }
+ } else
+ Graph.addNode(InstantiatedValue{Val, 0}, Attr);
+ }
+
+ void addAssignEdge(Value *From, Value *To, int64_t Offset = 0) {
+ assert(From != nullptr && To != nullptr);
+ if (!From->getType()->isPointerTy() || !To->getType()->isPointerTy())
+ return;
+ addNode(From);
+ if (To != From) {
+ addNode(To);
+ Graph.addEdge(InstantiatedValue{From, 0}, InstantiatedValue{To, 0},
+ Offset);
+ }
+ }
+
+ void addDerefEdge(Value *From, Value *To, bool IsRead) {
+ assert(From != nullptr && To != nullptr);
+ if (!From->getType()->isPointerTy() || !To->getType()->isPointerTy())
+ return;
+ addNode(From);
+ addNode(To);
+ if (IsRead) {
+ Graph.addNode(InstantiatedValue{From, 1});
+ Graph.addEdge(InstantiatedValue{From, 1}, InstantiatedValue{To, 0});
+ } else {
+ Graph.addNode(InstantiatedValue{To, 1});
+ Graph.addEdge(InstantiatedValue{From, 0}, InstantiatedValue{To, 1});
+ }
+ }
+
+ void addLoadEdge(Value *From, Value *To) { addDerefEdge(From, To, true); }
+ void addStoreEdge(Value *From, Value *To) { addDerefEdge(From, To, false); }
+
+ public:
+ GetEdgesVisitor(CFLGraphBuilder &Builder)
+ : AA(Builder.Analysis), TLI(Builder.TLI), Graph(Builder.Graph),
+ ReturnValues(Builder.ReturnedValues) {}
+
+ void visitInstruction(Instruction &) {
+ llvm_unreachable("Unsupported instruction encountered");
+ }
+
+ void visitReturnInst(ReturnInst &Inst) {
+ if (auto RetVal = Inst.getReturnValue()) {
+ if (RetVal->getType()->isPointerTy()) {
+ addNode(RetVal);
+ ReturnValues.push_back(RetVal);
+ }
+ }
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &Inst) {
+ auto *Ptr = Inst.getOperand(0);
+ addNode(Ptr, getAttrEscaped());
+ }
+
+ void visitIntToPtrInst(IntToPtrInst &Inst) {
+ auto *Ptr = &Inst;
+ addNode(Ptr, getAttrUnknown());
+ }
+
+ void visitCastInst(CastInst &Inst) {
+ auto *Src = Inst.getOperand(0);
+ addAssignEdge(Src, &Inst);
+ }
+
+ void visitBinaryOperator(BinaryOperator &Inst) {
+ auto *Op1 = Inst.getOperand(0);
+ auto *Op2 = Inst.getOperand(1);
+ addAssignEdge(Op1, &Inst);
+ addAssignEdge(Op2, &Inst);
+ }
+
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &Inst) {
+ auto *Ptr = Inst.getPointerOperand();
+ auto *Val = Inst.getNewValOperand();
+ addStoreEdge(Val, Ptr);
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &Inst) {
+ auto *Ptr = Inst.getPointerOperand();
+ auto *Val = Inst.getValOperand();
+ addStoreEdge(Val, Ptr);
+ }
+
+ void visitPHINode(PHINode &Inst) {
+ for (Value *Val : Inst.incoming_values())
+ addAssignEdge(Val, &Inst);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &Inst) {
+ auto *Op = Inst.getPointerOperand();
+ addAssignEdge(Op, &Inst);
+ }
+
+ void visitSelectInst(SelectInst &Inst) {
+ // Condition is not processed here (The actual statement producing
+ // the condition result is processed elsewhere). For select, the
+ // condition is evaluated, but not loaded, stored, or assigned
+ // simply as a result of being the condition of a select.
+
+ auto *TrueVal = Inst.getTrueValue();
+ auto *FalseVal = Inst.getFalseValue();
+ addAssignEdge(TrueVal, &Inst);
+ addAssignEdge(FalseVal, &Inst);
+ }
+
+ void visitAllocaInst(AllocaInst &Inst) { addNode(&Inst); }
+
+ void visitLoadInst(LoadInst &Inst) {
+ auto *Ptr = Inst.getPointerOperand();
+ auto *Val = &Inst;
+ addLoadEdge(Ptr, Val);
+ }
+
+ void visitStoreInst(StoreInst &Inst) {
+ auto *Ptr = Inst.getPointerOperand();
+ auto *Val = Inst.getValueOperand();
+ addStoreEdge(Val, Ptr);
+ }
+
+ void visitVAArgInst(VAArgInst &Inst) {
+ // We can't fully model va_arg here. For *Ptr = Inst.getOperand(0), it does
+ // two things:
+ // 1. Loads a value from *((T*)*Ptr).
+ // 2. Increments (stores to) *Ptr by some target-specific amount.
+ // For now, we'll handle this like a landingpad instruction (by placing the
+ // result in its own group, and having that group alias externals).
+ addNode(&Inst, getAttrUnknown());
+ }
+
+ static bool isFunctionExternal(Function *Fn) {
+ return !Fn->hasExactDefinition();
+ }
+
+ bool tryInterproceduralAnalysis(CallSite CS,
+ const SmallVectorImpl<Function *> &Fns) {
+ assert(Fns.size() > 0);
+
+ if (CS.arg_size() > MaxSupportedArgsInSummary)
+ return false;
+
+ // Exit early if we'll fail anyway
+ for (auto *Fn : Fns) {
+ if (isFunctionExternal(Fn) || Fn->isVarArg())
+ return false;
+ // The caller must provide at least as many arguments as the callee expects
+ assert(Fn->arg_size() <= CS.arg_size());
+ if (!AA.getAliasSummary(*Fn))
+ return false;
+ }
+
+ for (auto *Fn : Fns) {
+ auto Summary = AA.getAliasSummary(*Fn);
+ assert(Summary != nullptr);
+
+ auto &RetParamRelations = Summary->RetParamRelations;
+ for (auto &Relation : RetParamRelations) {
+ auto IRelation = instantiateExternalRelation(Relation, CS);
+ if (IRelation.hasValue()) {
+ Graph.addNode(IRelation->From);
+ Graph.addNode(IRelation->To);
+ Graph.addEdge(IRelation->From, IRelation->To);
+ }
+ }
+
+ auto &RetParamAttributes = Summary->RetParamAttributes;
+ for (auto &Attribute : RetParamAttributes) {
+ auto IAttr = instantiateExternalAttribute(Attribute, CS);
+ if (IAttr.hasValue())
+ Graph.addNode(IAttr->IValue, IAttr->Attr);
+ }
+ }
+
+ return true;
+ }
+
+ void visitCallSite(CallSite CS) {
+ auto Inst = CS.getInstruction();
+
+ // Make sure all arguments and return value are added to the graph first
+ for (Value *V : CS.args())
+ if (V->getType()->isPointerTy())
+ addNode(V);
+ if (Inst->getType()->isPointerTy())
+ addNode(Inst);
+
+ // Check if Inst is a call to a library function that allocates/deallocates
+ // on the heap. Those kinds of functions do not introduce any aliases.
+ // TODO: address other common library functions such as realloc(), strdup(),
+ // etc.
+ if (isMallocLikeFn(Inst, &TLI) || isCallocLikeFn(Inst, &TLI) ||
+ isFreeCall(Inst, &TLI))
+ return;
+
+ // TODO: Add support for noalias args/all the other fun function attributes
+ // that we can tack on.
+ SmallVector<Function *, 4> Targets;
+ if (getPossibleTargets(CS, Targets))
+ if (tryInterproceduralAnalysis(CS, Targets))
+ return;
+
+ // Because the function is opaque, we need to note that anything
+ // could have happened to the arguments (unless the function is marked
+ // readonly or readnone), and that the result could alias just about
+ // anything, too (unless the result is marked noalias).
+ if (!CS.onlyReadsMemory())
+ for (Value *V : CS.args()) {
+ if (V->getType()->isPointerTy()) {
+ // The argument itself escapes.
+ Graph.addAttr(InstantiatedValue{V, 0}, getAttrEscaped());
+ // The fate of argument memory is unknown. Note that since
+ // AliasAttrs is transitive with respect to dereference, we only
+ // need to specify it for the first-level memory.
+ Graph.addNode(InstantiatedValue{V, 1}, getAttrUnknown());
+ }
+ }
+
+ if (Inst->getType()->isPointerTy()) {
+ auto *Fn = CS.getCalledFunction();
+ if (Fn == nullptr || !Fn->doesNotAlias(0))
+ // No need to call addNode() since we've added Inst at the
+ // beginning of this function and we know it is not a global.
+ Graph.addAttr(InstantiatedValue{Inst, 0}, getAttrUnknown());
+ }
+ }
+
+ /// Because vectors/aggregates are immutable and unaddressable, there's
+ /// nothing we can do to coax a value out of them, other than calling
+ /// Extract{Element,Value}. We can effectively treat them as pointers to
+ /// arbitrary memory locations we can store in and load from.
+ void visitExtractElementInst(ExtractElementInst &Inst) {
+ auto *Ptr = Inst.getVectorOperand();
+ auto *Val = &Inst;
+ addLoadEdge(Ptr, Val);
+ }
+
+ void visitInsertElementInst(InsertElementInst &Inst) {
+ auto *Vec = Inst.getOperand(0);
+ auto *Val = Inst.getOperand(1);
+ addAssignEdge(Vec, &Inst);
+ addStoreEdge(Val, &Inst);
+ }
+
+ void visitLandingPadInst(LandingPadInst &Inst) {
+ // Exceptions come from "nowhere", from our analysis' perspective.
+ // So we place the instruction in its own group, noting that said group may
+ // alias externals
+ addNode(&Inst, getAttrUnknown());
+ }
+
+ void visitInsertValueInst(InsertValueInst &Inst) {
+ auto *Agg = Inst.getOperand(0);
+ auto *Val = Inst.getOperand(1);
+ addAssignEdge(Agg, &Inst);
+ addStoreEdge(Val, &Inst);
+ }
+
+ void visitExtractValueInst(ExtractValueInst &Inst) {
+ auto *Ptr = Inst.getAggregateOperand();
+ addLoadEdge(Ptr, &Inst);
+ }
+
+ void visitShuffleVectorInst(ShuffleVectorInst &Inst) {
+ auto *From1 = Inst.getOperand(0);
+ auto *From2 = Inst.getOperand(1);
+ addAssignEdge(From1, &Inst);
+ addAssignEdge(From2, &Inst);
+ }
+
+ void visitConstantExpr(ConstantExpr *CE) {
+ switch (CE->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown instruction type encountered!");
+// Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE: \
+ this->visit##OPCODE(*(CLASS *)CE); \
+ break;
+#include "llvm/IR/Instruction.def"
+ }
+ }
+ };
+
+ // Helper functions
+
+ // Determines whether an instruction is useless to us (e.g. FenceInst)
+ static bool hasUsefulEdges(Instruction *Inst) {
+ bool IsNonInvokeRetTerminator = isa<TerminatorInst>(Inst) &&
+ !isa<InvokeInst>(Inst) &&
+ !isa<ReturnInst>(Inst);
+ return !isa<CmpInst>(Inst) && !isa<FenceInst>(Inst) &&
+ !IsNonInvokeRetTerminator;
+ }
+
+ void addArgumentToGraph(Argument &Arg) {
+ if (Arg.getType()->isPointerTy()) {
+ Graph.addNode(InstantiatedValue{&Arg, 0},
+ getGlobalOrArgAttrFromValue(Arg));
+ // Pointees of a formal parameter are known to the caller
+ Graph.addNode(InstantiatedValue{&Arg, 1}, getAttrCaller());
+ }
+ }
+
+ // Given an Instruction, this will add it to the graph, along with any
+ // Instructions that are potentially only available from said Instruction
+ // For example, given the following line:
+ // %0 = load i16* getelementptr ([1 x i16]* @a, 0, 0), align 2
+ // addInstructionToGraph would add both the `load` and `getelementptr`
+ // instructions to the graph appropriately.
+ void addInstructionToGraph(GetEdgesVisitor &Visitor, Instruction &Inst) {
+ if (!hasUsefulEdges(&Inst))
+ return;
+
+ Visitor.visit(Inst);
+ }
+
+ // Builds the CFLGraph for the given function
+ void buildGraphFrom(Function &Fn) {
+ GetEdgesVisitor Visitor(*this);
+
+ for (auto &Bb : Fn.getBasicBlockList())
+ for (auto &Inst : Bb.getInstList())
+ addInstructionToGraph(Visitor, Inst);
+
+ for (auto &Arg : Fn.args())
+ addArgumentToGraph(Arg);
+ }
+
+public:
+ CFLGraphBuilder(CFLAA &Analysis, const TargetLibraryInfo &TLI, Function &Fn)
+ : Analysis(Analysis), TLI(TLI) {
+ buildGraphFrom(Fn);
+ }
+
+ const CFLGraph &getCFLGraph() const { return Graph; }
+ const SmallVector<Value *, 4> &getReturnValues() const {
+ return ReturnedValues;
+ }
+};
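+
+// A minimal usage sketch (with a hypothetical analysis type MyAA that exposes
+// getAliasSummary(Function &) returning a const AliasSummary *):
+//
+//   CFLGraphBuilder<MyAA> Builder(AA, TLI, Fn);
+//   const CFLGraph &G = Builder.getCFLGraph();
+//   for (const auto &Mapping : G.value_mappings())
+//     (void)Mapping; // one ValueInfo per IR value that reached the graph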
+}
+}
+
+#endif
diff --git a/lib/Analysis/CFLSteensAliasAnalysis.cpp b/lib/Analysis/CFLSteensAliasAnalysis.cpp
new file mode 100644
index 000000000000..d816822aaaea
--- /dev/null
+++ b/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -0,0 +1,442 @@
+//- CFLSteensAliasAnalysis.cpp - Unification-based Alias Analysis ---*- C++-*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a CFL-based, summary-based alias analysis algorithm. It
+// does not depend on types. The algorithm is a mixture of the one described in
+// "Demand-driven alias analysis for C" by Xin Zheng and Radu Rugina, and "Fast
+// algorithms for Dyck-CFL-reachability with applications to Alias Analysis" by
+// Zhang Q, Lyu M R, Yuan H, and Su Z. -- to summarize the papers, we build a
+// graph of the uses of a variable, where each node is a memory location, and
+// each edge is an action that happened on that memory location. The "actions"
+// can be one of Dereference, Reference, or Assign. The precision of this
+// analysis is roughly the same as that of a one-level context-sensitive
+// Steensgaard's algorithm.
+//
+// Two variables are considered to alias iff one value's node can be reached
+// from the other value's node and the language formed by concatenating all of
+// the edge labels (actions) along that path conforms to a context-free grammar.
+//
+// Because this algorithm requires a graph search on each query, we execute the
+// algorithm outlined in "Fast algorithms..." (mentioned above)
+// in order to transform the graph into sets of variables that may alias in
+// ~nlogn time (n = number of variables), which makes queries take constant
+// time.
+//===----------------------------------------------------------------------===//
+
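+// A simplified illustration (an assumed example, not drawn from the papers):
+// for
+//   p = &a;
+//   q = *p;
+// the address-of contributes a Reference label, the load a Dereference label,
+// and the copies Assign labels. Walking from a to q, the Reference and
+// Dereference labels match and cancel under the grammar, so a and q are placed
+// in aliasing sets.
+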
+// N.B. AliasAnalysis as a whole is phrased as a FunctionPass at the moment, and
+// CFLSteensAA is interprocedural. This is *technically* A Bad Thing, because
+// FunctionPasses are only allowed to inspect the Function that they're being
+// run on. Realistically, this likely isn't a problem until we allow
+// FunctionPasses to run concurrently.
+
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "CFLGraph.h"
+#include "StratifiedSets.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <tuple>
+
+using namespace llvm;
+using namespace llvm::cflaa;
+
+#define DEBUG_TYPE "cfl-steens-aa"
+
+CFLSteensAAResult::CFLSteensAAResult(const TargetLibraryInfo &TLI)
+ : AAResultBase(), TLI(TLI) {}
+CFLSteensAAResult::CFLSteensAAResult(CFLSteensAAResult &&Arg)
+ : AAResultBase(std::move(Arg)), TLI(Arg.TLI) {}
+CFLSteensAAResult::~CFLSteensAAResult() {}
+
+/// Information we have about a function and would like to keep around.
+class CFLSteensAAResult::FunctionInfo {
+ StratifiedSets<InstantiatedValue> Sets;
+ AliasSummary Summary;
+
+public:
+ FunctionInfo(Function &Fn, const SmallVectorImpl<Value *> &RetVals,
+ StratifiedSets<InstantiatedValue> S);
+
+ const StratifiedSets<InstantiatedValue> &getStratifiedSets() const {
+ return Sets;
+ }
+ const AliasSummary &getAliasSummary() const { return Summary; }
+};
+
+/// Try to go from a Value* to a Function*. May return None (e.g. for globals),
+/// but never a null Function*.
+static Optional<Function *> parentFunctionOfValue(Value *);
+
+const StratifiedIndex StratifiedLink::SetSentinel =
+ std::numeric_limits<StratifiedIndex>::max();
+
+//===----------------------------------------------------------------------===//
+// Function declarations that require types defined above
+//===----------------------------------------------------------------------===//
+
+/// Determines whether it would be pointless to add the given Value to our sets.
+static bool canSkipAddingToSets(Value *Val);
+
+static Optional<Function *> parentFunctionOfValue(Value *Val) {
+ if (auto *Inst = dyn_cast<Instruction>(Val)) {
+ auto *Bb = Inst->getParent();
+ return Bb->getParent();
+ }
+
+ if (auto *Arg = dyn_cast<Argument>(Val))
+ return Arg->getParent();
+ return None;
+}
+
+static bool canSkipAddingToSets(Value *Val) {
+ // Constants can share instances, which may falsely unify multiple
+ // sets, e.g. in
+ // store i32* null, i32** %ptr1
+ // store i32* null, i32** %ptr2
+ // clearly ptr1 and ptr2 should not be unified into the same set, so
+  // we should filter out the (potentially shared) instance of
+ // i32* null.
+ if (isa<Constant>(Val)) {
+ // TODO: Because all of these things are constant, we can determine whether
+ // the data is *actually* mutable at graph building time. This will probably
+ // come for free/cheap with offset awareness.
+ bool CanStoreMutableData = isa<GlobalValue>(Val) ||
+ isa<ConstantExpr>(Val) ||
+ isa<ConstantAggregate>(Val);
+ return !CanStoreMutableData;
+ }
+
+ return false;
+}
+
+CFLSteensAAResult::FunctionInfo::FunctionInfo(
+ Function &Fn, const SmallVectorImpl<Value *> &RetVals,
+ StratifiedSets<InstantiatedValue> S)
+ : Sets(std::move(S)) {
+ // Historically, an arbitrary upper-bound of 50 args was selected. We may want
+ // to remove this if it doesn't really matter in practice.
+ if (Fn.arg_size() > MaxSupportedArgsInSummary)
+ return;
+
+ DenseMap<StratifiedIndex, InterfaceValue> InterfaceMap;
+
+ // Our intention here is to record all InterfaceValues that share the same
+ // StratifiedIndex in RetParamRelations. For each valid InterfaceValue, we
+  // scan its StratifiedIndex here and check whether the index is present
+ // in InterfaceMap: if it is not, we add the correspondence to the map;
+ // otherwise, an aliasing relation is found and we add it to
+ // RetParamRelations.
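+  //
+  // Hypothetical example (not drawn from a test): for
+  //   void f(int **P, int *Q) { *P = Q; }
+  // the pointee of P and Q end up unified, so interface value {1, 1} (first
+  // parameter, one dereference level down) and {2, 0} (second parameter)
+  // share a StratifiedIndex, and a single ExternalRelation between them is
+  // recorded. Index 0 is reserved for the return value, hence the I + 1 below.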
+
+ auto AddToRetParamRelations = [&](unsigned InterfaceIndex,
+ StratifiedIndex SetIndex) {
+ unsigned Level = 0;
+ while (true) {
+ InterfaceValue CurrValue{InterfaceIndex, Level};
+
+ auto Itr = InterfaceMap.find(SetIndex);
+ if (Itr != InterfaceMap.end()) {
+ if (CurrValue != Itr->second)
+ Summary.RetParamRelations.push_back(
+ ExternalRelation{CurrValue, Itr->second});
+ break;
+ }
+
+ auto &Link = Sets.getLink(SetIndex);
+ InterfaceMap.insert(std::make_pair(SetIndex, CurrValue));
+ auto ExternalAttrs = getExternallyVisibleAttrs(Link.Attrs);
+ if (ExternalAttrs.any())
+ Summary.RetParamAttributes.push_back(
+ ExternalAttribute{CurrValue, ExternalAttrs});
+
+ if (!Link.hasBelow())
+ break;
+
+ ++Level;
+ SetIndex = Link.Below;
+ }
+ };
+
+ // Populate RetParamRelations for return values
+ for (auto *RetVal : RetVals) {
+ assert(RetVal != nullptr);
+ assert(RetVal->getType()->isPointerTy());
+ auto RetInfo = Sets.find(InstantiatedValue{RetVal, 0});
+ if (RetInfo.hasValue())
+ AddToRetParamRelations(0, RetInfo->Index);
+ }
+
+ // Populate RetParamRelations for parameters
+ unsigned I = 0;
+ for (auto &Param : Fn.args()) {
+ if (Param.getType()->isPointerTy()) {
+ auto ParamInfo = Sets.find(InstantiatedValue{&Param, 0});
+ if (ParamInfo.hasValue())
+ AddToRetParamRelations(I + 1, ParamInfo->Index);
+ }
+ ++I;
+ }
+}
+
+// Builds the graph + StratifiedSets for a function.
+CFLSteensAAResult::FunctionInfo CFLSteensAAResult::buildSetsFrom(Function *Fn) {
+ CFLGraphBuilder<CFLSteensAAResult> GraphBuilder(*this, TLI, *Fn);
+ StratifiedSetsBuilder<InstantiatedValue> SetBuilder;
+
+ // Add all CFLGraph nodes and all Dereference edges to StratifiedSets
+ auto &Graph = GraphBuilder.getCFLGraph();
+ for (const auto &Mapping : Graph.value_mappings()) {
+ auto Val = Mapping.first;
+ if (canSkipAddingToSets(Val))
+ continue;
+ auto &ValueInfo = Mapping.second;
+
+ assert(ValueInfo.getNumLevels() > 0);
+ SetBuilder.add(InstantiatedValue{Val, 0});
+ SetBuilder.noteAttributes(InstantiatedValue{Val, 0},
+ ValueInfo.getNodeInfoAtLevel(0).Attr);
+ for (unsigned I = 0, E = ValueInfo.getNumLevels() - 1; I < E; ++I) {
+ SetBuilder.add(InstantiatedValue{Val, I + 1});
+ SetBuilder.noteAttributes(InstantiatedValue{Val, I + 1},
+ ValueInfo.getNodeInfoAtLevel(I + 1).Attr);
+ SetBuilder.addBelow(InstantiatedValue{Val, I},
+ InstantiatedValue{Val, I + 1});
+ }
+ }
+
+ // Add all assign edges to StratifiedSets
+ for (const auto &Mapping : Graph.value_mappings()) {
+ auto Val = Mapping.first;
+ if (canSkipAddingToSets(Val))
+ continue;
+ auto &ValueInfo = Mapping.second;
+
+ for (unsigned I = 0, E = ValueInfo.getNumLevels(); I < E; ++I) {
+ auto Src = InstantiatedValue{Val, I};
+ for (auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges)
+ SetBuilder.addWith(Src, Edge.Other);
+ }
+ }
+
+ return FunctionInfo(*Fn, GraphBuilder.getReturnValues(), SetBuilder.build());
+}
+
+void CFLSteensAAResult::scan(Function *Fn) {
+ auto InsertPair = Cache.insert(std::make_pair(Fn, Optional<FunctionInfo>()));
+ (void)InsertPair;
+ assert(InsertPair.second &&
+ "Trying to scan a function that has already been cached");
+
+ // Note that we can't do Cache[Fn] = buildSetsFrom(Fn) here: the function call
+ // may get evaluated after operator[], potentially triggering a DenseMap
+  // resize and invalidating the reference returned by operator[].
+ auto FunInfo = buildSetsFrom(Fn);
+ Cache[Fn] = std::move(FunInfo);
+
+ Handles.push_front(FunctionHandle(Fn, this));
+}
+
+void CFLSteensAAResult::evict(Function *Fn) { Cache.erase(Fn); }
+
+/// Ensures that the given function is available in the cache, and returns the
+/// entry.
+const Optional<CFLSteensAAResult::FunctionInfo> &
+CFLSteensAAResult::ensureCached(Function *Fn) {
+ auto Iter = Cache.find(Fn);
+ if (Iter == Cache.end()) {
+ scan(Fn);
+ Iter = Cache.find(Fn);
+ assert(Iter != Cache.end());
+ assert(Iter->second.hasValue());
+ }
+ return Iter->second;
+}
+
+const AliasSummary *CFLSteensAAResult::getAliasSummary(Function &Fn) {
+ auto &FunInfo = ensureCached(&Fn);
+ if (FunInfo.hasValue())
+ return &FunInfo->getAliasSummary();
+ else
+ return nullptr;
+}
+
+AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ auto *ValA = const_cast<Value *>(LocA.Ptr);
+ auto *ValB = const_cast<Value *>(LocB.Ptr);
+
+ if (!ValA->getType()->isPointerTy() || !ValB->getType()->isPointerTy())
+ return NoAlias;
+
+ Function *Fn = nullptr;
+ auto MaybeFnA = parentFunctionOfValue(ValA);
+ auto MaybeFnB = parentFunctionOfValue(ValB);
+ if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) {
+ // The only times this is known to happen are when globals + InlineAsm are
+ // involved
+ DEBUG(dbgs()
+ << "CFLSteensAA: could not extract parent function information.\n");
+ return MayAlias;
+ }
+
+ if (MaybeFnA.hasValue()) {
+ Fn = *MaybeFnA;
+ assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) &&
+ "Interprocedural queries not supported");
+ } else {
+ Fn = *MaybeFnB;
+ }
+
+ assert(Fn != nullptr);
+ auto &MaybeInfo = ensureCached(Fn);
+ assert(MaybeInfo.hasValue());
+
+ auto &Sets = MaybeInfo->getStratifiedSets();
+ auto MaybeA = Sets.find(InstantiatedValue{ValA, 0});
+ if (!MaybeA.hasValue())
+ return MayAlias;
+
+ auto MaybeB = Sets.find(InstantiatedValue{ValB, 0});
+ if (!MaybeB.hasValue())
+ return MayAlias;
+
+ auto SetA = *MaybeA;
+ auto SetB = *MaybeB;
+ auto AttrsA = Sets.getLink(SetA.Index).Attrs;
+ auto AttrsB = Sets.getLink(SetB.Index).Attrs;
+
+ // If both values are local (meaning the corresponding set has attribute
+ // AttrNone or AttrEscaped), then we know that CFLSteensAA fully models them:
+ // they may-alias each other if and only if they are in the same set.
+  // If at least one value is non-local (meaning it is either a global/argument
+  // or comes from an unknown source such as an integer cast), the situation
+  // becomes a bit more interesting. We follow three general rules described
+  // below:
+ // - Non-local values may alias each other
+ // - AttrNone values do not alias any non-local values
+ // - AttrEscaped do not alias globals/arguments, but they may alias
+ // AttrUnknown values
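+  //
+  // Informal example of the rules (hypothetical attribute combinations): two
+  // values in different sets where one is AttrEscaped and the other is
+  // AttrGlobal fall through every check below and yield NoAlias, whereas
+  // either of them against an AttrUnknown value returns MayAlias at the
+  // hasUnknownOrCallerAttr check.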
+ if (SetA.Index == SetB.Index)
+ return MayAlias;
+ if (AttrsA.none() || AttrsB.none())
+ return NoAlias;
+ if (hasUnknownOrCallerAttr(AttrsA) || hasUnknownOrCallerAttr(AttrsB))
+ return MayAlias;
+ if (isGlobalOrArgAttr(AttrsA) && isGlobalOrArgAttr(AttrsB))
+ return MayAlias;
+ return NoAlias;
+}
+
+ModRefInfo CFLSteensAAResult::getArgModRefInfo(ImmutableCallSite CS,
+ unsigned ArgIdx) {
+ if (auto CalledFunc = CS.getCalledFunction()) {
+ auto &MaybeInfo = ensureCached(const_cast<Function *>(CalledFunc));
+ if (!MaybeInfo.hasValue())
+ return MRI_ModRef;
+ auto &RetParamAttributes = MaybeInfo->getAliasSummary().RetParamAttributes;
+ auto &RetParamRelations = MaybeInfo->getAliasSummary().RetParamRelations;
+
+ bool ArgAttributeIsWritten =
+ std::any_of(RetParamAttributes.begin(), RetParamAttributes.end(),
+ [ArgIdx](const ExternalAttribute &ExtAttr) {
+ return ExtAttr.IValue.Index == ArgIdx + 1;
+ });
+ bool ArgIsAccessed =
+ std::any_of(RetParamRelations.begin(), RetParamRelations.end(),
+ [ArgIdx](const ExternalRelation &ExtRelation) {
+ return ExtRelation.To.Index == ArgIdx + 1 ||
+ ExtRelation.From.Index == ArgIdx + 1;
+ });
+
+ return (!ArgIsAccessed && !ArgAttributeIsWritten) ? MRI_NoModRef
+ : MRI_ModRef;
+ }
+
+ return MRI_ModRef;
+}
+
+FunctionModRefBehavior
+CFLSteensAAResult::getModRefBehavior(ImmutableCallSite CS) {
+ // If we know the callee, try analyzing it
+ if (auto CalledFunc = CS.getCalledFunction())
+ return getModRefBehavior(CalledFunc);
+
+ // Otherwise, be conservative
+ return FMRB_UnknownModRefBehavior;
+}
+
+FunctionModRefBehavior CFLSteensAAResult::getModRefBehavior(const Function *F) {
+ assert(F != nullptr);
+
+ // TODO: Remove the const_cast
+ auto &MaybeInfo = ensureCached(const_cast<Function *>(F));
+ if (!MaybeInfo.hasValue())
+ return FMRB_UnknownModRefBehavior;
+ auto &RetParamAttributes = MaybeInfo->getAliasSummary().RetParamAttributes;
+ auto &RetParamRelations = MaybeInfo->getAliasSummary().RetParamRelations;
+
+  // First, if any argument is marked Escaped, Unknown or Global, anything may
+  // happen to it and thus we can't draw any conclusion.
+ if (!RetParamAttributes.empty())
+ return FMRB_UnknownModRefBehavior;
+
+ // Currently we don't (and can't) distinguish reads from writes in
+ // RetParamRelations. All we can say is whether there may be memory access or
+ // not.
+ if (RetParamRelations.empty())
+ return FMRB_DoesNotAccessMemory;
+
+ // Check if something beyond argmem gets touched.
+ bool AccessArgMemoryOnly =
+ std::all_of(RetParamRelations.begin(), RetParamRelations.end(),
+ [](const ExternalRelation &ExtRelation) {
+                    // Both DerefLevels have to be 0, since we don't know which
+ // one is a read and which is a write.
+ return ExtRelation.From.DerefLevel == 0 &&
+ ExtRelation.To.DerefLevel == 0;
+ });
+ return AccessArgMemoryOnly ? FMRB_OnlyAccessesArgumentPointees
+ : FMRB_UnknownModRefBehavior;
+}
+
+char CFLSteensAA::PassID;
+
+CFLSteensAAResult CFLSteensAA::run(Function &F, AnalysisManager<Function> &AM) {
+ return CFLSteensAAResult(AM.getResult<TargetLibraryAnalysis>(F));
+}
+
+char CFLSteensAAWrapperPass::ID = 0;
+INITIALIZE_PASS(CFLSteensAAWrapperPass, "cfl-steens-aa",
+ "Unification-Based CFL Alias Analysis", false, true)
+
+ImmutablePass *llvm::createCFLSteensAAWrapperPass() {
+ return new CFLSteensAAWrapperPass();
+}
+
+CFLSteensAAWrapperPass::CFLSteensAAWrapperPass() : ImmutablePass(ID) {
+ initializeCFLSteensAAWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void CFLSteensAAWrapperPass::initializePass() {
+ auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
+ Result.reset(new CFLSteensAAResult(TLIWP.getTLI()));
+}
+
+void CFLSteensAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 4a03002e510b..f6f30bb927a5 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -8,65 +8,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
-char CGSCCAnalysisManagerModuleProxy::PassID;
-
-CGSCCAnalysisManagerModuleProxy::Result
-CGSCCAnalysisManagerModuleProxy::run(Module &M) {
- assert(CGAM->empty() && "CGSCC analyses ran prior to the module proxy!");
- return Result(*CGAM);
-}
-
-CGSCCAnalysisManagerModuleProxy::Result::~Result() {
- // Clear out the analysis manager if we're being destroyed -- it means we
- // didn't even see an invalidate call when we got invalidated.
- CGAM->clear();
-}
-
-bool CGSCCAnalysisManagerModuleProxy::Result::invalidate(
- Module &M, const PreservedAnalyses &PA) {
- // If this proxy isn't marked as preserved, then we can't even invalidate
- // individual CGSCC analyses, there may be an invalid set of SCC objects in
- // the cache making it impossible to incrementally preserve them.
- // Just clear the entire manager.
- if (!PA.preserved(ID()))
- CGAM->clear();
-
- // Return false to indicate that this result is still a valid proxy.
- return false;
-}
-
-char ModuleAnalysisManagerCGSCCProxy::PassID;
-
-char FunctionAnalysisManagerCGSCCProxy::PassID;
-
-FunctionAnalysisManagerCGSCCProxy::Result
-FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC &C) {
- assert(FAM->empty() && "Function analyses ran prior to the CGSCC proxy!");
- return Result(*FAM);
-}
-
-FunctionAnalysisManagerCGSCCProxy::Result::~Result() {
- // Clear out the analysis manager if we're being destroyed -- it means we
- // didn't even see an invalidate call when we got invalidated.
- FAM->clear();
+// Explicit instantiations for the core proxy templates.
+namespace llvm {
+template class PassManager<LazyCallGraph::SCC>;
+template class AnalysisManager<LazyCallGraph::SCC>;
+template class InnerAnalysisManagerProxy<CGSCCAnalysisManager, Module>;
+template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
+ LazyCallGraph::SCC>;
+template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
+ LazyCallGraph::SCC>;
+template class OuterAnalysisManagerProxy<CGSCCAnalysisManager, Function>;
}
-
-bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
- LazyCallGraph::SCC &C, const PreservedAnalyses &PA) {
- // If this proxy isn't marked as preserved, then we can't even invalidate
- // individual function analyses, there may be an invalid set of Function
- // objects in the cache making it impossible to incrementally preserve them.
- // Just clear the entire manager.
- if (!PA.preserved(ID()))
- FAM->clear();
-
- // Return false to indicate that this result is still a valid proxy.
- return false;
-}
-
-char CGSCCAnalysisManagerFunctionProxy::PassID;
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 69623619a8b0..57ad437ef4fd 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMAnalysis
AliasAnalysis.cpp
AliasAnalysisEvaluator.cpp
+ AliasAnalysisSummary.cpp
AliasSetTracker.cpp
Analysis.cpp
AssumptionCache.cpp
@@ -10,7 +11,8 @@ add_llvm_library(LLVMAnalysis
BranchProbabilityInfo.cpp
CFG.cpp
CFGPrinter.cpp
- CFLAliasAnalysis.cpp
+ CFLAndersAliasAnalysis.cpp
+ CFLSteensAliasAnalysis.cpp
CGSCCPassManager.cpp
CallGraph.cpp
CallGraphSCCPass.cpp
@@ -28,31 +30,38 @@ add_llvm_library(LLVMAnalysis
EHPersonalities.cpp
GlobalsModRef.cpp
IVUsers.cpp
+ IndirectCallPromotionAnalysis.cpp
InlineCost.cpp
InstCount.cpp
InstructionSimplify.cpp
Interval.cpp
IntervalPartition.cpp
IteratedDominanceFrontier.cpp
+ LazyBlockFrequencyInfo.cpp
LazyCallGraph.cpp
LazyValueInfo.cpp
Lint.cpp
Loads.cpp
LoopAccessAnalysis.cpp
+ LoopUnrollAnalyzer.cpp
LoopInfo.cpp
LoopPass.cpp
+ LoopPassManager.cpp
MemDepPrinter.cpp
MemDerefPrinter.cpp
MemoryBuiltins.cpp
MemoryDependenceAnalysis.cpp
MemoryLocation.cpp
ModuleDebugInfoPrinter.cpp
+ ModuleSummaryAnalysis.cpp
ObjCARCAliasAnalysis.cpp
ObjCARCAnalysisUtils.cpp
ObjCARCInstKind.cpp
+ OptimizationDiagnosticInfo.cpp
OrderedBasicBlock.cpp
PHITransAddr.cpp
PostDominators.cpp
+ ProfileSummaryInfo.cpp
PtrUseVisitor.cpp
RegionInfo.cpp
RegionPass.cpp
@@ -66,6 +75,7 @@ add_llvm_library(LLVMAnalysis
TargetTransformInfo.cpp
Trace.cpp
TypeBasedAliasAnalysis.cpp
+ TypeMetadataUtils.cpp
ScopedNoAliasAA.cpp
ValueTracking.cpp
VectorUtils.cpp
diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp
index 7cec962678e8..39cb86d2ccb1 100644
--- a/lib/Analysis/CallGraph.cpp
+++ b/lib/Analysis/CallGraph.cpp
@@ -80,11 +80,9 @@ void CallGraph::addToCallGraph(Function *F) {
Node->addCalledFunction(CallSite(), CallsExternalNode.get());
// Look for calls by this function.
- for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
- ++II) {
- CallSite CS(cast<Value>(II));
- if (CS) {
+ for (BasicBlock &BB : *F)
+ for (Instruction &I : BB) {
+ if (auto CS = CallSite(&I)) {
const Function *Callee = CS.getCalledFunction();
if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
// Indirect calls of intrinsics are not allowed so no need to check.
@@ -111,8 +109,8 @@ void CallGraph::print(raw_ostream &OS) const {
SmallVector<CallGraphNode *, 16> Nodes;
Nodes.reserve(FunctionMap.size());
- for (auto I = begin(), E = end(); I != E; ++I)
- Nodes.push_back(I->second.get());
+ for (const auto &I : *this)
+ Nodes.push_back(I.second.get());
std::sort(Nodes.begin(), Nodes.end(),
[](CallGraphNode *LHS, CallGraphNode *RHS) {
@@ -186,9 +184,9 @@ void CallGraphNode::print(raw_ostream &OS) const {
OS << "<<" << this << ">> #uses=" << getNumReferences() << '\n';
- for (const_iterator I = begin(), E = end(); I != E; ++I) {
- OS << " CS<" << I->first << "> calls ";
- if (Function *FI = I->second->getFunction())
+ for (const auto &I : *this) {
+ OS << " CS<" << I.first << "> calls ";
+ if (Function *FI = I.second->getFunction())
OS << "function '" << FI->getName() <<"'\n";
else
OS << "external node\n";
@@ -259,12 +257,19 @@ void CallGraphNode::replaceCallEdge(CallSite CS,
}
}
+// Provide an out-of-line definition for the analysis' static PassID.
+char CallGraphAnalysis::PassID;
+
+PreservedAnalyses CallGraphPrinterPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+ AM.getResult<CallGraphAnalysis>(M).print(OS);
+ return PreservedAnalyses::all();
+}
+
//===----------------------------------------------------------------------===//
// Out-of-line definitions of CallGraphAnalysis class members.
//
-char CallGraphAnalysis::PassID;
-
//===----------------------------------------------------------------------===//
// Implementations of the CallGraphWrapperPass class methods.
//
@@ -304,3 +309,29 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const {
LLVM_DUMP_METHOD
void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); }
+
+namespace {
+struct CallGraphPrinterLegacyPass : public ModulePass {
+ static char ID; // Pass ID, replacement for typeid
+ CallGraphPrinterLegacyPass() : ModulePass(ID) {
+ initializeCallGraphPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<CallGraphWrapperPass>();
+ }
+ bool runOnModule(Module &M) override {
+ getAnalysis<CallGraphWrapperPass>().print(errs(), &M);
+ return false;
+ }
+};
+}
+
+char CallGraphPrinterLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CallGraphPrinterLegacyPass, "print-callgraph",
+ "Print a call graph", true, true)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(CallGraphPrinterLegacyPass, "print-callgraph",
+ "Print a call graph", true, true)
diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp
index 6dd1d0a066b6..69d767354785 100644
--- a/lib/Analysis/CallGraphSCCPass.cpp
+++ b/lib/Analysis/CallGraphSCCPass.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/IR/OptBisect.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Timer.h"
@@ -260,10 +261,10 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
// Loop over all of the instructions in the function, getting the callsites.
// Keep track of the number of direct/indirect calls added.
unsigned NumDirectAdded = 0, NumIndirectAdded = 0;
-
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- CallSite CS(cast<Value>(I));
+
+ for (BasicBlock &BB : *F)
+ for (Instruction &I : BB) {
+ CallSite CS(&I);
if (!CS) continue;
Function *Callee = CS.getCalledFunction();
if (Callee && Callee->isIntrinsic()) continue;
@@ -444,7 +445,7 @@ bool CGPassManager::runOnModule(Module &M) {
// Walk the callgraph in bottom-up SCC order.
scc_iterator<CallGraph*> CGI = scc_begin(&CG);
- CallGraphSCC CurSCC(&CGI);
+ CallGraphSCC CurSCC(CG, &CGI);
while (!CGI.isAtEnd()) {
// Copy the current SCC and increment past it so that the pass can hack
// on the SCC if it wants to without invalidating our iterator.
@@ -631,3 +632,13 @@ Pass *CallGraphSCCPass::createPrinterPass(raw_ostream &O,
return new PrintCallGraphPass(Banner, O);
}
+bool CallGraphSCCPass::skipSCC(CallGraphSCC &SCC) const {
+ return !SCC.getCallGraph().getModule()
+ .getContext()
+ .getOptBisect()
+ .shouldRunPass(this, SCC);
+}
+
+char DummyCGSCCPass::ID = 0;
+INITIALIZE_PASS(DummyCGSCCPass, "DummyCGSCCPass", "DummyCGSCCPass", false,
+ false)
diff --git a/lib/Analysis/CallPrinter.cpp b/lib/Analysis/CallPrinter.cpp
index 68dcd3c06427..af942e9ed3e9 100644
--- a/lib/Analysis/CallPrinter.cpp
+++ b/lib/Analysis/CallPrinter.cpp
@@ -58,16 +58,16 @@ struct CallGraphViewer
}
};
-struct CallGraphPrinter : public DOTGraphTraitsModulePrinter<
+struct CallGraphDOTPrinter : public DOTGraphTraitsModulePrinter<
CallGraphWrapperPass, true, CallGraph *,
AnalysisCallGraphWrapperPassTraits> {
static char ID;
- CallGraphPrinter()
+ CallGraphDOTPrinter()
: DOTGraphTraitsModulePrinter<CallGraphWrapperPass, true, CallGraph *,
AnalysisCallGraphWrapperPassTraits>(
"callgraph", ID) {
- initializeCallGraphPrinterPass(*PassRegistry::getPassRegistry());
+ initializeCallGraphDOTPrinterPass(*PassRegistry::getPassRegistry());
}
};
@@ -77,8 +77,8 @@ char CallGraphViewer::ID = 0;
INITIALIZE_PASS(CallGraphViewer, "view-callgraph", "View call graph", false,
false)
-char CallGraphPrinter::ID = 0;
-INITIALIZE_PASS(CallGraphPrinter, "dot-callgraph",
+char CallGraphDOTPrinter::ID = 0;
+INITIALIZE_PASS(CallGraphDOTPrinter, "dot-callgraph",
"Print call graph to 'dot' file", false, false)
// Create methods available outside of this file, to use them
@@ -87,6 +87,6 @@ INITIALIZE_PASS(CallGraphPrinter, "dot-callgraph",
ModulePass *llvm::createCallGraphViewerPass() { return new CallGraphViewer(); }
-ModulePass *llvm::createCallGraphPrinterPass() {
- return new CallGraphPrinter();
+ModulePass *llvm::createCallGraphDOTPrinterPass() {
+ return new CallGraphDOTPrinter();
}
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 1add2fa77566..9862c3c9c270 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;
@@ -242,6 +243,13 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy())
break;
+    // Volatile operations effectively capture the memory location that they
+    // load from or store to.
+ if (auto *MI = dyn_cast<MemIntrinsic>(I))
+ if (MI->isVolatile())
+ if (Tracker->captured(U))
+ return;
+
// Not captured if only passed via 'nocapture' arguments. Note that
// calling a function pointer does not in itself cause the pointer to
// be captured. This is a subtle point considering that (for example)
@@ -259,18 +267,46 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
break;
}
case Instruction::Load:
- // Loading from a pointer does not cause it to be captured.
+ // Volatile loads make the address observable.
+ if (cast<LoadInst>(I)->isVolatile())
+ if (Tracker->captured(U))
+ return;
break;
case Instruction::VAArg:
// "va-arg" from a pointer does not cause it to be captured.
break;
case Instruction::Store:
- if (V == I->getOperand(0))
// Stored the pointer - conservatively assume it may be captured.
+ // Volatile stores make the address observable.
+ if (V == I->getOperand(0) || cast<StoreInst>(I)->isVolatile())
+ if (Tracker->captured(U))
+ return;
+ break;
+ case Instruction::AtomicRMW: {
+ // atomicrmw conceptually includes both a load and store from
+ // the same location.
+ // As with a store, the location being accessed is not captured,
+ // but the value being stored is.
+ // Volatile stores make the address observable.
+ auto *ARMWI = cast<AtomicRMWInst>(I);
+ if (ARMWI->getValOperand() == V || ARMWI->isVolatile())
if (Tracker->captured(U))
return;
- // Storing to the pointee does not cause the pointer to be captured.
break;
+ }
+ case Instruction::AtomicCmpXchg: {
+ // cmpxchg conceptually includes both a load and store from
+ // the same location.
+ // As with a store, the location being accessed is not captured,
+ // but the value being stored is.
+ // Volatile stores make the address observable.
+ auto *ACXI = cast<AtomicCmpXchgInst>(I);
+ if (ACXI->getCompareOperand() == V || ACXI->getNewValOperand() == V ||
+ ACXI->isVolatile())
+ if (Tracker->captured(U))
+ return;
+ break;
+ }
case Instruction::BitCast:
case Instruction::GetElementPtr:
case Instruction::PHI:
@@ -289,7 +325,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
Worklist.push_back(&UU);
}
break;
- case Instruction::ICmp:
+ case Instruction::ICmp: {
// Don't count comparisons of a no-alias return value against null as
// captures. This allows us to ignore comparisons of malloc results
// with null, for example.
@@ -298,11 +334,19 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
if (CPN->getType()->getAddressSpace() == 0)
if (isNoAliasCall(V->stripPointerCasts()))
break;
+      // Comparison against a value loaded from a global variable. Given that
+      // the pointer does not escape, its value cannot have been guessed and
+      // stored separately in a global variable, so this is not a capture.
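+      // Illustrative IR sketch (hypothetical names): when %p is the tracked
+      // pointer, a comparison such as
+      //   %v = load i8*, i8** @g
+      //   %c = icmp eq i8* %p, %v
+      // is therefore not treated as a capture of %p.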
+ unsigned OtherIndex = (I->getOperand(0) == V) ? 1 : 0;
+ auto *LI = dyn_cast<LoadInst>(I->getOperand(OtherIndex));
+ if (LI && isa<GlobalVariable>(LI->getPointerOperand()))
+ break;
// Otherwise, be conservative. There are crazy ways to capture pointers
// using comparisons.
if (Tracker->captured(U))
return;
break;
+ }
default:
// Something else - be conservative and say it is captured.
if (Tracker->captured(U))
diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index 4090b4cd752b..ed8370498dd0 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp
@@ -100,22 +100,21 @@ void CodeMetrics::collectEphemeralValues(
completeEphemeralValues(WorkSet, EphValues);
}
-/// analyzeBasicBlock - Fill in the current structure with information gleaned
-/// from the specified block.
+/// Fill in the current structure with information gleaned from the specified
+/// block.
void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
const TargetTransformInfo &TTI,
SmallPtrSetImpl<const Value*> &EphValues) {
++NumBlocks;
unsigned NumInstsBeforeThisBB = NumInsts;
- for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
- II != E; ++II) {
+ for (const Instruction &I : *BB) {
// Skip ephemeral values.
- if (EphValues.count(&*II))
+ if (EphValues.count(&I))
continue;
// Special handling for calls.
- if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
- ImmutableCallSite CS(cast<Instruction>(II));
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ ImmutableCallSite CS(&I);
if (const Function *F = CS.getCalledFunction()) {
// If a function is both internal and has a single use, then it is
@@ -141,26 +140,29 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
}
}
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (!AI->isStaticAlloca())
this->usesDynamicAlloca = true;
}
- if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy())
+ if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
++NumVectorInsts;
- if (II->getType()->isTokenTy() && II->isUsedOutsideOfBlock(BB))
+ if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
notDuplicatable = true;
- if (const CallInst *CI = dyn_cast<CallInst>(II))
+ if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
if (CI->cannotDuplicate())
notDuplicatable = true;
+ if (CI->isConvergent())
+ convergent = true;
+ }
- if (const InvokeInst *InvI = dyn_cast<InvokeInst>(II))
+ if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
if (InvI->cannotDuplicate())
notDuplicatable = true;
- NumInsts += TTI.getUserCost(&*II);
+ NumInsts += TTI.getUserCost(&I);
}
if (isa<ReturnInst>(BB->getTerminator()))
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index ccb56631b846..6c471ab45048 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -34,15 +35,16 @@
#include "llvm/IR/Operator.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <cassert>
#include <cerrno>
+#include <cfenv>
#include <cmath>
-
-#ifdef HAVE_FENV_H
-#include <fenv.h>
-#endif
+#include <limits>
using namespace llvm;
+namespace {
+
//===----------------------------------------------------------------------===//
// Constant Folding internal helper functions
//===----------------------------------------------------------------------===//
@@ -50,7 +52,7 @@ using namespace llvm;
/// Constant fold bitcast, symbolically evaluating it with DataLayout.
/// This always returns a non-null constant, but it may be a
/// ConstantExpr if unfoldable.
-static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
+Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
// Catch the obvious splat cases.
if (C->isNullValue() && !DestTy->isX86_MMXTy())
return Constant::getNullValue(DestTy);
@@ -59,8 +61,8 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
return Constant::getAllOnesValue(DestTy);
// Handle a vector->integer cast.
- if (IntegerType *IT = dyn_cast<IntegerType>(DestTy)) {
- VectorType *VTy = dyn_cast<VectorType>(C->getType());
+ if (auto *IT = dyn_cast<IntegerType>(DestTy)) {
+ auto *VTy = dyn_cast<VectorType>(C->getType());
if (!VTy)
return ConstantExpr::getBitCast(C, DestTy);
@@ -77,27 +79,30 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
C = ConstantExpr::getBitCast(C, SrcIVTy);
}
- ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C);
- if (!CDV)
- return ConstantExpr::getBitCast(C, DestTy);
-
// Now that we know that the input value is a vector of integers, just shift
// and insert them into our result.
- unsigned BitShift = DL.getTypeAllocSizeInBits(SrcEltTy);
+ unsigned BitShift = DL.getTypeSizeInBits(SrcEltTy);
APInt Result(IT->getBitWidth(), 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
- Result <<= BitShift;
+ Constant *Element;
if (DL.isLittleEndian())
- Result |= CDV->getElementAsInteger(NumSrcElts-i-1);
+ Element = C->getAggregateElement(NumSrcElts-i-1);
else
- Result |= CDV->getElementAsInteger(i);
+ Element = C->getAggregateElement(i);
+
+ auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
+ if (!ElementCI)
+ return ConstantExpr::getBitCast(C, DestTy);
+
+ Result <<= BitShift;
+ Result |= ElementCI->getValue().zextOrSelf(IT->getBitWidth());
}
return ConstantInt::get(IT, Result);
}
// The code below only handles casts to vectors currently.
- VectorType *DestVTy = dyn_cast<VectorType>(DestTy);
+ auto *DestVTy = dyn_cast<VectorType>(DestTy);
if (!DestVTy)
return ConstantExpr::getBitCast(C, DestTy);
@@ -175,7 +180,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
Constant *Elt = Zero;
unsigned ShiftAmt = isLittleEndian ? 0 : SrcBitSize*(Ratio-1);
for (unsigned j = 0; j != Ratio; ++j) {
- Constant *Src =dyn_cast<ConstantInt>(C->getAggregateElement(SrcElt++));
+ Constant *Src = dyn_cast<ConstantInt>(C->getAggregateElement(SrcElt++));
if (!Src) // Reject constantexpr elements.
return ConstantExpr::getBitCast(C, DestTy);
@@ -201,7 +206,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
// Loop over each source value, expanding into multiple results.
for (unsigned i = 0; i != NumSrcElt; ++i) {
- Constant *Src = dyn_cast<ConstantInt>(C->getAggregateElement(i));
+ auto *Src = dyn_cast<ConstantInt>(C->getAggregateElement(i));
if (!Src) // Reject constantexpr elements.
return ConstantExpr::getBitCast(C, DestTy);
@@ -230,11 +235,12 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
return ConstantVector::get(Result);
}
+} // end anonymous namespace
/// If this constant is a constant offset from a global, return the global and
/// the constant. Because of constantexprs, this function is recursive.
-static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
- APInt &Offset, const DataLayout &DL) {
+bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
+ APInt &Offset, const DataLayout &DL) {
// Trivial case, constant is the global.
if ((GV = dyn_cast<GlobalValue>(C))) {
unsigned BitWidth = DL.getPointerTypeSizeInBits(GV->getType());
@@ -243,7 +249,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
}
// Otherwise, if this isn't a constant expr, bail out.
- ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+ auto *CE = dyn_cast<ConstantExpr>(C);
if (!CE) return false;
// Look through ptr->int and ptr->ptr casts.
@@ -252,7 +258,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL);
// i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
- GEPOperator *GEP = dyn_cast<GEPOperator>(CE);
+ auto *GEP = dyn_cast<GEPOperator>(CE);
if (!GEP)
return false;
@@ -271,13 +277,14 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
return true;
}
+namespace {
+
/// Recursive helper to read bits out of global. C is the constant being copied
/// out of. ByteOffset is an offset into C. CurPtr is the pointer to copy
/// results into and BytesLeft is the number of bytes left in
/// the CurPtr buffer. DL is the DataLayout.
-static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
- unsigned char *CurPtr, unsigned BytesLeft,
- const DataLayout &DL) {
+bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
+ unsigned BytesLeft, const DataLayout &DL) {
assert(ByteOffset <= DL.getTypeAllocSize(C->getType()) &&
"Out of range access");
@@ -286,7 +293,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
return true;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+ if (auto *CI = dyn_cast<ConstantInt>(C)) {
if (CI->getBitWidth() > 64 ||
(CI->getBitWidth() & 7) != 0)
return false;
@@ -304,7 +311,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
return true;
}
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+ if (auto *CFP = dyn_cast<ConstantFP>(C)) {
if (CFP->getType()->isDoubleTy()) {
C = FoldBitCast(C, Type::getInt64Ty(C->getContext()), DL);
return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, DL);
@@ -320,7 +327,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
return false;
}
- if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+ if (auto *CS = dyn_cast<ConstantStruct>(C)) {
const StructLayout *SL = DL.getStructLayout(CS->getType());
unsigned Index = SL->getElementContainingOffset(ByteOffset);
uint64_t CurEltOffset = SL->getElementOffset(Index);
@@ -364,7 +371,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
uint64_t Index = ByteOffset / EltSize;
uint64_t Offset = ByteOffset - Index * EltSize;
uint64_t NumElts;
- if (ArrayType *AT = dyn_cast<ArrayType>(C->getType()))
+ if (auto *AT = dyn_cast<ArrayType>(C->getType()))
NumElts = AT->getNumElements();
else
NumElts = C->getType()->getVectorNumElements();
@@ -386,7 +393,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
return true;
}
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
if (CE->getOpcode() == Instruction::IntToPtr &&
CE->getOperand(0)->getType() == DL.getIntPtrType(CE->getType())) {
return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
@@ -398,11 +405,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
return false;
}
-static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
- const DataLayout &DL) {
- PointerType *PTy = cast<PointerType>(C->getType());
- Type *LoadTy = PTy->getElementType();
- IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
+Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
+ const DataLayout &DL) {
+ auto *PTy = cast<PointerType>(C->getType());
+ auto *IntType = dyn_cast<IntegerType>(LoadTy);
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
@@ -414,19 +420,19 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
// an actual new load.
Type *MapTy;
if (LoadTy->isHalfTy())
- MapTy = Type::getInt16PtrTy(C->getContext(), AS);
+ MapTy = Type::getInt16Ty(C->getContext());
else if (LoadTy->isFloatTy())
- MapTy = Type::getInt32PtrTy(C->getContext(), AS);
+ MapTy = Type::getInt32Ty(C->getContext());
else if (LoadTy->isDoubleTy())
- MapTy = Type::getInt64PtrTy(C->getContext(), AS);
+ MapTy = Type::getInt64Ty(C->getContext());
else if (LoadTy->isVectorTy()) {
- MapTy = PointerType::getIntNPtrTy(C->getContext(),
- DL.getTypeAllocSizeInBits(LoadTy), AS);
+ MapTy = PointerType::getIntNTy(C->getContext(),
+ DL.getTypeAllocSizeInBits(LoadTy));
} else
return nullptr;
- C = FoldBitCast(C, MapTy, DL);
- if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, DL))
+ C = FoldBitCast(C, MapTy->getPointerTo(AS), DL);
+ if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, MapTy, DL))
return FoldBitCast(Res, LoadTy, DL);
return nullptr;
}
@@ -436,28 +442,38 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
return nullptr;
GlobalValue *GVal;
- APInt Offset;
- if (!IsConstantOffsetFromGlobal(C, GVal, Offset, DL))
+ APInt OffsetAI;
+ if (!IsConstantOffsetFromGlobal(C, GVal, OffsetAI, DL))
return nullptr;
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GVal);
+ auto *GV = dyn_cast<GlobalVariable>(GVal);
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
!GV->getInitializer()->getType()->isSized())
return nullptr;
- // If we're loading off the beginning of the global, some bytes may be valid,
- // but we don't try to handle this.
- if (Offset.isNegative())
- return nullptr;
+ int64_t Offset = OffsetAI.getSExtValue();
+ int64_t InitializerSize = DL.getTypeAllocSize(GV->getInitializer()->getType());
+
+ // If we're not accessing anything in this constant, the result is undefined.
+ if (Offset + BytesLoaded <= 0)
+ return UndefValue::get(IntType);
// If we're not accessing anything in this constant, the result is undefined.
- if (Offset.getZExtValue() >=
- DL.getTypeAllocSize(GV->getInitializer()->getType()))
+ if (Offset >= InitializerSize)
return UndefValue::get(IntType);
unsigned char RawBytes[32] = {0};
- if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes,
- BytesLoaded, DL))
+ unsigned char *CurPtr = RawBytes;
+ unsigned BytesLeft = BytesLoaded;
+
+ // If we're loading off the beginning of the global, some bytes may be valid.
+ if (Offset < 0) {
+ CurPtr += -Offset;
+ BytesLeft += Offset;
+ Offset = 0;
+ }
+
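+  // Illustrative sketch with made-up numbers: for Offset = -2 and
+  // BytesLoaded = 4, CurPtr is advanced by 2 and BytesLeft drops to 2, so the
+  // first two result bytes stay zero (padding before the global) and only two
+  // bytes are read from the start of the initializer.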
+ if (!ReadDataFromGlobal(GV->getInitializer(), Offset, CurPtr, BytesLeft, DL))
return nullptr;
APInt ResultVal = APInt(IntType->getBitWidth(), 0);
@@ -478,14 +494,15 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
return ConstantInt::get(IntType->getContext(), ResultVal);
}
-static Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE,
- const DataLayout &DL) {
- auto *DestPtrTy = dyn_cast<PointerType>(CE->getType());
- if (!DestPtrTy)
+Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE, Type *DestTy,
+ const DataLayout &DL) {
+ auto *SrcPtr = CE->getOperand(0);
+ auto *SrcPtrTy = dyn_cast<PointerType>(SrcPtr->getType());
+ if (!SrcPtrTy)
return nullptr;
- Type *DestTy = DestPtrTy->getElementType();
+ Type *SrcTy = SrcPtrTy->getPointerElementType();
- Constant *C = ConstantFoldLoadFromConstPtr(CE->getOperand(0), DL);
+ Constant *C = ConstantFoldLoadFromConstPtr(SrcPtr, SrcTy, DL);
if (!C)
return nullptr;
@@ -522,26 +539,26 @@ static Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE,
return nullptr;
}
-/// Return the value that a load from C would produce if it is constant and
-/// determinable. If this is not determinable, return null.
-Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
+} // end anonymous namespace
+
+Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
const DataLayout &DL) {
// First, try the easy cases:
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ if (auto *GV = dyn_cast<GlobalVariable>(C))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
return GV->getInitializer();
if (auto *GA = dyn_cast<GlobalAlias>(C))
- if (GA->getAliasee() && !GA->mayBeOverridden())
- return ConstantFoldLoadFromConstPtr(GA->getAliasee(), DL);
+ if (GA->getAliasee() && !GA->isInterposable())
+ return ConstantFoldLoadFromConstPtr(GA->getAliasee(), Ty, DL);
// If the loaded value isn't a constant expr, we can't handle it.
- ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+ auto *CE = dyn_cast<ConstantExpr>(C);
if (!CE)
return nullptr;
if (CE->getOpcode() == Instruction::GetElementPtr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+ if (auto *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
if (Constant *V =
ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
@@ -551,15 +568,14 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
}
if (CE->getOpcode() == Instruction::BitCast)
- if (Constant *LoadedC = ConstantFoldLoadThroughBitcast(CE, DL))
+ if (Constant *LoadedC = ConstantFoldLoadThroughBitcast(CE, Ty, DL))
return LoadedC;
// Instead of loading constant c string, use corresponding integer value
// directly if string length is small enough.
StringRef Str;
if (getConstantStringInfo(CE, Str) && !Str.empty()) {
- unsigned StrLen = Str.size();
- Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+ size_t StrLen = Str.size();
unsigned NumBits = Ty->getPrimitiveSizeInBits();
// Replace load with immediate integer if the result is an integer or fp
// value.
@@ -568,13 +584,13 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
APInt StrVal(NumBits, 0);
APInt SingleChar(NumBits, 0);
if (DL.isLittleEndian()) {
- for (signed i = StrLen-1; i >= 0; i--) {
- SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+ for (unsigned char C : reverse(Str.bytes())) {
+ SingleChar = static_cast<uint64_t>(C);
StrVal = (StrVal << 8) | SingleChar;
}
} else {
- for (unsigned i = 0; i < StrLen; i++) {
- SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+ for (unsigned char C : Str.bytes()) {
+ SingleChar = static_cast<uint64_t>(C);
StrVal = (StrVal << 8) | SingleChar;
}
// Append NULL at the end.
@@ -591,27 +607,26 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
// If this load comes from anywhere in a constant global, and if the global
// is all undef or zero, we know what it loads.
- if (GlobalVariable *GV =
- dyn_cast<GlobalVariable>(GetUnderlyingObject(CE, DL))) {
+ if (auto *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(CE, DL))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
- Type *ResTy = cast<PointerType>(C->getType())->getElementType();
if (GV->getInitializer()->isNullValue())
- return Constant::getNullValue(ResTy);
+ return Constant::getNullValue(Ty);
if (isa<UndefValue>(GV->getInitializer()))
- return UndefValue::get(ResTy);
+ return UndefValue::get(Ty);
}
}
// Try hard to fold loads from bitcasted strange and non-type-safe things.
- return FoldReinterpretLoadFromConstPtr(CE, DL);
+ return FoldReinterpretLoadFromConstPtr(CE, Ty, DL);
}
-static Constant *ConstantFoldLoadInst(const LoadInst *LI,
- const DataLayout &DL) {
+namespace {
+
+Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout &DL) {
if (LI->isVolatile()) return nullptr;
- if (Constant *C = dyn_cast<Constant>(LI->getOperand(0)))
- return ConstantFoldLoadFromConstPtr(C, DL);
+ if (auto *C = dyn_cast<Constant>(LI->getOperand(0)))
+ return ConstantFoldLoadFromConstPtr(C, LI->getType(), DL);
return nullptr;
}
@@ -620,9 +635,8 @@ static Constant *ConstantFoldLoadInst(const LoadInst *LI,
/// Attempt to symbolically evaluate the result of a binary operator merging
/// these together. If target data info is available, it is provided as DL,
/// otherwise DL is null.
-static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
- Constant *Op1,
- const DataLayout &DL) {
+Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1,
+ const DataLayout &DL) {
// SROA
// Fold (and 0xffffffff00000000, (shl x, 32)) -> shl.
@@ -674,18 +688,16 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
/// If array indices are not pointer-sized integers, explicitly cast them so
/// that they aren't implicitly casted by the getelementptr.
-static Constant *CastGEPIndices(Type *SrcTy, ArrayRef<Constant *> Ops,
- Type *ResultTy, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef<Constant *> Ops,
+ Type *ResultTy, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
Type *IntPtrTy = DL.getIntPtrType(ResultTy);
bool Any = false;
SmallVector<Constant*, 32> NewIdxs;
for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
if ((i == 1 ||
- !isa<StructType>(GetElementPtrInst::getIndexedType(
- cast<PointerType>(Ops[0]->getType()->getScalarType())
- ->getElementType(),
+ !isa<StructType>(GetElementPtrInst::getIndexedType(SrcElemTy,
Ops.slice(1, i - 1)))) &&
Ops[i]->getType() != IntPtrTy) {
Any = true;
@@ -701,8 +713,8 @@ static Constant *CastGEPIndices(Type *SrcTy, ArrayRef<Constant *> Ops,
if (!Any)
return nullptr;
- Constant *C = ConstantExpr::getGetElementPtr(SrcTy, Ops[0], NewIdxs);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ Constant *C = ConstantExpr::getGetElementPtr(SrcElemTy, Ops[0], NewIdxs);
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
if (Constant *Folded = ConstantFoldConstantExpression(CE, DL, TLI))
C = Folded;
}
@@ -711,32 +723,41 @@ static Constant *CastGEPIndices(Type *SrcTy, ArrayRef<Constant *> Ops,
}
/// Strip the pointer casts, but preserve the address space information.
-static Constant* StripPtrCastKeepAS(Constant* Ptr) {
+Constant* StripPtrCastKeepAS(Constant* Ptr, Type *&ElemTy) {
assert(Ptr->getType()->isPointerTy() && "Not a pointer type");
- PointerType *OldPtrTy = cast<PointerType>(Ptr->getType());
+ auto *OldPtrTy = cast<PointerType>(Ptr->getType());
Ptr = Ptr->stripPointerCasts();
- PointerType *NewPtrTy = cast<PointerType>(Ptr->getType());
+ auto *NewPtrTy = cast<PointerType>(Ptr->getType());
+
+ ElemTy = NewPtrTy->getPointerElementType();
// Preserve the address space number of the pointer.
if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
- NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
- OldPtrTy->getAddressSpace());
+ NewPtrTy = ElemTy->getPointerTo(OldPtrTy->getAddressSpace());
Ptr = ConstantExpr::getPointerCast(Ptr, NewPtrTy);
}
return Ptr;
}
/// If we can symbolically evaluate the GEP constant expression, do so.
-static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
- Type *ResultTy, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
+ ArrayRef<Constant *> Ops,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ Type *SrcElemTy = GEP->getSourceElementType();
+ Type *ResElemTy = GEP->getResultElementType();
+ Type *ResTy = GEP->getType();
+ if (!SrcElemTy->isSized())
+ return nullptr;
+
+ if (Constant *C = CastGEPIndices(SrcElemTy, Ops, ResTy, DL, TLI))
+ return C;
+
Constant *Ptr = Ops[0];
- if (!Ptr->getType()->getPointerElementType()->isSized() ||
- !Ptr->getType()->isPointerTy())
+ if (!Ptr->getType()->isPointerTy())
return nullptr;
Type *IntPtrTy = DL.getIntPtrType(Ptr->getType());
- Type *ResultElementTy = ResultTy->getPointerElementType();
// If this is a constant expr gep that is effectively computing an
// "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
@@ -745,16 +766,16 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
// If this is "gep i8* Ptr, (sub 0, V)", fold this as:
// "inttoptr (sub (ptrtoint Ptr), V)"
- if (Ops.size() == 2 && ResultElementTy->isIntegerTy(8)) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
+ if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) {
+ auto *CE = dyn_cast<ConstantExpr>(Ops[1]);
assert((!CE || CE->getType() == IntPtrTy) &&
"CastGEPIndices didn't canonicalize index types!");
if (CE && CE->getOpcode() == Instruction::Sub &&
CE->getOperand(0)->isNullValue()) {
Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
Res = ConstantExpr::getSub(Res, CE->getOperand(1));
- Res = ConstantExpr::getIntToPtr(Res, ResultTy);
- if (ConstantExpr *ResCE = dyn_cast<ConstantExpr>(Res))
+ Res = ConstantExpr::getIntToPtr(Res, ResTy);
+ if (auto *ResCE = dyn_cast<ConstantExpr>(Res))
Res = ConstantFoldConstantExpression(ResCE, DL, TLI);
return Res;
}
@@ -765,19 +786,19 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
unsigned BitWidth = DL.getTypeSizeInBits(IntPtrTy);
APInt Offset =
APInt(BitWidth,
- DL.getIndexedOffset(
- Ptr->getType(),
+ DL.getIndexedOffsetInType(
+ SrcElemTy,
makeArrayRef((Value * const *)Ops.data() + 1, Ops.size() - 1)));
- Ptr = StripPtrCastKeepAS(Ptr);
+ Ptr = StripPtrCastKeepAS(Ptr, SrcElemTy);
// If this is a GEP of a GEP, fold it all into a single GEP.
- while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ while (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end());
// Do not try the incorporate the sub-GEP if some index is not a number.
bool AllConstantInt = true;
- for (unsigned i = 0, e = NestedOps.size(); i != e; ++i)
- if (!isa<ConstantInt>(NestedOps[i])) {
+ for (Value *NestedOp : NestedOps)
+ if (!isa<ConstantInt>(NestedOp)) {
AllConstantInt = false;
break;
}
@@ -785,23 +806,24 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
break;
Ptr = cast<Constant>(GEP->getOperand(0));
- Offset += APInt(BitWidth, DL.getIndexedOffset(Ptr->getType(), NestedOps));
- Ptr = StripPtrCastKeepAS(Ptr);
+ SrcElemTy = GEP->getSourceElementType();
+ Offset += APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps));
+ Ptr = StripPtrCastKeepAS(Ptr, SrcElemTy);
}
// If the base value for this address is a literal integer value, fold the
// getelementptr to the resulting integer value casted to the pointer type.
APInt BasePtr(BitWidth, 0);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (auto *CE = dyn_cast<ConstantExpr>(Ptr)) {
if (CE->getOpcode() == Instruction::IntToPtr) {
- if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
+ if (auto *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
BasePtr = Base->getValue().zextOrTrunc(BitWidth);
}
}
if (Ptr->isNullValue() || BasePtr != 0) {
Constant *C = ConstantInt::get(Ptr->getContext(), Offset + BasePtr);
- return ConstantExpr::getIntToPtr(C, ResultTy);
+ return ConstantExpr::getIntToPtr(C, ResTy);
}
// Otherwise form a regular getelementptr. Recompute the indices so that
@@ -813,39 +835,49 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
SmallVector<Constant *, 32> NewIdxs;
do {
- if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
- if (ATy->isPointerTy()) {
+ if (!Ty->isStructTy()) {
+ if (Ty->isPointerTy()) {
// The only pointer indexing we'll do is on the first index of the GEP.
if (!NewIdxs.empty())
break;
+ Ty = SrcElemTy;
+
// Only handle pointers to sized types, not pointers to functions.
- if (!ATy->getElementType()->isSized())
+ if (!Ty->isSized())
return nullptr;
+ } else if (auto *ATy = dyn_cast<SequentialType>(Ty)) {
+ Ty = ATy->getElementType();
+ } else {
+ // We've reached some non-indexable type.
+ break;
}
// Determine which element of the array the offset points into.
- APInt ElemSize(BitWidth, DL.getTypeAllocSize(ATy->getElementType()));
- if (ElemSize == 0)
+ APInt ElemSize(BitWidth, DL.getTypeAllocSize(Ty));
+ if (ElemSize == 0) {
// The element size is 0. This may be [0 x Ty]*, so just use a zero
// index for this level and proceed to the next level to see if it can
// accommodate the offset.
NewIdxs.push_back(ConstantInt::get(IntPtrTy, 0));
- else {
+ } else {
        // The element size is non-zero; divide the offset by the element
// size (rounding down), to compute the index at this level.
- APInt NewIdx = Offset.udiv(ElemSize);
+ bool Overflow;
+ APInt NewIdx = Offset.sdiv_ov(ElemSize, Overflow);
+ if (Overflow)
+ break;
Offset -= NewIdx * ElemSize;
NewIdxs.push_back(ConstantInt::get(IntPtrTy, NewIdx));
}
- Ty = ATy->getElementType();
- } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ } else {
+ auto *STy = cast<StructType>(Ty);
// If we end up with an offset that isn't valid for this struct type, we
// can't re-form this GEP in a regular form, so bail out. The pointer
// operand likely went through casts that are necessary to make the GEP
// sensible.
const StructLayout &SL = *DL.getStructLayout(STy);
- if (Offset.uge(SL.getSizeInBytes()))
+ if (Offset.isNegative() || Offset.uge(SL.getSizeInBytes()))
break;
// Determine which field of the struct the offset points into. The
@@ -856,11 +888,8 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
ElIdx));
Offset -= APInt(BitWidth, SL.getElementOffset(ElIdx));
Ty = STy->getTypeAtIndex(ElIdx);
- } else {
- // We've reached some non-indexable type.
- break;
}
- } while (Ty != ResultElementTy);
+ } while (Ty != ResElemTy);
// If we haven't used up the entire offset by descending the static
// type, then the offset is pointing into the middle of an indivisible
@@ -869,33 +898,78 @@ static Constant *SymbolicallyEvaluateGEP(Type *SrcTy, ArrayRef<Constant *> Ops,
return nullptr;
// Create a GEP.
- Constant *C = ConstantExpr::getGetElementPtr(SrcTy, Ptr, NewIdxs);
+ Constant *C = ConstantExpr::getGetElementPtr(SrcElemTy, Ptr, NewIdxs);
assert(C->getType()->getPointerElementType() == Ty &&
"Computed GetElementPtr has unexpected type!");
// If we ended up indexing a member with a type that doesn't match
// the type of what the original indices indexed, add a cast.
- if (Ty != ResultElementTy)
- C = FoldBitCast(C, ResultTy, DL);
+ if (Ty != ResElemTy)
+ C = FoldBitCast(C, ResTy, DL);
return C;
}
+/// Attempt to constant fold an instruction with the
+/// specified opcode and operands. If successful, the constant result is
+/// returned; if not, null is returned. Note that this function can fail when
+/// attempting to fold instructions like loads and stores, which have no
+/// constant expression form.
+///
+/// TODO: This function neither utilizes nor preserves nsw/nuw/inbounds/etc
+/// information, due to only being passed an opcode and operands. Constant
+/// folding using this function strips this information.
+///
+Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, Type *DestTy,
+ unsigned Opcode,
+ ArrayRef<Constant *> Ops,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ // Handle easy binops first.
+ if (Instruction::isBinaryOp(Opcode))
+ return ConstantFoldBinaryOpOperands(Opcode, Ops[0], Ops[1], DL);
+
+ if (Instruction::isCast(Opcode))
+ return ConstantFoldCastOperand(Opcode, Ops[0], DestTy, DL);
+
+ if (auto *GEP = dyn_cast<GEPOperator>(InstOrCE)) {
+ if (Constant *C = SymbolicallyEvaluateGEP(GEP, Ops, DL, TLI))
+ return C;
+
+ return ConstantExpr::getGetElementPtr(GEP->getSourceElementType(),
+ Ops[0], Ops.slice(1));
+ }
+
+ switch (Opcode) {
+ default: return nullptr;
+ case Instruction::ICmp:
+ case Instruction::FCmp: llvm_unreachable("Invalid for compares");
+ case Instruction::Call:
+ if (auto *F = dyn_cast<Function>(Ops.back()))
+ if (canConstantFoldCallTo(F))
+ return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1), TLI);
+ return nullptr;
+ case Instruction::Select:
+ return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ExtractElement:
+ return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+ case Instruction::InsertElement:
+ return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ShuffleVector:
+ return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+ }
+}
+} // end anonymous namespace
//===----------------------------------------------------------------------===//
// Constant Folding public APIs
//===----------------------------------------------------------------------===//
-/// Try to constant fold the specified instruction.
-/// If successful, the constant result is returned, if not, null is returned.
-/// Note that this fails if not all of the operands are constant. Otherwise,
-/// this function can only fail when attempting to fold instructions like loads
-/// and stores, which have no constant expression form.
Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
// Handle PHI nodes quickly here...
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (auto *PN = dyn_cast<PHINode>(I)) {
Constant *CommonValue = nullptr;
for (Value *Incoming : PN->incoming_values()) {
@@ -906,11 +980,11 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL,
if (isa<UndefValue>(Incoming))
continue;
// If the incoming value is not a constant, then give up.
- Constant *C = dyn_cast<Constant>(Incoming);
+ auto *C = dyn_cast<Constant>(Incoming);
if (!C)
return nullptr;
// Fold the PHI's operands.
- if (ConstantExpr *NewC = dyn_cast<ConstantExpr>(C))
+ if (auto *NewC = dyn_cast<ConstantExpr>(C))
C = ConstantFoldConstantExpression(NewC, DL, TLI);
// If the incoming value is a different constant to
// the one we saw previously, then give up.
@@ -925,54 +999,55 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL,
}
// Scan the operand list, checking to see if they are all constants, if so,
- // hand off to ConstantFoldInstOperands.
- SmallVector<Constant*, 8> Ops;
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
- Constant *Op = dyn_cast<Constant>(*i);
- if (!Op)
- return nullptr; // All operands not constant!
+ // hand off to ConstantFoldInstOperandsImpl.
+ if (!all_of(I->operands(), [](Use &U) { return isa<Constant>(U); }))
+ return nullptr;
+ SmallVector<Constant *, 8> Ops;
+ for (const Use &OpU : I->operands()) {
+ auto *Op = cast<Constant>(&OpU);
// Fold the Instruction's operands.
- if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(Op))
+ if (auto *NewCE = dyn_cast<ConstantExpr>(Op))
Op = ConstantFoldConstantExpression(NewCE, DL, TLI);
Ops.push_back(Op);
}
- if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (const auto *CI = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
DL, TLI);
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ if (const auto *LI = dyn_cast<LoadInst>(I))
return ConstantFoldLoadInst(LI, DL);
- if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I)) {
+ if (auto *IVI = dyn_cast<InsertValueInst>(I)) {
return ConstantExpr::getInsertValue(
cast<Constant>(IVI->getAggregateOperand()),
cast<Constant>(IVI->getInsertedValueOperand()),
IVI->getIndices());
}
- if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I)) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(I)) {
return ConstantExpr::getExtractValue(
cast<Constant>(EVI->getAggregateOperand()),
EVI->getIndices());
}
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, DL, TLI);
+ return ConstantFoldInstOperands(I, Ops, DL, TLI);
}
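// A minimal usage sketch (not part of this patch) of the entry point above:
// fold every fully-constant instruction in a function. The helper name
// foldTriviallyConstantInsts is hypothetical, not an LLVM API.
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"

static bool foldTriviallyConstantInsts(llvm::Function &F,
                                       const llvm::TargetLibraryInfo *TLI) {
  using namespace llvm;
  const DataLayout &DL = F.getParent()->getDataLayout();
  bool Changed = false;
  for (Instruction &I : instructions(F)) {
    // Returns null unless every operand is constant (or the PHI folds).
    if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) {
      I.replaceAllUsesWith(C);
      Changed = true;
    }
  }
  return Changed;
}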
-static Constant *
+namespace {
+
+Constant *
ConstantFoldConstantExpressionImpl(const ConstantExpr *CE, const DataLayout &DL,
const TargetLibraryInfo *TLI,
SmallPtrSetImpl<ConstantExpr *> &FoldedOps) {
SmallVector<Constant *, 8> Ops;
- for (User::const_op_iterator i = CE->op_begin(), e = CE->op_end(); i != e;
- ++i) {
- Constant *NewC = cast<Constant>(*i);
+ for (const Use &NewU : CE->operands()) {
+ auto *NewC = cast<Constant>(&NewU);
// Recursively fold the ConstantExpr's operands. If we have already folded
// a ConstantExpr, we don't have to process it again.
- if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(NewC)) {
+ if (auto *NewCE = dyn_cast<ConstantExpr>(NewC)) {
if (FoldedOps.insert(NewCE).second)
NewC = ConstantFoldConstantExpressionImpl(NewCE, DL, TLI, FoldedOps);
}
@@ -982,12 +1057,13 @@ ConstantFoldConstantExpressionImpl(const ConstantExpr *CE, const DataLayout &DL,
if (CE->isCompare())
return ConstantFoldCompareInstOperands(CE->getPredicate(), Ops[0], Ops[1],
DL, TLI);
- return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(), Ops, DL, TLI);
+
+ return ConstantFoldInstOperandsImpl(CE, CE->getType(), CE->getOpcode(), Ops,
+ DL, TLI);
}
-/// Attempt to fold the constant expression
-/// using the specified DataLayout. If successful, the constant result is
-/// result is returned, if not, null is returned.
+} // end anonymous namespace
+
Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
@@ -995,114 +1071,22 @@ Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
return ConstantFoldConstantExpressionImpl(CE, DL, TLI, FoldedOps);
}
-/// Attempt to constant fold an instruction with the
-/// specified opcode and operands. If successful, the constant result is
-/// returned, if not, null is returned. Note that this function can fail when
-/// attempting to fold instructions like loads and stores, which have no
-/// constant expression form.
-///
-/// TODO: This function neither utilizes nor preserves nsw/nuw/inbounds/etc
-/// information, due to only being passed an opcode and operands. Constant
-/// folding using this function strips this information.
-///
-Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
+Constant *llvm::ConstantFoldInstOperands(Instruction *I,
ArrayRef<Constant *> Ops,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- // Handle easy binops first.
- if (Instruction::isBinaryOp(Opcode)) {
- if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1])) {
- if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], DL))
- return C;
- }
-
- return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
- }
-
- switch (Opcode) {
- default: return nullptr;
- case Instruction::ICmp:
- case Instruction::FCmp: llvm_unreachable("Invalid for compares");
- case Instruction::Call:
- if (Function *F = dyn_cast<Function>(Ops.back()))
- if (canConstantFoldCallTo(F))
- return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1), TLI);
- return nullptr;
- case Instruction::PtrToInt:
- // If the input is a inttoptr, eliminate the pair. This requires knowing
- // the width of a pointer, so it can't be done in ConstantExpr::getCast.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
- if (CE->getOpcode() == Instruction::IntToPtr) {
- Constant *Input = CE->getOperand(0);
- unsigned InWidth = Input->getType()->getScalarSizeInBits();
- unsigned PtrWidth = DL.getPointerTypeSizeInBits(CE->getType());
- if (PtrWidth < InWidth) {
- Constant *Mask =
- ConstantInt::get(CE->getContext(),
- APInt::getLowBitsSet(InWidth, PtrWidth));
- Input = ConstantExpr::getAnd(Input, Mask);
- }
- // Do a zext or trunc to get to the dest size.
- return ConstantExpr::getIntegerCast(Input, DestTy, false);
- }
- }
- return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
- case Instruction::IntToPtr:
- // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
- // the int size is >= the ptr size and the address spaces are the same.
- // This requires knowing the width of a pointer, so it can't be done in
- // ConstantExpr::getCast.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
- if (CE->getOpcode() == Instruction::PtrToInt) {
- Constant *SrcPtr = CE->getOperand(0);
- unsigned SrcPtrSize = DL.getPointerTypeSizeInBits(SrcPtr->getType());
- unsigned MidIntSize = CE->getType()->getScalarSizeInBits();
-
- if (MidIntSize >= SrcPtrSize) {
- unsigned SrcAS = SrcPtr->getType()->getPointerAddressSpace();
- if (SrcAS == DestTy->getPointerAddressSpace())
- return FoldBitCast(CE->getOperand(0), DestTy, DL);
- }
- }
- }
-
- return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::AddrSpaceCast:
- return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
- case Instruction::BitCast:
- return FoldBitCast(Ops[0], DestTy, DL);
- case Instruction::Select:
- return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
- case Instruction::ExtractElement:
- return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
- case Instruction::InsertElement:
- return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
- case Instruction::ShuffleVector:
- return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
- case Instruction::GetElementPtr: {
- Type *SrcTy = nullptr;
- if (Constant *C = CastGEPIndices(SrcTy, Ops, DestTy, DL, TLI))
- return C;
- if (Constant *C = SymbolicallyEvaluateGEP(SrcTy, Ops, DestTy, DL, TLI))
- return C;
+ return ConstantFoldInstOperandsImpl(I, I->getType(), I->getOpcode(), Ops, DL,
+ TLI);
+}
- return ConstantExpr::getGetElementPtr(SrcTy, Ops[0], Ops.slice(1));
- }
- }
+Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
+ ArrayRef<Constant *> Ops,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ assert(Opcode != Instruction::GetElementPtr && "Invalid for GEPs");
+ return ConstantFoldInstOperandsImpl(nullptr, DestTy, Opcode, Ops, DL, TLI);
}
-/// Attempt to constant fold a compare
-/// instruction (icmp/fcmp) with the specified operands. If it fails, it
-/// returns a constant expression of the specified operands.
Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Constant *Ops0, Constant *Ops1,
const DataLayout &DL,
@@ -1115,7 +1099,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
  // FIXME: The following comment is out of date; the DataLayout is available here now.
// ConstantExpr::getCompare cannot do this, because it doesn't have DL
// around to know if bit truncation is happening.
- if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
+ if (auto *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
if (Ops1->isNullValue()) {
if (CE0->getOpcode() == Instruction::IntToPtr) {
Type *IntPtrTy = DL.getIntPtrType(CE0->getType());
@@ -1139,7 +1123,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
}
}
- if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
+ if (auto *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
if (CE0->getOpcode() == CE1->getOpcode()) {
if (CE0->getOpcode() == Instruction::IntToPtr) {
Type *IntPtrTy = DL.getIntPtrType(CE0->getType());
@@ -1176,18 +1160,85 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Predicate, CE0->getOperand(1), Ops1, DL, TLI);
unsigned OpC =
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
- Constant *Ops[] = { LHS, RHS };
- return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, DL, TLI);
+ return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL);
}
}
return ConstantExpr::getCompare(Predicate, Ops0, Ops1);
}
+Constant *llvm::ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS,
+ Constant *RHS,
+ const DataLayout &DL) {
+ assert(Instruction::isBinaryOp(Opcode));
+ if (isa<ConstantExpr>(LHS) || isa<ConstantExpr>(RHS))
+ if (Constant *C = SymbolicallyEvaluateBinop(Opcode, LHS, RHS, DL))
+ return C;
+
+ return ConstantExpr::get(Opcode, LHS, RHS);
+}
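// A small illustration, assumed rather than taken from the patch: the new
// ConstantFoldBinaryOpOperands entry point folding a plain `add i32 2, 3`.
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

static llvm::Constant *foldAddExample(llvm::LLVMContext &Ctx,
                                      const llvm::DataLayout &DL) {
  using namespace llvm;
  Constant *LHS = ConstantInt::get(Type::getInt32Ty(Ctx), 2);
  Constant *RHS = ConstantInt::get(Type::getInt32Ty(Ctx), 3);
  // Neither operand is a ConstantExpr, so this goes straight to
  // ConstantExpr::get and yields the ConstantInt 5; ConstantExpr operands
  // would take the SymbolicallyEvaluateBinop path first.
  return ConstantFoldBinaryOpOperands(Instruction::Add, LHS, RHS, DL);
}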
+
+Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
+ Type *DestTy, const DataLayout &DL) {
+ assert(Instruction::isCast(Opcode));
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Missing case");
+ case Instruction::PtrToInt:
+    // If the input is an inttoptr, eliminate the pair. This requires knowing
+ // the width of a pointer, so it can't be done in ConstantExpr::getCast.
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+ if (CE->getOpcode() == Instruction::IntToPtr) {
+ Constant *Input = CE->getOperand(0);
+ unsigned InWidth = Input->getType()->getScalarSizeInBits();
+ unsigned PtrWidth = DL.getPointerTypeSizeInBits(CE->getType());
+ if (PtrWidth < InWidth) {
+ Constant *Mask =
+ ConstantInt::get(CE->getContext(),
+ APInt::getLowBitsSet(InWidth, PtrWidth));
+ Input = ConstantExpr::getAnd(Input, Mask);
+ }
+ // Do a zext or trunc to get to the dest size.
+ return ConstantExpr::getIntegerCast(Input, DestTy, false);
+ }
+ }
+ return ConstantExpr::getCast(Opcode, C, DestTy);
+ case Instruction::IntToPtr:
+ // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
+ // the int size is >= the ptr size and the address spaces are the same.
+ // This requires knowing the width of a pointer, so it can't be done in
+ // ConstantExpr::getCast.
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+ if (CE->getOpcode() == Instruction::PtrToInt) {
+ Constant *SrcPtr = CE->getOperand(0);
+ unsigned SrcPtrSize = DL.getPointerTypeSizeInBits(SrcPtr->getType());
+ unsigned MidIntSize = CE->getType()->getScalarSizeInBits();
+
+ if (MidIntSize >= SrcPtrSize) {
+ unsigned SrcAS = SrcPtr->getType()->getPointerAddressSpace();
+ if (SrcAS == DestTy->getPointerAddressSpace())
+ return FoldBitCast(CE->getOperand(0), DestTy, DL);
+ }
+ }
+ }
+
+ return ConstantExpr::getCast(Opcode, C, DestTy);
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::AddrSpaceCast:
+ return ConstantExpr::getCast(Opcode, C, DestTy);
+ case Instruction::BitCast:
+ return FoldBitCast(C, DestTy, DL);
+ }
+}
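// Another assumed usage sketch (not from the patch): the ptrtoint(inttoptr X)
// round trip that the PtrToInt case above collapses using the DataLayout.
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

static llvm::Constant *foldPtrIntRoundTrip(llvm::LLVMContext &Ctx,
                                           const llvm::DataLayout &DL) {
  using namespace llvm;
  Type *I64 = Type::getInt64Ty(Ctx);
  Constant *X = ConstantInt::get(I64, 0xdeadbeef);
  Constant *AsPtr = ConstantExpr::getIntToPtr(X, Type::getInt8PtrTy(Ctx));
  // With 64-bit pointers this folds back to X; if pointers were narrower
  // than i64, the low pointer-width bits would be masked off first.
  return ConstantFoldCastOperand(Instruction::PtrToInt, AsPtr, I64, DL);
}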
-/// Given a constant and a getelementptr constantexpr, return the constant value
-/// being addressed by the constant expression, or null if something is funny
-/// and we can't decide.
Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
ConstantExpr *CE) {
if (!CE->getOperand(1)->isNullValue())
@@ -1203,27 +1254,23 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
return C;
}
-/// Given a constant and getelementptr indices (with an *implied* zero pointer
-/// index that is not in the list), return the constant value being addressed by
-/// a virtual load, or null if something is funny and we can't decide.
-Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
- ArrayRef<Constant*> Indices) {
+Constant *
+llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
+ ArrayRef<Constant *> Indices) {
// Loop over all of the operands, tracking down which value we are
// addressing.
- for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
- C = C->getAggregateElement(Indices[i]);
+ for (Constant *Index : Indices) {
+ C = C->getAggregateElement(Index);
if (!C)
return nullptr;
}
return C;
}
-
//===----------------------------------------------------------------------===//
// Constant Folding for Calls
//
-/// Return true if it's even possible to fold a call to the specified function.
bool llvm::canConstantFoldCallTo(const Function *F) {
switch (F->getIntrinsicID()) {
case Intrinsic::fabs:
@@ -1252,6 +1299,7 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
case Intrinsic::fmuladd:
case Intrinsic::copysign:
case Intrinsic::round:
+ case Intrinsic::masked_load:
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
@@ -1260,6 +1308,7 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
case Intrinsic::umul_with_overflow:
case Intrinsic::convert_from_fp16:
case Intrinsic::convert_to_fp16:
+ case Intrinsic::bitreverse:
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvttss2si:
@@ -1309,7 +1358,9 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
}
}
-static Constant *GetConstantFoldFPValue(double V, Type *Ty) {
+namespace {
+
+Constant *GetConstantFoldFPValue(double V, Type *Ty) {
if (Ty->isHalfTy()) {
APFloat APF(V);
bool unused;
@@ -1321,12 +1372,10 @@ static Constant *GetConstantFoldFPValue(double V, Type *Ty) {
if (Ty->isDoubleTy())
return ConstantFP::get(Ty->getContext(), APFloat(V));
llvm_unreachable("Can only constant fold half/float/double");
-
}
-namespace {
/// Clear the floating-point exception state.
-static inline void llvm_fenv_clearexcept() {
+inline void llvm_fenv_clearexcept() {
#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT
feclearexcept(FE_ALL_EXCEPT);
#endif
@@ -1334,7 +1383,7 @@ static inline void llvm_fenv_clearexcept() {
}
/// Test if a floating-point exception was raised.
-static inline bool llvm_fenv_testexcept() {
+inline bool llvm_fenv_testexcept() {
int errno_val = errno;
if (errno_val == ERANGE || errno_val == EDOM)
return true;
@@ -1344,10 +1393,8 @@ static inline bool llvm_fenv_testexcept() {
#endif
return false;
}
-} // End namespace
-static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
- Type *Ty) {
+Constant *ConstantFoldFP(double (*NativeFP)(double), double V, Type *Ty) {
llvm_fenv_clearexcept();
V = NativeFP(V);
if (llvm_fenv_testexcept()) {
@@ -1358,8 +1405,8 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
return GetConstantFoldFPValue(V, Ty);
}
-static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
- double V, double W, Type *Ty) {
+Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
+ double W, Type *Ty) {
llvm_fenv_clearexcept();
V = NativeFP(V, W);
if (llvm_fenv_testexcept()) {
@@ -1377,8 +1424,8 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
/// integer type Ty is used to select how many bits are available for the
/// result. Returns null if the conversion cannot be performed, otherwise
/// returns the Constant value resulting from the conversion.
-static Constant *ConstantFoldConvertToInt(const APFloat &Val,
- bool roundTowardZero, Type *Ty) {
+Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
+ Type *Ty) {
// All of these conversion intrinsics form an integer of at most 64bits.
unsigned ResultWidth = Ty->getIntegerBitWidth();
assert(ResultWidth <= 64 &&
@@ -1396,7 +1443,7 @@ static Constant *ConstantFoldConvertToInt(const APFloat &Val,
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
}
-static double getValueAsDouble(ConstantFP *Op) {
+double getValueAsDouble(ConstantFP *Op) {
Type *Ty = Op->getType();
if (Ty->isFloatTy())
@@ -1411,11 +1458,16 @@ static double getValueAsDouble(ConstantFP *Op) {
return APF.convertToDouble();
}
-static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
- Type *Ty, ArrayRef<Constant *> Operands,
- const TargetLibraryInfo *TLI) {
+Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
+ ArrayRef<Constant *> Operands,
+ const TargetLibraryInfo *TLI) {
if (Operands.size() == 1) {
- if (ConstantFP *Op = dyn_cast<ConstantFP>(Operands[0])) {
+ if (isa<UndefValue>(Operands[0])) {
+ // cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN
+ if (IntrinsicID == Intrinsic::cos)
+ return Constant::getNullValue(Ty);
+ }
+ if (auto *Op = dyn_cast<ConstantFP>(Operands[0])) {
if (IntrinsicID == Intrinsic::convert_to_fp16) {
APFloat Val(Op->getValueAPF());
@@ -1586,12 +1638,14 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
return nullptr;
}
- if (ConstantInt *Op = dyn_cast<ConstantInt>(Operands[0])) {
+ if (auto *Op = dyn_cast<ConstantInt>(Operands[0])) {
switch (IntrinsicID) {
case Intrinsic::bswap:
return ConstantInt::get(Ty->getContext(), Op->getValue().byteSwap());
case Intrinsic::ctpop:
return ConstantInt::get(Ty, Op->getValue().countPopulation());
+ case Intrinsic::bitreverse:
+ return ConstantInt::get(Ty->getContext(), Op->getValue().reverseBits());
case Intrinsic::convert_from_fp16: {
APFloat Val(APFloat::IEEEhalf, Op->getValue());
@@ -1614,7 +1668,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
// Support ConstantVector in case we have an Undef in the top.
if (isa<ConstantVector>(Operands[0]) ||
isa<ConstantDataVector>(Operands[0])) {
- Constant *Op = cast<Constant>(Operands[0]);
+ auto *Op = cast<Constant>(Operands[0]);
switch (IntrinsicID) {
default: break;
case Intrinsic::x86_sse_cvtss2si:
@@ -1646,12 +1700,12 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
}
if (Operands.size() == 2) {
- if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
+ if (auto *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
return nullptr;
double Op1V = getValueAsDouble(Op1);
- if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
+ if (auto *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
if (Op2->getType() != Op1->getType())
return nullptr;
@@ -1661,7 +1715,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
}
if (IntrinsicID == Intrinsic::copysign) {
APFloat V1 = Op1->getValueAPF();
- APFloat V2 = Op2->getValueAPF();
+ const APFloat &V2 = Op2->getValueAPF();
V1.copySign(V2);
return ConstantFP::get(Ty->getContext(), V1);
}
@@ -1689,7 +1743,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
if ((Name == "atan2" && TLI->has(LibFunc::atan2)) ||
(Name == "atan2f" && TLI->has(LibFunc::atan2f)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
- } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
+ } else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
return ConstantFP::get(Ty->getContext(),
APFloat((float)std::pow((float)Op1V,
@@ -1706,8 +1760,8 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
return nullptr;
}
- if (ConstantInt *Op1 = dyn_cast<ConstantInt>(Operands[0])) {
- if (ConstantInt *Op2 = dyn_cast<ConstantInt>(Operands[1])) {
+ if (auto *Op1 = dyn_cast<ConstantInt>(Operands[0])) {
+ if (auto *Op2 = dyn_cast<ConstantInt>(Operands[1])) {
switch (IntrinsicID) {
default: break;
case Intrinsic::sadd_with_overflow:
@@ -1764,9 +1818,9 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
if (Operands.size() != 3)
return nullptr;
- if (const ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
- if (const ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
- if (const ConstantFP *Op3 = dyn_cast<ConstantFP>(Operands[2])) {
+ if (const auto *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
+ if (const auto *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
+ if (const auto *Op3 = dyn_cast<ConstantFP>(Operands[2])) {
switch (IntrinsicID) {
default: break;
case Intrinsic::fma:
@@ -1788,14 +1842,53 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
return nullptr;
}
-static Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
- VectorType *VTy,
- ArrayRef<Constant *> Operands,
- const TargetLibraryInfo *TLI) {
+Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
+ VectorType *VTy, ArrayRef<Constant *> Operands,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
SmallVector<Constant *, 4> Result(VTy->getNumElements());
SmallVector<Constant *, 4> Lane(Operands.size());
Type *Ty = VTy->getElementType();
+ if (IntrinsicID == Intrinsic::masked_load) {
+ auto *SrcPtr = Operands[0];
+ auto *Mask = Operands[2];
+ auto *Passthru = Operands[3];
+
+ Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, VTy, DL);
+
+ SmallVector<Constant *, 32> NewElements;
+ for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
+ auto *MaskElt = Mask->getAggregateElement(I);
+ if (!MaskElt)
+ break;
+ auto *PassthruElt = Passthru->getAggregateElement(I);
+ auto *VecElt = VecData ? VecData->getAggregateElement(I) : nullptr;
+      if (isa<UndefValue>(MaskElt)) {
+        if (PassthruElt)
+          NewElements.push_back(PassthruElt);
+        else if (VecElt)
+          NewElements.push_back(VecElt);
+        else
+          return nullptr;
+        // This lane has been handled; without this continue an undef mask
+        // element would fall through to the 0/1 checks below and reject the
+        // whole fold.
+        continue;
+      }
+ if (MaskElt->isNullValue()) {
+ if (!PassthruElt)
+ return nullptr;
+ NewElements.push_back(PassthruElt);
+ } else if (MaskElt->isOneValue()) {
+ if (!VecElt)
+ return nullptr;
+ NewElements.push_back(VecElt);
+ } else {
+ return nullptr;
+ }
+ }
+ if (NewElements.size() != VTy->getNumElements())
+ return nullptr;
+ return ConstantVector::get(NewElements);
+ }
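  // Illustrative example (editorial, not from the patch): loading <4 x i32>
  // <1, 2, 3, 4> from a constant global with mask <i1 1, i1 0, i1 undef, i1 1>
  // and passthru <4 x i32> <9, 9, 9, 9> folds to <1, 9, 9, 4>: true lanes take
  // the loaded element, false lanes take the passthru element, and undef
  // lanes prefer the passthru element when one is available.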
+
for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
// Gather a column of constants.
for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
@@ -1816,8 +1909,8 @@ static Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
return ConstantVector::get(Result);
}
-/// Attempt to constant fold a call to the specified function
-/// with the specified arguments, returning null if unsuccessful.
+} // end anonymous namespace
+
Constant *
llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
const TargetLibraryInfo *TLI) {
@@ -1827,8 +1920,9 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
Type *Ty = F->getReturnType();
- if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands, TLI);
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands,
+ F->getParent()->getDataLayout(), TLI);
return ConstantFoldScalarCall(Name, F->getIntrinsicID(), Ty, Operands, TLI);
}
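// A hedged usage sketch (not part of this patch): folding a call to the newly
// supported llvm.bitreverse intrinsic on a constant argument.
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

static llvm::Constant *foldBitReverseExample(llvm::Module &M) {
  using namespace llvm;
  Type *I32 = Type::getInt32Ty(M.getContext());
  Function *F = Intrinsic::getDeclaration(&M, Intrinsic::bitreverse, {I32});
  Constant *Arg = ConstantInt::get(I32, 1);
  // ConstantFoldScalarCall reverses the bits: 0x00000001 becomes 0x80000000.
  return ConstantFoldCall(F, {Arg}, /*TLI=*/nullptr);
}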
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 0383cbfbbe4c..68a4bea96baa 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -504,8 +504,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
Args.push_back(II->getArgOperand(J));
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+ FMF = FPMO->getFastMathFlags();
+
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
- Args);
+ Args, FMF);
}
return -1;
default:
@@ -518,16 +522,15 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const {
if (!F)
return;
- for (Function::iterator B = F->begin(), BE = F->end(); B != BE; ++B) {
- for (BasicBlock::iterator it = B->begin(), e = B->end(); it != e; ++it) {
- Instruction *Inst = &*it;
- unsigned Cost = getInstructionCost(Inst);
+ for (BasicBlock &B : *F) {
+ for (Instruction &Inst : B) {
+ unsigned Cost = getInstructionCost(&Inst);
if (Cost != (unsigned)-1)
OS << "Cost Model: Found an estimated cost of " << Cost;
else
OS << "Cost Model: Unknown cost";
- OS << " for instruction: "<< *Inst << "\n";
+ OS << " for instruction: " << Inst << "\n";
}
}
}
diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp
index baee8b3b084b..dd5af9d43ef8 100644
--- a/lib/Analysis/Delinearization.cpp
+++ b/lib/Analysis/Delinearization.cpp
@@ -14,11 +14,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Constants.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
@@ -26,7 +26,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 6f92ba6289a4..a3f8b7fda08a 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -20,8 +20,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -44,25 +42,29 @@ using namespace llvm;
#define DEBUG_TYPE "demanded-bits"
-char DemandedBits::ID = 0;
-INITIALIZE_PASS_BEGIN(DemandedBits, "demanded-bits", "Demanded bits analysis",
- false, false)
+char DemandedBitsWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DemandedBitsWrapperPass, "demanded-bits",
+ "Demanded bits analysis", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DemandedBits, "demanded-bits", "Demanded bits analysis",
- false, false)
+INITIALIZE_PASS_END(DemandedBitsWrapperPass, "demanded-bits",
+ "Demanded bits analysis", false, false)
-DemandedBits::DemandedBits() : FunctionPass(ID), F(nullptr), Analyzed(false) {
- initializeDemandedBitsPass(*PassRegistry::getPassRegistry());
+DemandedBitsWrapperPass::DemandedBitsWrapperPass() : FunctionPass(ID) {
+ initializeDemandedBitsWrapperPassPass(*PassRegistry::getPassRegistry());
}
-void DemandedBits::getAnalysisUsage(AnalysisUsage &AU) const {
+void DemandedBitsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesAll();
}
+void DemandedBitsWrapperPass::print(raw_ostream &OS, const Module *M) const {
+ DB->print(OS);
+}
+
static bool isAlwaysLive(Instruction *I) {
return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
I->isEHPad() || I->mayHaveSideEffects();
@@ -86,13 +88,13 @@ void DemandedBits::determineLiveOperandBits(
KnownZero = APInt(BitWidth, 0);
KnownOne = APInt(BitWidth, 0);
computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0,
- AC, UserI, DT);
+ &AC, UserI, &DT);
if (V2) {
KnownZero2 = APInt(BitWidth, 0);
KnownOne2 = APInt(BitWidth, 0);
computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL,
- 0, AC, UserI, DT);
+ 0, &AC, UserI, &DT);
}
};
@@ -245,19 +247,22 @@ void DemandedBits::determineLiveOperandBits(
}
}
-bool DemandedBits::runOnFunction(Function& Fn) {
- F = &Fn;
- Analyzed = false;
+bool DemandedBitsWrapperPass::runOnFunction(Function &F) {
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DB.emplace(F, AC, DT);
return false;
}
+void DemandedBitsWrapperPass::releaseMemory() {
+ DB.reset();
+}
+
void DemandedBits::performAnalysis() {
if (Analyzed)
// Analysis already completed for this function.
return;
Analyzed = true;
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
Visited.clear();
AliveBits.clear();
@@ -265,7 +270,7 @@ void DemandedBits::performAnalysis() {
SmallVector<Instruction*, 128> Worklist;
// Collect the set of "root" instructions that are known live.
- for (Instruction &I : instructions(*F)) {
+ for (Instruction &I : instructions(F)) {
if (!isAlwaysLive(&I))
continue;
@@ -370,16 +375,29 @@ bool DemandedBits::isInstructionDead(Instruction *I) {
!isAlwaysLive(I);
}
-void DemandedBits::print(raw_ostream &OS, const Module *M) const {
- // This is gross. But the alternative is making all the state mutable
- // just because of this one debugging method.
- const_cast<DemandedBits*>(this)->performAnalysis();
+void DemandedBits::print(raw_ostream &OS) {
+ performAnalysis();
for (auto &KV : AliveBits) {
OS << "DemandedBits: 0x" << utohexstr(KV.second.getLimitedValue()) << " for "
<< *KV.first << "\n";
}
}
-FunctionPass *llvm::createDemandedBitsPass() {
- return new DemandedBits();
+FunctionPass *llvm::createDemandedBitsWrapperPass() {
+ return new DemandedBitsWrapperPass();
+}
+
+char DemandedBitsAnalysis::PassID;
+
+DemandedBits DemandedBitsAnalysis::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ return DemandedBits(F, AC, DT);
+}
+
+PreservedAnalyses DemandedBitsPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AM.getResult<DemandedBitsAnalysis>(F).print(OS);
+ return PreservedAnalyses::all();
}
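// A minimal sketch under stated assumptions (the pass name DeadBitReportPass
// is hypothetical; not part of this patch) showing how new-PM code can now
// obtain DemandedBits from the FunctionAnalysisManager instead of a
// FunctionPass.
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

namespace {
struct DeadBitReportPass : llvm::PassInfoMixin<DeadBitReportPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &AM) {
    using namespace llvm;
    DemandedBits &DB = AM.getResult<DemandedBitsAnalysis>(F);
    for (Instruction &I : instructions(F))
      if (DB.isInstructionDead(&I))
        errs() << "dead: " << I << "\n";
    return PreservedAnalyses::all();
  }
};
} // end anonymous namespace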
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index 4040ad3cacd5..eb4d925fea73 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -114,36 +114,43 @@ Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
//===----------------------------------------------------------------------===//
// basics
-INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da",
+DependenceAnalysis::Result
+DependenceAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &AA = FAM.getResult<AAManager>(F);
+ auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = FAM.getResult<LoopAnalysis>(F);
+ return DependenceInfo(&F, &AA, &SE, &LI);
+}
+
+char DependenceAnalysis::PassID;
+
+INITIALIZE_PASS_BEGIN(DependenceAnalysisWrapperPass, "da",
"Dependence Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(DependenceAnalysis, "da",
- "Dependence Analysis", true, true)
+INITIALIZE_PASS_END(DependenceAnalysisWrapperPass, "da", "Dependence Analysis",
+ true, true)
-char DependenceAnalysis::ID = 0;
+char DependenceAnalysisWrapperPass::ID = 0;
-
-FunctionPass *llvm::createDependenceAnalysisPass() {
- return new DependenceAnalysis();
+FunctionPass *llvm::createDependenceAnalysisWrapperPass() {
+ return new DependenceAnalysisWrapperPass();
}
-
-bool DependenceAnalysis::runOnFunction(Function &F) {
- this->F = &F;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+bool DependenceAnalysisWrapperPass::runOnFunction(Function &F) {
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ info.reset(new DependenceInfo(&F, &AA, &SE, &LI));
return false;
}
+DependenceInfo &DependenceAnalysisWrapperPass::getDI() const { return *info; }
-void DependenceAnalysis::releaseMemory() {
-}
-
+void DependenceAnalysisWrapperPass::releaseMemory() { info.reset(); }
-void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+void DependenceAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequiredTransitive<AAResultsWrapperPass>();
AU.addRequiredTransitive<ScalarEvolutionWrapperPass>();
@@ -155,11 +162,10 @@ void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
// Looks through the function, noting loads and stores.
// Calls depends() on every possible pair and prints out the result.
// Ignores all other instructions.
-static
-void dumpExampleDependence(raw_ostream &OS, Function *F,
- DependenceAnalysis *DA) {
- for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F);
- SrcI != SrcE; ++SrcI) {
+static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA) {
+ auto *F = DA->getFunction();
+ for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE;
+ ++SrcI) {
if (isa<StoreInst>(*SrcI) || isa<LoadInst>(*SrcI)) {
for (inst_iterator DstI = SrcI, DstE = inst_end(F);
DstI != DstE; ++DstI) {
@@ -183,9 +189,9 @@ void dumpExampleDependence(raw_ostream &OS, Function *F,
}
}
-
-void DependenceAnalysis::print(raw_ostream &OS, const Module*) const {
- dumpExampleDependence(OS, F, const_cast<DependenceAnalysis *>(this));
+void DependenceAnalysisWrapperPass::print(raw_ostream &OS,
+ const Module *) const {
+ dumpExampleDependence(OS, info.get());
}
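// A usage sketch under stated assumptions (the depends() signature below is
// recalled from DependenceAnalysis.h and is not shown in this patch):
// querying the renamed DependenceInfo for a pair of memory instructions, as
// the wrapper pass above now exposes it through getDI().
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/IR/Instruction.h"
#include <memory>

static bool mayDepend(llvm::DependenceInfo &DI, llvm::Instruction *Src,
                      llvm::Instruction *Dst) {
  // depends() returns null when the pair is proven independent; any non-null
  // Dependence (including a "confused" one) must be treated conservatively.
  std::unique_ptr<llvm::Dependence> D =
      DI.depends(Src, Dst, /*PossiblyLoopIndependent=*/true);
  return D != nullptr;
}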
//===----------------------------------------------------------------------===//
@@ -286,11 +292,11 @@ bool FullDependence::isSplitable(unsigned Level) const {
//===----------------------------------------------------------------------===//
-// DependenceAnalysis::Constraint methods
+// DependenceInfo::Constraint methods
// If constraint is a point <X, Y>, returns X.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getX() const {
+const SCEV *DependenceInfo::Constraint::getX() const {
assert(Kind == Point && "Kind should be Point");
return A;
}
@@ -298,7 +304,7 @@ const SCEV *DependenceAnalysis::Constraint::getX() const {
// If constraint is a point <X, Y>, returns Y.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getY() const {
+const SCEV *DependenceInfo::Constraint::getY() const {
assert(Kind == Point && "Kind should be Point");
return B;
}
@@ -306,7 +312,7 @@ const SCEV *DependenceAnalysis::Constraint::getY() const {
// If constraint is a line AX + BY = C, returns A.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getA() const {
+const SCEV *DependenceInfo::Constraint::getA() const {
assert((Kind == Line || Kind == Distance) &&
"Kind should be Line (or Distance)");
return A;
@@ -315,7 +321,7 @@ const SCEV *DependenceAnalysis::Constraint::getA() const {
// If constraint is a line AX + BY = C, returns B.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getB() const {
+const SCEV *DependenceInfo::Constraint::getB() const {
assert((Kind == Line || Kind == Distance) &&
"Kind should be Line (or Distance)");
return B;
@@ -324,7 +330,7 @@ const SCEV *DependenceAnalysis::Constraint::getB() const {
// If constraint is a line AX + BY = C, returns C.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getC() const {
+const SCEV *DependenceInfo::Constraint::getC() const {
assert((Kind == Line || Kind == Distance) &&
"Kind should be Line (or Distance)");
return C;
@@ -333,34 +339,29 @@ const SCEV *DependenceAnalysis::Constraint::getC() const {
// If constraint is a distance, returns D.
// Otherwise assert.
-const SCEV *DependenceAnalysis::Constraint::getD() const {
+const SCEV *DependenceInfo::Constraint::getD() const {
assert(Kind == Distance && "Kind should be Distance");
return SE->getNegativeSCEV(C);
}
// Returns the loop associated with this constraint.
-const Loop *DependenceAnalysis::Constraint::getAssociatedLoop() const {
+const Loop *DependenceInfo::Constraint::getAssociatedLoop() const {
assert((Kind == Distance || Kind == Line || Kind == Point) &&
"Kind should be Distance, Line, or Point");
return AssociatedLoop;
}
-
-void DependenceAnalysis::Constraint::setPoint(const SCEV *X,
- const SCEV *Y,
- const Loop *CurLoop) {
+void DependenceInfo::Constraint::setPoint(const SCEV *X, const SCEV *Y,
+ const Loop *CurLoop) {
Kind = Point;
A = X;
B = Y;
AssociatedLoop = CurLoop;
}
-
-void DependenceAnalysis::Constraint::setLine(const SCEV *AA,
- const SCEV *BB,
- const SCEV *CC,
- const Loop *CurLoop) {
+void DependenceInfo::Constraint::setLine(const SCEV *AA, const SCEV *BB,
+ const SCEV *CC, const Loop *CurLoop) {
Kind = Line;
A = AA;
B = BB;
@@ -368,9 +369,8 @@ void DependenceAnalysis::Constraint::setLine(const SCEV *AA,
AssociatedLoop = CurLoop;
}
-
-void DependenceAnalysis::Constraint::setDistance(const SCEV *D,
- const Loop *CurLoop) {
+void DependenceInfo::Constraint::setDistance(const SCEV *D,
+ const Loop *CurLoop) {
Kind = Distance;
A = SE->getOne(D->getType());
B = SE->getNegativeSCEV(A);
@@ -378,20 +378,16 @@ void DependenceAnalysis::Constraint::setDistance(const SCEV *D,
AssociatedLoop = CurLoop;
}
+void DependenceInfo::Constraint::setEmpty() { Kind = Empty; }
-void DependenceAnalysis::Constraint::setEmpty() {
- Kind = Empty;
-}
-
-
-void DependenceAnalysis::Constraint::setAny(ScalarEvolution *NewSE) {
+void DependenceInfo::Constraint::setAny(ScalarEvolution *NewSE) {
SE = NewSE;
Kind = Any;
}
// For debugging purposes. Dumps the constraint out to OS.
-void DependenceAnalysis::Constraint::dump(raw_ostream &OS) const {
+void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
if (isEmpty())
OS << " Empty\n";
else if (isAny())
@@ -416,8 +412,7 @@ void DependenceAnalysis::Constraint::dump(raw_ostream &OS) const {
// Practical Dependence Testing
// Goff, Kennedy, Tseng
// PLDI 1991
-bool DependenceAnalysis::intersectConstraints(Constraint *X,
- const Constraint *Y) {
+bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
++DeltaApplications;
DEBUG(dbgs() << "\tintersect constraints\n");
DEBUG(dbgs() << "\t X ="; X->dump(dbgs()));
@@ -528,7 +523,7 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X,
}
if (const SCEVConstant *CUB =
collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
- APInt UpperBound = CUB->getAPInt();
+ const APInt &UpperBound = CUB->getAPInt();
DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
X->setEmpty();
@@ -569,7 +564,7 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X,
//===----------------------------------------------------------------------===//
-// DependenceAnalysis methods
+// DependenceInfo methods
// For debugging purposes. Dumps a dependence to OS.
void Dependence::dump(raw_ostream &OS) const {
@@ -709,8 +704,8 @@ Value *getPointerOperand(Instruction *I) {
// e - 5
// f - 6
// g - 7 = MaxLevels
-void DependenceAnalysis::establishNestingLevels(const Instruction *Src,
- const Instruction *Dst) {
+void DependenceInfo::establishNestingLevels(const Instruction *Src,
+ const Instruction *Dst) {
const BasicBlock *SrcBlock = Src->getParent();
const BasicBlock *DstBlock = Dst->getParent();
unsigned SrcLevel = LI->getLoopDepth(SrcBlock);
@@ -739,14 +734,14 @@ void DependenceAnalysis::establishNestingLevels(const Instruction *Src,
// Given one of the loops containing the source, return
// its level index in our numbering scheme.
-unsigned DependenceAnalysis::mapSrcLoop(const Loop *SrcLoop) const {
+unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const {
return SrcLoop->getLoopDepth();
}
// Given one of the loops containing the destination,
// return its level index in our numbering scheme.
-unsigned DependenceAnalysis::mapDstLoop(const Loop *DstLoop) const {
+unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const {
unsigned D = DstLoop->getLoopDepth();
if (D > CommonLevels)
return D - CommonLevels + SrcLevels;
@@ -756,8 +751,8 @@ unsigned DependenceAnalysis::mapDstLoop(const Loop *DstLoop) const {
// Returns true if Expression is loop invariant in LoopNest.
-bool DependenceAnalysis::isLoopInvariant(const SCEV *Expression,
- const Loop *LoopNest) const {
+bool DependenceInfo::isLoopInvariant(const SCEV *Expression,
+ const Loop *LoopNest) const {
if (!LoopNest)
return true;
return SE->isLoopInvariant(Expression, LoopNest) &&
@@ -768,9 +763,9 @@ bool DependenceAnalysis::isLoopInvariant(const SCEV *Expression,
// Finds the set of loops from the LoopNest that
// have a level <= CommonLevels and are referred to by the SCEV Expression.
-void DependenceAnalysis::collectCommonLoops(const SCEV *Expression,
- const Loop *LoopNest,
- SmallBitVector &Loops) const {
+void DependenceInfo::collectCommonLoops(const SCEV *Expression,
+ const Loop *LoopNest,
+ SmallBitVector &Loops) const {
while (LoopNest) {
unsigned Level = LoopNest->getLoopDepth();
if (Level <= CommonLevels && !SE->isLoopInvariant(Expression, LoopNest))
@@ -779,16 +774,16 @@ void DependenceAnalysis::collectCommonLoops(const SCEV *Expression,
}
}
-void DependenceAnalysis::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
+void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
unsigned widestWidthSeen = 0;
Type *widestType;
// Go through each pair and find the widest bit to which we need
// to extend all of them.
- for (unsigned i = 0; i < Pairs.size(); i++) {
- const SCEV *Src = Pairs[i]->Src;
- const SCEV *Dst = Pairs[i]->Dst;
+ for (Subscript *Pair : Pairs) {
+ const SCEV *Src = Pair->Src;
+ const SCEV *Dst = Pair->Dst;
IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
if (SrcTy == nullptr || DstTy == nullptr) {
@@ -811,9 +806,9 @@ void DependenceAnalysis::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
assert(widestWidthSeen > 0);
// Now extend each pair to the widest seen.
- for (unsigned i = 0; i < Pairs.size(); i++) {
- const SCEV *Src = Pairs[i]->Src;
- const SCEV *Dst = Pairs[i]->Dst;
+ for (Subscript *Pair : Pairs) {
+ const SCEV *Src = Pair->Src;
+ const SCEV *Dst = Pair->Dst;
IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
if (SrcTy == nullptr || DstTy == nullptr) {
@@ -824,10 +819,10 @@ void DependenceAnalysis::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
}
if (SrcTy->getBitWidth() < widestWidthSeen)
// Sign-extend Src to widestType
- Pairs[i]->Src = SE->getSignExtendExpr(Src, widestType);
+ Pair->Src = SE->getSignExtendExpr(Src, widestType);
if (DstTy->getBitWidth() < widestWidthSeen) {
// Sign-extend Dst to widestType
- Pairs[i]->Dst = SE->getSignExtendExpr(Dst, widestType);
+ Pair->Dst = SE->getSignExtendExpr(Dst, widestType);
}
}
}
@@ -836,7 +831,7 @@ void DependenceAnalysis::unifySubscriptType(ArrayRef<Subscript *> Pairs) {
// If the source and destination are identically sign (or zero)
// extended, it strips off the extension in an effect to simplify
// the actual analysis.
-void DependenceAnalysis::removeMatchingExtensions(Subscript *Pair) {
+void DependenceInfo::removeMatchingExtensions(Subscript *Pair) {
const SCEV *Src = Pair->Src;
const SCEV *Dst = Pair->Dst;
if ((isa<SCEVZeroExtendExpr>(Src) && isa<SCEVZeroExtendExpr>(Dst)) ||
@@ -855,9 +850,8 @@ void DependenceAnalysis::removeMatchingExtensions(Subscript *Pair) {
// Examine the scev and return true iff it's linear.
// Collect any loops mentioned in the set of "Loops".
-bool DependenceAnalysis::checkSrcSubscript(const SCEV *Src,
- const Loop *LoopNest,
- SmallBitVector &Loops) {
+bool DependenceInfo::checkSrcSubscript(const SCEV *Src, const Loop *LoopNest,
+ SmallBitVector &Loops) {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Src);
if (!AddRec)
return isLoopInvariant(Src, LoopNest);
@@ -881,9 +875,8 @@ bool DependenceAnalysis::checkSrcSubscript(const SCEV *Src,
// Examine the scev and return true iff it's linear.
// Collect any loops mentioned in the set of "Loops".
-bool DependenceAnalysis::checkDstSubscript(const SCEV *Dst,
- const Loop *LoopNest,
- SmallBitVector &Loops) {
+bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest,
+ SmallBitVector &Loops) {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Dst);
if (!AddRec)
return isLoopInvariant(Dst, LoopNest);
@@ -907,10 +900,10 @@ bool DependenceAnalysis::checkDstSubscript(const SCEV *Dst,
// Examines the subscript pair (the Src and Dst SCEVs)
// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
// Collects the associated loops in a set.
-DependenceAnalysis::Subscript::ClassificationKind
-DependenceAnalysis::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
- const SCEV *Dst, const Loop *DstLoopNest,
- SmallBitVector &Loops) {
+DependenceInfo::Subscript::ClassificationKind
+DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
+ const SCEV *Dst, const Loop *DstLoopNest,
+ SmallBitVector &Loops) {
SmallBitVector SrcLoops(MaxLevels + 1);
SmallBitVector DstLoops(MaxLevels + 1);
if (!checkSrcSubscript(Src, SrcLoopNest, SrcLoops))
@@ -942,9 +935,8 @@ DependenceAnalysis::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
// If SCEV::isKnownPredicate can't prove the predicate,
// we try simple subtraction, which seems to help in some cases
// involving symbolics.
-bool DependenceAnalysis::isKnownPredicate(ICmpInst::Predicate Pred,
- const SCEV *X,
- const SCEV *Y) const {
+bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
+ const SCEV *Y) const {
if (Pred == CmpInst::ICMP_EQ ||
Pred == CmpInst::ICMP_NE) {
if ((isa<SCEVSignExtendExpr>(X) &&
@@ -995,8 +987,7 @@ bool DependenceAnalysis::isKnownPredicate(ICmpInst::Predicate Pred,
// Truncating is safe when subscripts are known not to wrap. Cases without
// nowrap flags should have been rejected earlier.
// Return null if no bound available.
-const SCEV *DependenceAnalysis::collectUpperBound(const Loop *L,
- Type *T) const {
+const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const {
if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
const SCEV *UB = SE->getBackedgeTakenCount(L);
return SE->getTruncateOrZeroExtend(UB, T);
@@ -1007,9 +998,8 @@ const SCEV *DependenceAnalysis::collectUpperBound(const Loop *L,
// Calls collectUpperBound(), then attempts to cast it to SCEVConstant.
// If the cast fails, returns NULL.
-const SCEVConstant *DependenceAnalysis::collectConstantUpperBound(const Loop *L,
- Type *T
- ) const {
+const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L,
+ Type *T) const {
if (const SCEV *UB = collectUpperBound(L, T))
return dyn_cast<SCEVConstant>(UB);
return nullptr;
@@ -1026,9 +1016,8 @@ const SCEVConstant *DependenceAnalysis::collectConstantUpperBound(const Loop *L,
// 3) the values might be equal, so we have to assume a dependence.
//
// Return true if dependence disproved.
-bool DependenceAnalysis::testZIV(const SCEV *Src,
- const SCEV *Dst,
- FullDependence &Result) const {
+bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst,
+ FullDependence &Result) const {
DEBUG(dbgs() << " src = " << *Src << "\n");
DEBUG(dbgs() << " dst = " << *Dst << "\n");
++ZIVapplications;
@@ -1074,13 +1063,10 @@ bool DependenceAnalysis::testZIV(const SCEV *Src,
// { > if d < 0
//
// Return true if dependence disproved.
-bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *CurLoop,
- unsigned Level,
- FullDependence &Result,
- Constraint &NewConstraint) const {
+bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
+ const SCEV *DstConst, const Loop *CurLoop,
+ unsigned Level, FullDependence &Result,
+ Constraint &NewConstraint) const {
DEBUG(dbgs() << "\tStrong SIV test\n");
DEBUG(dbgs() << "\t Coeff = " << *Coeff);
DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
@@ -1213,14 +1199,10 @@ bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff,
// Can determine iteration for splitting.
//
// Return true if dependence disproved.
-bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *CurLoop,
- unsigned Level,
- FullDependence &Result,
- Constraint &NewConstraint,
- const SCEV *&SplitIter) const {
+bool DependenceInfo::weakCrossingSIVtest(
+ const SCEV *Coeff, const SCEV *SrcConst, const SCEV *DstConst,
+ const Loop *CurLoop, unsigned Level, FullDependence &Result,
+ Constraint &NewConstraint, const SCEV *&SplitIter) const {
DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n");
DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
@@ -1256,7 +1238,7 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff,
}
assert(SE->isKnownPositive(ConstCoeff) && "ConstCoeff should be positive");
- // compute SplitIter for use by DependenceAnalysis::getSplitIteration()
+ // compute SplitIter for use by DependenceInfo::getSplitIteration()
SplitIter = SE->getUDivExpr(
SE->getSMaxExpr(SE->getZero(Delta->getType()), Delta),
SE->getMulExpr(SE->getConstant(Delta->getType(), 2), ConstCoeff));
@@ -1344,9 +1326,8 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff,
// Computes the GCD of AM and BM.
// Also finds a solution to the equation ax - by = gcd(a, b).
// Returns true if dependence disproved; i.e., gcd does not divide Delta.
-static
-bool findGCD(unsigned Bits, APInt AM, APInt BM, APInt Delta,
- APInt &G, APInt &X, APInt &Y) {
+static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM,
+ const APInt &Delta, APInt &G, APInt &X, APInt &Y) {
APInt A0(Bits, 1, true), A1(Bits, 0, true);
APInt B0(Bits, 0, true), B1(Bits, 1, true);
APInt G0 = AM.abs();
@@ -1375,9 +1356,7 @@ bool findGCD(unsigned Bits, APInt AM, APInt BM, APInt Delta,
return false;
}
-
-static
-APInt floorOfQuotient(APInt A, APInt B) {
+static APInt floorOfQuotient(const APInt &A, const APInt &B) {
APInt Q = A; // these need to be initialized
APInt R = A;
APInt::sdivrem(A, B, Q, R);
@@ -1390,9 +1369,7 @@ APInt floorOfQuotient(APInt A, APInt B) {
return Q - 1;
}
-
-static
-APInt ceilingOfQuotient(APInt A, APInt B) {
+static APInt ceilingOfQuotient(const APInt &A, const APInt &B) {
APInt Q = A; // these need to be initialized
APInt R = A;
APInt::sdivrem(A, B, Q, R);
@@ -1433,14 +1410,11 @@ APInt minAPInt(APInt A, APInt B) {
// in the case of the strong SIV test, can compute Distances.
//
// Return true if dependence disproved.
-bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff,
- const SCEV *DstCoeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *CurLoop,
- unsigned Level,
- FullDependence &Result,
- Constraint &NewConstraint) const {
+bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+ const SCEV *SrcConst, const SCEV *DstConst,
+ const Loop *CurLoop, unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
DEBUG(dbgs() << "\tExact SIV test\n");
DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
@@ -1608,8 +1582,8 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff,
static
bool isRemainderZero(const SCEVConstant *Dividend,
const SCEVConstant *Divisor) {
- APInt ConstDividend = Dividend->getAPInt();
- APInt ConstDivisor = Divisor->getAPInt();
+ const APInt &ConstDividend = Dividend->getAPInt();
+ const APInt &ConstDivisor = Divisor->getAPInt();
return ConstDividend.srem(ConstDivisor) == 0;
}
@@ -1645,13 +1619,12 @@ bool isRemainderZero(const SCEVConstant *Dividend,
// (see also weakZeroDstSIVtest)
//
// Return true if dependence disproved.
-bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *CurLoop,
- unsigned Level,
- FullDependence &Result,
- Constraint &NewConstraint) const {
+bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop, unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
// For the WeakSIV test, it's possible the loop isn't common to
// the Src and Dst loops. If it isn't, then there's no need to
// record a direction.
@@ -1756,13 +1729,12 @@ bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff,
// (see also weakZeroSrcSIVtest)
//
// Return true if dependence disproved.
-bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *CurLoop,
- unsigned Level,
- FullDependence &Result,
- Constraint &NewConstraint) const {
+bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop, unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
// For the WeakSIV test, it's possible the loop isn't common to the
// Src and Dst loops. If it isn't, then there's no need to record a direction.
DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
@@ -1842,13 +1814,10 @@ bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff,
// Returns true if any possible dependence is disproved.
// Marks the result as inconsistent.
// Works in some cases that symbolicRDIVtest doesn't, and vice versa.
-bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff,
- const SCEV *DstCoeff,
- const SCEV *SrcConst,
- const SCEV *DstConst,
- const Loop *SrcLoop,
- const Loop *DstLoop,
- FullDependence &Result) const {
+bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+ const SCEV *SrcConst, const SCEV *DstConst,
+ const Loop *SrcLoop, const Loop *DstLoop,
+ FullDependence &Result) const {
DEBUG(dbgs() << "\tExact RDIV test\n");
DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
@@ -1986,12 +1955,10 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff,
// a1*N1 <= c2 - c1 <= -a2*N2
//
// return true if dependence disproved
-bool DependenceAnalysis::symbolicRDIVtest(const SCEV *A1,
- const SCEV *A2,
- const SCEV *C1,
- const SCEV *C2,
- const Loop *Loop1,
- const Loop *Loop2) const {
+bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2,
+ const SCEV *C1, const SCEV *C2,
+ const Loop *Loop1,
+ const Loop *Loop2) const {
++SymbolicRDIVapplications;
DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
DEBUG(dbgs() << "\t A1 = " << *A1);
@@ -2103,12 +2070,9 @@ bool DependenceAnalysis::symbolicRDIVtest(const SCEV *A1,
// they apply; they're cheaper and sometimes more precise.
//
// Return true if dependence disproved.
-bool DependenceAnalysis::testSIV(const SCEV *Src,
- const SCEV *Dst,
- unsigned &Level,
- FullDependence &Result,
- Constraint &NewConstraint,
- const SCEV *&SplitIter) const {
+bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
+ FullDependence &Result, Constraint &NewConstraint,
+ const SCEV *&SplitIter) const {
DEBUG(dbgs() << " src = " << *Src << "\n");
DEBUG(dbgs() << " dst = " << *Dst << "\n");
const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
@@ -2174,9 +2138,8 @@ bool DependenceAnalysis::testSIV(const SCEV *Src,
// [c1 + a1*i + a2*j][c2].
//
// Return true if dependence disproved.
-bool DependenceAnalysis::testRDIV(const SCEV *Src,
- const SCEV *Dst,
- FullDependence &Result) const {
+bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst,
+ FullDependence &Result) const {
// we have 3 possible situations here:
// 1) [a*i + b] and [c*j + d]
// 2) [a*i + c*j + b] and [d]
@@ -2241,10 +2204,9 @@ bool DependenceAnalysis::testRDIV(const SCEV *Src,
// Tests the single-subscript MIV pair (Src and Dst) for dependence.
// Return true if dependence disproved.
// Can sometimes refine direction vectors.
-bool DependenceAnalysis::testMIV(const SCEV *Src,
- const SCEV *Dst,
- const SmallBitVector &Loops,
- FullDependence &Result) const {
+bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const {
DEBUG(dbgs() << " src = " << *Src << "\n");
DEBUG(dbgs() << " dst = " << *Dst << "\n");
Result.Consistent = false;
@@ -2256,11 +2218,12 @@ bool DependenceAnalysis::testMIV(const SCEV *Src,
// Given a product, e.g., 10*X*Y, returns the first constant operand,
// in this case 10. If there is no constant part, returns NULL.
static
-const SCEVConstant *getConstantPart(const SCEVMulExpr *Product) {
- for (unsigned Op = 0, Ops = Product->getNumOperands(); Op < Ops; Op++) {
- if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Product->getOperand(Op)))
+const SCEVConstant *getConstantPart(const SCEV *Expr) {
+ if (const auto *Constant = dyn_cast<SCEVConstant>(Expr))
+ return Constant;
+ else if (const auto *Product = dyn_cast<SCEVMulExpr>(Expr))
+ if (const auto *Constant = dyn_cast<SCEVConstant>(Product->getOperand(0)))
return Constant;
- }
return nullptr;
}
@@ -2283,9 +2246,8 @@ const SCEVConstant *getConstantPart(const SCEVMulExpr *Product) {
// It occurs to me that the presence of loop-invariant variables
// changes the nature of the test from "greatest common divisor"
// to "a common divisor".
-bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
- const SCEV *Dst,
- FullDependence &Result) const {
+bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
+ FullDependence &Result) const {
DEBUG(dbgs() << "starting gcd\n");
++GCDapplications;
unsigned BitWidth = SE->getTypeSizeInBits(Src->getType());
@@ -2299,11 +2261,9 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
while (const SCEVAddRecExpr *AddRec =
dyn_cast<SCEVAddRecExpr>(Coefficients)) {
const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
- const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
- if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- Constant = getConstantPart(Product);
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ const auto *Constant = getConstantPart(Coeff);
if (!Constant)
return false;
APInt ConstCoeff = Constant->getAPInt();
@@ -2320,11 +2280,9 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
while (const SCEVAddRecExpr *AddRec =
dyn_cast<SCEVAddRecExpr>(Coefficients)) {
const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
- const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
- if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- Constant = getConstantPart(Product);
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ const auto *Constant = getConstantPart(Coeff);
if (!Constant)
return false;
APInt ConstCoeff = Constant->getAPInt();
@@ -2403,12 +2361,11 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
if (CurLoop == AddRec->getLoop())
; // SrcCoeff == Coeff
else {
- if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- Constant = getConstantPart(Product);
- else
- Constant = cast<SCEVConstant>(Coeff);
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Coeff);
+ if (!Constant)
+ return false;
APInt ConstCoeff = Constant->getAPInt();
RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
}
@@ -2421,29 +2378,24 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
if (CurLoop == AddRec->getLoop())
DstCoeff = Coeff;
else {
- if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- Constant = getConstantPart(Product);
- else
- Constant = cast<SCEVConstant>(Coeff);
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Coeff);
+ if (!Constant)
+ return false;
APInt ConstCoeff = Constant->getAPInt();
RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
}
Inner = AddRec->getStart();
}
Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);
- if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Delta))
- // If the coefficient is the product of a constant and other stuff,
- // we can use the constant in the GCD computation.
- Constant = getConstantPart(Product);
- else if (isa<SCEVConstant>(Delta))
- Constant = cast<SCEVConstant>(Delta);
- else {
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Delta);
+ if (!Constant)
// The difference of the two coefficients might not be a product
// or constant, in which case we give up on this direction.
continue;
- }
APInt ConstCoeff = Constant->getAPInt();
RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n");
@@ -2497,10 +2449,9 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
// for the lower bound, NULL denotes -inf.
//
// Return true if dependence disproved.
-bool DependenceAnalysis::banerjeeMIVtest(const SCEV *Src,
- const SCEV *Dst,
- const SmallBitVector &Loops,
- FullDependence &Result) const {
+bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const {
DEBUG(dbgs() << "starting Banerjee\n");
++BanerjeeApplications;
DEBUG(dbgs() << " Src = " << *Src << '\n');
@@ -2578,13 +2529,11 @@ bool DependenceAnalysis::banerjeeMIVtest(const SCEV *Src,
// in the DirSet field of Bound. Returns the number of distinct
// dependences discovered. If the dependence is disproved,
// it will return 0.
-unsigned DependenceAnalysis::exploreDirections(unsigned Level,
- CoefficientInfo *A,
- CoefficientInfo *B,
- BoundInfo *Bound,
- const SmallBitVector &Loops,
- unsigned &DepthExpanded,
- const SCEV *Delta) const {
+unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
+ CoefficientInfo *B, BoundInfo *Bound,
+ const SmallBitVector &Loops,
+ unsigned &DepthExpanded,
+ const SCEV *Delta) const {
if (Level > CommonLevels) {
// record result
DEBUG(dbgs() << "\t[");
@@ -2679,10 +2628,8 @@ unsigned DependenceAnalysis::exploreDirections(unsigned Level,
// Returns true iff the current bounds are plausible.
-bool DependenceAnalysis::testBounds(unsigned char DirKind,
- unsigned Level,
- BoundInfo *Bound,
- const SCEV *Delta) const {
+bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level,
+ BoundInfo *Bound, const SCEV *Delta) const {
Bound[Level].Direction = DirKind;
if (const SCEV *LowerBound = getLowerBound(Bound))
if (isKnownPredicate(CmpInst::ICMP_SGT, LowerBound, Delta))
@@ -2709,10 +2656,8 @@ bool DependenceAnalysis::testBounds(unsigned char DirKind,
// We must be careful to handle the case where the upper bound is unknown.
// Note that the lower bound is always <= 0
// and the upper bound is always >= 0.
-void DependenceAnalysis::findBoundsALL(CoefficientInfo *A,
- CoefficientInfo *B,
- BoundInfo *Bound,
- unsigned K) const {
+void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B,
+ BoundInfo *Bound, unsigned K) const {
Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity.
Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
@@ -2750,10 +2695,8 @@ void DependenceAnalysis::findBoundsALL(CoefficientInfo *A,
// We must be careful to handle the case where the upper bound is unknown.
// Note that the lower bound is always <= 0
// and the upper bound is always >= 0.
-void DependenceAnalysis::findBoundsEQ(CoefficientInfo *A,
- CoefficientInfo *B,
- BoundInfo *Bound,
- unsigned K) const {
+void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B,
+ BoundInfo *Bound, unsigned K) const {
Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity.
Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
@@ -2792,10 +2735,8 @@ void DependenceAnalysis::findBoundsEQ(CoefficientInfo *A,
// UB^<_k = (A^+_k - B_k)^+ (U_k - 1) - B_k
//
// We must be careful to handle the case where the upper bound is unknown.
-void DependenceAnalysis::findBoundsLT(CoefficientInfo *A,
- CoefficientInfo *B,
- BoundInfo *Bound,
- unsigned K) const {
+void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B,
+ BoundInfo *Bound, unsigned K) const {
Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity.
Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
@@ -2838,10 +2779,8 @@ void DependenceAnalysis::findBoundsLT(CoefficientInfo *A,
// UB^>_k = (A_k - B^-_k)^+ (U_k - 1) + A_k
//
// We must be careful to handle the case where the upper bound is unknown.
-void DependenceAnalysis::findBoundsGT(CoefficientInfo *A,
- CoefficientInfo *B,
- BoundInfo *Bound,
- unsigned K) const {
+void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B,
+ BoundInfo *Bound, unsigned K) const {
Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity.
Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity.
if (Bound[K].Iterations) {
@@ -2870,13 +2809,13 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A,
// X^+ = max(X, 0)
-const SCEV *DependenceAnalysis::getPositivePart(const SCEV *X) const {
+const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const {
return SE->getSMaxExpr(X, SE->getZero(X->getType()));
}
// X^- = min(X, 0)
-const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const {
+const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const {
return SE->getSMinExpr(X, SE->getZero(X->getType()));
}
@@ -2884,10 +2823,9 @@ const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const {
// Walks through the subscript,
// collecting each coefficient, the associated loop bounds,
// and recording its positive and negative parts for later use.
-DependenceAnalysis::CoefficientInfo *
-DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript,
- bool SrcFlag,
- const SCEV *&Constant) const {
+DependenceInfo::CoefficientInfo *
+DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag,
+ const SCEV *&Constant) const {
const SCEV *Zero = SE->getZero(Subscript->getType());
CoefficientInfo *CI = new CoefficientInfo[MaxLevels + 1];
for (unsigned K = 1; K <= MaxLevels; ++K) {
@@ -2931,7 +2869,7 @@ DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript,
// computes the lower bound given the current direction settings
// at each level. If the lower bound for any level is -inf,
// the result is -inf.
-const SCEV *DependenceAnalysis::getLowerBound(BoundInfo *Bound) const {
+const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const {
const SCEV *Sum = Bound[1].Lower[Bound[1].Direction];
for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
if (Bound[K].Lower[Bound[K].Direction])
@@ -2947,7 +2885,7 @@ const SCEV *DependenceAnalysis::getLowerBound(BoundInfo *Bound) const {
// computes the upper bound given the current direction settings
// at each level. If the upper bound at any level is +inf,
// the result is +inf.
-const SCEV *DependenceAnalysis::getUpperBound(BoundInfo *Bound) const {
+const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const {
const SCEV *Sum = Bound[1].Upper[Bound[1].Direction];
for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
if (Bound[K].Upper[Bound[K].Direction])
@@ -2968,8 +2906,8 @@ const SCEV *DependenceAnalysis::getUpperBound(BoundInfo *Bound) const {
// If there isn't one, return 0.
// For example, given a*i + b*j + c*k, finding the coefficient
// corresponding to the j loop would yield b.
-const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr,
- const Loop *TargetLoop) const {
+const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
if (!AddRec)
return SE->getZero(Expr->getType());
@@ -2984,8 +2922,8 @@ const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr,
// corresponding to the specified loop.
// For example, given a*i + b*j + c*k, zeroing the coefficient
// corresponding to the j loop would yield a*i + c*k.
-const SCEV *DependenceAnalysis::zeroCoefficient(const SCEV *Expr,
- const Loop *TargetLoop) const {
+const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
if (!AddRec)
return Expr; // ignore
@@ -3003,9 +2941,9 @@ const SCEV *DependenceAnalysis::zeroCoefficient(const SCEV *Expr,
// coefficient corresponding to the specified TargetLoop.
// For example, given a*i + b*j + c*k, adding 1 to the coefficient
// corresponding to the j loop would yield a*i + (b+1)*j + c*k.
-const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
- const Loop *TargetLoop,
- const SCEV *Value) const {
+const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop,
+ const SCEV *Value) const {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
if (!AddRec) // create a new addRec
return SE->getAddRecExpr(Expr,
@@ -3040,11 +2978,10 @@ const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
// Practical Dependence Testing
// Goff, Kennedy, Tseng
// PLDI 1991
-bool DependenceAnalysis::propagate(const SCEV *&Src,
- const SCEV *&Dst,
- SmallBitVector &Loops,
- SmallVectorImpl<Constraint> &Constraints,
- bool &Consistent) {
+bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
+ SmallBitVector &Loops,
+ SmallVectorImpl<Constraint> &Constraints,
+ bool &Consistent) {
bool Result = false;
for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
@@ -3065,10 +3002,9 @@ bool DependenceAnalysis::propagate(const SCEV *&Src,
// Return true if some simplification occurs.
// If the simplification isn't exact (that is, if it is conservative
// in terms of dependence), set consistent to false.
-bool DependenceAnalysis::propagateDistance(const SCEV *&Src,
- const SCEV *&Dst,
- Constraint &CurConstraint,
- bool &Consistent) {
+bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent) {
const Loop *CurLoop = CurConstraint.getAssociatedLoop();
DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
const SCEV *A_K = findCoefficient(Src, CurLoop);
@@ -3092,10 +3028,9 @@ bool DependenceAnalysis::propagateDistance(const SCEV *&Src,
// Return true if some simplification occurs.
// If the simplification isn't exact (that is, if it is conservative
// in terms of dependence), set consistent to false.
-bool DependenceAnalysis::propagateLine(const SCEV *&Src,
- const SCEV *&Dst,
- Constraint &CurConstraint,
- bool &Consistent) {
+bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent) {
const Loop *CurLoop = CurConstraint.getAssociatedLoop();
const SCEV *A = CurConstraint.getA();
const SCEV *B = CurConstraint.getB();
@@ -3167,9 +3102,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src,
// Attempt to propagate a point
// constraint into a subscript pair (Src and Dst).
// Return true if some simplification occurs.
-bool DependenceAnalysis::propagatePoint(const SCEV *&Src,
- const SCEV *&Dst,
- Constraint &CurConstraint) {
+bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
+ Constraint &CurConstraint) {
const Loop *CurLoop = CurConstraint.getAssociatedLoop();
const SCEV *A_K = findCoefficient(Src, CurLoop);
const SCEV *AP_K = findCoefficient(Dst, CurLoop);
@@ -3187,9 +3121,8 @@ bool DependenceAnalysis::propagatePoint(const SCEV *&Src,
// Update direction vector entry based on the current constraint.
-void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
- const Constraint &CurConstraint
- ) const {
+void DependenceInfo::updateDirection(Dependence::DVEntry &Level,
+ const Constraint &CurConstraint) const {
DEBUG(dbgs() << "\tUpdate direction, constraint =");
DEBUG(CurConstraint.dump(dbgs()));
if (CurConstraint.isAny())
@@ -3241,10 +3174,8 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
/// source and destination array references are recurrences on a nested loop,
/// this function flattens the nested recurrences into separate recurrences
/// for each loop level.
-bool DependenceAnalysis::tryDelinearize(Instruction *Src,
- Instruction *Dst,
- SmallVectorImpl<Subscript> &Pair)
-{
+bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
+ SmallVectorImpl<Subscript> &Pair) {
Value *SrcPtr = getPointerOperand(Src);
Value *DstPtr = getPointerOperand(Dst);
@@ -3355,8 +3286,8 @@ static void dumpSmallBitVector(SmallBitVector &BV) {
// Care is required to keep the routine below, getSplitIteration(),
// up to date with respect to this routine.
std::unique_ptr<Dependence>
-DependenceAnalysis::depends(Instruction *Src, Instruction *Dst,
- bool PossiblyLoopIndependent) {
+DependenceInfo::depends(Instruction *Src, Instruction *Dst,
+ bool PossiblyLoopIndependent) {
if (Src == Dst)
PossiblyLoopIndependent = false;
@@ -3811,8 +3742,8 @@ DependenceAnalysis::depends(Instruction *Src, Instruction *Dst,
//
// breaks the dependence and allows us to vectorize/parallelize
// both loops.
-const SCEV *DependenceAnalysis::getSplitIteration(const Dependence &Dep,
- unsigned SplitLevel) {
+const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
+ unsigned SplitLevel) {
assert(Dep.isSplitable(SplitLevel) &&
"Dep should be splitable at SplitLevel");
Instruction *Src = Dep.getSrc();
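
A minimal usage sketch of the renamed interface. The helper below and the way DI is obtained are assumptions; depends(), isConfused(), getLevels() and getDirection() are the existing query API and are not changed by this patch.

#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

// Hypothetical helper: Src and Dst are memory instructions in the loop nest
// that DI was built for.
static void reportDependence(DependenceInfo &DI, Instruction *Src,
                             Instruction *Dst) {
  if (auto Dep = DI.depends(Src, Dst, /*PossiblyLoopIndependent=*/true)) {
    if (Dep->isConfused())
      return; // the analysis gave up, so a dependence must be assumed
    for (unsigned Level = 1; Level <= Dep->getLevels(); ++Level)
      dbgs() << "level " << Level << " direction "
             << Dep->getDirection(Level) << "\n";
  }
}
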
diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp
index 5ae6d74130a7..1b36569f7a07 100644
--- a/lib/Analysis/DivergenceAnalysis.cpp
+++ b/lib/Analysis/DivergenceAnalysis.cpp
@@ -73,10 +73,8 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
#include <vector>
using namespace llvm;
@@ -140,14 +138,25 @@ void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
// a2 = 2;
// a = phi(a1, a2); // sync dependent on (tid < 5)
BasicBlock *ThisBB = TI->getParent();
- BasicBlock *IPostDom = PDT.getNode(ThisBB)->getIDom()->getBlock();
+
+ // Unreachable blocks may not be in the dominator tree.
+ if (!DT.isReachableFromEntry(ThisBB))
+ return;
+
+ // If the function has no exit blocks or doesn't reach any exit blocks, the
+ // post dominator may be null.
+ DomTreeNode *ThisNode = PDT.getNode(ThisBB);
+ if (!ThisNode)
+ return;
+
+ BasicBlock *IPostDom = ThisNode->getIDom()->getBlock();
if (IPostDom == nullptr)
return;
for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
// A PHINode is uniform if it returns the same value no matter which path is
// taken.
- if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(&*I).second)
+ if (!cast<PHINode>(I)->hasConstantOrUndefValue() && DV.insert(&*I).second)
Worklist.push_back(&*I);
}
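
// Illustrative example (not from this patch, assuming the usual PHINode
// semantics): a node such as
//   %a = phi i32 [ 1, %then ], [ undef, %else ]
// returns null from hasConstantValue() because undef differs from 1, but
// hasConstantOrUndefValue() treats it as uniform, so it is no longer pushed
// onto the divergence worklist here.
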
@@ -259,7 +268,7 @@ char DivergenceAnalysis::ID = 0;
INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
false, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
false, true)
@@ -269,7 +278,7 @@ FunctionPass *llvm::createDivergenceAnalysisPass() {
void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTree>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
AU.setPreservesAll();
}
@@ -285,9 +294,10 @@ bool DivergenceAnalysis::runOnFunction(Function &F) {
return false;
DivergentValues.clear();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
DivergencePropagator DP(F, TTI,
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<PostDominatorTree>(), DivergentValues);
+ PDT, DivergentValues);
DP.populateWithSourcesOfDivergence();
DP.propagate();
return false;
diff --git a/lib/Analysis/DomPrinter.cpp b/lib/Analysis/DomPrinter.cpp
index 0c880df54f8e..7acfb41500d4 100644
--- a/lib/Analysis/DomPrinter.cpp
+++ b/lib/Analysis/DomPrinter.cpp
@@ -111,20 +111,36 @@ struct DomOnlyViewer : public DOTGraphTraitsViewer<
}
};
-struct PostDomViewer
- : public DOTGraphTraitsViewer<PostDominatorTree, false> {
+struct PostDominatorTreeWrapperPassAnalysisGraphTraits {
+ static PostDominatorTree *getGraph(PostDominatorTreeWrapperPass *PDTWP) {
+ return &PDTWP->getPostDomTree();
+ }
+};
+
+struct PostDomViewer : public DOTGraphTraitsViewer<
+ PostDominatorTreeWrapperPass, false,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits> {
static char ID;
PostDomViewer() :
- DOTGraphTraitsViewer<PostDominatorTree, false>("postdom", ID){
+ DOTGraphTraitsViewer<PostDominatorTreeWrapperPass, false,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits>(
+ "postdom", ID){
initializePostDomViewerPass(*PassRegistry::getPassRegistry());
}
};
-struct PostDomOnlyViewer
- : public DOTGraphTraitsViewer<PostDominatorTree, true> {
+struct PostDomOnlyViewer : public DOTGraphTraitsViewer<
+ PostDominatorTreeWrapperPass, true,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits> {
static char ID;
PostDomOnlyViewer() :
- DOTGraphTraitsViewer<PostDominatorTree, true>("postdomonly", ID){
+ DOTGraphTraitsViewer<PostDominatorTreeWrapperPass, true,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits>(
+ "postdomonly", ID){
initializePostDomOnlyViewerPass(*PassRegistry::getPassRegistry());
}
};
@@ -175,19 +191,31 @@ struct DomOnlyPrinter : public DOTGraphTraitsPrinter<
};
struct PostDomPrinter
- : public DOTGraphTraitsPrinter<PostDominatorTree, false> {
+ : public DOTGraphTraitsPrinter<
+ PostDominatorTreeWrapperPass, false,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits> {
static char ID;
PostDomPrinter() :
- DOTGraphTraitsPrinter<PostDominatorTree, false>("postdom", ID) {
+ DOTGraphTraitsPrinter<PostDominatorTreeWrapperPass, false,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits>(
+ "postdom", ID) {
initializePostDomPrinterPass(*PassRegistry::getPassRegistry());
}
};
struct PostDomOnlyPrinter
- : public DOTGraphTraitsPrinter<PostDominatorTree, true> {
+ : public DOTGraphTraitsPrinter<
+ PostDominatorTreeWrapperPass, true,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits> {
static char ID;
PostDomOnlyPrinter() :
- DOTGraphTraitsPrinter<PostDominatorTree, true>("postdomonly", ID) {
+ DOTGraphTraitsPrinter<PostDominatorTreeWrapperPass, true,
+ PostDominatorTree *,
+ PostDominatorTreeWrapperPassAnalysisGraphTraits>(
+ "postdomonly", ID) {
initializePostDomOnlyPrinterPass(*PassRegistry::getPassRegistry());
}
};
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 7ba91bc90dfc..4554374252a4 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -9,6 +9,7 @@
#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/IR/PassManager.h"
using namespace llvm;
@@ -17,41 +18,60 @@ template class DominanceFrontierBase<BasicBlock>;
template class ForwardDominanceFrontierBase<BasicBlock>;
}
-char DominanceFrontier::ID = 0;
+char DominanceFrontierWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(DominanceFrontier, "domfrontier",
+INITIALIZE_PASS_BEGIN(DominanceFrontierWrapperPass, "domfrontier",
"Dominance Frontier Construction", true, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DominanceFrontier, "domfrontier",
+INITIALIZE_PASS_END(DominanceFrontierWrapperPass, "domfrontier",
"Dominance Frontier Construction", true, true)
-DominanceFrontier::DominanceFrontier()
- : FunctionPass(ID),
- Base() {
- initializeDominanceFrontierPass(*PassRegistry::getPassRegistry());
+ DominanceFrontierWrapperPass::DominanceFrontierWrapperPass()
+ : FunctionPass(ID), DF() {
+ initializeDominanceFrontierWrapperPassPass(*PassRegistry::getPassRegistry());
}
-void DominanceFrontier::releaseMemory() {
- Base.releaseMemory();
+void DominanceFrontierWrapperPass::releaseMemory() {
+ DF.releaseMemory();
}
-bool DominanceFrontier::runOnFunction(Function &) {
+bool DominanceFrontierWrapperPass::runOnFunction(Function &) {
releaseMemory();
- Base.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
+ DF.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
return false;
}
-void DominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+void DominanceFrontierWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<DominatorTreeWrapperPass>();
}
-void DominanceFrontier::print(raw_ostream &OS, const Module *) const {
- Base.print(OS);
+void DominanceFrontierWrapperPass::print(raw_ostream &OS, const Module *) const {
+ DF.print(OS);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void DominanceFrontier::dump() const {
+LLVM_DUMP_METHOD void DominanceFrontierWrapperPass::dump() const {
print(dbgs());
}
#endif
+
+char DominanceFrontierAnalysis::PassID;
+
+DominanceFrontier DominanceFrontierAnalysis::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominanceFrontier DF;
+ DF.analyze(AM.getResult<DominatorTreeAnalysis>(F));
+ return DF;
+}
+
+DominanceFrontierPrinterPass::DominanceFrontierPrinterPass(raw_ostream &OS)
+ : OS(OS) {}
+
+PreservedAnalyses
+DominanceFrontierPrinterPass::run(Function &F, FunctionAnalysisManager &AM) {
+ OS << "DominanceFrontier for function: " << F.getName() << "\n";
+ AM.getResult<DominanceFrontierAnalysis>(F).print(OS);
+
+ return PreservedAnalyses::all();
+}
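
A minimal new-pass-manager sketch, assuming the usual manual registration pattern; the helper name is hypothetical, and in a real pipeline PassBuilder would perform the registration.

#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void printDominanceFrontier(Function &F) {
  FunctionAnalysisManager FAM;
  // DominanceFrontierAnalysis::run requests DominatorTreeAnalysis, so both
  // analyses must be registered before asking for the result.
  FAM.registerPass([] { return DominatorTreeAnalysis(); });
  FAM.registerPass([] { return DominanceFrontierAnalysis(); });
  FAM.getResult<DominanceFrontierAnalysis>(F).print(dbgs());
}
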
diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp
index 01be8b38fadd..5f951f5112e9 100644
--- a/lib/Analysis/EHPersonalities.cpp
+++ b/lib/Analysis/EHPersonalities.cpp
@@ -27,13 +27,16 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
return StringSwitch<EHPersonality>(F->getName())
.Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
.Case("__gxx_personality_v0", EHPersonality::GNU_CXX)
+ .Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj)
.Case("__gcc_personality_v0", EHPersonality::GNU_C)
+ .Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj)
.Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
.Case("_except_handler3", EHPersonality::MSVC_X86SEH)
.Case("_except_handler4", EHPersonality::MSVC_X86SEH)
.Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH)
.Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX)
.Case("ProcessCLRException", EHPersonality::CoreCLR)
+ .Case("rust_eh_personality", EHPersonality::Rust)
.Default(EHPersonality::Unknown);
}
@@ -92,7 +95,7 @@ DenseMap<BasicBlock *, ColorVector> llvm::colorEHFunclets(Function &F) {
BasicBlock *SuccColor = Color;
TerminatorInst *Terminator = Visiting->getTerminator();
if (auto *CatchRet = dyn_cast<CatchReturnInst>(Terminator)) {
- Value *ParentPad = CatchRet->getParentPad();
+ Value *ParentPad = CatchRet->getCatchSwitchParentPad();
if (isa<ConstantTokenNone>(ParentPad))
SuccColor = EntryBlock;
else
diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp
index 1babb822074b..a7d1e048e133 100644
--- a/lib/Analysis/GlobalsModRef.cpp
+++ b/lib/Analysis/GlobalsModRef.cpp
@@ -243,13 +243,14 @@ FunctionModRefBehavior
GlobalsAAResult::getModRefBehavior(ImmutableCallSite CS) {
FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
- if (const Function *F = CS.getCalledFunction())
- if (FunctionInfo *FI = getFunctionInfo(F)) {
- if (FI->getModRefInfo() == MRI_NoModRef)
- Min = FMRB_DoesNotAccessMemory;
- else if ((FI->getModRefInfo() & MRI_Mod) == 0)
- Min = FMRB_OnlyReadsMemory;
- }
+ if (!CS.hasOperandBundles())
+ if (const Function *F = CS.getCalledFunction())
+ if (FunctionInfo *FI = getFunctionInfo(F)) {
+ if (FI->getModRefInfo() == MRI_NoModRef)
+ Min = FMRB_DoesNotAccessMemory;
+ else if ((FI->getModRefInfo() & MRI_Mod) == 0)
+ Min = FMRB_OnlyReadsMemory;
+ }
return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min);
}
@@ -269,7 +270,7 @@ GlobalsAAResult::getFunctionInfo(const Function *F) {
/// (really, their address passed to something nontrivial), record this fact,
/// and record the functions that they are used directly in.
void GlobalsAAResult::AnalyzeGlobals(Module &M) {
- SmallPtrSet<Function *, 64> TrackedFunctions;
+ SmallPtrSet<Function *, 32> TrackedFunctions;
for (Function &F : M)
if (F.hasLocalLinkage())
if (!AnalyzeUsesOfPointer(&F)) {
@@ -281,7 +282,7 @@ void GlobalsAAResult::AnalyzeGlobals(Module &M) {
++NumNonAddrTakenFunctions;
}
- SmallPtrSet<Function *, 64> Readers, Writers;
+ SmallPtrSet<Function *, 16> Readers, Writers;
for (GlobalVariable &GV : M.globals())
if (GV.hasLocalLinkage()) {
if (!AnalyzeUsesOfPointer(&GV, &Readers,
@@ -310,7 +311,7 @@ void GlobalsAAResult::AnalyzeGlobals(Module &M) {
++NumNonAddrTakenGlobalVars;
// If this global holds a pointer type, see if it is an indirect global.
- if (GV.getType()->getElementType()->isPointerTy() &&
+ if (GV.getValueType()->isPointerTy() &&
AnalyzeIndirectGlobalMemory(&GV))
++NumIndirectGlobalVars;
}
@@ -470,9 +471,10 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
const std::vector<CallGraphNode *> &SCC = *I;
assert(!SCC.empty() && "SCC with no functions?");
- if (!SCC[0]->getFunction() || SCC[0]->getFunction()->mayBeOverridden()) {
- // Calls externally or is weak - can't say anything useful. Remove any existing
- // function records (may have been created when scanning globals).
+ if (!SCC[0]->getFunction() || !SCC[0]->getFunction()->isDefinitionExact()) {
+ // Calls externally or not exact - can't say anything useful. Remove any
+ // existing function records (may have been created when scanning
+ // globals).
for (auto *Node : SCC)
FunctionInfos.erase(Node->getFunction());
continue;
@@ -496,7 +498,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
// Can't do better than that!
} else if (F->onlyReadsMemory()) {
FI.addModRefInfo(MRI_Ref);
- if (!F->isIntrinsic())
+ if (!F->isIntrinsic() && !F->onlyAccessesArgMemory())
// This function might call back into the module and read a global -
// consider every global as possibly being read by this function.
FI.setMayReadAnyGlobal();
@@ -698,7 +700,7 @@ bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV,
auto *InputGVar = dyn_cast<GlobalVariable>(InputGV);
if (GVar && InputGVar &&
!GVar->isDeclaration() && !InputGVar->isDeclaration() &&
- !GVar->mayBeOverridden() && !InputGVar->mayBeOverridden()) {
+ !GVar->isInterposable() && !InputGVar->isInterposable()) {
Type *GVType = GVar->getInitializer()->getType();
Type *InputGVType = InputGVar->getInitializer()->getType();
if (GVType->isSized() && InputGVType->isSized() &&
@@ -863,7 +865,11 @@ ModRefInfo GlobalsAAResult::getModRefInfoForArgument(ImmutableCallSite CS,
GetUnderlyingObjects(A, Objects, DL);
// All objects must be identified.
- if (!std::all_of(Objects.begin(), Objects.end(), isIdentifiedObject))
+ if (!std::all_of(Objects.begin(), Objects.end(), isIdentifiedObject) &&
+ // Try ::alias to see if all objects are known not to alias GV.
+ !std::all_of(Objects.begin(), Objects.end(), [&](Value *V) {
+ return this->alias(MemoryLocation(V), MemoryLocation(GV)) == NoAlias;
+ }))
return ConservativeResult;
if (std::find(Objects.begin(), Objects.end(), GV) != Objects.end())
@@ -896,10 +902,10 @@ ModRefInfo GlobalsAAResult::getModRefInfo(ImmutableCallSite CS,
GlobalsAAResult::GlobalsAAResult(const DataLayout &DL,
const TargetLibraryInfo &TLI)
- : AAResultBase(TLI), DL(DL) {}
+ : AAResultBase(), DL(DL), TLI(TLI) {}
GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg)
- : AAResultBase(std::move(Arg)), DL(Arg.DL),
+ : AAResultBase(std::move(Arg)), DL(Arg.DL), TLI(Arg.TLI),
NonAddressTakenGlobals(std::move(Arg.NonAddressTakenGlobals)),
IndirectGlobals(std::move(Arg.IndirectGlobals)),
AllocsForIndirectGlobals(std::move(Arg.AllocsForIndirectGlobals)),
@@ -912,6 +918,8 @@ GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg)
}
}
+GlobalsAAResult::~GlobalsAAResult() {}
+
/*static*/ GlobalsAAResult
GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI,
CallGraph &CG) {
@@ -929,14 +937,14 @@ GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI,
return Result;
}
-GlobalsAAResult GlobalsAA::run(Module &M, AnalysisManager<Module> *AM) {
+char GlobalsAA::PassID;
+
+GlobalsAAResult GlobalsAA::run(Module &M, AnalysisManager<Module> &AM) {
return GlobalsAAResult::analyzeModule(M,
- AM->getResult<TargetLibraryAnalysis>(M),
- AM->getResult<CallGraphAnalysis>(M));
+ AM.getResult<TargetLibraryAnalysis>(M),
+ AM.getResult<CallGraphAnalysis>(M));
}
-char GlobalsAA::PassID;
-
char GlobalsAAWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(GlobalsAAWrapperPass, "globals-aa",
"Globals Alias Analysis", false, true)
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index e0c5d8fa5f5a..43c0ba17fe4a 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -12,11 +12,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
@@ -33,19 +34,35 @@ using namespace llvm;
#define DEBUG_TYPE "iv-users"
-char IVUsers::ID = 0;
-INITIALIZE_PASS_BEGIN(IVUsers, "iv-users",
+char IVUsersAnalysis::PassID;
+
+IVUsers IVUsersAnalysis::run(Loop &L, AnalysisManager<Loop> &AM) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ return IVUsers(&L, FAM.getCachedResult<AssumptionAnalysis>(*F),
+ FAM.getCachedResult<LoopAnalysis>(*F),
+ FAM.getCachedResult<DominatorTreeAnalysis>(*F),
+ FAM.getCachedResult<ScalarEvolutionAnalysis>(*F));
+}
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ AM.getResult<IVUsersAnalysis>(L).print(OS);
+ return PreservedAnalyses::all();
+}
+
+char IVUsersWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(IVUsersWrapperPass, "iv-users",
"Induction Variable Users", false, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(IVUsers, "iv-users",
- "Induction Variable Users", false, true)
+INITIALIZE_PASS_END(IVUsersWrapperPass, "iv-users", "Induction Variable Users",
+ false, true)
-Pass *llvm::createIVUsersPass() {
- return new IVUsers();
-}
+Pass *llvm::createIVUsersPass() { return new IVUsersWrapperPass(); }
/// isInteresting - Test whether the given expression is "interesting" when
/// used by the given expression, within the context of analyzing the
@@ -246,28 +263,9 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
return IVUses.back();
}
-IVUsers::IVUsers()
- : LoopPass(ID) {
- initializeIVUsersPass(*PassRegistry::getPassRegistry());
-}
-
-void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.setPreservesAll();
-}
-
-bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
-
- L = l;
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
+IVUsers::IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE)
+ : L(L), AC(AC), LI(LI), DT(DT), SE(SE), IVUses() {
// Collect ephemeral values so that AddUsersIfInteresting skips them.
EphValues.clear();
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
@@ -277,34 +275,28 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
// this loop. If they are induction variables, inspect their uses.
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
(void)AddUsersIfInteresting(&*I);
-
- return false;
}
void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << "IV Users for loop ";
L->getHeader()->printAsOperand(OS, false);
if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
- OS << " with backedge-taken count "
- << *SE->getBackedgeTakenCount(L);
+ OS << " with backedge-taken count " << *SE->getBackedgeTakenCount(L);
}
OS << ":\n";
- for (ilist<IVStrideUse>::const_iterator UI = IVUses.begin(),
- E = IVUses.end(); UI != E; ++UI) {
+ for (const IVStrideUse &IVUse : IVUses) {
OS << " ";
- UI->getOperandValToReplace()->printAsOperand(OS, false);
- OS << " = " << *getReplacementExpr(*UI);
- for (PostIncLoopSet::const_iterator
- I = UI->PostIncLoops.begin(),
- E = UI->PostIncLoops.end(); I != E; ++I) {
+ IVUse.getOperandValToReplace()->printAsOperand(OS, false);
+ OS << " = " << *getReplacementExpr(IVUse);
+ for (auto PostIncLoop : IVUse.PostIncLoops) {
OS << " (post-inc with loop ";
- (*I)->getHeader()->printAsOperand(OS, false);
+ PostIncLoop->getHeader()->printAsOperand(OS, false);
OS << ")";
}
OS << " in ";
- if (UI->getUser())
- UI->getUser()->print(OS);
+ if (IVUse.getUser())
+ IVUse.getUser()->print(OS);
else
OS << "Printing <null> User";
OS << '\n';
@@ -312,9 +304,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void IVUsers::dump() const {
- print(dbgs());
-}
+LLVM_DUMP_METHOD void IVUsers::dump() const { print(dbgs()); }
#endif
void IVUsers::releaseMemory() {
@@ -322,6 +312,35 @@ void IVUsers::releaseMemory() {
IVUses.clear();
}
+IVUsersWrapperPass::IVUsersWrapperPass() : LoopPass(ID) {
+ initializeIVUsersWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void IVUsersWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.setPreservesAll();
+}
+
+bool IVUsersWrapperPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ IU.reset(new IVUsers(L, AC, LI, DT, SE));
+ return false;
+}
+
+void IVUsersWrapperPass::print(raw_ostream &OS, const Module *M) const {
+ IU->print(OS, M);
+}
+
+void IVUsersWrapperPass::releaseMemory() { IU->releaseMemory(); }
+
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace.
const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &IU) const {
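
A small sketch of the new construction path; the helper name is hypothetical and the analyses are assumed to have been computed for the function containing L.

#include "llvm/Analysis/IVUsers.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void printIVUsersFor(Loop *L, AssumptionCache &AC, LoopInfo &LI,
                            DominatorTree &DT, ScalarEvolution &SE) {
  // IVUsers is now a plain analysis result, built directly from the inputs
  // instead of being populated by IVUsersWrapperPass::runOnLoop.
  IVUsers IU(L, &AC, &LI, &DT, &SE);
  IU.print(dbgs());
}
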
diff --git a/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/lib/Analysis/IndirectCallPromotionAnalysis.cpp
new file mode 100644
index 000000000000..3da33ac71421
--- /dev/null
+++ b/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -0,0 +1,109 @@
+//===-- IndirectCallPromotionAnalysis.cpp - Find promotion candidates ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper methods for identifying profitable indirect call promotion
+// candidates for an instruction when the indirect-call value profile metadata
+// is available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Debug.h"
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom-analysis"
+
+// The minimum call count for the direct-call target to be considered as the
+// promotion candidate.
+static cl::opt<unsigned>
+ ICPCountThreshold("icp-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to the direct call target "
+ "for the promotion"));
+
+// The percent threshold for the direct-call target (this call site vs the
+// total call count) for it to be considered as the promotion target.
+static cl::opt<unsigned>
+ ICPPercentThreshold("icp-percent-threshold", cl::init(33), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the promotion"));
+
+// Set the maximum number of targets to promote for a single indirect-call
+// callsite.
+static cl::opt<unsigned>
+ MaxNumPromotions("icp-max-prom", cl::init(2), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of promotions for a single indirect "
+ "call callsite"));
+
+ICallPromotionAnalysis::ICallPromotionAnalysis() {
+ ValueDataArray = llvm::make_unique<InstrProfValueData[]>(MaxNumPromotions);
+}
+
+bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count,
+ uint64_t TotalCount) {
+ if (Count < ICPCountThreshold)
+ return false;
+
+ unsigned Percentage = (Count * 100) / TotalCount;
+ return (Percentage >= ICPPercentThreshold);
+}
+
+// Indirect-call promotion heuristic. The direct targets are sorted based on
+// the count. Stop at the first target that is not promoted. Returns the
+// number of candidates deemed profitable.
+uint32_t ICallPromotionAnalysis::getProfitablePromotionCandidates(
+ const Instruction *Inst, uint32_t NumVals, uint64_t TotalCount) {
+ ArrayRef<InstrProfValueData> ValueDataRef(ValueDataArray.get(), NumVals);
+
+ DEBUG(dbgs() << " \nWork on callsite " << *Inst << " Num_targets: " << NumVals
+ << "\n");
+
+ uint32_t I = 0;
+ for (; I < MaxNumPromotions && I < NumVals; I++) {
+ uint64_t Count = ValueDataRef[I].Count;
+ assert(Count <= TotalCount);
+ DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << ValueDataRef[I].Value << "\n");
+
+ if (!isPromotionProfitable(Count, TotalCount)) {
+ DEBUG(dbgs() << " Not promote: Cold target.\n");
+ return I;
+ }
+ TotalCount -= Count;
+ }
+ return I;
+}
+
+ArrayRef<InstrProfValueData>
+ICallPromotionAnalysis::getPromotionCandidatesForInstruction(
+ const Instruction *I, uint32_t &NumVals, uint64_t &TotalCount,
+ uint32_t &NumCandidates) {
+ bool Res =
+ getValueProfDataFromInst(*I, IPVK_IndirectCallTarget, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount);
+ if (!Res) {
+ NumCandidates = 0;
+ return ArrayRef<InstrProfValueData>();
+ }
+ NumCandidates = getProfitablePromotionCandidates(I, NumVals, TotalCount);
+ return ArrayRef<InstrProfValueData>(ValueDataArray.get(), NumVals);
+}
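
A short usage sketch for the new analysis; the helper name and the dbgs() reporting are assumptions, while the call matches getPromotionCandidatesForInstruction above.

#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

// Hypothetical helper: Inst is an indirect call carrying value-profile
// metadata.
static void listPromotionCandidates(const Instruction *Inst) {
  ICallPromotionAnalysis ICPA;
  uint32_t NumVals = 0, NumCandidates = 0;
  uint64_t TotalCount = 0;
  ArrayRef<InstrProfValueData> Targets =
      ICPA.getPromotionCandidatesForInstruction(Inst, NumVals, TotalCount,
                                                NumCandidates);
  // Only the first NumCandidates entries passed the count and percentage
  // thresholds; the remaining entries are returned for context.
  for (uint32_t I = 0; I < NumCandidates; ++I)
    dbgs() << "  target " << Targets[I].Value << " count " << Targets[I].Count
           << "\n";
}
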
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index a86a703ed9d6..dcb724abc02d 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -39,6 +40,32 @@ using namespace llvm;
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
+// Threshold to use when optsize is specified (and there is no
+// -inline-threshold).
+const int OptSizeThreshold = 75;
+
+// Threshold to use when -Oz is specified (and there is no -inline-threshold).
+const int OptMinSizeThreshold = 25;
+
+// Threshold to use when -O[34] is specified (and there is no
+// -inline-threshold).
+const int OptAggressiveThreshold = 275;
+
+static cl::opt<int> DefaultInlineThreshold(
+ "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
+ cl::desc("Control the amount of inlining to perform (default = 225)"));
+
+static cl::opt<int> HintThreshold(
+ "inlinehint-threshold", cl::Hidden, cl::init(325),
+ cl::desc("Threshold for inlining functions with inline hint"));
+
+// We introduce this threshold to help performance of instrumentation based
+// PGO before we actually hook up inliner with analysis passes such as BPI and
+// BFI.
+static cl::opt<int> ColdThreshold(
+ "inlinecold-threshold", cl::Hidden, cl::init(225),
+ cl::desc("Threshold for inlining functions with cold attribute"));
+
namespace {
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -51,6 +78,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// The cache of @llvm.assume intrinsics.
AssumptionCacheTracker *ACT;
+ /// Profile summary information.
+ ProfileSummaryInfo *PSI;
+
// The called function.
Function &F;
@@ -96,7 +126,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
DenseMap<Value *, int> SROAArgCosts;
// Keep track of values which map to a pointer base and constant offset.
- DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;
+ DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
// Custom simplification helper routines.
bool isAllocaDerivedArg(Value *V);
@@ -117,19 +147,31 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// attributes since these can be more precise than the ones on the callee
/// itself.
bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
-
+
/// Return true if the given value is known non null within the callee if
/// inlined through this particular callsite.
bool isKnownNonNullInCallee(Value *V);
+ /// Update Threshold based on callsite properties such as callee
+ /// attributes and callee hotness for PGO builds. The Callee is explicitly
+ /// passed to support analyzing indirect calls whose target is inferred by
+ /// analysis.
+ void updateThreshold(CallSite CS, Function &Callee);
+
+ /// Return true if size growth is allowed when inlining the callee at CS.
+ bool allowSizeGrowth(CallSite CS);
+
// Custom analysis routines.
bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
// Disable several entry points to the visitor so we don't accidentally use
// them by declaring but not defining them here.
- void visit(Module *); void visit(Module &);
- void visit(Function *); void visit(Function &);
- void visit(BasicBlock *); void visit(BasicBlock &);
+ void visit(Module *);
+ void visit(Module &);
+ void visit(Function *);
+ void visit(Function &);
+ void visit(BasicBlock *);
+ void visit(BasicBlock &);
// Provide base case for our instruction visit.
bool visitInstruction(Instruction &I);
@@ -162,17 +204,19 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
public:
CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
- Function &Callee, int Threshold, CallSite CSArg)
- : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
- Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
- ExposesReturnsTwice(false), HasDynamicAlloca(false),
- ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
- HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
- NumVectorInstructions(0), FiftyPercentVectorBonus(0),
- TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
- NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
- NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
- SROACostSavings(0), SROACostSavingsLost(0) {}
+ ProfileSummaryInfo *PSI, Function &Callee, int Threshold,
+ CallSite CSArg)
+ : TTI(TTI), ACT(ACT), PSI(PSI), F(Callee), CandidateCS(CSArg),
+ Threshold(Threshold), Cost(0), IsCallerRecursive(false),
+ IsRecursiveCall(false), ExposesReturnsTwice(false),
+ HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
+ HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
+ AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+ FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+ NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+ NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+ NumInstructionsSimplified(0), SROACostSavings(0),
+ SROACostSavingsLost(0) {}
bool analyzeCall(CallSite CS);
@@ -272,7 +316,8 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
OpC = dyn_cast<ConstantInt>(SimpleOp);
if (!OpC)
return false;
- if (OpC->isZero()) continue;
+ if (OpC->isZero())
+ continue;
// Handle a struct index, which adds its field offset to the pointer.
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
@@ -290,13 +335,14 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Check whether inlining will turn a dynamic alloca into a static
- // alloca, and handle that case.
+ // alloca and handle that case.
if (I.isArrayAllocation()) {
- if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
- ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
- assert(AllocSize && "Allocation size not a constant int?");
+ Constant *Size = SimplifiedValues.lookup(I.getArraySize());
+ if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
- AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
+ AllocatedSize = SaturatingMultiplyAdd(
+ AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
return Base::visitAlloca(I);
}
}
@@ -305,7 +351,7 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
if (I.isStaticAlloca()) {
const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
- AllocatedSize += DL.getTypeAllocSize(Ty);
+ AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
}
// We will happily inline static alloca instructions.
@@ -336,8 +382,8 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
- bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
- SROAArg, CostIt);
+ bool SROACandidate =
+ lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
// Try to fold GEPs of constant-offset call site argument pointers. This
// requires target data and inbounds GEPs.
@@ -393,8 +439,8 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
}
// Track base/offsets through casts
- std::pair<Value *, APInt> BaseAndOffset
- = ConstantOffsetPtrs.lookup(I.getOperand(0));
+ std::pair<Value *, APInt> BaseAndOffset =
+ ConstantOffsetPtrs.lookup(I.getOperand(0));
// Casts don't change the offset, just wrap it up.
if (BaseAndOffset.first)
ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -425,8 +471,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize >= DL.getPointerSizeInBits()) {
- std::pair<Value *, APInt> BaseAndOffset
- = ConstantOffsetPtrs.lookup(I.getOperand(0));
+ std::pair<Value *, APInt> BaseAndOffset =
+ ConstantOffsetPtrs.lookup(I.getOperand(0));
if (BaseAndOffset.first)
ConstantOffsetPtrs[&I] = BaseAndOffset;
}
@@ -501,8 +547,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
COp = SimplifiedValues.lookup(Operand);
if (COp) {
const DataLayout &DL = F.getParent()->getDataLayout();
- if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
- COp, DL)) {
+ if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
SimplifiedValues[&I] = C;
return true;
}
@@ -516,7 +561,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
unsigned ArgNo = A->getArgNo();
- return CandidateCS.paramHasAttr(ArgNo+1, Attr);
+ return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -528,7 +573,7 @@ bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
if (Argument *A = dyn_cast<Argument>(V))
if (paramHasAttr(A, Attribute::NonNull))
return true;
-
+
// Is this an alloca in the caller? This is distinct from the attribute case
// above because attributes aren't updated within the inliner itself and we
// always want to catch the alloca derived case.
@@ -537,10 +582,86 @@ bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
// alloca-derived value and null. Note that this fires regardless of
// SROA firing.
return true;
-
+
return false;
}
+bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
+ // If the normal destination of the invoke or the parent block of the call
+ // site is unreachable-terminated, there is little point in inlining this
+ // unless there is literally zero cost.
+ // FIXME: Note that it is possible that an unreachable-terminated block has a
+ // hot entry. For example, in the scenario below, inlining hot_call_X() may
+ // be beneficial:
+ // main() {
+ // hot_call_1();
+ // ...
+ // hot_call_N()
+ // exit(0);
+ // }
+ // For now, we are not handling this corner case here as it is rare in real
+ // code. In future, we should elaborate this based on BPI and BFI in more
+ // general threshold adjusting heuristics in updateThreshold().
+ Instruction *Instr = CS.getInstruction();
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
+ if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
+ return false;
+ } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
+ return false;
+
+ return true;
+}
+
+void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
+ // If no size growth is allowed for this inlining, set Threshold to 0.
+ if (!allowSizeGrowth(CS)) {
+ Threshold = 0;
+ return;
+ }
+
+ Function *Caller = CS.getCaller();
+ if (DefaultInlineThreshold.getNumOccurrences() > 0) {
+ // Explicitly specified -inline-threshold overrides the threshold passed to
+ // CallAnalyzer's constructor.
+ Threshold = DefaultInlineThreshold;
+ } else {
+ // If -inline-threshold is not given, listen to the optsize and minsize
+ // attributes when they would decrease the threshold.
+ if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
+ Threshold = OptMinSizeThreshold;
+ else if (Caller->optForSize() && OptSizeThreshold < Threshold)
+ Threshold = OptSizeThreshold;
+ }
+
+ bool HotCallsite = false;
+ uint64_t TotalWeight;
+ if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
+ PSI->isHotCount(TotalWeight))
+ HotCallsite = true;
+
+ // Listen to the inlinehint attribute or profile based hotness information
+ // when it would increase the threshold and the caller does not need to
+ // minimize its size.
+ bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
+ PSI->isHotFunction(&Callee) ||
+ HotCallsite;
+ if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
+ Threshold = HintThreshold;
+
+ bool ColdCallee = PSI->isColdFunction(&Callee);
+ // Command line argument for DefaultInlineThreshold will override the default
+ // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
+ // do not use the default cold threshold even if it is smaller.
+ if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
+ ColdThreshold.getNumOccurrences() > 0) &&
+ ColdCallee && ColdThreshold < Threshold)
+ Threshold = ColdThreshold;
+
+ // Finally, take the target-specific inlining threshold multiplier into
+ // account.
+ Threshold *= TTI.getInliningThresholdMultiplier();
+}
+
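As a compact restatement of updateThreshold's policy, the sketch below mirrors the decision order in plain C++. It deliberately omits the interaction between the -inline-threshold and -inlinecold-threshold command-line flags, and all names and numbers are placeholders rather than LLVM's defaults.

#include <algorithm>

// Hypothetical inputs; the real code reads cl::opts, attributes and PSI.
struct ThresholdInputs {
  bool SizeGrowthAllowed, HasExplicitFlag, OptForMinSize, OptForSize;
  bool InlineHint, HotCallsite, ColdCallee;
  int FlagValue, Default, OptSize, OptMinSize, Hint, Cold, Multiplier;
};

int pickThreshold(const ThresholdInputs &In) {
  if (!In.SizeGrowthAllowed)
    return 0;                                  // only zero-cost inlining allowed
  int T = In.HasExplicitFlag ? In.FlagValue : In.Default;
  if (!In.HasExplicitFlag) {                   // size attributes may only lower T
    if (In.OptForMinSize)
      T = std::min(T, In.OptMinSize);
    else if (In.OptForSize)
      T = std::min(T, In.OptSize);
  }
  if ((In.InlineHint || In.HotCallsite) && !In.OptForMinSize)
    T = std::max(T, In.Hint);                  // hints/hotness may only raise T
  if (In.ColdCallee)
    T = std::min(T, In.Cold);                  // cold callees may only lower T
  return T * In.Multiplier;                    // target-specific scaling last
}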
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// First try to handle simplified comparisons.
@@ -552,7 +673,8 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
RHS = SimpleRHS;
if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
if (Constant *CRHS = dyn_cast<Constant>(RHS))
- if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+ if (Constant *C =
+ ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
SimplifiedValues[&I] = C;
return true;
}
@@ -713,8 +835,8 @@ bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
if (!InsertedC)
InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
if (AggC && InsertedC) {
- SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
- I.getIndices());
+ SimplifiedValues[&I] =
+ ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
return true;
}
@@ -739,8 +861,8 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
// Try to re-map the arguments to constants.
SmallVector<Constant *, 4> ConstantArgs;
ConstantArgs.reserve(CS.arg_size());
- for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
- I != E; ++I) {
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+ ++I) {
Constant *C = dyn_cast<Constant>(*I);
if (!C)
C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
@@ -764,8 +886,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
ExposesReturnsTwice = true;
return false;
}
- if (CS.isCall() &&
- cast<CallInst>(CS.getInstruction())->cannotDuplicate())
+ if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
ContainsNoDuplicateCall = true;
if (Function *F = CS.getCalledFunction()) {
@@ -780,6 +901,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
default:
return Base::visitCallSite(CS);
+ case Intrinsic::load_relative:
+ // This is normally lowered to 4 LLVM instructions.
+ Cost += 3 * InlineConstants::InstrCost;
+ return false;
+
case Intrinsic::memset:
case Intrinsic::memcpy:
case Intrinsic::memmove:
@@ -831,7 +957,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
// during devirtualization and so we want to give it a hefty bonus for
// inlining, but cap that bonus in the event that inlining wouldn't pan
// out. Pretend to inline the function, with a custom threshold.
- CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
+ CallAnalyzer CA(TTI, ACT, PSI, *F, InlineConstants::IndirectCallThreshold,
+ CS);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
@@ -938,7 +1065,6 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
return false;
}
-
/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
@@ -1044,7 +1170,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
break;
V = GA->getAliasee();
} else {
@@ -1079,6 +1205,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// nice to base the bonus values on something more scientific.
assert(NumInstructions == 0);
assert(NumVectorInstructions == 0);
+
+ // Update the threshold based on callsite properties
+ updateThreshold(CS, F);
+
FiftyPercentVectorBonus = 3 * Threshold / 2;
TenPercentVectorBonus = 3 * Threshold / 4;
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -1124,22 +1254,11 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// If there is only one call of the function, and it has internal linkage,
// the cost of inlining it drops dramatically.
- bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
- &F == CS.getCalledFunction();
+ bool OnlyOneCallAndLocalLinkage =
+ F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
if (OnlyOneCallAndLocalLinkage)
Cost += InlineConstants::LastCallToStaticBonus;
- // If the instruction after the call, or if the normal destination of the
- // invoke is an unreachable instruction, the function is noreturn. As such,
- // there is little point in inlining this unless there is literally zero
- // cost.
- Instruction *Instr = CS.getInstruction();
- if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
- if (isa<UnreachableInst>(II->getNormalDest()->begin()))
- Threshold = 0;
- } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
- Threshold = 0;
-
// If this function uses the coldcc calling convention, prefer not to inline
// it.
if (F.getCallingConv() == CallingConv::Cold)
@@ -1193,7 +1312,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// the ephemeral values multiple times (and they're completely determined by
// the callee, so this is purely duplicate work).
SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);
+ CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
+ EphValues);
// The worklist of live basic blocks in the callee *after* inlining. We avoid
// adding basic blocks of the callee which can be proven to be dead for this
@@ -1203,7 +1323,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// accomplish this, prioritizing for small iterations because we exit after
// crossing our threshold, we use a small-size optimized SetVector.
typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
- SmallPtrSet<BasicBlock *, 16> > BBSetVector;
+ SmallPtrSet<BasicBlock *, 16>>
+ BBSetVector;
BBSetVector BBWorklist;
BBWorklist.insert(&F.getEntryBlock());
// Note that we *must not* cache the size, this loop grows the worklist.
@@ -1228,20 +1349,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail on out.
- if (!analyzeBlock(BB, EphValues)) {
- if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
- HasIndirectBr || HasFrameEscape)
- return false;
-
- // If the caller is a recursive function then we don't want to inline
- // functions which allocate a lot of stack space because it would increase
- // the caller stack usage dramatically.
- if (IsCallerRecursive &&
- AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
- return false;
-
- break;
- }
+ if (!analyzeBlock(BB, EphValues))
+ return false;
TerminatorInst *TI = BB->getTerminator();
@@ -1250,16 +1359,16 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isConditional()) {
Value *Cond = BI->getCondition();
- if (ConstantInt *SimpleCond
- = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+ if (ConstantInt *SimpleCond =
+ dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
continue;
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Value *Cond = SI->getCondition();
- if (ConstantInt *SimpleCond
- = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+ if (ConstantInt *SimpleCond =
+ dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
continue;
}
@@ -1296,12 +1405,12 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
else if (NumVectorInstructions <= NumInstructions / 2)
Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
- return Cost <= std::max(0, Threshold);
+ return Cost < std::max(1, Threshold);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
-void CallAnalyzer::dump() {
+LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
DEBUG_PRINT_STAT(NumConstantArgs);
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
@@ -1321,7 +1430,7 @@ void CallAnalyzer::dump() {
/// \brief Test that two functions either have or have not the given attribute
/// at the same time.
-template<typename AttrKind>
+template <typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}
@@ -1335,15 +1444,33 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
-InlineCost llvm::getInlineCost(CallSite CS, int Threshold,
+InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
TargetTransformInfo &CalleeTTI,
- AssumptionCacheTracker *ACT) {
- return getInlineCost(CS, CS.getCalledFunction(), Threshold, CalleeTTI, ACT);
+ AssumptionCacheTracker *ACT,
+ ProfileSummaryInfo *PSI) {
+ return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
+ ACT, PSI);
+}
+
+int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
+ unsigned SizeOptLevel) {
+ if (OptLevel > 2)
+ return OptAggressiveThreshold;
+ if (SizeOptLevel == 1) // -Os
+ return OptSizeThreshold;
+ if (SizeOptLevel == 2) // -Oz
+ return OptMinSizeThreshold;
+ return DefaultInlineThreshold;
}
-InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, int Threshold,
+int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
+
+InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
+ int DefaultThreshold,
TargetTransformInfo &CalleeTTI,
- AssumptionCacheTracker *ACT) {
+ AssumptionCacheTracker *ACT,
+ ProfileSummaryInfo *PSI) {
+
// Cannot inline indirect calls.
if (!Callee)
return llvm::InlineCost::getNever();
@@ -1365,17 +1492,18 @@ InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, int Threshold,
if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
return llvm::InlineCost::getNever();
- // Don't inline functions which can be redefined at link-time to mean
- // something else. Don't inline functions marked noinline or call sites
- // marked noinline.
- if (Callee->mayBeOverridden() ||
- Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
+ // Don't inline functions which can be interposed at link-time. Don't inline
+ // functions marked noinline or call sites marked noinline.
+ // Note: inlining non-exact non-interposable functions is fine, since we know
+ // we have *a* correct implementation of the source level function.
+ if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
+ CS.isNoInline())
return llvm::InlineCost::getNever();
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
- << "...\n");
+ << "...\n");
- CallAnalyzer CA(CalleeTTI, ACT, *Callee, Threshold, CS);
+ CallAnalyzer CA(CalleeTTI, ACT, PSI, *Callee, DefaultThreshold, CS);
bool ShouldInline = CA.analyzeCall(CS);
DEBUG(CA.dump());
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 6dfe62596275..0cb2c78afb40 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -528,11 +529,8 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const Query &Q, unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::Add, CLHS->getType(), Ops,
- Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::Add, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -619,10 +617,15 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
break;
V = GA->getAliasee();
} else {
+ if (auto CS = CallSite(V))
+ if (Value *RV = CS.getReturnedArgOperand()) {
+ V = RV;
+ continue;
+ }
break;
}
assert(V->getType()->getScalarType()->isPointerTy() &&
@@ -660,11 +663,8 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const Query &Q, unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0))
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::Sub, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::Sub, CLHS, CRHS, Q.DL);
// X - undef -> undef
// undef - X -> undef
@@ -787,11 +787,8 @@ Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const Query &Q, unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::FAdd, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::FAdd, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -803,7 +800,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
// fadd X, 0 ==> X, when we know X is not -0
if (match(Op1, m_Zero()) &&
- (FMF.noSignedZeros() || CannotBeNegativeZero(Op0)))
+ (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
// fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0
@@ -829,11 +826,8 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const Query &Q, unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::FSub, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::FSub, CLHS, CRHS, Q.DL);
}
// fsub X, 0 ==> X
@@ -842,17 +836,18 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
// fsub X, -0 ==> X, when we know X is not -0
if (match(Op1, m_NegZero()) &&
- (FMF.noSignedZeros() || CannotBeNegativeZero(Op0)))
+ (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
- // fsub 0, (fsub -0.0, X) ==> X
+ // fsub -0.0, (fsub -0.0, X) ==> X
Value *X;
- if (match(Op0, m_AnyZero())) {
- if (match(Op1, m_FSub(m_NegZero(), m_Value(X))))
- return X;
- if (FMF.noSignedZeros() && match(Op1, m_FSub(m_AnyZero(), m_Value(X))))
- return X;
- }
+ if (match(Op0, m_NegZero()) && match(Op1, m_FSub(m_NegZero(), m_Value(X))))
+ return X;
+
+ // fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored.
+ if (FMF.noSignedZeros() && match(Op0, m_AnyZero()) &&
+ match(Op1, m_FSub(m_AnyZero(), m_Value(X))))
+ return X;
// fsub nnan x, x ==> 0.0
if (FMF.noNaNs() && Op0 == Op1)
@@ -867,11 +862,8 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
const Query &Q,
unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::FMul, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::FMul, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -893,11 +885,8 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::Mul, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::Mul, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -992,12 +981,9 @@ Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL,
/// If not, this returns null.
static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0)) {
- if (Constant *C1 = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.DL, Q.TLI);
- }
- }
+ if (Constant *C0 = dyn_cast<Constant>(Op0))
+ if (Constant *C1 = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
bool isSigned = Opcode == Instruction::SDiv;
@@ -1157,12 +1143,9 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
/// If not, this returns null.
static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0)) {
- if (Constant *C1 = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.DL, Q.TLI);
- }
- }
+ if (Constant *C0 = dyn_cast<Constant>(Op0))
+ if (Constant *C1 = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
// X % undef -> undef
if (match(Op1, m_Undef()))
@@ -1309,12 +1292,9 @@ static bool isUndefShift(Value *Amount) {
/// If not, this returns null.
static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0)) {
- if (Constant *C1 = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.DL, Q.TLI);
- }
- }
+ if (Constant *C0 = dyn_cast<Constant>(Op0))
+ if (Constant *C1 = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
// 0 shift by X -> 0
if (match(Op0, m_Zero()))
@@ -1340,6 +1320,22 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
+ // If any bits in the shift amount make that value greater than or equal to
+ // the number of bits in the type, the shift is undefined.
+ unsigned BitWidth = Op1->getType()->getScalarSizeInBits();
+ APInt KnownZero(BitWidth, 0);
+ APInt KnownOne(BitWidth, 0);
+ computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (KnownOne.getLimitedValue() >= BitWidth)
+ return UndefValue::get(Op0->getType());
+
+ // If all valid bits in the shift amount are known zero, the first operand is
+ // unchanged.
+ unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth);
+ APInt ShiftAmountMask = APInt::getLowBitsSet(BitWidth, NumValidShiftBits);
+ if ((KnownZero & ShiftAmountMask) == ShiftAmountMask)
+ return Op0;
+
return nullptr;
}
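The two added checks reason purely about known bits of the shift amount. A standalone model for a 32-bit shift, with KnownOne and KnownZero standing in for the computeKnownBits results, might look like:

#include <cstdint>

enum class ShiftFold { None, Undef, FirstOperand };

// KnownOne / KnownZero are the bits proven to be 1 / 0 in the shift amount.
ShiftFold foldShiftAmount32(uint32_t KnownOne, uint32_t KnownZero) {
  const uint32_t BitWidth = 32;
  // The amount is at least the value formed by its proven-one bits, so if
  // that value already reaches the bit width the shift is undefined.
  if (KnownOne >= BitWidth)
    return ShiftFold::Undef;
  // Only the low ceil(log2(32)) = 5 bits can select a defined shift amount.
  // If they are all proven zero, the amount is 0 (or the shift is undefined
  // anyway), so the first operand is returned unchanged.
  const uint32_t ValidMask = (1u << 5) - 1;
  if ((KnownZero & ValidMask) == ValidMask)
    return ShiftFold::FirstOperand;
  return ShiftFold::None;
}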
@@ -1501,9 +1497,8 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
return nullptr;
}
-/// Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range
-/// of possible values cannot be satisfied.
static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
+ Type *ITy = Op0->getType();
ICmpInst::Predicate Pred0, Pred1;
ConstantInt *CI1, *CI2;
Value *V;
@@ -1511,15 +1506,25 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
return X;
+ // Look for this pattern: (icmp V, C0) & (icmp V, C1)).
+ const APInt *C0, *C1;
+ if (match(Op0, m_ICmp(Pred0, m_Value(V), m_APInt(C0))) &&
+ match(Op1, m_ICmp(Pred1, m_Specific(V), m_APInt(C1)))) {
+ // Make a constant range that's the intersection of the two icmp ranges.
+ // If the intersection is empty, we know that the result is false.
+ auto Range0 = ConstantRange::makeAllowedICmpRegion(Pred0, *C0);
+ auto Range1 = ConstantRange::makeAllowedICmpRegion(Pred1, *C1);
+ if (Range0.intersectWith(Range1).isEmptySet())
+ return getFalse(ITy);
+ }
+
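To see why the empty-intersection check is sound, here is a toy version restricted to inclusive unsigned intervals on one value; LLVM's ConstantRange additionally covers signed predicates and wrapping ranges.

#include <algorithm>
#include <cstdint>
#include <optional>

struct Interval { uint64_t Lo, Hi; };          // inclusive [Lo, Hi]

// Each icmp on the same value V confines V to an interval; if the two
// intervals are disjoint, (icmp1 & icmp2) can never be true.
std::optional<Interval> intersect(Interval A, Interval B) {
  uint64_t Lo = std::max(A.Lo, B.Lo), Hi = std::min(A.Hi, B.Hi);
  if (Lo > Hi)
    return std::nullopt;                       // empty set: fold the AND to false
  return Interval{Lo, Hi};
}

// Example: (V u< 10) & (V u> 20) gives [0,9] and [21,UINT64_MAX], which do
// not intersect, so the conjunction simplifies to false.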
if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_ConstantInt(CI1)),
m_ConstantInt(CI2))))
- return nullptr;
+ return nullptr;
if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Specific(CI1))))
return nullptr;
- Type *ITy = Op0->getType();
-
auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
bool isNSW = AddInst->hasNoSignedWrap();
bool isNUW = AddInst->hasNoUnsignedWrap();
@@ -1558,11 +1563,8 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::And, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::And, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -1620,6 +1622,24 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
}
}
+ // The compares may be hidden behind casts. Look through those and try the
+ // same folds as above.
+ auto *Cast0 = dyn_cast<CastInst>(Op0);
+ auto *Cast1 = dyn_cast<CastInst>(Op1);
+ if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() &&
+ Cast0->getSrcTy() == Cast1->getSrcTy()) {
+ auto *Cmp0 = dyn_cast<ICmpInst>(Cast0->getOperand(0));
+ auto *Cmp1 = dyn_cast<ICmpInst>(Cast1->getOperand(0));
+ if (Cmp0 && Cmp1) {
+ Instruction::CastOps CastOpc = Cast0->getOpcode();
+ Type *ResultType = Cast0->getType();
+ if (auto *V = dyn_cast_or_null<Constant>(SimplifyAndOfICmps(Cmp0, Cmp1)))
+ return ConstantExpr::getCast(CastOpc, V, ResultType);
+ if (auto *V = dyn_cast_or_null<Constant>(SimplifyAndOfICmps(Cmp1, Cmp0)))
+ return ConstantExpr::getCast(CastOpc, V, ResultType);
+ }
+ }
+
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q,
MaxRecurse))
@@ -1717,11 +1737,8 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::Or, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::Or, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -1853,11 +1870,8 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL,
static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
unsigned MaxRecurse) {
if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
- Constant *Ops[] = { CLHS, CRHS };
- return ConstantFoldInstOperands(Instruction::Xor, CLHS->getType(),
- Ops, Q.DL, Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Instruction::Xor, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS.
std::swap(Op0, Op1);
@@ -1957,16 +1971,16 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
// If the C and C++ standards are ever made sufficiently restrictive in this
// area, it may be possible to update LLVM's semantics accordingly and reinstate
// this optimization.
-static Constant *computePointerICmp(const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- CmpInst::Predicate Pred, Value *LHS,
- Value *RHS) {
+static Constant *
+computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI,
+ const DominatorTree *DT, CmpInst::Predicate Pred,
+ const Instruction *CxtI, Value *LHS, Value *RHS) {
// First, skip past any trivial no-ops.
LHS = LHS->stripPointerCasts();
RHS = RHS->stripPointerCasts();
// A non-null pointer is not equal to a null pointer.
- if (llvm::isKnownNonNull(LHS, TLI) && isa<ConstantPointerNull>(RHS) &&
+ if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
@@ -2104,7 +2118,7 @@ static Constant *computePointerICmp(const DataLayout &DL,
return AI->getParent() && AI->getFunction() && AI->isStaticAlloca();
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() ||
- GV->hasProtectedVisibility() || GV->hasUnnamedAddr()) &&
+ GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) &&
!GV->isThreadLocal();
if (const Argument *A = dyn_cast<Argument>(V))
return A->hasByValAttr();
@@ -2116,6 +2130,20 @@ static Constant *computePointerICmp(const DataLayout &DL,
(IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs)))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
+
+ // Fold comparisons for a non-escaping pointer even if the allocation call
+ // cannot be elided. We cannot fold a malloc comparison to null. Also, the
+ // dynamic allocation call could be either of the operands.
+ Value *MI = nullptr;
+ if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonNullAt(RHS, CxtI, DT))
+ MI = LHS;
+ else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonNullAt(LHS, CxtI, DT))
+ MI = RHS;
+ // FIXME: We should also fold the compare when the pointer escapes, but the
+ // compare dominates the pointer escape
+ if (MI && !PointerMayBeCaptured(MI, true, true))
+ return ConstantInt::get(GetCompareTy(LHS),
+ CmpInst::isFalseWhenEqual(Pred));
}
// Otherwise, fail.
@@ -2166,24 +2194,26 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (match(RHS, m_Zero()))
return LHS;
break;
- case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_UGE: {
// X >=u 1 -> X
if (match(RHS, m_One()))
return LHS;
- if (isImpliedCondition(RHS, LHS, Q.DL))
+ if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SGE:
- /// For signed comparison, the values for an i1 are 0 and -1
+ }
+ case ICmpInst::ICMP_SGE: {
+ /// For signed comparison, the values for an i1 are 0 and -1
/// respectively. This maps into a truth table of:
/// LHS | RHS | LHS >=s RHS | LHS implies RHS
/// 0 | 0 | 1 (0 >= 0) | 1
/// 0 | 1 | 1 (0 >= -1) | 1
/// 1 | 0 | 0 (-1 >= 0) | 0
/// 1 | 1 | 1 (-1 >= -1) | 1
- if (isImpliedCondition(LHS, RHS, Q.DL))
+ if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
+ }
case ICmpInst::ICMP_SLT:
// X <s 0 -> X
if (match(RHS, m_Zero()))
@@ -2194,11 +2224,12 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (match(RHS, m_One()))
return LHS;
break;
- case ICmpInst::ICMP_ULE:
- if (isImpliedCondition(LHS, RHS, Q.DL))
+ case ICmpInst::ICMP_ULE: {
+ if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
}
+ }
}
// If we are comparing with zero then try hard since this is a common case.
@@ -2300,7 +2331,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
} else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) {
APInt IntMin = APInt::getSignedMinValue(Width);
APInt IntMax = APInt::getSignedMaxValue(Width);
- APInt Val = CI2->getValue();
+ const APInt &Val = CI2->getValue();
if (Val.isAllOnesValue()) {
// 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
// where CI2 != -1 and CI2 != 0 and CI2 != 1
@@ -2581,7 +2612,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
return Pred == ICmpInst::ICMP_NE ?
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
}
-
+
// Special logic for binary operators.
BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
@@ -2645,21 +2676,48 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
}
- // icmp pred (or X, Y), X
- if (LBO && match(LBO, m_CombineOr(m_Or(m_Value(), m_Specific(RHS)),
- m_Or(m_Specific(RHS), m_Value())))) {
- if (Pred == ICmpInst::ICMP_ULT)
- return getFalse(ITy);
- if (Pred == ICmpInst::ICMP_UGE)
- return getTrue(ITy);
- }
- // icmp pred X, (or X, Y)
- if (RBO && match(RBO, m_CombineOr(m_Or(m_Value(), m_Specific(LHS)),
- m_Or(m_Specific(LHS), m_Value())))) {
- if (Pred == ICmpInst::ICMP_ULE)
- return getTrue(ITy);
- if (Pred == ICmpInst::ICMP_UGT)
- return getFalse(ITy);
+ {
+ Value *Y = nullptr;
+ // icmp pred (or X, Y), X
+ if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) {
+ if (Pred == ICmpInst::ICMP_ULT)
+ return getFalse(ITy);
+ if (Pred == ICmpInst::ICMP_UGE)
+ return getTrue(ITy);
+
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
+ bool RHSKnownNonNegative, RHSKnownNegative;
+ bool YKnownNonNegative, YKnownNegative;
+ ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0,
+ Q.AC, Q.CxtI, Q.DT);
+ ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
+ Q.CxtI, Q.DT);
+ if (RHSKnownNonNegative && YKnownNegative)
+ return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
+ if (RHSKnownNegative || YKnownNonNegative)
+ return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
+ }
+ }
+ // icmp pred X, (or X, Y)
+ if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) {
+ if (Pred == ICmpInst::ICMP_ULE)
+ return getTrue(ITy);
+ if (Pred == ICmpInst::ICMP_UGT)
+ return getFalse(ITy);
+
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
+ bool LHSKnownNonNegative, LHSKnownNegative;
+ bool YKnownNonNegative, YKnownNegative;
+ ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0,
+ Q.AC, Q.CxtI, Q.DT);
+ ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
+ Q.CxtI, Q.DT);
+ if (LHSKnownNonNegative && YKnownNegative)
+ return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
+ if (LHSKnownNegative || YKnownNonNegative)
+ return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
+ }
+ }
}
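The sign-bit reasoning in this block is easy to validate by brute force over a small integer type; this standalone check mirrors the two facts the code relies on, here for i8:

#include <cassert>
#include <cstdint>

int main() {
  for (int r = -128; r < 128; ++r)
    for (int y = -128; y < 128; ++y) {
      int8_t R = (int8_t)r, Y = (int8_t)y;
      int8_t Or = (int8_t)(R | Y);
      // Or-ing in a known-negative Y sets the sign bit, so (R | Y) <s R
      // whenever R is known non-negative.
      if (R >= 0 && Y < 0)
        assert(Or < R);
      // If R is negative or Y is non-negative, setting Y's bits cannot move
      // the result below R, so (R | Y) >=s R.
      if (R < 0 || Y >= 0)
        assert(Or >= R);
    }
  return 0;
}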
// icmp pred (and X, Y), X
@@ -2763,9 +2821,11 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
}
+ // x >> y <=u x
// x udiv y <=u x.
- if (LBO && match(LBO, m_UDiv(m_Specific(RHS), m_Value()))) {
- // icmp pred (X /u Y), X
+ if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) ||
+ match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) {
+ // icmp pred (X op Y), X
if (Pred == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_ULE)
@@ -3030,7 +3090,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// Simplify comparisons of related pointers using a powerful, recursive
// GEP-walk when we have target data available..
if (LHS->getType()->isPointerTy())
- if (Constant *C = computePointerICmp(Q.DL, Q.TLI, Pred, LHS, RHS))
+ if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.CxtI, LHS, RHS))
return C;
if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) {
@@ -3145,7 +3205,14 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
// Handle fcmp with constant RHS
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
+ const ConstantFP *CFP = nullptr;
+ if (const auto *RHSC = dyn_cast<Constant>(RHS)) {
+ if (RHS->getType()->isVectorTy())
+ CFP = dyn_cast_or_null<ConstantFP>(RHSC->getSplatValue());
+ else
+ CFP = dyn_cast<ConstantFP>(RHSC);
+ }
+ if (CFP) {
// If the constant is a nan, see if we can fold the comparison based on it.
if (CFP->getValueAPF().isNaN()) {
if (FCmpInst::isOrdered(Pred)) // True "if ordered and foo"
@@ -3153,7 +3220,7 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
assert(FCmpInst::isUnordered(Pred) &&
"Comparison must be either ordered or unordered!");
// True if unordered.
- return ConstantInt::getTrue(CFP->getContext());
+ return ConstantInt::get(GetCompareTy(LHS), 1);
}
// Check whether the constant is an infinity.
if (CFP->getValueAPF().isInfinity()) {
@@ -3161,10 +3228,10 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
switch (Pred) {
case FCmpInst::FCMP_OLT:
// No value is ordered and less than negative infinity.
- return ConstantInt::getFalse(CFP->getContext());
+ return ConstantInt::get(GetCompareTy(LHS), 0);
case FCmpInst::FCMP_UGE:
// All values are unordered with or at least negative infinity.
- return ConstantInt::getTrue(CFP->getContext());
+ return ConstantInt::get(GetCompareTy(LHS), 1);
default:
break;
}
@@ -3172,10 +3239,10 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
switch (Pred) {
case FCmpInst::FCMP_OGT:
// No value is ordered and greater than infinity.
- return ConstantInt::getFalse(CFP->getContext());
+ return ConstantInt::get(GetCompareTy(LHS), 0);
case FCmpInst::FCMP_ULE:
// All values are unordered with and at most infinity.
- return ConstantInt::getTrue(CFP->getContext());
+ return ConstantInt::get(GetCompareTy(LHS), 1);
default:
break;
}
@@ -3184,13 +3251,13 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (CFP->getValueAPF().isZero()) {
switch (Pred) {
case FCmpInst::FCMP_UGE:
- if (CannotBeOrderedLessThanZero(LHS))
- return ConstantInt::getTrue(CFP->getContext());
+ if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
+ return ConstantInt::get(GetCompareTy(LHS), 1);
break;
case FCmpInst::FCMP_OLT:
// X < 0
- if (CannotBeOrderedLessThanZero(LHS))
- return ConstantInt::getFalse(CFP->getContext());
+ if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
+ return ConstantInt::get(GetCompareTy(LHS), 0);
break;
default:
break;
@@ -3295,10 +3362,9 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
if (LoadInst *LI = dyn_cast<LoadInst>(I))
if (!LI->isVolatile())
- return ConstantFoldLoadFromConstPtr(ConstOps[0], Q.DL);
+ return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL);
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(), ConstOps,
- Q.DL, Q.TLI);
+ return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI);
}
}
@@ -3527,13 +3593,13 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
Ops.slice(1));
}
-Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout &DL,
+Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
+ const DataLayout &DL,
const TargetLibraryInfo *TLI,
const DominatorTree *DT, AssumptionCache *AC,
const Instruction *CxtI) {
- return ::SimplifyGEPInst(
- cast<PointerType>(Ops[0]->getType()->getScalarType())->getElementType(),
- Ops, Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+ return ::SimplifyGEPInst(SrcTy, Ops,
+ Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
}
/// Given operands for an InsertValueInst, see if we can fold the result.
@@ -3675,7 +3741,7 @@ static Value *SimplifyPHINode(PHINode *PN, const Query &Q) {
static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) {
if (Constant *C = dyn_cast<Constant>(Op))
- return ConstantFoldInstOperands(Instruction::Trunc, Ty, C, Q.DL, Q.TLI);
+ return ConstantFoldCastOperand(Instruction::Trunc, C, Ty, Q.DL);
return nullptr;
}
@@ -3730,11 +3796,8 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
default:
if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
- Constant *COps[] = {CLHS, CRHS};
- return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, Q.DL,
- Q.TLI);
- }
+ if (Constant *CRHS = dyn_cast<Constant>(RHS))
+ return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
// If the operation is associative, try some generic simplifications.
if (Instruction::isAssociative(Opcode))
@@ -3825,6 +3888,78 @@ static bool IsIdempotent(Intrinsic::ID ID) {
}
}
+static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset,
+ const DataLayout &DL) {
+ GlobalValue *PtrSym;
+ APInt PtrOffset;
+ if (!IsConstantOffsetFromGlobal(Ptr, PtrSym, PtrOffset, DL))
+ return nullptr;
+
+ Type *Int8PtrTy = Type::getInt8PtrTy(Ptr->getContext());
+ Type *Int32Ty = Type::getInt32Ty(Ptr->getContext());
+ Type *Int32PtrTy = Int32Ty->getPointerTo();
+ Type *Int64Ty = Type::getInt64Ty(Ptr->getContext());
+
+ auto *OffsetConstInt = dyn_cast<ConstantInt>(Offset);
+ if (!OffsetConstInt || OffsetConstInt->getType()->getBitWidth() > 64)
+ return nullptr;
+
+ uint64_t OffsetInt = OffsetConstInt->getSExtValue();
+ if (OffsetInt % 4 != 0)
+ return nullptr;
+
+ Constant *C = ConstantExpr::getGetElementPtr(
+ Int32Ty, ConstantExpr::getBitCast(Ptr, Int32PtrTy),
+ ConstantInt::get(Int64Ty, OffsetInt / 4));
+ Constant *Loaded = ConstantFoldLoadFromConstPtr(C, Int32Ty, DL);
+ if (!Loaded)
+ return nullptr;
+
+ auto *LoadedCE = dyn_cast<ConstantExpr>(Loaded);
+ if (!LoadedCE)
+ return nullptr;
+
+ if (LoadedCE->getOpcode() == Instruction::Trunc) {
+ LoadedCE = dyn_cast<ConstantExpr>(LoadedCE->getOperand(0));
+ if (!LoadedCE)
+ return nullptr;
+ }
+
+ if (LoadedCE->getOpcode() != Instruction::Sub)
+ return nullptr;
+
+ auto *LoadedLHS = dyn_cast<ConstantExpr>(LoadedCE->getOperand(0));
+ if (!LoadedLHS || LoadedLHS->getOpcode() != Instruction::PtrToInt)
+ return nullptr;
+ auto *LoadedLHSPtr = LoadedLHS->getOperand(0);
+
+ Constant *LoadedRHS = LoadedCE->getOperand(1);
+ GlobalValue *LoadedRHSSym;
+ APInt LoadedRHSOffset;
+ if (!IsConstantOffsetFromGlobal(LoadedRHS, LoadedRHSSym, LoadedRHSOffset,
+ DL) ||
+ PtrSym != LoadedRHSSym || PtrOffset != LoadedRHSOffset)
+ return nullptr;
+
+ return ConstantExpr::getBitCast(LoadedLHSPtr, Int8PtrTy);
+}
+
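For reference, this is a run-time model of what llvm.load.relative(Ptr, Offset) computes, and therefore of what SimplifyRelativeLoad reproduces at compile time when the loaded table entry is the constant Target - Ptr:

#include <cstdint>
#include <cstring>

// Read the 32-bit relative entry stored at Ptr + Offset and rebase it on
// Ptr; the intrinsic returns Ptr + *(int32_t *)(Ptr + Offset).
static const char *loadRelative(const char *Ptr, int64_t Offset) {
  int32_t Rel;
  std::memcpy(&Rel, Ptr + Offset, sizeof(Rel));  // the 4-byte table entry
  return Ptr + Rel;                              // entry is relative to Ptr
}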
+static bool maskIsAllZeroOrUndef(Value *Mask) {
+ auto *ConstMask = dyn_cast<Constant>(Mask);
+ if (!ConstMask)
+ return false;
+ if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
+ return true;
+ for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
+ ++I) {
+ if (auto *MaskElt = ConstMask->getAggregateElement(I))
+ if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
+ continue;
+ return false;
+ }
+ return true;
+}
+
template <typename IterTy>
static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
const Query &Q, unsigned MaxRecurse) {
@@ -3865,6 +4000,20 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
if (match(RHS, m_Undef()))
return Constant::getNullValue(ReturnType);
}
+
+ if (IID == Intrinsic::load_relative && isa<Constant>(LHS) &&
+ isa<Constant>(RHS))
+ return SimplifyRelativeLoad(cast<Constant>(LHS), cast<Constant>(RHS),
+ Q.DL);
+ }
+
+ // Simplify calls to llvm.masked.load.*
+ if (IID == Intrinsic::masked_load) {
+ Value *MaskArg = ArgBegin[2];
+ Value *PassthruArg = ArgBegin[3];
+ // If the mask is all zeros or undef, the "passthru" argument is the result.
+ if (maskIsAllZeroOrUndef(MaskArg))
+ return PassthruArg;
}
// Perform idempotent optimizations
@@ -3889,7 +4038,8 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
FunctionType *FTy = cast<FunctionType>(Ty);
// call undef -> undef
- if (isa<UndefValue>(V))
+ // call null -> undef
+ if (isa<UndefValue>(V) || isa<ConstantPointerNull>(V))
return UndefValue::get(FTy->getReturnType());
Function *F = dyn_cast<Function>(V);
@@ -4038,7 +4188,8 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
break;
case Instruction::GetElementPtr: {
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
- Result = SimplifyGEPInst(Ops, DL, TLI, DT, AC, I);
+ Result = SimplifyGEPInst(cast<GetElementPtrInst>(I)->getSourceElementType(),
+ Ops, DL, TLI, DT, AC, I);
break;
}
case Instruction::InsertValue: {
@@ -4092,7 +4243,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
return Result == I ? UndefValue::get(I->getType()) : Result;
}
-/// \brief Implementation of recursive simplification through an instructions
+/// \brief Implementation of recursive simplification through an instruction's
/// uses.
///
/// This is the common implementation of the recursive simplification routines.
diff --git a/lib/Analysis/Interval.cpp b/lib/Analysis/Interval.cpp
index e3e785ffc45f..6c10d73bcb44 100644
--- a/lib/Analysis/Interval.cpp
+++ b/lib/Analysis/Interval.cpp
@@ -42,17 +42,14 @@ void Interval::print(raw_ostream &OS) const {
<< "Interval Contents:\n";
// Print out all of the basic blocks in the interval...
- for (std::vector<BasicBlock*>::const_iterator I = Nodes.begin(),
- E = Nodes.end(); I != E; ++I)
- OS << **I << "\n";
+ for (const BasicBlock *Node : Nodes)
+ OS << *Node << "\n";
OS << "Interval Predecessors:\n";
- for (std::vector<BasicBlock*>::const_iterator I = Predecessors.begin(),
- E = Predecessors.end(); I != E; ++I)
- OS << **I << "\n";
+ for (const BasicBlock *Predecessor : Predecessors)
+ OS << *Predecessor << "\n";
OS << "Interval Successors:\n";
- for (std::vector<BasicBlock*>::const_iterator I = Successors.begin(),
- E = Successors.end(); I != E; ++I)
- OS << **I << "\n";
+ for (const BasicBlock *Successor : Successors)
+ OS << *Successor << "\n";
}
diff --git a/lib/Analysis/IntervalPartition.cpp b/lib/Analysis/IntervalPartition.cpp
index a0583e86d185..a4e56e0694bc 100644
--- a/lib/Analysis/IntervalPartition.cpp
+++ b/lib/Analysis/IntervalPartition.cpp
@@ -57,9 +57,8 @@ void IntervalPartition::addIntervalToPartition(Interval *I) {
//
void IntervalPartition::updatePredecessors(Interval *Int) {
BasicBlock *Header = Int->getHeaderNode();
- for (Interval::succ_iterator I = Int->Successors.begin(),
- E = Int->Successors.end(); I != E; ++I)
- getBlockInterval(*I)->Predecessors.push_back(Header);
+ for (BasicBlock *Successor : Int->Successors)
+ getBlockInterval(Successor)->Predecessors.push_back(Header);
}
// IntervalPartition ctor - Build the first level interval partition for the
diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp
index 9f1edd21820f..3ab6b5d60905 100644
--- a/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -16,9 +16,10 @@
#include "llvm/IR/Dominators.h"
#include <queue>
-using namespace llvm;
-
-void IDFCalculator::calculate(SmallVectorImpl<BasicBlock *> &PHIBlocks) {
+namespace llvm {
+template <class NodeTy>
+void IDFCalculator<NodeTy>::calculate(
+ SmallVectorImpl<BasicBlock *> &PHIBlocks) {
// If we haven't computed dominator tree levels, do so now.
if (DomLevels.empty()) {
for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
@@ -61,8 +62,12 @@ void IDFCalculator::calculate(SmallVectorImpl<BasicBlock *> &PHIBlocks) {
while (!Worklist.empty()) {
DomTreeNode *Node = Worklist.pop_back_val();
BasicBlock *BB = Node->getBlock();
-
- for (auto Succ : successors(BB)) {
+ // Succ is the successor in the direction we are calculating the IDF, so it
+ // is a successor for IDF and a predecessor for Reverse IDF.
+ for (auto SuccIter = GraphTraits<NodeTy>::child_begin(BB),
+ End = GraphTraits<NodeTy>::child_end(BB);
+ SuccIter != End; ++SuccIter) {
+ BasicBlock *Succ = *SuccIter;
DomTreeNode *SuccNode = DT.getNode(Succ);
// Quickly skip all CFG edges that are also dominator tree edges instead
@@ -93,3 +98,7 @@ void IDFCalculator::calculate(SmallVectorImpl<BasicBlock *> &PHIBlocks) {
}
}
}
+
+template class IDFCalculator<BasicBlock *>;
+template class IDFCalculator<Inverse<BasicBlock *>>;
+}
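Templating IDFCalculator over the graph direction and then explicitly instantiating it keeps the definition of calculate() out of the header while still providing the two specializations clients need. A minimal illustration of that pattern, with placeholder names:

// In the header: only the declaration of the member is visible.
template <class GraphT> struct FrontierCalculator {
  void calculate();
};

// In the .cpp: the definition, followed by the explicit instantiations that
// are the only specializations callers may link against.
template <class GraphT> void FrontierCalculator<GraphT>::calculate() {
  // ... direction-specific traversal would go here ...
}

template struct FrontierCalculator<int>;     // stand-ins for BasicBlock * and
template struct FrontierCalculator<float>;   // Inverse<BasicBlock *>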
diff --git a/lib/Analysis/LLVMBuild.txt b/lib/Analysis/LLVMBuild.txt
index bddf1a3ac201..08af5f37700d 100644
--- a/lib/Analysis/LLVMBuild.txt
+++ b/lib/Analysis/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = Analysis
parent = Libraries
-required_libraries = Core Support
+required_libraries = Core Support ProfileData
diff --git a/lib/Analysis/LazyBlockFrequencyInfo.cpp b/lib/Analysis/LazyBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..7debfde87d2a
--- /dev/null
+++ b/lib/Analysis/LazyBlockFrequencyInfo.cpp
@@ -0,0 +1,68 @@
+//===- LazyBlockFrequencyInfo.cpp - Lazy Block Frequency Analysis ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an alternative analysis pass to BlockFrequencyInfoWrapperPass. The
+// difference is that with this pass the block frequencies are not computed when
+// the analysis pass is executed but rather when the BFI result is explicitly
+// requested by the analysis client.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lazy-block-freq"
+
+INITIALIZE_PASS_BEGIN(LazyBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LazyBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Block Frequency Analysis", true, true)
+
+char LazyBlockFrequencyInfoPass::ID = 0;
+
+LazyBlockFrequencyInfoPass::LazyBlockFrequencyInfoPass() : FunctionPass(ID) {
+ initializeLazyBlockFrequencyInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+void LazyBlockFrequencyInfoPass::print(raw_ostream &OS, const Module *) const {
+ LBFI.getCalculated().print(OS);
+}
+
+void LazyBlockFrequencyInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesAll();
+}
+
+void LazyBlockFrequencyInfoPass::releaseMemory() { LBFI.releaseMemory(); }
+
+bool LazyBlockFrequencyInfoPass::runOnFunction(Function &F) {
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LBFI.setAnalysis(&F, &BPI, &LI);
+ return false;
+}
+
+void LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AnalysisUsage &AU) {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<LazyBlockFrequencyInfoPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+}
+
+void llvm::initializeLazyBFIPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass);
+ INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass);
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
+}
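The pass itself only wires up its inputs in runOnFunction; the expensive BFI computation is deferred until a client calls getCalculated(). A generic sketch of that compute-on-first-use wrapper (not the actual LazyBlockFrequencyInfo interface) looks like:

#include <functional>
#include <optional>
#include <utility>

template <typename Result> class LazyAnalysis {
  std::function<Result()> Compute;   // captures the recorded inputs
  std::optional<Result> Cached;

public:
  void setAnalysis(std::function<Result()> Fn) {
    Compute = std::move(Fn);
    Cached.reset();                  // new inputs invalidate any old result
  }
  Result &getCalculated() {
    if (!Cached)
      Cached = Compute();            // pay the cost only when first asked
    return *Cached;
  }
  void releaseMemory() { Cached.reset(); }
};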
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index 0f0f31e62ac7..acff8529b151 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -14,36 +14,41 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/GraphWriter.h"
using namespace llvm;
#define DEBUG_TYPE "lcg"
-static void findCallees(
- SmallVectorImpl<Constant *> &Worklist, SmallPtrSetImpl<Constant *> &Visited,
- SmallVectorImpl<PointerUnion<Function *, LazyCallGraph::Node *>> &Callees,
- DenseMap<Function *, size_t> &CalleeIndexMap) {
+static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
+ DenseMap<Function *, int> &EdgeIndexMap, Function &F,
+ LazyCallGraph::Edge::Kind EK) {
+ // Note that we consider *any* function with a definition to be a viable
+ // edge. Even if the function's definition is subject to replacement by
+ // some other module (say, a weak definition) there may still be
+ // optimizations which essentially speculate based on the definition and
+ // a way to check that the specific definition is in fact the one being
+ // used. For example, this could be done by moving the weak definition to
+ // a strong (internal) definition and making the weak definition be an
+ // alias. Then a test of the address of the weak function against the new
+ // strong definition's address would be an effective way to determine the
+ // safety of optimizing a direct call edge.
+ if (!F.isDeclaration() &&
+ EdgeIndexMap.insert({&F, Edges.size()}).second) {
+ DEBUG(dbgs() << " Added callable function: " << F.getName() << "\n");
+ Edges.emplace_back(LazyCallGraph::Edge(F, EK));
+ }
+}
+
+static void findReferences(SmallVectorImpl<Constant *> &Worklist,
+ SmallPtrSetImpl<Constant *> &Visited,
+ SmallVectorImpl<LazyCallGraph::Edge> &Edges,
+ DenseMap<Function *, int> &EdgeIndexMap) {
while (!Worklist.empty()) {
Constant *C = Worklist.pop_back_val();
if (Function *F = dyn_cast<Function>(C)) {
- // Note that we consider *any* function with a definition to be a viable
- // edge. Even if the function's definition is subject to replacement by
- // some other module (say, a weak definition) there may still be
- // optimizations which essentially speculate based on the definition and
- // a way to check that the specific definition is in fact the one being
- // used. For example, this could be done by moving the weak definition to
- // a strong (internal) definition and making the weak definition be an
- // alias. Then a test of the address of the weak function against the new
- // strong definition's address would be an effective way to determine the
- // safety of optimizing a direct call edge.
- if (!F->isDeclaration() &&
- CalleeIndexMap.insert(std::make_pair(F, Callees.size())).second) {
- DEBUG(dbgs() << " Added callable function: " << F->getName()
- << "\n");
- Callees.push_back(F);
- }
+ addEdge(Edges, EdgeIndexMap, *F, LazyCallGraph::Edge::Ref);
continue;
}
@@ -59,42 +64,63 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
<< "' to the graph.\n");
SmallVector<Constant *, 16> Worklist;
+ SmallPtrSet<Function *, 4> Callees;
SmallPtrSet<Constant *, 16> Visited;
- // Find all the potential callees in this function. First walk the
- // instructions and add every operand which is a constant to the worklist.
+
+ // Find all the potential call graph edges in this function. We track both
+ // actual call edges and indirect references to functions. The direct calls
+ // are trivially added, but to accumulate the latter we walk the instructions
+ // and add every operand which is a constant to the worklist to process
+ // afterward.
for (BasicBlock &BB : F)
- for (Instruction &I : BB)
+ for (Instruction &I : BB) {
+ if (auto CS = CallSite(&I))
+ if (Function *Callee = CS.getCalledFunction())
+ if (Callees.insert(Callee).second) {
+ Visited.insert(Callee);
+ addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
+ }
+
for (Value *Op : I.operand_values())
if (Constant *C = dyn_cast<Constant>(Op))
if (Visited.insert(C).second)
Worklist.push_back(C);
+ }
// We've collected all the constant (and thus potentially function or
// function containing) operands to all of the instructions in the function.
// Process them (recursively) collecting every function found.
- findCallees(Worklist, Visited, Callees, CalleeIndexMap);
+ findReferences(Worklist, Visited, Edges, EdgeIndexMap);
}
-void LazyCallGraph::Node::insertEdgeInternal(Function &Callee) {
- if (Node *N = G->lookup(Callee))
- return insertEdgeInternal(*N);
+void LazyCallGraph::Node::insertEdgeInternal(Function &Target, Edge::Kind EK) {
+ if (Node *N = G->lookup(Target))
+ return insertEdgeInternal(*N, EK);
+
+ EdgeIndexMap.insert({&Target, Edges.size()});
+ Edges.emplace_back(Target, EK);
+}
- CalleeIndexMap.insert(std::make_pair(&Callee, Callees.size()));
- Callees.push_back(&Callee);
+void LazyCallGraph::Node::insertEdgeInternal(Node &TargetN, Edge::Kind EK) {
+ EdgeIndexMap.insert({&TargetN.getFunction(), Edges.size()});
+ Edges.emplace_back(TargetN, EK);
}
-void LazyCallGraph::Node::insertEdgeInternal(Node &CalleeN) {
- CalleeIndexMap.insert(std::make_pair(&CalleeN.getFunction(), Callees.size()));
- Callees.push_back(&CalleeN);
+void LazyCallGraph::Node::setEdgeKind(Function &TargetF, Edge::Kind EK) {
+ Edges[EdgeIndexMap.find(&TargetF)->second].setKind(EK);
}
-void LazyCallGraph::Node::removeEdgeInternal(Function &Callee) {
- auto IndexMapI = CalleeIndexMap.find(&Callee);
- assert(IndexMapI != CalleeIndexMap.end() &&
- "Callee not in the callee set for this caller?");
+void LazyCallGraph::Node::removeEdgeInternal(Function &Target) {
+ auto IndexMapI = EdgeIndexMap.find(&Target);
+ assert(IndexMapI != EdgeIndexMap.end() &&
+ "Target not in the edge set for this caller?");
- Callees[IndexMapI->second] = nullptr;
- CalleeIndexMap.erase(IndexMapI);
+ Edges[IndexMapI->second] = Edge();
+ EdgeIndexMap.erase(IndexMapI);
+}
+
+void LazyCallGraph::Node::dump() const {
+ dbgs() << *this << '\n';
}
LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
@@ -102,10 +128,10 @@ LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
<< "\n");
for (Function &F : M)
if (!F.isDeclaration() && !F.hasLocalLinkage())
- if (EntryIndexMap.insert(std::make_pair(&F, EntryNodes.size())).second) {
+ if (EntryIndexMap.insert({&F, EntryEdges.size()}).second) {
DEBUG(dbgs() << " Adding '" << F.getName()
<< "' to entry set of the graph.\n");
- EntryNodes.push_back(&F);
+ EntryEdges.emplace_back(F, Edge::Ref);
}
// Now add entry nodes for functions reachable via initializers to globals.
@@ -118,25 +144,19 @@ LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
DEBUG(dbgs() << " Adding functions referenced by global initializers to the "
"entry set.\n");
- findCallees(Worklist, Visited, EntryNodes, EntryIndexMap);
+ findReferences(Worklist, Visited, EntryEdges, EntryIndexMap);
- for (auto &Entry : EntryNodes) {
- assert(!Entry.isNull() &&
- "We can't have removed edges before we finish the constructor!");
- if (Function *F = Entry.dyn_cast<Function *>())
- SCCEntryNodes.push_back(F);
- else
- SCCEntryNodes.push_back(&Entry.get<Node *>()->getFunction());
- }
+ for (const Edge &E : EntryEdges)
+ RefSCCEntryNodes.push_back(&E.getFunction());
}
LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
: BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)),
- EntryNodes(std::move(G.EntryNodes)),
+ EntryEdges(std::move(G.EntryEdges)),
EntryIndexMap(std::move(G.EntryIndexMap)), SCCBPA(std::move(G.SCCBPA)),
- SCCMap(std::move(G.SCCMap)), LeafSCCs(std::move(G.LeafSCCs)),
+ SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)),
DFSStack(std::move(G.DFSStack)),
- SCCEntryNodes(std::move(G.SCCEntryNodes)),
+ RefSCCEntryNodes(std::move(G.RefSCCEntryNodes)),
NextDFSNumber(G.NextDFSNumber) {
updateGraphPtrs();
}
@@ -144,402 +164,1080 @@ LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
BPA = std::move(G.BPA);
NodeMap = std::move(G.NodeMap);
- EntryNodes = std::move(G.EntryNodes);
+ EntryEdges = std::move(G.EntryEdges);
EntryIndexMap = std::move(G.EntryIndexMap);
SCCBPA = std::move(G.SCCBPA);
SCCMap = std::move(G.SCCMap);
- LeafSCCs = std::move(G.LeafSCCs);
+ LeafRefSCCs = std::move(G.LeafRefSCCs);
DFSStack = std::move(G.DFSStack);
- SCCEntryNodes = std::move(G.SCCEntryNodes);
+ RefSCCEntryNodes = std::move(G.RefSCCEntryNodes);
NextDFSNumber = G.NextDFSNumber;
updateGraphPtrs();
return *this;
}
-void LazyCallGraph::SCC::insert(Node &N) {
- N.DFSNumber = N.LowLink = -1;
- Nodes.push_back(&N);
- G->SCCMap[&N] = this;
+void LazyCallGraph::SCC::dump() const {
+ dbgs() << *this << '\n';
}
-bool LazyCallGraph::SCC::isDescendantOf(const SCC &C) const {
+#ifndef NDEBUG
+void LazyCallGraph::SCC::verify() {
+ assert(OuterRefSCC && "Can't have a null RefSCC!");
+ assert(!Nodes.empty() && "Can't have an empty SCC!");
+
+ for (Node *N : Nodes) {
+ assert(N && "Can't have a null node!");
+ assert(OuterRefSCC->G->lookupSCC(*N) == this &&
+ "Node does not map to this SCC!");
+ assert(N->DFSNumber == -1 &&
+ "Must set DFS numbers to -1 when adding a node to an SCC!");
+ assert(N->LowLink == -1 &&
+ "Must set low link to -1 when adding a node to an SCC!");
+ for (Edge &E : *N)
+ assert(E.getNode() && "Can't have an edge to a raw function!");
+ }
+}
+#endif
+
+LazyCallGraph::RefSCC::RefSCC(LazyCallGraph &G) : G(&G) {}
+
+void LazyCallGraph::RefSCC::dump() const {
+ dbgs() << *this << '\n';
+}
+
+#ifndef NDEBUG
+void LazyCallGraph::RefSCC::verify() {
+ assert(G && "Can't have a null graph!");
+ assert(!SCCs.empty() && "Can't have an empty RefSCC!");
+
+ // Verify basic properties of the SCCs.
+ for (SCC *C : SCCs) {
+ assert(C && "Can't have a null SCC!");
+ C->verify();
+ assert(&C->getOuterRefSCC() == this &&
+ "SCC doesn't think it is inside this RefSCC!");
+ }
+
+ // Check that our indices map correctly.
+ for (auto &SCCIndexPair : SCCIndices) {
+ SCC *C = SCCIndexPair.first;
+ int i = SCCIndexPair.second;
+ assert(C && "Can't have a null SCC in the indices!");
+ assert(SCCs[i] == C && "Index doesn't point to SCC!");
+ }
+
+ // Check that the SCCs are in fact in post-order.
+ for (int i = 0, Size = SCCs.size(); i < Size; ++i) {
+ SCC &SourceSCC = *SCCs[i];
+ for (Node &N : SourceSCC)
+ for (Edge &E : N) {
+ if (!E.isCall())
+ continue;
+ SCC &TargetSCC = *G->lookupSCC(*E.getNode());
+ if (&TargetSCC.getOuterRefSCC() == this) {
+ assert(SCCIndices.find(&TargetSCC)->second <= i &&
+ "Edge between SCCs violates post-order relationship.");
+ continue;
+ }
+ assert(TargetSCC.getOuterRefSCC().Parents.count(this) &&
+ "Edge to a RefSCC missing us in its parent set.");
+ }
+ }
+}
+#endif
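The post-order loop above encodes the central invariant of the SCC list inside a RefSCC: a call edge that stays within the RefSCC may only target an SCC at the same or an earlier index. A trivial self-contained restatement of that invariant, with plain indices standing in for SCCs (ref-only edges are exempt, just as in the verify routine):

#include <cassert>
#include <utility>
#include <vector>

int main() {
  // Three SCCs inside one RefSCC, stored in postorder (callees first). Call
  // edges are (source index, target index) pairs.
  std::vector<std::pair<int, int>> CallEdges = {{1, 0}, {2, 1}, {2, 0}};
  for (const auto &E : CallEdges)
    assert(E.second <= E.first &&
           "A call edge inside a RefSCC must target an SCC at the same or an "
           "earlier postorder index.");
  return 0;
}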
+
+bool LazyCallGraph::RefSCC::isDescendantOf(const RefSCC &C) const {
// Walk up the parents of this SCC and verify that we eventually find C.
- SmallVector<const SCC *, 4> AncestorWorklist;
+ SmallVector<const RefSCC *, 4> AncestorWorklist;
AncestorWorklist.push_back(this);
do {
- const SCC *AncestorC = AncestorWorklist.pop_back_val();
+ const RefSCC *AncestorC = AncestorWorklist.pop_back_val();
if (AncestorC->isChildOf(C))
return true;
- for (const SCC *ParentC : AncestorC->ParentSCCs)
+ for (const RefSCC *ParentC : AncestorC->Parents)
AncestorWorklist.push_back(ParentC);
} while (!AncestorWorklist.empty());
return false;
}
-void LazyCallGraph::SCC::insertIntraSCCEdge(Node &CallerN, Node &CalleeN) {
- // First insert it into the caller.
- CallerN.insertEdgeInternal(CalleeN);
+SmallVector<LazyCallGraph::SCC *, 1>
+LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
+ assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+
+ SmallVector<SCC *, 1> DeletedSCCs;
+
+ SCC &SourceSCC = *G->lookupSCC(SourceN);
+ SCC &TargetSCC = *G->lookupSCC(TargetN);
+
+ // If the two nodes are already part of the same SCC, we're also done as
+ // we've just added more connectivity.
+ if (&SourceSCC == &TargetSCC) {
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+ return DeletedSCCs;
+ }
+
+ // At this point we leverage the postorder list of SCCs to detect when the
+ // insertion of an edge changes the SCC structure in any way.
+ //
+ // First and foremost, we can eliminate the need for any changes when the
+ // edge is toward the beginning of the postorder sequence because all edges
+ // flow in that direction already. Thus adding a new one cannot form a cycle.
+ int SourceIdx = SCCIndices[&SourceSCC];
+ int TargetIdx = SCCIndices[&TargetSCC];
+ if (TargetIdx < SourceIdx) {
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+ return DeletedSCCs;
+ }
+
+ // When we do have an edge from an earlier SCC to a later SCC in the
+ // postorder sequence, all of the SCCs which may be impacted are in the
+ // closed range of those two within the postorder sequence. The algorithm to
+ // restore the state is as follows:
+ //
+ // 1) Starting from the source SCC, construct a set of SCCs which reach the
+ // source SCC consisting of just the source SCC. Then scan toward the
+ // target SCC in postorder and for each SCC, if it has an edge to an SCC
+ // in the set, add it to the set. Otherwise, the source SCC is not
+ // a successor, move it in the postorder sequence to immediately before
+ // the source SCC, shifting the source SCC and all SCCs in the set one
+ // position toward the target SCC. Stop scanning after processing the
+ // target SCC.
+ // 2) If the source SCC is now past the target SCC in the postorder sequence,
+ // and thus the new edge will flow toward the start, we are done.
+ // 3) Otherwise, starting from the target SCC, walk all edges which reach an
+ // SCC between the source and the target, and add them to the set of
+ // connected SCCs, then recurse through them. Once a complete set of the
+ // SCCs the target connects to is known, hoist the remaining SCCs between
+ // the source and the target to be above the target. Note that there is no
+ // need to process the source SCC, it is already known to connect.
+ // 4) At this point, all of the SCCs in the closed range between the source
+ // SCC and the target SCC in the postorder sequence are connected,
+ // including the target SCC and the source SCC. Inserting the edge from
+ // the source SCC to the target SCC will form a cycle out of precisely
+ // these SCCs. Thus we can merge all of the SCCs in this closed range into
+ // a single SCC.
+ //
+ // This process has various important properties:
+ // - Only mutates the SCCs when adding the edge actually changes the SCC
+ // structure.
+ // - Never mutates SCCs which are unaffected by the change.
+ // - Updates the postorder sequence to correctly satisfy the postorder
+ // constraint after the edge is inserted.
+ // - Only reorders SCCs in the closed postorder sequence from the source to
+ // the target, so easy to bound how much has changed even in the ordering.
+ // - Big-O is the number of edges in the closed postorder range of SCCs from
+ // source to target.
+
+ assert(SourceIdx < TargetIdx && "Cannot have equal indices here!");
+ SmallPtrSet<SCC *, 4> ConnectedSet;
+
+ // Compute the SCCs which (transitively) reach the source.
+ ConnectedSet.insert(&SourceSCC);
+ auto IsConnected = [&](SCC &C) {
+ for (Node &N : C)
+ for (Edge &E : N.calls()) {
+ assert(E.getNode() && "Must have formed a node within an SCC!");
+ if (ConnectedSet.count(G->lookupSCC(*E.getNode())))
+ return true;
+ }
+
+ return false;
+ };
+
+ for (SCC *C :
+ make_range(SCCs.begin() + SourceIdx + 1, SCCs.begin() + TargetIdx + 1))
+ if (IsConnected(*C))
+ ConnectedSet.insert(C);
+
+ // Partition the SCCs in this part of the postorder sequence so only SCCs
+ // connecting to the source remain between it and the target. This is
+ // a benign partition as it preserves postorder.
+ auto SourceI = std::stable_partition(
+ SCCs.begin() + SourceIdx, SCCs.begin() + TargetIdx + 1,
+ [&ConnectedSet](SCC *C) { return !ConnectedSet.count(C); });
+ for (int i = SourceIdx, e = TargetIdx + 1; i < e; ++i)
+ SCCIndices.find(SCCs[i])->second = i;
+
+ // If the target doesn't connect to the source, then we've corrected the
+ // post-order and there are no cycles formed.
+ if (!ConnectedSet.count(&TargetSCC)) {
+ assert(SourceI > (SCCs.begin() + SourceIdx) &&
+ "Must have moved the source to fix the post-order.");
+ assert(*std::prev(SourceI) == &TargetSCC &&
+ "Last SCC to move should have bene the target.");
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+#ifndef NDEBUG
+ verify();
+#endif
+ return DeletedSCCs;
+ }
+
+ assert(SCCs[TargetIdx] == &TargetSCC &&
+ "Should not have moved target if connected!");
+ SourceIdx = SourceI - SCCs.begin();
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+
+ // See whether there are any remaining intervening SCCs between the source
+ // and target. If so, we need to make sure they all are reachable from the
+ // target.
+ if (SourceIdx + 1 < TargetIdx) {
+ // Use a normal worklist to find which SCCs the target connects to. We still
+ // bound the search based on the range in the postorder list we care about,
+ // but because this is forward connectivity we just "recurse" through the
+ // edges.
+ ConnectedSet.clear();
+ ConnectedSet.insert(&TargetSCC);
+ SmallVector<SCC *, 4> Worklist;
+ Worklist.push_back(&TargetSCC);
+ do {
+ SCC &C = *Worklist.pop_back_val();
+ for (Node &N : C)
+ for (Edge &E : N) {
+ assert(E.getNode() && "Must have formed a node within an SCC!");
+ if (!E.isCall())
+ continue;
+ SCC &EdgeC = *G->lookupSCC(*E.getNode());
+ if (&EdgeC.getOuterRefSCC() != this)
+ // Not in this RefSCC...
+ continue;
+ if (SCCIndices.find(&EdgeC)->second <= SourceIdx)
+ // Not in the postorder sequence between source and target.
+ continue;
+
+ if (ConnectedSet.insert(&EdgeC).second)
+ Worklist.push_back(&EdgeC);
+ }
+ } while (!Worklist.empty());
+
+ // Partition SCCs so that only SCCs reached from the target remain between
+ // the source and the target. This preserves postorder.
+ auto TargetI = std::stable_partition(
+ SCCs.begin() + SourceIdx + 1, SCCs.begin() + TargetIdx + 1,
+ [&ConnectedSet](SCC *C) { return ConnectedSet.count(C); });
+ for (int i = SourceIdx + 1, e = TargetIdx + 1; i < e; ++i)
+ SCCIndices.find(SCCs[i])->second = i;
+ TargetIdx = std::prev(TargetI) - SCCs.begin();
+ assert(SCCs[TargetIdx] == &TargetSCC &&
+ "Should always end with the target!");
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+ }
+
+ // At this point, we know that connecting source to target forms a cycle
+ // because target connects back to source, and we know that all of the SCCs
+ // between the source and target in the postorder sequence participate in that
+ // cycle. This means that we need to merge all of these SCCs into a single
+ // result SCC.
+ //
+ // NB: We merge into the target because all of these functions were already
+ // reachable from the target, meaning any SCC-wide properties deduced about it
+ // other than the set of functions within it will not have changed.
+ auto MergeRange =
+ make_range(SCCs.begin() + SourceIdx, SCCs.begin() + TargetIdx);
+ for (SCC *C : MergeRange) {
+ assert(C != &TargetSCC &&
+ "We merge *into* the target and shouldn't process it here!");
+ SCCIndices.erase(C);
+ TargetSCC.Nodes.append(C->Nodes.begin(), C->Nodes.end());
+ for (Node *N : C->Nodes)
+ G->SCCMap[N] = &TargetSCC;
+ C->clear();
+ DeletedSCCs.push_back(C);
+ }
+
+ // Erase the merged SCCs from the list and update the indices of the
+ // remaining SCCs.
+ int IndexOffset = MergeRange.end() - MergeRange.begin();
+ auto EraseEnd = SCCs.erase(MergeRange.begin(), MergeRange.end());
+ for (SCC *C : make_range(EraseEnd, SCCs.end()))
+ SCCIndices[C] -= IndexOffset;
- assert(G->SCCMap.lookup(&CallerN) == this && "Caller must be in this SCC.");
- assert(G->SCCMap.lookup(&CalleeN) == this && "Callee must be in this SCC.");
+ // Now that the SCC structure is finalized, flip the kind to call.
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
- // Nothing changes about this SCC or any other.
+#ifndef NDEBUG
+ // And we're done! Verify in debug builds that the RefSCC is coherent.
+ verify();
+#endif
+ return DeletedSCCs;
+}
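The two partition steps described in the comments above rely on std::stable_partition keeping the relative order inside both groups, which is what lets the SCC sequence remain a valid postorder after the shuffle. A small self-contained illustration, with integers standing in for SCC pointers and a set standing in for the connected set (nothing here is LLVM-specific):

#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Five SCCs (identified by arbitrary labels) in their current postorder.
  std::vector<int> SCCs = {10, 11, 12, 13, 14};
  // The ones known to (transitively) reach the source SCC.
  std::set<int> Connected = {10, 12, 14};

  // Move every non-connected SCC in front of the connected ones while keeping
  // each group's internal order, the same shape as the partition above.
  std::stable_partition(SCCs.begin(), SCCs.end(),
                        [&](int L) { return !Connected.count(L); });

  for (int L : SCCs)
    std::printf("%d ", L); // prints: 11 13 10 12 14
  std::printf("\n");
  return 0;
}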
+
+void LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN,
+ Node &TargetN) {
+ assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+
+ SCC &SourceSCC = *G->lookupSCC(SourceN);
+ SCC &TargetSCC = *G->lookupSCC(TargetN);
+
+ assert(&SourceSCC.getOuterRefSCC() == this &&
+ "Source must be in this RefSCC.");
+ assert(&TargetSCC.getOuterRefSCC() == this &&
+ "Target must be in this RefSCC.");
+
+ // Set the edge kind.
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+
+ // If this call edge is just connecting two separate SCCs within this RefSCC,
+ // there is nothing to do.
+ if (&SourceSCC != &TargetSCC) {
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+ return;
+ }
+
+ // Otherwise we are removing a call edge from a single SCC. This may break
+ // the cycle. In order to compute the new set of SCCs, we need to do a small
+ // DFS over the nodes within the SCC to form any sub-cycles that remain as
+ // distinct SCCs and compute a postorder over the resulting SCCs.
+ //
+ // However, we specially handle the target node. The target node is known to
+ // reach all other nodes in the original SCC by definition. This means that
+ // we want the old SCC to be replaced with an SCC containing that node as it
+ // will be the root of whatever SCC DAG results from the DFS. Assumptions
+ // about an SCC such as the set of functions called will continue to hold,
+ // etc.
+
+ SCC &OldSCC = TargetSCC;
+ SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<Node *, 16> PendingSCCStack;
+ SmallVector<SCC *, 4> NewSCCs;
+
+ // Prepare the nodes for a fresh DFS.
+ SmallVector<Node *, 16> Worklist;
+ Worklist.swap(OldSCC.Nodes);
+ for (Node *N : Worklist) {
+ N->DFSNumber = N->LowLink = 0;
+ G->SCCMap.erase(N);
+ }
+
+ // Force the target node to be in the old SCC. This also enables us to take
+ // a very significant short-cut in the standard Tarjan walk to re-form SCCs
+ // below: whenever we build an edge that reaches the target node, we know
+ // that the target node eventually connects back to all other nodes in our
+ // walk. As a consequence, we can detect and handle participants in that
+ // cycle without walking all the edges that form this connection, and instead
+ // by relying on the fundamental guarantee coming into this operation (all
+ // nodes are reachable from the target due to previously forming an SCC).
+ TargetN.DFSNumber = TargetN.LowLink = -1;
+ OldSCC.Nodes.push_back(&TargetN);
+ G->SCCMap[&TargetN] = &OldSCC;
+
+ // Scan down the stack and DFS across the call edges.
+ for (Node *RootN : Worklist) {
+ assert(DFSStack.empty() &&
+ "Cannot begin a new root with a non-empty DFS stack!");
+ assert(PendingSCCStack.empty() &&
+ "Cannot begin a new root with pending nodes for an SCC!");
+
+ // Skip any nodes we've already reached in the DFS.
+ if (RootN->DFSNumber != 0) {
+ assert(RootN->DFSNumber == -1 &&
+ "Shouldn't have any mid-DFS root nodes!");
+ continue;
+ }
+
+ RootN->DFSNumber = RootN->LowLink = 1;
+ int NextDFSNumber = 2;
+
+ DFSStack.push_back({RootN, RootN->call_begin()});
+ do {
+ Node *N;
+ call_edge_iterator I;
+ std::tie(N, I) = DFSStack.pop_back_val();
+ auto E = N->call_end();
+ while (I != E) {
+ Node &ChildN = *I->getNode();
+ if (ChildN.DFSNumber == 0) {
+ // We haven't yet visited this child, so descend, pushing the current
+ // node onto the stack.
+ DFSStack.push_back({N, I});
+
+ assert(!G->SCCMap.count(&ChildN) &&
+ "Found a node with 0 DFS number but already in an SCC!");
+ ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
+ N = &ChildN;
+ I = N->call_begin();
+ E = N->call_end();
+ continue;
+ }
+
+ // Check for the child already being part of some component.
+ if (ChildN.DFSNumber == -1) {
+ if (G->lookupSCC(ChildN) == &OldSCC) {
+ // If the child is part of the old SCC, we know that it can reach
+ // every other node, so we have formed a cycle. Pull the entire DFS
+ // and pending stacks into it. See the comment above about setting
+ // up the old SCC for why we do this.
+ int OldSize = OldSCC.size();
+ OldSCC.Nodes.push_back(N);
+ OldSCC.Nodes.append(PendingSCCStack.begin(), PendingSCCStack.end());
+ PendingSCCStack.clear();
+ while (!DFSStack.empty())
+ OldSCC.Nodes.push_back(DFSStack.pop_back_val().first);
+ for (Node &N : make_range(OldSCC.begin() + OldSize, OldSCC.end())) {
+ N.DFSNumber = N.LowLink = -1;
+ G->SCCMap[&N] = &OldSCC;
+ }
+ N = nullptr;
+ break;
+ }
+
+ // If the child has already been added to some child component, it
+ // couldn't impact the low-link of this parent because it isn't
+ // connected, and thus its low-link isn't relevant so skip it.
+ ++I;
+ continue;
+ }
+
+ // Track the lowest linked child as the lowest link for this node.
+ assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
+ if (ChildN.LowLink < N->LowLink)
+ N->LowLink = ChildN.LowLink;
+
+ // Move to the next edge.
+ ++I;
+ }
+ if (!N)
+ // Cleared the DFS early, start another round.
+ break;
+
+ // We've finished processing N and its descendants, so put it on our pending
+ // SCC stack to eventually get merged into an SCC of nodes.
+ PendingSCCStack.push_back(N);
+
+ // If this node is linked to some lower entry, continue walking up the
+ // stack.
+ if (N->LowLink != N->DFSNumber)
+ continue;
+
+ // Otherwise, we've completed an SCC. Append it to our post order list of
+ // SCCs.
+ int RootDFSNumber = N->DFSNumber;
+ // Find the range of the node stack by walking down until we pass the
+ // root DFS number.
+ auto SCCNodes = make_range(
+ PendingSCCStack.rbegin(),
+ std::find_if(PendingSCCStack.rbegin(), PendingSCCStack.rend(),
+ [RootDFSNumber](Node *N) {
+ return N->DFSNumber < RootDFSNumber;
+ }));
+
+ // Form a new SCC out of these nodes and then clear them off our pending
+ // stack.
+ NewSCCs.push_back(G->createSCC(*this, SCCNodes));
+ for (Node &N : *NewSCCs.back()) {
+ N.DFSNumber = N.LowLink = -1;
+ G->SCCMap[&N] = NewSCCs.back();
+ }
+ PendingSCCStack.erase(SCCNodes.end().base(), PendingSCCStack.end());
+ } while (!DFSStack.empty());
+ }
+
+ // Insert the remaining SCCs before the old one. The old SCC can reach all
+ // other SCCs we form because it contains the target node of the removed
+ // edge. This means that we will have edges into all of the new
+ // SCCs, which means the old one must come last for postorder.
+ int OldIdx = SCCIndices[&OldSCC];
+ SCCs.insert(SCCs.begin() + OldIdx, NewSCCs.begin(), NewSCCs.end());
+
+ // Update the mapping from SCC* to index to use the new SCC*s, and remove the
+ // old SCC from the mapping.
+ for (int Idx = OldIdx, Size = SCCs.size(); Idx < Size; ++Idx)
+ SCCIndices[SCCs[Idx]] = Idx;
+
+#ifndef NDEBUG
+ // We're done. Check the validity on our way out.
+ verify();
+#endif
+}
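A hedged usage sketch for the routine above; the helper name demoteCallToRef and the surrounding scenario are invented, and only the LazyCallGraph API spelled out in this patch is assumed. When a transform deletes the last direct call from one function to another but a reference remains, the graph update is a single RefSCC-local call (the call-introducing counterpart, switchInternalEdgeToCall, additionally returns the SCCs it merged away so cached analyses can be invalidated):

#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Reflect in the graph that Caller no longer *calls* Callee but still
// references it (e.g. still takes its address). Both functions are assumed
// to currently be in the same RefSCC, matching the routine's preconditions.
static void demoteCallToRef(LazyCallGraph &CG, Function &Caller,
                            Function &Callee) {
  LazyCallGraph::Node &CallerN = CG.get(Caller);
  LazyCallGraph::Node &CalleeN = CG.get(Callee);
  LazyCallGraph::RefSCC &RC = *CG.lookupRefSCC(CallerN);
  // May split the SCC containing the two nodes into several smaller SCCs,
  // but the RefSCC itself is unchanged, so nothing needs to be returned.
  RC.switchInternalEdgeToRef(CallerN, CalleeN);
}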
+
+void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
+ Node &TargetN) {
+ assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+
+ assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
+ assert(G->lookupRefSCC(TargetN) != this &&
+ "Target must not be in this RefSCC.");
+ assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
+ "Target must be a descendant of the Source.");
+
+ // Edges between RefSCCs are the same regardless of call or ref, so we can
+ // just flip the edge here.
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
}
-void LazyCallGraph::SCC::insertOutgoingEdge(Node &CallerN, Node &CalleeN) {
+void LazyCallGraph::RefSCC::switchOutgoingEdgeToRef(Node &SourceN,
+ Node &TargetN) {
+ assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+
+ assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
+ assert(G->lookupRefSCC(TargetN) != this &&
+ "Target must not be in this RefSCC.");
+ assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
+ "Target must be a descendant of the Source.");
+
+ // Edges between RefSCCs are the same regardless of call or ref, so we can
+ // just flip the edge here.
+ SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+}
+
+void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
+ Node &TargetN) {
+ assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
+ assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
+
+ SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
+}
+
+void LazyCallGraph::RefSCC::insertOutgoingEdge(Node &SourceN, Node &TargetN,
+ Edge::Kind EK) {
// First insert it into the caller.
- CallerN.insertEdgeInternal(CalleeN);
+ SourceN.insertEdgeInternal(TargetN, EK);
- assert(G->SCCMap.lookup(&CallerN) == this && "Caller must be in this SCC.");
+ assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
- SCC &CalleeC = *G->SCCMap.lookup(&CalleeN);
- assert(&CalleeC != this && "Callee must not be in this SCC.");
- assert(CalleeC.isDescendantOf(*this) &&
- "Callee must be a descendant of the Caller.");
+ RefSCC &TargetC = *G->lookupRefSCC(TargetN);
+ assert(&TargetC != this && "Target must not be in this RefSCC.");
+ assert(TargetC.isDescendantOf(*this) &&
+ "Target must be a descendant of the Source.");
// The only change required is to add this SCC to the parent set of the
// callee.
- CalleeC.ParentSCCs.insert(this);
+ TargetC.Parents.insert(this);
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
}
-SmallVector<LazyCallGraph::SCC *, 1>
-LazyCallGraph::SCC::insertIncomingEdge(Node &CallerN, Node &CalleeN) {
- // First insert it into the caller.
- CallerN.insertEdgeInternal(CalleeN);
+SmallVector<LazyCallGraph::RefSCC *, 1>
+LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
+ assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
- assert(G->SCCMap.lookup(&CalleeN) == this && "Callee must be in this SCC.");
+ // We store the RefSCCs found to be connected in postorder so that we can use
+ // that when merging. We also return this to the caller to allow them to
+ // invalidate information pertaining to these RefSCCs.
+ SmallVector<RefSCC *, 1> Connected;
- SCC &CallerC = *G->SCCMap.lookup(&CallerN);
- assert(&CallerC != this && "Caller must not be in this SCC.");
- assert(CallerC.isDescendantOf(*this) &&
- "Caller must be a descendant of the Callee.");
+ RefSCC &SourceC = *G->lookupRefSCC(SourceN);
+ assert(&SourceC != this && "Source must not be in this RefSCC.");
+ assert(SourceC.isDescendantOf(*this) &&
+ "Source must be a descendant of the Target.");
// The algorithm we use for merging SCCs based on the cycle introduced here
- // is to walk the SCC inverted DAG formed by the parent SCC sets. The inverse
- // graph has the same cycle properties as the actual DAG of the SCCs, and
- // when forming SCCs lazily by a DFS, the bottom of the graph won't exist in
- // many cases which should prune the search space.
+ // is to walk the RefSCC inverted DAG formed by the parent sets. The inverse
+ // graph has the same cycle properties as the actual DAG of the RefSCCs, and
+ // when forming RefSCCs lazily by a DFS, the bottom of the graph won't exist
+ // in many cases which should prune the search space.
//
- // FIXME: We can get this pruning behavior even after the incremental SCC
+ // FIXME: We can get this pruning behavior even after the incremental RefSCC
// formation by leaving behind (conservative) DFS numberings in the nodes,
// and pruning the search with them. These would need to be cleverly updated
// during the removal of intra-SCC edges, but could be preserved
// conservatively.
+ //
+ // FIXME: This operation currently creates ordering stability problems
+ // because we don't use stably ordered containers for the parent SCCs.
- // The set of SCCs that are connected to the caller, and thus will
+ // The set of RefSCCs that are connected to the parent, and thus will
// participate in the merged connected component.
- SmallPtrSet<SCC *, 8> ConnectedSCCs;
- ConnectedSCCs.insert(this);
- ConnectedSCCs.insert(&CallerC);
+ SmallPtrSet<RefSCC *, 8> ConnectedSet;
+ ConnectedSet.insert(this);
// We build up a DFS stack of the parents chains.
- SmallVector<std::pair<SCC *, SCC::parent_iterator>, 8> DFSSCCs;
- SmallPtrSet<SCC *, 8> VisitedSCCs;
+ SmallVector<std::pair<RefSCC *, parent_iterator>, 8> DFSStack;
+ SmallPtrSet<RefSCC *, 8> Visited;
int ConnectedDepth = -1;
- SCC *C = this;
- parent_iterator I = parent_begin(), E = parent_end();
- for (;;) {
+ DFSStack.push_back({&SourceC, SourceC.parent_begin()});
+ do {
+ auto DFSPair = DFSStack.pop_back_val();
+ RefSCC *C = DFSPair.first;
+ parent_iterator I = DFSPair.second;
+ auto E = C->parent_end();
+
while (I != E) {
- SCC &ParentSCC = *I++;
+ RefSCC &Parent = *I++;
// If we have already processed this parent SCC, skip it, and remember
// whether it was connected so we don't have to check the rest of the
// stack. This also handles when we reach a child of the 'this' SCC (the
// callee) which terminates the search.
- if (ConnectedSCCs.count(&ParentSCC)) {
- ConnectedDepth = std::max<int>(ConnectedDepth, DFSSCCs.size());
+ if (ConnectedSet.count(&Parent)) {
+ assert(ConnectedDepth < (int)DFSStack.size() &&
+ "Cannot have a connected depth greater than the DFS depth!");
+ ConnectedDepth = DFSStack.size();
continue;
}
- if (VisitedSCCs.count(&ParentSCC))
+ if (Visited.count(&Parent))
continue;
// We fully explore the depth-first space, adding nodes to the connected
// set only as we pop them off, so "recurse" by rotating to the parent.
- DFSSCCs.push_back(std::make_pair(C, I));
- C = &ParentSCC;
- I = ParentSCC.parent_begin();
- E = ParentSCC.parent_end();
+ DFSStack.push_back({C, I});
+ C = &Parent;
+ I = C->parent_begin();
+ E = C->parent_end();
}
// If we've found a connection anywhere below this point on the stack (and
// thus up the parent graph from the caller), the current node needs to be
// added to the connected set now that we've processed all of its parents.
- if ((int)DFSSCCs.size() == ConnectedDepth) {
+ if ((int)DFSStack.size() == ConnectedDepth) {
--ConnectedDepth; // We're finished with this connection.
- ConnectedSCCs.insert(C);
+ bool Inserted = ConnectedSet.insert(C).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot insert a refSCC multiple times!");
+ Connected.push_back(C);
} else {
// Otherwise remember that its parents don't ever connect.
- assert(ConnectedDepth < (int)DFSSCCs.size() &&
+ assert(ConnectedDepth < (int)DFSStack.size() &&
"Cannot have a connected depth greater than the DFS depth!");
- VisitedSCCs.insert(C);
+ Visited.insert(C);
}
-
- if (DFSSCCs.empty())
- break; // We've walked all the parents of the caller transitively.
-
- // Pop off the prior node and position to unwind the depth first recursion.
- std::tie(C, I) = DFSSCCs.pop_back_val();
- E = C->parent_end();
- }
+ } while (!DFSStack.empty());
// Now that we have identified all of the SCCs which need to be merged into
// a connected set with the inserted edge, merge all of them into this SCC.
- // FIXME: This operation currently creates ordering stability problems
- // because we don't use stably ordered containers for the parent SCCs or the
- // connected SCCs.
- unsigned NewNodeBeginIdx = Nodes.size();
- for (SCC *C : ConnectedSCCs) {
- if (C == this)
- continue;
- for (SCC *ParentC : C->ParentSCCs)
- if (!ConnectedSCCs.count(ParentC))
- ParentSCCs.insert(ParentC);
- C->ParentSCCs.clear();
-
- for (Node *N : *C) {
- for (Node &ChildN : *N) {
- SCC &ChildC = *G->SCCMap.lookup(&ChildN);
- if (&ChildC != C)
- ChildC.ParentSCCs.erase(C);
+ // We walk the newly connected RefSCCs in the reverse postorder of the parent
+ // DAG walk above and merge in each of their SCC postorder lists. This
+ // ensures a merged postorder SCC list.
+ SmallVector<SCC *, 16> MergedSCCs;
+ int SCCIndex = 0;
+ for (RefSCC *C : reverse(Connected)) {
+ assert(C != this &&
+ "This RefSCC should terminate the DFS without being reached.");
+
+ // Merge the parents which aren't part of the merge into our own parents.
+ for (RefSCC *ParentC : C->Parents)
+ if (!ConnectedSet.count(ParentC))
+ Parents.insert(ParentC);
+ C->Parents.clear();
+
+ // Walk the inner SCCs to update their up-pointer and walk all the edges to
+ // update any parent sets.
+ // FIXME: We should try to find a way to avoid this (rather expensive) edge
+ // walk by updating the parent sets in some other manner.
+ for (SCC &InnerC : *C) {
+ InnerC.OuterRefSCC = this;
+ SCCIndices[&InnerC] = SCCIndex++;
+ for (Node &N : InnerC) {
+ G->SCCMap[&N] = &InnerC;
+ for (Edge &E : N) {
+ assert(E.getNode() &&
+ "Cannot have a null node within a visited SCC!");
+ RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ if (ConnectedSet.count(&ChildRC))
+ continue;
+ ChildRC.Parents.erase(C);
+ ChildRC.Parents.insert(this);
+ }
}
- G->SCCMap[N] = this;
- Nodes.push_back(N);
}
- C->Nodes.clear();
+
+ // Now merge in the SCCs. We can actually move the SCC list here, so try to
+ // reuse its storage the first time through.
+ if (MergedSCCs.empty())
+ MergedSCCs = std::move(C->SCCs);
+ else
+ MergedSCCs.append(C->SCCs.begin(), C->SCCs.end());
+ C->SCCs.clear();
}
- for (auto I = Nodes.begin() + NewNodeBeginIdx, E = Nodes.end(); I != E; ++I)
- for (Node &ChildN : **I) {
- SCC &ChildC = *G->SCCMap.lookup(&ChildN);
- if (&ChildC != this)
- ChildC.ParentSCCs.insert(this);
- }
+
+ // Finally append our original SCCs to the merged list and move it into
+ // place.
+ for (SCC &InnerC : *this)
+ SCCIndices[&InnerC] = SCCIndex++;
+ MergedSCCs.append(SCCs.begin(), SCCs.end());
+ SCCs = std::move(MergedSCCs);
+
+ // At this point we have a merged RefSCC with a post-order SCCs list, just
+ // connect the nodes to form the new edge.
+ SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid.
+ verify();
+#endif
// We return the list of SCCs which were merged so that callers can
// invalidate any data they have associated with those SCCs. Note that these
// SCCs are no longer in an interesting state (they are totally empty) but
// the pointers will remain stable for the life of the graph itself.
- return SmallVector<SCC *, 1>(ConnectedSCCs.begin(), ConnectedSCCs.end());
+ return Connected;
}
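The parent-set walk above boils down to a reachability question on the RefSCC DAG: which RefSCCs lie on a parent chain from the source to the target? Those are exactly the ones the new edge ties into a single cycle. A small self-contained sketch of that question, with integers standing in for RefSCCs; the patch uses an explicit DFS stack and caches negative results in a Visited set, which this simplified recursive form omits:

#include <cstdio>
#include <functional>
#include <map>
#include <set>
#include <vector>

// Return every node (the source included) from which Target is reachable by
// following parent edges.
static std::set<int>
reachesTarget(const std::map<int, std::vector<int>> &Parents, int Source,
              int Target) {
  std::set<int> Connected;
  std::function<bool(int)> Visit = [&](int N) -> bool {
    if (N == Target || Connected.count(N))
      return true;
    bool Hit = false;
    auto It = Parents.find(N);
    if (It != Parents.end())
      for (int P : It->second)
        Hit |= Visit(P);
    if (Hit)
      Connected.insert(N);
    return Hit;
  };
  Visit(Source);
  return Connected;
}

int main() {
  // Parent DAG: 1 has parents {2, 3}, both of which have parent 4 (the
  // target); 5 is unrelated.
  std::map<int, std::vector<int>> Parents = {
      {1, {2, 3}}, {2, {4}}, {3, {4}}, {4, {}}, {5, {}}};
  for (int N : reachesTarget(Parents, /*Source=*/1, /*Target=*/4))
    std::printf("%d ", N); // prints: 1 2 3
  std::printf("\n");
  return 0;
}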
-void LazyCallGraph::SCC::removeInterSCCEdge(Node &CallerN, Node &CalleeN) {
+void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
+ assert(G->lookupRefSCC(SourceN) == this &&
+ "The source must be a member of this RefSCC.");
+
+ RefSCC &TargetRC = *G->lookupRefSCC(TargetN);
+ assert(&TargetRC != this && "The target must not be a member of this RefSCC");
+
+ assert(std::find(G->LeafRefSCCs.begin(), G->LeafRefSCCs.end(), this) ==
+ G->LeafRefSCCs.end() &&
+ "Cannot have a leaf RefSCC source.");
+
// First remove it from the node.
- CallerN.removeEdgeInternal(CalleeN.getFunction());
-
- assert(G->SCCMap.lookup(&CallerN) == this &&
- "The caller must be a member of this SCC.");
-
- SCC &CalleeC = *G->SCCMap.lookup(&CalleeN);
- assert(&CalleeC != this &&
- "This API only supports the rmoval of inter-SCC edges.");
-
- assert(std::find(G->LeafSCCs.begin(), G->LeafSCCs.end(), this) ==
- G->LeafSCCs.end() &&
- "Cannot have a leaf SCC caller with a different SCC callee.");
-
- bool HasOtherCallToCalleeC = false;
- bool HasOtherCallOutsideSCC = false;
- for (Node *N : *this) {
- for (Node &OtherCalleeN : *N) {
- SCC &OtherCalleeC = *G->SCCMap.lookup(&OtherCalleeN);
- if (&OtherCalleeC == &CalleeC) {
- HasOtherCallToCalleeC = true;
- break;
+ SourceN.removeEdgeInternal(TargetN.getFunction());
+
+ bool HasOtherEdgeToChildRC = false;
+ bool HasOtherChildRC = false;
+ for (SCC *InnerC : SCCs) {
+ for (Node &N : *InnerC) {
+ for (Edge &E : N) {
+ assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
+ RefSCC &OtherChildRC = *G->lookupRefSCC(*E.getNode());
+ if (&OtherChildRC == &TargetRC) {
+ HasOtherEdgeToChildRC = true;
+ break;
+ }
+ if (&OtherChildRC != this)
+ HasOtherChildRC = true;
}
- if (&OtherCalleeC != this)
- HasOtherCallOutsideSCC = true;
+ if (HasOtherEdgeToChildRC)
+ break;
}
- if (HasOtherCallToCalleeC)
+ if (HasOtherEdgeToChildRC)
break;
}
// Because the SCCs form a DAG, deleting such an edge cannot change the set
// of SCCs in the graph. However, it may cut an edge of the SCC DAG, making
- // the caller no longer a parent of the callee. Walk the other call edges
- // in the caller to tell.
- if (!HasOtherCallToCalleeC) {
- bool Removed = CalleeC.ParentSCCs.erase(this);
+ // the source SCC no longer connected to the target SCC. If so, we need to
+ // update the target SCC's map of its parents.
+ if (!HasOtherEdgeToChildRC) {
+ bool Removed = TargetRC.Parents.erase(this);
(void)Removed;
assert(Removed &&
- "Did not find the caller SCC in the callee SCC's parent list!");
+ "Did not find the source SCC in the target SCC's parent list!");
// It may orphan an SCC if it is the last edge reaching it, but that does
// not violate any invariants of the graph.
- if (CalleeC.ParentSCCs.empty())
- DEBUG(dbgs() << "LCG: Update removing " << CallerN.getFunction().getName()
- << " -> " << CalleeN.getFunction().getName()
+ if (TargetRC.Parents.empty())
+ DEBUG(dbgs() << "LCG: Update removing " << SourceN.getFunction().getName()
+ << " -> " << TargetN.getFunction().getName()
<< " edge orphaned the callee's SCC!\n");
- }
- // It may make the Caller SCC a leaf SCC.
- if (!HasOtherCallOutsideSCC)
- G->LeafSCCs.push_back(this);
+ // It may make the Source SCC a leaf SCC.
+ if (!HasOtherChildRC)
+ G->LeafRefSCCs.push_back(this);
+ }
}
-void LazyCallGraph::SCC::internalDFS(
- SmallVectorImpl<std::pair<Node *, Node::iterator>> &DFSStack,
- SmallVectorImpl<Node *> &PendingSCCStack, Node *N,
- SmallVectorImpl<SCC *> &ResultSCCs) {
- Node::iterator I = N->begin();
- N->LowLink = N->DFSNumber = 1;
- int NextDFSNumber = 2;
- for (;;) {
- assert(N->DFSNumber != 0 && "We should always assign a DFS number "
- "before processing a node.");
+SmallVector<LazyCallGraph::RefSCC *, 1>
+LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
+ assert(!SourceN[TargetN].isCall() &&
+ "Cannot remove a call edge, it must first be made a ref edge");
- // We simulate recursion by popping out of the nested loop and continuing.
- Node::iterator E = N->end();
- while (I != E) {
- Node &ChildN = *I;
- if (SCC *ChildSCC = G->SCCMap.lookup(&ChildN)) {
- // Check if we have reached a node in the new (known connected) set of
- // this SCC. If so, the entire stack is necessarily in that set and we
- // can re-start.
- if (ChildSCC == this) {
- insert(*N);
- while (!PendingSCCStack.empty())
- insert(*PendingSCCStack.pop_back_val());
- while (!DFSStack.empty())
- insert(*DFSStack.pop_back_val().first);
- return;
- }
+ // First remove the actual edge.
+ SourceN.removeEdgeInternal(TargetN.getFunction());
- // If this child isn't currently in this SCC, no need to process it.
- // However, we do need to remove this SCC from its SCC's parent set.
- ChildSCC->ParentSCCs.erase(this);
- ++I;
- continue;
- }
+ // We return a list of the resulting *new* RefSCCs in post-order.
+ SmallVector<RefSCC *, 1> Result;
- if (ChildN.DFSNumber == 0) {
- // Mark that we should start at this child when next this node is the
- // top of the stack. We don't start at the next child to ensure this
- // child's lowlink is reflected.
- DFSStack.push_back(std::make_pair(N, I));
+ // Direct recursion doesn't impact the SCC graph at all.
+ if (&SourceN == &TargetN)
+ return Result;
+
+ // We build somewhat synthetic new RefSCCs by providing a postorder mapping
+ // for each inner SCC. We also store these associated with *nodes* rather
+ // than SCCs because this saves a round-trip through the node->SCC map and in
+ // the common case, SCCs are small. We will verify that we always give the
+ // same number to every node in the SCC such that these are equivalent.
+ const int RootPostOrderNumber = 0;
+ int PostOrderNumber = RootPostOrderNumber + 1;
+ SmallDenseMap<Node *, int> PostOrderMapping;
+
+ // Every node in the target SCC can already reach every node in this RefSCC
+ // (by definition). The target SCC is the only SCC we know will stay inside
+ // this RefSCC. Everything which transitively reaches the target will also
+ // remain in the RefSCC. We handle this by pre-marking the nodes in the
+ // target SCC as mapping back to the root post order number.
+ //
+ // This also enables us to take a very significant short-cut in the standard
+ // Tarjan walk to re-form RefSCCs below: whenever we build an edge that
+ // references the target node, we know that the target node eventually
+ // references all other nodes in our walk. As a consequence, we can detect
+ // and handle participants in that cycle without walking all the edges that
+ // form the connections, and instead by relying on the fundamental guarantee
+ // coming into this operation.
+ SCC &TargetC = *G->lookupSCC(TargetN);
+ for (Node &N : TargetC)
+ PostOrderMapping[&N] = RootPostOrderNumber;
+
+ // Reset all the other nodes to prepare for a DFS over them, and add them to
+ // our worklist.
+ SmallVector<Node *, 8> Worklist;
+ for (SCC *C : SCCs) {
+ if (C == &TargetC)
+ continue;
- // Continue, resetting to the child node.
- ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
- N = &ChildN;
- I = ChildN.begin();
- E = ChildN.end();
- continue;
- }
+ for (Node &N : *C)
+ N.DFSNumber = N.LowLink = 0;
- // Track the lowest link of the children, if any are still in the stack.
- // Any child not on the stack will have a LowLink of -1.
- assert(ChildN.LowLink != 0 &&
- "Low-link must not be zero with a non-zero DFS number.");
- if (ChildN.LowLink >= 0 && ChildN.LowLink < N->LowLink)
- N->LowLink = ChildN.LowLink;
- ++I;
- }
+ Worklist.append(C->Nodes.begin(), C->Nodes.end());
+ }
- if (N->LowLink == N->DFSNumber) {
- ResultSCCs.push_back(G->formSCC(N, PendingSCCStack));
- if (DFSStack.empty())
- return;
- } else {
- // At this point we know that N cannot ever be an SCC root. Its low-link
- // is not its dfs-number, and we've processed all of its children. It is
- // just sitting here waiting until some node further down the stack gets
- // low-link == dfs-number and pops it off as well. Move it to the pending
- // stack which is pulled into the next SCC to be formed.
- PendingSCCStack.push_back(N);
+ auto MarkNodeForSCCNumber = [&PostOrderMapping](Node &N, int Number) {
+ N.DFSNumber = N.LowLink = -1;
+ PostOrderMapping[&N] = Number;
+ };
- assert(!DFSStack.empty() && "We shouldn't have an empty stack!");
+ SmallVector<std::pair<Node *, edge_iterator>, 4> DFSStack;
+ SmallVector<Node *, 4> PendingRefSCCStack;
+ do {
+ assert(DFSStack.empty() &&
+ "Cannot begin a new root with a non-empty DFS stack!");
+ assert(PendingRefSCCStack.empty() &&
+ "Cannot begin a new root with pending nodes for an SCC!");
+
+ Node *RootN = Worklist.pop_back_val();
+ // Skip any nodes we've already reached in the DFS.
+ if (RootN->DFSNumber != 0) {
+ assert(RootN->DFSNumber == -1 &&
+ "Shouldn't have any mid-DFS root nodes!");
+ continue;
}
- N = DFSStack.back().first;
- I = DFSStack.back().second;
- DFSStack.pop_back();
- }
-}
+ RootN->DFSNumber = RootN->LowLink = 1;
+ int NextDFSNumber = 2;
-SmallVector<LazyCallGraph::SCC *, 1>
-LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, Node &CalleeN) {
- // First remove it from the node.
- CallerN.removeEdgeInternal(CalleeN.getFunction());
+ DFSStack.push_back({RootN, RootN->begin()});
+ do {
+ Node *N;
+ edge_iterator I;
+ std::tie(N, I) = DFSStack.pop_back_val();
+ auto E = N->end();
+
+ assert(N->DFSNumber != 0 && "We should always assign a DFS number "
+ "before processing a node.");
+
+ while (I != E) {
+ Node &ChildN = I->getNode(*G);
+ if (ChildN.DFSNumber == 0) {
+ // Mark that we should start at this child when next this node is the
+ // top of the stack. We don't start at the next child to ensure this
+ // child's lowlink is reflected.
+ DFSStack.push_back({N, I});
+
+ // Continue, resetting to the child node.
+ ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
+ N = &ChildN;
+ I = ChildN.begin();
+ E = ChildN.end();
+ continue;
+ }
+ if (ChildN.DFSNumber == -1) {
+ // Check if this edge's target node connects to the deleted edge's
+ // target node. If so, we know that every node connected will end up
+ // in this RefSCC, so collapse the entire current stack into the root
+ // slot in our SCC numbering. See above for the motivation of
+ // optimizing the target connected nodes in this way.
+ auto PostOrderI = PostOrderMapping.find(&ChildN);
+ if (PostOrderI != PostOrderMapping.end() &&
+ PostOrderI->second == RootPostOrderNumber) {
+ MarkNodeForSCCNumber(*N, RootPostOrderNumber);
+ while (!PendingRefSCCStack.empty())
+ MarkNodeForSCCNumber(*PendingRefSCCStack.pop_back_val(),
+ RootPostOrderNumber);
+ while (!DFSStack.empty())
+ MarkNodeForSCCNumber(*DFSStack.pop_back_val().first,
+ RootPostOrderNumber);
+ // Ensure we break all the way out of the enclosing loop.
+ N = nullptr;
+ break;
+ }
+
+ // If this child isn't currently in this RefSCC, there is no need to
+ // process it. However, we do need to remove this RefSCC from the parent
+ // set of the child's RefSCC.
+ RefSCC &ChildRC = *G->lookupRefSCC(ChildN);
+ ChildRC.Parents.erase(this);
+ ++I;
+ continue;
+ }
- // We return a list of the resulting *new* SCCs in postorder.
- SmallVector<SCC *, 1> ResultSCCs;
+ // Track the lowest link of the children, if any are still in the stack.
+ // Any child not on the stack will have a LowLink of -1.
+ assert(ChildN.LowLink != 0 &&
+ "Low-link must not be zero with a non-zero DFS number.");
+ if (ChildN.LowLink >= 0 && ChildN.LowLink < N->LowLink)
+ N->LowLink = ChildN.LowLink;
+ ++I;
+ }
+ if (!N)
+ // We short-circuited this node.
+ break;
- // Direct recursion doesn't impact the SCC graph at all.
- if (&CallerN == &CalleeN)
- return ResultSCCs;
+ // We've finished processing N and its descendants, so put it on our pending
+ // stack to eventually get merged into a RefSCC.
+ PendingRefSCCStack.push_back(N);
- // The worklist is every node in the original SCC.
- SmallVector<Node *, 1> Worklist;
- Worklist.swap(Nodes);
- for (Node *N : Worklist) {
- // The nodes formerly in this SCC are no longer in any SCC.
- N->DFSNumber = 0;
- N->LowLink = 0;
- G->SCCMap.erase(N);
- }
- assert(Worklist.size() > 1 && "We have to have at least two nodes to have an "
- "edge between them that is within the SCC.");
-
- // The callee can already reach every node in this SCC (by definition). It is
- // the only node we know will stay inside this SCC. Everything which
- // transitively reaches Callee will also remain in the SCC. To model this we
- // incrementally add any chain of nodes which reaches something in the new
- // node set to the new node set. This short circuits one side of the Tarjan's
- // walk.
- insert(CalleeN);
-
- // We're going to do a full mini-Tarjan's walk using a local stack here.
- SmallVector<std::pair<Node *, Node::iterator>, 4> DFSStack;
- SmallVector<Node *, 4> PendingSCCStack;
- do {
- Node *N = Worklist.pop_back_val();
- if (N->DFSNumber == 0)
- internalDFS(DFSStack, PendingSCCStack, N, ResultSCCs);
+ // If this node is linked to some lower entry, continue walking up the
+ // stack.
+ if (N->LowLink != N->DFSNumber) {
+ assert(!DFSStack.empty() &&
+ "We never found a viable root for a RefSCC to pop off!");
+ continue;
+ }
+
+ // Otherwise, form a new RefSCC from the top of the pending node stack.
+ int RootDFSNumber = N->DFSNumber;
+ // Find the range of the node stack by walking down until we pass the
+ // root DFS number.
+ auto RefSCCNodes = make_range(
+ PendingRefSCCStack.rbegin(),
+ std::find_if(PendingRefSCCStack.rbegin(), PendingRefSCCStack.rend(),
+ [RootDFSNumber](Node *N) {
+ return N->DFSNumber < RootDFSNumber;
+ }));
+
+ // Mark the postorder number for these nodes and clear them off the
+ // stack. We'll use the postorder number to pull them into RefSCCs at the
+ // end. FIXME: Fuse with the loop above.
+ int RefSCCNumber = PostOrderNumber++;
+ for (Node *N : RefSCCNodes)
+ MarkNodeForSCCNumber(*N, RefSCCNumber);
+
+ PendingRefSCCStack.erase(RefSCCNodes.end().base(),
+ PendingRefSCCStack.end());
+ } while (!DFSStack.empty());
assert(DFSStack.empty() && "Didn't flush the entire DFS stack!");
- assert(PendingSCCStack.empty() && "Didn't flush all pending SCC nodes!");
+ assert(PendingRefSCCStack.empty() && "Didn't flush all pending nodes!");
} while (!Worklist.empty());
- // Now we need to reconnect the current SCC to the graph.
- bool IsLeafSCC = true;
- for (Node *N : Nodes) {
- for (Node &ChildN : *N) {
- SCC &ChildSCC = *G->SCCMap.lookup(&ChildN);
- if (&ChildSCC == this)
- continue;
- ChildSCC.ParentSCCs.insert(this);
- IsLeafSCC = false;
- }
+ // We now have a post-order numbering for RefSCCs and a mapping from each
+ // node in this RefSCC to its final RefSCC. We create each new RefSCC node
+ // (re-using this RefSCC node for the root) and build a radix-sort style map
+ // from postorder number to the RefSCC. We then append SCCs to each of these
+ // RefSCCs in the order they occurred in the original SCCs container.
+ for (int i = 1; i < PostOrderNumber; ++i)
+ Result.push_back(G->createRefSCC(*G));
+
+ for (SCC *C : SCCs) {
+ auto PostOrderI = PostOrderMapping.find(&*C->begin());
+ assert(PostOrderI != PostOrderMapping.end() &&
+ "Cannot have missing mappings for nodes!");
+ int SCCNumber = PostOrderI->second;
+#ifndef NDEBUG
+ for (Node &N : *C)
+ assert(PostOrderMapping.find(&N)->second == SCCNumber &&
+ "Cannot have different numbers for nodes in the same SCC!");
+#endif
+ if (SCCNumber == 0)
+ // The root node is handled separately by removing the SCCs.
+ continue;
+
+ RefSCC &RC = *Result[SCCNumber - 1];
+ int SCCIndex = RC.SCCs.size();
+ RC.SCCs.push_back(C);
+ SCCIndices[C] = SCCIndex;
+ C->OuterRefSCC = &RC;
}
+
+ // FIXME: We re-walk the edges in each RefSCC to establish whether it is
+ // a leaf and connect it to the rest of the graph's parents lists. This is
+ // really wasteful. We should instead do this during the DFS to avoid yet
+ // another edge walk.
+ for (RefSCC *RC : Result)
+ G->connectRefSCC(*RC);
+
+ // Now erase all but the root's SCCs.
+ SCCs.erase(std::remove_if(SCCs.begin(), SCCs.end(),
+ [&](SCC *C) {
+ return PostOrderMapping.lookup(&*C->begin()) !=
+ RootPostOrderNumber;
+ }),
+ SCCs.end());
+
+#ifndef NDEBUG
+ // Now we need to reconnect the current (root) SCC to the graph. We do this
+ // manually because we can special case our leaf handling and detect errors.
+ bool IsLeaf = true;
+#endif
+ for (SCC *C : SCCs)
+ for (Node &N : *C) {
+ for (Edge &E : N) {
+ assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
+ RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ if (&ChildRC == this)
+ continue;
+ ChildRC.Parents.insert(this);
+#ifndef NDEBUG
+ IsLeaf = false;
+#endif
+ }
+ }
#ifndef NDEBUG
- if (!ResultSCCs.empty())
- assert(!IsLeafSCC && "This SCC cannot be a leaf as we have split out new "
- "SCCs by removing this edge.");
- if (!std::any_of(G->LeafSCCs.begin(), G->LeafSCCs.end(),
- [&](SCC *C) { return C == this; }))
- assert(!IsLeafSCC && "This SCC cannot be a leaf as it already had child "
- "SCCs before we removed this edge.");
+ if (!Result.empty())
+ assert(!IsLeaf && "This SCC cannot be a leaf as we have split out new "
+ "SCCs by removing this edge.");
+ if (!std::any_of(G->LeafRefSCCs.begin(), G->LeafRefSCCs.end(),
+ [&](RefSCC *C) { return C == this; }))
+ assert(!IsLeaf && "This SCC cannot be a leaf as it already had child "
+ "SCCs before we removed this edge.");
#endif
// If this SCC stopped being a leaf through this edge removal, remove it from
- // the leaf SCC list.
- if (!IsLeafSCC && !ResultSCCs.empty())
- G->LeafSCCs.erase(std::remove(G->LeafSCCs.begin(), G->LeafSCCs.end(), this),
- G->LeafSCCs.end());
+ // the leaf SCC list. Note that this does the right thing in the case where
+ // this was never a leaf.
+ // FIXME: As LeafRefSCCs could be very large, we might want to not walk the
+ // entire list if this RefSCC wasn't a leaf before the edge removal.
+ if (!Result.empty())
+ G->LeafRefSCCs.erase(
+ std::remove(G->LeafRefSCCs.begin(), G->LeafRefSCCs.end(), this),
+ G->LeafRefSCCs.end());
// Return the new list of SCCs.
- return ResultSCCs;
+ return Result;
}
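The "radix-sort style" regrouping mentioned above works because every node receives a small dense postorder number, so the final RefSCCs can be assembled by direct indexing rather than by sorting. A self-contained sketch of that regrouping step, with strings standing in for SCCs and number 0 reserved for the group that stays in the original RefSCC:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (SCC, postorder number) pairs as the DFS above would have produced them.
  std::vector<std::pair<std::string, int>> Mapping = {
      {"a", 0}, {"b", 2}, {"c", 1}, {"d", 2}, {"e", 1}};
  int NumGroups = 3;

  std::vector<std::vector<std::string>> Groups(NumGroups);
  for (auto &Entry : Mapping)
    Groups[Entry.second].push_back(Entry.first); // O(n), no comparison sort.

  for (int i = 0; i < NumGroups; ++i) {
    std::printf("group %d:", i);
    for (const std::string &N : Groups[i])
      std::printf(" %s", N.c_str());
    std::printf("\n");
  }
  return 0;
}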
-void LazyCallGraph::insertEdge(Node &CallerN, Function &Callee) {
+void LazyCallGraph::insertEdge(Node &SourceN, Function &Target, Edge::Kind EK) {
assert(SCCMap.empty() && DFSStack.empty() &&
"This method cannot be called after SCCs have been formed!");
- return CallerN.insertEdgeInternal(Callee);
+ return SourceN.insertEdgeInternal(Target, EK);
}
-void LazyCallGraph::removeEdge(Node &CallerN, Function &Callee) {
+void LazyCallGraph::removeEdge(Node &SourceN, Function &Target) {
assert(SCCMap.empty() && DFSStack.empty() &&
"This method cannot be called after SCCs have been formed!");
- return CallerN.removeEdgeInternal(Callee);
+ return SourceN.removeEdgeInternal(Target);
}
LazyCallGraph::Node &LazyCallGraph::insertInto(Function &F, Node *&MappedN) {
@@ -550,133 +1248,266 @@ void LazyCallGraph::updateGraphPtrs() {
// Process all nodes updating the graph pointers.
{
SmallVector<Node *, 16> Worklist;
- for (auto &Entry : EntryNodes)
- if (Node *EntryN = Entry.dyn_cast<Node *>())
+ for (Edge &E : EntryEdges)
+ if (Node *EntryN = E.getNode())
Worklist.push_back(EntryN);
while (!Worklist.empty()) {
Node *N = Worklist.pop_back_val();
N->G = this;
- for (auto &Callee : N->Callees)
- if (!Callee.isNull())
- if (Node *CalleeN = Callee.dyn_cast<Node *>())
- Worklist.push_back(CalleeN);
+ for (Edge &E : N->Edges)
+ if (Node *TargetN = E.getNode())
+ Worklist.push_back(TargetN);
}
}
// Process all SCCs updating the graph pointers.
{
- SmallVector<SCC *, 16> Worklist(LeafSCCs.begin(), LeafSCCs.end());
+ SmallVector<RefSCC *, 16> Worklist(LeafRefSCCs.begin(), LeafRefSCCs.end());
while (!Worklist.empty()) {
- SCC *C = Worklist.pop_back_val();
- C->G = this;
- Worklist.insert(Worklist.end(), C->ParentSCCs.begin(),
- C->ParentSCCs.end());
+ RefSCC &C = *Worklist.pop_back_val();
+ C.G = this;
+ for (RefSCC &ParentC : C.parents())
+ Worklist.push_back(&ParentC);
}
}
}
-LazyCallGraph::SCC *LazyCallGraph::formSCC(Node *RootN,
- SmallVectorImpl<Node *> &NodeStack) {
- // The tail of the stack is the new SCC. Allocate the SCC and pop the stack
- // into it.
- SCC *NewSCC = new (SCCBPA.Allocate()) SCC(*this);
+/// Build the internal SCCs for a RefSCC from a sequence of nodes.
+///
+/// Appends the SCCs to the provided vector and updates the map with their
+/// indices. Both the vector and map must be empty when passed into this
+/// routine.
+void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
+ assert(RC.SCCs.empty() && "Already built SCCs!");
+ assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
- while (!NodeStack.empty() && NodeStack.back()->DFSNumber > RootN->DFSNumber) {
- assert(NodeStack.back()->LowLink >= RootN->LowLink &&
+ for (Node *N : Nodes) {
+ assert(N->LowLink >= (*Nodes.begin())->LowLink &&
"We cannot have a low link in an SCC lower than its root on the "
"stack!");
- NewSCC->insert(*NodeStack.pop_back_val());
+
+ // This node will go into the next RefSCC, clear out its DFS and low link
+ // as we scan.
+ N->DFSNumber = N->LowLink = 0;
}
- NewSCC->insert(*RootN);
-
- // A final pass over all edges in the SCC (this remains linear as we only
- // do this once when we build the SCC) to connect it to the parent sets of
- // its children.
- bool IsLeafSCC = true;
- for (Node *SCCN : NewSCC->Nodes)
- for (Node &SCCChildN : *SCCN) {
- SCC &ChildSCC = *SCCMap.lookup(&SCCChildN);
- if (&ChildSCC == NewSCC)
- continue;
- ChildSCC.ParentSCCs.insert(NewSCC);
- IsLeafSCC = false;
+
+ // Each RefSCC contains a DAG of the call SCCs. To build these, we do
+ // a direct walk of the call edges using Tarjan's algorithm. We reuse the
+ // internal storage as we won't need it for the outer graph's DFS any longer.
+
+ SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<Node *, 16> PendingSCCStack;
+
+ // Scan down the stack and DFS across the call edges.
+ for (Node *RootN : Nodes) {
+ assert(DFSStack.empty() &&
+ "Cannot begin a new root with a non-empty DFS stack!");
+ assert(PendingSCCStack.empty() &&
+ "Cannot begin a new root with pending nodes for an SCC!");
+
+ // Skip any nodes we've already reached in the DFS.
+ if (RootN->DFSNumber != 0) {
+ assert(RootN->DFSNumber == -1 &&
+ "Shouldn't have any mid-DFS root nodes!");
+ continue;
}
- // For the SCCs where we fine no child SCCs, add them to the leaf list.
- if (IsLeafSCC)
- LeafSCCs.push_back(NewSCC);
+ RootN->DFSNumber = RootN->LowLink = 1;
+ int NextDFSNumber = 2;
+
+ DFSStack.push_back({RootN, RootN->call_begin()});
+ do {
+ Node *N;
+ call_edge_iterator I;
+ std::tie(N, I) = DFSStack.pop_back_val();
+ auto E = N->call_end();
+ while (I != E) {
+ Node &ChildN = *I->getNode();
+ if (ChildN.DFSNumber == 0) {
+ // We haven't yet visited this child, so descend, pushing the current
+ // node onto the stack.
+ DFSStack.push_back({N, I});
+
+ assert(!lookupSCC(ChildN) &&
+ "Found a node with 0 DFS number but already in an SCC!");
+ ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
+ N = &ChildN;
+ I = N->call_begin();
+ E = N->call_end();
+ continue;
+ }
+
+ // If the child has already been added to some child component, it
+ // couldn't impact the low-link of this parent because it isn't
+ // connected, and thus its low-link isn't relevant so skip it.
+ if (ChildN.DFSNumber == -1) {
+ ++I;
+ continue;
+ }
+
+ // Track the lowest linked child as the lowest link for this node.
+ assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
+ if (ChildN.LowLink < N->LowLink)
+ N->LowLink = ChildN.LowLink;
+
+ // Move to the next edge.
+ ++I;
+ }
+
+ // We've finished processing N and its descendants, so put it on our pending
+ // SCC stack to eventually get merged into an SCC of nodes.
+ PendingSCCStack.push_back(N);
+
+ // If this node is linked to some lower entry, continue walking up the
+ // stack.
+ if (N->LowLink != N->DFSNumber)
+ continue;
+
+ // Otherwise, we've completed an SCC. Append it to our post order list of
+ // SCCs.
+ int RootDFSNumber = N->DFSNumber;
+ // Find the range of the node stack by walking down until we pass the
+ // root DFS number.
+ auto SCCNodes = make_range(
+ PendingSCCStack.rbegin(),
+ std::find_if(PendingSCCStack.rbegin(), PendingSCCStack.rend(),
+ [RootDFSNumber](Node *N) {
+ return N->DFSNumber < RootDFSNumber;
+ }));
+ // Form a new SCC out of these nodes and then clear them off our pending
+ // stack.
+ RC.SCCs.push_back(createSCC(RC, SCCNodes));
+ for (Node &N : *RC.SCCs.back()) {
+ N.DFSNumber = N.LowLink = -1;
+ SCCMap[&N] = RC.SCCs.back();
+ }
+ PendingSCCStack.erase(SCCNodes.end().base(), PendingSCCStack.end());
+ } while (!DFSStack.empty());
+ }
+
+ // Wire up the SCC indices.
+ for (int i = 0, Size = RC.SCCs.size(); i < Size; ++i)
+ RC.SCCIndices[RC.SCCs[i]] = i;
+}
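The call-edge DFS above is an iterative formulation of Tarjan's SCC algorithm, kept explicit so the walk can pause and resume without deep recursion. A compact recursive sketch of the same algorithm over a toy adjacency list, emitting SCCs in the same bottom-up (postorder) order; none of it is LLVM-specific:

#include <algorithm>
#include <cstdio>
#include <vector>

struct TarjanSCC {
  const std::vector<std::vector<int>> &Adj;
  std::vector<int> DFSNumber, LowLink, Stack;
  std::vector<bool> OnStack;
  std::vector<std::vector<int>> SCCs; // Filled in postorder.
  int NextDFSNumber = 1;

  explicit TarjanSCC(const std::vector<std::vector<int>> &Adj)
      : Adj(Adj), DFSNumber(Adj.size(), 0), LowLink(Adj.size(), 0),
        OnStack(Adj.size(), false) {
    for (int N = 0, E = Adj.size(); N != E; ++N)
      if (DFSNumber[N] == 0)
        visit(N);
  }

  void visit(int N) {
    DFSNumber[N] = LowLink[N] = NextDFSNumber++;
    Stack.push_back(N);
    OnStack[N] = true;
    for (int Child : Adj[N]) {
      if (DFSNumber[Child] == 0)
        visit(Child);
      // Children that already finished their own SCC are off the stack and
      // cannot lower our low link; skip them, as the patch does.
      if (OnStack[Child])
        LowLink[N] = std::min(LowLink[N], LowLink[Child]);
    }
    if (LowLink[N] != DFSNumber[N])
      return; // N is not the root of an SCC.
    // Pop the SCC rooted at N off the node stack.
    SCCs.emplace_back();
    int Top;
    do {
      Top = Stack.back();
      Stack.pop_back();
      OnStack[Top] = false;
      SCCs.back().push_back(Top);
    } while (Top != N);
  }
};

int main() {
  // 0 -> 1 -> 2 -> 0 forms a cycle; 2 -> 3 is a plain edge.
  std::vector<std::vector<int>> Adj = {{1}, {2}, {0, 3}, {}};
  TarjanSCC T(Adj);
  for (auto &C : T.SCCs) {
    for (int N : C)
      std::printf("%d ", N);
    std::printf("| ");
  }
  std::printf("\n"); // prints: 3 | 2 1 0 |   (callees before callers)
  return 0;
}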
+
+// FIXME: We should move callers of this to embed the parent linking and leaf
+// tracking into their DFS in order to remove a full walk of all edges.
+void LazyCallGraph::connectRefSCC(RefSCC &RC) {
+ // Walk all edges in the RefSCC (this remains linear as we only do this once
+ // when we build the RefSCC) to connect it to the parent sets of its
+ // children.
+ bool IsLeaf = true;
+ for (SCC &C : RC)
+ for (Node &N : C)
+ for (Edge &E : N) {
+ assert(E.getNode() &&
+ "Cannot have a missing node in a visited part of the graph!");
+ RefSCC &ChildRC = *lookupRefSCC(*E.getNode());
+ if (&ChildRC == &RC)
+ continue;
+ ChildRC.Parents.insert(&RC);
+ IsLeaf = false;
+ }
- return NewSCC;
+ // For the RefSCCs where we find no child RefSCCs, add them to the leaf list.
+ if (IsLeaf)
+ LeafRefSCCs.push_back(&RC);
}
-LazyCallGraph::SCC *LazyCallGraph::getNextSCCInPostOrder() {
- Node *N;
- Node::iterator I;
- if (!DFSStack.empty()) {
- N = DFSStack.back().first;
- I = DFSStack.back().second;
- DFSStack.pop_back();
- } else {
- // If we've handled all candidate entry nodes to the SCC forest, we're done.
+LazyCallGraph::RefSCC *LazyCallGraph::getNextRefSCCInPostOrder() {
+ if (DFSStack.empty()) {
+ Node *N;
do {
- if (SCCEntryNodes.empty())
+ // If we've handled all candidate entry nodes to the SCC forest, we're
+ // done.
+ if (RefSCCEntryNodes.empty())
return nullptr;
- N = &get(*SCCEntryNodes.pop_back_val());
+ N = &get(*RefSCCEntryNodes.pop_back_val());
} while (N->DFSNumber != 0);
- I = N->begin();
+
+ // Found a new root, begin the DFS here.
N->LowLink = N->DFSNumber = 1;
NextDFSNumber = 2;
+ DFSStack.push_back({N, N->begin()});
}
for (;;) {
- assert(N->DFSNumber != 0 && "We should always assign a DFS number "
- "before placing a node onto the stack.");
+ Node *N;
+ edge_iterator I;
+ std::tie(N, I) = DFSStack.pop_back_val();
- Node::iterator E = N->end();
+ assert(N->DFSNumber > 0 && "We should always assign a DFS number "
+ "before placing a node onto the stack.");
+
+ auto E = N->end();
while (I != E) {
- Node &ChildN = *I;
+ Node &ChildN = I->getNode(*this);
if (ChildN.DFSNumber == 0) {
- // Mark that we should start at this child when next this node is the
- // top of the stack. We don't start at the next child to ensure this
- // child's lowlink is reflected.
- DFSStack.push_back(std::make_pair(N, N->begin()));
+ // We haven't yet visited this child, so descend, pushing the current
+ // node onto the stack.
+ DFSStack.push_back({N, N->begin()});
- // Recurse onto this node via a tail call.
assert(!SCCMap.count(&ChildN) &&
"Found a node with 0 DFS number but already in an SCC!");
ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
N = &ChildN;
- I = ChildN.begin();
- E = ChildN.end();
+ I = N->begin();
+ E = N->end();
+ continue;
+ }
+
+ // If the child has already been added to some child component, it
+ // couldn't impact the low-link of this parent because it isn't
+ // connected, and thus its low-link isn't relevant so skip it.
+ if (ChildN.DFSNumber == -1) {
+ ++I;
continue;
}
- // Track the lowest link of the children, if any are still in the stack.
- assert(ChildN.LowLink != 0 &&
- "Low-link must not be zero with a non-zero DFS number.");
- if (ChildN.LowLink >= 0 && ChildN.LowLink < N->LowLink)
+ // Track the lowest linked child as the lowest link for this node.
+ assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
+ if (ChildN.LowLink < N->LowLink)
N->LowLink = ChildN.LowLink;
+
+ // Move to the next edge.
++I;
}
- if (N->LowLink == N->DFSNumber)
- // Form the new SCC out of the top of the DFS stack.
- return formSCC(N, PendingSCCStack);
-
- // At this point we know that N cannot ever be an SCC root. Its low-link
- // is not its dfs-number, and we've processed all of its children. It is
- // just sitting here waiting until some node further down the stack gets
- // low-link == dfs-number and pops it off as well. Move it to the pending
- // stack which is pulled into the next SCC to be formed.
- PendingSCCStack.push_back(N);
-
- assert(!DFSStack.empty() && "We never found a viable root!");
- N = DFSStack.back().first;
- I = DFSStack.back().second;
- DFSStack.pop_back();
+ // We've finished processing N and its descendants; put it on our pending
+ // SCC stack to eventually get merged into an SCC of nodes.
+ PendingRefSCCStack.push_back(N);
+
+ // If this node is linked to some lower entry, continue walking up the
+ // stack.
+ if (N->LowLink != N->DFSNumber) {
+ assert(!DFSStack.empty() &&
+ "We never found a viable root for an SCC to pop off!");
+ continue;
+ }
+
+ // Otherwise, form a new RefSCC from the top of the pending node stack.
+ int RootDFSNumber = N->DFSNumber;
+ // Find the range of the node stack by walking down until we pass the
+ // root DFS number.
+ auto RefSCCNodes = node_stack_range(
+ PendingRefSCCStack.rbegin(),
+ std::find_if(
+ PendingRefSCCStack.rbegin(), PendingRefSCCStack.rend(),
+ [RootDFSNumber](Node *N) { return N->DFSNumber < RootDFSNumber; }));
+ // Form a new RefSCC out of these nodes and then clear them off our pending
+ // stack.
+ RefSCC *NewRC = createRefSCC(*this);
+ buildSCCs(*NewRC, RefSCCNodes);
+ connectRefSCC(*NewRC);
+ PendingRefSCCStack.erase(RefSCCNodes.end().base(),
+ PendingRefSCCStack.end());
+
+ // We return the new RefSCC here. This essentially suspends the DFS walk
+ // until another RefSCC is requested.
+ return NewRC;
}
}
@@ -684,44 +1515,76 @@ char LazyCallGraphAnalysis::PassID;
LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {}
-static void printNodes(raw_ostream &OS, LazyCallGraph::Node &N,
- SmallPtrSetImpl<LazyCallGraph::Node *> &Printed) {
- // Recurse depth first through the nodes.
- for (LazyCallGraph::Node &ChildN : N)
- if (Printed.insert(&ChildN).second)
- printNodes(OS, ChildN, Printed);
-
- OS << " Call edges in function: " << N.getFunction().getName() << "\n";
- for (LazyCallGraph::iterator I = N.begin(), E = N.end(); I != E; ++I)
- OS << " -> " << I->getFunction().getName() << "\n";
+static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) {
+ OS << " Edges in function: " << N.getFunction().getName() << "\n";
+ for (const LazyCallGraph::Edge &E : N)
+ OS << " " << (E.isCall() ? "call" : "ref ") << " -> "
+ << E.getFunction().getName() << "\n";
OS << "\n";
}
-static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &SCC) {
- ptrdiff_t SCCSize = std::distance(SCC.begin(), SCC.end());
- OS << " SCC with " << SCCSize << " functions:\n";
+static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &C) {
+ ptrdiff_t Size = std::distance(C.begin(), C.end());
+ OS << " SCC with " << Size << " functions:\n";
+
+ for (LazyCallGraph::Node &N : C)
+ OS << " " << N.getFunction().getName() << "\n";
+}
- for (LazyCallGraph::Node *N : SCC)
- OS << " " << N->getFunction().getName() << "\n";
+static void printRefSCC(raw_ostream &OS, LazyCallGraph::RefSCC &C) {
+ ptrdiff_t Size = std::distance(C.begin(), C.end());
+ OS << " RefSCC with " << Size << " call SCCs:\n";
+
+ for (LazyCallGraph::SCC &InnerC : C)
+ printSCC(OS, InnerC);
OS << "\n";
}
PreservedAnalyses LazyCallGraphPrinterPass::run(Module &M,
- ModuleAnalysisManager *AM) {
- LazyCallGraph &G = AM->getResult<LazyCallGraphAnalysis>(M);
+ ModuleAnalysisManager &AM) {
+ LazyCallGraph &G = AM.getResult<LazyCallGraphAnalysis>(M);
OS << "Printing the call graph for module: " << M.getModuleIdentifier()
<< "\n\n";
- SmallPtrSet<LazyCallGraph::Node *, 16> Printed;
- for (LazyCallGraph::Node &N : G)
- if (Printed.insert(&N).second)
- printNodes(OS, N, Printed);
+ for (Function &F : M)
+ printNode(OS, G.get(F));
+
+ for (LazyCallGraph::RefSCC &C : G.postorder_ref_sccs())
+ printRefSCC(OS, C);
+
+ return PreservedAnalyses::all();
+}
+
+LazyCallGraphDOTPrinterPass::LazyCallGraphDOTPrinterPass(raw_ostream &OS)
+ : OS(OS) {}
+
+static void printNodeDOT(raw_ostream &OS, LazyCallGraph::Node &N) {
+ std::string Name = "\"" + DOT::EscapeString(N.getFunction().getName()) + "\"";
+
+ for (const LazyCallGraph::Edge &E : N) {
+ OS << " " << Name << " -> \""
+ << DOT::EscapeString(E.getFunction().getName()) << "\"";
+ if (!E.isCall()) // It is a ref edge.
+ OS << " [style=dashed,label=\"ref\"]";
+ OS << ";\n";
+ }
+
+ OS << "\n";
+}
+
+PreservedAnalyses LazyCallGraphDOTPrinterPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ LazyCallGraph &G = AM.getResult<LazyCallGraphAnalysis>(M);
+
+ OS << "digraph \"" << DOT::EscapeString(M.getModuleIdentifier()) << "\" {\n";
+
+ for (Function &F : M)
+ printNodeDOT(OS, G.get(F));
- for (LazyCallGraph::SCC &SCC : G.postorder_sccs())
- printSCC(OS, SCC);
+ OS << "}\n";
return PreservedAnalyses::all();
}
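
The LazyCallGraph changes above replace the recursive SCC walk with an explicit DFS stack plus a pending-node stack. What follows is an editor's illustration, not part of the patch: a minimal, self-contained sketch of that iterative Tarjan scheme over a plain adjacency list. The TarjanNode type and the findSCCs helper are hypothetical names introduced only for this sketch; LazyCallGraph's own node and edge types are richer.

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

struct TarjanNode {
  int DFSNumber = 0;            // 0 = unvisited, -1 = already placed in an SCC
  int LowLink = 0;
  std::vector<int> Succs;       // successor indices
};

// Returns SCCs in post order, mirroring the DFSStack/PendingSCCStack scheme.
std::vector<std::vector<int>> findSCCs(std::vector<TarjanNode> &G) {
  std::vector<std::vector<int>> SCCs;
  std::vector<std::pair<int, std::size_t>> DFSStack; // (node, next edge index)
  std::vector<int> PendingStack;
  int NextDFSNumber = 1;

  for (int Root = 0, N = (int)G.size(); Root < N; ++Root) {
    if (G[Root].DFSNumber != 0)
      continue;
    G[Root].DFSNumber = G[Root].LowLink = NextDFSNumber++;
    DFSStack.push_back({Root, 0});
    do {
      auto [Cur, EdgeIdx] = DFSStack.back();
      DFSStack.pop_back();
      bool Descended = false;
      while (EdgeIdx < G[Cur].Succs.size()) {
        int Child = G[Cur].Succs[EdgeIdx];
        if (G[Child].DFSNumber == 0) {
          // Descend into Child, remembering where to resume in Cur.
          DFSStack.push_back({Cur, EdgeIdx});
          G[Child].DFSNumber = G[Child].LowLink = NextDFSNumber++;
          DFSStack.push_back({Child, 0});
          Descended = true;
          break;
        }
        // Children already placed in an SCC (-1) cannot lower our low-link.
        if (G[Child].DFSNumber != -1)
          G[Cur].LowLink = std::min(G[Cur].LowLink, G[Child].LowLink);
        ++EdgeIdx;
      }
      if (Descended)
        continue;

      // All edges of Cur are processed; it joins the pending stack.
      PendingStack.push_back(Cur);
      if (G[Cur].LowLink != G[Cur].DFSNumber)
        continue; // some node lower in the DFS stack roots this SCC

      // Cur roots an SCC: pop every pending node whose DFS number is at
      // least the root's.
      int RootDFS = G[Cur].DFSNumber;
      std::vector<int> SCC;
      while (!PendingStack.empty() &&
             G[PendingStack.back()].DFSNumber >= RootDFS) {
        int M = PendingStack.back();
        PendingStack.pop_back();
        G[M].DFSNumber = G[M].LowLink = -1;
        SCC.push_back(M);
      }
      SCCs.push_back(std::move(SCC));
    } while (!DFSStack.empty());
  }
  return SCCs;
}
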
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 0d1d34e0cb4f..4d09b7ca006b 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -38,18 +38,19 @@ using namespace PatternMatch;
#define DEBUG_TYPE "lazy-value-info"
-char LazyValueInfo::ID = 0;
-INITIALIZE_PASS_BEGIN(LazyValueInfo, "lazy-value-info",
+char LazyValueInfoWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LazyValueInfoWrapperPass, "lazy-value-info",
"Lazy Value Information Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LazyValueInfo, "lazy-value-info",
+INITIALIZE_PASS_END(LazyValueInfoWrapperPass, "lazy-value-info",
"Lazy Value Information Analysis", false, true)
namespace llvm {
- FunctionPass *createLazyValueInfoPass() { return new LazyValueInfo(); }
+ FunctionPass *createLazyValueInfoPass() { return new LazyValueInfoWrapperPass(); }
}
+char LazyValueAnalysis::PassID;
//===----------------------------------------------------------------------===//
// LVILatticeVal
@@ -63,19 +64,24 @@ namespace llvm {
namespace {
class LVILatticeVal {
enum LatticeValueTy {
- /// This Value has no known value yet.
+ /// This Value has no known value yet. As a result, this implies the
+ /// producing instruction is dead. Caution: We use this as the starting
+ /// state in our local meet rules. In this usage, it's taken to mean
+ /// "nothing known yet".
undefined,
- /// This Value has a specific constant value.
+ /// This Value has a specific constant value. (For integers, constantrange
+ /// is used instead.)
constant,
- /// This Value is known to not have the specified value.
+ /// This Value is known to not have the specified value. (For integers,
+ /// constantrange is used instead.)
notconstant,
- /// The Value falls within this range.
+ /// The Value falls within this range. (Used only for integer typed values.)
constantrange,
- /// This value is not known to be constant, and we know that it has a value.
+ /// We cannot precisely model the dynamic values this value might take.
overdefined
};
@@ -102,7 +108,7 @@ public:
}
static LVILatticeVal getRange(ConstantRange CR) {
LVILatticeVal Res;
- Res.markConstantRange(CR);
+ Res.markConstantRange(std::move(CR));
return Res;
}
static LVILatticeVal getOverdefined() {
@@ -110,7 +116,7 @@ public:
Res.markOverdefined();
return Res;
}
-
+
bool isUndefined() const { return Tag == undefined; }
bool isConstant() const { return Tag == constant; }
bool isNotConstant() const { return Tag == notconstant; }
@@ -176,13 +182,13 @@ public:
}
/// Return true if this is a change in status.
- bool markConstantRange(const ConstantRange NewR) {
+ bool markConstantRange(ConstantRange NewR) {
if (isConstantRange()) {
if (NewR.isEmptySet())
return markOverdefined();
bool changed = Range != NewR;
- Range = NewR;
+ Range = std::move(NewR);
return changed;
}
@@ -191,7 +197,7 @@ public:
return markOverdefined();
Tag = constantrange;
- Range = NewR;
+ Range = std::move(NewR);
return true;
}
@@ -230,11 +236,6 @@ public:
return markOverdefined();
}
- // RHS is a ConstantRange, LHS is a non-integer Constant.
-
- // FIXME: consider the case where RHS is a range [1, 0) and LHS is
- // a function. The correct result is to pick up RHS.
-
return markOverdefined();
}
@@ -287,13 +288,76 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) {
if (Val.isNotConstant())
return OS << "notconstant<" << *Val.getNotConstant() << '>';
- else if (Val.isConstantRange())
+ if (Val.isConstantRange())
return OS << "constantrange<" << Val.getConstantRange().getLower() << ", "
<< Val.getConstantRange().getUpper() << '>';
return OS << "constant<" << *Val.getConstant() << '>';
}
}
+/// Returns true if this lattice value represents at most one possible value.
+/// This is as precise as any lattice value can get while still representing
+/// reachable code.
+static bool hasSingleValue(const LVILatticeVal &Val) {
+ if (Val.isConstantRange() &&
+ Val.getConstantRange().isSingleElement())
+ // Integer constants are single element ranges
+ return true;
+ if (Val.isConstant())
+ // Non integer constants
+ return true;
+ return false;
+}
+
+/// Combine two sets of facts about the same value into a single set of
+/// facts. Note that this method is not suitable for merging facts along
+/// different paths in a CFG; that's what the mergeIn function is for. This
+/// is for merging facts gathered about the same value at the same location
+/// through two independent means.
+/// Notes:
+/// * This method does not promise to return the most precise possible lattice
+/// value implied by A and B. It is allowed to return any lattice element
+/// which is at least as strong as *either* A or B (unless our facts
+/// conflict, see below).
+/// * Due to unreachable code, the intersection of two lattice values could be
+/// contradictory. If this happens, we return some valid lattice value so as
+/// not to confuse the rest of LVI. Ideally, we'd always return Undefined, but
+/// we do not make this guarantee. TODO: This would be a useful enhancement.
+static LVILatticeVal intersect(LVILatticeVal A, LVILatticeVal B) {
+ // Undefined is the strongest state. It means the value is known to be along
+ // an unreachable path.
+ if (A.isUndefined())
+ return A;
+ if (B.isUndefined())
+ return B;
+
+ // If we gave up for one, but got a usable fact from the other, use it.
+ if (A.isOverdefined())
+ return B;
+ if (B.isOverdefined())
+ return A;
+
+ // Can't get any more precise than constants.
+ if (hasSingleValue(A))
+ return A;
+ if (hasSingleValue(B))
+ return B;
+
+ // Could be either constant range or not constant here.
+ if (!A.isConstantRange() || !B.isConstantRange()) {
+ // TODO: Arbitrary choice, could be improved
+ return A;
+ }
+
+ // Intersect two constant ranges
+ ConstantRange Range =
+ A.getConstantRange().intersectWith(B.getConstantRange());
+ // Note: An empty range is implicitly converted to overdefined internally.
+ // TODO: We could instead use Undefined here since we've proven a conflict
+ // and thus know this path must be unreachable.
+ return LVILatticeVal::getRange(std::move(Range));
+}
+
//===----------------------------------------------------------------------===//
// LazyValueInfoCache Decl
//===----------------------------------------------------------------------===//
@@ -354,6 +418,8 @@ namespace {
if (!BlockValueSet.insert(BV).second)
return false; // It's already in the stack.
+ DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName()
+ << "\n");
BlockValueStack.push(BV);
return true;
}
@@ -375,30 +441,31 @@ namespace {
lookup(Val)[BB] = Result;
}
- LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB);
- bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
- LVILatticeVal &Result,
- Instruction *CxtI = nullptr);
- bool hasBlockValue(Value *Val, BasicBlock *BB);
-
- // These methods process one work item and may add more. A false value
- // returned means that the work item was not completely processed and must
- // be revisited after going through the new items.
- bool solveBlockValue(Value *Val, BasicBlock *BB);
- bool solveBlockValueNonLocal(LVILatticeVal &BBLV,
- Value *Val, BasicBlock *BB);
- bool solveBlockValuePHINode(LVILatticeVal &BBLV,
- PHINode *PN, BasicBlock *BB);
- bool solveBlockValueConstantRange(LVILatticeVal &BBLV,
- Instruction *BBI, BasicBlock *BB);
- void mergeAssumeBlockValueConstantRange(Value *Val, LVILatticeVal &BBLV,
- Instruction *BBI);
-
- void solve();
-
- ValueCacheEntryTy &lookup(Value *V) {
- return ValueCache[LVIValueHandle(V, this)];
- }
+ LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB);
+ bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
+ LVILatticeVal &Result, Instruction *CxtI = nullptr);
+ bool hasBlockValue(Value *Val, BasicBlock *BB);
+
+ // These methods process one work item and may add more. A false value
+ // returned means that the work item was not completely processed and must
+ // be revisited after going through the new items.
+ bool solveBlockValue(Value *Val, BasicBlock *BB);
+ bool solveBlockValueNonLocal(LVILatticeVal &BBLV, Value *Val, BasicBlock *BB);
+ bool solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB);
+ bool solveBlockValueSelect(LVILatticeVal &BBLV, SelectInst *S,
+ BasicBlock *BB);
+ bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, Instruction *BBI,
+ BasicBlock *BB);
+ bool solveBlockValueCast(LVILatticeVal &BBLV, Instruction *BBI,
+ BasicBlock *BB);
+ void intersectAssumeBlockValueConstantRange(Value *Val, LVILatticeVal &BBLV,
+ Instruction *BBI);
+
+ void solve();
+
+ ValueCacheEntryTy &lookup(Value *V) {
+ return ValueCache[LVIValueHandle(V, this)];
+ }
bool isOverdefined(Value *V, BasicBlock *BB) const {
auto ODI = OverDefinedCache.find(BB);
@@ -427,7 +494,7 @@ namespace {
return lookup(V)[BB];
}
-
+
public:
/// This is the query interface to determine the lattice
/// value for the specified Value* at the end of the specified block.
@@ -493,8 +560,8 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
if (ODI != OverDefinedCache.end())
OverDefinedCache.erase(ODI);
- for (auto I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I)
- I->second.erase(BB);
+ for (auto &I : ValueCache)
+ I.second.erase(BB);
}
void LazyValueInfoCache::solve() {
@@ -508,6 +575,9 @@ void LazyValueInfoCache::solve() {
assert(hasCachedValueInfo(e.second, e.first) &&
"Result should be in cache!");
+ DEBUG(dbgs() << "POP " << *e.second << " in " << e.first->getName()
+ << " = " << getCachedValueInfo(e.second, e.first) << "\n");
+
BlockValueStack.pop();
BlockValueSet.erase(e);
} else {
@@ -542,15 +612,12 @@ static LVILatticeVal getFromRangeMetadata(Instruction *BBI) {
case Instruction::Invoke:
if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range))
if (isa<IntegerType>(BBI->getType())) {
- ConstantRange Result = getConstantRangeFromMetadata(*Ranges);
- return LVILatticeVal::getRange(Result);
+ return LVILatticeVal::getRange(getConstantRangeFromMetadata(*Ranges));
}
break;
};
- // Nothing known - Note that we do not want overdefined here. We may know
- // something else about the value and not having range metadata shouldn't
- // cause us to throw away those facts.
- return LVILatticeVal();
+ // Nothing known - will be intersected with other facts
+ return LVILatticeVal::getOverdefined();
}
bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
@@ -587,44 +654,47 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
return true;
}
- // If this value is a nonnull pointer, record it's range and bailout.
- PointerType *PT = dyn_cast<PointerType>(BBI->getType());
- if (PT && isKnownNonNull(BBI)) {
- Res = LVILatticeVal::getNot(ConstantPointerNull::get(PT));
+ if (auto *SI = dyn_cast<SelectInst>(BBI)) {
+ if (!solveBlockValueSelect(Res, SI, BB))
+ return false;
insertResult(Val, BB, Res);
return true;
}
- // If this is an instruction which supports range metadata, return the
- // implied range. TODO: This should be an intersection, not a union.
- Res.mergeIn(getFromRangeMetadata(BBI), DL);
-
- // We can only analyze the definitions of certain classes of instructions
- // (integral binops and casts at the moment), so bail if this isn't one.
- LVILatticeVal Result;
- if ((!isa<BinaryOperator>(BBI) && !isa<CastInst>(BBI)) ||
- !BBI->getType()->isIntegerTy()) {
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because inst def found.\n");
- Res.markOverdefined();
+ // If this value is a nonnull pointer, record its range and bail out. Note
+ // that for all other pointer-typed values, we terminate the search at the
+ // definition. We could easily extend this to look through geps, bitcasts,
+ // and the like to prove non-nullness, but it's not clear that's worth it
+ // compile-time-wise. The context-insensitive value walk done inside
+ // isKnownNonNull gets most of the profitable cases at much less expense.
+ // This does mean that we have a sensitivity to where the defining
+ // instruction is placed, even if it could legally be hoisted much higher.
+ // That is unfortunate.
+ PointerType *PT = dyn_cast<PointerType>(BBI->getType());
+ if (PT && isKnownNonNull(BBI)) {
+ Res = LVILatticeVal::getNot(ConstantPointerNull::get(PT));
insertResult(Val, BB, Res);
return true;
}
-
- // FIXME: We're currently limited to binops with a constant RHS. This should
- // be improved.
- BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
- if (BO && !isa<ConstantInt>(BO->getOperand(1))) {
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because inst def found.\n");
-
- Res.markOverdefined();
- insertResult(Val, BB, Res);
- return true;
+ if (BBI->getType()->isIntegerTy()) {
+ if (isa<CastInst>(BBI)) {
+ if (!solveBlockValueCast(Res, BBI, BB))
+ return false;
+ insertResult(Val, BB, Res);
+ return true;
+ }
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
+ if (BO && isa<ConstantInt>(BO->getOperand(1))) {
+ if (!solveBlockValueBinaryOp(Res, BBI, BB))
+ return false;
+ insertResult(Val, BB, Res);
+ return true;
+ }
}
- if (!solveBlockValueConstantRange(Res, BBI, BB))
- return false;
+ DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - unknown inst def found.\n");
+ Res = getFromRangeMetadata(BBI);
insertResult(Val, BB, Res);
return true;
}
@@ -660,37 +730,36 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
return false;
}
+/// Return true if the allocation associated with Val is ever dereferenced
+/// within the given basic block. This establishes the fact Val is not null,
+/// but does not imply that the memory at Val is dereferenceable. (Val may
+/// point off the end of the dereferenceable part of the object.)
+static bool isObjectDereferencedInBlock(Value *Val, BasicBlock *BB) {
+ assert(Val->getType()->isPointerTy());
+
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ Value *UnderlyingVal = GetUnderlyingObject(Val, DL);
+ // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge
+ // inside InstructionDereferencesPointer either.
+ if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, DL, 1))
+ for (Instruction &I : *BB)
+ if (InstructionDereferencesPointer(&I, UnderlyingVal))
+ return true;
+ return false;
+}
+
bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
Value *Val, BasicBlock *BB) {
LVILatticeVal Result; // Start Undefined.
- // If this is a pointer, and there's a load from that pointer in this BB,
- // then we know that the pointer can't be NULL.
- bool NotNull = false;
- if (Val->getType()->isPointerTy()) {
- if (isKnownNonNull(Val)) {
- NotNull = true;
- } else {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- Value *UnderlyingVal = GetUnderlyingObject(Val, DL);
- // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge
- // inside InstructionDereferencesPointer either.
- if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, DL, 1)) {
- for (Instruction &I : *BB) {
- if (InstructionDereferencesPointer(&I, UnderlyingVal)) {
- NotNull = true;
- break;
- }
- }
- }
- }
- }
-
// If this is the entry block, we must be asking about an argument. The
// value is overdefined.
if (BB == &BB->getParent()->getEntryBlock()) {
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
- if (NotNull) {
+ // Before giving up, see if we can prove the pointer non-null local to
+ // this particular block.
+ if (Val->getType()->isPointerTy() &&
+ (isKnownNonNull(Val) || isObjectDereferencedInBlock(Val, BB))) {
PointerType *PTy = cast<PointerType>(Val->getType());
Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
} else {
@@ -715,10 +784,11 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
// to overdefined.
if (Result.isOverdefined()) {
DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because of pred.\n");
- // If we previously determined that this is a pointer that can't be null
- // then return that rather than giving up entirely.
- if (NotNull) {
+ << "' - overdefined because of pred (non local).\n");
+ // Before giving up, see if we can prove the pointer non-null local to
+ // this particular block.
+ if (Val->getType()->isPointerTy() &&
+ isObjectDereferencedInBlock(Val, BB)) {
PointerType *PTy = cast<PointerType>(Val->getType());
Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
}
@@ -760,7 +830,7 @@ bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV,
// to overdefined.
if (Result.isOverdefined()) {
DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because of pred.\n");
+ << "' - overdefined because of pred (local).\n");
BBLV = Result;
return true;
@@ -779,10 +849,9 @@ static bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
LVILatticeVal &Result,
bool isTrueDest = true);
-// If we can determine a constant range for the value Val in the context
-// provided by the instruction BBI, then merge it into BBLV. If we did find a
-// constant range, return true.
-void LazyValueInfoCache::mergeAssumeBlockValueConstantRange(Value *Val,
+// If we can determine a constraint on the value given conditions assumed by
+// the program, intersect that constraint with BBLV.
+void LazyValueInfoCache::intersectAssumeBlockValueConstantRange(Value *Val,
LVILatticeVal &BBLV,
Instruction *BBI) {
BBI = BBI ? BBI : dyn_cast<Instruction>(Val);
@@ -799,46 +868,264 @@ void LazyValueInfoCache::mergeAssumeBlockValueConstantRange(Value *Val,
Value *C = I->getArgOperand(0);
if (ICmpInst *ICI = dyn_cast<ICmpInst>(C)) {
LVILatticeVal Result;
- if (getValueFromFromCondition(Val, ICI, Result)) {
- if (BBLV.isOverdefined())
- BBLV = Result;
- else
- BBLV.mergeIn(Result, DL);
- }
+ if (getValueFromFromCondition(Val, ICI, Result))
+ BBLV = intersect(BBLV, Result);
}
}
}
-bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
- Instruction *BBI,
- BasicBlock *BB) {
- // Figure out the range of the LHS. If that fails, bail.
- if (!hasBlockValue(BBI->getOperand(0), BB)) {
- if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+bool LazyValueInfoCache::solveBlockValueSelect(LVILatticeVal &BBLV,
+ SelectInst *SI, BasicBlock *BB) {
+
+ // Recurse on our inputs if needed
+ if (!hasBlockValue(SI->getTrueValue(), BB)) {
+ if (pushBlockValue(std::make_pair(BB, SI->getTrueValue())))
return false;
BBLV.markOverdefined();
return true;
}
+ LVILatticeVal TrueVal = getBlockValue(SI->getTrueValue(), BB);
+ // If we hit overdefined, don't ask more queries. We want to avoid poisoning
+ // extra slots in the table if we can.
+ if (TrueVal.isOverdefined()) {
+ BBLV.markOverdefined();
+ return true;
+ }
- LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
- mergeAssumeBlockValueConstantRange(BBI->getOperand(0), LHSVal, BBI);
- if (!LHSVal.isConstantRange()) {
+ if (!hasBlockValue(SI->getFalseValue(), BB)) {
+ if (pushBlockValue(std::make_pair(BB, SI->getFalseValue())))
+ return false;
+ BBLV.markOverdefined();
+ return true;
+ }
+ LVILatticeVal FalseVal = getBlockValue(SI->getFalseValue(), BB);
+ // If we hit overdefined, don't ask more queries. We want to avoid poisoning
+ // extra slots in the table if we can.
+ if (FalseVal.isOverdefined()) {
BBLV.markOverdefined();
return true;
}
- ConstantRange LHSRange = LHSVal.getConstantRange();
- ConstantRange RHSRange(1);
- IntegerType *ResultTy = cast<IntegerType>(BBI->getType());
- if (isa<BinaryOperator>(BBI)) {
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(BBI->getOperand(1))) {
- RHSRange = ConstantRange(RHS->getValue());
- } else {
- BBLV.markOverdefined();
- return true;
+ if (TrueVal.isConstantRange() && FalseVal.isConstantRange()) {
+ ConstantRange TrueCR = TrueVal.getConstantRange();
+ ConstantRange FalseCR = FalseVal.getConstantRange();
+ Value *LHS = nullptr;
+ Value *RHS = nullptr;
+ SelectPatternResult SPR = matchSelectPattern(SI, LHS, RHS);
+ // Is this a min specifically of our two inputs? (Avoid the risk of
+ // ValueTracking getting smarter looking back past our immediate inputs.)
+ if (SelectPatternResult::isMinOrMax(SPR.Flavor) &&
+ LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) {
+ switch (SPR.Flavor) {
+ default:
+ llvm_unreachable("unexpected minmax type!");
+ case SPF_SMIN: /// Signed minimum
+ BBLV.markConstantRange(TrueCR.smin(FalseCR));
+ return true;
+ case SPF_UMIN: /// Unsigned minimum
+ BBLV.markConstantRange(TrueCR.umin(FalseCR));
+ return true;
+ case SPF_SMAX: /// Signed maximum
+ BBLV.markConstantRange(TrueCR.smax(FalseCR));
+ return true;
+ case SPF_UMAX: /// Unsigned maximum
+ BBLV.markConstantRange(TrueCR.umax(FalseCR));
+ return true;
+ };
+ }
+
+ // TODO: ABS, NABS from the SelectPatternResult
+ }
+
+ // Can we constrain the facts about the true and false values by using the
+ // condition itself? This shows up with idioms like select(a > 5, a, 5).
+ // TODO: We could potentially refine an overdefined true value above.
+ if (auto *ICI = dyn_cast<ICmpInst>(SI->getCondition())) {
+ LVILatticeVal TrueValTaken, FalseValTaken;
+ if (!getValueFromFromCondition(SI->getTrueValue(), ICI,
+ TrueValTaken, true))
+ TrueValTaken.markOverdefined();
+ if (!getValueFromFromCondition(SI->getFalseValue(), ICI,
+ FalseValTaken, false))
+ FalseValTaken.markOverdefined();
+
+ TrueVal = intersect(TrueVal, TrueValTaken);
+ FalseVal = intersect(FalseVal, FalseValTaken);
+
+
+ // Handle clamp idioms such as:
+ // %24 = constantrange<0, 17>
+ // %39 = icmp eq i32 %24, 0
+ // %40 = add i32 %24, -1
+ // %siv.next = select i1 %39, i32 16, i32 %40
+ // %siv.next = constantrange<0, 17> not <-1, 17>
+ // In general, this can handle any clamp idiom which tests the edge
+ // condition via an equality or inequality.
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *A = ICI->getOperand(0);
+ if (ConstantInt *CIBase = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
+ auto addConstants = [](ConstantInt *A, ConstantInt *B) {
+ assert(A->getType() == B->getType());
+ return ConstantInt::get(A->getType(), A->getValue() + B->getValue());
+ };
+ // See if either input is A + C2, subject to the constraint from the
+ // condition that A != C when that input is used. We can assume that
+ // that input doesn't include C + C2.
+ ConstantInt *CIAdded;
+ switch (Pred) {
+ default: break;
+ case ICmpInst::ICMP_EQ:
+ if (match(SI->getFalseValue(), m_Add(m_Specific(A),
+ m_ConstantInt(CIAdded)))) {
+ auto ResNot = addConstants(CIBase, CIAdded);
+ FalseVal = intersect(FalseVal,
+ LVILatticeVal::getNot(ResNot));
+ }
+ break;
+ case ICmpInst::ICMP_NE:
+ if (match(SI->getTrueValue(), m_Add(m_Specific(A),
+ m_ConstantInt(CIAdded)))) {
+ auto ResNot = addConstants(CIBase, CIAdded);
+ TrueVal = intersect(TrueVal,
+ LVILatticeVal::getNot(ResNot));
+ }
+ break;
+ };
}
}
+ LVILatticeVal Result; // Start Undefined.
+ Result.mergeIn(TrueVal, DL);
+ Result.mergeIn(FalseVal, DL);
+ BBLV = Result;
+ return true;
+}
+
+bool LazyValueInfoCache::solveBlockValueCast(LVILatticeVal &BBLV,
+ Instruction *BBI,
+ BasicBlock *BB) {
+ if (!BBI->getOperand(0)->getType()->isSized()) {
+ // Without knowing how wide the input is, we can't analyze it in any useful
+ // way.
+ BBLV.markOverdefined();
+ return true;
+ }
+
+ // Filter out casts we don't know how to reason about before attempting to
+ // recurse on our operand. This can cut a long search short if we know we're
+ // not going to be able to get any useful information anyway.
+ switch (BBI->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::BitCast:
+ break;
+ default:
+ // Unhandled instructions are overdefined.
+ DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined (unknown cast).\n");
+ BBLV.markOverdefined();
+ return true;
+ }
+
+ // Figure out the range of the LHS. If that fails, we still apply the
+ // transfer rule on the full set since we may be able to locally infer
+ // interesting facts.
+ if (!hasBlockValue(BBI->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ // More work to do before applying this transfer rule.
+ return false;
+
+ const unsigned OperandBitWidth =
+ DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ ConstantRange LHSRange = ConstantRange(OperandBitWidth);
+ if (hasBlockValue(BBI->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
+ intersectAssumeBlockValueConstantRange(BBI->getOperand(0), LHSVal, BBI);
+ if (LHSVal.isConstantRange())
+ LHSRange = LHSVal.getConstantRange();
+ }
+
+ const unsigned ResultBitWidth =
+ cast<IntegerType>(BBI->getType())->getBitWidth();
+
+ // NOTE: We're currently limited by the set of operations that ConstantRange
+ // can evaluate symbolically. Enhancing that set will allow us to analyze
+ // more definitions.
+ LVILatticeVal Result;
+ switch (BBI->getOpcode()) {
+ case Instruction::Trunc:
+ Result.markConstantRange(LHSRange.truncate(ResultBitWidth));
+ break;
+ case Instruction::SExt:
+ Result.markConstantRange(LHSRange.signExtend(ResultBitWidth));
+ break;
+ case Instruction::ZExt:
+ Result.markConstantRange(LHSRange.zeroExtend(ResultBitWidth));
+ break;
+ case Instruction::BitCast:
+ Result.markConstantRange(LHSRange);
+ break;
+ default:
+ // Should be dead if the code above is correct
+ llvm_unreachable("inconsistent with above");
+ break;
+ }
+
+ BBLV = Result;
+ return true;
+}
+
+bool LazyValueInfoCache::solveBlockValueBinaryOp(LVILatticeVal &BBLV,
+ Instruction *BBI,
+ BasicBlock *BB) {
+
+ assert(BBI->getOperand(0)->getType()->isSized() &&
+ "all operands to binary operators are sized");
+
+ // Filter out operators we don't know how to reason about before attempting to
+ // recurse on our operand(s). This can cut a long search short if we know
+ // we're not going to be able to get any useful information anyway.
+ switch (BBI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::And:
+ case Instruction::Or:
+ // continue into the code below
+ break;
+ default:
+ // Unhandled instructions are overdefined.
+ DEBUG(dbgs() << " compute BB '" << BB->getName()
+ << "' - overdefined (unknown binary operator).\n");
+ BBLV.markOverdefined();
+ return true;
+ };
+
+ // Figure out the range of the LHS. If that fails, use a conservative range,
+ // but apply the transfer rule anyway. This lets us pick up facts from
+ // expressions like "and i32 (call i32 @foo()), 32"
+ if (!hasBlockValue(BBI->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ // More work to do before applying this transfer rule.
+ return false;
+
+ const unsigned OperandBitWidth =
+ DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ ConstantRange LHSRange = ConstantRange(OperandBitWidth);
+ if (hasBlockValue(BBI->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
+ intersectAssumeBlockValueConstantRange(BBI->getOperand(0), LHSVal, BBI);
+ if (LHSVal.isConstantRange())
+ LHSRange = LHSVal.getConstantRange();
+ }
+
+ ConstantInt *RHS = cast<ConstantInt>(BBI->getOperand(1));
+ ConstantRange RHSRange = ConstantRange(RHS->getValue());
+
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allows us to analyze
// more definitions.
@@ -862,30 +1149,15 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
case Instruction::LShr:
Result.markConstantRange(LHSRange.lshr(RHSRange));
break;
- case Instruction::Trunc:
- Result.markConstantRange(LHSRange.truncate(ResultTy->getBitWidth()));
- break;
- case Instruction::SExt:
- Result.markConstantRange(LHSRange.signExtend(ResultTy->getBitWidth()));
- break;
- case Instruction::ZExt:
- Result.markConstantRange(LHSRange.zeroExtend(ResultTy->getBitWidth()));
- break;
- case Instruction::BitCast:
- Result.markConstantRange(LHSRange);
- break;
case Instruction::And:
Result.markConstantRange(LHSRange.binaryAnd(RHSRange));
break;
case Instruction::Or:
Result.markConstantRange(LHSRange.binaryOr(RHSRange));
break;
-
- // Unhandled instructions are overdefined.
default:
- DEBUG(dbgs() << " compute BB '" << BB->getName()
- << "' - overdefined because inst def found.\n");
- Result.markOverdefined();
+ // Should be dead if the code above is correct
+ llvm_unreachable("inconsistent with above");
break;
}
@@ -895,10 +1167,11 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
LVILatticeVal &Result, bool isTrueDest) {
- if (ICI && isa<Constant>(ICI->getOperand(1))) {
+ assert(ICI && "precondition");
+ if (isa<Constant>(ICI->getOperand(1))) {
if (ICI->isEquality() && ICI->getOperand(0) == Val) {
// We know that V has the RHS constant if this is a true SETEQ or
- // false SETNE.
+ // false SETNE.
if (isTrueDest == (ICI->getPredicate() == ICmpInst::ICMP_EQ))
Result = LVILatticeVal::get(cast<Constant>(ICI->getOperand(1)));
else
@@ -926,7 +1199,7 @@ bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
// If we're interested in the false dest, invert the condition.
if (!isTrueDest) TrueValues = TrueValues.inverse();
- Result = LVILatticeVal::getRange(TrueValues);
+ Result = LVILatticeVal::getRange(std::move(TrueValues));
return true;
}
}
@@ -935,7 +1208,8 @@ bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
}
/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
-/// Val is not constrained on the edge.
+/// Val is not constrained on the edge. Result is unspecified if the return value
+/// is false.
static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
BasicBlock *BBTo, LVILatticeVal &Result) {
// TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we
@@ -985,7 +1259,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
} else if (i.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
- Result = LVILatticeVal::getRange(EdgesVals);
+ Result = LVILatticeVal::getRange(std::move(EdgesVals));
return true;
}
return false;
@@ -1002,46 +1276,29 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
return true;
}
- if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) {
- if (!Result.isConstantRange() ||
- Result.getConstantRange().getSingleElement())
- return true;
-
- // FIXME: this check should be moved to the beginning of the function when
- // LVI better supports recursive values. Even for the single value case, we
- // can intersect to detect dead code (an empty range).
- if (!hasBlockValue(Val, BBFrom)) {
- if (pushBlockValue(std::make_pair(BBFrom, Val)))
- return false;
- Result.markOverdefined();
- return true;
- }
-
- // Try to intersect ranges of the BB and the constraint on the edge.
- LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
- mergeAssumeBlockValueConstantRange(Val, InBlock, BBFrom->getTerminator());
- // See note on the use of the CxtI with mergeAssumeBlockValueConstantRange,
- // and caching, below.
- mergeAssumeBlockValueConstantRange(Val, InBlock, CxtI);
- if (!InBlock.isConstantRange())
- return true;
+ LVILatticeVal LocalResult;
+ if (!getEdgeValueLocal(Val, BBFrom, BBTo, LocalResult))
+ // If we couldn't constrain the value on the edge, LocalResult doesn't
+ // provide any information.
+ LocalResult.markOverdefined();
- ConstantRange Range =
- Result.getConstantRange().intersectWith(InBlock.getConstantRange());
- Result = LVILatticeVal::getRange(Range);
+ if (hasSingleValue(LocalResult)) {
+ // Can't get any more precise here
+ Result = LocalResult;
return true;
}
if (!hasBlockValue(Val, BBFrom)) {
if (pushBlockValue(std::make_pair(BBFrom, Val)))
return false;
- Result.markOverdefined();
+ // No new information.
+ Result = LocalResult;
return true;
}
- // If we couldn't compute the value on the edge, use the value from the BB.
- Result = getBlockValue(Val, BBFrom);
- mergeAssumeBlockValueConstantRange(Val, Result, BBFrom->getTerminator());
+ // Try to intersect ranges of the BB and the constraint on the edge.
+ LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
+ intersectAssumeBlockValueConstantRange(Val, InBlock, BBFrom->getTerminator());
// We can use the context instruction (generically the ultimate instruction
// the calling pass is trying to simplify) here, even though the result of
// this function is generally cached when called from the solve* functions
@@ -1050,7 +1307,9 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
// functions, the context instruction is not provided. When called from
// LazyValueInfoCache::getValueOnEdge, the context instruction is provided,
// but then the result is not cached.
- mergeAssumeBlockValueConstantRange(Val, Result, CxtI);
+ intersectAssumeBlockValueConstantRange(Val, InBlock, CxtI);
+
+ Result = intersect(LocalResult, InBlock);
return true;
}
@@ -1060,11 +1319,12 @@ LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB,
<< BB->getName() << "'\n");
assert(BlockValueStack.empty() && BlockValueSet.empty());
- pushBlockValue(std::make_pair(BB, V));
-
- solve();
+ if (!hasBlockValue(V, BB)) {
+ pushBlockValue(std::make_pair(BB, V));
+ solve();
+ }
LVILatticeVal Result = getBlockValue(V, BB);
- mergeAssumeBlockValueConstantRange(V, Result, CxtI);
+ intersectAssumeBlockValueConstantRange(V, Result, CxtI);
DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
@@ -1074,10 +1334,13 @@ LVILatticeVal LazyValueInfoCache::getValueAt(Value *V, Instruction *CxtI) {
DEBUG(dbgs() << "LVI Getting value " << *V << " at '"
<< CxtI->getName() << "'\n");
- LVILatticeVal Result;
+ if (auto *C = dyn_cast<Constant>(V))
+ return LVILatticeVal::get(C);
+
+ LVILatticeVal Result = LVILatticeVal::getOverdefined();
if (auto *I = dyn_cast<Instruction>(V))
Result = getFromRangeMetadata(I);
- mergeAssumeBlockValueConstantRange(V, Result, CxtI);
+ intersectAssumeBlockValueConstantRange(V, Result, CxtI);
DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
@@ -1172,29 +1435,32 @@ static LazyValueInfoCache &getCache(void *&PImpl, AssumptionCache *AC,
return *static_cast<LazyValueInfoCache*>(PImpl);
}
-bool LazyValueInfo::runOnFunction(Function &F) {
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+bool LazyValueInfoWrapperPass::runOnFunction(Function &F) {
+ Info.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const DataLayout &DL = F.getParent()->getDataLayout();
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
-
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ Info.DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ Info.TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- if (PImpl)
- getCache(PImpl, AC, &DL, DT).clear();
+ if (Info.PImpl)
+ getCache(Info.PImpl, Info.AC, &DL, Info.DT).clear();
// Fully lazy.
return false;
}
-void LazyValueInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+void LazyValueInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
+LazyValueInfo &LazyValueInfoWrapperPass::getLVI() { return Info; }
+
+LazyValueInfo::~LazyValueInfo() { releaseMemory(); }
+
void LazyValueInfo::releaseMemory() {
// If the cache was allocated, free it.
if (PImpl) {
@@ -1203,6 +1469,16 @@ void LazyValueInfo::releaseMemory() {
}
}
+void LazyValueInfoWrapperPass::releaseMemory() { Info.releaseMemory(); }
+
+LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+
+ return LazyValueInfo(&AC, &TLI, DT);
+}
+
Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
Instruction *CxtI) {
const DataLayout &DL = BB->getModule()->getDataLayout();
@@ -1219,6 +1495,21 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
return nullptr;
}
+ConstantRange LazyValueInfo::getConstantRange(Value *V, BasicBlock *BB,
+ Instruction *CxtI) {
+ assert(V->getType()->isIntegerTy());
+ unsigned Width = V->getType()->getIntegerBitWidth();
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ LVILatticeVal Result =
+ getCache(PImpl, AC, &DL, DT).getValueInBlock(V, BB, CxtI);
+ assert(!Result.isConstant());
+ if (Result.isUndefined())
+ return ConstantRange(Width, /*isFullSet=*/false);
+ if (Result.isConstantRange())
+ return Result.getConstantRange();
+ return ConstantRange(Width, /*isFullSet=*/true);
+}
+
/// Determine whether the specified value is known to be a
/// constant on the specified edge. Return null if not.
Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
@@ -1349,7 +1640,7 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
// We limit the search to one step backwards from the current BB and value.
// We could consider extending this to search further backwards through the
// CFG and/or value graph, but there are non-obvious compile time vs quality
- // tradeoffs.
+ // tradeoffs.
if (CxtI) {
BasicBlock *BB = CxtI->getParent();
@@ -1369,10 +1660,10 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) {
Value *Incoming = PHI->getIncomingValue(i);
BasicBlock *PredBB = PHI->getIncomingBlock(i);
- // Note that PredBB may be BB itself.
+ // Note that PredBB may be BB itself.
Tristate Result = getPredicateOnEdge(Pred, Incoming, C, PredBB, BB,
CxtI);
-
+
// Keep going as long as we've seen a consistent known result for
// all inputs.
Baseline = (i == 0) ? Result /* First iteration */
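
The intersect() helper added in the LazyValueInfo changes above combines two independently derived facts about the same value at the same point: undefined wins outright, single values cannot be refined further, overdefined contributes nothing, and two ranges are intersected. Below is an editor's toy model of that precedence order, not part of the patch, using a half-open integer interval in place of ConstantRange; the ToyLattice type and intersectToy name are hypothetical and only sketch the rule, not LVI's actual representation.

#include <algorithm>
#include <cstdint>

struct ToyLattice {
  enum Kind { Undefined, Range, Overdefined } K = Undefined;
  int64_t Lo = 0, Hi = 0;                 // valid only when K == Range, [Lo, Hi)

  static ToyLattice undefined() { return {}; }
  static ToyLattice overdefined() { return {Overdefined, 0, 0}; }
  static ToyLattice range(int64_t L, int64_t H) { return {Range, L, H}; }
  bool isSingleValue() const { return K == Range && Hi - Lo == 1; }
};

// Combine two facts about the same value at the same program point.
// Undefined is the strongest state (unreachable code), overdefined the
// weakest; a single value cannot be refined; otherwise intersect the ranges.
ToyLattice intersectToy(ToyLattice A, ToyLattice B) {
  if (A.K == ToyLattice::Undefined) return A;
  if (B.K == ToyLattice::Undefined) return B;
  if (A.K == ToyLattice::Overdefined) return B;
  if (B.K == ToyLattice::Overdefined) return A;
  if (A.isSingleValue()) return A;
  if (B.isSingleValue()) return B;
  int64_t Lo = std::max(A.Lo, B.Lo), Hi = std::min(A.Hi, B.Hi);
  // An empty intersection signals a conflict (unreachable code); the patch
  // maps this case to overdefined inside LVI rather than claiming undefined.
  if (Lo >= Hi) return ToyLattice::overdefined();
  return ToyLattice::range(Lo, Hi);
}
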
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 2dfb09c95ad6..fdf5f55dab9f 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -435,7 +435,7 @@ void Lint::visitMemoryReference(Instruction &I,
// If the global may be defined differently in another compilation unit
// then don't warn about funky memory accesses.
if (GV->hasDefinitiveInitializer()) {
- Type *GTy = GV->getType()->getElementType();
+ Type *GTy = GV->getValueType();
if (GTy->isSized())
BaseSize = DL->getTypeAllocSize(GTy);
BaseAlign = GV->getAlignment();
@@ -642,8 +642,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
if (!VisitedBlocks.insert(BB).second)
break;
if (Value *U =
- FindAvailableLoadedValue(L->getPointerOperand(),
- BB, BBI, DefMaxInstsToScan, AA))
+ FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, AA))
return findValueImpl(U, OffsetOk, Visited);
if (BBI != BB->begin()) break;
BB = BB->getUniquePredecessor();
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index 4b2fa3c6505a..75426b54195a 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -21,8 +21,125 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Statepoint.h"
+
using namespace llvm;
+static bool isAligned(const Value *Base, const APInt &Offset, unsigned Align,
+ const DataLayout &DL) {
+ APInt BaseAlign(Offset.getBitWidth(), Base->getPointerAlignment(DL));
+
+ if (!BaseAlign) {
+ Type *Ty = Base->getType()->getPointerElementType();
+ if (!Ty->isSized())
+ return false;
+ BaseAlign = DL.getABITypeAlignment(Ty);
+ }
+
+ APInt Alignment(Offset.getBitWidth(), Align);
+
+ assert(Alignment.isPowerOf2() && "must be a power of 2!");
+ return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1));
+}
+
+static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) {
+ Type *Ty = Base->getType();
+ assert(Ty->isSized() && "must be sized");
+ APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0);
+ return isAligned(Base, Offset, Align, DL);
+}
+
+/// Test if V is always a pointer to allocated and suitably aligned memory for
+/// a simple load or store.
+static bool isDereferenceableAndAlignedPointer(
+ const Value *V, unsigned Align, const APInt &Size, const DataLayout &DL,
+ const Instruction *CtxI, const DominatorTree *DT,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ // Note that it is not safe to speculate into a malloc'd region because
+ // malloc may return null.
+
+ // bitcast instructions are no-ops as far as dereferenceability is concerned.
+ if (const BitCastOperator *BC = dyn_cast<BitCastOperator>(V))
+ return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, Size,
+ DL, CtxI, DT, Visited);
+
+ bool CheckForNonNull = false;
+ APInt KnownDerefBytes(Size.getBitWidth(),
+ V->getPointerDereferenceableBytes(DL, CheckForNonNull));
+ if (KnownDerefBytes.getBoolValue()) {
+ if (KnownDerefBytes.uge(Size))
+ if (!CheckForNonNull || isKnownNonNullAt(V, CtxI, DT))
+ return isAligned(V, Align, DL);
+ }
+
+ // For GEPs, determine if the indexing lands within the allocated object.
+ if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ const Value *Base = GEP->getPointerOperand();
+
+ APInt Offset(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.isNegative() ||
+ !Offset.urem(APInt(Offset.getBitWidth(), Align)).isMinValue())
+ return false;
+
+ // If the base pointer is dereferenceable for Offset+Size bytes, then the
+ // GEP (== Base + Offset) is dereferenceable for Size bytes. If the base
+ // pointer is aligned to Align bytes, and the Offset is divisible by Align
+ // then the GEP (== Base + Offset == k_0 * Align + k_1 * Align) is also
+ // aligned to Align bytes.
+
+ return Visited.insert(Base).second &&
+ isDereferenceableAndAlignedPointer(Base, Align, Offset + Size, DL,
+ CtxI, DT, Visited);
+ }
+
+ // For gc.relocate, look through relocations
+ if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V))
+ return isDereferenceableAndAlignedPointer(
+ RelocateInst->getDerivedPtr(), Align, Size, DL, CtxI, DT, Visited);
+
+ if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
+ return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, Size,
+ DL, CtxI, DT, Visited);
+
+ if (auto CS = ImmutableCallSite(V))
+ if (const Value *RV = CS.getReturnedArgOperand())
+ return isDereferenceableAndAlignedPointer(RV, Align, Size, DL, CtxI, DT,
+ Visited);
+
+ // If we don't know, assume the worst.
+ return false;
+}
+
+bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
+ const DataLayout &DL,
+ const Instruction *CtxI,
+ const DominatorTree *DT) {
+ // When dereferenceability information is provided by a dereferenceable
+ // attribute, we know exactly how many bytes are dereferenceable. If we can
+ // determine the exact offset to the attributed variable, we can use that
+ // information here.
+ Type *VTy = V->getType();
+ Type *Ty = VTy->getPointerElementType();
+
+ // Require ABI alignment for loads without alignment specification
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(Ty);
+
+ if (!Ty->isSized())
+ return false;
+
+ SmallPtrSet<const Value *, 32> Visited;
+ return ::isDereferenceableAndAlignedPointer(
+ V, Align, APInt(DL.getTypeSizeInBits(VTy), DL.getTypeStoreSize(Ty)), DL,
+ CtxI, DT, Visited);
+}
+
+bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL,
+ const Instruction *CtxI,
+ const DominatorTree *DT) {
+ return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT);
+}
+
/// \brief Test if A and B will obviously have the same value.
///
/// This includes recognizing that %t0 and %t1 will have the same
@@ -56,21 +173,29 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
/// \brief Check if executing a load of this pointer value cannot trap.
///
+/// If DT and ScanFrom are specified, this method performs context-sensitive
+/// analysis and returns true if it is safe to load immediately before ScanFrom.
+///
/// If it is not obviously safe to load from the specified pointer, we do
/// a quick local scan of the basic block containing \c ScanFrom, to determine
/// if the address is already accessed.
///
/// This uses the pointee type to determine how many bytes need to be safe to
/// load from the pointer.
-bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
- unsigned Align) {
- const DataLayout &DL = ScanFrom->getModule()->getDataLayout();
-
+bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align,
+ const DataLayout &DL,
+ Instruction *ScanFrom,
+ const DominatorTree *DT) {
// Zero alignment means that the load has the ABI alignment for the target
if (Align == 0)
Align = DL.getABITypeAlignment(V->getType()->getPointerElementType());
assert(isPowerOf2_32(Align));
+ // If DT is not specified, we can't make a context-sensitive query.
+ const Instruction* CtxI = DT ? ScanFrom : nullptr;
+ if (isDereferenceableAndAlignedPointer(V, Align, DL, CtxI, DT))
+ return true;
+
int64_t ByteOffset = 0;
Value *Base = V;
Base = GetPointerBaseWithConstantOffset(V, ByteOffset, DL);
@@ -86,9 +211,9 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
BaseAlign = AI->getAlignment();
} else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
// Global variables are not necessarily safe to load from if they are
- // overridden. Their size may change or they may be weak and require a test
- // to determine if they were in fact provided.
- if (!GV->mayBeOverridden()) {
+ // interposed arbitrarily. Their size may change or they may be weak and
+ // require a test to determine if they were in fact provided.
+ if (!GV->isInterposable()) {
BaseType = GV->getType()->getElementType();
BaseAlign = GV->getAlignment();
}
@@ -113,6 +238,9 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
}
}
+ if (!ScanFrom)
+ return false;
+
// Otherwise, be a little bit aggressive by scanning the local block where we
// want to check to see if the pointer is already being loaded or stored
// from/to. If so, the previous load or store would have already trapped,
@@ -174,33 +302,24 @@ llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden,
"to scan backward from a given instruction, when searching for "
"available loaded value"));
-/// \brief Scan the ScanBB block backwards to see if we have the value at the
-/// memory address *Ptr locally available within a small number of instructions.
-///
-/// The scan starts from \c ScanFrom. \c MaxInstsToScan specifies the maximum
-/// instructions to scan in the block. If it is set to \c 0, it will scan the whole
-/// block.
-///
-/// If the value is available, this function returns it. If not, it returns the
-/// iterator for the last validated instruction that the value would be live
-/// through. If we scanned the entire block and didn't find something that
-/// invalidates \c *Ptr or provides it, \c ScanFrom is left at the last
-/// instruction processed and this returns null.
-///
-/// You can also optionally specify an alias analysis implementation, which
-/// makes this more precise.
-///
-/// If \c AATags is non-null and a load or store is found, the AA tags from the
-/// load or store are recorded there. If there are no AA tags or if no access is
-/// found, it is left unmodified.
-Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
+Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
BasicBlock::iterator &ScanFrom,
unsigned MaxInstsToScan,
- AliasAnalysis *AA, AAMDNodes *AATags) {
+ AliasAnalysis *AA, AAMDNodes *AATags,
+ bool *IsLoadCSE) {
if (MaxInstsToScan == 0)
MaxInstsToScan = ~0U;
- Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+ Value *Ptr = Load->getPointerOperand();
+ Type *AccessTy = Load->getType();
+
+ // We can never remove a volatile load
+ if (Load->isVolatile())
+ return nullptr;
+
+ // Anything stronger than unordered is currently unimplemented.
+ if (!Load->isUnordered())
+ return nullptr;
const DataLayout &DL = ScanBB->getModule()->getDataLayout();
@@ -231,8 +350,16 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
if (AreEquivalentAddressValues(
LI->getPointerOperand()->stripPointerCasts(), StrippedPtr) &&
CastInst::isBitOrNoopPointerCastable(LI->getType(), AccessTy, DL)) {
+
+ // We can value forward from an atomic to a non-atomic, but not the
+ // other way around.
+ if (LI->isAtomic() < Load->isAtomic())
+ return nullptr;
+
if (AATags)
LI->getAAMetadata(*AATags);
+ if (IsLoadCSE)
+ *IsLoadCSE = true;
return LI;
}
@@ -244,6 +371,12 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
if (AreEquivalentAddressValues(StorePtr, StrippedPtr) &&
CastInst::isBitOrNoopPointerCastable(SI->getValueOperand()->getType(),
AccessTy, DL)) {
+
+ // We can value forward from an atomic to a non-atomic, but not the
+ // other way around.
+ if (SI->isAtomic() < Load->isAtomic())
+ return nullptr;
+
if (AATags)
SI->getAAMetadata(*AATags);
return SI->getOperand(0);
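As a usage sketch of the updated FindAvailableLoadedValue interface above (the caller, its name, and the scan limit are hypothetical; only the signature introduced in this patch is assumed):

// Hypothetical caller: try to reuse an earlier load or store of the same
// address before emitting a redundant load. Only the signature above is
// assumed; everything else is illustrative.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Value *tryForwardValue(LoadInst *Load, AliasAnalysis *AA) {
  BasicBlock *ScanBB = Load->getParent();
  BasicBlock::iterator ScanFrom = Load->getIterator();
  AAMDNodes AATags;
  bool IsLoadCSE = false;
  // Scan at most 6 instructions backwards from the load itself.
  if (Value *V = FindAvailableLoadedValue(Load, ScanBB, ScanFrom,
                                          /*MaxInstsToScan=*/6, AA, &AATags,
                                          &IsLoadCSE))
    return V; // IsLoadCSE says whether the value came from another load.
  return nullptr;
}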
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 8bcdcb862014..0d774cf08e2f 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -14,15 +14,17 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-accesses"
@@ -65,6 +67,28 @@ static cl::opt<unsigned>
"loop-access analysis (default = 100)"),
cl::init(100));
+/// This enables versioning on the strides of symbolically striding memory
+/// accesses in code like the following.
+/// for (i = 0; i < N; ++i)
+/// A[i * Stride1] += B[i * Stride2] ...
+///
+/// Will be roughly translated to
+/// if (Stride1 == 1 && Stride2 == 1) {
+/// for (i = 0; i < N; i+=4)
+/// A[i:i+3] += ...
+/// } else
+/// ...
+static cl::opt<bool> EnableMemAccessVersioning(
+ "enable-mem-access-versioning", cl::init(true), cl::Hidden,
+ cl::desc("Enable symbolic stride memory access versioning"));
+
+/// \brief Enable store-to-load forwarding conflict detection. This option can
+/// be disabled for correctness testing.
+static cl::opt<bool> EnableForwardingConflictDetection(
+ "store-to-load-forwarding-conflict-detection", cl::Hidden,
+ cl::desc("Enable conflict detection in loop-access analysis"),
+ cl::init(true));
+
bool VectorizerParams::isInterleaveForced() {
return ::VectorizationInterleave.getNumOccurrences() > 0;
}
@@ -81,7 +105,7 @@ void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
}
Value *llvm::stripIntegerCast(Value *V) {
- if (CastInst *CI = dyn_cast<CastInst>(V))
+ if (auto *CI = dyn_cast<CastInst>(V))
if (CI->getOperand(0)->getType()->isIntegerTy())
return CI->getOperand(0);
return V;
@@ -130,26 +154,34 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
PredicatedScalarEvolution &PSE) {
// Get the stride replaced scev.
const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
- assert(AR && "Invalid addrec expression");
ScalarEvolution *SE = PSE.getSE();
- const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
- const SCEV *ScStart = AR->getStart();
- const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
- const SCEV *Step = AR->getStepRecurrence(*SE);
+ const SCEV *ScStart;
+ const SCEV *ScEnd;
- // For expressions with negative step, the upper bound is ScStart and the
- // lower bound is ScEnd.
- if (const SCEVConstant *CStep = dyn_cast<const SCEVConstant>(Step)) {
- if (CStep->getValue()->isNegative())
- std::swap(ScStart, ScEnd);
- } else {
- // Fallback case: the step is not constant, but the we can still
- // get the upper and lower bounds of the interval by using min/max
- // expressions.
- ScStart = SE->getUMinExpr(ScStart, ScEnd);
- ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
+ if (SE->isLoopInvariant(Sc, Lp))
+ ScStart = ScEnd = Sc;
+ else {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ assert(AR && "Invalid addrec expression");
+ const SCEV *Ex = PSE.getBackedgeTakenCount();
+
+ ScStart = AR->getStart();
+ ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ // For expressions with negative step, the upper bound is ScStart and the
+ // lower bound is ScEnd.
+ if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
+ if (CStep->getValue()->isNegative())
+ std::swap(ScStart, ScEnd);
+ } else {
+ // Fallback case: the step is not constant, but we can still
+ // get the upper and lower bounds of the interval by using min/max
+ // expressions.
+ ScStart = SE->getUMinExpr(ScStart, ScEnd);
+ ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
+ }
}
Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
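To make the interval computation above concrete, here is a standalone arithmetic model (plain C++, not the SCEV API; Base, Step and BTC stand in for the add-rec pieces):

// Standalone model of the [ScStart, ScEnd] bounds computed above. A pointer
// that advances by Step bytes per iteration over a loop with backedge-taken
// count BTC spans the values Base .. Base + Step*BTC, with the endpoints
// swapped for a negative step. (The real code falls back to umin/umax
// expressions when the step is not a compile-time constant.)
#include <algorithm>
#include <cstdint>

struct Interval { int64_t Start, End; };

static Interval accessBounds(int64_t Base, int64_t Step, int64_t BTC) {
  if (Step == 0)                     // loop-invariant pointer: trivial bounds
    return {Base, Base};
  int64_t Start = Base;              // AR->getStart()
  int64_t End = Base + Step * BTC;   // AR->evaluateAtIteration(BTC)
  if (Step < 0)                      // negative step: swap the bounds
    std::swap(Start, End);
  return {Start, End};
}
// E.g. Base = 0, Step = +4, BTC = 99 gives the byte interval [0, 396].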
@@ -452,7 +484,7 @@ public:
/// (i.e. the pointers have computable bounds).
bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
Loop *TheLoop, const ValueToValueMap &Strides,
- bool ShouldCheckStride = false);
+ bool ShouldCheckWrap = false);
/// \brief Goes over all memory accesses, checks whether a RT check is needed
/// and builds sets of dependent accesses.
@@ -524,6 +556,11 @@ static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
const ValueToValueMap &Strides, Value *Ptr,
Loop *L) {
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+
+ // The bounds for loop-invariant pointer is trivial.
+ if (PSE.getSE()->isLoopInvariant(PtrScev, L))
+ return true;
+
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (!AR)
return false;
@@ -531,10 +568,21 @@ static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
return AR->isAffine();
}
+/// \brief Check whether a pointer address cannot wrap.
+static bool isNoWrap(PredicatedScalarEvolution &PSE,
+ const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
+ const SCEV *PtrScev = PSE.getSCEV(Ptr);
+ if (PSE.getSE()->isLoopInvariant(PtrScev, L))
+ return true;
+
+ int64_t Stride = getPtrStride(PSE, Ptr, L, Strides);
+ return Stride == 1;
+}
+
bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
ScalarEvolution *SE, Loop *TheLoop,
const ValueToValueMap &StridesMap,
- bool ShouldCheckStride) {
+ bool ShouldCheckWrap) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
@@ -569,8 +617,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) &&
// When we run after a failing dependency check we have to make sure
// we don't have wrapping pointers.
- (!ShouldCheckStride ||
- isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) {
+ (!ShouldCheckWrap || isNoWrap(PSE, StridesMap, Ptr, TheLoop))) {
// The id of the dependence set.
unsigned DepId;
@@ -773,7 +820,7 @@ static bool isInBoundsGep(Value *Ptr) {
/// \brief Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
/// i.e. monotonically increasing/decreasing.
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
- ScalarEvolution *SE, const Loop *L) {
+ PredicatedScalarEvolution &PSE, const Loop *L) {
// FIXME: This should probably only return true for NUW.
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
return true;
@@ -792,11 +839,11 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
// Make sure there is only one non-const index and analyze that.
Value *NonConstIndex = nullptr;
- for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
- if (!isa<ConstantInt>(*Index)) {
+ for (Value *Index : make_range(GEP->idx_begin(), GEP->idx_end()))
+ if (!isa<ConstantInt>(Index)) {
if (NonConstIndex)
return false;
- NonConstIndex = *Index;
+ NonConstIndex = Index;
}
if (!NonConstIndex)
// The recurrence is on the pointer, ignore for now.
@@ -809,7 +856,7 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
// Assume the other operand is constant so that the AddRec can be
// easily found.
isa<ConstantInt>(OBO->getOperand(1))) {
- auto *OpScev = SE->getSCEV(OBO->getOperand(0));
+ auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
@@ -819,32 +866,36 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
}
/// \brief Check whether the access through \p Ptr has a constant stride.
-int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
- const Loop *Lp, const ValueToValueMap &StridesMap) {
+int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
+ const Loop *Lp, const ValueToValueMap &StridesMap,
+ bool Assume) {
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to aggregate types.
auto *PtrTy = cast<PointerType>(Ty);
if (PtrTy->getElementType()->isAggregateType()) {
- DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
- << *Ptr << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" << *Ptr
+ << "\n");
return 0;
}
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ if (Assume && !AR)
+ AR = PSE.getAsAddRec(Ptr);
+
if (!AR) {
- DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer "
- << *Ptr << " SCEV: " << *PtrScev << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
+ << " SCEV: " << *PtrScev << "\n");
return 0;
}
// The access function must stride over the innermost loop.
if (Lp != AR->getLoop()) {
DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " <<
- *Ptr << " SCEV: " << *PtrScev << "\n");
+ *Ptr << " SCEV: " << *AR << "\n");
return 0;
}
@@ -856,12 +907,23 @@ int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
// to access the pointer value "0" which is undefined behavior in address
// space 0, therefore we can also vectorize this case.
bool IsInBoundsGEP = isInBoundsGep(Ptr);
- bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, PSE.getSE(), Lp);
+ bool IsNoWrapAddRec =
+ PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
+ isNoWrapAddRec(Ptr, AR, PSE, Lp);
bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
- DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
- << *Ptr << " SCEV: " << *PtrScev << "\n");
- return 0;
+ if (Assume) {
+ PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
+ IsNoWrapAddRec = true;
+ DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
+ << "LAA: Pointer: " << *Ptr << "\n"
+ << "LAA: SCEV: " << *AR << "\n"
+ << "LAA: Added an overflow assumption\n");
+ } else {
+ DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
+ << *Ptr << " SCEV: " << *AR << "\n");
+ return 0;
+ }
}
// Check the step is constant.
@@ -871,7 +933,7 @@ int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C) {
DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
- " SCEV: " << *PtrScev << "\n");
+ " SCEV: " << *AR << "\n");
return 0;
}
@@ -895,12 +957,94 @@ int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
// know we can't "wrap around the address space". In case of address space
// zero we know that this won't happen without triggering undefined behavior.
if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
- Stride != 1 && Stride != -1)
- return 0;
+ Stride != 1 && Stride != -1) {
+ if (Assume) {
+ // We can avoid this case by adding a run-time check.
+ DEBUG(dbgs() << "LAA: Non-unit strided pointer which is neither "
+ << "inbounds nor in address space 0 may wrap:\n"
+ << "LAA: Pointer: " << *Ptr << "\n"
+ << "LAA: SCEV: " << *AR << "\n"
+ << "LAA: Added an overflow assumption\n");
+ PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
+ } else
+ return 0;
+ }
return Stride;
}
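A minimal sketch of a client of the renamed helper (the wrapper and its name are hypothetical; only the getPtrStride signature above is assumed):

// Hypothetical wrapper around getPtrStride(). With Assume = true the call may
// record SCEV wrap predicates through PSE instead of giving up when
// non-wrapping cannot be proven statically.
#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

static bool isUnitStrided(PredicatedScalarEvolution &PSE, Value *Ptr,
                          const Loop *Lp, const ValueToValueMap &Strides) {
  int64_t Stride = getPtrStride(PSE, Ptr, Lp, Strides, /*Assume=*/true);
  // 0 means "no constant stride"; +1/-1 are forward/backward unit strides.
  return Stride == 1 || Stride == -1;
}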
+/// Take the pointer operand from the Load/Store instruction.
+/// Returns NULL if this is not a valid Load/Store instruction.
+static Value *getPointerOperand(Value *I) {
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+/// Take the address space operand from the Load/Store instruction.
+/// Returns -1 if this is not a valid Load/Store instruction.
+static unsigned getAddressSpaceOperand(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+/// Returns true if the memory operations \p A and \p B are consecutive.
+bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
+ ScalarEvolution &SE, bool CheckType) {
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpaceOperand(A);
+ unsigned ASB = getAddressSpaceOperand(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers.
+ if (PtrA == PtrB)
+ return false;
+
+ // Make sure that A and B have the same type if required.
+ if (CheckType && PtrA->getType() != PtrB->getType())
+ return false;
+
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ // OffsetDelta = OffsetB - OffsetA;
+ const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
+ const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
+ const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+ const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
+ const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == Size;
+
+ // Compute the base pointer delta needed to make the final delta equal to
+ // the size.
+ // BaseDelta = Size - OffsetDelta;
+ const SCEV *SizeSCEV = SE.getConstant(Size);
+ const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
+
+ // Otherwise compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
+ return X == PtrSCEVB;
+}
+
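Stripped of SCEV, the core of the check above is: after accumulating constant in-bounds offsets, two accesses with the same base are consecutive exactly when the byte distance from A to B equals the store size of the accessed type. A standalone model (not the LLVM API):

// Standalone model of the same-base fast path of isConsecutiveAccess().
#include <cstdint>

static bool consecutiveSameBase(int64_t OffsetA, int64_t OffsetB,
                                int64_t StoreSizeInBytes) {
  // OffsetDelta = OffsetB - OffsetA must equal the element's store size.
  return OffsetB - OffsetA == StoreSizeInBytes;
}
// E.g. &A[3] and &A[4] with 4-byte floats: 16 - 12 == 4, so they are
// consecutive; &A[4] and &A[3] are not (the delta is -4). When the bases
// differ, the function above instead asks SCEV whether
// PtrA + (Size - OffsetDelta) equals PtrB.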
bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
switch (Type) {
case NoDep:
@@ -953,8 +1097,8 @@ bool MemoryDepChecker::Dependence::isForward() const {
llvm_unreachable("unexpected DepType!");
}
-bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
- unsigned TypeByteSize) {
+bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
+ uint64_t TypeByteSize) {
// If loads occur at a distance that is not a multiple of a feasible vector
// factor store-load forwarding does not take place.
// Positive dependences might cause troubles because vectorizing them might
@@ -964,30 +1108,34 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
// hence on your typical architecture store-load forwarding does not take
// place. Vectorizing in such cases does not make sense.
// Store-load forwarding distance.
- const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
+
+ // After this many iterations store-to-load forwarding conflicts should not
+ // cause any slowdowns.
+ const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
- unsigned MaxVFWithoutSLForwardIssues =
- VectorizerParams::MaxVectorWidth * TypeByteSize;
- if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
- MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
-
- for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
- vf *= 2) {
- if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
- MaxVFWithoutSLForwardIssues = (vf >>=1);
+ uint64_t MaxVFWithoutSLForwardIssues = std::min(
+ VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
+
+ // Compute the smallest VF at which the store and load would be misaligned.
+ for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
+ VF *= 2) {
+ // If the number of vector iterations between the store and the load is
+ // small, we could incur conflicts.
+ if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
+ MaxVFWithoutSLForwardIssues = (VF >>= 1);
break;
}
}
- if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
- DEBUG(dbgs() << "LAA: Distance " << Distance <<
- " that could cause a store-load forwarding conflict\n");
+ if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+ DEBUG(dbgs() << "LAA: Distance " << Distance
+ << " that could cause a store-load forwarding conflict\n");
return true;
}
if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
MaxVFWithoutSLForwardIssues !=
- VectorizerParams::MaxVectorWidth * TypeByteSize)
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
return false;
}
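A standalone restatement of the vectorization-factor search above with concrete numbers (MaxVectorWidth = 64 and an unbounded dependence-distance cap are assumptions of this sketch):

// Standalone model of couldPreventStoreLoadForward(). Returns true when every
// power-of-two VF down to 2*TypeByteSize would place the store and load at a
// distance that defeats store-to-load forwarding.
#include <cstdint>

static bool mayPreventForwarding(uint64_t Distance, uint64_t TypeByteSize) {
  const uint64_t NumIters = 8 * TypeByteSize;  // forwarding latency model
  uint64_t MaxVF = 64 * TypeByteSize;          // assumed MaxVectorWidth cap
  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVF; VF *= 2)
    if (Distance % VF && Distance / VF < NumIters) {
      MaxVF = VF / 2;
      break;
    }
  return MaxVF < 2 * TypeByteSize;
}
// Distance = 24, TypeByteSize = 4: VF = 8 divides 24; VF = 16 leaves a
// remainder and 24/16 = 1 < 32, so MaxVF drops to 8 -- still >= 8, no conflict.
// Distance = 4, TypeByteSize = 4: already VF = 8 leaves a remainder and
// 4/8 = 0 < 32, so MaxVF drops to 4 < 8 and the distance is flagged.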
@@ -997,8 +1145,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
/// bytes.
///
/// \returns true if they are independent.
-static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
- unsigned TypeByteSize) {
+static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
+ uint64_t TypeByteSize) {
assert(Stride > 1 && "The stride must be greater than 1");
assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
assert(Distance > 0 && "The distance must be non-zero");
@@ -1007,7 +1155,7 @@ static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
if (Distance % TypeByteSize)
return false;
- unsigned ScaledDist = Distance / TypeByteSize;
+ uint64_t ScaledDist = Distance / TypeByteSize;
// No dependence if the scaled distance is not a multiple of the stride.
// E.g.
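(The hunk ends before the comment's own worked example.) As a hedged standalone sketch, assuming the function finishes with the divisibility test the comment describes:

// Standalone model of areStridedAccessesIndependent(), assuming the body ends
// with the divisibility test described in the comment above.
#include <cstdint>

static bool stridedIndependent(uint64_t Distance, uint64_t Stride,
                               uint64_t TypeByteSize) {
  if (Distance % TypeByteSize)   // distance is not a whole number of elements
    return false;
  uint64_t ScaledDist = Distance / TypeByteSize;
  return ScaledDist % Stride != 0;  // independent if not a stride multiple
}
// E.g. A[i] and A[i+2] accessed with stride 4 and 4-byte elements:
// Distance = 8, ScaledDist = 2, and 2 % 4 != 0, so the two access streams
// interleave without ever touching the same element.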
@@ -1048,20 +1196,15 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
BPtr->getType()->getPointerAddressSpace())
return Dependence::Unknown;
- const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr);
- const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr);
-
- int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides);
- int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides);
+ int64_t StrideAPtr = getPtrStride(PSE, APtr, InnermostLoop, Strides, true);
+ int64_t StrideBPtr = getPtrStride(PSE, BPtr, InnermostLoop, Strides, true);
- const SCEV *Src = AScev;
- const SCEV *Sink = BScev;
+ const SCEV *Src = PSE.getSCEV(APtr);
+ const SCEV *Sink = PSE.getSCEV(BPtr);
// If the induction step is negative we have to invert source and sink of the
// dependence.
if (StrideAPtr < 0) {
- //Src = BScev;
- //Sink = AScev;
std::swap(APtr, BPtr);
std::swap(Src, Sink);
std::swap(AIsWrite, BIsWrite);
@@ -1094,18 +1237,30 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
Type *ATy = APtr->getType()->getPointerElementType();
Type *BTy = BPtr->getType()->getPointerElementType();
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
- unsigned TypeByteSize = DL.getTypeAllocSize(ATy);
+ uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
- // Negative distances are not plausible dependencies.
const APInt &Val = C->getAPInt();
+ int64_t Distance = Val.getSExtValue();
+ uint64_t Stride = std::abs(StrideAPtr);
+
+ // Attempt to prove strided accesses independent.
+ if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
+ areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
+ DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
+ return Dependence::NoDep;
+ }
+
+ // Negative distances are not plausible dependencies.
if (Val.isNegative()) {
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
- if (IsTrueDataDependence &&
+ if (IsTrueDataDependence && EnableForwardingConflictDetection &&
(couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
- ATy != BTy))
+ ATy != BTy)) {
+ DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
return Dependence::ForwardButPreventsForwarding;
+ }
- DEBUG(dbgs() << "LAA: Dependence is negative: NoDep\n");
+ DEBUG(dbgs() << "LAA: Dependence is negative\n");
return Dependence::Forward;
}
@@ -1126,15 +1281,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::Unknown;
}
- unsigned Distance = (unsigned) Val.getZExtValue();
-
- unsigned Stride = std::abs(StrideAPtr);
- if (Stride > 1 &&
- areStridedAccessesIndependent(Distance, Stride, TypeByteSize)) {
- DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
- return Dependence::NoDep;
- }
-
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
@@ -1169,9 +1315,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.
- unsigned MinDistanceNeeded =
+ uint64_t MinDistanceNeeded =
TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
- if (MinDistanceNeeded > Distance) {
+ if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
<< '\n');
return Dependence::Backward;
@@ -1201,10 +1347,10 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// is 8, which is less than 2 and forbids vectorization. But actually
// both A and B could be vectorized by 2 iterations.
MaxSafeDepDistBytes =
- Distance < MaxSafeDepDistBytes ? Distance : MaxSafeDepDistBytes;
+ std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
- if (IsTrueDataDependence &&
+ if (IsTrueDataDependence && EnableForwardingConflictDetection &&
couldPreventStoreLoadForward(Distance, TypeByteSize))
return Dependence::BackwardVectorizableButPreventsForwarding;
@@ -1219,7 +1365,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
MemAccessInfoSet &CheckDeps,
const ValueToValueMap &Strides) {
- MaxSafeDepDistBytes = -1U;
+ MaxSafeDepDistBytes = -1;
while (!CheckDeps.empty()) {
MemAccessInfo CurAccess = *CheckDeps.begin();
@@ -1228,8 +1374,10 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
// Check accesses within this set.
- EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
- AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+ EquivalenceClasses<MemAccessInfo>::member_iterator AI =
+ AccessSets.member_begin(I);
+ EquivalenceClasses<MemAccessInfo>::member_iterator AE =
+ AccessSets.member_end();
// Check every access pair.
while (AI != AE) {
@@ -1305,10 +1453,11 @@ void MemoryDepChecker::Dependence::print(
bool LoopAccessInfo::canAnalyzeLoop() {
// We need to have a loop header.
- DEBUG(dbgs() << "LAA: Found a loop: " <<
- TheLoop->getHeader()->getName() << '\n');
+ DEBUG(dbgs() << "LAA: Found a loop in "
+ << TheLoop->getHeader()->getParent()->getName() << ": "
+ << TheLoop->getHeader()->getName() << '\n');
- // We can only analyze innermost loops.
+ // We can only analyze innermost loops.
if (!TheLoop->empty()) {
DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
emitAnalysis(LoopAccessReport() << "loop is not the innermost loop");
@@ -1345,8 +1494,8 @@ bool LoopAccessInfo::canAnalyzeLoop() {
}
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop);
- if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
+ const SCEV *ExitCount = PSE->getBackedgeTakenCount();
+ if (ExitCount == PSE->getSE()->getCouldNotCompute()) {
emitAnalysis(LoopAccessReport()
<< "could not determine number of loop iterations");
DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
@@ -1356,41 +1505,37 @@ bool LoopAccessInfo::canAnalyzeLoop() {
return true;
}
-void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
-
- typedef SmallVector<Value*, 16> ValueVector;
+void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
+ const TargetLibraryInfo *TLI,
+ DominatorTree *DT) {
typedef SmallPtrSet<Value*, 16> ValueSet;
- // Holds the Load and Store *instructions*.
- ValueVector Loads;
- ValueVector Stores;
+ // Holds the Load and Store instructions.
+ SmallVector<LoadInst *, 16> Loads;
+ SmallVector<StoreInst *, 16> Stores;
// Holds all the different accesses in the loop.
unsigned NumReads = 0;
unsigned NumReadWrites = 0;
- PtrRtChecking.Pointers.clear();
- PtrRtChecking.Need = false;
+ PtrRtChecking->Pointers.clear();
+ PtrRtChecking->Need = false;
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
// For each block.
- for (Loop::block_iterator bb = TheLoop->block_begin(),
- be = TheLoop->block_end(); bb != be; ++bb) {
-
+ for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the BB and collect legal loads and stores.
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
- ++it) {
-
+ for (Instruction &I : *BB) {
// If this is a load, save it. If this instruction can read from memory
// but is not a load, then we quit. Notice that we don't handle function
// calls that read or write.
- if (it->mayReadFromMemory()) {
+ if (I.mayReadFromMemory()) {
// Many math library functions read the rounding mode. We will only
// vectorize a loop if it contains known function calls that don't set
// the flag. Therefore, it is safe to ignore this read from memory.
- CallInst *Call = dyn_cast<CallInst>(it);
- if (Call && getIntrinsicIDForCall(Call, TLI))
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (Call && getVectorIntrinsicIDForCall(Call, TLI))
continue;
// If the function has an explicit vectorized counterpart, we can safely
@@ -1399,7 +1544,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
continue;
- LoadInst *Ld = dyn_cast<LoadInst>(it);
+ auto *Ld = dyn_cast<LoadInst>(&I);
if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
emitAnalysis(LoopAccessReport(Ld)
<< "read with atomic ordering or volatile read");
@@ -1409,16 +1554,18 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
}
NumLoads++;
Loads.push_back(Ld);
- DepChecker.addAccess(Ld);
+ DepChecker->addAccess(Ld);
+ if (EnableMemAccessVersioning)
+ collectStridedAccess(Ld);
continue;
}
// Save 'store' instructions. Abort if other instructions write to memory.
- if (it->mayWriteToMemory()) {
- StoreInst *St = dyn_cast<StoreInst>(it);
+ if (I.mayWriteToMemory()) {
+ auto *St = dyn_cast<StoreInst>(&I);
if (!St) {
- emitAnalysis(LoopAccessReport(&*it) <<
- "instruction cannot be vectorized");
+ emitAnalysis(LoopAccessReport(St)
+ << "instruction cannot be vectorized");
CanVecMem = false;
return;
}
@@ -1431,7 +1578,9 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
}
NumStores++;
Stores.push_back(St);
- DepChecker.addAccess(St);
+ DepChecker->addAccess(St);
+ if (EnableMemAccessVersioning)
+ collectStridedAccess(St);
}
} // Next instr.
} // Next block.
@@ -1449,7 +1598,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
MemoryDepChecker::DepCandidates DependentAccesses;
AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
- AA, LI, DependentAccesses, PSE);
+ AA, LI, DependentAccesses, *PSE);
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -1458,10 +1607,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
// writes and between reads and writes, but not between reads and reads.
ValueSet Seen;
- ValueVector::iterator I, IE;
- for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
- StoreInst *ST = cast<StoreInst>(*I);
- Value* Ptr = ST->getPointerOperand();
+ for (StoreInst *ST : Stores) {
+ Value *Ptr = ST->getPointerOperand();
// Check for store to loop invariant address.
StoreToLoopInvariantAddress |= isUniform(Ptr);
// If we did *not* see this pointer before, insert it to the read-write
@@ -1488,9 +1635,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
return;
}
- for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
- LoadInst *LD = cast<LoadInst>(*I);
- Value* Ptr = LD->getPointerOperand();
+ for (LoadInst *LD : Loads) {
+ Value *Ptr = LD->getPointerOperand();
// If we did *not* see this pointer before, insert it to the
// read list. If we *did* see it before, then it is already in
// the read-write list. This allows us to vectorize expressions
@@ -1500,7 +1646,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
bool IsReadOnlyPtr = false;
- if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) {
+ if (Seen.insert(Ptr).second ||
+ !getPtrStride(*PSE, Ptr, TheLoop, SymbolicStrides)) {
++NumReads;
IsReadOnlyPtr = true;
}
@@ -1529,8 +1676,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
- bool CanDoRTIfNeeded =
- Accesses.canCheckPtrAtRT(PtrRtChecking, PSE.getSE(), TheLoop, Strides);
+ bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
+ TheLoop, SymbolicStrides);
if (!CanDoRTIfNeeded) {
emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
@@ -1544,22 +1691,22 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
CanVecMem = true;
if (Accesses.isDependencyCheckNeeded()) {
DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
- CanVecMem = DepChecker.areDepsSafe(
- DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
- MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+ CanVecMem = DepChecker->areDepsSafe(
+ DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
+ MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
- if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+ if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
// Clear the dependency checks. We assume they are not needed.
- Accesses.resetDepChecks(DepChecker);
+ Accesses.resetDepChecks(*DepChecker);
- PtrRtChecking.reset();
- PtrRtChecking.Need = true;
+ PtrRtChecking->reset();
+ PtrRtChecking->Need = true;
- auto *SE = PSE.getSE();
- CanDoRTIfNeeded =
- Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true);
+ auto *SE = PSE->getSE();
+ CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
+ SymbolicStrides, true);
// Check that we found the bounds for the pointer.
if (!CanDoRTIfNeeded) {
@@ -1576,11 +1723,15 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
if (CanVecMem)
DEBUG(dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
- << (PtrRtChecking.Need ? "" : " don't")
+ << (PtrRtChecking->Need ? "" : " don't")
<< " need runtime memory checks.\n");
else {
- emitAnalysis(LoopAccessReport() <<
- "unsafe dependent memory operations in loop");
+ emitAnalysis(
+ LoopAccessReport()
+ << "unsafe dependent memory operations in loop. Use "
+ "#pragma loop distribute(enable) to allow loop distribution "
+ "to attempt to isolate the offending operations into a separate "
+ "loop");
DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
}
}
@@ -1600,7 +1751,7 @@ void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) {
}
bool LoopAccessInfo::isUniform(Value *V) const {
- return (PSE.getSE()->isLoopInvariant(PSE.getSE()->getSCEV(V), TheLoop));
+ return (PSE->getSE()->isLoopInvariant(PSE->getSE()->getSCEV(V), TheLoop));
}
// FIXME: this function is currently a duplicate of the one in
@@ -1681,10 +1832,11 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
Instruction *Loc,
const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
const {
- auto *SE = PSE.getSE();
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ auto *SE = PSE->getSE();
SCEVExpander Exp(*SE, DL, "induction");
auto ExpandedChecks =
- expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, PtrRtChecking);
+ expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, *PtrRtChecking);
LLVMContext &Ctx = Loc->getContext();
Instruction *FirstInst = nullptr;
@@ -1740,47 +1892,68 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
std::pair<Instruction *, Instruction *>
LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
- if (!PtrRtChecking.Need)
+ if (!PtrRtChecking->Need)
return std::make_pair(nullptr, nullptr);
- return addRuntimeChecks(Loc, PtrRtChecking.getChecks());
+ return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
+}
+
+void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
+ Value *Ptr = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
+ Ptr = LI->getPointerOperand();
+ else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
+ Ptr = SI->getPointerOperand();
+ else
+ return;
+
+ Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
+ if (!Stride)
+ return;
+
+ DEBUG(dbgs() << "LAA: Found a strided access that we can version");
+ DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
+ SymbolicStrides[Ptr] = Stride;
+ StrideSet.insert(Stride);
}
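For reference, a loop of the shape this versioning targets (plain C++, not LLVM code; names are illustrative):

// A symbolically strided loop: Stride is only known at run time, so LAA
// records it in SymbolicStrides and a vectorizer can guard the vector loop
// with a "Stride == 1" check.
void scaleStrided(float *A, const float *B, int N, int Stride) {
  for (int i = 0; i < N; ++i)
    A[i * Stride] += B[i * Stride];
}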
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
- const DataLayout &DL,
const TargetLibraryInfo *TLI, AliasAnalysis *AA,
- DominatorTree *DT, LoopInfo *LI,
- const ValueToValueMap &Strides)
- : PSE(*SE), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL),
- TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
- MaxSafeDepDistBytes(-1U), CanVecMem(false),
+ DominatorTree *DT, LoopInfo *LI)
+ : PSE(llvm::make_unique<PredicatedScalarEvolution>(*SE, *L)),
+ PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
+ DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
+ NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
StoreToLoopInvariantAddress(false) {
if (canAnalyzeLoop())
- analyzeLoop(Strides);
+ analyzeLoop(AA, LI, TLI, DT);
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
if (CanVecMem) {
- if (PtrRtChecking.Need)
- OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
- else
- OS.indent(Depth) << "Memory dependences are safe\n";
+ OS.indent(Depth) << "Memory dependences are safe";
+ if (MaxSafeDepDistBytes != -1ULL)
+ OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
+ << " bytes";
+ if (PtrRtChecking->Need)
+ OS << " with run-time checks";
+ OS << "\n";
}
if (Report)
OS.indent(Depth) << "Report: " << Report->str() << "\n";
- if (auto *Dependences = DepChecker.getDependences()) {
+ if (auto *Dependences = DepChecker->getDependences()) {
OS.indent(Depth) << "Dependences:\n";
for (auto &Dep : *Dependences) {
- Dep.print(OS, Depth + 2, DepChecker.getMemoryInstructions());
+ Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
OS << "\n";
}
} else
OS.indent(Depth) << "Too many dependences, not recorded\n";
// List the pair of accesses need run-time checks to prove independence.
- PtrRtChecking.print(OS, Depth);
+ PtrRtChecking->print(OS, Depth);
OS << "\n";
OS.indent(Depth) << "Store to invariant address was "
@@ -1788,43 +1961,35 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
<< "found in loop.\n";
OS.indent(Depth) << "SCEV assumptions:\n";
- PSE.getUnionPredicate().print(OS, Depth);
+ PSE->getUnionPredicate().print(OS, Depth);
+
+ OS << "\n";
+
+ OS.indent(Depth) << "Expressions re-written:\n";
+ PSE->print(OS, Depth);
}
-const LoopAccessInfo &
-LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) {
+const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) {
auto &LAI = LoopAccessInfoMap[L];
-#ifndef NDEBUG
- assert((!LAI || LAI->NumSymbolicStrides == Strides.size()) &&
- "Symbolic strides changed for loop");
-#endif
-
- if (!LAI) {
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- LAI =
- llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, Strides);
-#ifndef NDEBUG
- LAI->NumSymbolicStrides = Strides.size();
-#endif
- }
+ if (!LAI)
+ LAI = llvm::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
+
return *LAI.get();
}
-void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const {
- LoopAccessAnalysis &LAA = *const_cast<LoopAccessAnalysis *>(this);
-
- ValueToValueMap NoSymbolicStrides;
+void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const {
+ LoopAccessLegacyAnalysis &LAA = *const_cast<LoopAccessLegacyAnalysis *>(this);
for (Loop *TopLevelLoop : *LI)
for (Loop *L : depth_first(TopLevelLoop)) {
OS.indent(2) << L->getHeader()->getName() << ":\n";
- auto &LAI = LAA.getInfo(L, NoSymbolicStrides);
+ auto &LAI = LAA.getInfo(L);
LAI.print(OS, 4);
}
}
-bool LoopAccessAnalysis::runOnFunction(Function &F) {
+bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
@@ -1835,7 +2000,7 @@ bool LoopAccessAnalysis::runOnFunction(Function &F) {
return false;
}
-void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
@@ -1844,19 +2009,52 @@ void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-char LoopAccessAnalysis::ID = 0;
+char LoopAccessLegacyAnalysis::ID = 0;
static const char laa_name[] = "Loop Access Analysis";
#define LAA_NAME "loop-accesses"
-INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
+
+char LoopAccessAnalysis::PassID;
+
+LoopAccessInfo LoopAccessAnalysis::run(Loop &L, AnalysisManager<Loop> &AM) {
+ const AnalysisManager<Function> &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function &F = *L.getHeader()->getParent();
+ auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = FAM.getCachedResult<AAManager>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(F);
+ if (!SE)
+ report_fatal_error(
+ "ScalarEvolution must have been cached at a higher level");
+ if (!AA)
+ report_fatal_error("AliasAnalysis must have been cached at a higher level");
+ if (!DT)
+ report_fatal_error("DominatorTree must have been cached at a higher level");
+ if (!LI)
+ report_fatal_error("LoopInfo must have been cached at a higher level");
+ return LoopAccessInfo(&L, SE, TLI, AA, DT, LI);
+}
+
+PreservedAnalyses LoopAccessInfoPrinterPass::run(Loop &L,
+ AnalysisManager<Loop> &AM) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
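A hedged sketch of another loop-level consumer of the new-style analysis, following the same pattern as the printer pass above (the pass and its behavior are hypothetical):

// Hypothetical new-PM loop pass built on LoopAccessAnalysis. As enforced in
// LoopAccessAnalysis::run() above, the function-level analyses must already
// be cached by the surrounding pipeline.
struct LAAConsumerPass : PassInfoMixin<LAAConsumerPass> {
  PreservedAnalyses run(Loop &L, AnalysisManager<Loop> &AM) {
    const LoopAccessInfo &LAI = AM.getResult<LoopAccessAnalysis>(L);
    // A real transform would inspect dependences and runtime checks here;
    // this sketch only dumps the result.
    DEBUG(LAI.print(dbgs(), /*Depth=*/2));
    return PreservedAnalyses::all();
  }
};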
namespace llvm {
Pass *createLAAPass() {
- return new LoopAccessAnalysis();
+ return new LoopAccessLegacyAnalysis();
}
}
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 0c725fcadff7..30f7ef392422 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
@@ -38,7 +39,7 @@ template class llvm::LoopBase<BasicBlock, Loop>;
template class llvm::LoopInfoBase<BasicBlock, Loop>;
// Always verify loopinfo if expensive checking is enabled.
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
static bool VerifyLoopInfo = true;
#else
static bool VerifyLoopInfo = false;
@@ -47,36 +48,20 @@ static cl::opt<bool,true>
VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo),
cl::desc("Verify loop info (time consuming)"));
-// Loop identifier metadata name.
-static const char *const LoopMDName = "llvm.loop";
-
//===----------------------------------------------------------------------===//
// Loop implementation
//
-/// isLoopInvariant - Return true if the specified value is loop invariant
-///
bool Loop::isLoopInvariant(const Value *V) const {
if (const Instruction *I = dyn_cast<Instruction>(V))
return !contains(I);
return true; // All non-instructions are loop invariant
}
-/// hasLoopInvariantOperands - Return true if all the operands of the
-/// specified instruction are loop invariant.
bool Loop::hasLoopInvariantOperands(const Instruction *I) const {
return all_of(I->operands(), [this](Value *V) { return isLoopInvariant(V); });
}
-/// makeLoopInvariant - If the given value is an instruciton inside of the
-/// loop and it can be hoisted, do so to make it trivially loop-invariant.
-/// Return true if the value after any hoisting is loop invariant. This
-/// function can be used as a slightly more aggressive replacement for
-/// isLoopInvariant.
-///
-/// If InsertPt is specified, it is the point to hoist instructions to.
-/// If null, the terminator of the loop preheader is used.
-///
bool Loop::makeLoopInvariant(Value *V, bool &Changed,
Instruction *InsertPt) const {
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -84,15 +69,6 @@ bool Loop::makeLoopInvariant(Value *V, bool &Changed,
return true; // All non-instructions are loop-invariant.
}
-/// makeLoopInvariant - If the given instruction is inside of the
-/// loop and it can be hoisted, do so to make it trivially loop-invariant.
-/// Return true if the instruction after any hoisting is loop invariant. This
-/// function can be used as a slightly more aggressive replacement for
-/// isLoopInvariant.
-///
-/// If InsertPt is specified, it is the point to hoist instructions to.
-/// If null, the terminator of the loop preheader is used.
-///
bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
Instruction *InsertPt) const {
// Test if the value is already loop-invariant.
@@ -114,8 +90,8 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
InsertPt = Preheader->getTerminator();
}
// Don't hoist instructions with loop-variant operands.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (!makeLoopInvariant(I->getOperand(i), Changed, InsertPt))
+ for (Value *Operand : I->operands())
+ if (!makeLoopInvariant(Operand, Changed, InsertPt))
return false;
// Hoist.
@@ -131,14 +107,6 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
return true;
}
-/// getCanonicalInductionVariable - Check to see if the loop has a canonical
-/// induction variable: an integer recurrence that starts at 0 and increments
-/// by one each time through the loop. If so, return the phi node that
-/// corresponds to it.
-///
-/// The IndVarSimplify pass transforms loops to have a canonical induction
-/// variable.
-///
PHINode *Loop::getCanonicalInductionVariable() const {
BasicBlock *H = getHeader();
@@ -175,18 +143,16 @@ PHINode *Loop::getCanonicalInductionVariable() const {
return nullptr;
}
-/// isLCSSAForm - Return true if the Loop is in LCSSA form
bool Loop::isLCSSAForm(DominatorTree &DT) const {
- for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) {
- BasicBlock *BB = *BI;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I) {
+ for (BasicBlock *BB : this->blocks()) {
+ for (Instruction &I : *BB) {
// Tokens can't be used in PHI nodes and live-out tokens prevent loop
// optimizations, so for the purposes of considered LCSSA form, we
// can ignore them.
- if (I->getType()->isTokenTy())
+ if (I.getType()->isTokenTy())
continue;
- for (Use &U : I->uses()) {
+ for (Use &U : I.uses()) {
Instruction *UI = cast<Instruction>(U.getUser());
BasicBlock *UserBB = UI->getParent();
if (PHINode *P = dyn_cast<PHINode>(UI))
@@ -216,42 +182,24 @@ bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT) const {
});
}
-/// isLoopSimplifyForm - Return true if the Loop is in the form that
-/// the LoopSimplify form transforms loops to, which is sometimes called
-/// normal form.
bool Loop::isLoopSimplifyForm() const {
// Normal-form loops have a preheader, a single backedge, and all of their
// exits have all their predecessors inside the loop.
return getLoopPreheader() && getLoopLatch() && hasDedicatedExits();
}
-/// isSafeToClone - Return true if the loop body is safe to clone in practice.
-/// Routines that reform the loop CFG and split edges often fail on indirectbr.
+// Routines that reform the loop CFG and split edges often fail on indirectbr.
bool Loop::isSafeToClone() const {
// Return false if any loop blocks contain indirectbrs, or there are any calls
// to noduplicate functions.
- for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) {
- if (isa<IndirectBrInst>((*I)->getTerminator()))
+ for (BasicBlock *BB : this->blocks()) {
+ if (isa<IndirectBrInst>(BB->getTerminator()))
return false;
- if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) {
- if (II->cannotDuplicate())
- return false;
- // Return false if any loop blocks contain invokes to EH-pads other than
- // landingpads; we don't know how to split those edges yet.
- auto *FirstNonPHI = II->getUnwindDest()->getFirstNonPHI();
- if (FirstNonPHI->isEHPad() && !isa<LandingPadInst>(FirstNonPHI))
- return false;
- }
-
- for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) {
- if (const CallInst *CI = dyn_cast<CallInst>(BI)) {
- if (CI->cannotDuplicate())
+ for (Instruction &I : *BB)
+ if (auto CS = CallSite(&I))
+ if (CS.cannotDuplicate())
return false;
- }
- if (BI->getType()->isTokenTy() && BI->isUsedOutsideOfBlock(*I))
- return false;
- }
}
return true;
}
@@ -259,19 +207,19 @@ bool Loop::isSafeToClone() const {
MDNode *Loop::getLoopID() const {
MDNode *LoopID = nullptr;
if (isLoopSimplifyForm()) {
- LoopID = getLoopLatch()->getTerminator()->getMetadata(LoopMDName);
+ LoopID = getLoopLatch()->getTerminator()->getMetadata(LLVMContext::MD_loop);
} else {
// Go through each predecessor of the loop header and check the
// terminator for the metadata.
BasicBlock *H = getHeader();
- for (block_iterator I = block_begin(), IE = block_end(); I != IE; ++I) {
- TerminatorInst *TI = (*I)->getTerminator();
+ for (BasicBlock *BB : this->blocks()) {
+ TerminatorInst *TI = BB->getTerminator();
MDNode *MD = nullptr;
// Check if this terminator branches to the loop header.
- for (unsigned i = 0, ie = TI->getNumSuccessors(); i != ie; ++i) {
- if (TI->getSuccessor(i) == H) {
- MD = TI->getMetadata(LoopMDName);
+ for (BasicBlock *Successor : TI->successors()) {
+ if (Successor == H) {
+ MD = TI->getMetadata(LLVMContext::MD_loop);
break;
}
}
@@ -296,24 +244,24 @@ void Loop::setLoopID(MDNode *LoopID) const {
assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself");
if (isLoopSimplifyForm()) {
- getLoopLatch()->getTerminator()->setMetadata(LoopMDName, LoopID);
+ getLoopLatch()->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
return;
}
BasicBlock *H = getHeader();
- for (block_iterator I = block_begin(), IE = block_end(); I != IE; ++I) {
- TerminatorInst *TI = (*I)->getTerminator();
- for (unsigned i = 0, ie = TI->getNumSuccessors(); i != ie; ++i) {
- if (TI->getSuccessor(i) == H)
- TI->setMetadata(LoopMDName, LoopID);
+ for (BasicBlock *BB : this->blocks()) {
+ TerminatorInst *TI = BB->getTerminator();
+ for (BasicBlock *Successor : TI->successors()) {
+ if (Successor == H)
+ TI->setMetadata(LLVMContext::MD_loop, LoopID);
}
}
}
bool Loop::isAnnotatedParallel() const {
- MDNode *desiredLoopIdMetadata = getLoopID();
+ MDNode *DesiredLoopIdMetadata = getLoopID();
- if (!desiredLoopIdMetadata)
+ if (!DesiredLoopIdMetadata)
return false;
// The loop branch contains the parallel loop metadata. In order to ensure
@@ -321,108 +269,112 @@ bool Loop::isAnnotatedParallel() const {
// dependencies (thus converted the loop back to a sequential loop), check
// that all the memory instructions in the loop contain parallelism metadata
// that point to the same unique "loop id metadata" the loop branch does.
- for (block_iterator BB = block_begin(), BE = block_end(); BB != BE; ++BB) {
- for (BasicBlock::iterator II = (*BB)->begin(), EE = (*BB)->end();
- II != EE; II++) {
-
- if (!II->mayReadOrWriteMemory())
+ for (BasicBlock *BB : this->blocks()) {
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
continue;
// The memory instruction can refer to the loop identifier metadata
// directly or indirectly through another list metadata (in case of
// nested parallel loops). The loop identifier metadata refers to
// itself so we can check both cases with the same routine.
- MDNode *loopIdMD =
- II->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ MDNode *LoopIdMD =
+ I.getMetadata(LLVMContext::MD_mem_parallel_loop_access);
- if (!loopIdMD)
+ if (!LoopIdMD)
return false;
- bool loopIdMDFound = false;
- for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
- if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) {
- loopIdMDFound = true;
+ bool LoopIdMDFound = false;
+ for (const MDOperand &MDOp : LoopIdMD->operands()) {
+ if (MDOp == DesiredLoopIdMetadata) {
+ LoopIdMDFound = true;
break;
}
}
- if (!loopIdMDFound)
+ if (!LoopIdMDFound)
return false;
}
}
return true;
}
+DebugLoc Loop::getStartLoc() const {
+ // If we have a debug location in the loop ID, then use it.
+ if (MDNode *LoopID = getLoopID())
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
+ if (DILocation *L = dyn_cast<DILocation>(LoopID->getOperand(i)))
+ return DebugLoc(L);
+
+ // Try the pre-header first.
+ if (BasicBlock *PHeadBB = getLoopPreheader())
+ if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc())
+ return DL;
+
+ // If we have no pre-header or there are no instructions with debug
+ // info in it, try the header.
+ if (BasicBlock *HeadBB = getHeader())
+ return HeadBB->getTerminator()->getDebugLoc();
+
+ return DebugLoc();
+}
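A small hedged usage sketch for the accessor added above (the reporting helper is hypothetical):

// Hypothetical diagnostic helper using Loop::getStartLoc().
static void reportLoopLocation(const Loop *L) {
  if (DebugLoc DL = L->getStartLoc()) {
    dbgs() << "loop starting at ";
    DL.print(dbgs());
    dbgs() << "\n";
  }
}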
-/// hasDedicatedExits - Return true if no exit block for the loop
-/// has a predecessor that is outside the loop.
bool Loop::hasDedicatedExits() const {
// Each predecessor of each exit block of a normal loop is contained
// within the loop.
SmallVector<BasicBlock *, 4> ExitBlocks;
getExitBlocks(ExitBlocks);
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- for (pred_iterator PI = pred_begin(ExitBlocks[i]),
- PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
- if (!contains(*PI))
+ for (BasicBlock *BB : ExitBlocks)
+ for (BasicBlock *Predecessor : predecessors(BB))
+ if (!contains(Predecessor))
return false;
// All the requirements are met.
return true;
}
-/// getUniqueExitBlocks - Return all unique successor blocks of this loop.
-/// These are the blocks _outside of the current loop_ which are branched to.
-/// This assumes that loop exits are in canonical form.
-///
void
Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
assert(hasDedicatedExits() &&
"getUniqueExitBlocks assumes the loop has canonical form exits!");
- SmallVector<BasicBlock *, 32> switchExitBlocks;
-
- for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
-
- BasicBlock *current = *BI;
- switchExitBlocks.clear();
-
- for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) {
- // If block is inside the loop then it is not a exit block.
- if (contains(*I))
+ SmallVector<BasicBlock *, 32> SwitchExitBlocks;
+ for (BasicBlock *BB : this->blocks()) {
+ SwitchExitBlocks.clear();
+ for (BasicBlock *Successor : successors(BB)) {
+ // If block is inside the loop then it is not an exit block.
+ if (contains(Successor))
continue;
- pred_iterator PI = pred_begin(*I);
- BasicBlock *firstPred = *PI;
+ pred_iterator PI = pred_begin(Successor);
+ BasicBlock *FirstPred = *PI;
// If current basic block is this exit block's first predecessor
// then only insert exit block in to the output ExitBlocks vector.
// This ensures that same exit block is not inserted twice into
// ExitBlocks vector.
- if (current != firstPred)
+ if (BB != FirstPred)
continue;
// If a terminator has more than two successors, for example SwitchInst,
// then it is possible that there are multiple edges from current block
// to one exit block.
- if (std::distance(succ_begin(current), succ_end(current)) <= 2) {
- ExitBlocks.push_back(*I);
+ if (std::distance(succ_begin(BB), succ_end(BB)) <= 2) {
+ ExitBlocks.push_back(Successor);
continue;
}
// In case of multiple edges from current block to exit block, collect
// only one edge in ExitBlocks. Use SwitchExitBlocks to keep track of
// duplicate edges.
- if (std::find(switchExitBlocks.begin(), switchExitBlocks.end(), *I)
- == switchExitBlocks.end()) {
- switchExitBlocks.push_back(*I);
- ExitBlocks.push_back(*I);
+ if (std::find(SwitchExitBlocks.begin(), SwitchExitBlocks.end(), Successor)
+ == SwitchExitBlocks.end()) {
+ SwitchExitBlocks.push_back(Successor);
+ ExitBlocks.push_back(Successor);
}
}
}
}
-/// getUniqueExitBlock - If getUniqueExitBlocks would return exactly one
-/// block, return that block. Otherwise return null.
BasicBlock *Loop::getUniqueExitBlock() const {
SmallVector<BasicBlock *, 8> UniqueExitBlocks;
getUniqueExitBlocks(UniqueExitBlocks);
@@ -432,7 +384,7 @@ BasicBlock *Loop::getUniqueExitBlock() const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Loop::dump() const {
+LLVM_DUMP_METHOD void Loop::dump() const {
print(dbgs());
}
#endif
@@ -445,7 +397,7 @@ namespace {
/// Find the new parent loop for all blocks within the "unloop" whose last
/// backedges has just been removed.
class UnloopUpdater {
- Loop *Unloop;
+ Loop &Unloop;
LoopInfo *LI;
LoopBlocksDFS DFS;
@@ -462,7 +414,7 @@ class UnloopUpdater {
public:
UnloopUpdater(Loop *UL, LoopInfo *LInfo) :
- Unloop(UL), LI(LInfo), DFS(UL), FoundIB(false) {}
+ Unloop(*UL), LI(LInfo), DFS(UL), FoundIB(false) {}
void updateBlockParents();
@@ -475,29 +427,28 @@ protected:
};
} // end anonymous namespace
-/// updateBlockParents - Update the parent loop for all blocks that are directly
-/// contained within the original "unloop".
+/// Update the parent loop for all blocks that are directly contained within the
+/// original "unloop".
void UnloopUpdater::updateBlockParents() {
- if (Unloop->getNumBlocks()) {
+ if (Unloop.getNumBlocks()) {
// Perform a post order CFG traversal of all blocks within this loop,
// propagating the nearest loop from successors to predecessors.
LoopBlocksTraversal Traversal(DFS, LI);
- for (LoopBlocksTraversal::POTIterator POI = Traversal.begin(),
- POE = Traversal.end(); POI != POE; ++POI) {
+ for (BasicBlock *POI : Traversal) {
- Loop *L = LI->getLoopFor(*POI);
- Loop *NL = getNearestLoop(*POI, L);
+ Loop *L = LI->getLoopFor(POI);
+ Loop *NL = getNearestLoop(POI, L);
if (NL != L) {
// For reducible loops, NL is now an ancestor of Unloop.
- assert((NL != Unloop && (!NL || NL->contains(Unloop))) &&
+ assert((NL != &Unloop && (!NL || NL->contains(&Unloop))) &&
"uninitialized successor");
- LI->changeLoopFor(*POI, NL);
+ LI->changeLoopFor(POI, NL);
}
else {
// Or the current block is part of a subloop, in which case its parent
// is unchanged.
- assert((FoundIB || Unloop->contains(L)) && "uninitialized successor");
+ assert((FoundIB || Unloop.contains(L)) && "uninitialized successor");
}
}
}
@@ -505,7 +456,7 @@ void UnloopUpdater::updateBlockParents() {
// the DFS result cached by Traversal.
bool Changed = FoundIB;
for (unsigned NIters = 0; Changed; ++NIters) {
- assert(NIters < Unloop->getNumBlocks() && "runaway iterative algorithm");
+ assert(NIters < Unloop.getNumBlocks() && "runaway iterative algorithm");
// Iterate over the postorder list of blocks, propagating the nearest loop
// from successors to predecessors as before.
@@ -516,7 +467,7 @@ void UnloopUpdater::updateBlockParents() {
Loop *L = LI->getLoopFor(*POI);
Loop *NL = getNearestLoop(*POI, L);
if (NL != L) {
- assert(NL != Unloop && (!NL || NL->contains(Unloop)) &&
+ assert(NL != &Unloop && (!NL || NL->contains(&Unloop)) &&
"uninitialized successor");
LI->changeLoopFor(*POI, NL);
Changed = true;
@@ -525,22 +476,21 @@ void UnloopUpdater::updateBlockParents() {
}
}
-/// removeBlocksFromAncestors - Remove unloop's blocks from all ancestors below
-/// their new parents.
+/// Remove unloop's blocks from all ancestors below their new parents.
void UnloopUpdater::removeBlocksFromAncestors() {
// Remove all unloop's blocks (including those in nested subloops) from
// ancestors below the new parent loop.
- for (Loop::block_iterator BI = Unloop->block_begin(),
- BE = Unloop->block_end(); BI != BE; ++BI) {
+ for (Loop::block_iterator BI = Unloop.block_begin(),
+ BE = Unloop.block_end(); BI != BE; ++BI) {
Loop *OuterParent = LI->getLoopFor(*BI);
- if (Unloop->contains(OuterParent)) {
- while (OuterParent->getParentLoop() != Unloop)
+ if (Unloop.contains(OuterParent)) {
+ while (OuterParent->getParentLoop() != &Unloop)
OuterParent = OuterParent->getParentLoop();
OuterParent = SubloopParents[OuterParent];
}
// Remove blocks from former Ancestors except Unloop itself which will be
// deleted.
- for (Loop *OldParent = Unloop->getParentLoop(); OldParent != OuterParent;
+ for (Loop *OldParent = Unloop.getParentLoop(); OldParent != OuterParent;
OldParent = OldParent->getParentLoop()) {
assert(OldParent && "new loop is not an ancestor of the original");
OldParent->removeBlockFromLoop(*BI);
@@ -548,12 +498,11 @@ void UnloopUpdater::removeBlocksFromAncestors() {
}
}
-/// updateSubloopParents - Update the parent loop for all subloops directly
-/// nested within unloop.
+/// Update the parent loop for all subloops directly nested within unloop.
void UnloopUpdater::updateSubloopParents() {
- while (!Unloop->empty()) {
- Loop *Subloop = *std::prev(Unloop->end());
- Unloop->removeChildLoop(std::prev(Unloop->end()));
+ while (!Unloop.empty()) {
+ Loop *Subloop = *std::prev(Unloop.end());
+ Unloop.removeChildLoop(std::prev(Unloop.end()));
assert(SubloopParents.count(Subloop) && "DFS failed to visit subloop");
if (Loop *Parent = SubloopParents[Subloop])
@@ -563,9 +512,9 @@ void UnloopUpdater::updateSubloopParents() {
}
}
-/// getNearestLoop - Return the nearest parent loop among this block's
-/// successors. If a successor is a subloop header, consider its parent to be
-/// the nearest parent of the subloop's exits.
+/// Return the nearest parent loop among this block's successors. If a successor
+/// is a subloop header, consider its parent to be the nearest parent of the
+/// subloop's exits.
///
/// For subloop blocks, simply update SubloopParents and return NULL.
Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
@@ -575,16 +524,16 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
Loop *NearLoop = BBLoop;
Loop *Subloop = nullptr;
- if (NearLoop != Unloop && Unloop->contains(NearLoop)) {
+ if (NearLoop != &Unloop && Unloop.contains(NearLoop)) {
Subloop = NearLoop;
// Find the subloop ancestor that is directly contained within Unloop.
- while (Subloop->getParentLoop() != Unloop) {
+ while (Subloop->getParentLoop() != &Unloop) {
Subloop = Subloop->getParentLoop();
assert(Subloop && "subloop is not an ancestor of the original loop");
}
// Get the current nearest parent of the Subloop exits, initially Unloop.
NearLoop =
- SubloopParents.insert(std::make_pair(Subloop, Unloop)).first->second;
+ SubloopParents.insert(std::make_pair(Subloop, &Unloop)).first->second;
}
succ_iterator I = succ_begin(BB), E = succ_end(BB);
@@ -597,33 +546,33 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
continue; // self loops are uninteresting
Loop *L = LI->getLoopFor(*I);
- if (L == Unloop) {
+ if (L == &Unloop) {
// This successor has not been processed. This path must lead to an
// irreducible backedge.
assert((FoundIB || !DFS.hasPostorder(*I)) && "should have seen IB");
FoundIB = true;
}
- if (L != Unloop && Unloop->contains(L)) {
+ if (L != &Unloop && Unloop.contains(L)) {
// Successor is in a subloop.
if (Subloop)
continue; // Branching within subloops. Ignore it.
// BB branches from the original into a subloop header.
- assert(L->getParentLoop() == Unloop && "cannot skip into nested loops");
+ assert(L->getParentLoop() == &Unloop && "cannot skip into nested loops");
// Get the current nearest parent of the Subloop's exits.
L = SubloopParents[L];
// L could be Unloop if the only exit was an irreducible backedge.
}
- if (L == Unloop) {
+ if (L == &Unloop) {
continue;
}
// Handle critical edges from Unloop into a sibling loop.
- if (L && !L->contains(Unloop)) {
+ if (L && !L->contains(&Unloop)) {
L = L->getParentLoop();
}
// Remember the nearest parent loop among successors or subloop exits.
- if (NearLoop == Unloop || !NearLoop || NearLoop->contains(L))
+ if (NearLoop == &Unloop || !NearLoop || NearLoop->contains(L))
NearLoop = L;
}
if (Subloop) {
@@ -698,7 +647,7 @@ void LoopInfo::markAsRemoved(Loop *Unloop) {
char LoopAnalysis::PassID;
-LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> *AM) {
+LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
// FIXME: Currently we create a LoopInfo from scratch for every function.
// This may prove to be too wasteful due to deallocating and re-allocating
// memory each time for the underlying map and vector datastructures. At some
@@ -706,13 +655,13 @@ LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> *AM) {
// objects. I don't want to add that kind of complexity until the scope of
// the problem is better understood.
LoopInfo LI;
- LI.analyze(AM->getResult<DominatorTreeAnalysis>(F));
+ LI.analyze(AM.getResult<DominatorTreeAnalysis>(F));
return LI;
}
PreservedAnalyses LoopPrinterPass::run(Function &F,
- AnalysisManager<Function> *AM) {
- AM->getResult<LoopAnalysis>(F).print(OS);
+ AnalysisManager<Function> &AM) {
+ AM.getResult<LoopAnalysis>(F).print(OS);
return PreservedAnalyses::all();
}
@@ -720,7 +669,7 @@ PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
: OS(OS), Banner(Banner) {}
-PreservedAnalyses PrintLoopPass::run(Loop &L) {
+PreservedAnalyses PrintLoopPass::run(Loop &L, AnalysisManager<Loop> &) {
OS << Banner;
for (auto *Block : L.blocks())
if (Block)
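// A brief sketch of the call shape after the AnalysisManager parameters above
// switch from pointers to references. Analysis registration and pipeline setup
// are omitted; printLoopsOf is an illustrative helper, not part of the patch.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void printLoopsOf(Function &F, AnalysisManager<Function> &AM) {
  // Results are now reached through the reference, mirroring
  // LoopAnalysis::run() and LoopPrinterPass::run() above.
  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
  LI.print(errs());
}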
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 8163231c3323..222345c9a980 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -14,8 +14,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/OptBisect.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Timer.h"
@@ -45,8 +47,10 @@ public:
auto BBI = find_if(L->blocks().begin(), L->blocks().end(),
[](BasicBlock *BB) { return BB; });
if (BBI != L->blocks().end() &&
- isFunctionInPrintList((*BBI)->getParent()->getName()))
- P.run(*L);
+ isFunctionInPrintList((*BBI)->getParent()->getName())) {
+ AnalysisManager<Loop> DummyLAM;
+ P.run(*L, DummyLAM);
+ }
return false;
}
};
@@ -105,9 +109,7 @@ void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
/// deleteSimpleAnalysisValue - Invoke deleteAnalysisValue hook for all passes.
void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
- ++BI) {
- Instruction &I = *BI;
+ for (Instruction &I : *BB) {
deleteSimpleAnalysisValue(&I, L);
}
}
@@ -335,11 +337,16 @@ void LoopPass::assignPassManager(PMStack &PMS,
LPPM->add(this);
}
-// Containing function has Attribute::OptimizeNone and transformation
-// passes should skip it.
-bool LoopPass::skipOptnoneFunction(const Loop *L) const {
+bool LoopPass::skipLoop(const Loop *L) const {
const Function *F = L->getHeader()->getParent();
- if (F && F->hasFnAttribute(Attribute::OptimizeNone)) {
+ if (!F)
+ return false;
+ // Check the opt bisect limit.
+ LLVMContext &Context = F->getContext();
+ if (!Context.getOptBisect().shouldRunPass(this, *L))
+ return true;
+ // Check for the OptimizeNone attribute.
+ if (F->hasFnAttribute(Attribute::OptimizeNone)) {
// FIXME: Report this to dbgs() only once per function.
DEBUG(dbgs() << "Skipping pass '" << getPassName()
<< "' in function " << F->getName() << "\n");
diff --git a/lib/Analysis/LoopPassManager.cpp b/lib/Analysis/LoopPassManager.cpp
new file mode 100644
index 000000000000..8bac19a58217
--- /dev/null
+++ b/lib/Analysis/LoopPassManager.cpp
@@ -0,0 +1,39 @@
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopPassManager.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+// Explicit instantiations for core typedef'ed templates.
+namespace llvm {
+template class PassManager<Loop>;
+template class AnalysisManager<Loop>;
+template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>;
+template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;
+}
+
+PreservedAnalyses llvm::getLoopPassPreservedAnalyses() {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ // TODO: What we really want to do here is preserve an AA category, but that
+ // concept doesn't exist yet.
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ return PA;
+}
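// A hedged sketch of the intended use of getLoopPassPreservedAnalyses(): a loop
// pass written against the new pass manager returns it when it changed the IR
// but kept the loop-canonical analyses up to date, in the same style as
// PrintLoopPass above. "SketchLoopPass" is an illustrative name only.
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopPassManager.h"

using namespace llvm;

struct SketchLoopPass {
  PreservedAnalyses run(Loop &L, AnalysisManager<Loop> &AM) {
    bool Changed = false;
    // ... transform L, keeping LoopInfo/DominatorTree/SCEV consistent ...
    return Changed ? getLoopPassPreservedAnalyses() : PreservedAnalyses::all();
  }
  static StringRef name() { return "SketchLoopPass"; }
};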
diff --git a/lib/Analysis/LoopUnrollAnalyzer.cpp b/lib/Analysis/LoopUnrollAnalyzer.cpp
new file mode 100644
index 000000000000..f59257ab16b5
--- /dev/null
+++ b/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -0,0 +1,210 @@
+//===- LoopUnrollAnalyzer.cpp - Unrolling Effect Estimation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements UnrolledInstAnalyzer class. It's used for predicting
+// potential effects that loop unrolling might have, such as enabling constant
+// propagation and other optimizations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+/// \brief Try to simplify instruction \param I using its SCEV expression.
+///
+/// The idea is that some AddRec expressions become constants, which then
+/// could trigger folding of other instructions. However, that only happens
+/// for expressions whose start value is also constant, which isn't always the
+/// case. In another common and important case the start value is just some
+/// address (i.e. SCEVUnknown) - in this case we compute the offset and save
+/// it along with the base address instead.
+bool UnrolledInstAnalyzer::simplifyInstWithSCEV(Instruction *I) {
+ if (!SE.isSCEVable(I->getType()))
+ return false;
+
+ const SCEV *S = SE.getSCEV(I);
+ if (auto *SC = dyn_cast<SCEVConstant>(S)) {
+ SimplifiedValues[I] = SC->getValue();
+ return true;
+ }
+
+ auto *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR || AR->getLoop() != L)
+ return false;
+
+ const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE);
+ // Check if the AddRec expression becomes a constant.
+ if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) {
+ SimplifiedValues[I] = SC->getValue();
+ return true;
+ }
+
+ // Check if the offset from the base address becomes a constant.
+ auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S));
+ if (!Base)
+ return false;
+ auto *Offset =
+ dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base));
+ if (!Offset)
+ return false;
+ SimplifiedAddress Address;
+ Address.Base = Base->getValue();
+ Address.Offset = Offset->getValue();
+ SimplifiedAddresses[I] = Address;
+ return false;
+}
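// A worked illustration of what simplifyInstWithSCEV() records when the AddRec's
// start is an address rather than a constant. For an affine recurrence
// {Start,+,Step} evaluated at a fixed iteration I, the value is Start + I*Step,
// so the analyzer stores the pair (Base = Start, Offset = I*Step) in
// SimplifiedAddresses instead of a folded constant. The helper below is a plain
// C++ sketch of that arithmetic, not LLVM code.
#include <cstdint>

struct SketchAddress { const void *Base; int64_t Offset; };

static SketchAddress evalAffineAddRec(const void *Start, int64_t Step,
                                      uint64_t Iteration) {
  return {Start, static_cast<int64_t>(Iteration) * Step};
}
// e.g. a pointer advancing by 4 bytes per iteration, asked about iteration 3:
//   evalAffineAddRec(BasePtr, 4, 3) yields {BasePtr, 12}.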
+
+/// Try to simplify binary operator I.
+///
+/// TODO: It is probably worth hoisting the code for estimating the effects of
+/// these simplifications into a separate class, since we already have very
+/// similar code in InlineCost.
+bool UnrolledInstAnalyzer::visitBinaryOperator(BinaryOperator &I) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ if (!isa<Constant>(LHS))
+ if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+ LHS = SimpleLHS;
+ if (!isa<Constant>(RHS))
+ if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+ RHS = SimpleRHS;
+
+ Value *SimpleV = nullptr;
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV =
+ SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
+ if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+ SimplifiedValues[&I] = C;
+
+ if (SimpleV)
+ return true;
+ return Base::visitBinaryOperator(I);
+}
+
+/// Try to fold load I.
+bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) {
+ Value *AddrOp = I.getPointerOperand();
+
+ auto AddressIt = SimplifiedAddresses.find(AddrOp);
+ if (AddressIt == SimplifiedAddresses.end())
+ return false;
+ ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset;
+
+ auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);
+ // We're only interested in loads that can be completely folded to a
+ // constant.
+ if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant())
+ return false;
+
+ ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(GV->getInitializer());
+ if (!CDS)
+ return false;
+
+ // We might have a vector load from an array. FIXME: for now we just bail
+ // out in this case, but we should be able to resolve and simplify such
+ // loads.
+ if (CDS->getElementType() != I.getType())
+ return false;
+
+ int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+ if (SimplifiedAddrOp->getValue().getActiveBits() >= 64)
+ return false;
+ int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize;
+ if (Index >= CDS->getNumElements()) {
+ // FIXME: For now we conservatively ignore out of bound accesses, but
+ // we're allowed to perform the optimization in this case.
+ return false;
+ }
+
+ Constant *CV = CDS->getElementAsConstant(Index);
+ assert(CV && "Constant expected.");
+ SimplifiedValues[&I] = CV;
+
+ return true;
+}
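// A source-level illustration of the loads visitLoad() can fold: with the array a
// constant global (definitive initializer, constant data) and the index known per
// unrolled iteration, each load becomes the literal element value recorded in
// SimplifiedValues. This snippet is illustrative only.
static const int Table[4] = {10, 20, 30, 40};

int sumTable() {
  int S = 0;
  for (int i = 0; i < 4; ++i)
    S += Table[i]; // after full unrolling: 10 + 20 + 30 + 40
  return S;
}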
+
+/// Try to simplify cast instruction.
+bool UnrolledInstAnalyzer::visitCastInst(CastInst &I) {
+ // Propagate constants through casts.
+ Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+ if (!COp)
+ COp = SimplifiedValues.lookup(I.getOperand(0));
+
+ // If we know a simplified value for this operand and cast is valid, save the
+ // result to SimplifiedValues.
+ // The cast can be invalid, because SimplifiedValues contains results of SCEV
+ // analysis, which operates on integers (and, e.g., might convert i8* null to
+ // i32 0).
+ if (COp && CastInst::castIsValid(I.getOpcode(), COp, I.getType())) {
+ if (Constant *C =
+ ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
+ SimplifiedValues[&I] = C;
+ return true;
+ }
+ }
+
+ return Base::visitCastInst(I);
+}
+
+/// Try to simplify cmp instruction.
+bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ // First try to handle simplified comparisons.
+ if (!isa<Constant>(LHS))
+ if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+ LHS = SimpleLHS;
+ if (!isa<Constant>(RHS))
+ if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+ RHS = SimpleRHS;
+
+ if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+ auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
+ if (SimplifiedLHS != SimplifiedAddresses.end()) {
+ auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
+ if (SimplifiedRHS != SimplifiedAddresses.end()) {
+ SimplifiedAddress &LHSAddr = SimplifiedLHS->second;
+ SimplifiedAddress &RHSAddr = SimplifiedRHS->second;
+ if (LHSAddr.Base == RHSAddr.Base) {
+ LHS = LHSAddr.Offset;
+ RHS = RHSAddr.Offset;
+ }
+ }
+ }
+ }
+
+ if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+ if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+ if (CLHS->getType() == CRHS->getType()) {
+ if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+ SimplifiedValues[&I] = C;
+ return true;
+ }
+ }
+ }
+ }
+
+ return Base::visitCmpInst(I);
+}
+
+bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) {
+ // Run the base visitor first. This way we can gather some information that
+ // is useful for later analysis.
+ if (Base::visitPHINode(PN))
+ return true;
+
+ // The loop induction PHI nodes are definitionally free.
+ return PN.getParent() == L->getHeader();
+}
diff --git a/lib/Analysis/Makefile b/lib/Analysis/Makefile
deleted file mode 100644
index 93fd7f9bdd93..000000000000
--- a/lib/Analysis/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Analysis/Makefile -------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMAnalysis
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp
index 078cefe51807..e7a85ae06e68 100644
--- a/lib/Analysis/MemDepPrinter.cpp
+++ b/lib/Analysis/MemDepPrinter.cpp
@@ -50,7 +50,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredTransitive<AAResultsWrapperPass>();
- AU.addRequiredTransitive<MemoryDependenceAnalysis>();
+ AU.addRequiredTransitive<MemoryDependenceWrapperPass>();
AU.setPreservesAll();
}
@@ -79,7 +79,7 @@ namespace {
char MemDepPrinter::ID = 0;
INITIALIZE_PASS_BEGIN(MemDepPrinter, "print-memdeps",
"Print MemDeps of function", false, true)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_END(MemDepPrinter, "print-memdeps",
"Print MemDeps of function", false, true)
@@ -92,7 +92,7 @@ const char *const MemDepPrinter::DepTypeStr[]
bool MemDepPrinter::runOnFunction(Function &F) {
this->F = &F;
- MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
+ MemoryDependenceResults &MDA = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
// All this code uses non-const interfaces because MemDep is not
// const-friendly, though nothing is actually modified.
@@ -107,14 +107,13 @@ bool MemDepPrinter::runOnFunction(Function &F) {
Deps[Inst].insert(std::make_pair(getInstTypePair(Res),
static_cast<BasicBlock *>(nullptr)));
} else if (auto CS = CallSite(Inst)) {
- const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI =
+ const MemoryDependenceResults::NonLocalDepInfo &NLDI =
MDA.getNonLocalCallDependency(CS);
DepSet &InstDeps = Deps[Inst];
- for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator
- I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
- const MemDepResult &Res = I->getResult();
- InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
+ for (const NonLocalDepEntry &I : NLDI) {
+ const MemDepResult &Res = I.getResult();
+ InstDeps.insert(std::make_pair(getInstTypePair(Res), I.getBB()));
}
} else {
SmallVector<NonLocalDepResult, 4> NLDI;
@@ -123,10 +122,9 @@ bool MemDepPrinter::runOnFunction(Function &F) {
MDA.getNonLocalPointerDependency(Inst, NLDI);
DepSet &InstDeps = Deps[Inst];
- for (SmallVectorImpl<NonLocalDepResult>::const_iterator
- I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
- const MemDepResult &Res = I->getResult();
- InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
+ for (const NonLocalDepResult &I : NLDI) {
+ const MemDepResult &Res = I.getResult();
+ InstDeps.insert(std::make_pair(getInstTypePair(Res), I.getBB()));
}
}
}
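// A hedged sketch of the access pattern the MemDepPrinter changes above establish
// for legacy passes: require MemoryDependenceWrapperPass and pull the
// MemoryDependenceResults object out of it. "SketchMemDepUser" is an illustrative
// name only.
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
struct SketchMemDepUser : public FunctionPass {
  static char ID;
  SketchMemDepUser() : FunctionPass(ID) {}
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequiredTransitive<MemoryDependenceWrapperPass>();
    AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override {
    MemoryDependenceResults &MDA =
        getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
    (void)MDA; // queries such as MDA.getDependency(...) would go here
    return false;
  }
};
}
char SketchMemDepUser::ID = 0;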
diff --git a/lib/Analysis/MemDerefPrinter.cpp b/lib/Analysis/MemDerefPrinter.cpp
index 36f1424c8cf9..fa0cc5a46c2b 100644
--- a/lib/Analysis/MemDerefPrinter.cpp
+++ b/lib/Analysis/MemDerefPrinter.cpp
@@ -10,7 +10,7 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstIterator.h"
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 9e896aed0dce..f23477622bec 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -42,39 +42,38 @@ enum AllocType : uint8_t {
};
struct AllocFnsTy {
- LibFunc::Func Func;
AllocType AllocTy;
- unsigned char NumParams;
+ unsigned NumParams;
// First and Second size parameters (or -1 if unused)
- signed char FstParam, SndParam;
+ int FstParam, SndParam;
};
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
-static const AllocFnsTy AllocationFnData[] = {
- {LibFunc::malloc, MallocLike, 1, 0, -1},
- {LibFunc::valloc, MallocLike, 1, 0, -1},
- {LibFunc::Znwj, OpNewLike, 1, 0, -1}, // new(unsigned int)
- {LibFunc::ZnwjRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
- {LibFunc::Znwm, OpNewLike, 1, 0, -1}, // new(unsigned long)
- {LibFunc::ZnwmRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
- {LibFunc::Znaj, OpNewLike, 1, 0, -1}, // new[](unsigned int)
- {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
- {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long)
- {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
- {LibFunc::msvc_new_int, OpNewLike, 1, 0, -1}, // new(unsigned int)
- {LibFunc::msvc_new_int_nothrow, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
- {LibFunc::msvc_new_longlong, OpNewLike, 1, 0, -1}, // new(unsigned long long)
- {LibFunc::msvc_new_longlong_nothrow, MallocLike, 2, 0, -1}, // new(unsigned long long, nothrow)
- {LibFunc::msvc_new_array_int, OpNewLike, 1, 0, -1}, // new[](unsigned int)
- {LibFunc::msvc_new_array_int_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
- {LibFunc::msvc_new_array_longlong, OpNewLike, 1, 0, -1}, // new[](unsigned long long)
- {LibFunc::msvc_new_array_longlong_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned long long, nothrow)
- {LibFunc::calloc, CallocLike, 2, 0, 1},
- {LibFunc::realloc, ReallocLike, 2, 1, -1},
- {LibFunc::reallocf, ReallocLike, 2, 1, -1},
- {LibFunc::strdup, StrDupLike, 1, -1, -1},
- {LibFunc::strndup, StrDupLike, 2, 1, -1}
+static const std::pair<LibFunc::Func, AllocFnsTy> AllocationFnData[] = {
+ {LibFunc::malloc, {MallocLike, 1, 0, -1}},
+ {LibFunc::valloc, {MallocLike, 1, 0, -1}},
+ {LibFunc::Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc::ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc::Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
+ {LibFunc::ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
+ {LibFunc::Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc::ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc::Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
+ {LibFunc::ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
+ {LibFunc::msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc::msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc::msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
+ {LibFunc::msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
+ {LibFunc::msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc::msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc::msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
+ {LibFunc::msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
+ {LibFunc::calloc, {CallocLike, 2, 0, 1}},
+ {LibFunc::realloc, {ReallocLike, 2, 1, -1}},
+ {LibFunc::reallocf, {ReallocLike, 2, 1, -1}},
+ {LibFunc::strdup, {StrDupLike, 1, -1, -1}},
+ {LibFunc::strndup, {StrDupLike, 2, 1, -1}}
// TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
@@ -96,34 +95,57 @@ static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) {
return Callee;
}
-/// \brief Returns the allocation data for the given value if it is a call to a
-/// known allocation function, and NULL otherwise.
-static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
- const TargetLibraryInfo *TLI,
- bool LookThroughBitCast = false) {
+/// Returns the allocation data for the given value if it's either a call to a
+/// known allocation function, or a call to a function with the allocsize
+/// attribute.
+static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy,
+ const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false) {
// Skip intrinsics
if (isa<IntrinsicInst>(V))
- return nullptr;
+ return None;
- Function *Callee = getCalledFunction(V, LookThroughBitCast);
+ const Function *Callee = getCalledFunction(V, LookThroughBitCast);
if (!Callee)
- return nullptr;
+ return None;
+
+ // If it has allocsize, we can skip checking if it's a known function.
+ //
+ // MallocLike is chosen here because allocsize makes no guarantees about the
+ // nullness of the result of the function, nor does it deal with strings, nor
+ // does it require that the memory returned is zeroed out.
+ LLVM_CONSTEXPR auto AllocSizeAllocTy = MallocLike;
+ if ((AllocTy & AllocSizeAllocTy) == AllocSizeAllocTy &&
+ Callee->hasFnAttribute(Attribute::AllocSize)) {
+ Attribute Attr = Callee->getFnAttribute(Attribute::AllocSize);
+ std::pair<unsigned, Optional<unsigned>> Args = Attr.getAllocSizeArgs();
+
+ AllocFnsTy Result;
+ Result.AllocTy = AllocSizeAllocTy;
+ Result.NumParams = Callee->getNumOperands();
+ Result.FstParam = Args.first;
+ Result.SndParam = Args.second.getValueOr(-1);
+ return Result;
+ }
// Make sure that the function is available.
StringRef FnName = Callee->getName();
LibFunc::Func TLIFn;
if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
- return nullptr;
+ return None;
- const AllocFnsTy *FnData =
+ const auto *Iter =
std::find_if(std::begin(AllocationFnData), std::end(AllocationFnData),
- [TLIFn](const AllocFnsTy &Fn) { return Fn.Func == TLIFn; });
+ [TLIFn](const std::pair<LibFunc::Func, AllocFnsTy> &P) {
+ return P.first == TLIFn;
+ });
- if (FnData == std::end(AllocationFnData))
- return nullptr;
+ if (Iter == std::end(AllocationFnData))
+ return None;
+ const AllocFnsTy *FnData = &Iter->second;
if ((FnData->AllocTy & AllocTy) != FnData->AllocTy)
- return nullptr;
+ return None;
// Check function prototype.
int FstParam = FnData->FstParam;
@@ -138,13 +160,13 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
(SndParam < 0 ||
FTy->getParamType(SndParam)->isIntegerTy(32) ||
FTy->getParamType(SndParam)->isIntegerTy(64)))
- return FnData;
- return nullptr;
+ return *FnData;
+ return None;
}
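// A hedged sketch (written as if inside this file) of consuming the new
// Optional<AllocFnsTy> result: callers test the Optional instead of comparing
// against null, and a callee carrying the allocsize attribute is now recognized
// here even without a TargetLibraryInfo entry. "isTwoParamAllocator" is an
// illustrative helper only.
static bool isTwoParamAllocator(const Value *V, const TargetLibraryInfo *TLI) {
  Optional<AllocFnsTy> Data = getAllocationData(V, AnyAlloc, TLI);
  return Data.hasValue() && Data->NumParams == 2;
}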
static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CS && CS.hasFnAttr(Attribute::NoAlias);
+ return CS && CS.paramHasAttr(AttributeSet::ReturnIndex, Attribute::NoAlias);
}
@@ -153,7 +175,7 @@ static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
/// like).
bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
- return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast);
+ return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast).hasValue();
}
/// \brief Tests if a value is a call or invoke to a function that returns a
@@ -170,21 +192,21 @@ bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
/// allocates uninitialized memory (such as malloc).
bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
- return getAllocationData(V, MallocLike, TLI, LookThroughBitCast);
+ return getAllocationData(V, MallocLike, TLI, LookThroughBitCast).hasValue();
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
- return getAllocationData(V, CallocLike, TLI, LookThroughBitCast);
+ return getAllocationData(V, CallocLike, TLI, LookThroughBitCast).hasValue();
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
- return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
+ return getAllocationData(V, AllocLike, TLI, LookThroughBitCast).hasValue();
}
/// extractMallocCall - Returns the corresponding CallInst if the instruction
@@ -214,8 +236,7 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout &DL,
// return the multiple. Otherwise, return NULL.
Value *MallocArg = CI->getArgOperand(0);
Value *Multiple = nullptr;
- if (ComputeMultiple(MallocArg, ElementSize, Multiple,
- LookThroughSExt))
+ if (ComputeMultiple(MallocArg, ElementSize, Multiple, LookThroughSExt))
return Multiple;
return nullptr;
@@ -345,29 +366,29 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
//===----------------------------------------------------------------------===//
// Utility functions to compute size of objects.
//
-
+static APInt getSizeWithOverflow(const SizeOffsetType &Data) {
+ if (Data.second.isNegative() || Data.first.ult(Data.second))
+ return APInt(Data.first.getBitWidth(), 0);
+ return Data.first - Data.second;
+}
/// \brief Compute the size of the object pointed by Ptr. Returns true and the
/// object size in Size if successful, and false otherwise.
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
- const TargetLibraryInfo *TLI, bool RoundToAlign) {
- ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), RoundToAlign);
+ const TargetLibraryInfo *TLI, bool RoundToAlign,
+ llvm::ObjSizeMode Mode) {
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(),
+ RoundToAlign, Mode);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
- APInt ObjSize = Data.first, Offset = Data.second;
- // check for overflow
- if (Offset.slt(0) || ObjSize.ult(Offset))
- Size = 0;
- else
- Size = (ObjSize - Offset).getZExtValue();
+ Size = getSizeWithOverflow(Data).getZExtValue();
return true;
}
-
STATISTIC(ObjectVisitorArgument,
"Number of arguments with unsolved size and offset");
STATISTIC(ObjectVisitorLoad,
@@ -376,15 +397,16 @@ STATISTIC(ObjectVisitorLoad,
APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
if (RoundToAlign && Align)
- return APInt(IntTyBits, RoundUpToAlignment(Size.getZExtValue(), Align));
+ return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align));
return Size;
}
ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL,
const TargetLibraryInfo *TLI,
LLVMContext &Context,
- bool RoundToAlign)
- : DL(DL), TLI(TLI), RoundToAlign(RoundToAlign) {
+ bool RoundToAlign,
+ ObjSizeMode Mode)
+ : DL(DL), TLI(TLI), RoundToAlign(RoundToAlign), Mode(Mode) {
// Pointer size must be rechecked for each object visited since it could have
// a different address space.
}
@@ -443,7 +465,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
- // no interprocedural analysis is done at the moment
+ // No interprocedural analysis is done at the moment.
if (!A.hasByValOrInAllocaAttr()) {
++ObjectVisitorArgument;
return unknown();
@@ -454,20 +476,21 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
- TLI);
+ Optional<AllocFnsTy> FnData =
+ getAllocationData(CS.getInstruction(), AnyAlloc, TLI);
if (!FnData)
return unknown();
- // handle strdup-like functions separately
+ // Handle strdup-like functions separately.
if (FnData->AllocTy == StrDupLike) {
APInt Size(IntTyBits, GetStringLength(CS.getArgument(0)));
if (!Size)
return unknown();
- // strndup limits strlen
+ // Strndup limits strlen.
if (FnData->FstParam > 0) {
- ConstantInt *Arg= dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+ ConstantInt *Arg =
+ dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
if (!Arg)
return unknown();
@@ -482,8 +505,26 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
if (!Arg)
return unknown();
- APInt Size = Arg->getValue().zextOrSelf(IntTyBits);
- // size determined by just 1 parameter
+ // When we're compiling N-bit code, and the user uses parameters that are
+ // greater than N bits (e.g. uint64_t on a 32-bit build), we can run into
+ // trouble with APInt size issues. This function handles resizing + overflow
+ // checks for us.
+ auto CheckedZextOrTrunc = [&](APInt &I) {
+ // More bits than we can handle. Checking the bit width isn't necessary, but
+ // it's faster than checking active bits, and should give `false` in the
+ // vast majority of cases.
+ if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
+ return false;
+ if (I.getBitWidth() != IntTyBits)
+ I = I.zextOrTrunc(IntTyBits);
+ return true;
+ };
+
+ APInt Size = Arg->getValue();
+ if (!CheckedZextOrTrunc(Size))
+ return unknown();
+
+ // Size is determined by just 1 parameter.
if (FnData->SndParam < 0)
return std::make_pair(Size, Zero);
@@ -491,8 +532,13 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
if (!Arg)
return unknown();
- Size *= Arg->getValue().zextOrSelf(IntTyBits);
- return std::make_pair(Size, Zero);
+ APInt NumElems = Arg->getValue();
+ if (!CheckedZextOrTrunc(NumElems))
+ return unknown();
+
+ bool Overflow;
+ Size = Size.umul_ov(NumElems, Overflow);
+ return Overflow ? unknown() : std::make_pair(Size, Zero);
// TODO: handle more standard functions (+ wchar cousins):
// - strdup / strndup
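// A small demonstration of the overflow discipline the calloc-style path above
// now follows: normalize both operands to the index width first, then multiply
// with an explicit overflow flag and give up rather than silently wrap. This is
// an illustrative helper, not part of the patch.
#include "llvm/ADT/APInt.h"

using namespace llvm;

// Returns true and sets Product = Size * NumElems when the product fits in
// IntTyBits bits; returns false (treated as "unknown" by the caller) otherwise.
static bool checkedAllocSize(APInt Size, APInt NumElems, unsigned IntTyBits,
                             APInt &Product) {
  auto Normalize = [&](APInt &I) {
    if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
      return false;                  // the value itself does not fit
    if (I.getBitWidth() != IntTyBits)
      I = I.zextOrTrunc(IntTyBits);  // widen or truncate to the index width
    return true;
  };
  if (!Normalize(Size) || !Normalize(NumElems))
    return false;
  bool Overflow;
  Product = Size.umul_ov(NumElems, Overflow);
  return !Overflow;
}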
@@ -529,7 +575,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalAlias(GlobalAlias &GA) {
- if (GA.mayBeOverridden())
+ if (GA.isInterposable())
return unknown();
return compute(GA.getAliasee());
}
@@ -560,8 +606,28 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) {
SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
SizeOffsetType TrueSide = compute(I.getTrueValue());
SizeOffsetType FalseSide = compute(I.getFalseValue());
- if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide)
- return TrueSide;
+ if (bothKnown(TrueSide) && bothKnown(FalseSide)) {
+ if (TrueSide == FalseSide) {
+ return TrueSide;
+ }
+
+ APInt TrueResult = getSizeWithOverflow(TrueSide);
+ APInt FalseResult = getSizeWithOverflow(FalseSide);
+
+ if (TrueResult == FalseResult) {
+ return TrueSide;
+ }
+ if (Mode == ObjSizeMode::Min) {
+ if (TrueResult.slt(FalseResult))
+ return TrueSide;
+ return FalseSide;
+ }
+ if (Mode == ObjSizeMode::Max) {
+ if (TrueResult.sgt(FalseResult))
+ return TrueSide;
+ return FalseSide;
+ }
+ }
return unknown();
}
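// A hedged sketch of querying object size with the new mode parameter, following
// the updated llvm::getObjectSize() signature above. With ObjSizeMode::Min the
// select handling picks the smaller arm, so the returned size is a conservative
// lower bound; ObjSizeMode::Max picks the larger arm. DL and TLI are assumed to
// describe the module being analyzed.
#include "llvm/Analysis/MemoryBuiltins.h"

using namespace llvm;

static bool lowerBoundObjectSize(const Value *Ptr, const DataLayout &DL,
                                 const TargetLibraryInfo *TLI, uint64_t &Size) {
  return getObjectSize(Ptr, Size, DL, TLI, /*RoundToAlign=*/false,
                       ObjSizeMode::Min);
}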
@@ -591,11 +657,11 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
SizeOffsetEvalType Result = compute_(V);
if (!bothKnown(Result)) {
- // erase everything that was computed in this iteration from the cache, so
+ // Erase everything that was computed in this iteration from the cache, so
// that no dangling references are left behind. We could be a bit smarter if
// we kept a dependency graph. It's probably not worth the complexity.
- for (PtrSetTy::iterator I=SeenVals.begin(), E=SeenVals.end(); I != E; ++I) {
- CacheMapTy::iterator CacheIt = CacheMap.find(*I);
+ for (const Value *SeenVal : SeenVals) {
+ CacheMapTy::iterator CacheIt = CacheMap.find(SeenVal);
// non-computable results can be safely cached
if (CacheIt != CacheMap.end() && anyKnown(CacheIt->second))
CacheMap.erase(CacheIt);
@@ -615,18 +681,18 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
V = V->stripPointerCasts();
- // check cache
+ // Check cache.
CacheMapTy::iterator CacheIt = CacheMap.find(V);
if (CacheIt != CacheMap.end())
return CacheIt->second;
- // always generate code immediately before the instruction being
- // processed, so that the generated code dominates the same BBs
+ // Always generate code immediately before the instruction being
+ // processed, so that the generated code dominates the same BBs.
BuilderTy::InsertPointGuard Guard(Builder);
if (Instruction *I = dyn_cast<Instruction>(V))
Builder.SetInsertPoint(I);
- // now compute the size and offset
+ // Now compute the size and offset.
SizeOffsetEvalType Result;
// Record the pointers that were handled in this run, so that they can be
@@ -643,7 +709,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
cast<ConstantExpr>(V)->getOpcode() == Instruction::IntToPtr) ||
isa<GlobalAlias>(V) ||
isa<GlobalVariable>(V)) {
- // ignore values where we cannot do more than what ObjectSizeVisitor can
+ // Ignore values where we cannot do more than ObjectSizeVisitor.
Result = unknown();
} else {
DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: "
@@ -670,12 +736,12 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
- TLI);
+ Optional<AllocFnsTy> FnData =
+ getAllocationData(CS.getInstruction(), AnyAlloc, TLI);
if (!FnData)
return unknown();
- // handle strdup-like functions separately
+ // Handle strdup-like functions separately.
if (FnData->AllocTy == StrDupLike) {
// TODO
return unknown();
@@ -731,14 +797,14 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) {
- // create 2 PHIs: one for size and another for offset
+ // Create 2 PHIs: one for size and another for offset.
PHINode *SizePHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
PHINode *OffsetPHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
- // insert right away in the cache to handle recursive PHIs
+ // Insert right away in the cache to handle recursive PHIs.
CacheMap[&PHI] = std::make_pair(SizePHI, OffsetPHI);
- // compute offset/size for each PHI incoming pointer
+ // Compute offset/size for each PHI incoming pointer.
for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) {
Builder.SetInsertPoint(&*PHI.getIncomingBlock(i)->getFirstInsertionPt());
SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i));
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 6918360536a3..33499334fefa 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -45,8 +45,7 @@ STATISTIC(NumCacheNonLocalPtr,
"Number of fully cached non-local ptr responses");
STATISTIC(NumCacheDirtyNonLocalPtr,
"Number of cached, but dirty, non-local ptr responses");
-STATISTIC(NumUncacheNonLocalPtr,
- "Number of uncached non-local ptr responses");
+STATISTIC(NumUncacheNonLocalPtr, "Number of uncached non-local ptr responses");
STATISTIC(NumCacheCompleteNonLocalPtr,
"Number of block queries that were completely cached");
@@ -57,75 +56,35 @@ static cl::opt<unsigned> BlockScanLimit(
cl::desc("The number of instructions to scan in a block in memory "
"dependency analysis (default = 100)"));
+static cl::opt<unsigned>
+ BlockNumberLimit("memdep-block-number-limit", cl::Hidden, cl::init(1000),
+ cl::desc("The number of blocks to scan during memory "
+ "dependency analysis (default = 1000)"));
+
// Limit on the number of memdep results to process.
static const unsigned int NumResultsLimit = 100;
-char MemoryDependenceAnalysis::ID = 0;
-
-// Register this pass...
-INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep",
- "Memory Dependence Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep",
- "Memory Dependence Analysis", false, true)
-
-MemoryDependenceAnalysis::MemoryDependenceAnalysis()
- : FunctionPass(ID) {
- initializeMemoryDependenceAnalysisPass(*PassRegistry::getPassRegistry());
-}
-MemoryDependenceAnalysis::~MemoryDependenceAnalysis() {
-}
-
-/// Clean up memory in between runs
-void MemoryDependenceAnalysis::releaseMemory() {
- LocalDeps.clear();
- NonLocalDeps.clear();
- NonLocalPointerDeps.clear();
- ReverseLocalDeps.clear();
- ReverseNonLocalDeps.clear();
- ReverseNonLocalPtrDeps.clear();
- PredCache.clear();
-}
-
-/// getAnalysisUsage - Does not modify anything. It uses Alias Analysis.
+/// This is a helper function that removes Val from 'Inst's set in ReverseMap.
///
-void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredTransitive<AAResultsWrapperPass>();
- AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>();
-}
-
-bool MemoryDependenceAnalysis::runOnFunction(Function &F) {
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return false;
-}
-
-/// RemoveFromReverseMap - This is a helper function that removes Val from
-/// 'Inst's set in ReverseMap. If the set becomes empty, remove Inst's entry.
+/// If the set becomes empty, remove Inst's entry.
template <typename KeyTy>
-static void RemoveFromReverseMap(DenseMap<Instruction*,
- SmallPtrSet<KeyTy, 4> > &ReverseMap,
- Instruction *Inst, KeyTy Val) {
- typename DenseMap<Instruction*, SmallPtrSet<KeyTy, 4> >::iterator
- InstIt = ReverseMap.find(Inst);
+static void
+RemoveFromReverseMap(DenseMap<Instruction *, SmallPtrSet<KeyTy, 4>> &ReverseMap,
+ Instruction *Inst, KeyTy Val) {
+ typename DenseMap<Instruction *, SmallPtrSet<KeyTy, 4>>::iterator InstIt =
+ ReverseMap.find(Inst);
assert(InstIt != ReverseMap.end() && "Reverse map out of sync?");
bool Found = InstIt->second.erase(Val);
- assert(Found && "Invalid reverse map!"); (void)Found;
+ assert(Found && "Invalid reverse map!");
+ (void)Found;
if (InstIt->second.empty())
ReverseMap.erase(InstIt);
}
-/// GetLocation - If the given instruction references a specific memory
-/// location, fill in Loc with the details, otherwise set Loc.Ptr to null.
-/// Return a ModRefInfo value describing the general behavior of the
+/// If the given instruction references a specific memory location, fill in Loc
+/// with the details, otherwise set Loc.Ptr to null.
+///
+/// Returns a ModRefInfo value describing the general behavior of the
/// instruction.
static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
const TargetLibraryInfo &TLI) {
@@ -134,7 +93,7 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
Loc = MemoryLocation::get(LI);
return MRI_Ref;
}
- if (LI->getOrdering() == Monotonic) {
+ if (LI->getOrdering() == AtomicOrdering::Monotonic) {
Loc = MemoryLocation::get(LI);
return MRI_ModRef;
}
@@ -147,7 +106,7 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
Loc = MemoryLocation::get(SI);
return MRI_Mod;
}
- if (SI->getOrdering() == Monotonic) {
+ if (SI->getOrdering() == AtomicOrdering::Monotonic) {
Loc = MemoryLocation::get(SI);
return MRI_ModRef;
}
@@ -201,11 +160,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
return MRI_NoModRef;
}
-/// getCallSiteDependencyFrom - Private helper for finding the local
-/// dependencies of a call site.
-MemDepResult MemoryDependenceAnalysis::
-getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
- BasicBlock::iterator ScanIt, BasicBlock *BB) {
+/// Private helper for finding the local dependencies of a call site.
+MemDepResult MemoryDependenceResults::getCallSiteDependencyFrom(
+ CallSite CS, bool isReadOnlyCall, BasicBlock::iterator ScanIt,
+ BasicBlock *BB) {
unsigned Limit = BlockScanLimit;
// Walk backwards through the block, looking for dependencies
@@ -220,19 +178,20 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
// If this inst is a memory op, get the pointer it accessed
MemoryLocation Loc;
- ModRefInfo MR = GetLocation(Inst, Loc, *TLI);
+ ModRefInfo MR = GetLocation(Inst, Loc, TLI);
if (Loc.Ptr) {
// A simple instruction.
- if (AA->getModRefInfo(CS, Loc) != MRI_NoModRef)
+ if (AA.getModRefInfo(CS, Loc) != MRI_NoModRef)
return MemDepResult::getClobber(Inst);
continue;
}
if (auto InstCS = CallSite(Inst)) {
// Debug intrinsics don't cause dependences.
- if (isa<DbgInfoIntrinsic>(Inst)) continue;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
// If these two calls do not interfere, look past it.
- switch (AA->getModRefInfo(CS, InstCS)) {
+ switch (AA.getModRefInfo(CS, InstCS)) {
case MRI_NoModRef:
// If the two calls are the same, return InstCS as a Def, so that
// CS can be found redundant and eliminated.
@@ -261,8 +220,8 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
return MemDepResult::getNonFuncLocal();
}
-/// isLoadLoadClobberIfExtendedToFullWidth - Return true if LI is a load that
-/// would fully overlap MemLoc if done as a wider legal integer load.
+/// Return true if LI is a load that would fully overlap MemLoc if done as
+/// a wider legal integer load.
///
/// MemLocBase, MemLocOffset are lazily computed here the first time the
/// base/offs of memloc is needed.
@@ -276,23 +235,17 @@ static bool isLoadLoadClobberIfExtendedToFullWidth(const MemoryLocation &MemLoc,
if (!MemLocBase)
MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, DL);
- unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+ unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
MemLocBase, MemLocOffs, MemLoc.Size, LI);
return Size != 0;
}
-/// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that
-/// looks at a memory location for a load (specified by MemLocBase, Offs,
-/// and Size) and compares it against a load. If the specified load could
-/// be safely widened to a larger integer load that is 1) still efficient,
-/// 2) safe for the target, and 3) would provide the specified memory
-/// location value, then this function returns the size in bytes of the
-/// load width to use. If not, this returns zero.
-unsigned MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+unsigned MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
const Value *MemLocBase, int64_t MemLocOffs, unsigned MemLocSize,
const LoadInst *LI) {
// We can only extend simple integer loads.
- if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) return 0;
+ if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
+ return 0;
// Load widening is hostile to ThreadSanitizer: it may cause false positives
// or make the reports more cryptic (access sizes are wrong).
@@ -308,7 +261,8 @@ unsigned MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
// If the two pointers are not based on the same pointer, we can't tell that
// they are related.
- if (LIBase != MemLocBase) return 0;
+ if (LIBase != MemLocBase)
+ return 0;
// Okay, the two values are based on the same pointer, but returned as
// no-alias. This happens when we have things like two byte loads at "P+1"
@@ -317,7 +271,8 @@ unsigned MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
// the bits required by MemLoc.
// If MemLoc is before LI, then no widening of LI will help us out.
- if (MemLocOffs < LIOffs) return 0;
+ if (MemLocOffs < LIOffs)
+ return 0;
// Get the alignment of the load in bytes. We assume that it is safe to load
// any legal integer up to this size without a problem. For example, if we're
@@ -326,21 +281,22 @@ unsigned MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
// to i16.
unsigned LoadAlign = LI->getAlignment();
- int64_t MemLocEnd = MemLocOffs+MemLocSize;
+ int64_t MemLocEnd = MemLocOffs + MemLocSize;
// If no amount of rounding up will let MemLoc fit into LI, then bail out.
- if (LIOffs+LoadAlign < MemLocEnd) return 0;
+ if (LIOffs + LoadAlign < MemLocEnd)
+ return 0;
// This is the size of the load to try. Start with the next larger power of
// two.
- unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits()/8U;
+ unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
while (1) {
// If this load size is bigger than our known alignment or would not fit
// into a native integer register, then we fail.
if (NewLoadByteSize > LoadAlign ||
- !DL.fitsInLegalInteger(NewLoadByteSize*8))
+ !DL.fitsInLegalInteger(NewLoadByteSize * 8))
return 0;
if (LIOffs + NewLoadByteSize > MemLocEnd &&
@@ -352,7 +308,7 @@ unsigned MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
return 0;
// If a load of this width would include all of MemLoc, then we succeed.
- if (LIOffs+NewLoadByteSize >= MemLocEnd)
+ if (LIOffs + NewLoadByteSize >= MemLocEnd)
return NewLoadByteSize;
NewLoadByteSize <<= 1;
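// A compact numeric walk-through of the widening search above, as a standalone
// sketch (the fitsInLegalInteger and sanitizer checks are omitted). Given an i8
// load at offset 0 with alignment 4 and a queried location ending at offset 4,
// the candidate width doubles 2 -> 4 and a 4-byte load is reported as covering
// MemLoc.
#include <cstdint>

static unsigned widenedLoadBytes(unsigned LoadBytes, unsigned LoadAlign,
                                 int64_t LIOffs, int64_t MemLocEnd) {
  if (LIOffs + LoadAlign < MemLocEnd)
    return 0;                         // no amount of widening can help
  unsigned W = 1;
  while (W <= LoadBytes)              // next power of two above the load size
    W <<= 1;
  for (; W <= LoadAlign; W <<= 1)     // never exceed the known alignment
    if (LIOffs + W >= MemLocEnd)
      return W;                       // this width fully covers MemLoc
  return 0;
}
// e.g. widenedLoadBytes(/*i8*/ 1, /*align*/ 4, /*LIOffs*/ 0, /*MemLocEnd*/ 4) == 4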
@@ -369,14 +325,7 @@ static bool isVolatile(Instruction *Inst) {
return false;
}
-
-/// getPointerDependencyFrom - Return the instruction on which a memory
-/// location depends. If isLoad is true, this routine ignores may-aliases with
-/// read-only operations. If isLoad is false, this routine ignores may-aliases
-/// with reads from read-only locations. If possible, pass the query
-/// instruction as well; this function may take advantage of the metadata
-/// annotated to the query instruction to refine the result.
-MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom(
+MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
BasicBlock *BB, Instruction *QueryInst) {
@@ -393,7 +342,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom(
}
MemDepResult
-MemoryDependenceAnalysis::getInvariantGroupPointerDependency(LoadInst *LI,
+MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
BasicBlock *BB) {
Value *LoadOperand = LI->getPointerOperand();
// It is not safe to walk the use list of a global value, because function
@@ -416,21 +365,19 @@ MemoryDependenceAnalysis::getInvariantGroupPointerDependency(LoadInst *LI,
continue;
if (auto *BCI = dyn_cast<BitCastInst>(Ptr)) {
- if (!Seen.count(BCI->getOperand(0))) {
+ if (Seen.insert(BCI->getOperand(0)).second) {
LoadOperandsQueue.push_back(BCI->getOperand(0));
- Seen.insert(BCI->getOperand(0));
}
}
for (Use &Us : Ptr->uses()) {
auto *U = dyn_cast<Instruction>(Us.getUser());
- if (!U || U == LI || !DT->dominates(U, LI))
+ if (!U || U == LI || !DT.dominates(U, LI))
continue;
if (auto *BCI = dyn_cast<BitCastInst>(U)) {
- if (!Seen.count(BCI)) {
+ if (Seen.insert(BCI).second) {
LoadOperandsQueue.push_back(BCI);
- Seen.insert(BCI);
}
continue;
}
@@ -445,7 +392,7 @@ MemoryDependenceAnalysis::getInvariantGroupPointerDependency(LoadInst *LI,
return Result;
}
-MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
+MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
BasicBlock *BB, Instruction *QueryInst) {
@@ -455,7 +402,7 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
bool isInvariantLoad = false;
// We must be careful with atomic accesses, as they may allow another thread
- // to touch this location, cloberring it. We are conservative: if the
+ // to touch this location, clobbering it. We are conservative: if the
// QueryInst is not a simple (non-atomic) memory access, we automatically
// return getClobber.
// If it is simple, we know based on the results of
@@ -476,9 +423,9 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// with synchronization from 1 to 2 and from 3 to 4, resulting in %val
// being 42. A key property of this program however is that if either
// 1 or 4 were missing, there would be a race between the store of 42 and
- // either the store of 0 or the load (making the whole progam racy).
+ // either the store of 0 or the load (making the whole program racy).
// The paper mentioned above shows that the same property is respected
- // by every program that can detect any optimisation of that kind: either
+ // by every program that can detect any optimization of that kind: either
// it is racy (undefined) or there is a release followed by an acquire
// between the pair of accesses under consideration.
@@ -500,13 +447,30 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// AliasAnalysis::callCapturesBefore.
OrderedBasicBlock OBB(BB);
+ // Return "true" if and only if the instruction I is either a non-simple
+ // load or a non-simple store.
+ auto isNonSimpleLoadOrStore = [](Instruction *I) -> bool {
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return !LI->isSimple();
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return !SI->isSimple();
+ return false;
+ };
+
+ // Return "true" if I is not a load and not a store, but it does access
+ // memory.
+ auto isOtherMemAccess = [](Instruction *I) -> bool {
+ return !isa<LoadInst>(I) && !isa<StoreInst>(I) && I->mayReadOrWriteMemory();
+ };
+
// Walk backwards through the basic block, looking for dependencies.
while (ScanIt != BB->begin()) {
Instruction *Inst = &*--ScanIt;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
// Debug intrinsics don't (and can't) cause dependencies.
- if (isa<DbgInfoIntrinsic>(II)) continue;
+ if (isa<DbgInfoIntrinsic>(II))
+ continue;
// Limit the amount of scanning we do so we don't end up with quadratic
// running time on extreme testcases.
@@ -522,17 +486,17 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// pointer, not on query pointers that are indexed off of them. It'd
// be nice to handle that at some point (the right approach is to use
// GetPointerBaseWithConstantOffset).
- if (AA->isMustAlias(MemoryLocation(II->getArgOperand(1)), MemLoc))
+ if (AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), MemLoc))
return MemDepResult::getDef(II);
continue;
}
}
- // Values depend on loads if the pointers are must aliased. This means that
- // a load depends on another must aliased load from the same value.
- // One exception is atomic loads: a value can depend on an atomic load that it
- // does not alias with when this atomic load indicates that another thread may
- // be accessing the location.
+ // Values depend on loads if the pointers are must aliased. This means
+ // that a load depends on another must aliased load from the same value.
+ // One exception is atomic loads: a value can depend on an atomic load that
+ // it does not alias with when this atomic load indicates that another
+ // thread may be accessing the location.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
// While volatile access cannot be eliminated, they do not have to clobber
@@ -547,30 +511,23 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
return MemDepResult::getClobber(LI);
// Otherwise, volatile doesn't imply any special ordering
}
-
+
// Atomic loads have complications involved.
- // A Monotonic (or higher) load is OK if the query inst is itself not atomic.
+ // A Monotonic (or higher) load is OK if the query inst is itself not
+ // atomic.
// FIXME: This is overly conservative.
- if (LI->isAtomic() && LI->getOrdering() > Unordered) {
- if (!QueryInst)
- return MemDepResult::getClobber(LI);
- if (LI->getOrdering() != Monotonic)
+ if (LI->isAtomic() && isStrongerThanUnordered(LI->getOrdering())) {
+ if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) ||
+ isOtherMemAccess(QueryInst))
return MemDepResult::getClobber(LI);
- if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst)) {
- if (!QueryLI->isSimple())
- return MemDepResult::getClobber(LI);
- } else if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst)) {
- if (!QuerySI->isSimple())
- return MemDepResult::getClobber(LI);
- } else if (QueryInst->mayReadOrWriteMemory()) {
+ if (LI->getOrdering() != AtomicOrdering::Monotonic)
return MemDepResult::getClobber(LI);
- }
}
MemoryLocation LoadLoc = MemoryLocation::get(LI);
// If we found a pointer, check if it could be the same as our pointer.
- AliasResult R = AA->alias(LoadLoc, MemLoc);
+ AliasResult R = AA.alias(LoadLoc, MemLoc);
if (isLoad) {
if (R == NoAlias) {
@@ -614,7 +571,7 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
continue;
// Stores don't alias loads from read-only memory.
- if (AA->pointsToConstantMemory(LoadLoc))
+ if (AA.pointsToConstantMemory(LoadLoc))
continue;
// Stores depend on may/must aliased loads.
@@ -625,20 +582,12 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// Atomic stores have complications involved.
// A Monotonic store is OK if the query inst is itself not atomic.
// FIXME: This is overly conservative.
- if (!SI->isUnordered()) {
- if (!QueryInst)
- return MemDepResult::getClobber(SI);
- if (SI->getOrdering() != Monotonic)
+ if (!SI->isUnordered() && SI->isAtomic()) {
+ if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) ||
+ isOtherMemAccess(QueryInst))
return MemDepResult::getClobber(SI);
- if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst)) {
- if (!QueryLI->isSimple())
- return MemDepResult::getClobber(SI);
- } else if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst)) {
- if (!QuerySI->isSimple())
- return MemDepResult::getClobber(SI);
- } else if (QueryInst->mayReadOrWriteMemory()) {
+ if (SI->getOrdering() != AtomicOrdering::Monotonic)
return MemDepResult::getClobber(SI);
- }
}
// FIXME: this is overly conservative.
@@ -646,12 +595,14 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// non-aliasing locations, as normal accesses can for example be reordered
// with volatile accesses.
if (SI->isVolatile())
- return MemDepResult::getClobber(SI);
+ if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) ||
+ isOtherMemAccess(QueryInst))
+ return MemDepResult::getClobber(SI);
// If alias analysis can tell that this store is guaranteed to not modify
// the query pointer, ignore it. Use getModRefInfo to handle cases where
// the query pointer points to constant memory etc.
- if (AA->getModRefInfo(SI, MemLoc) == MRI_NoModRef)
+ if (AA.getModRefInfo(SI, MemLoc) == MRI_NoModRef)
continue;
// Ok, this store might clobber the query pointer. Check to see if it is
@@ -659,50 +610,46 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
MemoryLocation StoreLoc = MemoryLocation::get(SI);
// If we found a pointer, check if it could be the same as our pointer.
- AliasResult R = AA->alias(StoreLoc, MemLoc);
+ AliasResult R = AA.alias(StoreLoc, MemLoc);
if (R == NoAlias)
continue;
if (R == MustAlias)
return MemDepResult::getDef(Inst);
if (isInvariantLoad)
- continue;
+ continue;
return MemDepResult::getClobber(Inst);
}
// If this is an allocation, and if we know that the accessed pointer is to
// the allocation, return Def. This means that there is no dependence and
// the access can be optimized based on that. For example, a load could
- // turn into undef.
- // Note: Only determine this to be a malloc if Inst is the malloc call, not
- // a subsequent bitcast of the malloc call result. There can be stores to
- // the malloced memory between the malloc call and its bitcast uses, and we
- // need to continue scanning until the malloc call.
- if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) {
+ // turn into undef. Note that we can bypass the allocation itself when
+ // looking for a clobber in many cases; that's an alias property and is
+ // handled by BasicAA.
+ if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, &TLI)) {
const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, DL);
-
- if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
+ if (AccessPtr == Inst || AA.isMustAlias(Inst, AccessPtr))
return MemDepResult::getDef(Inst);
- if (isInvariantLoad)
- continue;
- // Be conservative if the accessed pointer may alias the allocation -
- // fallback to the generic handling below.
- if ((AA->alias(Inst, AccessPtr) == NoAlias) &&
- // If the allocation is not aliased and does not read memory (like
- // strdup), it is safe to ignore.
- (isa<AllocaInst>(Inst) || isMallocLikeFn(Inst, TLI) ||
- isCallocLikeFn(Inst, TLI)))
- continue;
}
if (isInvariantLoad)
- continue;
+ continue;
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads or stores 'before' the
+ // fence. As a result, we look past it when finding a dependency for
+ // loads. DSE uses this to find preceding stores to delete and thus we
+ // can't bypass the fence if the query instruction is a store.
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (isLoad && FI->getOrdering() == AtomicOrdering::Release)
+ continue;
// See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
- ModRefInfo MR = AA->getModRefInfo(Inst, MemLoc);
+ ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc);
// If necessary, perform additional analysis.
if (MR == MRI_ModRef)
- MR = AA->callCapturesBefore(Inst, MemLoc, DT, &OBB);
+ MR = AA.callCapturesBefore(Inst, MemLoc, &DT, &OBB);
switch (MR) {
case MRI_NoModRef:
// If the call has no effect on the queried pointer, just ignore it.
@@ -727,9 +674,7 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
return MemDepResult::getNonFuncLocal();
}
-/// getDependency - Return the instruction on which a memory operation
-/// depends.
-MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
+MemDepResult MemoryDependenceResults::getDependency(Instruction *QueryInst) {
Instruction *ScanPos = QueryInst;
// Check for a cached result
@@ -760,7 +705,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
LocalCache = MemDepResult::getNonFuncLocal();
} else {
MemoryLocation MemLoc;
- ModRefInfo MR = GetLocation(QueryInst, MemLoc, *TLI);
+ ModRefInfo MR = GetLocation(QueryInst, MemLoc, TLI);
if (MemLoc.Ptr) {
// If we can do a pointer scan, make it happen.
bool isLoad = !(MR & MRI_Mod);
@@ -771,7 +716,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
MemLoc, isLoad, ScanPos->getIterator(), QueryParent, QueryInst);
} else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) {
CallSite QueryCS(QueryInst);
- bool isReadOnly = AA->onlyReadsMemory(QueryCS);
+ bool isReadOnly = AA.onlyReadsMemory(QueryCS);
LocalCache = getCallSiteDependencyFrom(
QueryCS, isReadOnly, ScanPos->getIterator(), QueryParent);
} else
@@ -787,40 +732,29 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
}
#ifndef NDEBUG
-/// AssertSorted - This method is used when -debug is specified to verify that
-/// cache arrays are properly kept sorted.
-static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
+/// This method is used when -debug is specified to verify that cache arrays
+/// are properly kept sorted.
+static void AssertSorted(MemoryDependenceResults::NonLocalDepInfo &Cache,
int Count = -1) {
- if (Count == -1) Count = Cache.size();
+ if (Count == -1)
+ Count = Cache.size();
assert(std::is_sorted(Cache.begin(), Cache.begin() + Count) &&
"Cache isn't sorted!");
}
#endif
-/// getNonLocalCallDependency - Perform a full dependency query for the
-/// specified call, returning the set of blocks that the value is
-/// potentially live across. The returned set of results will include a
-/// "NonLocal" result for all blocks where the value is live across.
-///
-/// This method assumes the instruction returns a "NonLocal" dependency
-/// within its own block.
-///
-/// This returns a reference to an internal data structure that may be
-/// invalidated on the next non-local query or when an instruction is
-/// removed. Clients must copy this data if they want it around longer than
-/// that.
-const MemoryDependenceAnalysis::NonLocalDepInfo &
-MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
+const MemoryDependenceResults::NonLocalDepInfo &
+MemoryDependenceResults::getNonLocalCallDependency(CallSite QueryCS) {
assert(getDependency(QueryCS.getInstruction()).isNonLocal() &&
- "getNonLocalCallDependency should only be used on calls with non-local deps!");
+ "getNonLocalCallDependency should only be used on calls with "
+ "non-local deps!");
PerInstNLInfo &CacheP = NonLocalDeps[QueryCS.getInstruction()];
NonLocalDepInfo &Cache = CacheP.first;
- /// DirtyBlocks - This is the set of blocks that need to be recomputed. In
- /// the cached case, this can happen due to instructions being deleted etc. In
- /// the uncached case, this starts out as the set of predecessors we care
- /// about.
- SmallVector<BasicBlock*, 32> DirtyBlocks;
+ // This is the set of blocks that need to be recomputed. In the cached case,
+ // this can happen due to instructions being deleted etc. In the uncached
+ // case, this starts out as the set of predecessors we care about.
+ SmallVector<BasicBlock *, 32> DirtyBlocks;
if (!Cache.empty()) {
// Okay, we have a cache entry. If we know it is not dirty, just return it
@@ -832,16 +766,15 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
// If we already have a partially computed set of results, scan them to
// determine what is dirty, seeding our initial DirtyBlocks worklist.
- for (NonLocalDepInfo::iterator I = Cache.begin(), E = Cache.end();
- I != E; ++I)
- if (I->getResult().isDirty())
- DirtyBlocks.push_back(I->getBB());
+ for (auto &Entry : Cache)
+ if (Entry.getResult().isDirty())
+ DirtyBlocks.push_back(Entry.getBB());
// Sort the cache so that we can do fast binary search lookups below.
std::sort(Cache.begin(), Cache.end());
++NumCacheDirtyNonLocal;
- //cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: "
+ // cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: "
// << Cache.size() << " cached: " << *QueryInst;
} else {
// Seed DirtyBlocks with each of the preds of QueryInst's block.
@@ -852,9 +785,9 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
}
// isReadonlyCall - If this is a read-only call, we can be more aggressive.
- bool isReadonlyCall = AA->onlyReadsMemory(QueryCS);
+ bool isReadonlyCall = AA.onlyReadsMemory(QueryCS);
- SmallPtrSet<BasicBlock*, 64> Visited;
+ SmallPtrSet<BasicBlock *, 32> Visited;
unsigned NumSortedEntries = Cache.size();
DEBUG(AssertSorted(Cache));
@@ -872,13 +805,13 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
// the cache set. If so, find it.
DEBUG(AssertSorted(Cache, NumSortedEntries));
NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache.begin(), Cache.begin()+NumSortedEntries,
- NonLocalDepEntry(DirtyBB));
+ std::upper_bound(Cache.begin(), Cache.begin() + NumSortedEntries,
+ NonLocalDepEntry(DirtyBB));
if (Entry != Cache.begin() && std::prev(Entry)->getBB() == DirtyBB)
--Entry;
NonLocalDepEntry *ExistingResult = nullptr;
- if (Entry != Cache.begin()+NumSortedEntries &&
+ if (Entry != Cache.begin() + NumSortedEntries &&
Entry->getBB() == DirtyBB) {
// If we already have an entry, and if it isn't already dirty, the block
// is done.
@@ -905,7 +838,8 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
MemDepResult Dep;
if (ScanPos != DirtyBB->begin()) {
- Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB);
+ Dep =
+ getCallSiteDependencyFrom(QueryCS, isReadonlyCall, ScanPos, DirtyBB);
} else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) {
// No dependence found. If this is the entry block of the function, it is
// a clobber, otherwise it is unknown.
@@ -940,16 +874,8 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
return Cache;
}
-/// getNonLocalPointerDependency - Perform a full dependency query for an
-/// access to the specified (non-volatile) memory location, returning the
-/// set of instructions that either define or clobber the value.
-///
-/// This method assumes the pointer has a "NonLocal" dependency within its
-/// own block.
-///
-void MemoryDependenceAnalysis::
-getNonLocalPointerDependency(Instruction *QueryInst,
- SmallVectorImpl<NonLocalDepResult> &Result) {
+void MemoryDependenceResults::getNonLocalPointerDependency(
+ Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) {
const MemoryLocation Loc = MemoryLocation::get(QueryInst);
bool isLoad = isa<LoadInst>(QueryInst);
BasicBlock *FromBB = QueryInst->getParent();
@@ -958,7 +884,7 @@ getNonLocalPointerDependency(Instruction *QueryInst,
assert(Loc.Ptr->getType()->isPointerTy() &&
"Can't get pointer deps of a non-pointer!");
Result.clear();
-
+
// This routine does not expect to deal with volatile instructions.
// Doing so would require piping through the QueryInst all the way through.
// TODO: volatiles can't be elided, but they can be reordered with other
@@ -976,46 +902,44 @@ getNonLocalPointerDependency(Instruction *QueryInst,
return false;
};
if (isVolatile(QueryInst) || isOrdered(QueryInst)) {
- Result.push_back(NonLocalDepResult(FromBB,
- MemDepResult::getUnknown(),
+ Result.push_back(NonLocalDepResult(FromBB, MemDepResult::getUnknown(),
const_cast<Value *>(Loc.Ptr)));
return;
}
const DataLayout &DL = FromBB->getModule()->getDataLayout();
- PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL, AC);
+ PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL, &AC);
// This is the set of blocks we've inspected, and the pointer we consider in
// each block. Because of critical edges, we currently bail out if querying
// a block with multiple different pointers. This can happen during PHI
// translation.
- DenseMap<BasicBlock*, Value*> Visited;
- if (!getNonLocalPointerDepFromBB(QueryInst, Address, Loc, isLoad, FromBB,
+ DenseMap<BasicBlock *, Value *> Visited;
+ if (getNonLocalPointerDepFromBB(QueryInst, Address, Loc, isLoad, FromBB,
Result, Visited, true))
return;
Result.clear();
- Result.push_back(NonLocalDepResult(FromBB,
- MemDepResult::getUnknown(),
+ Result.push_back(NonLocalDepResult(FromBB, MemDepResult::getUnknown(),
const_cast<Value *>(Loc.Ptr)));
}
-/// GetNonLocalInfoForBlock - Compute the memdep value for BB with
-/// Pointer/PointeeSize using either cached information in Cache or by doing a
-/// lookup (which may use dirty cache info if available). If we do a lookup,
-/// add the result to the cache.
-MemDepResult MemoryDependenceAnalysis::GetNonLocalInfoForBlock(
+/// Compute the memdep value for BB with Pointer/PointeeSize using either
+/// cached information in Cache or by doing a lookup (which may use dirty cache
+/// info if available).
+///
+/// If we do a lookup, add the result to the cache.
+MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
// Do a binary search to see if we already have an entry for this block in
// the cache set. If so, find it.
- NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache->begin(), Cache->begin()+NumSortedEntries,
- NonLocalDepEntry(BB));
- if (Entry != Cache->begin() && (Entry-1)->getBB() == BB)
+ NonLocalDepInfo::iterator Entry = std::upper_bound(
+ Cache->begin(), Cache->begin() + NumSortedEntries, NonLocalDepEntry(BB));
+ if (Entry != Cache->begin() && (Entry - 1)->getBB() == BB)
--Entry;
NonLocalDepEntry *ExistingResult = nullptr;
- if (Entry != Cache->begin()+NumSortedEntries && Entry->getBB() == BB)
+ if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
ExistingResult = &*Entry;
// If we have a cached entry, and it is non-dirty, use it as the value for
@@ -1043,8 +967,8 @@ MemDepResult MemoryDependenceAnalysis::GetNonLocalInfoForBlock(
}
// Scan the block for the dependency.
- MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB,
- QueryInst);
+ MemDepResult Dep =
+ getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
// If we had a dirty entry for the block, update it. Otherwise, just add
// a new entry.
@@ -1068,11 +992,12 @@ MemDepResult MemoryDependenceAnalysis::GetNonLocalInfoForBlock(
return Dep;
}
-/// SortNonLocalDepInfoCache - Sort the NonLocalDepInfo cache, given a certain
-/// number of elements in the array that are already properly ordered. This is
-/// optimized for the case when only a few entries are added.
+/// Sort the NonLocalDepInfo cache, given a certain number of elements in the
+/// array that are already properly ordered.
+///
+/// This is optimized for the case when only a few entries are added.
static void
-SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
+SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache,
unsigned NumSortedEntries) {
switch (Cache.size() - NumSortedEntries) {
case 0:
@@ -1082,8 +1007,8 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
// Two new entries, insert the last one into place.
NonLocalDepEntry Val = Cache.back();
Cache.pop_back();
- MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache.begin(), Cache.end()-1, Val);
+ MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache.begin(), Cache.end() - 1, Val);
Cache.insert(Entry, Val);
// FALL THROUGH.
}
@@ -1092,8 +1017,8 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
if (Cache.size() != 1) {
NonLocalDepEntry Val = Cache.back();
Cache.pop_back();
- MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache.begin(), Cache.end(), Val);
+ MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
+ std::upper_bound(Cache.begin(), Cache.end(), Val);
Cache.insert(Entry, Val);
}
break;
@@ -1104,19 +1029,20 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
}
}
-/// getNonLocalPointerDepFromBB - Perform a dependency query based on
-/// pointer/pointeesize starting at the end of StartBB. Add any clobber/def
-/// results to the results vector and keep track of which blocks are visited in
-/// 'Visited'.
+/// Perform a dependency query based on pointer/pointeesize starting at the end
+/// of StartBB.
+///
+/// Add any clobber/def results to the results vector and keep track of which
+/// blocks are visited in 'Visited'.
///
/// This has special behavior for the first block queries (when SkipFirstBlock
/// is true). In this special case, it ignores the contents of the specified
/// block and starts returning dependence info for its predecessors.
///
-/// This function returns false on success, or true to indicate that it could
+/// This function returns true on success, or false to indicate that it could
/// not compute dependence information for some reason. This should be treated
/// as a clobber dependence on the first instruction in the predecessor block.
-bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
+bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
Instruction *QueryInst, const PHITransAddr &Pointer,
const MemoryLocation &Loc, bool isLoad, BasicBlock *StartBB,
SmallVectorImpl<NonLocalDepResult> &Result,
@@ -1135,7 +1061,7 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// Get the NLPI for CacheKey, inserting one into the map if it doesn't
// already have one.
std::pair<CachedNonLocalPointerInfo::iterator, bool> Pair =
- NonLocalPointerDeps.insert(std::make_pair(CacheKey, InitialNLPI));
+ NonLocalPointerDeps.insert(std::make_pair(CacheKey, InitialNLPI));
NonLocalPointerInfo *CacheInfo = &Pair.first->second;
// If we already have a cache entry for this CacheKey, we may need to do some
@@ -1146,18 +1072,16 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// cached data and proceed with the query at the greater size.
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->Size = Loc.Size;
- for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
- DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
- if (Instruction *Inst = DI->getResult().getInst())
+ for (auto &Entry : CacheInfo->NonLocalDeps)
+ if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
} else if (CacheInfo->Size > Loc.Size) {
// This query's Size is less than the cached one. Conservatively restart
// the query using the greater size.
- return getNonLocalPointerDepFromBB(QueryInst, Pointer,
- Loc.getWithNewSize(CacheInfo->Size),
- isLoad, StartBB, Result, Visited,
- SkipFirstBlock);
+ return getNonLocalPointerDepFromBB(
+ QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad,
+ StartBB, Result, Visited, SkipFirstBlock);
}
// If the query's AATags are inconsistent with the cached one,
@@ -1167,17 +1091,15 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
if (CacheInfo->AATags) {
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->AATags = AAMDNodes();
- for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
- DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
- if (Instruction *Inst = DI->getResult().getInst())
+ for (auto &Entry : CacheInfo->NonLocalDeps)
+ if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
}
if (Loc.AATags)
- return getNonLocalPointerDepFromBB(QueryInst,
- Pointer, Loc.getWithoutAATags(),
- isLoad, StartBB, Result, Visited,
- SkipFirstBlock);
+ return getNonLocalPointerDepFromBB(
+ QueryInst, Pointer, Loc.getWithoutAATags(), isLoad, StartBB, Result,
+ Visited, SkipFirstBlock);
}
}
@@ -1192,37 +1114,33 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// to ensure that if a block in the results set is in the visited set that
// it was for the same pointer query.
if (!Visited.empty()) {
- for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
- I != E; ++I) {
- DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->getBB());
+ for (auto &Entry : *Cache) {
+ DenseMap<BasicBlock *, Value *>::iterator VI =
+ Visited.find(Entry.getBB());
if (VI == Visited.end() || VI->second == Pointer.getAddr())
continue;
- // We have a pointer mismatch in a block. Just return clobber, saying
+ // We have a pointer mismatch in a block. Just return false, saying
// that something was clobbered in this result. We could also do a
// non-fully cached query, but there is little point in doing this.
- return true;
+ return false;
}
}
Value *Addr = Pointer.getAddr();
- for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
- I != E; ++I) {
- Visited.insert(std::make_pair(I->getBB(), Addr));
- if (I->getResult().isNonLocal()) {
+ for (auto &Entry : *Cache) {
+ Visited.insert(std::make_pair(Entry.getBB(), Addr));
+ if (Entry.getResult().isNonLocal()) {
continue;
}
- if (!DT) {
- Result.push_back(NonLocalDepResult(I->getBB(),
- MemDepResult::getUnknown(),
- Addr));
- } else if (DT->isReachableFromEntry(I->getBB())) {
- Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(), Addr));
+ if (DT.isReachableFromEntry(Entry.getBB())) {
+ Result.push_back(
+ NonLocalDepResult(Entry.getBB(), Entry.getResult(), Addr));
}
}
++NumCacheCompleteNonLocalPtr;
- return false;
+ return true;
}
// Otherwise, either this is a new block, a block with an invalid cache
@@ -1234,11 +1152,11 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
else
CacheInfo->Pair = BBSkipFirstBlockPair();
- SmallVector<BasicBlock*, 32> Worklist;
+ SmallVector<BasicBlock *, 32> Worklist;
Worklist.push_back(StartBB);
// PredList used inside loop.
- SmallVector<std::pair<BasicBlock*, PHITransAddr>, 16> PredList;
+ SmallVector<std::pair<BasicBlock *, PHITransAddr>, 16> PredList;
// Keep track of the entries that we know are sorted. Previously cached
// entries will all be sorted. The entries we add we only sort on demand (we
@@ -1246,6 +1164,8 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// won't get any reuse from currently inserted values, because we don't
// revisit blocks after we insert info for them.
unsigned NumSortedEntries = Cache->size();
+ unsigned WorklistEntries = BlockNumberLimit;
+ bool GotWorklistLimit = false;
DEBUG(AssertSorted(*Cache));
while (!Worklist.empty()) {
@@ -1266,7 +1186,7 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// specific block queries) but we can't do the fastpath "return all
// results from the set". Clear out the indicator for this.
CacheInfo->Pair = BBSkipFirstBlockPair();
- return true;
+ return false;
}
// Skip the first block if we have it.
@@ -1278,18 +1198,12 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// Get the dependency info for Pointer in BB. If we have cached
// information, we will use it, otherwise we compute it.
DEBUG(AssertSorted(*Cache, NumSortedEntries));
- MemDepResult Dep = GetNonLocalInfoForBlock(QueryInst,
- Loc, isLoad, BB, Cache,
- NumSortedEntries);
+ MemDepResult Dep = GetNonLocalInfoForBlock(QueryInst, Loc, isLoad, BB,
+ Cache, NumSortedEntries);
// If we got a Def or Clobber, add this to the list of results.
if (!Dep.isNonLocal()) {
- if (!DT) {
- Result.push_back(NonLocalDepResult(BB,
- MemDepResult::getUnknown(),
- Pointer.getAddr()));
- continue;
- } else if (DT->isReachableFromEntry(BB)) {
+ if (DT.isReachableFromEntry(BB)) {
Result.push_back(NonLocalDepResult(BB, Dep, Pointer.getAddr()));
continue;
}
@@ -1302,11 +1216,11 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// the same Pointer.
if (!Pointer.NeedsPHITranslationFromBlock(BB)) {
SkipFirstBlock = false;
- SmallVector<BasicBlock*, 16> NewBlocks;
+ SmallVector<BasicBlock *, 16> NewBlocks;
for (BasicBlock *Pred : PredCache.get(BB)) {
// Verify that we haven't looked at this block yet.
- std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
- InsertRes = Visited.insert(std::make_pair(Pred, Pointer.getAddr()));
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> InsertRes =
+ Visited.insert(std::make_pair(Pred, Pointer.getAddr()));
if (InsertRes.second) {
// First time we've looked at *PI.
NewBlocks.push_back(Pred);
@@ -1324,6 +1238,15 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
goto PredTranslationFailure;
}
}
+ if (NewBlocks.size() > WorklistEntries) {
+ // Make sure to clean up the Visited map before continuing on to
+ // PredTranslationFailure.
+ for (unsigned i = 0; i < NewBlocks.size(); i++)
+ Visited.erase(NewBlocks[i]);
+ GotWorklistLimit = true;
+ goto PredTranslationFailure;
+ }
+ WorklistEntries -= NewBlocks.size();
Worklist.append(NewBlocks.begin(), NewBlocks.end());
continue;
}
@@ -1351,7 +1274,7 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// Get the PHI translated pointer in this predecessor. This can fail if
// not translatable, in which case the getAddr() returns null.
PHITransAddr &PredPointer = PredList.back().second;
- PredPointer.PHITranslateValue(BB, Pred, DT, /*MustDominate=*/false);
+ PredPointer.PHITranslateValue(BB, Pred, &DT, /*MustDominate=*/false);
Value *PredPtrVal = PredPointer.getAddr();
// Check to see if we have already visited this pred block with another
@@ -1359,8 +1282,8 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// with PHI translation when a critical edge exists and the PHI node in
// the successor translates to a pointer value different than the
// pointer the block was first analyzed with.
- std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
- InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal));
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> InsertRes =
+ Visited.insert(std::make_pair(Pred, PredPtrVal));
if (!InsertRes.second) {
// We found the pred; take it off the list of preds to visit.
@@ -1411,10 +1334,9 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// result conflicted with the Visited list; we have to conservatively
// assume it is unknown, but this also does not block PRE of the load.
if (!CanTranslate ||
- getNonLocalPointerDepFromBB(QueryInst, PredPointer,
- Loc.getWithNewPtr(PredPtrVal),
- isLoad, Pred,
- Result, Visited)) {
+ !getNonLocalPointerDepFromBB(QueryInst, PredPointer,
+ Loc.getWithNewPtr(PredPtrVal), isLoad,
+ Pred, Result, Visited)) {
// Add the entry to the Result list.
NonLocalDepResult Entry(Pred, MemDepResult::getUnknown(), PredPtrVal);
Result.push_back(Entry);
@@ -1467,35 +1389,38 @@ bool MemoryDependenceAnalysis::getNonLocalPointerDepFromBB(
// incoming value. Since we can't phi translate to one of the predecessors,
// we have to bail out.
if (SkipFirstBlock)
- return true;
+ return false;
- for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
- assert(I != Cache->rend() && "Didn't find current block??");
- if (I->getBB() != BB)
+ bool foundBlock = false;
+ for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
+ if (I.getBB() != BB)
continue;
- assert((I->getResult().isNonLocal() || !DT->isReachableFromEntry(BB)) &&
+ assert((GotWorklistLimit || I.getResult().isNonLocal() ||
+ !DT.isReachableFromEntry(BB)) &&
"Should only be here with transparent block");
- I->setResult(MemDepResult::getUnknown());
- Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(),
- Pointer.getAddr()));
+ foundBlock = true;
+ I.setResult(MemDepResult::getUnknown());
+ Result.push_back(
+ NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
break;
}
+ (void)foundBlock; (void)GotWorklistLimit;
+ assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
}
// Okay, we're done now. If we added new values to the cache, re-sort it.
SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
DEBUG(AssertSorted(*Cache));
- return false;
+ return true;
}
-/// RemoveCachedNonLocalPointerDependencies - If P exists in
-/// CachedNonLocalPointerInfo, remove it.
-void MemoryDependenceAnalysis::
-RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
- CachedNonLocalPointerInfo::iterator It =
- NonLocalPointerDeps.find(P);
- if (It == NonLocalPointerDeps.end()) return;
+/// If P exists in CachedNonLocalPointerInfo, remove it.
+void MemoryDependenceResults::RemoveCachedNonLocalPointerDependencies(
+ ValueIsLoadPair P) {
+ CachedNonLocalPointerInfo::iterator It = NonLocalPointerDeps.find(P);
+ if (It == NonLocalPointerDeps.end())
+ return;
// Remove all of the entries in the BB->val map. This involves removing
// instructions from the reverse map.
@@ -1503,7 +1428,8 @@ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
for (unsigned i = 0, e = PInfo.size(); i != e; ++i) {
Instruction *Target = PInfo[i].getResult().getInst();
- if (!Target) continue; // Ignore non-local dep results.
+ if (!Target)
+ continue; // Ignore non-local dep results.
assert(Target->getParent() == PInfo[i].getBB());
// Eliminating the dirty entry from 'Cache', so update the reverse info.
@@ -1514,41 +1440,28 @@ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
NonLocalPointerDeps.erase(It);
}
-
-/// invalidateCachedPointerInfo - This method is used to invalidate cached
-/// information about the specified pointer, because it may be too
-/// conservative in memdep. This is an optional call that can be used when
-/// the client detects an equivalence between the pointer and some other
-/// value and replaces the other value with ptr. This can make Ptr available
-/// in more places that cached info does not necessarily keep.
-void MemoryDependenceAnalysis::invalidateCachedPointerInfo(Value *Ptr) {
+void MemoryDependenceResults::invalidateCachedPointerInfo(Value *Ptr) {
// If Ptr isn't really a pointer, just ignore it.
- if (!Ptr->getType()->isPointerTy()) return;
+ if (!Ptr->getType()->isPointerTy())
+ return;
// Flush store info for the pointer.
RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, false));
// Flush load info for the pointer.
RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, true));
}
-/// invalidateCachedPredecessors - Clear the PredIteratorCache info.
-/// This needs to be done when the CFG changes, e.g., due to splitting
-/// critical edges.
-void MemoryDependenceAnalysis::invalidateCachedPredecessors() {
+void MemoryDependenceResults::invalidateCachedPredecessors() {
PredCache.clear();
}
-/// removeInstruction - Remove an instruction from the dependence analysis,
-/// updating the dependence of instructions that previously depended on it.
-/// This method attempts to keep the cache coherent using the reverse map.
-void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
+void MemoryDependenceResults::removeInstruction(Instruction *RemInst) {
// Walk through the Non-local dependencies, removing this one as the value
// for any cached queries.
NonLocalDepMapType::iterator NLDI = NonLocalDeps.find(RemInst);
if (NLDI != NonLocalDeps.end()) {
NonLocalDepInfo &BlockMap = NLDI->second.first;
- for (NonLocalDepInfo::iterator DI = BlockMap.begin(), DE = BlockMap.end();
- DI != DE; ++DI)
- if (Instruction *Inst = DI->getResult().getInst())
+ for (auto &Entry : BlockMap)
+ if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalDeps, Inst, RemInst);
NonLocalDeps.erase(NLDI);
}
@@ -1578,7 +1491,7 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
// Loop over all of the things that depend on the instruction we're removing.
//
- SmallVector<std::pair<Instruction*, Instruction*>, 8> ReverseDepsToAdd;
+ SmallVector<std::pair<Instruction *, Instruction *>, 8> ReverseDepsToAdd;
// If we find RemInst as a clobber or Def in any of the maps for other values,
// we need to replace its entry with a dirty version of the instruction after
@@ -1603,10 +1516,11 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
LocalDeps[InstDependingOnRemInst] = NewDirtyVal;
// Make sure to remember that new things depend on NewDepInst.
- assert(NewDirtyVal.getInst() && "There is no way something else can have "
+ assert(NewDirtyVal.getInst() &&
+ "There is no way something else can have "
"a local dep on this if it is a terminator!");
- ReverseDepsToAdd.push_back(std::make_pair(NewDirtyVal.getInst(),
- InstDependingOnRemInst));
+ ReverseDepsToAdd.push_back(
+ std::make_pair(NewDirtyVal.getInst(), InstDependingOnRemInst));
}
ReverseLocalDeps.erase(ReverseDepIt);
@@ -1614,8 +1528,8 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
// Add new reverse deps after scanning the set, to avoid invalidating the
// 'ReverseDeps' reference.
while (!ReverseDepsToAdd.empty()) {
- ReverseLocalDeps[ReverseDepsToAdd.back().first]
- .insert(ReverseDepsToAdd.back().second);
+ ReverseLocalDeps[ReverseDepsToAdd.back().first].insert(
+ ReverseDepsToAdd.back().second);
ReverseDepsToAdd.pop_back();
}
}
@@ -1629,12 +1543,12 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
// The information is now dirty!
INLD.second = true;
- for (NonLocalDepInfo::iterator DI = INLD.first.begin(),
- DE = INLD.first.end(); DI != DE; ++DI) {
- if (DI->getResult().getInst() != RemInst) continue;
+ for (auto &Entry : INLD.first) {
+ if (Entry.getResult().getInst() != RemInst)
+ continue;
// Convert to a dirty entry for the subsequent instruction.
- DI->setResult(NewDirtyVal);
+ Entry.setResult(NewDirtyVal);
if (Instruction *NextI = NewDirtyVal.getInst())
ReverseDepsToAdd.push_back(std::make_pair(NextI, I));
@@ -1645,8 +1559,8 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
// Add new reverse deps after scanning the set, to avoid invalidating 'Set'
while (!ReverseDepsToAdd.empty()) {
- ReverseNonLocalDeps[ReverseDepsToAdd.back().first]
- .insert(ReverseDepsToAdd.back().second);
+ ReverseNonLocalDeps[ReverseDepsToAdd.back().first].insert(
+ ReverseDepsToAdd.back().second);
ReverseDepsToAdd.pop_back();
}
}
@@ -1654,9 +1568,10 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
// If the instruction is in ReverseNonLocalPtrDeps then it appears as a
// value in the NonLocalPointerDeps info.
ReverseNonLocalPtrDepTy::iterator ReversePtrDepIt =
- ReverseNonLocalPtrDeps.find(RemInst);
+ ReverseNonLocalPtrDeps.find(RemInst);
if (ReversePtrDepIt != ReverseNonLocalPtrDeps.end()) {
- SmallVector<std::pair<Instruction*, ValueIsLoadPair>,8> ReversePtrDepsToAdd;
+ SmallVector<std::pair<Instruction *, ValueIsLoadPair>, 8>
+ ReversePtrDepsToAdd;
for (ValueIsLoadPair P : ReversePtrDepIt->second) {
assert(P.getPointer() != RemInst &&
@@ -1668,12 +1583,12 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
NonLocalPointerDeps[P].Pair = BBSkipFirstBlockPair();
// Update any entries for RemInst to use the instruction after it.
- for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
- DI != DE; ++DI) {
- if (DI->getResult().getInst() != RemInst) continue;
+ for (auto &Entry : NLPDI) {
+ if (Entry.getResult().getInst() != RemInst)
+ continue;
// Convert to a dirty entry for the subsequent instruction.
- DI->setResult(NewDirtyVal);
+ Entry.setResult(NewDirtyVal);
if (Instruction *NewDirtyInst = NewDirtyVal.getInst())
ReversePtrDepsToAdd.push_back(std::make_pair(NewDirtyInst, P));
@@ -1687,70 +1602,107 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
ReverseNonLocalPtrDeps.erase(ReversePtrDepIt);
while (!ReversePtrDepsToAdd.empty()) {
- ReverseNonLocalPtrDeps[ReversePtrDepsToAdd.back().first]
- .insert(ReversePtrDepsToAdd.back().second);
+ ReverseNonLocalPtrDeps[ReversePtrDepsToAdd.back().first].insert(
+ ReversePtrDepsToAdd.back().second);
ReversePtrDepsToAdd.pop_back();
}
}
-
assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?");
DEBUG(verifyRemoved(RemInst));
}
-/// verifyRemoved - Verify that the specified instruction does not occur
-/// in our internal data structures. This function verifies by asserting in
-/// debug builds.
-void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
+
+/// Verify that the specified instruction does not occur in our internal data
+/// structures.
+///
+/// This function verifies by asserting in debug builds.
+void MemoryDependenceResults::verifyRemoved(Instruction *D) const {
#ifndef NDEBUG
- for (LocalDepMapType::const_iterator I = LocalDeps.begin(),
- E = LocalDeps.end(); I != E; ++I) {
- assert(I->first != D && "Inst occurs in data structures");
- assert(I->second.getInst() != D &&
- "Inst occurs in data structures");
+ for (const auto &DepKV : LocalDeps) {
+ assert(DepKV.first != D && "Inst occurs in data structures");
+ assert(DepKV.second.getInst() != D && "Inst occurs in data structures");
}
- for (CachedNonLocalPointerInfo::const_iterator I =NonLocalPointerDeps.begin(),
- E = NonLocalPointerDeps.end(); I != E; ++I) {
- assert(I->first.getPointer() != D && "Inst occurs in NLPD map key");
- const NonLocalDepInfo &Val = I->second.NonLocalDeps;
- for (NonLocalDepInfo::const_iterator II = Val.begin(), E = Val.end();
- II != E; ++II)
- assert(II->getResult().getInst() != D && "Inst occurs as NLPD value");
+ for (const auto &DepKV : NonLocalPointerDeps) {
+ assert(DepKV.first.getPointer() != D && "Inst occurs in NLPD map key");
+ for (const auto &Entry : DepKV.second.NonLocalDeps)
+ assert(Entry.getResult().getInst() != D && "Inst occurs as NLPD value");
}
- for (NonLocalDepMapType::const_iterator I = NonLocalDeps.begin(),
- E = NonLocalDeps.end(); I != E; ++I) {
- assert(I->first != D && "Inst occurs in data structures");
- const PerInstNLInfo &INLD = I->second;
- for (NonLocalDepInfo::const_iterator II = INLD.first.begin(),
- EE = INLD.first.end(); II != EE; ++II)
- assert(II->getResult().getInst() != D && "Inst occurs in data structures");
+ for (const auto &DepKV : NonLocalDeps) {
+ assert(DepKV.first != D && "Inst occurs in data structures");
+ const PerInstNLInfo &INLD = DepKV.second;
+ for (const auto &Entry : INLD.first)
+ assert(Entry.getResult().getInst() != D &&
+ "Inst occurs in data structures");
}
- for (ReverseDepMapType::const_iterator I = ReverseLocalDeps.begin(),
- E = ReverseLocalDeps.end(); I != E; ++I) {
- assert(I->first != D && "Inst occurs in data structures");
- for (Instruction *Inst : I->second)
+ for (const auto &DepKV : ReverseLocalDeps) {
+ assert(DepKV.first != D && "Inst occurs in data structures");
+ for (Instruction *Inst : DepKV.second)
assert(Inst != D && "Inst occurs in data structures");
}
- for (ReverseDepMapType::const_iterator I = ReverseNonLocalDeps.begin(),
- E = ReverseNonLocalDeps.end();
- I != E; ++I) {
- assert(I->first != D && "Inst occurs in data structures");
- for (Instruction *Inst : I->second)
+ for (const auto &DepKV : ReverseNonLocalDeps) {
+ assert(DepKV.first != D && "Inst occurs in data structures");
+ for (Instruction *Inst : DepKV.second)
assert(Inst != D && "Inst occurs in data structures");
}
- for (ReverseNonLocalPtrDepTy::const_iterator
- I = ReverseNonLocalPtrDeps.begin(),
- E = ReverseNonLocalPtrDeps.end(); I != E; ++I) {
- assert(I->first != D && "Inst occurs in rev NLPD map");
+ for (const auto &DepKV : ReverseNonLocalPtrDeps) {
+ assert(DepKV.first != D && "Inst occurs in rev NLPD map");
- for (ValueIsLoadPair P : I->second)
- assert(P != ValueIsLoadPair(D, false) &&
- P != ValueIsLoadPair(D, true) &&
+ for (ValueIsLoadPair P : DepKV.second)
+ assert(P != ValueIsLoadPair(D, false) && P != ValueIsLoadPair(D, true) &&
"Inst occurs in ReverseNonLocalPtrDeps map");
}
#endif
}
+
+char MemoryDependenceAnalysis::PassID;
+
+MemoryDependenceResults
+MemoryDependenceAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ return MemoryDependenceResults(AA, AC, TLI, DT);
+}
+
+char MemoryDependenceWrapperPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MemoryDependenceWrapperPass, "memdep",
+ "Memory Dependence Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MemoryDependenceWrapperPass, "memdep",
+ "Memory Dependence Analysis", false, true)
+
+MemoryDependenceWrapperPass::MemoryDependenceWrapperPass() : FunctionPass(ID) {
+ initializeMemoryDependenceWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+MemoryDependenceWrapperPass::~MemoryDependenceWrapperPass() {}
+
+void MemoryDependenceWrapperPass::releaseMemory() {
+ MemDep.reset();
+}
+
+void MemoryDependenceWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredTransitive<AAResultsWrapperPass>();
+ AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>();
+}
+
+bool MemoryDependenceWrapperPass::runOnFunction(Function &F) {
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ MemDep.emplace(AA, AC, TLI, DT);
+ return false;
+}
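
Reviewer aside (not part of the patch): the hunks above rename MemoryDependenceAnalysis to MemoryDependenceResults and move ownership of it into MemoryDependenceWrapperPass. The sketch below shows how a legacy-PM client might now obtain and query the results; the getMemDep() accessor and the ExampleMemDepClient name are assumptions for illustration and are not shown in this diff.

// Minimal sketch, assuming MemoryDependenceWrapperPass exposes a getMemDep()
// accessor returning the MemoryDependenceResults it owns (declared in the
// header, which this diff does not include). Pass registration omitted.
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

namespace {
struct ExampleMemDepClient : public FunctionPass {
  static char ID;
  ExampleMemDepClient() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemoryDependenceWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    // Assumed accessor; the wrapper pass owns the MemoryDependenceResults.
    MemoryDependenceResults &MD =
        getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *LI = dyn_cast<LoadInst>(&I)) {
          // Query the local dependency of each simple load, as in the
          // getDependency() path shown above.
          MemDepResult Dep = MD.getDependency(LI);
          if (Dep.isDef())
            errs() << "load depends on: " << *Dep.getInst() << "\n";
        }
    return false;
  }
};
} // end anonymous namespace

char ExampleMemDepClient::ID = 0;
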
diff --git a/lib/Analysis/MemoryLocation.cpp b/lib/Analysis/MemoryLocation.cpp
index e4491261e055..a0ae72f1415f 100644
--- a/lib/Analysis/MemoryLocation.cpp
+++ b/lib/Analysis/MemoryLocation.cpp
@@ -90,23 +90,6 @@ MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MTI) {
return MemoryLocation(MTI->getRawDest(), Size, AATags);
}
-// FIXME: This code is duplicated with BasicAliasAnalysis and should be hoisted
-// to some common utility location.
-static bool isMemsetPattern16(const Function *MS,
- const TargetLibraryInfo &TLI) {
- if (TLI.has(LibFunc::memset_pattern16) &&
- MS->getName() == "memset_pattern16") {
- FunctionType *MemsetType = MS->getFunctionType();
- if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
- isa<PointerType>(MemsetType->getParamType(0)) &&
- isa<PointerType>(MemsetType->getParamType(1)) &&
- isa<IntegerType>(MemsetType->getParamType(2)))
- return true;
- }
-
- return false;
-}
-
MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
@@ -159,8 +142,9 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
- if (CS.getCalledFunction() &&
- isMemsetPattern16(CS.getCalledFunction(), TLI)) {
+ LibFunc::Func F;
+ if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
+ F == LibFunc::memset_pattern16 && TLI.has(F)) {
assert((ArgIdx == 0 || ArgIdx == 1) &&
"Invalid argument index for memset_pattern16");
if (ArgIdx == 1)
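
Editorial aside (not part of the patch): the hunk above drops the hand-rolled memset_pattern16 prototype check in favor of a TargetLibraryInfo query. A hypothetical helper isolating the same idiom; callsMemsetPattern16 is an illustrative name, not an LLVM API.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Sketch only: true if CS is a direct call to memset_pattern16 as recognized
// by TLI, mirroring the condition now used in MemoryLocation::getForArgument.
static bool callsMemsetPattern16(ImmutableCallSite CS,
                                 const TargetLibraryInfo &TLI) {
  const Function *Callee = CS.getCalledFunction();
  if (!Callee)
    return false;
  LibFunc::Func F;
  // getLibFunc maps the callee to its LibFunc enum value; TLI.has confirms
  // the routine is actually available on the target.
  return TLI.getLibFunc(*Callee, F) && F == LibFunc::memset_pattern16 &&
         TLI.has(F);
}
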
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
new file mode 100644
index 000000000000..c9ac2bdb7942
--- /dev/null
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -0,0 +1,249 @@
+//===- ModuleSummaryAnalysis.cpp - Module summary index builder -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass builds a ModuleSummaryIndex object for the module, to be written
+// to bitcode or LLVM assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "module-summary-analysis"
+
+// Walk through the operands of a given User via worklist iteration and populate
+// the set of GlobalValue references encountered. Invoked either on an
+// Instruction or a GlobalVariable (which walks its initializer).
+static void findRefEdges(const User *CurUser, DenseSet<const Value *> &RefEdges,
+ SmallPtrSet<const User *, 8> &Visited) {
+ SmallVector<const User *, 32> Worklist;
+ Worklist.push_back(CurUser);
+
+ while (!Worklist.empty()) {
+ const User *U = Worklist.pop_back_val();
+
+ if (!Visited.insert(U).second)
+ continue;
+
+ ImmutableCallSite CS(U);
+
+ for (const auto &OI : U->operands()) {
+ const User *Operand = dyn_cast<User>(OI);
+ if (!Operand)
+ continue;
+ if (isa<BlockAddress>(Operand))
+ continue;
+ if (isa<GlobalValue>(Operand)) {
+ // We have a reference to a global value. This should be added to
+ // the reference set unless it is a callee. Callees are handled
+ // specially by WriteFunction and are added to a separate list.
+ if (!(CS && CS.isCallee(&OI)))
+ RefEdges.insert(Operand);
+ continue;
+ }
+ Worklist.push_back(Operand);
+ }
+ }
+}
+
+void ModuleSummaryIndexBuilder::computeFunctionSummary(
+ const Function &F, BlockFrequencyInfo *BFI) {
+ // Summaries are not currently supported for anonymous functions; they
+ // must be renamed first.
+ if (!F.hasName())
+ return;
+
+ unsigned NumInsts = 0;
+ // Map from callee ValueId to profile count. Used to accumulate profile
+ // counts for all static calls to a given callee.
+ DenseMap<const Value *, CalleeInfo> CallGraphEdges;
+ DenseMap<GlobalValue::GUID, CalleeInfo> IndirectCallEdges;
+ DenseSet<const Value *> RefEdges;
+ ICallPromotionAnalysis ICallAnalysis;
+
+ SmallPtrSet<const User *, 8> Visited;
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB) {
+ if (!isa<DbgInfoIntrinsic>(I))
+ ++NumInsts;
+
+ if (auto CS = ImmutableCallSite(&I)) {
+ auto *CalledFunction = CS.getCalledFunction();
+ // Check if this is a direct call to a known function.
+ if (CalledFunction) {
+ if (CalledFunction->hasName() && !CalledFunction->isIntrinsic()) {
+ auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
+ auto *CalleeId =
+ M->getValueSymbolTable().lookup(CalledFunction->getName());
+ CallGraphEdges[CalleeId] +=
+ (ScaledCount ? ScaledCount.getValue() : 0);
+ }
+ } else {
+ // Otherwise, check for an indirect call (call to a non-const value
+ // that isn't an inline assembly call).
+ const CallInst *CI = dyn_cast<CallInst>(&I);
+ if (CS.getCalledValue() && !isa<Constant>(CS.getCalledValue()) &&
+ !(CI && CI->isInlineAsm())) {
+ uint32_t NumVals, NumCandidates;
+ uint64_t TotalCount;
+ auto CandidateProfileData =
+ ICallAnalysis.getPromotionCandidatesForInstruction(
+ &I, NumVals, TotalCount, NumCandidates);
+ for (auto &Candidate : CandidateProfileData)
+ IndirectCallEdges[Candidate.Value] += Candidate.Count;
+ }
+ }
+ }
+ findRefEdges(&I, RefEdges, Visited);
+ }
+
+ GlobalValueSummary::GVFlags Flags(F);
+ std::unique_ptr<FunctionSummary> FuncSummary =
+ llvm::make_unique<FunctionSummary>(Flags, NumInsts);
+ FuncSummary->addCallGraphEdges(CallGraphEdges);
+ FuncSummary->addCallGraphEdges(IndirectCallEdges);
+ FuncSummary->addRefEdges(RefEdges);
+ Index->addGlobalValueSummary(F.getName(), std::move(FuncSummary));
+}
+
+void ModuleSummaryIndexBuilder::computeVariableSummary(
+ const GlobalVariable &V) {
+ DenseSet<const Value *> RefEdges;
+ SmallPtrSet<const User *, 8> Visited;
+ findRefEdges(&V, RefEdges, Visited);
+ GlobalValueSummary::GVFlags Flags(V);
+ std::unique_ptr<GlobalVarSummary> GVarSummary =
+ llvm::make_unique<GlobalVarSummary>(Flags);
+ GVarSummary->addRefEdges(RefEdges);
+ Index->addGlobalValueSummary(V.getName(), std::move(GVarSummary));
+}
+
+ModuleSummaryIndexBuilder::ModuleSummaryIndexBuilder(
+ const Module *M,
+ std::function<BlockFrequencyInfo *(const Function &F)> Ftor)
+ : Index(llvm::make_unique<ModuleSummaryIndex>()), M(M) {
+ // Check if the module can be promoted; otherwise just disable importing
+ // from it by not emitting any summary.
+ // FIXME: we could still import *into* it most of the time.
+ if (!moduleCanBeRenamedForThinLTO(*M))
+ return;
+
+ // Compute summaries for all functions defined in module, and save in the
+ // index.
+ for (auto &F : *M) {
+ if (F.isDeclaration())
+ continue;
+
+ BlockFrequencyInfo *BFI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> BFIPtr;
+ if (Ftor)
+ BFI = Ftor(F);
+ else if (F.getEntryCount().hasValue()) {
+ LoopInfo LI{DominatorTree(const_cast<Function &>(F))};
+ BranchProbabilityInfo BPI{F, LI};
+ BFIPtr = llvm::make_unique<BlockFrequencyInfo>(F, BPI, LI);
+ BFI = BFIPtr.get();
+ }
+
+ computeFunctionSummary(F, BFI);
+ }
+
+ // Compute summaries for all variables defined in module, and save in the
+ // index.
+ for (const GlobalVariable &G : M->globals()) {
+ if (G.isDeclaration())
+ continue;
+ computeVariableSummary(G);
+ }
+}
+
+char ModuleSummaryIndexWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
+ "Module Summary Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
+ "Module Summary Analysis", false, true)
+
+ModulePass *llvm::createModuleSummaryIndexWrapperPass() {
+ return new ModuleSummaryIndexWrapperPass();
+}
+
+ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass()
+ : ModulePass(ID) {
+ initializeModuleSummaryIndexWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
+ IndexBuilder = llvm::make_unique<ModuleSummaryIndexBuilder>(
+ &M, [this](const Function &F) {
+ return &(this->getAnalysis<BlockFrequencyInfoWrapperPass>(
+ *const_cast<Function *>(&F))
+ .getBFI());
+ });
+ return false;
+}
+
+bool ModuleSummaryIndexWrapperPass::doFinalization(Module &M) {
+ IndexBuilder.reset();
+ return false;
+}
+
+void ModuleSummaryIndexWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+}
+
+bool llvm::moduleCanBeRenamedForThinLTO(const Module &M) {
+ // We cannot currently promote or rename anything used in inline assembly,
+ // since such uses are not visible to the compiler. Detect a possible case
+ // by looking for an llvm.used local value in conjunction with an inline
+ // assembly call in the module. Prevent importing of any module containing
+ // these uses by suppressing generation of the index. This also prevents
+ // importing into this module, which is necessary to avoid having to rename
+ // in case of a name clash between a local in this module and an imported
+ // global.
+ // FIXME: If we find we need a finer-grained approach of preventing promotion
+ // and renaming of just the functions using inline assembly we will need to:
+ // - Add flag in the function summaries to identify those with inline asm.
+ // - Prevent importing of any functions with flag set.
+ // - Prevent importing of any global function with the same name as a
+ // function in current module that has the flag set.
+ // - For any llvm.used value that is exported and promoted, add a private
+ // alias to the original name in the current module (even if we don't
+ // export the function using those values in inline asm, another function
+ // with a reference could be exported).
+ SmallPtrSet<GlobalValue *, 8> Used;
+ collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
+ bool LocalIsUsed =
+ llvm::any_of(Used, [](GlobalValue *V) { return V->hasLocalLinkage(); });
+ if (!LocalIsUsed)
+ return true;
+
+ // Walk all the instructions in the module and check whether any call is inline asm.
+ auto HasInlineAsm = llvm::any_of(M, [](const Function &F) {
+ return llvm::any_of(instructions(F), [](const Instruction &I) {
+ const CallInst *CallI = dyn_cast<CallInst>(&I);
+ if (!CallI)
+ return false;
+ return CallI->isInlineAsm();
+ });
+ });
+ return !HasInlineAsm;
+}
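
Editorial aside (not part of the patch): a minimal sketch of driving the new builder directly, without the wrapper pass. The getIndex() accessor is assumed to exist on ModuleSummaryIndexBuilder (it would live in the header, which this diff does not show), so treat it as illustrative only.

#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Build a summary index for M without providing a BlockFrequencyInfo functor;
// per the constructor above, BFI is then computed only for functions that
// carry an entry count.
static void buildSummary(const Module &M) {
  ModuleSummaryIndexBuilder Builder(&M, nullptr);
  // Assumed accessor: the bitcode/assembly writers would consume the
  // resulting ModuleSummaryIndex from something like Builder.getIndex().
  // const ModuleSummaryIndex &Index = Builder.getIndex();
  (void)Builder;
}
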
diff --git a/lib/Analysis/ObjCARCAliasAnalysis.cpp b/lib/Analysis/ObjCARCAliasAnalysis.cpp
index 25f660ffe221..9bb1048ea8ba 100644
--- a/lib/Analysis/ObjCARCAliasAnalysis.cpp
+++ b/lib/Analysis/ObjCARCAliasAnalysis.cpp
@@ -131,19 +131,13 @@ ModRefInfo ObjCARCAAResult::getModRefInfo(ImmutableCallSite CS,
return AAResultBase::getModRefInfo(CS, Loc);
}
-ObjCARCAAResult ObjCARCAA::run(Function &F, AnalysisManager<Function> *AM) {
- return ObjCARCAAResult(F.getParent()->getDataLayout(),
- AM->getResult<TargetLibraryAnalysis>(F));
+ObjCARCAAResult ObjCARCAA::run(Function &F, AnalysisManager<Function> &AM) {
+ return ObjCARCAAResult(F.getParent()->getDataLayout());
}
-char ObjCARCAA::PassID;
-
char ObjCARCAAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ObjCARCAAWrapperPass, "objc-arc-aa",
- "ObjC-ARC-Based Alias Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ObjCARCAAWrapperPass, "objc-arc-aa",
- "ObjC-ARC-Based Alias Analysis", false, true)
+INITIALIZE_PASS(ObjCARCAAWrapperPass, "objc-arc-aa",
+ "ObjC-ARC-Based Alias Analysis", false, true)
ImmutablePass *llvm::createObjCARCAAWrapperPass() {
return new ObjCARCAAWrapperPass();
@@ -154,8 +148,7 @@ ObjCARCAAWrapperPass::ObjCARCAAWrapperPass() : ImmutablePass(ID) {
}
bool ObjCARCAAWrapperPass::doInitialization(Module &M) {
- Result.reset(new ObjCARCAAResult(
- M.getDataLayout(), getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()));
+ Result.reset(new ObjCARCAAResult(M.getDataLayout()));
return false;
}
@@ -166,5 +159,4 @@ bool ObjCARCAAWrapperPass::doFinalization(Module &M) {
void ObjCARCAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
}
diff --git a/lib/Analysis/ObjCARCInstKind.cpp b/lib/Analysis/ObjCARCInstKind.cpp
index 133b63513c87..3dc1463b8d8b 100644
--- a/lib/Analysis/ObjCARCInstKind.cpp
+++ b/lib/Analysis/ObjCARCInstKind.cpp
@@ -34,6 +34,8 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
return OS << "ARCInstKind::Retain";
case ARCInstKind::RetainRV:
return OS << "ARCInstKind::RetainRV";
+ case ARCInstKind::ClaimRV:
+ return OS << "ARCInstKind::ClaimRV";
case ARCInstKind::RetainBlock:
return OS << "ARCInstKind::RetainBlock";
case ARCInstKind::Release:
@@ -103,6 +105,8 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
return StringSwitch<ARCInstKind>(F->getName())
.Case("objc_retain", ARCInstKind::Retain)
.Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV)
+ .Case("objc_unsafeClaimAutoreleasedReturnValue",
+ ARCInstKind::ClaimRV)
.Case("objc_retainBlock", ARCInstKind::RetainBlock)
.Case("objc_release", ARCInstKind::Release)
.Case("objc_autorelease", ARCInstKind::Autorelease)
@@ -350,6 +354,7 @@ bool llvm::objcarc::IsUser(ARCInstKind Class) {
case ARCInstKind::StoreStrong:
case ARCInstKind::Call:
case ARCInstKind::None:
+ case ARCInstKind::ClaimRV:
return false;
}
llvm_unreachable("covered switch isn't covered?");
@@ -385,6 +390,7 @@ bool llvm::objcarc::IsRetain(ARCInstKind Class) {
case ARCInstKind::Call:
case ARCInstKind::User:
case ARCInstKind::None:
+ case ARCInstKind::ClaimRV:
return false;
}
llvm_unreachable("covered switch isn't covered?");
@@ -398,6 +404,7 @@ bool llvm::objcarc::IsAutorelease(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::RetainBlock:
case ARCInstKind::Release:
case ARCInstKind::AutoreleasepoolPush:
@@ -429,6 +436,7 @@ bool llvm::objcarc::IsForwarding(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
case ARCInstKind::NoopCast:
@@ -463,6 +471,7 @@ bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::Release:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
@@ -498,6 +507,7 @@ bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::AutoreleaseRV:
return true;
case ARCInstKind::Release:
@@ -538,6 +548,7 @@ bool llvm::objcarc::IsNeverTail(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::AutoreleaseRV:
case ARCInstKind::Release:
case ARCInstKind::RetainBlock:
@@ -572,6 +583,7 @@ bool llvm::objcarc::IsNoThrow(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::Release:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
@@ -616,6 +628,7 @@ bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
case ARCInstKind::Release:
case ARCInstKind::AutoreleasepoolPush:
case ARCInstKind::RetainBlock:
@@ -668,6 +681,7 @@ bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) {
case ARCInstKind::StoreStrong:
case ARCInstKind::CallOrUser:
case ARCInstKind::Call:
+ case ARCInstKind::ClaimRV:
return true;
}
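
The change above threads the new ClaimRV kind (objc_unsafeClaimAutoreleasedReturnValue) through every classification switch. The pattern relied on is a fully covered switch over the enum, so a new enumerator forces each predicate to take a position on it and -Wswitch flags any switch that was missed. A toy sketch of that pattern, using an invented Kind enum rather than ARCInstKind:

    #include <cstdlib>

    enum class Kind { Retain, RetainRV, ClaimRV, Release, None };

    static bool isRetain(Kind K) {
      switch (K) {
      case Kind::Retain:
      case Kind::RetainRV:
        return true;
      case Kind::ClaimRV: // a newly added enumerator must be classified explicitly
      case Kind::Release:
      case Kind::None:
        return false;
      }
      std::abort(); // covered switch isn't covered?
    }

    int main() { return isRetain(Kind::ClaimRV) ? 1 : 0; }
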
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
new file mode 100644
index 000000000000..e979ba2531e4
--- /dev/null
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -0,0 +1,88 @@
+//===- OptimizationDiagnosticInfo.cpp - Optimization Diagnostic -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Optimization diagnostic interfaces. It's packaged as an analysis pass so
+// that passes using this service also become dependent on BFI, which is used
+// to compute the "hotness" of the diagnostic message.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+Optional<uint64_t> OptimizationRemarkEmitter::computeHotness(Value *V) {
+ if (!BFI)
+ return None;
+
+ return BFI->getBlockProfileCount(cast<BasicBlock>(V));
+}
+
+void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
+ const char *PassName, const DebugLoc &DLoc, Value *V, const Twine &Msg) {
+ LLVMContext &Ctx = F->getContext();
+ Ctx.diagnose(DiagnosticInfoOptimizationRemarkMissed(PassName, *F, DLoc, Msg,
+ computeHotness(V)));
+}
+
+void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
+ const char *PassName, Loop *L, const Twine &Msg) {
+ emitOptimizationRemarkMissed(PassName, L->getStartLoc(), L->getHeader(), Msg);
+}
+
+OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass()
+ : FunctionPass(ID) {
+ initializeOptimizationRemarkEmitterWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+bool OptimizationRemarkEmitterWrapperPass::runOnFunction(Function &Fn) {
+ BlockFrequencyInfo *BFI;
+
+ if (Fn.getContext().getDiagnosticHotnessRequested())
+ BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
+ else
+ BFI = nullptr;
+
+ ORE = llvm::make_unique<OptimizationRemarkEmitter>(&Fn, BFI);
+ return false;
+}
+
+void OptimizationRemarkEmitterWrapperPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ AU.setPreservesAll();
+}
+
+char OptimizationRemarkEmitterAnalysis::PassID;
+
+OptimizationRemarkEmitter
+OptimizationRemarkEmitterAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+ BlockFrequencyInfo *BFI;
+
+ if (F.getContext().getDiagnosticHotnessRequested())
+ BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+ else
+ BFI = nullptr;
+
+ return OptimizationRemarkEmitter(&F, BFI);
+}
+
+char OptimizationRemarkEmitterWrapperPass::ID = 0;
+static const char ore_name[] = "Optimization Remark Emitter";
+#define ORE_NAME "opt-remark-emitter"
+
+INITIALIZE_PASS_BEGIN(OptimizationRemarkEmitterWrapperPass, ORE_NAME, ore_name,
+ false, true)
+INITIALIZE_PASS_DEPENDENCY(LazyBFIPass)
+INITIALIZE_PASS_END(OptimizationRemarkEmitterWrapperPass, ORE_NAME, ore_name,
+ false, true)
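
The emitter above only wires in BFI when hotness diagnostics were requested, and computeHotness reports the count as an optional value. A minimal standalone sketch of that shape, using std::optional and an invented FakeProfile type instead of the LLVM classes:

    #include <cassert>
    #include <cstdint>
    #include <optional>

    struct FakeProfile {
      std::uint64_t CountFor(int /*BlockId*/) const { return 42; }
    };

    class RemarkEmitter {
      const FakeProfile *Profile; // null when hotness was not requested
    public:
      explicit RemarkEmitter(const FakeProfile *P) : Profile(P) {}
      std::optional<std::uint64_t> computeHotness(int BlockId) const {
        if (!Profile)
          return std::nullopt; // no profile data: the remark carries no hotness
        return Profile->CountFor(BlockId);
      }
    };

    int main() {
      FakeProfile P;
      RemarkEmitter WithBFI(&P), WithoutBFI(nullptr);
      assert(WithBFI.computeHotness(0).has_value());
      assert(!WithoutBFI.computeHotness(0).has_value());
    }
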
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index f7545ea05a39..b4aad74d50dc 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -42,7 +42,7 @@ static bool CanPHITrans(Instruction *Inst) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void PHITransAddr::dump() const {
+LLVM_DUMP_METHOD void PHITransAddr::dump() const {
if (!Addr) {
dbgs() << "PHITransAddr: null\n";
return;
@@ -229,7 +229,8 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
return GEP;
// Simplify the GEP to handle 'gep x, 0' -> x etc.
- if (Value *V = SimplifyGEPInst(GEPOps, DL, TLI, DT, AC)) {
+ if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(),
+ GEPOps, DL, TLI, DT, AC)) {
for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
RemoveInstInputs(GEPOps[i], InstInputs);
diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp
index 6d929091e3d2..73550805d5ba 100644
--- a/lib/Analysis/PostDominators.cpp
+++ b/lib/Analysis/PostDominators.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GenericDomTreeConstruction.h"
using namespace llvm;
@@ -26,25 +27,39 @@ using namespace llvm;
// PostDominatorTree Implementation
//===----------------------------------------------------------------------===//
-char PostDominatorTree::ID = 0;
-INITIALIZE_PASS(PostDominatorTree, "postdomtree",
+char PostDominatorTreeWrapperPass::ID = 0;
+INITIALIZE_PASS(PostDominatorTreeWrapperPass, "postdomtree",
"Post-Dominator Tree Construction", true, true)
-bool PostDominatorTree::runOnFunction(Function &F) {
- DT->recalculate(F);
+bool PostDominatorTreeWrapperPass::runOnFunction(Function &F) {
+ DT.recalculate(F);
return false;
}
-PostDominatorTree::~PostDominatorTree() {
- delete DT;
+void PostDominatorTreeWrapperPass::print(raw_ostream &OS, const Module *) const {
+ DT.print(OS);
}
-void PostDominatorTree::print(raw_ostream &OS, const Module *) const {
- DT->print(OS);
+FunctionPass* llvm::createPostDomTree() {
+ return new PostDominatorTreeWrapperPass();
}
+char PostDominatorTreeAnalysis::PassID;
-FunctionPass* llvm::createPostDomTree() {
- return new PostDominatorTree();
+PostDominatorTree PostDominatorTreeAnalysis::run(Function &F,
+ FunctionAnalysisManager &) {
+ PostDominatorTree PDT;
+ PDT.recalculate(F);
+ return PDT;
}
+PostDominatorTreePrinterPass::PostDominatorTreePrinterPass(raw_ostream &OS)
+ : OS(OS) {}
+
+PreservedAnalyses
+PostDominatorTreePrinterPass::run(Function &F, FunctionAnalysisManager &AM) {
+ OS << "PostDominatorTree for function: " << F.getName() << "\n";
+ AM.getResult<PostDominatorTreeAnalysis>(F).print(OS);
+
+ return PreservedAnalyses::all();
+}
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
new file mode 100644
index 000000000000..9cf99af49581
--- /dev/null
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -0,0 +1,166 @@
+//===- ProfileSummaryInfo.cpp - Global profile summary information --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that provides access to the global profile summary
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ProfileSummary.h"
+using namespace llvm;
+
+// The following two parameters determine the threshold for a count to be
+// considered hot/cold. These two parameters are percentile values (multiplied
+// by 10000). If the counts are sorted in descending order, the minimum count to
+// reach ProfileSummaryCutoffHot gives the threshold to determine a hot count.
+// Similarly, the minimum count to reach ProfileSummaryCutoffCold gives the
+// threshold for determining cold count (everything <= this threshold is
+// considered cold).
+
+static cl::opt<int> ProfileSummaryCutoffHot(
+ "profile-summary-cutoff-hot", cl::Hidden, cl::init(999000), cl::ZeroOrMore,
+ cl::desc("A count is hot if it exceeds the minimum count to"
+ " reach this percentile of total counts."));
+
+static cl::opt<int> ProfileSummaryCutoffCold(
+ "profile-summary-cutoff-cold", cl::Hidden, cl::init(999999), cl::ZeroOrMore,
+ cl::desc("A count is cold if it is below the minimum count"
+ " to reach this percentile of total counts."));
+
+// Find the minimum count to reach a desired percentile of counts.
+static uint64_t getMinCountForPercentile(SummaryEntryVector &DS,
+ uint64_t Percentile) {
+ auto Compare = [](const ProfileSummaryEntry &Entry, uint64_t Percentile) {
+ return Entry.Cutoff < Percentile;
+ };
+ auto It = std::lower_bound(DS.begin(), DS.end(), Percentile, Compare);
+ // The required percentile has to be <= one of the percentiles in the
+ // detailed summary.
+ if (It == DS.end())
+ report_fatal_error("Desired percentile exceeds the maximum cutoff");
+ return It->MinCount;
+}
+
+// The profile summary metadata may be attached either by the frontend or by
+// any backend passes (IR level instrumentation, for example). This method
+// checks if the Summary is null and if so checks if the summary metadata is now
+// available in the module and parses it to get the Summary object.
+void ProfileSummaryInfo::computeSummary() {
+ if (Summary)
+ return;
+ auto *SummaryMD = M.getProfileSummary();
+ if (!SummaryMD)
+ return;
+ Summary.reset(ProfileSummary::getFromMD(SummaryMD));
+}
+
+// Returns true if the function is hot. A false return means either that the
+// function is not hot or that its hotness is unknown (for example, because no
+// profile data is available).
+bool ProfileSummaryInfo::isHotFunction(const Function *F) {
+ computeSummary();
+ if (!F || !Summary)
+ return false;
+ auto FunctionCount = F->getEntryCount();
+ // FIXME: The heuristic used below for determining hotness is based on
+ // preliminary SPEC tuning for the inliner. This will eventually be a
+ // convenience method that calls isHotCount.
+ return (FunctionCount &&
+ FunctionCount.getValue() >=
+ (uint64_t)(0.3 * (double)Summary->getMaxFunctionCount()));
+}
+
+// Returns true if the function is cold. A false return means either that the
+// function is not cold or that its coldness is unknown (for example, because
+// no profile data is available).
+bool ProfileSummaryInfo::isColdFunction(const Function *F) {
+ computeSummary();
+ if (!F)
+ return false;
+ if (F->hasFnAttribute(Attribute::Cold)) {
+ return true;
+ }
+ if (!Summary)
+ return false;
+ auto FunctionCount = F->getEntryCount();
+ // FIXME: The heuristic used below for determining coldness is based on
+ // preliminary SPEC tuning for the inliner. This will eventually be a
+ // convenience method that calls isColdCount.
+ return (FunctionCount &&
+ FunctionCount.getValue() <=
+ (uint64_t)(0.01 * (double)Summary->getMaxFunctionCount()));
+}
+
+// Compute the hot and cold thresholds.
+void ProfileSummaryInfo::computeThresholds() {
+ if (!Summary)
+ computeSummary();
+ if (!Summary)
+ return;
+ auto &DetailedSummary = Summary->getDetailedSummary();
+ HotCountThreshold =
+ getMinCountForPercentile(DetailedSummary, ProfileSummaryCutoffHot);
+ ColdCountThreshold =
+ getMinCountForPercentile(DetailedSummary, ProfileSummaryCutoffCold);
+}
+
+bool ProfileSummaryInfo::isHotCount(uint64_t C) {
+ if (!HotCountThreshold)
+ computeThresholds();
+ return HotCountThreshold && C >= HotCountThreshold.getValue();
+}
+
+bool ProfileSummaryInfo::isColdCount(uint64_t C) {
+ if (!ColdCountThreshold)
+ computeThresholds();
+ return ColdCountThreshold && C <= ColdCountThreshold.getValue();
+}
+
+ProfileSummaryInfo *ProfileSummaryInfoWrapperPass::getPSI(Module &M) {
+ if (!PSI)
+ PSI.reset(new ProfileSummaryInfo(M));
+ return PSI.get();
+}
+
+INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
+ "Profile summary info", false, true)
+
+ProfileSummaryInfoWrapperPass::ProfileSummaryInfoWrapperPass()
+ : ImmutablePass(ID) {
+ initializeProfileSummaryInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+char ProfileSummaryAnalysis::PassID;
+ProfileSummaryInfo ProfileSummaryAnalysis::run(Module &M,
+ ModuleAnalysisManager &) {
+ return ProfileSummaryInfo(M);
+}
+
+// FIXME: This only tests isHotFunction and isColdFunction and not the
+// isHotCount and isColdCount calls.
+PreservedAnalyses ProfileSummaryPrinterPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+ ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
+
+ OS << "Functions in " << M.getName() << " with hot/cold annotations: \n";
+ for (auto &F : M) {
+ OS << F.getName();
+ if (PSI.isHotFunction(&F))
+ OS << " :hot ";
+ else if (PSI.isColdFunction(&F))
+ OS << " :cold ";
+ OS << "\n";
+ }
+ return PreservedAnalyses::all();
+}
+
+char ProfileSummaryInfoWrapperPass::ID = 0;
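
getMinCountForPercentile above is a lower_bound over detailed-summary entries ordered by ascending cutoff, and the hot/cold thresholds are the min counts at the 999000 and 999999 cutoffs (percentiles scaled by 10000, so 999000 is 99.9%). A standalone sketch with made-up summary entries:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Entry { std::uint64_t Cutoff; std::uint64_t MinCount; };

    static std::uint64_t minCountForPercentile(const std::vector<Entry> &DS,
                                               std::uint64_t Percentile) {
      auto It = std::lower_bound(
          DS.begin(), DS.end(), Percentile,
          [](const Entry &E, std::uint64_t P) { return E.Cutoff < P; });
      return It == DS.end() ? 0 : It->MinCount; // the real code reports a fatal error
    }

    int main() {
      // Entries sorted by ascending cutoff; min counts decrease as cutoffs rise.
      std::vector<Entry> DS = {{100000, 5000}, {500000, 1200}, {990000, 150},
                               {999000, 40},   {999999, 2}};
      std::uint64_t Hot = minCountForPercentile(DS, 999000);  // counts >= 40 are hot
      std::uint64_t Cold = minCountForPercentile(DS, 999999); // counts <= 2 are cold
      std::printf("hot >= %llu, cold <= %llu\n", (unsigned long long)Hot,
                  (unsigned long long)Cold);
    }
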
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index f59d26730327..6860a3e63953 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -15,12 +15,10 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include <algorithm>
-#include <iterator>
-#include <set>
#ifndef NDEBUG
#include "llvm/Analysis/RegionPrinter.h"
#endif
@@ -128,8 +126,8 @@ bool RegionInfoPass::runOnFunction(Function &F) {
releaseMemory();
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto PDT = &getAnalysis<PostDominatorTree>();
- auto DF = &getAnalysis<DominanceFrontier>();
+ auto PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ auto DF = &getAnalysis<DominanceFrontierWrapperPass>().getDominanceFrontier();
RI.recalculate(F, DT, PDT, DF);
return false;
@@ -146,8 +144,8 @@ void RegionInfoPass::verifyAnalysis() const {
void RegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTree>();
- AU.addRequired<DominanceFrontier>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<DominanceFrontierWrapperPass>();
}
void RegionInfoPass::print(raw_ostream &OS, const Module *) const {
@@ -155,7 +153,7 @@ void RegionInfoPass::print(raw_ostream &OS, const Module *) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void RegionInfoPass::dump() const {
+LLVM_DUMP_METHOD void RegionInfoPass::dump() const {
RI.dump();
}
#endif
@@ -165,8 +163,8 @@ char RegionInfoPass::ID = 0;
INITIALIZE_PASS_BEGIN(RegionInfoPass, "regions",
"Detect single entry single exit regions", true, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominanceFrontierWrapperPass)
INITIALIZE_PASS_END(RegionInfoPass, "regions",
"Detect single entry single exit regions", true, true)
@@ -180,3 +178,36 @@ namespace llvm {
}
}
+//===----------------------------------------------------------------------===//
+// RegionInfoAnalysis implementation
+//
+
+char RegionInfoAnalysis::PassID;
+
+RegionInfo RegionInfoAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+ RegionInfo RI;
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+ auto *DF = &AM.getResult<DominanceFrontierAnalysis>(F);
+
+ RI.recalculate(F, DT, PDT, DF);
+ return RI;
+}
+
+RegionInfoPrinterPass::RegionInfoPrinterPass(raw_ostream &OS)
+ : OS(OS) {}
+
+PreservedAnalyses RegionInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ OS << "Region Tree for function: " << F.getName() << "\n";
+ AM.getResult<RegionInfoAnalysis>(F).print(OS);
+
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses RegionInfoVerifierPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ AM.getResult<RegionInfoAnalysis>(F).verifyAnalysis();
+
+ return PreservedAnalyses::all();
+}
diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp
index acb218d5fea0..30a4e011060e 100644
--- a/lib/Analysis/RegionPrinter.cpp
+++ b/lib/Analysis/RegionPrinter.cpp
@@ -117,8 +117,8 @@ struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> {
<< ((R.getDepth() * 2 % 12) + 2) << "\n";
}
- for (Region::const_iterator RI = R.begin(), RE = R.end(); RI != RE; ++RI)
- printRegionCluster(**RI, GW, depth + 1);
+ for (const auto &RI : R)
+ printRegionCluster(*RI, GW, depth + 1);
const RegionInfo &RI = *static_cast<const RegionInfo*>(R.getRegionInfo());
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index ef1bb3a36c8d..2abbf3480358 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -111,10 +111,14 @@ MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
"derived loop"),
cl::init(100));
-// FIXME: Enable this with XDEBUG when the test suite is clean.
+// FIXME: Enable this with EXPENSIVE_CHECKS when the test suite is clean.
static cl::opt<bool>
VerifySCEV("verify-scev",
cl::desc("Verify ScalarEvolution's backedge taken counts (slow)"));
+static cl::opt<bool>
+ VerifySCEVMap("verify-scev-maps",
+ cl::desc("Verify no dangling value in ScalarEvolution's "
+ "ExprValueMap (slow)"));
//===----------------------------------------------------------------------===//
// SCEV class definitions
@@ -162,11 +166,11 @@ void SCEV::print(raw_ostream &OS) const {
for (unsigned i = 1, e = AR->getNumOperands(); i != e; ++i)
OS << ",+," << *AR->getOperand(i);
OS << "}<";
- if (AR->getNoWrapFlags(FlagNUW))
+ if (AR->hasNoUnsignedWrap())
OS << "nuw><";
- if (AR->getNoWrapFlags(FlagNSW))
+ if (AR->hasNoSignedWrap())
OS << "nsw><";
- if (AR->getNoWrapFlags(FlagNW) &&
+ if (AR->hasNoSelfWrap() &&
!AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW)))
OS << "nw><";
AR->getLoop()->getHeader()->printAsOperand(OS, /*PrintType=*/false);
@@ -196,9 +200,9 @@ void SCEV::print(raw_ostream &OS) const {
switch (NAry->getSCEVType()) {
case scAddExpr:
case scMulExpr:
- if (NAry->getNoWrapFlags(FlagNUW))
+ if (NAry->hasNoUnsignedWrap())
OS << "<nuw>";
- if (NAry->getNoWrapFlags(FlagNSW))
+ if (NAry->hasNoSignedWrap())
OS << "<nsw>";
}
return;
@@ -283,8 +287,6 @@ bool SCEV::isAllOnesValue() const {
return false;
}
-/// isNonConstantNegative - Return true if the specified scev is negated, but
-/// not a constant.
bool SCEV::isNonConstantNegative() const {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(this);
if (!Mul) return false;
@@ -620,10 +622,10 @@ public:
};
} // end anonymous namespace
-/// GroupByComplexity - Given a list of SCEV objects, order them by their
-/// complexity, and group objects of the same complexity together by value.
-/// When this routine is finished, we know that any duplicates in the vector are
-/// consecutive and that complexity is monotonically increasing.
+/// Given a list of SCEV objects, order them by their complexity, and group
+/// objects of the same complexity together by value. When this routine is
+/// finished, we know that any duplicates in the vector are consecutive and that
+/// complexity is monotonically increasing.
///
/// Note that we take special precautions to ensure that we get deterministic
/// results from this routine. In other words, we don't want the results of
@@ -723,7 +725,7 @@ public:
}
// Split the Denominator when it is a product.
- if (const SCEVMulExpr *T = dyn_cast<const SCEVMulExpr>(Denominator)) {
+ if (const SCEVMulExpr *T = dyn_cast<SCEVMulExpr>(Denominator)) {
const SCEV *Q, *R;
*Quotient = Numerator;
for (const SCEV *Op : T->operands()) {
@@ -922,8 +924,7 @@ private:
// Simple SCEV method implementations
//===----------------------------------------------------------------------===//
-/// BinomialCoefficient - Compute BC(It, K). The result has width W.
-/// Assume, K > 0.
+/// Compute BC(It, K). The result has width W. Assume K > 0.
static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
ScalarEvolution &SE,
Type *ResultTy) {
@@ -1034,10 +1035,10 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
SE.getTruncateOrZeroExtend(DivResult, ResultTy));
}
-/// evaluateAtIteration - Return the value of this chain of recurrences at
-/// the specified iteration number. We can evaluate this recurrence by
-/// multiplying each element in the chain by the binomial coefficient
-/// corresponding to it. In other words, we can evaluate {A,+,B,+,C,+,D} as:
+/// Return the value of this chain of recurrences at the specified iteration
+/// number. We can evaluate this recurrence by multiplying each element in the
+/// chain by the binomial coefficient corresponding to it. In other words, we
+/// can evaluate {A,+,B,+,C,+,D} as:
///
/// A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3)
///
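
The closed form documented above says that {A,+,B,+,C} at iteration It equals A*BC(It,0) + B*BC(It,1) + C*BC(It,2). A standalone check comparing that against step-by-step simulation of the recurrence (plain 64-bit arithmetic; the real BinomialCoefficient is additionally careful about overflow and result width):

    #include <cassert>
    #include <cstdint>

    static std::uint64_t choose(std::uint64_t n, std::uint64_t k) {
      if (k > n)
        return 0;
      std::uint64_t r = 1;
      for (std::uint64_t i = 1; i <= k; ++i)
        r = r * (n - k + i) / i; // exact at every step: r holds C(n - k + i, i)
      return r;
    }

    int main() {
      const std::uint64_t A = 7, B = 3, C = 2; // the recurrence {7,+,3,+,2}
      std::uint64_t V = A, Step = B;
      for (std::uint64_t n = 0; n < 20; ++n) {
        std::uint64_t Closed =
            A * choose(n, 0) + B * choose(n, 1) + C * choose(n, 2);
        assert(Closed == V);
        V += Step;  // advance the recurrence by one iteration
        Step += C;  // the step itself is the recurrence {B,+,C}
      }
      return 0;
    }
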
@@ -1450,9 +1451,14 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
unsigned BitWidth = getTypeSizeInBits(AR->getType());
const Loop *L = AR->getLoop();
+ if (!AR->hasNoUnsignedWrap()) {
+ auto NewFlags = proveNoWrapViaConstantRanges(AR);
+ const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(NewFlags);
+ }
+
// If we have special knowledge that this addrec won't overflow,
// we don't need to do any further analysis.
- if (AR->getNoWrapFlags(SCEV::FlagNUW))
+ if (AR->hasNoUnsignedWrap())
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
@@ -1512,11 +1518,22 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
}
}
+ }
- // If the backedge is guarded by a comparison with the pre-inc value
- // the addrec is safe. Also, if the entry is guarded by a comparison
- // with the start value and the backedge is guarded by a comparison
- // with the post-inc value, the addrec is safe.
+ // Normally, in cases where we can prove no-overflow via a
+ // backedge guarding condition, we can also compute a backedge
+ // taken count for the loop. The exceptions are assumptions and
+ // guards present in the loop -- SCEV is not great at exploiting
+ // these to compute max backedge taken counts, but it can still
+ // use them to prove lack of overflow. Use this fact to avoid
+ // doing extra work that may not pay off.
+ if (!isa<SCEVCouldNotCompute>(MaxBECount) || HasGuards ||
+ !AC.assumptions().empty()) {
+ // If the backedge is guarded by a comparison with the pre-inc
+ // value the addrec is safe. Also, if the entry is guarded by
+ // a comparison with the start value and the backedge is
+ // guarded by a comparison with the post-inc value, the addrec
+ // is safe.
if (isKnownPositive(Step)) {
const SCEV *N = getConstant(APInt::getMinValue(BitWidth) -
getUnsignedRange(Step).getUnsignedMax());
@@ -1524,7 +1541,8 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
AR->getPostIncExpr(*this), N))) {
- // Cache knowledge of AR NUW, which is propagated to this AddRec.
+ // Cache knowledge of AR NUW, which is propagated to this
+ // AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
@@ -1538,8 +1556,9 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT,
AR->getPostIncExpr(*this), N))) {
- // Cache knowledge of AR NW, which is propagated to this AddRec.
- // Negative step causes unsigned wrap, but it still can't self-wrap.
+ // Cache knowledge of AR NW, which is propagated to this
+ // AddRec. Negative step causes unsigned wrap, but it
+ // still can't self-wrap.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
@@ -1559,7 +1578,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
// zext((A + B + ...)<nuw>) --> (zext(A) + zext(B) + ...)<nuw>
- if (SA->getNoWrapFlags(SCEV::FlagNUW)) {
+ if (SA->hasNoUnsignedWrap()) {
// If the addition does not unsign overflow then we can, by definition,
// commute the zero extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
@@ -1608,10 +1627,6 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
- // If the input value is provably positive, build a zext instead.
- if (isKnownNonNegative(Op))
- return getZeroExtendExpr(Op, Ty);
-
// sext(trunc(x)) --> sext(x) or x or trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
// It's possible the bits taken off by the truncate were all sign bits. If
@@ -1643,7 +1658,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
}
// sext((A + B + ...)<nsw>) --> (sext(A) + sext(B) + ...)<nsw>
- if (SA->getNoWrapFlags(SCEV::FlagNSW)) {
+ if (SA->hasNoSignedWrap()) {
// If the addition does not sign overflow then we can, by definition,
// commute the sign extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
@@ -1663,9 +1678,14 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
unsigned BitWidth = getTypeSizeInBits(AR->getType());
const Loop *L = AR->getLoop();
+ if (!AR->hasNoSignedWrap()) {
+ auto NewFlags = proveNoWrapViaConstantRanges(AR);
+ const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(NewFlags);
+ }
+
// If we have special knowledge that this addrec won't overflow,
// we don't need to do any further analysis.
- if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ if (AR->hasNoSignedWrap())
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
getSignExtendExpr(Step, Ty), L, SCEV::FlagNSW);
@@ -1732,11 +1752,23 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
}
}
+ }
- // If the backedge is guarded by a comparison with the pre-inc value
- // the addrec is safe. Also, if the entry is guarded by a comparison
- // with the start value and the backedge is guarded by a comparison
- // with the post-inc value, the addrec is safe.
+ // Normally, in cases where we can prove no-overflow via a
+ // backedge guarding condition, we can also compute a backedge
+ // taken count for the loop. The exceptions are assumptions and
+ // guards present in the loop -- SCEV is not great at exploiting
+ // these to compute max backedge taken counts, but it can still
+ // use them to prove lack of overflow. Use this fact to avoid
+ // doing extra work that may not pay off.
+
+ if (!isa<SCEVCouldNotCompute>(MaxBECount) || HasGuards ||
+ !AC.assumptions().empty()) {
+ // If the backedge is guarded by a comparison with the pre-inc
+ // value the addrec is safe. Also, if the entry is guarded by
+ // a comparison with the start value and the backedge is
+ // guarded by a comparison with the post-inc value, the addrec
+ // is safe.
ICmpInst::Predicate Pred;
const SCEV *OverflowLimit =
getSignedOverflowLimitForStep(Step, &Pred, this);
@@ -1752,6 +1784,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
}
}
+
// If Start and Step are constants, check if we can apply this
// transformation:
// sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2
@@ -1777,6 +1810,11 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
}
}
+ // If the input value is provably positive and we could not simplify
+ // away the sext build a zext instead.
+ if (isKnownNonNegative(Op))
+ return getZeroExtendExpr(Op, Ty);
+
// The cast wasn't folded; create an explicit cast node.
// Recompute the insert position, as it may have been invalidated.
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
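
The reordered fold above only falls back to building a zext for a provably non-negative operand once the sext-specific folds have had their chance. The identity it relies on -- sign extension and zero extension agree on non-negative values -- can be checked exhaustively at a narrow width:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int v = 0; v <= 127; ++v) { // every non-negative 8-bit value
        std::int8_t Narrow = static_cast<std::int8_t>(v);
        std::uint32_t Sext =
            static_cast<std::uint32_t>(static_cast<std::int32_t>(Narrow));
        std::uint32_t Zext =
            static_cast<std::uint32_t>(static_cast<std::uint8_t>(Narrow));
        assert(Sext == Zext); // sext(x) == zext(x) whenever x is non-negative
      }
      return 0;
    }
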
@@ -1836,11 +1874,10 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
return ZExt;
}
-/// CollectAddOperandsWithScales - Process the given Ops list, which is
-/// a list of operands to be added under the given scale, update the given
-/// map. This is a helper function for getAddRecExpr. As an example of
-/// what it does, given a sequence of operands that would form an add
-/// expression like this:
+/// Process the given Ops list, which is a list of operands to be added under
+/// the given scale, and update the given map. This is a helper function for
+/// getAddRecExpr. As an example of what it does, given a sequence of operands
+/// that would form an add expression like this:
///
/// m + n + 13 + (A * (o + p + (B * (q + m + 29)))) + r + (-1 * r)
///
@@ -1899,7 +1936,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M,
// the map.
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin()+1, Mul->op_end());
const SCEV *Key = SE.getMulExpr(MulOps);
- auto Pair = M.insert(std::make_pair(Key, NewScale));
+ auto Pair = M.insert({Key, NewScale});
if (Pair.second) {
NewOps.push_back(Pair.first->first);
} else {
@@ -1912,7 +1949,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M,
} else {
// An ordinary operand. Update the map.
std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair =
- M.insert(std::make_pair(Ops[i], Scale));
+ M.insert({Ops[i], Scale});
if (Pair.second) {
NewOps.push_back(Pair.first->first);
} else {
@@ -1965,15 +2002,14 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
const APInt &C = cast<SCEVConstant>(Ops[0])->getAPInt();
if (!(SignOrUnsignWrap & SCEV::FlagNSW)) {
- auto NSWRegion =
- ConstantRange::makeNoWrapRegion(Instruction::Add, C, OBO::NoSignedWrap);
+ auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ Instruction::Add, C, OBO::NoSignedWrap);
if (NSWRegion.contains(SE->getSignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW);
}
if (!(SignOrUnsignWrap & SCEV::FlagNUW)) {
- auto NUWRegion =
- ConstantRange::makeNoWrapRegion(Instruction::Add, C,
- OBO::NoUnsignedWrap);
+ auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ Instruction::Add, C, OBO::NoUnsignedWrap);
if (NUWRegion.contains(SE->getUnsignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
}
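
The containment test above asks whether the operand's known range fits inside the region where the add provably cannot wrap. For an unsigned N-bit add of a constant C that region is simply x <= 2^N - 1 - C, which a brute-force 8-bit check confirms (plain integers here, not the ConstantRange API):

    #include <cassert>

    int main() {
      for (unsigned C = 0; C <= 255; ++C) {
        for (unsigned X = 0; X <= 255; ++X) {
          bool Wraps = ((X + C) & 0xFFu) != X + C; // the 8-bit add overflowed
          bool InNoWrapRegion = X <= 255u - C;
          assert(Wraps == !InNoWrapRegion);
        }
      }
      return 0;
    }
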
@@ -1982,8 +2018,7 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
return Flags;
}
-/// getAddExpr - Get a canonical add expression, or something simpler if
-/// possible.
+/// Get a canonical add expression, or something simpler if possible.
const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags) {
assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
@@ -2266,7 +2301,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
- AddRecOps[0] = getAddExpr(LIOps);
+ // This follows from the fact that the no-wrap flags on the outer add
+ // expression are applicable on the 0th iteration, when the add recurrence
+ // will be equal to its start value.
+ AddRecOps[0] = getAddExpr(LIOps, Flags);
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer add and the inner addrec are guaranteed to have no overflow.
@@ -2391,8 +2429,7 @@ static bool containsConstantSomewhere(const SCEV *StartExpr) {
return false;
}
-/// getMulExpr - Get a canonical multiply expression, or something simpler if
-/// possible.
+/// Get a canonical multiply expression, or something simpler if possible.
const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags) {
assert(Flags == maskFlags(Flags, SCEV::FlagNUW | SCEV::FlagNSW) &&
@@ -2632,8 +2669,8 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
return S;
}
-/// getUDivExpr - Get a canonical unsigned division expression, or something
-/// simpler if possible.
+/// Get a canonical unsigned division expression, or something simpler if
+/// possible.
const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
const SCEV *RHS) {
assert(getEffectiveSCEVType(LHS->getType()) ==
@@ -2764,10 +2801,10 @@ static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
return APIntOps::GreatestCommonDivisor(A, B);
}
-/// getUDivExactExpr - Get a canonical unsigned division expression, or
-/// something simpler if possible. There is no representation for an exact udiv
-/// in SCEV IR, but we can attempt to remove factors from the LHS and RHS.
-/// We can't do this when it's not exact because the udiv may be clearing bits.
+/// Get a canonical unsigned division expression, or something simpler if
+/// possible. There is no representation for an exact udiv in SCEV IR, but we
+/// can attempt to remove factors from the LHS and RHS. We can't do this when
+/// it's not exact because the udiv may be clearing bits.
const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS,
const SCEV *RHS) {
// TODO: we could try to find factors in all sorts of things, but for now we
@@ -2821,8 +2858,8 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS,
return getUDivExpr(LHS, RHS);
}
-/// getAddRecExpr - Get an add recurrence expression for the specified loop.
-/// Simplify the expression as much as possible.
+/// Get an add recurrence expression for the specified loop. Simplify the
+/// expression as much as possible.
const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step,
const Loop *L,
SCEV::NoWrapFlags Flags) {
@@ -2838,8 +2875,8 @@ const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step,
return getAddRecExpr(Operands, L, Flags);
}
-/// getAddRecExpr - Get an add recurrence expression for the specified loop.
-/// Simplify the expression as much as possible.
+/// Get an add recurrence expression for the specified loop. Simplify the
+/// expression as much as possible.
const SCEV *
ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
const Loop *L, SCEV::NoWrapFlags Flags) {
@@ -2985,9 +3022,7 @@ ScalarEvolution::getGEPExpr(Type *PointeeType, const SCEV *BaseExpr,
const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS,
const SCEV *RHS) {
- SmallVector<const SCEV *, 2> Ops;
- Ops.push_back(LHS);
- Ops.push_back(RHS);
+ SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
return getSMaxExpr(Ops);
}
@@ -3088,9 +3123,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS,
const SCEV *RHS) {
- SmallVector<const SCEV *, 2> Ops;
- Ops.push_back(LHS);
- Ops.push_back(RHS);
+ SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
return getUMaxExpr(Ops);
}
@@ -3244,26 +3277,25 @@ const SCEV *ScalarEvolution::getUnknown(Value *V) {
// Basic SCEV Analysis and PHI Idiom Recognition Code
//
-/// isSCEVable - Test if values of the given type are analyzable within
-/// the SCEV framework. This primarily includes integer types, and it
-/// can optionally include pointer types if the ScalarEvolution class
-/// has access to target-specific information.
+/// Test if values of the given type are analyzable within the SCEV
+/// framework. This primarily includes integer types, and it can optionally
+/// include pointer types if the ScalarEvolution class has access to
+/// target-specific information.
bool ScalarEvolution::isSCEVable(Type *Ty) const {
// Integers and pointers are always SCEVable.
return Ty->isIntegerTy() || Ty->isPointerTy();
}
-/// getTypeSizeInBits - Return the size in bits of the specified type,
-/// for which isSCEVable must return true.
+/// Return the size in bits of the specified type, for which isSCEVable must
+/// return true.
uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
return getDataLayout().getTypeSizeInBits(Ty);
}
-/// getEffectiveSCEVType - Return a type with the same bitwidth as
-/// the given type and which represents how SCEV will treat the given
-/// type, for which isSCEVable must return true. For pointer types,
-/// this is the pointer-sized integer type.
+/// Return a type with the same bitwidth as the given type and which represents
+/// how SCEV will treat the given type, for which isSCEVable must return
+/// true. For pointer types, this is the pointer-sized integer type.
Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
@@ -3310,15 +3342,88 @@ bool ScalarEvolution::checkValidity(const SCEV *S) const {
return !F.FindOne;
}
-/// getSCEV - Return an existing SCEV if it exists, otherwise analyze the
-/// expression and create a new one.
+namespace {
+// Helper class working with SCEVTraversal to figure out if a SCEV contains
+// a sub-SCEV of scAddRecExpr type. FindAddRecurrence::FoundOne is set iff
+// such a sub-SCEV of scAddRecExpr type is found.
+struct FindAddRecurrence {
+ bool FoundOne;
+ FindAddRecurrence() : FoundOne(false) {}
+
+ bool follow(const SCEV *S) {
+ switch (static_cast<SCEVTypes>(S->getSCEVType())) {
+ case scAddRecExpr:
+ FoundOne = true;
+ case scConstant:
+ case scUnknown:
+ case scCouldNotCompute:
+ return false;
+ default:
+ return true;
+ }
+ }
+ bool isDone() const { return FoundOne; }
+};
+}
+
+bool ScalarEvolution::containsAddRecurrence(const SCEV *S) {
+ HasRecMapType::iterator I = HasRecMap.find_as(S);
+ if (I != HasRecMap.end())
+ return I->second;
+
+ FindAddRecurrence F;
+ SCEVTraversal<FindAddRecurrence> ST(F);
+ ST.visitAll(S);
+ HasRecMap.insert({S, F.FoundOne});
+ return F.FoundOne;
+}
+
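
containsAddRecurrence above memoizes its answer per expression in HasRecMap so repeated queries do not re-walk the same subtree. The caching shape, sketched over an invented toy Expr node rather than SCEV:

    #include <unordered_map>
    #include <vector>

    struct Expr {
      bool IsAddRec;                  // stands in for scAddRecExpr
      std::vector<const Expr *> Ops;
    };

    class ContainsAddRec {
      std::unordered_map<const Expr *, bool> Cache; // plays the role of HasRecMap
    public:
      bool query(const Expr *E) {
        auto It = Cache.find(E);
        if (It != Cache.end())
          return It->second;          // answered before: no traversal needed
        bool Found = E->IsAddRec;
        for (const Expr *Op : E->Ops) {
          if (Found)
            break;
          Found = query(Op);
        }
        Cache[E] = Found;
        return Found;
      }
    };

    int main() {
      Expr Leaf{/*IsAddRec=*/true, {}};
      Expr Root{/*IsAddRec=*/false, {&Leaf}};
      ContainsAddRec CAR;
      return CAR.query(&Root) ? 0 : 1; // a second query of &Root hits the cache
    }
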
+/// Return the Value set from S.
+SetVector<Value *> *ScalarEvolution::getSCEVValues(const SCEV *S) {
+ ExprValueMapType::iterator SI = ExprValueMap.find_as(S);
+ if (SI == ExprValueMap.end())
+ return nullptr;
+#ifndef NDEBUG
+ if (VerifySCEVMap) {
+ // Check there is no dangling Value in the set returned.
+ for (const auto &VE : SI->second)
+ assert(ValueExprMap.count(VE));
+ }
+#endif
+ return &SI->second;
+}
+
+/// Erase Value from ValueExprMap and ExprValueMap. If ValueExprMap.erase(V) is
+/// not used together with forgetMemoizedResults(S), eraseValueFromMap should be
+/// used instead, to ensure that whenever V->S is removed from ValueExprMap, V is
+/// also removed from the set ExprValueMap[S].
+void ScalarEvolution::eraseValueFromMap(Value *V) {
+ ValueExprMapType::iterator I = ValueExprMap.find_as(V);
+ if (I != ValueExprMap.end()) {
+ const SCEV *S = I->second;
+ SetVector<Value *> *SV = getSCEVValues(S);
+ // Remove V from the set of ExprValueMap[S]
+ if (SV)
+ SV->remove(V);
+ ValueExprMap.erase(V);
+ }
+}
+
+/// Return an existing SCEV if it exists, otherwise analyze the expression and
+/// create a new one.
const SCEV *ScalarEvolution::getSCEV(Value *V) {
assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
const SCEV *S = getExistingSCEV(V);
if (S == nullptr) {
S = createSCEV(V);
- ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S));
+ // During PHI resolution, it is possible to create two SCEVs for the same
+ // V, so we need to check whether V->S was actually inserted into
+ // ValueExprMap before inserting S->V into ExprValueMap.
+ std::pair<ValueExprMapType::iterator, bool> Pair =
+ ValueExprMap.insert({SCEVCallbackVH(V, this), S});
+ if (Pair.second)
+ ExprValueMap[S].insert(V);
}
return S;
}
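
getSCEV and eraseValueFromMap above keep ExprValueMap as the inverse of ValueExprMap: a forward insert is mirrored into the reverse map only when it actually wins, and an erase removes the value from both sides. A standalone sketch of that discipline with standard containers (invented key/value types, not the SCEV maps):

    #include <set>
    #include <string>
    #include <unordered_map>

    class TwoWayMap {
      std::unordered_map<std::string, int> ValueToExpr;            // V -> S
      std::unordered_map<int, std::set<std::string>> ExprToValues; // S -> {V}
    public:
      void insert(const std::string &V, int S) {
        auto Res = ValueToExpr.insert({V, S});
        if (Res.second) // mirror only the mapping that was actually inserted
          ExprToValues[S].insert(V);
      }
      void eraseValue(const std::string &V) {
        auto It = ValueToExpr.find(V);
        if (It == ValueToExpr.end())
          return;
        ExprToValues[It->second].erase(V); // keep the reverse map in sync
        ValueToExpr.erase(It);
      }
    };

    int main() {
      TwoWayMap M;
      M.insert("x", 1);
      M.insert("y", 1);
      M.eraseValue("x"); // neither map now mentions "x"
      return 0;
    }
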
@@ -3331,12 +3436,13 @@ const SCEV *ScalarEvolution::getExistingSCEV(Value *V) {
const SCEV *S = I->second;
if (checkValidity(S))
return S;
+ forgetMemoizedResults(S);
ValueExprMap.erase(I);
}
return nullptr;
}
-/// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V
+/// Return a SCEV corresponding to -V = -1*V
///
const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
SCEV::NoWrapFlags Flags) {
@@ -3350,7 +3456,7 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
V, getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))), Flags);
}
-/// getNotSCEV - Return a SCEV corresponding to ~V = -1-V
+/// Return a SCEV corresponding to ~V = -1-V
const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getConstant(
@@ -3363,7 +3469,6 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
return getMinusSCEV(AllOnes, V);
}
-/// getMinusSCEV - Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
SCEV::NoWrapFlags Flags) {
// Fast path: X - X --> 0.
@@ -3402,9 +3507,6 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags);
}
-/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
-/// input value to the specified type. If the type must be extended, it is zero
-/// extended.
const SCEV *
ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
@@ -3418,9 +3520,6 @@ ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
return getZeroExtendExpr(V, Ty);
}
-/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion of the
-/// input value to the specified type. If the type must be extended, it is sign
-/// extended.
const SCEV *
ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
Type *Ty) {
@@ -3435,9 +3534,6 @@ ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
return getSignExtendExpr(V, Ty);
}
-/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of the
-/// input value to the specified type. If the type must be extended, it is zero
-/// extended. The conversion must not be narrowing.
const SCEV *
ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
@@ -3451,9 +3547,6 @@ ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
return getZeroExtendExpr(V, Ty);
}
-/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of the
-/// input value to the specified type. If the type must be extended, it is sign
-/// extended. The conversion must not be narrowing.
const SCEV *
ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
@@ -3467,10 +3560,6 @@ ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
return getSignExtendExpr(V, Ty);
}
-/// getNoopOrAnyExtend - Return a SCEV corresponding to a conversion of
-/// the input value to the specified type. If the type must be extended,
-/// it is extended with unspecified bits. The conversion must not be
-/// narrowing.
const SCEV *
ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
@@ -3484,8 +3573,6 @@ ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
return getAnyExtendExpr(V, Ty);
}
-/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
-/// input value to the specified type. The conversion must not be widening.
const SCEV *
ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
@@ -3499,9 +3586,6 @@ ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
return getTruncateExpr(V, Ty);
}
-/// getUMaxFromMismatchedTypes - Promote the operands to the wider of
-/// the types using zero-extension, and then perform a umax operation
-/// with them.
const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
const SCEV *RHS) {
const SCEV *PromotedLHS = LHS;
@@ -3515,9 +3599,6 @@ const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
return getUMaxExpr(PromotedLHS, PromotedRHS);
}
-/// getUMinFromMismatchedTypes - Promote the operands to the wider of
-/// the types using zero-extension, and then perform a umin operation
-/// with them.
const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
const SCEV *RHS) {
const SCEV *PromotedLHS = LHS;
@@ -3531,10 +3612,6 @@ const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
return getUMinExpr(PromotedLHS, PromotedRHS);
}
-/// getPointerBase - Transitively follow the chain of pointer-type operands
-/// until reaching a SCEV that does not have a single pointer operand. This
-/// returns a SCEVUnknown pointer for well-formed pointer-type expressions,
-/// but corner cases do exist.
const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
// A pointer operand may evaluate to a nonpointer expression, such as null.
if (!V->getType()->isPointerTy())
@@ -3559,8 +3636,7 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
return V;
}
-/// PushDefUseChildren - Push users of the given Instruction
-/// onto the given Worklist.
+/// Push users of the given Instruction onto the given Worklist.
static void
PushDefUseChildren(Instruction *I,
SmallVectorImpl<Instruction *> &Worklist) {
@@ -3569,12 +3645,7 @@ PushDefUseChildren(Instruction *I,
Worklist.push_back(cast<Instruction>(U));
}
-/// ForgetSymbolicValue - This looks up computed SCEV values for all
-/// instructions that depend on the given instruction and removes them from
-/// the ValueExprMapType map if they reference SymName. This is used during PHI
-/// resolution.
-void
-ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) {
+void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) {
SmallVector<Instruction *, 16> Worklist;
PushDefUseChildren(PN, Worklist);
@@ -3616,10 +3687,10 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) {
namespace {
class SCEVInitRewriter : public SCEVRewriteVisitor<SCEVInitRewriter> {
public:
- static const SCEV *rewrite(const SCEV *Scev, const Loop *L,
+ static const SCEV *rewrite(const SCEV *S, const Loop *L,
ScalarEvolution &SE) {
SCEVInitRewriter Rewriter(L, SE);
- const SCEV *Result = Rewriter.visit(Scev);
+ const SCEV *Result = Rewriter.visit(S);
return Rewriter.isValid() ? Result : SE.getCouldNotCompute();
}
@@ -3649,10 +3720,10 @@ private:
class SCEVShiftRewriter : public SCEVRewriteVisitor<SCEVShiftRewriter> {
public:
- static const SCEV *rewrite(const SCEV *Scev, const Loop *L,
+ static const SCEV *rewrite(const SCEV *S, const Loop *L,
ScalarEvolution &SE) {
SCEVShiftRewriter Rewriter(L, SE);
- const SCEV *Result = Rewriter.visit(Scev);
+ const SCEV *Result = Rewriter.visit(S);
return Rewriter.isValid() ? Result : SE.getCouldNotCompute();
}
@@ -3680,6 +3751,167 @@ private:
};
} // end anonymous namespace
+SCEV::NoWrapFlags
+ScalarEvolution::proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR) {
+ if (!AR->isAffine())
+ return SCEV::FlagAnyWrap;
+
+ typedef OverflowingBinaryOperator OBO;
+ SCEV::NoWrapFlags Result = SCEV::FlagAnyWrap;
+
+ if (!AR->hasNoSignedWrap()) {
+ ConstantRange AddRecRange = getSignedRange(AR);
+ ConstantRange IncRange = getSignedRange(AR->getStepRecurrence(*this));
+
+ auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ Instruction::Add, IncRange, OBO::NoSignedWrap);
+ if (NSWRegion.contains(AddRecRange))
+ Result = ScalarEvolution::setFlags(Result, SCEV::FlagNSW);
+ }
+
+ if (!AR->hasNoUnsignedWrap()) {
+ ConstantRange AddRecRange = getUnsignedRange(AR);
+ ConstantRange IncRange = getUnsignedRange(AR->getStepRecurrence(*this));
+
+ auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ Instruction::Add, IncRange, OBO::NoUnsignedWrap);
+ if (NUWRegion.contains(AddRecRange))
+ Result = ScalarEvolution::setFlags(Result, SCEV::FlagNUW);
+ }
+
+ return Result;
+}
+
+namespace {
+/// Represents an abstract binary operation. This may exist as a
+/// normal instruction or constant expression, or may have been
+/// derived from an expression tree.
+struct BinaryOp {
+ unsigned Opcode;
+ Value *LHS;
+ Value *RHS;
+ bool IsNSW;
+ bool IsNUW;
+
+ /// Op is set if this BinaryOp corresponds to a concrete LLVM instruction or
+ /// constant expression.
+ Operator *Op;
+
+ explicit BinaryOp(Operator *Op)
+ : Opcode(Op->getOpcode()), LHS(Op->getOperand(0)), RHS(Op->getOperand(1)),
+ IsNSW(false), IsNUW(false), Op(Op) {
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Op)) {
+ IsNSW = OBO->hasNoSignedWrap();
+ IsNUW = OBO->hasNoUnsignedWrap();
+ }
+ }
+
+ explicit BinaryOp(unsigned Opcode, Value *LHS, Value *RHS, bool IsNSW = false,
+ bool IsNUW = false)
+ : Opcode(Opcode), LHS(LHS), RHS(RHS), IsNSW(IsNSW), IsNUW(IsNUW),
+ Op(nullptr) {}
+};
+}
+
+/// Try to map \p V into a BinaryOp, and return \c None on failure.
+static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
+ auto *Op = dyn_cast<Operator>(V);
+ if (!Op)
+ return None;
+
+ // Implementation detail: all the cleverness here should happen without
+ // creating new SCEV expressions -- our caller knows tricks to avoid creating
+ // SCEV expressions when possible, and we should not break that.
+
+ switch (Op->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return BinaryOp(Op);
+
+ case Instruction::Xor:
+ if (auto *RHSC = dyn_cast<ConstantInt>(Op->getOperand(1)))
+ // If the RHS of the xor is a signbit, then this is just an add.
+ // Instcombine turns add of signbit into xor as a strength reduction step.
+ if (RHSC->getValue().isSignBit())
+ return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1));
+ return BinaryOp(Op);
+
+ case Instruction::LShr:
+ // Turn a logical shift right by a constant into an unsigned divide.
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(Op->getOperand(1))) {
+ uint32_t BitWidth = cast<IntegerType>(Op->getType())->getBitWidth();
+
+ // If the shift count is not less than the bitwidth, the result of
+ // the shift is undefined. Don't try to analyze it, because the
+ // resolution chosen here may differ from the resolution chosen in
+ // other parts of the compiler.
+ if (SA->getValue().ult(BitWidth)) {
+ Constant *X =
+ ConstantInt::get(SA->getContext(),
+ APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
+ return BinaryOp(Instruction::UDiv, Op->getOperand(0), X);
+ }
+ }
+ return BinaryOp(Op);
+
+ case Instruction::ExtractValue: {
+ auto *EVI = cast<ExtractValueInst>(Op);
+ if (EVI->getNumIndices() != 1 || EVI->getIndices()[0] != 0)
+ break;
+
+ auto *CI = dyn_cast<CallInst>(EVI->getAggregateOperand());
+ if (!CI)
+ break;
+
+ if (auto *F = CI->getCalledFunction())
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow: {
+ if (!isOverflowIntrinsicNoWrap(cast<IntrinsicInst>(CI), DT))
+ return BinaryOp(Instruction::Add, CI->getArgOperand(0),
+ CI->getArgOperand(1));
+
+ // Now that we know that all uses of the arithmetic-result component of
+ // CI are guarded by the overflow check, we can go ahead and pretend
+ // that the arithmetic is non-overflowing.
+ if (F->getIntrinsicID() == Intrinsic::sadd_with_overflow)
+ return BinaryOp(Instruction::Add, CI->getArgOperand(0),
+ CI->getArgOperand(1), /* IsNSW = */ true,
+ /* IsNUW = */ false);
+ else
+ return BinaryOp(Instruction::Add, CI->getArgOperand(0),
+ CI->getArgOperand(1), /* IsNSW = */ false,
+ /* IsNUW*/ true);
+ }
+
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ return BinaryOp(Instruction::Sub, CI->getArgOperand(0),
+ CI->getArgOperand(1));
+
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ return BinaryOp(Instruction::Mul, CI->getArgOperand(0),
+ CI->getArgOperand(1));
+ default:
+ break;
+ }
+ }
+
+ default:
+ break;
+ }
+
+ return None;
+}
+
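
Two of the MatchBinaryOp rewrites above rest on simple bit identities: xor with the sign bit is the same as adding the sign bit modulo 2^N, and a logical shift right by k is an unsigned divide by 2^k. A brute-force check at 16 bits:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (std::uint32_t x = 0; x <= 0xFFFF; ++x) {
        std::uint16_t v = static_cast<std::uint16_t>(x);
        // xor with the sign bit == add of the sign bit (mod 2^16)
        assert(static_cast<std::uint16_t>(v ^ 0x8000u) ==
               static_cast<std::uint16_t>(v + 0x8000u));
        // lshr by k == udiv by 2^k
        for (unsigned k = 0; k < 16; ++k)
          assert(static_cast<std::uint16_t>(v >> k) == v / (1u << k));
      }
      return 0;
    }
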
const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
const Loop *L = LI.getLoopFor(PN->getParent());
if (!L || L->getHeader() != PN->getParent())
@@ -3710,7 +3942,7 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
const SCEV *SymbolicName = getUnknown(PN);
assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
"PHI node already processed?");
- ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName));
+ ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName});
// Using this symbolic name for the PHI, analyze the value coming around
// the back-edge.
@@ -3747,13 +3979,11 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
- // If the increment doesn't overflow, then neither the addrec nor
- // the post-increment will overflow.
- if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) {
- if (OBO->getOperand(0) == PN) {
- if (OBO->hasNoUnsignedWrap())
+ if (auto BO = MatchBinaryOp(BEValueV, DT)) {
+ if (BO->Opcode == Instruction::Add && BO->LHS == PN) {
+ if (BO->IsNUW)
Flags = setFlags(Flags, SCEV::FlagNUW);
- if (OBO->hasNoSignedWrap())
+ if (BO->IsNSW)
Flags = setFlags(Flags, SCEV::FlagNSW);
}
} else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
@@ -3779,16 +4009,19 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
const SCEV *StartVal = getSCEV(StartValueV);
const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
- // Since the no-wrap flags are on the increment, they apply to the
- // post-incremented value as well.
- if (isLoopInvariant(Accum, L))
- (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
-
// Okay, for the entire analysis of this edge we assumed the PHI
// to be symbolic. We now need to go back and purge all of the
// entries for the scalars that use the symbolic expression.
- ForgetSymbolicName(PN, SymbolicName);
+ forgetSymbolicName(PN, SymbolicName);
ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
+
+ // We can add Flags to the post-inc expression only if we
+  // know that it is *undefined behavior* for BEValueV to
+ // overflow.
+ if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
+ if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
+ (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
+
return PHISCEV;
}
}
@@ -3811,12 +4044,18 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
// Okay, for the entire analysis of this edge we assumed the PHI
// to be symbolic. We now need to go back and purge all of the
// entries for the scalars that use the symbolic expression.
- ForgetSymbolicName(PN, SymbolicName);
+ forgetSymbolicName(PN, SymbolicName);
ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted;
return Shifted;
}
}
}
+
+ // Remove the temporary PHI node SCEV that has been inserted while intending
+  // to create an AddRecExpr for this PHI node. We cannot keep this temporary
+  // entry, as it will prevent later (possibly simpler) SCEV expressions from
+  // being added to the ValueExprMap.
+ ValueExprMap.erase(PN);
}
return nullptr;
@@ -4083,26 +4322,21 @@ const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I,
return getUnknown(I);
}
-/// createNodeForGEP - Expand GEP instructions into add and multiply
-/// operations. This allows them to be analyzed by regular SCEV code.
-///
+/// Expand GEP instructions into add and multiply operations. This allows them
+/// to be analyzed by regular SCEV code.
const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
- Value *Base = GEP->getOperand(0);
// Don't attempt to analyze GEPs over unsized objects.
- if (!Base->getType()->getPointerElementType()->isSized())
+ if (!GEP->getSourceElementType()->isSized())
return getUnknown(GEP);
SmallVector<const SCEV *, 4> IndexExprs;
for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
IndexExprs.push_back(getSCEV(*Index));
- return getGEPExpr(GEP->getSourceElementType(), getSCEV(Base), IndexExprs,
- GEP->isInBounds());
+ return getGEPExpr(GEP->getSourceElementType(),
+ getSCEV(GEP->getPointerOperand()),
+ IndexExprs, GEP->isInBounds());
}
-/// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
-/// guaranteed to end in (at every loop iteration). It is, at the same time,
-/// the minimum number of times S is divisible by 2. For example, given {4,+,8}
-/// it returns 2. If S is guaranteed to be 0, it returns the bitwidth of S.
uint32_t
ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
@@ -4180,8 +4414,7 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
return 0;
}
-/// GetRangeFromMetadata - Helper method to assign a range to V from
-/// metadata present in the IR.
+/// Helper method to assign a range to V from metadata present in the IR.
static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
if (MDNode *MD = I->getMetadata(LLVMContext::MD_range))
@@ -4190,10 +4423,9 @@ static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
return None;
}
-/// getRange - Determine the range for a particular SCEV. If SignHint is
+/// Determine the range for a particular SCEV. If SignHint is
/// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges
/// with a "cleaner" unsigned (resp. signed) representation.
-///
ConstantRange
ScalarEvolution::getRange(const SCEV *S,
ScalarEvolution::RangeSignHint SignHint) {
@@ -4282,7 +4514,7 @@ ScalarEvolution::getRange(const SCEV *S,
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
// If there's no unsigned wrap, the value will never be less than its
// initial value.
- if (AddRec->getNoWrapFlags(SCEV::FlagNUW))
+ if (AddRec->hasNoUnsignedWrap())
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart()))
if (!C->getValue()->isZero())
ConservativeResult = ConservativeResult.intersectWith(
@@ -4290,7 +4522,7 @@ ScalarEvolution::getRange(const SCEV *S,
// If there's no signed wrap, and all the operands have the same sign or
// zero, the value won't ever change sign.
- if (AddRec->getNoWrapFlags(SCEV::FlagNSW)) {
+ if (AddRec->hasNoSignedWrap()) {
bool AllNonNeg = true;
bool AllNonPos = true;
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
@@ -4309,66 +4541,22 @@ ScalarEvolution::getRange(const SCEV *S,
// TODO: non-affine addrec
if (AddRec->isAffine()) {
- Type *Ty = AddRec->getType();
const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
-
- // Check for overflow. This must be done with ConstantRange arithmetic
- // because we could be called from within the ScalarEvolution overflow
- // checking code.
-
- MaxBECount = getNoopOrZeroExtend(MaxBECount, Ty);
- ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
- ConstantRange ZExtMaxBECountRange =
- MaxBECountRange.zextOrTrunc(BitWidth * 2 + 1);
-
- const SCEV *Start = AddRec->getStart();
- const SCEV *Step = AddRec->getStepRecurrence(*this);
- ConstantRange StepSRange = getSignedRange(Step);
- ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2 + 1);
-
- ConstantRange StartURange = getUnsignedRange(Start);
- ConstantRange EndURange =
- StartURange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for unsigned overflow.
- ConstantRange ZExtStartURange =
- StartURange.zextOrTrunc(BitWidth * 2 + 1);
- ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2 + 1);
- if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- ZExtEndURange) {
- APInt Min = APIntOps::umin(StartURange.getUnsignedMin(),
- EndURange.getUnsignedMin());
- APInt Max = APIntOps::umax(StartURange.getUnsignedMax(),
- EndURange.getUnsignedMax());
- bool IsFullRange = Min.isMinValue() && Max.isMaxValue();
- if (!IsFullRange)
- ConservativeResult =
- ConservativeResult.intersectWith(ConstantRange(Min, Max + 1));
- }
-
- ConstantRange StartSRange = getSignedRange(Start);
- ConstantRange EndSRange =
- StartSRange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for signed overflow. This must be done with ConstantRange
- // arithmetic because we could be called from within the ScalarEvolution
- // overflow checking code.
- ConstantRange SExtStartSRange =
- StartSRange.sextOrTrunc(BitWidth * 2 + 1);
- ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2 + 1);
- if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- SExtEndSRange) {
- APInt Min = APIntOps::smin(StartSRange.getSignedMin(),
- EndSRange.getSignedMin());
- APInt Max = APIntOps::smax(StartSRange.getSignedMax(),
- EndSRange.getSignedMax());
- bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue();
- if (!IsFullRange)
- ConservativeResult =
- ConservativeResult.intersectWith(ConstantRange(Min, Max + 1));
- }
+ auto RangeFromAffine = getRangeForAffineAR(
+ AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount,
+ BitWidth);
+ if (!RangeFromAffine.isFullSet())
+ ConservativeResult =
+ ConservativeResult.intersectWith(RangeFromAffine);
+
+ auto RangeFromFactoring = getRangeViaFactoring(
+ AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount,
+ BitWidth);
+ if (!RangeFromFactoring.isFullSet())
+ ConservativeResult =
+ ConservativeResult.intersectWith(RangeFromFactoring);
}
}
@@ -4408,6 +4596,186 @@ ScalarEvolution::getRange(const SCEV *S,
return setRange(S, SignHint, ConservativeResult);
}
+ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
+ const SCEV *Step,
+ const SCEV *MaxBECount,
+ unsigned BitWidth) {
+ assert(!isa<SCEVCouldNotCompute>(MaxBECount) &&
+ getTypeSizeInBits(MaxBECount->getType()) <= BitWidth &&
+ "Precondition!");
+
+ ConstantRange Result(BitWidth, /* isFullSet = */ true);
+
+ // Check for overflow. This must be done with ConstantRange arithmetic
+ // because we could be called from within the ScalarEvolution overflow
+ // checking code.
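+  //
+  // For example, for an i8 addrec {200,+,10} with a max backedge-taken count
+  // of 10, the narrow end range wraps around (200 + 10 * 10 = 300 > 255), so
+  // the widened unsigned comparison below fails to match and no unsigned
+  // bound is recorded.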
+
+ MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType());
+ ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
+ ConstantRange ZExtMaxBECountRange =
+ MaxBECountRange.zextOrTrunc(BitWidth * 2 + 1);
+
+ ConstantRange StepSRange = getSignedRange(Step);
+ ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2 + 1);
+
+ ConstantRange StartURange = getUnsignedRange(Start);
+ ConstantRange EndURange =
+ StartURange.add(MaxBECountRange.multiply(StepSRange));
+
+ // Check for unsigned overflow.
+ ConstantRange ZExtStartURange = StartURange.zextOrTrunc(BitWidth * 2 + 1);
+ ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2 + 1);
+ if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
+ ZExtEndURange) {
+ APInt Min = APIntOps::umin(StartURange.getUnsignedMin(),
+ EndURange.getUnsignedMin());
+ APInt Max = APIntOps::umax(StartURange.getUnsignedMax(),
+ EndURange.getUnsignedMax());
+ bool IsFullRange = Min.isMinValue() && Max.isMaxValue();
+ if (!IsFullRange)
+      Result = Result.intersectWith(ConstantRange(Min, Max + 1));
+ }
+
+ ConstantRange StartSRange = getSignedRange(Start);
+ ConstantRange EndSRange =
+ StartSRange.add(MaxBECountRange.multiply(StepSRange));
+
+ // Check for signed overflow. This must be done with ConstantRange
+ // arithmetic because we could be called from within the ScalarEvolution
+ // overflow checking code.
+ ConstantRange SExtStartSRange = StartSRange.sextOrTrunc(BitWidth * 2 + 1);
+ ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2 + 1);
+ if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
+ SExtEndSRange) {
+ APInt Min =
+ APIntOps::smin(StartSRange.getSignedMin(), EndSRange.getSignedMin());
+ APInt Max =
+ APIntOps::smax(StartSRange.getSignedMax(), EndSRange.getSignedMax());
+ bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue();
+ if (!IsFullRange)
+      Result = Result.intersectWith(ConstantRange(Min, Max + 1));
+ }
+
+ return Result;
+}
+
+ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start,
+ const SCEV *Step,
+ const SCEV *MaxBECount,
+ unsigned BitWidth) {
+ // RangeOf({C?A:B,+,C?P:Q}) == RangeOf(C?{A,+,P}:{B,+,Q})
+ // == RangeOf({A,+,P}) union RangeOf({B,+,Q})
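+  //
+  // For example, with a max backedge-taken count of 3, the range of
+  // {C ? 2 : 10,+,C ? 1 : 20} is contained in
+  // RangeOf({2,+,1}) union RangeOf({10,+,20}) = [2, 6) union [10, 71),
+  // which ConstantRange represents conservatively as [2, 71).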
+
+ struct SelectPattern {
+ Value *Condition = nullptr;
+ APInt TrueValue;
+ APInt FalseValue;
+
+ explicit SelectPattern(ScalarEvolution &SE, unsigned BitWidth,
+ const SCEV *S) {
+ Optional<unsigned> CastOp;
+ APInt Offset(BitWidth, 0);
+
+ assert(SE.getTypeSizeInBits(S->getType()) == BitWidth &&
+ "Should be!");
+
+ // Peel off a constant offset:
+ if (auto *SA = dyn_cast<SCEVAddExpr>(S)) {
+ // In the future we could consider being smarter here and handle
+ // {Start+Step,+,Step} too.
+ if (SA->getNumOperands() != 2 || !isa<SCEVConstant>(SA->getOperand(0)))
+ return;
+
+ Offset = cast<SCEVConstant>(SA->getOperand(0))->getAPInt();
+ S = SA->getOperand(1);
+ }
+
+ // Peel off a cast operation
+ if (auto *SCast = dyn_cast<SCEVCastExpr>(S)) {
+ CastOp = SCast->getSCEVType();
+ S = SCast->getOperand();
+ }
+
+ using namespace llvm::PatternMatch;
+
+ auto *SU = dyn_cast<SCEVUnknown>(S);
+ const APInt *TrueVal, *FalseVal;
+ if (!SU ||
+ !match(SU->getValue(), m_Select(m_Value(Condition), m_APInt(TrueVal),
+ m_APInt(FalseVal)))) {
+ Condition = nullptr;
+ return;
+ }
+
+ TrueValue = *TrueVal;
+ FalseValue = *FalseVal;
+
+ // Re-apply the cast we peeled off earlier
+ if (CastOp.hasValue())
+ switch (*CastOp) {
+ default:
+ llvm_unreachable("Unknown SCEV cast type!");
+
+ case scTruncate:
+ TrueValue = TrueValue.trunc(BitWidth);
+ FalseValue = FalseValue.trunc(BitWidth);
+ break;
+ case scZeroExtend:
+ TrueValue = TrueValue.zext(BitWidth);
+ FalseValue = FalseValue.zext(BitWidth);
+ break;
+ case scSignExtend:
+ TrueValue = TrueValue.sext(BitWidth);
+ FalseValue = FalseValue.sext(BitWidth);
+ break;
+ }
+
+ // Re-apply the constant offset we peeled off earlier
+ TrueValue += Offset;
+ FalseValue += Offset;
+ }
+
+ bool isRecognized() { return Condition != nullptr; }
+ };
+
+ SelectPattern StartPattern(*this, BitWidth, Start);
+ if (!StartPattern.isRecognized())
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ SelectPattern StepPattern(*this, BitWidth, Step);
+ if (!StepPattern.isRecognized())
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ if (StartPattern.Condition != StepPattern.Condition) {
+ // We don't handle this case today; but we could, by considering four
+ // possibilities below instead of two. I'm not sure if there are cases where
+ // that will help over what getRange already does, though.
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+ }
+
+ // NB! Calling ScalarEvolution::getConstant is fine, but we should not try to
+ // construct arbitrary general SCEV expressions here. This function is called
+ // from deep in the call stack, and calling getSCEV (on a sext instruction,
+ // say) can end up caching a suboptimal value.
+
+ // FIXME: without the explicit `this` receiver below, MSVC errors out with
+ // C2352 and C2512 (otherwise it isn't needed).
+
+ const SCEV *TrueStart = this->getConstant(StartPattern.TrueValue);
+ const SCEV *TrueStep = this->getConstant(StepPattern.TrueValue);
+ const SCEV *FalseStart = this->getConstant(StartPattern.FalseValue);
+ const SCEV *FalseStep = this->getConstant(StepPattern.FalseValue);
+
+ ConstantRange TrueRange =
+ this->getRangeForAffineAR(TrueStart, TrueStep, MaxBECount, BitWidth);
+ ConstantRange FalseRange =
+ this->getRangeForAffineAR(FalseStart, FalseStep, MaxBECount, BitWidth);
+
+ return TrueRange.unionWith(FalseRange);
+}
+
SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) {
if (isa<ConstantExpr>(V)) return SCEV::FlagAnyWrap;
const BinaryOperator *BinOp = cast<BinaryOperator>(V);
@@ -4418,273 +4786,359 @@ SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) {
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
if (BinOp->hasNoSignedWrap())
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW);
- if (Flags == SCEV::FlagAnyWrap) {
+ if (Flags == SCEV::FlagAnyWrap)
return SCEV::FlagAnyWrap;
- }
- // Here we check that BinOp is in the header of the innermost loop
- // containing BinOp, since we only deal with instructions in the loop
- // header. The actual loop we need to check later will come from an add
- // recurrence, but getting that requires computing the SCEV of the operands,
- // which can be expensive. This check we can do cheaply to rule out some
- // cases early.
- Loop *innermostContainingLoop = LI.getLoopFor(BinOp->getParent());
- if (innermostContainingLoop == nullptr ||
- innermostContainingLoop->getHeader() != BinOp->getParent())
- return SCEV::FlagAnyWrap;
+ return isSCEVExprNeverPoison(BinOp) ? Flags : SCEV::FlagAnyWrap;
+}
- // Only proceed if we can prove that BinOp does not yield poison.
- if (!isKnownNotFullPoison(BinOp)) return SCEV::FlagAnyWrap;
+bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
+ // Here we check that I is in the header of the innermost loop containing I,
+ // since we only deal with instructions in the loop header. The actual loop we
+ // need to check later will come from an add recurrence, but getting that
+ // requires computing the SCEV of the operands, which can be expensive. This
+ // check we can do cheaply to rule out some cases early.
+ Loop *InnermostContainingLoop = LI.getLoopFor(I->getParent());
+ if (InnermostContainingLoop == nullptr ||
+ InnermostContainingLoop->getHeader() != I->getParent())
+ return false;
+
+ // Only proceed if we can prove that I does not yield poison.
+ if (!isKnownNotFullPoison(I)) return false;
- // At this point we know that if V is executed, then it does not wrap
- // according to at least one of NSW or NUW. If V is not executed, then we do
- // not know if the calculation that V represents would wrap. Multiple
- // instructions can map to the same SCEV. If we apply NSW or NUW from V to
+ // At this point we know that if I is executed, then it does not wrap
+ // according to at least one of NSW or NUW. If I is not executed, then we do
+ // not know if the calculation that I represents would wrap. Multiple
+ // instructions can map to the same SCEV. If we apply NSW or NUW from I to
// the SCEV, we must guarantee no wrapping for that SCEV also when it is
// derived from other instructions that map to the same SCEV. We cannot make
- // that guarantee for cases where V is not executed. So we need to find the
- // loop that V is considered in relation to and prove that V is executed for
- // every iteration of that loop. That implies that the value that V
+ // that guarantee for cases where I is not executed. So we need to find the
+ // loop that I is considered in relation to and prove that I is executed for
+ // every iteration of that loop. That implies that the value that I
// calculates does not wrap anywhere in the loop, so then we can apply the
// flags to the SCEV.
//
- // We check isLoopInvariant to disambiguate in case we are adding two
- // recurrences from different loops, so that we know which loop to prove
- // that V is executed in.
- for (int OpIndex = 0; OpIndex < 2; ++OpIndex) {
- const SCEV *Op = getSCEV(BinOp->getOperand(OpIndex));
+ // We check isLoopInvariant to disambiguate in case we are adding recurrences
+ // from different loops, so that we know which loop to prove that I is
+ // executed in.
+ for (unsigned OpIndex = 0; OpIndex < I->getNumOperands(); ++OpIndex) {
+ const SCEV *Op = getSCEV(I->getOperand(OpIndex));
if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
- const int OtherOpIndex = 1 - OpIndex;
- const SCEV *OtherOp = getSCEV(BinOp->getOperand(OtherOpIndex));
- if (isLoopInvariant(OtherOp, AddRec->getLoop()) &&
- isGuaranteedToExecuteForEveryIteration(BinOp, AddRec->getLoop()))
- return Flags;
+ bool AllOtherOpsLoopInvariant = true;
+ for (unsigned OtherOpIndex = 0; OtherOpIndex < I->getNumOperands();
+ ++OtherOpIndex) {
+ if (OtherOpIndex != OpIndex) {
+ const SCEV *OtherOp = getSCEV(I->getOperand(OtherOpIndex));
+ if (!isLoopInvariant(OtherOp, AddRec->getLoop())) {
+ AllOtherOpsLoopInvariant = false;
+ break;
+ }
+ }
+ }
+ if (AllOtherOpsLoopInvariant &&
+ isGuaranteedToExecuteForEveryIteration(I, AddRec->getLoop()))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
+  // If we know that \c I can never be poison, period, then that's enough.
+ if (isSCEVExprNeverPoison(I))
+ return true;
+
+ // For an add recurrence specifically, we assume that infinite loops without
+ // side effects are undefined behavior, and then reason as follows:
+ //
+ // If the add recurrence is poison in any iteration, it is poison on all
+ // future iterations (since incrementing poison yields poison). If the result
+ // of the add recurrence is fed into the loop latch condition and the loop
+ // does not contain any throws or exiting blocks other than the latch, we now
+ // have the ability to "choose" whether the backedge is taken or not (by
+ // choosing a sufficiently evil value for the poison feeding into the branch)
+ // for every iteration including and after the one in which \p I first became
+  // poison. There are two possibilities (let K be the iteration in which
+  // \p I first became poison):
+ //
+ // 1. In the set of iterations including and after K, the loop body executes
+  //    no side effects. In this case executing the backedge an infinite
+  //    number of times will yield undefined behavior.
+ //
+ // 2. In the set of iterations including and after K, the loop body executes
+ // at least one side effect. In this case, that specific instance of side
+ // effect is control dependent on poison, which also yields undefined
+ // behavior.
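+  //
+  // As a concrete (hypothetical) illustration, in
+  //
+  //   loop:
+  //     %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  //     %iv.next = add i32 %iv, 1
+  //     %cmp = icmp slt i32 %iv.next, %n
+  //     br i1 %cmp, label %loop, label %exit
+  //
+  // a poison %iv.next makes %cmp poison, which makes the latch branch control
+  // dependent on poison; the walk below detects exactly this situation.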
+
+ auto *ExitingBB = L->getExitingBlock();
+ auto *LatchBB = L->getLoopLatch();
+ if (!ExitingBB || !LatchBB || ExitingBB != LatchBB)
+ return false;
+
+ SmallPtrSet<const Instruction *, 16> Pushed;
+ SmallVector<const Instruction *, 8> PoisonStack;
+
+ // We start by assuming \c I, the post-inc add recurrence, is poison. Only
+ // things that are known to be fully poison under that assumption go on the
+ // PoisonStack.
+ Pushed.insert(I);
+ PoisonStack.push_back(I);
+
+ bool LatchControlDependentOnPoison = false;
+ while (!PoisonStack.empty() && !LatchControlDependentOnPoison) {
+ const Instruction *Poison = PoisonStack.pop_back_val();
+
+ for (auto *PoisonUser : Poison->users()) {
+ if (propagatesFullPoison(cast<Instruction>(PoisonUser))) {
+ if (Pushed.insert(cast<Instruction>(PoisonUser)).second)
+ PoisonStack.push_back(cast<Instruction>(PoisonUser));
+ } else if (auto *BI = dyn_cast<BranchInst>(PoisonUser)) {
+ assert(BI->isConditional() && "Only possibility!");
+ if (BI->getParent() == LatchBB) {
+ LatchControlDependentOnPoison = true;
+ break;
+ }
+ }
}
}
- return SCEV::FlagAnyWrap;
+
+ return LatchControlDependentOnPoison && loopHasNoAbnormalExits(L);
+}
+
+bool ScalarEvolution::loopHasNoAbnormalExits(const Loop *L) {
+ auto Itr = LoopHasNoAbnormalExits.find(L);
+ if (Itr == LoopHasNoAbnormalExits.end()) {
+ auto NoAbnormalExitInBB = [&](BasicBlock *BB) {
+ return all_of(*BB, [](Instruction &I) {
+ return isGuaranteedToTransferExecutionToSuccessor(&I);
+ });
+ };
+
+ auto InsertPair = LoopHasNoAbnormalExits.insert(
+ {L, all_of(L->getBlocks(), NoAbnormalExitInBB)});
+ assert(InsertPair.second && "We just checked!");
+ Itr = InsertPair.first;
+ }
+
+ return Itr->second;
}
-/// createSCEV - We know that there is no SCEV for the specified value. Analyze
-/// the expression.
-///
const SCEV *ScalarEvolution::createSCEV(Value *V) {
if (!isSCEVable(V->getType()))
return getUnknown(V);
- unsigned Opcode = Instruction::UserOp1;
if (Instruction *I = dyn_cast<Instruction>(V)) {
- Opcode = I->getOpcode();
-
// Don't attempt to analyze instructions in blocks that aren't
// reachable. Such instructions don't matter, and they aren't required
// to obey basic rules for definitions dominating uses which this
// analysis depends on.
if (!DT.isReachableFromEntry(I->getParent()))
return getUnknown(V);
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- Opcode = CE->getOpcode();
- else if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ } else if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
return getConstant(CI);
else if (isa<ConstantPointerNull>(V))
return getZero(V->getType());
else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
- return GA->mayBeOverridden() ? getUnknown(V) : getSCEV(GA->getAliasee());
- else
+ return GA->isInterposable() ? getUnknown(V) : getSCEV(GA->getAliasee());
+ else if (!isa<ConstantExpr>(V))
return getUnknown(V);
Operator *U = cast<Operator>(V);
- switch (Opcode) {
- case Instruction::Add: {
- // The simple thing to do would be to just call getSCEV on both operands
- // and call getAddExpr with the result. However if we're looking at a
- // bunch of things all added together, this can be quite inefficient,
- // because it leads to N-1 getAddExpr calls for N ultimate operands.
- // Instead, gather up all the operands and make a single getAddExpr call.
- // LLVM IR canonical form means we need only traverse the left operands.
- SmallVector<const SCEV *, 4> AddOps;
- for (Value *Op = U;; Op = U->getOperand(0)) {
- U = dyn_cast<Operator>(Op);
- unsigned Opcode = U ? U->getOpcode() : 0;
- if (!U || (Opcode != Instruction::Add && Opcode != Instruction::Sub)) {
- assert(Op != V && "V should be an add");
- AddOps.push_back(getSCEV(Op));
- break;
- }
+ if (auto BO = MatchBinaryOp(U, DT)) {
+ switch (BO->Opcode) {
+ case Instruction::Add: {
+ // The simple thing to do would be to just call getSCEV on both operands
+ // and call getAddExpr with the result. However if we're looking at a
+ // bunch of things all added together, this can be quite inefficient,
+ // because it leads to N-1 getAddExpr calls for N ultimate operands.
+ // Instead, gather up all the operands and make a single getAddExpr call.
+ // LLVM IR canonical form means we need only traverse the left operands.
+ SmallVector<const SCEV *, 4> AddOps;
+ do {
+ if (BO->Op) {
+ if (auto *OpSCEV = getExistingSCEV(BO->Op)) {
+ AddOps.push_back(OpSCEV);
+ break;
+ }
- if (auto *OpSCEV = getExistingSCEV(U)) {
- AddOps.push_back(OpSCEV);
- break;
- }
+ // If a NUW or NSW flag can be applied to the SCEV for this
+ // addition, then compute the SCEV for this addition by itself
+ // with a separate call to getAddExpr. We need to do that
+ // instead of pushing the operands of the addition onto AddOps,
+ // since the flags are only known to apply to this particular
+ // addition - they may not apply to other additions that can be
+ // formed with operands from AddOps.
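+          //
+          // For example, for (a +nsw b) + c the nsw only covers a + b, so we
+          // emit getAddExpr(a, b, nsw) as one operand here instead of
+          // flattening it into {a, b, c}.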
+ const SCEV *RHS = getSCEV(BO->RHS);
+ SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op);
+ if (Flags != SCEV::FlagAnyWrap) {
+ const SCEV *LHS = getSCEV(BO->LHS);
+ if (BO->Opcode == Instruction::Sub)
+ AddOps.push_back(getMinusSCEV(LHS, RHS, Flags));
+ else
+ AddOps.push_back(getAddExpr(LHS, RHS, Flags));
+ break;
+ }
+ }
- // If a NUW or NSW flag can be applied to the SCEV for this
- // addition, then compute the SCEV for this addition by itself
- // with a separate call to getAddExpr. We need to do that
- // instead of pushing the operands of the addition onto AddOps,
- // since the flags are only known to apply to this particular
- // addition - they may not apply to other additions that can be
- // formed with operands from AddOps.
- const SCEV *RHS = getSCEV(U->getOperand(1));
- SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U);
- if (Flags != SCEV::FlagAnyWrap) {
- const SCEV *LHS = getSCEV(U->getOperand(0));
- if (Opcode == Instruction::Sub)
- AddOps.push_back(getMinusSCEV(LHS, RHS, Flags));
+ if (BO->Opcode == Instruction::Sub)
+ AddOps.push_back(getNegativeSCEV(getSCEV(BO->RHS)));
else
- AddOps.push_back(getAddExpr(LHS, RHS, Flags));
- break;
- }
+ AddOps.push_back(getSCEV(BO->RHS));
- if (Opcode == Instruction::Sub)
- AddOps.push_back(getNegativeSCEV(RHS));
- else
- AddOps.push_back(RHS);
+ auto NewBO = MatchBinaryOp(BO->LHS, DT);
+ if (!NewBO || (NewBO->Opcode != Instruction::Add &&
+ NewBO->Opcode != Instruction::Sub)) {
+ AddOps.push_back(getSCEV(BO->LHS));
+ break;
+ }
+ BO = NewBO;
+ } while (true);
+
+ return getAddExpr(AddOps);
}
- return getAddExpr(AddOps);
- }
- case Instruction::Mul: {
- SmallVector<const SCEV *, 4> MulOps;
- for (Value *Op = U;; Op = U->getOperand(0)) {
- U = dyn_cast<Operator>(Op);
- if (!U || U->getOpcode() != Instruction::Mul) {
- assert(Op != V && "V should be a mul");
- MulOps.push_back(getSCEV(Op));
- break;
- }
+ case Instruction::Mul: {
+ SmallVector<const SCEV *, 4> MulOps;
+ do {
+ if (BO->Op) {
+ if (auto *OpSCEV = getExistingSCEV(BO->Op)) {
+ MulOps.push_back(OpSCEV);
+ break;
+ }
- if (auto *OpSCEV = getExistingSCEV(U)) {
- MulOps.push_back(OpSCEV);
- break;
- }
+ SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op);
+ if (Flags != SCEV::FlagAnyWrap) {
+ MulOps.push_back(
+ getMulExpr(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags));
+ break;
+ }
+ }
- SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U);
- if (Flags != SCEV::FlagAnyWrap) {
- MulOps.push_back(getMulExpr(getSCEV(U->getOperand(0)),
- getSCEV(U->getOperand(1)), Flags));
- break;
+ MulOps.push_back(getSCEV(BO->RHS));
+ auto NewBO = MatchBinaryOp(BO->LHS, DT);
+ if (!NewBO || NewBO->Opcode != Instruction::Mul) {
+ MulOps.push_back(getSCEV(BO->LHS));
+ break;
+ }
+ BO = NewBO;
+ } while (true);
+
+ return getMulExpr(MulOps);
+ }
+ case Instruction::UDiv:
+ return getUDivExpr(getSCEV(BO->LHS), getSCEV(BO->RHS));
+ case Instruction::Sub: {
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
+ if (BO->Op)
+ Flags = getNoWrapFlagsFromUB(BO->Op);
+ return getMinusSCEV(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags);
+ }
+ case Instruction::And:
+ // For an expression like x&255 that merely masks off the high bits,
+ // use zext(trunc(x)) as the SCEV expression.
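+      // When the mask also has trailing zero bits, e.g. x&0xF0, the low bits
+      // are first stripped with an exact divide, giving
+      // zext(trunc(x /u 16)) * 16 (this is what MulCount below accounts for).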
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
+ if (CI->isNullValue())
+ return getSCEV(BO->RHS);
+ if (CI->isAllOnesValue())
+ return getSCEV(BO->LHS);
+ const APInt &A = CI->getValue();
+
+ // Instcombine's ShrinkDemandedConstant may strip bits out of
+ // constants, obscuring what would otherwise be a low-bits mask.
+ // Use computeKnownBits to compute what ShrinkDemandedConstant
+ // knew about to reconstruct a low-bits mask value.
+ unsigned LZ = A.countLeadingZeros();
+ unsigned TZ = A.countTrailingZeros();
+ unsigned BitWidth = A.getBitWidth();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ computeKnownBits(BO->LHS, KnownZero, KnownOne, getDataLayout(),
+ 0, &AC, nullptr, &DT);
+
+ APInt EffectiveMask =
+ APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
+ if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) {
+ const SCEV *MulCount = getConstant(ConstantInt::get(
+ getContext(), APInt::getOneBitSet(BitWidth, TZ)));
+ return getMulExpr(
+ getZeroExtendExpr(
+ getTruncateExpr(
+ getUDivExactExpr(getSCEV(BO->LHS), MulCount),
+ IntegerType::get(getContext(), BitWidth - LZ - TZ)),
+ BO->LHS->getType()),
+ MulCount);
+ }
}
+ break;
- MulOps.push_back(getSCEV(U->getOperand(1)));
- }
- return getMulExpr(MulOps);
- }
- case Instruction::UDiv:
- return getUDivExpr(getSCEV(U->getOperand(0)),
- getSCEV(U->getOperand(1)));
- case Instruction::Sub:
- return getMinusSCEV(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1)),
- getNoWrapFlagsFromUB(U));
- case Instruction::And:
- // For an expression like x&255 that merely masks off the high bits,
- // use zext(trunc(x)) as the SCEV expression.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
- if (CI->isNullValue())
- return getSCEV(U->getOperand(1));
- if (CI->isAllOnesValue())
- return getSCEV(U->getOperand(0));
- const APInt &A = CI->getValue();
-
- // Instcombine's ShrinkDemandedConstant may strip bits out of
- // constants, obscuring what would otherwise be a low-bits mask.
- // Use computeKnownBits to compute what ShrinkDemandedConstant
- // knew about to reconstruct a low-bits mask value.
- unsigned LZ = A.countLeadingZeros();
- unsigned TZ = A.countTrailingZeros();
- unsigned BitWidth = A.getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(U->getOperand(0), KnownZero, KnownOne, getDataLayout(),
- 0, &AC, nullptr, &DT);
-
- APInt EffectiveMask =
- APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
- if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) {
- const SCEV *MulCount = getConstant(
- ConstantInt::get(getContext(), APInt::getOneBitSet(BitWidth, TZ)));
- return getMulExpr(
- getZeroExtendExpr(
- getTruncateExpr(
- getUDivExactExpr(getSCEV(U->getOperand(0)), MulCount),
- IntegerType::get(getContext(), BitWidth - LZ - TZ)),
- U->getType()),
- MulCount);
+ case Instruction::Or:
+ // If the RHS of the Or is a constant, we may have something like:
+ // X*4+1 which got turned into X*4|1. Handle this as an Add so loop
+ // optimizations will transparently handle this case.
+ //
+ // In order for this transformation to be safe, the LHS must be of the
+ // form X*(2^n) and the Or constant must be less than 2^n.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
+ const SCEV *LHS = getSCEV(BO->LHS);
+ const APInt &CIVal = CI->getValue();
+ if (GetMinTrailingZeros(LHS) >=
+ (CIVal.getBitWidth() - CIVal.countLeadingZeros())) {
+ // Build a plain add SCEV.
+ const SCEV *S = getAddExpr(LHS, getSCEV(CI));
+ // If the LHS of the add was an addrec and it has no-wrap flags,
+ // transfer the no-wrap flags, since an or won't introduce a wrap.
+ if (const SCEVAddRecExpr *NewAR = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEVAddRecExpr *OldAR = cast<SCEVAddRecExpr>(LHS);
+ const_cast<SCEVAddRecExpr *>(NewAR)->setNoWrapFlags(
+ OldAR->getNoWrapFlags());
+ }
+ return S;
+ }
}
- }
- break;
+ break;
- case Instruction::Or:
- // If the RHS of the Or is a constant, we may have something like:
- // X*4+1 which got turned into X*4|1. Handle this as an Add so loop
- // optimizations will transparently handle this case.
- //
- // In order for this transformation to be safe, the LHS must be of the
- // form X*(2^n) and the Or constant must be less than 2^n.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
- const SCEV *LHS = getSCEV(U->getOperand(0));
- const APInt &CIVal = CI->getValue();
- if (GetMinTrailingZeros(LHS) >=
- (CIVal.getBitWidth() - CIVal.countLeadingZeros())) {
- // Build a plain add SCEV.
- const SCEV *S = getAddExpr(LHS, getSCEV(CI));
- // If the LHS of the add was an addrec and it has no-wrap flags,
- // transfer the no-wrap flags, since an or won't introduce a wrap.
- if (const SCEVAddRecExpr *NewAR = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEVAddRecExpr *OldAR = cast<SCEVAddRecExpr>(LHS);
- const_cast<SCEVAddRecExpr *>(NewAR)->setNoWrapFlags(
- OldAR->getNoWrapFlags());
- }
- return S;
+ case Instruction::Xor:
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
+ // If the RHS of xor is -1, then this is a not operation.
+ if (CI->isAllOnesValue())
+ return getNotSCEV(getSCEV(BO->LHS));
+
+ // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
+ // This is a variant of the check for xor with -1, and it handles
+ // the case where instcombine has trimmed non-demanded bits out
+ // of an xor with -1.
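+        //
+        // For example, (x & 255) ^ 255, which is how instcombine spells
+        // ~x & 255, becomes zext(~trunc(x to i8)) here, since the inner and
+        // is already modeled as zext(trunc(x to i8)).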
+ if (auto *LBO = dyn_cast<BinaryOperator>(BO->LHS))
+ if (ConstantInt *LCI = dyn_cast<ConstantInt>(LBO->getOperand(1)))
+ if (LBO->getOpcode() == Instruction::And &&
+ LCI->getValue() == CI->getValue())
+ if (const SCEVZeroExtendExpr *Z =
+ dyn_cast<SCEVZeroExtendExpr>(getSCEV(BO->LHS))) {
+ Type *UTy = BO->LHS->getType();
+ const SCEV *Z0 = Z->getOperand();
+ Type *Z0Ty = Z0->getType();
+ unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
+
+ // If C is a low-bits mask, the zero extend is serving to
+ // mask off the high bits. Complement the operand and
+ // re-apply the zext.
+ if (APIntOps::isMask(Z0TySize, CI->getValue()))
+ return getZeroExtendExpr(getNotSCEV(Z0), UTy);
+
+ // If C is a single bit, it may be in the sign-bit position
+ // before the zero-extend. In this case, represent the xor
+ // using an add, which is equivalent, and re-apply the zext.
+ APInt Trunc = CI->getValue().trunc(Z0TySize);
+ if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() &&
+ Trunc.isSignBit())
+ return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)),
+ UTy);
+ }
}
- }
- break;
- case Instruction::Xor:
- if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
- // If the RHS of the xor is a signbit, then this is just an add.
- // Instcombine turns add of signbit into xor as a strength reduction step.
- if (CI->getValue().isSignBit())
- return getAddExpr(getSCEV(U->getOperand(0)),
- getSCEV(U->getOperand(1)));
-
- // If the RHS of xor is -1, then this is a not operation.
- if (CI->isAllOnesValue())
- return getNotSCEV(getSCEV(U->getOperand(0)));
-
- // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
- // This is a variant of the check for xor with -1, and it handles
- // the case where instcombine has trimmed non-demanded bits out
- // of an xor with -1.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0)))
- if (ConstantInt *LCI = dyn_cast<ConstantInt>(BO->getOperand(1)))
- if (BO->getOpcode() == Instruction::And &&
- LCI->getValue() == CI->getValue())
- if (const SCEVZeroExtendExpr *Z =
- dyn_cast<SCEVZeroExtendExpr>(getSCEV(U->getOperand(0)))) {
- Type *UTy = U->getType();
- const SCEV *Z0 = Z->getOperand();
- Type *Z0Ty = Z0->getType();
- unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
-
- // If C is a low-bits mask, the zero extend is serving to
- // mask off the high bits. Complement the operand and
- // re-apply the zext.
- if (APIntOps::isMask(Z0TySize, CI->getValue()))
- return getZeroExtendExpr(getNotSCEV(Z0), UTy);
-
- // If C is a single bit, it may be in the sign-bit position
- // before the zero-extend. In this case, represent the xor
- // using an add, which is equivalent, and re-apply the zext.
- APInt Trunc = CI->getValue().trunc(Z0TySize);
- if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() &&
- Trunc.isSignBit())
- return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)),
- UTy);
- }
- }
- break;
+ break;
case Instruction::Shl:
// Turn shift left of a constant amount into a multiply.
- if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
- uint32_t BitWidth = cast<IntegerType>(U->getType())->getBitWidth();
+ if (ConstantInt *SA = dyn_cast<ConstantInt>(BO->RHS)) {
+ uint32_t BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
// If the shift count is not less than the bitwidth, the result of
// the shift is undefined. Don't try to analyze it, because the
@@ -4700,58 +5154,43 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html
// and http://reviews.llvm.org/D8890 .
auto Flags = SCEV::FlagAnyWrap;
- if (SA->getValue().ult(BitWidth - 1)) Flags = getNoWrapFlagsFromUB(U);
+ if (BO->Op && SA->getValue().ult(BitWidth - 1))
+ Flags = getNoWrapFlagsFromUB(BO->Op);
Constant *X = ConstantInt::get(getContext(),
APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
- return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X), Flags);
+ return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags);
}
break;
- case Instruction::LShr:
- // Turn logical shift right of a constant into a unsigned divide.
- if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
- uint32_t BitWidth = cast<IntegerType>(U->getType())->getBitWidth();
-
- // If the shift count is not less than the bitwidth, the result of
- // the shift is undefined. Don't try to analyze it, because the
- // resolution chosen here may differ from the resolution chosen in
- // other parts of the compiler.
- if (SA->getValue().uge(BitWidth))
- break;
+ case Instruction::AShr:
+ // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
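+      // For example, on i32, (x << 24) ashr 24 is modeled as
+      // sext(trunc(x to i8) to i32).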
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS))
+ if (Operator *L = dyn_cast<Operator>(BO->LHS))
+ if (L->getOpcode() == Instruction::Shl &&
+ L->getOperand(1) == BO->RHS) {
+ uint64_t BitWidth = getTypeSizeInBits(BO->LHS->getType());
+
+ // If the shift count is not less than the bitwidth, the result of
+ // the shift is undefined. Don't try to analyze it, because the
+ // resolution chosen here may differ from the resolution chosen in
+ // other parts of the compiler.
+ if (CI->getValue().uge(BitWidth))
+ break;
- Constant *X = ConstantInt::get(getContext(),
- APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
- return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(X));
+ uint64_t Amt = BitWidth - CI->getZExtValue();
+ if (Amt == BitWidth)
+ return getSCEV(L->getOperand(0)); // shift by zero --> noop
+ return getSignExtendExpr(
+ getTruncateExpr(getSCEV(L->getOperand(0)),
+ IntegerType::get(getContext(), Amt)),
+ BO->LHS->getType());
+ }
+ break;
}
- break;
-
- case Instruction::AShr:
- // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1)))
- if (Operator *L = dyn_cast<Operator>(U->getOperand(0)))
- if (L->getOpcode() == Instruction::Shl &&
- L->getOperand(1) == U->getOperand(1)) {
- uint64_t BitWidth = getTypeSizeInBits(U->getType());
-
- // If the shift count is not less than the bitwidth, the result of
- // the shift is undefined. Don't try to analyze it, because the
- // resolution chosen here may differ from the resolution chosen in
- // other parts of the compiler.
- if (CI->getValue().uge(BitWidth))
- break;
-
- uint64_t Amt = BitWidth - CI->getZExtValue();
- if (Amt == BitWidth)
- return getSCEV(L->getOperand(0)); // shift by zero --> noop
- return
- getSignExtendExpr(getTruncateExpr(getSCEV(L->getOperand(0)),
- IntegerType::get(getContext(),
- Amt)),
- U->getType());
- }
- break;
+ }
+ switch (U->getOpcode()) {
case Instruction::Trunc:
return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
@@ -4786,8 +5225,12 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
if (isa<Instruction>(U))
return createNodeForSelectOrPHI(cast<Instruction>(U), U->getOperand(0),
U->getOperand(1), U->getOperand(2));
+ break;
- default: // We cannot analyze this expression.
+ case Instruction::Call:
+ case Instruction::Invoke:
+ if (Value *RV = CallSite(U).getReturnedArgOperand())
+ return getSCEV(RV);
break;
}
@@ -4808,16 +5251,6 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
return 0;
}
-/// getSmallConstantTripCount - Returns the maximum trip count of this loop as a
-/// normal unsigned value. Returns 0 if the trip count is unknown or not
-/// constant. Will also return 0 if the maximum trip count is very large (>=
-/// 2^32).
-///
-/// This "trip count" assumes that control exits via ExitingBlock. More
-/// precisely, it is the number of times that control may reach ExitingBlock
-/// before taking the branch. For loops with multiple exits, it may not be the
-/// number times that the loop header executes because the loop may exit
-/// prematurely via another branch.
unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
@@ -4846,10 +5279,10 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
return 0;
}
-/// getSmallConstantTripMultiple - Returns the largest constant divisor of the
-/// trip count of this loop as a normal unsigned value, if possible. This
-/// means that the actual trip count is always a multiple of the returned
-/// value (don't forget the trip count could very well be zero as well!).
+/// Returns the largest constant divisor of the trip count of this loop as a
+/// normal unsigned value, if possible. This means that the actual trip count is
+/// always a multiple of the returned value (don't forget the trip count could
+/// very well be zero as well!).
///
/// Returns 1 if the trip count is unknown or not guaranteed to be the
/// multiple of a constant (which is also the case if the trip count is simply
@@ -4891,37 +5324,30 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
return (unsigned)Result->getZExtValue();
}
-// getExitCount - Get the expression for the number of loop iterations for which
-// this loop is guaranteed not to exit via ExitingBlock. Otherwise return
-// SCEVCouldNotCompute.
+/// Get the expression for the number of loop iterations for which this loop is
+/// guaranteed not to exit via ExitingBlock. Otherwise return
+/// SCEVCouldNotCompute.
const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
}
-/// getBackedgeTakenCount - If the specified loop has a predictable
-/// backedge-taken count, return it, otherwise return a SCEVCouldNotCompute
-/// object. The backedge-taken count is the number of times the loop header
-/// will be branched to from within the loop. This is one less than the
-/// trip count of the loop, since it doesn't count the first iteration,
-/// when the header is branched to from outside the loop.
-///
-/// Note that it is not valid to call this method on a loop without a
-/// loop-invariant backedge-taken count (see
-/// hasLoopInvariantBackedgeTakenCount).
-///
+const SCEV *
+ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L,
+ SCEVUnionPredicate &Preds) {
+ return getPredicatedBackedgeTakenInfo(L).getExact(this, &Preds);
+}
+
const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).getExact(this);
}
-/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
-/// return the least SCEV value that is known never to be less than the
-/// actual backedge taken count.
+/// Similar to getBackedgeTakenCount, except return the least SCEV value that is
+/// known never to be less than the actual backedge taken count.
const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).getMax(this);
}
-/// PushLoopPHIs - Push PHI nodes in the header of the given loop
-/// onto the given Worklist.
+/// Push PHI nodes in the header of the given loop onto the given Worklist.
static void
PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
BasicBlock *Header = L->getHeader();
@@ -4933,6 +5359,23 @@ PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
}
const ScalarEvolution::BackedgeTakenInfo &
+ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) {
+ auto &BTI = getBackedgeTakenInfo(L);
+ if (BTI.hasFullInfo())
+ return BTI;
+
+ auto Pair = PredicatedBackedgeTakenCounts.insert({L, BackedgeTakenInfo()});
+
+ if (!Pair.second)
+ return Pair.first->second;
+
+ BackedgeTakenInfo Result =
+ computeBackedgeTakenCount(L, /*AllowPredicates=*/true);
+
+ return PredicatedBackedgeTakenCounts.find(L)->second = Result;
+}
+
+const ScalarEvolution::BackedgeTakenInfo &
ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// Initially insert an invalid entry for this loop. If the insertion
// succeeds, proceed to actually compute a backedge-taken count and
@@ -4940,7 +5383,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// code elsewhere that it shouldn't attempt to request a new
// backedge-taken count, which could result in infinite recursion.
std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
- BackedgeTakenCounts.insert(std::make_pair(L, BackedgeTakenInfo()));
+ BackedgeTakenCounts.insert({L, BackedgeTakenInfo()});
if (!Pair.second)
return Pair.first->second;
@@ -5007,17 +5450,19 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
return BackedgeTakenCounts.find(L)->second = Result;
}
-/// forgetLoop - This method should be called by the client when it has
-/// changed a loop in a way that may effect ScalarEvolution's ability to
-/// compute a trip count, or if the loop is deleted.
void ScalarEvolution::forgetLoop(const Loop *L) {
// Drop any stored trip count value.
- DenseMap<const Loop*, BackedgeTakenInfo>::iterator BTCPos =
- BackedgeTakenCounts.find(L);
- if (BTCPos != BackedgeTakenCounts.end()) {
- BTCPos->second.clear();
- BackedgeTakenCounts.erase(BTCPos);
- }
+ auto RemoveLoopFromBackedgeMap =
+ [L](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
+ auto BTCPos = Map.find(L);
+ if (BTCPos != Map.end()) {
+ BTCPos->second.clear();
+ Map.erase(BTCPos);
+ }
+ };
+
+ RemoveLoopFromBackedgeMap(BackedgeTakenCounts);
+ RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts);
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
@@ -5043,13 +5488,12 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
// Forget all contained loops too, to avoid dangling entries in the
// ValuesAtScopes map.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- forgetLoop(*I);
+ for (Loop *I : *L)
+ forgetLoop(I);
+
+ LoopHasNoAbnormalExits.erase(L);
}
-/// forgetValue - This method should be called by the client when it has
-/// changed a value in a way that may effect its value, or which may
-/// disconnect it from a def-use chain linking it to a loop.
void ScalarEvolution::forgetValue(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return;
@@ -5077,16 +5521,17 @@ void ScalarEvolution::forgetValue(Value *V) {
}
}
-/// getExact - Get the exact loop backedge taken count considering all loop
-/// exits. A computable result can only be returned for loops with a single
-/// exit. Returning the minimum taken count among all exits is incorrect
-/// because one of the loop's exit limit's may have been skipped. HowFarToZero
-/// assumes that the limit of each loop test is never skipped. This is a valid
-/// assumption as long as the loop exits via that test. For precise results, it
-/// is the caller's responsibility to specify the relevant loop exit using
+/// Get the exact loop backedge taken count considering all loop exits. A
+/// computable result can only be returned for loops with a single exit.
+/// Returning the minimum taken count among all exits is incorrect because one
+/// of the loop's exit limits may have been skipped. howFarToZero assumes that
+/// the limit of each loop test is never skipped. This is a valid assumption as
+/// long as the loop exits via that test. For precise results, it is the
+/// caller's responsibility to specify the relevant loop exit using
/// getExact(ExitingBlock, SE).
const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const {
+ScalarEvolution::BackedgeTakenInfo::getExact(
+ ScalarEvolution *SE, SCEVUnionPredicate *Preds) const {
// If any exits were not computable, the loop is not computable.
if (!ExitNotTaken.isCompleteList()) return SE->getCouldNotCompute();
@@ -5095,36 +5540,42 @@ ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const {
assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info");
const SCEV *BECount = nullptr;
- for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
- ENT != nullptr; ENT = ENT->getNextExit()) {
-
- assert(ENT->ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
+ for (auto &ENT : ExitNotTaken) {
+ assert(ENT.ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
if (!BECount)
- BECount = ENT->ExactNotTaken;
- else if (BECount != ENT->ExactNotTaken)
+ BECount = ENT.ExactNotTaken;
+ else if (BECount != ENT.ExactNotTaken)
return SE->getCouldNotCompute();
+ if (Preds && ENT.getPred())
+ Preds->add(ENT.getPred());
+
+ assert((Preds || ENT.hasAlwaysTruePred()) &&
+ "Predicate should be always true!");
}
+
assert(BECount && "Invalid not taken count for loop exit");
return BECount;
}
-/// getExact - Get the exact not taken count for this loop exit.
+/// Get the exact not taken count for this loop exit.
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock,
ScalarEvolution *SE) const {
- for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
- ENT != nullptr; ENT = ENT->getNextExit()) {
+ for (auto &ENT : ExitNotTaken)
+ if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePred())
+ return ENT.ExactNotTaken;
- if (ENT->ExitingBlock == ExitingBlock)
- return ENT->ExactNotTaken;
- }
return SE->getCouldNotCompute();
}
/// getMax - Get the max backedge taken count for the loop.
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
+ for (auto &ENT : ExitNotTaken)
+ if (!ENT.hasAlwaysTruePred())
+ return SE->getCouldNotCompute();
+
return Max ? Max : SE->getCouldNotCompute();
}
@@ -5136,22 +5587,19 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
if (!ExitNotTaken.ExitingBlock)
return false;
- for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
- ENT != nullptr; ENT = ENT->getNextExit()) {
-
- if (ENT->ExactNotTaken != SE->getCouldNotCompute()
- && SE->hasOperand(ENT->ExactNotTaken, S)) {
+ for (auto &ENT : ExitNotTaken)
+ if (ENT.ExactNotTaken != SE->getCouldNotCompute() &&
+ SE->hasOperand(ENT.ExactNotTaken, S))
return true;
- }
- }
+
return false;
}
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
- SmallVectorImpl< std::pair<BasicBlock *, const SCEV *> > &ExitCounts,
- bool Complete, const SCEV *MaxCount) : Max(MaxCount) {
+ SmallVectorImpl<EdgeInfo> &ExitCounts, bool Complete, const SCEV *MaxCount)
+ : Max(MaxCount) {
if (!Complete)
ExitNotTaken.setIncomplete();
@@ -5159,36 +5607,63 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
unsigned NumExits = ExitCounts.size();
if (NumExits == 0) return;
- ExitNotTaken.ExitingBlock = ExitCounts[0].first;
- ExitNotTaken.ExactNotTaken = ExitCounts[0].second;
- if (NumExits == 1) return;
+ ExitNotTaken.ExitingBlock = ExitCounts[0].ExitBlock;
+ ExitNotTaken.ExactNotTaken = ExitCounts[0].Taken;
+
+ // Determine the number of ExitNotTakenExtras structures that we need.
+ unsigned ExtraInfoSize = 0;
+ if (NumExits > 1)
+ ExtraInfoSize = 1 + std::count_if(std::next(ExitCounts.begin()),
+ ExitCounts.end(), [](EdgeInfo &Entry) {
+ return !Entry.Pred.isAlwaysTrue();
+ });
+ else if (!ExitCounts[0].Pred.isAlwaysTrue())
+ ExtraInfoSize = 1;
+
+ ExitNotTakenExtras *ENT = nullptr;
+
+ // Allocate the ExitNotTakenExtras structures and initialize the first
+ // element (ExitNotTaken).
+ if (ExtraInfoSize > 0) {
+ ENT = new ExitNotTakenExtras[ExtraInfoSize];
+ ExitNotTaken.ExtraInfo = &ENT[0];
+ *ExitNotTaken.getPred() = std::move(ExitCounts[0].Pred);
+ }
+
+ if (NumExits == 1)
+ return;
+
+ assert(ENT && "ExitNotTakenExtras is NULL while having more than one exit");
+
+ auto &Exits = ExitNotTaken.ExtraInfo->Exits;
// Handle the rare case of multiple computable exits.
- ExitNotTakenInfo *ENT = new ExitNotTakenInfo[NumExits-1];
+ for (unsigned i = 1, PredPos = 1; i < NumExits; ++i) {
+ ExitNotTakenExtras *Ptr = nullptr;
+ if (!ExitCounts[i].Pred.isAlwaysTrue()) {
+ Ptr = &ENT[PredPos++];
+ Ptr->Pred = std::move(ExitCounts[i].Pred);
+ }
- ExitNotTakenInfo *PrevENT = &ExitNotTaken;
- for (unsigned i = 1; i < NumExits; ++i, PrevENT = ENT, ++ENT) {
- PrevENT->setNextExit(ENT);
- ENT->ExitingBlock = ExitCounts[i].first;
- ENT->ExactNotTaken = ExitCounts[i].second;
+ Exits.emplace_back(ExitCounts[i].ExitBlock, ExitCounts[i].Taken, Ptr);
}
}
-/// clear - Invalidate this result and free the ExitNotTakenInfo array.
+/// Invalidate this result and free the ExitNotTakenInfo array.
void ScalarEvolution::BackedgeTakenInfo::clear() {
ExitNotTaken.ExitingBlock = nullptr;
ExitNotTaken.ExactNotTaken = nullptr;
- delete[] ExitNotTaken.getNextExit();
+ delete[] ExitNotTaken.ExtraInfo;
}
-/// computeBackedgeTakenCount - Compute the number of times the backedge
-/// of the specified loop will execute.
+/// Compute the number of times the backedge of the specified loop will execute.
ScalarEvolution::BackedgeTakenInfo
-ScalarEvolution::computeBackedgeTakenCount(const Loop *L) {
+ScalarEvolution::computeBackedgeTakenCount(const Loop *L,
+ bool AllowPredicates) {
SmallVector<BasicBlock *, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- SmallVector<std::pair<BasicBlock *, const SCEV *>, 4> ExitCounts;
+ SmallVector<EdgeInfo, 4> ExitCounts;
bool CouldComputeBECount = true;
BasicBlock *Latch = L->getLoopLatch(); // may be NULL.
const SCEV *MustExitMaxBECount = nullptr;
@@ -5196,9 +5671,13 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L) {
// Compute the ExitLimit for each loop exit. Use this to populate ExitCounts
// and compute maxBECount.
+ // Do a union of all the predicates here.
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
BasicBlock *ExitBB = ExitingBlocks[i];
- ExitLimit EL = computeExitLimit(L, ExitBB);
+ ExitLimit EL = computeExitLimit(L, ExitBB, AllowPredicates);
+
+ assert((AllowPredicates || EL.Pred.isAlwaysTrue()) &&
+ "Predicated exit limit when predicates are not allowed!");
// 1. For each exit that can be computed, add an entry to ExitCounts.
// CouldComputeBECount is true only if all exits can be computed.
@@ -5207,7 +5686,7 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L) {
// we won't be able to compute an exact value for the loop.
CouldComputeBECount = false;
else
- ExitCounts.push_back(std::make_pair(ExitBB, EL.Exact));
+      ExitCounts.emplace_back(ExitBB, EL.Exact, EL.Pred);
// 2. Derive the loop's MaxBECount from each exit's max number of
// non-exiting iterations. Partition the loop exits into two kinds:
@@ -5241,20 +5720,20 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L) {
}
ScalarEvolution::ExitLimit
-ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
+ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
+ bool AllowPredicates) {
// Okay, we've chosen an exiting block. See what condition causes us to exit
// at this block and remember the exit block and whether all other targets
// lead to the loop header.
bool MustExecuteLoopHeader = true;
BasicBlock *Exit = nullptr;
- for (succ_iterator SI = succ_begin(ExitingBlock), SE = succ_end(ExitingBlock);
- SI != SE; ++SI)
- if (!L->contains(*SI)) {
+ for (auto *SBB : successors(ExitingBlock))
+ if (!L->contains(SBB)) {
if (Exit) // Multiple exit successors.
return getCouldNotCompute();
- Exit = *SI;
- } else if (*SI != L->getHeader()) {
+ Exit = SBB;
+ } else if (SBB != L->getHeader()) {
MustExecuteLoopHeader = false;
}
@@ -5307,9 +5786,9 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
assert(BI->isConditional() && "If unconditional, it can't be in loop!");
// Proceed to the next level to examine the exit condition expression.
- return computeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0),
- BI->getSuccessor(1),
- /*ControlsExit=*/IsOnlyExit);
+ return computeExitLimitFromCond(
+ L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1),
+ /*ControlsExit=*/IsOnlyExit, AllowPredicates);
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(Term))
@@ -5319,29 +5798,24 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
return getCouldNotCompute();
}
-/// computeExitLimitFromCond - Compute the number of times the
-/// backedge of the specified loop will execute if its exit condition
-/// were a conditional branch of ExitCond, TBB, and FBB.
-///
-/// @param ControlsExit is true if ExitCond directly controls the exit
-/// branch. In this case, we can assume that the loop exits only if the
-/// condition is true and can infer that failing to meet the condition prior to
-/// integer wraparound results in undefined behavior.
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimitFromCond(const Loop *L,
Value *ExitCond,
BasicBlock *TBB,
BasicBlock *FBB,
- bool ControlsExit) {
+ bool ControlsExit,
+ bool AllowPredicates) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
bool EitherMayExit = L->contains(TBB);
ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
- ControlsExit && !EitherMayExit);
+ ControlsExit && !EitherMayExit,
+ AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
- ControlsExit && !EitherMayExit);
+ ControlsExit && !EitherMayExit,
+ AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -5368,6 +5842,9 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
BECount = EL0.Exact;
}
+ SCEVUnionPredicate NP;
+ NP.add(&EL0.Pred);
+ NP.add(&EL1.Pred);
// There are cases (e.g. PR26207) where computeExitLimitFromCond is able
// to be more aggressive when computing BECount than when computing
// MaxBECount. In these cases it is possible for EL0.Exact and EL1.Exact
@@ -5376,15 +5853,17 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
!isa<SCEVCouldNotCompute>(BECount))
MaxBECount = BECount;
- return ExitLimit(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount, NP);
}
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
bool EitherMayExit = L->contains(FBB);
ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
- ControlsExit && !EitherMayExit);
+ ControlsExit && !EitherMayExit,
+ AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
- ControlsExit && !EitherMayExit);
+ ControlsExit && !EitherMayExit,
+ AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -5411,14 +5890,25 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
BECount = EL0.Exact;
}
- return ExitLimit(BECount, MaxBECount);
+ SCEVUnionPredicate NP;
+ NP.add(&EL0.Pred);
+ NP.add(&EL1.Pred);
+ return ExitLimit(BECount, MaxBECount, NP);
}
}
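A minimal sketch of the combination rule used in the two branches above (illustrative only, not code from this patch; the exact min-combination call is elided by this hunk, so this reflects an assumption based on the surrounding logic): when L->contains(TBB) holds for the and, or L->contains(FBB) for the or, either operand may fire the exit, so the loop leaves at whichever exit comes first and the predicates required by either operand are unioned via the NP.add calls.

// Hypothetical helper: combine two computable per-operand exit counts in the
// "either operand may exit" case; the result is the unsigned minimum.
static unsigned long combineEitherMayExit(unsigned long EL0Exact,
                                          unsigned long EL1Exact) {
  return EL0Exact < EL1Exact ? EL0Exact : EL1Exact; // umin(EL0.Exact, EL1.Exact)
}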
// With an icmp, it may be feasible to compute an exact backedge-taken count.
// Proceed to the next level to examine the icmp.
- if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond))
- return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit);
+ if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) {
+ ExitLimit EL =
+ computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit);
+ if (EL.hasFullInfo() || !AllowPredicates)
+ return EL;
+
+ // Try again, but use SCEV predicates this time.
+ return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit,
+ /*AllowPredicates=*/true);
+ }
// Check for a constant condition. These are normally stripped out by
// SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
@@ -5442,7 +5932,8 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
ICmpInst *ExitCond,
BasicBlock *TBB,
BasicBlock *FBB,
- bool ControlsExit) {
+ bool ControlsExit,
+ bool AllowPredicates) {
// If the condition was exit on true, convert the condition to exit on false
ICmpInst::Predicate Cond;
@@ -5460,11 +5951,6 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
return ItCnt;
}
- ExitLimit ShiftEL = computeShiftCompareExitLimit(
- ExitCond->getOperand(0), ExitCond->getOperand(1), L, Cond);
- if (ShiftEL.hasAnyInfo())
- return ShiftEL;
-
const SCEV *LHS = getSCEV(ExitCond->getOperand(0));
const SCEV *RHS = getSCEV(ExitCond->getOperand(1));
@@ -5499,34 +5985,46 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
switch (Cond) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
- ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
+ ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit,
+ AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_EQ: { // while (X == Y)
// Convert to: while (X-Y == 0)
- ExitLimit EL = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
+ ExitLimit EL = howFarToNonZero(getMinusSCEV(LHS, RHS), L);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_ULT: { // while (X < Y)
bool IsSigned = Cond == ICmpInst::ICMP_SLT;
- ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, ControlsExit);
+ ExitLimit EL = howManyLessThans(LHS, RHS, L, IsSigned, ControlsExit,
+ AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_UGT: { // while (X > Y)
bool IsSigned = Cond == ICmpInst::ICMP_SGT;
- ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, ControlsExit);
+ ExitLimit EL =
+ howManyGreaterThans(LHS, RHS, L, IsSigned, ControlsExit,
+ AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
default:
break;
}
- return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+
+ auto *ExhaustiveCount =
+ computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+
+ if (!isa<SCEVCouldNotCompute>(ExhaustiveCount))
+ return ExhaustiveCount;
+
+ return computeShiftCompareExitLimit(ExitCond->getOperand(0),
+ ExitCond->getOperand(1), L, Cond);
}
ScalarEvolution::ExitLimit
@@ -5546,7 +6044,7 @@ ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L,
const SCEV *RHS = getConstant(Switch->findCaseDest(ExitingBlock));
// while (X != Y) --> while (X-Y != 0)
- ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
+ ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
if (EL.hasAnyInfo())
return EL;
@@ -5563,9 +6061,8 @@ EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
return cast<SCEVConstant>(Val)->getValue();
}
-/// computeLoadConstantCompareExitLimit - Given an exit condition of
-/// 'icmp op load X, cst', try to see if we can compute the backedge
-/// execution count.
+/// Given an exit condition of 'icmp op load X, cst', try to see if we can
+/// compute the backedge execution count.
ScalarEvolution::ExitLimit
ScalarEvolution::computeLoadConstantCompareExitLimit(
LoadInst *LI,
@@ -5781,14 +6278,15 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
unsigned BitWidth = getTypeSizeInBits(RHS->getType());
const SCEV *UpperBound =
getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth);
- return ExitLimit(getCouldNotCompute(), UpperBound);
+ SCEVUnionPredicate P;
+ return ExitLimit(getCouldNotCompute(), UpperBound, P);
}
return getCouldNotCompute();
}
-/// CanConstantFold - Return true if we can constant fold an instruction of the
-/// specified type, assuming that all operands were constants.
+/// Return true if we can constant fold an instruction of the specified type,
+/// assuming that all operands were constants.
static bool CanConstantFold(const Instruction *I) {
if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
isa<SelectInst>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
@@ -5916,10 +6414,9 @@ static Constant *EvaluateExpression(Value *V, const Loop *L,
Operands[1], DL, TLI);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isVolatile())
- return ConstantFoldLoadFromConstPtr(Operands[0], DL);
+ return ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL);
}
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands, DL,
- TLI);
+ return ConstantFoldInstOperands(I, Operands, DL, TLI);
}
@@ -6107,16 +6604,6 @@ const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L,
return getCouldNotCompute();
}
-/// getSCEVAtScope - Return a SCEV expression for the specified value
-/// at the specified scope in the program. The L value specifies a loop
-/// nest to evaluate the expression at, where null is the top-level or a
-/// specified loop is immediately inside of the loop.
-///
-/// This method can be used to compute the exit value for a variable defined
-/// in a loop by querying what the value will hold in the parent loop.
-///
-/// In the case that a relevant loop exit value cannot be computed, the
-/// original value V is returned.
const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values =
ValuesAtScopes[V];
@@ -6305,10 +6792,9 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
Operands[1], DL, &TLI);
else if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isVolatile())
- C = ConstantFoldLoadFromConstPtr(Operands[0], DL);
+ C = ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL);
} else
- C = ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands,
- DL, &TLI);
+ C = ConstantFoldInstOperands(I, Operands, DL, &TLI);
if (!C) return V;
return getSCEV(C);
}
@@ -6428,14 +6914,11 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
llvm_unreachable("Unknown SCEV type!");
}
-/// getSCEVAtScope - This is a convenience function which does
-/// getSCEVAtScope(getSCEV(V), L).
const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
return getSCEVAtScope(getSCEV(V), L);
}
-/// SolveLinEquationWithOverflow - Finds the minimum unsigned root of the
-/// following equation:
+/// Finds the minimum unsigned root of the following equation:
///
/// A * X = B (mod N)
///
@@ -6482,11 +6965,11 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
return SE.getConstant(Result.trunc(BW));
}
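To make the equation above concrete, the sketch below (an editorial illustration; minUnsignedRoot is a made-up name, and the real routine solves the congruence algebraically on APInt rather than by search) brute-forces the minimum unsigned root of A * X = B (mod 2^BW) for a small bit width, and also shows why no root can exist when A contributes more factors of two than B.

#include <cstdint>
#include <optional>

// Brute-force the minimum unsigned root of A * X == B (mod 2^BW).  Assumes A
// and B are already reduced mod 2^BW and BW is small enough to enumerate.
static std::optional<uint64_t> minUnsignedRoot(uint64_t A, uint64_t B,
                                               unsigned BW) {
  const uint64_t N = 1ULL << BW; // the modulus 2^BW
  for (uint64_t X = 0; X < N; ++X)
    if ((A * X) % N == B % N)
      return X; // the first hit is the minimum root
  return std::nullopt; // e.g. 2 * X == 1 (mod 2^BW) has no root
}
// minUnsignedRoot(4, 8, 4) == 2, since 4 * 2 == 8 (mod 16).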
-/// SolveQuadraticEquation - Find the roots of the quadratic equation for the
-/// given quadratic chrec {L,+,M,+,N}. This returns either the two roots (which
-/// might be the same) or two SCEVCouldNotCompute objects.
+/// Find the roots of the quadratic equation for the given quadratic chrec
+/// {L,+,M,+,N}. This returns either the two roots (which might be the same) or
+/// two SCEVCouldNotCompute objects.
///
-static std::pair<const SCEV *,const SCEV *>
+static Optional<std::pair<const SCEVConstant *,const SCEVConstant *>>
SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!");
const SCEVConstant *LC = dyn_cast<SCEVConstant>(AddRec->getOperand(0));
@@ -6494,10 +6977,8 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
const SCEVConstant *NC = dyn_cast<SCEVConstant>(AddRec->getOperand(2));
// We currently can only solve this if the coefficients are constants.
- if (!LC || !MC || !NC) {
- const SCEV *CNC = SE.getCouldNotCompute();
- return std::make_pair(CNC, CNC);
- }
+ if (!LC || !MC || !NC)
+ return None;
uint32_t BitWidth = LC->getAPInt().getBitWidth();
const APInt &L = LC->getAPInt();
@@ -6524,8 +7005,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
if (SqrtTerm.isNegative()) {
// The loop is provably infinite.
- const SCEV *CNC = SE.getCouldNotCompute();
- return std::make_pair(CNC, CNC);
+ return None;
}
// Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
@@ -6536,10 +7016,8 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
// The divisions must be performed as signed divisions.
APInt NegB(-B);
APInt TwoA(A << 1);
- if (TwoA.isMinValue()) {
- const SCEV *CNC = SE.getCouldNotCompute();
- return std::make_pair(CNC, CNC);
- }
+ if (TwoA.isMinValue())
+ return None;
LLVMContext &Context = SE.getContext();
@@ -6548,20 +7026,21 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
ConstantInt *Solution2 =
ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
- return std::make_pair(SE.getConstant(Solution1),
- SE.getConstant(Solution2));
+ return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
+ cast<SCEVConstant>(SE.getConstant(Solution2)));
} // end APIntOps namespace
}
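For reference, the algebra behind the roots computed above, plus a tiny evaluation helper (an editorial sketch; evalQuadraticChrec is a hypothetical name, not an LLVM API): a quadratic chrec {L,+,M,+,N} has the value f(i) = L + M*i + N*i*(i-1)/2 at iteration i, so setting f(i) = 0 and multiplying by 2 gives N*i^2 + (2M - N)*i + 2L = 0, and the two candidate roots follow from the quadratic formula, with the negative-discriminant early-out corresponding to the SqrtTerm.isNegative() check.

// Evaluate a quadratic chrec {L,+,M,+,N} at iteration i.
static long evalQuadraticChrec(long L, long M, long N, long i) {
  return L + M * i + N * i * (i - 1) / 2;
}
// Example: {-4,+,1,+,2} evaluates to i*i - 4, so evalQuadraticChrec(-4, 1, 2, 2)
// == 0 and the minimum positive root handed back to the caller would be 2.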
-/// HowFarToZero - Return the number of times a backedge comparing the specified
-/// value to zero will execute. If not computable, return CouldNotCompute.
-///
-/// This is only used for loops with a "x != y" exit test. The exit condition is
-/// now expressed as a single expression, V = x-y. So the exit test is
-/// effectively V != 0. We know and take advantage of the fact that this
-/// expression only being used in a comparison by zero context.
ScalarEvolution::ExitLimit
-ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
+ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
+ bool AllowPredicates) {
+
+ // This is only used for loops with a "x != y" exit test. The exit condition
+ // is now expressed as a single expression, V = x-y. So the exit test is
+ // effectively V != 0. We know and take advantage of the fact that this
+ // expression is only used in a comparison-by-zero context.
+
+ SCEVUnionPredicate P;
// If the value is a constant
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
// If the value is already zero, the branch will execute zero times.
@@ -6570,31 +7049,33 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
}
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V);
+ if (!AddRec && AllowPredicates)
+ // Try to make this an AddRec using runtime tests, in the first X
+ // iterations of this loop, where X is the SCEV expression found by the
+ // algorithm below.
+ AddRec = convertSCEVToAddRecWithPredicates(V, L, P);
+
if (!AddRec || AddRec->getLoop() != L)
return getCouldNotCompute();
// If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of
// the quadratic equation to solve it.
if (AddRec->isQuadratic() && AddRec->getType()->isIntegerTy()) {
- std::pair<const SCEV *,const SCEV *> Roots =
- SolveQuadraticEquation(AddRec, *this);
- const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
- const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
- if (R1 && R2) {
+ if (auto Roots = SolveQuadraticEquation(AddRec, *this)) {
+ const SCEVConstant *R1 = Roots->first;
+ const SCEVConstant *R2 = Roots->second;
// Pick the smallest positive root value.
- if (ConstantInt *CB =
- dyn_cast<ConstantInt>(ConstantExpr::getICmp(CmpInst::ICMP_ULT,
- R1->getValue(),
- R2->getValue()))) {
+ if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
+ CmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
if (!CB->getZExtValue())
- std::swap(R1, R2); // R1 is the minimum root now.
+ std::swap(R1, R2); // R1 is the minimum root now.
// We can only use this value if the chrec ends up with an exact zero
// value at this index. When solving for "X*X != 5", for example, we
// should not accept a root of 2.
const SCEV *Val = AddRec->evaluateAtIteration(R1, *this);
if (Val->isZero())
- return R1; // We found a quadratic root!
+ return ExitLimit(R1, R1, P); // We found a quadratic root!
}
}
return getCouldNotCompute();
@@ -6651,7 +7132,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
else
MaxBECount = getConstant(CountDown ? CR.getUnsignedMax()
: -CR.getUnsignedMin());
- return ExitLimit(Distance, MaxBECount);
+ return ExitLimit(Distance, MaxBECount, P);
}
// As a special case, handle the instance where Step is a positive power of
@@ -6704,7 +7185,9 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth);
auto *WideTy = Distance->getType();
- return getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy);
+ const SCEV *Limit =
+ getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy);
+ return ExitLimit(Limit, Limit, P);
}
}
@@ -6713,24 +7196,24 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
// compute the backedge count. In this case, the step may not divide the
// distance, but we don't care because if the condition is "missed" the loop
// will have undefined behavior due to wrapping.
- if (ControlsExit && AddRec->getNoWrapFlags(SCEV::FlagNW)) {
+ if (ControlsExit && AddRec->hasNoSelfWrap() &&
+ loopHasNoAbnormalExits(AddRec->getLoop())) {
const SCEV *Exact =
getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
- return ExitLimit(Exact, Exact);
+ return ExitLimit(Exact, Exact, P);
}
// Then, try to solve the above equation provided that Start is constant.
- if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start))
- return SolveLinEquationWithOverflow(StepC->getAPInt(), -StartC->getAPInt(),
- *this);
+ if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) {
+ const SCEV *E = SolveLinEquationWithOverflow(
+ StepC->getValue()->getValue(), -StartC->getValue()->getValue(), *this);
+ return ExitLimit(E, E, P);
+ }
return getCouldNotCompute();
}
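A small simulation of the fast path above (an illustrative sketch with a made-up name, not code from the patch): once the exit test is reduced to V != 0 with V = {Start,+,Step}, and the test controls the only exit of a recurrence that cannot self-wrap, the exact count is the distance to zero divided by the step magnitude; if Step did not divide that distance, the source loop would have to rely on wrapping, which is exactly what the no-self-wrap reasoning rules out.

// Count iterations of "while (V != 0) V += Step" starting at Start.  Assumes
// Step evenly divides the distance to zero, per the argument above.
static unsigned tripCountNeZero(long Start, long Step) {
  long V = Start;
  unsigned Count = 0;
  while (V != 0) {
    V += Step;
    ++Count;
  }
  return Count; // e.g. tripCountNeZero(10, -2) == 5 == 10 / 2
}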
-/// HowFarToNonZero - Return the number of times a backedge checking the
-/// specified value for nonzero will execute. If not computable, return
-/// CouldNotCompute
ScalarEvolution::ExitLimit
-ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
+ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) {
// Loops that look like: while (X == 0) are very strange indeed. We don't
// handle them yet except for the trivial case. This could be expanded in the
// future as needed.
@@ -6748,33 +7231,27 @@ ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
return getCouldNotCompute();
}
-/// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB
-/// (which may not be an immediate predecessor) which has exactly one
-/// successor from which BB is reachable, or null if no such block is
-/// found.
-///
std::pair<BasicBlock *, BasicBlock *>
ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) {
// If the block has a unique predecessor, then there is no path from the
// predecessor to the block that does not go through the direct edge
// from the predecessor to the block.
if (BasicBlock *Pred = BB->getSinglePredecessor())
- return std::make_pair(Pred, BB);
+ return {Pred, BB};
// A loop's header is defined to be a block that dominates the loop.
// If the header has a unique predecessor outside the loop, it must be
// a block that has exactly one successor that can reach the loop.
if (Loop *L = LI.getLoopFor(BB))
- return std::make_pair(L->getLoopPredecessor(), L->getHeader());
+ return {L->getLoopPredecessor(), L->getHeader()};
- return std::pair<BasicBlock *, BasicBlock *>();
+ return {nullptr, nullptr};
}
-/// HasSameValue - SCEV structural equivalence is usually sufficient for
-/// testing whether two expressions are equal, however for the purposes of
-/// looking for a condition guarding a loop, it can be useful to be a little
-/// more general, since a front-end may have replicated the controlling
-/// expression.
+/// SCEV structural equivalence is usually sufficient for testing whether two
+/// expressions are equal, however for the purposes of looking for a condition
+/// guarding a loop, it can be useful to be a little more general, since a
+/// front-end may have replicated the controlling expression.
///
static bool HasSameValue(const SCEV *A, const SCEV *B) {
// Quick check to see if they are the same SCEV.
@@ -6800,9 +7277,6 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) {
return false;
}
-/// SimplifyICmpOperands - Simplify LHS and RHS in a comparison with
-/// predicate Pred. Return true iff any changes were made.
-///
bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
const SCEV *&LHS, const SCEV *&RHS,
unsigned Depth) {
@@ -7134,7 +7608,7 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
return true;
// Otherwise see what can be done with known constant ranges.
- return isKnownPredicateWithRanges(Pred, LHS, RHS);
+ return isKnownPredicateViaConstantRanges(Pred, LHS, RHS);
}
bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS,
@@ -7180,7 +7654,7 @@ bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS,
case ICmpInst::ICMP_UGE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- if (!LHS->getNoWrapFlags(SCEV::FlagNUW))
+ if (!LHS->hasNoUnsignedWrap())
return false;
Increasing = Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE;
@@ -7190,7 +7664,7 @@ bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS,
case ICmpInst::ICMP_SGE:
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE: {
- if (!LHS->getNoWrapFlags(SCEV::FlagNSW))
+ if (!LHS->hasNoSignedWrap())
return false;
const SCEV *Step = LHS->getStepRecurrence(*this);
@@ -7264,78 +7738,34 @@ bool ScalarEvolution::isLoopInvariantPredicate(
return true;
}
-bool
-ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred,
- const SCEV *LHS, const SCEV *RHS) {
+bool ScalarEvolution::isKnownPredicateViaConstantRanges(
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
if (HasSameValue(LHS, RHS))
return ICmpInst::isTrueWhenEqual(Pred);
// This code is split out from isKnownPredicate because it is called from
// within isLoopEntryGuardedByCond.
- switch (Pred) {
- default:
- llvm_unreachable("Unexpected ICmpInst::Predicate value!");
- case ICmpInst::ICMP_SGT:
- std::swap(LHS, RHS);
- case ICmpInst::ICMP_SLT: {
- ConstantRange LHSRange = getSignedRange(LHS);
- ConstantRange RHSRange = getSignedRange(RHS);
- if (LHSRange.getSignedMax().slt(RHSRange.getSignedMin()))
- return true;
- if (LHSRange.getSignedMin().sge(RHSRange.getSignedMax()))
- return false;
- break;
- }
- case ICmpInst::ICMP_SGE:
- std::swap(LHS, RHS);
- case ICmpInst::ICMP_SLE: {
- ConstantRange LHSRange = getSignedRange(LHS);
- ConstantRange RHSRange = getSignedRange(RHS);
- if (LHSRange.getSignedMax().sle(RHSRange.getSignedMin()))
- return true;
- if (LHSRange.getSignedMin().sgt(RHSRange.getSignedMax()))
- return false;
- break;
- }
- case ICmpInst::ICMP_UGT:
- std::swap(LHS, RHS);
- case ICmpInst::ICMP_ULT: {
- ConstantRange LHSRange = getUnsignedRange(LHS);
- ConstantRange RHSRange = getUnsignedRange(RHS);
- if (LHSRange.getUnsignedMax().ult(RHSRange.getUnsignedMin()))
- return true;
- if (LHSRange.getUnsignedMin().uge(RHSRange.getUnsignedMax()))
- return false;
- break;
- }
- case ICmpInst::ICMP_UGE:
- std::swap(LHS, RHS);
- case ICmpInst::ICMP_ULE: {
- ConstantRange LHSRange = getUnsignedRange(LHS);
- ConstantRange RHSRange = getUnsignedRange(RHS);
- if (LHSRange.getUnsignedMax().ule(RHSRange.getUnsignedMin()))
- return true;
- if (LHSRange.getUnsignedMin().ugt(RHSRange.getUnsignedMax()))
- return false;
- break;
- }
- case ICmpInst::ICMP_NE: {
- if (getUnsignedRange(LHS).intersectWith(getUnsignedRange(RHS)).isEmptySet())
- return true;
- if (getSignedRange(LHS).intersectWith(getSignedRange(RHS)).isEmptySet())
- return true;
- const SCEV *Diff = getMinusSCEV(LHS, RHS);
- if (isKnownNonZero(Diff))
- return true;
- break;
- }
- case ICmpInst::ICMP_EQ:
- // The check at the top of the function catches the case where
- // the values are known to be equal.
- break;
- }
- return false;
+ auto CheckRanges =
+ [&](const ConstantRange &RangeLHS, const ConstantRange &RangeRHS) {
+ return ConstantRange::makeSatisfyingICmpRegion(Pred, RangeRHS)
+ .contains(RangeLHS);
+ };
+
+ // The check at the top of the function catches the case where the values are
+ // known to be equal.
+ if (Pred == CmpInst::ICMP_EQ)
+ return false;
+
+ if (Pred == CmpInst::ICMP_NE)
+ return CheckRanges(getSignedRange(LHS), getSignedRange(RHS)) ||
+ CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS)) ||
+ isKnownNonZero(getMinusSCEV(LHS, RHS));
+
+ if (CmpInst::isSigned(Pred))
+ return CheckRanges(getSignedRange(LHS), getSignedRange(RHS));
+
+ return CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS));
}
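The rewritten check above condenses the old per-predicate case analysis into a single range-containment test. A standalone demonstration on concrete ranges (illustrative; the helper name is made up): makeSatisfyingICmpRegion(Pred, RHS) is the set of values x for which "x Pred y" holds for every y in RHS, so the predicate is known to be true whenever the whole LHS range is contained in that region.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// [0, 10) u< [10, 20) is provable: the satisfying region for ICMP_ULT against
// [10, 20) is exactly [0, 10), which contains the whole LHS range.
static bool demoKnownULT() {
  ConstantRange LHS(APInt(8, 0), APInt(8, 10));  // [0, 10)
  ConstantRange RHS(APInt(8, 10), APInt(8, 20)); // [10, 20)
  return ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, RHS)
      .contains(LHS); // true
}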
bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
@@ -7416,6 +7846,23 @@ bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred,
isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS);
}
+bool ScalarEvolution::isImpliedViaGuard(BasicBlock *BB,
+ ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS) {
+ // No need to even try if we know the module has no guards.
+ if (!HasGuards)
+ return false;
+
+ return any_of(*BB, [&](Instruction &I) {
+ using namespace llvm::PatternMatch;
+
+ Value *Condition;
+ return match(&I, m_Intrinsic<Intrinsic::experimental_guard>(
+ m_Value(Condition))) &&
+ isImpliedCond(Pred, LHS, RHS, Condition, false);
+ });
+}
+
/// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is
/// protected by a conditional between LHS and RHS. This is used to
/// eliminate casts.
@@ -7427,7 +7874,8 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
// (interprocedural conditions notwithstanding).
if (!L) return true;
- if (isKnownPredicateWithRanges(Pred, LHS, RHS)) return true;
+ if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
+ return true;
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
@@ -7482,12 +7930,18 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
if (!DT.isReachableFromEntry(L->getHeader()))
return false;
+ if (isImpliedViaGuard(Latch, Pred, LHS, RHS))
+ return true;
+
for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()];
DTN != HeaderDTN; DTN = DTN->getIDom()) {
assert(DTN && "should reach the loop header before reaching the root!");
BasicBlock *BB = DTN->getBlock();
+ if (isImpliedViaGuard(BB, Pred, LHS, RHS))
+ return true;
+
BasicBlock *PBB = BB->getSinglePredecessor();
if (!PBB)
continue;
@@ -7518,9 +7972,6 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
return false;
}
-/// isLoopEntryGuardedByCond - Test whether entry to the loop is protected
-/// by a conditional between LHS and RHS. This is used to help avoid max
-/// expressions in loop trip counts, and to eliminate casts.
bool
ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
ICmpInst::Predicate Pred,
@@ -7529,7 +7980,8 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
// (interprocedural conditions notwithstanding).
if (!L) return false;
- if (isKnownPredicateWithRanges(Pred, LHS, RHS)) return true;
+ if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
+ return true;
// Starting at the loop predecessor, climb up the predecessor chain, as long
// as there are predecessors that can be found that have unique successors
@@ -7539,6 +7991,9 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
Pair.first;
Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
+ if (isImpliedViaGuard(Pair.first, Pred, LHS, RHS))
+ return true;
+
BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate ||
@@ -7586,8 +8041,6 @@ struct MarkPendingLoopPredicate {
};
} // end anonymous namespace
-/// isImpliedCond - Test whether the condition described by Pred, LHS,
-/// and RHS is true whenever the given Cond value evaluates to true.
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
Value *FoundCondValue,
@@ -7910,9 +8363,6 @@ bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow(
getConstant(FoundRHSLimit));
}
-/// isImpliedCondOperands - Test whether the condition described by Pred,
-/// LHS, and RHS is true whenever the condition described by Pred, FoundLHS,
-/// and FoundRHS is true.
bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
@@ -8037,9 +8487,6 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
llvm_unreachable("covered switch fell through?!");
}
-/// isImpliedCondOperandsHelper - Test whether the condition described by
-/// Pred, LHS, and RHS is true whenever the condition described by Pred,
-/// FoundLHS, and FoundRHS is true.
bool
ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
@@ -8047,7 +8494,7 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
const SCEV *FoundRHS) {
auto IsKnownPredicateFull =
[this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
- return isKnownPredicateWithRanges(Pred, LHS, RHS) ||
+ return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
@@ -8089,8 +8536,6 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
return false;
}
-/// isImpliedCondOperandsViaRanges - helper function for isImpliedCondOperands.
-/// Tries to get cases like "X `sgt` 0 => X - 1 `sgt` -1".
bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS,
@@ -8129,9 +8574,6 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
return SatisfyingLHSRange.contains(LHSRange);
}
-// Verify if an linear IV with positive stride can overflow when in a
-// less-than comparison, knowing the invariant term of the comparison, the
-// stride and the knowledge of NSW/NUW flags on the recurrence.
bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
bool IsSigned, bool NoWrap) {
if (NoWrap) return false;
@@ -8158,9 +8600,6 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
}
-// Verify if an linear IV with negative stride can overflow when in a
-// greater-than comparison, knowing the invariant term of the comparison,
-// the stride and the knowledge of NSW/NUW flags on the recurrence.
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
bool IsSigned, bool NoWrap) {
if (NoWrap) return false;
@@ -8187,8 +8626,6 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
return (MinValue + MaxStrideMinusOne).ugt(MinRHS);
}
-// Compute the backedge taken count knowing the interval difference, the
-// stride and presence of the equality in the comparison.
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
bool Equality) {
const SCEV *One = getOne(Step->getType());
@@ -8197,22 +8634,21 @@ const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
return getUDivExpr(Delta, Step);
}
-/// HowManyLessThans - Return the number of times a backedge containing the
-/// specified less-than comparison will execute. If not computable, return
-/// CouldNotCompute.
-///
-/// @param ControlsExit is true when the LHS < RHS condition directly controls
-/// the branch (loops exits only if condition is true). In this case, we can use
-/// NoWrapFlags to skip overflow checks.
ScalarEvolution::ExitLimit
-ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
+ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
const Loop *L, bool IsSigned,
- bool ControlsExit) {
+ bool ControlsExit, bool AllowPredicates) {
+ SCEVUnionPredicate P;
// We handle only IV < Invariant
if (!isLoopInvariant(RHS, L))
return getCouldNotCompute();
const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+ if (!IV && AllowPredicates)
+ // Try to make this an AddRec using runtime tests, in the first X
+ // iterations of this loop, where X is the SCEV expression found by the
+ // algorithm below.
+ IV = convertSCEVToAddRecWithPredicates(LHS, L, P);
// Avoid weird loops
if (!IV || IV->getLoop() != L || !IV->isAffine())
@@ -8238,19 +8674,8 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
: ICmpInst::ICMP_ULT;
const SCEV *Start = IV->getStart();
const SCEV *End = RHS;
- if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS)) {
- const SCEV *Diff = getMinusSCEV(RHS, Start);
- // If we have NoWrap set, then we can assume that the increment won't
- // overflow, in which case if RHS - Start is a constant, we don't need to
- // do a max operation since we can just figure it out statically
- if (NoWrap && isa<SCEVConstant>(Diff)) {
- APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt();
- if (D.isNegative())
- End = Start;
- } else
- End = IsSigned ? getSMaxExpr(RHS, Start)
- : getUMaxExpr(RHS, Start);
- }
+ if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
+ End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start);
const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
@@ -8281,18 +8706,24 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
if (isa<SCEVCouldNotCompute>(MaxBECount))
MaxBECount = BECount;
- return ExitLimit(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount, P);
}
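As a concrete check on the count produced above (an illustrative sketch with a made-up name; it ignores the umax adjustment for the unguarded-entry case and the separate MaxBECount refinement): for an affine IV {Start,+,Stride} with Stride > 0 that exits when IV < End becomes false, computeBECount(End - Start, Stride, false) is ceil((End - Start) / Stride), which matches a direct simulation.

// One backedge is taken for every iteration in which the compare is true.
static unsigned beCountLessThan(unsigned Start, unsigned End, unsigned Stride) {
  unsigned Count = 0;
  for (unsigned IV = Start; IV < End; IV += Stride)
    ++Count;
  return Count; // e.g. beCountLessThan(0, 10, 3) == 4 == ceil(10 / 3)
}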
ScalarEvolution::ExitLimit
-ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
+ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
const Loop *L, bool IsSigned,
- bool ControlsExit) {
+ bool ControlsExit, bool AllowPredicates) {
+ SCEVUnionPredicate P;
// We handle only IV > Invariant
if (!isLoopInvariant(RHS, L))
return getCouldNotCompute();
const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+ if (!IV && AllowPredicates)
+ // Try to make this an AddRec using runtime tests, in the first X
+ // iterations of this loop, where X is the SCEV expression found by the
+ // algorithm below.
+ IV = convertSCEVToAddRecWithPredicates(LHS, L, P);
// Avoid weird loops
if (!IV || IV->getLoop() != L || !IV->isAffine())
@@ -8319,19 +8750,8 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
const SCEV *Start = IV->getStart();
const SCEV *End = RHS;
- if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) {
- const SCEV *Diff = getMinusSCEV(RHS, Start);
- // If we have NoWrap set, then we can assume that the increment won't
- // overflow, in which case if RHS - Start is a constant, we don't need to
- // do a max operation since we can just figure it out statically
- if (NoWrap && isa<SCEVConstant>(Diff)) {
- APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt();
- if (!D.isNegative())
- End = Start;
- } else
- End = IsSigned ? getSMinExpr(RHS, Start)
- : getUMinExpr(RHS, Start);
- }
+ if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
+ End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start);
const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
@@ -8363,15 +8783,10 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
if (isa<SCEVCouldNotCompute>(MaxBECount))
MaxBECount = BECount;
- return ExitLimit(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount, P);
}
-/// getNumIterationsInRange - Return the number of iterations of this loop that
-/// produce values in the specified constant range. Another way of looking at
-/// this is that it returns the first iteration number where the value is not in
-/// the condition, thus computing the exit count. If the iteration count can't
-/// be computed, an instance of SCEVCouldNotCompute is returned.
-const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
+const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range,
ScalarEvolution &SE) const {
if (Range.isFullSet()) // Infinite loop.
return SE.getCouldNotCompute();
@@ -8445,22 +8860,21 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
FlagAnyWrap);
// Next, solve the constructed addrec
- auto Roots = SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
- const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
- const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
- if (R1) {
+ if (auto Roots =
+ SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE)) {
+ const SCEVConstant *R1 = Roots->first;
+ const SCEVConstant *R2 = Roots->second;
// Pick the smallest positive root value.
if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
ICmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
if (!CB->getZExtValue())
- std::swap(R1, R2); // R1 is the minimum root now.
+ std::swap(R1, R2); // R1 is the minimum root now.
// Make sure the root is not off by one. The returned iteration should
// not be in the range, but the previous one should be. When solving
// for "X*X < 5", for example, we should not return a root of 2.
- ConstantInt *R1Val = EvaluateConstantChrecAtConstant(this,
- R1->getValue(),
- SE);
+ ConstantInt *R1Val =
+ EvaluateConstantChrecAtConstant(this, R1->getValue(), SE);
if (Range.contains(R1Val->getValue())) {
// The next iteration must be out of the range...
ConstantInt *NextVal =
@@ -8469,7 +8883,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
if (!Range.contains(R1Val->getValue()))
return SE.getConstant(NextVal);
- return SE.getCouldNotCompute(); // Something strange happened
+ return SE.getCouldNotCompute(); // Something strange happened
}
// If R1 was not in the range, then it is a good return value. Make
@@ -8479,7 +8893,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
if (Range.contains(R1Val->getValue()))
return R1;
- return SE.getCouldNotCompute(); // Something strange happened
+ return SE.getCouldNotCompute(); // Something strange happened
}
}
}
@@ -8789,12 +9203,9 @@ const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
return getSizeOfExpr(ETy, Ty);
}
-/// Second step of delinearization: compute the array dimensions Sizes from the
-/// set of Terms extracted from the memory access function of this SCEVAddRec.
void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
SmallVectorImpl<const SCEV *> &Sizes,
const SCEV *ElementSize) const {
-
if (Terms.size() < 1 || !ElementSize)
return;
@@ -8858,8 +9269,6 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
});
}
-/// Third step of delinearization: compute the access functions for the
-/// Subscripts based on the dimensions in Sizes.
void ScalarEvolution::computeAccessFunctions(
const SCEV *Expr, SmallVectorImpl<const SCEV *> &Subscripts,
SmallVectorImpl<const SCEV *> &Sizes) {
@@ -9012,7 +9421,7 @@ void ScalarEvolution::SCEVCallbackVH::deleted() {
assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
SE->ConstantEvolutionLoopExitValue.erase(PN);
- SE->ValueExprMap.erase(getValPtr());
+ SE->eraseValueFromMap(getValPtr());
// this now dangles!
}
@@ -9035,13 +9444,13 @@ void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) {
continue;
if (PHINode *PN = dyn_cast<PHINode>(U))
SE->ConstantEvolutionLoopExitValue.erase(PN);
- SE->ValueExprMap.erase(U);
+ SE->eraseValueFromMap(U);
Worklist.insert(Worklist.end(), U->user_begin(), U->user_end());
}
// Delete the Old value.
if (PHINode *PN = dyn_cast<PHINode>(Old))
SE->ConstantEvolutionLoopExitValue.erase(PN);
- SE->ValueExprMap.erase(Old);
+ SE->eraseValueFromMap(Old);
// this now dangles!
}
@@ -9059,14 +9468,31 @@ ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI,
CouldNotCompute(new SCEVCouldNotCompute()),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64),
- FirstUnknown(nullptr) {}
+ FirstUnknown(nullptr) {
+
+ // To use guards for proving predicates, we need to scan every instruction in
+ // relevant basic blocks, and not just terminators. Doing this is a waste of
+ // time if the IR does not actually contain any calls to
+ // @llvm.experimental.guard, so do a quick check and remember this beforehand.
+ //
+ // This pessimizes the case where a pass that preserves ScalarEvolution wants
+ // to _add_ guards to the module when there weren't any before, and wants
+ // ScalarEvolution to optimize based on those guards. For now we prefer to be
+ // efficient in lieu of being smart in that rather obscure case.
+
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
+}
ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
- : F(Arg.F), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT), LI(Arg.LI),
- CouldNotCompute(std::move(Arg.CouldNotCompute)),
+ : F(Arg.F), HasGuards(Arg.HasGuards), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT),
+ LI(Arg.LI), CouldNotCompute(std::move(Arg.CouldNotCompute)),
ValueExprMap(std::move(Arg.ValueExprMap)),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
+ PredicatedBackedgeTakenCounts(
+ std::move(Arg.PredicatedBackedgeTakenCounts)),
ConstantEvolutionLoopExitValue(
std::move(Arg.ConstantEvolutionLoopExitValue)),
ValuesAtScopes(std::move(Arg.ValuesAtScopes)),
@@ -9091,12 +9517,16 @@ ScalarEvolution::~ScalarEvolution() {
}
FirstUnknown = nullptr;
+ ExprValueMap.clear();
ValueExprMap.clear();
+ HasRecMap.clear();
// Free any extra memory created for ExitNotTakenInfo in the unlikely event
// that a loop had multiple computable exits.
for (auto &BTCI : BackedgeTakenCounts)
BTCI.second.clear();
+ for (auto &BTCI : PredicatedBackedgeTakenCounts)
+ BTCI.second.clear();
assert(PendingLoopPredicates.empty() && "isImpliedCond garbage");
assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!");
@@ -9110,8 +9540,8 @@ bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) {
static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
const Loop *L) {
// Print all inner loops first
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- PrintLoopInfo(OS, SE, *I);
+ for (Loop *I : *L)
+ PrintLoopInfo(OS, SE, I);
OS << "Loop ";
L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
@@ -9139,9 +9569,35 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
OS << "Unpredictable max backedge-taken count. ";
}
+ OS << "\n"
+ "Loop ";
+ L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ OS << ": ";
+
+ SCEVUnionPredicate Pred;
+ auto PBT = SE->getPredicatedBackedgeTakenCount(L, Pred);
+ if (!isa<SCEVCouldNotCompute>(PBT)) {
+ OS << "Predicated backedge-taken count is " << *PBT << "\n";
+ OS << " Predicates:\n";
+ Pred.print(OS, 4);
+ } else {
+ OS << "Unpredictable predicated backedge-taken count. ";
+ }
OS << "\n";
}
+static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) {
+ switch (LD) {
+ case ScalarEvolution::LoopVariant:
+ return "Variant";
+ case ScalarEvolution::LoopInvariant:
+ return "Invariant";
+ case ScalarEvolution::LoopComputable:
+ return "Computable";
+ }
+ llvm_unreachable("Unknown ScalarEvolution::LoopDisposition kind!");
+}
+
void ScalarEvolution::print(raw_ostream &OS) const {
// ScalarEvolution's implementation of the print method is to print
// out SCEV values of all instructions that are interesting. Doing
@@ -9189,6 +9645,35 @@ void ScalarEvolution::print(raw_ostream &OS) const {
} else {
OS << *ExitValue;
}
+
+ bool First = true;
+ for (auto *Iter = L; Iter; Iter = Iter->getParentLoop()) {
+ if (First) {
+ OS << "\t\t" "LoopDispositions: { ";
+ First = false;
+ } else {
+ OS << ", ";
+ }
+
+ Iter->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, Iter));
+ }
+
+ for (auto *InnerL : depth_first(L)) {
+ if (InnerL == L)
+ continue;
+ if (First) {
+ OS << "\t\t" "LoopDispositions: { ";
+ First = false;
+ } else {
+ OS << ", ";
+ }
+
+ InnerL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, InnerL));
+ }
+
+ OS << " }";
}
OS << "\n";
@@ -9197,8 +9682,8 @@ void ScalarEvolution::print(raw_ostream &OS) const {
OS << "Determining loop execution counts for: ";
F.printAsOperand(OS, /*PrintType=*/false);
OS << "\n";
- for (LoopInfo::iterator I = LI.begin(), E = LI.end(); I != E; ++I)
- PrintLoopInfo(OS, &SE, *I);
+ for (Loop *I : LI)
+ PrintLoopInfo(OS, &SE, I);
}
ScalarEvolution::LoopDisposition
@@ -9420,17 +9905,23 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
BlockDispositions.erase(S);
UnsignedRanges.erase(S);
SignedRanges.erase(S);
+ ExprValueMap.erase(S);
+ HasRecMap.erase(S);
+
+ auto RemoveSCEVFromBackedgeMap =
+ [S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
+ for (auto I = Map.begin(), E = Map.end(); I != E;) {
+ BackedgeTakenInfo &BEInfo = I->second;
+ if (BEInfo.hasOperand(S, this)) {
+ BEInfo.clear();
+ Map.erase(I++);
+ } else
+ ++I;
+ }
+ };
- for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I =
- BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end(); I != E; ) {
- BackedgeTakenInfo &BEInfo = I->second;
- if (BEInfo.hasOperand(S, this)) {
- BEInfo.clear();
- BackedgeTakenCounts.erase(I++);
- }
- else
- ++I;
- }
+ RemoveSCEVFromBackedgeMap(BackedgeTakenCounts);
+ RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts);
}
typedef DenseMap<const Loop *, std::string> VerifyMap;
@@ -9516,16 +10007,16 @@ void ScalarEvolution::verify() const {
char ScalarEvolutionAnalysis::PassID;
ScalarEvolution ScalarEvolutionAnalysis::run(Function &F,
- AnalysisManager<Function> *AM) {
- return ScalarEvolution(F, AM->getResult<TargetLibraryAnalysis>(F),
- AM->getResult<AssumptionAnalysis>(F),
- AM->getResult<DominatorTreeAnalysis>(F),
- AM->getResult<LoopAnalysis>(F));
+ AnalysisManager<Function> &AM) {
+ return ScalarEvolution(F, AM.getResult<TargetLibraryAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F),
+ AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<LoopAnalysis>(F));
}
PreservedAnalyses
-ScalarEvolutionPrinterPass::run(Function &F, AnalysisManager<Function> *AM) {
- AM->getResult<ScalarEvolutionAnalysis>(F).print(OS);
+ScalarEvolutionPrinterPass::run(Function &F, AnalysisManager<Function> &AM) {
+ AM.getResult<ScalarEvolutionAnalysis>(F).print(OS);
return PreservedAnalyses::all();
}
@@ -9590,36 +10081,121 @@ ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS,
return Eq;
}
+const SCEVPredicate *ScalarEvolution::getWrapPredicate(
+ const SCEVAddRecExpr *AR,
+ SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
+ FoldingSetNodeID ID;
+ // Unique this node based on the arguments
+ ID.AddInteger(SCEVPredicate::P_Wrap);
+ ID.AddPointer(AR);
+ ID.AddInteger(AddedFlags);
+ void *IP = nullptr;
+ if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP))
+ return S;
+ auto *OF = new (SCEVAllocator)
+ SCEVWrapPredicate(ID.Intern(SCEVAllocator), AR, AddedFlags);
+ UniquePreds.InsertNode(OF, IP);
+ return OF;
+}
+
namespace {
+
class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
public:
- static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE,
- SCEVUnionPredicate &A) {
- SCEVPredicateRewriter Rewriter(SE, A);
- return Rewriter.visit(Scev);
+ // Rewrites \p S in the context of a loop L and the predicate A.
+ // If Assume is true, rewrite is free to add further predicates to A
+ // such that the result will be an AddRecExpr.
+ static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ SCEVUnionPredicate &A, bool Assume) {
+ SCEVPredicateRewriter Rewriter(L, SE, A, Assume);
+ return Rewriter.visit(S);
}
- SCEVPredicateRewriter(ScalarEvolution &SE, SCEVUnionPredicate &P)
- : SCEVRewriteVisitor(SE), P(P) {}
+ SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE,
+ SCEVUnionPredicate &P, bool Assume)
+ : SCEVRewriteVisitor(SE), P(P), L(L), Assume(Assume) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
auto ExprPreds = P.getPredicatesForExpr(Expr);
for (auto *Pred : ExprPreds)
- if (const auto *IPred = dyn_cast<const SCEVEqualPredicate>(Pred))
+ if (const auto *IPred = dyn_cast<SCEVEqualPredicate>(Pred))
if (IPred->getLHS() == Expr)
return IPred->getRHS();
return Expr;
}
+ const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+ const SCEV *Operand = visit(Expr->getOperand());
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
+ if (AR && AR->getLoop() == L && AR->isAffine()) {
+ // This couldn't be folded because the operand didn't have the nuw
+ // flag. Add the nusw flag as an assumption that we could make.
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ Type *Ty = Expr->getType();
+ if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNUSW))
+ return SE.getAddRecExpr(SE.getZeroExtendExpr(AR->getStart(), Ty),
+ SE.getSignExtendExpr(Step, Ty), L,
+ AR->getNoWrapFlags());
+ }
+ return SE.getZeroExtendExpr(Operand, Expr->getType());
+ }
+
+ const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+ const SCEV *Operand = visit(Expr->getOperand());
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
+ if (AR && AR->getLoop() == L && AR->isAffine()) {
+ // This couldn't be folded because the operand didn't have the nsw
+ // flag. Add the nssw flag as an assumption that we could make.
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ Type *Ty = Expr->getType();
+ if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNSSW))
+ return SE.getAddRecExpr(SE.getSignExtendExpr(AR->getStart(), Ty),
+ SE.getSignExtendExpr(Step, Ty), L,
+ AR->getNoWrapFlags());
+ }
+ return SE.getSignExtendExpr(Operand, Expr->getType());
+ }
+
private:
+ bool addOverflowAssumption(const SCEVAddRecExpr *AR,
+ SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
+ auto *A = SE.getWrapPredicate(AR, AddedFlags);
+ if (!Assume) {
+ // Check if we've already made this assumption.
+ if (P.implies(A))
+ return true;
+ return false;
+ }
+ P.add(A);
+ return true;
+ }
+
SCEVUnionPredicate &P;
+ const Loop *L;
+ bool Assume;
};
} // end anonymous namespace
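A worked example of the rewrite performed by the visitor above (informal; the IV, loop and value names are invented, and the predicate text follows the SCEVWrapPredicate::print format added later in this patch): suppose %i has the i32 SCEV {0,+,%step}<%loop> with no <nuw> flag, and a caller asks for (zext i32 {0,+,%step}<%loop> to i64) with Assume set. visitZeroExtendExpr cannot fold the zext directly, so addOverflowAssumption records

    {0,+,%step}<%loop> Added Flags: <nusw>

in the union predicate, and the rewrite returns the i64 recurrence

    {0,+,(sext i32 %step to i64)}<%loop>

which equals the original expression only in executions where that no-unsigned-self-wrap assumption holds; this accumulated predicate is what convertSCEVToAddRecWithPredicates hands back to its caller for runtime checking.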
-const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *Scev,
+const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L,
+ SCEVUnionPredicate &Preds) {
+ return SCEVPredicateRewriter::rewrite(S, L, *this, Preds, false);
+}
+
+const SCEVAddRecExpr *
+ScalarEvolution::convertSCEVToAddRecWithPredicates(const SCEV *S, const Loop *L,
SCEVUnionPredicate &Preds) {
- return SCEVPredicateRewriter::rewrite(Scev, *this, Preds);
+ SCEVUnionPredicate TransformPreds;
+ S = SCEVPredicateRewriter::rewrite(S, L, *this, TransformPreds, true);
+ auto *AddRec = dyn_cast<SCEVAddRecExpr>(S);
+
+ if (!AddRec)
+ return nullptr;
+
+ // Since the transformation was successful, we can now transfer the SCEV
+ // predicates.
+ Preds.add(&TransformPreds);
+ return AddRec;
}
/// SCEV predicates
@@ -9633,7 +10209,7 @@ SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
: SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {}
bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
- const auto *Op = dyn_cast<const SCEVEqualPredicate>(N);
+ const auto *Op = dyn_cast<SCEVEqualPredicate>(N);
if (!Op)
return false;
@@ -9649,6 +10225,59 @@ void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const {
OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n";
}
+SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID,
+ const SCEVAddRecExpr *AR,
+ IncrementWrapFlags Flags)
+ : SCEVPredicate(ID, P_Wrap), AR(AR), Flags(Flags) {}
+
+const SCEV *SCEVWrapPredicate::getExpr() const { return AR; }
+
+bool SCEVWrapPredicate::implies(const SCEVPredicate *N) const {
+ const auto *Op = dyn_cast<SCEVWrapPredicate>(N);
+
+ return Op && Op->AR == AR && setFlags(Flags, Op->Flags) == Flags;
+}
+
+bool SCEVWrapPredicate::isAlwaysTrue() const {
+ SCEV::NoWrapFlags ScevFlags = AR->getNoWrapFlags();
+ IncrementWrapFlags IFlags = Flags;
+
+ if (ScalarEvolution::setFlags(ScevFlags, SCEV::FlagNSW) == ScevFlags)
+ IFlags = clearFlags(IFlags, IncrementNSSW);
+
+ return IFlags == IncrementAnyWrap;
+}
+
+void SCEVWrapPredicate::print(raw_ostream &OS, unsigned Depth) const {
+ OS.indent(Depth) << *getExpr() << " Added Flags: ";
+ if (SCEVWrapPredicate::IncrementNUSW & getFlags())
+ OS << "<nusw>";
+ if (SCEVWrapPredicate::IncrementNSSW & getFlags())
+ OS << "<nssw>";
+ OS << "\n";
+}
+
+SCEVWrapPredicate::IncrementWrapFlags
+SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR,
+ ScalarEvolution &SE) {
+ IncrementWrapFlags ImpliedFlags = IncrementAnyWrap;
+ SCEV::NoWrapFlags StaticFlags = AR->getNoWrapFlags();
+
+ // We can safely transfer the NSW flag as NSSW.
+ if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNSW) == StaticFlags)
+ ImpliedFlags = IncrementNSSW;
+
+ if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNUW) == StaticFlags) {
+ // If the increment is positive, the SCEV NUW flag will also imply the
+ // WrapPredicate NUSW flag.
+ if (const auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+ if (Step->getValue()->getValue().isNonNegative())
+ ImpliedFlags = setFlags(ImpliedFlags, IncrementNUSW);
+ }
+
+ return ImpliedFlags;
+}
+
/// Union predicates don't get cached so create a dummy set ID for it.
SCEVUnionPredicate::SCEVUnionPredicate()
: SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {}
@@ -9667,7 +10296,7 @@ SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) {
}
bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
- if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N))
+ if (const auto *Set = dyn_cast<SCEVUnionPredicate>(N))
return all_of(Set->Preds,
[this](const SCEVPredicate *I) { return this->implies(I); });
@@ -9688,7 +10317,7 @@ void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const {
}
void SCEVUnionPredicate::add(const SCEVPredicate *N) {
- if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N)) {
+ if (const auto *Set = dyn_cast<SCEVUnionPredicate>(N)) {
for (auto Pred : Set->Preds)
add(Pred);
return;
@@ -9705,8 +10334,9 @@ void SCEVUnionPredicate::add(const SCEVPredicate *N) {
Preds.push_back(N);
}
-PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE)
- : SE(SE), Generation(0) {}
+PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE,
+ Loop &L)
+ : SE(SE), L(L), Generation(0), BackedgeCount(nullptr) {}
const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
const SCEV *Expr = SE.getSCEV(V);
@@ -9721,12 +10351,21 @@ const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
if (Entry.second)
Expr = Entry.second;
- const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, Preds);
+ const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, Preds);
Entry = {Generation, NewSCEV};
return NewSCEV;
}
+const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
+ if (!BackedgeCount) {
+ SCEVUnionPredicate BackedgePred;
+ BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, BackedgePred);
+ addPredicate(BackedgePred);
+ }
+ return BackedgeCount;
+}
+
void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
if (Preds.implies(&Pred))
return;
@@ -9743,7 +10382,82 @@ void PredicatedScalarEvolution::updateGeneration() {
if (++Generation == 0) {
for (auto &II : RewriteMap) {
const SCEV *Rewritten = II.second.second;
- II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, Preds)};
+ II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, Preds)};
}
}
}
+
+void PredicatedScalarEvolution::setNoOverflow(
+ Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) {
+ const SCEV *Expr = getSCEV(V);
+ const auto *AR = cast<SCEVAddRecExpr>(Expr);
+
+ auto ImpliedFlags = SCEVWrapPredicate::getImpliedFlags(AR, SE);
+
+ // Clear the statically implied flags.
+ Flags = SCEVWrapPredicate::clearFlags(Flags, ImpliedFlags);
+ addPredicate(*SE.getWrapPredicate(AR, Flags));
+
+ auto II = FlagsMap.insert({V, Flags});
+ if (!II.second)
+ II.first->second = SCEVWrapPredicate::setFlags(Flags, II.first->second);
+}
+
+bool PredicatedScalarEvolution::hasNoOverflow(
+ Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) {
+ const SCEV *Expr = getSCEV(V);
+ const auto *AR = cast<SCEVAddRecExpr>(Expr);
+
+ Flags = SCEVWrapPredicate::clearFlags(
+ Flags, SCEVWrapPredicate::getImpliedFlags(AR, SE));
+
+ auto II = FlagsMap.find(V);
+
+ if (II != FlagsMap.end())
+ Flags = SCEVWrapPredicate::clearFlags(Flags, II->second);
+
+ return Flags == SCEVWrapPredicate::IncrementAnyWrap;
+}
+
+const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) {
+ const SCEV *Expr = this->getSCEV(V);
+ auto *New = SE.convertSCEVToAddRecWithPredicates(Expr, &L, Preds);
+
+ if (!New)
+ return nullptr;
+
+ updateGeneration();
+ RewriteMap[SE.getSCEV(V)] = {Generation, New};
+ return New;
+}
+
+PredicatedScalarEvolution::PredicatedScalarEvolution(
+ const PredicatedScalarEvolution &Init)
+ : RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), Preds(Init.Preds),
+ Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) {
+ for (const auto &I : Init.FlagsMap)
+ FlagsMap.insert(I);
+}
+
+void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
+ // For each block.
+ for (auto *BB : L.getBlocks())
+ for (auto &I : *BB) {
+ if (!SE.isSCEVable(I.getType()))
+ continue;
+
+ auto *Expr = SE.getSCEV(&I);
+ auto II = RewriteMap.find(Expr);
+
+ if (II == RewriteMap.end())
+ continue;
+
+ // Don't print things that are not interesting.
+ if (II->second.second == Expr)
+ continue;
+
+ OS.indent(Depth) << "[PSE]" << I << ":\n";
+ OS.indent(Depth + 2) << *Expr << "\n";
+ OS.indent(Depth + 2) << "--> " << *II->second.second << "\n";
+ }
+}
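Taken together, the members added above give PredicatedScalarEvolution a small versioning interface. A hedged sketch of a typical client (not from the patch; `SE`, `L`, and `Ptr` are assumed to be supplied by the caller):

  PredicatedScalarEvolution PSE(SE, L);
  // Backedge-taken count of L; any predicates it requires accumulate in PSE.
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  // Try to view Ptr as an add recurrence, adding predicates when needed.
  if (const SCEVAddRecExpr *AR = PSE.getAsAddRec(Ptr)) {
    // Record that the client will guarantee no unsigned self-wrap at runtime.
    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
    assert(PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW));
    (void)AR; (void)BTC;
  }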
diff --git a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
index 2e50c80c4e73..61fb411d3150 100644
--- a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
+++ b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
@@ -20,7 +20,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;
AliasResult SCEVAAResult::alias(const MemoryLocation &LocA,
@@ -111,18 +110,16 @@ Value *SCEVAAResult::GetBaseValue(const SCEV *S) {
return nullptr;
}
-SCEVAAResult SCEVAA::run(Function &F, AnalysisManager<Function> *AM) {
- return SCEVAAResult(AM->getResult<TargetLibraryAnalysis>(F),
- AM->getResult<ScalarEvolutionAnalysis>(F));
-}
-
char SCEVAA::PassID;
+SCEVAAResult SCEVAA::run(Function &F, AnalysisManager<Function> &AM) {
+ return SCEVAAResult(AM.getResult<ScalarEvolutionAnalysis>(F));
+}
+
char SCEVAAWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(SCEVAAWrapperPass, "scev-aa",
"ScalarEvolution-based Alias Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(SCEVAAWrapperPass, "scev-aa",
"ScalarEvolution-based Alias Analysis", false, true)
@@ -136,13 +133,11 @@ SCEVAAWrapperPass::SCEVAAWrapperPass() : FunctionPass(ID) {
bool SCEVAAWrapperPass::runOnFunction(Function &F) {
Result.reset(
- new SCEVAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- getAnalysis<ScalarEvolutionWrapperPass>().getSE()));
+ new SCEVAAResult(getAnalysis<ScalarEvolutionWrapperPass>().getSE()));
return false;
}
void SCEVAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
}
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 921403ddc0fd..77e4ec7ab40c 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -1,4 +1,4 @@
-//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis --*- C++ -*-===//
+//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -95,14 +95,12 @@ static BasicBlock::iterator findInsertPointAfter(Instruction *I,
while (isa<PHINode>(IP))
++IP;
- while (IP->isEHPad()) {
- if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
- ++IP;
- } else if (isa<CatchSwitchInst>(IP)) {
- IP = MustDominate->getFirstInsertionPt();
- } else {
- llvm_unreachable("unexpected eh pad!");
- }
+ if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
+ ++IP;
+ } else if (isa<CatchSwitchInst>(IP)) {
+ IP = MustDominate->getFirstInsertionPt();
+ } else {
+ assert(!IP->isEHPad() && "unexpected eh pad!");
}
return IP;
@@ -198,7 +196,7 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
// Save the original insertion point so we can restore it when we're done.
DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
// Move the insertion point out of as many loops as we can.
while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
@@ -525,7 +523,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
}
// Save the original insertion point so we can restore it when we're done.
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
// Move the insertion point out of as many loops as we can.
while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
@@ -544,39 +542,37 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
return GEP;
}
- // Save the original insertion point so we can restore it when we're done.
- BuilderType::InsertPoint SaveInsertPt = Builder.saveIP();
+ {
+ SCEVInsertPointGuard Guard(Builder, this);
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(V)) break;
-
- bool AnyIndexNotLoopInvariant =
- std::any_of(GepIndices.begin(), GepIndices.end(),
- [L](Value *Op) { return !L->isLoopInvariant(Op); });
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(V)) break;
- if (AnyIndexNotLoopInvariant)
- break;
+ bool AnyIndexNotLoopInvariant =
+ std::any_of(GepIndices.begin(), GepIndices.end(),
+ [L](Value *Op) { return !L->isLoopInvariant(Op); });
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
+ if (AnyIndexNotLoopInvariant)
+ break;
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
- // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
- // because ScalarEvolution may have changed the address arithmetic to
- // compute a value which is beyond the end of the allocated object.
- Value *Casted = V;
- if (V->getType() != PTy)
- Casted = InsertNoopCastOfTo(Casted, PTy);
- Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
- Ops.push_back(SE.getUnknown(GEP));
- rememberInstruction(GEP);
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
- // Restore the original insert point.
- Builder.restoreIP(SaveInsertPt);
+ // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
+ // because ScalarEvolution may have changed the address arithmetic to
+ // compute a value which is beyond the end of the allocated object.
+ Value *Casted = V;
+ if (V->getType() != PTy)
+ Casted = InsertNoopCastOfTo(Casted, PTy);
+ Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
+ Ops.push_back(SE.getUnknown(GEP));
+ rememberInstruction(GEP);
+ }
return expand(SE.getAddExpr(Ops));
}
@@ -907,6 +903,23 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
}
}
+/// If the insert point of the current builder or any of the builders on the
+/// stack of saved builders has 'I' as its insert point, update it to point to
+/// the instruction after 'I'. This is intended to be used when the instruction
+/// 'I' is being moved. If this fixup is not done and 'I' is moved to a
+/// different block, the inconsistent insert point (with a mismatched
+/// Instruction and Block) can lead to an instruction being inserted in a block
+/// other than its parent.
+void SCEVExpander::fixupInsertPoints(Instruction *I) {
+ BasicBlock::iterator It(*I);
+ BasicBlock::iterator NewInsertPt = std::next(It);
+ if (Builder.GetInsertPoint() == It)
+ Builder.SetInsertPoint(&*NewInsertPt);
+ for (auto *InsertPtGuard : InsertPointGuards)
+ if (InsertPtGuard->GetInsertPoint() == It)
+ InsertPtGuard->SetInsertPoint(NewInsertPt);
+}
+
/// hoistStep - Attempt to hoist a simple IV increment above InsertPos to make
/// it available to other uses in this loop. Recursively hoist any operands,
/// until we reach a value that dominates InsertPos.
@@ -936,6 +949,7 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
break;
}
for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
+ fixupInsertPoints(*I);
(*I)->moveBefore(InsertPos);
}
return true;
@@ -989,13 +1003,14 @@ Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
/// \brief Hoist the addrec instruction chain rooted in the loop phi above the
/// position. This routine assumes that this is possible (has been checked).
-static void hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
- Instruction *Pos, PHINode *LoopPhi) {
+void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
+ Instruction *Pos, PHINode *LoopPhi) {
do {
if (DT->dominates(InstToHoist, Pos))
break;
// Make sure the increment is where we want it. But don't move it
// down past a potential existing post-inc user.
+ fixupInsertPoints(InstToHoist);
InstToHoist->moveBefore(Pos);
Pos = InstToHoist;
InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
@@ -1156,7 +1171,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
}
// Save the original insertion point so we can restore it when we're done.
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
// Another AddRec may need to be recursively expanded below. For example, if
// this AddRec is quadratic, the StepV may itself be an AddRec in this
@@ -1273,6 +1288,13 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
if (!SE.dominates(Step, L->getHeader())) {
PostLoopScale = Step;
Step = SE.getConstant(Normalized->getType(), 1);
+ if (!Start->isZero()) {
+ // The normalization below assumes that Start is constant zero, so if
+ // it isn't re-associate Start to PostLoopOffset.
+ assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?");
+ PostLoopOffset = Start;
+ Start = SE.getConstant(Normalized->getType(), 0);
+ }
Normalized =
cast<SCEVAddRecExpr>(SE.getAddRecExpr(
Start, Step, Normalized->getLoop(),
@@ -1321,7 +1343,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
Value *StepV;
{
// Expand the step somewhere that dominates the loop header.
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front());
}
Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
@@ -1428,8 +1450,12 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
}
// Just do a normal add. Pre-expand the operands to suppress folding.
- return expand(SE.getAddExpr(SE.getUnknown(expand(S->getStart())),
- SE.getUnknown(expand(Rest))));
+ //
+ // The LHS and RHS values are factored out of the expand call to make the
+ // output independent of the argument evaluation order.
+ const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart()));
+ const SCEV *AddExprRHS = SE.getUnknown(expand(Rest));
+ return expand(SE.getAddExpr(AddExprLHS, AddExprRHS));
}
// If we don't yet have a canonical IV, create one.
@@ -1600,6 +1626,40 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) {
return V;
}
+Value *SCEVExpander::FindValueInExprValueMap(const SCEV *S,
+ const Instruction *InsertPt) {
+ SetVector<Value *> *Set = SE.getSCEVValues(S);
+ // If the expansion is not in CanonicalMode, and the SCEV contains any
+ // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally.
+ if (CanonicalMode || !SE.containsAddRecurrence(S)) {
+ // If S is scConstant, it may be worse to reuse an existing Value.
+ if (S->getSCEVType() != scConstant && Set) {
+ // Choose a Value from the set which dominates the insertPt.
+ // insertPt should be inside the Value's parent loop so as not to break
+ // the LCSSA form.
+ for (auto const &Ent : *Set) {
+ Instruction *EntInst = nullptr;
+ if (Ent && isa<Instruction>(Ent) &&
+ (EntInst = cast<Instruction>(Ent)) &&
+ S->getType() == Ent->getType() &&
+ EntInst->getFunction() == InsertPt->getFunction() &&
+ SE.DT.dominates(EntInst, InsertPt) &&
+ (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
+ SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) {
+ return Ent;
+ }
+ }
+ }
+ }
+ return nullptr;
+}
+
+// The expansion of SCEV will either reuse a previous Value in ExprValueMap,
+// or expand the SCEV literally. Specifically, if the expansion is in LSRMode,
+// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded
+// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise,
+// the expansion will try to reuse Value from ExprValueMap, and only when it
+// fails, expand the SCEV literally.
Value *SCEVExpander::expand(const SCEV *S) {
// Compute an insertion point for this SCEV object. Hoist the instructions
// as far out in the loop nest as possible.
@@ -1622,9 +1682,9 @@ Value *SCEVExpander::expand(const SCEV *S) {
// there) so that it is guaranteed to dominate any user inside the loop.
if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
InsertPt = &*L->getHeader()->getFirstInsertionPt();
- while (InsertPt != Builder.GetInsertPoint()
- && (isInsertedInstruction(InsertPt)
- || isa<DbgInfoIntrinsic>(InsertPt))) {
+ while (InsertPt->getIterator() != Builder.GetInsertPoint() &&
+ (isInsertedInstruction(InsertPt) ||
+ isa<DbgInfoIntrinsic>(InsertPt))) {
InsertPt = &*std::next(InsertPt->getIterator());
}
break;
@@ -1635,11 +1695,14 @@ Value *SCEVExpander::expand(const SCEV *S) {
if (I != InsertedExpressions.end())
return I->second;
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
Builder.SetInsertPoint(InsertPt);
// Expand the expression into instructions.
- Value *V = visit(S);
+ Value *V = FindValueInExprValueMap(S, InsertPt);
+
+ if (!V)
+ V = visit(S);
// Remember the expanded value for this SCEV at this location.
//
@@ -1673,7 +1736,7 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap);
// Emit code for it.
- BuilderType::InsertPointGuard Guard(Builder);
+ SCEVInsertPointGuard Guard(Builder, this);
PHINode *V =
cast<PHINode>(expandCodeFor(H, nullptr, &L->getHeader()->front()));
@@ -1742,8 +1805,8 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
if (!OrigPhiRef) {
OrigPhiRef = Phi;
- if (Phi->getType()->isIntegerTy() && TTI
- && TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+ if (Phi->getType()->isIntegerTy() && TTI &&
+ TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
// This phi can be freely truncated to the narrowest phi type. Map the
// truncated expression to it so it will be reused for narrow types.
const SCEV *TruncExpr =
@@ -1759,56 +1822,59 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
continue;
if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- Instruction *OrigInc =
- cast<Instruction>(OrigPhiRef->getIncomingValueForBlock(LatchBlock));
+ Instruction *OrigInc = dyn_cast<Instruction>(
+ OrigPhiRef->getIncomingValueForBlock(LatchBlock));
Instruction *IsomorphicInc =
- cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
-
- // If this phi has the same width but is more canonical, replace the
- // original with it. As part of the "more canonical" determination,
- // respect a prior decision to use an IV chain.
- if (OrigPhiRef->getType() == Phi->getType()
- && !(ChainedPhis.count(Phi)
- || isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L))
- && (ChainedPhis.count(Phi)
- || isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
- std::swap(OrigPhiRef, Phi);
- std::swap(OrigInc, IsomorphicInc);
- }
- // Replacing the congruent phi is sufficient because acyclic redundancy
- // elimination, CSE/GVN, should handle the rest. However, once SCEV proves
- // that a phi is congruent, it's often the head of an IV user cycle that
- // is isomorphic with the original phi. It's worth eagerly cleaning up the
- // common case of a single IV increment so that DeleteDeadPHIs can remove
- // cycles that had postinc uses.
- const SCEV *TruncExpr = SE.getTruncateOrNoop(SE.getSCEV(OrigInc),
- IsomorphicInc->getType());
- if (OrigInc != IsomorphicInc
- && TruncExpr == SE.getSCEV(IsomorphicInc)
- && ((isa<PHINode>(OrigInc) && isa<PHINode>(IsomorphicInc))
- || hoistIVInc(OrigInc, IsomorphicInc))) {
- DEBUG_WITH_TYPE(DebugType, dbgs()
- << "INDVARS: Eliminated congruent iv.inc: "
- << *IsomorphicInc << '\n');
- Value *NewInc = OrigInc;
- if (OrigInc->getType() != IsomorphicInc->getType()) {
- Instruction *IP = nullptr;
- if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
- IP = &*PN->getParent()->getFirstInsertionPt();
- else
- IP = OrigInc->getNextNode();
-
- IRBuilder<> Builder(IP);
- Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
- NewInc = Builder.
- CreateTruncOrBitCast(OrigInc, IsomorphicInc->getType(), IVName);
+ dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
+
+ if (OrigInc && IsomorphicInc) {
+ // If this phi has the same width but is more canonical, replace the
+ // original with it. As part of the "more canonical" determination,
+ // respect a prior decision to use an IV chain.
+ if (OrigPhiRef->getType() == Phi->getType() &&
+ !(ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) &&
+ (ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
+ std::swap(OrigPhiRef, Phi);
+ std::swap(OrigInc, IsomorphicInc);
+ }
+ // Replacing the congruent phi is sufficient because acyclic
+ // redundancy elimination, CSE/GVN, should handle the
+ // rest. However, once SCEV proves that a phi is congruent,
+ // it's often the head of an IV user cycle that is isomorphic
+ // with the original phi. It's worth eagerly cleaning up the
+ // common case of a single IV increment so that DeleteDeadPHIs
+ // can remove cycles that had postinc uses.
+ const SCEV *TruncExpr =
+ SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
+ if (OrigInc != IsomorphicInc &&
+ TruncExpr == SE.getSCEV(IsomorphicInc) &&
+ SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) &&
+ hoistIVInc(OrigInc, IsomorphicInc)) {
+ DEBUG_WITH_TYPE(DebugType,
+ dbgs() << "INDVARS: Eliminated congruent iv.inc: "
+ << *IsomorphicInc << '\n');
+ Value *NewInc = OrigInc;
+ if (OrigInc->getType() != IsomorphicInc->getType()) {
+ Instruction *IP = nullptr;
+ if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
+ IP = &*PN->getParent()->getFirstInsertionPt();
+ else
+ IP = OrigInc->getNextNode();
+
+ IRBuilder<> Builder(IP);
+ Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
+ NewInc = Builder.CreateTruncOrBitCast(
+ OrigInc, IsomorphicInc->getType(), IVName);
+ }
+ IsomorphicInc->replaceAllUsesWith(NewInc);
+ DeadInsts.emplace_back(IsomorphicInc);
}
- IsomorphicInc->replaceAllUsesWith(NewInc);
- DeadInsts.emplace_back(IsomorphicInc);
}
}
- DEBUG_WITH_TYPE(DebugType, dbgs()
- << "INDVARS: Eliminated congruent iv: " << *Phi << '\n');
+ DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
+ << *Phi << '\n');
++NumElim;
Value *NewIV = OrigPhiRef;
if (OrigPhiRef->getType() != Phi->getType()) {
@@ -1847,6 +1913,11 @@ Value *SCEVExpander::findExistingExpansion(const SCEV *S,
return RHS;
}
+ // Use expand's logic which is used for reusing a previous Value in
+ // ExprValueMap.
+ if (Value *Val = FindValueInExprValueMap(S, At))
+ return Val;
+
// There is potential to make this significantly smarter, but this simple
// heuristic already gets some interesting cases.
@@ -1940,6 +2011,10 @@ Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
case SCEVPredicate::P_Equal:
return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
+ case SCEVPredicate::P_Wrap: {
+ auto *AddRecPred = cast<SCEVWrapPredicate>(Pred);
+ return expandWrapPredicate(AddRecPred, IP);
+ }
}
llvm_unreachable("Unknown SCEV predicate type");
}
@@ -1954,6 +2029,116 @@ Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
return I;
}
+Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
+ Instruction *Loc, bool Signed) {
+ assert(AR->isAffine() && "Cannot generate RT check for "
+ "non-affine expression");
+
+ SCEVUnionPredicate Pred;
+ const SCEV *ExitCount =
+ SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
+
+ assert(ExitCount != SE.getCouldNotCompute() && "Invalid loop count");
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *Start = AR->getStart();
+
+ unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
+ unsigned DstBits = SE.getTypeSizeInBits(AR->getType());
+
+ // The expression {Start,+,Step} has nusw/nssw if
+ // Step < 0, Start - |Step| * Backedge <= Start
+ // Step >= 0, Start + |Step| * Backedge > Start
+ // and |Step| * Backedge doesn't unsigned overflow.
+
+ IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
+ Builder.SetInsertPoint(Loc);
+ Value *TripCountVal = expandCodeFor(ExitCount, CountTy, Loc);
+
+ IntegerType *Ty =
+ IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(AR->getType()));
+
+ Value *StepValue = expandCodeFor(Step, Ty, Loc);
+ Value *NegStepValue = expandCodeFor(SE.getNegativeSCEV(Step), Ty, Loc);
+ Value *StartValue = expandCodeFor(Start, Ty, Loc);
+
+ ConstantInt *Zero =
+ ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
+
+ Builder.SetInsertPoint(Loc);
+ // Compute |Step|
+ Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
+ Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+
+ // Compute |Step| * Backedge
+ CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+
+ // Compute:
+ // Start + |Step| * Backedge < Start
+ // Start - |Step| * Backedge > Start
+ Value *Add = Builder.CreateAdd(StartValue, MulV);
+ Value *Sub = Builder.CreateSub(StartValue, MulV);
+
+ Value *EndCompareGT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+
+ Value *EndCompareLT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+
+ // Select the answer based on the sign of Step.
+ Value *EndCheck =
+ Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+
+ // If the backedge taken count type is larger than the AR type,
+ // check that we don't drop any bits by truncating it. If we are
+ // dropping bits, then we have overflow (unless the step is zero).
+ if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
+ auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
+ auto *BackedgeCheck =
+ Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal,
+ ConstantInt::get(Loc->getContext(), MaxVal));
+ BackedgeCheck = Builder.CreateAnd(
+ BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero));
+
+ EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
+ }
+
+ EndCheck = Builder.CreateOr(EndCheck, OfMul);
+ return EndCheck;
+}
+
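To make the shape of the emitted guard easier to follow, here is a hedged scalar model of it (not part of the patch) for an add recurrence over i8 with an i32 backedge-taken count; every name below is hypothetical:

  #include <cstdint>
  // Mirrors generateOverflowCheck: returns true when the assumed nusw
  // (Signed == false) or nssw (Signed == true) property of {Start,+,Step}
  // may be violated.
  bool wrapCheckModel(int8_t Start, int8_t Step, uint32_t BTC, bool Signed) {
    uint8_t AbsStep = Step < 0 ? uint8_t(-Step) : uint8_t(Step); // |Step|
    uint8_t TruncBTC = uint8_t(BTC);              // zext-or-trunc to the AR type
    unsigned Wide = unsigned(AbsStep) * unsigned(TruncBTC);
    uint8_t MulV = uint8_t(Wide);                 // umul.with.overflow result
    bool OfMul = Wide > 0xFF;                     // umul.with.overflow flag
    uint8_t Add = uint8_t(uint8_t(Start) + MulV); // Start + |Step| * BTC
    uint8_t Sub = uint8_t(uint8_t(Start) - MulV); // Start - |Step| * BTC
    bool Wrapped = Step < 0
        ? (Signed ? int8_t(Sub) > Start : Sub > uint8_t(Start))
        : (Signed ? int8_t(Add) < Start : Add < uint8_t(Start));
    bool LostBits = BTC > 0xFF && Step != 0;      // trip count wider than i8
    return Wrapped || OfMul || LostBits;
  }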
+Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
+ Instruction *IP) {
+ const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr());
+ Value *NSSWCheck = nullptr, *NUSWCheck = nullptr;
+
+ // Add a check for NUSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW)
+ NUSWCheck = generateOverflowCheck(A, IP, false);
+
+ // Add a check for NSSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW)
+ NSSWCheck = generateOverflowCheck(A, IP, true);
+
+ if (NUSWCheck && NSSWCheck)
+ return Builder.CreateOr(NUSWCheck, NSSWCheck);
+
+ if (NUSWCheck)
+ return NUSWCheck;
+
+ if (NSSWCheck)
+ return NSSWCheck;
+
+ return ConstantInt::getFalse(IP->getContext());
+}
+
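A brief hedged sketch (not in the patch) of how a client turns one of these wrap predicates into an actual runtime guard; `Exp` is assumed to be an already configured SCEVExpander, `AR` an affine add recurrence, and `Preheader` the loop preheader:

  const SCEVPredicate *P =
      SE.getWrapPredicate(AR, SCEVWrapPredicate::IncrementNUSW);
  Value *Failed = Exp.expandCodeForPredicate(P, Preheader->getTerminator());
  // 'Failed' is an i1 that is true when the assumed no-unsigned-self-wrap
  // property may not hold; callers typically branch to a fallback loop on it.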
Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
Instruction *IP) {
auto *BoolType = IntegerType::get(IP->getContext(), 1);
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index b7fd5d506175..c1f9503816ee 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -1,4 +1,4 @@
-//===- ScalarEvolutionNormalization.cpp - See below -------------*- C++ -*-===//
+//===- ScalarEvolutionNormalization.cpp - See below -----------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Analysis/ScopedNoAliasAA.cpp b/lib/Analysis/ScopedNoAliasAA.cpp
index 486f3a583284..82e65a1f2088 100644
--- a/lib/Analysis/ScopedNoAliasAA.cpp
+++ b/lib/Analysis/ScopedNoAliasAA.cpp
@@ -34,7 +34,6 @@
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -51,9 +50,9 @@ static cl::opt<bool> EnableScopedNoAlias("enable-scoped-noalias",
cl::init(true));
namespace {
-/// AliasScopeNode - This is a simple wrapper around an MDNode which provides
-/// a higher-level interface by hiding the details of how alias analysis
-/// information is encoded in its operands.
+/// This is a simple wrapper around an MDNode which provides a higher-level
+/// interface by hiding the details of how alias analysis information is encoded
+/// in its operands.
class AliasScopeNode {
const MDNode *Node;
@@ -61,10 +60,10 @@ public:
AliasScopeNode() : Node(nullptr) {}
explicit AliasScopeNode(const MDNode *N) : Node(N) {}
- /// getNode - Get the MDNode for this AliasScopeNode.
+ /// Get the MDNode for this AliasScopeNode.
const MDNode *getNode() const { return Node; }
- /// getDomain - Get the MDNode for this AliasScopeNode's domain.
+ /// Get the MDNode for this AliasScopeNode's domain.
const MDNode *getDomain() const {
if (Node->getNumOperands() < 2)
return nullptr;
@@ -131,8 +130,8 @@ ModRefInfo ScopedNoAliasAAResult::getModRefInfo(ImmutableCallSite CS1,
void ScopedNoAliasAAResult::collectMDInDomain(
const MDNode *List, const MDNode *Domain,
SmallPtrSetImpl<const MDNode *> &Nodes) const {
- for (unsigned i = 0, ie = List->getNumOperands(); i != ie; ++i)
- if (const MDNode *MD = dyn_cast<MDNode>(List->getOperand(i)))
+ for (const MDOperand &MDOp : List->operands())
+ if (const MDNode *MD = dyn_cast<MDNode>(MDOp))
if (AliasScopeNode(MD).getDomain() == Domain)
Nodes.insert(MD);
}
@@ -144,8 +143,8 @@ bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes,
// Collect the set of scope domains relevant to the noalias scopes.
SmallPtrSet<const MDNode *, 16> Domains;
- for (unsigned i = 0, ie = NoAlias->getNumOperands(); i != ie; ++i)
- if (const MDNode *NAMD = dyn_cast<MDNode>(NoAlias->getOperand(i)))
+ for (const MDOperand &MDOp : NoAlias->operands())
+ if (const MDNode *NAMD = dyn_cast<MDNode>(MDOp))
if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain())
Domains.insert(Domain);
@@ -173,19 +172,16 @@ bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes,
return true;
}
+char ScopedNoAliasAA::PassID;
+
ScopedNoAliasAAResult ScopedNoAliasAA::run(Function &F,
- AnalysisManager<Function> *AM) {
- return ScopedNoAliasAAResult(AM->getResult<TargetLibraryAnalysis>(F));
+ AnalysisManager<Function> &AM) {
+ return ScopedNoAliasAAResult();
}
-char ScopedNoAliasAA::PassID;
-
char ScopedNoAliasAAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ScopedNoAliasAAWrapperPass, "scoped-noalias",
- "Scoped NoAlias Alias Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ScopedNoAliasAAWrapperPass, "scoped-noalias",
- "Scoped NoAlias Alias Analysis", false, true)
+INITIALIZE_PASS(ScopedNoAliasAAWrapperPass, "scoped-noalias",
+ "Scoped NoAlias Alias Analysis", false, true)
ImmutablePass *llvm::createScopedNoAliasAAWrapperPass() {
return new ScopedNoAliasAAWrapperPass();
@@ -196,8 +192,7 @@ ScopedNoAliasAAWrapperPass::ScopedNoAliasAAWrapperPass() : ImmutablePass(ID) {
}
bool ScopedNoAliasAAWrapperPass::doInitialization(Module &M) {
- Result.reset(new ScopedNoAliasAAResult(
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()));
+ Result.reset(new ScopedNoAliasAAResult());
return false;
}
@@ -208,5 +203,4 @@ bool ScopedNoAliasAAWrapperPass::doFinalization(Module &M) {
void ScopedNoAliasAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
}
diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp
index f5a927b80525..79dc84e25533 100644
--- a/lib/Analysis/SparsePropagation.cpp
+++ b/lib/Analysis/SparsePropagation.cpp
@@ -320,8 +320,8 @@ void SparseSolver::Solve(Function &F) {
// Notify all instructions in this basic block that they are newly
// executable.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- visitInst(*I);
+ for (Instruction &I : *BB)
+ visitInst(I);
}
}
}
diff --git a/lib/Analysis/StratifiedSets.h b/lib/Analysis/StratifiedSets.h
index fd3fbc0d86ad..fd3a241d79c1 100644
--- a/lib/Analysis/StratifiedSets.h
+++ b/lib/Analysis/StratifiedSets.h
@@ -10,60 +10,49 @@
#ifndef LLVM_ADT_STRATIFIEDSETS_H
#define LLVM_ADT_STRATIFIEDSETS_H
+#include "AliasAnalysisSummary.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
#include <bitset>
#include <cassert>
#include <cmath>
-#include <limits>
#include <type_traits>
#include <utility>
#include <vector>
namespace llvm {
-// \brief An index into Stratified Sets.
+namespace cflaa {
+/// An index into Stratified Sets.
typedef unsigned StratifiedIndex;
-// NOTE: ^ This can't be a short -- bootstrapping clang has a case where
-// ~1M sets exist.
+/// NOTE: ^ This can't be a short -- bootstrapping clang has a case where
+/// ~1M sets exist.
// \brief Container of information related to a value in a StratifiedSet.
struct StratifiedInfo {
StratifiedIndex Index;
- // For field sensitivity, etc. we can tack attributes on to this struct.
+ /// For field sensitivity, etc. we can tack fields on here.
};
-// The number of attributes that StratifiedAttrs should contain. Attributes are
-// described below, and 32 was an arbitrary choice because it fits nicely in 32
-// bits (because we use a bitset for StratifiedAttrs).
-static const unsigned NumStratifiedAttrs = 32;
-
-// These are attributes that the users of StratifiedSets/StratifiedSetBuilders
-// may use for various purposes. These also have the special property of that
-// they are merged down. So, if set A is above set B, and one decides to set an
-// attribute in set A, then the attribute will automatically be set in set B.
-typedef std::bitset<NumStratifiedAttrs> StratifiedAttrs;
-
-// \brief A "link" between two StratifiedSets.
+/// A "link" between two StratifiedSets.
struct StratifiedLink {
- // \brief This is a value used to signify "does not exist" where
- // the StratifiedIndex type is used. This is used instead of
- // Optional<StratifiedIndex> because Optional<StratifiedIndex> would
- // eat up a considerable amount of extra memory, after struct
- // padding/alignment is taken into account.
+ /// \brief This is a value used to signify "does not exist" where the
+ /// StratifiedIndex type is used.
+ ///
+ /// This is used instead of Optional<StratifiedIndex> because
+ /// Optional<StratifiedIndex> would eat up a considerable amount of extra
+ /// memory, after struct padding/alignment is taken into account.
static const StratifiedIndex SetSentinel;
- // \brief The index for the set "above" current
+ /// The index for the set "above" current
StratifiedIndex Above;
- // \brief The link for the set "below" current
+ /// The link for the set "below" current
StratifiedIndex Below;
- // \brief Attributes for these StratifiedSets.
- StratifiedAttrs Attrs;
+ /// Attributes for these StratifiedSets.
+ AliasAttrs Attrs;
StratifiedLink() : Above(SetSentinel), Below(SetSentinel) {}
@@ -74,46 +63,48 @@ struct StratifiedLink {
void clearAbove() { Above = SetSentinel; }
};
-// \brief These are stratified sets, as described in "Fast algorithms for
-// Dyck-CFL-reachability with applications to Alias Analysis" by Zhang Q, Lyu M
-// R, Yuan H, and Su Z. -- in short, this is meant to represent different sets
-// of Value*s. If two Value*s are in the same set, or if both sets have
-// overlapping attributes, then the Value*s are said to alias.
-//
-// Sets may be related by position, meaning that one set may be considered as
-// above or below another. In CFL Alias Analysis, this gives us an indication
-// of how two variables are related; if the set of variable A is below a set
-// containing variable B, then at some point, a variable that has interacted
-// with B (or B itself) was either used in order to extract the variable A, or
-// was used as storage of variable A.
-//
-// Sets may also have attributes (as noted above). These attributes are
-// generally used for noting whether a variable in the set has interacted with
-// a variable whose origins we don't quite know (i.e. globals/arguments), or if
-// the variable may have had operations performed on it (modified in a function
-// call). All attributes that exist in a set A must exist in all sets marked as
-// below set A.
+/// \brief These are stratified sets, as described in "Fast algorithms for
+/// Dyck-CFL-reachability with applications to Alias Analysis" by Zhang Q, Lyu M
+/// R, Yuan H, and Su Z. -- in short, this is meant to represent different sets
+/// of Value*s. If two Value*s are in the same set, or if both sets have
+/// overlapping attributes, then the Value*s are said to alias.
+///
+/// Sets may be related by position, meaning that one set may be considered as
+/// above or below another. In CFL Alias Analysis, this gives us an indication
+/// of how two variables are related; if the set of variable A is below a set
+/// containing variable B, then at some point, a variable that has interacted
+/// with B (or B itself) was either used in order to extract the variable A, or
+/// was used as storage of variable A.
+///
+/// Sets may also have attributes (as noted above). These attributes are
+/// generally used for noting whether a variable in the set has interacted with
+/// a variable whose origins we don't quite know (i.e. globals/arguments), or if
+/// the variable may have had operations performed on it (modified in a function
+/// call). All attributes that exist in a set A must exist in all sets marked as
+/// below set A.
template <typename T> class StratifiedSets {
public:
- StratifiedSets() {}
-
- StratifiedSets(DenseMap<T, StratifiedInfo> Map,
- std::vector<StratifiedLink> Links)
- : Values(std::move(Map)), Links(std::move(Links)) {}
+ StratifiedSets() = default;
- StratifiedSets(StratifiedSets<T> &&Other) { *this = std::move(Other); }
+ // TODO: Figure out how to make MSVC not call the copy ctor here, and delete
+ // it.
- StratifiedSets &operator=(StratifiedSets<T> &&Other) {
+ // Can't default these due to compile errors in MSVC2013
+ StratifiedSets(StratifiedSets &&Other) { *this = std::move(Other); }
+ StratifiedSets &operator=(StratifiedSets &&Other) {
Values = std::move(Other.Values);
Links = std::move(Other.Links);
return *this;
}
+ StratifiedSets(DenseMap<T, StratifiedInfo> Map,
+ std::vector<StratifiedLink> Links)
+ : Values(std::move(Map)), Links(std::move(Links)) {}
+
Optional<StratifiedInfo> find(const T &Elem) const {
auto Iter = Values.find(Elem);
- if (Iter == Values.end()) {
- return NoneType();
- }
+ if (Iter == Values.end())
+ return None;
return Iter->second;
}
@@ -129,91 +120,70 @@ private:
bool inbounds(StratifiedIndex Idx) const { return Idx < Links.size(); }
};
-// \brief Generic Builder class that produces StratifiedSets instances.
-//
-// The goal of this builder is to efficiently produce correct StratifiedSets
-// instances. To this end, we use a few tricks:
-// > Set chains (A method for linking sets together)
-// > Set remaps (A method for marking a set as an alias [irony?] of another)
-//
-// ==== Set chains ====
-// This builder has a notion of some value A being above, below, or with some
-// other value B:
-// > The `A above B` relationship implies that there is a reference edge going
-// from A to B. Namely, it notes that A can store anything in B's set.
-// > The `A below B` relationship is the opposite of `A above B`. It implies
-// that there's a dereference edge going from A to B.
-// > The `A with B` relationship states that there's an assignment edge going
-// from A to B, and that A and B should be treated as equals.
-//
-// As an example, take the following code snippet:
-//
-// %a = alloca i32, align 4
-// %ap = alloca i32*, align 8
-// %app = alloca i32**, align 8
-// store %a, %ap
-// store %ap, %app
-// %aw = getelementptr %ap, 0
-//
-// Given this, the follow relations exist:
-// - %a below %ap & %ap above %a
-// - %ap below %app & %app above %ap
-// - %aw with %ap & %ap with %aw
-//
-// These relations produce the following sets:
-// [{%a}, {%ap, %aw}, {%app}]
-//
-// ...Which states that the only MayAlias relationship in the above program is
-// between %ap and %aw.
-//
-// Life gets more complicated when we actually have logic in our programs. So,
-// we either must remove this logic from our programs, or make consessions for
-// it in our AA algorithms. In this case, we have decided to select the latter
-// option.
-//
-// First complication: Conditionals
-// Motivation:
-// %ad = alloca int, align 4
-// %a = alloca int*, align 8
-// %b = alloca int*, align 8
-// %bp = alloca int**, align 8
-// %c = call i1 @SomeFunc()
-// %k = select %c, %ad, %bp
-// store %ad, %a
-// store %b, %bp
-//
-// %k has 'with' edges to both %a and %b, which ordinarily would not be linked
-// together. So, we merge the set that contains %a with the set that contains
-// %b. We then recursively merge the set above %a with the set above %b, and
-// the set below %a with the set below %b, etc. Ultimately, the sets for this
-// program would end up like: {%ad}, {%a, %b, %k}, {%bp}, where {%ad} is below
-// {%a, %b, %c} is below {%ad}.
-//
-// Second complication: Arbitrary casts
-// Motivation:
-// %ip = alloca int*, align 8
-// %ipp = alloca int**, align 8
-// %i = bitcast ipp to int
-// store %ip, %ipp
-// store %i, %ip
-//
-// This is impossible to construct with any of the rules above, because a set
-// containing both {%i, %ipp} is supposed to exist, the set with %i is supposed
-// to be below the set with %ip, and the set with %ip is supposed to be below
-// the set with %ipp. Because we don't allow circular relationships like this,
-// we merge all concerned sets into one. So, the above code would generate a
-// single StratifiedSet: {%ip, %ipp, %i}.
-//
-// ==== Set remaps ====
-// More of an implementation detail than anything -- when merging sets, we need
-// to update the numbers of all of the elements mapped to those sets. Rather
-// than doing this at each merge, we note in the BuilderLink structure that a
-// remap has occurred, and use this information so we can defer renumbering set
-// elements until build time.
+/// Generic Builder class that produces StratifiedSets instances.
+///
+/// The goal of this builder is to efficiently produce correct StratifiedSets
+/// instances. To this end, we use a few tricks:
+/// > Set chains (A method for linking sets together)
+/// > Set remaps (A method for marking a set as an alias [irony?] of another)
+///
+/// ==== Set chains ====
+/// This builder has a notion of some value A being above, below, or with some
+/// other value B:
+/// > The `A above B` relationship implies that there is a reference edge
+/// going from A to B. Namely, it notes that A can store anything in B's set.
+/// > The `A below B` relationship is the opposite of `A above B`. It implies
+/// that there's a dereference edge going from A to B.
+/// > The `A with B` relationship states that there's an assignment edge going
+/// from A to B, and that A and B should be treated as equals.
+///
+/// As an example, take the following code snippet:
+///
+/// %a = alloca i32, align 4
+/// %ap = alloca i32*, align 8
+/// %app = alloca i32**, align 8
+/// store %a, %ap
+/// store %ap, %app
+/// %aw = getelementptr %ap, i32 0
+///
+/// Given this, the following relations exist:
+/// - %a below %ap & %ap above %a
+/// - %ap below %app & %app above %ap
+/// - %aw with %ap & %ap with %aw
+///
+/// These relations produce the following sets:
+/// [{%a}, {%ap, %aw}, {%app}]
+///
+/// ...Which state that the only MayAlias relationship in the above program is
+/// between %ap and %aw.
+///
+/// Because LLVM allows arbitrary casts, code like the following needs to be
+/// supported:
+/// %ip = alloca i64, align 8
+/// %ipp = alloca i64*, align 8
+/// %i = bitcast i64** %ipp to i64
+/// store i64* %ip, i64** %ipp
+/// store i64 %i, i64* %ip
+///
+/// Which, because %ipp ends up *both* above and below %ip, is fun.
+///
+/// This is solved by merging %i and %ipp into a single set (...which is the
+/// only way to solve this, since their bit patterns are equivalent). Any sets
+/// that ended up in between %i and %ipp at the time of merging (in this case,
+/// the set containing %ip) also get conservatively merged into the set of %i
+/// and %ipp. In short, the resulting StratifiedSet from the above code would be
+/// {%ip, %ipp, %i}.
+///
+/// ==== Set remaps ====
+/// More of an implementation detail than anything -- when merging sets, we need
+/// to update the numbers of all of the elements mapped to those sets. Rather
+/// than doing this at each merge, we note in the BuilderLink structure that a
+/// remap has occurred, and use this information so we can defer renumbering set
+/// elements until build time.
template <typename T> class StratifiedSetsBuilder {
- // \brief Represents a Stratified Set, with information about the Stratified
- // Set above it, the set below it, and whether the current set has been
- // remapped to another.
+ /// \brief Represents a Stratified Set, with information about the Stratified
+ /// Set above it, the set below it, and whether the current set has been
+ /// remapped to another.
struct BuilderLink {
const StratifiedIndex Number;
@@ -263,25 +233,19 @@ template <typename T> class StratifiedSetsBuilder {
return Link.Above;
}
- StratifiedAttrs &getAttrs() {
+ AliasAttrs getAttrs() {
assert(!isRemapped());
return Link.Attrs;
}
- void setAttr(unsigned index) {
+ void setAttrs(AliasAttrs Other) {
assert(!isRemapped());
- assert(index < NumStratifiedAttrs);
- Link.Attrs.set(index);
- }
-
- void setAttrs(const StratifiedAttrs &other) {
- assert(!isRemapped());
- Link.Attrs |= other;
+ Link.Attrs |= Other;
}
bool isRemapped() const { return Remap != StratifiedLink::SetSentinel; }
- // \brief For initial remapping to another set
+ /// For initial remapping to another set
void remapTo(StratifiedIndex Other) {
assert(!isRemapped());
Remap = Other;
@@ -292,15 +256,15 @@ template <typename T> class StratifiedSetsBuilder {
return Remap;
}
- // \brief Should only be called when we're already remapped.
+ /// Should only be called when we're already remapped.
void updateRemap(StratifiedIndex Other) {
assert(isRemapped());
Remap = Other;
}
- // \brief Prefer the above functions to calling things directly on what's
- // returned from this -- they guard against unexpected calls when the
- // current BuilderLink is remapped.
+ /// Prefer the above functions to calling things directly on what's returned
+ /// from this -- they guard against unexpected calls when the current
+ /// BuilderLink is remapped.
const StratifiedLink &getLink() const { return Link; }
private:
@@ -308,15 +272,14 @@ template <typename T> class StratifiedSetsBuilder {
StratifiedIndex Remap;
};
- // \brief This function performs all of the set unioning/value renumbering
- // that we've been putting off, and generates a vector<StratifiedLink> that
- // may be placed in a StratifiedSets instance.
+ /// \brief This function performs all of the set unioning/value renumbering
+ /// that we've been putting off, and generates a vector<StratifiedLink> that
+ /// may be placed in a StratifiedSets instance.
void finalizeSets(std::vector<StratifiedLink> &StratLinks) {
DenseMap<StratifiedIndex, StratifiedIndex> Remaps;
for (auto &Link : Links) {
- if (Link.isRemapped()) {
+ if (Link.isRemapped())
continue;
- }
StratifiedIndex Number = StratLinks.size();
Remaps.insert(std::make_pair(Link.Number, Number));
@@ -348,8 +311,8 @@ template <typename T> class StratifiedSetsBuilder {
}
}
- // \brief There's a guarantee in StratifiedLink where all bits set in a
- // Link.externals will be set in all Link.externals "below" it.
+ /// \brief There's a guarantee in StratifiedLink where all bits set in a
+ /// Link.externals will be set in all Link.externals "below" it.
static void propagateAttrs(std::vector<StratifiedLink> &Links) {
const auto getHighestParentAbove = [&Links](StratifiedIndex Idx) {
const auto *Link = &Links[Idx];
@@ -363,9 +326,8 @@ template <typename T> class StratifiedSetsBuilder {
SmallSet<StratifiedIndex, 16> Visited;
for (unsigned I = 0, E = Links.size(); I < E; ++I) {
auto CurrentIndex = getHighestParentAbove(I);
- if (!Visited.insert(CurrentIndex).second) {
+ if (!Visited.insert(CurrentIndex).second)
continue;
- }
while (Links[CurrentIndex].hasBelow()) {
auto &CurrentBits = Links[CurrentIndex].Attrs;
@@ -378,8 +340,8 @@ template <typename T> class StratifiedSetsBuilder {
}
public:
- // \brief Builds a StratifiedSet from the information we've been given since
- // either construction or the prior build() call.
+ /// Builds a StratifiedSet from the information we've been given since either
+ /// construction or the prior build() call.
StratifiedSets<T> build() {
std::vector<StratifiedLink> StratLinks;
finalizeSets(StratLinks);
@@ -388,9 +350,6 @@ public:
return StratifiedSets<T>(std::move(Values), std::move(StratLinks));
}
- std::size_t size() const { return Values.size(); }
- std::size_t numSets() const { return Links.size(); }
-
bool has(const T &Elem) const { return get(Elem).hasValue(); }
bool add(const T &Main) {
@@ -401,9 +360,9 @@ public:
return addAtMerging(Main, NewIndex);
}
- // \brief Restructures the stratified sets as necessary to make "ToAdd" in a
- // set above "Main". There are some cases where this is not possible (see
- // above), so we merge them such that ToAdd and Main are in the same set.
+ /// \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+ /// set above "Main". There are some cases where this is not possible (see
+ /// above), so we merge them such that ToAdd and Main are in the same set.
bool addAbove(const T &Main, const T &ToAdd) {
assert(has(Main));
auto Index = *indexOf(Main);
@@ -414,9 +373,9 @@ public:
return addAtMerging(ToAdd, Above);
}
- // \brief Restructures the stratified sets as necessary to make "ToAdd" in a
- // set below "Main". There are some cases where this is not possible (see
- // above), so we merge them such that ToAdd and Main are in the same set.
+ /// \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+ /// set below "Main". There are some cases where this is not possible (see
+ /// above), so we merge them such that ToAdd and Main are in the same set.
bool addBelow(const T &Main, const T &ToAdd) {
assert(has(Main));
auto Index = *indexOf(Main);
@@ -433,65 +392,18 @@ public:
return addAtMerging(ToAdd, MainIndex);
}
- void noteAttribute(const T &Main, unsigned AttrNum) {
- assert(has(Main));
- assert(AttrNum < StratifiedLink::SetSentinel);
- auto *Info = *get(Main);
- auto &Link = linksAt(Info->Index);
- Link.setAttr(AttrNum);
- }
-
- void noteAttributes(const T &Main, const StratifiedAttrs &NewAttrs) {
+ void noteAttributes(const T &Main, AliasAttrs NewAttrs) {
assert(has(Main));
auto *Info = *get(Main);
auto &Link = linksAt(Info->Index);
Link.setAttrs(NewAttrs);
}
- StratifiedAttrs getAttributes(const T &Main) {
- assert(has(Main));
- auto *Info = *get(Main);
- auto *Link = &linksAt(Info->Index);
- auto Attrs = Link->getAttrs();
- while (Link->hasAbove()) {
- Link = &linksAt(Link->getAbove());
- Attrs |= Link->getAttrs();
- }
-
- return Attrs;
- }
-
- bool getAttribute(const T &Main, unsigned AttrNum) {
- assert(AttrNum < StratifiedLink::SetSentinel);
- auto Attrs = getAttributes(Main);
- return Attrs[AttrNum];
- }
-
- // \brief Gets the attributes that have been applied to the set that Main
- // belongs to. It ignores attributes in any sets above the one that Main
- // resides in.
- StratifiedAttrs getRawAttributes(const T &Main) {
- assert(has(Main));
- auto *Info = *get(Main);
- auto &Link = linksAt(Info->Index);
- return Link.getAttrs();
- }
-
- // \brief Gets an attribute from the attributes that have been applied to the
- // set that Main belongs to. It ignores attributes in any sets above the one
- // that Main resides in.
- bool getRawAttribute(const T &Main, unsigned AttrNum) {
- assert(AttrNum < StratifiedLink::SetSentinel);
- auto Attrs = getRawAttributes(Main);
- return Attrs[AttrNum];
- }
-
private:
DenseMap<T, StratifiedInfo> Values;
std::vector<BuilderLink> Links;
- // \brief Adds the given element at the given index, merging sets if
- // necessary.
+ /// Adds the given element at the given index, merging sets if necessary.
bool addAtMerging(const T &ToAdd, StratifiedIndex Index) {
StratifiedInfo Info = {Index};
auto Pair = Values.insert(std::make_pair(ToAdd, Info));
@@ -509,8 +421,8 @@ private:
return false;
}
- // \brief Gets the BuilderLink at the given index, taking set remapping into
- // account.
+ /// Gets the BuilderLink at the given index, taking set remapping into
+ /// account.
BuilderLink &linksAt(StratifiedIndex Index) {
auto *Start = &Links[Index];
if (!Start->isRemapped())
@@ -534,8 +446,8 @@ private:
return *Current;
}
- // \brief Merges two sets into one another. Assumes that these sets are not
- // already one in the same
+ /// \brief Merges two sets into one another. Assumes that these sets are not
+ /// already one and the same.
void merge(StratifiedIndex Idx1, StratifiedIndex Idx2) {
assert(inbounds(Idx1) && inbounds(Idx2));
assert(&linksAt(Idx1) != &linksAt(Idx2) &&
@@ -555,8 +467,8 @@ private:
mergeDirect(Idx1, Idx2);
}
- // \brief Merges two sets assuming that the set at `Idx1` is unreachable from
- // traversing above or below the set at `Idx2`.
+ /// \brief Merges two sets assuming that the set at `Idx1` is unreachable from
+ /// traversing above or below the set at `Idx2`.
void mergeDirect(StratifiedIndex Idx1, StratifiedIndex Idx2) {
assert(inbounds(Idx1) && inbounds(Idx2));
@@ -582,7 +494,7 @@ private:
// match `LinksFrom.Below`
// > If both have links above, deal with those next.
while (LinksInto->hasBelow() && LinksFrom->hasBelow()) {
- auto &FromAttrs = LinksFrom->getAttrs();
+ auto FromAttrs = LinksFrom->getAttrs();
LinksInto->setAttrs(FromAttrs);
// Remap needs to happen after getBelow(), but before
@@ -599,12 +511,13 @@ private:
NewBelow.setAbove(LinksInto->Number);
}
+ LinksInto->setAttrs(LinksFrom->getAttrs());
LinksFrom->remapTo(LinksInto->Number);
}
- // \brief Checks to see if lowerIndex is at a level lower than upperIndex.
- // If so, it will merge lowerIndex with upperIndex (and all of the sets
- // between) and return true. Otherwise, it will return false.
+ /// Checks to see if lowerIndex is at a level lower than upperIndex. If so, it
+ /// will merge lowerIndex with upperIndex (and all of the sets between) and
+ /// return true. Otherwise, it will return false.
bool tryMergeUpwards(StratifiedIndex LowerIndex, StratifiedIndex UpperIndex) {
assert(inbounds(LowerIndex) && inbounds(UpperIndex));
auto *Lower = &linksAt(LowerIndex);
@@ -644,21 +557,21 @@ private:
Optional<const StratifiedInfo *> get(const T &Val) const {
auto Result = Values.find(Val);
if (Result == Values.end())
- return NoneType();
+ return None;
return &Result->second;
}
Optional<StratifiedInfo *> get(const T &Val) {
auto Result = Values.find(Val);
if (Result == Values.end())
- return NoneType();
+ return None;
return &Result->second;
}
Optional<StratifiedIndex> indexOf(const T &Val) {
auto MaybeVal = get(Val);
if (!MaybeVal.hasValue())
- return NoneType();
+ return None;
auto *Info = *MaybeVal;
auto &Link = linksAt(Info->Index);
return Link.Number;
@@ -689,4 +602,5 @@ private:
bool inbounds(StratifiedIndex N) const { return N < Links.size(); }
};
}
+}
#endif // LLVM_ADT_STRATIFIEDSETS_H
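For context, a hedged sketch of how CFL-AA-style code drives the builder after this change (not part of the header); `A`, `AP`, and `APP` are hypothetical Value*s mirroring the %a/%ap/%app example above, and `Attrs` an AliasAttrs value obtained elsewhere:

  cflaa::StratifiedSetsBuilder<Value *> Builder;
  Builder.add(APP);                  // {%app}
  Builder.addBelow(APP, AP);         // put %ap in a set below %app
  Builder.addBelow(AP, A);           // put %a in a set below %ap
  Builder.noteAttributes(A, Attrs);  // attach AliasAttrs to %a's set
  cflaa::StratifiedSets<Value *> Sets = Builder.build();
  if (auto Info = Sets.find(A))
    (void)Info->Index;               // index of the set containing %a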
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index ce3881925627..93d537ad3abb 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -65,14 +65,18 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::ldexp);
TLI.setUnavailable(LibFunc::ldexpf);
TLI.setUnavailable(LibFunc::ldexpl);
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc::log10);
+ TLI.setUnavailable(LibFunc::log10f);
+ TLI.setUnavailable(LibFunc::log10l);
}
// There are no library implementations of memcpy and memset for AMD GPUs and
// these can be difficult to lower in the backend.
if (T.getArch() == Triple::r600 ||
- T.getArch() == Triple::amdgcn ||
- T.getArch() == Triple::wasm32 ||
- T.getArch() == Triple::wasm64) {
+ T.getArch() == Triple::amdgcn) {
TLI.setUnavailable(LibFunc::memcpy);
TLI.setUnavailable(LibFunc::memset);
TLI.setUnavailable(LibFunc::memset_pattern16);
@@ -207,6 +211,8 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::fmaxf);
TLI.setUnavailable(LibFunc::fmodf);
TLI.setUnavailable(LibFunc::logf);
+ TLI.setUnavailable(LibFunc::log10f);
+ TLI.setUnavailable(LibFunc::modff);
TLI.setUnavailable(LibFunc::powf);
TLI.setUnavailable(LibFunc::sinf);
TLI.setUnavailable(LibFunc::sinhf);
@@ -387,6 +393,24 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::tmpfile64);
}
+ // As currently implemented in clang, NVPTX code has no standard library to
+ // speak of. Headers provide a standard-ish library implementation, but many
+ // of the signatures are wrong -- for example, many libm functions are not
+ // extern "C".
+ //
+ // libdevice, an IR library provided by nvidia, is linked in by the front-end,
+ // but only used functions are provided to llvm. Moreover, most of the
+ // functions in libdevice don't map precisely to standard library functions.
+ //
+ // FIXME: Having no standard library prevents e.g. many fastmath
+ // optimizations, so this situation should be fixed.
+ if (T.isNVPTX()) {
+ TLI.disableAllFunctions();
+ TLI.setAvailable(LibFunc::nvvm_reflect);
+ } else {
+ TLI.setUnavailable(LibFunc::nvvm_reflect);
+ }
+
TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary);
}
@@ -444,7 +468,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
}
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
- LibFunc::Func &F) const {
+ LibFunc::Func &F) const {
const char *const *Start = &StandardNames[0];
const char *const *End = &StandardNames[LibFunc::NumLibFuncs];
@@ -463,6 +487,518 @@ bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
return false;
}
+bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
+ LibFunc::Func F,
+ const DataLayout *DL) const {
+ LLVMContext &Ctx = FTy.getContext();
+ Type *PCharTy = Type::getInt8PtrTy(Ctx);
+ Type *SizeTTy = DL ? DL->getIntPtrType(Ctx, /*AS=*/0) : nullptr;
+ auto IsSizeTTy = [SizeTTy](Type *Ty) {
+ return SizeTTy ? Ty == SizeTTy : Ty->isIntegerTy();
+ };
+ unsigned NumParams = FTy.getNumParams();
+
+ switch (F) {
+ case LibFunc::strlen:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy());
+
+ case LibFunc::strchr:
+ case LibFunc::strrchr:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getReturnType() &&
+ FTy.getParamType(1)->isIntegerTy());
+
+ case LibFunc::strtol:
+ case LibFunc::strtod:
+ case LibFunc::strtof:
+ case LibFunc::strtoul:
+ case LibFunc::strtoll:
+ case LibFunc::strtold:
+ case LibFunc::strtoull:
+ return ((NumParams == 2 || NumParams == 3) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::strcat:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getReturnType() &&
+ FTy.getParamType(1) == FTy.getReturnType());
+
+ case LibFunc::strncat:
+ return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getReturnType() &&
+ FTy.getParamType(1) == FTy.getReturnType() &&
+ FTy.getParamType(2)->isIntegerTy());
+
+ case LibFunc::strcpy_chk:
+ case LibFunc::stpcpy_chk:
+ --NumParams;
+ if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ return false;
+ // fallthrough
+ case LibFunc::strcpy:
+ case LibFunc::stpcpy:
+ return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(0) == FTy.getParamType(1) &&
+ FTy.getParamType(0) == PCharTy);
+
+ case LibFunc::strncpy_chk:
+ case LibFunc::stpncpy_chk:
+ --NumParams;
+ if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ return false;
+ // fallthrough
+ case LibFunc::strncpy:
+ case LibFunc::stpncpy:
+ return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(0) == FTy.getParamType(1) &&
+ FTy.getParamType(0) == PCharTy &&
+ FTy.getParamType(2)->isIntegerTy());
+
+ case LibFunc::strxfrm:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+
+ case LibFunc::strcmp:
+ return (NumParams == 2 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getParamType(1));
+
+ case LibFunc::strncmp:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getParamType(1) &&
+ FTy.getParamType(2)->isIntegerTy());
+
+ case LibFunc::strspn:
+ case LibFunc::strcspn:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(0) == FTy.getParamType(1) &&
+ FTy.getReturnType()->isIntegerTy());
+
+ case LibFunc::strcoll:
+ case LibFunc::strcasecmp:
+ case LibFunc::strncasecmp:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+
+ case LibFunc::strstr:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+
+ case LibFunc::strpbrk:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(0) == FTy.getParamType(1));
+
+ case LibFunc::strtok:
+ case LibFunc::strtok_r:
+ return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::scanf:
+ case LibFunc::setbuf:
+ case LibFunc::setvbuf:
+ return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::strdup:
+ case LibFunc::strndup:
+ return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy());
+ case LibFunc::sscanf:
+ case LibFunc::stat:
+ case LibFunc::statvfs:
+ case LibFunc::sprintf:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::snprintf:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::setitimer:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::system:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::malloc:
+ return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
+ case LibFunc::memcmp:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy(32));
+
+ case LibFunc::memchr:
+ case LibFunc::memrchr:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isIntegerTy(32) &&
+ FTy.getParamType(2)->isIntegerTy() &&
+ FTy.getReturnType()->isPointerTy());
+ case LibFunc::modf:
+ case LibFunc::modff:
+ case LibFunc::modfl:
+ return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
+
+ case LibFunc::memcpy_chk:
+ case LibFunc::memmove_chk:
+ --NumParams;
+ if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ return false;
+ // fallthrough
+ case LibFunc::memcpy:
+ case LibFunc::memmove:
+ return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ IsSizeTTy(FTy.getParamType(2)));
+
+ case LibFunc::memset_chk:
+ --NumParams;
+ if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ return false;
+ // fallthrough
+ case LibFunc::memset:
+ return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isIntegerTy() &&
+ IsSizeTTy(FTy.getParamType(2)));
+
+ case LibFunc::memccpy:
+ return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::memalign:
+ return (FTy.getReturnType()->isPointerTy());
+ case LibFunc::realloc:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getReturnType()->isPointerTy());
+ case LibFunc::read:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::rewind:
+ case LibFunc::rmdir:
+ case LibFunc::remove:
+ case LibFunc::realpath:
+ return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::rename:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::readlink:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::write:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::bcopy:
+ case LibFunc::bcmp:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::bzero:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::calloc:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
+
+ case LibFunc::atof:
+ case LibFunc::atoi:
+ case LibFunc::atol:
+ case LibFunc::atoll:
+ case LibFunc::ferror:
+ case LibFunc::getenv:
+ case LibFunc::getpwnam:
+ case LibFunc::pclose:
+ case LibFunc::perror:
+ case LibFunc::printf:
+ case LibFunc::puts:
+ case LibFunc::uname:
+ case LibFunc::under_IO_getc:
+ case LibFunc::unlink:
+ case LibFunc::unsetenv:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc::chmod:
+ case LibFunc::chown:
+ case LibFunc::clearerr:
+ case LibFunc::closedir:
+ case LibFunc::ctermid:
+ case LibFunc::fclose:
+ case LibFunc::feof:
+ case LibFunc::fflush:
+ case LibFunc::fgetc:
+ case LibFunc::fileno:
+ case LibFunc::flockfile:
+ case LibFunc::free:
+ case LibFunc::fseek:
+ case LibFunc::fseeko64:
+ case LibFunc::fseeko:
+ case LibFunc::fsetpos:
+ case LibFunc::ftell:
+ case LibFunc::ftello64:
+ case LibFunc::ftello:
+ case LibFunc::ftrylockfile:
+ case LibFunc::funlockfile:
+ case LibFunc::getc:
+ case LibFunc::getc_unlocked:
+ case LibFunc::getlogin_r:
+ case LibFunc::mkdir:
+ case LibFunc::mktime:
+ case LibFunc::times:
+ return (NumParams != 0 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc::access:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::fopen:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fdopen:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fputc:
+ case LibFunc::fstat:
+ case LibFunc::frexp:
+ case LibFunc::frexpf:
+ case LibFunc::frexpl:
+ case LibFunc::fstatvfs:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fgets:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::fread:
+ return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(3)->isPointerTy());
+ case LibFunc::fwrite:
+ return (NumParams == 4 && FTy.getReturnType()->isIntegerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isIntegerTy() &&
+ FTy.getParamType(2)->isIntegerTy() &&
+ FTy.getParamType(3)->isPointerTy());
+ case LibFunc::fputs:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fscanf:
+ case LibFunc::fprintf:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fgetpos:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::gets:
+ case LibFunc::getchar:
+ case LibFunc::getitimer:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::ungetc:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::utime:
+ case LibFunc::utimes:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::putc:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::pread:
+ case LibFunc::pwrite:
+ return (NumParams == 4 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::popen:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::vscanf:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::vsscanf:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::vfscanf:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::valloc:
+ return (FTy.getReturnType()->isPointerTy());
+ case LibFunc::vprintf:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::vfprintf:
+ case LibFunc::vsprintf:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::vsnprintf:
+ return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+ case LibFunc::open:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::opendir:
+ return (NumParams == 1 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy());
+ case LibFunc::tmpfile:
+ return (FTy.getReturnType()->isPointerTy());
+ case LibFunc::htonl:
+ case LibFunc::htons:
+ case LibFunc::ntohl:
+ case LibFunc::ntohs:
+ case LibFunc::lstat:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::lchown:
+ return (NumParams == 3 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::qsort:
+ return (NumParams == 4 && FTy.getParamType(3)->isPointerTy());
+ case LibFunc::dunder_strdup:
+ case LibFunc::dunder_strndup:
+ return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy());
+ case LibFunc::dunder_strtok_r:
+ return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::under_IO_putc:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::dunder_isoc99_scanf:
+ return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::stat64:
+ case LibFunc::lstat64:
+ case LibFunc::statvfs64:
+ return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::dunder_isoc99_sscanf:
+ return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::fopen64:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+ case LibFunc::tmpfile64:
+ return (FTy.getReturnType()->isPointerTy());
+ case LibFunc::fstat64:
+ case LibFunc::fstatvfs64:
+ return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc::open64:
+ return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
+ case LibFunc::gettimeofday:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
+
+ case LibFunc::Znwj: // new(unsigned int);
+ case LibFunc::Znwm: // new(unsigned long);
+ case LibFunc::Znaj: // new[](unsigned int);
+ case LibFunc::Znam: // new[](unsigned long);
+ case LibFunc::msvc_new_int: // new(unsigned int);
+ case LibFunc::msvc_new_longlong: // new(unsigned long long);
+ case LibFunc::msvc_new_array_int: // new[](unsigned int);
+ case LibFunc::msvc_new_array_longlong: // new[](unsigned long long);
+ return (NumParams == 1);
+
+ case LibFunc::memset_pattern16:
+ return (!FTy.isVarArg() && NumParams == 3 &&
+ isa<PointerType>(FTy.getParamType(0)) &&
+ isa<PointerType>(FTy.getParamType(1)) &&
+ isa<IntegerType>(FTy.getParamType(2)));
+
+ // int __nvvm_reflect(const char *);
+ case LibFunc::nvvm_reflect:
+ return (NumParams == 1 && isa<PointerType>(FTy.getParamType(0)));
+
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ case LibFunc::tan:
+ case LibFunc::tanf:
+ case LibFunc::tanl:
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ return (NumParams == 1 && FTy.getReturnType()->isFloatingPointTy() &&
+ FTy.getReturnType() == FTy.getParamType(0));
+
+ case LibFunc::fmin:
+ case LibFunc::fminf:
+ case LibFunc::fminl:
+ case LibFunc::fmax:
+ case LibFunc::fmaxf:
+ case LibFunc::fmaxl:
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getReturnType() == FTy.getParamType(1));
+
+ case LibFunc::ffs:
+ case LibFunc::ffsl:
+ case LibFunc::ffsll:
+ case LibFunc::isdigit:
+ case LibFunc::isascii:
+ case LibFunc::toascii:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isIntegerTy());
+
+ case LibFunc::fls:
+ case LibFunc::flsl:
+ case LibFunc::flsll:
+ case LibFunc::abs:
+ case LibFunc::labs:
+ case LibFunc::llabs:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy() &&
+ FTy.getReturnType() == FTy.getParamType(0));
+
+ case LibFunc::cxa_atexit:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isPointerTy());
+
+ case LibFunc::sinpi:
+ case LibFunc::cospi:
+ return (NumParams == 1 && FTy.getReturnType()->isDoubleTy() &&
+ FTy.getReturnType() == FTy.getParamType(0));
+
+ case LibFunc::sinpif:
+ case LibFunc::cospif:
+ return (NumParams == 1 && FTy.getReturnType()->isFloatTy() &&
+ FTy.getReturnType() == FTy.getParamType(0));
+
+ default:
+ // Assume the other functions are correct.
+ // FIXME: It'd be really nice to cover them all.
+ return true;
+ }
+}
+
+bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
+ LibFunc::Func &F) const {
+ const DataLayout *DL =
+ FDecl.getParent() ? &FDecl.getParent()->getDataLayout() : nullptr;
+ return getLibFunc(FDecl.getName(), F) &&
+ isValidProtoForLibFunc(*FDecl.getFunctionType(), F, DL);
+}
+
void TargetLibraryInfoImpl::disableAllFunctions() {
memset(AvailableArray, 0, sizeof(AvailableArray));
}
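
The additions above make the Function-based getLibFunc reject declarations whose prototype does not match the library function they are named after. A minimal caller-side sketch of the intended use, not part of the patch: the helper name is hypothetical, and it assumes the TargetLibraryInfo wrapper forwards the new overload to the Impl, as it does for the existing StringRef overload.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch only: recognize a call to a real, correctly prototyped memcpy.
static bool isValidatedMemcpyCall(const CallInst &CI,
                                  const TargetLibraryInfo &TLI) {
  Function *Callee = CI.getCalledFunction();
  if (!Callee)
    return false;
  LibFunc::Func F;
  // The Function overload succeeds only if the name matches *and*
  // isValidProtoForLibFunc accepts the callee's signature.
  return TLI.getLibFunc(*Callee, F) && F == LibFunc::memcpy && TLI.has(F);
}
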
@@ -583,14 +1119,16 @@ StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F,
return I->ScalarFnName;
}
-TargetLibraryInfo TargetLibraryAnalysis::run(Module &M) {
+TargetLibraryInfo TargetLibraryAnalysis::run(Module &M,
+ ModuleAnalysisManager &) {
if (PresetInfoImpl)
return TargetLibraryInfo(*PresetInfoImpl);
return TargetLibraryInfo(lookupInfoImpl(Triple(M.getTargetTriple())));
}
-TargetLibraryInfo TargetLibraryAnalysis::run(Function &F) {
+TargetLibraryInfo TargetLibraryAnalysis::run(Function &F,
+ FunctionAnalysisManager &) {
if (PresetInfoImpl)
return TargetLibraryInfo(*PresetInfoImpl);
@@ -598,7 +1136,7 @@ TargetLibraryInfo TargetLibraryAnalysis::run(Function &F) {
lookupInfoImpl(Triple(F.getParent()->getTargetTriple())));
}
-TargetLibraryInfoImpl &TargetLibraryAnalysis::lookupInfoImpl(Triple T) {
+TargetLibraryInfoImpl &TargetLibraryAnalysis::lookupInfoImpl(const Triple &T) {
std::unique_ptr<TargetLibraryInfoImpl> &Impl =
Impls[T.normalize()];
if (!Impl)
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 9c1d3fd4f582..52013f796c56 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/ErrorHandling.h"
+#include <utility>
using namespace llvm;
@@ -66,6 +67,15 @@ int TargetTransformInfo::getCallCost(const Function *F,
return Cost;
}
+unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
+ return TTIImpl->getInliningThresholdMultiplier();
+}
+
+int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
+ ArrayRef<const Value *> Operands) const {
+ return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
+}
+
int TargetTransformInfo::getIntrinsicCost(
Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
@@ -172,6 +182,18 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
return TTIImpl->enableInterleavedAccessVectorization();
}
+bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
+ return TTIImpl->isFPVectorizationPotentiallyUnsafe();
+}
+
+bool TargetTransformInfo::allowsMisalignedMemoryAccesses(unsigned BitWidth,
+ unsigned AddressSpace,
+ unsigned Alignment,
+ bool *Fast) const {
+ return TTIImpl->allowsMisalignedMemoryAccesses(BitWidth, AddressSpace,
+ Alignment, Fast);
+}
+
TargetTransformInfo::PopcntSupportKind
TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
return TTIImpl->getPopcntSupport(IntTyWidthInBit);
@@ -187,6 +209,14 @@ int TargetTransformInfo::getFPOpCost(Type *Ty) const {
return Cost;
}
+int TargetTransformInfo::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm,
+ Type *Ty) const {
+ int Cost = TTIImpl->getIntImmCodeSizeCost(Opcode, Idx, Imm, Ty);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
int TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
int Cost = TTIImpl->getIntImmCost(Imm, Ty);
assert(Cost >= 0 && "TTI should not produce negative costs!");
@@ -215,6 +245,26 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+unsigned TargetTransformInfo::getLoadStoreVecRegBitWidth(unsigned AS) const {
+ return TTIImpl->getLoadStoreVecRegBitWidth(AS);
+}
+
+unsigned TargetTransformInfo::getCacheLineSize() const {
+ return TTIImpl->getCacheLineSize();
+}
+
+unsigned TargetTransformInfo::getPrefetchDistance() const {
+ return TTIImpl->getPrefetchDistance();
+}
+
+unsigned TargetTransformInfo::getMinPrefetchStride() const {
+ return TTIImpl->getMinPrefetchStride();
+}
+
+unsigned TargetTransformInfo::getMaxPrefetchIterationsAhead() const {
+ return TTIImpl->getMaxPrefetchIterationsAhead();
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
@@ -243,6 +293,14 @@ int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
return Cost;
}
+int TargetTransformInfo::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+ VectorType *VecTy,
+ unsigned Index) const {
+ int Cost = TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
int Cost = TTIImpl->getCFInstrCost(Opcode);
assert(Cost >= 0 && "TTI should not produce negative costs!");
@@ -299,15 +357,17 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys);
+ ArrayRef<Type *> Tys,
+ FastMathFlags FMF) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args);
+ ArrayRef<Value *> Args,
+ FastMathFlags FMF) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
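
The two wrappers above now thread FastMathFlags down to the target's intrinsic cost hooks. A hedged caller-side sketch, not part of the patch; the helper is hypothetical and simply assumes fully relaxed FP to illustrate the extra parameter.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

// Sketch: cost a scalar llvm.sqrt call under unsafe-algebra fast-math flags.
static int costOfSqrtCall(const TargetTransformInfo &TTI, CallInst *CI) {
  SmallVector<Value *, 1> Args(CI->arg_operands().begin(),
                               CI->arg_operands().end());
  FastMathFlags FMF;
  FMF.setUnsafeAlgebra(); // assumption: query the fully relaxed form
  return TTI.getIntrinsicInstrCost(Intrinsic::sqrt, CI->getType(), Args, FMF);
}
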
@@ -363,9 +423,10 @@ TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
TargetIRAnalysis::TargetIRAnalysis(
std::function<Result(const Function &)> TTICallback)
- : TTICallback(TTICallback) {}
+ : TTICallback(std::move(TTICallback)) {}
-TargetIRAnalysis::Result TargetIRAnalysis::run(const Function &F) {
+TargetIRAnalysis::Result TargetIRAnalysis::run(const Function &F,
+ AnalysisManager<Function> &) {
return TTICallback(F);
}
@@ -396,7 +457,8 @@ TargetTransformInfoWrapperPass::TargetTransformInfoWrapperPass(
}
TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(const Function &F) {
- TTI = TIRA.run(F);
+ AnalysisManager<Function> DummyFAM;
+ TTI = TIRA.run(F, DummyFAM);
return *TTI;
}
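
With the run() methods above taking an AnalysisManager, new-pass-manager clients obtain TTI through a FunctionAnalysisManager instead of calling run() directly. A minimal sketch, not part of the patch; the wrapper function is hypothetical.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Sketch: register TargetIRAnalysis and query one TTI hook for a function.
static unsigned maxInterleaveFor(Function &F, unsigned VF) {
  FunctionAnalysisManager FAM;
  FAM.registerPass([] { return TargetIRAnalysis(); });
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  return TTI.getMaxInterleaveFactor(VF);
}
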
diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp
index 5a1acc00fb94..c7e2c0f3412a 100644
--- a/lib/Analysis/Trace.cpp
+++ b/lib/Analysis/Trace.cpp
@@ -46,7 +46,7 @@ void Trace::print(raw_ostream &O) const {
/// dump - Debugger convenience method; writes trace to standard error
/// output stream.
///
-void Trace::dump() const {
+LLVM_DUMP_METHOD void Trace::dump() const {
print(dbgs());
}
#endif
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 9f923913ca27..20d162a03c30 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -122,7 +122,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
@@ -584,18 +583,15 @@ bool TypeBasedAAResult::PathAliases(const MDNode *A, const MDNode *B) const {
return false;
}
-TypeBasedAAResult TypeBasedAA::run(Function &F, AnalysisManager<Function> *AM) {
- return TypeBasedAAResult(AM->getResult<TargetLibraryAnalysis>(F));
-}
-
char TypeBasedAA::PassID;
+TypeBasedAAResult TypeBasedAA::run(Function &F, AnalysisManager<Function> &AM) {
+ return TypeBasedAAResult();
+}
+
char TypeBasedAAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(TypeBasedAAWrapperPass, "tbaa",
- "Type-Based Alias Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(TypeBasedAAWrapperPass, "tbaa", "Type-Based Alias Analysis",
- false, true)
+INITIALIZE_PASS(TypeBasedAAWrapperPass, "tbaa", "Type-Based Alias Analysis",
+ false, true)
ImmutablePass *llvm::createTypeBasedAAWrapperPass() {
return new TypeBasedAAWrapperPass();
@@ -606,8 +602,7 @@ TypeBasedAAWrapperPass::TypeBasedAAWrapperPass() : ImmutablePass(ID) {
}
bool TypeBasedAAWrapperPass::doInitialization(Module &M) {
- Result.reset(new TypeBasedAAResult(
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()));
+ Result.reset(new TypeBasedAAResult());
return false;
}
@@ -618,5 +613,4 @@ bool TypeBasedAAWrapperPass::doFinalization(Module &M) {
void TypeBasedAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
}
diff --git a/lib/Analysis/TypeMetadataUtils.cpp b/lib/Analysis/TypeMetadataUtils.cpp
new file mode 100644
index 000000000000..31e2b42075d6
--- /dev/null
+++ b/lib/Analysis/TypeMetadataUtils.cpp
@@ -0,0 +1,118 @@
+//===- TypeMetadataUtils.cpp - Utilities related to type metadata ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions that make it easier to manipulate type metadata
+// for devirtualization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+// Search for virtual calls that call FPtr and add them to DevirtCalls.
+static void
+findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
+ bool *HasNonCallUses, Value *FPtr, uint64_t Offset) {
+ for (const Use &U : FPtr->uses()) {
+ Value *User = U.getUser();
+ if (isa<BitCastInst>(User)) {
+ findCallsAtConstantOffset(DevirtCalls, HasNonCallUses, User, Offset);
+ } else if (auto CI = dyn_cast<CallInst>(User)) {
+ DevirtCalls.push_back({Offset, CI});
+ } else if (auto II = dyn_cast<InvokeInst>(User)) {
+ DevirtCalls.push_back({Offset, II});
+ } else if (HasNonCallUses) {
+ *HasNonCallUses = true;
+ }
+ }
+}
+
+// Search for virtual calls that load from VPtr and add them to DevirtCalls.
+static void
+findLoadCallsAtConstantOffset(Module *M,
+ SmallVectorImpl<DevirtCallSite> &DevirtCalls,
+ Value *VPtr, int64_t Offset) {
+ for (const Use &U : VPtr->uses()) {
+ Value *User = U.getUser();
+ if (isa<BitCastInst>(User)) {
+ findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset);
+ } else if (isa<LoadInst>(User)) {
+ findCallsAtConstantOffset(DevirtCalls, nullptr, User, Offset);
+ } else if (auto GEP = dyn_cast<GetElementPtrInst>(User)) {
+ // Take into account the GEP offset.
+ if (VPtr == GEP->getPointerOperand() && GEP->hasAllConstantIndices()) {
+ SmallVector<Value *, 8> Indices(GEP->op_begin() + 1, GEP->op_end());
+ int64_t GEPOffset = M->getDataLayout().getIndexedOffsetInType(
+ GEP->getSourceElementType(), Indices);
+ findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset + GEPOffset);
+ }
+ }
+ }
+}
+
+void llvm::findDevirtualizableCallsForTypeTest(
+ SmallVectorImpl<DevirtCallSite> &DevirtCalls,
+ SmallVectorImpl<CallInst *> &Assumes, CallInst *CI) {
+ assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test);
+
+ Module *M = CI->getParent()->getParent()->getParent();
+
+ // Find llvm.assume intrinsics for this llvm.type.test call.
+ for (const Use &CIU : CI->uses()) {
+ auto AssumeCI = dyn_cast<CallInst>(CIU.getUser());
+ if (AssumeCI) {
+ Function *F = AssumeCI->getCalledFunction();
+ if (F && F->getIntrinsicID() == Intrinsic::assume)
+ Assumes.push_back(AssumeCI);
+ }
+ }
+
+  // If we found any, search for virtual calls based on the type test's pointer
+  // operand and add them to DevirtCalls.
+ if (!Assumes.empty())
+ findLoadCallsAtConstantOffset(M, DevirtCalls,
+ CI->getArgOperand(0)->stripPointerCasts(), 0);
+}
+
+void llvm::findDevirtualizableCallsForTypeCheckedLoad(
+ SmallVectorImpl<DevirtCallSite> &DevirtCalls,
+ SmallVectorImpl<Instruction *> &LoadedPtrs,
+ SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses, CallInst *CI) {
+ assert(CI->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::type_checked_load);
+
+ auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!Offset) {
+ HasNonCallUses = true;
+ return;
+ }
+
+ for (Use &U : CI->uses()) {
+ auto CIU = U.getUser();
+ if (auto EVI = dyn_cast<ExtractValueInst>(CIU)) {
+ if (EVI->getNumIndices() == 1 && EVI->getIndices()[0] == 0) {
+ LoadedPtrs.push_back(EVI);
+ continue;
+ }
+ if (EVI->getNumIndices() == 1 && EVI->getIndices()[0] == 1) {
+ Preds.push_back(EVI);
+ continue;
+ }
+ }
+ HasNonCallUses = true;
+ }
+
+ for (Value *LoadedPtr : LoadedPtrs)
+ findCallsAtConstantOffset(DevirtCalls, &HasNonCallUses, LoadedPtr,
+ Offset->getZExtValue());
+}
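
A hedged sketch of how a whole-program devirtualization pass might drive the new helpers; the driver below is hypothetical and not part of the patch.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Sketch: gather devirtualizable call sites behind every llvm.type.test call.
static void collectDevirtCandidates(Module &M,
                                    SmallVectorImpl<DevirtCallSite> &DevirtCalls) {
  Function *TypeTestFn = M.getFunction(Intrinsic::getName(Intrinsic::type_test));
  if (!TypeTestFn)
    return;
  for (const Use &U : TypeTestFn->uses()) {
    auto *CI = dyn_cast<CallInst>(U.getUser());
    if (!CI || CI->getCalledFunction() != TypeTestFn)
      continue;
    SmallVector<CallInst *, 4> Assumes;
    // Only type tests whose result feeds llvm.assume yield candidates.
    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
  }
}
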
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index a83e207bd265..f2b40787443a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -18,7 +18,9 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -36,40 +38,19 @@
#include "llvm/IR/Statepoint.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <array>
#include <cstring>
using namespace llvm;
using namespace llvm::PatternMatch;
const unsigned MaxDepth = 6;
-/// Enable an experimental feature to leverage information about dominating
-/// conditions to compute known bits. The individual options below control how
-/// hard we search. The defaults are chosen to be fairly aggressive. If you
-/// run into compile time problems when testing, scale them back and report
-/// your findings.
-static cl::opt<bool> EnableDomConditions("value-tracking-dom-conditions",
- cl::Hidden, cl::init(false));
-
-// This is expensive, so we only do it for the top level query value.
-// (TODO: evaluate cost vs profit, consider higher thresholds)
-static cl::opt<unsigned> DomConditionsMaxDepth("dom-conditions-max-depth",
- cl::Hidden, cl::init(1));
-
-/// How many dominating blocks should be scanned looking for dominating
-/// conditions?
-static cl::opt<unsigned> DomConditionsMaxDomBlocks("dom-conditions-dom-blocks",
- cl::Hidden,
- cl::init(20));
-
// Controls the number of uses of the value searched for possible
// dominating comparisons.
static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
cl::Hidden, cl::init(20));
-// If true, don't consider only compares whose only use is a branch.
-static cl::opt<bool> DomConditionsSingleCmpUse("dom-conditions-single-cmp-use",
- cl::Hidden, cl::init(false));
-
/// Returns the bitwidth of the given scalar or pointer type (if unknown returns
/// 0). For vector types, returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
@@ -79,34 +60,45 @@ static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
return DL.getPointerTypeSizeInBits(Ty);
}
-// Many of these functions have internal versions that take an assumption
-// exclusion set. This is because of the potential for mutual recursion to
-// cause computeKnownBits to repeatedly visit the same assume intrinsic. The
-// classic case of this is assume(x = y), which will attempt to determine
-// bits in x from bits in y, which will attempt to determine bits in y from
-// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
-// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
-// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so on.
-typedef SmallPtrSet<const Value *, 8> ExclInvsSet;
-
namespace {
// Simplifying using an assume can only be done in a particular control-flow
// context (the context instruction provides that context). If an assume and
// the context instruction are not in the same block then the DT helps in
// figuring out if we can use it.
struct Query {
- ExclInvsSet ExclInvs;
+ const DataLayout &DL;
AssumptionCache *AC;
const Instruction *CxtI;
const DominatorTree *DT;
- Query(AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr,
- const DominatorTree *DT = nullptr)
- : AC(AC), CxtI(CxtI), DT(DT) {}
+ /// Set of assumptions that should be excluded from further queries.
+ /// This is because of the potential for mutual recursion to cause
+ /// computeKnownBits to repeatedly visit the same assume intrinsic. The
+ /// classic case of this is assume(x = y), which will attempt to determine
+ /// bits in x from bits in y, which will attempt to determine bits in y from
+ /// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
+ /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
+ /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so
+ /// on.
+ std::array<const Value*, MaxDepth> Excluded;
+ unsigned NumExcluded;
+
+ Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
+ const DominatorTree *DT)
+ : DL(DL), AC(AC), CxtI(CxtI), DT(DT), NumExcluded(0) {}
Query(const Query &Q, const Value *NewExcl)
- : ExclInvs(Q.ExclInvs), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT) {
- ExclInvs.insert(NewExcl);
+ : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), NumExcluded(Q.NumExcluded) {
+ Excluded = Q.Excluded;
+ Excluded[NumExcluded++] = NewExcl;
+ assert(NumExcluded <= Excluded.size());
+ }
+
+ bool isExcluded(const Value *Value) const {
+ if (NumExcluded == 0)
+ return false;
+ auto End = Excluded.begin() + NumExcluded;
+ return std::find(Excluded.begin(), End, Value) != End;
}
};
} // end anonymous namespace
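
The doc comment above explains why assumes must be excluded from recursive queries; because each recursion level adds at most one exclusion, a fixed array of MaxDepth slots replaces the old heap-allocated set. A standalone illustration of that pattern (plain C++, not LLVM code):

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>

template <typename T, std::size_t MaxDepth> struct BoundedExclusionList {
  std::array<const T *, MaxDepth> Excluded{};
  unsigned NumExcluded = 0;

  BoundedExclusionList() = default;

  // Copy the parent query's exclusions and append one more; the count is
  // bounded by the recursion depth, so no dynamic allocation is needed.
  BoundedExclusionList(const BoundedExclusionList &Parent, const T *NewExcl)
      : Excluded(Parent.Excluded), NumExcluded(Parent.NumExcluded) {
    assert(NumExcluded < MaxDepth && "exceeded recursion bound");
    Excluded[NumExcluded++] = NewExcl;
  }

  bool isExcluded(const T *V) const {
    auto End = Excluded.begin() + NumExcluded;
    return std::find(Excluded.begin(), End, V) != End;
  }
};
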
@@ -128,15 +120,14 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
}
static void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- const DataLayout &DL, unsigned Depth,
- const Query &Q);
+ unsigned Depth, const Query &Q);
void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- ::computeKnownBits(V, KnownZero, KnownOne, DL, Depth,
- Query(AC, safeCxtI(V, CxtI), DT));
+ ::computeKnownBits(V, KnownZero, KnownOne, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT));
}
bool llvm::haveNoCommonBitsSet(Value *LHS, Value *RHS, const DataLayout &DL,
@@ -155,35 +146,33 @@ bool llvm::haveNoCommonBitsSet(Value *LHS, Value *RHS, const DataLayout &DL,
}
static void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth,
- const Query &Q);
+ unsigned Depth, const Query &Q);
void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- ::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth,
- Query(AC, safeCxtI(V, CxtI), DT));
+ ::ComputeSignBit(V, KnownZero, KnownOne, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT));
}
static bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
- const Query &Q, const DataLayout &DL);
+ const Query &Q);
bool llvm::isKnownToBeAPowerOfTwo(Value *V, const DataLayout &DL, bool OrZero,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
- Query(AC, safeCxtI(V, CxtI), DT), DL);
+ Query(DL, AC, safeCxtI(V, CxtI), DT));
}
-static bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
- const Query &Q);
+static bool isKnownNonZero(Value *V, unsigned Depth, const Query &Q);
bool llvm::isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- return ::isKnownNonZero(V, DL, Depth, Query(AC, safeCxtI(V, CxtI), DT));
+ return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
}
bool llvm::isKnownNonNegative(Value *V, const DataLayout &DL, unsigned Depth,
@@ -194,42 +183,59 @@ bool llvm::isKnownNonNegative(Value *V, const DataLayout &DL, unsigned Depth,
return NonNegative;
}
-static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL,
- const Query &Q);
+bool llvm::isKnownPositive(Value *V, const DataLayout &DL, unsigned Depth,
+ AssumptionCache *AC, const Instruction *CxtI,
+ const DominatorTree *DT) {
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ return CI->getValue().isStrictlyPositive();
+
+  // TODO: We're doing two recursive queries here. We should factor this such
+ // that only a single query is needed.
+ return isKnownNonNegative(V, DL, Depth, AC, CxtI, DT) &&
+ isKnownNonZero(V, DL, Depth, AC, CxtI, DT);
+}
+
+bool llvm::isKnownNegative(Value *V, const DataLayout &DL, unsigned Depth,
+ AssumptionCache *AC, const Instruction *CxtI,
+ const DominatorTree *DT) {
+ bool NonNegative, Negative;
+ ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
+ return Negative;
+}
+
+static bool isKnownNonEqual(Value *V1, Value *V2, const Query &Q);
bool llvm::isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- return ::isKnownNonEqual(V1, V2, DL, Query(AC,
- safeCxtI(V1, safeCxtI(V2, CxtI)),
- DT));
+ return ::isKnownNonEqual(V1, V2, Query(DL, AC,
+ safeCxtI(V1, safeCxtI(V2, CxtI)),
+ DT));
}
-static bool MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL,
- unsigned Depth, const Query &Q);
+static bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth,
+ const Query &Q);
bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI, const DominatorTree *DT) {
- return ::MaskedValueIsZero(V, Mask, DL, Depth,
- Query(AC, safeCxtI(V, CxtI), DT));
+ return ::MaskedValueIsZero(V, Mask, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT));
}
-static unsigned ComputeNumSignBits(Value *V, const DataLayout &DL,
- unsigned Depth, const Query &Q);
+static unsigned ComputeNumSignBits(Value *V, unsigned Depth, const Query &Q);
unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
- return ::ComputeNumSignBits(V, DL, Depth, Query(AC, safeCxtI(V, CxtI), DT));
+ return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
}
static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
- const DataLayout &DL, unsigned Depth,
- const Query &Q) {
+ unsigned Depth, const Query &Q) {
if (!Add) {
if (ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
// We know that the top bits of C-X are clear if X contains less bits
@@ -240,7 +246,7 @@ static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
// NLZ can't be BitWidth with no sign bit
APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
- computeKnownBits(Op1, KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
// If all of the MaskV bits are known to be zero, then we know the
// output top bits are zero, because we now know that the output is
@@ -259,8 +265,8 @@ static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
// If an initial sequence of bits in the result is not needed, the
// corresponding bits in the operands are not needed.
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, DL, Depth + 1, Q);
- computeKnownBits(Op1, KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, Depth + 1, Q);
+ computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
// Carry in a 1 for a subtract, rather than a 0.
APInt CarryIn(BitWidth, 0);
@@ -308,11 +314,10 @@ static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
- const DataLayout &DL, unsigned Depth,
- const Query &Q) {
+ unsigned Depth, const Query &Q) {
unsigned BitWidth = KnownZero.getBitWidth();
- computeKnownBits(Op1, KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(Op0, KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(Op1, KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(Op0, KnownZero2, KnownOne2, Depth + 1, Q);
bool isKnownNegative = false;
bool isKnownNonNegative = false;
@@ -333,9 +338,9 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW,
// negative or zero.
if (!isKnownNonNegative)
isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 &&
- isKnownNonZero(Op0, DL, Depth, Q)) ||
+ isKnownNonZero(Op0, Depth, Q)) ||
(isKnownNegativeOp0 && isKnownNonNegativeOp1 &&
- isKnownNonZero(Op1, DL, Depth, Q));
+ isKnownNonZero(Op1, Depth, Q));
}
}
@@ -451,7 +456,8 @@ static bool isAssumeLikeIntrinsic(const Instruction *I) {
return false;
}
-static bool isValidAssumeForContext(Value *V, const Query &Q) {
+static bool isValidAssumeForContext(Value *V, const Instruction *CxtI,
+ const DominatorTree *DT) {
Instruction *Inv = cast<Instruction>(V);
// There are two restrictions on the use of an assume:
@@ -462,43 +468,43 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
// feeding the assume is trivially true, thus causing the removal of
// the assume).
- if (Q.DT) {
- if (Q.DT->dominates(Inv, Q.CxtI)) {
+ if (DT) {
+ if (DT->dominates(Inv, CxtI)) {
return true;
- } else if (Inv->getParent() == Q.CxtI->getParent()) {
+ } else if (Inv->getParent() == CxtI->getParent()) {
// The context comes first, but they're both in the same block. Make sure
// there is nothing in between that might interrupt the control flow.
for (BasicBlock::const_iterator I =
- std::next(BasicBlock::const_iterator(Q.CxtI)),
+ std::next(BasicBlock::const_iterator(CxtI)),
IE(Inv); I != IE; ++I)
if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
return false;
- return !isEphemeralValueOf(Inv, Q.CxtI);
+ return !isEphemeralValueOf(Inv, CxtI);
}
return false;
}
// When we don't have a DT, we do a limited search...
- if (Inv->getParent() == Q.CxtI->getParent()->getSinglePredecessor()) {
+ if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) {
return true;
- } else if (Inv->getParent() == Q.CxtI->getParent()) {
+ } else if (Inv->getParent() == CxtI->getParent()) {
// Search forward from the assume until we reach the context (or the end
// of the block); the common case is that the assume will come first.
for (BasicBlock::iterator I = std::next(BasicBlock::iterator(Inv)),
IE = Inv->getParent()->end(); I != IE; ++I)
- if (&*I == Q.CxtI)
+ if (&*I == CxtI)
return true;
// The context must come first...
for (BasicBlock::const_iterator I =
- std::next(BasicBlock::const_iterator(Q.CxtI)),
+ std::next(BasicBlock::const_iterator(CxtI)),
IE(Inv); I != IE; ++I)
if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
return false;
- return !isEphemeralValueOf(Inv, Q.CxtI);
+ return !isEphemeralValueOf(Inv, CxtI);
}
return false;
@@ -507,226 +513,12 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
bool llvm::isValidAssumeForContext(const Instruction *I,
const Instruction *CxtI,
const DominatorTree *DT) {
- return ::isValidAssumeForContext(const_cast<Instruction *>(I),
- Query(nullptr, CxtI, DT));
-}
-
-template<typename LHS, typename RHS>
-inline match_combine_or<CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>,
- CmpClass_match<RHS, LHS, ICmpInst, ICmpInst::Predicate>>
-m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
- return m_CombineOr(m_ICmp(Pred, L, R), m_ICmp(Pred, R, L));
-}
-
-template<typename LHS, typename RHS>
-inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::And>,
- BinaryOp_match<RHS, LHS, Instruction::And>>
-m_c_And(const LHS &L, const RHS &R) {
- return m_CombineOr(m_And(L, R), m_And(R, L));
-}
-
-template<typename LHS, typename RHS>
-inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Or>,
- BinaryOp_match<RHS, LHS, Instruction::Or>>
-m_c_Or(const LHS &L, const RHS &R) {
- return m_CombineOr(m_Or(L, R), m_Or(R, L));
-}
-
-template<typename LHS, typename RHS>
-inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Xor>,
- BinaryOp_match<RHS, LHS, Instruction::Xor>>
-m_c_Xor(const LHS &L, const RHS &R) {
- return m_CombineOr(m_Xor(L, R), m_Xor(R, L));
-}
-
-/// Compute known bits in 'V' under the assumption that the condition 'Cmp' is
-/// true (at the context instruction.) This is mostly a utility function for
-/// the prototype dominating conditions reasoning below.
-static void computeKnownBitsFromTrueCondition(Value *V, ICmpInst *Cmp,
- APInt &KnownZero,
- APInt &KnownOne,
- const DataLayout &DL,
- unsigned Depth, const Query &Q) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- // TODO: We could potentially be more aggressive here. This would be worth
- // evaluating. If we can, explore commoning this code with the assume
- // handling logic.
- if (LHS != V && RHS != V)
- return;
-
- const unsigned BitWidth = KnownZero.getBitWidth();
-
- switch (Cmp->getPredicate()) {
- default:
- // We know nothing from this condition
- break;
- // TODO: implement unsigned bound from below (known one bits)
- // TODO: common condition check implementations with assumes
- // TODO: implement other patterns from assume (e.g. V & B == A)
- case ICmpInst::ICMP_SGT:
- if (LHS == V) {
- APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0);
- computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, DL, Depth + 1, Q);
- if (KnownOneTemp.isAllOnesValue() || KnownZeroTemp.isNegative()) {
- // We know that the sign bit is zero.
- KnownZero |= APInt::getSignBit(BitWidth);
- }
- }
- break;
- case ICmpInst::ICMP_EQ:
- {
- APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0);
- if (LHS == V)
- computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, DL, Depth + 1, Q);
- else if (RHS == V)
- computeKnownBits(LHS, KnownZeroTemp, KnownOneTemp, DL, Depth + 1, Q);
- else
- llvm_unreachable("missing use?");
- KnownZero |= KnownZeroTemp;
- KnownOne |= KnownOneTemp;
- }
- break;
- case ICmpInst::ICMP_ULE:
- if (LHS == V) {
- APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0);
- computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, DL, Depth + 1, Q);
- // The known zero bits carry over
- unsigned SignBits = KnownZeroTemp.countLeadingOnes();
- KnownZero |= APInt::getHighBitsSet(BitWidth, SignBits);
- }
- break;
- case ICmpInst::ICMP_ULT:
- if (LHS == V) {
- APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0);
- computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, DL, Depth + 1, Q);
- // Whatever high bits in rhs are zero are known to be zero (if rhs is a
- // power of 2, then one more).
- unsigned SignBits = KnownZeroTemp.countLeadingOnes();
- if (isKnownToBeAPowerOfTwo(RHS, false, Depth + 1, Query(Q, Cmp), DL))
- SignBits++;
- KnownZero |= APInt::getHighBitsSet(BitWidth, SignBits);
- }
- break;
- };
-}
-
-/// Compute known bits in 'V' from conditions which are known to be true along
-/// all paths leading to the context instruction. In particular, look for
-/// cases where one branch of an interesting condition dominates the context
-/// instruction. This does not do general dataflow.
-/// NOTE: This code is EXPERIMENTAL and currently off by default.
-static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero,
- APInt &KnownOne,
- const DataLayout &DL,
- unsigned Depth,
- const Query &Q) {
- // Need both the dominator tree and the query location to do anything useful
- if (!Q.DT || !Q.CxtI)
- return;
- Instruction *Cxt = const_cast<Instruction *>(Q.CxtI);
- // The context instruction might be in a statically unreachable block. If
- // so, asking dominator queries may yield suprising results. (e.g. the block
- // may not have a dom tree node)
- if (!Q.DT->isReachableFromEntry(Cxt->getParent()))
- return;
-
- // Avoid useless work
- if (auto VI = dyn_cast<Instruction>(V))
- if (VI->getParent() == Cxt->getParent())
- return;
-
- // Note: We currently implement two options. It's not clear which of these
- // will survive long term, we need data for that.
- // Option 1 - Try walking the dominator tree looking for conditions which
- // might apply. This works well for local conditions (loop guards, etc..),
- // but not as well for things far from the context instruction (presuming a
- // low max blocks explored). If we can set an high enough limit, this would
- // be all we need.
- // Option 2 - We restrict out search to those conditions which are uses of
- // the value we're interested in. This is independent of dom structure,
- // but is slightly less powerful without looking through lots of use chains.
- // It does handle conditions far from the context instruction (e.g. early
- // function exits on entry) really well though.
-
- // Option 1 - Search the dom tree
- unsigned NumBlocksExplored = 0;
- BasicBlock *Current = Cxt->getParent();
- while (true) {
- // Stop searching if we've gone too far up the chain
- if (NumBlocksExplored >= DomConditionsMaxDomBlocks)
- break;
- NumBlocksExplored++;
-
- if (!Q.DT->getNode(Current)->getIDom())
- break;
- Current = Q.DT->getNode(Current)->getIDom()->getBlock();
- if (!Current)
- // found function entry
- break;
-
- BranchInst *BI = dyn_cast<BranchInst>(Current->getTerminator());
- if (!BI || BI->isUnconditional())
- continue;
- ICmpInst *Cmp = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cmp)
- continue;
-
- // We're looking for conditions that are guaranteed to hold at the context
- // instruction. Finding a condition where one path dominates the context
- // isn't enough because both the true and false cases could merge before
- // the context instruction we're actually interested in. Instead, we need
- // to ensure that the taken *edge* dominates the context instruction. We
- // know that the edge must be reachable since we started from a reachable
- // block.
- BasicBlock *BB0 = BI->getSuccessor(0);
- BasicBlockEdge Edge(BI->getParent(), BB0);
- if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent()))
- continue;
-
- computeKnownBitsFromTrueCondition(V, Cmp, KnownZero, KnownOne, DL, Depth,
- Q);
- }
-
- // Option 2 - Search the other uses of V
- unsigned NumUsesExplored = 0;
- for (auto U : V->users()) {
- // Avoid massive lists
- if (NumUsesExplored >= DomConditionsMaxUses)
- break;
- NumUsesExplored++;
- // Consider only compare instructions uniquely controlling a branch
- ICmpInst *Cmp = dyn_cast<ICmpInst>(U);
- if (!Cmp)
- continue;
-
- if (DomConditionsSingleCmpUse && !Cmp->hasOneUse())
- continue;
-
- for (auto *CmpU : Cmp->users()) {
- BranchInst *BI = dyn_cast<BranchInst>(CmpU);
- if (!BI || BI->isUnconditional())
- continue;
- // We're looking for conditions that are guaranteed to hold at the
- // context instruction. Finding a condition where one path dominates
- // the context isn't enough because both the true and false cases could
- // merge before the context instruction we're actually interested in.
- // Instead, we need to ensure that the taken *edge* dominates the context
- // instruction.
- BasicBlock *BB0 = BI->getSuccessor(0);
- BasicBlockEdge Edge(BI->getParent(), BB0);
- if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent()))
- continue;
-
- computeKnownBitsFromTrueCondition(V, Cmp, KnownZero, KnownOne, DL, Depth,
- Q);
- }
- }
+ return ::isValidAssumeForContext(const_cast<Instruction *>(I), CxtI, DT);
}
static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
- APInt &KnownOne, const DataLayout &DL,
- unsigned Depth, const Query &Q) {
+ APInt &KnownOne, unsigned Depth,
+ const Query &Q) {
// Use of assumptions is context-sensitive. If we don't have a context, we
// cannot use them!
if (!Q.AC || !Q.CxtI)
@@ -740,7 +532,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
CallInst *I = cast<CallInst>(AssumeVH);
assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() &&
"Got assumption for the wrong function!");
- if (Q.ExclInvs.count(I))
+ if (Q.isExcluded(I))
continue;
    // Warning: This loop can end up being somewhat performance sensitive.
@@ -752,7 +544,7 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
Value *Arg = I->getArgOperand(0);
- if (Arg == V && isValidAssumeForContext(I, Q)) {
+ if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
assert(BitWidth == 1 && "assume operand is not i1?");
KnownZero.clearAllBits();
KnownOne.setAllBits();
@@ -772,19 +564,20 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
ConstantInt *C;
// assume(v = a)
if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
KnownZero |= RHSKnownZero;
KnownOne |= RHSKnownOne;
// assume(v & b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
- computeKnownBits(B, MaskKnownZero, MaskKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// known bits from the RHS to V.
@@ -793,11 +586,12 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(~(v & b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
- computeKnownBits(B, MaskKnownZero, MaskKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// inverted known bits from the RHS to V.
@@ -806,11 +600,12 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(v | b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V.
@@ -819,11 +614,12 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(~(v | b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V.
@@ -832,11 +628,12 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(v ^ b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V. For those bits in B that are known to be one,
@@ -848,11 +645,12 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(~(v ^ b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V. For those bits in B that are
@@ -864,9 +662,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(v << c = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
KnownZero |= RHSKnownZero.lshr(C->getZExtValue());
@@ -874,9 +673,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
// assume(~(v << c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
KnownZero |= RHSKnownOne.lshr(C->getZExtValue());
@@ -886,9 +686,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)),
m_AShr(m_V, m_ConstantInt(C))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
KnownZero |= RHSKnownZero << C->getZExtValue();
@@ -898,18 +699,20 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
m_LShr(m_V, m_ConstantInt(C)),
m_AShr(m_V, m_ConstantInt(C)))),
m_Value(A))) &&
- Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_EQ &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
KnownZero |= RHSKnownOne << C->getZExtValue();
KnownOne |= RHSKnownZero << C->getZExtValue();
// assume(v >=_s c) where c is non-negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_SGE && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_SGE &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
if (RHSKnownZero.isNegative()) {
// We know that the sign bit is zero.
@@ -917,9 +720,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
}
// assume(v >_s c) where c is at least -1.
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_SGT && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_SGT &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) {
// We know that the sign bit is zero.
@@ -927,9 +731,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
}
// assume(v <=_s c) where c is negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_SLE && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_SLE &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
if (RHSKnownOne.isNegative()) {
// We know that the sign bit is one.
@@ -937,9 +742,10 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
}
// assume(v <_s c) where c is non-positive
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_SLT && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_SLT &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) {
// We know that the sign bit is one.
@@ -947,22 +753,24 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
}
// assume(v <=_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_ULE && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_ULE &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
KnownZero |=
APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
// assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
- Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q)) {
+ Pred == ICmpInst::ICMP_ULT &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+ computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
- if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I), DL))
+ if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
KnownZero |=
APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1);
else
@@ -984,20 +792,19 @@ template <typename KZFunctor, typename KOFunctor>
static void computeKnownBitsFromShiftOperator(Operator *I,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
- const DataLayout &DL, unsigned Depth, const Query &Q,
- KZFunctor KZF, KOFunctor KOF) {
+ unsigned Depth, const Query &Q, KZFunctor KZF, KOFunctor KOF) {
unsigned BitWidth = KnownZero.getBitWidth();
if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
KnownZero = KZF(KnownZero, ShiftAmt);
KnownOne = KOF(KnownOne, ShiftAmt);
return;
}
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
// Note: We cannot use KnownZero.getLimitedValue() here, because if
// BitWidth > 64 and any upper bits are known, we'll end up returning the
@@ -1007,7 +814,8 @@ static void computeKnownBitsFromShiftOperator(Operator *I,
// It would be more-clearly correct to use the two temporaries for this
// calculation. Reusing the APInts here to prevent unnecessary allocations.
- KnownZero.clearAllBits(), KnownOne.clearAllBits();
+ KnownZero.clearAllBits();
+ KnownOne.clearAllBits();
// If we know the shifter operand is nonzero, we can sometimes infer more
// known bits. However this is expensive to compute, so be lazy about it and
@@ -1017,12 +825,12 @@ static void computeKnownBitsFromShiftOperator(Operator *I,
// Early exit if we can't constrain any well-defined shift amount.
if (!(ShiftAmtKZ & (BitWidth - 1)) && !(ShiftAmtKO & (BitWidth - 1))) {
ShifterOperandIsNonZero =
- isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q);
+ isKnownNonZero(I->getOperand(1), Depth + 1, Q);
if (!*ShifterOperandIsNonZero)
return;
}
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) {
@@ -1038,7 +846,7 @@ static void computeKnownBitsFromShiftOperator(Operator *I,
if (ShiftAmt == 0) {
if (!ShifterOperandIsNonZero.hasValue())
ShifterOperandIsNonZero =
- isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q);
+ isKnownNonZero(I->getOperand(1), Depth + 1, Q);
if (*ShifterOperandIsNonZero)
continue;
}
@@ -1052,13 +860,15 @@ static void computeKnownBitsFromShiftOperator(Operator *I,
// return anything we'd like, but we need to make sure the sets of known bits
// stay disjoint (it should be better for some other code to actually
// propagate the undef than to pick a value here using known bits).
- if ((KnownZero & KnownOne) != 0)
- KnownZero.clearAllBits(), KnownOne.clearAllBits();
+ if ((KnownZero & KnownOne) != 0) {
+ KnownZero.clearAllBits();
+ KnownOne.clearAllBits();
+ }
}
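(Aside for readers of this hunk, not part of the patch: when the shift amount is a constant, the known-zero and known-one masks of the first operand can simply be shifted the same way, with shl additionally proving the vacated low bits zero. The standalone C++ sketch below mirrors that KZF/KOF fast path on hypothetical 8-bit masks.)

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical operand: low nibble known to be 0b0101, high nibble unknown.
  uint8_t KnownZero = 0x0A; // bits known to be zero
  uint8_t KnownOne  = 0x05; // bits known to be one
  unsigned ShiftAmt = 2;    // constant shift amount

  // shl by a constant: both masks shift left, and the vacated low bits are
  // known zero; this is exactly what the Shl KZF adds back in.
  uint8_t ShlKnownZero =
      (uint8_t)(KnownZero << ShiftAmt) | (uint8_t)((1u << ShiftAmt) - 1);
  uint8_t ShlKnownOne = (uint8_t)(KnownOne << ShiftAmt);

  assert(ShlKnownOne == 0x14);               // 0b00010100
  assert(ShlKnownZero == 0x2B);              // 0b00101011
  assert((ShlKnownZero & ShlKnownOne) == 0); // known sets stay disjoint
  return 0;
}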
static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
- APInt &KnownOne, const DataLayout &DL,
- unsigned Depth, const Query &Q) {
+ APInt &KnownOne, unsigned Depth,
+ const Query &Q) {
unsigned BitWidth = KnownZero.getBitWidth();
APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
@@ -1070,8 +880,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
// Output known-1 bits are only known if set in both the LHS & RHS.
KnownOne &= KnownOne2;
@@ -1089,15 +899,15 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
m_Value(Y)))) {
APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0);
- computeKnownBits(Y, KnownZero3, KnownOne3, DL, Depth + 1, Q);
+ computeKnownBits(Y, KnownZero3, KnownOne3, Depth + 1, Q);
if (KnownOne3.countTrailingOnes() > 0)
KnownZero |= APInt::getLowBitsSet(BitWidth, 1);
}
break;
}
case Instruction::Or: {
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
// Output known-0 bits are only known if clear in both the LHS & RHS.
KnownZero &= KnownZero2;
@@ -1106,8 +916,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
}
case Instruction::Xor: {
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
@@ -1119,19 +929,19 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
case Instruction::Mul: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, KnownZero,
- KnownOne, KnownZero2, KnownOne2, DL, Depth, Q);
+ KnownOne, KnownZero2, KnownOne2, Depth, Q);
break;
}
case Instruction::UDiv: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
unsigned LeadZ = KnownZero2.countLeadingOnes();
KnownOne2.clearAllBits();
KnownZero2.clearAllBits();
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
if (RHSUnknownLeadingOnes != BitWidth)
LeadZ = std::min(BitWidth,
@@ -1141,8 +951,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
}
case Instruction::Select:
- computeKnownBits(I->getOperand(2), KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
// Only known if known in both the LHS and RHS.
KnownOne &= KnownOne2;
@@ -1166,12 +976,12 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
unsigned SrcBitWidth;
// Note that we handle pointer operands here because of inttoptr/ptrtoint
// which fall through here.
- SrcBitWidth = DL.getTypeSizeInBits(SrcTy->getScalarType());
+ SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType());
assert(SrcBitWidth && "SrcBitWidth can't be zero");
KnownZero = KnownZero.zextOrTrunc(SrcBitWidth);
KnownOne = KnownOne.zextOrTrunc(SrcBitWidth);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
KnownZero = KnownZero.zextOrTrunc(BitWidth);
KnownOne = KnownOne.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
@@ -1181,12 +991,11 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
}
case Instruction::BitCast: {
Type *SrcTy = I->getOperand(0)->getType();
- if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy() ||
- SrcTy->isFloatingPointTy()) &&
+ if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
// TODO: For now, not handling conversions like:
// (bitcast i64 %x to <2 x i32>)
!I->getType()->isVectorTy()) {
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
break;
}
break;
@@ -1197,7 +1006,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
KnownZero = KnownZero.zext(BitWidth);
KnownOne = KnownOne.zext(BitWidth);
@@ -1221,8 +1030,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
};
computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, DL, Depth, Q,
- KZF, KOF);
+ KnownZero2, KnownOne2, Depth, Q, KZF,
+ KOF);
break;
}
case Instruction::LShr: {
@@ -1238,8 +1047,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
};
computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, DL, Depth, Q,
- KZF, KOF);
+ KnownZero2, KnownOne2, Depth, Q, KZF,
+ KOF);
break;
}
case Instruction::AShr: {
@@ -1253,22 +1062,22 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
};
computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, DL, Depth, Q,
- KZF, KOF);
+ KnownZero2, KnownOne2, Depth, Q, KZF,
+ KOF);
break;
}
case Instruction::Sub: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
- KnownZero, KnownOne, KnownZero2, KnownOne2, DL,
- Depth, Q);
+ KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
+ Q);
break;
}
case Instruction::Add: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
- KnownZero, KnownOne, KnownZero2, KnownOne2, DL,
- Depth, Q);
+ KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
+ Q);
break;
}
case Instruction::SRem:
@@ -1276,7 +1085,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1,
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1,
Q);
// The low bits of the first operand are unchanged by the srem.
@@ -1301,8 +1110,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
// remainder is zero.
if (KnownZero.isNonNegative()) {
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, DL,
- Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
+ Q);
// If it's known zero, our sign bit is also zero.
if (LHSKnownZero.isNegative())
KnownZero.setBit(BitWidth - 1);
@@ -1311,11 +1120,10 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
case Instruction::URem: {
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
- APInt RA = Rem->getValue();
+ const APInt &RA = Rem->getValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1,
- Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
KnownZero |= ~LowBits;
KnownOne &= LowBits;
break;
@@ -1324,8 +1132,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
KnownZero2.countLeadingOnes());
@@ -1338,7 +1146,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
AllocaInst *AI = cast<AllocaInst>(I);
unsigned Align = AI->getAlignment();
if (Align == 0)
- Align = DL.getABITypeAlignment(AI->getType()->getElementType());
+ Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
if (Align > 0)
KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
@@ -1348,8 +1156,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
// Analyze all of the subscripts of this getelementptr instruction
// to determine if we can prove known low zero bits.
APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, DL,
- Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, Depth + 1,
+ Q);
unsigned TrailZ = LocalKnownZero.countTrailingOnes();
gep_type_iterator GTI = gep_type_begin(I);
@@ -1367,7 +1175,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
Index = CIndex->getSplatValue();
unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
- const StructLayout *SL = DL.getStructLayout(STy);
+ const StructLayout *SL = Q.DL.getStructLayout(STy);
uint64_t Offset = SL->getElementOffset(Idx);
TrailZ = std::min<unsigned>(TrailZ,
countTrailingZeros(Offset));
@@ -1379,10 +1187,9 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
}
unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
- uint64_t TypeSize = DL.getTypeAllocSize(IndexedTy);
+ uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy);
LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
- computeKnownBits(Index, LocalKnownZero, LocalKnownOne, DL, Depth + 1,
- Q);
+ computeKnownBits(Index, LocalKnownZero, LocalKnownOne, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
LocalKnownZero.countTrailingOnes()));
@@ -1424,11 +1231,11 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
// Ok, we have a PHI of the form L op= R. Check for low
// zero bits.
- computeKnownBits(R, KnownZero2, KnownOne2, DL, Depth + 1, Q);
+ computeKnownBits(R, KnownZero2, KnownOne2, Depth + 1, Q);
// We need to take the minimum number of known bits
APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
- computeKnownBits(L, KnownZero3, KnownOne3, DL, Depth + 1, Q);
+ computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q);
KnownZero = APInt::getLowBitsSet(BitWidth,
std::min(KnownZero2.countTrailingOnes(),
@@ -1459,8 +1266,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
KnownOne2 = APInt(BitWidth, 0);
// Recurse, but cap the recursion to one level, because we don't
// want to waste time spinning around in loops.
- computeKnownBits(IncValue, KnownZero2, KnownOne2, DL,
- MaxDepth - 1, Q);
+ computeKnownBits(IncValue, KnownZero2, KnownOne2, MaxDepth - 1, Q);
KnownZero &= KnownZero2;
KnownOne &= KnownOne2;
// If all bits have been ruled out, there's no need to check
@@ -1473,17 +1279,21 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
}
case Instruction::Call:
case Instruction::Invoke:
+ // If range metadata is attached to this call, set known bits from that,
+ // and then intersect with known bits based on other properties of the
+ // function.
if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range))
computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne);
- // If a range metadata is attached to this IntrinsicInst, intersect the
- // explicit range specified by the metadata and the implicit range of
- // the intrinsic.
+ if (Value *RV = CallSite(I).getReturnedArgOperand()) {
+ computeKnownBits(RV, KnownZero2, KnownOne2, Depth + 1, Q);
+ KnownZero |= KnownZero2;
+ KnownOne |= KnownOne2;
+ }
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::bswap:
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL,
- Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
KnownZero |= KnownZero2.byteSwap();
KnownOne |= KnownOne2.byteSwap();
break;
@@ -1497,8 +1307,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
break;
}
case Intrinsic::ctpop: {
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL,
- Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation();
@@ -1511,12 +1320,6 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
// of bits which might be set provided by popcnt KnownOne2.
break;
}
- case Intrinsic::fabs: {
- Type *Ty = II->getType();
- APInt SignBit = APInt::getSignBit(Ty->getScalarSizeInBits());
- KnownZero |= APInt::getSplat(Ty->getPrimitiveSizeInBits(), SignBit);
- break;
- }
case Intrinsic::x86_sse42_crc32_64_64:
KnownZero |= APInt::getHighBitsSet(64, 32);
break;
@@ -1534,19 +1337,19 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
case Intrinsic::sadd_with_overflow:
computeKnownBitsAddSub(true, II->getArgOperand(0),
II->getArgOperand(1), false, KnownZero,
- KnownOne, KnownZero2, KnownOne2, DL, Depth, Q);
+ KnownOne, KnownZero2, KnownOne2, Depth, Q);
break;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
computeKnownBitsAddSub(false, II->getArgOperand(0),
II->getArgOperand(1), false, KnownZero,
- KnownOne, KnownZero2, KnownOne2, DL, Depth, Q);
+ KnownOne, KnownZero2, KnownOne2, Depth, Q);
break;
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow:
computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false,
- KnownZero, KnownOne, KnownZero2, KnownOne2, DL,
- Depth, Q);
+ KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
+ Q);
break;
}
}
@@ -1554,46 +1357,6 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero,
}
}
-static unsigned getAlignment(const Value *V, const DataLayout &DL) {
- unsigned Align = 0;
- if (auto *GO = dyn_cast<GlobalObject>(V)) {
- Align = GO->getAlignment();
- if (Align == 0) {
- if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
- Type *ObjectType = GVar->getType()->getElementType();
- if (ObjectType->isSized()) {
- // If the object is defined in the current Module, we'll be giving
- // it the preferred alignment. Otherwise, we have to assume that it
- // may only have the minimum ABI alignment.
- if (GVar->isStrongDefinitionForLinker())
- Align = DL.getPreferredAlignment(GVar);
- else
- Align = DL.getABITypeAlignment(ObjectType);
- }
- }
- }
- } else if (const Argument *A = dyn_cast<Argument>(V)) {
- Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0;
-
- if (!Align && A->hasStructRetAttr()) {
- // An sret parameter has at least the ABI alignment of the return type.
- Type *EltTy = cast<PointerType>(A->getType())->getElementType();
- if (EltTy->isSized())
- Align = DL.getABITypeAlignment(EltTy);
- }
- } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V))
- Align = AI->getAlignment();
- else if (auto CS = ImmutableCallSite(V))
- Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
- else if (const LoadInst *LI = dyn_cast<LoadInst>(V))
- if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
- ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
- Align = CI->getLimitedValue();
- }
-
- return Align;
-}
-
/// Determine which bits of V are known to be either zero or one and return
/// them in the KnownZero/KnownOne bit sets.
///
@@ -1610,16 +1373,15 @@ static unsigned getAlignment(const Value *V, const DataLayout &DL) {
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- const DataLayout &DL, unsigned Depth, const Query &Q) {
+ unsigned Depth, const Query &Q) {
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
unsigned BitWidth = KnownZero.getBitWidth();
assert((V->getType()->isIntOrIntVectorTy() ||
- V->getType()->isFPOrFPVectorTy() ||
V->getType()->getScalarType()->isPointerTy()) &&
- "Not integer, floating point, or pointer type!");
- assert((DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) &&
+ "Not integer or pointer type!");
+ assert((Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) &&
(!V->getType()->isIntOrIntVectorTy() ||
V->getType()->getScalarSizeInBits() == BitWidth) &&
KnownZero.getBitWidth() == BitWidth &&
@@ -1633,15 +1395,13 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
return;
}
// Null and aggregate-zero are all-zeros.
- if (isa<ConstantPointerNull>(V) ||
- isa<ConstantAggregateZero>(V)) {
+ if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) {
KnownOne.clearAllBits();
KnownZero = APInt::getAllOnesValue(BitWidth);
return;
}
// Handle a constant vector by taking the intersection of the known bits of
- // each element. There is no real need to handle ConstantVector here, because
- // we don't handle undef in any particularly useful way.
+ // each element.
if (ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
// We know that CDS must be a vector of integers. Take the intersection of
// each element.
@@ -1655,6 +1415,26 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
return;
}
+ if (auto *CV = dyn_cast<ConstantVector>(V)) {
+ // We know that CV must be a vector of integers. Take the intersection of
+ // each element.
+ KnownZero.setAllBits(); KnownOne.setAllBits();
+ APInt Elt(KnownZero.getBitWidth(), 0);
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Element = CV->getAggregateElement(i);
+ auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
+ if (!ElementCI) {
+ KnownZero.clearAllBits();
+ KnownOne.clearAllBits();
+ return;
+ }
+ Elt = ElementCI->getValue();
+ KnownZero &= ~Elt;
+ KnownOne &= Elt;
+ }
+ return;
+ }
+
// Start out not knowing anything.
KnownZero.clearAllBits(); KnownOne.clearAllBits();
@@ -1666,33 +1446,26 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
// A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
// the bits of its aliasee.
if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (!GA->mayBeOverridden())
- computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, DL, Depth + 1, Q);
+ if (!GA->isInterposable())
+ computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, Depth + 1, Q);
return;
}
if (Operator *I = dyn_cast<Operator>(V))
- computeKnownBitsFromOperator(I, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBitsFromOperator(I, KnownZero, KnownOne, Depth, Q);
// Aligned pointers have trailing zeros - refine KnownZero set
if (V->getType()->isPointerTy()) {
- unsigned Align = getAlignment(V, DL);
+ unsigned Align = V->getPointerAlignment(Q.DL);
if (Align)
KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
}
- // computeKnownBitsFromAssume and computeKnownBitsFromDominatingCondition
- // strictly refines KnownZero and KnownOne. Therefore, we run them after
- // computeKnownBitsFromOperator.
+ // computeKnownBitsFromAssume strictly refines KnownZero and
+ // KnownOne. Therefore, we run them after computeKnownBitsFromOperator.
// Check whether a nearby assume intrinsic can determine some known bits.
- computeKnownBitsFromAssume(V, KnownZero, KnownOne, DL, Depth, Q);
-
- // Check whether there's a dominating condition which implies something about
- // this value at the given context.
- if (EnableDomConditions && Depth <= DomConditionsMaxDepth)
- computeKnownBitsFromDominatingCondition(V, KnownZero, KnownOne, DL, Depth,
- Q);
+ computeKnownBitsFromAssume(V, KnownZero, KnownOne, Depth, Q);
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
}
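(Illustration only, not part of the patch: the new ConstantVector case above intersects the known bits of every lane, so a bit is known zero or known one only if it has that value in all elements. A minimal standalone C++ sketch of that intersection, using plain 8-bit lanes instead of APInt:)

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Hypothetical constant vector <i8 0x0F, i8 0x1F, i8 0x0D>.
  std::vector<uint8_t> Elements = {0x0F, 0x1F, 0x0D};

  uint8_t KnownZero = 0xFF; // start fully known, then intersect per lane
  uint8_t KnownOne = 0xFF;
  for (uint8_t Elt : Elements) {
    KnownZero &= (uint8_t)~Elt; // known zero only if zero in every lane
    KnownOne &= Elt;            // known one only if one in every lane
  }

  assert(KnownZero == 0xE0); // the top three bits are zero in all lanes
  assert(KnownOne == 0x0D);  // bits 0, 2 and 3 are one in all lanes
  assert((KnownZero & KnownOne) == 0);
  return 0;
}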
@@ -1700,8 +1473,8 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
/// Determine whether the sign bit is known to be zero or one.
/// Convenience wrapper around computeKnownBits.
void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth, const Query &Q) {
- unsigned BitWidth = getBitWidth(V->getType(), DL);
+ unsigned Depth, const Query &Q) {
+ unsigned BitWidth = getBitWidth(V->getType(), Q.DL);
if (!BitWidth) {
KnownZero = false;
KnownOne = false;
@@ -1709,7 +1482,7 @@ void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
}
APInt ZeroBits(BitWidth, 0);
APInt OneBits(BitWidth, 0);
- computeKnownBits(V, ZeroBits, OneBits, DL, Depth, Q);
+ computeKnownBits(V, ZeroBits, OneBits, Depth, Q);
KnownOne = OneBits[BitWidth - 1];
KnownZero = ZeroBits[BitWidth - 1];
}
@@ -1719,13 +1492,14 @@ void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
/// be a power of two when defined. Supports values with integer or pointer
/// types and vectors of integers.
bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
- const Query &Q, const DataLayout &DL) {
+ const Query &Q) {
if (Constant *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return OrZero;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
- return CI->getValue().isPowerOf2();
- // TODO: Handle vector constants.
+
+ const APInt *ConstIntOrConstSplatInt;
+ if (match(C, m_APInt(ConstIntOrConstSplatInt)))
+ return ConstIntOrConstSplatInt->isPowerOf2();
}
// 1 << X is clearly a power of two if the one is not shifted off the end. If
@@ -1747,19 +1521,19 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
// or zero.
if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
match(V, m_LShr(m_Value(X), m_Value()))))
- return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL);
+ return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q);
if (ZExtInst *ZI = dyn_cast<ZExtInst>(V))
- return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q, DL);
+ return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q);
if (SelectInst *SI = dyn_cast<SelectInst>(V))
- return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q, DL) &&
- isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q, DL);
+ return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) &&
+ isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q);
if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) {
// A power of two and'd with anything is a power of two or zero.
- if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL) ||
- isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q, DL))
+ if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q) ||
+ isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q))
return true;
// X & (-X) is always a power of two or zero.
if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X))))
@@ -1774,19 +1548,19 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) {
if (match(X, m_And(m_Specific(Y), m_Value())) ||
match(X, m_And(m_Value(), m_Specific(Y))))
- if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q, DL))
+ if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q))
return true;
if (match(Y, m_And(m_Specific(X), m_Value())) ||
match(Y, m_And(m_Value(), m_Specific(X))))
- if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q, DL))
+ if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q))
return true;
unsigned BitWidth = V->getType()->getScalarSizeInBits();
APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0);
- computeKnownBits(X, LHSZeroBits, LHSOneBits, DL, Depth, Q);
+ computeKnownBits(X, LHSZeroBits, LHSOneBits, Depth, Q);
APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0);
- computeKnownBits(Y, RHSZeroBits, RHSOneBits, DL, Depth, Q);
+ computeKnownBits(Y, RHSZeroBits, RHSOneBits, Depth, Q);
// If i8 V is a power of two or zero:
// ZeroBits: 1 1 1 0 1 1 1 1
// ~ZeroBits: 0 0 0 1 0 0 0 0
@@ -1804,7 +1578,7 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) ||
match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) {
return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero,
- Depth, Q, DL);
+ Depth, Q);
}
return false;
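(Illustration only, not part of the patch: the And case above relies on the classic identity that X & -X isolates the lowest set bit of X, so the result is always a power of two or zero. A tiny standalone C++ check:)

#include <cassert>
#include <cstdint>
#include <initializer_list>

// A nonzero x is a power of two iff it has exactly one bit set.
static bool isPowerOfTwo(uint32_t X) { return X != 0 && (X & (X - 1)) == 0; }

int main() {
  for (uint32_t X : {0u, 1u, 6u, 40u, 0x80000000u, 0xDEADBEEFu}) {
    uint32_t Lowest = X & (0u - X); // isolate the lowest set bit
    assert(Lowest == 0 || isPowerOfTwo(Lowest));
  }
  return 0;
}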
@@ -1816,8 +1590,8 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
/// to be non-null.
///
/// Currently this routine does not support vector GEPs.
-static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
- unsigned Depth, const Query &Q) {
+static bool isGEPKnownNonNull(GEPOperator *GEP, unsigned Depth,
+ const Query &Q) {
if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
return false;
@@ -1826,7 +1600,7 @@ static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
// If the base pointer is non-null, we cannot walk to a null address with an
// inbounds GEP in address space zero.
- if (isKnownNonZero(GEP->getPointerOperand(), DL, Depth, Q))
+ if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q))
return true;
// Walk the GEP operands and see if any operand introduces a non-zero offset.
@@ -1838,7 +1612,7 @@ static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
ConstantInt *OpC = cast<ConstantInt>(GTI.getOperand());
unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = DL.getStructLayout(STy);
+ const StructLayout *SL = Q.DL.getStructLayout(STy);
uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
if (ElementOffset > 0)
return true;
@@ -1846,7 +1620,7 @@ static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
}
// If we have a zero-sized type, the index doesn't matter. Keep looping.
- if (DL.getTypeAllocSize(GTI.getIndexedType()) == 0)
+ if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0)
continue;
// Fast path the constant operand case both for efficiency and so we don't
@@ -1865,7 +1639,7 @@ static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
if (Depth++ >= MaxDepth)
continue;
- if (isKnownNonZero(GTI.getOperand(), DL, Depth, Q))
+ if (isKnownNonZero(GTI.getOperand(), Depth, Q))
return true;
}
@@ -1875,8 +1649,7 @@ static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout &DL,
/// Does the 'Range' metadata (which must be a valid MD_range operand list)
/// ensure that the value it's attached to is never Value? 'RangeType' is
/// the type of the value described by the range.
-static bool rangeMetadataExcludesValue(MDNode* Ranges,
- const APInt& Value) {
+static bool rangeMetadataExcludesValue(MDNode* Ranges, const APInt& Value) {
const unsigned NumRanges = Ranges->getNumOperands() / 2;
assert(NumRanges >= 1);
for (unsigned i = 0; i < NumRanges; ++i) {
@@ -1895,23 +1668,35 @@ static bool rangeMetadataExcludesValue(MDNode* Ranges,
/// For vectors return true if every element is known to be non-zero when
/// defined. Supports values with integer or pointer type and vectors of
/// integers.
-bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
- const Query &Q) {
- if (Constant *C = dyn_cast<Constant>(V)) {
+bool isKnownNonZero(Value *V, unsigned Depth, const Query &Q) {
+ if (auto *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return false;
if (isa<ConstantInt>(C))
// Must be non-zero due to null test above.
return true;
- // TODO: Handle vectors
+
+ // For constant vectors, check that all elements are undefined or known
+ // non-zero to determine that the whole vector is known non-zero.
+ if (auto *VecTy = dyn_cast<VectorType>(C->getType())) {
+ for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt || Elt->isNullValue())
+ return false;
+ if (!isa<UndefValue>(Elt) && !isa<ConstantInt>(Elt))
+ return false;
+ }
+ return true;
+ }
+
return false;
}
- if (Instruction* I = dyn_cast<Instruction>(V)) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) {
// If the possible ranges don't contain zero, then the value is
// definitely non-zero.
- if (IntegerType* Ty = dyn_cast<IntegerType>(V->getType())) {
+ if (auto *Ty = dyn_cast<IntegerType>(V->getType())) {
const APInt ZeroValue(Ty->getBitWidth(), 0);
if (rangeMetadataExcludesValue(Ranges, ZeroValue))
return true;
@@ -1926,22 +1711,22 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
// Check for pointer simplifications.
if (V->getType()->isPointerTy()) {
if (isKnownNonNull(V))
- return true;
+ return true;
if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
- if (isGEPKnownNonNull(GEP, DL, Depth, Q))
+ if (isGEPKnownNonNull(GEP, Depth, Q))
return true;
}
- unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), DL);
+ unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL);
// X | Y != 0 if X != 0 or Y != 0.
Value *X = nullptr, *Y = nullptr;
if (match(V, m_Or(m_Value(X), m_Value(Y))))
- return isKnownNonZero(X, DL, Depth, Q) || isKnownNonZero(Y, DL, Depth, Q);
+ return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q);
// ext X != 0 if X != 0.
if (isa<SExtInst>(V) || isa<ZExtInst>(V))
- return isKnownNonZero(cast<Instruction>(V)->getOperand(0), DL, Depth, Q);
+ return isKnownNonZero(cast<Instruction>(V)->getOperand(0), Depth, Q);
// shl X, Y != 0 if X is odd. Note that the value of the shift is undefined
// if the lowest bit is shifted off the end.
@@ -1949,11 +1734,11 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
// shl nuw can't remove any non-zero bits.
OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
if (BO->hasNoUnsignedWrap())
- return isKnownNonZero(X, DL, Depth, Q);
+ return isKnownNonZero(X, Depth, Q);
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
- computeKnownBits(X, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
if (KnownOne[0])
return true;
}
@@ -1963,10 +1748,10 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
// shr exact can only shift out zero bits.
PossiblyExactOperator *BO = cast<PossiblyExactOperator>(V);
if (BO->isExact())
- return isKnownNonZero(X, DL, Depth, Q);
+ return isKnownNonZero(X, Depth, Q);
bool XKnownNonNegative, XKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, DL, Depth, Q);
+ ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
if (XKnownNegative)
return true;
@@ -1976,32 +1761,32 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
- computeKnownBits(X, KnownZero, KnownOne, DL, Depth, Q);
-
+ computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
+
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
if (KnownZero.countTrailingOnes() >= ShiftVal)
- return isKnownNonZero(X, DL, Depth, Q);
+ return isKnownNonZero(X, Depth, Q);
}
}
// div exact can only produce a zero if the dividend is zero.
else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
- return isKnownNonZero(X, DL, Depth, Q);
+ return isKnownNonZero(X, Depth, Q);
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
bool XKnownNonNegative, XKnownNegative;
bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, DL, Depth, Q);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, DL, Depth, Q);
+ ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
+ ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
if (XKnownNonNegative && YKnownNonNegative)
- if (isKnownNonZero(X, DL, Depth, Q) || isKnownNonZero(Y, DL, Depth, Q))
+ if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
@@ -2012,22 +1797,22 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
- computeKnownBits(X, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
if ((KnownOne & Mask) != 0)
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
- computeKnownBits(Y, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBits(Y, KnownZero, KnownOne, Depth, Q);
if ((KnownOne & Mask) != 0)
return true;
}
// The sum of a non-negative number and a power of two is not zero.
if (XKnownNonNegative &&
- isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q, DL))
+ isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
if (YKnownNonNegative &&
- isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q, DL))
+ isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
// X * Y.
@@ -2036,13 +1821,13 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
// If X and Y are non-zero then so is X * Y as long as the multiplication
// does not overflow.
if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) &&
- isKnownNonZero(X, DL, Depth, Q) && isKnownNonZero(Y, DL, Depth, Q))
+ isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q))
return true;
}
// (C ? X : Y) != 0 if X != 0 and Y != 0.
else if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
- if (isKnownNonZero(SI->getTrueValue(), DL, Depth, Q) &&
- isKnownNonZero(SI->getFalseValue(), DL, Depth, Q))
+ if (isKnownNonZero(SI->getTrueValue(), Depth, Q) &&
+ isKnownNonZero(SI->getFalseValue(), Depth, Q))
return true;
}
// PHI
@@ -2064,18 +1849,23 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth,
}
}
}
+ // Check if all incoming values are non-zero constants.
+ bool AllNonZeroConstants = all_of(PN->operands(), [](Value *V) {
+ return isa<ConstantInt>(V) && !cast<ConstantInt>(V)->isZeroValue();
+ });
+ if (AllNonZeroConstants)
+ return true;
}
if (!BitWidth) return false;
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
- computeKnownBits(V, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
return KnownOne != 0;
}
/// Return true if V2 == V1 + X, where X is known non-zero.
-static bool isAddOfNonZero(Value *V1, Value *V2, const DataLayout &DL,
- const Query &Q) {
+static bool isAddOfNonZero(Value *V1, Value *V2, const Query &Q) {
BinaryOperator *BO = dyn_cast<BinaryOperator>(V1);
if (!BO || BO->getOpcode() != Instruction::Add)
return false;
@@ -2086,18 +1876,17 @@ static bool isAddOfNonZero(Value *V1, Value *V2, const DataLayout &DL,
Op = BO->getOperand(0);
else
return false;
- return isKnownNonZero(Op, DL, 0, Q);
+ return isKnownNonZero(Op, 0, Q);
}
/// Return true if it is known that V1 != V2.
-static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL,
- const Query &Q) {
+static bool isKnownNonEqual(Value *V1, Value *V2, const Query &Q) {
if (V1->getType()->isVectorTy() || V1 == V2)
return false;
if (V1->getType() != V2->getType())
// We can't look through casts yet.
return false;
- if (isAddOfNonZero(V1, V2, DL, Q) || isAddOfNonZero(V2, V1, DL, Q))
+ if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q))
return true;
if (IntegerType *Ty = dyn_cast<IntegerType>(V1->getType())) {
@@ -2106,10 +1895,10 @@ static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL,
auto BitWidth = Ty->getBitWidth();
APInt KnownZero1(BitWidth, 0);
APInt KnownOne1(BitWidth, 0);
- computeKnownBits(V1, KnownZero1, KnownOne1, DL, 0, Q);
+ computeKnownBits(V1, KnownZero1, KnownOne1, 0, Q);
APInt KnownZero2(BitWidth, 0);
APInt KnownOne2(BitWidth, 0);
- computeKnownBits(V2, KnownZero2, KnownOne2, DL, 0, Q);
+ computeKnownBits(V2, KnownZero2, KnownOne2, 0, Q);
auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1);
if (OppositeBits.getBoolValue())
@@ -2127,26 +1916,48 @@ static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL,
/// where V is a vector, the mask, known zero, and known one values are the
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
-bool MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL,
- unsigned Depth, const Query &Q) {
+bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth,
+ const Query &Q) {
APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
- computeKnownBits(V, KnownZero, KnownOne, DL, Depth, Q);
+ computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
return (KnownZero & Mask) == Mask;
}
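(Illustration only, not part of the patch: MaskedValueIsZero reduces to checking that every bit selected by Mask lies in the KnownZero set. A standalone C++ sketch with a hypothetical 8-byte-aligned value, whose low three bits are known zero:)

#include <cassert>
#include <cstdint>

static bool maskedValueIsZero(uint32_t KnownZero, uint32_t Mask) {
  return (KnownZero & Mask) == Mask; // every masked bit must be known zero
}

int main() {
  // An 8-byte-aligned value has its low three bits known zero; the
  // remaining bits are unknown.
  uint32_t KnownZero = 0x7;
  assert(maskedValueIsZero(KnownZero, 0x7));  // low three bits provably zero
  assert(!maskedValueIsZero(KnownZero, 0xF)); // bit 3 is not known
  return 0;
}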
+/// For vector constants, loop over the elements and find the constant with the
+/// minimum number of sign bits. Return 0 if the value is not a vector constant
+/// or if any element was not analyzed; otherwise, return the count for the
+/// element with the minimum number of sign bits.
+static unsigned computeNumSignBitsVectorConstant(Value *V, unsigned TyBits) {
+ auto *CV = dyn_cast<Constant>(V);
+ if (!CV || !CV->getType()->isVectorTy())
+ return 0;
+ unsigned MinSignBits = TyBits;
+ unsigned NumElts = CV->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // If we find a non-ConstantInt, bail out.
+ auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
+ if (!Elt)
+ return 0;
+
+ // If the sign bit is 1, flip the bits, so we always count leading zeros.
+ APInt EltVal = Elt->getValue();
+ if (EltVal.isNegative())
+ EltVal = ~EltVal;
+ MinSignBits = std::min(MinSignBits, EltVal.countLeadingZeros());
+ }
+
+ return MinSignBits;
+}
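(Illustration only, not part of the patch: the per-element loop above counts redundant sign bits by flipping negative values and counting leading zeros. The same idea on a single 32-bit value, in standalone C++:)

#include <cassert>
#include <cstdint>

// Number of bits equal to the sign bit, counting the sign bit itself.
static unsigned numSignBits(int32_t V) {
  // Flip negative values so we always count leading zeros, as the loop above
  // does with APInt. U always has its top bit clear, so the result is >= 1.
  uint32_t U = (uint32_t)(V < 0 ? ~V : V);
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit != 0 && (U & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

int main() {
  assert(numSignBits(0) == 32);         // every bit equals the sign bit
  assert(numSignBits(-1) == 32);        // likewise for all-ones
  assert(numSignBits(1) == 31);         // 0...01
  assert(numSignBits(-2) == 31);        // 1...10
  assert(numSignBits(0x12345678) == 3); // the top three bits are zero
  // ashr by a constant adds that many sign bits, as the AShr case below notes.
  assert(numSignBits(0x12345678 >> 4) == 7);
  return 0;
}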
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign bit
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
-/// other, so we return 3.
-///
-/// 'Op' must have a scalar integer type.
-///
-unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
- const Query &Q) {
- unsigned TyBits = DL.getTypeSizeInBits(V->getType()->getScalarType());
+/// other, so we return 3. For vectors, return the number of sign bits for the
+/// vector element with the minimum number of known sign bits.
+unsigned ComputeNumSignBits(Value *V, unsigned Depth, const Query &Q) {
+ unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2161,7 +1972,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
default: break;
case Instruction::SExt:
Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
- return ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q) + Tmp;
+ return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp;
case Instruction::SDiv: {
const APInt *Denominator;
@@ -2173,7 +1984,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
break;
// Calculate the incoming numerator bits.
- unsigned NumBits = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// Add floor(log(C)) bits to the numerator bits.
return std::min(TyBits, NumBits + Denominator->logBase2());
@@ -2195,7 +2006,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
// Calculate the incoming numerator bits. SRem by a positive constant
// can't lower the number of sign bits.
unsigned NumrBits =
- ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// Calculate the leading sign bit constraints by examining the
// denominator. Given that the denominator is positive, there are two
@@ -2217,7 +2028,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
}
case Instruction::AShr: {
- Tmp = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// ashr X, C -> adds C sign bits. Vectors too.
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
@@ -2230,7 +2041,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
// shl destroys sign bits.
- Tmp = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
Tmp2 = ShAmt->getZExtValue();
if (Tmp2 >= TyBits || // Bad shift.
Tmp2 >= Tmp) break; // Shifted all sign bits out.
@@ -2242,9 +2053,9 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
case Instruction::Or:
case Instruction::Xor: // NOT is handled here.
// Logical binary ops preserve the number of sign bits at the worst.
- Tmp = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp != 1) {
- Tmp2 = ComputeNumSignBits(U->getOperand(1), DL, Depth + 1, Q);
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
FirstAnswer = std::min(Tmp, Tmp2);
// We computed what we know about the sign bits as our first
// answer. Now proceed to the generic code that uses
@@ -2253,23 +2064,22 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
break;
case Instruction::Select:
- Tmp = ComputeNumSignBits(U->getOperand(1), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
- Tmp2 = ComputeNumSignBits(U->getOperand(2), DL, Depth + 1, Q);
+ Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q);
return std::min(Tmp, Tmp2);
case Instruction::Add:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
- Tmp = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
// Special case decrementing a value (ADD X, -1):
if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
if (CRHS->isAllOnesValue()) {
APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL, Depth + 1,
- Q);
+ computeKnownBits(U->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
@@ -2282,20 +2092,19 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
return Tmp;
}
- Tmp2 = ComputeNumSignBits(U->getOperand(1), DL, Depth + 1, Q);
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp2 == 1) return 1;
return std::min(Tmp, Tmp2)-1;
case Instruction::Sub:
- Tmp2 = ComputeNumSignBits(U->getOperand(1), DL, Depth + 1, Q);
+ Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp2 == 1) return 1;
// Handle NEG.
if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
if (CLHS->isNullValue()) {
APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- computeKnownBits(U->getOperand(1), KnownZero, KnownOne, DL, Depth + 1,
- Q);
+ computeKnownBits(U->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
@@ -2311,7 +2120,7 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
- Tmp = ComputeNumSignBits(U->getOperand(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
return std::min(Tmp, Tmp2)-1;
@@ -2325,11 +2134,11 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
// Take the minimum of all incoming values. This can't infinitely loop
// because of our depth threshold.
- Tmp = ComputeNumSignBits(PN->getIncomingValue(0), DL, Depth + 1, Q);
+ Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q);
for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) {
if (Tmp == 1) return Tmp;
Tmp = std::min(
- Tmp, ComputeNumSignBits(PN->getIncomingValue(i), DL, Depth + 1, Q));
+ Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q));
}
return Tmp;
}
@@ -2342,26 +2151,25 @@ unsigned ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth,
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
+
+ // If we can examine all elements of a vector constant successfully, we're
+ // done (we can't do any better than that). If not, keep trying.
+ if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
+ return VecSignBits;
+
APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- APInt Mask;
- computeKnownBits(V, KnownZero, KnownOne, DL, Depth, Q);
-
- if (KnownZero.isNegative()) { // sign bit is 0
- Mask = KnownZero;
- } else if (KnownOne.isNegative()) { // sign bit is 1;
- Mask = KnownOne;
- } else {
- // Nothing known.
- return FirstAnswer;
- }
+ computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
+
+ // If we know that the sign bit is either zero or one, determine the number of
+ // identical bits in the top of the input value.
+ if (KnownZero.isNegative())
+ return std::max(FirstAnswer, KnownZero.countLeadingOnes());
- // Okay, we know that the sign bit in Mask is set. Use CLZ to determine
- // the number of identical bits in the top of the input value.
- Mask = ~Mask;
- Mask <<= Mask.getBitWidth()-TyBits;
- // Return # leading zeros. We use 'min' here in case Val was zero before
- // shifting. We don't want to return '64' as for an i32 "0".
- return std::max(FirstAnswer, std::min(TyBits, Mask.countLeadingZeros()));
+ if (KnownOne.isNegative())
+ return std::max(FirstAnswer, KnownOne.countLeadingOnes());
+
+ // computeKnownBits gave us no extra information about the top bits.
+ return FirstAnswer;
}
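The rewritten tail above reads the answer straight off the known-bits masks: if the sign bit is known to be 0 (or 1), then every leading bit that is likewise known to be 0 (or 1) is a copy of the sign bit. A minimal standalone sketch of that step in plain C++, fixed at 32 bits and with a hand-rolled leading-ones counter (both are simplifications, not LLVM code):

  #include <algorithm>
  #include <cstdint>

  static unsigned countLeadingOnes32(uint32_t X) {
    unsigned N = 0;
    for (uint32_t Mask = 1u << 31; Mask != 0 && (X & Mask) != 0; Mask >>= 1)
      ++N;
    return N;
  }

  // KnownZero/KnownOne are the bits proven to be 0/1, as above.
  unsigned numSignBitsFromKnownBits(uint32_t KnownZero, uint32_t KnownOne,
                                    unsigned FirstAnswer) {
    if (KnownZero & (1u << 31))   // sign bit known zero
      return std::max(FirstAnswer, countLeadingOnes32(KnownZero));
    if (KnownOne & (1u << 31))    // sign bit known one
      return std::max(FirstAnswer, countLeadingOnes32(KnownOne));
    return FirstAnswer;           // nothing known about the top bits
  }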
/// This function computes the integer multiple of Base that equals V.
@@ -2484,13 +2292,124 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
return false;
}
+Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
+ const TargetLibraryInfo *TLI) {
+ const Function *F = ICS.getCalledFunction();
+ if (!F)
+ return Intrinsic::not_intrinsic;
+
+ if (F->isIntrinsic())
+ return F->getIntrinsicID();
+
+ if (!TLI)
+ return Intrinsic::not_intrinsic;
+
+ LibFunc::Func Func;
+ // We're going to make assumptions about the semantics of these functions, so
+ // check that the target knows the function is available in this environment
+ // and that it does not have local linkage.
+ if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(*F, Func))
+ return Intrinsic::not_intrinsic;
+
+ if (!ICS.onlyReadsMemory())
+ return Intrinsic::not_intrinsic;
+
+ // Otherwise check if we have a call to a library function that can be
+ // mapped to an intrinsic.
+ switch (Func) {
+ default:
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ return Intrinsic::sin;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ return Intrinsic::cos;
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ return Intrinsic::exp;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ return Intrinsic::exp2;
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ return Intrinsic::log;
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ return Intrinsic::log10;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ return Intrinsic::log2;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ return Intrinsic::fabs;
+ case LibFunc::fmin:
+ case LibFunc::fminf:
+ case LibFunc::fminl:
+ return Intrinsic::minnum;
+ case LibFunc::fmax:
+ case LibFunc::fmaxf:
+ case LibFunc::fmaxl:
+ return Intrinsic::maxnum;
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
+ return Intrinsic::copysign;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ return Intrinsic::floor;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ return Intrinsic::ceil;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ return Intrinsic::trunc;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ return Intrinsic::rint;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ return Intrinsic::nearbyint;
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ return Intrinsic::round;
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return Intrinsic::pow;
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ if (ICS->hasNoNaNs())
+ return Intrinsic::sqrt;
+ return Intrinsic::not_intrinsic;
+ }
+
+ return Intrinsic::not_intrinsic;
+}
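A hypothetical caller of the new helper (the header paths and the wrapper name are assumptions for illustration; the declaration is expected alongside the other ValueTracking helpers). The point is that libm calls and LLVM intrinsics can now be classified through a single entry point:

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/CallSite.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // True for fabs()/fabsf()/fabsl() libcalls as well as llvm.fabs.*.
  static bool isFAbsLikeCall(ImmutableCallSite CS, const TargetLibraryInfo *TLI) {
    return getIntrinsicForCallSite(CS, TLI) == Intrinsic::fabs;
  }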
+
/// Return true if we can prove that the specified FP value is never equal to
/// -0.0.
///
/// NOTE: this function will need to be revisited when we support non-default
/// rounding modes!
///
-bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
+bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
+ unsigned Depth) {
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegZero();
@@ -2518,30 +2437,26 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
if (isa<SIToFPInst>(I) || isa<UIToFPInst>(I))
return true;
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
+ switch (IID) {
+ default:
+ break;
// sqrt(-0.0) = -0.0, no other negative results are possible.
- if (II->getIntrinsicID() == Intrinsic::sqrt)
- return CannotBeNegativeZero(II->getArgOperand(0), Depth+1);
-
- if (const CallInst *CI = dyn_cast<CallInst>(I))
- if (const Function *F = CI->getCalledFunction()) {
- if (F->isDeclaration()) {
- // abs(x) != -0.0
- if (F->getName() == "abs") return true;
- // fabs[lf](x) != -0.0
- if (F->getName() == "fabs") return true;
- if (F->getName() == "fabsf") return true;
- if (F->getName() == "fabsl") return true;
- if (F->getName() == "sqrt" || F->getName() == "sqrtf" ||
- F->getName() == "sqrtl")
- return CannotBeNegativeZero(CI->getArgOperand(0), Depth+1);
- }
+ case Intrinsic::sqrt:
+ return CannotBeNegativeZero(CI->getArgOperand(0), TLI, Depth + 1);
+ // fabs(x) != -0.0
+ case Intrinsic::fabs:
+ return true;
}
+ }
return false;
}
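The two cases kept above follow IEEE-754 behavior, which a small standalone C++ program can confirm: fabs never produces -0.0, while sqrt(-0.0) is defined to return -0.0, which is why the sqrt case only recurses into its operand instead of answering true outright:

  #include <cmath>
  #include <cstdio>

  int main() {
    double NegZero = -0.0;
    std::printf("%g\n", std::fabs(NegZero)); // prints 0:  fabs(x) != -0.0
    std::printf("%g\n", std::sqrt(NegZero)); // prints -0: sqrt passes -0.0 through
    return 0;
  }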
-bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) {
+bool llvm::CannotBeOrderedLessThanZero(const Value *V,
+ const TargetLibraryInfo *TLI,
+ unsigned Depth) {
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero();
@@ -2561,52 +2476,53 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) {
return true;
case Instruction::FMul:
// x*x is always non-negative or a NaN.
- if (I->getOperand(0) == I->getOperand(1))
+ if (I->getOperand(0) == I->getOperand(1))
return true;
// Fall through
case Instruction::FAdd:
case Instruction::FDiv:
case Instruction::FRem:
- return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) &&
- CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1);
+ return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) &&
+ CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1);
case Instruction::Select:
- return CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1) &&
- CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1);
+ return CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1) &&
+ CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1);
case Instruction::FPExt:
case Instruction::FPTrunc:
// Widening/narrowing never change sign.
- return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1);
- case Instruction::Call:
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::maxnum:
- return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) ||
- CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1);
- case Intrinsic::minnum:
- return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) &&
- CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1);
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::fabs:
- case Intrinsic::sqrt:
- return true;
- case Intrinsic::powi:
- if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
- // powi(x,n) is non-negative if n is even.
- if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0)
- return true;
- }
- return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1);
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- // x*x+y is non-negative if y is non-negative.
- return I->getOperand(0) == I->getOperand(1) &&
- CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1);
+ return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1);
+ case Instruction::Call:
+ Intrinsic::ID IID = getIntrinsicForCallSite(cast<CallInst>(I), TLI);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::maxnum:
+ return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) ||
+ CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1);
+ case Intrinsic::minnum:
+ return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) &&
+ CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1);
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::fabs:
+ case Intrinsic::sqrt:
+ return true;
+ case Intrinsic::powi:
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // powi(x,n) is non-negative if n is even.
+ if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0)
+ return true;
}
+ return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1);
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ // x*x+y is non-negative if y is non-negative.
+ return I->getOperand(0) == I->getOperand(1) &&
+ CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1);
+ }
break;
}
- return false;
+ return false;
}
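The powi case rests on x^n being non-negative (or NaN) whenever n is even. A quick numeric illustration, with std::pow standing in for llvm.powi and an arbitrary even exponent:

  #include <cassert>
  #include <cmath>

  int main() {
    for (double X : {-3.5, -1.0, 0.0, 2.25})
      assert(!(std::pow(X, 4) < 0.0)); // even exponent: never ordered less than 0
    return 0;
  }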
/// If the specified value can be set by repeating the same byte in memory,
@@ -2863,7 +2779,7 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) {
Ptr = cast<Operator>(Ptr)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
break;
Ptr = GA->getAliasee();
} else {
@@ -2874,6 +2790,24 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
return Ptr;
}
+bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP) {
+ // Make sure the GEP has exactly three arguments.
+ if (GEP->getNumOperands() != 3)
+ return false;
+
+ // Make sure the index-ee is a pointer to array of i8.
+ ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
+ if (!AT || !AT->getElementType()->isIntegerTy(8))
+ return false;
+
+ // Check to make sure that the first operand of the GEP is an integer and
+ // has value 0 so that we are sure we're indexing into the initializer.
+ const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
+ if (!FirstIdx || !FirstIdx->isZero())
+ return false;
+
+ return true;
+}
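The three-operand shape this helper accepts is the address of one character of a constant string; at the source level it corresponds to taking &Str[Idx] of a global char array (the names below are purely illustrative):

  // IR shape accepted:  getelementptr [12 x i8], [12 x i8]* @Str, i64 0, i64 6
  // The same address formed in C++:
  const char Str[] = "hello world";
  const char *P = &Str[6];  // GEP operand 1 is the leading 0, operand 2 is the 6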
/// This function computes the length of a null-terminated C string pointed to
/// by V. If successful, it returns true and returns the string in Str.
@@ -2888,20 +2822,9 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
// If the value is a GEP instruction or constant expression, treat it as an
// offset.
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- // Make sure the GEP has exactly three arguments.
- if (GEP->getNumOperands() != 3)
- return false;
-
- // Make sure the index-ee is a pointer to array of i8.
- PointerType *PT = cast<PointerType>(GEP->getOperand(0)->getType());
- ArrayType *AT = dyn_cast<ArrayType>(PT->getElementType());
- if (!AT || !AT->getElementType()->isIntegerTy(8))
- return false;
-
- // Check to make sure that the first operand of the GEP is an integer and
- // has value 0 so that we are sure we're indexing into the initializer.
- const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
- if (!FirstIdx || !FirstIdx->isZero())
+ // The GEP operator should be based on a pointer to a string constant, and
+ // should be indexing into that string constant.
+ if (!isGEPBasedOnPointerToString(GEP))
return false;
// If the second index isn't a ConstantInt, then this is a variable index
@@ -2923,7 +2846,7 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
return false;
- // Handle the all-zeros case
+ // Handle the all-zeros case.
if (GV->getInitializer()->isNullValue()) {
// This is a degenerate case. The initializer is constant zero so the
// length of the string must be zero.
@@ -2931,13 +2854,12 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
return true;
}
- // Must be a Constant Array
- const ConstantDataArray *Array =
- dyn_cast<ConstantDataArray>(GV->getInitializer());
+ // This must be a ConstantDataArray.
+ const auto *Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
if (!Array || !Array->isString())
return false;
- // Get the number of elements in the array
+ // Get the number of elements in the array.
uint64_t NumElts = Array->getType()->getArrayNumElements();
// Start out with the entire array in the StringRef.
@@ -3060,10 +2982,16 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
return V;
V = GA->getAliasee();
} else {
+ if (auto CS = CallSite(V))
+ if (Value *RV = CS.getReturnedArgOperand()) {
+ V = RV;
+ continue;
+ }
+
// See if InstructionSimplify knows any relevant tricks.
if (Instruction *I = dyn_cast<Instruction>(V))
// TODO: Acquire a DominatorTree and AssumptionCache and use them.
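The new CallSite case looks through calls whose return value is, by the returned argument attribute, one of their arguments, so the underlying object is that argument rather than an opaque call result. memcpy returning its destination is the familiar source-level shape (used here only as an illustration):

  #include <cstring>

  char Buf[16];
  // P aliases Buf: the call returns its first argument, so the underlying
  // object of P is Buf, not "the result of a call".
  char *P = static_cast<char *>(std::memcpy(Buf, "hello", 6));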
@@ -3133,213 +3061,9 @@ bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
return true;
}
-static bool isDereferenceableFromAttribute(const Value *BV, APInt Offset,
- Type *Ty, const DataLayout &DL,
- const Instruction *CtxI,
- const DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- assert(Offset.isNonNegative() && "offset can't be negative");
- assert(Ty->isSized() && "must be sized");
-
- APInt DerefBytes(Offset.getBitWidth(), 0);
- bool CheckForNonNull = false;
- if (const Argument *A = dyn_cast<Argument>(BV)) {
- DerefBytes = A->getDereferenceableBytes();
- if (!DerefBytes.getBoolValue()) {
- DerefBytes = A->getDereferenceableOrNullBytes();
- CheckForNonNull = true;
- }
- } else if (auto CS = ImmutableCallSite(BV)) {
- DerefBytes = CS.getDereferenceableBytes(0);
- if (!DerefBytes.getBoolValue()) {
- DerefBytes = CS.getDereferenceableOrNullBytes(0);
- CheckForNonNull = true;
- }
- } else if (const LoadInst *LI = dyn_cast<LoadInst>(BV)) {
- if (MDNode *MD = LI->getMetadata(LLVMContext::MD_dereferenceable)) {
- ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
- DerefBytes = CI->getLimitedValue();
- }
- if (!DerefBytes.getBoolValue()) {
- if (MDNode *MD =
- LI->getMetadata(LLVMContext::MD_dereferenceable_or_null)) {
- ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
- DerefBytes = CI->getLimitedValue();
- }
- CheckForNonNull = true;
- }
- }
-
- if (DerefBytes.getBoolValue())
- if (DerefBytes.uge(Offset + DL.getTypeStoreSize(Ty)))
- if (!CheckForNonNull || isKnownNonNullAt(BV, CtxI, DT, TLI))
- return true;
-
- return false;
-}
-
-static bool isDereferenceableFromAttribute(const Value *V, const DataLayout &DL,
- const Instruction *CtxI,
- const DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- Type *VTy = V->getType();
- Type *Ty = VTy->getPointerElementType();
- if (!Ty->isSized())
- return false;
-
- APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0);
- return isDereferenceableFromAttribute(V, Offset, Ty, DL, CtxI, DT, TLI);
-}
-
-static bool isAligned(const Value *Base, APInt Offset, unsigned Align,
- const DataLayout &DL) {
- APInt BaseAlign(Offset.getBitWidth(), getAlignment(Base, DL));
-
- if (!BaseAlign) {
- Type *Ty = Base->getType()->getPointerElementType();
- if (!Ty->isSized())
- return false;
- BaseAlign = DL.getABITypeAlignment(Ty);
- }
-
- APInt Alignment(Offset.getBitWidth(), Align);
-
- assert(Alignment.isPowerOf2() && "must be a power of 2!");
- return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1));
-}
-
-static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) {
- Type *Ty = Base->getType();
- assert(Ty->isSized() && "must be sized");
- APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0);
- return isAligned(Base, Offset, Align, DL);
-}
-
-/// Test if V is always a pointer to allocated and suitably aligned memory for
-/// a simple load or store.
-static bool isDereferenceableAndAlignedPointer(
- const Value *V, unsigned Align, const DataLayout &DL,
- const Instruction *CtxI, const DominatorTree *DT,
- const TargetLibraryInfo *TLI, SmallPtrSetImpl<const Value *> &Visited) {
- // Note that it is not safe to speculate into a malloc'd region because
- // malloc may return null.
-
- // These are obviously ok if aligned.
- if (isa<AllocaInst>(V))
- return isAligned(V, Align, DL);
-
- // It's not always safe to follow a bitcast, for example:
- // bitcast i8* (alloca i8) to i32*
- // would result in a 4-byte load from a 1-byte alloca. However,
- // if we're casting from a pointer from a type of larger size
- // to a type of smaller size (or the same size), and the alignment
- // is at least as large as for the resulting pointer type, then
- // we can look through the bitcast.
- if (const BitCastOperator *BC = dyn_cast<BitCastOperator>(V)) {
- Type *STy = BC->getSrcTy()->getPointerElementType(),
- *DTy = BC->getDestTy()->getPointerElementType();
- if (STy->isSized() && DTy->isSized() &&
- (DL.getTypeStoreSize(STy) >= DL.getTypeStoreSize(DTy)) &&
- (DL.getABITypeAlignment(STy) >= DL.getABITypeAlignment(DTy)))
- return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, DL,
- CtxI, DT, TLI, Visited);
- }
-
- // Global variables which can't collapse to null are ok.
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- if (!GV->hasExternalWeakLinkage())
- return isAligned(V, Align, DL);
-
- // byval arguments are okay.
- if (const Argument *A = dyn_cast<Argument>(V))
- if (A->hasByValAttr())
- return isAligned(V, Align, DL);
-
- if (isDereferenceableFromAttribute(V, DL, CtxI, DT, TLI))
- return isAligned(V, Align, DL);
-
- // For GEPs, determine if the indexing lands within the allocated object.
- if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- Type *VTy = GEP->getType();
- Type *Ty = VTy->getPointerElementType();
- const Value *Base = GEP->getPointerOperand();
-
- // Conservatively require that the base pointer be fully dereferenceable
- // and aligned.
- if (!Visited.insert(Base).second)
- return false;
- if (!isDereferenceableAndAlignedPointer(Base, Align, DL, CtxI, DT, TLI,
- Visited))
- return false;
-
- APInt Offset(DL.getPointerTypeSizeInBits(VTy), 0);
- if (!GEP->accumulateConstantOffset(DL, Offset))
- return false;
-
- // Check if the load is within the bounds of the underlying object
- // and offset is aligned.
- uint64_t LoadSize = DL.getTypeStoreSize(Ty);
- Type *BaseType = Base->getType()->getPointerElementType();
- assert(isPowerOf2_32(Align) && "must be a power of 2!");
- return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)) &&
- !(Offset & APInt(Offset.getBitWidth(), Align-1));
- }
-
- // For gc.relocate, look through relocations
- if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V))
- return isDereferenceableAndAlignedPointer(
- RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited);
-
- if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
- return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL,
- CtxI, DT, TLI, Visited);
-
- // If we don't know, assume the worst.
- return false;
-}
-
-bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
- const DataLayout &DL,
- const Instruction *CtxI,
- const DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- // When dereferenceability information is provided by a dereferenceable
- // attribute, we know exactly how many bytes are dereferenceable. If we can
- // determine the exact offset to the attributed variable, we can use that
- // information here.
- Type *VTy = V->getType();
- Type *Ty = VTy->getPointerElementType();
-
- // Require ABI alignment for loads without alignment specification
- if (Align == 0)
- Align = DL.getABITypeAlignment(Ty);
-
- if (Ty->isSized()) {
- APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0);
- const Value *BV = V->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
-
- if (Offset.isNonNegative())
- if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, CtxI, DT, TLI) &&
- isAligned(BV, Offset, Align, DL))
- return true;
- }
-
- SmallPtrSet<const Value *, 32> Visited;
- return ::isDereferenceableAndAlignedPointer(V, Align, DL, CtxI, DT, TLI,
- Visited);
-}
-
-bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL,
- const Instruction *CtxI,
- const DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT, TLI);
-}
-
bool llvm::isSafeToSpeculativelyExecute(const Value *V,
const Instruction *CtxI,
- const DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
+ const DominatorTree *DT) {
const Operator *Inst = dyn_cast<Operator>(V);
if (!Inst)
return false;
@@ -3383,15 +3107,13 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
const LoadInst *LI = cast<LoadInst>(Inst);
if (!LI->isUnordered() ||
// Speculative load may create a race that did not exist in the source.
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeThread) ||
+ LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
// Speculative load may load data from dirty regions.
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress))
+ LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
return false;
const DataLayout &DL = LI->getModule()->getDataLayout();
- return isDereferenceableAndAlignedPointer(
- LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT, TLI);
+ return isDereferenceableAndAlignedPointer(LI->getPointerOperand(),
+ LI->getAlignment(), DL, CtxI, DT);
}
case Instruction::Call: {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
@@ -3416,17 +3138,29 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
case Intrinsic::umul_with_overflow:
case Intrinsic::usub_with_overflow:
return true;
- // Sqrt should be OK, since the llvm sqrt intrinsic isn't defined to set
- // errno like libm sqrt would.
+ // These intrinsics are defined to have the same behavior as libm
+ // functions except for setting errno.
case Intrinsic::sqrt:
case Intrinsic::fma:
case Intrinsic::fmuladd:
+ return true;
+ // These intrinsics are defined to have the same behavior as libm
+ // functions, and the corresponding libm functions never set errno.
+ case Intrinsic::trunc:
+ case Intrinsic::copysign:
case Intrinsic::fabs:
case Intrinsic::minnum:
case Intrinsic::maxnum:
return true;
- // TODO: some fp intrinsics are marked as having the same error handling
- // as libm. They're safe to speculate when they won't error.
+ // These intrinsics are defined to have the same behavior as libm
+ // functions, which never overflow when operating on the IEEE754 types
+ // that we support, and never set errno otherwise.
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ return true;
// TODO: are convert_{from,to}_fp16 safe?
// TODO: can we list target-specific intrinsics here?
default: break;
@@ -3464,7 +3198,7 @@ bool llvm::mayBeMemoryDependent(const Instruction &I) {
}
/// Return true if we know that the specified value is never null.
-bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
+bool llvm::isKnownNonNull(const Value *V) {
assert(V->getType()->isPointerTy() && "V must be pointer type");
// Alloca never returns null, malloc might.
@@ -3481,7 +3215,7 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
return !GV->hasExternalWeakLinkage() &&
GV->getType()->getAddressSpace() == 0;
- // A Load tagged w/nonnull metadata is never null.
+ // A Load tagged with nonnull metadata is never null.
if (const LoadInst *LI = dyn_cast<LoadInst>(V))
return LI->getMetadata(LLVMContext::MD_nonnull);
@@ -3498,41 +3232,31 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
assert(V->getType()->isPointerTy() && "V must be pointer type");
unsigned NumUsesExplored = 0;
- for (auto U : V->users()) {
+ for (auto *U : V->users()) {
// Avoid massive lists
if (NumUsesExplored >= DomConditionsMaxUses)
break;
NumUsesExplored++;
// Consider only compare instructions uniquely controlling a branch
- const ICmpInst *Cmp = dyn_cast<ICmpInst>(U);
- if (!Cmp)
- continue;
-
- if (DomConditionsSingleCmpUse && !Cmp->hasOneUse())
+ CmpInst::Predicate Pred;
+ if (!match(const_cast<User *>(U),
+ m_c_ICmp(Pred, m_Specific(V), m_Zero())) ||
+ (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE))
continue;
- for (auto *CmpU : Cmp->users()) {
- const BranchInst *BI = dyn_cast<BranchInst>(CmpU);
- if (!BI)
- continue;
-
- assert(BI->isConditional() && "uses a comparison!");
-
- BasicBlock *NonNullSuccessor = nullptr;
- CmpInst::Predicate Pred;
-
- if (match(const_cast<ICmpInst*>(Cmp),
- m_c_ICmp(Pred, m_Specific(V), m_Zero()))) {
- if (Pred == ICmpInst::ICMP_EQ)
- NonNullSuccessor = BI->getSuccessor(1);
- else if (Pred == ICmpInst::ICMP_NE)
- NonNullSuccessor = BI->getSuccessor(0);
- }
+ for (auto *CmpU : U->users()) {
+ if (const BranchInst *BI = dyn_cast<BranchInst>(CmpU)) {
+ assert(BI->isConditional() && "uses a comparison!");
- if (NonNullSuccessor) {
+ BasicBlock *NonNullSuccessor =
+ BI->getSuccessor(Pred == ICmpInst::ICMP_EQ ? 1 : 0);
BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor);
if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent()))
return true;
+ } else if (Pred == ICmpInst::ICMP_NE &&
+ match(CmpU, m_Intrinsic<Intrinsic::experimental_guard>()) &&
+ DT->dominates(cast<Instruction>(CmpU), CtxI)) {
+ return true;
}
}
}
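The tightened matching above still covers the ordinary null-check-then-branch pattern, plus the new case where the compare feeds an llvm.experimental.guard call instead of a branch. In source form the branch case is simply:

  // P is known non-null inside the taken branch; the edge into that block
  // dominates every use in it.
  void store42(int *P) {
    if (P != nullptr)
      *P = 42;
  }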
@@ -3541,8 +3265,8 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
}
bool llvm::isKnownNonNullAt(const Value *V, const Instruction *CtxI,
- const DominatorTree *DT, const TargetLibraryInfo *TLI) {
- if (isKnownNonNull(V, TLI))
+ const DominatorTree *DT) {
+ if (isKnownNonNull(V))
return true;
return CtxI ? ::isKnownNonNullFromDominatingCondition(V, CtxI, DT) : false;
@@ -3671,6 +3395,67 @@ static OverflowResult computeOverflowForSignedAdd(
return OverflowResult::MayOverflow;
}
+bool llvm::isOverflowIntrinsicNoWrap(IntrinsicInst *II, DominatorTree &DT) {
+#ifndef NDEBUG
+ auto IID = II->getIntrinsicID();
+ assert((IID == Intrinsic::sadd_with_overflow ||
+ IID == Intrinsic::uadd_with_overflow ||
+ IID == Intrinsic::ssub_with_overflow ||
+ IID == Intrinsic::usub_with_overflow ||
+ IID == Intrinsic::smul_with_overflow ||
+ IID == Intrinsic::umul_with_overflow) &&
+ "Not an overflow intrinsic!");
+#endif
+
+ SmallVector<BranchInst *, 2> GuardingBranches;
+ SmallVector<ExtractValueInst *, 2> Results;
+
+ for (User *U : II->users()) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
+ assert(EVI->getNumIndices() == 1 && "Obvious from CI's type");
+
+ if (EVI->getIndices()[0] == 0)
+ Results.push_back(EVI);
+ else {
+ assert(EVI->getIndices()[0] == 1 && "Obvious from CI's type");
+
+ for (auto *U : EVI->users())
+ if (auto *B = dyn_cast<BranchInst>(U)) {
+ assert(B->isConditional() && "How else is it using an i1?");
+ GuardingBranches.push_back(B);
+ }
+ }
+ } else {
+ // We are using the aggregate directly in a way we don't want to analyze
+ // here (storing it to a global, say).
+ return false;
+ }
+ }
+
+ auto AllUsesGuardedByBranch = [&](BranchInst *BI) {
+ BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1));
+ if (!NoWrapEdge.isSingleEdge())
+ return false;
+
+ // Check if all users of the add are provably no-wrap.
+ for (auto *Result : Results) {
+ // If the extractvalue itself is not executed on overflow, then we don't
+ // need to check each use separately, since domination is transitive.
+ if (DT.dominates(NoWrapEdge, Result->getParent()))
+ continue;
+
+ for (auto &RU : Result->uses())
+ if (!DT.dominates(NoWrapEdge, RU))
+ return false;
+ }
+
+ return true;
+ };
+
+ return any_of(GuardingBranches, AllUsesGuardedByBranch);
+}
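The pattern being proven is easiest to see in source form, with the GCC/Clang builtin standing in for llvm.sadd.with.overflow: the arithmetic result is consumed only on the branch where the overflow flag came back false, so the add may be treated as non-wrapping there:

  #include <cstdio>

  int main() {
    int A = 1 << 30, B = 1 << 30, Sum;
    if (__builtin_add_overflow(A, B, &Sum)) { // the "guarding branch"
      std::puts("overflow");
    } else {
      std::printf("%d\n", Sum); // every use of Sum is dominated by the no-wrap
                                // edge, which is what the lambda above checks
    }
    return 0;
  }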
+
+
OverflowResult llvm::computeOverflowForSignedAdd(AddOperator *Add,
const DataLayout &DL,
AssumptionCache *AC,
@@ -3689,16 +3474,46 @@ OverflowResult llvm::computeOverflowForSignedAdd(Value *LHS, Value *RHS,
}
bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
- // FIXME: This conservative implementation can be relaxed. E.g. most
- // atomic operations are guaranteed to terminate on most platforms
- // and most functions terminate.
-
- return !I->isAtomic() && // atomics may never succeed on some platforms
- !isa<CallInst>(I) && // could throw and might not terminate
- !isa<InvokeInst>(I) && // might not terminate and could throw to
- // non-successor (see bug 24185 for details).
- !isa<ResumeInst>(I) && // has no successors
- !isa<ReturnInst>(I); // has no successors
+ // A memory operation returns normally if it isn't volatile. A volatile
+ // operation is allowed to trap.
+ //
+ // An atomic operation isn't guaranteed to return in a reasonable amount of
+ // time because it's possible for another thread to interfere with it for an
+ // arbitrary length of time, but programs aren't allowed to rely on that.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ return !LI->isVolatile();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ return !SI->isVolatile();
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+ return !CXI->isVolatile();
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
+ return !RMWI->isVolatile();
+ if (const MemIntrinsic *MII = dyn_cast<MemIntrinsic>(I))
+ return !MII->isVolatile();
+
+ // If there is no successor, then execution can't transfer to it.
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(I))
+ return !CRI->unwindsToCaller();
+ if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I))
+ return !CatchSwitch->unwindsToCaller();
+ if (isa<ResumeInst>(I))
+ return false;
+ if (isa<ReturnInst>(I))
+ return false;
+
+ // Calls can throw, or contain an infinite loop, or kill the process.
+ if (CallSite CS = CallSite(const_cast<Instruction*>(I))) {
+ // Calls which don't write to arbitrary memory are safe.
+ // FIXME: Ignoring infinite loops without any side-effects is too aggressive,
+ // but it's consistent with other passes. See http://llvm.org/PR965 .
+ // FIXME: This isn't aggressive enough; a call which only writes to a
+ // global is guaranteed to return.
+ return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() ||
+ match(I, m_Intrinsic<Intrinsic::assume>());
+ }
+
+ // Other instructions return normally.
+ return true;
}
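Two source-level cases that still have to answer false here, for the reasons given in the comments: a call may simply never return, and a volatile access is allowed to trap:

  void spin() {
    for (;;) { }               // a call to spin() never reaches its successor
  }

  int readDevice(volatile int *Dev) {
    return *Dev;               // volatile load: allowed to trap
  }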
bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
@@ -3775,6 +3590,11 @@ bool llvm::propagatesFullPoison(const Instruction *I) {
return false;
}
+ case Instruction::ICmp:
+ // Comparing poison with any value yields poison. This is why, for
+ // instance, x s< (x +nsw 1) can be folded to true.
+ return true;
+
case Instruction::GetElementPtr:
// A GEP implicitly represents a sequence of additions, subtractions,
// truncations, sign extensions and multiplications. The multiplications
@@ -3827,26 +3647,44 @@ bool llvm::isKnownNotFullPoison(const Instruction *PoisonI) {
// Set of instructions that we have proved will yield poison if PoisonI
// does.
SmallSet<const Value *, 16> YieldsPoison;
+ SmallSet<const BasicBlock *, 4> Visited;
YieldsPoison.insert(PoisonI);
+ Visited.insert(PoisonI->getParent());
- for (BasicBlock::const_iterator I = PoisonI->getIterator(), E = BB->end();
- I != E; ++I) {
- if (&*I != PoisonI) {
- const Value *NotPoison = getGuaranteedNonFullPoisonOp(&*I);
- if (NotPoison != nullptr && YieldsPoison.count(NotPoison)) return true;
- if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
- return false;
+ BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end();
+
+ unsigned Iter = 0;
+ while (Iter++ < MaxDepth) {
+ for (auto &I : make_range(Begin, End)) {
+ if (&I != PoisonI) {
+ const Value *NotPoison = getGuaranteedNonFullPoisonOp(&I);
+ if (NotPoison != nullptr && YieldsPoison.count(NotPoison))
+ return true;
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ return false;
+ }
+
+ // Mark poison that propagates from I through uses of I.
+ if (YieldsPoison.count(&I)) {
+ for (const User *User : I.users()) {
+ const Instruction *UserI = cast<Instruction>(User);
+ if (propagatesFullPoison(UserI))
+ YieldsPoison.insert(User);
+ }
+ }
}
- // Mark poison that propagates from I through uses of I.
- if (YieldsPoison.count(&*I)) {
- for (const User *User : I->users()) {
- const Instruction *UserI = cast<Instruction>(User);
- if (UserI->getParent() == BB && propagatesFullPoison(UserI))
- YieldsPoison.insert(User);
+ if (auto *NextBB = BB->getSingleSuccessor()) {
+ if (Visited.insert(NextBB).second) {
+ BB = NextBB;
+ Begin = BB->getFirstNonPHI()->getIterator();
+ End = BB->end();
+ continue;
}
}
- }
+
+ break;
+ };
return false;
}
@@ -3979,10 +3817,11 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
}
}
-
+
// Y >s C ? ~Y : ~C == ~Y <s ~C ? ~Y : ~C = SMIN(~Y, ~C)
if (const auto *C2 = dyn_cast<ConstantInt>(FalseVal)) {
- if (C1->getType() == C2->getType() && ~C1->getValue() == C2->getValue() &&
+ if (Pred == ICmpInst::ICMP_SGT && C1->getType() == C2->getType() &&
+ ~C1->getValue() == C2->getValue() &&
(match(TrueVal, m_Not(m_Specific(CmpLHS))) ||
match(CmpLHS, m_Not(m_Specific(TrueVal))))) {
LHS = TrueVal;
@@ -4001,12 +3840,11 @@ static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
Instruction::CastOps *CastOp) {
CastInst *CI = dyn_cast<CastInst>(V1);
Constant *C = dyn_cast<Constant>(V2);
- CastInst *CI2 = dyn_cast<CastInst>(V2);
if (!CI)
return nullptr;
*CastOp = CI->getOpcode();
- if (CI2) {
+ if (auto *CI2 = dyn_cast<CastInst>(V2)) {
// If V1 and V2 are both the same cast from the same type, we can look
// through V1.
if (CI2->getOpcode() == CI->getOpcode() &&
@@ -4017,43 +3855,48 @@ static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
return nullptr;
}
- if (isa<SExtInst>(CI) && CmpI->isSigned()) {
- Constant *T = ConstantExpr::getTrunc(C, CI->getSrcTy());
- // This is only valid if the truncated value can be sign-extended
- // back to the original value.
- if (ConstantExpr::getSExt(T, C->getType()) == C)
- return T;
- return nullptr;
- }
+ Constant *CastedTo = nullptr;
+
if (isa<ZExtInst>(CI) && CmpI->isUnsigned())
- return ConstantExpr::getTrunc(C, CI->getSrcTy());
+ CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy());
+
+ if (isa<SExtInst>(CI) && CmpI->isSigned())
+ CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy(), true);
if (isa<TruncInst>(CI))
- return ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned());
+ CastedTo = ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned());
+
+ if (isa<FPTruncInst>(CI))
+ CastedTo = ConstantExpr::getFPExtend(C, CI->getSrcTy(), true);
+
+ if (isa<FPExtInst>(CI))
+ CastedTo = ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true);
if (isa<FPToUIInst>(CI))
- return ConstantExpr::getUIToFP(C, CI->getSrcTy(), true);
+ CastedTo = ConstantExpr::getUIToFP(C, CI->getSrcTy(), true);
if (isa<FPToSIInst>(CI))
- return ConstantExpr::getSIToFP(C, CI->getSrcTy(), true);
+ CastedTo = ConstantExpr::getSIToFP(C, CI->getSrcTy(), true);
if (isa<UIToFPInst>(CI))
- return ConstantExpr::getFPToUI(C, CI->getSrcTy(), true);
+ CastedTo = ConstantExpr::getFPToUI(C, CI->getSrcTy(), true);
if (isa<SIToFPInst>(CI))
- return ConstantExpr::getFPToSI(C, CI->getSrcTy(), true);
+ CastedTo = ConstantExpr::getFPToSI(C, CI->getSrcTy(), true);
- if (isa<FPTruncInst>(CI))
- return ConstantExpr::getFPExtend(C, CI->getSrcTy(), true);
+ if (!CastedTo)
+ return nullptr;
- if (isa<FPExtInst>(CI))
- return ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true);
+ Constant *CastedBack =
+ ConstantExpr::getCast(CI->getOpcode(), CastedTo, C->getType(), true);
+ // Make sure the cast doesn't lose any information.
+ if (CastedBack != C)
+ return nullptr;
- return nullptr;
+ return CastedTo;
}
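The restructured helper now folds the constant to the cast's source type and then verifies that casting it back is lossless, instead of hand-checking validity per opcode. The same round-trip test in plain C++ for a 32-to-8-bit truncation (the widths are chosen only for illustration):

  #include <cstdint>

  // Returns true and sets Narrow if C survives truncate-then-extend unchanged.
  bool truncIsLossless(int32_t C, int8_t &Narrow) {
    Narrow = static_cast<int8_t>(C);             // "CastedTo"
    int32_t Back = static_cast<int32_t>(Narrow); // cast back ("CastedBack")
    return Back == C;                            // lossless only if equal
  }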
-SelectPatternResult llvm::matchSelectPattern(Value *V,
- Value *&LHS, Value *&RHS,
+SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
Instruction::CastOps *CastOp) {
SelectInst *SI = dyn_cast<SelectInst>(V);
if (!SI) return {SPF_UNKNOWN, SPNB_NA, false};
@@ -4172,46 +4015,105 @@ static bool isTruePredicate(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
}
/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred
-/// ALHS ARHS" is true.
-static bool isImpliedCondOperands(CmpInst::Predicate Pred, Value *ALHS,
- Value *ARHS, Value *BLHS, Value *BRHS,
- const DataLayout &DL, unsigned Depth,
- AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
+/// ALHS ARHS" is true. Otherwise, return None.
+static Optional<bool>
+isImpliedCondOperands(CmpInst::Predicate Pred, Value *ALHS, Value *ARHS,
+ Value *BLHS, Value *BRHS, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI, const DominatorTree *DT) {
switch (Pred) {
default:
- return false;
+ return None;
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
- return isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI,
- DT) &&
- isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI,
- DT);
+ if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI,
+ DT) &&
+ isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
+ return true;
+ return None;
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
- return isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI,
- DT) &&
- isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI,
- DT);
+ if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI,
+ DT) &&
+ isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
+ return true;
+ return None;
}
}
-bool llvm::isImpliedCondition(Value *LHS, Value *RHS, const DataLayout &DL,
- unsigned Depth, AssumptionCache *AC,
- const Instruction *CxtI,
- const DominatorTree *DT) {
- assert(LHS->getType() == RHS->getType() && "mismatched type");
+/// Return true if the operands of the two compares match. IsSwappedOps is true
+/// when the operands match, but are swapped.
+static bool isMatchingOps(Value *ALHS, Value *ARHS, Value *BLHS, Value *BRHS,
+ bool &IsSwappedOps) {
+
+ bool IsMatchingOps = (ALHS == BLHS && ARHS == BRHS);
+ IsSwappedOps = (ALHS == BRHS && ARHS == BLHS);
+ return IsMatchingOps || IsSwappedOps;
+}
+
+/// Return true if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS BRHS" is
+/// true. Return false if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS
+/// BRHS" is false. Otherwise, return None if we can't infer anything.
+static Optional<bool> isImpliedCondMatchingOperands(CmpInst::Predicate APred,
+ Value *ALHS, Value *ARHS,
+ CmpInst::Predicate BPred,
+ Value *BLHS, Value *BRHS,
+ bool IsSwappedOps) {
+ // Canonicalize the operands so they're matching.
+ if (IsSwappedOps) {
+ std::swap(BLHS, BRHS);
+ BPred = ICmpInst::getSwappedPredicate(BPred);
+ }
+ if (CmpInst::isImpliedTrueByMatchingCmp(APred, BPred))
+ return true;
+ if (CmpInst::isImpliedFalseByMatchingCmp(APred, BPred))
+ return false;
+
+ return None;
+}
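With identical (or swapped and re-canonicalized) operands the implication reduces to a pure predicate table. A brute-force check over small integers shows both answers the helper can give:

  #include <cassert>

  int main() {
    for (int A = -3; A <= 3; ++A)
      for (int B = -3; B <= 3; ++B)
        if (A > B) {            // dominating condition: icmp sgt A, B
          assert(A >= B);       // isImpliedTrueByMatchingCmp(SGT, SGE)
          assert(!(A < B));     // isImpliedFalseByMatchingCmp(SGT, SLT)
        }
    return 0;
  }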
+
+/// Return true if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS C2" is
+/// true. Return false if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS
+/// C2" is false. Otherwise, return None if we can't infer anything.
+static Optional<bool>
+isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, Value *ALHS,
+ ConstantInt *C1, CmpInst::Predicate BPred,
+ Value *BLHS, ConstantInt *C2) {
+ assert(ALHS == BLHS && "LHS operands must match.");
+ ConstantRange DomCR =
+ ConstantRange::makeExactICmpRegion(APred, C1->getValue());
+ ConstantRange CR =
+ ConstantRange::makeAllowedICmpRegion(BPred, C2->getValue());
+ ConstantRange Intersection = DomCR.intersectWith(CR);
+ ConstantRange Difference = DomCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return false;
+ if (Difference.isEmptySet())
+ return true;
+ return None;
+}
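When only the right-hand sides are constants, the helper reasons with constant ranges: the dominating compare pins the value to a range, and the second compare is implied true if that range lies inside its allowed region, implied false if the two are disjoint. A brute-force check over all 8-bit values shows both outcomes for an unsigned example (the constants are arbitrary):

  #include <cassert>

  int main() {
    bool ImpliesTrue = true, ImpliesFalse = true;
    for (unsigned X = 0; X < 256; ++X) {
      if (X < 5) {                 // dominating: icmp ult X, 5
        ImpliesTrue &= (X < 10);   // [0,5) is contained in [0,10)    -> true
        ImpliesFalse &= !(X > 20); // [0,5) is disjoint from (20,255] -> false
      }
    }
    assert(ImpliesTrue && ImpliesFalse);
    return 0;
  }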
+
+Optional<bool> llvm::isImpliedCondition(Value *LHS, Value *RHS,
+ const DataLayout &DL, bool InvertAPred,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ // A mismatch occurs when we compare a scalar cmp to a vector cmp, for example.
+ if (LHS->getType() != RHS->getType())
+ return None;
+
Type *OpTy = LHS->getType();
assert(OpTy->getScalarType()->isIntegerTy(1));
// LHS ==> RHS by definition
- if (LHS == RHS) return true;
+ if (!InvertAPred && LHS == RHS)
+ return true;
if (OpTy->isVectorTy())
// TODO: extending the code below to handle vectors
- return false;
+ return None;
assert(OpTy->isIntegerTy(1) && "implied by above");
ICmpInst::Predicate APred, BPred;
@@ -4220,11 +4122,37 @@ bool llvm::isImpliedCondition(Value *LHS, Value *RHS, const DataLayout &DL,
if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS))) ||
!match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS))))
- return false;
+ return None;
+
+ if (InvertAPred)
+ APred = CmpInst::getInversePredicate(APred);
+
+ // Can we infer anything when the two compares have matching operands?
+ bool IsSwappedOps;
+ if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, IsSwappedOps)) {
+ if (Optional<bool> Implication = isImpliedCondMatchingOperands(
+ APred, ALHS, ARHS, BPred, BLHS, BRHS, IsSwappedOps))
+ return Implication;
+ // No amount of additional analysis will infer the second condition, so
+ // early exit.
+ return None;
+ }
+
+ // Can we infer anything when the LHS operands match and the RHS operands are
+ // constants (not necessarily matching)?
+ if (ALHS == BLHS && isa<ConstantInt>(ARHS) && isa<ConstantInt>(BRHS)) {
+ if (Optional<bool> Implication = isImpliedCondMatchingImmOperands(
+ APred, ALHS, cast<ConstantInt>(ARHS), BPred, BLHS,
+ cast<ConstantInt>(BRHS)))
+ return Implication;
+ // No amount of additional analysis will infer the second condition, so
+ // early exit.
+ return None;
+ }
if (APred == BPred)
return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC,
CxtI, DT);
- return false;
+ return None;
}
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 4b244ec5e1f6..53e7153a350f 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/PatternMatch.h"
@@ -51,6 +52,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::nearbyint:
case Intrinsic::round:
case Intrinsic::bswap:
+ case Intrinsic::bitreverse:
case Intrinsic::ctpop:
case Intrinsic::pow:
case Intrinsic::fma:
@@ -78,150 +80,18 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
}
}
-/// \brief Check call has a unary float signature
-/// It checks following:
-/// a) call should have a single argument
-/// b) argument type should be floating point type
-/// c) call instruction type and argument type should be same
-/// d) call should only reads memory.
-/// If all these condition is met then return ValidIntrinsicID
-/// else return not_intrinsic.
-Intrinsic::ID
-llvm::checkUnaryFloatSignature(const CallInst &I,
- Intrinsic::ID ValidIntrinsicID) {
- if (I.getNumArgOperands() != 1 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() || !I.onlyReadsMemory())
- return Intrinsic::not_intrinsic;
-
- return ValidIntrinsicID;
-}
-
-/// \brief Check call has a binary float signature
-/// It checks following:
-/// a) call should have 2 arguments.
-/// b) arguments type should be floating point type
-/// c) call instruction type and arguments type should be same
-/// d) call should only reads memory.
-/// If all these condition is met then return ValidIntrinsicID
-/// else return not_intrinsic.
-Intrinsic::ID
-llvm::checkBinaryFloatSignature(const CallInst &I,
- Intrinsic::ID ValidIntrinsicID) {
- if (I.getNumArgOperands() != 2 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() ||
- I.getType() != I.getArgOperand(1)->getType() || !I.onlyReadsMemory())
- return Intrinsic::not_intrinsic;
-
- return ValidIntrinsicID;
-}
-
/// \brief Returns the intrinsic ID for a call.
/// For the input call instruction it finds the mapping intrinsic and returns
/// its ID; if no mapping is found, it returns not_intrinsic.
-Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI,
- const TargetLibraryInfo *TLI) {
- // If we have an intrinsic call, check if it is trivially vectorizable.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- Intrinsic::ID ID = II->getIntrinsicID();
- if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
- ID == Intrinsic::lifetime_end || ID == Intrinsic::assume)
- return ID;
+Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getIntrinsicForCallSite(CI, TLI);
+ if (ID == Intrinsic::not_intrinsic)
return Intrinsic::not_intrinsic;
- }
-
- if (!TLI)
- return Intrinsic::not_intrinsic;
-
- LibFunc::Func Func;
- Function *F = CI->getCalledFunction();
- // We're going to make assumptions on the semantics of the functions, check
- // that the target knows that it's available in this environment and it does
- // not have local linkage.
- if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
- return Intrinsic::not_intrinsic;
-
- // Otherwise check if we have a call to a function that can be turned into a
- // vector intrinsic.
- switch (Func) {
- default:
- break;
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
- return checkUnaryFloatSignature(*CI, Intrinsic::sin);
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
- return checkUnaryFloatSignature(*CI, Intrinsic::cos);
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
- return checkUnaryFloatSignature(*CI, Intrinsic::exp);
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
- return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::logl:
- return checkUnaryFloatSignature(*CI, Intrinsic::log);
- case LibFunc::log10:
- case LibFunc::log10f:
- case LibFunc::log10l:
- return checkUnaryFloatSignature(*CI, Intrinsic::log10);
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
- return checkUnaryFloatSignature(*CI, Intrinsic::log2);
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
- return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
- return checkBinaryFloatSignature(*CI, Intrinsic::minnum);
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
- return checkBinaryFloatSignature(*CI, Intrinsic::maxnum);
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
- return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
- return checkUnaryFloatSignature(*CI, Intrinsic::floor);
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
- return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
- return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
- return checkUnaryFloatSignature(*CI, Intrinsic::rint);
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
- return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
- return checkUnaryFloatSignature(*CI, Intrinsic::round);
- case LibFunc::pow:
- case LibFunc::powf:
- case LibFunc::powl:
- return checkBinaryFloatSignature(*CI, Intrinsic::pow);
- }
+ if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
+ ID == Intrinsic::lifetime_end || ID == Intrinsic::assume)
+ return ID;
return Intrinsic::not_intrinsic;
}
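A hypothetical vectorizer-side use (the header paths are assumptions; the declaration is expected to live next to the other VectorUtils helpers): callers no longer carry their own LibFunc switch, they simply ask whether the call maps to a vectorizable intrinsic:

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  static bool canWidenCall(const CallInst *CI, const TargetLibraryInfo *TLI) {
    return getVectorIntrinsicIDForCall(CI, TLI) != Intrinsic::not_intrinsic;
  }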
@@ -231,8 +101,7 @@ Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI,
unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
const DataLayout &DL = Gep->getModule()->getDataLayout();
unsigned LastOperand = Gep->getNumOperands() - 1;
- unsigned GEPAllocSize = DL.getTypeAllocSize(
- cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+ unsigned GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
// Walk backwards and try to peel off zeros.
while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
@@ -318,8 +187,6 @@ Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
// Strip off the size of access multiplication if we are still analyzing the
// pointer.
if (OrigPtr == Ptr) {
- const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
- DL.getTypeAllocSize(PtrTy->getElementType());
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
return nullptr;
@@ -502,6 +369,7 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
uint64_t V = DB.getDemandedBits(I).getZExtValue();
DBits[Leader] |= V;
+ DBits[I] = V;
// Casts, loads and instructions outside of our range terminate a chain
// successfully.
@@ -552,6 +420,20 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
// Round up to a power of 2
if (!isPowerOf2_64((uint64_t)MinBW))
MinBW = NextPowerOf2(MinBW);
+
+ // We don't modify the types of PHIs. Reductions will already have been
+ // truncated if possible, and inductions' sizes will have been chosen by
+ // indvars.
+ // If we are required to shrink a PHI, abandon this entire equivalence class.
+ bool Abort = false;
+ for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
+ if (isa<PHINode>(*MI) && MinBW < (*MI)->getType()->getScalarSizeInBits()) {
+ Abort = true;
+ break;
+ }
+ if (Abort)
+ continue;
+
for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) {
if (!isa<Instruction>(*MI))
continue;
@@ -565,3 +447,44 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
return MinBWs;
}
+
+/// \returns \p I after propagating metadata from \p VL.
+Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+ for (auto Kind : { LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
+ LLVMContext::MD_nontemporal }) {
+ MDNode *MD = I0->getMetadata(Kind);
+
+ for (int J = 1, E = VL.size(); MD && J != E; ++J) {
+ const Instruction *IJ = cast<Instruction>(VL[J]);
+ MDNode *IMD = IJ->getMetadata(Kind);
+ switch (Kind) {
+ case LLVMContext::MD_tbaa:
+ MD = MDNode::getMostGenericTBAA(MD, IMD);
+ break;
+ case LLVMContext::MD_alias_scope:
+ MD = MDNode::getMostGenericAliasScope(MD, IMD);
+ break;
+ case LLVMContext::MD_noalias:
+ MD = MDNode::intersect(MD, IMD);
+ break;
+ case LLVMContext::MD_fpmath:
+ MD = MDNode::getMostGenericFPMath(MD, IMD);
+ break;
+ case LLVMContext::MD_nontemporal:
+ MD = MDNode::intersect(MD, IMD);
+ break;
+ default:
+ llvm_unreachable("unhandled metadata");
+ }
+ }
+
+ Inst->setMetadata(Kind, MD);
+ }
+
+ return Inst;
+}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 26eca230bb31..507e7e76ecd2 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -513,6 +513,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(hidden);
KEYWORD(protected);
KEYWORD(unnamed_addr);
+ KEYWORD(local_unnamed_addr);
KEYWORD(externally_initialized);
KEYWORD(extern_weak);
KEYWORD(external);
@@ -533,6 +534,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(notail);
KEYWORD(target);
KEYWORD(triple);
+ KEYWORD(source_filename);
KEYWORD(unwind);
KEYWORD(deplibs); // FIXME: Remove in 4.0.
KEYWORD(datalayout);
@@ -559,6 +561,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(addrspace);
KEYWORD(section);
KEYWORD(alias);
+ KEYWORD(ifunc);
KEYWORD(module);
KEYWORD(asm);
KEYWORD(sideeffect);
@@ -579,6 +582,8 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(arm_aapcscc);
KEYWORD(arm_aapcs_vfpcc);
KEYWORD(msp430_intrcc);
+ KEYWORD(avr_intrcc);
+ KEYWORD(avr_signalcc);
KEYWORD(ptx_kernel);
KEYWORD(ptx_device);
KEYWORD(spir_kernel);
@@ -587,6 +592,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(x86_64_sysvcc);
KEYWORD(x86_64_win64cc);
KEYWORD(webkit_jscc);
+ KEYWORD(swiftcc);
KEYWORD(anyregcc);
KEYWORD(preserve_mostcc);
KEYWORD(preserve_allcc);
@@ -595,6 +601,11 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(hhvmcc);
KEYWORD(hhvm_ccc);
KEYWORD(cxx_fast_tlscc);
+ KEYWORD(amdgpu_vs);
+ KEYWORD(amdgpu_gs);
+ KEYWORD(amdgpu_ps);
+ KEYWORD(amdgpu_cs);
+ KEYWORD(amdgpu_kernel);
KEYWORD(cc);
KEYWORD(c);
@@ -602,6 +613,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(attributes);
KEYWORD(alwaysinline);
+ KEYWORD(allocsize);
KEYWORD(argmemonly);
KEYWORD(builtin);
KEYWORD(byval);
@@ -645,7 +657,10 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(sanitize_address);
KEYWORD(sanitize_thread);
KEYWORD(sanitize_memory);
+ KEYWORD(swifterror);
+ KEYWORD(swiftself);
KEYWORD(uwtable);
+ KEYWORD(writeonly);
KEYWORD(zeroext);
KEYWORD(type);
@@ -780,14 +795,19 @@ lltok::Kind LLLexer::LexIdentifier() {
DWKEYWORD(ATE, DwarfAttEncoding);
DWKEYWORD(VIRTUALITY, DwarfVirtuality);
DWKEYWORD(LANG, DwarfLang);
+ DWKEYWORD(CC, DwarfCC);
DWKEYWORD(OP, DwarfOp);
DWKEYWORD(MACINFO, DwarfMacinfo);
#undef DWKEYWORD
-
if (Keyword.startswith("DIFlag")) {
StrVal.assign(Keyword.begin(), Keyword.end());
return lltok::DIFlag;
}
+ if (Keyword == "NoDebug" || Keyword == "FullDebug" ||
+ Keyword == "LineTablesOnly") {
+ StrVal.assign(Keyword.begin(), Keyword.end());
+ return lltok::EmissionKind;
+ }
// Check for [us]0x[0-9A-Fa-f]+, which are hexadecimal constants generated by
// the CFE to avoid forcing it to deal with 64-bit numbers.
@@ -941,7 +961,8 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
}
}
- APFloatVal = APFloat(std::atof(TokStart));
+ APFloatVal = APFloat(APFloat::IEEEdouble,
+ StringRef(TokStart, CurPtr - TokStart));
return lltok::APFloat;
}
@@ -977,6 +998,7 @@ lltok::Kind LLLexer::LexPositive() {
}
}
- APFloatVal = APFloat(std::atof(TokStart));
+ APFloatVal = APFloat(APFloat::IEEEdouble,
+ StringRef(TokStart, CurPtr - TokStart));
return lltok::APFloat;
}
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 3471a2dbd05c..a2fcbf41204d 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -14,19 +14,23 @@
#include "LLParser.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/AsmParser/SlotMapping.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SaveAndRestore.h"
@@ -45,6 +49,11 @@ bool LLParser::Run() {
// Prime the lexer.
Lex.Lex();
+ if (Context.shouldDiscardValueNames())
+ return Error(
+ Lex.getLoc(),
+ "Can't read textual IR with a Context that discards named Values");
+
return ParseTopLevelEntities() ||
ValidateEndOfModule();
}
@@ -62,6 +71,22 @@ bool LLParser::parseStandaloneConstantValue(Constant *&C,
return false;
}
+bool LLParser::parseTypeAtBeginning(Type *&Ty, unsigned &Read,
+ const SlotMapping *Slots) {
+ restoreParsingState(Slots);
+ Lex.Lex();
+
+ Read = 0;
+ SMLoc Start = Lex.getLoc();
+ Ty = nullptr;
+ if (ParseType(Ty))
+ return true;
+ SMLoc End = Lex.getLoc();
+ Read = End.getPointer() - Start.getPointer();
+
+ return false;
+}
+
void LLParser::restoreParsingState(const SlotMapping *Slots) {
if (!Slots)
return;
@@ -78,9 +103,6 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) {
/// ValidateEndOfModule - Do final validity and sanity checks at the end of the
/// module.
bool LLParser::ValidateEndOfModule() {
- for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
- UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
-
// Handle any function attribute group forward references.
for (std::map<Value*, std::vector<unsigned> >::iterator
I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end();
@@ -183,12 +205,28 @@ bool LLParser::ValidateEndOfModule() {
N.second->resolveCycles();
}
+ for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+ UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
// Look for intrinsic functions and CallInst that need to be upgraded
for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove
+ // Some types could be renamed during loading if several modules are
+ // loaded in the same LLVMContext (LTO scenario). In this case we should
+ // remangle intrinsics names as well.
+ for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) {
+ Function *F = &*FI++;
+ if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) {
+ F->replaceAllUsesWith(Remangled.getValue());
+ F->eraseFromParent();
+ }
+ }
+
UpgradeDebugInfo(*M);
+ UpgradeModuleFlags(*M);
+
if (!Slots)
return false;
// Initialize the slot mapping.
@@ -217,6 +255,10 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::kw_define: if (ParseDefine()) return true; break;
case lltok::kw_module: if (ParseModuleAsm()) return true; break;
case lltok::kw_target: if (ParseTargetDefinition()) return true; break;
+ case lltok::kw_source_filename:
+ if (ParseSourceFileName())
+ return true;
+ break;
case lltok::kw_deplibs: if (ParseDepLibs()) return true; break;
case lltok::LocalVarID: if (ParseUnnamedType()) return true; break;
case lltok::LocalVar: if (ParseNamedType()) return true; break;
@@ -225,46 +267,6 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::ComdatVar: if (parseComdat()) return true; break;
case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break;
case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break;
-
- // The Global variable production with no name can have many different
- // optional leading prefixes, the production is:
- // GlobalVar ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
- // OptionalThreadLocal OptionalAddrSpace OptionalUnnamedAddr
- // ('constant'|'global') ...
- case lltok::kw_private: // OptionalLinkage
- case lltok::kw_internal: // OptionalLinkage
- case lltok::kw_weak: // OptionalLinkage
- case lltok::kw_weak_odr: // OptionalLinkage
- case lltok::kw_linkonce: // OptionalLinkage
- case lltok::kw_linkonce_odr: // OptionalLinkage
- case lltok::kw_appending: // OptionalLinkage
- case lltok::kw_common: // OptionalLinkage
- case lltok::kw_extern_weak: // OptionalLinkage
- case lltok::kw_external: // OptionalLinkage
- case lltok::kw_default: // OptionalVisibility
- case lltok::kw_hidden: // OptionalVisibility
- case lltok::kw_protected: // OptionalVisibility
- case lltok::kw_dllimport: // OptionalDLLStorageClass
- case lltok::kw_dllexport: // OptionalDLLStorageClass
- case lltok::kw_thread_local: // OptionalThreadLocal
- case lltok::kw_addrspace: // OptionalAddrSpace
- case lltok::kw_constant: // GlobalType
- case lltok::kw_global: { // GlobalType
- unsigned Linkage, Visibility, DLLStorageClass;
- bool UnnamedAddr;
- GlobalVariable::ThreadLocalMode TLM;
- bool HasLinkage;
- if (ParseOptionalLinkage(Linkage, HasLinkage) ||
- ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseOptionalThreadLocal(TLM) ||
- parseOptionalUnnamedAddr(UnnamedAddr) ||
- ParseGlobal("", SMLoc(), Linkage, HasLinkage, Visibility,
- DLLStorageClass, TLM, UnnamedAddr))
- return true;
- break;
- }
-
case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
case lltok::kw_uselistorder: if (ParseUseListOrder()) return true; break;
case lltok::kw_uselistorder_bb:
@@ -314,6 +316,19 @@ bool LLParser::ParseTargetDefinition() {
}
/// toplevelentity
+/// ::= 'source_filename' '=' STRINGCONSTANT
+bool LLParser::ParseSourceFileName() {
+ assert(Lex.getKind() == lltok::kw_source_filename);
+ std::string Str;
+ Lex.Lex();
+ if (ParseToken(lltok::equal, "expected '=' after source_filename") ||
+ ParseStringConstant(Str))
+ return true;
+ M->setSourceFileName(Str);
+ return false;
+}
+
+/// toplevelentity
/// ::= 'deplibs' '=' '[' ']'
/// ::= 'deplibs' '=' '[' STRINGCONSTANT (',' STRINGCONSTANT)* ']'
/// FIXME: Remove in 4.0. Currently parse, but ignore.
@@ -395,8 +410,21 @@ bool LLParser::ParseDeclare() {
assert(Lex.getKind() == lltok::kw_declare);
Lex.Lex();
+ std::vector<std::pair<unsigned, MDNode *>> MDs;
+ while (Lex.getKind() == lltok::MetadataVar) {
+ unsigned MDK;
+ MDNode *N;
+ if (ParseMetadataAttachment(MDK, N))
+ return true;
+ MDs.push_back({MDK, N});
+ }
+
Function *F;
- return ParseFunctionHeader(F, false);
+ if (ParseFunctionHeader(F, false))
+ return true;
+ for (auto &MD : MDs)
+ F->addMetadata(MD.first, *MD.second);
+ return false;
}
/// toplevelentity
@@ -427,11 +455,22 @@ bool LLParser::ParseGlobalType(bool &IsConstant) {
return false;
}
+bool LLParser::ParseOptionalUnnamedAddr(
+ GlobalVariable::UnnamedAddr &UnnamedAddr) {
+ if (EatIfPresent(lltok::kw_unnamed_addr))
+ UnnamedAddr = GlobalValue::UnnamedAddr::Global;
+ else if (EatIfPresent(lltok::kw_local_unnamed_addr))
+ UnnamedAddr = GlobalValue::UnnamedAddr::Local;
+ else
+ UnnamedAddr = GlobalValue::UnnamedAddr::None;
+ return false;
+}
+
/// ParseUnnamedGlobal:
-/// OptionalVisibility ALIAS ...
+/// OptionalVisibility (ALIAS | IFUNC) ...
/// OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// ... -> global variable
-/// GlobalID '=' OptionalVisibility ALIAS ...
+/// GlobalID '=' OptionalVisibility (ALIAS | IFUNC) ...
/// GlobalID '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// ... -> global variable
bool LLParser::ParseUnnamedGlobal() {
@@ -453,23 +492,21 @@ bool LLParser::ParseUnnamedGlobal() {
bool HasLinkage;
unsigned Linkage, Visibility, DLLStorageClass;
GlobalVariable::ThreadLocalMode TLM;
- bool UnnamedAddr;
- if (ParseOptionalLinkage(Linkage, HasLinkage) ||
- ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseOptionalThreadLocal(TLM) ||
- parseOptionalUnnamedAddr(UnnamedAddr))
+ GlobalVariable::UnnamedAddr UnnamedAddr;
+ if (ParseOptionalLinkage(Linkage, HasLinkage, Visibility, DLLStorageClass) ||
+ ParseOptionalThreadLocal(TLM) || ParseOptionalUnnamedAddr(UnnamedAddr))
return true;
- if (Lex.getKind() != lltok::kw_alias)
+ if (Lex.getKind() != lltok::kw_alias && Lex.getKind() != lltok::kw_ifunc)
return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
DLLStorageClass, TLM, UnnamedAddr);
- return ParseAlias(Name, NameLoc, Linkage, Visibility, DLLStorageClass, TLM,
- UnnamedAddr);
+
+ return parseIndirectSymbol(Name, NameLoc, Linkage, Visibility,
+ DLLStorageClass, TLM, UnnamedAddr);
}
/// ParseNamedGlobal:
-/// GlobalVar '=' OptionalVisibility ALIAS ...
+/// GlobalVar '=' OptionalVisibility (ALIAS | IFUNC) ...
/// GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// ... -> global variable
bool LLParser::ParseNamedGlobal() {
@@ -481,21 +518,18 @@ bool LLParser::ParseNamedGlobal() {
bool HasLinkage;
unsigned Linkage, Visibility, DLLStorageClass;
GlobalVariable::ThreadLocalMode TLM;
- bool UnnamedAddr;
+ GlobalVariable::UnnamedAddr UnnamedAddr;
if (ParseToken(lltok::equal, "expected '=' in global variable") ||
- ParseOptionalLinkage(Linkage, HasLinkage) ||
- ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseOptionalThreadLocal(TLM) ||
- parseOptionalUnnamedAddr(UnnamedAddr))
+ ParseOptionalLinkage(Linkage, HasLinkage, Visibility, DLLStorageClass) ||
+ ParseOptionalThreadLocal(TLM) || ParseOptionalUnnamedAddr(UnnamedAddr))
return true;
- if (Lex.getKind() != lltok::kw_alias)
+ if (Lex.getKind() != lltok::kw_alias && Lex.getKind() != lltok::kw_ifunc)
return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
DLLStorageClass, TLM, UnnamedAddr);
- return ParseAlias(Name, NameLoc, Linkage, Visibility, DLLStorageClass, TLM,
- UnnamedAddr);
+ return parseIndirectSymbol(Name, NameLoc, Linkage, Visibility,
+ DLLStorageClass, TLM, UnnamedAddr);
}
bool LLParser::parseComdat() {
@@ -553,7 +587,6 @@ bool LLParser::parseComdat() {
bool LLParser::ParseMDString(MDString *&Result) {
std::string Str;
if (ParseStringConstant(Str)) return true;
- llvm::UpgradeMDStringConstant(Str);
Result = MDString::get(Context, Str);
return false;
}
@@ -562,6 +595,7 @@ bool LLParser::ParseMDString(MDString *&Result) {
// ::= '!' MDNodeNumber
bool LLParser::ParseMDNodeID(MDNode *&Result) {
// !{ ..., !42, ... }
+ LocTy IDLoc = Lex.getLoc();
unsigned MID = 0;
if (ParseUInt32(MID))
return true;
@@ -574,7 +608,7 @@ bool LLParser::ParseMDNodeID(MDNode *&Result) {
// Otherwise, create MDNode forward reference.
auto &FwdRef = ForwardRefMDNodes[MID];
- FwdRef = std::make_pair(MDTuple::getTemporary(Context, None), Lex.getLoc());
+ FwdRef = std::make_pair(MDTuple::getTemporary(Context, None), IDLoc);
Result = FwdRef.first.get();
NumberedMetadata[MID].reset(Result);
@@ -652,26 +686,32 @@ static bool isValidVisibilityForLinkage(unsigned V, unsigned L) {
(GlobalValue::VisibilityTypes)V == GlobalValue::DefaultVisibility;
}
-/// ParseAlias:
+/// parseIndirectSymbol:
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility
/// OptionalDLLStorageClass OptionalThreadLocal
-/// OptionalUnnamedAddr 'alias' Aliasee
+/// OptionalUnnamedAddr 'alias|ifunc' IndirectSymbol
///
-/// Aliasee
+/// IndirectSymbol
/// ::= TypeAndValue
///
/// Everything through OptionalUnnamedAddr has already been parsed.
///
-bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
- unsigned Visibility, unsigned DLLStorageClass,
- GlobalVariable::ThreadLocalMode TLM,
- bool UnnamedAddr) {
- assert(Lex.getKind() == lltok::kw_alias);
+bool LLParser::parseIndirectSymbol(
+ const std::string &Name, LocTy NameLoc, unsigned L, unsigned Visibility,
+ unsigned DLLStorageClass, GlobalVariable::ThreadLocalMode TLM,
+ GlobalVariable::UnnamedAddr UnnamedAddr) {
+ bool IsAlias;
+ if (Lex.getKind() == lltok::kw_alias)
+ IsAlias = true;
+ else if (Lex.getKind() == lltok::kw_ifunc)
+ IsAlias = false;
+ else
+ llvm_unreachable("Not an alias or ifunc!");
Lex.Lex();
GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
- if(!GlobalAlias::isValidLinkage(Linkage))
+ if(IsAlias && !GlobalAlias::isValidLinkage(Linkage))
return Error(NameLoc, "invalid linkage type for alias");
if (!isValidVisibilityForLinkage(Visibility, L))
@@ -681,7 +721,7 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
Type *Ty;
LocTy ExplicitTypeLoc = Lex.getLoc();
if (ParseType(Ty) ||
- ParseToken(lltok::comma, "expected comma after alias's type"))
+ ParseToken(lltok::comma, "expected comma after alias or ifunc's type"))
return true;
Constant *Aliasee;
@@ -705,14 +745,19 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
Type *AliaseeType = Aliasee->getType();
auto *PTy = dyn_cast<PointerType>(AliaseeType);
if (!PTy)
- return Error(AliaseeLoc, "An alias must have pointer type");
+ return Error(AliaseeLoc, "An alias or ifunc must have pointer type");
unsigned AddrSpace = PTy->getAddressSpace();
- if (Ty != PTy->getElementType())
+ if (IsAlias && Ty != PTy->getElementType())
return Error(
ExplicitTypeLoc,
"explicit pointee type doesn't match operand's pointee type");
+ if (!IsAlias && !PTy->getElementType()->isFunctionTy())
+ return Error(
+ ExplicitTypeLoc,
+ "explicit pointee type should be a function type");
+
GlobalValue *GVal = nullptr;
// See if the alias was forward referenced, if so, prepare to replace the
@@ -732,9 +777,15 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
}
// Okay, create the alias but do not insert it into the module yet.
- std::unique_ptr<GlobalAlias> GA(
- GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage,
- Name, Aliasee, /*Parent*/ nullptr));
+ std::unique_ptr<GlobalIndirectSymbol> GA;
+ if (IsAlias)
+ GA.reset(GlobalAlias::create(Ty, AddrSpace,
+ (GlobalValue::LinkageTypes)Linkage, Name,
+ Aliasee, /*Parent*/ nullptr));
+ else
+ GA.reset(GlobalIFunc::create(Ty, AddrSpace,
+ (GlobalValue::LinkageTypes)Linkage, Name,
+ Aliasee, /*Parent*/ nullptr));
GA->setThreadLocalMode(TLM);
GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
@@ -757,7 +808,10 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
}
// Insert into the module, we know its name won't collide now.
- M->getAliasList().push_back(GA.get());
+ if (IsAlias)
+ M->getAliasList().push_back(cast<GlobalAlias>(GA.get()));
+ else
+ M->getIFuncList().push_back(cast<GlobalIFunc>(GA.get()));
assert(GA->getName() == Name && "Should not be a name conflict!");
// The module owns this now
@@ -781,7 +835,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
unsigned Linkage, bool HasLinkage,
unsigned Visibility, unsigned DLLStorageClass,
GlobalVariable::ThreadLocalMode TLM,
- bool UnnamedAddr) {
+ GlobalVariable::UnnamedAddr UnnamedAddr) {
if (!isValidVisibilityForLinkage(Visibility, Linkage))
return Error(NameLoc,
"symbol with local linkage must have default visibility");
@@ -803,8 +857,9 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
// If the linkage is specified and is external, then no initializer is
// present.
Constant *Init = nullptr;
- if (!HasLinkage || (Linkage != GlobalValue::ExternalWeakLinkage &&
- Linkage != GlobalValue::ExternalLinkage)) {
+ if (!HasLinkage ||
+ !GlobalValue::isValidDeclarationLinkage(
+ (GlobalValue::LinkageTypes)Linkage)) {
if (ParseGlobalValue(Ty, Init))
return true;
}
@@ -872,6 +927,9 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
unsigned Alignment;
if (ParseOptionalAlignment(Alignment)) return true;
GV->setAlignment(Alignment);
+ } else if (Lex.getKind() == lltok::MetadataVar) {
+ if (ParseGlobalObjectMetadataAttachment(*GV))
+ return true;
} else {
Comdat *C;
if (parseOptionalComdat(Name, C))
@@ -990,6 +1048,15 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
B.addStackAlignmentAttr(Alignment);
continue;
}
+ case lltok::kw_allocsize: {
+ unsigned ElemSizeArg;
+ Optional<unsigned> NumElemsArg;
+ // inAttrGrp doesn't matter; we only support allocsize(a[, b])
+ if (parseAllocSizeArguments(ElemSizeArg, NumElemsArg))
+ return true;
+ B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
+ continue;
+ }
case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break;
case lltok::kw_argmemonly: B.addAttribute(Attribute::ArgMemOnly); break;
case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break;
@@ -1031,6 +1098,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_sanitize_memory:
B.addAttribute(Attribute::SanitizeMemory); break;
case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break;
+ case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break;
// Error handling.
case lltok::kw_inreg:
@@ -1050,6 +1118,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_nonnull:
case lltok::kw_returned:
case lltok::kw_sret:
+ case lltok::kw_swifterror:
+ case lltok::kw_swiftself:
HaveError |=
Error(Lex.getLoc(),
"invalid use of parameter-only attribute on a function");
@@ -1323,6 +1393,9 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_returned: B.addAttribute(Attribute::Returned); break;
case lltok::kw_signext: B.addAttribute(Attribute::SExt); break;
case lltok::kw_sret: B.addAttribute(Attribute::StructRet); break;
+ case lltok::kw_swifterror: B.addAttribute(Attribute::SwiftError); break;
+ case lltok::kw_swiftself: B.addAttribute(Attribute::SwiftSelf); break;
+ case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break;
case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break;
case lltok::kw_alignstack:
@@ -1410,6 +1483,8 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_nocapture:
case lltok::kw_returned:
case lltok::kw_sret:
+ case lltok::kw_swifterror:
+ case lltok::kw_swiftself:
HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute");
break;
@@ -1453,6 +1528,37 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
}
}
+static unsigned parseOptionalLinkageAux(lltok::Kind Kind, bool &HasLinkage) {
+ HasLinkage = true;
+ switch (Kind) {
+ default:
+ HasLinkage = false;
+ return GlobalValue::ExternalLinkage;
+ case lltok::kw_private:
+ return GlobalValue::PrivateLinkage;
+ case lltok::kw_internal:
+ return GlobalValue::InternalLinkage;
+ case lltok::kw_weak:
+ return GlobalValue::WeakAnyLinkage;
+ case lltok::kw_weak_odr:
+ return GlobalValue::WeakODRLinkage;
+ case lltok::kw_linkonce:
+ return GlobalValue::LinkOnceAnyLinkage;
+ case lltok::kw_linkonce_odr:
+ return GlobalValue::LinkOnceODRLinkage;
+ case lltok::kw_available_externally:
+ return GlobalValue::AvailableExternallyLinkage;
+ case lltok::kw_appending:
+ return GlobalValue::AppendingLinkage;
+ case lltok::kw_common:
+ return GlobalValue::CommonLinkage;
+ case lltok::kw_extern_weak:
+ return GlobalValue::ExternalWeakLinkage;
+ case lltok::kw_external:
+ return GlobalValue::ExternalLinkage;
+ }
+}
+
/// ParseOptionalLinkage
/// ::= /*empty*/
/// ::= 'private'
@@ -1466,26 +1572,14 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
/// ::= 'common'
/// ::= 'extern_weak'
/// ::= 'external'
-bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
- HasLinkage = false;
- switch (Lex.getKind()) {
- default: Res=GlobalValue::ExternalLinkage; return false;
- case lltok::kw_private: Res = GlobalValue::PrivateLinkage; break;
- case lltok::kw_internal: Res = GlobalValue::InternalLinkage; break;
- case lltok::kw_weak: Res = GlobalValue::WeakAnyLinkage; break;
- case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break;
- case lltok::kw_linkonce: Res = GlobalValue::LinkOnceAnyLinkage; break;
- case lltok::kw_linkonce_odr: Res = GlobalValue::LinkOnceODRLinkage; break;
- case lltok::kw_available_externally:
- Res = GlobalValue::AvailableExternallyLinkage;
- break;
- case lltok::kw_appending: Res = GlobalValue::AppendingLinkage; break;
- case lltok::kw_common: Res = GlobalValue::CommonLinkage; break;
- case lltok::kw_extern_weak: Res = GlobalValue::ExternalWeakLinkage; break;
- case lltok::kw_external: Res = GlobalValue::ExternalLinkage; break;
- }
- Lex.Lex();
- HasLinkage = true;
+bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage,
+ unsigned &Visibility,
+ unsigned &DLLStorageClass) {
+ Res = parseOptionalLinkageAux(Lex.getKind(), HasLinkage);
+ if (HasLinkage)
+ Lex.Lex();
+ ParseOptionalVisibility(Visibility);
+ ParseOptionalDLLStorageClass(DLLStorageClass);
return false;
}
@@ -1495,15 +1589,22 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
/// ::= 'hidden'
/// ::= 'protected'
///
-bool LLParser::ParseOptionalVisibility(unsigned &Res) {
+void LLParser::ParseOptionalVisibility(unsigned &Res) {
switch (Lex.getKind()) {
- default: Res = GlobalValue::DefaultVisibility; return false;
- case lltok::kw_default: Res = GlobalValue::DefaultVisibility; break;
- case lltok::kw_hidden: Res = GlobalValue::HiddenVisibility; break;
- case lltok::kw_protected: Res = GlobalValue::ProtectedVisibility; break;
+ default:
+ Res = GlobalValue::DefaultVisibility;
+ return;
+ case lltok::kw_default:
+ Res = GlobalValue::DefaultVisibility;
+ break;
+ case lltok::kw_hidden:
+ Res = GlobalValue::HiddenVisibility;
+ break;
+ case lltok::kw_protected:
+ Res = GlobalValue::ProtectedVisibility;
+ break;
}
Lex.Lex();
- return false;
}
/// ParseOptionalDLLStorageClass
@@ -1511,14 +1612,19 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= 'dllimport'
/// ::= 'dllexport'
///
-bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
+void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
switch (Lex.getKind()) {
- default: Res = GlobalValue::DefaultStorageClass; return false;
- case lltok::kw_dllimport: Res = GlobalValue::DLLImportStorageClass; break;
- case lltok::kw_dllexport: Res = GlobalValue::DLLExportStorageClass; break;
+ default:
+ Res = GlobalValue::DefaultStorageClass;
+ return;
+ case lltok::kw_dllimport:
+ Res = GlobalValue::DLLImportStorageClass;
+ break;
+ case lltok::kw_dllexport:
+ Res = GlobalValue::DLLExportStorageClass;
+ break;
}
Lex.Lex();
- return false;
}
/// ParseOptionalCallingConv
@@ -1535,6 +1641,8 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'arm_aapcscc'
/// ::= 'arm_aapcs_vfpcc'
/// ::= 'msp430_intrcc'
+/// ::= 'avr_intrcc'
+/// ::= 'avr_signalcc'
/// ::= 'ptx_kernel'
/// ::= 'ptx_device'
/// ::= 'spir_func'
@@ -1546,10 +1654,18 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'preserve_mostcc'
/// ::= 'preserve_allcc'
/// ::= 'ghccc'
+/// ::= 'swiftcc'
/// ::= 'x86_intrcc'
/// ::= 'hhvmcc'
/// ::= 'hhvm_ccc'
/// ::= 'cxx_fast_tlscc'
+/// ::= 'amdgpu_vs'
+/// ::= 'amdgpu_tcs'
+/// ::= 'amdgpu_tes'
+/// ::= 'amdgpu_gs'
+/// ::= 'amdgpu_ps'
+/// ::= 'amdgpu_cs'
+/// ::= 'amdgpu_kernel'
/// ::= 'cc' UINT
///
bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
@@ -1566,6 +1682,8 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_arm_aapcscc: CC = CallingConv::ARM_AAPCS; break;
case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
case lltok::kw_msp430_intrcc: CC = CallingConv::MSP430_INTR; break;
+ case lltok::kw_avr_intrcc: CC = CallingConv::AVR_INTR; break;
+ case lltok::kw_avr_signalcc: CC = CallingConv::AVR_SIGNAL; break;
case lltok::kw_ptx_kernel: CC = CallingConv::PTX_Kernel; break;
case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break;
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
@@ -1578,10 +1696,16 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break;
case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break;
case lltok::kw_ghccc: CC = CallingConv::GHC; break;
+ case lltok::kw_swiftcc: CC = CallingConv::Swift; break;
case lltok::kw_x86_intrcc: CC = CallingConv::X86_INTR; break;
case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break;
case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break;
case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
+ case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break;
+ case lltok::kw_amdgpu_gs: CC = CallingConv::AMDGPU_GS; break;
+ case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break;
+ case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break;
+ case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
case lltok::kw_cc: {
Lex.Lex();
return ParseUInt32(CC);
@@ -1625,17 +1749,24 @@ bool LLParser::ParseInstructionMetadata(Instruction &Inst) {
return false;
}
+/// ParseGlobalObjectMetadataAttachment
+/// ::= !dbg !57
+bool LLParser::ParseGlobalObjectMetadataAttachment(GlobalObject &GO) {
+ unsigned MDK;
+ MDNode *N;
+ if (ParseMetadataAttachment(MDK, N))
+ return true;
+
+ GO.addMetadata(MDK, *N);
+ return false;
+}
+
/// ParseOptionalFunctionMetadata
/// ::= (!dbg !57)*
bool LLParser::ParseOptionalFunctionMetadata(Function &F) {
- while (Lex.getKind() == lltok::MetadataVar) {
- unsigned MDK;
- MDNode *N;
- if (ParseMetadataAttachment(MDK, N))
+ while (Lex.getKind() == lltok::MetadataVar)
+ if (ParseGlobalObjectMetadataAttachment(F))
return true;
-
- F.setMetadata(MDK, N);
- }
return false;
}
@@ -1707,6 +1838,35 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
return false;
}
+bool LLParser::parseAllocSizeArguments(unsigned &BaseSizeArg,
+ Optional<unsigned> &HowManyArg) {
+ Lex.Lex();
+
+ auto StartParen = Lex.getLoc();
+ if (!EatIfPresent(lltok::lparen))
+ return Error(StartParen, "expected '('");
+
+ if (ParseUInt32(BaseSizeArg))
+ return true;
+
+ if (EatIfPresent(lltok::comma)) {
+ auto HowManyAt = Lex.getLoc();
+ unsigned HowMany;
+ if (ParseUInt32(HowMany))
+ return true;
+ if (HowMany == BaseSizeArg)
+ return Error(HowManyAt,
+ "'allocsize' indices can't refer to the same parameter");
+ HowManyArg = HowMany;
+ } else
+ HowManyArg = None;
+
+ auto EndParen = Lex.getLoc();
+ if (!EatIfPresent(lltok::rparen))
+ return Error(EndParen, "expected ')'");
+ return false;
+}
+
/// ParseScopeAndOrdering
/// if isAtomic: ::= 'singlethread'? AtomicOrdering
/// else: ::=
@@ -1731,12 +1891,16 @@ bool LLParser::ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
bool LLParser::ParseOrdering(AtomicOrdering &Ordering) {
switch (Lex.getKind()) {
default: return TokError("Expected ordering on atomic instruction");
- case lltok::kw_unordered: Ordering = Unordered; break;
- case lltok::kw_monotonic: Ordering = Monotonic; break;
- case lltok::kw_acquire: Ordering = Acquire; break;
- case lltok::kw_release: Ordering = Release; break;
- case lltok::kw_acq_rel: Ordering = AcquireRelease; break;
- case lltok::kw_seq_cst: Ordering = SequentiallyConsistent; break;
+ case lltok::kw_unordered: Ordering = AtomicOrdering::Unordered; break;
+ case lltok::kw_monotonic: Ordering = AtomicOrdering::Monotonic; break;
+ // Not specified yet:
+ // case lltok::kw_consume: Ordering = AtomicOrdering::Consume; break;
+ case lltok::kw_acquire: Ordering = AtomicOrdering::Acquire; break;
+ case lltok::kw_release: Ordering = AtomicOrdering::Release; break;
+ case lltok::kw_acq_rel: Ordering = AtomicOrdering::AcquireRelease; break;
+ case lltok::kw_seq_cst:
+ Ordering = AtomicOrdering::SequentiallyConsistent;
+ break;
}
Lex.Lex();
return false;
@@ -3215,6 +3379,12 @@ struct DwarfVirtualityField : public MDUnsignedField {
struct DwarfLangField : public MDUnsignedField {
DwarfLangField() : MDUnsignedField(0, dwarf::DW_LANG_hi_user) {}
};
+struct DwarfCCField : public MDUnsignedField {
+ DwarfCCField() : MDUnsignedField(0, dwarf::DW_CC_hi_user) {}
+};
+struct EmissionKindField : public MDUnsignedField {
+ EmissionKindField() : MDUnsignedField(0, DICompileUnit::LastEmissionKind) {}
+};
struct DIFlagField : public MDUnsignedField {
DIFlagField() : MDUnsignedField(0, UINT32_MAX) {}
@@ -3327,7 +3497,7 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
return TokError("expected DWARF virtuality code");
unsigned Virtuality = dwarf::getVirtuality(Lex.getStrVal());
- if (!Virtuality)
+ if (Virtuality == dwarf::DW_VIRTUALITY_invalid)
return TokError("invalid DWARF virtuality code" + Twine(" '") +
Lex.getStrVal() + "'");
assert(Virtuality <= Result.Max && "Expected valid DWARF virtuality code");
@@ -3355,6 +3525,42 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfLangField &Result) {
}
template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) {
+ if (Lex.getKind() == lltok::APSInt)
+ return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+ if (Lex.getKind() != lltok::DwarfCC)
+ return TokError("expected DWARF calling convention");
+
+ unsigned CC = dwarf::getCallingConvention(Lex.getStrVal());
+ if (!CC)
+    return TokError("invalid DWARF calling convention" + Twine(" '") +
+                    Lex.getStrVal() + "'");
+ assert(CC <= Result.Max && "Expected valid DWARF calling convention");
+ Result.assign(CC);
+ Lex.Lex();
+ return false;
+}
+
+template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, EmissionKindField &Result) {
+ if (Lex.getKind() == lltok::APSInt)
+ return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+ if (Lex.getKind() != lltok::EmissionKind)
+ return TokError("expected emission kind");
+
+ auto Kind = DICompileUnit::getEmissionKind(Lex.getStrVal());
+ if (!Kind)
+ return TokError("invalid emission kind" + Twine(" '") + Lex.getStrVal() +
+ "'");
+ assert(*Kind <= Result.Max && "Expected valid emission kind");
+ Result.assign(*Kind);
+ Lex.Lex();
+ return false;
+}
+
+template <>
bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
DwarfAttEncodingField &Result) {
if (Lex.getKind() == lltok::APSInt)
@@ -3692,6 +3898,19 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ // If this has an identifier try to build an ODR type.
+ if (identifier.Val)
+ if (auto *CT = DICompositeType::buildODRType(
+ Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val,
+ scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val,
+ elements.Val, runtimeLang.Val, vtableHolder.Val,
+ templateParams.Val)) {
+ Result = CT;
+ return false;
+ }
+
+ // Create a new node, and save it in the context if it belongs in the type
+ // map.
Result = GET_OR_DISTINCT(
DICompositeType,
(Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val,
@@ -3703,11 +3922,13 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
bool LLParser::ParseDISubroutineType(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
OPTIONAL(flags, DIFlagField, ); \
+ OPTIONAL(cc, DwarfCCField, ); \
REQUIRED(types, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DISubroutineType, (Context, flags.Val, types.Val));
+ Result = GET_OR_DISTINCT(DISubroutineType,
+ (Context, flags.Val, cc.Val, types.Val));
return false;
}
@@ -3727,8 +3948,8 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) {
/// ParseDICompileUnit:
/// ::= !DICompileUnit(language: DW_LANG_C99, file: !0, producer: "clang",
/// isOptimized: true, flags: "-O2", runtimeVersion: 1,
-/// splitDebugFilename: "abc.debug", emissionKind: 1,
-/// enums: !1, retainedTypes: !2, subprograms: !3,
+/// splitDebugFilename: "abc.debug",
+/// emissionKind: FullDebug, enums: !1, retainedTypes: !2,
/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd)
bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
if (!IsDistinct)
@@ -3742,10 +3963,9 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
OPTIONAL(flags, MDStringField, ); \
OPTIONAL(runtimeVersion, MDUnsignedField, (0, UINT32_MAX)); \
OPTIONAL(splitDebugFilename, MDStringField, ); \
- OPTIONAL(emissionKind, MDUnsignedField, (0, UINT32_MAX)); \
+ OPTIONAL(emissionKind, EmissionKindField, ); \
OPTIONAL(enums, MDField, ); \
OPTIONAL(retainedTypes, MDField, ); \
- OPTIONAL(subprograms, MDField, ); \
OPTIONAL(globals, MDField, ); \
OPTIONAL(imports, MDField, ); \
OPTIONAL(macros, MDField, ); \
@@ -3756,8 +3976,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
Result = DICompileUnit::getDistinct(
Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
- retainedTypes.Val, subprograms.Val, globals.Val, imports.Val, macros.Val,
- dwoId.Val);
+ retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val);
return false;
}
@@ -3766,7 +3985,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
/// file: !1, line: 7, type: !2, isLocal: false,
/// isDefinition: true, scopeLine: 8, containingType: !3,
/// virtuality: DW_VIRTUALTIY_pure_virtual,
-/// virtualIndex: 10, flags: 11,
+/// virtualIndex: 10, thisAdjustment: 4, flags: 11,
/// isOptimized: false, templateParams: !4, declaration: !5,
/// variables: !6)
bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
@@ -3784,8 +4003,10 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
OPTIONAL(containingType, MDField, ); \
OPTIONAL(virtuality, DwarfVirtualityField, ); \
OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX)); \
+ OPTIONAL(thisAdjustment, MDSignedField, (0, INT32_MIN, INT32_MAX)); \
OPTIONAL(flags, DIFlagField, ); \
OPTIONAL(isOptimized, MDBoolField, ); \
+ OPTIONAL(unit, MDField, ); \
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(declaration, MDField, ); \
OPTIONAL(variables, MDField, );
@@ -3798,11 +4019,12 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
"missing 'distinct', required for !DISubprogram when 'isDefinition'");
Result = GET_OR_DISTINCT(
- DISubprogram,
- (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val,
- type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val,
- containingType.Val, virtuality.Val, virtualIndex.Val, flags.Val,
- isOptimized.Val, templateParams.Val, declaration.Val, variables.Val));
+ DISubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val,
+ line.Val, type.Val, isLocal.Val, isDefinition.Val,
+ scopeLine.Val, containingType.Val, virtuality.Val,
+ virtualIndex.Val, thisAdjustment.Val, flags.Val,
+ isOptimized.Val, unit.Val, templateParams.Val,
+ declaration.Val, variables.Val));
return false;
}
@@ -4332,13 +4554,11 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
unsigned DLLStorageClass;
AttrBuilder RetAttrs;
unsigned CC;
+ bool HasLinkage;
Type *RetType = nullptr;
LocTy RetTypeLoc = Lex.getLoc();
- if (ParseOptionalLinkage(Linkage) ||
- ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseOptionalCallingConv(CC) ||
- ParseOptionalReturnAttrs(RetAttrs) ||
+ if (ParseOptionalLinkage(Linkage, HasLinkage, Visibility, DLLStorageClass) ||
+ ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) ||
ParseType(RetType, RetTypeLoc, true /*void allowed*/))
return true;
@@ -4400,7 +4620,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::string Section;
unsigned Alignment;
std::string GC;
- bool UnnamedAddr;
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
LocTy UnnamedAddrLoc;
Constant *Prefix = nullptr;
Constant *Prologue = nullptr;
@@ -4408,8 +4628,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
Comdat *C;
if (ParseArgumentList(ArgList, isVarArg) ||
- ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
- &UnnamedAddrLoc) ||
+ ParseOptionalUnnamedAddr(UnnamedAddr) ||
ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false,
BuiltinLoc) ||
(EatIfPresent(lltok::kw_section) &&
@@ -4521,7 +4740,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
Fn->setSection(Section);
Fn->setComdat(C);
Fn->setPersonalityFn(PersonalityFn);
- if (!GC.empty()) Fn->setGC(GC.c_str());
+ if (!GC.empty()) Fn->setGC(GC);
Fn->setPrefixData(Prefix);
Fn->setPrologueData(Prologue);
ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
@@ -5735,7 +5954,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
//===----------------------------------------------------------------------===//
/// ParseAlloc
-/// ::= 'alloca' 'inalloca'? Type (',' TypeAndValue)? (',' 'align' i32)?
+/// ::= 'alloca' 'inalloca'? 'swifterror'? Type (',' TypeAndValue)?
+/// (',' 'align' i32)?
int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
Value *Size = nullptr;
LocTy SizeLoc, TyLoc;
@@ -5743,6 +5963,7 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
Type *Ty = nullptr;
bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
+ bool IsSwiftError = EatIfPresent(lltok::kw_swifterror);
if (ParseType(Ty, TyLoc)) return true;
@@ -5767,6 +5988,7 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
AllocaInst *AI = new AllocaInst(Ty, Size, Alignment);
AI->setUsedWithInAlloca(IsInAlloca);
+ AI->setSwiftError(IsSwiftError);
Inst = AI;
return AteExtraComma ? InstExtraComma : InstNormal;
}
@@ -5780,7 +6002,7 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
unsigned Alignment = 0;
bool AteExtraComma = false;
bool isAtomic = false;
- AtomicOrdering Ordering = NotAtomic;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
SynchronizationScope Scope = CrossThread;
if (Lex.getKind() == lltok::kw_atomic) {
@@ -5807,7 +6029,8 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
return Error(Loc, "load operand must be a pointer to a first class type");
if (isAtomic && !Alignment)
return Error(Loc, "atomic load must have explicit non-zero alignment");
- if (Ordering == Release || Ordering == AcquireRelease)
+ if (Ordering == AtomicOrdering::Release ||
+ Ordering == AtomicOrdering::AcquireRelease)
return Error(Loc, "atomic load cannot use Release ordering");
if (Ty != cast<PointerType>(Val->getType())->getElementType())
@@ -5828,7 +6051,7 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
unsigned Alignment = 0;
bool AteExtraComma = false;
bool isAtomic = false;
- AtomicOrdering Ordering = NotAtomic;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
SynchronizationScope Scope = CrossThread;
if (Lex.getKind() == lltok::kw_atomic) {
@@ -5857,7 +6080,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
return Error(Loc, "stored value and pointer type do not match");
if (isAtomic && !Alignment)
return Error(Loc, "atomic store must have explicit non-zero alignment");
- if (Ordering == Acquire || Ordering == AcquireRelease)
+ if (Ordering == AtomicOrdering::Acquire ||
+ Ordering == AtomicOrdering::AcquireRelease)
return Error(Loc, "atomic store cannot use Acquire ordering");
Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, Scope);
@@ -5870,8 +6094,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc;
bool AteExtraComma = false;
- AtomicOrdering SuccessOrdering = NotAtomic;
- AtomicOrdering FailureOrdering = NotAtomic;
+ AtomicOrdering SuccessOrdering = AtomicOrdering::NotAtomic;
+ AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
SynchronizationScope Scope = CrossThread;
bool isVolatile = false;
bool isWeak = false;
@@ -5891,25 +6115,24 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
ParseOrdering(FailureOrdering))
return true;
- if (SuccessOrdering == Unordered || FailureOrdering == Unordered)
+ if (SuccessOrdering == AtomicOrdering::Unordered ||
+ FailureOrdering == AtomicOrdering::Unordered)
return TokError("cmpxchg cannot be unordered");
- if (SuccessOrdering < FailureOrdering)
- return TokError("cmpxchg must be at least as ordered on success as failure");
- if (FailureOrdering == Release || FailureOrdering == AcquireRelease)
- return TokError("cmpxchg failure ordering cannot include release semantics");
+ if (isStrongerThan(FailureOrdering, SuccessOrdering))
+ return TokError("cmpxchg failure argument shall be no stronger than the "
+ "success argument");
+ if (FailureOrdering == AtomicOrdering::Release ||
+ FailureOrdering == AtomicOrdering::AcquireRelease)
+ return TokError(
+ "cmpxchg failure ordering cannot include release semantics");
if (!Ptr->getType()->isPointerTy())
return Error(PtrLoc, "cmpxchg operand must be a pointer");
if (cast<PointerType>(Ptr->getType())->getElementType() != Cmp->getType())
return Error(CmpLoc, "compare value and pointer type do not match");
if (cast<PointerType>(Ptr->getType())->getElementType() != New->getType())
return Error(NewLoc, "new value and pointer type do not match");
- if (!New->getType()->isIntegerTy())
- return Error(NewLoc, "cmpxchg operand must be an integer");
- unsigned Size = New->getType()->getPrimitiveSizeInBits();
- if (Size < 8 || (Size & (Size - 1)))
- return Error(NewLoc, "cmpxchg operand must be power-of-two byte-sized"
- " integer");
-
+ if (!New->getType()->isFirstClassType())
+ return Error(NewLoc, "cmpxchg operand must be a first class value");
AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
Ptr, Cmp, New, SuccessOrdering, FailureOrdering, Scope);
CXI->setVolatile(isVolatile);
@@ -5924,7 +6147,7 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
bool AteExtraComma = false;
- AtomicOrdering Ordering = NotAtomic;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
SynchronizationScope Scope = CrossThread;
bool isVolatile = false;
AtomicRMWInst::BinOp Operation;
@@ -5954,7 +6177,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
return true;
- if (Ordering == Unordered)
+ if (Ordering == AtomicOrdering::Unordered)
return TokError("atomicrmw cannot be unordered");
if (!Ptr->getType()->isPointerTy())
return Error(PtrLoc, "atomicrmw operand must be a pointer");
@@ -5977,14 +6200,14 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
/// ParseFence
/// ::= 'fence' 'singlethread'? AtomicOrdering
int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) {
- AtomicOrdering Ordering = NotAtomic;
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
SynchronizationScope Scope = CrossThread;
if (ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
return true;
- if (Ordering == Unordered)
+ if (Ordering == AtomicOrdering::Unordered)
return TokError("fence cannot be unordered");
- if (Ordering == Monotonic)
+ if (Ordering == AtomicOrdering::Monotonic)
return TokError("fence cannot be monotonic");
Inst = new FenceInst(Context, Ordering, Scope);
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index f61a5e5e3a38..479ff96bc8a3 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_ASMPARSER_LLPARSER_H
#include "LLLexer.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"
@@ -148,6 +148,9 @@ namespace llvm {
bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots);
+ bool parseTypeAtBeginning(Type *&Ty, unsigned &Read,
+ const SlotMapping *Slots);
+
LLVMContext &getContext() { return Context; }
private:
@@ -223,18 +226,14 @@ namespace llvm {
bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
- bool parseOptionalUnnamedAddr(bool &UnnamedAddr) {
- return ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr);
- }
+ bool ParseOptionalUnnamedAddr(GlobalVariable::UnnamedAddr &UnnamedAddr);
bool ParseOptionalAddrSpace(unsigned &AddrSpace);
bool ParseOptionalParamAttrs(AttrBuilder &B);
bool ParseOptionalReturnAttrs(AttrBuilder &B);
- bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage);
- bool ParseOptionalLinkage(unsigned &Linkage) {
- bool HasLinkage; return ParseOptionalLinkage(Linkage, HasLinkage);
- }
- bool ParseOptionalVisibility(unsigned &Visibility);
- bool ParseOptionalDLLStorageClass(unsigned &DLLStorageClass);
+ bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage,
+ unsigned &Visibility, unsigned &DLLStorageClass);
+ void ParseOptionalVisibility(unsigned &Visibility);
+ void ParseOptionalDLLStorageClass(unsigned &DLLStorageClass);
bool ParseOptionalCallingConv(unsigned &CC);
bool ParseOptionalAlignment(unsigned &Alignment);
bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
@@ -244,7 +243,10 @@ namespace llvm {
bool ParseOptionalStackAlignment(unsigned &Alignment);
bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
- bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,bool &AteExtraComma);
+ bool parseAllocSizeArguments(unsigned &ElemSizeArg,
+ Optional<unsigned> &HowManyArg);
+ bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,
+ bool &AteExtraComma);
bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) {
bool AteExtraComma;
if (ParseIndexList(Indices, AteExtraComma)) return true;
@@ -258,6 +260,7 @@ namespace llvm {
bool ValidateEndOfModule();
bool ParseTargetDefinition();
bool ParseModuleAsm();
+ bool ParseSourceFileName();
bool ParseDepLibs(); // FIXME: Remove in 4.0.
bool ParseUnnamedType();
bool ParseNamedType();
@@ -270,10 +273,13 @@ namespace llvm {
bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage,
bool HasLinkage, unsigned Visibility,
unsigned DLLStorageClass,
- GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
- bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Linkage,
- unsigned Visibility, unsigned DLLStorageClass,
- GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
+ GlobalVariable::ThreadLocalMode TLM,
+ GlobalVariable::UnnamedAddr UnnamedAddr);
+ bool parseIndirectSymbol(const std::string &Name, LocTy Loc,
+ unsigned Linkage, unsigned Visibility,
+ unsigned DLLStorageClass,
+ GlobalVariable::ThreadLocalMode TLM,
+ GlobalVariable::UnnamedAddr UnnamedAddr);
bool parseComdat();
bool ParseStandaloneMetadata();
bool ParseNamedMetadata();
@@ -417,6 +423,7 @@ namespace llvm {
bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &MDs);
bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD);
bool ParseInstructionMetadata(Instruction &Inst);
+ bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO);
bool ParseOptionalFunctionMetadata(Function &F);
template <class FieldTy>
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 29a7f16d3c20..37998e879503 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -16,218 +16,348 @@
namespace llvm {
namespace lltok {
- enum Kind {
- // Markers
- Eof, Error,
-
- // Tokens with no info.
- dotdotdot, // ...
- equal, comma, // = ,
- star, // *
- lsquare, rsquare, // [ ]
- lbrace, rbrace, // { }
- less, greater, // < >
- lparen, rparen, // ( )
- exclaim, // !
- bar, // |
-
- kw_x,
- kw_true, kw_false,
- kw_declare, kw_define,
- kw_global, kw_constant,
-
- kw_private,
- kw_internal,
- kw_linkonce, kw_linkonce_odr,
- kw_weak, // Used as a linkage, and a modifier for "cmpxchg".
- kw_weak_odr, kw_appending,
- kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
- kw_default, kw_hidden, kw_protected,
- kw_unnamed_addr,
- kw_externally_initialized,
- kw_extern_weak,
- kw_external, kw_thread_local,
- kw_localdynamic, kw_initialexec, kw_localexec,
- kw_zeroinitializer,
- kw_undef, kw_null, kw_none,
- kw_to,
- kw_caller,
- kw_within,
- kw_from,
- kw_tail,
- kw_musttail,
- kw_notail,
- kw_target,
- kw_triple,
- kw_unwind,
- kw_deplibs, // FIXME: Remove in 4.0
- kw_datalayout,
- kw_volatile,
- kw_atomic,
- kw_unordered, kw_monotonic, kw_acquire, kw_release, kw_acq_rel, kw_seq_cst,
- kw_singlethread,
- kw_nnan,
- kw_ninf,
- kw_nsz,
- kw_arcp,
- kw_fast,
- kw_nuw,
- kw_nsw,
- kw_exact,
- kw_inbounds,
- kw_align,
- kw_addrspace,
- kw_section,
- kw_alias,
- kw_module,
- kw_asm,
- kw_sideeffect,
- kw_alignstack,
- kw_inteldialect,
- kw_gc,
- kw_prefix,
- kw_prologue,
- kw_c,
-
- kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
- kw_intel_ocl_bicc,
- kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_vectorcallcc,
- kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
- kw_msp430_intrcc,
- kw_ptx_kernel, kw_ptx_device,
- kw_spir_kernel, kw_spir_func,
- kw_x86_64_sysvcc, kw_x86_64_win64cc,
- kw_webkit_jscc, kw_anyregcc,
- kw_preserve_mostcc, kw_preserve_allcc,
- kw_ghccc,
- kw_x86_intrcc,
- kw_hhvmcc, kw_hhvm_ccc,
- kw_cxx_fast_tlscc,
-
- // Attributes:
- kw_attributes,
- kw_alwaysinline,
- kw_argmemonly,
- kw_sanitize_address,
- kw_builtin,
- kw_byval,
- kw_inalloca,
- kw_cold,
- kw_convergent,
- kw_dereferenceable,
- kw_dereferenceable_or_null,
- kw_inaccessiblememonly,
- kw_inaccessiblemem_or_argmemonly,
- kw_inlinehint,
- kw_inreg,
- kw_jumptable,
- kw_minsize,
- kw_naked,
- kw_nest,
- kw_noalias,
- kw_nobuiltin,
- kw_nocapture,
- kw_noduplicate,
- kw_noimplicitfloat,
- kw_noinline,
- kw_norecurse,
- kw_nonlazybind,
- kw_nonnull,
- kw_noredzone,
- kw_noreturn,
- kw_nounwind,
- kw_optnone,
- kw_optsize,
- kw_readnone,
- kw_readonly,
- kw_returned,
- kw_returns_twice,
- kw_signext,
- kw_ssp,
- kw_sspreq,
- kw_sspstrong,
- kw_safestack,
- kw_sret,
- kw_sanitize_thread,
- kw_sanitize_memory,
- kw_uwtable,
- kw_zeroext,
-
- kw_type,
- kw_opaque,
-
- kw_comdat,
-
- // Comdat types
- kw_any,
- kw_exactmatch,
- kw_largest,
- kw_noduplicates,
- kw_samesize,
-
- kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule,
- kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno,
- kw_ueq, kw_une,
-
- // atomicrmw operations that aren't also instruction keywords.
- kw_xchg, kw_nand, kw_max, kw_min, kw_umax, kw_umin,
-
- // Instruction Opcodes (Opcode in UIntVal).
- kw_add, kw_fadd, kw_sub, kw_fsub, kw_mul, kw_fmul,
- kw_udiv, kw_sdiv, kw_fdiv,
- kw_urem, kw_srem, kw_frem, kw_shl, kw_lshr, kw_ashr,
- kw_and, kw_or, kw_xor, kw_icmp, kw_fcmp,
-
- kw_phi, kw_call,
- kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
- kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
- kw_addrspacecast,
- kw_select, kw_va_arg,
-
- kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
-
- kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_resume,
- kw_unreachable, kw_cleanupret, kw_catchswitch, kw_catchret, kw_catchpad,
- kw_cleanuppad,
-
- kw_alloca, kw_load, kw_store, kw_fence, kw_cmpxchg, kw_atomicrmw,
- kw_getelementptr,
-
- kw_extractelement, kw_insertelement, kw_shufflevector,
- kw_extractvalue, kw_insertvalue, kw_blockaddress,
-
- // Metadata types.
- kw_distinct,
-
- // Use-list order directives.
- kw_uselistorder, kw_uselistorder_bb,
-
- // Unsigned Valued tokens (UIntVal).
- GlobalID, // @42
- LocalVarID, // %42
- AttrGrpID, // #42
-
- // String valued tokens (StrVal).
- LabelStr, // foo:
- GlobalVar, // @foo @"foo"
- ComdatVar, // $foo
- LocalVar, // %foo %"foo"
- MetadataVar, // !foo
- StringConstant, // "foo"
- DwarfTag, // DW_TAG_foo
- DwarfAttEncoding, // DW_ATE_foo
- DwarfVirtuality, // DW_VIRTUALITY_foo
- DwarfLang, // DW_LANG_foo
- DwarfOp, // DW_OP_foo
- DIFlag, // DIFlagFoo
- DwarfMacinfo, // DW_MACINFO_foo
-
- // Type valued tokens (TyVal).
- Type,
-
- APFloat, // APFloatVal
- APSInt // APSInt
- };
+enum Kind {
+ // Markers
+ Eof,
+ Error,
+
+ // Tokens with no info.
+ dotdotdot, // ...
+ equal,
+ comma, // = ,
+ star, // *
+ lsquare,
+ rsquare, // [ ]
+ lbrace,
+ rbrace, // { }
+ less,
+ greater, // < >
+ lparen,
+ rparen, // ( )
+ exclaim, // !
+ bar, // |
+
+ kw_x,
+ kw_true,
+ kw_false,
+ kw_declare,
+ kw_define,
+ kw_global,
+ kw_constant,
+
+ kw_private,
+ kw_internal,
+ kw_linkonce,
+ kw_linkonce_odr,
+ kw_weak, // Used as a linkage, and a modifier for "cmpxchg".
+ kw_weak_odr,
+ kw_appending,
+ kw_dllimport,
+ kw_dllexport,
+ kw_common,
+ kw_available_externally,
+ kw_default,
+ kw_hidden,
+ kw_protected,
+ kw_unnamed_addr,
+ kw_local_unnamed_addr,
+ kw_externally_initialized,
+ kw_extern_weak,
+ kw_external,
+ kw_thread_local,
+ kw_localdynamic,
+ kw_initialexec,
+ kw_localexec,
+ kw_zeroinitializer,
+ kw_undef,
+ kw_null,
+ kw_none,
+ kw_to,
+ kw_caller,
+ kw_within,
+ kw_from,
+ kw_tail,
+ kw_musttail,
+ kw_notail,
+ kw_target,
+ kw_triple,
+ kw_source_filename,
+ kw_unwind,
+ kw_deplibs, // FIXME: Remove in 4.0
+ kw_datalayout,
+ kw_volatile,
+ kw_atomic,
+ kw_unordered,
+ kw_monotonic,
+ kw_acquire,
+ kw_release,
+ kw_acq_rel,
+ kw_seq_cst,
+ kw_singlethread,
+ kw_nnan,
+ kw_ninf,
+ kw_nsz,
+ kw_arcp,
+ kw_fast,
+ kw_nuw,
+ kw_nsw,
+ kw_exact,
+ kw_inbounds,
+ kw_align,
+ kw_addrspace,
+ kw_section,
+ kw_alias,
+ kw_ifunc,
+ kw_module,
+ kw_asm,
+ kw_sideeffect,
+ kw_alignstack,
+ kw_inteldialect,
+ kw_gc,
+ kw_prefix,
+ kw_prologue,
+ kw_c,
+
+ kw_cc,
+ kw_ccc,
+ kw_fastcc,
+ kw_coldcc,
+ kw_intel_ocl_bicc,
+ kw_x86_stdcallcc,
+ kw_x86_fastcallcc,
+ kw_x86_thiscallcc,
+ kw_x86_vectorcallcc,
+ kw_arm_apcscc,
+ kw_arm_aapcscc,
+ kw_arm_aapcs_vfpcc,
+ kw_msp430_intrcc,
+ kw_avr_intrcc,
+ kw_avr_signalcc,
+ kw_ptx_kernel,
+ kw_ptx_device,
+ kw_spir_kernel,
+ kw_spir_func,
+ kw_x86_64_sysvcc,
+ kw_x86_64_win64cc,
+ kw_webkit_jscc,
+ kw_anyregcc,
+ kw_swiftcc,
+ kw_preserve_mostcc,
+ kw_preserve_allcc,
+ kw_ghccc,
+ kw_x86_intrcc,
+ kw_hhvmcc,
+ kw_hhvm_ccc,
+ kw_cxx_fast_tlscc,
+ kw_amdgpu_vs,
+ kw_amdgpu_gs,
+ kw_amdgpu_ps,
+ kw_amdgpu_cs,
+ kw_amdgpu_kernel,
+
+ // Attributes:
+ kw_attributes,
+ kw_allocsize,
+ kw_alwaysinline,
+ kw_argmemonly,
+ kw_sanitize_address,
+ kw_builtin,
+ kw_byval,
+ kw_inalloca,
+ kw_cold,
+ kw_convergent,
+ kw_dereferenceable,
+ kw_dereferenceable_or_null,
+ kw_inaccessiblememonly,
+ kw_inaccessiblemem_or_argmemonly,
+ kw_inlinehint,
+ kw_inreg,
+ kw_jumptable,
+ kw_minsize,
+ kw_naked,
+ kw_nest,
+ kw_noalias,
+ kw_nobuiltin,
+ kw_nocapture,
+ kw_noduplicate,
+ kw_noimplicitfloat,
+ kw_noinline,
+ kw_norecurse,
+ kw_nonlazybind,
+ kw_nonnull,
+ kw_noredzone,
+ kw_noreturn,
+ kw_nounwind,
+ kw_optnone,
+ kw_optsize,
+ kw_readnone,
+ kw_readonly,
+ kw_returned,
+ kw_returns_twice,
+ kw_signext,
+ kw_ssp,
+ kw_sspreq,
+ kw_sspstrong,
+ kw_safestack,
+ kw_sret,
+ kw_sanitize_thread,
+ kw_sanitize_memory,
+ kw_swifterror,
+ kw_swiftself,
+ kw_uwtable,
+ kw_writeonly,
+ kw_zeroext,
+
+ kw_type,
+ kw_opaque,
+
+ kw_comdat,
+
+ // Comdat types
+ kw_any,
+ kw_exactmatch,
+ kw_largest,
+ kw_noduplicates,
+ kw_samesize,
+
+ kw_eq,
+ kw_ne,
+ kw_slt,
+ kw_sgt,
+ kw_sle,
+ kw_sge,
+ kw_ult,
+ kw_ugt,
+ kw_ule,
+ kw_uge,
+ kw_oeq,
+ kw_one,
+ kw_olt,
+ kw_ogt,
+ kw_ole,
+ kw_oge,
+ kw_ord,
+ kw_uno,
+ kw_ueq,
+ kw_une,
+
+ // atomicrmw operations that aren't also instruction keywords.
+ kw_xchg,
+ kw_nand,
+ kw_max,
+ kw_min,
+ kw_umax,
+ kw_umin,
+
+ // Instruction Opcodes (Opcode in UIntVal).
+ kw_add,
+ kw_fadd,
+ kw_sub,
+ kw_fsub,
+ kw_mul,
+ kw_fmul,
+ kw_udiv,
+ kw_sdiv,
+ kw_fdiv,
+ kw_urem,
+ kw_srem,
+ kw_frem,
+ kw_shl,
+ kw_lshr,
+ kw_ashr,
+ kw_and,
+ kw_or,
+ kw_xor,
+ kw_icmp,
+ kw_fcmp,
+
+ kw_phi,
+ kw_call,
+ kw_trunc,
+ kw_zext,
+ kw_sext,
+ kw_fptrunc,
+ kw_fpext,
+ kw_uitofp,
+ kw_sitofp,
+ kw_fptoui,
+ kw_fptosi,
+ kw_inttoptr,
+ kw_ptrtoint,
+ kw_bitcast,
+ kw_addrspacecast,
+ kw_select,
+ kw_va_arg,
+
+ kw_landingpad,
+ kw_personality,
+ kw_cleanup,
+ kw_catch,
+ kw_filter,
+
+ kw_ret,
+ kw_br,
+ kw_switch,
+ kw_indirectbr,
+ kw_invoke,
+ kw_resume,
+ kw_unreachable,
+ kw_cleanupret,
+ kw_catchswitch,
+ kw_catchret,
+ kw_catchpad,
+ kw_cleanuppad,
+
+ kw_alloca,
+ kw_load,
+ kw_store,
+ kw_fence,
+ kw_cmpxchg,
+ kw_atomicrmw,
+ kw_getelementptr,
+
+ kw_extractelement,
+ kw_insertelement,
+ kw_shufflevector,
+ kw_extractvalue,
+ kw_insertvalue,
+ kw_blockaddress,
+
+ // Metadata types.
+ kw_distinct,
+
+ // Use-list order directives.
+ kw_uselistorder,
+ kw_uselistorder_bb,
+
+ // Unsigned Valued tokens (UIntVal).
+ GlobalID, // @42
+ LocalVarID, // %42
+ AttrGrpID, // #42
+
+ // String valued tokens (StrVal).
+ LabelStr, // foo:
+ GlobalVar, // @foo @"foo"
+ ComdatVar, // $foo
+ LocalVar, // %foo %"foo"
+ MetadataVar, // !foo
+ StringConstant, // "foo"
+ DwarfTag, // DW_TAG_foo
+ DwarfAttEncoding, // DW_ATE_foo
+ DwarfVirtuality, // DW_VIRTUALITY_foo
+ DwarfLang, // DW_LANG_foo
+ DwarfCC, // DW_CC_foo
+ EmissionKind, // lineTablesOnly
+ DwarfOp, // DW_OP_foo
+ DIFlag, // DIFlagFoo
+ DwarfMacinfo, // DW_MACINFO_foo
+
+ // Type valued tokens (TyVal).
+ Type,
+
+ APFloat, // APFloatVal
+ APSInt // APSInt
+};
} // end namespace lltok
} // end namespace llvm
diff --git a/lib/AsmParser/Makefile b/lib/AsmParser/Makefile
deleted file mode 100644
index 995bb0e130e2..000000000000
--- a/lib/AsmParser/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/AsmParser/Makefile ------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME := LLVMAsmParser
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index 4e55e62ecf5c..bee07ad9e0a5 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp
@@ -78,3 +78,32 @@ Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err,
return nullptr;
return C;
}
+
+Type *llvm::parseType(StringRef Asm, SMDiagnostic &Err, const Module &M,
+ const SlotMapping *Slots) {
+ unsigned Read;
+ Type *Ty = parseTypeAtBeginning(Asm, Read, Err, M, Slots);
+ if (!Ty)
+ return nullptr;
+ if (Read != Asm.size()) {
+ SourceMgr SM;
+ std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm);
+ SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
+ Err = SM.GetMessage(SMLoc::getFromPointer(Asm.begin() + Read),
+ SourceMgr::DK_Error, "expected end of string");
+ return nullptr;
+ }
+ return Ty;
+}
+Type *llvm::parseTypeAtBeginning(StringRef Asm, unsigned &Read,
+ SMDiagnostic &Err, const Module &M,
+ const SlotMapping *Slots) {
+ SourceMgr SM;
+ std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm);
+ SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
+ Type *Ty;
+ if (LLParser(Asm, SM, Err, const_cast<Module *>(&M))
+ .parseTypeAtBeginning(Ty, Read, Slots))
+ return nullptr;
+ return Ty;
+}
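A minimal usage sketch for the parseType() entry point added above (illustrative only; the module name, type string, and headers are assumptions, not part of the patch):

  // Assumes llvm/AsmParser/Parser.h, llvm/IR/Module.h, llvm/Support/SourceMgr.h,
  // and llvm/Support/raw_ostream.h.
  LLVMContext Ctx;
  Module M("parse-type-example", Ctx);
  SMDiagnostic Err;
  if (Type *Ty = llvm::parseType("{ i32, i8* }", Err, M, /*Slots=*/nullptr))
    Ty->print(errs());                        // parsed against M's type context
  else
    Err.print("parse-type-example", errs());  // e.g. reports "expected end of string"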
diff --git a/lib/AsmParser/module.modulemap b/lib/AsmParser/module.modulemap
deleted file mode 100644
index cc300060b3f5..000000000000
--- a/lib/AsmParser/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module AsmParser { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/Bitcode/Makefile b/lib/Bitcode/Makefile
deleted file mode 100644
index 2d6b5ad1fe88..000000000000
--- a/lib/Bitcode/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/Bitcode/Makefile --------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-PARALLEL_DIRS = Reader Writer
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp
index 385c18a40006..9ac3cb95d086 100644
--- a/lib/Bitcode/Reader/BitReader.cpp
+++ b/lib/Bitcode/Reader/BitReader.cpp
@@ -25,14 +25,13 @@ using namespace llvm;
Optionally returns a human-readable error message via OutMessage. */
LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutModule,
char **OutMessage) {
- return LLVMParseBitcodeInContext(wrap(&getGlobalContext()), MemBuf, OutModule,
+ return LLVMParseBitcodeInContext(LLVMGetGlobalContext(), MemBuf, OutModule,
OutMessage);
}
LLVMBool LLVMParseBitcode2(LLVMMemoryBufferRef MemBuf,
LLVMModuleRef *OutModule) {
- return LLVMParseBitcodeInContext2(wrap(&getGlobalContext()), MemBuf,
- OutModule);
+ return LLVMParseBitcodeInContext2(LLVMGetGlobalContext(), MemBuf, OutModule);
}
static void diagnosticHandler(const DiagnosticInfo &DI, void *C) {
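For context, a hedged sketch of driving the C API touched above from client code (the file name and error handling are illustrative, not part of this change):

  // Assumes llvm-c/BitReader.h, llvm-c/Core.h, and <cstdio>.
  char *Msg = nullptr;
  LLVMMemoryBufferRef Buf;
  LLVMModuleRef Mod;
  if (LLVMCreateMemoryBufferWithContentsOfFile("input.bc", &Buf, &Msg)) {
    std::fprintf(stderr, "cannot open input.bc: %s\n", Msg);
    LLVMDisposeMessage(Msg);
  } else {
    if (LLVMParseBitcode2(Buf, &Mod) == 0) // zero means success; parses into the global context
      LLVMDisposeModule(Mod);
    LLVMDisposeMemoryBuffer(Buf);
  }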
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 2ad4b32e3157..73a30c61ecaa 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7,14 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitstreamReader.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -25,18 +26,27 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Operator.h"
-#include "llvm/IR/FunctionInfo.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataStream.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <deque>
+#include <utility>
+
using namespace llvm;
+static cl::opt<bool> PrintSummaryGUIDs(
+ "print-summary-global-ids", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Print the global id for each value when reading the module summary"));
+
namespace {
enum {
SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
@@ -77,7 +87,7 @@ public:
}
Value *back() const { return ValuePtrs.back(); }
- void pop_back() { ValuePtrs.pop_back(); }
+ void pop_back() { ValuePtrs.pop_back(); }
bool empty() const { return ValuePtrs.empty(); }
void shrinkTo(unsigned N) {
assert(N <= size() && "Invalid shrinkTo request!");
@@ -99,7 +109,20 @@ class BitcodeReaderMetadataList {
bool AnyFwdRefs;
unsigned MinFwdRef;
unsigned MaxFwdRef;
- std::vector<TrackingMDRef> MetadataPtrs;
+
+ /// Array of metadata references.
+ ///
+ /// Don't use std::vector here. Some versions of libc++ copy (instead of
+ /// move) on resize, and TrackingMDRef is very expensive to copy.
+ SmallVector<TrackingMDRef, 1> MetadataPtrs;
+
+ /// Structures for resolving old type refs.
+ struct {
+ SmallDenseMap<MDString *, TempMDTuple, 1> Unknown;
+ SmallDenseMap<MDString *, DICompositeType *, 1> Final;
+ SmallDenseMap<MDString *, DICompositeType *, 1> FwdDecls;
+ SmallVector<std::pair<TrackingMDRef, TempMDTuple>, 1> Arrays;
+ } OldTypeRefs;
LLVMContext &Context;
public:
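The MetadataPtrs comment above can be illustrated with a small, self-contained C++ sketch (generic code, not tied to TrackingMDRef): a copyable type whose move constructor is not noexcept gets copied, not moved, when a std::vector grows.

  #include <cstdio>
  #include <vector>

  struct Probe {
    Probe() = default;
    Probe(const Probe &) { std::puts("copy"); }
    Probe(Probe &&) { std::puts("move"); } // not noexcept, so vector growth copies
  };

  int main() {
    std::vector<Probe> V(4);
    V.resize(64); // prints "copy" for each existing element on typical implementations
  }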
@@ -120,14 +143,44 @@ public:
return MetadataPtrs[i];
}
+ Metadata *lookup(unsigned I) const {
+ if (I < MetadataPtrs.size())
+ return MetadataPtrs[I];
+ return nullptr;
+ }
+
void shrinkTo(unsigned N) {
assert(N <= size() && "Invalid shrinkTo request!");
+ assert(!AnyFwdRefs && "Unexpected forward refs");
MetadataPtrs.resize(N);
}
- Metadata *getValueFwdRef(unsigned Idx);
+ /// Return the given metadata, creating a replaceable forward reference if
+ /// necessary.
+ Metadata *getMetadataFwdRef(unsigned Idx);
+
+ /// Return the given metadata only if it is fully resolved.
+ ///
+ /// Gives the same result as \a lookup(), unless \a MDNode::isResolved()
+ /// would give \c false.
+ Metadata *getMetadataIfResolved(unsigned Idx);
+
+ MDNode *getMDNodeFwdRefOrNull(unsigned Idx);
void assignValue(Metadata *MD, unsigned Idx);
void tryToResolveCycles();
+ bool hasFwdRefs() const { return AnyFwdRefs; }
+
+ /// Upgrade a type that had an MDString reference.
+ void addTypeRef(MDString &UUID, DICompositeType &CT);
+
+ /// Upgrade a type that had an MDString reference.
+ Metadata *upgradeTypeRef(Metadata *MaybeUUID);
+
+ /// Upgrade a type ref array that may have MDString references.
+ Metadata *upgradeTypeRefArray(Metadata *MaybeTuple);
+
+private:
+ Metadata *resolveTypeRefArray(Metadata *MaybeTuple);
};
class BitcodeReader : public GVMaterializer {
@@ -144,11 +197,6 @@ class BitcodeReader : public GVMaterializer {
uint64_t VSTOffset = 0;
// Contains an arbitrary and optional string identifying the bitcode producer
std::string ProducerIdentification;
- // Number of module level metadata records specified by the
- // MODULE_CODE_METADATA_VALUES record.
- unsigned NumModuleMDs = 0;
- // Support older bitcode without the MODULE_CODE_METADATA_VALUES record.
- bool SeenModuleValuesRecord = false;
std::vector<Type*> TypeList;
BitcodeReaderValueList ValueList;
@@ -157,13 +205,15 @@ class BitcodeReader : public GVMaterializer {
SmallVector<Instruction *, 64> InstructionList;
std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
- std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+ std::vector<std::pair<GlobalIndirectSymbol*, unsigned> > IndirectSymbolInits;
std::vector<std::pair<Function*, unsigned> > FunctionPrefixes;
std::vector<std::pair<Function*, unsigned> > FunctionPrologues;
std::vector<std::pair<Function*, unsigned> > FunctionPersonalityFns;
SmallVector<Instruction*, 64> InstsWithTBAATag;
+ bool HasSeenOldLoopTags = false;
+
/// The set of attributes by index. Index zero in the file is for null, and
/// is thus not represented here. As such all indices are off by one.
std::vector<AttributeSet> MAttributes;
@@ -181,8 +231,10 @@ class BitcodeReader : public GVMaterializer {
// When intrinsic functions are encountered which require upgrading they are
// stored here with their replacement function.
- typedef DenseMap<Function*, Function*> UpgradedIntrinsicMap;
- UpgradedIntrinsicMap UpgradedIntrinsics;
+ typedef DenseMap<Function*, Function*> UpdatedIntrinsicMap;
+ UpdatedIntrinsicMap UpgradedIntrinsics;
+ // Intrinsics which were remangled because their types were renamed
+ UpdatedIntrinsicMap RemangledIntrinsics;
// Map the bitcode's custom MDKind ID to the Module's MDKind ID.
DenseMap<unsigned, unsigned> MDKindMap;
@@ -232,7 +284,6 @@ class BitcodeReader : public GVMaterializer {
public:
std::error_code error(BitcodeError E, const Twine &Message);
- std::error_code error(BitcodeError E);
std::error_code error(const Twine &Message);
BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context);
@@ -262,6 +313,10 @@ public:
/// Cheap mechanism to just extract the identification block out of bitcode.
ErrorOr<std::string> parseIdentificationBlock();
+ /// Peek at the module content and return true if any ObjC category or class
+ /// is found.
+ ErrorOr<bool> hasObjCCategory();
+
static uint64_t decodeSignRotatedValue(uint64_t V);
/// Materialize any deferred Metadata block.
@@ -269,14 +324,6 @@ public:
void setStripDebugInfo() override;
- /// Save the mapping between the metadata values and the corresponding
- /// value id that were recorded in the MetadataList during parsing. If
- /// OnlyTempMD is true, then only record those entries that are still
- /// temporary metadata. This interface is used when metadata linking is
- /// performed as a postpass, such as during function importing.
- void saveMetadataList(DenseMap<const Metadata *, unsigned> &MetadataToIDs,
- bool OnlyTempMD) override;
-
private:
/// Parse the "IDENTIFICATION_BLOCK_ID" block, populate the
// ProducerIdentification data member, and do some basic enforcement on the
@@ -294,7 +341,7 @@ private:
return ValueList.getValueFwdRef(ID, Ty);
}
Metadata *getFnMetadataByID(unsigned ID) {
- return MetadataList.getValueFwdRef(ID);
+ return MetadataList.getMetadataFwdRef(ID);
}
BasicBlock *getBasicBlock(unsigned ID) const {
if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID
@@ -395,12 +442,19 @@ private:
std::error_code rememberAndSkipMetadata();
std::error_code parseFunctionBody(Function *F);
std::error_code globalCleanup();
- std::error_code resolveGlobalAndAliasInits();
+ std::error_code resolveGlobalAndIndirectSymbolInits();
std::error_code parseMetadata(bool ModuleLevel = false);
+ std::error_code parseMetadataStrings(ArrayRef<uint64_t> Record,
+ StringRef Blob,
+ unsigned &NextMetadataNo);
std::error_code parseMetadataKinds();
std::error_code parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);
+ std::error_code
+ parseGlobalObjectAttachment(GlobalObject &GO,
+ ArrayRef<uint64_t> Record);
std::error_code parseMetadataAttachment(Function &F);
ErrorOr<std::string> parseModuleTriple();
+ ErrorOr<bool> hasObjCCategoryInModule();
std::error_code parseUseLists();
std::error_code initStream(std::unique_ptr<DataStreamer> Streamer);
std::error_code initStreamFromBuffer();
@@ -412,92 +466,86 @@ private:
/// Class to manage reading and parsing function summary index bitcode
/// files/sections.
-class FunctionIndexBitcodeReader {
+class ModuleSummaryIndexBitcodeReader {
DiagnosticHandlerFunction DiagnosticHandler;
- /// Eventually points to the function index built during parsing.
- FunctionInfoIndex *TheIndex = nullptr;
+ /// Eventually points to the module index built during parsing.
+ ModuleSummaryIndex *TheIndex = nullptr;
std::unique_ptr<MemoryBuffer> Buffer;
std::unique_ptr<BitstreamReader> StreamFile;
BitstreamCursor Stream;
- /// \brief Used to indicate whether we are doing lazy parsing of summary data.
- ///
- /// If false, the summary section is fully parsed into the index during
- /// the initial parse. Otherwise, if true, the caller is expected to
- /// invoke \a readFunctionSummary for each summary needed, and the summary
- /// section is thus parsed lazily.
- bool IsLazy = false;
-
/// Used to indicate whether caller only wants to check for the presence
- /// of the function summary bitcode section. All blocks are skipped,
- /// but the SeenFuncSummary boolean is set.
- bool CheckFuncSummaryPresenceOnly = false;
+ /// of the global value summary bitcode section. All blocks are skipped,
+ /// but the SeenGlobalValSummary boolean is set.
+ bool CheckGlobalValSummaryPresenceOnly = false;
- /// Indicates whether we have encountered a function summary section
- /// yet during parsing, used when checking if file contains function
+ /// Indicates whether we have encountered a global value summary section
+ /// yet during parsing, used when checking if file contains global value
/// summary section.
- bool SeenFuncSummary = false;
+ bool SeenGlobalValSummary = false;
- /// \brief Map populated during function summary section parsing, and
- /// consumed during ValueSymbolTable parsing.
- ///
- /// Used to correlate summary records with VST entries. For the per-module
- /// index this maps the ValueID to the parsed function summary, and
- /// for the combined index this maps the summary record's bitcode
- /// offset to the function summary (since in the combined index the
- /// VST records do not hold value IDs but rather hold the function
- /// summary record offset).
- DenseMap<uint64_t, std::unique_ptr<FunctionSummary>> SummaryMap;
+ /// Indicates whether we have already parsed the VST, used for error checking.
+ bool SeenValueSymbolTable = false;
+
+ /// Set to the offset of the VST recorded in the MODULE_CODE_VSTOFFSET record.
+ /// Used to enable on-demand parsing of the VST.
+ uint64_t VSTOffset = 0;
+
+ // Map to save the ValueId-to-GUID association that was recorded in the
+ // ValueSymbolTable. It is used after the VST is parsed to convert call
+ // graph edges read from the function summary from referencing callees by
+ // their ValueId to referencing them by GUID instead, which is how they are
+ // recorded in the summary index being built.
+ // We save a second GUID which is the same as the first one but computed
+ // ignoring the linkage, i.e. for values without local linkage the two are identical.
+ DenseMap<unsigned, std::pair<GlobalValue::GUID, GlobalValue::GUID>>
+ ValueIdToCallGraphGUIDMap;
/// Map populated during module path string table parsing, from the
/// module ID to a string reference owned by the index's module
- /// path string table, used to correlate with combined index function
+ /// path string table, used to correlate with combined index
/// summary records.
DenseMap<uint64_t, StringRef> ModuleIdMap;
+ /// Original source file name recorded in a bitcode record.
+ std::string SourceFileName;
+
public:
- std::error_code error(BitcodeError E, const Twine &Message);
- std::error_code error(BitcodeError E);
std::error_code error(const Twine &Message);
- FunctionIndexBitcodeReader(MemoryBuffer *Buffer,
- DiagnosticHandlerFunction DiagnosticHandler,
- bool IsLazy = false,
- bool CheckFuncSummaryPresenceOnly = false);
- FunctionIndexBitcodeReader(DiagnosticHandlerFunction DiagnosticHandler,
- bool IsLazy = false,
- bool CheckFuncSummaryPresenceOnly = false);
- ~FunctionIndexBitcodeReader() { freeState(); }
+ ModuleSummaryIndexBitcodeReader(
+ MemoryBuffer *Buffer, DiagnosticHandlerFunction DiagnosticHandler,
+ bool CheckGlobalValSummaryPresenceOnly = false);
+ ~ModuleSummaryIndexBitcodeReader() { freeState(); }
void freeState();
void releaseBuffer();
- /// Check if the parser has encountered a function summary section.
- bool foundFuncSummary() { return SeenFuncSummary; }
+ /// Check if the parser has encountered a summary section.
+ bool foundGlobalValSummary() { return SeenGlobalValSummary; }
/// \brief Main interface to parsing a bitcode buffer.
/// \returns true if an error occurred.
std::error_code parseSummaryIndexInto(std::unique_ptr<DataStreamer> Streamer,
- FunctionInfoIndex *I);
-
- /// \brief Interface for parsing a function summary lazily.
- std::error_code parseFunctionSummary(std::unique_ptr<DataStreamer> Streamer,
- FunctionInfoIndex *I,
- size_t FunctionSummaryOffset);
+ ModuleSummaryIndex *I);
private:
std::error_code parseModule();
- std::error_code parseValueSymbolTable();
+ std::error_code parseValueSymbolTable(
+ uint64_t Offset,
+ DenseMap<unsigned, GlobalValue::LinkageTypes> &ValueIdToLinkageMap);
std::error_code parseEntireSummary();
std::error_code parseModuleStringTable();
std::error_code initStream(std::unique_ptr<DataStreamer> Streamer);
std::error_code initStreamFromBuffer();
std::error_code initLazyStream(std::unique_ptr<DataStreamer> Streamer);
+ std::pair<GlobalValue::GUID, GlobalValue::GUID>
+ getGUIDFromValueId(unsigned ValueId);
};
-} // namespace
+} // end anonymous namespace
BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC,
DiagnosticSeverity Severity,
@@ -506,28 +554,19 @@ BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC,
void BitcodeDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
-static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler,
+static std::error_code error(const DiagnosticHandlerFunction &DiagnosticHandler,
std::error_code EC, const Twine &Message) {
BitcodeDiagnosticInfo DI(EC, DS_Error, Message);
DiagnosticHandler(DI);
return EC;
}
-static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler,
- std::error_code EC) {
- return error(DiagnosticHandler, EC, EC.message());
-}
-
static std::error_code error(LLVMContext &Context, std::error_code EC,
const Twine &Message) {
return error([&](const DiagnosticInfo &DI) { Context.diagnose(DI); }, EC,
Message);
}
-static std::error_code error(LLVMContext &Context, std::error_code EC) {
- return error(Context, EC, EC.message());
-}
-
static std::error_code error(LLVMContext &Context, const Twine &Message) {
return error(Context, make_error_code(BitcodeError::CorruptedBitcode),
Message);
@@ -552,10 +591,6 @@ std::error_code BitcodeReader::error(const Twine &Message) {
Message);
}
-std::error_code BitcodeReader::error(BitcodeError E) {
- return ::error(Context, make_error_code(E));
-}
-
BitcodeReader::BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context)
: Context(Context), Buffer(Buffer), ValueList(Context),
MetadataList(Context) {}
@@ -685,6 +720,18 @@ static GlobalValue::LinkageTypes getDecodedLinkage(unsigned Val) {
}
}
+// Decode the flags for GlobalValue in the summary
+static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
+ uint64_t Version) {
+ // Summaries were not emitted before LLVM 3.9, so we don't need to upgrade the
+ // Linkage the way getDecodedLinkage() above does. Any future change to the linkage
+ // enum or to getDecodedLinkage() will need to be taken into account here as well.
+ auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits
+ RawFlags = RawFlags >> 4;
+ auto HasSection = RawFlags & 0x1; // bool
+ return GlobalValueSummary::GVFlags(Linkage, HasSection);
+}
+
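As a small worked example of the packing that getDecodedGVSummaryFlags() undoes (the concrete value is illustrative, not taken from the patch):

  // RawFlags = 0x17 (binary 1 0111):
  //   Linkage    = GlobalValue::LinkageTypes(0x17 & 0xF) -> enum value 7
  //   HasSection = (0x17 >> 4) & 0x1                     -> 1 (true)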
static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
switch (Val) {
default: // Map unknown visibilities to default.
@@ -715,6 +762,15 @@ static GlobalVariable::ThreadLocalMode getDecodedThreadLocalMode(unsigned Val) {
}
}
+static GlobalVariable::UnnamedAddr getDecodedUnnamedAddrType(unsigned Val) {
+ switch (Val) {
+ default: // Map unknown to UnnamedAddr::None.
+ case 0: return GlobalVariable::UnnamedAddr::None;
+ case 1: return GlobalVariable::UnnamedAddr::Global;
+ case 2: return GlobalVariable::UnnamedAddr::Local;
+ }
+}
+
static int getDecodedCastOpcode(unsigned Val) {
switch (Val) {
default: return -1;
@@ -791,14 +847,14 @@ static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val) {
static AtomicOrdering getDecodedOrdering(unsigned Val) {
switch (Val) {
- case bitc::ORDERING_NOTATOMIC: return NotAtomic;
- case bitc::ORDERING_UNORDERED: return Unordered;
- case bitc::ORDERING_MONOTONIC: return Monotonic;
- case bitc::ORDERING_ACQUIRE: return Acquire;
- case bitc::ORDERING_RELEASE: return Release;
- case bitc::ORDERING_ACQREL: return AcquireRelease;
+ case bitc::ORDERING_NOTATOMIC: return AtomicOrdering::NotAtomic;
+ case bitc::ORDERING_UNORDERED: return AtomicOrdering::Unordered;
+ case bitc::ORDERING_MONOTONIC: return AtomicOrdering::Monotonic;
+ case bitc::ORDERING_ACQUIRE: return AtomicOrdering::Acquire;
+ case bitc::ORDERING_RELEASE: return AtomicOrdering::Release;
+ case bitc::ORDERING_ACQREL: return AtomicOrdering::AcquireRelease;
default: // Map unknown orderings to sequentially-consistent.
- case bitc::ORDERING_SEQCST: return SequentiallyConsistent;
+ case bitc::ORDERING_SEQCST: return AtomicOrdering::SequentiallyConsistent;
}
}
@@ -872,7 +928,7 @@ public:
/// Provide fast operand accessors
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
-}
+} // end anonymous namespace
// FIXME: can we inherit this from ConstantExpr?
template <>
@@ -880,7 +936,7 @@ struct OperandTraits<ConstantPlaceHolder> :
public FixedNumOperandTraits<ConstantPlaceHolder, 1> {
};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPlaceHolder, Value)
-}
+} // end namespace llvm
void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) {
if (Idx == size()) {
@@ -908,11 +964,8 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) {
OldV->replaceAllUsesWith(V);
delete PrevVal;
}
-
- return;
}
-
Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx,
Type *Ty) {
if (Idx >= size())
@@ -1056,7 +1109,7 @@ void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) {
--NumFwdRefs;
}
-Metadata *BitcodeReaderMetadataList::getValueFwdRef(unsigned Idx) {
+Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) {
if (Idx >= size())
resize(Idx + 1);
@@ -1079,15 +1132,61 @@ Metadata *BitcodeReaderMetadataList::getValueFwdRef(unsigned Idx) {
return MD;
}
-void BitcodeReaderMetadataList::tryToResolveCycles() {
- if (!AnyFwdRefs)
- // Nothing to do.
- return;
+Metadata *BitcodeReaderMetadataList::getMetadataIfResolved(unsigned Idx) {
+ Metadata *MD = lookup(Idx);
+ if (auto *N = dyn_cast_or_null<MDNode>(MD))
+ if (!N->isResolved())
+ return nullptr;
+ return MD;
+}
+MDNode *BitcodeReaderMetadataList::getMDNodeFwdRefOrNull(unsigned Idx) {
+ return dyn_cast_or_null<MDNode>(getMetadataFwdRef(Idx));
+}
+
+void BitcodeReaderMetadataList::tryToResolveCycles() {
if (NumFwdRefs)
// Still forward references... can't resolve cycles.
return;
+ bool DidReplaceTypeRefs = false;
+
+ // Give up on finding a full definition for any forward decls that remain.
+ for (const auto &Ref : OldTypeRefs.FwdDecls)
+ OldTypeRefs.Final.insert(Ref);
+ OldTypeRefs.FwdDecls.clear();
+
+ // Upgrade from old type ref arrays. In strange cases, this could add to
+ // OldTypeRefs.Unknown.
+ for (const auto &Array : OldTypeRefs.Arrays) {
+ DidReplaceTypeRefs = true;
+ Array.second->replaceAllUsesWith(resolveTypeRefArray(Array.first.get()));
+ }
+ OldTypeRefs.Arrays.clear();
+
+ // Replace old string-based type refs with the resolved node, if possible.
+ // If we haven't seen the node, leave it to the verifier to complain about
+ // the invalid string reference.
+ for (const auto &Ref : OldTypeRefs.Unknown) {
+ DidReplaceTypeRefs = true;
+ if (DICompositeType *CT = OldTypeRefs.Final.lookup(Ref.first))
+ Ref.second->replaceAllUsesWith(CT);
+ else
+ Ref.second->replaceAllUsesWith(Ref.first);
+ }
+ OldTypeRefs.Unknown.clear();
+
+ // Make sure all the upgraded types are resolved.
+ if (DidReplaceTypeRefs) {
+ AnyFwdRefs = true;
+ MinFwdRef = 0;
+ MaxFwdRef = MetadataPtrs.size() - 1;
+ }
+
+ if (!AnyFwdRefs)
+ // Nothing to do.
+ return;
+
// Resolve any cycles.
for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) {
auto &MD = MetadataPtrs[I];
@@ -1103,6 +1202,60 @@ void BitcodeReaderMetadataList::tryToResolveCycles() {
AnyFwdRefs = false;
}
+void BitcodeReaderMetadataList::addTypeRef(MDString &UUID,
+ DICompositeType &CT) {
+ assert(CT.getRawIdentifier() == &UUID && "Mismatched UUID");
+ if (CT.isForwardDecl())
+ OldTypeRefs.FwdDecls.insert(std::make_pair(&UUID, &CT));
+ else
+ OldTypeRefs.Final.insert(std::make_pair(&UUID, &CT));
+}
+
+Metadata *BitcodeReaderMetadataList::upgradeTypeRef(Metadata *MaybeUUID) {
+ auto *UUID = dyn_cast_or_null<MDString>(MaybeUUID);
+ if (LLVM_LIKELY(!UUID))
+ return MaybeUUID;
+
+ if (auto *CT = OldTypeRefs.Final.lookup(UUID))
+ return CT;
+
+ auto &Ref = OldTypeRefs.Unknown[UUID];
+ if (!Ref)
+ Ref = MDNode::getTemporary(Context, None);
+ return Ref.get();
+}
+
+Metadata *BitcodeReaderMetadataList::upgradeTypeRefArray(Metadata *MaybeTuple) {
+ auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
+ if (!Tuple || Tuple->isDistinct())
+ return MaybeTuple;
+
+ // Look through the array immediately if possible.
+ if (!Tuple->isTemporary())
+ return resolveTypeRefArray(Tuple);
+
+ // Create and return a placeholder to use for now. Eventually
+ // resolveTypeRefArray() will resolve this forward reference.
+ OldTypeRefs.Arrays.emplace_back(
+ std::piecewise_construct, std::forward_as_tuple(Tuple),
+ std::forward_as_tuple(MDTuple::getTemporary(Context, None)));
+ return OldTypeRefs.Arrays.back().second.get();
+}
+
+Metadata *BitcodeReaderMetadataList::resolveTypeRefArray(Metadata *MaybeTuple) {
+ auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
+ if (!Tuple || Tuple->isDistinct())
+ return MaybeTuple;
+
+ // Look through the DITypeRefArray, upgrading each DITypeRef.
+ SmallVector<Metadata *, 32> Ops;
+ Ops.reserve(Tuple->getNumOperands());
+ for (Metadata *MD : Tuple->operands())
+ Ops.push_back(upgradeTypeRef(MD));
+
+ return MDTuple::get(Context, Ops);
+}
+
Type *BitcodeReader::getTypeByID(unsigned ID) {
// The type table size is always specified correctly.
if (ID >= TypeList.size())
@@ -1129,7 +1282,6 @@ StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context) {
return Ret;
}
-
//===----------------------------------------------------------------------===//
// Functions for parsing blocks from the bitcode file
//===----------------------------------------------------------------------===//
@@ -1271,6 +1423,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::Dereferenceable;
case bitc::ATTR_KIND_DEREFERENCEABLE_OR_NULL:
return Attribute::DereferenceableOrNull;
+ case bitc::ATTR_KIND_ALLOC_SIZE:
+ return Attribute::AllocSize;
case bitc::ATTR_KIND_NO_RED_ZONE:
return Attribute::NoRedZone;
case bitc::ATTR_KIND_NO_RETURN:
@@ -1309,8 +1463,14 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::SanitizeThread;
case bitc::ATTR_KIND_SANITIZE_MEMORY:
return Attribute::SanitizeMemory;
+ case bitc::ATTR_KIND_SWIFT_ERROR:
+ return Attribute::SwiftError;
+ case bitc::ATTR_KIND_SWIFT_SELF:
+ return Attribute::SwiftSelf;
case bitc::ATTR_KIND_UW_TABLE:
return Attribute::UWTable;
+ case bitc::ATTR_KIND_WRITEONLY:
+ return Attribute::WriteOnly;
case bitc::ATTR_KIND_Z_EXT:
return Attribute::ZExt;
}
@@ -1391,6 +1551,8 @@ std::error_code BitcodeReader::parseAttributeGroupBlock() {
B.addDereferenceableAttr(Record[++i]);
else if (Kind == Attribute::DereferenceableOrNull)
B.addDereferenceableOrNullAttr(Record[++i]);
+ else if (Kind == Attribute::AllocSize)
+ B.addAllocSizeAttrFromRawRepr(Record[++i]);
} else { // String attribute
assert((Record[i] == 3 || Record[i] == 4) &&
"Invalid attribute group entry");
@@ -1727,6 +1889,27 @@ ErrorOr<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record,
return V;
}
+/// Helper to note and return the current location, and jump to the given
+/// offset.
+static uint64_t jumpToValueSymbolTable(uint64_t Offset,
+ BitstreamCursor &Stream) {
+ // Save the current parsing location so we can jump back at the end
+ // of the VST read.
+ uint64_t CurrentBit = Stream.GetCurrentBitNo();
+ Stream.JumpToBit(Offset * 32);
+#ifndef NDEBUG
+ // Do some checking if we are in debug mode.
+ BitstreamEntry Entry = Stream.advance();
+ assert(Entry.Kind == BitstreamEntry::SubBlock);
+ assert(Entry.ID == bitc::VALUE_SYMTAB_BLOCK_ID);
+#else
+ // In NDEBUG mode ignore the output so we don't get an unused variable
+ // warning.
+ Stream.advance();
+#endif
+ return CurrentBit;
+}
+
/// Parse the value symbol table at either the current parsing location or
/// at the given bit offset if provided.
std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
@@ -1734,22 +1917,8 @@ std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
// Pass in the Offset to distinguish between calling for the module-level
// VST (where we want to jump to the VST offset) and the function-level
// VST (where we don't).
- if (Offset > 0) {
- // Save the current parsing location so we can jump back at the end
- // of the VST read.
- CurrentBit = Stream.GetCurrentBitNo();
- Stream.JumpToBit(Offset * 32);
-#ifndef NDEBUG
- // Do some checking if we are in debug mode.
- BitstreamEntry Entry = Stream.advance();
- assert(Entry.Kind == BitstreamEntry::SubBlock);
- assert(Entry.ID == bitc::VALUE_SYMTAB_BLOCK_ID);
-#else
- // In NDEBUG mode ignore the output so we don't get an unused variable
- // warning.
- Stream.advance();
-#endif
- }
+ if (Offset > 0)
+ CurrentBit = jumpToValueSymbolTable(Offset, Stream);
// Compute the delta between the bitcode indices in the VST (the word offset
// to the word-aligned ENTER_SUBBLOCK for the function block, and that
@@ -1795,7 +1964,7 @@ std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
switch (Stream.readRecord(Entry.ID, Record)) {
default: // Default behavior: unknown type.
break;
- case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N]
+ case bitc::VST_CODE_ENTRY: { // VST_CODE_ENTRY: [valueid, namechar x N]
ErrorOr<Value *> ValOrErr = recordValue(Record, 1, TT);
if (std::error_code EC = ValOrErr.getError())
return EC;
@@ -1803,7 +1972,7 @@ std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
break;
}
case bitc::VST_CODE_FNENTRY: {
- // VST_FNENTRY: [valueid, offset, namechar x N]
+ // VST_CODE_FNENTRY: [valueid, offset, namechar x N]
ErrorOr<Value *> ValOrErr = recordValue(Record, 2, TT);
if (std::error_code EC = ValOrErr.getError())
return EC;
@@ -1863,47 +2032,122 @@ BitcodeReader::parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record) {
static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; }
+std::error_code BitcodeReader::parseMetadataStrings(ArrayRef<uint64_t> Record,
+ StringRef Blob,
+ unsigned &NextMetadataNo) {
+ // All the MDStrings in the block are emitted together in a single
+ // record. The strings are concatenated and stored in a blob along with
+ // their sizes.
+ if (Record.size() != 2)
+ return error("Invalid record: metadata strings layout");
+
+ unsigned NumStrings = Record[0];
+ unsigned StringsOffset = Record[1];
+ if (!NumStrings)
+ return error("Invalid record: metadata strings with no strings");
+ if (StringsOffset > Blob.size())
+ return error("Invalid record: metadata strings corrupt offset");
+
+ StringRef Lengths = Blob.slice(0, StringsOffset);
+ SimpleBitstreamCursor R(*StreamFile);
+ R.jumpToPointer(Lengths.begin());
+
+ // Ensure that Blob doesn't get invalidated, even if this is reading from
+ // a StreamingMemoryObject with corrupt data.
+ R.setArtificialByteLimit(R.getCurrentByteNo() + StringsOffset);
+
+ StringRef Strings = Blob.drop_front(StringsOffset);
+ do {
+ if (R.AtEndOfStream())
+ return error("Invalid record: metadata strings bad length");
+
+ unsigned Size = R.ReadVBR(6);
+ if (Strings.size() < Size)
+ return error("Invalid record: metadata strings truncated chars");
+
+ MetadataList.assignValue(MDString::get(Context, Strings.slice(0, Size)),
+ NextMetadataNo++);
+ Strings = Strings.drop_front(Size);
+ } while (--NumStrings);
+
+ return std::error_code();
+}
+
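A sketch of the blob layout parseMetadataStrings() expects, shown for two hypothetical strings "foo" and "ab" (illustrative values, not from the patch):

  // Record = [ NumStrings = 2, StringsOffset = byte size of the lengths prefix ]
  // Blob   = [ VBR6(3) VBR6(2) | 'f' 'o' 'o' 'a' 'b' ]
  //            lengths, read with ReadVBR(6) | concatenated characters
  // which yields MDString "foo" followed by MDString "ab".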
+namespace {
+class PlaceholderQueue {
+ // Placeholders would thrash around when moved, so store in a std::deque
+ // instead of some sort of vector.
+ std::deque<DistinctMDOperandPlaceholder> PHs;
+
+public:
+ DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
+ void flush(BitcodeReaderMetadataList &MetadataList);
+};
+} // end namespace
+
+DistinctMDOperandPlaceholder &PlaceholderQueue::getPlaceholderOp(unsigned ID) {
+ PHs.emplace_back(ID);
+ return PHs.back();
+}
+
+void PlaceholderQueue::flush(BitcodeReaderMetadataList &MetadataList) {
+ while (!PHs.empty()) {
+ PHs.front().replaceUseWith(
+ MetadataList.getMetadataFwdRef(PHs.front().getID()));
+ PHs.pop_front();
+ }
+}
+
/// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
/// module level metadata.
std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
+ assert((ModuleLevel || DeferredMetadataInfo.empty()) &&
+ "Must read all module-level metadata before function-level");
+
IsMetadataMaterialized = true;
unsigned NextMetadataNo = MetadataList.size();
- if (ModuleLevel && SeenModuleValuesRecord) {
- // Now that we are parsing the module level metadata, we want to restart
- // the numbering of the MD values, and replace temp MD created earlier
- // with their real values. If we saw a METADATA_VALUE record then we
- // would have set the MetadataList size to the number specified in that
- // record, to support parsing function-level metadata first, and we need
- // to reset back to 0 to fill the MetadataList in with the parsed module
- // The function-level metadata parsing should have reset the MetadataList
- // size back to the value reported by the METADATA_VALUE record, saved in
- // NumModuleMDs.
- assert(NumModuleMDs == MetadataList.size() &&
- "Expected MetadataList to only contain module level values");
- NextMetadataNo = 0;
- }
+
+ if (!ModuleLevel && MetadataList.hasFwdRefs())
+ return error("Invalid metadata: fwd refs into function blocks");
if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
return error("Invalid record");
+ std::vector<std::pair<DICompileUnit *, Metadata *>> CUSubprograms;
SmallVector<uint64_t, 64> Record;
+ PlaceholderQueue Placeholders;
+ bool IsDistinct;
auto getMD = [&](unsigned ID) -> Metadata * {
- return MetadataList.getValueFwdRef(ID);
+ if (!IsDistinct)
+ return MetadataList.getMetadataFwdRef(ID);
+ if (auto *MD = MetadataList.getMetadataIfResolved(ID))
+ return MD;
+ return &Placeholders.getPlaceholderOp(ID);
};
- auto getMDOrNull = [&](unsigned ID) -> Metadata *{
+ auto getMDOrNull = [&](unsigned ID) -> Metadata * {
if (ID)
return getMD(ID - 1);
return nullptr;
};
+ auto getMDOrNullWithoutPlaceholders = [&](unsigned ID) -> Metadata * {
+ if (ID)
+ return MetadataList.getMetadataFwdRef(ID - 1);
+ return nullptr;
+ };
auto getMDString = [&](unsigned ID) -> MDString *{
// This requires that the ID is not really a forward reference. In
// particular, the MDString must already have been resolved.
return cast_or_null<MDString>(getMDOrNull(ID));
};
-#define GET_OR_DISTINCT(CLASS, DISTINCT, ARGS) \
- (DISTINCT ? CLASS::getDistinct ARGS : CLASS::get ARGS)
+ // Support for old type refs.
+ auto getDITypeRefOrNull = [&](unsigned ID) {
+ return MetadataList.upgradeTypeRef(getMDOrNull(ID));
+ };
+
+#define GET_OR_DISTINCT(CLASS, ARGS) \
+ (IsDistinct ? CLASS::getDistinct ARGS : CLASS::get ARGS)
// Read all the records.
while (1) {
@@ -1914,10 +2158,15 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
+ // Upgrade old-style CU <-> SP pointers to point from SP to CU.
+ for (auto CU_SP : CUSubprograms)
+ if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second))
+ for (auto &Op : SPs->operands())
+ if (auto *SP = dyn_cast_or_null<MDNode>(Op))
+ SP->replaceOperandWith(7, CU_SP.first);
+
MetadataList.tryToResolveCycles();
- assert((!(ModuleLevel && SeenModuleValuesRecord) ||
- NumModuleMDs == MetadataList.size()) &&
- "Inconsistent bitcode: METADATA_VALUES mismatch");
+ Placeholders.flush(MetadataList);
return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
@@ -1926,8 +2175,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
// Read a record.
Record.clear();
- unsigned Code = Stream.readRecord(Entry.ID, Record);
- bool IsDistinct = false;
+ StringRef Blob;
+ unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob);
+ IsDistinct = false;
switch (Code) {
default: // Default behavior: ignore.
break;
@@ -1945,8 +2195,7 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
unsigned Size = Record.size();
NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name);
for (unsigned i = 0; i != Size; ++i) {
- MDNode *MD =
- dyn_cast_or_null<MDNode>(MetadataList.getValueFwdRef(Record[i]));
+ MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
if (!MD)
return error("Invalid record");
NMD->addOperand(MD);
@@ -1993,7 +2242,7 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (!Ty)
return error("Invalid record");
if (Ty->isMetadataTy())
- Elts.push_back(MetadataList.getValueFwdRef(Record[i + 1]));
+ Elts.push_back(getMD(Record[i + 1]));
else if (!Ty->isVoidTy()) {
auto *MD =
ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty));
@@ -2026,7 +2275,7 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
SmallVector<Metadata *, 8> Elts;
Elts.reserve(Record.size());
for (unsigned ID : Record)
- Elts.push_back(ID ? MetadataList.getValueFwdRef(ID - 1) : nullptr);
+ Elts.push_back(getMDOrNull(ID));
MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts)
: MDNode::get(Context, Elts),
NextMetadataNo++);
@@ -2036,13 +2285,13 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
unsigned Line = Record[1];
unsigned Column = Record[2];
- MDNode *Scope = cast<MDNode>(MetadataList.getValueFwdRef(Record[3]));
- Metadata *InlinedAt =
- Record[4] ? MetadataList.getValueFwdRef(Record[4] - 1) : nullptr;
+ Metadata *Scope = getMD(Record[3]);
+ Metadata *InlinedAt = getMDOrNull(Record[4]);
MetadataList.assignValue(
- GET_OR_DISTINCT(DILocation, Record[0],
+ GET_OR_DISTINCT(DILocation,
(Context, Line, Column, Scope, InlinedAt)),
NextMetadataNo++);
break;
@@ -2051,6 +2300,7 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() < 4)
return error("Invalid record");
+ IsDistinct = Record[0];
unsigned Tag = Record[1];
unsigned Version = Record[2];
@@ -2060,11 +2310,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
auto *Header = getMDString(Record[3]);
SmallVector<Metadata *, 8> DwarfOps;
for (unsigned I = 4, E = Record.size(); I != E; ++I)
- DwarfOps.push_back(
- Record[I] ? MetadataList.getValueFwdRef(Record[I] - 1) : nullptr);
+ DwarfOps.push_back(getMDOrNull(Record[I]));
MetadataList.assignValue(
- GET_OR_DISTINCT(GenericDINode, Record[0],
- (Context, Tag, Header, DwarfOps)),
+ GET_OR_DISTINCT(GenericDINode, (Context, Tag, Header, DwarfOps)),
NextMetadataNo++);
break;
}
@@ -2072,8 +2320,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 3)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DISubrange, Record[0],
+ GET_OR_DISTINCT(DISubrange,
(Context, Record[1], unrotateSign(Record[2]))),
NextMetadataNo++);
break;
@@ -2082,10 +2331,10 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 3)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(
- DIEnumerator, Record[0],
- (Context, unrotateSign(Record[1]), getMDString(Record[2]))),
+ GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
+ getMDString(Record[2]))),
NextMetadataNo++);
break;
}
@@ -2093,8 +2342,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 6)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIBasicType, Record[0],
+ GET_OR_DISTINCT(DIBasicType,
(Context, Record[1], getMDString(Record[2]),
Record[3], Record[4], Record[5])),
NextMetadataNo++);
@@ -2104,13 +2354,14 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 12)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIDerivedType, Record[0],
- (Context, Record[1], getMDString(Record[2]),
- getMDOrNull(Record[3]), Record[4],
- getMDOrNull(Record[5]), getMDOrNull(Record[6]),
- Record[7], Record[8], Record[9], Record[10],
- getMDOrNull(Record[11]))),
+ GET_OR_DISTINCT(
+ DIDerivedType,
+ (Context, Record[1], getMDString(Record[2]),
+ getMDOrNull(Record[3]), Record[4], getDITypeRefOrNull(Record[5]),
+ getDITypeRefOrNull(Record[6]), Record[7], Record[8], Record[9],
+ Record[10], getDITypeRefOrNull(Record[11]))),
NextMetadataNo++);
break;
}
@@ -2118,25 +2369,58 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 16)
return error("Invalid record");
- MetadataList.assignValue(
- GET_OR_DISTINCT(DICompositeType, Record[0],
- (Context, Record[1], getMDString(Record[2]),
- getMDOrNull(Record[3]), Record[4],
- getMDOrNull(Record[5]), getMDOrNull(Record[6]),
- Record[7], Record[8], Record[9], Record[10],
- getMDOrNull(Record[11]), Record[12],
- getMDOrNull(Record[13]), getMDOrNull(Record[14]),
- getMDString(Record[15]))),
- NextMetadataNo++);
+ // If we have a UUID and this is not a forward declaration, look up the
+ // mapping.
+ IsDistinct = Record[0] & 0x1;
+ bool IsNotUsedInTypeRef = Record[0] >= 2;
+ unsigned Tag = Record[1];
+ MDString *Name = getMDString(Record[2]);
+ Metadata *File = getMDOrNull(Record[3]);
+ unsigned Line = Record[4];
+ Metadata *Scope = getDITypeRefOrNull(Record[5]);
+ Metadata *BaseType = getDITypeRefOrNull(Record[6]);
+ uint64_t SizeInBits = Record[7];
+ uint64_t AlignInBits = Record[8];
+ uint64_t OffsetInBits = Record[9];
+ unsigned Flags = Record[10];
+ Metadata *Elements = getMDOrNull(Record[11]);
+ unsigned RuntimeLang = Record[12];
+ Metadata *VTableHolder = getDITypeRefOrNull(Record[13]);
+ Metadata *TemplateParams = getMDOrNull(Record[14]);
+ auto *Identifier = getMDString(Record[15]);
+ DICompositeType *CT = nullptr;
+ if (Identifier)
+ CT = DICompositeType::buildODRType(
+ Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
+ SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+ VTableHolder, TemplateParams);
+
+ // Create a node if we didn't get a lazy ODR type.
+ if (!CT)
+ CT = GET_OR_DISTINCT(DICompositeType,
+ (Context, Tag, Name, File, Line, Scope, BaseType,
+ SizeInBits, AlignInBits, OffsetInBits, Flags,
+ Elements, RuntimeLang, VTableHolder,
+ TemplateParams, Identifier));
+ if (!IsNotUsedInTypeRef && Identifier)
+ MetadataList.addTypeRef(*Identifier, *cast<DICompositeType>(CT));
+
+ MetadataList.assignValue(CT, NextMetadataNo++);
break;
}
case bitc::METADATA_SUBROUTINE_TYPE: {
- if (Record.size() != 3)
+ if (Record.size() < 3 || Record.size() > 4)
return error("Invalid record");
+ bool IsOldTypeRefArray = Record[0] < 2;
+ unsigned CC = (Record.size() > 3) ? Record[3] : 0;
+
+ IsDistinct = Record[0] & 0x1;
+ Metadata *Types = getMDOrNull(Record[2]);
+ if (LLVM_UNLIKELY(IsOldTypeRefArray))
+ Types = MetadataList.upgradeTypeRefArray(Types);
MetadataList.assignValue(
- GET_OR_DISTINCT(DISubroutineType, Record[0],
- (Context, Record[1], getMDOrNull(Record[2]))),
+ GET_OR_DISTINCT(DISubroutineType, (Context, Record[1], CC, Types)),
NextMetadataNo++);
break;
}
@@ -2145,8 +2429,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 6)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIModule, Record[0],
+ GET_OR_DISTINCT(DIModule,
(Context, getMDOrNull(Record[1]),
getMDString(Record[2]), getMDString(Record[3]),
getMDString(Record[4]), getMDString(Record[5]))),
@@ -2158,9 +2443,10 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 3)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIFile, Record[0], (Context, getMDString(Record[1]),
- getMDString(Record[2]))),
+ GET_OR_DISTINCT(DIFile, (Context, getMDString(Record[1]),
+ getMDString(Record[2]))),
NextMetadataNo++);
break;
}
@@ -2170,38 +2456,66 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
// Ignore Record[0], which indicates whether this compile unit is
// distinct. It's always distinct.
- MetadataList.assignValue(
- DICompileUnit::getDistinct(
- Context, Record[1], getMDOrNull(Record[2]),
- getMDString(Record[3]), Record[4], getMDString(Record[5]),
- Record[6], getMDString(Record[7]), Record[8],
- getMDOrNull(Record[9]), getMDOrNull(Record[10]),
- getMDOrNull(Record[11]), getMDOrNull(Record[12]),
- getMDOrNull(Record[13]),
- Record.size() <= 15 ? 0 : getMDOrNull(Record[15]),
- Record.size() <= 14 ? 0 : Record[14]),
- NextMetadataNo++);
+ IsDistinct = true;
+ auto *CU = DICompileUnit::getDistinct(
+ Context, Record[1], getMDOrNull(Record[2]), getMDString(Record[3]),
+ Record[4], getMDString(Record[5]), Record[6], getMDString(Record[7]),
+ Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]),
+ getMDOrNull(Record[12]), getMDOrNull(Record[13]),
+ Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
+ Record.size() <= 14 ? 0 : Record[14]);
+
+ MetadataList.assignValue(CU, NextMetadataNo++);
+
+ // Upgrade the list of subprograms.
+ if (Metadata *SPs = getMDOrNullWithoutPlaceholders(Record[11]))
+ CUSubprograms.push_back({CU, SPs});
break;
}
case bitc::METADATA_SUBPROGRAM: {
- if (Record.size() != 18 && Record.size() != 19)
- return error("Invalid record");
-
- bool HasFn = Record.size() == 19;
+ if (Record.size() < 18 || Record.size() > 20)
+ return error("Invalid record");
+
+ IsDistinct =
+ (Record[0] & 1) || Record[8]; // All definitions should be distinct.
+ // Version 1 has a Function as Record[15].
+ // Version 2 has removed Record[15].
+ // Version 3 has the Unit as Record[15].
+ // Version 4 added thisAdjustment.
+ bool HasUnit = Record[0] >= 2;
+ if (HasUnit && Record.size() < 19)
+ return error("Invalid record");
+ Metadata *CUorFn = getMDOrNull(Record[15]);
+ unsigned Offset = Record.size() >= 19 ? 1 : 0;
+ bool HasFn = Offset && !HasUnit;
+ bool HasThisAdj = Record.size() >= 20;
DISubprogram *SP = GET_OR_DISTINCT(
- DISubprogram,
- Record[0] || Record[8], // All definitions should be distinct.
- (Context, getMDOrNull(Record[1]), getMDString(Record[2]),
- getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
- getMDOrNull(Record[6]), Record[7], Record[8], Record[9],
- getMDOrNull(Record[10]), Record[11], Record[12], Record[13],
- Record[14], getMDOrNull(Record[15 + HasFn]),
- getMDOrNull(Record[16 + HasFn]), getMDOrNull(Record[17 + HasFn])));
+ DISubprogram, (Context,
+ getDITypeRefOrNull(Record[1]), // scope
+ getMDString(Record[2]), // name
+ getMDString(Record[3]), // linkageName
+ getMDOrNull(Record[4]), // file
+ Record[5], // line
+ getMDOrNull(Record[6]), // type
+ Record[7], // isLocal
+ Record[8], // isDefinition
+ Record[9], // scopeLine
+ getDITypeRefOrNull(Record[10]), // containingType
+ Record[11], // virtuality
+ Record[12], // virtualIndex
+ HasThisAdj ? Record[19] : 0, // thisAdjustment
+ Record[13], // flags
+ Record[14], // isOptimized
+ HasUnit ? CUorFn : nullptr, // unit
+ getMDOrNull(Record[15 + Offset]), // templateParams
+ getMDOrNull(Record[16 + Offset]), // declaration
+ getMDOrNull(Record[17 + Offset]) // variables
+ ));
MetadataList.assignValue(SP, NextMetadataNo++);
// Upgrade sp->function mapping to function->sp mapping.
- if (HasFn && Record[15]) {
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(getMDOrNull(Record[15])))
+ if (HasFn) {
+ if (auto *CMD = dyn_cast_or_null<ConstantAsMetadata>(CUorFn))
if (auto *F = dyn_cast<Function>(CMD->getValue())) {
if (F->isMaterializable())
// Defer until materialized; unmaterialized functions may not have
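For reference, a compact summary of the record shapes distinguished by the version checks in the METADATA_SUBPROGRAM case above (an inference from the code and its comments, not text carried by the patch):

  // Record.size() == 18                    -> version 2: no Function/Unit operand; templateParams at [15]
  // Record.size() == 19 and Record[0] <  2 -> version 1: Record[15] is the Function
  // Record.size() >= 19 and Record[0] >= 2 -> version 3: Record[15] is the Unit
  // Record.size() == 20                    -> version 4: as version 3, plus Record[19] = thisAdjustment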
@@ -2217,8 +2531,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DILexicalBlock, Record[0],
+ GET_OR_DISTINCT(DILexicalBlock,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]), Record[3], Record[4])),
NextMetadataNo++);
@@ -2228,8 +2543,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 4)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DILexicalBlockFile, Record[0],
+ GET_OR_DISTINCT(DILexicalBlockFile,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]), Record[3])),
NextMetadataNo++);
@@ -2239,11 +2555,11 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DINamespace, Record[0],
- (Context, getMDOrNull(Record[1]),
- getMDOrNull(Record[2]), getMDString(Record[3]),
- Record[4])),
+ GET_OR_DISTINCT(DINamespace, (Context, getMDOrNull(Record[1]),
+ getMDOrNull(Record[2]),
+ getMDString(Record[3]), Record[4])),
NextMetadataNo++);
break;
}
@@ -2251,8 +2567,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIMacro, Record[0],
+ GET_OR_DISTINCT(DIMacro,
(Context, Record[1], Record[2],
getMDString(Record[3]), getMDString(Record[4]))),
NextMetadataNo++);
@@ -2262,8 +2579,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIMacroFile, Record[0],
+ GET_OR_DISTINCT(DIMacroFile,
(Context, Record[1], Record[2],
getMDOrNull(Record[3]), getMDOrNull(Record[4]))),
NextMetadataNo++);
@@ -2273,10 +2591,10 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 3)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter,
- Record[0],
(Context, getMDString(Record[1]),
- getMDOrNull(Record[2]))),
+ getDITypeRefOrNull(Record[2]))),
NextMetadataNo++);
break;
}
@@ -2284,10 +2602,12 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 5)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DITemplateValueParameter, Record[0],
+ GET_OR_DISTINCT(DITemplateValueParameter,
(Context, Record[1], getMDString(Record[2]),
- getMDOrNull(Record[3]), getMDOrNull(Record[4]))),
+ getDITypeRefOrNull(Record[3]),
+ getMDOrNull(Record[4]))),
NextMetadataNo++);
break;
}
@@ -2295,12 +2615,13 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 11)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIGlobalVariable, Record[0],
+ GET_OR_DISTINCT(DIGlobalVariable,
(Context, getMDOrNull(Record[1]),
getMDString(Record[2]), getMDString(Record[3]),
getMDOrNull(Record[4]), Record[5],
- getMDOrNull(Record[6]), Record[7], Record[8],
+ getDITypeRefOrNull(Record[6]), Record[7], Record[8],
getMDOrNull(Record[9]), getMDOrNull(Record[10]))),
NextMetadataNo++);
break;
@@ -2312,14 +2633,15 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
// 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
// DW_TAG_arg_variable.
+ IsDistinct = Record[0];
bool HasTag = Record.size() > 8;
MetadataList.assignValue(
- GET_OR_DISTINCT(DILocalVariable, Record[0],
+ GET_OR_DISTINCT(DILocalVariable,
(Context, getMDOrNull(Record[1 + HasTag]),
getMDString(Record[2 + HasTag]),
getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag],
- getMDOrNull(Record[5 + HasTag]), Record[6 + HasTag],
- Record[7 + HasTag])),
+ getDITypeRefOrNull(Record[5 + HasTag]),
+ Record[6 + HasTag], Record[7 + HasTag])),
NextMetadataNo++);
break;
}
@@ -2327,8 +2649,9 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() < 1)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIExpression, Record[0],
+ GET_OR_DISTINCT(DIExpression,
(Context, makeArrayRef(Record).slice(1))),
NextMetadataNo++);
break;
@@ -2337,12 +2660,13 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 8)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIObjCProperty, Record[0],
+ GET_OR_DISTINCT(DIObjCProperty,
(Context, getMDString(Record[1]),
getMDOrNull(Record[2]), Record[3],
getMDString(Record[4]), getMDString(Record[5]),
- Record[6], getMDOrNull(Record[7]))),
+ Record[6], getDITypeRefOrNull(Record[7]))),
NextMetadataNo++);
break;
}
@@ -2350,21 +2674,40 @@ std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
if (Record.size() != 6)
return error("Invalid record");
+ IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIImportedEntity, Record[0],
+ GET_OR_DISTINCT(DIImportedEntity,
(Context, Record[1], getMDOrNull(Record[2]),
- getMDOrNull(Record[3]), Record[4],
+ getDITypeRefOrNull(Record[3]), Record[4],
getMDString(Record[5]))),
NextMetadataNo++);
break;
}
- case bitc::METADATA_STRING: {
+ case bitc::METADATA_STRING_OLD: {
std::string String(Record.begin(), Record.end());
- llvm::UpgradeMDStringConstant(String);
+
+ // Test for upgrading !llvm.loop.
+ HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String);
+
Metadata *MD = MDString::get(Context, String);
MetadataList.assignValue(MD, NextMetadataNo++);
break;
}
+ case bitc::METADATA_STRINGS:
+ if (std::error_code EC =
+ parseMetadataStrings(Record, Blob, NextMetadataNo))
+ return EC;
+ break;
+ case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: {
+ if (Record.size() % 2 == 0)
+ return error("Invalid record");
+ unsigned ValueID = Record[0];
+ if (ValueID >= ValueList.size())
+ return error("Invalid record");
+ if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID]))
+ parseGlobalObjectAttachment(*GO, ArrayRef<uint64_t>(Record).slice(1));
+ break;
+ }
case bitc::METADATA_KIND: {
// Support older bitcode files that had METADATA_KIND records in a
// block with METADATA_BLOCK_ID.
@@ -2426,15 +2769,16 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
}
/// Resolve all of the initializers for global values and aliases that we can.
-std::error_code BitcodeReader::resolveGlobalAndAliasInits() {
+std::error_code BitcodeReader::resolveGlobalAndIndirectSymbolInits() {
std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
- std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+ std::vector<std::pair<GlobalIndirectSymbol*, unsigned> >
+ IndirectSymbolInitWorklist;
std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
std::vector<std::pair<Function*, unsigned> > FunctionPrologueWorklist;
std::vector<std::pair<Function*, unsigned> > FunctionPersonalityFnWorklist;
GlobalInitWorklist.swap(GlobalInits);
- AliasInitWorklist.swap(AliasInits);
+ IndirectSymbolInitWorklist.swap(IndirectSymbolInits);
FunctionPrefixWorklist.swap(FunctionPrefixes);
FunctionPrologueWorklist.swap(FunctionPrologues);
FunctionPersonalityFnWorklist.swap(FunctionPersonalityFns);
@@ -2453,20 +2797,20 @@ std::error_code BitcodeReader::resolveGlobalAndAliasInits() {
GlobalInitWorklist.pop_back();
}
- while (!AliasInitWorklist.empty()) {
- unsigned ValID = AliasInitWorklist.back().second;
+ while (!IndirectSymbolInitWorklist.empty()) {
+ unsigned ValID = IndirectSymbolInitWorklist.back().second;
if (ValID >= ValueList.size()) {
- AliasInits.push_back(AliasInitWorklist.back());
+ IndirectSymbolInits.push_back(IndirectSymbolInitWorklist.back());
} else {
Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]);
if (!C)
return error("Expected a constant");
- GlobalAlias *Alias = AliasInitWorklist.back().first;
- if (C->getType() != Alias->getType())
+ GlobalIndirectSymbol *GIS = IndirectSymbolInitWorklist.back().first;
+ if (isa<GlobalAlias>(GIS) && C->getType() != GIS->getType())
return error("Alias and aliasee types don't match");
- Alias->setAliasee(C);
+ GIS->setIndirectSymbol(C);
}
- AliasInitWorklist.pop_back();
+ IndirectSymbolInitWorklist.pop_back();
}
while (!FunctionPrefixWorklist.empty()) {
@@ -2537,7 +2881,7 @@ std::error_code BitcodeReader::parseConstants() {
return error("Malformed block");
case BitstreamEntry::EndBlock:
if (NextCstNo != ValueList.size())
- return error("Invalid ronstant reference");
+ return error("Invalid constant reference");
// Once all the constants have been read, go through and resolve forward
// references.
@@ -2550,6 +2894,7 @@ std::error_code BitcodeReader::parseConstants() {
// Read a record.
Record.clear();
+ Type *VoidType = Type::getVoidTy(Context);
Value *V = nullptr;
unsigned BitCode = Stream.readRecord(Entry.ID, Record);
switch (BitCode) {
@@ -2562,6 +2907,8 @@ std::error_code BitcodeReader::parseConstants() {
return error("Invalid record");
if (Record[0] >= TypeList.size() || !TypeList[Record[0]])
return error("Invalid record");
+ if (TypeList[Record[0]] == VoidType)
+ return error("Invalid constant type");
CurTy = TypeList[Record[0]];
continue; // Skip the ValueList manipulation.
case bitc::CST_CODE_NULL: // NULL
@@ -2701,7 +3048,6 @@ std::error_code BitcodeReader::parseConstants() {
}
break;
}
-
case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval]
if (Record.size() < 3)
return error("Invalid record");
@@ -2770,6 +3116,9 @@ std::error_code BitcodeReader::parseConstants() {
return error("Explicit gep operator type does not match pointee type "
"of pointer operand");
+ if (Elts.size() < 1)
+ return error("Invalid gep with no operands");
+
ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
V = ConstantExpr::getGetElementPtr(PointeeType, Elts[0], Indices,
BitCode ==
@@ -3067,35 +3416,6 @@ std::error_code BitcodeReader::materializeMetadata() {
void BitcodeReader::setStripDebugInfo() { StripDebugInfo = true; }
-void BitcodeReader::saveMetadataList(
- DenseMap<const Metadata *, unsigned> &MetadataToIDs, bool OnlyTempMD) {
- for (unsigned ID = 0; ID < MetadataList.size(); ++ID) {
- Metadata *MD = MetadataList[ID];
- auto *N = dyn_cast_or_null<MDNode>(MD);
- assert((!N || (N->isResolved() || N->isTemporary())) &&
- "Found non-resolved non-temp MDNode while saving metadata");
- // Save all values if !OnlyTempMD, otherwise just the temporary metadata.
- // Note that in the !OnlyTempMD case we need to save all Metadata, not
- // just MDNode, as we may have references to other types of module-level
- // metadata (e.g. ValueAsMetadata) from instructions.
- if (!OnlyTempMD || (N && N->isTemporary())) {
- // Will call this after materializing each function, in order to
- // handle remapping of the function's instructions/metadata.
- // See if we already have an entry in that case.
- if (OnlyTempMD && MetadataToIDs.count(MD)) {
- assert(MetadataToIDs[MD] == ID && "Inconsistent metadata value id");
- continue;
- }
- if (N && N->isTemporary())
- // Ensure that we assert if someone tries to RAUW this temporary
- // metadata while it is the key of a map. The flag will be set back
- // to true when the saved metadata list is destroyed.
- N->setCanReplace(false);
- MetadataToIDs[MD] = ID;
- }
- }
-}
-
/// When we see the block for a function body, remember where it is and then
/// skip it. This lets us lazily deserialize the functions.
std::error_code BitcodeReader::rememberAndSkipFunctionBody() {
@@ -3121,8 +3441,8 @@ std::error_code BitcodeReader::rememberAndSkipFunctionBody() {
std::error_code BitcodeReader::globalCleanup() {
// Patch the initializers for globals and aliases up.
- resolveGlobalAndAliasInits();
- if (!GlobalInits.empty() || !AliasInits.empty())
+ resolveGlobalAndIndirectSymbolInits();
+ if (!GlobalInits.empty() || !IndirectSymbolInits.empty())
return error("Malformed global initializer set");
// Look for intrinsic functions which need to be upgraded at some point
@@ -3130,6 +3450,11 @@ std::error_code BitcodeReader::globalCleanup() {
Function *NewFn;
if (UpgradeIntrinsicFunction(&F, NewFn))
UpgradedIntrinsics[&F] = NewFn;
+ else if (auto Remangled = Intrinsic::remangleIntrinsicFunction(&F))
+ // Some types could be renamed during loading if several modules are
+ // loaded in the same LLVMContext (LTO scenario). In this case we should
+ // remangle intrinsic names as well.
+ RemangledIntrinsics[&F] = Remangled.getValue();
}
// Look for global variables which need to be renamed.
@@ -3139,7 +3464,8 @@ std::error_code BitcodeReader::globalCleanup() {
// Force deallocation of memory for these vectors to favor the client that
// want lazy deserialization.
std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
- std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
+ std::vector<std::pair<GlobalIndirectSymbol*, unsigned> >().swap(
+ IndirectSymbolInits);
return std::error_code();
}
@@ -3289,7 +3615,7 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
case bitc::CONSTANTS_BLOCK_ID:
if (std::error_code EC = parseConstants())
return EC;
- if (std::error_code EC = resolveGlobalAndAliasInits())
+ if (std::error_code EC = resolveGlobalAndIndirectSymbolInits())
return EC;
break;
case bitc::METADATA_BLOCK_ID:
@@ -3374,7 +3700,6 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
break;
}
-
// Read a record.
auto BitCode = Stream.readRecord(Entry.ID, Record);
switch (BitCode) {
@@ -3496,9 +3821,9 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
if (Record.size() > 7)
TLM = getDecodedThreadLocalMode(Record[7]);
- bool UnnamedAddr = false;
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
if (Record.size() > 8)
- UnnamedAddr = Record[8];
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[8]);
bool ExternallyInitialized = false;
if (Record.size() > 9)
@@ -3533,6 +3858,7 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
} else if (hasImplicitComdat(RawLinkage)) {
NewGV->setComdat(reinterpret_cast<Comdat *>(1));
}
+
break;
}
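The unnamed_addr field of globals (and, below, of functions and indirect symbols) is now decoded through getDecodedUnnamedAddrType rather than read as a bool, reflecting the new three-state GlobalValue::UnnamedAddr enum. The decoder itself is defined elsewhere in this file; a minimal sketch of the mapping it is expected to perform (the exact numeric encoding is an assumption of this note):

static GlobalValue::UnnamedAddr getDecodedUnnamedAddrType(unsigned Val) {
  switch (Val) {
  default: // Treat unknown values like the old "false" default.
  case 0:
    return GlobalValue::UnnamedAddr::None;
  case 1:
    return GlobalValue::UnnamedAddr::Global;
  case 2:
    return GlobalValue::UnnamedAddr::Local;
  }
}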
// FUNCTION: [type, callingconv, isproto, linkage, paramattr,
@@ -3578,11 +3904,11 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
if (Record.size() > 8 && Record[8]) {
if (Record[8]-1 >= GCTable.size())
return error("Invalid ID");
- Func->setGC(GCTable[Record[8]-1].c_str());
+ Func->setGC(GCTable[Record[8] - 1]);
}
- bool UnnamedAddr = false;
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
if (Record.size() > 9)
- UnnamedAddr = Record[9];
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
Func->setUnnamedAddr(UnnamedAddr);
if (Record.size() > 10 && Record[10] != 0)
FunctionPrologues.push_back(std::make_pair(Func, Record[10]-1));
@@ -3621,9 +3947,11 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
}
// ALIAS: [alias type, addrspace, aliasee val#, linkage]
// ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
+ // IFUNC: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
+ case bitc::MODULE_CODE_IFUNC:
case bitc::MODULE_CODE_ALIAS:
case bitc::MODULE_CODE_ALIAS_OLD: {
- bool NewRecord = BitCode == bitc::MODULE_CODE_ALIAS;
+ bool NewRecord = BitCode != bitc::MODULE_CODE_ALIAS_OLD;
if (Record.size() < (3 + (unsigned)NewRecord))
return error("Invalid record");
unsigned OpNum = 0;
@@ -3644,8 +3972,14 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
auto Val = Record[OpNum++];
auto Linkage = Record[OpNum++];
- auto *NewGA = GlobalAlias::create(
- Ty, AddrSpace, getDecodedLinkage(Linkage), "", TheModule);
+ GlobalIndirectSymbol *NewGA;
+ if (BitCode == bitc::MODULE_CODE_ALIAS ||
+ BitCode == bitc::MODULE_CODE_ALIAS_OLD)
+ NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
+ "", TheModule);
+ else
+ NewGA = GlobalIFunc::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
+ "", nullptr, TheModule);
// Old bitcode files didn't have visibility field.
// Local linkage must have default visibility.
if (OpNum != Record.size()) {
@@ -3661,9 +3995,9 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
if (OpNum != Record.size())
NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
if (OpNum != Record.size())
- NewGA->setUnnamedAddr(Record[OpNum++]);
+ NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
ValueList.push_back(NewGA);
- AliasInits.push_back(std::make_pair(NewGA, Val));
+ IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
break;
}
/// MODULE_CODE_PURGEVALS: [numvals]
@@ -3679,27 +4013,12 @@ std::error_code BitcodeReader::parseModule(uint64_t ResumeBit,
return error("Invalid record");
VSTOffset = Record[0];
break;
- /// MODULE_CODE_METADATA_VALUES: [numvals]
- case bitc::MODULE_CODE_METADATA_VALUES:
- if (Record.size() < 1)
+ /// MODULE_CODE_SOURCE_FILENAME: [namechar x N]
+ case bitc::MODULE_CODE_SOURCE_FILENAME:
+ SmallString<128> ValueName;
+ if (convertToString(Record, 0, ValueName))
return error("Invalid record");
- assert(!IsMetadataMaterialized);
- // This record contains the number of metadata values in the module-level
- // METADATA_BLOCK. It is used to support lazy parsing of metadata as
- // a postpass, where we will parse function-level metadata first.
- // This is needed because the ids of metadata are assigned implicitly
- // based on their ordering in the bitcode, with the function-level
- // metadata ids starting after the module-level metadata ids. Otherwise,
- // we would have to parse the module-level metadata block to prime the
- // MetadataList when we are lazy loading metadata during function
- // importing. Initialize the MetadataList size here based on the
- // record value, regardless of whether we are doing lazy metadata
- // loading, so that we have consistent handling and assertion
- // checking in parseMetadata for module-level metadata.
- NumModuleMDs = Record[0];
- SeenModuleValuesRecord = true;
- assert(MetadataList.size() == 0);
- MetadataList.resize(NumModuleMDs);
+ TheModule->setSourceFileName(ValueName);
break;
}
Record.clear();
@@ -3866,6 +4185,96 @@ ErrorOr<std::string> BitcodeReader::parseIdentificationBlock() {
}
}
+std::error_code BitcodeReader::parseGlobalObjectAttachment(
+ GlobalObject &GO, ArrayRef<uint64_t> Record) {
+ assert(Record.size() % 2 == 0);
+ for (unsigned I = 0, E = Record.size(); I != E; I += 2) {
+ auto K = MDKindMap.find(Record[I]);
+ if (K == MDKindMap.end())
+ return error("Invalid ID");
+ MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[I + 1]);
+ if (!MD)
+ return error("Invalid metadata attachment");
+ GO.addMetadata(K->second, *MD);
+ }
+ return std::error_code();
+}
+
+ErrorOr<bool> BitcodeReader::hasObjCCategory() {
+ if (std::error_code EC = initStream(nullptr))
+ return EC;
+
+ // Sniff for the signature.
+ if (!hasValidBitcodeHeader(Stream))
+ return error("Invalid bitcode signature");
+
+ // We expect a number of well-defined blocks, though we don't necessarily
+ // need to understand them all.
+ while (1) {
+ BitstreamEntry Entry = Stream.advance();
+
+ switch (Entry.Kind) {
+ case BitstreamEntry::Error:
+ return error("Malformed block");
+ case BitstreamEntry::EndBlock:
+ return std::error_code();
+
+ case BitstreamEntry::SubBlock:
+ if (Entry.ID == bitc::MODULE_BLOCK_ID)
+ return hasObjCCategoryInModule();
+
+ // Ignore other sub-blocks.
+ if (Stream.SkipBlock())
+ return error("Malformed block");
+ continue;
+
+ case BitstreamEntry::Record:
+ Stream.skipRecord(Entry.ID);
+ continue;
+ }
+ }
+}
+
+ErrorOr<bool> BitcodeReader::hasObjCCategoryInModule() {
+ if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+ return error("Invalid record");
+
+ SmallVector<uint64_t, 64> Record;
+ // Read all the records for this module.
+ while (1) {
+ BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+ switch (Entry.Kind) {
+ case BitstreamEntry::SubBlock: // Handled for us already.
+ case BitstreamEntry::Error:
+ return error("Malformed block");
+ case BitstreamEntry::EndBlock:
+ return false;
+ case BitstreamEntry::Record:
+ // The interesting case.
+ break;
+ }
+
+ // Read a record.
+ switch (Stream.readRecord(Entry.ID, Record)) {
+ default:
+ break; // Default behavior, ignore unknown content.
+ case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N]
+ std::string S;
+ if (convertToString(Record, 0, S))
+ return error("Invalid record");
+ // Check for the i386 and other (x86_64, ARM) conventions
+ if (S.find("__DATA, __objc_catlist") != std::string::npos ||
+ S.find("__OBJC,__category") != std::string::npos)
+ return true;
+ break;
+ }
+ }
+ Record.clear();
+ }
+ llvm_unreachable("Exit infinite loop");
+}
+
/// Parse metadata attachments.
std::error_code BitcodeReader::parseMetadataAttachment(Function &F) {
if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
@@ -3897,13 +4306,8 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) {
return error("Invalid record");
if (RecordLength % 2 == 0) {
// A function attachment.
- for (unsigned I = 0; I != RecordLength; I += 2) {
- auto K = MDKindMap.find(Record[I]);
- if (K == MDKindMap.end())
- return error("Invalid ID");
- Metadata *MD = MetadataList.getValueFwdRef(Record[I + 1]);
- F.setMetadata(K->second, cast<MDNode>(MD));
- }
+ if (std::error_code EC = parseGlobalObjectAttachment(F, Record))
+ return EC;
continue;
}
@@ -3915,14 +4319,23 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) {
MDKindMap.find(Kind);
if (I == MDKindMap.end())
return error("Invalid ID");
- Metadata *Node = MetadataList.getValueFwdRef(Record[i + 1]);
+ Metadata *Node = MetadataList.getMetadataFwdRef(Record[i + 1]);
if (isa<LocalAsMetadata>(Node))
// Drop the attachment. This used to be legal, but there's no
// upgrade path.
break;
- Inst->setMetadata(I->second, cast<MDNode>(Node));
- if (I->second == LLVMContext::MD_tbaa)
+ MDNode *MD = dyn_cast_or_null<MDNode>(Node);
+ if (!MD)
+ return error("Invalid metadata attachment");
+
+ if (HasSeenOldLoopTags && I->second == LLVMContext::MD_loop)
+ MD = upgradeInstructionLoopAttachment(*MD);
+
+ Inst->setMetadata(I->second, MD);
+ if (I->second == LLVMContext::MD_tbaa) {
InstsWithTBAATag.push_back(Inst);
+ continue;
+ }
}
break;
}
@@ -3949,6 +4362,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
return error("Invalid record");
+ // Unexpected unresolved metadata when parsing function.
+ if (MetadataList.hasFwdRefs())
+ return error("Invalid function metadata: incoming forward references");
+
InstructionList.clear();
unsigned ModuleValueListSize = ValueList.size();
unsigned ModuleMetadataListSize = MetadataList.size();
@@ -4081,10 +4498,16 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
unsigned ScopeID = Record[2], IAID = Record[3];
MDNode *Scope = nullptr, *IA = nullptr;
- if (ScopeID)
- Scope = cast<MDNode>(MetadataList.getValueFwdRef(ScopeID - 1));
- if (IAID)
- IA = cast<MDNode>(MetadataList.getValueFwdRef(IAID - 1));
+ if (ScopeID) {
+ Scope = MetadataList.getMDNodeFwdRefOrNull(ScopeID - 1);
+ if (!Scope)
+ return error("Invalid record");
+ }
+ if (IAID) {
+ IA = MetadataList.getMDNodeFwdRefOrNull(IAID - 1);
+ if (!IA)
+ return error("Invalid record");
+ }
LastLoc = DebugLoc::get(Line, Col, Scope, IA);
I->setDebugLoc(LastLoc);
I = nullptr;
@@ -4820,10 +5243,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
uint64_t AlignRecord = Record[3];
const uint64_t InAllocaMask = uint64_t(1) << 5;
const uint64_t ExplicitTypeMask = uint64_t(1) << 6;
- // Reserve bit 7 for SwiftError flag.
- // const uint64_t SwiftErrorMask = uint64_t(1) << 7;
- const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask;
+ const uint64_t SwiftErrorMask = uint64_t(1) << 7;
+ const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask |
+ SwiftErrorMask;
bool InAlloca = AlignRecord & InAllocaMask;
+ bool SwiftError = AlignRecord & SwiftErrorMask;
Type *Ty = getTypeByID(Record[0]);
if ((AlignRecord & ExplicitTypeMask) == 0) {
auto *PTy = dyn_cast_or_null<PointerType>(Ty);
@@ -4842,6 +5266,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
AllocaInst *AI = new AllocaInst(Ty, Size, Align);
AI->setUsedWithInAlloca(InAlloca);
+ AI->setSwiftError(SwiftError);
I = AI;
InstructionList.push_back(I);
break;
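The alignment word of the alloca record now also carries a swifterror bit: bit 5 marks inalloca, bit 6 marks an explicit allocated type in Record[0], and bit 7 marks swifterror, with the remaining low bits holding the encoded alignment. A worked example of the masking above (the value is invented; the alignment encoding itself is handled by a helper not shown in this hunk):

// AlignRecord = 0xE4 = 0b1110'0100
//   0xE4 & InAllocaMask     (bit 5) != 0 -> InAlloca = true
//   0xE4 & ExplicitTypeMask (bit 6) != 0 -> Record[0] is the allocated type
//   0xE4 & SwiftErrorMask   (bit 7) != 0 -> SwiftError = true
//   0xE4 & ~FlagMask        = 0b100      -> encoded alignment passed on to the
//                                           alignment decoder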
@@ -4886,10 +5311,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
Ty = cast<PointerType>(Op->getType())->getElementType();
AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]);
- if (Ordering == NotAtomic || Ordering == Release ||
- Ordering == AcquireRelease)
+ if (Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Release ||
+ Ordering == AtomicOrdering::AcquireRelease)
return error("Invalid record");
- if (Ordering != NotAtomic && Record[OpNum] == 0)
+ if (Ordering != AtomicOrdering::NotAtomic && Record[OpNum] == 0)
return error("Invalid record");
SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
@@ -4930,6 +5356,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ !isa<PointerType>(Ptr->getType()) ||
(BitCode == bitc::FUNC_CODE_INST_STOREATOMIC
? getValueTypePair(Record, OpNum, NextValueNo, Val)
: popValue(Record, OpNum, NextValueNo,
@@ -4942,11 +5369,12 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
typeCheckLoadStoreInst(Val->getType(), Ptr->getType()))
return EC;
AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]);
- if (Ordering == NotAtomic || Ordering == Acquire ||
- Ordering == AcquireRelease)
+ if (Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Acquire ||
+ Ordering == AtomicOrdering::AcquireRelease)
return error("Invalid record");
SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
- if (Ordering != NotAtomic && Record[OpNum] == 0)
+ if (Ordering != AtomicOrdering::NotAtomic && Record[OpNum] == 0)
return error("Invalid record");
unsigned Align;
@@ -4972,7 +5400,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
Record.size() < OpNum + 3 || Record.size() > OpNum + 5)
return error("Invalid record");
AtomicOrdering SuccessOrdering = getDecodedOrdering(Record[OpNum + 1]);
- if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered)
+ if (SuccessOrdering == AtomicOrdering::NotAtomic ||
+ SuccessOrdering == AtomicOrdering::Unordered)
return error("Invalid record");
SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 2]);
@@ -5008,6 +5437,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Ptr, *Val;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ !isa<PointerType>(Ptr->getType()) ||
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+4 != Record.size())
@@ -5017,7 +5447,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
Operation > AtomicRMWInst::LAST_BINOP)
return error("Invalid record");
AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]);
- if (Ordering == NotAtomic || Ordering == Unordered)
+ if (Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Unordered)
return error("Invalid record");
SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
@@ -5029,8 +5460,9 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
if (2 != Record.size())
return error("Invalid record");
AtomicOrdering Ordering = getDecodedOrdering(Record[0]);
- if (Ordering == NotAtomic || Ordering == Unordered ||
- Ordering == Monotonic)
+ if (Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Unordered ||
+ Ordering == AtomicOrdering::Monotonic)
return error("Invalid record");
SynchronizationScope SynchScope = getDecodedSynchScope(Record[1]);
I = new FenceInst(Context, Ordering, SynchScope);
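Collecting the checks rewritten above in terms of the scoped AtomicOrdering enum, the reader enforces the following per-instruction rejections (this list simply restates the error paths visible in this patch):

// load atomic   : NotAtomic, Release, AcquireRelease are rejected
// store atomic  : NotAtomic, Acquire, AcquireRelease are rejected
// cmpxchg       : NotAtomic, Unordered are rejected (success ordering)
// atomicrmw     : NotAtomic, Unordered are rejected
// fence         : NotAtomic, Unordered, Monotonic are rejected
// For atomic loads and stores, a zero alignment field is additionally
// rejected whenever the ordering is not NotAtomic.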
@@ -5200,8 +5632,9 @@ OutOfRecordLoop:
}
}
- // FIXME: Check for unresolved forward-declared metadata references
- // and clean up leaks.
+ // Unexpected unresolved metadata about to be dropped.
+ if (MetadataList.hasFwdRefs())
+ return error("Invalid function metadata: outgoing forward refs");
// Trim the value list down to the size it was before we parsed this function.
ValueList.shrinkTo(ModuleValueListSize);
@@ -5235,13 +5668,6 @@ std::error_code BitcodeReader::findFunctionInStream(
void BitcodeReader::releaseBuffer() { Buffer.release(); }
std::error_code BitcodeReader::materialize(GlobalValue *GV) {
- // In older bitcode we must materialize the metadata before parsing
- // any functions, in order to set up the MetadataList properly.
- if (!SeenModuleValuesRecord) {
- if (std::error_code EC = materializeMetadata())
- return EC;
- }
-
Function *F = dyn_cast<Function>(GV);
// If it's not a function or is already material, ignore the request.
if (!F || !F->isMaterializable())
@@ -5255,6 +5681,10 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) {
if (std::error_code EC = findFunctionInStream(F, DFII))
return EC;
+ // Materialize metadata before parsing any function bodies.
+ if (std::error_code EC = materializeMetadata())
+ return EC;
+
// Move the bit stream to the saved position of the deferred function body.
Stream.JumpToBit(DFII->second);
@@ -5276,6 +5706,13 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) {
}
}
+ // Update calls to the remangled intrinsics
+ for (auto &I : RemangledIntrinsics)
+ for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end();
+ UI != UE;)
+ // We don't expect any users other than call sites.
+ CallSite(*UI++).setCalledFunction(I.second);
+
// Finish fn->subprogram upgrade for materialized functions.
if (DISubprogram *SP = FunctionsWithSPs.lookup(F))
F->setSubprogram(SP);
@@ -5310,6 +5747,11 @@ std::error_code BitcodeReader::materializeModule() {
if (!BasicBlockFwdRefs.empty())
return error("Never resolved function from blockaddress");
+ // Upgrading intrinsic calls before TBAA can cause TBAA metadata to be lost;
+ // to prevent this, instructions with TBAA tags should be upgraded first.
+ for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+ UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
// Upgrade any intrinsic calls that slipped through (should not happen!) and
// delete the old functions to clean up. We can't do this unless the entire
// module is materialized because there could always be another function body
@@ -5324,11 +5766,16 @@ std::error_code BitcodeReader::materializeModule() {
I.first->eraseFromParent();
}
UpgradedIntrinsics.clear();
-
- for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
- UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+ // Do the same for remangled intrinsics
+ for (auto &I : RemangledIntrinsics) {
+ I.first->replaceAllUsesWith(I.second);
+ I.first->eraseFromParent();
+ }
+ RemangledIntrinsics.clear();
UpgradeDebugInfo(*TheModule);
+
+ UpgradeModuleFlags(*TheModule);
return std::error_code();
}
@@ -5389,43 +5836,37 @@ BitcodeReader::initLazyStream(std::unique_ptr<DataStreamer> Streamer) {
return std::error_code();
}
-std::error_code FunctionIndexBitcodeReader::error(BitcodeError E,
- const Twine &Message) {
- return ::error(DiagnosticHandler, make_error_code(E), Message);
-}
-
-std::error_code FunctionIndexBitcodeReader::error(const Twine &Message) {
+std::error_code ModuleSummaryIndexBitcodeReader::error(const Twine &Message) {
return ::error(DiagnosticHandler,
make_error_code(BitcodeError::CorruptedBitcode), Message);
}
-std::error_code FunctionIndexBitcodeReader::error(BitcodeError E) {
- return ::error(DiagnosticHandler, make_error_code(E));
+ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader(
+ MemoryBuffer *Buffer, DiagnosticHandlerFunction DiagnosticHandler,
+ bool CheckGlobalValSummaryPresenceOnly)
+ : DiagnosticHandler(std::move(DiagnosticHandler)), Buffer(Buffer),
+ CheckGlobalValSummaryPresenceOnly(CheckGlobalValSummaryPresenceOnly) {}
+
+void ModuleSummaryIndexBitcodeReader::freeState() { Buffer = nullptr; }
+
+void ModuleSummaryIndexBitcodeReader::releaseBuffer() { Buffer.release(); }
+
+std::pair<GlobalValue::GUID, GlobalValue::GUID>
+ModuleSummaryIndexBitcodeReader::getGUIDFromValueId(unsigned ValueId) {
+ auto VGI = ValueIdToCallGraphGUIDMap.find(ValueId);
+ assert(VGI != ValueIdToCallGraphGUIDMap.end());
+ return VGI->second;
}
-FunctionIndexBitcodeReader::FunctionIndexBitcodeReader(
- MemoryBuffer *Buffer, DiagnosticHandlerFunction DiagnosticHandler,
- bool IsLazy, bool CheckFuncSummaryPresenceOnly)
- : DiagnosticHandler(DiagnosticHandler), Buffer(Buffer), IsLazy(IsLazy),
- CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {}
-
-FunctionIndexBitcodeReader::FunctionIndexBitcodeReader(
- DiagnosticHandlerFunction DiagnosticHandler, bool IsLazy,
- bool CheckFuncSummaryPresenceOnly)
- : DiagnosticHandler(DiagnosticHandler), Buffer(nullptr), IsLazy(IsLazy),
- CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {}
-
-void FunctionIndexBitcodeReader::freeState() { Buffer = nullptr; }
-
-void FunctionIndexBitcodeReader::releaseBuffer() { Buffer.release(); }
-
-// Specialized value symbol table parser used when reading function index
-// blocks where we don't actually create global values.
-// At the end of this routine the function index is populated with a map
-// from function name to FunctionInfo. The function info contains
-// the function block's bitcode offset as well as the offset into the
-// function summary section.
-std::error_code FunctionIndexBitcodeReader::parseValueSymbolTable() {
+// Specialized value symbol table parser used when reading module index
+// blocks where we don't actually create global values. The parsed information
+// is saved in the bitcode reader for use when later parsing summaries.
+std::error_code ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
+ uint64_t Offset,
+ DenseMap<unsigned, GlobalValue::LinkageTypes> &ValueIdToLinkageMap) {
+ assert(Offset > 0 && "Expected non-zero VST offset");
+ uint64_t CurrentBit = jumpToValueSymbolTable(Offset, Stream);
+
if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
return error("Invalid record");
@@ -5441,6 +5882,8 @@ std::error_code FunctionIndexBitcodeReader::parseValueSymbolTable() {
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
+ // Done parsing VST, jump back to wherever we came from.
+ Stream.JumpToBit(CurrentBit);
return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
@@ -5452,58 +5895,79 @@ std::error_code FunctionIndexBitcodeReader::parseValueSymbolTable() {
switch (Stream.readRecord(Entry.ID, Record)) {
default: // Default behavior: ignore (e.g. VST_CODE_BBENTRY records).
break;
- case bitc::VST_CODE_FNENTRY: {
- // VST_FNENTRY: [valueid, offset, namechar x N]
- if (convertToString(Record, 2, ValueName))
+ case bitc::VST_CODE_ENTRY: { // VST_CODE_ENTRY: [valueid, namechar x N]
+ if (convertToString(Record, 1, ValueName))
return error("Invalid record");
unsigned ValueID = Record[0];
- uint64_t FuncOffset = Record[1];
- std::unique_ptr<FunctionInfo> FuncInfo =
- llvm::make_unique<FunctionInfo>(FuncOffset);
- if (foundFuncSummary() && !IsLazy) {
- DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI =
- SummaryMap.find(ValueID);
- assert(SMI != SummaryMap.end() && "Summary info not found");
- FuncInfo->setFunctionSummary(std::move(SMI->second));
- }
- TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo));
-
+ assert(!SourceFileName.empty());
+ auto VLI = ValueIdToLinkageMap.find(ValueID);
+ assert(VLI != ValueIdToLinkageMap.end() &&
+ "No linkage found for VST entry?");
+ auto Linkage = VLI->second;
+ std::string GlobalId =
+ GlobalValue::getGlobalIdentifier(ValueName, Linkage, SourceFileName);
+ auto ValueGUID = GlobalValue::getGUID(GlobalId);
+ auto OriginalNameID = ValueGUID;
+ if (GlobalValue::isLocalLinkage(Linkage))
+ OriginalNameID = GlobalValue::getGUID(ValueName);
+ if (PrintSummaryGUIDs)
+ dbgs() << "GUID " << ValueGUID << "(" << OriginalNameID << ") is "
+ << ValueName << "\n";
+ ValueIdToCallGraphGUIDMap[ValueID] =
+ std::make_pair(ValueGUID, OriginalNameID);
ValueName.clear();
break;
}
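The VST_CODE_ENTRY handling above derives two GUIDs per symbol: one from the module-qualified global identifier and, for local-linkage symbols only, an "original name" GUID from the bare name. A sketch of what that means for a local function (symbol and file name invented; the exact qualification format produced by getGlobalIdentifier is taken as an assumption here):

// Internal-linkage function "foo" defined in source file "a.c":
//   GlobalId       = GlobalValue::getGlobalIdentifier("foo", InternalLinkage,
//                                                     "a.c");  // ~ "a.c:foo"
//   ValueGUID      = GlobalValue::getGUID(GlobalId); // hash of qualified name
//   OriginalNameID = GlobalValue::getGUID("foo");    // hash of the bare name
// For non-local linkage the two GUIDs coincide, which is why OriginalNameID is
// only recomputed when GlobalValue::isLocalLinkage(Linkage) holds.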
- case bitc::VST_CODE_COMBINED_FNENTRY: {
- // VST_FNENTRY: [offset, namechar x N]
- if (convertToString(Record, 1, ValueName))
+ case bitc::VST_CODE_FNENTRY: {
+ // VST_CODE_FNENTRY: [valueid, offset, namechar x N]
+ if (convertToString(Record, 2, ValueName))
return error("Invalid record");
- uint64_t FuncSummaryOffset = Record[0];
- std::unique_ptr<FunctionInfo> FuncInfo =
- llvm::make_unique<FunctionInfo>(FuncSummaryOffset);
- if (foundFuncSummary() && !IsLazy) {
- DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI =
- SummaryMap.find(FuncSummaryOffset);
- assert(SMI != SummaryMap.end() && "Summary info not found");
- FuncInfo->setFunctionSummary(std::move(SMI->second));
- }
- TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo));
+ unsigned ValueID = Record[0];
+ assert(!SourceFileName.empty());
+ auto VLI = ValueIdToLinkageMap.find(ValueID);
+ assert(VLI != ValueIdToLinkageMap.end() &&
+ "No linkage found for VST entry?");
+ auto Linkage = VLI->second;
+ std::string FunctionGlobalId = GlobalValue::getGlobalIdentifier(
+ ValueName, VLI->second, SourceFileName);
+ auto FunctionGUID = GlobalValue::getGUID(FunctionGlobalId);
+ auto OriginalNameID = FunctionGUID;
+ if (GlobalValue::isLocalLinkage(Linkage))
+ OriginalNameID = GlobalValue::getGUID(ValueName);
+ if (PrintSummaryGUIDs)
+ dbgs() << "GUID " << FunctionGUID << "(" << OriginalNameID << ") is "
+ << ValueName << "\n";
+ ValueIdToCallGraphGUIDMap[ValueID] =
+ std::make_pair(FunctionGUID, OriginalNameID);
ValueName.clear();
break;
}
+ case bitc::VST_CODE_COMBINED_ENTRY: {
+ // VST_CODE_COMBINED_ENTRY: [valueid, refguid]
+ unsigned ValueID = Record[0];
+ GlobalValue::GUID RefGUID = Record[1];
+ // The "original name", which is the second value of the pair will be
+ // overriden later by a FS_COMBINED_ORIGINAL_NAME in the combined index.
+ ValueIdToCallGraphGUIDMap[ValueID] = std::make_pair(RefGUID, RefGUID);
+ break;
+ }
}
}
}
-// Parse just the blocks needed for function index building out of the module.
-// At the end of this routine the function Index is populated with a map
-// from function name to FunctionInfo. The function info contains
-// either the parsed function summary information (when parsing summaries
-// eagerly), or just to the function summary record's offset
-// if parsing lazily (IsLazy).
-std::error_code FunctionIndexBitcodeReader::parseModule() {
+// Parse just the blocks needed for building the index out of the module.
+// At the end of this routine the module Index is populated with a map
+// from global value id to GlobalValueSummary objects.
+std::error_code ModuleSummaryIndexBitcodeReader::parseModule() {
if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
return error("Invalid record");
- // Read the function index for this module.
+ SmallVector<uint64_t, 64> Record;
+ DenseMap<unsigned, GlobalValue::LinkageTypes> ValueIdToLinkageMap;
+ unsigned ValueId = 0;
+
+ // Read the index for this module.
while (1) {
BitstreamEntry Entry = Stream.advance();
@@ -5514,9 +5978,9 @@ std::error_code FunctionIndexBitcodeReader::parseModule() {
return std::error_code();
case BitstreamEntry::SubBlock:
- if (CheckFuncSummaryPresenceOnly) {
- if (Entry.ID == bitc::FUNCTION_SUMMARY_BLOCK_ID) {
- SeenFuncSummary = true;
+ if (CheckGlobalValSummaryPresenceOnly) {
+ if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) {
+ SeenGlobalValSummary = true;
// No need to parse the rest since we found the summary.
return std::error_code();
}
@@ -5535,16 +5999,24 @@ std::error_code FunctionIndexBitcodeReader::parseModule() {
return error("Malformed block");
break;
case bitc::VALUE_SYMTAB_BLOCK_ID:
- if (std::error_code EC = parseValueSymbolTable())
- return EC;
+ // Should have been parsed earlier via VSTOffset, unless there
+ // is no summary section.
+ assert(((SeenValueSymbolTable && VSTOffset > 0) ||
+ !SeenGlobalValSummary) &&
+ "Expected early VST parse via VSTOffset record");
+ if (Stream.SkipBlock())
+ return error("Invalid record");
break;
- case bitc::FUNCTION_SUMMARY_BLOCK_ID:
- SeenFuncSummary = true;
- if (IsLazy) {
- // Lazy parsing of summary info, skip it.
- if (Stream.SkipBlock())
- return error("Invalid record");
- } else if (std::error_code EC = parseEntireSummary())
+ case bitc::GLOBALVAL_SUMMARY_BLOCK_ID:
+ assert(VSTOffset > 0 && "Expected non-zero VST offset");
+ assert(!SeenValueSymbolTable &&
+ "Already read VST when parsing summary block?");
+ if (std::error_code EC =
+ parseValueSymbolTable(VSTOffset, ValueIdToLinkageMap))
+ return EC;
+ SeenValueSymbolTable = true;
+ SeenGlobalValSummary = true;
+ if (std::error_code EC = parseEntireSummary())
return EC;
break;
case bitc::MODULE_STRTAB_BLOCK_ID:
@@ -5554,22 +6026,109 @@ std::error_code FunctionIndexBitcodeReader::parseModule() {
}
continue;
- case BitstreamEntry::Record:
- Stream.skipRecord(Entry.ID);
+ case BitstreamEntry::Record: {
+ Record.clear();
+ auto BitCode = Stream.readRecord(Entry.ID, Record);
+ switch (BitCode) {
+ default:
+ break; // Default behavior, ignore unknown content.
+ /// MODULE_CODE_SOURCE_FILENAME: [namechar x N]
+ case bitc::MODULE_CODE_SOURCE_FILENAME: {
+ SmallString<128> ValueName;
+ if (convertToString(Record, 0, ValueName))
+ return error("Invalid record");
+ SourceFileName = ValueName.c_str();
+ break;
+ }
+ /// MODULE_CODE_HASH: [5*i32]
+ case bitc::MODULE_CODE_HASH: {
+ if (Record.size() != 5)
+ return error("Invalid hash length " + Twine(Record.size()).str());
+ if (!TheIndex)
+ break;
+ if (TheIndex->modulePaths().empty())
+ // Does not have any summary emitted.
+ break;
+ if (TheIndex->modulePaths().size() != 1)
+ return error("Don't expect multiple modules defined?");
+ auto &Hash = TheIndex->modulePaths().begin()->second.second;
+ int Pos = 0;
+ for (auto &Val : Record) {
+ assert(!(Val >> 32) && "Unexpected high bits set");
+ Hash[Pos++] = Val;
+ }
+ break;
+ }
+ /// MODULE_CODE_VSTOFFSET: [offset]
+ case bitc::MODULE_CODE_VSTOFFSET:
+ if (Record.size() < 1)
+ return error("Invalid record");
+ VSTOffset = Record[0];
+ break;
+ // GLOBALVAR: [pointer type, isconst, initid,
+ // linkage, alignment, section, visibility, threadlocal,
+ // unnamed_addr, externally_initialized, dllstorageclass,
+ // comdat]
+ case bitc::MODULE_CODE_GLOBALVAR: {
+ if (Record.size() < 6)
+ return error("Invalid record");
+ uint64_t RawLinkage = Record[3];
+ GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+ ValueIdToLinkageMap[ValueId++] = Linkage;
+ break;
+ }
+ // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
+ // alignment, section, visibility, gc, unnamed_addr,
+ // prologuedata, dllstorageclass, comdat, prefixdata]
+ case bitc::MODULE_CODE_FUNCTION: {
+ if (Record.size() < 8)
+ return error("Invalid record");
+ uint64_t RawLinkage = Record[3];
+ GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+ ValueIdToLinkageMap[ValueId++] = Linkage;
+ break;
+ }
+ // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
+ // dllstorageclass]
+ case bitc::MODULE_CODE_ALIAS: {
+ if (Record.size() < 6)
+ return error("Invalid record");
+ uint64_t RawLinkage = Record[3];
+ GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+ ValueIdToLinkageMap[ValueId++] = Linkage;
+ break;
+ }
+ }
+ }
continue;
}
}
}
-// Eagerly parse the entire function summary block (i.e. for all functions
-// in the index). This populates the FunctionSummary objects in
-// the index.
-std::error_code FunctionIndexBitcodeReader::parseEntireSummary() {
- if (Stream.EnterSubBlock(bitc::FUNCTION_SUMMARY_BLOCK_ID))
+// Eagerly parse the entire summary block. This populates the GlobalValueSummary
+// objects in the index.
+std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() {
+ if (Stream.EnterSubBlock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID))
return error("Invalid record");
-
SmallVector<uint64_t, 64> Record;
+ // Parse version
+ {
+ BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+ if (Entry.Kind != BitstreamEntry::Record)
+ return error("Invalid Summary Block: record for version expected");
+ if (Stream.readRecord(Entry.ID, Record) != bitc::FS_VERSION)
+ return error("Invalid Summary Block: version expected");
+ }
+ const uint64_t Version = Record[0];
+ if (Version != 1)
+ return error("Invalid summary version " + Twine(Version) + ", 1 expected");
+ Record.clear();
+
+ // Keep around the last seen summary to be used when we see an optional
+ // "OriginalName" attachement.
+ GlobalValueSummary *LastSeenSummary = nullptr;
+ bool Combined = false;
while (1) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -5578,6 +6137,16 @@ std::error_code FunctionIndexBitcodeReader::parseEntireSummary() {
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
+ // For a per-module index, remove any entries that still have empty
+ // summaries. The VST parsing creates entries eagerly for all symbols,
+ // but not all have associated summaries (e.g. it doesn't know how to
+ // distinguish between VST_CODE_ENTRY for function declarations vs global
+ // variables with initializers that end up with a summary). Remove those
+ // entries now so that we don't need to rely on the combined index merger
+ // to clean them up (especially since that may not run for the first
+ // module's index if we merge into that).
+ if (!Combined)
+ TheIndex->removeEmptySummaryEntries();
return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
@@ -5592,35 +6161,197 @@ std::error_code FunctionIndexBitcodeReader::parseEntireSummary() {
// in the combined index VST entries). The records also contain
// information used for ThinLTO renaming and importing.
Record.clear();
- uint64_t CurRecordBit = Stream.GetCurrentBitNo();
- switch (Stream.readRecord(Entry.ID, Record)) {
+ auto BitCode = Stream.readRecord(Entry.ID, Record);
+ switch (BitCode) {
default: // Default behavior: ignore.
break;
- // FS_PERMODULE_ENTRY: [valueid, islocal, instcount]
- case bitc::FS_CODE_PERMODULE_ENTRY: {
+ // FS_PERMODULE: [valueid, flags, instcount, numrefs, numrefs x valueid,
+ // n x (valueid, callsitecount)]
+ // FS_PERMODULE_PROFILE: [valueid, flags, instcount, numrefs,
+ // numrefs x valueid,
+ // n x (valueid, callsitecount, profilecount)]
+ case bitc::FS_PERMODULE:
+ case bitc::FS_PERMODULE_PROFILE: {
unsigned ValueID = Record[0];
- bool IsLocal = Record[1];
+ uint64_t RawFlags = Record[1];
unsigned InstCount = Record[2];
+ unsigned NumRefs = Record[3];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
std::unique_ptr<FunctionSummary> FS =
- llvm::make_unique<FunctionSummary>(InstCount);
- FS->setLocalFunction(IsLocal);
+ llvm::make_unique<FunctionSummary>(Flags, InstCount);
// The module path string ref set in the summary must be owned by the
// index's module string table. Since we don't have a module path
// string table section in the per-module index, we create a single
// module path string table entry with an empty (0) ID to take
// ownership.
FS->setModulePath(
- TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0));
- SummaryMap[ValueID] = std::move(FS);
+ TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0)->first());
+ static int RefListStartIndex = 4;
+ int CallGraphEdgeStartIndex = RefListStartIndex + NumRefs;
+ assert(Record.size() >= RefListStartIndex + NumRefs &&
+ "Record size inconsistent with number of references");
+ for (unsigned I = 4, E = CallGraphEdgeStartIndex; I != E; ++I) {
+ unsigned RefValueId = Record[I];
+ GlobalValue::GUID RefGUID = getGUIDFromValueId(RefValueId).first;
+ FS->addRefEdge(RefGUID);
+ }
+ bool HasProfile = (BitCode == bitc::FS_PERMODULE_PROFILE);
+ for (unsigned I = CallGraphEdgeStartIndex, E = Record.size(); I != E;
+ ++I) {
+ unsigned CalleeValueId = Record[I];
+ unsigned CallsiteCount = Record[++I];
+ uint64_t ProfileCount = HasProfile ? Record[++I] : 0;
+ GlobalValue::GUID CalleeGUID = getGUIDFromValueId(CalleeValueId).first;
+ FS->addCallGraphEdge(CalleeGUID,
+ CalleeInfo(CallsiteCount, ProfileCount));
+ }
+ auto GUID = getGUIDFromValueId(ValueID);
+ FS->setOriginalName(GUID.second);
+ TheIndex->addGlobalValueSummary(GUID.first, std::move(FS));
+ break;
}
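Reading the FS_PERMODULE layout back from the code above: four fixed fields, then NumRefs reference value ids, then call-graph edges consumed in groups of two (or three with FS_PERMODULE_PROFILE). A made-up record, only to show how the indices line up:

// FS_PERMODULE Record = [7, flags, 42, 2,  3, 5,  9, 1,  11, 4]
//   Record[0] = 7         -> value id of the function being summarized
//   Record[1]             -> raw flags word (decoded by getDecodedGVSummaryFlags)
//   Record[2] = 42        -> instruction count
//   Record[3] = 2         -> NumRefs, so Record[4..5] = {3, 5} are ref value ids
//   Record[6..7] = {9, 1} -> call edge: callee value id 9, one call site
//   Record[8..9] = {11,4} -> call edge: callee value id 11, four call sites
// With FS_PERMODULE_PROFILE each edge group carries a third entry, the profile
// count, which is why the inner loop advances I twice or three times per edge.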
- // FS_COMBINED_ENTRY: [modid, instcount]
- case bitc::FS_CODE_COMBINED_ENTRY: {
- uint64_t ModuleId = Record[0];
- unsigned InstCount = Record[1];
+ // FS_ALIAS: [valueid, flags, valueid]
+ // Aliases must be emitted (and parsed) after all FS_PERMODULE entries, as
+ // they expect all aliasee summaries to be available.
+ case bitc::FS_ALIAS: {
+ unsigned ValueID = Record[0];
+ uint64_t RawFlags = Record[1];
+ unsigned AliaseeID = Record[2];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+ std::unique_ptr<AliasSummary> AS = llvm::make_unique<AliasSummary>(Flags);
+ // The module path string ref set in the summary must be owned by the
+ // index's module string table. Since we don't have a module path
+ // string table section in the per-module index, we create a single
+ // module path string table entry with an empty (0) ID to take
+ // ownership.
+ AS->setModulePath(
+ TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0)->first());
+
+ GlobalValue::GUID AliaseeGUID = getGUIDFromValueId(AliaseeID).first;
+ auto *AliaseeSummary = TheIndex->getGlobalValueSummary(AliaseeGUID);
+ if (!AliaseeSummary)
+ return error("Alias expects aliasee summary to be parsed");
+ AS->setAliasee(AliaseeSummary);
+
+ auto GUID = getGUIDFromValueId(ValueID);
+ AS->setOriginalName(GUID.second);
+ TheIndex->addGlobalValueSummary(GUID.first, std::move(AS));
+ break;
+ }
+ // FS_PERMODULE_GLOBALVAR_INIT_REFS: [valueid, flags, n x valueid]
+ case bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS: {
+ unsigned ValueID = Record[0];
+ uint64_t RawFlags = Record[1];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+ std::unique_ptr<GlobalVarSummary> FS =
+ llvm::make_unique<GlobalVarSummary>(Flags);
+ FS->setModulePath(
+ TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0)->first());
+ for (unsigned I = 2, E = Record.size(); I != E; ++I) {
+ unsigned RefValueId = Record[I];
+ GlobalValue::GUID RefGUID = getGUIDFromValueId(RefValueId).first;
+ FS->addRefEdge(RefGUID);
+ }
+ auto GUID = getGUIDFromValueId(ValueID);
+ FS->setOriginalName(GUID.second);
+ TheIndex->addGlobalValueSummary(GUID.first, std::move(FS));
+ break;
+ }
+ // FS_COMBINED: [valueid, modid, flags, instcount, numrefs,
+ // numrefs x valueid, n x (valueid, callsitecount)]
+ // FS_COMBINED_PROFILE: [valueid, modid, flags, instcount, numrefs,
+ // numrefs x valueid,
+ // n x (valueid, callsitecount, profilecount)]
+ case bitc::FS_COMBINED:
+ case bitc::FS_COMBINED_PROFILE: {
+ unsigned ValueID = Record[0];
+ uint64_t ModuleId = Record[1];
+ uint64_t RawFlags = Record[2];
+ unsigned InstCount = Record[3];
+ unsigned NumRefs = Record[4];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
std::unique_ptr<FunctionSummary> FS =
- llvm::make_unique<FunctionSummary>(InstCount);
+ llvm::make_unique<FunctionSummary>(Flags, InstCount);
+ LastSeenSummary = FS.get();
+ FS->setModulePath(ModuleIdMap[ModuleId]);
+ static int RefListStartIndex = 5;
+ int CallGraphEdgeStartIndex = RefListStartIndex + NumRefs;
+ assert(Record.size() >= RefListStartIndex + NumRefs &&
+ "Record size inconsistent with number of references");
+ for (unsigned I = RefListStartIndex, E = CallGraphEdgeStartIndex; I != E;
+ ++I) {
+ unsigned RefValueId = Record[I];
+ GlobalValue::GUID RefGUID = getGUIDFromValueId(RefValueId).first;
+ FS->addRefEdge(RefGUID);
+ }
+ bool HasProfile = (BitCode == bitc::FS_COMBINED_PROFILE);
+ for (unsigned I = CallGraphEdgeStartIndex, E = Record.size(); I != E;
+ ++I) {
+ unsigned CalleeValueId = Record[I];
+ unsigned CallsiteCount = Record[++I];
+ uint64_t ProfileCount = HasProfile ? Record[++I] : 0;
+ GlobalValue::GUID CalleeGUID = getGUIDFromValueId(CalleeValueId).first;
+ FS->addCallGraphEdge(CalleeGUID,
+ CalleeInfo(CallsiteCount, ProfileCount));
+ }
+ GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ TheIndex->addGlobalValueSummary(GUID, std::move(FS));
+ Combined = true;
+ break;
+ }
+ // FS_COMBINED_ALIAS: [valueid, modid, flags, valueid]
+ // Aliases must be emitted (and parsed) after all FS_COMBINED entries, as
+ // they expect all aliasee summaries to be available.
+ case bitc::FS_COMBINED_ALIAS: {
+ unsigned ValueID = Record[0];
+ uint64_t ModuleId = Record[1];
+ uint64_t RawFlags = Record[2];
+ unsigned AliaseeValueId = Record[3];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+ std::unique_ptr<AliasSummary> AS = llvm::make_unique<AliasSummary>(Flags);
+ LastSeenSummary = AS.get();
+ AS->setModulePath(ModuleIdMap[ModuleId]);
+
+ auto AliaseeGUID = getGUIDFromValueId(AliaseeValueId).first;
+ auto AliaseeInModule =
+ TheIndex->findSummaryInModule(AliaseeGUID, AS->modulePath());
+ if (!AliaseeInModule)
+ return error("Alias expects aliasee summary to be parsed");
+ AS->setAliasee(AliaseeInModule);
+
+ GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ TheIndex->addGlobalValueSummary(GUID, std::move(AS));
+ Combined = true;
+ break;
+ }
+ // FS_COMBINED_GLOBALVAR_INIT_REFS: [valueid, modid, flags, n x valueid]
+ case bitc::FS_COMBINED_GLOBALVAR_INIT_REFS: {
+ unsigned ValueID = Record[0];
+ uint64_t ModuleId = Record[1];
+ uint64_t RawFlags = Record[2];
+ auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
+ std::unique_ptr<GlobalVarSummary> FS =
+ llvm::make_unique<GlobalVarSummary>(Flags);
+ LastSeenSummary = FS.get();
FS->setModulePath(ModuleIdMap[ModuleId]);
- SummaryMap[CurRecordBit] = std::move(FS);
+ for (unsigned I = 3, E = Record.size(); I != E; ++I) {
+ unsigned RefValueId = Record[I];
+ GlobalValue::GUID RefGUID = getGUIDFromValueId(RefValueId).first;
+ FS->addRefEdge(RefGUID);
+ }
+ GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ TheIndex->addGlobalValueSummary(GUID, std::move(FS));
+ Combined = true;
+ break;
+ }
+ // FS_COMBINED_ORIGINAL_NAME: [original_name]
+ case bitc::FS_COMBINED_ORIGINAL_NAME: {
+ uint64_t OriginalName = Record[0];
+ if (!LastSeenSummary)
+ return error("Name attachment that does not follow a combined record");
+ LastSeenSummary->setOriginalName(OriginalName);
+ // Reset the LastSeenSummary
+ LastSeenSummary = nullptr;
}
}
}
@@ -5629,13 +6360,14 @@ std::error_code FunctionIndexBitcodeReader::parseEntireSummary() {
// Parse the module string table block into the Index.
// This populates the ModulePathStringTable map in the index.
-std::error_code FunctionIndexBitcodeReader::parseModuleStringTable() {
+std::error_code ModuleSummaryIndexBitcodeReader::parseModuleStringTable() {
if (Stream.EnterSubBlock(bitc::MODULE_STRTAB_BLOCK_ID))
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
SmallString<128> ModulePath;
+ ModulePathStringTableTy::iterator LastSeenModulePath;
while (1) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -5656,22 +6388,40 @@ std::error_code FunctionIndexBitcodeReader::parseModuleStringTable() {
break;
case bitc::MST_CODE_ENTRY: {
// MST_ENTRY: [modid, namechar x N]
+ uint64_t ModuleId = Record[0];
+
if (convertToString(Record, 1, ModulePath))
return error("Invalid record");
- uint64_t ModuleId = Record[0];
- StringRef ModulePathInMap = TheIndex->addModulePath(ModulePath, ModuleId);
- ModuleIdMap[ModuleId] = ModulePathInMap;
+
+ LastSeenModulePath = TheIndex->addModulePath(ModulePath, ModuleId);
+ ModuleIdMap[ModuleId] = LastSeenModulePath->first();
+
ModulePath.clear();
break;
}
+ /// MST_CODE_HASH: [5*i32]
+ case bitc::MST_CODE_HASH: {
+ if (Record.size() != 5)
+ return error("Invalid hash length " + Twine(Record.size()).str());
+ if (LastSeenModulePath == TheIndex->modulePaths().end())
+ return error("Invalid hash that does not follow a module path");
+ int Pos = 0;
+ for (auto &Val : Record) {
+ assert(!(Val >> 32) && "Unexpected high bits set");
+ LastSeenModulePath->second.second[Pos++] = Val;
+ }
+ // Reset LastSeenModulePath to avoid overriding the hash unexpectedly.
+ LastSeenModulePath = TheIndex->modulePaths().end();
+ break;
+ }
}
}
llvm_unreachable("Exit infinite loop");
}
-// Parse the function info index from the bitcode streamer into the given index.
+// Parse the module summary index from the bitcode streamer into the given index.
-std::error_code FunctionIndexBitcodeReader::parseSummaryIndexInto(
- std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I) {
+std::error_code ModuleSummaryIndexBitcodeReader::parseSummaryIndexInto(
+ std::unique_ptr<DataStreamer> Streamer, ModuleSummaryIndex *I) {
TheIndex = I;
if (std::error_code EC = initStream(std::move(Streamer)))
@@ -5705,55 +6455,14 @@ std::error_code FunctionIndexBitcodeReader::parseSummaryIndexInto(
}
}
-// Parse the function information at the given offset in the buffer into
-// the index. Used to support lazy parsing of function summaries from the
-// combined index during importing.
-// TODO: This function is not yet complete as it won't have a consumer
-// until ThinLTO function importing is added.
-std::error_code FunctionIndexBitcodeReader::parseFunctionSummary(
- std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I,
- size_t FunctionSummaryOffset) {
- TheIndex = I;
-
- if (std::error_code EC = initStream(std::move(Streamer)))
- return EC;
-
- // Sniff for the signature.
- if (!hasValidBitcodeHeader(Stream))
- return error("Invalid bitcode signature");
-
- Stream.JumpToBit(FunctionSummaryOffset);
-
- BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
-
- switch (Entry.Kind) {
- default:
- return error("Malformed block");
- case BitstreamEntry::Record:
- // The expected case.
- break;
- }
-
- // TODO: Read a record. This interface will be completed when ThinLTO
- // importing is added so that it can be tested.
- SmallVector<uint64_t, 64> Record;
- switch (Stream.readRecord(Entry.ID, Record)) {
- case bitc::FS_CODE_COMBINED_ENTRY:
- default:
- return error("Invalid record");
- }
-
- return std::error_code();
-}
-
-std::error_code
-FunctionIndexBitcodeReader::initStream(std::unique_ptr<DataStreamer> Streamer) {
+std::error_code ModuleSummaryIndexBitcodeReader::initStream(
+ std::unique_ptr<DataStreamer> Streamer) {
if (Streamer)
return initLazyStream(std::move(Streamer));
return initStreamFromBuffer();
}
-std::error_code FunctionIndexBitcodeReader::initStreamFromBuffer() {
+std::error_code ModuleSummaryIndexBitcodeReader::initStreamFromBuffer() {
const unsigned char *BufPtr = (const unsigned char *)Buffer->getBufferStart();
const unsigned char *BufEnd = BufPtr + Buffer->getBufferSize();
@@ -5772,7 +6481,7 @@ std::error_code FunctionIndexBitcodeReader::initStreamFromBuffer() {
return std::error_code();
}
-std::error_code FunctionIndexBitcodeReader::initLazyStream(
+std::error_code ModuleSummaryIndexBitcodeReader::initLazyStream(
std::unique_ptr<DataStreamer> Streamer) {
// Check and strip off the bitcode wrapper; BitstreamReader expects never to
// see it.
@@ -5800,6 +6509,9 @@ std::error_code FunctionIndexBitcodeReader::initLazyStream(
}
namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class BitcodeErrorCategoryType : public std::error_category {
const char *name() const LLVM_NOEXCEPT override {
return "llvm.bitcode";
@@ -5815,7 +6527,7 @@ class BitcodeErrorCategoryType : public std::error_category {
llvm_unreachable("Unknown error type!");
}
};
-}
+} // end anonymous namespace
static ManagedStatic<BitcodeErrorCategoryType> ErrorCategory;
@@ -5916,6 +6628,16 @@ std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer,
return Triple.get();
}
+bool llvm::isBitcodeContainingObjCCategory(MemoryBufferRef Buffer,
+ LLVMContext &Context) {
+ std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+ auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context);
+ ErrorOr<bool> hasObjCCategory = R->hasObjCCategory();
+ if (hasObjCCategory.getError())
+ return false;
+ return hasObjCCategory.get();
+}
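A minimal caller-side sketch of the new query (the file name is a placeholder, error handling is elided, and it is assumed the declaration is exported through llvm/Bitcode/ReaderWriter.h like the other reader entry points):

#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;

static bool memberNeedsObjCLoadSemantics(LLVMContext &Ctx) {
  ErrorOr<std::unique_ptr<MemoryBuffer>> BufOr =
      MemoryBuffer::getFile("member.bc");
  if (!BufOr)
    return false; // A real client would report BufOr.getError() instead.
  return isBitcodeContainingObjCCategory((*BufOr)->getMemBufferRef(), Ctx);
}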
+
std::string llvm::getBitcodeProducerString(MemoryBufferRef Buffer,
LLVMContext &Context) {
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
@@ -5927,18 +6649,13 @@ std::string llvm::getBitcodeProducerString(MemoryBufferRef Buffer,
}
-// Parse the specified bitcode buffer, returning the function info index.
+// Parse the specified bitcode buffer, returning the module summary index.
-// If IsLazy is false, parse the entire function summary into
-// the index. Otherwise skip the function summary section, and only create
-// an index object with a map from function name to function summary offset.
-// The index is used to perform lazy function summary reading later.
-ErrorOr<std::unique_ptr<FunctionInfoIndex>>
-llvm::getFunctionInfoIndex(MemoryBufferRef Buffer,
- DiagnosticHandlerFunction DiagnosticHandler,
- bool IsLazy) {
+ErrorOr<std::unique_ptr<ModuleSummaryIndex>> llvm::getModuleSummaryIndex(
+ MemoryBufferRef Buffer,
+ const DiagnosticHandlerFunction &DiagnosticHandler) {
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
- FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy);
+ ModuleSummaryIndexBitcodeReader R(Buf.get(), DiagnosticHandler);
- auto Index = llvm::make_unique<FunctionInfoIndex>();
+ auto Index = llvm::make_unique<ModuleSummaryIndex>();
auto cleanupOnError = [&](std::error_code EC) {
R.releaseBuffer(); // Never take ownership on error.
@@ -5948,15 +6665,16 @@ llvm::getFunctionInfoIndex(MemoryBufferRef Buffer,
if (std::error_code EC = R.parseSummaryIndexInto(nullptr, Index.get()))
return cleanupOnError(EC);
- Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+ Buf.release(); // The ModuleSummaryIndexBitcodeReader owns it now.
return std::move(Index);
}
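A hypothetical use of the renamed entry point (buffer loading and the diagnostic handler are stand-ins; only the call shape is meant to match the signature above):

auto DiagHandler = [](const DiagnosticInfo &DI) {
  // A real client would print or collect the diagnostic here.
};
ErrorOr<std::unique_ptr<ModuleSummaryIndex>> IndexOrErr =
    getModuleSummaryIndex(Buf->getMemBufferRef(), DiagHandler);
if (std::error_code EC = IndexOrErr.getError())
  report_fatal_error("summary parsing failed: " + EC.message());
std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexOrErr);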
-// Check if the given bitcode buffer contains a function summary block.
-bool llvm::hasFunctionSummary(MemoryBufferRef Buffer,
- DiagnosticHandlerFunction DiagnosticHandler) {
+// Check if the given bitcode buffer contains a global value summary block.
+bool llvm::hasGlobalValueSummary(
+ MemoryBufferRef Buffer,
+ const DiagnosticHandlerFunction &DiagnosticHandler) {
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
- FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, false, true);
+ ModuleSummaryIndexBitcodeReader R(Buf.get(), DiagnosticHandler, true);
auto cleanupOnError = [&](std::error_code EC) {
R.releaseBuffer(); // Never take ownership on error.
@@ -5966,38 +6684,6 @@ bool llvm::hasFunctionSummary(MemoryBufferRef Buffer,
if (std::error_code EC = R.parseSummaryIndexInto(nullptr, nullptr))
return cleanupOnError(EC);
- Buf.release(); // The FunctionIndexBitcodeReader owns it now.
- return R.foundFuncSummary();
-}
-
-// This method supports lazy reading of function summary data from the combined
-// index during ThinLTO function importing. When reading the combined index
-// file, getFunctionInfoIndex is first invoked with IsLazy=true.
-// Then this method is called for each function considered for importing,
-// to parse the summary information for the given function name into
-// the index.
-std::error_code llvm::readFunctionSummary(
- MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler,
- StringRef FunctionName, std::unique_ptr<FunctionInfoIndex> Index) {
- std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
- FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler);
-
- auto cleanupOnError = [&](std::error_code EC) {
- R.releaseBuffer(); // Never take ownership on error.
- return EC;
- };
-
- // Lookup the given function name in the FunctionMap, which may
- // contain a list of function infos in the case of a COMDAT. Walk through
- // and parse each function summary info at the function summary offset
- // recorded when parsing the value symbol table.
- for (const auto &FI : Index->getFunctionInfoList(FunctionName)) {
- size_t FunctionSummaryOffset = FI->bitcodeIndex();
- if (std::error_code EC =
- R.parseFunctionSummary(nullptr, Index.get(), FunctionSummaryOffset))
- return cleanupOnError(EC);
- }
-
- Buf.release(); // The FunctionIndexBitcodeReader owns it now.
- return std::error_code();
+ Buf.release(); // The ModuleSummaryIndexBitcodeReader owns it now.
+ return R.foundGlobalValSummary();
}
diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp
index a103fbdf4a93..60360d2ef78f 100644
--- a/lib/Bitcode/Reader/BitstreamReader.cpp
+++ b/lib/Bitcode/Reader/BitstreamReader.cpp
@@ -32,7 +32,7 @@ bool BitstreamCursor::EnterSubBlock(unsigned BlockID, unsigned *NumWordsP) {
// Add the abbrevs specific to this block to the CurAbbrevs list.
if (const BitstreamReader::BlockInfo *Info =
- BitStream->getBlockInfo(BlockID)) {
+ getBitStreamReader()->getBlockInfo(BlockID)) {
CurAbbrevs.insert(CurAbbrevs.end(), Info->Abbrevs.begin(),
Info->Abbrevs.end());
}
@@ -131,8 +131,25 @@ void BitstreamCursor::skipRecord(unsigned AbbrevID) {
const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i);
// Read all the elements.
- for (; NumElts; --NumElts)
- skipAbbreviatedField(*this, EltEnc);
+ // Decode the value as we are commanded.
+ switch (EltEnc.getEncoding()) {
+ default:
+ report_fatal_error("Array element type can't be an Array or a Blob");
+ case BitCodeAbbrevOp::Fixed:
+ assert((unsigned)Op.getEncodingData() <= MaxChunkSize);
+ for (; NumElts; --NumElts)
+ Read((unsigned)EltEnc.getEncodingData());
+ break;
+ case BitCodeAbbrevOp::VBR:
+ assert((unsigned)Op.getEncodingData() <= MaxChunkSize);
+ for (; NumElts; --NumElts)
+ ReadVBR64((unsigned)EltEnc.getEncodingData());
+ break;
+ case BitCodeAbbrevOp::Char6:
+ for (; NumElts; --NumElts)
+ Read(6);
+ break;
+ }
continue;
}
@@ -147,7 +164,7 @@ void BitstreamCursor::skipRecord(unsigned AbbrevID) {
// If this would read off the end of the bitcode file, just set the
// record to empty and return.
if (!canSkipToPos(NewEnd/8)) {
- NextChar = BitStream->getBitcodeBytes().getExtent();
+ skipToEnd();
break;
}
@@ -206,13 +223,23 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
if (!EltEnc.isEncoding())
report_fatal_error(
"Array element type has to be an encoding of a type");
- if (EltEnc.getEncoding() == BitCodeAbbrevOp::Array ||
- EltEnc.getEncoding() == BitCodeAbbrevOp::Blob)
- report_fatal_error("Array element type can't be an Array or a Blob");
// Read all the elements.
- for (; NumElts; --NumElts)
- Vals.push_back(readAbbreviatedField(*this, EltEnc));
+ switch (EltEnc.getEncoding()) {
+ default:
+ report_fatal_error("Array element type can't be an Array or a Blob");
+ case BitCodeAbbrevOp::Fixed:
+ for (; NumElts; --NumElts)
+ Vals.push_back(Read((unsigned)EltEnc.getEncodingData()));
+ break;
+ case BitCodeAbbrevOp::VBR:
+ for (; NumElts; --NumElts)
+ Vals.push_back(ReadVBR64((unsigned)EltEnc.getEncodingData()));
+ break;
+ case BitCodeAbbrevOp::Char6:
+ for (; NumElts; --NumElts)
+ Vals.push_back(BitCodeAbbrevOp::DecodeChar6(Read(6)));
+ }
continue;
}
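The Char6 case above turns each 6-bit value back into a character via BitCodeAbbrevOp::DecodeChar6. A standalone sketch of that mapping, assuming the standard bitstream Char6 alphabet ([a-z], [A-Z], [0-9], '.', '_'):

    #include <cassert>

    // 0-25 -> 'a'..'z', 26-51 -> 'A'..'Z', 52-61 -> '0'..'9', 62 -> '.', 63 -> '_'.
    static char decodeChar6(unsigned V) {
      assert(V < 64 && "Char6 values are 6 bits wide");
      if (V < 26) return static_cast<char>('a' + V);
      if (V < 52) return static_cast<char>('A' + (V - 26));
      if (V < 62) return static_cast<char>('0' + (V - 52));
      return V == 62 ? '.' : '_';
    }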
@@ -229,13 +256,15 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
// record to empty and return.
if (!canSkipToPos(NewEnd/8)) {
Vals.append(NumElts, 0);
- NextChar = BitStream->getBitcodeBytes().getExtent();
+ skipToEnd();
break;
}
- // Otherwise, inform the streamer that we need these bytes in memory.
- const char *Ptr = (const char*)
- BitStream->getBitcodeBytes().getPointer(CurBitPos/8, NumElts);
+ // Otherwise, inform the streamer that we need these bytes in memory. Skip
+ // over tail padding first, in case jumping to NewEnd invalidates the Blob
+ // pointer.
+ JumpToBit(NewEnd);
+ const char *Ptr = (const char *)getPointerToBit(CurBitPos, NumElts);
// If we can return a reference to the data, do so to avoid copying it.
if (Blob) {
@@ -245,8 +274,6 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
for (; NumElts; --NumElts)
Vals.push_back((unsigned char)*Ptr++);
}
- // Skip over tail padding.
- JumpToBit(NewEnd);
}
return Code;
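The reordering above jumps past the blob's tail padding before taking the data pointer, because asking the lazily loaded stream for later bytes can invalidate a pointer obtained earlier. The post-blob position can be computed up front since blob payloads are padded to a 32-bit boundary; a generic sketch of that computation (not the LLVM helper itself):

    #include <cstdint>

    // First bit after a blob of NumBytes bytes starting at CurBitPos,
    // assuming the payload is padded out to the next 32-bit boundary.
    static uint64_t blobEndBit(uint64_t CurBitPos, uint64_t NumBytes) {
      uint64_t RawEnd = CurBitPos + NumBytes * 8;
      return (RawEnd + 31) & ~uint64_t(31); // round up to 32 bits
    }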
@@ -293,7 +320,7 @@ void BitstreamCursor::ReadAbbrevRecord() {
bool BitstreamCursor::ReadBlockInfoBlock() {
// If this is the second stream to get to the block info block, skip it.
- if (BitStream->hasBlockInfoRecords())
+ if (getBitStreamReader()->hasBlockInfoRecords())
return SkipBlock();
if (EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) return true;
@@ -334,11 +361,13 @@ bool BitstreamCursor::ReadBlockInfoBlock() {
default: break; // Default behavior, ignore unknown content.
case bitc::BLOCKINFO_CODE_SETBID:
if (Record.size() < 1) return true;
- CurBlockInfo = &BitStream->getOrCreateBlockInfo((unsigned)Record[0]);
+ CurBlockInfo =
+ &getBitStreamReader()->getOrCreateBlockInfo((unsigned)Record[0]);
break;
case bitc::BLOCKINFO_CODE_BLOCKNAME: {
if (!CurBlockInfo) return true;
- if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name.
+ if (getBitStreamReader()->isIgnoringBlockInfoNames())
+ break; // Ignore name.
std::string Name;
for (unsigned i = 0, e = Record.size(); i != e; ++i)
Name += (char)Record[i];
@@ -347,7 +376,8 @@ bool BitstreamCursor::ReadBlockInfoBlock() {
}
case bitc::BLOCKINFO_CODE_SETRECORDNAME: {
if (!CurBlockInfo) return true;
- if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name.
+ if (getBitStreamReader()->isIgnoringBlockInfoNames())
+ break; // Ignore name.
std::string Name;
for (unsigned i = 1, e = Record.size(); i != e; ++i)
Name += (char)Record[i];
diff --git a/lib/Bitcode/Reader/Makefile b/lib/Bitcode/Reader/Makefile
deleted file mode 100644
index 59af8d53a73e..000000000000
--- a/lib/Bitcode/Reader/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Bitcode/Reader/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMBitReader
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index a899a0cc3ee4..dcb8b58cd7b3 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Bitcode/ReaderWriter.h"
#include "ValueEnumerator.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitstreamWriter.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -24,20 +24,20 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/UseListOrder.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/SHA1.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
#include <map>
using namespace llvm;
+namespace {
/// These are manifest constants used by the bitcode writer. They do not need to
/// be kept in sync with the reader, but need to be consistent within this file.
enum {
@@ -64,7 +64,455 @@ enum {
FUNCTION_INST_GEP_ABBREV,
};
-static unsigned GetEncodedCastOpcode(unsigned Opcode) {
+/// Abstract class to manage the bitcode writing, subclassed for each bitcode
+/// file type. Owns the BitstreamWriter, and includes the main entry point for
+/// writing.
+class BitcodeWriter {
+protected:
+ /// Pointer to the buffer allocated by caller for bitcode writing.
+ const SmallVectorImpl<char> &Buffer;
+
+  /// The stream created and owned by the BitcodeWriter.
+ BitstreamWriter Stream;
+
+ /// Saves the offset of the VSTOffset record that must eventually be
+ /// backpatched with the offset of the actual VST.
+ uint64_t VSTOffsetPlaceholder = 0;
+
+public:
+  /// Constructs a BitcodeWriter object and initializes a BitstreamWriter,
+ /// writing to the provided \p Buffer.
+ BitcodeWriter(SmallVectorImpl<char> &Buffer)
+ : Buffer(Buffer), Stream(Buffer) {}
+
+ virtual ~BitcodeWriter() = default;
+
+ /// Main entry point to write the bitcode file, which writes the bitcode
+ /// header and will then invoke the virtual writeBlocks() method.
+ void write();
+
+private:
+ /// Derived classes must implement this to write the corresponding blocks for
+ /// that bitcode file type.
+ virtual void writeBlocks() = 0;
+
+protected:
+ bool hasVSTOffsetPlaceholder() { return VSTOffsetPlaceholder != 0; }
+ void writeValueSymbolTableForwardDecl();
+ void writeBitcodeHeader();
+};
+
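+BitcodeWriter::write() is a template method: the base class owns the stream and the common header emission, while writeBlocks() is the per-file-type hook. A minimal LLVM-independent sketch of that shape (every name here is illustrative except writeBlocks):

    struct WriterBase {
      virtual ~WriterBase() = default;
      void write() {
        writeHeader();  // common bitcode header, same for every file type
        writeBlocks();  // module-specific vs. combined-index-specific blocks
      }

    private:
      void writeHeader() { /* e.g. emit the 'BC' 0xC0 0xDE magic */ }
      virtual void writeBlocks() = 0;
    };

    struct ModuleWriter final : WriterBase {
    private:
      void writeBlocks() override { /* write the module's blocks */ }
    };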
+/// Class to manage the bitcode writing for a module.
+class ModuleBitcodeWriter : public BitcodeWriter {
+ /// The Module to write to bitcode.
+ const Module &M;
+
+ /// Enumerates ids for all values in the module.
+ ValueEnumerator VE;
+
+ /// Optional per-module index to write for ThinLTO.
+ const ModuleSummaryIndex *Index;
+
+ /// True if a module hash record should be written.
+ bool GenerateHash;
+
+ /// The start bit of the module block, for use in generating a module hash
+ uint64_t BitcodeStartBit = 0;
+
+ /// Map that holds the correspondence between GUIDs in the summary index,
+ /// that came from indirect call profiles, and a value id generated by this
+ /// class to use in the VST and summary block records.
+ std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
+
+  /// Tracks the last value id recorded in the GUIDToValueIdMap.
+ unsigned GlobalValueId;
+
+public:
+ /// Constructs a ModuleBitcodeWriter object for the given Module,
+ /// writing to the provided \p Buffer.
+ ModuleBitcodeWriter(const Module *M, SmallVectorImpl<char> &Buffer,
+ bool ShouldPreserveUseListOrder,
+ const ModuleSummaryIndex *Index, bool GenerateHash)
+ : BitcodeWriter(Buffer), M(*M), VE(*M, ShouldPreserveUseListOrder),
+ Index(Index), GenerateHash(GenerateHash) {
+ // Save the start bit of the actual bitcode, in case there is space
+ // saved at the start for the darwin header above. The reader stream
+ // will start at the bitcode, and we need the offset of the VST
+ // to line up.
+ BitcodeStartBit = Stream.GetCurrentBitNo();
+
+ // Assign ValueIds to any callee values in the index that came from
+ // indirect call profiles and were recorded as a GUID not a Value*
+ // (which would have been assigned an ID by the ValueEnumerator).
+ // The starting ValueId is just after the number of values in the
+ // ValueEnumerator, so that they can be emitted in the VST.
+ GlobalValueId = VE.getValues().size();
+ if (Index)
+ for (const auto &GUIDSummaryLists : *Index)
+ // Examine all summaries for this GUID.
+ for (auto &Summary : GUIDSummaryLists.second)
+ if (auto FS = dyn_cast<FunctionSummary>(Summary.get()))
+ // For each call in the function summary, see if the call
+ // is to a GUID (which means it is for an indirect call,
+ // otherwise we would have a Value for it). If so, synthesize
+ // a value id.
+ for (auto &CallEdge : FS->calls())
+ if (CallEdge.first.isGUID())
+ assignValueId(CallEdge.first.getGUID());
+ }
+
+private:
+ /// Main entry point for writing a module to bitcode, invoked by
+ /// BitcodeWriter::write() after it writes the header.
+ void writeBlocks() override;
+
+ /// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the
+ /// current llvm version, and a record for the epoch number.
+ void writeIdentificationBlock();
+
+ /// Emit the current module to the bitstream.
+ void writeModule();
+
+ uint64_t bitcodeStartBit() { return BitcodeStartBit; }
+
+ void writeStringRecord(unsigned Code, StringRef Str, unsigned AbbrevToUse);
+ void writeAttributeGroupTable();
+ void writeAttributeTable();
+ void writeTypeTable();
+ void writeComdats();
+ void writeModuleInfo();
+ void writeValueAsMetadata(const ValueAsMetadata *MD,
+ SmallVectorImpl<uint64_t> &Record);
+ void writeMDTuple(const MDTuple *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ unsigned createDILocationAbbrev();
+ void writeDILocation(const DILocation *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned &Abbrev);
+ unsigned createGenericDINodeAbbrev();
+ void writeGenericDINode(const GenericDINode *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned &Abbrev);
+ void writeDISubrange(const DISubrange *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIEnumerator(const DIEnumerator *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDIBasicType(const DIBasicType *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIDerivedType(const DIDerivedType *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDICompositeType(const DICompositeType *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDISubroutineType(const DISubroutineType *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIFile(const DIFile *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDICompileUnit(const DICompileUnit *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDISubprogram(const DISubprogram *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDILexicalBlock(const DILexicalBlock *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDILexicalBlockFile(const DILexicalBlockFile *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDINamespace(const DINamespace *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIMacro(const DIMacro *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIMacroFile(const DIMacroFile *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIModule(const DIModule *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDITemplateTypeParameter(const DITemplateTypeParameter *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDITemplateValueParameter(const DITemplateValueParameter *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDIGlobalVariable(const DIGlobalVariable *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ void writeDILocalVariable(const DILocalVariable *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDIExpression(const DIExpression *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDIObjCProperty(const DIObjCProperty *N,
+ SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+ void writeDIImportedEntity(const DIImportedEntity *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev);
+ unsigned createNamedMetadataAbbrev();
+ void writeNamedMetadata(SmallVectorImpl<uint64_t> &Record);
+ unsigned createMetadataStringsAbbrev();
+ void writeMetadataStrings(ArrayRef<const Metadata *> Strings,
+ SmallVectorImpl<uint64_t> &Record);
+ void writeMetadataRecords(ArrayRef<const Metadata *> MDs,
+ SmallVectorImpl<uint64_t> &Record);
+ void writeModuleMetadata();
+ void writeFunctionMetadata(const Function &F);
+ void writeFunctionMetadataAttachment(const Function &F);
+ void writeGlobalVariableMetadataAttachment(const GlobalVariable &GV);
+ void pushGlobalMetadataAttachment(SmallVectorImpl<uint64_t> &Record,
+ const GlobalObject &GO);
+ void writeModuleMetadataKinds();
+ void writeOperandBundleTags();
+ void writeConstants(unsigned FirstVal, unsigned LastVal, bool isGlobal);
+ void writeModuleConstants();
+ bool pushValueAndType(const Value *V, unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals);
+ void writeOperandBundles(ImmutableCallSite CS, unsigned InstID);
+ void pushValue(const Value *V, unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals);
+ void pushValueSigned(const Value *V, unsigned InstID,
+ SmallVectorImpl<uint64_t> &Vals);
+ void writeInstruction(const Instruction &I, unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals);
+ void writeValueSymbolTable(
+ const ValueSymbolTable &VST, bool IsModuleLevel = false,
+ DenseMap<const Function *, uint64_t> *FunctionToBitcodeIndex = nullptr);
+ void writeUseList(UseListOrder &&Order);
+ void writeUseListBlock(const Function *F);
+ void
+ writeFunction(const Function &F,
+ DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
+ void writeBlockInfo();
+ void writePerModuleFunctionSummaryRecord(SmallVector<uint64_t, 64> &NameVals,
+ GlobalValueSummary *Summary,
+ unsigned ValueID,
+ unsigned FSCallsAbbrev,
+ unsigned FSCallsProfileAbbrev,
+ const Function &F);
+ void writeModuleLevelReferences(const GlobalVariable &V,
+ SmallVector<uint64_t, 64> &NameVals,
+ unsigned FSModRefsAbbrev);
+ void writePerModuleGlobalValueSummary();
+ void writeModuleHash(size_t BlockStartPos);
+
+ void assignValueId(GlobalValue::GUID ValGUID) {
+ GUIDToValueIdMap[ValGUID] = ++GlobalValueId;
+ }
+ unsigned getValueId(GlobalValue::GUID ValGUID) {
+ const auto &VMI = GUIDToValueIdMap.find(ValGUID);
+ assert(VMI != GUIDToValueIdMap.end());
+ return VMI->second;
+ }
+ // Helper to get the valueId for the type of value recorded in VI.
+ unsigned getValueId(ValueInfo VI) {
+ if (VI.isGUID())
+ return getValueId(VI.getGUID());
+ return VE.getValueID(VI.getValue());
+ }
+ std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
+};
+
+/// Class to manage the bitcode writing for a combined index.
+class IndexBitcodeWriter : public BitcodeWriter {
+ /// The combined index to write to bitcode.
+ const ModuleSummaryIndex &Index;
+
+  /// When writing a subset of the index for distributed backends, the client
+ /// provides a map of modules to the corresponding GUIDs/summaries to write.
+ std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex;
+
+ /// Map that holds the correspondence between the GUID used in the combined
+ /// index and a value id generated by this class to use in references.
+ std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
+
+  /// Tracks the last value id recorded in the GUIDToValueIdMap.
+ unsigned GlobalValueId = 0;
+
+public:
+  /// Constructs an IndexBitcodeWriter object for the given combined index,
+ /// writing to the provided \p Buffer. When writing a subset of the index
+ /// for a distributed backend, provide a \p ModuleToSummariesForIndex map.
+ IndexBitcodeWriter(SmallVectorImpl<char> &Buffer,
+ const ModuleSummaryIndex &Index,
+ std::map<std::string, GVSummaryMapTy>
+ *ModuleToSummariesForIndex = nullptr)
+ : BitcodeWriter(Buffer), Index(Index),
+ ModuleToSummariesForIndex(ModuleToSummariesForIndex) {
+ // Assign unique value ids to all summaries to be written, for use
+ // in writing out the call graph edges. Save the mapping from GUID
+ // to the new global value id to use when writing those edges, which
+ // are currently saved in the index in terms of GUID.
+ for (const auto &I : *this)
+ GUIDToValueIdMap[I.first] = ++GlobalValueId;
+ }
+
+  /// The iterator below returns the GUID and the associated summary.
+ typedef std::pair<GlobalValue::GUID, GlobalValueSummary *> GVInfo;
+
+  /// Iterator over the value GUID and summaries to be written to bitcode; it
+ /// hides the details of whether they are being pulled from the entire
+ /// index or just those in a provided ModuleToSummariesForIndex map.
+ class iterator
+ : public llvm::iterator_facade_base<iterator, std::forward_iterator_tag,
+ GVInfo> {
+ /// Enables access to parent class.
+ const IndexBitcodeWriter &Writer;
+
+ // Iterators used when writing only those summaries in a provided
+ // ModuleToSummariesForIndex map:
+
+    /// Points to the last element in the outer ModuleToSummariesForIndex map.
+ std::map<std::string, GVSummaryMapTy>::iterator ModuleSummariesBack;
+ /// Iterator on outer ModuleToSummariesForIndex map.
+ std::map<std::string, GVSummaryMapTy>::iterator ModuleSummariesIter;
+ /// Iterator on an inner global variable summary map.
+ GVSummaryMapTy::iterator ModuleGVSummariesIter;
+
+ // Iterators used when writing all summaries in the index:
+
+    /// Points to the last element in the Index's outer GlobalValueMap.
+ const_gvsummary_iterator IndexSummariesBack;
+ /// Iterator on outer GlobalValueMap.
+ const_gvsummary_iterator IndexSummariesIter;
+ /// Iterator on an inner GlobalValueSummaryList.
+ GlobalValueSummaryList::const_iterator IndexGVSummariesIter;
+
+ public:
+ /// Construct iterator from parent \p Writer and indicate if we are
+ /// constructing the end iterator.
+ iterator(const IndexBitcodeWriter &Writer, bool IsAtEnd) : Writer(Writer) {
+ // Set up the appropriate set of iterators given whether we are writing
+ // the full index or just a subset.
+      // Can't set up the Back or inner iterators if the corresponding map
+ // is empty. This will be handled specially in operator== as well.
+ if (Writer.ModuleToSummariesForIndex &&
+ !Writer.ModuleToSummariesForIndex->empty()) {
+ for (ModuleSummariesBack = Writer.ModuleToSummariesForIndex->begin();
+ std::next(ModuleSummariesBack) !=
+ Writer.ModuleToSummariesForIndex->end();
+ ModuleSummariesBack++)
+ ;
+ ModuleSummariesIter = !IsAtEnd
+ ? Writer.ModuleToSummariesForIndex->begin()
+ : ModuleSummariesBack;
+ ModuleGVSummariesIter = !IsAtEnd ? ModuleSummariesIter->second.begin()
+ : ModuleSummariesBack->second.end();
+ } else if (!Writer.ModuleToSummariesForIndex &&
+ Writer.Index.begin() != Writer.Index.end()) {
+ for (IndexSummariesBack = Writer.Index.begin();
+ std::next(IndexSummariesBack) != Writer.Index.end();
+ IndexSummariesBack++)
+ ;
+ IndexSummariesIter =
+ !IsAtEnd ? Writer.Index.begin() : IndexSummariesBack;
+ IndexGVSummariesIter = !IsAtEnd ? IndexSummariesIter->second.begin()
+ : IndexSummariesBack->second.end();
+ }
+ }
+
+ /// Increment the appropriate set of iterators.
+ iterator &operator++() {
+ // First the inner iterator is incremented, then if it is at the end
+ // and there are more outer iterations to go, the inner is reset to
+ // the start of the next inner list.
+ if (Writer.ModuleToSummariesForIndex) {
+ ++ModuleGVSummariesIter;
+ if (ModuleGVSummariesIter == ModuleSummariesIter->second.end() &&
+ ModuleSummariesIter != ModuleSummariesBack) {
+ ++ModuleSummariesIter;
+ ModuleGVSummariesIter = ModuleSummariesIter->second.begin();
+ }
+ } else {
+ ++IndexGVSummariesIter;
+ if (IndexGVSummariesIter == IndexSummariesIter->second.end() &&
+ IndexSummariesIter != IndexSummariesBack) {
+ ++IndexSummariesIter;
+ IndexGVSummariesIter = IndexSummariesIter->second.begin();
+ }
+ }
+ return *this;
+ }
+
+ /// Access the <GUID,GlobalValueSummary*> pair corresponding to the current
+ /// outer and inner iterator positions.
+ GVInfo operator*() {
+ if (Writer.ModuleToSummariesForIndex)
+ return std::make_pair(ModuleGVSummariesIter->first,
+ ModuleGVSummariesIter->second);
+ return std::make_pair(IndexSummariesIter->first,
+ IndexGVSummariesIter->get());
+ }
+
+ /// Checks if the iterators are equal, with special handling for empty
+ /// indexes.
+ bool operator==(const iterator &RHS) const {
+ if (Writer.ModuleToSummariesForIndex) {
+ // First ensure that both are writing the same subset.
+ if (Writer.ModuleToSummariesForIndex !=
+ RHS.Writer.ModuleToSummariesForIndex)
+ return false;
+ // Already determined above that maps are the same, so if one is
+ // empty, they both are.
+ if (Writer.ModuleToSummariesForIndex->empty())
+ return true;
+ // Ensure the ModuleGVSummariesIter are iterating over the same
+ // container before checking them below.
+ if (ModuleSummariesIter != RHS.ModuleSummariesIter)
+ return false;
+ return ModuleGVSummariesIter == RHS.ModuleGVSummariesIter;
+ }
+      // First ensure RHS is also writing the full index, and that both are
+ // writing the same full index.
+ if (RHS.Writer.ModuleToSummariesForIndex ||
+ &Writer.Index != &RHS.Writer.Index)
+ return false;
+ // Already determined above that maps are the same, so if one is
+ // empty, they both are.
+ if (Writer.Index.begin() == Writer.Index.end())
+ return true;
+ // Ensure the IndexGVSummariesIter are iterating over the same
+ // container before checking them below.
+ if (IndexSummariesIter != RHS.IndexSummariesIter)
+ return false;
+ return IndexGVSummariesIter == RHS.IndexGVSummariesIter;
+ }
+ };
+
+ /// Obtain the start iterator over the summaries to be written.
+ iterator begin() { return iterator(*this, /*IsAtEnd=*/false); }
+ /// Obtain the end iterator over the summaries to be written.
+ iterator end() { return iterator(*this, /*IsAtEnd=*/true); }
+
+private:
+ /// Main entry point for writing a combined index to bitcode, invoked by
+ /// BitcodeWriter::write() after it writes the header.
+ void writeBlocks() override;
+
+ void writeIndex();
+ void writeModStrings();
+ void writeCombinedValueSymbolTable();
+ void writeCombinedGlobalValueSummary();
+
+  /// Indicates whether the provided \p ModulePath should be written into
+  /// the module string table, e.g. if the full index is being written or if
+  /// the path is in the provided subset.
+ bool doIncludeModule(StringRef ModulePath) {
+ return !ModuleToSummariesForIndex ||
+ ModuleToSummariesForIndex->count(ModulePath);
+ }
+
+ bool hasValueId(GlobalValue::GUID ValGUID) {
+ const auto &VMI = GUIDToValueIdMap.find(ValGUID);
+ return VMI != GUIDToValueIdMap.end();
+ }
+ unsigned getValueId(GlobalValue::GUID ValGUID) {
+ const auto &VMI = GUIDToValueIdMap.find(ValGUID);
+ // If this GUID doesn't have an entry, assign one.
+ if (VMI == GUIDToValueIdMap.end()) {
+ GUIDToValueIdMap[ValGUID] = ++GlobalValueId;
+ return GlobalValueId;
+ } else {
+ return VMI->second;
+ }
+ }
+ std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
+};
+} // end anonymous namespace
+
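+The iterator above presents either the provided ModuleToSummariesForIndex subset or the whole index as one flattened sequence of (GUID, summary) pairs. A simplified, eager sketch of the subset case (a map from module path to a per-GUID summary map), with stand-in types rather than the LLVM ones:

    #include <cstdint>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using GUID = uint64_t;
    struct Summary {}; // stand-in for GlobalValueSummary

    // The real iterator walks this structure lazily; this version just shows
    // the visiting order: outer map per module, inner map per GUID.
    static std::vector<std::pair<GUID, Summary *>>
    flattenSummaries(const std::map<std::string, std::map<GUID, Summary *>> &M) {
      std::vector<std::pair<GUID, Summary *>> Out;
      for (const auto &Module : M)
        for (const auto &GV : Module.second)
          Out.emplace_back(GV.first, GV.second);
      return Out;
    }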
+static unsigned getEncodedCastOpcode(unsigned Opcode) {
switch (Opcode) {
default: llvm_unreachable("Unknown cast instruction!");
case Instruction::Trunc : return bitc::CAST_TRUNC;
@@ -83,7 +531,7 @@ static unsigned GetEncodedCastOpcode(unsigned Opcode) {
}
}
-static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
+static unsigned getEncodedBinaryOpcode(unsigned Opcode) {
switch (Opcode) {
default: llvm_unreachable("Unknown binary instruction!");
case Instruction::Add:
@@ -107,7 +555,7 @@ static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
}
}
-static unsigned GetEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
+static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
switch (Op) {
default: llvm_unreachable("Unknown RMW operation!");
case AtomicRMWInst::Xchg: return bitc::RMW_XCHG;
@@ -124,20 +572,20 @@ static unsigned GetEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
}
}
-static unsigned GetEncodedOrdering(AtomicOrdering Ordering) {
+static unsigned getEncodedOrdering(AtomicOrdering Ordering) {
switch (Ordering) {
- case NotAtomic: return bitc::ORDERING_NOTATOMIC;
- case Unordered: return bitc::ORDERING_UNORDERED;
- case Monotonic: return bitc::ORDERING_MONOTONIC;
- case Acquire: return bitc::ORDERING_ACQUIRE;
- case Release: return bitc::ORDERING_RELEASE;
- case AcquireRelease: return bitc::ORDERING_ACQREL;
- case SequentiallyConsistent: return bitc::ORDERING_SEQCST;
+ case AtomicOrdering::NotAtomic: return bitc::ORDERING_NOTATOMIC;
+ case AtomicOrdering::Unordered: return bitc::ORDERING_UNORDERED;
+ case AtomicOrdering::Monotonic: return bitc::ORDERING_MONOTONIC;
+ case AtomicOrdering::Acquire: return bitc::ORDERING_ACQUIRE;
+ case AtomicOrdering::Release: return bitc::ORDERING_RELEASE;
+ case AtomicOrdering::AcquireRelease: return bitc::ORDERING_ACQREL;
+ case AtomicOrdering::SequentiallyConsistent: return bitc::ORDERING_SEQCST;
}
llvm_unreachable("Invalid ordering");
}
-static unsigned GetEncodedSynchScope(SynchronizationScope SynchScope) {
+static unsigned getEncodedSynchScope(SynchronizationScope SynchScope) {
switch (SynchScope) {
case SingleThread: return bitc::SYNCHSCOPE_SINGLETHREAD;
case CrossThread: return bitc::SYNCHSCOPE_CROSSTHREAD;
@@ -145,8 +593,8 @@ static unsigned GetEncodedSynchScope(SynchronizationScope SynchScope) {
llvm_unreachable("Invalid synch scope");
}
-static void WriteStringRecord(unsigned Code, StringRef Str,
- unsigned AbbrevToUse, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeStringRecord(unsigned Code, StringRef Str,
+ unsigned AbbrevToUse) {
SmallVector<unsigned, 64> Vals;
// Code: [strchar x N]
@@ -164,6 +612,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
switch (Kind) {
case Attribute::Alignment:
return bitc::ATTR_KIND_ALIGNMENT;
+ case Attribute::AllocSize:
+ return bitc::ATTR_KIND_ALLOC_SIZE;
case Attribute::AlwaysInline:
return bitc::ATTR_KIND_ALWAYS_INLINE;
case Attribute::ArgMemOnly:
@@ -254,8 +704,14 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_SANITIZE_THREAD;
case Attribute::SanitizeMemory:
return bitc::ATTR_KIND_SANITIZE_MEMORY;
+ case Attribute::SwiftError:
+ return bitc::ATTR_KIND_SWIFT_ERROR;
+ case Attribute::SwiftSelf:
+ return bitc::ATTR_KIND_SWIFT_SELF;
case Attribute::UWTable:
return bitc::ATTR_KIND_UW_TABLE;
+ case Attribute::WriteOnly:
+ return bitc::ATTR_KIND_WRITEONLY;
case Attribute::ZExt:
return bitc::ATTR_KIND_Z_EXT;
case Attribute::EndAttrKinds:
@@ -267,8 +723,7 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
llvm_unreachable("Trying to encode unknown attribute");
}
-static void WriteAttributeGroupTable(const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeAttributeGroupTable() {
const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
if (AttrGrps.empty()) return;
@@ -315,8 +770,7 @@ static void WriteAttributeGroupTable(const ValueEnumerator &VE,
Stream.ExitBlock();
}
-static void WriteAttributeTable(const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeAttributeTable() {
const std::vector<AttributeSet> &Attrs = VE.getAttributes();
if (Attrs.empty()) return;
@@ -336,7 +790,7 @@ static void WriteAttributeTable(const ValueEnumerator &VE,
}
/// WriteTypeTable - Write out the type table for a module.
-static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeTypeTable() {
const ValueEnumerator::TypeList &TypeList = VE.getTypes();
Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */);
@@ -464,8 +918,8 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
// Emit the name if it is present.
if (!ST->getName().empty())
- WriteStringRecord(bitc::TYPE_CODE_STRUCT_NAME, ST->getName(),
- StructNameAbbrev, Stream);
+ writeStringRecord(bitc::TYPE_CODE_STRUCT_NAME, ST->getName(),
+ StructNameAbbrev);
}
break;
}
@@ -496,8 +950,8 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Stream.ExitBlock();
}
-static unsigned getEncodedLinkage(const GlobalValue &GV) {
- switch (GV.getLinkage()) {
+static unsigned getEncodedLinkage(const GlobalValue::LinkageTypes Linkage) {
+ switch (Linkage) {
case GlobalValue::ExternalLinkage:
return 0;
case GlobalValue::WeakAnyLinkage:
@@ -524,6 +978,24 @@ static unsigned getEncodedLinkage(const GlobalValue &GV) {
llvm_unreachable("Invalid linkage");
}
+static unsigned getEncodedLinkage(const GlobalValue &GV) {
+ return getEncodedLinkage(GV.getLinkage());
+}
+
+// Encode the flags for a GlobalValue in the summary.
+static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
+ uint64_t RawFlags = 0;
+
+ RawFlags |= Flags.HasSection; // bool
+
+  // Linkage doesn't need to be remapped for the summary at this time. Any future
+ // change to the getEncodedLinkage() function will need to be taken into
+ // account here as well.
+ RawFlags = (RawFlags << 4) | Flags.Linkage; // 4 bits
+
+ return RawFlags;
+}
+
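+getEncodedGVSummaryFlags above packs the linkage into the low four bits and HasSection into the next bit. A sketch of the matching decode under that same layout (the reader-side helper is not part of this hunk):

    #include <cstdint>

    struct DecodedGVFlags {
      bool HasSection;
      unsigned Linkage;
    };

    // Inverse of the encoding: linkage in bits 0-3, HasSection in bit 4.
    static DecodedGVFlags decodeGVSummaryFlags(uint64_t RawFlags) {
      DecodedGVFlags F;
      F.Linkage = static_cast<unsigned>(RawFlags & 0xF);
      F.HasSection = (RawFlags >> 4) & 1;
      return F;
    }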
static unsigned getEncodedVisibility(const GlobalValue &GV) {
switch (GV.getVisibility()) {
case GlobalValue::DefaultVisibility: return 0;
@@ -569,13 +1041,22 @@ static unsigned getEncodedComdatSelectionKind(const Comdat &C) {
llvm_unreachable("Invalid selection kind");
}
-static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) {
- SmallVector<uint16_t, 64> Vals;
+static unsigned getEncodedUnnamedAddr(const GlobalValue &GV) {
+ switch (GV.getUnnamedAddr()) {
+ case GlobalValue::UnnamedAddr::None: return 0;
+ case GlobalValue::UnnamedAddr::Local: return 2;
+ case GlobalValue::UnnamedAddr::Global: return 1;
+ }
+ llvm_unreachable("Invalid unnamed_addr");
+}
+
+void ModuleBitcodeWriter::writeComdats() {
+ SmallVector<unsigned, 64> Vals;
for (const Comdat *C : VE.getComdats()) {
// COMDAT: [selection_kind, name]
Vals.push_back(getEncodedComdatSelectionKind(*C));
size_t Size = C->getName().size();
- assert(isUInt<16>(Size));
+ assert(isUInt<32>(Size));
Vals.push_back(Size);
for (char Chr : C->getName())
Vals.push_back((unsigned char)Chr);
@@ -586,12 +1067,8 @@ static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) {
/// Write a record that will eventually hold the word offset of the
/// module-level VST. For now the offset is 0, which will be backpatched
-/// after the real VST is written. Returns the bit offset to backpatch.
-static uint64_t WriteValueSymbolTableForwardDecl(const ValueSymbolTable &VST,
- BitstreamWriter &Stream) {
- if (VST.empty())
- return 0;
-
+/// after the real VST is written. Saves the bit offset to backpatch.
+void BitcodeWriter::writeValueSymbolTableForwardDecl() {
// Write a placeholder value in for the offset of the real VST,
// which is written after the function blocks so that it can include
// the offset of each function. The placeholder offset will be
@@ -608,27 +1085,44 @@ static uint64_t WriteValueSymbolTableForwardDecl(const ValueSymbolTable &VST,
uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0};
Stream.EmitRecordWithAbbrev(VSTOffsetAbbrev, Vals);
- // Compute and return the bit offset to the placeholder, which will be
+ // Compute and save the bit offset to the placeholder, which will be
// patched when the real VST is written. We can simply subtract the 32-bit
// fixed size from the current bit number to get the location to backpatch.
- return Stream.GetCurrentBitNo() - 32;
+ VSTOffsetPlaceholder = Stream.GetCurrentBitNo() - 32;
+}
+
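+writeValueSymbolTableForwardDecl reserves a fixed 32-bit slot and records its bit offset in VSTOffsetPlaceholder so the real VST position can be patched in later. A generic, LLVM-independent sketch of that reserve-then-backpatch pattern over a plain byte buffer:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Reserve a 32-bit slot in the output and remember where it starts.
    static size_t emitPlaceholder(std::vector<uint8_t> &Buf) {
      size_t At = Buf.size();
      Buf.insert(Buf.end(), 4, 0);
      return At;
    }

    // Overwrite the reserved slot once the real value is known.
    // (Host-endian memcpy here; the real writer patches a word in the stream.)
    static void backpatch(std::vector<uint8_t> &Buf, size_t At, uint32_t Value) {
      std::memcpy(&Buf[At], &Value, sizeof(Value));
    }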
+enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 };
+
+/// Determine the encoding to use for the given string name and length.
+static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) {
+ bool isChar6 = true;
+ for (const char *C = Str, *E = C + StrLen; C != E; ++C) {
+ if (isChar6)
+ isChar6 = BitCodeAbbrevOp::isChar6(*C);
+ if ((unsigned char)*C & 128)
+ // don't bother scanning the rest.
+ return SE_Fixed8;
+ }
+ if (isChar6)
+ return SE_Char6;
+ else
+ return SE_Fixed7;
}
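As a worked illustration of getStringEncoding above, a standalone classifier with the same three outcomes. The isChar6 stand-in assumes the Char6 alphabet is [a-zA-Z0-9._]; the real check is BitCodeAbbrevOp::isChar6.

    #include <string>

    enum SimpleStringEncoding { Enc_Char6, Enc_Fixed7, Enc_Fixed8 };

    // Stand-in for BitCodeAbbrevOp::isChar6 (assumed alphabet: [a-zA-Z0-9._]).
    static bool isChar6Sample(char C) {
      return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
             (C >= '0' && C <= '9') || C == '.' || C == '_';
    }

    static SimpleStringEncoding classify(const std::string &S) {
      bool AllChar6 = true;
      for (unsigned char C : S) {
        if (C & 128)
          return Enc_Fixed8;  // any high-bit byte needs 8-bit characters
        AllChar6 = AllChar6 && isChar6Sample(static_cast<char>(C));
      }
      // e.g. classify("main.c") == Enc_Char6, classify("my file") == Enc_Fixed7
      return AllChar6 ? Enc_Char6 : Enc_Fixed7;
    }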
/// Emit top-level description of module, including target triple, inline asm,
/// descriptors for global variables, and function prototype info.
/// Returns the bit offset to backpatch with the location of the real VST.
-static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeModuleInfo() {
// Emit various pieces of data attached to a module.
- if (!M->getTargetTriple().empty())
- WriteStringRecord(bitc::MODULE_CODE_TRIPLE, M->getTargetTriple(),
- 0/*TODO*/, Stream);
- const std::string &DL = M->getDataLayoutStr();
+ if (!M.getTargetTriple().empty())
+ writeStringRecord(bitc::MODULE_CODE_TRIPLE, M.getTargetTriple(),
+ 0 /*TODO*/);
+ const std::string &DL = M.getDataLayoutStr();
if (!DL.empty())
- WriteStringRecord(bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/, Stream);
- if (!M->getModuleInlineAsm().empty())
- WriteStringRecord(bitc::MODULE_CODE_ASM, M->getModuleInlineAsm(),
- 0/*TODO*/, Stream);
+ writeStringRecord(bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
+ if (!M.getModuleInlineAsm().empty())
+ writeStringRecord(bitc::MODULE_CODE_ASM, M.getModuleInlineAsm(),
+ 0 /*TODO*/);
// Emit information about sections and GC, computing how many there are. Also
// compute the maximum alignment value.
@@ -636,27 +1130,27 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
std::map<std::string, unsigned> GCMap;
unsigned MaxAlignment = 0;
unsigned MaxGlobalType = 0;
- for (const GlobalValue &GV : M->globals()) {
+ for (const GlobalValue &GV : M.globals()) {
MaxAlignment = std::max(MaxAlignment, GV.getAlignment());
MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType()));
if (GV.hasSection()) {
// Give section names unique ID's.
unsigned &Entry = SectionMap[GV.getSection()];
if (!Entry) {
- WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV.getSection(),
- 0/*TODO*/, Stream);
+ writeStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV.getSection(),
+ 0 /*TODO*/);
Entry = SectionMap.size();
}
}
}
- for (const Function &F : *M) {
+ for (const Function &F : M) {
MaxAlignment = std::max(MaxAlignment, F.getAlignment());
if (F.hasSection()) {
// Give section names unique ID's.
unsigned &Entry = SectionMap[F.getSection()];
if (!Entry) {
- WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F.getSection(),
- 0/*TODO*/, Stream);
+ writeStringRecord(bitc::MODULE_CODE_SECTIONNAME, F.getSection(),
+ 0 /*TODO*/);
Entry = SectionMap.size();
}
}
@@ -664,8 +1158,7 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
// Same for GC names.
unsigned &Entry = GCMap[F.getGC()];
if (!Entry) {
- WriteStringRecord(bitc::MODULE_CODE_GCNAME, F.getGC(),
- 0/*TODO*/, Stream);
+ writeStringRecord(bitc::MODULE_CODE_GCNAME, F.getGC(), 0 /*TODO*/);
Entry = GCMap.size();
}
}
@@ -673,7 +1166,7 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
// Emit abbrev for globals, now that we know # sections and max alignment.
unsigned SimpleGVarAbbrev = 0;
- if (!M->global_empty()) {
+ if (!M.global_empty()) {
// Add an abbrev for common globals with no visibility or thread localness.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR));
@@ -702,7 +1195,7 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
// Emit the global variable information.
SmallVector<unsigned, 64> Vals;
- for (const GlobalVariable &GV : M->globals()) {
+ for (const GlobalVariable &GV : M.globals()) {
unsigned AbbrevToUse = 0;
// GLOBALVAR: [type, isconst, initid,
@@ -718,12 +1211,13 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(GV.hasSection() ? SectionMap[GV.getSection()] : 0);
if (GV.isThreadLocal() ||
GV.getVisibility() != GlobalValue::DefaultVisibility ||
- GV.hasUnnamedAddr() || GV.isExternallyInitialized() ||
+ GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
+ GV.isExternallyInitialized() ||
GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
GV.hasComdat()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
- Vals.push_back(GV.hasUnnamedAddr());
+ Vals.push_back(getEncodedUnnamedAddr(GV));
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
@@ -736,7 +1230,7 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
}
// Emit the function proto information.
- for (const Function &F : *M) {
+ for (const Function &F : M) {
// FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment,
// section, visibility, gc, unnamed_addr, prologuedata,
// dllstorageclass, comdat, prefixdata, personalityfn]
@@ -749,7 +1243,7 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0);
Vals.push_back(getEncodedVisibility(F));
Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0);
- Vals.push_back(F.hasUnnamedAddr());
+ Vals.push_back(getEncodedUnnamedAddr(F));
Vals.push_back(F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1)
: 0);
Vals.push_back(getEncodedDLLStorageClass(F));
@@ -765,8 +1259,9 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
}
// Emit the alias information.
- for (const GlobalAlias &A : M->aliases()) {
- // ALIAS: [alias type, aliasee val#, linkage, visibility]
+ for (const GlobalAlias &A : M.aliases()) {
+ // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass,
+ // threadlocal, unnamed_addr]
Vals.push_back(VE.getTypeID(A.getValueType()));
Vals.push_back(A.getType()->getAddressSpace());
Vals.push_back(VE.getValueID(A.getAliasee()));
@@ -774,33 +1269,56 @@ static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(getEncodedVisibility(A));
Vals.push_back(getEncodedDLLStorageClass(A));
Vals.push_back(getEncodedThreadLocalMode(A));
- Vals.push_back(A.hasUnnamedAddr());
+ Vals.push_back(getEncodedUnnamedAddr(A));
unsigned AbbrevToUse = 0;
Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse);
Vals.clear();
}
- // Write a record indicating the number of module-level metadata IDs
- // This is needed because the ids of metadata are assigned implicitly
- // based on their ordering in the bitcode, with the function-level
- // metadata ids starting after the module-level metadata ids. For
- // function importing where we lazy load the metadata as a postpass,
- // we want to avoid parsing the module-level metadata before parsing
- // the imported functions.
- BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_METADATA_VALUES));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- unsigned MDValsAbbrev = Stream.EmitAbbrev(Abbv);
- Vals.push_back(VE.numMDs());
- Stream.EmitRecord(bitc::MODULE_CODE_METADATA_VALUES, Vals, MDValsAbbrev);
- Vals.clear();
+ // Emit the ifunc information.
+ for (const GlobalIFunc &I : M.ifuncs()) {
+ // IFUNC: [ifunc type, address space, resolver val#, linkage, visibility]
+ Vals.push_back(VE.getTypeID(I.getValueType()));
+ Vals.push_back(I.getType()->getAddressSpace());
+ Vals.push_back(VE.getValueID(I.getResolver()));
+ Vals.push_back(getEncodedLinkage(I));
+ Vals.push_back(getEncodedVisibility(I));
+ Stream.EmitRecord(bitc::MODULE_CODE_IFUNC, Vals);
+ Vals.clear();
+ }
+
+ // Emit the module's source file name.
+ {
+ StringEncoding Bits = getStringEncoding(M.getSourceFileName().data(),
+ M.getSourceFileName().size());
+ BitCodeAbbrevOp AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8);
+ if (Bits == SE_Char6)
+ AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Char6);
+ else if (Bits == SE_Fixed7)
+ AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7);
+
+ // MODULE_CODE_SOURCE_FILENAME: [namechar x N]
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(AbbrevOpToUse);
+ unsigned FilenameAbbrev = Stream.EmitAbbrev(Abbv);
+
+ for (const auto P : M.getSourceFileName())
+ Vals.push_back((unsigned char)P);
- uint64_t VSTOffsetPlaceholder =
- WriteValueSymbolTableForwardDecl(M->getValueSymbolTable(), Stream);
- return VSTOffsetPlaceholder;
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MODULE_CODE_SOURCE_FILENAME, Vals, FilenameAbbrev);
+ Vals.clear();
+ }
+
+ // If we have a VST, write the VSTOFFSET record placeholder.
+ if (M.getValueSymbolTable().empty())
+ return;
+ writeValueSymbolTableForwardDecl();
}
-static uint64_t GetOptimizationFlags(const Value *V) {
+static uint64_t getOptimizationFlags(const Value *V) {
uint64_t Flags = 0;
if (const auto *OBO = dyn_cast<OverflowingBinaryOperator>(V)) {
@@ -827,10 +1345,8 @@ static uint64_t GetOptimizationFlags(const Value *V) {
return Flags;
}
-static void WriteValueAsMetadata(const ValueAsMetadata *MD,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record) {
+void ModuleBitcodeWriter::writeValueAsMetadata(
+ const ValueAsMetadata *MD, SmallVectorImpl<uint64_t> &Record) {
// Mimic an MDNode with a value as one operand.
Value *V = MD->getValue();
Record.push_back(VE.getTypeID(V->getType()));
@@ -839,9 +1355,9 @@ static void WriteValueAsMetadata(const ValueAsMetadata *MD,
Record.clear();
}
-static void WriteMDTuple(const MDTuple *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+void ModuleBitcodeWriter::writeMDTuple(const MDTuple *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
Metadata *MD = N->getOperand(i);
assert(!(MD && isa<LocalAsMetadata>(MD)) &&
@@ -854,10 +1370,25 @@ static void WriteMDTuple(const MDTuple *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDILocation(const DILocation *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+unsigned ModuleBitcodeWriter::createDILocationAbbrev() {
+ // Assume the column is usually under 128, and always output the inlined-at
+ // location (it's never more expensive than building an array size 1).
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ return Stream.EmitAbbrev(Abbv);
+}
+
+void ModuleBitcodeWriter::writeDILocation(const DILocation *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned &Abbrev) {
+ if (!Abbrev)
+ Abbrev = createDILocationAbbrev();
+
Record.push_back(N->isDistinct());
Record.push_back(N->getLine());
Record.push_back(N->getColumn());
@@ -868,11 +1399,26 @@ static void WriteDILocation(const DILocation *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteGenericDINode(const GenericDINode *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() {
+ // Assume the column is usually under 128, and always output the inlined-at
+ // location (it's never more expensive than building an array size 1).
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ return Stream.EmitAbbrev(Abbv);
+}
+
+void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned &Abbrev) {
+ if (!Abbrev)
+ Abbrev = createGenericDINodeAbbrev();
+
Record.push_back(N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(0); // Per-tag version field; unused for now.
@@ -889,10 +1435,9 @@ static uint64_t rotateSign(int64_t I) {
return I < 0 ? ~(U << 1) : U << 1;
}
-static void WriteDISubrange(const DISubrange *N, const ValueEnumerator &,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getCount());
Record.push_back(rotateSign(N->getLowerBound()));
@@ -901,10 +1446,9 @@ static void WriteDISubrange(const DISubrange *N, const ValueEnumerator &,
Record.clear();
}
-static void WriteDIEnumerator(const DIEnumerator *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIEnumerator(const DIEnumerator *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(rotateSign(N->getValue()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -913,10 +1457,9 @@ static void WriteDIEnumerator(const DIEnumerator *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDIBasicType(const DIBasicType *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -928,11 +1471,9 @@ static void WriteDIBasicType(const DIBasicType *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDIDerivedType(const DIDerivedType *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -950,12 +1491,11 @@ static void WriteDIDerivedType(const DIDerivedType *N,
Record.clear();
}
-static void WriteDICompositeType(const DICompositeType *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
- Record.push_back(N->isDistinct());
+void ModuleBitcodeWriter::writeDICompositeType(
+ const DICompositeType *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
+ const unsigned IsNotUsedInOldTypeRef = 0x2;
+ Record.push_back(IsNotUsedInOldTypeRef | (unsigned)N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
@@ -976,22 +1516,22 @@ static void WriteDICompositeType(const DICompositeType *N,
Record.clear();
}
-static void WriteDISubroutineType(const DISubroutineType *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
- Record.push_back(N->isDistinct());
+void ModuleBitcodeWriter::writeDISubroutineType(
+ const DISubroutineType *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
+ const unsigned HasNoOldTypeRefs = 0x2;
+ Record.push_back(HasNoOldTypeRefs | (unsigned)N->isDistinct());
Record.push_back(N->getFlags());
Record.push_back(VE.getMetadataOrNullID(N->getTypeArray().get()));
+ Record.push_back(N->getCC());
Stream.EmitRecord(bitc::METADATA_SUBROUTINE_TYPE, Record, Abbrev);
Record.clear();
}
-static void WriteDIFile(const DIFile *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIFile(const DIFile *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getRawFilename()));
Record.push_back(VE.getMetadataOrNullID(N->getRawDirectory()));
@@ -1000,11 +1540,9 @@ static void WriteDIFile(const DIFile *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDICompileUnit(const DICompileUnit *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
assert(N->isDistinct() && "Expected distinct compile units");
Record.push_back(/* IsDistinct */ true);
Record.push_back(N->getSourceLanguage());
@@ -1017,7 +1555,7 @@ static void WriteDICompileUnit(const DICompileUnit *N,
Record.push_back(N->getEmissionKind());
Record.push_back(VE.getMetadataOrNullID(N->getEnumTypes().get()));
Record.push_back(VE.getMetadataOrNullID(N->getRetainedTypes().get()));
- Record.push_back(VE.getMetadataOrNullID(N->getSubprograms().get()));
+ Record.push_back(/* subprograms */ 0);
Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables().get()));
Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities().get()));
Record.push_back(N->getDWOId());
@@ -1027,11 +1565,11 @@ static void WriteDICompileUnit(const DICompileUnit *N,
Record.clear();
}
-static void WriteDISubprogram(const DISubprogram *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
- Record.push_back(N->isDistinct());
+void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
+ uint64_t HasUnitFlag = 1 << 1;
+ Record.push_back(N->isDistinct() | HasUnitFlag);
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName()));
@@ -1046,19 +1584,19 @@ static void WriteDISubprogram(const DISubprogram *N, const ValueEnumerator &VE,
Record.push_back(N->getVirtualIndex());
Record.push_back(N->getFlags());
Record.push_back(N->isOptimized());
+ Record.push_back(VE.getMetadataOrNullID(N->getRawUnit()));
Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
Record.push_back(VE.getMetadataOrNullID(N->getVariables().get()));
+ Record.push_back(N->getThisAdjustment());
Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev);
Record.clear();
}
-static void WriteDILexicalBlock(const DILexicalBlock *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDILexicalBlock(const DILexicalBlock *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
@@ -1069,11 +1607,9 @@ static void WriteDILexicalBlock(const DILexicalBlock *N,
Record.clear();
}
-static void WriteDILexicalBlockFile(const DILexicalBlockFile *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDILexicalBlockFile(
+ const DILexicalBlockFile *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
@@ -1083,10 +1619,9 @@ static void WriteDILexicalBlockFile(const DILexicalBlockFile *N,
Record.clear();
}
-static void WriteDINamespace(const DINamespace *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDINamespace(const DINamespace *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
@@ -1097,9 +1632,9 @@ static void WriteDINamespace(const DINamespace *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDIMacro(const DIMacro *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIMacro(const DIMacro *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getMacinfoType());
Record.push_back(N->getLine());
@@ -1110,10 +1645,9 @@ static void WriteDIMacro(const DIMacro *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDIMacroFile(const DIMacroFile *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIMacroFile(const DIMacroFile *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getMacinfoType());
Record.push_back(N->getLine());
@@ -1124,9 +1658,9 @@ static void WriteDIMacroFile(const DIMacroFile *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDIModule(const DIModule *N, const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIModule(const DIModule *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
for (auto &I : N->operands())
Record.push_back(VE.getMetadataOrNullID(I));
@@ -1135,11 +1669,9 @@ static void WriteDIModule(const DIModule *N, const ValueEnumerator &VE,
Record.clear();
}
-static void WriteDITemplateTypeParameter(const DITemplateTypeParameter *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDITemplateTypeParameter(
+ const DITemplateTypeParameter *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getType()));
@@ -1148,11 +1680,9 @@ static void WriteDITemplateTypeParameter(const DITemplateTypeParameter *N,
Record.clear();
}
-static void WriteDITemplateValueParameter(const DITemplateValueParameter *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDITemplateValueParameter(
+ const DITemplateValueParameter *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -1163,11 +1693,9 @@ static void WriteDITemplateValueParameter(const DITemplateValueParameter *N,
Record.clear();
}
-static void WriteDIGlobalVariable(const DIGlobalVariable *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIGlobalVariable(
+ const DIGlobalVariable *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -1184,11 +1712,9 @@ static void WriteDIGlobalVariable(const DIGlobalVariable *N,
Record.clear();
}
-static void WriteDILocalVariable(const DILocalVariable *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDILocalVariable(
+ const DILocalVariable *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -1202,10 +1728,9 @@ static void WriteDILocalVariable(const DILocalVariable *N,
Record.clear();
}
-static void WriteDIExpression(const DIExpression *N, const ValueEnumerator &,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIExpression(const DIExpression *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.reserve(N->getElements().size() + 1);
Record.push_back(N->isDistinct());
@@ -1215,11 +1740,9 @@ static void WriteDIExpression(const DIExpression *N, const ValueEnumerator &,
Record.clear();
}
-static void WriteDIObjCProperty(const DIObjCProperty *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIObjCProperty(const DIObjCProperty *N,
+ SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
@@ -1233,11 +1756,9 @@ static void WriteDIObjCProperty(const DIObjCProperty *N,
Record.clear();
}
-static void WriteDIImportedEntity(const DIImportedEntity *N,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream,
- SmallVectorImpl<uint64_t> &Record,
- unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDIImportedEntity(
+ const DIImportedEntity *N, SmallVectorImpl<uint64_t> &Record,
+ unsigned Abbrev) {
Record.push_back(N->isDistinct());
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
@@ -1249,71 +1770,87 @@ static void WriteDIImportedEntity(const DIImportedEntity *N,
Record.clear();
}
-static void WriteModuleMetadata(const Module *M,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
- const auto &MDs = VE.getMDs();
- if (MDs.empty() && M->named_metadata_empty())
+unsigned ModuleBitcodeWriter::createNamedMetadataAbbrev() {
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+ return Stream.EmitAbbrev(Abbv);
+}
+
+void ModuleBitcodeWriter::writeNamedMetadata(
+ SmallVectorImpl<uint64_t> &Record) {
+ if (M.named_metadata_empty())
return;
- Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+ unsigned Abbrev = createNamedMetadataAbbrev();
+ for (const NamedMDNode &NMD : M.named_metadata()) {
+ // Write name.
+ StringRef Str = NMD.getName();
+ Record.append(Str.bytes_begin(), Str.bytes_end());
+ Stream.EmitRecord(bitc::METADATA_NAME, Record, Abbrev);
+ Record.clear();
- unsigned MDSAbbrev = 0;
- if (VE.hasMDString()) {
- // Abbrev for METADATA_STRING.
- BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
- MDSAbbrev = Stream.EmitAbbrev(Abbv);
+ // Write named metadata operands.
+ for (const MDNode *N : NMD.operands())
+ Record.push_back(VE.getMetadataID(N));
+ Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0);
+ Record.clear();
}
+}
- // Initialize MDNode abbreviations.
-#define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
-#include "llvm/IR/Metadata.def"
+unsigned ModuleBitcodeWriter::createMetadataStringsAbbrev() {
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRINGS));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of strings
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // offset to chars
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+ return Stream.EmitAbbrev(Abbv);
+}
- if (VE.hasDILocation()) {
- // Abbrev for METADATA_LOCATION.
- //
- // Assume the column is usually under 128, and always output the inlined-at
- // location (it's never more expensive than building an array size 1).
- BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- DILocationAbbrev = Stream.EmitAbbrev(Abbv);
- }
+/// Write out a record for MDString.
+///
+/// All the metadata strings in a metadata block are emitted in a single
+/// record. The sizes and strings themselves are shoved into a blob.
+void ModuleBitcodeWriter::writeMetadataStrings(
+ ArrayRef<const Metadata *> Strings, SmallVectorImpl<uint64_t> &Record) {
+ if (Strings.empty())
+ return;
- if (VE.hasGenericDINode()) {
- // Abbrev for METADATA_GENERIC_DEBUG.
- //
- // Assume the column is usually under 128, and always output the inlined-at
- // location (it's never more expensive than building an array size 1).
- BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- GenericDINodeAbbrev = Stream.EmitAbbrev(Abbv);
- }
+ // Start the record with the number of strings.
+ Record.push_back(bitc::METADATA_STRINGS);
+ Record.push_back(Strings.size());
- unsigned NameAbbrev = 0;
- if (!M->named_metadata_empty()) {
- // Abbrev for METADATA_NAME.
- BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
- NameAbbrev = Stream.EmitAbbrev(Abbv);
+ // Emit the sizes of the strings in the blob.
+ SmallString<256> Blob;
+ {
+ BitstreamWriter W(Blob);
+ for (const Metadata *MD : Strings)
+ W.EmitVBR(cast<MDString>(MD)->getLength(), 6);
+ W.FlushToWord();
}
- SmallVector<uint64_t, 64> Record;
+ // Add the offset to the strings to the record.
+ Record.push_back(Blob.size());
+
+ // Add the strings to the blob.
+ for (const Metadata *MD : Strings)
+ Blob.append(cast<MDString>(MD)->getString());
+
+ // Emit the final record.
+ Stream.EmitRecordWithBlob(createMetadataStringsAbbrev(), Record, Blob);
+ Record.clear();
+}
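// Sketch of the layout writeMetadataStrings produces, using only the standard
// library and assumed inputs; the real writer encodes each length as VBR6 via
// BitstreamWriter and keeps the lengths and characters in the same blob.
#include <cstdint>
#include <string>
#include <vector>

struct MetadataStringsSketch {
  uint64_t NumStrings;       // record operand: number of strings
  uint64_t OffsetToChars;    // record operand: where the characters begin
  std::vector<uint8_t> Blob; // lengths first, then the raw characters
};

inline MetadataStringsSketch
layoutMetadataStrings(const std::vector<std::string> &Strings) {
  MetadataStringsSketch S{Strings.size(), 0, {}};
  for (const std::string &Str : Strings) // one byte per length in this sketch
    S.Blob.push_back(static_cast<uint8_t>(Str.size()));
  S.OffsetToChars = S.Blob.size();
  for (const std::string &Str : Strings) // characters appended back to back
    S.Blob.insert(S.Blob.end(), Str.begin(), Str.end());
  return S; // emitted as [METADATA_STRINGS, NumStrings, OffsetToChars] + Blob
}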
+
+void ModuleBitcodeWriter::writeMetadataRecords(
+ ArrayRef<const Metadata *> MDs, SmallVectorImpl<uint64_t> &Record) {
+ if (MDs.empty())
+ return;
+
+ // Initialize MDNode abbreviations.
+#define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
+#include "llvm/IR/Metadata.def"
+
for (const Metadata *MD : MDs) {
if (const MDNode *N = dyn_cast<MDNode>(MD)) {
assert(N->isResolved() && "Expected forward references to be resolved");
@@ -1323,82 +1860,79 @@ static void WriteModuleMetadata(const Module *M,
llvm_unreachable("Invalid MDNode subclass");
#define HANDLE_MDNODE_LEAF(CLASS) \
case Metadata::CLASS##Kind: \
- Write##CLASS(cast<CLASS>(N), VE, Stream, Record, CLASS##Abbrev); \
+ write##CLASS(cast<CLASS>(N), Record, CLASS##Abbrev); \
continue;
#include "llvm/IR/Metadata.def"
}
}
- if (const auto *MDC = dyn_cast<ConstantAsMetadata>(MD)) {
- WriteValueAsMetadata(MDC, VE, Stream, Record);
- continue;
- }
- const MDString *MDS = cast<MDString>(MD);
- // Code: [strchar x N]
- Record.append(MDS->bytes_begin(), MDS->bytes_end());
-
- // Emit the finished record.
- Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
- Record.clear();
+ writeValueAsMetadata(cast<ValueAsMetadata>(MD), Record);
}
+}
- // Write named metadata.
- for (const NamedMDNode &NMD : M->named_metadata()) {
- // Write name.
- StringRef Str = NMD.getName();
- Record.append(Str.bytes_begin(), Str.bytes_end());
- Stream.EmitRecord(bitc::METADATA_NAME, Record, NameAbbrev);
- Record.clear();
+void ModuleBitcodeWriter::writeModuleMetadata() {
+ if (!VE.hasMDs() && M.named_metadata_empty())
+ return;
- // Write named metadata operands.
- for (const MDNode *N : NMD.operands())
- Record.push_back(VE.getMetadataID(N));
- Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0);
- Record.clear();
- }
+ Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+ SmallVector<uint64_t, 64> Record;
+ writeMetadataStrings(VE.getMDStrings(), Record);
+ writeMetadataRecords(VE.getNonMDStrings(), Record);
+ writeNamedMetadata(Record);
+
+ auto AddDeclAttachedMetadata = [&](const GlobalObject &GO) {
+ SmallVector<uint64_t, 4> Record;
+ Record.push_back(VE.getValueID(&GO));
+ pushGlobalMetadataAttachment(Record, GO);
+ Stream.EmitRecord(bitc::METADATA_GLOBAL_DECL_ATTACHMENT, Record);
+ };
+ for (const Function &F : M)
+ if (F.isDeclaration() && F.hasMetadata())
+ AddDeclAttachedMetadata(F);
+ // FIXME: Only store metadata for declarations here, and move data for global
+ // variable definitions to a separate block (PR28134).
+ for (const GlobalVariable &GV : M.globals())
+ if (GV.hasMetadata())
+ AddDeclAttachedMetadata(GV);
Stream.ExitBlock();
}
-static void WriteFunctionLocalMetadata(const Function &F,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
- bool StartedMetadataBlock = false;
+void ModuleBitcodeWriter::writeFunctionMetadata(const Function &F) {
+ if (!VE.hasMDs())
+ return;
+
+ Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
- const SmallVectorImpl<const LocalAsMetadata *> &MDs =
- VE.getFunctionLocalMDs();
- for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
- assert(MDs[i] && "Expected valid function-local metadata");
- if (!StartedMetadataBlock) {
- Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
- StartedMetadataBlock = true;
- }
- WriteValueAsMetadata(MDs[i], VE, Stream, Record);
- }
+ writeMetadataStrings(VE.getMDStrings(), Record);
+ writeMetadataRecords(VE.getNonMDStrings(), Record);
+ Stream.ExitBlock();
+}
- if (StartedMetadataBlock)
- Stream.ExitBlock();
+void ModuleBitcodeWriter::pushGlobalMetadataAttachment(
+ SmallVectorImpl<uint64_t> &Record, const GlobalObject &GO) {
+ // [n x [id, mdnode]]
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GO.getAllMetadata(MDs);
+ for (const auto &I : MDs) {
+ Record.push_back(I.first);
+ Record.push_back(VE.getMetadataID(I.second));
+ }
}
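// Worked example of the [n x [id, mdnode]] pairs pushed above (IDs assumed):
// a global object carrying two attachments whose metadata kind IDs are 1 and
// 4, resolving to metadata IDs 42 and 7, contributes [1, 42, 4, 7] after the
// leading value ID in a METADATA_GLOBAL_DECL_ATTACHMENT record.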
-static void WriteMetadataAttachment(const Function &F,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeFunctionMetadataAttachment(const Function &F) {
Stream.EnterSubblock(bitc::METADATA_ATTACHMENT_ID, 3);
SmallVector<uint64_t, 64> Record;
- // Write metadata attachments
- // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- F.getAllMetadata(MDs);
- if (!MDs.empty()) {
- for (const auto &I : MDs) {
- Record.push_back(I.first);
- Record.push_back(VE.getMetadataID(I.second));
- }
+ if (F.hasMetadata()) {
+ pushGlobalMetadataAttachment(Record, F);
Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0);
Record.clear();
}
+ // Write metadata attachments
+  // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]]
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
for (const BasicBlock &BB : F)
for (const Instruction &I : BB) {
MDs.clear();
@@ -1420,13 +1954,13 @@ static void WriteMetadataAttachment(const Function &F,
Stream.ExitBlock();
}
-static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeModuleMetadataKinds() {
SmallVector<uint64_t, 64> Record;
// Write metadata kinds
// METADATA_KIND - [n x [id, name]]
SmallVector<StringRef, 8> Names;
- M->getMDKindNames(Names);
+ M.getMDKindNames(Names);
if (Names.empty()) return;
@@ -1444,7 +1978,7 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
Stream.ExitBlock();
}
-static void WriteOperandBundleTags(const Module *M, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeOperandBundleTags() {
  // Write operand bundle tags
//
// OPERAND_BUNDLE_TAGS_BLOCK_ID : N x OPERAND_BUNDLE_TAG
@@ -1452,7 +1986,7 @@ static void WriteOperandBundleTags(const Module *M, BitstreamWriter &Stream) {
// OPERAND_BUNDLE_TAG - [strchr x N]
SmallVector<StringRef, 8> Tags;
- M->getOperandBundleTags(Tags);
+ M.getOperandBundleTags(Tags);
if (Tags.empty())
return;
@@ -1478,9 +2012,8 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
Vals.push_back((-V << 1) | 1);
}
-static void WriteConstants(unsigned FirstVal, unsigned LastVal,
- const ValueEnumerator &VE,
- BitstreamWriter &Stream, bool isGlobal) {
+void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
+ bool isGlobal) {
if (FirstVal == LastVal) return;
Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4);
@@ -1635,8 +2168,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
Record.push_back(
CDS->getElementAsAPFloat(i).bitcastToAPInt().getLimitedValue());
}
- } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
- isa<ConstantVector>(C)) {
+ } else if (isa<ConstantAggregate>(C)) {
Code = bitc::CST_CODE_AGGREGATE;
for (const Value *Op : C->operands())
Record.push_back(VE.getValueID(Op));
@@ -1646,17 +2178,17 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
default:
if (Instruction::isCast(CE->getOpcode())) {
Code = bitc::CST_CODE_CE_CAST;
- Record.push_back(GetEncodedCastOpcode(CE->getOpcode()));
+ Record.push_back(getEncodedCastOpcode(CE->getOpcode()));
Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
Record.push_back(VE.getValueID(C->getOperand(0)));
AbbrevToUse = CONSTANTS_CE_CAST_Abbrev;
} else {
assert(CE->getNumOperands() == 2 && "Unknown constant expr!");
Code = bitc::CST_CODE_CE_BINOP;
- Record.push_back(GetEncodedBinaryOpcode(CE->getOpcode()));
+ Record.push_back(getEncodedBinaryOpcode(CE->getOpcode()));
Record.push_back(VE.getValueID(C->getOperand(0)));
Record.push_back(VE.getValueID(C->getOperand(1)));
- uint64_t Flags = GetOptimizationFlags(CE);
+ uint64_t Flags = getOptimizationFlags(CE);
if (Flags != 0)
Record.push_back(Flags);
}
@@ -1735,21 +2267,20 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
Stream.ExitBlock();
}
-static void WriteModuleConstants(const ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeModuleConstants() {
const ValueEnumerator::ValueList &Vals = VE.getValues();
// Find the first constant to emit, which is the first non-globalvalue value.
// We know globalvalues have been emitted by WriteModuleInfo.
for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
if (!isa<GlobalValue>(Vals[i].first)) {
- WriteConstants(i, Vals.size(), VE, Stream, true);
+ writeConstants(i, Vals.size(), true);
return;
}
}
}
-/// PushValueAndType - The file has to encode both the value and type id for
+/// pushValueAndType - The file has to encode both the value and type id for
/// many values, because we need to know what type to create for forward
/// references. However, most operands are not forward references, so this type
/// field is not needed.
@@ -1757,9 +2288,8 @@ static void WriteModuleConstants(const ValueEnumerator &VE,
/// This function adds V's value ID to Vals. If the value ID is higher than the
/// instruction ID, then it is a forward reference, and it also includes the
/// type ID. The value ID that is written is encoded relative to the InstID.
-static bool PushValueAndType(const Value *V, unsigned InstID,
- SmallVectorImpl<unsigned> &Vals,
- ValueEnumerator &VE) {
+bool ModuleBitcodeWriter::pushValueAndType(const Value *V, unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals) {
unsigned ValID = VE.getValueID(V);
// Make encoding relative to the InstID.
Vals.push_back(InstID - ValID);
@@ -1770,8 +2300,8 @@ static bool PushValueAndType(const Value *V, unsigned InstID,
return false;
}
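// Minimal sketch of the relative-ID encoding pushValueAndType performs, with
// assumed numbers: an already-emitted operand (ValID 37 at InstID 40) is
// stored as 40 - 37 = 3 and needs no type; when ValID >= InstID the operand
// is a forward reference, so the caller also emits its type ID.
#include <cstdint>
#include <vector>

inline bool pushRelativeId(std::vector<uint64_t> &Vals, unsigned InstID,
                           unsigned ValID) {
  Vals.push_back(InstID - ValID); // relative ID; wraps for forward references
  return ValID >= InstID;         // true: the type ID must follow
}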
-static void WriteOperandBundles(BitstreamWriter &Stream, ImmutableCallSite CS,
- unsigned InstID, ValueEnumerator &VE) {
+void ModuleBitcodeWriter::writeOperandBundles(ImmutableCallSite CS,
+ unsigned InstID) {
SmallVector<unsigned, 64> Record;
LLVMContext &C = CS.getInstruction()->getContext();
@@ -1780,34 +2310,32 @@ static void WriteOperandBundles(BitstreamWriter &Stream, ImmutableCallSite CS,
Record.push_back(C.getOperandBundleTagID(Bundle.getTagName()));
for (auto &Input : Bundle.Inputs)
- PushValueAndType(Input, InstID, Record, VE);
+ pushValueAndType(Input, InstID, Record);
Stream.EmitRecord(bitc::FUNC_CODE_OPERAND_BUNDLE, Record);
Record.clear();
}
}
-/// pushValue - Like PushValueAndType, but where the type of the value is
+/// pushValue - Like pushValueAndType, but where the type of the value is
/// omitted (perhaps it was already encoded in an earlier operand).
-static void pushValue(const Value *V, unsigned InstID,
- SmallVectorImpl<unsigned> &Vals,
- ValueEnumerator &VE) {
+void ModuleBitcodeWriter::pushValue(const Value *V, unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals) {
unsigned ValID = VE.getValueID(V);
Vals.push_back(InstID - ValID);
}
-static void pushValueSigned(const Value *V, unsigned InstID,
- SmallVectorImpl<uint64_t> &Vals,
- ValueEnumerator &VE) {
+void ModuleBitcodeWriter::pushValueSigned(const Value *V, unsigned InstID,
+ SmallVectorImpl<uint64_t> &Vals) {
unsigned ValID = VE.getValueID(V);
int64_t diff = ((int32_t)InstID - (int32_t)ValID);
emitSignedInt64(Vals, diff);
}
/// writeInstruction - Emit an instruction to the specified stream.
-static void WriteInstruction(const Instruction &I, unsigned InstID,
- ValueEnumerator &VE, BitstreamWriter &Stream,
- SmallVectorImpl<unsigned> &Vals) {
+void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
+ unsigned InstID,
+ SmallVectorImpl<unsigned> &Vals) {
unsigned Code = 0;
unsigned AbbrevToUse = 0;
VE.setInstructionID(&I);
@@ -1815,18 +2343,18 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
default:
if (Instruction::isCast(I.getOpcode())) {
Code = bitc::FUNC_CODE_INST_CAST;
- if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ if (!pushValueAndType(I.getOperand(0), InstID, Vals))
AbbrevToUse = FUNCTION_INST_CAST_ABBREV;
Vals.push_back(VE.getTypeID(I.getType()));
- Vals.push_back(GetEncodedCastOpcode(I.getOpcode()));
+ Vals.push_back(getEncodedCastOpcode(I.getOpcode()));
} else {
assert(isa<BinaryOperator>(I) && "Unknown instruction!");
Code = bitc::FUNC_CODE_INST_BINOP;
- if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ if (!pushValueAndType(I.getOperand(0), InstID, Vals))
AbbrevToUse = FUNCTION_INST_BINOP_ABBREV;
- pushValue(I.getOperand(1), InstID, Vals, VE);
- Vals.push_back(GetEncodedBinaryOpcode(I.getOpcode()));
- uint64_t Flags = GetOptimizationFlags(&I);
+ pushValue(I.getOperand(1), InstID, Vals);
+ Vals.push_back(getEncodedBinaryOpcode(I.getOpcode()));
+ uint64_t Flags = getOptimizationFlags(&I);
if (Flags != 0) {
if (AbbrevToUse == FUNCTION_INST_BINOP_ABBREV)
AbbrevToUse = FUNCTION_INST_BINOP_FLAGS_ABBREV;
@@ -1842,55 +2370,55 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(GEPInst.isInBounds());
Vals.push_back(VE.getTypeID(GEPInst.getSourceElementType()));
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
- PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(i), InstID, Vals);
break;
}
case Instruction::ExtractValue: {
Code = bitc::FUNC_CODE_INST_EXTRACTVAL;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
const ExtractValueInst *EVI = cast<ExtractValueInst>(&I);
Vals.append(EVI->idx_begin(), EVI->idx_end());
break;
}
case Instruction::InsertValue: {
Code = bitc::FUNC_CODE_INST_INSERTVAL;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
+ pushValueAndType(I.getOperand(1), InstID, Vals);
const InsertValueInst *IVI = cast<InsertValueInst>(&I);
Vals.append(IVI->idx_begin(), IVI->idx_end());
break;
}
case Instruction::Select:
Code = bitc::FUNC_CODE_INST_VSELECT;
- PushValueAndType(I.getOperand(1), InstID, Vals, VE);
- pushValue(I.getOperand(2), InstID, Vals, VE);
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(1), InstID, Vals);
+ pushValue(I.getOperand(2), InstID, Vals);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
break;
case Instruction::ExtractElement:
Code = bitc::FUNC_CODE_INST_EXTRACTELT;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
+ pushValueAndType(I.getOperand(1), InstID, Vals);
break;
case Instruction::InsertElement:
Code = bitc::FUNC_CODE_INST_INSERTELT;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- pushValue(I.getOperand(1), InstID, Vals, VE);
- PushValueAndType(I.getOperand(2), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
+ pushValue(I.getOperand(1), InstID, Vals);
+ pushValueAndType(I.getOperand(2), InstID, Vals);
break;
case Instruction::ShuffleVector:
Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- pushValue(I.getOperand(1), InstID, Vals, VE);
- pushValue(I.getOperand(2), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
+ pushValue(I.getOperand(1), InstID, Vals);
+ pushValue(I.getOperand(2), InstID, Vals);
break;
case Instruction::ICmp:
case Instruction::FCmp: {
// compare returning Int1Ty or vector of Int1Ty
Code = bitc::FUNC_CODE_INST_CMP2;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- pushValue(I.getOperand(1), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
+ pushValue(I.getOperand(1), InstID, Vals);
Vals.push_back(cast<CmpInst>(I).getPredicate());
- uint64_t Flags = GetOptimizationFlags(&I);
+ uint64_t Flags = getOptimizationFlags(&I);
if (Flags != 0)
Vals.push_back(Flags);
break;
@@ -1903,11 +2431,11 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
if (NumOperands == 0)
AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV;
else if (NumOperands == 1) {
- if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+ if (!pushValueAndType(I.getOperand(0), InstID, Vals))
AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV;
} else {
for (unsigned i = 0, e = NumOperands; i != e; ++i)
- PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(i), InstID, Vals);
}
}
break;
@@ -1918,7 +2446,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(VE.getValueID(II.getSuccessor(0)));
if (II.isConditional()) {
Vals.push_back(VE.getValueID(II.getSuccessor(1)));
- pushValue(II.getCondition(), InstID, Vals, VE);
+ pushValue(II.getCondition(), InstID, Vals);
}
}
break;
@@ -1927,7 +2455,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Code = bitc::FUNC_CODE_INST_SWITCH;
const SwitchInst &SI = cast<SwitchInst>(I);
Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
- pushValue(SI.getCondition(), InstID, Vals, VE);
+ pushValue(SI.getCondition(), InstID, Vals);
Vals.push_back(VE.getValueID(SI.getDefaultDest()));
for (SwitchInst::ConstCaseIt Case : SI.cases()) {
Vals.push_back(VE.getValueID(Case.getCaseValue()));
@@ -1939,7 +2467,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Code = bitc::FUNC_CODE_INST_INDIRECTBR;
Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
// Encode the address operand as relative, but not the basic blocks.
- pushValue(I.getOperand(0), InstID, Vals, VE);
+ pushValue(I.getOperand(0), InstID, Vals);
for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i)
Vals.push_back(VE.getValueID(I.getOperand(i)));
break;
@@ -1950,7 +2478,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
FunctionType *FTy = II->getFunctionType();
if (II->hasOperandBundles())
- WriteOperandBundles(Stream, II, InstID, VE);
+ writeOperandBundles(II, InstID);
Code = bitc::FUNC_CODE_INST_INVOKE;
@@ -1959,28 +2487,28 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(VE.getValueID(II->getNormalDest()));
Vals.push_back(VE.getValueID(II->getUnwindDest()));
Vals.push_back(VE.getTypeID(FTy));
- PushValueAndType(Callee, InstID, Vals, VE);
+ pushValueAndType(Callee, InstID, Vals);
// Emit value #'s for the fixed parameters.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- pushValue(I.getOperand(i), InstID, Vals, VE); // fixed param.
+ pushValue(I.getOperand(i), InstID, Vals); // fixed param.
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
for (unsigned i = FTy->getNumParams(), e = I.getNumOperands()-3;
i != e; ++i)
- PushValueAndType(I.getOperand(i), InstID, Vals, VE); // vararg
+ pushValueAndType(I.getOperand(i), InstID, Vals); // vararg
}
break;
}
case Instruction::Resume:
Code = bitc::FUNC_CODE_INST_RESUME;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
break;
case Instruction::CleanupRet: {
Code = bitc::FUNC_CODE_INST_CLEANUPRET;
const auto &CRI = cast<CleanupReturnInst>(I);
- pushValue(CRI.getCleanupPad(), InstID, Vals, VE);
+ pushValue(CRI.getCleanupPad(), InstID, Vals);
if (CRI.hasUnwindDest())
Vals.push_back(VE.getValueID(CRI.getUnwindDest()));
break;
@@ -1988,7 +2516,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::CatchRet: {
Code = bitc::FUNC_CODE_INST_CATCHRET;
const auto &CRI = cast<CatchReturnInst>(I);
- pushValue(CRI.getCatchPad(), InstID, Vals, VE);
+ pushValue(CRI.getCatchPad(), InstID, Vals);
Vals.push_back(VE.getValueID(CRI.getSuccessor()));
break;
}
@@ -1997,19 +2525,19 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
const auto &FuncletPad = cast<FuncletPadInst>(I);
Code = isa<CatchPadInst>(FuncletPad) ? bitc::FUNC_CODE_INST_CATCHPAD
: bitc::FUNC_CODE_INST_CLEANUPPAD;
- pushValue(FuncletPad.getParentPad(), InstID, Vals, VE);
+ pushValue(FuncletPad.getParentPad(), InstID, Vals);
unsigned NumArgOperands = FuncletPad.getNumArgOperands();
Vals.push_back(NumArgOperands);
for (unsigned Op = 0; Op != NumArgOperands; ++Op)
- PushValueAndType(FuncletPad.getArgOperand(Op), InstID, Vals, VE);
+ pushValueAndType(FuncletPad.getArgOperand(Op), InstID, Vals);
break;
}
case Instruction::CatchSwitch: {
Code = bitc::FUNC_CODE_INST_CATCHSWITCH;
const auto &CatchSwitch = cast<CatchSwitchInst>(I);
- pushValue(CatchSwitch.getParentPad(), InstID, Vals, VE);
+ pushValue(CatchSwitch.getParentPad(), InstID, Vals);
unsigned NumHandlers = CatchSwitch.getNumHandlers();
Vals.push_back(NumHandlers);
@@ -2034,7 +2562,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
SmallVector<uint64_t, 128> Vals64;
Vals64.push_back(VE.getTypeID(PN.getType()));
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- pushValueSigned(PN.getIncomingValue(i), InstID, Vals64, VE);
+ pushValueSigned(PN.getIncomingValue(i), InstID, Vals64);
Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i)));
}
// Emit a Vals64 vector and exit.
@@ -2054,7 +2582,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(LandingPadInst::Catch);
else
Vals.push_back(LandingPadInst::Filter);
- PushValueAndType(LP.getClause(I), InstID, Vals, VE);
+ pushValueAndType(LP.getClause(I), InstID, Vals);
}
break;
}
@@ -2071,8 +2599,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64");
AlignRecord |= AI.isUsedWithInAlloca() << 5;
AlignRecord |= 1 << 6;
- // Reserve bit 7 for SwiftError flag.
- // AlignRecord |= AI.isSwiftError() << 7;
+ AlignRecord |= AI.isSwiftError() << 7;
Vals.push_back(AlignRecord);
break;
}
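// Worked example of the AlignRecord packing above (values assumed, low bits
// taken to be Log2_32(alignment) + 1, matching the load/store encoding below):
// a 16-byte-aligned alloca used with inalloca and marked swifterror packs to
// (Log2_32(16) + 1) | (1 << 5) | (1 << 6) | (1 << 7) = 5 | 32 | 64 | 128 = 229,
// where bit 6 is set unconditionally above (taken here to mark the explicitly
// encoded allocated type).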
@@ -2080,18 +2607,18 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::Load:
if (cast<LoadInst>(I).isAtomic()) {
Code = bitc::FUNC_CODE_INST_LOADATOMIC;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ pushValueAndType(I.getOperand(0), InstID, Vals);
} else {
Code = bitc::FUNC_CODE_INST_LOAD;
- if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE)) // ptr
+ if (!pushValueAndType(I.getOperand(0), InstID, Vals)) // ptr
AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
}
Vals.push_back(VE.getTypeID(I.getType()));
Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
Vals.push_back(cast<LoadInst>(I).isVolatile());
if (cast<LoadInst>(I).isAtomic()) {
- Vals.push_back(GetEncodedOrdering(cast<LoadInst>(I).getOrdering()));
- Vals.push_back(GetEncodedSynchScope(cast<LoadInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering()));
+ Vals.push_back(getEncodedSynchScope(cast<LoadInst>(I).getSynchScope()));
}
break;
case Instruction::Store:
@@ -2099,57 +2626,57 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Code = bitc::FUNC_CODE_INST_STOREATOMIC;
else
Code = bitc::FUNC_CODE_INST_STORE;
- PushValueAndType(I.getOperand(1), InstID, Vals, VE); // ptrty + ptr
- PushValueAndType(I.getOperand(0), InstID, Vals, VE); // valty + val
+ pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr
+ pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val
Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
Vals.push_back(cast<StoreInst>(I).isVolatile());
if (cast<StoreInst>(I).isAtomic()) {
- Vals.push_back(GetEncodedOrdering(cast<StoreInst>(I).getOrdering()));
- Vals.push_back(GetEncodedSynchScope(cast<StoreInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering()));
+ Vals.push_back(getEncodedSynchScope(cast<StoreInst>(I).getSynchScope()));
}
break;
case Instruction::AtomicCmpXchg:
Code = bitc::FUNC_CODE_INST_CMPXCHG;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
- PushValueAndType(I.getOperand(1), InstID, Vals, VE); // cmp.
- pushValue(I.getOperand(2), InstID, Vals, VE); // newval.
+ pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr
+ pushValueAndType(I.getOperand(1), InstID, Vals); // cmp.
+ pushValue(I.getOperand(2), InstID, Vals); // newval.
Vals.push_back(cast<AtomicCmpXchgInst>(I).isVolatile());
- Vals.push_back(GetEncodedOrdering(
- cast<AtomicCmpXchgInst>(I).getSuccessOrdering()));
- Vals.push_back(GetEncodedSynchScope(
- cast<AtomicCmpXchgInst>(I).getSynchScope()));
- Vals.push_back(GetEncodedOrdering(
- cast<AtomicCmpXchgInst>(I).getFailureOrdering()));
+ Vals.push_back(
+ getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getSuccessOrdering()));
+ Vals.push_back(
+ getEncodedSynchScope(cast<AtomicCmpXchgInst>(I).getSynchScope()));
+ Vals.push_back(
+ getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getFailureOrdering()));
Vals.push_back(cast<AtomicCmpXchgInst>(I).isWeak());
break;
case Instruction::AtomicRMW:
Code = bitc::FUNC_CODE_INST_ATOMICRMW;
- PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
- pushValue(I.getOperand(1), InstID, Vals, VE); // val.
- Vals.push_back(GetEncodedRMWOperation(
- cast<AtomicRMWInst>(I).getOperation()));
+ pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr
+ pushValue(I.getOperand(1), InstID, Vals); // val.
+ Vals.push_back(
+ getEncodedRMWOperation(cast<AtomicRMWInst>(I).getOperation()));
Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
- Vals.push_back(GetEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
- Vals.push_back(GetEncodedSynchScope(
- cast<AtomicRMWInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
+ Vals.push_back(
+ getEncodedSynchScope(cast<AtomicRMWInst>(I).getSynchScope()));
break;
case Instruction::Fence:
Code = bitc::FUNC_CODE_INST_FENCE;
- Vals.push_back(GetEncodedOrdering(cast<FenceInst>(I).getOrdering()));
- Vals.push_back(GetEncodedSynchScope(cast<FenceInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedOrdering(cast<FenceInst>(I).getOrdering()));
+ Vals.push_back(getEncodedSynchScope(cast<FenceInst>(I).getSynchScope()));
break;
case Instruction::Call: {
const CallInst &CI = cast<CallInst>(I);
FunctionType *FTy = CI.getFunctionType();
if (CI.hasOperandBundles())
- WriteOperandBundles(Stream, &CI, InstID, VE);
+ writeOperandBundles(&CI, InstID);
Code = bitc::FUNC_CODE_INST_CALL;
Vals.push_back(VE.getAttributeID(CI.getAttributes()));
- unsigned Flags = GetOptimizationFlags(&I);
+ unsigned Flags = getOptimizationFlags(&I);
Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV |
unsigned(CI.isTailCall()) << bitc::CALL_TAIL |
unsigned(CI.isMustTailCall()) << bitc::CALL_MUSTTAIL |
@@ -2160,7 +2687,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(Flags);
Vals.push_back(VE.getTypeID(FTy));
- PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee
+ pushValueAndType(CI.getCalledValue(), InstID, Vals); // Callee
// Emit value #'s for the fixed parameters.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
@@ -2168,21 +2695,21 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
if (FTy->getParamType(i)->isLabelTy())
Vals.push_back(VE.getValueID(CI.getArgOperand(i)));
else
- pushValue(CI.getArgOperand(i), InstID, Vals, VE); // fixed param.
+ pushValue(CI.getArgOperand(i), InstID, Vals); // fixed param.
}
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands();
i != e; ++i)
- PushValueAndType(CI.getArgOperand(i), InstID, Vals, VE); // varargs
+ pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs
}
break;
}
case Instruction::VAArg:
Code = bitc::FUNC_CODE_INST_VAARG;
Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty
- pushValue(I.getOperand(0), InstID, Vals, VE); // valist.
+ pushValue(I.getOperand(0), InstID, Vals); // valist.
Vals.push_back(VE.getTypeID(I.getType())); // restype.
break;
}
@@ -2191,49 +2718,27 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.clear();
}
-enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 };
-
-/// Determine the encoding to use for the given string name and length.
-static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) {
- bool isChar6 = true;
- for (const char *C = Str, *E = C + StrLen; C != E; ++C) {
- if (isChar6)
- isChar6 = BitCodeAbbrevOp::isChar6(*C);
- if ((unsigned char)*C & 128)
- // don't bother scanning the rest.
- return SE_Fixed8;
- }
- if (isChar6)
- return SE_Char6;
- else
- return SE_Fixed7;
-}
-
-/// Emit names for globals/functions etc. The VSTOffsetPlaceholder,
-/// BitcodeStartBit and FunctionIndex are only passed for the module-level
-/// VST, where we are including a function bitcode index and need to
-/// backpatch the VST forward declaration record.
-static void WriteValueSymbolTable(
- const ValueSymbolTable &VST, const ValueEnumerator &VE,
- BitstreamWriter &Stream, uint64_t VSTOffsetPlaceholder = 0,
- uint64_t BitcodeStartBit = 0,
- DenseMap<const Function *, std::unique_ptr<FunctionInfo>> *FunctionIndex =
- nullptr) {
+/// Emit names for globals/functions etc. \p IsModuleLevel is true when
+/// we are writing the module-level VST, where we are including a function
+/// bitcode index and need to backpatch the VST forward declaration record.
+void ModuleBitcodeWriter::writeValueSymbolTable(
+ const ValueSymbolTable &VST, bool IsModuleLevel,
+ DenseMap<const Function *, uint64_t> *FunctionToBitcodeIndex) {
if (VST.empty()) {
- // WriteValueSymbolTableForwardDecl should have returned early as
+ // writeValueSymbolTableForwardDecl should have returned early as
// well. Ensure this handling remains in sync by asserting that
// the placeholder offset is not set.
- assert(VSTOffsetPlaceholder == 0);
+ assert(!IsModuleLevel || !hasVSTOffsetPlaceholder());
return;
}
- if (VSTOffsetPlaceholder > 0) {
+ if (IsModuleLevel && hasVSTOffsetPlaceholder()) {
// Get the offset of the VST we are writing, and backpatch it into
// the VST forward declaration record.
uint64_t VSTOffset = Stream.GetCurrentBitNo();
// The BitcodeStartBit was the stream offset of the actual bitcode
// (e.g. excluding any initial darwin header).
- VSTOffset -= BitcodeStartBit;
+ VSTOffset -= bitcodeStartBit();
assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned");
Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32);
}
@@ -2245,8 +2750,9 @@ static void WriteValueSymbolTable(
unsigned FnEntry8BitAbbrev;
unsigned FnEntry7BitAbbrev;
unsigned FnEntry6BitAbbrev;
- if (VSTOffsetPlaceholder > 0) {
- // 8-bit fixed-width VST_FNENTRY function strings.
+ unsigned GUIDEntryAbbrev;
+ if (IsModuleLevel && hasVSTOffsetPlaceholder()) {
+ // 8-bit fixed-width VST_CODE_FNENTRY function strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
@@ -2255,7 +2761,7 @@ static void WriteValueSymbolTable(
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv);
- // 7-bit fixed width VST_FNENTRY function strings.
+ // 7-bit fixed width VST_CODE_FNENTRY function strings.
Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
@@ -2264,7 +2770,7 @@ static void WriteValueSymbolTable(
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv);
- // 6-bit char6 VST_FNENTRY function strings.
+ // 6-bit char6 VST_CODE_FNENTRY function strings.
Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
@@ -2272,11 +2778,19 @@ static void WriteValueSymbolTable(
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // FIXME: Change the name of this record as it is now used by
+ // the per-module index as well.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid
+ GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv);
}
// FIXME: Set up the abbrev, we know how many values there are!
// FIXME: We know if the type names can use 7-bit ascii.
- SmallVector<unsigned, 64> NameVals;
+ SmallVector<uint64_t, 64> NameVals;
for (const ValueName &Name : VST) {
// Figure out the encoding to use for the name.
@@ -2295,9 +2809,9 @@ static void WriteValueSymbolTable(
F = dyn_cast<Function>(GA->getBaseObject());
}
- // VST_ENTRY: [valueid, namechar x N]
- // VST_FNENTRY: [valueid, funcoffset, namechar x N]
- // VST_BBENTRY: [bbid, namechar x N]
+ // VST_CODE_ENTRY: [valueid, namechar x N]
+ // VST_CODE_FNENTRY: [valueid, funcoffset, namechar x N]
+ // VST_CODE_BBENTRY: [bbid, namechar x N]
unsigned Code;
if (isa<BasicBlock>(Name.getValue())) {
Code = bitc::VST_CODE_BBENTRY;
@@ -2307,14 +2821,12 @@ static void WriteValueSymbolTable(
// Must be the module-level VST, where we pass in the Index and
// have a VSTOffsetPlaceholder. The function-level VST should not
// contain any Function symbols.
- assert(FunctionIndex);
- assert(VSTOffsetPlaceholder > 0);
+ assert(FunctionToBitcodeIndex);
+ assert(hasVSTOffsetPlaceholder());
// Save the word offset of the function (from the start of the
// actual bitcode written to the stream).
- assert(FunctionIndex->count(F) == 1);
- uint64_t BitcodeIndex =
- (*FunctionIndex)[F]->bitcodeIndex() - BitcodeStartBit;
+ uint64_t BitcodeIndex = (*FunctionToBitcodeIndex)[F] - bitcodeStartBit();
assert((BitcodeIndex & 31) == 0 && "function block not 32-bit aligned");
NameVals.push_back(BitcodeIndex / 32);
@@ -2339,71 +2851,51 @@ static void WriteValueSymbolTable(
Stream.EmitRecord(Code, NameVals, AbbrevToUse);
NameVals.clear();
}
+ // Emit any GUID valueIDs created for indirect call edges into the
+ // module-level VST.
+ if (IsModuleLevel && hasVSTOffsetPlaceholder())
+ for (const auto &GI : valueIds()) {
+ NameVals.push_back(GI.second);
+ NameVals.push_back(GI.first);
+ Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals,
+ GUIDEntryAbbrev);
+ NameVals.clear();
+ }
Stream.ExitBlock();
}
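// Worked example of the module-level VST entries above (values assumed): a
// function named "main" with value ID 7 whose FUNCTION_BLOCK begins 2048 bits
// past the start of the bitcode is emitted as
//   VST_CODE_FNENTRY: [7, 2048 / 32, 'm', 'a', 'i', 'n']
// and the 32-bit VSTOffsetPlaceholder written earlier is backpatched with this
// VST block's own word offset so a reader can seek to the table directly.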
/// Emit function names and summary offsets for the combined index
/// used by ThinLTO.
-static void WriteCombinedValueSymbolTable(const FunctionInfoIndex &Index,
- BitstreamWriter &Stream) {
+void IndexBitcodeWriter::writeCombinedValueSymbolTable() {
+ assert(hasVSTOffsetPlaceholder() && "Expected non-zero VSTOffsetPlaceholder");
+ // Get the offset of the VST we are writing, and backpatch it into
+ // the VST forward declaration record.
+ uint64_t VSTOffset = Stream.GetCurrentBitNo();
+ assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned");
+ Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32);
+
Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
- // 8-bit fixed-width VST_COMBINED_FNENTRY function strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
- unsigned FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv);
-
- // 7-bit fixed width VST_COMBINED_FNENTRY function strings.
- Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
- unsigned FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv);
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid
+ unsigned EntryAbbrev = Stream.EmitAbbrev(Abbv);
- // 6-bit char6 VST_COMBINED_FNENTRY function strings.
- Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
- unsigned FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv);
-
- // FIXME: We know if the type names can use 7-bit ascii.
- SmallVector<unsigned, 64> NameVals;
+ SmallVector<uint64_t, 64> NameVals;
+ for (const auto &GVI : valueIds()) {
+ // VST_CODE_COMBINED_ENTRY: [valueid, refguid]
+ NameVals.push_back(GVI.second);
+ NameVals.push_back(GVI.first);
- for (const auto &FII : Index) {
- for (const auto &FI : FII.getValue()) {
- NameVals.push_back(FI->bitcodeIndex());
-
- StringRef FuncName = FII.first();
-
- // Figure out the encoding to use for the name.
- StringEncoding Bits = getStringEncoding(FuncName.data(), FuncName.size());
-
- // VST_COMBINED_FNENTRY: [funcsumoffset, namechar x N]
- unsigned AbbrevToUse = FnEntry8BitAbbrev;
- if (Bits == SE_Char6)
- AbbrevToUse = FnEntry6BitAbbrev;
- else if (Bits == SE_Fixed7)
- AbbrevToUse = FnEntry7BitAbbrev;
-
- for (const auto P : FuncName)
- NameVals.push_back((unsigned char)P);
-
- // Emit the finished record.
- Stream.EmitRecord(bitc::VST_CODE_COMBINED_FNENTRY, NameVals, AbbrevToUse);
- NameVals.clear();
- }
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals, EntryAbbrev);
+ NameVals.clear();
}
Stream.ExitBlock();
}
-static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeUseList(UseListOrder &&Order) {
assert(Order.Shuffle.size() >= 2 && "Shuffle too small");
unsigned Code;
if (isa<BasicBlock>(Order.V))
@@ -2416,8 +2908,7 @@ static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order,
Stream.EmitRecord(Code, Record);
}
-static void WriteUseListBlock(const Function *F, ValueEnumerator &VE,
- BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeUseListBlock(const Function *F) {
assert(VE.shouldPreserveUseListOrder() &&
"Expected to be preserving use-list order");
@@ -2430,39 +2921,19 @@ static void WriteUseListBlock(const Function *F, ValueEnumerator &VE,
Stream.EnterSubblock(bitc::USELIST_BLOCK_ID, 3);
while (hasMore()) {
- WriteUseList(VE, std::move(VE.UseListOrders.back()), Stream);
+ writeUseList(std::move(VE.UseListOrders.back()));
VE.UseListOrders.pop_back();
}
Stream.ExitBlock();
}
-/// \brief Save information for the given function into the function index.
-///
-/// At a minimum this saves the bitcode index of the function record that
-/// was just written. However, if we are emitting function summary information,
-/// for example for ThinLTO, then a \a FunctionSummary object is created
-/// to hold the provided summary information.
-static void SaveFunctionInfo(
- const Function &F,
- DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex,
- unsigned NumInsts, uint64_t BitcodeIndex, bool EmitFunctionSummary) {
- std::unique_ptr<FunctionSummary> FuncSummary;
- if (EmitFunctionSummary) {
- FuncSummary = llvm::make_unique<FunctionSummary>(NumInsts);
- FuncSummary->setLocalFunction(F.hasLocalLinkage());
- }
- FunctionIndex[&F] =
- llvm::make_unique<FunctionInfo>(BitcodeIndex, std::move(FuncSummary));
-}
-
/// Emit a function body to the module stream.
-static void WriteFunction(
- const Function &F, ValueEnumerator &VE, BitstreamWriter &Stream,
- DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex,
- bool EmitFunctionSummary) {
+void ModuleBitcodeWriter::writeFunction(
+ const Function &F,
+ DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) {
// Save the bitcode index of the start of this function block for recording
// in the VST.
- uint64_t BitcodeIndex = Stream.GetCurrentBitNo();
+ FunctionToBitcodeIndex[&F] = Stream.GetCurrentBitNo();
Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4);
VE.incorporateFunction(F);
@@ -2478,10 +2949,10 @@ static void WriteFunction(
// If there are function-local constants, emit them now.
unsigned CstStart, CstEnd;
VE.getFunctionConstantRange(CstStart, CstEnd);
- WriteConstants(CstStart, CstEnd, VE, Stream, false);
+ writeConstants(CstStart, CstEnd, false);
// If there is function-local metadata, emit it now.
- WriteFunctionLocalMetadata(F, VE, Stream);
+ writeFunctionMetadata(F);
// Keep a running idea of what the instruction ID is.
unsigned InstID = CstEnd;
@@ -2489,16 +2960,11 @@ static void WriteFunction(
bool NeedsMetadataAttachment = F.hasMetadata();
DILocation *LastDL = nullptr;
- unsigned NumInsts = 0;
-
// Finally, emit all the instructions, in order.
for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
I != E; ++I) {
- WriteInstruction(*I, InstID, VE, Stream, Vals);
-
- if (!isa<DbgInfoIntrinsic>(I))
- ++NumInsts;
+ writeInstruction(*I, InstID, Vals);
if (!I->getType()->isVoidTy())
++InstID;
@@ -2528,65 +2994,62 @@ static void WriteFunction(
}
// Emit names for all the instructions etc.
- WriteValueSymbolTable(F.getValueSymbolTable(), VE, Stream);
+ writeValueSymbolTable(F.getValueSymbolTable());
if (NeedsMetadataAttachment)
- WriteMetadataAttachment(F, VE, Stream);
+ writeFunctionMetadataAttachment(F);
if (VE.shouldPreserveUseListOrder())
- WriteUseListBlock(&F, VE, Stream);
+ writeUseListBlock(&F);
VE.purgeFunction();
Stream.ExitBlock();
-
- SaveFunctionInfo(F, FunctionIndex, NumInsts, BitcodeIndex,
- EmitFunctionSummary);
}
// Emit blockinfo, which defines the standard abbreviations etc.
-static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeBlockInfo() {
// We only want to emit block info records for blocks that have multiple
// instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK.
// Other blocks can define their abbrevs inline.
Stream.EnterBlockInfoBlock(2);
- { // 8-bit fixed-width VST_ENTRY/VST_BBENTRY strings.
+ { // 8-bit fixed-width VST_CODE_ENTRY/VST_CODE_BBENTRY strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
- if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
- Abbv) != VST_ENTRY_8_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID, Abbv) !=
+ VST_ENTRY_8_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
- { // 7-bit fixed width VST_ENTRY strings.
+ { // 7-bit fixed width VST_CODE_ENTRY strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
- if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
- Abbv) != VST_ENTRY_7_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID, Abbv) !=
+ VST_ENTRY_7_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
- { // 6-bit char6 VST_ENTRY strings.
+ { // 6-bit char6 VST_CODE_ENTRY strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
- if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
- Abbv) != VST_ENTRY_6_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID, Abbv) !=
+ VST_ENTRY_6_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
- { // 6-bit char6 VST_BBENTRY strings.
+ { // 6-bit char6 VST_CODE_BBENTRY strings.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
- if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
- Abbv) != VST_BBENTRY_6_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID, Abbv) !=
+ VST_BBENTRY_6_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
@@ -2597,8 +3060,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
VE.computeBitsRequiredForTypeIndicies()));
- if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
- Abbv) != CONSTANTS_SETTYPE_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
+ CONSTANTS_SETTYPE_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
@@ -2606,8 +3069,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
- if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
- Abbv) != CONSTANTS_INTEGER_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
+ CONSTANTS_INTEGER_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
@@ -2619,15 +3082,15 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
VE.computeBitsRequiredForTypeIndicies()));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
- if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
- Abbv) != CONSTANTS_CE_CAST_Abbrev)
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
+ CONSTANTS_CE_CAST_Abbrev)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // NULL abbrev for CONSTANTS_BLOCK.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL));
- if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
- Abbv) != CONSTANTS_NULL_Abbrev)
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
+ CONSTANTS_NULL_Abbrev)
llvm_unreachable("Unexpected abbrev ordering!");
}
@@ -2641,8 +3104,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
VE.computeBitsRequiredForTypeIndicies()));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_LOAD_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_LOAD_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_BINOP abbrev for FUNCTION_BLOCK.
@@ -2651,8 +3114,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_BINOP_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_BINOP_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK.
@@ -2662,8 +3125,8 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); // flags
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_BINOP_FLAGS_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_BINOP_FLAGS_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_CAST abbrev for FUNCTION_BLOCK.
@@ -2673,31 +3136,31 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty
VE.computeBitsRequiredForTypeIndicies()));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_CAST_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_CAST_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_RET abbrev for FUNCTION_BLOCK.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_RET_VOID_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_RET_VOID_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_RET abbrev for FUNCTION_BLOCK.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_RET_VAL_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_RET_VAL_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{ // INST_UNREACHABLE abbrev for FUNCTION_BLOCK.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE));
- if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
- Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV)
+ if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+ FUNCTION_INST_UNREACHABLE_ABBREV)
llvm_unreachable("Unexpected abbrev ordering!");
}
{
@@ -2718,8 +3181,7 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
/// Write the module path strings, currently only used when generating
/// a combined index file.
-static void WriteModStrings(const FunctionInfoIndex &I,
- BitstreamWriter &Stream) {
+void IndexBitcodeWriter::writeModStrings() {
Stream.EnterSubblock(bitc::MODULE_STRTAB_BLOCK_ID, 3);
// TODO: See which abbrev sizes we actually need to emit
@@ -2748,8 +3210,20 @@ static void WriteModStrings(const FunctionInfoIndex &I,
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv);
- SmallVector<unsigned, 64> NameVals;
- for (const StringMapEntry<uint64_t> &MPSE : I.modPathStringEntries()) {
+  // Module Hash: a 160-bit SHA1, optionally emitted after each MST_CODE_ENTRY.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_HASH));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+ unsigned AbbrevHash = Stream.EmitAbbrev(Abbv);
+
+ SmallVector<unsigned, 64> Vals;
+ for (const auto &MPSE : Index.modulePaths()) {
+ if (!doIncludeModule(MPSE.getKey()))
+ continue;
StringEncoding Bits =
getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size());
unsigned AbbrevToUse = Abbrev8Bit;
@@ -2758,116 +3232,355 @@ static void WriteModStrings(const FunctionInfoIndex &I,
else if (Bits == SE_Fixed7)
AbbrevToUse = Abbrev7Bit;
- NameVals.push_back(MPSE.getValue());
+ Vals.push_back(MPSE.getValue().first);
for (const auto P : MPSE.getKey())
- NameVals.push_back((unsigned char)P);
+ Vals.push_back((unsigned char)P);
// Emit the finished record.
- Stream.EmitRecord(bitc::MST_CODE_ENTRY, NameVals, AbbrevToUse);
- NameVals.clear();
+ Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse);
+
+ Vals.clear();
+ // Emit an optional hash for the module now
+ auto &Hash = MPSE.getValue().second;
+    bool AllZero = true; // Do not emit the hash if it is all zeros.
+ for (auto Val : Hash) {
+ if (Val)
+ AllZero = false;
+ Vals.push_back(Val);
+ }
+ if (!AllZero) {
+ // Emit the hash record.
+ Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash);
+ }
+
+ Vals.clear();
}
Stream.ExitBlock();
}
// Helper to emit a single function summary record.
-static void WritePerModuleFunctionSummaryRecord(
- SmallVector<unsigned, 64> &NameVals, FunctionSummary *FS, unsigned ValueID,
- unsigned FSAbbrev, BitstreamWriter &Stream) {
- assert(FS);
+void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
+ SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
+ unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
+ const Function &F) {
NameVals.push_back(ValueID);
- NameVals.push_back(FS->isLocalFunction());
+
+ FunctionSummary *FS = cast<FunctionSummary>(Summary);
+ NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
NameVals.push_back(FS->instCount());
+ NameVals.push_back(FS->refs().size());
+
+ unsigned SizeBeforeRefs = NameVals.size();
+ for (auto &RI : FS->refs())
+ NameVals.push_back(VE.getValueID(RI.getValue()));
+  // Sort the refs for deterministic output; the vector returned by FS->refs()
+  // has been initialized from a DenseSet.
+ std::sort(NameVals.begin() + SizeBeforeRefs, NameVals.end());
+
+ std::vector<FunctionSummary::EdgeTy> Calls = FS->calls();
+ std::sort(Calls.begin(), Calls.end(),
+ [this](const FunctionSummary::EdgeTy &L,
+ const FunctionSummary::EdgeTy &R) {
+ return getValueId(L.first) < getValueId(R.first);
+ });
+ bool HasProfileData = F.getEntryCount().hasValue();
+ for (auto &ECI : Calls) {
+ NameVals.push_back(getValueId(ECI.first));
+ assert(ECI.second.CallsiteCount > 0 && "Expected at least one callsite");
+ NameVals.push_back(ECI.second.CallsiteCount);
+ if (HasProfileData)
+ NameVals.push_back(ECI.second.ProfileCount);
+ }
+
+ unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev);
+ unsigned Code =
+ (HasProfileData ? bitc::FS_PERMODULE_PROFILE : bitc::FS_PERMODULE);
// Emit the finished record.
- Stream.EmitRecord(bitc::FS_CODE_PERMODULE_ENTRY, NameVals, FSAbbrev);
+ Stream.EmitRecord(Code, NameVals, FSAbbrev);
NameVals.clear();
}
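+// For illustration (hypothetical values): a function with value id 7, encoded
+// flags F, 42 instructions, one reference (value id 3) and two callees (value
+// ids 5 and 9 with callsite counts 1 and 2) yields, without profile data:
+//   FS_PERMODULE [7, F, 42, 1, 3, 5, 1, 9, 2]
+// With profile data, each callee entry also carries its profile count and the
+// record code becomes FS_PERMODULE_PROFILE.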
-/// Emit the per-module function summary section alongside the rest of
+// Collect the global value references in the given variable's initializer,
+// and emit them in a summary record.
+void ModuleBitcodeWriter::writeModuleLevelReferences(
+ const GlobalVariable &V, SmallVector<uint64_t, 64> &NameVals,
+ unsigned FSModRefsAbbrev) {
+ // Only interested in recording variable defs in the summary.
+ if (V.isDeclaration())
+ return;
+ NameVals.push_back(VE.getValueID(&V));
+ NameVals.push_back(getEncodedGVSummaryFlags(V));
+ auto *Summary = Index->getGlobalValueSummary(V);
+ GlobalVarSummary *VS = cast<GlobalVarSummary>(Summary);
+
+ unsigned SizeBeforeRefs = NameVals.size();
+ for (auto &RI : VS->refs())
+ NameVals.push_back(VE.getValueID(RI.getValue()));
+  // Sort the refs for deterministic output; the vector returned by VS->refs()
+  // has been initialized from a DenseSet.
+ std::sort(NameVals.begin() + SizeBeforeRefs, NameVals.end());
+
+ Stream.EmitRecord(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS, NameVals,
+ FSModRefsAbbrev);
+ NameVals.clear();
+}
+
+// Current version for the summary.
+// This is bumped whenever we introduce changes in the way some records are
+// interpreted, e.g. flags.
+static const uint64_t INDEX_VERSION = 1;
+
+/// Emit the per-module summary section alongside the rest of
/// the module's bitcode.
-static void WritePerModuleFunctionSummary(
- DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex,
- const Module *M, const ValueEnumerator &VE, BitstreamWriter &Stream) {
- Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3);
+void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
+ if (Index->begin() == Index->end())
+ return;
- // Abbrev for FS_CODE_PERMODULE_ENTRY.
+ Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 4);
+
+ Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});
+
+ // Abbrev for FS_PERMODULE.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_PERMODULE_ENTRY));
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // islocal
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
- unsigned FSAbbrev = Stream.EmitAbbrev(Abbv);
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+ // numrefs x valueid, n x (valueid, callsitecount)
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for FS_PERMODULE_PROFILE.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+ // numrefs x valueid, n x (valueid, callsitecount, profilecount)
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for FS_PERMODULE_GLOBALVAR_INIT_REFS.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv);
- SmallVector<unsigned, 64> NameVals;
- for (auto &I : FunctionIndex) {
- // Skip anonymous functions. We will emit a function summary for
- // any aliases below.
- if (!I.first->hasName())
+ // Abbrev for FS_ALIAS.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv);
+
+ SmallVector<uint64_t, 64> NameVals;
+ // Iterate over the list of functions instead of the Index to
+ // ensure the ordering is stable.
+ for (const Function &F : M) {
+ if (F.isDeclaration())
continue;
+    // Summary emission does not support anonymous functions; they have to be
+    // renamed using the anonymous function renaming pass.
+ if (!F.hasName())
+ report_fatal_error("Unexpected anonymous function when writing summary");
- WritePerModuleFunctionSummaryRecord(
- NameVals, I.second->functionSummary(),
- VE.getValueID(M->getValueSymbolTable().lookup(I.first->getName())),
- FSAbbrev, Stream);
+ auto *Summary = Index->getGlobalValueSummary(F);
+ writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
+ FSCallsAbbrev, FSCallsProfileAbbrev, F);
}
- for (const GlobalAlias &A : M->aliases()) {
- if (!A.getBaseObject())
- continue;
- const Function *F = dyn_cast<Function>(A.getBaseObject());
- if (!F || F->isDeclaration())
- continue;
+ // Capture references from GlobalVariable initializers, which are outside
+ // of a function scope.
+ for (const GlobalVariable &G : M.globals())
+ writeModuleLevelReferences(G, NameVals, FSModRefsAbbrev);
- assert(FunctionIndex.count(F) == 1);
- WritePerModuleFunctionSummaryRecord(
- NameVals, FunctionIndex[F]->functionSummary(),
- VE.getValueID(M->getValueSymbolTable().lookup(A.getName())), FSAbbrev,
- Stream);
+ for (const GlobalAlias &A : M.aliases()) {
+ auto *Aliasee = A.getBaseObject();
+ if (!Aliasee->hasName())
+      // Nameless functions don't have an entry in the summary, so skip this alias.
+ continue;
+ auto AliasId = VE.getValueID(&A);
+ auto AliaseeId = VE.getValueID(Aliasee);
+ NameVals.push_back(AliasId);
+ NameVals.push_back(getEncodedGVSummaryFlags(A));
+ NameVals.push_back(AliaseeId);
+ Stream.EmitRecord(bitc::FS_ALIAS, NameVals, FSAliasAbbrev);
+ NameVals.clear();
}
Stream.ExitBlock();
}
-/// Emit the combined function summary section into the combined index
-/// file.
-static void WriteCombinedFunctionSummary(const FunctionInfoIndex &I,
- BitstreamWriter &Stream) {
- Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3);
+/// Emit the combined summary section into the combined index file.
+void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
+ Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
+ Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});
- // Abbrev for FS_CODE_COMBINED_ENTRY.
+ // Abbrev for FS_COMBINED.
BitCodeAbbrev *Abbv = new BitCodeAbbrev();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_COMBINED_ENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
- unsigned FSAbbrev = Stream.EmitAbbrev(Abbv);
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+ // numrefs x valueid, n x (valueid, callsitecount)
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for FS_COMBINED_PROFILE.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+ // numrefs x valueid, n x (valueid, callsitecount, profilecount)
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // Abbrev for FS_COMBINED_ALIAS.
+ Abbv = new BitCodeAbbrev();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+ unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv);
+
+ // The aliases are emitted as a post-pass, and will point to the value
+ // id of the aliasee. Save them in a vector for post-processing.
+ SmallVector<AliasSummary *, 64> Aliases;
- SmallVector<unsigned, 64> NameVals;
- for (const auto &FII : I) {
- for (auto &FI : FII.getValue()) {
- FunctionSummary *FS = FI->functionSummary();
- assert(FS);
+ // Save the value id for each summary for alias emission.
+ DenseMap<const GlobalValueSummary *, unsigned> SummaryToValueIdMap;
- NameVals.push_back(I.getModuleId(FS->modulePath()));
- NameVals.push_back(FS->instCount());
+ SmallVector<uint64_t, 64> NameVals;
- // Record the starting offset of this summary entry for use
- // in the VST entry. Add the current code size since the
- // reader will invoke readRecord after the abbrev id read.
- FI->setBitcodeIndex(Stream.GetCurrentBitNo() + Stream.GetAbbrevIDWidth());
+ // For local linkage, we also emit the original name separately
+ // immediately after the record.
+ auto MaybeEmitOriginalName = [&](GlobalValueSummary &S) {
+ if (!GlobalValue::isLocalLinkage(S.linkage()))
+ return;
+ NameVals.push_back(S.getOriginalName());
+ Stream.EmitRecord(bitc::FS_COMBINED_ORIGINAL_NAME, NameVals);
+ NameVals.clear();
+ };
+
+ for (const auto &I : *this) {
+ GlobalValueSummary *S = I.second;
+ assert(S);
+
+ assert(hasValueId(I.first));
+ unsigned ValueId = getValueId(I.first);
+ SummaryToValueIdMap[S] = ValueId;
+
+ if (auto *AS = dyn_cast<AliasSummary>(S)) {
+      // Aliases are processed as a post-pass because the reader wants all
+      // globals to be loaded first.
+ Aliases.push_back(AS);
+ continue;
+ }
+
+ if (auto *VS = dyn_cast<GlobalVarSummary>(S)) {
+ NameVals.push_back(ValueId);
+ NameVals.push_back(Index.getModuleId(VS->modulePath()));
+ NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
+ for (auto &RI : VS->refs()) {
+ NameVals.push_back(getValueId(RI.getGUID()));
+ }
// Emit the finished record.
- Stream.EmitRecord(bitc::FS_CODE_COMBINED_ENTRY, NameVals, FSAbbrev);
+ Stream.EmitRecord(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS, NameVals,
+ FSModRefsAbbrev);
NameVals.clear();
+ MaybeEmitOriginalName(*S);
+ continue;
+ }
+
+ auto *FS = cast<FunctionSummary>(S);
+ NameVals.push_back(ValueId);
+ NameVals.push_back(Index.getModuleId(FS->modulePath()));
+ NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
+ NameVals.push_back(FS->instCount());
+ NameVals.push_back(FS->refs().size());
+
+ for (auto &RI : FS->refs()) {
+ NameVals.push_back(getValueId(RI.getGUID()));
}
+
+ bool HasProfileData = false;
+ for (auto &EI : FS->calls()) {
+ HasProfileData |= EI.second.ProfileCount != 0;
+ if (HasProfileData)
+ break;
+ }
+
+ for (auto &EI : FS->calls()) {
+ // If this GUID doesn't have a value id, it doesn't have a function
+ // summary and we don't need to record any calls to it.
+ if (!hasValueId(EI.first.getGUID()))
+ continue;
+ NameVals.push_back(getValueId(EI.first.getGUID()));
+ assert(EI.second.CallsiteCount > 0 && "Expected at least one callsite");
+ NameVals.push_back(EI.second.CallsiteCount);
+ if (HasProfileData)
+ NameVals.push_back(EI.second.ProfileCount);
+ }
+
+ unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev);
+ unsigned Code =
+ (HasProfileData ? bitc::FS_COMBINED_PROFILE : bitc::FS_COMBINED);
+
+ // Emit the finished record.
+ Stream.EmitRecord(Code, NameVals, FSAbbrev);
+ NameVals.clear();
+ MaybeEmitOriginalName(*S);
+ }
+
+ for (auto *AS : Aliases) {
+ auto AliasValueId = SummaryToValueIdMap[AS];
+ assert(AliasValueId);
+ NameVals.push_back(AliasValueId);
+ NameVals.push_back(Index.getModuleId(AS->modulePath()));
+ NameVals.push_back(getEncodedGVSummaryFlags(AS->flags()));
+ auto AliaseeValueId = SummaryToValueIdMap[&AS->getAliasee()];
+ assert(AliaseeValueId);
+ NameVals.push_back(AliaseeValueId);
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::FS_COMBINED_ALIAS, NameVals, FSAliasAbbrev);
+ NameVals.clear();
+ MaybeEmitOriginalName(*AS);
}
Stream.ExitBlock();
}
-// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the
-// current llvm version, and a record for the epoch number.
-static void WriteIdentificationBlock(const Module *M, BitstreamWriter &Stream) {
+void ModuleBitcodeWriter::writeIdentificationBlock() {
Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5);
// Write the "user readable" string identifying the bitcode producer
@@ -2876,8 +3589,8 @@ static void WriteIdentificationBlock(const Module *M, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
auto StringAbbrev = Stream.EmitAbbrev(Abbv);
- WriteStringRecord(bitc::IDENTIFICATION_CODE_STRING,
- "LLVM" LLVM_VERSION_STRING, StringAbbrev, Stream);
+ writeStringRecord(bitc::IDENTIFICATION_CODE_STRING,
+ "LLVM" LLVM_VERSION_STRING, StringAbbrev);
// Write the epoch version
Abbv = new BitCodeAbbrev();
@@ -2889,71 +3602,114 @@ static void WriteIdentificationBlock(const Module *M, BitstreamWriter &Stream) {
Stream.ExitBlock();
}
-/// WriteModule - Emit the specified module to the bitstream.
-static void WriteModule(const Module *M, BitstreamWriter &Stream,
- bool ShouldPreserveUseListOrder,
- uint64_t BitcodeStartBit, bool EmitFunctionSummary) {
+void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
+ // Emit the module's hash.
+ // MODULE_CODE_HASH: [5*i32]
+ SHA1 Hasher;
+ Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
+ Buffer.size() - BlockStartPos));
+ auto Hash = Hasher.result();
+ SmallVector<uint64_t, 20> Vals;
+ auto LShift = [&](unsigned char Val, unsigned Amount)
+ -> uint64_t { return ((uint64_t)Val) << Amount; };
+ for (int Pos = 0; Pos < 20; Pos += 4) {
+ uint32_t SubHash = LShift(Hash[Pos + 0], 24);
+ SubHash |= LShift(Hash[Pos + 1], 16) | LShift(Hash[Pos + 2], 8) |
+ (unsigned)(unsigned char)Hash[Pos + 3];
+ Vals.push_back(SubHash);
+ }
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+}
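+// For example, a digest whose first four bytes are 0xDE 0xAD 0xBE 0xEF is
+// emitted as the 32-bit word 0xDEADBEEF; the remaining 16 bytes are packed
+// into four more words the same way.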
+
+void BitcodeWriter::write() {
+ // Emit the file header first.
+ writeBitcodeHeader();
+
+ writeBlocks();
+}
+
+void ModuleBitcodeWriter::writeBlocks() {
+ writeIdentificationBlock();
+ writeModule();
+}
+
+void IndexBitcodeWriter::writeBlocks() {
+ // Index contains only a single outer (module) block.
+ writeIndex();
+}
+
+void ModuleBitcodeWriter::writeModule() {
Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
+ size_t BlockStartPos = Buffer.size();
SmallVector<unsigned, 1> Vals;
unsigned CurVersion = 1;
Vals.push_back(CurVersion);
Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
- // Analyze the module, enumerating globals, functions, etc.
- ValueEnumerator VE(*M, ShouldPreserveUseListOrder);
-
// Emit blockinfo, which defines the standard abbreviations etc.
- WriteBlockInfo(VE, Stream);
+ writeBlockInfo();
// Emit information about attribute groups.
- WriteAttributeGroupTable(VE, Stream);
+ writeAttributeGroupTable();
// Emit information about parameter attributes.
- WriteAttributeTable(VE, Stream);
+ writeAttributeTable();
// Emit information describing all of the types in the module.
- WriteTypeTable(VE, Stream);
+ writeTypeTable();
- writeComdats(VE, Stream);
+ writeComdats();
// Emit top-level description of module, including target triple, inline asm,
// descriptors for global variables, and function prototype info.
- uint64_t VSTOffsetPlaceholder = WriteModuleInfo(M, VE, Stream);
+ writeModuleInfo();
// Emit constants.
- WriteModuleConstants(VE, Stream);
+ writeModuleConstants();
- // Emit metadata.
- WriteModuleMetadata(M, VE, Stream);
+ // Emit metadata kind names.
+ writeModuleMetadataKinds();
// Emit metadata.
- WriteModuleMetadataStore(M, Stream);
+ writeModuleMetadata();
// Emit module-level use-lists.
if (VE.shouldPreserveUseListOrder())
- WriteUseListBlock(nullptr, VE, Stream);
+ writeUseListBlock(nullptr);
- WriteOperandBundleTags(M, Stream);
+ writeOperandBundleTags();
// Emit function bodies.
- DenseMap<const Function *, std::unique_ptr<FunctionInfo>> FunctionIndex;
- for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F)
+ DenseMap<const Function *, uint64_t> FunctionToBitcodeIndex;
+ for (Module::const_iterator F = M.begin(), E = M.end(); F != E; ++F)
if (!F->isDeclaration())
- WriteFunction(*F, VE, Stream, FunctionIndex, EmitFunctionSummary);
+ writeFunction(*F, FunctionToBitcodeIndex);
// Need to write after the above call to WriteFunction which populates
// the summary information in the index.
- if (EmitFunctionSummary)
- WritePerModuleFunctionSummary(FunctionIndex, M, VE, Stream);
+ if (Index)
+ writePerModuleGlobalValueSummary();
- WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream,
- VSTOffsetPlaceholder, BitcodeStartBit, &FunctionIndex);
+ writeValueSymbolTable(M.getValueSymbolTable(),
+ /* IsModuleLevel */ true, &FunctionToBitcodeIndex);
+
+ if (GenerateHash) {
+ writeModuleHash(BlockStartPos);
+ }
Stream.ExitBlock();
}
-/// EmitDarwinBCHeader - If generating a bc file on darwin, we have to emit a
+static void writeInt32ToBuffer(uint32_t Value, SmallVectorImpl<char> &Buffer,
+ uint32_t &Position) {
+ support::endian::write32le(&Buffer[Position], Value);
+ Position += 4;
+}
+
+/// If generating a bc file on darwin, we have to emit a
/// header and trailer to make it compatible with the system archiver. To do
/// this we emit the following header, and then emit a trailer that pads the
/// file out to be a multiple of 16 bytes.
@@ -2966,18 +3722,7 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream,
/// uint32_t CPUType; // CPU specifier.
/// ... potentially more later ...
/// };
-enum {
- DarwinBCSizeFieldOffset = 3*4, // Offset to bitcode_size.
- DarwinBCHeaderSize = 5*4
-};
-
-static void WriteInt32ToBuffer(uint32_t Value, SmallVectorImpl<char> &Buffer,
- uint32_t &Position) {
- support::endian::write32le(&Buffer[Position], Value);
- Position += 4;
-}
-
-static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
+static void emitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
const Triple &TT) {
unsigned CPUType = ~0U;
@@ -3005,18 +3750,18 @@ static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
CPUType = DARWIN_CPU_TYPE_ARM;
// Traditional Bitcode starts after header.
- assert(Buffer.size() >= DarwinBCHeaderSize &&
+ assert(Buffer.size() >= BWH_HeaderSize &&
"Expected header size to be reserved");
- unsigned BCOffset = DarwinBCHeaderSize;
- unsigned BCSize = Buffer.size()-DarwinBCHeaderSize;
+ unsigned BCOffset = BWH_HeaderSize;
+ unsigned BCSize = Buffer.size() - BWH_HeaderSize;
// Write the magic and version.
unsigned Position = 0;
- WriteInt32ToBuffer(0x0B17C0DE , Buffer, Position);
- WriteInt32ToBuffer(0 , Buffer, Position); // Version.
- WriteInt32ToBuffer(BCOffset , Buffer, Position);
- WriteInt32ToBuffer(BCSize , Buffer, Position);
- WriteInt32ToBuffer(CPUType , Buffer, Position);
+ writeInt32ToBuffer(0x0B17C0DE, Buffer, Position);
+ writeInt32ToBuffer(0, Buffer, Position); // Version.
+ writeInt32ToBuffer(BCOffset, Buffer, Position);
+ writeInt32ToBuffer(BCSize, Buffer, Position);
+ writeInt32ToBuffer(CPUType, Buffer, Position);
// If the file is not a multiple of 16 bytes, insert dummy padding.
while (Buffer.size() & 15)
@@ -3024,7 +3769,7 @@ static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
}
/// Helper to write the header common to all bitcode files.
-static void WriteBitcodeHeader(BitstreamWriter &Stream) {
+void BitcodeWriter::writeBitcodeHeader() {
// Emit the file header.
Stream.Emit((unsigned)'B', 8);
Stream.Emit((unsigned)'C', 8);
@@ -3038,55 +3783,30 @@ static void WriteBitcodeHeader(BitstreamWriter &Stream) {
/// stream.
void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
bool ShouldPreserveUseListOrder,
- bool EmitFunctionSummary) {
+ const ModuleSummaryIndex *Index,
+ bool GenerateHash) {
SmallVector<char, 0> Buffer;
Buffer.reserve(256*1024);
// If this is darwin or another generic macho target, reserve space for the
// header.
Triple TT(M->getTargetTriple());
- if (TT.isOSDarwin())
- Buffer.insert(Buffer.begin(), DarwinBCHeaderSize, 0);
+ if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
+ Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
// Emit the module into the buffer.
- {
- BitstreamWriter Stream(Buffer);
- // Save the start bit of the actual bitcode, in case there is space
- // saved at the start for the darwin header above. The reader stream
- // will start at the bitcode, and we need the offset of the VST
- // to line up.
- uint64_t BitcodeStartBit = Stream.GetCurrentBitNo();
-
- // Emit the file header.
- WriteBitcodeHeader(Stream);
-
- WriteIdentificationBlock(M, Stream);
-
- // Emit the module.
- WriteModule(M, Stream, ShouldPreserveUseListOrder, BitcodeStartBit,
- EmitFunctionSummary);
- }
+ ModuleBitcodeWriter ModuleWriter(M, Buffer, ShouldPreserveUseListOrder, Index,
+ GenerateHash);
+ ModuleWriter.write();
- if (TT.isOSDarwin())
- EmitDarwinBCHeaderAndTrailer(Buffer, TT);
+ if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
+ emitDarwinBCHeaderAndTrailer(Buffer, TT);
// Write the generated bitstream to "Out".
Out.write((char*)&Buffer.front(), Buffer.size());
}
-// Write the specified function summary index to the given raw output stream,
-// where it will be written in a new bitcode block. This is used when
-// writing the combined index file for ThinLTO.
-void llvm::WriteFunctionSummaryToFile(const FunctionInfoIndex &Index,
- raw_ostream &Out) {
- SmallVector<char, 0> Buffer;
- Buffer.reserve(256 * 1024);
-
- BitstreamWriter Stream(Buffer);
-
- // Emit the bitcode header.
- WriteBitcodeHeader(Stream);
-
+void IndexBitcodeWriter::writeIndex() {
Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
SmallVector<unsigned, 1> Vals;
@@ -3094,17 +3814,34 @@ void llvm::WriteFunctionSummaryToFile(const FunctionInfoIndex &Index,
Vals.push_back(CurVersion);
Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
+ // If we have a VST, write the VSTOFFSET record placeholder.
+ writeValueSymbolTableForwardDecl();
+
// Write the module paths in the combined index.
- WriteModStrings(Index, Stream);
+ writeModStrings();
- // Write the function summary combined index records.
- WriteCombinedFunctionSummary(Index, Stream);
+ // Write the summary combined index records.
+ writeCombinedGlobalValueSummary();
// Need a special VST writer for the combined index (we don't have a
// real VST and real values when this is invoked).
- WriteCombinedValueSymbolTable(Index, Stream);
+ writeCombinedValueSymbolTable();
Stream.ExitBlock();
+}
+
+// Write the specified module summary index to the given raw output stream,
+// where it will be written in a new bitcode block. This is used when
+// writing the combined index file for ThinLTO. When writing a subset of the
+// index for a distributed backend, provide a \p ModuleToSummariesForIndex map.
+void llvm::WriteIndexToFile(
+ const ModuleSummaryIndex &Index, raw_ostream &Out,
+ std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
+ SmallVector<char, 0> Buffer;
+ Buffer.reserve(256 * 1024);
+
+ IndexBitcodeWriter IndexWriter(Buffer, Index, ModuleToSummariesForIndex);
+ IndexWriter.write();
Out.write((char *)&Buffer.front(), Buffer.size());
}
diff --git a/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 24de99a34d33..3e89ade424a2 100644
--- a/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -12,14 +12,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;
-PreservedAnalyses BitcodeWriterPass::run(Module &M) {
- WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, EmitFunctionSummary);
+PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &) {
+ std::unique_ptr<ModuleSummaryIndex> Index;
+ if (EmitSummaryIndex)
+ Index = ModuleSummaryIndexBuilder(&M).takeIndex();
+ WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, Index.get(),
+ EmitModuleHash);
return PreservedAnalyses::all();
}
@@ -27,31 +32,52 @@ namespace {
class WriteBitcodePass : public ModulePass {
raw_ostream &OS; // raw_ostream to print on
bool ShouldPreserveUseListOrder;
- bool EmitFunctionSummary;
+ bool EmitSummaryIndex;
+ bool EmitModuleHash;
public:
static char ID; // Pass identification, replacement for typeid
+ WriteBitcodePass() : ModulePass(ID), OS(dbgs()) {
+ initializeWriteBitcodePassPass(*PassRegistry::getPassRegistry());
+ }
+
explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder,
- bool EmitFunctionSummary)
+ bool EmitSummaryIndex, bool EmitModuleHash)
: ModulePass(ID), OS(o),
ShouldPreserveUseListOrder(ShouldPreserveUseListOrder),
- EmitFunctionSummary(EmitFunctionSummary) {}
+ EmitSummaryIndex(EmitSummaryIndex), EmitModuleHash(EmitModuleHash) {
+ initializeWriteBitcodePassPass(*PassRegistry::getPassRegistry());
+ }
const char *getPassName() const override { return "Bitcode Writer"; }
bool runOnModule(Module &M) override {
- WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder,
- EmitFunctionSummary);
+ const ModuleSummaryIndex *Index =
+ EmitSummaryIndex
+ ? &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex())
+ : nullptr;
+ WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, Index,
+ EmitModuleHash);
return false;
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ if (EmitSummaryIndex)
+ AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ }
};
}
char WriteBitcodePass::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteBitcodePass, "write-bitcode", "Write Bitcode", false,
+ true)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_END(WriteBitcodePass, "write-bitcode", "Write Bitcode", false,
+ true)
ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str,
bool ShouldPreserveUseListOrder,
- bool EmitFunctionSummary) {
+ bool EmitSummaryIndex, bool EmitModuleHash) {
return new WriteBitcodePass(Str, ShouldPreserveUseListOrder,
- EmitFunctionSummary);
+ EmitSummaryIndex, EmitModuleHash);
}
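+// A minimal, hypothetical caller (PM and OS are assumed to be a legacy
+// PassManager and an open raw_ostream):
+//   PM.add(createBitcodeWriterPass(OS, /*ShouldPreserveUseListOrder=*/false,
+//                                  /*EmitSummaryIndex=*/true,
+//                                  /*EmitModuleHash=*/true));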
diff --git a/lib/Bitcode/Writer/LLVMBuild.txt b/lib/Bitcode/Writer/LLVMBuild.txt
index 7d9e1de771b9..a450b38fba2c 100644
--- a/lib/Bitcode/Writer/LLVMBuild.txt
+++ b/lib/Bitcode/Writer/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = BitWriter
parent = Bitcode
-required_libraries = Core Support
+required_libraries = Analysis Core Support
diff --git a/lib/Bitcode/Writer/Makefile b/lib/Bitcode/Writer/Makefile
deleted file mode 100644
index 7b0bd72159ad..000000000000
--- a/lib/Bitcode/Writer/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Bitcode/Reader/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMBitWriter
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index e07563b5a390..5d5bfab58b81 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -86,6 +86,9 @@ static OrderMap orderModule(const Module &M) {
for (const GlobalAlias &A : M.aliases())
if (!isa<GlobalValue>(A.getAliasee()))
orderValue(A.getAliasee(), OM);
+ for (const GlobalIFunc &I : M.ifuncs())
+ if (!isa<GlobalValue>(I.getResolver()))
+ orderValue(I.getResolver(), OM);
for (const Function &F : M) {
for (const Use &U : F.operands())
if (!isa<GlobalValue>(U.get()))
@@ -105,6 +108,8 @@ static OrderMap orderModule(const Module &M) {
orderValue(&F, OM);
for (const GlobalAlias &A : M.aliases())
orderValue(&A, OM);
+ for (const GlobalIFunc &I : M.ifuncs())
+ orderValue(&I, OM);
for (const GlobalVariable &G : M.globals())
orderValue(&G, OM);
OM.LastGlobalValueID = OM.size();
@@ -261,11 +266,15 @@ static UseListOrderStack predictUseListOrder(const Module &M) {
predictValueUseListOrder(&F, nullptr, OM, Stack);
for (const GlobalAlias &A : M.aliases())
predictValueUseListOrder(&A, nullptr, OM, Stack);
+ for (const GlobalIFunc &I : M.ifuncs())
+ predictValueUseListOrder(&I, nullptr, OM, Stack);
for (const GlobalVariable &G : M.globals())
if (G.hasInitializer())
predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
for (const GlobalAlias &A : M.aliases())
predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
+ for (const GlobalIFunc &I : M.ifuncs())
+ predictValueUseListOrder(I.getResolver(), nullptr, OM, Stack);
for (const Function &F : M) {
for (const Use &U : F.operands())
predictValueUseListOrder(U.get(), nullptr, OM, Stack);
@@ -280,8 +289,7 @@ static bool isIntOrIntVectorValue(const std::pair<const Value*, unsigned> &V) {
ValueEnumerator::ValueEnumerator(const Module &M,
bool ShouldPreserveUseListOrder)
- : HasMDString(false), HasDILocation(false), HasGenericDINode(false),
- ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
+ : ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
if (ShouldPreserveUseListOrder)
UseListOrders = predictUseListOrder(M);
@@ -299,6 +307,10 @@ ValueEnumerator::ValueEnumerator(const Module &M,
for (const GlobalAlias &GA : M.aliases())
EnumerateValue(&GA);
+ // Enumerate the ifuncs.
+ for (const GlobalIFunc &GIF : M.ifuncs())
+ EnumerateValue(&GIF);
+
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
@@ -311,6 +323,10 @@ ValueEnumerator::ValueEnumerator(const Module &M,
for (const GlobalAlias &GA : M.aliases())
EnumerateValue(GA.getAliasee());
+ // Enumerate the ifunc resolvers.
+ for (const GlobalIFunc &GIF : M.ifuncs())
+ EnumerateValue(GIF.getResolver());
+
// Enumerate any optional Function data.
for (const Function &F : M)
for (const Use &U : F.operands())
@@ -328,6 +344,15 @@ ValueEnumerator::ValueEnumerator(const Module &M,
EnumerateNamedMetadata(M);
SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ for (const GlobalVariable &GV : M.globals()) {
+ MDs.clear();
+ GV.getAllMetadata(MDs);
+ for (const auto &I : MDs)
+ // FIXME: Pass GV to EnumerateMetadata and arrange for the bitcode writer
+ // to write metadata to the global variable's own metadata block
+ // (PR28134).
+ EnumerateMetadata(nullptr, I.second);
+ }
// Enumerate types used by function bodies and argument lists.
for (const Function &F : M) {
@@ -335,9 +360,10 @@ ValueEnumerator::ValueEnumerator(const Module &M,
EnumerateType(A.getType());
// Enumerate metadata attached to this function.
+ MDs.clear();
F.getAllMetadata(MDs);
for (const auto &I : MDs)
- EnumerateMetadata(I.second);
+ EnumerateMetadata(F.isDeclaration() ? nullptr : &F, I.second);
for (const BasicBlock &BB : F)
for (const Instruction &I : BB) {
@@ -352,7 +378,7 @@ ValueEnumerator::ValueEnumerator(const Module &M,
if (isa<LocalAsMetadata>(MD->getMetadata()))
continue;
- EnumerateMetadata(MD->getMetadata());
+ EnumerateMetadata(&F, MD->getMetadata());
}
EnumerateType(I.getType());
if (const CallInst *CI = dyn_cast<CallInst>(&I))
@@ -364,17 +390,21 @@ ValueEnumerator::ValueEnumerator(const Module &M,
MDs.clear();
I.getAllMetadataOtherThanDebugLoc(MDs);
for (unsigned i = 0, e = MDs.size(); i != e; ++i)
- EnumerateMetadata(MDs[i].second);
+ EnumerateMetadata(&F, MDs[i].second);
// Don't enumerate the location directly -- it has a special record
// type -- but enumerate its operands.
if (DILocation *L = I.getDebugLoc())
- EnumerateMDNodeOperands(L);
+ for (const Metadata *Op : L->operands())
+ EnumerateMetadata(&F, Op);
}
}
// Optimize constant ordering.
OptimizeConstants(FirstConstant, Values.size());
+
+ // Organize metadata ordering.
+ organizeMetadata();
}
unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const {
@@ -402,7 +432,7 @@ unsigned ValueEnumerator::getValueID(const Value *V) const {
return I->second-1;
}
-void ValueEnumerator::dump() const {
+LLVM_DUMP_METHOD void ValueEnumerator::dump() const {
print(dbgs(), ValueMap, "Default");
dbgs() << '\n';
print(dbgs(), MetadataMap, "MetaData");
@@ -445,8 +475,10 @@ void ValueEnumerator::print(raw_ostream &OS, const MetadataMapType &Map,
OS << "Size: " << Map.size() << "\n";
for (auto I = Map.begin(), E = Map.end(); I != E; ++I) {
const Metadata *MD = I->first;
- OS << "Metadata: slot = " << I->second << "\n";
+ OS << "Metadata: slot = " << I->second.ID << "\n";
+ OS << "Metadata: function = " << I->second.F << "\n";
MD->print(OS);
+ OS << "\n";
}
}
@@ -472,8 +504,8 @@ void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
// Ensure that integer and vector of integer constants are at the start of the
// constant pool. This is important so that GEP structure indices come before
// gep constant exprs.
- std::partition(Values.begin()+CstStart, Values.begin()+CstEnd,
- isIntOrIntVectorValue);
+ std::stable_partition(Values.begin() + CstStart, Values.begin() + CstEnd,
+ isIntOrIntVectorValue);
// Rebuild the modified portion of ValueMap.
for (; CstStart != CstEnd; ++CstStart)
@@ -498,65 +530,244 @@ void ValueEnumerator::EnumerateNamedMetadata(const Module &M) {
void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) {
for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i)
- EnumerateMetadata(MD->getOperand(i));
+ EnumerateMetadata(nullptr, MD->getOperand(i));
+}
+
+unsigned ValueEnumerator::getMetadataFunctionID(const Function *F) const {
+ return F ? getValueID(F) + 1 : 0;
+}
+
+void ValueEnumerator::EnumerateMetadata(const Function *F, const Metadata *MD) {
+ EnumerateMetadata(getMetadataFunctionID(F), MD);
+}
+
+void ValueEnumerator::EnumerateFunctionLocalMetadata(
+ const Function &F, const LocalAsMetadata *Local) {
+ EnumerateFunctionLocalMetadata(getMetadataFunctionID(&F), Local);
+}
+
+void ValueEnumerator::dropFunctionFromMetadata(
+ MetadataMapType::value_type &FirstMD) {
+ SmallVector<const MDNode *, 64> Worklist;
+ auto push = [this, &Worklist](MetadataMapType::value_type &MD) {
+ auto &Entry = MD.second;
+
+ // Nothing to do if this metadata isn't tagged.
+ if (!Entry.F)
+ return;
+
+ // Drop the function tag.
+ Entry.F = 0;
+
+    // If this has an ID and is an MDNode, then its operands have entries as
+ // well. We need to drop the function from them too.
+ if (Entry.ID)
+ if (auto *N = dyn_cast<MDNode>(MD.first))
+ Worklist.push_back(N);
+ };
+ push(FirstMD);
+ while (!Worklist.empty())
+ for (const Metadata *Op : Worklist.pop_back_val()->operands()) {
+ if (!Op)
+ continue;
+ auto MD = MetadataMap.find(Op);
+ if (MD != MetadataMap.end())
+ push(*MD);
+ }
}
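+// For example, a node first reached while enumerating one function and later
+// reached from another function (or from module-level metadata) has its
+// function tag cleared, along with the tags of its transitive MDNode operands,
+// so the whole subgraph ends up in the module-level metadata block.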
-/// EnumerateMDNodeOperands - Enumerate all non-function-local values
-/// and types referenced by the given MDNode.
-void ValueEnumerator::EnumerateMDNodeOperands(const MDNode *N) {
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- Metadata *MD = N->getOperand(i);
- if (!MD)
+void ValueEnumerator::EnumerateMetadata(unsigned F, const Metadata *MD) {
+ // It's vital for reader efficiency that uniqued subgraphs are done in
+ // post-order; it's expensive when their operands have forward references.
+ // If a distinct node is referenced from a uniqued node, it'll be delayed
+ // until the uniqued subgraph has been completely traversed.
+ SmallVector<const MDNode *, 32> DelayedDistinctNodes;
+
+ // Start by enumerating MD, and then work through its transitive operands in
+ // post-order. This requires a depth-first search.
+ SmallVector<std::pair<const MDNode *, MDNode::op_iterator>, 32> Worklist;
+ if (const MDNode *N = enumerateMetadataImpl(F, MD))
+ Worklist.push_back(std::make_pair(N, N->op_begin()));
+
+ while (!Worklist.empty()) {
+ const MDNode *N = Worklist.back().first;
+
+ // Enumerate operands until we hit a new node. We need to traverse these
+ // nodes' operands before visiting the rest of N's operands.
+ MDNode::op_iterator I = std::find_if(
+ Worklist.back().second, N->op_end(),
+ [&](const Metadata *MD) { return enumerateMetadataImpl(F, MD); });
+ if (I != N->op_end()) {
+ auto *Op = cast<MDNode>(*I);
+ Worklist.back().second = ++I;
+
+ // Delay traversing Op if it's a distinct node and N is uniqued.
+ if (Op->isDistinct() && !N->isDistinct())
+ DelayedDistinctNodes.push_back(Op);
+ else
+ Worklist.push_back(std::make_pair(Op, Op->op_begin()));
continue;
- assert(!isa<LocalAsMetadata>(MD) && "MDNodes cannot be function-local");
- EnumerateMetadata(MD);
+ }
+
+ // All the operands have been visited. Now assign an ID.
+ Worklist.pop_back();
+ MDs.push_back(N);
+ MetadataMap[N].ID = MDs.size();
+
+ // Flush out any delayed distinct nodes; these are all the distinct nodes
+    // that are leaves in the last uniqued subgraph.
+ if (Worklist.empty() || Worklist.back().first->isDistinct()) {
+ for (const MDNode *N : DelayedDistinctNodes)
+ Worklist.push_back(std::make_pair(N, N->op_begin()));
+ DelayedDistinctNodes.clear();
+ }
}
}
-void ValueEnumerator::EnumerateMetadata(const Metadata *MD) {
+const MDNode *ValueEnumerator::enumerateMetadataImpl(unsigned F, const Metadata *MD) {
+ if (!MD)
+ return nullptr;
+
assert(
(isa<MDNode>(MD) || isa<MDString>(MD) || isa<ConstantAsMetadata>(MD)) &&
"Invalid metadata kind");
- // Insert a dummy ID to block the co-recursive call to
- // EnumerateMDNodeOperands() from re-visiting MD in a cyclic graph.
- //
- // Return early if there's already an ID.
- if (!MetadataMap.insert(std::make_pair(MD, 0)).second)
- return;
+ auto Insertion = MetadataMap.insert(std::make_pair(MD, MDIndex(F)));
+ MDIndex &Entry = Insertion.first->second;
+ if (!Insertion.second) {
+ // Already mapped. If F doesn't match the function tag, drop it.
+ if (Entry.hasDifferentFunction(F))
+ dropFunctionFromMetadata(*Insertion.first);
+ return nullptr;
+ }
- // Visit operands first to minimize RAUW.
+ // Don't assign IDs to metadata nodes.
if (auto *N = dyn_cast<MDNode>(MD))
- EnumerateMDNodeOperands(N);
- else if (auto *C = dyn_cast<ConstantAsMetadata>(MD))
- EnumerateValue(C->getValue());
+ return N;
- HasMDString |= isa<MDString>(MD);
- HasDILocation |= isa<DILocation>(MD);
- HasGenericDINode |= isa<GenericDINode>(MD);
-
- // Replace the dummy ID inserted above with the correct one. MetadataMap may
- // have changed by inserting operands, so we need a fresh lookup here.
+ // Save the metadata.
MDs.push_back(MD);
- MetadataMap[MD] = MDs.size();
+ Entry.ID = MDs.size();
+
+ // Enumerate the constant, if any.
+ if (auto *C = dyn_cast<ConstantAsMetadata>(MD))
+ EnumerateValue(C->getValue());
+
+ return nullptr;
}
/// EnumerateFunctionLocalMetadata - Incorporate function-local metadata
/// information reachable from the metadata.
void ValueEnumerator::EnumerateFunctionLocalMetadata(
- const LocalAsMetadata *Local) {
+ unsigned F, const LocalAsMetadata *Local) {
+ assert(F && "Expected a function");
+
// Check to see if it's already in!
- unsigned &MetadataID = MetadataMap[Local];
- if (MetadataID)
+ MDIndex &Index = MetadataMap[Local];
+ if (Index.ID) {
+ assert(Index.F == F && "Expected the same function");
return;
+ }
MDs.push_back(Local);
- MetadataID = MDs.size();
+ Index.F = F;
+ Index.ID = MDs.size();
EnumerateValue(Local->getValue());
+}
+
+static unsigned getMetadataTypeOrder(const Metadata *MD) {
+ // Strings are emitted in bulk and must come first.
+ if (isa<MDString>(MD))
+ return 0;
+
+ // ConstantAsMetadata doesn't reference anything. We may as well shuffle it
+ // to the front since we can detect it.
+ auto *N = dyn_cast<MDNode>(MD);
+ if (!N)
+ return 1;
+
+  // The reader is fast at forward references to distinct node operands, but
+  // slow when uniqued operands are unresolved.
+ return N->isDistinct() ? 2 : 3;
+}
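+// In effect, organizeMetadata() orders each function partition as: MDStrings
+// first, then ConstantAsMetadata and other non-node metadata, then distinct
+// nodes, then uniqued nodes.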
+
+void ValueEnumerator::organizeMetadata() {
+ assert(MetadataMap.size() == MDs.size() &&
+ "Metadata map and vector out of sync");
+
+ if (MDs.empty())
+ return;
+
+ // Copy out the index information from MetadataMap in order to choose a new
+ // order.
+ SmallVector<MDIndex, 64> Order;
+ Order.reserve(MetadataMap.size());
+ for (const Metadata *MD : MDs)
+ Order.push_back(MetadataMap.lookup(MD));
+
+ // Partition:
+ // - by function, then
+ // - by isa<MDString>
+ // and then sort by the original/current ID. Since the IDs are guaranteed to
+ // be unique, the result of std::sort will be deterministic. There's no need
+ // for std::stable_sort.
+ std::sort(Order.begin(), Order.end(), [this](MDIndex LHS, MDIndex RHS) {
+ return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
+ std::make_tuple(RHS.F, getMetadataTypeOrder(RHS.get(MDs)), RHS.ID);
+ });
+
+ // Rebuild MDs, index the metadata ranges for each function in FunctionMDs,
+ // and fix up MetadataMap.
+ std::vector<const Metadata *> OldMDs = std::move(MDs);
+ MDs.reserve(OldMDs.size());
+ for (unsigned I = 0, E = Order.size(); I != E && !Order[I].F; ++I) {
+ auto *MD = Order[I].get(OldMDs);
+ MDs.push_back(MD);
+ MetadataMap[MD].ID = I + 1;
+ if (isa<MDString>(MD))
+ ++NumMDStrings;
+ }
- // Also, collect all function-local metadata for easy access.
- FunctionLocalMDs.push_back(Local);
+ // Return early if there's nothing for the functions.
+ if (MDs.size() == Order.size())
+ return;
+
+ // Build the function metadata ranges.
+ MDRange R;
+ FunctionMDs.reserve(OldMDs.size());
+ unsigned PrevF = 0;
+ for (unsigned I = MDs.size(), E = Order.size(), ID = MDs.size(); I != E;
+ ++I) {
+ unsigned F = Order[I].F;
+ if (!PrevF) {
+ PrevF = F;
+ } else if (PrevF != F) {
+ R.Last = FunctionMDs.size();
+ std::swap(R, FunctionMDInfo[PrevF]);
+ R.First = FunctionMDs.size();
+
+ ID = MDs.size();
+ PrevF = F;
+ }
+
+ auto *MD = Order[I].get(OldMDs);
+ FunctionMDs.push_back(MD);
+ MetadataMap[MD].ID = ++ID;
+ if (isa<MDString>(MD))
+ ++R.NumStrings;
+ }
+ R.Last = FunctionMDs.size();
+ FunctionMDInfo[PrevF] = R;
+}
+
+void ValueEnumerator::incorporateFunctionMetadata(const Function &F) {
+ NumModuleMDs = MDs.size();
+
+ auto R = FunctionMDInfo.lookup(getValueID(&F) + 1);
+ NumMDStrings = R.NumStrings;
+ MDs.insert(MDs.end(), FunctionMDs.begin() + R.First,
+ FunctionMDs.begin() + R.Last);
}
void ValueEnumerator::EnumerateValue(const Value *V) {
@@ -650,13 +861,7 @@ void ValueEnumerator::EnumerateType(Type *Ty) {
void ValueEnumerator::EnumerateOperandType(const Value *V) {
EnumerateType(V->getType());
- if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
- assert(!isa<LocalAsMetadata>(MD->getMetadata()) &&
- "Function-local metadata should be left for later");
-
- EnumerateMetadata(MD->getMetadata());
- return;
- }
+ assert(!isa<MetadataAsValue>(V) && "Unexpected metadata operand");
const Constant *C = dyn_cast<Constant>(V);
if (!C)
@@ -704,7 +909,10 @@ void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
void ValueEnumerator::incorporateFunction(const Function &F) {
InstructionCount = 0;
NumModuleValues = Values.size();
- NumModuleMDs = MDs.size();
+
+ // Add global metadata to the function block. This doesn't include
+ // LocalAsMetadata.
+ incorporateFunctionMetadata(F);
// Adding function arguments to the value table.
for (const auto &I : F.args())
@@ -749,8 +957,13 @@ void ValueEnumerator::incorporateFunction(const Function &F) {
}
// Add all of the function-local metadata.
- for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i)
- EnumerateFunctionLocalMetadata(FnLocalMDVector[i]);
+ for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i) {
+    // At this point, every local value has been incorporated; we shouldn't
+ // have a metadata operand that references a value that hasn't been seen.
+ assert(ValueMap.count(FnLocalMDVector[i]->getValue()) &&
+ "Missing value for metadata operand");
+ EnumerateFunctionLocalMetadata(F, FnLocalMDVector[i]);
+ }
}
void ValueEnumerator::purgeFunction() {
@@ -765,7 +978,7 @@ void ValueEnumerator::purgeFunction() {
Values.resize(NumModuleValues);
MDs.resize(NumModuleMDs);
BasicBlocks.clear();
- FunctionLocalMDs.clear();
+ NumMDStrings = 0;
}
static void IncorporateFunctionInfoGlobalBBIDs(const Function *F,
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 9fb8325150e9..bff2de70b3ec 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -15,9 +15,10 @@
#define LLVM_LIB_BITCODE_WRITER_VALUEENUMERATOR_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/UseListOrder.h"
#include <vector>
@@ -33,6 +34,7 @@ class Module;
class Metadata;
class LocalAsMetadata;
class MDNode;
+class MDOperand;
class NamedMDNode;
class AttributeSet;
class ValueSymbolTable;
@@ -61,12 +63,43 @@ private:
ComdatSetType Comdats;
std::vector<const Metadata *> MDs;
- SmallVector<const LocalAsMetadata *, 8> FunctionLocalMDs;
- typedef DenseMap<const Metadata *, unsigned> MetadataMapType;
+ std::vector<const Metadata *> FunctionMDs;
+
+ /// Index of information about a piece of metadata.
+ struct MDIndex {
+ unsigned F = 0; ///< The ID of the function for this metadata, if any.
+ unsigned ID = 0; ///< The implicit ID of this metadata in bitcode.
+
+ MDIndex() = default;
+ explicit MDIndex(unsigned F) : F(F) {}
+
+    /// Check whether this has a function tag that differs from NewF.
+ bool hasDifferentFunction(unsigned NewF) const { return F && F != NewF; }
+
+ /// Fetch the MD this references out of the given metadata array.
+ const Metadata *get(ArrayRef<const Metadata *> MDs) const {
+ assert(ID && "Expected non-zero ID");
+ assert(ID <= MDs.size() && "Expected valid ID");
+ return MDs[ID - 1];
+ }
+ };
+
+ typedef DenseMap<const Metadata *, MDIndex> MetadataMapType;
MetadataMapType MetadataMap;
- bool HasMDString;
- bool HasDILocation;
- bool HasGenericDINode;
+
+ /// Range of metadata IDs, as a half-open range.
+ struct MDRange {
+ unsigned First = 0;
+ unsigned Last = 0;
+
+ /// Number of strings in the prefix of the metadata range.
+ unsigned NumStrings = 0;
+
+ MDRange() = default;
+ explicit MDRange(unsigned First) : First(First) {}
+ };
+ SmallDenseMap<unsigned, MDRange, 1> FunctionMDInfo;
+
bool ShouldPreserveUseListOrder;
typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
@@ -95,7 +128,8 @@ private:
/// When a function is incorporated, this is the size of the Metadatas list
/// before incorporation.
- unsigned NumModuleMDs;
+ unsigned NumModuleMDs = 0;
+ unsigned NumMDStrings = 0;
unsigned FirstFuncConstantID;
unsigned FirstInstID;
@@ -117,14 +151,10 @@ public:
return ID - 1;
}
unsigned getMetadataOrNullID(const Metadata *MD) const {
- return MetadataMap.lookup(MD);
+ return MetadataMap.lookup(MD).ID;
}
unsigned numMDs() const { return MDs.size(); }
- bool hasMDString() const { return HasMDString; }
- bool hasDILocation() const { return HasDILocation; }
- bool hasGenericDINode() const { return HasGenericDINode; }
-
bool shouldPreserveUseListOrder() const { return ShouldPreserveUseListOrder; }
unsigned getTypeID(Type *T) const {
@@ -158,10 +188,20 @@ public:
}
const ValueList &getValues() const { return Values; }
- const std::vector<const Metadata *> &getMDs() const { return MDs; }
- const SmallVectorImpl<const LocalAsMetadata *> &getFunctionLocalMDs() const {
- return FunctionLocalMDs;
+
+ /// Check whether the current block has any metadata to emit.
+ bool hasMDs() const { return NumModuleMDs < MDs.size(); }
+
+ /// Get the MDString metadata for this block.
+ ArrayRef<const Metadata *> getMDStrings() const {
+ return makeArrayRef(MDs).slice(NumModuleMDs, NumMDStrings);
+ }
+
+ /// Get the non-MDString metadata for this block.
+ ArrayRef<const Metadata *> getNonMDStrings() const {
+ return makeArrayRef(MDs).slice(NumModuleMDs).slice(NumMDStrings);
}
+
const TypeList &getTypes() const { return Types; }
const std::vector<const BasicBlock*> &getBasicBlocks() const {
return BasicBlocks;
@@ -191,9 +231,54 @@ public:
private:
void OptimizeConstants(unsigned CstStart, unsigned CstEnd);
- void EnumerateMDNodeOperands(const MDNode *N);
- void EnumerateMetadata(const Metadata *MD);
- void EnumerateFunctionLocalMetadata(const LocalAsMetadata *Local);
+ /// Reorder the reachable metadata.
+ ///
+ /// This is not just an optimization, but is mandatory for emitting MDString
+ /// correctly.
+ void organizeMetadata();
+
+ /// Drop the function tag from the transitive operands of the given node.
+ void dropFunctionFromMetadata(MetadataMapType::value_type &FirstMD);
+
+ /// Incorporate the function metadata.
+ ///
+ /// This should be called before enumerating LocalAsMetadata for the
+ /// function.
+ void incorporateFunctionMetadata(const Function &F);
+
+ /// Enumerate a single instance of metadata with the given function tag.
+ ///
+ /// If \c MD has already been enumerated, check that \c F matches its
+ /// function tag. If not, call \a dropFunctionFromMetadata().
+ ///
+ /// Otherwise, mark \c MD as visited. Assign it an ID, or just return it if
+ /// it's an \a MDNode.
+ const MDNode *enumerateMetadataImpl(unsigned F, const Metadata *MD);
+
+ unsigned getMetadataFunctionID(const Function *F) const;
+
+ /// Enumerate reachable metadata in (almost) post-order.
+ ///
+ /// Enumerate all the metadata reachable from MD. We want to minimize the
+ /// cost of reading bitcode records, and so the primary consideration is that
+ /// operands of uniqued nodes are resolved before the nodes are read. This
+ /// avoids re-uniquing them on the context and factors away RAUW support.
+ ///
+ /// This algorithm guarantees that subgraphs of uniqued nodes are in
+ /// post-order. Distinct subgraphs reachable only from a single uniqued node
+ /// will be in post-order.
+ ///
+ /// \note The relative order of a distinct and uniqued node is irrelevant.
+ /// \a organizeMetadata() will later partition distinct nodes ahead of
+ /// uniqued ones.
+ ///{
+ void EnumerateMetadata(const Function *F, const Metadata *MD);
+ void EnumerateMetadata(unsigned F, const Metadata *MD);
+ ///}
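+  // For illustration: given uniqued nodes !1, !2 and !3 = !{!1, !2}, the
+  // operands !1 and !2 are assigned IDs before !3; a distinct operand of !3
+  // would instead be delayed and numbered after !3, which the reader resolves
+  // cheaply.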
+
+ void EnumerateFunctionLocalMetadata(const Function &F,
+ const LocalAsMetadata *Local);
+ void EnumerateFunctionLocalMetadata(unsigned F, const LocalAsMetadata *Local);
void EnumerateNamedMDNode(const NamedMDNode *NMD);
void EnumerateValue(const Value *V);
void EnumerateType(Type *T);
diff --git a/lib/Bitcode/module.modulemap b/lib/Bitcode/module.modulemap
deleted file mode 100644
index 7df1a0a3c721..000000000000
--- a/lib/Bitcode/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module Bitcode { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d00c10f5802f..9449421ef7af 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(Analysis)
add_subdirectory(LTO)
add_subdirectory(MC)
add_subdirectory(Object)
+add_subdirectory(ObjectYAML)
add_subdirectory(Option)
add_subdirectory(DebugInfo)
add_subdirectory(ExecutionEngine)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 4060db74a9b7..a736884be672 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -180,7 +180,7 @@ void AggressiveAntiDepBreaker::FinishBlock() {
State = nullptr;
}
-void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
unsigned InsertPosIndex) {
assert(Count < InsertPosIndex && "Instruction index out of expected range!");
@@ -190,7 +190,7 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
ScanInstruction(MI, Count);
DEBUG(dbgs() << "Observe: ");
- DEBUG(MI->dump());
+ DEBUG(MI.dump());
DEBUG(dbgs() << "\tRegs:");
std::vector<unsigned> &DefIndices = State->GetDefIndices();
@@ -214,9 +214,8 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
DEBUG(dbgs() << '\n');
}
-bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr *MI,
- MachineOperand& MO)
-{
+bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI,
+ MachineOperand &MO) {
if (!MO.isReg() || !MO.isImplicit())
return false;
@@ -226,19 +225,19 @@ bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr *MI,
MachineOperand *Op = nullptr;
if (MO.isDef())
- Op = MI->findRegisterUseOperand(Reg, true);
+ Op = MI.findRegisterUseOperand(Reg, true);
else
- Op = MI->findRegisterDefOperand(Reg);
+ Op = MI.findRegisterDefOperand(Reg);
return(Op && Op->isImplicit());
}
-void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI,
- std::set<unsigned>& PassthruRegs) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+void AggressiveAntiDepBreaker::GetPassthruRegs(
+ MachineInstr &MI, std::set<unsigned> &PassthruRegs) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- if ((MO.isDef() && MI->isRegTiedToUseOperand(i)) ||
+ if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) ||
IsImplicitDefUse(MI, MO)) {
const unsigned Reg = MO.getReg();
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
@@ -313,28 +312,30 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
DEBUG(if (header) {
dbgs() << header << TRI->getName(Reg); header = nullptr; });
DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
- }
- // Repeat for subregisters.
- for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
- unsigned SubregReg = *SubRegs;
- if (!State->IsLive(SubregReg)) {
- KillIndices[SubregReg] = KillIdx;
- DefIndices[SubregReg] = ~0u;
- RegRefs.erase(SubregReg);
- State->LeaveGroup(SubregReg);
- DEBUG(if (header) {
- dbgs() << header << TRI->getName(Reg); header = nullptr; });
- DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" <<
- State->GetGroup(SubregReg) << tag);
+ // Repeat for subregisters. Note that we only do this if the superregister
+ // was not live because otherwise, regardless of whether we have an explicit
+ // use of the subregister, the subregister's contents are needed for the
+ // uses of the superregister.
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubregReg = *SubRegs;
+ if (!State->IsLive(SubregReg)) {
+ KillIndices[SubregReg] = KillIdx;
+ DefIndices[SubregReg] = ~0u;
+ RegRefs.erase(SubregReg);
+ State->LeaveGroup(SubregReg);
+ DEBUG(if (header) {
+ dbgs() << header << TRI->getName(Reg); header = nullptr; });
+ DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" <<
+ State->GetGroup(SubregReg) << tag);
+ }
}
}
DEBUG(if (!header && footer) dbgs() << footer);
}
-void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
- unsigned Count,
- std::set<unsigned>& PassthruRegs) {
+void AggressiveAntiDepBreaker::PrescanInstruction(
+ MachineInstr &MI, unsigned Count, std::set<unsigned> &PassthruRegs) {
std::vector<unsigned> &DefIndices = State->GetDefIndices();
std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
RegRefs = State->GetRegRefs();
@@ -344,8 +345,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// dead, or because only a subregister is live at the def. If we
// don't do this the dead def will be incorrectly merged into the
// previous def.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
@@ -354,8 +355,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
}
DEBUG(dbgs() << "\tDef Groups:");
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
@@ -367,8 +368,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// defined in a call must not be changed (ABI). Inline assembly may
// reference either system calls or the register directly. Skip it until we
// can tell user specified registers from compiler-specified.
- if (MI->isCall() || MI->hasExtraDefRegAllocReq() ||
- TII->isPredicated(MI) || MI->isInlineAsm()) {
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI) ||
+ MI.isInlineAsm()) {
DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
State->UnionGroups(Reg, 0);
}
@@ -386,8 +387,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// Note register reference...
const TargetRegisterClass *RC = nullptr;
- if (i < MI->getDesc().getNumOperands())
- RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
RegRefs.insert(std::make_pair(Reg, RR));
}
@@ -396,13 +397,13 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
// Scan the register defs for this instruction and update
// live-ranges.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
// Ignore KILLs and passthru registers for liveness...
- if (MI->isKill() || (PassthruRegs.count(Reg) != 0))
+ if (MI.isKill() || (PassthruRegs.count(Reg) != 0))
continue;
// Update def for Reg and aliases.
@@ -421,7 +422,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
}
}
-void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
+void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
unsigned Count) {
DEBUG(dbgs() << "\tUse Groups:");
std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
@@ -444,14 +445,13 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
// instruction which may not be executed. The second R6 def may or may not
// re-define R6 so it's not safe to change it since the last R6 use cannot be
// changed.
- bool Special = MI->isCall() ||
- MI->hasExtraSrcRegAllocReq() ||
- TII->isPredicated(MI) || MI->isInlineAsm();
+ bool Special = MI.isCall() || MI.hasExtraSrcRegAllocReq() ||
+ TII->isPredicated(MI) || MI.isInlineAsm();
// Scan the register uses for this instruction and update
// live-ranges, groups and RegRefs.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
@@ -471,8 +471,8 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
// Note register reference...
const TargetRegisterClass *RC = nullptr;
- if (i < MI->getDesc().getNumOperands())
- RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
RegRefs.insert(std::make_pair(Reg, RR));
}
@@ -481,12 +481,12 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
// Form a group of all defs and uses of a KILL instruction to ensure
// that all registers are renamed as a group.
- if (MI->isKill()) {
+ if (MI.isKill()) {
DEBUG(dbgs() << "\tKill Group:");
unsigned FirstReg = 0;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
@@ -563,13 +563,16 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
if (RegRefs.count(Reg) > 0) {
DEBUG(dbgs() << "\t\t" << TRI->getName(Reg) << ":");
- BitVector BV = GetRenameRegisters(Reg);
- RenameRegisterMap.insert(std::pair<unsigned, BitVector>(Reg, BV));
+ BitVector &BV = RenameRegisterMap[Reg];
+ assert(BV.empty());
+ BV = GetRenameRegisters(Reg);
- DEBUG(dbgs() << " ::");
- DEBUG(for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
- dbgs() << " " << TRI->getName(r));
- DEBUG(dbgs() << "\n");
+ DEBUG({
+ dbgs() << " ::";
+ for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+ dbgs() << " " << TRI->getName(r);
+ dbgs() << "\n";
+ });
}
}
@@ -650,8 +653,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
DEBUG(dbgs() << " " << TRI->getName(NewReg));
// Check if Reg can be renamed to NewReg.
- BitVector BV = RenameRegisterMap[Reg];
- if (!BV.test(NewReg)) {
+ if (!RenameRegisterMap[Reg].test(NewReg)) {
DEBUG(dbgs() << "(no rename)");
goto next_super_reg;
}
@@ -785,6 +787,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
DEBUG(dbgs() << '\n');
#endif
+ BitVector RegAliases(TRI->getNumRegs());
+
// Attempt to break anti-dependence edges. Walk the instructions
// from the bottom up, tracking information about liveness as we go
// to help determine which registers are available.
@@ -792,13 +796,13 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
unsigned Count = InsertPosIndex - 1;
for (MachineBasicBlock::iterator I = End, E = Begin;
I != E; --Count) {
- MachineInstr *MI = --I;
+ MachineInstr &MI = *--I;
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
continue;
DEBUG(dbgs() << "Anti: ");
- DEBUG(MI->dump());
+ DEBUG(MI.dump());
std::set<unsigned> PassthruRegs;
GetPassthruRegs(MI, PassthruRegs);
@@ -809,13 +813,13 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
// The dependence edges that represent anti- and output-
// dependencies that are candidates for breaking.
std::vector<const SDep *> Edges;
- const SUnit *PathSU = MISUnitMap[MI];
+ const SUnit *PathSU = MISUnitMap[&MI];
AntiDepEdges(PathSU, Edges);
// If MI is not on the critical path, then we don't rename
// registers in the CriticalPathSet.
BitVector *ExcludeRegs = nullptr;
- if (MI == CriticalPathMI) {
+ if (&MI == CriticalPathMI) {
CriticalPathSU = CriticalPathStep(CriticalPathSU);
CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : nullptr;
} else if (CriticalPathSet.any()) {
@@ -824,7 +828,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
// Ignore KILL instructions (they form a group in ScanInstruction
// but don't cause any anti-dependence breaking themselves)
- if (!MI->isKill()) {
+ if (!MI.isKill()) {
// Attempt to break each anti-dependency...
for (unsigned i = 0, e = Edges.size(); i != e; ++i) {
const SDep *Edge = Edges[i];
@@ -854,7 +858,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
continue;
} else {
// No anti-dep breaking for implicit deps
- MachineOperand *AntiDepOp = MI->findRegisterDefOperand(AntiDepReg);
+ MachineOperand *AntiDepOp = MI.findRegisterDefOperand(AntiDepReg);
assert(AntiDepOp && "Can't find index for defined register operand");
if (!AntiDepOp || AntiDepOp->isImplicit()) {
DEBUG(dbgs() << " (implicit)\n");
@@ -896,6 +900,29 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
}
if (AntiDepReg == 0) continue;
+
+ // If the definition of the anti-dependency register does not start
+ // a new live range, bail out. This can happen if the anti-dep
+ // register is a sub-register of another register whose live range
+ // spans over PathSU. In such a case, PathSU defines only a part of
+ // the larger register.
+ RegAliases.reset();
+ for (MCRegAliasIterator AI(AntiDepReg, TRI, true); AI.isValid(); ++AI)
+ RegAliases.set(*AI);
+ for (SDep S : PathSU->Succs) {
+ SDep::Kind K = S.getKind();
+ if (K != SDep::Data && K != SDep::Output && K != SDep::Anti)
+ continue;
+ unsigned R = S.getReg();
+ if (!RegAliases[R])
+ continue;
+ if (R == AntiDepReg || TRI->isSubRegister(AntiDepReg, R))
+ continue;
+ AntiDepReg = 0;
+ break;
+ }
+
+ if (AntiDepReg == 0) continue;
}
assert(AntiDepReg != 0);
@@ -938,7 +965,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
for (DbgValueVector::iterator DVI = DbgValues.begin(),
DVE = DbgValues.end(); DVI != DVE; ++DVI)
if (DVI->second == Q.second.Operand->getParent())
- UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
+ UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
}
// We just went back in time and modified history; the
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index eba738396606..f97e6666b219 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -144,7 +144,7 @@ class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepState {
/// Update liveness information to account for the current
/// instruction, which will not be scheduled.
///
- void Observe(MachineInstr *MI, unsigned Count,
+ void Observe(MachineInstr &MI, unsigned Count,
unsigned InsertPosIndex) override;
/// Finish anti-dep breaking for a basic block.
@@ -156,19 +156,19 @@ class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepState {
/// Return true if MO represents a register
/// that is both implicitly used and defined in MI
- bool IsImplicitDefUse(MachineInstr *MI, MachineOperand& MO);
+ bool IsImplicitDefUse(MachineInstr &MI, MachineOperand &MO);
/// If MI implicitly def/uses a register, then
/// return that register and all subregisters.
- void GetPassthruRegs(MachineInstr *MI, std::set<unsigned>& PassthruRegs);
+ void GetPassthruRegs(MachineInstr &MI, std::set<unsigned> &PassthruRegs);
void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag,
const char *header = nullptr,
const char *footer = nullptr);
- void PrescanInstruction(MachineInstr *MI, unsigned Count,
- std::set<unsigned>& PassthruRegs);
- void ScanInstruction(MachineInstr *MI, unsigned Count);
+ void PrescanInstruction(MachineInstr &MI, unsigned Count,
+ std::set<unsigned> &PassthruRegs);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
BitVector GetRenameRegisters(unsigned Reg);
bool FindSuitableFreeRegisters(unsigned AntiDepGroupIndex,
RenameOrderType& RenameOrder,
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 75579a2b4559..d69073458cdf 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -15,7 +15,6 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -624,7 +623,9 @@ bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
if (!GV->hasLinkOnceODRLinkage())
return false;
- if (GV->hasUnnamedAddr())
+ // We assume that anyone who sets global unnamed_addr on a non-constant knows
+ // what they're doing.
+ if (GV->hasGlobalUnnamedAddr())
return true;
// If it is a non constant variable, it needs to be uniqued across shared
@@ -634,47 +635,36 @@ bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
return false;
}
- // An alias can point to a variable. We could try to resolve the alias to
- // decide, but for now just don't hide them.
- if (isa<GlobalAlias>(GV))
- return false;
-
- GlobalStatus GS;
- if (GlobalStatus::analyzeGlobal(GV, GS))
- return false;
-
- return !GS.IsCompared;
+ return GV->hasAtLeastLocalUnnamedAddr();
}
static void collectFuncletMembers(
DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
const MachineBasicBlock *MBB) {
- // Add this MBB to our funclet.
- auto P = FuncletMembership.insert(std::make_pair(MBB, Funclet));
+ SmallVector<const MachineBasicBlock *, 16> Worklist = {MBB};
+ while (!Worklist.empty()) {
+ const MachineBasicBlock *Visiting = Worklist.pop_back_val();
+ // Don't follow blocks which start new funclets.
+ if (Visiting->isEHPad() && Visiting != MBB)
+ continue;
- // Don't revisit blocks.
- if (!P.second) {
- assert(P.first->second == Funclet && "MBB is part of two funclets!");
- return;
- }
+ // Add this MBB to our funclet.
+ auto P = FuncletMembership.insert(std::make_pair(Visiting, Funclet));
- bool IsReturn = false;
- int NumTerminators = 0;
- for (const MachineInstr &MI : MBB->terminators()) {
- IsReturn |= MI.isReturn();
- ++NumTerminators;
- }
- assert((!IsReturn || NumTerminators == 1) &&
- "Expected only one terminator when a return is present!");
+ // Don't revisit blocks.
+ if (!P.second) {
+ assert(P.first->second == Funclet && "MBB is part of two funclets!");
+ continue;
+ }
- // Returns are boundaries where funclet transfer can occur, don't follow
- // successors.
- if (IsReturn)
- return;
+ // Returns are boundaries where funclet transfer can occur, don't follow
+ // successors.
+ if (Visiting->isReturnBlock())
+ continue;
- for (const MachineBasicBlock *SMBB : MBB->successors())
- if (!SMBB->isEHPad())
- collectFuncletMembers(FuncletMembership, Funclet, SMBB);
+ for (const MachineBasicBlock *Succ : Visiting->successors())
+ Worklist.push_back(Succ);
+ }
}
DenseMap<const MachineBasicBlock *, int>
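The rewritten collectFuncletMembers above replaces recursion with an explicit worklist while keeping the don't-revisit and boundary checks. Below is a minimal, generic sketch of that pattern on a toy graph of integer block IDs rather than MachineBasicBlocks; the names are illustrative only.

// Sketch only: worklist-based membership collection over a toy successor graph.
#include <cassert>
#include <map>
#include <vector>

void collectMembers(std::map<int, int> &Membership, int Group, int Start,
                    const std::map<int, std::vector<int>> &Succs) {
  std::vector<int> Worklist = {Start};
  while (!Worklist.empty()) {
    int Visiting = Worklist.back();
    Worklist.pop_back();

    // Don't revisit blocks; a block can be reached along several paths.
    auto P = Membership.insert({Visiting, Group});
    if (!P.second) {
      assert(P.first->second == Group && "block is part of two groups!");
      continue;
    }

    auto It = Succs.find(Visiting);
    if (It == Succs.end())
      continue;
    for (int S : It->second)
      Worklist.push_back(S);
  }
}

int main() {
  std::map<int, std::vector<int>> Succs = {{0, {1, 2}}, {1, {2}}, {2, {}}};
  std::map<int, int> Membership;
  collectMembers(Membership, /*Group=*/7, /*Start=*/0, Succs);
  // Membership now maps blocks 0, 1 and 2 to group 7.
}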
diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h
index 9f05200dcdf3..04f7f419f5ea 100644
--- a/lib/CodeGen/AntiDepBreaker.h
+++ b/lib/CodeGen/AntiDepBreaker.h
@@ -47,18 +47,18 @@ public:
/// Update liveness information to account for the current
/// instruction, which will not be scheduled.
- virtual void Observe(MachineInstr *MI, unsigned Count,
- unsigned InsertPosIndex) =0;
-
+ virtual void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) = 0;
+
/// Finish anti-dep breaking for a basic block.
virtual void FinishBlock() =0;
/// Update DBG_VALUE if dependency breaker is updating
/// other machine instruction to use NewReg.
- void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
- assert (MI->isDebugValue() && "MI is not DBG_VALUE!");
- if (MI && MI->getOperand(0).isReg() && MI->getOperand(0).getReg() == OldReg)
- MI->getOperand(0).setReg(NewReg);
+ void UpdateDbgValue(MachineInstr &MI, unsigned OldReg, unsigned NewReg) {
+ assert(MI.isDebugValue() && "MI is not DBG_VALUE!");
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg)
+ MI.getOperand(0).setReg(NewReg);
}
};
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index ade2d7105b88..5294c98e314d 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "DwarfException.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -28,7 +27,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetFrameLowering.h"
diff --git a/lib/CodeGen/AsmPrinter/AddressPool.h b/lib/CodeGen/AsmPrinter/AddressPool.h
index 211fc98c7f6f..ba3e3b7c315d 100644
--- a/lib/CodeGen/AsmPrinter/AddressPool.h
+++ b/lib/CodeGen/AsmPrinter/AddressPool.h
@@ -11,10 +11,10 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSymbol.h"
namespace llvm {
class MCSection;
-class MCSymbol;
class AsmPrinter;
// Collection of addresses for this unit and assorted labels.
// A Symbol->unsigned mapping of addresses used by indirect
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 5f67d3daa97f..272baceeed89 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -12,11 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/AsmPrinter.h"
+#include "CodeViewDebug.h"
#include "DwarfDebug.h"
#include "DwarfException.h"
#include "WinException.h"
-#include "WinCodeViewLineTables.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
@@ -125,6 +124,10 @@ AsmPrinter::~AsmPrinter() {
}
}
+bool AsmPrinter::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
/// getFunctionNumber - Return a unique ID for the current function.
///
unsigned AsmPrinter::getFunctionNumber() const {
@@ -248,12 +251,13 @@ bool AsmPrinter::doInitialization(Module &M) {
if (MAI->doesSupportDebugInformation()) {
bool EmitCodeView = MMI->getModule()->getCodeViewFlag();
if (EmitCodeView && TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) {
- Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this),
+ Handlers.push_back(HandlerInfo(new CodeViewDebug(this),
DbgTimerName,
CodeViewLineTablesGroupName));
}
if (!EmitCodeView || MMI->getModule()->getDwarfVersion()) {
DD = new DwarfDebug(this, &M);
+ DD->beginModule();
Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
}
}
@@ -319,21 +323,17 @@ void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
}
return;
- case GlobalValue::AppendingLinkage:
- // FIXME: appending linkage variables should go into a section of
- // their name or something. For now, just emit them as external.
case GlobalValue::ExternalLinkage:
- // If external or appending, declare as a global symbol.
- // .globl _foo
+ // If external, declare as a global symbol: .globl _foo
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
return;
case GlobalValue::PrivateLinkage:
case GlobalValue::InternalLinkage:
return;
+ case GlobalValue::AppendingLinkage:
case GlobalValue::AvailableExternallyLinkage:
- llvm_unreachable("Should never emit this");
case GlobalValue::ExternalWeakLinkage:
- llvm_unreachable("Don't know how to emit these");
+ llvm_unreachable("Should never emit this");
}
llvm_unreachable("Unknown linkage type!");
}
@@ -347,51 +347,17 @@ MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
return TM.getSymbol(GV, *Mang);
}
-static MCSymbol *getOrCreateEmuTLSControlSym(MCSymbol *GVSym, MCContext &C) {
- return C.getOrCreateSymbol(Twine("__emutls_v.") + GVSym->getName());
-}
-
-static MCSymbol *getOrCreateEmuTLSInitSym(MCSymbol *GVSym, MCContext &C) {
- return C.getOrCreateSymbol(Twine("__emutls_t.") + GVSym->getName());
-}
-
-/// EmitEmulatedTLSControlVariable - Emit the control variable for an emulated TLS variable.
-void AsmPrinter::EmitEmulatedTLSControlVariable(const GlobalVariable *GV,
- MCSymbol *EmittedSym,
- bool AllZeroInitValue) {
- MCSection *TLSVarSection = getObjFileLowering().getDataSection();
- OutStreamer->SwitchSection(TLSVarSection);
- MCSymbol *GVSym = getSymbol(GV);
- EmitLinkage(GV, EmittedSym); // same linkage as GV
- const DataLayout &DL = GV->getParent()->getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- unsigned AlignLog = getGVAlignmentLog2(GV, DL);
- unsigned WordSize = DL.getPointerSize();
- unsigned Alignment = DL.getPointerABIAlignment();
- EmitAlignment(Log2_32(Alignment));
- OutStreamer->EmitLabel(EmittedSym);
- OutStreamer->EmitIntValue(Size, WordSize);
- OutStreamer->EmitIntValue((1 << AlignLog), WordSize);
- OutStreamer->EmitIntValue(0, WordSize);
- if (GV->hasInitializer() && !AllZeroInitValue) {
- OutStreamer->EmitSymbolValue(
- getOrCreateEmuTLSInitSym(GVSym, OutContext), WordSize);
- } else
- OutStreamer->EmitIntValue(0, WordSize);
- if (MAI->hasDotTypeDotSizeDirective())
- OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedSym),
- MCConstantExpr::create(4 * WordSize, OutContext));
- OutStreamer->AddBlankLine(); // End of the __emutls_v.* variable.
-}
-
/// EmitGlobalVariable - Emit the specified global variable to the .s file.
void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
- bool IsEmuTLSVar =
- GV->getThreadLocalMode() != llvm::GlobalVariable::NotThreadLocal &&
- TM.Options.EmulatedTLS;
+ bool IsEmuTLSVar = TM.Options.EmulatedTLS && GV->isThreadLocal();
assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) &&
"No emulated TLS variables in the common section");
+ // Never emit the TLS variable xyz in the emulated TLS model.
+ // The initialization value is in __emutls_t.xyz instead of xyz.
+ if (IsEmuTLSVar)
+ return;
+
if (GV->hasInitializer()) {
// Check to see if this is a special global used by LLVM, if so, emit it.
if (EmitSpecialLLVMGlobal(GV))
@@ -402,7 +368,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
if (GlobalGOTEquivs.count(getSymbol(GV)))
return;
- if (isVerbose() && !IsEmuTLSVar) {
+ if (isVerbose()) {
// When printing the control variable __emutls_v.*,
// we don't need to print the original TLS variable name.
GV->printAsOperand(OutStreamer->GetCommentOS(),
@@ -412,11 +378,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
}
MCSymbol *GVSym = getSymbol(GV);
- MCSymbol *EmittedSym = IsEmuTLSVar ?
- getOrCreateEmuTLSControlSym(GVSym, OutContext) : GVSym;
- // getOrCreateEmuTLSControlSym only creates the symbol with name and default attributes.
- // GV's or GVSym's attributes will be used for the EmittedSym.
+ MCSymbol *EmittedSym = GVSym;
+ // getOrCreateEmuTLSControlSym only creates the symbol with name and default
+ // attributes.
+ // GV's or GVSym's attributes will be used for the EmittedSym.
EmitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration());
if (!GV->hasInitializer()) // External globals require no extra code.
@@ -440,48 +406,47 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// sections and expected to be contiguous (e.g. ObjC metadata).
unsigned AlignLog = getGVAlignmentLog2(GV, DL);
- bool AllZeroInitValue = false;
- const Constant *InitValue = GV->getInitializer();
- if (isa<ConstantAggregateZero>(InitValue))
- AllZeroInitValue = true;
- else {
- const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue);
- if (InitIntValue && InitIntValue->isZero())
- AllZeroInitValue = true;
- }
- if (IsEmuTLSVar)
- EmitEmulatedTLSControlVariable(GV, EmittedSym, AllZeroInitValue);
-
for (const HandlerInfo &HI : Handlers) {
NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled);
HI.Handler->setSymbolSize(GVSym, Size);
}
- // Handle common and BSS local symbols (.lcomm).
- if (GVKind.isCommon() || GVKind.isBSSLocal()) {
- assert(!(IsEmuTLSVar && GVKind.isCommon()) &&
- "No emulated TLS variables in the common section");
+ // Handle common symbols
+ if (GVKind.isCommon()) {
if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
unsigned Align = 1 << AlignLog;
+ if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
+ Align = 0;
- // Handle common symbols.
- if (GVKind.isCommon()) {
- if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
- Align = 0;
+ // .comm _foo, 42, 4
+ OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ return;
+ }
- // .comm _foo, 42, 4
- OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
- return;
- }
+ // Determine to which section this global should be emitted.
+ MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM);
- // Handle local BSS symbols.
- if (MAI->hasMachoZeroFillDirective()) {
- MCSection *TheSection =
- getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM);
- // .zerofill __DATA, __bss, _foo, 400, 5
- OutStreamer->EmitZerofill(TheSection, GVSym, Size, Align);
- return;
- }
+ // If we have a bss global going to a section that supports the
+ // zerofill directive, do so here.
+ if (GVKind.isBSS() && MAI->hasMachoZeroFillDirective() &&
+ TheSection->isVirtualSection()) {
+ if (Size == 0)
+ Size = 1; // zerofill of 0 bytes is undefined.
+ unsigned Align = 1 << AlignLog;
+ EmitLinkage(GV, GVSym);
+ // .zerofill __DATA, __bss, _foo, 400, 5
+ OutStreamer->EmitZerofill(TheSection, GVSym, Size, Align);
+ return;
+ }
+
+ // If this is a BSS local symbol and we are emitting in the BSS
+ // section, use the .lcomm/.comm directive.
+ if (GVKind.isBSSLocal() &&
+ getObjFileLowering().getBSSSection() == TheSection) {
+ if (Size == 0)
+ Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ unsigned Align = 1 << AlignLog;
// Use .lcomm only if it supports user-specified alignment.
// Otherwise, while it would still be correct to use .lcomm in some
@@ -505,30 +470,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
return;
}
- if (IsEmuTLSVar && AllZeroInitValue)
- return; // No need of initialization values.
-
- MCSymbol *EmittedInitSym = IsEmuTLSVar ?
- getOrCreateEmuTLSInitSym(GVSym, OutContext) : GVSym;
- // getOrCreateEmuTLSInitSym only creates the symbol with name and default attributes.
- // GV's or GVSym's attributes will be used for the EmittedInitSym.
-
- MCSection *TheSection = IsEmuTLSVar ?
- getObjFileLowering().getReadOnlySection() :
- getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM);
-
- // Handle the zerofill directive on darwin, which is a special form of BSS
- // emission.
- if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective() && !IsEmuTLSVar) {
- if (Size == 0) Size = 1; // zerofill of 0 bytes is undefined.
-
- // .globl _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
- // .zerofill __DATA, __common, _foo, 400, 5
- OutStreamer->EmitZerofill(TheSection, GVSym, Size, 1 << AlignLog);
- return;
- }
-
// Handle thread local data for mach-o which requires us to output an
// additional structure of data and mangle the original symbol so that we
// can reference it later.
@@ -539,7 +480,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// TLOF class. This will also make it more obvious that stuff like
// MCStreamer::EmitTBSSSymbol is macho specific and only called from macho
// specific code.
- if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective() && !IsEmuTLSVar) {
+ if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) {
// Emit the .tbss symbol
MCSymbol *MangSym =
OutContext.getOrCreateSymbol(GVSym->getName() + Twine("$tlv$init"));
@@ -581,11 +522,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
return;
}
+ MCSymbol *EmittedInitSym = GVSym;
+
OutStreamer->SwitchSection(TheSection);
- // emutls_t.* symbols are only used in the current compilation unit.
- if (!IsEmuTLSVar)
- EmitLinkage(GV, EmittedInitSym);
+ EmitLinkage(GV, EmittedInitSym);
EmitAlignment(AlignLog, GV);
OutStreamer->EmitLabel(EmittedInitSym);
@@ -696,20 +637,20 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
// We assume a single instruction only has a spill or reload, not
// both.
const MachineMemOperand *MMO;
- if (TII->isLoadFromStackSlotPostFE(&MI, FI)) {
+ if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
if (FrameInfo->isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
CommentOS << MMO->getSize() << "-byte Reload\n";
}
- } else if (TII->hasLoadFromStackSlot(&MI, MMO, FI)) {
+ } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
if (FrameInfo->isSpillSlotObjectIndex(FI))
CommentOS << MMO->getSize() << "-byte Folded Reload\n";
- } else if (TII->isStoreToStackSlotPostFE(&MI, FI)) {
+ } else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
if (FrameInfo->isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
CommentOS << MMO->getSize() << "-byte Spill\n";
}
- } else if (TII->hasStoreToStackSlot(&MI, MMO, FI)) {
+ } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
if (FrameInfo->isSpillSlotObjectIndex(FI))
CommentOS << MMO->getSize() << "-byte Folded Spill\n";
}
@@ -745,7 +686,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
AP.MF->getSubtarget().getRegisterInfo())
<< (Op.isDef() ? "<def>" : "<kill>");
}
- AP.OutStreamer->AddComment(Str);
+ AP.OutStreamer->AddComment(OS.str());
AP.OutStreamer->AddBlankLine();
}
@@ -1065,8 +1006,9 @@ static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
// Global GOT equivalents are unnamed private globals with a constant
// pointer initializer to another global symbol. They must point to a
// GlobalVariable or Function, i.e., as GlobalValue.
- if (!GV->hasUnnamedAddr() || !GV->hasInitializer() || !GV->isConstant() ||
- !GV->isDiscardableIfUnused() || !dyn_cast<GlobalValue>(GV->getOperand(0)))
+ if (!GV->hasGlobalUnnamedAddr() || !GV->hasInitializer() ||
+ !GV->isConstant() || !GV->isDiscardableIfUnused() ||
+ !dyn_cast<GlobalValue>(GV->getOperand(0)))
return false;
// To be a got equivalent, at least one of its users need to be a constant
@@ -1118,6 +1060,52 @@ void AsmPrinter::emitGlobalGOTEquivs() {
EmitGlobalVariable(GV);
}
+void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
+ const GlobalIndirectSymbol& GIS) {
+ MCSymbol *Name = getSymbol(&GIS);
+
+ if (GIS.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_Global);
+ else if (GIS.hasWeakLinkage() || GIS.hasLinkOnceLinkage())
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_WeakReference);
+ else
+ assert(GIS.hasLocalLinkage() && "Invalid alias or ifunc linkage");
+
+ // Set the symbol type to function if the alias has a function type.
+ // This affects codegen when the aliasee is not a function.
+ if (GIS.getType()->getPointerElementType()->isFunctionTy()) {
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
+ if (isa<GlobalIFunc>(GIS))
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction);
+ }
+
+ EmitVisibility(Name, GIS.getVisibility());
+
+ const MCExpr *Expr = lowerConstant(GIS.getIndirectSymbol());
+
+ if (isa<GlobalAlias>(&GIS) && MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_AltEntry);
+
+ // Emit the directives as assignments aka .set:
+ OutStreamer->EmitAssignment(Name, Expr);
+
+ if (auto *GA = dyn_cast<GlobalAlias>(&GIS)) {
+ // If the aliasee does not correspond to a symbol in the output, i.e. the
+ // alias is not of an object or the aliased object is private, then set the
+ // size of the alias symbol from the type of the alias. We don't do this in
+ // other situations as the alias and aliasee having differing types but same
+ // size may be intentional.
+ const GlobalObject *BaseObject = GA->getBaseObject();
+ if (MAI->hasDotTypeDotSizeDirective() && GA->getValueType()->isSized() &&
+ (!BaseObject || BaseObject->hasPrivateLinkage())) {
+ const DataLayout &DL = M.getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GA->getValueType());
+ OutStreamer->emitELFSize(cast<MCSymbolELF>(Name),
+ MCConstantExpr::create(Size, OutContext));
+ }
+ }
+}
+
bool AsmPrinter::doFinalization(Module &M) {
// Set the MachineFunction to nullptr so that we can catch attempted
// accesses to MF specific features at the module level and so that
@@ -1191,55 +1179,35 @@ bool AsmPrinter::doFinalization(Module &M) {
// to notice uses in operands (due to constant exprs etc). This should
// happen with the MC stuff eventually.
- // Print out module-level global variables here.
- for (const auto &G : M.globals()) {
- if (!G.hasExternalWeakLinkage())
+ // Print out module-level global objects here.
+ for (const auto &GO : M.global_objects()) {
+ if (!GO.hasExternalWeakLinkage())
continue;
- OutStreamer->EmitSymbolAttribute(getSymbol(&G), MCSA_WeakReference);
- }
-
- for (const auto &F : M) {
- if (!F.hasExternalWeakLinkage())
- continue;
- OutStreamer->EmitSymbolAttribute(getSymbol(&F), MCSA_WeakReference);
+ OutStreamer->EmitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference);
}
}
OutStreamer->AddBlankLine();
- for (const auto &Alias : M.aliases()) {
- MCSymbol *Name = getSymbol(&Alias);
-
- if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
- OutStreamer->EmitSymbolAttribute(Name, MCSA_Global);
- else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
- OutStreamer->EmitSymbolAttribute(Name, MCSA_WeakReference);
- else
- assert(Alias.hasLocalLinkage() && "Invalid alias linkage");
-
- // Set the symbol type to function if the alias has a function type.
- // This affects codegen when the aliasee is not a function.
- if (Alias.getType()->getPointerElementType()->isFunctionTy())
- OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
-
- EmitVisibility(Name, Alias.getVisibility());
- // Emit the directives as assignments aka .set:
- OutStreamer->EmitAssignment(Name, lowerConstant(Alias.getAliasee()));
-
- // If the aliasee does not correspond to a symbol in the output, i.e. the
- // alias is not of an object or the aliased object is private, then set the
- // size of the alias symbol from the type of the alias. We don't do this in
- // other situations as the alias and aliasee having differing types but same
- // size may be intentional.
- const GlobalObject *BaseObject = Alias.getBaseObject();
- if (MAI->hasDotTypeDotSizeDirective() && Alias.getValueType()->isSized() &&
- (!BaseObject || BaseObject->hasPrivateLinkage())) {
- const DataLayout &DL = M.getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(Alias.getValueType());
- OutStreamer->emitELFSize(cast<MCSymbolELF>(Name),
- MCConstantExpr::create(Size, OutContext));
+ // Print aliases in topological order, that is, for each alias a = b,
+ // b must be printed before a.
+ // This is because on some targets (e.g. PowerPC) the linker expects aliases in
+ // such an order to generate correct TOC information.
+ SmallVector<const GlobalAlias *, 16> AliasStack;
+ SmallPtrSet<const GlobalAlias *, 16> AliasVisited;
+ for (const auto &Alias : M.aliases()) {
+ for (const GlobalAlias *Cur = &Alias; Cur;
+ Cur = dyn_cast<GlobalAlias>(Cur->getAliasee())) {
+ if (!AliasVisited.insert(Cur).second)
+ break;
+ AliasStack.push_back(Cur);
}
+ for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+ emitGlobalIndirectSymbol(M, *AncestorAlias);
+ AliasStack.clear();
}
+ for (const auto &IFunc : M.ifuncs())
+ emitGlobalIndirectSymbol(M, IFunc);
GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
assert(MI && "AsmPrinter didn't require GCModuleInfo?");
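The alias-emission loop added in the hunk above walks each aliasee chain onto a stack and then emits it in reverse, so that for an alias a = b the definition of b always precedes a. A minimal standalone sketch of that ordering, using a string-to-string alias map in place of GlobalAlias objects (illustrative names, not the AsmPrinter API):

// Sketch only: emit aliases so every aliasee is printed before any alias
// that refers to it, mirroring the stack + visited-set walk above.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

void emitAliases(const std::map<std::string, std::string> &Aliasee) {
  std::set<std::string> Visited;
  std::vector<std::string> Stack;
  for (const auto &KV : Aliasee) {
    // Walk the chain a -> b -> c ..., pushing each not-yet-emitted alias;
    // stop when the target is not an alias or was already handled.
    std::string Cur = KV.first;
    while (Aliasee.count(Cur) && Visited.insert(Cur).second) {
      Stack.push_back(Cur);
      Cur = Aliasee.at(Cur);
    }
    // Emit bottom-up so the aliasee always comes out first.
    for (auto It = Stack.rbegin(); It != Stack.rend(); ++It)
      std::cout << ".set " << *It << ", " << Aliasee.at(*It) << "\n";
    Stack.clear();
  }
}

int main() {
  emitAliases({{"foo", "bar"}, {"bar", "baz"}});
  // Prints ".set bar, baz" and then ".set foo, bar".
}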
@@ -1252,9 +1220,10 @@ bool AsmPrinter::doFinalization(Module &M) {
// Emit __morestack address if needed for indirect calls.
if (MMI->usesMorestackAddr()) {
+ unsigned Align = 1;
MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant(
getDataLayout(), SectionKind::getReadOnly(),
- /*C=*/nullptr);
+ /*C=*/nullptr, Align);
OutStreamer->SwitchSection(ReadOnlySection);
MCSymbol *AddrSymbol =
@@ -1344,8 +1313,8 @@ void AsmPrinter::EmitConstantPool() {
if (!CPE.isMachineConstantPoolEntry())
C = CPE.Val.ConstVal;
- MCSection *S =
- getObjFileLowering().getSectionForConstant(getDataLayout(), Kind, C);
+ MCSection *S = getObjFileLowering().getSectionForConstant(getDataLayout(),
+ Kind, C, Align);
// The number of sections are small, just do a linear search from the
// last section to the first.
@@ -1443,7 +1412,7 @@ void AsmPrinter::EmitJumpTableInfo() {
// For the EK_LabelDifference32 entry, if using .set avoids a relocation,
/// emit a .set directive for each unique entry.
if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
- MAI->doesSetDirectiveSuppressesReloc()) {
+ MAI->doesSetDirectiveSuppressReloc()) {
SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
@@ -1524,7 +1493,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
// If the .set directive avoids relocations, this is emitted as:
// .set L4_5_set_123, LBB123 - LJTI1_2
// .word L4_5_set_123
- if (MAI->doesSetDirectiveSuppressesReloc()) {
+ if (MAI->doesSetDirectiveSuppressReloc()) {
Value = MCSymbolRefExpr::create(GetJTSetSymbol(UID, MBB->getNumber()),
OutContext);
break;
@@ -1555,7 +1524,7 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
}
// Ignore debug and non-emitted data. This handles llvm.compiler.used.
- if (StringRef(GV->getSection()) == "llvm.metadata" ||
+ if (GV->getSection() == "llvm.metadata" ||
GV->hasAvailableExternallyLinkage())
return true;
@@ -1589,7 +1558,7 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
return true;
}
- return false;
+ report_fatal_error("unknown special variable");
}
/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
@@ -1648,7 +1617,8 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
S.Priority = Priority->getLimitedValue(65535);
S.Func = CS->getOperand(1);
if (ETy->getNumElements() == 3 && !CS->getOperand(2)->isNullValue())
- S.ComdatKey = dyn_cast<GlobalValue>(CS->getOperand(2)->stripPointerCasts());
+ S.ComdatKey =
+ dyn_cast<GlobalValue>(CS->getOperand(2)->stripPointerCasts());
}
// Emit the function pointers in the target-specific order
@@ -1789,10 +1759,6 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
llvm_unreachable("Unknown constant value to lower!");
}
- if (const MCExpr *RelocExpr
- = getObjFileLowering().getExecutableRelativeSymbol(CE, *Mang, TM))
- return RelocExpr;
-
switch (CE->getOpcode()) {
default:
// If the code isn't optimized, there may be outstanding folding
@@ -1868,10 +1834,34 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
}
+ case Instruction::Sub: {
+ GlobalValue *LHSGV;
+ APInt LHSOffset;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(0), LHSGV, LHSOffset,
+ getDataLayout())) {
+ GlobalValue *RHSGV;
+ APInt RHSOffset;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(1), RHSGV, RHSOffset,
+ getDataLayout())) {
+ const MCExpr *RelocExpr = getObjFileLowering().lowerRelativeReference(
+ LHSGV, RHSGV, *Mang, TM);
+ if (!RelocExpr)
+ RelocExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(getSymbol(LHSGV), Ctx),
+ MCSymbolRefExpr::create(getSymbol(RHSGV), Ctx), Ctx);
+ int64_t Addend = (LHSOffset - RHSOffset).getSExtValue();
+ if (Addend != 0)
+ RelocExpr = MCBinaryExpr::createAdd(
+ RelocExpr, MCConstantExpr::create(Addend, Ctx), Ctx);
+ return RelocExpr;
+ }
+ }
+ }
+ // else fallthrough
+
// The MC library also has a right-shift operator, but it isn't consistently
// signed or unsigned between different targets.
case Instruction::Add:
- case Instruction::Sub:
case Instruction::Mul:
case Instruction::SDiv:
case Instruction::SRem:
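The new Instruction::Sub case above folds the two constant offsets into a single addend, relying on (G1 + c1) - (G2 + c2) = (G1 - G2) + (c1 - c2). A toy sketch of that addend arithmetic with plain strings standing in for MCExpr nodes (illustrative only):

// Sketch only: build "symA - symB + addend" from two (symbol, offset) pairs.
#include <cstdint>
#include <iostream>
#include <string>

std::string lowerGlobalDifference(const std::string &LHSSym, int64_t LHSOffset,
                                  const std::string &RHSSym, int64_t RHSOffset) {
  std::string Expr = LHSSym + " - " + RHSSym;
  int64_t Addend = LHSOffset - RHSOffset; // (G1+c1)-(G2+c2) == (G1-G2)+(c1-c2)
  if (Addend != 0)
    Expr += " + " + std::to_string(Addend);
  return Expr;
}

int main() {
  // e.g. &a[3] - &b[1] with 4-byte elements: offsets 12 and 4.
  std::cout << lowerGlobalDifference("a", 12, "b", 4) << "\n"; // a - b + 8
}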
@@ -1964,7 +1954,7 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
uint64_t Bytes = DL.getTypeAllocSize(CDS->getType());
// Don't emit a 1-byte object as a .fill.
if (Bytes > 1)
- return AP.OutStreamer->EmitFill(Bytes, Value);
+ return AP.OutStreamer->emitFill(Bytes, Value);
}
// If this can be emitted with .ascii/.asciz, emit it as such.
@@ -2003,7 +1993,7 @@ static void emitGlobalConstantArray(const DataLayout &DL,
if (Value != -1) {
uint64_t Bytes = DL.getTypeAllocSize(CA->getType());
- AP.OutStreamer->EmitFill(Bytes, Value);
+ AP.OutStreamer->emitFill(Bytes, Value);
}
else {
for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
@@ -2582,7 +2572,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
// If we are the operands of one of the branches, this is not a fall
// through. Note that targets with delay slots will usually bundle
// terminators with the delay slot instruction.
- for (ConstMIBundleOperands OP(&MI); OP.isValid(); ++OP) {
+ for (ConstMIBundleOperands OP(MI); OP.isValid(); ++OP) {
if (OP->isJTI())
return false;
if (OP->isMBB() && OP->getMBB() == MBB)
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 504c5d283cba..60f40d063cc8 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -178,8 +178,7 @@ void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntryRef S) const {
/// EmitDwarfRegOp - Emit dwarf register operation.
void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
const MachineLocation &MLoc) const {
- DebugLocDwarfExpression Expr(*MF->getSubtarget().getRegisterInfo(),
- getDwarfDebug()->getDwarfVersion(), Streamer);
+ DebugLocDwarfExpression Expr(getDwarfDebug()->getDwarfVersion(), Streamer);
const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
int Reg = MRI->getDwarfRegNum(MLoc.getReg(), false);
if (Reg < 0) {
@@ -193,7 +192,8 @@ void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
"nop (could not find a dwarf register number)");
// Attempt to find a valid super- or sub-register.
- if (!Expr.AddMachineRegPiece(MLoc.getReg()))
+ if (!Expr.AddMachineRegPiece(*MF->getSubtarget().getRegisterInfo(),
+ MLoc.getReg()))
Expr.EmitOp(dwarf::DW_OP_nop,
"nop (could not find a dwarf register number)");
return;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
index e59961f85769..638226e90a7a 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
@@ -19,11 +19,14 @@
namespace llvm {
+class AsmPrinter;
class MachineBasicBlock;
class MachineFunction;
class MachineInstr;
class MCSymbol;
+typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm);
+
/// \brief Collects and handles AsmPrinter objects required to build debug
/// or EH information.
class AsmPrinterHandler {
@@ -51,6 +54,10 @@ public:
/// beginFunction at all.
virtual void endFunction(const MachineFunction *MF) = 0;
+ virtual void beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) {}
+ virtual void endFragment() {}
+
/// \brief Emit target-specific EH funclet machinery.
virtual void beginFunclet(const MachineBasicBlock &MBB,
MCSymbol *Sym = nullptr) {}
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 5633aa4a5655..2ce6c182235f 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -23,10 +23,10 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h
index df1997bcb72c..aaf6180c9404 100644
--- a/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -16,7 +16,6 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
#include "DIEHash.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/LEB128.h"
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index ba2f61a44828..68a9973d77ef 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMAsmPrinter
AsmPrinterDwarf.cpp
AsmPrinterInlineAsm.cpp
DbgValueHistoryCalculator.cpp
+ DebugHandlerBase.cpp
DebugLocStream.cpp
DIE.cpp
DIEHash.cpp
@@ -20,7 +21,7 @@ add_llvm_library(LLVMAsmPrinter
ErlangGCPrinter.cpp
OcamlGCPrinter.cpp
WinException.cpp
- WinCodeViewLineTables.cpp
+ CodeViewDebug.cpp
)
add_dependencies(LLVMAsmPrinter intrinsics_gen)
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
new file mode 100644
index 000000000000..b0ba57122206
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -0,0 +1,2076 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp --*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeViewDebug.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h"
+#include "llvm/DebugInfo/CodeView/Line.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeDumper.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
+ : DebugHandlerBase(AP), OS(*Asm->OutStreamer), CurFn(nullptr) {
+ // If the module doesn't have named metadata anchors or the COFF debug
+ // section is not available, skip any debug info related stuff.
+ if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
+ !AP->getObjFileLowering().getCOFFDebugSymbolsSection()) {
+ Asm = nullptr;
+ return;
+ }
+
+ // Tell MMI that we have debug info.
+ MMI->setDebugInfoAvailability(true);
+}
+
+StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
+ std::string &Filepath = FileToFilepathMap[File];
+ if (!Filepath.empty())
+ return Filepath;
+
+ StringRef Dir = File->getDirectory(), Filename = File->getFilename();
+
+ // Clang emits directory and relative filename info into the IR, but CodeView
+ // operates on full paths. We could change Clang to emit full paths too, but
+ // that would increase the IR size and is probably not needed by other users.
+ // For now, just concatenate and canonicalize the path here.
+ if (Filename.find(':') == 1)
+ Filepath = Filename;
+ else
+ Filepath = (Dir + "\\" + Filename).str();
+
+ // Canonicalize the path. We have to do it textually because we may no longer
+ // have access to the file in the filesystem.
+ // First, replace all slashes with backslashes.
+ std::replace(Filepath.begin(), Filepath.end(), '/', '\\');
+
+ // Replace all "\.\" with "\".
+ size_t Cursor = 0;
+ while ((Cursor = Filepath.find("\\.\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 2);
+
+ // Replace all "\XXX\..\" with "\". Don't try too hard though as the original
+ // path should be well-formatted, e.g. start with a drive letter, etc.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\..\\", Cursor)) != std::string::npos) {
+ // Something's wrong if the path starts with "\..\", abort.
+ if (Cursor == 0)
+ break;
+
+ size_t PrevSlash = Filepath.rfind('\\', Cursor - 1);
+ if (PrevSlash == std::string::npos)
+ // Something's wrong, abort.
+ break;
+
+ Filepath.erase(PrevSlash, Cursor + 3 - PrevSlash);
+ // The next ".." might be following the one we've just erased.
+ Cursor = PrevSlash;
+ }
+
+ // Remove all duplicate backslashes.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 1);
+
+ return Filepath;
+}
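getFullFilepath above canonicalizes the path purely textually because the file may no longer exist on disk. A rough standalone sketch of the same steps on a plain std::string, assuming backslash-separated Windows-style paths (not the CodeViewDebug code itself):

// Sketch only: textual canonicalization of a Windows-style path.
#include <algorithm>
#include <iostream>
#include <string>

std::string canonicalize(std::string Path) {
  // Use backslashes throughout.
  std::replace(Path.begin(), Path.end(), '/', '\\');

  // Collapse every "\.\" to "\".
  size_t Cursor = 0;
  while ((Cursor = Path.find("\\.\\", Cursor)) != std::string::npos)
    Path.erase(Cursor, 2);

  // Collapse "\XXX\..\" to "\" (best effort; assumes a well-formed prefix).
  Cursor = 0;
  while ((Cursor = Path.find("\\..\\", Cursor)) != std::string::npos) {
    if (Cursor == 0)
      break;
    size_t PrevSlash = Path.rfind('\\', Cursor - 1);
    if (PrevSlash == std::string::npos)
      break;
    Path.erase(PrevSlash, Cursor + 3 - PrevSlash);
    Cursor = PrevSlash;
  }

  // Squash duplicate backslashes.
  Cursor = 0;
  while ((Cursor = Path.find("\\\\", Cursor)) != std::string::npos)
    Path.erase(Cursor, 1);

  return Path;
}

int main() {
  std::cout << canonicalize("C:/foo/./bar\\baz\\..\\qux.c") << "\n";
  // Prints: C:\foo\bar\qux.c
}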
+
+unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) {
+ unsigned NextId = FileIdMap.size() + 1;
+ auto Insertion = FileIdMap.insert(std::make_pair(F, NextId));
+ if (Insertion.second) {
+ // We have to compute the full filepath and emit a .cv_file directive.
+ StringRef FullPath = getFullFilepath(F);
+ NextId = OS.EmitCVFileDirective(NextId, FullPath);
+ assert(NextId == FileIdMap.size() && ".cv_file directive failed");
+ }
+ return Insertion.first->second;
+}
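maybeRecordFile above uses a common memoization idiom: try to insert the next ID, and only perform the registration side effect when the insertion actually happened. A small standalone sketch of the same idiom with strings in place of DIFile pointers (the printed .cv_file text is only illustrative):

// Sketch only: assign the next ID on first use; reuse it afterwards.
#include <iostream>
#include <map>
#include <string>

unsigned maybeRecordFile(std::map<std::string, unsigned> &FileIdMap,
                         const std::string &File) {
  unsigned NextId = FileIdMap.size() + 1;
  auto Insertion = FileIdMap.insert({File, NextId});
  if (Insertion.second) {
    // First time we see this file: emit whatever side effect registers it.
    std::cout << ".cv_file " << NextId << " \"" << File << "\"\n";
  }
  return Insertion.first->second;
}

int main() {
  std::map<std::string, unsigned> Ids;
  maybeRecordFile(Ids, "a.c"); // prints .cv_file 1 "a.c"
  maybeRecordFile(Ids, "b.c"); // prints .cv_file 2 "b.c"
  maybeRecordFile(Ids, "a.c"); // no output; id 1 is reused
}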
+
+CodeViewDebug::InlineSite &
+CodeViewDebug::getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee) {
+ auto SiteInsertion = CurFn->InlineSites.insert({InlinedAt, InlineSite()});
+ InlineSite *Site = &SiteInsertion.first->second;
+ if (SiteInsertion.second) {
+ Site->SiteFuncId = NextFuncId++;
+ Site->Inlinee = Inlinee;
+ InlinedSubprograms.insert(Inlinee);
+ getFuncIdForSubprogram(Inlinee);
+ }
+ return *Site;
+}
+
+static StringRef getPrettyScopeName(const DIScope *Scope) {
+ StringRef ScopeName = Scope->getName();
+ if (!ScopeName.empty())
+ return ScopeName;
+
+ switch (Scope->getTag()) {
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ return "<unnamed-tag>";
+ case dwarf::DW_TAG_namespace:
+ return "`anonymous namespace'";
+ }
+
+ return StringRef();
+}
+
+static const DISubprogram *getQualifiedNameComponents(
+ const DIScope *Scope, SmallVectorImpl<StringRef> &QualifiedNameComponents) {
+ const DISubprogram *ClosestSubprogram = nullptr;
+ while (Scope != nullptr) {
+ if (ClosestSubprogram == nullptr)
+ ClosestSubprogram = dyn_cast<DISubprogram>(Scope);
+ StringRef ScopeName = getPrettyScopeName(Scope);
+ if (!ScopeName.empty())
+ QualifiedNameComponents.push_back(ScopeName);
+ Scope = Scope->getScope().resolve();
+ }
+ return ClosestSubprogram;
+}
+
+static std::string getQualifiedName(ArrayRef<StringRef> QualifiedNameComponents,
+ StringRef TypeName) {
+ std::string FullyQualifiedName;
+ for (StringRef QualifiedNameComponent : reverse(QualifiedNameComponents)) {
+ FullyQualifiedName.append(QualifiedNameComponent);
+ FullyQualifiedName.append("::");
+ }
+ FullyQualifiedName.append(TypeName);
+ return FullyQualifiedName;
+}
+
+static std::string getFullyQualifiedName(const DIScope *Scope, StringRef Name) {
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ getQualifiedNameComponents(Scope, QualifiedNameComponents);
+ return getQualifiedName(QualifiedNameComponents, Name);
+}
+
+struct CodeViewDebug::TypeLoweringScope {
+ TypeLoweringScope(CodeViewDebug &CVD) : CVD(CVD) { ++CVD.TypeEmissionLevel; }
+ ~TypeLoweringScope() {
+ // Don't decrement TypeEmissionLevel until after emitting deferred types, so
+ // inner TypeLoweringScopes don't attempt to emit deferred types.
+ if (CVD.TypeEmissionLevel == 1)
+ CVD.emitDeferredCompleteTypes();
+ --CVD.TypeEmissionLevel;
+ }
+ CodeViewDebug &CVD;
+};
+
+static std::string getFullyQualifiedName(const DIScope *Ty) {
+ const DIScope *Scope = Ty->getScope().resolve();
+ return getFullyQualifiedName(Scope, getPrettyScopeName(Ty));
+}
+
+TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) {
+ // No scope means global scope and that uses the zero index.
+ if (!Scope || isa<DIFile>(Scope))
+ return TypeIndex();
+
+ assert(!isa<DIType>(Scope) && "shouldn't make a namespace scope for a type");
+
+ // Check if we've already translated this scope.
+ auto I = TypeIndices.find({Scope, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Build the fully qualified name of the scope.
+ std::string ScopeName = getFullyQualifiedName(Scope);
+ TypeIndex TI =
+ TypeTable.writeStringId(StringIdRecord(TypeIndex(), ScopeName));
+ return recordTypeIndexForDINode(Scope, TI);
+}
+
+TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
+ // It's possible to ask for the FuncId of a function which doesn't have a
+ // subprogram: inlining a function with debug info into a function with none.
+ if (!SP)
+ return TypeIndex::None();
+
+ // Check if we've already translated this subprogram.
+ auto I = TypeIndices.find({SP, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // The display name includes function template arguments. Drop them to match
+ // MSVC.
+ StringRef DisplayName = SP->getDisplayName().split('<').first;
+
+ const DIScope *Scope = SP->getScope().resolve();
+ TypeIndex TI;
+ if (const auto *Class = dyn_cast_or_null<DICompositeType>(Scope)) {
+ // If the scope is a DICompositeType, then this must be a method. Member
+ // function types take some special handling, and require access to the
+ // subprogram.
+ TypeIndex ClassType = getTypeIndex(Class);
+ MemberFuncIdRecord MFuncId(ClassType, getMemberFunctionType(SP, Class),
+ DisplayName);
+ TI = TypeTable.writeMemberFuncId(MFuncId);
+ } else {
+ // Otherwise, this must be a free function.
+ TypeIndex ParentScope = getScopeIndex(Scope);
+ FuncIdRecord FuncId(ParentScope, getTypeIndex(SP->getType()), DisplayName);
+ TI = TypeTable.writeFuncId(FuncId);
+ }
+
+ return recordTypeIndexForDINode(SP, TI);
+}
+
+TypeIndex CodeViewDebug::getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class) {
+ // Always use the method declaration as the key for the function type. The
+ // method declaration contains the this adjustment.
+ if (SP->getDeclaration())
+ SP = SP->getDeclaration();
+ assert(!SP->getDeclaration() && "should use declaration as key");
+
+ // Key the MemberFunctionRecord into the map as {SP, Class}. It won't collide
+ // with the MemberFuncIdRecord, which is keyed in as {SP, nullptr}.
+ auto I = TypeIndices.find({SP, Class});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Make sure complete type info for the class is emitted *after* the member
+ // function type, as the complete class type is likely to reference this
+ // member function type.
+ TypeLoweringScope S(*this);
+ TypeIndex TI =
+ lowerTypeMemberFunction(SP->getType(), Class, SP->getThisAdjustment());
+ return recordTypeIndexForDINode(SP, TI, Class);
+}
+
+TypeIndex CodeViewDebug::recordTypeIndexForDINode(const DINode *Node,
+ TypeIndex TI,
+ const DIType *ClassTy) {
+ auto InsertResult = TypeIndices.insert({{Node, ClassTy}, TI});
+ (void)InsertResult;
+ assert(InsertResult.second && "DINode was already assigned a type index");
+ return TI;
+}
+
+unsigned CodeViewDebug::getPointerSizeInBytes() {
+ return MMI->getModule()->getDataLayout().getPointerSizeInBits() / 8;
+}
+
+void CodeViewDebug::recordLocalVariable(LocalVariable &&Var,
+ const DILocation *InlinedAt) {
+ if (InlinedAt) {
+ // This variable was inlined. Associate it with the InlineSite.
+ const DISubprogram *Inlinee = Var.DIVar->getScope()->getSubprogram();
+ InlineSite &Site = getInlineSite(InlinedAt, Inlinee);
+ Site.InlinedLocals.emplace_back(Var);
+ } else {
+ // This variable goes in the main ProcSym.
+ CurFn->Locals.emplace_back(Var);
+ }
+}
+
+static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs,
+ const DILocation *Loc) {
+ auto B = Locs.begin(), E = Locs.end();
+ if (std::find(B, E, Loc) == E)
+ Locs.push_back(Loc);
+}
+
+void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
+ const MachineFunction *MF) {
+ // Skip this instruction if it has the same location as the previous one.
+ if (DL == CurFn->LastLoc)
+ return;
+
+ const DIScope *Scope = DL.get()->getScope();
+ if (!Scope)
+ return;
+
+  // Skip this location if its line number is larger than the maximum we can
+  // record.
+ LineInfo LI(DL.getLine(), DL.getLine(), /*IsStatement=*/true);
+ if (LI.getStartLine() != DL.getLine() || LI.isAlwaysStepInto() ||
+ LI.isNeverStepInto())
+ return;
+
+ ColumnInfo CI(DL.getCol(), /*EndColumn=*/0);
+ if (CI.getStartColumn() != DL.getCol())
+ return;
+
+ if (!CurFn->HaveLineInfo)
+ CurFn->HaveLineInfo = true;
+ unsigned FileId = 0;
+ if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile())
+ FileId = CurFn->LastFileId;
+ else
+ FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile());
+ CurFn->LastLoc = DL;
+
+ unsigned FuncId = CurFn->FuncId;
+ if (const DILocation *SiteLoc = DL->getInlinedAt()) {
+ const DILocation *Loc = DL.get();
+
+ // If this location was actually inlined from somewhere else, give it the ID
+ // of the inline call site.
+ FuncId =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram()).SiteFuncId;
+
+ // Ensure we have links in the tree of inline call sites.
+ bool FirstLoc = true;
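+    // Walk up the inlined-at chain, linking each call site to its parent. The
+    // first iteration is skipped because Loc is still the instruction's own
+    // location rather than an inline site; the outermost site is attached to
+    // CurFn's ChildSites after the loop.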
+ while ((SiteLoc = Loc->getInlinedAt())) {
+ InlineSite &Site =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram());
+ if (!FirstLoc)
+ addLocIfNotPresent(Site.ChildSites, Loc);
+ FirstLoc = false;
+ Loc = SiteLoc;
+ }
+ addLocIfNotPresent(CurFn->ChildSites, Loc);
+ }
+
+ OS.EmitCVLocDirective(FuncId, FileId, DL.getLine(), DL.getCol(),
+ /*PrologueEnd=*/false,
+ /*IsStmt=*/false, DL->getFilename());
+}
+
+void CodeViewDebug::emitCodeViewMagicVersion() {
+ OS.EmitValueToAlignment(4);
+ OS.AddComment("Debug section magic");
+ OS.EmitIntValue(COFF::DEBUG_SECTION_MAGIC, 4);
+}
+
+void CodeViewDebug::endModule() {
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ assert(Asm != nullptr);
+
+ // The COFF .debug$S section consists of several subsections, each starting
+ // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
+ // of the payload followed by the payload itself. The subsections are 4-byte
+ // aligned.
+
+ // Use the generic .debug$S section, and make a subsection for all the inlined
+ // subprograms.
+ switchToDebugSectionForSymbol(nullptr);
+ emitInlineeLinesSubsection();
+
+ // Emit per-function debug information.
+ for (auto &P : FnDebugInfo)
+ if (!P.first->isDeclarationForLinker())
+ emitDebugInfoForFunction(P.first, P.second);
+
+ // Emit global variable debug information.
+ setCurrentSubprogram(nullptr);
+ emitDebugInfoForGlobals();
+
+ // Emit retained types.
+ emitDebugInfoForRetainedTypes();
+
+ // Switch back to the generic .debug$S section after potentially processing
+ // comdat symbol sections.
+ switchToDebugSectionForSymbol(nullptr);
+
+ // Emit UDT records for any types used by global variables.
+ if (!GlobalUDTs.empty()) {
+ MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ emitDebugInfoForUDTs(GlobalUDTs);
+ endCVSubsection(SymbolsEnd);
+ }
+
+  // This subsection holds the mapping from file index to string table offset.
+ OS.AddComment("File index to string table offset subsection");
+ OS.EmitCVFileChecksumsDirective();
+
+ // This subsection holds the string table.
+ OS.AddComment("String table");
+ OS.EmitCVStringTableDirective();
+
+ // Emit type information last, so that any types we translate while emitting
+ // function info are included.
+ emitTypeInformation();
+
+ clear();
+}
+
+static void emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S) {
+ // Microsoft's linker seems to have trouble with symbol names longer than
+ // 0xffd8 bytes.
+ S = S.substr(0, 0xffd8);
+ SmallString<32> NullTerminatedString(S);
+ NullTerminatedString.push_back('\0');
+ OS.EmitBytes(NullTerminatedString);
+}
+
+void CodeViewDebug::emitTypeInformation() {
+ // Do nothing if we have no debug info or if no non-trivial types were emitted
+ // to TypeTable during codegen.
+ NamedMDNode *CU_Nodes = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes)
+ return;
+ if (TypeTable.empty())
+ return;
+
+ // Start the .debug$T section with 0x4.
+ OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection());
+ emitCodeViewMagicVersion();
+
+ SmallString<8> CommentPrefix;
+ if (OS.isVerboseAsm()) {
+ CommentPrefix += '\t';
+ CommentPrefix += Asm->MAI->getCommentString();
+ CommentPrefix += ' ';
+ }
+
+ CVTypeDumper CVTD(nullptr, /*PrintRecordBytes=*/false);
+ TypeTable.ForEachRecord(
+ [&](TypeIndex Index, StringRef Record) {
+ if (OS.isVerboseAsm()) {
+ // Emit a block comment describing the type record for readability.
+ SmallString<512> CommentBlock;
+ raw_svector_ostream CommentOS(CommentBlock);
+ ScopedPrinter SP(CommentOS);
+ SP.setPrefix(CommentPrefix);
+ CVTD.setPrinter(&SP);
+ Error E = CVTD.dump({Record.bytes_begin(), Record.bytes_end()});
+ if (E) {
+ logAllUnhandledErrors(std::move(E), errs(), "error: ");
+ llvm_unreachable("produced malformed type record");
+ }
+ // emitRawComment will insert its own tab and comment string before
+ // the first line, so strip off our first one. It also prints its own
+ // newline.
+ OS.emitRawComment(
+ CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
+ } else {
+#ifndef NDEBUG
+ // Assert that the type data is valid even if we aren't dumping
+ // comments. The MSVC linker doesn't do much type record validation,
+ // so the first link of an invalid type record can succeed while
+ // subsequent links will fail with LNK1285.
+ ByteStream<> Stream({Record.bytes_begin(), Record.bytes_end()});
+ CVTypeArray Types;
+ StreamReader Reader(Stream);
+ Error E = Reader.readArray(Types, Reader.getLength());
+ if (!E) {
+ TypeVisitorCallbacks C;
+ E = CVTypeVisitor(C).visitTypeStream(Types);
+ }
+ if (E) {
+ logAllUnhandledErrors(std::move(E), errs(), "error: ");
+ llvm_unreachable("produced malformed type record");
+ }
+#endif
+ }
+ OS.EmitBinaryData(Record);
+ });
+}
+
+void CodeViewDebug::emitInlineeLinesSubsection() {
+ if (InlinedSubprograms.empty())
+ return;
+
+ OS.AddComment("Inlinee lines subsection");
+ MCSymbol *InlineEnd = beginCVSubsection(ModuleSubstreamKind::InlineeLines);
+
+ // We don't provide any extra file info.
+ // FIXME: Find out if debuggers use this info.
+ OS.AddComment("Inlinee lines signature");
+ OS.EmitIntValue(unsigned(InlineeLinesSignature::Normal), 4);
+
+ for (const DISubprogram *SP : InlinedSubprograms) {
+ assert(TypeIndices.count({SP, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{SP, nullptr}];
+
+ OS.AddBlankLine();
+ unsigned FileId = maybeRecordFile(SP->getFile());
+ OS.AddComment("Inlined function " + SP->getDisplayName() + " starts at " +
+ SP->getFilename() + Twine(':') + Twine(SP->getLine()));
+ OS.AddBlankLine();
+ // The filechecksum table uses 8 byte entries for now, and file ids start at
+ // 1.
+ unsigned FileOffset = (FileId - 1) * 8;
+ OS.AddComment("Type index of inlined function");
+ OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+ OS.AddComment("Offset into filechecksum table");
+ OS.EmitIntValue(FileOffset, 4);
+ OS.AddComment("Starting line number");
+ OS.EmitIntValue(SP->getLine(), 4);
+ }
+
+ endCVSubsection(InlineEnd);
+}
+
+void CodeViewDebug::collectInlineSiteChildren(
+ SmallVectorImpl<unsigned> &Children, const FunctionInfo &FI,
+ const InlineSite &Site) {
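+  // Recursively gather the SiteFuncIds of every inline site nested directly
+  // or transitively under this one.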
+ for (const DILocation *ChildSiteLoc : Site.ChildSites) {
+ auto I = FI.InlineSites.find(ChildSiteLoc);
+ const InlineSite &ChildSite = I->second;
+ Children.push_back(ChildSite.SiteFuncId);
+ collectInlineSiteChildren(Children, FI, ChildSite);
+ }
+}
+
+void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
+ const DILocation *InlinedAt,
+ const InlineSite &Site) {
+ MCSymbol *InlineBegin = MMI->getContext().createTempSymbol(),
+ *InlineEnd = MMI->getContext().createTempSymbol();
+
+ assert(TypeIndices.count({Site.Inlinee, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{Site.Inlinee, nullptr}];
+
+ // SymbolRecord
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(InlineEnd, InlineBegin, 2); // RecordLength
+ OS.EmitLabel(InlineBegin);
+ OS.AddComment("Record kind: S_INLINESITE");
+ OS.EmitIntValue(SymbolKind::S_INLINESITE, 2); // RecordKind
+
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Inlinee type index");
+ OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+
+ unsigned FileId = maybeRecordFile(Site.Inlinee->getFile());
+ unsigned StartLineNum = Site.Inlinee->getLine();
+ SmallVector<unsigned, 3> SecondaryFuncIds;
+ collectInlineSiteChildren(SecondaryFuncIds, FI, Site);
+
+ OS.EmitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
+ FI.Begin, FI.End, SecondaryFuncIds);
+
+ OS.EmitLabel(InlineEnd);
+
+ emitLocalVariableList(Site.InlinedLocals);
+
+ // Recurse on child inlined call sites before closing the scope.
+ for (const DILocation *ChildSite : Site.ChildSites) {
+ auto I = FI.InlineSites.find(ChildSite);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, ChildSite, I->second);
+ }
+
+ // Close the scope.
+ OS.AddComment("Record length");
+ OS.EmitIntValue(2, 2); // RecordLength
+ OS.AddComment("Record kind: S_INLINESITE_END");
+ OS.EmitIntValue(SymbolKind::S_INLINESITE_END, 2); // RecordKind
+}
+
+void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
+ // If we have a symbol, it may be in a section that is COMDAT. If so, find the
+ // comdat key. A section may be comdat because of -ffunction-sections or
+ // because it is comdat in the IR.
+ MCSectionCOFF *GVSec =
+ GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr;
+ const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
+
+ MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
+ Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+ DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
+
+ OS.SwitchSection(DebugSec);
+
+ // Emit the magic version number if this is the first time we've switched to
+ // this section.
+ if (ComdatDebugSections.insert(DebugSec).second)
+ emitCodeViewMagicVersion();
+}
+
+void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
+ FunctionInfo &FI) {
+ // For each function there is a separate subsection
+ // which holds the PC to file:line table.
+ const MCSymbol *Fn = Asm->getSymbol(GV);
+ assert(Fn);
+
+  // Switch to a comdat section, if appropriate.
+ switchToDebugSectionForSymbol(Fn);
+
+ std::string FuncName;
+ auto *SP = GV->getSubprogram();
+ setCurrentSubprogram(SP);
+
+ // If we have a display name, build the fully qualified name by walking the
+ // chain of scopes.
+ if (SP != nullptr && !SP->getDisplayName().empty())
+ FuncName =
+ getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName());
+
+ // If our DISubprogram name is empty, use the mangled name.
+ if (FuncName.empty())
+ FuncName = GlobalValue::getRealLinkageName(GV->getName());
+
+ // Emit a symbol subsection, required by VS2012+ to find function boundaries.
+ OS.AddComment("Symbol subsection for " + Twine(FuncName));
+ MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ {
+ MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(),
+ *ProcRecordEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(ProcRecordEnd, ProcRecordBegin, 2);
+ OS.EmitLabel(ProcRecordBegin);
+
+ if (GV->hasLocalLinkage()) {
+ OS.AddComment("Record kind: S_LPROC32_ID");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LPROC32_ID), 2);
+ } else {
+ OS.AddComment("Record kind: S_GPROC32_ID");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GPROC32_ID), 2);
+ }
+
+ // These fields are filled in by tools like CVPACK which run after the fact.
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrNext");
+ OS.EmitIntValue(0, 4);
+ // This is the important bit that tells the debugger where the function
+    // code is located and what its size is:
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(FI.End, Fn, 4);
+ OS.AddComment("Offset after prologue");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Offset before epilogue");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Function type index");
+ OS.EmitIntValue(getFuncIdForSubprogram(GV->getSubprogram()).getIndex(), 4);
+ OS.AddComment("Function section relative address");
+ OS.EmitCOFFSecRel32(Fn);
+ OS.AddComment("Function section index");
+ OS.EmitCOFFSectionIndex(Fn);
+ OS.AddComment("Flags");
+ OS.EmitIntValue(0, 1);
+ // Emit the function display name as a null-terminated string.
+ OS.AddComment("Function name");
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, FuncName);
+ OS.EmitLabel(ProcRecordEnd);
+
+ emitLocalVariableList(FI.Locals);
+
+ // Emit inlined call site information. Only emit functions inlined directly
+ // into the parent function. We'll emit the other sites recursively as part
+ // of their parent inline site.
+ for (const DILocation *InlinedAt : FI.ChildSites) {
+ auto I = FI.InlineSites.find(InlinedAt);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, InlinedAt, I->second);
+ }
+
+ if (SP != nullptr)
+ emitDebugInfoForUDTs(LocalUDTs);
+
+ // We're done with this function.
+ OS.AddComment("Record length");
+ OS.EmitIntValue(0x0002, 2);
+ OS.AddComment("Record kind: S_PROC_ID_END");
+ OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2);
+ }
+ endCVSubsection(SymbolsEnd);
+
+ // We have an assembler directive that takes care of the whole line table.
+ OS.EmitCVLinetableDirective(FI.FuncId, Fn, FI.End);
+}
+
+CodeViewDebug::LocalVarDefRange
+CodeViewDebug::createDefRangeMem(uint16_t CVRegister, int Offset) {
+ LocalVarDefRange DR;
+ DR.InMemory = -1;
+ DR.DataOffset = Offset;
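+  // The assert catches offsets too large for DataOffset's storage, which
+  // would otherwise be silently truncated.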
+ assert(DR.DataOffset == Offset && "truncation");
+ DR.StructOffset = 0;
+ DR.CVRegister = CVRegister;
+ return DR;
+}
+
+CodeViewDebug::LocalVarDefRange
+CodeViewDebug::createDefRangeReg(uint16_t CVRegister) {
+ LocalVarDefRange DR;
+ DR.InMemory = 0;
+ DR.DataOffset = 0;
+ DR.StructOffset = 0;
+ DR.CVRegister = CVRegister;
+ return DR;
+}
+
+void CodeViewDebug::collectVariableInfoFromMMITable(
+ DenseSet<InlinedVariable> &Processed) {
+ const TargetSubtargetInfo &TSI = Asm->MF->getSubtarget();
+ const TargetFrameLowering *TFI = TSI.getFrameLowering();
+ const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
+
+ for (const MachineModuleInfo::VariableDbgInfo &VI :
+ MMI->getVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
+ "Expected inlined-at fields to agree");
+
+ Processed.insert(InlinedVariable(VI.Var, VI.Loc->getInlinedAt()));
+ LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
+
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ // Get the frame register used and the offset.
+ unsigned FrameReg = 0;
+ int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
+ uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
+
+ // Calculate the label ranges.
+ LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ for (const InsnRange &Range : Scope->getRanges()) {
+ const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
+ const MCSymbol *End = getLabelAfterInsn(Range.second);
+ End = End ? End : Asm->getFunctionEnd();
+ DefRange.Ranges.emplace_back(Begin, End);
+ }
+
+ LocalVariable Var;
+ Var.DIVar = VI.Var;
+ Var.DefRanges.emplace_back(std::move(DefRange));
+ recordLocalVariable(std::move(Var), VI.Loc->getInlinedAt());
+ }
+}
+
+void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
+ DenseSet<InlinedVariable> Processed;
+ // Grab the variable info that was squirreled away in the MMI side-table.
+ collectVariableInfoFromMMITable(Processed);
+
+ const TargetRegisterInfo *TRI = Asm->MF->getSubtarget().getRegisterInfo();
+
+ for (const auto &I : DbgValues) {
+ InlinedVariable IV = I.first;
+ if (Processed.count(IV))
+ continue;
+ const DILocalVariable *DIVar = IV.first;
+ const DILocation *InlinedAt = IV.second;
+
+ // Instruction ranges, specifying where IV is accessible.
+ const auto &Ranges = I.second;
+
+ LexicalScope *Scope = nullptr;
+ if (InlinedAt)
+ Scope = LScopes.findInlinedScope(DIVar->getScope(), InlinedAt);
+ else
+ Scope = LScopes.findLexicalScope(DIVar->getScope());
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ LocalVariable Var;
+ Var.DIVar = DIVar;
+
+ // Calculate the definition ranges.
+ for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
+ const InsnRange &Range = *I;
+ const MachineInstr *DVInst = Range.first;
+ assert(DVInst->isDebugValue() && "Invalid History entry");
+ const DIExpression *DIExpr = DVInst->getDebugExpression();
+
+ // Bail if there is a complex DWARF expression for now.
+ if (DIExpr && DIExpr->getNumElements() > 0)
+ continue;
+
+ // Bail if operand 0 is not a valid register. This means the variable is a
+ // simple constant, or is described by a complex expression.
+ // FIXME: Find a way to represent constant variables, since they are
+ // relatively common.
+ unsigned Reg =
+ DVInst->getOperand(0).isReg() ? DVInst->getOperand(0).getReg() : 0;
+ if (Reg == 0)
+ continue;
+
+      // Handle the two cases we support: indirect in memory and in register.
+ bool IsIndirect = DVInst->getOperand(1).isImm();
+ unsigned CVReg = TRI->getCodeViewRegNum(DVInst->getOperand(0).getReg());
+ {
+ LocalVarDefRange DefRange;
+ if (IsIndirect) {
+ int64_t Offset = DVInst->getOperand(1).getImm();
+ DefRange = createDefRangeMem(CVReg, Offset);
+ } else {
+ DefRange = createDefRangeReg(CVReg);
+ }
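+        // Only start a new def range when the location (register or memory
+        // offset) changed; otherwise the instruction ranges below are appended
+        // to the existing one.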
+ if (Var.DefRanges.empty() ||
+ Var.DefRanges.back().isDifferentLocation(DefRange)) {
+ Var.DefRanges.emplace_back(std::move(DefRange));
+ }
+ }
+
+ // Compute the label range.
+ const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
+ const MCSymbol *End = getLabelAfterInsn(Range.second);
+ if (!End) {
+ if (std::next(I) != E)
+ End = getLabelBeforeInsn(std::next(I)->first);
+ else
+ End = Asm->getFunctionEnd();
+ }
+
+ // If the last range end is our begin, just extend the last range.
+ // Otherwise make a new range.
+ SmallVectorImpl<std::pair<const MCSymbol *, const MCSymbol *>> &Ranges =
+ Var.DefRanges.back().Ranges;
+ if (!Ranges.empty() && Ranges.back().second == Begin)
+ Ranges.back().second = End;
+ else
+ Ranges.emplace_back(Begin, End);
+
+ // FIXME: Do more range combining.
+ }
+
+ recordLocalVariable(std::move(Var), InlinedAt);
+ }
+}
+
+void CodeViewDebug::beginFunction(const MachineFunction *MF) {
+ assert(!CurFn && "Can't process two functions at once!");
+
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ DebugHandlerBase::beginFunction(MF);
+
+ const Function *GV = MF->getFunction();
+ assert(FnDebugInfo.count(GV) == false);
+ CurFn = &FnDebugInfo[GV];
+ CurFn->FuncId = NextFuncId++;
+ CurFn->Begin = Asm->getFunctionBegin();
+
+ // Find the end of the function prolog. First known non-DBG_VALUE and
+ // non-frame setup location marks the beginning of the function body.
+  // FIXME: is there a simpler way to do this? Can we just search
+ // for the first instruction of the function, not the last of the prolog?
+ DebugLoc PrologEndLoc;
+ bool EmptyPrologue = true;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ MI.getDebugLoc()) {
+ PrologEndLoc = MI.getDebugLoc();
+ break;
+ } else if (!MI.isDebugValue()) {
+ EmptyPrologue = false;
+ }
+ }
+ }
+
+ // Record beginning of function if we have a non-empty prologue.
+ if (PrologEndLoc && !EmptyPrologue) {
+ DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc();
+ maybeRecordLocation(FnStartDL, MF);
+ }
+}
+
+void CodeViewDebug::addToUDTs(const DIType *Ty, TypeIndex TI) {
+ // Don't record empty UDTs.
+ if (Ty->getName().empty())
+ return;
+
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ const DISubprogram *ClosestSubprogram = getQualifiedNameComponents(
+ Ty->getScope().resolve(), QualifiedNameComponents);
+
+ std::string FullyQualifiedName =
+ getQualifiedName(QualifiedNameComponents, getPrettyScopeName(Ty));
+
+ if (ClosestSubprogram == nullptr)
+ GlobalUDTs.emplace_back(std::move(FullyQualifiedName), TI);
+ else if (ClosestSubprogram == CurrentSubprogram)
+ LocalUDTs.emplace_back(std::move(FullyQualifiedName), TI);
+
+  // TODO: What if the ClosestSubprogram is neither null nor the current
+  // subprogram? Currently, the UDT just gets dropped on the floor.
+  //
+  // The current behavior is not desirable. To get maximal fidelity, we would
+  // need to perform all type translation before beginning emission of .debug$S
+  // and then make LocalUDTs a member of FunctionInfo.
+}
+
+TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
+ // Generic dispatch for lowering an unknown type.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_array_type:
+ return lowerTypeArray(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_typedef:
+ return lowerTypeAlias(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_base_type:
+ return lowerTypeBasic(cast<DIBasicType>(Ty));
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
+ return lowerTypePointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_ptr_to_member_type:
+ return lowerTypeMemberPointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ return lowerTypeModifier(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_subroutine_type:
+ if (ClassTy) {
+ // The member function type of a member function pointer has no
+ // ThisAdjustment.
+ return lowerTypeMemberFunction(cast<DISubroutineType>(Ty), ClassTy,
+ /*ThisAdjustment=*/0);
+ }
+ return lowerTypeFunction(cast<DISubroutineType>(Ty));
+ case dwarf::DW_TAG_enumeration_type:
+ return lowerTypeEnum(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ return lowerTypeClass(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_union_type:
+ return lowerTypeUnion(cast<DICompositeType>(Ty));
+ default:
+ // Use the null type index.
+ return TypeIndex();
+ }
+}
+
+TypeIndex CodeViewDebug::lowerTypeAlias(const DIDerivedType *Ty) {
+ DITypeRef UnderlyingTypeRef = Ty->getBaseType();
+ TypeIndex UnderlyingTypeIndex = getTypeIndex(UnderlyingTypeRef);
+ StringRef TypeName = Ty->getName();
+
+ addToUDTs(Ty, UnderlyingTypeIndex);
+
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::Int32Long) &&
+ TypeName == "HRESULT")
+ return TypeIndex(SimpleTypeKind::HResult);
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::UInt16Short) &&
+ TypeName == "wchar_t")
+ return TypeIndex(SimpleTypeKind::WideCharacter);
+
+ return UnderlyingTypeIndex;
+}
+
+TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
+ DITypeRef ElementTypeRef = Ty->getBaseType();
+ TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef);
+ // IndexType is size_t, which depends on the bitness of the target.
+ TypeIndex IndexType = Asm->MAI->getPointerSize() == 8
+ ? TypeIndex(SimpleTypeKind::UInt64Quad)
+ : TypeIndex(SimpleTypeKind::UInt32Long);
+
+ uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8;
+
+ bool UndefinedSubrange = false;
+
+ // FIXME:
+ // There is a bug in the front-end where an array of a structure, which was
+ // declared as incomplete structure first, ends up not getting a size assigned
+ // to it. (PR28303)
+ // Example:
+ // struct A(*p)[3];
+ // struct A { int f; } a[3];
+ //
+ // This needs to be fixed in the front-end, but in the meantime we don't want
+ // to trigger an assertion because of this.
+ if (Ty->getSizeInBits() == 0) {
+ UndefinedSubrange = true;
+ }
+
+ // Add subranges to array type.
+ DINodeArray Elements = Ty->getElements();
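+  // Subranges are listed outermost dimension first, so iterate them in
+  // reverse to build nested ArrayRecords from the innermost dimension outward;
+  // only the outermost record (i == 0) carries the array's name.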
+ for (int i = Elements.size() - 1; i >= 0; --i) {
+ const DINode *Element = Elements[i];
+ assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
+
+ const DISubrange *Subrange = cast<DISubrange>(Element);
+ assert(Subrange->getLowerBound() == 0 &&
+ "codeview doesn't support subranges with lower bounds");
+ int64_t Count = Subrange->getCount();
+
+ // Variable Length Array (VLA) has Count equal to '-1'.
+ // Replace with Count '1', assume it is the minimum VLA length.
+ // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU.
+ if (Count == -1) {
+ Count = 1;
+ UndefinedSubrange = true;
+ }
+
+ StringRef Name = (i == 0) ? Ty->getName() : "";
+ // Update the element size and element type index for subsequent subranges.
+ ElementSize *= Count;
+ ElementTypeIndex = TypeTable.writeArray(
+ ArrayRecord(ElementTypeIndex, IndexType, ElementSize, Name));
+ }
+
+ (void)UndefinedSubrange;
+ assert(UndefinedSubrange || ElementSize == (Ty->getSizeInBits() / 8));
+
+ return ElementTypeIndex;
+}
+
+TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
+ TypeIndex Index;
+ dwarf::TypeKind Kind;
+ uint32_t ByteSize;
+
+ Kind = static_cast<dwarf::TypeKind>(Ty->getEncoding());
+ ByteSize = Ty->getSizeInBits() / 8;
+
+ SimpleTypeKind STK = SimpleTypeKind::None;
+ switch (Kind) {
+ case dwarf::DW_ATE_address:
+ // FIXME: Translate
+ break;
+ case dwarf::DW_ATE_boolean:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Boolean8; break;
+ case 2: STK = SimpleTypeKind::Boolean16; break;
+ case 4: STK = SimpleTypeKind::Boolean32; break;
+ case 8: STK = SimpleTypeKind::Boolean64; break;
+ case 16: STK = SimpleTypeKind::Boolean128; break;
+ }
+ break;
+ case dwarf::DW_ATE_complex_float:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Complex16; break;
+ case 4: STK = SimpleTypeKind::Complex32; break;
+ case 8: STK = SimpleTypeKind::Complex64; break;
+ case 10: STK = SimpleTypeKind::Complex80; break;
+ case 16: STK = SimpleTypeKind::Complex128; break;
+ }
+ break;
+ case dwarf::DW_ATE_float:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Float16; break;
+ case 4: STK = SimpleTypeKind::Float32; break;
+ case 6: STK = SimpleTypeKind::Float48; break;
+ case 8: STK = SimpleTypeKind::Float64; break;
+ case 10: STK = SimpleTypeKind::Float80; break;
+ case 16: STK = SimpleTypeKind::Float128; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::SByte; break;
+ case 2: STK = SimpleTypeKind::Int16Short; break;
+ case 4: STK = SimpleTypeKind::Int32; break;
+ case 8: STK = SimpleTypeKind::Int64Quad; break;
+ case 16: STK = SimpleTypeKind::Int128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_unsigned:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Byte; break;
+ case 2: STK = SimpleTypeKind::UInt16Short; break;
+ case 4: STK = SimpleTypeKind::UInt32; break;
+ case 8: STK = SimpleTypeKind::UInt64Quad; break;
+ case 16: STK = SimpleTypeKind::UInt128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_UTF:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Character16; break;
+ case 4: STK = SimpleTypeKind::Character32; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::SignedCharacter;
+ break;
+ case dwarf::DW_ATE_unsigned_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::UnsignedCharacter;
+ break;
+ default:
+ break;
+ }
+
+ // Apply some fixups based on the source-level type name.
+ if (STK == SimpleTypeKind::Int32 && Ty->getName() == "long int")
+ STK = SimpleTypeKind::Int32Long;
+ if (STK == SimpleTypeKind::UInt32 && Ty->getName() == "long unsigned int")
+ STK = SimpleTypeKind::UInt32Long;
+ if (STK == SimpleTypeKind::UInt16Short &&
+ (Ty->getName() == "wchar_t" || Ty->getName() == "__wchar_t"))
+ STK = SimpleTypeKind::WideCharacter;
+ if ((STK == SimpleTypeKind::SignedCharacter ||
+ STK == SimpleTypeKind::UnsignedCharacter) &&
+ Ty->getName() == "char")
+ STK = SimpleTypeKind::NarrowCharacter;
+
+ return TypeIndex(STK);
+}
+
+TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty) {
+ TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType());
+
+ // While processing the type being pointed to it is possible we already
+ // created this pointer type. If so, we check here and return the existing
+ // pointer type.
+ auto I = TypeIndices.find({Ty, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Pointers to simple types can use SimpleTypeMode, rather than having a
+ // dedicated pointer type record.
+ if (PointeeTI.isSimple() &&
+ PointeeTI.getSimpleMode() == SimpleTypeMode::Direct &&
+ Ty->getTag() == dwarf::DW_TAG_pointer_type) {
+ SimpleTypeMode Mode = Ty->getSizeInBits() == 64
+ ? SimpleTypeMode::NearPointer64
+ : SimpleTypeMode::NearPointer32;
+ return TypeIndex(PointeeTI.getSimpleKind(), Mode);
+ }
+
+ PointerKind PK =
+ Ty->getSizeInBits() == 64 ? PointerKind::Near64 : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ switch (Ty->getTag()) {
+ default: llvm_unreachable("not a pointer tag type");
+ case dwarf::DW_TAG_pointer_type:
+ PM = PointerMode::Pointer;
+ break;
+ case dwarf::DW_TAG_reference_type:
+ PM = PointerMode::LValueReference;
+ break;
+ case dwarf::DW_TAG_rvalue_reference_type:
+ PM = PointerMode::RValueReference;
+ break;
+ }
+  // FIXME: MSVC folds qualifiers into PointerOptions in the context of a
+  // method's 'this' pointer, but not in normal contexts. Figure out what we're
+  // supposed to do.
+ PointerOptions PO = PointerOptions::None;
+ PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8);
+ return TypeTable.writePointer(PR);
+}
+
+static PointerToMemberRepresentation
+translatePtrToMemberRep(unsigned SizeInBytes, bool IsPMF, unsigned Flags) {
+ // SizeInBytes being zero generally implies that the member pointer type was
+ // incomplete, which can happen if it is part of a function prototype. In this
+ // case, use the unknown model instead of the general model.
+ if (IsPMF) {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralFunction;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceFunction;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceFunction;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceFunction;
+ }
+ } else {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralData;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceData;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceData;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceData;
+ }
+ }
+ llvm_unreachable("invalid ptr to member representation");
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) {
+ assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
+ TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
+ TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType());
+ PointerKind PK = Asm->MAI->getPointerSize() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
+ PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
+ : PointerMode::PointerToDataMember;
+ PointerOptions PO = PointerOptions::None; // FIXME
+ assert(Ty->getSizeInBits() / 8 <= 0xff && "pointer size too big");
+ uint8_t SizeInBytes = Ty->getSizeInBits() / 8;
+ MemberPointerInfo MPI(
+ ClassTI, translatePtrToMemberRep(SizeInBytes, IsPMF, Ty->getFlags()));
+ PointerRecord PR(PointeeTI, PK, PM, PO, SizeInBytes, MPI);
+ return TypeTable.writePointer(PR);
+}
+
+/// Given a DWARF calling convention, get the CodeView equivalent. If we don't
+/// have a translation, use the NearC convention.
+static CallingConvention dwarfCCToCodeView(unsigned DwarfCC) {
+ switch (DwarfCC) {
+ case dwarf::DW_CC_normal: return CallingConvention::NearC;
+ case dwarf::DW_CC_BORLAND_msfastcall: return CallingConvention::NearFast;
+ case dwarf::DW_CC_BORLAND_thiscall: return CallingConvention::ThisCall;
+ case dwarf::DW_CC_BORLAND_stdcall: return CallingConvention::NearStdCall;
+ case dwarf::DW_CC_BORLAND_pascal: return CallingConvention::NearPascal;
+ case dwarf::DW_CC_LLVM_vectorcall: return CallingConvention::NearVector;
+ }
+ return CallingConvention::NearC;
+}
+
+TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
+ ModifierOptions Mods = ModifierOptions::None;
+ bool IsModifier = true;
+ const DIType *BaseTy = Ty;
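+  // Peel off consecutive const/volatile wrappers, folding them into a single
+  // ModifierRecord that is applied to the first non-modifier base type.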
+ while (IsModifier && BaseTy) {
+ // FIXME: Need to add DWARF tag for __unaligned.
+ switch (BaseTy->getTag()) {
+ case dwarf::DW_TAG_const_type:
+ Mods |= ModifierOptions::Const;
+ break;
+ case dwarf::DW_TAG_volatile_type:
+ Mods |= ModifierOptions::Volatile;
+ break;
+ default:
+ IsModifier = false;
+ break;
+ }
+ if (IsModifier)
+ BaseTy = cast<DIDerivedType>(BaseTy)->getBaseType().resolve();
+ }
+ TypeIndex ModifiedTI = getTypeIndex(BaseTy);
+
+ // While processing the type being pointed to, it is possible we already
+ // created this modifier type. If so, we check here and return the existing
+ // modifier type.
+ auto I = TypeIndices.find({Ty, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ ModifierRecord MR(ModifiedTI, Mods);
+ return TypeTable.writeModifier(MR);
+}
+
+TypeIndex CodeViewDebug::lowerTypeFunction(const DISubroutineType *Ty) {
+ SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
+ for (DITypeRef ArgTypeRef : Ty->getTypeArray())
+ ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+
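+  // The first entry of the subroutine type array is the return type; the
+  // remaining entries are the parameter types.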
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ ArrayRef<TypeIndex> ArgTypeIndices = None;
+ if (!ReturnAndArgTypeIndices.empty()) {
+ auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices);
+ ReturnTypeIndex = ReturnAndArgTypesRef.front();
+ ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
+ }
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeArgList(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ ProcedureRecord Procedure(ReturnTypeIndex, CC, FunctionOptions::None,
+ ArgTypeIndices.size(), ArgListIndex);
+ return TypeTable.writeProcedure(Procedure);
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
+ const DIType *ClassTy,
+ int ThisAdjustment) {
+ // Lower the containing class type.
+ TypeIndex ClassType = getTypeIndex(ClassTy);
+
+ SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
+ for (DITypeRef ArgTypeRef : Ty->getTypeArray())
+ ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ ArrayRef<TypeIndex> ArgTypeIndices = None;
+ if (!ReturnAndArgTypeIndices.empty()) {
+ auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices);
+ ReturnTypeIndex = ReturnAndArgTypesRef.front();
+ ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
+ }
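+  // For a member function the first formal parameter is the implicit 'this'
+  // pointer; split it off so it is not counted among the ordinary arguments.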
+ TypeIndex ThisTypeIndex = TypeIndex::Void();
+ if (!ArgTypeIndices.empty()) {
+ ThisTypeIndex = ArgTypeIndices.front();
+ ArgTypeIndices = ArgTypeIndices.drop_front();
+ }
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeArgList(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ // TODO: Need to use the correct values for:
+ // FunctionOptions
+ // ThisPointerAdjustment.
+ TypeIndex TI = TypeTable.writeMemberFunction(MemberFunctionRecord(
+ ReturnTypeIndex, ClassType, ThisTypeIndex, CC, FunctionOptions::None,
+ ArgTypeIndices.size(), ArgListIndex, ThisAdjustment));
+
+ return TI;
+}
+
+static MemberAccess translateAccessFlags(unsigned RecordTag, unsigned Flags) {
+ switch (Flags & DINode::FlagAccessibility) {
+ case DINode::FlagPrivate: return MemberAccess::Private;
+ case DINode::FlagPublic: return MemberAccess::Public;
+ case DINode::FlagProtected: return MemberAccess::Protected;
+ case 0:
+ // If there was no explicit access control, provide the default for the tag.
+ return RecordTag == dwarf::DW_TAG_class_type ? MemberAccess::Private
+ : MemberAccess::Public;
+ }
+ llvm_unreachable("access flags are exclusive");
+}
+
+static MethodOptions translateMethodOptionFlags(const DISubprogram *SP) {
+ if (SP->isArtificial())
+ return MethodOptions::CompilerGenerated;
+
+ // FIXME: Handle other MethodOptions.
+
+ return MethodOptions::None;
+}
+
+static MethodKind translateMethodKindFlags(const DISubprogram *SP,
+ bool Introduced) {
+ switch (SP->getVirtuality()) {
+ case dwarf::DW_VIRTUALITY_none:
+ break;
+ case dwarf::DW_VIRTUALITY_virtual:
+ return Introduced ? MethodKind::IntroducingVirtual : MethodKind::Virtual;
+ case dwarf::DW_VIRTUALITY_pure_virtual:
+ return Introduced ? MethodKind::PureIntroducingVirtual
+ : MethodKind::PureVirtual;
+ default:
+ llvm_unreachable("unhandled virtuality case");
+ }
+
+ // FIXME: Get Clang to mark DISubprogram as static and do something with it.
+
+ return MethodKind::Vanilla;
+}
+
+static TypeRecordKind getRecordKind(const DICompositeType *Ty) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type: return TypeRecordKind::Class;
+ case dwarf::DW_TAG_structure_type: return TypeRecordKind::Struct;
+ }
+ llvm_unreachable("unexpected tag");
+}
+
+/// Return ClassOptions that should be present on both the forward declaration
+/// and the definition of a tag type.
+static ClassOptions getCommonClassOptions(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::None;
+
+ // MSVC always sets this flag, even for local types. Clang doesn't always
+ // appear to give every type a linkage name, which may be problematic for us.
+ // FIXME: Investigate the consequences of not following them here.
+ if (!Ty->getIdentifier().empty())
+ CO |= ClassOptions::HasUniqueName;
+
+ // Put the Nested flag on a type if it appears immediately inside a tag type.
+ // Do not walk the scope chain. Do not attempt to compute ContainsNestedClass
+ // here. That flag is only set on definitions, and not forward declarations.
+ const DIScope *ImmediateScope = Ty->getScope().resolve();
+ if (ImmediateScope && isa<DICompositeType>(ImmediateScope))
+ CO |= ClassOptions::Nested;
+
+ // Put the Scoped flag on function-local types.
+ for (const DIScope *Scope = ImmediateScope; Scope != nullptr;
+ Scope = Scope->getScope().resolve()) {
+ if (isa<DISubprogram>(Scope)) {
+ CO |= ClassOptions::Scoped;
+ break;
+ }
+ }
+
+ return CO;
+}
+
+TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FTI;
+ unsigned EnumeratorCount = 0;
+
+ if (Ty->isForwardDecl()) {
+ CO |= ClassOptions::ForwardReference;
+ } else {
+ FieldListRecordBuilder Fields;
+ for (const DINode *Element : Ty->getElements()) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (auto *Enumerator = dyn_cast_or_null<DIEnumerator>(Element)) {
+ Fields.writeEnumerator(EnumeratorRecord(
+ MemberAccess::Public, APSInt::getUnsigned(Enumerator->getValue()),
+ Enumerator->getName()));
+ EnumeratorCount++;
+ }
+ }
+ FTI = TypeTable.writeFieldList(Fields);
+ }
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ return TypeTable.writeEnum(EnumRecord(EnumeratorCount, CO, FTI, FullName,
+ Ty->getIdentifier(),
+ getTypeIndex(Ty->getBaseType())));
+}
+
+//===----------------------------------------------------------------------===//
+// ClassInfo
+//===----------------------------------------------------------------------===//
+
+struct llvm::ClassInfo {
+ struct MemberInfo {
+ const DIDerivedType *MemberTypeNode;
+ uint64_t BaseOffset;
+ };
+ // [MemberInfo]
+ typedef std::vector<MemberInfo> MemberList;
+
+ typedef TinyPtrVector<const DISubprogram *> MethodsList;
+ // MethodName -> MethodsList
+ typedef MapVector<MDString *, MethodsList> MethodsMap;
+
+ /// Base classes.
+ std::vector<const DIDerivedType *> Inheritance;
+
+ /// Direct members.
+ MemberList Members;
+ // Direct overloaded methods gathered by name.
+ MethodsMap Methods;
+
+ std::vector<const DICompositeType *> NestedClasses;
+};
+
+void CodeViewDebug::clear() {
+ assert(CurFn == nullptr);
+ FileIdMap.clear();
+ FnDebugInfo.clear();
+ FileToFilepathMap.clear();
+ LocalUDTs.clear();
+ GlobalUDTs.clear();
+ TypeIndices.clear();
+ CompleteTypeIndices.clear();
+}
+
+void CodeViewDebug::collectMemberInfo(ClassInfo &Info,
+ const DIDerivedType *DDTy) {
+ if (!DDTy->getName().empty()) {
+ Info.Members.push_back({DDTy, 0});
+ return;
+ }
+ // An unnamed member must represent a nested struct or union. Add all the
+ // indirect fields to the current record.
+ assert((DDTy->getOffsetInBits() % 8) == 0 && "Unnamed bitfield member!");
+ uint64_t Offset = DDTy->getOffsetInBits();
+ const DIType *Ty = DDTy->getBaseType().resolve();
+ const DICompositeType *DCTy = cast<DICompositeType>(Ty);
+ ClassInfo NestedInfo = collectClassInfo(DCTy);
+ for (const ClassInfo::MemberInfo &IndirectField : NestedInfo.Members)
+ Info.Members.push_back(
+ {IndirectField.MemberTypeNode, IndirectField.BaseOffset + Offset});
+}
+
+ClassInfo CodeViewDebug::collectClassInfo(const DICompositeType *Ty) {
+ ClassInfo Info;
+ // Add elements to structure type.
+ DINodeArray Elements = Ty->getElements();
+ for (auto *Element : Elements) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (!Element)
+ continue;
+ if (auto *SP = dyn_cast<DISubprogram>(Element)) {
+ Info.Methods[SP->getRawName()].push_back(SP);
+ } else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
+ if (DDTy->getTag() == dwarf::DW_TAG_member) {
+ collectMemberInfo(Info, DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_inheritance) {
+ Info.Inheritance.push_back(DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_friend) {
+ // Ignore friend members. It appears that MSVC emitted info about
+ // friends in the past, but modern versions do not.
+ }
+ // FIXME: Get Clang to emit function virtual table here and handle it.
+ } else if (auto *Composite = dyn_cast<DICompositeType>(Element)) {
+ Info.NestedClasses.push_back(Composite);
+ }
+ // Skip other unrecognized kinds of elements.
+ }
+ return Info;
+}
+
+TypeIndex CodeViewDebug::lowerTypeClass(const DICompositeType *Ty) {
+ // First, construct the forward decl. Don't look into Ty to compute the
+ // forward decl options, since it might not be available in all TUs.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ TypeIndex FwdDeclTI = TypeTable.writeClass(ClassRecord(
+ Kind, 0, CO, HfaKind::None, WindowsRTClassKind::None, TypeIndex(),
+ TypeIndex(), TypeIndex(), 0, FullName, Ty->getIdentifier()));
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
+ // Construct the field list and complete type record.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ TypeIndex VShapeTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, VShapeTI, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+
+ TypeIndex ClassTI = TypeTable.writeClass(ClassRecord(
+ Kind, FieldCount, CO, HfaKind::None, WindowsRTClassKind::None, FieldTI,
+ TypeIndex(), VShapeTI, SizeInBytes, FullName, Ty->getIdentifier()));
+
+ TypeTable.writeUdtSourceLine(UdtSourceLineRecord(
+ ClassTI, TypeTable.writeStringId(StringIdRecord(
+ TypeIndex(0x0), getFullFilepath(Ty->getFile()))),
+ Ty->getLine()));
+
+ addToUDTs(Ty, ClassTI);
+
+ return ClassTI;
+}
+
+TypeIndex CodeViewDebug::lowerTypeUnion(const DICompositeType *Ty) {
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ TypeIndex FwdDeclTI =
+ TypeTable.writeUnion(UnionRecord(0, CO, HfaKind::None, TypeIndex(), 0,
+ FullName, Ty->getIdentifier()));
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeUnion(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::Sealed | getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, std::ignore, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ TypeIndex UnionTI = TypeTable.writeUnion(
+ UnionRecord(FieldCount, CO, HfaKind::None, FieldTI, SizeInBytes, FullName,
+ Ty->getIdentifier()));
+
+ TypeTable.writeUdtSourceLine(UdtSourceLineRecord(
+ UnionTI, TypeTable.writeStringId(StringIdRecord(
+ TypeIndex(0x0), getFullFilepath(Ty->getFile()))),
+ Ty->getLine()));
+
+ addToUDTs(Ty, UnionTI);
+
+ return UnionTI;
+}
+
+std::tuple<TypeIndex, TypeIndex, unsigned, bool>
+CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
+ // Manually count members. MSVC appears to count everything that generates a
+ // field list record. Each individual overload in a method overload group
+ // contributes to this count, even though the overload group is a single field
+ // list record.
+ unsigned MemberCount = 0;
+ ClassInfo Info = collectClassInfo(Ty);
+ FieldListRecordBuilder Fields;
+
+ // Create base classes.
+ for (const DIDerivedType *I : Info.Inheritance) {
+ if (I->getFlags() & DINode::FlagVirtual) {
+ // Virtual base.
+ // FIXME: Emit VBPtrOffset when the frontend provides it.
+ unsigned VBPtrOffset = 0;
+ // FIXME: Despite the accessor name, the offset is really in bytes.
+ unsigned VBTableIndex = I->getOffsetInBits() / 4;
+ Fields.writeVirtualBaseClass(VirtualBaseClassRecord(
+ translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()), getVBPTypeIndex(), VBPtrOffset,
+ VBTableIndex));
+ } else {
+ assert(I->getOffsetInBits() % 8 == 0 &&
+ "bases must be on byte boundaries");
+ Fields.writeBaseClass(BaseClassRecord(
+ translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()), I->getOffsetInBits() / 8));
+ }
+ }
+
+ // Create members.
+ for (ClassInfo::MemberInfo &MemberInfo : Info.Members) {
+ const DIDerivedType *Member = MemberInfo.MemberTypeNode;
+ TypeIndex MemberBaseType = getTypeIndex(Member->getBaseType());
+ StringRef MemberName = Member->getName();
+ MemberAccess Access =
+ translateAccessFlags(Ty->getTag(), Member->getFlags());
+
+ if (Member->isStaticMember()) {
+ Fields.writeStaticDataMember(
+ StaticDataMemberRecord(Access, MemberBaseType, MemberName));
+ MemberCount++;
+ continue;
+ }
+
+ // Data member.
+ uint64_t MemberOffsetInBits =
+ Member->getOffsetInBits() + MemberInfo.BaseOffset;
+ if (Member->isBitField()) {
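+      // getOffsetInBits() gives the bitfield's absolute bit offset; the
+      // storage offset, when present, locates its storage unit, so the
+      // difference is the field's starting bit within that unit.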
+ uint64_t StartBitOffset = MemberOffsetInBits;
+ if (const auto *CI =
+ dyn_cast_or_null<ConstantInt>(Member->getStorageOffsetInBits())) {
+ MemberOffsetInBits = CI->getZExtValue() + MemberInfo.BaseOffset;
+ }
+ StartBitOffset -= MemberOffsetInBits;
+ MemberBaseType = TypeTable.writeBitField(BitFieldRecord(
+ MemberBaseType, Member->getSizeInBits(), StartBitOffset));
+ }
+ uint64_t MemberOffsetInBytes = MemberOffsetInBits / 8;
+ Fields.writeDataMember(DataMemberRecord(Access, MemberBaseType,
+ MemberOffsetInBytes, MemberName));
+ MemberCount++;
+ }
+
+  // Create methods.
+ for (auto &MethodItr : Info.Methods) {
+ StringRef Name = MethodItr.first->getString();
+
+ std::vector<OneMethodRecord> Methods;
+ for (const DISubprogram *SP : MethodItr.second) {
+ TypeIndex MethodType = getMemberFunctionType(SP, Ty);
+ bool Introduced = SP->getFlags() & DINode::FlagIntroducedVirtual;
+
+ unsigned VFTableOffset = -1;
+ if (Introduced)
+ VFTableOffset = SP->getVirtualIndex() * getPointerSizeInBytes();
+
+ Methods.push_back(
+ OneMethodRecord(MethodType, translateMethodKindFlags(SP, Introduced),
+ translateMethodOptionFlags(SP),
+ translateAccessFlags(Ty->getTag(), SP->getFlags()),
+ VFTableOffset, Name));
+ MemberCount++;
+ }
+ assert(Methods.size() > 0 && "Empty methods map entry");
+ if (Methods.size() == 1)
+ Fields.writeOneMethod(Methods[0]);
+ else {
+ TypeIndex MethodList =
+ TypeTable.writeMethodOverloadList(MethodOverloadListRecord(Methods));
+ Fields.writeOverloadedMethod(
+ OverloadedMethodRecord(Methods.size(), MethodList, Name));
+ }
+ }
+
+ // Create nested classes.
+ for (const DICompositeType *Nested : Info.NestedClasses) {
+ NestedTypeRecord R(getTypeIndex(DITypeRef(Nested)), Nested->getName());
+ Fields.writeNestedType(R);
+ MemberCount++;
+ }
+
+ TypeIndex FieldTI = TypeTable.writeFieldList(Fields);
+ return std::make_tuple(FieldTI, TypeIndex(), MemberCount,
+ !Info.NestedClasses.empty());
+}
+
+TypeIndex CodeViewDebug::getVBPTypeIndex() {
+ if (!VBPType.getIndex()) {
+ // Make a 'const int *' type.
+ ModifierRecord MR(TypeIndex::Int32(), ModifierOptions::Const);
+ TypeIndex ModifiedTI = TypeTable.writeModifier(MR);
+
+ PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ PointerOptions PO = PointerOptions::None;
+ PointerRecord PR(ModifiedTI, PK, PM, PO, getPointerSizeInBytes());
+
+ VBPType = TypeTable.writePointer(PR);
+ }
+
+ return VBPType;
+}
+
+TypeIndex CodeViewDebug::getTypeIndex(DITypeRef TypeRef, DITypeRef ClassTyRef) {
+ const DIType *Ty = TypeRef.resolve();
+ const DIType *ClassTy = ClassTyRef.resolve();
+
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // Check if we've already translated this type. Don't try to do a
+ // get-or-create style insertion that caches the hash lookup across the
+ // lowerType call. It will update the TypeIndices map.
+ auto I = TypeIndices.find({Ty, ClassTy});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ TypeLoweringScope S(*this);
+ TypeIndex TI = lowerType(Ty, ClassTy);
+ return recordTypeIndexForDINode(Ty, TI, ClassTy);
+}
+
+TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
+ const DIType *Ty = TypeRef.resolve();
+
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // If this is a non-record type, the complete type index is the same as the
+ // normal type index. Just call getTypeIndex.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ break;
+ default:
+ return getTypeIndex(Ty);
+ }
+
+ // Check if we've already translated the complete record type. Lowering a
+ // complete type should never trigger lowering another complete type, so we
+ // can reuse the hash table lookup result.
+ const auto *CTy = cast<DICompositeType>(Ty);
+ auto InsertResult = CompleteTypeIndices.insert({CTy, TypeIndex()});
+ if (!InsertResult.second)
+ return InsertResult.first->second;
+
+ TypeLoweringScope S(*this);
+
+ // Make sure the forward declaration is emitted first. It's unclear if this
+ // is necessary, but MSVC does it, and we should follow suit until we can show
+ // otherwise.
+ TypeIndex FwdDeclTI = getTypeIndex(CTy);
+
+ // Just use the forward decl if we don't have complete type info. This might
+ // happen if the frontend is using modules and expects the complete definition
+ // to be emitted elsewhere.
+ if (CTy->isForwardDecl())
+ return FwdDeclTI;
+
+ TypeIndex TI;
+ switch (CTy->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ TI = lowerCompleteTypeClass(CTy);
+ break;
+ case dwarf::DW_TAG_union_type:
+ TI = lowerCompleteTypeUnion(CTy);
+ break;
+ default:
+ llvm_unreachable("not a record");
+ }
+
+ InsertResult.first->second = TI;
+ return TI;
+}
+
+/// Emit all the deferred complete record types. Try to do this in FIFO order,
+/// and do this until fixpoint, as each complete record type typically
+/// references many other record types.
+void CodeViewDebug::emitDeferredCompleteTypes() {
+ SmallVector<const DICompositeType *, 4> TypesToEmit;
+ while (!DeferredCompleteTypes.empty()) {
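+    // Swap out the current batch so that any types deferred while emitting it
+    // accumulate in DeferredCompleteTypes and get handled on the next pass.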
+ std::swap(DeferredCompleteTypes, TypesToEmit);
+ for (const DICompositeType *RecordTy : TypesToEmit)
+ getCompleteTypeIndex(RecordTy);
+ TypesToEmit.clear();
+ }
+}
+
+void CodeViewDebug::emitLocalVariableList(ArrayRef<LocalVariable> Locals) {
+ // Get the sorted list of parameters and emit them first.
+ SmallVector<const LocalVariable *, 6> Params;
+ for (const LocalVariable &L : Locals)
+ if (L.DIVar->isParameter())
+ Params.push_back(&L);
+ std::sort(Params.begin(), Params.end(),
+ [](const LocalVariable *L, const LocalVariable *R) {
+ return L->DIVar->getArg() < R->DIVar->getArg();
+ });
+ for (const LocalVariable *L : Params)
+ emitLocalVariable(*L);
+
+ // Next emit all non-parameters in the order that we found them.
+ for (const LocalVariable &L : Locals)
+ if (!L.DIVar->isParameter())
+ emitLocalVariable(L);
+}
+
+void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
+ // LocalSym record, see SymbolRecord.h for more info.
+ MCSymbol *LocalBegin = MMI->getContext().createTempSymbol(),
+ *LocalEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(LocalEnd, LocalBegin, 2);
+ OS.EmitLabel(LocalBegin);
+
+ OS.AddComment("Record kind: S_LOCAL");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LOCAL), 2);
+
+ LocalSymFlags Flags = LocalSymFlags::None;
+ if (Var.DIVar->isParameter())
+ Flags |= LocalSymFlags::IsParameter;
+ if (Var.DefRanges.empty())
+ Flags |= LocalSymFlags::IsOptimizedOut;
+
+ OS.AddComment("TypeIndex");
+ TypeIndex TI = getCompleteTypeIndex(Var.DIVar->getType());
+ OS.EmitIntValue(TI.getIndex(), 4);
+ OS.AddComment("Flags");
+ OS.EmitIntValue(static_cast<uint16_t>(Flags), 2);
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, Var.DIVar->getName());
+ OS.EmitLabel(LocalEnd);
+
+  // Calculate the on-disk prefix of the appropriate def range record. The
+  // records and on-disk formats are described in SymbolRecord.h. BytePrefix
+  // should be big enough to hold all forms without memory allocation.
+ SmallString<20> BytePrefix;
+ for (const LocalVarDefRange &DefRange : Var.DefRanges) {
+ BytePrefix.clear();
+ // FIXME: Handle bitpieces.
+ if (DefRange.StructOffset != 0)
+ continue;
+
+ if (DefRange.InMemory) {
+ DefRangeRegisterRelSym Sym(DefRange.CVRegister, 0, DefRange.DataOffset, 0,
+ 0, 0, ArrayRef<LocalVariableAddrGap>());
+ ulittle16_t SymKind = ulittle16_t(S_DEFRANGE_REGISTER_REL);
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&SymKind), sizeof(SymKind));
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&Sym.Header),
+ sizeof(Sym.Header) - sizeof(LocalVariableAddrRange));
+ } else {
+ assert(DefRange.DataOffset == 0 && "unexpected offset into register");
+ // Unclear what matters here.
+ DefRangeRegisterSym Sym(DefRange.CVRegister, 0, 0, 0, 0,
+ ArrayRef<LocalVariableAddrGap>());
+ ulittle16_t SymKind = ulittle16_t(S_DEFRANGE_REGISTER);
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&SymKind), sizeof(SymKind));
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&Sym.Header),
+ sizeof(Sym.Header) - sizeof(LocalVariableAddrRange));
+ }
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, BytePrefix);
+ }
+}
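// A simplified sketch of the byte layout this function emits for one S_LOCAL
// record, packed into a plain buffer instead of going through MCStreamer
// (little-endian, as in CodeView; the record kind is taken as a parameter here
// rather than hard-coded, and name truncation is omitted). The 2-byte length
// covers everything after the length field itself, which is exactly what the
// LocalBegin/LocalEnd symbol difference above computes.
#include <cstdint>
#include <string>
#include <vector>

static void appendLE16(std::vector<uint8_t> &Buf, uint16_t V) {
  Buf.push_back(uint8_t(V & 0xff));
  Buf.push_back(uint8_t(V >> 8));
}

static void appendLE32(std::vector<uint8_t> &Buf, uint32_t V) {
  for (int I = 0; I < 4; ++I)
    Buf.push_back(uint8_t((V >> (8 * I)) & 0xff));
}

static std::vector<uint8_t> buildLocalRecord(uint16_t KindSLocal,
                                             uint32_t TypeIndex, uint16_t Flags,
                                             const std::string &Name) {
  std::vector<uint8_t> Payload;
  appendLE16(Payload, KindSLocal);              // Record kind: S_LOCAL
  appendLE32(Payload, TypeIndex);               // TypeIndex
  appendLE16(Payload, Flags);                   // Flags
  Payload.insert(Payload.end(), Name.begin(), Name.end());
  Payload.push_back(0);                         // Null-terminated name

  std::vector<uint8_t> Record;
  appendLE16(Record, uint16_t(Payload.size())); // Record length
  Record.insert(Record.end(), Payload.begin(), Payload.end());
  return Record;
}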
+
+void CodeViewDebug::endFunction(const MachineFunction *MF) {
+ if (!Asm || !CurFn) // We haven't created any debug info for this function.
+ return;
+
+ const Function *GV = MF->getFunction();
+ assert(FnDebugInfo.count(GV));
+ assert(CurFn == &FnDebugInfo[GV]);
+
+ collectVariableInfo(GV->getSubprogram());
+
+ DebugHandlerBase::endFunction(MF);
+
+ // Don't emit anything if we don't have any line tables.
+ if (!CurFn->HaveLineInfo) {
+ FnDebugInfo.erase(GV);
+ CurFn = nullptr;
+ return;
+ }
+
+ CurFn->End = Asm->getFunctionEnd();
+
+ CurFn = nullptr;
+}
+
+void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
+ DebugHandlerBase::beginInstruction(MI);
+
+ // Ignore DBG_VALUE locations and function prologue.
+ if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup))
+ return;
+ DebugLoc DL = MI->getDebugLoc();
+ if (DL == PrevInstLoc || !DL)
+ return;
+ maybeRecordLocation(DL, Asm->MF);
+}
+
+MCSymbol *CodeViewDebug::beginCVSubsection(ModuleSubstreamKind Kind) {
+ MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
+ *EndLabel = MMI->getContext().createTempSymbol();
+ OS.EmitIntValue(unsigned(Kind), 4);
+ OS.AddComment("Subsection size");
+ OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+ OS.EmitLabel(BeginLabel);
+ return EndLabel;
+}
+
+void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) {
+ OS.EmitLabel(EndLabel);
+ // Every subsection must be aligned to a 4-byte boundary.
+ OS.EmitValueToAlignment(4);
+}
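// Taken together, beginCVSubsection/endCVSubsection bracket a payload with the
// following on-disk shape. The size excludes the kind and size fields, since
// BeginLabel is placed after both, and the trailing alignment padding follows
// EndLabel so it is not counted either:
//
//   [4-byte subsection kind][4-byte payload size][payload...][pad to 4 bytes]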
+
+void CodeViewDebug::emitDebugInfoForUDTs(
+ ArrayRef<std::pair<std::string, TypeIndex>> UDTs) {
+ for (const std::pair<std::string, codeview::TypeIndex> &UDT : UDTs) {
+ MCSymbol *UDTRecordBegin = MMI->getContext().createTempSymbol(),
+ *UDTRecordEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(UDTRecordEnd, UDTRecordBegin, 2);
+ OS.EmitLabel(UDTRecordBegin);
+
+ OS.AddComment("Record kind: S_UDT");
+ OS.EmitIntValue(unsigned(SymbolKind::S_UDT), 2);
+
+ OS.AddComment("Type");
+ OS.EmitIntValue(UDT.second.getIndex(), 4);
+
+ emitNullTerminatedSymbolName(OS, UDT.first);
+ OS.EmitLabel(UDTRecordEnd);
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForGlobals() {
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ const auto *CU = cast<DICompileUnit>(Node);
+
+ // First, emit all globals that are not in a comdat in a single symbol
+ // substream. MSVC doesn't like it if the substream is empty, so only open
+ // it if we have at least one global to emit.
+ switchToDebugSectionForSymbol(nullptr);
+ MCSymbol *EndLabel = nullptr;
+ for (const DIGlobalVariable *G : CU->getGlobalVariables()) {
+ if (const auto *GV = dyn_cast_or_null<GlobalVariable>(G->getVariable())) {
+ if (!GV->hasComdat() && !GV->isDeclarationForLinker()) {
+ if (!EndLabel) {
+ OS.AddComment("Symbol subsection for globals");
+ EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ }
+ emitDebugInfoForGlobal(G, Asm->getSymbol(GV));
+ }
+ }
+ }
+ if (EndLabel)
+ endCVSubsection(EndLabel);
+
+ // Second, emit each global that is in a comdat into its own .debug$S
+ // section along with its own symbol substream.
+ for (const DIGlobalVariable *G : CU->getGlobalVariables()) {
+ if (const auto *GV = dyn_cast_or_null<GlobalVariable>(G->getVariable())) {
+ if (GV->hasComdat()) {
+ MCSymbol *GVSym = Asm->getSymbol(GV);
+ OS.AddComment("Symbol subsection for " +
+ Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ switchToDebugSectionForSymbol(GVSym);
+ EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ emitDebugInfoForGlobal(G, GVSym);
+ endCVSubsection(EndLabel);
+ }
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForRetainedTypes() {
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ for (auto *Ty : cast<DICompileUnit>(Node)->getRetainedTypes()) {
+ if (DIType *RT = dyn_cast<DIType>(Ty)) {
+ getTypeIndex(RT);
+      // FIXME: Add to the global/local UDT list.
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
+ MCSymbol *GVSym) {
+ // DataSym record, see SymbolRecord.h for more info.
+ // FIXME: Thread local data, etc
+ MCSymbol *DataBegin = MMI->getContext().createTempSymbol(),
+ *DataEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(DataEnd, DataBegin, 2);
+ OS.EmitLabel(DataBegin);
+ const auto *GV = cast<GlobalVariable>(DIGV->getVariable());
+ if (DIGV->isLocalToUnit()) {
+ if (GV->isThreadLocal()) {
+ OS.AddComment("Record kind: S_LTHREAD32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LTHREAD32), 2);
+ } else {
+ OS.AddComment("Record kind: S_LDATA32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LDATA32), 2);
+ }
+ } else {
+ if (GV->isThreadLocal()) {
+ OS.AddComment("Record kind: S_GTHREAD32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GTHREAD32), 2);
+ } else {
+ OS.AddComment("Record kind: S_GDATA32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GDATA32), 2);
+ }
+ }
+ OS.AddComment("Type");
+ OS.EmitIntValue(getCompleteTypeIndex(DIGV->getType()).getIndex(), 4);
+ OS.AddComment("DataOffset");
+ OS.EmitCOFFSecRel32(GVSym);
+ OS.AddComment("Segment");
+ OS.EmitCOFFSectionIndex(GVSym);
+ OS.AddComment("Name");
+ emitNullTerminatedSymbolName(OS, DIGV->getName());
+ OS.EmitLabel(DataEnd);
+}
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
new file mode 100644
index 000000000000..e4bbd61d4ce0
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -0,0 +1,310 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+
+#include "DebugHandlerBase.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo/CodeView/MemoryTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+class StringRef;
+class LexicalScope;
+struct ClassInfo;
+
+/// \brief Collects and handles CodeView debug information, including line
+/// tables.
+class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
+ MCStreamer &OS;
+ codeview::MemoryTypeTableBuilder TypeTable;
+
+ /// Represents the most general definition range.
+ struct LocalVarDefRange {
+ /// Indicates that variable data is stored in memory relative to the
+ /// specified register.
+ int InMemory : 1;
+
+ /// Offset of variable data in memory.
+ int DataOffset : 31;
+
+ /// Offset of the data into the user level struct. If zero, no splitting
+ /// occurred.
+ uint16_t StructOffset;
+
+ /// Register containing the data or the register base of the memory
+ /// location containing the data.
+ uint16_t CVRegister;
+
+ /// Compares all location fields. This includes all fields except the label
+ /// ranges.
+ bool isDifferentLocation(LocalVarDefRange &O) {
+ return InMemory != O.InMemory || DataOffset != O.DataOffset ||
+ StructOffset != O.StructOffset || CVRegister != O.CVRegister;
+ }
+
+ SmallVector<std::pair<const MCSymbol *, const MCSymbol *>, 1> Ranges;
+ };
+
+ static LocalVarDefRange createDefRangeMem(uint16_t CVRegister, int Offset);
+ static LocalVarDefRange createDefRangeReg(uint16_t CVRegister);
+
+ /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
+ struct LocalVariable {
+ const DILocalVariable *DIVar = nullptr;
+ SmallVector<LocalVarDefRange, 1> DefRanges;
+ };
+
+ struct InlineSite {
+ SmallVector<LocalVariable, 1> InlinedLocals;
+ SmallVector<const DILocation *, 1> ChildSites;
+ const DISubprogram *Inlinee = nullptr;
+
+ /// The ID of the inline site or function used with .cv_loc. Not a type
+ /// index.
+ unsigned SiteFuncId = 0;
+ };
+
+  // Per-function debug information: inline call sites, local variables, line
+  // table state, and the labels marking the start and end of the function.
+ struct FunctionInfo {
+ /// Map from inlined call site to inlined instructions and child inlined
+ /// call sites. Listed in program order.
+ std::unordered_map<const DILocation *, InlineSite> InlineSites;
+
+ /// Ordered list of top-level inlined call sites.
+ SmallVector<const DILocation *, 1> ChildSites;
+
+ SmallVector<LocalVariable, 1> Locals;
+
+ DebugLoc LastLoc;
+ const MCSymbol *Begin = nullptr;
+ const MCSymbol *End = nullptr;
+ unsigned FuncId = 0;
+ unsigned LastFileId = 0;
+ bool HaveLineInfo = false;
+ };
+ FunctionInfo *CurFn;
+
+ /// The set of comdat .debug$S sections that we've seen so far. Each section
+ /// must start with a magic version number that must only be emitted once.
+ /// This set tracks which sections we've already opened.
+ DenseSet<MCSectionCOFF *> ComdatDebugSections;
+
+ /// Switch to the appropriate .debug$S section for GVSym. If GVSym, the symbol
+ /// of an emitted global value, is in a comdat COFF section, this will switch
+ /// to a new .debug$S section in that comdat. This method ensures that the
+ /// section starts with the magic version number on first use. If GVSym is
+ /// null, uses the main .debug$S section.
+ void switchToDebugSectionForSymbol(const MCSymbol *GVSym);
+
+ /// The next available function index for use with our .cv_* directives. Not
+ /// to be confused with type indices for LF_FUNC_ID records.
+ unsigned NextFuncId = 0;
+
+ InlineSite &getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee);
+
+ codeview::TypeIndex getFuncIdForSubprogram(const DISubprogram *SP);
+
+ static void collectInlineSiteChildren(SmallVectorImpl<unsigned> &Children,
+ const FunctionInfo &FI,
+ const InlineSite &Site);
+
+ /// Remember some debug info about each function. Keep it in a stable order to
+ /// emit at the end of the TU.
+ MapVector<const Function *, FunctionInfo> FnDebugInfo;
+
+ /// Map from DIFile to .cv_file id.
+ DenseMap<const DIFile *, unsigned> FileIdMap;
+
+ /// All inlined subprograms in the order they should be emitted.
+ SmallSetVector<const DISubprogram *, 4> InlinedSubprograms;
+
+  /// Map from a pair of a DI metadata node and its DI type (or scope), which
+  /// may be nullptr, to CodeView type indices. Primarily indexed by
+  /// {DIType*, DIType*} and {DISubprogram*, DIType*}.
+  ///
+  /// The second entry in the key is needed for methods, as the
+  /// DISubroutineType representing a static method type is shared with the
+  /// equivalent non-method function type.
+ DenseMap<std::pair<const DINode *, const DIType *>, codeview::TypeIndex>
+ TypeIndices;
+
+ /// Map from DICompositeType* to complete type index. Non-record types are
+ /// always looked up in the normal TypeIndices map.
+ DenseMap<const DICompositeType *, codeview::TypeIndex> CompleteTypeIndices;
+
+ /// Complete record types to emit after all active type lowerings are
+ /// finished.
+ SmallVector<const DICompositeType *, 4> DeferredCompleteTypes;
+
+ /// Number of type lowering frames active on the stack.
+ unsigned TypeEmissionLevel = 0;
+
+ codeview::TypeIndex VBPType;
+
+ const DISubprogram *CurrentSubprogram = nullptr;
+
+  // The UDTs we have seen while processing types; each entry is a pair of
+  // type name and type index.
+ std::vector<std::pair<std::string, codeview::TypeIndex>> LocalUDTs,
+ GlobalUDTs;
+
+ typedef std::map<const DIFile *, std::string> FileToFilepathMapTy;
+ FileToFilepathMapTy FileToFilepathMap;
+ StringRef getFullFilepath(const DIFile *S);
+
+ unsigned maybeRecordFile(const DIFile *F);
+
+ void maybeRecordLocation(const DebugLoc &DL, const MachineFunction *MF);
+
+ void clear();
+
+ void setCurrentSubprogram(const DISubprogram *SP) {
+ CurrentSubprogram = SP;
+ LocalUDTs.clear();
+ }
+
+ /// Emit the magic version number at the start of a CodeView type or symbol
+ /// section. Appears at the front of every .debug$S or .debug$T section.
+ void emitCodeViewMagicVersion();
+
+ void emitTypeInformation();
+
+ void emitInlineeLinesSubsection();
+
+ void emitDebugInfoForFunction(const Function *GV, FunctionInfo &FI);
+
+ void emitDebugInfoForGlobals();
+
+ void emitDebugInfoForRetainedTypes();
+
+ void emitDebugInfoForUDTs(
+ ArrayRef<std::pair<std::string, codeview::TypeIndex>> UDTs);
+
+ void emitDebugInfoForGlobal(const DIGlobalVariable *DIGV, MCSymbol *GVSym);
+
+ /// Opens a subsection of the given kind in a .debug$S codeview section.
+ /// Returns an end label for use with endCVSubsection when the subsection is
+ /// finished.
+ MCSymbol *beginCVSubsection(codeview::ModuleSubstreamKind Kind);
+
+ void endCVSubsection(MCSymbol *EndLabel);
+
+ void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt,
+ const InlineSite &Site);
+
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+
+ void collectVariableInfo(const DISubprogram *SP);
+
+ void collectVariableInfoFromMMITable(DenseSet<InlinedVariable> &Processed);
+
+ /// Records information about a local variable in the appropriate scope. In
+ /// particular, locals from inlined code live inside the inlining site.
+ void recordLocalVariable(LocalVariable &&Var, const DILocation *Loc);
+
+ /// Emits local variables in the appropriate order.
+ void emitLocalVariableList(ArrayRef<LocalVariable> Locals);
+
+ /// Emits an S_LOCAL record and its associated defined ranges.
+ void emitLocalVariable(const LocalVariable &Var);
+
+ /// Translates the DIType to codeview if necessary and returns a type index
+ /// for it.
+ codeview::TypeIndex getTypeIndex(DITypeRef TypeRef,
+ DITypeRef ClassTyRef = DITypeRef());
+
+ codeview::TypeIndex getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class);
+
+ codeview::TypeIndex getScopeIndex(const DIScope *Scope);
+
+ codeview::TypeIndex getVBPTypeIndex();
+
+ void addToUDTs(const DIType *Ty, codeview::TypeIndex TI);
+
+ codeview::TypeIndex lowerType(const DIType *Ty, const DIType *ClassTy);
+ codeview::TypeIndex lowerTypeAlias(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeArray(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeBasic(const DIBasicType *Ty);
+ codeview::TypeIndex lowerTypePointer(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeMemberPointer(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeModifier(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeFunction(const DISubroutineType *Ty);
+ codeview::TypeIndex lowerTypeMemberFunction(const DISubroutineType *Ty,
+ const DIType *ClassTy,
+ int ThisAdjustment);
+ codeview::TypeIndex lowerTypeEnum(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeUnion(const DICompositeType *Ty);
+
+ /// Symbol records should point to complete types, but type records should
+ /// always point to incomplete types to avoid cycles in the type graph. Only
+ /// use this entry point when generating symbol records. The complete and
+ /// incomplete type indices only differ for record types. All other types use
+ /// the same index.
+ codeview::TypeIndex getCompleteTypeIndex(DITypeRef TypeRef);
+
+ codeview::TypeIndex lowerCompleteTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerCompleteTypeUnion(const DICompositeType *Ty);
+
+ struct TypeLoweringScope;
+
+ void emitDeferredCompleteTypes();
+
+ void collectMemberInfo(ClassInfo &Info, const DIDerivedType *DDTy);
+ ClassInfo collectClassInfo(const DICompositeType *Ty);
+
+ /// Common record member lowering functionality for record types, which are
+ /// structs, classes, and unions. Returns the field list index and the member
+ /// count.
+ std::tuple<codeview::TypeIndex, codeview::TypeIndex, unsigned, bool>
+ lowerRecordFieldList(const DICompositeType *Ty);
+
+ /// Inserts {{Node, ClassTy}, TI} into TypeIndices and checks for duplicates.
+ codeview::TypeIndex recordTypeIndexForDINode(const DINode *Node,
+ codeview::TypeIndex TI,
+ const DIType *ClassTy = nullptr);
+
+ unsigned getPointerSizeInBytes();
+
+public:
+ CodeViewDebug(AsmPrinter *Asm);
+
+ void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {}
+
+ /// \brief Emit the COFF section that holds the line table information.
+ void endModule() override;
+
+ /// \brief Gather pre-function debug information.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// \brief Gather post-function debug information.
+ void endFunction(const MachineFunction *) override;
+
+ /// \brief Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override;
+};
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 7b0cdbde3791..2aaa85a58094 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -32,39 +32,6 @@
using namespace llvm;
//===----------------------------------------------------------------------===//
-// EmittingAsmStreamer Implementation
-//===----------------------------------------------------------------------===//
-unsigned EmittingAsmStreamer::emitULEB128(uint64_t Value, const char *Desc,
- unsigned PadTo) {
- AP->EmitULEB128(Value, Desc, PadTo);
- return 0;
-}
-
-unsigned EmittingAsmStreamer::emitInt8(unsigned char Value) {
- AP->EmitInt8(Value);
- return 0;
-}
-
-unsigned EmittingAsmStreamer::emitBytes(StringRef Data) {
- AP->OutStreamer->EmitBytes(Data);
- return 0;
-}
-
-//===----------------------------------------------------------------------===//
-// SizeReporterAsmStreamer Implementation
-//===----------------------------------------------------------------------===//
-unsigned SizeReporterAsmStreamer::emitULEB128(uint64_t Value, const char *Desc,
- unsigned PadTo) {
- return getULEB128Size(Value);
-}
-
-unsigned SizeReporterAsmStreamer::emitInt8(unsigned char Value) { return 1; }
-
-unsigned SizeReporterAsmStreamer::emitBytes(StringRef Data) {
- return Data.size();
-}
-
-//===----------------------------------------------------------------------===//
// DIEAbbrevData Implementation
//===----------------------------------------------------------------------===//
@@ -512,20 +479,6 @@ void DIEEntry::print(raw_ostream &O) const {
}
//===----------------------------------------------------------------------===//
-// DIETypeSignature Implementation
-//===----------------------------------------------------------------------===//
-void DIETypeSignature::EmitValue(const AsmPrinter *Asm,
- dwarf::Form Form) const {
- assert(Form == dwarf::DW_FORM_ref_sig8);
- Asm->OutStreamer->EmitIntValue(Unit->getTypeSignature(), 8);
-}
-
-LLVM_DUMP_METHOD
-void DIETypeSignature::print(raw_ostream &O) const {
- O << format("Type Unit: 0x%lx", Unit->getTypeSignature());
-}
-
-//===----------------------------------------------------------------------===//
// DIELoc Implementation
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 02010654a6f4..74c47d151c62 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -279,7 +279,7 @@ void DIEHash::hashLocList(const DIELocList &LocList) {
// Hash an individual attribute \param Attr based on the type of attribute and
// the form.
-void DIEHash::hashAttribute(DIEValue Value, dwarf::Tag Tag) {
+void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) {
dwarf::Attribute Attribute = Value.getAttribute();
// Other attribute values use the letter 'A' as the marker, and the value
@@ -353,7 +353,6 @@ void DIEHash::hashAttribute(DIEValue Value, dwarf::Tag Tag) {
case DIEValue::isExpr:
case DIEValue::isLabel:
case DIEValue::isDelta:
- case DIEValue::isTypeSignature:
llvm_unreachable("Add support for additional value types.");
}
}
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
index 44f0ce88523d..996cd7ef3d2e 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -131,7 +131,7 @@ private:
void hashLocList(const DIELocList &LocList);
/// \brief Hashes an individual attribute.
- void hashAttribute(DIEValue Value, dwarf::Tag Tag);
+ void hashAttribute(const DIEValue &Value, dwarf::Tag Tag);
/// \brief Hashes an attribute that refers to another DIE.
void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 3c46a99d0845..adc536f1add8 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -15,7 +15,9 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <map>
using namespace llvm;
@@ -40,7 +42,7 @@ void DbgValueHistoryMap::startInstrRange(InlinedVariable Var,
assert(MI.isDebugValue() && "not a DBG_VALUE");
auto &Ranges = VarInstrRanges[Var];
if (!Ranges.empty() && Ranges.back().second == nullptr &&
- Ranges.back().first->isIdenticalTo(&MI)) {
+ Ranges.back().first->isIdenticalTo(MI)) {
DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
<< "\t" << Ranges.back().first << "\t" << MI << "\n");
return;
@@ -122,26 +124,6 @@ static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
clobberRegisterUses(RegVars, I, HistMap, ClobberingInstr);
}
-// \brief Collect all registers clobbered by @MI and apply the functor
-// @Func to their RegNo.
-// @Func should be a functor with a void(unsigned) signature. We're
-// not using std::function here for performance reasons. It has a
-// small but measurable impact. By using a functor instead of a
-// std::set& here, we can avoid the overhead of constructing
-// temporaries in calculateDbgValueHistory, which has a significant
-// performance impact.
-template<typename Callable>
-static void applyToClobberedRegisters(const MachineInstr &MI,
- const TargetRegisterInfo *TRI,
- Callable Func) {
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() || !MO.getReg())
- continue;
- for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
- Func(*AI);
- }
-}
-
// \brief Returns the first instruction in @MBB which corresponds to
// the function epilogue, or nullptr if @MBB doesn't contain an epilogue.
static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
@@ -156,12 +138,12 @@ static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
E = MBB.rend();
I != E; ++I) {
if (I->getDebugLoc() != LastLoc)
- return Res;
+ return &*Res;
Res = &*I;
}
// If all instructions have the same debug location, assume whole MBB is
// an epilogue.
- return MBB.begin();
+ return &*MBB.begin();
}
// \brief Collect registers that are modified in the function body (their
@@ -173,10 +155,23 @@ static void collectChangingRegs(const MachineFunction *MF,
auto FirstEpilogueInst = getFirstEpilogueInst(MBB);
for (const auto &MI : MBB) {
+ // Avoid looking at prologue or epilogue instructions.
if (&MI == FirstEpilogueInst)
break;
- if (!MI.getFlag(MachineInstr::FrameSetup))
- applyToClobberedRegisters(MI, TRI, [&](unsigned r) { Regs.set(r); });
+ if (MI.getFlag(MachineInstr::FrameSetup))
+ continue;
+
+ // Look for register defs and register masks. Register masks are
+ // typically on calls and they clobber everything not in the mask.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
+ ++AI)
+ Regs.set(*AI);
+ } else if (MO.isRegMask()) {
+ Regs.setBitsNotInMask(MO.getRegMask());
+ }
+ }
}
}
}
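// A distilled sketch of the register-mask handling added above, using a plain
// bool array instead of BitVector/MCRegAliasIterator (NumRegs and the
// PreservedByCall array are illustrative, not part of this patch). A register
// mask on a call lists the registers the callee preserves, so every register
// *not* in the mask must be treated as potentially changing, which is what
// setBitsNotInMask does.
#include <cstddef>

static void applyRegMask(bool *Changing, const bool *PreservedByCall,
                         size_t NumRegs) {
  for (size_t Reg = 0; Reg != NumRegs; ++Reg)
    if (!PreservedByCall[Reg])
      Changing[Reg] = true; // clobbered across the call
}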
@@ -187,16 +182,35 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
BitVector ChangingRegs(TRI->getNumRegs());
collectChangingRegs(MF, TRI, ChangingRegs);
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
RegDescribedVarsMap RegVars;
for (const auto &MBB : *MF) {
for (const auto &MI : MBB) {
if (!MI.isDebugValue()) {
// Not a DBG_VALUE instruction. It may clobber registers which describe
// some variables.
- applyToClobberedRegisters(MI, TRI, [&](unsigned RegNo) {
- if (ChangingRegs.test(RegNo))
- clobberRegisterUses(RegVars, RegNo, Result, MI);
- });
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ // If this is a register def operand, it may end a debug value
+ // range.
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
+ ++AI)
+ if (ChangingRegs.test(*AI))
+ clobberRegisterUses(RegVars, *AI, Result, MI);
+ } else if (MO.isRegMask()) {
+ // If this is a register mask operand, clobber all debug values in
+ // non-CSRs.
+ for (int I = ChangingRegs.find_first(); I != -1;
+ I = ChangingRegs.find_next(I)) {
+ // Don't consider SP to be clobbered by register masks.
+ if (unsigned(I) != SP && TRI->isPhysicalRegister(I) &&
+ MO.clobbersPhysReg(I)) {
+ clobberRegisterUses(RegVars, I, Result, MI);
+ }
+ }
+ }
+ }
continue;
}
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
index 546d1b443781..16d2d7fd7e99 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
@@ -12,13 +12,12 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DebugInfoMetadata.h"
namespace llvm {
class MachineFunction;
class MachineInstr;
-class DILocalVariable;
-class DILocation;
class TargetRegisterInfo;
// For each user variable, keep a list of instruction ranges where this variable
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
new file mode 100644
index 000000000000..16ffe2e15acd
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -0,0 +1,230 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for different debug information format backends.
+// LLVM currently supports DWARF and CodeView.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DebugHandlerBase.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+DebugHandlerBase::DebugHandlerBase(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
+
+// Each LexicalScope has a first and a last instruction that mark the beginning
+// and end of the scope, respectively. Create an inverse map that records which
+// scopes start (and end) at each instruction. One instruction may start (or
+// end) multiple scopes. Ignore scopes that are not reachable.
+void DebugHandlerBase::identifyScopeMarkers() {
+ SmallVector<LexicalScope *, 4> WorkList;
+ WorkList.push_back(LScopes.getCurrentFunctionScope());
+ while (!WorkList.empty()) {
+ LexicalScope *S = WorkList.pop_back_val();
+
+ const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
+ if (!Children.empty())
+ WorkList.append(Children.begin(), Children.end());
+
+ if (S->isAbstractScope())
+ continue;
+
+ for (const InsnRange &R : S->getRanges()) {
+ assert(R.first && "InsnRange does not have first instruction!");
+ assert(R.second && "InsnRange does not have second instruction!");
+ requestLabelBeforeInsn(R.first);
+ requestLabelAfterInsn(R.second);
+ }
+ }
+}
+
+// Return Label preceding the instruction.
+MCSymbol *DebugHandlerBase::getLabelBeforeInsn(const MachineInstr *MI) {
+ MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
+ assert(Label && "Didn't insert label before instruction");
+ return Label;
+}
+
+// Return Label immediately following the instruction.
+MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
+ return LabelsAfterInsn.lookup(MI);
+}
+
+// Determine the relative position of the pieces described by P1 and P2.
+// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
+// 1 if P1 is entirely after P2.
+int DebugHandlerBase::pieceCmp(const DIExpression *P1, const DIExpression *P2) {
+ unsigned l1 = P1->getBitPieceOffset();
+ unsigned l2 = P2->getBitPieceOffset();
+ unsigned r1 = l1 + P1->getBitPieceSize();
+ unsigned r2 = l2 + P2->getBitPieceSize();
+ if (r1 <= l2)
+ return -1;
+ else if (r2 <= l1)
+ return 1;
+ else
+ return 0;
+}
+
+/// Determine whether two variable pieces overlap.
+bool DebugHandlerBase::piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
+ if (!P1->isBitPiece() || !P2->isBitPiece())
+ return true;
+ return pieceCmp(P1, P2) == 0;
+}
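// A small self-contained illustration of the interval logic above, using raw
// offsets/sizes instead of DIExpressions (the helper name is illustrative).
// Two pieces overlap exactly when neither ends at or before the start of the
// other, so [0,32) vs [32,64) compare as -1/+1 while [0,32) vs [16,48) give 0.
#include <cassert>

static int bitPieceCmp(unsigned Offset1, unsigned Size1, unsigned Offset2,
                       unsigned Size2) {
  if (Offset1 + Size1 <= Offset2)
    return -1; // piece 1 entirely before piece 2
  if (Offset2 + Size2 <= Offset1)
    return 1;  // piece 1 entirely after piece 2
  return 0;    // pieces overlap
}

static void bitPieceCmpExamples() {
  assert(bitPieceCmp(0, 32, 32, 32) == -1);
  assert(bitPieceCmp(32, 32, 0, 32) == 1);
  assert(bitPieceCmp(0, 32, 16, 32) == 0);
}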
+
+/// If this type is derived from a base type then return base type size.
+uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
+ DIType *Ty = TyRef.resolve();
+ assert(Ty);
+ DIDerivedType *DDTy = dyn_cast<DIDerivedType>(Ty);
+ if (!DDTy)
+ return Ty->getSizeInBits();
+
+ unsigned Tag = DDTy->getTag();
+
+ if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+ Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type)
+ return DDTy->getSizeInBits();
+
+ DIType *BaseType = DDTy->getBaseType().resolve();
+
+ assert(BaseType && "Unexpected invalid base type");
+
+  // If this is a derived type, go ahead and get the base type, unless it's a
+  // reference, in which case the size is just that of the field itself.
+  // Pointer types have no need of this since they're a different kind of
+  // qualification on the type.
+ if (BaseType->getTag() == dwarf::DW_TAG_reference_type ||
+ BaseType->getTag() == dwarf::DW_TAG_rvalue_reference_type)
+ return Ty->getSizeInBits();
+
+ return getBaseTypeSize(BaseType);
+}
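// For example, given the C declarations
//   typedef const int T;
//   struct S { T Field; };
// asking for the size of the DW_TAG_member for Field walks member -> typedef
// -> const down to the underlying "int" and returns its size (32 bits on
// common targets), whereas a member of reference type stops at the reference
// and reports the size of the field rather than of the referenced type.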
+
+void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+ // Grab the lexical scopes for the function, if we don't have any of those
+ // then we're not going to be able to do anything.
+ LScopes.initialize(*MF);
+ if (LScopes.empty())
+ return;
+
+ // Make sure that each lexical scope will have a begin/end label.
+ identifyScopeMarkers();
+
+ // Calculate history for local variables.
+ assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
+ calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
+ DbgValues);
+
+ // Request labels for the full history.
+ for (const auto &I : DbgValues) {
+ const auto &Ranges = I.second;
+ if (Ranges.empty())
+ continue;
+
+ // The first mention of a function argument gets the CurrentFnBegin
+ // label, so arguments are visible when breaking at function entry.
+ const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable();
+ if (DIVar->isParameter() &&
+ getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) {
+ LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin();
+ if (Ranges.front().first->getDebugExpression()->isBitPiece()) {
+ // Mark all non-overlapping initial pieces.
+ for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
+ const DIExpression *Piece = I->first->getDebugExpression();
+ if (std::all_of(Ranges.begin(), I,
+ [&](DbgValueHistoryMap::InstrRange Pred) {
+ return !piecesOverlap(Piece, Pred.first->getDebugExpression());
+ }))
+ LabelsBeforeInsn[I->first] = Asm->getFunctionBegin();
+ else
+ break;
+ }
+ }
+ }
+
+ for (const auto &Range : Ranges) {
+ requestLabelBeforeInsn(Range.first);
+ if (Range.second)
+ requestLabelAfterInsn(Range.second);
+ }
+ }
+
+ PrevInstLoc = DebugLoc();
+ PrevLabel = Asm->getFunctionBegin();
+}
+
+void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
+ if (!MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI == nullptr);
+ CurMI = MI;
+
+ // Insert labels where requested.
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsBeforeInsn.find(MI);
+
+ // No label needed.
+ if (I == LabelsBeforeInsn.end())
+ return;
+
+ // Label already assigned.
+ if (I->second)
+ return;
+
+ if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->EmitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+}
+
+void DebugHandlerBase::endInstruction() {
+ if (!MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI != nullptr);
+ // Don't create a new label after DBG_VALUE instructions.
+ // They don't generate code.
+ if (!CurMI->isDebugValue())
+ PrevLabel = nullptr;
+
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsAfterInsn.find(CurMI);
+ CurMI = nullptr;
+
+ // No label needed.
+ if (I == LabelsAfterInsn.end())
+ return;
+
+ // Label already assigned.
+ if (I->second)
+ return;
+
+ // We need a label after this instruction.
+ if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->EmitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+}
+
+void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+ DbgValues.clear();
+ LabelsBeforeInsn.clear();
+ LabelsAfterInsn.clear();
+}
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
new file mode 100644
index 000000000000..b8bbcec133fd
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -0,0 +1,109 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h --------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for different debug information format backends.
+// LLVM currently supports DWARF and CodeView.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
+
+#include "AsmPrinterHandler.h"
+#include "DbgValueHistoryCalculator.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineModuleInfo;
+
+/// Base class for debug information backends. Common functionality related to
+/// tracking which variables and scopes are alive at a given PC lives here.
+class DebugHandlerBase : public AsmPrinterHandler {
+protected:
+ DebugHandlerBase(AsmPrinter *A);
+
+ /// Target of debug info emission.
+ AsmPrinter *Asm;
+
+ /// Collected machine module information.
+ MachineModuleInfo *MMI;
+
+  /// Previous instruction's location information. This is used to
+  /// determine label locations that indicate scope boundaries in DWARF
+  /// debug info.
+ DebugLoc PrevInstLoc;
+ MCSymbol *PrevLabel = nullptr;
+
+ /// This location indicates end of function prologue and beginning of
+ /// function body.
+ DebugLoc PrologEndLoc;
+
+ /// If nonnull, stores the current machine instruction we're processing.
+ const MachineInstr *CurMI = nullptr;
+
+ LexicalScopes LScopes;
+
+ /// History of DBG_VALUE and clobber instructions for each user
+ /// variable. Variables are listed in order of appearance.
+ DbgValueHistoryMap DbgValues;
+
+  /// Maps each instruction to the label emitted before it.
+  /// FIXME: Make this private from DwarfDebug; we have the necessary accessors
+  /// for it.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
+
+  /// Maps each instruction to the label emitted after it.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
+
+  /// Identify instructions that mark the beginning or end of a scope.
+ void identifyScopeMarkers();
+
+ /// Ensure that a label will be emitted before MI.
+ void requestLabelBeforeInsn(const MachineInstr *MI) {
+ LabelsBeforeInsn.insert(std::make_pair(MI, nullptr));
+ }
+
+ /// Ensure that a label will be emitted after MI.
+ void requestLabelAfterInsn(const MachineInstr *MI) {
+ LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
+ }
+
+ // AsmPrinterHandler overrides.
+public:
+ void beginInstruction(const MachineInstr *MI) override;
+ void endInstruction() override;
+
+ void beginFunction(const MachineFunction *MF) override;
+ void endFunction(const MachineFunction *MF) override;
+
+ /// Return Label preceding the instruction.
+ MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
+
+ /// Return Label immediately following the instruction.
+ MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
+
+ /// Determine the relative position of the pieces described by P1 and P2.
+ /// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
+ /// 1 if P1 is entirely after P2.
+ static int pieceCmp(const DIExpression *P1, const DIExpression *P2);
+
+ /// Determine whether two variable pieces overlap.
+ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2);
+
+ /// If this type is derived from a base type then return base type size.
+ static uint64_t getBaseTypeSize(const DITypeRef TyRef);
+};
+
+}
+
+#endif
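// A minimal sketch of how a debug info backend might hook into this base
// class, modeled on what CodeViewDebug does elsewhere in this patch (the
// MyFormatDebug name and the bodies are illustrative; only the hooks shown
// here are assumed). The base-class calls set up LexicalScopes, the DBG_VALUE
// history, and the per-instruction label maps, and clear them again at the
// end of each function.
class MyFormatDebug : public DebugHandlerBase {
public:
  MyFormatDebug(AsmPrinter *A) : DebugHandlerBase(A) {}

  void setSymbolSize(const MCSymbol *, uint64_t) override {}
  void endModule() override { /* emit accumulated module-level records */ }

  void beginFunction(const MachineFunction *MF) override {
    DebugHandlerBase::beginFunction(MF); // scopes, DbgValues, label requests
    // ...record per-function state for the custom format...
  }

  void endFunction(const MachineFunction *MF) override {
    // ...use getLabelBeforeInsn/getLabelAfterInsn to emit ranges...
    DebugHandlerBase::endFunction(MF);   // clears DbgValues and label maps
  }

  void beginInstruction(const MachineInstr *MI) override {
    DebugHandlerBase::beginInstruction(MI); // assigns requested labels
  }
};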
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index b60ab9151ef2..20acd45e5720 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -11,11 +11,11 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
#include "DebugLocStream.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
namespace llvm {
class AsmPrinter;
@@ -76,6 +76,20 @@ public:
const DIExpression *getExpression() const { return Expression; }
friend bool operator==(const Value &, const Value &);
friend bool operator<(const Value &, const Value &);
+ void dump() const {
+ if (isLocation()) {
+ llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";
+ if (Loc.isIndirect())
+ llvm::dbgs() << '+' << Loc.getOffset();
+ llvm::dbgs() << "} ";
+ }
+ else if (isConstantInt())
+ Constant.CIP->dump();
+ else if (isConstantFP())
+ Constant.CFP->dump();
+ if (Expression)
+ Expression->dump();
+ }
};
private:
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 6665c16159a0..2eae1b234473 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "DwarfException.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -43,8 +42,7 @@ DwarfCFIExceptionBase::DwarfCFIExceptionBase(AsmPrinter *A)
: EHStreamer(A), shouldEmitCFI(false) {}
void DwarfCFIExceptionBase::markFunctionEnd() {
- if (shouldEmitCFI)
- Asm->OutStreamer->EmitCFIEndProc();
+ endFragment();
if (MMI->getLandingPads().empty())
return;
@@ -53,23 +51,28 @@ void DwarfCFIExceptionBase::markFunctionEnd() {
MMI->TidyLandingPads();
}
+void DwarfCFIExceptionBase::endFragment() {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->EmitCFIEndProc();
+}
+
DwarfCFIException::DwarfCFIException(AsmPrinter *A)
: DwarfCFIExceptionBase(A), shouldEmitPersonality(false),
- shouldEmitLSDA(false), shouldEmitMoves(false),
- moveTypeModule(AsmPrinter::CFI_M_None) {}
+ forceEmitPersonality(false), shouldEmitLSDA(false),
+ shouldEmitMoves(false), moveTypeModule(AsmPrinter::CFI_M_None) {}
DwarfCFIException::~DwarfCFIException() {}
/// endModule - Emit all exception information that should come after the
/// content.
void DwarfCFIException::endModule() {
- if (moveTypeModule == AsmPrinter::CFI_M_Debug)
- Asm->OutStreamer->EmitCFISections(false, true);
-
// SjLj uses this pass and it doesn't need this info.
if (!Asm->MAI->usesCFIForEH())
return;
+ if (moveTypeModule == AsmPrinter::CFI_M_Debug)
+ Asm->OutStreamer->EmitCFISections(false, true);
+
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
@@ -86,6 +89,10 @@ void DwarfCFIException::endModule() {
}
}
+static MCSymbol *getExceptionSym(AsmPrinter *Asm) {
+ return Asm->getCurExceptionSym();
+}
+
void DwarfCFIException::beginFunction(const MachineFunction *MF) {
shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
const Function *F = MF->getFunction();
@@ -109,7 +116,7 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
// Emit a personality function even when there are no landing pads
- bool forceEmitPersonality =
+ forceEmitPersonality =
// ...if a personality function is explicitly specified
F->hasPersonalityFn() &&
// ... and it's not known to be a noop in the absence of invokes
@@ -126,7 +133,13 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
shouldEmitLSDA = shouldEmitPersonality &&
LSDAEncoding != dwarf::DW_EH_PE_omit;
- shouldEmitCFI = shouldEmitPersonality || shouldEmitMoves;
+ shouldEmitCFI = MF->getMMI().getContext().getAsmInfo()->usesCFIForEH() &&
+ (shouldEmitPersonality || shouldEmitMoves);
+ beginFragment(&*MF->begin(), getExceptionSym);
+}
+
+void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) {
if (!shouldEmitCFI)
return;
@@ -136,20 +149,24 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
if (!shouldEmitPersonality)
return;
+ auto *F = MBB->getParent()->getFunction();
+ auto *P = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ assert(P && "Expected personality function");
+
// If we are forced to emit this personality, make sure to record
// it because it might not appear in any landingpad
if (forceEmitPersonality)
- MMI->addPersonality(Per);
+ MMI->addPersonality(P);
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
const MCSymbol *Sym =
- TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI);
+ TLOF.getCFIPersonalitySymbol(P, *Asm->Mang, Asm->TM, MMI);
Asm->OutStreamer->EmitCFIPersonality(Sym, PerEncoding);
// Provide LSDA information.
- if (!shouldEmitLSDA)
- return;
-
- Asm->OutStreamer->EmitCFILsda(Asm->getCurExceptionSym(), LSDAEncoding);
+ if (shouldEmitLSDA)
+ Asm->OutStreamer->EmitCFILsda(ESP(Asm), TLOF.getLSDAEncoding());
}
/// endFunction - Gather and emit post-function exception information.
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 725063a8177b..7822814c7a0f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -19,9 +19,10 @@ namespace llvm {
DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node,
AsmPrinter *A, DwarfDebug *DW,
DwarfFile *DWU)
- : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU),
+ : DwarfUnit(dwarf::DW_TAG_compile_unit, Node, A, DW, DWU), UniqueID(UID),
Skeleton(nullptr), BaseAddress(nullptr) {
insertDIE(Node, &getUnitDie());
+ MacroLabelBegin = Asm->createTempSymbol("cu_macro_begin");
}
/// addLabelAddress - Add a dwarf label attribute data and value using
@@ -83,8 +84,8 @@ static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
// First operand points to a global struct.
Value *Ptr = CE->getOperand(0);
- if (!isa<GlobalValue>(Ptr) ||
- !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
+ GlobalValue *GV = dyn_cast<GlobalValue>(Ptr);
+ if (!GV || !isa<StructType>(GV->getValueType()))
return nullptr;
// Second operand is zero.
@@ -147,61 +148,69 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
// Add location.
bool addToAccelTable = false;
if (auto *Global = dyn_cast_or_null<GlobalVariable>(GV->getVariable())) {
- addToAccelTable = true;
- DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- const MCSymbol *Sym = Asm->getSymbol(Global);
- if (Global->isThreadLocal()) {
- if (Asm->TM.Options.EmulatedTLS) {
- // TODO: add debug info for emulated thread local mode.
- } else {
- // FIXME: Make this work with -gsplit-dwarf.
- unsigned PointerSize = Asm->getDataLayout().getPointerSize();
- assert((PointerSize == 4 || PointerSize == 8) &&
- "Add support for other sizes if necessary");
- // Based on GCC's support for TLS:
- if (!DD->useSplitDwarf()) {
- // 1) Start with a constNu of the appropriate pointer size
- addUInt(*Loc, dwarf::DW_FORM_data1, PointerSize == 4
- ? dwarf::DW_OP_const4u
- : dwarf::DW_OP_const8u);
- // 2) containing the (relocated) offset of the TLS variable
- // within the module's TLS block.
- addExpr(*Loc, dwarf::DW_FORM_udata,
- Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ // We cannot describe the location of dllimport'd variables: the computation
+ // of their address requires loads from the IAT.
+ if (!Global->hasDLLImportStorageClass()) {
+ addToAccelTable = true;
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ const MCSymbol *Sym = Asm->getSymbol(Global);
+ if (Global->isThreadLocal()) {
+ if (Asm->TM.Options.EmulatedTLS) {
+ // TODO: add debug info for emulated thread local mode.
} else {
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
- addUInt(*Loc, dwarf::DW_FORM_udata,
- DD->getAddressPool().getIndex(Sym, /* TLS */ true));
+ // FIXME: Make this work with -gsplit-dwarf.
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ // Based on GCC's support for TLS:
+ if (!DD->useSplitDwarf()) {
+            // 1) Start with a constu of the appropriate pointer size
+ addUInt(*Loc, dwarf::DW_FORM_data1, PointerSize == 4
+ ? dwarf::DW_OP_const4u
+ : dwarf::DW_OP_const8u);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(*Loc, dwarf::DW_FORM_udata,
+ Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ } else {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ DD->getAddressPool().getIndex(Sym, /* TLS */ true));
+ }
+ // 3) followed by an OP to make the debugger do a TLS lookup.
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+ : dwarf::DW_OP_form_tls_address);
}
- // 3) followed by an OP to make the debugger do a TLS lookup.
- addUInt(*Loc, dwarf::DW_FORM_data1,
- DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
- : dwarf::DW_OP_form_tls_address);
+ } else {
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
}
- } else {
- DD->addArangeLabel(SymbolCU(this, Sym));
- addOpAddress(*Loc, Sym);
- }
- addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
- addLinkageName(*VariableDIE, GV->getLinkageName());
+ addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+ if (DD->useAllLinkageNames())
+ addLinkageName(*VariableDIE, GV->getLinkageName());
+ }
} else if (const ConstantInt *CI =
dyn_cast_or_null<ConstantInt>(GV->getVariable())) {
addConstantValue(*VariableDIE, CI, GTy);
} else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getVariable())) {
- addToAccelTable = true;
- // GV is a merged global.
- DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- Value *Ptr = CE->getOperand(0);
- MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
- DD->addArangeLabel(SymbolCU(this, Sym));
- addOpAddress(*Loc, Sym);
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
- addUInt(*Loc, dwarf::DW_FORM_udata,
- Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
- addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+ auto *Ptr = cast<GlobalValue>(CE->getOperand(0));
+ if (!Ptr->hasDLLImportStorageClass()) {
+ addToAccelTable = true;
+ // GV is a merged global.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ MCSymbol *Sym = Asm->getSymbol(Ptr);
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ Asm->getDataLayout().getIndexedOffsetInType(Ptr->getValueType(),
+ Idx));
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+ }
}
if (addToAccelTable) {
@@ -285,7 +294,8 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
attachLowHighPC(*SPDie, Asm->getFunctionBegin(), Asm->getFunctionEnd());
- if (!DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
+ if (DD->useAppleExtensionAttributes() &&
+ !DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
*DD->getCurrentFunction()))
addFlag(*SPDie, dwarf::DW_AT_APPLE_omit_frame_ptr);
@@ -503,9 +513,20 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
addVariableAddress(DV, *VariableDie, Location);
} else if (RegOp.getReg())
addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg()));
- } else if (DVInsn->getOperand(0).isImm())
- addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
- else if (DVInsn->getOperand(0).isFPImm())
+ } else if (DVInsn->getOperand(0).isImm()) {
+ // This variable is described by a single constant.
+ // Check whether it has a DIExpression.
+ auto *Expr = DV.getSingleExpression();
+ if (Expr && Expr->getNumElements()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ // If there is an expression, emit raw unsigned bytes.
+ DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm());
+ DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end());
+ addBlock(*VariableDie, dwarf::DW_AT_location, Loc);
+ } else
+ addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
+ } else if (DVInsn->getOperand(0).isFPImm())
addConstantFPValue(*VariableDie, DVInsn->getOperand(0));
else if (DVInsn->getOperand(0).isCImm())
addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(),
@@ -526,7 +547,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
assert(Expr != DV.getExpression().end() && "Wrong number of expressions");
- DwarfExpr.AddMachineRegIndirect(FrameReg, Offset);
+ DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
+ FrameReg, Offset);
DwarfExpr.AddExpression((*Expr)->expr_op_begin(), (*Expr)->expr_op_end());
++Expr;
}
@@ -683,25 +705,6 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
applySubprogramAttributesToDefinition(SP, *D);
}
}
-void DwarfCompileUnit::collectDeadVariables(const DISubprogram *SP) {
- assert(SP && "CU's subprogram list contains a non-subprogram");
- assert(SP->isDefinition() &&
- "CU's subprogram list contains a subprogram declaration");
- auto Variables = SP->getVariables();
- if (Variables.size() == 0)
- return;
-
- DIE *SPDIE = DU->getAbstractSPDies().lookup(SP);
- if (!SPDIE)
- SPDIE = getDIE(SP);
- assert(SPDIE);
- for (const DILocalVariable *DV : Variables) {
- DbgVariable NewVar(DV, /* IA */ nullptr, DD);
- auto VariableDie = constructVariableDIE(NewVar);
- applyVariableAttributes(NewVar, *VariableDie);
- SPDIE->addChild(std::move(VariableDie));
- }
-}
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
@@ -770,16 +773,16 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
const MachineLocation &Location) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
- assert(DV.getExpression().size() == 1);
- const DIExpression *Expr = DV.getExpression().back();
+ const DIExpression *Expr = DV.getSingleExpression();
bool ValidReg;
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
if (Location.getOffset()) {
- ValidReg = DwarfExpr.AddMachineRegIndirect(Location.getReg(),
+ ValidReg = DwarfExpr.AddMachineRegIndirect(TRI, Location.getReg(),
Location.getOffset());
if (ValidReg)
DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end());
} else
- ValidReg = DwarfExpr.AddMachineRegExpression(Expr, Location.getReg());
+ ValidReg = DwarfExpr.AddMachineRegExpression(TRI, Expr, Location.getReg());
// Now attach the location information to the DIE.
if (ValidReg)
@@ -824,7 +827,7 @@ bool DwarfCompileUnit::isDwoUnit() const {
}
bool DwarfCompileUnit::includeMinimalInlineScopes() const {
- return getCUNode()->getEmissionKind() == DIBuilder::LineTablesOnly ||
+ return getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly ||
(DD->useSplitDwarf() && !Skeleton);
}
} // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 2e2846790cc1..90f74a3686ea 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,12 +15,12 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
#include "DwarfUnit.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Dwarf.h"
namespace llvm {
+class StringRef;
class AsmPrinter;
class DIE;
class DwarfDebug;
@@ -29,6 +29,12 @@ class MCSymbol;
class LexicalScope;
class DwarfCompileUnit : public DwarfUnit {
+  /// A numeric ID unique among all CUs in the module.
+  unsigned UniqueID;
+
+  /// Offset of the UnitDie from the beginning of the debug info section.
+ unsigned DebugInfoOffset = 0;
+
/// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
/// the need to search for it in applyStmtList.
DIE::value_iterator StmtListValue;
@@ -39,6 +45,9 @@ class DwarfCompileUnit : public DwarfUnit {
/// The start of the unit within its section.
MCSymbol *LabelBegin;
+  /// The start of the unit macro info within the macro section.
+ MCSymbol *MacroLabelBegin;
+
typedef llvm::SmallVector<const MDNode *, 8> ImportedEntityList;
typedef llvm::DenseMap<const MDNode *, ImportedEntityList>
ImportedEntityMap;
@@ -74,6 +83,10 @@ public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU);
+ unsigned getUniqueID() const { return UniqueID; }
+ unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
+ void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+
DwarfCompileUnit *getSkeleton() const {
return Skeleton;
}
@@ -105,7 +118,14 @@ public:
unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
void addImportedEntity(const DIImportedEntity* IE) {
- ImportedEntities[IE->getScope()].push_back(IE);
+ DIScope *Scope = IE->getScope();
+ assert(Scope && "Invalid Scope encoding!");
+ if (!isa<DILocalScope>(Scope))
+      // No need to add imported entities that are not local declarations.
+ return;
+
+ auto *LocalScope = cast<DILocalScope>(Scope)->getNonLexicalBlockFileScope();
+ ImportedEntities[LocalScope].push_back(IE);
}
/// addRange - Add an address range to the list of ranges for this unit.
@@ -167,8 +187,6 @@ public:
void finishSubprogramDefinition(const DISubprogram *SP);
- void collectDeadVariables(const DISubprogram *SP);
-
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
@@ -189,6 +207,10 @@ public:
return LabelBegin;
}
+ MCSymbol *getMacroLabelBegin() const {
+ return MacroLabelBegin;
+ }
+
/// Add a new global name to the compile unit.
void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override;
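
For context on the addImportedEntity() change above: only imports whose scope is a DILocalScope are recorded in ImportedEntities, keyed by that scope with any DILexicalBlockFile wrapper stripped via getNonLexicalBlockFileScope(). A source-level illustration of which using-directives land in the map (the file and function names are made up for the example):

    // example.cpp, compiled with -g; the front end emits one DIImportedEntity
    // per using-directive.
    using namespace std;     // scope is the file/namespace, not a DILocalScope:
                             // addImportedEntity() now returns without storing it
    void f() {
      using namespace llvm;  // scope is f's DISubprogram (a DILocalScope): stored
                             // in ImportedEntities keyed by that subprogram, so it
                             // can be emitted under f's scope DIE later
    }
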
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index f56c8e492e52..7fba7688f7fb 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Instructions.h"
@@ -54,6 +53,7 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+
using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
@@ -105,13 +105,21 @@ DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
clEnumVal(Disable, "Disabled"), clEnumValEnd),
cl::init(Default));
-static cl::opt<DefaultOnOff>
-DwarfLinkageNames("dwarf-linkage-names", cl::Hidden,
- cl::desc("Emit DWARF linkage-name attributes."),
- cl::values(clEnumVal(Default, "Default for platform"),
- clEnumVal(Enable, "Enabled"),
- clEnumVal(Disable, "Disabled"), clEnumValEnd),
- cl::init(Default));
+enum LinkageNameOption {
+ DefaultLinkageNames,
+ AllLinkageNames,
+ AbstractLinkageNames
+};
+static cl::opt<LinkageNameOption>
+ DwarfLinkageNames("dwarf-linkage-names", cl::Hidden,
+ cl::desc("Which DWARF linkage-name attributes to emit."),
+ cl::values(clEnumValN(DefaultLinkageNames, "Default",
+ "Default for platform"),
+ clEnumValN(AllLinkageNames, "All", "All"),
+ clEnumValN(AbstractLinkageNames, "Abstract",
+ "Abstract subprograms"),
+ clEnumValEnd),
+ cl::init(DefaultLinkageNames));
static const char *const DWARFGroupName = "DWARF Emission";
static const char *const DbgTimerName = "DWARF Debug Writer";
@@ -130,28 +138,21 @@ void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) {
BS.EmitULEB128(Value, Twine(Value));
}
-bool DebugLocDwarfExpression::isFrameRegister(unsigned MachineReg) {
+bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) {
// This information is not available while emitting .debug_loc entries.
return false;
}
//===----------------------------------------------------------------------===//
-/// resolve - Look in the DwarfDebug map for the MDNode that
-/// corresponds to the reference.
-template <typename T> T *DbgVariable::resolve(TypedDINodeRef<T> Ref) const {
- return DD->resolve(Ref);
-}
-
bool DbgVariable::isBlockByrefVariable() const {
assert(Var && "Invalid complex DbgVariable!");
- return Var->getType()
- .resolve(DD->getTypeIdentifierMap())
- ->isBlockByrefStruct();
+ return Var->getType().resolve()->isBlockByrefStruct();
}
const DIType *DbgVariable::getType() const {
- DIType *Ty = Var->getType().resolve(DD->getTypeIdentifierMap());
+ DIType *Ty = Var->getType().resolve();
// FIXME: isBlockByrefVariable should be reformulated in terms of complex
// addresses instead.
if (Ty->isBlockByrefStruct()) {
@@ -201,8 +202,8 @@ static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = {
DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
- : Asm(A), MMI(Asm->MMI), DebugLocs(A->OutStreamer->isVerboseAsm()),
- PrevLabel(nullptr), InfoHolder(A, "info_string", DIEValueAllocator),
+ : DebugHandlerBase(A), DebugLocs(A->OutStreamer->isVerboseAsm()),
+ InfoHolder(A, "info_string", DIEValueAllocator),
SkeletonHolder(A, "skel_string", DIEValueAllocator),
IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()),
AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
@@ -214,7 +215,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
AccelTypes(TypeAtoms), DebuggerTuning(DebuggerKind::Default) {
CurFn = nullptr;
- CurMI = nullptr;
Triple TT(Asm->getTargetTriple());
// Make sure we know our "debugger tuning." The target option takes
@@ -234,6 +234,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
else
HasDwarfAccelTables = DwarfAccelTables == Enable;
+ HasAppleExtensionAttributes = tuneForLLDB();
+
// Handle split DWARF. Off by default for now.
if (SplitDwarf == Default)
HasSplitDwarf = false;
@@ -246,11 +248,11 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
else
HasDwarfPubSections = DwarfPubSections == Enable;
- // SCE does not use linkage names.
- if (DwarfLinkageNames == Default)
- UseLinkageNames = !tuneForSCE();
+ // SCE defaults to linkage names only for abstract subprograms.
+ if (DwarfLinkageNames == DefaultLinkageNames)
+ UseAllLinkageNames = !tuneForSCE();
else
- UseLinkageNames = DwarfLinkageNames == Enable;
+ UseAllLinkageNames = DwarfLinkageNames == AllLinkageNames;
unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion;
DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber
@@ -265,12 +267,10 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
// https://sourceware.org/bugzilla/show_bug.cgi?id=11616
UseGNUTLSOpcode = tuneForGDB() || DwarfVersion < 3;
- Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
+ // GDB does not fully support the DWARF 4 representation for bitfields.
+ UseDWARF2Bitfields = (DwarfVersion < 4) || tuneForGDB();
- {
- NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
- beginModule();
- }
+ Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
}
// Define out of line so we don't have to include DwarfUnit.h in DwarfDebug.h.
@@ -297,7 +297,6 @@ static void getObjCClassCategory(StringRef In, StringRef &Class,
Class = In.slice(In.find('[') + 1, In.find('('));
Category = In.slice(In.find('[') + 1, In.find(' '));
- return;
}
static StringRef getObjCMethodName(StringRef In) {
@@ -367,8 +366,8 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- auto &CU = SPMap[SP];
- forBothCUs(*CU, [&](DwarfCompileUnit &CU) {
+ auto &CU = *CUMap.lookup(cast<DISubprogram>(SP)->getUnit());
+ forBothCUs(CU, [&](DwarfCompileUnit &CU) {
CU.constructAbstractSubprogramScopeDIE(Scope);
});
}
@@ -392,8 +391,11 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
DwarfCompileUnit &NewCU = *OwnedUnit;
DIE &Die = NewCU.getUnitDie();
InfoHolder.addUnit(std::move(OwnedUnit));
- if (useSplitDwarf())
+ if (useSplitDwarf()) {
NewCU.setSkeleton(constructSkeletonCU(NewCU));
+ NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
+ DIUnit->getSplitDebugFilename());
+ }
// LTO with assembly output shares a single line table amongst multiple CUs.
// To avoid the compilation directory being ambiguous, let the line table
@@ -419,16 +421,18 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
addGnuPubAttributes(NewCU, Die);
}
- if (DIUnit->isOptimized())
- NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized);
+ if (useAppleExtensionAttributes()) {
+ if (DIUnit->isOptimized())
+ NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized);
- StringRef Flags = DIUnit->getFlags();
- if (!Flags.empty())
- NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags);
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty())
+ NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags);
- if (unsigned RVer = DIUnit->getRuntimeVersion())
- NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
- dwarf::DW_FORM_data1, RVer);
+ if (unsigned RVer = DIUnit->getRuntimeVersion())
+ NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+ dwarf::DW_FORM_data1, RVer);
+ }
if (useSplitDwarf())
NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection());
@@ -460,48 +464,42 @@ void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
// global DIEs and emit initial debug info sections. This is invoked by
// the target AsmPrinter.
void DwarfDebug::beginModule() {
+ NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
if (DisableDebugInfoPrinting)
return;
const Module *M = MMI->getModule();
- NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
- if (!CU_Nodes)
- return;
- TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
-
- SingleCU = CU_Nodes->getNumOperands() == 1;
+ unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
+ M->debug_compile_units_end());
+ // Tell MMI whether we have debug info.
+ MMI->setDebugInfoAvailability(NumDebugCUs > 0);
+ SingleCU = NumDebugCUs == 1;
- for (MDNode *N : CU_Nodes->operands()) {
- auto *CUNode = cast<DICompileUnit>(N);
+ for (DICompileUnit *CUNode : M->debug_compile_units()) {
DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode);
for (auto *IE : CUNode->getImportedEntities())
CU.addImportedEntity(IE);
for (auto *GV : CUNode->getGlobalVariables())
CU.getOrCreateGlobalVariableDIE(GV);
- for (auto *SP : CUNode->getSubprograms())
- SPMap.insert(std::make_pair(SP, &CU));
for (auto *Ty : CUNode->getEnumTypes()) {
// The enum types array by design contains pointers to
// MDNodes rather than DIRefs. Unique them here.
- CU.getOrCreateTypeDIE(cast<DIType>(resolve(Ty->getRef())));
+ CU.getOrCreateTypeDIE(cast<DIType>(Ty));
}
for (auto *Ty : CUNode->getRetainedTypes()) {
// The retained types array by design contains pointers to
// MDNodes rather than DIRefs. Unique them here.
- DIType *RT = cast<DIType>(resolve(Ty->getRef()));
- if (!RT->isExternalTypeRef())
- // There is no point in force-emitting a forward declaration.
- CU.getOrCreateTypeDIE(RT);
+ if (DIType *RT = dyn_cast<DIType>(Ty))
+ if (!RT->isExternalTypeRef())
+ // There is no point in force-emitting a forward declaration.
+ CU.getOrCreateTypeDIE(RT);
}
// Emit imported_modules last so that the relevant context is already
// available.
for (auto *IE : CUNode->getImportedEntities())
constructAndAddImportedEntityDIE(CU, IE);
}
-
- // Tell MMI that we have debug info.
- MMI->setDebugInfoAvailability(true);
}
void DwarfDebug::finishVariableDefinitions() {
@@ -524,31 +522,13 @@ void DwarfDebug::finishVariableDefinitions() {
}
void DwarfDebug::finishSubprogramDefinitions() {
- for (const auto &P : SPMap)
- forBothCUs(*P.second, [&](DwarfCompileUnit &CU) {
- CU.finishSubprogramDefinition(cast<DISubprogram>(P.first));
- });
-}
-
-
-// Collect info for variables that were optimized out.
-void DwarfDebug::collectDeadVariables() {
- const Module *M = MMI->getModule();
-
- if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
- for (MDNode *N : CU_Nodes->operands()) {
- auto *TheCU = cast<DICompileUnit>(N);
- // Construct subprogram DIE and add variables DIEs.
- DwarfCompileUnit *SPCU =
- static_cast<DwarfCompileUnit *>(CUMap.lookup(TheCU));
- assert(SPCU && "Unable to find Compile Unit!");
- for (auto *SP : TheCU->getSubprograms()) {
- if (ProcessedSPNodes.count(SP) != 0)
- continue;
- SPCU->collectDeadVariables(SP);
- }
- }
- }
+ for (auto &F : MMI->getModule()->functions())
+ if (auto *SP = F.getSubprogram())
+ if (ProcessedSPNodes.count(SP) &&
+ SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug)
+ forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) {
+ CU.finishSubprogramDefinition(SP);
+ });
}
void DwarfDebug::finalizeModuleInfo() {
@@ -558,11 +538,6 @@ void DwarfDebug::finalizeModuleInfo() {
finishVariableDefinitions();
- // Collect info for variables that were optimized out.
- collectDeadVariables();
-
- unsigned MacroOffset = 0;
- std::unique_ptr<AsmStreamerBase> AS(new SizeReporterAsmStreamer(Asm));
// Handle anything that needs to be done on a per-unit basis after
// all other generation.
for (const auto &P : CUMap) {
@@ -617,13 +592,11 @@ void DwarfDebug::finalizeModuleInfo() {
}
auto *CUNode = cast<DICompileUnit>(P.first);
- if (CUNode->getMacros()) {
- // Compile Unit has macros, emit "DW_AT_macro_info" attribute.
- U.addUInt(U.getUnitDie(), dwarf::DW_AT_macro_info,
- dwarf::DW_FORM_sec_offset, MacroOffset);
- // Update macro section offset
- MacroOffset += handleMacroNodes(AS.get(), CUNode->getMacros(), U);
- }
+    // If the compile unit has macros, emit a "DW_AT_macro_info" attribute.
+ if (CUNode->getMacros())
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoSection()->getBeginSymbol());
}
// Compute DIE offsets and sizes.
@@ -694,7 +667,6 @@ void DwarfDebug::endModule() {
}
// clean up.
- SPMap.clear();
AbstractVariables.clear();
}
@@ -717,7 +689,7 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
LexicalScope *Scope) {
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr, this);
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
AbstractVariables[Var] = std::move(AbsDbgVariable);
}
@@ -761,7 +733,7 @@ void DwarfDebug::collectVariableInfoFromMMITable(
continue;
ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
- auto RegVar = make_unique<DbgVariable>(Var.first, Var.second, this);
+ auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
ConcreteVariables.push_back(std::move(RegVar));
@@ -793,29 +765,6 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
}
-// Determine the relative position of the pieces described by P1 and P2.
-// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
-// 1 if P1 is entirely after P2.
-static int pieceCmp(const DIExpression *P1, const DIExpression *P2) {
- unsigned l1 = P1->getBitPieceOffset();
- unsigned l2 = P2->getBitPieceOffset();
- unsigned r1 = l1 + P1->getBitPieceSize();
- unsigned r2 = l2 + P2->getBitPieceSize();
- if (r1 <= l2)
- return -1;
- else if (r2 <= l1)
- return 1;
- else
- return 0;
-}
-
-/// Determine whether two variable pieces overlap.
-static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
- if (!P1->isBitPiece() || !P2->isBitPiece())
- return true;
- return pieceCmp(P1, P2) == 0;
-}
-
/// \brief If this and Next are describing different pieces of the same
/// variable, merge them by appending Next's values to the current
/// list of values.
@@ -832,8 +781,9 @@ bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
// sorted.
for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
for (; j < Next.Values.size(); ++j) {
- int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
- cast<DIExpression>(Next.Values[j].Expression));
+ int res = DebugHandlerBase::pieceCmp(
+ cast<DIExpression>(Values[i].Expression),
+ cast<DIExpression>(Next.Values[j].Expression));
if (res == 0) // The two expressions overlap, we can't merge.
return false;
// Values[i] is entirely before Next.Values[j],
@@ -944,7 +894,7 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
DEBUG({
dbgs() << CurEntry->getValues().size() << " Values:\n";
for (auto &Value : CurEntry->getValues())
- Value.getExpression()->dump();
+ Value.dump();
dbgs() << "-----\n";
});
@@ -957,12 +907,23 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
InlinedVariable IV) {
ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
- ConcreteVariables.push_back(
- make_unique<DbgVariable>(IV.first, IV.second, this));
+ ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
return ConcreteVariables.back().get();
}
+// Determine whether this DBG_VALUE is valid at the beginning of the function.
+static bool validAtEntry(const MachineInstr *MInsn) {
+ auto MBB = MInsn->getParent();
+ // Is it in the entry basic block?
+ if (!MBB->pred_empty())
+ return false;
+ for (MachineBasicBlock::const_reverse_iterator I(MInsn); I != MBB->rend(); ++I)
+ if (!(I->isDebugValue() || I->getFlag(MachineInstr::FrameSetup)))
+ return false;
+ return true;
+}
+
// Find variables for each lexical scope.
void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const DISubprogram *SP,
@@ -995,8 +956,11 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
- // Check if the first DBG_VALUE is valid for the rest of the function.
- if (Ranges.size() == 1 && Ranges.front().second == nullptr) {
+ // Check if there is a single DBG_VALUE, valid throughout the function.
+ // A single constant is also considered valid for the entire function.
+ if (Ranges.size() == 1 &&
+ (MInsn->getOperand(0).isImm() ||
+ (validAtEntry(MInsn) && Ranges.front().second == nullptr))) {
RegVar->initializeDbgValue(MInsn);
continue;
}
@@ -1008,7 +972,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
SmallVector<DebugLocEntry, 8> Entries;
buildLocationList(Entries, Ranges);
- // If the variable has an DIBasicType, extract it. Basic types cannot have
+ // If the variable has a DIBasicType, extract it. Basic types cannot have
// unique identifiers, so don't bother resolving the type with the
// identifier map.
const DIBasicType *BT = dyn_cast<DIBasicType>(
@@ -1027,25 +991,14 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
}
}
-// Return Label preceding the instruction.
-MCSymbol *DwarfDebug::getLabelBeforeInsn(const MachineInstr *MI) {
- MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
- assert(Label && "Didn't insert label before instruction");
- return Label;
-}
-
-// Return Label immediately following the instruction.
-MCSymbol *DwarfDebug::getLabelAfterInsn(const MachineInstr *MI) {
- return LabelsAfterInsn.lookup(MI);
-}
-
// Process beginning of an instruction.
void DwarfDebug::beginInstruction(const MachineInstr *MI) {
- assert(CurMI == nullptr);
- CurMI = MI;
+ DebugHandlerBase::beginInstruction(MI);
+ assert(CurMI);
+
// Check if source location changes, but ignore DBG_VALUE locations.
if (!MI->isDebugValue()) {
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
if (DL != PrevInstLoc) {
if (DL) {
unsigned Flags = 0;
@@ -1067,78 +1020,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
}
}
}
-
- // Insert labels where requested.
- DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
- LabelsBeforeInsn.find(MI);
-
- // No label needed.
- if (I == LabelsBeforeInsn.end())
- return;
-
- // Label already assigned.
- if (I->second)
- return;
-
- if (!PrevLabel) {
- PrevLabel = MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(PrevLabel);
- }
- I->second = PrevLabel;
-}
-
-// Process end of an instruction.
-void DwarfDebug::endInstruction() {
- assert(CurMI != nullptr);
- // Don't create a new label after DBG_VALUE instructions.
- // They don't generate code.
- if (!CurMI->isDebugValue())
- PrevLabel = nullptr;
-
- DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
- LabelsAfterInsn.find(CurMI);
- CurMI = nullptr;
-
- // No label needed.
- if (I == LabelsAfterInsn.end())
- return;
-
- // Label already assigned.
- if (I->second)
- return;
-
- // We need a label after this instruction.
- if (!PrevLabel) {
- PrevLabel = MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(PrevLabel);
- }
- I->second = PrevLabel;
-}
-
-// Each LexicalScope has first instruction and last instruction to mark
-// beginning and end of a scope respectively. Create an inverse map that list
-// scopes starts (and ends) with an instruction. One instruction may start (or
-// end) multiple scopes. Ignore scopes that are not reachable.
-void DwarfDebug::identifyScopeMarkers() {
- SmallVector<LexicalScope *, 4> WorkList;
- WorkList.push_back(LScopes.getCurrentFunctionScope());
- while (!WorkList.empty()) {
- LexicalScope *S = WorkList.pop_back_val();
-
- const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
- if (!Children.empty())
- WorkList.append(Children.begin(), Children.end());
-
- if (S->isAbstractScope())
- continue;
-
- for (const InsnRange &R : S->getRanges()) {
- assert(R.first && "InsnRange does not have first instruction!");
- assert(R.second && "InsnRange does not have second instruction!");
- requestLabelBeforeInsn(R.first);
- requestLabelAfterInsn(R.second);
- }
- }
}
static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
@@ -1167,15 +1048,10 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Grab the lexical scopes for the function, if we don't have any of those
// then we're not going to be able to do anything.
- LScopes.initialize(*MF);
+ DebugHandlerBase::beginFunction(MF);
if (LScopes.empty())
return;
- assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
-
- // Make sure that each lexical scope will have a begin/end label.
- identifyScopeMarkers();
-
// Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
// belongs to so that we add to the correct per-cu line table in the
// non-asm case.
@@ -1188,55 +1064,19 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// isn't structurally identical (see: file path/name info from clang, which
// includes the directory of the cpp file being built, even when the file name
// is absolute (such as an <> lookup header)))
- DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
- assert(TheCU && "Unable to find compile unit!");
+ auto *SP = cast<DISubprogram>(FnScope->getScopeNode());
+ DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit());
+ if (!TheCU) {
+ assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug &&
+ "DICompileUnit missing from llvm.dbg.cu?");
+ return;
+ }
if (Asm->OutStreamer->hasRawTextSupport())
// Use a single line table if we are generating assembly.
Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
else
Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
- // Calculate history for local variables.
- calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
- DbgValues);
-
- // Request labels for the full history.
- for (const auto &I : DbgValues) {
- const auto &Ranges = I.second;
- if (Ranges.empty())
- continue;
-
- // The first mention of a function argument gets the CurrentFnBegin
- // label, so arguments are visible when breaking at function entry.
- const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable();
- if (DIVar->isParameter() &&
- getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) {
- LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin();
- if (Ranges.front().first->getDebugExpression()->isBitPiece()) {
- // Mark all non-overlapping initial pieces.
- for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
- const DIExpression *Piece = I->first->getDebugExpression();
- if (std::all_of(Ranges.begin(), I,
- [&](DbgValueHistoryMap::InstrRange Pred) {
- return !piecesOverlap(Piece, Pred.first->getDebugExpression());
- }))
- LabelsBeforeInsn[I->first] = Asm->getFunctionBegin();
- else
- break;
- }
- }
- }
-
- for (const auto &Range : Ranges) {
- requestLabelBeforeInsn(Range.first);
- if (Range.second)
- requestLabelAfterInsn(Range.second);
- }
- }
-
- PrevInstLoc = DebugLoc();
- PrevLabel = Asm->getFunctionBegin();
-
// Record beginning of function.
PrologEndLoc = findPrologueEndLoc(MF);
if (DILocation *L = PrologEndLoc) {
@@ -1252,13 +1092,19 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
assert(CurFn == MF &&
"endFunction should be called with the same function as beginFunction");
- if (!MMI->hasDebugInfo() || LScopes.empty() ||
- !MF->getFunction()->getSubprogram()) {
+ const DISubprogram *SP = MF->getFunction()->getSubprogram();
+ if (!MMI->hasDebugInfo() || LScopes.empty() || !SP ||
+ SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
// If we don't have a lexical scope for this function then there will
// be a hole in the range information. Keep note of this by setting the
// previously used section to nullptr.
PrevCU = nullptr;
CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
+    // Mark functions with no debug info on any instructions, but a
+    // valid DISubprogram, as processed.
+ if (SP)
+ ProcessedSPNodes.insert(SP);
return;
}
@@ -1266,8 +1112,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
- auto *SP = cast<DISubprogram>(FnScope->getScopeNode());
- DwarfCompileUnit &TheCU = *SPMap.lookup(SP);
+ SP = cast<DISubprogram>(FnScope->getScopeNode());
+ DwarfCompileUnit &TheCU = *CUMap.lookup(SP->getUnit());
DenseSet<InlinedVariable> ProcessedVars;
collectVariableInfo(TheCU, SP, ProcessedVars);
@@ -1277,17 +1123,16 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
// Under -gmlt, skip building the subprogram if there are no inlined
// subroutines inside it.
- if (TheCU.getCUNode()->getEmissionKind() == DIBuilder::LineTablesOnly &&
+ if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
LScopes.getAbstractScopesList().empty() && !IsDarwin) {
assert(InfoHolder.getScopeVariables().empty());
assert(DbgValues.empty());
// FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
// by a -gmlt CU. Add a test and remove this assertion.
assert(AbstractVariables.empty());
- LabelsBeforeInsn.clear();
- LabelsAfterInsn.clear();
PrevLabel = nullptr;
CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
return;
}
@@ -1319,11 +1164,9 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
// DbgVariables except those that are also in AbstractVariables (since they
// can be used cross-function)
InfoHolder.getScopeVariables().clear();
- DbgValues.clear();
- LabelsBeforeInsn.clear();
- LabelsAfterInsn.clear();
PrevLabel = nullptr;
CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
}
// Register a source line with debug info. Returns the unique label that was
@@ -1535,7 +1378,7 @@ void DwarfDebug::emitDebugPubTypes(bool GnuStyle) {
&DwarfCompileUnit::getGlobalTypes);
}
-// Emit visible names into a debug str section.
+/// Emit null-terminated strings into a debug str section.
void DwarfDebug::emitDebugStr() {
DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
@@ -1554,8 +1397,7 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
ByteStreamer &Streamer,
const DebugLocEntry::Value &Value,
unsigned PieceOffsetInBits) {
- DebugLocDwarfExpression DwarfExpr(*AP.MF->getSubtarget().getRegisterInfo(),
- AP.getDwarfDebug()->getDwarfVersion(),
+ DebugLocDwarfExpression DwarfExpr(AP.getDwarfDebug()->getDwarfVersion(),
Streamer);
// Regular entry.
if (Value.isInt()) {
@@ -1572,18 +1414,19 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
AP.EmitDwarfRegOp(Streamer, Loc);
else {
// Complex address entry.
+ const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
if (Loc.getOffset()) {
- DwarfExpr.AddMachineRegIndirect(Loc.getReg(), Loc.getOffset());
+ DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end(),
PieceOffsetInBits);
} else
- DwarfExpr.AddMachineRegExpression(Expr, Loc.getReg(),
+ DwarfExpr.AddMachineRegExpression(TRI, Expr, Loc.getReg(),
PieceOffsetInBits);
}
+ } else if (Value.isConstantFP()) {
+ APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
+ DwarfExpr.AddUnsignedConstant(RawBytes);
}
- // else ... ignore constant fp. There is not any good way to
- // to represent them here in dwarf.
- // FIXME: ^
}
void DebugLocEntry::finalize(const AsmPrinter &AP,
@@ -1608,8 +1451,7 @@ void DebugLocEntry::finalize(const AsmPrinter &AP,
assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
if (Offset < PieceOffset) {
// The DWARF spec seriously mandates pieces with no locations for gaps.
- DebugLocDwarfExpression Expr(*AP.MF->getSubtarget().getRegisterInfo(),
- AP.getDwarfDebug()->getDwarfVersion(),
+ DebugLocDwarfExpression Expr(AP.getDwarfDebug()->getDwarfVersion(),
Streamer);
Expr.AddOpPiece(PieceOffset-Offset, 0);
Offset += PieceOffset-Offset;
@@ -1708,24 +1550,12 @@ void DwarfDebug::emitDebugARanges() {
}
}
- // Add terminating symbols for each section.
- for (const auto &I : SectionMap) {
- MCSection *Section = I.first;
- MCSymbol *Sym = nullptr;
-
- if (Section)
- Sym = Asm->OutStreamer->endSection(Section);
-
- // Insert a final terminator.
- SectionMap[Section].push_back(SymbolCU(nullptr, Sym));
- }
-
DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> Spans;
for (auto &I : SectionMap) {
- const MCSection *Section = I.first;
+ MCSection *Section = I.first;
SmallVector<SymbolCU, 8> &List = I.second;
- if (List.size() < 2)
+ if (List.size() < 1)
continue;
// If we have no section (e.g. common), just write out
@@ -1735,26 +1565,29 @@ void DwarfDebug::emitDebugARanges() {
ArangeSpan Span;
Span.Start = Cur.Sym;
Span.End = nullptr;
- if (Cur.CU)
- Spans[Cur.CU].push_back(Span);
+ assert(Cur.CU);
+ Spans[Cur.CU].push_back(Span);
}
continue;
}
// Sort the symbols by offset within the section.
- std::sort(List.begin(), List.end(),
- [&](const SymbolCU &A, const SymbolCU &B) {
- unsigned IA = A.Sym ? Asm->OutStreamer->GetSymbolOrder(A.Sym) : 0;
- unsigned IB = B.Sym ? Asm->OutStreamer->GetSymbolOrder(B.Sym) : 0;
-
- // Symbols with no order assigned should be placed at the end.
- // (e.g. section end labels)
- if (IA == 0)
- return false;
- if (IB == 0)
- return true;
- return IA < IB;
- });
+ std::sort(
+ List.begin(), List.end(), [&](const SymbolCU &A, const SymbolCU &B) {
+ unsigned IA = A.Sym ? Asm->OutStreamer->GetSymbolOrder(A.Sym) : 0;
+ unsigned IB = B.Sym ? Asm->OutStreamer->GetSymbolOrder(B.Sym) : 0;
+
+ // Symbols with no order assigned should be placed at the end.
+ // (e.g. section end labels)
+ if (IA == 0)
+ return false;
+ if (IB == 0)
+ return true;
+ return IA < IB;
+ });
+
+ // Insert a final terminator.
+ List.push_back(SymbolCU(nullptr, Asm->OutStreamer->endSection(Section)));
// Build spans between each label.
const MCSymbol *StartSym = List[0].Sym;
@@ -1767,6 +1600,7 @@ void DwarfDebug::emitDebugARanges() {
ArangeSpan Span;
Span.Start = StartSym;
Span.End = Cur.Sym;
+ assert(Prev.CU);
Spans[Prev.CU].push_back(Span);
StartSym = Cur.Sym;
}
@@ -1787,9 +1621,10 @@ void DwarfDebug::emitDebugARanges() {
}
// Sort the CU list (again, to ensure consistent output order).
- std::sort(CUs.begin(), CUs.end(), [](const DwarfUnit *A, const DwarfUnit *B) {
- return A->getUniqueID() < B->getUniqueID();
- });
+ std::sort(CUs.begin(), CUs.end(),
+ [](const DwarfCompileUnit *A, const DwarfCompileUnit *B) {
+ return A->getUniqueID() < B->getUniqueID();
+ });
// Emit an arange table for each CU we used.
for (DwarfCompileUnit *CU : CUs) {
@@ -1827,7 +1662,7 @@ void DwarfDebug::emitDebugARanges() {
Asm->OutStreamer->AddComment("Segment Size (in bytes)");
Asm->EmitInt8(0);
- Asm->OutStreamer->EmitFill(Padding, 0xff);
+ Asm->OutStreamer->emitFill(Padding, 0xff);
for (const ArangeSpan &Span : List) {
Asm->EmitLabelReference(Span.Start, PtrSize);
@@ -1852,7 +1687,7 @@ void DwarfDebug::emitDebugARanges() {
}
}
-// Emit visible names into a debug ranges section.
+/// Emit address ranges into a debug ranges section.
void DwarfDebug::emitDebugRanges() {
// Start the dwarf ranges section.
Asm->OutStreamer->SwitchSection(
@@ -1894,65 +1729,56 @@ void DwarfDebug::emitDebugRanges() {
}
}
-unsigned DwarfDebug::handleMacroNodes(AsmStreamerBase *AS,
- DIMacroNodeArray Nodes,
- DwarfCompileUnit &U) {
- unsigned Size = 0;
+void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
for (auto *MN : Nodes) {
if (auto *M = dyn_cast<DIMacro>(MN))
- Size += emitMacro(AS, *M);
+ emitMacro(*M);
else if (auto *F = dyn_cast<DIMacroFile>(MN))
- Size += emitMacroFile(AS, *F, U);
+ emitMacroFile(*F, U);
else
llvm_unreachable("Unexpected DI type!");
}
- return Size;
}
-unsigned DwarfDebug::emitMacro(AsmStreamerBase *AS, DIMacro &M) {
- int Size = 0;
- Size += AS->emitULEB128(M.getMacinfoType());
- Size += AS->emitULEB128(M.getLine());
+void DwarfDebug::emitMacro(DIMacro &M) {
+ Asm->EmitULEB128(M.getMacinfoType());
+ Asm->EmitULEB128(M.getLine());
StringRef Name = M.getName();
StringRef Value = M.getValue();
- Size += AS->emitBytes(Name);
+ Asm->OutStreamer->EmitBytes(Name);
if (!Value.empty()) {
// There should be one space between macro name and macro value.
- Size += AS->emitInt8(' ');
- Size += AS->emitBytes(Value);
+ Asm->EmitInt8(' ');
+ Asm->OutStreamer->EmitBytes(Value);
}
- Size += AS->emitInt8('\0');
- return Size;
+ Asm->EmitInt8('\0');
}
-unsigned DwarfDebug::emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F,
- DwarfCompileUnit &U) {
- int Size = 0;
+void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file);
- Size += AS->emitULEB128(dwarf::DW_MACINFO_start_file);
- Size += AS->emitULEB128(F.getLine());
+ Asm->EmitULEB128(dwarf::DW_MACINFO_start_file);
+ Asm->EmitULEB128(F.getLine());
DIFile *File = F.getFile();
unsigned FID =
U.getOrCreateSourceID(File->getFilename(), File->getDirectory());
- Size += AS->emitULEB128(FID);
- Size += handleMacroNodes(AS, F.getElements(), U);
- Size += AS->emitULEB128(dwarf::DW_MACINFO_end_file);
- return Size;
+ Asm->EmitULEB128(FID);
+ handleMacroNodes(F.getElements(), U);
+ Asm->EmitULEB128(dwarf::DW_MACINFO_end_file);
}
-// Emit visible names into a debug macinfo section.
+/// Emit macros into a debug macinfo section.
void DwarfDebug::emitDebugMacinfo() {
- if (MCSection *Macinfo = Asm->getObjFileLowering().getDwarfMacinfoSection()) {
- // Start the dwarf macinfo section.
- Asm->OutStreamer->SwitchSection(Macinfo);
- }
- std::unique_ptr<AsmStreamerBase> AS(new EmittingAsmStreamer(Asm));
+ // Start the dwarf macinfo section.
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfMacinfoSection());
+
for (const auto &P : CUMap) {
auto &TheCU = *P.second;
auto *SkCU = TheCU.getSkeleton();
DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
auto *CUNode = cast<DICompileUnit>(P.first);
- handleMacroNodes(AS.get(), CUNode->getMacros(), U);
+ Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin());
+ handleMacroNodes(CUNode->getMacros(), U);
}
Asm->OutStreamer->AddComment("End Of Macro List Mark");
Asm->EmitInt8(0);
@@ -1961,7 +1787,7 @@ void DwarfDebug::emitDebugMacinfo() {
// DWARF5 Experimental Separate Dwarf emitters.
void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
- std::unique_ptr<DwarfUnit> NewU) {
+ std::unique_ptr<DwarfCompileUnit> NewU) {
NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
U.getCUNode()->getSplitDebugFilename());
@@ -2050,21 +1876,19 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed())
return;
- const DwarfTypeUnit *&TU = DwarfTypeUnits[CTy];
- if (TU) {
- CU.addDIETypeSignature(RefDie, *TU);
+ auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0));
+ if (!Ins.second) {
+ CU.addDIETypeSignature(RefDie, Ins.first->second);
return;
}
bool TopLevelType = TypeUnitsUnderConstruction.empty();
AddrPool.resetUsedFlag();
- auto OwnedUnit = make_unique<DwarfTypeUnit>(
- InfoHolder.getUnits().size() + TypeUnitsUnderConstruction.size(), CU, Asm,
- this, &InfoHolder, getDwoLineTable(CU));
+ auto OwnedUnit = make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
+ getDwoLineTable(CU));
DwarfTypeUnit &NewTU = *OwnedUnit;
DIE &UnitDie = NewTU.getUnitDie();
- TU = &NewTU;
TypeUnitsUnderConstruction.push_back(
std::make_pair(std::move(OwnedUnit), CTy));
@@ -2073,6 +1897,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
uint64_t Signature = makeTypeSignature(Identifier);
NewTU.setTypeSignature(Signature);
+ Ins.first->second = Signature;
if (useSplitDwarf())
NewTU.initSection(Asm->getObjFileLowering().getDwarfTypesDWOSection());
@@ -2096,7 +1921,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
// This is pessimistic as some of these types might not be dependent on
// the type that used an address.
for (const auto &TU : TypeUnitsToAdd)
- DwarfTypeUnits.erase(TU.second);
+ TypeSignatures.erase(TU.second);
// Construct this type in the CU directly.
// This is inefficient because all the dependent types will be rebuilt
@@ -2108,10 +1933,12 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
// If the type wasn't dependent on fission addresses, finish adding the type
// and all its dependent types.
- for (auto &TU : TypeUnitsToAdd)
- InfoHolder.addUnit(std::move(TU.first));
+ for (auto &TU : TypeUnitsToAdd) {
+ InfoHolder.computeSizeAndOffsetsForUnit(TU.first.get());
+ InfoHolder.emitUnit(TU.first.get(), useSplitDwarf());
+ }
}
- CU.addDIETypeSignature(RefDie, NewTU);
+ CU.addDIETypeSignature(RefDie, Signature);
}
// Accelerator table mutators - add each name along with its companion
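
The addDwarfTypeUnitType() hunks above replace the pointer cache DwarfTypeUnits with a map from the type node to its 64-bit signature, and completed type units are now sized and emitted immediately instead of being handed to InfoHolder for later emission. A condensed sketch of the resulting insert-or-reference pattern (the function name is hypothetical; the fission fallback and address-pool handling from the real code are omitted):

    #include "llvm/ADT/DenseMap.h"
    using namespace llvm;

    DenseMap<const MDNode *, uint64_t> TypeSignatures;

    void referenceTypeUnit(DwarfCompileUnit &CU, DIE &RefDie,
                           const DICompositeType *CTy, StringRef Identifier) {
      auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0));
      if (!Ins.second) {
        // Already emitted (or currently under construction): just add the
        // signature reference to RefDie.
        CU.addDIETypeSignature(RefDie, Ins.first->second);
        return;
      }
      uint64_t Signature = DwarfDebug::makeTypeSignature(Identifier);
      Ins.first->second = Signature; // cache before building dependent types
      // ... build the DwarfTypeUnit, computeSizeAndOffsetsForUnit(), emitUnit() ...
      CU.addDIETypeSignature(RefDie, Signature);
    }
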
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 460c186683fc..6b06757628b6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -14,14 +14,13 @@
#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
-#include "AsmPrinterHandler.h"
#include "DbgValueHistoryCalculator.h"
+#include "DebugHandlerBase.h"
#include "DebugLocStream.h"
#include "DwarfAccelTable.h"
#include "DwarfFile.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringMap.h"
@@ -69,15 +68,14 @@ class DbgVariable {
unsigned DebugLocListIndex = ~0u; /// Offset in DebugLocs.
const MachineInstr *MInsn = nullptr; /// DBG_VALUE instruction.
SmallVector<int, 1> FrameIndex; /// Frame index.
- DwarfDebug *DD;
public:
/// Construct a DbgVariable.
///
/// Creates a variable without any DW_AT_location. Call \a initializeMMI()
/// for MMI entries, or \a initializeDbgValue() for DBG_VALUE instructions.
- DbgVariable(const DILocalVariable *V, const DILocation *IA, DwarfDebug *DD)
- : Var(V), IA(IA), DD(DD) {}
+ DbgVariable(const DILocalVariable *V, const DILocation *IA)
+ : Var(V), IA(IA) {}
/// Initialize from the MMI table.
void initializeMMI(const DIExpression *E, int FI) {
@@ -111,6 +109,10 @@ public:
const DILocalVariable *getVariable() const { return Var; }
const DILocation *getInlinedAt() const { return IA; }
ArrayRef<const DIExpression *> getExpression() const { return Expr; }
+ const DIExpression *getSingleExpression() const {
+ assert(MInsn && Expr.size() <= 1);
+ return Expr.size() ? Expr[0] : nullptr;
+ }
void setDIE(DIE &D) { TheDIE = &D; }
DIE *getDIE() const { return TheDIE; }
void setDebugLocListIndex(unsigned O) { DebugLocListIndex = O; }
@@ -174,9 +176,9 @@ public:
const DIType *getType() const;
private:
- /// Look in the DwarfDebug map for the MDNode that
- /// corresponds to the reference.
- template <typename T> T *resolve(TypedDINodeRef<T> Ref) const;
+ template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
+ return Ref.resolve();
+ }
};
@@ -188,22 +190,13 @@ struct SymbolCU {
};
/// Collects and handles dwarf debug information.
-class DwarfDebug : public AsmPrinterHandler {
- /// Target of Dwarf emission.
- AsmPrinter *Asm;
-
- /// Collected machine module information.
- MachineModuleInfo *MMI;
-
+class DwarfDebug : public DebugHandlerBase {
/// All DIEValues are allocated through this allocator.
BumpPtrAllocator DIEValueAllocator;
/// Maps MDNode with its corresponding DwarfCompileUnit.
MapVector<const MDNode *, DwarfCompileUnit *> CUMap;
- /// Maps subprogram MDNode with its corresponding DwarfCompileUnit.
- MapVector<const MDNode *, DwarfCompileUnit *> SPMap;
-
/// Maps a CU DIE with its corresponding DwarfCompileUnit.
DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap;
@@ -213,8 +206,6 @@ class DwarfDebug : public AsmPrinterHandler {
/// Size of each symbol emitted (for those symbols that have a specific size).
DenseMap<const MCSymbol *, uint64_t> SymSize;
- LexicalScopes LScopes;
-
/// Collection of abstract variables.
DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
@@ -227,32 +218,9 @@ class DwarfDebug : public AsmPrinterHandler {
/// create DIEs.
SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
- /// Maps instruction with label emitted before instruction.
- DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
-
- /// Maps instruction with label emitted after instruction.
- DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
-
- /// History of DBG_VALUE and clobber instructions for each user
- /// variable. Variables are listed in order of appearance.
- DbgValueHistoryMap DbgValues;
-
- /// Previous instruction's location information. This is used to
- /// determine label location to indicate scope boundries in dwarf
- /// debug info.
- DebugLoc PrevInstLoc;
- MCSymbol *PrevLabel;
-
- /// This location indicates end of function prologue and beginning of
- /// function body.
- DebugLoc PrologEndLoc;
-
/// If nonnull, stores the current machine function we're processing.
const MachineFunction *CurFn;
- /// If nonnull, stores the current machine instruction we're processing.
- const MachineInstr *CurMI;
-
/// If nonnull, stores the CU in which the previous subprogram was contained.
const DwarfCompileUnit *PrevCU;
@@ -266,9 +234,9 @@ class DwarfDebug : public AsmPrinterHandler {
/// Holders for the various debug information flags that we might need to
/// have exposed. See accessor functions below for description.
- /// Map from MDNodes for user-defined types to the type units that
- /// describe them.
- DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits;
+ /// Map from MDNodes for user-defined types to their type signatures. Also
+ /// used to keep track of which types we have emitted type units for.
+ DenseMap<const MDNode *, uint64_t> TypeSignatures;
SmallVector<
std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
@@ -280,18 +248,19 @@ class DwarfDebug : public AsmPrinterHandler {
/// Whether to use the GNU TLS opcode (instead of the standard opcode).
bool UseGNUTLSOpcode;
- /// Whether to emit DW_AT_[MIPS_]linkage_name.
- bool UseLinkageNames;
+ /// Whether to use DWARF 2 bitfields (instead of the DWARF 4 format).
+ bool UseDWARF2Bitfields;
+
+ /// Whether to emit all linkage names, or just abstract subprograms.
+ bool UseAllLinkageNames;
/// Version of dwarf we're emitting.
unsigned DwarfVersion;
- /// Maps from a type identifier to the actual MDNode.
- DITypeIdentifierMap TypeIdentifierMap;
-
/// DWARF5 Experimental Options
/// @{
bool HasDwarfAccelTables;
+ bool HasAppleExtensionAttributes;
bool HasSplitDwarf;
/// Separated Dwarf Variables
@@ -324,9 +293,19 @@ class DwarfDebug : public AsmPrinterHandler {
// Identify a debugger for "tuning" the debug info.
DebuggerKind DebuggerTuning;
+ /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger.
+ ///
+ /// Returns whether we are "tuning" for a given debugger.
+ /// Should be used only within the constructor, to set feature flags.
+ /// @{
+ bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; }
+ bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
+ bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
+ /// @}
+
MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
- const SmallVectorImpl<std::unique_ptr<DwarfUnit>> &getUnits() {
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
return InfoHolder.getUnits();
}
@@ -347,9 +326,6 @@ class DwarfDebug : public AsmPrinterHandler {
/// Construct a DIE for this abstract scope.
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
- /// Collect info for variables that were optimized out.
- void collectDeadVariables();
-
void finishVariableDefinitions();
void finishSubprogramDefinitions();
@@ -397,7 +373,7 @@ class DwarfDebug : public AsmPrinterHandler {
bool GnuStyle, MCSection *PSec, StringRef Name,
const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const);
- /// Emit visible names into a debug str section.
+ /// Emit null-terminated strings into a debug str section.
void emitDebugStr();
/// Emit variable locations into a debug loc section.
@@ -414,17 +390,15 @@ class DwarfDebug : public AsmPrinterHandler {
/// Emit macros into a debug macinfo section.
void emitDebugMacinfo();
- unsigned emitMacro(AsmStreamerBase *AS, DIMacro &M);
- unsigned emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F,
- DwarfCompileUnit &U);
- unsigned handleMacroNodes(AsmStreamerBase *AS, DIMacroNodeArray Nodes,
- DwarfCompileUnit &U);
+ void emitMacro(DIMacro &M);
+ void emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U);
+ void handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U);
/// DWARF 5 Experimental Split Dwarf Emitters
/// Initialize common features of skeleton units.
void initSkeletonUnit(const DwarfUnit &U, DIE &Die,
- std::unique_ptr<DwarfUnit> NewU);
+ std::unique_ptr<DwarfCompileUnit> NewU);
/// Construct the split debug info compile unit for the debug info
/// section.
@@ -460,10 +434,6 @@ class DwarfDebug : public AsmPrinterHandler {
void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
unsigned Flags);
- /// Indentify instructions that are marking the beginning of or
- /// ending of a scope.
- void identifyScopeMarkers();
-
/// Populate LexicalScope entries with variables' info.
void collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP,
DenseSet<InlinedVariable> &ProcessedVars);
@@ -477,16 +447,6 @@ class DwarfDebug : public AsmPrinterHandler {
/// by MMI.
void collectVariableInfoFromMMITable(DenseSet<InlinedVariable> &P);
- /// Ensure that a label will be emitted before MI.
- void requestLabelBeforeInsn(const MachineInstr *MI) {
- LabelsBeforeInsn.insert(std::make_pair(MI, nullptr));
- }
-
- /// Ensure that a label will be emitted after MI.
- void requestLabelAfterInsn(const MachineInstr *MI) {
- LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
- }
-
public:
//===--------------------------------------------------------------------===//
// Main entry points.
@@ -511,9 +471,6 @@ public:
/// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
- /// Process end of an instruction.
- void endInstruction() override;
-
/// Perform an MD5 checksum of \p Identifier and return the lower 64 bits.
static uint64_t makeTypeSignature(StringRef Identifier);
@@ -531,21 +488,17 @@ public:
SymSize[Sym] = Size;
}
- /// Returns whether to emit DW_AT_[MIPS_]linkage_name.
- bool useLinkageNames() const { return UseLinkageNames; }
+  /// Returns whether we should emit all DW_AT_[MIPS_]linkage_name attributes.
+  /// If not, we may still emit them for certain cases (e.g. abstract
+  /// subprograms).
+ bool useAllLinkageNames() const { return UseAllLinkageNames; }
/// Returns whether to use DW_OP_GNU_push_tls_address, instead of the
/// standard DW_OP_form_tls_address opcode
bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; }
- /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger.
- ///
- /// Returns whether we are "tuning" for a given debugger.
- /// @{
- bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; }
- bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
- bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
- /// @}
+  /// Returns whether to use the DWARF2 format for bitfields instead of the
+  /// DWARF4 format.
+ bool useDWARF2Bitfields() const { return UseDWARF2Bitfields; }
// Experimental DWARF5 features.
@@ -553,6 +506,10 @@ public:
/// use to accelerate lookup.
bool useDwarfAccelTables() const { return HasDwarfAccelTables; }
+ bool useAppleExtensionAttributes() const {
+ return HasAppleExtensionAttributes;
+ }
+
/// Returns whether or not to change the current debug info for the
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
@@ -577,12 +534,7 @@ public:
/// Find the MDNode for the given reference.
template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
- return Ref.resolve(TypeIdentifierMap);
- }
-
- /// Return the TypeIdentifierMap.
- const DITypeIdentifierMap &getTypeIdentifierMap() const {
- return TypeIdentifierMap;
+ return Ref.resolve();
}
/// Find the DwarfCompileUnit for the given CU Die.
@@ -608,12 +560,6 @@ public:
/// going to be null.
bool isLexicalScopeDIENull(LexicalScope *Scope);
- /// Return Label preceding the instruction.
- MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
-
- /// Return Label immediately following the instruction.
- MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
-
// FIXME: Sink these functions down into DwarfFile/Dwarf*Unit.
SmallPtrSet<const MDNode *, 16> &getProcessedSPNodes() {
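
Most of the state deleted from DwarfDebug in this header (the per-instruction labels, the DBG_VALUE history map, the lexical scopes, PrevLabel/PrevInstLoc, CurMI) moves into the new DebugHandlerBase that DwarfDebug now derives from. A rough sketch of the resulting layering, inferred from the members removed here rather than copied from DebugHandlerBase.h:

    // Inferred shape only; see DebugHandlerBase.h for the real interface.
    class DebugHandlerBase : public AsmPrinterHandler {
    protected:
      AsmPrinter *Asm;
      MachineModuleInfo *MMI;
      LexicalScopes LScopes;
      DbgValueHistoryMap DbgValues;  // DBG_VALUE/clobber history per variable
      DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn, LabelsAfterInsn;
      DebugLoc PrevInstLoc;
      MCSymbol *PrevLabel;
      DebugLoc PrologEndLoc;
      const MachineInstr *CurMI;

    public:
      // Shared label bookkeeping; DwarfDebug::beginInstruction() now chains to
      // DebugHandlerBase::beginInstruction(MI) before doing DWARF-specific work.
      void beginInstruction(const MachineInstr *MI) override;
      void endInstruction() override;
      MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
      MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
      static int pieceCmp(const DIExpression *P1, const DIExpression *P2);
    };

    class DwarfDebug : public DebugHandlerBase {
      // Keeps only DWARF-specific state: CUMap, DebugLocs, accelerator tables,
      // skeleton units, ProcessedSPNodes, and so on.
    };
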
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index f4667b4a3464..8287f289f22b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -16,6 +16,7 @@
#include "EHStreamer.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCDwarf.h"
namespace llvm {
class MachineFunction;
@@ -29,12 +30,16 @@ protected:
bool shouldEmitCFI;
void markFunctionEnd() override;
+ void endFragment() override;
};
class LLVM_LIBRARY_VISIBILITY DwarfCFIException : public DwarfCFIExceptionBase {
/// Per-function flag to indicate if .cfi_personality should be emitted.
bool shouldEmitPersonality;
+ /// Per-function flag to indicate if .cfi_personality must be emitted.
+ bool forceEmitPersonality;
+
/// Per-function flag to indicate if .cfi_lsda should be emitted.
bool shouldEmitLSDA;
@@ -59,6 +64,9 @@ public:
/// Gather and emit post-function exception information.
void endFunction(const MachineFunction *) override;
+
+ void beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) override;
};
class LLVM_LIBRARY_VISIBILITY ARMException : public DwarfCFIExceptionBase {
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 7b5b831da166..7dbc6cb39951 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -65,8 +65,9 @@ void DwarfExpression::AddShr(unsigned ShiftBy) {
EmitOp(dwarf::DW_OP_shr);
}
-bool DwarfExpression::AddMachineRegIndirect(unsigned MachineReg, int Offset) {
- if (isFrameRegister(MachineReg)) {
+bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI,
+ unsigned MachineReg, int Offset) {
+ if (isFrameRegister(TRI, MachineReg)) {
// If variable offset is based in frame register then use fbreg.
EmitOp(dwarf::DW_OP_fbreg);
EmitSigned(Offset);
@@ -81,7 +82,8 @@ bool DwarfExpression::AddMachineRegIndirect(unsigned MachineReg, int Offset) {
return true;
}
-bool DwarfExpression::AddMachineRegPiece(unsigned MachineReg,
+bool DwarfExpression::AddMachineRegPiece(const TargetRegisterInfo &TRI,
+ unsigned MachineReg,
unsigned PieceSizeInBits,
unsigned PieceOffsetInBits) {
if (!TRI.isPhysicalRegister(MachineReg))
@@ -159,29 +161,37 @@ bool DwarfExpression::AddMachineRegPiece(unsigned MachineReg,
return CurPos > PieceOffsetInBits;
}
-void DwarfExpression::AddSignedConstant(int Value) {
- EmitOp(dwarf::DW_OP_consts);
- EmitSigned(Value);
- // The proper way to describe a constant value is
- // DW_OP_constu <const>, DW_OP_stack_value.
- // Unfortunately, DW_OP_stack_value was not available until DWARF-4,
- // so we will continue to generate DW_OP_constu <const> for DWARF-2
- // and DWARF-3. Technically, this is incorrect since DW_OP_const <const>
- // actually describes a value at a constant addess, not a constant value.
- // However, in the past there was no better way to describe a constant
- // value, so the producers and consumers started to rely on heuristics
- // to disambiguate the value vs. location status of the expression.
- // See PR21176 for more details.
+void DwarfExpression::AddStackValue() {
if (DwarfVersion >= 4)
EmitOp(dwarf::DW_OP_stack_value);
}
-void DwarfExpression::AddUnsignedConstant(unsigned Value) {
+void DwarfExpression::AddSignedConstant(int64_t Value) {
+ EmitOp(dwarf::DW_OP_consts);
+ EmitSigned(Value);
+ AddStackValue();
+}
+
+void DwarfExpression::AddUnsignedConstant(uint64_t Value) {
EmitOp(dwarf::DW_OP_constu);
EmitUnsigned(Value);
- // cf. comment in DwarfExpression::AddSignedConstant().
- if (DwarfVersion >= 4)
- EmitOp(dwarf::DW_OP_stack_value);
+ AddStackValue();
+}
+
+void DwarfExpression::AddUnsignedConstant(const APInt &Value) {
+ unsigned Size = Value.getBitWidth();
+ const uint64_t *Data = Value.getRawData();
+
+ // Chop it up into 64-bit pieces, because that's the maximum that
+ // AddUnsignedConstant takes.
+ unsigned Offset = 0;
+ while (Offset < Size) {
+ AddUnsignedConstant(*Data++);
+ if (Offset == 0 && Size <= 64)
+ break;
+ AddOpPiece(std::min(Size-Offset, 64u), Offset);
+ Offset += 64;
+ }
}
static unsigned getOffsetOrZero(unsigned OffsetInBits,
@@ -192,13 +202,14 @@ static unsigned getOffsetOrZero(unsigned OffsetInBits,
return OffsetInBits;
}
-bool DwarfExpression::AddMachineRegExpression(const DIExpression *Expr,
+bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ const DIExpression *Expr,
unsigned MachineReg,
unsigned PieceOffsetInBits) {
auto I = Expr->expr_op_begin();
auto E = Expr->expr_op_end();
if (I == E)
- return AddMachineRegPiece(MachineReg);
+ return AddMachineRegPiece(TRI, MachineReg);
// Pattern-match combinations for which more efficient representations exist
// first.
@@ -208,7 +219,7 @@ bool DwarfExpression::AddMachineRegExpression(const DIExpression *Expr,
unsigned OffsetInBits = I->getArg(0);
unsigned SizeInBits = I->getArg(1);
// Piece always comes at the end of the expression.
- return AddMachineRegPiece(MachineReg, SizeInBits,
+ return AddMachineRegPiece(TRI, MachineReg, SizeInBits,
getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
}
case dwarf::DW_OP_plus:
@@ -219,15 +230,15 @@ bool DwarfExpression::AddMachineRegExpression(const DIExpression *Expr,
if (N != E && N->getOp() == dwarf::DW_OP_deref) {
unsigned Offset = I->getArg(0);
ValidReg = AddMachineRegIndirect(
- MachineReg, I->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
+ TRI, MachineReg, I->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
std::advance(I, 2);
break;
} else
- ValidReg = AddMachineRegPiece(MachineReg);
+ ValidReg = AddMachineRegPiece(TRI, MachineReg);
}
case dwarf::DW_OP_deref: {
// [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
- ValidReg = AddMachineRegIndirect(MachineReg);
+ ValidReg = AddMachineRegIndirect(TRI, MachineReg);
++I;
break;
}
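
The new AddUnsignedConstant(const APInt &) above walks the value in 64-bit words. A worked example of what the loop does for a 128-bit constant (the concrete value is made up for illustration):

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    // APInt::getRawData() returns 64-bit words, least-significant word first.
    APInt Wide(128, "0123456789abcdeffedcba9876543210", 16);

    // Tracing the loop above for this value:
    //   Offset = 0:  AddUnsignedConstant(0xfedcba9876543210); AddOpPiece(64, 0);
    //   Offset = 64: AddUnsignedConstant(0x0123456789abcdef); AddOpPiece(64, 64);
    // Each AddUnsignedConstant(uint64_t) emits DW_OP_constu plus, for DWARF 4 and
    // later, DW_OP_stack_value; the piece operation then marks which 64 bits of
    // the variable that chunk covers. Values of 64 bits or fewer take the early
    // break and emit no piece operation at all.
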
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 78ec937a6b60..5fff28d8a13c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -31,13 +31,10 @@ class DIELoc;
class DwarfExpression {
protected:
// Various convenience accessors that extract things out of AsmPrinter.
- const TargetRegisterInfo &TRI;
unsigned DwarfVersion;
public:
- DwarfExpression(const TargetRegisterInfo &TRI,
- unsigned DwarfVersion)
- : TRI(TRI), DwarfVersion(DwarfVersion) {}
+ DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
virtual ~DwarfExpression() {}
/// Output a dwarf operand and an optional assembler comment.
@@ -48,7 +45,7 @@ public:
virtual void EmitUnsigned(uint64_t Value) = 0;
/// Return whether the given machine register is the frame register in the
/// current function.
- virtual bool isFrameRegister(unsigned MachineReg) = 0;
+ virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
/// Emit a dwarf register operation.
void AddReg(int DwarfReg, const char *Comment = nullptr);
@@ -61,10 +58,24 @@ public:
void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
/// Emit a shift-right dwarf expression.
void AddShr(unsigned ShiftBy);
+ /// Emit a DW_OP_stack_value, if supported.
+ ///
+ /// The proper way to describe a constant value is
+ /// DW_OP_constu <const>, DW_OP_stack_value.
+ /// Unfortunately, DW_OP_stack_value was not available until DWARF-4,
+ /// so we will continue to generate DW_OP_constu <const> for DWARF-2
+ /// and DWARF-3. Technically, this is incorrect since DW_OP_constu <const>
+ /// actually describes a value at a constant address, not a constant value.
+ /// However, in the past there was no better way to describe a constant
+ /// value, so the producers and consumers started to rely on heuristics
+ /// to disambiguate the value vs. location status of the expression.
+ /// See PR21176 for more details.
+ void AddStackValue();
/// Emit an indirect dwarf register operation for the given machine register.
/// \return false if no DWARF register exists for MachineReg.
- bool AddMachineRegIndirect(unsigned MachineReg, int Offset = 0);
+ bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ int Offset = 0);
/// \brief Emit a partial DWARF register operation.
/// \param MachineReg the register
@@ -80,20 +91,24 @@ public:
/// subregisters that alias the register.
///
/// \return false if no DWARF register exists for MachineReg.
- bool AddMachineRegPiece(unsigned MachineReg, unsigned PieceSizeInBits = 0,
+ bool AddMachineRegPiece(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ unsigned PieceSizeInBits = 0,
unsigned PieceOffsetInBits = 0);
/// Emit a signed constant.
- void AddSignedConstant(int Value);
+ void AddSignedConstant(int64_t Value);
+ /// Emit an unsigned constant.
+ void AddUnsignedConstant(uint64_t Value);
/// Emit an unsigned constant.
- void AddUnsignedConstant(unsigned Value);
+ void AddUnsignedConstant(const APInt &Value);
/// \brief Emit an entire expression on top of a machine register location.
///
/// \param PieceOffsetInBits If this is one piece out of a fragmented
/// location, this is the offset of the piece inside the entire variable.
/// \return false if no DWARF register exists for MachineReg.
- bool AddMachineRegExpression(const DIExpression *Expr, unsigned MachineReg,
+ bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ const DIExpression *Expr, unsigned MachineReg,
unsigned PieceOffsetInBits = 0);
/// Emit the operations remaining in the DIExpressionIterator I.
/// \param PieceOffsetInBits If this is one piece out of a fragmented
@@ -108,14 +123,14 @@ class DebugLocDwarfExpression : public DwarfExpression {
ByteStreamer &BS;
public:
- DebugLocDwarfExpression(const TargetRegisterInfo &TRI,
- unsigned DwarfVersion, ByteStreamer &BS)
- : DwarfExpression(TRI, DwarfVersion), BS(BS) {}
+ DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)
+ : DwarfExpression(DwarfVersion), BS(BS) {}
void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
void EmitSigned(int64_t Value) override;
void EmitUnsigned(uint64_t Value) override;
- bool isFrameRegister(unsigned MachineReg) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
};
/// DwarfExpression implementation for singular DW_AT_location.
@@ -129,7 +144,8 @@ public:
void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
void EmitSigned(int64_t Value) override;
void EmitUnsigned(uint64_t Value) override;
- bool isFrameRegister(unsigned MachineReg) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
};
}
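
As a quick illustration of the DWARF-version gate documented in the AddStackValue() comment above, here is a sketch that is not part of the patch; constantValueOps is a hypothetical helper, and the include path for the dwarf::DW_OP_* constants is the one used in this tree.

#include "llvm/Support/Dwarf.h"
#include <cstdint>
#include <vector>

// Opcode sequence describing an unsigned constant *value*: DW_OP_constu
// (followed by a ULEB128 operand in the real encoding), plus
// DW_OP_stack_value only when the producer targets DWARF 4 or later.
std::vector<uint8_t> constantValueOps(unsigned DwarfVersion) {
  std::vector<uint8_t> Ops;
  Ops.push_back(llvm::dwarf::DW_OP_constu);
  if (DwarfVersion >= 4)
    Ops.push_back(llvm::dwarf::DW_OP_stack_value);
  return Ops;
}
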
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 51b27b462a7c..e9fe98ab3cf9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "DwarfFile.h"
+#include "DwarfCompileUnit.h"
#include "DwarfDebug.h"
#include "DwarfUnit.h"
#include "llvm/ADT/STLExtras.h"
@@ -50,22 +51,25 @@ DIEAbbrev &DwarfFile::assignAbbrevNumber(DIE &Die) {
return *New;
}
-void DwarfFile::addUnit(std::unique_ptr<DwarfUnit> U) {
+void DwarfFile::addUnit(std::unique_ptr<DwarfCompileUnit> U) {
CUs.push_back(std::move(U));
}
// Emit the various dwarf units to the unit section USection with
// the abbreviations going into ASection.
void DwarfFile::emitUnits(bool UseOffsets) {
- for (const auto &TheU : CUs) {
- DIE &Die = TheU->getUnitDie();
- MCSection *USection = TheU->getSection();
- Asm->OutStreamer->SwitchSection(USection);
+ for (const auto &TheU : CUs)
+ emitUnit(TheU.get(), UseOffsets);
+}
- TheU->emitHeader(UseOffsets);
+void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) {
+ DIE &Die = TheU->getUnitDie();
+ MCSection *USection = TheU->getSection();
+ Asm->OutStreamer->SwitchSection(USection);
- Asm->emitDwarfDIE(Die);
- }
+ TheU->emitHeader(UseOffsets);
+
+ Asm->emitDwarfDIE(Die);
}
// Compute the size and offset for each DIE.
@@ -77,17 +81,20 @@ void DwarfFile::computeSizeAndOffsets() {
// DIE within each compile unit. All offsets are CU relative.
for (const auto &TheU : CUs) {
TheU->setDebugInfoOffset(SecOffset);
+ SecOffset += computeSizeAndOffsetsForUnit(TheU.get());
+ }
+}
- // CU-relative offset is reset to 0 here.
- unsigned Offset = sizeof(int32_t) + // Length of Unit Info
- TheU->getHeaderSize(); // Unit-specific headers
+unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) {
+ // CU-relative offset is reset to 0 here.
+ unsigned Offset = sizeof(int32_t) + // Length of Unit Info
+ TheU->getHeaderSize(); // Unit-specific headers
- // EndOffset here is CU-relative, after laying out
- // all of the CU DIE.
- unsigned EndOffset = computeSizeAndOffset(TheU->getUnitDie(), Offset);
- SecOffset += EndOffset;
- }
+ // The return value here is CU-relative, after laying out
+ // all of the CU DIE.
+ return computeSizeAndOffset(TheU->getUnitDie(), Offset);
}
+
// Compute the size and offset of a DIE. The offset is relative to start of the
// CU. It returns the offset after laying out the DIE.
unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
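
A small sketch (not from the patch) of the accounting that the new computeSizeAndOffsetsForUnit() split makes explicit; layoutUnits and UnitDieSizes are hypothetical stand-ins for the driver loop and the per-unit DIE-tree sizes.

#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

// Each unit contributes: 4-byte length field + unit header + its DIE tree.
// That per-unit total is what computeSizeAndOffsetsForUnit() returns, and
// the driver loop simply accumulates it into the section offset handed to
// setDebugInfoOffset() for the next unit.
unsigned layoutUnits(llvm::ArrayRef<unsigned> UnitDieSizes, unsigned HeaderSize) {
  unsigned SecOffset = 0;
  for (unsigned DieBytes : UnitDieSizes) {
    // setDebugInfoOffset(SecOffset) would be called here for this unit.
    unsigned UnitSize = sizeof(int32_t) + HeaderSize + DieBytes;
    SecOffset += UnitSize;
  }
  return SecOffset; // total .debug_info bytes contributed by these units
}
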
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 8402027edd6f..b73d89b0e499 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -16,14 +16,15 @@
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include <memory>
-#include <string>
#include <vector>
namespace llvm {
class AsmPrinter;
class DbgVariable;
+class DwarfCompileUnit;
class DwarfUnit;
class DIEAbbrev;
class MCSymbol;
@@ -46,7 +47,7 @@ class DwarfFile {
std::vector<DIEAbbrev *> Abbreviations;
// A pointer to all units in the section.
- SmallVector<std::unique_ptr<DwarfUnit>, 1> CUs;
+ SmallVector<std::unique_ptr<DwarfCompileUnit>, 1> CUs;
DwarfStringPool StrPool;
@@ -66,7 +67,9 @@ public:
~DwarfFile();
- const SmallVectorImpl<std::unique_ptr<DwarfUnit>> &getUnits() { return CUs; }
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return CUs;
+ }
/// \brief Compute the size and offset of a DIE given an incoming Offset.
unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
@@ -74,6 +77,10 @@ public:
/// \brief Compute the size and offset of all the DIEs.
void computeSizeAndOffsets();
+ /// \brief Compute the size and offset of all the DIEs in the given unit.
+ /// \returns The total size of the unit, including the length field and header.
+ unsigned computeSizeAndOffsetsForUnit(DwarfUnit *TheU);
+
/// Define a unique number for the abbreviation.
///
/// Compute the abbreviation for \c Die, look up its unique number, and
@@ -81,12 +88,15 @@ public:
DIEAbbrev &assignAbbrevNumber(DIE &Die);
/// \brief Add a unit to the list of CUs.
- void addUnit(std::unique_ptr<DwarfUnit> U);
+ void addUnit(std::unique_ptr<DwarfCompileUnit> U);
/// \brief Emit all of the units to the section listed with the given
/// abbreviation section.
void emitUnits(bool UseOffsets);
+ /// \brief Emit the given unit to its section.
+ void emitUnit(DwarfUnit *U, bool UseOffsets);
+
/// \brief Emit a set of abbreviations to the specific section.
void emitAbbrevs(MCSection *);
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d75fea5d8c8a..4100d728a53b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -46,9 +46,8 @@ GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
DIELoc &DIE)
- : DwarfExpression(*AP.MF->getSubtarget().getRegisterInfo(),
- AP.getDwarfDebug()->getDwarfVersion()),
- AP(AP), DU(DU), DIE(DIE) {}
+ : DwarfExpression(AP.getDwarfDebug()->getDwarfVersion()), AP(AP), DU(DU),
+ DIE(DIE) {}
void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
@@ -59,25 +58,24 @@ void DIEDwarfExpression::EmitSigned(int64_t Value) {
void DIEDwarfExpression::EmitUnsigned(uint64_t Value) {
DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
}
-bool DIEDwarfExpression::isFrameRegister(unsigned MachineReg) {
+bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) {
return MachineReg == TRI.getFrameRegister(*AP.MF);
}
-DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag,
- const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW,
- DwarfFile *DWU)
- : UniqueID(UID), CUNode(Node),
- UnitDie(*DIE::get(DIEValueAllocator, UnitTag)), DebugInfoOffset(0),
- Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr) {
+DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node,
+ AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
+ : CUNode(Node), UnitDie(*DIE::get(DIEValueAllocator, UnitTag)), Asm(A),
+ DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr) {
assert(UnitTag == dwarf::DW_TAG_compile_unit ||
UnitTag == dwarf::DW_TAG_type_unit);
}
-DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A,
+DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU,
MCDwarfDwoLineTable *SplitLineTable)
- : DwarfUnit(UID, dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU),
- CU(CU), SplitLineTable(SplitLineTable) {
+ : DwarfUnit(dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), CU(CU),
+ SplitLineTable(SplitLineTable) {
if (SplitLineTable)
addSectionOffset(UnitDie, dwarf::DW_AT_stmt_list, 0);
}
@@ -268,7 +266,7 @@ void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry) {
addDIEEntry(Die, Attribute, DIEEntry(Entry));
}
-void DwarfUnit::addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type) {
+void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
// Flag the type unit reference as a declaration so that if it contains
// members (implicit special members, static data member definitions, member
// declarations for definitions in this CU, etc) consumers don't get confused
@@ -276,7 +274,7 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type) {
addFlag(Die, dwarf::DW_AT_declaration);
Die.addValue(DIEValueAllocator, dwarf::DW_AT_signature,
- dwarf::DW_FORM_ref_sig8, DIETypeSignature(Type));
+ dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));
}
void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
@@ -370,14 +368,16 @@ void DwarfUnit::addSourceLine(DIE &Die, const DINamespace *NS) {
bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
unsigned SizeInBits, unsigned OffsetInBits) {
DIEDwarfExpression Expr(*Asm, *this, TheDie);
- Expr.AddMachineRegPiece(Reg, SizeInBits, OffsetInBits);
+ Expr.AddMachineRegPiece(*Asm->MF->getSubtarget().getRegisterInfo(), Reg,
+ SizeInBits, OffsetInBits);
return true;
}
bool DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
int64_t Offset) {
DIEDwarfExpression Expr(*Asm, *this, TheDie);
- return Expr.AddMachineRegIndirect(Reg, Offset);
+ return Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
+ Reg, Offset);
}
/* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -561,32 +561,6 @@ static bool isUnsignedDIType(DwarfDebug *DD, const DIType *Ty) {
Ty->getTag() == dwarf::DW_TAG_unspecified_type;
}
-/// If this type is derived from a base type then return base type size.
-static uint64_t getBaseTypeSize(DwarfDebug *DD, const DIDerivedType *Ty) {
- unsigned Tag = Ty->getTag();
-
- if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
- Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type)
- return Ty->getSizeInBits();
-
- auto *BaseType = DD->resolve(Ty->getBaseType());
-
- assert(BaseType && "Unexpected invalid base type");
-
- // If this is a derived type, go ahead and get the base type, unless it's a
- // reference then it's just the size of the field. Pointer types have no need
- // of this since they're a different type of qualification on the type.
- if (BaseType->getTag() == dwarf::DW_TAG_reference_type ||
- BaseType->getTag() == dwarf::DW_TAG_rvalue_reference_type)
- return Ty->getSizeInBits();
-
- if (auto *DT = dyn_cast<DIDerivedType>(BaseType))
- return getBaseTypeSize(DD, DT);
-
- return BaseType->getSizeInBits();
-}
-
void DwarfUnit::addConstantFPValue(DIE &Die, const MachineOperand &MO) {
assert(MO.isFPImm() && "Invalid machine operand!");
DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
@@ -667,7 +641,7 @@ void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
}
void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
- if (!LinkageName.empty() && DD->useLinkageNames())
+ if (!LinkageName.empty())
addString(Die,
DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
: dwarf::DW_AT_MIPS_linkage_name,
@@ -720,8 +694,6 @@ DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
return nullptr;
auto *Ty = cast<DIType>(TyNode);
- assert(Ty == resolve(Ty->getRef()) &&
- "type was not uniqued, possible ODR violation.");
// DW_TAG_restrict_type is not supported in DWARF2
if (Ty->getTag() == dwarf::DW_TAG_restrict_type && DD->getDwarfVersion() <= 2)
@@ -903,6 +875,11 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
Language == dwarf::DW_LANG_ObjC))
addFlag(Buffer, dwarf::DW_AT_prototyped);
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CTy->getCC() && CTy->getCC() != dwarf::DW_CC_normal)
+ addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
+ CTy->getCC());
+
if (CTy->isLValueReference())
addFlag(Buffer, dwarf::DW_AT_reference);
@@ -1050,14 +1027,18 @@ void DwarfUnit::constructTemplateValueParameterDIE(
if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
addConstantValue(ParamDIE, CI, resolve(VP->getType()));
else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
- // For declaration non-type template parameters (such as global values and
- // functions)
- DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- addOpAddress(*Loc, Asm->getSymbol(GV));
- // Emit DW_OP_stack_value to use the address as the immediate value of the
- // parameter, rather than a pointer to it.
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
- addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
+ // We cannot describe the location of dllimport'd entities: the
+ // computation of their address requires loads from the IAT.
+ if (!GV->hasDLLImportStorageClass()) {
+ // For declaration non-type template parameters (such as global values
+ // and functions)
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addOpAddress(*Loc, Asm->getSymbol(GV));
+ // Emit DW_OP_stack_value to use the address as the immediate value of
+ // the parameter, rather than a pointer to it.
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
+ }
} else if (VP->getTag() == dwarf::DW_TAG_GNU_template_template_param) {
assert(isa<MDString>(Val));
addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
@@ -1171,7 +1152,9 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
LinkageName == DeclLinkageName) &&
"decl has a linkage name and it is different");
- if (DeclLinkageName.empty())
+ if (DeclLinkageName.empty() &&
+ // Always emit it for abstract subprograms.
+ (DD->useAllLinkageNames() || DU->getAbstractSPDies().lookup(SP)))
addLinkageName(SPDie, LinkageName);
if (!DeclDie)
@@ -1207,9 +1190,16 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
Language == dwarf::DW_LANG_ObjC))
addFlag(SPDie, dwarf::DW_AT_prototyped);
+ unsigned CC = 0;
DITypeRefArray Args;
- if (const DISubroutineType *SPTy = SP->getType())
+ if (const DISubroutineType *SPTy = SP->getType()) {
Args = SPTy->getTypeArray();
+ CC = SPTy->getCC();
+ }
+
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CC && CC != dwarf::DW_CC_normal)
+ addUInt(SPDie, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1, CC);
// Add a return type. If this is a type like a C/C++ void type we don't add a
// return type.
@@ -1220,10 +1210,12 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
unsigned VK = SP->getVirtuality();
if (VK) {
addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
- DIELoc *Block = getDIELoc();
- addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- addUInt(*Block, dwarf::DW_FORM_udata, SP->getVirtualIndex());
- addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+ if (SP->getVirtualIndex() != -1u) {
+ DIELoc *Block = getDIELoc();
+ addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(*Block, dwarf::DW_FORM_udata, SP->getVirtualIndex());
+ addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+ }
ContainingTypeMap.insert(
std::make_pair(&SPDie, resolve(SP->getContainingType())));
}
@@ -1242,11 +1234,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
if (!SP->isLocalToUnit())
addFlag(SPDie, dwarf::DW_AT_external);
- if (SP->isOptimized())
- addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
+ if (DD->useAppleExtensionAttributes()) {
+ if (SP->isOptimized())
+ addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
- if (unsigned isa = Asm->getISAEncoding())
- addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+ if (unsigned isa = Asm->getISAEncoding())
+ addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+ }
if (SP->isLValueReference())
addFlag(SPDie, dwarf::DW_AT_reference);
@@ -1388,58 +1382,49 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
} else {
uint64_t Size = DT->getSizeInBits();
- uint64_t FieldSize = getBaseTypeSize(DD, DT);
+ uint64_t FieldSize = DD->getBaseTypeSize(DT);
uint64_t OffsetInBytes;
- if (FieldSize && Size != FieldSize) {
+ bool IsBitfield = FieldSize && Size != FieldSize;
+ if (IsBitfield) {
// Handle bitfield, assume bytes are 8 bits.
- addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
+ if (DD->useDWARF2Bitfields())
+ addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);
- //
- // The DWARF 2 DW_AT_bit_offset is counting the bits between the most
- // significant bit of the aligned storage unit containing the bit field to
- // the most significan bit of the bit field.
- //
- // FIXME: DWARF 4 states that DW_AT_data_bit_offset (which
- // counts from the beginning, regardless of endianness) should
- // be used instead.
- //
- //
- // Struct Align Align Align
- // v v v v
- // +-----------+-----*-----+-----*-----+--
- // | ... |b1|b2|b3|b4|
- // +-----------+-----*-----+-----*-----+--
- // | | |<-- Size ->| |
- // |<---- Offset --->| |<--->|
- // | | | \_ DW_AT_bit_offset (little endian)
- // | |<--->|
- // |<--------->| \_ StartBitOffset = DW_AT_bit_offset (big endian)
- // \ = DW_AT_data_bit_offset (biendian)
- // \_ OffsetInBytes
+
uint64_t Offset = DT->getOffsetInBits();
uint64_t Align = DT->getAlignInBits() ? DT->getAlignInBits() : FieldSize;
uint64_t AlignMask = ~(Align - 1);
// The bits from the start of the storage unit to the start of the field.
uint64_t StartBitOffset = Offset - (Offset & AlignMask);
- // The endian-dependent DWARF 2 offset.
- uint64_t DwarfBitOffset = Asm->getDataLayout().isLittleEndian()
- ? OffsetToAlignment(Offset + Size, Align)
- : StartBitOffset;
-
// The byte offset of the field's aligned storage unit inside the struct.
OffsetInBytes = (Offset - StartBitOffset) / 8;
- addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, DwarfBitOffset);
- } else
+
+ if (DD->useDWARF2Bitfields()) {
+ uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+ uint64_t FieldOffset = (HiMark - FieldSize);
+ Offset -= FieldOffset;
+
+ // Maybe we need to work from the other end.
+ if (Asm->getDataLayout().isLittleEndian())
+ Offset = FieldSize - (Offset + Size);
+
+ addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
+ OffsetInBytes = FieldOffset >> 3;
+ } else {
+ addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, None, Offset);
+ }
+ } else {
// This is not a bitfield.
OffsetInBytes = DT->getOffsetInBits() / 8;
+ }
if (DD->getDwarfVersion() <= 2) {
DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc;
addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes);
addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie);
- } else
+ } else if (!IsBitfield || DD->useDWARF2Bitfields())
addUInt(MemberDie, dwarf::DW_AT_data_member_location, None,
OffsetInBytes);
}
@@ -1524,8 +1509,11 @@ void DwarfUnit::emitHeader(bool UseOffsets) {
// start of the section. Use a relocatable offset where needed to ensure
// linking doesn't invalidate that offset.
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
- Asm->emitDwarfSymbolReference(TLOF.getDwarfAbbrevSection()->getBeginSymbol(),
- UseOffsets);
+ if (UseOffsets)
+ Asm->EmitInt32(0);
+ else
+ Asm->emitDwarfSymbolReference(
+ TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
Asm->OutStreamer->AddComment("Address Size (in bytes)");
Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
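
The bitfield hunk in constructMemberDIE() above now chooses between the DWARF 2 and DWARF 4 encodings. Below is a standalone sketch of the DWARF-2 branch, little-endian case only; BitfieldDwarfInfo and dwarf2Bitfield are hypothetical illustration names, not part of the patch.

#include <cstdint>

// DWARF 2 bitfield attributes produced by the branch above (little endian).
struct BitfieldDwarfInfo {
  uint64_t ByteSize;       // DW_AT_byte_size
  uint64_t BitOffset;      // DW_AT_bit_offset
  uint64_t MemberLocation; // DW_AT_data_member_location, in bytes
};

// Offset/Size are the member's DT->getOffsetInBits()/getSizeInBits(),
// FieldSize is the storage-unit size in bits, Align a power of two.
BitfieldDwarfInfo dwarf2Bitfield(uint64_t Offset, uint64_t Size,
                                 uint64_t FieldSize, uint64_t Align) {
  uint64_t AlignMask = ~(Align - 1);
  uint64_t HiMark = (Offset + FieldSize) & AlignMask;
  uint64_t FieldOffset = HiMark - FieldSize;
  uint64_t RelOffset = Offset - FieldOffset;            // offset within the unit
  uint64_t BitOffset = FieldSize - (RelOffset + Size);  // counted from the MSB end
  return {FieldSize / 8, BitOffset, FieldOffset >> 3};
}

// For "unsigned a : 3; unsigned b : 5;" packed into one 32-bit unit:
//   a (Offset 0, Size 3) -> DW_AT_bit_offset 29, data_member_location 0
//   b (Offset 3, Size 5) -> DW_AT_bit_offset 24, data_member_location 0
// The DWARF 4 path instead emits DW_AT_data_bit_offset = Offset (0 and 3)
// and neither DW_AT_byte_size nor DW_AT_data_member_location.
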
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 82760bf21839..e225f92116d4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -67,9 +67,6 @@ public:
/// source file.
class DwarfUnit {
protected:
- /// A numeric ID unique among all CUs in the module
- unsigned UniqueID;
-
/// MDNode for the compile unit.
const DICompileUnit *CUNode;
@@ -79,9 +76,6 @@ protected:
/// Unit debug information entry.
DIE &UnitDie;
- /// Offset of the UnitDie from beginning of debug info section.
- unsigned DebugInfoOffset;
-
/// Target of Dwarf emission.
AsmPrinter *Asm;
@@ -110,8 +104,8 @@ protected:
/// The section this unit will be emitted in.
MCSection *Section;
- DwarfUnit(unsigned UID, dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A,
- DwarfDebug *DW, DwarfFile *DWU);
+ DwarfUnit(dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU);
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
@@ -127,14 +121,10 @@ public:
// Accessors.
AsmPrinter* getAsmPrinter() const { return Asm; }
- unsigned getUniqueID() const { return UniqueID; }
uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
const DICompileUnit *getCUNode() const { return CUNode; }
DIE &getUnitDie() { return UnitDie; }
- unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
- void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
-
/// Return true if this compile unit has something to write out.
bool hasContent() const { return UnitDie.hasChildren(); }
@@ -221,7 +211,7 @@ public:
void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry);
/// Add a type's DW_AT_signature and set the declaration flag.
- void addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type);
+ void addDIETypeSignature(DIE &Die, uint64_t Signature);
/// Add an attribute containing the type signature for a unique identifier.
void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
StringRef Identifier);
@@ -338,7 +328,7 @@ protected:
/// Look in the DwarfDebug map for the MDNode that corresponds to the
/// reference.
template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
- return DD->resolve(Ref);
+ return Ref.resolve();
}
private:
@@ -383,12 +373,10 @@ class DwarfTypeUnit : public DwarfUnit {
bool isDwoUnit() const override;
public:
- DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A,
- DwarfDebug *DW, DwarfFile *DWU,
- MCDwarfDwoLineTable *SplitLineTable = nullptr);
+ DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable = nullptr);
void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
- uint64_t getTypeSignature() const { return TypeSignature; }
void setType(const DIE *Ty) { this->Ty = Ty; }
/// Emit the header for this unit, not including the initial length field.
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index c6a0e9d0524c..080fdd14b467 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -22,7 +22,6 @@ struct LandingPadInfo;
class MachineModuleInfo;
class MachineInstr;
class MachineFunction;
-class AsmPrinter;
class MCSymbol;
class MCSymbolRefExpr;
diff --git a/lib/CodeGen/AsmPrinter/LLVMBuild.txt b/lib/CodeGen/AsmPrinter/LLVMBuild.txt
index bbdb0c7fc3c7..e741a1a4c4ed 100644
--- a/lib/CodeGen/AsmPrinter/LLVMBuild.txt
+++ b/lib/CodeGen/AsmPrinter/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = AsmPrinter
parent = Libraries
-required_libraries = Analysis CodeGen Core MC MCParser Support Target TransformUtils
+required_libraries = Analysis CodeGen Core DebugInfoCodeView MC MCParser Support Target TransformUtils
diff --git a/lib/CodeGen/AsmPrinter/Makefile b/lib/CodeGen/AsmPrinter/Makefile
deleted file mode 100644
index 60aa6cbcf6f3..000000000000
--- a/lib/CodeGen/AsmPrinter/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/CodeGen/AsmPrinter/Makefile ---------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAsmPrinter
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
deleted file mode 100644
index 1e2f55b71151..000000000000
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
+++ /dev/null
@@ -1,411 +0,0 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp --*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing line tables info into COFF files.
-//
-//===----------------------------------------------------------------------===//
-
-#include "WinCodeViewLineTables.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/COFF.h"
-
-namespace llvm {
-
-StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) {
- assert(S);
- assert((isa<DICompileUnit>(S) || isa<DIFile>(S) || isa<DISubprogram>(S) ||
- isa<DILexicalBlockBase>(S)) &&
- "Unexpected scope info");
-
- auto *Scope = cast<DIScope>(S);
- StringRef Dir = Scope->getDirectory(),
- Filename = Scope->getFilename();
- std::string &Filepath =
- DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)];
- if (!Filepath.empty())
- return Filepath;
-
- // Clang emits directory and relative filename info into the IR, but CodeView
- // operates on full paths. We could change Clang to emit full paths too, but
- // that would increase the IR size and probably not needed for other users.
- // For now, just concatenate and canonicalize the path here.
- if (Filename.find(':') == 1)
- Filepath = Filename;
- else
- Filepath = (Dir + "\\" + Filename).str();
-
- // Canonicalize the path. We have to do it textually because we may no longer
- // have access the file in the filesystem.
- // First, replace all slashes with backslashes.
- std::replace(Filepath.begin(), Filepath.end(), '/', '\\');
-
- // Remove all "\.\" with "\".
- size_t Cursor = 0;
- while ((Cursor = Filepath.find("\\.\\", Cursor)) != std::string::npos)
- Filepath.erase(Cursor, 2);
-
- // Replace all "\XXX\..\" with "\". Don't try too hard though as the original
- // path should be well-formatted, e.g. start with a drive letter, etc.
- Cursor = 0;
- while ((Cursor = Filepath.find("\\..\\", Cursor)) != std::string::npos) {
- // Something's wrong if the path starts with "\..\", abort.
- if (Cursor == 0)
- break;
-
- size_t PrevSlash = Filepath.rfind('\\', Cursor - 1);
- if (PrevSlash == std::string::npos)
- // Something's wrong, abort.
- break;
-
- Filepath.erase(PrevSlash, Cursor + 3 - PrevSlash);
- // The next ".." might be following the one we've just erased.
- Cursor = PrevSlash;
- }
-
- // Remove all duplicate backslashes.
- Cursor = 0;
- while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos)
- Filepath.erase(Cursor, 1);
-
- return Filepath;
-}
-
-void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL,
- const MachineFunction *MF) {
- const MDNode *Scope = DL.getScope();
- if (!Scope)
- return;
- unsigned LineNumber = DL.getLine();
- // Skip this line if it is longer than the maximum we can record.
- if (LineNumber > COFF::CVL_MaxLineNumber)
- return;
-
- unsigned ColumnNumber = DL.getCol();
- // Truncate the column number if it is longer than the maximum we can record.
- if (ColumnNumber > COFF::CVL_MaxColumnNumber)
- ColumnNumber = 0;
-
- StringRef Filename = getFullFilepath(Scope);
-
- // Skip this instruction if it has the same file:line as the previous one.
- assert(CurFn);
- if (!CurFn->Instrs.empty()) {
- const InstrInfoTy &LastInstr = InstrInfo[CurFn->Instrs.back()];
- if (LastInstr.Filename == Filename && LastInstr.LineNumber == LineNumber &&
- LastInstr.ColumnNumber == ColumnNumber)
- return;
- }
- FileNameRegistry.add(Filename);
-
- MCSymbol *MCL = Asm->MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(MCL);
- CurFn->Instrs.push_back(MCL);
- InstrInfo[MCL] = InstrInfoTy(Filename, LineNumber, ColumnNumber);
-}
-
-WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP)
- : Asm(nullptr), CurFn(nullptr) {
- MachineModuleInfo *MMI = AP->MMI;
-
- // If module doesn't have named metadata anchors or COFF debug section
- // is not available, skip any debug info related stuff.
- if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
- !AP->getObjFileLowering().getCOFFDebugSymbolsSection())
- return;
-
- // Tell MMI that we have debug info.
- MMI->setDebugInfoAvailability(true);
- Asm = AP;
-}
-
-void WinCodeViewLineTables::endModule() {
- if (FnDebugInfo.empty())
- return;
-
- assert(Asm != nullptr);
- Asm->OutStreamer->SwitchSection(
- Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
- Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC);
-
- // The COFF .debug$S section consists of several subsections, each starting
- // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
- // of the payload followed by the payload itself. The subsections are 4-byte
- // aligned.
-
- // Emit per-function debug information. This code is extracted into a
- // separate function for readability.
- for (size_t I = 0, E = VisitedFunctions.size(); I != E; ++I)
- emitDebugInfoForFunction(VisitedFunctions[I]);
-
- // This subsection holds a file index to offset in string table table.
- Asm->OutStreamer->AddComment("File index to string table offset subsection");
- Asm->EmitInt32(COFF::DEBUG_INDEX_SUBSECTION);
- size_t NumFilenames = FileNameRegistry.Infos.size();
- Asm->EmitInt32(8 * NumFilenames);
- for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
- StringRef Filename = FileNameRegistry.Filenames[I];
- // For each unique filename, just write its offset in the string table.
- Asm->EmitInt32(FileNameRegistry.Infos[Filename].StartOffset);
- // The function name offset is not followed by any additional data.
- Asm->EmitInt32(0);
- }
-
- // This subsection holds the string table.
- Asm->OutStreamer->AddComment("String table");
- Asm->EmitInt32(COFF::DEBUG_STRING_TABLE_SUBSECTION);
- Asm->EmitInt32(FileNameRegistry.LastOffset);
- // The payload starts with a null character.
- Asm->EmitInt8(0);
-
- for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
- // Just emit unique filenames one by one, separated by a null character.
- Asm->OutStreamer->EmitBytes(FileNameRegistry.Filenames[I]);
- Asm->EmitInt8(0);
- }
-
- // No more subsections. Fill with zeros to align the end of the section by 4.
- Asm->OutStreamer->EmitFill((-FileNameRegistry.LastOffset) % 4, 0);
-
- clear();
-}
-
-static void EmitLabelDiff(MCStreamer &Streamer,
- const MCSymbol *From, const MCSymbol *To,
- unsigned int Size = 4) {
- MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
- MCContext &Context = Streamer.getContext();
- const MCExpr *FromRef = MCSymbolRefExpr::create(From, Variant, Context),
- *ToRef = MCSymbolRefExpr::create(To, Variant, Context);
- const MCExpr *AddrDelta =
- MCBinaryExpr::create(MCBinaryExpr::Sub, ToRef, FromRef, Context);
- Streamer.EmitValue(AddrDelta, Size);
-}
-
-void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
- // For each function there is a separate subsection
- // which holds the PC to file:line table.
- const MCSymbol *Fn = Asm->getSymbol(GV);
- assert(Fn);
-
- const FunctionInfo &FI = FnDebugInfo[GV];
- if (FI.Instrs.empty())
- return;
- assert(FI.End && "Don't know where the function ends?");
-
- StringRef GVName = GV->getName();
- StringRef FuncName;
- if (auto *SP = getDISubprogram(GV))
- FuncName = SP->getDisplayName();
-
- // FIXME Clang currently sets DisplayName to "bar" for a C++
- // "namespace_foo::bar" function, see PR21528. Luckily, dbghelp.dll is trying
- // to demangle display names anyways, so let's just put a mangled name into
- // the symbols subsection until Clang gives us what we need.
- if (GVName.startswith("\01?"))
- FuncName = GVName.substr(1);
- // Emit a symbol subsection, required by VS2012+ to find function boundaries.
- MCSymbol *SymbolsBegin = Asm->MMI->getContext().createTempSymbol(),
- *SymbolsEnd = Asm->MMI->getContext().createTempSymbol();
- Asm->OutStreamer->AddComment("Symbol subsection for " + Twine(FuncName));
- Asm->EmitInt32(COFF::DEBUG_SYMBOL_SUBSECTION);
- EmitLabelDiff(*Asm->OutStreamer, SymbolsBegin, SymbolsEnd);
- Asm->OutStreamer->EmitLabel(SymbolsBegin);
- {
- MCSymbol *ProcSegmentBegin = Asm->MMI->getContext().createTempSymbol(),
- *ProcSegmentEnd = Asm->MMI->getContext().createTempSymbol();
- EmitLabelDiff(*Asm->OutStreamer, ProcSegmentBegin, ProcSegmentEnd, 2);
- Asm->OutStreamer->EmitLabel(ProcSegmentBegin);
-
- Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_START);
- // Some bytes of this segment don't seem to be required for basic debugging,
- // so just fill them with zeroes.
- Asm->OutStreamer->EmitFill(12, 0);
- // This is the important bit that tells the debugger where the function
- // code is located and what's its size:
- EmitLabelDiff(*Asm->OutStreamer, Fn, FI.End);
- Asm->OutStreamer->EmitFill(12, 0);
- Asm->OutStreamer->EmitCOFFSecRel32(Fn);
- Asm->OutStreamer->EmitCOFFSectionIndex(Fn);
- Asm->EmitInt8(0);
- // Emit the function display name as a null-terminated string.
- Asm->OutStreamer->EmitBytes(FuncName);
- Asm->EmitInt8(0);
- Asm->OutStreamer->EmitLabel(ProcSegmentEnd);
-
- // We're done with this function.
- Asm->EmitInt16(0x0002);
- Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_END);
- }
- Asm->OutStreamer->EmitLabel(SymbolsEnd);
- // Every subsection must be aligned to a 4-byte boundary.
- Asm->OutStreamer->EmitFill((-FuncName.size()) % 4, 0);
-
- // PCs/Instructions are grouped into segments sharing the same filename.
- // Pre-calculate the lengths (in instructions) of these segments and store
- // them in a map for convenience. Each index in the map is the sequential
- // number of the respective instruction that starts a new segment.
- DenseMap<size_t, size_t> FilenameSegmentLengths;
- size_t LastSegmentEnd = 0;
- StringRef PrevFilename = InstrInfo[FI.Instrs[0]].Filename;
- for (size_t J = 1, F = FI.Instrs.size(); J != F; ++J) {
- if (PrevFilename == InstrInfo[FI.Instrs[J]].Filename)
- continue;
- FilenameSegmentLengths[LastSegmentEnd] = J - LastSegmentEnd;
- LastSegmentEnd = J;
- PrevFilename = InstrInfo[FI.Instrs[J]].Filename;
- }
- FilenameSegmentLengths[LastSegmentEnd] = FI.Instrs.size() - LastSegmentEnd;
-
- // Emit a line table subsection, required to do PC-to-file:line lookup.
- Asm->OutStreamer->AddComment("Line table subsection for " + Twine(FuncName));
- Asm->EmitInt32(COFF::DEBUG_LINE_TABLE_SUBSECTION);
- MCSymbol *LineTableBegin = Asm->MMI->getContext().createTempSymbol(),
- *LineTableEnd = Asm->MMI->getContext().createTempSymbol();
- EmitLabelDiff(*Asm->OutStreamer, LineTableBegin, LineTableEnd);
- Asm->OutStreamer->EmitLabel(LineTableBegin);
-
- // Identify the function this subsection is for.
- Asm->OutStreamer->EmitCOFFSecRel32(Fn);
- Asm->OutStreamer->EmitCOFFSectionIndex(Fn);
- // Insert flags after a 16-bit section index.
- Asm->EmitInt16(COFF::DEBUG_LINE_TABLES_HAVE_COLUMN_RECORDS);
-
- // Length of the function's code, in bytes.
- EmitLabelDiff(*Asm->OutStreamer, Fn, FI.End);
-
- // PC-to-linenumber lookup table:
- MCSymbol *FileSegmentEnd = nullptr;
-
- // The start of the last segment:
- size_t LastSegmentStart = 0;
-
- auto FinishPreviousChunk = [&] {
- if (!FileSegmentEnd)
- return;
- for (size_t ColSegI = LastSegmentStart,
- ColSegEnd = ColSegI + FilenameSegmentLengths[LastSegmentStart];
- ColSegI != ColSegEnd; ++ColSegI) {
- unsigned ColumnNumber = InstrInfo[FI.Instrs[ColSegI]].ColumnNumber;
- assert(ColumnNumber <= COFF::CVL_MaxColumnNumber);
- Asm->EmitInt16(ColumnNumber); // Start column
- Asm->EmitInt16(0); // End column
- }
- Asm->OutStreamer->EmitLabel(FileSegmentEnd);
- };
-
- for (size_t J = 0, F = FI.Instrs.size(); J != F; ++J) {
- MCSymbol *Instr = FI.Instrs[J];
- assert(InstrInfo.count(Instr));
-
- if (FilenameSegmentLengths.count(J)) {
- // We came to a beginning of a new filename segment.
- FinishPreviousChunk();
- StringRef CurFilename = InstrInfo[FI.Instrs[J]].Filename;
- assert(FileNameRegistry.Infos.count(CurFilename));
- size_t IndexInStringTable =
- FileNameRegistry.Infos[CurFilename].FilenameID;
- // Each segment starts with the offset of the filename
- // in the string table.
- Asm->OutStreamer->AddComment(
- "Segment for file '" + Twine(CurFilename) + "' begins");
- MCSymbol *FileSegmentBegin = Asm->MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(FileSegmentBegin);
- Asm->EmitInt32(8 * IndexInStringTable);
-
- // Number of PC records in the lookup table.
- size_t SegmentLength = FilenameSegmentLengths[J];
- Asm->EmitInt32(SegmentLength);
-
- // Full size of the segment for this filename, including the prev two
- // records.
- FileSegmentEnd = Asm->MMI->getContext().createTempSymbol();
- EmitLabelDiff(*Asm->OutStreamer, FileSegmentBegin, FileSegmentEnd);
- LastSegmentStart = J;
- }
-
- // The first PC with the given linenumber and the linenumber itself.
- EmitLabelDiff(*Asm->OutStreamer, Fn, Instr);
- uint32_t LineNumber = InstrInfo[Instr].LineNumber;
- assert(LineNumber <= COFF::CVL_MaxLineNumber);
- uint32_t LineData = LineNumber | COFF::CVL_IsStatement;
- Asm->EmitInt32(LineData);
- }
-
- FinishPreviousChunk();
- Asm->OutStreamer->EmitLabel(LineTableEnd);
-}
-
-void WinCodeViewLineTables::beginFunction(const MachineFunction *MF) {
- assert(!CurFn && "Can't process two functions at once!");
-
- if (!Asm || !Asm->MMI->hasDebugInfo())
- return;
-
- const Function *GV = MF->getFunction();
- assert(FnDebugInfo.count(GV) == false);
- VisitedFunctions.push_back(GV);
- CurFn = &FnDebugInfo[GV];
-
- // Find the end of the function prolog.
- // FIXME: is there a simpler a way to do this? Can we just search
- // for the first instruction of the function, not the last of the prolog?
- DebugLoc PrologEndLoc;
- bool EmptyPrologue = true;
- for (const auto &MBB : *MF) {
- if (PrologEndLoc)
- break;
- for (const auto &MI : MBB) {
- if (MI.isDebugValue())
- continue;
-
- // First known non-DBG_VALUE and non-frame setup location marks
- // the beginning of the function body.
- // FIXME: do we need the first subcondition?
- if (!MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) {
- PrologEndLoc = MI.getDebugLoc();
- break;
- }
- EmptyPrologue = false;
- }
- }
- // Record beginning of function if we have a non-empty prologue.
- if (PrologEndLoc && !EmptyPrologue) {
- DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc();
- maybeRecordLocation(FnStartDL, MF);
- }
-}
-
-void WinCodeViewLineTables::endFunction(const MachineFunction *MF) {
- if (!Asm || !CurFn) // We haven't created any debug info for this function.
- return;
-
- const Function *GV = MF->getFunction();
- assert(FnDebugInfo.count(GV));
- assert(CurFn == &FnDebugInfo[GV]);
-
- if (CurFn->Instrs.empty()) {
- FnDebugInfo.erase(GV);
- VisitedFunctions.pop_back();
- } else {
- CurFn->End = Asm->getFunctionEnd();
- }
- CurFn = nullptr;
-}
-
-void WinCodeViewLineTables::beginInstruction(const MachineInstr *MI) {
- // Ignore DBG_VALUE locations and function prologue.
- if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup))
- return;
- DebugLoc DL = MI->getDebugLoc();
- if (DL == PrevInstLoc || !DL)
- return;
- maybeRecordLocation(DL, Asm->MF);
-}
-}
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
deleted file mode 100644
index 78068e07c16f..000000000000
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
+++ /dev/null
@@ -1,138 +0,0 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h ----*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing line tables info into COFF files.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H
-
-#include "AsmPrinterHandler.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/LexicalScopes.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-
-namespace llvm {
-/// \brief Collects and handles line tables information in a CodeView format.
-class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler {
- AsmPrinter *Asm;
- DebugLoc PrevInstLoc;
-
- // For each function, store a vector of labels to its instructions, as well as
- // to the end of the function.
- struct FunctionInfo {
- SmallVector<MCSymbol *, 10> Instrs;
- MCSymbol *End;
- FunctionInfo() : End(nullptr) {}
- } *CurFn;
-
- typedef DenseMap<const Function *, FunctionInfo> FnDebugInfoTy;
- FnDebugInfoTy FnDebugInfo;
- // Store the functions we've visited in a vector so we can maintain a stable
- // order while emitting subsections.
- SmallVector<const Function *, 10> VisitedFunctions;
-
- // InstrInfoTy - Holds the Filename:LineNumber information for every
- // instruction with a unique debug location.
- struct InstrInfoTy {
- StringRef Filename;
- unsigned LineNumber;
- unsigned ColumnNumber;
-
- InstrInfoTy() : LineNumber(0), ColumnNumber(0) {}
-
- InstrInfoTy(StringRef Filename, unsigned LineNumber, unsigned ColumnNumber)
- : Filename(Filename), LineNumber(LineNumber),
- ColumnNumber(ColumnNumber) {}
- };
- DenseMap<MCSymbol *, InstrInfoTy> InstrInfo;
-
- // FileNameRegistry - Manages filenames observed while generating debug info
- // by filtering out duplicates and bookkeeping the offsets in the string
- // table to be generated.
- struct FileNameRegistryTy {
- SmallVector<StringRef, 10> Filenames;
- struct PerFileInfo {
- size_t FilenameID, StartOffset;
- };
- StringMap<PerFileInfo> Infos;
-
- // The offset in the string table where we'll write the next unique
- // filename.
- size_t LastOffset;
-
- FileNameRegistryTy() {
- clear();
- }
-
- // Add Filename to the registry, if it was not observed before.
- void add(StringRef Filename) {
- if (Infos.count(Filename))
- return;
- size_t OldSize = Infos.size();
- Infos[Filename].FilenameID = OldSize;
- Infos[Filename].StartOffset = LastOffset;
- LastOffset += Filename.size() + 1;
- Filenames.push_back(Filename);
- }
-
- void clear() {
- LastOffset = 1;
- Infos.clear();
- Filenames.clear();
- }
- } FileNameRegistry;
-
- typedef std::map<std::pair<StringRef, StringRef>, std::string>
- DirAndFilenameToFilepathMapTy;
- DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap;
- StringRef getFullFilepath(const MDNode *S);
-
- void maybeRecordLocation(DebugLoc DL, const MachineFunction *MF);
-
- void clear() {
- assert(CurFn == nullptr);
- FileNameRegistry.clear();
- InstrInfo.clear();
- }
-
- void emitDebugInfoForFunction(const Function *GV);
-
-public:
- WinCodeViewLineTables(AsmPrinter *Asm);
-
- void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {}
-
- /// \brief Emit the COFF section that holds the line table information.
- void endModule() override;
-
- /// \brief Gather pre-function debug information.
- void beginFunction(const MachineFunction *MF) override;
-
- /// \brief Gather post-function debug information.
- void endFunction(const MachineFunction *) override;
-
- /// \brief Process beginning of an instruction.
- void beginInstruction(const MachineInstr *MI) override;
-
- /// \brief Process end of an instruction.
- void endInstruction() override {}
-};
-} // End of namespace llvm
-
-#endif
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 4da5b580fcda..e5933d8d4160 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "WinException.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -35,6 +34,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -125,10 +125,9 @@ void WinException::endFunction(const MachineFunction *MF) {
if (shouldEmitPersonality || shouldEmitLSDA) {
Asm->OutStreamer->PushSection();
- // Just switch sections to the right xdata section. This use of CurrentFnSym
- // assumes that we only emit the LSDA when ending the parent function.
- MCSection *XData = WinEH::UnwindEmitter::getXDataSection(Asm->CurrentFnSym,
- Asm->OutContext);
+ // Just switch sections to the right xdata section.
+ MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
+ Asm->OutStreamer->getCurrentSectionOnly());
Asm->OutStreamer->SwitchSection(XData);
// Emit the tables appropriate to the personality function in use. If we
@@ -303,8 +302,17 @@ int WinException::getFrameIndexOffset(int FrameIndex,
const WinEHFuncInfo &FuncInfo) {
const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering();
unsigned UnusedReg;
- if (Asm->MAI->usesWindowsCFI())
- return TFI.getFrameIndexReferenceFromSP(*Asm->MF, FrameIndex, UnusedReg);
+ if (Asm->MAI->usesWindowsCFI()) {
+ int Offset =
+ TFI.getFrameIndexReferencePreferSP(*Asm->MF, FrameIndex, UnusedReg,
+ /*IgnoreSPUpdates*/ true);
+ assert(UnusedReg ==
+ Asm->MF->getSubtarget()
+ .getTargetLowering()
+ ->getStackPointerRegisterToSaveRestore());
+ return Offset;
+ }
+
// For 32-bit, offsets should be relative to the end of the EH registration
// node. For 64-bit, it's relative to SP at the end of the prologue.
assert(FuncInfo.EHRegNodeEndOffset != INT_MAX);
@@ -793,6 +801,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
const MCExpr *FrameAllocOffsetRef = nullptr;
if (HT.CatchObj.FrameIndex != INT_MAX) {
int Offset = getFrameIndexOffset(HT.CatchObj.FrameIndex, FuncInfo);
+ assert(Offset != 0 && "Illegal offset for catch object!");
FrameAllocOffsetRef = MCConstantExpr::create(Offset, Asm->OutContext);
} else {
FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext);
@@ -945,15 +954,42 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
// ScopeTableEntry ScopeRecord[];
// };
//
- // Only the EHCookieOffset field appears to vary, and it appears to be the
- // offset from the final saved SP value to the retaddr.
+ // Offsets are %ebp relative.
+ //
+ // The GS cookie is present only if the function needs stack protection.
+ // GSCookieOffset = -2 means that the GS cookie is not used.
+ //
+ // The EH cookie is always present.
+ //
+ // The check is done the following way:
+ // (ebp+CookieXOROffset) ^ [ebp+CookieOffset] == _security_cookie
+
+ // Retrieve the Guard Stack slot.
+ int GSCookieOffset = -2;
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ if (MFI->hasStackProtectorIndex()) {
+ unsigned UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int SSPIdx = MFI->getStackProtectorIndex();
+ GSCookieOffset = TFI->getFrameIndexReference(*MF, SSPIdx, UnusedReg);
+ }
+
+ // Retrieve the EH Guard slot.
+ // TODO(etienneb): Get rid of this value and replace it with an assertion.
+ int EHCookieOffset = 9999;
+ if (FuncInfo.EHGuardFrameIndex != INT_MAX) {
+ unsigned UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int EHGuardIdx = FuncInfo.EHGuardFrameIndex;
+ EHCookieOffset = TFI->getFrameIndexReference(*MF, EHGuardIdx, UnusedReg);
+ }
+
AddComment("GSCookieOffset");
- OS.EmitIntValue(-2, 4);
+ OS.EmitIntValue(GSCookieOffset, 4);
AddComment("GSCookieXOROffset");
OS.EmitIntValue(0, 4);
- // FIXME: Calculate.
AddComment("EHCookieOffset");
- OS.EmitIntValue(9999, 4);
+ OS.EmitIntValue(EHCookieOffset, 4);
AddComment("EHCookieXOROffset");
OS.EmitIntValue(0, 4);
BaseState = -2;
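
To make the scope-table comment above concrete, here is a sketch of the cookie validation those %ebp-relative offsets feed into. It is not part of the patch and is simplified relative to the real 32-bit SEH runtime (_except_handler4); the helper names are illustrative, and the extern cookie declaration assumes the usual MSVC CRT global.

#include <cstdint>

extern "C" uintptr_t __security_cookie; // CRT-provided /GS cookie

// One cookie check, per the formula in the comment above:
//   (ebp + CookieXOROffset) ^ [ebp + CookieOffset] == _security_cookie
static bool cookieIsIntact(uintptr_t Ebp, int32_t CookieOffset,
                           int32_t CookieXOROffset) {
  uintptr_t Saved = *reinterpret_cast<const uintptr_t *>(Ebp + CookieOffset);
  return ((Ebp + CookieXOROffset) ^ Saved) == __security_cookie;
}

bool checkScopeTableCookies(uintptr_t Ebp, int32_t GSCookieOffset,
                            int32_t GSCookieXOROffset, int32_t EHCookieOffset,
                            int32_t EHCookieXOROffset) {
  // GSCookieOffset == -2 is the "no GS cookie" sentinel emitted above.
  if (GSCookieOffset != -2 &&
      !cookieIsIntact(Ebp, GSCookieOffset, GSCookieXOROffset))
    return false;
  // The EH cookie is always present.
  return cookieIsIntact(Ebp, EHCookieOffset, EHCookieXOROffset);
}
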
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index d12fdb246984..bf5cf105a8f8 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -8,10 +8,10 @@
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
-// target specific instruction which implement the same semantics in a way
-// which better fits the target backend. This can include the use of either
-// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
-// type coercions.
+// __atomic_* library calls, or target-specific instructions which implement the
+// same semantics in a way which better fits the target backend. This can
+// include the use of (intrinsic-based) load-linked/store-conditional loops,
+// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//
@@ -57,25 +57,121 @@ namespace {
StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
bool expandAtomicStore(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
- bool expandAtomicOpToLLSC(
- Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
- std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
+ Value *
+ insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
+ void expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
+ void expandPartwordAtomicRMW(
+ AtomicRMWInst *I,
+ TargetLoweringBase::AtomicExpansionKind ExpansionKind);
+ void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
+
+ AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
+ static Value *insertRMWCmpXchgLoop(
+ IRBuilder<> &Builder, Type *ResultType, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg);
+
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool isIdempotentRMW(AtomicRMWInst *AI);
bool simplifyIdempotentRMW(AtomicRMWInst *AI);
+
+ bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
+ Value *PointerOperand, Value *ValueOperand,
+ Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2,
+ ArrayRef<RTLIB::Libcall> Libcalls);
+ void expandAtomicLoadToLibcall(LoadInst *LI);
+ void expandAtomicStoreToLibcall(StoreInst *LI);
+ void expandAtomicRMWToLibcall(AtomicRMWInst *I);
+ void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
+
+ friend bool
+ llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg);
};
}
char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
-INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
- "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
- false, false)
+INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
+ false, false)
FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
return new AtomicExpand(TM);
}
+namespace {
+// Helper functions to retrieve the size of atomic instructions.
+unsigned getAtomicOpSize(LoadInst *LI) {
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(LI->getType());
+}
+
+unsigned getAtomicOpSize(StoreInst *SI) {
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(SI->getValueOperand()->getType());
+}
+
+unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
+ const DataLayout &DL = RMWI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
+}
+
+unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
+ const DataLayout &DL = CASI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
+}
+
+// Helper functions to retrieve the alignment of atomic instructions.
+unsigned getAtomicOpAlign(LoadInst *LI) {
+ unsigned Align = LI->getAlignment();
+ // In the future, if this IR restriction is relaxed, we should
+ // return DataLayout::getABITypeAlignment when there's no align
+ // value.
+ assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
+ return Align;
+}
+
+unsigned getAtomicOpAlign(StoreInst *SI) {
+ unsigned Align = SI->getAlignment();
+ // In the future, if this IR restriction is relaxed, we should
+ // return DataLayout::getABITypeAlignment when there's no align
+ // value.
+ assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
+ return Align;
+}
+
+unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
+ // TODO(PR27168): This instruction has no alignment attribute, but unlike the
+ // default alignment for load/store, the default here is to assume
+ // it has NATURAL alignment, not DataLayout-specified alignment.
+ const DataLayout &DL = RMWI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
+}
+
+unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
+ // TODO(PR27168): same comment as above.
+ const DataLayout &DL = CASI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
+}
+
+// Determine whether a particular atomic operation has a supported size and
+// sufficient alignment to be passed through to target lowering, as opposed
+// to being turned into an __atomic* libcall.
+template <typename Inst>
+bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+ return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+}
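+
+// For illustration (the 64-bit limit below is an assumption, not a property
+// of any particular target): with getMaxAtomicSizeInBitsSupported() == 64,
+//   load atomic i32, i32* %p seq_cst, align 4   ; Size 4, Align 4 -> lowered
+//   load atomic i64, i64* %p seq_cst, align 4   ; Align 4 < Size 8 -> libcall
+//   cmpxchg i128* %p, i128 %o, i128 %n seq_cst seq_cst ; Size 16 > 8 -> libcall
+// The first instruction is handed to target lowering; the other two take the
+// __atomic libcall path in runOnFunction below.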
+
+} // end anonymous namespace
+
bool AtomicExpand::runOnFunction(Function &F) {
if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
return false;
@@ -85,9 +181,10 @@ bool AtomicExpand::runOnFunction(Function &F) {
// Changing control-flow while iterating through it is a bad idea, so gather a
// list of all atomic instructions before we start.
- for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
- if (I->isAtomic())
- AtomicInsts.push_back(&*I);
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+ if (I->isAtomic() && !isa<FenceInst>(I))
+ AtomicInsts.push_back(I);
}
bool MadeChange = false;
@@ -96,41 +193,67 @@ bool AtomicExpand::runOnFunction(Function &F) {
auto SI = dyn_cast<StoreInst>(I);
auto RMWI = dyn_cast<AtomicRMWInst>(I);
auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
- assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) &&
- "Unknown atomic instruction");
+ assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
+
+ // If the Size/Alignment is not supported, replace with a libcall.
+ if (LI) {
+ if (!atomicSizeSupported(TLI, LI)) {
+ expandAtomicLoadToLibcall(LI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (SI) {
+ if (!atomicSizeSupported(TLI, SI)) {
+ expandAtomicStoreToLibcall(SI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (RMWI) {
+ if (!atomicSizeSupported(TLI, RMWI)) {
+ expandAtomicRMWToLibcall(RMWI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (CASI) {
+ if (!atomicSizeSupported(TLI, CASI)) {
+ expandAtomicCASToLibcall(CASI);
+ MadeChange = true;
+ continue;
+ }
+ }
- auto FenceOrdering = Monotonic;
- bool IsStore, IsLoad;
- if (TLI->getInsertFencesForAtomic()) {
- if (LI && isAtLeastAcquire(LI->getOrdering())) {
+ if (TLI->shouldInsertFencesForAtomic(I)) {
+ auto FenceOrdering = AtomicOrdering::Monotonic;
+ bool IsStore, IsLoad;
+ if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
- LI->setOrdering(Monotonic);
+ LI->setOrdering(AtomicOrdering::Monotonic);
IsStore = false;
IsLoad = true;
- } else if (SI && isAtLeastRelease(SI->getOrdering())) {
+ } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
FenceOrdering = SI->getOrdering();
- SI->setOrdering(Monotonic);
+ SI->setOrdering(AtomicOrdering::Monotonic);
IsStore = true;
IsLoad = false;
- } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) ||
- isAtLeastAcquire(RMWI->getOrdering()))) {
+ } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
+ isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
- RMWI->setOrdering(Monotonic);
+ RMWI->setOrdering(AtomicOrdering::Monotonic);
IsStore = IsLoad = true;
} else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
- (isAtLeastRelease(CASI->getSuccessOrdering()) ||
- isAtLeastAcquire(CASI->getSuccessOrdering()))) {
+ (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getSuccessOrdering()))) {
// If a compare and swap is lowered to LL/SC, we can do smarter fence
// insertion, with a stronger one on the success path than on the
// failure path. As a result, fence insertion is directly done by
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getSuccessOrdering();
- CASI->setSuccessOrdering(Monotonic);
- CASI->setFailureOrdering(Monotonic);
+ CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
+ CASI->setFailureOrdering(AtomicOrdering::Monotonic);
IsStore = IsLoad = true;
}
- if (FenceOrdering != Monotonic) {
+ if (FenceOrdering != AtomicOrdering::Monotonic) {
MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
}
}
@@ -143,7 +266,7 @@ bool AtomicExpand::runOnFunction(Function &F) {
assert(LI->getType()->isIntegerTy() && "invariant broken");
MadeChange = true;
}
-
+
MadeChange |= tryExpandAtomicLoad(LI);
} else if (SI) {
if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
@@ -168,8 +291,30 @@ bool AtomicExpand::runOnFunction(Function &F) {
} else {
MadeChange |= tryExpandAtomicRMW(RMWI);
}
- } else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI)) {
- MadeChange |= expandAtomicCmpXchg(CASI);
+ } else if (CASI) {
+ // TODO: when we're ready to make the change at the IR level, we can
+ // extend convertCmpXchgToIntegerType for floating point too.
+ assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
+ "unimplemented - floating point not legal at IR level");
+ if (CASI->getCompareOperand()->getType()->isPointerTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ CASI = convertCmpXchgToIntegerType(CASI);
+ assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
+ "invariant broken");
+ MadeChange = true;
+ }
+
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(CASI);
+ if (ValueSize < MinCASSize) {
+ assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
+ "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
+ expandPartwordCmpXchg(CASI);
+ } else {
+ if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
+ MadeChange |= expandAtomicCmpXchg(CASI);
+ }
}
}
return MadeChange;
@@ -206,7 +351,7 @@ IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
}
/// Convert an atomic load of a non-integral type to an integer load of the
-/// equivelent bitwidth. See the function comment on
+/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
auto *M = LI->getModule();
@@ -237,9 +382,10 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
case TargetLoweringBase::AtomicExpansionKind::LLSC:
- return expandAtomicOpToLLSC(
- LI, LI->getPointerOperand(), LI->getOrdering(),
+ expandAtomicOpToLLSC(
+ LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
[](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+ return true;
case TargetLoweringBase::AtomicExpansionKind::LLOnly:
return expandAtomicLoadToLL(LI);
case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
@@ -283,7 +429,7 @@ bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
}
/// Convert an atomic store of a non-integral type to an integer store of the
-/// equivelent bitwidth. We used to not support floating point or vector
+/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of a atomic
/// float or vector store. The long term plan is to teach each backend to
@@ -380,32 +526,353 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
- case TargetLoweringBase::AtomicExpansionKind::LLSC:
- return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
- [&](IRBuilder<> &Builder, Value *Loaded) {
- return performAtomicOp(AI->getOperation(),
- Builder, Loaded,
- AI->getValOperand());
- });
- case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
- return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
+ case TargetLoweringBase::AtomicExpansionKind::LLSC: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ llvm_unreachable(
+ "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
+ } else {
+ auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performAtomicOp(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ };
+ expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
+ AI->getOrdering(), PerformOp);
+ }
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::CmpXChg);
+ } else {
+ expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
+ }
+ return true;
+ }
default:
llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
}
}
-bool AtomicExpand::expandAtomicOpToLLSC(
- Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
- std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
+namespace {
+
+/// Result values from createMaskInstrs helper.
+struct PartwordMaskValues {
+ Type *WordType;
+ Type *ValueType;
+ Value *AlignedAddr;
+ Value *ShiftAmt;
+ Value *Mask;
+ Value *Inv_Mask;
+};
+} // end anonymous namespace
+
+/// This is a helper function which builds instructions to provide
+/// values necessary for partword atomic operations. It takes an
+/// incoming address, Addr, and ValueType, and constructs the address,
+/// shift-amounts and masks needed to work with a larger value of size
+/// WordSize.
+///
+/// AlignedAddr: Addr rounded down to a multiple of WordSize
+///
+/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
+/// from AlignedAddr for it to have the same value as if
+/// ValueType was loaded from Addr.
+///
+/// Mask: Value to mask with the value loaded from AlignedAddr to
+/// include only the part that would've been loaded from Addr.
+///
+/// Inv_Mask: The inverse of Mask.
+
+static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
+ Type *ValueType, Value *Addr,
+ unsigned WordSize) {
+ PartwordMaskValues Ret;
+
BasicBlock *BB = I->getParent();
Function *F = BB->getParent();
+ Module *M = I->getModule();
+
LLVMContext &Ctx = F->getContext();
+ const DataLayout &DL = M->getDataLayout();
+
+ unsigned ValueSize = DL.getTypeStoreSize(ValueType);
+
+ assert(ValueSize < WordSize);
+
+ Ret.ValueType = ValueType;
+ Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
+
+ Type *WordPtrType =
+ Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
+
+ Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
+ Ret.AlignedAddr = Builder.CreateIntToPtr(
+ Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
+ "AlignedAddr");
+
+ Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
+ if (DL.isLittleEndian()) {
+ // turn bytes into bits
+ Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
+ } else {
+ // turn bytes into bits, and count from the other side.
+ Ret.ShiftAmt =
+ Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
+ }
+
+ Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
+ Ret.Mask = Builder.CreateShl(
+ ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
+ "Mask");
+ Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
+
+ return Ret;
+}
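+
+// Worked example for createMaskInstrs (illustrative numbers only): assuming
+// WordSize == 4, ValueType == i16 and an address with Addr % 4 == 2, on a
+// little-endian target this produces:
+//   AlignedAddr = Addr & ~3
+//   PtrLSB      = 2
+//   ShiftAmt    = 2 * 8  = 16
+//   Mask        = 0xFFFF << 16 = 0xFFFF0000
+//   Inv_Mask    = 0x0000FFFF
+// On a big-endian target the shift counts from the other end:
+//   ShiftAmt    = ((2 ^ (4 - 2)) * 8) = 0, so Mask = 0x0000FFFF.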
+
+/// Emit IR to implement a masked version of a given atomicrmw
+/// operation. (That is, only the bits under the Mask should be
+/// affected by the operation)
+static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
+ IRBuilder<> &Builder, Value *Loaded,
+ Value *Shifted_Inc, Value *Inc,
+ const PartwordMaskValues &PMV) {
+ switch (Op) {
+ case AtomicRMWInst::Xchg: {
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ // Or/Xor won't affect any other bits, so can just be done
+ // directly.
+ return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Nand: {
+ // The other arithmetic ops need to be masked into place.
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
+ Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin: {
+ // Finally, comparison ops will operate on the full value, so
+ // truncate down to the original size, and expand out again after
+ // doing the operation.
+ Value *Loaded_Shiftdown = Builder.CreateTrunc(
+ Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
+ Value *NewVal_Shiftup = Builder.CreateShl(
+ Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
+ return FinalVal;
+ }
+ default:
+ llvm_unreachable("Unknown atomic op");
+ }
+}
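+
+// Worked example for the masked Add case above (illustrative values only):
+// with Mask == 0x0000FF00, Inv_Mask == 0xFFFF00FF, Loaded == 0xAABBCCDD and
+// Shifted_Inc == 0x00000100 (an i8 increment of 1 shifted into place):
+//   NewVal         = 0xAABBCCDD + 0x00000100 = 0xAABBCDDD
+//   NewVal_Masked  = 0x0000CD00
+//   Loaded_MaskOut = 0xAABB00DD
+//   FinalVal       = 0xAABBCDDD
+// Only the selected byte changes; the bits outside the mask always come from
+// Loaded, so a carry out of the byte (or the zeroing effect of And) cannot
+// corrupt the neighbouring bytes.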
+
+/// Expand a sub-word atomicrmw operation into an appropriate
+/// word-sized operation.
+///
+/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
+/// way as a typical atomicrmw expansion. The only difference here is
+/// that the operation inside the loop must operate on only part of
+/// the value, leaving the rest of the word unchanged.
+void AtomicExpand::expandPartwordAtomicRMW(
+ AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
+
+ assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
+
+ AtomicOrdering MemOpOrder = AI->getOrdering();
+
+ IRBuilder<> Builder(AI);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
+ TLI->getMinCmpXchgSizeInBits() / 8);
+
+ Value *ValOperand_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
+ PMV.ShiftAmt, "ValOperand_Shifted");
+
+ auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
+ ValOperand_Shifted, AI->getValOperand(), PMV);
+ };
+
+ // TODO: When we're ready to support LLSC conversions too, use
+ // insertRMWLLSCLoop here for ExpansionKind==LLSC.
+ Value *OldResult =
+ insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
+ PerformPartwordOp, createCmpXchgInstFun);
+ Value *FinalOldResult = Builder.CreateTrunc(
+ Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ AI->replaceAllUsesWith(FinalOldResult);
+ AI->eraseFromParent();
+}
+
+void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
+ // The basic idea here is that we're expanding a cmpxchg of a
+ // smaller memory size up to a word-sized cmpxchg. To do this, we
+ // need to add a retry-loop for strong cmpxchg, so that
+ // modifications to other parts of the word don't cause a spurious
+ // failure.
+
+ // This generates code like the following:
+ // [[Setup mask values PMV.*]]
+ // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
+ // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
+ // %InitLoaded = load i32* %addr
+ // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
+ // br partword.cmpxchg.loop
+ // partword.cmpxchg.loop:
+ // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
+ // [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
+ // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
+ // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
+ // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
+ // i32 %FullWord_NewVal success_ordering failure_ordering
+ // %OldVal = extractvalue { i32, i1 } %NewCI, 0
+ // %Success = extractvalue { i32, i1 } %NewCI, 1
+ // br i1 %Success, label %partword.cmpxchg.end,
+ // label %partword.cmpxchg.failure
+ // partword.cmpxchg.failure:
+ // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
+ // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
+ // br i1 %ShouldContinue, label %partword.cmpxchg.loop,
+ // label %partword.cmpxchg.end
+ // partword.cmpxchg.end:
+ // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
+ // %FinalOldVal = trunc i32 %tmp1 to i8
+ // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
+ // %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1
+
+ Value *Addr = CI->getPointerOperand();
+ Value *Cmp = CI->getCompareOperand();
+ Value *NewVal = CI->getNewValOperand();
+
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ IRBuilder<> Builder(CI);
+ LLVMContext &Ctx = Builder.getContext();
+
+ const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
+
+ BasicBlock *EndBB =
+ BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
+ auto FailureBB =
+ BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
+ auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
+
+ // The split call above "helpfully" added a branch at the end of BB
+ // (to the wrong place).
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+
+ PartwordMaskValues PMV = createMaskInstrs(
+ Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
+
+ // Shift the incoming values over, into the right location in the word.
+ Value *NewVal_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
+ Value *Cmp_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
+
+ // Load the entire current word, and mask into place the expected and new
+ // values
+ LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
+ InitLoaded->setVolatile(CI->isVolatile());
+ Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
+ Builder.CreateBr(LoopBB);
+
+ // partword.cmpxchg.loop:
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
+ Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
+
+ // Mask/Or the expected and new values into place in the loaded word.
+ Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
+ Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
+ AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
+ PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
+ CI->getFailureOrdering(), CI->getSynchScope());
+ NewCI->setVolatile(CI->isVolatile());
+ // When we're building a strong cmpxchg, we need a loop, so you
+ // might think we could use a weak cmpxchg inside. But, using strong
+ // allows the below comparison for ShouldContinue, and we're
+ // expecting the underlying cmpxchg to be a machine instruction,
+ // which is strong anyways.
+ NewCI->setWeak(CI->isWeak());
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Success = Builder.CreateExtractValue(NewCI, 1);
+
+ if (CI->isWeak())
+ Builder.CreateBr(EndBB);
+ else
+ Builder.CreateCondBr(Success, EndBB, FailureBB);
+
+ // partword.cmpxchg.failure:
+ Builder.SetInsertPoint(FailureBB);
+ // Upon failure, check whether the masked-out part of the loaded value
+ // changed. If it did not, the masked-in part must have, so the cmpxchg
+ // genuinely failed and we exit; otherwise retry the loop.
+ Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
+ Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
+ Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
+
+ // Add the second value to the phi from above
+ Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
+
+ // partword.cmpxchg.end:
+ Builder.SetInsertPoint(CI);
+
+ Value *FinalOldVal = Builder.CreateTrunc(
+ Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
+ Value *Res = UndefValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+}
+
+void AtomicExpand::expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
+ IRBuilder<> Builder(I);
+ Value *Loaded =
+ insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
+
+ I->replaceAllUsesWith(Loaded);
+ I->eraseFromParent();
+}
+
+Value *AtomicExpand::insertRMWLLSCLoop(
+ IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
// Given: atomicrmw some_op iN* %addr, iN %incr ordering
//
// The standard expansion we produce is:
// [...]
- // fence?
// atomicrmw.start:
// %loaded = @load.linked(%addr)
// %new = some_op iN %loaded, %incr
@@ -413,17 +880,13 @@ bool AtomicExpand::expandAtomicOpToLLSC(
// %try_again = icmp i32 ne %stored, 0
// br i1 %try_again, label %loop, label %atomicrmw.end
// atomicrmw.end:
- // fence?
// [...]
- BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
- // This grabs the DebugLoc from I.
- IRBuilder<> Builder(I);
-
// The split call above "helpfully" added a branch at the end of BB (to the
- // wrong place), but we might want a fence too. It's easiest to just remove
- // the branch entirely.
+ // wrong place).
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
Builder.CreateBr(LoopBB);
@@ -441,13 +904,53 @@ bool AtomicExpand::expandAtomicOpToLLSC(
Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return Loaded;
+}
- I->replaceAllUsesWith(Loaded);
- I->eraseFromParent();
+/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
+/// the equivalent bitwidth. Pointer cmpxchg was not originally supported in
+/// the IR. As a migration step, we convert back to what used to be the
+/// standard way to represent a pointer cmpxchg, so that backends can be
+/// updated one by one.
+AtomicCmpXchgInst *
+AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
+ auto *M = CI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
+ M->getDataLayout());
- return true;
+ IRBuilder<> Builder(CI);
+
+ Value *Addr = CI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
+ Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
+
+ auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
+ CI->getSuccessOrdering(),
+ CI->getFailureOrdering(),
+ CI->getSynchScope());
+ NewCI->setVolatile(CI->isVolatile());
+ NewCI->setWeak(CI->isWeak());
+ DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Succ = Builder.CreateExtractValue(NewCI, 1);
+
+ OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
+
+ Value *Res = UndefValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, OldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Succ, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return NewCI;
}
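+
+// For illustration of convertCmpXchgToIntegerType above (assuming 64-bit
+// pointers), a pointer cmpxchg such as:
+//   %r = cmpxchg i8** %p, i8* %old, i8* %new seq_cst seq_cst
+// becomes roughly:
+//   %p.int   = bitcast i8** %p to i64*
+//   %old.int = ptrtoint i8* %old to i64
+//   %new.int = ptrtoint i8* %new to i64
+//   %r.int   = cmpxchg i64* %p.int, i64 %old.int, i64 %new.int seq_cst seq_cst
+//   %v       = extractvalue { i64, i1 } %r.int, 0
+//   %ok      = extractvalue { i64, i1 } %r.int, 1
+//   %v.ptr   = inttoptr i64 %v to i8*
+// with { i8* %v.ptr, i1 %ok } reassembled for the original users of %r.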
+
bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
AtomicOrdering FailureOrder = CI->getFailureOrdering();
@@ -455,37 +958,71 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
BasicBlock *BB = CI->getParent();
Function *F = BB->getParent();
LLVMContext &Ctx = F->getContext();
- // If getInsertFencesForAtomic() returns true, then the target does not want
- // to deal with memory orders, and emitLeading/TrailingFence should take care
- // of everything. Otherwise, emitLeading/TrailingFence are no-op and we
+ // If shouldInsertFencesForAtomic() returns true, then the target does not
+ // want to deal with memory orders, and emitLeading/TrailingFence should take
+ // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
// should preserve the ordering.
+ bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
AtomicOrdering MemOpOrder =
- TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder;
+ ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
+
+ // In implementations which use a barrier to achieve release semantics, we can
+ // delay emitting this barrier until we know a store is actually going to be
+ // attempted. The cost of this delay is that we need 2 copies of the block
+ // emitting the load-linked, affecting code size.
+ //
+ // Ideally, this logic would be unconditional except for the minsize check
+ // since in other cases the extra blocks naturally collapse down to the
+ // minimal loop. Unfortunately, this puts too much stress on later
+ // optimisations so we avoid emitting the extra logic in those cases too.
+ bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
+ SuccessOrder != AtomicOrdering::Monotonic &&
+ SuccessOrder != AtomicOrdering::Acquire &&
+ !F->optForMinSize();
+
+ // There's no overhead for sinking the release barrier in a weak cmpxchg, so
+ // do it even on minsize.
+ bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
// Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
//
// The full expansion we produce is:
// [...]
- // fence?
// cmpxchg.start:
- // %loaded = @load.linked(%addr)
- // %should_store = icmp eq %loaded, %desired
- // br i1 %should_store, label %cmpxchg.trystore,
+ // %unreleasedload = @load.linked(%addr)
+ // %should_store = icmp eq %unreleasedload, %desired
+ // br i1 %should_store, label %cmpxchg.fencedstore,
// label %cmpxchg.nostore
+ // cmpxchg.fencedstore:
+ // fence?
+ // br label cmpxchg.trystore
// cmpxchg.trystore:
+ // %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
+ // [%releasedload, %cmpxchg.releasedload]
// %stored = @store_conditional(%new, %addr)
// %success = icmp eq i32 %stored, 0
- // br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure
+ // br i1 %success, label %cmpxchg.success,
+ // label %cmpxchg.releasedload/%cmpxchg.failure
+ // cmpxchg.releasedload:
+ // %releasedload = @load.linked(%addr)
+ // %should_store = icmp eq %releasedload, %desired
+ // br i1 %should_store, label %cmpxchg.trystore,
+ // label %cmpxchg.failure
// cmpxchg.success:
// fence?
// br label %cmpxchg.end
// cmpxchg.nostore:
+ // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
+ // [%releasedload,
+ // %cmpxchg.releasedload/%cmpxchg.trystore]
// @load_linked_fail_balance()?
// br label %cmpxchg.failure
// cmpxchg.failure:
// fence?
// br label %cmpxchg.end
// cmpxchg.end:
+ // %loaded = phi [%loaded.nostore, %cmpxchg.failure],
+ // [%loaded.trystore, %cmpxchg.success]
// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
// %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
@@ -494,8 +1031,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
- auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
- auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
+ auto ReleasedLoadBB =
+ BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
+ auto TryStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
+ auto ReleasingStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
+ auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
// This grabs the DebugLoc from CI
IRBuilder<> Builder(CI);
@@ -505,32 +1047,55 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// the branch entirely.
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
- Builder.CreateBr(LoopBB);
+ if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
- Builder.SetInsertPoint(LoopBB);
- Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
- Value *ShouldStore =
- Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
+ Builder.SetInsertPoint(StartBB);
+ Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *ShouldStore = Builder.CreateICmpEQ(
+ UnreleasedLoad, CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
- Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
+
+ Builder.SetInsertPoint(ReleasingStoreBB);
+ if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
Value *StoreSuccess = TLI->emitStoreConditional(
Builder, CI->getNewValOperand(), Addr, MemOpOrder);
StoreSuccess = Builder.CreateICmpEQ(
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+ BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
Builder.CreateCondBr(StoreSuccess, SuccessBB,
- CI->isWeak() ? FailureBB : LoopBB);
-
- // Make sure later instructions don't get reordered with a fence if necessary.
+ CI->isWeak() ? FailureBB : RetryBB);
+
+ Builder.SetInsertPoint(ReleasedLoadBB);
+ Value *SecondLoad;
+ if (HasReleasedLoadBB) {
+ SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
+ "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ } else
+ Builder.CreateUnreachable();
+
+ // Make sure later instructions don't get reordered with a fence if
+ // necessary.
Builder.SetInsertPoint(SuccessBB);
- TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ if (ShouldInsertFencesForAtomic)
+ TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
@@ -541,20 +1106,43 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(FailureBB);
Builder.SetInsertPoint(FailureBB);
- TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ if (ShouldInsertFencesForAtomic)
+ TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
Builder.CreateBr(ExitBB);
// Finally, we have control-flow based knowledge of whether the cmpxchg
// succeeded or not. We expose this to later passes by converting any
- // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.
-
- // Setup the builder so we can create any PHIs we need.
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
+ // PHI.
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+ // Setup the builder so we can create any PHIs we need.
+ Value *Loaded;
+ if (!HasReleasedLoadBB)
+ Loaded = UnreleasedLoad;
+ else {
+ Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
+ PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
+ TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
+
+ Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
+ PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
+ NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
+
+ Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
+ PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
+ ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
+
+ Loaded = ExitLoaded;
+ }
+
// Look for any users of the cmpxchg that are just comparing the loaded value
// against the desired one, and replace them with the CFG-derived version.
SmallVector<ExtractValueInst *, 2> PrunedInsts;
@@ -620,16 +1208,14 @@ bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
return false;
}
-bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
- CreateCmpXchgInstFun CreateCmpXchg) {
- assert(AI);
-
- AtomicOrdering MemOpOrder =
- AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
- Value *Addr = AI->getPointerOperand();
- BasicBlock *BB = AI->getParent();
+Value *AtomicExpand::insertRMWCmpXchgLoop(
+ IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
- LLVMContext &Ctx = F->getContext();
// Given: atomicrmw some_op iN* %addr, iN %incr ordering
//
@@ -646,34 +1232,34 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
// br i1 %success, label %atomicrmw.end, label %loop
// atomicrmw.end:
// [...]
- BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
- // This grabs the DebugLoc from AI.
- IRBuilder<> Builder(AI);
-
// The split call above "helpfully" added a branch at the end of BB (to the
// wrong place), but we want a load. It's easiest to just remove
// the branch entirely.
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
- LoadInst *InitLoaded = Builder.CreateLoad(Addr);
+ LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
// Atomics require at least natural alignment.
- InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
+ InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
Builder.CreateBr(LoopBB);
// Start the main loop block now that we've taken care of the preliminaries.
Builder.SetInsertPoint(LoopBB);
- PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+ PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
Loaded->addIncoming(InitLoaded, BB);
- Value *NewVal =
- performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+ Value *NewVal = PerformOp(Builder, Loaded);
Value *NewLoaded = nullptr;
Value *Success = nullptr;
- CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder,
+ CreateCmpXchg(Builder, Addr, Loaded, NewVal,
+ MemOpOrder == AtomicOrdering::Unordered
+ ? AtomicOrdering::Monotonic
+ : MemOpOrder,
Success, NewLoaded);
assert(Success && NewLoaded);
@@ -682,9 +1268,373 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
Builder.CreateCondBr(Success, ExitBB, LoopBB);
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return NewLoaded;
+}
- AI->replaceAllUsesWith(NewLoaded);
+// Note: This function is exposed externally by AtomicExpandUtils.h
+bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ IRBuilder<> Builder(AI);
+ Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
+ Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
+ [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performAtomicOp(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ },
+ CreateCmpXchg);
+
+ AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
+ return true;
+}
+// In order to use one of the sized library calls such as
+// __atomic_fetch_add_4, the alignment must be sufficient, the size
+// must be one of the potentially-specialized sizes, and the value
+// type must actually exist in C on the target (otherwise, the
+// function wouldn't actually be defined).
+static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
+ const DataLayout &DL) {
+ // TODO: "LargestSize" is an approximation for "largest type that
+ // you can express in C". It seems to be the case that int128 is
+ // supported on all 64-bit platforms, otherwise only up to 64-bit
+ // integers are supported. If we get this wrong, then we'll try to
+ // call a sized libcall that doesn't actually exist. There should
+ // really be some more reliable way in LLVM of determining integer
+ // sizes which are valid in the target's C ABI...
+ unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
+ return Align >= Size &&
+ (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
+ Size <= LargestSize;
+}
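+
+// Examples for canUseSizedAtomicCall (illustrative; the 64-bit figure is an
+// assumption about the target): on a target where
+// getLargestLegalIntTypeSizeInBits() >= 64, LargestSize is 16, so:
+//   Size 4,  Align 4  -> true  (a sized call such as __atomic_fetch_add_4)
+//   Size 4,  Align 2  -> false (underaligned; use the generic __atomic_* call)
+//   Size 12, Align 16 -> false (12 is not a specialized size)
+// On a target whose largest legal integer type is 32 bits, LargestSize is 8,
+// so a 16-byte operation also falls back to the generic call.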
+
+void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
+ RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
+}
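+
+// For illustration of expandAtomicLoadToLibcall: an i32 seq_cst load that the
+// target cannot lower natively becomes roughly (5 being memory_order_seq_cst
+// in the C ABI):
+//   %p.i8 = bitcast i32* %p to i8*
+//   %v    = call i32 @__atomic_load_4(i8* %p.i8, i32 5)
+// while an unsupported size or alignment takes the generic form instead:
+//   call void @__atomic_load(i64 <size>, i8* %p.i8, i8* %ret.alloca, i32 5)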
+
+void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
+ RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
+}
+
+void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
+ I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
+ Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
+}
+
+static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
+ static const RTLIB::Libcall LibcallsXchg[6] = {
+ RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
+ RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
+ RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
+ static const RTLIB::Libcall LibcallsAdd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
+ RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
+ RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
+ static const RTLIB::Libcall LibcallsSub[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
+ RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
+ RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
+ static const RTLIB::Libcall LibcallsAnd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
+ RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
+ RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
+ static const RTLIB::Libcall LibcallsOr[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
+ RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
+ RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
+ static const RTLIB::Libcall LibcallsXor[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
+ RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
+ RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
+ static const RTLIB::Libcall LibcallsNand[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
+ RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
+ RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
+
+ switch (Op) {
+ case AtomicRMWInst::BAD_BINOP:
+ llvm_unreachable("Should not have BAD_BINOP.");
+ case AtomicRMWInst::Xchg:
+ return makeArrayRef(LibcallsXchg);
+ case AtomicRMWInst::Add:
+ return makeArrayRef(LibcallsAdd);
+ case AtomicRMWInst::Sub:
+ return makeArrayRef(LibcallsSub);
+ case AtomicRMWInst::And:
+ return makeArrayRef(LibcallsAnd);
+ case AtomicRMWInst::Or:
+ return makeArrayRef(LibcallsOr);
+ case AtomicRMWInst::Xor:
+ return makeArrayRef(LibcallsXor);
+ case AtomicRMWInst::Nand:
+ return makeArrayRef(LibcallsNand);
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // No atomic libcalls are available for max/min/umax/umin.
+ return {};
+ }
+ llvm_unreachable("Unexpected AtomicRMW operation.");
+}
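+
+// Note on the table layout above: index 0 holds the generic (size_t-based)
+// libcall and indexes 1-5 hold the 1/2/4/8/16-byte variants, so e.g. an
+// 'atomicrmw add' of width 4 maps to __atomic_fetch_add_4. Only Xchg has a
+// generic entry (__atomic_exchange); the fetch_* operations are
+// size-specialized only, which is why UNKNOWN_LIBCALL sits at index 0 there.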
+
+void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
+ ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
+
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool Success = false;
+ if (!Libcalls.empty())
+ Success = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+
+ // The expansion failed: either there were no libcalls at all for
+ // the operation (min/max), or there were only size-specialized
+ // libcalls (add/sub/etc.) and we needed the generic form. So expand
+ // to a CAS loop instead, whose cmpxchg is in turn lowered to a CAS
+ // libcall.
+ if (!Success) {
+ expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
+ Value *Loaded, Value *NewVal,
+ AtomicOrdering MemOpOrder,
+ Value *&Success, Value *&NewLoaded) {
+ // Create the CAS instruction normally...
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, MemOpOrder,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
+ Success = Builder.CreateExtractValue(Pair, 1, "success");
+ NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+ // ...and then expand the CAS into a libcall.
+ expandAtomicCASToLibcall(Pair);
+ });
+ }
+}
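+
+// For illustration of the fallback above: 'atomicrmw max i32* %p, i32 %v' has
+// no __atomic_fetch_max_* libcall, so it is first rewritten into a
+// compare-exchange loop, and the cmpxchg inside that loop is then itself
+// lowered to __atomic_compare_exchange_4 (or the generic
+// __atomic_compare_exchange, depending on canUseSizedAtomicCall).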
+
+// A helper routine for the above expandAtomic*ToLibcall functions.
+//
+// 'Libcalls' contains an array of enum values for the particular
+// ATOMIC libcalls to be emitted. All of the other arguments besides
+// 'I' are extracted from the Instruction subclass by the
+// caller. Depending on the particular call, some will be null.
+bool AtomicExpand::expandAtomicOpToLibcall(
+ Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
+ Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
+ assert(Libcalls.size() == 6);
+
+ LLVMContext &Ctx = I->getContext();
+ Module *M = I->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IRBuilder<> Builder(I);
+ IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
+
+ bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
+ Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
+
+ unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
+
+ // TODO: the "order" argument type is "int", not int32. So
+ // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
+ ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
+ assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Constant *OrderingVal =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
+ Constant *Ordering2Val = nullptr;
+ if (CASExpected) {
+ assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Ordering2Val =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
+ }
+ bool HasResult = I->getType() != Type::getVoidTy(Ctx);
+
+ RTLIB::Libcall RTLibType;
+ if (UseSizedLibcall) {
+ switch (Size) {
+ case 1: RTLibType = Libcalls[1]; break;
+ case 2: RTLibType = Libcalls[2]; break;
+ case 4: RTLibType = Libcalls[3]; break;
+ case 8: RTLibType = Libcalls[4]; break;
+ case 16: RTLibType = Libcalls[5]; break;
+ }
+ } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
+ RTLibType = Libcalls[0];
+ } else {
+ // Can't use sized function, and there's no generic for this
+ // operation, so give up.
+ return false;
+ }
+
+ // Build up the function call. There are two kinds. First, the sized
+ // variants. These calls are going to be one of the following (with
+ // N=1,2,4,8,16):
+ // iN __atomic_load_N(iN *ptr, int ordering)
+ // void __atomic_store_N(iN *ptr, iN val, int ordering)
+ // iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
+ // bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
+ // int success_order, int failure_order)
+ //
+ // Note that these functions can be used for non-integer atomic
+ // operations; the values just need to be bitcast to integers on the
+ // way in and out.
+ //
+ // And, then, the generic variants. They look like the following:
+ // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
+ // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
+ // void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
+ // int ordering)
+ // bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
+ // void *desired, int success_order,
+ // int failure_order)
+ //
+ // The different signatures are built up depending on the
+ // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
+ // variables.
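+ //
+ // For example (an illustrative sketch assuming seq_cst orderings, shown as
+ // 5 in the C ABI), a 16-byte cmpxchg that cannot use the sized variant is
+ // emitted roughly as:
+ //   %exp = alloca i128              ; in the entry block
+ //   store i128 %expected, i128* %exp
+ //   %des = alloca i128
+ //   store i128 %desired, i128* %des
+ //   %ok = call zeroext i1 @__atomic_compare_exchange(i64 16, i8* %p.i8,
+ //             i8* %exp.i8, i8* %des.i8, i32 5, i32 5)
+ //   %old = load i128, i128* %exp    ; the value observed in memory
+ // with the original { i128, i1 } result rebuilt from %old and %ok.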
+
+ AllocaInst *AllocaCASExpected = nullptr;
+ Value *AllocaCASExpected_i8 = nullptr;
+ AllocaInst *AllocaValue = nullptr;
+ Value *AllocaValue_i8 = nullptr;
+ AllocaInst *AllocaResult = nullptr;
+ Value *AllocaResult_i8 = nullptr;
+
+ Type *ResultTy;
+ SmallVector<Value *, 6> Args;
+ AttributeSet Attr;
+
+ // 'size' argument.
+ if (!UseSizedLibcall) {
+ // Note, getIntPtrType is assumed equivalent to size_t.
+ Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
+ }
+
+ // 'ptr' argument.
+ Value *PtrVal =
+ Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
+ Args.push_back(PtrVal);
+
+ // 'expected' argument, if present.
+ if (CASExpected) {
+ AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
+ AllocaCASExpected->setAlignment(AllocaAlignment);
+ AllocaCASExpected_i8 =
+ Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
+ Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
+ Args.push_back(AllocaCASExpected_i8);
+ }
+
+ // 'val' argument ('desired' for cas), if present.
+ if (ValueOperand) {
+ if (UseSizedLibcall) {
+ Value *IntValue =
+ Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
+ Args.push_back(IntValue);
+ } else {
+ AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
+ AllocaValue->setAlignment(AllocaAlignment);
+ AllocaValue_i8 =
+ Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
+ Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
+ Args.push_back(AllocaValue_i8);
+ }
+ }
+
+ // 'ret' argument.
+ if (!CASExpected && HasResult && !UseSizedLibcall) {
+ AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
+ AllocaResult->setAlignment(AllocaAlignment);
+ AllocaResult_i8 =
+ Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
+ Args.push_back(AllocaResult_i8);
+ }
+
+ // 'ordering' ('success_order' for cas) argument.
+ Args.push_back(OrderingVal);
+
+ // 'failure_order' argument, if present.
+ if (Ordering2Val)
+ Args.push_back(Ordering2Val);
+
+ // Now, the return type.
+ if (CASExpected) {
+ ResultTy = Type::getInt1Ty(Ctx);
+ Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
+ } else if (HasResult && UseSizedLibcall)
+ ResultTy = SizedIntTy;
+ else
+ ResultTy = Type::getVoidTy(Ctx);
+
+ // Done with setting up arguments and return types, create the call:
+ SmallVector<Type *, 6> ArgTys;
+ for (Value *Arg : Args)
+ ArgTys.push_back(Arg->getType());
+ FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
+ Constant *LibcallFn =
+ M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
+ CallInst *Call = Builder.CreateCall(LibcallFn, Args);
+ Call->setAttributes(Attr);
+ Value *Result = Call;
+
+ // And then, extract the results...
+ if (ValueOperand && !UseSizedLibcall)
+ Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
+
+ if (CASExpected) {
+ // The final result from the CAS is {load of 'expected' alloca, bool result
+ // from call}
+ Type *FinalResultTy = I->getType();
+ Value *V = UndefValue::get(FinalResultTy);
+ Value *ExpectedOut =
+ Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
+ V = Builder.CreateInsertValue(V, ExpectedOut, 0);
+ V = Builder.CreateInsertValue(V, Result, 1);
+ I->replaceAllUsesWith(V);
+ } else if (HasResult) {
+ Value *V;
+ if (UseSizedLibcall)
+ V = Builder.CreateBitOrPointerCast(Result, I->getType());
+ else {
+ V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
+ }
+ I->replaceAllUsesWith(V);
+ }
+ I->eraseFromParent();
return true;
}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index df5cac5a9f7a..fa705761645f 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -27,10 +27,11 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -90,7 +91,7 @@ INITIALIZE_PASS(BranchFolderPass, "branch-folder",
"Control Flow Optimizer", false, false)
bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
@@ -98,8 +99,9 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
// HW that requires structurized CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
PassConfig->getEnableTailMerge();
- BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true,
- getAnalysis<MachineBlockFrequencyInfo>(),
+ BranchFolder::MBFIWrapper MBBFreqInfo(
+ getAnalysis<MachineBlockFrequencyInfo>());
+ BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
getAnalysis<MachineBranchProbabilityInfo>());
return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
MF.getSubtarget().getRegisterInfo(),
@@ -107,7 +109,7 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
}
BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
- const MachineBlockFrequencyInfo &FreqInfo,
+ MBFIWrapper &FreqInfo,
const MachineBranchProbabilityInfo &ProbInfo)
: EnableHoistCommonCode(CommonHoist), MBBFreqInfo(FreqInfo),
MBPI(ProbInfo) {
@@ -135,6 +137,8 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
// Remove the block.
MF->erase(MBB);
FuncletMembership.erase(MBB);
+ if (MLI)
+ MLI->removeBlock(MBB);
}
/// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def
@@ -167,7 +171,7 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator FirstTerm = I;
while (I != MBB->end()) {
- if (!TII->isUnpredicatedTerminator(I))
+ if (!TII->isUnpredicatedTerminator(*I))
return false;
// See if it uses any of the implicitly defined registers.
for (const MachineOperand &MO : I->operands()) {
@@ -191,25 +195,26 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
}
/// OptimizeFunction - Perform branch folding, tail merging and other
-/// CFG optimizations on the given function.
+/// CFG optimizations on the given function. Block placement changes the layout
+/// and may create new tail merging opportunities.
bool BranchFolder::OptimizeFunction(MachineFunction &MF,
const TargetInstrInfo *tii,
const TargetRegisterInfo *tri,
- MachineModuleInfo *mmi) {
+ MachineModuleInfo *mmi,
+ MachineLoopInfo *mli, bool AfterPlacement) {
if (!tii) return false;
TriedMerging.clear();
+ AfterBlockPlacement = AfterPlacement;
TII = tii;
TRI = tri;
MMI = mmi;
- RS = nullptr;
+ MLI = mli;
- // Use a RegScavenger to help update liveness when required.
MachineRegisterInfo &MRI = MF.getRegInfo();
- if (MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
- RS = new RegScavenger();
- else
+ UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
+ if (!UpdateLiveIns)
MRI.invalidateLiveness();
// Fix CFG. The later algorithms expect it to be right.
@@ -217,7 +222,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, true))
+ if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
MadeChange |= OptimizeImpDefsBlock(&MBB);
}
@@ -228,7 +233,10 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
bool MadeChangeThisIteration = true;
while (MadeChangeThisIteration) {
MadeChangeThisIteration = TailMergeBlocks(MF);
- MadeChangeThisIteration |= OptimizeBranches(MF);
+ // After block placement, there is no need to clean up if tail merging
+ // did not change anything.
+ if (!AfterBlockPlacement || MadeChangeThisIteration)
+ MadeChangeThisIteration |= OptimizeBranches(MF);
if (EnableHoistCommonCode)
MadeChangeThisIteration |= HoistCommonCode(MF);
MadeChange |= MadeChangeThisIteration;
@@ -237,10 +245,8 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
// See if any jump tables have become dead as the code generator
// did its thing.
MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
- if (!JTI) {
- delete RS;
+ if (!JTI)
return MadeChange;
- }
// Walk the function to find jump tables that are live.
BitVector JTIsLive(JTI->getJumpTables().size());
@@ -262,7 +268,6 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
MadeChange = true;
}
- delete RS;
return MadeChange;
}
@@ -271,10 +276,10 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
//===----------------------------------------------------------------------===//
/// HashMachineInstr - Compute a hash value for MI and its operands.
-static unsigned HashMachineInstr(const MachineInstr *MI) {
- unsigned Hash = MI->getOpcode();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &Op = MI->getOperand(i);
+static unsigned HashMachineInstr(const MachineInstr &MI) {
+ unsigned Hash = MI.getOpcode();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
// Merge in bits from the operand if easy. We can't use MachineOperand's
// hash_code here because it's not deterministic and we sort by hash value
@@ -311,12 +316,12 @@ static unsigned HashMachineInstr(const MachineInstr *MI) {
}
/// HashEndOfMBB - Hash the last instruction in the MBB.
-static unsigned HashEndOfMBB(const MachineBasicBlock *MBB) {
- MachineBasicBlock::const_iterator I = MBB->getLastNonDebugInstr();
- if (I == MBB->end())
+static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
+ MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
return 0;
- return HashMachineInstr(I);
+ return HashMachineInstr(*I);
}
/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
@@ -357,7 +362,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
--I2;
}
// I1, I2==first (untested) non-DBGs preceding known match
- if (!I1->isIdenticalTo(I2) ||
+ if (!I1->isIdenticalTo(*I2) ||
// FIXME: This check is dubious. It's used to get around a problem where
// people incorrectly expect inline asm directives to remain in the same
// relative order. This is untenable because normal compiler
@@ -394,15 +399,27 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
return TailLen;
}
-void BranchFolder::MaintainLiveIns(MachineBasicBlock *CurMBB,
- MachineBasicBlock *NewMBB) {
- if (RS) {
- RS->enterBasicBlock(CurMBB);
- if (!CurMBB->empty())
- RS->forward(std::prev(CurMBB->end()));
- for (unsigned int i = 1, e = TRI->getNumRegs(); i != e; i++)
- if (RS->isRegUsed(i, false))
- NewMBB->addLiveIn(i);
+void BranchFolder::computeLiveIns(MachineBasicBlock &MBB) {
+ if (!UpdateLiveIns)
+ return;
+
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveOutsNoPristines(MBB);
+ for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend()))
+ LiveRegs.stepBackward(MI);
+
+ for (unsigned Reg : LiveRegs) {
+ // Skip the register if we are about to add one of its super registers.
+ bool ContainsSuperReg = false;
+ for (MCSuperRegIterator SReg(Reg, TRI); SReg.isValid(); ++SReg) {
+ if (LiveRegs.contains(*SReg)) {
+ ContainsSuperReg = true;
+ break;
+ }
+ }
+ if (ContainsSuperReg)
+ continue;
+ MBB.addLiveIn(Reg);
}
}
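
computeLiveIns above replaces the register scavenger with the usual backward liveness recurrence: start from the block's live-outs and, for each instruction from the bottom up, remove its defs and add its uses (LiveRegs.stepBackward does exactly that; the super-register filtering is an LLVM-specific detail omitted here). A self-contained sketch of the recurrence over plain register ids, offered only as an illustration of the idea:

#include <set>
#include <vector>

struct Instr {
  std::vector<unsigned> Defs; // registers written by the instruction
  std::vector<unsigned> Uses; // registers read by the instruction
};

// A register is live before an instruction if it is live after it and not
// defined by it, or if the instruction uses it. Walking the whole block
// backwards from the live-outs therefore yields the live-ins.
static std::set<unsigned> computeLiveInsSketch(const std::vector<Instr> &Block,
                                               std::set<unsigned> Live /*live-outs*/) {
  for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
    for (unsigned R : I->Defs)
      Live.erase(R);
    for (unsigned R : I->Uses)
      Live.insert(R);
  }
  return Live; // live-ins of the block
}
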
@@ -410,12 +427,9 @@ void BranchFolder::MaintainLiveIns(MachineBasicBlock *CurMBB,
/// after it, replacing it with an unconditional branch to NewDest.
void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest) {
- MachineBasicBlock *CurMBB = OldInst->getParent();
-
TII->ReplaceTailWithBranchTo(OldInst, NewDest);
- // For targets that use the register scavenger, we must maintain LiveIns.
- MaintainLiveIns(CurMBB, NewDest);
+ computeLiveIns(*NewDest);
++NumTailMerge;
}
@@ -445,16 +459,22 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
// Splice the code over.
NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+ // NewMBB belongs to the same loop as CurMBB.
+ if (MLI)
+ if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
+ ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
+
// NewMBB inherits CurMBB's block frequency.
MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
- // For targets that use the register scavenger, we must maintain LiveIns.
- MaintainLiveIns(&CurMBB, NewMBB);
+ computeLiveIns(*NewMBB);
// Add the new block to the funclet.
const auto &FuncletI = FuncletMembership.find(&CurMBB);
- if (FuncletI != FuncletMembership.end())
- FuncletMembership[NewMBB] = FuncletI->second;
+ if (FuncletI != FuncletMembership.end()) {
+ auto n = FuncletI->second;
+ FuncletMembership[NewMBB] = n;
+ }
return NewMBB;
}
@@ -488,8 +508,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
DebugLoc dl; // FIXME: this is nowhere
- if (I != MF->end() &&
- !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+ if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
MachineBasicBlock *NextBB = &*I;
if (TBB == NextBB && !Cond.empty() && !FBB) {
if (!TII->ReverseBranchCondition(Cond)) {
@@ -537,6 +556,18 @@ void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
MergedBBFreq[MBB] = F;
}
+raw_ostream &
+BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
+}
+
+raw_ostream &
+BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI.printBlockFreq(OS, Freq);
+}
+
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
@@ -745,11 +776,9 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
}
static void
-removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos,
- MachineBasicBlock &MBBCommon) {
- // Remove MMOs from memory operations in the common block
- // when they do not match the ones from the block being tail-merged.
- // This ensures later passes conservatively compute dependencies.
+mergeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos,
+ MachineBasicBlock &MBBCommon) {
+ // Merge MMOs from memory operations in the common block.
MachineBasicBlock *MBB = MBBIStartPos->getParent();
// Note that CommonTailLen does not necessarily match the size of
// the common BB nor all its instructions because of debug
@@ -777,7 +806,7 @@ removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos,
assert(MBBICommon != MBBIECommon &&
"Reached BB end within common tail length!");
- assert(MBBICommon->isIdenticalTo(&*MBBI) && "Expected matching MIIs!");
+ assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");
if (MBBICommon->mayLoad() || MBBICommon->mayStore())
MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI));
@@ -787,14 +816,13 @@ removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos,
}
}
-// See if any of the blocks in MergePotentials (which all have a common single
-// successor, or all have no successor) can be tail-merged. If there is a
-// successor, any blocks in MergePotentials that are not tail-merged and
-// are not immediately before Succ must have an unconditional branch to
-// Succ added (but the predecessor/successor lists need no adjustment).
-// The lone predecessor of Succ that falls through into Succ,
+// See if any of the blocks in MergePotentials (which all have SuccBB as a
+// successor, or all have no successor if it is null) can be tail-merged.
+// If there is a successor, any blocks in MergePotentials that are not
+// tail-merged and are not immediately before Succ must have an unconditional
+// branch to Succ added (but the predecessor/successor lists need no
+// adjustment). The lone predecessor of Succ that falls through into Succ,
// if any, is given in PredBB.
-
bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB) {
bool MadeChange = false;
@@ -888,7 +916,7 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
- // Recompute commont tail MBB's edge weights and block frequency.
+ // Recompute common tail MBB's edge weights and block frequency.
setCommonTailEdgeWeights(*MBB);
// MBB is common tail. Adjust all other BB's to jump to this one.
@@ -900,8 +928,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
continue;
DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
<< (i == e-1 ? "" : ", "));
- // Remove MMOs from memory operations as needed.
- removeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB);
+ // Merge MMOs from memory operations as needed.
+ mergeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB);
// Hack the end off BB i, making it jump to BB commonTailIndex instead.
ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
// BB i is no longer a predecessor of SuccBB; remove it from the worklist.
@@ -920,23 +948,27 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
if (!EnableTailMerge) return MadeChange;
// First find blocks with no successors.
- MergePotentials.clear();
- for (MachineBasicBlock &MBB : MF) {
- if (MergePotentials.size() == TailMergeThreshold)
- break;
- if (!TriedMerging.count(&MBB) && MBB.succ_empty())
- MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(&MBB), &MBB));
- }
+ // Block placement does not create new tail merging opportunities for these
+ // blocks.
+ if (!AfterBlockPlacement) {
+ MergePotentials.clear();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MergePotentials.size() == TailMergeThreshold)
+ break;
+ if (!TriedMerging.count(&MBB) && MBB.succ_empty())
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
+ }
- // If this is a large problem, avoid visiting the same basic blocks
- // multiple times.
- if (MergePotentials.size() == TailMergeThreshold)
- for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
- TriedMerging.insert(MergePotentials[i].getBlock());
+ // If this is a large problem, avoid visiting the same basic blocks
+ // multiple times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+ TriedMerging.insert(MergePotentials[i].getBlock());
- // See if we can do any tail merging on those.
- if (MergePotentials.size() >= 2)
- MadeChange |= TryTailMergeBlocks(nullptr, nullptr);
+ // See if we can do any tail merging on those.
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(nullptr, nullptr);
+ }
// Look at blocks (IBB) with multiple predecessors (PBB).
// We change each predecessor to a canonical form, by
@@ -983,9 +1015,20 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
if (PBB->hasEHPadSuccessor())
continue;
+ // Bail out if the loop header (IBB) is not the top of the loop chain
+ // after block placement. Otherwise, the common tail of IBB's
+ // predecessors may become the loop top if block placement is called again
+ // and the predecessors may branch to this common tail.
+ // FIXME: Relax this check if the algorithm for finding the loop top is
+ // changed in MBP.
+ if (AfterBlockPlacement && MLI)
+ if (MachineLoop *ML = MLI->getLoopFor(IBB))
+ if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB))
+ continue;
+
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+ if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
// Failing case: IBB is the target of a cbr, and we cannot reverse the
// branch.
SmallVector<MachineOperand, 4> NewCond(Cond);
@@ -1033,7 +1076,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
NewCond, dl);
}
- MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), PBB));
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
}
}
@@ -1211,7 +1254,7 @@ ReoptimizeBlock:
// where a BB jumps to more than one landing pad.
// TODO: Is it ever worth rewriting predecessors which don't already
// jump to a landing pad, and so can safely jump to the fallthrough?
- } else {
+ } else if (MBB->isSuccessor(&*FallThrough)) {
// Rewrite all predecessors of the old block to go to the fallthrough
// instead.
while (!MBB->pred_empty()) {
@@ -1234,7 +1277,7 @@ ReoptimizeBlock:
MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
SmallVector<MachineOperand, 4> PriorCond;
bool PriorUnAnalyzable =
- TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+ TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
if (!PriorUnAnalyzable) {
// If the CFG for the prior block has extra edges, remove them.
MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
@@ -1275,11 +1318,11 @@ ReoptimizeBlock:
// DBG_VALUE at the beginning of MBB.
while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
&& PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
- if (!MBBIter->isIdenticalTo(PrevBBIter))
+ if (!MBBIter->isIdenticalTo(*PrevBBIter))
break;
- MachineInstr *DuplicateDbg = MBBIter;
+ MachineInstr &DuplicateDbg = *MBBIter;
++MBBIter; -- PrevBBIter;
- DuplicateDbg->eraseFromParent();
+ DuplicateDbg.eraseFromParent();
}
}
PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
@@ -1371,7 +1414,8 @@ ReoptimizeBlock:
// Analyze the branch in the current block.
MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
SmallVector<MachineOperand, 4> CurCond;
- bool CurUnAnalyzable= TII->AnalyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+ bool CurUnAnalyzable =
+ TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
if (!CurUnAnalyzable) {
// If the CFG for the prior block has extra edges, remove them.
MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
@@ -1455,8 +1499,8 @@ ReoptimizeBlock:
// change this to an unconditional branch (and fix the CFG).
MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr;
SmallVector<MachineOperand, 4> NewCurCond;
- bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB,
- NewCurFBB, NewCurCond, true);
+ bool NewCurUnAnalyzable = TII->analyzeBranch(
+ *PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
DebugLoc pdl = getBranchDebugLoc(*PMBB);
TII->RemoveBranch(*PMBB);
@@ -1502,9 +1546,9 @@ ReoptimizeBlock:
MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
if (PredBB != MBB && !PredBB->canFallThrough() &&
- !TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)
- && (!CurFallsThru || !CurTBB || !CurFBB)
- && (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+ !TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
+ (!CurFallsThru || !CurTBB || !CurFBB) &&
+ (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
// If the current block doesn't fall through, just move it.
// If the current block can fall through and does not end with a
// conditional branch, we need to append an unconditional jump to
@@ -1560,7 +1604,7 @@ ReoptimizeBlock:
// Now check to see if the current block is sitting between PrevBB and
// a block to which it could fall through.
if (FallThrough != MF.end() &&
- !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
+ !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
PrevBB.isSuccessor(&*FallThrough)) {
MBB->moveAfter(&MF.back());
MadeChange = true;
@@ -1623,7 +1667,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
SmallSet<unsigned,4> &Uses,
SmallSet<unsigned,4> &Defs) {
MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
- if (!TII->isUnpredicatedTerminator(Loc))
+ if (!TII->isUnpredicatedTerminator(*Loc))
return MBB->end();
for (const MachineOperand &MO : Loc->operands()) {
@@ -1685,7 +1729,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
// Also avoid moving code above predicated instruction since it's hard to
// reason about register liveness with predicated instruction.
bool DontMoveAcrossStore = true;
- if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(PI))
+ if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
return MBB->end();
@@ -1719,7 +1763,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+ if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
return false;
if (!FBB) FBB = findFalseBlock(MBB, TBB);
@@ -1762,10 +1806,10 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (FIB == FIE)
break;
}
- if (!TIB->isIdenticalTo(FIB, MachineInstr::CheckKillDead))
+ if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
break;
- if (TII->isPredicated(TIB))
+ if (TII->isPredicated(*TIB))
// Hard to reason about register liveness with predicated instruction.
break;
@@ -1844,7 +1888,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
unsigned Reg = MO.getReg();
- if (!Reg)
+ if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
continue;
LocalDefs.push_back(Reg);
addRegAndItsAliases(Reg, TRI, LocalDefsSet);
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index d759d53e27f2..36a5a2e2c97c 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_CODEGEN_BRANCHFOLDING_H
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/BlockFrequency.h"
#include <vector>
@@ -20,20 +21,23 @@ namespace llvm {
class MachineBranchProbabilityInfo;
class MachineFunction;
class MachineModuleInfo;
- class RegScavenger;
+ class MachineLoopInfo;
class TargetInstrInfo;
class TargetRegisterInfo;
class LLVM_LIBRARY_VISIBILITY BranchFolder {
public:
+ class MBFIWrapper;
+
explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
- const MachineBlockFrequencyInfo &MBFI,
+ MBFIWrapper &MBFI,
const MachineBranchProbabilityInfo &MBPI);
- bool OptimizeFunction(MachineFunction &MF,
- const TargetInstrInfo *tii,
- const TargetRegisterInfo *tri,
- MachineModuleInfo *mmi);
+ bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
+ MachineLoopInfo *mli = nullptr,
+ bool AfterPlacement = false);
+
private:
class MergePotentialsElt {
unsigned Hash;
@@ -91,13 +95,17 @@ namespace llvm {
};
std::vector<SameTailElt> SameTails;
+ bool AfterBlockPlacement;
bool EnableTailMerge;
bool EnableHoistCommonCode;
+ bool UpdateLiveIns;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineModuleInfo *MMI;
- RegScavenger *RS;
+ MachineLoopInfo *MLI;
+ LivePhysRegs LiveRegs;
+ public:
/// \brief This class keeps track of branch frequencies of newly created
/// blocks and tail-merged blocks.
class MBFIWrapper {
@@ -105,21 +113,25 @@ namespace llvm {
MBFIWrapper(const MachineBlockFrequencyInfo &I) : MBFI(I) {}
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const;
void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F);
+ raw_ostream &printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const;
+ raw_ostream &printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const;
private:
const MachineBlockFrequencyInfo &MBFI;
DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq;
};
- MBFIWrapper MBBFreqInfo;
+ private:
+ MBFIWrapper &MBBFreqInfo;
const MachineBranchProbabilityInfo &MBPI;
bool TailMergeBlocks(MachineFunction &MF);
bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
MachineBasicBlock* PredBB);
void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
- void MaintainLiveIns(MachineBasicBlock *CurMBB,
- MachineBasicBlock *NewMBB);
+ void computeLiveIns(MachineBasicBlock &MBB);
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest);
MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
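
For reference, the new OptimizeFunction signature above is meant to be driven from a pass that re-runs branch folding after block placement. A hedged sketch of such a call site follows; the surrounding pass boilerplate and analysis wiring are assumed rather than shown in this patch, so treat it as illustrative, not compilable as-is:

// Inside some MachineFunctionPass::runOnMachineFunction(MachineFunction &MF),
// with an MBFIWrapper MBBFreqInfo and a MachineBranchProbabilityInfo MBPI
// already obtained from the pass's analyses.
BranchFolder BF(/*defaultEnableTailMerge=*/true, /*CommonHoist=*/false,
                MBBFreqInfo, MBPI);
BF.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
                    MF.getSubtarget().getRegisterInfo(),
                    getAnalysisIfAvailable<MachineModuleInfo>(),
                    &getAnalysis<MachineLoopInfo>(),
                    /*AfterPlacement=*/true);
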
diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp
new file mode 100644
index 000000000000..ff7c99de0420
--- /dev/null
+++ b/lib/CodeGen/BuiltinGCs.cpp
@@ -0,0 +1,139 @@
+//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the boilerplate required to define our various built in
+// gc lowering strategies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCStrategy.h"
+
+using namespace llvm;
+
+namespace {
+
+/// An example GC which attempts to be compatible with the Erlang/OTP garbage
+/// collector.
+///
+/// The frametable emitter is in ErlangGCPrinter.cpp.
+class ErlangGC : public GCStrategy {
+public:
+ ErlangGC() {
+ InitRoots = false;
+ NeededSafePoints = 1 << GC::PostCall;
+ UsesMetadata = true;
+ CustomRoots = false;
+ }
+};
+
+/// An example GC which attempts to be compatible with Objective Caml 3.10.0
+///
+/// The frametable emitter is in OcamlGCPrinter.cpp.
+class OcamlGC : public GCStrategy {
+public:
+ OcamlGC() {
+ NeededSafePoints = 1 << GC::PostCall;
+ UsesMetadata = true;
+ }
+};
+
+/// A GC strategy for uncooperative targets. This implements lowering for the
+/// llvm.gc* intrinsics for targets that do not natively support them (which
+/// includes the C backend). Note that the code generated is not quite as
+/// efficient as algorithms which generate stack maps to identify roots.
+///
+/// In order to support this particular transformation, all stack roots are
+/// co-allocated on the stack. This allows a fully target-independent stack map
+/// while introducing only minor runtime overhead.
+class ShadowStackGC : public GCStrategy {
+public:
+ ShadowStackGC() {
+ InitRoots = true;
+ CustomRoots = true;
+ }
+};
+
+/// A GCStrategy which serves as an example for the usage of a statepoint based
+/// lowering strategy. This GCStrategy is intended to be suitable as a default
+/// implementation usable with any collector which can consume the standard
+/// stackmap format generated by statepoints, uses the default address space to
+/// distinguish between gc managed and non-gc managed pointers, and has
+/// reasonable relocation semantics.
+class StatepointGC : public GCStrategy {
+public:
+ StatepointGC() {
+ UseStatepoints = true;
+ // These options are all gc.root specific; we specify them so that the
+ // gc.root lowering code doesn't run.
+ InitRoots = false;
+ NeededSafePoints = 0;
+ UsesMetadata = false;
+ CustomRoots = false;
+ }
+ Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+ // Method is only valid on pointer typed values.
+ const PointerType *PT = cast<PointerType>(Ty);
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does. Note that addrspace(1) is used
+ // only as an example, it has no special meaning, and is not reserved for
+ // GC usage.
+ return (1 == PT->getAddressSpace());
+ }
+};
+
+/// A GCStrategy for the CoreCLR Runtime. The strategy is similar to
+/// Statepoint-example GC, but differs from it in certain aspects, such as:
+/// 1) Base-pointers need not be explicitly tracked and reported for
+/// interior pointers
+/// 2) Uses a different format for encoding stack-maps
+/// 3) Location of Safe-point polls: polls are only needed before loop-back
+/// edges and before tail-calls (not needed at function-entry)
+///
+/// The above differences in behavior are to be implemented in upcoming
+/// checkins.
+class CoreCLRGC : public GCStrategy {
+public:
+ CoreCLRGC() {
+ UseStatepoints = true;
+ // These options are all gc.root specific; we specify them so that the
+ // gc.root lowering code doesn't run.
+ InitRoots = false;
+ NeededSafePoints = 0;
+ UsesMetadata = false;
+ CustomRoots = false;
+ }
+ Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+ // Method is only valid on pointer typed values.
+ const PointerType *PT = cast<PointerType>(Ty);
+ // We pick addrspace(1) as our GC managed heap.
+ return (1 == PT->getAddressSpace());
+ }
+};
+}
+
+// Register all the above so that they can be found at runtime. Note that
+// these static initializers are important since the registration list is
+// constructed from their storage.
+static GCRegistry::Add<ErlangGC> A("erlang",
+ "erlang-compatible garbage collector");
+static GCRegistry::Add<OcamlGC> B("ocaml", "ocaml 3.10-compatible GC");
+static GCRegistry::Add<ShadowStackGC>
+ C("shadow-stack", "Very portable GC for uncooperative code generators");
+static GCRegistry::Add<StatepointGC> D("statepoint-example",
+ "an example strategy for statepoint");
+static GCRegistry::Add<CoreCLRGC> E("coreclr", "CoreCLR-compatible GC");
+
+// Provide hooks to ensure the containing library is fully loaded.
+void llvm::linkErlangGC() {}
+void llvm::linkOcamlGC() {}
+void llvm::linkShadowStackGC() {}
+void llvm::linkStatepointExampleGC() {}
+void llvm::linkCoreCLRGC() {}
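
The pattern the new file uses for its built-in collectors also works for an out-of-tree strategy: subclass GCStrategy, set the flags in the constructor, and register it by name through a static GCRegistry::Add. A minimal sketch along the same lines, assuming the same include set as BuiltinGCs.cpp; the strategy name "my-gc" and its flag choices are made up for illustration:

#include "llvm/CodeGen/GCStrategy.h"

using namespace llvm;

namespace {
// A trivial collector description that relies on the default gc.root
// lowering: roots are zero-initialized and safe points are placed after calls.
class MyGC : public GCStrategy {
public:
  MyGC() {
    InitRoots = true;
    NeededSafePoints = 1 << GC::PostCall;
    UsesMetadata = false;
    CustomRoots = false;
  }
};
} // end anonymous namespace

// The static initializer makes the strategy findable by name, exactly like
// the registrations at the end of BuiltinGCs.cpp above.
static GCRegistry::Add<MyGC> X("my-gc", "illustrative example collector");
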
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index a078c3c707a0..d103a68c60d1 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -1,8 +1,3 @@
-set(system_libs)
-if(CMAKE_HOST_UNIX AND LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD)
- set(system_libs ${system_libs} pthread)
-endif()
-
add_llvm_library(LLVMCodeGen
AggressiveAntiDepBreaker.cpp
AllocationOrder.cpp
@@ -10,22 +5,21 @@ add_llvm_library(LLVMCodeGen
AtomicExpandPass.cpp
BasicTargetTransformInfo.cpp
BranchFolding.cpp
+ BuiltinGCs.cpp
CalcSpillWeights.cpp
CallingConvLower.cpp
CodeGen.cpp
CodeGenPrepare.cpp
- CoreCLRGC.cpp
CriticalAntiDepBreaker.cpp
- DFAPacketizer.cpp
DeadMachineInstructionElim.cpp
+ DetectDeadLanes.cpp
+ DFAPacketizer.cpp
DwarfEHPrepare.cpp
EarlyIfConversion.cpp
EdgeBundles.cpp
- ErlangGC.cpp
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
ExpandPostRAPseudos.cpp
- LiveDebugValues.cpp
FaultMaps.cpp
FuncletLayout.cpp
GCMetadata.cpp
@@ -39,58 +33,61 @@ add_llvm_library(LLVMCodeGen
InterferenceCache.cpp
InterleavedAccessPass.cpp
IntrinsicLowering.cpp
- LLVMTargetMachine.cpp
LatencyPriorityQueue.cpp
LexicalScopes.cpp
+ LiveDebugValues.cpp
LiveDebugVariables.cpp
- LiveInterval.cpp
LiveIntervalAnalysis.cpp
+ LiveInterval.cpp
LiveIntervalUnion.cpp
+ LivePhysRegs.cpp
LiveRangeCalc.cpp
LiveRangeEdit.cpp
LiveRegMatrix.cpp
- LivePhysRegs.cpp
LiveStackAnalysis.cpp
LiveVariables.cpp
+ LLVMTargetMachine.cpp
LocalStackSlotAllocation.cpp
+ LowerEmuTLS.cpp
MachineBasicBlock.cpp
MachineBlockFrequencyInfo.cpp
MachineBlockPlacement.cpp
MachineBranchProbabilityInfo.cpp
- MachineCSE.cpp
MachineCombiner.cpp
MachineCopyPropagation.cpp
- MachineDominators.cpp
+ MachineCSE.cpp
MachineDominanceFrontier.cpp
- MachineFunction.cpp
+ MachineDominators.cpp
MachineFunctionAnalysis.cpp
+ MachineFunction.cpp
MachineFunctionPass.cpp
MachineFunctionPrinterPass.cpp
- MachineInstr.cpp
MachineInstrBundle.cpp
+ MachineInstr.cpp
MachineLICM.cpp
MachineLoopInfo.cpp
MachineModuleInfo.cpp
MachineModuleInfoImpls.cpp
MachinePassRegistry.cpp
MachinePostDominators.cpp
- MachineRegisterInfo.cpp
MachineRegionInfo.cpp
- MachineSSAUpdater.cpp
+ MachineRegisterInfo.cpp
MachineScheduler.cpp
MachineSink.cpp
+ MachineSSAUpdater.cpp
MachineTraceMetrics.cpp
MachineVerifier.cpp
+ PatchableFunction.cpp
MIRPrinter.cpp
MIRPrintingPass.cpp
- OcamlGC.cpp
OptimizePHIs.cpp
- PHIElimination.cpp
- PHIEliminationUtils.cpp
ParallelCG.cpp
- Passes.cpp
PeepholeOptimizer.cpp
+ PHIElimination.cpp
+ PHIEliminationUtils.cpp
+ PostRAHazardRecognizer.cpp
PostRASchedulerList.cpp
+ PreISelIntrinsicLowering.cpp
ProcessImplicitDefs.cpp
PrologEpilogInserter.cpp
PseudoSourceValue.cpp
@@ -103,41 +100,49 @@ add_llvm_library(LLVMCodeGen
RegisterCoalescer.cpp
RegisterPressure.cpp
RegisterScavenging.cpp
+ RenameIndependentSubregs.cpp
+ RegisterUsageInfo.cpp
+ RegUsageInfoCollector.cpp
+ RegUsageInfoPropagate.cpp
+ SafeStack.cpp
+ SafeStackColoring.cpp
+ SafeStackLayout.cpp
ScheduleDAG.cpp
ScheduleDAGInstrs.cpp
ScheduleDAGPrinter.cpp
ScoreboardHazardRecognizer.cpp
- ShrinkWrap.cpp
- ShadowStackGC.cpp
ShadowStackGCLowering.cpp
+ ShrinkWrap.cpp
SjLjEHPrepare.cpp
SlotIndexes.cpp
SpillPlacement.cpp
SplitKit.cpp
StackColoring.cpp
- StackProtector.cpp
- StackSlotColoring.cpp
StackMapLivenessAnalysis.cpp
StackMaps.cpp
- StatepointExampleGC.cpp
+ StackProtector.cpp
+ StackSlotColoring.cpp
TailDuplication.cpp
+ TailDuplicator.cpp
TargetFrameLoweringImpl.cpp
TargetInstrInfo.cpp
TargetLoweringBase.cpp
TargetLoweringObjectFileImpl.cpp
TargetOptionsImpl.cpp
+ TargetPassConfig.cpp
TargetRegisterInfo.cpp
TargetSchedule.cpp
TwoAddressInstructionPass.cpp
UnreachableBlockElim.cpp
VirtRegMap.cpp
WinEHPrepare.cpp
+ XRayInstrumentation.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen
${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP
- LINK_LIBS ${system_libs}
+ LINK_LIBS ${PTHREAD_LIB}
)
add_dependencies(LLVMCodeGen intrinsics_gen)
@@ -145,3 +150,4 @@ add_dependencies(LLVMCodeGen intrinsics_gen)
add_subdirectory(SelectionDAG)
add_subdirectory(AsmPrinter)
add_subdirectory(MIRParser)
+add_subdirectory(GlobalISel)
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 26aa46fb6c2a..dc2d38a95f99 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -121,7 +121,7 @@ static bool isRematerializable(const LiveInterval &LI,
}
}
- if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis()))
+ if (!TII.isTriviallyReMaterializable(*MI, LIS.getAliasAnalysis()))
return false;
}
return true;
@@ -170,8 +170,7 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
// Calculate instr weight.
bool reads, writes;
std::tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
- weight = LiveIntervals::getSpillWeight(
- writes, reads, &MBFI, mi);
+ weight = LiveIntervals::getSpillWeight(writes, reads, &MBFI, *mi);
// Give extra weight to what looks like a loop induction variable update.
if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb))
@@ -192,11 +191,15 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
// FIXME: we probably shouldn't use floats at all.
volatile float hweight = Hint[hint] += weight;
if (TargetRegisterInfo::isPhysicalRegister(hint)) {
- if (hweight > bestPhys && mri.isAllocatable(hint))
- bestPhys = hweight, hintPhys = hint;
+ if (hweight > bestPhys && mri.isAllocatable(hint)) {
+ bestPhys = hweight;
+ hintPhys = hint;
+ }
} else {
- if (hweight > bestVirt)
- bestVirt = hweight, hintVirt = hint;
+ if (hweight > bestVirt) {
+ bestVirt = hweight;
+ hintVirt = hint;
+ }
}
}
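
The copy-hint bookkeeping rewritten above just accumulates a weight per candidate hint register and keeps the heaviest one. A self-contained sketch of that accumulation, without the physical/virtual split and allocatability check of the real code:

#include <map>
#include <utility>
#include <vector>

// Each use contributes (hint register, weight). Sum the weight per hint and
// return the hint with the largest total; 0 stands for "no hint found".
static unsigned pickBestHint(
    const std::vector<std::pair<unsigned, float>> &HintedUses) {
  std::map<unsigned, float> Total;
  unsigned BestReg = 0;
  float BestWeight = 0.0f;
  for (const auto &HU : HintedUses) {
    float W = (Total[HU.first] += HU.second);
    if (W > BestWeight) {
      BestWeight = W;
      BestReg = HU.first;
    }
  }
  return BestReg;
}
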
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 23c0d542560e..7d67bcfe5469 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -51,9 +51,9 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
Size = MinSize;
if (MinAlign > (int)Align)
Align = MinAlign;
- MF.getFrameInfo()->ensureMaxAlignment(Align);
+ ensureMaxAlignment(Align);
MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align);
- Size = unsigned(RoundUpToAlignment(Size, MinAlign));
+ Size = unsigned(alignTo(Size, MinAlign));
unsigned Offset = AllocateStack(Size, Align);
addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
}
@@ -236,6 +236,7 @@ void CCState::analyzeMustTailForwardedRegisters(
// variadic functions, so we need to assume we're not variadic so that we get
// all the registers that might be used in a non-variadic call.
SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+ SaveAndRestore<bool> SavedMustTail(AnalyzingMustTailForwardedRegs, true);
for (MVT RegVT : RegParmTypes) {
SmallVector<MCPhysReg, 8> RemainingRegs;
@@ -248,3 +249,39 @@ void CCState::analyzeMustTailForwardedRegisters(
}
}
}
+
+bool CCState::resultsCompatible(CallingConv::ID CalleeCC,
+ CallingConv::ID CallerCC, MachineFunction &MF,
+ LLVMContext &C,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn CalleeFn, CCAssignFn CallerFn) {
+ if (CalleeCC == CallerCC)
+ return true;
+ SmallVector<CCValAssign, 4> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, MF, RVLocs1, C);
+ CCInfo1.AnalyzeCallResult(Ins, CalleeFn);
+
+ SmallVector<CCValAssign, 4> RVLocs2;
+ CCState CCInfo2(CallerCC, false, MF, RVLocs2, C);
+ CCInfo2.AnalyzeCallResult(Ins, CallerFn);
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned I = 0, E = RVLocs1.size(); I != E; ++I) {
+ const CCValAssign &Loc1 = RVLocs1[I];
+ const CCValAssign &Loc2 = RVLocs2[I];
+ if (Loc1.getLocInfo() != Loc2.getLocInfo())
+ return false;
+ bool RegLoc1 = Loc1.isRegLoc();
+ if (RegLoc1 != Loc2.isRegLoc())
+ return false;
+ if (RegLoc1) {
+ if (Loc1.getLocReg() != Loc2.getLocReg())
+ return false;
+ } else {
+ if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
+ return false;
+ }
+ }
+ return true;
+}
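
The new CCState::resultsCompatible above boils down to an element-wise comparison of the return-value locations produced under the two calling conventions. The shape of that check, stripped of the LLVM types (the Loc struct below is an illustrative stand-in for CCValAssign):

#include <cstddef>
#include <cstdint>
#include <vector>

struct Loc {
  bool IsReg;         // value lives in a register (vs. a stack slot)
  unsigned Reg;       // valid if IsReg
  uint64_t MemOffset; // valid if !IsReg
  int LocInfo;        // how the value is extended/converted
};

// Two result-location lists are compatible only if they have the same length
// and agree position by position on kind, conversion and physical placement.
static bool resultsCompatibleSketch(const std::vector<Loc> &A,
                                    const std::vector<Loc> &B) {
  if (A.size() != B.size())
    return false;
  for (std::size_t I = 0, E = A.size(); I != E; ++I) {
    if (A[I].LocInfo != B[I].LocInfo || A[I].IsReg != B[I].IsReg)
      return false;
    if (A[I].IsReg ? A[I].Reg != B[I].Reg
                   : A[I].MemOffset != B[I].MemOffset)
      return false;
  }
  return true;
}
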
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index dc13b5b11d30..6679819fdef6 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -24,6 +24,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeBranchFolderPassPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
+ initializeDetectDeadLanesPass(Registry);
initializeDwarfEHPreparePass(Registry);
initializeEarlyIfConverterPass(Registry);
initializeExpandISelPseudosPass(Registry);
@@ -33,6 +34,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
initializeIfConverterPass(Registry);
+ initializeInterleavedAccessPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
initializeLiveStacksPass(Registry);
@@ -55,26 +57,32 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineSchedulerPass(Registry);
initializeMachineSinkingPass(Registry);
initializeMachineVerifierPassPass(Registry);
+ initializeXRayInstrumentationPass(Registry);
+ initializePatchableFunctionPass(Registry);
initializeOptimizePHIsPass(Registry);
initializePEIPass(Registry);
initializePHIEliminationPass(Registry);
initializePeepholeOptimizerPass(Registry);
initializePostMachineSchedulerPass(Registry);
+ initializePostRAHazardRecognizerPass(Registry);
initializePostRASchedulerPass(Registry);
+ initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
initializeProcessImplicitDefsPass(Registry);
initializeRegisterCoalescerPass(Registry);
+ initializeRenameIndependentSubregsPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
initializeStackMapLivenessPass(Registry);
initializeLiveDebugValuesPass(Registry);
+ initializeSafeStackPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
initializeTailDuplicatePassPass(Registry);
initializeTargetPassConfigPass(Registry);
initializeTwoAddressInstructionPassPass(Registry);
initializeUnpackMachineBundlesPass(Registry);
- initializeUnreachableBlockElimPass(Registry);
+ initializeUnreachableBlockElimLegacyPassPass(Registry);
initializeUnreachableMachineBlockElimPass(Registry);
initializeVirtRegMapPass(Registry);
initializeVirtRegRewriterPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index c8007a524e70..ede404149a1c 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -18,9 +18,11 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -38,6 +40,7 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -111,6 +114,10 @@ static cl::opt<bool> StressExtLdPromotion(
cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
"optimization in CodeGenPrepare"));
+static cl::opt<bool> DisablePreheaderProtect(
+ "disable-preheader-prot", cl::Hidden, cl::init(false),
+ cl::desc("Disable protection against removing loop preheaders"));
+
namespace {
typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -122,6 +129,7 @@ class TypePromotionTransaction;
const TargetLowering *TLI;
const TargetTransformInfo *TTI;
const TargetLibraryInfo *TLInfo;
+ const LoopInfo *LI;
/// As we scan instructions optimizing them, this is the next instruction
/// to optimize. Transforms that can invalidate this should update it.
@@ -158,9 +166,10 @@ class TypePromotionTransaction;
const char *getPassName() const override { return "CodeGen Prepare"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
+ // FIXME: When we can selectively preserve passes, preserve the domtree.
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
}
private:
@@ -203,7 +212,7 @@ FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
}
bool CodeGenPrepare::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
DL = &F.getParent()->getDataLayout();
@@ -218,6 +227,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLI = TM->getSubtargetImpl(F)->getTargetLowering();
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
OptSize = F.optForSize();
/// This optimization identifies DIV instructions that can be
@@ -359,6 +369,15 @@ bool CodeGenPrepare::eliminateFallThrough(Function &F) {
/// edges in ways that are non-optimal for isel. Start by eliminating these
/// blocks so we can split them the way we want them.
bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
+ SmallPtrSet<BasicBlock *, 16> Preheaders;
+ SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
+ while (!LoopList.empty()) {
+ Loop *L = LoopList.pop_back_val();
+ LoopList.insert(LoopList.end(), L->begin(), L->end());
+ if (BasicBlock *Preheader = L->getLoopPreheader())
+ Preheaders.insert(Preheader);
+ }
+
bool MadeChange = false;
// Note that this intentionally skips the entry block.
for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
@@ -391,6 +410,14 @@ bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
if (!canMergeBlocks(BB, DestBB))
continue;
+ // Do not delete loop preheaders if doing so would create a critical edge.
+ // Loop preheaders can be good locations to spill registers. If the
+ // preheader is deleted and we create a critical edge, registers may be
+ // spilled in the loop body instead.
+ if (!DisablePreheaderProtect && Preheaders.count(BB) &&
+ !(BB->getSinglePredecessor() &&
+ BB->getSinglePredecessor()->getSingleSuccessor()))
+ continue;
+
eliminateMostlyEmptyBlock(BB);
MadeChange = true;
}
@@ -612,7 +639,8 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
continue;
// Create a Builder and replace the target callsite with a gep
- assert(RelocatedBase->getNextNode() && "Should always have one since it's not a terminator");
+ assert(RelocatedBase->getNextNode() &&
+ "Should always have one since it's not a terminator");
// Insert after RelocatedBase
IRBuilder<> Builder(RelocatedBase->getNextNode());
@@ -730,6 +758,11 @@ static bool SinkCast(CastInst *CI) {
// Preincrement use iterator so we don't invalidate it.
++UI;
+ // The first insertion point of a block containing an EH pad is after the
+ // pad. If the pad is the user, we cannot sink the cast past the pad.
+ if (User->isEHPad())
+ continue;
+
// If the block selected to receive the cast is an EH pad that does not
// allow non-PHI instructions before the terminator, we can't sink the
// cast.
@@ -854,10 +887,14 @@ static bool CombineUAddWithOverflow(CmpInst *CI) {
/// lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
-static bool SinkCmpExpression(CmpInst *CI) {
+static bool SinkCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
BasicBlock *DefBB = CI->getParent();
- /// Only insert a cmp in each block once.
+ // Avoid sinking soft-FP comparisons, since this can move them into a loop.
+ if (TLI && TLI->useSoftFloat() && isa<FCmpInst>(CI))
+ return false;
+
+ // Only insert a cmp in each block once.
DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
bool MadeChange = false;
@@ -905,8 +942,8 @@ static bool SinkCmpExpression(CmpInst *CI) {
return MadeChange;
}
-static bool OptimizeCmpExpression(CmpInst *CI) {
- if (SinkCmpExpression(CI))
+static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
+ if (SinkCmpExpression(CI, TLI))
return true;
if (CombineUAddWithOverflow(CI))
@@ -1138,7 +1175,7 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
// %13 = icmp eq i1 %12, true
// br i1 %13, label %cond.load4, label %else5
//
-static void ScalarizeMaskedLoad(CallInst *CI) {
+static void scalarizeMaskedLoad(CallInst *CI) {
Value *Ptr = CI->getArgOperand(0);
Value *Alignment = CI->getArgOperand(1);
Value *Mask = CI->getArgOperand(2);
@@ -1284,7 +1321,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) {
// store i32 %8, i32* %9
// br label %else2
// . . .
-static void ScalarizeMaskedStore(CallInst *CI) {
+static void scalarizeMaskedStore(CallInst *CI) {
Value *Src = CI->getArgOperand(0);
Value *Ptr = CI->getArgOperand(1);
Value *Alignment = CI->getArgOperand(2);
@@ -1403,7 +1440,7 @@ static void ScalarizeMaskedStore(CallInst *CI) {
// . . .
// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
// ret <16 x i32> %Result
-static void ScalarizeMaskedGather(CallInst *CI) {
+static void scalarizeMaskedGather(CallInst *CI) {
Value *Ptrs = CI->getArgOperand(0);
Value *Alignment = CI->getArgOperand(1);
Value *Mask = CI->getArgOperand(2);
@@ -1538,7 +1575,7 @@ static void ScalarizeMaskedGather(CallInst *CI) {
// store i32 % Elt1, i32* % Ptr1, align 4
// br label %else2
// . . .
-static void ScalarizeMaskedScatter(CallInst *CI) {
+static void scalarizeMaskedScatter(CallInst *CI) {
Value *Src = CI->getArgOperand(0);
Value *Ptrs = CI->getArgOperand(1);
Value *Alignment = CI->getArgOperand(2);
@@ -1653,7 +1690,7 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
// Only handle legal scalar cases. Anything else requires too much work.
Type *Ty = CountZeros->getType();
unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
- if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize())
+ if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
return false;
// The intrinsic will be sunk behind a compare against zero and branch.
@@ -1743,8 +1780,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
// forbidden.
GlobalVariable *GV;
if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
- GV->getAlignment() < PrefAlign &&
- DL->getTypeAllocSize(GV->getType()->getElementType()) >=
+ GV->getPointerAlignment(*DL) < PrefAlign &&
+ DL->getTypeAllocSize(GV->getValueType()) >=
MinSize + Offset2)
GV->setAlignment(PrefAlign);
}
@@ -1759,27 +1796,47 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
}
+ // If we have a cold call site, try to sink addressing computation into the
+ // cold block. This interacts with our handling for loads and stores to
+ // ensure that we can fold a potential addressing computation into all of
+ // its uses. TODO: generalize this to work over profiling data.
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ for (auto &Arg : CI->arg_operands()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned AS = Arg->getType()->getPointerAddressSpace();
+ return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+ }
+
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::objectsize: {
// Lower all uses of llvm.objectsize.*
- bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+ uint64_t Size;
Type *ReturnTy = CI->getType();
- Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
+ Constant *RetVal = nullptr;
+ ConstantInt *Op1 = cast<ConstantInt>(II->getArgOperand(1));
+ ObjSizeMode Mode = Op1->isZero() ? ObjSizeMode::Max : ObjSizeMode::Min;
+ if (getObjectSize(II->getArgOperand(0),
+ Size, *DL, TLInfo, false, Mode)) {
+ RetVal = ConstantInt::get(ReturnTy, Size);
+ } else {
+ RetVal = ConstantInt::get(ReturnTy,
+ Mode == ObjSizeMode::Min ? 0 : -1ULL);
+ }
// Substituting this can cause recursive simplifications, which can
// invalidate our iterator. Use a WeakVH to hold onto it in case this
// happens.
- WeakVH IterHandle(&*CurInstIterator);
+ Value *CurValue = &*CurInstIterator;
+ WeakVH IterHandle(CurValue);
- replaceAndRecursivelySimplify(CI, RetVal,
- TLInfo, nullptr);
+ replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
// If the iterator instruction was recursively deleted, start over at the
// start of the block.
- if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
+ if (IterHandle != CurValue) {
CurInstIterator = BB->begin();
SunkAddrs.clear();
}
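
The llvm.objectsize change above prefers a statically computed size and only falls back to the old 0 / -1 sentinels when the size is unknown. The decision, reduced to a self-contained helper (the bool-plus-value interface stands in for getObjectSize and is illustrative only):

#include <cstdint>

enum class ObjSizeMode { Max, Min };

// Pick the value llvm.objectsize should lower to: the exact size when it is
// known, otherwise 0 for the "minimum" query and all-ones for "maximum".
static uint64_t lowerObjectSize(bool SizeKnown, uint64_t Size,
                                ObjSizeMode Mode) {
  if (SizeKnown)
    return Size;
  return Mode == ObjSizeMode::Min ? 0 : UINT64_MAX;
}
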
@@ -1788,7 +1845,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
case Intrinsic::masked_load: {
// Scalarize unsupported vector masked load
if (!TTI->isLegalMaskedLoad(CI->getType())) {
- ScalarizeMaskedLoad(CI);
+ scalarizeMaskedLoad(CI);
ModifiedDT = true;
return true;
}
@@ -1796,7 +1853,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
case Intrinsic::masked_store: {
if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
- ScalarizeMaskedStore(CI);
+ scalarizeMaskedStore(CI);
ModifiedDT = true;
return true;
}
@@ -1804,7 +1861,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
case Intrinsic::masked_gather: {
if (!TTI->isLegalMaskedGather(CI->getType())) {
- ScalarizeMaskedGather(CI);
+ scalarizeMaskedGather(CI);
ModifiedDT = true;
return true;
}
@@ -1812,7 +1869,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
case Intrinsic::masked_scatter: {
if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
- ScalarizeMaskedScatter(CI);
+ scalarizeMaskedScatter(CI);
ModifiedDT = true;
return true;
}
@@ -2076,7 +2133,7 @@ void ExtAddrMode::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ExtAddrMode::dump() const {
+LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
print(dbgs());
dbgs() << '\n';
}
@@ -3442,6 +3499,8 @@ static bool FindAllMemoryUses(
if (!MightBeFoldableInst(I))
return true;
+ const bool OptSize = I->getFunction()->optForSize();
+
// Loop over all the uses, recursively processing them.
for (Use &U : I->uses()) {
Instruction *UserI = cast<Instruction>(U.getUser());
@@ -3459,6 +3518,11 @@ static bool FindAllMemoryUses(
}
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst.
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ continue;
+
InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
if (!IA) return true;
@@ -3550,10 +3614,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
if (!BaseReg && !ScaledReg)
return true;
- // If all uses of this instruction are ultimately load/store/inlineasm's,
- // check to see if their addressing modes will include this instruction. If
- // so, we can fold it into all uses, so it doesn't matter if it has multiple
- // uses.
+ // If all uses of this instruction can have the address mode sunk into them,
+ // we can remove the addressing mode and effectively trade one live register
+ // for another (at worst.) In this context, folding an addressing mode into
+ // the use is just a particularly nice way of sinking it.
SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
@@ -3561,8 +3625,13 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// Now that we know that all uses of this instruction are part of a chain of
// computation involving only operations that could theoretically be folded
- // into a memory use, loop over each of these uses and see if they could
- // *actually* fold the instruction.
+ // into a memory use, loop over each of these memory operation uses and see
+ // if they could *actually* fold the instruction. The assumption is that
+ // addressing modes are cheap and that duplicating the computation involved
+ // many times is worthwhile, even on a fast path. For sinking candidates
+ // (i.e. cold call sites), this serves as a way to prevent excessive code
+ // growth since most architectures have some reasonably small and fast way to
+ // compute an effective address (e.g. LEA on x86).
SmallVector<Instruction*, 32> MatchedAddrModeInsts;
for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
Instruction *User = MemoryUses[i].first;
@@ -3616,6 +3685,11 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
return false;
}
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all-or-nothing
+/// decision for all uses of the same addressing computation.
+///
/// Load and Store Instructions often have addressing modes that can do
/// significant amounts of computation. As such, instruction selection will try
/// to get the load or store to do as much computation as possible for the
@@ -3623,7 +3697,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
/// such, we sink as much legal addressing mode work into the block as possible.
///
/// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above), and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace) {
Value *Repl = Addr;
@@ -3662,7 +3742,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
continue;
}
- // For non-PHIs, determine the addressing mode being computed.
+ // For non-PHIs, determine the addressing mode being computed. Note that
+ // the result may differ depending on what other uses our candidate
+ // addressing instructions might have.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
@@ -3945,12 +4027,13 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (Repl->use_empty()) {
// This can cause recursive deletion, which can invalidate our iterator.
// Use a WeakVH to hold onto it in case this happens.
- WeakVH IterHandle(&*CurInstIterator);
+ Value *CurValue = &*CurInstIterator;
+ WeakVH IterHandle(CurValue);
BasicBlock *BB = CurInstIterator->getParent();
RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
- if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
+ if (IterHandle != CurValue) {
// If the iterator instruction was recursively deleted, start over at the
// start of the block.
CurInstIterator = BB->begin();
@@ -4461,11 +4544,27 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
/// Returns true if a SelectInst should be turned into an explicit branch.
static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
+ const TargetLowering *TLI,
SelectInst *SI) {
+ // If even a predictable select is cheap, then a branch can't be cheaper.
+ if (!TLI->isPredictableSelectExpensive())
+ return false;
+
// FIXME: This should use the same heuristics as IfConversion to determine
- // whether a select is better represented as a branch. This requires that
- // branch probability metadata is preserved for the select, which is not the
- // case currently.
+ // whether a select is better represented as a branch.
+
+ // If metadata tells us that the select condition is obviously predictable,
+ // then we want to replace the select with a branch.
+ uint64_t TrueWeight, FalseWeight;
+ if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t Max = std::max(TrueWeight, FalseWeight);
+ uint64_t Sum = TrueWeight + FalseWeight;
+ if (Sum != 0) {
+ auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+ if (Probability > TLI->getPredictableBranchThreshold())
+ return true;
+ }
+ }
CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
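
The new profile check in isFormingBranchFromSelectProfitable reduces to: take the heavier branch weight, divide by the sum, and compare against the target's predictability threshold. As a worked, self-contained sketch (the 0.8 threshold is an arbitrary stand-in for TLI->getPredictableBranchThreshold()):

#include <algorithm>
#include <cstdint>

// Return true if the profile weights say the select condition is so skewed
// that a branch is likely to be well-predicted and therefore preferable.
static bool isPredictableEnough(uint64_t TrueWeight, uint64_t FalseWeight,
                                double Threshold = 0.8) {
  uint64_t Sum = TrueWeight + FalseWeight;
  if (Sum == 0)
    return false; // no usable profile data
  uint64_t Max = std::max(TrueWeight, FalseWeight);
  return static_cast<double>(Max) / static_cast<double>(Sum) > Threshold;
}

// Example: weights 980 vs 20 give 0.98 > 0.8, so prefer a branch;
// weights 55 vs 45 give 0.55, so keep the select.
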
@@ -4475,17 +4574,6 @@ static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
if (!Cmp || !Cmp->hasOneUse())
return false;
- Value *CmpOp0 = Cmp->getOperand(0);
- Value *CmpOp1 = Cmp->getOperand(1);
-
- // Emit "cmov on compare with a memory operand" as a branch to avoid stalls
- // on a load from memory. But if the load is used more than once, do not
- // change the select to a branch because the load is probably needed
- // regardless of whether the branch is taken or not.
- if ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
- (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()))
- return true;
-
// If either operand of the select is expensive and only needed on one side
// of the select, we should form a branch.
if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
@@ -4502,7 +4590,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
// Can we convert the 'select' to CF ?
- if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
+ if (DisableSelectToBranch || OptSize || !TLI || VectorCond ||
+ SI->getMetadata(LLVMContext::MD_unpredictable))
return false;
TargetLowering::SelectSupportKind SelectKind;
@@ -4513,14 +4602,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
else
SelectKind = TargetLowering::ScalarValSelect;
- // Do we have efficient codegen support for this kind of 'selects' ?
- if (TLI->isSelectSupported(SelectKind)) {
- // We have efficient codegen support for the select instruction.
- // Check if it is profitable to keep this 'select'.
- if (!TLI->isPredictableSelectExpensive() ||
- !isFormingBranchFromSelectProfitable(TTI, SI))
- return false;
- }
+ if (TLI->isSelectSupported(SelectKind) &&
+ !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
+ return false;
ModifiedDT = true;
@@ -5145,7 +5229,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
if (CmpInst *CI = dyn_cast<CmpInst>(I))
if (!TLI || !TLI->hasMultipleConditionRegisters())
- return OptimizeCmpExpression(CI);
+ return OptimizeCmpExpression(CI, TLI);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
stripInvariantGroupMetadata(*LI);
@@ -5221,7 +5305,7 @@ static bool makeBitReverse(Instruction &I, const DataLayout &DL,
return false;
SmallVector<Instruction*, 4> Insts;
- if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+ if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
return false;
Instruction *LastInst = Insts.back();
I.replaceAllUsesWith(LastInst);
@@ -5249,12 +5333,13 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
for (auto &I : reverse(BB)) {
if (makeBitReverse(I, *DL, *TLI)) {
MadeBitReverse = MadeChange = true;
+ ModifiedDT = true;
break;
}
}
}
MadeChange |= dupRetToEnableTailCallOpts(&BB);
-
+
return MadeChange;
}
@@ -5310,43 +5395,38 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
return false;
bool MadeChange = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
- BasicBlock *BB = &*I++;
-
+ for (BasicBlock &BB : F) {
// Does this BB end with the following?
// %andVal = and %val, #single-bit-set
// %icmpVal = icmp %andResult, 0
// br i1 %cmpVal label %dest1, label %dest2"
- BranchInst *Brcc = dyn_cast<BranchInst>(BB->getTerminator());
+ BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
if (!Brcc || !Brcc->isConditional())
continue;
ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
- if (!Cmp || Cmp->getParent() != BB)
+ if (!Cmp || Cmp->getParent() != &BB)
continue;
ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
if (!Zero || !Zero->isZero())
continue;
Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
- if (!And || And->getOpcode() != Instruction::And || And->getParent() != BB)
+ if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
continue;
ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
continue;
- DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB->dump());
+ DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
// Push the "and; icmp" for any users that are conditional branches.
// Since there can only be one branch use per BB, we don't need to keep
// track of which BBs we insert into.
- for (Value::use_iterator UI = Cmp->use_begin(), E = Cmp->use_end();
- UI != E; ) {
- Use &TheUse = *UI;
+ for (Use &TheUse : Cmp->uses()) {
// Find brcc use.
- BranchInst *BrccUser = dyn_cast<BranchInst>(*UI);
- ++UI;
+ BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
if (!BrccUser || !BrccUser->isConditional())
continue;
BasicBlock *UserBB = BrccUser->getParent();
- if (UserBB == BB) continue;
+ if (UserBB == &BB) continue;
DEBUG(dbgs() << "found Brcc use\n");
// Sink the "and; icmp" to use.
@@ -5365,29 +5445,6 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
return MadeChange;
}
-/// \brief Retrieve the probabilities of a conditional branch. Returns true on
-/// success, or returns false if no or invalid metadata was found.
-static bool extractBranchMetadata(BranchInst *BI,
- uint64_t &ProbTrue, uint64_t &ProbFalse) {
- assert(BI->isConditional() &&
- "Looking for probabilities on unconditional branch?");
- auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
- if (!ProfileData || ProfileData->getNumOperands() != 3)
- return false;
-
- const auto *CITrue =
- mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
- const auto *CIFalse =
- mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
- if (!CITrue || !CIFalse)
- return false;
-
- ProbTrue = CITrue->getValue().getZExtValue();
- ProbFalse = CIFalse->getValue().getZExtValue();
-
- return true;
-}
-
/// \brief Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
@@ -5456,11 +5513,9 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
// Create a new BB.
- auto *InsertBefore = std::next(Function::iterator(BB))
- .getNodePtrUnchecked();
- auto TmpBB = BasicBlock::Create(BB.getContext(),
- BB.getName() + ".cond.split",
- BB.getParent(), InsertBefore);
+ auto TmpBB =
+ BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
+ BB.getParent(), BB.getNextNode());
// Update original basic block by using the first condition directly by the
// branch instruction and removing the no longer needed and/or instruction.
@@ -5535,7 +5590,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
// Another choice is to assume TrueProb for BB1 equals to TrueProb for
// TmpBB, but the math is more complicated.
uint64_t TrueWeight, FalseWeight;
- if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+ if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
uint64_t NewTrueWeight = TrueWeight;
uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
scaleWeights(NewTrueWeight, NewFalseWeight);
@@ -5568,7 +5623,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
// assumes that
// FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
uint64_t TrueWeight, FalseWeight;
- if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+ if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
uint64_t NewFalseWeight = FalseWeight;
scaleWeights(NewTrueWeight, NewFalseWeight);
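
The hunks above replace the file-local extractBranchMetadata() helper with BranchInst::extractProfMetadata(), which performs the same "branch_weights" !prof lookup and likewise reports failure when the metadata is missing or malformed. A minimal usage sketch of that call pattern; getBranchWeights is an illustrative name and not part of CodeGenPrepare, only the extractProfMetadata() call itself is taken from the patch:

#include "llvm/IR/Instructions.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

// Read the true/false weights the way the deleted helper did; the call
// returns false when no usable !prof metadata is attached to the branch.
static bool getBranchWeights(BranchInst *BI, uint64_t &TrueWeight,
                             uint64_t &FalseWeight) {
  assert(BI->isConditional() &&
         "Looking for probabilities on unconditional branch?");
  return BI->extractProfMetadata(TrueWeight, FalseWeight);
}

The recovered weights can then be recombined for the split branches and passed through scaleWeights() before being re-attached, as the surrounding hunks do with NewTrueWeight and NewFalseWeight.
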
diff --git a/lib/CodeGen/CoreCLRGC.cpp b/lib/CodeGen/CoreCLRGC.cpp
deleted file mode 100644
index ff7c0d5dc0ac..000000000000
--- a/lib/CodeGen/CoreCLRGC.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//===-- CoreCLRGC.cpp - CoreCLR Runtime GC Strategy -----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a GCStrategy for the CoreCLR Runtime.
-// The strategy is similar to Statepoint-example GC, but differs from it in
-// certain aspects, such as:
-// 1) Base-pointers need not be explicitly tracked and reported for
-// interior pointers
-// 2) Uses a different format for encoding stack-maps
-// 3) Location of Safe-point polls: polls are only needed before loop-back edges
-// and before tail-calls (not needed at function-entry)
-//
-// The above differences in behavior are to be implemented in upcoming checkins.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Value.h"
-
-using namespace llvm;
-
-namespace {
-class CoreCLRGC : public GCStrategy {
-public:
- CoreCLRGC() {
- UseStatepoints = true;
- // These options are all gc.root specific, we specify them so that the
- // gc.root lowering code doesn't run.
- InitRoots = false;
- NeededSafePoints = 0;
- UsesMetadata = false;
- CustomRoots = false;
- }
- Optional<bool> isGCManagedPointer(const Type *Ty) const override {
- // Method is only valid on pointer typed values.
- const PointerType *PT = cast<PointerType>(Ty);
- // We pick addrspace(1) as our GC managed heap.
- return (1 == PT->getAddressSpace());
- }
-};
-}
-
-static GCRegistry::Add<CoreCLRGC> X("coreclr", "CoreCLR-compatible GC");
-
-namespace llvm {
-void linkCoreCLRGC() {}
-}
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index c924ba30c8a1..a0189a172bfc 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -87,7 +87,7 @@ void CriticalAntiDepBreaker::FinishBlock() {
KeepRegs.reset();
}
-void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+void CriticalAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
unsigned InsertPosIndex) {
// Kill instructions can define registers but are really nops, and there might
// be a real definition earlier that needs to be paired with uses dominated by
@@ -96,7 +96,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
// FIXME: It may be possible to remove the isKill() restriction once PR18663
// has been properly fixed. There can be value in processing kills as seen in
// the AggressiveAntiDepBreaker class.
- if (MI->isDebugValue() || MI->isKill())
+ if (MI.isDebugValue() || MI.isKill())
return;
assert(Count < InsertPosIndex && "Instruction index out of expected range!");
@@ -146,7 +146,7 @@ static const SDep *CriticalPathStep(const SUnit *SU) {
return Next;
}
-void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
+void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
// It's not safe to change register allocation for source operands of
// instructions that have special allocation requirements. Also assume all
// registers used in a call must not be changed (ABI).
@@ -163,21 +163,20 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
// instruction which may not be executed. The second R6 def may or may not
// re-define R6 so it's not safe to change it since the last R6 use cannot be
// changed.
- bool Special = MI->isCall() ||
- MI->hasExtraSrcRegAllocReq() ||
- TII->isPredicated(MI);
+ bool Special =
+ MI.isCall() || MI.hasExtraSrcRegAllocReq() || TII->isPredicated(MI);
// Scan the register operands for this instruction and update
// Classes and RegRefs.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
const TargetRegisterClass *NewRC = nullptr;
- if (i < MI->getDesc().getNumOperands())
- NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
// For now, only allow the register to be changed if its register
// class is consistent across all uses.
@@ -212,7 +211,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
// of a register? In the above 'xor' example, the uses of %eax are undef, so
// earlier instructions could still replace %eax even though the 'xor'
// itself can't be changed.
- if (MI->isRegTiedToUseOperand(i) &&
+ if (MI.isRegTiedToUseOperand(i) &&
Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) {
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs) {
@@ -234,18 +233,17 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
}
}
-void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
- unsigned Count) {
+void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
// Update liveness.
// Proceeding upwards, registers that are defed but not used in this
// instruction are now dead.
- assert(!MI->isKill() && "Attempting to scan a kill instruction");
+ assert(!MI.isKill() && "Attempting to scan a kill instruction");
if (!TII->isPredicated(MI)) {
// Predicated defs are modeled as read + write, i.e. similar to two
// address updates.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isRegMask())
for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
@@ -262,11 +260,13 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
if (Reg == 0) continue;
if (!MO.isDef()) continue;
- // If we've already marked this reg as unchangeable, carry on.
- if (KeepRegs.test(Reg)) continue;
-
// Ignore two-addr defs.
- if (MI->isRegTiedToUseOperand(i)) continue;
+ if (MI.isRegTiedToUseOperand(i))
+ continue;
+
+ // If we've already marked this reg as unchangeable, don't remove
+ // it or any of its subregs from KeepRegs.
+ bool Keep = KeepRegs.test(Reg);
// For the reg itself and all subregs: update the def to current;
// reset the kill state, any restrictions, and references.
@@ -274,25 +274,26 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
unsigned SubregReg = *SRI;
DefIndices[SubregReg] = Count;
KillIndices[SubregReg] = ~0u;
- KeepRegs.reset(SubregReg);
Classes[SubregReg] = nullptr;
RegRefs.erase(SubregReg);
+ if (!Keep)
+ KeepRegs.reset(SubregReg);
}
// Conservatively mark super-registers as unusable.
for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1);
}
}
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
if (!MO.isUse()) continue;
const TargetRegisterClass *NewRC = nullptr;
- if (i < MI->getDesc().getNumOperands())
- NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
// For now, only allow the register to be changed if its register
// class is consistent across all uses.
@@ -510,7 +511,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
unsigned Broken = 0;
unsigned Count = InsertPosIndex - 1;
for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) {
- MachineInstr *MI = --I;
+ MachineInstr &MI = *--I;
// Kill instructions can define registers but are really nops, and there
// might be a real definition earlier that needs to be paired with uses
// dominated by this kill.
@@ -518,7 +519,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// FIXME: It may be possible to remove the isKill() restriction once PR18663
// has been properly fixed. There can be value in processing kills as seen
// in the AggressiveAntiDepBreaker class.
- if (MI->isDebugValue() || MI->isKill())
+ if (MI.isDebugValue() || MI.isKill())
continue;
// Check if this instruction has a dependence on the critical path that
@@ -535,7 +536,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// edge per instruction. Note that we'd have to be able to break all of
// the anti-dependencies in an instruction in order to be effective.
unsigned AntiDepReg = 0;
- if (MI == CriticalPathMI) {
+ if (&MI == CriticalPathMI) {
if (const SDep *Edge = CriticalPathStep(CriticalPathSU)) {
const SUnit *NextSU = Edge->getSUnit();
@@ -585,7 +586,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// If MI's defs have a special allocation requirement, don't allow
// any def registers to be changed. Also assume all registers
// defined in a call must not be changed (ABI).
- if (MI->isCall() || MI->hasExtraDefRegAllocReq() || TII->isPredicated(MI))
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI))
// If this instruction's defs have special allocation requirement, don't
// break this anti-dependency.
AntiDepReg = 0;
@@ -594,8 +595,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// is invalid. If the instruction defines other registers,
// save a list of them so that we don't pick a new register
// that overlaps any of them.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
@@ -647,7 +648,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
for (DbgValueVector::iterator DVI = DbgValues.begin(),
DVE = DbgValues.end(); DVI != DVE; ++DVI)
if (DVI->second == Q->second->getParent())
- UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
+ UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
}
// We just went back in time and modified history; the
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 10b873959ad0..678779fa1a26 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -19,17 +19,15 @@
#include "AntiDepBreaker.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include <map>
namespace llvm {
class RegisterClassInfo;
class TargetInstrInfo;
class TargetRegisterInfo;
+class MachineFunction;
class LLVM_LIBRARY_VISIBILITY CriticalAntiDepBreaker : public AntiDepBreaker {
MachineFunction& MF;
@@ -84,15 +82,15 @@ class LLVM_LIBRARY_VISIBILITY CriticalAntiDepBreaker : public AntiDepBreaker {
/// Update liveness information to account for the current
/// instruction, which will not be scheduled.
- void Observe(MachineInstr *MI, unsigned Count,
+ void Observe(MachineInstr &MI, unsigned Count,
unsigned InsertPosIndex) override;
/// Finish anti-dep breaking for a basic block.
void FinishBlock() override;
private:
- void PrescanInstruction(MachineInstr *MI);
- void ScanInstruction(MachineInstr *MI, unsigned Count);
+ void PrescanInstruction(MachineInstr &MI);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
bool isNewRegClobberedByRefs(RegRefIter RegRefBegin,
RegRefIter RegRefEnd,
unsigned NewReg);
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index af6b6a392b75..2386af9e6877 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -23,12 +23,15 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "packets"
+
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetInstrInfo.h"
+
using namespace llvm;
// --------------------------------------------------------------------
@@ -44,8 +47,8 @@ namespace {
/// DFAPacketizerEmitter.cpp.
DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) {
DFAInput InsnInput = 0;
- assert ((InsnClass.size() <= DFA_MAX_RESTERMS) &&
- "Exceeded maximum number of DFA terms");
+ assert((InsnClass.size() <= DFA_MAX_RESTERMS) &&
+ "Exceeded maximum number of DFA terms");
for (auto U : InsnClass)
InsnInput = addDFAFuncUnits(InsnInput, U);
return InsnInput;
@@ -59,15 +62,16 @@ DFAPacketizer::DFAPacketizer(const InstrItineraryData *I,
InstrItins(I), CurrentState(0), DFAStateInputTable(SIT),
DFAStateEntryTable(SET) {
// Make sure DFA types are large enough for the number of terms & resources.
- assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAInput))
- && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput");
- assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput))
- && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
+ static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <=
+ (8 * sizeof(DFAInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput");
+ static_assert(
+ (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
}
-//
-// ReadTable - Read the DFA transition table and update CachedTable.
+// Read the DFA transition table and update CachedTable.
//
// Format of the transition tables:
// DFAStateInputTable[][2] = pairs of <Input, Transition> for all valid
@@ -80,8 +84,7 @@ void DFAPacketizer::ReadTable(unsigned int state) {
unsigned NextStateInTable = DFAStateEntryTable[state+1];
// Early exit in case CachedTable already contains this
// state's transitions.
- if (CachedTable.count(UnsignPair(state,
- DFAStateInputTable[ThisState][0])))
+ if (CachedTable.count(UnsignPair(state, DFAStateInputTable[ThisState][0])))
return;
for (unsigned i = ThisState; i < NextStateInTable; i++)
@@ -89,38 +92,41 @@ void DFAPacketizer::ReadTable(unsigned int state) {
DFAStateInputTable[i][1];
}
-//
-// getInsnInput - Return the DFAInput for an instruction class.
-//
+
+// Return the DFAInput for an instruction class.
DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) {
// Note: this logic must match that in DFAPacketizerDefs.h for input vectors.
DFAInput InsnInput = 0;
unsigned i = 0;
+ (void)i;
for (const InstrStage *IS = InstrItins->beginStage(InsnClass),
- *IE = InstrItins->endStage(InsnClass); IS != IE; ++IS, ++i) {
+ *IE = InstrItins->endStage(InsnClass); IS != IE; ++IS) {
InsnInput = addDFAFuncUnits(InsnInput, IS->getUnits());
- assert ((i < DFA_MAX_RESTERMS) && "Exceeded maximum number of DFA inputs");
+ assert((i++ < DFA_MAX_RESTERMS) && "Exceeded maximum number of DFA inputs");
}
return InsnInput;
}
-// getInsnInput - Return the DFAInput for an instruction class input vector.
+
+// Return the DFAInput for an instruction class input vector.
DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) {
return getDFAInsnInput(InsnClass);
}
-// canReserveResources - Check if the resources occupied by a MCInstrDesc
-// are available in the current state.
+
+// Check if the resources occupied by a MCInstrDesc are available in the
+// current state.
bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
ReadTable(CurrentState);
- return (CachedTable.count(StateTrans) != 0);
+ return CachedTable.count(StateTrans) != 0;
}
-// reserveResources - Reserve the resources occupied by a MCInstrDesc and
-// change the current state to reflect that change.
+
+// Reserve the resources occupied by a MCInstrDesc and change the current
+// state to reflect that change.
void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
@@ -131,34 +137,46 @@ void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
}
-// canReserveResources - Check if the resources occupied by a machine
-// instruction are available in the current state.
-bool DFAPacketizer::canReserveResources(llvm::MachineInstr *MI) {
- const llvm::MCInstrDesc &MID = MI->getDesc();
+// Check if the resources occupied by a machine instruction are available
+// in the current state.
+bool DFAPacketizer::canReserveResources(llvm::MachineInstr &MI) {
+ const llvm::MCInstrDesc &MID = MI.getDesc();
return canReserveResources(&MID);
}
-// reserveResources - Reserve the resources occupied by a machine
-// instruction and change the current state to reflect that change.
-void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) {
- const llvm::MCInstrDesc &MID = MI->getDesc();
+
+// Reserve the resources occupied by a machine instruction and change the
+// current state to reflect that change.
+void DFAPacketizer::reserveResources(llvm::MachineInstr &MI) {
+ const llvm::MCInstrDesc &MID = MI.getDesc();
reserveResources(&MID);
}
+
namespace llvm {
-// DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides
-// Schedule method to build the dependence graph.
+// This class extends ScheduleDAGInstrs and overrides the schedule method
+// to build the dependence graph.
class DefaultVLIWScheduler : public ScheduleDAGInstrs {
private:
AliasAnalysis *AA;
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
public:
DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
AliasAnalysis *AA);
- // Schedule - Actual scheduling work.
+ // Actual scheduling work.
void schedule() override;
+
+ /// DefaultVLIWScheduler takes ownership of the Mutation object.
+ void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
+ Mutations.push_back(std::move(Mutation));
+ }
+protected:
+ void postprocessDAG();
};
}
+
DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
MachineLoopInfo &MLI,
AliasAnalysis *AA)
@@ -166,42 +184,51 @@ DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
CanHandleTerminators = true;
}
+
+/// Apply each ScheduleDAGMutation step in order.
+void DefaultVLIWScheduler::postprocessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+
void DefaultVLIWScheduler::schedule() {
// Build the scheduling graph.
buildSchedGraph(AA);
+ postprocessDAG();
}
-// VLIWPacketizerList Ctor
-VLIWPacketizerList::VLIWPacketizerList(MachineFunction &MF,
- MachineLoopInfo &MLI, AliasAnalysis *AA)
- : MF(MF), AA(AA) {
- TII = MF.getSubtarget().getInstrInfo();
+
+VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
+ MachineLoopInfo &mli, AliasAnalysis *aa)
+ : MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) {
ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
- VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, AA);
+ VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA);
}
-// VLIWPacketizerList Dtor
+
VLIWPacketizerList::~VLIWPacketizerList() {
if (VLIWScheduler)
delete VLIWScheduler;
-
if (ResourceTracker)
delete ResourceTracker;
}
-// endPacket - End the current packet, bundle packet instructions and reset
-// DFA state.
+
+// End the current packet, bundle packet instructions and reset DFA state.
void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
- MachineInstr *MI) {
+ MachineBasicBlock::iterator MI) {
if (CurrentPacketMIs.size() > 1) {
- MachineInstr *MIFirst = CurrentPacketMIs.front();
- finalizeBundle(*MBB, MIFirst->getIterator(), MI->getIterator());
+ MachineInstr &MIFirst = *CurrentPacketMIs.front();
+ finalizeBundle(*MBB, MIFirst.getIterator(), MI.getInstrIterator());
}
CurrentPacketMIs.clear();
ResourceTracker->clearResources();
+ DEBUG(dbgs() << "End packet\n");
}
-// PacketizeMIs - Bundle machine instructions into packets.
+
+// Bundle machine instructions into packets.
void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
MachineBasicBlock::iterator BeginItr,
MachineBasicBlock::iterator EndItr) {
@@ -211,64 +238,88 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
std::distance(BeginItr, EndItr));
VLIWScheduler->schedule();
+ DEBUG({
+ dbgs() << "Scheduling DAG of the packetize region\n";
+ for (SUnit &SU : VLIWScheduler->SUnits)
+ SU.dumpAll(VLIWScheduler);
+ });
+
// Generate MI -> SU map.
MIToSUnit.clear();
- for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) {
- SUnit *SU = &VLIWScheduler->SUnits[i];
- MIToSUnit[SU->getInstr()] = SU;
- }
+ for (SUnit &SU : VLIWScheduler->SUnits)
+ MIToSUnit[SU.getInstr()] = &SU;
// The main packetizer loop.
for (; BeginItr != EndItr; ++BeginItr) {
- MachineInstr *MI = BeginItr;
-
- this->initPacketizerState();
+ MachineInstr &MI = *BeginItr;
+ initPacketizerState();
// End the current packet if needed.
- if (this->isSoloInstruction(MI)) {
+ if (isSoloInstruction(MI)) {
endPacket(MBB, MI);
continue;
}
// Ignore pseudo instructions.
- if (this->ignorePseudoInstruction(MI, MBB))
+ if (ignorePseudoInstruction(MI, MBB))
continue;
- SUnit *SUI = MIToSUnit[MI];
+ SUnit *SUI = MIToSUnit[&MI];
assert(SUI && "Missing SUnit Info!");
// Ask DFA if machine resource is available for MI.
+ DEBUG(dbgs() << "Checking resources for adding MI to packet " << MI);
+
bool ResourceAvail = ResourceTracker->canReserveResources(MI);
+ DEBUG({
+ if (ResourceAvail)
+ dbgs() << " Resources are available for adding MI to packet\n";
+ else
+ dbgs() << " Resources NOT available\n";
+ });
if (ResourceAvail && shouldAddToPacket(MI)) {
// Dependency check for MI with instructions in CurrentPacketMIs.
- for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
- VE = CurrentPacketMIs.end(); VI != VE; ++VI) {
- MachineInstr *MJ = *VI;
+ for (auto MJ : CurrentPacketMIs) {
SUnit *SUJ = MIToSUnit[MJ];
assert(SUJ && "Missing SUnit Info!");
+ DEBUG(dbgs() << " Checking against MJ " << *MJ);
// Is it legal to packetize SUI and SUJ together.
- if (!this->isLegalToPacketizeTogether(SUI, SUJ)) {
+ if (!isLegalToPacketizeTogether(SUI, SUJ)) {
+ DEBUG(dbgs() << " Not legal to add MI, try to prune\n");
// Allow packetization if dependency can be pruned.
- if (!this->isLegalToPruneDependencies(SUI, SUJ)) {
+ if (!isLegalToPruneDependencies(SUI, SUJ)) {
// End the packet if dependency cannot be pruned.
+ DEBUG(dbgs() << " Could not prune dependencies for adding MI\n");
endPacket(MBB, MI);
break;
- } // !isLegalToPruneDependencies.
- } // !isLegalToPacketizeTogether.
- } // For all instructions in CurrentPacketMIs.
+ }
+ DEBUG(dbgs() << " Pruned dependence for adding MI\n");
+ }
+ }
} else {
+ DEBUG(if (ResourceAvail)
+ dbgs() << "Resources are available, but instruction should not be "
+ "added to packet\n " << MI);
// End the packet if resource is not available, or if the instruction
// should not be added to the current packet.
endPacket(MBB, MI);
}
// Add MI to the current packet.
- BeginItr = this->addToPacket(MI);
- } // For all instructions in BB.
+ DEBUG(dbgs() << "* Adding MI to packet " << MI << '\n');
+ BeginItr = addToPacket(MI);
+ } // For all instructions in the packetization range.
// End any packet left behind.
endPacket(MBB, EndItr);
VLIWScheduler->exitRegion();
VLIWScheduler->finishBlock();
}
+
+
+// Add a DAG mutation object to the ordered list.
+void VLIWPacketizerList::addMutation(
+ std::unique_ptr<ScheduleDAGMutation> Mutation) {
+ VLIWScheduler->addMutation(std::move(Mutation));
+}
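
The DFAPacketizer changes above add an addMutation() hook so that targets can attach ScheduleDAGMutation steps, which postprocessDAG() now runs right after buildSchedGraph(). A sketch of how a target might register such a step, assuming addMutation() is exposed by the VLIWPacketizerList header as its out-of-line definition above implies; ExampleMutation and installExampleMutation are illustrative names, the include set simply mirrors DFAPacketizer.cpp, and the apply() signature follows the M->apply(this) call in postprocessDAG():

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
using namespace llvm;

namespace {
// A DAG postprocessing step invoked by DefaultVLIWScheduler::postprocessDAG().
class ExampleMutation : public ScheduleDAGMutation {
  void apply(ScheduleDAGInstrs *DAG) override {
    // Placeholder: a real mutation would inspect DAG->SUnits and add or
    // adjust edges here before packetization.
    (void)DAG;
  }
};
} // end anonymous namespace

// Ownership of the mutation transfers to the packetizer's internal scheduler,
// matching the unique_ptr-based interface added above.
static void installExampleMutation(VLIWPacketizerList &Packetizer) {
  Packetizer.addMutation(llvm::make_unique<ExampleMutation>());
}

A target-specific packetizer would call this (or addMutation() directly) before PacketizeMIs() runs on a region, since the mutations are applied during the internal schedule() call.
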
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index b11b49717c45..0b8dc7a86ada 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -42,6 +42,11 @@ namespace {
initializeDeadMachineInstructionElimPass(*PassRegistry::getPassRegistry());
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
bool isDead(const MachineInstr *MI) const;
};
@@ -90,7 +95,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
}
bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
bool AnyChanges = false;
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
new file mode 100644
index 000000000000..1d9e79c055e0
--- /dev/null
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -0,0 +1,602 @@
+//===- DetectDeadLanes.cpp - SubRegister Lane Usage Analysis --*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Analysis that tracks defined/used subregister lanes across COPY instructions
+/// and instructions that get lowered to a COPY (PHI, REG_SEQUENCE,
+/// INSERT_SUBREG, EXTRACT_SUBREG).
+/// The information is used to detect dead definitions and the usage of
+/// (completely) undefined values and mark the operands as such.
+/// This pass is necessary because the dead/undef status is not obvious anymore
+/// when subregisters are involved.
+///
+/// Example:
+/// %vreg0 = some definition
+/// %vreg1 = IMPLICIT_DEF
+/// %vreg2 = REG_SEQUENCE %vreg0, sub0, %vreg1, sub1
+/// %vreg3 = EXTRACT_SUBREG %vreg2, sub1
+/// = use %vreg3
+/// The %vreg0 definition is dead and %vreg3 contains an undefined value.
+//
+//===----------------------------------------------------------------------===//
+
+#include <deque>
+#include <vector>
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "detect-dead-lanes"
+
+namespace {
+
+/// Contains a bitmask of which lanes of a given virtual register are
+/// defined and which ones are actually used.
+struct VRegInfo {
+ LaneBitmask UsedLanes;
+ LaneBitmask DefinedLanes;
+};
+
+class DetectDeadLanes : public MachineFunctionPass {
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+ DetectDeadLanes() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "Detect Dead Lanes"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ /// Add used lane bits on the register used by operand \p MO. This translates
+ /// the bitmask based on the operands subregister, and puts the register into
+ /// the worklist if any new bits were added.
+ void addUsedLanesOnOperand(const MachineOperand &MO, LaneBitmask UsedLanes);
+
+ /// Given a bitmask \p UsedLanes for the used lanes on a def output of a
+ /// COPY-like instruction determine the lanes used on the use operands
+ /// and call addUsedLanesOnOperand() for them.
+ void transferUsedLanesStep(const MachineInstr &MI, LaneBitmask UsedLanes);
+
+ /// Given a use register operand \p Use and a mask of defined lanes, check
+ /// if the operand belongs to a lowersToCopies() instruction, transfer the
+ /// mask to the def and put the instruction into the worklist.
+ void transferDefinedLanesStep(const MachineOperand &Use,
+ LaneBitmask DefinedLanes);
+
+ /// Given a mask \p DefinedLanes of lanes defined at operand \p OpNum
+ /// of a COPY-like instruction, determine which lanes are defined at the output
+ /// operand \p Def.
+ LaneBitmask transferDefinedLanes(const MachineOperand &Def, unsigned OpNum,
+ LaneBitmask DefinedLanes) const;
+
+ /// Given a mask \p UsedLanes used from the output of instruction \p MI
+ /// determine which lanes are used from operand \p MO of this instruction.
+ LaneBitmask transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes,
+ const MachineOperand &MO) const;
+
+ bool runOnce(MachineFunction &MF);
+
+ LaneBitmask determineInitialDefinedLanes(unsigned Reg);
+ LaneBitmask determineInitialUsedLanes(unsigned Reg);
+
+ bool isUndefRegAtInput(const MachineOperand &MO,
+ const VRegInfo &RegInfo) const;
+
+ bool isUndefInput(const MachineOperand &MO, bool *CrossCopy) const;
+
+ const MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+ void PutInWorklist(unsigned RegIdx) {
+ if (WorklistMembers.test(RegIdx))
+ return;
+ WorklistMembers.set(RegIdx);
+ Worklist.push_back(RegIdx);
+ }
+
+ VRegInfo *VRegInfos;
+ /// Worklist containing virtreg indexes.
+ std::deque<unsigned> Worklist;
+ BitVector WorklistMembers;
+ /// This bitvector is set for each vreg index where the vreg is defined
+ /// by an instruction where lowersToCopies()==true.
+ BitVector DefinedByCopy;
+};
+
+} // end anonymous namespace
+
+char DetectDeadLanes::ID = 0;
+char &llvm::DetectDeadLanesID = DetectDeadLanes::ID;
+
+INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes",
+ false, false)
+
+/// Returns true if \p MI will get lowered to a series of COPY instructions.
+/// We call this a COPY-like instruction.
+static bool lowersToCopies(const MachineInstr &MI) {
+ // Note: We could support instructions with MCInstrDesc::isRegSequenceLike(),
+ // isExtractSubRegLike(), isInsertSubregLike() in the future even though they
+ // are not lowered to a COPY.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::EXTRACT_SUBREG:
+ return true;
+ }
+ return false;
+}
+
+static bool isCrossCopy(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ const TargetRegisterClass *DstRC,
+ const MachineOperand &MO) {
+ assert(lowersToCopies(MI));
+ unsigned SrcReg = MO.getReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ if (DstRC == SrcRC)
+ return false;
+
+ unsigned SrcSubIdx = MO.getSubReg();
+
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned DstSubIdx = 0;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::INSERT_SUBREG:
+ if (MI.getOperandNo(&MO) == 2)
+ DstSubIdx = MI.getOperand(3).getImm();
+ break;
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned OpNum = MI.getOperandNo(&MO);
+ DstSubIdx = MI.getOperand(OpNum+1).getImm();
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubReg = MI.getOperand(2).getImm();
+ SrcSubIdx = TRI.composeSubRegIndices(SubReg, SrcSubIdx);
+ }
+ }
+
+ unsigned PreA, PreB; // Unused.
+ if (SrcSubIdx && DstSubIdx)
+ return !TRI.getCommonSuperRegClass(SrcRC, SrcSubIdx, DstRC, DstSubIdx, PreA,
+ PreB);
+ if (SrcSubIdx)
+ return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx);
+ if (DstSubIdx)
+ return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx);
+ return !TRI.getCommonSubClass(SrcRC, DstRC);
+}
+
+void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO,
+ LaneBitmask UsedLanes) {
+ if (!MO.readsReg())
+ return;
+ unsigned MOReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(MOReg))
+ return;
+
+ unsigned MOSubReg = MO.getSubReg();
+ if (MOSubReg != 0)
+ UsedLanes = TRI->composeSubRegIndexLaneMask(MOSubReg, UsedLanes);
+ UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg);
+
+ unsigned MORegIdx = TargetRegisterInfo::virtReg2Index(MOReg);
+ VRegInfo &MORegInfo = VRegInfos[MORegIdx];
+ LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes;
+ // Any change at all?
+ if ((UsedLanes & ~PrevUsedLanes) == 0)
+ return;
+
+ // Set UsedLanes and remember instruction for further propagation.
+ MORegInfo.UsedLanes = PrevUsedLanes | UsedLanes;
+ if (DefinedByCopy.test(MORegIdx))
+ PutInWorklist(MORegIdx);
+}
+
+void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI,
+ LaneBitmask UsedLanes) {
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ LaneBitmask UsedOnMO = transferUsedLanes(MI, UsedLanes, MO);
+ addUsedLanesOnOperand(MO, UsedOnMO);
+ }
+}
+
+LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI,
+ LaneBitmask UsedLanes,
+ const MachineOperand &MO) const {
+ unsigned OpNum = MI.getOperandNo(&MO);
+ assert(lowersToCopies(MI) && DefinedByCopy[
+ TargetRegisterInfo::virtReg2Index(MI.getOperand(0).getReg())]);
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ return UsedLanes;
+ case TargetOpcode::REG_SEQUENCE: {
+ assert(OpNum % 2 == 1);
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ return TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ LaneBitmask MO2UsedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ if (OpNum == 2)
+ return MO2UsedLanes;
+
+ const MachineOperand &Def = MI.getOperand(0);
+ unsigned DefReg = Def.getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LaneBitmask MO1UsedLanes;
+ if (RC->CoveredBySubRegs)
+ MO1UsedLanes = UsedLanes & ~TRI->getSubRegIndexLaneMask(SubIdx);
+ else
+ MO1UsedLanes = RC->LaneMask;
+
+ assert(OpNum == 1);
+ return MO1UsedLanes;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ assert(OpNum == 1);
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ return TRI->composeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+}
+
+void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use,
+ LaneBitmask DefinedLanes) {
+ if (!Use.readsReg())
+ return;
+ // Check whether the operand writes a vreg and is part of a COPY-like
+ // instruction.
+ const MachineInstr &MI = *Use.getParent();
+ if (MI.getDesc().getNumDefs() != 1)
+ return;
+ // FIXME: PATCHPOINT instructions announce a Def that does not always exist;
+ // they really need to be modeled differently!
+ if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ return;
+ const MachineOperand &Def = *MI.defs().begin();
+ unsigned DefReg = Def.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ return;
+ unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ if (!DefinedByCopy.test(DefRegIdx))
+ return;
+
+ unsigned OpNum = MI.getOperandNo(&Use);
+ DefinedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(Use.getSubReg(), DefinedLanes);
+ DefinedLanes = transferDefinedLanes(Def, OpNum, DefinedLanes);
+
+ VRegInfo &RegInfo = VRegInfos[DefRegIdx];
+ LaneBitmask PrevDefinedLanes = RegInfo.DefinedLanes;
+ // Any change at all?
+ if ((DefinedLanes & ~PrevDefinedLanes) == 0)
+ return;
+
+ RegInfo.DefinedLanes = PrevDefinedLanes | DefinedLanes;
+ PutInWorklist(DefRegIdx);
+}
+
+LaneBitmask DetectDeadLanes::transferDefinedLanes(const MachineOperand &Def,
+ unsigned OpNum, LaneBitmask DefinedLanes) const {
+ const MachineInstr &MI = *Def.getParent();
+ // Translate DefinedLanes if necessary.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ break;
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ if (OpNum == 2) {
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ } else {
+ assert(OpNum == 1 && "INSERT_SUBREG must have two operands");
+ // Ignore lanes defined by operand 2.
+ DefinedLanes &= ~TRI->getSubRegIndexLaneMask(SubIdx);
+ }
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ assert(OpNum == 1 && "EXTRACT_SUBREG must have one register operand only");
+ DefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ break;
+ }
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ break;
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ DefinedLanes &= MRI->getMaxLaneMaskForVReg(Def.getReg());
+ return DefinedLanes;
+}
+
+LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
+ // Live-In or unused registers have no definition but are considered fully
+ // defined.
+ if (!MRI->hasOneDef(Reg))
+ return ~0u;
+
+ const MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &DefMI = *Def.getParent();
+ if (lowersToCopies(DefMI)) {
+ // Start optimistically with no used or defined lanes for copy
+ // instructions. The following dataflow analysis will add more bits.
+ unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ DefinedByCopy.set(RegIdx);
+ PutInWorklist(RegIdx);
+
+ if (Def.isDead())
+ return 0;
+
+ // COPY/PHI can copy across unrelated register classes (example: float/int)
+ // with incompatible subregister structure. Do not include these in the
+ // dataflow analysis since we cannot transfer lanemasks in a meaningful way.
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+
+ // Determine initially DefinedLanes.
+ LaneBitmask DefinedLanes = 0;
+ for (const MachineOperand &MO : DefMI.uses()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+
+ LaneBitmask MODefinedLanes;
+ if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ MODefinedLanes = ~0u;
+ } else if (isCrossCopy(*MRI, DefMI, DefRC, MO)) {
+ MODefinedLanes = ~0u;
+ } else {
+ assert(TargetRegisterInfo::isVirtualRegister(MOReg));
+ if (MRI->hasOneDef(MOReg)) {
+ const MachineOperand &MODef = *MRI->def_begin(MOReg);
+ const MachineInstr &MODefMI = *MODef.getParent();
+ // Bits from copy-like operations will be added later.
+ if (lowersToCopies(MODefMI) || MODefMI.isImplicitDef())
+ continue;
+ }
+ unsigned MOSubReg = MO.getSubReg();
+ MODefinedLanes = MRI->getMaxLaneMaskForVReg(MOReg);
+ MODefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(
+ MOSubReg, MODefinedLanes);
+ }
+
+ unsigned OpNum = DefMI.getOperandNo(&MO);
+ DefinedLanes |= transferDefinedLanes(Def, OpNum, MODefinedLanes);
+ }
+ return DefinedLanes;
+ }
+ if (DefMI.isImplicitDef() || Def.isDead())
+ return 0;
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ return MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
+ LaneBitmask UsedLanes = 0;
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ if (!MO.readsReg())
+ continue;
+
+ const MachineInstr &UseMI = *MO.getParent();
+ if (UseMI.isKill())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (lowersToCopies(UseMI)) {
+ assert(UseMI.getDesc().getNumDefs() == 1);
+ const MachineOperand &Def = *UseMI.defs().begin();
+ unsigned DefReg = Def.getReg();
+ // The used lanes of COPY-like instruction operands are determined by the
+ // following dataflow analysis.
+ if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ // But ignore copies across incompatible register classes.
+ bool CrossCopy = false;
+ if (lowersToCopies(UseMI)) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
+ if (CrossCopy)
+ DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI);
+ }
+
+ if (!CrossCopy)
+ continue;
+ }
+ }
+
+ // Shortcut: All lanes are used.
+ if (SubReg == 0)
+ return MRI->getMaxLaneMaskForVReg(Reg);
+
+ UsedLanes |= TRI->getSubRegIndexLaneMask(SubReg);
+ }
+ return UsedLanes;
+}
+
+bool DetectDeadLanes::isUndefRegAtInput(const MachineOperand &MO,
+ const VRegInfo &RegInfo) const {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ return (RegInfo.DefinedLanes & RegInfo.UsedLanes & Mask) == 0;
+}
+
+bool DetectDeadLanes::isUndefInput(const MachineOperand &MO,
+ bool *CrossCopy) const {
+ if (!MO.isUse())
+ return false;
+ const MachineInstr &MI = *MO.getParent();
+ if (!lowersToCopies(MI))
+ return false;
+ const MachineOperand &Def = MI.getOperand(0);
+ unsigned DefReg = Def.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ return false;
+ unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ if (!DefinedByCopy.test(DefRegIdx))
+ return false;
+
+ const VRegInfo &DefRegInfo = VRegInfos[DefRegIdx];
+ LaneBitmask UsedLanes = transferUsedLanes(MI, DefRegInfo.UsedLanes, MO);
+ if (UsedLanes != 0)
+ return false;
+
+ unsigned MOReg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(MOReg)) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ *CrossCopy = isCrossCopy(*MRI, MI, DstRC, MO);
+ }
+ return true;
+}
+
+bool DetectDeadLanes::runOnce(MachineFunction &MF) {
+ // First pass: Populate defs/uses of vregs with initial values
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+
+ // Determine used/defined lanes and add copy instructions to worklist.
+ VRegInfo &Info = VRegInfos[RegIdx];
+ Info.DefinedLanes = determineInitialDefinedLanes(Reg);
+ Info.UsedLanes = determineInitialUsedLanes(Reg);
+ }
+
+ // Iterate as long as defined lanes/used lanes keep changing.
+ while (!Worklist.empty()) {
+ unsigned RegIdx = Worklist.front();
+ Worklist.pop_front();
+ WorklistMembers.reset(RegIdx);
+ VRegInfo &Info = VRegInfos[RegIdx];
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+
+ // Transfer UsedLanes to operands of DefMI (backwards dataflow).
+ MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &MI = *Def.getParent();
+ transferUsedLanesStep(MI, Info.UsedLanes);
+ // Transfer DefinedLanes to users of Reg (forward dataflow).
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg))
+ transferDefinedLanesStep(MO, Info.DefinedLanes);
+ }
+
+ DEBUG(
+ dbgs() << "Defined/Used lanes:\n";
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ const VRegInfo &Info = VRegInfos[RegIdx];
+ dbgs() << PrintReg(Reg, nullptr)
+ << " Used: " << PrintLaneMask(Info.UsedLanes)
+ << " Def: " << PrintLaneMask(Info.DefinedLanes) << '\n';
+ }
+ dbgs() << "\n";
+ );
+
+ bool Again = false;
+ // Mark operands as dead/unused.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ const VRegInfo &RegInfo = VRegInfos[RegIdx];
+ if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes == 0) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI);
+ MO.setIsDead();
+ }
+ if (MO.readsReg()) {
+ bool CrossCopy = false;
+ if (isUndefRegAtInput(MO, RegInfo)) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
+ << MI);
+ MO.setIsUndef();
+ } else if (isUndefInput(MO, &CrossCopy)) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
+ << MI);
+ MO.setIsUndef();
+ if (CrossCopy)
+ Again = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Again;
+}
+
+bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
+ // Don't bother if we won't track subregister liveness later. This pass is
+ // required for correctness if subregister liveness is enabled because the
+ // register coalescer cannot deal with hidden dead defs. However without
+ // subregister liveness enabled, the expected benefits of this pass are small
+ // so we save the compile time.
+ if (!MF.getSubtarget().enableSubRegLiveness()) {
+ DEBUG(dbgs() << "Skipping Detect dead lanes pass\n");
+ return false;
+ }
+
+ MRI = &MF.getRegInfo();
+ TRI = MRI->getTargetRegisterInfo();
+
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ VRegInfos = new VRegInfo[NumVirtRegs];
+ WorklistMembers.resize(NumVirtRegs);
+ DefinedByCopy.resize(NumVirtRegs);
+
+ bool Again;
+ do {
+ Again = runOnce(MF);
+ } while(Again);
+
+ DefinedByCopy.clear();
+ WorklistMembers.clear();
+ delete[] VRegInfos;
+ return true;
+}
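
runOnce() above is a classic worklist fixed-point: lane masks only ever grow, and a register is revisited whenever one of its masks gains bits, so termination is guaranteed. A self-contained sketch of that pattern in plain C++; the flat Feeds adjacency, the unsigned masks and the function name are illustrative simplifications (the real pass also composes masks through subregister indices and propagates DefinedLanes forwards):

#include <cstdint>
#include <deque>
#include <vector>

struct LaneInfo {
  uint32_t UsedLanes = 0; // bitmask of lanes read from this register
};

// Feeds[R] lists the source registers of the COPY-like instruction defining
// R: any lane used from R must also be treated as used on those sources.
void propagateUsedLanes(std::vector<LaneInfo> &Regs,
                        const std::vector<std::vector<unsigned>> &Feeds) {
  std::deque<unsigned> Worklist;
  std::vector<bool> InWorklist(Regs.size(), false);
  auto Enqueue = [&](unsigned R) {
    if (!InWorklist[R]) {
      InWorklist[R] = true;
      Worklist.push_back(R);
    }
  };
  // Seed with every register's initial used-lane mask.
  for (unsigned R = 0, E = (unsigned)Regs.size(); R != E; ++R)
    Enqueue(R);

  while (!Worklist.empty()) {
    unsigned R = Worklist.front();
    Worklist.pop_front();
    InWorklist[R] = false;
    for (unsigned Src : Feeds[R]) {
      uint32_t Grown = Regs[Src].UsedLanes | Regs[R].UsedLanes;
      if (Grown != Regs[Src].UsedLanes) {
        // The mask grew, so Src (and transitively its feeders) is revisited.
        Regs[Src].UsedLanes = Grown;
        Enqueue(Src);
      }
    }
  }
}

The outer do/while in runOnMachineFunction() then repeats this whole analysis until marking cross-class copy operands as undef no longer exposes new work.
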
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index f3536d74111e..8c96124451f3 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -278,7 +278,7 @@ bool SSAIfConv::findInsertionPoint() {
while (I != B) {
--I;
// Some of the conditional code depends on I.
- if (InsertAfter.count(I)) {
+ if (InsertAfter.count(&*I)) {
DEBUG(dbgs() << "Can't insert code after " << *I);
return false;
}
@@ -386,7 +386,7 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
// The branch we're looking to eliminate must be analyzable.
Cond.clear();
- if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) {
+ if (TII->analyzeBranch(*Head, TBB, FBB, Cond)) {
DEBUG(dbgs() << "Branch not analyzable.\n");
return false;
}
@@ -480,7 +480,7 @@ void SSAIfConv::rewritePHIOperands() {
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
PHIInfo &PI = PHIs[i];
unsigned DstReg = 0;
-
+
DEBUG(dbgs() << "If-converting " << *PI.PHI);
if (PI.TReg == PI.FReg) {
// We do not need the select instruction if both incoming values are
@@ -718,7 +718,7 @@ bool EarlyIfConverter::shouldConvertIf() {
// TBB / FBB data dependencies may delay the select even more.
MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
unsigned BranchDepth =
- HeadTrace.getInstrCycles(IfConv.Head->getFirstTerminator()).Depth;
+ HeadTrace.getInstrCycles(*IfConv.Head->getFirstTerminator()).Depth;
DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
// Look at all the tail phis, and compute the critical path extension caused
@@ -726,8 +726,8 @@ bool EarlyIfConverter::shouldConvertIf() {
MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail);
for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) {
SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
- unsigned Slack = TailTrace.getInstrSlack(PI.PHI);
- unsigned MaxDepth = Slack + TailTrace.getInstrCycles(PI.PHI).Depth;
+ unsigned Slack = TailTrace.getInstrSlack(*PI.PHI);
+ unsigned MaxDepth = Slack + TailTrace.getInstrCycles(*PI.PHI).Depth;
DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
// The condition is pulled into the critical path.
@@ -742,7 +742,7 @@ bool EarlyIfConverter::shouldConvertIf() {
}
// The TBB value is pulled into the critical path.
- unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(PI.PHI), PI.TCycles);
+ unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(*PI.PHI), PI.TCycles);
if (TDepth > MaxDepth) {
unsigned Extra = TDepth - MaxDepth;
DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
@@ -753,7 +753,7 @@ bool EarlyIfConverter::shouldConvertIf() {
}
// The FBB value is pulled into the critical path.
- unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(PI.PHI), PI.FCycles);
+ unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(*PI.PHI), PI.FCycles);
if (FDepth > MaxDepth) {
unsigned Extra = FDepth - MaxDepth;
DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
@@ -785,6 +785,9 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// Only run if conversion if the target wants it.
const TargetSubtargetInfo &STI = MF.getSubtarget();
if (!STI.enableEarlyIfConversion())
diff --git a/lib/CodeGen/ErlangGC.cpp b/lib/CodeGen/ErlangGC.cpp
deleted file mode 100644
index 024946d14362..000000000000
--- a/lib/CodeGen/ErlangGC.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//===-- ErlangGC.cpp - Erlang/OTP GC strategy -------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Erlang/OTP runtime-compatible garbage collector
-// (e.g. defines safe points, root initialization etc.)
-//
-// The frametable emitter is in ErlangGCPrinter.cpp.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/GCs.h"
-#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-using namespace llvm;
-
-namespace {
-
-class ErlangGC : public GCStrategy {
-public:
- ErlangGC();
-};
-}
-
-static GCRegistry::Add<ErlangGC> X("erlang",
- "erlang-compatible garbage collector");
-
-void llvm::linkErlangGC() {}
-
-ErlangGC::ErlangGC() {
- InitRoots = false;
- NeededSafePoints = 1 << GC::PostCall;
- UsesMetadata = true;
- CustomRoots = false;
-}
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index c550008da025..566b8d507b2b 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -168,6 +168,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Execution dependency fix";
}
@@ -315,7 +320,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
// Collapse all the instructions.
while (!dv->Instrs.empty())
- TII->setExecutionDomain(dv->Instrs.pop_back_val(), domain);
+ TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
dv->setSingleDomain(domain);
// If there are multiple users, give them new, unique DomainValues.
@@ -455,7 +460,7 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
return;
// Update instructions with explicit execution domains.
- std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(MI);
+ std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
if (DomP.first) {
if (DomP.second)
visitSoftInstr(MI, DomP.second);
@@ -503,7 +508,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
// Break dependence on undef uses. Do this before updating LiveRegs below.
unsigned OpNum;
- unsigned Pref = TII->getUndefRegClearance(MI, OpNum, TRI);
+ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
if (Pref) {
if (shouldBreakDependence(MI, OpNum, Pref))
UndefReads.push_back(std::make_pair(MI, OpNum));
@@ -526,9 +531,9 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
// Check clearance before partial register updates.
// Call breakDependence before setting LiveRegs[rx].Def.
- unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
if (Pref && shouldBreakDependence(MI, i, Pref))
- TII->breakPartialRegDependency(MI, i, TRI);
+ TII->breakPartialRegDependency(*MI, i, TRI);
// How many instructions since rx was last written?
LiveRegs[rx].Def = CurInstr;
@@ -553,7 +558,9 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
// Collect this block's live out register units.
LiveRegSet.init(TRI);
- LiveRegSet.addLiveOuts(MBB);
+ // We do not need to care about pristine registers as they are just preserved
+ // but not actually used in the function.
+ LiveRegSet.addLiveOutsNoPristines(*MBB);
MachineInstr *UndefMI = UndefReads.back().first;
unsigned OpIdx = UndefReads.back().second;
@@ -564,7 +571,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
if (UndefMI == &I) {
if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
- TII->breakPartialRegDependency(UndefMI, OpIdx, TRI);
+ TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
UndefReads.pop_back();
if (UndefReads.empty())
@@ -638,7 +645,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// If the collapsed operands force a single domain, propagate the collapse.
if (isPowerOf2_32(available)) {
unsigned domain = countTrailingZeros(available);
- TII->setExecutionDomain(mi, domain);
+ TII->setExecutionDomain(*mi, domain);
visitHardInstr(mi, domain);
return;
}
@@ -719,6 +726,8 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
}
bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(*mf.getFunction()))
+ return false;
MF = &mf;
TII = MF->getSubtarget().getInstrInfo();
TRI = MF->getSubtarget().getRegisterInfo();
diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp
index 90ddac94f93b..0ec79c2e69f9 100644
--- a/lib/CodeGen/ExpandISelPseudos.cpp
+++ b/lib/CodeGen/ExpandISelPseudos.cpp
@@ -53,13 +53,12 @@ bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *MBB = &*I;
for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
MBBI != MBBE; ) {
- MachineInstr *MI = MBBI++;
+ MachineInstr &MI = *MBBI++;
// If MI is a pseudo, expand it.
- if (MI->usesCustomInsertionHook()) {
+ if (MI.usesCustomInsertionHook()) {
Changed = true;
- MachineBasicBlock *NewMBB =
- TLI->EmitInstrWithCustomInserter(MI, MBB);
+ MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
// The expansion may involve new basic blocks.
if (NewMBB != MBB) {
MBB = NewMBB;
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index e7bf143b82ee..ab2382e2db6d 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -51,7 +51,7 @@ private:
bool LowerSubregToReg(MachineInstr *MI);
bool LowerCopy(MachineInstr *MI);
- void TransferImplicitDefs(MachineInstr *MI);
+ void TransferImplicitOperands(MachineInstr *MI);
};
} // end anonymous namespace
@@ -61,20 +61,16 @@ char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID;
INITIALIZE_PASS(ExpandPostRA, "postrapseudos",
"Post-RA pseudo instruction expansion pass", false, false)
-/// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered
-/// replacement instructions immediately precede it. Copy any implicit-def
+/// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it. Copy any implicit
/// operands from MI to the replacement instruction.
-void
-ExpandPostRA::TransferImplicitDefs(MachineInstr *MI) {
+void ExpandPostRA::TransferImplicitOperands(MachineInstr *MI) {
MachineBasicBlock::iterator CopyMI = MI;
--CopyMI;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isImplicit() || MO.isUse())
- continue;
- CopyMI->addOperand(MachineOperand::CreateReg(MO.getReg(), true, true));
- }
+ for (const MachineOperand &MO : MI->implicit_operands())
+ if (MO.isReg())
+ CopyMI->addOperand(MO);
}
bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
@@ -167,7 +163,7 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill());
if (MI->getNumOperands() > 2)
- TransferImplicitDefs(MI);
+ TransferImplicitOperands(MI);
DEBUG({
MachineBasicBlock::iterator dMI = MI;
dbgs() << "replaced by: " << *(--dMI);
@@ -192,12 +188,12 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
mbbi != mbbe; ++mbbi) {
for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
mi != me;) {
- MachineInstr *MI = mi;
+ MachineInstr &MI = *mi;
// Advance iterator here because MI may be erased.
++mi;
// Only expand pseudos.
- if (!MI->isPseudo())
+ if (!MI.isPseudo())
continue;
// Give targets a chance to expand even standard pseudos.
@@ -207,12 +203,12 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
}
// Expand standard pseudos.
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case TargetOpcode::SUBREG_TO_REG:
- MadeChange |= LowerSubregToReg(MI);
+ MadeChange |= LowerSubregToReg(&MI);
break;
case TargetOpcode::COPY:
- MadeChange |= LowerCopy(MI);
+ MadeChange |= LowerCopy(&MI);
break;
case TargetOpcode::DBG_VALUE:
continue;
diff --git a/lib/CodeGen/FuncletLayout.cpp b/lib/CodeGen/FuncletLayout.cpp
index 8b2f505ff028..b16f81c728d0 100644
--- a/lib/CodeGen/FuncletLayout.cpp
+++ b/lib/CodeGen/FuncletLayout.cpp
@@ -28,6 +28,10 @@ public:
}
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
}
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
index 484d31737b2e..326adab2ba64 100644
--- a/lib/CodeGen/GCRootLowering.cpp
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -64,7 +64,7 @@ class GCMachineCodeAnalysis : public MachineFunctionPass {
void FindSafePoints(MachineFunction &MF);
void VisitCallPoint(MachineBasicBlock::iterator MI);
MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL) const;
+ const DebugLoc &DL) const;
void FindStackOffsets(MachineFunction &MF);
@@ -170,8 +170,7 @@ static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
if (!InitedRoots.count(*I)) {
StoreInst *SI = new StoreInst(
- ConstantPointerNull::get(cast<PointerType>(
- cast<PointerType>((*I)->getType())->getElementType())),
+ ConstantPointerNull::get(cast<PointerType>((*I)->getAllocatedType())),
*I);
SI->insertAfter(*I);
MadeChange = true;
@@ -271,7 +270,7 @@ void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- DebugLoc DL) const {
+ const DebugLoc &DL) const {
MCSymbol *Label = MBB.getParent()->getContext().createTempSymbol();
BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
return Label;
diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt
new file mode 100644
index 000000000000..e3e81ae5c4b1
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -0,0 +1,27 @@
+# List of all GlobalISel files.
+set(GLOBAL_ISEL_FILES
+ IRTranslator.cpp
+ MachineIRBuilder.cpp
+ RegBankSelect.cpp
+ RegisterBank.cpp
+ RegisterBankInfo.cpp
+ )
+
+# Add GlobalISel files to the dependencies if the user wants to build it.
+if(LLVM_BUILD_GLOBAL_ISEL)
+ set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
+else()
+ set(GLOBAL_ISEL_BUILD_FILES "")
+ set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+endif()
+
+
+# In LLVMBuild.txt files, it is not possible to mark a dependency on a
+# library as optional. So instead, generate an empty library if we did
+# not ask for it.
+add_llvm_library(LLVMGlobalISel
+ ${GLOBAL_ISEL_BUILD_FILES}
+ GlobalISel.cpp
+ )
+
+add_dependencies(LLVMGlobalISel intrinsics_gen)
diff --git a/lib/CodeGen/GlobalISel/GlobalISel.cpp b/lib/CodeGen/GlobalISel/GlobalISel.cpp
new file mode 100644
index 000000000000..231e5ac82bec
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -0,0 +1,30 @@
+//===-- llvm/CodeGen/GlobalISel/GlobalISel.cpp --- GlobalISel ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+// This file implements the common initialization routines for the
+// GlobalISel library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+
+void llvm::initializeGlobalISel(PassRegistry &Registry) {
+}
+
+#else
+
+void llvm::initializeGlobalISel(PassRegistry &Registry) {
+ initializeIRTranslatorPass(Registry);
+ initializeRegBankSelectPass(Registry);
+}
+#endif // LLVM_BUILD_GLOBAL_ISEL
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
new file mode 100644
index 000000000000..b8a960cfac76
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -0,0 +1,164 @@
+//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the IRTranslator class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Target/TargetLowering.h"
+
+#define DEBUG_TYPE "irtranslator"
+
+using namespace llvm;
+
+char IRTranslator::ID = 0;
+INITIALIZE_PASS(IRTranslator, "irtranslator", "IRTranslator LLVM IR -> MI",
+ false, false);
+
+IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) {
+ initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
+}
+
+unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
+ unsigned &ValReg = ValToVReg[&Val];
+ // Check if this is the first time we see Val.
+ if (!ValReg) {
+ // Fill ValRegsSequence with the sequence of registers
+ // we need to concat together to produce the value.
+ assert(Val.getType()->isSized() &&
+ "Don't know how to create an empty vreg");
+ assert(!Val.getType()->isAggregateType() && "Not yet implemented");
+ unsigned Size = Val.getType()->getPrimitiveSizeInBits();
+ unsigned VReg = MRI->createGenericVirtualRegister(Size);
+ ValReg = VReg;
+ assert(!isa<Constant>(Val) && "Not yet implemented");
+ }
+ return ValReg;
+}
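// A minimal standalone sketch of the lazy "create on first lookup" pattern
// used by getOrCreateVReg above, assuming plain STL containers instead of
// LLVM's DenseMap and virtual-register interface; all names are illustrative.
#include <cassert>
#include <string>
#include <unordered_map>

unsigned getOrCreateId(std::unordered_map<std::string, unsigned> &ValToId,
                       unsigned &NextId, const std::string &Val) {
  // operator[] default-constructs the mapped value to 0 on first access,
  // so a zero entry means "not seen yet"; ids therefore start at 1.
  unsigned &Id = ValToId[Val];
  if (!Id)
    Id = NextId++;
  return Id;
}

int main() {
  std::unordered_map<std::string, unsigned> ValToId;
  unsigned NextId = 1;
  assert(getOrCreateId(ValToId, NextId, "a") == 1);
  assert(getOrCreateId(ValToId, NextId, "b") == 2);
  assert(getOrCreateId(ValToId, NextId, "a") == 1); // same id the second time
  return 0;
}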
+
+MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
+ MachineBasicBlock *&MBB = BBToMBB[&BB];
+ if (!MBB) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MBB = MF.CreateMachineBasicBlock();
+ MF.push_back(MBB);
+ }
+ return *MBB;
+}
+
+bool IRTranslator::translateBinaryOp(unsigned Opcode, const Instruction &Inst) {
+ // Get or create a virtual register for each value.
+ // Unless the value is a Constant => loadimm cst?
+ // or inline constant each time?
+ // Creation of a virtual register needs to have a size.
+ unsigned Op0 = getOrCreateVReg(*Inst.getOperand(0));
+ unsigned Op1 = getOrCreateVReg(*Inst.getOperand(1));
+ unsigned Res = getOrCreateVReg(Inst);
+ MIRBuilder.buildInstr(Opcode, Inst.getType(), Res, Op0, Op1);
+ return true;
+}
+
+bool IRTranslator::translateReturn(const Instruction &Inst) {
+ assert(isa<ReturnInst>(Inst) && "Return expected");
+ const Value *Ret = cast<ReturnInst>(Inst).getReturnValue();
+ // The target may mess with the insertion point, but
+ // this is not important as a return is the last instruction
+ // of the block anyway.
+ return CLI->lowerReturn(MIRBuilder, Ret, !Ret ? 0 : getOrCreateVReg(*Ret));
+}
+
+bool IRTranslator::translateBr(const Instruction &Inst) {
+ assert(isa<BranchInst>(Inst) && "Branch expected");
+ const BranchInst &BrInst = *cast<BranchInst>(&Inst);
+ if (BrInst.isUnconditional()) {
+ const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getOperand(0));
+ MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt);
+ MIRBuilder.buildInstr(TargetOpcode::G_BR, BrTgt.getType(), TgtBB);
+ } else {
+ assert(0 && "Not yet implemented");
+ }
+ // Link successors.
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+ for (const BasicBlock *Succ : BrInst.successors())
+ CurBB.addSuccessor(&getOrCreateBB(*Succ));
+ return true;
+}
+
+bool IRTranslator::translate(const Instruction &Inst) {
+ MIRBuilder.setDebugLoc(Inst.getDebugLoc());
+ switch(Inst.getOpcode()) {
+ case Instruction::Add:
+ return translateBinaryOp(TargetOpcode::G_ADD, Inst);
+ case Instruction::Or:
+ return translateBinaryOp(TargetOpcode::G_OR, Inst);
+ case Instruction::Br:
+ return translateBr(Inst);
+ case Instruction::Ret:
+ return translateReturn(Inst);
+
+ default:
+ llvm_unreachable("Opcode not supported");
+ }
+}
+
+
+void IRTranslator::finalize() {
+ // Release the memory used by the different maps we
+ // needed during the translation.
+ ValToVReg.clear();
+ Constants.clear();
+}
+
+bool IRTranslator::runOnMachineFunction(MachineFunction &MF) {
+ const Function &F = *MF.getFunction();
+ if (F.empty())
+ return false;
+ CLI = MF.getSubtarget().getCallLowering();
+ MIRBuilder.setMF(MF);
+ MRI = &MF.getRegInfo();
+ // Setup the arguments.
+ MachineBasicBlock &MBB = getOrCreateBB(F.front());
+ MIRBuilder.setMBB(MBB);
+ SmallVector<unsigned, 8> VRegArgs;
+ for (const Argument &Arg: F.args())
+ VRegArgs.push_back(getOrCreateVReg(Arg));
+ bool Succeeded =
+ CLI->lowerFormalArguments(MIRBuilder, F.getArgumentList(), VRegArgs);
+ if (!Succeeded)
+ report_fatal_error("Unable to lower arguments");
+
+ for (const BasicBlock &BB: F) {
+ MachineBasicBlock &MBB = getOrCreateBB(BB);
+ // Set the insertion point of all the following translations to
+ // the end of this basic block.
+ MIRBuilder.setMBB(MBB);
+ for (const Instruction &Inst: BB) {
+ bool Succeeded = translate(Inst);
+ if (!Succeeded) {
+ DEBUG(dbgs() << "Cannot translate: " << Inst << '\n');
+ report_fatal_error("Unable to translate instruction");
+ }
+ }
+ }
+
+ // Now that the MachineFrameInfo has been configured, no further changes to
+ // the reserved registers are possible.
+ MRI->freezeReservedRegs(MF);
+
+ return false;
+}
diff --git a/lib/CodeGen/GlobalISel/LLVMBuild.txt b/lib/CodeGen/GlobalISel/LLVMBuild.txt
new file mode 100644
index 000000000000..a2daa3b222a6
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/CodeGen/GlobalISel/LLVMBuild.txt -----------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = GlobalISel
+parent = CodeGen
+required_libraries = Analysis CodeGen Core MC Support Target TransformUtils
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
new file mode 100644
index 000000000000..2f19bcf1e68b
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -0,0 +1,104 @@
+//===-- llvm/CodeGen/GlobalISel/MachineIRBuilder.cpp - MIBuilder--*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the MachineIRBuilder class.
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void MachineIRBuilder::setMF(MachineFunction &MF) {
+ this->MF = &MF;
+ this->MBB = nullptr;
+ this->TII = MF.getSubtarget().getInstrInfo();
+ this->DL = DebugLoc();
+ this->MI = nullptr;
+}
+
+void MachineIRBuilder::setMBB(MachineBasicBlock &MBB, bool Beginning) {
+ this->MBB = &MBB;
+ Before = Beginning;
+ assert(&getMF() == MBB.getParent() &&
+ "Basic block is in a different function");
+}
+
+void MachineIRBuilder::setInstr(MachineInstr &MI, bool Before) {
+ assert(MI.getParent() && "Instruction is not part of a basic block");
+ setMBB(*MI.getParent());
+ this->MI = &MI;
+ this->Before = Before;
+}
+
+MachineBasicBlock::iterator MachineIRBuilder::getInsertPt() {
+ if (MI) {
+ if (Before)
+ return MI;
+ if (!MI->getNextNode())
+ return getMBB().end();
+ return MI->getNextNode();
+ }
+ return Before ? getMBB().begin() : getMBB().end();
+}
+
+//------------------------------------------------------------------------------
+// Build instruction variants.
+//------------------------------------------------------------------------------
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty) {
+ MachineInstr *NewMI = BuildMI(getMF(), DL, getTII().get(Opcode));
+ if (Ty) {
+ assert(isPreISelGenericOpcode(Opcode) &&
+ "Only generic instruction can have a type");
+ NewMI->setType(Ty);
+ } else
+ assert(!isPreISelGenericOpcode(Opcode) &&
+ "Generic instruction must have a type");
+ getMBB().insert(getInsertPt(), NewMI);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, unsigned Res,
+ unsigned Op0, unsigned Op1) {
+ return buildInstr(Opcode, nullptr, Res, Op0, Op1);
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty,
+ unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ MachineInstr *NewMI = buildInstr(Opcode, Ty);
+ MachineInstrBuilder(getMF(), NewMI)
+ .addReg(Res, RegState::Define)
+ .addReg(Op0)
+ .addReg(Op1);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, unsigned Res,
+ unsigned Op0) {
+ MachineInstr *NewMI = buildInstr(Opcode, nullptr);
+ MachineInstrBuilder(getMF(), NewMI).addReg(Res, RegState::Define).addReg(Op0);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode) {
+ return buildInstr(Opcode, nullptr);
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty,
+ MachineBasicBlock &BB) {
+ MachineInstr *NewMI = buildInstr(Opcode, Ty);
+ MachineInstrBuilder(getMF(), NewMI).addMBB(&BB);
+ return NewMI;
+}
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
new file mode 100644
index 000000000000..419e270c9127
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -0,0 +1,897 @@
+//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegBankSelect class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "regbankselect"
+
+using namespace llvm;
+
+static cl::opt<RegBankSelect::Mode> RegBankSelectMode(
+ cl::desc("Mode of the RegBankSelect pass"), cl::Hidden, cl::Optional,
+ cl::values(clEnumValN(RegBankSelect::Mode::Fast, "regbankselect-fast",
+ "Run the Fast mode (default mapping)"),
+ clEnumValN(RegBankSelect::Mode::Greedy, "regbankselect-greedy",
+ "Use the Greedy mode (best local mapping)"),
+ clEnumValEnd));
+
+char RegBankSelect::ID = 0;
+INITIALIZE_PASS_BEGIN(RegBankSelect, "regbankselect",
+ "Assign register bank of generic virtual registers",
+ false, false);
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(RegBankSelect, "regbankselect",
+ "Assign register bank of generic virtual registers", false,
+ false);
+
+RegBankSelect::RegBankSelect(Mode RunningMode)
+ : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr),
+ MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) {
+ initializeRegBankSelectPass(*PassRegistry::getPassRegistry());
+ if (RegBankSelectMode.getNumOccurrences() != 0) {
+ OptMode = RegBankSelectMode;
+ if (RegBankSelectMode != RunningMode)
+ DEBUG(dbgs() << "RegBankSelect mode overridden by command line\n");
+ }
+}
+
+void RegBankSelect::init(MachineFunction &MF) {
+ RBI = MF.getSubtarget().getRegBankInfo();
+ assert(RBI && "Cannot work without RegisterBankInfo");
+ MRI = &MF.getRegInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ if (OptMode != Mode::Fast) {
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ } else {
+ MBFI = nullptr;
+ MBPI = nullptr;
+ }
+ MIRBuilder.setMF(MF);
+}
+
+void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (OptMode != Mode::Fast) {
+ // We could preserve the information from these two analyses, but
+ // the APIs do not allow us to do so yet.
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegBankSelect::assignmentMatch(
+ unsigned Reg, const RegisterBankInfo::ValueMapping &ValMapping,
+ bool &OnlyAssign) const {
+ // By default we assume we will have to repair something.
+ OnlyAssign = false;
+ // Each part of a breakdown needs to end up in a different register.
+ // In other words, the Reg assignment does not match.
+ if (ValMapping.BreakDown.size() > 1)
+ return false;
+
+ const RegisterBank *CurRegBank = RBI->getRegBank(Reg, *MRI, *TRI);
+ const RegisterBank *DesiredRegBrank = ValMapping.BreakDown[0].RegBank;
+ // Reg is free of assignment, so a simple assignment will make the
+ // register bank match.
+ OnlyAssign = CurRegBank == nullptr;
+ DEBUG(dbgs() << "Does assignment already match: ";
+ if (CurRegBank) dbgs() << *CurRegBank; else dbgs() << "none";
+ dbgs() << " against ";
+ assert(DesiredRegBrank && "The mapping must be valid");
+ dbgs() << *DesiredRegBrank << '\n';);
+ return CurRegBank == DesiredRegBrank;
+}
+
+void RegBankSelect::repairReg(
+ MachineOperand &MO, const RegisterBankInfo::ValueMapping &ValMapping,
+ RegBankSelect::RepairingPlacement &RepairPt,
+ const iterator_range<SmallVectorImpl<unsigned>::const_iterator> &NewVRegs) {
+ assert(ValMapping.BreakDown.size() == 1 && "Not yet implemented");
+ // An empty range of new registers means no repairing.
+ assert(NewVRegs.begin() != NewVRegs.end() && "We should not have to repair");
+
+ // Assume we are repairing a use and thus, the original reg will be
+ // the source of the repairing.
+ unsigned Src = MO.getReg();
+ unsigned Dst = *NewVRegs.begin();
+
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(Src, Dst);
+
+ assert((RepairPt.getNumInsertPoints() == 1 ||
+ TargetRegisterInfo::isPhysicalRegister(Dst)) &&
+ "We are about to create several defs for Dst");
+
+ // Build the instruction used to repair, then clone it at the right places.
+ MachineInstr *MI = MIRBuilder.buildInstr(TargetOpcode::COPY, Dst, Src);
+ MI->removeFromParent();
+ DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst)
+ << '\n');
+ // TODO:
+ // Check if MI is legal. if not, we need to legalize all the
+ // instructions we are going to insert.
+ std::unique_ptr<MachineInstr *[]> NewInstrs(
+ new MachineInstr *[RepairPt.getNumInsertPoints()]);
+ bool IsFirst = true;
+ unsigned Idx = 0;
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ MachineInstr *CurMI;
+ if (IsFirst)
+ CurMI = MI;
+ else
+ CurMI = MIRBuilder.getMF().CloneMachineInstr(MI);
+ InsertPt->insert(*CurMI);
+ NewInstrs[Idx++] = CurMI;
+ IsFirst = false;
+ }
+ // TODO:
+ // Legalize NewInstrs if need be.
+}
+
+uint64_t RegBankSelect::getRepairCost(
+ const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ assert(MO.isReg() && "We should only repair register operand");
+ assert(!ValMapping.BreakDown.empty() && "Nothing to map??");
+
+ bool IsSameNumOfValues = ValMapping.BreakDown.size() == 1;
+ const RegisterBank *CurRegBank = RBI->getRegBank(MO.getReg(), *MRI, *TRI);
+ // If MO does not have a register bank, we should have just been
+ // able to set one unless we have to break the value down.
+ assert((!IsSameNumOfValues || CurRegBank) && "We should not have to repair");
+ // Def: Val <- NewDefs
+ // Same number of values: copy
+ // Different number: Val = build_sequence Defs1, Defs2, ...
+ // Use: NewSources <- Val.
+ // Same number of values: copy.
+ // Different number: Src1, Src2, ... =
+ // extract_value Val, Src1Begin, Src1Len, Src2Begin, Src2Len, ...
+ // We should remember that this value is available somewhere else to
+ // coalesce the value.
+
+ if (IsSameNumOfValues) {
+ const RegisterBank *DesiredRegBrank = ValMapping.BreakDown[0].RegBank;
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(CurRegBank, DesiredRegBrank);
+ // TODO: It may be possible to actually avoid the copy.
+ // If we repair something where the source is defined by a copy
+ // and the source of that copy is on the right bank, we can reuse
+ // it for free.
+ // E.g.,
+ // RegToRepair<BankA> = copy AlternativeSrc<BankB>
+ // = op RegToRepair<BankA>
+ // We can simply propagate AlternativeSrc instead of copying RegToRepair
+ // into a new virtual register.
+ // We would also need to propagate this information in the
+ // repairing placement.
+ unsigned Cost =
+ RBI->copyCost(*DesiredRegBrank, *CurRegBank,
+ RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI));
+ // TODO: use a dedicated constant for ImpossibleCost.
+ if (Cost != UINT_MAX)
+ return Cost;
+ assert(false && "Legalization not available yet");
+ // Return the legalization cost of that repairing.
+ }
+ assert(false && "Complex repairing not implemented yet");
+ return 1;
+}
+
+RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
+ MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings,
+ SmallVectorImpl<RepairingPlacement> &RepairPts) {
+
+ RegisterBankInfo::InstructionMapping *BestMapping = nullptr;
+ MappingCost Cost = MappingCost::ImpossibleCost();
+ SmallVector<RepairingPlacement, 4> LocalRepairPts;
+ for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) {
+ MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost);
+ if (CurCost < Cost) {
+ Cost = CurCost;
+ BestMapping = &CurMapping;
+ RepairPts.clear();
+ for (RepairingPlacement &RepairPt : LocalRepairPts)
+ RepairPts.emplace_back(std::move(RepairPt));
+ }
+ }
+ assert(BestMapping && "No suitable mapping for instruction");
+ return *BestMapping;
+}
+
+void RegBankSelect::tryAvoidingSplit(
+ RegBankSelect::RepairingPlacement &RepairPt, const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ const MachineInstr &MI = *MO.getParent();
+ assert(RepairPt.hasSplit() && "We should not have to adjust for split");
+ // Splitting should only occur for PHIs or between terminators,
+ // because we only do local repairing.
+ assert((MI.isPHI() || MI.isTerminator()) && "Why do we split?");
+
+ assert(&MI.getOperand(RepairPt.getOpIdx()) == &MO &&
+ "Repairing placement does not match operand");
+
+ // If we need splitting for phis, that means it is because we
+ // could not find an insertion point before the terminators of
+ // the predecessor block for this argument. In other words,
+ // the input value is defined by one of the terminators.
+ assert((!MI.isPHI() || !MO.isDef()) && "Need split for phi def?");
+
+ // We split to repair the use of a phi or a terminator.
+ if (!MO.isDef()) {
+ if (MI.isTerminator()) {
+ assert(&MI != &(*MI.getParent()->getFirstTerminator()) &&
+ "Need to split for the first terminator?!");
+ } else {
+ // For the PHI case, the split may not be actually required.
+ // In the copy case, a phi is already a copy on the incoming edge,
+ // therefore there is no need to split.
+ if (ValMapping.BreakDown.size() == 1)
+ // This is already a copy; there is nothing to do.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Reassign);
+ }
+ return;
+ }
+
+ // At this point, we need to repair a definition of a terminator.
+
+ // Technically we need to fix the def of MI on all outgoing
+ // edges of MI to keep the repairing local. In other words, we
+ // will create several definitions of the same register. This
+ // does not work for SSA unless that definition is a physical
+ // register.
+ // However, there are other cases where we can get away with
+ // that while still keeping the repairing local.
+ assert(MI.isTerminator() && MO.isDef() &&
+ "This code is for the def of a terminator");
+
+ // Since we use RPO traversal, if we need to repair a definition
+ // this means this definition could be:
+ // 1. Used by PHIs (i.e., this VReg has been visited as part of the
+ // uses of a phi.), or
+ // 2. Part of a target specific instruction (i.e., the target applied
+ // some register class constraints when creating the instruction.)
+ // If the constraints come from #2, the target said that another mapping
+ // is supported, so we may just drop them. Indeed, if we do not change
+ // the number of registers holding that value, the uses will get fixed
+ // when we get to them.
+ // Uses in PHIs may already have been processed, though.
+ // If the constraints come from #1, then those are weak constraints and
+ // no actual uses may rely on them. However, the problem remains mainly
+ // the same as for #2. If the value stays in one register, we could
+ // just switch the register bank of the definition, but we would need to
+ // account for a repairing cost for each phi we silently change.
+ //
+ // In any case, if the value needs to be broken down into several
+ // registers, the repairing is not local anymore as we need to patch
+ // every uses to rebuild the value in just one register.
+ //
+ // To summarize:
+ // - If the value is in a physical register, we can do the split and
+ // fix locally.
+ // Otherwise if the value is in a virtual register:
+ // - If the value remains in one register, we do not have to split;
+ // just switching the register bank would do, but we need to account
+ // in the repairing cost for all the phis we changed.
+ // - If the value spans several registers, then we cannot do a local
+ // repairing.
+
+ // Check if this is a physical or virtual register.
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // We are going to split every outgoing edges.
+ // Check that this is possible.
+ // FIXME: The machine representation is currently broken
+ // since it also allows several terminators in one basic block.
+ // Because of that we would technically need a way to get
+ // the targets of just one terminator to know which edges
+ // we have to split.
+ // Assert that we do not hit the ill-formed representation.
+
+ // If there are other terminators before that one, some of
+ // the outgoing edges may not be dominated by this definition.
+ assert(&MI == &(*MI.getParent()->getFirstTerminator()) &&
+ "Do not know which outgoing edges are relevant");
+ const MachineInstr *Next = MI.getNextNode();
+ assert((!Next || Next->isUnconditionalBranch()) &&
+ "Do not know where each terminator ends up");
+ if (Next)
+ // If the next terminator uses Reg, this means we have
+ // to split right after MI and thus we need a way to ask
+ // which outgoing edges are affected.
+ assert(!Next->readsRegister(Reg) && "Need to split between terminators");
+ // We will split all the edges and repair there.
+ } else {
+ // This is a virtual register defined by a terminator.
+ if (ValMapping.BreakDown.size() == 1) {
+ // There is nothing to repair, but we may actually be lying about
+ // the repairing cost because of the PHIs already processed,
+ // as stated above.
+ // The code will still be correct, though.
+ assert(0 && "Repairing cost may not be accurate");
+ } else {
+ // We need to do non-local repairing. Basically, patch all
+ // the uses (i.e., phis) that we already processed.
+ // For now, just say this mapping is not possible.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Impossible);
+ }
+ }
+}
+
+RegBankSelect::MappingCost RegBankSelect::computeMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RepairingPlacement> &RepairPts,
+ const RegBankSelect::MappingCost *BestCost) {
+ assert((MBFI || !BestCost) && "Costs comparison require MBFI");
+
+ // If mapped with InstrMapping, MI will have the recorded cost.
+ MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
+ bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
+ assert(!Saturated && "Possible mapping saturated the cost");
+ DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
+ DEBUG(dbgs() << "With: " << InstrMapping << '\n');
+ RepairPts.clear();
+ if (BestCost && Cost > *BestCost)
+ return Cost;
+
+ // Moreover, to realize this mapping, the register bank of each operand must
+ // match this mapping. In other words, we may need to locally reassign the
+ // register banks. Account for that repairing cost as well.
+ // In this context, local means in the surrounding of MI.
+ for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ DEBUG(dbgs() << "Opd" << OpIdx);
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ // If Reg is already properly mapped, this is free.
+ bool Assign;
+ if (assignmentMatch(Reg, ValMapping, Assign)) {
+ DEBUG(dbgs() << " is free (match).\n");
+ continue;
+ }
+ if (Assign) {
+ DEBUG(dbgs() << " is free (simple assignment).\n");
+ RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this,
+ RepairingPlacement::Reassign));
+ continue;
+ }
+
+ // Find the insertion point for the repairing code.
+ RepairPts.emplace_back(
+ RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Insert));
+ RepairingPlacement &RepairPt = RepairPts.back();
+
+ // If we need to split a basic block to materialize this insertion point,
+ // we may give a higher cost to this mapping.
+ // Nevertheless, we may get away with the split, so try that first.
+ if (RepairPt.hasSplit())
+ tryAvoidingSplit(RepairPt, MO, ValMapping);
+
+ // Check that the materialization of the repairing is possible.
+ if (!RepairPt.canMaterialize())
+ return MappingCost::ImpossibleCost();
+
+ // Account for the split cost and repair cost.
+ // Unless the cost is already saturated or we do not care about the cost.
+ if (!BestCost || Saturated)
+ continue;
+
+ // To get accurate information we need MBFI and MBPI.
+ // Thus, if we end up here this information should be here.
+ assert(MBFI && MBPI && "Cost computation requires MBFI and MBPI");
+
+ // FIXME: We will have to rework the repairing cost model.
+ // The repairing cost depends on the register bank that MO has.
+ // However, when we break down the value into different values,
+ // MO may not have a register bank while still needing repairing.
+ // For the fast mode, we don't compute the cost so that is fine,
+ // but still for the repairing code, we will have to make a choice.
+ // For the greedy mode, we should choose greedily what is the best
+ // choice based on the next use of MO.
+
+ // Sums up the repairing cost of MO at each insertion point.
+ uint64_t RepairCost = getRepairCost(MO, ValMapping);
+ // Bias used for splitting: 5%.
+ const uint64_t PercentageForBias = 5;
+ uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
+ // We should not need more than a couple of instructions to repair
+ // an assignment. In other words, the computation should not
+ // overflow because the repairing cost is free of basic block
+ // frequency.
+ assert(((RepairCost < RepairCost * PercentageForBias) &&
+ (RepairCost * PercentageForBias <
+ RepairCost * PercentageForBias + 99)) &&
+ "Repairing involves more than a billion of instructions?!");
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ assert(InsertPt->canMaterialize() && "We should not have made it here");
+ // We will apply some basic block frequency, and those computations use uint64_t.
+ if (!InsertPt->isSplit())
+ Saturated = Cost.addLocalCost(RepairCost);
+ else {
+ uint64_t CostForInsertPt = RepairCost;
+ // Again we shouldn't overflow here given that
+ // CostForInsertPt is frequency free at this point.
+ assert(CostForInsertPt + Bias > CostForInsertPt &&
+ "Repairing + split bias overflows");
+ CostForInsertPt += Bias;
+ uint64_t PtCost = InsertPt->frequency(*this) * CostForInsertPt;
+ // Check if we just overflowed.
+ if ((Saturated = PtCost < CostForInsertPt))
+ Cost.saturate();
+ else
+ Saturated = Cost.addNonLocalCost(PtCost);
+ }
+
+ // Stop looking into what it takes to repair, this is already
+ // too expensive.
+ if (BestCost && Cost > *BestCost)
+ return Cost;
+
+ // No need to accumulate more cost information.
+ // We need to still gather the repairing information though.
+ if (Saturated)
+ break;
+ }
+ }
+ return Cost;
+}
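// A standalone sketch of the 5% split bias computed above, assuming nothing
// beyond plain integers; PercentageForBias mirrors the constant used in
// computeMapping, and the "+ 99" makes the division round up so that even a
// tiny repairing cost receives a non-zero bias.
#include <cassert>
#include <cstdint>

uint64_t splitBias(uint64_t RepairCost) {
  const uint64_t PercentageForBias = 5;
  return (RepairCost * PercentageForBias + 99) / 100;
}

int main() {
  assert(splitBias(1) == 1);   // rounds up instead of dropping to 0
  assert(splitBias(40) == 2);  // ceil(5% of 40)
  assert(splitBias(100) == 5); // exactly 5%
  return 0;
}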
+
+void RegBankSelect::applyMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RegBankSelect::RepairingPlacement> &RepairPts) {
+ // OpdMapper will hold all the information needed for the rewriting.
+ RegisterBankInfo::OperandsMapper OpdMapper(MI, InstrMapping, *MRI);
+
+ // First, place the repairing code.
+ for (RepairingPlacement &RepairPt : RepairPts) {
+ assert(RepairPt.canMaterialize() &&
+ RepairPt.getKind() != RepairingPlacement::Impossible &&
+ "This mapping is impossible");
+ assert(RepairPt.getKind() != RepairingPlacement::None &&
+ "This should not make its way in the list");
+ unsigned OpIdx = RepairPt.getOpIdx();
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ unsigned BreakDownSize = ValMapping.BreakDown.size();
+ (void)BreakDownSize;
+ unsigned Reg = MO.getReg();
+
+ switch (RepairPt.getKind()) {
+ case RepairingPlacement::Reassign:
+ assert(BreakDownSize == 1 &&
+ "Reassignment should only be for simple mapping");
+ MRI->setRegBank(Reg, *ValMapping.BreakDown[0].RegBank);
+ break;
+ case RepairingPlacement::Insert:
+ OpdMapper.createVRegs(OpIdx);
+ repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx));
+ break;
+ default:
+ llvm_unreachable("Other kind should not happen");
+ }
+ }
+ // Second, rewrite the instruction.
+ DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
+ RBI->applyMapping(OpdMapper);
+}
+
+void RegBankSelect::assignInstr(MachineInstr &MI) {
+ DEBUG(dbgs() << "Assign: " << MI);
+ // Remember the repairing placement for all the operands.
+ SmallVector<RepairingPlacement, 4> RepairPts;
+
+ RegisterBankInfo::InstructionMapping BestMapping;
+ if (OptMode == RegBankSelect::Mode::Fast) {
+ BestMapping = RBI->getInstrMapping(MI);
+ MappingCost DefaultCost = computeMapping(MI, BestMapping, RepairPts);
+ (void)DefaultCost;
+ assert(DefaultCost != MappingCost::ImpossibleCost() &&
+ "Default mapping is not suited");
+ } else {
+ RegisterBankInfo::InstructionMappings PossibleMappings =
+ RBI->getInstrPossibleMappings(MI);
+ assert(!PossibleMappings.empty() &&
+ "Do not know how to map this instruction");
+ BestMapping = std::move(findBestMapping(MI, PossibleMappings, RepairPts));
+ }
+ // Make sure the mapping is valid for MI.
+ assert(BestMapping.verify(MI) && "Invalid instruction mapping");
+
+ DEBUG(dbgs() << "Mapping: " << BestMapping << '\n');
+
+ // After this call, MI may not be valid anymore.
+ // Do not use it.
+ applyMapping(MI, BestMapping, RepairPts);
+}
+
+bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
+ const Function *F = MF.getFunction();
+ Mode SaveOptMode = OptMode;
+ if (F->hasFnAttribute(Attribute::OptimizeNone))
+ OptMode = Mode::Fast;
+ init(MF);
+ // Walk the function and assign register banks to all operands.
+ // Use an RPOT to make sure all registers are assigned before we choose
+ // the best mapping of the current instruction.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT) {
+ // Set a sensible insertion point so that subsequent calls to
+ // MIRBuilder insert into this basic block.
+ MIRBuilder.setMBB(*MBB);
+ for (MachineBasicBlock::iterator MII = MBB->begin(), End = MBB->end();
+ MII != End;) {
+ // MI might be invalidated by the assignment, so move the
+ // iterator beforehand.
+ assignInstr(*MII++);
+ }
+ }
+ OptMode = SaveOptMode;
+ return false;
+}
+
+//------------------------------------------------------------------------------
+// Helper Classes Implementation
+//------------------------------------------------------------------------------
+RegBankSelect::RepairingPlacement::RepairingPlacement(
+ MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P,
+ RepairingPlacement::RepairingKind Kind)
+ // Default is, we are going to insert code to repair OpIdx.
+ : Kind(Kind),
+ OpIdx(OpIdx),
+ CanMaterialize(Kind != RepairingKind::Impossible),
+ HasSplit(false),
+ P(P) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isReg() && "Trying to repair a non-reg operand");
+
+ if (Kind != RepairingKind::Insert)
+ return;
+
+ // Repairings for definitions happen after MI, uses happen before.
+ bool Before = !MO.isDef();
+
+ // Check if we are done with MI.
+ if (!MI.isPHI() && !MI.isTerminator()) {
+ addInsertPoint(MI, Before);
+ // We are done with the initialization.
+ return;
+ }
+
+ // Now, look for the special cases.
+ if (MI.isPHI()) {
+ // - PHIs must be the first instructions:
+ // * Before, we have to split the related incoming edge.
+ // * After, move the insertion point past the last phi.
+ if (!Before) {
+ MachineBasicBlock::iterator It = MI.getParent()->getFirstNonPHI();
+ if (It != MI.getParent()->end())
+ addInsertPoint(*It, /*Before*/ true);
+ else
+ addInsertPoint(*(--It), /*Before*/ false);
+ return;
+ }
+ // We repair a use of a phi, we may need to split the related edge.
+ MachineBasicBlock &Pred = *MI.getOperand(OpIdx + 1).getMBB();
+ // Check if we can move the insertion point prior to the
+ // terminators of the predecessor.
+ unsigned Reg = MO.getReg();
+ MachineBasicBlock::iterator It = Pred.getLastNonDebugInstr();
+ for (auto Begin = Pred.begin(); It != Begin && It->isTerminator(); --It)
+ if (It->modifiesRegister(Reg, &TRI)) {
+ // We cannot hoist the repairing code in the predecessor.
+ // Split the edge.
+ addInsertPoint(Pred, *MI.getParent());
+ return;
+ }
+ // At this point, we can insert in Pred.
+
+ // - If It is invalid, Pred is empty and we can insert in Pred
+ // wherever we want.
+ // - If It is valid, It is the first non-terminator, insert after It.
+ if (It == Pred.end())
+ addInsertPoint(Pred, /*Beginning*/ false);
+ else
+ addInsertPoint(*It, /*Before*/ false);
+ } else {
+ // - Terminators must be the last instructions:
+ // * Before, move the insert point before the first terminator.
+ // * After, we have to split the outgoing edges.
+ unsigned Reg = MO.getReg();
+ if (Before) {
+ // Check whether Reg is defined by any terminator.
+ MachineBasicBlock::iterator It = MI;
+ for (auto Begin = MI.getParent()->begin();
+ --It != Begin && It->isTerminator();)
+ if (It->modifiesRegister(Reg, &TRI)) {
+ // Insert the repairing code right after the definition.
+ addInsertPoint(*It, /*Before*/ false);
+ return;
+ }
+ addInsertPoint(*It, /*Before*/ true);
+ return;
+ }
+ // Make sure Reg is not redefined by other terminators, otherwise
+ // we do not know how to split.
+ for (MachineBasicBlock::iterator It = MI, End = MI.getParent()->end();
+ ++It != End;)
+ // The machine verifier should reject this kind of code.
+ assert(!It->modifiesRegister(Reg, &TRI) && "Do not know where to split");
+ // Split each outgoing edge.
+ MachineBasicBlock &Src = *MI.getParent();
+ for (auto &Succ : Src.successors())
+ addInsertPoint(Src, Succ);
+ }
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineInstr &MI,
+ bool Before) {
+ addInsertPoint(*new InstrInsertPoint(MI, Before));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &MBB,
+ bool Beginning) {
+ addInsertPoint(*new MBBInsertPoint(MBB, Beginning));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &Src,
+ MachineBasicBlock &Dst) {
+ addInsertPoint(*new EdgeInsertPoint(Src, Dst, P));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(
+ RegBankSelect::InsertPoint &Point) {
+ CanMaterialize &= Point.canMaterialize();
+ HasSplit |= Point.isSplit();
+ InsertPoints.emplace_back(&Point);
+}
+
+RegBankSelect::InstrInsertPoint::InstrInsertPoint(MachineInstr &Instr,
+ bool Before)
+ : InsertPoint(), Instr(Instr), Before(Before) {
+ // Since we do not support splitting, we do not need to update
+ // liveness and such, so do not do anything with P.
+ assert((!Before || !Instr.isPHI()) &&
+ "Splitting before phis requires more points");
+ assert((!Before || !Instr.getNextNode() || !Instr.getNextNode()->isPHI()) &&
+ "Splitting between phis does not make sense");
+}
+
+void RegBankSelect::InstrInsertPoint::materialize() {
+ if (isSplit()) {
+ // Slice and return the beginning of the new block.
+ // If we need to split between the terminators, we theoretically
+ // need to know where the first and second set of terminators end
+ // to update the successors properly.
+ // Now, in practice, we should have a maximum of 2 branch
+ // instructions; one conditional and one unconditional. Therefore
+ // we know how to update the successor by looking at the target of
+ // the unconditional branch.
+ // If we end up splitting at some point, then, we should update
+ // the liveness information and such. I.e., we would need to
+ // access P here.
+ // The machine verifier should actually make sure such cases
+ // cannot happen.
+ llvm_unreachable("Not yet implemented");
+ }
+ // Otherwise the insertion point is just the current or next
+ // instruction depending on Before. I.e., there is nothing to do
+ // here.
+}
+
+bool RegBankSelect::InstrInsertPoint::isSplit() const {
+ // If the insertion point is after a terminator, we need to split.
+ if (!Before)
+ return Instr.isTerminator();
+ // If we insert before an instruction that is after a terminator,
+ // we are still after a terminator.
+ return Instr.getPrevNode() && Instr.getPrevNode()->isTerminator();
+}
+
+uint64_t RegBankSelect::InstrInsertPoint::frequency(const Pass &P) const {
+ // Even if we need to split, because we insert between terminators,
+ // this split has actually the same frequency as the instruction.
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(Instr.getParent()).getFrequency();
+}
+
+uint64_t RegBankSelect::MBBInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(&MBB).getFrequency();
+}
+
+void RegBankSelect::EdgeInsertPoint::materialize() {
+ // If we end up repairing twice at the same place before materializing the
+ // insertion point, we may think we have to split an edge twice.
+ // We should have a factory for the insert point such that identical points
+ // are the same instance.
+ assert(Src.isSuccessor(DstOrSplit) && DstOrSplit->isPredecessor(&Src) &&
+ "This point has already been split");
+ MachineBasicBlock *NewBB = Src.SplitCriticalEdge(DstOrSplit, P);
+ assert(NewBB && "Invalid call to materialize");
+ // We reuse the destination block to hold the information of the new block.
+ DstOrSplit = NewBB;
+}
+
+uint64_t RegBankSelect::EdgeInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ if (WasMaterialized)
+ return MBFI->getBlockFreq(DstOrSplit).getFrequency();
+
+ const MachineBranchProbabilityInfo *MBPI =
+ P.getAnalysisIfAvailable<MachineBranchProbabilityInfo>();
+ if (!MBPI)
+ return 1;
+ // The basic block will be on the edge.
+ return (MBFI->getBlockFreq(&Src) * MBPI->getEdgeProbability(&Src, DstOrSplit))
+ .getFrequency();
+}
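// A rough standalone model of the edge-frequency computation above: the
// frequency of the (possibly split) edge is the source block frequency
// scaled by the probability of taking that edge. Probabilities are modeled
// here as a numerator/denominator pair, which only approximates LLVM's
// fixed-point BranchProbability; all names are illustrative.
#include <cassert>
#include <cstdint>

uint64_t edgeFrequency(uint64_t SrcBlockFreq, uint64_t ProbNum,
                       uint64_t ProbDenom) {
  return SrcBlockFreq * ProbNum / ProbDenom;
}

int main() {
  // A block executed ~1000 times with a 25% branch toward the destination
  // yields an edge frequency of ~250.
  assert(edgeFrequency(1000, 1, 4) == 250);
  return 0;
}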
+
+bool RegBankSelect::EdgeInsertPoint::canMaterialize() const {
+ // If this is not a critical edge, we should not have used this insert
+ // point. Indeed, either the successor or the predecessor would
+ // have done.
+ assert(Src.succ_size() > 1 && DstOrSplit->pred_size() > 1 &&
+ "Edge is not critical");
+ return Src.canSplitCriticalEdge(DstOrSplit);
+}
+
+RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq)
+ : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {}
+
+bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (LocalCost + Cost < LocalCost) {
+ saturate();
+ return true;
+ }
+ LocalCost += Cost;
+ return isSaturated();
+}
+
+bool RegBankSelect::MappingCost::addNonLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (NonLocalCost + Cost < NonLocalCost) {
+ saturate();
+ return true;
+ }
+ NonLocalCost += Cost;
+ return isSaturated();
+}
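// A minimal standalone sketch of the overflow test used by addLocalCost and
// addNonLocalCost above: for unsigned integers, an overflowing a + b wraps
// around and therefore compares smaller than a. The saturation value below
// is a stand-in for the MappingCost saturation logic, not a copy of it.
#include <cassert>
#include <cstdint>

bool addSaturating(uint64_t &Acc, uint64_t Cost) {
  if (Acc + Cost < Acc) { // wrapped around, i.e. overflowed
    Acc = UINT64_MAX;
    return true; // saturated
  }
  Acc += Cost;
  return false;
}

int main() {
  uint64_t A = UINT64_MAX - 1;
  assert(!addSaturating(A, 1) && A == UINT64_MAX); // fits exactly
  assert(addSaturating(A, 1) && A == UINT64_MAX);  // would wrap, so saturate
  return 0;
}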
+
+bool RegBankSelect::MappingCost::isSaturated() const {
+ return LocalCost == UINT64_MAX - 1 && NonLocalCost == UINT64_MAX &&
+ LocalFreq == UINT64_MAX;
+}
+
+void RegBankSelect::MappingCost::saturate() {
+ *this = ImpossibleCost();
+ --LocalCost;
+}
+
+RegBankSelect::MappingCost RegBankSelect::MappingCost::ImpossibleCost() {
+ return MappingCost(UINT64_MAX, UINT64_MAX, UINT64_MAX);
+}
+
+bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const {
+ // Sort out the easy cases.
+ if (*this == Cost)
+ return false;
+ // If one is impossible to realize the other is cheaper unless it is
+ // impossible as well.
+ if ((*this == ImpossibleCost()) || (Cost == ImpossibleCost()))
+ return (*this == ImpossibleCost()) < (Cost == ImpossibleCost());
+ // If one is saturated the other is cheaper, unless it is saturated
+ // as well.
+ if (isSaturated() || Cost.isSaturated())
+ return isSaturated() < Cost.isSaturated();
+ // At this point we know both costs hold sensible values.
+
+ // If the two values have different base frequencies, there is not much
+ // we can do but scale everything.
+ // However, if they have the same base frequency we can avoid making
+ // complicated computation.
+ uint64_t ThisLocalAdjust;
+ uint64_t OtherLocalAdjust;
+ if (LLVM_LIKELY(LocalFreq == Cost.LocalFreq)) {
+
+ // At this point, we know the local costs are comparable.
+ // Handle the case that does not involve potential overflow first.
+ if (NonLocalCost == Cost.NonLocalCost)
+ // Since the non-local costs do not discriminate on the result,
+ // just compare the local costs.
+ return LocalCost < Cost.LocalCost;
+
+ // The base costs are comparable, so we need only keep the relative
+ // value to increase our chances of avoiding overflows.
+ ThisLocalAdjust = 0;
+ OtherLocalAdjust = 0;
+ if (LocalCost < Cost.LocalCost)
+ OtherLocalAdjust = Cost.LocalCost - LocalCost;
+ else
+ ThisLocalAdjust = LocalCost - Cost.LocalCost;
+
+ } else {
+ ThisLocalAdjust = LocalCost;
+ OtherLocalAdjust = Cost.LocalCost;
+ }
+
+ // The non-local costs are comparable, just keep the relative value.
+ uint64_t ThisNonLocalAdjust = 0;
+ uint64_t OtherNonLocalAdjust = 0;
+ if (NonLocalCost < Cost.NonLocalCost)
+ OtherNonLocalAdjust = Cost.NonLocalCost - NonLocalCost;
+ else
+ ThisNonLocalAdjust = NonLocalCost - Cost.NonLocalCost;
+ // Scale everything to make them comparable.
+ uint64_t ThisScaledCost = ThisLocalAdjust * LocalFreq;
+ // Check for overflow on that operation.
+ bool ThisOverflows = ThisLocalAdjust && (ThisScaledCost < ThisLocalAdjust ||
+ ThisScaledCost < LocalFreq);
+ uint64_t OtherScaledCost = OtherLocalAdjust * Cost.LocalFreq;
+ // Check for overflow on the last operation.
+ bool OtherOverflows =
+ OtherLocalAdjust &&
+ (OtherScaledCost < OtherLocalAdjust || OtherScaledCost < Cost.LocalFreq);
+ // Add the non-local costs.
+ ThisOverflows |= ThisNonLocalAdjust &&
+ ThisScaledCost + ThisNonLocalAdjust < ThisNonLocalAdjust;
+ ThisScaledCost += ThisNonLocalAdjust;
+ OtherOverflows |= OtherNonLocalAdjust &&
+ OtherScaledCost + OtherNonLocalAdjust < OtherNonLocalAdjust;
+ OtherScaledCost += OtherNonLocalAdjust;
+ // If both overflows, we cannot compare without additional
+ // precision, e.g., APInt. Just give up on that case.
+ if (ThisOverflows && OtherOverflows)
+ return false;
+ // If one overflows but not the other, we can still compare.
+ if (ThisOverflows || OtherOverflows)
+ return ThisOverflows < OtherOverflows;
+ // Otherwise, just compare the values.
+ return ThisScaledCost < OtherScaledCost;
+}
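// A standalone sketch of the relative-value trick used by operator< above:
// when both costs share the same local frequency, only the difference of
// their local parts needs to be scaled, which keeps the multiplication far
// from overflow. This ignores the saturation/overflow fallbacks, and all
// names are illustrative.
#include <cassert>
#include <cstdint>

bool cheaperThan(uint64_t LocalA, uint64_t LocalB, uint64_t NonLocalA,
                 uint64_t NonLocalB, uint64_t LocalFreq) {
  // Keep only the relative local and non-local costs.
  uint64_t AdjLocalA = LocalA > LocalB ? LocalA - LocalB : 0;
  uint64_t AdjLocalB = LocalB > LocalA ? LocalB - LocalA : 0;
  uint64_t AdjNonLocalA = NonLocalA > NonLocalB ? NonLocalA - NonLocalB : 0;
  uint64_t AdjNonLocalB = NonLocalB > NonLocalA ? NonLocalB - NonLocalA : 0;
  // Scale the local parts by the shared frequency, then compare.
  return AdjLocalA * LocalFreq + AdjNonLocalA <
         AdjLocalB * LocalFreq + AdjNonLocalB;
}

int main() {
  // Local costs 10 vs 12 at frequency 1000 with equal non-local costs:
  // the first mapping is cheaper.
  assert(cheaperThan(10, 12, 5, 5, 1000));
  assert(!cheaperThan(12, 10, 5, 5, 1000));
  return 0;
}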
+
+bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {
+ return LocalCost == Cost.LocalCost && NonLocalCost == Cost.NonLocalCost &&
+ LocalFreq == Cost.LocalFreq;
+}
diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp
new file mode 100644
index 000000000000..a911225b5af5
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -0,0 +1,107 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBank.cpp - Register Bank --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBank class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define DEBUG_TYPE "registerbank"
+
+using namespace llvm;
+
+const unsigned RegisterBank::InvalidID = UINT_MAX;
+
+RegisterBank::RegisterBank() : ID(InvalidID), Name(nullptr), Size(0) {}
+
+bool RegisterBank::verify(const TargetRegisterInfo &TRI) const {
+ assert(isValid() && "Invalid register bank");
+ assert(ContainedRegClasses.size() == TRI.getNumRegClasses() &&
+ "TRI does not match the initialization process?");
+ for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI.getRegClass(RCId);
+
+ if (!covers(RC))
+ continue;
+ // Verify that the register bank covers all the sub classes of the
+ // classes it covers.
+
+ // Use a different (slower in this case) method than
+ // RegisterBankInfo to find the subclasses of RC, to make sure
+ // both agree on the coverage.
+ for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) {
+ const TargetRegisterClass &SubRC = *TRI.getRegClass(SubRCId);
+
+ if (!RC.hasSubClassEq(&SubRC))
+ continue;
+
+ // Verify that the Size of the register bank is big enough to cover
+ // all the register classes it covers.
+ assert((getSize() >= SubRC.getSize() * 8) &&
+ "Size is not big enough for all the subclasses!");
+ assert(covers(SubRC) && "Not all subclasses are covered");
+ }
+ }
+ return true;
+}
+
+bool RegisterBank::covers(const TargetRegisterClass &RC) const {
+ assert(isValid() && "RB hasn't been initialized yet");
+ return ContainedRegClasses.test(RC.getID());
+}
+
+bool RegisterBank::isValid() const {
+ return ID != InvalidID && Name != nullptr && Size != 0 &&
+ // A register bank that does not cover anything is useless.
+ !ContainedRegClasses.empty();
+}
+
+bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
+ // There must be only one instance of a given register bank alive
+ // for the whole compilation.
+ // The RegisterBankInfo is supposed to enforce that.
+ assert((OtherRB.getID() != getID() || &OtherRB == this) &&
+ "ID does not uniquely identify a RegisterBank");
+ return &OtherRB == this;
+}
+
+void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
+ print(dbgs(), /* IsForDebug */ true, TRI);
+}
+
+void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
+ const TargetRegisterInfo *TRI) const {
+ OS << getName();
+ if (!IsForDebug)
+ return;
+ OS << "(ID:" << getID() << ", Size:" << getSize() << ")\n"
+ << "isValid:" << isValid() << '\n'
+ << "Number of Covered register classes: " << ContainedRegClasses.count()
+ << '\n';
+ // Print all the subclasses if we can.
+ // The register classes may not be properly initialized yet.
+ if (!TRI || ContainedRegClasses.empty())
+ return;
+ assert(ContainedRegClasses.size() == TRI->getNumRegClasses() &&
+ "TRI does not match the initialization process?");
+ bool IsFirst = true;
+ OS << "Covered register classes:\n";
+ for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI->getRegClass(RCId);
+
+ if (!covers(RC))
+ continue;
+
+ if (!IsFirst)
+ OS << ", ";
+ OS << TRI->getRegClassName(&RC);
+ IsFirst = false;
+ }
+}
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
new file mode 100644
index 000000000000..ef8e4f6d6851
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -0,0 +1,663 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.cpp --------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBankInfo class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#include <algorithm> // For std::max.
+
+#define DEBUG_TYPE "registerbankinfo"
+
+using namespace llvm;
+
+const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX;
+const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1;
+
+//------------------------------------------------------------------------------
+// RegisterBankInfo implementation.
+//------------------------------------------------------------------------------
+RegisterBankInfo::RegisterBankInfo(unsigned NumRegBanks)
+ : NumRegBanks(NumRegBanks) {
+ RegBanks.reset(new RegisterBank[NumRegBanks]);
+}
+
+bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
+ DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
+ const RegisterBank &RegBank = getRegBank(Idx);
+ assert(Idx == RegBank.getID() &&
+ "ID does not match the index in the array");
+ dbgs() << "Verify " << RegBank << '\n';
+ assert(RegBank.verify(TRI) && "RegBank is invalid");
+ });
+ return true;
+}
+
+void RegisterBankInfo::createRegisterBank(unsigned ID, const char *Name) {
+ DEBUG(dbgs() << "Create register bank: " << ID << " with name \"" << Name
+ << "\"\n");
+ RegisterBank &RegBank = getRegBank(ID);
+ assert(RegBank.getID() == RegisterBank::InvalidID &&
+ "A register bank should be created only once");
+ RegBank.ID = ID;
+ RegBank.Name = Name;
+}
+
+void RegisterBankInfo::addRegBankCoverage(unsigned ID, unsigned RCId,
+ const TargetRegisterInfo &TRI,
+ bool AddTypeMapping) {
+ RegisterBank &RB = getRegBank(ID);
+ unsigned NbOfRegClasses = TRI.getNumRegClasses();
+
+ DEBUG(dbgs() << "Add coverage for: " << RB << '\n');
+
+ // Check if RB is under construction.
+ if (!RB.isValid())
+ RB.ContainedRegClasses.resize(NbOfRegClasses);
+ else if (RB.covers(*TRI.getRegClass(RCId)))
+ // If RB already covers this register class, there is nothing
+ // to do.
+ return;
+
+ BitVector &Covered = RB.ContainedRegClasses;
+ SmallVector<unsigned, 8> WorkList;
+
+ WorkList.push_back(RCId);
+ Covered.set(RCId);
+
+ unsigned &MaxSize = RB.Size;
+ do {
+ unsigned RCId = WorkList.pop_back_val();
+
+ const TargetRegisterClass &CurRC = *TRI.getRegClass(RCId);
+
+ DEBUG(dbgs() << "Examine: " << TRI.getRegClassName(&CurRC)
+ << "(Size*8: " << (CurRC.getSize() * 8) << ")\n");
+
+ // Remember the biggest size in bits.
+ MaxSize = std::max(MaxSize, CurRC.getSize() * 8);
+
+ // If we have been asked to record the type supported by this
+ // register bank, do it now.
+ if (AddTypeMapping)
+ for (MVT::SimpleValueType SVT :
+ make_range(CurRC.vt_begin(), CurRC.vt_end()))
+ recordRegBankForType(getRegBank(ID), SVT);
+
+ // Walk through all sub register classes and push them into the worklist.
+ bool First = true;
+ for (BitMaskClassIterator It(CurRC.getSubClassMask(), TRI); It.isValid();
+ ++It) {
+ unsigned SubRCId = It.getID();
+ if (!Covered.test(SubRCId)) {
+ if (First)
+ DEBUG(dbgs() << " Enqueue sub-class: ");
+ DEBUG(dbgs() << TRI.getRegClassName(TRI.getRegClass(SubRCId)) << ", ");
+ WorkList.push_back(SubRCId);
+ // Remember that we saw the sub class.
+ Covered.set(SubRCId);
+ First = false;
+ }
+ }
+ if (!First)
+ DEBUG(dbgs() << '\n');
+
+ // Also push all the register classes that can be accessed via a
+ // subreg index, i.e., their subreg-classes (which are different
+ // from their subclasses).
+ //
+ // Note: It would probably be faster to go the other way around
+ // and have this method add only super classes, since this
+ // information is available in a more efficient way. However, that
+ // feels less natural for the clients of this API, and we will
+ // TableGen the whole bitset at some point, so compile time for
+ // the initialization is not very important.
+ First = true;
+ for (unsigned SubRCId = 0; SubRCId < NbOfRegClasses; ++SubRCId) {
+ if (Covered.test(SubRCId))
+ continue;
+ bool Pushed = false;
+ const TargetRegisterClass *SubRC = TRI.getRegClass(SubRCId);
+ for (SuperRegClassIterator SuperRCIt(SubRC, &TRI); SuperRCIt.isValid();
+ ++SuperRCIt) {
+ if (Pushed)
+ break;
+ for (BitMaskClassIterator It(SuperRCIt.getMask(), TRI); It.isValid();
+ ++It) {
+ unsigned SuperRCId = It.getID();
+ if (SuperRCId == RCId) {
+ if (First)
+ DEBUG(dbgs() << " Enqueue subreg-class: ");
+ DEBUG(dbgs() << TRI.getRegClassName(SubRC) << ", ");
+ WorkList.push_back(SubRCId);
+ // Remember that we saw the sub class.
+ Covered.set(SubRCId);
+ Pushed = true;
+ First = false;
+ break;
+ }
+ }
+ }
+ }
+ if (!First)
+ DEBUG(dbgs() << '\n');
+ } while (!WorkList.empty());
+}
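The worklist walk in addRegBankCoverage above boils down to a transitive closure over a class graph, with the covered bit vector doing double duty as the visited set and the final result. A self-contained sketch on an invented subclass graph (not TableGen data):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // SubClasses[C] lists the classes reachable from C (sub- or subreg-classes).
  std::vector<std::vector<unsigned>> SubClasses = {{1, 2}, {3}, {3}, {}};
  std::vector<unsigned> SizeInBits = {64, 32, 32, 16};
  std::vector<bool> Covered(SubClasses.size(), false);
  std::vector<unsigned> WorkList;
  unsigned MaxSizeInBits = 0;

  WorkList.push_back(0); // Seed with the root class.
  Covered[0] = true;
  while (!WorkList.empty()) {
    unsigned C = WorkList.back();
    WorkList.pop_back();
    MaxSizeInBits = std::max(MaxSizeInBits, SizeInBits[C]);
    for (unsigned Sub : SubClasses[C])
      if (!Covered[Sub]) { // Enqueue each class exactly once.
        Covered[Sub] = true;
        WorkList.push_back(Sub);
      }
  }
  std::printf("covered all %zu classes, bank size %u bits\n",
              SubClasses.size(), MaxSizeInBits);
  return 0;
}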
+
+const RegisterBank *
+RegisterBankInfo::getRegBank(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return &getRegBankFromRegClass(*TRI.getMinimalPhysRegClass(Reg));
+
+ assert(Reg && "NoRegister does not have a register bank");
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ if (RegClassOrBank.is<const RegisterBank *>())
+ return RegClassOrBank.get<const RegisterBank *>();
+ const TargetRegisterClass *RC =
+ RegClassOrBank.get<const TargetRegisterClass *>();
+ if (RC)
+ return &getRegBankFromRegClass(*RC);
+ return nullptr;
+}
+
+const RegisterBank *RegisterBankInfo::getRegBankFromConstraints(
+ const MachineInstr &MI, unsigned OpIdx, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI) const {
+ // The mapping of the registers may be available via the
+ // register class constraints.
+ const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx, &TII, &TRI);
+
+ if (!RC)
+ return nullptr;
+
+ const RegisterBank &RegBank = getRegBankFromRegClass(*RC);
+ // Sanity check that the target properly implemented getRegBankFromRegClass.
+ assert(RegBank.covers(*RC) &&
+ "The mapping of the register bank does not make sense");
+ return &RegBank;
+}
+
+RegisterBankInfo::InstructionMapping
+RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
+ RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
+ MI.getNumOperands());
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // We may need to query the instruction encoding to guess the mapping.
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Before doing anything complicated, check whether the mapping is
+ // directly available.
+ bool CompleteMapping = true;
+ // For copies we want to walk over the operands and try to find one
+ // that has a register bank.
+ bool isCopyLike = MI.isCopy() || MI.isPHI();
+ // Remember the register bank for reuse for copy-like instructions.
+ const RegisterBank *RegBank = nullptr;
+ // Remember the size of the register for reuse for copy-like instructions.
+ unsigned RegSize = 0;
+ for (unsigned OpIdx = 0, End = MI.getNumOperands(); OpIdx != End; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ // The register bank of Reg is just a side effect of the current
+ // execution and in particular, there is no reason to believe this
+ // is the best default mapping for the current instruction. Keep
+ // it as an alternative register bank if we cannot figure out
+ // something.
+ const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI);
+ // For copy-like instructions, we want to reuse the register bank
+ // that is already set on Reg, if any, since those instructions do
+ // not have any constraints.
+ const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+ if (!CurRegBank) {
+ // If this is a target specific instruction, we can deduce
+ // the register bank from the encoding constraints.
+ CurRegBank = getRegBankFromConstraints(MI, OpIdx, TII, TRI);
+ if (!CurRegBank) {
+ // Check if we can deduce the register bank from the type of
+ // the instruction.
+ Type *MITy = MI.getType();
+ if (MITy)
+ CurRegBank = getRegBankForType(
+ MVT::getVT(MITy, /*HandleUnknown*/ true).SimpleTy);
+ if (!CurRegBank)
+ // Use the current assigned register bank.
+ // That may not make much sense though.
+ CurRegBank = AltRegBank;
+ if (!CurRegBank) {
+ // All our attempts failed, give up.
+ CompleteMapping = false;
+
+ if (!isCopyLike)
+ // MI does not carry enough information to guess the mapping.
+ return InstructionMapping();
+
+ // For copies, we want to keep iterating to find a register
+ // bank for the other operands if we did not find one yet.
+ if (RegBank)
+ break;
+ continue;
+ }
+ }
+ }
+ RegBank = CurRegBank;
+ RegSize = getSizeInBits(Reg, MRI, TRI);
+ Mapping.setOperandMapping(OpIdx, RegSize, *CurRegBank);
+ }
+
+ if (CompleteMapping)
+ return Mapping;
+
+ assert(isCopyLike && "We should have bailed on non-copies at this point");
+ // For copy-like instructions, if none of the operands has a register
+ // bank available, there is nothing we can propagate.
+ if (!RegBank)
+ return InstructionMapping();
+
+ // This is a copy-like instruction.
+ // Propagate RegBank to all operands that do not have a
+ // mapping yet.
+ for (unsigned OpIdx = 0, End = MI.getNumOperands(); OpIdx != End; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ // Don't assign a mapping for non-reg operands.
+ if (!MO.isReg())
+ continue;
+
+ // If a mapping already exists, do not touch it.
+ if (!static_cast<const InstructionMapping *>(&Mapping)
+ ->getOperandMapping(OpIdx)
+ .BreakDown.empty())
+ continue;
+
+ Mapping.setOperandMapping(OpIdx, RegSize, *RegBank);
+ }
+ return Mapping;
+}
+
+RegisterBankInfo::InstructionMapping
+RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ llvm_unreachable("The target must implement this");
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const {
+ InstructionMappings PossibleMappings;
+ // Put the default mapping first.
+ PossibleMappings.push_back(getInstrMapping(MI));
+ // Then the alternative mapping, if any.
+ InstructionMappings AltMappings = getInstrAlternativeMappings(MI);
+ for (InstructionMapping &AltMapping : AltMappings)
+ PossibleMappings.emplace_back(std::move(AltMapping));
+#ifndef NDEBUG
+ for (const InstructionMapping &Mapping : PossibleMappings)
+ assert(Mapping.verify(MI) && "Mapping is invalid");
+#endif
+ return PossibleMappings;
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+ // No alternative for MI.
+ return InstructionMappings();
+}
+
+void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
+ MachineInstr &MI = OpdMapper.getMI();
+ DEBUG(dbgs() << "Applying default-like mapping\n");
+ for (unsigned OpIdx = 0, EndIdx = MI.getNumOperands(); OpIdx != EndIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "OpIdx " << OpIdx);
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ DEBUG(dbgs() << " is not a register, nothing to be done\n");
+ continue;
+ }
+ assert(
+ OpdMapper.getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() ==
+ 1 &&
+ "This mapping is too complex for this function");
+ iterator_range<SmallVectorImpl<unsigned>::const_iterator> NewRegs =
+ OpdMapper.getVRegs(OpIdx);
+ if (NewRegs.begin() == NewRegs.end()) {
+ DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
+ continue;
+ }
+ DEBUG(dbgs() << " changed, replace " << MO.getReg());
+ MO.setReg(*NewRegs.begin());
+ DEBUG(dbgs() << " with " << MO.getReg());
+ }
+}
+
+unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ const TargetRegisterClass *RC = nullptr;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // The size is not directly available for physical registers.
+ // Instead, we need to access a register class that contains Reg and
+ // get the size of that register class.
+ RC = TRI.getMinimalPhysRegClass(Reg);
+ } else {
+ unsigned RegSize = MRI.getSize(Reg);
+ // If Reg is not a generic register, query the register class to
+ // get its size.
+ if (RegSize)
+ return RegSize;
+ // Since Reg is not a generic register, it must have a register class.
+ RC = MRI.getRegClass(Reg);
+ }
+ assert(RC && "Unable to deduce the register class");
+ return RC->getSize() * 8;
+}
+
+//------------------------------------------------------------------------------
+// Helper classes implementation.
+//------------------------------------------------------------------------------
+void RegisterBankInfo::PartialMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+bool RegisterBankInfo::PartialMapping::verify() const {
+ assert(RegBank && "Register bank not set");
+ assert(Length && "Empty mapping");
+ assert((StartIdx < getHighBitIdx()) && "Overflow, switch to APInt?");
+ // Check if the minimum width fits into RegBank.
+ assert(RegBank->getSize() >= Length && "Register bank too small for Mask");
+ return true;
+}
+
+void RegisterBankInfo::PartialMapping::print(raw_ostream &OS) const {
+ OS << "[" << StartIdx << ", " << getHighBitIdx() << "], RegBank = ";
+ if (RegBank)
+ OS << *RegBank;
+ else
+ OS << "nullptr";
+}
+
+bool RegisterBankInfo::ValueMapping::verify(unsigned ExpectedBitWidth) const {
+ assert(!BreakDown.empty() && "Value mapped nowhere?!");
+ unsigned OrigValueBitWidth = 0;
+ for (const RegisterBankInfo::PartialMapping &PartMap : BreakDown) {
+ // Check that each register bank is big enough to hold the partial value:
+ // this check is done by PartialMapping::verify
+ assert(PartMap.verify() && "Partial mapping is invalid");
+ // The original value should be completely mapped.
+ // Thus the maximum accessed index + 1 is the size of the original value.
+ OrigValueBitWidth =
+ std::max(OrigValueBitWidth, PartMap.getHighBitIdx() + 1);
+ }
+ assert(OrigValueBitWidth == ExpectedBitWidth && "BitWidth does not match");
+ APInt ValueMask(OrigValueBitWidth, 0);
+ for (const RegisterBankInfo::PartialMapping &PartMap : BreakDown) {
+ // Check that the union of the partial mappings covers the whole value,
+ // without overlaps.
+ // The high bit is exclusive in the APInt API, thus getHighBitIdx + 1.
+ APInt PartMapMask = APInt::getBitsSet(OrigValueBitWidth, PartMap.StartIdx,
+ PartMap.getHighBitIdx() + 1);
+ ValueMask ^= PartMapMask;
+ assert((ValueMask & PartMapMask) == PartMapMask &&
+ "Some partial mappings overlap");
+ }
+ assert(ValueMask.isAllOnesValue() && "Value is not fully mapped");
+ return true;
+}
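The XOR/AND trick in the loop above is worth spelling out: XOR accumulates each partial mask into ValueMask, and if a partial mapping overlapped an earlier one, the overlapping bits would already be set, the XOR would clear them, and the AND test would fail. A standalone sketch of the same check on a plain uint64_t (assuming a 64-bit value, no APInt):

#include <cassert>
#include <cstdint>

// Returns a mask with bits [Lo, Hi) set, matching APInt::getBitsSet's
// exclusive upper bound.
static uint64_t bitsSet(unsigned Lo, unsigned Hi) {
  return ((Hi - Lo == 64) ? ~0ULL : ((1ULL << (Hi - Lo)) - 1)) << Lo;
}

int main() {
  struct Part { unsigned StartIdx, Length; };
  Part BreakDown[] = {{0, 32}, {32, 32}}; // Two halves of a 64-bit value.

  uint64_t ValueMask = 0;
  for (const Part &P : BreakDown) {
    uint64_t PartMask = bitsSet(P.StartIdx, P.StartIdx + P.Length);
    ValueMask ^= PartMask; // Accumulate this partial mapping.
    assert((ValueMask & PartMask) == PartMask && "partial mappings overlap");
  }
  assert(ValueMask == ~0ULL && "value is not fully mapped");
  return 0;
}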
+
+void RegisterBankInfo::ValueMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {
+ OS << "#BreakDown: " << BreakDown.size() << " ";
+ bool IsFirst = true;
+ for (const PartialMapping &PartMap : BreakDown) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '[' << PartMap << ']';
+ IsFirst = false;
+ }
+}
+
+void RegisterBankInfo::InstructionMapping::setOperandMapping(
+ unsigned OpIdx, unsigned MaskSize, const RegisterBank &RegBank) {
+ // Build the value mapping.
+ assert(MaskSize <= RegBank.getSize() && "Register bank is too small");
+
+ // Create the mapping object.
+ getOperandMapping(OpIdx).BreakDown.push_back(
+ PartialMapping(0, MaskSize, RegBank));
+}
+
+bool RegisterBankInfo::InstructionMapping::verify(
+ const MachineInstr &MI) const {
+ // Check that all the register operands are properly mapped.
+ // Check the constructor invariant.
+ assert(NumOperands == MI.getNumOperands() &&
+ "NumOperands must match, see constructor");
+ assert(MI.getParent() && MI.getParent()->getParent() &&
+ "MI must be connected to a MachineFunction");
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ (void)MF;
+
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ const MachineOperand &MO = MI.getOperand(Idx);
+ const RegisterBankInfo::ValueMapping &MOMapping = getOperandMapping(Idx);
+ (void)MOMapping;
+ if (!MO.isReg()) {
+ assert(MOMapping.BreakDown.empty() &&
+ "We should not care about non-reg mapping");
+ continue;
+ }
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ // Register size in bits.
+ // This size must match what the mapping expects.
+ assert(MOMapping.verify(getSizeInBits(
+ Reg, MF.getRegInfo(), *MF.getSubtarget().getRegisterInfo())) &&
+ "Value mapping is invalid");
+ }
+ return true;
+}
+
+void RegisterBankInfo::InstructionMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {
+ OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: ";
+
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ const ValueMapping &ValMapping = getOperandMapping(OpIdx);
+ if (OpIdx)
+ OS << ", ";
+ OS << "{ Idx: " << OpIdx << " Map: " << ValMapping << '}';
+ }
+}
+
+const int RegisterBankInfo::OperandsMapper::DontKnowIdx = -1;
+
+RegisterBankInfo::OperandsMapper::OperandsMapper(
+ MachineInstr &MI, const InstructionMapping &InstrMapping,
+ MachineRegisterInfo &MRI)
+ : MRI(MRI), MI(MI), InstrMapping(InstrMapping) {
+ unsigned NumOpds = MI.getNumOperands();
+ OpToNewVRegIdx.reset(new int[NumOpds]);
+ std::fill(&OpToNewVRegIdx[0], &OpToNewVRegIdx[NumOpds],
+ OperandsMapper::DontKnowIdx);
+ assert(InstrMapping.verify(MI) && "Invalid mapping for MI");
+}
+
+iterator_range<SmallVectorImpl<unsigned>::iterator>
+RegisterBankInfo::OperandsMapper::getVRegsMem(unsigned OpIdx) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ unsigned NumPartialVal =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown.size();
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx) {
+ // This is the first time we try to access OpIdx.
+ // Create the cells that will hold all the partial values at the
+ // end of the list of NewVRegs.
+ StartIdx = NewVRegs.size();
+ OpToNewVRegIdx[OpIdx] = StartIdx;
+ for (unsigned i = 0; i < NumPartialVal; ++i)
+ NewVRegs.push_back(0);
+ }
+ SmallVectorImpl<unsigned>::iterator End =
+ getNewVRegsEnd(StartIdx, NumPartialVal);
+
+ return make_range(&NewVRegs[StartIdx], End);
+}
+
+SmallVectorImpl<unsigned>::const_iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) const {
+ return const_cast<OperandsMapper *>(this)->getNewVRegsEnd(StartIdx, NumVal);
+}
+SmallVectorImpl<unsigned>::iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) {
+ assert(NewVRegs.size() >= StartIdx + NumVal &&
+ "NewVRegs too small to contain all the partial mappings");
+ return NewVRegs.size() <= StartIdx + NumVal ? NewVRegs.end()
+ : &NewVRegs[StartIdx + NumVal];
+}
+
+void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ iterator_range<SmallVectorImpl<unsigned>::iterator> NewVRegsForOpIdx =
+ getVRegsMem(OpIdx);
+ const SmallVectorImpl<PartialMapping> &PartMapList =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown;
+ SmallVectorImpl<PartialMapping>::const_iterator PartMap = PartMapList.begin();
+ for (unsigned &NewVReg : NewVRegsForOpIdx) {
+ assert(PartMap != PartMapList.end() && "Out-of-bound access");
+ assert(NewVReg == 0 && "Register has already been created");
+ NewVReg = MRI.createGenericVirtualRegister(PartMap->Length);
+ MRI.setRegBank(NewVReg, *PartMap->RegBank);
+ ++PartMap;
+ }
+}
+
+void RegisterBankInfo::OperandsMapper::setVRegs(unsigned OpIdx,
+ unsigned PartialMapIdx,
+ unsigned NewVReg) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ assert(getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() >
+ PartialMapIdx &&
+ "Out-of-bound access for partial mapping");
+ // Make sure the memory is initialized for that operand.
+ (void)getVRegsMem(OpIdx);
+ assert(NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] == 0 &&
+ "This value is already set");
+ NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] = NewVReg;
+}
+
+iterator_range<SmallVectorImpl<unsigned>::const_iterator>
+RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,
+ bool ForDebug) const {
+ (void)ForDebug;
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx)
+ return make_range(NewVRegs.end(), NewVRegs.end());
+
+ unsigned PartMapSize =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown.size();
+ SmallVectorImpl<unsigned>::const_iterator End =
+ getNewVRegsEnd(StartIdx, PartMapSize);
+ iterator_range<SmallVectorImpl<unsigned>::const_iterator> Res =
+ make_range(&NewVRegs[StartIdx], End);
+#ifndef NDEBUG
+ for (unsigned VReg : Res)
+ assert((VReg || ForDebug) && "Some registers are uninitialized");
+#endif
+ return Res;
+}
+
+void RegisterBankInfo::OperandsMapper::dump() const {
+ print(dbgs(), true);
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,
+ bool ForDebug) const {
+ unsigned NumOpds = getMI().getNumOperands();
+ if (ForDebug) {
+ OS << "Mapping for " << getMI() << "\nwith " << getInstrMapping() << '\n';
+ // Print out the internal state of the index table.
+ OS << "Populated indices (CellNumber, IndexInNewVRegs): ";
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] != DontKnowIdx) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '(' << Idx << ", " << OpToNewVRegIdx[Idx] << ')';
+ IsFirst = false;
+ }
+ }
+ OS << '\n';
+ } else
+ OS << "Mapping ID: " << getInstrMapping().getID() << ' ';
+
+ OS << "Operand Mapping: ";
+ // If we have a function, we can pretty print the name of the registers.
+ // Otherwise we will print the raw numbers.
+ const TargetRegisterInfo *TRI =
+ getMI().getParent() && getMI().getParent()->getParent()
+ ? getMI().getParent()->getParent()->getSubtarget().getRegisterInfo()
+ : nullptr;
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] == DontKnowIdx)
+ continue;
+ if (!IsFirst)
+ OS << ", ";
+ IsFirst = false;
+ OS << '(' << PrintReg(getMI().getOperand(Idx).getReg(), TRI) << ", [";
+ bool IsFirstNewVReg = true;
+ for (unsigned VReg : getVRegs(Idx)) {
+ if (!IsFirstNewVReg)
+ OS << ", ";
+ IsFirstNewVReg = false;
+ OS << PrintReg(VReg, TRI);
+ }
+ OS << "])";
+ }
+}
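OperandsMapper's bookkeeping above is essentially "lazy slices of one flat vector": OpToNewVRegIdx holds, per operand, either DontKnowIdx or the start of that operand's run of new registers in NewVRegs. A toy sketch of the same scheme with invented names (ToyMapper), not the LLVM classes:

#include <cassert>
#include <utility>
#include <vector>

static const int DontKnowIdx = -1;

struct ToyMapper {
  std::vector<unsigned> NewVRegs;        // All partial values, flattened.
  std::vector<int> OpToNewVRegIdx;       // Operand -> start index or -1.
  std::vector<unsigned> PartsPerOperand; // BreakDown.size() per operand.

  explicit ToyMapper(std::vector<unsigned> Parts)
      : OpToNewVRegIdx(Parts.size(), DontKnowIdx),
        PartsPerOperand(std::move(Parts)) {}

  // Reserve (zero-initialized) cells for operand OpIdx on first access and
  // return {start index, number of cells}.
  std::pair<unsigned, unsigned> getVRegsMem(unsigned OpIdx) {
    if (OpToNewVRegIdx[OpIdx] == DontKnowIdx) {
      OpToNewVRegIdx[OpIdx] = NewVRegs.size();
      NewVRegs.resize(NewVRegs.size() + PartsPerOperand[OpIdx], 0);
    }
    return {unsigned(OpToNewVRegIdx[OpIdx]), PartsPerOperand[OpIdx]};
  }
};

int main() {
  ToyMapper Mapper({1, 2, 1});      // Three operands, the middle one split in two.
  auto Op1 = Mapper.getVRegsMem(1); // First access allocates its slice.
  assert(Op1.first == 0 && Op1.second == 2);
  auto Op0 = Mapper.getVRegsMem(0); // Slices are laid out in access order.
  assert(Op0.first == 2 && Op0.second == 1);
  return 0;
}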
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index dd9a84086181..8c760b724d13 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -59,7 +59,6 @@
// We use heuristics to discover the best global grouping we can (cf cl::opts).
// ===---------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -92,6 +91,11 @@ EnableGlobalMerge("enable-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"),
cl::init(true));
+static cl::opt<unsigned>
+GlobalMergeMaxOffset("global-merge-max-offset", cl::Hidden,
+ cl::desc("Set maximum offset for global merge pass"),
+ cl::init(0));
+
static cl::opt<bool> GlobalMergeGroupByUse(
"global-merge-group-by-use", cl::Hidden,
cl::desc("Improve global merge pass to look at uses"), cl::init(true));
@@ -131,6 +135,8 @@ namespace {
/// Whether we should merge global variables that have external linkage.
bool MergeExternalGlobals;
+ bool IsMachO;
+
bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst, unsigned AddrSpace) const;
/// \brief Merge everything in \p Globals for which the corresponding bit
@@ -158,10 +164,14 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid.
- explicit GlobalMerge(const TargetMachine *TM = nullptr,
- unsigned MaximalOffset = 0,
- bool OnlyOptimizeForSize = false,
- bool MergeExternalGlobals = false)
+ explicit GlobalMerge()
+ : FunctionPass(ID), TM(nullptr), MaxOffset(GlobalMergeMaxOffset),
+ OnlyOptimizeForSize(false), MergeExternalGlobals(false) {
+ initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset,
+ bool OnlyOptimizeForSize, bool MergeExternalGlobals)
: FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
OnlyOptimizeForSize(OnlyOptimizeForSize),
MergeExternalGlobals(MergeExternalGlobals) {
@@ -459,8 +469,7 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
// we can also emit an alias for internal linkage as it's safe to do so.
// It's not safe on Mach-O as the alias (and thus the portion of the
// MergedGlobals variable) may be dead stripped at link time.
- if (Linkage != GlobalValue::InternalLinkage ||
- !TM->getTargetTriple().isOSBinFormatMachO()) {
+ if (Linkage != GlobalValue::InternalLinkage || !IsMachO) {
GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M);
}
@@ -513,6 +522,8 @@ bool GlobalMerge::doInitialization(Module &M) {
if (!EnableGlobalMerge)
return false;
+ IsMachO = Triple(M.getTargetTriple()).isOSBinFormatMachO();
+
auto &DL = M.getDataLayout();
DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
BSSGlobals;
@@ -550,7 +561,8 @@ bool GlobalMerge::doInitialization(Module &M) {
continue;
if (DL.getTypeAllocSize(Ty) < MaxOffset) {
- if (TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
+ if (TM &&
+ TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
BSSGlobals[AddressSpace].push_back(&GV);
else if (GV.isConstant())
ConstGlobals[AddressSpace].push_back(&GV);
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index c38c9d22266e..d225162860c2 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the machine instruction level if-conversion pass.
+// This file implements the machine instruction level if-conversion pass, which
+// tries to convert conditional branches into predicated instructions.
//
//===----------------------------------------------------------------------===//
@@ -33,6 +34,7 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
+#include <utility>
using namespace llvm;
@@ -85,7 +87,7 @@ namespace {
/// BBInfo - One per MachineBasicBlock, this is used to cache the result
/// if-conversion feasibility analysis. This includes results from
- /// TargetInstrInfo::AnalyzeBranch() (i.e. TBB, FBB, and Cond), and its
+ /// TargetInstrInfo::analyzeBranch() (i.e. TBB, FBB, and Cond), and its
/// classification, and common tail block of its successors (if it's a
/// diamond shape), its size, whether it's predicable, and whether any
/// instruction can clobber the 'would-be' predicate.
@@ -94,7 +96,7 @@ namespace {
/// IsBeingAnalyzed - True if BB is currently being analyzed.
/// IsAnalyzed - True if BB has been analyzed (info is still valid).
/// IsEnqueued - True if BB has been enqueued to be ifcvt'ed.
- /// IsBrAnalyzable - True if AnalyzeBranch() returns false.
+ /// IsBrAnalyzable - True if analyzeBranch() returns false.
/// HasFallThrough - True if BB may fallthrough to the following BB.
/// IsUnpredicable - True if BB is known to be unpredicable.
/// ClobbersPred - True if BB could modify predicates (e.g. has
@@ -103,7 +105,7 @@ namespace {
/// ExtraCost - Extra cost for multi-cycle instructions.
/// ExtraCost2 - Some instructions are slower when predicated
/// BB - Corresponding MachineBasicBlock.
- /// TrueBB / FalseBB- See AnalyzeBranch().
+ /// TrueBB / FalseBB- See analyzeBranch().
/// BrCond - Conditions for end of block conditional branches.
/// Predicate - Predicate used in the BB.
struct BBInfo {
@@ -161,7 +163,6 @@ namespace {
const TargetLoweringBase *TLI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const MachineBlockFrequencyInfo *MBFI;
const MachineBranchProbabilityInfo *MBPI;
MachineRegisterInfo *MRI;
@@ -176,7 +177,7 @@ namespace {
public:
static char ID;
IfConverter(std::function<bool(const Function &)> Ftor = nullptr)
- : MachineFunctionPass(ID), FnNum(-1), PredicateFtor(Ftor) {
+ : MachineFunctionPass(ID), FnNum(-1), PredicateFtor(std::move(Ftor)) {
initializeIfConverterPass(*PassRegistry::getPassRegistry());
}
@@ -188,6 +189,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
private:
bool ReverseBranchCondition(BBInfo &BBI);
bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
@@ -198,10 +204,12 @@ namespace {
bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
unsigned &Dups1, unsigned &Dups2) const;
void ScanInstructions(BBInfo &BBI);
- void AnalyzeBlock(MachineBasicBlock *MBB, std::vector<IfcvtToken*> &Tokens);
+ void AnalyzeBlock(MachineBasicBlock *MBB,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
bool isTriangle = false, bool RevBranch = false);
- void AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens);
+ void AnalyzeBlocks(MachineFunction &MF,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
void InvalidatePreds(MachineBasicBlock *BB);
void RemoveExtraEdges(BBInfo &BBI);
bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
@@ -240,7 +248,8 @@ namespace {
}
// IfcvtTokenCmp - Used to sort if-conversion candidates.
- static bool IfcvtTokenCmp(IfcvtToken *C1, IfcvtToken *C2) {
+ static bool IfcvtTokenCmp(const std::unique_ptr<IfcvtToken> &C1,
+ const std::unique_ptr<IfcvtToken> &C2) {
int Incr1 = (C1->Kind == ICDiamond)
? -(int)(C1->NumDups + C1->NumDups2) : (int)C1->NumDups;
int Incr2 = (C2->Kind == ICDiamond)
@@ -273,14 +282,15 @@ INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
- if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()) ||
+ (PredicateFtor && !PredicateFtor(*MF.getFunction())))
return false;
const TargetSubtargetInfo &ST = MF.getSubtarget();
TLI = ST.getTargetLowering();
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
- MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
MRI = &MF.getRegInfo();
SchedModel.init(ST.getSchedModel(), &ST, TII);
@@ -292,7 +302,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
bool BFChange = false;
if (!PreRegAlloc) {
// Tail merge tends to expose more if-conversion opportunities.
- BranchFolder BF(true, false, *MBFI, *MBPI);
+ BranchFolder BF(true, false, MBFI, *MBPI);
BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(),
getAnalysisIfAvailable<MachineModuleInfo>());
}
@@ -309,7 +319,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
MF.RenumberBlocks();
BBAnalysis.resize(MF.getNumBlockIDs());
- std::vector<IfcvtToken*> Tokens;
+ std::vector<std::unique_ptr<IfcvtToken>> Tokens;
MadeChange = false;
unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
NumTriangleRev + NumTriangleFalse + NumTriangleFRev + NumDiamonds;
@@ -319,15 +329,13 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
bool Change = false;
AnalyzeBlocks(MF, Tokens);
while (!Tokens.empty()) {
- IfcvtToken *Token = Tokens.back();
+ std::unique_ptr<IfcvtToken> Token = std::move(Tokens.back());
Tokens.pop_back();
BBInfo &BBI = Token->BBI;
IfcvtKind Kind = Token->Kind;
unsigned NumDups = Token->NumDups;
unsigned NumDups2 = Token->NumDups2;
- delete Token;
-
// If the block has been evicted out of the queue or it has already been
// marked dead (due to it being predicated), then skip it.
if (BBI.IsDone)
@@ -414,18 +422,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
MadeChange |= Change;
}
- // Delete tokens in case of early exit.
- while (!Tokens.empty()) {
- IfcvtToken *Token = Tokens.back();
- Tokens.pop_back();
- delete Token;
- }
-
Tokens.clear();
BBAnalysis.clear();
if (MadeChange && IfCvtBranchFold) {
- BranchFolder BF(false, false, *MBFI, *MBPI);
+ BranchFolder BF(false, false, MBFI, *MBPI);
BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
getAnalysisIfAvailable<MachineModuleInfo>());
}
@@ -586,7 +587,7 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
if (FIB == FIE)
break;
}
- if (!TIB->isIdenticalTo(FIB))
+ if (!TIB->isIdenticalTo(*FIB))
break;
++Dups1;
++TIB;
@@ -595,15 +596,19 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
// Now, in preparation for counting duplicate instructions at the ends of the
// blocks, move the end iterators up past any branch instructions.
- while (TIE != TIB) {
- --TIE;
- if (!TIE->isBranch())
- break;
- }
- while (FIE != FIB) {
- --FIE;
- if (!FIE->isBranch())
- break;
+ // If both blocks are returning, don't skip the branches, since they will
+ // likely both be identical return instructions. In such cases the return
+ // can be left unpredicated.
+ // Check whether the shared prefix already contains all of a block.
+ if (TIB == TIE || FIB == FIE)
+ return true;
+ --TIE;
+ --FIE;
+ if (!TrueBBI.BB->succ_empty() || !FalseBBI.BB->succ_empty()) {
+ while (TIE != TIB && TIE->isBranch())
+ --TIE;
+ while (FIE != FIB && FIE->isBranch())
+ --FIE;
}
// If Dups1 includes all of a block, then don't count duplicate
@@ -626,7 +631,7 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
if (FIE == FIB)
break;
}
- if (!TIE->isIdenticalTo(FIE))
+ if (!TIE->isIdenticalTo(*FIE))
break;
++Dups2;
--TIE;
@@ -650,7 +655,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
BBI.TrueBB = BBI.FalseBB = nullptr;
BBI.BrCond.clear();
BBI.IsBrAnalyzable =
- !TII->AnalyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+ !TII->analyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == nullptr;
if (BBI.BrCond.size()) {
@@ -670,16 +675,45 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
BBI.ExtraCost = 0;
BBI.ExtraCost2 = 0;
BBI.ClobbersPred = false;
- for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end();
- I != E; ++I) {
- if (I->isDebugValue())
+ for (auto &MI : *BBI.BB) {
+ if (MI.isDebugValue())
continue;
- if (I->isNotDuplicable())
+ // It's unsafe to duplicate convergent instructions in this context, so set
+ // BBI.CannotBeCopied to true if MI is convergent. To see why, consider the
+ // following CFG, which is subject to our "simple" transformation.
+ //
+ // BB0 // if (c1) goto BB1; else goto BB2;
+ // / \
+ // BB1 |
+ // | BB2 // if (c2) goto TBB; else goto FBB;
+ // | / |
+ // | / |
+ // TBB |
+ // | |
+ // | FBB
+ // |
+ // exit
+ //
+ // Suppose we want to move TBB's contents up into BB1 and BB2 (in BB1 they'd
+ // be unconditional, and in BB2, they'd be predicated upon c2), and suppose
+ // TBB contains a convergent instruction. This is safe iff doing so does
+ // not add a control-flow dependency to the convergent instruction -- i.e.,
+ // it's safe iff the set of control flows that leads us to the convergent
+ // instruction does not get smaller after the transformation.
+ //
+ // Originally we executed TBB if c1 || c2. After the transformation, there
+ // are two copies of TBB's instructions. We get to the first if c1, and we
+ // get to the second if !c1 && c2.
+ //
+ // There are clearly fewer ways to satisfy the condition "c1" than
+ // "c1 || c2". Since we've shrunk the set of control flows which lead to
+ // our convergent instruction, the transformation is unsafe.
+ if (MI.isNotDuplicable() || MI.isConvergent())
BBI.CannotBeCopied = true;
- bool isPredicated = TII->isPredicated(I);
- bool isCondBr = BBI.IsBrAnalyzable && I->isConditionalBranch();
+ bool isPredicated = TII->isPredicated(MI);
+ bool isCondBr = BBI.IsBrAnalyzable && MI.isConditionalBranch();
// A conditional branch is not predicable, but it may be eliminated.
if (isCondBr)
@@ -687,8 +721,8 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
if (!isPredicated) {
BBI.NonPredSize++;
- unsigned ExtraPredCost = TII->getPredicationCost(&*I);
- unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
+ unsigned ExtraPredCost = TII->getPredicationCost(MI);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&MI, false);
if (NumCycles > 1)
BBI.ExtraCost += NumCycles-1;
BBI.ExtraCost2 += ExtraPredCost;
@@ -712,10 +746,10 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
// FIXME: Make use of PredDefs? e.g. ADDC, SUBC sets predicates but are
// still potentially predicable.
std::vector<MachineOperand> PredDefs;
- if (TII->DefinesPredicate(I, PredDefs))
+ if (TII->DefinesPredicate(MI, PredDefs))
BBI.ClobbersPred = true;
- if (!TII->isPredicable(I)) {
+ if (!TII->isPredicable(MI)) {
BBI.IsUnpredicable = true;
return;
}
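The counting argument in the convergent-instruction comment above can be checked mechanically: enumerate the four (c1, c2) combinations and compare how often the original TBB runs against how often each duplicated copy would run. A tiny standalone demonstration (plain C++, no LLVM types):

#include <cstdio>

int main() {
  int Original = 0, CopyInBB1 = 0, CopyInBB2 = 0;
  for (int C1 = 0; C1 <= 1; ++C1)
    for (int C2 = 0; C2 <= 1; ++C2) {
      Original  += (C1 || C2);  // TBB was reached if c1 || c2.
      CopyInBB1 += C1;          // Copy hoisted into BB1 runs if c1.
      CopyInBB2 += (!C1 && C2); // Copy predicated in BB2 runs if !c1 && c2.
    }
  // Prints "original=3 copy1=2 copy2=1": each copy executes under strictly
  // fewer control-flow combinations, which is why duplicating a convergent
  // instruction here is unsafe.
  std::printf("original=%d copy1=%d copy2=%d\n",
              Original, CopyInBB1, CopyInBB2);
  return 0;
}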
@@ -764,8 +798,8 @@ bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
/// AnalyzeBlock - Analyze the structure of the sub-CFG starting from
/// the specified block. Record its successors and whether it looks like an
/// if-conversion candidate.
-void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
- std::vector<IfcvtToken*> &Tokens) {
+void IfConverter::AnalyzeBlock(
+ MachineBasicBlock *MBB, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
struct BBState {
BBState(MachineBasicBlock *BB) : MBB(BB), SuccsAnalyzed(false) {}
MachineBasicBlock *MBB;
@@ -863,8 +897,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
// \ /
// TailBB
// Note TailBB can be empty.
- Tokens.push_back(new IfcvtToken(BBI, ICDiamond, TNeedSub|FNeedSub, Dups,
- Dups2));
+ Tokens.push_back(llvm::make_unique<IfcvtToken>(
+ BBI, ICDiamond, TNeedSub | FNeedSub, Dups, Dups2));
Enqueued = true;
}
@@ -879,7 +913,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
// | TBB
// | /
// FBB
- Tokens.push_back(new IfcvtToken(BBI, ICTriangle, TNeedSub, Dups));
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangle, TNeedSub, Dups));
Enqueued = true;
}
@@ -887,7 +922,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
TrueBBI.ExtraCost2, Prediction) &&
FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
- Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups));
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangleRev, TNeedSub, Dups));
Enqueued = true;
}
@@ -902,7 +938,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
// | TBB---> exit
// |
// FBB
- Tokens.push_back(new IfcvtToken(BBI, ICSimple, TNeedSub, Dups));
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICSimple, TNeedSub, Dups));
Enqueued = true;
}
@@ -914,7 +951,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
FalseBBI.NonPredSize + FalseBBI.ExtraCost,
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond, true)) {
- Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups));
+ Tokens.push_back(llvm::make_unique<IfcvtToken>(BBI, ICTriangleFalse,
+ FNeedSub, Dups));
Enqueued = true;
}
@@ -924,7 +962,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
FalseBBI.NonPredSize + FalseBBI.ExtraCost,
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
- Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups));
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangleFRev, FNeedSub, Dups));
Enqueued = true;
}
@@ -933,7 +972,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
FalseBBI.NonPredSize + FalseBBI.ExtraCost,
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond)) {
- Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups));
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICSimpleFalse, FNeedSub, Dups));
Enqueued = true;
}
}
@@ -947,8 +987,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB,
/// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion
/// candidates.
-void IfConverter::AnalyzeBlocks(MachineFunction &MF,
- std::vector<IfcvtToken*> &Tokens) {
+void IfConverter::AnalyzeBlocks(
+ MachineFunction &MF, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
for (auto &BB : MF)
AnalyzeBlock(&BB, Tokens);
@@ -1001,15 +1041,15 @@ static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB,
void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(*BBI.BB, TBB, FBB, Cond))
+ if (!TII->analyzeBranch(*BBI.BB, TBB, FBB, Cond))
BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
}
/// Behaves like LiveRegUnits::StepForward() but also adds implicit uses to all
/// values defined in MI which are not live/used by MI.
-static void UpdatePredRedefs(MachineInstr *MI, LivePhysRegs &Redefs) {
+static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Clobbers;
- Redefs.stepForward(*MI, Clobbers);
+ Redefs.stepForward(MI, Clobbers);
// Now add the implicit uses for each of the clobbered values.
for (auto Reg : Clobbers) {
@@ -1046,7 +1086,7 @@ static void UpdatePredRedefs(MachineInstr *MI, LivePhysRegs &Redefs) {
* Remove kill flags from operands with a registers in the @p DontKill set.
*/
static void RemoveKills(MachineInstr &MI, const LivePhysRegs &DontKill) {
- for (MIBundleOperands O(&MI); O.isValid(); ++O) {
+ for (MIBundleOperands O(MI); O.isValid(); ++O) {
if (!O->isReg() || !O->isKill())
continue;
if (DontKill.contains(O->getReg()))
@@ -1097,13 +1137,13 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
 // Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
Redefs.init(TRI);
- Redefs.addLiveIns(CvtBBI->BB);
- Redefs.addLiveIns(NextBBI->BB);
+ Redefs.addLiveIns(*CvtBBI->BB);
+ Redefs.addLiveIns(*NextBBI->BB);
// Compute a set of registers which must not be killed by instructions in
// BB1: This is everything live-in to BB2.
DontKill.init(TRI);
- DontKill.addLiveIns(NextBBI->BB);
+ DontKill.addLiveIns(*NextBBI->BB);
if (CvtBBI->BB->pred_size() > 1) {
BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1202,8 +1242,8 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
// Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
Redefs.init(TRI);
- Redefs.addLiveIns(CvtBBI->BB);
- Redefs.addLiveIns(NextBBI->BB);
+ Redefs.addLiveIns(*CvtBBI->BB);
+ Redefs.addLiveIns(*NextBBI->BB);
DontKill.clear();
@@ -1357,7 +1397,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
// Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
Redefs.init(TRI);
- Redefs.addLiveIns(BBI1->BB);
+ Redefs.addLiveIns(*BBI1->BB);
// Remove the duplicated instructions at the beginnings of both paths.
// Skip dbg_value instructions
@@ -1395,8 +1435,13 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
BBI2->BB->erase(BBI2->BB->begin(), DI2);
- // Remove branch from 'true' block and remove duplicated instructions.
- BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB);
+ // Remove branch from the 'true' block, unless it was not analyzable.
+ // Non-analyzable branches need to be preserved, since in such cases,
+ // the CFG structure is not an actual diamond (the join block may not
+ // be present).
+ if (BBI1->IsBrAnalyzable)
+ BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB);
+ // Remove duplicated instructions.
DI1 = BBI1->BB->end();
for (unsigned i = 0; i != NumDups2; ) {
// NumDups2 only counted non-dbg_value instructions, so this won't
@@ -1413,8 +1458,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
// must be removed.
RemoveKills(BBI1->BB->begin(), BBI1->BB->end(), DontKill, *TRI);
- // Remove 'false' block branch and find the last instruction to predicate.
- BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
+ // Remove 'false' block branch (unless it was not analyzable), and find
+ // the last instruction to predicate.
+ if (BBI2->IsBrAnalyzable)
+ BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
DI2 = BBI2->BB->end();
while (NumDups2 != 0) {
// NumDups2 only counted non-dbg_value instructions, so this won't
@@ -1473,6 +1520,18 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
// Predicate the 'true' block.
PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, &RedefsByFalse);
+ // After predicating BBI1, if there is a predicated terminator in BBI1 and
+ // a non-predicated in BBI2, then we don't want to predicate the one from
+ // BBI2. The reason is that if we merged these blocks, we would end up with
+ // two predicated terminators in the same block.
+ if (!BBI2->BB->empty() && (DI2 == BBI2->BB->end())) {
+ MachineBasicBlock::iterator BBI1T = BBI1->BB->getFirstTerminator();
+ MachineBasicBlock::iterator BBI2T = BBI2->BB->getFirstTerminator();
+ if (BBI1T != BBI1->BB->end() && TII->isPredicated(*BBI1T) &&
+ BBI2T != BBI2->BB->end() && !TII->isPredicated(*BBI2T))
+ --DI2;
+ }
+
// Predicate the 'false' block.
PredicateBlock(*BBI2, DI2, *Cond2);
@@ -1488,6 +1547,12 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()];
bool CanMergeTail = !TailBBI.HasFallThrough &&
!TailBBI.BB->hasAddressTaken();
+ // The if-converted block can still have a predicated terminator
+ // (e.g. a predicated return). If that is the case, we cannot merge
+ // it with the tail block.
+ MachineBasicBlock::const_iterator TI = BBI.BB->getFirstTerminator();
+ if (TI != BBI.BB->end() && TII->isPredicated(*TI))
+ CanMergeTail = false;
// There may still be a fall-through edge from BBI1 or BBI2 to TailBB;
// check if there are any other predecessors besides those.
unsigned NumPreds = TailBB->pred_size();
@@ -1523,14 +1588,14 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
return true;
}
-static bool MaySpeculate(const MachineInstr *MI,
+static bool MaySpeculate(const MachineInstr &MI,
SmallSet<unsigned, 4> &LaterRedefs) {
bool SawStore = true;
- if (!MI->isSafeToMove(nullptr, SawStore))
+ if (!MI.isSafeToMove(nullptr, SawStore))
return false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
@@ -1551,8 +1616,8 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
SmallSet<unsigned, 4> *LaterRedefs) {
bool AnyUnpred = false;
bool MaySpec = LaterRedefs != nullptr;
- for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) {
- if (I->isDebugValue() || TII->isPredicated(I))
+ for (MachineInstr &I : llvm::make_range(BBI.BB->begin(), E)) {
+ if (I.isDebugValue() || TII->isPredicated(I))
continue;
// It may be possible not to predicate an instruction if it's the 'true'
// side of a diamond and the 'false' side may re-define the instruction's
@@ -1566,7 +1631,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
MaySpec = false;
if (!TII->PredicateInstruction(I, Cond)) {
#ifndef NDEBUG
- dbgs() << "Unable to predicate " << *I << "!\n";
+ dbgs() << "Unable to predicate " << I << "!\n";
#endif
llvm_unreachable(nullptr);
}
@@ -1593,25 +1658,24 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
bool IgnoreBr) {
MachineFunction &MF = *ToBBI.BB->getParent();
- for (MachineBasicBlock::iterator I = FromBBI.BB->begin(),
- E = FromBBI.BB->end(); I != E; ++I) {
+ for (auto &I : *FromBBI.BB) {
// Do not copy the end of the block branches.
- if (IgnoreBr && I->isBranch())
+ if (IgnoreBr && I.isBranch())
break;
- MachineInstr *MI = MF.CloneMachineInstr(I);
+ MachineInstr *MI = MF.CloneMachineInstr(&I);
ToBBI.BB->insert(ToBBI.BB->end(), MI);
ToBBI.NonPredSize++;
- unsigned ExtraPredCost = TII->getPredicationCost(&*I);
- unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
+ unsigned ExtraPredCost = TII->getPredicationCost(I);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
if (NumCycles > 1)
ToBBI.ExtraCost += NumCycles-1;
ToBBI.ExtraCost2 += ExtraPredCost;
if (!TII->isPredicated(I) && !MI->isDebugValue()) {
- if (!TII->PredicateInstruction(MI, Cond)) {
+ if (!TII->PredicateInstruction(*MI, Cond)) {
#ifndef NDEBUG
- dbgs() << "Unable to predicate " << *I << "!\n";
+ dbgs() << "Unable to predicate " << I << "!\n";
#endif
llvm_unreachable(nullptr);
}
@@ -1619,7 +1683,7 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
// If the predicated instruction now redefines a register as the result of
// if-conversion, add an implicit kill.
- UpdatePredRedefs(MI, Redefs);
+ UpdatePredRedefs(*MI, Redefs);
// Some kill flags may not be correct anymore.
if (!DontKill.empty())
@@ -1659,8 +1723,16 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
assert(!FromBBI.BB->hasAddressTaken() &&
"Removing a BB whose address is taken!");
- ToBBI.BB->splice(ToBBI.BB->end(),
- FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end());
+ // In case FromBBI.BB contains terminators (e.g. return instruction),
+ // first move the non-terminator instructions, then the terminators.
+ MachineBasicBlock::iterator FromTI = FromBBI.BB->getFirstTerminator();
+ MachineBasicBlock::iterator ToTI = ToBBI.BB->getFirstTerminator();
+ ToBBI.BB->splice(ToTI, FromBBI.BB, FromBBI.BB->begin(), FromTI);
+
+ // If FromBB has a non-predicated terminator, it should go at the end.
+ if (FromTI != FromBBI.BB->end() && !TII->isPredicated(*FromTI))
+ ToTI = ToBBI.BB->end();
+ ToBBI.BB->splice(ToTI, FromBBI.BB, FromTI, FromBBI.BB->end());
// Force normalizing the successors' probabilities of ToBBI.BB to convert all
// unknown probabilities into known ones.
@@ -1768,5 +1840,5 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
FunctionPass *
llvm::createIfConverter(std::function<bool(const Function &)> Ftor) {
- return new IfConverter(Ftor);
+ return new IfConverter(std::move(Ftor));
}
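The new splice order in MergeBlocks above, non-terminators in front of ToBB's first terminator and FromBB's terminators after, can be mimicked with std::list to see the resulting layout. A toy sketch where 99 stands in for a terminator (invented encoding, not MachineInstr):

#include <cassert>
#include <iterator>
#include <list>

int main() {
  std::list<int> ToBB   = {1, 2, 99};
  std::list<int> FromBB = {3, 4, 99};

  auto FromTI = FromBB.begin();
  std::advance(FromTI, 2); // FromBB's first terminator.
  auto ToTI = ToBB.begin();
  std::advance(ToTI, 2);   // ToBB's first terminator.

  // Non-terminators first, in front of ToBB's terminator...
  ToBB.splice(ToTI, FromBB, FromBB.begin(), FromTI);
  // ...then FromBB's terminators, appended at the end.
  ToBB.splice(ToBB.end(), FromBB, FromTI, FromBB.end());

  const std::list<int> Expected = {1, 2, 3, 4, 99, 99};
  assert(ToBB == Expected && FromBB.empty());
  (void)Expected;
  return 0;
}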
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 39c1b9fb9a66..31d6bd0b6dc6 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -46,10 +47,9 @@
using namespace llvm;
-static cl::opt<unsigned> PageSize("imp-null-check-page-size",
- cl::desc("The page size of the target in "
- "bytes"),
- cl::init(4096));
+static cl::opt<int> PageSize("imp-null-check-page-size",
+ cl::desc("The page size of the target in bytes"),
+ cl::init(4096));
#define DEBUG_TYPE "implicit-null-checks"
@@ -60,7 +60,7 @@ namespace {
class ImplicitNullChecks : public MachineFunctionPass {
/// Represents one null check that can be made implicit.
- struct NullCheck {
+ class NullCheck {
// The memory operation the null check can be folded into.
MachineInstr *MemOperation;
@@ -76,27 +76,42 @@ class ImplicitNullChecks : public MachineFunctionPass {
// The block branched to if the pointer is null.
MachineBasicBlock *NullSucc;
- NullCheck()
- : MemOperation(), CheckOperation(), CheckBlock(), NotNullSucc(),
- NullSucc() {}
+ // If this is non-null, then MemOperation has a dependency on this
+ // instruction, and it needs to be hoisted to execute before MemOperation.
+ MachineInstr *OnlyDependency;
+ public:
explicit NullCheck(MachineInstr *memOperation, MachineInstr *checkOperation,
MachineBasicBlock *checkBlock,
MachineBasicBlock *notNullSucc,
- MachineBasicBlock *nullSucc)
+ MachineBasicBlock *nullSucc,
+ MachineInstr *onlyDependency)
: MemOperation(memOperation), CheckOperation(checkOperation),
- CheckBlock(checkBlock), NotNullSucc(notNullSucc), NullSucc(nullSucc) {
- }
+ CheckBlock(checkBlock), NotNullSucc(notNullSucc), NullSucc(nullSucc),
+ OnlyDependency(onlyDependency) {}
+
+ MachineInstr *getMemOperation() const { return MemOperation; }
+
+ MachineInstr *getCheckOperation() const { return CheckOperation; }
+
+ MachineBasicBlock *getCheckBlock() const { return CheckBlock; }
+
+ MachineBasicBlock *getNotNullSucc() const { return NotNullSucc; }
+
+ MachineBasicBlock *getNullSucc() const { return NullSucc; }
+
+ MachineInstr *getOnlyDependency() const { return OnlyDependency; }
};
const TargetInstrInfo *TII = nullptr;
const TargetRegisterInfo *TRI = nullptr;
+ AliasAnalysis *AA = nullptr;
MachineModuleInfo *MMI = nullptr;
bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
SmallVectorImpl<NullCheck> &NullCheckList);
MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
- MCSymbol *HandlerLabel);
+ MachineBasicBlock *HandlerMBB);
void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
public:
@@ -107,6 +122,15 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
/// \brief Detect re-ordering hazards and dependencies.
@@ -115,14 +139,22 @@ public:
/// machine instruction can be re-ordered from after the machine instructions
/// seen so far to before them.
class HazardDetector {
- DenseSet<unsigned> RegDefs;
+ static MachineInstr *getUnknownMI() {
+ return DenseMapInfo<MachineInstr *>::getTombstoneKey();
+ }
+
+ // Maps physical registers to the instruction defining them. If there has
+ // been more than one def of a specific register, that register is mapped to
+ // getUnknownMI().
+ DenseMap<unsigned, MachineInstr *> RegDefs;
DenseSet<unsigned> RegUses;
const TargetRegisterInfo &TRI;
bool hasSeenClobber;
+ AliasAnalysis &AA;
public:
- explicit HazardDetector(const TargetRegisterInfo &TRI) :
- TRI(TRI), hasSeenClobber(false) {}
+ explicit HazardDetector(const TargetRegisterInfo &TRI, AliasAnalysis &AA)
+ : TRI(TRI), hasSeenClobber(false), AA(AA) {}
/// \brief Make a note of \p MI for later queries to isSafeToHoist.
///
@@ -130,8 +162,10 @@ public:
void rememberInstruction(MachineInstr *MI);
/// \brief Return true if it is safe to hoist \p MI from after all the
- /// instructions seen so far (via rememberInstruction) to before it.
- bool isSafeToHoist(MachineInstr *MI);
+ /// instructions seen so far (via rememberInstruction) to before it. If \p MI
+ /// has one and only one transitive dependency, set \p Dependency to that
+ /// instruction. If there are more dependencies, return false.
+ bool isSafeToHoist(MachineInstr *MI, MachineInstr *&Dependency);
/// \brief Return true if this instance of HazardDetector has been clobbered
/// (i.e. has no more useful information).
@@ -170,15 +204,23 @@ void HazardDetector::rememberInstruction(MachineInstr *MI) {
if (!MO.isReg() || !MO.getReg())
continue;
- if (MO.isDef())
- RegDefs.insert(MO.getReg());
- else
+ if (MO.isDef()) {
+ auto It = RegDefs.find(MO.getReg());
+ if (It == RegDefs.end())
+ RegDefs.insert({MO.getReg(), MI});
+ else {
+ assert(It->second && "Found null MI?");
+ It->second = getUnknownMI();
+ }
+ } else
RegUses.insert(MO.getReg());
}
}
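For reference, the RegDefs bookkeeping that rememberInstruction now performs, reduced to a standalone sketch in plain C++ (made-up types, no LLVM API; the sentinel plays the role of getUnknownMI()):

#include <cstdint>
#include <unordered_map>

struct Instr; // stand-in for MachineInstr

// Sentinel meaning "defined more than once; no single defining instruction".
static Instr *const UnknownDef = reinterpret_cast<Instr *>(~uintptr_t(0));

class DefTracker {
  std::unordered_map<unsigned, Instr *> RegDefs; // reg -> unique def or sentinel

public:
  void noteDef(unsigned Reg, Instr *MI) {
    auto It = RegDefs.find(Reg);
    if (It == RegDefs.end())
      RegDefs.emplace(Reg, MI); // first def of Reg: remember the instruction
    else
      It->second = UnknownDef;  // second def: give up on precise tracking
  }

  // Unique defining instruction of Reg, or nullptr if unknown or undefined.
  Instr *uniqueDef(unsigned Reg) const {
    auto It = RegDefs.find(Reg);
    return (It == RegDefs.end() || It->second == UnknownDef) ? nullptr
                                                             : It->second;
  }
};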
-bool HazardDetector::isSafeToHoist(MachineInstr *MI) {
+bool HazardDetector::isSafeToHoist(MachineInstr *MI,
+ MachineInstr *&Dependency) {
assert(!isClobbered() && "isSafeToHoist cannot do anything useful!");
+ Dependency = nullptr;
// Right now we don't want to worry about LLVM's memory model. This can be
// made more precise later.
@@ -188,9 +230,54 @@ bool HazardDetector::isSafeToHoist(MachineInstr *MI) {
for (auto &MO : MI->operands()) {
if (MO.isReg() && MO.getReg()) {
- for (unsigned Reg : RegDefs)
- if (TRI.regsOverlap(Reg, MO.getReg()))
- return false; // We found a write-after-write or read-after-write
+ for (auto &RegDef : RegDefs) {
+ unsigned Reg = RegDef.first;
+ MachineInstr *MI = RegDef.second;
+ if (!TRI.regsOverlap(Reg, MO.getReg()))
+ continue;
+
+ // We found a write-after-write or read-after-write; see if the
+ // instruction causing this dependency can be hoisted too.
+
+ if (MI == getUnknownMI())
+ // We don't have precise dependency information.
+ return false;
+
+ if (Dependency) {
+ if (Dependency == MI)
+ continue;
+ // We already have one dependency, and we can track only one.
+ return false;
+ }
+
+ // Now check if MI is actually a dependency that can be hoisted.
+
+ // We don't want to track transitive dependencies. We already know that
+ // MI is the only instruction that defines Reg, but we need to be sure
+ // that it does not use any registers that have been defined (trivially
+ // checked below by ensuring that there are no register uses), and that
+ // it is the only def for every register it defines (otherwise we could
+ // introduce a write-after-write hazard).
+ auto IsMIOperandSafe = [&](MachineOperand &MO) {
+ if (!MO.isReg() || !MO.getReg())
+ return true;
+ if (MO.isUse())
+ return false;
+ assert((!MO.isDef() || RegDefs.count(MO.getReg())) &&
+ "All defs must be tracked in RegDefs by now!");
+ return !MO.isDef() || RegDefs.find(MO.getReg())->second == MI;
+ };
+
+ if (!all_of(MI->operands(), IsMIOperandSafe))
+ return false;
+
+ // Now check for speculation safety:
+ bool SawStore = true;
+ if (!MI->isSafeToMove(&AA, SawStore) || MI->mayLoad())
+ return false;
+
+ Dependency = MI;
+ }
if (MO.isDef())
for (unsigned Reg : RegUses)
@@ -206,6 +293,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getRegInfo().getTargetRegisterInfo();
MMI = &MF.getMMI();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SmallVector<NullCheck, 16> NullCheckList;
@@ -218,6 +306,16 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
return !NullCheckList.empty();
}
+// Return true if any register aliasing \p Reg is a live-in of \p MBB.
+static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
+ MachineBasicBlock *MBB, unsigned Reg) {
+ for (MCRegAliasIterator AR(Reg, TRI, /*IncludeSelf*/ true); AR.isValid();
+ ++AR)
+ if (MBB->isLiveIn(*AR))
+ return true;
+ return false;
+}
+
/// Analyze MBB to check if its terminating branch can be turned into an
/// implicit null check. If yes, append a description of the said null check to
/// NullCheckList and return true, else return false.
@@ -234,7 +332,7 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
MachineBranchPredicate MBP;
- if (TII->AnalyzeBranchPredicate(MBB, MBP, true))
+ if (TII->analyzeBranchPredicate(MBB, MBP, true))
return false;
// Is the predicate comparing an integer to zero?
@@ -319,22 +417,59 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
unsigned PointerReg = MBP.LHS.getReg();
- HazardDetector HD(*TRI);
+ HazardDetector HD(*TRI, *AA);
for (auto MII = NotNullSucc->begin(), MIE = NotNullSucc->end(); MII != MIE;
++MII) {
- MachineInstr *MI = &*MII;
- unsigned BaseReg, Offset;
+ MachineInstr &MI = *MII;
+ unsigned BaseReg;
+ int64_t Offset;
+ MachineInstr *Dependency = nullptr;
if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
- if (MI->mayLoad() && !MI->isPredicable() && BaseReg == PointerReg &&
- Offset < PageSize && MI->getDesc().getNumDefs() <= 1 &&
- HD.isSafeToHoist(MI)) {
- NullCheckList.emplace_back(MI, MBP.ConditionDef, &MBB, NotNullSucc,
- NullSucc);
- return true;
+ if (MI.mayLoad() && !MI.isPredicable() && BaseReg == PointerReg &&
+ Offset < PageSize && MI.getDesc().getNumDefs() <= 1 &&
+ HD.isSafeToHoist(&MI, Dependency)) {
+
+ auto DependencyOperandIsOk = [&](MachineOperand &MO) {
+ assert(!(MO.isReg() && MO.isUse()) &&
+ "No transitive dependendencies please!");
+ if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+ return true;
+
+ // Make sure that we won't clobber any live ins to the sibling block
+ // by hoisting Dependency. For instance, we can't hoist INST to
+ // before the null check (even if it is safe and does not violate any
+ // dependencies in the non_null_block) if %rdx is live in to
+ // _null_block.
+ //
+ // test %rcx, %rcx
+ // je _null_block
+ // _non_null_block:
+ // %rdx<def> = INST
+ // ...
+ if (AnyAliasLiveIn(TRI, NullSucc, MO.getReg()))
+ return false;
+
+ // Make sure Dependency isn't re-defining the base register. Otherwise we
+ // won't get the memory operation on the address we want.
+ if (TRI->regsOverlap(MO.getReg(), BaseReg))
+ return false;
+
+ return true;
+ };
+
+ bool DependencyOperandsAreOk =
+ !Dependency ||
+ all_of(Dependency->operands(), DependencyOperandIsOk);
+
+ if (DependencyOperandsAreOk) {
+ NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
+ NullSucc, Dependency);
+ return true;
+ }
}
- HD.rememberInstruction(MI);
+ HD.rememberInstruction(&MI);
if (HD.isClobbered())
return false;
}
@@ -344,11 +479,12 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI
-/// (defining the same register), and branches to HandlerLabel if the load
+/// (defining the same register), and branches to HandlerMBB if the load
/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
-MachineInstr *ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
- MachineBasicBlock *MBB,
- MCSymbol *HandlerLabel) {
+MachineInstr *
+ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB) {
const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
// all targets.
@@ -364,7 +500,7 @@ MachineInstr *ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
}
auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
- .addSym(HandlerLabel)
+ .addMBB(HandlerMBB)
.addImm(LoadMI->getOpcode());
for (auto &MO : LoadMI->uses())
@@ -381,28 +517,51 @@ void ImplicitNullChecks::rewriteNullChecks(
DebugLoc DL;
for (auto &NC : NullCheckList) {
- MCSymbol *HandlerLabel = MMI->getContext().createTempSymbol();
-
// Remove the conditional branch dependent on the null check.
- unsigned BranchesRemoved = TII->RemoveBranch(*NC.CheckBlock);
+ unsigned BranchesRemoved = TII->RemoveBranch(*NC.getCheckBlock());
(void)BranchesRemoved;
assert(BranchesRemoved > 0 && "expected at least one branch!");
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ DepMI->removeFromParent();
+ NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
+ }
+
// Insert a faulting load where the conditional branch was originally. The
// check we did earlier ensures that this bit of code motion is legal. We do not
// touch the successors list for any basic block since we haven't changed
// control flow, we've just made it implicit.
- insertFaultingLoad(NC.MemOperation, NC.CheckBlock, HandlerLabel);
- NC.MemOperation->eraseFromParent();
- NC.CheckOperation->eraseFromParent();
+ MachineInstr *FaultingLoad = insertFaultingLoad(
+ NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
+ // Now the values defined by MemOperation, if any, are live-ins of
+ // MemOperation's block.
+ // The original load operation may define implicit-defs alongside
+ // the loaded value.
+ MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
+ for (const MachineOperand &MO : FaultingLoad->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || MBB->isLiveIn(Reg))
+ continue;
+ MBB->addLiveIn(Reg);
+ }
- // Insert an *unconditional* branch to not-null successor.
- TII->InsertBranch(*NC.CheckBlock, NC.NotNullSucc, nullptr, /*Cond=*/None,
- DL);
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ for (auto &MO : DepMI->operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+ continue;
+ if (!NC.getNotNullSucc()->isLiveIn(MO.getReg()))
+ NC.getNotNullSucc()->addLiveIn(MO.getReg());
+ }
+ }
+
+ NC.getMemOperation()->eraseFromParent();
+ NC.getCheckOperation()->eraseFromParent();
- // Emit the HandlerLabel as an EH_LABEL.
- BuildMI(*NC.NullSucc, NC.NullSucc->begin(), DL,
- TII->get(TargetOpcode::EH_LABEL)).addSym(HandlerLabel);
+ // Insert an *unconditional* branch to not-null successor.
+ TII->InsertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
+ /*Cond=*/None, DL);
NumImplicitNullChecks++;
}
@@ -412,5 +571,6 @@ char ImplicitNullChecks::ID = 0;
char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID;
INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks",
"Implicit null checks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks",
"Implicit null checks", false, false)
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index e31013266bc7..197db777dd2c 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "Spiller.h"
+#include "SplitKit.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
@@ -30,6 +32,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -48,13 +51,82 @@ STATISTIC(NumReloadsRemoved, "Number of reloads removed");
STATISTIC(NumFolded, "Number of folded stack accesses");
STATISTIC(NumFoldedLoads, "Number of folded loads");
STATISTIC(NumRemats, "Number of rematerialized defs for spilling");
-STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
-STATISTIC(NumHoists, "Number of hoisted spills");
static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
cl::desc("Disable inline spill hoisting"));
namespace {
+class HoistSpillHelper : private LiveRangeEdit::Delegate {
+ MachineFunction &MF;
+ LiveIntervals &LIS;
+ LiveStacks &LSS;
+ AliasAnalysis *AA;
+ MachineDominatorTree &MDT;
+ MachineLoopInfo &Loops;
+ VirtRegMap &VRM;
+ MachineFrameInfo &MFI;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+ InsertPointAnalysis IPA;
+
+ // Map from StackSlot to its original register.
+ DenseMap<int, unsigned> StackSlotToReg;
+ // Map from pair of (StackSlot and Original VNI) to a set of spills which
+ // have the same stackslot and have equal values defined by Original VNI.
+ // These spills are mergeable and are hoist candidates.
+ typedef MapVector<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>
+ MergeableSpillsMap;
+ MergeableSpillsMap MergeableSpills;
+
+ /// This is the map from original register to a set containing all its
+ /// siblings. To hoist a spill to another BB, we need to find a live
+ /// sibling there and use it as the source of the new spill.
+ DenseMap<unsigned, SmallSetVector<unsigned, 16>> Virt2SiblingsMap;
+
+ bool isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI, MachineBasicBlock &BB,
+ unsigned &LiveReg);
+
+ void rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void runHoistSpills(unsigned OrigReg, VNInfo &OrigVNI,
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+ HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf,
+ VirtRegMap &vrm)
+ : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+ LSS(pass.getAnalysis<LiveStacks>()),
+ AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+ MDT(pass.getAnalysis<MachineDominatorTree>()),
+ Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+ MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+ TII(*mf.getSubtarget().getInstrInfo()),
+ TRI(*mf.getSubtarget().getRegisterInfo()),
+ MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ IPA(LIS, mf.getNumBlockIDs()) {}
+
+ void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original);
+ bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
+ void hoistAllSpills();
+ void LRE_DidCloneVirtReg(unsigned, unsigned) override;
+};
+
class InlineSpiller : public Spiller {
MachineFunction &MF;
LiveIntervals &LIS;
@@ -85,56 +157,12 @@ class InlineSpiller : public Spiller {
// Values that failed to remat at some point.
SmallPtrSet<VNInfo*, 8> UsedValues;
-public:
- // Information about a value that was defined by a copy from a sibling
- // register.
- struct SibValueInfo {
- // True when all reaching defs were reloads: No spill is necessary.
- bool AllDefsAreReloads;
-
- // True when value is defined by an original PHI not from splitting.
- bool DefByOrigPHI;
-
- // True when the COPY defining this value killed its source.
- bool KillsSource;
-
- // The preferred register to spill.
- unsigned SpillReg;
-
- // The value of SpillReg that should be spilled.
- VNInfo *SpillVNI;
-
- // The block where SpillVNI should be spilled. Currently, this must be the
- // block containing SpillVNI->def.
- MachineBasicBlock *SpillMBB;
-
- // A defining instruction that is not a sibling copy or a reload, or NULL.
- // This can be used as a template for rematerialization.
- MachineInstr *DefMI;
-
- // List of values that depend on this one. These values are actually the
- // same, but live range splitting has placed them in different registers,
- // or SSA update needed to insert PHI-defs to preserve SSA form. This is
- // copies of the current value and phi-kills. Usually only phi-kills cause
- // more than one dependent value.
- TinyPtrVector<VNInfo*> Deps;
-
- SibValueInfo(unsigned Reg, VNInfo *VNI)
- : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
- SpillReg(Reg), SpillVNI(VNI), SpillMBB(nullptr), DefMI(nullptr) {}
-
- // Returns true when a def has been found.
- bool hasDef() const { return DefByOrigPHI || DefMI; }
- };
-
-private:
- // Values in RegsToSpill defined by sibling copies.
- typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
- SibValueMap SibValues;
-
// Dead defs generated during spilling.
SmallVector<MachineInstr*, 8> DeadDefs;
+ // Object that records spill information and does the hoisting.
+ HoistSpillHelper HSpiller;
+
~InlineSpiller() override {}
public:
@@ -147,9 +175,11 @@ public:
MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
TII(*mf.getSubtarget().getInstrInfo()),
TRI(*mf.getSubtarget().getRegisterInfo()),
- MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+ MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ HSpiller(pass, mf, vrm) {}
void spill(LiveRangeEdit &) override;
+ void postOptimization() override;
private:
bool isSnippet(const LiveInterval &SnipLI);
@@ -161,15 +191,11 @@ private:
}
bool isSibling(unsigned Reg);
- MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
- void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = nullptr);
- void analyzeSiblingValues();
-
- bool hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI);
+ bool hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstr &CopyMI);
void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
void markValueUsed(LiveInterval*, VNInfo*);
- bool reMaterializeFor(LiveInterval&, MachineBasicBlock::iterator MI);
+ bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
void reMaterializeAll();
bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
@@ -210,13 +236,13 @@ Spiller *createInlineSpiller(MachineFunctionPass &pass,
/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
/// otherwise return 0.
-static unsigned isFullCopyOf(const MachineInstr *MI, unsigned Reg) {
- if (!MI->isFullCopy())
+static unsigned isFullCopyOf(const MachineInstr &MI, unsigned Reg) {
+ if (!MI.isFullCopy())
return 0;
- if (MI->getOperand(0).getReg() == Reg)
- return MI->getOperand(1).getReg();
- if (MI->getOperand(1).getReg() == Reg)
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(0).getReg() == Reg)
+ return MI.getOperand(1).getReg();
+ if (MI.getOperand(1).getReg() == Reg)
+ return MI.getOperand(0).getReg();
return 0;
}
@@ -242,7 +268,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
for (MachineRegisterInfo::reg_instr_nodbg_iterator
RI = MRI.reg_instr_nodbg_begin(SnipLI.reg),
E = MRI.reg_instr_nodbg_end(); RI != E; ) {
- MachineInstr *MI = &*(RI++);
+ MachineInstr &MI = *RI++;
// Allow copies to/from Reg.
if (isFullCopyOf(MI, Reg))
@@ -258,9 +284,9 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
continue;
// Allow a single additional instruction.
- if (UseMI && MI != UseMI)
+ if (UseMI && &MI != UseMI)
return false;
- UseMI = MI;
+ UseMI = &MI;
}
return true;
}
@@ -281,14 +307,14 @@ void InlineSpiller::collectRegsToSpill() {
for (MachineRegisterInfo::reg_instr_iterator
RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) {
- MachineInstr *MI = &*(RI++);
+ MachineInstr &MI = *RI++;
unsigned SnipReg = isFullCopyOf(MI, Reg);
if (!isSibling(SnipReg))
continue;
LiveInterval &SnipLI = LIS.getInterval(SnipReg);
if (!isSnippet(SnipLI))
continue;
- SnippetCopies.insert(MI);
+ SnippetCopies.insert(&MI);
if (isRegToSpill(SnipReg))
continue;
RegsToSpill.push_back(SnipReg);
@@ -297,418 +323,46 @@ void InlineSpiller::collectRegsToSpill() {
}
}
-
-//===----------------------------------------------------------------------===//
-// Sibling Values
-//===----------------------------------------------------------------------===//
-
-// After live range splitting, some values to be spilled may be defined by
-// copies from sibling registers. We trace the sibling copies back to the
-// original value if it still exists. We need it for rematerialization.
-//
-// Even when the value can't be rematerialized, we still want to determine if
-// the value has already been spilled, or we may want to hoist the spill from a
-// loop.
-
bool InlineSpiller::isSibling(unsigned Reg) {
return TargetRegisterInfo::isVirtualRegister(Reg) &&
VRM.getOriginal(Reg) == Original;
}
-#ifndef NDEBUG
-static raw_ostream &operator<<(raw_ostream &OS,
- const InlineSpiller::SibValueInfo &SVI) {
- OS << "spill " << PrintReg(SVI.SpillReg) << ':'
- << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def;
- if (SVI.SpillMBB)
- OS << " in BB#" << SVI.SpillMBB->getNumber();
- if (SVI.AllDefsAreReloads)
- OS << " all-reloads";
- if (SVI.DefByOrigPHI)
- OS << " orig-phi";
- if (SVI.KillsSource)
- OS << " kill";
- OS << " deps[";
- for (VNInfo *Dep : SVI.Deps)
- OS << ' ' << Dep->id << '@' << Dep->def;
- OS << " ]";
- if (SVI.DefMI)
- OS << " def: " << *SVI.DefMI;
- else
- OS << '\n';
- return OS;
-}
-#endif
-
-/// propagateSiblingValue - Propagate the value in SVI to dependents if it is
-/// known. Otherwise remember the dependency for later.
+/// It is beneficial to spill to an earlier place in the same BB when
+/// there is an alternative def earlier in the same MBB. Hoisting the
+/// spill as far as possible in SpillMBB can ease register pressure:
///
-/// @param SVIIter SibValues entry to propagate.
-/// @param VNI Dependent value, or NULL to propagate to all saved dependents.
-void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter,
- VNInfo *VNI) {
- SibValueMap::value_type *SVI = &*SVIIter;
-
- // When VNI is non-NULL, add it to SVI's deps, and only propagate to that.
- TinyPtrVector<VNInfo*> FirstDeps;
- if (VNI) {
- FirstDeps.push_back(VNI);
- SVI->second.Deps.push_back(VNI);
- }
-
- // Has the value been completely determined yet? If not, defer propagation.
- if (!SVI->second.hasDef())
- return;
-
- // Work list of values to propagate.
- SmallSetVector<SibValueMap::value_type *, 8> WorkList;
- WorkList.insert(SVI);
-
- do {
- SVI = WorkList.pop_back_val();
- TinyPtrVector<VNInfo*> *Deps = VNI ? &FirstDeps : &SVI->second.Deps;
- VNI = nullptr;
-
- SibValueInfo &SV = SVI->second;
- if (!SV.SpillMBB)
- SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def);
-
- DEBUG(dbgs() << " prop to " << Deps->size() << ": "
- << SVI->first->id << '@' << SVI->first->def << ":\t" << SV);
-
- assert(SV.hasDef() && "Propagating undefined value");
-
- // Should this value be propagated as a preferred spill candidate? We don't
- // propagate values of registers that are about to spill.
- bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg);
- unsigned SpillDepth = ~0u;
-
- for (VNInfo *Dep : *Deps) {
- SibValueMap::iterator DepSVI = SibValues.find(Dep);
- assert(DepSVI != SibValues.end() && "Dependent value not in SibValues");
- SibValueInfo &DepSV = DepSVI->second;
- if (!DepSV.SpillMBB)
- DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def);
-
- bool Changed = false;
-
- // Propagate defining instruction.
- if (!DepSV.hasDef()) {
- Changed = true;
- DepSV.DefMI = SV.DefMI;
- DepSV.DefByOrigPHI = SV.DefByOrigPHI;
- }
-
- // Propagate AllDefsAreReloads. For PHI values, this computes an AND of
- // all predecessors.
- if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) {
- Changed = true;
- DepSV.AllDefsAreReloads = false;
- }
-
- // Propagate best spill value.
- if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) {
- if (SV.SpillMBB == DepSV.SpillMBB) {
- // DepSV is in the same block. Hoist when dominated.
- if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) {
- // This is an alternative def earlier in the same MBB.
- // Hoist the spill as far as possible in SpillMBB. This can ease
- // register pressure:
- //
- // x = def
- // y = use x
- // s = copy x
- //
- // Hoisting the spill of s to immediately after the def removes the
- // interference between x and y:
- //
- // x = def
- // spill x
- // y = use x<kill>
- //
- // This hoist only helps when the DepSV copy kills its source.
- Changed = true;
- DepSV.SpillReg = SV.SpillReg;
- DepSV.SpillVNI = SV.SpillVNI;
- DepSV.SpillMBB = SV.SpillMBB;
- }
- } else {
- // DepSV is in a different block.
- if (SpillDepth == ~0u)
- SpillDepth = Loops.getLoopDepth(SV.SpillMBB);
-
- // Also hoist spills to blocks with smaller loop depth, but make sure
- // that the new value dominates. Non-phi dependents are always
- // dominated, phis need checking.
-
- const BranchProbability MarginProb(4, 5); // 80%
- // Hoist a spill to outer loop if there are multiple dependents (it
- // can be beneficial if more than one dependents are hoisted) or
- // if DepSV (the hoisting source) is hotter than SV (the hoisting
- // destination) (we add a 80% margin to bias a little towards
- // loop depth).
- bool HoistCondition =
- (MBFI.getBlockFreq(DepSV.SpillMBB) >=
- (MBFI.getBlockFreq(SV.SpillMBB) * MarginProb)) ||
- Deps->size() > 1;
-
- if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) &&
- HoistCondition &&
- (!DepSVI->first->isPHIDef() ||
- MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) {
- Changed = true;
- DepSV.SpillReg = SV.SpillReg;
- DepSV.SpillVNI = SV.SpillVNI;
- DepSV.SpillMBB = SV.SpillMBB;
- }
- }
- }
-
- if (!Changed)
- continue;
-
- // Something changed in DepSVI. Propagate to dependents.
- WorkList.insert(&*DepSVI);
-
- DEBUG(dbgs() << " update " << DepSVI->first->id << '@'
- << DepSVI->first->def << " to:\t" << DepSV);
- }
- } while (!WorkList.empty());
-}
-
-/// traceSiblingValue - Trace a value that is about to be spilled back to the
-/// real defining instructions by looking through sibling copies. Always stay
-/// within the range of OrigVNI so the registers are known to carry the same
-/// value.
+/// x = def
+/// y = use x
+/// s = copy x
///
-/// Determine if the value is defined by all reloads, so spilling isn't
-/// necessary - the value is already in the stack slot.
+/// Hoisting the spill of s to immediately after the def removes the
+/// interference between x and y:
///
-/// Return a defining instruction that may be a candidate for rematerialization.
+/// x = def
+/// spill x
+/// y = use x<kill>
///
-MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
- VNInfo *OrigVNI) {
- // Check if a cached value already exists.
- SibValueMap::iterator SVI;
- bool Inserted;
- std::tie(SVI, Inserted) =
- SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI)));
- if (!Inserted) {
- DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':'
- << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second);
- return SVI->second.DefMI;
- }
-
- DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':'
- << UseVNI->id << '@' << UseVNI->def << '\n');
-
- // List of (Reg, VNI) that have been inserted into SibValues, but need to be
- // processed.
- SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
- WorkList.push_back(std::make_pair(UseReg, UseVNI));
-
- LiveInterval &OrigLI = LIS.getInterval(Original);
- do {
- unsigned Reg;
- VNInfo *VNI;
- std::tie(Reg, VNI) = WorkList.pop_back_val();
- DEBUG(dbgs() << " " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def
- << ":\t");
-
- // First check if this value has already been computed.
- SVI = SibValues.find(VNI);
- assert(SVI != SibValues.end() && "Missing SibValues entry");
-
- // Trace through PHI-defs created by live range splitting.
- if (VNI->isPHIDef()) {
- // Stop at original PHIs. We don't know the value at the
- // predecessors. Look up the VNInfo for the current definition
- // in OrigLI, to properly determine whether or not this phi was
- // added by splitting.
- if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) {
- DEBUG(dbgs() << "orig phi value\n");
- SVI->second.DefByOrigPHI = true;
- SVI->second.AllDefsAreReloads = false;
- propagateSiblingValue(SVI);
- continue;
- }
-
- // This is a PHI inserted by live range splitting. We could trace the
- // live-out value from predecessor blocks, but that search can be very
- // expensive if there are many predecessors and many more PHIs as
- // generated by tail-dup when it sees an indirectbr. Instead, look at
- // all the non-PHI defs that have the same value as OrigVNI. They must
- // jointly dominate VNI->def. This is not optimal since VNI may actually
- // be jointly dominated by a smaller subset of defs, so there is a change
- // we will miss a AllDefsAreReloads optimization.
-
- // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
- SmallVector<VNInfo*, 8> PHIs, NonPHIs;
- LiveInterval &LI = LIS.getInterval(Reg);
-
- for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
- VI != VE; ++VI) {
- VNInfo *VNI2 = *VI;
- if (VNI2->isUnused())
- continue;
- if (!OrigLI.containsOneValue() &&
- OrigLI.getVNInfoAt(VNI2->def) != OrigVNI)
- continue;
- if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def)
- PHIs.push_back(VNI2);
- else
- NonPHIs.push_back(VNI2);
- }
- DEBUG(dbgs() << "split phi value, checking " << PHIs.size()
- << " phi-defs, and " << NonPHIs.size()
- << " non-phi/orig defs\n");
-
- // Create entries for all the PHIs. Don't add them to the worklist, we
- // are processing all of them in one go here.
- for (VNInfo *PHI : PHIs)
- SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI)));
-
- // Add every PHI as a dependent of all the non-PHIs.
- for (VNInfo *NonPHI : NonPHIs) {
- // Known value? Try an insertion.
- std::tie(SVI, Inserted) =
- SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
- // Add all the PHIs as dependents of NonPHI.
- SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(),
- PHIs.end());
- // This is the first time we see NonPHI, add it to the worklist.
- if (Inserted)
- WorkList.push_back(std::make_pair(Reg, NonPHI));
- else
- // Propagate to all inserted PHIs, not just VNI.
- propagateSiblingValue(SVI);
- }
-
- // Next work list item.
- continue;
- }
-
- MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
- assert(MI && "Missing def");
-
- // Trace through sibling copies.
- if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
- if (isSibling(SrcReg)) {
- LiveInterval &SrcLI = LIS.getInterval(SrcReg);
- LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
- assert(SrcQ.valueIn() && "Copy from non-existing value");
- // Check if this COPY kills its source.
- SVI->second.KillsSource = SrcQ.isKill();
- VNInfo *SrcVNI = SrcQ.valueIn();
- DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':'
- << SrcVNI->id << '@' << SrcVNI->def
- << " kill=" << unsigned(SVI->second.KillsSource) << '\n');
- // Known sibling source value? Try an insertion.
- std::tie(SVI, Inserted) = SibValues.insert(
- std::make_pair(SrcVNI, SibValueInfo(SrcReg, SrcVNI)));
- // This is the first time we see Src, add it to the worklist.
- if (Inserted)
- WorkList.push_back(std::make_pair(SrcReg, SrcVNI));
- propagateSiblingValue(SVI, VNI);
- // Next work list item.
- continue;
- }
- }
-
- // Track reachable reloads.
- SVI->second.DefMI = MI;
- SVI->second.SpillMBB = MI->getParent();
- int FI;
- if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) {
- DEBUG(dbgs() << "reload\n");
- propagateSiblingValue(SVI);
- // Next work list item.
- continue;
- }
-
- // Potential remat candidate.
- DEBUG(dbgs() << "def " << *MI);
- SVI->second.AllDefsAreReloads = false;
- propagateSiblingValue(SVI);
- } while (!WorkList.empty());
-
- // Look up the value we were looking for. We already did this lookup at the
- // top of the function, but SibValues may have been invalidated.
- SVI = SibValues.find(UseVNI);
- assert(SVI != SibValues.end() && "Didn't compute requested info");
- DEBUG(dbgs() << " traced to:\t" << SVI->second);
- return SVI->second.DefMI;
-}
-
-/// analyzeSiblingValues - Trace values defined by sibling copies back to
-/// something that isn't a sibling copy.
+/// This hoist only helps when the copy kills its source.
///
-/// Keep track of values that may be rematerializable.
-void InlineSpiller::analyzeSiblingValues() {
- SibValues.clear();
-
- // No siblings at all?
- if (Edit->getReg() == Original)
- return;
-
- LiveInterval &OrigLI = LIS.getInterval(Original);
- for (unsigned Reg : RegsToSpill) {
- LiveInterval &LI = LIS.getInterval(Reg);
- for (LiveInterval::const_vni_iterator VI = LI.vni_begin(),
- VE = LI.vni_end(); VI != VE; ++VI) {
- VNInfo *VNI = *VI;
- if (VNI->isUnused())
- continue;
- MachineInstr *DefMI = nullptr;
- if (!VNI->isPHIDef()) {
- DefMI = LIS.getInstructionFromIndex(VNI->def);
- assert(DefMI && "No defining instruction");
- }
- // Check possible sibling copies.
- if (VNI->isPHIDef() || DefMI->isCopy()) {
- VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
- assert(OrigVNI && "Def outside original live range");
- if (OrigVNI->def != VNI->def)
- DefMI = traceSiblingValue(Reg, VNI, OrigVNI);
- }
- if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) {
- DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@'
- << VNI->def << " may remat from " << *DefMI);
- }
- }
- }
-}
-
-/// hoistSpill - Given a sibling copy that defines a value to be spilled, insert
-/// a spill at a better location.
-bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
+bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
+ MachineInstr &CopyMI) {
SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
+#ifndef NDEBUG
VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot());
assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
- SibValueMap::iterator I = SibValues.find(VNI);
- if (I == SibValues.end())
- return false;
-
- const SibValueInfo &SVI = I->second;
+#endif
- // Let the normal folding code deal with the boring case.
- if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI)
+ unsigned SrcReg = CopyMI.getOperand(1).getReg();
+ LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+ VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx);
+ LiveQueryResult SrcQ = SrcLI.Query(Idx);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(SrcVNI->def);
+ if (DefMBB != CopyMI.getParent() || !SrcQ.isKill())
return false;
- // SpillReg may have been deleted by remat and DCE.
- if (!LIS.hasInterval(SVI.SpillReg)) {
- DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n');
- SibValues.erase(I);
- return false;
- }
-
- LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg);
- if (!SibLI.containsValue(SVI.SpillVNI)) {
- DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n');
- SibValues.erase(I);
- return false;
- }
-
// Conservatively extend the stack slot range to the range of the original
// value. We may be able to do better with stack slot coloring by being more
// careful here.
@@ -719,35 +373,29 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
<< *StackInt << '\n');
- // Already spilled everywhere.
- if (SVI.AllDefsAreReloads) {
- DEBUG(dbgs() << "\tno spill needed: " << SVI);
- ++NumOmitReloadSpill;
- return true;
- }
- // We are going to spill SVI.SpillVNI immediately after its def, so clear out
+ // We are going to spill SrcVNI immediately after its def, so clear out
// any later spills of the same value.
- eliminateRedundantSpills(SibLI, SVI.SpillVNI);
+ eliminateRedundantSpills(SrcLI, SrcVNI);
- MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def);
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
MachineBasicBlock::iterator MII;
- if (SVI.SpillVNI->isPHIDef())
+ if (SrcVNI->isPHIDef())
MII = MBB->SkipPHIsAndLabels(MBB->begin());
else {
- MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def);
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
assert(DefMI && "Defining instruction disappeared");
MII = DefMI;
++MII;
}
// Insert spill without kill flag immediately after def.
- TII.storeRegToStackSlot(*MBB, MII, SVI.SpillReg, false, StackSlot,
- MRI.getRegClass(SVI.SpillReg), &TRI);
+ TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
+ MRI.getRegClass(SrcReg), &TRI);
--MII; // Point to store instruction.
- LIS.InsertMachineInstrInMaps(MII);
- DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
+ LIS.InsertMachineInstrInMaps(*MII);
+ DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
+ HSpiller.addToMergeableSpills(*MII, StackSlot, Original);
++NumSpills;
- ++NumHoists;
return true;
}
@@ -778,8 +426,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
for (MachineRegisterInfo::use_instr_nodbg_iterator
UI = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
UI != E; ) {
- MachineInstr *MI = &*(UI++);
- if (!MI->isCopy() && !MI->mayStore())
+ MachineInstr &MI = *UI++;
+ if (!MI.isCopy() && !MI.mayStore())
continue;
SlotIndex Idx = LIS.getInstructionIndex(MI);
if (LI->getVNInfoAt(Idx) != VNI)
@@ -800,12 +448,13 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
// Erase spills.
int FI;
if (Reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot) {
- DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << *MI);
+ DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << MI);
// eliminateDeadDefs won't normally remove stores, so switch opcode.
- MI->setDesc(TII.get(TargetOpcode::KILL));
- DeadDefs.push_back(MI);
+ MI.setDesc(TII.get(TargetOpcode::KILL));
+ DeadDefs.push_back(&MI);
++NumSpillsRemoved;
- --NumSpills;
+ if (HSpiller.rmFromMergeableSpills(MI, StackSlot))
+ --NumSpills;
}
}
} while (!WorkList.empty());
@@ -849,13 +498,12 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
}
/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
-bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
- MachineBasicBlock::iterator MI) {
+bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
// Analyze instruction
SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
MIBundleOperands::VirtRegInfo RI =
- MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
+ MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
if (!RI.Reads)
return false;
@@ -865,26 +513,26 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
if (!ParentVNI) {
DEBUG(dbgs() << "\tadding <undef> flags: ");
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg)
MO.setIsUndef();
}
- DEBUG(dbgs() << UseIdx << '\t' << *MI);
+ DEBUG(dbgs() << UseIdx << '\t' << MI);
return true;
}
- if (SnippetCopies.count(MI))
+ if (SnippetCopies.count(&MI))
return false;
- // Use an OrigVNI from traceSiblingValue when ParentVNI is a sibling copy.
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
LiveRangeEdit::Remat RM(ParentVNI);
- SibValueMap::const_iterator SibI = SibValues.find(ParentVNI);
- if (SibI != SibValues.end())
- RM.OrigMI = SibI->second.DefMI;
- if (!Edit->canRematerializeAt(RM, UseIdx, false)) {
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+ if (!Edit->canRematerializeAt(RM, OrigVNI, UseIdx, false)) {
markValueUsed(&VirtReg, ParentVNI);
- DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI);
+ DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
return false;
}
@@ -892,7 +540,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
// same register for uses and defs.
if (RI.Tied) {
markValueUsed(&VirtReg, ParentVNI);
- DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI);
+ DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << MI);
return false;
}
@@ -909,8 +557,8 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
unsigned NewVReg = Edit->createFrom(Original);
// Finally we can rematerialize OrigMI before MI.
- SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewVReg, RM,
- TRI);
+ SlotIndex DefIdx =
+ Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
(void)DefIdx;
DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
<< *LIS.getInstructionFromIndex(DefIdx));
@@ -923,7 +571,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
MO.setIsKill();
}
}
- DEBUG(dbgs() << "\t " << UseIdx << '\t' << *MI << '\n');
+ DEBUG(dbgs() << "\t " << UseIdx << '\t' << MI << '\n');
++NumRemats;
return true;
@@ -932,7 +580,6 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
/// reMaterializeAll - Try to rematerialize as many uses as possible,
/// and trim the live ranges after.
void InlineSpiller::reMaterializeAll() {
- // analyzeSiblingValues has already tested all relevant defining instructions.
if (!Edit->anyRematerializable(AA))
return;
@@ -945,10 +592,10 @@ void InlineSpiller::reMaterializeAll() {
for (MachineRegisterInfo::reg_bundle_iterator
RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
RegI != E; ) {
- MachineInstr *MI = &*(RegI++);
+ MachineInstr &MI = *RegI++;
// Debug values are not allowed to affect codegen.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
continue;
anyRemat |= reMaterializeFor(LI, MI);
@@ -979,20 +626,22 @@ void InlineSpiller::reMaterializeAll() {
if (DeadDefs.empty())
return;
DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
- Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
-
- // Get rid of deleted and empty intervals.
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+
+ // LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions
+ // after rematerialization. To remove a VNI for a vreg from its LiveInterval,
+ // LiveIntervals::removeVRegDefAt is used. However, after non-PHI VNIs are all
+ // removed, PHI VNIs are still left in the LiveInterval.
+ // So to get rid of an unused reg, we need to check whether it has non-debug
+ // references instead of whether it has a non-empty interval.
unsigned ResultPos = 0;
for (unsigned Reg : RegsToSpill) {
- if (!LIS.hasInterval(Reg))
- continue;
-
- LiveInterval &LI = LIS.getInterval(Reg);
- if (LI.empty()) {
+ if (MRI.reg_nodbg_empty(Reg)) {
Edit->eraseVirtReg(Reg);
continue;
}
-
+ assert((LIS.hasInterval(Reg) && !LIS.getInterval(Reg).empty()) &&
+ "Reg with empty interval has reference");
RegsToSpill[ResultPos++] = Reg;
}
RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end());
@@ -1007,17 +656,20 @@ void InlineSpiller::reMaterializeAll() {
/// If MI is a load or store of StackSlot, it can be removed.
bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
int FI = 0;
- unsigned InstrReg = TII.isLoadFromStackSlot(MI, FI);
+ unsigned InstrReg = TII.isLoadFromStackSlot(*MI, FI);
bool IsLoad = InstrReg;
if (!IsLoad)
- InstrReg = TII.isStoreToStackSlot(MI, FI);
+ InstrReg = TII.isStoreToStackSlot(*MI, FI);
// We have a stack access. Is it the right register and slot?
if (InstrReg != Reg || FI != StackSlot)
return false;
+ if (!IsLoad)
+ HSpiller.rmFromMergeableSpills(*MI, StackSlot);
+
DEBUG(dbgs() << "Coalescing stack access: " << *MI);
- LIS.RemoveMachineInstrFromMaps(MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
if (IsLoad) {
@@ -1049,7 +701,7 @@ static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
dbgs() << '\t' << header << ": " << NextLine;
for (MachineBasicBlock::iterator I = B; I != E; ++I) {
- SlotIndex Idx = LIS.getInstructionIndex(I).getRegSlot();
+ SlotIndex Idx = LIS.getInstructionIndex(*I).getRegSlot();
// If a register was passed in and this instruction has it as a
// destination that is marked as an early clobber, print the
@@ -1113,13 +765,13 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
MachineInstrSpan MIS(MI);
MachineInstr *FoldMI =
- LoadMI ? TII.foldMemoryOperand(MI, FoldOps, LoadMI)
- : TII.foldMemoryOperand(MI, FoldOps, StackSlot);
+ LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
+ : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS);
if (!FoldMI)
return false;
// Remove LIS for any dead defs in the original MI not in FoldMI.
- for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
+ for (MIBundleOperands MO(*MI); MO.isValid(); ++MO) {
if (!MO->isReg())
continue;
unsigned Reg = MO->getReg();
@@ -1131,23 +783,27 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
if (MO->isUse())
continue;
MIBundleOperands::PhysRegInfo RI =
- MIBundleOperands(FoldMI).analyzePhysReg(Reg, &TRI);
+ MIBundleOperands(*FoldMI).analyzePhysReg(Reg, &TRI);
if (RI.FullyDefined)
continue;
// FoldMI does not define this physreg. Remove the LI segment.
assert(MO->isDead() && "Cannot fold physreg def");
- SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
LIS.removePhysRegDefAt(Reg, Idx);
}
- LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
+ int FI;
+ if (TII.isStoreToStackSlot(*MI, FI) &&
+ HSpiller.rmFromMergeableSpills(*MI, FI))
+ --NumSpills;
+ LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
MI->eraseFromParent();
// Insert any new instructions other than FoldMI into the LIS maps.
assert(!MIS.empty() && "Unexpected empty span of instructions!");
for (MachineInstr &MI : MIS)
if (&MI != FoldMI)
- LIS.InsertMachineInstrInMaps(&MI);
+ LIS.InsertMachineInstrInMaps(MI);
// TII.foldMemoryOperand may have left some implicit operands on the
// instruction. Strip them.
@@ -1165,9 +821,10 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
if (!WasCopy)
++NumFolded;
- else if (Ops.front().second == 0)
+ else if (Ops.front().second == 0) {
++NumSpills;
- else
+ HSpiller.addToMergeableSpills(*FoldMI, StackSlot, Original);
+ } else
++NumReloads;
return true;
}
@@ -1202,6 +859,7 @@ void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
"spill"));
++NumSpills;
+ HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
}
/// spillAroundUses - insert spill code around each use of Reg.
@@ -1246,17 +904,17 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Analyze instruction.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
MIBundleOperands::VirtRegInfo RI =
- MIBundleOperands(MI).analyzeVirtReg(Reg, &Ops);
+ MIBundleOperands(*MI).analyzeVirtReg(Reg, &Ops);
// Find the slot index where this instruction reads and writes OldLI.
// This is usually the def slot, except for tied early clobbers.
- SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true)))
if (SlotIndex::isSameInstr(Idx, VNI->def))
Idx = VNI->def;
// Check for a sibling copy.
- unsigned SibReg = isFullCopyOf(MI, Reg);
+ unsigned SibReg = isFullCopyOf(*MI, Reg);
if (SibReg && isSibling(SibReg)) {
// This may actually be a copy between snippets.
if (isRegToSpill(SibReg)) {
@@ -1265,8 +923,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
continue;
}
if (RI.Writes) {
- // Hoist the spill of a sib-reg copy.
- if (hoistSpill(OldLI, MI)) {
+ if (hoistSpillInsideBB(OldLI, *MI)) {
// This COPY is now dead, the value is already in the stack slot.
MI->getOperand(0).setIsDead();
DeadDefs.push_back(MI);
@@ -1339,7 +996,7 @@ void InlineSpiller::spillAll() {
// Hoisted spills may cause dead code.
if (!DeadDefs.empty()) {
DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
- Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
}
// Finally delete the SnippetCopies.
@@ -1347,11 +1004,11 @@ void InlineSpiller::spillAll() {
for (MachineRegisterInfo::reg_instr_iterator
RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end();
RI != E; ) {
- MachineInstr *MI = &*(RI++);
- assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
+ MachineInstr &MI = *(RI++);
+ assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy");
// FIXME: Do this with a LiveRangeEdit callback.
LIS.RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
+ MI.eraseFromParent();
}
}
@@ -1379,7 +1036,6 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
collectRegsToSpill();
- analyzeSiblingValues();
reMaterializeAll();
// Remat may handle everything.
@@ -1388,3 +1044,413 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
Edit->calculateRegClassAndHint(MF, Loops, MBFI);
}
+
+/// Optimizations after all the reg selections and spills are done.
+///
+void InlineSpiller::postOptimization() { HSpiller.hoistAllSpills(); }
+
+/// When a spill is inserted, add the spill to the MergeableSpills map.
+///
+void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original) {
+ StackSlotToReg[StackSlot] = Original;
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ MergeableSpills[MIdx].insert(&Spill);
+}
+
+/// When a spill is removed, remove the spill from the MergeableSpills map.
+/// Return true if the spill is removed successfully.
+///
+bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr &Spill,
+ int StackSlot) {
+ int Original = StackSlotToReg[StackSlot];
+ if (!Original)
+ return false;
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ return MergeableSpills[MIdx].erase(&Spill);
+}
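The two helpers above key everything by the (stack slot, original value number) pair. A rough standalone analogue of that grouping, with plain STL containers and made-up types instead of MapVector, MachineInstr, and VNInfo:

#include <map>
#include <set>
#include <utility>

struct Spill;                          // stand-in for a spill MachineInstr
using SpillKey = std::pair<int, int>;  // (stack slot, value-number id)

// Spills sharing a key hold the same value and are candidates for merging.
std::map<SpillKey, std::set<Spill *>> MergeableSpills;

void addSpill(Spill *S, int Slot, int ValNo) {
  MergeableSpills[{Slot, ValNo}].insert(S);
}

bool removeSpill(Spill *S, int Slot, int ValNo) {
  return MergeableSpills[{Slot, ValNo}].erase(S) != 0;
}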
+
+/// Check BB to see if it is a possible target BB to place a hoisted spill,
+/// i.e., there should be a live sibling of OrigReg at the insert point.
+///
+bool HoistSpillHelper::isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI,
+ MachineBasicBlock &BB, unsigned &LiveReg) {
+ SlotIndex Idx;
+ LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+ MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, BB);
+ if (MI != BB.end())
+ Idx = LIS.getInstructionIndex(*MI);
+ else
+ Idx = LIS.getMBBEndIdx(&BB).getPrevSlot();
+ SmallSetVector<unsigned, 16> &Siblings = Virt2SiblingsMap[OrigReg];
+ assert((LIS.getInterval(OrigReg)).getVNInfoAt(Idx) == &OrigVNI &&
+ "Unexpected VNI");
+
+ for (auto const SibReg : Siblings) {
+ LiveInterval &LI = LIS.getInterval(SibReg);
+ VNInfo *VNI = LI.getVNInfoAt(Idx);
+ if (VNI) {
+ LiveReg = SibReg;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Remove redundant spills in the same BB. Save those redundant spills in
+/// SpillsToRm, and save the spill to keep and its BB in the SpillBBToSpill map.
+///
+void HoistSpillHelper::rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+ // For each spill seen, check SpillBBToSpill[] and see if its BB already has
+ // another spill inside. If a BB contains more than one spill, only keep the
+ // earlier spill with the smaller SlotIndex.
+ for (const auto CurrentSpill : Spills) {
+ MachineBasicBlock *Block = CurrentSpill->getParent();
+ MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+ MachineInstr *PrevSpill = SpillBBToSpill[Node];
+ if (PrevSpill) {
+ SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+ SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+ MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+ MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+ SpillsToRm.push_back(SpillToRm);
+ SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+ } else {
+ SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+ }
+ }
+ for (const auto SpillToRm : SpillsToRm)
+ Spills.erase(SpillToRm);
+}
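A minimal standalone sketch of the per-block deduplication performed above, with plain ints standing in for SlotIndex and basic-block identity (illustrative names only):

#include <map>
#include <vector>

struct Spill { int Index; int Block; }; // stand-ins for slot index / parent BB

// Keep only the earliest spill per block; every later one is redundant.
void dedupPerBlock(std::vector<Spill *> &Spills,
                   std::vector<Spill *> &Redundant) {
  std::map<int, Spill *> Earliest; // block number -> earliest spill seen
  for (Spill *S : Spills) {
    Spill *&Kept = Earliest[S->Block];
    if (!Kept)
      Kept = S;
    else if (S->Index < Kept->Index) {
      Redundant.push_back(Kept); // the previously kept spill is the later one
      Kept = S;
    } else
      Redundant.push_back(S);    // the current spill is the later one
  }
  Spills.clear();
  for (auto &Entry : Earliest)
    Spills.push_back(Entry.second);
}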
+
+/// Starting from \p Root find a top-down traversal order of the dominator
+/// tree to visit all basic blocks containing the elements of \p Spills.
+/// Redundant spills will be found and put into \p SpillsToRm at the same
+/// time. \p SpillBBToSpill will be populated as part of the process and
+/// maps a basic block to the first store occurring in the basic block.
+/// \post SpillsToRm.union(Spills\@post) == Spills\@pre
+///
+void HoistSpillHelper::getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+ // The set contains all the possible BB nodes to which we may hoist
+ // original spills.
+ SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+ // Save the BB nodes on the path from the first BB node containing
+ // non-redundant spill to the Root node.
+ SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+ // All the spills to be hoisted must originate from a single def instruction
+ // of OrigReg. This means the def instruction should dominate all the spills
+ // to be hoisted. We choose the BB where the def instruction is located as
+ // the Root.
+ MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+ // For every node in the dominator tree that contains a spill, walk up the
+ // dominator tree towards the Root node until it is reached. If another node
+ // containing a spill is found along the path, the spill seen earlier is
+ // redundant and the node containing it will be removed. All the nodes on the
+ // path from the first node with a non-redundant spill to the Root node will
+ // be added to the WorkSet, which will contain all the possible locations
+ // where spills may be hoisted to after the loop below is done.
+ for (const auto Spill : Spills) {
+ MachineBasicBlock *Block = Spill->getParent();
+ MachineDomTreeNode *Node = MDT[Block];
+ MachineInstr *SpillToRm = nullptr;
+ while (Node != RootIDomNode) {
+ // If Node dominates Block, and it already contains a spill, the spill in
+ // Block will be redundant.
+ if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+ SpillToRm = SpillBBToSpill[MDT[Block]];
+ break;
+ /// If the Node is already in WorkSet, the path from the Node to
+ /// the Root node must already have been traversed by another spill,
+ /// so there is no need to repeat it.
+ } else if (WorkSet.count(Node)) {
+ break;
+ } else {
+ NodesOnPath.insert(Node);
+ }
+ Node = Node->getIDom();
+ }
+ if (SpillToRm) {
+ SpillsToRm.push_back(SpillToRm);
+ } else {
+ // Add a BB containing the original spills to SpillsToKeep -- i.e.,
+ // set the initial status before hoisting starts. The value for BBs
+ // containing original spills is set to 0, to distinguish them from
+ // BBs containing hoisted spills, which will be inserted into
+ // SpillsToKeep later during hoisting.
+ SpillsToKeep[MDT[Block]] = 0;
+ WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+ }
+ NodesOnPath.clear();
+ }
+
+ // Sort the nodes in WorkSet in top-down order and save the nodes
+ // in Orders. Orders will be used for hoisting in runHoistSpills.
+ unsigned idx = 0;
+ Orders.push_back(MDT.DT->getNode(Root));
+ do {
+ MachineDomTreeNode *Node = Orders[idx++];
+ const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+ unsigned NumChildren = Children.size();
+ for (unsigned i = 0; i != NumChildren; ++i) {
+ MachineDomTreeNode *Child = Children[i];
+ if (WorkSet.count(Child))
+ Orders.push_back(Child);
+ }
+ } while (idx != Orders.size());
+ assert(Orders.size() == WorkSet.size() &&
+ "Orders have different size with WorkSet");
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++)
+ DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+ DEBUG(dbgs() << "\n");
+#endif
+}
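The worklist loop above amounts to a breadth-first walk of the dominator tree restricted to WorkSet. A standalone sketch of that ordering, assuming a plain tree node instead of MachineDomTreeNode:

#include <cstddef>
#include <set>
#include <vector>

struct Node { std::vector<Node *> Children; };

// Emit the nodes in a top-down order (parents before children) starting at
// Root; as in the loop above, a child is only visited if it is in Keep.
std::vector<Node *> topDownOrder(Node *Root, const std::set<Node *> &Keep) {
  std::vector<Node *> Orders;
  Orders.push_back(Root);
  for (std::size_t Idx = 0; Idx != Orders.size(); ++Idx)
    for (Node *Child : Orders[Idx]->Children)
      if (Keep.count(Child))
        Orders.push_back(Child);
  return Orders;
}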
+
+/// Try to hoist spills according to BB hotness. The spills to be removed will
+/// be saved in \p SpillsToRm. The spills to be inserted will be saved in
+/// \p SpillsToIns.
+///
+void HoistSpillHelper::runHoistSpills(
+ unsigned OrigReg, VNInfo &OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+ // Visit order of dominator tree nodes.
+ SmallVector<MachineDomTreeNode *, 32> Orders;
+ // SpillsToKeep contains all the nodes where spills are to be inserted
+ // during hoisting. If the spill to be inserted is an original spill
+ // (not a hoisted one), the value of the map entry is 0. If the spill
+ // is a hoisted spill, the value of the map entry is the VReg to be used
+ // as the source of the spill.
+ DenseMap<MachineDomTreeNode *, unsigned> SpillsToKeep;
+ // Map from BB to the first spill inside of it.
+ DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+
+ rmRedundantSpills(Spills, SpillsToRm, SpillBBToSpill);
+
+ MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI.def);
+ getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKeep,
+ SpillBBToSpill);
+
+ // SpillsInSubTreeMap keeps the map from a dom tree node to a pair of a
+ // node set and the cost of all the spills inside those nodes.
+ // The node set holds the locations where spills are to be inserted
+ // in the subtree of the current node.
+ typedef std::pair<SmallPtrSet<MachineDomTreeNode *, 16>, BlockFrequency>
+ NodesCostPair;
+ DenseMap<MachineDomTreeNode *, NodesCostPair> SpillsInSubTreeMap;
+ // Iterate Orders set in reverse order, which will be a bottom-up order
+ // in the dominator tree. Once we visit a dom tree node, we know its
+ // children have already been visited and the spill locations in the
+ // subtrees of all the children have been determined.
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++) {
+ MachineBasicBlock *Block = (*RIt)->getBlock();
+
+ // If Block contains an original spill, simply continue.
+ if (SpillsToKeep.find(*RIt) != SpillsToKeep.end() && !SpillsToKeep[*RIt]) {
+ SpillsInSubTreeMap[*RIt].first.insert(*RIt);
+ // SpillsInSubTreeMap[*RIt].second contains the cost of spill.
+ SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block);
+ continue;
+ }
+
+ // Collect spills in subtree of current node (*RIt) to
+ // SpillsInSubTreeMap[*RIt].first.
+ const std::vector<MachineDomTreeNode *> &Children = (*RIt)->getChildren();
+ unsigned NumChildren = Children.size();
+ for (unsigned i = 0; i != NumChildren; ++i) {
+ MachineDomTreeNode *Child = Children[i];
+ if (SpillsInSubTreeMap.find(Child) == SpillsInSubTreeMap.end())
+ continue;
+ // The stmt "SpillsInSubTree = SpillsInSubTreeMap[*RIt].first" below
+ // should be placed before getting the begin and end iterators of
+ // SpillsInSubTreeMap[Child].first, or else the iterators may be
+ // invalidated when SpillsInSubTreeMap[*RIt] is accessed for the first
+ // time, which may grow the map and move its original buckets.
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ SubTreeCost += SpillsInSubTreeMap[Child].second;
+ auto BI = SpillsInSubTreeMap[Child].first.begin();
+ auto EI = SpillsInSubTreeMap[Child].first.end();
+ SpillsInSubTree.insert(BI, EI);
+ SpillsInSubTreeMap.erase(Child);
+ }
+
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ // No spills in subtree, simply continue.
+ if (SpillsInSubTree.empty())
+ continue;
+
+ // Check whether Block is a possible candidate to insert spill.
+ unsigned LiveReg = 0;
+ if (!isSpillCandBB(OrigReg, OrigVNI, *Block, LiveReg))
+ continue;
+
+ // If there are multiple spills that could be merged, bias a little
+ // toward hoisting the spill.
+ BranchProbability MarginProb = (SpillsInSubTree.size() > 1)
+ ? BranchProbability(9, 10)
+ : BranchProbability(1, 1);
+ if (SubTreeCost > MBFI.getBlockFreq(Block) * MarginProb) {
+ // Hoist: Move spills to current Block.
+ for (const auto SpillBB : SpillsInSubTree) {
+ // When SpillBB is a BB containing an original spill, insert the spill
+ // into SpillsToRm.
+ if (SpillsToKeep.find(SpillBB) != SpillsToKeep.end() &&
+ !SpillsToKeep[SpillBB]) {
+ MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+ SpillsToRm.push_back(SpillToRm);
+ }
+ // SpillBB will not contain a spill anymore, so remove it from SpillsToKeep.
+ SpillsToKeep.erase(SpillBB);
+ }
+ // Current Block is the BB containing the new hoisted spill. Add it to
+ // SpillsToKeep. LiveReg is the source of the new spill.
+ SpillsToKeep[*RIt] = LiveReg;
+ DEBUG({
+ dbgs() << "spills in BB: ";
+ for (const auto Rspill : SpillsInSubTree)
+ dbgs() << Rspill->getBlock()->getNumber() << " ";
+ dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+ << "\n";
+ });
+ SpillsInSubTree.clear();
+ SpillsInSubTree.insert(*RIt);
+ SubTreeCost = MBFI.getBlockFreq(Block);
+ }
+ }
+ // For spills in SpillsToKeep with LiveReg set (i.e., not original spills),
+ // save them to SpillsToIns.
+ for (const auto Ent : SpillsToKeep) {
+ if (Ent.second)
+ SpillsToIns[Ent.first->getBlock()] = Ent.second;
+ }
+}
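// A minimal standalone sketch of the hoisting decision above, using plain
// doubles in place of LLVM's BlockFrequency and BranchProbability types:
// hoist into a candidate block only if the summed frequency of the subtree
// spills exceeds the candidate's frequency, scaled by a 9/10 margin when
// more than one spill would be merged. The frequencies below are made up.
#include <cstdio>

static bool shouldHoist(double SubTreeCost, double BlockFreq,
                        unsigned NumSubTreeSpills) {
  double Margin = NumSubTreeSpills > 1 ? 0.9 : 1.0;
  return SubTreeCost > BlockFreq * Margin;
}

int main() {
  // Two spills in blocks with frequencies 70 and 50; the candidate dominator
  // block has frequency 110. 120 > 110 * 0.9 == 99, so hoisting wins.
  std::printf("%d\n", shouldHoist(70 + 50, 110, 2)); // prints 1
  // A single spill of frequency 100 is not hoisted into a hotter block of
  // frequency 110, because 100 > 110 * 1.0 does not hold.
  std::printf("%d\n", shouldHoist(100, 110, 1)); // prints 0
}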
+
+/// For spills with equal values, remove redundant spills and hoist the
+/// remaining ones to colder spots.
+///
+/// Spills with equal values will be collected into the same set in
+/// MergeableSpills when a spill is inserted. These equal spills originate
+/// from the same defining instruction and are dominated by that instruction.
+/// Before hoisting all the equal spills, redundant spills inside the same
+/// BB are first marked to be deleted. Then, starting from the spills left,
+/// walk up the dominator tree towards the Root node where the defining
+/// instruction is located, mark the dominated spills to be deleted along the
+/// way, and collect the BB nodes on the path from non-dominated spills to
+/// the defining instruction into a WorkSet. The nodes in WorkSet are the
+/// candidate places where we consider hoisting the spills. We iterate the
+/// WorkSet in bottom-up order, and for each node we decide whether to hoist
+/// the spills inside its subtree to that node. In this way, we can get a
+/// local benefit even if hoisting all the equal spills to one cold place is
+/// impossible.
+///
+void HoistSpillHelper::hoistAllSpills() {
+ SmallVector<unsigned, 4> NewVRegs;
+ LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this);
+
+ // Save the mapping between stackslot and its original reg.
+ DenseMap<int, unsigned> SlotToOrigReg;
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ int Slot = VRM.getStackSlot(Reg);
+ if (Slot != VirtRegMap::NO_STACK_SLOT)
+ SlotToOrigReg[Slot] = VRM.getOriginal(Reg);
+ unsigned Original = VRM.getPreSplitReg(Reg);
+ if (!MRI.def_empty(Reg))
+ Virt2SiblingsMap[Original].insert(Reg);
+ }
+
+ // Each entry in MergeableSpills contains a spill set with equal values.
+ for (auto &Ent : MergeableSpills) {
+ int Slot = Ent.first.first;
+ unsigned OrigReg = SlotToOrigReg[Slot];
+ LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+ VNInfo *OrigVNI = Ent.first.second;
+ SmallPtrSet<MachineInstr *, 16> &EqValSpills = Ent.second;
+ if (Ent.second.empty())
+ continue;
+
+ DEBUG({
+ dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+ << "Equal spills in BB: ";
+ for (const auto spill : EqValSpills)
+ dbgs() << spill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // SpillsToRm is the spill set to be removed from EqValSpills.
+ SmallVector<MachineInstr *, 16> SpillsToRm;
+ // SpillsToIns is the spill set to be newly inserted after hoisting.
+ DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+ runHoistSpills(OrigReg, *OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+ DEBUG({
+ dbgs() << "Finally inserted spills in BB: ";
+ for (const auto Ispill : SpillsToIns)
+ dbgs() << Ispill.first->getNumber() << " ";
+ dbgs() << "\nFinally removed spills in BB: ";
+ for (const auto Rspill : SpillsToRm)
+ dbgs() << Rspill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // Stack live range update.
+ LiveInterval &StackIntvl = LSS.getInterval(Slot);
+ if (!SpillsToIns.empty() || !SpillsToRm.empty())
+ StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+ StackIntvl.getValNumInfo(0));
+
+ // Insert hoisted spills.
+ for (auto const Insert : SpillsToIns) {
+ MachineBasicBlock *BB = Insert.first;
+ unsigned LiveReg = Insert.second;
+ MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, *BB);
+ TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+ MRI.getRegClass(LiveReg), &TRI);
+ LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+ ++NumSpills;
+ }
+
+ // Remove redundant spills or change them to dead instructions.
+ NumSpills -= SpillsToRm.size();
+ for (auto const RMEnt : SpillsToRm) {
+ RMEnt->setDesc(TII.get(TargetOpcode::KILL));
+ for (unsigned i = RMEnt->getNumOperands(); i; --i) {
+ MachineOperand &MO = RMEnt->getOperand(i - 1);
+ if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+ RMEnt->RemoveOperand(i - 1);
+ }
+ }
+ Edit.eliminateDeadDefs(SpillsToRm, None, AA);
+ }
+}
+
+/// When a VirtReg is cloned, the \p New register should have the same physreg
+/// or stackslot as the \p Old register.
+void HoistSpillHelper::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
+ if (VRM.hasPhys(Old))
+ VRM.assignVirt2Phys(New, VRM.getPhys(Old));
+ else if (VRM.getStackSlot(Old) != VirtRegMap::NO_STACK_SLOT)
+ VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
+ else
+ llvm_unreachable("VReg should be assigned either physreg or stackslot");
+}
diff --git a/lib/CodeGen/InterleavedAccessPass.cpp b/lib/CodeGen/InterleavedAccessPass.cpp
index 724f1d61abe2..3f1111976852 100644
--- a/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/lib/CodeGen/InterleavedAccessPass.cpp
@@ -1,6 +1,6 @@
-//=----------------------- InterleavedAccessPass.cpp -----------------------==//
+//===--------------------- InterleavedAccessPass.cpp ----------------------===//
//
-// The LLVM Compiler Infrastructure
+// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
@@ -8,16 +8,18 @@
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
-// interleaved memory accesses and transforms into target specific intrinsics.
+// interleaved memory accesses and transforms them into target specific
+// intrinsics.
//
// An interleaved load reads data from memory into several vectors, with
// DE-interleaving the data on a factor. An interleaved store writes several
// vectors to memory with RE-interleaving the data on a factor.
//
-// As interleaved accesses are hard to be identified in CodeGen (mainly because
-// the VECTOR_SHUFFLE DAG node is quite different from the shufflevector IR),
-// we identify and transform them to intrinsics in this pass. So the intrinsics
-// can be easily matched into target specific instructions later in CodeGen.
+// As interleaved accesses are difficult to identify in CodeGen (mainly
+// because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
+// IR), we identify and transform them to intrinsics in this pass so the
+// intrinsics can be easily matched into target specific instructions later in
+// CodeGen.
//
// E.g. An interleaved load (Factor = 2):
// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
@@ -38,6 +40,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -56,10 +59,6 @@ static cl::opt<bool> LowerInterleavedAccesses(
static unsigned MaxFactor; // The maximum supported interleave factor.
-namespace llvm {
-static void initializeInterleavedAccessPass(PassRegistry &);
-}
-
namespace {
class InterleavedAccess : public FunctionPass {
@@ -67,7 +66,7 @@ class InterleavedAccess : public FunctionPass {
public:
static char ID;
InterleavedAccess(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr) {
+ : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
}
@@ -75,7 +74,13 @@ public:
bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
private:
+ DominatorTree *DT;
const TargetMachine *TM;
const TargetLowering *TLI;
@@ -86,13 +91,26 @@ private:
/// \brief Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI,
SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// \brief Returns true if the uses of an interleaved load by the
+ /// extractelement instructions in \p Extracts can be replaced by uses of the
+ /// shufflevector instructions in \p Shuffles instead. If so, the necessary
+ /// replacements are also performed.
+ bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles);
};
} // end anonymous namespace.
char InterleavedAccess::ID = 0;
-INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
- "Lower interleaved memory accesses to target specific intrinsics",
- false, false)
+INITIALIZE_TM_PASS_BEGIN(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_TM_PASS_END(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
return new InterleavedAccess(TM);
@@ -181,9 +199,18 @@ bool InterleavedAccess::lowerInterleavedLoad(
return false;
SmallVector<ShuffleVectorInst *, 4> Shuffles;
+ SmallVector<ExtractElementInst *, 4> Extracts;
- // Check if all users of this load are shufflevectors.
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions, we save them so we can later
+ // check whether they can be modified to extract from one of the
+ // shufflevectors instead of the load.
for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
+ auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+ if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
+ Extracts.push_back(Extract);
+ continue;
+ }
ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
return false;
@@ -219,6 +246,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
Indices.push_back(Index);
}
+ // Try and modify users of the load that are extractelement instructions to
+ // use the shufflevector instructions instead of the load.
+ if (!tryReplaceExtracts(Extracts, Shuffles))
+ return false;
+
DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
@@ -232,6 +264,73 @@ bool InterleavedAccess::lowerInterleavedLoad(
return true;
}
+bool InterleavedAccess::tryReplaceExtracts(
+ ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles) {
+
+ // If there aren't any extractelement instructions to modify, there's nothing
+ // to do.
+ if (Extracts.empty())
+ return true;
+
+ // Maps extractelement instructions to vector-index pairs. The extractelement
+ // instructions will be modified to use the new vector and index operands.
+ DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
+
+ for (auto *Extract : Extracts) {
+
+ // The vector index that is extracted.
+ auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
+ auto Index = IndexOperand->getSExtValue();
+
+ // Look for a suitable shufflevector instruction. The goal is to modify the
+ // extractelement instruction (which uses an interleaved load) to use one
+ // of the shufflevector instructions instead of the load.
+ for (auto *Shuffle : Shuffles) {
+
+ // If the shufflevector instruction doesn't dominate the extract, we
+ // can't create a use of it.
+ if (!DT->dominates(Shuffle, Extract))
+ continue;
+
+ // Inspect the indices of the shufflevector instruction. If the shuffle
+ // selects the same index that is extracted, we can modify the
+ // extractelement instruction.
+ SmallVector<int, 4> Indices;
+ Shuffle->getShuffleMask(Indices);
+ for (unsigned I = 0; I < Indices.size(); ++I)
+ if (Indices[I] == Index) {
+ assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
+ "Vector operations do not match");
+ ReplacementMap[Extract] = std::make_pair(Shuffle, I);
+ break;
+ }
+
+ // If we found a suitable shufflevector instruction, stop looking.
+ if (ReplacementMap.count(Extract))
+ break;
+ }
+
+ // If we did not find a suitable shufflevector instruction, the
+ // extractelement instruction cannot be modified, so we must give up.
+ if (!ReplacementMap.count(Extract))
+ return false;
+ }
+
+ // Finally, perform the replacements.
+ IRBuilder<> Builder(Extracts[0]->getContext());
+ for (auto &Replacement : ReplacementMap) {
+ auto *Extract = Replacement.first;
+ auto *Vector = Replacement.second.first;
+ auto Index = Replacement.second.second;
+ Builder.SetInsertPoint(Extract);
+ Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
+ Extract->eraseFromParent();
+ }
+
+ return true;
+}
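// For illustration, following the factor-2 example in the file header (the
// value names below are made up): an extractelement that reads lane 2 of the
// wide load can instead read lane 1 of the even-element shufflevector, since
// that shuffle selects the same value:
//   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//   %even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
//                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %elt  = extractelement <8 x i32> %wide.vec, i32 2
// becomes
//   %elt  = extractelement <4 x i32> %even, i32 1
// which removes the last non-shufflevector user of %wide.vec, so the
// interleaved load can still be lowered to a target intrinsic.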
+
bool InterleavedAccess::lowerInterleavedStore(
StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
if (!SI->isSimple())
@@ -264,6 +363,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TLI = TM->getSubtargetImpl(F)->getTargetLowering();
MaxFactor = TLI->getMaxSupportedInterleaveFactor();
diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt
index 69b6a0f380aa..36d6cc1138ed 100644
--- a/lib/CodeGen/LLVMBuild.txt
+++ b/lib/CodeGen/LLVMBuild.txt
@@ -16,10 +16,10 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AsmPrinter SelectionDAG MIRParser
+subdirectories = AsmPrinter SelectionDAG MIRParser GlobalISel
[component_0]
type = Library
name = CodeGen
parent = Libraries
-required_libraries = Analysis BitReader BitWriter Core Instrumentation MC Scalar Support Target TransformUtils
+required_libraries = Analysis BitReader BitWriter Core Instrumentation MC ProfileData Scalar Support Target TransformUtils
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 1c27377feee7..9eb43d2bec10 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
@@ -42,6 +43,10 @@ static cl::opt<cl::boolOrDefault>
EnableFastISelOption("fast-isel", cl::Hidden,
cl::desc("Enable the \"fast\" instruction selector"));
+static cl::opt<bool>
+ EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
+ cl::desc("Enable the \"global\" instruction selector"));
+
void LLVMTargetMachine::initAsmInfo() {
MRI = TheTarget.createMCRegInfo(getTargetTriple().str());
MII = TheTarget.createMCInstrInfo();
@@ -65,8 +70,15 @@ void LLVMTargetMachine::initAsmInfo() {
if (Options.DisableIntegratedAS)
TmpAsmInfo->setUseIntegratedAssembler(false);
+ TmpAsmInfo->setPreserveAsmComments(Options.MCOptions.PreserveAsmComments);
+
if (Options.CompressDebugSections)
- TmpAsmInfo->setCompressDebugSections(true);
+ TmpAsmInfo->setCompressDebugSections(DebugCompressionType::DCT_ZlibGnu);
+
+ TmpAsmInfo->setRelaxELFRelocations(Options.RelaxELFRelocations);
+
+ if (Options.ExceptionModel != ExceptionHandling::None)
+ TmpAsmInfo->setExceptionsType(Options.ExceptionModel);
AsmInfo = TmpAsmInfo;
}
@@ -78,7 +90,10 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
- CodeGenInfo = T.createMCCodeGenInfo(TT.str(), RM, CM, OL);
+ T.adjustCodeGenOpts(TT, RM, CM);
+ this->RM = RM;
+ this->CMModel = CM;
+ this->OptLevel = OL;
}
TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
@@ -87,6 +102,20 @@ TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
});
}
+MachineModuleInfo &
+LLVMTargetMachine::addMachineModuleInfo(PassManagerBase &PM) const {
+ MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(),
+ *getMCRegisterInfo(),
+ getObjFileLowering());
+ PM.add(MMI);
+ return *MMI;
+}
+
+void LLVMTargetMachine::addMachineFunctionAnalysis(PassManagerBase &PM,
+ MachineFunctionInitializer *MFInitializer) const {
+ PM.add(new MachineFunctionAnalysis(*this, MFInitializer));
+}
+
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
static MCContext *
addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
@@ -94,6 +123,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
AnalysisID StartAfter, AnalysisID StopAfter,
MachineFunctionInitializer *MFInitializer = nullptr) {
+ // When in emulated TLS mode, add the LowerEmuTLS pass.
+ if (TM->Options.EmulatedTLS)
+ PM.add(createLowerEmuTLSPass(TM));
+
+ PM.add(createPreISelIntrinsicLoweringPass());
+
// Add internal analysis passes from the target machine.
PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
@@ -115,14 +150,8 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
PassConfig->addISelPrepare();
- // Install a MachineModuleInfo class, which is an immutable pass that holds
- // all the per-module stuff we're generating, including MCContext.
- MachineModuleInfo *MMI = new MachineModuleInfo(
- *TM->getMCAsmInfo(), *TM->getMCRegisterInfo(), TM->getObjFileLowering());
- PM.add(MMI);
-
- // Set up a MachineFunction for the rest of CodeGen to work on.
- PM.add(new MachineFunctionAnalysis(*TM, MFInitializer));
+ MachineModuleInfo &MMI = TM->addMachineModuleInfo(PM);
+ TM->addMachineFunctionAnalysis(PM, MFInitializer);
// Enable FastISel with -fast, but allow that to be overridden.
TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
@@ -132,14 +161,25 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
TM->setFastISel(true);
// Ask the target for an isel.
- if (PassConfig->addInstSelector())
+ if (LLVM_UNLIKELY(EnableGlobalISel)) {
+ if (PassConfig->addIRTranslator())
+ return nullptr;
+
+ // Before running the register bank selector, ask the target if it
+ // wants to run some passes.
+ PassConfig->addPreRegBankSelect();
+
+ if (PassConfig->addRegBankSelect())
+ return nullptr;
+
+ } else if (PassConfig->addInstSelector())
return nullptr;
PassConfig->addMachinePasses();
PassConfig->setInitialized();
- return &MMI->getContext();
+ return &MMI.getContext();
}
bool LLVMTargetMachine::addPassesToEmitFile(
@@ -154,7 +194,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(
return true;
if (StopAfter) {
- PM.add(createPrintMIRPass(outs()));
+ PM.add(createPrintMIRPass(Out));
return false;
}
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index be61a20424b4..b810176e6a18 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -113,8 +113,7 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
// The scope that we were created with could have an extra file - which
// isn't what we care about in this case.
- if (auto *File = dyn_cast<DILexicalBlockFile>(Scope))
- Scope = File->getScope();
+ Scope = Scope->getNonLexicalBlockFileScope();
if (auto *IA = DL->getInlinedAt()) {
auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA));
@@ -140,8 +139,8 @@ LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
/// getOrCreateRegularScope - Find or create a regular lexical scope.
LexicalScope *
LexicalScopes::getOrCreateRegularScope(const DILocalScope *Scope) {
- if (auto *File = dyn_cast<DILexicalBlockFile>(Scope))
- Scope = File->getScope();
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
auto I = LexicalScopeMap.find(Scope);
if (I != LexicalScopeMap.end())
@@ -169,6 +168,8 @@ LexicalScopes::getOrCreateRegularScope(const DILocalScope *Scope) {
LexicalScope *
LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
const DILocation *InlinedAt) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
std::pair<const DILocalScope *, const DILocation *> P(Scope, InlinedAt);
auto I = InlinedLexicalScopeMap.find(P);
if (I != InlinedLexicalScopeMap.end())
@@ -192,9 +193,7 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
LexicalScope *
LexicalScopes::getOrCreateAbstractScope(const DILocalScope *Scope) {
assert(Scope && "Invalid Scope encoding!");
-
- if (auto *File = dyn_cast<DILexicalBlockFile>(Scope))
- Scope = File->getScope();
+ Scope = Scope->getNonLexicalBlockFileScope();
auto I = AbstractScopeMap.find(Scope);
if (I != AbstractScopeMap.end())
return &I->second;
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index b9937e5570d4..4ff88d528108 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -18,22 +18,24 @@
///
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/UniqueVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <queue>
#include <list>
+#include <queue>
using namespace llvm;
@@ -43,48 +45,163 @@ STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
namespace {
+// \brief If @MI is a DBG_VALUE with debug value described by a defined
+// register, returns the number of this register. Otherwise, returns 0.
+static unsigned isDbgValueDescribedByReg(const MachineInstr &MI) {
+ assert(MI.isDebugValue() && "expected a DBG_VALUE");
+ assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
+ // If the location of the variable is described using a register (directly
+ // or indirectly), that register is always the first operand.
+ return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
+}
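// For illustration, with made-up operands:
//   DBG_VALUE %some_reg, 0, !var, !expr    ; direct location, returns the reg
//   DBG_VALUE %some_reg, 8, !var, !expr    ; indirect, the reg is still operand 0
//   DBG_VALUE 42, 0, !var, !expr           ; constant location, returns 0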
+
class LiveDebugValues : public MachineFunctionPass {
private:
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
+ /// Based on std::pair so it can be used as an index into a DenseMap.
typedef std::pair<const DILocalVariable *, const DILocation *>
- InlinedVariable;
-
+ DebugVariableBase;
/// A potentially inlined instance of a variable.
- struct DebugVariable {
- const DILocalVariable *Var;
- const DILocation *InlinedAt;
+ struct DebugVariable : public DebugVariableBase {
+ DebugVariable(const DILocalVariable *Var, const DILocation *InlinedAt)
+ : DebugVariableBase(Var, InlinedAt) {}
- DebugVariable(const DILocalVariable *_var, const DILocation *_inlinedAt)
- : Var(_var), InlinedAt(_inlinedAt) {}
+ const DILocalVariable *getVar() const { return this->first; };
+ const DILocation *getInlinedAt() const { return this->second; };
- bool operator==(const DebugVariable &DV) const {
- return (Var == DV.Var) && (InlinedAt == DV.InlinedAt);
+ bool operator<(const DebugVariable &DV) const {
+ if (getVar() == DV.getVar())
+ return getInlinedAt() < DV.getInlinedAt();
+ return getVar() < DV.getVar();
}
};
- /// Member variables and functions for Range Extension across basic blocks.
+ /// A pair of debug variable and value location.
struct VarLoc {
- DebugVariable Var;
- const MachineInstr *MI; // MachineInstr should be a DBG_VALUE instr.
+ const DebugVariable Var;
+ const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE.
+
+ enum { InvalidKind = 0, RegisterKind } Kind;
+
+ /// The value location. Stored separately to avoid repeatedly
+ /// extracting it from MI.
+ union {
+ struct {
+ uint32_t RegNo;
+ uint32_t Offset;
+ } RegisterLoc;
+ uint64_t Hash;
+ } Loc;
+
+ VarLoc(const MachineInstr &MI)
+ : Var(MI.getDebugVariable(), MI.getDebugLoc()->getInlinedAt()), MI(MI),
+ Kind(InvalidKind) {
+ static_assert((sizeof(Loc) == sizeof(uint64_t)),
+ "hash does not cover all members of Loc");
+ assert(MI.isDebugValue() && "not a DBG_VALUE");
+ assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
+ if (int RegNo = isDbgValueDescribedByReg(MI)) {
+ Kind = RegisterKind;
+ Loc.RegisterLoc.RegNo = RegNo;
+ uint64_t Offset =
+ MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;
+ // We don't support offsets larger than 4GiB here. They are
+ // slated to be replaced with DIExpressions anyway.
+ if (Offset >= (1ULL << 32))
+ Kind = InvalidKind;
+ else
+ Loc.RegisterLoc.Offset = Offset;
+ }
+ }
+
+ /// If this variable is described by a register, return it,
+ /// otherwise return 0.
+ unsigned isDescribedByReg() const {
+ if (Kind == RegisterKind)
+ return Loc.RegisterLoc.RegNo;
+ return 0;
+ }
- VarLoc(DebugVariable _var, const MachineInstr *_mi) : Var(_var), MI(_mi) {}
+ void dump() const { MI.dump(); }
- bool operator==(const VarLoc &V) const;
+ bool operator==(const VarLoc &Other) const {
+ return Var == Other.Var && Loc.Hash == Other.Loc.Hash;
+ }
+
+ /// This operator guarantees that VarLocs are sorted by Variable first.
+ bool operator<(const VarLoc &Other) const {
+ if (Var == Other.Var)
+ return Loc.Hash < Other.Loc.Hash;
+ return Var < Other.Var;
+ }
};
- typedef std::list<VarLoc> VarLocList;
- typedef SmallDenseMap<const MachineBasicBlock *, VarLocList> VarLocInMBB;
+ typedef UniqueVector<VarLoc> VarLocMap;
+ typedef SparseBitVector<> VarLocSet;
+ typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB;
+
+ /// This holds the working set of currently open ranges. For fast
+ /// access, this is done both as a set of VarLocIDs, and a map of
+ /// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all
+ /// previous open ranges for the same variable.
+ class OpenRangesSet {
+ VarLocSet VarLocs;
+ SmallDenseMap<DebugVariableBase, unsigned, 8> Vars;
+
+ public:
+ const VarLocSet &getVarLocs() const { return VarLocs; }
+
+ /// Terminate all open ranges for Var by removing it from the set.
+ void erase(DebugVariable Var) {
+ auto It = Vars.find(Var);
+ if (It != Vars.end()) {
+ unsigned ID = It->second;
+ VarLocs.reset(ID);
+ Vars.erase(It);
+ }
+ }
+
+ /// Terminate all open ranges listed in \c KillSet by removing
+ /// them from the set.
+ void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs) {
+ VarLocs.intersectWithComplement(KillSet);
+ for (unsigned ID : KillSet)
+ Vars.erase(VarLocIDs[ID].Var);
+ }
+
+ /// Insert a new range into the set.
+ void insert(unsigned VarLocID, DebugVariableBase Var) {
+ VarLocs.set(VarLocID);
+ Vars.insert({Var, VarLocID});
+ }
+
+ /// Empty the set.
+ void clear() {
+ VarLocs.clear();
+ Vars.clear();
+ }
+
+ /// Return whether the set is empty or not.
+ bool empty() const {
+ assert(Vars.empty() == VarLocs.empty() && "open ranges are inconsistent");
+ return VarLocs.empty();
+ }
+ };
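// A minimal standalone model of the invariant OpenRangesSet maintains, using
// std:: containers in place of SparseBitVector and the DenseMap of variables:
// recording a new location for a variable first terminates that variable's
// previous open range, so at most one VarLoc per variable is ever open. The
// variable names and IDs below are made up.
#include <cassert>
#include <map>
#include <set>
#include <string>

int main() {
  std::set<unsigned> VarLocs;           // IDs of currently open VarLocs
  std::map<std::string, unsigned> Vars; // variable -> its open VarLoc ID

  auto Insert = [&](const std::string &Var, unsigned ID) {
    auto It = Vars.find(Var);
    if (It != Vars.end()) { // erase(Var): end the previous open range
      VarLocs.erase(It->second);
      Vars.erase(It);
    }
    VarLocs.insert(ID);     // insert(ID, Var): open the new range
    Vars.insert({Var, ID});
  };

  Insert("x", 1); // DBG_VALUE for x in some register
  Insert("x", 2); // a later DBG_VALUE for x ends the first range
  assert(!VarLocs.count(1) && VarLocs.count(2) && Vars.size() == 1);
  return 0;
}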
- void transferDebugValue(MachineInstr &MI, VarLocList &OpenRanges);
- void transferRegisterDef(MachineInstr &MI, VarLocList &OpenRanges);
- bool transferTerminatorInst(MachineInstr &MI, VarLocList &OpenRanges,
- VarLocInMBB &OutLocs);
- bool transfer(MachineInstr &MI, VarLocList &OpenRanges, VarLocInMBB &OutLocs);
+ void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs);
+ void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ const VarLocMap &VarLocIDs);
+ bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
+ bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs);
- bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs);
+ bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
+ const VarLocMap &VarLocIDs);
bool ExtendRanges(MachineFunction &MF);
@@ -98,8 +215,14 @@ public:
/// information we preserve.
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
/// Print to ostream with a message.
- void printVarLocInMBB(const VarLocInMBB &V, const char *msg,
+ void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs, const char *msg,
raw_ostream &Out) const;
/// Calculate the liveness information for the given machine function.
@@ -124,109 +247,95 @@ LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
/// Tell the pass manager which passes we depend on and what information we
/// preserve.
void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
-// \brief If @MI is a DBG_VALUE with debug value described by a defined
-// register, returns the number of this register. In the other case, returns 0.
-static unsigned isDescribedByReg(const MachineInstr &MI) {
- assert(MI.isDebugValue());
- assert(MI.getNumOperands() == 4);
- // If location of variable is described using a register (directly or
- // indirecltly), this register is always a first operand.
- return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
-}
-
-// \brief This function takes two DBG_VALUE instructions and returns true
-// if their offsets are equal; otherwise returns false.
-static bool areOffsetsEqual(const MachineInstr &MI1, const MachineInstr &MI2) {
- assert(MI1.isDebugValue());
- assert(MI1.getNumOperands() == 4);
-
- assert(MI2.isDebugValue());
- assert(MI2.getNumOperands() == 4);
-
- if (!MI1.isIndirectDebugValue() && !MI2.isIndirectDebugValue())
- return true;
-
- // Check if both MIs are indirect and they are equal.
- if (MI1.isIndirectDebugValue() && MI2.isIndirectDebugValue())
- return MI1.getOperand(1).getImm() == MI2.getOperand(1).getImm();
-
- return false;
-}
-
//===----------------------------------------------------------------------===//
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
-void LiveDebugValues::printVarLocInMBB(const VarLocInMBB &V, const char *msg,
+void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
+ const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs,
+ const char *msg,
raw_ostream &Out) const {
- Out << "Printing " << msg << ":\n";
- for (const auto &L : V) {
- Out << "MBB: " << L.first->getName() << ":\n";
- for (const auto &VLL : L.second) {
- Out << " Var: " << VLL.Var.Var->getName();
+ for (const MachineBasicBlock &BB : MF) {
+ const auto &L = V.lookup(&BB);
+ Out << "MBB: " << BB.getName() << ":\n";
+ for (unsigned VLL : L) {
+ const VarLoc &VL = VarLocIDs[VLL];
+ Out << " Var: " << VL.Var.getVar()->getName();
Out << " MI: ";
- (*VLL.MI).dump();
+ VL.dump();
Out << "\n";
}
}
Out << "\n";
}
-bool LiveDebugValues::VarLoc::operator==(const VarLoc &V) const {
- return (Var == V.Var) && (isDescribedByReg(*MI) == isDescribedByReg(*V.MI)) &&
- (areOffsetsEqual(*MI, *V.MI));
-}
-
/// End all previous ranges related to @MI and start a new range from @MI
/// if it is a DBG_VALUE instr.
-void LiveDebugValues::transferDebugValue(MachineInstr &MI,
- VarLocList &OpenRanges) {
+void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs) {
if (!MI.isDebugValue())
return;
- const DILocalVariable *RawVar = MI.getDebugVariable();
- assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) &&
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DILocation *DebugLoc = MI.getDebugLoc();
+ const DILocation *InlinedAt = DebugLoc->getInlinedAt();
+ assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
"Expected inlined-at fields to agree");
- DebugVariable Var(RawVar, MI.getDebugLoc()->getInlinedAt());
// End all previous ranges of Var.
- OpenRanges.erase(
- std::remove_if(OpenRanges.begin(), OpenRanges.end(),
- [&](const VarLoc &V) { return (Var == V.Var); }),
- OpenRanges.end());
+ DebugVariable V(Var, InlinedAt);
+ OpenRanges.erase(V);
- // Add Var to OpenRanges from this DBG_VALUE.
+ // Add the VarLoc to OpenRanges from this DBG_VALUE.
// TODO: Currently handles DBG_VALUE which has only reg as location.
- if (isDescribedByReg(MI)) {
- VarLoc V(Var, &MI);
- OpenRanges.push_back(std::move(V));
+ if (isDbgValueDescribedByReg(MI)) {
+ VarLoc VL(MI);
+ unsigned ID = VarLocIDs.insert(VL);
+ OpenRanges.insert(ID, VL.Var);
}
}
/// A definition of a register may mark the end of a range.
void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
- VarLocList &OpenRanges) {
+ OpenRangesSet &OpenRanges,
+ const VarLocMap &VarLocIDs) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ SparseBitVector<> KillSet;
for (const MachineOperand &MO : MI.operands()) {
- if (!(MO.isReg() && MO.isDef() && MO.getReg() &&
- TRI->isPhysicalRegister(MO.getReg())))
- continue;
- // Remove ranges of all aliased registers.
- for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
- OpenRanges.erase(std::remove_if(OpenRanges.begin(), OpenRanges.end(),
- [&](const VarLoc &V) {
- return (*RAI ==
- isDescribedByReg(*V.MI));
- }),
- OpenRanges.end());
+ if (MO.isReg() && MO.isDef() && MO.getReg() &&
+ TRI->isPhysicalRegister(MO.getReg())) {
+ // Remove ranges of all aliased registers.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ for (unsigned ID : OpenRanges.getVarLocs())
+ if (VarLocIDs[ID].isDescribedByReg() == *RAI)
+ KillSet.set(ID);
+ } else if (MO.isRegMask()) {
+ // Remove ranges of all clobbered registers. Register masks don't usually
+ // list SP as preserved. While the debug info may be off for an
+ // instruction or two around callee-cleanup calls, transferring the
+ // DEBUG_VALUE across the call is still a better user experience.
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ unsigned Reg = VarLocIDs[ID].isDescribedByReg();
+ if (Reg && Reg != SP && MO.clobbersPhysReg(Reg))
+ KillSet.set(ID);
+ }
+ }
}
+ OpenRanges.erase(KillSet, VarLocIDs);
}
/// Terminate all open ranges at the end of the current basic block.
bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
- VarLocList &OpenRanges,
- VarLocInMBB &OutLocs) {
+ OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs,
+ const VarLocMap &VarLocIDs) {
bool Changed = false;
const MachineBasicBlock *CurMBB = MI.getParent();
if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back())))
@@ -235,29 +344,23 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
if (OpenRanges.empty())
return false;
- VarLocList &VLL = OutLocs[CurMBB];
-
- for (auto OR : OpenRanges) {
- // Copy OpenRanges to OutLocs, if not already present.
- assert(OR.MI->isDebugValue());
- DEBUG(dbgs() << "Add to OutLocs: "; OR.MI->dump(););
- if (std::find_if(VLL.begin(), VLL.end(),
- [&](const VarLoc &V) { return (OR == V); }) == VLL.end()) {
- VLL.push_back(std::move(OR));
- Changed = true;
- }
- }
+ DEBUG(for (unsigned ID : OpenRanges.getVarLocs()) {
+ // Copy OpenRanges to OutLocs, if not already present.
+ dbgs() << "Add to OutLocs: "; VarLocIDs[ID].dump();
+ });
+ VarLocSet &VLS = OutLocs[CurMBB];
+ Changed = VLS |= OpenRanges.getVarLocs();
OpenRanges.clear();
return Changed;
}
/// This routine creates OpenRanges and OutLocs.
-bool LiveDebugValues::transfer(MachineInstr &MI, VarLocList &OpenRanges,
- VarLocInMBB &OutLocs) {
+bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) {
bool Changed = false;
- transferDebugValue(MI, OpenRanges);
- transferRegisterDef(MI, OpenRanges);
- Changed = transferTerminatorInst(MI, OpenRanges, OutLocs);
+ transferDebugValue(MI, OpenRanges, VarLocIDs);
+ transferRegisterDef(MI, OpenRanges, VarLocIDs);
+ Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
return Changed;
}
@@ -265,14 +368,14 @@ bool LiveDebugValues::transfer(MachineInstr &MI, VarLocList &OpenRanges,
/// inserting a new DBG_VALUE instruction at the start of the @MBB - if the same
/// source variable in all the predecessors of @MBB reside in the same location.
bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
- VarLocInMBB &InLocs) {
+ VarLocInMBB &InLocs, const VarLocMap &VarLocIDs) {
DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n");
bool Changed = false;
- VarLocList InLocsT; // Temporary incoming locations.
+ VarLocSet InLocsT; // Temporary incoming locations.
- // For all predecessors of this MBB, find the set of VarLocs that can be
- // joined.
+ // For all predecessors of this MBB, find the set of VarLocs that
+ // can be joined.
for (auto p : MBB.predecessors()) {
auto OL = OutLocs.find(p);
// Join is null in case of empty OutLocs from any of the pred.
@@ -284,44 +387,34 @@ bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
InLocsT = OL->second;
continue;
}
-
// Join with this predecessor.
- VarLocList &VLL = OL->second;
- InLocsT.erase(
- std::remove_if(InLocsT.begin(), InLocsT.end(), [&](VarLoc &ILT) {
- return (std::find_if(VLL.begin(), VLL.end(), [&](const VarLoc &V) {
- return (ILT == V);
- }) == VLL.end());
- }), InLocsT.end());
+ InLocsT &= OL->second;
}
if (InLocsT.empty())
return false;
- VarLocList &ILL = InLocs[&MBB];
+ VarLocSet &ILS = InLocs[&MBB];
// Insert DBG_VALUE instructions, if not already inserted.
- for (auto ILT : InLocsT) {
- if (std::find_if(ILL.begin(), ILL.end(), [&](const VarLoc &I) {
- return (ILT == I);
- }) == ILL.end()) {
- // This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a
- // new range is started for the var from the mbb's beginning by inserting
- // a new DBG_VALUE. transfer() will end this range however appropriate.
- const MachineInstr *DMI = ILT.MI;
- MachineInstr *MI =
- BuildMI(MBB, MBB.instr_begin(), DMI->getDebugLoc(), DMI->getDesc(),
- DMI->isIndirectDebugValue(), DMI->getOperand(0).getReg(), 0,
- DMI->getDebugVariable(), DMI->getDebugExpression());
- if (DMI->isIndirectDebugValue())
- MI->getOperand(1).setImm(DMI->getOperand(1).getImm());
- DEBUG(dbgs() << "Inserted: "; MI->dump(););
- ++NumInserted;
- Changed = true;
-
- VarLoc V(ILT.Var, MI);
- ILL.push_back(std::move(V));
- }
+ VarLocSet Diff = InLocsT;
+ Diff.intersectWithComplement(ILS);
+ for (auto ID : Diff) {
+ // This VarLoc is not found in InLocs, i.e., it is not yet inserted. So a
+ // new range is started for the var from the mbb's beginning by inserting
+ // a new DBG_VALUE. transfer() will end this range as appropriate.
+ const VarLoc &DiffIt = VarLocIDs[ID];
+ const MachineInstr *DMI = &DiffIt.MI;
+ MachineInstr *MI =
+ BuildMI(MBB, MBB.instr_begin(), DMI->getDebugLoc(), DMI->getDesc(),
+ DMI->isIndirectDebugValue(), DMI->getOperand(0).getReg(), 0,
+ DMI->getDebugVariable(), DMI->getDebugExpression());
+ if (DMI->isIndirectDebugValue())
+ MI->getOperand(1).setImm(DMI->getOperand(1).getImm());
+ DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ ILS.set(ID);
+ ++NumInserted;
+ Changed = true;
}
return Changed;
}
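// A minimal standalone sketch (std::bitset instead of SparseBitVector) of the
// join above: the incoming set of a block is the intersection of the out-sets
// of all its predecessors, and only IDs not already present in the block's
// InLocs entry trigger newly inserted DBG_VALUEs. The bit patterns are made up.
#include <bitset>
#include <cstdio>

int main() {
  std::bitset<8> PredOuts[] = {std::bitset<8>("01101100"),
                               std::bitset<8>("01001110")};
  std::bitset<8> InLocsT = PredOuts[0];
  for (const auto &Out : PredOuts)
    InLocsT &= Out; // keep only VarLocs live out of every predecessor

  std::bitset<8> ILS("01000000");       // locations already materialized
  std::bitset<8> Diff = InLocsT & ~ILS; // intersectWithComplement(ILS)
  // Prints: join = 01001100, new DBG_VALUEs = 00001100
  std::printf("join = %s, new DBG_VALUEs = %s\n", InLocsT.to_string().c_str(),
              Diff.to_string().c_str());
}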
@@ -336,21 +429,27 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
bool OLChanged = false;
bool MBBJoined = false;
- VarLocList OpenRanges; // Ranges that are open until end of bb.
+ VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
+ OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
VarLocInMBB OutLocs; // Ranges that exist beyond bb.
VarLocInMBB InLocs; // Ranges that are incoming after joining.
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
std::priority_queue<unsigned int, std::vector<unsigned int>,
- std::greater<unsigned int>> Worklist;
+ std::greater<unsigned int>>
+ Worklist;
std::priority_queue<unsigned int, std::vector<unsigned int>,
- std::greater<unsigned int>> Pending;
+ std::greater<unsigned int>>
+ Pending;
+
// Initialize every mbb with OutLocs.
for (auto &MBB : MF)
for (auto &MI : MBB)
- transfer(MI, OpenRanges, OutLocs);
- DEBUG(printVarLocInMBB(OutLocs, "OutLocs after initialization", dbgs()));
+ transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
+ dbgs()));
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
unsigned int RPONumber = 0;
@@ -360,7 +459,6 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
Worklist.push(RPONumber);
++RPONumber;
}
-
// This is a standard "union of predecessor outs" dataflow problem.
// To solve it, we perform join() and transfer() using the two worklist method
// until the ranges converge.
@@ -373,21 +471,23 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
while (!Worklist.empty()) {
MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
Worklist.pop();
- MBBJoined = join(*MBB, OutLocs, InLocs);
+ MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs);
if (MBBJoined) {
MBBJoined = false;
Changed = true;
for (auto &MI : *MBB)
- OLChanged |= transfer(MI, OpenRanges, OutLocs);
- DEBUG(printVarLocInMBB(OutLocs, "OutLocs after propagating", dbgs()));
- DEBUG(printVarLocInMBB(InLocs, "InLocs after propagating", dbgs()));
+ OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after propagating", dbgs()));
+ DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
+ "InLocs after propagating", dbgs()));
if (OLChanged) {
OLChanged = false;
for (auto s : MBB->successors())
- if (!OnPending.count(s)) {
- OnPending.insert(s);
+ if (OnPending.insert(s).second) {
Pending.push(BBToOrder[s]);
}
}
@@ -399,8 +499,8 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
assert(Pending.empty() && "Pending should be empty");
}
- DEBUG(printVarLocInMBB(OutLocs, "Final OutLocs", dbgs()));
- DEBUG(printVarLocInMBB(InLocs, "Final InLocs", dbgs()));
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
+ DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
return Changed;
}
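// A minimal standalone sketch of the two-worklist scheme used above: process
// blocks in RPO order from Worklist, and when a block's out-set changes, queue
// its successors on Pending; when Worklist drains, swap the two queues and
// repeat until nothing changes. (The real pass also dedups Pending with an
// OnPending set.) The 3-block CFG and the "facts" here are made up; the join
// is modeled as a max over predecessor outs and the transfer as the identity.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

int main() {
  // Toy CFG in RPO numbering: 0 -> 1, 1 -> 2, 2 -> 1 (a loop between 1 and 2).
  std::vector<std::vector<int>> Succs = {{1}, {2}, {1}};
  std::vector<std::vector<int>> Preds = {{}, {0, 2}, {1}};
  std::vector<int> Out = {1, 0, 0}; // toy out-facts, seeded at block 0

  typedef std::priority_queue<int, std::vector<int>, std::greater<int>> Queue;
  Queue Worklist, Pending;
  for (int B = 0; B < 3; ++B)
    Worklist.push(B);

  while (!Worklist.empty()) {
    while (!Worklist.empty()) {
      int B = Worklist.top();
      Worklist.pop();
      int In = 0;
      for (int P : Preds[B])
        In = std::max(In, Out[P]); // join of predecessor outs
      if (In > Out[B]) {           // out-set changed; revisit successors
        Out[B] = In;
        for (int S : Succs[B])
          Pending.push(S);
      }
    }
    std::swap(Worklist, Pending);
  }
  std::printf("%d %d %d\n", Out[0], Out[1], Out[2]); // prints "1 1 1"
}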
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 6dac7dbd15bf..966b4f1f4e4d 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -42,6 +42,7 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <memory>
+#include <utility>
using namespace llvm;
@@ -84,7 +85,7 @@ class UserValueScopes {
SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;
public:
- UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(D), LS(L) {}
+ UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(std::move(D)), LS(L) {}
/// dominates - Return true if current scope dominates at least one machine
/// instruction in a given machine basic block.
@@ -141,8 +142,8 @@ public:
/// UserValue - Create a new UserValue.
UserValue(const MDNode *var, const MDNode *expr, unsigned o, bool i,
DebugLoc L, LocMap::Allocator &alloc)
- : Variable(var), Expression(expr), offset(o), IsIndirect(i), dl(L),
- leader(this), next(nullptr), locInts(alloc) {}
+ : Variable(var), Expression(expr), offset(o), IsIndirect(i),
+ dl(std::move(L)), leader(this), next(nullptr), locInts(alloc) {}
/// getLeader - Get the leader of this value's equivalence class.
UserValue *getLeader() {
@@ -172,8 +173,10 @@ public:
return L1;
// Splice L2 before L1's members.
UserValue *End = L2;
- while (End->next)
- End->leader = L1, End = End->next;
+ while (End->next) {
+ End->leader = L1;
+ End = End->next;
+ }
End->leader = L1;
End->next = L1->next;
L1->next = L2;
@@ -302,7 +305,7 @@ class LDVImpl {
/// getUserValue - Find or create a UserValue.
UserValue *getUserValue(const MDNode *Var, const MDNode *Expr,
- unsigned Offset, bool IsIndirect, DebugLoc DL);
+ unsigned Offset, bool IsIndirect, const DebugLoc &DL);
/// lookupVirtReg - Find the EC leader for VirtReg or null.
UserValue *lookupVirtReg(unsigned VirtReg);
@@ -311,7 +314,7 @@ class LDVImpl {
/// @param MI DBG_VALUE instruction
/// @param Idx Last valid SLotIndex before instruction.
/// @return True if the DBG_VALUE instruction should be deleted.
- bool handleDebugValue(MachineInstr *MI, SlotIndex Idx);
+ bool handleDebugValue(MachineInstr &MI, SlotIndex Idx);
/// collectDebugValues - Collect and erase all DBG_VALUE instructions, adding
/// a UserValue def for each instruction.
@@ -355,7 +358,7 @@ public:
};
} // namespace
-static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
+static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS,
const LLVMContext &Ctx) {
if (!DL)
return;
@@ -456,7 +459,7 @@ void UserValue::mapVirtRegs(LDVImpl *LDV) {
UserValue *LDVImpl::getUserValue(const MDNode *Var, const MDNode *Expr,
unsigned Offset, bool IsIndirect,
- DebugLoc DL) {
+ const DebugLoc &DL) {
UserValue *&Leader = userVarMap[Var];
if (Leader) {
UserValue *UV = Leader->getLeader();
@@ -485,24 +488,23 @@ UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
return nullptr;
}
-bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
+bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// DBG_VALUE loc, offset, variable
- if (MI->getNumOperands() != 4 ||
- !(MI->getOperand(1).isReg() || MI->getOperand(1).isImm()) ||
- !MI->getOperand(2).isMetadata()) {
- DEBUG(dbgs() << "Can't handle " << *MI);
+ if (MI.getNumOperands() != 4 ||
+ !(MI.getOperand(1).isReg() || MI.getOperand(1).isImm()) ||
+ !MI.getOperand(2).isMetadata()) {
+ DEBUG(dbgs() << "Can't handle " << MI);
return false;
}
// Get or create the UserValue for (variable,offset).
- bool IsIndirect = MI->isIndirectDebugValue();
- unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
- const MDNode *Var = MI->getDebugVariable();
- const MDNode *Expr = MI->getDebugExpression();
+ bool IsIndirect = MI.isIndirectDebugValue();
+ unsigned Offset = IsIndirect ? MI.getOperand(1).getImm() : 0;
+ const MDNode *Var = MI.getDebugVariable();
+ const MDNode *Expr = MI.getDebugExpression();
//here.
- UserValue *UV =
- getUserValue(Var, Expr, Offset, IsIndirect, MI->getDebugLoc());
- UV->addDef(Idx, MI->getOperand(0));
+ UserValue *UV = getUserValue(Var, Expr, Offset, IsIndirect, MI.getDebugLoc());
+ UV->addDef(Idx, MI.getOperand(0));
return true;
}
@@ -518,12 +520,13 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
continue;
}
// DBG_VALUE has no slot index, use the previous instruction instead.
- SlotIndex Idx = MBBI == MBB->begin() ?
- LIS->getMBBStartIdx(MBB) :
- LIS->getInstructionIndex(std::prev(MBBI)).getRegSlot();
+ SlotIndex Idx =
+ MBBI == MBB->begin()
+ ? LIS->getMBBStartIdx(MBB)
+ : LIS->getInstructionIndex(*std::prev(MBBI)).getRegSlot();
// Handle consecutive DBG_VALUE instructions with the same slot index.
do {
- if (handleDebugValue(MBBI, Idx)) {
+ if (handleDebugValue(*MBBI, Idx)) {
MBBI = MBB->erase(MBBI);
Changed = true;
} else
@@ -554,8 +557,10 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveRange *LR,
Kills->push_back(Start);
return;
}
- if (Segment->end < Stop)
- Stop = Segment->end, ToEnd = false;
+ if (Segment->end < Stop) {
+ Stop = Segment->end;
+ ToEnd = false;
+ }
}
// There could already be a short def at Start.
@@ -569,8 +574,10 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveRange *LR,
}
// Limited by the next def.
- if (I.valid() && I.start() < Stop)
- Stop = I.start(), ToEnd = false;
+ if (I.valid() && I.start() < Stop) {
+ Stop = I.start();
+ ToEnd = false;
+ }
// Limited by VNI's live range.
else if (!ToEnd && Kills)
Kills->push_back(Stop);
@@ -608,7 +615,7 @@ UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
// Is LocNo extended to reach this copy? If not, another def may be blocking
// it, or we are looking at a wrong value of LI.
- SlotIndex Idx = LIS.getInstructionIndex(MI);
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
LocMap::iterator I = locInts.find(Idx.getRegSlot(true));
if (!I.valid() || I.value() != LocNo)
continue;
@@ -1033,7 +1040,7 @@ bool LiveDebugVariables::doInitialization(Module &M) {
}
#ifndef NDEBUG
-void LiveDebugVariables::dump() {
+LLVM_DUMP_METHOD void LiveDebugVariables::dump() {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->print(dbgs());
}
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 3d36f4d2494a..afe87a52544d 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -21,12 +21,12 @@
#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
#define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DebugInfo.h"
namespace llvm {
+template <typename T> class ArrayRef;
class LiveInterval;
class LiveIntervals;
class VirtRegMap;
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 50158006211d..93c5ca785ac9 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -19,8 +19,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveInterval.h"
+
+#include "LiveRangeUtils.h"
#include "RegisterCoalescer.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -309,10 +310,12 @@ LiveRange::iterator LiveRange::find(SlotIndex Pos) {
size_t Len = size();
do {
size_t Mid = Len >> 1;
- if (Pos < I[Mid].end)
+ if (Pos < I[Mid].end) {
Len = Mid;
- else
- I += Mid + 1, Len -= Mid + 1;
+ } else {
+ I += Mid + 1;
+ Len -= Mid + 1;
+ }
} while (Len);
return I;
}
@@ -814,239 +817,6 @@ void LiveInterval::clearSubRanges() {
SubRanges = nullptr;
}
-/// Helper function for constructMainRangeFromSubranges(): Search the CFG
-/// backwards until we find a place covered by a LiveRange segment that actually
-/// has a valno set.
-static VNInfo *searchForVNI(const SlotIndexes &Indexes, LiveRange &LR,
- const MachineBasicBlock *MBB,
- SmallPtrSetImpl<const MachineBasicBlock*> &Visited) {
- // We start the search at the end of MBB.
- SlotIndex EndIdx = Indexes.getMBBEndIdx(MBB);
- // In our use case we can't live the area covered by the live segments without
- // finding an actual VNI def.
- LiveRange::iterator I = LR.find(EndIdx.getPrevSlot());
- assert(I != LR.end());
- LiveRange::Segment &S = *I;
- if (S.valno != nullptr)
- return S.valno;
-
- VNInfo *VNI = nullptr;
- // Continue at predecessors (we could even go to idom with domtree available).
- for (const MachineBasicBlock *Pred : MBB->predecessors()) {
- // Avoid going in circles.
- if (!Visited.insert(Pred).second)
- continue;
-
- VNI = searchForVNI(Indexes, LR, Pred, Visited);
- if (VNI != nullptr) {
- S.valno = VNI;
- break;
- }
- }
-
- return VNI;
-}
-
-static void determineMissingVNIs(const SlotIndexes &Indexes, LiveInterval &LI) {
- SmallPtrSet<const MachineBasicBlock*, 5> Visited;
-
- LiveRange::iterator OutIt;
- VNInfo *PrevValNo = nullptr;
- for (LiveRange::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- LiveRange::Segment &S = *I;
- // Determine final VNI if necessary.
- if (S.valno == nullptr) {
- // This can only happen at the begin of a basic block.
- assert(S.start.isBlock() && "valno should only be missing at block begin");
-
- Visited.clear();
- const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(S.start);
- for (const MachineBasicBlock *Pred : MBB->predecessors()) {
- VNInfo *VNI = searchForVNI(Indexes, LI, Pred, Visited);
- if (VNI != nullptr) {
- S.valno = VNI;
- break;
- }
- }
- assert(S.valno != nullptr && "could not determine valno");
- }
- // Merge with previous segment if it has the same VNI.
- if (PrevValNo == S.valno && OutIt->end == S.start) {
- OutIt->end = S.end;
- } else {
- // Didn't merge. Move OutIt to next segment.
- if (PrevValNo == nullptr)
- OutIt = LI.begin();
- else
- ++OutIt;
-
- if (OutIt != I)
- *OutIt = *I;
- PrevValNo = S.valno;
- }
- }
- // If we merged some segments chop off the end.
- ++OutIt;
- LI.segments.erase(OutIt, LI.end());
-}
-
-void LiveInterval::constructMainRangeFromSubranges(
- const SlotIndexes &Indexes, VNInfo::Allocator &VNIAllocator) {
- // The basic observations on which this algorithm is based:
- // - Each Def/ValNo in a subrange must have a corresponding def on the main
- // range, but not further defs/valnos are necessary.
- // - If any of the subranges is live at a point the main liverange has to be
- // live too, conversily if no subrange is live the main range mustn't be
- // live either.
- // We do this by scanning through all the subranges simultaneously creating new
- // segments in the main range as segments start/ends come up in the subranges.
- assert(hasSubRanges() && "expected subranges to be present");
- assert(segments.empty() && valnos.empty() && "expected empty main range");
-
- // Collect subrange, iterator pairs for the walk and determine first and last
- // SlotIndex involved.
- SmallVector<std::pair<const SubRange*, const_iterator>, 4> SRs;
- SlotIndex First;
- SlotIndex Last;
- for (const SubRange &SR : subranges()) {
- if (SR.empty())
- continue;
- SRs.push_back(std::make_pair(&SR, SR.begin()));
- if (!First.isValid() || SR.segments.front().start < First)
- First = SR.segments.front().start;
- if (!Last.isValid() || SR.segments.back().end > Last)
- Last = SR.segments.back().end;
- }
-
- // Walk over all subranges simultaneously.
- Segment CurrentSegment;
- bool ConstructingSegment = false;
- bool NeedVNIFixup = false;
- LaneBitmask ActiveMask = 0;
- SlotIndex Pos = First;
- while (true) {
- SlotIndex NextPos = Last;
- enum {
- NOTHING,
- BEGIN_SEGMENT,
- END_SEGMENT,
- } Event = NOTHING;
- // Which subregister lanes are affected by the current event.
- LaneBitmask EventMask = 0;
- // Whether a BEGIN_SEGMENT is also a valno definition point.
- bool IsDef = false;
- // Find the next begin or end of a subrange segment. Combine masks if we
- // have multiple begins/ends at the same position. Ends take precedence over
- // Begins.
- for (auto &SRP : SRs) {
- const SubRange &SR = *SRP.first;
- const_iterator &I = SRP.second;
- // Advance iterator of subrange to a segment involving Pos; the earlier
- // segments are already merged at this point.
- while (I != SR.end() &&
- (I->end < Pos ||
- (I->end == Pos && (ActiveMask & SR.LaneMask) == 0)))
- ++I;
- if (I == SR.end())
- continue;
- if ((ActiveMask & SR.LaneMask) == 0 &&
- Pos <= I->start && I->start <= NextPos) {
- // Merge multiple begins at the same position.
- if (I->start == NextPos && Event == BEGIN_SEGMENT) {
- EventMask |= SR.LaneMask;
- IsDef |= I->valno->def == I->start;
- } else if (I->start < NextPos || Event != END_SEGMENT) {
- Event = BEGIN_SEGMENT;
- NextPos = I->start;
- EventMask = SR.LaneMask;
- IsDef = I->valno->def == I->start;
- }
- }
- if ((ActiveMask & SR.LaneMask) != 0 &&
- Pos <= I->end && I->end <= NextPos) {
- // Merge multiple ends at the same position.
- if (I->end == NextPos && Event == END_SEGMENT)
- EventMask |= SR.LaneMask;
- else {
- Event = END_SEGMENT;
- NextPos = I->end;
- EventMask = SR.LaneMask;
- }
- }
- }
-
- // Advance scan position.
- Pos = NextPos;
- if (Event == BEGIN_SEGMENT) {
- if (ConstructingSegment && IsDef) {
- // Finish previous segment because we have to start a new one.
- CurrentSegment.end = Pos;
- append(CurrentSegment);
- ConstructingSegment = false;
- }
-
- // Start a new segment if necessary.
- if (!ConstructingSegment) {
- // Determine value number for the segment.
- VNInfo *VNI;
- if (IsDef) {
- VNI = getNextValue(Pos, VNIAllocator);
- } else {
- // We have to reuse an existing value number. If we are lucky, we already
- // passed one of the predecessor blocks and determined its value number
- // (with blocks in reverse postorder this would always be true, but we
- // have no such guarantee).
- assert(Pos.isBlock());
- const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Pos);
- // See if any of the predecessor blocks has a lower number and a VNI
- for (const MachineBasicBlock *Pred : MBB->predecessors()) {
- SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
- VNI = getVNInfoBefore(PredEnd);
- if (VNI != nullptr)
- break;
- }
- // Def will come later: We have to do an extra fixup pass.
- if (VNI == nullptr)
- NeedVNIFixup = true;
- }
-
- // In rare cases we can produce adjacent segments with the same value
- // number (if they come from different subranges, but happen to have
- // the same defining instruction). VNIFixup will fix those cases.
- if (!empty() && segments.back().end == Pos &&
- segments.back().valno == VNI)
- NeedVNIFixup = true;
- CurrentSegment.start = Pos;
- CurrentSegment.valno = VNI;
- ConstructingSegment = true;
- }
- ActiveMask |= EventMask;
- } else if (Event == END_SEGMENT) {
- assert(ConstructingSegment);
- // Finish segment if no lane is active anymore.
- ActiveMask &= ~EventMask;
- if (ActiveMask == 0) {
- CurrentSegment.end = Pos;
- append(CurrentSegment);
- ConstructingSegment = false;
- }
- } else {
- // We reached the end of the last subrange and can stop.
- assert(Event == NOTHING);
- break;
- }
- }
-
- // We might not be able to assign new valnos for all segments if the basic
- // block containing the definition comes after a segment using the valno.
- // Do a fixup pass for this uncommon case.
- if (NeedVNIFixup)
- determineMissingVNIs(Indexes, *this);
-
- assert(ActiveMask == 0 && !ConstructingSegment && "all segments ended");
- verify();
-}
-
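// A standalone sketch (simplified types, not the LLVM API) of the observation
// stated at the top of constructMainRangeFromSubranges above: the main range
// must be live exactly where some subrange is live, i.e. its coverage is the
// union of all subrange segments. Value numbers are deliberately left out.
#include <algorithm>
#include <utility>
#include <vector>

using Seg = std::pair<int, int>; // half-open [start, end), slots as plain ints

static std::vector<Seg>
unionOfSubranges(const std::vector<std::vector<Seg>> &SubRanges) {
  std::vector<Seg> All, Main;
  for (const auto &SR : SubRanges)
    All.insert(All.end(), SR.begin(), SR.end());
  std::sort(All.begin(), All.end());
  for (const Seg &S : All) {
    if (!Main.empty() && S.first <= Main.back().second)
      Main.back().second = std::max(Main.back().second, S.second);
    else
      Main.push_back(S);
  }
  // e.g. subranges {{0,4},{2,6}} and {{8,9}} yield the main range {{0,6},{8,9}}.
  return Main;
}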
unsigned LiveInterval::getSize() const {
unsigned Sum = 0;
for (const Segment &S : segments)
@@ -1055,12 +825,12 @@ unsigned LiveInterval::getSize() const {
}
raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange::Segment &S) {
- return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ")";
+ return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ')';
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveRange::Segment::dump() const {
- dbgs() << *this << "\n";
+LLVM_DUMP_METHOD void LiveRange::Segment::dump() const {
+ dbgs() << *this << '\n';
}
#endif
@@ -1081,10 +851,10 @@ void LiveRange::print(raw_ostream &OS) const {
for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
++i, ++vnum) {
const VNInfo *vni = *i;
- if (vnum) OS << " ";
- OS << vnum << "@";
+ if (vnum) OS << ' ';
+ OS << vnum << '@';
if (vni->isUnused()) {
- OS << "x";
+ OS << 'x';
} else {
OS << vni->def;
if (vni->isPHIDef())
@@ -1094,22 +864,30 @@ void LiveRange::print(raw_ostream &OS) const {
}
}
+void LiveInterval::SubRange::print(raw_ostream &OS) const {
+ OS << " L" << PrintLaneMask(LaneMask) << ' '
+ << static_cast<const LiveRange&>(*this);
+}
+
void LiveInterval::print(raw_ostream &OS) const {
OS << PrintReg(reg) << ' ';
super::print(OS);
// Print subranges
- for (const SubRange &SR : subranges()) {
- OS << " L" << PrintLaneMask(SR.LaneMask) << ' ' << SR;
- }
+ for (const SubRange &SR : subranges())
+ OS << SR;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveRange::dump() const {
- dbgs() << *this << "\n";
+LLVM_DUMP_METHOD void LiveRange::dump() const {
+ dbgs() << *this << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveInterval::SubRange::dump() const {
+ dbgs() << *this << '\n';
}
-void LiveInterval::dump() const {
- dbgs() << *this << "\n";
+LLVM_DUMP_METHOD void LiveInterval::dump() const {
+ dbgs() << *this << '\n';
}
#endif
@@ -1206,8 +984,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
OS << '\n';
}
-void LiveRangeUpdater::dump() const
-{
+LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {
print(errs());
}
@@ -1405,40 +1182,6 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) {
return EqClass.getNumClasses();
}
-template<typename LiveRangeT, typename EqClassesT>
-static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[],
- EqClassesT VNIClasses) {
- // Move segments to new intervals.
- LiveRange::iterator J = LR.begin(), E = LR.end();
- while (J != E && VNIClasses[J->valno->id] == 0)
- ++J;
- for (LiveRange::iterator I = J; I != E; ++I) {
- if (unsigned eq = VNIClasses[I->valno->id]) {
- assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) &&
- "New intervals should be empty");
- SplitLRs[eq-1]->segments.push_back(*I);
- } else
- *J++ = *I;
- }
- LR.segments.erase(J, E);
-
- // Transfer VNInfos to their new owners and renumber them.
- unsigned j = 0, e = LR.getNumValNums();
- while (j != e && VNIClasses[j] == 0)
- ++j;
- for (unsigned i = j; i != e; ++i) {
- VNInfo *VNI = LR.getValNumInfo(i);
- if (unsigned eq = VNIClasses[i]) {
- VNI->id = SplitLRs[eq-1]->getNumValNums();
- SplitLRs[eq-1]->valnos.push_back(VNI);
- } else {
- VNI->id = j;
- LR.valnos[j++] = VNI;
- }
- }
- LR.valnos.resize(j);
-}
-
void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
MachineRegisterInfo &MRI) {
// Rewrite instructions.
@@ -1453,9 +1196,9 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
// called, but it is not a requirement.
SlotIndex Idx;
if (MI->isDebugValue())
- Idx = LIS.getSlotIndexes()->getIndexBefore(MI);
+ Idx = LIS.getSlotIndexes()->getIndexBefore(*MI);
else
- Idx = LIS.getInstructionIndex(MI);
+ Idx = LIS.getInstructionIndex(*MI);
LiveQueryResult LRQ = LI.Query(Idx);
const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
// In the case of an <undef> use that isn't tied to any def, VNI will be
@@ -1482,15 +1225,20 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
SubRanges.resize(NumComponents-1, nullptr);
for (unsigned I = 0; I < NumValNos; ++I) {
const VNInfo &VNI = *SR.valnos[I];
- const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def);
- assert(MainRangeVNI != nullptr
- && "SubRange def must have corresponding main range def");
- unsigned ComponentNum = getEqClass(MainRangeVNI);
- VNIMapping.push_back(ComponentNum);
- if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) {
- SubRanges[ComponentNum-1]
- = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask);
+ unsigned ComponentNum;
+ if (VNI.isUnused()) {
+ ComponentNum = 0;
+ } else {
+ const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def);
+ assert(MainRangeVNI != nullptr
+ && "SubRange def must have corresponding main range def");
+ ComponentNum = getEqClass(MainRangeVNI);
+ if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) {
+ SubRanges[ComponentNum-1]
+ = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask);
+ }
}
+ VNIMapping.push_back(ComponentNum);
}
DistributeRange(SR, SubRanges.data(), VNIMapping);
}
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index a506e0571c09..5f3281f6771d 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -9,15 +9,13 @@
//
// This file implements the LiveInterval analysis pass which is used
// by the Linear Scan Register allocator. This pass linearizes the
-// basic blocks of the function in DFS order and uses the
-// LiveVariables pass to conservatively compute live intervals for
+// basic blocks of the function in DFS order and computes live intervals for
// each virtual and physical register.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "LiveRangeCalc.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
@@ -38,7 +36,6 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cmath>
-#include <limits>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
@@ -48,7 +45,6 @@ char &llvm::LiveIntervalsID = LiveIntervals::ID;
INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals",
"Live Interval Analysis", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
@@ -77,10 +73,6 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
- // LiveVariables isn't really required by this analysis, it is only required
- // here to make sure it is live during TwoAddressInstructionPass and
- // PHIElimination. This is temporary.
- AU.addRequired<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreservedID(MachineLoopInfoID);
AU.addRequiredTransitiveID(MachineDominatorsID);
@@ -197,16 +189,9 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) {
void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
assert(LRCalc && "LRCalc not initialized.");
assert(LI.empty() && "Should only compute empty intervals.");
- bool ShouldTrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(LI.reg);
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- LRCalc->calculate(LI, ShouldTrackSubRegLiveness);
- bool SeparatedComponents = computeDeadValues(LI, nullptr);
- if (SeparatedComponents) {
- assert(ShouldTrackSubRegLiveness
- && "Separated components should only occur for unused subreg defs");
- SmallVector<LiveInterval*, 8> SplitLIs;
- splitSeparateComponents(LI, SplitLIs);
- }
+ LRCalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
+ computeDeadValues(LI, nullptr);
}
void LiveIntervals::computeVirtRegs() {
@@ -236,14 +221,18 @@ void LiveIntervals::computeRegMasks() {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isRegMask())
continue;
- RegMaskSlots.push_back(Indexes->getInstructionIndex(&MI).getRegSlot());
+ RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot());
RegMaskBits.push_back(MO.getRegMask());
}
}
- // Some block ends, such as funclet returns, create masks.
+ // Some block ends, such as funclet returns, create masks. Put the mask on
+ // the last instruction of the block, because MBB slot index intervals are
+ // half-open.
if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) {
- RegMaskSlots.push_back(Indexes->getMBBEndIdx(&MBB));
+ assert(!MBB.empty() && "empty return block?");
+ RegMaskSlots.push_back(
+ Indexes->getInstructionIndex(MBB.back()).getRegSlot());
RegMaskBits.push_back(Mask);
}
@@ -439,7 +428,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
MachineInstr *UseMI = &*(I++);
if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
continue;
- SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+ SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
LiveQueryResult LRQ = li->Query(Idx);
VNInfo *VNI = LRQ.valueIn();
if (!VNI) {
@@ -485,13 +474,11 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
// Is the register live before? Otherwise we may have to add a read-undef
// flag for subregister defs.
- bool DeadBeforeDef = false;
unsigned VReg = LI.reg;
if (MRI->shouldTrackSubRegLiveness(VReg)) {
if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
MachineInstr *MI = getInstructionFromIndex(Def);
MI->setRegisterDefReadUndef(VReg);
- DeadBeforeDef = true;
}
}
@@ -507,15 +494,7 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
// This is a dead def. Make sure the instruction knows.
MachineInstr *MI = getInstructionFromIndex(Def);
assert(MI && "No instruction defining live value");
- MI->addRegisterDead(VReg, TRI);
-
- // If we have a dead def that is completely separate from the rest of
- // the liverange then we rewrite it to use a different VReg to not violate
- // the rule that the liveness of a virtual register forms a connected
- // component. This should only happen if subregister liveness is tracked.
- if (DeadBeforeDef)
- MayHaveSplitComponents = true;
-
+ MI->addRegisterDead(LI.reg, TRI);
if (dead && MI->allDefsAreDead()) {
DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
dead->push_back(MI);
@@ -547,7 +526,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg)
continue;
}
// We only need to visit each instruction once.
- SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+ SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
if (Idx == LastIdx)
continue;
LastIdx = Idx;
@@ -585,9 +564,9 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg)
continue;
if (VNI->isPHIDef()) {
// This is a dead PHI. Remove it.
+ DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
VNI->markUnused();
SR.removeSegment(*Segment);
- DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
}
}
@@ -837,24 +816,22 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
return false;
}
-float
-LiveIntervals::getSpillWeight(bool isDef, bool isUse,
- const MachineBlockFrequencyInfo *MBFI,
- const MachineInstr *MI) {
- BlockFrequency Freq = MBFI->getBlockFreq(MI->getParent());
+float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
+ const MachineBlockFrequencyInfo *MBFI,
+ const MachineInstr &MI) {
+ BlockFrequency Freq = MBFI->getBlockFreq(MI.getParent());
const float Scale = 1.0f / MBFI->getEntryFreq();
return (isDef + isUse) * (Freq.getFrequency() * Scale);
}
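// Worked example with made-up numbers: if getEntryFreq() is 8 and the block's
// frequency is 32, Scale is 1/8 and the relative frequency is 4.0; an operand
// that both reads and defines the register (isDef + isUse == 2) then gets a
// spill weight of 2 * 4.0 = 8.0.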
LiveRange::Segment
-LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr* startInst) {
+LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) {
LiveInterval& Interval = createEmptyInterval(reg);
- VNInfo* VN = Interval.getNextValue(
- SlotIndex(getInstructionIndex(startInst).getRegSlot()),
- getVNInfoAllocator());
- LiveRange::Segment S(
- SlotIndex(getInstructionIndex(startInst).getRegSlot()),
- getMBBEndIdx(startInst->getParent()), VN);
+ VNInfo *VN = Interval.getNextValue(
+ SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getVNInfoAllocator());
+ LiveRange::Segment S(SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getMBBEndIdx(startInst.getParent()), VN);
Interval.addSegment(S);
return S;
@@ -962,10 +939,13 @@ public:
hasRegMask = true;
if (!MO.isReg())
continue;
- // Aggressively clear all kill flags.
- // They are reinserted by VirtRegRewriter.
- if (MO.isUse())
+ if (MO.isUse()) {
+ if (!MO.readsReg())
+ continue;
+ // Aggressively clear all kill flags.
+ // They are reinserted by VirtRegRewriter.
MO.setIsKill(false);
+ }
unsigned Reg = MO.getReg();
if (!Reg)
@@ -1021,172 +1001,296 @@ private:
}
/// Update LR to reflect an instruction has been moved downwards from OldIdx
- /// to NewIdx.
- ///
- /// 1. Live def at OldIdx:
- /// Move def to NewIdx, assert endpoint after NewIdx.
- ///
- /// 2. Live def at OldIdx, killed at NewIdx:
- /// Change to dead def at NewIdx.
- /// (Happens when bundling def+kill together).
- ///
- /// 3. Dead def at OldIdx:
- /// Move def to NewIdx, possibly across another live value.
- ///
- /// 4. Def at OldIdx AND at NewIdx:
- /// Remove segment [OldIdx;NewIdx) and value defined at OldIdx.
- /// (Happens when bundling multiple defs together).
- ///
- /// 5. Value read at OldIdx, killed before NewIdx:
- /// Extend kill to NewIdx.
- ///
+ /// to NewIdx (OldIdx < NewIdx).
void handleMoveDown(LiveRange &LR) {
- // First look for a kill at OldIdx.
- LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
LiveRange::iterator E = LR.end();
- // Is LR even live at OldIdx?
- if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
return;
- // Handle a live-in value.
- if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
- bool isKill = SlotIndex::isSameInstr(OldIdx, I->end);
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
// If the live-in value already extends to NewIdx, there is nothing to do.
- if (!SlotIndex::isEarlierInstr(I->end, NewIdx))
+ if (SlotIndex::isEarlierEqualInstr(NewIdx, OldIdxIn->end))
return;
// Aggressively remove all kill flags from the old kill point.
// Kill flags shouldn't be used while live intervals exist, they will be
// reinserted by VirtRegRewriter.
- if (MachineInstr *KillMI = LIS.getInstructionFromIndex(I->end))
- for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(OldIdxIn->end))
+ for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO)
if (MO->isReg() && MO->isUse())
MO->setIsKill(false);
- // Adjust I->end to reach NewIdx. This may temporarily make LR invalid by
- // overlapping ranges. Case 5 above.
- I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
- // If this was a kill, there may also be a def. Otherwise we're done.
+
+ // Is there a def before NewIdx which is not OldIdx?
+ LiveRange::iterator Next = std::next(OldIdxIn);
+ if (Next != E && !SlotIndex::isSameInstr(OldIdx, Next->start) &&
+ SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // If we are here then OldIdx was just a use but not a def. We only have
+ // to ensure liveness extends to NewIdx.
+ LiveRange::iterator NewIdxIn =
+ LR.advanceTo(Next, NewIdx.getBaseIndex());
+ // Extend the segment before NewIdx if necessary.
+ if (NewIdxIn == E ||
+ !SlotIndex::isEarlierInstr(NewIdxIn->start, NewIdx)) {
+ LiveRange::iterator Prev = std::prev(NewIdxIn);
+ Prev->end = NewIdx.getRegSlot();
+ }
+ return;
+ }
+
+ // Adjust OldIdxIn->end to reach NewIdx. This may temporarily make LR
+ // invalid by overlapping ranges.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ OldIdxIn->end = NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber());
+ // If this was not a kill, then there was no def and we're done.
if (!isKill)
return;
- ++I;
+
+ // Did we have a Def at OldIdx?
+ OldIdxOut = Next;
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
+ return;
+ } else {
+ OldIdxOut = OldIdxIn;
}
- // Check for a def at OldIdx.
- if (I == E || !SlotIndex::isSameInstr(OldIdx, I->start))
- return;
- // We have a def at OldIdx.
- VNInfo *DefVNI = I->valno;
- assert(DefVNI->def == I->start && "Inconsistent def");
- DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
- // If the defined value extends beyond NewIdx, just move the def down.
- // This is case 1 above.
- if (SlotIndex::isEarlierInstr(NewIdx, I->end)) {
- I->start = DefVNI->def;
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+
+ // If the defined value extends beyond NewIdx, just move the beginning
+ // of the segment to NewIdx.
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ if (SlotIndex::isEarlierInstr(NewIdxDef, OldIdxOut->end)) {
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = OldIdxVNI->def;
return;
}
- // The remaining possibilities are now:
- // 2. Live def at OldIdx, killed at NewIdx: isSameInstr(I->end, NewIdx).
- // 3. Dead def at OldIdx: I->end = OldIdx.getDeadSlot().
- // In either case, it is possible that there is an existing def at NewIdx.
- assert((I->end == OldIdx.getDeadSlot() ||
- SlotIndex::isSameInstr(I->end, NewIdx)) &&
- "Cannot move def below kill");
- LiveRange::iterator NewI = LR.advanceTo(I, NewIdx.getRegSlot());
- if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
- // There is an existing def at NewIdx, case 4 above. The def at OldIdx is
- // coalesced into that value.
- assert(NewI->valno != DefVNI && "Multiple defs of value?");
- LR.removeValNo(DefVNI);
+
+ // If we are here then we have a Definition at OldIdx which ends before
+ // NewIdx.
+
+ // Is there an existing Def at NewIdx?
+ LiveRange::iterator AfterNewIdx
+ = LR.advanceTo(OldIdxOut, NewIdx.getRegSlot());
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+ if (!OldIdxDefIsDead &&
+ SlotIndex::isEarlierInstr(OldIdxOut->end, NewIdxDef)) {
+ // OldIdx is not a dead def, and NewIdxDef is inside a new interval.
+ VNInfo *DefVNI;
+ if (OldIdxOut != LR.begin() &&
+ !SlotIndex::isEarlierInstr(std::prev(OldIdxOut)->end,
+ OldIdxOut->start)) {
+ // There is no gap between OldIdxOut and its predecessor anymore,
+ // merge them.
+ LiveRange::iterator IPrev = std::prev(OldIdxOut);
+ DefVNI = OldIdxVNI;
+ IPrev->end = OldIdxOut->end;
+ } else {
+ // The value is live-in to OldIdx.
+ LiveRange::iterator INext = std::next(OldIdxOut);
+ assert(INext != E && "Must have following segment");
+ // We merge OldIdxOut and its successor. As we're dealing with subreg
+ // reordering, there is always a successor to OldIdxOut in the same BB.
+ // We don't need INext->valno anymore and will reuse it for the new segment
+ // we create later.
+ DefVNI = OldIdxVNI;
+ INext->start = OldIdxOut->end;
+ INext->valno->def = INext->start;
+ }
+ // If NewIdx is behind the last segment, extend that and append a new one.
+ if (AfterNewIdx == E) {
+ // OldIdxOut is undef at this point. Slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn -| end
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS -| end
+ std::copy(std::next(OldIdxOut), E, OldIdxOut);
+ // The last segment is undefined now, reuse it for a dead def.
+ LiveRange::iterator NewSegment = std::prev(E);
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ DefVNI);
+ DefVNI->def = NewIdxDef;
+
+ LiveRange::iterator Prev = std::prev(NewSegment);
+ Prev->end = NewIdxDef;
+ } else {
+ // OldIdxOut is undef at this point. Slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn/AfterNewIdx -| |- Next -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- Xn/AfterNewIdx -| |- Next -|
+ std::copy(std::next(OldIdxOut), std::next(AfterNewIdx), OldIdxOut);
+ LiveRange::iterator Prev = std::prev(AfterNewIdx);
+ // We have two cases:
+ if (SlotIndex::isEarlierInstr(Prev->start, NewIdxDef)) {
+ // Case 1: NewIdx is inside a liverange. Split this liverange at
+ // NewIdxDef into the segment "Prev" followed by "NewSegment".
+ LiveRange::iterator NewSegment = AfterNewIdx;
+ *NewSegment = LiveRange::Segment(NewIdxDef, Prev->end, Prev->valno);
+ Prev->valno->def = NewIdxDef;
+
+ *Prev = LiveRange::Segment(Prev->start, NewIdxDef, DefVNI);
+ DefVNI->def = Prev->start;
+ } else {
+ // Case 2: NewIdx is in a lifetime hole. Keep AfterNewIdx as is and
+ // turn Prev into a segment from NewIdx to AfterNewIdx->start.
+ *Prev = LiveRange::Segment(NewIdxDef, AfterNewIdx->start, DefVNI);
+ DefVNI->def = NewIdxDef;
+ assert(DefVNI != AfterNewIdx->valno);
+ }
+ }
return;
}
- // There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
- // If the def at OldIdx was dead, we allow it to be moved across other LR
- // values. The new range should be placed immediately before NewI, move any
- // intermediate ranges up.
- assert(NewI != I && "Inconsistent iterators");
- std::copy(std::next(I), NewI, I);
- *std::prev(NewI)
- = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+
+ if (AfterNewIdx != E &&
+ SlotIndex::isSameInstr(AfterNewIdx->start, NewIdxDef)) {
+ // There is an existing def at NewIdx. The def at OldIdx is coalesced into
+ // that value.
+ assert(AfterNewIdx->valno != OldIdxVNI && "Multiple defs of value?");
+ LR.removeValNo(OldIdxVNI);
+ } else {
+ // There was no existing def at NewIdx. We need to create a dead def
+ // at NewIdx. Shift segments over the old OldIdxOut segment; this frees
+ // a new segment at the place where we want to construct the dead def.
+ // |- OldIdxOut -| |- X0 -| ... |- Xn -| |- AfterNewIdx -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS. -| |- AfterNewIdx -|
+ assert(AfterNewIdx != OldIdxOut && "Inconsistent iterators");
+ std::copy(std::next(OldIdxOut), AfterNewIdx, OldIdxOut);
+ // We can reuse OldIdxVNI now.
+ LiveRange::iterator NewSegment = std::prev(AfterNewIdx);
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ NewSegmentVNI->def = NewIdxDef;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ }
}
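// A simplified illustration (plain ints instead of SlotIndexes, one value per
// segment, no coalescing) of the two simplest move-down situations handled
// above: a use killed at OldIdx has its segment end extended to NewIdx, and a
// def at OldIdx whose value stays live past NewIdx only has its start moved.
struct SimpleSeg { int Start, End; }; // half-open [Start, End)

static void moveInstrDown(SimpleSeg &S, int OldIdx, int NewIdx) {
  // Assumes OldIdx < NewIdx and both indexes stay within the same block.
  if (S.End == OldIdx)                          // value killed at OldIdx
    S.End = NewIdx;                             // extend liveness to the new use
  else if (S.Start == OldIdx && NewIdx < S.End) // def at OldIdx, still live
    S.Start = NewIdx;                           // only the def point moves
}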
/// Update LR to reflect an instruction has been moved upwards from OldIdx
- /// to NewIdx.
- ///
- /// 1. Live def at OldIdx:
- /// Hoist def to NewIdx.
- ///
- /// 2. Dead def at OldIdx:
- /// Hoist def+end to NewIdx, possibly move across other values.
- ///
- /// 3. Dead def at OldIdx AND existing def at NewIdx:
- /// Remove value defined at OldIdx, coalescing it with existing value.
- ///
- /// 4. Live def at OldIdx AND existing def at NewIdx:
- /// Remove value defined at NewIdx, hoist OldIdx def to NewIdx.
- /// (Happens when bundling multiple defs together).
- ///
- /// 5. Value killed at OldIdx:
- /// Hoist kill to NewIdx, then scan for last kill between NewIdx and
- /// OldIdx.
- ///
+ /// to NewIdx (NewIdx < OldIdx).
void handleMoveUp(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
- // First look for a kill at OldIdx.
- LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
LiveRange::iterator E = LR.end();
- // Is LR even live at OldIdx?
- if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
return;
- // Handle a live-in value.
- if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
- // If the live-in value isn't killed here, there is nothing to do.
- if (!SlotIndex::isSameInstr(OldIdx, I->end))
- return;
- // Adjust I->end to end at NewIdx. If we are hoisting a kill above
- // another use, we need to search for that use. Case 5 above.
- I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
- ++I;
- // If OldIdx also defines a value, there couldn't have been another use.
- if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
- // No def, search for the new kill.
- // This can never be an early clobber kill since there is no def.
- std::prev(I)->end = findLastUseBefore(Reg, LaneMask).getRegSlot();
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
+ // If the live-in value isn't killed here, then we have no Def at
+ // OldIdx; moreover, the value must be live at NewIdx, so there is nothing
+ // to do.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ if (!isKill)
return;
- }
- }
- // Now deal with the def at OldIdx.
- assert(I != E && SlotIndex::isSameInstr(I->start, OldIdx) && "No def?");
- VNInfo *DefVNI = I->valno;
- assert(DefVNI->def == I->start && "Inconsistent def");
- DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
-
- // Check for an existing def at NewIdx.
- LiveRange::iterator NewI = LR.find(NewIdx.getRegSlot());
- if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
- assert(NewI->valno != DefVNI && "Same value defined more than once?");
- // There is an existing def at NewIdx.
- if (I->end.isDead()) {
- // Case 3: Remove the dead def at OldIdx.
- LR.removeValNo(DefVNI);
+ // At this point we have to move OldIdxIn->end back to the nearest
+ // previous use or (dead-)def but no further than NewIdx.
+ SlotIndex DefBeforeOldIdx
+ = std::max(OldIdxIn->start.getDeadSlot(),
+ NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()));
+ OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask);
+
+ // Did we have a Def at OldIdx? If not we are done now.
+ OldIdxOut = std::next(OldIdxIn);
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
return;
- }
- // Case 4: Replace def at NewIdx with live def at OldIdx.
- I->start = DefVNI->def;
- LR.removeValNo(NewI->valno);
- return;
+ } else {
+ OldIdxOut = OldIdxIn;
+ OldIdxIn = OldIdxOut != LR.begin() ? std::prev(OldIdxOut) : E;
}
- // There is no existing def at NewIdx. Hoist DefVNI.
- if (!I->end.isDead()) {
- // Leave the end point of a live def.
- I->start = DefVNI->def;
- return;
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+
+ // Is there an existing def at NewIdx?
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ LiveRange::iterator NewIdxOut = LR.find(NewIdx.getRegSlot());
+ if (SlotIndex::isSameInstr(NewIdxOut->start, NewIdx)) {
+ assert(NewIdxOut->valno != OldIdxVNI &&
+ "Same value defined more than once?");
+ // If OldIdx was a dead def remove it.
+ if (!OldIdxDefIsDead) {
+ // Remove segment starting at NewIdx and move begin of OldIdxOut to
+ // NewIdx so it can take its place.
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = NewIdxDef;
+ LR.removeValNo(NewIdxOut->valno);
+ } else {
+ // Simply remove the dead def at OldIdx.
+ LR.removeValNo(OldIdxVNI);
+ }
+ } else {
+ // Previously nothing was live after NewIdx, so all we have to do now is
+ // move the begin of OldIdxOut to NewIdx.
+ if (!OldIdxDefIsDead) {
+ // Do we have any intermediate Defs between OldIdx and NewIdx?
+ if (OldIdxIn != E &&
+ SlotIndex::isEarlierInstr(NewIdxDef, OldIdxIn->start)) {
+ // OldIdx is not a dead def and NewIdx is before predecessor start.
+ LiveRange::iterator NewIdxIn = NewIdxOut;
+ assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
+ const SlotIndex SplitPos = NewIdxDef;
+
+ // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+ *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
+ OldIdxIn->valno);
+ // OldIdxIn and OldIdxVNI are now undef and can be overridden.
+ // We slide [NewIdxIn, OldIdxIn) down one position.
+ // |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
+ // => |- undef/NewIdxIn -| |- X0 -| ... |- Xn-1 -| |- Xn/OldIdxOut -|
+ std::copy_backward(NewIdxIn, OldIdxIn, OldIdxOut);
+ // NewIdxIn is now considered undef so we can reuse it for the moved
+ // value.
+ LiveRange::iterator NewSegment = NewIdxIn;
+ LiveRange::iterator Next = std::next(NewSegment);
+ if (SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // There is no gap between NewSegment and its predecessor.
+ *NewSegment = LiveRange::Segment(Next->start, SplitPos,
+ Next->valno);
+ *Next = LiveRange::Segment(SplitPos, Next->end, OldIdxVNI);
+ Next->valno->def = SplitPos;
+ } else {
+ // There is a gap between NewSegment and its predecessor, so the value
+ // becomes live-in.
+ *NewSegment = LiveRange::Segment(SplitPos, Next->start, OldIdxVNI);
+ NewSegment->valno->def = SplitPos;
+ }
+ } else {
+ // Leave the end point of a live def.
+ OldIdxOut->start = NewIdxDef;
+ OldIdxVNI->def = NewIdxDef;
+ if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
+ OldIdxIn->end = NewIdx.getRegSlot();
+ }
+ } else {
+ // OldIdxVNI is a dead def. It may have been moved across other values
+ // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+ // down one position.
+ // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - |
+ // => |- undef/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+ std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+ // OldIdxVNI can be reused now to build a new dead def segment.
+ LiveRange::iterator NewSegment = NewIdxOut;
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ NewSegmentVNI->def = NewIdxDef;
+ }
}
-
- // DefVNI is a dead def. It may have been moved across other values in LR,
- // so move I up to NewI. Slide [NewI;I) down one position.
- std::copy_backward(NewI, I, std::next(I));
- *NewI = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
}
void updateRegMaskSlots() {
@@ -1205,29 +1309,31 @@ private:
}
// Return the last use of reg between NewIdx and OldIdx.
- SlotIndex findLastUseBefore(unsigned Reg, LaneBitmask LaneMask) {
-
+ SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg,
+ LaneBitmask LaneMask) {
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- SlotIndex LastUse = NewIdx;
+ SlotIndex LastUse = Before;
for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
unsigned SubReg = MO.getSubReg();
if (SubReg != 0 && LaneMask != 0
&& (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
continue;
- const MachineInstr *MI = MO.getParent();
+ const MachineInstr &MI = *MO.getParent();
SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
if (InstSlot > LastUse && InstSlot < OldIdx)
- LastUse = InstSlot;
+ LastUse = InstSlot.getRegSlot();
}
return LastUse;
}
// This is a regunit interval, so scanning the use list could be very
// expensive. Scan upwards from OldIdx instead.
- assert(NewIdx < OldIdx && "Expected upwards move");
+ assert(Before < OldIdx && "Expected upwards move");
SlotIndexes *Indexes = LIS.getSlotIndexes();
- MachineBasicBlock *MBB = Indexes->getMBBFromIndex(NewIdx);
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before);
// OldIdx may not correspond to an instruction any longer, so set MII to
// point to the next instruction after OldIdx, or MBB->end().
@@ -1241,44 +1347,44 @@ private:
while (MII != Begin) {
if ((--MII)->isDebugValue())
continue;
- SlotIndex Idx = Indexes->getInstructionIndex(MII);
+ SlotIndex Idx = Indexes->getInstructionIndex(*MII);
- // Stop searching when NewIdx is reached.
- if (!SlotIndex::isEarlierInstr(NewIdx, Idx))
- return NewIdx;
+ // Stop searching when Before is reached.
+ if (!SlotIndex::isEarlierInstr(Before, Idx))
+ return Before;
// Check if MII uses Reg.
- for (MIBundleOperands MO(MII); MO.isValid(); ++MO)
- if (MO->isReg() &&
+ for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
+ if (MO->isReg() && !MO->isUndef() &&
TargetRegisterInfo::isPhysicalRegister(MO->getReg()) &&
TRI.hasRegUnit(MO->getReg(), Reg))
- return Idx;
+ return Idx.getRegSlot();
}
- // Didn't reach NewIdx. It must be the first instruction in the block.
- return NewIdx;
+ // Didn't reach Before. It must be the first instruction in the block.
+ return Before;
}
};
-void LiveIntervals::handleMove(MachineInstr* MI, bool UpdateFlags) {
- assert(!MI->isBundled() && "Can't handle bundled instructions yet.");
+void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) {
+ assert(!MI.isBundled() && "Can't handle bundled instructions yet.");
SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
Indexes->removeMachineInstrFromMaps(MI);
SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
- assert(getMBBStartIdx(MI->getParent()) <= OldIndex &&
- OldIndex < getMBBEndIdx(MI->getParent()) &&
+ assert(getMBBStartIdx(MI.getParent()) <= OldIndex &&
+ OldIndex < getMBBEndIdx(MI.getParent()) &&
"Cannot handle moves across basic block boundaries.");
HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
- HME.updateAllRanges(MI);
+ HME.updateAllRanges(&MI);
}
-void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI,
- MachineInstr* BundleStart,
+void LiveIntervals::handleMoveIntoBundle(MachineInstr &MI,
+ MachineInstr &BundleStart,
bool UpdateFlags) {
SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
- HME.updateAllRanges(MI);
+ HME.updateAllRanges(&MI);
}
void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
@@ -1295,8 +1401,8 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
- MachineInstr *MI = I;
- if (MI->isDebugValue())
+ MachineInstr &MI = *I;
+ if (MI.isDebugValue())
continue;
SlotIndex instrIdx = getInstructionIndex(MI);
@@ -1305,8 +1411,9 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
// FIXME: This doesn't currently handle early-clobber or multiple removed
// defs inside of the region to repair.
- for (MachineInstr::mop_iterator OI = MI->operands_begin(),
- OE = MI->operands_end(); OI != OE; ++OI) {
+ for (MachineInstr::mop_iterator OI = MI.operands_begin(),
+ OE = MI.operands_end();
+ OI != OE; ++OI) {
const MachineOperand &MO = *OI;
if (!MO.isReg() || MO.getReg() != Reg)
continue;
@@ -1376,26 +1483,27 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
ArrayRef<unsigned> OrigRegs) {
// Find anchor points, which are at the beginning/end of blocks or at
// instructions that already have indexes.
- while (Begin != MBB->begin() && !Indexes->hasIndex(Begin))
+ while (Begin != MBB->begin() && !Indexes->hasIndex(*Begin))
--Begin;
- while (End != MBB->end() && !Indexes->hasIndex(End))
+ while (End != MBB->end() && !Indexes->hasIndex(*End))
++End;
SlotIndex endIdx;
if (End == MBB->end())
endIdx = getMBBEndIdx(MBB).getPrevSlot();
else
- endIdx = getInstructionIndex(End);
+ endIdx = getInstructionIndex(*End);
Indexes->repairIndexesInRange(MBB, Begin, End);
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
- MachineInstr *MI = I;
- if (MI->isDebugValue())
+ MachineInstr &MI = *I;
+ if (MI.isDebugValue())
continue;
- for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
+ MOE = MI.operands_end();
+ MOI != MOE; ++MOI) {
if (MOI->isReg() &&
TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
!hasInterval(MOI->getReg())) {
@@ -1459,3 +1567,9 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
}
ConEQ.Distribute(LI, SplitLIs.data(), *MRI);
}
+
+void LiveIntervals::constructMainRangeFromSubranges(LiveInterval &LI) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LRCalc->constructMainRangeFromSubranges(LI);
+}
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index efbbcbe23e15..4e2528f47568 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -43,7 +44,7 @@ void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
/// Remove Defs, add uses. This is the recommended way of calculating liveness.
void LivePhysRegs::stepBackward(const MachineInstr &MI) {
// Remove defined registers and regmask kills from the set.
- for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg()) {
if (!O->isDef())
continue;
@@ -56,8 +57,8 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) {
}
// Add uses to the set.
- for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
- if (!O->isReg() || !O->readsReg() || O->isUndef())
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg())
continue;
unsigned Reg = O->getReg();
if (Reg == 0)
@@ -73,7 +74,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) {
void LivePhysRegs::stepForward(const MachineInstr &MI,
SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
// Remove killed registers from the set.
- for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg()) {
unsigned Reg = O->getReg();
if (Reg == 0)
@@ -120,12 +121,25 @@ void LivePhysRegs::print(raw_ostream &OS) const {
}
/// Dumps the currently live registers to the debug output.
-void LivePhysRegs::dump() const {
+LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
#endif
}
+bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ if (LiveRegs.count(Reg))
+ return false;
+ if (MRI.isReserved(Reg))
+ return false;
+ for (MCRegAliasIterator R(Reg, TRI, false); R.isValid(); ++R) {
+ if (LiveRegs.count(*R))
+ return false;
+ }
+ return true;
+}
+
/// Add live-in registers of basic block \p MBB to \p LiveRegs.
static void addLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) {
for (const auto &LI : MBB.liveins())
@@ -135,40 +149,41 @@ static void addLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) {
/// Add pristine registers to the given \p LiveRegs. This function removes
/// actually saved callee save registers when \p InPrologueEpilogue is false.
static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,
+ const MachineFrameInfo &MFI,
const TargetRegisterInfo &TRI) {
- const MachineFrameInfo &MFI = *MF.getFrameInfo();
- if (!MFI.isCalleeSavedInfoValid())
- return;
-
for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
LiveRegs.addReg(*CSR);
for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
LiveRegs.removeReg(Info.getReg());
}
-void LivePhysRegs::addLiveOuts(const MachineBasicBlock *MBB,
- bool AddPristinesAndCSRs) {
- if (AddPristinesAndCSRs) {
- const MachineFunction &MF = *MBB->getParent();
- addPristines(*this, MF, *TRI);
- if (!MBB->isReturnBlock()) {
+void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ ::addLiveIns(*this, *Succ);
+}
+
+void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ if (MBB.isReturnBlock()) {
// The return block has no successors whose live-ins we could merge
// below. So instead we add the callee saved registers manually.
for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
addReg(*I);
+ } else {
+ addPristines(*this, MF, MFI, *TRI);
}
}
- // To get the live-outs we simply merge the live-ins of all successors.
- for (const MachineBasicBlock *Succ : MBB->successors())
- ::addLiveIns(*this, *Succ);
+ addLiveOutsNoPristines(MBB);
}
-void LivePhysRegs::addLiveIns(const MachineBasicBlock *MBB,
- bool AddPristines) {
- if (AddPristines) {
- const MachineFunction &MF = *MBB->getParent();
- addPristines(*this, MF, *TRI);
- }
- ::addLiveIns(*this, *MBB);
+void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addPristines(*this, MF, MFI, *TRI);
+ ::addLiveIns(*this, MBB);
}
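// A small sketch (simplified block and set types, not the LLVM classes) of
// the rule used above: outside of return blocks the live-out set is simply
// the union of the successors' live-in sets.
#include <set>
#include <vector>

struct SimpleBlock {
  std::set<unsigned> LiveIns;
  std::vector<const SimpleBlock *> Succs;
};

static std::set<unsigned> liveOuts(const SimpleBlock &MBB) {
  std::set<unsigned> LiveOuts;
  for (const SimpleBlock *Succ : MBB.Succs)
    LiveOuts.insert(Succ->LiveIns.begin(), Succ->LiveIns.end());
  return LiveOuts;
}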
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index c408615d42e2..db91ca113dc1 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -42,12 +42,12 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
LiveRange &LR, const MachineOperand &MO) {
- const MachineInstr *MI = MO.getParent();
- SlotIndex DefIdx =
- Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex DefIdx =
+ Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
- // Create the def in LR. This may find an existing def.
- LR.createDeadDef(DefIdx, Alloc);
+ // Create the def in LR. This may find an existing def.
+ LR.createDeadDef(DefIdx, Alloc);
}
void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
@@ -120,13 +120,29 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
extendToUses(S, Reg, S.LaneMask);
}
LI.clear();
- LI.constructMainRangeFromSubranges(*Indexes, *Alloc);
+ constructMainRangeFromSubranges(LI);
} else {
resetLiveOutMap();
extendToUses(LI, Reg, ~0u);
}
}
+void LiveRangeCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
+ // First create dead defs at all defs found in subranges.
+ LiveRange &MainRange = LI;
+ assert(MainRange.segments.empty() && MainRange.valnos.empty() &&
+ "Expect empty main liverange");
+
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ for (const VNInfo *VNI : SR.valnos) {
+ if (!VNI->isUnused() && !VNI->isPHIDef())
+ MainRange.createDeadDef(VNI->def, *Alloc);
+ }
+ }
+
+ resetLiveOutMap();
+ extendToUses(MainRange, LI.reg);
+}
void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
assert(MRI && Indexes && "call reset() first");
@@ -184,7 +200,7 @@ void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg,
// had an early-clobber flag.
isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
}
- UseIdx = Indexes->getInstructionIndex(MI).getRegSlot(isEarlyClobber);
+ UseIdx = Indexes->getInstructionIndex(*MI).getRegSlot(isEarlyClobber);
}
// MI is reading Reg. We may have visited MI before if it happens to be
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index ff38c68820f1..9de48b722881 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -189,6 +189,11 @@ public:
/// enabled.
void calculate(LiveInterval &LI, bool TrackSubRegs);
+ /// For live interval \p LI with correct SubRanges construct matching
+ /// information for the main live range. Expects the main live range to not
+ /// have any segments or value numbers.
+ void constructMainRangeFromSubranges(LiveInterval &LI);
+
//===--------------------------------------------------------------------===//
// Low-level interface.
//===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 5ce364ae661e..b35c0adfacad 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -53,7 +53,7 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
AliasAnalysis *aa) {
assert(DefMI && "Missing instruction");
ScannedRemattable = true;
- if (!TII.isTriviallyReMaterializable(DefMI, aa))
+ if (!TII.isTriviallyReMaterializable(*DefMI, aa))
return false;
Remattable.insert(VNI);
return true;
@@ -63,10 +63,13 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
for (VNInfo *VNI : getParent().valnos) {
if (VNI->isUnused())
continue;
- MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+ unsigned Original = VRM->getOriginal(getReg());
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
if (!DefMI)
continue;
- checkRematerializable(VNI, DefMI, aa);
+ checkRematerializable(OrigVNI, DefMI, aa);
}
ScannedRemattable = true;
}
@@ -113,27 +116,21 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
return true;
}
-bool LiveRangeEdit::canRematerializeAt(Remat &RM,
- SlotIndex UseIdx,
- bool cheapAsAMove) {
+bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI,
+ SlotIndex UseIdx, bool cheapAsAMove) {
assert(ScannedRemattable && "Call anyRematerializable first");
// Use scanRemattable info.
- if (!Remattable.count(RM.ParentVNI))
+ if (!Remattable.count(OrigVNI))
return false;
// No defining instruction provided.
SlotIndex DefIdx;
- if (RM.OrigMI)
- DefIdx = LIS.getInstructionIndex(RM.OrigMI);
- else {
- DefIdx = RM.ParentVNI->def;
- RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
- assert(RM.OrigMI && "No defining instruction for remattable value");
- }
+ assert(RM.OrigMI && "No defining instruction for remattable value");
+ DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
// If only cheap remats were requested, bail out early.
- if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI))
+ if (cheapAsAMove && !TII.isAsCheapAsAMove(*RM.OrigMI))
return false;
// Verify that all used registers are available with the same values.
@@ -150,10 +147,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
const TargetRegisterInfo &tri,
bool Late) {
assert(RM.OrigMI && "Invalid remat");
- TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri);
+ TII.reMaterialize(MBB, MI, DestReg, 0, *RM.OrigMI, tri);
+ // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg
+ // to false anyway in case the isDead flag of RM.OrigMI's dest register
+ // is true.
+ (*--MI).getOperand(0).setIsDead(false);
Rematted.insert(RM.ParentVNI);
- return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late)
- .getRegSlot();
+ return LIS.getSlotIndexes()->insertMachineInstrInMaps(*MI, Late).getRegSlot();
}
void LiveRangeEdit::eraseVirtReg(unsigned Reg) {
@@ -188,9 +188,8 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
// Since we're moving the DefMI load, make sure we're not extending any live
// ranges.
- if (!allUsesAvailableAt(DefMI,
- LIS.getInstructionIndex(DefMI),
- LIS.getInstructionIndex(UseMI)))
+ if (!allUsesAvailableAt(DefMI, LIS.getInstructionIndex(*DefMI),
+ LIS.getInstructionIndex(*UseMI)))
return false;
// We also need to make sure it is safe to move the load.
@@ -206,11 +205,11 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
return false;
- MachineInstr *FoldMI = TII.foldMemoryOperand(UseMI, Ops, DefMI);
+ MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
if (!FoldMI)
return false;
DEBUG(dbgs() << " folded: " << *FoldMI);
- LIS.ReplaceMachineInstrInMaps(UseMI, FoldMI);
+ LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
UseMI->eraseFromParent();
DefMI->addRegisterDead(LI->reg, nullptr);
Dead.push_back(DefMI);
@@ -220,7 +219,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
const MachineOperand &MO) const {
- const MachineInstr *MI = MO.getParent();
+ const MachineInstr &MI = *MO.getParent();
SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
if (LI.Query(Idx).isKill())
return true;
@@ -235,9 +234,10 @@ bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
}
/// Find all live intervals that need to shrink, then remove the instruction.
-void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
+void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
+ AliasAnalysis *AA) {
assert(MI->allDefsAreDead() && "Def isn't really dead");
- SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
// Never delete a bundled instruction.
if (MI->isBundled()) {
@@ -261,6 +261,20 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
// Collect virtual registers to be erased after MI is gone.
SmallVector<unsigned, 8> RegsToErase;
bool ReadsPhysRegs = false;
+ bool isOrigDef = false;
+ unsigned Dest;
+ if (VRM && MI->getOperand(0).isReg()) {
+ Dest = MI->getOperand(0).getReg();
+ unsigned Original = VRM->getOriginal(Dest);
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+ // The original live-range may have been shrunk to an empty live-range.
+ // This happens when it is dead, but we still keep it around to be able
+ // to rematerialize other values that depend on it.
+ if (OrigVNI)
+ isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+ }
// Check for live intervals that may shrink
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
@@ -314,11 +328,27 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
}
DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
} else {
- if (TheDelegate)
- TheDelegate->LRE_WillEraseInstruction(MI);
- LIS.RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
- ++NumDCEDeleted;
+ // If the dest of MI is an original reg and MI is reMaterializable,
+ // don't delete the inst. Replace the dest with a new reg, and keep
+ // the inst for remat of other siblings. The inst is saved in
+ // LiveRangeEdit::DeadRemats and will be deleted after all the
+ // allocations of the func are done.
+ if (isOrigDef && DeadRemats && TII.isTriviallyReMaterializable(*MI, AA)) {
+ LiveInterval &NewLI = createEmptyIntervalFrom(Dest);
+ VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
+ NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
+ pop_back();
+ markDeadRemat(MI);
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ MI->substituteRegister(Dest, NewLI.reg, 0, TRI);
+ MI->getOperand(0).setIsDead(true);
+ } else {
+ if (TheDelegate)
+ TheDelegate->LRE_WillEraseInstruction(MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ ++NumDCEDeleted;
+ }
}
// Erase any virtregs that are now empty and unused. There may be <undef>
@@ -332,14 +362,15 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
}
}
-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
- ArrayRef<unsigned> RegsBeingSpilled) {
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+ ArrayRef<unsigned> RegsBeingSpilled,
+ AliasAnalysis *AA) {
ToShrinkSet ToShrink;
for (;;) {
// Erase all dead defs.
while (!Dead.empty())
- eliminateDeadDef(Dead.pop_back_val(), ToShrink);
+ eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA);
if (ToShrink.empty())
break;
diff --git a/lib/CodeGen/LiveRangeUtils.h b/lib/CodeGen/LiveRangeUtils.h
new file mode 100644
index 000000000000..bd57609c3d84
--- /dev/null
+++ b/lib/CodeGen/LiveRangeUtils.h
@@ -0,0 +1,62 @@
+//===-- LiveRangeUtils.h - Live Range modification utilities ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// This file contains helper functions to modify live ranges.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+#define LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Helper function that distributes live range value numbers and the
+/// corresponding segments of a master live range \p LR to a list of newly
+/// created live ranges \p SplitLRs. \p VNIClasses maps each value number in
+/// \p LR to 0, meaning it stays in \p LR, or to 1..N, meaning it moves to the
+/// corresponding live range in the \p SplitLRs array.
+template<typename LiveRangeT, typename EqClassesT>
+static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[],
+ EqClassesT VNIClasses) {
+ // Move segments to new intervals.
+ typename LiveRangeT::iterator J = LR.begin(), E = LR.end();
+ while (J != E && VNIClasses[J->valno->id] == 0)
+ ++J;
+ for (typename LiveRangeT::iterator I = J; I != E; ++I) {
+ if (unsigned eq = VNIClasses[I->valno->id]) {
+ assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) &&
+ "New intervals should be empty");
+ SplitLRs[eq-1]->segments.push_back(*I);
+ } else
+ *J++ = *I;
+ }
+ LR.segments.erase(J, E);
+
+ // Transfer VNInfos to their new owners and renumber them.
+ unsigned j = 0, e = LR.getNumValNums();
+ while (j != e && VNIClasses[j] == 0)
+ ++j;
+ for (unsigned i = j; i != e; ++i) {
+ VNInfo *VNI = LR.getValNumInfo(i);
+ if (unsigned eq = VNIClasses[i]) {
+ VNI->id = SplitLRs[eq-1]->getNumValNums();
+ SplitLRs[eq-1]->valnos.push_back(VNI);
+ } else {
+ VNI->id = j;
+ LR.valnos[j++] = VNI;
+ }
+ }
+ LR.valnos.resize(j);
+}
+
+} // End llvm namespace
+
+#endif
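
A minimal usage sketch of DistributeRange (illustrative only; in-tree callers derive the 0/1..N classes from ConnectedVNInfoEqClasses or a similar analysis, whereas the plain vector below is hand-built):

  // Move one value number of LI, and the segments it covers, into a
  // freshly created interval; everything mapped to class 0 stays in LI.
  void splitOneValNo(LiveInterval &LI, LiveInterval &NewLI, unsigned ValNoId) {
    SmallVector<unsigned, 8> VNIClasses(LI.getNumValNums(), 0);
    VNIClasses[ValNoId] = 1;                 // class 1 -> SplitLRs[0]
    LiveInterval *SplitLRs[] = {&NewLI};
    DistributeRange(LI, SplitLRs, VNIClasses);
  }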
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index 5c9c679e97ba..dbf1f96102d1 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -14,14 +14,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveStackAnalysis.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <limits>
using namespace llvm;
#define DEBUG_TYPE "livestacks"
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 06b86d82daf1..dd87216f5e6b 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -64,7 +64,7 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
return nullptr;
}
-void LiveVariables::VarInfo::dump() const {
+LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
@@ -129,7 +129,7 @@ void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
}
void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
- MachineInstr *MI) {
+ MachineInstr &MI) {
assert(MRI->getVRegDef(reg) && "Register use before def!");
unsigned BBNum = MBB->getNumber();
@@ -140,7 +140,7 @@ void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
// Yes, this register is killed in this basic block already. Increase the
// live range by updating the kill instruction.
- VRInfo.Kills.back() = MI;
+ VRInfo.Kills.back() = &MI;
return;
}
@@ -171,7 +171,7 @@ void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
// already marked as alive in this basic block, that means it is alive in at
// least one of the successor blocks, it's not a kill.
if (!VRInfo.AliveBlocks.test(BBNum))
- VRInfo.Kills.push_back(MI);
+ VRInfo.Kills.push_back(&MI);
// Update all dominating blocks to mark them as "known live".
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
@@ -179,12 +179,12 @@ void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(reg)->getParent(), *PI);
}
-void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr *MI) {
+void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr &MI) {
VarInfo &VRInfo = getVarInfo(Reg);
if (VRInfo.AliveBlocks.empty())
// If vr is not alive in any block, then defaults to dead.
- VRInfo.Kills.push_back(MI);
+ VRInfo.Kills.push_back(&MI);
}
/// FindLastPartialDef - Return the last partial def of the specified register.
@@ -228,7 +228,7 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
/// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
/// implicit defs to a machine instruction if there was an earlier def of its
/// super-register.
-void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
+void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr &MI) {
MachineInstr *LastDef = PhysRegDef[Reg];
// If there was a previous use or a "full" def all is well.
if (!LastDef && !PhysRegUse[Reg]) {
@@ -273,7 +273,7 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
// Remember this use.
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
- PhysRegUse[*SubRegs] = MI;
+ PhysRegUse[*SubRegs] = &MI;
}
/// FindLastRefOrPartRef - Return the last reference or partial reference of
@@ -483,7 +483,7 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
Defs.push_back(Reg); // Remember this def.
}
-void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
+void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI,
SmallVectorImpl<unsigned> &Defs) {
while (!Defs.empty()) {
unsigned Reg = Defs.back();
@@ -491,21 +491,21 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs) {
unsigned SubReg = *SubRegs;
- PhysRegDef[SubReg] = MI;
+ PhysRegDef[SubReg] = &MI;
PhysRegUse[SubReg] = nullptr;
}
}
}
-void LiveVariables::runOnInstr(MachineInstr *MI,
+void LiveVariables::runOnInstr(MachineInstr &MI,
SmallVectorImpl<unsigned> &Defs) {
- assert(!MI->isDebugValue());
+ assert(!MI.isDebugValue());
// Process all of the operands of the instruction...
- unsigned NumOperandsToProcess = MI->getNumOperands();
+ unsigned NumOperandsToProcess = MI.getNumOperands();
// Unless it is a PHI node. In this case, ONLY process the DEF, not any
// of the uses. They will be handled in other basic blocks.
- if (MI->isPHI())
+ if (MI.isPHI())
NumOperandsToProcess = 1;
// Clear kill and dead markers. LV will recompute them.
@@ -513,7 +513,7 @@ void LiveVariables::runOnInstr(MachineInstr *MI,
SmallVector<unsigned, 4> DefRegs;
SmallVector<unsigned, 1> RegMasks;
for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isRegMask()) {
RegMasks.push_back(i);
continue;
@@ -527,15 +527,18 @@ void LiveVariables::runOnInstr(MachineInstr *MI,
MO.setIsKill(false);
if (MO.readsReg())
UseRegs.push_back(MOReg);
- } else /*MO.isDef()*/ {
- if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) &&
- MRI->isReserved(MOReg)))
+ } else {
+ assert(MO.isDef());
+ // FIXME: We should not remove any dead flags. However, the MIPS RDDSP
+ // instruction needs it at the moment: http://llvm.org/PR27116.
+ if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ !MRI->isReserved(MOReg))
MO.setIsDead(false);
DefRegs.push_back(MOReg);
}
}
- MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock *MBB = MI.getParent();
// Process all uses.
for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
unsigned MOReg = UseRegs[i];
@@ -547,7 +550,7 @@ void LiveVariables::runOnInstr(MachineInstr *MI,
// Process all masked registers. (Call clobbers).
for (unsigned i = 0, e = RegMasks.size(); i != e; ++i)
- HandleRegMask(MI->getOperand(RegMasks[i]));
+ HandleRegMask(MI.getOperand(RegMasks[i]));
// Process all defs.
for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
@@ -555,7 +558,7 @@ void LiveVariables::runOnInstr(MachineInstr *MI,
if (TargetRegisterInfo::isVirtualRegister(MOReg))
HandleVirtRegDef(MOReg, MI);
else if (!MRI->isReserved(MOReg))
- HandlePhysRegDef(MOReg, MI, Defs);
+ HandlePhysRegDef(MOReg, &MI, Defs);
}
UpdatePhysRegDefs(MI, Defs);
}
@@ -572,12 +575,10 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
// Loop over all of the instructions, processing them.
DistanceMap.clear();
unsigned Dist = 0;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- MachineInstr *MI = I;
- if (MI->isDebugValue())
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugValue())
continue;
- DistanceMap.insert(std::make_pair(MI, Dist++));
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
runOnInstr(MI, Defs);
}
@@ -679,17 +680,17 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
/// replaceKillInstruction - Update register kill info by replacing a kill
/// instruction with a new one.
-void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr *OldMI,
- MachineInstr *NewMI) {
+void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr &OldMI,
+ MachineInstr &NewMI) {
VarInfo &VI = getVarInfo(Reg);
- std::replace(VI.Kills.begin(), VI.Kills.end(), OldMI, NewMI);
+ std::replace(VI.Kills.begin(), VI.Kills.end(), &OldMI, &NewMI);
}
/// removeVirtualRegistersKilled - Remove all killed info for the specified
/// instruction.
-void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isKill()) {
MO.setIsKill(false);
unsigned Reg = MO.getReg();
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index eb60005764c0..af7392f4435b 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -256,12 +256,12 @@ lookupCandidateBaseReg(unsigned BaseReg,
int64_t BaseOffset,
int64_t FrameSizeAdjust,
int64_t LocalFrameOffset,
- const MachineInstr *MI,
+ const MachineInstr &MI,
const TargetRegisterInfo *TRI) {
// Check if the relative offset from the address the base register references
// to the target address is in range for the instruction.
int64_t Offset = FrameSizeAdjust + LocalFrameOffset - BaseOffset;
- return TRI->isFrameOffsetLegal(MI, BaseReg, Offset);
+ return TRI->isFrameOffsetLegal(&MI, BaseReg, Offset);
}
bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
@@ -285,16 +285,13 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// choose the first one).
SmallVector<FrameRef, 64> FrameReferenceInsns;
- for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
- MachineInstr *MI = I;
-
+ for (MachineBasicBlock &BB : Fn) {
+ for (MachineInstr &MI : BB) {
// Debug value, stackmap and patchpoint instructions can't be out of
// range, so they don't need any updates.
- if (MI->isDebugValue() ||
- MI->getOpcode() == TargetOpcode::STATEPOINT ||
- MI->getOpcode() == TargetOpcode::STACKMAP ||
- MI->getOpcode() == TargetOpcode::PATCHPOINT)
+ if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
+ MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT)
continue;
// For now, allocate the base register(s) within the basic block
@@ -303,19 +300,18 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// than that, but the increased register pressure makes that a
// tricky thing to balance. Investigate if re-materializing these
// becomes an issue.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
// Consider replacing all frame index operands that reference
// an object allocated in the local block.
- if (MI->getOperand(i).isFI()) {
+ if (MI.getOperand(i).isFI()) {
// Don't try this with values not in the local block.
- if (!MFI->isObjectPreAllocated(MI->getOperand(i).getIndex()))
+ if (!MFI->isObjectPreAllocated(MI.getOperand(i).getIndex()))
break;
- int Idx = MI->getOperand(i).getIndex();
+ int Idx = MI.getOperand(i).getIndex();
int64_t LocalOffset = LocalOffsets[Idx];
- if (!TRI->needsFrameBaseReg(MI, LocalOffset))
+ if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
break;
- FrameReferenceInsns.
- push_back(FrameRef(MI, LocalOffset, Idx));
+ FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx));
break;
}
}
@@ -333,46 +329,44 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Loop through the frame references and allocate for them as necessary.
for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
FrameRef &FR = FrameReferenceInsns[ref];
- MachineBasicBlock::iterator I = FR.getMachineInstr();
- MachineInstr *MI = I;
+ MachineInstr &MI = *FR.getMachineInstr();
int64_t LocalOffset = FR.getLocalOffset();
int FrameIdx = FR.getFrameIndex();
assert(MFI->isObjectPreAllocated(FrameIdx) &&
"Only pre-allocated locals expected!");
- DEBUG(dbgs() << "Considering: " << *MI);
+ DEBUG(dbgs() << "Considering: " << MI);
unsigned idx = 0;
- for (unsigned f = MI->getNumOperands(); idx != f; ++idx) {
- if (!MI->getOperand(idx).isFI())
+ for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
+ if (!MI.getOperand(idx).isFI())
continue;
- if (FrameIdx == I->getOperand(idx).getIndex())
+ if (FrameIdx == MI.getOperand(idx).getIndex())
break;
}
- assert(idx < MI->getNumOperands() && "Cannot find FI operand");
+ assert(idx < MI.getNumOperands() && "Cannot find FI operand");
int64_t Offset = 0;
int64_t FrameSizeAdjust = StackGrowsDown ? MFI->getLocalFrameSize() : 0;
- DEBUG(dbgs() << " Replacing FI in: " << *MI);
+ DEBUG(dbgs() << " Replacing FI in: " << MI);
// If we have a suitable base register available, use it; otherwise
// create a new one. Note that any offset encoded in the
// instruction itself will be taken into account by the target,
// so we don't have to adjust for it here when reusing a base
// register.
- if (UsedBaseReg && lookupCandidateBaseReg(BaseReg, BaseOffset,
- FrameSizeAdjust, LocalOffset, MI,
- TRI)) {
+ if (UsedBaseReg &&
+ lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
+ LocalOffset, MI, TRI)) {
DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
// We found a register to reuse.
Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
} else {
- // No previously defined register was in range, so create a // new one.
-
- int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx);
+ // No previously defined register was in range, so create a new one.
+ int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, idx);
int64_t PrevBaseOffset = BaseOffset;
BaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
@@ -386,12 +380,12 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
!lookupCandidateBaseReg(
BaseReg, BaseOffset, FrameSizeAdjust,
FrameReferenceInsns[ref + 1].getLocalOffset(),
- FrameReferenceInsns[ref + 1].getMachineInstr(), TRI)) {
+ *FrameReferenceInsns[ref + 1].getMachineInstr(), TRI)) {
BaseOffset = PrevBaseOffset;
continue;
}
- const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineFunction *MF = MI.getParent()->getParent();
const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
@@ -416,8 +410,8 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Modify the instruction to use the new base register rather
// than the frame index operand.
- TRI->resolveFrameIndex(*I, BaseReg, Offset);
- DEBUG(dbgs() << "Resolved: " << *MI);
+ TRI->resolveFrameIndex(MI, BaseReg, Offset);
+ DEBUG(dbgs() << "Resolved: " << MI);
++NumReplacements;
}
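
A small worked example of the offset computation used when a base register is reused (the numbers are illustrative, not taken from the patch):

  // FrameSizeAdjust == 64   (stack grows down, local frame size is 64)
  // LocalOffset     == -16  (object's offset inside the local block)
  // BaseOffset      == 32   (offset the reused base register resolves to)
  // Offset passed to resolveFrameIndex: 64 + (-16) - 32 == 16
  int64_t Offset = FrameSizeAdjust + LocalOffset - BaseOffset;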
diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp
new file mode 100644
index 000000000000..6966c8ca4a5f
--- /dev/null
+++ b/lib/CodeGen/LowerEmuTLS.cpp
@@ -0,0 +1,162 @@
+//===- LowerEmuTLS.cpp - Add __emutls_[vt].* variables --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is required for targets depending on libgcc-style
+// emulated thread-local storage variables. For every defined TLS variable xyz,
+// an __emutls_v.xyz is generated. If there is a non-zero initial value,
+// an __emutls_t.xyz is also generated.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loweremutls"
+
+namespace {
+
+class LowerEmuTLS : public ModulePass {
+ const TargetMachine *TM;
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { }
+ explicit LowerEmuTLS(const TargetMachine *TM)
+ : ModulePass(ID), TM(TM) {
+ initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override;
+private:
+ bool addEmuTlsVar(Module &M, const GlobalVariable *GV);
+ static void copyLinkageVisibility(Module &M,
+ const GlobalVariable *from,
+ GlobalVariable *to) {
+ to->setLinkage(from->getLinkage());
+ to->setVisibility(from->getVisibility());
+ if (from->hasComdat()) {
+ to->setComdat(M.getOrInsertComdat(to->getName()));
+ to->getComdat()->setSelectionKind(from->getComdat()->getSelectionKind());
+ }
+ }
+};
+}
+
+char LowerEmuTLS::ID = 0;
+
+INITIALIZE_PASS(LowerEmuTLS, "loweremutls",
+ "Add __emutls_[vt]. variables for emulated TLS model",
+ false, false)
+
+ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) {
+ return new LowerEmuTLS(TM);
+}
+
+bool LowerEmuTLS::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ if (!TM || !TM->Options.EmulatedTLS)
+ return false;
+
+ bool Changed = false;
+ SmallVector<const GlobalVariable*, 8> TlsVars;
+ for (const auto &G : M.globals()) {
+ if (G.isThreadLocal())
+ TlsVars.append({&G});
+ }
+ for (const auto G : TlsVars)
+ Changed |= addEmuTlsVar(M, G);
+ return Changed;
+}
+
+bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
+ LLVMContext &C = M.getContext();
+ PointerType *VoidPtrType = Type::getInt8PtrTy(C);
+
+ std::string EmuTlsVarName = ("__emutls_v." + GV->getName()).str();
+ GlobalVariable *EmuTlsVar = M.getNamedGlobal(EmuTlsVarName);
+ if (EmuTlsVar)
+ return false; // It has been added before.
+
+ const DataLayout &DL = M.getDataLayout();
+ Constant *NullPtr = ConstantPointerNull::get(VoidPtrType);
+
+ // Get non-zero initializer from GV's initializer.
+ const Constant *InitValue = nullptr;
+ if (GV->hasInitializer()) {
+ InitValue = GV->getInitializer();
+ const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue);
+ // When GV's init value is all 0, omit the EmuTlsTmplVar and let
+ // the emutls library function reset newly allocated TLS variables.
+ if (isa<ConstantAggregateZero>(InitValue) ||
+ (InitIntValue && InitIntValue->isZero()))
+ InitValue = nullptr;
+ }
+
+ // Create the __emutls_v. symbol, whose type has 4 fields:
+ // word size; // size of GV in bytes
+ // word align; // alignment of GV
+ // void *ptr; // initialized to 0; set at run time per thread.
+ // void *templ; // 0 or point to __emutls_t.*
+ // sizeof(word) should be the same as sizeof(void*) on the target.
+ IntegerType *WordType = DL.getIntPtrType(C);
+ PointerType *InitPtrType = InitValue ?
+ PointerType::getUnqual(InitValue->getType()) : VoidPtrType;
+ Type *ElementTypes[4] = {WordType, WordType, VoidPtrType, InitPtrType};
+ ArrayRef<Type*> ElementTypeArray(ElementTypes, 4);
+ StructType *EmuTlsVarType = StructType::create(ElementTypeArray);
+ EmuTlsVar = cast<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsVarName, EmuTlsVarType));
+ copyLinkageVisibility(M, GV, EmuTlsVar);
+
+ // Define "__emutls_t.*" and "__emutls_v.*" only if GV is defined.
+ if (!GV->hasInitializer())
+ return true;
+
+ Type *GVType = GV->getValueType();
+ unsigned GVAlignment = GV->getAlignment();
+ if (!GVAlignment) {
+ // When LLVM IR declares a variable without alignment, use
+ // the ABI default alignment for the type.
+ GVAlignment = DL.getABITypeAlignment(GVType);
+ }
+
+ // Define "__emutls_t.*" if there is InitValue
+ GlobalVariable *EmuTlsTmplVar = nullptr;
+ if (InitValue) {
+ std::string EmuTlsTmplName = ("__emutls_t." + GV->getName()).str();
+ EmuTlsTmplVar = dyn_cast_or_null<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsTmplName, GVType));
+ assert(EmuTlsTmplVar && "Failed to create emulated TLS initializer");
+ EmuTlsTmplVar->setConstant(true);
+ EmuTlsTmplVar->setInitializer(const_cast<Constant*>(InitValue));
+ EmuTlsTmplVar->setAlignment(GVAlignment);
+ copyLinkageVisibility(M, GV, EmuTlsTmplVar);
+ }
+
+ // Define "__emutls_v.*" with initializer and alignment.
+ Constant *ElementValues[4] = {
+ ConstantInt::get(WordType, DL.getTypeStoreSize(GVType)),
+ ConstantInt::get(WordType, GVAlignment),
+ NullPtr, EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr
+ };
+ ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
+ EmuTlsVar->setInitializer(
+ ConstantStruct::get(EmuTlsVarType, ElementValueArray));
+ unsigned MaxAlignment = std::max(
+ DL.getABITypeAlignment(WordType),
+ DL.getABITypeAlignment(VoidPtrType));
+ EmuTlsVar->setAlignment(MaxAlignment);
+ return true;
+}
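
The control variable created above has the following conceptual layout (a sketch mirroring the four fields documented in the code; the struct and field names are illustrative and are not emitted by the pass):

  struct __emutls_control {   // shape of each __emutls_v.xyz
    size_t size;    // size of the TLS variable in bytes
    size_t align;   // alignment of the TLS variable
    void  *ptr;     // null initially; set per thread at run time
    void  *templ;   // null, or a pointer to __emutls_t.xyz (the initializer)
  };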
diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp
index 28f9d4e298f9..6e3de52f1a9c 100644
--- a/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/lib/CodeGen/MIRParser/MILexer.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MILexer.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -21,6 +22,9 @@ using namespace llvm;
namespace {
+typedef function_ref<void(StringRef::iterator Loc, const Twine &)>
+ ErrorCallbackType;
+
/// This class provides a way to iterate and get characters from the source
/// string.
class Cursor {
@@ -133,9 +137,7 @@ static std::string unescapeQuotedString(StringRef Value) {
}
/// Lex a string constant using the following regular expression: \"[^\"]*\"
-static Cursor lexStringConstant(
- Cursor C,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor lexStringConstant(Cursor C, ErrorCallbackType ErrorCallback) {
assert(C.peek() == '"');
for (C.advance(); C.peek() != '"'; C.advance()) {
if (C.isEOF() || isNewlineChar(C.peek())) {
@@ -149,9 +151,8 @@ static Cursor lexStringConstant(
return C;
}
-static Cursor lexName(
- Cursor C, MIToken &Token, MIToken::TokenKind Type, unsigned PrefixLength,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor lexName(Cursor C, MIToken &Token, MIToken::TokenKind Type,
+ unsigned PrefixLength, ErrorCallbackType ErrorCallback) {
auto Range = C;
C.advance(PrefixLength);
if (C.peek() == '"') {
@@ -241,9 +242,8 @@ static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) {
return C;
}
-static Cursor maybeLexMachineBasicBlock(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexMachineBasicBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
bool IsReference = C.remaining().startswith("%bb.");
if (!IsReference && !C.remaining().startswith("bb."))
return None;
@@ -326,9 +326,17 @@ static Cursor maybeLexConstantPoolItem(Cursor C, MIToken &Token) {
return maybeLexIndex(C, Token, "%const.", MIToken::ConstantPoolItem);
}
-static Cursor maybeLexIRBlock(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexSubRegisterIndex(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%subreg.";
+ if (!C.remaining().startswith(Rule))
+ return None;
+ return lexName(C, Token, MIToken::SubRegisterIndex, Rule.size(),
+ ErrorCallback);
+}
+
+static Cursor maybeLexIRBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
const StringRef Rule = "%ir-block.";
if (!C.remaining().startswith(Rule))
return None;
@@ -337,9 +345,8 @@ static Cursor maybeLexIRBlock(
return lexName(C, Token, MIToken::NamedIRBlock, Rule.size(), ErrorCallback);
}
-static Cursor maybeLexIRValue(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
const StringRef Rule = "%ir.";
if (!C.remaining().startswith(Rule))
return None;
@@ -373,9 +380,8 @@ static Cursor maybeLexRegister(Cursor C, MIToken &Token) {
return C;
}
-static Cursor maybeLexGlobalValue(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
if (C.peek() != '@')
return None;
if (!isdigit(C.peek(1)))
@@ -391,9 +397,8 @@ static Cursor maybeLexGlobalValue(
return C;
}
-static Cursor maybeLexExternalSymbol(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexExternalSymbol(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
if (C.peek() != '$')
return None;
return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1,
@@ -456,9 +461,8 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
.Default(MIToken::Error);
}
-static Cursor maybeLexExlaim(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexExlaim(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
if (C.peek() != '!')
return None;
auto Range = C;
@@ -497,6 +501,10 @@ static MIToken::TokenKind symbolToken(char C) {
return MIToken::plus;
case '-':
return MIToken::minus;
+ case '<':
+ return MIToken::less;
+ case '>':
+ return MIToken::greater;
default:
return MIToken::Error;
}
@@ -527,9 +535,8 @@ static Cursor maybeLexNewline(Cursor C, MIToken &Token) {
return C;
}
-static Cursor maybeLexEscapedIRValue(
- Cursor C, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+static Cursor maybeLexEscapedIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
if (C.peek() != '`')
return None;
auto Range = C;
@@ -551,9 +558,8 @@ static Cursor maybeLexEscapedIRValue(
return C;
}
-StringRef llvm::lexMIToken(
- StringRef Source, MIToken &Token,
- function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) {
+StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
auto C = skipComment(skipWhitespace(Cursor(Source)));
if (C.isEOF()) {
Token.reset(MIToken::Eof, C.remaining());
@@ -574,6 +580,8 @@ StringRef llvm::lexMIToken(
return R.remaining();
if (Cursor R = maybeLexConstantPoolItem(C, Token))
return R.remaining();
+ if (Cursor R = maybeLexSubRegisterIndex(C, Token, ErrorCallback))
+ return R.remaining();
if (Cursor R = maybeLexIRBlock(C, Token, ErrorCallback))
return R.remaining();
if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback))
diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h
index ff54aa3554d8..32fc8ab271e6 100644
--- a/lib/CodeGen/MIRParser/MILexer.h
+++ b/lib/CodeGen/MIRParser/MILexer.h
@@ -45,6 +45,8 @@ struct MIToken {
rbrace,
plus,
minus,
+ less,
+ greater,
// Keywords
kw_implicit,
@@ -116,7 +118,8 @@ struct MIToken {
IRBlock,
NamedIRValue,
IRValue,
- QuotedIRValue // `<constant value>`
+ QuotedIRValue, // `<constant value>`
+ SubRegisterIndex
};
private:
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index f2f6584fb6c8..b3fd16f15889 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -17,24 +17,30 @@
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
+PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
+ SourceMgr &SM, const SlotMapping &IRSlots)
+ : MF(MF), SM(&SM), IRSlots(IRSlots) {
+}
+
namespace {
/// A wrapper struct around the 'MachineOperand' struct that includes a source
@@ -55,14 +61,11 @@ struct ParsedMachineOperand {
};
class MIParser {
- SourceMgr &SM;
MachineFunction &MF;
SMDiagnostic &Error;
StringRef Source, CurrentSource;
MIToken Token;
const PerFunctionMIParsingState &PFS;
- /// Maps from indices to unnamed global values and metadata nodes.
- const SlotMapping &IRSlots;
/// Maps from instruction names to op codes.
StringMap<unsigned> Names2InstrOpCodes;
/// Maps from register names to registers.
@@ -83,11 +86,12 @@ class MIParser {
StringMap<unsigned> Names2BitmaskTargetFlags;
public:
- MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
- StringRef Source, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots);
+ MIParser(const PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source);
- void lex();
+ /// \p SkipChar gives the number of characters to skip before looking
+ /// for the next token.
+ void lex(unsigned SkipChar = 0);
/// Report an error at the current location with the given message.
///
@@ -119,12 +123,17 @@ public:
bool parseRegisterFlag(unsigned &Flags);
bool parseSubRegisterIndex(unsigned &SubReg);
bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
+ bool parseSize(unsigned &Size);
bool parseRegisterOperand(MachineOperand &Dest,
Optional<unsigned> &TiedDefIdx, bool IsDef = false);
bool parseImmediateOperand(MachineOperand &Dest);
bool parseIRConstant(StringRef::iterator Loc, StringRef Source,
const Constant *&C);
bool parseIRConstant(StringRef::iterator Loc, const Constant *&C);
+ bool parseIRType(StringRef::iterator Loc, StringRef Source, unsigned &Read,
+ Type *&Ty);
+ // \p MustBeSized defines whether or not \p Ty must be sized.
+ bool parseIRType(StringRef::iterator Loc, Type *&Ty, bool MustBeSized = true);
bool parseTypedImmediateOperand(MachineOperand &Dest);
bool parseFPImmediateOperand(MachineOperand &Dest);
bool parseMBBReference(MachineBasicBlock *&MBB);
@@ -136,6 +145,7 @@ public:
bool parseGlobalValue(GlobalValue *&GV);
bool parseGlobalAddressOperand(MachineOperand &Dest);
bool parseConstantPoolIndexOperand(MachineOperand &Dest);
+ bool parseSubRegisterIndexOperand(MachineOperand &Dest);
bool parseJumpTableIndexOperand(MachineOperand &Dest);
bool parseExternalSymbolOperand(MachineOperand &Dest);
bool parseMDNode(MDNode *&Node);
@@ -155,7 +165,7 @@ public:
bool parseAlignment(unsigned &Alignment);
bool parseOperandsOffset(MachineOperand &Op);
bool parseIRValue(const Value *&V);
- bool parseMemoryOperandFlag(unsigned &Flags);
+ bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
bool parseMachinePointerInfo(MachinePointerInfo &Dest);
bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
@@ -244,21 +254,21 @@ private:
} // end anonymous namespace
-MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
- StringRef Source, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots)
- : SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source),
- PFS(PFS), IRSlots(IRSlots) {}
+MIParser::MIParser(const PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source)
+ : MF(PFS.MF), Error(Error), Source(Source), CurrentSource(Source), PFS(PFS)
+{}
-void MIParser::lex() {
+void MIParser::lex(unsigned SkipChar) {
CurrentSource = lexMIToken(
- CurrentSource, Token,
+ CurrentSource.data() + SkipChar, Token,
[this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); });
}
bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); }
bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) {
+ const SourceMgr &SM = *PFS.SM;
assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
const MemoryBuffer &Buffer = *SM.getMemoryBuffer(SM.getMainFileID());
if (Loc >= Buffer.getBufferStart() && Loc <= Buffer.getBufferEnd()) {
@@ -587,6 +597,14 @@ bool MIParser::parse(MachineInstr *&MI) {
if (Token.isError() || parseInstruction(OpCode, Flags))
return true;
+ Type *Ty = nullptr;
+ if (isPreISelGenericOpcode(OpCode)) {
+ // For generic opcode, a type is mandatory.
+ auto Loc = Token.location();
+ if (parseIRType(Loc, Ty))
+ return true;
+ }
+
// Parse the remaining machine operands.
while (!Token.isNewlineOrEOF() && Token.isNot(MIToken::kw_debug_location) &&
Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) {
@@ -642,6 +660,8 @@ bool MIParser::parse(MachineInstr *&MI) {
// TODO: Check for extraneous machine operands.
MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true);
MI->setFlags(Flags);
+ if (Ty)
+ MI->setType(Ty);
for (const auto &Operand : Operands)
MI->addOperand(MF, Operand.Operand);
if (assignRegisterTies(*MI, Operands))
@@ -876,6 +896,17 @@ bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) {
return false;
}
+bool MIParser::parseSize(unsigned &Size) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal for the size");
+ if (getUnsigned(Size))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ return false;
+}
+
bool MIParser::assignRegisterTies(MachineInstr &MI,
ArrayRef<ParsedMachineOperand> Operands) {
SmallVector<std::pair<unsigned, unsigned>, 4> TiedRegisterPairs;
@@ -931,12 +962,31 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (Token.is(MIToken::colon)) {
if (parseSubRegisterIndex(SubReg))
return true;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("subregister index expects a virtual register");
}
- if ((Flags & RegState::Define) == 0 && consumeIfPresent(MIToken::lparen)) {
- unsigned Idx;
- if (parseRegisterTiedDefIndex(Idx))
+ if ((Flags & RegState::Define) == 0) {
+ if (consumeIfPresent(MIToken::lparen)) {
+ unsigned Idx;
+ if (parseRegisterTiedDefIndex(Idx))
+ return true;
+ TiedDefIdx = Idx;
+ }
+ } else if (consumeIfPresent(MIToken::lparen)) {
+ // Virtual registers may have a size with GlobalISel.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("unexpected size on physical register");
+ unsigned Size;
+ if (parseSize(Size))
return true;
- TiedDefIdx = Idx;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.setSize(Reg, Size);
+ } else if (PFS.GenericVRegs.count(Reg)) {
+ // Generic virtual registers must have a size.
+ // If we end up here this means the size hasn't been specified and
+ // this is bad!
+ return error("generic virtual registers must have a size");
}
Dest = MachineOperand::CreateReg(
Reg, Flags & RegState::Define, Flags & RegState::Implicit,
@@ -961,7 +1011,7 @@ bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
auto Source = StringValue.str(); // The source has to be null terminated.
SMDiagnostic Err;
C = parseConstantValue(Source.c_str(), Err, *MF.getFunction()->getParent(),
- &IRSlots);
+ &PFS.IRSlots);
if (!C)
return error(Loc + Err.getColumnNo(), Err.getMessage());
return false;
@@ -974,6 +1024,38 @@ bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) {
return false;
}
+bool MIParser::parseIRType(StringRef::iterator Loc, StringRef StringValue,
+ unsigned &Read, Type *&Ty) {
+ auto Source = StringValue.str(); // The source has to be null terminated.
+ SMDiagnostic Err;
+ Ty = parseTypeAtBeginning(Source.c_str(), Read, Err,
+ *MF.getFunction()->getParent(), &PFS.IRSlots);
+ if (!Ty)
+ return error(Loc + Err.getColumnNo(), Err.getMessage());
+ return false;
+}
+
+bool MIParser::parseIRType(StringRef::iterator Loc, Type *&Ty,
+ bool MustBeSized) {
+ // At this point we enter the IR world, i.e., to get the correct type,
+ // we need to hand off the whole string, not just the current token.
+ // E.g., <4 x i64> would give '<' as a token and there is not much
+ // the IR parser can do with that.
+ unsigned Read = 0;
+ if (parseIRType(Loc, StringRef(Loc), Read, Ty))
+ return true;
+ // The type must be sized, otherwise there is not much the backend
+ // can do with it.
+ if (MustBeSized && !Ty->isSized())
+ return error("expected a sized type");
+ // The next token starts Read characters from Loc.
+ // However, the current location is not Loc, but Loc + the length of Token.
+ // Therefore, subtract the length of Token (range().end() - Loc) from Read
+ // to get the number of characters to skip before the next token.
+ lex(Read - (Token.range().end() - Loc));
+ return false;
+}
+
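
A worked example of the skip computation above (a sketch; the character counts assume the IR type parser stops right after the closing '>'):

  // CurrentSource at Loc:  "<4 x i64> ..."
  // Current token:         '<'           -> Token.range().end() - Loc == 1
  // parseIRType consumes:  "<4 x i64>"   -> Read == 9
  // Characters to skip:    9 - 1 == 8
  lex(/*SkipChar=*/8);  // the next token starts right after the type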
bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::IntegerType));
auto Loc = Token.location();
@@ -1100,10 +1182,10 @@ bool MIParser::parseGlobalValue(GlobalValue *&GV) {
unsigned GVIdx;
if (getUnsigned(GVIdx))
return true;
- if (GVIdx >= IRSlots.GlobalValues.size())
+ if (GVIdx >= PFS.IRSlots.GlobalValues.size())
return error(Twine("use of undefined global value '@") + Twine(GVIdx) +
"'");
- GV = IRSlots.GlobalValues[GVIdx];
+ GV = PFS.IRSlots.GlobalValues[GVIdx];
break;
}
default:
@@ -1161,6 +1243,17 @@ bool MIParser::parseExternalSymbolOperand(MachineOperand &Dest) {
return false;
}
+bool MIParser::parseSubRegisterIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::SubRegisterIndex));
+ StringRef Name = Token.stringValue();
+ unsigned SubRegIndex = getSubRegIndex(Token.stringValue());
+ if (SubRegIndex == 0)
+ return error(Twine("unknown subregister index '") + Name + "'");
+ lex();
+ Dest = MachineOperand::CreateImm(SubRegIndex);
+ return false;
+}
+
bool MIParser::parseMDNode(MDNode *&Node) {
assert(Token.is(MIToken::exclaim));
auto Loc = Token.location();
@@ -1170,8 +1263,8 @@ bool MIParser::parseMDNode(MDNode *&Node) {
unsigned ID;
if (getUnsigned(ID))
return true;
- auto NodeInfo = IRSlots.MetadataNodes.find(ID);
- if (NodeInfo == IRSlots.MetadataNodes.end())
+ auto NodeInfo = PFS.IRSlots.MetadataNodes.find(ID);
+ if (NodeInfo == PFS.IRSlots.MetadataNodes.end())
return error(Loc, "use of undefined metadata '!" + Twine(ID) + "'");
lex();
Node = NodeInfo->second.get();
@@ -1406,6 +1499,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
return parseJumpTableIndexOperand(Dest);
case MIToken::ExternalSymbol:
return parseExternalSymbolOperand(Dest);
+ case MIToken::SubRegisterIndex:
+ return parseSubRegisterIndexOperand(Dest);
case MIToken::exclaim:
return parseMetadataOperand(Dest);
case MIToken::kw_cfi_same_value:
@@ -1559,8 +1654,8 @@ bool MIParser::getUint64(uint64_t &Result) {
return false;
}
-bool MIParser::parseMemoryOperandFlag(unsigned &Flags) {
- const unsigned OldFlags = Flags;
+bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) {
+ const auto OldFlags = Flags;
switch (Token.kind()) {
case MIToken::kw_volatile:
Flags |= MachineMemOperand::MOVolatile;
@@ -1605,6 +1700,14 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
// The token was already consumed, so use return here instead of break.
return false;
}
+ case MIToken::StackObject: {
+ int FI;
+ if (parseStackFrameIndex(FI))
+ return true;
+ PSV = MF.getPSVManager().getFixedStack(FI);
+ // The token was already consumed, so use return here instead of break.
+ return false;
+ }
case MIToken::kw_call_entry: {
lex();
switch (Token.kind()) {
@@ -1636,7 +1739,8 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) ||
Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) ||
- Token.is(MIToken::FixedStackObject) || Token.is(MIToken::kw_call_entry)) {
+ Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) ||
+ Token.is(MIToken::kw_call_entry)) {
const PseudoSourceValue *PSV = nullptr;
if (parseMemoryPseudoSourceValue(PSV))
return true;
@@ -1667,7 +1771,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (expectAndConsume(MIToken::lparen))
return true;
- unsigned Flags = 0;
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
while (Token.isMemoryOperandFlag()) {
if (parseMemoryOperandFlag(Flags))
return true;
@@ -1688,14 +1792,16 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
return true;
lex();
- const char *Word = Flags & MachineMemOperand::MOLoad ? "from" : "into";
- if (Token.isNot(MIToken::Identifier) || Token.stringValue() != Word)
- return error(Twine("expected '") + Word + "'");
- lex();
-
MachinePointerInfo Ptr = MachinePointerInfo();
- if (parseMachinePointerInfo(Ptr))
- return true;
+ if (Token.is(MIToken::Identifier)) {
+ const char *Word = Flags & MachineMemOperand::MOLoad ? "from" : "into";
+ if (Token.stringValue() != Word)
+ return error(Twine("expected '") + Word + "'");
+ lex();
+
+ if (parseMachinePointerInfo(Ptr))
+ return true;
+ }
unsigned BaseAlignment = Size;
AAMDNodes AAInfo;
MDNode *Range = nullptr;
@@ -1947,65 +2053,42 @@ bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) {
return false;
}
-bool llvm::parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src,
- PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+bool llvm::parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
+ StringRef Src,
SMDiagnostic &Error) {
- SourceMgr SM;
- SM.AddNewSourceBuffer(
- MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false),
- SMLoc());
- return MIParser(SM, MF, Error, Src, PFS, IRSlots)
- .parseBasicBlockDefinitions(PFS.MBBSlots);
-}
-
-bool llvm::parseMachineInstructions(MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
- SMDiagnostic &Error) {
- SourceMgr SM;
- SM.AddNewSourceBuffer(
- MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false),
- SMLoc());
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseBasicBlocks();
-}
-
-bool llvm::parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMBB(MBB);
-}
-
-bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+ return MIParser(PFS, Error, Src).parseBasicBlockDefinitions(PFS.MBBSlots);
+}
+
+bool llvm::parseMachineInstructions(const PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseBasicBlocks();
+}
+
+bool llvm::parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMBB(MBB);
+}
+
+bool llvm::parseNamedRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots)
- .parseStandaloneNamedRegister(Reg);
+ return MIParser(PFS, Error, Src).parseStandaloneNamedRegister(Reg);
}
-bool llvm::parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+bool llvm::parseVirtualRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots)
- .parseStandaloneVirtualRegister(Reg);
+ return MIParser(PFS, Error, Src).parseStandaloneVirtualRegister(Reg);
}
-bool llvm::parseStackObjectReference(int &FI, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+bool llvm::parseStackObjectReference(const PerFunctionMIParsingState &PFS,
+ int &FI, StringRef Src,
SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots)
- .parseStandaloneStackObject(FI);
+ return MIParser(PFS, Error, Src).parseStandaloneStackObject(FI);
}
-bool llvm::parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF,
- StringRef Src, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMDNode(Node);
+bool llvm::parseMDNode(const PerFunctionMIParsingState &PFS,
+ MDNode *&Node, StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node);
}
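
A minimal caller-side sketch of the slimmed-down entry points (illustrative only; MF, SM and IRSlots are assumed to be set up as in MIRParserImpl, and reportError is a hypothetical error handler):

  PerFunctionMIParsingState PFS(MF, SM, IRSlots);
  SMDiagnostic Error;
  MachineBasicBlock *MBB = nullptr;
  if (parseMBBReference(PFS, MBB, Src, Error))
    reportError(Error);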
diff --git a/lib/CodeGen/MIRParser/MIParser.h b/lib/CodeGen/MIRParser/MIParser.h
index 8aef704ab36c..18895b9e54eb 100644
--- a/lib/CodeGen/MIRParser/MIParser.h
+++ b/lib/CodeGen/MIRParser/MIParser.h
@@ -15,26 +15,37 @@
#define LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallSet.h"
namespace llvm {
+class StringRef;
class BasicBlock;
class MachineBasicBlock;
-class MachineInstr;
class MachineFunction;
+class MachineInstr;
+class MachineRegisterInfo;
class MDNode;
struct SlotMapping;
class SMDiagnostic;
class SourceMgr;
struct PerFunctionMIParsingState {
+ MachineFunction &MF;
+ SourceMgr *SM;
+ const SlotMapping &IRSlots;
+
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, unsigned> VirtualRegisterSlots;
DenseMap<unsigned, int> FixedStackObjectSlots;
DenseMap<unsigned, int> StackObjectSlots;
DenseMap<unsigned, unsigned> ConstantPoolSlots;
DenseMap<unsigned, unsigned> JumpTableSlots;
+ /// Hold the generic virtual registers.
+ SmallSet<unsigned, 8> GenericVRegs;
+
+ PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM,
+ const SlotMapping &IRSlots);
};
/// Parse the machine basic block definitions, and skip the machine
@@ -49,10 +60,8 @@ struct PerFunctionMIParsingState {
/// resolve the machine basic block references.
///
/// Return true if an error occurred.
-bool parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src,
- PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
- SMDiagnostic &Error);
+bool parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error);
/// Parse the machine instructions.
///
@@ -64,35 +73,26 @@ bool parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src,
/// on the given source string.
///
/// Return true if an error occurred.
-bool parseMachineInstructions(MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
-
-bool parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
-
-bool parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+bool parseMachineInstructions(const PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error);
+
+bool parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB, StringRef Src,
+ SMDiagnostic &Error);
+
+bool parseNamedRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
SMDiagnostic &Error);
-bool parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM,
- MachineFunction &MF, StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots,
+bool parseVirtualRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
SMDiagnostic &Error);
-bool parseStackObjectReference(int &FI, SourceMgr &SM, MachineFunction &MF,
- StringRef Src,
- const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
+bool parseStackObjectReference(const PerFunctionMIParsingState &PFS,
+ int &FI, StringRef Src, SMDiagnostic &Error);
-bool parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF,
- StringRef Src, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
+bool parseMDNode(const PerFunctionMIParsingState &PFS, MDNode *&Node,
+ StringRef Src, SMDiagnostic &Error);
} // end namespace llvm
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 422efbc5ce57..4aa3df6326e9 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -15,27 +15,30 @@
#include "llvm/CodeGen/MIRParser/MIRParser.h"
#include "MIParser.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/YAMLTraits.h"
#include <memory>
@@ -53,6 +56,8 @@ class MIRParserImpl {
SlotMapping IRSlots;
/// Maps from register class names to register classes.
StringMap<const TargetRegisterClass *> Names2RegClasses;
+ /// Maps from register bank names to register banks.
+ StringMap<const RegisterBank *> Names2RegBanks;
public:
MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
@@ -97,44 +102,38 @@ public:
/// Return true if error occurred.
bool initializeMachineFunction(MachineFunction &MF);
- bool initializeRegisterInfo(MachineFunction &MF,
- const yaml::MachineFunction &YamlMF,
- PerFunctionMIParsingState &PFS);
+ bool initializeRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
- void inferRegisterInfo(MachineFunction &MF,
+ void inferRegisterInfo(const PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
- bool initializeFrameInfo(MachineFunction &MF,
- const yaml::MachineFunction &YamlMF,
- PerFunctionMIParsingState &PFS);
+ bool initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
- bool parseCalleeSavedRegister(MachineFunction &MF,
- PerFunctionMIParsingState &PFS,
+ bool parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
std::vector<CalleeSavedInfo> &CSIInfo,
const yaml::StringValue &RegisterSource,
int FrameIdx);
- bool parseStackObjectsDebugInfo(MachineFunction &MF,
- PerFunctionMIParsingState &PFS,
+ bool parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineStackObject &Object,
int FrameIdx);
- bool initializeConstantPool(MachineConstantPool &ConstantPool,
- const yaml::MachineFunction &YamlMF,
- const MachineFunction &MF,
- DenseMap<unsigned, unsigned> &ConstantPoolSlots);
+ bool initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool,
+ const yaml::MachineFunction &YamlMF);
- bool initializeJumpTableInfo(MachineFunction &MF,
- const yaml::MachineJumpTable &YamlJTI,
- PerFunctionMIParsingState &PFS);
+ bool initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI);
private:
- bool parseMDNode(MDNode *&Node, const yaml::StringValue &Source,
- MachineFunction &MF, const PerFunctionMIParsingState &PFS);
+ bool parseMDNode(const PerFunctionMIParsingState &PFS, MDNode *&Node,
+ const yaml::StringValue &Source);
- bool parseMBBReference(MachineBasicBlock *&MBB,
- const yaml::StringValue &Source, MachineFunction &MF,
- const PerFunctionMIParsingState &PFS);
+ bool parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source);
/// Return a MIR diagnostic converted from an MI string diagnostic.
SMDiagnostic diagFromMIStringDiag(const SMDiagnostic &Error,
@@ -149,12 +148,18 @@ private:
void createDummyFunction(StringRef Name, Module &M);
void initNames2RegClasses(const MachineFunction &MF);
+ void initNames2RegBanks(const MachineFunction &MF);
/// Check if the given identifier is a name of a register class.
///
/// Return null if the name isn't a register class.
const TargetRegisterClass *getRegClass(const MachineFunction &MF,
StringRef Name);
+
+ /// Check if the given identifier is a name of a register bank.
+ ///
+ /// Return null if the name isn't a register bank.
+ const RegisterBank *getRegBank(const MachineFunction &MF, StringRef Name);
};
} // end namespace llvm
@@ -226,7 +231,7 @@ std::unique_ptr<Module> MIRParserImpl::parse() {
Context, &IRSlots);
if (!M) {
reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange()));
- return M;
+ return nullptr;
}
In.nextDocument();
if (!In.setCurrentDocument())
@@ -285,46 +290,60 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
MF.setAlignment(YamlMF.Alignment);
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
MF.setHasInlineAsm(YamlMF.HasInlineAsm);
- PerFunctionMIParsingState PFS;
- if (initializeRegisterInfo(MF, YamlMF, PFS))
+ if (YamlMF.AllVRegsAllocated)
+ MF.getProperties().set(MachineFunctionProperties::Property::AllVRegsAllocated);
+ PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+ if (initializeRegisterInfo(PFS, YamlMF))
return true;
if (!YamlMF.Constants.empty()) {
auto *ConstantPool = MF.getConstantPool();
assert(ConstantPool && "Constant pool must be created");
- if (initializeConstantPool(*ConstantPool, YamlMF, MF,
- PFS.ConstantPoolSlots))
+ if (initializeConstantPool(PFS, *ConstantPool, YamlMF))
return true;
}
+ StringRef BlockStr = YamlMF.Body.Value.Value;
SMDiagnostic Error;
- if (parseMachineBasicBlockDefinitions(MF, YamlMF.Body.Value.Value, PFS,
- IRSlots, Error)) {
+ SourceMgr BlockSM;
+ BlockSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(BlockStr, "",/*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &BlockSM;
+ if (parseMachineBasicBlockDefinitions(PFS, BlockStr, Error)) {
reportDiagnostic(
diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
return true;
}
+ PFS.SM = &SM;
if (MF.empty())
return error(Twine("machine function '") + Twine(MF.getName()) +
"' requires at least one machine basic block in its body");
// Initialize the frame information after creating all the MBBs so that the
// MBB references in the frame information can be resolved.
- if (initializeFrameInfo(MF, YamlMF, PFS))
+ if (initializeFrameInfo(PFS, YamlMF))
return true;
// Initialize the jump table after creating all the MBBs so that the MBB
// references can be resolved.
if (!YamlMF.JumpTableInfo.Entries.empty() &&
- initializeJumpTableInfo(MF, YamlMF.JumpTableInfo, PFS))
+ initializeJumpTableInfo(PFS, YamlMF.JumpTableInfo))
return true;
// Parse the machine instructions after creating all of the MBBs so that the
// parser can resolve the MBB references.
- if (parseMachineInstructions(MF, YamlMF.Body.Value.Value, PFS, IRSlots,
- Error)) {
+ StringRef InsnStr = YamlMF.Body.Value.Value;
+ SourceMgr InsnSM;
+ InsnSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(InsnStr, "", /*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &InsnSM;
+ if (parseMachineInstructions(PFS, InsnStr, Error)) {
reportDiagnostic(
diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
return true;
}
- inferRegisterInfo(MF, YamlMF);
+ PFS.SM = &SM;
+
+ inferRegisterInfo(PFS, YamlMF);
// FIXME: This is a temporary workaround until the reserved registers can be
// serialized.
MF.getRegInfo().freezeReservedRegs(MF);
@@ -332,9 +351,9 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
return false;
}
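
The hunks in this file thread a single PerFunctionMIParsingState (constructed once as PFS(MF, SM, IRSlots)) through every parsing helper instead of passing MachineFunction, SourceMgr and the IR slot tables separately. Below is a minimal, self-contained sketch of that parameter-object shape; the types and names are illustrative stand-ins, not the LLVM classes.

#include <map>

struct Function {};       // stand-in for MachineFunction
struct SourceManager {};  // stand-in for SourceMgr
struct SlotMapping {};    // stand-in for the IR slot tables

struct ParseState {
  Function &F;
  SourceManager *SM;                       // swapped per body buffer, then restored
  const SlotMapping &Slots;
  std::map<unsigned, unsigned> VRegSlots;  // e.g. VirtualRegisterSlots
  ParseState(Function &Fn, SourceManager &SrcMgr, const SlotMapping &S)
      : F(Fn), SM(&SrcMgr), Slots(S) {}
};

// Before: bool initX(Function &F, const Yaml &Y, SourceManager &SM, ...);
// After:  every helper takes the state first and pulls what it needs from it.
bool initializeSomething(ParseState &PFS) {
  Function &F = PFS.F;  // same idiom as 'MachineFunction &MF = PFS.MF;'
  (void)F;
  return false;         // false == success, matching the parser's convention
}

int main() {
  Function F;
  SourceManager SM;
  SlotMapping Slots;
  ParseState PFS(F, SM, Slots);
  return initializeSomething(PFS) ? 1 : 0;
}

Bundling the context this way keeps the helper signatures stable when new per-function state (such as the GenericVRegs set introduced further down in this patch) is added.
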
-bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
- const yaml::MachineFunction &YamlMF,
- PerFunctionMIParsingState &PFS) {
+bool MIRParserImpl::initializeRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
MachineRegisterInfo &RegInfo = MF.getRegInfo();
assert(RegInfo.isSSA());
if (!YamlMF.IsSSA)
@@ -347,12 +366,28 @@ bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
SMDiagnostic Error;
// Parse the virtual register information.
for (const auto &VReg : YamlMF.VirtualRegisters) {
- const auto *RC = getRegClass(MF, VReg.Class.Value);
- if (!RC)
- return error(VReg.Class.SourceRange.Start,
- Twine("use of undefined register class '") +
- VReg.Class.Value + "'");
- unsigned Reg = RegInfo.createVirtualRegister(RC);
+ unsigned Reg;
+ if (StringRef(VReg.Class.Value).equals("_")) {
+ // This is a generic virtual register.
+ // The size will be set appropriately when we reach the definition.
+ Reg = RegInfo.createGenericVirtualRegister(/*Size*/ 1);
+ PFS.GenericVRegs.insert(Reg);
+ } else {
+ const auto *RC = getRegClass(MF, VReg.Class.Value);
+ if (RC) {
+ Reg = RegInfo.createVirtualRegister(RC);
+ } else {
+ const auto *RegBank = getRegBank(MF, VReg.Class.Value);
+ if (!RegBank)
+ return error(
+ VReg.Class.SourceRange.Start,
+ Twine("use of undefined register class or register bank '") +
+ VReg.Class.Value + "'");
+ Reg = RegInfo.createGenericVirtualRegister(/*Size*/ 1);
+ RegInfo.setRegBank(Reg, *RegBank);
+ PFS.GenericVRegs.insert(Reg);
+ }
+ }
if (!PFS.VirtualRegisterSlots.insert(std::make_pair(VReg.ID.Value, Reg))
.second)
return error(VReg.ID.SourceRange.Start,
@@ -360,9 +395,8 @@ bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
Twine(VReg.ID.Value) + "'");
if (!VReg.PreferredRegister.Value.empty()) {
unsigned PreferredReg = 0;
- if (parseNamedRegisterReference(PreferredReg, SM, MF,
- VReg.PreferredRegister.Value, PFS,
- IRSlots, Error))
+ if (parseNamedRegisterReference(PFS, PreferredReg,
+ VReg.PreferredRegister.Value, Error))
return error(Error, VReg.PreferredRegister.SourceRange);
RegInfo.setSimpleHint(Reg, PreferredReg);
}
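
A compact sketch of the lookup order the virtual-register hunk above introduces for the 'class' field: "_" denotes a generic register, otherwise a register class is tried, then a register bank, and only then is the "undefined register class or register bank" error reported. The types and sample names below are illustrative, not the LLVM API.

#include <iostream>
#include <set>
#include <string>

enum class VRegKind { Generic, Class, Bank, Invalid };

VRegKind classifyVRegConstraint(const std::string &Name,
                                const std::set<std::string> &RegClassNames,
                                const std::set<std::string> &RegBankNames) {
  if (Name == "_")
    return VRegKind::Generic;  // generic vreg; its size is set at the definition
  if (RegClassNames.count(Name))
    return VRegKind::Class;    // e.g. "gr32"
  if (RegBankNames.count(Name))
    return VRegKind::Bank;     // e.g. "gpr" (a GlobalISel register bank)
  return VRegKind::Invalid;    // -> undefined register class or register bank
}

int main() {
  std::set<std::string> Classes = {"gr32"};
  std::set<std::string> Banks = {"gpr"};
  std::cout << int(classifyVRegConstraint("gpr", Classes, Banks)) << "\n";  // 2 == Bank
}
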
@@ -371,13 +405,12 @@ bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
// Parse the liveins.
for (const auto &LiveIn : YamlMF.LiveIns) {
unsigned Reg = 0;
- if (parseNamedRegisterReference(Reg, SM, MF, LiveIn.Register.Value, PFS,
- IRSlots, Error))
+ if (parseNamedRegisterReference(PFS, Reg, LiveIn.Register.Value, Error))
return error(Error, LiveIn.Register.SourceRange);
unsigned VReg = 0;
if (!LiveIn.VirtualRegister.Value.empty()) {
- if (parseVirtualRegisterReference(
- VReg, SM, MF, LiveIn.VirtualRegister.Value, PFS, IRSlots, Error))
+ if (parseVirtualRegisterReference(PFS, VReg, LiveIn.VirtualRegister.Value,
+ Error))
return error(Error, LiveIn.VirtualRegister.SourceRange);
}
RegInfo.addLiveIn(Reg, VReg);
@@ -389,8 +422,7 @@ bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
return false;
for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
unsigned Reg = 0;
- if (parseNamedRegisterReference(Reg, SM, MF, RegSource.Value, PFS, IRSlots,
- Error))
+ if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
return error(Error, RegSource.SourceRange);
CalleeSavedRegisterMask[Reg] = true;
}
@@ -398,24 +430,25 @@ bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF,
return false;
}
-void MIRParserImpl::inferRegisterInfo(MachineFunction &MF,
+void MIRParserImpl::inferRegisterInfo(const PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF) {
if (YamlMF.CalleeSavedRegisters)
return;
- for (const MachineBasicBlock &MBB : MF) {
+ MachineRegisterInfo &MRI = PFS.MF.getRegInfo();
+ for (const MachineBasicBlock &MBB : PFS.MF) {
for (const MachineInstr &MI : MBB) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isRegMask())
continue;
- MF.getRegInfo().addPhysRegsUsedFromRegMask(MO.getRegMask());
+ MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
}
}
}
}
-bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF,
- const yaml::MachineFunction &YamlMF,
- PerFunctionMIParsingState &PFS) {
+bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
MachineFrameInfo &MFI = *MF.getFrameInfo();
const Function &F = *MF.getFunction();
const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo;
@@ -435,13 +468,13 @@ bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF,
MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
if (!YamlMFI.SavePoint.Value.empty()) {
MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(MBB, YamlMFI.SavePoint, MF, PFS))
+ if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
return true;
MFI.setSavePoint(MBB);
}
if (!YamlMFI.RestorePoint.Value.empty()) {
MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(MBB, YamlMFI.RestorePoint, MF, PFS))
+ if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint))
return true;
MFI.setRestorePoint(MBB);
}
@@ -462,7 +495,7 @@ bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF,
return error(Object.ID.SourceRange.Start,
Twine("redefinition of fixed stack object '%fixed-stack.") +
Twine(Object.ID.Value) + "'");
- if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister,
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
ObjectIdx))
return true;
}
@@ -493,12 +526,12 @@ bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF,
return error(Object.ID.SourceRange.Start,
Twine("redefinition of stack object '%stack.") +
Twine(Object.ID.Value) + "'");
- if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister,
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
ObjectIdx))
return true;
if (Object.LocalOffset)
MFI.mapLocalFrameObject(ObjectIdx, Object.LocalOffset.getValue());
- if (parseStackObjectsDebugInfo(MF, PFS, Object, ObjectIdx))
+ if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx))
return true;
}
MFI.setCalleeSavedInfo(CSIInfo);
@@ -510,24 +543,21 @@ bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF,
if (!YamlMFI.StackProtector.Value.empty()) {
SMDiagnostic Error;
int FI;
- if (parseStackObjectReference(FI, SM, MF, YamlMFI.StackProtector.Value, PFS,
- IRSlots, Error))
+ if (parseStackObjectReference(PFS, FI, YamlMFI.StackProtector.Value, Error))
return error(Error, YamlMFI.StackProtector.SourceRange);
MFI.setStackProtectorIndex(FI);
}
return false;
}
-bool MIRParserImpl::parseCalleeSavedRegister(
- MachineFunction &MF, PerFunctionMIParsingState &PFS,
+bool MIRParserImpl::parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
std::vector<CalleeSavedInfo> &CSIInfo,
const yaml::StringValue &RegisterSource, int FrameIdx) {
if (RegisterSource.Value.empty())
return false;
unsigned Reg = 0;
SMDiagnostic Error;
- if (parseNamedRegisterReference(Reg, SM, MF, RegisterSource.Value, PFS,
- IRSlots, Error))
+ if (parseNamedRegisterReference(PFS, Reg, RegisterSource.Value, Error))
return error(Error, RegisterSource.SourceRange);
CSIInfo.push_back(CalleeSavedInfo(Reg, FrameIdx));
return false;
@@ -548,16 +578,15 @@ static bool typecheckMDNode(T *&Result, MDNode *Node,
return false;
}
-bool MIRParserImpl::parseStackObjectsDebugInfo(
- MachineFunction &MF, PerFunctionMIParsingState &PFS,
+bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineStackObject &Object, int FrameIdx) {
// Debug information can only be attached to stack objects; Fixed stack
// objects aren't supported.
assert(FrameIdx >= 0 && "Expected a stack object frame index");
MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr;
- if (parseMDNode(Var, Object.DebugVar, MF, PFS) ||
- parseMDNode(Expr, Object.DebugExpr, MF, PFS) ||
- parseMDNode(Loc, Object.DebugLoc, MF, PFS))
+ if (parseMDNode(PFS, Var, Object.DebugVar) ||
+ parseMDNode(PFS, Expr, Object.DebugExpr) ||
+ parseMDNode(PFS, Loc, Object.DebugLoc))
return true;
if (!Var && !Expr && !Loc)
return false;
@@ -568,25 +597,24 @@ bool MIRParserImpl::parseStackObjectsDebugInfo(
typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) ||
typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this))
return true;
- MF.getMMI().setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc);
+ PFS.MF.getMMI().setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc);
return false;
}
-bool MIRParserImpl::parseMDNode(MDNode *&Node, const yaml::StringValue &Source,
- MachineFunction &MF,
- const PerFunctionMIParsingState &PFS) {
+bool MIRParserImpl::parseMDNode(const PerFunctionMIParsingState &PFS,
+ MDNode *&Node, const yaml::StringValue &Source) {
if (Source.Value.empty())
return false;
SMDiagnostic Error;
- if (llvm::parseMDNode(Node, SM, MF, Source.Value, PFS, IRSlots, Error))
+ if (llvm::parseMDNode(PFS, Node, Source.Value, Error))
return error(Error, Source.SourceRange);
return false;
}
-bool MIRParserImpl::initializeConstantPool(
- MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF,
- const MachineFunction &MF,
- DenseMap<unsigned, unsigned> &ConstantPoolSlots) {
+bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF) {
+ DenseMap<unsigned, unsigned> &ConstantPoolSlots = PFS.ConstantPoolSlots;
+ const MachineFunction &MF = PFS.MF;
const auto &M = *MF.getFunction()->getParent();
SMDiagnostic Error;
for (const auto &YamlConstant : YamlMF.Constants) {
@@ -608,15 +636,14 @@ bool MIRParserImpl::initializeConstantPool(
return false;
}
-bool MIRParserImpl::initializeJumpTableInfo(
- MachineFunction &MF, const yaml::MachineJumpTable &YamlJTI,
- PerFunctionMIParsingState &PFS) {
- MachineJumpTableInfo *JTI = MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
+bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI) {
+ MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
for (const auto &Entry : YamlJTI.Entries) {
std::vector<MachineBasicBlock *> Blocks;
for (const auto &MBBSource : Entry.Blocks) {
MachineBasicBlock *MBB = nullptr;
- if (parseMBBReference(MBB, MBBSource.Value, MF, PFS))
+ if (parseMBBReference(PFS, MBB, MBBSource.Value))
return true;
Blocks.push_back(MBB);
}
@@ -630,12 +657,11 @@ bool MIRParserImpl::initializeJumpTableInfo(
return false;
}
-bool MIRParserImpl::parseMBBReference(MachineBasicBlock *&MBB,
- const yaml::StringValue &Source,
- MachineFunction &MF,
- const PerFunctionMIParsingState &PFS) {
+bool MIRParserImpl::parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source) {
SMDiagnostic Error;
- if (llvm::parseMBBReference(MBB, SM, MF, Source.Value, PFS, IRSlots, Error))
+ if (llvm::parseMBBReference(PFS, MBB, Source.Value, Error))
return error(Error, Source.SourceRange);
return false;
}
@@ -698,6 +724,21 @@ void MIRParserImpl::initNames2RegClasses(const MachineFunction &MF) {
}
}
+void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) {
+ if (!Names2RegBanks.empty())
+ return;
+ const RegisterBankInfo *RBI = MF.getSubtarget().getRegBankInfo();
+ // If the target does not support GlobalISel, we may not have a
+ // register bank info.
+ if (!RBI)
+ return;
+ for (unsigned I = 0, E = RBI->getNumRegBanks(); I < E; ++I) {
+ const auto &RegBank = RBI->getRegBank(I);
+ Names2RegBanks.insert(
+ std::make_pair(StringRef(RegBank.getName()).lower(), &RegBank));
+ }
+}
+
const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
StringRef Name) {
initNames2RegClasses(MF);
@@ -707,6 +748,15 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
return RegClassInfo->getValue();
}
+const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF,
+ StringRef Name) {
+ initNames2RegBanks(MF);
+ auto RegBankInfo = Names2RegBanks.find(Name);
+ if (RegBankInfo == Names2RegBanks.end())
+ return nullptr;
+ return RegBankInfo->getValue();
+}
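
initNames2RegBanks and getRegBank above mirror the existing register-class name cache: a map from lowercased names to objects, built lazily on the first query, with nullptr signalling an unknown name. A self-contained sketch of that pattern follows; the types are illustrative, not RegisterBankInfo.

#include <algorithm>
#include <cctype>
#include <map>
#include <string>
#include <vector>

struct RegisterBankLike { std::string Name; };

class RegBankNameCache {
  std::map<std::string, const RegisterBankLike *> Names2Banks;

  void initialize(const std::vector<RegisterBankLike> &Banks) {
    if (!Names2Banks.empty())
      return;  // built once, on the first query
    for (const auto &B : Banks) {
      std::string Key = B.Name;
      std::transform(Key.begin(), Key.end(), Key.begin(),
                     [](unsigned char C) { return char(std::tolower(C)); });
      Names2Banks.emplace(Key, &B);
    }
  }

public:
  // Returns null for an unknown name, letting the caller fall back or emit
  // a diagnostic, exactly like getRegBank above.
  const RegisterBankLike *lookup(const std::string &Name,
                                 const std::vector<RegisterBankLike> &Banks) {
    initialize(Banks);
    auto It = Names2Banks.find(Name);
    return It == Names2Banks.end() ? nullptr : It->second;
  }
};

int main() {
  std::vector<RegisterBankLike> Banks = {{"GPR"}, {"FPR"}};
  RegBankNameCache Cache;
  return Cache.lookup("gpr", Banks) ? 0 : 1;  // finds "GPR" via the lowercased key
}
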
+
MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
: Impl(std::move(Impl)) {}
diff --git a/lib/CodeGen/MIRParser/Makefile b/lib/CodeGen/MIRParser/Makefile
deleted file mode 100644
index c02d18806a9c..000000000000
--- a/lib/CodeGen/MIRParser/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/CodeGen/MIRParser/Makefile ----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMIRParser
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 175cb0d51437..703c99d9edd3 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -14,23 +14,25 @@
#include "MIRPrinter.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -118,7 +120,8 @@ public:
void printOffset(int64_t Offset);
void printTargetFlags(const MachineOperand &Op);
void print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
- unsigned I, bool ShouldPrintRegisterTies, bool IsDef = false);
+ unsigned I, bool ShouldPrintRegisterTies,
+ const MachineRegisterInfo *MRI = nullptr, bool IsDef = false);
void print(const MachineMemOperand &Op);
void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI);
@@ -170,6 +173,9 @@ void MIRPrinter::print(const MachineFunction &MF) {
YamlMF.Alignment = MF.getAlignment();
YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
YamlMF.HasInlineAsm = MF.hasInlineAsm();
+ YamlMF.AllVRegsAllocated = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+
convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
ModuleSlotTracker MST(MF.getFunction()->getParent());
MST.incorporateFunction(*MF.getFunction());
@@ -206,8 +212,15 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
yaml::VirtualRegisterDefinition VReg;
VReg.ID = I;
- VReg.Class =
- StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
+ if (RegInfo.getRegClassOrNull(Reg))
+ VReg.Class =
+ StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
+ else if (RegInfo.getRegBankOrNull(Reg))
+ VReg.Class = StringRef(RegInfo.getRegBankOrNull(Reg)->getName()).lower();
+ else {
+ VReg.Class = std::string("_");
+ assert(RegInfo.getSize(Reg) && "Generic registers must have a size");
+ }
unsigned PreferredReg = RegInfo.getSimpleHint(Reg);
if (PreferredReg)
printReg(PreferredReg, VReg.PreferredRegister, TRI);
@@ -525,7 +538,9 @@ static bool hasComplexRegisterTies(const MachineInstr &MI) {
}
void MIPrinter::print(const MachineInstr &MI) {
- const auto &SubTarget = MI.getParent()->getParent()->getSubtarget();
+ const auto *MF = MI.getParent()->getParent();
+ const auto &MRI = MF->getRegInfo();
+ const auto &SubTarget = MF->getSubtarget();
const auto *TRI = SubTarget.getRegisterInfo();
assert(TRI && "Expected target register info");
const auto *TII = SubTarget.getInstrInfo();
@@ -540,7 +555,8 @@ void MIPrinter::print(const MachineInstr &MI) {
++I) {
if (I)
OS << ", ";
- print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies, /*IsDef=*/true);
+ print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies, &MRI,
+ /*IsDef=*/true);
}
if (I)
@@ -548,6 +564,11 @@ void MIPrinter::print(const MachineInstr &MI) {
if (MI.getFlag(MachineInstr::FrameSetup))
OS << "frame-setup ";
OS << TII->getName(MI.getOpcode());
+ if (isPreISelGenericOpcode(MI.getOpcode())) {
+ assert(MI.getType() && "Generic instructions must have a type");
+ OS << ' ';
+ MI.getType()->print(OS, /*IsForDebug*/ false, /*NoDetails*/ true);
+ }
if (I < E)
OS << ' ';
@@ -727,7 +748,8 @@ static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
}
void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
- unsigned I, bool ShouldPrintRegisterTies, bool IsDef) {
+ unsigned I, bool ShouldPrintRegisterTies,
+ const MachineRegisterInfo *MRI, bool IsDef) {
printTargetFlags(Op);
switch (Op.getType()) {
case MachineOperand::MO_Register:
@@ -754,6 +776,9 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
OS << ':' << TRI->getSubRegIndexName(Op.getSubReg());
if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef())
OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")";
+ assert((!IsDef || MRI) && "for IsDef, MRI must be provided");
+ if (IsDef && MRI->getSize(Op.getReg()))
+ OS << '(' << MRI->getSize(Op.getReg()) << ')';
break;
case MachineOperand::MO_Immediate:
OS << Op.getImm();
@@ -858,11 +883,12 @@ void MIPrinter::print(const MachineMemOperand &Op) {
assert(Op.isStore() && "Non load machine operand must be a store");
OS << "store ";
}
- OS << Op.getSize() << (Op.isLoad() ? " from " : " into ");
+ OS << Op.getSize();
if (const Value *Val = Op.getValue()) {
+ OS << (Op.isLoad() ? " from " : " into ");
printIRValueReference(*Val);
- } else {
- const PseudoSourceValue *PVal = Op.getPseudoValue();
+ } else if (const PseudoSourceValue *PVal = Op.getPseudoValue()) {
+ OS << (Op.isLoad() ? " from " : " into ");
assert(PVal && "Expected a pseudo source value");
switch (PVal->kind()) {
case PseudoSourceValue::Stack:
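
The MIRPrinter changes above add a size suffix to defs of generic virtual registers and print the IR type after pre-ISel generic opcodes. The snippet below only illustrates the operand formatting; the exact MIR syntax is approximated, not copied from a test.

#include <iostream>
#include <sstream>
#include <string>

// Mirrors the new check: if (IsDef && MRI->getSize(Op.getReg())) print "(N)".
std::string printRegOperand(unsigned VRegNum, bool IsDef, unsigned SizeInBits) {
  std::ostringstream OS;
  OS << '%' << VRegNum;
  if (IsDef && SizeInBits)
    OS << '(' << SizeInBits << ')';
  return OS.str();
}

int main() {
  // Prints roughly: %0(32) = G_ADD i32 %1, %2
  std::cout << printRegOperand(0, /*IsDef=*/true, 32) << " = G_ADD i32 "
            << printRegOperand(1, false, 32) << ", "
            << printRegOperand(2, false, 32) << "\n";
}
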
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 85d544d94984..689dd0764ce0 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -13,7 +13,6 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -199,16 +198,6 @@ MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() {
return end();
}
-const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const {
- // A block with a landing pad successor only has one other successor.
- if (succ_size() > 2)
- return nullptr;
- for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I)
- if ((*I)->isEHPad())
- return *I;
- return nullptr;
-}
-
bool MachineBasicBlock::hasEHPadSuccessor() const {
for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I)
if ((*I)->isEHPad())
@@ -217,7 +206,7 @@ bool MachineBasicBlock::hasEHPadSuccessor() const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineBasicBlock::dump() const {
+LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
print(dbgs());
}
#endif
@@ -241,7 +230,8 @@ std::string MachineBasicBlock::getFullName() const {
return Name;
}
-void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
+void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes)
+ const {
const MachineFunction *MF = getParent();
if (!MF) {
OS << "Can't print out MachineBasicBlock because parent MachineFunction"
@@ -255,7 +245,7 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
}
void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
- SlotIndexes *Indexes) const {
+ const SlotIndexes *Indexes) const {
const MachineFunction *MF = getParent();
if (!MF) {
OS << "Can't print out MachineBasicBlock because parent MachineFunction"
@@ -302,16 +292,16 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << '\n';
}
- for (const_instr_iterator I = instr_begin(); I != instr_end(); ++I) {
+ for (auto &I : instrs()) {
if (Indexes) {
- if (Indexes->hasIndex(&*I))
- OS << Indexes->getInstructionIndex(&*I);
+ if (Indexes->hasIndex(I))
+ OS << Indexes->getInstructionIndex(I);
OS << '\t';
}
OS << '\t';
- if (I->isInsideBundle())
+ if (I.isInsideBundle())
OS << " * ";
- I->print(OS, MST);
+ I.print(OS, MST);
}
// Print the successors of this block according to the CFG.
@@ -414,24 +404,25 @@ void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
void MachineBasicBlock::updateTerminator() {
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
// A block with no successors has no concerns with fall-through edges.
- if (this->succ_empty()) return;
+ if (this->succ_empty())
+ return;
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
DebugLoc DL; // FIXME: this is nowhere
- bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond);
+ bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
(void) B;
assert(!B && "UpdateTerminators requires analyzable predecessors!");
if (Cond.empty()) {
if (TBB) {
- // The block has an unconditional branch. If its successor is now
- // its layout successor, delete the branch.
+ // The block has an unconditional branch. If its successor is now its
+ // layout successor, delete the branch.
if (isLayoutSuccessor(TBB))
TII->RemoveBranch(*this);
} else {
- // The block has an unconditional fallthrough. If its successor is not
- // its layout successor, insert a branch. First we have to locate the
- // only non-landing-pad successor, as that is the fallthrough block.
+ // The block has an unconditional fallthrough. If its successor is not its
+ // layout successor, insert a branch. First we have to locate the only
+ // non-landing-pad successor, as that is the fallthrough block.
for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
if ((*SI)->isEHPad())
continue;
@@ -439,8 +430,8 @@ void MachineBasicBlock::updateTerminator() {
TBB = *SI;
}
- // If there is no non-landing-pad successor, the block has no
- // fall-through edges to be concerned with.
+ // If there is no non-landing-pad successor, the block has no fall-through
+ // edges to be concerned with.
if (!TBB)
return;
@@ -449,61 +440,73 @@ void MachineBasicBlock::updateTerminator() {
if (!isLayoutSuccessor(TBB))
TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
}
- } else {
- if (FBB) {
- // The block has a non-fallthrough conditional branch. If one of its
- // successors is its layout successor, rewrite it to a fallthrough
- // conditional branch.
- if (isLayoutSuccessor(TBB)) {
- if (TII->ReverseBranchCondition(Cond))
- return;
- TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FBB, nullptr, Cond, DL);
- } else if (isLayoutSuccessor(FBB)) {
- TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
- }
- } else {
- // Walk through the successors and find the successor which is not
- // a landing pad and is not the conditional branch destination (in TBB)
- // as the fallthrough successor.
- MachineBasicBlock *FallthroughBB = nullptr;
- for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isEHPad() || *SI == TBB)
- continue;
- assert(!FallthroughBB && "Found more than one fallthrough successor.");
- FallthroughBB = *SI;
- }
- if (!FallthroughBB && canFallThrough()) {
- // We fallthrough to the same basic block as the conditional jump
- // targets. Remove the conditional jump, leaving unconditional
- // fallthrough.
- // FIXME: This does not seem like a reasonable pattern to support, but
- // it has been seen in the wild coming out of degenerate ARM test cases.
- TII->RemoveBranch(*this);
+ return;
+ }
- // Finally update the unconditional successor to be reached via a branch
- // if it would not be reached by fallthrough.
- if (!isLayoutSuccessor(TBB))
- TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ if (FBB) {
+ // The block has a non-fallthrough conditional branch. If one of its
+ // successors is its layout successor, rewrite it to a fallthrough
+ // conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->ReverseBranchCondition(Cond))
return;
- }
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, FBB, nullptr, Cond, DL);
+ } else if (isLayoutSuccessor(FBB)) {
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ }
+ return;
+ }
- // The block has a fallthrough conditional branch.
- if (isLayoutSuccessor(TBB)) {
- if (TII->ReverseBranchCondition(Cond)) {
- // We can't reverse the condition, add an unconditional branch.
- Cond.clear();
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
- return;
- }
- TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
- } else if (!isLayoutSuccessor(FallthroughBB)) {
- TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, FallthroughBB, Cond, DL);
- }
+ // Walk through the successors and find the successor which is not a landing
+ // pad and is not the conditional branch destination (in TBB) as the
+ // fallthrough successor.
+ MachineBasicBlock *FallthroughBB = nullptr;
+ for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
+ if ((*SI)->isEHPad() || *SI == TBB)
+ continue;
+ assert(!FallthroughBB && "Found more than one fallthrough successor.");
+ FallthroughBB = *SI;
+ }
+
+ if (!FallthroughBB) {
+ if (canFallThrough()) {
+ // We fallthrough to the same basic block as the conditional jump targets.
+ // Remove the conditional jump, leaving unconditional fallthrough.
+ // FIXME: This does not seem like a reasonable pattern to support, but it
+ // has been seen in the wild coming out of degenerate ARM test cases.
+ TII->RemoveBranch(*this);
+
+ // Finally update the unconditional successor to be reached via a branch if
+ // it would not be reached by fallthrough.
+ if (!isLayoutSuccessor(TBB))
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ return;
+ }
+
+    // We enter here iff exactly one successor is TBB, which cannot fall
+    // through, and the remaining successors, if any, are EH pads. In this
+    // case, we need to change the conditional branch into an unconditional
+    // branch.
+ TII->RemoveBranch(*this);
+ Cond.clear();
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ return;
+ }
+
+ // The block has a fallthrough conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->ReverseBranchCondition(Cond)) {
+ // We can't reverse the condition, add an unconditional branch.
+ Cond.clear();
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
+ return;
}
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
+ } else if (!isLayoutSuccessor(FallthroughBB)) {
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, TBB, FallthroughBB, Cond, DL);
}
}
@@ -685,13 +688,13 @@ bool MachineBasicBlock::canFallThrough() {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
- if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) {
+ if (TII->analyzeBranch(*this, TBB, FBB, Cond)) {
// If we couldn't analyze the branch, examine the last instruction.
// If the block doesn't end in a known control barrier, assume fallthrough
// is possible. The isPredicated check is needed because this code can be
// called during IfConversion, where an instruction which is normally a
// Barrier is predicated and thus no longer an actual control barrier.
- return empty() || !back().isBarrier() || TII->isPredicated(&back());
+ return empty() || !back().isBarrier() || TII->isPredicated(back());
}
// If there is no branch, control always falls through.
@@ -712,39 +715,14 @@ bool MachineBasicBlock::canFallThrough() {
return FBB == nullptr;
}
-MachineBasicBlock *
-MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
- // Splitting the critical edge to a landing pad block is non-trivial. Don't do
- // it in this generic function.
- if (Succ->isEHPad())
+MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
+ Pass &P) {
+ if (!canSplitCriticalEdge(Succ))
return nullptr;
MachineFunction *MF = getParent();
DebugLoc DL; // FIXME: this is nowhere
- // Performance might be harmed on HW that implements branching using exec mask
- // where both sides of the branches are always executed.
- if (MF->getTarget().requiresStructuredCFG())
- return nullptr;
-
- // We may need to update this's terminator, but we can't do that if
- // AnalyzeBranch fails. If this uses a jump table, we won't touch it.
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- SmallVector<MachineOperand, 4> Cond;
- if (TII->AnalyzeBranch(*this, TBB, FBB, Cond))
- return nullptr;
-
- // Avoid bugpoint weirdness: A block may end with a conditional branch but
- // jumps to the same MBB is either case. We have duplicate CFG edges in that
- // case that we can't handle. Since this never happens in properly optimized
- // code, just skip those edges.
- if (TBB && TBB == FBB) {
- DEBUG(dbgs() << "Won't split critical edge after degenerate BB#"
- << getNumber() << '\n');
- return nullptr;
- }
-
MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
MF->insert(std::next(MachineFunction::iterator(this)), NMBB);
DEBUG(dbgs() << "Splitting critical edge:"
@@ -752,8 +730,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
<< " -- BB#" << NMBB->getNumber()
<< " -- BB#" << Succ->getNumber() << '\n');
- LiveIntervals *LIS = P->getAnalysisIfAvailable<LiveIntervals>();
- SlotIndexes *Indexes = P->getAnalysisIfAvailable<SlotIndexes>();
+ LiveIntervals *LIS = P.getAnalysisIfAvailable<LiveIntervals>();
+ SlotIndexes *Indexes = P.getAnalysisIfAvailable<SlotIndexes>();
if (LIS)
LIS->insertMBBInMaps(NMBB);
else if (Indexes)
@@ -762,7 +740,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
// On some targets like Mips, branches may kill virtual registers. Make sure
// that LiveVariables is properly updated after updateTerminator replaces the
// terminators.
- LiveVariables *LV = P->getAnalysisIfAvailable<LiveVariables>();
+ LiveVariables *LV = P.getAnalysisIfAvailable<LiveVariables>();
// Collect a list of virtual registers killed by the terminators.
SmallVector<unsigned, 4> KilledRegs;
@@ -777,7 +755,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
continue;
unsigned Reg = OI->getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- LV->getVarInfo(Reg).removeKill(MI)) {
+ LV->getVarInfo(Reg).removeKill(*MI)) {
KilledRegs.push_back(Reg);
DEBUG(dbgs() << "Removing terminator kill: " << *MI);
OI->setIsKill(false);
@@ -826,24 +804,24 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
E = Terminators.end(); I != E; ++I) {
if (std::find(NewTerminators.begin(), NewTerminators.end(), *I) ==
NewTerminators.end())
- Indexes->removeMachineInstrFromMaps(*I);
+ Indexes->removeMachineInstrFromMaps(**I);
}
}
// Insert unconditional "jump Succ" instruction in NMBB if necessary.
NMBB->addSuccessor(Succ);
if (!NMBB->isLayoutSuccessor(Succ)) {
- Cond.clear();
+ SmallVector<MachineOperand, 4> Cond;
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
TII->InsertBranch(*NMBB, Succ, nullptr, Cond, DL);
if (Indexes) {
- for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end();
- I != E; ++I) {
+ for (MachineInstr &MI : NMBB->instrs()) {
// Some instructions may have been moved to NMBB by updateTerminator(),
// so we first remove any instruction that already has an index.
- if (Indexes->hasIndex(&*I))
- Indexes->removeMachineInstrFromMaps(&*I);
- Indexes->insertMachineInstrInMaps(&*I);
+ if (Indexes->hasIndex(MI))
+ Indexes->removeMachineInstrFromMaps(MI);
+ Indexes->insertMachineInstrInMaps(MI);
}
}
}
@@ -942,10 +920,10 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
}
if (MachineDominatorTree *MDT =
- P->getAnalysisIfAvailable<MachineDominatorTree>())
+ P.getAnalysisIfAvailable<MachineDominatorTree>())
MDT->recordSplitCriticalEdge(this, Succ, NMBB);
- if (MachineLoopInfo *MLI = P->getAnalysisIfAvailable<MachineLoopInfo>())
+ if (MachineLoopInfo *MLI = P.getAnalysisIfAvailable<MachineLoopInfo>())
if (MachineLoop *TIL = MLI->getLoopFor(this)) {
// If one or the other blocks were not in a loop, the new block is not
// either, and thus LI doesn't need to be updated.
@@ -975,6 +953,42 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
return NMBB;
}
+bool MachineBasicBlock::canSplitCriticalEdge(
+ const MachineBasicBlock *Succ) const {
+ // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (Succ->isEHPad())
+ return false;
+
+ const MachineFunction *MF = getParent();
+
+ // Performance might be harmed on HW that implements branching using exec mask
+ // where both sides of the branches are always executed.
+ if (MF->getTarget().requiresStructuredCFG())
+ return false;
+
+ // We may need to update this's terminator, but we can't do that if
+ // AnalyzeBranch fails. If this uses a jump table, we won't touch it.
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+  // analyzeBranch should not modify this block, since we did not allow
+  // modification (AllowModify is false).
+ if (TII->analyzeBranch(*const_cast<MachineBasicBlock *>(this), TBB, FBB, Cond,
+ /*AllowModify*/ false))
+ return false;
+
+ // Avoid bugpoint weirdness: A block may end with a conditional branch but
+  // jumps to the same MBB in either case. We have duplicate CFG edges in that
+ // case that we can't handle. Since this never happens in properly optimized
+ // code, just skip those edges.
+ if (TBB && TBB == FBB) {
+ DEBUG(dbgs() << "Won't split critical edge after degenerate BB#"
+ << getNumber() << '\n');
+ return false;
+ }
+ return true;
+}
+
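
SplitCriticalEdge's legality checks now live in the const predicate canSplitCriticalEdge, so callers can ask whether an edge is splittable without side effects, and the splitting routine itself starts with a single guard. A small sketch of that check/act split, with placeholder types:

#include <memory>

struct BlockStub {};
struct EdgeStub { const BlockStub *From; const BlockStub *To; };

// Const query: stands in for canSplitCriticalEdge (reject EH-pad targets,
// structured-CFG targets, unanalyzable branches, degenerate duplicate edges).
bool canSplit(const EdgeStub &E) {
  return E.From && E.To && E.From != E.To;
}

// Mutating operation: stands in for SplitCriticalEdge, which now begins with
// the guard and then does the actual rewiring.
std::unique_ptr<BlockStub> split(const EdgeStub &E) {
  if (!canSplit(E))
    return nullptr;
  return std::unique_ptr<BlockStub>(new BlockStub());  // the inserted block
}

int main() {
  BlockStub A, B;
  EdgeStub E{&A, &B};
  return split(E) ? 0 : 1;
}
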
/// Prepare MI to be removed from its bundle. This fixes bundle flags on MI's
/// neighboring instructions so the bundle won't be broken by removing MI.
static void unbundleSingleMI(MachineInstr *MI) {
@@ -1200,7 +1214,7 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
--I;
MachineOperandIteratorBase::PhysRegInfo Info =
- ConstMIOperands(I).analyzePhysReg(Reg, TRI);
+ ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
// Defs happen after uses so they take precedence if both are present.
@@ -1208,8 +1222,15 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
if (Info.DeadDef)
return LQR_Dead;
// Register is (at least partially) live after a def.
- if (Info.Defined)
- return LQR_Live;
+ if (Info.Defined) {
+ if (!Info.PartialDeadDef)
+ return LQR_Live;
+ // As soon as we saw a partial definition (dead or not),
+ // we cannot tell if the value is partial live without
+ // tracking the lanemasks. We are not going to do this,
+ // so fall back on the remaining of the analysis.
+ break;
+ }
// Register is dead after a full kill or clobber and no def.
if (Info.Killed || Info.Clobbered)
return LQR_Dead;
@@ -1238,7 +1259,7 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
if (I != end()) {
for (++I; I != end() && N > 0; ++I, --N) {
MachineOperandIteratorBase::PhysRegInfo Info =
- ConstMIOperands(I).analyzePhysReg(Reg, TRI);
+ ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
// Register is live when we read it here.
if (Info.Read)
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 9119e31bdb3c..6c0f99fa111e 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -20,43 +20,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "block-freq"
#ifndef NDEBUG
-enum GVDAGType {
- GVDT_None,
- GVDT_Fraction,
- GVDT_Integer
-};
-static cl::opt<GVDAGType>
-ViewMachineBlockFreqPropagationDAG("view-machine-block-freq-propagation-dags",
- cl::Hidden,
- cl::desc("Pop up a window to show a dag displaying how machine block "
- "frequencies propagate through the CFG."),
- cl::values(
- clEnumValN(GVDT_None, "none",
- "do not display graphs."),
- clEnumValN(GVDT_Fraction, "fraction", "display a graph using the "
- "fractional block frequency representation."),
- clEnumValN(GVDT_Integer, "integer", "display a graph using the raw "
- "integer fractional block frequency representation."),
- clEnumValEnd));
+static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
+ "view-machine-block-freq-propagation-dags", cl::Hidden,
+ cl::desc("Pop up a window to show a dag displaying how machine block "
+ "frequencies propagate through the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count", "display a graph using the real "
+ "profile count if available."),
+
+ clEnumValEnd));
+
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+extern cl::opt<unsigned> ViewHotFreqPercent;
namespace llvm {
-template <>
-struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
typedef const MachineBasicBlock NodeType;
typedef MachineBasicBlock::const_succ_iterator ChildIteratorType;
typedef MachineFunction::const_iterator nodes_iterator;
- static inline
- const NodeType *getEntryNode(const MachineBlockFrequencyInfo *G) {
+ static inline const NodeType *
+ getEntryNode(const MachineBlockFrequencyInfo *G) {
return &G->getFunction()->front();
}
@@ -77,38 +78,33 @@ struct GraphTraits<MachineBlockFrequencyInfo *> {
}
};
-template<>
-struct DOTGraphTraits<MachineBlockFrequencyInfo*> :
- public DefaultDOTGraphTraits {
- explicit DOTGraphTraits(bool isSimple=false) :
- DefaultDOTGraphTraits(isSimple) {}
-
- static std::string getGraphName(const MachineBlockFrequencyInfo *G) {
- return G->getFunction()->getName();
- }
+typedef BFIDOTGraphTraitsBase<MachineBlockFrequencyInfo,
+ MachineBranchProbabilityInfo>
+ MBFIDOTGraphTraitsBase;
+template <>
+struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+ : public MBFIDOTGraphTraitsBase {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : MBFIDOTGraphTraitsBase(isSimple) {}
std::string getNodeLabel(const MachineBasicBlock *Node,
const MachineBlockFrequencyInfo *Graph) {
- std::string Result;
- raw_string_ostream OS(Result);
-
- OS << Node->getName().str() << ":";
- switch (ViewMachineBlockFreqPropagationDAG) {
- case GVDT_Fraction:
- Graph->printBlockFreq(OS, Node);
- break;
- case GVDT_Integer:
- OS << Graph->getBlockFreq(Node).getFrequency();
- break;
- case GVDT_None:
- llvm_unreachable("If we are not supposed to render a graph we should "
- "never reach this point.");
- }
-
- return Result;
+ return MBFIDOTGraphTraitsBase::getNodeLabel(
+ Node, Graph, ViewMachineBlockFreqPropagationDAG);
}
-};
+ std::string getNodeAttributes(const MachineBasicBlock *Node,
+ const MachineBlockFrequencyInfo *Graph) {
+ return MBFIDOTGraphTraitsBase::getNodeAttributes(Node, Graph,
+ ViewHotFreqPercent);
+ }
+
+ std::string getEdgeAttributes(const MachineBasicBlock *Node, EdgeIter EI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return MBFIDOTGraphTraitsBase::getEdgeAttributes(
+ Node, EI, MBFI, MBFI->getMBPI(), ViewHotFreqPercent);
+ }
+};
} // end namespace llvm
#endif
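
The machine-level DOT printing is now a thin specialization over a shared BFIDOTGraphTraitsBase rather than a hand-written getNodeLabel. A minimal sketch of that shared-base layout, with stand-in types (the real base lives in a header that is not part of this diff):

#include <string>

// Shared base: label/attribute logic, parameterized over the BFI/BPI types.
template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
struct BFIDOTTraitsBaseLike {
  template <class NodeT>
  static std::string getNodeLabel(const NodeT *, const BlockFrequencyInfoT *,
                                  int /*GVDAGType mode*/) {
    return "freq";  // the real base prints fraction / integer / profile count
  }
};

struct MachineBFILike {};
struct MachineBPILike {};

// The specialization shrinks to thin forwarders, as in the hunk above.
struct MachineDOTTraitsLike
    : BFIDOTTraitsBaseLike<MachineBFILike, MachineBPILike> {};

int main() {
  MachineBFILike BFI;
  return MachineDOTTraitsLike::getNodeLabel<int>(nullptr, &BFI, 0) == "freq" ? 0 : 1;
}
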
@@ -122,9 +118,8 @@ INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
char MachineBlockFrequencyInfo::ID = 0;
-
-MachineBlockFrequencyInfo::
-MachineBlockFrequencyInfo() :MachineFunctionPass(ID) {
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo()
+ : MachineFunctionPass(ID) {
initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
}
@@ -145,7 +140,9 @@ bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
MBFI.reset(new ImplType);
MBFI->calculate(F, MBPI, MLI);
#ifndef NDEBUG
- if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
+ if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
view();
}
#endif
@@ -163,19 +160,29 @@ void MachineBlockFrequencyInfo::view() const {
"MachineBlockFrequencyDAGs");
#else
errs() << "MachineBlockFrequencyInfo::view is only available in debug builds "
- "on systems with Graphviz or gv!\n";
+ "on systems with Graphviz or gv!\n";
#endif // NDEBUG
}
-BlockFrequency MachineBlockFrequencyInfo::
-getBlockFreq(const MachineBasicBlock *MBB) const {
+BlockFrequency
+MachineBlockFrequencyInfo::getBlockFreq(const MachineBasicBlock *MBB) const {
return MBFI ? MBFI->getBlockFreq(MBB) : 0;
}
+Optional<uint64_t> MachineBlockFrequencyInfo::getBlockProfileCount(
+    const MachineBasicBlock *MBB) const {
+  if (!MBFI)
+    return None;
+  const Function *F = MBFI->getFunction()->getFunction();
+  return MBFI->getBlockProfileCount(*F, MBB);
+}
+
const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
return MBFI ? MBFI->getFunction() : nullptr;
}
+const MachineBranchProbabilityInfo *MachineBlockFrequencyInfo::getMBPI() const {
+ return MBFI ? &MBFI->getBPI() : nullptr;
+}
+
raw_ostream &
MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
const BlockFrequency Freq) const {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index f5e305645011..03dda8b36a71 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -26,6 +26,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "BranchFolding.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -62,10 +64,12 @@ static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
"blocks in the function."),
cl::init(0), cl::Hidden);
-static cl::opt<unsigned>
- AlignAllLoops("align-all-loops",
- cl::desc("Force the alignment of all loops in the function."),
- cl::init(0), cl::Hidden);
+static cl::opt<unsigned> AlignAllNonFallThruBlocks(
+ "align-all-nofallthru-blocks",
+ cl::desc("Force the alignment of all "
+ "blocks that have no fall-through predecessors (i.e. don't add "
+ "nops that are executed)."),
+ cl::init(0), cl::Hidden);
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned> ExitBlockBias(
@@ -97,10 +101,15 @@ static cl::opt<bool>
cl::desc("Model the cost of loop rotation more "
"precisely by using profile data."),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ForcePreciseRotationCost("force-precise-rotation-cost",
+ cl::desc("Force the use of precise cost "
+ "loop rotation strategy."),
+ cl::init(false), cl::Hidden);
static cl::opt<unsigned> MisfetchCost(
"misfetch-cost",
- cl::desc("Cost that models the probablistic risk of an instruction "
+ cl::desc("Cost that models the probabilistic risk of an instruction "
"misfetch due to a jump comparing to falling through, whose cost "
"is zero."),
cl::init(1), cl::Hidden);
@@ -109,6 +118,15 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
cl::desc("Cost of jump instructions."),
cl::init(1), cl::Hidden);
+static cl::opt<bool>
+BranchFoldPlacement("branch-fold-placement",
+ cl::desc("Perform branch folding during placement. "
+ "Reduces code size."),
+ cl::init(true), cl::Hidden);
+
+extern cl::opt<unsigned> StaticLikelyProb;
+extern cl::opt<unsigned> ProfileLikelyProb;
+
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
@@ -149,7 +167,7 @@ public:
/// function. It also registers itself as the chain that block participates
/// in with the BlockToChain mapping.
BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
- : Blocks(1, BB), BlockToChain(BlockToChain), LoopPredecessors(0) {
+ : Blocks(1, BB), BlockToChain(BlockToChain), UnscheduledPredecessors(0) {
assert(BB && "Cannot create a chain with a null basic block");
BlockToChain[BB] = this;
}
@@ -201,11 +219,16 @@ public:
}
#endif // NDEBUG
- /// \brief Count of predecessors within the loop currently being processed.
+ /// \brief Count of predecessors of any block within the chain which have not
+ /// yet been scheduled. In general, we will delay scheduling this chain
+ /// until those predecessors are scheduled (or we find a sufficiently good
+ /// reason to override this heuristic.) Note that when forming loop chains,
+ /// blocks outside the loop are ignored and treated as if they were already
+ /// scheduled.
///
- /// This count is updated at each loop we process to represent the number of
- /// in-loop predecessors of this chain.
- unsigned LoopPredecessors;
+ /// Note: This field is reinitialized multiple times - once for each loop,
+ /// and then once for the function as a whole.
+ unsigned UnscheduledPredecessors;
};
}
@@ -214,14 +237,21 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// \brief A typedef for a block filter set.
typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet;
+ /// \brief work lists of blocks that are ready to be laid out
+ SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+ SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+
+ /// \brief Machine Function
+ MachineFunction *F;
+
/// \brief A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
/// \brief A handle to the function-wide block frequency pass.
- const MachineBlockFrequencyInfo *MBFI;
+ std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
/// \brief A handle to the loop info.
- const MachineLoopInfo *MLI;
+ MachineLoopInfo *MLI;
/// \brief A handle to the target's instruction info.
const TargetInstrInfo *TII;
@@ -254,33 +284,56 @@ class MachineBlockPlacement : public MachineFunctionPass {
DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
- SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
const BlockFilterSet *BlockFilter = nullptr);
+ BranchProbability
+ collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
+ bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ BranchProbability SuccProb,
+ BranchProbability HotProb);
+ bool
+ hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
BlockChain &Chain,
const BlockFilterSet *BlockFilter);
MachineBasicBlock *
selectBestCandidateBlock(BlockChain &Chain,
- SmallVectorImpl<MachineBasicBlock *> &WorkList,
- const BlockFilterSet *BlockFilter);
+ SmallVectorImpl<MachineBasicBlock *> &WorkList);
MachineBasicBlock *
- getFirstUnplacedBlock(MachineFunction &F, const BlockChain &PlacedChain,
+ getFirstUnplacedBlock(const BlockChain &PlacedChain,
MachineFunction::iterator &PrevUnplacedBlockIt,
const BlockFilterSet *BlockFilter);
+
+ /// \brief Add a basic block to the work list if it is appropriate.
+ ///
+  /// If the optional parameter BlockFilter is provided, MBB is added to the
+  /// worklist only if it is present in the set. If nullptr is provided, no
+  /// filtering occurs.
+ void fillWorkLists(MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter);
void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
- SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
const BlockFilterSet *BlockFilter = nullptr);
MachineBasicBlock *findBestLoopTop(MachineLoop &L,
const BlockFilterSet &LoopBlockSet);
- MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L,
+ MachineBasicBlock *findBestLoopExit(MachineLoop &L,
const BlockFilterSet &LoopBlockSet);
- BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L);
- void buildLoopChains(MachineFunction &F, MachineLoop &L);
+ BlockFilterSet collectLoopBlockSet(MachineLoop &L);
+ void buildLoopChains(MachineLoop &L);
void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
const BlockFilterSet &LoopBlockSet);
void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
const BlockFilterSet &LoopBlockSet);
- void buildCFGChains(MachineFunction &F);
+ void collectMustExecuteBBs();
+ void buildCFGChains();
+ void optimizeBranches();
+ void alignBlocks();
public:
static char ID; // Pass identification, replacement for typeid
@@ -295,6 +348,7 @@ public:
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -319,18 +373,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {
std::string Result;
raw_string_ostream OS(Result);
OS << "BB#" << BB->getNumber();
- OS << " (derived from LLVM BB '" << BB->getName() << "')";
- OS.flush();
- return Result;
-}
-
-/// \brief Helper to print the number of a MBB.
-///
-/// Only used by debug logging.
-static std::string getBlockNum(MachineBasicBlock *BB) {
- std::string Result;
- raw_string_ostream OS(Result);
- OS << "BB#" << BB->getNumber();
+ OS << " ('" << BB->getName() << "')";
OS.flush();
return Result;
}
@@ -344,7 +387,6 @@ static std::string getBlockNum(MachineBasicBlock *BB) {
/// chain which reach the zero-predecessor state to the worklist passed in.
void MachineBlockPlacement::markChainSuccessors(
BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
- SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
const BlockFilterSet *BlockFilter) {
// Walk all the blocks in this chain, marking their successors as having
// a predecessor placed.
@@ -363,30 +405,26 @@ void MachineBlockPlacement::markChainSuccessors(
// This is a cross-chain edge that is within the loop, so decrement the
// loop predecessor count of the destination chain.
- if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0)
- BlockWorkList.push_back(*SuccChain.begin());
+ if (SuccChain.UnscheduledPredecessors == 0 ||
+ --SuccChain.UnscheduledPredecessors > 0)
+ continue;
+
+ auto *MBB = *SuccChain.begin();
+ if (MBB->isEHPad())
+ EHPadWorkList.push_back(MBB);
+ else
+ BlockWorkList.push_back(MBB);
}
}
}
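
markChainSuccessors now counts down UnscheduledPredecessors and, when a chain becomes ready, routes its head block to either the EH-pad work list or the ordinary one. A small sketch of that ready-counting pattern, with illustrative types:

#include <vector>

struct ChainLike {
  unsigned UnscheduledPredecessors;
  bool IsEHPadHead;
};

// Called when one more predecessor of the chain's blocks has been placed.
void notePredecessorScheduled(ChainLike &C,
                              std::vector<ChainLike *> &BlockWorkList,
                              std::vector<ChainLike *> &EHPadWorkList) {
  if (C.UnscheduledPredecessors == 0 || --C.UnscheduledPredecessors > 0)
    return;  // either never counted (outside the filter) or still not ready
  (C.IsEHPadHead ? EHPadWorkList : BlockWorkList).push_back(&C);
}

int main() {
  ChainLike C{2, false};
  std::vector<ChainLike *> Blocks, EHPads;
  notePredecessorScheduled(C, Blocks, EHPads);  // 2 -> 1, not ready yet
  notePredecessorScheduled(C, Blocks, EHPads);  // 1 -> 0, pushed to Blocks
  return Blocks.size() == 1 ? 0 : 1;
}
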
-/// \brief Select the best successor for a block.
-///
-/// This looks across all successors of a particular block and attempts to
-/// select the "best" one to be the layout successor. It only considers direct
-/// successors which also pass the block filter. It will attempt to avoid
-/// breaking CFG structure, but cave and break such structures in the case of
-/// very hot successor edges.
-///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
-MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter) {
- const BranchProbability HotProb(4, 5); // 80%
-
- MachineBasicBlock *BestSucc = nullptr;
- auto BestProb = BranchProbability::getZero();
-
+/// This helper function collects the set of successors of block
+/// \p BB that are allowed to be its layout successors, and returns
+/// the total branch probability of edges from \p BB to those
+/// blocks.
+BranchProbability MachineBlockPlacement::collectViableSuccessors(
+ MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors) {
   // Adjust edge probabilities by excluding edges pointing to blocks that are
   // either not in BlockFilter or are already in the current chain. Consider the
// following CFG:
@@ -400,20 +438,17 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
// Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
// A->C is chosen as a fall-through, D won't be selected as a successor of C
// due to CFG constraint (the probability of C->D is not greater than
- // HotProb). If we exclude E that is not in BlockFilter when calculating the
- // probability of C->D, D will be selected and we will get A C D B as the
- // layout of this loop.
+ // HotProb to break top-order). If we exclude E that is not in BlockFilter
+ // when calculating the probability of C->D, D will be selected and we
+ // will get A C D B as the layout of this loop.
auto AdjustedSumProb = BranchProbability::getOne();
- SmallVector<MachineBasicBlock *, 4> Successors;
for (MachineBasicBlock *Succ : BB->successors()) {
bool SkipSucc = false;
- if (BlockFilter && !BlockFilter->count(Succ)) {
+ if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
SkipSucc = true;
} else {
BlockChain *SuccChain = BlockToChain[Succ];
if (SuccChain == &Chain) {
- DEBUG(dbgs() << " " << getBlockName(Succ)
- << " -> Already merged!\n");
SkipSucc = true;
} else if (Succ != *SuccChain->begin()) {
DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n");
@@ -426,78 +461,267 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
Successors.push_back(Succ);
}
- DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
- for (MachineBasicBlock *Succ : Successors) {
- BranchProbability SuccProb;
- uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator();
- uint32_t SuccProbD = AdjustedSumProb.getNumerator();
- if (SuccProbN >= SuccProbD)
- SuccProb = BranchProbability::getOne();
- else
- SuccProb = BranchProbability(SuccProbN, SuccProbD);
-
- // If we outline optional branches, look whether Succ is unavoidable, i.e.
- // dominates all terminators of the MachineFunction. If it does, other
- // successors must be optional. Don't do this for cold branches.
- if (OutlineOptionalBranches && SuccProb > HotProb.getCompl() &&
- UnavoidableBlocks.count(Succ) > 0) {
- auto HasShortOptionalBranch = [&]() {
- for (MachineBasicBlock *Pred : Succ->predecessors()) {
- // Check whether there is an unplaced optional branch.
- if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
- continue;
- // Check whether the optional branch has exactly one BB.
- if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
- continue;
- // Check whether the optional branch is small.
- if (Pred->size() < OutlineOptionalThreshold)
- return true;
- }
+ return AdjustedSumProb;
+}
+
+/// The helper function returns the branch probability that is adjusted
+/// or normalized over the new total \p AdjustedSumProb.
+static BranchProbability
+getAdjustedProbability(BranchProbability OrigProb,
+ BranchProbability AdjustedSumProb) {
+ BranchProbability SuccProb;
+ uint32_t SuccProbN = OrigProb.getNumerator();
+ uint32_t SuccProbD = AdjustedSumProb.getNumerator();
+ if (SuccProbN >= SuccProbD)
+ SuccProb = BranchProbability::getOne();
+ else
+ SuccProb = BranchProbability(SuccProbN, SuccProbD);
+
+ return SuccProb;
+}
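
A worked example of the renormalization getAdjustedProbability performs once collectViableSuccessors has excluded some edges. The numbers are invented for illustration, and a denominator of 100 stands in for BranchProbability's large fixed denominator to keep the arithmetic readable.

#include <cstdio>

int main() {
  unsigned Denominator = 100;  // readable stand-in for the fixed-point denominator
  unsigned OrigProbN = 45;     // P(BB->Succ) before adjustment: 45%
  unsigned AdjustedSumN = 60;  // probability mass surviving the exclusions: 60%
  unsigned AdjustedN = (OrigProbN >= AdjustedSumN)
                           ? Denominator  // clamp to 1, as the code above does
                           : OrigProbN * Denominator / AdjustedSumN;
  std::printf("adjusted P(BB->Succ) = %u%%\n", AdjustedN);  // prints 75%
}
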
+
+/// When the option OutlineOptionalBranches is on, this method
+/// checks if the fallthrough candidate block \p Succ (of block
+/// \p BB) also has other unscheduled predecessor blocks which
+/// are also successors of \p BB (forming a triangle-shaped CFG).
+/// If none of these predecessors is small, it returns true.
+/// The caller can then choose to select \p Succ as the layout successor
+/// so that \p Succ's predecessors (the optional branches) can be
+/// outlined.
+/// FIXME: fold this with more general layout cost analysis.
+bool MachineBlockPlacement::shouldPredBlockBeOutlined(
+ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
+ BranchProbability HotProb) {
+ if (!OutlineOptionalBranches)
+ return false;
+ // If we outline optional branches, look whether Succ is unavoidable, i.e.
+ // dominates all terminators of the MachineFunction. If it does, other
+ // successors must be optional. Don't do this for cold branches.
+ if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // Check whether there is an unplaced optional branch.
+ if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
+ BlockToChain[Pred] == &Chain)
+ continue;
+ // Check whether the optional branch has exactly one BB.
+ if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+ continue;
+ // Check whether the optional branch is small.
+ if (Pred->size() < OutlineOptionalThreshold)
return false;
- };
- if (!HasShortOptionalBranch())
- return Succ;
}
+ return true;
+ } else
+ return false;
+}
- // Only consider successors which are either "hot", or wouldn't violate
- // any CFG constraints.
- BlockChain &SuccChain = *BlockToChain[Succ];
- if (SuccChain.LoopPredecessors != 0) {
- if (SuccProb < HotProb) {
- DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
- << " (prob) (CFG conflict)\n");
- continue;
- }
+// When no profile is present, return the StaticLikelyProb.
+// When a profile is available, we need to handle the triangle-shaped CFG.
+static BranchProbability getLayoutSuccessorProbThreshold(
+ MachineBasicBlock *BB) {
+ if (!BB->getParent()->getFunction()->getEntryCount())
+ return BranchProbability(StaticLikelyProb, 100);
+ if (BB->succ_size() == 2) {
+ const MachineBasicBlock *Succ1 = *BB->succ_begin();
+ const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
+ if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
+      /* See case 1 below for the cost analysis. For BB->Succ to
+       * be taken with smaller cost, the following needs to hold:
+       *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
+       * So the threshold T satisfies
+       *   T = 2 * Prob(BB->Pred), i.e. Prob(BB->Pred) = T/2.
+       * Since T + Prob(BB->Pred) == 1, we have T + T/2 = 1,
+       * i.e. T = 2/3. Also adding the user specified branch bias,
+       * we have
+       *   T = (2/3) * (ProfileLikelyProb/50)
+       *     = (2*ProfileLikelyProb)/150
+       */
+ return BranchProbability(2 * ProfileLikelyProb, 150);
+ }
+ }
+ return BranchProbability(ProfileLikelyProb, 100);
+}
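
As a numeric check of the derivation in the comment above, the snippet below works the threshold through with the default ProfileLikelyProb of 51; the 60/40 and 70/30 branch splits are invented example values, not anything from the patch.

// With ProfileLikelyProb = 51 the case-1 threshold is 102/150 = 0.68, so a
// 60/40 branch is not biased enough to break the topological order while a
// 70/30 branch is.
#include <iostream>

int main() {
  const double ProfileLikelyProb = 51.0;                  // default cl::opt value
  const double Threshold = (2 * ProfileLikelyProb) / 150; // 0.68

  auto breaksTopoOrder = [&](double ProbSucc) { return ProbSucc >= Threshold; };

  std::cout << "threshold = " << Threshold << "\n";
  std::cout << "60/40: " << (breaksTopoOrder(0.60) ? "yes" : "no") << "\n";
  std::cout << "70/30: " << (breaksTopoOrder(0.70) ? "yes" : "no") << "\n";
}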
- // Make sure that a hot successor doesn't have a globally more
- // important predecessor.
- auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
- BlockFrequency CandidateEdgeFreq =
- MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl();
- bool BadCFGConflict = false;
- for (MachineBasicBlock *Pred : Succ->predecessors()) {
- if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
- continue;
- BlockFrequency PredEdgeFreq =
- MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
- if (PredEdgeFreq >= CandidateEdgeFreq) {
- BadCFGConflict = true;
- break;
- }
- }
- if (BadCFGConflict) {
- DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
- << " (prob) (non-cold CFG conflict)\n");
- continue;
- }
+/// Checks to see if the layout candidate block \p Succ has a better layout
+/// predecessor than \c BB. If yes, returns true.
+bool MachineBlockPlacement::hasBetterLayoutPredecessor(
+ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
+ BranchProbability SuccProb, BranchProbability RealSuccProb,
+ BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+
+ // There isn't a better layout when there are no unscheduled predecessors.
+ if (SuccChain.UnscheduledPredecessors == 0)
+ return false;
+
+ // There are two basic scenarios here:
+ // -------------------------------------
+ // Case 1: triangular shape CFG (if-then):
+ // BB
+ // | \
+ // | \
+ // | Pred
+ // | /
+ // Succ
+  // In this case, we are evaluating whether to select edge BB->Succ, i.e.
+  // set Succ as the layout successor of BB. Picking Succ as BB's
+  // successor breaks the CFG constraints (FIXME: define these constraints).
+  // With this layout, block Pred is forced to be outlined, so the overall
+  // cost will be the cost of the taken branch from BB to Pred, plus the cost
+  // of the taken back branch from Pred to Succ, as well as the additional
+  // cost associated with the needed unconditional jump instruction from
+  // Pred to Succ.
+
+ // The cost of the topological order layout is the taken branch cost
+ // from BB to Succ, so to make BB->Succ a viable candidate, the following
+ // must hold:
+ // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
+ // < freq(BB->Succ) * taken_branch_cost.
+ // Ignoring unconditional jump cost, we get
+ // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
+ // prob(BB->Succ) > 2 * prob(BB->Pred)
+ //
+ // When real profile data is available, we can precisely compute the
+ // probability threshold that is needed for edge BB->Succ to be considered.
+ // Without profile data, the heuristic requires the branch bias to be
+ // a lot larger to make sure the signal is very strong (e.g. 80% default).
+ // -----------------------------------------------------------------
+ // Case 2: diamond like CFG (if-then-else):
+ // S
+ // / \
+ // | \
+ // BB Pred
+ // \ /
+ // Succ
+ // ..
+ //
+ // The current block is BB and edge BB->Succ is now being evaluated.
+ // Note that edge S->BB was previously already selected because
+ // prob(S->BB) > prob(S->Pred).
+ // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
+ // choose Pred, we will have a topological ordering as shown on the left
+ // in the picture below. If we choose Succ, we have the solution as shown
+ // on the right:
+ //
+ // topo-order:
+ //
+ // S----- ---S
+ // | | | |
+ // ---BB | | BB
+ // | | | |
+ // | pred-- | Succ--
+ // | | | |
+ // ---succ ---pred--
+ //
+  // cost = freq(S->Pred) + freq(BB->Succ)    cost = 2 * freq(S->Pred)
+ // = freq(S->Pred) + freq(S->BB)
+ //
+  // If we have profile data (i.e., branch probabilities can be trusted), the
+  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
+  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
+  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
+  // means the cost of the topological order is greater.
+  // When profile data is not available, however, we need to be more
+  // conservative. If the branch prediction is wrong, breaking the topo-order
+  // will actually yield a layout with a large cost. For this reason, we need
+  // a strongly biased branch at block S, i.e. a high Prob(S->BB), in order to
+  // select BB->Succ. This is equivalent to looking at the CFG backwards along
+  // the back edge: Prob(Succ->BB) needs to be >= HotProb for BB->Succ to be
+  // selected (without profile data).
+
+ BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
+
+ // Forward checking. For case 2, SuccProb will be 1.
+ if (SuccProb < HotProb) {
+ DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
+ << " (prob) (CFG conflict)\n");
+ return true;
+ }
+
+ // Make sure that a hot successor doesn't have a globally more
+ // important predecessor.
+ BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
+ bool BadCFGConflict = false;
+
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
+ (BlockFilter && !BlockFilter->count(Pred)) ||
+ BlockToChain[Pred] == &Chain)
+ continue;
+    // Do backward checking. For case 1 it is actually a redundant check. For
+    // case 2 above, we need backward checking to filter out edges that are
+    // not 'strongly' biased. With profile data available, the check is mostly
+    // redundant too (when the threshold prob is set at 50%) unless S has more
+    // than two successors.
+ // BB Pred
+ // \ /
+ // Succ
+ // We select edge BB->Succ if
+ // freq(BB->Succ) > freq(Succ) * HotProb
+    //       i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb +
+    //            freq(Pred->Succ) * HotProb
+    //       i.e. freq(BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
+ BlockFrequency PredEdgeFreq =
+ MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+ if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
+ BadCFGConflict = true;
+ break;
}
+ }
+ if (BadCFGConflict) {
DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
- << " (prob)"
- << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
- << "\n");
+ << " (prob) (non-cold CFG conflict)\n");
+ return true;
+ }
+
+ return false;
+}
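
To make the case 2 trade-off from the comment concrete, here is a small standalone cost comparison; the block frequencies are made-up numbers, and the two layouts correspond to the topo-order and rotated pictures drawn above.

// Case 2 (diamond): S branches to BB and Pred, and both fall into Succ.
// Compare the number of taken branches of the topological layout
// S,BB,Pred,Succ with the rotated layout S,BB,Succ,Pred, using invented
// frequencies that satisfy freq(S->BB) > freq(S->Pred), which is the
// condition under which BB was laid out right after S in the first place.
#include <iostream>

int main() {
  const double FreqS_BB = 70;   // hotter edge out of S
  const double FreqS_Pred = 30; // colder edge out of S

  // Topological order S,BB,Pred,Succ: the taken branches are S->Pred and
  // BB->Succ, and freq(BB->Succ) == freq(S->BB) in this diamond.
  double TopoCost = FreqS_Pred + FreqS_BB; // 100

  // Rotated order S,BB,Succ,Pred: the taken branches are S->Pred and
  // Pred->Succ, and freq(Pred->Succ) == freq(S->Pred).
  double RotatedCost = 2 * FreqS_Pred; // 60

  std::cout << "topo cost = " << TopoCost
            << ", rotated cost = " << RotatedCost << "\n";
  // With a trustworthy profile the rotated layout is never worse, which is
  // exactly what hasBetterLayoutPredecessor() relies on for case 2.
}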
+
+/// \brief Select the best successor for a block.
+///
+/// This looks across all successors of a particular block and attempts to
+/// select the "best" one to be the layout successor. It only considers direct
+/// successors which also pass the block filter. It will attempt to avoid
+/// breaking CFG structure, but cave and break such structures in the case of
+/// very hot successor edges.
+///
+/// \returns The best successor block found, or null if none are viable.
+MachineBasicBlock *
+MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+ const BranchProbability HotProb(StaticLikelyProb, 100);
+
+ MachineBasicBlock *BestSucc = nullptr;
+ auto BestProb = BranchProbability::getZero();
+
+ SmallVector<MachineBasicBlock *, 4> Successors;
+ auto AdjustedSumProb =
+ collectViableSuccessors(BB, Chain, BlockFilter, Successors);
+
+ DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
+ for (MachineBasicBlock *Succ : Successors) {
+ auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
+ BranchProbability SuccProb =
+ getAdjustedProbability(RealSuccProb, AdjustedSumProb);
+
+ // This heuristic is off by default.
+ if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
+ HotProb))
+ return Succ;
+
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Skip the edge \c BB->Succ if block \c Succ has a better layout
+ // predecessor that yields lower global cost.
+ if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
+ Chain, BlockFilter))
+ continue;
+
+ DEBUG(
+ dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
+ << " (prob)"
+ << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
+ << "\n");
if (BestSucc && BestProb >= SuccProb)
continue;
BestSucc = Succ;
@@ -513,12 +737,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
/// profitable only really makes sense in the context of a loop. This returns
/// the most frequently visited block in the worklist, which in the case of
/// a loop, is the one most desirable to be physically close to the rest of the
-/// loop body in order to improve icache behavior.
+/// loop body in order to improve i-cache behavior.
///
/// \returns The best block found, or null if none are viable.
MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
- BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList,
- const BlockFilterSet *BlockFilter) {
+ BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
// Once we need to walk the worklist looking for a candidate, cleanup the
// worklist of already placed entries.
// FIXME: If this shows up on profiles, it could be folded (at the cost of
@@ -529,24 +752,51 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
}),
WorkList.end());
+ if (WorkList.empty())
+ return nullptr;
+
+ bool IsEHPad = WorkList[0]->isEHPad();
+
MachineBasicBlock *BestBlock = nullptr;
BlockFrequency BestFreq;
for (MachineBasicBlock *MBB : WorkList) {
+ assert(MBB->isEHPad() == IsEHPad);
+
BlockChain &SuccChain = *BlockToChain[MBB];
- if (&SuccChain == &Chain) {
- DEBUG(dbgs() << " " << getBlockName(MBB) << " -> Already merged!\n");
+ if (&SuccChain == &Chain)
continue;
- }
- assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block");
+
+ assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block");
BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
- if (BestBlock && BestFreq >= CandidateFreq)
+
+    // For EH pads, we lay out the least probable ones first so as to avoid
+    // jumping back from less probable landing pads to more probable ones.
+ //
+ // FIXME: Using probability is probably (!) not the best way to achieve
+ // this. We should probably have a more principled approach to layout
+ // cleanup code.
+ //
+ // The goal is to get:
+ //
+ // +--------------------------+
+ // | V
+ // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
+ //
+ // Rather than:
+ //
+ // +-------------------------------------+
+ // V |
+ // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
+ if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
continue;
+
BestBlock = MBB;
BestFreq = CandidateFreq;
}
+
return BestBlock;
}
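
The `IsEHPad ^ (BestFreq >= CandidateFreq)` condition above packs two orderings into one comparison; the standalone sketch below (with invented frequencies) shows that the same expression picks the hottest candidate for the normal worklist and the coldest one for the EH-pad worklist.

// Demonstration of the selection filter used in selectBestCandidateBlock:
// for ordinary blocks we keep the candidate with the highest frequency, for
// EH pads the one with the lowest, and both cases are expressed as
//   skip candidate if (IsEHPad ^ (BestFreq >= CandidateFreq)).
#include <cstdint>
#include <iostream>
#include <vector>

static uint64_t pickBest(const std::vector<uint64_t> &Freqs, bool IsEHPad) {
  bool HaveBest = false;
  uint64_t BestFreq = 0;
  for (uint64_t CandidateFreq : Freqs) {
    if (HaveBest && (IsEHPad ^ (BestFreq >= CandidateFreq)))
      continue; // same filter as the pass, minus the chain bookkeeping
    HaveBest = true;
    BestFreq = CandidateFreq;
  }
  return BestFreq;
}

int main() {
  std::vector<uint64_t> Freqs = {40, 90, 10, 60}; // invented block frequencies
  std::cout << "normal worklist picks freq " << pickBest(Freqs, false) << "\n"; // 90
  std::cout << "EH-pad worklist picks freq " << pickBest(Freqs, true) << "\n";  // 10
}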
@@ -558,10 +808,10 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
/// LastUnplacedBlockIt. We update this iterator on each call to avoid
/// re-scanning the entire sequence on repeated calls to this routine.
MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
- MachineFunction &F, const BlockChain &PlacedChain,
+ const BlockChain &PlacedChain,
MachineFunction::iterator &PrevUnplacedBlockIt,
const BlockFilterSet *BlockFilter) {
- for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E;
+ for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
++I) {
if (BlockFilter && !BlockFilter->count(&*I))
continue;
@@ -576,22 +826,51 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
return nullptr;
}
+void MachineBlockPlacement::fillWorkLists(
+ MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter = nullptr) {
+ BlockChain &Chain = *BlockToChain[MBB];
+ if (!UpdatedPreds.insert(&Chain).second)
+ return;
+
+ assert(Chain.UnscheduledPredecessors == 0);
+ for (MachineBasicBlock *ChainBB : Chain) {
+ assert(BlockToChain[ChainBB] == &Chain);
+ for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
+ if (BlockFilter && !BlockFilter->count(Pred))
+ continue;
+ if (BlockToChain[Pred] == &Chain)
+ continue;
+ ++Chain.UnscheduledPredecessors;
+ }
+ }
+
+ if (Chain.UnscheduledPredecessors != 0)
+ return;
+
+ MBB = *Chain.begin();
+ if (MBB->isEHPad())
+ EHPadWorkList.push_back(MBB);
+ else
+ BlockWorkList.push_back(MBB);
+}
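
The worklist seeding above is essentially the first step of a topological scheduling pass at chain granularity; the self-contained sketch below uses an invented four-chain CFG to show how counting cross-chain predecessors leaves only the entry chain as an initial layout candidate.

// Standalone sketch of the seeding done by fillWorkLists(): each chain counts
// how many predecessors live outside of it, and only chains with no
// unscheduled predecessors become initial candidates.
#include <iostream>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Chain-level CFG (Pred -> Succ over chain ids): 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3.
  std::vector<std::pair<int, int>> Edges = {{0, 1}, {0, 2}, {1, 3}, {2, 3}};
  std::map<int, int> UnscheduledPredecessors;
  for (int C = 0; C < 4; ++C)
    UnscheduledPredecessors[C] = 0;
  for (auto [Pred, Succ] : Edges)
    if (Pred != Succ) // intra-chain edges are not counted
      ++UnscheduledPredecessors[Succ];

  std::vector<int> WorkList;
  for (auto [Chain, Count] : UnscheduledPredecessors)
    if (Count == 0)
      WorkList.push_back(Chain); // only chain 0 here

  for (int C : WorkList)
    std::cout << "initial candidate chain " << C << "\n";
}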
+
void MachineBlockPlacement::buildChain(
MachineBasicBlock *BB, BlockChain &Chain,
- SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
const BlockFilterSet *BlockFilter) {
- assert(BB);
- assert(BlockToChain[BB] == &Chain);
- MachineFunction &F = *BB->getParent();
- MachineFunction::iterator PrevUnplacedBlockIt = F.begin();
+ assert(BB && "BB must not be null.\n");
+ assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+ MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
MachineBasicBlock *LoopHeaderBB = BB;
- markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter);
+ markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
BB = *std::prev(Chain.end());
for (;;) {
- assert(BB);
- assert(BlockToChain[BB] == &Chain);
- assert(*std::prev(Chain.end()) == BB);
+ assert(BB && "null block found at end of chain in loop.");
+ assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
+ assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
+
// Look for the best viable successor if there is one to place immediately
// after this block.
@@ -601,11 +880,12 @@ void MachineBlockPlacement::buildChain(
// block among those we've identified as not violating the loop's CFG at
// this point. This won't be a fallthrough, but it will increase locality.
if (!BestSucc)
- BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter);
+ BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
+ if (!BestSucc)
+ BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
if (!BestSucc) {
- BestSucc =
- getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt, BlockFilter);
+ BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
if (!BestSucc)
break;
@@ -615,18 +895,18 @@ void MachineBlockPlacement::buildChain(
// Place this block, updating the datastructures to reflect its placement.
BlockChain &SuccChain = *BlockToChain[BestSucc];
- // Zero out LoopPredecessors for the successor we're about to merge in case
+ // Zero out UnscheduledPredecessors for the successor we're about to merge in case
// we selected a successor that didn't fit naturally into the CFG.
- SuccChain.LoopPredecessors = 0;
- DEBUG(dbgs() << "Merging from " << getBlockNum(BB) << " to "
- << getBlockNum(BestSucc) << "\n");
- markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter);
+ SuccChain.UnscheduledPredecessors = 0;
+ DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
+ << getBlockName(BestSucc) << "\n");
+ markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
Chain.merge(BestSucc, &SuccChain);
BB = *std::prev(Chain.end());
}
DEBUG(dbgs() << "Finished forming chain for header block "
- << getBlockNum(*Chain.begin()) << "\n");
+ << getBlockName(*Chain.begin()) << "\n");
}
/// \brief Find the best loop top block for layout.
@@ -673,8 +953,10 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
}
// If no direct predecessor is fine, just use the loop header.
- if (!BestPred)
+ if (!BestPred) {
+ DEBUG(dbgs() << " final top unchanged\n");
return L.getHeader();
+ }
// Walk backwards through any straight line of predecessors.
while (BestPred->pred_size() == 1 &&
@@ -692,7 +974,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
/// block to layout at the top of the loop. Typically this is done to maximize
/// fallthrough opportunities.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
+MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
const BlockFilterSet &LoopBlockSet) {
// We don't want to layout the loop linearly in all cases. If the loop header
// is just a normal basic block in the loop, we want to look for what block
@@ -710,7 +992,7 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
unsigned BestExitLoopDepth = 0;
MachineBasicBlock *ExitingBB = nullptr;
// If there are exits to outer loops, loop rotation can severely limit
- // fallthrough opportunites unless it selects such an exit. Keep a set of
+ // fallthrough opportunities unless it selects such an exit. Keep a set of
// blocks where rotating to exit with that block will reach an outer loop.
SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
@@ -780,7 +1062,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
// Restore the old exiting state, no viable looping successor was found.
ExitingBB = OldExitingBB;
BestExitEdgeFreq = OldBestExitEdgeFreq;
- continue;
}
}
// Without a candidate exiting block or with only a single block in the
@@ -973,7 +1254,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
}
}
- DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter)
+ DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter)
<< " to the top: " << Cost.getFrequency() << "\n");
if (Cost < SmallestRotationCost) {
@@ -983,7 +1264,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
}
if (RotationPos != LoopChain.end()) {
- DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos)
+ DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
<< " to the top\n");
std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
}
@@ -994,7 +1275,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
/// When profile data is available, exclude cold blocks from the returned set;
/// otherwise, collect all blocks in the loop.
MachineBlockPlacement::BlockFilterSet
-MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
+MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
BlockFilterSet LoopBlockSet;
// Filter cold blocks off from LoopBlockSet when profile data is available.
@@ -1006,7 +1287,7 @@ MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
// will be merged into the first outer loop chain for which this block is not
// cold anymore. This needs precise profile data and we only do this when
// profile data is available.
- if (F.getFunction()->getEntryCount()) {
+ if (F->getFunction()->getEntryCount()) {
BlockFrequency LoopFreq(0);
for (auto LoopPred : L.getHeader()->predecessors())
if (!L.contains(LoopPred))
@@ -1031,21 +1312,22 @@ MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
/// as much as possible. We can then stitch the chains together in a way which
/// both preserves the topological structure and minimizes taken conditional
/// branches.
-void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
- MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
// First recurse through any nested loops, building chains for those inner
// loops.
for (MachineLoop *InnerLoop : L)
- buildLoopChains(F, *InnerLoop);
+ buildLoopChains(*InnerLoop);
- SmallVector<MachineBasicBlock *, 16> BlockWorkList;
- BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L);
+ assert(BlockWorkList.empty());
+ assert(EHPadWorkList.empty());
+ BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
// Check if we have profile data for this function. If yes, we will rotate
// this loop by modeling costs more precisely which requires the profile data
// for better layout.
bool RotateLoopWithProfile =
- PreciseRotationCost && F.getFunction()->getEntryCount();
+ ForcePreciseRotationCost ||
+ (PreciseRotationCost && F->getFunction()->getEntryCount());
// First check to see if there is an obviously preferable top block for the
// loop. This will default to the header, but may end up as one of the
@@ -1060,7 +1342,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
// branches by placing an exit edge at the bottom.
MachineBasicBlock *ExitingBB = nullptr;
if (!RotateLoopWithProfile && LoopTop == L.getHeader())
- ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
+ ExitingBB = findBestLoopExit(L, LoopBlockSet);
BlockChain &LoopChain = *BlockToChain[LoopTop];
@@ -1068,29 +1350,13 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
// walk the blocks, and use a set to prevent visiting a particular chain
// twice.
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
- assert(LoopChain.LoopPredecessors == 0);
+ assert(LoopChain.UnscheduledPredecessors == 0);
UpdatedPreds.insert(&LoopChain);
- for (MachineBasicBlock *LoopBB : LoopBlockSet) {
- BlockChain &Chain = *BlockToChain[LoopBB];
- if (!UpdatedPreds.insert(&Chain).second)
- continue;
+ for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
- assert(Chain.LoopPredecessors == 0);
- for (MachineBasicBlock *ChainBB : Chain) {
- assert(BlockToChain[ChainBB] == &Chain);
- for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
- if (BlockToChain[Pred] == &Chain || !LoopBlockSet.count(Pred))
- continue;
- ++Chain.LoopPredecessors;
- }
- }
-
- if (Chain.LoopPredecessors == 0)
- BlockWorkList.push_back(*Chain.begin());
- }
-
- buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet);
+ buildChain(LoopTop, LoopChain, &LoopBlockSet);
if (RotateLoopWithProfile)
rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
@@ -1100,7 +1366,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
DEBUG({
// Crash at the end so we get all of the debugging output first.
bool BadLoop = false;
- if (LoopChain.LoopPredecessors) {
+ if (LoopChain.UnscheduledPredecessors) {
BadLoop = true;
dbgs() << "Loop chain contains a block without its preds placed!\n"
<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
@@ -1129,13 +1395,42 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
}
assert(!BadLoop && "Detected problems with the placement of this loop.");
});
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+/// When OutlineOptionalBranches is on, this method collects BBs that
+/// dominate all terminator blocks of the function \p F.
+void MachineBlockPlacement::collectMustExecuteBBs() {
+ if (OutlineOptionalBranches) {
+ // Find the nearest common dominator of all of F's terminators.
+ MachineBasicBlock *Terminator = nullptr;
+ for (MachineBasicBlock &MBB : *F) {
+ if (MBB.succ_size() == 0) {
+ if (Terminator == nullptr)
+ Terminator = &MBB;
+ else
+ Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
+ }
+ }
+
+ // MBBs dominating this common dominator are unavoidable.
+ UnavoidableBlocks.clear();
+ for (MachineBasicBlock &MBB : *F) {
+ if (MDT->dominates(&MBB, Terminator)) {
+ UnavoidableBlocks.insert(&MBB);
+ }
+ }
+ }
}
-void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
+void MachineBlockPlacement::buildCFGChains() {
// Ensure that every BB in the function has an associated chain to simplify
// the assumptions of the remaining algorithm.
SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
- for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
+ ++FI) {
MachineBasicBlock *BB = &*FI;
BlockChain *Chain =
new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
@@ -1144,7 +1439,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
for (;;) {
Cond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
- if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
break;
MachineFunction::iterator NextFI = std::next(FI);
@@ -1161,55 +1456,22 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
}
}
- if (OutlineOptionalBranches) {
- // Find the nearest common dominator of all of F's terminators.
- MachineBasicBlock *Terminator = nullptr;
- for (MachineBasicBlock &MBB : F) {
- if (MBB.succ_size() == 0) {
- if (Terminator == nullptr)
- Terminator = &MBB;
- else
- Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
- }
- }
-
- // MBBs dominating this common dominator are unavoidable.
- UnavoidableBlocks.clear();
- for (MachineBasicBlock &MBB : F) {
- if (MDT->dominates(&MBB, Terminator)) {
- UnavoidableBlocks.insert(&MBB);
- }
- }
- }
+  // Only has an effect when the OutlineOptionalBranches option is on.
+ collectMustExecuteBBs();
// Build any loop-based chains.
for (MachineLoop *L : *MLI)
- buildLoopChains(F, *L);
+ buildLoopChains(*L);
- SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+ assert(BlockWorkList.empty());
+ assert(EHPadWorkList.empty());
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
- for (MachineBasicBlock &MBB : F) {
- BlockChain &Chain = *BlockToChain[&MBB];
- if (!UpdatedPreds.insert(&Chain).second)
- continue;
-
- assert(Chain.LoopPredecessors == 0);
- for (MachineBasicBlock *ChainBB : Chain) {
- assert(BlockToChain[ChainBB] == &Chain);
- for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
- if (BlockToChain[Pred] == &Chain)
- continue;
- ++Chain.LoopPredecessors;
- }
- }
-
- if (Chain.LoopPredecessors == 0)
- BlockWorkList.push_back(*Chain.begin());
- }
+ for (MachineBasicBlock &MBB : *F)
+ fillWorkLists(&MBB, UpdatedPreds);
- BlockChain &FunctionChain = *BlockToChain[&F.front()];
- buildChain(&F.front(), FunctionChain, BlockWorkList);
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ buildChain(&F->front(), FunctionChain);
#ifndef NDEBUG
typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
@@ -1218,7 +1480,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
// Crash at the end so we get all of the debugging output first.
bool BadFunc = false;
FunctionBlockSetType FunctionBlockSet;
- for (MachineBasicBlock &MBB : F)
+ for (MachineBasicBlock &MBB : *F)
FunctionBlockSet.insert(&MBB);
for (MachineBasicBlock *ChainBB : FunctionChain)
@@ -1238,13 +1500,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
});
// Splice the blocks into place.
- MachineFunction::iterator InsertPos = F.begin();
+ MachineFunction::iterator InsertPos = F->begin();
+  DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
for (MachineBasicBlock *ChainBB : FunctionChain) {
DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
: " ... ")
<< getBlockName(ChainBB) << "\n");
if (InsertPos != MachineFunction::iterator(ChainBB))
- F.splice(InsertPos, ChainBB);
+ F->splice(InsertPos, ChainBB);
else
++InsertPos;
@@ -1258,69 +1521,90 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
// boiler plate.
Cond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
- if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
- // The "PrevBB" is not yet updated to reflect current code layout, so,
- // o. it may fall-through to a block without explict "goto" instruction
- // before layout, and no longer fall-through it after layout; or
- // o. just opposite.
- //
- // AnalyzeBranch() may return erroneous value for FBB when these two
- // situations take place. For the first scenario FBB is mistakenly set
- // NULL; for the 2nd scenario, the FBB, which is expected to be NULL,
- // is mistakenly pointing to "*BI".
- //
- bool needUpdateBr = true;
- if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
- PrevBB->updateTerminator();
- needUpdateBr = false;
- Cond.clear();
- TBB = FBB = nullptr;
- if (TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
- // FIXME: This should never take place.
- TBB = FBB = nullptr;
- }
- }
+    // The "PrevBB" is not yet updated to reflect the current code layout, so
+    //    o. it may fall through to a block without an explicit "goto"
+    //       instruction before layout and no longer fall through to it after
+    //       layout; or
+    //    o. just the opposite.
+    //
+    // analyzeBranch() may return an erroneous value for FBB when these two
+    // situations take place. For the first scenario FBB is mistakenly set to
+    // NULL; for the 2nd scenario, the FBB, which is expected to be NULL, is
+    // mistakenly pointing to "*BI".
+    // Thus, if a future change needs to use FBB before the layout is settled,
+    // it has to correct FBB first using code similar to the following:
+ //
+ // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
+ // PrevBB->updateTerminator();
+ // Cond.clear();
+ // TBB = FBB = nullptr;
+ // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ // // FIXME: This should never take place.
+ // TBB = FBB = nullptr;
+ // }
+ // }
+ if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
+ PrevBB->updateTerminator();
+ }
+
+ // Fixup the last block.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
+ F->back().updateTerminator();
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+void MachineBlockPlacement::optimizeBranches() {
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+
+ // Now that all the basic blocks in the chain have the proper layout,
+ // make a final call to AnalyzeBranch with AllowModify set.
+ // Indeed, the target may be able to optimize the branches in a way we
+ // cannot because all branches may not be analyzable.
+ // E.g., the target may be able to remove an unconditional branch to
+ // a fallthrough when it occurs after predicated terminators.
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
// If PrevBB has a two-way branch, try to re-order the branches
// such that we branch to the successor with higher probability first.
if (TBB && !Cond.empty() && FBB &&
- MBPI->getEdgeProbability(PrevBB, FBB) >
- MBPI->getEdgeProbability(PrevBB, TBB) &&
+ MBPI->getEdgeProbability(ChainBB, FBB) >
+ MBPI->getEdgeProbability(ChainBB, TBB) &&
!TII->ReverseBranchCondition(Cond)) {
DEBUG(dbgs() << "Reverse order of the two branches: "
- << getBlockName(PrevBB) << "\n");
+ << getBlockName(ChainBB) << "\n");
DEBUG(dbgs() << " Edge probability: "
- << MBPI->getEdgeProbability(PrevBB, FBB) << " vs "
- << MBPI->getEdgeProbability(PrevBB, TBB) << "\n");
+ << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
+ << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
DebugLoc dl; // FIXME: this is nowhere
- TII->RemoveBranch(*PrevBB);
- TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
- needUpdateBr = true;
+ TII->RemoveBranch(*ChainBB);
+ TII->InsertBranch(*ChainBB, FBB, TBB, Cond, dl);
+ ChainBB->updateTerminator();
}
- if (needUpdateBr)
- PrevBB->updateTerminator();
}
}
+}
- // Fixup the last block.
- Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
- if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))
- F.back().updateTerminator();
-
+void MachineBlockPlacement::alignBlocks() {
// Walk through the backedges of the function now that we have fully laid out
// the basic blocks and align the destination of each backedge. We don't rely
// exclusively on the loop info here so that we can align backedges in
// unnatural CFGs and backedges that were introduced purely because of the
// loop rotations done during this layout pass.
- // FIXME: Use Function::optForSize().
- if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
+ if (F->getFunction()->optForSize())
return;
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
if (FunctionChain.begin() == FunctionChain.end())
return; // Empty chain.
const BranchProbability ColdProb(1, 5); // 20%
- BlockFrequency EntryFreq = MBFI->getBlockFreq(&F.front());
+ BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
for (MachineBasicBlock *ChainBB : FunctionChain) {
if (ChainBB == *FunctionChain.begin())
@@ -1334,11 +1618,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
if (!L)
continue;
- if (AlignAllLoops) {
- ChainBB->setAlignment(AlignAllLoops);
- continue;
- }
-
unsigned Align = TLI->getPrefLoopAlignment(L);
if (!Align)
continue; // Don't care about loop alignment.
@@ -1380,31 +1659,67 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
}
}
-bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
- // Check for single-block functions and skip them.
- if (std::next(F.begin()) == F.end())
+bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
return false;
- if (skipOptnoneFunction(*F.getFunction()))
+ // Check for single-block functions and skip them.
+ if (std::next(MF.begin()) == MF.end())
return false;
+ F = &MF;
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
+ getAnalysis<MachineBlockFrequencyInfo>());
MLI = &getAnalysis<MachineLoopInfo>();
- TII = F.getSubtarget().getInstrInfo();
- TLI = F.getSubtarget().getTargetLowering();
+ TII = MF.getSubtarget().getInstrInfo();
+ TLI = MF.getSubtarget().getTargetLowering();
MDT = &getAnalysis<MachineDominatorTree>();
assert(BlockToChain.empty());
- buildCFGChains(F);
+ buildCFGChains();
+
+ // Changing the layout can create new tail merging opportunities.
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // TailMerge can create jump into if branches that make CFG irreducible for
+ // HW that requires structured CFG.
+ bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+ PassConfig->getEnableTailMerge() &&
+ BranchFoldPlacement;
+ // No tail merging opportunities if the block number is less than four.
+ if (MF.size() > 3 && EnableTailMerge) {
+ BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
+ *MBPI);
+
+ if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
+ /*AfterBlockPlacement=*/true)) {
+ // Redo the layout if tail merging creates/removes/moves blocks.
+ BlockToChain.clear();
+ ChainAllocator.DestroyAll();
+ buildCFGChains();
+ }
+ }
+
+ optimizeBranches();
+ alignBlocks();
BlockToChain.clear();
ChainAllocator.DestroyAll();
if (AlignAllBlock)
// Align all of the blocks in the function to a specific alignment.
- for (MachineBasicBlock &MBB : F)
+ for (MachineBasicBlock &MBB : MF)
MBB.setAlignment(AlignAllBlock);
+ else if (AlignAllNonFallThruBlocks) {
+ // Align all of the blocks that have no fall-through predecessors to a
+ // specific alignment.
+ for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
+ auto LayoutPred = std::prev(MBI);
+ if (!LayoutPred->isSuccessor(&*MBI))
+ MBI->setAlignment(AlignAllNonFallThruBlocks);
+ }
+ }
// We always return true as we have no way to track whether the final order
// differs from the original order.
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index cf6d4018cb70..fe7340618374 100644
--- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -24,9 +24,21 @@ INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob",
INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob",
"Machine Branch Probability Analysis", false, true)
+cl::opt<unsigned>
+ StaticLikelyProb("static-likely-prob",
+                     cl::desc("branch probability threshold in percentage "
+                              "to be considered very likely"),
+ cl::init(80), cl::Hidden);
+
+cl::opt<unsigned> ProfileLikelyProb(
+ "profile-likely-prob",
+ cl::desc("branch probability threshold in percentage to be considered"
+ " very likely when profile is available"),
+ cl::init(51), cl::Hidden);
+
char MachineBranchProbabilityInfo::ID = 0;
-void MachineBranchProbabilityInfo::anchor() { }
+void MachineBranchProbabilityInfo::anchor() {}
BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
const MachineBasicBlock *Src,
@@ -42,11 +54,9 @@ BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
std::find(Src->succ_begin(), Src->succ_end(), Dst));
}
-bool
-MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src,
- const MachineBasicBlock *Dst) const {
- // Hot probability is at least 4/5 = 80%
- static BranchProbability HotProb(4, 5);
+bool MachineBranchProbabilityInfo::isEdgeHot(
+ const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+ BranchProbability HotProb(StaticLikelyProb, 100);
return getEdgeProbability(Src, Dst) > HotProb;
}
@@ -63,7 +73,7 @@ MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
}
}
- static BranchProbability HotProb(4, 5);
+ BranchProbability HotProb(StaticLikelyProb, 100);
if (getEdgeProbability(MBB, MaxSucc) >= HotProb)
return MaxSucc;
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index aad376c4702b..1209f73d9601 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -352,6 +352,12 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) {
// This is a trivial form of alias analysis.
return false;
}
+
+  // Ignore stack guard loads, otherwise the register that holds the CSEed
+  // value may be spilled and get loaded back with corrupted data.
+ if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
return true;
}
@@ -383,7 +389,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
// an immediate predecessor. We don't want to increase register pressure and
// end up causing other computation to be spilled.
- if (TII->isAsCheapAsAMove(MI)) {
+ if (TII->isAsCheapAsAMove(*MI)) {
MachineBasicBlock *CSBB = CSMI->getParent();
MachineBasicBlock *BB = MI->getParent();
if (CSBB != BB && !CSBB->isSuccessor(BB))
@@ -472,8 +478,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Commute commutable instructions.
bool Commuted = false;
if (!FoundCSE && MI->isCommutable()) {
- MachineInstr *NewMI = TII->commuteInstruction(MI);
- if (NewMI) {
+ if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
Commuted = true;
FoundCSE = VNT.count(NewMI);
if (NewMI != MI) {
@@ -482,7 +487,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
Changed = true;
} else if (!FoundCSE)
// MI was changed but it didn't help, commute it back!
- (void)TII->commuteInstruction(MI);
+ (void)TII->commuteInstruction(*MI);
}
}
@@ -698,7 +703,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
}
bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
TII = MF.getSubtarget().getInstrInfo();
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index fa43c4dfa05a..6b5c6ba82506 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -13,8 +13,8 @@
#define DEBUG_TYPE "machine-combiner"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -41,6 +40,7 @@ class MachineCombiner : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
MachineRegisterInfo *MRI;
+ MachineLoopInfo *MLI; // Current MachineLoopInfo
MachineTraceMetrics *Traces;
MachineTraceMetrics::Ensemble *MinInstr;
@@ -87,6 +87,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID;
INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
"Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
false, false)
@@ -94,6 +95,7 @@ INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<MachineTraceMetrics>();
AU.addPreserved<MachineTraceMetrics>();
@@ -156,7 +158,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
} else {
MachineInstr *DefInstr = getOperandDef(MO);
if (DefInstr) {
- DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
+ DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
LatencyOp = TSchedModel.computeOperandLatency(
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
@@ -198,7 +200,7 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
RI++;
MachineInstr *UseMO = RI->getParent();
unsigned LatencyOp = 0;
- if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
+ if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
LatencyOp = TSchedModel.computeOperandLatency(
NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
UseMO->findRegisterUseOperandIdx(MO.getReg()));
@@ -250,7 +252,7 @@ bool MachineCombiner::improvesCriticalPathLen(
// Get depth and latency of NewRoot and Root.
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
- unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
+ unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
@@ -269,7 +271,7 @@ bool MachineCombiner::improvesCriticalPathLen(
// even if the instruction depths (data dependency cycles) become worse.
unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
- unsigned RootSlack = BlockTrace.getInstrSlack(Root);
+ unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
dbgs() << " RootLatency: " << RootLatency << "\n";
@@ -281,7 +283,7 @@ bool MachineCombiner::improvesCriticalPathLen(
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
-
+
return NewCycleCount <= OldCycleCount;
}
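
The acceptance test above compares plain cycle counts; the numbers in the sketch below are invented (loosely modeled on fusing a multiply and an add) and only show how slack lets a slightly deeper new root still pass.

// Invented cycle counts illustrating improvesCriticalPathLen(): the combined
// instruction is accepted when NewRootDepth + NewRootLatency does not exceed
// RootDepth + RootLatency + RootSlack.
#include <iostream>

int main() {
  unsigned NewRootDepth = 4, NewRootLatency = 3;           // hypothetical fused op
  unsigned RootDepth = 4, RootLatency = 2, RootSlack = 1;  // original sequence
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;          // 7
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;    // 7
  std::cout << "substitution allowed: "
            << (NewCycleCount <= OldCycleCount ? "yes" : "no") << "\n";
}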
@@ -355,6 +357,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
auto BlockIter = MBB->begin();
+ // Check if the block is in a loop.
+ const MachineLoop *ML = MLI->getLoopFor(MBB);
while (BlockIter != MBB->end()) {
auto &MI = *BlockIter++;
@@ -407,11 +411,15 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
if (!NewInstCount)
continue;
+ bool SubstituteAlways = false;
+ if (ML && TII->isThroughputPattern(P))
+ SubstituteAlways = true;
+
// Substitute when we optimize for codesize and the new sequence has
// fewer instructions OR
// the new sequence neither lengthens the critical path nor increases
// resource pressure.
- if (doSubstitute(NewInstCount, OldInstCount) ||
+ if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
InstrIdxForVirtReg, P) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
@@ -448,6 +456,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
SchedModel = STI.getSchedModel();
TSchedModel.init(SchedModel, &STI, TII);
MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
OptSize = MF.getFunction()->optForSize();
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index a6863412132b..8fdf39d54bd0 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -33,27 +32,47 @@ using namespace llvm;
STATISTIC(NumDeletes, "Number of dead copies deleted");
namespace {
+ typedef SmallVector<unsigned, 4> RegList;
+ typedef DenseMap<unsigned, RegList> SourceMap;
+ typedef DenseMap<unsigned, MachineInstr*> Reg2MIMap;
+
class MachineCopyPropagation : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
- MachineRegisterInfo *MRI;
+ const MachineRegisterInfo *MRI;
public:
static char ID; // Pass identification, replacement for typeid
MachineCopyPropagation() : MachineFunctionPass(ID) {
- initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
+ initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &MF) override;
- private:
- typedef SmallVector<unsigned, 4> DestList;
- typedef DenseMap<unsigned, DestList> SourceMap;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
- void SourceNoLongerAvailable(unsigned Reg,
- SourceMap &SrcMap,
- DenseMap<unsigned, MachineInstr*> &AvailCopyMap);
- bool CopyPropagateBlock(MachineBasicBlock &MBB);
+ private:
+ void ClobberRegister(unsigned Reg);
+ void CopyPropagateBlock(MachineBasicBlock &MBB);
+ bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
+
+ /// Candidates for deletion.
+ SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
+ /// Def -> available copies map.
+ Reg2MIMap AvailCopyMap;
+ /// Def -> copies map.
+ Reg2MIMap CopyMap;
+ /// Src -> Def map
+ SourceMap SrcMap;
+ bool Changed;
};
}
char MachineCopyPropagation::ID = 0;
@@ -62,79 +81,105 @@ char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
"Machine Copy Propagation Pass", false, false)
-void
-MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg,
- SourceMap &SrcMap,
- DenseMap<unsigned, MachineInstr*> &AvailCopyMap) {
+/// Remove any entry in \p Map where the register is a subregister or equal to
+/// a register contained in \p Regs.
+static void removeRegsFromMap(Reg2MIMap &Map, const RegList &Regs,
+ const TargetRegisterInfo &TRI) {
+ for (unsigned Reg : Regs) {
+ // Source of copy is no longer available for propagation.
+ for (MCSubRegIterator SR(Reg, &TRI, true); SR.isValid(); ++SR)
+ Map.erase(*SR);
+ }
+}
+
+/// Remove any entry in \p Map that is marked clobbered in \p RegMask.
+/// The map will typically have a lot fewer entries than the regmask clobbers,
+/// so this is more efficient than iterating the clobbered registers and calling
+/// ClobberRegister() on them.
+static void removeClobberedRegsFromMap(Reg2MIMap &Map,
+ const MachineOperand &RegMask) {
+ for (Reg2MIMap::iterator I = Map.begin(), E = Map.end(), Next; I != E;
+ I = Next) {
+ Next = std::next(I);
+ unsigned Reg = I->first;
+ if (RegMask.clobbersPhysReg(Reg))
+ Map.erase(I);
+ }
+}
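
The loop in removeClobberedRegsFromMap relies on saving the next iterator before erasing from a DenseMap; the snippet below is a generic, non-LLVM illustration of the same erase-while-iterating pattern, with arbitrary map contents standing in for the copy map.

// Generic version of the pattern above: remove every map entry whose key is
// flagged as clobbered. std::map::erase returns the next valid iterator,
// which plays the role of the Next variable in the DenseMap loop.
#include <iostream>
#include <map>
#include <set>
#include <string>

int main() {
  std::map<unsigned, std::string> CopyOf = {
      {1, "copy of r1"}, {2, "copy of r2"}, {5, "copy of r5"}};
  std::set<unsigned> Clobbered = {2, 5}; // stand-in for the regmask query

  for (auto I = CopyOf.begin(), E = CopyOf.end(); I != E;) {
    if (Clobbered.count(I->first))
      I = CopyOf.erase(I); // advance past the erased entry
    else
      ++I;
  }

  for (const auto &KV : CopyOf)
    std::cout << KV.first << " -> " << KV.second << "\n"; // only r1 survives
}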
+
+void MachineCopyPropagation::ClobberRegister(unsigned Reg) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ CopyMap.erase(*AI);
+ AvailCopyMap.erase(*AI);
+
SourceMap::iterator SI = SrcMap.find(*AI);
if (SI != SrcMap.end()) {
- const DestList& Defs = SI->second;
- for (DestList::const_iterator I = Defs.begin(), E = Defs.end();
- I != E; ++I) {
- unsigned MappedDef = *I;
- // Source of copy is no longer available for propagation.
- AvailCopyMap.erase(MappedDef);
- for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR)
- AvailCopyMap.erase(*SR);
- }
+ removeRegsFromMap(AvailCopyMap, SI->second, *TRI);
+ SrcMap.erase(SI);
}
}
}
-static bool NoInterveningSideEffect(const MachineInstr *CopyMI,
- const MachineInstr *MI) {
- const MachineBasicBlock *MBB = CopyMI->getParent();
- if (MI->getParent() != MBB)
- return false;
- MachineBasicBlock::const_iterator I = CopyMI;
- MachineBasicBlock::const_iterator E = MBB->end();
- MachineBasicBlock::const_iterator E2 = MI;
-
- ++I;
- while (I != E && I != E2) {
- if (I->hasUnmodeledSideEffects() || I->isCall() ||
- I->isTerminator())
- return false;
- ++I;
+/// Return true if \p PreviousCopy did copy register \p Src to register \p Def.
+/// This fact may have been obscured by sub-register usage, or may not be true
+/// at all even though Src and Def are sub-registers of the registers used in
+/// PreviousCopy. e.g.
+/// isNopCopy("ecx = COPY eax", AX, CX) == true
+/// isNopCopy("ecx = COPY eax", AH, CL) == false
+static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
+ unsigned Def, const TargetRegisterInfo *TRI) {
+ unsigned PreviousSrc = PreviousCopy.getOperand(1).getReg();
+ unsigned PreviousDef = PreviousCopy.getOperand(0).getReg();
+ if (Src == PreviousSrc) {
+ assert(Def == PreviousDef);
+ return true;
}
- return true;
+ if (!TRI->isSubRegister(PreviousSrc, Src))
+ return false;
+ unsigned SubIdx = TRI->getSubRegIndex(PreviousSrc, Src);
+ return SubIdx == TRI->getSubRegIndex(PreviousDef, Def);
}
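
The sub-register check above can be pictured with a toy register model; the sketch below uses x86 names only as labels, models each sub-register as a (parent, offset, width) slice, and is an illustration of the idea rather than the real TargetRegisterInfo query.

// Toy illustration of isNopCopy(): a previous copy "ECX = COPY EAX" also
// copied AX into CX (same slice of both registers), but it did not copy AH
// into CL (different slices), matching the two examples in the doc comment.
#include <iostream>
#include <map>
#include <string>

struct SubReg { std::string Parent; unsigned Offset, Width; };

int main() {
  std::map<std::string, SubReg> Regs = {
      {"AX", {"EAX", 0, 16}}, {"AH", {"EAX", 8, 8}},
      {"CX", {"ECX", 0, 16}}, {"CL", {"ECX", 0, 8}}};

  // Did the previous copy ECX = COPY EAX also copy Src into Def? True iff Src
  // and Def denote the same slice of EAX and ECX respectively.
  auto copiedSrcIntoDef = [&](const std::string &Src, const std::string &Def) {
    const SubReg &S = Regs.at(Src), &D = Regs.at(Def);
    return S.Parent == "EAX" && D.Parent == "ECX" &&
           S.Offset == D.Offset && S.Width == D.Width;
  };

  std::cout << "AX -> CX covered: " << copiedSrcIntoDef("AX", "CX") << "\n"; // 1
  std::cout << "AH -> CL covered: " << copiedSrcIntoDef("AH", "CL") << "\n"; // 0
}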
-/// isNopCopy - Return true if the specified copy is really a nop. That is
-/// if the source of the copy is the same of the definition of the copy that
-/// supplied the source. If the source of the copy is a sub-register than it
-/// must check the sub-indices match. e.g.
-/// ecx = mov eax
-/// al = mov cl
-/// But not
-/// ecx = mov eax
-/// al = mov ch
-static bool isNopCopy(MachineInstr *CopyMI, unsigned Def, unsigned Src,
- const TargetRegisterInfo *TRI) {
- unsigned SrcSrc = CopyMI->getOperand(1).getReg();
- if (Def == SrcSrc)
- return true;
- if (TRI->isSubRegister(SrcSrc, Def)) {
- unsigned SrcDef = CopyMI->getOperand(0).getReg();
- unsigned SubIdx = TRI->getSubRegIndex(SrcSrc, Def);
- if (!SubIdx)
- return false;
- return SubIdx == TRI->getSubRegIndex(SrcDef, Src);
- }
+/// Remove instruction \p Copy if there exists a previous copy that copies the
+/// register \p Src to the register \p Def; this may happen indirectly through
+/// a copy of their super-registers.
+bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
+ unsigned Def) {
+  // Avoid eliminating a copy from/to a reserved register as we cannot predict
+ // the value (Example: The sparc zero register is writable but stays zero).
+ if (MRI->isReserved(Src) || MRI->isReserved(Def))
+ return false;
- return false;
-}
+ // Search for an existing copy.
+ Reg2MIMap::iterator CI = AvailCopyMap.find(Def);
+ if (CI == AvailCopyMap.end())
+ return false;
+
+ // Check that the existing copy uses the correct sub registers.
+ MachineInstr &PrevCopy = *CI->second;
+ if (!isNopCopy(PrevCopy, Src, Def, TRI))
+ return false;
-bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
- SmallSetVector<MachineInstr*, 8> MaybeDeadCopies; // Candidates for deletion
- DenseMap<unsigned, MachineInstr*> AvailCopyMap; // Def -> available copies map
- DenseMap<unsigned, MachineInstr*> CopyMap; // Def -> copies map
- SourceMap SrcMap; // Src -> Def map
+ DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
+ // Copy was redundantly redefining either Src or Def. Remove earlier kill
+ // flags between Copy and PrevCopy because the value will be reused now.
+ assert(Copy.isCopy());
+ unsigned CopyDef = Copy.getOperand(0).getReg();
+ assert(CopyDef == Src || CopyDef == Def);
+ for (MachineInstr &MI :
+ make_range(PrevCopy.getIterator(), Copy.getIterator()))
+ MI.clearRegisterKills(CopyDef, TRI);
+
+ Copy.eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ return true;
+}
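
To show the intent of eraseIfRedundant in isolation, here is a toy model over a straight-line list of copies; registers are plain ids, clobbers, sub-registers and kill flags are deliberately ignored, and the copy sequence is invented.

// A copy "Def = COPY Src" can be dropped when an identical copy (or the
// reversed one, which makes the value round-trip) is still available.
#include <iostream>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Hypothetical straight-line sequence of copies: (Def, Src).
  std::vector<std::pair<unsigned, unsigned>> Copies = {
      {2, 1}, // r2 = COPY r1
      {3, 4}, // r3 = COPY r4
      {2, 1}, // r2 = COPY r1  -> redundant, previous copy still available
      {1, 2}, // r1 = COPY r2  -> a nop, r2 already holds the value of r1
  };

  std::map<unsigned, unsigned> AvailCopy; // Def -> Src of a still-live copy
  std::vector<std::pair<unsigned, unsigned>> Kept;

  for (auto [Def, Src] : Copies) {
    auto I = AvailCopy.find(Def);
    bool RedundantSameDir = I != AvailCopy.end() && I->second == Src;
    auto J = AvailCopy.find(Src);
    bool RedundantReversed = J != AvailCopy.end() && J->second == Def;
    if (RedundantSameDir || RedundantReversed)
      continue; // drop the copy, like eraseIfRedundant() does
    AvailCopy[Def] = Src;
    Kept.push_back({Def, Src});
  }

  for (auto [Def, Src] : Kept)
    std::cout << "r" << Def << " = COPY r" << Src << "\n"; // keeps the first two
}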
+
+void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
- bool Changed = false;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
@@ -143,48 +188,32 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
unsigned Def = MI->getOperand(0).getReg();
unsigned Src = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Def) ||
- TargetRegisterInfo::isVirtualRegister(Src))
- report_fatal_error("MachineCopyPropagation should be run after"
- " register allocation!");
-
- DenseMap<unsigned, MachineInstr*>::iterator CI = AvailCopyMap.find(Src);
- if (CI != AvailCopyMap.end()) {
- MachineInstr *CopyMI = CI->second;
- if (!MRI->isReserved(Def) &&
- (!MRI->isReserved(Src) || NoInterveningSideEffect(CopyMI, MI)) &&
- isNopCopy(CopyMI, Def, Src, TRI)) {
- // The two copies cancel out and the source of the first copy
- // hasn't been overridden, eliminate the second one. e.g.
- // %ECX<def> = COPY %EAX<kill>
- // ... nothing clobbered EAX.
- // %EAX<def> = COPY %ECX
- // =>
- // %ECX<def> = COPY %EAX
- //
- // Also avoid eliminating a copy from reserved registers unless the
- // definition is proven not clobbered. e.g.
- // %RSP<def> = COPY %RAX
- // CALL
- // %RAX<def> = COPY %RSP
-
- DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; MI->dump());
-
- // Clear any kills of Def between CopyMI and MI. This extends the
- // live range.
- for (MachineBasicBlock::iterator I = CopyMI, E = MI; I != E; ++I)
- I->clearRegisterKills(Def, TRI);
-
- MI->eraseFromParent();
- Changed = true;
- ++NumDeletes;
- continue;
- }
- }
+ assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
+ !TargetRegisterInfo::isVirtualRegister(Src) &&
+ "MachineCopyPropagation should be run after register allocation!");
+
+ // The two copies cancel out and the source of the first copy
+ // hasn't been overridden, eliminate the second one. e.g.
+ // %ECX<def> = COPY %EAX
+ // ... nothing clobbered EAX.
+ // %EAX<def> = COPY %ECX
+ // =>
+ // %ECX<def> = COPY %EAX
+ //
+ // or
+ //
+ // %ECX<def> = COPY %EAX
+ // ... nothing clobbered EAX.
+ // %ECX<def> = COPY %EAX
+ // =>
+ // %ECX<def> = COPY %EAX
+ if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
+ continue;
- // If Src is defined by a previous copy, it cannot be eliminated.
+ // If Src is defined by a previous copy, the previous copy cannot be
+ // eliminated.
for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
- CI = CopyMap.find(*AI);
+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
if (CI != CopyMap.end()) {
DEBUG(dbgs() << "MCP: Copy is no longer dead: "; CI->second->dump());
MaybeDeadCopies.remove(CI->second);
@@ -194,23 +223,19 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
// Copy is now a candidate for deletion.
- MaybeDeadCopies.insert(MI);
+ if (!MRI->isReserved(Def))
+ MaybeDeadCopies.insert(MI);
- // If 'Src' is previously source of another copy, then this earlier copy's
+ // If 'Def' is previously source of another copy, then this earlier copy's
// source is no longer available. e.g.
// %xmm9<def> = copy %xmm2
// ...
// %xmm2<def> = copy %xmm0
// ...
// %xmm2<def> = copy %xmm9
- SourceNoLongerAvailable(Def, SrcMap, AvailCopyMap);
+ ClobberRegister(Def);
// Remember Def is defined by the copy.
- // ... Make sure to clear the def maps of aliases first.
- for (MCRegAliasIterator AI(Def, TRI, false); AI.isValid(); ++AI) {
- CopyMap.erase(*AI);
- AvailCopyMap.erase(*AI);
- }
for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
++SR) {
CopyMap[*SR] = MI;
@@ -219,30 +244,27 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// Remember source that's copied to Def. Once it's clobbered, then
// it's no longer available for copy propagation.
- if (std::find(SrcMap[Src].begin(), SrcMap[Src].end(), Def) ==
- SrcMap[Src].end()) {
- SrcMap[Src].push_back(Def);
- }
+ RegList &DestList = SrcMap[Src];
+ if (std::find(DestList.begin(), DestList.end(), Def) == DestList.end())
+ DestList.push_back(Def);
continue;
}
// Not a copy.
SmallVector<unsigned, 2> Defs;
- int RegMaskOpNum = -1;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ const MachineOperand *RegMask = nullptr;
+ for (const MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
- RegMaskOpNum = i;
+ RegMask = &MO;
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
- report_fatal_error("MachineCopyPropagation should be run after"
- " register allocation!");
+ assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "MachineCopyPropagation should be run after register allocation!");
if (MO.isDef()) {
Defs.push_back(Reg);
@@ -252,7 +274,7 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// If 'Reg' is defined by a copy, the copy is no longer a candidate
// for elimination.
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(*AI);
+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
if (CI != CopyMap.end()) {
DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
MaybeDeadCopies.remove(CI->second);
@@ -269,78 +291,81 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
}
// The instruction has a register mask operand which means that it clobbers
- // a large set of registers. It is possible to use the register mask to
- // prune the available copies, but treat it like a basic block boundary for
- // now.
- if (RegMaskOpNum >= 0) {
+ // a large set of registers. Treat clobbered registers the same way as
+ // defined registers.
+ if (RegMask) {
// Erase any MaybeDeadCopies whose destination register is clobbered.
- const MachineOperand &MaskMO = MI->getOperand(RegMaskOpNum);
- for (SmallSetVector<MachineInstr*, 8>::iterator
- DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
- DI != DE; ++DI) {
- unsigned Reg = (*DI)->getOperand(0).getReg();
- if (MRI->isReserved(Reg) || !MaskMO.clobbersPhysReg(Reg))
+ for (SmallSetVector<MachineInstr *, 8>::iterator DI =
+ MaybeDeadCopies.begin();
+ DI != MaybeDeadCopies.end();) {
+ MachineInstr *MaybeDead = *DI;
+ unsigned Reg = MaybeDead->getOperand(0).getReg();
+ assert(!MRI->isReserved(Reg));
+
+ if (!RegMask->clobbersPhysReg(Reg)) {
+ ++DI;
continue;
+ }
+
DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
- (*DI)->dump());
- (*DI)->eraseFromParent();
+ MaybeDead->dump());
+
+ // erase() will return the next valid iterator pointing to the next
+ // element after the erased one.
+ DI = MaybeDeadCopies.erase(DI);
+ MaybeDead->eraseFromParent();
Changed = true;
++NumDeletes;
}
- // Clear all data structures as if we were beginning a new basic block.
- MaybeDeadCopies.clear();
- AvailCopyMap.clear();
- CopyMap.clear();
- SrcMap.clear();
- continue;
- }
-
- for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
- unsigned Reg = Defs[i];
-
- // No longer defined by a copy.
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- CopyMap.erase(*AI);
- AvailCopyMap.erase(*AI);
+ removeClobberedRegsFromMap(AvailCopyMap, *RegMask);
+ removeClobberedRegsFromMap(CopyMap, *RegMask);
+ for (SourceMap::iterator I = SrcMap.begin(), E = SrcMap.end(), Next;
+ I != E; I = Next) {
+ Next = std::next(I);
+ if (RegMask->clobbersPhysReg(I->first)) {
+ removeRegsFromMap(AvailCopyMap, I->second, *TRI);
+ SrcMap.erase(I);
+ }
}
-
- // If 'Reg' is previously source of a copy, it is no longer available for
- // copy propagation.
- SourceNoLongerAvailable(Reg, SrcMap, AvailCopyMap);
}
+
+    // Any previous copy that defined or read one of the Defs is no longer available.
+ for (unsigned Reg : Defs)
+ ClobberRegister(Reg);
}
// If MBB doesn't have successors, delete the copies whose defs are not used.
 // If MBB does have successors, then conservatively assume the defs are live-out
// since we don't want to trust live-in lists.
if (MBB.succ_empty()) {
- for (SmallSetVector<MachineInstr*, 8>::iterator
- DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
- DI != DE; ++DI) {
- if (!MRI->isReserved((*DI)->getOperand(0).getReg())) {
- (*DI)->eraseFromParent();
- Changed = true;
- ++NumDeletes;
- }
+ for (MachineInstr *MaybeDead : MaybeDeadCopies) {
+ assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
+ MaybeDead->eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
}
}
- return Changed;
+ MaybeDeadCopies.clear();
+ AvailCopyMap.clear();
+ CopyMap.clear();
+ SrcMap.clear();
}
bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
- bool Changed = false;
+ Changed = false;
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- Changed |= CopyPropagateBlock(*I);
+ for (MachineBasicBlock &MBB : MF)
+ CopyPropagateBlock(MBB);
return Changed;
}
+
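The NOP-copy elimination implemented by the MachineCopyPropagation hunks above is easiest to follow on a toy model. The sketch below is illustrative only, not the LLVM pass: registers are plain integers, an availability map records the source of the last unclobbered copy into each destination, and a copy whose destination and source are already equated by an available copy (in either direction) is dropped.

// toy_copy_prop.cpp -- a minimal sketch of NOP-copy elimination, not the pass.
#include <cstdio>
#include <map>
#include <vector>

// Toy instruction: either a register copy (Def = COPY Src) or a clobber of Def.
struct Inst {
  bool IsCopy;
  unsigned Def;
  unsigned Src; // unused for clobbers
};

int main() {
  // r1 = COPY r0 ; clobber r2 ; r0 = COPY r1  -> the last copy is a NOP.
  std::vector<Inst> Block = {{true, 1, 0}, {false, 2, 0}, {true, 0, 1}};

  std::map<unsigned, unsigned> Avail; // Def -> Src of a still-valid copy
  auto Clobber = [&](unsigned Reg) {
    // A write to Reg invalidates copies defining Reg and copies reading Reg.
    for (auto It = Avail.begin(); It != Avail.end();) {
      if (It->first == Reg || It->second == Reg)
        It = Avail.erase(It);
      else
        ++It;
    }
  };

  unsigned Kept = 0;
  for (const Inst &I : Block) {
    if (I.IsCopy) {
      // Redundant if an available copy already equates Def and Src, either as
      // the reverse copy (Src = COPY Def) or as the very same copy repeated.
      auto Rev = Avail.find(I.Src);
      auto Same = Avail.find(I.Def);
      if ((Rev != Avail.end() && Rev->second == I.Def) ||
          (Same != Avail.end() && Same->second == I.Src)) {
        std::printf("dropping NOP copy r%u = COPY r%u\n", I.Def, I.Src);
        continue;
      }
      Clobber(I.Def); // Def gets a new value; older copies involving it die.
      Avail[I.Def] = I.Src;
    } else {
      Clobber(I.Def);
    }
    ++Kept;
  }
  std::printf("kept %u of %zu instructions\n", Kept, Block.size());
  return 0;
}

Built with any C++11 compiler, this keeps two of the three toy instructions; the real pass additionally respects reserved registers, sub-register matching, and regmask clobbers, as the hunks above show.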
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 3f04bb0b532b..303a6a9263be 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -15,9 +15,20 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+// Always verify dominfo if expensive checking is enabled.
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyMachineDomInfo = true;
+#else
+static bool VerifyMachineDomInfo = false;
+#endif
+static cl::opt<bool, true> VerifyMachineDomInfoX(
+ "verify-machine-dom-info", cl::location(VerifyMachineDomInfo),
+ cl::desc("Verify machine dominator info (time consuming)"));
+
namespace llvm {
template class DomTreeNodeBase<MachineBasicBlock>;
template class DominatorTreeBase<MachineBasicBlock>;
@@ -57,6 +68,11 @@ void MachineDominatorTree::releaseMemory() {
DT->releaseMemory();
}
+void MachineDominatorTree::verifyAnalysis() const {
+ if (VerifyMachineDomInfo)
+ verifyDomTree();
+}
+
void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
DT->print(OS);
}
@@ -125,3 +141,17 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
NewBBs.clear();
CriticalEdgesToSplit.clear();
}
+
+void MachineDominatorTree::verifyDomTree() const {
+ MachineFunction &F = *getRoot()->getParent();
+
+ MachineDominatorTree OtherDT;
+ OtherDT.DT->recalculate(F);
+ if (compare(OtherDT)) {
+ errs() << "MachineDominatorTree is not up to date!\nComputed:\n";
+ print(errs(), nullptr);
+ errs() << "\nActual:\n";
+ OtherDT.print(errs(), nullptr);
+ abort();
+ }
+}
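The verify-machine-dom-info flag added above uses the external-storage form of LLVM's cl::opt: the compile-time default chosen under EXPENSIVE_CHECKS and any command-line override share one plain bool via cl::location. A standalone sketch of that pattern follows; the option and symbol names (verify-widgets, VerifyWidgets) are invented for illustration, and the snippet assumes the LLVM Support library is available to build against.

// cl_location_sketch.cpp -- hypothetical tool showing the cl::location pattern.
#include "llvm/Support/CommandLine.h"

using namespace llvm;

// Default to verification only when expensive checks are compiled in.
#ifdef EXPENSIVE_CHECKS
static bool VerifyWidgets = true;
#else
static bool VerifyWidgets = false;
#endif

// cl::opt<bool, true> stores its value in the external bool above, so code
// that never sees the option object can still test the flag directly.
static cl::opt<bool, true>
    VerifyWidgetsX("verify-widgets", cl::location(VerifyWidgets),
                   cl::desc("Verify widget info (time consuming)"));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // A real tool would run its passes here and call the verifier when asked.
  return VerifyWidgets ? 0 : 1;
}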
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index f6604f38722a..a7c63ef4c852 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -54,6 +54,30 @@ static cl::opt<unsigned>
void MachineFunctionInitializer::anchor() {}
+void MachineFunctionProperties::print(raw_ostream &ROS, bool OnlySet) const {
+ // Leave this function even in NDEBUG as an out-of-line anchor.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ for (BitVector::size_type i = 0; i < Properties.size(); ++i) {
+ bool HasProperty = Properties[i];
+ if (OnlySet && !HasProperty)
+ continue;
+ switch(static_cast<Property>(i)) {
+ case Property::IsSSA:
+ ROS << (HasProperty ? "SSA, " : "Post SSA, ");
+ break;
+ case Property::TracksLiveness:
+ ROS << (HasProperty ? "" : "not ") << "tracking liveness, ";
+ break;
+ case Property::AllVRegsAllocated:
+ ROS << (HasProperty ? "AllVRegsAllocated" : "HasVRegs");
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
+
//===----------------------------------------------------------------------===//
// MachineFunction implementation
//===----------------------------------------------------------------------===//
@@ -65,20 +89,34 @@ void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
MBB->getParent()->DeleteMachineBasicBlock(MBB);
}
+static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
+ const Function *Fn) {
+ if (Fn->hasFnAttribute(Attribute::StackAlignment))
+ return Fn->getFnStackAlignment();
+ return STI->getFrameLowering()->getStackAlignment();
+}
+
MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
unsigned FunctionNum, MachineModuleInfo &mmi)
: Fn(F), Target(TM), STI(TM.getSubtargetImpl(*F)), Ctx(mmi.getContext()),
MMI(mmi) {
+ // Assume the function starts in SSA form with correct liveness.
+ Properties.set(MachineFunctionProperties::Property::IsSSA);
+ Properties.set(MachineFunctionProperties::Property::TracksLiveness);
if (STI->getRegisterInfo())
RegInfo = new (Allocator) MachineRegisterInfo(this);
else
RegInfo = nullptr;
MFInfo = nullptr;
- FrameInfo = new (Allocator)
- MachineFrameInfo(STI->getFrameLowering()->getStackAlignment(),
- STI->getFrameLowering()->isStackRealignable(),
- !F->hasFnAttribute("no-realign-stack"));
+ // We can realign the stack if the target supports it and the user hasn't
+ // explicitly asked us not to.
+ bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
+ !F->hasFnAttribute("no-realign-stack");
+ FrameInfo = new (Allocator) MachineFrameInfo(
+ getFnStackAlignment(STI, Fn), /*StackRealignable=*/CanRealignSP,
+ /*ForceRealign=*/CanRealignSP &&
+ F->hasFnAttribute(Attribute::StackAlignment));
if (Fn->hasFnAttribute(Attribute::StackAlignment))
FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());
@@ -209,9 +247,9 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
}
/// Allocate a new MachineInstr. Use this instead of `new MachineInstr'.
-MachineInstr *
-MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
- DebugLoc DL, bool NoImp) {
+MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
+ const DebugLoc &DL,
+ bool NoImp) {
return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
MachineInstr(*this, MCID, DL, NoImp);
}
@@ -256,13 +294,11 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
BasicBlockRecycler.Deallocate(Allocator, MBB);
}
-MachineMemOperand *
-MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f,
- uint64_t s, unsigned base_alignment,
- const AAMDNodes &AAInfo,
- const MDNode *Ranges) {
- return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment,
- AAInfo, Ranges);
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
+ unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges);
}
MachineMemOperand *
@@ -358,7 +394,7 @@ const char *MachineFunction::createExternalSymbolName(StringRef Name) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineFunction::dump() const {
+LLVM_DUMP_METHOD void MachineFunction::dump() const {
print(dbgs());
}
#endif
@@ -368,14 +404,11 @@ StringRef MachineFunction::getName() const {
return getFunction()->getName();
}
-void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
+void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
OS << "# Machine code for function " << getName() << ": ";
- if (RegInfo) {
- OS << (RegInfo->isSSA() ? "SSA" : "Post SSA");
- if (!RegInfo->tracksLiveness())
- OS << ", not tracking liveness";
- }
- OS << '\n';
+ OS << "Properties: <";
+ getProperties().print(OS);
+ OS << ">\n";
// Print Frame Information
FrameInfo->print(*this, OS);
@@ -523,7 +556,7 @@ MCSymbol *MachineFunction::getPICBaseSymbol() const {
/// Make sure the function is at least Align bytes aligned.
void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
- if (!StackRealignable || !RealignOption)
+ if (!StackRealignable)
assert(Align <= StackAlignment &&
"For targets without stack realignment, Align is out of limit!");
if (MaxAlignment < Align) MaxAlignment = Align;
@@ -545,8 +578,7 @@ static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
bool isSS, const AllocaInst *Alloca) {
assert(Size != 0 && "Cannot allocate zero size stack objects!");
- Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
- Alignment, StackAlignment);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca,
!isSS));
int Index = (int)Objects.size() - NumFixedObjects - 1;
@@ -559,8 +591,7 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
/// returning a nonnegative identifier to represent it.
int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
unsigned Alignment) {
- Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
- Alignment, StackAlignment);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
CreateStackObject(Size, Alignment, true);
int Index = (int)Objects.size() - NumFixedObjects - 1;
ensureMaxAlignment(Alignment);
@@ -573,8 +604,7 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
const AllocaInst *Alloca) {
HasVarSizedObjects = true;
- Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
- Alignment, StackAlignment);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
ensureMaxAlignment(Alignment);
return (int)Objects.size()-NumFixedObjects-1;
@@ -590,10 +620,11 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
// The alignment of the frame index can be determined from its offset from
// the incoming frame position. If the frame object is at offset 32 and
// the stack is guaranteed to be 16-byte aligned, then we know that the
- // object is 16-byte aligned.
- unsigned Align = MinAlign(SPOffset, StackAlignment);
- Align = clampStackAlignment(!StackRealignable || !RealignOption, Align,
- StackAlignment);
+ // object is 16-byte aligned. Note that unlike the non-fixed case, if the
+ // stack needs realignment, we can't assume that the stack will in fact be
+ // aligned.
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
/*isSS*/ false,
/*Alloca*/ nullptr, isAliased));
@@ -604,9 +635,8 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
/// Returns an index with a negative value.
int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
int64_t SPOffset) {
- unsigned Align = MinAlign(SPOffset, StackAlignment);
- Align = clampStackAlignment(!StackRealignable || !RealignOption, Align,
- StackAlignment);
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset,
/*Immutable*/ true,
/*isSS*/ true,
@@ -819,7 +849,7 @@ void MachineJumpTableInfo::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineJumpTableInfo::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void MachineJumpTableInfo::dump() const { print(dbgs()); }
#endif
@@ -852,6 +882,8 @@ MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
return SectionKind::getMergeableConst8();
case 16:
return SectionKind::getMergeableConst16();
+ case 32:
+ return SectionKind::getMergeableConst32();
default:
return SectionKind::getReadOnly();
}
@@ -895,17 +927,17 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
// the constant folding APIs to do this so that we get the benefit of
// DataLayout.
if (isa<PointerType>(A->getType()))
- A = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
- const_cast<Constant *>(A), DL);
+ A = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(A), IntTy, DL);
else if (A->getType() != IntTy)
- A = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
- const_cast<Constant *>(A), DL);
+ A = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(A),
+ IntTy, DL);
if (isa<PointerType>(B->getType()))
- B = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
- const_cast<Constant *>(B), DL);
+ B = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(B), IntTy, DL);
else if (B->getType() != IntTy)
- B = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
- const_cast<Constant *>(B), DL);
+ B = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(B),
+ IntTy, DL);
return A == B;
}
@@ -966,5 +998,5 @@ void MachineConstantPool::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineConstantPool::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void MachineConstantPool::dump() const { print(dbgs()); }
#endif
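The CreateFixedObject/CreateFixedSpillStackObject hunks above rest on one piece of arithmetic: the alignment that can be assumed for a fixed object is the largest power of two dividing both its SP offset and the guaranteed stack alignment, and a force-realigned frame cannot trust the incoming SP, so that second operand collapses to 1. A self-contained sketch with toy values (not MachineFrameInfo itself):

// min_align_sketch.cpp -- the alignment arithmetic behind fixed stack objects.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Largest power of two dividing both A and B (same idea as llvm::MinAlign).
static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  // Offset 32 on a 16-byte aligned stack: the object is 16-byte aligned.
  assert(minAlign(32, 16) == 16);
  // Offset 4 on the same stack: only 4-byte alignment can be assumed.
  assert(minAlign(4, 16) == 4);
  // ForcedRealign: the incoming SP alignment is unknown, so pass 1 instead of
  // StackAlignment and the derivable alignment collapses to 1.
  assert(minAlign(32, 1) == 1);
  std::printf("all alignment checks passed\n");
  return 0;
}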
diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp
index 05463fc6a1ef..228fe170ab46 100644
--- a/lib/CodeGen/MachineFunctionPass.cpp
+++ b/lib/CodeGen/MachineFunctionPass.cpp
@@ -21,11 +21,13 @@
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+
using namespace llvm;
Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
@@ -40,7 +42,26 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
return false;
MachineFunction &MF = getAnalysis<MachineFunctionAnalysis>().getMF();
- return runOnMachineFunction(MF);
+ MachineFunctionProperties &MFProps = MF.getProperties();
+
+#ifndef NDEBUG
+ if (!MFProps.verifyRequiredProperties(RequiredProperties)) {
+ errs() << "MachineFunctionProperties required by " << getPassName()
+ << " pass are not met by function " << F.getName() << ".\n"
+ << "Required properties: ";
+ RequiredProperties.print(errs(), /*OnlySet=*/true);
+ errs() << "\nCurrent properties: ";
+ MFProps.print(errs());
+ errs() << "\n";
+ llvm_unreachable("MachineFunctionProperties check failed");
+ }
+#endif
+
+ bool RV = runOnMachineFunction(MF);
+
+ MFProps.set(SetProperties);
+ MFProps.clear(ClearedProperties);
+ return RV;
}
void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -53,13 +74,13 @@ void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
// because CodeGen overloads that to mean preserving the MachineBasicBlock
// CFG in addition to the LLVM IR CFG.
AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<DominanceFrontier>();
+ AU.addPreserved<DominanceFrontierWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<IVUsers>();
+ AU.addPreserved<IVUsersWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<MemoryDependenceAnalysis>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<SCEVAAWrapperPass>();
AU.addPreserved<StackProtector>();
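The runOnFunction hunk above boils down to a subset test on a property bit set, followed by applying the pass's declared set/clear effects after it runs. The sketch below shows only that contract, using a plain std::bitset and made-up property names; it is not the MachineFunctionProperties class itself.

// mf_properties_sketch.cpp -- the required/set/cleared property contract.
#include <bitset>
#include <cassert>
#include <cstdio>

enum Prop { IsSSA, TracksLiveness, AllVRegsAllocated, NumProps };
using Props = std::bitset<NumProps>;

// Required properties are met iff they form a subset of the current ones.
static bool verifyRequired(const Props &Required, const Props &Current) {
  return (Required & ~Current).none();
}

int main() {
  Props Current;
  Current.set(IsSSA);
  Current.set(TracksLiveness);

  // A post-RA pass requiring AllVRegsAllocated would be rejected up front.
  Props PostRANeeds;
  PostRANeeds.set(AllVRegsAllocated);
  assert(!verifyRequired(PostRANeeds, Current));

  // A pass that only needs liveness runs, then records that it left SSA form.
  Props Needs, Sets, Clears;
  Needs.set(TracksLiveness);
  Clears.set(IsSSA);
  assert(verifyRequired(Needs, Current));
  Current |= Sets;    // nothing to set in this example
  Current &= ~Clears; // drop IsSSA
  std::printf("IsSSA after the pass: %d\n", static_cast<int>(Current.test(IsSSA)));
  return 0;
}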
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 6dca74d60026..3cdf8d2941d3 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -372,10 +373,16 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
getCImm()->getValue().print(OS, false);
break;
case MachineOperand::MO_FPImmediate:
- if (getFPImm()->getType()->isFloatTy())
+ if (getFPImm()->getType()->isFloatTy()) {
OS << getFPImm()->getValueAPF().convertToFloat();
- else
+ } else if (getFPImm()->getType()->isHalfTy()) {
+ APFloat APF = getFPImm()->getValueAPF();
+ bool Unused;
+ APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &Unused);
+ OS << "half " << APF.convertToFloat();
+ } else {
OS << getFPImm()->getValueAPF().convertToDouble();
+ }
break;
case MachineOperand::MO_MachineBasicBlock:
OS << "<BB#" << getMBB()->getNumber() << ">";
@@ -490,13 +497,12 @@ MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF,
return MachinePointerInfo(MF.getPSVManager().getStack(), Offset);
}
-MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f,
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
uint64_t s, unsigned int a,
const AAMDNodes &AAInfo,
const MDNode *Ranges)
- : PtrInfo(ptrinfo), Size(s),
- Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)),
- AAInfo(AAInfo), Ranges(Ranges) {
+ : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1),
+ AAInfo(AAInfo), Ranges(Ranges) {
assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue*>() ||
isa<PointerType>(PtrInfo.V.get<const Value*>()->getType())) &&
"invalid pointer value");
@@ -510,7 +516,8 @@ void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
ID.AddInteger(getOffset());
ID.AddInteger(Size);
ID.AddPointer(getOpaqueValue());
- ID.AddInteger(Flags);
+ ID.AddInteger(getFlags());
+ ID.AddInteger(getBaseAlignment());
}
void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
@@ -521,8 +528,7 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
if (MMO->getBaseAlignment() >= getBaseAlignment()) {
// Update the alignment value.
- Flags = (Flags & ((1 << MOMaxBits) - 1)) |
- ((Log2_32(MMO->getBaseAlignment()) + 1) << MOMaxBits);
+ BaseAlignLog2 = Log2_32(MMO->getBaseAlignment()) + 1;
// Also update the base and offset, because the new alignment may
// not be applicable with the old ones.
PtrInfo = MMO->PtrInfo;
@@ -647,7 +653,12 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
DebugLoc dl, bool NoImp)
: MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0),
AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr),
- debugLoc(std::move(dl)) {
+ debugLoc(std::move(dl))
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ ,
+ Ty(nullptr)
+#endif
+{
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
// Reserve space for the expected number of operands.
@@ -664,10 +675,14 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
/// MachineInstr ctor - Copies MachineInstr arg exactly
///
MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
- : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0),
- Flags(0), AsmPrinterFlags(0),
- NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
- debugLoc(MI.getDebugLoc()) {
+ : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0),
+ Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs),
+ MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc())
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ ,
+ Ty(nullptr)
+#endif
+{
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
CapOperands = OperandCapacity::get(MI.getNumOperands());
@@ -690,6 +705,25 @@ MachineRegisterInfo *MachineInstr::getRegInfo() {
return nullptr;
}
+// Implement dummy setter and getter for type when
+// global-isel is not built.
+// The proper implementation is WIP and is tracked here:
+// PR26576.
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+void MachineInstr::setType(Type *Ty) {}
+
+Type *MachineInstr::getType() const { return nullptr; }
+
+#else
+void MachineInstr::setType(Type *Ty) {
+ assert((!Ty || isPreISelGenericOpcode(getOpcode())) &&
+ "Non generic instructions are not supposed to be typed");
+ this->Ty = Ty;
+}
+
+Type *MachineInstr::getType() const { return Ty; }
+#endif // LLVM_BUILD_GLOBAL_ISEL
+
/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
/// this instruction from their respective use lists. This requires that the
/// operands already be on their use lists.
@@ -867,7 +901,7 @@ void MachineInstr::addMemOperand(MachineFunction &MF,
}
/// Check to see if the MMOs pointed to by the two MemRefs arrays are
-/// identical.
+/// identical.
static bool hasIdenticalMMOs(const MachineInstr &MI1, const MachineInstr &MI2) {
auto I1 = MI1.memoperands_begin(), E1 = MI1.memoperands_end();
auto I2 = MI2.memoperands_begin(), E2 = MI2.memoperands_end();
@@ -894,7 +928,7 @@ MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
// cases in practice.
if (hasIdenticalMMOs(*this, Other))
return std::make_pair(MemRefs, NumMemRefs);
-
+
// TODO: consider uniquing elements within the operand lists to reduce
// space usage and fall back to conservative information less often.
size_t CombinedNumMemRefs = NumMemRefs + Other.NumMemRefs;
@@ -913,7 +947,7 @@ MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
MemEnd);
assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs &&
"missing memrefs");
-
+
return std::make_pair(MemBegin, CombinedNumMemRefs);
}
@@ -933,23 +967,23 @@ bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
}
}
-bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
+bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
MICheckType Check) const {
// If opcodes or number of operands are not the same then the two
// instructions are obviously not identical.
- if (Other->getOpcode() != getOpcode() ||
- Other->getNumOperands() != getNumOperands())
+ if (Other.getOpcode() != getOpcode() ||
+ Other.getNumOperands() != getNumOperands())
return false;
if (isBundle()) {
// Both instructions are bundles, compare MIs inside the bundle.
MachineBasicBlock::const_instr_iterator I1 = getIterator();
MachineBasicBlock::const_instr_iterator E1 = getParent()->instr_end();
- MachineBasicBlock::const_instr_iterator I2 = Other->getIterator();
- MachineBasicBlock::const_instr_iterator E2= Other->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I2 = Other.getIterator();
+ MachineBasicBlock::const_instr_iterator E2 = Other.getParent()->instr_end();
while (++I1 != E1 && I1->isInsideBundle()) {
++I2;
- if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(&*I2, Check))
+ if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(*I2, Check))
return false;
}
}
@@ -957,7 +991,7 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
// Check operands to make sure they match.
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
- const MachineOperand &OMO = Other->getOperand(i);
+ const MachineOperand &OMO = Other.getOperand(i);
if (!MO.isReg()) {
if (!MO.isIdenticalTo(OMO))
return false;
@@ -990,8 +1024,8 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
}
// If DebugLoc does not match then two dbg.values are not identical.
if (isDebugValue())
- if (getDebugLoc() && Other->getDebugLoc() &&
- getDebugLoc() != Other->getDebugLoc())
+ if (getDebugLoc() && Other.getDebugLoc() &&
+ getDebugLoc() != Other.getDebugLoc())
return false;
return true;
}
@@ -1130,6 +1164,16 @@ int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
return -1;
}
+const DILocalVariable *MachineInstr::getDebugVariable() const {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return cast<DILocalVariable>(getOperand(2).getMetadata());
+}
+
+const DIExpression *MachineInstr::getDebugExpression() const {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return cast<DIExpression>(getOperand(3).getMetadata());
+}
+
const TargetRegisterClass*
MachineInstr::getRegClassConstraint(unsigned OpIdx,
const TargetInstrInfo *TII,
@@ -1157,7 +1201,10 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
unsigned Flag = getOperand(FlagIdx).getImm();
unsigned RCID;
- if (InlineAsm::hasRegClassConstraint(Flag, RCID))
+ if ((InlineAsm::getKind(Flag) == InlineAsm::Kind_RegUse ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDef ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDefEarlyClobber) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID))
return TRI->getRegClass(RCID);
// Assume that all registers in a memory operand are pointers.
@@ -1173,7 +1220,7 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
 // Check every operand inside the bundle if we have
// been asked to.
if (ExploreBundle)
- for (ConstMIBundleOperands OpndIt(this); OpndIt.isValid() && CurRC;
+ for (ConstMIBundleOperands OpndIt(*this); OpndIt.isValid() && CurRC;
++OpndIt)
CurRC = OpndIt->getParent()->getRegClassConstraintEffectForVRegImpl(
OpndIt.getOperandNo(), Reg, CurRC, TII, TRI);
@@ -1219,11 +1266,24 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect(
unsigned MachineInstr::getBundleSize() const {
MachineBasicBlock::const_instr_iterator I = getIterator();
unsigned Size = 0;
- while (I->isBundledWithSucc())
- ++Size, ++I;
+ while (I->isBundledWithSucc()) {
+ ++Size;
+ ++I;
+ }
return Size;
}
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
/// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
/// the specific register or -1 if it is not found. It further tightens
/// the search criteria to a use that kills the register if isKill is true.
@@ -1498,12 +1558,10 @@ bool MachineInstr::hasOrderedMemoryRef() const {
if (memoperands_empty())
return true;
- // Check the memory reference information for ordered references.
- for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
- if (!(*I)->isUnordered())
- return true;
-
- return false;
+ // Check if any of our memory operands are ordered.
+ return any_of(memoperands(), [](const MachineMemOperand *MMO) {
+ return !MMO->isUnordered();
+ });
}
/// isInvariantLoad - Return true if this instruction is loading from a
@@ -1523,23 +1581,21 @@ bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
const MachineFrameInfo *MFI = getParent()->getParent()->getFrameInfo();
- for (mmo_iterator I = memoperands_begin(),
- E = memoperands_end(); I != E; ++I) {
- if ((*I)->isVolatile()) return false;
- if ((*I)->isStore()) return false;
- if ((*I)->isInvariant()) return true;
-
+ for (MachineMemOperand *MMO : memoperands()) {
+ if (MMO->isVolatile()) return false;
+ if (MMO->isStore()) return false;
+ if (MMO->isInvariant()) continue;
// A load from a constant PseudoSourceValue is invariant.
- if (const PseudoSourceValue *PSV = (*I)->getPseudoValue())
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue())
if (PSV->isConstant(MFI))
continue;
- if (const Value *V = (*I)->getValue()) {
+ if (const Value *V = MMO->getValue()) {
// If we have an AliasAnalysis, ask it whether the memory is constant.
if (AA &&
AA->pointsToConstantMemory(
- MemoryLocation(V, (*I)->getSize(), (*I)->getAAInfo())))
+ MemoryLocation(V, MMO->getSize(), MMO->getAAInfo())))
continue;
}
@@ -1598,16 +1654,16 @@ bool MachineInstr::allDefsAreDead() const {
/// copyImplicitOps - Copy implicit register operands from specified
/// instruction to this instruction.
void MachineInstr::copyImplicitOps(MachineFunction &MF,
- const MachineInstr *MI) {
- for (unsigned i = MI->getDesc().getNumOperands(), e = MI->getNumOperands();
+ const MachineInstr &MI) {
+ for (unsigned i = MI.getDesc().getNumOperands(), e = MI.getNumOperands();
i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ const MachineOperand &MO = MI.getOperand(i);
if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
addOperand(MF, MO);
}
}
-void MachineInstr::dump() const {
+LLVM_DUMP_METHOD void MachineInstr::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
#endif
@@ -1651,8 +1707,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (StartOp != 0) OS << ", ";
getOperand(StartOp).print(OS, MST, TRI);
unsigned Reg = getOperand(StartOp).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
VirtRegs.push_back(Reg);
+ unsigned Size;
+ if (MRI && (Size = MRI->getSize(Reg)))
+ OS << '(' << Size << ')';
+ }
}
if (StartOp != 0)
@@ -1664,6 +1724,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
else
OS << "UNKNOWN";
+ if (getType()) {
+ OS << ' ';
+ getType()->print(OS, /*IsForDebug*/ false, /*NoDetails*/ true);
+ OS << ' ';
+ }
+
if (SkipOpers)
return;
@@ -1686,6 +1752,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << " [mayload]";
if (ExtraInfo & InlineAsm::Extra_MayStore)
OS << " [maystore]";
+ if (ExtraInfo & InlineAsm::Extra_IsConvergent)
+ OS << " [isconvergent]";
if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
OS << " [alignstack]";
if (getInlineAsmDialect() == InlineAsm::AD_ATT)
@@ -1761,13 +1829,41 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
unsigned RCID = 0;
- if (InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID)) {
if (TRI) {
OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
} else
OS << ":RC" << RCID;
}
+ if (InlineAsm::isMemKind(Flag)) {
+ unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
+ switch (MCID) {
+ case InlineAsm::Constraint_es: OS << ":es"; break;
+ case InlineAsm::Constraint_i: OS << ":i"; break;
+ case InlineAsm::Constraint_m: OS << ":m"; break;
+ case InlineAsm::Constraint_o: OS << ":o"; break;
+ case InlineAsm::Constraint_v: OS << ":v"; break;
+ case InlineAsm::Constraint_Q: OS << ":Q"; break;
+ case InlineAsm::Constraint_R: OS << ":R"; break;
+ case InlineAsm::Constraint_S: OS << ":S"; break;
+ case InlineAsm::Constraint_T: OS << ":T"; break;
+ case InlineAsm::Constraint_Um: OS << ":Um"; break;
+ case InlineAsm::Constraint_Un: OS << ":Un"; break;
+ case InlineAsm::Constraint_Uq: OS << ":Uq"; break;
+ case InlineAsm::Constraint_Us: OS << ":Us"; break;
+ case InlineAsm::Constraint_Ut: OS << ":Ut"; break;
+ case InlineAsm::Constraint_Uv: OS << ":Uv"; break;
+ case InlineAsm::Constraint_Uy: OS << ":Uy"; break;
+ case InlineAsm::Constraint_X: OS << ":X"; break;
+ case InlineAsm::Constraint_Z: OS << ":Z"; break;
+ case InlineAsm::Constraint_ZC: OS << ":ZC"; break;
+ case InlineAsm::Constraint_Zy: OS << ":Zy"; break;
+ default: OS << ":?"; break;
+ }
+ }
+
unsigned TiedTo = 0;
if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
OS << " tiedto:$" << TiedTo;
@@ -1824,11 +1920,18 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
HaveSemi = true;
}
for (unsigned i = 0; i != VirtRegs.size(); ++i) {
- const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
- OS << " " << TRI->getRegClassName(RC)
- << ':' << PrintReg(VirtRegs[i]);
+ const RegClassOrRegBank &RC = MRI->getRegClassOrRegBank(VirtRegs[i]);
+ if (!RC)
+ continue;
+ // Generic virtual registers do not have register classes.
+ if (RC.is<const RegisterBank *>())
+ OS << " " << RC.get<const RegisterBank *>()->getName();
+ else
+ OS << " "
+ << TRI->getRegClassName(RC.get<const TargetRegisterClass *>());
+ OS << ':' << PrintReg(VirtRegs[i]);
for (unsigned j = i+1; j != VirtRegs.size();) {
- if (MRI->getRegClass(VirtRegs[j]) != RC) {
+ if (MRI->getRegClassOrRegBank(VirtRegs[j]) != RC) {
++j;
continue;
}
@@ -1877,6 +1980,13 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
MachineOperand &MO = getOperand(i);
if (!MO.isReg() || !MO.isUse() || MO.isUndef())
continue;
+
+ // DEBUG_VALUE nodes do not contribute to code generation and should
+ // always be ignored. Failure to do so may result in trying to modify
+ // KILL flags on DEBUG_VALUE nodes.
+ if (MO.isDebug())
+ continue;
+
unsigned Reg = MO.getReg();
if (!Reg)
continue;
@@ -1932,7 +2042,7 @@ void MachineInstr::clearRegisterKills(unsigned Reg,
if (!MO.isReg() || !MO.isUse() || !MO.isKill())
continue;
unsigned OpReg = MO.getReg();
- if (OpReg == Reg || (RegInfo && RegInfo->isSuperRegister(Reg, OpReg)))
+ if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg)
MO.setIsKill(false);
}
}
@@ -2085,3 +2195,42 @@ void MachineInstr::emitError(StringRef Msg) const {
return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
report_fatal_error(Msg);
}
+
+MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ unsigned Reg, unsigned Offset,
+ const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ if (IsIndirect)
+ return BuildMI(MF, DL, MCID)
+ .addReg(Reg, RegState::Debug)
+ .addImm(Offset)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+ else {
+ assert(Offset == 0 && "A direct address cannot have an offset.");
+ return BuildMI(MF, DL, MCID)
+ .addReg(Reg, RegState::Debug)
+ .addReg(0U, RegState::Debug)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+ }
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const MCInstrDesc &MCID,
+ bool IsIndirect, unsigned Reg,
+ unsigned Offset, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ MachineFunction &MF = *BB.getParent();
+ MachineInstr *MI =
+ BuildMI(MF, DL, MCID, IsIndirect, Reg, Offset, Variable, Expr);
+ BB.insert(I, MI);
+ return MachineInstrBuilder(MF, MI);
+}
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index 4619daf30141..e4686b3c5c4e 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -17,6 +17,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <utility>
using namespace llvm;
namespace {
@@ -24,7 +25,7 @@ namespace {
public:
static char ID; // Pass identification
UnpackMachineBundles(std::function<bool(const Function &)> Ftor = nullptr)
- : MachineFunctionPass(ID), PredicateFtor(Ftor) {
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
initializeUnpackMachineBundlesPass(*PassRegistry::getPassRegistry());
}
@@ -78,7 +79,7 @@ bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
FunctionPass *
llvm::createUnpackMachineBundles(std::function<bool(const Function &)> Ftor) {
- return new UnpackMachineBundles(Ftor);
+ return new UnpackMachineBundles(std::move(Ftor));
}
namespace {
@@ -293,7 +294,7 @@ MachineOperandIteratorBase::PhysRegInfo
MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
const TargetRegisterInfo *TRI) {
bool AllDefsDead = true;
- PhysRegInfo PRI = {false, false, false, false, false, false, false};
+ PhysRegInfo PRI = {false, false, false, false, false, false, false, false};
assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
"analyzePhysReg not given a physical register!");
@@ -332,8 +333,12 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
}
}
- if (AllDefsDead && PRI.FullyDefined)
- PRI.DeadDef = true;
+ if (AllDefsDead) {
+ if (PRI.FullyDefined || PRI.Clobbered)
+ PRI.DeadDef = true;
+ else if (PRI.Defined)
+ PRI.PartialDeadDef = true;
+ }
return PRI;
}
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 99a97d2dbd74..119751b17f56 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -260,7 +260,7 @@ static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
}
bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
Changed = FirstInLoop = false;
@@ -428,7 +428,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
if (Def && !RuledOut) {
int FI = INT_MIN;
if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) ||
- (TII->isLoadFromStackSlot(MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
+ (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
Candidates.push_back(CandidateInfo(MI, Def, FI));
}
}
@@ -581,14 +581,14 @@ bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
}
void MachineLICM::EnterScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+ DEBUG(dbgs() << "Entering BB#" << MBB->getNumber() << '\n');
// Remember livein register pressure.
BackTrace.push_back(RegPressure);
}
void MachineLICM::ExitScope(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+ DEBUG(dbgs() << "Exiting BB#" << MBB->getNumber() << '\n');
BackTrace.pop_back();
}
@@ -764,7 +764,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
if (BB->pred_size() == 1) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
InitRegPressure(*BB->pred_begin());
}
@@ -982,7 +982,7 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
if (MOReg != Reg)
continue;
- if (TII->hasHighOperandLatency(SchedModel, MRI, &MI, DefIdx, &UseMI, i))
+ if (TII->hasHighOperandLatency(SchedModel, MRI, MI, DefIdx, UseMI, i))
return true;
}
@@ -996,7 +996,7 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
/// Return true if the instruction is marked "cheap" or the operand latency
/// between its def and a use is one or less.
bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
- if (TII->isAsCheapAsAMove(&MI) || MI.isCopyLike())
+ if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
return true;
bool isCheap = false;
@@ -1010,7 +1010,7 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
if (TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
- if (!TII->hasLowDefLatency(SchedModel, &MI, i))
+ if (!TII->hasLowDefLatency(SchedModel, MI, i))
return false;
isCheap = true;
}
@@ -1086,7 +1086,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// Rematerializable instructions should always be hoisted since the register
// allocator can just pull them down again when needed.
- if (TII->isTriviallyReMaterializable(&MI, AA))
+ if (TII->isTriviallyReMaterializable(MI, AA))
return true;
// FIXME: If there are long latency loop-invariant instructions inside the
@@ -1139,8 +1139,7 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
- if (!TII->isTriviallyReMaterializable(&MI, AA) &&
- !MI.isInvariantLoad(AA)) {
+ if (!TII->isTriviallyReMaterializable(MI, AA) && !MI.isInvariantLoad(AA)) {
DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
return false;
}
@@ -1171,17 +1170,15 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
&LoadRegIndex);
if (NewOpc == 0) return nullptr;
const MCInstrDesc &MID = TII->get(NewOpc);
- if (MID.getNumDefs() != 1) return nullptr;
MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
// Ok, we're unfolding. Create a temporary register and do the unfold.
unsigned Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs;
- bool Success =
- TII->unfoldMemoryOperand(MF, MI, Reg,
- /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
- NewMIs);
+ bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs);
(void)Success;
assert(Success &&
"unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
@@ -1222,7 +1219,7 @@ const MachineInstr*
MachineLICM::LookForDuplicate(const MachineInstr *MI,
std::vector<const MachineInstr*> &PrevMIs) {
for (const MachineInstr *PrevMI : PrevMIs)
- if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : nullptr)))
+ if (TII->produceSameValue(*MI, *PrevMI, (PreRegAlloc ? MRI : nullptr)))
return PrevMI;
return nullptr;
@@ -1317,12 +1314,10 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
// terminator instructions.
DEBUG({
dbgs() << "Hoisting " << *MI;
- if (Preheader->getBasicBlock())
- dbgs() << " to MachineBasicBlock "
- << Preheader->getName();
if (MI->getParent()->getBasicBlock())
- dbgs() << " from MachineBasicBlock "
- << MI->getParent()->getName();
+ dbgs() << " from BB#" << MI->getParent()->getNumber();
+ if (Preheader->getBasicBlock())
+ dbgs() << " to BB#" << Preheader->getNumber();
dbgs() << "\n";
});
@@ -1382,7 +1377,7 @@ MachineBasicBlock *MachineLICM::getCurPreheader() {
return nullptr;
}
- CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this);
+ CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), *this);
if (!CurPreheader) {
CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
return nullptr;
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 2f5c9e05cc7b..376f78fda1c4 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -50,11 +50,12 @@ void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
MachineBasicBlock *MachineLoop::getTopBlock() {
MachineBasicBlock *TopMBB = getHeader();
MachineFunction::iterator Begin = TopMBB->getParent()->begin();
- if (TopMBB != Begin) {
+ if (TopMBB->getIterator() != Begin) {
MachineBasicBlock *PriorMBB = &*std::prev(TopMBB->getIterator());
while (contains(PriorMBB)) {
TopMBB = PriorMBB;
- if (TopMBB == Begin) break;
+ if (TopMBB->getIterator() == Begin)
+ break;
PriorMBB = &*std::prev(TopMBB->getIterator());
}
}
@@ -64,7 +65,7 @@ MachineBasicBlock *MachineLoop::getTopBlock() {
MachineBasicBlock *MachineLoop::getBottomBlock() {
MachineBasicBlock *BotMBB = getHeader();
MachineFunction::iterator End = BotMBB->getParent()->end();
- if (BotMBB != std::prev(End)) {
+ if (BotMBB->getIterator() != std::prev(End)) {
MachineBasicBlock *NextMBB = &*std::next(BotMBB->getIterator());
while (contains(NextMBB)) {
BotMBB = NextMBB;
@@ -77,7 +78,7 @@ MachineBasicBlock *MachineLoop::getBottomBlock() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineLoop::dump() const {
+LLVM_DUMP_METHOD void MachineLoop::dump() const {
print(dbgs());
}
#endif
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 1956a701d8e6..244e3fbc4e8f 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -396,7 +396,8 @@ void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
- --j, --e;
+ --j;
+ --e;
}
// Remove landing pads with no try-ranges.
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index 01d2c2eb56fe..fc32183c7f63 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -104,8 +104,8 @@ void MachineRegionInfoPass::verifyAnalysis() const {
void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTree>();
- AU.addRequired<DominanceFrontier>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<DominanceFrontierWrapperPass>();
}
void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
@@ -113,7 +113,7 @@ void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineRegionInfoPass::dump() const {
+LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
RI.dump();
}
#endif
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 03c82f46da63..613598dbe215 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -24,9 +24,8 @@ using namespace llvm;
// Pin the vtable to this file.
void MachineRegisterInfo::Delegate::anchor() {}
-MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
- : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true),
- TracksSubRegLiveness(false) {
+MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
+ : MF(MF), TheDelegate(nullptr), TracksSubRegLiveness(false) {
unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
@@ -42,6 +41,11 @@ MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
VRegInfo[Reg].first = RC;
}
+void MachineRegisterInfo::setRegBank(unsigned Reg,
+ const RegisterBank &RegBank) {
+ VRegInfo[Reg].first = &RegBank;
+}
+
const TargetRegisterClass *
MachineRegisterInfo::constrainRegClass(unsigned Reg,
const TargetRegisterClass *RC,
@@ -103,6 +107,32 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
return Reg;
}
+unsigned
+MachineRegisterInfo::getSize(unsigned VReg) const {
+ VRegToSizeMap::const_iterator SizeIt = getVRegToSize().find(VReg);
+ return SizeIt != getVRegToSize().end() ? SizeIt->second : 0;
+}
+
+void MachineRegisterInfo::setSize(unsigned VReg, unsigned Size) {
+ getVRegToSize()[VReg] = Size;
+}
+
+unsigned
+MachineRegisterInfo::createGenericVirtualRegister(unsigned Size) {
+ assert(Size && "Cannot create empty virtual register");
+
+ // New virtual register number.
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
+ VRegInfo.grow(Reg);
+ // FIXME: Should we use a dummy register class?
+ VRegInfo[Reg].first = static_cast<TargetRegisterClass *>(nullptr);
+ getVRegToSize()[Reg] = Size;
+ RegAllocHints.grow(Reg);
+ if (TheDelegate)
+ TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ return Reg;
+}
+
/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
void MachineRegisterInfo::clearVirtRegs() {
#ifndef NDEBUG
@@ -471,13 +501,14 @@ static bool isNoReturnDef(const MachineOperand &MO) {
!Called->hasFnAttribute(Attribute::NoUnwind));
}
-bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg) const {
+bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg,
+ bool SkipNoReturnDef) const {
if (UsedPhysRegMask.test(PhysReg))
return true;
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
for (const MachineOperand &MO : make_range(def_begin(*AI), def_end())) {
- if (isNoReturnDef(MO))
+ if (!SkipNoReturnDef && isNoReturnDef(MO))
continue;
return true;
}
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index 71a6ebaba243..47ad60c5dd56 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -19,7 +19,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/AlignOf.h"
-#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index bcee15c7c75f..d921e2977cc7 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -23,13 +23,13 @@
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include <queue>
using namespace llvm;
@@ -65,14 +65,20 @@ static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
+/// Avoid quadratic complexity in unusually large basic blocks by limiting the
+/// size of the ready lists.
+static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
+ cl::desc("Limit ready list to N instructions"), cl::init(256));
+
static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
cl::desc("Enable register pressure scheduling."), cl::init(true));
static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
cl::desc("Enable cyclic critical path analysis."), cl::init(true));
-static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
- cl::desc("Enable load clustering."), cl::init(true));
+static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
+ cl::desc("Enable memop clustering."),
+ cl::init(true));
// Experimental heuristics
static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
@@ -219,6 +225,11 @@ static cl::opt<bool> EnableMachineSched(
cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnablePostRAMachineSched(
+ "enable-post-misched",
+ cl::desc("Enable the post-ra machine instruction scheduling pass."),
+ cl::init(true), cl::Hidden);
+
/// Forward declare the standard machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C);
@@ -314,6 +325,9 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(*mf.getFunction()))
+ return false;
+
if (EnableMachineSched.getNumOccurrences()) {
if (!EnableMachineSched)
return false;
@@ -349,10 +363,13 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
}
bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
- if (skipOptnoneFunction(*mf.getFunction()))
+ if (skipFunction(*mf.getFunction()))
return false;
- if (!mf.getSubtarget().enablePostRAScheduler()) {
+ if (EnablePostRAMachineSched.getNumOccurrences()) {
+ if (!EnablePostRAMachineSched)
+ return false;
+ } else if (!mf.getSubtarget().enablePostRAScheduler()) {
DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
return false;
}
@@ -389,7 +406,7 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB,
MachineFunction *MF,
const TargetInstrInfo *TII) {
- return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF);
+ return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
}
/// Main driver for both MachineScheduler and PostMachineScheduler.
@@ -427,7 +444,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
//
// MBB::size() uses instr_iterator to count. Here we need a bundle to count
// as a single instruction.
- unsigned RemainingInstrs = std::distance(MBB->begin(), MBB->end());
for(MachineBasicBlock::iterator RegionEnd = MBB->end();
RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) {
@@ -435,15 +451,13 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
if (RegionEnd != MBB->end() ||
isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
--RegionEnd;
- // Count the boundary instruction.
- --RemainingInstrs;
}
// The next region starts above the previous region. Look backward in the
// instruction stream until we find the nearest boundary.
unsigned NumRegionInstrs = 0;
MachineBasicBlock::iterator I = RegionEnd;
- for(;I != MBB->begin(); --I, --RemainingInstrs) {
+ for (;I != MBB->begin(); --I) {
if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII))
break;
if (!I->isDebugValue())
@@ -466,8 +480,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
<< "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs
- << " Remaining: " << RemainingInstrs << "\n");
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
if (DumpCriticalPathLength) {
errs() << MF->getName();
errs() << ":BB# " << MBB->getNumber();
@@ -485,7 +498,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
// scheduler for the top of it's scheduled region.
RegionEnd = Scheduler.begin();
}
- assert(RemainingInstrs == 0 && "Instruction count mismatch!");
Scheduler.finishBlock();
// FIXME: Ideally, no further passes should rely on kill flags. However,
// thumb2 size reduction is currently an exception, so the PostMIScheduler
@@ -640,7 +652,7 @@ void ScheduleDAGMI::moveInstruction(
// Update LiveIntervals
if (LIS)
- LIS->handleMove(MI, /*UpdateFlags=*/true);
+ LIS->handleMove(*MI, /*UpdateFlags=*/true);
// Recede RegionBegin if an instruction moves above the first.
if (RegionBegin == InsertPos)
@@ -704,8 +716,7 @@ void ScheduleDAGMI::schedule() {
CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
else
moveInstruction(MI, CurrentTop);
- }
- else {
+ } else {
assert(SU->isBottomReady() && "node still has unscheduled dependencies");
MachineBasicBlock::iterator priorII =
priorNonDebug(CurrentBottom, CurrentTop);
@@ -869,13 +880,19 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
SUPressureDiffs.clear();
ShouldTrackPressure = SchedImpl->shouldTrackPressure();
+ ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
+
+ assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
+ "ShouldTrackLaneMasks requires ShouldTrackPressure");
}
// Setup the register pressure trackers for the top scheduled top and bottom
// scheduled regions.
void ScheduleDAGMILive::initRegPressure() {
- TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
- BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
+ ShouldTrackLaneMasks, false);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ ShouldTrackLaneMasks, false);
// Close the RPTracker to finalize live ins.
RPTracker.closeRegion();
@@ -905,7 +922,7 @@ void ScheduleDAGMILive::initRegPressure() {
// Account for liveness generated by the region boundary.
if (LiveRegionEnd != RegionEnd) {
- SmallVector<unsigned, 8> LiveUses;
+ SmallVector<RegisterMaskPair, 8> LiveUses;
BotRPTracker.recede(&LiveUses);
updatePressureDiffs(LiveUses);
}
@@ -969,47 +986,74 @@ updateScheduledPressure(const SUnit *SU,
/// Update the PressureDiff array for liveness after scheduling this
/// instruction.
-void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
- for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
+void ScheduleDAGMILive::updatePressureDiffs(
+ ArrayRef<RegisterMaskPair> LiveUses) {
+ for (const RegisterMaskPair &P : LiveUses) {
+ unsigned Reg = P.RegUnit;
/// FIXME: Currently assuming single-use physregs.
- unsigned Reg = LiveUses[LUIdx];
- DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
if (!TRI->isVirtualRegister(Reg))
continue;
- // This may be called before CurrentBottom has been initialized. However,
- // BotRPTracker must have a valid position. We want the value live into the
- // instruction or live out of the block, so ask for the previous
- // instruction's live-out.
- const LiveInterval &LI = LIS->getInterval(Reg);
- VNInfo *VNI;
- MachineBasicBlock::const_iterator I =
- nextIfDebug(BotRPTracker.getPos(), BB->end());
- if (I == BB->end())
- VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
- else {
- LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
- VNI = LRQ.valueIn();
- }
- // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
- assert(VNI && "No live value at use.");
- for (const VReg2SUnit &V2SU
- : make_range(VRegUses.find(Reg), VRegUses.end())) {
- SUnit *SU = V2SU.SU;
- // If this use comes before the reaching def, it cannot be a last use, so
- // descrease its pressure change.
- if (!SU->isScheduled && SU != &ExitSU) {
- LiveQueryResult LRQ
- = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
- if (LRQ.valueIn() == VNI) {
- PressureDiff &PDiff = getPressureDiff(SU);
- PDiff.addPressureChange(Reg, true, &MRI);
- DEBUG(
- dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
- << *SU->getInstr();
- dbgs() << " to ";
- PDiff.dump(*TRI);
- );
+ if (ShouldTrackLaneMasks) {
+ // If the register has just become live then other uses won't change
+ // this fact anymore => decrement pressure.
+ // If the register has just become dead then other uses make it come
+ // back to life => increment pressure.
+ bool Decrement = P.LaneMask != 0;
+
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit &SU = *V2SU.SU;
+ if (SU.isScheduled || &SU == &ExitSU)
+ continue;
+
+ PressureDiff &PDiff = getPressureDiff(&SU);
+ PDiff.addPressureChange(Reg, Decrement, &MRI);
+ DEBUG(
+ dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
+ << PrintReg(Reg, TRI) << ':' << PrintLaneMask(P.LaneMask)
+ << ' ' << *SU.getInstr();
+ dbgs() << " to ";
+ PDiff.dump(*TRI);
+ );
+ }
+ } else {
+ assert(P.LaneMask != 0);
+ DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+ // This may be called before CurrentBottom has been initialized. However,
+ // BotRPTracker must have a valid position. We want the value live into the
+ // instruction or live out of the block, so ask for the previous
+ // instruction's live-out.
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI;
+ MachineBasicBlock::const_iterator I =
+ nextIfDebug(BotRPTracker.getPos(), BB->end());
+ if (I == BB->end())
+ VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ else {
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
+ VNI = LRQ.valueIn();
+ }
+ // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+ assert(VNI && "No live value at use.");
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit *SU = V2SU.SU;
+ // If this use comes before the reaching def, it cannot be a last use,
+ // so decrease its pressure change.
+ if (!SU->isScheduled && SU != &ExitSU) {
+ LiveQueryResult LRQ =
+ LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
+ if (LRQ.valueIn() == VNI) {
+ PressureDiff &PDiff = getPressureDiff(SU);
+ PDiff.addPressureChange(Reg, true, &MRI);
+ DEBUG(
+ dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr();
+ dbgs() << " to ";
+ PDiff.dump(*TRI);
+ );
+ }
}
}
}
@@ -1057,11 +1101,6 @@ void ScheduleDAGMILive::schedule() {
// Initialize ready queues now that the DAG and priority data are finalized.
initQueues(TopRoots, BotRoots);
- if (ShouldTrackPressure) {
- assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
- TopRPTracker.setPos(CurrentTop);
- }
-
bool IsTopNode = false;
while (true) {
DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
@@ -1111,14 +1150,14 @@ void ScheduleDAGMILive::buildDAGWithRegPressure() {
// Initialize the register pressure tracker used by buildSchedGraph.
RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
- /*TrackUntiedDefs=*/true);
+ ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
// Account for liveness generate by the region boundary.
if (LiveRegionEnd != RegionEnd)
RPTracker.recede();
// Build the DAG, and compute current register pressure.
- buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
+ buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
// Initialize top/bottom trackers after computing region pressure.
initRegPressure();
@@ -1167,10 +1206,8 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
unsigned MaxCyclicLatency = 0;
// Visit each live out vreg def to find def/use pairs that cross iterations.
- ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
- for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
- RI != RE; ++RI) {
- unsigned Reg = *RI;
+ for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = P.RegUnit;
if (!TRI->isVirtualRegister(Reg))
continue;
const LiveInterval &LI = LIS->getInterval(Reg);
@@ -1193,8 +1230,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
continue;
// Only consider uses of the phi.
- LiveQueryResult LRQ =
- LI.Query(LIS->getInstructionIndex(SU->getInstr()));
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
if (!LRQ.valueIn()->isPHIDef())
continue;
@@ -1209,8 +1245,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
if (LiveInHeight > LiveOutHeight) {
if (LiveInHeight - LiveOutHeight < CyclicLatency)
CyclicLatency = LiveInHeight - LiveOutHeight;
- }
- else
+ } else
CyclicLatency = 0;
DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
@@ -1223,6 +1258,17 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
return MaxCyclicLatency;
}
+/// Release ExitSU predecessors and setup scheduler queues. Re-position
+/// the Top RP tracker in case the region beginning has changed.
+void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ ScheduleDAGMI::initQueues(TopRoots, BotRoots);
+ if (ShouldTrackPressure) {
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+ }
+}
+
/// Move an instruction and update register pressure.
void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
// Move the instruction to its new location in the instruction stream.
@@ -1239,7 +1285,18 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
if (ShouldTrackPressure) {
// Update top scheduled pressure.
- TopRPTracker.advance();
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ TopRPTracker.advance(RegOpers);
assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
DEBUG(
dbgs() << "Top Pressure:\n";
@@ -1248,8 +1305,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
}
- }
- else {
+ } else {
assert(SU->isBottomReady() && "node still has unscheduled dependencies");
MachineBasicBlock::iterator priorII =
priorNonDebug(CurrentBottom, CurrentTop);
@@ -1264,9 +1320,20 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
CurrentBottom = MI;
}
if (ShouldTrackPressure) {
- // Update bottom scheduled pressure.
- SmallVector<unsigned, 8> LiveUses;
- BotRPTracker.recede(&LiveUses);
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ BotRPTracker.recedeSkipDebugValues();
+ SmallVector<RegisterMaskPair, 8> LiveUses;
+ BotRPTracker.recede(RegOpers, &LiveUses);
assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
DEBUG(
dbgs() << "Bottom Pressure:\n";
@@ -1280,64 +1347,81 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
}
//===----------------------------------------------------------------------===//
-// LoadClusterMutation - DAG post-processing to cluster loads.
+// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
//===----------------------------------------------------------------------===//
namespace {
/// \brief Post-process the DAG to create cluster edges between neighboring
-/// loads.
-class LoadClusterMutation : public ScheduleDAGMutation {
- struct LoadInfo {
+/// loads or between neighboring stores.
+class BaseMemOpClusterMutation : public ScheduleDAGMutation {
+ struct MemOpInfo {
SUnit *SU;
unsigned BaseReg;
- unsigned Offset;
- LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
- : SU(su), BaseReg(reg), Offset(ofs) {}
+ int64_t Offset;
+ MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
+ : SU(su), BaseReg(reg), Offset(ofs) {}
- bool operator<(const LoadInfo &RHS) const {
+ bool operator<(const MemOpInfo &RHS) const {
return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
}
};
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
+ bool IsLoad;
+
public:
- LoadClusterMutation(const TargetInstrInfo *tii,
- const TargetRegisterInfo *tri)
- : TII(tii), TRI(tri) {}
+ BaseMemOpClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri, bool IsLoad)
+ : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
- void apply(ScheduleDAGMI *DAG) override;
protected:
- void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+ void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG);
+};
+
+class StoreClusterMutation : public BaseMemOpClusterMutation {
+public:
+ StoreClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseMemOpClusterMutation {
+public:
+ LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, true) {}
};
} // anonymous
-void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
- ScheduleDAGMI *DAG) {
- SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
- for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
- SUnit *SU = Loads[Idx];
+void BaseMemOpClusterMutation::clusterNeighboringMemOps(
+ ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
+ SUnit *SU = MemOps[Idx];
unsigned BaseReg;
- unsigned Offset;
- if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
- LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+ int64_t Offset;
+ if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI))
+ MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));
}
- if (LoadRecords.size() < 2)
+ if (MemOpRecords.size() < 2)
return;
- std::sort(LoadRecords.begin(), LoadRecords.end());
+
+ std::sort(MemOpRecords.begin(), MemOpRecords.end());
unsigned ClusterLength = 1;
- for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
- if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+ for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+ if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
ClusterLength = 1;
continue;
}
- SUnit *SUa = LoadRecords[Idx].SU;
- SUnit *SUb = LoadRecords[Idx+1].SU;
- if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
- && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-
- DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+ SUnit *SUa = MemOpRecords[Idx].SU;
+ SUnit *SUb = MemOpRecords[Idx+1].SU;
+ if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(),
+ ClusterLength) &&
+ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+ DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
// Copy successor edges from SUa to SUb. Interleaving computation
// dependent on SUa can prevent load combining due to register reuse.
@@ -1351,22 +1435,26 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
}
++ClusterLength;
- }
- else
+ } else
ClusterLength = 1;
}
}
/// \brief Callback from DAG postProcessing to create cluster edges for loads.
-void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
// Map DAG NodeNum to store chain ID.
DenseMap<unsigned, unsigned> StoreChainIDs;
- // Map each store chain to a set of dependent loads.
+ // Map each store chain to a set of dependent MemOps.
SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
SUnit *SU = &DAG->SUnits[Idx];
- if (!SU->getInstr()->mayLoad())
+ if ((IsLoad && !SU->getInstr()->mayLoad()) ||
+ (!IsLoad && !SU->getInstr()->mayStore()))
continue;
+
unsigned ChainPredID = DAG->SUnits.size();
for (SUnit::const_pred_iterator
PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
@@ -1376,7 +1464,7 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
}
}
// Check if this chain-like pred has been seen
- // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+ // before. ChainPredID==MaxNodeID at the top of the schedule.
unsigned NumChains = StoreChainDependents.size();
std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
@@ -1384,9 +1472,10 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
StoreChainDependents.resize(NumChains + 1);
StoreChainDependents[Result.first->second].push_back(SU);
}
+
// Iterate over the store chains.
for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
- clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+ clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
}
//===----------------------------------------------------------------------===//
@@ -1403,7 +1492,7 @@ public:
MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
: TII(TII), TRI(TRI) {}
- void apply(ScheduleDAGMI *DAG) override;
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
} // anonymous
@@ -1423,7 +1512,9 @@ static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
/// \brief Callback from DAG postProcessing to create cluster edges to encourage
/// fused operations.
-void MacroFusion::apply(ScheduleDAGMI *DAG) {
+void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
// For now, assume targets can only fuse with the branch.
SUnit &ExitSU = DAG->ExitSU;
MachineInstr *Branch = ExitSU.getInstr();
@@ -1439,7 +1530,7 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) {
if (!HasDataDep(TRI, *Branch, *Pred))
continue;
- if (!TII.shouldScheduleAdjacent(Pred, Branch))
+ if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
continue;
// Create a single weak edge from SU to ExitSU. The only effect is to cause
@@ -1474,7 +1565,7 @@ class CopyConstrain : public ScheduleDAGMutation {
public:
CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
- void apply(ScheduleDAGMI *DAG) override;
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
@@ -1505,12 +1596,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
MachineInstr *Copy = CopySU->getInstr();
// Check for pure vreg copies.
- unsigned SrcReg = Copy->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ const MachineOperand &SrcOp = Copy->getOperand(1);
+ unsigned SrcReg = SrcOp.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
return;
- unsigned DstReg = Copy->getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ const MachineOperand &DstOp = Copy->getOperand(0);
+ unsigned DstReg = DstOp.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead())
return;
// Check if either the dest or source is local. If it's live across a back
@@ -1627,15 +1720,16 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
/// \brief Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
-void CopyConstrain::apply(ScheduleDAGMI *DAG) {
+void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
if (FirstPos == DAG->end())
return;
- RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
+ RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
RegionEndIdx = DAG->getLIS()->getInstructionIndex(
- &*priorNonDebug(DAG->end(), DAG->begin()));
+ *priorNonDebug(DAG->end(), DAG->begin()));
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
SUnit *SU = &DAG->SUnits[Idx];
@@ -1862,7 +1956,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
// Check for interlocks first. For the purpose of other heuristics, an
// instruction that cannot issue appears as if it's not in the ReadyQueue.
bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
- if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
+ if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU) ||
+ Available.size() >= ReadyListLimit)
Pending.push(SU);
else
Available.push(SU);
@@ -1905,8 +2000,7 @@ void SchedBoundary::bumpCycle(unsigned NextCycle) {
if (!HazardRec->isEnabled()) {
// Bypass HazardRec virtual calls.
CurrCycle = NextCycle;
- }
- else {
+ } else {
// Bypass getHazardType calls in case of long latency.
for (; CurrCycle != NextCycle; ++CurrCycle) {
if (isTop())
@@ -2074,8 +2168,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
// If we stall for any reason, bump the cycle.
if (NextCycle > CurrCycle) {
bumpCycle(NextCycle);
- }
- else {
+ } else {
// After updating ZoneCritResIdx and ExpectedLatency, check if we're
// resource limited. If a stall occurred, bumpCycle does this.
unsigned LFactor = SchedModel->getLatencyFactor();
@@ -2119,11 +2212,13 @@ void SchedBoundary::releasePending() {
if (checkHazard(SU))
continue;
+ if (Available.size() >= ReadyListLimit)
+ break;
+
Available.push(SU);
Pending.remove(Pending.begin()+i);
--i; --e;
}
- DEBUG(if (!Pending.empty()) Pending.dump());
CheckPending = false;
}
@@ -2163,6 +2258,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {
bumpCycle(CurrCycle + 1);
releasePending();
}
+
+ DEBUG(Pending.dump());
+ DEBUG(Available.dump());
+
if (Available.size() == 1)
return *Available.begin();
return nullptr;
@@ -2177,8 +2276,7 @@ void SchedBoundary::dumpScheduledState() {
if (ZoneCritResIdx) {
ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
ResCount = getResourceCount(ZoneCritResIdx);
- }
- else {
+ } else {
ResFactor = SchedModel->getMicroOpFactor();
ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
}
@@ -2218,8 +2316,7 @@ initResourceDelta(const ScheduleDAGMI *DAG,
/// Set the CandPolicy given a scheduling zone given the current resources and
/// latencies inside and outside the zone.
-void GenericSchedulerBase::setPolicy(CandPolicy &Policy,
- bool IsPostRA,
+void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
SchedBoundary &CurrZone,
SchedBoundary *OtherZone) {
// Apply preemptive heuristics based on the total latency and resources
@@ -2295,7 +2392,8 @@ const char *GenericSchedulerBase::getReasonStr(
GenericSchedulerBase::CandReason Reason) {
switch (Reason) {
case NoCand: return "NOCAND ";
- case PhysRegCopy: return "PREG-COPY";
+ case Only1: return "ONLY1 ";
+ case PhysRegCopy: return "PREG-COPY ";
case RegExcess: return "REG-EXCESS";
case RegCritical: return "REG-CRIT ";
case Stall: return "STALL ";
@@ -2381,7 +2479,6 @@ static bool tryLess(int TryVal, int CandVal,
Cand.Reason = Reason;
return true;
}
- Cand.setRepeat(Reason);
return false;
}
@@ -2398,7 +2495,6 @@ static bool tryGreater(int TryVal, int CandVal,
Cand.Reason = Reason;
return true;
}
- Cand.setRepeat(Reason);
return false;
}
@@ -2414,8 +2510,7 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
TryCand, Cand, GenericSchedulerBase::TopPathReduce))
return true;
- }
- else {
+ } else {
if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
@@ -2428,10 +2523,13 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
return false;
}
-static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
- bool IsTop) {
+static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
- << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n');
+ << GenericSchedulerBase::getReasonStr(Reason) << '\n');
+}
+
+static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
+ tracePick(Cand.Reason, Cand.AtTop);
}
void GenericScheduler::initialize(ScheduleDAGMI *dag) {
@@ -2460,6 +2558,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
Itin, DAG);
}
+ TopCand.SU = nullptr;
+ BotCand.SU = nullptr;
}
/// Initialize the per-region scheduling policy.
@@ -2487,8 +2587,7 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
RegionPolicy.OnlyBottomUp = true;
// Allow the subtarget to override default policy.
- MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Begin, End,
- NumRegionInstrs);
+ MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
// After subtarget overrides, apply command line options.
if (!EnableRegPressure)
@@ -2582,19 +2681,25 @@ static bool tryPressure(const PressureChange &TryP,
GenericSchedulerBase::CandReason Reason,
const TargetRegisterInfo *TRI,
const MachineFunction &MF) {
- unsigned TryPSet = TryP.getPSetOrMax();
- unsigned CandPSet = CandP.getPSetOrMax();
- // If both candidates affect the same set, go with the smallest increase.
- if (TryPSet == CandPSet) {
- return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
- Reason);
- }
// If one candidate decreases and the other increases, go with it.
// Invalid candidates have UnitInc==0.
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
Reason)) {
return true;
}
+ // Do not compare the magnitude of pressure changes between top and bottom
+ // boundaries.
+ if (Cand.AtTop != TryCand.AtTop)
+ return false;
+
+ // If both candidates affect the same set in the same boundary, go with the
+ // smallest increase.
+ unsigned TryPSet = TryP.getPSetOrMax();
+ unsigned CandPSet = CandP.getPSetOrMax();
+ if (TryPSet == CandPSet) {
+ return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
+ Reason);
+ }
int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
std::numeric_limits<int>::max();
@@ -2640,64 +2745,64 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
return 0;
}
-/// Apply a set of heursitics to a new candidate. Heuristics are currently
-/// hierarchical. This may be more efficient than a graduated cost model because
-/// we don't need to evaluate all aspects of the model for each node in the
-/// queue. But it's really done to make the heuristics easier to debug and
-/// statistically analyze.
-///
-/// \param Cand provides the policy and current best candidate.
-/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
-/// \param Zone describes the scheduled zone that we are extending.
-/// \param RPTracker describes reg pressure within the scheduled zone.
-/// \param TempTracker is a scratch pressure tracker to reuse in queries.
-void GenericScheduler::tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- RegPressureTracker &TempTracker) {
-
+void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+ Cand.SU = SU;
+ Cand.AtTop = AtTop;
if (DAG->isTrackingPressure()) {
- // Always initialize TryCand's RPDelta.
- if (Zone.isTop()) {
+ if (AtTop) {
TempTracker.getMaxDownwardPressureDelta(
- TryCand.SU->getInstr(),
- TryCand.RPDelta,
+ Cand.SU->getInstr(),
+ Cand.RPDelta,
DAG->getRegionCriticalPSets(),
DAG->getRegPressure().MaxSetPressure);
- }
- else {
+ } else {
if (VerifyScheduling) {
TempTracker.getMaxUpwardPressureDelta(
- TryCand.SU->getInstr(),
- &DAG->getPressureDiff(TryCand.SU),
- TryCand.RPDelta,
+ Cand.SU->getInstr(),
+ &DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
DAG->getRegionCriticalPSets(),
DAG->getRegPressure().MaxSetPressure);
- }
- else {
+ } else {
RPTracker.getUpwardPressureDelta(
- TryCand.SU->getInstr(),
- DAG->getPressureDiff(TryCand.SU),
- TryCand.RPDelta,
+ Cand.SU->getInstr(),
+ DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
DAG->getRegionCriticalPSets(),
DAG->getRegPressure().MaxSetPressure);
}
}
}
- DEBUG(if (TryCand.RPDelta.Excess.isValid())
- dbgs() << " Try SU(" << TryCand.SU->NodeNum << ") "
- << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
- << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
+ DEBUG(if (Cand.RPDelta.Excess.isValid())
+ dbgs() << " Try SU(" << Cand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet())
+ << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n");
+}
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+/// if Cand is from a different zone than TryCand.
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return;
}
- if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
- biasPhysRegCopy(Cand.SU, Zone.isTop()),
+ if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop),
+ biasPhysRegCopy(Cand.SU, Cand.AtTop),
TryCand, Cand, PhysRegCopy))
return;
@@ -2715,17 +2820,26 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
DAG->MF))
return;
- // For loops that are acyclic path limited, aggressively schedule for latency.
- // This can result in very long dependence chains scheduled in sequence, so
- // once every cycle (when CurrMOps == 0), switch to normal heuristics.
- if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps()
- && tryLatency(TryCand, Cand, Zone))
- return;
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable; in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. This can result in very long dependence chains scheduled in
+ // sequence, so once every cycle (when CurrMOps == 0), switch to normal
+ // heuristics.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return;
- // Prioritize instructions that read unbuffered resources by stall cycles.
- if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
- Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
- return;
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+ }
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
@@ -2733,18 +2847,23 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
- const SUnit *NextClusterSU =
- Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
- if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU,
TryCand, Cand, Cluster))
return;
- // Weak edges are for clustering and other constraints.
- if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
- getWeakLeft(Cand.SU, Zone.isTop()),
- TryCand, Cand, Weak)) {
- return;
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop),
+ TryCand, Cand, Weak))
+ return;
}
+
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
Cand.RPDelta.CurrentMax,
@@ -2752,34 +2871,35 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
DAG->MF))
return;
- // Avoid critical resource consumption and balance the schedule.
- TryCand.initResourceDelta(DAG, SchedModel);
- if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
- TryCand, Cand, ResourceReduce))
- return;
- if (tryGreater(TryCand.ResDelta.DemandedResources,
- Cand.ResDelta.DemandedResources,
- TryCand, Cand, ResourceDemand))
- return;
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
- // Avoid serializing long latency dependence chains.
- // For acyclic path limited loops, latency was already checked above.
- if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency &&
- !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) {
- return;
- }
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return;
- // Prefer immediate defs/users of the last scheduled instruction. This is a
- // local pressure avoidance strategy that also makes the machine code
- // readable.
- if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU),
- TryCand, Cand, NextDefUse))
- return;
+ // Prefer immediate defs/users of the last scheduled instruction. This is a
+ // local pressure avoidance strategy that also makes the machine code
+ // readable.
+ if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU),
+ TryCand, Cand, NextDefUse))
+ return;
- // Fall through to original instruction order.
- if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
- || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
- TryCand.Reason = NodeOrder;
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ }
}
}
@@ -2789,20 +2909,20 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
- ReadyQueue &Q = Zone.Available;
-
- DEBUG(Q.dump());
-
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+ ReadyQueue &Q = Zone.Available;
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
- SchedCandidate TryCand(Cand.Policy);
- TryCand.SU = *I;
- tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
@@ -2819,57 +2939,77 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
IsTopNode = false;
- DEBUG(dbgs() << "Pick Bot ONLY1\n");
+ tracePick(Only1, false);
return SU;
}
if (SUnit *SU = Top.pickOnlyChoice()) {
IsTopNode = true;
- DEBUG(dbgs() << "Pick Top ONLY1\n");
+ tracePick(Only1, true);
return SU;
}
- CandPolicy NoPolicy;
- SchedCandidate BotCand(NoPolicy);
- SchedCandidate TopCand(NoPolicy);
// Set the bottom-up policy based on the state of the current bottom zone and
// the instructions outside the zone, including the top zone.
- setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top);
+ CandPolicy BotPolicy;
+ setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
// the instructions outside the zone, including the bottom zone.
- setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot);
-
- // Prefer bottom scheduling when heuristics are silent.
- pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
- assert(BotCand.Reason != NoCand && "failed to find the first candidate");
-
- // If either Q has a single candidate that provides the least increase in
- // Excess pressure, we can immediately schedule from that Q.
- //
- // RegionCriticalPSets summarizes the pressure within the scheduled region and
- // affects picking from either Q. If scheduling in one direction must
- // increase pressure for one of the excess PSets, then schedule in that
- // direction first to provide more freedom in the other direction.
- if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess))
- || (BotCand.Reason == RegCritical
- && !BotCand.isRepeat(RegCritical)))
- {
- IsTopNode = false;
- tracePick(BotCand, IsTopNode);
- return BotCand.SU;
+ CandPolicy TopPolicy;
+ setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+ // See if BotCand is still valid (because we previously scheduled from Top).
+ DEBUG(dbgs() << "Picking from Bot:\n");
+ if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+ BotCand.Policy != BotPolicy) {
+ BotCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ assert(TCand.SU == BotCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
}
+
// Check if the top Q has a better candidate.
- pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
- assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ DEBUG(dbgs() << "Picking from Top:\n");
+ if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+ TopCand.Policy != TopPolicy) {
+ TopCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ assert(TCand.SU == TopCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
+ }
- // Choose the queue with the most important (lowest enum) reason.
- if (TopCand.Reason < BotCand.Reason) {
- IsTopNode = true;
- tracePick(TopCand, IsTopNode);
- return TopCand.SU;
+ // Pick best from BotCand and TopCand.
+ assert(BotCand.isValid());
+ assert(TopCand.isValid());
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
+ DEBUG(traceCandidate(Cand));
}
- // Otherwise prefer the bottom candidate, in node order if all else failed.
- IsTopNode = false;
- tracePick(BotCand, IsTopNode);
- return BotCand.SU;
+
+ IsTopNode = Cand.AtTop;
+ tracePick(Cand);
+ return Cand.SU;
}
/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
@@ -2885,27 +3025,25 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
SU = Top.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
- SchedCandidate TopCand(NoPolicy);
- pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
- tracePick(TopCand, true);
+ tracePick(TopCand);
SU = TopCand.SU;
}
IsTopNode = true;
- }
- else if (RegionPolicy.OnlyBottomUp) {
+ } else if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
- SchedCandidate BotCand(NoPolicy);
- pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
- tracePick(BotCand, false);
+ tracePick(BotCand);
SU = BotCand.SU;
}
IsTopNode = false;
- }
- else {
+ } else {
SU = pickNodeBidirectional(IsTopNode);
}
} while (SU->isScheduled);
@@ -2957,8 +3095,7 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysRegCopies(SU, true);
- }
- else {
+ } else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
@@ -2976,8 +3113,12 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
// data and pass it to later mutations. Have a single mutation that gathers
// the interesting nodes in one pass.
DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI));
- if (EnableLoadCluster && DAG->TII->enableClusterLoads())
- DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+ if (EnableMemOpCluster) {
+ if (DAG->TII->enableClusterLoads())
+ DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+ if (DAG->TII->enableClusterStores())
+ DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));
+ }
if (EnableMacroFusion)
DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
return DAG;
@@ -3065,12 +3206,10 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
ReadyQueue &Q = Top.Available;
-
- DEBUG(Q.dump());
-
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
SchedCandidate TryCand(Cand.Policy);
TryCand.SU = *I;
+ TryCand.AtTop = true;
TryCand.initResourceDelta(DAG, SchedModel);
tryCandidate(Cand, TryCand);
if (TryCand.Reason != NoCand) {
@@ -3089,7 +3228,9 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
SUnit *SU;
do {
SU = Top.pickOnlyChoice();
- if (!SU) {
+ if (SU) {
+ tracePick(Only1, true);
+ } else {
CandPolicy NoPolicy;
SchedCandidate TopCand(NoPolicy);
// Set the top-down policy based on the state of the current top zone and
@@ -3097,7 +3238,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
pickNodeFromQueue(TopCand);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
- tracePick(TopCand, true);
+ tracePick(TopCand);
SU = TopCand.SU;
}
} while (SU->isScheduled);
@@ -3285,8 +3426,7 @@ public:
TopQ.pop();
} while (SU->isScheduled);
IsTopNode = true;
- }
- else {
+ } else {
do {
if (BottomQ.empty()) return nullptr;
SU = BottomQ.top();
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 5e6d6190c638..571a5c1d8005 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -104,7 +105,7 @@ namespace {
private:
bool ProcessBlock(MachineBasicBlock &MBB);
- bool isWorthBreakingCriticalEdge(MachineInstr *MI,
+ bool isWorthBreakingCriticalEdge(MachineInstr &MI,
MachineBasicBlock *From,
MachineBasicBlock *To);
/// \brief Postpone the splitting of the given critical
@@ -119,27 +120,27 @@ namespace {
///
/// \return True if the edge is marked as toSplit, false otherwise.
/// False can be returned if, for instance, this is not profitable.
- bool PostponeSplitCriticalEdge(MachineInstr *MI,
+ bool PostponeSplitCriticalEdge(MachineInstr &MI,
MachineBasicBlock *From,
MachineBasicBlock *To,
bool BreakPHIEdge);
- bool SinkInstruction(MachineInstr *MI, bool &SawStore,
+ bool SinkInstruction(MachineInstr &MI, bool &SawStore,
AllSuccsCache &AllSuccessors);
bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB,
MachineBasicBlock *DefMBB,
bool &BreakPHIEdge, bool &LocalUse) const;
- MachineBasicBlock *FindSuccToSinkTo(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
bool &BreakPHIEdge, AllSuccsCache &AllSuccessors);
- bool isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
+ bool isProfitableToSinkTo(unsigned Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *SuccToSinkTo,
AllSuccsCache &AllSuccessors);
- bool PerformTrivialForwardCoalescing(MachineInstr *MI,
+ bool PerformTrivialForwardCoalescing(MachineInstr &MI,
MachineBasicBlock *MBB);
SmallVector<MachineBasicBlock *, 4> &
- GetAllSortedSuccessors(MachineInstr *MI, MachineBasicBlock *MBB,
+ GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;
};
} // end anonymous namespace
@@ -154,13 +155,13 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MachineSinking, "machine-sink",
"Machine code sinking", false, false)
-bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr *MI,
+bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
MachineBasicBlock *MBB) {
- if (!MI->isCopy())
+ if (!MI.isCopy())
return false;
- unsigned SrcReg = MI->getOperand(1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
!TargetRegisterInfo::isVirtualRegister(DstReg) ||
!MRI->hasOneNonDBGUse(SrcReg))
@@ -175,9 +176,9 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr *MI,
if (DefMI->isCopyLike())
return false;
DEBUG(dbgs() << "Coalescing: " << *DefMI);
- DEBUG(dbgs() << "*** to: " << *MI);
+ DEBUG(dbgs() << "*** to: " << MI);
MRI->replaceRegWith(DstReg, SrcReg);
- MI->eraseFromParent();
+ MI.eraseFromParent();
// Conservatively, clear any kill flags, since it's possible that they are no
// longer correct.
@@ -256,7 +257,7 @@ MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
}
bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
DEBUG(dbgs() << "******** Machine Sinking ********\n");
@@ -283,7 +284,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
// If we have anything we marked as toSplit, split it now.
for (auto &Pair : ToSplit) {
- auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, this);
+ auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, *this);
if (NewSucc != nullptr) {
DEBUG(dbgs() << " *** Splitting critical edge:"
" BB#" << Pair.first->getNumber()
@@ -326,7 +327,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
--I;
bool ProcessedBegin, SawStore = false;
do {
- MachineInstr *MI = I; // The instruction to sink.
+ MachineInstr &MI = *I; // The instruction to sink.
// Predecrement I (if it's not begin) so that it isn't invalidated by
// sinking.
@@ -334,7 +335,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
if (!ProcessedBegin)
--I;
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
continue;
bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
@@ -343,8 +344,10 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
continue;
}
- if (SinkInstruction(MI, SawStore, AllSuccessors))
- ++NumSunk, MadeChange = true;
+ if (SinkInstruction(MI, SawStore, AllSuccessors)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
// If we just processed the first instruction in the block, we're done.
} while (!ProcessedBegin);
@@ -352,7 +355,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
return MadeChange;
}
-bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
+bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI,
MachineBasicBlock *From,
MachineBasicBlock *To) {
// FIXME: Need much better heuristics.
@@ -363,14 +366,14 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
if (!CEBCandidates.insert(std::make_pair(From, To)).second)
return true;
- if (!MI->isCopy() && !TII->isAsCheapAsAMove(MI))
+ if (!MI.isCopy() && !TII->isAsCheapAsAMove(MI))
return true;
// MI is cheap, we probably don't want to break the critical edge for it.
// However, if this would allow some definitions of its source operands
// to be sunk then it's probably worth it.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
unsigned Reg = MO.getReg();
@@ -391,7 +394,7 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
// If definition resides elsewhere, we aren't
// blocking it from being sunk so don't break the edge.
MachineInstr *DefMI = MRI->getVRegDef(Reg);
- if (DefMI->getParent() == MI->getParent())
+ if (DefMI->getParent() == MI.getParent())
return true;
}
}
@@ -399,7 +402,7 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
return false;
}
-bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr *MI,
+bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
MachineBasicBlock *FromBB,
MachineBasicBlock *ToBB,
bool BreakPHIEdge) {
@@ -469,35 +472,30 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr *MI,
return true;
}
-static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) {
- return MI->isInsertSubreg() || MI->isSubregToReg() || MI->isRegSequence();
-}
-
/// collectDebgValues - Scan instructions following MI and collect any
/// matching DBG_VALUEs.
-static void collectDebugValues(MachineInstr *MI,
+static void collectDebugValues(MachineInstr &MI,
SmallVectorImpl<MachineInstr *> &DbgValues) {
DbgValues.clear();
- if (!MI->getOperand(0).isReg())
+ if (!MI.getOperand(0).isReg())
return;
MachineBasicBlock::iterator DI = MI; ++DI;
- for (MachineBasicBlock::iterator DE = MI->getParent()->end();
+ for (MachineBasicBlock::iterator DE = MI.getParent()->end();
DI != DE; ++DI) {
if (!DI->isDebugValue())
return;
if (DI->getOperand(0).isReg() &&
- DI->getOperand(0).getReg() == MI->getOperand(0).getReg())
- DbgValues.push_back(DI);
+ DI->getOperand(0).getReg() == MI.getOperand(0).getReg())
+ DbgValues.push_back(&*DI);
}
}
/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
-bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
+bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *SuccToSinkTo,
AllSuccsCache &AllSuccessors) {
- assert (MI && "Invalid MachineInstr!");
assert (SuccToSinkTo && "Invalid SinkTo Candidate BB");
if (MBB == SuccToSinkTo)
@@ -538,7 +536,7 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
/// Get the sorted sequence of successors for this MachineBasicBlock, possibly
/// computing it if it was not already cached.
SmallVector<MachineBasicBlock *, 4> &
-MachineSinking::GetAllSortedSuccessors(MachineInstr *MI, MachineBasicBlock *MBB,
+MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const {
// Do we have the sorted successors in cache ?
@@ -560,7 +558,7 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr *MI, MachineBasicBlock *MBB,
DT->getNode(MBB)->getChildren();
for (const auto &DTChild : Children)
// DomTree children of MBB that have MBB as immediate dominator are added.
- if (DTChild->getIDom()->getBlock() == MI->getParent() &&
+ if (DTChild->getIDom()->getBlock() == MI.getParent() &&
// Skip MBBs already added to the AllSuccs vector above.
!MBB->isSuccessor(DTChild->getBlock()))
AllSuccs.push_back(DTChild->getBlock());
@@ -582,12 +580,10 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr *MI, MachineBasicBlock *MBB,
}
/// FindSuccToSinkTo - Find a successor to sink this instruction to.
-MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
- MachineBasicBlock *MBB,
- bool &BreakPHIEdge,
- AllSuccsCache &AllSuccessors) {
-
- assert (MI && "Invalid MachineInstr!");
+MachineBasicBlock *
+MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool &BreakPHIEdge,
+ AllSuccsCache &AllSuccessors) {
assert (MBB && "Invalid MachineBasicBlock!");
// Loop over all the operands of the specified instruction. If there is
@@ -596,8 +592,8 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
// SuccToSinkTo - This is the successor to sink this instruction to, once we
// decide.
MachineBasicBlock *SuccToSinkTo = nullptr;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue; // Ignore non-register operands.
unsigned Reg = MO.getReg();
@@ -673,22 +669,70 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
return SuccToSinkTo;
}
+/// \brief Return true if MI is likely to be usable as a memory operation by the
+/// implicit null check optimization.
+///
+/// This is a "best effort" heuristic, and should not be relied upon for
+/// correctness. A true result does not guarantee that the implicit null
+/// check optimization is legal over MI, and a false result does not
+/// guarantee that MI cannot possibly be used to do a null check.
+static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ typedef TargetInstrInfo::MachineBranchPredicate MachineBranchPredicate;
+
+ auto *MBB = MI.getParent();
+ if (MBB->pred_size() != 1)
+ return false;
+
+ auto *PredMBB = *MBB->pred_begin();
+ auto *PredBB = PredMBB->getBasicBlock();
+
+ // Frontends that don't use implicit null checks have no reason to emit
+ // branches with make.implicit metadata, and this function should always
+ // return false for them.
+ if (!PredBB ||
+ !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit))
+ return false;
+
+ unsigned BaseReg;
+ int64_t Offset;
+ if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+ return false;
+
+ if (!(MI.mayLoad() && !MI.isPredicable()))
+ return false;
+
+ MachineBranchPredicate MBP;
+ if (TII->analyzeBranchPredicate(*PredMBB, MBP, false))
+ return false;
+
+ return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
+ (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
+ MBP.Predicate == MachineBranchPredicate::PRED_EQ) &&
+ MBP.LHS.getReg() == BaseReg;
+}
+
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
-bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
+bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
AllSuccsCache &AllSuccessors) {
- // Don't sink insert_subreg, subreg_to_reg, reg_sequence. These are meant to
- // be close to the source to make it easier to coalesce.
- if (AvoidsSinking(MI, MRI))
+ // Don't sink instructions that the target prefers not to sink.
+ if (!TII->shouldSink(MI))
return false;
// Check if it's safe to move the instruction.
- if (!MI->isSafeToMove(AA, SawStore))
+ if (!MI.isSafeToMove(AA, SawStore))
return false;
// Convergent operations may not be made control-dependent on additional
// values.
- if (MI->isConvergent())
+ if (MI.isConvergent())
+ return false;
+
+ // Don't break implicit null checks. This is a performance heuristic, and not
+ // required for correctness.
+ if (SinkingPreventsImplicitNullCheck(MI, TII, TRI))
return false;
// FIXME: This should include support for sinking instructions within the
@@ -700,7 +744,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
// and z and only shrink the live range of x.
bool BreakPHIEdge = false;
- MachineBasicBlock *ParentBlock = MI->getParent();
+ MachineBasicBlock *ParentBlock = MI.getParent();
MachineBasicBlock *SuccToSinkTo =
FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
@@ -712,8 +756,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
// If the instruction to move defines a dead physical register which is live
// when leaving the basic block, don't move it because it could turn into a
// "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = MI->getOperand(I);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
@@ -721,7 +765,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
return false;
}
- DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
+ DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
// If the block has multiple predecessors, this is a critical edge.
// Decide if we can sink along it or need to break the edge.
@@ -730,7 +774,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
// other code paths.
bool TryBreak = false;
bool store = true;
- if (!MI->isSafeToMove(AA, store)) {
+ if (!MI.isSafeToMove(AA, store)) {
DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}
@@ -804,7 +848,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore,
// Note that we have to clear the kill flags for any register this instruction
// uses as we may sink over another instruction which currently kills the
// used registers.
- for (MachineOperand &MO : MI->operands()) {
+ for (MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
RegsToClearKillFlags.set(MO.getReg()); // Remember to clear kill flags.
}
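The sinking filter that used to live in the removed AvoidsSinking() helper is now delegated to the TargetInstrInfo::shouldSink() hook called above. As a minimal sketch, a target could express the old behaviour through that hook roughly as follows; MyTargetInstrInfo is hypothetical and the virtual signature is assumed from the call site:

    bool MyTargetInstrInfo::shouldSink(const MachineInstr &MI) const {
      // Keep subregister bookkeeping (INSERT_SUBREG, SUBREG_TO_REG,
      // REG_SEQUENCE) close to its source so it coalesces easily; this mirrors
      // the removed AvoidsSinking() check.
      if (MI.isInsertSubreg() || MI.isSubregToReg() || MI.isRegSequence())
        return false;
      return true;
    }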
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index f7edacd5ebaf..86332c8a93a5 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -328,8 +328,10 @@ MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) {
continue;
// Pick the predecessor that would give this block the smallest InstrDepth.
unsigned Depth = PredTBI->InstrDepth + CurCount;
- if (!Best || Depth < BestDepth)
- Best = Pred, BestDepth = Depth;
+ if (!Best || Depth < BestDepth) {
+ Best = Pred;
+ BestDepth = Depth;
+ }
}
return Best;
}
@@ -356,8 +358,10 @@ MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
continue;
// Pick the successor that would give this block the smallest InstrHeight.
unsigned Height = SuccTBI->InstrHeight;
- if (!Best || Height < BestHeight)
- Best = Succ, BestHeight = Height;
+ if (!Best || Height < BestHeight) {
+ Best = Succ;
+ BestHeight = Height;
+ }
}
return Best;
}
@@ -621,16 +625,16 @@ struct DataDep {
// Get the input data dependencies that must be ready before UseMI can issue.
// Return true if UseMI has any physreg operands.
-static bool getDataDeps(const MachineInstr *UseMI,
+static bool getDataDeps(const MachineInstr &UseMI,
SmallVectorImpl<DataDep> &Deps,
const MachineRegisterInfo *MRI) {
// Debug values should not be included in any calculations.
- if (UseMI->isDebugValue())
+ if (UseMI.isDebugValue())
return false;
bool HasPhysRegs = false;
- for (MachineInstr::const_mop_iterator I = UseMI->operands_begin(),
- E = UseMI->operands_end(); I != E; ++I) {
+ for (MachineInstr::const_mop_iterator I = UseMI.operands_begin(),
+ E = UseMI.operands_end(); I != E; ++I) {
const MachineOperand &MO = *I;
if (!MO.isReg())
continue;
@@ -643,7 +647,7 @@ static bool getDataDeps(const MachineInstr *UseMI,
}
// Collect virtual register reads.
if (MO.readsReg())
- Deps.push_back(DataDep(MRI, Reg, UseMI->getOperandNo(I)));
+ Deps.push_back(DataDep(MRI, Reg, UseMI.getOperandNo(I)));
}
return HasPhysRegs;
}
@@ -651,17 +655,17 @@ static bool getDataDeps(const MachineInstr *UseMI,
// Get the input data dependencies of a PHI instruction, using Pred as the
// preferred predecessor.
// This will add at most one dependency to Deps.
-static void getPHIDeps(const MachineInstr *UseMI,
+static void getPHIDeps(const MachineInstr &UseMI,
SmallVectorImpl<DataDep> &Deps,
const MachineBasicBlock *Pred,
const MachineRegisterInfo *MRI) {
// No predecessor at the beginning of a trace. Ignore dependencies.
if (!Pred)
return;
- assert(UseMI->isPHI() && UseMI->getNumOperands() % 2 && "Bad PHI");
- for (unsigned i = 1; i != UseMI->getNumOperands(); i += 2) {
- if (UseMI->getOperand(i + 1).getMBB() == Pred) {
- unsigned Reg = UseMI->getOperand(i).getReg();
+ assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
+ for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
+ if (UseMI.getOperand(i + 1).getMBB() == Pred) {
+ unsigned Reg = UseMI.getOperand(i).getReg();
Deps.push_back(DataDep(MRI, Reg, i));
return;
}
@@ -823,8 +827,8 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
// Collect all data dependencies.
Deps.clear();
if (UseMI.isPHI())
- getPHIDeps(&UseMI, Deps, TBI.Pred, MTM.MRI);
- else if (getDataDeps(&UseMI, Deps, MTM.MRI))
+ getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI);
+ else if (getDataDeps(UseMI, Deps, MTM.MRI))
updatePhysDepsDownwards(&UseMI, Deps, RegUnits, MTM.TRI);
// Filter and process dependencies, computing the earliest issue cycle.
@@ -861,15 +865,16 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
// Identify physreg dependencies for MI when scanning instructions upwards.
// Return the issue height of MI after considering any live regunits.
// Height is the issue height computed from virtual register dependencies alone.
-static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
+static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
SparseSet<LiveRegUnit> &RegUnits,
const TargetSchedModel &SchedModel,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
SmallVector<unsigned, 8> ReadOps;
- for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
+ MOE = MI.operands_end();
+ MOI != MOE; ++MOI) {
const MachineOperand &MO = *MOI;
if (!MO.isReg())
continue;
@@ -877,7 +882,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
if (!TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
if (MO.readsReg())
- ReadOps.push_back(MI->getOperandNo(MOI));
+ ReadOps.push_back(MI.getOperandNo(MOI));
if (!MO.isDef())
continue;
// This is a def of Reg. Remove corresponding entries from RegUnits, and
@@ -887,11 +892,11 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
if (I == RegUnits.end())
continue;
unsigned DepHeight = I->Cycle;
- if (!MI->isTransient()) {
+ if (!MI.isTransient()) {
// We may not know the UseMI of this dependency, if it came from the
// live-in list. SchedModel can handle a NULL UseMI.
- DepHeight += SchedModel
- .computeOperandLatency(MI, MI->getOperandNo(MOI), I->MI, I->Op);
+ DepHeight += SchedModel.computeOperandLatency(&MI, MI.getOperandNo(MOI),
+ I->MI, I->Op);
}
Height = std::max(Height, DepHeight);
// This regunit is dead above MI.
@@ -901,13 +906,13 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
// Now we know the height of MI. Update any regunits read.
for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) {
- unsigned Reg = MI->getOperand(ReadOps[i]).getReg();
+ unsigned Reg = MI.getOperand(ReadOps[i]).getReg();
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
LiveRegUnit &LRU = RegUnits[*Units];
// Set the height to the highest reader of the unit.
- if (LRU.Cycle <= Height && LRU.MI != MI) {
+ if (LRU.Cycle <= Height && LRU.MI != &MI) {
LRU.Cycle = Height;
- LRU.MI = MI;
+ LRU.MI = &MI;
LRU.Op = ReadOps[i];
}
}
@@ -921,15 +926,14 @@ typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
// Push the height of DefMI upwards if required to match UseMI.
// Return true if this is the first time DefMI was seen.
-static bool pushDepHeight(const DataDep &Dep,
- const MachineInstr *UseMI, unsigned UseHeight,
- MIHeightMap &Heights,
+static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
+ unsigned UseHeight, MIHeightMap &Heights,
const TargetSchedModel &SchedModel,
const TargetInstrInfo *TII) {
// Adjust height by Dep.DefMI latency.
if (!Dep.DefMI->isTransient())
- UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
- UseMI, Dep.UseOp);
+ UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI,
+ Dep.UseOp);
// Update Heights[DefMI] to be the maximum height seen.
MIHeightMap::iterator I;
@@ -1048,13 +1052,13 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
if (!PHI.isPHI())
break;
Deps.clear();
- getPHIDeps(&PHI, Deps, MBB, MTM.MRI);
+ getPHIDeps(PHI, Deps, MBB, MTM.MRI);
if (!Deps.empty()) {
// Loop header PHI heights are all 0.
unsigned Height = TBI.Succ ? Cycles.lookup(&PHI).Height : 0;
DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI);
- if (pushDepHeight(Deps.front(), &PHI, Height,
- Heights, MTM.SchedModel, MTM.TII))
+ if (pushDepHeight(Deps.front(), PHI, Height, Heights, MTM.SchedModel,
+ MTM.TII))
addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
}
}
@@ -1063,12 +1067,12 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
// Go through the block backwards.
for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin();
BI != BB;) {
- const MachineInstr *MI = --BI;
+ const MachineInstr &MI = *--BI;
// Find the MI height as determined by virtual register uses in the
// trace below.
unsigned Cycle = 0;
- MIHeightMap::iterator HeightI = Heights.find(MI);
+ MIHeightMap::iterator HeightI = Heights.find(&MI);
if (HeightI != Heights.end()) {
Cycle = HeightI->second;
// We won't be seeing any more MI uses.
@@ -1078,27 +1082,27 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
// Don't process PHI deps. They depend on the specific predecessor, and
// we'll get them when visiting the predecessor.
Deps.clear();
- bool HasPhysRegs = !MI->isPHI() && getDataDeps(MI, Deps, MTM.MRI);
+ bool HasPhysRegs = !MI.isPHI() && getDataDeps(MI, Deps, MTM.MRI);
// There may also be regunit dependencies to include in the height.
if (HasPhysRegs)
- Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits,
- MTM.SchedModel, MTM.TII, MTM.TRI);
+ Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits, MTM.SchedModel,
+ MTM.TII, MTM.TRI);
// Update the required height of any virtual registers read by MI.
for (const DataDep &Dep : Deps)
if (pushDepHeight(Dep, MI, Cycle, Heights, MTM.SchedModel, MTM.TII))
addLiveIns(Dep.DefMI, Dep.DefOp, Stack);
- InstrCycles &MICycles = Cycles[MI];
+ InstrCycles &MICycles = Cycles[&MI];
MICycles.Height = Cycle;
if (!TBI.HasValidInstrDepths) {
- DEBUG(dbgs() << Cycle << '\t' << *MI);
+ DEBUG(dbgs() << Cycle << '\t' << MI);
continue;
}
// Update critical path length.
TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
- DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *MI);
+ DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << MI);
}
// Update virtual live-in heights. They were added by addLiveIns() with a 0
@@ -1143,26 +1147,25 @@ MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
}
unsigned
-MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr *MI) const {
- assert(MI && "Not an instruction.");
- assert(getBlockNum() == unsigned(MI->getParent()->getNumber()) &&
+MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr &MI) const {
+ assert(getBlockNum() == unsigned(MI.getParent()->getNumber()) &&
"MI must be in the trace center block");
InstrCycles Cyc = getInstrCycles(MI);
return getCriticalPath() - (Cyc.Depth + Cyc.Height);
}
unsigned
-MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const {
+MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const {
const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum());
SmallVector<DataDep, 1> Deps;
getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI);
assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor");
DataDep &Dep = Deps.front();
- unsigned DepCycle = getInstrCycles(Dep.DefMI).Depth;
+ unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth;
// Add latency if DefMI is a real instruction. Transients get latency 0.
if (!Dep.DefMI->isTransient())
- DepCycle += TE.MTM.SchedModel
- .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp);
+ DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
+ &PHI, Dep.UseOp);
return DepCycle;
}
@@ -1248,13 +1251,13 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
return std::max(Instrs, PRMax);
}
-bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr *DefMI,
- const MachineInstr *UseMI) const {
- if (DefMI->getParent() == UseMI->getParent())
+bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) const {
+ if (DefMI.getParent() == UseMI.getParent())
return true;
- const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI->getParent()->getNumber()];
- const TraceBlockInfo &TBI = TE.BlockInfo[UseMI->getParent()->getNumber()];
+ const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI.getParent()->getNumber()];
+ const TraceBlockInfo &TBI = TE.BlockInfo[UseMI.getParent()->getNumber()];
return DepTBI.isUsefulDominator(TBI);
}
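The MachineTraceMetrics query API now takes MachineInstr references rather than pointers. A caller-side sketch of the updated Trace interface; the surrounding pass that owns the Ensemble and MBB is hypothetical, and only the argument form comes from this patch:

    static void printInstrSlack(MachineTraceMetrics::Ensemble &Ensemble,
                                const MachineBasicBlock &MBB) {
      MachineTraceMetrics::Trace Tr = Ensemble.getTrace(&MBB);
      for (const MachineInstr &MI : MBB) {
        if (MI.isPHI() || MI.isDebugValue())
          continue;                              // PHIs go through getPHIDepth()
        unsigned Slack = Tr.getInstrSlack(MI);   // previously Tr.getInstrSlack(&MI)
        errs() << "slack " << Slack << '\t' << MI;
      }
    }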
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 428295ec2cc6..a70adb050455 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -58,7 +58,7 @@ namespace {
Banner(b)
{}
- bool runOnMachineFunction(MachineFunction &MF);
+ unsigned verify(MachineFunction &MF);
Pass *const PASS;
const char *Banner;
@@ -217,10 +217,22 @@ namespace {
LaneBitmask LaneMask) const;
void report_context(const LiveRange::Segment &S) const;
void report_context(const VNInfo &VNI) const;
+ void report_context(SlotIndex Pos) const;
+ void report_context_liverange(const LiveRange &LR) const;
+ void report_context_lanemask(LaneBitmask LaneMask) const;
+ void report_context_vreg(unsigned VReg) const;
+ void report_context_vreg_regunit(unsigned VRegOrRegUnit) const;
void verifyInlineAsm(const MachineInstr *MI);
void checkLiveness(const MachineOperand *MO, unsigned MONum);
+ void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum,
+ SlotIndex UseIdx, const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask = 0);
+ void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum,
+ SlotIndex DefIdx, const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask = 0);
+
void markReachable(const MachineBasicBlock *MBB);
void calcRegsPassed();
void checkPHIOps(const MachineBasicBlock *MBB);
@@ -239,6 +251,7 @@ namespace {
void verifyStackFrame();
void verifySlotIndexes() const;
+ void verifyProperties(const MachineFunction &MF);
};
struct MachineVerifierPass : public MachineFunctionPass {
@@ -256,7 +269,9 @@ namespace {
}
bool runOnMachineFunction(MachineFunction &MF) override {
- MF.verify(this, Banner.c_str());
+ unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF);
+ if (FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
return false;
}
};
@@ -271,9 +286,13 @@ FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) {
return new MachineVerifierPass(Banner);
}
-void MachineFunction::verify(Pass *p, const char *Banner) const {
- MachineVerifier(p, Banner)
- .runOnMachineFunction(const_cast<MachineFunction&>(*this));
+bool MachineFunction::verify(Pass *p, const char *Banner, bool AbortOnErrors)
+ const {
+ MachineFunction &MF = const_cast<MachineFunction&>(*this);
+ unsigned FoundErrors = MachineVerifier(p, Banner).verify(MF);
+ if (AbortOnErrors && FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
+ return FoundErrors == 0;
}
void MachineVerifier::verifySlotIndexes() const {
@@ -289,7 +308,20 @@ void MachineVerifier::verifySlotIndexes() const {
}
}
-bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
+void MachineVerifier::verifyProperties(const MachineFunction &MF) {
+ // If a pass has introduced virtual registers without clearing the
+ // AllVRegsAllocated property (or set it without allocating the vregs)
+ // then report an error.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::AllVRegsAllocated) &&
+ MRI->getNumVirtRegs()) {
+ report(
+ "Function has AllVRegsAllocated property but there are VReg operands",
+ &MF);
+ }
+}
+
+unsigned MachineVerifier::verify(MachineFunction &MF) {
foundErrors = 0;
this->MF = &MF;
@@ -313,6 +345,8 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
verifySlotIndexes();
+ verifyProperties(MF);
+
visitMachineFunctionBefore();
for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
MFI!=MFE; ++MFI) {
@@ -374,9 +408,6 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
}
visitMachineFunctionAfter();
- if (foundErrors)
- report_fatal_error("Found "+Twine(foundErrors)+" machine code errors.");
-
// Clean up.
regsLive.clear();
regsDefined.clear();
@@ -386,7 +417,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
regsLiveInButUnused.clear();
MBBInfoMap.clear();
- return false; // no changes
+ return foundErrors;
}
void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
@@ -420,8 +451,8 @@ void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
assert(MI);
report(msg, MI->getParent());
errs() << "- instruction: ";
- if (Indexes && Indexes->hasIndex(MI))
- errs() << Indexes->getInstructionIndex(MI) << '\t';
+ if (Indexes && Indexes->hasIndex(*MI))
+ errs() << Indexes->getInstructionIndex(*MI) << '\t';
MI->print(errs(), /*SkipOpers=*/true);
errs() << '\n';
}
@@ -435,16 +466,20 @@ void MachineVerifier::report(const char *msg,
errs() << "\n";
}
+void MachineVerifier::report_context(SlotIndex Pos) const {
+ errs() << "- at: " << Pos << '\n';
+}
+
void MachineVerifier::report_context(const LiveInterval &LI) const {
errs() << "- interval: " << LI << '\n';
}
void MachineVerifier::report_context(const LiveRange &LR, unsigned Reg,
LaneBitmask LaneMask) const {
+ report_context_liverange(LR);
errs() << "- register: " << PrintReg(Reg, TRI) << '\n';
if (LaneMask != 0)
- errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n';
- errs() << "- liverange: " << LR << '\n';
+ report_context_lanemask(LaneMask);
}
void MachineVerifier::report_context(const LiveRange::Segment &S) const {
@@ -455,6 +490,26 @@ void MachineVerifier::report_context(const VNInfo &VNI) const {
errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n";
}
+void MachineVerifier::report_context_liverange(const LiveRange &LR) const {
+ errs() << "- liverange: " << LR << '\n';
+}
+
+void MachineVerifier::report_context_vreg(unsigned VReg) const {
+ errs() << "- v. register: " << PrintReg(VReg, TRI) << '\n';
+}
+
+void MachineVerifier::report_context_vreg_regunit(unsigned VRegOrUnit) const {
+ if (TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) {
+ report_context_vreg(VRegOrUnit);
+ } else {
+ errs() << "- regunit: " << PrintRegUnit(VRegOrUnit, TRI) << '\n';
+ }
+}
+
+void MachineVerifier::report_context_lanemask(LaneBitmask LaneMask) const {
+ errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n';
+}
+
void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
BBInfo &MInfo = MBBInfoMap[MBB];
if (!MInfo.reachable) {
@@ -521,7 +576,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
// it is an entry block or landing pad.
for (const auto &LI : MBB->liveins()) {
if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
- MBB != MBB->getParent()->begin()) {
+ MBB->getIterator() != MBB->getParent()->begin()) {
report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
}
}
@@ -567,8 +622,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
// Call AnalyzeBranch. If it succeeds, there are several more conditions to check.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB),
- TBB, FBB, Cond)) {
+ if (!TII->analyzeBranch(*const_cast<MachineBasicBlock *>(MBB), TBB, FBB,
+ Cond)) {
// Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
// check whether its answers match up with reality.
if (!TBB && !FBB) {
@@ -591,7 +646,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
"differs from its CFG successor!", MBB);
}
if (!MBB->empty() && MBB->back().isBarrier() &&
- !TII->isPredicated(&MBB->back())) {
+ !TII->isPredicated(MBB->back())) {
report("MBB exits via unconditional fall-through but ends with a "
"barrier instruction!", MBB);
}
@@ -721,8 +776,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
// This function gets called for all bundle headers, including normal
// stand-alone unbundled instructions.
void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
- if (Indexes && Indexes->hasIndex(MI)) {
- SlotIndex idx = Indexes->getInstructionIndex(MI);
+ if (Indexes && Indexes->hasIndex(*MI)) {
+ SlotIndex idx = Indexes->getInstructionIndex(*MI);
if (!(idx > lastIndex)) {
report("Instruction index out of order", MI);
errs() << "Last instruction was at " << lastIndex << '\n';
@@ -733,7 +788,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
// Ensure non-terminators don't follow terminators.
// Ignore predicated terminators formed by if conversion.
// FIXME: If conversion shouldn't need to violate this rule.
- if (MI->isTerminator() && !TII->isPredicated(MI)) {
+ if (MI->isTerminator() && !TII->isPredicated(*MI)) {
if (!FirstTerminator)
FirstTerminator = MI;
} else if (FirstTerminator) {
@@ -755,8 +810,9 @@ void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
if (!MI->getOperand(1).isImm())
report("Asm flags must be an immediate", MI);
// Allowed flags are Extra_HasSideEffects = 1, Extra_IsAlignStack = 2,
- // Extra_AsmDialect = 4, Extra_MayLoad = 8, and Extra_MayStore = 16.
- if (!isUInt<5>(MI->getOperand(1).getImm()))
+ // Extra_AsmDialect = 4, Extra_MayLoad = 8, Extra_MayStore = 16,
+ // and Extra_IsConvergent = 32.
+ if (!isUInt<6>(MI->getOperand(1).getImm()))
report("Unknown asm flags", &MI->getOperand(1), 1);
static_assert(InlineAsm::MIOp_FirstOperand == 2, "Asm format changed");
@@ -810,7 +866,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
// Debug values must not have a slot index.
// Other instructions must have one, unless they are inside a bundle.
if (LiveInts) {
- bool mapped = !LiveInts->isNotInMIMap(MI);
+ bool mapped = !LiveInts->isNotInMIMap(*MI);
if (MI->isDebugValue()) {
if (mapped)
report("Debug instruction has a slot index", MI);
@@ -824,7 +880,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
StringRef ErrorInfo;
- if (!TII->verifyInstruction(MI, ErrorInfo))
+ if (!TII->verifyInstruction(*MI, ErrorInfo))
report(ErrorInfo.data(), MI);
}
@@ -929,7 +985,30 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
}
} else {
// Virtual register.
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
+ if (!RC) {
+ // This is a generic virtual register.
+ // It must have a size and it must not have a SubIdx.
+ unsigned Size = MRI->getSize(Reg);
+ if (!Size) {
+ report("Generic virtual register must have a size", MO, MONum);
+ return;
+ }
+ // Make sure the register fits into its register bank if any.
+ const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg);
+ if (RegBank && RegBank->getSize() < Size) {
+ report("Register bank is too small for virtual register", MO,
+ MONum);
+ errs() << "Register bank " << RegBank->getName() << " too small("
+ << RegBank->getSize() << ") to fit " << Size << "-bits\n";
+ return;
+ }
+ if (SubIdx) {
+ report("Generic virtual register does not subregister index", MO, MONum);
+ return;
+ }
+ break;
+ }
if (SubIdx) {
const TargetRegisterClass *SRC =
TRI->getSubClassWithSubReg(RC, SubIdx);
@@ -984,10 +1063,10 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
case MachineOperand::MO_FrameIndex:
if (LiveStks && LiveStks->hasInterval(MO->getIndex()) &&
- LiveInts && !LiveInts->isNotInMIMap(MI)) {
+ LiveInts && !LiveInts->isNotInMIMap(*MI)) {
int FI = MO->getIndex();
LiveInterval &LI = LiveStks->getInterval(FI);
- SlotIndex Idx = LiveInts->getInstructionIndex(MI);
+ SlotIndex Idx = LiveInts->getInstructionIndex(*MI);
bool stores = MI->mayStore();
bool loads = MI->mayLoad();
@@ -1028,6 +1107,83 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
}
}
+void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO,
+ unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, unsigned VRegOrUnit,
+ LaneBitmask LaneMask) {
+ LiveQueryResult LRQ = LR.Query(UseIdx);
+ // Check if we have a segment at the use, note however that we only need one
+ // live subregister range, the others may be dead.
+ if (!LRQ.valueIn() && LaneMask == 0) {
+ report("No live segment at use", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ report_context(UseIdx);
+ }
+ if (MO->isKill() && !LRQ.isKill()) {
+ report("Live range continues after kill flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(UseIdx);
+ }
+}
+
+void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO,
+ unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, unsigned VRegOrUnit,
+ LaneBitmask LaneMask) {
+ if (const VNInfo *VNI = LR.getVNInfoAt(DefIdx)) {
+ assert(VNI && "NULL valno is not allowed");
+ if (VNI->def != DefIdx) {
+ report("Inconsistent valno->def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(*VNI);
+ report_context(DefIdx);
+ }
+ } else {
+ report("No live segment at def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(DefIdx);
+ }
+ // Check that, if the dead def flag is present, LiveInts agree.
+ if (MO->isDead()) {
+ LiveQueryResult LRQ = LR.Query(DefIdx);
+ if (!LRQ.isDeadDef()) {
+ // In case of physregs we can have a non-dead definition on another
+ // operand.
+ bool otherDef = false;
+ if (!TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) {
+ const MachineInstr &MI = *MO->getParent();
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
+ continue;
+ unsigned Reg = MO.getReg();
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ if (*Units == VRegOrUnit) {
+ otherDef = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!otherDef) {
+ report("Live range continues after dead def flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ }
+ }
+ }
+}
+
void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
const MachineInstr *MI = MO->getParent();
const unsigned Reg = MO->getReg();
@@ -1048,23 +1204,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
// Check LiveInts liveness and kill.
- if (LiveInts && !LiveInts->isNotInMIMap(MI)) {
- SlotIndex UseIdx = LiveInts->getInstructionIndex(MI);
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI);
// Check the cached regunit intervals.
if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
- if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units)) {
- LiveQueryResult LRQ = LR->Query(UseIdx);
- if (!LRQ.valueIn()) {
- report("No live segment at use", MO, MONum);
- errs() << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
- << ' ' << *LR << '\n';
- }
- if (MO->isKill() && !LRQ.isKill()) {
- report("Live range continues after kill flag", MO, MONum);
- errs() << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
- }
- }
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units))
+ checkLivenessAtUse(MO, MONum, UseIdx, *LR, *Units);
}
}
@@ -1072,16 +1218,28 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (LiveInts->hasInterval(Reg)) {
// This is a virtual register interval.
const LiveInterval &LI = LiveInts->getInterval(Reg);
- LiveQueryResult LRQ = LI.Query(UseIdx);
- if (!LRQ.valueIn()) {
- report("No live segment at use", MO, MONum);
- errs() << UseIdx << " is not live in " << LI << '\n';
- }
- // Check for extra kill flags.
- // Note that we allow missing kill flags for now.
- if (MO->isKill() && !LRQ.isKill()) {
- report("Live range continues after kill flag", MO, MONum);
- errs() << "Live range: " << LI << '\n';
+ checkLivenessAtUse(MO, MONum, UseIdx, LI, Reg);
+
+ if (LI.hasSubRanges() && !MO->isDef()) {
+ unsigned SubRegIdx = MO->getSubReg();
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ LaneBitmask LiveInMask = 0;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((MOMask & SR.LaneMask) == 0)
+ continue;
+ checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask);
+ LiveQueryResult LRQ = SR.Query(UseIdx);
+ if (LRQ.valueIn())
+ LiveInMask |= SR.LaneMask;
+ }
+ // At least part of the register has to be live at the use.
+ if ((LiveInMask & MOMask) == 0) {
+ report("No live subrange at use", MO, MONum);
+ report_context(LI);
+ report_context(UseIdx);
+ }
}
} else {
report("Virtual register has no live interval", MO, MONum);
@@ -1154,33 +1312,29 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
report("Multiple virtual register defs in SSA form", MO, MONum);
// Check LiveInts for a live segment, but only for virtual registers.
- if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
- !LiveInts->isNotInMIMap(MI)) {
- SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI);
DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
- if (LiveInts->hasInterval(Reg)) {
- const LiveInterval &LI = LiveInts->getInterval(Reg);
- if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) {
- assert(VNI && "NULL valno is not allowed");
- if (VNI->def != DefIdx) {
- report("Inconsistent valno->def", MO, MONum);
- errs() << "Valno " << VNI->id << " is not defined at "
- << DefIdx << " in " << LI << '\n';
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (LiveInts->hasInterval(Reg)) {
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ checkLivenessAtDef(MO, MONum, DefIdx, LI, Reg);
+
+ if (LI.hasSubRanges()) {
+ unsigned SubRegIdx = MO->getSubReg();
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((SR.LaneMask & MOMask) == 0)
+ continue;
+ checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, SR.LaneMask);
+ }
}
} else {
- report("No live segment at def", MO, MONum);
- errs() << DefIdx << " is not live in " << LI << '\n';
- }
- // Check that, if the dead def flag is present, LiveInts agree.
- if (MO->isDead()) {
- LiveQueryResult LRQ = LI.Query(DefIdx);
- if (!LRQ.isDeadDef()) {
- report("Live range continues after dead def flag", MO, MONum);
- errs() << "Live range: " << LI << '\n';
- }
+ report("Virtual register has no Live interval", MO, MONum);
}
- } else {
- report("Virtual register has no Live interval", MO, MONum);
}
}
}
@@ -1360,9 +1514,10 @@ void MachineVerifier::visitMachineFunctionAfter() {
BBInfo &MInfo = MBBInfoMap[&MF->front()];
for (RegSet::iterator
I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
- ++I)
- report("Virtual register def doesn't dominate all uses.",
- MRI->getVRegDef(*I));
+ ++I) {
+ report("Virtual register defs don't dominate all uses.", MF);
+ report_context_vreg(*I);
+ }
}
if (LiveVars)
@@ -1474,7 +1629,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
if (Reg != 0) {
bool hasDef = false;
bool isEarlyClobber = false;
- for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
if (!MOI->isReg() || !MOI->isDef())
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
@@ -1613,18 +1768,33 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
// use, or a dead flag on a def.
bool hasRead = false;
bool hasSubRegDef = false;
- for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+ bool hasDeadDef = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
if (!MOI->isReg() || MOI->getReg() != Reg)
continue;
if (LaneMask != 0 &&
(LaneMask & TRI->getSubRegIndexLaneMask(MOI->getSubReg())) == 0)
continue;
- if (MOI->isDef() && MOI->getSubReg() != 0)
- hasSubRegDef = true;
+ if (MOI->isDef()) {
+ if (MOI->getSubReg() != 0)
+ hasSubRegDef = true;
+ if (MOI->isDead())
+ hasDeadDef = true;
+ }
if (MOI->readsReg())
hasRead = true;
}
- if (!S.end.isDead()) {
+ if (S.end.isDead()) {
+ // Make sure that the corresponding machine operand for a "dead" live
+ // range has the dead flag. We cannot perform this check for subregister
+ // live ranges as partially dead values are allowed.
+ if (LaneMask == 0 && !hasDeadDef) {
+ report("Instruction ending live segment on dead slot has no dead flag",
+ MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ } else {
if (!hasRead) {
// When tracking subregister liveness, the main range must start new
// values on partial register writes, even if there is no read.
@@ -1670,8 +1840,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
- // All predecessors must have a live-out value.
- if (!PVNI) {
+ // All predecessors must have a live-out value if this is not a
+ // subregister liverange.
+ if (!PVNI && LaneMask == 0) {
report("Register not marked live out of predecessor", *PI);
report_context(LR, Reg, LaneMask);
report_context(*VNI);
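Because MachineVerifier::verify() now returns the error count instead of calling report_fatal_error() itself, MachineFunction::verify() can be used in a non-aborting mode. A sketch of that use from inside a hypothetical pass; the pass name and banner string are made up, only the new AbortOnErrors parameter comes from this patch:

    // After a transformation in some runOnMachineFunction():
    if (!MF.verify(this, "After MyPass", /*AbortOnErrors=*/false)) {
      errs() << "MyPass: verifier reported machine code errors\n";
      return false;   // recover locally instead of the fatal error the old API forced
    }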
diff --git a/lib/CodeGen/Makefile b/lib/CodeGen/Makefile
deleted file mode 100644
index 96f7ca585138..000000000000
--- a/lib/CodeGen/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-##===- lib/CodeGen/Makefile --------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMCodeGen
-PARALLEL_DIRS = SelectionDAG AsmPrinter MIRParser
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
-# Xcode prior to 2.4 generates an error in -pedantic mode with use of HUGE_VAL
-# in this directory. Disable -pedantic for this broken compiler.
-ifneq ($(HUGE_VAL_SANITY),yes)
-CompileCommonOpts := $(filter-out -pedantic, $(CompileCommonOpts))
-endif
-
diff --git a/lib/CodeGen/OcamlGC.cpp b/lib/CodeGen/OcamlGC.cpp
deleted file mode 100644
index 17654a6ac3a2..000000000000
--- a/lib/CodeGen/OcamlGC.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-//===-- OcamlGC.cpp - Ocaml frametable GC strategy ------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements lowering for the llvm.gc* intrinsics compatible with
-// Objective Caml 3.10.0, which uses a liveness-accurate static stack map.
-//
-// The frametable emitter is in OcamlGCPrinter.cpp.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/GCs.h"
-#include "llvm/CodeGen/GCStrategy.h"
-
-using namespace llvm;
-
-namespace {
-class OcamlGC : public GCStrategy {
-public:
- OcamlGC();
-};
-}
-
-static GCRegistry::Add<OcamlGC> X("ocaml", "ocaml 3.10-compatible GC");
-
-void llvm::linkOcamlGC() {}
-
-OcamlGC::OcamlGC() {
- NeededSafePoints = 1 << GC::PostCall;
- UsesMetadata = true;
-}
diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index a1042e720c37..0177e414f8d9 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp
@@ -63,7 +63,7 @@ INITIALIZE_PASS(OptimizePHIs, "opt-phis",
"Optimize machine instruction PHIs", false, false)
bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
- if (skipOptnoneFunction(*Fn.getFunction()))
+ if (skipFunction(*Fn.getFunction()))
return false;
MRI = &Fn.getRegInfo();
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 2c937926d0a7..b8d54315d148 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "PHIEliminationUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -25,9 +24,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -121,6 +120,7 @@ INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination",
"Eliminate PHI nodes for register allocation", false, false)
void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
@@ -159,17 +159,16 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
unsigned DefReg = DefMI->getOperand(0).getReg();
if (MRI->use_nodbg_empty(DefReg)) {
if (LIS)
- LIS->RemoveMachineInstrFromMaps(DefMI);
+ LIS->RemoveMachineInstrFromMaps(*DefMI);
DefMI->eraseFromParent();
}
}
// Clean up the lowered PHI instructions.
- for (LoweredPHIMap::iterator I = LoweredPHIs.begin(), E = LoweredPHIs.end();
- I != E; ++I) {
+ for (auto &I : LoweredPHIs) {
if (LIS)
- LIS->RemoveMachineInstrFromMaps(I->first);
- MF.DeleteMachineInstr(I->first);
+ LIS->RemoveMachineInstrFromMaps(*I.first);
+ MF.DeleteMachineInstr(I.first);
}
LoweredPHIs.clear();
@@ -228,7 +227,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
MachineBasicBlock::iterator AfterPHIsIt = std::next(LastPHIIt);
// Unlink the PHI node from the basic block, but don't delete the PHI yet.
- MachineInstr *MPhi = MBB.remove(MBB.begin());
+ MachineInstr *MPhi = MBB.remove(&*MBB.begin());
unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
unsigned DestReg = MPhi->getOperand(0).getReg();
@@ -270,7 +269,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// Update live variable information if there is any.
if (LV) {
- MachineInstr *PHICopy = std::prev(AfterPHIsIt);
+ MachineInstr &PHICopy = *std::prev(AfterPHIsIt);
if (IncomingReg) {
LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
@@ -284,7 +283,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (reusedIncoming)
if (MachineInstr *OldKill = VI.findKill(&MBB)) {
DEBUG(dbgs() << "Remove old kill from " << *OldKill);
- LV->removeVirtualRegisterKilled(IncomingReg, OldKill);
+ LV->removeVirtualRegisterKilled(IncomingReg, *OldKill);
DEBUG(MBB.dump());
}
@@ -298,19 +297,19 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// Since we are going to be deleting the PHI node, if it is the last use of
// any registers, or if the value itself is dead, we need to move this
// information over to the new copy we just inserted.
- LV->removeVirtualRegistersKilled(MPhi);
+ LV->removeVirtualRegistersKilled(*MPhi);
// If the result is dead, update LV.
if (isDead) {
LV->addVirtualRegisterDead(DestReg, PHICopy);
- LV->removeVirtualRegisterDead(DestReg, MPhi);
+ LV->removeVirtualRegisterDead(DestReg, *MPhi);
}
}
// Update LiveIntervals for the new copy or implicit def.
if (LIS) {
- MachineInstr *NewInstr = std::prev(AfterPHIsIt);
- SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(NewInstr);
+ SlotIndex DestCopyIndex =
+ LIS->InsertMachineInstrInMaps(*std::prev(AfterPHIsIt));
SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
if (IncomingReg) {
@@ -453,7 +452,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
// Finally, mark it killed.
- LV->addVirtualRegisterKilled(SrcReg, KillInst);
+ LV->addVirtualRegisterKilled(SrcReg, *KillInst);
// This vreg no longer lives all of the way through opBlock.
unsigned opBlockNum = opBlock.getNumber();
@@ -462,8 +461,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (LIS) {
if (NewSrcInstr) {
- LIS->InsertMachineInstrInMaps(NewSrcInstr);
- LIS->addSegmentToEndOfBlock(IncomingReg, NewSrcInstr);
+ LIS->InsertMachineInstrInMaps(*NewSrcInstr);
+ LIS->addSegmentToEndOfBlock(IncomingReg, *NewSrcInstr);
}
if (!SrcUndef &&
@@ -513,7 +512,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
assert(KillInst->readsRegister(SrcReg) &&
"Cannot find kill instruction");
- SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
+ SlotIndex LastUseIndex = LIS->getInstructionIndex(*KillInst);
SrcLI.removeSegment(LastUseIndex.getRegSlot(),
LIS->getMBBEndIdx(&opBlock));
}
@@ -524,7 +523,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
if (reusedIncoming || !IncomingReg) {
if (LIS)
- LIS->RemoveMachineInstrFromMaps(MPhi);
+ LIS->RemoveMachineInstrFromMaps(*MPhi);
MF.DeleteMachineInstr(MPhi);
}
}
@@ -612,7 +611,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
}
if (!ShouldSplit && !SplitAllCriticalEdges)
continue;
- if (!PreMBB->SplitCriticalEdge(&MBB, this)) {
+ if (!PreMBB->SplitCriticalEdge(&MBB, *this)) {
DEBUG(dbgs() << "Failed to split critical edge.\n");
continue;
}
diff --git a/lib/CodeGen/ParallelCG.cpp b/lib/CodeGen/ParallelCG.cpp
index e73ba0296045..ccdaec1bc180 100644
--- a/lib/CodeGen/ParallelCG.cpp
+++ b/lib/CodeGen/ParallelCG.cpp
@@ -19,78 +19,81 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/thread.h"
+#include "llvm/Support/ThreadPool.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SplitModule.h"
using namespace llvm;
static void codegen(Module *M, llvm::raw_pwrite_stream &OS,
- const Target *TheTarget, StringRef CPU, StringRef Features,
- const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL,
+ function_ref<std::unique_ptr<TargetMachine>()> TMFactory,
TargetMachine::CodeGenFileType FileType) {
- std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine(
- M->getTargetTriple(), CPU, Features, Options, RM, CM, OL));
-
+ std::unique_ptr<TargetMachine> TM = TMFactory();
legacy::PassManager CodeGenPasses;
if (TM->addPassesToEmitFile(CodeGenPasses, OS, FileType))
report_fatal_error("Failed to setup codegen");
CodeGenPasses.run(*M);
}
-std::unique_ptr<Module>
-llvm::splitCodeGen(std::unique_ptr<Module> M,
- ArrayRef<llvm::raw_pwrite_stream *> OSs, StringRef CPU,
- StringRef Features, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL,
- TargetMachine::CodeGenFileType FileType) {
- StringRef TripleStr = M->getTargetTriple();
- std::string ErrMsg;
- const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
- if (!TheTarget)
- report_fatal_error(Twine("Target not found: ") + ErrMsg);
+std::unique_ptr<Module> llvm::splitCodeGen(
+ std::unique_ptr<Module> M, ArrayRef<llvm::raw_pwrite_stream *> OSs,
+ ArrayRef<llvm::raw_pwrite_stream *> BCOSs,
+ const std::function<std::unique_ptr<TargetMachine>()> &TMFactory,
+ TargetMachine::CodeGenFileType FileType, bool PreserveLocals) {
+ assert(BCOSs.empty() || BCOSs.size() == OSs.size());
if (OSs.size() == 1) {
- codegen(M.get(), *OSs[0], TheTarget, CPU, Features, Options, RM, CM,
- OL, FileType);
+ if (!BCOSs.empty())
+ WriteBitcodeToFile(M.get(), *BCOSs[0]);
+ codegen(M.get(), *OSs[0], TMFactory, FileType);
return M;
}
- std::vector<thread> Threads;
- SplitModule(std::move(M), OSs.size(), [&](std::unique_ptr<Module> MPart) {
- // We want to clone the module in a new context to multi-thread the codegen.
- // We do it by serializing partition modules to bitcode (while still on the
- // main thread, in order to avoid data races) and spinning up new threads
- // which deserialize the partitions into separate contexts.
- // FIXME: Provide a more direct way to do this in LLVM.
- SmallVector<char, 0> BC;
- raw_svector_ostream BCOS(BC);
- WriteBitcodeToFile(MPart.get(), BCOS);
+ // Create ThreadPool in nested scope so that threads will be joined
+ // on destruction.
+ {
+ ThreadPool CodegenThreadPool(OSs.size());
+ int ThreadCount = 0;
- llvm::raw_pwrite_stream *ThreadOS = OSs[Threads.size()];
- Threads.emplace_back(
- [TheTarget, CPU, Features, Options, RM, CM, OL, FileType,
- ThreadOS](const SmallVector<char, 0> &BC) {
- LLVMContext Ctx;
- ErrorOr<std::unique_ptr<Module>> MOrErr =
- parseBitcodeFile(MemoryBufferRef(StringRef(BC.data(), BC.size()),
- "<split-module>"),
- Ctx);
- if (!MOrErr)
- report_fatal_error("Failed to read bitcode");
- std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
+ SplitModule(
+ std::move(M), OSs.size(),
+ [&](std::unique_ptr<Module> MPart) {
+ // We want to clone the module in a new context to multi-thread the
+ // codegen. We do it by serializing partition modules to bitcode
+ // (while still on the main thread, in order to avoid data races) and
+ // spinning up new threads which deserialize the partitions into
+ // separate contexts.
+ // FIXME: Provide a more direct way to do this in LLVM.
+ SmallString<0> BC;
+ raw_svector_ostream BCOS(BC);
+ WriteBitcodeToFile(MPart.get(), BCOS);
- codegen(MPartInCtx.get(), *ThreadOS, TheTarget, CPU, Features,
- Options, RM, CM, OL, FileType);
- },
- // Pass BC using std::move to ensure that it get moved rather than
- // copied into the thread's context.
- std::move(BC));
- });
+ if (!BCOSs.empty()) {
+ BCOSs[ThreadCount]->write(BC.begin(), BC.size());
+ BCOSs[ThreadCount]->flush();
+ }
+
+ llvm::raw_pwrite_stream *ThreadOS = OSs[ThreadCount++];
+ // Enqueue the task
+ CodegenThreadPool.async(
+ [TMFactory, FileType, ThreadOS](const SmallString<0> &BC) {
+ LLVMContext Ctx;
+ ErrorOr<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+ MemoryBufferRef(StringRef(BC.data(), BC.size()),
+ "<split-module>"),
+ Ctx);
+ if (!MOrErr)
+ report_fatal_error("Failed to read bitcode");
+ std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
- for (thread &T : Threads)
- T.join();
+ codegen(MPartInCtx.get(), *ThreadOS, TMFactory, FileType);
+ },
+ // Pass BC using std::move to ensure that it get moved rather than
+ // copied into the thread's context.
+ std::move(BC));
+ },
+ PreserveLocals);
+ }
return {};
}
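splitCodeGen() now takes a TargetMachine factory and optional bitcode output streams instead of the individual codegen parameters. A caller sketch under the assumption that the target lookup and output streams are already set up; only the parameter shape is taken from this patch:

    static std::unique_ptr<Module>
    runSplitCodeGen(std::unique_ptr<Module> M, const Target *TheTarget,
                    StringRef CPU, StringRef Features,
                    const TargetOptions &Options, Reloc::Model RM,
                    CodeModel::Model CM, CodeGenOpt::Level OL,
                    ArrayRef<llvm::raw_pwrite_stream *> OSs,
                    ArrayRef<llvm::raw_pwrite_stream *> BCOSs) {
      std::string TripleStr = M->getTargetTriple();
      auto TMFactory = [=]() {
        // One TargetMachine per codegen thread, built from the captured options.
        return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
            TripleStr, CPU, Features, Options, RM, CM, OL));
      };
      return splitCodeGen(std::move(M), OSs, BCOSs, TMFactory,
                          TargetMachine::CGFT_ObjectFile,
                          /*PreserveLocals=*/false);
    }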
diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp
new file mode 100644
index 000000000000..32468c90b864
--- /dev/null
+++ b/lib/CodeGen/PatchableFunction.cpp
@@ -0,0 +1,88 @@
+//===-- PatchableFunction.cpp - Patchable prologues for LLVM -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that edits function bodies in place to support
+// the "patchable-function" attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct PatchableFunction : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PatchableFunction() : MachineFunctionPass(ID) {
+ initializePatchableFunctionPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+};
+}
+
+/// Returns true if instruction \p MI will not result in actual machine code
+/// instructions.
+static bool doesNotGeneratecode(const MachineInstr &MI) {
+ // TODO: Introduce an MCInstrDesc flag for this
+ switch (MI.getOpcode()) {
+ default: return false;
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ }
+}
+
+bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getFunction()->hasFnAttribute("patchable-function"))
+ return false;
+
+#ifndef NDEBUG
+ Attribute PatchAttr = MF.getFunction()->getFnAttribute("patchable-function");
+ StringRef PatchType = PatchAttr.getValueAsString();
+ assert(PatchType == "prologue-short-redirect" && "Only possibility today!");
+#endif
+
+ auto &FirstMBB = *MF.begin();
+ MachineBasicBlock::iterator FirstActualI = FirstMBB.begin();
+ for (; doesNotGeneratecode(*FirstActualI); ++FirstActualI)
+ assert(FirstActualI != FirstMBB.end());
+
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ auto MIB = BuildMI(FirstMBB, FirstActualI, FirstActualI->getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_OP))
+ .addImm(2)
+ .addImm(FirstActualI->getOpcode());
+
+ for (auto &MO : FirstActualI->operands())
+ MIB.addOperand(MO);
+
+ FirstActualI->eraseFromParent();
+ MF.ensureAlignment(4);
+ return true;
+}
+
+char PatchableFunction::ID = 0;
+char &llvm::PatchableFunctionID = PatchableFunction::ID;
+INITIALIZE_PASS(PatchableFunction, "patchable-function",
+ "Implement the 'patchable-function' attribute", false, false)
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 52b42b624ee1..60b27dd75a89 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -394,10 +394,10 @@ namespace {
char PeepholeOptimizer::ID = 0;
char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
-INITIALIZE_PASS_BEGIN(PeepholeOptimizer, "peephole-opts",
+INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts",
+INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
/// If instruction is a copy-like instruction, i.e. it reads a single register
@@ -564,13 +564,13 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
// physical register, we can try to optimize it.
unsigned SrcReg, SrcReg2;
int CmpMask, CmpValue;
- if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
+ if (!TII->analyzeCompare(*MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
(SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
return false;
// Attempt to optimize the comparison instruction.
- if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
+ if (TII->optimizeCompareInstr(*MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
++NumCmps;
return true;
}
@@ -585,11 +585,11 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
unsigned FalseOp = 0;
bool Optimizable = false;
SmallVector<MachineOperand, 4> Cond;
- if (TII->analyzeSelect(MI, Cond, TrueOp, FalseOp, Optimizable))
+ if (TII->analyzeSelect(*MI, Cond, TrueOp, FalseOp, Optimizable))
return false;
if (!Optimizable)
return false;
- if (!TII->optimizeSelect(MI, LocalMIs))
+ if (!TII->optimizeSelect(*MI, LocalMIs))
return false;
MI->eraseFromParent();
++NumSelects;
@@ -599,7 +599,7 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
/// \brief Check if a simpler conditional branch can be
// generated
bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
- return TII->optimizeCondBranch(MI);
+ return TII->optimizeCondBranch(*MI);
}
/// \brief Try to find the next source that share the same register file
@@ -1351,7 +1351,7 @@ bool PeepholeOptimizer::foldImmediate(
continue;
DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
- if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
+ if (TII->FoldImmediate(*MI, *II->second, Reg, MRI)) {
++NumImmFold;
return true;
}
@@ -1471,7 +1471,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
}
bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
@@ -1636,10 +1636,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
// we need it for markUsesInDebugValueAsUndef().
unsigned FoldedReg = FoldAsLoadDefReg;
MachineInstr *DefMI = nullptr;
- MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
- FoldAsLoadDefReg,
- DefMI);
- if (FoldMI) {
+ if (MachineInstr *FoldMI =
+ TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
// Update LocalMIs since we replaced MI with FoldMI and deleted
// DefMI.
DEBUG(dbgs() << "Replacing: " << *MI);
@@ -1888,9 +1886,11 @@ ValueTrackerResult ValueTracker::getNextSourceFromPHI() {
ValueTrackerResult ValueTracker::getNextSourceImpl() {
assert(Def && "This method needs a valid definition");
- assert(
- (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) &&
- Def->getOperand(DefIdx).isDef() && "Invalid DefIdx");
+ assert(((Def->getOperand(DefIdx).isDef() &&
+ (DefIdx < Def->getDesc().getNumDefs() ||
+ Def->getDesc().isVariadic())) ||
+ Def->getOperand(DefIdx).isImplicit()) &&
+ "Invalid DefIdx");
if (Def->isCopy())
return getNextSourceFromCopy();
if (Def->isBitcast())
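
The PeepholeOptimizer changes are mostly mechanical migrations of the TargetInstrInfo hooks from MachineInstr* to MachineInstr&, plus reusing DEBUG_TYPE as the registered pass name so the -debug-only tag and the command-line pass name cannot drift apart. A hedged sketch of that convention (the "peephole-opt" spelling of DEBUG_TYPE is assumed; it is defined earlier in the file and not shown in this hunk):

    #define DEBUG_TYPE "peephole-opt"
    // The same string now names the pass on the command line and in -debug-only.
    INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                          "Peephole Optimizations", false, false)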
diff --git a/lib/CodeGen/PostRAHazardRecognizer.cpp b/lib/CodeGen/PostRAHazardRecognizer.cpp
new file mode 100644
index 000000000000..5bc5f7524dbf
--- /dev/null
+++ b/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -0,0 +1,98 @@
+//===----- PostRAHazardRecognizer.cpp - hazard recognizer -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This runs the hazard recognizer and emits noops when necessary. This
+/// gives targets a way to run the hazard recognizer without running one of
+/// the schedulers. Example use cases for this pass would be:
+///
+/// - Targets that need the hazard recognizer to be run at -O0.
+/// - Targets that want to guarantee that hazards at the beginning of
+/// scheduling regions are handled correctly. The post-RA scheduler is
+/// a top-down scheduler, but when there are multiple scheduling regions
+/// in a basic block, it visits the regions in bottom-up order. This
+/// makes it impossible for the scheduler to guarantee it can correctly
+/// handle hazards at the beginning of scheduling regions.
+///
+/// This pass traverses all the instructions in a program in top-down order.
+/// In contrast to the instruction scheduling passes, this pass never resets
+/// the hazard recognizer to ensure it can correctly handle noop hazards at
+/// the beginning of blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-hazard-rec"
+
+STATISTIC(NumNoops, "Number of noops inserted");
+
+namespace {
+ class PostRAHazardRecognizer : public MachineFunctionPass {
+
+ public:
+ static char ID;
+ PostRAHazardRecognizer() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ };
+ char PostRAHazardRecognizer::ID = 0;
+
+}
+
+char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizer::ID;
+
+INITIALIZE_PASS(PostRAHazardRecognizer, DEBUG_TYPE,
+ "Post RA hazard recognizer", false, false)
+
+bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ std::unique_ptr<ScheduleHazardRecognizer> HazardRec(
+ TII->CreateTargetPostRAHazardRecognizer(Fn));
+
+ // Return if the target has not implemented a hazard recognizer.
+ if (!HazardRec.get())
+ return false;
+
+ // Loop over all of the basic blocks
+ for (auto &MBB : Fn) {
+ // We do not call HazardRec->reset() here to make sure we are handling noop
+ // hazards at the start of basic blocks.
+ for (MachineInstr &MI : MBB) {
+ // If we need to emit noops prior to this instruction, then do so.
+ unsigned NumPreNoops = HazardRec->PreEmitNoops(&MI);
+ for (unsigned i = 0; i != NumPreNoops; ++i) {
+ HazardRec->EmitNoop();
+ TII->insertNoop(MBB, MachineBasicBlock::iterator(MI));
+ ++NumNoops;
+ }
+
+ HazardRec->EmitInstruction(&MI);
+ if (HazardRec->atIssueLimit()) {
+ HazardRec->AdvanceCycle();
+ }
+ }
+ }
+ return true;
+}
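
The new pass only does work when the target supplies a post-RA hazard recognizer. A hedged sketch of the target-side hook (MyTargetInstrInfo and MyHazardRecognizer are illustrative names; the MachineFunction overload of the hook is assumed from the call in runOnMachineFunction above):

    ScheduleHazardRecognizer *
    MyTargetInstrInfo::CreateTargetPostRAHazardRecognizer(
        const MachineFunction &MF) const {
      // Returning nullptr keeps the pass a no-op, as checked above.
      return new MyHazardRecognizer(MF.getSubtarget());
    }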
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index b95dffd05c46..3fce307f3dd4 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -18,11 +18,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "AggressiveAntiDepBreaker.h"
#include "AntiDepBreaker.h"
#include "CriticalAntiDepBreaker.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
@@ -31,10 +29,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -96,8 +96,14 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
bool runOnMachineFunction(MachineFunction &Fn) override;
+ private:
bool enablePostRAScheduler(
const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode &Mode,
@@ -128,6 +134,9 @@ namespace {
/// The schedule. Null SUnit*'s represent noop instructions.
std::vector<SUnit*> Sequence;
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
/// The index in BB of RegionEnd.
///
/// This is the instruction number from the top of the current block, not
@@ -169,13 +178,16 @@ namespace {
/// Observe - Update liveness information to account for the current
/// instruction, which will not be scheduled.
///
- void Observe(MachineInstr *MI, unsigned Count);
+ void Observe(MachineInstr &MI, unsigned Count);
/// finishBlock - Clean up register live-range state.
///
void finishBlock() override;
private:
+ /// Apply each ScheduleDAGMutation step in order.
+ void postprocessDAG();
+
void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
void ReleaseSuccessors(SUnit *SU);
void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
@@ -203,6 +215,7 @@ SchedulePostRATDList::SchedulePostRATDList(
HazardRec =
MF.getSubtarget().getInstrInfo()->CreateTargetPostRAHazardRecognizer(
InstrItins, this);
+ MF.getSubtarget().getPostRAMutations(Mutations);
assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
MRI.tracksLiveness()) &&
@@ -257,12 +270,17 @@ bool PostRAScheduler::enablePostRAScheduler(
TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
Mode = ST.getAntiDepBreakMode();
ST.getCriticalPathRCs(CriticalPathRCs);
+
+ // Check for explicit enable/disable of post-ra scheduling.
+ if (EnablePostRAScheduler.getPosition() > 0)
+ return EnablePostRAScheduler;
+
return ST.enablePostRAScheduler() &&
OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
}
bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
- if (skipOptnoneFunction(*Fn.getFunction()))
+ if (skipFunction(*Fn.getFunction()))
return false;
TII = Fn.getSubtarget().getInstrInfo();
@@ -272,20 +290,15 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
RegClassInfo.runOnMachineFunction(Fn);
- // Check for explicit enable/disable of post-ra scheduling.
TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
TargetSubtargetInfo::ANTIDEP_NONE;
SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;
- if (EnablePostRAScheduler.getPosition() > 0) {
- if (!EnablePostRAScheduler)
- return false;
- } else {
- // Check that post-RA scheduling is enabled for this target.
- // This may upgrade the AntiDepMode.
- if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
- AntiDepMode, CriticalPathRCs))
- return false;
- }
+
+ // Check that post-RA scheduling is enabled for this target.
+ // This may upgrade the AntiDepMode.
+ if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
+ AntiDepMode, CriticalPathRCs))
+ return false;
// Check for antidep breaking override...
if (EnableAntiDepBreaking.getPosition() > 0) {
@@ -322,24 +335,24 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator Current = MBB.end();
unsigned Count = MBB.size(), CurrentCount = Count;
for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) {
- MachineInstr *MI = std::prev(I);
+ MachineInstr &MI = *std::prev(I);
--Count;
// Calls are not scheduling boundaries before register allocation, but
// post-ra we don't gain anything by scheduling across calls since we
// don't need to worry about register pressure.
- if (MI->isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) {
+ if (MI.isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) {
Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count);
Scheduler.setEndIndex(CurrentCount);
Scheduler.schedule();
Scheduler.exitRegion();
Scheduler.EmitSchedule();
- Current = MI;
+ Current = &MI;
CurrentCount = Count;
Scheduler.Observe(MI, CurrentCount);
}
I = MI;
- if (MI->isBundle())
- Count -= MI->getBundleSize();
+ if (MI.isBundle())
+ Count -= MI.getBundleSize();
}
assert(Count == 0 && "Instruction count mismatch!");
assert((MBB.begin() == Current || CurrentCount != 0) &&
@@ -398,6 +411,8 @@ void SchedulePostRATDList::schedule() {
}
}
+ postprocessDAG();
+
DEBUG(dbgs() << "********** List Scheduling **********\n");
DEBUG(
for (const SUnit &SU : SUnits) {
@@ -414,7 +429,7 @@ void SchedulePostRATDList::schedule() {
/// Observe - Update liveness information to account for the current
/// instruction, which will not be scheduled.
///
-void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) {
+void SchedulePostRATDList::Observe(MachineInstr &MI, unsigned Count) {
if (AntiDepBreak)
AntiDepBreak->Observe(MI, Count, EndIndex);
}
@@ -429,6 +444,12 @@ void SchedulePostRATDList::finishBlock() {
ScheduleDAGInstrs::finishBlock();
}
+/// Apply each ScheduleDAGMutation step in order.
+void SchedulePostRATDList::postprocessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
//===----------------------------------------------------------------------===//
// Top-Down Scheduling
//===----------------------------------------------------------------------===//
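
With the Mutations vector and postprocessDAG() in place, a subtarget can now adjust the post-RA scheduling DAG much as it can for the generic MachineScheduler. A hedged sketch (MySubtarget and MyDAGMutation are illustrative names; the hook signature is inferred from the getPostRAMutations call added above):

    void MySubtarget::getPostRAMutations(
        std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
      // Each mutation's apply() runs on the DAG before list scheduling starts.
      Mutations.push_back(llvm::make_unique<MyDAGMutation>());
    }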
diff --git a/lib/CodeGen/PreISelIntrinsicLowering.cpp b/lib/CodeGen/PreISelIntrinsicLowering.cpp
new file mode 100644
index 000000000000..fbc2bc64f425
--- /dev/null
+++ b/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -0,0 +1,94 @@
+//===-- PreISelIntrinsicLowering.cpp - Pre-ISel intrinsic lowering pass ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR lowering for the llvm.load.relative intrinsic.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+bool lowerLoadRelative(Function &F) {
+ if (F.use_empty())
+ return false;
+
+ bool Changed = false;
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Type *Int32PtrTy = Int32Ty->getPointerTo();
+ Type *Int8Ty = Type::getInt8Ty(F.getContext());
+
+ for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI || CI->getCalledValue() != &F)
+ continue;
+
+ IRBuilder<> B(CI);
+ Value *OffsetPtr =
+ B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
+ Value *OffsetPtrI32 = B.CreateBitCast(OffsetPtr, Int32PtrTy);
+ Value *OffsetI32 = B.CreateAlignedLoad(OffsetPtrI32, 4);
+
+ Value *ResultPtr = B.CreateGEP(Int8Ty, CI->getArgOperand(0), OffsetI32);
+
+ CI->replaceAllUsesWith(ResultPtr);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool lowerIntrinsics(Module &M) {
+ bool Changed = false;
+ for (Function &F : M) {
+ if (F.getName().startswith("llvm.load.relative."))
+ Changed |= lowerLoadRelative(F);
+ }
+ return Changed;
+}
+
+class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
+public:
+ static char ID;
+ PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) { return lowerIntrinsics(M); }
+};
+
+char PreISelIntrinsicLoweringLegacyPass::ID;
+}
+
+INITIALIZE_PASS(PreISelIntrinsicLoweringLegacyPass,
+ "pre-isel-intrinsic-lowering", "Pre-ISel Intrinsic Lowering",
+ false, false)
+
+namespace llvm {
+ModulePass *createPreISelIntrinsicLoweringPass() {
+ return new PreISelIntrinsicLoweringLegacyPass;
+}
+
+PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!lowerIntrinsics(M))
+ return PreservedAnalyses::all();
+ else
+ return PreservedAnalyses::none();
+}
+} // End llvm namespace
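
For reference, lowerLoadRelative above rewrites each call so the result is the base pointer plus a 32-bit offset loaded at base+offset. A rough C++ model of the semantics being emitted (illustrative only, not LLVM API):

    #include <cstdint>
    #include <cstring>

    // result = Base + *(int32_t *)(Base + Offset), matching the GEP/load/GEP
    // sequence built with IRBuilder above.
    static char *loadRelative(char *Base, int64_t Offset) {
      int32_t Rel;
      std::memcpy(&Rel, Base + Offset, sizeof(Rel)); // emitted as a 4-byte-aligned load
      return Base + Rel;
    }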
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 939c50027b02..20a9a394ebd0 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -35,7 +34,6 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -49,48 +47,83 @@ using namespace llvm;
#define DEBUG_TYPE "pei"
+typedef SmallVector<MachineBasicBlock *, 4> MBBVector;
+static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks);
+
+static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS);
+
namespace {
class PEI : public MachineFunctionPass {
public:
static char ID;
- PEI() : MachineFunctionPass(ID) {
+ explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) {
initializePEIPass(*PassRegistry::getPassRegistry());
+
+ if (TM && (!TM->usesPhysRegsForPEI())) {
+ SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
+ unsigned &, unsigned &, const MBBVector &,
+ const MBBVector &) {};
+ ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
+ } else {
+ SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
+ ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
+ UsesCalleeSaves = true;
+ }
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ MachineFunctionProperties MFP;
+ if (UsesCalleeSaves)
+ MFP.set(MachineFunctionProperties::Property::AllVRegsAllocated);
+ return MFP;
+ }
+
/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
/// frame indexes with appropriate references.
///
bool runOnMachineFunction(MachineFunction &Fn) override;
private:
+ std::function<void(MachineFunction &MF, RegScavenger *RS,
+ unsigned &MinCSFrameIndex, unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks)>
+ SpillCalleeSavedRegisters;
+ std::function<void(MachineFunction &MF, RegScavenger *RS)>
+ ScavengeFrameVirtualRegs;
+
+ bool UsesCalleeSaves = false;
+
RegScavenger *RS;
// MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
// stack frame indexes.
- unsigned MinCSFrameIndex, MaxCSFrameIndex;
+ unsigned MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ unsigned MaxCSFrameIndex = 0;
// Save and Restore blocks of the current function. Typically there is a
// single save block, unless Windows EH funclets are involved.
- SmallVector<MachineBasicBlock *, 1> SaveBlocks;
- SmallVector<MachineBasicBlock *, 4> RestoreBlocks;
+ MBBVector SaveBlocks;
+ MBBVector RestoreBlocks;
// Flag to control whether to use the register scavenger to resolve
// frame index materialization registers. Set according to
// TRI->requiresFrameIndexScavenging() for the current function.
bool FrameIndexVirtualScavenging;
- void calculateSets(MachineFunction &Fn);
- void calculateCallsInformation(MachineFunction &Fn);
- void assignCalleeSavedSpillSlots(MachineFunction &Fn,
- const BitVector &SavedRegs);
- void insertCSRSpillsAndRestores(MachineFunction &Fn);
+ void calculateCallFrameInfo(MachineFunction &Fn);
+ void calculateSaveRestoreBlocks(MachineFunction &Fn);
+
void calculateFrameObjectOffsets(MachineFunction &Fn);
void replaceFrameIndices(MachineFunction &Fn);
void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
int &SPAdj);
- void scavengeFrameVirtualRegs(MachineFunction &Fn);
void insertPrologEpilogCode(MachineFunction &Fn);
};
} // namespace
@@ -103,15 +136,19 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
cl::desc("Warn for stack size bigger than the given"
" number"));
-INITIALIZE_PASS_BEGIN(PEI, "prologepilog",
- "Prologue/Epilogue Insertion", false, false)
+INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(PEI, "prologepilog",
- "Prologue/Epilogue Insertion & Frame Finalization",
- false, false)
+INITIALIZE_TM_PASS_END(PEI, "prologepilog",
+ "Prologue/Epilogue Insertion & Frame Finalization",
+ false, false)
+
+MachineFunctionPass *
+llvm::createPrologEpilogInserterPass(const TargetMachine *TM) {
+ return new PEI(TM);
+}
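
Making PEI target-machine aware lets targets that keep virtual registers through PEI (the !usesPhysRegsForPEI() case) skip callee-save spilling and virtual-register scavenging entirely while still getting frame finalization. A hedged sketch of the target side (MyTargetMachine is an illustrative name; the hook is assumed to be a virtual on TargetMachine, as the constructor above implies):

    bool MyTargetMachine::usesPhysRegsForPEI() const {
      // Opt out of doSpillCalleeSavedRegs/doScavengeFrameVirtualRegs above.
      return false;
    }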
STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
STATISTIC(NumBytesStackSpace,
@@ -122,40 +159,9 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MachineLoopInfo>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<StackProtector>();
- AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-/// Compute the set of return blocks
-void PEI::calculateSets(MachineFunction &Fn) {
- const MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- // Even when we do not change any CSR, we still want to insert the
- // prologue and epilogue of the function.
- // So set the save points for those.
-
- // Use the points found by shrink-wrapping, if any.
- if (MFI->getSavePoint()) {
- SaveBlocks.push_back(MFI->getSavePoint());
- assert(MFI->getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
- // If RestoreBlock does not have any successor and is not a return block
- // then the end point is unreachable and we do not need to insert any
- // epilogue.
- if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
- RestoreBlocks.push_back(RestoreBlock);
- return;
- }
-
- // Save refs to entry and return blocks.
- SaveBlocks.push_back(&Fn.front());
- for (MachineBasicBlock &MBB : Fn) {
- if (MBB.isEHFuncletEntry())
- SaveBlocks.push_back(&MBB);
- if (MBB.isReturnBlock())
- RestoreBlocks.push_back(&MBB);
- }
-}
/// StackObjSet - A set of stack object indexes
typedef SmallSetVector<int, 8> StackObjSet;
@@ -168,30 +174,21 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
-
RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
// Calculate the MaxCallFrameSize and AdjustsStack variables for the
// function's frame information. Also eliminates call frame pseudo
// instructions.
- calculateCallsInformation(Fn);
+ calculateCallFrameInfo(Fn);
- // Determine which of the registers in the callee save list should be saved.
- BitVector SavedRegs;
- TFI->determineCalleeSaves(Fn, SavedRegs, RS);
-
- // Insert spill code for any callee saved registers that are modified.
- assignCalleeSavedSpillSlots(Fn, SavedRegs);
-
- // Determine placement of CSR spill/restore code:
+ // Determine placement of CSR spill/restore code and prolog/epilog code:
// place all spills in the entry block, all restores in return blocks.
- calculateSets(Fn);
+ calculateSaveRestoreBlocks(Fn);
- // Add the code to save and restore the callee saved registers.
- if (!F->hasFnAttribute(Attribute::Naked))
- insertCSRSpillsAndRestores(Fn);
+ // Handle CSR spilling and restoring, for targets that need it.
+ SpillCalleeSavedRegisters(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex,
+ SaveBlocks, RestoreBlocks);
// Allow the target machine to make final modifications to the function
// before the frame layout is finalized.
@@ -216,11 +213,12 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// If register scavenging is needed, as we've enabled doing it as a
// post-pass, scavenge the virtual registers that frame index elimination
// inserted.
- if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
- scavengeFrameVirtualRegs(Fn);
+ if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
+ ScavengeFrameVirtualRegs(Fn, RS);
- // Clear any vregs created by virtual scavenging.
- Fn.getRegInfo().clearVirtRegs();
+ // Clear any vregs created by virtual scavenging.
+ Fn.getRegInfo().clearVirtRegs();
+ }
 // Warn on stack size when we exceed the given limit.
MachineFrameInfo *MFI = Fn.getFrameInfo();
@@ -233,13 +231,15 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
delete RS;
SaveBlocks.clear();
RestoreBlocks.clear();
+ MFI->setSavePoint(nullptr);
+ MFI->setRestorePoint(nullptr);
return true;
}
-/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// Calculate the MaxCallFrameSize and AdjustsStack
/// variables for the function's frame information and eliminate call frame
/// pseudo instructions.
-void PEI::calculateCallsInformation(MachineFunction &Fn) {
+void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
MachineFrameInfo *MFI = Fn.getFrameInfo();
@@ -290,12 +290,42 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) {
}
}
-void PEI::assignCalleeSavedSpillSlots(MachineFunction &F,
- const BitVector &SavedRegs) {
- // These are used to keep track the callee-save area. Initialize them.
- MinCSFrameIndex = INT_MAX;
- MaxCSFrameIndex = 0;
+/// Compute the sets of entry and return blocks for saving and restoring
+/// callee-saved registers, and placing prolog and epilog code.
+void PEI::calculateSaveRestoreBlocks(MachineFunction &Fn) {
+ const MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI->getSavePoint()) {
+ SaveBlocks.push_back(MFI->getSavePoint());
+ assert(MFI->getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&Fn.front());
+ for (MachineBasicBlock &MBB : Fn) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+static void assignCalleeSavedSpillSlots(MachineFunction &F,
+ const BitVector &SavedRegs,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) {
if (SavedRegs.empty())
return;
@@ -323,14 +353,13 @@ void PEI::assignCalleeSavedSpillSlots(MachineFunction &F,
// Now that we know which registers need to be saved and restored, allocate
// stack slots for them.
- for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
- I != E; ++I) {
- unsigned Reg = I->getReg();
+ for (auto &CS : CSI) {
+ unsigned Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
int FrameIdx;
if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
- I->setFrameIdx(FrameIdx);
+ CS.setFrameIdx(FrameIdx);
continue;
}
@@ -359,7 +388,7 @@ void PEI::assignCalleeSavedSpillSlots(MachineFunction &F,
MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
}
- I->setFrameIdx(FrameIdx);
+ CS.setFrameIdx(FrameIdx);
}
}
@@ -427,7 +456,9 @@ static void updateLiveness(MachineFunction &MF) {
/// insertCSRSpillsAndRestores - Insert spill and restore code for
/// callee saved registers used in the function.
///
-void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
+static void insertCSRSpillsAndRestores(MachineFunction &Fn,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks) {
// Get callee saved register information.
MachineFrameInfo *MFI = Fn.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -496,6 +527,28 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
}
}
+static void doSpillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks) {
+ const Function *F = Fn.getFunction();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ MaxCSFrameIndex = 0;
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+
+ // Assign stack slots for any callee-saved registers that must be spilled.
+ assignCalleeSavedSpillSlots(Fn, SavedRegs, MinCSFrameIndex, MaxCSFrameIndex);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F->hasFnAttribute(Attribute::Naked))
+ insertCSRSpillsAndRestores(Fn, SaveBlocks, RestoreBlocks);
+}
+
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
static inline void
AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
@@ -512,7 +565,7 @@ AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
MaxAlign = std::max(MaxAlign, Align);
// Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
+ Offset = alignTo(Offset, Align, Skew);
if (StackGrowsDown) {
DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
@@ -524,6 +577,108 @@ AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
}
}
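
The RoundUpToAlignment → alignTo renames in this file keep the skew parameter, which shifts the alignment grid so offsets land at Skew modulo Align. A standalone sketch of those semantics (assumed from llvm/Support/MathExtras.h; alignToSkewed is an illustrative reimplementation, not the library function):

    #include <cstdint>

    // Smallest value >= Value that is congruent to Skew modulo Align.
    uint64_t alignToSkewed(uint64_t Value, uint64_t Align, uint64_t Skew) {
      Skew %= Align;
      return (Value + Align - 1 - Skew) / Align * Align + Skew;
    }
    // e.g. alignToSkewed(13, 8, 0) == 16 and alignToSkewed(13, 8, 4) == 20.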
+/// Compute which bytes of the fixed and callee-save stack area are unused and
+/// keep track of them in StackBytesFree.
+///
+static inline void
+computeFreeStackSlots(MachineFrameInfo *MFI, bool StackGrowsDown,
+ unsigned MinCSFrameIndex, unsigned MaxCSFrameIndex,
+ int64_t FixedCSEnd, BitVector &StackBytesFree) {
+ // Avoid undefined int64_t -> int conversion below in extreme case.
+ if (FixedCSEnd > std::numeric_limits<int>::max())
+ return;
+
+ StackBytesFree.resize(FixedCSEnd, true);
+
+ SmallVector<int, 16> AllocatedFrameSlots;
+ // Add fixed objects.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i)
+ AllocatedFrameSlots.push_back(i);
+ // Add callee-save objects.
+ for (int i = MinCSFrameIndex; i <= (int)MaxCSFrameIndex; ++i)
+ AllocatedFrameSlots.push_back(i);
+
+ for (int i : AllocatedFrameSlots) {
+ // These are converted from int64_t, but they should always fit in int
+ // because of the FixedCSEnd check above.
+ int ObjOffset = MFI->getObjectOffset(i);
+ int ObjSize = MFI->getObjectSize(i);
+ int ObjStart, ObjEnd;
+ if (StackGrowsDown) {
+ // ObjOffset is negative when StackGrowsDown is true.
+ ObjStart = -ObjOffset - ObjSize;
+ ObjEnd = -ObjOffset;
+ } else {
+ ObjStart = ObjOffset;
+ ObjEnd = ObjOffset + ObjSize;
+ }
+ // Ignore fixed holes that are in the previous stack frame.
+ if (ObjEnd > 0)
+ StackBytesFree.reset(ObjStart, ObjEnd);
+ }
+}
+
+/// Assign frame object to an unused portion of the stack in the fixed stack
+/// object range. Return true if the allocation was successful.
+///
+static inline bool scavengeStackSlot(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, unsigned MaxAlign,
+ BitVector &StackBytesFree) {
+ if (MFI->isVariableSizedObjectIndex(FrameIdx))
+ return false;
+
+ if (StackBytesFree.none()) {
+ // Clear it to speed up later scavengeStackSlot calls to
+ // StackBytesFree.none().
+ StackBytesFree.clear();
+ return false;
+ }
+
+ unsigned ObjAlign = MFI->getObjectAlignment(FrameIdx);
+ if (ObjAlign > MaxAlign)
+ return false;
+
+ int64_t ObjSize = MFI->getObjectSize(FrameIdx);
+ int FreeStart;
+ for (FreeStart = StackBytesFree.find_first(); FreeStart != -1;
+ FreeStart = StackBytesFree.find_next(FreeStart)) {
+
+ // Check that free space has suitable alignment.
+ unsigned ObjStart = StackGrowsDown ? FreeStart + ObjSize : FreeStart;
+ if (alignTo(ObjStart, ObjAlign) != ObjStart)
+ continue;
+
+ if (FreeStart + ObjSize > StackBytesFree.size())
+ return false;
+
+ bool AllBytesFree = true;
+ for (unsigned Byte = 0; Byte < ObjSize; ++Byte)
+ if (!StackBytesFree.test(FreeStart + Byte)) {
+ AllBytesFree = false;
+ break;
+ }
+ if (AllBytesFree)
+ break;
+ }
+
+ if (FreeStart == -1)
+ return false;
+
+ if (StackGrowsDown) {
+ int ObjStart = -(FreeStart + ObjSize);
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << ObjStart
+ << "]\n");
+ MFI->setObjectOffset(FrameIdx, ObjStart);
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << FreeStart
+ << "]\n");
+ MFI->setObjectOffset(FrameIdx, FreeStart);
+ }
+
+ StackBytesFree.reset(FreeStart, FreeStart + ObjSize);
+ return true;
+}
+
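
Together, computeFreeStackSlots and scavengeStackSlot let later, small objects reuse padding holes between fixed and callee-save objects instead of growing the frame. A toy illustration of the BitVector bookkeeping (the byte values are made up; only the API usage mirrors the code above):

    #include "llvm/ADT/BitVector.h"

    void freeSlotExample() {
      llvm::BitVector StackBytesFree;
      StackBytesFree.resize(32, true); // 32 bytes of fixed/CSR area, all free
      StackBytesFree.reset(0, 8);      // an 8-byte fixed object at [0, 8)
      StackBytesFree.reset(12, 16);    // a 4-byte CSR spill at [12, 16)
      int Hole = StackBytesFree.find_first(); // == 8: a 4-byte hole at [8, 12)
      (void)Hole;
    }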
/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
/// those required to be close to the Stack Protector) to stack offsets.
static void
@@ -568,9 +723,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// If there are fixed sized objects that are preallocated in the local area,
// non-fixed objects can't be allocated right at the start of local area.
- // We currently don't support filling in holes in between fixed sized
- // objects, so we adjust 'Offset' to point to the end of last fixed sized
- // preallocated object.
+ // Adjust 'Offset' to point to the end of last fixed sized preallocated
+ // object.
for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
int64_t FixedOff;
if (StackGrowsDown) {
@@ -596,22 +750,27 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
unsigned Align = MFI->getObjectAlignment(i);
// Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
+ Offset = alignTo(Offset, Align, Skew);
+ DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n");
MFI->setObjectOffset(i, -Offset); // Set the computed offset
}
- } else {
- int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
- for (int i = MaxCSFI; i >= MinCSFI ; --i) {
+ } else if (MaxCSFrameIndex >= MinCSFrameIndex) {
+ // Be careful about underflow in comparisons against MinCSFrameIndex.
+ for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) {
unsigned Align = MFI->getObjectAlignment(i);
// Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
+ Offset = alignTo(Offset, Align, Skew);
+ DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n");
MFI->setObjectOffset(i, Offset);
Offset += MFI->getObjectSize(i);
}
}
+ // FixedCSEnd is the stack offset to the end of the fixed and callee-save
+ // stack area.
+ int64_t FixedCSEnd = Offset;
unsigned MaxAlign = MFI->getMaxAlignment();
// Make sure the special register scavenging spill slot is closest to the
@@ -638,7 +797,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
unsigned Align = MFI->getLocalFrameMaxAlign();
// Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
+ Offset = alignTo(Offset, Align, Skew);
DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
@@ -656,6 +815,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
MaxAlign = std::max(Align, MaxAlign);
}
+ // Retrieve the Exception Handler registration node.
+ int EHRegNodeFrameIndex = INT_MAX;
+ if (const WinEHFuncInfo *FuncInfo = Fn.getWinEHFuncInfo())
+ EHRegNodeFrameIndex = FuncInfo->EHRegNodeFrameIndex;
+
// Make sure that the stack protector comes before the local variables on the
// stack.
SmallSet<int, 16> ProtectedObjs;
@@ -678,7 +842,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
continue;
if (MFI->isDeadObjectIndex(i))
continue;
- if (MFI->getStackProtectorIndex() == (int)i)
+ if (MFI->getStackProtectorIndex() == (int)i ||
+ EHRegNodeFrameIndex == (int)i)
continue;
switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
@@ -705,8 +870,10 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
Offset, MaxAlign, Skew);
}
- // Then assign frame offsets to stack objects that are not used to spill
- // callee saved registers.
+ SmallVector<int, 8> ObjectsToAllocate;
+
+ // Then prepare to assign frame offsets to stack objects that are not used to
+ // spill callee saved registers.
for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
if (MFI->isObjectPreAllocated(i) &&
MFI->getUseLocalStackAllocationBlock())
@@ -717,14 +884,43 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
continue;
if (MFI->isDeadObjectIndex(i))
continue;
- if (MFI->getStackProtectorIndex() == (int)i)
+ if (MFI->getStackProtectorIndex() == (int)i ||
+ EHRegNodeFrameIndex == (int)i)
continue;
if (ProtectedObjs.count(i))
continue;
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+ // Add the objects that we need to allocate to our working set.
+ ObjectsToAllocate.push_back(i);
}
+ // Allocate the EH registration node first if one is present.
+ if (EHRegNodeFrameIndex != INT_MAX)
+ AdjustStackOffset(MFI, EHRegNodeFrameIndex, StackGrowsDown, Offset,
+ MaxAlign, Skew);
+
+ // Give the targets a chance to order the objects the way they like it.
+ if (Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+ Fn.getTarget().Options.StackSymbolOrdering)
+ TFI.orderFrameObjects(Fn, ObjectsToAllocate);
+
+ // Keep track of which bytes in the fixed and callee-save range are used so we
+ // can use the holes when allocating later stack objects. Only do this if
+ // the stack protector isn't being used, the target requests it, and we're
+ // optimizing.
+ BitVector StackBytesFree;
+ if (!ObjectsToAllocate.empty() &&
+ Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MFI->getStackProtectorIndex() < 0 && TFI.enableStackSlotScavenging(Fn))
+ computeFreeStackSlots(MFI, StackGrowsDown, MinCSFrameIndex, MaxCSFrameIndex,
+ FixedCSEnd, StackBytesFree);
+
+ // Now walk the objects and actually assign base offsets to them.
+ for (auto &Object : ObjectsToAllocate)
+ if (!scavengeStackSlot(MFI, Object, StackGrowsDown, MaxAlign,
+ StackBytesFree))
+ AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew);
+
// Make sure the special register scavenging spill slot is closest to the
// stack pointer.
if (RS && !EarlyScavengingSlots) {
@@ -757,7 +953,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
// If the frame pointer is eliminated, all frame offsets will be relative to
// SP not FP. Align to MaxAlign so this works.
StackAlign = std::max(StackAlign, MaxAlign);
- Offset = RoundUpToAlignment(Offset, StackAlign, Skew);
+ Offset = alignTo(Offset, StackAlign, Skew);
}
// Update frame info to pretend that this is part of the stack...
@@ -851,7 +1047,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
- if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
+ if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(*BB);
bool InsideCallSequence = false;
@@ -860,38 +1056,31 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
if (I->getOpcode() == FrameSetupOpcode ||
I->getOpcode() == FrameDestroyOpcode) {
InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
- SPAdj += TII.getSPAdjust(I);
-
- MachineBasicBlock::iterator PrevI = BB->end();
- if (I != BB->begin()) PrevI = std::prev(I);
- TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
+ SPAdj += TII.getSPAdjust(*I);
- // Visit the instructions created by eliminateCallFramePseudoInstr().
- if (PrevI == BB->end())
- I = BB->begin(); // The replaced instr was the first in the block.
- else
- I = std::next(PrevI);
+ I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
continue;
}
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
bool DoIncr = true;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (!MI->getOperand(i).isFI())
+ bool DidFinishLoop = true;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
continue;
// Frame indices in debug values are encoded in a target independent
// way with simply the frame index and offset rather than any
// target-specific addressing mode.
- if (MI->isDebugValue()) {
+ if (MI.isDebugValue()) {
assert(i == 0 && "Frame indices can only appear as the first "
"operand of a DBG_VALUE machine instruction");
unsigned Reg;
- MachineOperand &Offset = MI->getOperand(1);
- Offset.setImm(Offset.getImm() +
- TFI->getFrameIndexReference(
- Fn, MI->getOperand(0).getIndex(), Reg));
- MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+ MachineOperand &Offset = MI.getOperand(1);
+ Offset.setImm(
+ Offset.getImm() +
+ TFI->getFrameIndexReference(Fn, MI.getOperand(0).getIndex(), Reg));
+ MI.getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
continue;
}
@@ -900,18 +1089,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
// implementation other than historical accident. The only
// remaining difference is the unconditional use of the stack
// pointer as the base register.
- if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
- assert((!MI->isDebugValue() || i == 0) &&
+ if (MI.getOpcode() == TargetOpcode::STATEPOINT) {
+ assert((!MI.isDebugValue() || i == 0) &&
"Frame indicies can only appear as the first operand of a "
"DBG_VALUE machine instruction");
unsigned Reg;
- MachineOperand &Offset = MI->getOperand(i + 1);
- const unsigned refOffset =
- TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
- Reg);
-
+ MachineOperand &Offset = MI.getOperand(i + 1);
+ int refOffset = TFI->getFrameIndexReferencePreferSP(
+ Fn, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
Offset.setImm(Offset.getImm() + refOffset);
- MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+ MI.getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
continue;
}
@@ -937,7 +1124,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
DoIncr = false;
}
- MI = nullptr;
+ DidFinishLoop = false;
break;
}
@@ -948,45 +1135,46 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
// Note that this must come after eliminateFrameIndex, because
// if I itself referred to a frame index, we shouldn't count its own
// adjustment.
- if (MI && InsideCallSequence)
+ if (DidFinishLoop && InsideCallSequence)
SPAdj += TII.getSPAdjust(MI);
if (DoIncr && I != BB->end()) ++I;
// Update register states.
- if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+ if (RS && !FrameIndexVirtualScavenging && DidFinishLoop)
+ RS->forward(MI);
}
}
-/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers
/// with physical registers. Use the register scavenger to find an
/// appropriate register to use.
///
/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
/// iterate over the vreg use list, which at this point only contains machine
/// operands for which eliminateFrameIndex need a new scratch reg.
-void
-PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+static void
+doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {
// Run through the instructions and find any virtual registers.
- for (MachineFunction::iterator BB = Fn.begin(),
- E = Fn.end(); BB != E; ++BB) {
- RS->enterBasicBlock(&*BB);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ RS->enterBasicBlock(MBB);
int SPAdj = 0;
- // The instruction stream may change in the loop, so check BB->end()
+ // The instruction stream may change in the loop, so check MBB.end()
// directly.
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
// We might end up here again with a NULL iterator if we scavenged a
// register for which we inserted spill code for definition by what was
- // originally the first instruction in BB.
+ // originally the first instruction in MBB.
if (I == MachineBasicBlock::iterator(nullptr))
- I = BB->begin();
+ I = MBB.begin();
- MachineInstr *MI = I;
+ const MachineInstr &MI = *I;
MachineBasicBlock::iterator J = std::next(I);
MachineBasicBlock::iterator P =
- I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
+ I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
: std::prev(I);
// RS should process this instruction before we might scavenge at this
@@ -995,35 +1183,31 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
// instruction are available, and defined registers are not.
RS->forward(I);
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isReg()) {
- MachineOperand &MO = MI->getOperand(i);
- unsigned Reg = MO.getReg();
- if (Reg == 0)
- continue;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
-
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MI->getOperand(i).isDef() &&
- "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
- unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
-
- ++NumScavengedRegs;
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert (ScratchReg && "Missing scratch register!");
- Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
-
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS->setRegUsed(ScratchReg);
- }
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // When we first encounter a new virtual register, it
+ // must be a definition.
+ assert(MO.isDef() && "frame index virtual missing def!");
+ // Scavenge a new scratch register
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+ ++NumScavengedRegs;
+
+ // Replace this reference to the virtual register with the
+ // scratch register.
+ assert(ScratchReg && "Missing scratch register!");
+ MRI.replaceRegWith(Reg, ScratchReg);
+
+ // Because this instruction was processed by the RS before this
+ // register was allocated, make sure that the RS now records the
+ // register as being used.
+ RS->setRegUsed(ScratchReg);
}
// If the scavenger needed to use one of its spill slots, the
@@ -1031,7 +1215,7 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
// problem because we need the spill code before I: Move I to just
// prior to J.
if (I != std::prev(J)) {
- BB->splice(J, &*BB, I);
+ MBB.splice(J, &MBB, I);
// Before we move I, we need to prepare the RS to visit I again.
// Specifically, RS will assert if it sees uses of registers that
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 1f46417e61e7..804a4c3dad66 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -11,16 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
#include "llvm/Support/raw_ostream.h"
-#include <map>
using namespace llvm;
static const char *const PSVNames[] = {
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index 16ff48e78a79..93eeb9cba457 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -22,9 +22,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#ifndef NDEBUG
-#include "llvm/ADT/SparseBitVector.h"
-#endif
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -153,3 +150,12 @@ void RegAllocBase::allocatePhysRegs() {
}
}
}
+
+void RegAllocBase::postOptimization() {
+ spiller().postOptimization();
+ for (auto DeadInst : DeadRemats) {
+ LIS->RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
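
postOptimization() is the cleanup half of a new contract: allocators route rematerialization leftovers into DeadRemats through the extended LiveRangeEdit constructor (used below in RegAllocBasic) and delete them only once every sibling register has been allocated. A hedged sketch of the expected call pattern inside a RegAllocBase subclass (fragment; the surrounding allocator boilerplate is omitted):

    LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM,
                      /*Delegate=*/nullptr, &DeadRemats); // collect dead remats
    spiller().spill(LRE);
    // ...after the main allocation loop:
    allocatePhysRegs();
    postOptimization(); // erases everything accumulated in DeadRemats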
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index 659b8f505a22..296ffe8692c6 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -65,6 +65,12 @@ protected:
LiveRegMatrix *Matrix;
RegisterClassInfo RegClassInfo;
+ /// Instructions that define an original register and whose definitions are
+ /// already all dead after rematerialization are collected in DeadRemats.
+ /// Their deletion is postponed until all allocations are done, so the remat
+ /// expression stays available for rematerializing all siblings of the
+ /// original register.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
RegAllocBase()
: TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}
@@ -77,6 +83,10 @@ protected:
// physical register assignments.
void allocatePhysRegs();
+ // Run the spiller's post-optimization and remove dead defs left behind by
+ // rematerialization.
+ virtual void postOptimization();
+
// Get a temporary reference to a Spiller instance.
virtual Spiller &spiller() = 0;
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index cfe367d5115c..11dfda67377f 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -199,7 +199,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
Matrix->unassign(Spill);
// Spill the extracted interval.
- LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
spiller().spill(LRE);
}
return true;
@@ -258,7 +258,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
if (!VirtReg.isSpillable())
return ~0u;
- LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
spiller().spill(LRE);
// The live virtual register requesting allocation was spilled, so tell
@@ -283,6 +283,7 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
allocatePhysRegs();
+ postOptimization();
// Diagnostic output before rewriting
DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 8d7a7213ba07..55fb33edd720 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/STLExtras.h"
@@ -25,13 +24,12 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
@@ -52,6 +50,7 @@ namespace {
static char ID;
RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1),
isBulkSpilling(false) {}
+
private:
MachineFunction *MF;
MachineRegisterInfo *MRI;
@@ -159,6 +158,11 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getSetProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
private:
bool runOnMachineFunction(MachineFunction &Fn) override;
void AllocateBasicBlock();
@@ -174,7 +178,7 @@ namespace {
void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
void usePhysReg(MachineOperand&);
- void definePhysReg(MachineInstr *MI, unsigned PhysReg, RegState NewState);
+ void definePhysReg(MachineInstr &MI, unsigned PhysReg, RegState NewState);
unsigned calcSpillCost(unsigned PhysReg) const;
void assignVirtToPhysReg(LiveReg&, unsigned PhysReg);
LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
@@ -184,11 +188,11 @@ namespace {
return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
}
LiveRegMap::iterator assignVirtToPhysReg(unsigned VReg, unsigned PhysReg);
- LiveRegMap::iterator allocVirtReg(MachineInstr *MI, LiveRegMap::iterator,
+ LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
unsigned Hint);
- LiveRegMap::iterator defineVirtReg(MachineInstr *MI, unsigned OpNum,
+ LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
unsigned VirtReg, unsigned Hint);
- LiveRegMap::iterator reloadVirtReg(MachineInstr *MI, unsigned OpNum,
+ LiveRegMap::iterator reloadVirtReg(MachineInstr &MI, unsigned OpNum,
unsigned VirtReg, unsigned Hint);
void spillAll(MachineBasicBlock::iterator MI);
bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg);
@@ -280,7 +284,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
if (LR.Dirty) {
// If this physreg is used by the instruction, we want to kill it on the
// instruction, not on the spill.
- bool SpillKill = LR.LastUse != MI;
+ bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
LR.Dirty = false;
DEBUG(dbgs() << "Spilling " << PrintReg(LRI->VirtReg, TRI)
<< " in " << PrintReg(LR.PhysReg, TRI));
@@ -345,6 +349,11 @@ void RAFast::usePhysReg(MachineOperand &MO) {
unsigned PhysReg = MO.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
"Bad usePhysReg operand");
+
+ // Ignore undef uses.
+ if (MO.isUndef())
+ return;
+
markRegUsedInInstr(PhysReg);
switch (PhysRegState[PhysReg]) {
case regDisabled:
@@ -404,7 +413,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
/// definePhysReg - Mark PhysReg as reserved or free after spilling any
/// virtregs. This is very similar to defineVirtReg except the physreg is
/// reserved instead of allocated.
-void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
+void RAFast::definePhysReg(MachineInstr &MI, unsigned PhysReg,
RegState NewState) {
markRegUsedInInstr(PhysReg);
switch (unsigned VirtReg = PhysRegState[PhysReg]) {
@@ -512,7 +521,7 @@ RAFast::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) {
}
/// allocVirtReg - Allocate a physical register for VirtReg.
-RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
+RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr &MI,
LiveRegMap::iterator LRI,
unsigned Hint) {
const unsigned VirtReg = LRI->VirtReg;
@@ -577,18 +586,19 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
}
// Nothing we can do. Report an error and keep going with a bad allocation.
- if (MI->isInlineAsm())
- MI->emitError("inline assembly requires more registers than available");
+ if (MI.isInlineAsm())
+ MI.emitError("inline assembly requires more registers than available");
else
- MI->emitError("ran out of registers during register allocation");
+ MI.emitError("ran out of registers during register allocation");
definePhysReg(MI, *AO.begin(), regFree);
return assignVirtToPhysReg(VirtReg, *AO.begin());
}
/// defineVirtReg - Allocate a register for VirtReg and mark it as dirty.
-RAFast::LiveRegMap::iterator
-RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
- unsigned VirtReg, unsigned Hint) {
+RAFast::LiveRegMap::iterator RAFast::defineVirtReg(MachineInstr &MI,
+ unsigned OpNum,
+ unsigned VirtReg,
+ unsigned Hint) {
assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
"Not a virtual register");
LiveRegMap::iterator LRI;
@@ -607,11 +617,11 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
} else if (LRI->LastUse) {
// Redefining a live register - kill at the last use, unless it is this
// instruction defining VirtReg multiple times.
- if (LRI->LastUse != MI || LRI->LastUse->getOperand(LRI->LastOpNum).isUse())
+ if (LRI->LastUse != &MI || LRI->LastUse->getOperand(LRI->LastOpNum).isUse())
addKillFlag(*LRI);
}
assert(LRI->PhysReg && "Register not assigned");
- LRI->LastUse = MI;
+ LRI->LastUse = &MI;
LRI->LastOpNum = OpNum;
LRI->Dirty = true;
markRegUsedInInstr(LRI->PhysReg);
@@ -619,15 +629,16 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
}
/// reloadVirtReg - Make sure VirtReg is available in a physreg and return it.
-RAFast::LiveRegMap::iterator
-RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
- unsigned VirtReg, unsigned Hint) {
+RAFast::LiveRegMap::iterator RAFast::reloadVirtReg(MachineInstr &MI,
+ unsigned OpNum,
+ unsigned VirtReg,
+ unsigned Hint) {
assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
"Not a virtual register");
LiveRegMap::iterator LRI;
bool New;
std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
- MachineOperand &MO = MI->getOperand(OpNum);
+ MachineOperand &MO = MI.getOperand(OpNum);
if (New) {
LRI = allocVirtReg(MI, LRI, Hint);
const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
@@ -662,7 +673,7 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
MO.setIsDead(false);
}
assert(LRI->PhysReg && "Register not assigned");
- LRI->LastUse = MI;
+ LRI->LastUse = &MI;
LRI->LastOpNum = OpNum;
markRegUsedInInstr(LRI->PhysReg);
return LRI;
@@ -728,7 +739,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
markRegUsedInInstr(Reg);
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
if (ThroughRegs.count(PhysRegState[*AI]))
- definePhysReg(MI, *AI, regFree);
+ definePhysReg(*MI, *AI, regFree);
}
}
@@ -744,7 +755,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue;
DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand "
<< DefIdx << ".\n");
- LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, 0);
unsigned PhysReg = LRI->PhysReg;
setPhysReg(MI, i, PhysReg);
// Note: we don't update the def operand yet. That would cause the normal
@@ -753,7 +764,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
// Reload the register, but don't assign to the operand just yet.
// That would confuse the later phys-def processing pass.
- LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, 0);
PartialDefs.push_back(LRI->PhysReg);
}
}
@@ -767,7 +778,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
if (!MO.isEarlyClobber())
continue;
// Note: defineVirtReg may invalidate MO.
- LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0);
+ LiveRegMap::iterator LRI = defineVirtReg(*MI, i, Reg, 0);
unsigned PhysReg = LRI->PhysReg;
if (setPhysReg(MI, i, PhysReg))
VirtDead.push_back(Reg);
@@ -801,14 +812,14 @@ void RAFast::AllocateBasicBlock() {
// Add live-in registers as live.
for (const auto &LI : MBB->liveins())
if (MRI->isAllocatable(LI.PhysReg))
- definePhysReg(MII, LI.PhysReg, regReserved);
+ definePhysReg(*MII, LI.PhysReg, regReserved);
SmallVector<unsigned, 8> VirtDead;
SmallVector<MachineInstr*, 32> Coalesced;
// Otherwise, sequentially allocate each instruction in the MBB.
while (MII != MBB->end()) {
- MachineInstr *MI = MII++;
+ MachineInstr *MI = &*MII++;
const MCInstrDesc &MCID = MI->getDesc();
DEBUG({
dbgs() << "\n>> " << *MI << "Regs:";
@@ -943,8 +954,8 @@ void RAFast::AllocateBasicBlock() {
if (MO.isUse()) {
usePhysReg(MO);
} else if (MO.isEarlyClobber()) {
- definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
- regFree : regReserved);
+ definePhysReg(*MI, Reg,
+ (MO.isImplicit() || MO.isDead()) ? regFree : regReserved);
hasEarlyClobbers = true;
} else
hasPhysDefs = true;
@@ -977,7 +988,7 @@ void RAFast::AllocateBasicBlock() {
unsigned Reg = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
if (MO.isUse()) {
- LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, CopyDst);
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, CopyDst);
unsigned PhysReg = LRI->PhysReg;
CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0;
if (setPhysReg(MI, i, PhysReg))
@@ -1027,10 +1038,10 @@ void RAFast::AllocateBasicBlock() {
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
if (!MRI->isAllocatable(Reg)) continue;
- definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
+ definePhysReg(*MI, Reg, MO.isDead() ? regFree : regReserved);
continue;
}
- LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc);
+ LiveRegMap::iterator LRI = defineVirtReg(*MI, i, Reg, CopySrc);
unsigned PhysReg = LRI->PhysReg;
if (setPhysReg(MI, i, PhysReg)) {
VirtDead.push_back(Reg);
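The RegAllocFast changes above are mostly an interface migration from MachineInstr* to MachineInstr&; the one non-obvious piece is "MachineInstr *MI = &*MII++;", which advances the iterator first and then takes the address of the instruction it referred to. A minimal sketch of the same idiom, assuming only the standard MachineBasicBlock iteration API (the function name is illustrative, not part of the patch):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Advance the iterator before using the instruction, then work with a
// MachineInstr reference so helpers such as defineVirtReg(MachineInstr&, ...)
// can be called without passing pointers around.
static void visitBlock(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MII = MBB.begin();
  while (MII != MBB.end()) {
    MachineInstr &MI = *MII++; // same effect as `MachineInstr *MI = &*MII++;`
    (void)MI.getOpcode();      // ...operate on MI by reference from here on
  }
}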
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 945cb9e2c993..c4d4b1eadf3e 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "AllocationOrder.h"
#include "InterferenceCache.h"
#include "LiveDebugVariables.h"
@@ -33,6 +32,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
@@ -44,6 +44,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <queue>
@@ -55,14 +56,14 @@ STATISTIC(NumGlobalSplits, "Number of split global live ranges");
STATISTIC(NumLocalSplits, "Number of split local live ranges");
STATISTIC(NumEvicted, "Number of interferences evicted");
-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
- cl::desc("Spill mode for splitting live ranges"),
- cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
- clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
- clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
- clEnumValEnd),
- cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+ "split-spill-mode", cl::Hidden,
+ cl::desc("Spill mode for splitting live ranges"),
+ cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+ clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+ clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+ clEnumValEnd),
+ cl::init(SplitEditor::SM_Speed));
static cl::opt<unsigned>
LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@@ -128,6 +129,7 @@ class RAGreedy : public MachineFunctionPass,
EdgeBundles *Bundles;
SpillPlacement *SpillPlacer;
LiveDebugVariables *DebugVars;
+ AliasAnalysis *AA;
// state
std::unique_ptr<Spiller> SpillerInstance;
@@ -954,22 +956,28 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
// Interference for the live-in value.
if (BI.LiveIn) {
- if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number))
- BC.Entry = SpillPlacement::MustSpill, ++Ins;
- else if (Intf.first() < BI.FirstInstr)
- BC.Entry = SpillPlacement::PrefSpill, ++Ins;
- else if (Intf.first() < BI.LastInstr)
+ if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number)) {
+ BC.Entry = SpillPlacement::MustSpill;
+ ++Ins;
+ } else if (Intf.first() < BI.FirstInstr) {
+ BC.Entry = SpillPlacement::PrefSpill;
++Ins;
+ } else if (Intf.first() < BI.LastInstr) {
+ ++Ins;
+ }
}
// Interference for the live-out value.
if (BI.LiveOut) {
- if (Intf.last() >= SA->getLastSplitPoint(BC.Number))
- BC.Exit = SpillPlacement::MustSpill, ++Ins;
- else if (Intf.last() > BI.LastInstr)
- BC.Exit = SpillPlacement::PrefSpill, ++Ins;
- else if (Intf.last() > BI.FirstInstr)
+ if (Intf.last() >= SA->getLastSplitPoint(BC.Number)) {
+ BC.Exit = SpillPlacement::MustSpill;
++Ins;
+ } else if (Intf.last() > BI.LastInstr) {
+ BC.Exit = SpillPlacement::PrefSpill;
+ ++Ins;
+ } else if (Intf.last() > BI.FirstInstr) {
+ ++Ins;
+ }
}
// Accumulate the total frequency of inserted spill code.
@@ -1392,8 +1400,10 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
if (i == BestCand || !GlobalCand[i].PhysReg)
continue;
unsigned Count = GlobalCand[i].LiveBundles.count();
- if (Count < WorstCount)
- Worst = i, WorstCount = Count;
+ if (Count < WorstCount) {
+ Worst = i;
+ WorstCount = Count;
+ }
}
--NumCands;
GlobalCand[Worst] = GlobalCand[NumCands];
@@ -1457,7 +1467,7 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
SmallVectorImpl<unsigned> &NewVRegs) {
SmallVector<unsigned, 8> UsedCands;
// Prepare split editor.
- LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitSpillMode);
// Assign all edge bundles to the preferred candidate, or NoCand.
@@ -1505,7 +1515,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
unsigned Reg = VirtReg.reg;
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
- LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitSpillMode);
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1577,7 +1587,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Always enable split spill mode, since we're effectively spilling to a
// register.
- LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitEditor::SM_Size);
ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@@ -1900,7 +1910,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
<< '-' << Uses[BestAfter] << ", " << BestDiff
<< ", " << (BestAfter - BestBefore + 1) << " instrs\n");
- LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit);
SE->openIntv();
@@ -2543,7 +2553,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
NewVRegs.push_back(VirtReg.reg);
} else {
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
- LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+ LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
@@ -2583,6 +2593,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
Bundles = &getAnalysis<EdgeBundles>();
SpillPlacer = &getAnalysis<SpillPlacement>();
DebugVars = &getAnalysis<LiveDebugVariables>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
initializeCSRCost();
@@ -2591,7 +2602,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
DEBUG(LIS->dump());
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
- SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree, *MBFI));
+ SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI));
ExtraRegInfo.clear();
ExtraRegInfo.resize(MRI->getNumVirtRegs());
NextCascade = 1;
@@ -2601,6 +2612,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
allocatePhysRegs();
tryHintsRecoloring();
+ postOptimization();
+
releaseMemory();
return true;
}
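RAGreedy now threads a DeadRemats set into every LiveRangeEdit it creates and runs postOptimization() after allocatePhysRegs(), so that defs left fully dead by rematerialization are erased only once no sibling vreg can still need them as a remat source. A hedged sketch of that final cleanup, mirroring the RegAllocPBQP::postOptimization() shown further down (the free-function form and its name are illustrative):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Dead rematerialization sources are only recorded during allocation and
// removed here, after the last possible remat of a sibling register.
static void eraseDeadRemats(SmallPtrSetImpl<MachineInstr *> &DeadRemats,
                            LiveIntervals &LIS) {
  for (MachineInstr *DeadInst : DeadRemats) {
    LIS.RemoveMachineInstrFromMaps(*DeadInst);
    DeadInst->eraseFromParent();
  }
  DeadRemats.clear();
}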
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index fd28b05ed80a..d1221ec59bd4 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -123,6 +123,12 @@ private:
RegSet VRegsToAlloc, EmptyIntervalVRegs;
+ /// An instruction that defines an original reg and whose defs are already all
+ /// dead after remat is recorded in DeadRemats. Deleting such an instruction is
+ /// postponed until all allocations are done, so that its remat expression
+ /// stays available for rematerializing all siblings of the original reg.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
/// \brief Finds the initial set of vreg intervals to allocate.
void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
@@ -146,6 +152,7 @@ private:
void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
VirtRegMap &VRM) const;
+ void postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS);
};
char RegAllocPBQP::ID = 0;
@@ -631,7 +638,8 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
VirtRegMap &VRM, Spiller &VRegSpiller) {
VRegsToAlloc.erase(VReg);
- LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+ LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
+ nullptr, &DeadRemats);
VRegSpiller.spill(LRE);
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -713,6 +721,16 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
}
}
+void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
+ VRegSpiller.postOptimization();
+ /// Remove dead defs because of rematerialization.
+ for (auto DeadInst : DeadRemats) {
+ LIS.RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
+
static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
unsigned NumInstr) {
// All intervals have a spill weight that is mostly proportional to the number
@@ -798,6 +816,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
// Finalise allocation, allocate empty ranges.
finalizeAlloc(MF, LIS, VRM);
+ postOptimization(*VRegSpiller, LIS);
VRegsToAlloc.clear();
EmptyIntervalVRegs.clear();
@@ -839,7 +858,7 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
}
}
-void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
OS << "graph {\n";
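The dump() hunk above also adds LLVM_DUMP_METHOD (from llvm/Support/Compiler.h), which keeps debugging helpers from being optimized away so they remain callable from a debugger even when nothing in the tree calls them. A small sketch of the same pattern on a made-up class:

#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

namespace {
struct HypotheticalGraph {
  // Annotated so the body is preserved in builds where an uncalled dump()
  // would otherwise be stripped as unused.
  LLVM_DUMP_METHOD void dump() const { llvm::dbgs() << "graph {}\n"; }
};
} // end anonymous namespace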
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
new file mode 100644
index 000000000000..50b885423375
--- /dev/null
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -0,0 +1,142 @@
+//===-- RegUsageInfoCollector.cpp - Register Usage Information Collector --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass is a simple MachineFunction pass that collects register usage
+/// details by iterating over each physical register and checking
+/// MRI::isPhysRegModified(), then creates a RegMask from those details.
+/// The pass then stores this RegMask in PhysicalRegisterUsageInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+STATISTIC(NumCSROpt,
+ "Number of functions optimized for callee saved registers");
+
+namespace llvm {
+void initializeRegUsageInfoCollectorPass(PassRegistry &);
+}
+
+namespace {
+class RegUsageInfoCollector : public MachineFunctionPass {
+public:
+ RegUsageInfoCollector() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoCollectorPass(Registry);
+ }
+
+ const char *getPassName() const override {
+ return "Register Usage Information Collector Pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ void markRegClobbered(const TargetRegisterInfo *TRI, uint32_t *RegMask,
+ unsigned PReg);
+};
+} // end of anonymous namespace
+
+char RegUsageInfoCollector::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+
+FunctionPass *llvm::createRegUsageInfoCollector() {
+ return new RegUsageInfoCollector();
+}
+
+void RegUsageInfoCollector::markRegClobbered(const TargetRegisterInfo *TRI,
+ uint32_t *RegMask, unsigned PReg) {
+ // If PReg is clobbered then all of its aliases are also clobbered.
+ for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+ RegMask[*AI / 32] &= ~(1u << *AI % 32);
+}
+
+void RegUsageInfoCollector::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetMachine &TM = MF.getTarget();
+
+ DEBUG(dbgs() << " -------------------- " << getPassName()
+ << " -------------------- \n");
+ DEBUG(dbgs() << "Function Name : " << MF.getName() << "\n");
+
+ std::vector<uint32_t> RegMask;
+
+ // Compute the size of the bit vector to represent all the registers.
+ // The bit vector is broken into 32-bit chunks, thus takes the ceil of
+ // the number of registers divided by 32 for the size.
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ RegMask.resize(RegMaskSize, 0xFFFFFFFF);
+
+ const Function *F = MF.getFunction();
+
+ PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+
+ PRUI->setTargetMachine(&TM);
+
+ DEBUG(dbgs() << "Clobbered Registers: ");
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
+ if (MRI->isPhysRegModified(PReg, true))
+ markRegClobbered(TRI, &RegMask[0], PReg);
+
+ if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
+ const uint32_t *CallPreservedMask =
+ TRI->getCallPreservedMask(MF, F->getCallingConv());
+ // Mark callee-saved registers as preserved.
+ for (unsigned i = 0; i < RegMaskSize; ++i)
+ RegMask[i] = RegMask[i] | CallPreservedMask[i];
+ } else {
+ ++NumCSROpt;
+ DEBUG(dbgs() << MF.getName()
+ << " function optimized for not having CSR.\n");
+ }
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
+ if (MachineOperand::clobbersPhysReg(&(RegMask[0]), PReg))
+ DEBUG(dbgs() << TRI->getName(PReg) << " ");
+
+ DEBUG(dbgs() << " \n----------------------------------------\n");
+
+ PRUI->storeUpdateRegUsageInfo(F, std::move(RegMask));
+
+ return false;
+}
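The RegMask built by this pass packs one bit per physical register into 32-bit words: a set bit means "preserved", a cleared bit means "clobbered", which is the layout MachineOperand::clobbersPhysReg() reads. A standalone illustration of that arithmetic (register count and register number are made up):

#include <cstdint>
#include <vector>

int main() {
  // Same sizing as the pass: ceil(NumRegs / 32) words, all bits initially set,
  // i.e. every register starts out as "preserved".
  const unsigned NumRegs = 100;
  std::vector<uint32_t> RegMask((NumRegs + 31) / 32, 0xFFFFFFFF);

  unsigned Reg = 37;                        // hypothetical physical register
  RegMask[Reg / 32] &= ~(1u << (Reg % 32)); // mark it (and, in the pass, its
                                            // aliases) as clobbered

  bool Clobbered = (RegMask[Reg / 32] & (1u << (Reg % 32))) == 0;
  return Clobbered ? 0 : 1;                 // exits 0: the bit is cleared
}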
diff --git a/lib/CodeGen/RegUsageInfoPropagate.cpp b/lib/CodeGen/RegUsageInfoPropagate.cpp
new file mode 100644
index 000000000000..759566147cc7
--- /dev/null
+++ b/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -0,0 +1,131 @@
+//=--- RegUsageInfoPropagate.cpp - Register Usage Information Propagation --=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass iterates through the MachineInstrs in a given MachineFunction and
+/// at each call site queries RegisterUsageInfo for the RegMask (calculated from
+/// the actual register allocation) of the callee function. If that RegMask is
+/// available, the pass updates the RegMask of the call instruction.
+/// This updated RegMask will be used by the register allocator while allocating
+/// the current MachineFunction.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+#include <string>
+
+namespace llvm {
+void initializeRegUsageInfoPropagationPassPass(PassRegistry &);
+}
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+#define RUIP_NAME "Register Usage Information Propagation"
+
+namespace {
+class RegUsageInfoPropagationPass : public MachineFunctionPass {
+
+public:
+ RegUsageInfoPropagationPass() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoPropagationPassPass(Registry);
+ }
+
+ const char *getPassName() const override { return RUIP_NAME; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ static char ID;
+
+private:
+ static void setRegMask(MachineInstr &MI, const uint32_t *RegMask) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask())
+ MO.setRegMask(RegMask);
+ }
+ }
+};
+} // end of anonymous namespace
+char RegUsageInfoPropagationPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoPropagationPass, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoPropagationPass, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+
+FunctionPass *llvm::createRegUsageInfoPropPass() {
+ return new RegUsageInfoPropagationPass();
+}
+
+void RegUsageInfoPropagationPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegUsageInfoPropagationPass::runOnMachineFunction(MachineFunction &MF) {
+ const Module *M = MF.getFunction()->getParent();
+ PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+
+ DEBUG(dbgs() << " ++++++++++++++++++++ " << getPassName()
+ << " ++++++++++++++++++++ \n");
+ DEBUG(dbgs() << "MachineFunction : " << MF.getName() << "\n");
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isCall())
+ continue;
+ DEBUG(dbgs()
+ << "Call Instruction Before Register Usage Info Propagation : \n");
+ DEBUG(dbgs() << MI << "\n");
+
+ auto UpdateRegMask = [&](const Function *F) {
+ const auto *RegMask = PRUI->getRegUsageInfo(F);
+ if (!RegMask)
+ return;
+ setRegMask(MI, &(*RegMask)[0]);
+ Changed = true;
+ };
+
+ MachineOperand &Operand = MI.getOperand(0);
+ if (Operand.isGlobal())
+ UpdateRegMask(cast<Function>(Operand.getGlobal()));
+ else if (Operand.isSymbol())
+ UpdateRegMask(M->getFunction(Operand.getSymbolName()));
+
+ DEBUG(dbgs()
+ << "Call Instruction After Register Usage Info Propagation : \n");
+ DEBUG(dbgs() << MI << "\n");
+ }
+ }
+
+ DEBUG(dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+ "++++++ \n");
+ return Changed;
+}
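Once the propagation pass has installed the callee's precise mask on a call, later passes read it like any other regmask operand. A hedged sketch of such a query (the helper is illustrative; it assumes only MachineOperand::isRegMask()/getRegMask() and the static clobbersPhysReg() already used by the collector above):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// True if the call clobbers PReg according to whatever regmask operand it
// carries, possibly the refined one set by RegUsageInfoPropagationPass.
static bool callClobbers(const MachineInstr &Call, unsigned PReg) {
  for (const MachineOperand &MO : Call.operands())
    if (MO.isRegMask() && MachineOperand::clobbersPhysReg(MO.getRegMask(), PReg))
      return true;
  return false;
}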
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index c1ff13ec7ca0..617ece902e0e 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -203,6 +203,16 @@ namespace {
/// make sure to set it to the correct physical subregister.
void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
+ /// If the given machine operand reads only undefined lanes add an undef
+ /// flag.
+ /// This can happen when undef uses were previously concealed by a copy
+ /// which we coalesced. Example:
+ /// %vreg0:sub0<def,read-undef> = ...
+ /// %vreg1 = COPY %vreg0 <-- Coalescing COPY reveals undef
+ /// = use %vreg1:sub1 <-- hidden undef use
+ void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx);
+
/// Handle copies of undef values.
/// Returns true if @p CopyMI was a copy of an undef value and eliminated.
bool eliminateUndefCopy(MachineInstr *CopyMI);
@@ -467,7 +477,7 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
LiveInterval &IntB =
LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
- SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
// We have a non-trivially-coalescable copy with IntA being the source and
// IntB being the dest, thus this defines a value number in IntB. If the
@@ -642,7 +652,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
// BValNo is a value number in B that is defined by a copy from A. 'B1' in
// the example above.
- SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
assert(BValNo != nullptr && BValNo->def == CopyIdx);
@@ -674,7 +684,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
// operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
// op#2<->op#3) of commute transformation should be considered/tried here.
unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex;
- if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx))
+ if (!TII->findCommutedOpIndices(*DefMI, UseOpIdx, NewDstIdx))
return false;
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
@@ -692,7 +702,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg)) {
MachineInstr *UseMI = MO.getParent();
unsigned OpNo = &MO - &UseMI->getOperand(0);
- SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI);
LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
if (US == IntA.end() || US->valno != AValNo)
continue;
@@ -708,7 +718,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
// transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent();
MachineInstr *NewMI =
- TII->commuteInstruction(DefMI, false, UseOpIdx, NewDstIdx);
+ TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
if (!NewMI)
return false;
if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
@@ -716,7 +726,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
return false;
if (NewMI != DefMI) {
- LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);
+ LIS->ReplaceMachineInstrInMaps(*DefMI, *NewMI);
MachineBasicBlock::iterator Pos = DefMI;
MBB->insert(Pos, NewMI);
MBB->erase(DefMI);
@@ -746,7 +756,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
UseMO.setReg(NewReg);
continue;
}
- SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(true);
LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
assert(US != IntA.end() && "Use must be live");
if (US->valno != AValNo)
@@ -784,7 +794,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
}
ErasedInstrs.insert(UseMI);
- LIS->RemoveMachineInstrFromMaps(UseMI);
+ LIS->RemoveMachineInstrFromMaps(*UseMI);
UseMI->eraseFromParent();
}
@@ -879,7 +889,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
return false;
LiveInterval &SrcInt = LIS->getInterval(SrcReg);
- SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
assert(ValNo && "CopyMI input register not live");
if (ValNo->isPHIDef() || ValNo->isUnused())
@@ -891,9 +901,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
IsDefCopy = true;
return false;
}
- if (!TII->isAsCheapAsAMove(DefMI))
+ if (!TII->isAsCheapAsAMove(*DefMI))
return false;
- if (!TII->isTriviallyReMaterializable(DefMI, AA))
+ if (!TII->isTriviallyReMaterializable(*DefMI, AA))
return false;
if (!definesFullReg(*DefMI, SrcReg))
return false;
@@ -939,11 +949,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
}
}
+ DebugLoc DL = CopyMI->getDebugLoc();
MachineBasicBlock *MBB = CopyMI->getParent();
MachineBasicBlock::iterator MII =
std::next(MachineBasicBlock::iterator(CopyMI));
- TII->reMaterialize(*MBB, MII, DstReg, SrcIdx, DefMI, *TRI);
- MachineInstr *NewMI = std::prev(MII);
+ TII->reMaterialize(*MBB, MII, DstReg, SrcIdx, *DefMI, *TRI);
+ MachineInstr &NewMI = *std::prev(MII);
+ NewMI.setDebugLoc(DL);
// In a situation like the following:
// %vreg0:subreg = instr ; DefMI, subreg = DstIdx
@@ -952,7 +964,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// %vreg1 = instr
const TargetRegisterClass *NewRC = CP.getNewRC();
if (DstIdx != 0) {
- MachineOperand &DefMO = NewMI->getOperand(0);
+ MachineOperand &DefMO = NewMI.getOperand(0);
if (DefMO.getSubReg() == DstIdx) {
assert(SrcIdx == 0 && CP.isFlipped()
&& "Shouldn't have SrcIdx+DstIdx at this point");
@@ -967,7 +979,24 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
}
}
- LIS->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+ // CopyMI may have implicit operands, save them so that we can transfer them
+ // over to the newly materialized instruction after CopyMI is removed.
+ SmallVector<MachineOperand, 4> ImplicitOps;
+ ImplicitOps.reserve(CopyMI->getNumOperands() -
+ CopyMI->getDesc().getNumOperands());
+ for (unsigned I = CopyMI->getDesc().getNumOperands(),
+ E = CopyMI->getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = CopyMI->getOperand(I);
+ if (MO.isReg()) {
+ assert(MO.isImplicit() && "No explicit operands after implicit operands.");
+ // Discard VReg implicit defs.
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ ImplicitOps.push_back(MO);
+ }
+ }
+
+ LIS->ReplaceMachineInstrInMaps(*CopyMI, NewMI);
CopyMI->eraseFromParent();
ErasedInstrs.insert(CopyMI);
@@ -975,9 +1004,10 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// We need to remember these so we can add intervals once we insert
// NewMI into SlotIndexes.
SmallVector<unsigned, 4> NewMIImplDefs;
- for (unsigned i = NewMI->getDesc().getNumOperands(),
- e = NewMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = NewMI->getOperand(i);
+ for (unsigned i = NewMI.getDesc().getNumOperands(),
+ e = NewMI.getNumOperands();
+ i != e; ++i) {
+ MachineOperand &MO = NewMI.getOperand(i);
if (MO.isReg() && MO.isDef()) {
assert(MO.isImplicit() && MO.isDead() &&
TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
@@ -986,7 +1016,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
}
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
- unsigned NewIdx = NewMI->getOperand(0).getSubReg();
+ unsigned NewIdx = NewMI.getOperand(0).getSubReg();
if (DefRC != nullptr) {
if (NewIdx)
@@ -995,20 +1025,54 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
NewRC = TRI->getCommonSubClass(NewRC, DefRC);
assert(NewRC && "subreg chosen for remat incompatible with instruction");
}
+ // Remap subranges to new lanemask and change register class.
+ LiveInterval &DstInt = LIS->getInterval(DstReg);
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ SR.LaneMask = TRI->composeSubRegIndexLaneMask(DstIdx, SR.LaneMask);
+ }
MRI->setRegClass(DstReg, NewRC);
+ // Update machine operands and add flags.
updateRegDefsUses(DstReg, DstReg, DstIdx);
- NewMI->getOperand(0).setSubReg(NewIdx);
- } else if (NewMI->getOperand(0).getReg() != CopyDstReg) {
+ NewMI.getOperand(0).setSubReg(NewIdx);
+ // Add dead subregister definitions if we are defining the whole register
+ // but only part of it is live.
+ // This could happen if the rematerialization instruction is rematerializing
+ // more than is actually used of the register.
+ // An example would be:
+ // vreg1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
+ // ; Copying only part of the register here, but the rest is undef.
+ // vreg2:sub_16bit<def, read-undef> = COPY vreg1:sub_16bit
+ // ==>
+ // ; Materialize all the constants but only using one
+ // vreg2 = LOAD_CONSTANTS 5, 8
+ //
+ // at this point for the part that wasn't defined before we could have
+ // subranges missing the definition.
+ if (NewIdx == 0 && DstInt.hasSubRanges()) {
+ SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+ SlotIndex DefIndex =
+ CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(DstReg);
+ VNInfo::Allocator& Alloc = LIS->getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ if (!SR.liveAt(DefIndex))
+ SR.createDeadDef(DefIndex, Alloc);
+ MaxMask &= ~SR.LaneMask;
+ }
+ if (MaxMask != 0) {
+ LiveInterval::SubRange *SR = DstInt.createSubRange(Alloc, MaxMask);
+ SR->createDeadDef(DefIndex, Alloc);
+ }
+ }
+ } else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
// The New instruction may be defining a sub-register of what's actually
// been asked for. If so it must implicitly define the whole thing.
assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
"Only expect virtual or physical registers in remat");
- NewMI->getOperand(0).setIsDead(true);
- NewMI->addOperand(MachineOperand::CreateReg(CopyDstReg,
- true /*IsDef*/,
- true /*IsImp*/,
- false /*IsKill*/));
+ NewMI.getOperand(0).setIsDead(true);
+ NewMI.addOperand(MachineOperand::CreateReg(
+ CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
// Record small dead def live-ranges for all the subregisters
// of the destination register.
// Otherwise, variables that live through may miss some
@@ -1026,28 +1090,18 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// no live-ranges would have been created for ECX.
// Fix that!
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
- for (MCRegUnitIterator Units(NewMI->getOperand(0).getReg(), TRI);
+ for (MCRegUnitIterator Units(NewMI.getOperand(0).getReg(), TRI);
Units.isValid(); ++Units)
if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
- if (NewMI->getOperand(0).getSubReg())
- NewMI->getOperand(0).setIsUndef();
+ if (NewMI.getOperand(0).getSubReg())
+ NewMI.getOperand(0).setIsUndef();
- // CopyMI may have implicit operands, transfer them over to the newly
- // rematerialized instruction. And update implicit def interval valnos.
- for (unsigned i = CopyMI->getDesc().getNumOperands(),
- e = CopyMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = CopyMI->getOperand(i);
- if (MO.isReg()) {
- assert(MO.isImplicit() && "No explicit operands after implict operands.");
- // Discard VReg implicit defs.
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
- NewMI->addOperand(MO);
- }
- }
- }
+ // Transfer over implicit operands to the rematerialized instruction.
+ for (MachineOperand &MO : ImplicitOps)
+ NewMI.addOperand(MO);
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
@@ -1057,7 +1111,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
- DEBUG(dbgs() << "Remat: " << *NewMI);
+ DEBUG(dbgs() << "Remat: " << NewMI);
++NumReMats;
// The source interval can become smaller because we removed a use.
@@ -1093,7 +1147,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
- SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
+ SlotIndex Idx = LIS->getInstructionIndex(*CopyMI);
const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
// CopyMI is undef iff SrcReg is not live before the instruction.
if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
@@ -1136,7 +1190,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
if (MO.isDef() /*|| MO.isUndef()*/)
continue;
const MachineInstr &MI = *MO.getParent();
- SlotIndex UseIdx = LIS->getInstructionIndex(&MI);
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI);
LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
bool isLive;
if (UseMask != ~0u && DstLI.hasSubRanges()) {
@@ -1159,12 +1213,51 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
return true;
}
+void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx) {
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ if (MO.isDef())
+ Mask = ~Mask;
+ bool IsUndef = true;
+ for (const LiveInterval::SubRange &S : Int.subranges()) {
+ if ((S.LaneMask & Mask) == 0)
+ continue;
+ if (S.liveAt(UseIdx)) {
+ IsUndef = false;
+ break;
+ }
+ }
+ if (IsUndef) {
+ MO.setIsUndef(true);
+ // We found out some subregister use is actually reading an undefined
+ // value. In some cases the whole vreg has become undefined at this
+ // point so we have to potentially shrink the main range if the
+ // use was ending a live segment there.
+ LiveQueryResult Q = Int.Query(UseIdx);
+ if (Q.valueOut() == nullptr)
+ ShrinkMainRange = true;
+ }
+}
+
void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
unsigned DstReg,
unsigned SubIdx) {
bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
+ if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
+ for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0 || MO.isUndef())
+ continue;
+ MachineInstr &MI = *MO.getParent();
+ if (MI.isDebugValue())
+ continue;
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(true);
+ addUndefFlag(*DstInt, UseIdx, MO, SubReg);
+ }
+ }
+
SmallPtrSet<MachineInstr*, 8> Visited;
for (MachineRegisterInfo::reg_instr_iterator
I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end();
@@ -1186,7 +1279,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
// If SrcReg wasn't read, it may still be the case that DstReg is live-in
// because SrcReg is a sub-register.
if (DstInt && !Reads && SubIdx)
- Reads = DstInt->liveAt(LIS->getInstructionIndex(UseMI));
+ Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
@@ -1206,30 +1299,11 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
}
- LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubIdx);
- bool IsUndef = true;
SlotIndex MIIdx = UseMI->isDebugValue()
- ? LIS->getSlotIndexes()->getIndexBefore(UseMI)
- : LIS->getInstructionIndex(UseMI);
+ ? LIS->getSlotIndexes()->getIndexBefore(*UseMI)
+ : LIS->getInstructionIndex(*UseMI);
SlotIndex UseIdx = MIIdx.getRegSlot(true);
- for (LiveInterval::SubRange &S : DstInt->subranges()) {
- if ((S.LaneMask & Mask) == 0)
- continue;
- if (S.liveAt(UseIdx)) {
- IsUndef = false;
- break;
- }
- }
- if (IsUndef) {
- MO.setIsUndef(true);
- // We found out some subregister use is actually reading an undefined
- // value. In some cases the whole vreg has become undefined at this
- // point so we have to potentially shrink the main range if the
- // use was ending a live segment there.
- LiveQueryResult Q = DstInt->Query(MIIdx);
- if (Q.valueOut() == nullptr)
- ShrinkMainRange = true;
- }
+ addUndefFlag(*DstInt, UseIdx, MO, SubIdx);
}
if (DstIsPhys)
@@ -1241,7 +1315,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
DEBUG({
dbgs() << "\t\tupdated: ";
if (!UseMI->isDebugValue())
- dbgs() << LIS->getInstructionIndex(UseMI) << "\t";
+ dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
dbgs() << *UseMI;
});
}
@@ -1267,7 +1341,7 @@ bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
Again = false;
- DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
+ DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
CoalescerPair CP(*TRI);
if (!CP.setRegisters(CopyMI)) {
@@ -1303,7 +1377,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// Eliminate undefs.
if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
- LIS->RemoveMachineInstrFromMaps(CopyMI);
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
return false; // Not coalescable.
}
@@ -1314,7 +1388,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
if (CP.getSrcReg() == CP.getDstReg()) {
LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
- const SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
+ const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
LiveQueryResult LRQ = LI.Query(CopyIdx);
if (VNInfo *DefVNI = LRQ.valueDefined()) {
VNInfo *ReadVNI = LRQ.valueIn();
@@ -1332,7 +1406,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
}
DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
}
- LIS->RemoveMachineInstrFromMaps(CopyMI);
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
return true;
}
@@ -1393,7 +1467,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
if (!CP.isPartial() && !CP.isPhys()) {
if (adjustCopiesBackFrom(CP, CopyMI) ||
removeCopyByCommutingDef(CP, CopyMI)) {
- LIS->RemoveMachineInstrFromMaps(CopyMI);
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
DEBUG(dbgs() << "\tTrivial!\n");
return true;
@@ -1507,8 +1581,8 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
- const SlotIndex CopyRegIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
- const SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
+ const SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ const SlotIndex DestRegIdx = LIS->getInstructionIndex(*DestMI).getRegSlot();
// We checked above that there are no interfering defs of the physical
// register. However, for this case, where we intend to move up the def of
@@ -1544,7 +1618,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
}
}
- LIS->RemoveMachineInstrFromMaps(CopyMI);
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
// We don't track kills for reserved registers.
@@ -1775,7 +1849,7 @@ class JoinVals {
/// Return true if MI uses any of the given Lanes from Reg.
/// This does not include partial redefinitions of Reg.
- bool usesLanes(const MachineInstr *MI, unsigned, unsigned, LaneBitmask) const;
+ bool usesLanes(const MachineInstr &MI, unsigned, unsigned, LaneBitmask) const;
/// Determine if ValNo is a copy of a value number in LR or Other.LR that will
/// be pruned:
@@ -2025,7 +2099,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
// IMPLICIT_DEF instructions behind, and there is nothing wrong with it
// technically.
//
- // WHen it happens, treat that IMPLICIT_DEF as a normal value, and don't try
+ // When it happens, treat that IMPLICIT_DEF as a normal value, and don't try
// to erase the IMPLICIT_DEF instruction.
if (OtherV.ErasableImplicitDef && DefMI &&
DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
@@ -2219,11 +2293,11 @@ taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
return true;
}
-bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+bool JoinVals::usesLanes(const MachineInstr &MI, unsigned Reg, unsigned SubIdx,
LaneBitmask Lanes) const {
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return false;
- for (const MachineOperand &MO : MI->operands()) {
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg)
continue;
if (!MO.readsReg())
@@ -2278,7 +2352,7 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
unsigned TaintNum = 0;
for(;;) {
assert(MI != MBB->end() && "Bad LastMI");
- if (usesLanes(MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
+ if (usesLanes(*MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
return false;
}
@@ -2457,7 +2531,7 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
}
ErasedInstrs.insert(MI);
DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
- LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
break;
}
@@ -2838,16 +2912,15 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
}
else {
SmallVector<MachineInstr*, 2> Terminals;
- for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
- MII != E; ++MII)
- if (MII->isCopyLike()) {
- if (applyTerminalRule(*MII))
- Terminals.push_back(&(*MII));
+ for (MachineInstr &MII : *MBB)
+ if (MII.isCopyLike()) {
+ if (applyTerminalRule(MII))
+ Terminals.push_back(&MII);
else
- WorkList.push_back(MII);
- }
- // Append the copies evicted by the terminal rule at the end of the list.
- WorkList.append(Terminals.begin(), Terminals.end());
+ WorkList.push_back(&MII);
+ }
+ // Append the copies evicted by the terminal rule at the end of the list.
+ WorkList.append(Terminals.begin(), Terminals.end());
}
// Try coalescing the collected copies immediately, and remove the nulls.
// This prevents the WorkList from getting too large since most copies are
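The new addUndefFlag() decides whether an operand reads an undefined value purely from subrange lane masks: the use reads undef if no subrange overlapping the operand's lanes is live at the use index. A simplified standalone model of that check (SubRange below merely stands in for LiveInterval::SubRange plus a precomputed liveAt() answer):

#include <cstdint>
#include <vector>

typedef unsigned LaneBitmask; // stand-in for llvm::LaneBitmask, an unsigned here

struct SubRange {
  LaneBitmask LaneMask;
  bool LiveAtUse; // what S.liveAt(UseIdx) would return for this use
};

// Mirrors the loop in RegisterCoalescer::addUndefFlag(): undef iff none of
// the overlapping subranges is live at the use.
static bool readsUndef(const std::vector<SubRange> &SubRanges,
                       LaneBitmask UseMask) {
  for (const SubRange &S : SubRanges) {
    if ((S.LaneMask & UseMask) == 0)
      continue;
    if (S.LiveAtUse)
      return false;
  }
  return true;
}

int main() {
  // Two 16-bit halves of a vreg: the low half is live, the high half is not.
  std::vector<SubRange> SRs = {{0x1, true}, {0x2, false}};
  return readsUndef(SRs, /*UseMask=*/0x2) ? 0 : 1; // high-half use reads undef
}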
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index f33dc3e10492..a21d6c1d4d63 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -24,7 +24,13 @@ using namespace llvm;
/// Increase pressure for each pressure set provided by TargetRegisterInfo.
static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
- PSetIterator PSetI) {
+ const MachineRegisterInfo &MRI, unsigned Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ assert((PrevMask & ~NewMask) == 0 && "Must not remove bits");
+ if (PrevMask != 0 || NewMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
unsigned Weight = PSetI.getWeight();
for (; PSetI.isValid(); ++PSetI)
CurrSetPressure[*PSetI] += Weight;
@@ -32,7 +38,13 @@ static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
/// Decrease pressure for each pressure set provided by TargetRegisterInfo.
static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
- PSetIterator PSetI) {
+ const MachineRegisterInfo &MRI, unsigned Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ assert((NewMask & ~PrevMask) == 0 && "Must not add bits");
+ if (NewMask != 0 || PrevMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
unsigned Weight = PSetI.getWeight();
for (; PSetI.isValid(); ++PSetI) {
assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
@@ -59,12 +71,20 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
dbgs() << "Max Pressure: ";
dumpRegSetPressure(MaxSetPressure, TRI);
dbgs() << "Live In: ";
- for (unsigned Reg : LiveInRegs)
- dbgs() << PrintVRegOrUnit(Reg, TRI) << " ";
+ for (const RegisterMaskPair &P : LiveInRegs) {
+ dbgs() << PrintVRegOrUnit(P.RegUnit, TRI);
+ if (P.LaneMask != ~0u)
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
dbgs() << '\n';
dbgs() << "Live Out: ";
- for (unsigned Reg : LiveOutRegs)
- dbgs() << PrintVRegOrUnit(Reg, TRI) << " ";
+ for (const RegisterMaskPair &P : LiveOutRegs) {
+ dbgs() << PrintVRegOrUnit(P.RegUnit, TRI);
+ if (P.LaneMask != ~0u)
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
dbgs() << '\n';
}
@@ -89,24 +109,25 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
dbgs() << '\n';
}
-/// Increase the current pressure as impacted by these registers and bump
-/// the high water mark if needed.
-void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
- for (unsigned RegUnit : RegUnits) {
- PSetIterator PSetI = MRI->getPressureSets(RegUnit);
- unsigned Weight = PSetI.getWeight();
- for (; PSetI.isValid(); ++PSetI) {
- CurrSetPressure[*PSetI] += Weight;
- P.MaxSetPressure[*PSetI] =
- std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
- }
+void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ if (PreviousMask != 0 || NewMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ CurrSetPressure[*PSetI] += Weight;
+ P.MaxSetPressure[*PSetI] =
+ std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
}
}
-/// Simply decrease the current pressure as impacted by these registers.
-void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) {
- for (unsigned RegUnit : RegUnits)
- decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit));
+void RegPressureTracker::decreaseRegPressure(unsigned RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ decreaseSetPressure(CurrSetPressure, *MRI, RegUnit, PreviousMask, NewMask);
}
/// Clear the result so it can be used for another round of pressure tracking.
@@ -201,8 +222,7 @@ void RegPressureTracker::init(const MachineFunction *mf,
const LiveIntervals *lis,
const MachineBasicBlock *mbb,
MachineBasicBlock::const_iterator pos,
- bool ShouldTrackUntiedDefs)
-{
+ bool TrackLaneMasks, bool TrackUntiedDefs) {
reset();
MF = mf;
@@ -210,7 +230,8 @@ void RegPressureTracker::init(const MachineFunction *mf,
RCI = rci;
MRI = &MF->getRegInfo();
MBB = mbb;
- TrackUntiedDefs = ShouldTrackUntiedDefs;
+ this->TrackUntiedDefs = TrackUntiedDefs;
+ this->TrackLaneMasks = TrackLaneMasks;
if (RequireIntervals) {
assert(lis && "IntervalPressure requires LiveIntervals");
@@ -250,7 +271,7 @@ SlotIndex RegPressureTracker::getCurrSlot() const {
++IdxPos;
if (IdxPos == MBB->end())
return LIS->getMBBEndIdx(MBB);
- return LIS->getInstructionIndex(IdxPos).getRegSlot();
+ return LIS->getInstructionIndex(*IdxPos).getRegSlot();
}
/// Set the boundary for the top of the region and summarize live ins.
@@ -297,20 +318,106 @@ void RegPressureTracker::closeRegion() {
void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
assert(isBottomClosed() && "need bottom-up tracking to initialize.");
- for (unsigned Reg : P.LiveOutRegs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)
- && !RPTracker.hasUntiedDef(Reg)) {
- increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg));
+ for (const RegisterMaskPair &Pair : P.LiveOutRegs) {
+ unsigned RegUnit = Pair.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit)
+ && !RPTracker.hasUntiedDef(RegUnit))
+ increaseSetPressure(LiveThruPressure, *MRI, RegUnit, 0, Pair.LaneMask);
+ }
+}
+
+static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
+ unsigned RegUnit) {
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end())
+ return 0;
+ return I->LaneMask;
+}
+
+static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ unsigned RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask != 0);
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(Pair);
+ } else {
+ I->LaneMask |= Pair.LaneMask;
+ }
+}
+
+static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ unsigned RegUnit) {
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(RegisterMaskPair(RegUnit, 0));
+ } else {
+ I->LaneMask = 0;
+ }
+}
+
+static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ unsigned RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask != 0);
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I != RegUnits.end()) {
+ I->LaneMask &= ~Pair.LaneMask;
+ if (I->LaneMask == 0)
+ RegUnits.erase(I);
+ }
+}
+
+static LaneBitmask getLanesWithProperty(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI, bool TrackLaneMasks, unsigned RegUnit,
+ SlotIndex Pos, LaneBitmask SafeDefault,
+ bool(*Property)(const LiveRange &LR, SlotIndex Pos)) {
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit)) {
+ const LiveInterval &LI = LIS.getInterval(RegUnit);
+ LaneBitmask Result = 0;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
+ }
+ } else if (Property(LI, Pos)) {
+ Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) : ~0u;
}
+
+ return Result;
+ } else {
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+ // Be prepared for missing live ranges: we usually do not compute live ranges
+ // for physical registers on targets with many registers (GPUs).
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? ~0u : 0;
}
}
-/// \brief Convenient wrapper for checking membership in RegisterOperands.
-/// (std::count() doesn't have an early exit).
-static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) {
- return std::find(RegUnits.begin(), RegUnits.end(), RegUnit) != RegUnits.end();
+static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, unsigned RegUnit,
+ SlotIndex Pos) {
+ return getLanesWithProperty(LIS, MRI, TrackLaneMasks, RegUnit, Pos, ~0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
}
+
namespace {
/// Collect this instruction's unique uses and defs into SmallVectors for
@@ -325,19 +432,25 @@ class RegisterOperandsCollector {
RegisterOperandsCollector(RegisterOperands &RegOpers,
const TargetRegisterInfo &TRI,
- const MachineRegisterInfo &MRI,
- bool IgnoreDead)
+ const MachineRegisterInfo &MRI, bool IgnoreDead)
: RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {}
void collectInstr(const MachineInstr &MI) const {
- for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI)
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
collectOperand(*OperI);
// Remove redundant physreg dead defs.
- SmallVectorImpl<unsigned>::iterator I =
- std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(),
- std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs));
- RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
+ }
+
+ void collectInstrLanes(const MachineInstr &MI) const {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+ collectOperandLanes(*OperI);
+
+ // Remove redundant physreg dead defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
}
/// Push this operand's register onto the correct vectors.
@@ -345,28 +458,65 @@ class RegisterOperandsCollector {
if (!MO.isReg() || !MO.getReg())
return;
unsigned Reg = MO.getReg();
- if (MO.readsReg())
- pushRegUnits(Reg, RegOpers.Uses);
- if (MO.isDef()) {
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushReg(Reg, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Subregister definitions may imply a register read.
+ if (MO.readsReg())
+ pushReg(Reg, RegOpers.Uses);
+
if (MO.isDead()) {
if (!IgnoreDead)
- pushRegUnits(Reg, RegOpers.DeadDefs);
+ pushReg(Reg, RegOpers.DeadDefs);
} else
- pushRegUnits(Reg, RegOpers.Defs);
+ pushReg(Reg, RegOpers.Defs);
}
}
- void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) const {
+ void pushReg(unsigned Reg,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (containsReg(RegUnits, Reg))
- return;
- RegUnits.push_back(Reg);
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, ~0u));
} else if (MRI.isAllocatable(Reg)) {
- for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
- if (containsReg(RegUnits, *Units))
- continue;
- RegUnits.push_back(*Units);
- }
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ addRegLanes(RegUnits, RegisterMaskPair(*Units, ~0u));
+ }
+ }
+
+ void collectOperandLanes(const MachineOperand &MO) const {
+ if (!MO.isReg() || !MO.getReg())
+ return;
+ unsigned Reg = MO.getReg();
+ unsigned SubRegIdx = MO.getSubReg();
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Treat read-undef subreg defs as definitions of the whole register.
+ if (MO.isUndef())
+ SubRegIdx = 0;
+
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushRegLanes(Reg, SubRegIdx, RegOpers.DeadDefs);
+ } else
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Defs);
+ }
+ }
+
+ void pushRegLanes(unsigned Reg, unsigned SubRegIdx,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ LaneBitmask LaneMask = SubRegIdx != 0
+ ? TRI.getSubRegIndexLaneMask(SubRegIdx)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneMask));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ addRegLanes(RegUnits, RegisterMaskPair(*Units, ~0u));
}
}
@@ -378,24 +528,26 @@ class RegisterOperandsCollector {
void RegisterOperands::collect(const MachineInstr &MI,
const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
- bool IgnoreDead) {
+ bool TrackLaneMasks, bool IgnoreDead) {
RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead);
- Collector.collectInstr(MI);
+ if (TrackLaneMasks)
+ Collector.collectInstrLanes(MI);
+ else
+ Collector.collectInstr(MI);
}
void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
const LiveIntervals &LIS) {
- SlotIndex SlotIdx = LIS.getInstructionIndex(&MI);
- for (SmallVectorImpl<unsigned>::iterator RI = Defs.begin();
- RI != Defs.end(); /*empty*/) {
- unsigned Reg = *RI;
+ SlotIndex SlotIdx = LIS.getInstructionIndex(MI);
+ for (auto RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
+ unsigned Reg = RI->RegUnit;
const LiveRange *LR = getLiveRange(LIS, Reg);
if (LR != nullptr) {
LiveQueryResult LRQ = LR->Query(SlotIdx);
if (LRQ.isDeadDef()) {
        // LiveIntervals knows this is a dead def even though its MachineOperand
        // is not flagged as such.
- DeadDefs.push_back(Reg);
+ DeadDefs.push_back(*RI);
RI = Defs.erase(RI);
continue;
}
@@ -404,6 +556,52 @@ void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
}
}
+void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ SlotIndex Pos,
+ MachineInstr *AddFlagsMI) {
+ for (auto I = Defs.begin(); I != Defs.end(); ) {
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getDeadSlot());
+ // If the def is all that is live after the instruction, then in case
+ // of a subregister def we need a read-undef flag.
+ unsigned RegUnit = I->RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ AddFlagsMI != nullptr && (LiveAfter & ~I->LaneMask) == 0)
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+
+ LaneBitmask ActualDef = I->LaneMask & LiveAfter;
+ if (ActualDef == 0) {
+ I = Defs.erase(I);
+ } else {
+ I->LaneMask = ActualDef;
+ ++I;
+ }
+ }
+ for (auto I = Uses.begin(); I != Uses.end(); ) {
+ LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getBaseIndex());
+ LaneBitmask LaneMask = I->LaneMask & LiveBefore;
+ if (LaneMask == 0) {
+ I = Uses.erase(I);
+ } else {
+ I->LaneMask = LaneMask;
+ ++I;
+ }
+ }
+ if (AddFlagsMI != nullptr) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned RegUnit = P.RegUnit;
+ if (!TargetRegisterInfo::isVirtualRegister(RegUnit))
+ continue;
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit,
+ Pos.getDeadSlot());
+ if (LiveAfter == 0)
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+ }
+ }
+}
+
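// Conceptual sketch of the filtering adjustLaneLiveness performs on Defs and
// Uses: intersect each recorded lane mask with the lanes that are actually
// live at the query position, and drop entries that become empty. The types
// and the liveness callback are assumptions for illustration, not LLVM's.
#include <cstdint>
#include <functional>
#include <vector>

struct RegLanePair { unsigned Reg; std::uint64_t Lanes; };

static void pruneToLiveLanes(
    std::vector<RegLanePair> &Pairs,
    const std::function<std::uint64_t(unsigned)> &LiveLanesAt) {
  for (auto I = Pairs.begin(); I != Pairs.end();) {
    std::uint64_t Actual = I->Lanes & LiveLanesAt(I->Reg);
    if (Actual == 0) {
      I = Pairs.erase(I);      // No lane of this operand is live here.
    } else {
      I->Lanes = Actual;       // Narrow to the live subset.
      ++I;
    }
  }
}
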
/// Initialize an array of N PressureDiffs.
void PressureDiffs::init(unsigned N) {
Size = N;
@@ -421,11 +619,11 @@ void PressureDiffs::addInstruction(unsigned Idx,
const MachineRegisterInfo &MRI) {
PressureDiff &PDiff = (*this)[Idx];
assert(!PDiff.begin()->isValid() && "stale PDiff");
- for (unsigned Reg : RegOpers.Defs)
- PDiff.addPressureChange(Reg, true, &MRI);
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ PDiff.addPressureChange(P.RegUnit, true, &MRI);
- for (unsigned Reg : RegOpers.Uses)
- PDiff.addPressureChange(Reg, false, &MRI);
+ for (const RegisterMaskPair &P : RegOpers.Uses)
+ PDiff.addPressureChange(P.RegUnit, false, &MRI);
}
/// Add a change in pressure to the pressure diff of a given instruction.
@@ -465,33 +663,58 @@ void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec,
}
/// Force liveness of registers.
-void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
- for (unsigned Reg : Regs) {
- if (LiveRegs.insert(Reg))
- increaseRegPressure(Reg);
+void RegPressureTracker::addLiveRegs(ArrayRef<RegisterMaskPair> Regs) {
+ for (const RegisterMaskPair &P : Regs) {
+ LaneBitmask PrevMask = LiveRegs.insert(P);
+ LaneBitmask NewMask = PrevMask | P.LaneMask;
+ increaseRegPressure(P.RegUnit, PrevMask, NewMask);
}
}
-/// Add Reg to the live in set and increase max pressure.
-void RegPressureTracker::discoverLiveIn(unsigned Reg) {
- assert(!LiveRegs.contains(Reg) && "avoid bumping max pressure twice");
- if (containsReg(P.LiveInRegs, Reg))
- return;
+void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
+ SmallVectorImpl<RegisterMaskPair> &LiveInOrOut) {
+ assert(Pair.LaneMask != 0);
+
+ unsigned RegUnit = Pair.RegUnit;
+ auto I = std::find_if(LiveInOrOut.begin(), LiveInOrOut.end(),
+ [RegUnit](const RegisterMaskPair &Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ LaneBitmask PrevMask;
+ LaneBitmask NewMask;
+ if (I == LiveInOrOut.end()) {
+ PrevMask = 0;
+ NewMask = Pair.LaneMask;
+ LiveInOrOut.push_back(Pair);
+ } else {
+ PrevMask = I->LaneMask;
+ NewMask = PrevMask | Pair.LaneMask;
+ I->LaneMask = NewMask;
+ }
+ increaseSetPressure(P.MaxSetPressure, *MRI, RegUnit, PrevMask, NewMask);
+}
- // At live in discovery, unconditionally increase the high water mark.
- P.LiveInRegs.push_back(Reg);
- increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
+void RegPressureTracker::discoverLiveIn(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveInRegs);
}
-/// Add Reg to the live out set and increase max pressure.
-void RegPressureTracker::discoverLiveOut(unsigned Reg) {
- assert(!LiveRegs.contains(Reg) && "avoid bumping max pressure twice");
- if (containsReg(P.LiveOutRegs, Reg))
- return;
+void RegPressureTracker::discoverLiveOut(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveOutRegs);
+}
- // At live out discovery, unconditionally increase the high water mark.
- P.LiveOutRegs.push_back(Reg);
- increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
+void RegPressureTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ increaseRegPressure(Reg, LiveMask, BumpedMask);
+ }
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ decreaseRegPressure(Reg, BumpedMask, LiveMask);
+ }
}
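
// Sketch of the "bump up, then back down" trick used by bumpDeadDefs: the
// current pressure ends where it started, but the maximum records the
// transient peak contributed by the dead defs. Plain counters stand in for
// the tracker's per-pressure-set arrays; this is illustrative only.
#include <algorithm>

struct PressureSketch { unsigned Curr = 0; unsigned Max = 0; };

static void bumpDeadDefSketch(PressureSketch &P, unsigned Weight) {
  P.Curr += Weight;                  // Model the dead def being live...
  P.Max = std::max(P.Max, P.Curr);   // ...long enough to raise the peak...
  P.Curr -= Weight;                  // ...then drop it again immediately.
}
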
/// Recede across the previous instruction. If LiveUses is provided, record any
@@ -500,48 +723,88 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) {
/// difference pointer is provided, record the changes in pressure caused by
/// this instruction independent of liveness.
void RegPressureTracker::recede(const RegisterOperands &RegOpers,
- SmallVectorImpl<unsigned> *LiveUses) {
+ SmallVectorImpl<RegisterMaskPair> *LiveUses) {
assert(!CurrPos->isDebugValue());
// Boost pressure for all dead defs together.
- increaseRegPressure(RegOpers.DeadDefs);
- decreaseRegPressure(RegOpers.DeadDefs);
+ bumpDeadDefs(RegOpers.DeadDefs);
// Kill liveness at live defs.
// TODO: consider earlyclobbers?
- for (unsigned Reg : RegOpers.Defs) {
- if (LiveRegs.erase(Reg))
- decreaseRegPressure(Reg);
- else
- discoverLiveOut(Reg);
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned Reg = Def.RegUnit;
+
+ LaneBitmask PreviousMask = LiveRegs.erase(Def);
+ LaneBitmask NewMask = PreviousMask & ~Def.LaneMask;
+
+ LaneBitmask LiveOut = Def.LaneMask & ~PreviousMask;
+ if (LiveOut != 0) {
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ // Retroactively model the pressure effects of the live-out lanes.
+ increaseSetPressure(CurrSetPressure, *MRI, Reg, 0, LiveOut);
+ PreviousMask = LiveOut;
+ }
+
+ if (NewMask == 0) {
+ // Add a 0 entry to LiveUses as a marker that the complete vreg has become
+ // dead.
+ if (TrackLaneMasks && LiveUses != nullptr)
+ setRegZero(*LiveUses, Reg);
+ }
+
+ decreaseRegPressure(Reg, PreviousMask, NewMask);
}
SlotIndex SlotIdx;
if (RequireIntervals)
- SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
// Generate liveness for uses.
- for (unsigned Reg : RegOpers.Uses) {
- if (!LiveRegs.contains(Reg)) {
- // Adjust liveouts if LiveIntervals are available.
- if (RequireIntervals) {
- const LiveRange *LR = getLiveRange(*LIS, Reg);
- if (LR) {
- LiveQueryResult LRQ = LR->Query(SlotIdx);
- if (!LRQ.isKill() && !LRQ.valueDefined())
- discoverLiveOut(Reg);
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ assert(Use.LaneMask != 0);
+ LaneBitmask PreviousMask = LiveRegs.insert(Use);
+ LaneBitmask NewMask = PreviousMask | Use.LaneMask;
+ if (NewMask == PreviousMask)
+ continue;
+
+ // Did the register just become live?
+ if (PreviousMask == 0) {
+ if (LiveUses != nullptr) {
+ if (!TrackLaneMasks) {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ auto I = std::find_if(LiveUses->begin(), LiveUses->end(),
+ [Reg](const RegisterMaskPair Other) {
+ return Other.RegUnit == Reg;
+ });
+ bool IsRedef = I != LiveUses->end();
+ if (IsRedef) {
+ // ignore re-defs here...
+ assert(I->LaneMask == 0);
+ removeRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ }
}
}
- increaseRegPressure(Reg);
- LiveRegs.insert(Reg);
- if (LiveUses && !containsReg(*LiveUses, Reg))
- LiveUses->push_back(Reg);
+
+ // Discover live outs if this may be the first occurrence of this register.
+ if (RequireIntervals) {
+ LaneBitmask LiveOut = getLiveThroughAt(Reg, SlotIdx);
+ if (LiveOut != 0)
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ }
}
+
+ increaseRegPressure(Reg, PreviousMask, NewMask);
}
if (TrackUntiedDefs) {
- for (unsigned Reg : RegOpers.Defs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg))
- UntiedDefs.insert(Reg);
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned RegUnit = Def.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ (LiveRegs.contains(RegUnit) & Def.LaneMask) == 0)
+ UntiedDefs.insert(RegUnit);
}
}
}
@@ -562,29 +825,32 @@ void RegPressureTracker::recedeSkipDebugValues() {
SlotIndex SlotIdx;
if (RequireIntervals)
- SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
// Open the top of the region using slot indexes.
if (RequireIntervals && isTopClosed())
static_cast<IntervalPressure&>(P).openTop(SlotIdx);
}
-void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses) {
+void RegPressureTracker::recede(SmallVectorImpl<RegisterMaskPair> *LiveUses) {
recedeSkipDebugValues();
const MachineInstr &MI = *CurrPos;
RegisterOperands RegOpers;
- RegOpers.collect(MI, *TRI, *MRI);
- if (RequireIntervals)
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ } else if (RequireIntervals) {
RegOpers.detectDeadDefs(MI, *LIS);
+ }
recede(RegOpers, LiveUses);
}
/// Advance across the current instruction.
-void RegPressureTracker::advance() {
+void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
assert(!TrackUntiedDefs && "unsupported mode");
-
assert(CurrPos != MBB->end());
if (!isTopClosed())
closeTop();
@@ -601,39 +867,34 @@ void RegPressureTracker::advance() {
static_cast<RegionPressure&>(P).openBottom(CurrPos);
}
- RegisterOperands RegOpers;
- RegOpers.collect(*CurrPos, *TRI, *MRI);
-
- for (unsigned Reg : RegOpers.Uses) {
- // Discover live-ins.
- bool isLive = LiveRegs.contains(Reg);
- if (!isLive)
- discoverLiveIn(Reg);
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask LiveIn = Use.LaneMask & ~LiveMask;
+ if (LiveIn != 0) {
+ discoverLiveIn(RegisterMaskPair(Reg, LiveIn));
+ increaseRegPressure(Reg, LiveMask, LiveMask | LiveIn);
+ LiveRegs.insert(RegisterMaskPair(Reg, LiveIn));
+ }
// Kill liveness at last uses.
- bool lastUse = false;
if (RequireIntervals) {
- const LiveRange *LR = getLiveRange(*LIS, Reg);
- lastUse = LR && LR->Query(SlotIdx).isKill();
- } else {
- // Allocatable physregs are always single-use before register rewriting.
- lastUse = !TargetRegisterInfo::isVirtualRegister(Reg);
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask != 0) {
+ LiveRegs.erase(RegisterMaskPair(Reg, LastUseMask));
+ decreaseRegPressure(Reg, LiveMask, LiveMask & ~LastUseMask);
+ }
}
- if (lastUse && isLive) {
- LiveRegs.erase(Reg);
- decreaseRegPressure(Reg);
- } else if (!lastUse && !isLive)
- increaseRegPressure(Reg);
}
// Generate liveness for defs.
- for (unsigned Reg : RegOpers.Defs) {
- if (LiveRegs.insert(Reg))
- increaseRegPressure(Reg);
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ LaneBitmask PreviousMask = LiveRegs.insert(Def);
+ LaneBitmask NewMask = PreviousMask | Def.LaneMask;
+ increaseRegPressure(Def.RegUnit, PreviousMask, NewMask);
}
// Boost pressure for all dead defs together.
- increaseRegPressure(RegOpers.DeadDefs);
- decreaseRegPressure(RegOpers.DeadDefs);
+ bumpDeadDefs(RegOpers.DeadDefs);
// Find the next instruction.
do
@@ -641,6 +902,17 @@ void RegPressureTracker::advance() {
while (CurrPos != MBB->end() && CurrPos->isDebugValue());
}
+void RegPressureTracker::advance() {
+ const MachineInstr &MI = *CurrPos;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = getCurrSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ }
+ advance(RegOpers);
+}
+
/// Find the max change in excess pressure across all sets.
static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
ArrayRef<unsigned> NewPressureVec,
@@ -728,22 +1000,38 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true);
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/true);
assert(RegOpers.DeadDefs.size() == 0);
- if (RequireIntervals)
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ else if (RequireIntervals)
RegOpers.detectDeadDefs(*MI, *LIS);
+ // Boost max pressure for all dead defs together: CurrSetPressure ends up
+ // unchanged while MaxSetPressure records the transient peak.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
// Kill liveness at live defs.
- for (unsigned Reg : RegOpers.Defs) {
- if (!containsReg(RegOpers.Uses, Reg))
- decreaseRegPressure(Reg);
+ for (const RegisterMaskPair &P : RegOpers.Defs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+ LaneBitmask DefLanes = P.LaneMask;
+ LaneBitmask LiveAfter = (LiveLanes & ~DefLanes) | UseLanes;
+ decreaseRegPressure(Reg, LiveLanes, LiveAfter);
}
// Generate liveness for uses.
- for (unsigned Reg : RegOpers.Uses) {
- if (!LiveRegs.contains(Reg))
- increaseRegPressure(Reg);
+ for (const RegisterMaskPair &P : RegOpers.Uses) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask LiveAfter = LiveLanes | P.LaneMask;
+ increaseRegPressure(Reg, LiveLanes, LiveAfter);
}
}
@@ -888,15 +1176,58 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
}
/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
-static bool findUseBetween(unsigned Reg, SlotIndex PriorUseIdx,
- SlotIndex NextUseIdx, const MachineRegisterInfo &MRI,
- const LiveIntervals *LIS) {
- for (const MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) {
- SlotIndex InstSlot = LIS->getInstructionIndex(&MI).getRegSlot();
- if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx)
- return true;
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ const MachineInstr *MI = MO.getParent();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+ if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) {
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ LastUseMask &= ~UseMask;
+ if (LastUseMask == 0)
+ return 0;
+ }
}
- return false;
+ return LastUseMask;
+}
+
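// Standalone sketch of the narrowing query in findUseBetween: start from a
// candidate lane mask and clear every lane that is used anywhere inside the
// [Begin, End) window, stopping early once nothing is left. The Use record
// and index-based window are assumptions; the real code walks use operands
// and SlotIndexes.
#include <cstdint>
#include <vector>

struct UseSketch { unsigned InstrIdx; std::uint64_t Lanes; };

static std::uint64_t lanesNotUsedBetween(std::uint64_t Candidates,
                                         unsigned Begin, unsigned End,
                                         const std::vector<UseSketch> &Uses) {
  for (const UseSketch &U : Uses) {
    if (U.InstrIdx >= Begin && U.InstrIdx < End) {
      Candidates &= ~U.Lanes;   // These lanes are used inside the window.
      if (Candidates == 0)
        return 0;               // Early exit: nothing left to report.
    }
  }
  return Candidates;
}
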
+LaneBitmask RegPressureTracker::getLiveLanesAt(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos, ~0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
+}
+
+LaneBitmask RegPressureTracker::getLastUsedLanes(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit,
+ Pos.getBaseIndex(), 0,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->end == Pos.getRegSlot();
+ });
+}
+
+LaneBitmask RegPressureTracker::getLiveThroughAt(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos, 0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->start < Pos.getRegSlot(true) &&
+ S->end != Pos.getDeadSlot();
+ });
}
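
// The three helpers above share one pattern: visit the per-lane subranges of
// a virtual register, test a predicate at the position, and OR the lane masks
// of the matching subranges together. A simplified stand-in (hypothetical
// SubRangeSketch; the predicate result is precomputed into a flag):
#include <cstdint>
#include <vector>

struct SubRangeSketch { std::uint64_t Lanes; bool HasProperty; };

static std::uint64_t lanesWithProperty(
    const std::vector<SubRangeSketch> &SubRanges) {
  std::uint64_t Result = 0;
  for (const SubRangeSketch &SR : SubRanges)
    if (SR.HasProperty)   // e.g. "live at Pos" or "segment ends at Pos".
      Result |= SR.Lanes;
  return Result;
}
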
/// Record the downward impact of a single instruction on current register
@@ -908,39 +1239,49 @@ static bool findUseBetween(unsigned Reg, SlotIndex PriorUseIdx,
void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
- // Account for register pressure similar to RegPressureTracker::recede().
- RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, *MRI);
-
- // Kill liveness at last uses. Assume allocatable physregs are single-use
- // rather than checking LiveIntervals.
SlotIndex SlotIdx;
if (RequireIntervals)
- SlotIdx = LIS->getInstructionIndex(MI).getRegSlot();
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
- for (unsigned Reg : RegOpers.Uses) {
- if (RequireIntervals) {
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+
+ if (RequireIntervals) {
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask == 0)
+ continue;
+ // LastUseMask is queried from the liveness information of an instruction
+ // that may be further down the schedule, so some lanes may not actually
+ // be last uses at the current position.
// FIXME: allow the caller to pass in the list of vreg uses that remain
// to be bottom-scheduled to avoid searching uses at each query.
SlotIndex CurrIdx = getCurrSlot();
- const LiveRange *LR = getLiveRange(*LIS, Reg);
- if (LR) {
- LiveQueryResult LRQ = LR->Query(SlotIdx);
- if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS))
- decreaseRegPressure(Reg);
- }
- } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- // Allocatable physregs are always single-use before register rewriting.
- decreaseRegPressure(Reg);
+ LastUseMask
+ = findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, LIS);
+ if (LastUseMask == 0)
+ continue;
+
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask & ~LastUseMask;
+ decreaseRegPressure(Reg, LiveMask, NewMask);
}
}
// Generate liveness for defs.
- increaseRegPressure(RegOpers.Defs);
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned Reg = Def.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask | Def.LaneMask;
+ increaseRegPressure(Reg, LiveMask, NewMask);
+ }
// Boost pressure for all dead defs together.
- increaseRegPressure(RegOpers.DeadDefs);
- decreaseRegPressure(RegOpers.DeadDefs);
+ bumpDeadDefs(RegOpers.DeadDefs);
}
/// Consider the pressure increase caused by traversing this instruction
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 8fa1bf74b7e2..6b80179190db 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -7,10 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the machine register scavenger. It can provide
-// information, such as unused registers, at any point in a machine basic block.
-// It also provides a mechanism to make registers available by evicting them to
-// spill slots.
+/// \file
+/// This file implements the machine register scavenger. It can provide
+/// information, such as unused registers, at any point in a machine basic
+/// block. It also provides a mechanism to make registers available by evicting
+/// them to spill slots.
//
//===----------------------------------------------------------------------===//
@@ -30,7 +31,6 @@ using namespace llvm;
#define DEBUG_TYPE "reg-scavenging"
-/// setUsed - Set the register units of this register as used.
void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
LaneBitmask UnitMask = (*RUI).second;
@@ -49,9 +49,6 @@ void RegScavenger::initRegState() {
// All register units start out unused.
RegUnitsAvailable.set();
- if (!MBB)
- return;
-
// Live-in registers are in use.
for (const auto &LI : MBB->liveins())
setRegUsed(LI.PhysReg, LI.LaneMask);
@@ -63,8 +60,8 @@ void RegScavenger::initRegState() {
setRegUsed(I);
}
-void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
- MachineFunction &MF = *mbb->getParent();
+void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
+ MachineFunction &MF = *MBB.getParent();
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -78,15 +75,15 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
"Cannot use register scavenger with inaccurate liveness");
// Self-initialize.
- if (!MBB) {
+ if (!this->MBB) {
NumRegUnits = TRI->getNumRegUnits();
RegUnitsAvailable.resize(NumRegUnits);
KillRegUnits.resize(NumRegUnits);
DefRegUnits.resize(NumRegUnits);
TmpRegUnits.resize(NumRegUnits);
}
+ this->MBB = &MBB;
- MBB = mbb;
initRegState();
Tracking = false;
@@ -100,17 +97,15 @@ void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) {
void RegScavenger::determineKillsAndDefs() {
assert(Tracking && "Must be tracking to determine kills and defs");
- MachineInstr *MI = MBBI;
- assert(!MI->isDebugValue() && "Debug values have no kills or defs");
+ MachineInstr &MI = *MBBI;
+ assert(!MI.isDebugValue() && "Debug values have no kills or defs");
// Find out which registers are early clobbered, killed, defined, and marked
// def-dead in this instruction.
KillRegUnits.reset();
DefRegUnits.reset();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isRegMask()) {
-
TmpRegUnits.clear();
for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; ++RU) {
for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
@@ -120,14 +115,14 @@ void RegScavenger::determineKillsAndDefs() {
}
}
}
-
+
// Apply the mask.
KillRegUnits |= TmpRegUnits;
}
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || isReserved(Reg))
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
continue;
if (MO.isUse()) {
@@ -149,8 +144,8 @@ void RegScavenger::determineKillsAndDefs() {
void RegScavenger::unprocess() {
assert(Tracking && "Cannot unprocess because we're not tracking");
- MachineInstr *MI = MBBI;
- if (!MI->isDebugValue()) {
+ MachineInstr &MI = *MBBI;
+ if (!MI.isDebugValue()) {
determineKillsAndDefs();
// Commit the changes.
@@ -176,30 +171,29 @@ void RegScavenger::forward() {
}
assert(MBBI != MBB->end() && "Already at the end of the basic block!");
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
IE = Scavenged.end(); I != IE; ++I) {
- if (I->Restore != MI)
+ if (I->Restore != &MI)
continue;
I->Reg = 0;
I->Restore = nullptr;
}
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return;
determineKillsAndDefs();
// Verify uses and defs.
#ifndef NDEBUG
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || isReserved(Reg))
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
continue;
if (MO.isUse()) {
if (MO.isUndef())
@@ -261,33 +255,24 @@ bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
}
unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
- for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
- I != E; ++I)
- if (!isRegUsed(*I)) {
- DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(*I) <<
+ for (unsigned Reg : *RC) {
+ if (!isRegUsed(Reg)) {
+ DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(Reg) <<
"\n");
- return *I;
+ return Reg;
}
+ }
return 0;
}
-/// getRegsAvailable - Return all available registers in the register class
-/// in Mask.
BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) {
BitVector Mask(TRI->getNumRegs());
- for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
- I != E; ++I)
- if (!isRegUsed(*I))
- Mask.set(*I);
+ for (unsigned Reg : *RC)
+ if (!isRegUsed(Reg))
+ Mask.set(Reg);
return Mask;
}
-/// findSurvivorReg - Return the candidate register that is unused for the
-/// longest after StartMII. UseMI is set to the instruction where the search
-/// stopped.
-///
-/// No more than InstrLimit instructions are inspected.
-///
unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
BitVector &Candidates,
unsigned InstrLimit,
@@ -309,8 +294,7 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
bool isVirtKillInsn = false;
bool isVirtDefInsn = false;
    // Remove any candidates touched by the instruction.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
Candidates.clearBitsNotInMask(MO.getRegMask());
if (!MO.isReg() || MO.isUndef() || !MO.getReg())
@@ -345,20 +329,19 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
}
// If we ran off the end, that's where we want to restore.
if (MI == ME) RestorePointMI = ME;
- assert (RestorePointMI != StartMI &&
- "No available scavenger restore location!");
+ assert(RestorePointMI != StartMI &&
+ "No available scavenger restore location!");
// We ran out of candidates, so stop the search.
UseMI = RestorePointMI;
return Survivor;
}
-static unsigned getFrameIndexOperandNum(MachineInstr *MI) {
+static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
unsigned i = 0;
- while (!MI->getOperand(i).isFI()) {
+ while (!MI.getOperand(i).isFI()) {
++i;
- assert(i < MI->getNumOperands() &&
- "Instr doesn't have FrameIndex operand!");
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
return i;
}
@@ -366,13 +349,13 @@ static unsigned getFrameIndexOperandNum(MachineInstr *MI) {
unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
MachineBasicBlock::iterator I,
int SPAdj) {
+ MachineInstr &MI = *I;
+ const MachineFunction &MF = *MI.getParent()->getParent();
// Consider all allocatable registers in the register class initially
- BitVector Candidates =
- TRI->getAllocatableSet(*I->getParent()->getParent(), RC);
+ BitVector Candidates = TRI->getAllocatableSet(MF, RC);
// Exclude all the registers being used by the instruction.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = I->getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
Candidates.reset(MO.getReg());
@@ -395,16 +378,42 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
return SReg;
}
- // Find an available scavenging slot.
- unsigned SI;
- for (SI = 0; SI < Scavenged.size(); ++SI)
- if (Scavenged[SI].Reg == 0)
- break;
+ // Find an available scavenging slot with size and alignment matching
+ // the requirements of the class RC.
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned NeedSize = RC->getSize();
+ unsigned NeedAlign = RC->getAlignment();
+
+ unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+ int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
+ for (unsigned I = 0; I < Scavenged.size(); ++I) {
+ if (Scavenged[I].Reg != 0)
+ continue;
+ // Verify that this slot is valid for this register.
+ int FI = Scavenged[I].FrameIndex;
+ if (FI < FIB || FI >= FIE)
+ continue;
+ unsigned S = MFI.getObjectSize(FI);
+ unsigned A = MFI.getObjectAlignment(FI);
+ if (NeedSize > S || NeedAlign > A)
+ continue;
+ // Avoid wasting slots with large size and/or large alignment. Pick one
+ // that is the best fit for this register class (in street metric).
+ // Picking a larger slot than necessary could happen if a slot for a
+ // larger register is reserved before a slot for a smaller one. When
+ // trying to spill a smaller register, the large slot would be found
+ // first, thus making it impossible to spill the larger register later.
+ unsigned D = (S-NeedSize) + (A-NeedAlign);
+ if (D < Diff) {
+ SI = I;
+ Diff = D;
+ }
+ }
if (SI == Scavenged.size()) {
// We need to scavenge a register but have no spill slot, the target
// must know how to do it (if not, we'll assert below).
- Scavenged.push_back(ScavengedInfo());
+ Scavenged.push_back(ScavengedInfo(FIE));
}
// Avoid infinite regress
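
// A self-contained version of the best-fit search above, using the same
// metric (size slack plus alignment slack). SlotSketch is a hypothetical
// record; the real code additionally checks that the frame index is in range
// and that the slot is not already holding a scavenged register.
#include <climits>
#include <vector>

struct SlotSketch { unsigned Size; unsigned Align; bool InUse; };

// Returns the index of the tightest-fitting free slot, or Slots.size().
static unsigned pickSpillSlot(const std::vector<SlotSketch> &Slots,
                              unsigned NeedSize, unsigned NeedAlign) {
  unsigned Best = Slots.size(), BestDiff = UINT_MAX;
  for (unsigned I = 0; I < Slots.size(); ++I) {
    const SlotSketch &S = Slots[I];
    if (S.InUse || S.Size < NeedSize || S.Align < NeedAlign)
      continue;
    // Waste in size plus waste in alignment ("street metric").
    unsigned Diff = (S.Size - NeedSize) + (S.Align - NeedAlign);
    if (Diff < BestDiff) {
      Best = I;
      BestDiff = Diff;
    }
  }
  return Best;
}
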
@@ -414,13 +423,18 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
// otherwise, use the emergency stack spill slot.
if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
// Spill the scavenged register before I.
- assert(Scavenged[SI].FrameIndex >= 0 &&
- "Cannot scavenge register without an emergency spill slot!");
+ int FI = Scavenged[SI].FrameIndex;
+ if (FI < FIB || FI >= FIE) {
+ std::string Msg = std::string("Error while trying to spill ") +
+ TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) +
+ ": Cannot scavenge register without an emergency spill slot!";
+ report_fatal_error(Msg.c_str());
+ }
TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex,
RC, TRI);
MachineBasicBlock::iterator II = std::prev(I);
- unsigned FIOperandNum = getFrameIndexOperandNum(II);
+ unsigned FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
// Restore the scavenged register before its use (or first terminator).
@@ -428,11 +442,11 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
RC, TRI);
II = std::prev(UseMI);
- FIOperandNum = getFrameIndexOperandNum(II);
+ FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
}
- Scavenged[SI].Restore = std::prev(UseMI);
+ Scavenged[SI].Restore = &*std::prev(UseMI);
// Doing this here leads to infinite regress.
// Scavenged[SI].Reg = SReg;
diff --git a/lib/CodeGen/RegisterUsageInfo.cpp b/lib/CodeGen/RegisterUsageInfo.cpp
new file mode 100644
index 000000000000..5cf3e57eb3d3
--- /dev/null
+++ b/lib/CodeGen/RegisterUsageInfo.cpp
@@ -0,0 +1,93 @@
+//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+cl::opt<bool> DumpRegUsage(
+ "print-regusage", cl::init(false), cl::Hidden,
+ cl::desc("print register usage details collected for analysis."));
+
+INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
+ "Register Usage Informartion Stroage", false, true)
+
+char PhysicalRegisterUsageInfo::ID = 0;
+
+void PhysicalRegisterUsageInfo::anchor() {}
+
+bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
+ RegMasks.grow(M.size());
+ return false;
+}
+
+bool PhysicalRegisterUsageInfo::doFinalization(Module &M) {
+ if (DumpRegUsage)
+ print(errs());
+
+ RegMasks.shrink_and_clear();
+ return false;
+}
+
+void PhysicalRegisterUsageInfo::storeUpdateRegUsageInfo(
+ const Function *FP, std::vector<uint32_t> RegMask) {
+ assert(FP != nullptr && "Function * can't be nullptr.");
+ RegMasks[FP] = std::move(RegMask);
+}
+
+const std::vector<uint32_t> *
+PhysicalRegisterUsageInfo::getRegUsageInfo(const Function *FP) {
+ auto It = RegMasks.find(FP);
+ if (It != RegMasks.end())
+ return &(It->second);
+ return nullptr;
+}
+
+void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
+ const TargetRegisterInfo *TRI;
+
+ typedef std::pair<const Function *, std::vector<uint32_t>> FuncPtrRegMaskPair;
+
+ SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector;
+
+ // Create a vector of pointers to the RegMasks entries.
+ for (const auto &RegMask : RegMasks)
+ FPRMPairVector.push_back(&RegMask);
+
+ // Sort the vector to print the analysis in alphabetical order of function name.
+ std::sort(
+ FPRMPairVector.begin(), FPRMPairVector.end(),
+ [](const FuncPtrRegMaskPair *A, const FuncPtrRegMaskPair *B) -> bool {
+ return A->first->getName() < B->first->getName();
+ });
+
+ for (const FuncPtrRegMaskPair *FPRMPair : FPRMPairVector) {
+ OS << FPRMPair->first->getName() << " "
+ << "Clobbered Registers: ";
+ TRI = TM->getSubtarget<TargetSubtargetInfo>(*(FPRMPair->first))
+ .getRegisterInfo();
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ if (MachineOperand::clobbersPhysReg(&(FPRMPair->second[0]), PReg))
+ OS << TRI->getName(PReg) << " ";
+ }
+ OS << "\n";
+ }
+}
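
// For illustration, this is how a register mask like the ones printed above
// can be decoded, assuming the usual bit-per-physreg encoding in which a set
// bit means "preserved across the call" and a clear bit means "may be
// clobbered". The helper is a hypothetical stand-in for
// MachineOperand::clobbersPhysReg, and Mask is assumed to hold
// (NumRegs + 31) / 32 words.
#include <cstdint>
#include <vector>

static bool clobbersRegSketch(const std::vector<std::uint32_t> &Mask,
                              unsigned Reg) {
  std::uint32_t Word = Mask[Reg / 32];
  std::uint32_t Bit = 1u << (Reg % 32);
  return (Word & Bit) == 0;   // Clear bit: the callee may clobber Reg.
}
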
diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp
new file mode 100644
index 000000000000..ea952d9088fc
--- /dev/null
+++ b/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -0,0 +1,388 @@
+//===-- RenameIndependentSubregs.cpp - Live Interval Analysis -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// The rename-independent-subregs pass looks for virtual registers with
+/// independently used subregisters and renames them to new virtual registers.
+/// Example: In the following:
+/// %vreg0:sub0<read-undef> = ...
+/// %vreg0:sub1 = ...
+/// use %vreg0:sub0
+/// %vreg0:sub0 = ...
+/// use %vreg0:sub0
+/// use %vreg0:sub1
+/// sub0 and sub1 are never used together, and we have two independent sub0
+/// definitions. This pass will rename to:
+/// %vreg0:sub0<read-undef> = ...
+/// %vreg1:sub1<read-undef> = ...
+/// use %vreg1:sub1
+/// %vreg2:sub1<read-undef> = ...
+/// use %vreg2:sub1
+/// use %vreg0:sub0
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveRangeUtils.h"
+#include "PHIEliminationUtils.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "rename-independent-subregs"
+
+namespace {
+
+class RenameIndependentSubregs : public MachineFunctionPass {
+public:
+ static char ID;
+ RenameIndependentSubregs() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "Rename Disconnected Subregister Components";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ struct SubRangeInfo {
+ ConnectedVNInfoEqClasses ConEQ;
+ LiveInterval::SubRange *SR;
+ unsigned Index;
+
+ SubRangeInfo(LiveIntervals &LIS, LiveInterval::SubRange &SR,
+ unsigned Index)
+ : ConEQ(LIS), SR(&SR), Index(Index) {}
+ };
+
+ /// Split unrelated subregister components and rename them to new vregs.
+ bool renameComponents(LiveInterval &LI) const;
+
+ /// \brief Build a vector of SubRange infos and a union find set of
+ /// equivalence classes.
+ /// Returns true if more than 1 equivalence class was found.
+ bool findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const;
+
+ /// \brief Distribute the LiveInterval segments into the new LiveIntervals
+ /// belonging to their class.
+ void distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ /// \brief Constructs main liverange and add missing undef+dead flags.
+ void computeMainRangesFixFlags(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ /// Rewrite Machine Operands to use the new vreg belonging to their class.
+ void rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ LiveIntervals *LIS;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+};
+
+} // end anonymous namespace
+
+char RenameIndependentSubregs::ID;
+
+char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID;
+
+INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs",
+ "Rename Independent Subregisters", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs",
+ "Rename Independent Subregisters", false, false)
+
+bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
+ // Shortcut: We cannot have split components with a single definition.
+ if (LI.valnos.size() < 2)
+ return false;
+
+ SmallVector<SubRangeInfo, 4> SubRangeInfos;
+ IntEqClasses Classes;
+ if (!findComponents(Classes, SubRangeInfos, LI))
+ return false;
+
+ // Create a new VReg for each class.
+ unsigned Reg = LI.reg;
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ SmallVector<LiveInterval*, 4> Intervals;
+ Intervals.push_back(&LI);
+ DEBUG(dbgs() << PrintReg(Reg) << ": Found " << Classes.getNumClasses()
+ << " equivalence classes.\n");
+ DEBUG(dbgs() << PrintReg(Reg) << ": Splitting into newly created:");
+ for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses;
+ ++I) {
+ unsigned NewVReg = MRI->createVirtualRegister(RegClass);
+ LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg);
+ Intervals.push_back(&NewLI);
+ DEBUG(dbgs() << ' ' << PrintReg(NewVReg));
+ }
+ DEBUG(dbgs() << '\n');
+
+ rewriteOperands(Classes, SubRangeInfos, Intervals);
+ distribute(Classes, SubRangeInfos, Intervals);
+ computeMainRangesFixFlags(Classes, SubRangeInfos, Intervals);
+ return true;
+}
+
+bool RenameIndependentSubregs::findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<RenameIndependentSubregs::SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const {
+ // First step: Create connected components for the VNInfos inside the
+ // subranges and count the global number of such components.
+ unsigned NumComponents = 0;
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ SubRangeInfos.push_back(SubRangeInfo(*LIS, SR, NumComponents));
+ ConnectedVNInfoEqClasses &ConEQ = SubRangeInfos.back().ConEQ;
+
+ unsigned NumSubComponents = ConEQ.Classify(SR);
+ NumComponents += NumSubComponents;
+ }
+ // Shortcut: With only 1 subrange, the normal separate component tests are
+ // enough and we do not need to perform the union-find on the subregister
+ // segments.
+ if (SubRangeInfos.size() < 2)
+ return false;
+
+ // Next step: Build union-find structure over all subranges and merge classes
+ // across subranges when they are affected by the same MachineOperand.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ Classes.grow(NumComponents);
+ unsigned Reg = LI.reg;
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ unsigned MergedID = ~0u;
+ for (RenameIndependentSubregs::SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask) == 0)
+ continue;
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ Pos = MO.isDef() ? Pos.getRegSlot(MO.isEarlyClobber())
+ : Pos.getBaseIndex();
+ const VNInfo *VNI = SR.getVNInfoAt(Pos);
+ if (VNI == nullptr)
+ continue;
+
+ // Map to the local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(VNI);
+ // Global ID
+ unsigned ID = LocalID + SRInfo.Index;
+ // Merge other sets
+ MergedID = MergedID == ~0u ? ID : Classes.join(MergedID, ID);
+ }
+ }
+
+ // Early exit if we ended up with a single equivalence class.
+ Classes.compress();
+ unsigned NumClasses = Classes.getNumClasses();
+ return NumClasses > 1;
+}
+
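// Sketch of the class-merging step in findComponents with a tiny union-find
// in place of IntEqClasses: every operand joins the component IDs of all
// subranges it touches, so lanes that ever appear together in one operand end
// up in the same class. Inputs are assumed to be pre-assigned component IDs.
#include <numeric>
#include <vector>

struct UnionFindSketch {
  std::vector<unsigned> Parent;
  explicit UnionFindSketch(unsigned N) : Parent(N) {
    std::iota(Parent.begin(), Parent.end(), 0u);
  }
  unsigned find(unsigned X) {
    while (Parent[X] != X)
      X = Parent[X] = Parent[Parent[X]];   // Path halving.
    return X;
  }
  void join(unsigned A, unsigned B) { Parent[find(A)] = find(B); }
};

// IDsPerOperand: for each operand, the component IDs of the subranges it uses.
static unsigned countClasses(
    unsigned NumIDs, const std::vector<std::vector<unsigned>> &IDsPerOperand) {
  UnionFindSketch UF(NumIDs);
  for (const std::vector<unsigned> &IDs : IDsPerOperand)
    for (unsigned I = 1; I < IDs.size(); ++I)
      UF.join(IDs[0], IDs[I]);
  unsigned Classes = 0;
  for (unsigned I = 0; I < NumIDs; ++I)
    if (UF.find(I) == I)       // Count one representative per class.
      ++Classes;
  return Classes;
}
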
+void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ unsigned Reg = Intervals[0]->reg;
+ for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
+ E = MRI->reg_nodbg_end(); I != E; ) {
+ MachineOperand &MO = *I++;
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ MachineInstr &MI = *MO.getParent();
+
+ SlotIndex Pos = LIS->getInstructionIndex(MI);
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+
+ unsigned ID = ~0u;
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask) == 0)
+ continue;
+ LiveRange::const_iterator I = SR.find(Pos);
+ if (I == SR.end())
+ continue;
+
+ const VNInfo &VNI = *I->valno;
+ // Map to the local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(&VNI);
+ // Global ID
+ ID = Classes[LocalID + SRInfo.Index];
+ break;
+ }
+
+ unsigned VReg = Intervals[ID]->reg;
+ MO.setReg(VReg);
+ }
+ // TODO: We could attempt to recompute new register classes while visiting
+ // the operands: some of the split registers may be fine with less
+ // constrained classes than the original vreg.
+}
+
+void RenameIndependentSubregs::distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ unsigned NumClasses = Classes.getNumClasses();
+ SmallVector<unsigned, 8> VNIMapping;
+ SmallVector<LiveInterval::SubRange*, 8> SubRanges;
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ LiveInterval::SubRange &SR = *SRInfo.SR;
+ unsigned NumValNos = SR.valnos.size();
+ VNIMapping.clear();
+ VNIMapping.reserve(NumValNos);
+ SubRanges.clear();
+ SubRanges.resize(NumClasses-1, nullptr);
+ for (unsigned I = 0; I < NumValNos; ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(&VNI);
+ unsigned ID = Classes[LocalID + SRInfo.Index];
+ VNIMapping.push_back(ID);
+ if (ID > 0 && SubRanges[ID-1] == nullptr)
+ SubRanges[ID-1] = Intervals[ID]->createSubRange(Allocator, SR.LaneMask);
+ }
+ DistributeRange(SR, SubRanges.data(), VNIMapping);
+ }
+}
+
+static bool subRangeLiveAt(const LiveInterval &LI, SlotIndex Pos) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (SR.liveAt(Pos))
+ return true;
+ }
+ return false;
+}
+
+void RenameIndependentSubregs::computeMainRangesFixFlags(
+ const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ const SlotIndexes &Indexes = *LIS->getSlotIndexes();
+ for (size_t I = 0, E = Intervals.size(); I < E; ++I) {
+ LiveInterval &LI = *Intervals[I];
+ unsigned Reg = LI.reg;
+
+ LI.removeEmptySubRanges();
+
+ // There must be a def (or live-in) before every use. Splitting vregs may
+ // violate this principle, as a split vreg may not have a definition on
+ // every path. Fix this by creating IMPLICIT_DEF instructions as necessary.
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ // Search for "PHI" value numbers in the subranges. We must find a live
+ // value in each predecessor block, add an IMPLICIT_DEF where it is
+ // missing.
+ for (unsigned I = 0; I < SR.valnos.size(); ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ if (VNI.isUnused() || !VNI.isPHIDef())
+ continue;
+
+ SlotIndex Def = VNI.def;
+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(Def);
+ for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+ SlotIndex PredEnd = Indexes.getMBBEndIdx(PredMBB);
+ if (subRangeLiveAt(LI, PredEnd.getPrevSlot()))
+ continue;
+
+ MachineBasicBlock::iterator InsertPos =
+ llvm::findPHICopyInsertPoint(PredMBB, &MBB, Reg);
+ const MCInstrDesc &MCDesc = TII->get(TargetOpcode::IMPLICIT_DEF);
+ MachineInstrBuilder ImpDef = BuildMI(*PredMBB, InsertPos,
+ DebugLoc(), MCDesc, Reg);
+ SlotIndex DefIdx = LIS->InsertMachineInstrInMaps(*ImpDef);
+ SlotIndex RegDefIdx = DefIdx.getRegSlot();
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ VNInfo *SRVNI = SR.getNextValue(RegDefIdx, Allocator);
+ SR.addSegment(LiveRange::Segment(RegDefIdx, PredEnd, SRVNI));
+ }
+ }
+ }
+ }
+
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ if (SubRegIdx == 0)
+ continue;
+ // After assigning the new vreg we may not have any other sublanes live
+ // into and out of the instruction anymore. We need to add new dead and
+ // undef flags in these cases.
+ if (!MO.isUndef()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsUndef();
+ }
+ if (!MO.isDead()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent()).getDeadSlot();
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsDead();
+ }
+ }
+
+ if (I == 0)
+ LI.clear();
+ LIS->constructMainRangeFromSubranges(LI);
+ }
+}
+
+bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) {
+ // Skip renaming if liveness of subregister is not tracked.
+ if (!MF.getSubtarget().enableSubRegLiveness())
+ return false;
+
+ DEBUG(dbgs() << "Renaming independent subregister live ranges in "
+ << MF.getName() << '\n');
+
+ LIS = &getAnalysis<LiveIntervals>();
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Iterate over all vregs. Note that we only query getNumVirtRegs() once:
+ // the newly created vregs end up with higher numbers and do not need to be
+ // visited, as there can't be any further splitting.
+ bool Changed = false;
+ for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (!LI.hasSubRanges())
+ continue;
+
+ Changed |= renameComponents(LI);
+ }
+
+ return Changed;
+}
diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index abed465f102d..19cd59b9dba7 100644
--- a/lib/Transforms/Instrumentation/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -15,22 +15,25 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
+#include "SafeStackColoring.h"
+#include "SafeStackLayout.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -40,10 +43,12 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
+using namespace llvm::safestack;
#define DEBUG_TYPE "safestack"
@@ -122,6 +127,13 @@ class SafeStack : public FunctionPass {
/// \brief Build a value representing a pointer to the unsafe stack pointer.
Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F);
+ /// \brief Return the value of the stack canary.
+ Value *getStackGuard(IRBuilder<> &IRB, Function &F);
+
+ /// \brief Load stack guard from the frame and check if it has changed.
+ void checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard);
+
/// \brief Find all static allocas, dynamic allocas, return instructions and
/// stack restore points (exception unwind blocks and setjmp calls) in the
/// given function and append them to the respective vectors.
@@ -144,7 +156,9 @@ class SafeStack : public FunctionPass {
Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
ArrayRef<AllocaInst *> StaticAllocas,
ArrayRef<Argument *> ByValArguments,
- ArrayRef<ReturnInst *> Returns);
+ ArrayRef<ReturnInst *> Returns,
+ Instruction *BasePointer,
+ AllocaInst *StackGuardSlot);
/// \brief Generate code to restore the stack after all stack restore points
/// in \p StackRestorePoints.
@@ -378,6 +392,16 @@ Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) {
return UnsafeStackPtr;
}
+Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
+ Value *StackGuardVar = nullptr;
+ if (TL)
+ StackGuardVar = TL->getIRStackGuard(IRB);
+ if (!StackGuardVar)
+ StackGuardVar =
+ F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
+ return IRB.CreateLoad(StackGuardVar, "StackGuard");
+}
+
void SafeStack::findInsts(Function &F,
SmallVectorImpl<AllocaInst *> &StaticAllocas,
SmallVectorImpl<AllocaInst *> &DynamicAllocas,
@@ -431,6 +455,8 @@ AllocaInst *
SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
ArrayRef<Instruction *> StackRestorePoints,
Value *StaticTop, bool NeedDynamicTop) {
+ assert(StaticTop && "The stack top isn't set.");
+
if (StackRestorePoints.empty())
return nullptr;
@@ -441,19 +467,13 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
// runtime itself.
AllocaInst *DynamicTop = nullptr;
- if (NeedDynamicTop)
+ if (NeedDynamicTop) {
// If we also have dynamic alloca's, the stack pointer value changes
// throughout the function. For now we store it in an alloca.
DynamicTop = IRB.CreateAlloca(StackPtrTy, /*ArraySize=*/nullptr,
"unsafe_stack_dynamic_ptr");
-
- if (!StaticTop)
- // We need the original unsafe stack pointer value, even if there are
- // no unsafe static allocas.
- StaticTop = IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr");
-
- if (NeedDynamicTop)
IRB.CreateStore(StaticTop, DynamicTop);
+ }
// Restore current stack pointer after longjmp/exception catch.
for (Instruction *I : StackRestorePoints) {
@@ -467,83 +487,122 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
return DynamicTop;
}
+void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard) {
+ Value *V = IRB.CreateLoad(StackGuardSlot);
+ Value *Cmp = IRB.CreateICmpNE(StackGuard, V);
+
+ auto SuccessProb = BranchProbabilityInfo::getBranchProbStackProtector(true);
+ auto FailureProb = BranchProbabilityInfo::getBranchProbStackProtector(false);
+ MDNode *Weights = MDBuilder(F.getContext())
+ .createBranchWeights(SuccessProb.getNumerator(),
+ FailureProb.getNumerator());
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(Cmp, &RI,
+ /* Unreachable */ true, Weights);
+ IRBuilder<> IRBFail(CheckTerm);
+ // FIXME: respect -fsanitize-trap / -ftrap-function here?
+ Constant *StackChkFail = F.getParent()->getOrInsertFunction(
+ "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+ IRBFail.CreateCall(StackChkFail, {});
+}
+
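// The C-level shape of the check that checkStackGuard emits as IR (a
// conceptual sketch only: the pass also attaches branch weights, and
// __stack_chk_fail is assumed to be the usual runtime hook):
#include <cstdint>

extern "C" void __stack_chk_fail();   // Provided by the C runtime; never returns.

static void checkGuardSketch(std::uintptr_t GuardOnEntry,
                             const volatile std::uintptr_t *GuardSlot) {
  if (*GuardSlot != GuardOnEntry)   // Slot was overwritten: stack smashing.
    __stack_chk_fail();
}
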
+/// We explicitly compute and set the unsafe stack layout for all unsafe
+/// static alloca instructions. We save the unsafe "base pointer" in the
+/// prologue into a local variable and restore it in the epilogue.
Value *SafeStack::moveStaticAllocasToUnsafeStack(
IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas,
- ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) {
+ ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns,
+ Instruction *BasePointer, AllocaInst *StackGuardSlot) {
if (StaticAllocas.empty() && ByValArguments.empty())
- return nullptr;
+ return BasePointer;
DIBuilder DIB(*F.getParent());
- // We explicitly compute and set the unsafe stack layout for all unsafe
- // static alloca instructions. We save the unsafe "base pointer" in the
- // prologue into a local variable and restore it in the epilogue.
-
- // Load the current stack pointer (we'll also use it as a base pointer).
- // FIXME: use a dedicated register for it ?
- Instruction *BasePointer =
- IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr");
- assert(BasePointer->getType() == StackPtrTy);
+ StackColoring SSC(F, StaticAllocas);
+ SSC.run();
+ SSC.removeAllMarkers();
- for (ReturnInst *RI : Returns) {
- IRB.SetInsertPoint(RI);
- IRB.CreateStore(BasePointer, UnsafeStackPtr);
+ // Unsafe stack always grows down.
+ StackLayout SSL(StackAlignment);
+ if (StackGuardSlot) {
+ Type *Ty = StackGuardSlot->getAllocatedType();
+ unsigned Align =
+ std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
+ Align, SSC.getLiveRange(StackGuardSlot));
}
- // Compute maximum alignment among static objects on the unsafe stack.
- unsigned MaxAlignment = 0;
for (Argument *Arg : ByValArguments) {
Type *Ty = Arg->getType()->getPointerElementType();
+ uint64_t Size = DL->getTypeStoreSize(Ty);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
Arg->getParamAlignment());
- if (Align > MaxAlignment)
- MaxAlignment = Align;
+ SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
+
for (AllocaInst *AI : StaticAllocas) {
Type *Ty = AI->getAllocatedType();
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
unsigned Align =
std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
- if (Align > MaxAlignment)
- MaxAlignment = Align;
+
+ SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
}
- if (MaxAlignment > StackAlignment) {
+ SSL.computeLayout();
+ unsigned FrameAlignment = SSL.getFrameAlignment();
+
+ // FIXME: tell SSL that we start at a less-than-MaxAlignment aligned location
+ // (AlignmentSkew).
+ if (FrameAlignment > StackAlignment) {
// Re-align the base pointer according to the max requested alignment.
- assert(isPowerOf2_32(MaxAlignment));
+ assert(isPowerOf2_32(FrameAlignment));
IRB.SetInsertPoint(BasePointer->getNextNode());
BasePointer = cast<Instruction>(IRB.CreateIntToPtr(
IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy),
- ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))),
+ ConstantInt::get(IntPtrTy, ~uint64_t(FrameAlignment - 1))),
StackPtrTy));
}
- int64_t StaticOffset = 0; // Current stack top.
IRB.SetInsertPoint(BasePointer->getNextNode());
+ if (StackGuardSlot) {
+ unsigned Offset = SSL.getObjectOffset(StackGuardSlot);
+ Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *NewAI =
+ IRB.CreateBitCast(Off, StackGuardSlot->getType(), "StackGuardSlot");
+
+ // Replace alloc with the new location.
+ StackGuardSlot->replaceAllUsesWith(NewAI);
+ StackGuardSlot->eraseFromParent();
+ }
+
for (Argument *Arg : ByValArguments) {
+ unsigned Offset = SSL.getObjectOffset(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
uint64_t Size = DL->getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
- // Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
- Arg->getParamAlignment());
-
- // Add alignment.
- // NOTE: we ensure that BasePointer itself is aligned to >= Align.
- StaticOffset += Size;
- StaticOffset = RoundUpToAlignment(StaticOffset, Align);
-
Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
- ConstantInt::get(Int32Ty, -StaticOffset));
+ ConstantInt::get(Int32Ty, -Offset));
Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(),
Arg->getName() + ".unsafe-byval");
// Replace alloc with the new location.
replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
- /*Deref=*/true, -StaticOffset);
+ /*Deref=*/true, -Offset);
Arg->replaceAllUsesWith(NewArg);
IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
@@ -552,43 +611,58 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Allocate space for every unsafe static AllocaInst on the unsafe stack.
for (AllocaInst *AI : StaticAllocas) {
IRB.SetInsertPoint(AI);
+ unsigned Offset = SSL.getObjectOffset(AI);
- Type *Ty = AI->getAllocatedType();
uint64_t Size = getStaticAllocaAllocationSize(AI);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
- // Ensure the object is properly aligned.
- unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
-
- // Add alignment.
- // NOTE: we ensure that BasePointer itself is aligned to >= Align.
- StaticOffset += Size;
- StaticOffset = RoundUpToAlignment(StaticOffset, Align);
-
- Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
- ConstantInt::get(Int32Ty, -StaticOffset));
- Value *NewAI = IRB.CreateBitCast(Off, AI->getType(), AI->getName());
- if (AI->hasName() && isa<Instruction>(NewAI))
- cast<Instruction>(NewAI)->takeName(AI);
+ replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -Offset);
+ replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
+
+ // Replace uses of the alloca with the new location.
+ // Insert address calculation close to each use to work around PR27844.
+ std::string Name = std::string(AI->getName()) + ".unsafe";
+ while (!AI->use_empty()) {
+ Use &U = *AI->use_begin();
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Instruction *InsertBefore;
+ if (auto *PHI = dyn_cast<PHINode>(User))
+ InsertBefore = PHI->getIncomingBlock(U)->getTerminator();
+ else
+ InsertBefore = User;
+
+ IRBuilder<> IRBUser(InsertBefore);
+ Value *Off = IRBUser.CreateGEP(BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *Replacement = IRBUser.CreateBitCast(Off, AI->getType(), Name);
+
+ if (auto *PHI = dyn_cast<PHINode>(User)) {
+ // PHI nodes may have multiple incoming edges from the same BB;
+ // all must be updated at once with the same incoming value.
+ auto *BB = PHI->getIncomingBlock(U);
+ for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I)
+ if (PHI->getIncomingBlock(I) == BB)
+ PHI->setIncomingValue(I, Replacement);
+ } else {
+ U.set(Replacement);
+ }
+ }
- // Replace alloc with the new location.
- replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -StaticOffset);
- AI->replaceAllUsesWith(NewAI);
AI->eraseFromParent();
}
// Re-align BasePointer so that our callees would see it aligned as
// expected.
// FIXME: no need to update BasePointer in leaf functions.
- StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment);
+ unsigned FrameSize = alignTo(SSL.getFrameSize(), StackAlignment);
// Update shadow stack pointer in the function epilogue.
IRB.SetInsertPoint(BasePointer->getNextNode());
Value *StaticTop =
- IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset),
+ IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -FrameSize),
"unsafe_stack_static_top");
IRB.CreateStore(StaticTop, UnsafeStackPtr);
return StaticTop;
@@ -681,18 +755,6 @@ bool SafeStack::runOnFunction(Function &F) {
TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- {
- // Make sure the regular stack protector won't run on this function
- // (safestack attribute takes precedence).
- AttrBuilder B;
- B.addAttribute(Attribute::StackProtect)
- .addAttribute(Attribute::StackProtectReq)
- .addAttribute(Attribute::StackProtectStrong);
- F.removeAttributes(
- AttributeSet::FunctionIndex,
- AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B));
- }
-
++NumFunctions;
SmallVector<AllocaInst *, 16> StaticAllocas;
@@ -726,9 +788,32 @@ bool SafeStack::runOnFunction(Function &F) {
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F);
- // The top of the unsafe stack after all unsafe static allocas are allocated.
- Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas,
- ByValArguments, Returns);
+ // Load the current stack pointer (we'll also use it as a base pointer).
+ // FIXME: use a dedicated register for it ?
+ Instruction *BasePointer =
+ IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr");
+ assert(BasePointer->getType() == StackPtrTy);
+
+ AllocaInst *StackGuardSlot = nullptr;
+ // FIXME: implement weaker forms of stack protector.
+ if (F.hasFnAttribute(Attribute::StackProtect) ||
+ F.hasFnAttribute(Attribute::StackProtectStrong) ||
+ F.hasFnAttribute(Attribute::StackProtectReq)) {
+ Value *StackGuard = getStackGuard(IRB, F);
+ StackGuardSlot = IRB.CreateAlloca(StackPtrTy, nullptr);
+ IRB.CreateStore(StackGuard, StackGuardSlot);
+
+ for (ReturnInst *RI : Returns) {
+ IRBuilder<> IRBRet(RI);
+ checkStackGuard(IRBRet, F, *RI, StackGuardSlot, StackGuard);
+ }
+ }
+
+ // The top of the unsafe stack after all unsafe static allocas are
+ // allocated.
+ Value *StaticTop =
+ moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, ByValArguments,
+ Returns, BasePointer, StackGuardSlot);
// Safe stack object that stores the current unsafe stack top. It is updated
// as unsafe dynamic (non-constant-sized) allocas are allocated and freed.
@@ -743,6 +828,12 @@ bool SafeStack::runOnFunction(Function &F) {
moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop,
DynamicAllocas);
+ // Restore the unsafe stack pointer before each return.
+ for (ReturnInst *RI : Returns) {
+ IRB.SetInsertPoint(RI);
+ IRB.CreateStore(BasePointer, UnsafeStackPtr);
+ }
+
DEBUG(dbgs() << "[SafeStack] safestack applied\n");
return true;
}
diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp
new file mode 100644
index 000000000000..709614f57e7d
--- /dev/null
+++ b/lib/CodeGen/SafeStackColoring.cpp
@@ -0,0 +1,289 @@
+//===-- SafeStackColoring.cpp - SafeStack frame coloring -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackColoring.h"
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestackcoloring"
+
+static cl::opt<bool> ClColoring("safe-stack-coloring",
+ cl::desc("enable safe stack coloring"),
+ cl::Hidden, cl::init(true));
+
+const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
+ return LiveRanges[AllocaNumbering[AI]];
+}
+
+bool StackColoring::readMarker(Instruction *I, bool *IsStart) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II || (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end))
+ return false;
+
+ *IsStart = II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return true;
+}
+
+void StackColoring::removeAllMarkers() {
+ for (auto *I : Markers) {
+ auto *Op = dyn_cast<Instruction>(I->getOperand(1));
+ I->eraseFromParent();
+ // Remove the operand bitcast, too, if it has no more uses left.
+ if (Op && Op->use_empty())
+ Op->eraseFromParent();
+ }
+}
+
+void StackColoring::collectMarkers() {
+ InterestingAllocas.resize(NumAllocas);
+ DenseMap<BasicBlock *, SmallDenseMap<Instruction *, Marker>> BBMarkerSet;
+
+ // Compute the set of start/end markers per basic block.
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ AllocaInst *AI = Allocas[AllocaNo];
+ SmallVector<Instruction *, 8> WorkList;
+ WorkList.push_back(AI);
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ for (User *U : I->users()) {
+ if (auto *BI = dyn_cast<BitCastInst>(U)) {
+ WorkList.push_back(BI);
+ continue;
+ }
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI)
+ continue;
+ bool IsStart;
+ if (!readMarker(UI, &IsStart))
+ continue;
+ if (IsStart)
+ InterestingAllocas.set(AllocaNo);
+ BBMarkerSet[UI->getParent()][UI] = {AllocaNo, IsStart};
+ Markers.push_back(UI);
+ }
+ }
+ }
+
+ // Compute instruction numbering. Only the following instructions are
+ // considered:
+ // * Basic block entries
+ // * Lifetime markers
+ // For each basic block, compute
+ // * the list of markers in the instruction order
+ // * the sets of allocas whose lifetime starts or ends in this BB
+ DEBUG(dbgs() << "Instructions:\n");
+ unsigned InstNo = 0;
+ for (BasicBlock *BB : depth_first(&F)) {
+ DEBUG(dbgs() << " " << InstNo << ": BB " << BB->getName() << "\n");
+ unsigned BBStart = InstNo++;
+
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+ BlockInfo.Begin.resize(NumAllocas);
+ BlockInfo.End.resize(NumAllocas);
+ BlockInfo.LiveIn.resize(NumAllocas);
+ BlockInfo.LiveOut.resize(NumAllocas);
+
+ auto &BlockMarkerSet = BBMarkerSet[BB];
+ if (BlockMarkerSet.empty()) {
+ unsigned BBEnd = InstNo;
+ BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ continue;
+ }
+
+ auto ProcessMarker = [&](Instruction *I, const Marker &M) {
+ DEBUG(dbgs() << " " << InstNo << ": "
+ << (M.IsStart ? "start " : "end ") << M.AllocaNo << ", "
+ << *I << "\n");
+
+ BBMarkers[BB].push_back({InstNo, M});
+
+ InstructionNumbering[I] = InstNo++;
+
+ if (M.IsStart) {
+ if (BlockInfo.End.test(M.AllocaNo))
+ BlockInfo.End.reset(M.AllocaNo);
+ BlockInfo.Begin.set(M.AllocaNo);
+ } else {
+ if (BlockInfo.Begin.test(M.AllocaNo))
+ BlockInfo.Begin.reset(M.AllocaNo);
+ BlockInfo.End.set(M.AllocaNo);
+ }
+ };
+
+ if (BlockMarkerSet.size() == 1) {
+ ProcessMarker(BlockMarkerSet.begin()->getFirst(),
+ BlockMarkerSet.begin()->getSecond());
+ } else {
+ // Scan the BB to determine the marker order.
+ for (Instruction &I : *BB) {
+ auto It = BlockMarkerSet.find(&I);
+ if (It == BlockMarkerSet.end())
+ continue;
+ ProcessMarker(&I, It->getSecond());
+ }
+ }
+
+ unsigned BBEnd = InstNo;
+ BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ }
+ NumInst = InstNo;
+}
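A worked illustration of the numbering scheme described above, using a hypothetical three-block function (block and alloca names are made up):

// Depth-first visit order: entry, if.then, if.end; lifetime markers for
// alloca #0 appear only in if.then.
//
//   0: BB entry                 (BBStart of entry; no markers)
//   1: BB if.then
//   2:   lifetime.start, alloca #0
//   3:   lifetime.end,   alloca #0
//   4: BB if.end
//
// BlockInstRange becomes entry = [0, 1), if.then = [1, 4), if.end = [4, 5),
// and NumInst ends up as 5.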
+
+void StackColoring::calculateLocalLiveness() {
+ bool changed = true;
+ while (changed) {
+ changed = false;
+
+ for (BasicBlock *BB : depth_first(&F)) {
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+
+ // Compute LiveIn by unioning together the LiveOut sets of all preds.
+ BitVector LocalLiveIn;
+ for (auto *PredBB : predecessors(BB)) {
+ LivenessMap::const_iterator I = BlockLiveness.find(PredBB);
+ assert(I != BlockLiveness.end() && "Predecessor not found");
+ LocalLiveIn |= I->second.LiveOut;
+ }
+
+ // Compute LiveOut by subtracting out lifetimes that end in this
+ // block, then adding in lifetimes that begin in this block. If
+ // we have both BEGIN and END markers in the same basic block
+ // then we know that the BEGIN marker comes after the END,
+ // because we already handle the case where the BEGIN comes
+ // before the END when collecting the markers (and building the
+ // BEGIN/END vectors).
+ BitVector LocalLiveOut = LocalLiveIn;
+ LocalLiveOut.reset(BlockInfo.End);
+ LocalLiveOut |= BlockInfo.Begin;
+
+ // Update block LiveIn set, noting whether it has changed.
+ if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+ changed = true;
+ BlockInfo.LiveIn |= LocalLiveIn;
+ }
+
+ // Update block LiveOut set, noting whether it has changed.
+ if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+ changed = true;
+ BlockInfo.LiveOut |= LocalLiveOut;
+ }
+ }
+ } // while changed.
+}
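The same forward dataflow can be written as a small standalone sketch, using toy block indices and std::bitset in place of the pass's BasicBlock and BitVector (purely illustrative, not the pass's types):

#include <bitset>
#include <cstddef>
#include <vector>

constexpr std::size_t N = 4; // number of allocas in the toy model

struct ToyBlock {
  std::vector<int> Preds;         // indices of predecessor blocks
  std::bitset<N> Begin, End;      // per-block BEGIN / END marker sets
  std::bitset<N> LiveIn, LiveOut; // dataflow solution
};

// Iterate to a fixed point: LiveIn = union of predecessors' LiveOut,
// LiveOut = (LiveIn minus End) union Begin, as in the loop above.
void solveLiveness(std::vector<ToyBlock> &Blocks) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (ToyBlock &B : Blocks) {
      std::bitset<N> In;
      for (int P : B.Preds)
        In |= Blocks[P].LiveOut;
      std::bitset<N> Out = (In & ~B.End) | B.Begin;
      if ((B.LiveIn | In) != B.LiveIn) {
        B.LiveIn |= In;
        Changed = true;
      }
      if ((B.LiveOut | Out) != B.LiveOut) {
        B.LiveOut |= Out;
        Changed = true;
      }
    }
  }
}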
+
+void StackColoring::calculateLiveIntervals() {
+ for (auto IT : BlockLiveness) {
+ BasicBlock *BB = IT.getFirst();
+ BlockLifetimeInfo &BlockInfo = IT.getSecond();
+ unsigned BBStart, BBEnd;
+ std::tie(BBStart, BBEnd) = BlockInstRange[BB];
+
+ BitVector Started, Ended;
+ Started.resize(NumAllocas);
+ Ended.resize(NumAllocas);
+ SmallVector<unsigned, 8> Start;
+ Start.resize(NumAllocas);
+
+ // LiveIn ranges start at the first instruction.
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ if (BlockInfo.LiveIn.test(AllocaNo)) {
+ Started.set(AllocaNo);
+ Start[AllocaNo] = BBStart;
+ }
+ }
+
+ for (auto &It : BBMarkers[BB]) {
+ unsigned InstNo = It.first;
+ bool IsStart = It.second.IsStart;
+ unsigned AllocaNo = It.second.AllocaNo;
+
+ if (IsStart) {
+ assert(!Started.test(AllocaNo));
+ Started.set(AllocaNo);
+ Ended.reset(AllocaNo);
+ Start[AllocaNo] = InstNo;
+ } else {
+ assert(!Ended.test(AllocaNo));
+ if (Started.test(AllocaNo)) {
+ LiveRanges[AllocaNo].AddRange(Start[AllocaNo], InstNo);
+ Started.reset(AllocaNo);
+ }
+ Ended.set(AllocaNo);
+ }
+ }
+
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
+ if (Started.test(AllocaNo))
+ LiveRanges[AllocaNo].AddRange(Start[AllocaNo], BBEnd);
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
+ dbgs() << "Allocas:\n";
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
+ dbgs() << " " << AllocaNo << ": " << *Allocas[AllocaNo] << "\n";
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpBlockLiveness() {
+ dbgs() << "Block liveness:\n";
+ for (auto IT : BlockLiveness) {
+ BasicBlock *BB = IT.getFirst();
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+ auto BlockRange = BlockInstRange[BB];
+ dbgs() << " BB [" << BlockRange.first << ", " << BlockRange.second
+ << "): begin " << BlockInfo.Begin << ", end " << BlockInfo.End
+ << ", livein " << BlockInfo.LiveIn << ", liveout "
+ << BlockInfo.LiveOut << "\n";
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
+ dbgs() << "Alloca liveness:\n";
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ LiveRange &Range = LiveRanges[AllocaNo];
+ dbgs() << " " << AllocaNo << ": " << Range << "\n";
+ }
+}
+
+void StackColoring::run() {
+ DEBUG(dumpAllocas());
+
+ for (unsigned I = 0; I < NumAllocas; ++I)
+ AllocaNumbering[Allocas[I]] = I;
+ LiveRanges.resize(NumAllocas);
+
+ collectMarkers();
+
+ if (!ClColoring) {
+ for (auto &R : LiveRanges) {
+ R.SetMaximum(1);
+ R.AddRange(0, 1);
+ }
+ return;
+ }
+
+ for (auto &R : LiveRanges)
+ R.SetMaximum(NumInst);
+ for (unsigned I = 0; I < NumAllocas; ++I)
+ if (!InterestingAllocas.test(I))
+ LiveRanges[I] = getFullLiveRange();
+
+ calculateLocalLiveness();
+ DEBUG(dumpBlockLiveness());
+ calculateLiveIntervals();
+ DEBUG(dumpLiveRanges());
+}
diff --git a/lib/CodeGen/SafeStackColoring.h b/lib/CodeGen/SafeStackColoring.h
new file mode 100644
index 000000000000..08b179ccb7f1
--- /dev/null
+++ b/lib/CodeGen/SafeStackColoring.h
@@ -0,0 +1,149 @@
+//===-- SafeStackColoring.h - SafeStack frame coloring ---------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
+#define LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_os_ostream.h"
+
+namespace llvm {
+class AllocaInst;
+
+namespace safestack {
+/// Compute live ranges of allocas.
+/// Live ranges are represented as sets of "interesting" instructions, which are
+/// defined as instructions that may start or end an alloca's lifetime. These
+/// are:
+/// * lifetime.start and lifetime.end intrinsics
+/// * first instruction of any basic block
+/// Interesting instructions are numbered in the depth-first walk of the CFG,
+/// and in the program order inside each basic block.
+class StackColoring {
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+ /// Which slots BEGIN in each basic block.
+ BitVector Begin;
+ /// Which slots END in each basic block.
+ BitVector End;
+ /// Which slots are marked as LIVE_IN, coming into each basic block.
+ BitVector LiveIn;
+ /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+ BitVector LiveOut;
+ };
+
+public:
+ /// This class represents a set of interesting instructions where an alloca is
+ /// live.
+ struct LiveRange {
+ BitVector bv;
+ void SetMaximum(int size) { bv.resize(size); }
+ void AddRange(unsigned start, unsigned end) { bv.set(start, end); }
+ bool Overlaps(const LiveRange &Other) const {
+ return bv.anyCommon(Other.bv);
+ }
+ void Join(const LiveRange &Other) { bv |= Other.bv; }
+ };
+
+private:
+ Function &F;
+
+ /// Maps active slots (per bit) for each basic block.
+ typedef DenseMap<BasicBlock *, BlockLifetimeInfo> LivenessMap;
+ LivenessMap BlockLiveness;
+
+ /// Number of interesting instructions.
+ int NumInst;
+ /// Numeric ids for interesting instructions.
+ DenseMap<Instruction *, unsigned> InstructionNumbering;
+ /// A range [Start, End) of instruction ids for each basic block.
+ /// Instructions inside each BB have monotonic and consecutive ids.
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+ ArrayRef<AllocaInst *> Allocas;
+ unsigned NumAllocas;
+ DenseMap<AllocaInst *, unsigned> AllocaNumbering;
+ /// LiveRange for allocas.
+ SmallVector<LiveRange, 8> LiveRanges;
+
+ /// The set of allocas that have at least one lifetime.start. All other
+ /// allocas get LiveRange that corresponds to the entire function.
+ BitVector InterestingAllocas;
+ SmallVector<Instruction *, 8> Markers;
+
+ struct Marker {
+ unsigned AllocaNo;
+ bool IsStart;
+ };
+
+ /// List of {InstNo, {AllocaNo, IsStart}} for each BB, ordered by InstNo.
+ DenseMap<BasicBlock *, SmallVector<std::pair<unsigned, Marker>, 4>> BBMarkers;
+
+ void dumpAllocas();
+ void dumpBlockLiveness();
+ void dumpLiveRanges();
+
+ bool readMarker(Instruction *I, bool *IsStart);
+ void collectMarkers();
+ void calculateLocalLiveness();
+ void calculateLiveIntervals();
+
+public:
+ StackColoring(Function &F, ArrayRef<AllocaInst *> Allocas)
+ : F(F), NumInst(-1), Allocas(Allocas), NumAllocas(Allocas.size()) {}
+
+ void run();
+ void removeAllMarkers();
+
+ /// Returns a set of "interesting" instructions where the given alloca is
+ /// live. Not all instructions in a function are interesting: we pick a set
+ /// that is large enough for LiveRange::Overlaps to be correct.
+ const LiveRange &getLiveRange(AllocaInst *AI);
+
+ /// Returns a live range that represents an alloca that is live throughout the
+ /// entire function.
+ LiveRange getFullLiveRange() {
+ assert(NumInst >= 0);
+ LiveRange R;
+ R.SetMaximum(NumInst);
+ R.AddRange(0, NumInst);
+ return R;
+ }
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const BitVector &V) {
+ OS << "{";
+ int idx = V.find_first();
+ bool first = true;
+ while (idx >= 0) {
+ if (!first) {
+ OS << ", ";
+ }
+ first = false;
+ OS << idx;
+ idx = V.find_next(idx);
+ }
+ OS << "}";
+ return OS;
+}
+
+static inline raw_ostream &operator<<(raw_ostream &OS,
+ const StackColoring::LiveRange &R) {
+ return OS << R.bv;
+}
+
+} // namespace safestack
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
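A minimal sketch of the intended driver pattern, mirroring how SafeStack.cpp uses this class earlier in the patch; the helper name is made up, and the code assumes it is compiled inside lib/CodeGen next to this header:

#include "SafeStackColoring.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;
using namespace llvm::safestack;

// Returns true if A and B (both elements of Allocas) are never live at the
// same time and could therefore share an unsafe-stack slot.
static bool mayShareSlot(Function &F, ArrayRef<AllocaInst *> Allocas,
                         AllocaInst *A, AllocaInst *B) {
  StackColoring SSC(F, Allocas);
  SSC.run();              // number instructions, solve liveness
  bool Disjoint = !SSC.getLiveRange(A).Overlaps(SSC.getLiveRange(B));
  SSC.removeAllMarkers(); // lifetime markers are not needed afterwards
  return Disjoint;
}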
diff --git a/lib/CodeGen/SafeStackLayout.cpp b/lib/CodeGen/SafeStackLayout.cpp
new file mode 100644
index 000000000000..b8190e0f2153
--- /dev/null
+++ b/lib/CodeGen/SafeStackLayout.cpp
@@ -0,0 +1,138 @@
+//===-- SafeStackLayout.cpp - SafeStack frame layout -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackLayout.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestacklayout"
+
+static cl::opt<bool> ClLayout("safe-stack-layout",
+ cl::desc("enable safe stack layout"), cl::Hidden,
+ cl::init(true));
+
+LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
+ OS << "Stack regions:\n";
+ for (unsigned i = 0; i < Regions.size(); ++i) {
+ OS << " " << i << ": [" << Regions[i].Start << ", " << Regions[i].End
+ << "), range " << Regions[i].Range << "\n";
+ }
+ OS << "Stack objects:\n";
+ for (auto &IT : ObjectOffsets) {
+ OS << " at " << IT.getSecond() << ": " << *IT.getFirst() << "\n";
+ }
+}
+
+void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
+ const StackColoring::LiveRange &Range) {
+ StackObjects.push_back({V, Size, Alignment, Range});
+ MaxAlignment = std::max(MaxAlignment, Alignment);
+}
+
+static unsigned AdjustStackOffset(unsigned Offset, unsigned Size,
+ unsigned Alignment) {
+ return alignTo(Offset + Size, Alignment) - Size;
+}
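Offsets in this file count downwards from the (aligned) base pointer, and the value later recorded for an object is the offset of its end, so the formula picks the smallest start >= Offset whose end is a multiple of Alignment. A quick worked example with plain numbers:

// AdjustStackOffset(/*Offset=*/20, /*Size=*/8, /*Alignment=*/16)
//   == alignTo(20 + 8, 16) - 8 == 32 - 8 == 24
// The object occupies [24, 32) in the frame, and its lowest address,
// Base - 32, is 16-byte aligned whenever Base itself is.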
+
+void StackLayout::layoutObject(StackObject &Obj) {
+ if (!ClLayout) {
+ // If layout is disabled, just grab the next aligned address.
+ // This effectively disables stack coloring as well.
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ unsigned Start = AdjustStackOffset(LastRegionEnd, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ Regions.emplace_back(Start, End, Obj.Range);
+ ObjectOffsets[Obj.Handle] = End;
+ return;
+ }
+
+ DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align " << Obj.Alignment
+ << ", range " << Obj.Range << "\n");
+ assert(Obj.Alignment <= MaxAlignment);
+ unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ DEBUG(dbgs() << " First candidate: " << Start << " .. " << End << "\n");
+ for (const StackRegion &R : Regions) {
+ DEBUG(dbgs() << " Examining region: " << R.Start << " .. " << R.End
+ << ", range " << R.Range << "\n");
+ assert(End >= R.Start);
+ if (Start >= R.End) {
+ DEBUG(dbgs() << " Does not intersect, skip.\n");
+ continue;
+ }
+ if (Obj.Range.Overlaps(R.Range)) {
+ // Find the next appropriate location.
+ Start = AdjustStackOffset(R.End, Obj.Size, Obj.Alignment);
+ End = Start + Obj.Size;
+ DEBUG(dbgs() << " Overlaps. Next candidate: " << Start << " .. " << End
+ << "\n");
+ continue;
+ }
+ if (End <= R.End) {
+ DEBUG(dbgs() << " Reusing region(s).\n");
+ break;
+ }
+ }
+
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ if (End > LastRegionEnd) {
+ // Insert a new region at the end. Maybe two.
+ if (Start > LastRegionEnd) {
+ DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
+ << Start << "\n");
+ Regions.emplace_back(LastRegionEnd, Start, StackColoring::LiveRange());
+ LastRegionEnd = Start;
+ }
+ DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. " << End
+ << ", range " << Obj.Range << "\n");
+ Regions.emplace_back(LastRegionEnd, End, Obj.Range);
+ LastRegionEnd = End;
+ }
+
+ // Split starting and ending regions if necessary.
+ for (StackRegion &R : Regions) {
+ if (Start > R.Start && Start < R.End) {
+ StackRegion R0 = R;
+ R.Start = R0.End = Start;
+ Regions.insert(&R, R0);
+ continue;
+ }
+ if (End > R.Start && End < R.End) {
+ StackRegion R0 = R;
+ R0.End = R.Start = End;
+ Regions.insert(&R, R0);
+ break;
+ }
+ }
+
+ // Update live ranges for all affected regions.
+ for (StackRegion &R : Regions) {
+ if (Start < R.End && End > R.Start)
+ R.Range.Join(Obj.Range);
+ if (End <= R.End)
+ break;
+ }
+
+ ObjectOffsets[Obj.Handle] = End;
+}
+
+void StackLayout::computeLayout() {
+ // Simple greedy algorithm.
+ // If this is replaced with something smarter, it must preserve the property
+ // that the first object is always at the offset 0 in the stack frame (for
+ // StackProtectorSlot), or handle stack protector in some other way.
+ for (auto &Obj : StackObjects)
+ layoutObject(Obj);
+
+ DEBUG(print(dbgs()));
+}
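A worked trace of layoutObject for two 8-byte, 8-aligned objects A and B (hypothetical handles, coloring and layout enabled):

// Disjoint live ranges:
//   A: Start = 0, End = 8 -> new region [0, 8) carrying A's range;
//      getObjectOffset(A) == 8.
//   B: Start = 0, End = 8 -> fits inside [0, 8) and its range does not
//      overlap the region's, so the region is reused;
//      getObjectOffset(B) == 8 and getFrameSize() == 8.
// Overlapping live ranges:
//   B collides with [0, 8) and is pushed to Start = 8, End = 16; a new
//   region [8, 16) is created, getObjectOffset(B) == 16, frame size 16.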
diff --git a/lib/CodeGen/SafeStackLayout.h b/lib/CodeGen/SafeStackLayout.h
new file mode 100644
index 000000000000..313ed21c8869
--- /dev/null
+++ b/lib/CodeGen/SafeStackLayout.h
@@ -0,0 +1,68 @@
+//===-- SafeStackLayout.h - SafeStack frame layout -------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+#define LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+
+#include "SafeStackColoring.h"
+
+namespace llvm {
+namespace safestack {
+
+/// Compute the layout of an unsafe stack frame.
+class StackLayout {
+ unsigned MaxAlignment;
+
+ struct StackRegion {
+ unsigned Start;
+ unsigned End;
+ StackColoring::LiveRange Range;
+ StackRegion(unsigned Start, unsigned End,
+ const StackColoring::LiveRange &Range)
+ : Start(Start), End(End), Range(Range) {}
+ };
+ /// The list of current stack regions, sorted by StackRegion::Start.
+ SmallVector<StackRegion, 16> Regions;
+
+ struct StackObject {
+ const Value *Handle;
+ unsigned Size, Alignment;
+ StackColoring::LiveRange Range;
+ };
+ SmallVector<StackObject, 8> StackObjects;
+
+ DenseMap<const Value *, unsigned> ObjectOffsets;
+
+ void layoutObject(StackObject &Obj);
+
+public:
+ StackLayout(unsigned StackAlignment) : MaxAlignment(StackAlignment) {}
+ /// Add an object to the stack frame. Value pointer is opaque and used as a
+ /// handle to retrieve the object's offset in the frame later.
+ void addObject(const Value *V, unsigned Size, unsigned Alignment,
+ const StackColoring::LiveRange &Range);
+
+ /// Run the layout computation for all previously added objects.
+ void computeLayout();
+
+ /// Returns the offset to the object start in the stack frame.
+ unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; }
+
+ /// Returns the size of the entire frame.
+ unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; }
+
+ /// Returns the alignment of the frame.
+ unsigned getFrameAlignment() { return MaxAlignment; }
+ void print(raw_ostream &OS);
+};
+
+} // namespace safestack
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
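The expected call sequence mirrors how moveStaticAllocasToUnsafeStack drives this class earlier in the patch. A compact sketch (the helper name is made up, and sizes/alignments are passed in rather than recomputed from the DataLayout):

#include "SafeStackLayout.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;
using namespace llvm::safestack;

// Lay out the allocas and return the rounded-up frame size. Offsets can then
// be read back with SSL.getObjectOffset(AI): the distance from the unsafe
// stack base down to the object's end.
static unsigned layoutUnsafeFrame(StackLayout &SSL, StackColoring &SSC,
                                  ArrayRef<AllocaInst *> StaticAllocas,
                                  ArrayRef<unsigned> Sizes,
                                  ArrayRef<unsigned> Aligns,
                                  unsigned StackAlignment) {
  for (unsigned I = 0, E = StaticAllocas.size(); I != E; ++I)
    SSL.addObject(StaticAllocas[I], Sizes[I], Aligns[I],
                  SSC.getLiveRange(StaticAllocas[I]));
  SSL.computeLayout();
  return alignTo(SSL.getFrameSize(), StackAlignment);
}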
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 11b246a8de25..22bfd4d1964c 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -14,11 +14,11 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -27,6 +27,8 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -36,7 +38,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <queue>
using namespace llvm;
@@ -49,12 +50,51 @@ static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
+// Note: the two options below might be used in tuning compile time vs
+// output quality. Setting HugeRegion so large that it will never be
+// reached means best-effort, but may be slow.
+
+// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
+// together hold this many SUs, a reduction of maps will be done.
+static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
+ cl::init(1000), cl::desc("The limit to use while constructing the DAG "
+ "prior to scheduling, at which point a trade-off "
+ "is made to avoid excessive compile time."));
+
+static cl::opt<unsigned> ReductionSize(
+ "dag-maps-reduction-size", cl::Hidden,
+ cl::desc("A huge scheduling region will have maps reduced by this many "
+ "nodes at a time. Defaults to HugeRegion / 2."));
+
+static unsigned getReductionSize() {
+ // Always reduce a huge region with half of the elements, except
+ // when user sets this number explicitly.
+ if (ReductionSize.getNumOccurrences() == 0)
+ return HugeRegion / 2;
+ return ReductionSize;
+}
+
+static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << "{ ";
+ for (auto *su : L) {
+ dbgs() << "SU(" << su->NodeNum << ")";
+ if (su != L.back())
+ dbgs() << ", ";
+ }
+ dbgs() << "}\n";
+#endif
+}
+
ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineLoopInfo *mli,
bool RemoveKillFlags)
: ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
- TrackLaneMasks(false), FirstDbgValue(nullptr) {
+ TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr),
+ UnknownValue(UndefValue::get(
+ Type::getVoidTy(mf.getFunction()->getContext()))),
+ FirstDbgValue(nullptr) {
DbgValues.clear();
const TargetSubtargetInfo &ST = mf.getSubtarget();
@@ -120,10 +160,6 @@ static void getUnderlyingObjects(const Value *V,
} while (!Working.empty());
}
-typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
-typedef SmallVector<PointerIntPair<ValueType, 1, bool>, 4>
-UnderlyingObjectsVector;
-
/// getUnderlyingObjectsForInstr - If this machine instr has memory reference
/// information and it can be tracked to a normal reference to a known
/// object, return the Value for that object.
@@ -131,46 +167,46 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
const MachineFrameInfo *MFI,
UnderlyingObjectsVector &Objects,
const DataLayout &DL) {
- if (!MI->hasOneMemOperand() ||
- (!(*MI->memoperands_begin())->getValue() &&
- !(*MI->memoperands_begin())->getPseudoValue()) ||
- (*MI->memoperands_begin())->isVolatile())
- return;
-
- if (const PseudoSourceValue *PSV =
- (*MI->memoperands_begin())->getPseudoValue()) {
- // Function that contain tail calls don't have unique PseudoSourceValue
- // objects. Two PseudoSourceValues might refer to the same or overlapping
- // locations. The client code calling this function assumes this is not the
- // case. So return a conservative answer of no known object.
- if (MFI->hasTailCall())
- return;
+ auto allMMOsOkay = [&]() {
+ for (const MachineMemOperand *MMO : MI->memoperands()) {
+ if (MMO->isVolatile())
+ return false;
+
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ // Functions that contain tail calls don't have unique PseudoSourceValue
+ // objects. Two PseudoSourceValues might refer to the same or
+ // overlapping locations. The client code calling this function assumes
+ // this is not the case. So return a conservative answer of no known
+ // object.
+ if (MFI->hasTailCall())
+ return false;
- // For now, ignore PseudoSourceValues which may alias LLVM IR values
- // because the code that uses this function has no way to cope with
- // such aliases.
- if (!PSV->isAliased(MFI)) {
- bool MayAlias = PSV->mayAlias(MFI);
- Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
- }
- return;
- }
+ // For now, ignore PseudoSourceValues which may alias LLVM IR values
+ // because the code that uses this function has no way to cope with
+ // such aliases.
+ if (PSV->isAliased(MFI))
+ return false;
- const Value *V = (*MI->memoperands_begin())->getValue();
- if (!V)
- return;
+ bool MayAlias = PSV->mayAlias(MFI);
+ Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
+ } else if (const Value *V = MMO->getValue()) {
+ SmallVector<Value *, 4> Objs;
+ getUnderlyingObjects(V, Objs, DL);
- SmallVector<Value *, 4> Objs;
- getUnderlyingObjects(V, Objs, DL);
+ for (Value *V : Objs) {
+ if (!isIdentifiedObject(V))
+ return false;
- for (Value *V : Objs) {
- if (!isIdentifiedObject(V)) {
- Objects.clear();
- return;
+ Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
+ }
+ } else
+ return false;
}
+ return true;
+ };
- Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
- }
+ if (!allMMOsOkay())
+ Objects.clear();
}
void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
@@ -475,10 +511,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
// VReg2SUnit for the non-overlapping part.
LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
- if (NonOverlapMask != 0)
- CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU));
V2SU.SU = SU;
V2SU.LaneMask = OverlapMask;
+ if (NonOverlapMask != 0)
+ CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
}
// If there was no CurrentVRegDefs entry for some lanes yet, create one.
if (LaneMask != 0)
@@ -518,84 +554,32 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
- (MI->hasOrderedMemoryRef() &&
- (!MI->mayLoad() || !MI->isInvariantLoad(AA)));
-}
-
-// This MI might have either incomplete info, or known to be unsafe
-// to deal with (i.e. volatile object).
-static inline bool isUnsafeMemoryObject(MachineInstr *MI,
- const MachineFrameInfo *MFI,
- const DataLayout &DL) {
- if (!MI || MI->memoperands_empty())
- return true;
- // We purposefully do no check for hasOneMemOperand() here
- // in hope to trigger an assert downstream in order to
- // finish implementation.
- if ((*MI->memoperands_begin())->isVolatile() ||
- MI->hasUnmodeledSideEffects())
- return true;
-
- if ((*MI->memoperands_begin())->getPseudoValue()) {
- // Similarly to getUnderlyingObjectForInstr:
- // For now, ignore PseudoSourceValues which may alias LLVM IR values
- // because the code that uses this function has no way to cope with
- // such aliases.
- return true;
- }
-
- const Value *V = (*MI->memoperands_begin())->getValue();
- if (!V)
- return true;
-
- SmallVector<Value *, 4> Objs;
- getUnderlyingObjects(V, Objs, DL);
- for (Value *V : Objs) {
- // Does this pointer refer to a distinct and identifiable object?
- if (!isIdentifiedObject(V))
- return true;
- }
-
- return false;
+ (MI->hasOrderedMemoryRef() && !MI->isInvariantLoad(AA));
}
/// This returns true if the two MIs need a chain edge between them.
-/// If these are not even memory operations, we still may need
-/// chain deps between them. The question really is - could
-/// these two MIs be reordered during scheduling from memory dependency
-/// point of view.
+/// This is called on normal stores and loads.
static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
const DataLayout &DL, MachineInstr *MIa,
MachineInstr *MIb) {
const MachineFunction *MF = MIa->getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- // Cover a trivial case - no edge is need to itself.
- if (MIa == MIb)
- return false;
-
- // Let the target decide if memory accesses cannot possibly overlap.
- if ((MIa->mayLoad() || MIa->mayStore()) &&
- (MIb->mayLoad() || MIb->mayStore()))
- if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
- return false;
+ assert ((MIa->mayStore() || MIb->mayStore()) &&
+ "Dependency checked between two loads");
- // FIXME: Need to handle multiple memory operands to support all targets.
- if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
- return true;
-
- if (isUnsafeMemoryObject(MIa, MFI, DL) || isUnsafeMemoryObject(MIb, MFI, DL))
- return true;
-
- // If we are dealing with two "normal" loads, we do not need an edge
- // between them - they could be reordered.
- if (!MIa->mayStore() && !MIb->mayStore())
+ // Let the target decide if memory accesses cannot possibly overlap.
+ if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA))
return false;
// To this point analysis is generic. From here on we do need AA.
if (!AA)
return true;
+ // FIXME: Need to handle multiple memory operands to support all targets.
+ if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+ return true;
+
MachineMemOperand *MMOa = *MIa->memoperands_begin();
MachineMemOperand *MMOb = *MIb->memoperands_begin();
@@ -634,106 +618,15 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
return (AAResult != NoAlias);
}
-/// This recursive function iterates over chain deps of SUb looking for
-/// "latest" node that needs a chain edge to SUa.
-static unsigned iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
- const DataLayout &DL, SUnit *SUa, SUnit *SUb,
- SUnit *ExitSU, unsigned *Depth,
- SmallPtrSetImpl<const SUnit *> &Visited) {
- if (!SUa || !SUb || SUb == ExitSU)
- return *Depth;
-
- // Remember visited nodes.
- if (!Visited.insert(SUb).second)
- return *Depth;
- // If there is _some_ dependency already in place, do not
- // descend any further.
- // TODO: Need to make sure that if that dependency got eliminated or ignored
- // for any reason in the future, we would not violate DAG topology.
- // Currently it does not happen, but makes an implicit assumption about
- // future implementation.
- //
- // Independently, if we encounter node that is some sort of global
- // object (like a call) we already have full set of dependencies to it
- // and we can stop descending.
- if (SUa->isSucc(SUb) ||
- isGlobalMemoryObject(AA, SUb->getInstr()))
- return *Depth;
-
- // If we do need an edge, or we have exceeded depth budget,
- // add that edge to the predecessors chain of SUb,
- // and stop descending.
- if (*Depth > 200 ||
- MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
- SUb->addPred(SDep(SUa, SDep::MayAliasMem));
- return *Depth;
- }
- // Track current depth.
- (*Depth)++;
- // Iterate over memory dependencies only.
- for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
- I != E; ++I)
- if (I->isNormalMemoryOrBarrier())
- iterateChainSucc(AA, MFI, DL, SUa, I->getSUnit(), ExitSU, Depth, Visited);
- return *Depth;
-}
-
-/// This function assumes that "downward" from SU there exist
-/// tail/leaf of already constructed DAG. It iterates downward and
-/// checks whether SU can be aliasing any node dominated
-/// by it.
-static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
- const DataLayout &DL, SUnit *SU, SUnit *ExitSU,
- std::set<SUnit *> &CheckList,
- unsigned LatencyToLoad) {
- if (!SU)
- return;
-
- SmallPtrSet<const SUnit*, 16> Visited;
- unsigned Depth = 0;
-
- for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
- I != IE; ++I) {
- if (SU == *I)
- continue;
- if (MIsNeedChainEdge(AA, MFI, DL, SU->getInstr(), (*I)->getInstr())) {
- SDep Dep(SU, SDep::MayAliasMem);
- Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
- (*I)->addPred(Dep);
- }
-
- // Iterate recursively over all previously added memory chain
- // successors. Keep track of visited nodes.
- for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
- JE = (*I)->Succs.end(); J != JE; ++J)
- if (J->isNormalMemoryOrBarrier())
- iterateChainSucc(AA, MFI, DL, SU, J->getSUnit(), ExitSU, &Depth,
- Visited);
- }
-}
-
-/// Check whether two objects need a chain edge, if so, add it
-/// otherwise remember the rejected SU.
-static inline void addChainDependency(AliasAnalysis *AA,
- const MachineFrameInfo *MFI,
- const DataLayout &DL, SUnit *SUa,
- SUnit *SUb, std::set<SUnit *> &RejectList,
- unsigned TrueMemOrderLatency = 0,
- bool isNormalMemory = false) {
- // If this is a false dependency,
- // do not add the edge, but remember the rejected node.
- if (MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
- SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
- Dep.setLatency(TrueMemOrderLatency);
+/// Check whether two objects need a chain edge and add it if needed.
+void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
+ unsigned Latency) {
+ if (MIsNeedChainEdge(AAForDep, MFI, MF.getDataLayout(), SUa->getInstr(),
+ SUb->getInstr())) {
+ SDep Dep(SUa, SDep::MayAliasMem);
+ Dep.setLatency(Latency);
SUb->addPred(Dep);
}
- else {
- // Duplicate entries should be ignored.
- RejectList.insert(SUb);
- DEBUG(dbgs() << "\tReject chain dep between SU("
- << SUa->NodeNum << ") and SU("
- << SUb->NodeNum << ")\n");
- }
}
/// Create an SUnit for each real instruction, numbered in top-down topological
@@ -752,16 +645,15 @@ void ScheduleDAGInstrs::initSUnits() {
// which is contained within a basic block.
SUnits.reserve(NumRegionInstrs);
- for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
- MachineInstr *MI = I;
- if (MI->isDebugValue())
+ for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) {
+ if (MI.isDebugValue())
continue;
- SUnit *SU = newSUnit(MI);
- MISUnitMap[MI] = SU;
+ SUnit *SU = newSUnit(&MI);
+ MISUnitMap[&MI] = SU;
- SU->isCall = MI->isCall();
- SU->isCommutable = MI->isCommutable();
+ SU->isCall = MI.isCall();
+ SU->isCommutable = MI.isCommutable();
// Assign the Latency field of SU using target-provided information.
SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
@@ -808,6 +700,19 @@ void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
+ // Ignore re-defs.
+ if (TrackLaneMasks) {
+ bool FoundDef = false;
+ for (const MachineOperand &MO2 : MI->operands()) {
+ if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
+ FoundDef = true;
+ break;
+ }
+ }
+ if (FoundDef)
+ continue;
+ }
+
// Record this local VReg use.
VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
for (; UI != VRegUses.end(); ++UI) {
@@ -819,17 +724,136 @@ void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
}
}
+class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
+
+ /// Current total number of SUs in map.
+ unsigned NumNodes;
+
+ /// 1 for loads, 0 for stores. (see comment in SUList)
+ unsigned TrueMemOrderLatency;
+public:
+
+ Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+
+ /// To keep NumNodes up to date, insert() is used instead of
+ /// this operator w/ push_back().
+ ValueType &operator[](const SUList &Key) {
+ llvm_unreachable("Don't use. Use insert() instead."); };
+
+ /// Add SU to the SUList of V. If Map grows huge, reduce its size
+ /// by calling reduce().
+ void inline insert(SUnit *SU, ValueType V) {
+ MapVector::operator[](V).push_back(SU);
+ NumNodes++;
+ }
+
+ /// Clears the list of SUs mapped to V.
+ void inline clearList(ValueType V) {
+ iterator Itr = find(V);
+ if (Itr != end()) {
+ assert (NumNodes >= Itr->second.size());
+ NumNodes -= Itr->second.size();
+
+ Itr->second.clear();
+ }
+ }
+
+ /// Clears map from all contents.
+ void clear() {
+ MapVector<ValueType, SUList>::clear();
+ NumNodes = 0;
+ }
+
+ unsigned inline size() const { return NumNodes; }
+
+ /// Count the number of SUs in this map after a reduction.
+ void reComputeSize(void) {
+ NumNodes = 0;
+ for (auto &I : *this)
+ NumNodes += I.second.size();
+ }
+
+ unsigned inline getTrueMemOrderLatency() const {
+ return TrueMemOrderLatency;
+ }
+
+ void dump();
+};
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap) {
+ for (auto &I : Val2SUsMap)
+ addChainDependencies(SU, I.second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap,
+ ValueType V) {
+ Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
+ if (Itr != Val2SUsMap.end())
+ addChainDependencies(SU, Itr->second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
+ assert (BarrierChain != nullptr);
+
+ for (auto &I : map) {
+ SUList &sus = I.second;
+ for (auto *SU : sus)
+ SU->addPredBarrier(BarrierChain);
+ }
+ map.clear();
+}
+
+void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
+ assert (BarrierChain != nullptr);
+
+ // Go through all lists of SUs.
+ for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
+ Value2SUsMap::iterator CurrItr = I++;
+ SUList &sus = CurrItr->second;
+ SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
+ for (; SUItr != SUEE; ++SUItr) {
+ // Stop on BarrierChain or any instruction above it.
+ if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
+ break;
+
+ (*SUItr)->addPredBarrier(BarrierChain);
+ }
+
+ // Also remove the BarrierChain from the list if present.
+ if (SUItr != SUEE && *SUItr == BarrierChain)
+ SUItr++;
+
+ // Remove all SUs that are now successors of BarrierChain.
+ if (SUItr != sus.begin())
+ sus.erase(sus.begin(), SUItr);
+ }
+
+ // Remove all entries with empty su lists.
+ map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
+ return (mapEntry.second.empty()); });
+
+ // Recompute the size of the map (NumNodes).
+ map.reComputeSize();
+}
+
/// If RegPressure is non-null, compute register pressure as a side effect. The
/// DAG builder is an efficient place to do it because it already visits
/// operands.
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
RegPressureTracker *RPTracker,
PressureDiffs *PDiffs,
+ LiveIntervals *LIS,
bool TrackLaneMasks) {
const TargetSubtargetInfo &ST = MF.getSubtarget();
bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
: ST.useAA();
- AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
+ AAForDep = UseAA ? AA : nullptr;
+
+ BarrierChain = nullptr;
this->TrackLaneMasks = TrackLaneMasks;
MISUnitMap.clear();
@@ -841,19 +865,25 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
if (PDiffs)
PDiffs->init(SUnits.size());
- // We build scheduling units by walking a block's instruction list from bottom
- // to top.
-
- // Remember where a generic side-effecting instruction is as we proceed.
- SUnit *BarrierChain = nullptr, *AliasChain = nullptr;
-
- // Memory references to specific known memory locations are tracked
- // so that they can be given more precise dependencies. We track
- // separately the known memory locations that may alias and those
- // that are known not to alias
- MapVector<ValueType, std::vector<SUnit *> > AliasMemDefs, NonAliasMemDefs;
- MapVector<ValueType, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
- std::set<SUnit*> RejectMemNodes;
+ // We build scheduling units by walking a block's instruction list
+ // from bottom to top.
+
+ // Each MI's memory operands are analyzed into a list of underlying
+ // objects. The SU is then inserted in the SUList(s) mapped from the
+ // Value(s). Each Value thus gets mapped to lists of SUs depending
+ // on it, stores and loads kept separately. Two SUs are trivially
+ // non-aliasing if they both depend on only identified Values and do
+ // not share any common Value.
+ Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+
+ // Certain memory accesses are known to not alias any SU in Stores
+ // or Loads, and therefore have their own 'NonAlias'
+ // domain. E.g. spill / reload instructions never alias LLVM IR
+ // Values. It would be nice to assume that this type of memory
+ // access always has proper memory operand modelling, and is
+ // therefore never unanalyzable, but this is conservatively not
+ // done.
+ Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
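To make the bookkeeping concrete, a small hypothetical example (IR values invented for illustration, not taken from this patch):

// With SU0: store to %p, SU1: load from %q, SU2: store to %p, where %p and
// %q are distinct identified objects, only the two stores end up mapped to
// a common Value, so only they are ever considered for a chain edge; the
// load of %q shares no Value with either store and is left unordered with
// respect to them. A store whose underlying objects cannot be determined is
// instead chained against every SU in all four maps and then recorded under
// UnknownValue.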
// Remove any stale debug info; sometimes BuildSchedGraph is called again
// without emitting the info from the previous call.
@@ -882,283 +912,201 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
MachineInstr *DbgMI = nullptr;
for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
MII != MIE; --MII) {
- MachineInstr *MI = std::prev(MII);
- if (MI && DbgMI) {
- DbgValues.push_back(std::make_pair(DbgMI, MI));
+ MachineInstr &MI = *std::prev(MII);
+ if (DbgMI) {
+ DbgValues.push_back(std::make_pair(DbgMI, &MI));
DbgMI = nullptr;
}
- if (MI->isDebugValue()) {
- DbgMI = MI;
+ if (MI.isDebugValue()) {
+ DbgMI = &MI;
continue;
}
- SUnit *SU = MISUnitMap[MI];
+ SUnit *SU = MISUnitMap[&MI];
assert(SU && "No SUnit mapped to this MI");
if (RPTracker) {
collectVRegUses(SU);
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI);
+ RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
+ }
if (PDiffs != nullptr)
PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);
RPTracker->recedeSkipDebugValues();
- assert(&*RPTracker->getPos() == MI && "RPTracker in sync");
+ assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
RPTracker->recede(RegOpers);
}
assert(
- (CanHandleTerminators || (!MI->isTerminator() && !MI->isPosition())) &&
+ (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
"Cannot schedule terminators or labels!");
// Add register-based dependencies (data, anti, and output).
+ // For some instructions (calls, returns, inline-asm, etc.) there can
+ // be explicit uses and implicit defs, in which case the use will appear
+ // on the operand list before the def. Do two passes over the operand
+ // list to make sure that defs are processed before any uses.
bool HasVRegDef = false;
- for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
- const MachineOperand &MO = MI->getOperand(j);
- if (!MO.isReg()) continue;
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
unsigned Reg = MO.getReg();
- if (Reg == 0) continue;
+ if (Reg == 0)
+ continue;
if (TRI->isPhysicalRegister(Reg))
addPhysRegDeps(SU, j);
else {
- if (MO.isDef()) {
- HasVRegDef = true;
- addVRegDefDeps(SU, j);
- }
- else if (MO.readsReg()) // ignore undef operands
- addVRegUseDeps(SU, j);
+ HasVRegDef = true;
+ addVRegDefDeps(SU, j);
}
}
+ // Now process all uses.
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ // Only look at use operands.
+ // We do not need to check for MO.readsReg() here because subsequent
+ // subregister defs will get output dependence edges and need no
+ // additional use dependencies.
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (TRI->isPhysicalRegister(Reg))
+ addPhysRegDeps(SU, j);
+ else if (MO.readsReg()) // ignore undef operands
+ addVRegUseDeps(SU, j);
+ }
+
// If we haven't seen any uses in this scheduling region, create a
// dependence edge to ExitSU to model the live-out latency. This is required
// for vreg defs with no in-region use, and prefetches with no vreg def.
//
// FIXME: NumDataSuccs would be more precise than NumSuccs here. This
// check currently relies on being called before adding chain deps.
- if (SU->NumSuccs == 0 && SU->Latency > 1
- && (HasVRegDef || MI->mayLoad())) {
+ if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
SDep Dep(SU, SDep::Artificial);
Dep.setLatency(SU->Latency - 1);
ExitSU.addPred(Dep);
}
- // Add chain dependencies.
- // Chain dependencies used to enforce memory order should have
- // latency of 0 (except for true dependency of Store followed by
- // aliased Load... we estimate that with a single cycle of latency
- // assuming the hardware will bypass)
- // Note that isStoreToStackSlot and isLoadFromStackSLot are not usable
- // after stack slots are lowered to actual addresses.
- // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
- // produce more precise dependence information.
- unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
- if (isGlobalMemoryObject(AA, MI)) {
- // Be conservative with these and add dependencies on all memory
- // references, even those that are known to not alias.
- for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
- I->second[i]->addPred(SDep(SU, SDep::Barrier));
- }
- }
- for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
- SDep Dep(SU, SDep::Barrier);
- Dep.setLatency(TrueMemOrderLatency);
- I->second[i]->addPred(Dep);
- }
- }
- // Add SU to the barrier chain.
+ // Add memory dependencies (Note: isStoreToStackSlot and
+ // isLoadFromStackSlot are not usable after stack slots are lowered to
+ // actual addresses).
+
+ // This is a barrier event that acts as a pivotal node in the DAG.
+ if (isGlobalMemoryObject(AA, &MI)) {
+
+ // Become the barrier chain.
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Barrier));
+ BarrierChain->addPredBarrier(SU);
BarrierChain = SU;
- // This is a barrier event that acts as a pivotal node in the DAG,
- // so it is safe to clear list of exposed nodes.
- adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
- TrueMemOrderLatency);
- RejectMemNodes.clear();
- NonAliasMemDefs.clear();
- NonAliasMemUses.clear();
-
- // fall-through
- new_alias_chain:
- // Chain all possibly aliasing memory references through SU.
- if (AliasChain) {
- unsigned ChainLatency = 0;
- if (AliasChain->getInstr()->mayLoad())
- ChainLatency = TrueMemOrderLatency;
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
- RejectMemNodes, ChainLatency);
- }
- AliasChain = SU;
- for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- PendingLoads[k], RejectMemNodes,
- TrueMemOrderLatency);
- for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- I->second[i], RejectMemNodes);
- }
- for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- I->second[i], RejectMemNodes, TrueMemOrderLatency);
- }
- // This call must come after calls to addChainDependency() since it
- // consumes the 'RejectMemNodes' list that addChainDependency() possibly
- // adds to.
- adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
- TrueMemOrderLatency);
- PendingLoads.clear();
- AliasMemDefs.clear();
- AliasMemUses.clear();
- } else if (MI->mayStore()) {
- // Add dependence on barrier chain, if needed.
- // There is no point to check aliasing on barrier event. Even if
- // SU and barrier _could_ be reordered, they should not. In addition,
- // we have lost all RejectMemNodes below barrier.
- if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Barrier));
- UnderlyingObjectsVector Objs;
- getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout());
+ DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
- if (Objs.empty()) {
- // Treat all other stores conservatively.
- goto new_alias_chain;
- }
+ // Add dependencies against everything below it and clear maps.
+ addBarrierChain(Stores);
+ addBarrierChain(Loads);
+ addBarrierChain(NonAliasStores);
+ addBarrierChain(NonAliasLoads);
- bool MayAlias = false;
- for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end();
- K != KE; ++K) {
- ValueType V = K->getPointer();
- bool ThisMayAlias = K->getInt();
- if (ThisMayAlias)
- MayAlias = true;
-
- // A store to a specific PseudoSourceValue. Add precise dependencies.
- // Record the def in MemDefs, first adding a dep if there is
- // an existing def.
- MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
- MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
- ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
- if (I != IE) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- I->second[i], RejectMemNodes, 0, true);
-
- // If we're not using AA, then we only need one store per object.
- if (!AAForDep)
- I->second.clear();
- I->second.push_back(SU);
- } else {
- if (ThisMayAlias) {
- if (!AAForDep)
- AliasMemDefs[V].clear();
- AliasMemDefs[V].push_back(SU);
- } else {
- if (!AAForDep)
- NonAliasMemDefs[V].clear();
- NonAliasMemDefs[V].push_back(SU);
- }
+ continue;
+ }
+
+  // If it's not a store or a non-invariant load, we're done.
+ if (!MI.mayStore() && !(MI.mayLoad() && !MI.isInvariantLoad(AA)))
+ continue;
+
+  // Always add a dependency edge to BarrierChain if present.
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+
+ // Find the underlying objects for MI. The Objs vector is either
+ // empty, or filled with the Values of memory locations which this
+ // SU depends on. An empty vector means the memory location is
+ // unknown, and may alias anything.
+ UnderlyingObjectsVector Objs;
+ getUnderlyingObjectsForInstr(&MI, MFI, Objs, MF.getDataLayout());
+
+ if (MI.mayStore()) {
+ if (Objs.empty()) {
+ // An unknown store depends on all stores and loads.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+ addChainDependencies(SU, Loads);
+ addChainDependencies(SU, NonAliasLoads);
+
+ // Map this store to 'UnknownValue'.
+ Stores.insert(SU, UnknownValue);
+ } else {
+ // Add precise dependencies against all previously seen memory
+ // accesses mapped to the same Value(s).
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add dependencies to previous stores and loads mapped to V.
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+ addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
}
- // Handle the uses in MemUses, if there are any.
- MapVector<ValueType, std::vector<SUnit *> >::iterator J =
- ((ThisMayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V));
- MapVector<ValueType, std::vector<SUnit *> >::iterator JE =
- ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
- if (J != JE) {
- for (unsigned i = 0, e = J->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- J->second[i], RejectMemNodes,
- TrueMemOrderLatency, true);
- J->second.clear();
+ // Update the store map after all chains have been added to avoid adding
+        // a self-loop edge if multiple underlying objects are present.
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Map this store to V.
+ (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
}
+        // The store may have dependencies on unanalyzable loads and
+ // stores.
+ addChainDependencies(SU, Loads, UnknownValue);
+ addChainDependencies(SU, Stores, UnknownValue);
}
- if (MayAlias) {
- // Add dependencies from all the PendingLoads, i.e. loads
- // with no underlying object.
- for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- PendingLoads[k], RejectMemNodes,
- TrueMemOrderLatency);
- // Add dependence on alias chain, if needed.
- if (AliasChain)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
- RejectMemNodes);
- }
- // This call must come after calls to addChainDependency() since it
- // consumes the 'RejectMemNodes' list that addChainDependency() possibly
- // adds to.
- adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes,
- TrueMemOrderLatency);
- } else if (MI->mayLoad()) {
- bool MayAlias = true;
- if (MI->isInvariantLoad(AA)) {
- // Invariant load, no chain dependencies needed!
+ } else { // SU is a load.
+ if (Objs.empty()) {
+ // An unknown load depends on all stores.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+
+ Loads.insert(SU, UnknownValue);
} else {
- UnderlyingObjectsVector Objs;
- getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout());
-
- if (Objs.empty()) {
- // A load with no underlying object. Depend on all
- // potentially aliasing stores.
- for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- I->second[i], RejectMemNodes);
-
- PendingLoads.push_back(SU);
- MayAlias = true;
- } else {
- MayAlias = false;
- }
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add precise dependencies against all previously seen stores
+ // mapping to the same Value(s).
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
- for (UnderlyingObjectsVector::iterator
- J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
- ValueType V = J->getPointer();
- bool ThisMayAlias = J->getInt();
-
- if (ThisMayAlias)
- MayAlias = true;
-
- // A load from a specific PseudoSourceValue. Add precise dependencies.
- MapVector<ValueType, std::vector<SUnit *> >::iterator I =
- ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
- MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
- ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
- if (I != IE)
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU,
- I->second[i], RejectMemNodes, 0, true);
- if (ThisMayAlias)
- AliasMemUses[V].push_back(SU);
- else
- NonAliasMemUses[V].push_back(SU);
+ // Map this load to V.
+ (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
}
- // Add dependencies on alias and barrier chains, if needed.
- if (MayAlias && AliasChain)
- addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain,
- RejectMemNodes);
- if (MayAlias)
- // This call must come after calls to addChainDependency() since it
- // consumes the 'RejectMemNodes' list that addChainDependency()
- // possibly adds to.
- adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU,
- RejectMemNodes, /*Latency=*/0);
- if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Barrier));
+        // The load may have dependencies on unanalyzable stores.
+ addChainDependencies(SU, Stores, UnknownValue);
}
}
+
+ // Reduce maps if they grow huge.
+ if (Stores.size() + Loads.size() >= HugeRegion) {
+ DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+ reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
+ }
+ if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+ DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+ reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
+ }
}
+
if (DbgMI)
FirstDbgValue = DbgMI;
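
The hunk above replaces the old alias-chain bookkeeping with per-Value maps of stores and loads, an unknown-value bucket, and a single BarrierChain. The following standalone sketch (plain C++, not LLVM's Value2SUsMap; NodeId, Value2Nodes, DepGraph and recordStore are invented names for illustration) shows the shape of that scheme: a new store gains chain edges to every recorded access of the same underlying object and to the unknown bucket, then records itself.

#include <cstdio>
#include <map>
#include <vector>

using NodeId = int;
using Value2Nodes = std::map<const void *, std::vector<NodeId>>;
static const void *const UnknownValue = nullptr;

struct DepGraph {
  std::vector<std::pair<NodeId, NodeId>> Edges; // (successor, predecessor)

  void addChainDeps(NodeId SU, const Value2Nodes &Map, const void *V) {
    auto It = Map.find(V);
    if (It == Map.end())
      return;
    for (NodeId Pred : It->second)
      Edges.push_back({SU, Pred});
  }

  void recordStore(NodeId SU, const void *V, Value2Nodes &Stores,
                   Value2Nodes &Loads) {
    // Chain edges to earlier stores and loads of the same underlying object,
    // plus anything whose object could not be identified (UnknownValue).
    addChainDeps(SU, Stores, V);
    addChainDeps(SU, Loads, V);
    addChainDeps(SU, Stores, UnknownValue);
    addChainDeps(SU, Loads, UnknownValue);
    Stores[V].push_back(SU); // finally, record the store itself
  }
};

int main() {
  int A = 0, B = 0; // stand-ins for two distinct underlying objects
  Value2Nodes Stores, Loads;
  DepGraph G;
  Loads[&A].push_back(0);              // SU(0): load of A
  G.recordStore(1, &A, Stores, Loads); // SU(1): store to A -> edge to SU(0)
  G.recordStore(2, &B, Stores, Loads); // SU(2): store to B -> no edge
  for (auto &E : G.Edges)
    std::printf("SU(%d) depends on SU(%d)\n", E.first, E.second);
}
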
@@ -1166,7 +1114,84 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
Uses.clear();
CurrentVRegDefs.clear();
CurrentVRegUses.clear();
- PendingLoads.clear();
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
+ PSV->printCustom(OS);
+ return OS;
+}
+
+void ScheduleDAGInstrs::Value2SUsMap::dump() {
+ for (auto &Itr : *this) {
+ if (Itr.first.is<const Value*>()) {
+ const Value *V = Itr.first.get<const Value*>();
+ if (isa<UndefValue>(V))
+ dbgs() << "Unknown";
+ else
+ V->printAsOperand(dbgs());
+ }
+ else if (Itr.first.is<const PseudoSourceValue*>())
+ dbgs() << Itr.first.get<const PseudoSourceValue*>();
+ else
+ llvm_unreachable("Unknown Value type.");
+
+ dbgs() << " : ";
+ dumpSUList(Itr.second);
+ }
+}
+
+/// Reduce maps in FIFO order, by N SUs. This is better than turning
+/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
+/// it avoids unnecessary edges between seen SUs above the new
+/// BarrierChain, and those below it.
+void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
+ Value2SUsMap &loads, unsigned N) {
+ DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
+ stores.dump();
+ dbgs() << "Loading SUnits:\n";
+ loads.dump());
+
+  // Insert the NodeNums of all SUs into a vector and sort it.
+ std::vector<unsigned> NodeNums;
+ NodeNums.reserve(stores.size() + loads.size());
+ for (auto &I : stores)
+ for (auto *SU : I.second)
+ NodeNums.push_back(SU->NodeNum);
+ for (auto &I : loads)
+ for (auto *SU : I.second)
+ NodeNums.push_back(SU->NodeNum);
+ std::sort(NodeNums.begin(), NodeNums.end());
+
+  // The last N elements in NodeNums will be removed, and the SU with
+  // the lowest NodeNum of them will become the new BarrierChain so
+  // that the not-yet-seen SUs get a dependency on the removed SUs.
+ assert (N <= NodeNums.size());
+ SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
+ if (BarrierChain) {
+ // The aliasing and non-aliasing maps reduce independently of each
+ // other, but share a common BarrierChain. Check if the
+ // newBarrierChain is above the former one. If it is not, it may
+ // introduce a loop to use newBarrierChain, so keep the old one.
+ if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
+ BarrierChain->addPredBarrier(newBarrierChain);
+ BarrierChain = newBarrierChain;
+ DEBUG(dbgs() << "Inserting new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+ }
+ else
+ DEBUG(dbgs() << "Keeping old barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+ }
+ else
+ BarrierChain = newBarrierChain;
+
+ insertBarrierChain(stores);
+ insertBarrierChain(loads);
+
+ DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
+ stores.dump();
+ dbgs() << "Loading SUnits:\n";
+ loads.dump());
}
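
A small numeric sketch of the reduction policy implemented above, using made-up NodeNums and an arbitrary reduction size N: after sorting, the lowest NodeNum among the last N entries is the one that would become the new barrier chain.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // NodeNums of all SUs currently held in the stores + loads maps (made up).
  std::vector<unsigned> NodeNums = {3, 7, 12, 15, 21, 22, 30, 31};
  unsigned N = 3; // reduction size, getReductionSize() in the real code
  std::sort(NodeNums.begin(), NodeNums.end());
  assert(N <= NodeNums.size());
  // The last N SUs are dropped from the maps; the lowest NodeNum among them
  // becomes the new barrier chain, so SUs seen later need only one edge to it.
  unsigned NewBarrier = *(NodeNums.end() - N);
  std::printf("new barrier chain: SU(%u)\n", NewBarrier); // prints SU(22)
}
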
/// \brief Initialize register live-range state for updating kills.
@@ -1190,7 +1215,8 @@ void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) {
/// operands, then we also need to propagate that to any instructions inside
/// the bundle which had the same kill state.
static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,
- bool NewKillState) {
+ bool NewKillState,
+ const TargetRegisterInfo *TRI) {
if (MI->getOpcode() != TargetOpcode::BUNDLE)
return;
@@ -1199,30 +1225,13 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,
// might set it on too many operands. We will clear as many flags as we
// can though.
MachineBasicBlock::instr_iterator Begin = MI->getIterator();
- MachineBasicBlock::instr_iterator End = getBundleEnd(MI);
+ MachineBasicBlock::instr_iterator End = getBundleEnd(*MI);
while (Begin != End) {
- for (MachineOperand &MO : (--End)->operands()) {
- if (!MO.isReg() || MO.isDef() || Reg != MO.getReg())
- continue;
-
- // DEBUG_VALUE nodes do not contribute to code generation and should
- // always be ignored. Failure to do so may result in trying to modify
- // KILL flags on DEBUG_VALUE nodes, which is distressing.
- if (MO.isDebug())
- continue;
-
- // If the register has the internal flag then it could be killing an
- // internal def of the register. In this case, just skip. We only want
- // to toggle the flag on operands visible outside the bundle.
- if (MO.isInternalRead())
- continue;
-
- if (MO.isKill() == NewKillState)
- continue;
- MO.setIsKill(NewKillState);
- if (NewKillState)
- return;
- }
+ if (NewKillState) {
+ if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
+ return;
+ } else
+ (--End)->clearRegisterKills(Reg, TRI);
}
}
@@ -1230,21 +1239,21 @@ bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
// Setting kill flag...
if (!MO.isKill()) {
MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true);
+ toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
return false;
}
// If MO itself is live, clear the kill flag...
if (LiveRegs.test(MO.getReg())) {
MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false);
+ toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
return false;
}
// If any subreg of MO is live, then create an imp-def for that
// subreg and keep MO marked as killed.
MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false);
+ toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
bool AllDead = true;
const unsigned SuperReg = MO.getReg();
MachineInstrBuilder MIB(MF, MI);
@@ -1257,7 +1266,7 @@ bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
if(AllDead) {
MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true);
+ toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
}
return false;
}
@@ -1275,15 +1284,15 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
unsigned Count = MBB->size();
for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin();
I != E; --Count) {
- MachineInstr *MI = --I;
- if (MI->isDebugValue())
+ MachineInstr &MI = *--I;
+ if (MI.isDebugValue())
continue;
// Update liveness. Registers that are defed but not used in this
// instruction are now dead. Mark register and all subregs as they
// are completely defined.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isRegMask())
LiveRegs.clearBitsNotInMask(MO.getRegMask());
if (!MO.isReg()) continue;
@@ -1291,7 +1300,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
if (Reg == 0) continue;
if (!MO.isDef()) continue;
// Ignore two-addr defs.
- if (MI->isRegTiedToUseOperand(i)) continue;
+ if (MI.isRegTiedToUseOperand(i)) continue;
// Repeat for reg and all subregs.
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
@@ -1303,8 +1312,8 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
// register is used multiple times we only set the kill flag on
// the first use. Don't set kill flags on undef operands.
killedRegs.reset();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
unsigned Reg = MO.getReg();
if ((Reg == 0) || MRI.isReserved(Reg)) continue;
@@ -1329,13 +1338,15 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
if (MO.isKill() != kill) {
DEBUG(dbgs() << "Fixing " << MO << " in ");
// Warning: toggleKillFlag may invalidate MO.
- toggleKillFlag(MI, MO);
- DEBUG(MI->dump());
- DEBUG(if (MI->getOpcode() == TargetOpcode::BUNDLE) {
- MachineBasicBlock::instr_iterator Begin = MI->getIterator();
- MachineBasicBlock::instr_iterator End = getBundleEnd(MI);
- while (++Begin != End)
- DEBUG(Begin->dump());
+ toggleKillFlag(&MI, MO);
+ DEBUG(MI.dump());
+ DEBUG({
+ if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+ MachineBasicBlock::instr_iterator Begin = MI.getIterator();
+ MachineBasicBlock::instr_iterator End = getBundleEnd(MI);
+ while (++Begin != End)
+ DEBUG(Begin->dump());
+ }
});
}
@@ -1344,8 +1355,8 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
// Mark any used register (that is not using undef) and subregs as
// now live...
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
unsigned Reg = MO.getReg();
if ((Reg == 0) || MRI.isReserved(Reg)) continue;
diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
index 1150d26e559b..ca2881cb91e0 100644
--- a/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 38833a4165a3..69c487033015 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -23,22 +23,13 @@
using namespace llvm;
-#define DEBUG_TYPE ::llvm::ScoreboardHazardRecognizer::DebugType
+#define DEBUG_TYPE DebugType
-#ifndef NDEBUG
-const char *ScoreboardHazardRecognizer::DebugType = "";
-#endif
-
-ScoreboardHazardRecognizer::
-ScoreboardHazardRecognizer(const InstrItineraryData *II,
- const ScheduleDAG *SchedDAG,
- const char *ParentDebugType) :
- ScheduleHazardRecognizer(), ItinData(II), DAG(SchedDAG), IssueWidth(0),
- IssueCount(0) {
-
-#ifndef NDEBUG
- DebugType = ParentDebugType;
-#endif
+ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
+ const char *ParentDebugType)
+ : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
+ DAG(SchedDAG), IssueWidth(0), IssueCount(0) {
// Determine the maximum depth of any itinerary. This determines the depth of
// the scoreboard. We always make the scoreboard at least 1 cycle deep to
@@ -91,7 +82,7 @@ void ScoreboardHazardRecognizer::Reset() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ScoreboardHazardRecognizer::Scoreboard::dump() const {
+LLVM_DUMP_METHOD void ScoreboardHazardRecognizer::Scoreboard::dump() const {
dbgs() << "Scoreboard:\n";
unsigned last = Depth - 1;
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
index fbedf2c1d17a..b3d26c41acf7 100644
--- a/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -14,15 +14,15 @@ add_llvm_library(LLVMSelectionDAG
ScheduleDAGFast.cpp
ScheduleDAGRRList.cpp
ScheduleDAGSDNodes.cpp
- SelectionDAG.cpp
+ ScheduleDAGVLIW.cpp
SelectionDAGBuilder.cpp
+ SelectionDAG.cpp
SelectionDAGDumper.cpp
SelectionDAGISel.cpp
SelectionDAGPrinter.cpp
+ SelectionDAGTargetInfo.cpp
StatepointLowering.cpp
- ScheduleDAGVLIW.cpp
TargetLowering.cpp
- TargetSelectionDAGInfo.cpp
)
add_dependencies(LLVMSelectionDAG intrinsics_gen)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c741982bc08d..d888676583f3 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -112,7 +113,7 @@ namespace {
///
/// This is used to allow us to reliably add any operands of a DAG node
/// which have not yet been combined to the worklist.
- SmallPtrSet<SDNode *, 64> CombinedNodes;
+ SmallPtrSet<SDNode *, 32> CombinedNodes;
// AA - Used for DAG load/store alias analysis.
AliasAnalysis &AA;
@@ -211,8 +212,8 @@ namespace {
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
- void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
- SDValue Trunc, SDValue ExtLoad, SDLoc DL,
+ void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, SDValue Trunc,
+ SDValue ExtLoad, const SDLoc &DL,
ISD::NodeType ExtType);
/// Call the node-specific routine that knows how to fold each
@@ -258,6 +259,7 @@ namespace {
SDValue visitSRL(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitBSWAP(SDNode *N);
+ SDValue visitBITREVERSE(SDNode *N);
SDValue visitCTLZ(SDNode *N);
SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
SDValue visitCTTZ(SDNode *N);
@@ -273,6 +275,7 @@ namespace {
SDValue visitANY_EXTEND(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
+ SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitTRUNCATE(SDNode *N);
SDValue visitBITCAST(SDNode *N);
SDValue visitBUILD_PAIR(SDNode *N);
@@ -326,18 +329,19 @@ namespace {
SDValue visitFMULForFMACombine(SDNode *N);
SDValue XformToShuffleWithZero(SDNode *N);
- SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
+ SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS,
+ SDValue RHS);
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
- SDValue SimplifySelect(SDLoc DL, SDValue N0, SDValue N1, SDValue N2);
- SDValue SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue N2,
- SDValue N3, ISD::CondCode CC,
+ SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
+ SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
bool NotExtCompare = false);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
- SDLoc DL, bool foldBooleans = true);
+ const SDLoc &DL, bool foldBooleans = true);
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC) const;
@@ -353,19 +357,21 @@ namespace {
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags);
- SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags);
+ SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip);
+ SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
+ SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
- SDLoc DL);
- SDNode *MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL);
+ const SDLoc &DL);
+ SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue TransformFPLoadStorePair(SDNode *N);
@@ -386,10 +392,17 @@ namespace {
/// chain (aliasing node.)
SDValue FindBetterChain(SDNode *N, SDValue Chain);
- /// Do FindBetterChain for a store and any possibly adjacent stores on
- /// consecutive chains.
+ /// Try to replace a store and any possibly adjacent stores on
+ /// consecutive chains with better chains. Return true only if St is
+ /// replaced.
+ ///
+ /// Notice that other chains may still be replaced even if the function
+ /// returns false.
bool findBetterNeighborChains(StoreSDNode *St);
+ /// Match "(X shl/srl V1) & V2" where V2 may not be present.
+ bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);
+
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
@@ -414,8 +427,7 @@ namespace {
/// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
/// constant build_vector of the stored constant values in Stores.
- SDValue getMergedConstantVectorStore(SelectionDAG &DAG,
- SDLoc SL,
+ SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
ArrayRef<MemOpLink> Stores,
SmallVectorImpl<SDValue> &Chains,
EVT Ty) const;
@@ -444,6 +456,12 @@ namespace {
StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
+    /// Helper function for MergeConsecutiveStores. Checks if
+    /// candidate stores have an indirect dependency through their
+    /// operands. \return True if it is safe to merge.
+ bool checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes);
+
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return True if some memory operations were changed.
@@ -747,32 +765,6 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const {
return false;
}
-/// Returns true if N is a BUILD_VECTOR node whose
-/// elements are all the same constant or undefined.
-static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) {
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N);
- if (!C)
- return false;
-
- APInt SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- EVT EltVT = N->getValueType(0).getVectorElementType();
- return (C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
- HasAnyUndefs) &&
- EltVT.getSizeInBits() >= SplatBitSize);
-}
-
-// \brief Returns the SDNode if it is a constant integer BuildVector
-// or constant integer.
-static SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) {
- if (isa<ConstantSDNode>(N))
- return N.getNode();
- if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
- return N.getNode();
- return nullptr;
-}
-
// \brief Returns the SDNode if it is a constant float BuildVector
// or constant float.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
@@ -821,12 +813,12 @@ static ConstantFPSDNode *isConstOrConstSplatFP(SDValue N) {
return nullptr;
}
-SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
- SDValue N0, SDValue N1) {
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1) {
EVT VT = N0.getValueType();
if (N0.getOpcode() == Opc) {
- if (SDNode *L = isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
- if (SDNode *R = isConstantIntBuildVectorOrConstantInt(N1)) {
+ if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+ if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R))
return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
@@ -845,17 +837,17 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
}
if (N1.getOpcode() == Opc) {
- if (SDNode *R = isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
- if (SDNode *L = isConstantIntBuildVectorOrConstantInt(N0)) {
+ if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
+ if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
// reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L))
return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
return SDValue();
}
if (N1.hasOneUse()) {
- // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one
+      // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff (op y, c1) has one
// use
- SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N1.getOperand(0), N0);
+ SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
if (!OpNode.getNode())
return SDValue();
AddToWorklist(OpNode.getNode());
@@ -962,7 +954,8 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
Replace = false;
SDLoc dl(Op);
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+ if (ISD::isUNINDEXEDLoad(Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
EVT MemVT = LD->getMemoryVT();
ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
@@ -1166,6 +1159,9 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
if (!LegalOperations)
return false;
+ if (!ISD::isUNINDEXEDLoad(Op.getNode()))
+ return false;
+
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return false;
@@ -1259,8 +1255,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
// changes of the root.
HandleSDNode Dummy(DAG.getRoot());
- // while the worklist isn't empty, find a node and
- // try and combine it.
+ // While the worklist isn't empty, find a node and try to combine it.
while (!WorklistMap.empty()) {
SDNode *N;
// The Worklist holds the SDNodes in order, but it may contain null entries.
@@ -1326,8 +1321,6 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
DEBUG(dbgs() << " ... into: ";
RV.getNode()->dump(&DAG));
- // Transfer debug value.
- DAG.TransferDbgValues(SDValue(N, 0), RV);
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
@@ -1388,6 +1381,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
case ISD::BSWAP: return visitBSWAP(N);
+ case ISD::BITREVERSE: return visitBITREVERSE(N);
case ISD::CTLZ: return visitCTLZ(N);
case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
case ISD::CTTZ: return visitCTTZ(N);
@@ -1403,6 +1397,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
case ISD::TRUNCATE: return visitTRUNCATE(N);
case ISD::BITCAST: return visitBITCAST(N);
case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
@@ -1628,8 +1623,8 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
-/// If \p N is a ContantSDNode with isOpaque() == false return it casted to a
-/// ContantSDNode pointer else nullptr.
+/// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a
+/// ConstantSDNode pointer else nullptr.
static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
@@ -1653,38 +1648,32 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
}
// fold (add x, undef) -> undef
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return N0;
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
- // fold (add c1, c2) -> c1+c2
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
- if (N0C && N1C)
- return DAG.FoldConstantArithmetic(ISD::ADD, SDLoc(N), VT, N0C, N1C);
- // canonicalize constant to RHS
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, N0);
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
+ // canonicalize constant to RHS
+ if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, N0);
+ // fold (add c1, c2) -> c1+c2
+ return DAG.FoldConstantArithmetic(ISD::ADD, SDLoc(N), VT,
+ N0.getNode(), N1.getNode());
+ }
// fold (add x, 0) -> x
if (isNullConstant(N1))
return N0;
- // fold (add Sym, c) -> Sym+c
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
- if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C &&
- GA->getOpcode() == ISD::GlobalAddress)
- return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
- GA->getOffset() +
- (uint64_t)N1C->getSExtValue());
// fold ((c1-A)+c2) -> (c1+c2)-A
- if (N1C && N0.getOpcode() == ISD::SUB)
- if (ConstantSDNode *N0C = getAsNonOpaqueConstant(N0.getOperand(0))) {
- SDLoc DL(N);
- return DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(N1C->getAPIntValue()+
- N0C->getAPIntValue(), DL, VT),
- N0.getOperand(1));
- }
+ if (ConstantSDNode *N1C = getAsNonOpaqueConstant(N1)) {
+ if (N0.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *N0C = getAsNonOpaqueConstant(N0.getOperand(0))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(N1C->getAPIntValue()+
+ N0C->getAPIntValue(), DL, VT),
+ N0.getOperand(1));
+ }
+ }
// reassociate add
if (SDValue RADD = ReassociateOps(ISD::ADD, SDLoc(N), N0, N1))
return RADD;
@@ -1850,9 +1839,9 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
-static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT,
- SelectionDAG &DAG,
- bool LegalOperations, bool LegalTypes) {
+static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
+ SelectionDAG &DAG, bool LegalOperations,
+ bool LegalTypes) {
if (!VT.isVector())
return DAG.getConstant(0, DL, VT);
if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
@@ -1879,11 +1868,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// FIXME: Refactor this and xor and other similar operations together.
if (N0 == N1)
return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
- // fold (sub c1, c2) -> c1-c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ // fold (sub c1, c2) -> c1-c2
+ return DAG.FoldConstantArithmetic(ISD::SUB, SDLoc(N), VT,
+ N0.getNode(), N1.getNode());
+ }
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
- if (N0C && N1C)
- return DAG.FoldConstantArithmetic(ISD::SUB, SDLoc(N), VT, N0C, N1C);
// fold (sub x, c) -> (add x, -c)
if (N1C) {
SDLoc DL(N);
@@ -1933,9 +1925,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
N0.getOperand(0), N0.getOperand(1).getOperand(0));
// If either operand of a sub is undef, the result is undef
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return N0;
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
// If the relocation model supports it, consider symbol offsets.
@@ -2013,7 +2005,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
EVT VT = N0.getValueType();
// fold (mul x, undef) -> 0
- if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
bool N0IsConst = false;
@@ -2026,8 +2018,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
- N0IsConst = isConstantSplatVector(N0.getNode(), ConstValue0);
- N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1);
+ N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
+ N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
} else {
N0IsConst = isa<ConstantSDNode>(N0);
if (N0IsConst) {
@@ -2047,8 +2039,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
N0.getNode(), N1.getNode());
// canonicalize constant to RHS (vector doesn't have to splat)
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1 == 0)
@@ -2091,23 +2083,21 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
APInt Val;
// (mul (shl X, c1), c2) -> (mul X, c2 << c1)
if (N1IsConst && N0.getOpcode() == ISD::SHL &&
- (isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
- isa<ConstantSDNode>(N0.getOperand(1)))) {
- SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT,
- N1, N0.getOperand(1));
+ (ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+ isa<ConstantSDNode>(N0.getOperand(1)))) {
+ SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
AddToWorklist(C3.getNode());
- return DAG.getNode(ISD::MUL, SDLoc(N), VT,
- N0.getOperand(0), C3);
+ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
}
// Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
// use.
{
- SDValue Sh(nullptr,0), Y(nullptr,0);
+ SDValue Sh(nullptr, 0), Y(nullptr, 0);
// Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
if (N0.getOpcode() == ISD::SHL &&
- (isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
- isa<ConstantSDNode>(N0.getOperand(1))) &&
+ (ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+ isa<ConstantSDNode>(N0.getOperand(1))) &&
N0.getNode()->hasOneUse()) {
Sh = N0; Y = N1;
} else if (N1.getOpcode() == ISD::SHL &&
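
The (mul (shl X, c1), c2) -> (mul X, c2 << c1) fold touched in this hunk relies on shifts and multiplies commuting in wrapping arithmetic. A quick standalone check of that identity on 32-bit values (the constants below are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C2 = 0x9e3779b9u;
  for (uint32_t X : {0u, 1u, 0x1234u, 0xdeadbeefu})
    for (unsigned C1 : {1u, 5u, 13u})
      // Both sides are X * C2 * 2^C1 modulo 2^32.
      assert((X << C1) * C2 == X * (C2 << C1));
}
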
@@ -2117,17 +2107,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
}
if (Sh.getNode()) {
- SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
- Sh.getOperand(0), Y);
- return DAG.getNode(ISD::SHL, SDLoc(N), VT,
- Mul, Sh.getOperand(1));
+ SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
}
}
// fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
- if (isConstantIntBuildVectorOrConstantInt(N1) &&
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
N0.getOpcode() == ISD::ADD &&
- isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
isMulAddWithConstProfitable(N, N0, N1))
return DAG.getNode(ISD::ADD, SDLoc(N), VT,
DAG.getNode(ISD::MUL, SDLoc(N0), VT,
@@ -2146,7 +2134,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
const TargetLowering &TLI) {
RTLIB::Libcall LC;
- switch (Node->getSimpleValueType(0).SimpleTy) {
+ EVT NodeType = Node->getValueType(0);
+ if (!NodeType.isSimple())
+ return false;
+ switch (NodeType.getSimpleVT().SimpleTy) {
default: return false; // No libcall for vector types.
case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2163,14 +2154,18 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
if (Node->use_empty())
return SDValue(); // This is a dead node, leave it alone.
+ unsigned Opcode = Node->getOpcode();
+ bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+
+ // DivMod lib calls can still work on non-legal types if using lib-calls.
EVT VT = Node->getValueType(0);
- if (!TLI.isTypeLegal(VT))
+ if (VT.isVector() || !VT.isInteger())
return SDValue();
- unsigned Opcode = Node->getOpcode();
- bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
+ if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
+ return SDValue();
- unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
// If DIVREM is going to get expanded into a libcall,
// but there is no libcall available, then don't combine.
if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
@@ -2314,10 +2309,10 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
return DivRem;
// undef / X -> 0
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
// X / undef -> undef
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
return SDValue();
@@ -2378,10 +2373,10 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
return DivRem;
// undef / X -> 0
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
// X / undef -> undef
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
return SDValue();
@@ -2419,15 +2414,13 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
}
// fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
if (N1.getOpcode() == ISD::SHL) {
- if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) {
- if (SHC->getAPIntValue().isPowerOf2()) {
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, N1,
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL,
- VT));
- AddToWorklist(Add.getNode());
- return DAG.getNode(ISD::AND, DL, VT, N0, Add);
- }
+ ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0));
+ if (SHC && SHC->getAPIntValue().isPowerOf2()) {
+ APInt NegOne = APInt::getAllOnesValue(VT.getSizeInBits());
+ SDValue Add =
+ DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
}
}
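
The urem fold above rests on the usual power-of-two identity x % d == x & (d - 1), applied with d = pow2 << y. A quick standalone check (pow2 = 8 and the test values are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 12345u, 0xffffffffu})
    for (unsigned Y : {0u, 3u, 7u}) {
      uint32_t D = 8u << Y; // divisor = pow2 << y, still a power of two
      assert(X % D == (X & (D - 1)));
    }
}
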
@@ -2462,10 +2455,10 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
return DivRem.getValue(1);
// undef % X -> 0
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
// X % undef -> undef
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
return SDValue();
@@ -2489,7 +2482,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
getShiftAmountTy(N0.getValueType())));
}
// fold (mulhs x, undef) -> 0
- if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// If the type twice as wide is legal, transform the mulhs to a wider multiply
@@ -2525,7 +2518,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
if (isOneConstant(N1))
return DAG.getConstant(0, DL, N0.getValueType());
// fold (mulhu x, undef) -> 0
- if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// If the type twice as wide is legal, transform the mulhu to a wider multiply
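
The widening strategy mentioned in the comment above (mulhu as the high half of a double-width multiply) can be sanity-checked in isolation; mulhu32 below is a local helper, not an LLVM API:

#include <cassert>
#include <cstdint>

// mulhu: the upper 32 bits of the full 64-bit product of two u32 values.
static uint32_t mulhu32(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

int main() {
  assert(mulhu32(0xffffffffu, 0xffffffffu) == 0xfffffffeu);
  assert(mulhu32(0x80000000u, 2u) == 1u);
  assert(mulhu32(3u, 5u) == 0u);
}
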
@@ -2698,8 +2691,8 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C);
// canonicalize constant to RHS
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
return SDValue();
@@ -2761,7 +2754,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
}
// Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
- // Only perform this optimization after type legalization and before
+ // Only perform this optimization up until type legalization, before
// LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
// adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
// we don't want to undo this promotion.
@@ -2769,7 +2762,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
// on scalars.
if ((N0.getOpcode() == ISD::BITCAST ||
N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
- Level == AfterLegalizeTypes) {
+ Level <= AfterLegalizeTypes) {
SDValue In0 = N0.getOperand(0);
SDValue In1 = N1.getOperand(0);
EVT In0Ty = In0.getValueType();
@@ -2814,7 +2807,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
- if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) {
+ if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
if (!LegalTypes)
ShOp = DAG.getConstant(0, SDLoc(N), VT);
else
@@ -2829,13 +2822,13 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
N0->getOperand(0), N1->getOperand(0));
AddToWorklist(NewNode.getNode());
return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
- &SVN0->getMask()[0]);
+ SVN0->getMask());
}
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
ShOp = N0->getOperand(0);
- if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) {
+ if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
if (!LegalTypes)
ShOp = DAG.getConstant(0, SDLoc(N), VT);
else
@@ -2850,7 +2843,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
N0->getOperand(1), N1->getOperand(1));
AddToWorklist(NewNode.getNode());
return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
- &SVN0->getMask()[0]);
+ SVN0->getMask());
}
}
}
@@ -2867,7 +2860,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
EVT VT = N1.getValueType();
// fold (and x, undef) -> 0
- if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(LocReference), VT);
// fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
SDValue LL, LR, RL, RR, CC0, CC1;
@@ -2965,6 +2958,50 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
}
}
+ // Reduce bit extract of low half of an integer to the narrower type.
+ // (and (srl i64:x, K), KMask) ->
+  //   (i64 zero_extend (and (srl (i32 (trunc i64:x)), K), KMask))
+ if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+ if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
+ if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ unsigned Size = VT.getSizeInBits();
+ const APInt &AndMask = CAnd->getAPIntValue();
+ unsigned ShiftBits = CShift->getZExtValue();
+ unsigned MaskBits = AndMask.countTrailingOnes();
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
+
+ if (APIntOps::isMask(AndMask) &&
+ // Required bits must not span the two halves of the integer and
+ // must fit in the half size type.
+ (ShiftBits + MaskBits <= Size / 2) &&
+ TLI.isNarrowingProfitable(VT, HalfVT) &&
+ TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
+ TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
+ TLI.isTruncateFree(VT, HalfVT) &&
+ TLI.isZExtFree(HalfVT, VT)) {
+          // The isNarrowingProfitable check is to avoid regressions on PPC and
+          // AArch64, which match a few 64-bit bit insert / bit extract patterns
+ // on downstream users of this. Those patterns could probably be
+ // extended to handle extensions mixed in.
+
+ SDValue SL(N0);
+ assert(ShiftBits != 0 && MaskBits <= Size);
+
+ // Extracting the highest bit of the low half.
+ EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
+ N0.getOperand(0));
+
+ SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
+ SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
+ SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
+ SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
+ }
+ }
+ }
+ }
+
return SDValue();
}
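
A quick standalone check of the narrowing identity used by the new bit-extract fold above, under its stated condition that ShiftBits + MaskBits stays within the low half (K, Mask and the test values below are arbitrary examples):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned K = 7;                  // ShiftBits
  const uint64_t Mask = (1ull << 9) - 1; // MaskBits = 9, so K + 9 <= 32
  for (uint64_t X : {0x0123456789abcdefull, 0xffffffffffffffffull, 42ull}) {
    uint64_t Wide = (X >> K) & Mask;
    uint64_t Narrow = (uint64_t)(((uint32_t)X >> K) & (uint32_t)Mask);
    assert(Wide == Narrow); // the extracted field lives entirely in the low half
  }
}
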
@@ -3045,8 +3082,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
// canonicalize constant to RHS
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
// fold (and x, -1) -> x
if (isAllOnesConstant(N1))
@@ -3090,8 +3127,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// the 'X' node here can either be nothing or an extract_vector_elt to catch
// more cases.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- N0.getOperand(0).getOpcode() == ISD::LOAD) ||
- N0.getOpcode() == ISD::LOAD) {
+ N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
+ N0.getOperand(0).getOpcode() == ISD::LOAD &&
+ N0.getOperand(0).getResNo() == 0) ||
+ (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
N0 : N0.getOperand(0) );
@@ -3234,12 +3273,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
AddToWorklist(NewPtr.getNode());
- SDValue Load =
- DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
- LN0->getChain(), NewPtr,
- LN0->getPointerInfo(),
- ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), Alignment, LN0->getAAInfo());
+ SDValue Load = DAG.getExtLoad(
+ ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo(), ExtVT, Alignment,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
AddToWorklist(N);
CombineTo(LN0, Load, Load.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -3303,9 +3340,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
// fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
- SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
- N0.getOperand(1), false);
- if (BSwap.getNode())
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
return BSwap;
}
@@ -3576,7 +3612,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
EVT VT = N1.getValueType();
// fold (or x, undef) -> -1
if (!LegalOperations &&
- (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) {
+ (N0.isUndef() || N1.isUndef())) {
EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
SDLoc(LocReference), VT);
@@ -3697,59 +3733,70 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
N1.getValueType().getScalarType().getSizeInBits()),
SDLoc(N), N1.getValueType());
- // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
- // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
+ // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
// Do this only if the resulting shuffle is legal.
if (isa<ShuffleVectorSDNode>(N0) &&
isa<ShuffleVectorSDNode>(N1) &&
// Avoid folding a node with illegal type.
- TLI.isTypeLegal(VT) &&
- N0->getOperand(1) == N1->getOperand(1) &&
- ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
- bool CanFold = true;
- unsigned NumElts = VT.getVectorNumElements();
- const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
- const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
- // We construct two shuffle masks:
- // - Mask1 is a shuffle mask for a shuffle with N0 as the first operand
- // and N1 as the second operand.
- // - Mask2 is a shuffle mask for a shuffle with N1 as the first operand
- // and N0 as the second operand.
- // We do this because OR is commutable and therefore there might be
- // two ways to fold this node into a shuffle.
- SmallVector<int,4> Mask1;
- SmallVector<int,4> Mask2;
-
- for (unsigned i = 0; i != NumElts && CanFold; ++i) {
- int M0 = SV0->getMaskElt(i);
- int M1 = SV1->getMaskElt(i);
-
- // Both shuffle indexes are undef. Propagate Undef.
- if (M0 < 0 && M1 < 0) {
- Mask1.push_back(M0);
- Mask2.push_back(M0);
- continue;
- }
+ TLI.isTypeLegal(VT)) {
+ bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
+ bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
+ bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
+ bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
+ // Ensure both shuffles have a zero input.
+ if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) {
+ assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
+ assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
+ const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
+ const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
+ bool CanFold = true;
+ int NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> Mask(NumElts);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M0 = SV0->getMaskElt(i);
+ int M1 = SV1->getMaskElt(i);
+
+ // Determine if either index is pointing to a zero vector.
+ bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
+ bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
+
+          // If one element is zero and the other side is undef, keep undef.
+ // This also handles the case that both are undef.
+ if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
+ Mask[i] = -1;
+ continue;
+ }
- if (M0 < 0 || M1 < 0 ||
- (M0 < (int)NumElts && M1 < (int)NumElts) ||
- (M0 >= (int)NumElts && M1 >= (int)NumElts)) {
- CanFold = false;
- break;
+ // Make sure only one of the elements is zero.
+ if (M0Zero == M1Zero) {
+ CanFold = false;
+ break;
+ }
+
+ assert((M0 >= 0 || M1 >= 0) && "Undef index!");
+
+ // We have a zero and non-zero element. If the non-zero came from
+ // SV0 make the index a LHS index. If it came from SV1, make it
+ // a RHS index. We need to mod by NumElts because we don't care
+ // which operand it came from in the original shuffles.
+ Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
}
- Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts);
- Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts);
- }
+ if (CanFold) {
+ SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
+ SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
- if (CanFold) {
- // Fold this sequence only if the resulting shuffle is 'legal'.
- if (TLI.isShuffleMaskLegal(Mask1, VT))
- return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0),
- N1->getOperand(0), &Mask1[0]);
- if (TLI.isShuffleMaskLegal(Mask2, VT))
- return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0),
- N0->getOperand(0), &Mask2[0]);
+ bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
+ if (!LegalMask) {
+ std::swap(NewLHS, NewRHS);
+ ShuffleVectorSDNode::commuteMask(Mask);
+ LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
+ }
+
+ if (LegalMask)
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
+ }
}
}
}
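
A standalone scalar model of the reworked fold (plain arrays, not SelectionDAG nodes; V4, shuffle and the 4-lane masks are made up for illustration): OR-ing two shuffles that each blend against a zero vector is the same as one shuffle that picks, per lane, whichever side supplied the non-zero element.

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;

// Negative (undef) mask indices are modeled as zero lanes here, which is
// enough for this illustration.
static V4 shuffle(const V4 &L, const V4 &R, const std::array<int, 4> &M) {
  V4 Out{};
  for (int i = 0; i != 4; ++i)
    Out[i] = M[i] < 0 ? 0u : (M[i] < 4 ? L[M[i]] : R[M[i] - 4]);
  return Out;
}

int main() {
  V4 A = {1, 2, 3, 4}, B = {5, 6, 7, 8}, Zero = {0, 0, 0, 0};
  V4 N0 = shuffle(A, Zero, {0, 1, 4, 5}); // A in lanes 0,1; zero in lanes 2,3
  V4 N1 = shuffle(B, Zero, {4, 5, 2, 3}); // zero in lanes 0,1; B in lanes 2,3
  V4 Or{};
  for (int i = 0; i != 4; ++i)
    Or[i] = N0[i] | N1[i];
  // One combined shuffle of A (LHS, indices 0..3) and B (RHS, indices 4..7).
  V4 Combined = shuffle(A, B, {0, 1, 6, 7});
  assert(Or == Combined);
}
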
@@ -3760,8 +3807,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
// canonicalize constant to RHS
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
// fold (or x, 0) -> x
if (isNullConstant(N1))
@@ -3817,9 +3864,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
}
/// Match "(X shl/srl V1) & V2" where V2 may not be present.
-static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
+bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
if (Op.getOpcode() == ISD::AND) {
- if (isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
Mask = Op.getOperand(1);
Op = Op.getOperand(0);
} else {
@@ -3946,7 +3993,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
- unsigned NegOpcode, SDLoc DL) {
+ unsigned NegOpcode, const SDLoc &DL) {
// fold (or (shl x, (*ext y)),
// (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
@@ -3967,7 +4014,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr].
-SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// Must be a legal type. Expanded 'n promoted things won't work with rotates.
EVT VT = LHS.getValueType();
if (!TLI.isTypeLegal(VT)) return nullptr;
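
The rotate idiom MatchRotate looks for is the standard (x << y) | (x >> (32 - y)) pattern. A quick standalone check for a couple of shift amounts in (0, 32); rotl32 is a local helper, not an LLVM API:

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned Y) {
  return (X << Y) | (X >> (32 - Y)); // valid for 0 < Y < 32
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 8) == 0x34567812u);
}
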
@@ -4093,12 +4140,12 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
// fold (xor undef, undef) -> 0. This is a common idiom (misuse).
- if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() && N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// fold (xor x, undef) -> undef
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return N0;
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return N1;
// fold (xor c1, c2) -> c1^c2
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
@@ -4106,8 +4153,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (N0C && N1C)
return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
// canonicalize constant to RHS
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
- !isConstantIntBuildVectorOrConstantInt(N1))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
// fold (xor x, 0) -> x
if (isNullConstant(N1))
@@ -4342,8 +4389,8 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
// fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
if (N->getOperand(1).getOpcode() == ISD::TRUNCATE &&
N->getOperand(1).getOperand(0).getOpcode() == ISD::AND) {
- SDValue NewOp1 = distributeTruncateThroughAnd(N->getOperand(1).getNode());
- if (NewOp1.getNode())
+ if (SDValue NewOp1 =
+ distributeTruncateThroughAnd(N->getOperand(1).getNode()))
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
N->getOperand(0), NewOp1);
}
@@ -4398,7 +4445,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N1C && N1C->isNullValue())
return N0;
// fold (shl undef, x) -> 0
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// if (shl x, c) is known to be zero, return 0
if (DAG.MaskedValueIsZero(SDValue(N, 0),
@@ -4407,8 +4454,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
- SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode());
- if (NewOp1.getNode())
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
}
@@ -4541,7 +4587,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
APInt Val;
if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
(isa<ConstantSDNode>(N0.getOperand(1)) ||
- isConstantSplatVector(N0.getOperand(1).getNode(), Val))) {
+ ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val))) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
@@ -4637,7 +4683,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
// Determine the residual right-shift amount.
- signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+ int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
// If the shift is not a no-op (in which case this should be just a sign
// extend already), the truncated to type is legal, sign_extend is legal
@@ -4664,8 +4710,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
// fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
- SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode());
- if (NewOp1.getNode())
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
}
@@ -4916,7 +4961,7 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (bswap c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0);
// fold (bswap (bswap x)) -> x
if (N0.getOpcode() == ISD::BSWAP)
@@ -4924,12 +4969,21 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (bitreverse (bitreverse x)) -> x
+ if (N0.getOpcode() == ISD::BITREVERSE)
+ return N0.getOperand(0);
+ return SDValue();
+}
+
SDValue DAGCombiner::visitCTLZ(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ctlz c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
return SDValue();
}
@@ -4939,7 +4993,7 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (ctlz_zero_undef c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
return SDValue();
}
@@ -4949,7 +5003,7 @@ SDValue DAGCombiner::visitCTTZ(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (cttz c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
return SDValue();
}
@@ -4959,7 +5013,7 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (cttz_zero_undef c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
return SDValue();
}
@@ -4969,15 +5023,15 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (ctpop c1) -> c2
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
return SDValue();
}
/// \brief Generate Min/Max node
-static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS,
- SDValue True, SDValue False,
+static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
ISD::CondCode CC, const TargetLowering &TLI,
SelectionDAG &DAG) {
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
@@ -5237,7 +5291,7 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
// length of the BV and see if all the non-undef nodes are the same.
ConstantSDNode *BottomHalf = nullptr;
for (int i = 0; i < NumElems / 2; ++i) {
- if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (Cond->getOperand(i)->isUndef())
continue;
if (BottomHalf == nullptr)
@@ -5249,7 +5303,7 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
// Do the same for the second half of the BuildVector
ConstantSDNode *TopHalf = nullptr;
for (int i = NumElems / 2; i < NumElems; ++i) {
- if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (Cond->getOperand(i)->isUndef())
continue;
if (TopHalf == nullptr)
@@ -5666,9 +5720,8 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
return N2;
// Determine if the condition we're dealing with is constant
- SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
- N0, N1, CC, SDLoc(N), false);
- if (SCC.getNode()) {
+ if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
+ CC, SDLoc(N), false)) {
AddToWorklist(SCC.getNode());
if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
@@ -5676,7 +5729,7 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
return N2; // cond always true -> true val
else
return N3; // cond always false -> false val
- } else if (SCC->getOpcode() == ISD::UNDEF) {
+ } else if (SCC->isUndef()) {
// When the condition is UNDEF, just return the first operand. This is
// coherent with the DAG creation; no setcc node is created in this case
return N2;
@@ -5729,7 +5782,8 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
EVT VT = N->getValueType(0);
assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
- Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
+ Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
&& "Expected EXTEND dag node in input!");
// fold (sext c1) -> c1
@@ -5756,7 +5810,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
for (unsigned i=0; i != NumElts; ++i) {
SDValue Op = N0->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF) {
+ if (Op->isUndef()) {
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
@@ -5771,7 +5825,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts).getNode();
+ return DAG.getBuildVector(VT, DL, Elts).getNode();
}
// ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
@@ -5839,8 +5893,8 @@ static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
}
void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
- SDValue Trunc, SDValue ExtLoad, SDLoc DL,
- ISD::NodeType ExtType) {
+ SDValue Trunc, SDValue ExtLoad,
+ const SDLoc &DL, ISD::NodeType ExtType) {
// Extend SetCC uses if necessary.
for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
SDNode *SetCC = SetCCs[i];
@@ -5929,9 +5983,8 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue SplitLoad = DAG.getExtLoad(
ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
- LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
- LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
- Align, LN0->getAAInfo());
+ LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, DL, BasePtr.getValueType()));
@@ -6150,11 +6203,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDLoc DL(N);
SDValue NegOne =
DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT);
- SDValue SCC =
- SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1),
- NegOne, DAG.getConstant(0, DL, VT),
- cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
- if (SCC.getNode()) return SCC;
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), NegOne,
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
if (!VT.isVector()) {
EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType());
@@ -6436,56 +6489,48 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
if (N0.getOpcode() == ISD::SETCC) {
+ // Only do this before legalize for now.
if (!LegalOperations && VT.isVector() &&
N0.getValueType().getVectorElementType() == MVT::i1) {
- EVT N0VT = N0.getOperand(0).getValueType();
- if (getSetCCResultType(N0VT) == N0.getValueType())
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (getSetCCResultType(N00VT) == N0.getValueType())
return SDValue();
- // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
- // Only do this before legalize for now.
- EVT EltVT = VT.getVectorElementType();
+      // We know that the # elements of the result is the same as the #
+      // elements of the compare (and the # elements of the compare result for
+      // that matter). Check to see that they are the same size. If so, we know
+      // that the element size of the zext'd result matches the element size of
+      // the compare operands.
SDLoc DL(N);
- SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(),
- DAG.getConstant(1, DL, EltVT));
- if (VT.getSizeInBits() == N0VT.getSizeInBits())
- // We know that the # elements of the results is the same as the
- // # elements of the compare (and the # elements of the compare result
- // for that matter). Check to see that they are the same size. If so,
- // we know that the element size of the sext'd result matches the
- // element size of the compare operands.
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getSetCC(DL, VT, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get()),
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
- OneOps));
+ SDValue VecOnes = DAG.getConstant(1, DL, VT);
+ if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
+        // zext(setcc) -> (and (vsetcc), (1, 1, ...)) for vectors.
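+        // For example, (v4i32 (zext (v4i1 (setcc v4i32:a, v4i32:b, cc)))) is
+        // rewritten to (and (v4i32 (setcc a, b, cc)), (1, 1, 1, 1)).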
+ SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
+ }
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
- // truncate/sign extend
- EVT MatchingElementType =
- EVT::getIntegerVT(*DAG.getContext(),
- N0VT.getScalarType().getSizeInBits());
- EVT MatchingVectorType =
- EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
- N0VT.getVectorNumElements());
+ // truncate/sign extend.
+ EVT MatchingElementType = EVT::getIntegerVT(
+ *DAG.getContext(), N00VT.getScalarType().getSizeInBits());
+ EVT MatchingVectorType = EVT::getVectorVT(
+ *DAG.getContext(), MatchingElementType, N00VT.getVectorNumElements());
SDValue VsetCC =
- DAG.getSetCC(DL, MatchingVectorType, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getSExtOrTrunc(VsetCC, DL, VT),
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT, OneOps));
+ DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
+ VecOnes);
}
// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
- SDValue SCC =
- SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1),
- DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT),
- cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
- if (SCC.getNode()) return SCC;
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
}
// (zext (shl (zext x), cst)) -> (shl (zext x), cst)
@@ -6660,11 +6705,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
- SDValue SCC =
- SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1),
- DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT),
- cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
- if (SCC.getNode())
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
return SCC;
}
@@ -6854,15 +6898,14 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
SDValue Load;
if (ExtType == ISD::NON_EXTLOAD)
- Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
- LN0->getPointerInfo().getWithOffset(PtrOff),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), NewAlign, LN0->getAAInfo());
+ Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
else
- Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr,
- LN0->getPointerInfo().getWithOffset(PtrOff),
- ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), NewAlign, LN0->getAAInfo());
+ Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
+ NewAlign, LN0->getMemOperand()->getFlags(),
+ LN0->getAAInfo());
// Replace the old load's chain with the new load's chain.
WorklistRemover DeadNodes(*this);
@@ -6902,7 +6945,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getUNDEF(VT);
// fold (sext_in_reg c1) -> c1
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
@@ -6988,9 +7031,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
- SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
- N0.getOperand(1), false);
- if (BSwap.getNode())
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
BSwap, N1);
}
@@ -7002,7 +7044,21 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- if (N0.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.isUndef())
return DAG.getUNDEF(VT);
if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
@@ -7021,7 +7077,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (N0.getValueType() == N->getValueType(0))
return N0;
// fold (truncate c1) -> c1
- if (isConstantIntBuildVectorOrConstantInt(N0))
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
// fold (truncate (truncate x)) -> (truncate x)
if (N0.getOpcode() == ISD::TRUNCATE)
@@ -7030,12 +7086,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND) {
+ // if the source is smaller than the dest, we still need an extend.
if (N0.getOperand(0).getValueType().bitsLT(VT))
- // if the source is smaller than the dest, we still need an extend
- return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
- N0.getOperand(0));
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+    // if the source is larger than the dest, then we just need the truncate.
if (N0.getOperand(0).getValueType().bitsGT(VT))
- // if the source is larger than the dest, than we just need the truncate
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
// if the source and dest are the same type, we can drop both the extend
// and the truncate.
@@ -7071,12 +7126,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
- SDValue V = DAG.getNode(ISD::BITCAST, SDLoc(N),
- NVT, N0.getOperand(0));
-
SDLoc DL(N);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
- DL, TrTy, V,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
+ DAG.getBitcast(NVT, N0.getOperand(0)),
DAG.getConstant(Index, DL, IndexTy));
}
}
@@ -7094,6 +7146,25 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
+  // trunc (shl x, K) -> shl (trunc x), K, if K < VT.getSizeInBits() / 2
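+  // For example, (i32 (trunc (shl i64:x, 7))) -> (shl (i32 (trunc i64:x)), 7),
+  // since 7 < 32 / 2.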
+ if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
+ TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
+ if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t Amt = CAmt->getZExtValue();
+ unsigned Size = VT.getSizeInBits();
+
+ if (Amt < Size / 2) {
+ SDLoc SL(N);
+ EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SHL, SL, VT, Trunc,
+ DAG.getConstant(Amt, SL, AmtVT));
+ }
+ }
+ }
+
// Fold a series of buildvector, bitcast, and truncate if possible.
// For example fold
// (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
@@ -7121,7 +7192,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
Opnds.push_back(BuildVect.getOperand(i));
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
}
}
@@ -7131,10 +7202,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// Currently we only perform this optimization on scalars because vectors
// may have different active low bits.
if (!VT.isVector()) {
- SDValue Shorter =
- GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
- VT.getSizeInBits()));
- if (Shorter.getNode())
+ if (SDValue Shorter =
+ GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
+ VT.getSizeInBits())))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
}
// fold (truncate (load x)) -> (smaller load x)
@@ -7168,7 +7238,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
SDValue X = N0.getOperand(i);
- if (X.getOpcode() != ISD::UNDEF) {
+ if (!X.isUndef()) {
V = X;
Idx = i;
NumDefs++;
@@ -7200,6 +7270,24 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
+ // Fold truncate of a bitcast of a vector to an extract of the low vector
+ // element.
+ //
+ // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+ if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+ SDValue VecSrc = N0.getOperand(0);
+ EVT SrcVT = VecSrc.getValueType();
+ if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
+ SDLoc SL(N);
+
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+ VecSrc, DAG.getConstant(0, SL, IdxVT));
+ }
+ }
+
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))
@@ -7226,23 +7314,17 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
EVT LD1VT = LD1->getValueType(0);
-
- if (ISD::isNON_EXTLoad(LD2) &&
- LD2->hasOneUse() &&
- // If both are volatile this would reduce the number of volatile loads.
- // If one is volatile it might be ok, but play conservative and bail out.
- !LD1->isVolatile() &&
- !LD2->isVolatile() &&
- DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
+ unsigned LD1Bytes = LD1VT.getSizeInBits() / 8;
+ if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
+ DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
unsigned Align = LD1->getAlignment();
unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign <= Align &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
- return DAG.getLoad(VT, SDLoc(N), LD1->getChain(),
- LD1->getBasePtr(), LD1->getPointerInfo(),
- false, false, false, Align);
+ return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
+ LD1->getPointerInfo(), Align);
}
return SDValue();
@@ -7254,6 +7336,49 @@ static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
return DAG.getDataLayout().isBigEndian() ? 1 : 0;
}
+static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // If this is not a bitcast to an FP type or if the target doesn't have
+ // IEEE754-compliant FP logic, we're done.
+ EVT VT = N->getValueType(0);
+ if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
+ return SDValue();
+
+ // TODO: Use splat values for the constant-checking below and remove this
+ // restriction.
+ SDValue N0 = N->getOperand(0);
+ EVT SourceVT = N0.getValueType();
+ if (SourceVT.isVector())
+ return SDValue();
+
+ unsigned FPOpcode;
+ APInt SignMask;
+ switch (N0.getOpcode()) {
+ case ISD::AND:
+ FPOpcode = ISD::FABS;
+ SignMask = ~APInt::getSignBit(SourceVT.getSizeInBits());
+ break;
+ case ISD::XOR:
+ FPOpcode = ISD::FNEG;
+ SignMask = APInt::getSignBit(SourceVT.getSizeInBits());
+ break;
+ // TODO: ISD::OR --> ISD::FNABS?
+ default:
+ return SDValue();
+ }
+
+ // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
+ // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
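+  // For example, with f32/i32 this folds
+  //   (f32 (bitcast (and (i32 (bitcast f32:x)), 0x7fffffff))) -> (fabs f32:x)
+  //   (f32 (bitcast (xor (i32 (bitcast f32:x)), 0x80000000))) -> (fneg f32:x)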
+ SDValue LogicOp0 = N0.getOperand(0);
+ ConstantSDNode *LogicOp1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
+ LogicOp0.getOpcode() == ISD::BITCAST &&
+ LogicOp0->getOperand(0).getValueType() == VT)
+ return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0));
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -7284,13 +7409,12 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
(isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
TLI.isOperationLegal(ISD::Constant, VT)))
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
+ return DAG.getBitcast(VT, N0);
}
// (conv (conv x, t1), t2) -> (conv x, t2)
if (N0.getOpcode() == ISD::BITCAST)
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT,
- N0.getOperand(0));
+ return DAG.getBitcast(VT, N0.getOperand(0));
// fold (conv (load x)) -> (load (conv*)x)
// If the resultant load doesn't need a higher alignment than the original!
@@ -7303,21 +7427,24 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- unsigned Align = DAG.getDataLayout().getABITypeAlignment(
- VT.getTypeForEVT(*DAG.getContext()));
unsigned OrigAlign = LN0->getAlignment();
- if (Align <= OrigAlign) {
- SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), OrigAlign,
- LN0->getAAInfo());
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ LN0->getAddressSpace(), OrigAlign, &Fast) &&
+ Fast) {
+ SDValue Load =
+ DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+ LN0->getPointerInfo(), OrigAlign,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
}
}
+ if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
+ return V;
+
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
//
@@ -7334,8 +7461,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
N0.getNode()->hasOneUse() && VT.isInteger() &&
!VT.isVector() && !N0.getValueType().isVector()) {
- SDValue NewConv = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT,
- N0.getOperand(0));
+ SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
AddToWorklist(NewConv.getNode());
SDLoc DL(N);
@@ -7388,8 +7514,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
if (isTypeLegal(IntXVT)) {
- SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0),
- IntXVT, N0.getOperand(1));
+ SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
AddToWorklist(X.getNode());
// If X has a different width than the result/lhs, sext it or truncate it.
@@ -7412,11 +7537,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2);
- SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(0)), VT,
- N0.getOperand(0));
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
AddToWorklist(Cst.getNode());
- SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(1)), VT,
- N0.getOperand(1));
+ SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
AddToWorklist(X.getNode());
SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
AddToWorklist(XorResult.getNode());
@@ -7439,8 +7562,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
X, DAG.getConstant(SignBit, SDLoc(X), VT));
AddToWorklist(X.getNode());
- SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0),
- VT, N0.getOperand(0));
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
AddToWorklist(Cst.getNode());
@@ -7472,7 +7594,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
return SDValue(Op.getOperand(0));
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+ return DAG.getBitcast(VT, Op);
return SDValue();
};
@@ -7529,8 +7651,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
// we can end up with a scalar-to-vector node here.
if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
- DAG.getNode(ISD::BITCAST, SDLoc(BV),
- DstEltVT, BV->getOperand(0)));
+ DAG.getBitcast(DstEltVT, BV->getOperand(0)));
SmallVector<SDValue, 8> Ops;
for (SDValue Op : BV->op_values()) {
@@ -7538,11 +7659,10 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
// are promoted and implicitly truncated. Make that explicit here.
if (Op.getValueType() != SrcEltVT)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
- Ops.push_back(DAG.getNode(ISD::BITCAST, SDLoc(BV),
- DstEltVT, Op));
+ Ops.push_back(DAG.getBitcast(DstEltVT, Op));
AddToWorklist(Ops.back().getNode());
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops);
+ return DAG.getBuildVector(VT, SDLoc(BV), Ops);
}
// Otherwise, we're growing or shrinking the elements. To avoid having to
@@ -7584,7 +7704,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
// Shift the previously computed bits over.
NewBits <<= SrcBitSize;
SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
- if (Op.getOpcode() == ISD::UNDEF) continue;
+ if (Op.isUndef()) continue;
EltIsUndef = false;
NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
@@ -7598,7 +7718,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
}
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+ return DAG.getBuildVector(VT, DL, Ops);
}
// Finally, this must be the case where we are shrinking elements: each input
@@ -7609,7 +7729,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
SmallVector<SDValue, 8> Ops;
for (const SDValue &Op : BV->op_values()) {
- if (Op.getOpcode() == ISD::UNDEF) {
+ if (Op.isUndef()) {
Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
continue;
}
@@ -7628,7 +7748,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
}
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+ return DAG.getBuildVector(VT, DL, Ops);
}
/// Try to perform FMA combining on a given FADD node.
@@ -7654,6 +7774,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (!HasFMAD && !HasFMA)
return SDValue();
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
@@ -7837,6 +7962,10 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (!HasFMAD && !HasFMA)
return SDValue();
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
@@ -8305,7 +8434,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
AddToWorklist(Fused.getNode());
return Fused;
}
-
return SDValue();
}
@@ -8662,7 +8790,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
// Compute the reciprocal 1.0 / c2.
- APFloat N1APF = N1CFP->getValueAPF();
+ const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
@@ -8681,12 +8809,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
}
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
@@ -8694,7 +8822,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
@@ -8715,7 +8843,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SqrtOp.getNode()) {
// We found a FSQRT, so try to make this fold:
// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
- if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
@@ -8772,27 +8900,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
-
- // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
- SDValue RV = BuildRsqrtEstimate(N->getOperand(0), &Flags);
- if (!RV)
- return SDValue();
-
- EVT VT = RV.getValueType();
- SDLoc DL(N);
- RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV, &Flags);
- AddToWorklist(RV.getNode());
-
- // Unfortunately, RV is now NaN if the input was exactly 0.
- // Select out this case and force the answer to 0.
- SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
- EVT CCVT = getSetCCResultType(VT);
- SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, N->getOperand(0), Zero, ISD::SETEQ);
- AddToWorklist(ZeroCmp.getNode());
- AddToWorklist(RV.getNode());
-
- return DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
- ZeroCmp, Zero, RV);
+ return buildSqrtEstimate(N->getOperand(0), &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
@@ -8868,7 +8976,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
EVT OpVT = N0.getValueType();
// fold (sint_to_fp c1) -> c1fp
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
@@ -8922,7 +9030,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
EVT OpVT = N0.getValueType();
// fold (uint_to_fp c1) -> c1fp
- if (isConstantIntBuildVectorOrConstantInt(N0) &&
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
@@ -8993,9 +9101,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
}
if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
- if (SrcVT == VT)
- return Src;
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Src);
+ return DAG.getBitcast(VT, Src);
}
return SDValue();
}
@@ -9040,6 +9146,17 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
if (N0.getOpcode() == ISD::FP_ROUND) {
const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
const bool N0IsTrunc = N0.getNode()->getConstantOperandVal(1) == 1;
+
+ // Skip this folding if it results in an fp_round from f80 to f16.
+ //
+ // f80 to f16 always generates an expensive (and as yet, unimplemented)
+ // libcall to __truncxfhf2 instead of selecting native f16 conversion
+ // instructions from f32 or f64. Moreover, the first (value-preserving)
+ // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
+ // x86.
+ if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
+ return SDValue();
+
// If the first fp_round isn't a value preserving truncation, it might
// introduce a tie in the second fp_round, that wouldn't occur in the
// single-step fp_round we want to fold to.
@@ -9198,7 +9315,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
DAG.getConstant(SignMask, DL0, IntVT));
AddToWorklist(Int.getNode());
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int);
+ return DAG.getBitcast(VT, Int);
}
}
@@ -9303,7 +9420,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
DAG.getConstant(SignMask, DL, IntVT));
AddToWorklist(Int.getNode());
- return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int);
+ return DAG.getBitcast(N->getValueType(0), Int);
}
}
@@ -9607,6 +9724,11 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
return false;
}
+ // Caches for hasPredecessorHelper.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+
// If the offset is a constant, there may be other adds of constants that
// can be folded with this one. We should do this to avoid having to keep
// a copy of the original base pointer.
@@ -9621,7 +9743,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
continue;
- if (Use.getUser()->isPredecessorOf(N))
+ if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
continue;
if (Use.getUser()->getOpcode() != ISD::ADD &&
@@ -9651,14 +9773,10 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// Now check for #3 and #4.
bool RealUse = false;
- // Caches for hasPredecessorHelper
- SmallPtrSet<const SDNode *, 32> Visited;
- SmallVector<const SDNode *, 16> Worklist;
-
for (SDNode *Use : Ptr.getNode()->uses()) {
if (Use == N)
continue;
- if (N->hasPredecessorHelper(Use, Visited, Worklist))
+ if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
return false;
// If Ptr may be folded in addressing mode of other use, then it's
@@ -9720,7 +9838,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
ConstantSDNode *CN =
cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
int X0, X1, Y0, Y1;
- APInt Offset0 = CN->getAPIntValue();
+ const APInt &Offset0 = CN->getAPIntValue();
APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
@@ -9984,13 +10102,10 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
if (Align > LD->getMemOperand()->getBaseAlignment()) {
- SDValue NewLoad =
- DAG.getExtLoad(LD->getExtensionType(), SDLoc(N),
- LD->getValueType(0),
- Chain, Ptr, LD->getPointerInfo(),
- LD->getMemoryVT(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), Align, LD->getAAInfo());
+ SDValue NewLoad = DAG.getExtLoad(
+ LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), LD->getMemoryVT(), Align,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
if (NewLoad.getNode() != N)
return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
}
@@ -10208,7 +10323,7 @@ struct LoadedSlice {
return false;
// Offsets are for indexed load only, we do not handle that.
- if (Origin->getOffset().getOpcode() != ISD::UNDEF)
+ if (!Origin->getOffset().isUndef())
return false;
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
@@ -10291,10 +10406,10 @@ struct LoadedSlice {
EVT SliceType = getLoadedType();
// Create the load for the slice.
- SDValue LastInst = DAG->getLoad(
- SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
- Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
- Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
+ SDValue LastInst =
+ DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+ Origin->getPointerInfo().getWithOffset(Offset),
+ getAlignment(), Origin->getMemOperand()->getFlags());
// If the final type is not the same as the loaded type, this means that
// we have to pad with zero. Create a zero extend for that.
EVT FinalType = Inst->getValueType(0);
@@ -10718,9 +10833,10 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
++OpsNarrowed;
- return DAG.getStore(St->getChain(), SDLoc(St), IVal, Ptr,
- St->getPointerInfo().getWithOffset(StOffset),
- false, false, NewAlign).getNode();
+ return DAG
+ .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
+ St->getPointerInfo().getWithOffset(StOffset), NewAlign)
+ .getNode();
}
@@ -10826,19 +10942,16 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
Ptr.getValueType(), Ptr,
DAG.getConstant(PtrOff, SDLoc(LD),
Ptr.getValueType()));
- SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0),
- LD->getChain(), NewPtr,
- LD->getPointerInfo().getWithOffset(PtrOff),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), NewAlign,
- LD->getAAInfo());
+ SDValue NewLD =
+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
DAG.getConstant(NewImm, SDLoc(Value),
NewVT));
- SDValue NewST = DAG.getStore(Chain, SDLoc(N),
- NewVal, NewPtr,
- ST->getPointerInfo().getWithOffset(PtrOff),
- false, false, NewAlign);
+ SDValue NewST =
+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
AddToWorklist(NewPtr.getNode());
AddToWorklist(NewLD.getNode());
@@ -10887,15 +11000,13 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
- SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value),
- LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(),
- false, false, false, LDAlign);
+ SDValue NewLD =
+ DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LDAlign);
- SDValue NewST = DAG.getStore(NewLD.getValue(1), SDLoc(N),
- NewLD, ST->getBasePtr(),
- ST->getPointerInfo(),
- false, false, STAlign);
+ SDValue NewST =
+ DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
+ ST->getPointerInfo(), STAlign);
AddToWorklist(NewLD.getNode());
AddToWorklist(NewST.getNode());
@@ -10940,9 +11051,23 @@ struct BaseIndexOffset {
}
/// Parses tree in Ptr for base, index, offset addresses.
- static BaseIndexOffset match(SDValue Ptr) {
+ static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG) {
bool IsIndexSignExt = false;
+ // Split up a folded GlobalAddress+Offset into its component parts.
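+    // For example, a pointer formed as (@g + 16) is decomposed into
+    // Base = @g (with offset 0), no Index, and Offset = 16.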
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ptr))
+ if (GA->getOpcode() == ISD::GlobalAddress && GA->getOffset() != 0) {
+ return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(),
+ SDLoc(GA),
+ GA->getValueType(0),
+ /*Offset=*/0,
+ /*isTargetGA=*/false,
+ GA->getTargetFlags()),
+ SDValue(),
+ GA->getOffset(),
+ IsIndexSignExt);
+ }
+
// We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
// instruction, then it could be just the BASE or everything else we don't
// know how to handle. Just use Ptr as BASE and give up.
@@ -11063,7 +11188,7 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
// multiply (CONST * A) after we also do the same transformation
// to the "t2" instruction.
if (OtherOp->getOpcode() == ISD::ADD &&
- isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
OtherOp->getOperand(0).getNode() == MulVar)
return true;
}
@@ -11073,11 +11198,9 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
return false;
}
-SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG,
- SDLoc SL,
- ArrayRef<MemOpLink> Stores,
- SmallVectorImpl<SDValue> &Chains,
- EVT Ty) const {
+SDValue DAGCombiner::getMergedConstantVectorStore(
+ SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
+ SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
SmallVector<SDValue, 8> BuildVector;
for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
@@ -11086,7 +11209,7 @@ SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG,
BuildVector.push_back(St->getValue());
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector);
+ return DAG.getBuildVector(Ty, SL, BuildVector);
}
bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
@@ -11182,29 +11305,36 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(),
- false, false,
FirstInChain->getAlignment());
- // Replace the last store with the new store
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumStores; ++i) {
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- // ReplaceAllUsesWith will replace all uses that existed when it was
- // called, but graph optimizations may cause new ones to appear. For
- // example, the case in pr14333 looks like
- //
- // St's chain -> St -> another store -> X
- //
- // And the only difference from St to the other store is the chain.
- // When we change it's chain to be St's chain they become identical,
- // get CSEed and the net result is that X is now a use of St.
- // Since we know that St is redundant, just iterate.
- while (!St->use_empty())
- DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+ if (UseAA) {
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumStores; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ } else {
+ // Replace the last store with the new store.
+ CombineTo(LatestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (StoreNodes[i].MemNode == LatestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ // ReplaceAllUsesWith will replace all uses that existed when it was
+ // called, but graph optimizations may cause new ones to appear. For
+ // example, the case in pr14333 looks like
+ //
+ // St's chain -> St -> another store -> X
+ //
+      // When we change its chain to be St's chain they become identical,
+ // When we change it's chain to be St's chain they become identical,
+ // get CSEed and the net result is that X is now a use of St.
+ // Since we know that St is redundant, just iterate.
+ while (!St->use_empty())
+ DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+ deleteAndRecombine(St);
+ }
}
return true;
@@ -11215,14 +11345,14 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr());
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
// We must have a base and an offset.
if (!BasePtr.Base.getNode())
return;
// Do not handle stores to undef base pointers.
- if (BasePtr.Base.getOpcode() == ISD::UNDEF)
+ if (BasePtr.Base.isUndef())
return;
// Walk up the chain and look for nodes with offsets from the same
@@ -11253,7 +11383,7 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
if (OtherST->getMemoryVT() != MemVT)
continue;
- BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr());
+ BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
if (Ptr.equalBaseIndex(BasePtr))
StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
@@ -11269,7 +11399,7 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
break;
// Find the base pointer and offset for this memory node.
- BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr());
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
// Check that the base pointer is the same as the original one.
if (!Ptr.equalBaseIndex(BasePtr))
@@ -11280,9 +11410,8 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
break;
// No truncation.
- if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
- if (St->isTruncatingStore())
- break;
+ if (Index->isTruncatingStore())
+ break;
// The stored memory type must be the same.
if (Index->getMemoryVT() != MemVT)
@@ -11326,6 +11455,30 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
}
}
+// We need to check that merging these stores does not cause a loop
+// in the DAG. Any store candidate may depend on another candidate
+// indirectly through its operand (we already consider dependencies
+// through the chain). Check in parallel by searching up from
+// non-chain operands of candidates.
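+// For example, if one candidate's stored value is produced by a load that is
+// chained after another candidate store, merging the two into a single store
+// could create exactly such a cycle.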
+bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes) {
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+  // Search the operands of each store candidate.
+ for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ SDNode *n = StoreNodes[i].MemNode;
+ // Potential loops may happen only through non-chain operands
+ for (unsigned j = 1; j < n->getNumOperands(); ++j)
+ Worklist.push_back(n->getOperand(j).getNode());
+ }
+  // Search through the DAG. We can stop early if we find a store node.
+ for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))
+ return false;
+ }
+ return true;
+}
+
bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
if (OptLevel == CodeGenOpt::None)
return false;
@@ -11379,6 +11532,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
if (StoreNodes.size() < 2)
return false;
+  // Only do the dependence check in the AA case.
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+ if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
+ return false;
+
// Sort the memory operands according to their distance from the
// base pointer. As a secondary criteria: make sure stores coming
// later in the code come first in the list. This is important for
@@ -11557,7 +11716,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
if (Ld->getMemoryVT() != MemVT)
break;
- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr());
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
// If this is not the first ptr that we check.
if (LdBasePtr.Base.getNode()) {
// The base ptr must be the same.
@@ -11690,16 +11849,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
// The merged loads are required to have the same incoming chain, so
// using the first's chain is acceptable.
- SDValue NewLoad = DAG.getLoad(
- JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
- FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign);
+ SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign);
SDValue NewStoreChain =
DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
- SDValue NewStore = DAG.getStore(
- NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), false, false, FirstStoreAlign);
+ SDValue NewStore =
+ DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign);
// Transfer chain users from old loads to the new load.
for (unsigned i = 0; i < NumElem; ++i) {
@@ -11708,16 +11867,22 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
SDValue(NewLoad.getNode(), 1));
}
- // Replace the last store with the new store.
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumElem ; ++i) {
- // Remove all Store nodes.
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
+ if (UseAA) {
+    // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumElem; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ } else {
+ // Replace the last store with the new store.
+ CombineTo(LatestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ // Remove all Store nodes.
+ if (StoreNodes[i].MemNode == LatestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+ deleteAndRecombine(St);
+ }
}
return true;
@@ -11808,21 +11973,17 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
std::swap(Lo, Hi);
unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
- SDValue St0 = DAG.getStore(Chain, DL, Lo,
- Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal,
- ST->getAlignment(), AAInfo);
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ ST->getAlignment(), MMOFlags, AAInfo);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(4, DL, Ptr.getValueType()));
Alignment = MinAlign(Alignment, 4U);
- SDValue St1 = DAG.getStore(Chain, DL, Hi,
- Ptr, ST->getPointerInfo().getWithOffset(4),
- isVolatile, isNonTemporal,
- Alignment, AAInfo);
+ SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(4),
+ Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
St0, St1);
}
@@ -11841,21 +12002,24 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// resultant store does not need a higher alignment than the original.
if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
ST->isUnindexed()) {
- unsigned OrigAlign = ST->getAlignment();
EVT SVT = Value.getOperand(0).getValueType();
- unsigned Align = DAG.getDataLayout().getABITypeAlignment(
- SVT.getTypeForEVT(*DAG.getContext()));
- if (Align <= OrigAlign &&
- ((!LegalOperations && !ST->isVolatile()) ||
- TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
- return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
- Ptr, ST->getPointerInfo(), ST->isVolatile(),
- ST->isNonTemporal(), OrigAlign,
- ST->getAAInfo());
+ if (((!LegalOperations && !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
+ TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
+ unsigned OrigAlign = ST->getAlignment();
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
+ ST->getAddressSpace(), OrigAlign, &Fast) &&
+ Fast) {
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getPointerInfo(), OrigAlign,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ }
+ }
}
// Turn 'store undef, Ptr' -> nothing.
- if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed())
+ if (Value.isUndef() && ST->isUnindexed())
return Chain;
// Try to infer better alignment information than the store already has.
@@ -11863,10 +12027,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
if (Align > ST->getAlignment()) {
SDValue NewStore =
- DAG.getTruncStore(Chain, SDLoc(N), Value,
- Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
- ST->isVolatile(), ST->isNonTemporal(), Align,
- ST->getAAInfo());
+ DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
+ ST->getMemoryVT(), Align,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
if (NewStore.getNode() != N)
return CombineTo(ST, NewStore, true);
}
@@ -11898,6 +12061,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// manipulation. Return the original node to not do anything else.
return SDValue(ST, 0);
}
+ Chain = ST->getChain();
}
// Try transforming N to an indexed store.
@@ -12001,7 +12165,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDLoc dl(N);
// If the inserted element is an UNDEF, just use the input vector.
- if (InVal.getOpcode() == ISD::UNDEF)
+ if (InVal.isUndef())
return InVec;
EVT VT = InVec.getValueType();
@@ -12045,7 +12209,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
Ops.append(InVec.getNode()->op_begin(),
InVec.getNode()->op_end());
- } else if (InVec.getOpcode() == ISD::UNDEF) {
+ } else if (InVec.isUndef()) {
unsigned NElts = VT.getVectorNumElements();
Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
} else {
@@ -12065,11 +12229,13 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
}
// Return the new vector
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
+ assert(!OriginalLoad->isVolatile());
+
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
unsigned Align = OriginalLoad->getAlignment();
@@ -12115,21 +12281,20 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
VecEltVT)
? ISD::ZEXTLOAD
: ISD::EXTLOAD;
- Load = DAG.getExtLoad(
- ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI,
- VecEltVT, OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
- OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo());
+ Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
+ OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
+ Align, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
} else {
- Load = DAG.getLoad(
- VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI,
- OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
- OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo());
+ Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
+ MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
if (ResultVT.bitsLT(VecEltVT))
Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
else
- Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load);
+ Load = DAG.getBitcast(ResultVT, Load);
}
WorklistRemover DeadNodes(*this);
SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
@@ -12183,6 +12348,24 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// converts.
}
+ // extract_vector_elt (v2i32 (bitcast i64:x)), 0 -> i32 (trunc i64:x)
+ if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
+ ConstEltNo->isNullValue() && VT.isInteger()) {
+ SDValue BCSrc = InVec.getOperand(0);
+ if (BCSrc.getValueType().isScalarInteger())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+ }
+
+ // extract_vector_elt (insert_vector_elt vec, val, idx), idx -> val
+ //
+ // This only really matters if the index is non-constant since other combines
+ // on the constant elements already work.
+ if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ EltNo == InVec.getOperand(2)) {
+ SDValue Elt = InVec.getOperand(1);
+ return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
+ }
+
// Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
// We only perform this optimization before the op legalization phase because
// we may introduce new vector instructions which are not backed by TD
@@ -12256,9 +12439,12 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
ISD::isNormalLoad(InVec.getNode()) &&
!N->getOperand(1)->hasPredecessor(InVec.getNode())) {
SDValue Index = N->getOperand(1);
- if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec))
- return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
- OrigLoad);
+ if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
+ if (!OrigLoad->isVolatile()) {
+ return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
+ OrigLoad);
+ }
+ }
}
// Perform only after legalization to ensure build_vector / vector_shuffle
@@ -12358,7 +12544,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = N->getOperand(i);
// Ignore undef inputs.
- if (In.getOpcode() == ISD::UNDEF) continue;
+ if (In.isUndef()) continue;
bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
@@ -12413,9 +12599,9 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
SDValue Cast = N->getOperand(i);
assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
Cast.getOpcode() == ISD::ZERO_EXTEND ||
- Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode");
+ Cast.isUndef()) && "Invalid cast opcode");
SDValue In;
- if (Cast.getOpcode() == ISD::UNDEF)
+ if (Cast.isUndef())
In = DAG.getUNDEF(SourceType);
else
In = Cast->getOperand(0);
@@ -12434,12 +12620,12 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
if (!isTypeLegal(VecVT)) return SDValue();
// Make the new BUILD_VECTOR.
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue BV = DAG.getBuildVector(VecVT, dl, Ops);
// The new BUILD_VECTOR node has the potential to be further optimized.
AddToWorklist(BV.getNode());
// Bitcast to the desired type.
- return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+ return DAG.getBitcast(VT, BV);
}
SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
@@ -12502,12 +12688,12 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = N->getOperand(i);
- if (In.getOpcode() == ISD::UNDEF)
+ if (In.isUndef())
Opnds.push_back(DAG.getUNDEF(SrcVT));
else
Opnds.push_back(In.getOperand(0));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Opnds);
+ SDValue BV = DAG.getBuildVector(NVT, dl, Opnds);
AddToWorklist(BV.getNode());
return DAG.getNode(Opcode, dl, VT, BV);
@@ -12545,7 +12731,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue Op = N->getOperand(i);
// Ignore undef inputs.
- if (Op.getOpcode() == ISD::UNDEF) continue;
+ if (Op.isUndef()) continue;
// See if we can combine this build_vector into a blend with a zero vector.
if (!VecIn2.getNode() && (isNullConstant(Op) || isNullFPConstant(Op))) {
@@ -12681,7 +12867,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
SDValue Ops[2];
Ops[0] = VecIn1;
Ops[1] = VecIn2;
- return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], Mask);
}
return SDValue();
@@ -12735,18 +12921,17 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
for (SDValue &Op : Ops) {
if (Op.getValueType() == SVT)
continue;
- if (Op.getOpcode() == ISD::UNDEF)
+ if (Op.isUndef())
Op = ScalarUndef;
else
- Op = DAG.getNode(ISD::BITCAST, DL, SVT, Op);
+ Op = DAG.getBitcast(SVT, Op);
}
}
}
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
VT.getSizeInBits() / SVT.getSizeInBits());
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops));
+ return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
}
// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
@@ -12768,7 +12953,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
Op = Op.getOperand(0);
// UNDEF nodes convert to UNDEF shuffle mask values.
- if (Op.getOpcode() == ISD::UNDEF) {
+ if (Op.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
continue;
}
@@ -12788,7 +12973,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
ExtVec = ExtVec.getOperand(0);
// UNDEF nodes convert to UNDEF shuffle mask values.
- if (ExtVec.getOpcode() == ISD::UNDEF) {
+ if (ExtVec.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
continue;
}
@@ -12812,11 +12997,11 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
return SDValue();
// At most we can reference 2 inputs in the final shuffle.
- if (SV0.getOpcode() == ISD::UNDEF || SV0 == ExtVec) {
+ if (SV0.isUndef() || SV0 == ExtVec) {
SV0 = ExtVec;
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx);
- } else if (SV1.getOpcode() == ISD::UNDEF || SV1 == ExtVec) {
+ } else if (SV1.isUndef() || SV1 == ExtVec) {
SV1 = ExtVec;
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx + NumElts);
@@ -12844,7 +13029,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// Optimize concat_vectors where all but the first of the vectors are undef.
if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
- return Op.getOpcode() == ISD::UNDEF;
+ return Op.isUndef();
})) {
SDValue In = N->getOperand(0);
assert(In.getValueType().isVector() && "Must concat vectors");
@@ -12874,7 +13059,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
SDLoc dl = SDLoc(N);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar);
- return DAG.getNode(ISD::BITCAST, dl, VT, Res);
+ return DAG.getBitcast(VT, Res);
}
}
@@ -12885,9 +13070,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
auto IsBuildVectorOrUndef = [](const SDValue &Op) {
return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
};
- bool AllBuildVectorsOrUndefs =
- std::all_of(N->op_begin(), N->op_end(), IsBuildVectorOrUndef);
- if (AllBuildVectorsOrUndefs) {
+ if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
SmallVector<SDValue, 8> Opnds;
EVT SVT = VT.getScalarType();
@@ -12926,7 +13109,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
assert(VT.getVectorNumElements() == Opnds.size() &&
"Concat vector type mismatch");
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
}
// Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
@@ -12948,7 +13131,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDValue Op = N->getOperand(i);
- if (Op.getOpcode() == ISD::UNDEF)
+ if (Op.isUndef())
continue;
// Check if this is the identity extract:
@@ -13033,11 +13216,11 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
// otherwise => (extract_subvec V1, ExtIdx)
if (InsIdx->getZExtValue() * SmallVT.getScalarType().getSizeInBits() ==
ExtIdx->getZExtValue() * NVT.getScalarType().getSizeInBits())
- return DAG.getNode(ISD::BITCAST, dl, NVT, V->getOperand(1));
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT,
- DAG.getNode(ISD::BITCAST, dl,
- N->getOperand(0).getValueType(),
- V->getOperand(0)), N->getOperand(1));
+ return DAG.getBitcast(NVT, V->getOperand(1));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)),
+ N->getOperand(1));
}
}
@@ -13148,7 +13331,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
// Special case: shuffle(concat(A,B)) can be more efficiently represented
// as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
// half vector elements.
- if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF &&
+ if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
SVN->getMask().end(), [](int i) { return i == -1; })) {
N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
@@ -13204,7 +13387,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
// Canonicalize shuffle undef, undef -> undef
- if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+ if (N0.isUndef() && N1.isUndef())
return DAG.getUNDEF(VT);
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
@@ -13217,29 +13400,15 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (Idx >= (int)NumElts) Idx -= NumElts;
NewMask.push_back(Idx);
}
- return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
- &NewMask[0]);
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
}
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
- if (N0.getOpcode() == ISD::UNDEF) {
- SmallVector<int, 8> NewMask;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Idx = SVN->getMaskElt(i);
- if (Idx >= 0) {
- if (Idx >= (int)NumElts)
- Idx -= NumElts;
- else
- Idx = -1; // remove reference to lhs
- }
- NewMask.push_back(Idx);
- }
- return DAG.getVectorShuffle(VT, SDLoc(N), N1, DAG.getUNDEF(VT),
- &NewMask[0]);
- }
+ if (N0.isUndef())
+ return DAG.getCommutedVectorShuffle(*SVN);
// Remove references to rhs if it is undef
- if (N1.getOpcode() == ISD::UNDEF) {
+ if (N1.isUndef()) {
bool Changed = false;
SmallVector<int, 8> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
@@ -13251,7 +13420,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
NewMask.push_back(Idx);
}
if (Changed)
- return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, &NewMask[0]);
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
// If it is a splat, check if the argument vector is another splat or a
@@ -13275,7 +13444,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
SDValue Base;
bool AllSame = true;
for (unsigned i = 0; i != NumElts; ++i) {
- if (V->getOperand(i).getOpcode() != ISD::UNDEF) {
+ if (!V->getOperand(i).isUndef()) {
Base = V->getOperand(i);
break;
}
@@ -13296,13 +13465,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// Canonicalize any other splat as a build_vector.
const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
- SDValue NewBV = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
- V->getValueType(0), Ops);
+ SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
// BUILD_VECTOR may not match the type of the shuffle.
if (V->getValueType(0) != VT)
- NewBV = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, NewBV);
+ NewBV = DAG.getBitcast(VT, NewBV);
return NewBV;
}
}
@@ -13315,12 +13483,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
Level < AfterLegalizeVectorOps &&
- (N1.getOpcode() == ISD::UNDEF ||
+ (N1.isUndef() ||
(N1.getOpcode() == ISD::CONCAT_VECTORS &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
- SDValue V = partitionShuffleOfConcats(N, DAG);
-
- if (V.getNode())
+ if (SDValue V = partitionShuffleOfConcats(N, DAG))
return V;
}
@@ -13357,7 +13523,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
Op = TLI.isZExtFree(Op.getValueType(), SVT)
? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT)
: DAG.getSExtOrTrunc(Op, SDLoc(N), SVT);
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops);
+ return DAG.getBuildVector(VT, SDLoc(N), Ops);
}
}
@@ -13365,7 +13531,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// attempt to merge the 2 shuffles and suitably bitcast the inputs/output
// back to their original types.
if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
- N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
+ N1.isUndef() && Level < AfterLegalizeVectorOps &&
TLI.isTypeLegal(VT)) {
// Peek through the bitcast only if there is one user.
@@ -13426,11 +13592,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
if (LegalMask) {
- SV0 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV0);
- SV1 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV1);
- return DAG.getNode(
- ISD::BITCAST, SDLoc(N), VT,
- DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+ SV0 = DAG.getBitcast(ScaleVT, SV0);
+ SV1 = DAG.getBitcast(ScaleVT, SV1);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
}
}
}
@@ -13451,7 +13616,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
SDValue SV0 = N1->getOperand(0);
SDValue SV1 = N1->getOperand(1);
bool HasSameOp0 = N0 == SV0;
- bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF;
+ bool IsSV1Undef = SV1.isUndef();
if (HasSameOp0 || IsSV1Undef || N0 == SV1)
// Commute the operands of this shuffle so that next rule
// will trigger.
@@ -13504,7 +13669,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
// Simple case where 'CurrentVec' is UNDEF.
- if (CurrentVec.getOpcode() == ISD::UNDEF) {
+ if (CurrentVec.isUndef()) {
Mask.push_back(-1);
continue;
}
@@ -13559,7 +13724,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
- return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
+ return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
}
return SDValue();
@@ -13595,26 +13760,30 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
+ if (N0.getValueType() != N1.getValueType())
+ return SDValue();
+
// If the input vector is a concatenation, and the insert replaces
// one of the halves, we can optimize into a single concat_vectors.
- if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
- N0->getNumOperands() == 2 && N2.getOpcode() == ISD::Constant) {
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
+ N2.getOpcode() == ISD::Constant) {
APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();
EVT VT = N->getValueType(0);
// Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
// (concat_vectors Z, Y)
if (InsIdx == 0)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
- N->getOperand(1), N0.getOperand(1));
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
+ N0.getOperand(1));
// Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
// (concat_vectors X, Z)
- if (InsIdx == VT.getVectorNumElements()/2)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
- N0.getOperand(0), N->getOperand(1));
+ if (InsIdx == VT.getVectorNumElements() / 2)
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0),
+ N1);
}
return SDValue();
@@ -13684,7 +13853,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
int EltIdx = i / Split;
int SubIdx = i % Split;
SDValue Elt = RHS.getOperand(EltIdx);
- if (Elt.getOpcode() == ISD::UNDEF) {
+ if (Elt.isUndef()) {
Indices.push_back(-1);
continue;
}
@@ -13724,7 +13893,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
SDValue Zero = DAG.getConstant(0, dl, ClearVT);
return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl,
DAG.getBitcast(ClearVT, LHS),
- Zero, &Indices[0]));
+ Zero, Indices));
};
// Determine maximum split level (byte level masking).
@@ -13763,8 +13932,8 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
// -> (shuffle (VBinOp (A, B)), Undef, Mask).
if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
- LHS.getOperand(1).getOpcode() == ISD::UNDEF &&
- RHS.getOperand(1).getOpcode() == ISD::UNDEF) {
+ LHS.getOperand(1).isUndef() &&
+ RHS.getOperand(1).isUndef()) {
ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
@@ -13776,15 +13945,15 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
N->getFlags());
AddUsersToWorklist(N);
return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
- &SVN0->getMask()[0]);
+ SVN0->getMask());
}
}
return SDValue();
}
-SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0,
- SDValue N1, SDValue N2){
+SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2) {
assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");
SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
@@ -13819,33 +13988,33 @@ SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0,
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
SDValue RHS) {
- // fold (select (setcc x, -0.0, *lt), NaN, (fsqrt x))
- // The select + setcc is redundant, because fsqrt returns NaN for X < -0.
+ // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
+ // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
// We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
SDValue Sqrt = RHS;
ISD::CondCode CC;
SDValue CmpLHS;
- const ConstantFPSDNode *NegZero = nullptr;
+ const ConstantFPSDNode *Zero = nullptr;
if (TheSelect->getOpcode() == ISD::SELECT_CC) {
CC = dyn_cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
CmpLHS = TheSelect->getOperand(0);
- NegZero = isConstOrConstSplatFP(TheSelect->getOperand(1));
+ Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
} else {
// SELECT or VSELECT
SDValue Cmp = TheSelect->getOperand(0);
if (Cmp.getOpcode() == ISD::SETCC) {
CC = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
CmpLHS = Cmp.getOperand(0);
- NegZero = isConstOrConstSplatFP(Cmp.getOperand(1));
+ Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
}
}
- if (NegZero && NegZero->isNegative() && NegZero->isZero() &&
+ if (Zero && Zero->isZero() &&
Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
CC == ISD::SETULT || CC == ISD::SETLT)) {
- // We have: (select (setcc x, -0.0, *lt), NaN, (fsqrt x))
+ // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
CombineTo(TheSelect, Sqrt);
return true;
}
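Not part of the patch: a minimal C++ sketch of the source-level idiom the fold above targets. The guard against negative inputs is redundant because the square root of a negative value already produces NaN, which is why the select/setcc pair can be dropped; the hunk merely relaxes the matched constant from -0.0 to either signed zero.

#include <cmath>

// Hypothetical illustration (not from the patch): the explicit guard is
// redundant -- std::sqrt already returns NaN for negative inputs -- so the
// whole expression reduces to std::sqrt(x).
float guardedSqrt(float x) {
  return x < 0.0f ? NAN : std::sqrt(x);
}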
@@ -13932,24 +14101,22 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
// It is safe to replace the two loads if they have different alignments,
// but the new load must be the minimum (most restrictive) alignment of the
// inputs.
- bool isInvariant = LLD->isInvariant() & RLD->isInvariant();
unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
+ MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
+ if (!RLD->isInvariant())
+ MMOFlags &= ~MachineMemOperand::MOInvariant;
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
- Load = DAG.getLoad(TheSelect->getValueType(0),
- SDLoc(TheSelect),
- // FIXME: Discards pointer and AA info.
- LLD->getChain(), Addr, MachinePointerInfo(),
- LLD->isVolatile(), LLD->isNonTemporal(),
- isInvariant, Alignment);
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
+ LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
+ MMOFlags);
} else {
- Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ?
- RLD->getExtensionType() : LLD->getExtensionType(),
- SDLoc(TheSelect),
- TheSelect->getValueType(0),
- // FIXME: Discards pointer and AA info.
- LLD->getChain(), Addr, MachinePointerInfo(),
- LLD->getMemoryVT(), LLD->isVolatile(),
- LLD->isNonTemporal(), isInvariant, Alignment);
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getExtLoad(
+ LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
+ : LLD->getExtensionType(),
+ SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
+ MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
}
// Users of the select now use the result of the load.
@@ -13967,9 +14134,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
-SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
- SDValue N2, SDValue N3,
- ISD::CondCode CC, bool NotExtCompare) {
+SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare) {
// (x ? y : y) -> y.
if (N2 == N3) return N2;
@@ -14057,7 +14224,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
return DAG.getLoad(
TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, Alignment);
+ Alignment);
}
}
@@ -14116,7 +14283,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
// Shift the tested bit over the sign bit.
- APInt AndMask = ConstAndRHS->getAPIntValue();
+ const APInt &AndMask = ConstAndRHS->getAPIntValue();
SDValue ShlAmt =
DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
getShiftAmountTy(AndLHS.getValueType()));
@@ -14210,13 +14377,48 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
}
}
+ // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
+ // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
+ // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
+ if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue ValueOnZero = N2;
+ SDValue Count = N3;
+ // If the condition is NE instead of E, swap the operands.
+ if (CC == ISD::SETNE)
+ std::swap(ValueOnZero, Count);
+ // Check if the value on zero is a constant equal to the bits in the type.
+ if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
+ if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
+ // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
+ // legal, combine to just cttz.
+ if ((Count.getOpcode() == ISD::CTTZ ||
+ Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
+ return DAG.getNode(ISD::CTTZ, DL, VT, N0);
+ // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
+ // legal, combine to just ctlz.
+ if ((Count.getOpcode() == ISD::CTLZ ||
+ Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
+ return DAG.getNode(ISD::CTLZ, DL, VT, N0);
+ }
+ }
+ }
+
return SDValue();
}
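As a rough illustration of the new select_cc fold above (not part of the patch), this is the C++-level idiom it targets: guarding a count-trailing-zeros builtin against a zero input, where the guarded value equals the bit width, now collapses into a single CTTZ node when that operation is legal. __builtin_ctz is the GCC/Clang builtin; the literal 32 is the sizeof(X)-in-bits value the combine checks for.

#include <cstdint>

// Hypothetical example: the zero guard returns the bit width (32), which is
// exactly the "value on zero" the combine looks for, so the whole select
// becomes one cttz node.
uint32_t countTrailingZeros32(uint32_t X) {
  return X == 0 ? 32u : static_cast<uint32_t>(__builtin_ctz(X));
}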
/// This is a stub for TargetLowering::SimplifySetCC.
-SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
- SDValue N1, ISD::CondCode Cond,
- SDLoc DL, bool foldBooleans) {
+SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &DL,
+ bool foldBooleans) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
@@ -14227,6 +14429,11 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
/// by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+ // When optimising for minimum size, we don't want to expand a div to a mul
+ // and a shift.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
if (!C)
return SDValue();
@@ -14268,6 +14475,11 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
/// number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+ // When optimising for minimum size, we don't want to expand a div to a mul
+ // and a shift.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
if (!C)
return SDValue();
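The two optForMinSize() bail-outs above exist because BuildSDIV and BuildUDIV expand a divide by a constant into a multiply-high plus shifts (the "Hacker's Delight" magic-number technique the doc comments cite), which is faster but larger than a single divide instruction. A hedged, scalar-only sketch of the shape of that expansion for one concrete divisor:

#include <cstdint>

// Illustration only, not the DAGCombiner code path: unsigned 32-bit division
// by 10 via a magic multiply and shift.  0xCCCCCCCD is ceil(2^35 / 10); the
// wide multiply followed by >> 35 reproduces N / 10 for every uint32_t N.
uint32_t divideBy10(uint32_t N) {
  return static_cast<uint32_t>((static_cast<uint64_t>(N) * 0xCCCCCCCDu) >> 35);
}

For example, divideBy10(12345) yields 1234; the size cost comes from materializing the magic constant and the extra multiply/shift instructions.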
@@ -14334,9 +14546,9 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
/// =>
/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
-SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
@@ -14363,6 +14575,13 @@ SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
AddToWorklist(Est.getNode());
}
+
+ // If non-reciprocal square root is requested, multiply the result by Arg.
+ if (!Reciprocal) {
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
return Est;
}
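A scalar restatement of what buildSqrtNROneConst assembles as SDNodes (assumptions: float values, an iteration count chosen by the target). Each Newton step refines an estimate E of 1/sqrt(A) as E = E * (1.5 - (A/2) * E * E); the new Reciprocal flag simply appends a multiply by A so the same routine can also produce sqrt(A).

// Scalar sketch only; the real code emits the analogous arithmetic as
// SDNodes.  A/2 is precomputed once, as the doc comment above notes.
float refineSqrtOneConst(float A, float Est, unsigned Iterations,
                         bool Reciprocal) {
  const float HalfA = A * 0.5f;
  for (unsigned i = 0; i < Iterations; ++i)
    Est = Est * (1.5f - HalfA * Est * Est);
  return Reciprocal ? Est : Est * A;  // sqrt(A) == A * (1 / sqrt(A))
}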
@@ -14371,35 +14590,55 @@ SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
/// =>
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
-SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
- // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est)
- for (unsigned i = 0; i < Iterations; ++i) {
- SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
- AddToWorklist(HalfEst.getNode());
-
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
- AddToWorklist(Est.getNode());
+ // This routine must enter the loop below to work correctly
+ // when (Reciprocal == false).
+ assert(Iterations > 0);
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
- AddToWorklist(Est.getNode());
-
- Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree, Flags);
- AddToWorklist(Est.getNode());
+ // Newton iterations for reciprocal square root:
+ // E = (E * -0.5) * ((A * E) * E + -3.0)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
+ AddToWorklist(AE.getNode());
+
+ SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
+ AddToWorklist(AEE.getNode());
+
+ SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
+ AddToWorklist(RHS.getNode());
+
+ // When calculating a square root, the last iteration builds:
+ // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
+ // (notice a common subexpression)
+ SDValue LHS;
+ if (Reciprocal || (i + 1) < Iterations) {
+ // RSQRT: LHS = (E * -0.5)
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
+ } else {
+ // SQRT: LHS = (A * E) * -0.5
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
+ }
+ AddToWorklist(LHS.getNode());
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst, Flags);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
AddToWorklist(Est.getNode());
}
+
return Est;
}
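The same idea for the two-constant variant rewritten above, as a hedged scalar sketch (assuming Iterations > 0, matching the new assert). Each step computes E = (E * -0.5) * ((A*E)*E - 3.0); when a plain square root is requested, the final step multiplies by (A*E) * -0.5 instead, reusing the A*E subexpression so the result comes out as A * E, i.e. sqrt(A).

// Scalar sketch only, mirroring the SDNode arithmetic built above.
float refineSqrtTwoConst(float A, float Est, unsigned Iterations,
                         bool Reciprocal) {
  for (unsigned i = 0; i < Iterations; ++i) {
    const float AE  = A * Est;                 // common subexpression
    const float RHS = AE * Est - 3.0f;
    const float LHS =
        (Reciprocal || i + 1 < Iterations) ? Est * -0.5f : AE * -0.5f;
    Est = LHS * RHS;
  }
  return Est;
}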
-SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
+/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
+/// Op can be zero.
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,
+ bool Reciprocal) {
if (Level >= AfterLegalizeDAG)
return SDValue();
@@ -14410,9 +14649,9 @@ SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
AddToWorklist(Est.getNode());
if (Iterations) {
- Est = UseOneConstNR ?
- BuildRsqrtNROneConst(Op, Est, Iterations, Flags) :
- BuildRsqrtNRTwoConst(Op, Est, Iterations, Flags);
+ Est = UseOneConstNR
+ ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
}
return Est;
}
@@ -14420,6 +14659,30 @@ SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
return SDValue();
}
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ return buildSqrtEstimateImpl(Op, Flags, true);
+}
+
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ SDValue Est = buildSqrtEstimateImpl(Op, Flags, false);
+ if (!Est)
+ return SDValue();
+
+ // Unfortunately, Est is now NaN if the input was exactly 0.
+ // Select out this case and force the answer to 0.
+ EVT VT = Est.getValueType();
+ SDLoc DL(Op);
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ EVT CCVT = getSetCCResultType(VT);
+ SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETEQ);
+ AddToWorklist(ZeroCmp.getNode());
+
+ Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, ZeroCmp,
+ Zero, Est);
+ AddToWorklist(Est.getNode());
+ return Est;
+}
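Why the extra select is needed (illustration only, not patch code): sqrt(Op) is obtained as Op * rsqrt(Op), and at Op == 0 that product is 0 * inf = NaN, so the estimate is patched with a SETEQ-against-zero compare and a (V)SELECT that forces the result back to 0.0. A scalar sketch of the same fixup:

#include <cmath>

// Scalar illustration: the estimate path produces NaN at exactly zero
// (0 * inf), which the compare-and-select masks back to 0.0f.
float sqrtViaRsqrt(float X) {
  const float Est = X * (1.0f / std::sqrt(X));  // stands in for the estimate
  return X == 0.0f ? 0.0f : Est;
}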
+
/// Return true if base is a frame index, which is known not to alias with
/// anything but itself. Provides base object and offset as results.
static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
@@ -14514,7 +14777,7 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
(Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) &&
(Op0->getMemoryVT().getSizeInBits() >> 3 ==
Op1->getMemoryVT().getSizeInBits() >> 3) &&
- (Op0->getOriginalAlignment() > Op0->getMemoryVT().getSizeInBits()) >> 3) {
+ (Op0->getOriginalAlignment() > (Op0->getMemoryVT().getSizeInBits() >> 3))) {
int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment();
int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment();
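The isAlias change just above fixes an operator-precedence bug: the old code shifted the boolean result of the comparison, so the final clause of the && chain was always zero and the alignment-based disambiguation never fired. A tiny standalone demonstration with hypothetical values:

#include <iostream>

int main() {
  unsigned Align = 16, SizeInBits = 64;
  // Old form: the comparison happens first, and (true or false) >> 3 is 0.
  std::cout << ((Align > SizeInBits) >> 3) << '\n';   // prints 0
  // Fixed form: compare the alignment against the access size in bytes.
  std::cout << (Align > (SizeInBits >> 3)) << '\n';   // prints 1
  return 0;
}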
@@ -14634,63 +14897,6 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
break;
}
}
-
- // We need to be careful here to also search for aliases through the
- // value operand of a store, etc. Consider the following situation:
- // Token1 = ...
- // L1 = load Token1, %52
- // S1 = store Token1, L1, %51
- // L2 = load Token1, %52+8
- // S2 = store Token1, L2, %51+8
- // Token2 = Token(S1, S2)
- // L3 = load Token2, %53
- // S3 = store Token2, L3, %52
- // L4 = load Token2, %53+8
- // S4 = store Token2, L4, %52+8
- // If we search for aliases of S3 (which loads address %52), and we look
- // only through the chain, then we'll miss the trivial dependence on L1
- // (which also loads from %52). We then might change all loads and
- // stores to use Token1 as their chain operand, which could result in
- // copying %53 into %52 before copying %52 into %51 (which should
- // happen first).
- //
- // The problem is, however, that searching for such data dependencies
- // can become expensive, and the cost is not directly related to the
- // chain depth. Instead, we'll rule out such configurations here by
- // insisting that we've visited all chain users (except for users
- // of the original chain, which is not necessary). When doing this,
- // we need to look through nodes we don't care about (otherwise, things
- // like register copies will interfere with trivial cases).
-
- SmallVector<const SDNode *, 16> Worklist;
- for (const SDNode *N : Visited)
- if (N != OriginalChain.getNode())
- Worklist.push_back(N);
-
- while (!Worklist.empty()) {
- const SDNode *M = Worklist.pop_back_val();
-
- // We have already visited M, and want to make sure we've visited any uses
- // of M that we care about. For uses that we've not visited, and don't
- // care about, queue them to the worklist.
-
- for (SDNode::use_iterator UI = M->use_begin(),
- UIE = M->use_end(); UI != UIE; ++UI)
- if (UI.getUse().getValueType() == MVT::Other &&
- Visited.insert(*UI).second) {
- if (isa<MemSDNode>(*UI)) {
- // We've not visited this use, and we care about it (it could have an
- // ordering dependency with the original node).
- Aliases.clear();
- Aliases.push_back(OriginalChain);
- return;
- }
-
- // We've not visited this use, but we don't care about it. Mark it as
- // visited and enqueue it to the worklist.
- Worklist.push_back(*UI);
- }
- }
}
/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
@@ -14713,17 +14919,17 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
}
-bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
+bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr());
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
// We must have a base and an offset.
if (!BasePtr.Base.getNode())
return false;
// Do not handle stores to undef base pointers.
- if (BasePtr.Base.getOpcode() == ISD::UNDEF)
+ if (BasePtr.Base.isUndef())
return false;
SmallVector<StoreSDNode *, 8> ChainedStores;
@@ -14742,7 +14948,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
break;
// Find the base pointer and offset for this memory node.
- BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr());
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
// Check that the base pointer is the same as the original one.
if (!Ptr.equalBaseIndex(BasePtr))
@@ -14756,6 +14962,10 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
while (true) {
if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
// We found a store node. Use it for the next iteration.
+ if (STn->isVolatile() || STn->isIndexed()) {
+ Index = nullptr;
+ break;
+ }
ChainedStores.push_back(STn);
Index = STn;
break;
@@ -14769,7 +14979,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
}
}
- bool MadeChange = false;
+ bool MadeChangeToSt = false;
SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
for (StoreSDNode *ChainedStore : ChainedStores) {
@@ -14777,7 +14987,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
SDValue BetterChain = FindBetterChain(ChainedStore, Chain);
if (Chain != BetterChain) {
- MadeChange = true;
+ if (ChainedStore == St)
+ MadeChangeToSt = true;
BetterChains.push_back(std::make_pair(ChainedStore, BetterChain));
}
}
@@ -14787,7 +14998,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) {
for (auto Replacement : BetterChains)
replaceStoreChain(Replacement.first, Replacement.second);
- return MadeChange;
+ return MadeChangeToSt;
}
/// This is the entry point for the file.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index cfbb20947acc..b10da002fcfe 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -39,7 +39,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
@@ -56,6 +55,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -88,6 +88,8 @@ void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+ IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
+ IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
Alignment = CS->getParamAlignment(AttrIdx);
}
@@ -351,7 +353,8 @@ void FastISel::recomputeInsertPt() {
void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E) {
- assert(I && E && std::distance(I, E) > 0 && "Invalid iterator!");
+ assert(static_cast<MachineInstr *>(I) && static_cast<MachineInstr *>(E) &&
+ std::distance(I, E) > 0 && "Invalid iterator!");
while (I != E) {
MachineInstr *Dead = &*I;
++I;
@@ -372,7 +375,7 @@ FastISel::SavePoint FastISel::enterLocalValueArea() {
void FastISel::leaveLocalValueArea(SavePoint OldInsertPt) {
if (FuncInfo.InsertPt != FuncInfo.MBB->begin())
- LastLocalValue = std::prev(FuncInfo.InsertPt);
+ LastLocalValue = &*std::prev(FuncInfo.InsertPt);
// Restore the previous insert position.
FuncInfo.InsertPt = OldInsertPt.InsertPt;
@@ -492,13 +495,11 @@ bool FastISel::selectGetElementPtr(const User *I) {
uint64_t TotalOffs = 0;
// FIXME: What's a good SWAG number for MaxOffs?
uint64_t MaxOffs = 2048;
- Type *Ty = I->getOperand(0)->getType();
MVT VT = TLI.getPointerTy(DL);
- for (GetElementPtrInst::const_op_iterator OI = I->op_begin() + 1,
- E = I->op_end();
- OI != E; ++OI) {
- const Value *Idx = *OI;
- if (auto *StTy = dyn_cast<StructType>(Ty)) {
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (auto *StTy = dyn_cast<StructType>(*GTI)) {
uint64_t Field = cast<ConstantInt>(Idx)->getZExtValue();
if (Field) {
// N = N + Offset
@@ -511,9 +512,8 @@ bool FastISel::selectGetElementPtr(const User *I) {
TotalOffs = 0;
}
}
- Ty = StTy->getElementType(Field);
} else {
- Ty = cast<SequentialType>(Ty)->getElementType();
+ Type *Ty = GTI.getIndexedType();
// If this is a constant subscript, handle it quickly.
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
@@ -880,9 +880,8 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
unsigned NumArgs) {
ImmutableCallSite CS(CI);
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- Type *RetTy = FTy->getReturnType();
+ FunctionType *FTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
ArgListTy Args;
Args.reserve(NumArgs);
@@ -960,6 +959,10 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
Flags.setInReg();
if (Arg.IsSRet)
Flags.setSRet();
+ if (Arg.IsSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Arg.IsSwiftError)
+ Flags.setSwiftError();
if (Arg.IsByVal)
Flags.setByVal();
if (Arg.IsInAlloca) {
@@ -1010,9 +1013,8 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
bool FastISel::lowerCall(const CallInst *CI) {
ImmutableCallSite CS(CI);
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FuncTy = cast<FunctionType>(PT->getElementType());
- Type *RetTy = FuncTy->getReturnType();
+ FunctionType *FuncTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
ArgListTy Args;
ArgListEntry Entry;
@@ -1322,6 +1324,15 @@ bool FastISel::selectBitCast(const User *I) {
return true;
}
+// Return true if we should copy from swift error to the final vreg as specified
+// by SwiftErrorWorklist.
+static bool shouldCopySwiftErrorsToFinalVRegs(const TargetLowering &TLI,
+ FunctionLoweringInfo &FuncInfo) {
+ if (!TLI.supportSwiftError())
+ return false;
+ return FuncInfo.SwiftErrorWorklist.count(FuncInfo.MBB);
+}
+
// Remove local value instructions starting from the instruction after
// SavedLastLocalValue to the current function insert point.
void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue)
@@ -1345,7 +1356,11 @@ bool FastISel::selectInstruction(const Instruction *I) {
MachineInstr *SavedLastLocalValue = getLastLocalValue();
// Just before the terminator instruction, insert instructions to
// feed PHI nodes in successor blocks.
- if (isa<TerminatorInst>(I))
+ if (isa<TerminatorInst>(I)) {
+ // If we need to materialize any vreg from the worklist, we bail out of
+ // FastISel.
+ if (shouldCopySwiftErrorsToFinalVRegs(TLI, FuncInfo))
+ return false;
if (!handlePHINodesInSuccessorBlocks(I->getParent())) {
// PHI node handling may have generated local value instructions,
// even though it failed to handle all PHI nodes.
@@ -1354,6 +1369,13 @@ bool FastISel::selectInstruction(const Instruction *I) {
removeDeadLocalValueCode(SavedLastLocalValue);
return false;
}
+ }
+
+ // FastISel does not handle any operand bundles except OB_funclet.
+ if (ImmutableCallSite CS = ImmutableCallSite(I))
+ for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i)
+ if (CS.getOperandBundleAt(i).getTagID() != LLVMContext::OB_funclet)
+ return false;
DbgLoc = I->getDebugLoc();
@@ -1413,7 +1435,8 @@ bool FastISel::selectInstruction(const Instruction *I) {
/// Emit an unconditional branch to the given block, unless it is the immediate
/// (fall-through) successor, and update the CFG.
-void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) {
+void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
+ const DebugLoc &DbgLoc) {
if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
// For more accurate line information if this is the only instruction
@@ -2053,7 +2076,7 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
return false;
}
- FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg));
+ FuncInfo.PHINodesToUpdate.push_back(std::make_pair(&*MBBI++, Reg));
DbgLoc = DebugLoc();
}
}
@@ -2138,7 +2161,7 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const {
const Value *Ptr;
Type *ValTy;
unsigned Alignment;
- unsigned Flags;
+ MachineMemOperand::Flags Flags;
bool IsVolatile;
if (const auto *LI = dyn_cast<LoadInst>(I)) {
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 08815ed787dc..e669ffc3d02a 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -88,6 +88,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
RegInfo = &MF->getRegInfo();
MachineModuleInfo &MMI = MF->getMMI();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ unsigned StackAlign = TFI->getStackAlignment();
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
@@ -96,6 +97,31 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF,
Fn->isVarArg(), Outs, Fn->getContext());
+ // If this personality uses funclets, we need to do a bit more work.
+ DenseMap<const AllocaInst *, int *> CatchObjects;
+ EHPersonality Personality = classifyEHPersonality(
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr);
+ if (isFuncletEHPersonality(Personality)) {
+ // Calculate state numbers if we haven't already.
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+ if (Personality == EHPersonality::MSVC_CXX)
+ calculateWinCXXEHStateNumbers(&fn, EHInfo);
+ else if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&fn, EHInfo);
+ else if (Personality == EHPersonality::CoreCLR)
+ calculateClrEHStateNumbers(&fn, EHInfo);
+
+ // Map all BB references in the WinEH data to MBBs.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (const AllocaInst *AI = H.CatchObj.Alloca)
+ CatchObjects.insert({AI, &H.CatchObj.FrameIndex});
+ else
+ H.CatchObj.FrameIndex = INT_MAX;
+ }
+ }
+ }
+
// Initialize the mapping of values to registers. This is only set up for
// instruction values that are used outside of the block that defines
// them.
@@ -108,7 +134,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
unsigned Align =
std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty),
AI->getAlignment());
- unsigned StackAlign = TFI->getStackAlignment();
// Static allocas can be folded into the initial stack frame
// adjustment. For targets that don't realign the stack, don't
@@ -120,9 +145,21 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
TySize *= CUI->getZExtValue(); // Get total allocated size.
if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+ int FrameIndex = INT_MAX;
+ auto Iter = CatchObjects.find(AI);
+ if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) {
+ FrameIndex = MF->getFrameInfo()->CreateFixedObject(
+ TySize, 0, /*Immutable=*/false, /*isAliased=*/true);
+ MF->getFrameInfo()->setObjectAlignment(FrameIndex, Align);
+ } else {
+ FrameIndex =
+ MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
+ }
- StaticAllocaMap[AI] =
- MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
+ StaticAllocaMap[AI] = FrameIndex;
+ // Update the catch handler information.
+ if (Iter != CatchObjects.end())
+ *Iter->second = FrameIndex;
} else {
// FIXME: Overaligned static allocas should be grouped into
// a single dynamic allocation instead of using a separate
@@ -281,31 +318,14 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
LPads.push_back(LPI);
}
- // If this personality uses funclets, we need to do a bit more work.
- if (!Fn->hasPersonalityFn())
- return;
- EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
if (!isFuncletEHPersonality(Personality))
return;
- // Calculate state numbers if we haven't already.
WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
- if (Personality == EHPersonality::MSVC_CXX)
- calculateWinCXXEHStateNumbers(&fn, EHInfo);
- else if (isAsynchronousEHPersonality(Personality))
- calculateSEHStateNumbers(&fn, EHInfo);
- else if (Personality == EHPersonality::CoreCLR)
- calculateClrEHStateNumbers(&fn, EHInfo);
// Map all BB references in the WinEH data to MBBs.
for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
for (WinEHHandlerType &H : TBME.HandlerArray) {
- if (H.CatchObj.Alloca) {
- assert(StaticAllocaMap.count(H.CatchObj.Alloca));
- H.CatchObj.FrameIndex = StaticAllocaMap[H.CatchObj.Alloca];
- } else {
- H.CatchObj.FrameIndex = INT_MAX;
- }
if (H.Handler)
H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()];
}
@@ -336,7 +356,7 @@ void FunctionLoweringInfo::clear() {
ByValArgFrameIndexMap.clear();
RegFixups.clear();
StatepointStackSlots.clear();
- StatepointRelocatedValues.clear();
+ StatepointSpillMaps.clear();
PreferredExtendType.clear();
}
@@ -575,3 +595,21 @@ void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
}
}
}
+
+unsigned FunctionLoweringInfo::findSwiftErrorVReg(const MachineBasicBlock *MBB,
+ const Value* Val) const {
+ // Find the index in SwiftErrorVals.
+ SwiftErrorValues::const_iterator I =
+ std::find(SwiftErrorVals.begin(), SwiftErrorVals.end(), Val);
+ assert(I != SwiftErrorVals.end() && "Can't find value in SwiftErrorVals");
+ return SwiftErrorMap.lookup(MBB)[I - SwiftErrorVals.begin()];
+}
+
+void FunctionLoweringInfo::setSwiftErrorVReg(const MachineBasicBlock *MBB,
+ const Value* Val, unsigned VReg) {
+ // Find the index in SwiftErrorVals.
+ SwiftErrorValues::iterator I =
+ std::find(SwiftErrorVals.begin(), SwiftErrorVals.end(), Val);
+ assert(I != SwiftErrorVals.end() && "Can't find value in SwiftErrorVals");
+ SwiftErrorMap[MBB][I - SwiftErrorVals.begin()] = VReg;
+}
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index a1e2d410ab00..c8af73a3b441 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -319,7 +320,6 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
"Chain and glue operands should occur at end of operand list!");
// Get/emit the operand.
unsigned VReg = getVR(Op, VRBaseMap);
- assert(TargetRegisterInfo::isVirtualRegister(VReg) && "Not a vreg?");
const MCInstrDesc &MCID = MIB->getDesc();
bool isOptDef = IIOpNum < MCID.getNumOperands() &&
@@ -333,6 +333,8 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
const TargetRegisterClass *DstRC = nullptr;
if (IIOpNum < II->getNumOperands())
DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
+ assert((!DstRC || TargetRegisterInfo::isVirtualRegister(VReg)) &&
+ "Expected VReg");
if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
@@ -440,7 +442,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
}
unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
- MVT VT, DebugLoc DL) {
+ MVT VT, const DebugLoc &DL) {
const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
@@ -873,7 +875,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// Run post-isel target hook to adjust this instruction if needed.
if (II.hasPostISelHook())
- TLI->AdjustInstrPostInstrSelection(MIB, Node);
+ TLI->AdjustInstrPostInstrSelection(*MIB, Node);
}
/// EmitSpecialNode - Generate machine code for a target-independent node and
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 3b24d93c74fa..8a8a1bbd18f7 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -83,8 +83,8 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
/// ConstrainForSubReg - Try to constrain VReg to a register class that
/// supports SubIdx sub-registers. Emit a copy if that isn't possible.
/// Return the virtual register to use.
- unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
- MVT VT, DebugLoc DL);
+ unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, MVT VT,
+ const DebugLoc &DL);
/// EmitSubregNode - Generate machine code for subreg nodes.
///
@@ -132,7 +132,7 @@ public:
/// InstrEmitter - Construct an InstrEmitter and set it to start inserting
/// at the given position in the given block.
InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos);
-
+
private:
void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
DenseMap<SDValue, unsigned> &VRBaseMap);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f7836345f720..81634096c1ba 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -11,15 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -93,25 +93,25 @@ private:
/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
/// is necessary to spill the vector being inserted into to memory, perform
/// the insert there, and then read the result back.
- SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val,
- SDValue Idx, SDLoc dl);
- SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
- SDValue Idx, SDLoc dl);
+ SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
+ SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
/// Return a vector shuffle operation which
/// performs the same shuffe in terms of order or result bytes, but on a type
/// whose vector element type is narrower than the original shuffle type.
/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
- SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl,
+ SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl,
SDValue N1, SDValue N2,
ArrayRef<int> Mask) const;
bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
- bool &NeedInvert, SDLoc dl);
+ bool &NeedInvert, const SDLoc &dl);
SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
- unsigned NumOps, bool isSigned, SDLoc dl);
+ unsigned NumOps, bool isSigned, const SDLoc &dl);
std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
SDNode *Node, bool isSigned);
@@ -128,26 +128,28 @@ private:
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
- SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, SDLoc dl);
+ SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
+ const SDLoc &dl);
SDValue ExpandBUILD_VECTOR(SDNode *Node);
SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
SmallVectorImpl<SDValue> &Results);
- void getSignAsIntValue(FloatSignAsInt &State, SDLoc DL, SDValue Value) const;
- SDValue modifySignAsInt(const FloatSignAsInt &State, SDLoc DL,
+ void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL,
+ SDValue Value) const;
+ SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL,
SDValue NewIntValue) const;
SDValue ExpandFCOPYSIGN(SDNode *Node) const;
SDValue ExpandFABS(SDNode *Node) const;
SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
- SDLoc dl);
+ const SDLoc &dl);
SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
- SDLoc dl);
+ const SDLoc &dl);
SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
- SDLoc dl);
+ const SDLoc &dl);
- SDValue ExpandBITREVERSE(SDValue Op, SDLoc dl);
- SDValue ExpandBSWAP(SDValue Op, SDLoc dl);
- SDValue ExpandBitCount(unsigned Opc, SDValue Op, SDLoc dl);
+ SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
+ SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
+ SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);
SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -176,8 +178,6 @@ public:
"Replacing one node with another that produces a different number "
"of values!");
DAG.ReplaceAllUsesWith(Old, New);
- for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
- DAG.TransferDbgValues(SDValue(Old, i), SDValue(New, i));
if (UpdatedNodes)
UpdatedNodes->insert(New);
ReplacedNode(Old);
@@ -187,7 +187,6 @@ public:
dbgs() << " with: "; New->dump(&DAG));
DAG.ReplaceAllUsesWith(Old, New);
- DAG.TransferDbgValues(Old, New);
if (UpdatedNodes)
UpdatedNodes->insert(New.getNode());
ReplacedNode(Old.getNode());
@@ -200,7 +199,6 @@ public:
DEBUG(dbgs() << (i == 0 ? " with: "
: " and: ");
New[i]->dump(&DAG));
- DAG.TransferDbgValues(SDValue(Old, i), New[i]);
if (UpdatedNodes)
UpdatedNodes->insert(New[i].getNode());
}
@@ -213,10 +211,9 @@ public:
/// performs the same shuffe in terms of order or result bytes, but on a type
/// whose vector element type is narrower than the original shuffle type.
/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
-SDValue
-SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl,
- SDValue N1, SDValue N2,
- ArrayRef<int> Mask) const {
+SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
+ EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
+ ArrayRef<int> Mask) const {
unsigned NumMaskElts = VT.getVectorNumElements();
unsigned NumDestElts = NVT.getVectorNumElements();
unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
@@ -224,7 +221,7 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl,
assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
if (NumEltsGrowth == 1)
- return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]);
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask);
SmallVector<int, 8> NewMask;
for (unsigned i = 0; i != NumMaskElts; ++i) {
@@ -238,7 +235,7 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl,
}
assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
- return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, NewMask);
}
/// Expands the ConstantFP node to an integer constant or
@@ -285,13 +282,12 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
SDValue Result = DAG.getExtLoad(
ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT,
- false, false, false, Alignment);
+ Alignment);
return Result;
}
- SDValue Result =
- DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, Alignment);
+ SDValue Result = DAG.getLoad(
+ OrigVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
return Result;
}
@@ -302,301 +298,20 @@ SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(),
TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Result =
- DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, Alignment);
+ SDValue Result = DAG.getLoad(
+ VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
return Result;
}
-/// Expands an unaligned store to 2 half-size stores.
-static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
- const TargetLowering &TLI,
- SelectionDAGLegalize *DAGLegalize) {
- assert(ST->getAddressingMode() == ISD::UNINDEXED &&
- "unaligned indexed stores not implemented!");
- SDValue Chain = ST->getChain();
- SDValue Ptr = ST->getBasePtr();
- SDValue Val = ST->getValue();
- EVT VT = Val.getValueType();
- int Alignment = ST->getAlignment();
- unsigned AS = ST->getAddressSpace();
-
- SDLoc dl(ST);
- if (ST->getMemoryVT().isFloatingPoint() ||
- ST->getMemoryVT().isVector()) {
- EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
- if (TLI.isTypeLegal(intVT)) {
- // Expand to a bitconvert of the value to the integer type of the
- // same size, then a (misaligned) int store.
- // FIXME: Does not handle truncating floating point stores!
- SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
- Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
- ST->isVolatile(), ST->isNonTemporal(), Alignment);
- DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
- return;
- }
- // Do a (aligned) store to a stack slot, then copy from the stack slot
- // to the final destination using (unaligned) integer loads and stores.
- EVT StoredVT = ST->getMemoryVT();
- MVT RegVT =
- TLI.getRegisterType(*DAG.getContext(),
- EVT::getIntegerVT(*DAG.getContext(),
- StoredVT.getSizeInBits()));
- unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
- unsigned RegBytes = RegVT.getSizeInBits() / 8;
- unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
-
- // Make sure the stack slot is also aligned for the register type.
- SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
-
- // Perform the original store, only redirected to the stack slot.
- SDValue Store = DAG.getTruncStore(Chain, dl,
- Val, StackPtr, MachinePointerInfo(),
- StoredVT, false, false, 0);
- SDValue Increment = DAG.getConstant(
- RegBytes, dl, TLI.getPointerTy(DAG.getDataLayout(), AS));
- SmallVector<SDValue, 8> Stores;
- unsigned Offset = 0;
-
- // Do all but one copies using the full register width.
- for (unsigned i = 1; i < NumRegs; i++) {
- // Load one integer register's worth from the stack slot.
- SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
- MachinePointerInfo(),
- false, false, false, 0);
- // Store it to the final location. Remember the store.
- Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
- ST->getPointerInfo().getWithOffset(Offset),
- ST->isVolatile(), ST->isNonTemporal(),
- MinAlign(ST->getAlignment(), Offset)));
- // Increment the pointers.
- Offset += RegBytes;
- StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- Increment);
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- }
-
- // The last store may be partial. Do a truncating store. On big-endian
- // machines this requires an extending load from the stack slot to ensure
- // that the bits are in the right place.
- EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
- 8 * (StoredBytes - Offset));
-
- // Load from the stack slot.
- SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
- MachinePointerInfo(),
- MemVT, false, false, false, 0);
-
- Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
- ST->getPointerInfo()
- .getWithOffset(Offset),
- MemVT, ST->isVolatile(),
- ST->isNonTemporal(),
- MinAlign(ST->getAlignment(), Offset),
- ST->getAAInfo()));
- // The order of the stores doesn't matter - say it with a TokenFactor.
- SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
- DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
- return;
- }
- assert(ST->getMemoryVT().isInteger() &&
- !ST->getMemoryVT().isVector() &&
- "Unaligned store of unknown type.");
- // Get the half-size VT
- EVT NewStoredVT = ST->getMemoryVT().getHalfSizedIntegerVT(*DAG.getContext());
- int NumBits = NewStoredVT.getSizeInBits();
- int IncrementSize = NumBits / 8;
-
- // Divide the stored value in two parts.
- SDValue ShiftAmount =
- DAG.getConstant(NumBits, dl, TLI.getShiftAmountTy(Val.getValueType(),
- DAG.getDataLayout()));
- SDValue Lo = Val;
- SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
-
- // Store the two parts
- SDValue Store1, Store2;
- Store1 = DAG.getTruncStore(Chain, dl,
- DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
- Ptr, ST->getPointerInfo(), NewStoredVT,
- ST->isVolatile(), ST->isNonTemporal(), Alignment);
-
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, dl,
- TLI.getPointerTy(DAG.getDataLayout(), AS)));
- Alignment = MinAlign(Alignment, IncrementSize);
- Store2 = DAG.getTruncStore(
- Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
- ST->isVolatile(), ST->isNonTemporal(), Alignment, ST->getAAInfo());
-
- SDValue Result =
- DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
- DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
-}
-
-/// Expands an unaligned load to 2 half-size loads.
-static void
-ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
- const TargetLowering &TLI,
- SDValue &ValResult, SDValue &ChainResult) {
- assert(LD->getAddressingMode() == ISD::UNINDEXED &&
- "unaligned indexed loads not implemented!");
- SDValue Chain = LD->getChain();
- SDValue Ptr = LD->getBasePtr();
- EVT VT = LD->getValueType(0);
- EVT LoadedVT = LD->getMemoryVT();
- SDLoc dl(LD);
- if (VT.isFloatingPoint() || VT.isVector()) {
- EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
- if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
- // Expand to a (misaligned) integer load of the same size,
- // then bitconvert to floating point or vector.
- SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
- LD->getMemOperand());
- SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
- if (LoadedVT != VT)
- Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
- ISD::ANY_EXTEND, dl, VT, Result);
-
- ValResult = Result;
- ChainResult = newLoad.getValue(1);
- return;
- }
-
- // Copy the value to a (aligned) stack slot using (unaligned) integer
- // loads and stores, then do a (aligned) load from the stack slot.
- MVT RegVT = TLI.getRegisterType(*DAG.getContext(), intVT);
- unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
- unsigned RegBytes = RegVT.getSizeInBits() / 8;
- unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
-
- // Make sure the stack slot is also aligned for the register type.
- SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
-
- SDValue Increment =
- DAG.getConstant(RegBytes, dl, TLI.getPointerTy(DAG.getDataLayout()));
- SmallVector<SDValue, 8> Stores;
- SDValue StackPtr = StackBase;
- unsigned Offset = 0;
-
- // Do all but one copies using the full register width.
- for (unsigned i = 1; i < NumRegs; i++) {
- // Load one integer register's worth from the original location.
- SDValue Load = DAG.getLoad(RegVT, dl, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(Offset),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(),
- MinAlign(LD->getAlignment(), Offset),
- LD->getAAInfo());
- // Follow the load with a store to the stack slot. Remember the store.
- Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
- MachinePointerInfo(), false, false, 0));
- // Increment the pointers.
- Offset += RegBytes;
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- Increment);
- }
-
- // The last copy may be partial. Do an extending load.
- EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
- 8 * (LoadedBytes - Offset));
- SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(Offset),
- MemVT, LD->isVolatile(),
- LD->isNonTemporal(),
- LD->isInvariant(),
- MinAlign(LD->getAlignment(), Offset),
- LD->getAAInfo());
- // Follow the load with a store to the stack slot. Remember the store.
- // On big-endian machines this requires a truncating store to ensure
- // that the bits end up in the right place.
- Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
- MachinePointerInfo(), MemVT,
- false, false, 0));
-
- // The order of the stores doesn't matter - say it with a TokenFactor.
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
-
- // Finally, perform the original load only redirected to the stack slot.
- Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
- MachinePointerInfo(), LoadedVT, false,false, false,
- 0);
-
- // Callers expect a MERGE_VALUES node.
- ValResult = Load;
- ChainResult = TF;
- return;
- }
- assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
- "Unaligned load of unsupported type.");
-
- // Compute the new VT that is half the size of the old one. This is an
- // integer MVT.
- unsigned NumBits = LoadedVT.getSizeInBits();
- EVT NewLoadedVT;
- NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
- NumBits >>= 1;
-
- unsigned Alignment = LD->getAlignment();
- unsigned IncrementSize = NumBits / 8;
- ISD::LoadExtType HiExtType = LD->getExtensionType();
-
- // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
- if (HiExtType == ISD::NON_EXTLOAD)
- HiExtType = ISD::ZEXTLOAD;
-
- // Load the value in two parts
- SDValue Lo, Hi;
- if (DAG.getDataLayout().isLittleEndian()) {
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(), Alignment,
- LD->getAAInfo());
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
- Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(),LD->isInvariant(),
- MinAlign(Alignment, IncrementSize), LD->getAAInfo());
- } else {
- Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
- NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(), Alignment,
- LD->getAAInfo());
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- MinAlign(Alignment, IncrementSize), LD->getAAInfo());
- }
-
- // aggregate the two parts
- SDValue ShiftAmount =
- DAG.getConstant(NumBits, dl, TLI.getShiftAmountTy(Hi.getValueType(),
- DAG.getDataLayout()));
- SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
- Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
-
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- ValResult = Result;
- ChainResult = TF;
-}
-
/// Some target cannot handle a variable insertion index for the
/// INSERT_VECTOR_ELT instruction. In this case, it
/// is necessary to spill the vector being inserted into to memory, perform
/// the insert there, and then read the result back.
-SDValue SelectionDAGLegalize::
-PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
- SDLoc dl) {
+SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
+ SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
SDValue Tmp1 = Vec;
SDValue Tmp2 = Val;
SDValue Tmp3 = Idx;
@@ -618,8 +333,7 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
// Store the vector.
SDValue Ch = DAG.getStore(
DAG.getEntryNode(), dl, Tmp1, StackPtr,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false,
- false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
// Truncate or zero extend offset to target pointer type.
Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT);
@@ -629,17 +343,15 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
DAG.getConstant(EltSize, dl, IdxVT));
SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr);
// Store the scalar value.
- Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT,
- false, false, 0);
+ Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT);
// Load the updated vector.
return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), SPFI),
- false, false, false, 0);
+ DAG.getMachineFunction(), SPFI));
}
-
-SDValue SelectionDAGLegalize::
-ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, SDLoc dl) {
+SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
// SCALAR_TO_VECTOR requires that the type of the value being inserted
// match the element type of the vector being created, except for
@@ -658,8 +370,7 @@ ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, SDLoc dl) {
for (unsigned i = 0; i != NumElts; ++i)
ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
- return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec,
- &ShufOps[0]);
+ return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps);
}
}
return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
@@ -676,8 +387,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDLoc dl(ST);
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
@@ -686,8 +396,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().
bitcastToAPInt().zextOrTrunc(32),
SDLoc(CFP), MVT::i32);
- return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), Alignment,
+ MMOFlags, AAInfo);
}
if (CFP->getValueType(0) == MVT::f64) {
@@ -696,7 +406,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
zextOrTrunc(64), SDLoc(CFP), MVT::i64);
return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ Alignment, MMOFlags, AAInfo);
}
if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -709,14 +419,13 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
- Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
- isNonTemporal, Alignment, AAInfo);
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), Alignment,
+ MMOFlags, AAInfo);
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(4, dl, Ptr.getValueType()));
Hi = DAG.getStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(4),
- isVolatile, isNonTemporal, MinAlign(Alignment, 4U),
- AAInfo);
+ MinAlign(Alignment, 4U), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -732,8 +441,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
SDLoc dl(Node);
unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
if (!ST->isTruncatingStore()) {
@@ -754,8 +462,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
unsigned AS = ST->getAddressSpace();
unsigned Align = ST->getAlignment();
const DataLayout &DL = DAG.getDataLayout();
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align))
- ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ }
break;
}
case TargetLowering::Custom: {
@@ -770,9 +480,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
"Can only promote stores to same size type");
Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
SDValue Result =
- DAG.getStore(Chain, dl, Value, Ptr,
- ST->getPointerInfo(), isVolatile,
- isNonTemporal, Alignment, AAInfo);
+ DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -794,8 +503,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
StVT.getStoreSizeInBits());
Value = DAG.getZeroExtendInReg(Value, dl, StVT);
SDValue Result =
- DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, Alignment, AAInfo);
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
+ Alignment, MMOFlags, AAInfo);
ReplaceNode(SDValue(Node, 0), Result);
} else if (StWidth & (StWidth - 1)) {
// If not storing a power-of-2 number of bits, expand as two stores.
@@ -815,9 +524,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
// Store the bottom RoundWidth bits.
Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- RoundVT,
- isVolatile, isNonTemporal, Alignment,
- AAInfo);
+ RoundVT, Alignment, MMOFlags, AAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
@@ -828,10 +535,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
ISD::SRL, dl, Value.getValueType(), Value,
DAG.getConstant(RoundWidth, dl,
TLI.getShiftAmountTy(Value.getValueType(), DL)));
- Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ Hi = DAG.getTruncStore(
+ Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
} else {
// Big endian - avoid unaligned stores.
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -841,18 +548,17 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
DAG.getConstant(ExtraWidth, dl,
TLI.getShiftAmountTy(Value.getValueType(), DL)));
Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
- RoundVT, isVolatile, isNonTemporal, Alignment,
- AAInfo);
+ RoundVT, Alignment, MMOFlags, AAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl,
Ptr.getValueType()));
- Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ Lo = DAG.getTruncStore(
+ Chain, dl, Value, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
}
// The order of the stores doesn't matter.
@@ -867,8 +573,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
unsigned Align = ST->getAlignment();
// If this is an unaligned store and the target doesn't support it,
// expand it.
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align))
- ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ }
break;
}
case TargetLowering::Custom: {
@@ -886,8 +594,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
"Do not know how to expand this store!");
Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
SDValue Result =
- DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -917,13 +625,13 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
const DataLayout &DL = DAG.getDataLayout();
// If this is an unaligned load and the target doesn't support it,
// expand it.
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align))
- ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
break;
}
case TargetLowering::Custom: {
- SDValue Res = TLI.LowerOperation(RVal, DAG);
- if (Res.getNode()) {
+ if (SDValue Res = TLI.LowerOperation(RVal, DAG)) {
RVal = Res;
RChain = Res.getValue(1);
}
@@ -956,9 +664,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
EVT SrcVT = LD->getMemoryVT();
unsigned SrcWidth = SrcVT.getSizeInBits();
unsigned Alignment = LD->getAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
- bool isInvariant = LD->isInvariant();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
if (SrcWidth != SrcVT.getStoreSizeInBits() &&
@@ -985,10 +691,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
SDValue Result =
- DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
- Chain, Ptr, LD->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, isInvariant, Alignment,
- AAInfo);
+ DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo);
Ch = Result.getValue(1); // The chain.
@@ -1023,10 +727,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
if (DL.isLittleEndian()) {
// EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
// Load the bottom RoundWidth bits.
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
- Chain, Ptr,
- LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, isInvariant, Alignment, AAInfo);
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
+ AAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
@@ -1035,8 +738,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Ptr.getValueType()));
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
+ AAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -1056,19 +759,18 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
// Load the top RoundWidth bits.
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
- LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, isInvariant, Alignment, AAInfo);
+ LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
+ AAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl,
Ptr.getValueType()));
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
- dl, Node->getValueType(0), Chain, Ptr,
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
+ AAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -1099,8 +801,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Chain = SDValue(Node, 1);
if (isCustom) {
- SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
- if (Res.getNode()) {
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
Value = Res;
Chain = Res.getValue(1);
}
@@ -1111,8 +812,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
unsigned AS = LD->getAddressSpace();
unsigned Align = LD->getAlignment();
const DataLayout &DL = DAG.getDataLayout();
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align))
- ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
}
break;
}
@@ -1399,8 +1101,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case TargetLowering::Custom: {
// FIXME: The handling for custom lowering with multiple results is
// a complete mess.
- SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
- if (Res.getNode()) {
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
if (!(Res.getNode() != Node || Res.getResNo() != 0))
return;
@@ -1467,7 +1168,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
// Caches for hasPredecessorHelper
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
-
+ Worklist.push_back(Idx.getNode());
SDValue StackPtr, Ch;
for (SDNode::use_iterator UI = Vec.getNode()->use_begin(),
UE = Vec.getNode()->use_end(); UI != UE; ++UI) {
@@ -1485,7 +1186,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
// If the index is dependent on the store we will introduce a cycle when
// creating the load (the load uses the index, and by replacing the chain
// we will make the index dependent on the load).
- if (Idx.getNode()->hasPredecessorHelper(ST, Visited, Worklist))
+ if (SDNode::hasPredecessorHelper(ST, Visited, Worklist))
continue;
StackPtr = ST->getBasePtr();
@@ -1498,7 +1199,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
// Store the value to a temporary stack slot, then LOAD the returned part.
StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
}
// Add the offset to the index.
@@ -1513,12 +1214,12 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
SDValue NewLoad;
if (Op.getValueType().isVector())
- NewLoad = DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,
- MachinePointerInfo(), false, false, false, 0);
+ NewLoad =
+ DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
else
- NewLoad = DAG.getExtLoad(
- ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(),
- Vec.getValueType().getVectorElementType(), false, false, false, 0);
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
+ MachinePointerInfo(),
+ Vec.getValueType().getVectorElementType());
// Replace the chain going out of the store, by the one out of the load.
DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
@@ -1549,8 +1250,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// First store the whole vector.
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
- false, false, 0);
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
// Then store the inserted part.
@@ -1566,12 +1266,10 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
StackPtr);
// Store the subvector.
- Ch = DAG.getStore(Ch, dl, Part, SubStackPtr,
- MachinePointerInfo(), false, false, 0);
+ Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo());
// Finally, load the updated vector.
- return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo,
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
}
SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
@@ -1593,7 +1291,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
// Store (in the right endianness) the elements to memory.
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
// Ignore undef elements.
- if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (Node->getOperand(i).isUndef()) continue;
unsigned Offset = TypeByteSize*i;
@@ -1605,13 +1303,10 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
Node->getOperand(i), Idx,
- PtrInfo.getWithOffset(Offset),
- EltVT, false, false, 0));
+ PtrInfo.getWithOffset(Offset), EltVT));
} else
- Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
- Node->getOperand(i), Idx,
- PtrInfo.getWithOffset(Offset),
- false, false, 0));
+ Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i),
+ Idx, PtrInfo.getWithOffset(Offset)));
}
SDValue StoreChain;
@@ -1621,8 +1316,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
StoreChain = DAG.getEntryNode();
// Result is a load from the stack slot.
- return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo,
- false, false, false, 0);
+ return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo);
}
namespace {
@@ -1645,7 +1339,8 @@ struct FloatSignAsInt {
/// containing the sign bit if the target has no integer value capable of
/// holding all bits of the floating-point value.
void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
- SDLoc DL, SDValue Value) const {
+ const SDLoc &DL,
+ SDValue Value) const {
EVT FloatVT = Value.getValueType();
unsigned NumBits = FloatVT.getSizeInBits();
State.FloatVT = FloatVT;
@@ -1669,7 +1364,7 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
MachineFunction &MF = DAG.getMachineFunction();
State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI);
State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr,
- State.FloatPointerInfo, false, false, 0);
+ State.FloatPointerInfo);
SDValue IntPtr;
if (DataLayout.isBigEndian()) {
@@ -1687,9 +1382,8 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
}
State.IntPtr = IntPtr;
- State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain,
- IntPtr, State.IntPointerInfo, MVT::i8,
- false, false, false, 0);
+ State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, IntPtr,
+ State.IntPointerInfo, MVT::i8);
State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7);
State.SignBit = 7;
}
@@ -1697,16 +1391,16 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
/// Replace the integer value produced by getSignAsIntValue() with a new value
/// and cast the result back to a floating-point type.
SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State,
- SDLoc DL, SDValue NewIntValue) const {
+ const SDLoc &DL,
+ SDValue NewIntValue) const {
if (!State.Chain)
return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue);
// Override the part containing the sign bit in the value stored on the stack.
SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr,
- State.IntPointerInfo, MVT::i8, false, false,
- 0);
+ State.IntPointerInfo, MVT::i8);
return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr,
- State.FloatPointerInfo, false, false, false, 0);
+ State.FloatPointerInfo);
}
SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const {
@@ -1843,11 +1537,10 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
/// of a true/false result.
///
/// \returns true if the SetCC has been legalized, false if it hasn't.
-bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
- SDValue &LHS, SDValue &RHS,
- SDValue &CC,
+bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
+ SDValue &RHS, SDValue &CC,
bool &NeedInvert,
- SDLoc dl) {
+ const SDLoc &dl) {
MVT OpVT = LHS.getSimpleValueType();
ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
NeedInvert = false;
@@ -1944,10 +1637,8 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
/// a load from the stack slot to DestVT, extending it if needed.
/// The resultant code need not be legal.
-SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
- EVT SlotVT,
- EVT DestVT,
- SDLoc dl) {
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
+ EVT DestVT, const SDLoc &dl) {
// Create the stack frame object.
unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
@@ -1969,22 +1660,21 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
SDValue Store;
if (SrcSize > SlotSize)
- Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
- PtrInfo, SlotVT, false, false, SrcAlign);
+ Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo,
+ SlotVT, SrcAlign);
else {
assert(SrcSize == SlotSize && "Invalid store");
- Store = DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
- PtrInfo, false, false, SrcAlign);
+ Store =
+ DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
}
// Result is a load from the stack slot.
if (SlotSize == DestSize)
- return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo,
- false, false, false, DestAlign);
+ return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
assert(SlotSize < DestSize && "Unknown extension!");
- return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr,
- PtrInfo, SlotVT, false, false, false, DestAlign);
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
+ DestAlign);
}
SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
@@ -1999,11 +1689,10 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
SDValue Ch = DAG.getTruncStore(
DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI),
- Node->getValueType(0).getVectorElementType(), false, false, 0);
+ Node->getValueType(0).getVectorElementType());
return DAG.getLoad(
Node->getValueType(0), dl, Ch, StackPtr,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
}
static bool
@@ -2025,7 +1714,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
NewIntermedVals;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue V = Node->getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue Vec;
@@ -2044,7 +1733,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
SmallVector<int, 16> FinalIndices;
FinalIndices.reserve(IntermedVals[i].second.size() +
IntermedVals[i+1].second.size());
-
+
int k = 0;
for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f;
++j, ++k) {
@@ -2061,7 +1750,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
if (Phase)
Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first,
IntermedVals[i+1].first,
- ShuffleVec.data());
+ ShuffleVec);
else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
return false;
NewIntermedVals.push_back(
@@ -2092,7 +1781,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
ShuffleVec[IntermedVals[1].second[i]] = NumElems + i;
if (Phase)
- Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
+ Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
return false;
}
@@ -2117,7 +1806,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
bool isConstant = true;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue V = Node->getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -2160,7 +1849,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
CI->getZExtValue()));
}
} else {
- assert(Node->getOperand(i).getOpcode() == ISD::UNDEF);
+ assert(Node->getOperand(i).isUndef());
Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
CV.push_back(UndefValue::get(OpNTy));
}
@@ -2171,13 +1860,13 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
return DAG.getLoad(
VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, Alignment);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
}
SmallSet<SDValue, 16> DefinedValues;
for (unsigned i = 0; i < NumElems; ++i) {
- if (Node->getOperand(i).getOpcode() == ISD::UNDEF)
+ if (Node->getOperand(i).isUndef())
continue;
DefinedValues.insert(Node->getOperand(i));
}
@@ -2187,7 +1876,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
SmallVector<int, 8> ShuffleVec(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
SDValue V = Node->getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
ShuffleVec[i] = V == Value1 ? 0 : NumElems;
}
@@ -2201,7 +1890,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
Vec2 = DAG.getUNDEF(VT);
// Return shuffle(LowValVec, undef, <0,0,0,0>)
- return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
+ return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
}
} else {
SDValue Res;
@@ -2243,15 +1932,18 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
SDValue InChain = DAG.getEntryNode();
// isTailCall may be true since the callee does not reference caller stack
- // frame. Check if it's in the right position.
+ // frame. Check if it's in the right position and that the return types match.
SDValue TCChain = InChain;
- bool isTailCall = TLI.isInTailCallPosition(DAG, Node, TCChain);
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool isTailCall =
+ TLI.isInTailCallPosition(DAG, Node, TCChain) &&
+ (RetTy == F->getReturnType() || F->getReturnType()->isVoidTy());
if (isTailCall)
InChain = TCChain;
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2267,7 +1959,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
/// and returning a result of type RetVT.
SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
const SDValue *Ops, unsigned NumOps,
- bool isSigned, SDLoc dl) {
+ bool isSigned, const SDLoc &dl) {
TargetLowering::ArgListTy Args;
Args.reserve(NumOps);
@@ -2286,7 +1978,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2320,7 +2012,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2415,14 +2107,14 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
// Remainder is loaded back from the stack frame.
- SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue Rem =
+ DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
Results.push_back(CallInfo.first);
Results.push_back(Rem);
}
@@ -2449,8 +2141,7 @@ static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
return false;
// GNU sin/cos functions set errno while sincos does not. Therefore
// combining sin and cos is only safe if unsafe-fpmath is enabled.
- bool isGNU = Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU;
- if (isGNU && !TM.Options.UnsafeFPMath)
+ if (TM.getTargetTriple().isGNUEnvironment() && !TM.Options.UnsafeFPMath)
return false;
return true;
}
@@ -2528,26 +2219,25 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(TLI.getLibcallCallingConv(LC),
- Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args), 0);
+ Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
- Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr,
- MachinePointerInfo(), false, false, false, 0));
- Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr,
- MachinePointerInfo(), false, false, false, 0));
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
}
/// This function is responsible for legalizing a
/// INT_TO_FP operation of the specified operand when the target requests that
/// we expand it. At this point, we know that the result and operand types are
/// legal for the target.
-SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
- SDValue Op0,
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
EVT DestVT,
- SDLoc dl) {
+ const SDLoc &dl) {
// TODO: Should any fast-math-flags be set for the created nodes?
-
+
if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
// simple 32-bit [signed|unsigned] integer to float/double expansion
@@ -2574,18 +2264,16 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
Op0Mapped = Op0;
}
// store the lo of the constructed double - based on integer input
- SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl,
- Op0Mapped, Lo, MachinePointerInfo(),
- false, false, 0);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo,
+ MachinePointerInfo());
// initial hi portion of constructed double
SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32);
// store the hi of the constructed double - biased exponent
- SDValue Store2 = DAG.getStore(Store1, dl, InitialHi, Hi,
- MachinePointerInfo(),
- false, false, 0);
+ SDValue Store2 =
+ DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo());
// load the constructed double
- SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo());
// FP constant to bias correct the final result
SDValue Bias = DAG.getConstantFP(isSigned ?
BitsToDouble(0x4330000080000000ULL) :
@@ -2733,13 +2421,13 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
if (DestVT == MVT::f32)
FudgeInReg = DAG.getLoad(
MVT::f32, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, Alignment);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
else {
SDValue Load = DAG.getExtLoad(
ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- false, false, false, Alignment);
+ Alignment);
HandleSDNode Handle(Load);
LegalizeOp(Load.getNode());
FudgeInReg = Handle.getValue();
@@ -2753,10 +2441,9 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
/// we promote it. At this point, we know that the result and operand types are
/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
/// operation that takes a larger input.
-SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
- EVT DestVT,
+SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT,
bool isSigned,
- SDLoc dl) {
+ const SDLoc &dl) {
// First step, figure out the appropriate *INT_TO_FP operation to use.
EVT NewInTy = LegalOp.getValueType();
@@ -2795,10 +2482,9 @@ SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
/// we promote it. At this point, we know that the result and operand types are
/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
/// operation that returns a larger result.
-SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
- EVT DestVT,
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT,
bool isSigned,
- SDLoc dl) {
+ const SDLoc &dl) {
// First step, figure out the appropriate FP_TO*INT operation to use.
EVT NewOutTy = DestVT;
@@ -2835,11 +2521,11 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
}
/// Open code the operations for BITREVERSE.
-SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, SDLoc dl) {
+SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
EVT VT = Op.getValueType();
EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
unsigned Sz = VT.getScalarSizeInBits();
-
+
SDValue Tmp, Tmp2;
Tmp = DAG.getConstant(0, dl, VT);
for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
@@ -2849,7 +2535,7 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, SDLoc dl) {
else
Tmp2 =
DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
-
+
APInt Shift(Sz, 1);
Shift = Shift.shl(J);
Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
@@ -2860,7 +2546,7 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, SDLoc dl) {
}
/// Open code the operations for BSWAP of the specified operation.
-SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
EVT VT = Op.getValueType();
EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
@@ -2914,7 +2600,7 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
/// Expand the specified bitcount instruction into operations.
SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
- SDLoc dl) {
+ const SDLoc &dl) {
switch (Opc) {
default: llvm_unreachable("Cannot expand this yet!");
case ISD::CTPOP: {
@@ -3111,10 +2797,38 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
cast<AtomicSDNode>(Node)->getFailureOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
- SDValue Success = DAG.getSetCC(SDLoc(Node), Node->getValueType(1),
- Res, Node->getOperand(2), ISD::SETEQ);
+ SDValue ExtRes = Res;
+ SDValue LHS = Res;
+ SDValue RHS = Node->getOperand(1);
+
+ EVT AtomicType = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT OuterType = Node->getValueType(0);
+ switch (TLI.getExtendForAtomicOps()) {
+ case ISD::SIGN_EXTEND:
+ LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType,
+ Node->getOperand(2), DAG.getValueType(AtomicType));
+ ExtRes = LHS;
+ break;
+ case ISD::ZERO_EXTEND:
+ LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ ExtRes = LHS;
+ break;
+ case ISD::ANY_EXTEND:
+ LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
+
+ SDValue Success =
+ DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ);
- Results.push_back(Res.getValue(0));
+ Results.push_back(ExtRes.getValue(0));
Results.push_back(Success);
Results.push_back(Res.getValue(1));
break;
@@ -3400,7 +3114,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
}
-
+
case ISD::FSIN:
case ISD::FCOS: {
EVT VT = Node->getValueType(0);
@@ -3442,7 +3156,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
DAG.getIntPtrConstant(0, dl));
Results.push_back(
- DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, FloatVal));
+ DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
}
}
break;
@@ -3760,10 +3474,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
SDValue LD = DAG.getExtLoad(
ISD::SEXTLOAD, dl, PTy, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT,
- false, false, false, 0);
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
Addr = LD;
- if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (TM.isPositionIndependent()) {
// For PIC, the sequence is:
// BRIND(load(Jumptable + index) + RelocBase)
// RelocBase can be JumpTable, GOT or some sort of global base.
@@ -3786,7 +3499,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Node->getOperand(2));
} else {
// We test only the i1 bit. Skip the AND if UNDEF.
- Tmp3 = (Tmp2.getOpcode() == ISD::UNDEF) ? Tmp2 :
+ Tmp3 = (Tmp2.isUndef()) ? Tmp2 :
DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
DAG.getConstant(1, dl, Tmp2.getValueType()));
Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
@@ -4008,7 +3721,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__sync_synchronize",
TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -4031,7 +3744,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_CMP_SWAP: {
MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
- RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT);
+ RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");
std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false);
@@ -4048,7 +3761,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("abort",
TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
Results.push_back(CallResult.second);
@@ -4269,18 +3982,20 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::CTPOP:
// Zero extend the argument.
Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ if (Node->getOpcode() == ISD::CTTZ) {
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
+ auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
+ OVT.getSizeInBits());
+ Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
+ DAG.getConstant(TopBit, dl, NVT));
+ }
// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
// already the correct result.
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
- if (Node->getOpcode() == ISD::CTTZ) {
- // FIXME: This should set a bit in the zero extended value instead.
- Tmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT),
- Tmp1, DAG.getConstant(NVT.getSizeInBits(), dl, NVT),
- ISD::SETEQ);
- Tmp1 = DAG.getSelect(dl, NVT, Tmp2,
- DAG.getConstant(OVT.getSizeInBits(), dl, NVT), Tmp1);
- } else if (Node->getOpcode() == ISD::CTLZ ||
- Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+ if (Node->getOpcode() == ISD::CTLZ ||
+ Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
DAG.getConstant(NVT.getSizeInBits() -
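A standalone sketch of the CTTZ promotion trick in the hunk above, using plain C++ integers rather than DAG nodes; the helper names (cttz32, cttz8_via_promotion) are illustrative only. ORing in the bit just above the original width makes a zero input report the original type's bit width, so the promoted count needs no follow-up select.

#include <cassert>
#include <cstdint>

static unsigned cttz32(uint32_t X) {           // stand-in for CTTZ on the promoted i32
  unsigned N = 0;
  while (N < 32 && !(X & (1u << N)))
    ++N;
  return N;                                    // 32 when X == 0
}

static unsigned cttz8_via_promotion(uint8_t V) {
  uint32_t Wide = V;                           // ISD::ZERO_EXTEND to the promoted type
  Wide |= 1u << 8;                             // TopBit: the bit just off the top of i8
  return cttz32(Wide);                         // already correct, no select needed
}

int main() {
  assert(cttz8_via_promotion(0x28) == 3);      // 0b00101000 has three trailing zeros
  assert(cttz8_via_promotion(0) == 8);         // zero input yields the old type's width
  return 0;
}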
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6c0193a76732..31ebf7bbec13 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -149,9 +149,26 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
if (isLegalInHWReg(N->getValueType(ResNo)))
return SDValue(N, ResNo);
ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
- return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
- TLI.getTypeToTransformTo(*DAG.getContext(),
- CN->getValueType(0)));
+ // In ppcf128, the high 64 bits are always first in memory regardless
+ // of Endianness. LLVM's APFloat representation is not Endian sensitive,
+ // and so always converts into a 128-bit APInt in a non-Endian-sensitive
+ // way. However, APInts are serialized in an Endian-sensitive fashion,
+ // so on big-Endian targets, the two doubles are output in the wrong
+ // order. Fix this by manually flipping the order of the high 64 bits
+ // and the low 64 bits here.
+ if (DAG.getDataLayout().isBigEndian() &&
+ CN->getValueType(0).getSimpleVT() == llvm::MVT::ppcf128) {
+ uint64_t words[2] = { CN->getValueAPF().bitcastToAPInt().getRawData()[1],
+ CN->getValueAPF().bitcastToAPInt().getRawData()[0] };
+ APInt Val(128, words);
+ return DAG.getConstant(Val, SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ } else {
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ }
}
SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
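A minimal standalone illustration of the word flip performed above, assuming (as the comment states) that getRawData() hands the two 64-bit words back with the low word at index 0; no LLVM types are used and the word values are placeholders.

#include <cassert>
#include <cstdint>

int main() {
  // Placeholder raw words of a 128-bit ppcf128 constant, in getRawData() order:
  // index 0 holds the low 64 bits, index 1 the high 64 bits.
  uint64_t Raw[2] = {0x1111111111111111ULL, 0x2222222222222222ULL};

  // The big-endian fix-up from the hunk: rebuild the constant with the high
  // 64 bits first so the two doubles are serialized in the expected order.
  uint64_t Flipped[2] = {Raw[1], Raw[0]};

  assert(Flipped[0] == 0x2222222222222222ULL);
  assert(Flipped[1] == 0x1111111111111111ULL);
  return 0;
}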
@@ -614,12 +631,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDLoc dl(N);
+ auto MMOFlags =
+ L->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
SDValue NewL;
if (L->getExtensionType() == ISD::NON_EXTLOAD) {
- NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
- NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
- L->getPointerInfo(), NVT, L->isVolatile(),
- L->isNonTemporal(), false, L->getAlignment(),
+ NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl,
+ L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), NVT, L->getAlignment(), MMOFlags,
L->getAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
@@ -629,12 +647,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
}
// Do a non-extending load followed by FP_EXTEND.
- NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD,
- L->getMemoryVT(), dl, L->getChain(),
- L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
- L->getMemoryVT(), L->isVolatile(),
- L->isNonTemporal(), false, L->getAlignment(),
- L->getAAInfo());
+ NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(),
+ dl, L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), L->getMemoryVT(), L->getAlignment(),
+ MMOFlags, L->getAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
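A tiny standalone sketch of the flag-mask idiom this file is switching to: one bitmask replaces the separate isVolatile/isNonTemporal/isInvariant booleans, and the hunk clears MOInvariant before building the replacement load. The enum below only mirrors MachineMemOperand::Flags in spirit.

#include <cassert>

enum MemFlags : unsigned {
  MONone        = 0,
  MOVolatile    = 1u << 0,   // illustrative stand-ins for the real flag bits
  MONonTemporal = 1u << 1,
  MOInvariant   = 1u << 2,
};

int main() {
  unsigned Flags = MOVolatile | MOInvariant;            // flags on the original memory operand
  unsigned MMOFlags = Flags & ~unsigned(MOInvariant);   // same masking as the hunk
  assert(MMOFlags == MOVolatile);                       // volatility survives,
  assert((MMOFlags & MOInvariant) == 0);                // invariance is dropped
  return 0;
}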
@@ -800,6 +816,7 @@ bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FCOPYSIGN:
case ISD::FNEG:
case ISD::Register:
+ case ISD::SELECT:
return true;
}
return false;
@@ -1516,7 +1533,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
SDValue &NewRHS,
ISD::CondCode &CCCode,
- SDLoc dl) {
+ const SDLoc &dl) {
SDValue LHSLo, LHSHi, RHSLo, RHSHi;
GetExpandedFloat(NewLHS, LHSLo, LHSHi);
GetExpandedFloat(NewRHS, RHSLo, RHSHi);
@@ -1868,6 +1885,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
// Binary FP Operations
case ISD::FADD:
case ISD::FDIV:
+ case ISD::FMAXNAN:
+ case ISD::FMINNAN:
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMUL:
@@ -2063,13 +2082,14 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
LoadSDNode *L = cast<LoadSDNode>(N);
EVT VT = N->getValueType(0);
- // Load the value as an integer value with the same number of bits
+ // Load the value as an integer value with the same number of bits.
EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
- SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
- IVT, SDLoc(N), L->getChain(), L->getBasePtr(),
- L->getOffset(), L->getPointerInfo(), IVT, L->isVolatile(),
- L->isNonTemporal(), false, L->getAlignment(),
- L->getAAInfo());
+ auto MMOFlags =
+ L->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
+ SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), IVT,
+ SDLoc(N), L->getChain(), L->getBasePtr(),
+ L->getOffset(), L->getPointerInfo(), IVT,
+ L->getAlignment(), MMOFlags, L->getAAInfo());
// Legalize the chain result by replacing uses of the old value chain with the
// new one
ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
@@ -2102,9 +2122,14 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) {
// Construct a SDNode that transforms the SINT or UINT operand to the promoted
// float type.
SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) {
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, N->getOperand(0));
+ SDValue NV = DAG.getNode(N->getOpcode(), DL, NVT, N->getOperand(0));
+ // Round the value to the desired precision (that of the source type).
+ return DAG.getNode(
+ ISD::FP_EXTEND, DL, NVT,
+ DAG.getNode(ISD::FP_ROUND, DL, VT, NV, DAG.getIntPtrConstant(0, DL)));
}
SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
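A standalone analogue of the double-rounding fix above, with double standing in for the promoted type and float for the original result type (the real code handles cases such as f16 promoted to f32); the input value is chosen so the two paths actually differ.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = (1ULL << 29) + 1;             // not exactly representable as float

  double Promoted = static_cast<double>(X);  // convert in the promoted type: exact here
  // FP_ROUND to the original precision, then FP_EXTEND back to the promoted type.
  double Rounded = static_cast<double>(static_cast<float>(Promoted));

  assert(Promoted == 536870913.0);           // promoted-only conversion keeps the +1
  assert(Rounded == 536870912.0);            // matches a direct conversion to float
  assert(static_cast<float>(X) == 536870912.0f);
  return 0;
}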
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 74f80db6d01b..3ab9459c8af7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -436,10 +436,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
- SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
-
- return DAG.getNode(ISD::AssertZext, dl,
- NVT, Res, DAG.getValueType(N->getValueType(0)));
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
}
SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
@@ -1374,6 +1371,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::OR:
case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::SMIN: ExpandIntRes_MINMAX(N, Lo, Hi); break;
+
case ISD::ADD:
case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
@@ -1404,7 +1406,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
unsigned Opc = Node->getOpcode();
MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
- RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT);
+ RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");
return ExpandChainLibCall(LC, Node, false);
@@ -1442,15 +1444,6 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
} else if (Amt == NVTBits) {
Lo = DAG.getConstant(0, DL, NVT);
Hi = InL;
- } else if (Amt == 1 &&
- TLI.isOperationLegalOrCustom(ISD::ADDC,
- TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) {
- // Emit this X << 1 as X+X.
- SDVTList VTList = DAG.getVTList(NVT, MVT::Glue);
- SDValue LoOps[2] = { InL, InL };
- Lo = DAG.getNode(ISD::ADDC, DL, VTList, LoOps);
- SDValue HiOps[3] = { InH, InH, Lo.getValue(1) };
- Hi = DAG.getNode(ISD::ADDE, DL, VTList, HiOps);
} else {
Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy));
Hi = DAG.getNode(ISD::OR, DL, NVT,
@@ -1675,6 +1668,54 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
}
}
+static std::pair<ISD::CondCode, ISD::NodeType> getExpandedMinMaxOps(int Op) {
+
+ switch (Op) {
+ default: llvm_unreachable("invalid min/max opcode");
+ case ISD::SMAX:
+ return std::make_pair(ISD::SETGT, ISD::UMAX);
+ case ISD::UMAX:
+ return std::make_pair(ISD::SETUGT, ISD::UMAX);
+ case ISD::SMIN:
+ return std::make_pair(ISD::SETLT, ISD::UMIN);
+ case ISD::UMIN:
+ return std::make_pair(ISD::SETULT, ISD::UMIN);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc DL(N);
+ ISD::NodeType LoOpc;
+ ISD::CondCode CondC;
+ std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode());
+
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+ // Value types
+ EVT NVT = LHSL.getValueType();
+ EVT CCT = getSetCCResultType(NVT);
+
+ // Hi part is always the same op
+ Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH});
+
+ // We need to know whether to select the Lo part that corresponds to the
+ // 'winning' Hi part, or whether the Hi parts are equal.
+ SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC);
+ SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ);
+
+ // Lo part corresponding to the 'winning' Hi part
+ SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
+
+ // Lo part to use if the Hi parts are equal; this uses the unsigned version.
+ SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL});
+
+ Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
+}
+
void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
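A standalone sketch of the min/max expansion added above, specialized to SMAX with a 64-bit value split into 32-bit halves; plain integers stand in for SDValues and the function name is illustrative. The Hi half keeps the signed comparison, while a tie on the Hi halves falls back to an unsigned max of the Lo halves.

#include <cassert>
#include <cstdint>

static int64_t smax64_expanded(int64_t A, int64_t B) {
  uint32_t AL = uint32_t(A), BL = uint32_t(B);           // expanded Lo halves
  int32_t AH = int32_t(uint64_t(A) >> 32);               // expanded Hi halves
  int32_t BH = int32_t(uint64_t(B) >> 32);

  int32_t Hi = AH > BH ? AH : BH;                        // Hi part: same op (SMAX)
  bool IsHiLeft = AH > BH;                               // SETGT on the Hi halves
  bool IsHiEq = AH == BH;                                // SETEQ on the Hi halves
  uint32_t LoCmp = IsHiLeft ? AL : BL;                   // Lo of the winning Hi part
  uint32_t LoMinMax = AL > BL ? AL : BL;                 // unsigned UMAX of the Lo halves
  uint32_t Lo = IsHiEq ? LoMinMax : LoCmp;

  return int64_t((uint64_t(uint32_t(Hi)) << 32) | Lo);
}

int main() {
  assert(smax64_expanded(-1, 1) == 1);                   // differing Hi halves
  assert(smax64_expanded(0x100000005LL, 0x100000009LL) ==
         0x100000009LL);                                 // equal Hi halves: UMAX of Lo
  assert(smax64_expanded(INT64_MIN, 42) == 42);
  return 0;
}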
@@ -2006,9 +2047,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
SDValue Ptr = N->getBasePtr();
ISD::LoadExtType ExtType = N->getExtensionType();
unsigned Alignment = N->getAlignment();
- bool isVolatile = N->isVolatile();
- bool isNonTemporal = N->isNonTemporal();
- bool isInvariant = N->isInvariant();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDLoc dl(N);
@@ -2017,9 +2056,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
if (N->getMemoryVT().bitsLE(NVT)) {
EVT MemVT = N->getMemoryVT();
- Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
- MemVT, isVolatile, isNonTemporal, isInvariant,
- Alignment, AAInfo);
+ Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT,
+ Alignment, MMOFlags, AAInfo);
// Remember the chain.
Ch = Lo.getValue(1);
@@ -2041,8 +2079,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
}
} else if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
- Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Alignment,
+ Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
AAInfo);
unsigned ExcessBits =
@@ -2055,8 +2092,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
- isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -2074,8 +2110,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
EVT::getIntegerVT(*DAG.getContext(),
MemVT.getSizeInBits() - ExcessBits),
- isVolatile, isNonTemporal, isInvariant, Alignment,
- AAInfo);
+ Alignment, MMOFlags, AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -2084,8 +2119,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
- isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -2145,7 +2179,49 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
LC = RTLIB::MUL_I64;
else if (VT == MVT::i128)
LC = RTLIB::MUL_I128;
- assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL) {
+ // We'll expand the multiplication by brute force because we have no other
+ // options. This is a trivially-generalized version of the code from
+ // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+ // 4.3.1).
+ SDValue Mask =
+ DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
+ NVT.getSizeInBits() >> 1), dl, NVT);
+ SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
+ SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
+
+ SDValue Shift =
+ DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
+ SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
+ SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
+ SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
+
+ SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
+ SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
+ SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
+
+ SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
+ SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
+
+ SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
+ DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
+ DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
+ DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
+ DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+ return;
+ }
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first,
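A standalone rendering of the brute-force path above with NVT taken as i32: the 64-bit product is rebuilt from 32-bit multiplies, adds, shifts, and masks, following the Hacker's Delight / Knuth Algorithm M scheme the comment cites. Plain unsigned arithmetic (which wraps like the NVT operations) stands in for the DAG nodes; the function name is illustrative.

#include <cassert>
#include <cstdint>

static void mul64_expanded(uint64_t A, uint64_t B, uint32_t &Lo, uint32_t &Hi) {
  const uint32_t Mask = 0xFFFF;                 // low half of the 32-bit NVT
  uint32_t LL = uint32_t(A), LH = uint32_t(A >> 32);
  uint32_t RL = uint32_t(B), RH = uint32_t(B >> 32);

  uint32_t LLL = LL & Mask, RLL = RL & Mask;
  uint32_t T = LLL * RLL, TL = T & Mask, TH = T >> 16;
  uint32_t LLH = LL >> 16, RLH = RL >> 16;

  uint32_t U = LLH * RLL + TH, UL = U & Mask, UH = U >> 16;
  uint32_t V = LLL * RLH + UL, VH = V >> 16;
  uint32_t W = LLH * RLH + (UH + VH);           // high 32 bits of LL * RL

  Lo = TL + (V << 16);                          // low 32 bits of LL * RL
  Hi = W + (RH * LL + RL * LH);                 // cross terms land in the high half
}

int main() {
  uint64_t A = 0x123456789ABCDEF0ULL, B = 0x0FEDCBA987654321ULL;
  uint32_t Lo, Hi;
  mul64_expanded(A, B, Lo, Hi);
  uint64_t P = A * B;                           // reference: native 64-bit multiply
  assert(Lo == uint32_t(P) && Hi == uint32_t(P >> 32));
  return 0;
}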
@@ -2495,9 +2571,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue Temp = DAG.CreateStackTemporary(PtrVT);
// Temporary for the overflow value, default it to zero.
- SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl,
- DAG.getConstant(0, dl, PtrVT), Temp,
- MachinePointerInfo(), false, false, 0);
+ SDValue Chain =
+ DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, dl, PtrVT), Temp,
+ MachinePointerInfo());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -2522,14 +2598,14 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
.setSExtResult();
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
SplitInteger(CallInfo.first, Lo, Hi);
- SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue Temp2 =
+ DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo());
SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2,
DAG.getConstant(0, dl, PtrVT),
ISD::SETNE);
@@ -2703,7 +2779,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
SDValue &NewRHS,
ISD::CondCode &CCCode,
- SDLoc dl) {
+ const SDLoc &dl) {
SDValue LHSLo, LHSHi, RHSLo, RHSHi;
GetExpandedInteger(NewLHS, LHSLo, LHSHi);
GetExpandedInteger(NewRHS, RHSLo, RHSHi);
@@ -2956,8 +3032,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
unsigned Alignment = N->getAlignment();
- bool isVolatile = N->isVolatile();
- bool isNonTemporal = N->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDLoc dl(N);
SDValue Lo, Hi;
@@ -2967,16 +3042,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (N->getMemoryVT().bitsLE(NVT)) {
GetExpandedInteger(N->getValue(), Lo, Hi);
return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
- N->getMemoryVT(), isVolatile, isNonTemporal,
- Alignment, AAInfo);
+ N->getMemoryVT(), Alignment, MMOFlags, AAInfo);
}
if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
GetExpandedInteger(N->getValue(), Lo, Hi);
- Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
unsigned ExcessBits =
N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2986,10 +3060,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
unsigned IncrementSize = NVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
- Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
- N->getPointerInfo().getWithOffset(IncrementSize),
- NEVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ Hi = DAG.getTruncStore(
+ Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -3017,8 +3090,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
}
// Store both the high bits and maybe some of the low bits.
- Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
- HiVT, isVolatile, isNonTemporal, Alignment, AAInfo);
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, Alignment,
+ MMOFlags, AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -3027,8 +3100,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
- isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -3104,7 +3176,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- false, false, false, Alignment);
+ Alignment);
return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 2a0b0aa44794..144bed241ee7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -27,7 +27,7 @@ using namespace llvm;
static cl::opt<bool>
EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
-/// PerformExpensiveChecks - Do extensive, expensive, sanity checking.
+/// Do extensive, expensive, sanity checking.
void DAGTypeLegalizer::PerformExpensiveChecks() {
// If a node is not processed, then none of its values should be mapped by any
// of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
@@ -174,9 +174,9 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
}
}
-/// run - This is the main entry point for the type legalizer. This does a
-/// top-down traversal of the dag, legalizing types as it goes. Returns "true"
-/// if it made any changes.
+/// This is the main entry point for the type legalizer. This does a top-down
+/// traversal of the dag, legalizing types as it goes. Returns "true" if it made
+/// any changes.
bool DAGTypeLegalizer::run() {
bool Changed = false;
@@ -204,7 +204,7 @@ bool DAGTypeLegalizer::run() {
// Now that we have a set of nodes to process, handle them all.
while (!Worklist.empty()) {
-#ifndef XDEBUG
+#ifndef EXPENSIVE_CHECKS
if (EnableExpensiveChecks)
#endif
PerformExpensiveChecks();
@@ -394,7 +394,7 @@ NodeDone:
}
}
-#ifndef XDEBUG
+#ifndef EXPENSIVE_CHECKS
if (EnableExpensiveChecks)
#endif
PerformExpensiveChecks();
@@ -461,11 +461,10 @@ NodeDone:
return Changed;
}
-/// AnalyzeNewNode - The specified node is the root of a subtree of potentially
-/// new nodes. Correct any processed operands (this may change the node) and
-/// calculate the NodeId. If the node itself changes to a processed node, it
-/// is not remapped - the caller needs to take care of this.
-/// Returns the potentially changed node.
+/// The specified node is the root of a subtree of potentially new nodes.
+/// Correct any processed operands (this may change the node) and calculate the
+/// NodeId. If the node itself changes to a processed node, it is not remapped -
+/// the caller needs to take care of this. Returns the potentially changed node.
SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
// If this was an existing node that is already done, we're done.
if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
@@ -536,7 +535,7 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
return N;
}
-/// AnalyzeNewValue - Call AnalyzeNewNode, updating the node in Val if needed.
+/// Call AnalyzeNewNode, updating the node in Val if needed.
/// If the node changes to a processed node, then remap it.
void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
Val.setNode(AnalyzeNewNode(Val.getNode()));
@@ -545,7 +544,7 @@ void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
RemapValue(Val);
}
-/// ExpungeNode - If N has a bogus mapping in ReplacedValues, eliminate it.
+/// If N has a bogus mapping in ReplacedValues, eliminate it.
/// This can occur when a node is deleted then reallocated as a new node -
/// the mapping in ReplacedValues applies to the deleted node, not the new
/// one.
@@ -626,7 +625,7 @@ void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
ReplacedValues.erase(SDValue(N, i));
}
-/// RemapValue - If the specified value was already legalized to another value,
+/// If the specified value was already legalized to another value,
/// replace it by that value.
void DAGTypeLegalizer::RemapValue(SDValue &N) {
DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
@@ -643,8 +642,8 @@ void DAGTypeLegalizer::RemapValue(SDValue &N) {
}
namespace {
- /// NodeUpdateListener - This class is a DAGUpdateListener that listens for
- /// updates to nodes and recomputes their ready state.
+ /// This class is a DAGUpdateListener that listens for updates to nodes and
+ /// recomputes their ready state.
class NodeUpdateListener : public SelectionDAG::DAGUpdateListener {
DAGTypeLegalizer &DTL;
SmallSetVector<SDNode*, 16> &NodesToAnalyze;
@@ -689,9 +688,8 @@ namespace {
}
-/// ReplaceValueWith - The specified value was legalized to the specified other
-/// value. Update the DAG and NodeIds replacing any uses of From to use To
-/// instead.
+/// The specified value was legalized to the specified other value.
+/// Update the DAG and NodeIds replacing any uses of From to use To instead.
void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
assert(From.getNode() != To.getNode() && "Potential legalization loop!");
@@ -905,15 +903,14 @@ void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
// Utilities.
//===----------------------------------------------------------------------===//
-/// BitConvertToInteger - Convert to an integer of the same size.
+/// Convert to an integer of the same size.
SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
unsigned BitWidth = Op.getValueType().getSizeInBits();
return DAG.getNode(ISD::BITCAST, SDLoc(Op),
EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
}
-/// BitConvertVectorToIntegerVector - Convert to a vector of integers of the
-/// same size.
+/// Convert to a vector of integers of the same size.
SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
assert(Op.getValueType().isVector() && "Only applies to vectors!");
unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
@@ -930,15 +927,14 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
// the source and destination types.
SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
// Emit a store to the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
// Result is a load from the stack slot.
- return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
}
-/// CustomLowerNode - Replace the node's results with custom code provided
-/// by the target and return "true", or do nothing and return "false".
+/// Replace the node's results with custom code provided by the target and
+/// return "true", or do nothing and return "false".
/// The last parameter is FALSE if we are dealing with a node with legal
/// result types and illegal operand. The second parameter denotes the type of
/// illegal OperandNo in that case.
@@ -981,8 +977,8 @@ bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
}
-/// CustomWidenLowerNode - Widen the node's results with custom code provided
-/// by the target and return "true", or do nothing and return "false".
+/// Widen the node's results with custom code provided by the target and return
+/// "true", or do nothing and return "false".
bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
// See if the target wants to custom lower this node.
if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
@@ -992,7 +988,7 @@ bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
TLI.ReplaceNodeResults(N, Results, DAG);
if (Results.empty())
- // The target didn't want to custom widen lower its result after all.
+ // The target didn't want to custom widen lower its result after all.
return false;
// Update the widening map.
@@ -1010,8 +1006,8 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
return SDValue(N->getOperand(ResNo));
}
-/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
-/// high parts of the given value.
+/// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+/// given value.
void DAGTypeLegalizer::GetPairElements(SDValue Pair,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(Pair);
@@ -1038,7 +1034,7 @@ SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr);
}
-/// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
+/// Build an integer with low bits Lo and high bits Hi.
SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
// Arbitrarily use dlHi for result SDLoc
SDLoc dlHi(Hi);
@@ -1056,7 +1052,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
}
-/// LibCallify - Convert the node into a libcall with the same prototype.
+/// Convert the node into a libcall with the same prototype.
SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
bool isSigned) {
unsigned NumOps = N->getNumOperands();
@@ -1080,12 +1076,11 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first;
}
-// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
-// ExpandLibCall except that the first operand is the in-chain.
+/// Expand a node into a call to a libcall. Similar to ExpandLibCall except that
+/// the first operand is the in-chain.
std::pair<SDValue, SDValue>
-DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
- SDNode *Node,
- bool isSigned) {
+DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
+ bool isSigned) {
SDValue InChain = Node->getOperand(0);
TargetLowering::ArgListTy Args;
@@ -1106,7 +1101,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1114,9 +1109,9 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
return CallInfo;
}
-/// PromoteTargetBoolean - Promote the given target boolean to a target boolean
-/// of the given type. A target boolean is an integer value, not necessarily of
-/// type i1, the bits of which conform to getBooleanContents.
+/// Promote the given target boolean to a target boolean of the given type.
+/// A target boolean is an integer value, not necessarily of type i1, the bits
+/// of which conform to getBooleanContents.
///
/// ValVT is the type of values that produced the boolean.
SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
@@ -1127,9 +1122,9 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
}
-/// WidenTargetBoolean - Widen the given target boolean to a target boolean
-/// of the given type. The boolean vector is widened and then promoted to match
-/// the target boolean type of the given ValVT.
+/// Widen the given target boolean to a target boolean of the given type.
+/// The boolean vector is widened and then promoted to match the target boolean
+/// type of the given ValVT.
SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
bool WithZeroes) {
SDLoc dl(Bool);
@@ -1144,8 +1139,7 @@ SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
return PromoteTargetBoolean(Bool, ValVT);
}
-/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
-/// bits in Hi.
+/// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi.
void DAGTypeLegalizer::SplitInteger(SDValue Op,
EVT LoVT, EVT HiVT,
SDValue &Lo, SDValue &Hi) {
@@ -1159,8 +1153,8 @@ void DAGTypeLegalizer::SplitInteger(SDValue Op,
Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
}
-/// SplitInteger - Return the lower and upper halves of Op's bits in a value
-/// type half the size of Op's.
+/// Return the lower and upper halves of Op's bits in a value type half the
+/// size of Op's.
void DAGTypeLegalizer::SplitInteger(SDValue Op,
SDValue &Lo, SDValue &Hi) {
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
@@ -1173,9 +1167,8 @@ void DAGTypeLegalizer::SplitInteger(SDValue Op,
// Entry Point
//===----------------------------------------------------------------------===//
-/// LegalizeTypes - This transforms the SelectionDAG into a SelectionDAG that
-/// only uses types natively supported by the target. Returns "true" if it made
-/// any changes.
+/// This transforms the SelectionDAG into a SelectionDAG that only uses types
+/// natively supported by the target. Returns "true" if it made any changes.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 8ba19f76797f..84ad8f83d906 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -17,7 +17,6 @@
#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -26,58 +25,56 @@
namespace llvm {
//===----------------------------------------------------------------------===//
-/// DAGTypeLegalizer - This takes an arbitrary SelectionDAG as input and hacks
-/// on it until only value types the target machine can handle are left. This
-/// involves promoting small sizes to large sizes or splitting up large values
-/// into small values.
+/// This takes an arbitrary SelectionDAG as input and hacks on it until only
+/// value types the target machine can handle are left. This involves promoting
+/// small sizes to large sizes or splitting up large values into small values.
///
class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
const TargetLowering &TLI;
SelectionDAG &DAG;
public:
- // NodeIdFlags - This pass uses the NodeId on the SDNodes to hold information
- // about the state of the node. The enum has all the values.
+ /// This pass uses the NodeId on the SDNodes to hold information about the
+ /// state of the node. The enum has all the values.
enum NodeIdFlags {
- /// ReadyToProcess - All operands have been processed, so this node is ready
- /// to be handled.
+ /// All operands have been processed, so this node is ready to be handled.
ReadyToProcess = 0,
- /// NewNode - This is a new node, not before seen, that was created in the
- /// process of legalizing some other node.
+ /// This is a new node, not before seen, that was created in the process of
+ /// legalizing some other node.
NewNode = -1,
- /// Unanalyzed - This node's ID needs to be set to the number of its
- /// unprocessed operands.
+ /// This node's ID needs to be set to the number of its unprocessed
+ /// operands.
Unanalyzed = -2,
- /// Processed - This is a node that has already been processed.
+ /// This is a node that has already been processed.
Processed = -3
// 1+ - This is a node which has this many unprocessed operands.
};
private:
- /// ValueTypeActions - This is a bitvector that contains two bits for each
- /// simple value type, where the two bits correspond to the LegalizeAction
- /// enum from TargetLowering. This can be queried with "getTypeAction(VT)".
+ /// This is a bitvector that contains two bits for each simple value type,
+ /// where the two bits correspond to the LegalizeAction enum from
+ /// TargetLowering. This can be queried with "getTypeAction(VT)".
TargetLowering::ValueTypeActionImpl ValueTypeActions;
- /// getTypeAction - Return how we should legalize values of this type.
+ /// Return how we should legalize values of this type.
TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
return TLI.getTypeAction(*DAG.getContext(), VT);
}
- /// isTypeLegal - Return true if this type is legal on this target.
+ /// Return true if this type is legal on this target.
bool isTypeLegal(EVT VT) const {
return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
}
- /// isSimpleLegalType - Return true if this is a simple legal type.
+ /// Return true if this is a simple legal type.
bool isSimpleLegalType(EVT VT) const {
return VT.isSimple() && TLI.isTypeLegal(VT);
}
- /// isLegalInHWReg - Return true if this type can be passed in registers.
+ /// Return true if this type can be passed in registers.
/// For example, x86_64's f128 should be legal in registers
/// and only some operations converted to library calls or integer
/// bitwise operations.
@@ -90,51 +87,49 @@ private:
return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
- /// IgnoreNodeResults - Pretend all of this node's results are legal.
+ /// Pretend all of this node's results are legal.
bool IgnoreNodeResults(SDNode *N) const {
return N->getOpcode() == ISD::TargetConstant;
}
- /// PromotedIntegers - For integer nodes that are below legal width, this map
- /// indicates what promoted value to use.
+ /// For integer nodes that are below legal width, this map indicates what
+ /// promoted value to use.
SmallDenseMap<SDValue, SDValue, 8> PromotedIntegers;
- /// ExpandedIntegers - For integer nodes that need to be expanded this map
- /// indicates which operands are the expanded version of the input.
+ /// For integer nodes that need to be expanded this map indicates which
+ /// operands are the expanded version of the input.
SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedIntegers;
- /// SoftenedFloats - For floating point nodes converted to integers of
- /// the same size, this map indicates the converted value to use.
+ /// For floating-point nodes converted to integers of the same size, this map
+ /// indicates the converted value to use.
SmallDenseMap<SDValue, SDValue, 8> SoftenedFloats;
- /// PromotedFloats - For floating point nodes that have a smaller precision
- /// than the smallest supported precision, this map indicates what promoted
- /// value to use.
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates what promoted value to use.
SmallDenseMap<SDValue, SDValue, 8> PromotedFloats;
- /// ExpandedFloats - For float nodes that need to be expanded this map
- /// indicates which operands are the expanded version of the input.
+ /// For float nodes that need to be expanded this map indicates which operands
+ /// are the expanded version of the input.
SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedFloats;
- /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the
- /// scalar value of type 'ty' to use.
+ /// For nodes that are <1 x ty>, this map indicates the scalar value of type
+ /// 'ty' to use.
SmallDenseMap<SDValue, SDValue, 8> ScalarizedVectors;
- /// SplitVectors - For nodes that need to be split this map indicates
- /// which operands are the expanded version of the input.
+ /// For nodes that need to be split this map indicates which operands are the
+ /// expanded version of the input.
SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> SplitVectors;
- /// WidenedVectors - For vector nodes that need to be widened, indicates
- /// the widened value to use.
+ /// For vector nodes that need to be widened, indicates the widened value to
+ /// use.
SmallDenseMap<SDValue, SDValue, 8> WidenedVectors;
- /// ReplacedValues - For values that have been replaced with another,
- /// indicates the replacement value to use.
+ /// For values that have been replaced with another, indicates the replacement
+ /// value to use.
SmallDenseMap<SDValue, SDValue, 8> ReplacedValues;
- /// Worklist - This defines a worklist of nodes to process. In order to be
- /// pushed onto this worklist, all operands of a node must have already been
- /// processed.
+ /// This defines a worklist of nodes to process. In order to be pushed onto
+ /// this worklist, all operands of a node must have already been processed.
SmallVector<SDNode*, 128> Worklist;
public:
@@ -145,7 +140,7 @@ public:
"Too many value types for ValueTypeActions to hold!");
}
- /// run - This is the main entry point for the type legalizer. This does a
+ /// This is the main entry point for the type legalizer. This does a
/// top-down traversal of the dag, legalizing types as it goes. Returns
/// "true" if it made any changes.
bool run();
@@ -173,9 +168,9 @@ private:
bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
bool CustomWidenLowerNode(SDNode *N, EVT VT);
- /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES
- /// node with the corresponding input operand, except for the result 'ResNo',
- /// for which the corresponding input operand is returned.
+ /// Replace each result of the given MERGE_VALUES node with the corresponding
+ /// input operand, except for the result 'ResNo', for which the corresponding
+ /// input operand is returned.
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
@@ -201,9 +196,9 @@ private:
// Integer Promotion Support: LegalizeIntegerTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetPromotedInteger - Given a processed operand Op which was promoted to a
- /// larger integer type, this returns the promoted value. The low bits of the
- /// promoted value corresponding to the original type are exactly equal to Op.
+ /// Given a processed operand Op which was promoted to a larger integer type,
+ /// this returns the promoted value. The low bits of the promoted value
+ /// corresponding to the original type are exactly equal to Op.
/// The extra bits contain rubbish, so the promoted value may need to be zero-
/// or sign-extended from the original type before it is usable (the helpers
/// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
@@ -218,8 +213,7 @@ private:
}
void SetPromotedInteger(SDValue Op, SDValue Result);
- /// SExtPromotedInteger - Get a promoted operand and sign extend it to the
- /// final size.
+ /// Get a promoted operand and sign extend it to the final size.
SDValue SExtPromotedInteger(SDValue Op) {
EVT OldVT = Op.getValueType();
SDLoc dl(Op);
@@ -228,8 +222,7 @@ private:
DAG.getValueType(OldVT));
}
- /// ZExtPromotedInteger - Get a promoted operand and zero extend it to the
- /// final size.
+ /// Get a promoted operand and zero extend it to the final size.
SDValue ZExtPromotedInteger(SDValue Op) {
EVT OldVT = Op.getValueType();
SDLoc dl(Op);
@@ -322,9 +315,9 @@ private:
// Integer Expansion Support: LegalizeIntegerTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetExpandedInteger - Given a processed operand Op which was expanded into
- /// two integers of half the size, this returns the two halves. The low bits
- /// of Op are exactly equal to the bits of Lo; the high bits exactly equal Hi.
+ /// Given a processed operand Op which was expanded into two integers of half
+ /// the size, this returns the two halves. The low bits of Op are exactly
+ /// equal to the bits of Lo; the high bits exactly equal Hi.
/// For example, if Op is an i64 which was expanded into two i32's, then this
/// method returns the two i32's, with Lo being equal to the lower 32 bits of
/// Op, and Hi being equal to the upper 32 bits.
@@ -362,6 +355,8 @@ private:
void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MINMAX (SDNode *N, SDValue &Lo, SDValue &Hi);
+
void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -388,14 +383,14 @@ private:
SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
- ISD::CondCode &CCCode, SDLoc dl);
+ ISD::CondCode &CCCode, const SDLoc &dl);
//===--------------------------------------------------------------------===//
// Float to Integer Conversion Support: LegalizeFloatTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetSoftenedFloat - Given an operand Op of Float type, returns the integer
- /// if the Op is not supported in target HW and converted to the integer.
+ /// Given an operand Op of Float type, returns the integer if the Op is not
+ /// supported in target HW and converted to the integer.
/// The integer contains exactly the same bits as Op - only the type changed.
/// For example, if Op is an f32 which was softened to an i32, then this method
/// returns an i32, the bits of which coincide with those of Op.
@@ -487,8 +482,8 @@ private:
// Float Expansion Support: LegalizeFloatTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetExpandedFloat - Given a processed operand Op which was expanded into
- /// two floating point values of half the size, this returns the two halves.
+ /// Given a processed operand Op which was expanded into two floating-point
+ /// values of half the size, this returns the two halves.
/// The low bits of Op are exactly equal to the bits of Lo; the high bits
/// exactly equal Hi. For example, if Op is a ppcf128 which was expanded
/// into two f64's, then this method returns the two f64's, with Lo being
@@ -542,8 +537,7 @@ private:
SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
- ISD::CondCode &CCCode, SDLoc dl);
-
+ ISD::CondCode &CCCode, const SDLoc &dl);
//===--------------------------------------------------------------------===//
// Float promotion support: LegalizeFloatTypes.cpp
@@ -586,9 +580,9 @@ private:
// Scalarization Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetScalarizedVector - Given a processed one-element vector Op which was
- /// scalarized to its element type, this returns the element. For example,
- /// if Op is a v1i32, Op = < i32 val >, this method returns val, an i32.
+ /// Given a processed one-element vector Op which was scalarized to its
+ /// element type, this returns the element. For example, if Op is a v1i32,
+ /// Op = < i32 val >, this method returns val, an i32.
SDValue GetScalarizedVector(SDValue Op) {
SDValue &ScalarizedOp = ScalarizedVectors[Op];
RemapValue(ScalarizedOp);
@@ -636,12 +630,12 @@ private:
// Vector Splitting Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetSplitVector - Given a processed vector Op which was split into vectors
- /// of half the size, this method returns the halves. The first elements of
- /// Op coincide with the elements of Lo; the remaining elements of Op coincide
- /// with the elements of Hi: Op is what you would get by concatenating Lo and
- /// Hi. For example, if Op is a v8i32 that was split into two v4i32's, then
- /// this method returns the two v4i32's, with Lo corresponding to the first 4
+ /// Given a processed vector Op which was split into vectors of half the size,
+ /// this method returns the halves. The first elements of Op coincide with the
+ /// elements of Lo; the remaining elements of Op coincide with the elements of
+ /// Hi: Op is what you would get by concatenating Lo and Hi.
+ /// For example, if Op is a v8i32 that was split into two v4i32's, then this
+ /// method returns the two v4i32's, with Lo corresponding to the first 4
/// elements of Op, and Hi to the last 4 elements.
void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
@@ -653,6 +647,7 @@ private:
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -692,12 +687,12 @@ private:
// Vector Widening Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
- /// GetWidenedVector - Given a processed vector Op which was widened into a
- /// larger vector, this method returns the larger vector. The elements of
- /// the returned vector consist of the elements of Op followed by elements
- /// containing rubbish. For example, if Op is a v2i32 that was widened to a
- /// v4i32, then this method returns a v4i32 for which the first two elements
- /// are the same as those of Op, while the last two elements contain rubbish.
+ /// Given a processed vector Op which was widened into a larger vector, this
+ /// method returns the larger vector. The elements of the returned vector
+ /// consist of the elements of Op followed by elements containing rubbish.
+ /// For example, if Op is a v2i32 that was widened to a v4i32, then this
+ /// method returns a v4i32 for which the first two elements are the same as
+ /// those of Op, while the last two elements contain rubbish.
SDValue GetWidenedVector(SDValue Op) {
SDValue &WidenedOp = WidenedVectors[Op];
RemapValue(WidenedOp);
@@ -713,6 +708,7 @@ private:
SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+ SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
@@ -755,29 +751,29 @@ private:
// Vector Widening Utilities Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
- /// Helper GenWidenVectorLoads - Helper function to generate a set of
- /// loads to load a vector with a resulting wider type. It takes
+ /// Helper function to generate a set of loads to load a vector with a
+ /// resulting wider type. It takes:
/// LdChain: list of chains for the load to be generated.
/// Ld: load to widen
SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD);
- /// GenWidenVectorExtLoads - Helper function to generate a set of extension
- /// loads to load a ector with a resulting wider type. It takes
+ /// Helper function to generate a set of extension loads to load a vector with
+ /// a resulting wider type. It takes:
/// LdChain: list of chains for the load to be generated.
/// Ld: load to widen
/// ExtType: extension element type
SDValue GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD, ISD::LoadExtType ExtType);
- /// Helper genWidenVectorStores - Helper function to generate a set of
- /// stores to store a widen vector into non-widen memory
+ /// Helper function to generate a set of stores to store a widen vector into
+ /// non-widen memory.
/// StChain: list of chains for the stores we have generated
/// ST: store of a widen value
void GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
- /// Helper genWidenVectorTruncStores - Helper function to generate a set of
- /// stores to store a truncate widen vector into non-widen memory
+ /// Helper function to generate a set of stores to store a truncate widen
+ /// vector into non-widen memory.
/// StChain: list of chains for the stores we have generated
/// ST: store of a widen value
void GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
@@ -785,8 +781,7 @@ private:
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
- /// When FillWithZeroes is "on" the vector will be widened with
- /// zeroes.
+ /// When FillWithZeroes is "on" the vector will be widened with zeroes.
/// By default, the vector will be widened with undefined values.
SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
@@ -807,8 +802,8 @@ private:
GetExpandedFloat(Op, Lo, Hi);
}
- /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
- /// high parts of the given value.
+ /// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+ /// given value.
void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
// Generic Result Splitting.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 593c346df770..665180e119b7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -170,12 +170,10 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
// Emit a store to the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
- false, false, 0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
// Load the first half from the stack slot.
- Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo,
- false, false, false, 0);
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo);
// Increment the pointer to the other half.
unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
@@ -185,8 +183,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Load the second half from the stack slot.
Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), false,
- false, false, MinAlign(Alignment, IncrementSize));
+ PtrInfo.getWithOffset(IncrementSize),
+ MinAlign(Alignment, IncrementSize));
// Handle endianness of the load.
if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
@@ -263,16 +261,12 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
unsigned Alignment = LD->getAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
- bool isInvariant = LD->isInvariant();
AAMDNodes AAInfo = LD->getAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
- Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Alignment,
- AAInfo);
+ Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), Alignment,
+ LD->getMemOperand()->getFlags(), AAInfo);
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -280,8 +274,8 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -478,8 +472,6 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
SDValue Chain = St->getChain();
SDValue Ptr = St->getBasePtr();
unsigned Alignment = St->getAlignment();
- bool isVolatile = St->isVolatile();
- bool isNonTemporal = St->isNonTemporal();
AAMDNodes AAInfo = St->getAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -491,15 +483,15 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
- Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), Alignment,
+ St->getMemOperand()->getFlags(), AAInfo);
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
Hi = DAG.getStore(Chain, dl, Hi, Ptr,
St->getPointerInfo().getWithOffset(IncrementSize),
- isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize), AAInfo);
+ MinAlign(Alignment, IncrementSize),
+ St->getMemOperand()->getFlags(), AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f61f631e2ff8..3c9cb17b58b2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -358,8 +358,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case TargetLowering::Legal:
break;
case TargetLowering::Custom: {
- SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
- if (Tmp1.getNode()) {
+ if (SDValue Tmp1 = TLI.LowerOperation(Op, DAG)) {
Result = Tmp1;
break;
}
@@ -493,21 +492,26 @@ SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op, bool isSigned) {
SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
- SDLoc dl(Op);
LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
- SDValue Chain = LD->getChain();
- SDValue BasePTR = LD->getBasePtr();
- EVT SrcVT = LD->getMemoryVT();
- ISD::LoadExtType ExtType = LD->getExtensionType();
- SmallVector<SDValue, 8> Vals;
- SmallVector<SDValue, 8> LoadChains;
+ EVT SrcVT = LD->getMemoryVT();
+ EVT SrcEltVT = SrcVT.getScalarType();
unsigned NumElem = SrcVT.getVectorNumElements();
- EVT SrcEltVT = SrcVT.getScalarType();
- EVT DstEltVT = Op.getNode()->getValueType(0).getScalarType();
+ SDValue NewChain;
+ SDValue Value;
if (SrcVT.getVectorNumElements() > 1 && !SrcEltVT.isByteSized()) {
+ SDLoc dl(Op);
+
+ SmallVector<SDValue, 8> Vals;
+ SmallVector<SDValue, 8> LoadChains;
+
+ EVT DstEltVT = LD->getValueType(0).getScalarType();
+ SDValue Chain = LD->getChain();
+ SDValue BasePTR = LD->getBasePtr();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
// When elements in a vector are not byte-addressable, we cannot directly
// load each element by advancing pointer, which could only address bytes.
// Instead, we load all significant words, mask bits off, and concatenate
@@ -531,24 +535,22 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
unsigned LoadBytes = WideBytes;
if (RemainingBytes >= LoadBytes) {
- ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
- LD->getPointerInfo().getWithOffset(Offset),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(),
- MinAlign(LD->getAlignment(), Offset),
- LD->getAAInfo());
+ ScalarLoad =
+ DAG.getLoad(WideVT, dl, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
EVT LoadVT = WideVT;
while (RemainingBytes < LoadBytes) {
LoadBytes >>= 1; // Reduce the load size by half.
LoadVT = EVT::getIntegerVT(*DAG.getContext(), LoadBytes << 3);
}
- ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
- LD->getPointerInfo().getWithOffset(Offset),
- LoadVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- MinAlign(LD->getAlignment(), Offset),
- LD->getAAInfo());
+ ScalarLoad =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Offset), LoadVT,
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
RemainingBytes -= LoadBytes;
@@ -614,29 +616,17 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
}
Vals.push_back(Lo);
}
- } else {
- unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8;
- for (unsigned Idx=0; Idx<NumElem; Idx++) {
- SDValue ScalarLoad = DAG.getExtLoad(ExtType, dl,
- Op.getNode()->getValueType(0).getScalarType(),
- Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
- SrcVT.getScalarType(),
- LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(),
- MinAlign(LD->getAlignment(), Idx * Stride), LD->getAAInfo());
-
- BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
- DAG.getConstant(Stride, dl, BasePTR.getValueType()));
+ NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ Op.getNode()->getValueType(0), Vals);
+ } else {
+ SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG);
- Vals.push_back(ScalarLoad.getValue(0));
- LoadChains.push_back(ScalarLoad.getValue(1));
- }
+ NewChain = Scalarized.getValue(1);
+ Value = Scalarized.getValue(0);
}
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
- Op.getNode()->getValueType(0), Vals);
-
AddLegalizedOperand(Op.getValue(0), Value);
AddLegalizedOperand(Op.getValue(1), NewChain);
@@ -644,54 +634,37 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
}
SDValue VectorLegalizer::ExpandStore(SDValue Op) {
- SDLoc dl(Op);
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
- SDValue Chain = ST->getChain();
- SDValue BasePTR = ST->getBasePtr();
- SDValue Value = ST->getValue();
- EVT StVT = ST->getMemoryVT();
-
- unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
- AAMDNodes AAInfo = ST->getAAInfo();
- unsigned NumElem = StVT.getVectorNumElements();
- // The type of the data we want to save
- EVT RegVT = Value.getValueType();
- EVT RegSclVT = RegVT.getScalarType();
- // The type of data as saved in memory.
+ EVT StVT = ST->getMemoryVT();
EVT MemSclVT = StVT.getScalarType();
-
- // Cast floats into integers
unsigned ScalarSize = MemSclVT.getSizeInBits();
// Round odd types to the next pow of two.
- if (!isPowerOf2_32(ScalarSize))
- ScalarSize = NextPowerOf2(ScalarSize);
-
- // Store Stride in bytes
- unsigned Stride = ScalarSize/8;
- // Extract each of the elements from the original vector
- // and save them into memory individually.
- SmallVector<SDValue, 8> Stores;
- for (unsigned Idx = 0; Idx < NumElem; Idx++) {
- SDValue Ex = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, RegSclVT, Value,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
-
- // This scalar TruncStore may be illegal, but we legalize it later.
- SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
- ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
- isVolatile, isNonTemporal, MinAlign(Alignment, Idx*Stride),
- AAInfo);
-
- BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
- DAG.getConstant(Stride, dl, BasePTR.getValueType()));
-
- Stores.push_back(Store);
+ if (!isPowerOf2_32(ScalarSize)) {
+ // FIXME: This is completely broken and inconsistent with ExpandLoad
+ // handling.
+
+ // For sub-byte element sizes, this ends up with 0 stride between elements,
+ // so the same element just gets re-written to the same location. There seem
+ // to be tests explicitly testing for this broken behavior though.
+
+ LLVMContext &Ctx = *DAG.getContext();
+
+ EVT NewMemVT
+ = EVT::getVectorVT(Ctx,
+ MemSclVT.getIntegerVT(Ctx, NextPowerOf2(ScalarSize)),
+ StVT.getVectorNumElements());
+
+ SDValue NewVectorStore = DAG.getTruncStore(
+ ST->getChain(), SDLoc(Op), ST->getValue(), ST->getBasePtr(),
+ ST->getPointerInfo(), NewMemVT, ST->getAlignment(),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ ST = cast<StoreSDNode>(NewVectorStore.getNode());
}
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+ SDValue TF = TLI.scalarizeVectorStore(ST, DAG);
AddLegalizedOperand(Op, TF);
return TF;
}
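ExpandStore now widens odd element types and then hands the node to TLI.scalarizeVectorStore; conceptually the scalarized form extracts each lane and stores it at a fixed stride from the base pointer. A standalone model of that memory layout (plain C++, illustrative names, not the TargetLowering API):

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>

// Store a 4 x i32 "vector" one lane at a time, advancing by the element
// stride, like the per-element stores the scalarizer emits.
void scalarizedStore(const std::array<uint32_t, 4> &Vec, unsigned char *Base) {
  const unsigned Stride = sizeof(uint32_t);
  for (unsigned Idx = 0; Idx < Vec.size(); ++Idx)
    std::memcpy(Base + Idx * Stride, &Vec[Idx], Stride);
}

int main() {
  std::array<uint32_t, 4> V{1, 2, 3, 4};
  unsigned char Mem[16];
  scalarizedStore(V, Mem);
  uint32_t Third;
  std::memcpy(&Third, Mem + 2 * sizeof(uint32_t), sizeof Third);
  std::cout << Third << '\n'; // 3
}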
@@ -864,10 +837,7 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
int NumSrcElements = SrcVT.getVectorNumElements();
// Build up a zero vector to blend into this one.
- EVT SrcScalarVT = SrcVT.getScalarType();
- SDValue ScalarZero = DAG.getTargetConstant(0, DL, SrcScalarVT);
- SmallVector<SDValue, 4> BuildVectorOperands(NumSrcElements, ScalarZero);
- SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, DL, SrcVT, BuildVectorOperands);
+ SDValue Zero = DAG.getConstant(0, DL, SrcVT);
// Shuffle the incoming lanes into the correct position, and pull all other
// lanes from the zero vector.
@@ -885,16 +855,19 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
}
-SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
- EVT VT = Op.getValueType();
-
- // Generate a byte wise shuffle mask for the BSWAP.
- SmallVector<int, 16> ShuffleMask;
+static void createBSWAPShuffleMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+}
+SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
+ EVT VT = Op.getValueType();
+
+ // Generate a byte wise shuffle mask for the BSWAP.
+ SmallVector<int, 16> ShuffleMask;
+ createBSWAPShuffleMask(VT, ShuffleMask);
EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
// Only emit a shuffle if the mask is legal.
@@ -903,8 +876,7 @@ SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
SDLoc DL(Op);
Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
- Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
- ShuffleMask.data());
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), ShuffleMask);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
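The new createBSWAPShuffleMask helper just lists, for each element, that element's byte indices in reverse order; the BSWAP then becomes a single byte shuffle when that mask is legal. The same loop, rendered as a self-contained program that prints the mask for a two-element vector of 32-bit lanes:

#include <iostream>
#include <vector>

// For each element, emit its byte indices from last to first.
std::vector<int> bswapShuffleMask(int NumElts, int EltBytes) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; ++I)
    for (int J = EltBytes - 1; J >= 0; --J)
      Mask.push_back(I * EltBytes + J);
  return Mask;
}

int main() {
  // v2i32: bytes {0,1,2,3,4,5,6,7} are reordered to {3,2,1,0,7,6,5,4}.
  for (int M : bswapShuffleMask(2, 4))
    std::cout << M << ' ';
  std::cout << '\n';
}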
@@ -915,12 +887,36 @@ SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) {
if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType()))
return DAG.UnrollVectorOp(Op.getNode());
+ // If the vector element width is a whole number of bytes, test if it's legal
+ // to BSWAP shuffle the bytes and then perform the BITREVERSE on the byte
+ // vector. This greatly reduces the number of bit shifts necessary.
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ if (ScalarSizeInBits > 8 && (ScalarSizeInBits % 8) == 0) {
+ SmallVector<int, 16> BSWAPMask;
+ createBSWAPShuffleMask(VT, BSWAPMask);
+
+ EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
+ if (TLI.isShuffleMaskLegal(BSWAPMask, ByteVT) &&
+ (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, ByteVT) ||
+ (TLI.isOperationLegalOrCustom(ISD::SHL, ByteVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, ByteVT)))) {
+ SDLoc DL(Op);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+ BSWAPMask);
+ Op = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Op);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ }
+ }
+
// If we have the appropriate vector bit operations, it is better to use them
// than unrolling and expanding each component.
if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) ||
!TLI.isOperationLegalOrCustom(ISD::SRL, VT) ||
- !TLI.isOperationLegalOrCustom(ISD::AND, VT) ||
- !TLI.isOperationLegalOrCustom(ISD::OR, VT))
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
return DAG.UnrollVectorOp(Op.getNode());
// Let LegalizeDAG handle this later.
@@ -1027,10 +1023,12 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
}
SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) {
- // If the non-ZERO_UNDEF version is supported we can let LegalizeDAG handle.
+ // If the non-ZERO_UNDEF version is supported we can use that instead.
unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ;
- if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType()))
- return Op;
+ if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) {
+ SDLoc DL(Op);
+ return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(0));
+ }
// Otherwise go ahead and unroll.
return DAG.UnrollVectorOp(Op.getNode());
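The ExpandBITREVERSE change earlier in this file first byte-swaps the value with the same shuffle mask and then bit-reverses each byte, which takes far fewer shift/mask steps than reversing the full element width. A scalar, standalone sketch of that decomposition, using portable helpers rather than target nodes:

#include <cstdint>
#include <iostream>

uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) | ((V << 8) & 0x00FF0000u) |
         (V << 24);
}

uint8_t reverseByte(uint8_t B) {
  B = (B & 0xF0) >> 4 | (B & 0x0F) << 4;
  B = (B & 0xCC) >> 2 | (B & 0x33) << 2;
  B = (B & 0xAA) >> 1 | (B & 0x55) << 1;
  return B;
}

// bitreverse(V) == byte swap followed by a bit reversal inside each byte.
uint32_t bitreverse32(uint32_t V) {
  uint32_t Swapped = bswap32(V);
  uint32_t R = 0;
  for (unsigned I = 0; I != 4; ++I)
    R |= uint32_t(reverseByte(uint8_t(Swapped >> (8 * I)))) << (8 * I);
  return R;
}

int main() {
  std::cout << std::hex << bitreverse32(0x00000001u) << '\n'; // 80000000
}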
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0187d36dee2..f3adca49ccfe 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -223,17 +223,13 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
assert(N->isUnindexed() && "Indexed vector load?");
- SDValue Result = DAG.getLoad(ISD::UNINDEXED,
- N->getExtensionType(),
- N->getValueType(0).getVectorElementType(),
- SDLoc(N),
- N->getChain(), N->getBasePtr(),
- DAG.getUNDEF(N->getBasePtr().getValueType()),
- N->getPointerInfo(),
- N->getMemoryVT().getVectorElementType(),
- N->isVolatile(), N->isNonTemporal(),
- N->isInvariant(), N->getOriginalAlignment(),
- N->getAAInfo());
+ SDValue Result = DAG.getLoad(
+ ISD::UNINDEXED, N->getExtensionType(),
+ N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
+ N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
+ N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
+ N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
@@ -370,7 +366,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
// Figure out if the scalar is the LHS or RHS and return it.
SDValue Arg = N->getOperand(2).getOperand(0);
- if (Arg.getOpcode() == ISD::UNDEF)
+ if (Arg.isUndef())
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
return GetScalarizedVector(N->getOperand(Op));
@@ -476,16 +472,16 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
return false;
}
-/// ScalarizeVecOp_BITCAST - If the value to convert is a vector that needs
-/// to be scalarized, it must be <1 x ty>. Convert the element instead.
+/// If the value to convert is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::BITCAST, SDLoc(N),
N->getValueType(0), Elt);
}
-/// ScalarizeVecOp_UnaryOp - If the input is a vector that needs to be
-/// scalarized, it must be <1 x ty>. Do the operation on the element instead.
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
+/// Do the operation on the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
@@ -497,8 +493,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op);
}
-/// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one -
-/// use a BUILD_VECTOR instead.
+/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
@@ -506,9 +501,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops);
}
-/// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to
-/// be scalarized, it must be <1 x ty>, so just return the element, ignoring the
-/// index.
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
+/// so just return the element, ignoring the index.
SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue Res = GetScalarizedVector(N->getOperand(0));
if (Res.getValueType() != N->getValueType(0))
@@ -518,8 +512,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
}
-/// ScalarizeVecOp_VSELECT - If the input condition is a vector that needs to be
-/// scalarized, it must be <1 x i1>, so just convert to a normal ISD::SELECT
+/// If the input condition is a vector that needs to be scalarized, it must be
+/// <1 x i1>, so just convert to a normal ISD::SELECT
/// (still with vector output type since that was acceptable if we got here).
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
@@ -529,29 +523,28 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
N->getOperand(2));
}
-/// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be
-/// scalarized, it must be <1 x ty>. Just store the element.
+/// If the value to store is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Just store the element.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
assert(N->isUnindexed() && "Indexed store of one-element vector?");
assert(OpNo == 1 && "Do not know how to scalarize this operand!");
SDLoc dl(N);
if (N->isTruncatingStore())
- return DAG.getTruncStore(N->getChain(), dl,
- GetScalarizedVector(N->getOperand(1)),
- N->getBasePtr(), N->getPointerInfo(),
- N->getMemoryVT().getVectorElementType(),
- N->isVolatile(), N->isNonTemporal(),
- N->getAlignment(), N->getAAInfo());
+ return DAG.getTruncStore(
+ N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->getMemoryVT().getVectorElementType(), N->getAlignment(),
+ N->getMemOperand()->getFlags(), N->getAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
- N->isVolatile(), N->isNonTemporal(),
- N->getOriginalAlignment(), N->getAAInfo());
+ N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
}
-/// ScalarizeVecOp_FP_ROUND - If the value to round is a vector that needs
-/// to be scalarized, it must be <1 x ty>. Convert the element instead.
+/// If the value to round is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
@@ -564,11 +557,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
// Result Vector Splitting
//===----------------------------------------------------------------------===//
-/// SplitVectorResult - This method is called when the specified result of the
-/// specified node is found to need vector splitting. At this point, the node
-/// may also have invalid operands or may have other results that need
-/// legalization, we just know that (at least) one result needs vector
-/// splitting.
+/// This method is called when the specified result of the specified node is
+/// found to need vector splitting. At this point, the node may also have
+/// invalid operands or may have other results that need legalization, we just
+/// know that (at least) one result needs vector splitting.
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
DEBUG(dbgs() << "Split node result: ";
N->dump(&DAG);
@@ -621,6 +613,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+ break;
+
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CONVERT_RNDSAT:
@@ -664,6 +662,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
@@ -845,23 +845,41 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
- // Spill the vector to the stack.
EVT VecVT = Vec.getValueType();
- EVT SubVecVT = VecVT.getVectorElementType();
+ EVT VecElemVT = VecVT.getVectorElementType();
+ unsigned VecElems = VecVT.getVectorNumElements();
+ unsigned SubElems = SubVec.getValueType().getVectorNumElements();
+
+ // If we know the index is 0, and we know the subvector doesn't cross the
+ // boundary between the halves, we can avoid spilling the vector, and insert
+ // into the lower half of the split vector directly.
+ // TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
+ // the index is constant and there is no boundary crossing. But those cases
+ // don't seem to get hit in practice.
+ if (ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = ConstIdx->getZExtValue();
+ if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
+ return;
+ }
+ }
+
+ // Spill the vector to the stack.
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Store the new subvector into the specified index.
- SDValue SubVecPtr = GetVectorElementPointer(StackPtr, SubVecVT, Idx);
+ SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
- Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(),
- false, false, 0);
+ Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo());
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
- false, false, false, 0);
+ Lo =
+ DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
@@ -871,7 +889,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
- false, false, false, MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize));
}
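The early exit added at the top of SplitVecRes_INSERT_SUBVECTOR skips the stack spill when the subvector lands at index 0 and fits entirely inside the low half: only Lo changes and Hi is left untouched. A tiny standalone illustration of why that is enough, with plain arrays standing in for the split halves:

#include <array>
#include <cstddef>
#include <iostream>

// Insert a subvector at index 0 of the low half; the high half never moves.
template <std::size_t N, std::size_t M>
void insertSubvectorLo(std::array<int, N> &Lo, const std::array<int, M> &Sub) {
  static_assert(M <= N, "subvector must fit in the low half");
  for (std::size_t I = 0; I != M; ++I)
    Lo[I] = Sub[I];
}

int main() {
  std::array<int, 4> Lo{0, 0, 0, 0}, Hi{9, 9, 9, 9};
  std::array<int, 2> Sub{7, 8};
  insertSubvectorLo(Lo, Sub); // Hi is untouched, no spill/reload needed
  std::cout << Lo[0] << Lo[1] << Lo[2] << Lo[3] << ' ' << Hi[0] << '\n'; // 7800 9
}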
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
@@ -917,6 +935,39 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
DAG.getValueType(HiVT));
}
+void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+
+ SDLoc dl(N);
+ SDValue InLo, InHi;
+ GetSplitVector(N0, InLo, InHi);
+ EVT InLoVT = InLo.getValueType();
+ unsigned InNumElements = InLoVT.getVectorNumElements();
+
+ EVT OutLoVT, OutHiVT;
+ std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ unsigned OutNumElements = OutLoVT.getVectorNumElements();
+ assert((2 * OutNumElements) <= InNumElements &&
+ "Illegal extend vector in reg split");
+
+ // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
+ // input vector (i.e. we only use InLo):
+ // OutLo will extend the first OutNumElements from InLo.
+ // OutHi will extend the next OutNumElements from InLo.
+
+ // Shuffle the elements from InLo for OutHi into the bottom elements to
+ // create a 'fake' InHi.
+ SmallVector<int, 8> SplitHi(InNumElements, -1);
+ for (unsigned i = 0; i != OutNumElements; ++i)
+ SplitHi[i] = i + OutNumElements;
+ InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
+
+ Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
+ Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
+}
+
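SplitVecRes_ExtVecInRegOp relies on *_EXTEND_VECTOR_INREG reading only the low lanes of its input: the low result extends the first OutNumElements lanes, and the high result is produced by shuffling the next OutNumElements lanes down into a 'fake' InHi before extending. A standalone model with concrete types (8 x i8 extended in-reg to 4 x i32, then split in half); the helper is illustrative only:

#include <array>
#include <cstdint>
#include <iostream>

// Sign-extend two consecutive i8 lanes starting at First into i32 lanes.
std::array<int32_t, 2> extendLanes(const std::array<int8_t, 8> &In,
                                   unsigned First) {
  return {int32_t(In[First]), int32_t(In[First + 1])};
}

int main() {
  std::array<int8_t, 8> In{-1, 2, -3, 4, 99, 99, 99, 99}; // high lanes unused
  auto Lo = extendLanes(In, 0); // lanes 0..1
  auto Hi = extendLanes(In, 2); // lanes 2..3, the shuffled-down "fake InHi"
  std::cout << Lo[0] << ' ' << Lo[1] << ' ' << Hi[0] << ' ' << Hi[1] << '\n';
}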
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
@@ -947,20 +998,20 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
- Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT,
- false, false, 0);
+ Store =
+ DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT);
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
- false, false, false, 0);
+ Lo =
+ DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
@@ -970,7 +1021,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
- false, false, false, MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize));
}
void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
@@ -995,25 +1046,21 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT MemoryVT = LD->getMemoryVT();
unsigned Alignment = LD->getOriginalAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
- bool isInvariant = LD->isInvariant();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
- LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
- isInvariant, Alignment, AAInfo);
+ LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment,
- AAInfo);
+ LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT,
+ Alignment, MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1062,7 +1109,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
+ getMachineMemOperand(MLD->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
@@ -1074,7 +1121,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
+ getMachineMemOperand(MLD->getPointerInfo(),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
@@ -1131,7 +1178,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
+ getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
@@ -1362,7 +1409,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
SDValue Op1 = InputUsed[1] == -1U ?
DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
// At least one input vector was used. Create a new shuffle vector.
- Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, &Ops[0]);
+ Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
}
Ops.clear();
@@ -1374,10 +1421,10 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
// Operand Vector Splitting
//===----------------------------------------------------------------------===//
-/// SplitVectorOperand - This method is called when the specified operand of the
-/// specified node is found to need vector splitting. At this point, all of the
-/// result types of the node are known to be legal, but other operands of the
-/// node may need legalization as well as the specified one.
+/// This method is called when the specified operand of the specified node is
+/// found to need vector splitting. At this point, all of the result types of
+/// the node are known to be legal, but other operands of the node may need
+/// legalization as well as the specified one.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
DEBUG(dbgs() << "Split node operand: ";
N->dump(&DAG);
@@ -1600,13 +1647,13 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
// Store the vector to the stack.
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Load back the required element.
StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
- MachinePointerInfo(), EltVT, false, false, false, 0);
+ MachinePointerInfo(), EltVT);
}
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
@@ -1646,7 +1693,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
+ getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
@@ -1655,7 +1702,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
OpsLo, MMO);
MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
+ getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(),
MGT->getRanges());
@@ -1688,7 +1735,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
SDLoc DL(N);
-
+
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
@@ -1717,7 +1764,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
+ getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
@@ -1729,7 +1776,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
+ getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
SecondHalfAlignment, N->getAAInfo(), N->getRanges());
@@ -1778,7 +1825,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
+ getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
@@ -1787,7 +1834,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
DL, OpsLo, MMO);
MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
+ getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
@@ -1810,8 +1857,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
SDValue Ptr = N->getBasePtr();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
- bool isVol = N->isVolatile();
- bool isNT = N->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDValue Lo, Hi;
GetSplitVector(N->getOperand(1), Lo, Hi);
@@ -1822,11 +1868,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
if (isTruncating)
- Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
- LoMemVT, isVol, isNT, Alignment, AAInfo);
+ Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
+ Alignment, MMOFlags, AAInfo);
else
- Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
- isVol, isNT, Alignment, AAInfo);
+ Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -1835,11 +1881,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (isTruncating)
Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
- HiMemVT, isVol, isNT, Alignment, AAInfo);
+ HiMemVT, Alignment, MMOFlags, AAInfo);
else
Hi = DAG.getStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
- isVol, isNT, Alignment, AAInfo);
+ Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
@@ -1889,7 +1935,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
EVT OutVT = N->getValueType(0);
unsigned NumElements = OutVT.getVectorNumElements();
bool IsFloat = OutVT.isFloatingPoint();
-
+
// Widening should have already made sure this is a power-two vector
// if we're trying to split it at all. assert() that's true, just in case.
assert(!(NumElements & 1) && "Splitting vector, but not in half!");
@@ -2069,6 +2115,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
Res = WidenVecRes_Shift(N);
break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
+ break;
+
case ISD::ANY_EXTEND:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
@@ -2355,6 +2407,61 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
}
+SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue InOp = N->getOperand(0);
+ SDLoc DL(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenSVT = WidenVT.getVectorElementType();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+ }
+ }
+ }
+
+ // Unroll, extend the scalars and rebuild the vector.
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
+ break;
+ default:
+ llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
+ }
+ Ops.push_back(Val);
+ }
+
+ while (Ops.size() != WidenNumElts)
+ Ops.push_back(DAG.getUNDEF(WidenSVT));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+}
+
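The unroll fallback in WidenVecRes_EXTEND_VECTOR_INREG extends whichever input lanes are available to the wider element type and then pads the result up to the widened lane count with undef. A standalone approximation of that shape, using a sentinel value where the DAG would use undef (names are illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> widenExtend(const std::vector<int8_t> &In,
                                 unsigned WidenNumElts) {
  std::vector<int32_t> Out;
  unsigned E = In.size() < WidenNumElts ? unsigned(In.size()) : WidenNumElts;
  for (unsigned I = 0; I != E; ++I)
    Out.push_back(int32_t(In[I]));  // sign-extend each available lane
  while (Out.size() != WidenNumElts)
    Out.push_back(INT32_MIN);       // stand-in for the undef padding lanes
  return Out;
}

int main() {
  for (int32_t V : widenExtend({-5, 6, -7}, 4))
    std::cout << V << ' ';
  std::cout << '\n'; // -5 6 -7 -2147483648
}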
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
// If this is an FCOPYSIGN with same input types, we can treat it as a
// normal (can trap) binary op.
@@ -2546,7 +2653,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
// The inputs and the result are widen to the same value.
unsigned i;
for (i=1; i < NumOperands; ++i)
- if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+ if (!N->getOperand(i).isUndef())
break;
if (i == NumOperands)
@@ -2564,7 +2671,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
return DAG.getVectorShuffle(WidenVT, dl,
GetWidenedVector(N->getOperand(0)),
GetWidenedVector(N->getOperand(1)),
- &MaskOps[0]);
+ MaskOps);
}
}
}
@@ -2744,7 +2851,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
-
+
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
@@ -2898,7 +3005,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
}
for (unsigned i = NumElts; i != WidenNumElts; ++i)
NewMask.push_back(-1);
- return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, &NewMask[0]);
+ return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
}
SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
@@ -3072,9 +3179,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
- // Since the result is legal and the input is illegal, it is unlikely
- // that we can fix the input to a legal type so unroll the convert
- // into some scalar code and create a nasty build vector.
+ // Since the result is legal and the input is illegal, it is unlikely that we
+ // can fix the input to a legal type so unroll the convert into some scalar
+ // code and create a nasty build vector.
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
@@ -3161,7 +3268,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
- // We have to widen the value but we want only to store the original
+ // We have to widen the value, but we want only to store the original
// vector type.
StoreSDNode *ST = cast<StoreSDNode>(N);
@@ -3189,10 +3296,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
Mask = GetWidenedVector(Mask);
else {
- // The mask should be widened as well
+ // The mask should be widened as well.
EVT BoolVT = getSetCCResultType(WideVal.getValueType());
// We can't use ModifyToType() because we should fill the mask with
- // zeroes
+ // zeroes.
unsigned WidenNumElts = BoolVT.getVectorNumElements();
unsigned MaskNumElts = MaskVT.getVectorNumElements();
@@ -3219,16 +3326,16 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
- // Widen the value
+ // Widen the value.
SDValue WideVal = GetWidenedVector(DataOp);
EVT WideVT = WideVal.getValueType();
unsigned NumElts = WideVal.getValueType().getVectorNumElements();
SDLoc dl(N);
- // The mask should be widened as well
+ // The mask should be widened as well.
Mask = WidenTargetBoolean(Mask, WideVT, true);
- // Widen index
+ // Widen index.
SDValue Index = MSC->getIndex();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
Index.getValueType().getScalarType(),
@@ -3293,7 +3400,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
if (Width == WidenEltWidth)
return RetVT;
- // See if there is larger legal integer than the element type to load/store
+ // See if there is larger legal integer than the element type to load/store.
unsigned VT;
for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
@@ -3355,7 +3462,7 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
NumElts = Width / NewLdTy.getSizeInBits();
NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
- // Readjust position and vector position based on new load type
+ // Readjust position and vector position based on new load type.
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
}
@@ -3368,8 +3475,8 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD) {
- // The strategy assumes that we can efficiently load powers of two widths.
- // The routines chops the vector into the largest vector loads with the same
+ // The strategy assumes that we can efficiently load power-of-two widths.
+ // The routine chops the vector into the largest vector loads with the same
// element type or scalar loads and then recombines it to the widen vector
// type.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
@@ -3380,27 +3487,24 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
// Load information
- SDValue Chain = LD->getChain();
- SDValue BasePtr = LD->getBasePtr();
- unsigned Align = LD->getAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
- bool isInvariant = LD->isInvariant();
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Align = LD->getAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
int LdWidth = LdVT.getSizeInBits();
- int WidthDiff = WidenWidth - LdWidth; // Difference
- unsigned LdAlign = (isVolatile) ? 0 : Align; // Allow wider loads
+ int WidthDiff = WidenWidth - LdWidth;
+ unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads.
// Find the vector type that can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Align,
- AAInfo);
+ Align, MMOFlags, AAInfo);
LdChain.push_back(LdOp.getValue(1));
- // Check if we can load the element with one instruction
+ // Check if we can load the element with one instruction.
if (LdWidth <= NewVTWidth) {
if (!NewVT.isVector()) {
unsigned NumElts = WidenWidth / NewVTWidth;
@@ -3421,7 +3525,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
}
- // Load vector by using multiple loads from largest vector to scalar
+ // Load vector by using multiple loads from largest vector to scalar.
SmallVector<SDValue, 16> LdOps;
LdOps.push_back(LdOp);
@@ -3436,13 +3540,12 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
SDValue L;
if (LdWidth < NewVTWidth) {
- // Our current type we are using is too large, find a better size
+ // The current type we are using is too large. Find a better size.
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
- LD->getPointerInfo().getWithOffset(Offset), isVolatile,
- isNonTemporal, isInvariant, MinAlign(Align, Increment),
- AAInfo);
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Increment), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
if (L->getValueType(0).isVector()) {
SmallVector<SDValue, 16> Loads;
@@ -3456,9 +3559,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
}
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
- LD->getPointerInfo().getWithOffset(Offset), isVolatile,
- isNonTemporal, isInvariant, MinAlign(Align, Increment),
- AAInfo);
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Increment), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
}
@@ -3468,33 +3570,33 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LdWidth -= NewVTWidth;
}
- // Build the vector from the loads operations
+ // Build the vector from the load operations.
unsigned End = LdOps.size();
if (!LdOps[0].getValueType().isVector())
// All the loads are scalar loads.
return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
// If the load contains vectors, build the vector using concat vector.
- // All of the vectors used to loads are power of 2 and the scalars load
- // can be combined to make a power of 2 vector.
+ // All of the vectors used to load are power-of-2, and the scalar loads can be
+ // combined to make a power-of-2 vector.
SmallVector<SDValue, 16> ConcatOps(End);
int i = End - 1;
int Idx = End;
EVT LdTy = LdOps[i].getValueType();
- // First combine the scalar loads to a vector
+ // First, combine the scalar loads to a vector.
if (!LdTy.isVector()) {
for (--i; i >= 0; --i) {
LdTy = LdOps[i].getValueType();
if (LdTy.isVector())
break;
}
- ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End);
+ ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
}
ConcatOps[--Idx] = LdOps[i];
for (--i; i >= 0; --i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
- // Create a larger vector
+ // Create a larger vector.
ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
makeArrayRef(&ConcatOps[Idx], End - Idx));
Idx = End - 1;
@@ -3503,11 +3605,11 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
ConcatOps[--Idx] = LdOps[i];
}
- if (WidenWidth == LdTy.getSizeInBits()*(End - Idx))
+ if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(&ConcatOps[Idx], End - Idx));
- // We need to fill the rest with undefs to build the vector
+ // We need to fill the rest with undefs to build the vector.
unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
SmallVector<SDValue, 16> WidenOps(NumOps);
SDValue UndefVal = DAG.getUNDEF(LdTy);
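GenWidenVectorLoads covers a width that need not be a power of two with a sequence of progressively smaller power-of-two loads, then concatenates the pieces. A standalone sketch of just the chopping step (plain memcpy loads, invented helper name):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Cover Bytes with power-of-two loads, largest first: 12 bytes becomes one
// 8-byte load followed by one 4-byte load.
std::vector<uint64_t> chopLoads(const unsigned char *Ptr, unsigned Bytes) {
  std::vector<uint64_t> Parts;
  unsigned Offset = 0;
  for (unsigned Size = 8; Bytes != 0;) {
    while (Size > Bytes)
      Size >>= 1;               // shrink to the next power of two that fits
    uint64_t V = 0;
    std::memcpy(&V, Ptr + Offset, Size);
    Parts.push_back(V);
    Offset += Size;
    Bytes -= Size;
  }
  return Parts;
}

int main() {
  unsigned char Buf[12];
  for (unsigned I = 0; I != 12; ++I)
    Buf[I] = static_cast<unsigned char>(I);
  std::cout << chopLoads(Buf, 12).size() << '\n'; // 2 loads
}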
@@ -3526,33 +3628,30 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD,
ISD::LoadExtType ExtType) {
// For extension loads, it may not be more efficient to chop up the vector
- // and then extended it. Instead, we unroll the load and build a new vector.
+ // and then extend it. Instead, we unroll the load and build a new vector.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
// Load information
- SDValue Chain = LD->getChain();
- SDValue BasePtr = LD->getBasePtr();
- unsigned Align = LD->getAlignment();
- bool isVolatile = LD->isVolatile();
- bool isNonTemporal = LD->isNonTemporal();
- bool isInvariant = LD->isInvariant();
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Align = LD->getAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
unsigned NumElts = LdVT.getVectorNumElements();
- // Load each element and widen
+ // Load each element and widen.
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Increment = LdEltVT.getSizeInBits() / 8;
- Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
- LD->getPointerInfo(),
- LdEltVT, isVolatile, isNonTemporal, isInvariant,
- Align, AAInfo);
+ Ops[0] =
+ DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
+ LdEltVT, Align, MMOFlags, AAInfo);
LdChain.push_back(Ops[0].getValue(1));
unsigned i = 0, Offset = Increment;
for (i=1; i < NumElts; ++i, Offset += Increment) {
@@ -3562,12 +3661,11 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
BasePtr.getValueType()));
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
- isVolatile, isNonTemporal, isInvariant, Align,
- AAInfo);
+ Align, MMOFlags, AAInfo);
LdChain.push_back(Ops[i].getValue(1));
}
- // Fill the rest with undefs
+ // Fill the rest with undefs.
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
@@ -3578,14 +3676,13 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
- // The strategy assumes that we can efficiently store powers of two widths.
- // The routines chops the vector into the largest vector stores with the same
+ // The strategy assumes that we can efficiently store power-of-two widths.
+ // The routine chops the vector into the largest vector stores with the same
// element type or scalar stores.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
unsigned Align = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
@@ -3601,7 +3698,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
int Idx = 0; // current index to store
unsigned Offset = 0; // offset from base to store
while (StWidth != 0) {
- // Find the largest vector type we can store with
+ // Find the largest vector type we can store with.
EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
unsigned NewVTWidth = NewVT.getSizeInBits();
unsigned Increment = NewVTWidth / 8;
@@ -3611,10 +3708,9 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
SDValue EOp = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
- ST->getPointerInfo().getWithOffset(Offset),
- isVolatile, isNonTemporal,
- MinAlign(Align, Offset), AAInfo));
+ StChain.push_back(DAG.getStore(
+ Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Offset), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
@@ -3623,28 +3719,27 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
} else {
- // Cast the vector to the scalar type we can store
+ // Cast the vector to the scalar type we can store.
unsigned NumElts = ValWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
- // Readjust index position based on new vector type
+ // Readjust index position based on new vector type.
Idx = Idx * ValEltWidth / NewVTWidth;
do {
SDValue EOp = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
DAG.getConstant(Idx++, dl,
TLI.getVectorIdxTy(DAG.getDataLayout())));
- StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
- ST->getPointerInfo().getWithOffset(Offset),
- isVolatile, isNonTemporal,
- MinAlign(Align, Offset), AAInfo));
+ StChain.push_back(DAG.getStore(
+ Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Offset), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Increment, dl,
BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
- // Restore index back to be relative to the original widen element type
+ // Restore index back to be relative to the original widen element type.
Idx = Idx * NewVTWidth / ValEltWidth;
}
}
@@ -3654,27 +3749,25 @@ void
DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// For extension loads, it may not be more efficient to truncate the vector
- // and then store it. Instead, we extract each element and then store it.
- SDValue Chain = ST->getChain();
- SDValue BasePtr = ST->getBasePtr();
+ // and then store it. Instead, we extract each element and then store it.
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
unsigned Align = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
- SDValue ValOp = GetWidenedVector(ST->getValue());
+ SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
EVT ValVT = ValOp.getValueType();
- // It must be true that we the widen vector type is bigger than where
- // we need to store.
+ // It must be true that the wide vector type is bigger than where we need to
+ // store.
assert(StVT.isVector() && ValOp.getValueType().isVector());
assert(StVT.bitsLT(ValOp.getValueType()));
- // For truncating stores, we can not play the tricks of chopping legal
- // vector types and bit cast it to the right type. Instead, we unroll
- // the store.
+ // For truncating stores, we cannot play the trick of chopping legal vector
+ // types and bitcasting to the right type. Instead, we unroll the store.
EVT StEltVT = StVT.getVectorElementType();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
@@ -3683,9 +3776,8 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
- ST->getPointerInfo(), StEltVT,
- isVolatile, isNonTemporal, Align,
- AAInfo));
+ ST->getPointerInfo(), StEltVT, Align,
+ MMOFlags, AAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
@@ -3695,10 +3787,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
SDValue EOp = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
- ST->getPointerInfo().getWithOffset(Offset),
- StEltVT, isVolatile, isNonTemporal,
- MinAlign(Align, Offset), AAInfo));
+ StChain.push_back(DAG.getTruncStore(
+ Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ StEltVT, MinAlign(Align, Offset), MMOFlags, AAInfo));
}
}
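GenWidenVectorTruncStores unrolls the truncating store: each lane is extracted, truncated to the narrower memory element type, and stored at the next element offset. A standalone model with concrete types (4 x i32 truncated to 4 x i8 in memory; names are illustrative):

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>

// Truncate each 32-bit lane to 8 bits and store it at Base + lane index.
void truncStoreV4i32ToI8(const std::array<int32_t, 4> &Val,
                         unsigned char *Base) {
  for (unsigned I = 0; I != 4; ++I) {
    uint8_t Trunc = static_cast<uint8_t>(Val[I]); // keep the low 8 bits
    std::memcpy(Base + I, &Trunc, 1);
  }
}

int main() {
  unsigned char Mem[4];
  truncStoreV4i32ToI8({0x101, 0x202, 0x303, 0x404}, Mem);
  std::cout << int(Mem[0]) << ' ' << int(Mem[3]) << '\n'; // 1 4
}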
diff --git a/lib/CodeGen/SelectionDAG/Makefile b/lib/CodeGen/SelectionDAG/Makefile
deleted file mode 100644
index ea716fdaabba..000000000000
--- a/lib/CodeGen/SelectionDAG/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/CodeGen/SelectionDAG/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMSelectionDAG
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 622e06f0da2a..1e5c4a73693f 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -37,7 +37,7 @@ static cl::opt<bool> DisableDFASched("disable-dfa-sched", cl::Hidden,
cl::ZeroOrMore, cl::init(false),
cl::desc("Disable use of DFA during scheduling"));
-static cl::opt<signed> RegPressureThreshold(
+static cl::opt<int> RegPressureThreshold(
"dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5),
cl::desc("Track reg pressure and switch priority to in-depth"));
@@ -323,8 +323,8 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) {
}
}
-signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
- signed RegBalance = 0;
+int ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
+ int RegBalance = 0;
if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
return RegBalance;
@@ -357,8 +357,8 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
/// The RawPressure flag makes this function to ignore
/// existing reg file sizes, and report raw def/use
/// balance.
-signed ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
- signed RegBalance = 0;
+int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
+ int RegBalance = 0;
if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
return RegBalance;
@@ -398,9 +398,9 @@ static const unsigned FactorOne = 2;
/// Returns single number reflecting benefit of scheduling SU
/// in the current cycle.
-signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
+int ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
// Initial trivial priority.
- signed ResCount = 1;
+ int ResCount = 1;
// Do not waste time on a node that is already scheduled.
if (SU->isScheduled)
@@ -601,7 +601,7 @@ SUnit *ResourcePriorityQueue::pop() {
std::vector<SUnit *>::iterator Best = Queue.begin();
if (!DisableDFASched) {
- signed BestCost = SUSchedulingCost(*Best);
+ int BestCost = SUSchedulingCost(*Best);
for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
E = Queue.end(); I != E; ++I) {
diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
index c27f8de601f2..237d541b4cb9 100644
--- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
-#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/DataTypes.h"
+#include <utility>
namespace llvm {
@@ -56,7 +56,8 @@ public:
// Constructor for non-constants.
SDDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool indir,
uint64_t off, DebugLoc dl, unsigned O)
- : Var(Var), Expr(Expr), Offset(off), DL(dl), Order(O), IsIndirect(indir) {
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(indir) {
kind = SDNODE;
u.s.Node = N;
u.s.ResNo = R;
@@ -65,7 +66,8 @@ public:
// Constructor for constants.
SDDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t off,
DebugLoc dl, unsigned O)
- : Var(Var), Expr(Expr), Offset(off), DL(dl), Order(O), IsIndirect(false) {
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(false) {
kind = CONST;
u.Const = C;
}
@@ -73,7 +75,8 @@ public:
// Constructor for frame indices.
SDDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t off, DebugLoc dl,
unsigned O)
- : Var(Var), Expr(Expr), Offset(off), DL(dl), Order(O), IsIndirect(false) {
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(false) {
kind = FRAMEIX;
u.FrameIx = FI;
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 91024e672f9c..802c459a0223 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -336,8 +336,8 @@ void ScheduleDAGRRList::Schedule() {
// Build the scheduling graph.
BuildSchedGraph(nullptr);
- DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this));
+ DEBUG(for (SUnit &SU : SUnits)
+ SU.dumpAll(this));
Topo.InitDAGTopologicalSorting();
AvailableQueue->initNodes(SUnits);
@@ -1027,43 +1027,37 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
SmallVector<SDep, 4> LoadPreds;
SmallVector<SDep, 4> NodePreds;
SmallVector<SDep, 4> NodeSuccs;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
- ChainPreds.push_back(*I);
- else if (isOperandOf(I->getSUnit(), LoadNode))
- LoadPreds.push_back(*I);
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPreds.push_back(Pred);
+ else if (isOperandOf(Pred.getSUnit(), LoadNode))
+ LoadPreds.push_back(Pred);
else
- NodePreds.push_back(*I);
+ NodePreds.push_back(Pred);
}
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl())
- ChainSuccs.push_back(*I);
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
else
- NodeSuccs.push_back(*I);
+ NodeSuccs.push_back(Succ);
}
// Now assign edges to the newly-created nodes.
- for (unsigned i = 0, e = ChainPreds.size(); i != e; ++i) {
- const SDep &Pred = ChainPreds[i];
+ for (const SDep &Pred : ChainPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
- for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
- const SDep &Pred = LoadPreds[i];
+ for (const SDep &Pred : LoadPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
- for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
- const SDep &Pred = NodePreds[i];
+ for (const SDep &Pred : NodePreds) {
RemovePred(SU, Pred);
AddPred(NewSU, Pred);
}
- for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
- SDep D = NodeSuccs[i];
+ for (SDep D : NodeSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
@@ -1074,8 +1068,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
&& !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
--NewSU->NumRegDefsLeft;
}
- for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
- SDep D = ChainSuccs[i];
+ for (SDep D : ChainSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
@@ -1108,29 +1101,27 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
NewSU = CreateClone(SU);
// New SUnit has the exact same predecessors.
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I)
- if (!I->isArtificial())
- AddPred(NewSU, *I);
+ for (SDep &Pred : SU->Preds)
+ if (!Pred.isArtificial())
+ AddPred(NewSU, Pred);
// Only copy scheduled successors. Cut them from old node's successor
// list and move them over.
SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isArtificial())
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
continue;
- SUnit *SuccSU = I->getSUnit();
+ SUnit *SuccSU = Succ.getSUnit();
if (SuccSU->isScheduled) {
- SDep D = *I;
+ SDep D = Succ;
D.setSUnit(NewSU);
AddPred(SuccSU, D);
D.setSUnit(SU);
DelDeps.push_back(std::make_pair(SuccSU, D));
}
}
- for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
- RemovePred(DelDeps[i].first, DelDeps[i].second);
+ for (auto &DelDep : DelDeps)
+ RemovePred(DelDep.first, DelDep.second);
AvailableQueue->updateNode(SU);
AvailableQueue->addNode(NewSU);
@@ -1156,16 +1147,15 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
// Only copy scheduled successors. Cut them from old node's successor
// list and move them over.
SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isArtificial())
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
continue;
- SUnit *SuccSU = I->getSUnit();
+ SUnit *SuccSU = Succ.getSUnit();
if (SuccSU->isScheduled) {
- SDep D = *I;
+ SDep D = Succ;
D.setSUnit(CopyToSU);
AddPred(SuccSU, D);
- DelDeps.push_back(std::make_pair(SuccSU, *I));
+ DelDeps.push_back(std::make_pair(SuccSU, Succ));
}
else {
// Avoid scheduling the def-side copy before other successors. Otherwise
@@ -1174,8 +1164,8 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
}
}
- for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
- RemovePred(DelDeps[i].first, DelDeps[i].second);
+ for (auto &DelDep : DelDeps)
+ RemovePred(DelDep.first, DelDep.second);
SDep FromDep(SU, SDep::Data, Reg);
FromDep.setLatency(SU->Latency);
@@ -1400,16 +1390,14 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
// All candidates are delayed due to live physical reg dependencies.
// Try backtracking, code duplication, or inserting cross class copies
// to resolve it.
- for (unsigned i = 0, e = Interferences.size(); i != e; ++i) {
- SUnit *TrySU = Interferences[i];
+ for (SUnit *TrySU : Interferences) {
SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
// Try unscheduling up to the point where it's safe to schedule
// this node.
SUnit *BtSU = nullptr;
unsigned LiveCycle = UINT_MAX;
- for (unsigned j = 0, ee = LRegs.size(); j != ee; ++j) {
- unsigned Reg = LRegs[j];
+ for (unsigned Reg : LRegs) {
if (LiveRegGens[Reg]->getHeight() < LiveCycle) {
BtSU = LiveRegGens[Reg];
LiveCycle = BtSU->getHeight();
@@ -1854,10 +1842,9 @@ CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
return SethiUllmanNumber;
unsigned Extra = 0;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue; // ignore chain preds
- SUnit *PredSU = I->getSUnit();
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
if (PredSethiUllman > SethiUllmanNumber) {
SethiUllmanNumber = PredSethiUllman;
@@ -1879,8 +1866,8 @@ CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
void RegReductionPQBase::CalculateSethiUllmanNumbers() {
SethiUllmanNumbers.assign(SUnits->size(), 0);
- for (unsigned i = 0, e = SUnits->size(); i != e; ++i)
- CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
+ for (const SUnit &SU : *SUnits)
+ CalcNodeSethiUllmanNumber(&SU, SethiUllmanNumbers);
}
void RegReductionPQBase::addNode(const SUnit *SU) {
@@ -1956,11 +1943,10 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
if (!TLI)
return false;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
continue;
- SUnit *PredSU = I->getSUnit();
+ SUnit *PredSU = Pred.getSUnit();
// NumRegDefsLeft is zero when enough uses of this node have been scheduled
// to cover the number of registers defined (they are all live).
if (PredSU->NumRegDefsLeft == 0) {
@@ -2006,11 +1992,10 @@ bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
LiveUses = 0;
int PDiff = 0;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
continue;
- SUnit *PredSU = I->getSUnit();
+ SUnit *PredSU = Pred.getSUnit();
// NumRegDefsLeft is zero when enough uses of this node have been scheduled
// to cover the number of registers defined (they are all live).
if (PredSU->NumRegDefsLeft == 0) {
@@ -2050,11 +2035,10 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
if (!SU->getNode())
return;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
continue;
- SUnit *PredSU = I->getSUnit();
+ SUnit *PredSU = Pred.getSUnit();
// NumRegDefsLeft is zero when enough uses of this node have been scheduled
// to cover the number of registers defined (they are all live).
if (PredSU->NumRegDefsLeft == 0) {
@@ -2132,11 +2116,10 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
return;
}
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
continue;
- SUnit *PredSU = I->getSUnit();
+ SUnit *PredSU = Pred.getSUnit();
// NumSuccsLeft counts all deps. Don't compare it with NumSuccs which only
// counts data deps.
if (PredSU->NumSuccsLeft != PredSU->Succs.size())
@@ -2201,15 +2184,14 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
/// closest to the current cycle.
static unsigned closestSucc(const SUnit *SU) {
unsigned MaxHeight = 0;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl()) continue; // ignore chain succs
- unsigned Height = I->getSUnit()->getHeight();
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue; // ignore chain succs
+ unsigned Height = Succ.getSUnit()->getHeight();
// If there are bunch of CopyToRegs stacked up, they should be considered
// to be at the same position.
- if (I->getSUnit()->getNode() &&
- I->getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
- Height = closestSucc(I->getSUnit())+1;
+ if (Succ.getSUnit()->getNode() &&
+ Succ.getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+ Height = closestSucc(Succ.getSUnit())+1;
if (Height > MaxHeight)
MaxHeight = Height;
}
@@ -2220,9 +2202,8 @@ static unsigned closestSucc(const SUnit *SU) {
/// for scratch registers, i.e. number of data dependencies.
static unsigned calcMaxScratches(const SUnit *SU) {
unsigned Scratches = 0;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue; // ignore chain preds
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
Scratches++;
}
return Scratches;
@@ -2232,10 +2213,9 @@ static unsigned calcMaxScratches(const SUnit *SU) {
/// CopyFromReg from a virtual register.
static bool hasOnlyLiveInOpers(const SUnit *SU) {
bool RetVal = false;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue;
- const SUnit *PredSU = I->getSUnit();
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ const SUnit *PredSU = Pred.getSUnit();
if (PredSU->getNode() &&
PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
unsigned Reg =
@@ -2255,10 +2235,9 @@ static bool hasOnlyLiveInOpers(const SUnit *SU) {
/// it has no other use. It should be scheduled closer to the terminator.
static bool hasOnlyLiveOutUses(const SUnit *SU) {
bool RetVal = false;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl()) continue;
- const SUnit *SuccSU = I->getSUnit();
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue;
+ const SUnit *SuccSU = Succ.getSUnit();
if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
unsigned Reg =
cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
@@ -2293,10 +2272,9 @@ static void initVRegCycle(SUnit *SU) {
SU->isVRegCycle = true;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue;
- I->getSUnit()->isVRegCycle = true;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ Pred.getSUnit()->isVRegCycle = true;
}
}
@@ -2306,14 +2284,13 @@ static void resetVRegCycle(SUnit *SU) {
if (!SU->isVRegCycle)
return;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue; // ignore chain preds
- SUnit *PredSU = I->getSUnit();
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
if (PredSU->isVRegCycle) {
assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
"VRegCycle def must be CopyFromReg");
- I->getSUnit()->isVRegCycle = 0;
+ Pred.getSUnit()->isVRegCycle = false;
}
}
}
@@ -2325,11 +2302,10 @@ static bool hasVRegCycleUse(const SUnit *SU) {
if (SU->isVRegCycle)
return false;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl()) continue; // ignore chain preds
- if (I->getSUnit()->isVRegCycle &&
- I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ if (Pred.getSUnit()->isVRegCycle &&
+ Pred.getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
return true;
}
@@ -2684,11 +2660,9 @@ void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
CalculateSethiUllmanNumbers();
// For single block loops, mark nodes that look like canonical IV increments.
- if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) {
- for (unsigned i = 0, e = sunits.size(); i != e; ++i) {
- initVRegCycle(&sunits[i]);
- }
- }
+ if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB))
+ for (SUnit &SU : sunits)
+ initVRegCycle(&SU);
}
//===----------------------------------------------------------------------===//
@@ -2726,16 +2700,15 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
if(!ImpDefs && !RegMask)
return false;
- for (SUnit::const_succ_iterator SI = SU->Succs.begin(), SE = SU->Succs.end();
- SI != SE; ++SI) {
- SUnit *SuccSU = SI->getSUnit();
- for (SUnit::const_pred_iterator PI = SuccSU->Preds.begin(),
- PE = SuccSU->Preds.end(); PI != PE; ++PI) {
- if (!PI->isAssignedRegDep())
+ for (const SDep &Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ for (const SDep &SuccPred : SuccSU->Preds) {
+ if (!SuccPred.isAssignedRegDep())
continue;
- if (RegMask && MachineOperand::clobbersPhysReg(RegMask, PI->getReg()) &&
- scheduleDAG->IsReachable(DepSU, PI->getSUnit()))
+ if (RegMask &&
+ MachineOperand::clobbersPhysReg(RegMask, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
return true;
if (ImpDefs)
@@ -2743,8 +2716,8 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
// Return true if SU clobbers this physical register use and the
// definition of the register reaches from DepSU. IsReachable queries
// a topological forward sort of the DAG (following the successors).
- if (TRI->regsOverlap(*ImpDef, PI->getReg()) &&
- scheduleDAG->IsReachable(DepSU, PI->getSUnit()))
+ if (TRI->regsOverlap(*ImpDef, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
return true;
}
}
@@ -2823,19 +2796,18 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
///
void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
// Visit all the nodes in topological order, working top-down.
- for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
- SUnit *SU = &(*SUnits)[i];
+ for (SUnit &SU : *SUnits) {
// For now, only look at nodes with no data successors, such as stores.
// These are especially important, due to the heuristics in
// getNodePriority for nodes with no data successors.
- if (SU->NumSuccs != 0)
+ if (SU.NumSuccs != 0)
continue;
// For now, only look at nodes with exactly one data predecessor.
- if (SU->NumPreds != 1)
+ if (SU.NumPreds != 1)
continue;
// Avoid prescheduling copies to virtual registers, which don't behave
// like other nodes from the perspective of scheduling heuristics.
- if (SDNode *N = SU->getNode())
+ if (SDNode *N = SU.getNode())
if (N->getOpcode() == ISD::CopyToReg &&
TargetRegisterInfo::isVirtualRegister
(cast<RegisterSDNode>(N->getOperand(1))->getReg()))
@@ -2843,10 +2815,9 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
// Locate the single data predecessor.
SUnit *PredSU = nullptr;
- for (SUnit::const_pred_iterator II = SU->Preds.begin(),
- EE = SU->Preds.end(); II != EE; ++II)
- if (!II->isCtrl()) {
- PredSU = II->getSUnit();
+ for (const SDep &Pred : SU.Preds)
+ if (!Pred.isCtrl()) {
+ PredSU = Pred.getSUnit();
break;
}
assert(PredSU);
@@ -2860,44 +2831,43 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
continue;
// Avoid prescheduling to copies from virtual registers, which don't behave
// like other nodes from the perspective of scheduling heuristics.
- if (SDNode *N = SU->getNode())
+ if (SDNode *N = SU.getNode())
if (N->getOpcode() == ISD::CopyFromReg &&
TargetRegisterInfo::isVirtualRegister
(cast<RegisterSDNode>(N->getOperand(1))->getReg()))
continue;
// Perform checks on the successors of PredSU.
- for (SUnit::const_succ_iterator II = PredSU->Succs.begin(),
- EE = PredSU->Succs.end(); II != EE; ++II) {
- SUnit *PredSuccSU = II->getSUnit();
- if (PredSuccSU == SU) continue;
+ for (const SDep &PredSucc : PredSU->Succs) {
+ SUnit *PredSuccSU = PredSucc.getSUnit();
+ if (PredSuccSU == &SU) continue;
// If PredSU has another successor with no data successors, for
// now don't attempt to choose either over the other.
if (PredSuccSU->NumSuccs == 0)
goto outer_loop_continue;
// Don't break physical register dependencies.
- if (SU->hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
- if (canClobberPhysRegDefs(PredSuccSU, SU, TII, TRI))
+ if (SU.hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+ if (canClobberPhysRegDefs(PredSuccSU, &SU, TII, TRI))
goto outer_loop_continue;
// Don't introduce graph cycles.
- if (scheduleDAG->IsReachable(SU, PredSuccSU))
+ if (scheduleDAG->IsReachable(&SU, PredSuccSU))
goto outer_loop_continue;
}
// Ok, the transformation is safe and the heuristics suggest it is
// profitable. Update the graph.
- DEBUG(dbgs() << " Prescheduling SU #" << SU->NodeNum
+ DEBUG(dbgs() << " Prescheduling SU #" << SU.NodeNum
<< " next to PredSU #" << PredSU->NodeNum
<< " to guide scheduling in the presence of multiple uses\n");
for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
SDep Edge = PredSU->Succs[i];
assert(!Edge.isAssignedRegDep());
SUnit *SuccSU = Edge.getSUnit();
- if (SuccSU != SU) {
+ if (SuccSU != &SU) {
Edge.setSUnit(PredSU);
scheduleDAG->RemovePred(SuccSU, Edge);
- scheduleDAG->AddPred(SU, Edge);
- Edge.setSUnit(SU);
+ scheduleDAG->AddPred(&SU, Edge);
+ Edge.setSUnit(&SU);
scheduleDAG->AddPred(SuccSU, Edge);
--i;
}
@@ -2914,16 +2884,15 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
/// If both are two-address, but one is commutable while the other is not
/// commutable, favor the one that's not commutable.
void RegReductionPQBase::AddPseudoTwoAddrDeps() {
- for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
- SUnit *SU = &(*SUnits)[i];
- if (!SU->isTwoAddress)
+ for (SUnit &SU : *SUnits) {
+ if (!SU.isTwoAddress)
continue;
- SDNode *Node = SU->getNode();
- if (!Node || !Node->isMachineOpcode() || SU->getNode()->getGluedNode())
+ SDNode *Node = SU.getNode();
+ if (!Node || !Node->isMachineOpcode() || SU.getNode()->getGluedNode())
continue;
- bool isLiveOut = hasOnlyLiveOutUses(SU);
+ bool isLiveOut = hasOnlyLiveOutUses(&SU);
unsigned Opc = Node->getMachineOpcode();
const MCInstrDesc &MCID = TII->get(Opc);
unsigned NumRes = MCID.getNumDefs();
@@ -2931,21 +2900,22 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
for (unsigned j = 0; j != NumOps; ++j) {
if (MCID.getOperandConstraint(j+NumRes, MCOI::TIED_TO) == -1)
continue;
- SDNode *DU = SU->getNode()->getOperand(j).getNode();
+ SDNode *DU = SU.getNode()->getOperand(j).getNode();
if (DU->getNodeId() == -1)
continue;
const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
- if (!DUSU) continue;
- for (SUnit::const_succ_iterator I = DUSU->Succs.begin(),
- E = DUSU->Succs.end(); I != E; ++I) {
- if (I->isCtrl()) continue;
- SUnit *SuccSU = I->getSUnit();
- if (SuccSU == SU)
+ if (!DUSU)
+ continue;
+ for (const SDep &Succ : DUSU->Succs) {
+ if (Succ.isCtrl())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU == &SU)
continue;
// Be conservative. Ignore if nodes aren't at roughly the same
// depth and height.
- if (SuccSU->getHeight() < SU->getHeight() &&
- (SU->getHeight() - SuccSU->getHeight()) > 1)
+ if (SuccSU->getHeight() < SU.getHeight() &&
+ (SU.getHeight() - SuccSU->getHeight()) > 1)
continue;
// Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
// constrains whatever is using the copy, instead of the copy
@@ -2961,8 +2931,8 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
continue;
// Don't constrain nodes with physical register defs if the
// predecessor can clobber them.
- if (SuccSU->hasPhysRegDefs && SU->hasPhysRegClobbers) {
- if (canClobberPhysRegDefs(SuccSU, SU, TII, TRI))
+ if (SuccSU->hasPhysRegDefs && SU.hasPhysRegClobbers) {
+ if (canClobberPhysRegDefs(SuccSU, &SU, TII, TRI))
continue;
}
// Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
@@ -2972,14 +2942,14 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
SuccOpc == TargetOpcode::INSERT_SUBREG ||
SuccOpc == TargetOpcode::SUBREG_TO_REG)
continue;
- if (!canClobberReachingPhysRegUse(SuccSU, SU, scheduleDAG, TII, TRI) &&
+ if (!canClobberReachingPhysRegUse(SuccSU, &SU, scheduleDAG, TII, TRI) &&
(!canClobber(SuccSU, DUSU) ||
(isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
- (!SU->isCommutable && SuccSU->isCommutable)) &&
- !scheduleDAG->IsReachable(SuccSU, SU)) {
+ (!SU.isCommutable && SuccSU->isCommutable)) &&
+ !scheduleDAG->IsReachable(SuccSU, &SU)) {
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
- << SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
- scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Artificial));
+ << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+ scheduleDAG->AddPred(&SU, SDep(SuccSU, SDep::Artificial));
}
}
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 2a6c853a1d11..3be622f8c179 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -321,7 +321,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
// Add all nodes in depth first order.
SmallVector<SDNode*, 64> Worklist;
- SmallPtrSet<SDNode*, 64> Visited;
+ SmallPtrSet<SDNode*, 32> Visited;
Worklist.push_back(DAG->getRoot().getNode());
Visited.insert(DAG->getRoot().getNode());
@@ -750,7 +750,7 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
return;
}
- Orders.push_back(std::make_pair(Order, std::prev(Emitter.getInsertPos())));
+ Orders.push_back(std::make_pair(Order, &*std::prev(Emitter.getInsertPos())));
ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 893871f94485..8235522b14bd 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -33,7 +34,6 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -46,7 +46,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cmath>
@@ -94,8 +93,22 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT,
// ISD Namespace
//===----------------------------------------------------------------------===//
-/// isBuildVectorAllOnes - Return true if the specified node is a
-/// BUILD_VECTOR where all of the elements are ~0 or undef.
+bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
+ auto *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+ APInt SplatUndef;
+ unsigned SplatBitSize;
+ bool HasUndefs;
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs) &&
+ EltVT.getSizeInBits() >= SplatBitSize;
+}
+
+// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
+// specializations of the more general isConstantSplatVector()?
+
bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
@@ -106,7 +119,7 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) {
unsigned i = 0, e = N->getNumOperands();
// Skip over all of the undef values.
- while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+ while (i != e && N->getOperand(i).isUndef())
++i;
// Do not accept an all-undef vector.
@@ -135,15 +148,11 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// undefs. Even with the above element type twiddling, this should be OK, as
// the same type legalization should have applied to all the elements.
for (++i; i != e; ++i)
- if (N->getOperand(i) != NotZero &&
- N->getOperand(i).getOpcode() != ISD::UNDEF)
+ if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
return false;
return true;
}
-
-/// isBuildVectorAllZeros - Return true if the specified node is a
-/// BUILD_VECTOR where all of the elements are 0 or undef.
bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
@@ -153,7 +162,7 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
bool IsAllUndef = true;
for (const SDValue &Op : N->op_values()) {
- if (Op.getOpcode() == ISD::UNDEF)
+ if (Op.isUndef())
continue;
IsAllUndef = false;
// Do not accept build_vectors that aren't all constants or which have non-0
@@ -181,14 +190,12 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
return true;
}
-/// \brief Return true if the specified node is a BUILD_VECTOR node of
-/// all ConstantSDNode or undef.
bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
- if (Op.getOpcode() == ISD::UNDEF)
+ if (Op.isUndef())
continue;
if (!isa<ConstantSDNode>(Op))
return false;
@@ -196,14 +203,12 @@ bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
return true;
}
-/// \brief Return true if the specified node is a BUILD_VECTOR node of
-/// all ConstantFPSDNode or undef.
bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
- if (Op.getOpcode() == ISD::UNDEF)
+ if (Op.isUndef())
continue;
if (!isa<ConstantFPSDNode>(Op))
return false;
@@ -211,8 +216,6 @@ bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
return true;
}
-/// allOperandsUndef - Return true if the node has at least one operand
-/// and all operands of the specified node are ISD::UNDEF.
bool ISD::allOperandsUndef(const SDNode *N) {
// Return false if the node has no operands.
// This is "logically inconsistent" with the definition of "all" but
@@ -221,7 +224,7 @@ bool ISD::allOperandsUndef(const SDNode *N) {
return false;
for (const SDValue &Op : N->op_values())
- if (Op.getOpcode() != ISD::UNDEF)
+ if (!Op.isUndef())
return false;
return true;
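The predicates touched above (isBuildVectorAllOnes, isBuildVectorAllZeros, isBuildVectorOfConstantSDNodes, allOperandsUndef) share one rule the diff keeps rewriting in terms of SDValue::isUndef(): undef lanes are skipped, but a vector made entirely of undefs is rejected. A rough standalone sketch of that rule, with a hypothetical Lane type standing in for BUILD_VECTOR operands (not the SelectionDAG types):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // A lane is either undef or a concrete constant value.
    struct Lane {
      bool IsUndef;
      uint64_t Value; // meaningful only when !IsUndef
    };

    // "All ones, ignoring undefs" in the spirit of ISD::isBuildVectorAllOnes:
    // skip undef lanes, require every defined lane to equal AllOnes, and reject
    // a vector with no defined lanes at all.
    bool isAllOnesIgnoringUndef(const std::vector<Lane> &Lanes, uint64_t AllOnes) {
      bool SawDefined = false;
      for (const Lane &L : Lanes) {
        if (L.IsUndef)
          continue;
        SawDefined = true;
        if (L.Value != AllOnes)
          return false;
      }
      return SawDefined; // an all-undef vector does not count as all-ones
    }

    int main() {
      std::vector<Lane> V = {{true, 0}, {false, 0xFFFF}, {false, 0xFFFF}};
      std::cout << std::boolalpha << isAllOnesIgnoringUndef(V, 0xFFFF) << "\n"; // true
      return 0;
    }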
@@ -242,8 +245,6 @@ ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
llvm_unreachable("Invalid LoadExtType");
}
-/// getSetCCSwappedOperands - Return the operation corresponding to (Y op X)
-/// when given the operation for (X op Y).
ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
// To perform this operation, we just need to swap the L and G bits of the
// operation.
@@ -254,8 +255,6 @@ ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
(OldG << 2)); // New L bit.
}
-/// getSetCCInverse - Return the operation corresponding to !(X op Y), where
-/// 'op' is a valid SetCC operation.
ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
unsigned Operation = Op;
if (isInteger)
@@ -270,9 +269,9 @@ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
}
-/// isSignedOp - For an integer comparison, return 1 if the comparison is a
-/// signed operation and 2 if the result is an unsigned comparison. Return zero
-/// if the operation does not depend on the sign of the input (setne and seteq).
+/// For an integer comparison, return 1 if the comparison is a signed operation
+/// and 2 if the result is an unsigned comparison. Return zero if the operation
+/// does not depend on the sign of the input (setne and seteq).
static int isSignedOp(ISD::CondCode Opcode) {
switch (Opcode) {
default: llvm_unreachable("Illegal integer setcc operation!");
@@ -289,10 +288,6 @@ static int isSignedOp(ISD::CondCode Opcode) {
}
}
-/// getSetCCOrOperation - Return the result of a logical OR between different
-/// comparisons of identical values: ((X op1 Y) | (X op2 Y)). This function
-/// returns SETCC_INVALID if it is not possible to represent the resultant
-/// comparison.
ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
bool isInteger) {
if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
@@ -313,10 +308,6 @@ ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
return ISD::CondCode(Op);
}
-/// getSetCCAndOperation - Return the result of a logical AND between different
-/// comparisons of identical values: ((X op1 Y) & (X op2 Y)). This
-/// function returns zero if it is not possible to represent the resultant
-/// comparison.
ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
bool isInteger) {
if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
@@ -644,7 +635,8 @@ void SelectionDAG::DeleteNode(SDNode *N) {
}
void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
- assert(N != AllNodes.begin() && "Cannot delete the entry node!");
+ assert(N->getIterator() != AllNodes.begin() &&
+ "Cannot delete the entry node!");
assert(N->use_empty() && "Cannot delete a node that is not dead!");
// Drop all of the operands and decrement used node's use counts.
@@ -663,8 +655,8 @@ void SDDbgInfo::erase(const SDNode *Node) {
}
void SelectionDAG::DeallocateNode(SDNode *N) {
- if (N->OperandsNeedDelete)
- delete[] N->OperandList;
+ // If we have operands, deallocate them.
+ removeOperands(N);
// Set the opcode to DELETED_NODE to help catch bugs when node
// memory is reallocated.
@@ -832,7 +824,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
- SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
@@ -853,7 +845,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
- SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
@@ -873,16 +865,13 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
- SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
return Node;
}
-/// getEVTAlignment - Compute the default alignment value for the
-/// given type.
-///
unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
Type *Ty = VT == MVT::iPTR ?
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
@@ -911,6 +900,7 @@ void SelectionDAG::init(MachineFunction &mf) {
SelectionDAG::~SelectionDAG() {
assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
delete DbgInfo;
}
@@ -924,24 +914,26 @@ void SelectionDAG::allnodes_clear() {
#endif
}
-BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL,
- SDVTList VTs, SDValue N1,
- SDValue N2,
- const SDNodeFlags *Flags) {
+SDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTs, SDValue N1, SDValue N2,
+ const SDNodeFlags *Flags) {
+ SDValue Ops[] = {N1, N2};
+
if (isBinOpWithFlags(Opcode)) {
// If no flags were passed in, use a default flags object.
SDNodeFlags F;
if (Flags == nullptr)
Flags = &F;
- BinaryWithFlagsSDNode *FN = new (NodeAllocator) BinaryWithFlagsSDNode(
- Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, *Flags);
+ auto *FN = newSDNode<BinaryWithFlagsSDNode>(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, *Flags);
+ createOperands(FN, Ops);
return FN;
}
- BinarySDNode *N = new (NodeAllocator)
- BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+ auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
return N;
}
@@ -961,19 +953,25 @@ SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
}
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
- DebugLoc DL, void *&InsertPos) {
+ const SDLoc &DL, void *&InsertPos) {
SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
if (N) {
switch (N->getOpcode()) {
- default: break; // Process only regular (non-target) constant nodes.
case ISD::Constant:
case ISD::ConstantFP:
// Erase debug location from the node if the node is used at several
- // different places to do not propagate one location to all uses as it
- // leads to incorrect debug info.
- if (N->getDebugLoc() != DL)
+ // different places. Do not propagate one location to all uses as it
+ // will cause a worse single stepping debugging experience.
+ if (N->getDebugLoc() != DL.getDebugLoc())
N->setDebugLoc(DebugLoc());
break;
+ default:
+ // When the node's point of use is located earlier in the instruction
+ // sequence than its prior point of use, update its debug info to the
+ // earlier location.
+ if (DL.getIROrder() && DL.getIROrder() < N->getIROrder())
+ N->setDebugLoc(DL.getDebugLoc());
+ break;
}
}
return N;
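The rewritten FindNodeOrInsertPos above encodes a small policy for CSE hits: a reused Constant/ConstantFP node loses its debug location when the locations differ (so one use's source line is not attributed to every use), while any other reused node adopts the new location if it occurs earlier in the IR order. A minimal sketch of that decision, with hypothetical names (mergeDebugLoc, Loc) rather than the real SelectionDAG types:

    #include <iostream>
    #include <string>

    struct Loc {
      unsigned IROrder = 0;  // 0 means "no location"
      std::string DebugLoc;  // source position; empty means "none"
    };

    // Decide what debug location an existing, CSE-reused node should keep.
    Loc mergeDebugLoc(bool IsConstant, Loc Existing, Loc New) {
      if (IsConstant) {
        // Constants are shared by many users; if the locations differ, drop the
        // location instead of pinning all uses to one source line.
        if (Existing.DebugLoc != New.DebugLoc)
          return Loc{};
        return Existing;
      }
      // Non-constants: prefer the earliest point of use in instruction order.
      if (New.IROrder && New.IROrder < Existing.IROrder)
        return New;
      return Existing;
    }

    int main() {
      Loc Old{10, "a.c:5"}, Fresh{4, "a.c:2"};
      std::cout << mergeDebugLoc(false, Old, Fresh).DebugLoc << "\n"; // a.c:2
      std::cout << mergeDebugLoc(true, Old, Fresh).DebugLoc << "\n";  // (empty)
      return 0;
    }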
@@ -981,6 +979,7 @@ SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
void SelectionDAG::clear() {
allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
OperandAllocator.Reset();
CSEMap.clear();
@@ -999,25 +998,25 @@ void SelectionDAG::clear() {
DbgInfo->clear();
}
-SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::ANY_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
-SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
-SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
-SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT,
+SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
EVT OpVT) {
if (VT.bitsLE(Op.getValueType()))
return getNode(ISD::TRUNCATE, SL, VT, Op);
@@ -1026,7 +1025,7 @@ SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT,
return getNode(TLI->getExtendForContent(BType), SL, VT, Op);
}
-SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
assert(!VT.isVector() &&
"getZeroExtendInReg should use the vector element type instead of "
"the vector type!");
@@ -1038,7 +1037,8 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) {
getConstant(Imm, DL, Op.getValueType()));
}
-SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
@@ -1048,7 +1048,8 @@ SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
}
-SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
@@ -1058,7 +1059,8 @@ SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
}
-SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
@@ -1070,14 +1072,14 @@ SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
///
-SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) {
+SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue NegOne =
getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT);
return getNode(ISD::XOR, DL, VT, Val, NegOne);
}
-SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) {
+SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue TrueValue;
switch (TLI->getBooleanContents(VT)) {
@@ -1093,8 +1095,8 @@ SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) {
return getNode(ISD::XOR, DL, VT, Val, TrueValue);
}
-SDValue SelectionDAG::getConstant(uint64_t Val, SDLoc DL, EVT VT, bool isT,
- bool isO) {
+SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
EVT EltVT = VT.getScalarType();
assert((EltVT.getSizeInBits() >= 64 ||
(uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
@@ -1102,14 +1104,13 @@ SDValue SelectionDAG::getConstant(uint64_t Val, SDLoc DL, EVT VT, bool isT,
return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
}
-SDValue SelectionDAG::getConstant(const APInt &Val, SDLoc DL, EVT VT, bool isT,
- bool isO)
-{
+SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
}
-SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
- bool isT, bool isO) {
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
+ EVT VT, bool isT, bool isO) {
assert(VT.isInteger() && "Cannot create FP integer constant!");
EVT EltVT = VT.getScalarType();
@@ -1134,7 +1135,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypeExpandInteger) {
- APInt NewVal = Elt->getValue();
+ const APInt &NewVal = Elt->getValue();
EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
@@ -1168,9 +1169,8 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
- SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT,
- getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT,
- Ops));
+ SDValue Result = getNode(ISD::BITCAST, DL, VT,
+ getNode(ISD::BUILD_VECTOR, DL, ViaVecVT, Ops));
return Result;
}
@@ -1183,37 +1183,34 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
ID.AddBoolean(isO);
void *IP = nullptr;
SDNode *N = nullptr;
- if ((N = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)))
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
if (!VT.isVector())
return SDValue(N, 0);
if (!N) {
- N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, DL.getDebugLoc(),
- EltVT);
+ N = newSDNode<ConstantSDNode>(isT, isO, Elt, DL.getDebugLoc(), EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
}
SDValue Result(N, 0);
- if (VT.isVector()) {
- SmallVector<SDValue, 8> Ops;
- Ops.assign(VT.getVectorNumElements(), Result);
- Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops);
- }
+ if (VT.isVector())
+ Result = getSplatBuildVector(VT, DL, Result);
return Result;
}
-SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, SDLoc DL, bool isTarget) {
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
+ bool isTarget) {
return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
}
-SDValue SelectionDAG::getConstantFP(const APFloat& V, SDLoc DL, EVT VT,
+SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
bool isTarget) {
return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
}
-SDValue SelectionDAG::getConstantFP(const ConstantFP& V, SDLoc DL, EVT VT,
- bool isTarget){
+SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
+ EVT VT, bool isTarget) {
assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
EVT EltVT = VT.getScalarType();
@@ -1227,47 +1224,42 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, SDLoc DL, EVT VT,
ID.AddPointer(&V);
void *IP = nullptr;
SDNode *N = nullptr;
- if ((N = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)))
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
if (!VT.isVector())
return SDValue(N, 0);
if (!N) {
- N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, DL.getDebugLoc(),
- EltVT);
+ N = newSDNode<ConstantFPSDNode>(isTarget, &V, DL.getDebugLoc(), EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
}
SDValue Result(N, 0);
- if (VT.isVector()) {
- SmallVector<SDValue, 8> Ops;
- Ops.assign(VT.getVectorNumElements(), Result);
- Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops);
- }
+ if (VT.isVector())
+ Result = getSplatBuildVector(VT, DL, Result);
return Result;
}
-SDValue SelectionDAG::getConstantFP(double Val, SDLoc DL, EVT VT,
+SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
bool isTarget) {
EVT EltVT = VT.getScalarType();
- if (EltVT==MVT::f32)
+ if (EltVT == MVT::f32)
return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
- else if (EltVT==MVT::f64)
+ else if (EltVT == MVT::f64)
return getConstantFP(APFloat(Val), DL, VT, isTarget);
- else if (EltVT==MVT::f80 || EltVT==MVT::f128 || EltVT==MVT::ppcf128 ||
- EltVT==MVT::f16) {
- bool ignored;
- APFloat apf = APFloat(Val);
- apf.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
- &ignored);
- return getConstantFP(apf, DL, VT, isTarget);
+ else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
+ EltVT == MVT::f16) {
+ bool Ignored;
+ APFloat APF = APFloat(Val);
+ APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
+ &Ignored);
+ return getConstantFP(APF, DL, VT, isTarget);
} else
llvm_unreachable("Unsupported type in getConstantFP");
}
-SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
- EVT VT, int64_t Offset,
- bool isTargetGA,
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
+ EVT VT, int64_t Offset, bool isTargetGA,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTargetGA) &&
"Cannot set target flags on target-independent globals");
@@ -1290,12 +1282,11 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
ID.AddInteger(TargetFlags);
ID.AddInteger(GV->getType()->getAddressSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL.getIROrder(),
- DL.getDebugLoc(), GV, VT,
- Offset, TargetFlags);
+ auto *N = newSDNode<GlobalAddressSDNode>(
+ Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1310,7 +1301,7 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget);
+ auto *N = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1329,8 +1320,7 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget,
- TargetFlags);
+ auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1355,8 +1345,8 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
- Alignment, TargetFlags);
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1382,8 +1372,8 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
- Alignment, TargetFlags);
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1400,8 +1390,7 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N =
- new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags);
+ auto *N = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1415,7 +1404,7 @@ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB);
+ auto *N = newSDNode<BasicBlockSDNode>(MBB);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1430,7 +1419,7 @@ SDValue SelectionDAG::getValueType(EVT VT) {
ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
if (N) return SDValue(N, 0);
- N = new (NodeAllocator) VTSDNode(VT);
+ N = newSDNode<VTSDNode>(VT);
InsertNode(N);
return SDValue(N, 0);
}
@@ -1438,7 +1427,7 @@ SDValue SelectionDAG::getValueType(EVT VT) {
SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
SDNode *&N = ExternalSymbols[Sym];
if (N) return SDValue(N, 0);
- N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT);
+ N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
InsertNode(N);
return SDValue(N, 0);
}
@@ -1447,7 +1436,7 @@ SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
SDNode *&N = MCSymbols[Sym];
if (N)
return SDValue(N, 0);
- N = new (NodeAllocator) MCSymbolSDNode(Sym, VT);
+ N = newSDNode<MCSymbolSDNode>(Sym, VT);
InsertNode(N);
return SDValue(N, 0);
}
@@ -1458,7 +1447,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
TargetFlags)];
if (N) return SDValue(N, 0);
- N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
+ N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
InsertNode(N);
return SDValue(N, 0);
}
@@ -1468,7 +1457,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
CondCodeNodes.resize(Cond+1);
if (!CondCodeNodes[Cond]) {
- CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond);
+ auto *N = newSDNode<CondCodeSDNode>(Cond);
CondCodeNodes[Cond] = N;
InsertNode(N);
}
@@ -1476,41 +1465,42 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
return SDValue(CondCodeNodes[Cond], 0);
}
-// commuteShuffle - swaps the values of N1 and N2, and swaps all indices in
-// the shuffle mask M that point at N1 to point at N2, and indices that point
-// N2 to point at N1.
-static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
+/// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
+/// point at N1 to point at N2 and indices that point at N2 to point at N1.
+static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
std::swap(N1, N2);
ShuffleVectorSDNode::commuteMask(M);
}
-SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
- SDValue N2, const int *Mask) {
+SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
+ SDValue N2, ArrayRef<int> Mask) {
+ assert(VT.getVectorNumElements() == Mask.size() &&
+ "Must have the same number of vector elements as mask elements!");
assert(VT == N1.getValueType() && VT == N2.getValueType() &&
"Invalid VECTOR_SHUFFLE");
// Canonicalize shuffle undef, undef -> undef
- if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
// Validate that all indices in Mask are within the range of the elements
// input to the shuffle.
- unsigned NElts = VT.getVectorNumElements();
- SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i != NElts; ++i) {
- assert(Mask[i] < (int)(NElts * 2) && "Index out of range");
- MaskVec.push_back(Mask[i]);
- }
+ int NElts = Mask.size();
+ assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
+ "Index out of range");
+
+ // Copy the mask so we can do any needed cleanup.
+ SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());
// Canonicalize shuffle v, v -> v, undef
if (N1 == N2) {
N2 = getUNDEF(VT);
- for (unsigned i = 0; i != NElts; ++i)
- if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts;
+ for (int i = 0; i != NElts; ++i)
+ if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
}
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
commuteShuffle(N1, N2, MaskVec);
// If shuffling a splat, try to blend the splat instead. We do this here so
@@ -1521,8 +1511,8 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
if (!Splat)
return;
- for (int i = 0; i < (int)NElts; ++i) {
- if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + (int)NElts))
+ for (int i = 0; i < NElts; ++i) {
+ if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
continue;
// If this input comes from undef, mark it as such.
@@ -1544,9 +1534,9 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
// Canonicalize all index into lhs, -> shuffle lhs, undef
// Canonicalize all index into rhs, -> shuffle rhs, undef
bool AllLHS = true, AllRHS = true;
- bool N2Undef = N2.getOpcode() == ISD::UNDEF;
- for (unsigned i = 0; i != NElts; ++i) {
- if (MaskVec[i] >= (int)NElts) {
+ bool N2Undef = N2.isUndef();
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= NElts) {
if (N2Undef)
MaskVec[i] = -1;
else
@@ -1564,15 +1554,15 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
commuteShuffle(N1, N2, MaskVec);
}
// Reset our undef status after accounting for the mask.
- N2Undef = N2.getOpcode() == ISD::UNDEF;
+ N2Undef = N2.isUndef();
// Re-check whether both sides ended up undef.
- if (N1.getOpcode() == ISD::UNDEF && N2Undef)
+ if (N1.isUndef() && N2Undef)
return getUNDEF(VT);
// If Identity shuffle return that node.
bool Identity = true, AllSame = true;
- for (unsigned i = 0; i != NElts; ++i) {
- if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
if (MaskVec[i] != MaskVec[0]) AllSame = false;
}
if (Identity && NElts)
@@ -1592,7 +1582,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
BitVector UndefElements;
SDValue Splat = BV->getSplatValue(&UndefElements);
// If this is a splat of an undef, shuffling it is also undef.
- if (Splat && Splat.getOpcode() == ISD::UNDEF)
+ if (Splat && Splat.isUndef())
return getUNDEF(VT);
bool SameNumElts =
@@ -1612,11 +1602,9 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
// If the shuffle itself creates a splat, build the vector directly.
if (AllSame && SameNumElts) {
- const SDValue &Splatted = BV->getOperand(MaskVec[0]);
- SmallVector<SDValue, 8> Ops(NElts, Splatted);
-
EVT BuildVT = BV->getValueType(0);
- SDValue NewBV = getNode(ISD::BUILD_VECTOR, dl, BuildVT, Ops);
+ const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+ SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
// We may have jumped through bitcasts, so the type of the
// BUILD_VECTOR may not match the type of the shuffle.
@@ -1630,23 +1618,23 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
FoldingSetNodeID ID;
SDValue Ops[2] = { N1, N2 };
AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
- for (unsigned i = 0; i != NElts; ++i)
+ for (int i = 0; i != NElts; ++i)
ID.AddInteger(MaskVec[i]);
void* IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
// Allocate the mask array for the node out of the BumpPtrAllocator, since
// SDNode doesn't have access to it. This memory will be "leaked" when
// the node is deallocated, but recovered when the NodeAllocator is released.
int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
- memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
+ std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc);
+
+ auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
+ dl.getDebugLoc(), MaskAlloc);
+ createOperands(N, Ops);
- ShuffleVectorSDNode *N =
- new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(),
- dl.getDebugLoc(), N1, N2,
- MaskAlloc);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1659,13 +1647,12 @@ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
SDValue Op0 = SV.getOperand(0);
SDValue Op1 = SV.getOperand(1);
- return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, &MaskVec[0]);
+ return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec);
}
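For reference, a minimal standalone sketch (plain C++, not the LLVM helper) of the mask remapping that commuting the two shuffle inputs implies. The index convention is assumed from the surrounding code: lanes below NElts come from the first operand, lanes in [NElts, 2*NElts) from the second, and -1 marks an undef lane.

#include <vector>

// Swap which input each mask lane refers to; undef lanes are left alone.
static void commuteMaskSketch(std::vector<int> &Mask, int NElts) {
  for (int &M : Mask) {
    if (M < 0)
      continue; // undef lane
    M = (M < NElts) ? M + NElts : M - NElts;
  }
}
// With NElts = 4, {0, 5, -1, 3} becomes {4, 1, -1, 7}.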
-SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
- SDValue Val, SDValue DTy,
- SDValue STy, SDValue Rnd, SDValue Sat,
- ISD::CvtCode Code) {
+SDValue SelectionDAG::getConvertRndSat(EVT VT, const SDLoc &dl, SDValue Val,
+ SDValue DTy, SDValue STy, SDValue Rnd,
+ SDValue Sat, ISD::CvtCode Code) {
// If the src and dest types are the same and the conversion is between
// integer types of the same sign or two floats, no conversion is necessary.
if (DTy == STy &&
@@ -1676,12 +1663,13 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops);
void* IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(),
- dl.getDebugLoc(),
- Ops, Code);
+ auto *N =
+ newSDNode<CvtRndSatSDNode>(VT, dl.getIROrder(), dl.getDebugLoc(), Code);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1695,7 +1683,7 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT);
+ auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1709,13 +1697,14 @@ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask);
+ auto *N = newSDNode<RegisterMaskSDNode>(RegMask);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
+SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
+ MCSymbol *Label) {
FoldingSetNodeID ID;
SDValue Ops[] = { Root };
AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops);
@@ -1724,14 +1713,14 @@ SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
- dl.getDebugLoc(), Root, Label);
+ auto *N = newSDNode<EHLabelSDNode>(dl.getIROrder(), dl.getDebugLoc(), Label);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
int64_t Offset,
bool isTarget,
@@ -1747,8 +1736,7 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
- TargetFlags);
+ auto *N = newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1766,13 +1754,12 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) SrcValueSDNode(V);
+ auto *N = newSDNode<SrcValueSDNode>(V);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-/// getMDNode - Return an MDNodeSDNode which holds an MDNode.
SDValue SelectionDAG::getMDNode(const MDNode *MD) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
@@ -1782,7 +1769,7 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) MDNodeSDNode(MD);
+ auto *N = newSDNode<MDNodeSDNode>(MD);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1795,8 +1782,7 @@ SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
return getNode(ISD::BITCAST, SDLoc(V), VT, V);
}
-/// getAddrSpaceCast - Return an AddrSpaceCastSDNode.
-SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
+SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
unsigned SrcAS, unsigned DestAS) {
SDValue Ops[] = {Ptr};
FoldingSetNodeID ID;
@@ -1805,12 +1791,13 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
ID.AddInteger(DestAS);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(),
- dl.getDebugLoc(),
- VT, Ptr, SrcAS, DestAS);
+ auto *N = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VT, SrcAS, DestAS);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -1835,9 +1822,8 @@ SDValue SelectionDAG::expandVAArg(SDNode *Node) {
SDValue Tmp2 = Node->getOperand(1);
unsigned Align = Node->getConstantOperandVal(3);
- SDValue VAListLoad =
- getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2,
- MachinePointerInfo(V), false, false, false, 0);
+ SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1,
+ Tmp2, MachinePointerInfo(V));
SDValue VAList = VAListLoad;
if (Align > TLI.getMinStackArgumentAlignment()) {
@@ -1856,11 +1842,10 @@ SDValue SelectionDAG::expandVAArg(SDNode *Node) {
VT.getTypeForEVT(*getContext())),
dl, VAList.getValueType()));
// Store the incremented VAList to the legalized pointer
- Tmp1 = getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2,
- MachinePointerInfo(V), false, false, 0);
+ Tmp1 =
+ getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
- return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo(),
- false, false, false, 0);
+ return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo());
}
SDValue SelectionDAG::expandVACopy(SDNode *Node) {
@@ -1870,15 +1855,13 @@ SDValue SelectionDAG::expandVACopy(SDNode *Node) {
// output, returning the chain.
const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
- SDValue Tmp1 = getLoad(TLI.getPointerTy(getDataLayout()), dl,
- Node->getOperand(0), Node->getOperand(2),
- MachinePointerInfo(VS), false, false, false, 0);
+ SDValue Tmp1 =
+ getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
+ Node->getOperand(2), MachinePointerInfo(VS));
return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
- MachinePointerInfo(VD), false, false, 0);
+ MachinePointerInfo(VD));
}
-/// CreateStackTemporary - Create a stack temporary, suitable for holding the
-/// specified value type.
SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
unsigned ByteSize = VT.getStoreSize();
@@ -1890,8 +1873,6 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
}
-/// CreateStackTemporary - Create a stack temporary suitable for holding
-/// either of the specified value types.
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
Type *Ty1 = VT1.getTypeForEVT(*getContext());
@@ -1905,8 +1886,8 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
}
-SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
- SDValue N2, ISD::CondCode Cond, SDLoc dl) {
+SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
+ ISD::CondCode Cond, const SDLoc &dl) {
// These setcc operations always fold.
switch (Cond) {
default: break;
@@ -2469,6 +2450,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
KnownOne = KnownOne.trunc(BitWidth);
break;
}
+ case ISD::BSWAP: {
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+ KnownZero = KnownZero2.byteSwap();
+ KnownOne = KnownOne2.byteSwap();
+ break;
+ }
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
@@ -2506,12 +2493,36 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
}
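The new ISD::BSWAP case above follows directly from the byte semantics of the operation; a small illustrative sketch with llvm::APInt (the masks are invented for the example):

#include "llvm/ADT/APInt.h"
#include <cassert>

// If the low byte of the input is known zero, the high byte of the
// byte-swapped result is known zero, and likewise for known-one bits.
static void bswapKnownBitsSketch() {
  llvm::APInt KnownZeroIn(32, 0x000000FFu);
  llvm::APInt KnownZeroOut = KnownZeroIn.byteSwap();
  assert(KnownZeroOut == llvm::APInt(32, 0xFF000000u));
}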
-/// ComputeNumSignBits - Return the number of times the sign bit of the
-/// register is replicated into the other bits. We know that at least 1 bit
-/// is always equal to the sign bit (itself), but other cases can give us
-/// information. For example, immediately after an "SRA X, 2", we know that
-/// the top 3 bits are all equal to each other, so we return 3.
-unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
+bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
+ // A left-shift of a constant one will have exactly one bit set because
+ // shifting the bit off the end is undefined.
+ if (Val.getOpcode() == ISD::SHL) {
+ auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
+ if (C && C->getAPIntValue() == 1)
+ return true;
+ }
+
+ // Similarly, a logical right-shift of a constant sign-bit will have exactly
+ // one bit set.
+ if (Val.getOpcode() == ISD::SRL) {
+ auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
+ if (C && C->getAPIntValue().isSignBit())
+ return true;
+ }
+
+ // More could be done here, though the above checks are enough
+ // to handle some common cases.
+
+ // Fall back to computeKnownBits to catch other known cases.
+ EVT OpVT = Val.getValueType();
+ unsigned BitWidth = OpVT.getScalarType().getSizeInBits();
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Val, KnownZero, KnownOne);
+ return (KnownZero.countPopulation() == BitWidth - 1) &&
+ (KnownOne.countPopulation() == 1);
+}
+
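A hedged sketch of the computeKnownBits fallback used by isKnownToBeAPowerOfTwo above: a value is provably a power of two when exactly one bit can be one and every other bit is known zero (the masks below are invented):

#include "llvm/ADT/APInt.h"
#include <cassert>

static void powerOfTwoKnownBitsSketch() {
  const unsigned BitWidth = 32;
  llvm::APInt KnownOne(BitWidth, 0x00000010u); // bit 4 known to be one
  llvm::APInt KnownZero = ~KnownOne;           // every other bit known zero
  // This is exactly the condition checked at the end of the function above.
  assert(KnownZero.countPopulation() == BitWidth - 1 &&
         KnownOne.countPopulation() == 1);
}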
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarType().getSizeInBits();
@@ -2761,11 +2772,6 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
}
-/// isBaseWithConstantOffset - Return true if the specified operand is an
-/// ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an
-/// ISD::OR with a ConstantSDNode that is guaranteed to have the same
-/// semantics as an ADD. This handles the equivalence:
-/// X|Cst == X+Cst iff X&Cst = 0.
bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
!isa<ConstantSDNode>(Op.getOperand(1)))
@@ -2779,7 +2785,6 @@ bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
return true;
}
-
bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
// If we're told that NaNs won't happen, assume they won't.
if (getTarget().Options.NoNaNsFPMath)
@@ -2834,28 +2839,30 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
return (AZero | BZero).isAllOnesValue();
}
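The return statement above encodes a simple fact worth spelling out: two values share no set bits when every bit position is known zero in at least one of them. A minimal sketch with invented masks:

#include "llvm/ADT/APInt.h"
#include <cassert>

static void noCommonBitsSketch() {
  llvm::APInt AZero(8, 0x0Fu); // A's low nibble is known zero
  llvm::APInt BZero(8, 0xF0u); // B's high nibble is known zero
  assert((AZero | BZero).isAllOnesValue()); // hence A & B must be 0
}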
-static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops,
+static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops,
llvm::SelectionDAG &DAG) {
if (Ops.size() == 1)
return Ops[0];
// Concat of UNDEFs is UNDEF.
- if (std::all_of(Ops.begin(), Ops.end(),
- [](SDValue Op) { return Op.isUndef(); }))
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
- // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified
- // to one big BUILD_VECTOR.
- // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well.
- if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) {
- return Op.getOpcode() == ISD::BUILD_VECTOR;
- }))
- return SDValue();
-
+ // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
+ // simplified to one big BUILD_VECTOR.
+ // FIXME: Add support for SCALAR_TO_VECTOR as well.
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 16> Elts;
- for (SDValue Op : Ops)
- Elts.append(Op->op_begin(), Op->op_end());
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ if (Op.isUndef())
+ Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
+ else if (Op.getOpcode() == ISD::BUILD_VECTOR)
+ Elts.append(Op->op_begin(), Op->op_end());
+ else
+ return SDValue();
+ }
// BUILD_VECTOR requires all inputs to be of the same type, find the
// maximum type and extend them all.
@@ -2871,25 +2878,24 @@ static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops,
return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
}
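A standalone sketch, using plain containers instead of SDNodes, of the flattening loop added above: BUILD_VECTOR-like operands contribute their elements, UNDEF operands contribute placeholder undef elements, and anything else aborts the fold. The struct and helper names are invented.

#include <vector>

struct SketchOp {
  bool IsUndef;
  std::vector<int> Elts; // scalar elements when modelling a BUILD_VECTOR
};

static bool flattenConcatSketch(const std::vector<SketchOp> &Ops,
                                unsigned EltsPerOp, std::vector<int> &Out) {
  const int Undef = -1;
  for (const SketchOp &Op : Ops) {
    if (Op.IsUndef)
      Out.insert(Out.end(), EltsPerOp, Undef);  // expand the undef operand
    else if (Op.Elts.size() == EltsPerOp)
      Out.insert(Out.end(), Op.Elts.begin(), Op.Elts.end());
    else
      return false;                             // not foldable
  }
  return true;
}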
-/// getNode - Gets or creates the specified node.
-///
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
+/// Gets or creates the specified node.
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, getVTList(VT), None);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), getVTList(VT));
+ auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ getVTList(VT));
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
- EVT VT, SDValue Operand) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue Operand) {
// Constant fold unary operations with an integer constant operand. Even
// opaque constant will be folded, because the folding of unary operations
// doesn't create new constants with different values. Nevertheless, the
@@ -3054,7 +3060,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid fpext node, dst < src!");
- if (Operand.getOpcode() == ISD::UNDEF)
+ if (Operand.isUndef())
return getUNDEF(VT);
break;
case ISD::SIGN_EXTEND:
@@ -3148,6 +3154,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
+ case ISD::BITREVERSE:
+ assert(VT.isInteger() && VT == Operand.getValueType() &&
+ "Invalid BITREVERSE!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
case ISD::BITCAST:
// Basic sanity checking.
assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits()
@@ -3192,20 +3204,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
SDNode *N;
SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {Operand};
if (VT != MVT::Glue) { // Don't CSE flag producing nodes
FoldingSetNodeID ID;
- SDValue Ops[1] = { Operand };
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, Operand);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, Operand);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
}
InsertNode(N);
@@ -3250,8 +3262,8 @@ static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
return std::make_pair(APInt(1, 0), false);
}
-SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
- const ConstantSDNode *Cst1,
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+ EVT VT, const ConstantSDNode *Cst1,
const ConstantSDNode *Cst2) {
if (Cst1->isOpaque() || Cst2->isOpaque())
return SDValue();
@@ -3263,8 +3275,29 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
return getConstant(Folded.first, DL, VT);
}
-SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
- SDNode *Cst1, SDNode *Cst2) {
+SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
+ const GlobalAddressSDNode *GA,
+ const SDNode *N2) {
+ if (GA->getOpcode() != ISD::GlobalAddress)
+ return SDValue();
+ if (!TLI->isOffsetFoldingLegal(GA))
+ return SDValue();
+ const ConstantSDNode *Cst2 = dyn_cast<ConstantSDNode>(N2);
+ if (!Cst2)
+ return SDValue();
+ int64_t Offset = Cst2->getSExtValue();
+ switch (Opcode) {
+ case ISD::ADD: break;
+ case ISD::SUB: Offset = -uint64_t(Offset); break;
+ default: return SDValue();
+ }
+ return getGlobalAddress(GA->getGlobal(), SDLoc(Cst2), VT,
+ GA->getOffset() + uint64_t(Offset));
+}
+
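A hedged sketch of the offset arithmetic inside FoldSymbolOffset above: for SUB the constant is negated through uint64_t so the wrap-around is well defined, and the result is folded onto the global's existing offset. The helper name is invented.

#include <cstdint>

static int64_t foldSymbolOffsetSketch(int64_t GlobalOffset, int64_t C,
                                      bool IsSub) {
  uint64_t Offset = static_cast<uint64_t>(C);
  if (IsSub)
    Offset = -Offset; // mirrors "Offset = -uint64_t(Offset)" above
  return static_cast<int64_t>(static_cast<uint64_t>(GlobalOffset) + Offset);
}
// foldSymbolOffsetSketch(8, 4, /*IsSub=*/true) == 4: (Sym+8) - 4 becomes Sym+4.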
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+ EVT VT, SDNode *Cst1,
+ SDNode *Cst2) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
@@ -3274,21 +3307,20 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
// Handle the case of two scalars.
if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
- if (SDValue Folded =
- FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2)) {
- if (!VT.isVector())
- return Folded;
- SmallVector<SDValue, 4> Outputs;
- // We may have a vector type but a scalar result. Create a splat.
- Outputs.resize(VT.getVectorNumElements(), Outputs.back());
- // Build a big vector out of the scalar elements we generated.
- return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs);
- } else {
- return SDValue();
- }
+ SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
+      SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
+      assert((!Folded || !VT.isVector()) &&
+             "Can't fold vector ops with scalar operands");
+ return Folded;
}
}
+ // fold (add Sym, c) -> Sym+c
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1))
+ return FoldSymbolOffset(Opcode, VT, GA, Cst2);
+ if (isCommutativeBinOp(Opcode))
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
+ return FoldSymbolOffset(Opcode, VT, GA, Cst1);
+
// For vectors extract each constant element into Inputs so we can constant
// fold them individually.
BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
@@ -3329,11 +3361,11 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
Outputs.resize(VT.getVectorNumElements(), Outputs.back());
// Build a big vector out of the scalar elements we generated.
- return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs);
+ return getBuildVector(VT, SDLoc(), Outputs);
}
-SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL,
- EVT VT,
+SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
+ const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
const SDNodeFlags *Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
@@ -3355,8 +3387,8 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL,
auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
- return (Op.getOpcode() == ISD::UNDEF) ||
- (Op.getOpcode() == ISD::CONDCODE) || (BV && BV->isConstant());
+ return (Op.isUndef()) || (Op.getOpcode() == ISD::CONDCODE) ||
+ (BV && BV->isConstant());
};
// All operands must be vector types with the same number of elements as
@@ -3375,7 +3407,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL,
EVT LegalSVT = VT.getScalarType();
if (LegalSVT.isInteger()) {
LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
- if (LegalSVT.bitsLT(SVT))
+ if (LegalSVT.bitsLT(VT.getScalarType()))
return SDValue();
}
@@ -3414,20 +3446,18 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL,
ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
// Scalar folding only succeeded if the result is a constant or UNDEF.
- if (ScalarResult.getOpcode() != ISD::UNDEF &&
- ScalarResult.getOpcode() != ISD::Constant &&
+ if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
ScalarResult.getOpcode() != ISD::ConstantFP)
return SDValue();
ScalarResults.push_back(ScalarResult);
}
- assert(ScalarResults.size() == NumElts &&
- "Unexpected number of scalar results for BUILD_VECTOR");
- return getNode(ISD::BUILD_VECTOR, DL, VT, ScalarResults);
+ return getBuildVector(VT, DL, ScalarResults);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
- SDValue N2, const SDNodeFlags *Flags) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2,
+ const SDNodeFlags *Flags) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
@@ -3617,14 +3647,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
};
if (N1C) {
- APInt Val = N1C->getAPIntValue();
+ const APInt &Val = N1C->getAPIntValue();
return SignExtendInReg(Val);
}
if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
SmallVector<SDValue, 8> Ops;
for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N1.getOperand(i);
- if (Op.getOpcode() == ISD::UNDEF) {
+ if (Op.isUndef()) {
Ops.push_back(getUNDEF(VT.getScalarType()));
continue;
}
@@ -3637,13 +3667,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
break;
}
if (Ops.size() == VT.getVectorNumElements())
- return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+ return getBuildVector(VT, DL, Ops);
}
break;
}
case ISD::EXTRACT_VECTOR_ELT:
// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
@@ -3802,7 +3832,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
}
// Canonicalize an UNDEF to the RHS, even over a constant.
- if (N1.getOpcode() == ISD::UNDEF) {
+ if (N1.isUndef()) {
if (isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
@@ -3831,10 +3861,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
}
// Fold a bunch of operators when the RHS is undef.
- if (N2.getOpcode() == ISD::UNDEF) {
+ if (N2.isUndef()) {
switch (Opcode) {
case ISD::XOR:
- if (N1.getOpcode() == ISD::UNDEF)
+ if (N1.isUndef())
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
@@ -3877,21 +3907,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
}
// Memoize this node if possible.
- BinarySDNode *N;
+ SDNode *N;
SDVTList VTs = getVTList(VT);
if (VT != MVT::Glue) {
SDValue Ops[] = {N1, N2};
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
if (Flags)
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
-
CSEMap.InsertNode(N, IP);
} else {
N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
@@ -3901,7 +3930,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
return SDValue(N, 0);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, SDValue N3) {
// Perform various simplifications.
switch (Opcode) {
@@ -3982,36 +4011,35 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
// Memoize node if it doesn't produce a flag.
SDNode *N;
SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1, N2, N3};
if (VT != MVT::Glue) {
- SDValue Ops[] = { N1, N2, N3 };
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, N1, N2, N3);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, N1, N2, N3);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3,
- SDValue N4) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
SDValue Ops[] = { N1, N2, N3, N4 };
return getNode(Opcode, DL, VT, Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3,
- SDValue N4, SDValue N5) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
SDValue Ops[] = { N1, N2, N3, N4, N5 };
return getNode(Opcode, DL, VT, Ops);
}
@@ -4041,8 +4069,8 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
/// getMemsetValue - Vectorized representation of the memset value
/// operand.
static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
- SDLoc dl) {
- assert(Value.getOpcode() != ISD::UNDEF);
+ const SDLoc &dl) {
+ assert(!Value.isUndef());
unsigned NumBits = VT.getScalarType().getSizeInBits();
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
@@ -4069,13 +4097,9 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
}
if (VT != Value.getValueType() && !VT.isInteger())
- Value = DAG.getNode(ISD::BITCAST, dl, VT.getScalarType(), Value);
- if (VT != Value.getValueType()) {
- assert(VT.getVectorElementType() == Value.getValueType() &&
- "value type should be one vector element here");
- SmallVector<SDValue, 8> BVOps(VT.getVectorNumElements(), Value);
- Value = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BVOps);
- }
+ Value = DAG.getBitcast(VT.getScalarType(), Value);
+ if (VT != Value.getValueType())
+ Value = DAG.getSplatBuildVector(VT, dl, Value);
return Value;
}
@@ -4083,7 +4107,7 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
/// used when a memcpy is turned into a memset when the source is a constant
/// string ptr.
-static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
+static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
const TargetLowering &TLI, StringRef Str) {
// Handle vector with all elements zero.
if (Str.empty()) {
@@ -4124,19 +4148,16 @@ static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
return SDValue(nullptr, 0);
}
-/// getMemBasePlusOffset - Returns base and offset node for the
-///
-static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, SDLoc dl,
- SelectionDAG &DAG) {
+SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
+ const SDLoc &DL) {
EVT VT = Base.getValueType();
- return DAG.getNode(ISD::ADD, dl,
- VT, Base, DAG.getConstant(Offset, dl, VT));
+ return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT));
}
/// isMemSrcFromString - Returns true if memcpy source is a string constant.
///
static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
- unsigned SrcDelta = 0;
+ uint64_t SrcDelta = 0;
GlobalAddressSDNode *G = nullptr;
if (Src.getOpcode() == ISD::GlobalAddress)
G = cast<GlobalAddressSDNode>(Src);
@@ -4149,7 +4170,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
if (!G)
return false;
- return getConstantStringInfo(G->getGlobal(), Str, SrcDelta, false);
+ return getConstantStringInfo(G->getGlobal(), Str,
+ SrcDelta + G->getOffset(), false);
}
/// Determines the optimal series of memory ops to replace the memset / memcpy.
@@ -4163,6 +4185,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
bool ZeroMemset,
bool MemcpyStrSrc,
bool AllowOverlap,
+ unsigned DstAS, unsigned SrcAS,
SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
@@ -4179,10 +4202,9 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
DAG.getMachineFunction());
if (VT == MVT::Other) {
- unsigned AS = 0;
- if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(AS) ||
- TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) {
- VT = TLI.getPointerTy(DAG.getDataLayout());
+ if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||
+ TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {
+ VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);
} else {
switch (DstAlign & 7) {
case 0: VT = MVT::i64; break;
@@ -4238,10 +4260,9 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
// FIXME: Only does this for 64-bit or more since we don't have proper
// cost model for unaligned load / store.
bool Fast;
- unsigned AS = 0;
if (NumMemOps && AllowOverlap &&
VTSize >= 8 && NewVTSize < Size &&
- TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast)
+ TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast)
VTSize = Size;
else {
VT = NewVT;
@@ -4267,15 +4288,14 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
return MF.getFunction()->optForSize();
}
-static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, uint64_t Size,
- unsigned Align, bool isVol,
- bool AlwaysInline,
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align,
+ bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
// Turn a memcpy of undef to nop.
- if (Src.getOpcode() == ISD::UNDEF)
+ if (Src.isUndef())
return Chain;
// Expand memcpy to a series of load and store ops if the size operand falls
@@ -4302,7 +4322,10 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
(isZeroStr ? 0 : SrcAlign),
- false, false, CopyFromStr, true, DAG, TLI))
+ false, false, CopyFromStr, true,
+ DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(),
+ DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -4325,6 +4348,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
}
}
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
SmallVector<SDValue, 8> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
@@ -4351,9 +4376,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
- getMemBasePlusOffset(Dst, DstOff, dl, DAG),
- DstPtrInfo.getWithOffset(DstOff), isVol,
- false, Align);
+ DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
}
if (!Store.getNode()) {
@@ -4365,13 +4389,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
assert(NVT.bitsGE(VT));
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
- getMemBasePlusOffset(Src, SrcOff, dl, DAG),
- SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false,
- false, MinAlign(SrcAlign, SrcOff));
- Store = DAG.getTruncStore(Chain, dl, Value,
- getMemBasePlusOffset(Dst, DstOff, dl, DAG),
- DstPtrInfo.getWithOffset(DstOff), VT, isVol,
- false, Align);
+ DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+ SrcPtrInfo.getWithOffset(SrcOff), VT,
+ MinAlign(SrcAlign, SrcOff), MMOFlags);
+ OutChains.push_back(Value.getValue(1));
+ Store = DAG.getTruncStore(
+ Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
}
OutChains.push_back(Store);
SrcOff += VTSize;
@@ -4382,15 +4406,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
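To make the interplay with FindOptimalMemOpLowering above a little more concrete, here is a standalone sketch of the basic idea: break a fixed copy size into progressively smaller power-of-two operations. It deliberately ignores the alignment, overlapping-access and address-space refinements handled by the real code, and all names are invented.

#include <vector>

// Assumes MaxOpBytes is a power of two and at least 1.
static std::vector<unsigned> decomposeCopySketch(unsigned SizeBytes,
                                                 unsigned MaxOpBytes) {
  std::vector<unsigned> OpSizes;
  unsigned Op = MaxOpBytes;
  while (SizeBytes) {
    while (Op > SizeBytes)
      Op /= 2;               // fall back to the next smaller power of two
    OpSizes.push_back(Op);
    SizeBytes -= Op;
  }
  return OpSizes;
}
// decomposeCopySketch(15, 8) yields {8, 4, 2, 1}: i64 + i32 + i16 + i8 ops.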
-static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, uint64_t Size,
- unsigned Align, bool isVol,
- bool AlwaysInline,
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align,
+ bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
// Turn a memmove of undef to nop.
- if (Src.getOpcode() == ISD::UNDEF)
+ if (Src.isUndef())
return Chain;
// Expand memmove to a series of load and store ops if the size operand falls
@@ -4411,7 +4434,10 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align), SrcAlign,
- false, false, false, false, DAG, TLI))
+ false, false, false, false,
+ DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(),
+ DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -4425,6 +4451,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
}
}
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
uint64_t SrcOff = 0, DstOff = 0;
SmallVector<SDValue, 8> LoadValues;
SmallVector<SDValue, 8> LoadChains;
@@ -4435,10 +4463,9 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value;
- Value = DAG.getLoad(VT, dl, Chain,
- getMemBasePlusOffset(Src, SrcOff, dl, DAG),
- SrcPtrInfo.getWithOffset(SrcOff), isVol,
- false, false, SrcAlign);
+ Value =
+ DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+ SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, MMOFlags);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
SrcOff += VTSize;
@@ -4451,8 +4478,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
SDValue Store;
Store = DAG.getStore(Chain, dl, LoadValues[i],
- getMemBasePlusOffset(Dst, DstOff, dl, DAG),
- DstPtrInfo.getWithOffset(DstOff), isVol, false, Align);
+ DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
OutChains.push_back(Store);
DstOff += VTSize;
}
@@ -4478,13 +4505,12 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
/// The function tries to replace 'llvm.memset' intrinsic with several store
/// operations and value calculation code. This is usually profitable for small
/// memory size.
-static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, uint64_t Size,
- unsigned Align, bool isVol,
+static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align, bool isVol,
MachinePointerInfo DstPtrInfo) {
// Turn a memset of undef to nop.
- if (Src.getOpcode() == ISD::UNDEF)
+ if (Src.isUndef())
return Chain;
// Expand memset to a series of load/store ops if the size operand
@@ -4502,7 +4528,9 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
Size, (DstAlignCanChange ? 0 : Align), 0,
- true, IsZeroVal, false, true, DAG, TLI))
+ true, IsZeroVal, false, true,
+ DstPtrInfo.getAddrSpace(), ~0u,
+ DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -4548,10 +4576,10 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
Value = getMemsetValue(Src, VT, DAG, dl);
}
assert(Value.getValueType() == VT && "Value with wrong type.");
- SDValue Store = DAG.getStore(Chain, dl, Value,
- getMemBasePlusOffset(Dst, DstOff, dl, DAG),
- DstPtrInfo.getWithOffset(DstOff),
- isVol, false, Align);
+ SDValue Store = DAG.getStore(
+ Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align,
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
Size -= VTSize;
@@ -4570,10 +4598,10 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
}
}
-SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVol, bool AlwaysInline,
- bool isTailCall, MachinePointerInfo DstPtrInfo,
+SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool AlwaysInline, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
@@ -4632,10 +4660,10 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*getContext()),
+ Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
TLI->getPointerTy(getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -4643,9 +4671,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
return CallResult.second;
}
-SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVol, bool isTailCall,
+SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
@@ -4693,10 +4721,10 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
- Type::getVoidTy(*getContext()),
+ Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
TLI->getPointerTy(getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -4704,9 +4732,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
return CallResult.second;
}
-SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVol, bool isTailCall,
+SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
@@ -4755,10 +4783,10 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
- Type::getVoidTy(*getContext()),
+ Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
TLI->getPointerTy(getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -4766,7 +4794,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
return CallResult.second;
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
MachineMemOperand *MMO,
AtomicOrdering SuccessOrdering,
@@ -4777,41 +4805,31 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
AddNodeIDNode(ID, Opcode, VTList, Ops);
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void* IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<AtomicSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- // Allocate the operands array for the node out of the BumpPtrAllocator, since
- // SDNode doesn't have access to it. This memory will be "leaked" when
- // the node is deallocated, but recovered when the allocator is released.
- // If the number of operands is less than 5 we use AtomicSDNode's internal
- // storage.
- unsigned NumOps = Ops.size();
- SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate<SDUse>(NumOps)
- : nullptr;
-
- SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTList, MemVT,
- Ops.data(), DynOps, NumOps, MMO,
- SuccessOrdering, FailureOrdering,
- SynchScope);
+ auto *N = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO, SuccessOrdering,
+ FailureOrdering, SynchScope);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO,
- AtomicOrdering Ordering,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
SynchronizationScope SynchScope) {
return getAtomic(Opcode, dl, MemVT, VTList, Ops, MMO, Ordering,
Ordering, SynchScope);
}
SDValue SelectionDAG::getAtomicCmpSwap(
- unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain,
+ unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment, AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) {
@@ -4826,10 +4844,8 @@ SDValue SelectionDAG::getAtomicCmpSwap(
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- unsigned Flags = MachineMemOperand::MOVolatile;
- Flags |= MachineMemOperand::MOLoad;
- Flags |= MachineMemOperand::MOStore;
-
+ auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
@@ -4837,9 +4853,9 @@ SDValue SelectionDAG::getAtomicCmpSwap(
SuccessOrdering, FailureOrdering, SynchScope);
}
-SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT,
- SDVTList VTs, SDValue Chain, SDValue Ptr,
- SDValue Cmp, SDValue Swp,
+SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
+ EVT MemVT, SDVTList VTs, SDValue Chain,
+ SDValue Ptr, SDValue Cmp, SDValue Swp,
MachineMemOperand *MMO,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
@@ -4853,11 +4869,9 @@ SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT,
SuccessOrdering, FailureOrdering, SynchScope);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
- SDValue Chain,
- SDValue Ptr, SDValue Val,
- const Value* PtrVal,
- unsigned Alignment,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDValue Chain, SDValue Ptr, SDValue Val,
+ const Value *PtrVal, unsigned Alignment,
AtomicOrdering Ordering,
SynchronizationScope SynchScope) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
@@ -4870,7 +4884,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
// chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- unsigned Flags = MachineMemOperand::MOVolatile;
+ auto Flags = MachineMemOperand::MOVolatile;
if (Opcode != ISD::ATOMIC_STORE)
Flags |= MachineMemOperand::MOLoad;
if (Opcode != ISD::ATOMIC_LOAD)
@@ -4884,11 +4898,9 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
Ordering, SynchScope);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
- SDValue Chain,
- SDValue Ptr, SDValue Val,
- MachineMemOperand *MMO,
- AtomicOrdering Ordering,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDValue Chain, SDValue Ptr, SDValue Val,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
SynchronizationScope SynchScope) {
assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
Opcode == ISD::ATOMIC_LOAD_SUB ||
@@ -4912,11 +4924,9 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
- EVT VT, SDValue Chain,
- SDValue Ptr,
- MachineMemOperand *MMO,
- AtomicOrdering Ordering,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
SynchronizationScope SynchScope) {
assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
@@ -4926,7 +4936,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
}
/// getMergeValues - Create a MERGE_VALUES node from the given operands.
-SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, SDLoc dl) {
+SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
if (Ops.size() == 1)
return Ops[0];
@@ -4937,17 +4947,15 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, SDLoc dl) {
return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops);
}
-SDValue
-SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
- ArrayRef<SDValue> Ops,
- EVT MemVT, MachinePointerInfo PtrInfo,
- unsigned Align, bool Vol,
- bool ReadMem, bool WriteMem, unsigned Size) {
+SDValue SelectionDAG::getMemIntrinsicNode(
+ unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
+ EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol,
+ bool ReadMem, bool WriteMem, unsigned Size) {
if (Align == 0) // Ensure that codegen never sees alignment 0
Align = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- unsigned Flags = 0;
+ auto Flags = MachineMemOperand::MONone;
if (WriteMem)
Flags |= MachineMemOperand::MOStore;
if (ReadMem)
@@ -4962,10 +4970,10 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
-SDValue
-SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
- ArrayRef<SDValue> Ops, EVT MemVT,
- MachineMemOperand *MMO) {
+SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
+ SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO) {
assert((Opcode == ISD::INTRINSIC_VOID ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::PREFETCH ||
@@ -4982,19 +4990,20 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
AddNodeIDNode(ID, Opcode, VTList, Ops);
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTList, Ops,
- MemVT, MMO);
- CSEMap.InsertNode(N, IP);
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTList, Ops,
- MemVT, MMO);
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
@@ -5032,50 +5041,40 @@ static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr,
// If the 'Offset' value isn't a constant, we can't handle this.
if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue());
- if (OffsetOp.getOpcode() == ISD::UNDEF)
+ if (OffsetOp.isUndef())
return InferPointerInfo(DAG, Ptr);
return MachinePointerInfo();
}
-
-SDValue
-SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
- EVT VT, SDLoc dl, SDValue Chain,
- SDValue Ptr, SDValue Offset,
- MachinePointerInfo PtrInfo, EVT MemVT,
- bool isVolatile, bool isNonTemporal, bool isInvariant,
- unsigned Alignment, const AAMDNodes &AAInfo,
- const MDNode *Ranges) {
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(VT);
- unsigned Flags = MachineMemOperand::MOLoad;
- if (isVolatile)
- Flags |= MachineMemOperand::MOVolatile;
- if (isNonTemporal)
- Flags |= MachineMemOperand::MONonTemporal;
- if (isInvariant)
- Flags |= MachineMemOperand::MOInvariant;
-
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
// If we don't have a PtrInfo, infer the trivial frame index case to simplify
// clients.
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr, Offset);
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
- AAInfo, Ranges);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
}
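The signature change above ripples through every call site; a hedged sketch of the migration follows (the wrapper name is invented, and the call forms mirror patterns visible elsewhere in this patch, e.g. in expandVAArg and getMemmoveLoadsAndStores):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue loadPointerSketch(SelectionDAG &DAG, const SDLoc &dl,
                                 SDValue Chain, SDValue Ptr, const Value *V,
                                 bool Volatile) {
  EVT PtrVT = Ptr.getValueType();
  auto Flags = Volatile ? MachineMemOperand::MOVolatile
                        : MachineMemOperand::MONone;
  // Old form: DAG.getLoad(PtrVT, dl, Chain, Ptr, MachinePointerInfo(V),
  //                       false, false, false, 0);
  // New form: alignment plus explicit MachineMemOperand flags.
  return DAG.getLoad(PtrVT, dl, Chain, Ptr, MachinePointerInfo(V),
                     /*Alignment=*/0, Flags);
}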
-SDValue
-SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
- EVT VT, SDLoc dl, SDValue Chain,
- SDValue Ptr, SDValue Offset, EVT MemVT,
- MachineMemOperand *MMO) {
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset, EVT MemVT,
+ MachineMemOperand *MMO) {
if (VT == MemVT) {
ExtType = ISD::NON_EXTLOAD;
} else if (ExtType == ISD::NON_EXTLOAD) {
@@ -5094,8 +5093,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
}
bool Indexed = AM != ISD::UNINDEXED;
- assert((Indexed || Offset.getOpcode() == ISD::UNDEF) &&
- "Unindexed load with an offset!");
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
SDVTList VTs = Indexed ?
getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
@@ -5108,100 +5106,90 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<LoadSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(),
- dl.getDebugLoc(), VTs, AM, ExtType,
- MemVT, MMO);
+ auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ExtType, MemVT, MMO);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
- SDValue Chain, SDValue Ptr,
- MachinePointerInfo PtrInfo,
- bool isVolatile, bool isNonTemporal,
- bool isInvariant, unsigned Alignment,
- const AAMDNodes &AAInfo,
- const MDNode *Ranges) {
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
- PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment,
- AAInfo, Ranges);
+ PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
}
-SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
- SDValue Chain, SDValue Ptr,
- MachineMemOperand *MMO) {
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachineMemOperand *MMO) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
VT, MMO);
}
-SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
- SDValue Chain, SDValue Ptr,
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT MemVT,
- bool isVolatile, bool isNonTemporal,
- bool isInvariant, unsigned Alignment,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
SDValue Undef = getUNDEF(Ptr.getValueType());
- return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
- PtrInfo, MemVT, isVolatile, isNonTemporal, isInvariant,
- Alignment, AAInfo);
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo,
+ MemVT, Alignment, MMOFlags, AAInfo);
}
-
-SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
- SDValue Chain, SDValue Ptr, EVT MemVT,
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
MemVT, MMO);
}
-SDValue
-SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base,
- SDValue Offset, ISD::MemIndexedMode AM) {
+SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
- assert(LD->getOffset().getOpcode() == ISD::UNDEF &&
- "Load is already a indexed load!");
+  assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
+ // Don't propagate the invariant flag.
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
LD->getChain(), Base, Offset, LD->getPointerInfo(),
- LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
- false, LD->getAlignment());
+ LD->getMemoryVT(), LD->getAlignment(), MMOFlags);
}
-SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
- bool isVolatile, bool isNonTemporal,
- unsigned Alignment, const AAMDNodes &AAInfo) {
- assert(Chain.getValueType() == MVT::Other &&
- "Invalid chain type");
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(Val.getValueType());
- unsigned Flags = MachineMemOperand::MOStore;
- if (isVolatile)
- Flags |= MachineMemOperand::MOVolatile;
- if (isNonTemporal)
- Flags |= MachineMemOperand::MONonTemporal;
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr);
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags,
- Val.getValueType().getStoreSize(), Alignment,
- AAInfo);
-
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
return getStore(Chain, dl, Val, Ptr, MMO);
}
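The store side follows the same pattern. Note that getStore ORs MachineMemOperand::MOStore in itself and asserts that MOLoad is not set, so callers supply only modifier flags. A sketch, assuming an IR StoreInst *SI and the usual lowering locals are in scope (names illustrative):

auto MMOFlags = MachineMemOperand::MONone;
if (SI->isVolatile())
  MMOFlags |= MachineMemOperand::MOVolatile;
if (SI->getMetadata(LLVMContext::MD_nontemporal) != nullptr)
  MMOFlags |= MachineMemOperand::MONonTemporal;
// MOStore is added internally; passing MOLoad here would trip the new assert.
SDValue St = DAG.getStore(Chain, dl, Val, Ptr, PtrInfo, Alignment, MMOFlags);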
-SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachineMemOperand *MMO) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
@@ -5216,46 +5204,42 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
MMO->isNonTemporal(), MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
- dl.getDebugLoc(), VTs,
- ISD::UNINDEXED, false, VT, MMO);
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, false, VT, MMO);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
- EVT SVT,bool isVolatile, bool isNonTemporal,
- unsigned Alignment,
+ EVT SVT, unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(SVT);
- unsigned Flags = MachineMemOperand::MOStore;
- if (isVolatile)
- Flags |= MachineMemOperand::MOVolatile;
- if (isNonTemporal)
- Flags |= MachineMemOperand::MONonTemporal;
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr);
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment,
- AAInfo);
-
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
}
-SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, EVT SVT,
MachineMemOperand *MMO) {
EVT VT = Val.getValueType();
@@ -5285,24 +5269,24 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
MMO->isNonTemporal(), MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
- dl.getDebugLoc(), VTs,
- ISD::UNINDEXED, true, SVT, MMO);
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, SVT, MMO);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue
-SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
- SDValue Offset, ISD::MemIndexedMode AM) {
+SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
- assert(ST->getOffset().getOpcode() == ISD::UNDEF &&
- "Store is already a indexed store!");
+  assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
FoldingSetNodeID ID;
@@ -5311,23 +5295,23 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
ID.AddInteger(ST->getRawSubclassData());
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
- dl.getDebugLoc(), VTs, AM,
- ST->isTruncatingStore(),
- ST->getMemoryVT(),
- ST->getMemOperand());
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ST->isTruncatingStore(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue
-SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
- SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT,
- MachineMemOperand *MMO, ISD::LoadExtType ExtTy) {
+SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue Src0,
+ EVT MemVT, MachineMemOperand *MMO,
+ ISD::LoadExtType ExtTy) {
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
@@ -5340,21 +5324,23 @@ SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
- dl.getDebugLoc(), Ops, 4, VTs,
- ExtTy, MemVT, MMO);
+ auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ExtTy, MemVT, MMO);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
- SDValue Ptr, SDValue Mask, EVT MemVT,
- MachineMemOperand *MMO, bool isTrunc) {
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ EVT MemVT, MachineMemOperand *MMO,
+ bool isTrunc) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
@@ -5367,22 +5353,23 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
MMO->isNonTemporal(), MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
- dl.getDebugLoc(), Ops, 4,
- VTs, isTrunc, MemVT, MMO);
+ auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ isTrunc, MemVT, MMO);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue
-SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, SDLoc dl,
- ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) {
+SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) {
+ assert(Ops.size() == 5 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
@@ -5393,21 +5380,34 @@ SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, SDLoc dl,
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- MaskedGatherSDNode *N =
- new (NodeAllocator) MaskedGatherSDNode(dl.getIROrder(), dl.getDebugLoc(),
- Ops, VTs, VT, MMO);
+
+ auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, VT, MMO);
+ createOperands(N, Ops);
+
+ assert(N->getValue().getValueType() == N->getValueType(0) &&
+ "Incompatible type of the PassThru value in MaskedGatherSDNode");
+ assert(N->getMask().getValueType().getVectorNumElements() ==
+ N->getValueType(0).getVectorNumElements() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorNumElements() ==
+ N->getValueType(0).getVectorNumElements() &&
+ "Vector width mismatch between index and data");
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
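The new asserts spell out the operand contract for MGATHER nodes. A sketch of a conforming call, with the operand order written the way the accessors used in the asserts read it back (all value names here are illustrative):

// Exactly five operands: { Chain, PassThru, Mask, BasePtr, Index }.
//   PassThru must have the result vector type; Mask and Index must have
//   as many elements as the result.
SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index };
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, dl,
                                     Ops, MMO);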
-SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, SDLoc dl,
+SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
+ assert(Ops.size() == 5 && "Incompatible number of operands");
+
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
ID.AddInteger(VT.getRawBits());
@@ -5416,27 +5416,33 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, SDLoc dl,
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, dl.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedScatterSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N =
- new (NodeAllocator) MaskedScatterSDNode(dl.getIROrder(), dl.getDebugLoc(),
- Ops, VTs, VT, MMO);
+ auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, VT, MMO);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorNumElements() ==
+ N->getValue().getValueType().getVectorNumElements() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorNumElements() ==
+ N->getValue().getValueType().getVectorNumElements() &&
+ "Vector width mismatch between index and data");
+
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
- SDValue Chain, SDValue Ptr,
- SDValue SV,
- unsigned Align) {
+SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue SV, unsigned Align) {
SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops) {
switch (Ops.size()) {
case 0: return getNode(Opcode, DL, VT);
@@ -5452,7 +5458,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
return getNode(Opcode, DL, VT, NewOps);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) {
unsigned NumOps = Ops.size();
switch (NumOps) {
@@ -5498,27 +5504,28 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
- VTs, Ops);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
- VTs, Ops);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
return getNode(Opcode, DL, getVTList(ResultTys), Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops) {
if (VTList.NumVTs == 1)
return getNode(Opcode, DL, VTList.VTs[0], Ops);
@@ -5548,83 +5555,56 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
// Memoize the node unless it returns a flag.
SDNode *N;
- unsigned NumOps = Ops.size();
if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
- if (NumOps == 1) {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0]);
- } else if (NumOps == 2) {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0],
- Ops[1]);
- } else if (NumOps == 3) {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0],
- Ops[1], Ops[2]);
- } else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
- VTList, Ops);
- }
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
- if (NumOps == 1) {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0]);
- } else if (NumOps == 2) {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0],
- Ops[1]);
- } else if (NumOps == 3) {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTList, Ops[0],
- Ops[1], Ops[2]);
- } else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
- VTList, Ops);
- }
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTList) {
return getNode(Opcode, DL, VTList, None);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1) {
SDValue Ops[] = { N1 };
return getNode(Opcode, DL, VTList, Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2) {
SDValue Ops[] = { N1, N2 };
return getNode(Opcode, DL, VTList, Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2, SDValue N3) {
SDValue Ops[] = { N1, N2, N3 };
return getNode(Opcode, DL, VTList, Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
- SDValue N1, SDValue N2, SDValue N3,
- SDValue N4) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
SDValue Ops[] = { N1, N2, N3, N4 };
return getNode(Opcode, DL, VTList, Ops);
}
-SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
- SDValue N1, SDValue N2, SDValue N3,
- SDValue N4, SDValue N5) {
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
SDValue Ops[] = { N1, N2, N3, N4, N5 };
return getNode(Opcode, DL, VTList, Ops);
}
@@ -5932,10 +5912,14 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
SDVTList VTs,ArrayRef<SDValue> Ops) {
- N = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
+ SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
// Reset the NodeID to -1.
- N->setNodeId(-1);
- return N;
+ New->setNodeId(-1);
+ if (New != N) {
+ ReplaceAllUsesWith(N, New);
+ RemoveDeadNode(N);
+ }
+ return New;
}
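MorphNodeTo can CSE into an already-existing equivalent node, so SelectNodeTo now hands back that node and retires the original itself. A hedged sketch of the caller-side consequence (instruction-selection context, opcode and operands assumed, not shown in this patch):

SDNode *New = CurDAG->SelectNodeTo(N, MachineOpc, VTs, Ops);
// Do not touch N past this point: if CSE found an equivalent node, N has
// already been replaced and deleted; New is the node to keep using.
return New;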
/// UpdadeSDLocOnMergedSDNode - If the opt level is -O0 then it throws away
@@ -5945,7 +5929,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
/// probability having other instructions associated with that line.
///
/// For IROrder, we keep the smaller of the two
-SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) {
+SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, const SDLoc &OLoc) {
DebugLoc NLoc = N->getDebugLoc();
if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
N->setDebugLoc(DebugLoc());
@@ -5973,13 +5957,12 @@ SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) {
/// deleting things.
SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
SDVTList VTs, ArrayRef<SDValue> Ops) {
- unsigned NumOps = Ops.size();
// If an identical node already exists, use it.
void *IP = nullptr;
if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, VTs, Ops);
- if (SDNode *ON = FindNodeOrInsertPos(ID, N->getDebugLoc(), IP))
+ if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N));
}
@@ -6002,36 +5985,13 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
DeadNodeSet.insert(Used);
}
- if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N)) {
- // Initialize the memory references information.
+  // For a MachineSDNode, initialize the memory reference information.
+ if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
MN->setMemRefs(nullptr, nullptr);
- // If NumOps is larger than the # of operands we can have in a
- // MachineSDNode, reallocate the operand list.
- if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) {
- if (MN->OperandsNeedDelete)
- delete[] MN->OperandList;
- if (NumOps > array_lengthof(MN->LocalOperands))
- // We're creating a final node that will live unmorphed for the
- // remainder of the current SelectionDAG iteration, so we can allocate
- // the operands directly out of a pool with no recycling metadata.
- MN->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
- Ops.data(), NumOps);
- else
- MN->InitOperands(MN->LocalOperands, Ops.data(), NumOps);
- MN->OperandsNeedDelete = false;
- } else
- MN->InitOperands(MN->OperandList, Ops.data(), NumOps);
- } else {
- // If NumOps is larger than the # of operands we currently have, reallocate
- // the operand list.
- if (NumOps > N->NumOperands) {
- if (N->OperandsNeedDelete)
- delete[] N->OperandList;
- N->InitOperands(new SDUse[NumOps], Ops.data(), NumOps);
- N->OperandsNeedDelete = true;
- } else
- N->InitOperands(N->OperandList, Ops.data(), NumOps);
- }
+
+ // Swap for an appropriately sized array from the recycler.
+ removeOperands(N);
+ createOperands(N, Ops);
// Delete any nodes that are still dead after adding the uses for the
// new operands.
@@ -6055,155 +6015,133 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
/// Note that getMachineNode returns the resultant node. If there is already a
/// node of the specified opcode and operands, it returns that node instead of
/// the current one.
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT) {
SDVTList VTs = getVTList(VT);
return getMachineNode(Opcode, dl, VTs, None);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
- SDValue Op1, SDValue Op2) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
- SDValue Op1, SDValue Op2, SDValue Op3) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2,
+ SDValue Op3) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
- ArrayRef<SDValue> Ops) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT);
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2) {
SDVTList VTs = getVTList(VT1, VT2);
return getMachineNode(Opcode, dl, VTs, None);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, SDValue Op1) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, SDValue Op1,
- SDValue Op2, SDValue Op3) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2,
- ArrayRef<SDValue> Ops) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2,
+ ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2);
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, EVT VT3,
- SDValue Op1, SDValue Op2) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, EVT VT3,
- SDValue Op1, SDValue Op2, SDValue Op3) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- EVT VT1, EVT VT2, EVT VT3,
- ArrayRef<SDValue> Ops) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1,
- EVT VT2, EVT VT3, EVT VT4,
- ArrayRef<SDValue> Ops) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3, EVT VT4,
+ ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
- ArrayRef<EVT> ResultTys,
- ArrayRef<SDValue> Ops) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ ArrayRef<EVT> ResultTys,
+ ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(ResultTys);
return getMachineNode(Opcode, dl, VTs, Ops);
}
-MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
- ArrayRef<SDValue> OpsArray) {
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTs,
+ ArrayRef<SDValue> Ops) {
bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
MachineSDNode *N;
void *IP = nullptr;
- const SDValue *Ops = OpsArray.data();
- unsigned NumOps = OpsArray.size();
if (DoCSE) {
FoldingSetNodeID ID;
- AddNodeIDNode(ID, ~Opcode, VTs, OpsArray);
+ AddNodeIDNode(ID, ~Opcode, VTs, Ops);
IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
return cast<MachineSDNode>(UpdadeSDLocOnMergedSDNode(E, DL));
}
}
// Allocate a new MachineSDNode.
- N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs);
-
- // Initialize the operands list.
- if (NumOps > array_lengthof(N->LocalOperands))
- // We're creating a final node that will live unmorphed for the
- // remainder of the current SelectionDAG iteration, so we can allocate
- // the operands directly out of a pool with no recycling metadata.
- N->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
- Ops, NumOps);
- else
- N->InitOperands(N->LocalOperands, Ops, NumOps);
- N->OperandsNeedDelete = false;
+ N = newSDNode<MachineSDNode>(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
if (DoCSE)
CSEMap.InsertNode(N, IP);
@@ -6214,9 +6152,8 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
/// getTargetExtractSubreg - A convenience function for creating
/// TargetOpcode::EXTRACT_SUBREG nodes.
-SDValue
-SelectionDAG::getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT,
- SDValue Operand) {
+SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand) {
SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
VT, Operand, SRIdxVal);
@@ -6225,9 +6162,8 @@ SelectionDAG::getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT,
/// getTargetInsertSubreg - A convenience function for creating
/// TargetOpcode::INSERT_SUBREG nodes.
-SDValue
-SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT,
- SDValue Operand, SDValue Subreg) {
+SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand, SDValue Subreg) {
SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
VT, Operand, Subreg, SRIdxVal);
@@ -6243,7 +6179,7 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
if (Flags)
E->intersectFlagsWith(Flags);
return E;
@@ -6257,7 +6193,7 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
/// SDNode
SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N,
unsigned R, bool IsIndirect, uint64_t Off,
- DebugLoc DL, unsigned O) {
+ const DebugLoc &DL, unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc())
@@ -6267,7 +6203,7 @@ SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N,
/// Constant
SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr,
const Value *C, uint64_t Off,
- DebugLoc DL, unsigned O) {
+ const DebugLoc &DL, unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, Off, DL, O);
@@ -6276,7 +6212,8 @@ SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr,
/// FrameIndex
SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr,
unsigned FI, uint64_t Off,
- DebugLoc DL, unsigned O) {
+ const DebugLoc &DL,
+ unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, Off, DL, O);
@@ -6348,6 +6285,9 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
AddModifiedNodeToCSEMaps(User);
}
+ // Preserve Debug Values
+ TransferDbgValues(FromN, To);
+
// If we just RAUW'd the root, take note.
if (FromN == getRoot())
setRoot(To);
@@ -6371,6 +6311,13 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
if (From == To)
return;
+ // Preserve Debug Info. Only do this if there's a use.
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ if (From->hasAnyUseOfValue(i)) {
+ assert((i < To->getNumValues()) && "Invalid To location");
+ TransferDbgValues(SDValue(From, i), SDValue(To, i));
+ }
+
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
@@ -6410,6 +6357,10 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
if (From->getNumValues() == 1) // Handle the simple case efficiently.
return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
+ // Preserve Debug Info.
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ TransferDbgValues(SDValue(From, i), *To);
+
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
@@ -6454,6 +6405,9 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
return;
}
+ // Preserve Debug Info.
+ TransferDbgValues(From, To);
+
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From.getNode()->use_begin(),
@@ -6528,6 +6482,8 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
if (Num == 1)
return ReplaceAllUsesOfValueWith(*From, *To);
+ TransferDbgValues(*From, *To);
+
// Read up all the uses and make records of them. This helps
// processing new uses that are introduced during the
// replacement process.
@@ -6628,7 +6584,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
if (Degree == 0) {
        // All of P's operands are sorted, so P may be sorted now.
P->setNodeId(DAGSize++);
- if (P != SortedPos)
+ if (P->getIterator() != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
@@ -6637,7 +6593,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
P->setNodeId(Degree);
}
}
- if (&Node == SortedPos) {
+ if (Node.getIterator() == SortedPos) {
#ifndef NDEBUG
allnodes_iterator I(N);
SDNode *S = &*++I;
@@ -6676,28 +6632,26 @@ void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
DbgInfo->add(DB, SD, isParameter);
}
-/// TransferDbgValues - Transfer SDDbgValues.
+/// TransferDbgValues - Transfer SDDbgValues. Called when nodes are replaced.
void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
if (From == To || !From.getNode()->getHasDebugValue())
return;
SDNode *FromNode = From.getNode();
SDNode *ToNode = To.getNode();
ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode);
- SmallVector<SDDbgValue *, 2> ClonedDVs;
for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end();
I != E; ++I) {
SDDbgValue *Dbg = *I;
- if (Dbg->getKind() == SDDbgValue::SDNODE) {
+    // Only add SDDbgValues attached to the same ResNo.
+ if (Dbg->getKind() == SDDbgValue::SDNODE &&
+ Dbg->getResNo() == From.getResNo()) {
SDDbgValue *Clone =
getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode,
To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(),
Dbg->getDebugLoc(), Dbg->getOrder());
- ClonedDVs.push_back(Clone);
+ AddDbgValue(Clone, ToNode, false);
}
}
- for (SmallVectorImpl<SDDbgValue *>::iterator I = ClonedDVs.begin(),
- E = ClonedDVs.end(); I != E; ++I)
- AddDbgValue(*I, ToNode, false);
}
//===----------------------------------------------------------------------===//
@@ -6724,26 +6678,31 @@ bool llvm::isOneConstant(SDValue V) {
return Const != nullptr && Const->isOne();
}
+bool llvm::isBitwiseNot(SDValue V) {
+ return V.getOpcode() == ISD::XOR && isAllOnesConstant(V.getOperand(1));
+}
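A small illustration of what the new predicate matches: the canonical xor-with-all-ones form, with the constant as the second operand. The combine shown is hypothetical and only demonstrates the predicate:

// Fold (and X, (xor Y, -1)) by recognizing the inverted operand.
if (N->getOpcode() == ISD::AND && isBitwiseNot(N->getOperand(1))) {
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1).getOperand(0); // the value being inverted
  // ... build a target-specific ANDN/BIC-style node from X and Y here ...
}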
+
HandleSDNode::~HandleSDNode() {
DropOperands();
}
GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
- DebugLoc DL, const GlobalValue *GA,
- EVT VT, int64_t o, unsigned char TF)
- : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+ const DebugLoc &DL,
+ const GlobalValue *GA, EVT VT,
+ int64_t o, unsigned char TF)
+ : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
TheGlobal = GA;
}
-AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT,
- SDValue X, unsigned SrcAS,
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
+ EVT VT, unsigned SrcAS,
unsigned DestAS)
- : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X),
- SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+ : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
+ SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
-MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
- EVT memvt, MachineMemOperand *mmo)
- : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
+ : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
MMO->isNonTemporal(), MMO->isInvariant());
assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
@@ -6755,16 +6714,6 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
}
-MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
- ArrayRef<SDValue> Ops, EVT memvt, MachineMemOperand *mmo)
- : SDNode(Opc, Order, dl, VTs, Ops),
- MemoryVT(memvt), MMO(mmo) {
- SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
- MMO->isNonTemporal(), MMO->isInvariant());
- assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
- assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
-}
-
/// Profile - Gather unique data for the node.
///
void SDNode::Profile(FoldingSetNodeID &ID) const {
@@ -6894,44 +6843,13 @@ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
return false;
}
-/// hasPredecessor - Return true if N is a predecessor of this node.
-/// N is either an operand of this node, or can be reached by recursively
-/// traversing up the operands.
-/// NOTE: This is an expensive method. Use it carefully.
bool SDNode::hasPredecessor(const SDNode *N) const {
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(this);
return hasPredecessorHelper(N, Visited, Worklist);
}
-bool
-SDNode::hasPredecessorHelper(const SDNode *N,
- SmallPtrSetImpl<const SDNode *> &Visited,
- SmallVectorImpl<const SDNode *> &Worklist) const {
- if (Visited.empty()) {
- Worklist.push_back(this);
- } else {
- // Take a look in the visited set. If we've already encountered this node
- // we needn't search further.
- if (Visited.count(N))
- return true;
- }
-
- // Haven't visited N yet. Continue the search.
- while (!Worklist.empty()) {
- const SDNode *M = Worklist.pop_back_val();
- for (const SDValue &OpV : M->op_values()) {
- SDNode *Op = OpV.getNode();
- if (Visited.insert(Op).second)
- Worklist.push_back(Op);
- if (Op == N)
- return true;
- }
- }
-
- return false;
-}
-
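hasPredecessor now seeds the worklist before delegating, so code that calls the helper directly (to share Visited/Worklist state across several reachability queries) must do the same seeding itself. A sketch, assuming the helper remains reachable through SDNode after moving out of this file (variable names illustrative):

SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(Root); // the helper no longer seeds itself
// Sharing Visited/Worklist lets later queries reuse earlier traversal work.
bool AIsPred = Root->hasPredecessorHelper(A, Visited, Worklist);
bool BIsPred = Root->hasPredecessorHelper(B, Visited, Worklist);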
uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
assert(Num < NumOperands && "Invalid child # of SDNode!");
return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
@@ -7018,12 +6936,14 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
EVT::getVectorVT(*getContext(), EltVT, ResNE), Scalars);
}
-
-/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a
-/// location that is 'Dist' units away from the location that the 'Base' load
-/// is loading from.
-bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
- unsigned Bytes, int Dist) const {
+bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
+ LoadSDNode *Base,
+ unsigned Bytes,
+ int Dist) const {
+ if (LD->isVolatile() || Base->isVolatile())
+ return false;
+ if (LD->isIndexed() || Base->isIndexed())
+ return false;
if (LD->getChain() != Base->getChain())
return false;
EVT VT = LD->getValueType(0);
@@ -7204,7 +7124,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
SDValue OpVal = getOperand(i);
unsigned BitPos = j * EltBitSize;
- if (OpVal.getOpcode() == ISD::UNDEF)
+ if (OpVal.isUndef())
SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
@@ -7250,7 +7170,7 @@ SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
SDValue Splatted;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
SDValue Op = getOperand(i);
- if (Op.getOpcode() == ISD::UNDEF) {
+ if (Op.isUndef()) {
if (UndefElements)
(*UndefElements)[i] = true;
} else if (!Splatted) {
@@ -7261,7 +7181,7 @@ SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
}
if (!Splatted) {
- assert(getOperand(0).getOpcode() == ISD::UNDEF &&
+ assert(getOperand(0).isUndef() &&
"Can only have a splat without a constant for all undefs.");
return getOperand(0);
}
@@ -7286,7 +7206,7 @@ BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
bool IsExact;
APSInt IntVal(BitWidth);
- APFloat APF = CN->getValueAPF();
+ const APFloat &APF = CN->getValueAPF();
if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
APFloat::opOK ||
!IsExact)
@@ -7322,6 +7242,22 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
return true;
}
+// \brief Returns the SDNode if it is a constant integer BuildVector
+// or constant integer.
+SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
+ if (isa<ConstantSDNode>(N))
+ return N.getNode();
+ if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
+ return N.getNode();
+ // Treat a GlobalAddress supporting constant offset folding as a
+ // constant integer.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+ if (GA->getOpcode() == ISD::GlobalAddress &&
+ TLI->isOffsetFoldingLegal(GA))
+ return GA;
+ return nullptr;
+}
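A common use of this helper is canonicalizing commutative nodes so that constants (scalar or constant build-vector) end up as the second operand, letting later folds check only one side. A hedged sketch, assuming a combine with DAG and a commutative binary node N in scope:

// Swap operands when only the LHS is a constant int / constant build_vector.
SDValue N1 = N->getOperand(0), N2 = N->getOperand(1);
if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
    !DAG.isConstantIntBuildVectorOrConstantInt(N2))
  return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N2, N1);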
+
#ifndef NDEBUG
static void checkForCyclesHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode*> &Visited,
@@ -7353,9 +7289,9 @@ void llvm::checkForCycles(const llvm::SDNode *N,
bool force) {
#ifndef NDEBUG
bool check = force;
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
check = true;
-#endif // XDEBUG
+#endif // EXPENSIVE_CHECKS
if (check) {
assert(N && "Checking nonexistent SDNode");
SmallPtrSet<const SDNode*, 32> visited;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 45ae39af7600..e03282cad6b8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -20,9 +20,11 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
@@ -34,6 +36,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
@@ -42,6 +45,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
@@ -61,7 +65,6 @@
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <utility>
@@ -84,6 +87,19 @@ static cl::opt<bool>
EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
cl::desc("Enable fast-math-flags for DAG nodes"));
+/// Minimum jump table density for normal functions.
+static cl::opt<unsigned>
+JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "a normal function"));
+
+/// Minimum jump table density for -Os or -Oz functions.
+static cl::opt<unsigned>
+OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "an optsize function"));
+
+
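These options only set thresholds; one illustrative way such a density check could be applied (the helper name and formula below are a sketch, not necessarily the heuristic this file actually uses):

// A table over a case range of size Range with NumCases populated entries is
// dense enough when the populated percentage meets the configured minimum.
static bool isDenseEnoughForJumpTable(uint64_t NumCases, uint64_t Range,
                                      bool OptForSize) {
  unsigned MinDensity = OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
  return NumCases * 100 >= Range * MinDensity; // NumCases/Range >= MinDensity%
}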
// Limit the width of DAG chains. This is important in general to prevent
// DAG-based analysis from blowing up. For example, alias analysis and
// load clustering may not complete in reasonable time. It is difficult to
@@ -94,26 +110,25 @@ EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
// MaxParallelChains default is arbitrarily high to avoid affecting
// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
// sequence over this should have been converted to llvm.memcpy by the
-// frontend. It easy to induce this behavior with .ll code such as:
+// frontend. It is easy to induce this behavior with .ll code such as:
// %buffer = alloca [4096 x i8]
// %data = load [4096 x i8]* %argPtr
// store [4096 x i8] %data, [4096 x i8]* %buffer
static const unsigned MaxParallelChains = 64;
-static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V);
/// getCopyFromParts - Create a value that contains the specified legal parts
/// combined into the value they represent. If the parts combine to a type
-/// larger then ValueVT then AssertOp can be used to specify whether the extra
+/// larger than ValueVT then AssertOp can be used to specify whether the extra
/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
/// (ISD::AssertSext).
-static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
- const SDValue *Parts,
- unsigned NumParts, MVT PartVT, EVT ValueVT,
- const Value *V,
- ISD::NodeType AssertOp = ISD::DELETED_NODE) {
+static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V,
+ Optional<ISD::NodeType> AssertOp = None) {
if (ValueVT.isVector())
return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
PartVT, ValueVT, V);
@@ -193,6 +208,8 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
}
// There is now one part, held in Val. Correct it to match ValueVT.
+ // PartEVT is the type of the register class that holds the value.
+ // ValueVT is the type of the inline asm operation.
EVT PartEVT = Val.getValueType();
if (PartEVT == ValueVT)
@@ -206,13 +223,18 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
}
+ // Handle types that have the same size.
+ if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // Handle types with different sizes.
if (PartEVT.isInteger() && ValueVT.isInteger()) {
if (ValueVT.bitsLT(PartEVT)) {
// For a truncate, see if we have any information to
// indicate whether the truncated bits will always be
// zero or sign-extension.
- if (AssertOp != ISD::DELETED_NODE)
- Val = DAG.getNode(AssertOp, DL, PartEVT, Val,
+ if (AssertOp.hasValue())
+ Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
DAG.getValueType(ValueVT));
return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
}
@@ -229,9 +251,6 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
}
- if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
- return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
-
llvm_unreachable("Unknown mismatch!");
}
@@ -251,10 +270,10 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
/// getCopyFromPartsVector - Create a value that contains the specified legal
/// parts combined into the value they represent. If the parts combine to a
-/// type larger then ValueVT then AssertOp can be used to specify whether the
+/// type larger than ValueVT then AssertOp can be used to specify whether the
/// extra bits are known to be zero (ISD::AssertZext) or sign extended from
/// ValueVT (ISD::AssertSext).
-static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V) {
assert(ValueVT.isVector() && "Not a vector value");
@@ -353,16 +372,16 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
}
-static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc dl,
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
SDValue Val, SDValue *Parts, unsigned NumParts,
MVT PartVT, const Value *V);
/// getCopyToParts - Create a series of nodes that contain the specified value
/// split into legal parts. If the parts contain more bits than Val, then, for
/// integers, ExtendKind can be used to specify how to generate the extra bits.
-static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
- SDValue Val, SDValue *Parts, unsigned NumParts,
- MVT PartVT, const Value *V,
+static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ const Value *V,
ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
EVT ValueVT = Val.getValueType();
@@ -427,9 +446,11 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
"Failed to tile the value with PartVT!");
if (NumParts == 1) {
- if (PartEVT != ValueVT)
+ if (PartEVT != ValueVT) {
diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
"scalar-to-vector conversion failed");
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
Parts[0] = Val;
return;
@@ -489,7 +510,7 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
/// getCopyToPartsVector - Create a series of nodes that contain the specified
/// value split into legal parts.
-static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
SDValue Val, SDValue *Parts, unsigned NumParts,
MVT PartVT, const Value *V) {
EVT ValueVT = Val.getValueType();
@@ -618,9 +639,8 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
/// If the Flag pointer is NULL, no flag is used.
SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
FunctionLoweringInfo &FuncInfo,
- SDLoc dl,
- SDValue &Chain, SDValue *Flag,
- const Value *V) const {
+ const SDLoc &dl, SDValue &Chain,
+ SDValue *Flag, const Value *V) const {
// A Value with type {} or [0 x %t] needs no registers.
if (ValueVTs.empty())
return SDValue();
@@ -676,25 +696,33 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
// now, just use the tightest assertzext/assertsext possible.
bool isSExt = true;
EVT FromVT(MVT::Other);
- if (NumSignBits == RegSize)
- isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1
- else if (NumZeroBits >= RegSize-1)
- isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1
- else if (NumSignBits > RegSize-8)
- isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8
- else if (NumZeroBits >= RegSize-8)
- isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8
- else if (NumSignBits > RegSize-16)
- isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16
- else if (NumZeroBits >= RegSize-16)
- isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16
- else if (NumSignBits > RegSize-32)
- isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32
- else if (NumZeroBits >= RegSize-32)
- isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32
- else
+ if (NumSignBits == RegSize) {
+ isSExt = true; // ASSERT SEXT 1
+ FromVT = MVT::i1;
+ } else if (NumZeroBits >= RegSize - 1) {
+ isSExt = false; // ASSERT ZEXT 1
+ FromVT = MVT::i1;
+ } else if (NumSignBits > RegSize - 8) {
+ isSExt = true; // ASSERT SEXT 8
+ FromVT = MVT::i8;
+ } else if (NumZeroBits >= RegSize - 8) {
+ isSExt = false; // ASSERT ZEXT 8
+ FromVT = MVT::i8;
+ } else if (NumSignBits > RegSize - 16) {
+ isSExt = true; // ASSERT SEXT 16
+ FromVT = MVT::i16;
+ } else if (NumZeroBits >= RegSize - 16) {
+ isSExt = false; // ASSERT ZEXT 16
+ FromVT = MVT::i16;
+ } else if (NumSignBits > RegSize - 32) {
+ isSExt = true; // ASSERT SEXT 32
+ FromVT = MVT::i32;
+ } else if (NumZeroBits >= RegSize - 32) {
+ isSExt = false; // ASSERT ZEXT 32
+ FromVT = MVT::i32;
+ } else {
continue;
-
+ }
// Add an assertion node.
assert(FromVT != MVT::Other);
Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
@@ -714,8 +742,9 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
/// specified value into the registers specified by this object. This uses
/// Chain/Flag as the input and updates them for the output Chain/Flag.
/// If the Flag pointer is NULL, no flag is used.
-void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
- SDValue &Chain, SDValue *Flag, const Value *V,
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain, SDValue *Flag,
+ const Value *V,
ISD::NodeType PreferredExtendType) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
ISD::NodeType ExtendKind = PreferredExtendType;
@@ -770,7 +799,7 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
/// operand list. This adds the code marker and includes the number of
/// values added into it.
void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
- unsigned MatchingIdx, SDLoc dl,
+ unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG,
std::vector<SDValue> &Ops) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -902,10 +931,48 @@ SDValue SelectionDAGBuilder::getControlRoot() {
return Root;
}
+/// Copy swift error to the final virtual register at the end of a basic block, as
+/// specified by SwiftErrorWorklist, if necessary.
+static void copySwiftErrorsToFinalVRegs(SelectionDAGBuilder &SDB) {
+ const TargetLowering &TLI = SDB.DAG.getTargetLoweringInfo();
+ if (!TLI.supportSwiftError())
+ return;
+
+ if (!SDB.FuncInfo.SwiftErrorWorklist.count(SDB.FuncInfo.MBB))
+ return;
+
+  // Go through entries in SwiftErrorWorklist, and create copies as necessary.
+ FunctionLoweringInfo::SwiftErrorVRegs &WorklistEntry =
+ SDB.FuncInfo.SwiftErrorWorklist[SDB.FuncInfo.MBB];
+ FunctionLoweringInfo::SwiftErrorVRegs &MapEntry =
+ SDB.FuncInfo.SwiftErrorMap[SDB.FuncInfo.MBB];
+ for (unsigned I = 0, E = WorklistEntry.size(); I < E; I++) {
+ unsigned WorkReg = WorklistEntry[I];
+
+ // Find the swifterror virtual register for the value in SwiftErrorMap.
+ unsigned MapReg = MapEntry[I];
+ assert(TargetRegisterInfo::isVirtualRegister(MapReg) &&
+ "Entries in SwiftErrorMap should be virtual registers");
+
+ if (WorkReg == MapReg)
+ continue;
+
+ // Create copy from SwiftErrorMap to SwiftWorklist.
+ auto &DL = SDB.DAG.getDataLayout();
+ SDValue CopyNode = SDB.DAG.getCopyToReg(
+ SDB.getRoot(), SDB.getCurSDLoc(), WorkReg,
+ SDB.DAG.getRegister(MapReg, EVT(TLI.getPointerTy(DL))));
+ MapEntry[I] = WorkReg;
+ SDB.DAG.setRoot(CopyNode);
+ }
+}
+
void SelectionDAGBuilder::visit(const Instruction &I) {
// Set up outgoing PHI node register values before emitting the terminator.
- if (isa<TerminatorInst>(&I))
+ if (isa<TerminatorInst>(&I)) {
+ copySwiftErrorsToFinalVRegs(*this);
HandlePHINodesInSuccessorBlocks(I.getParent());
+ }
++SDNodeOrder;
@@ -992,10 +1059,8 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
// If there's a virtual register allocated and initialized for this
// value, use it.
- SDValue copyFromReg = getCopyFromRegs(V, V->getType());
- if (copyFromReg.getNode()) {
+ if (SDValue copyFromReg = getCopyFromRegs(V, V->getType()))
return copyFromReg;
- }
// Otherwise create a new SDValue and remember it.
SDValue Val = getValueImpl(V);
@@ -1206,7 +1271,7 @@ void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
// This will be used by the FuncletLayout pass to determine how to order the
// BB's.
// A 'catchret' returns to the outer scope's color.
- Value *ParentPad = I.getParentPad();
+ Value *ParentPad = I.getCatchSwitchParentPad();
const BasicBlock *SuccessorColor;
if (isa<ConstantTokenNone>(ParentPad))
SuccessorColor = &FuncInfo.Fn->getEntryBlock();
@@ -1314,6 +1379,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
SmallVector<ISD::OutputArg, 8> Outs;
SmallVector<SDValue, 8> OutVals;
+ // Calls to @llvm.experimental.deoptimize don't generate a return value, so
+ // lower
+ //
+ // %val = call <ty> @llvm.experimental.deoptimize()
+ // ret <ty> %val
+ //
+ // differently.
+ if (I.getParent()->getTerminatingDeoptimizeCall()) {
+ LowerDeoptimizingReturn();
+ return;
+ }
+
if (!FuncInfo.CanLowerReturn) {
unsigned DemoteReg = FuncInfo.DemoteRegister;
const Function *F = I.getParent()->getParent();
@@ -1346,11 +1423,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
DAG.getIntPtrConstant(Offsets[i],
getCurSDLoc()),
&Flags);
- Chains[i] =
- DAG.getStore(Chain, getCurSDLoc(),
- SDValue(RetOp.getNode(), RetOp.getResNo() + i),
- // FIXME: better loc info would be nice.
- Add, MachinePointerInfo(), false, false, 0);
+ Chains[i] = DAG.getStore(Chain, getCurSDLoc(),
+ SDValue(RetOp.getNode(), RetOp.getResNo() + i),
+ // FIXME: better loc info would be nice.
+ Add, MachinePointerInfo());
}
Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
@@ -1380,7 +1456,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
EVT VT = ValueVTs[j];
if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
- VT = TLI.getTypeForExtArgOrReturn(Context, VT, ExtendKind);
+ VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
unsigned NumParts = TLI.getNumRegisters(Context, VT);
MVT PartVT = TLI.getRegisterType(Context, VT);
@@ -1409,6 +1485,23 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
}
}
+ // Push in swifterror virtual register as the last element of Outs. This makes
+ // sure swifterror virtual register will be returned in the swifterror
+ // physical register.
+ const Function *F = I.getParent()->getParent();
+ if (TLI.supportSwiftError() &&
+ F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ Flags.setSwiftError();
+ Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
+ EVT(TLI.getPointerTy(DL)) /*argvt*/,
+ true /*isfixed*/, 1 /*origidx*/,
+ 0 /*partOffs*/));
+ // Create SDNode for the swifterror virtual register.
+ OutVals.push_back(DAG.getRegister(FuncInfo.SwiftErrorMap[FuncInfo.MBB][0],
+ EVT(TLI.getPointerTy(DL))));
+ }
+
bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
CallingConv::ID CallConv =
DAG.getMachineFunction().getFunction()->getCallingConv();
@@ -1906,6 +1999,27 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
DAG.setRoot(BrCond);
}
+/// Create a LOAD_STACK_GUARD node, and let it carry the target specific global
+/// variable if there exists one.
+static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue &Chain) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ Value *Global = TLI.getSDagStackGuard(*MF.getFunction()->getParent());
+ MachineSDNode *Node =
+ DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);
+ if (Global) {
+ MachinePointerInfo MPInfo(Global);
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, PtrTy.getSizeInBits() / 8,
+ DAG.getEVTAlignment(PtrTy));
+ Node->setMemRefs(MemRefs, MemRefs + 1);
+ }
+ return SDValue(Node, 0);
+}
+
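For orientation: both stack-protector paths later in this patch (the rewritten visitSPDescriptorParent and the new Intrinsic::stackguard case) funnel through this helper whenever TLI.useLoadStackGuardNode() returns true. The call shape at those sites is simply:

    // Usage as it appears further down in this patch; 'dl'/'sdl' and the chain
    // choice (getEntryNode() vs. getRoot()) come from the surrounding code.
    SDValue Chain = DAG.getEntryNode();
    SDValue Guard = getLoadStackGuard(DAG, dl, Chain);

The returned SDValue wraps the LOAD_STACK_GUARD machine node; when the target exposes a guard global, the attached MachineMemOperand marks the access as an invariant load.
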
/// Codegen a new tail for a stack protector check ParentMBB which has had its
/// tail spliced into a stack protector check success bb.
///
@@ -1922,32 +2036,59 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
int FI = MFI->getStackProtectorIndex();
- const Value *IRGuard = SPD.getGuard();
- SDValue GuardPtr = getValue(IRGuard);
+ SDValue Guard;
+ SDLoc dl = getCurSDLoc();
SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+ const Module &M = *ParentBB->getParent()->getFunction()->getParent();
+ unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext()));
- unsigned Align = DL->getPrefTypeAlignment(IRGuard->getType());
+ // Generate code to load the content of the guard slot.
+ SDValue StackSlot = DAG.getLoad(
+ PtrTy, dl, DAG.getEntryNode(), StackSlotPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
+ MachineMemOperand::MOVolatile);
+
+ // Retrieve guard check function, nullptr if instrumentation is inlined.
+ if (const Value *GuardCheck = TLI.getSSPStackGuardCheck(M)) {
+ // The target provides a guard check function to validate the guard value.
+ // Generate a call to that function with the content of the guard slot as
+ // argument.
+ auto *Fn = cast<Function>(GuardCheck);
+ FunctionType *FnTy = Fn->getFunctionType();
+ assert(FnTy->getNumParams() == 1 && "Invalid function signature");
- SDValue Guard;
- SDLoc dl = getCurSDLoc();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = StackSlot;
+ Entry.Ty = FnTy->getParamType(0);
+ if (Fn->hasAttribute(1, Attribute::AttrKind::InReg))
+ Entry.isInReg = true;
+ Args.push_back(Entry);
- // If GuardReg is set and useLoadStackGuardNode returns true, retrieve the
- // guard value from the virtual register holding the value. Otherwise, emit a
- // volatile load to retrieve the stack guard value.
- unsigned GuardReg = SPD.getGuardReg();
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(DAG.getEntryNode())
+ .setCallee(Fn->getCallingConv(), FnTy->getReturnType(),
+ getValue(GuardCheck), std::move(Args));
- if (GuardReg && TLI.useLoadStackGuardNode())
- Guard = DAG.getCopyFromReg(DAG.getEntryNode(), dl, GuardReg,
- PtrTy);
- else
- Guard = DAG.getLoad(PtrTy, dl, DAG.getEntryNode(),
- GuardPtr, MachinePointerInfo(IRGuard, 0),
- true, false, false, Align);
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ DAG.setRoot(Result.second);
+ return;
+ }
- SDValue StackSlot = DAG.getLoad(
- PtrTy, dl, DAG.getEntryNode(), StackSlotPtr,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), true,
- false, false, Align);
+ // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
+ // Otherwise, emit a volatile load to retrieve the stack guard value.
+ SDValue Chain = DAG.getEntryNode();
+ if (TLI.useLoadStackGuardNode()) {
+ Guard = getLoadStackGuard(DAG, dl, Chain);
+ } else {
+ const Value *IRGuard = TLI.getSDagStackGuard(M);
+ SDValue GuardPtr = getValue(IRGuard);
+
+ Guard =
+ DAG.getLoad(PtrTy, dl, Chain, GuardPtr, MachinePointerInfo(IRGuard, 0),
+ Align, MachineMemOperand::MOVolatile);
+ }
// Perform the comparison via a subtract/getsetcc.
EVT VT = Guard.getValueType();
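The rewritten block above now distinguishes three strategies: (1) if the target supplies a guard-check function via getSSPStackGuardCheck (for example, __security_check_cookie on MSVC-style targets), the freshly loaded guard slot is passed to that function and the comparison below is skipped entirely; (2) if useLoadStackGuardNode() is true, the reference guard value comes from the LOAD_STACK_GUARD pseudo built by getLoadStackGuard(); (3) otherwise it is a plain volatile load of the guard variable reported by getSDagStackGuard(). A minimal, self-contained sketch of what the inline paths (2) and (3) boil down to at runtime; the names are illustrative stand-ins, not the real runtime symbols:

    #include <cstdint>
    #include <cstdlib>

    // Stand-in for the __stack_chk_guard-style global the target reports.
    static volatile std::uintptr_t IllustrativeStackGuard = 0x243F6A88;

    // SavedSlotValue models the canary spilled to the frame at function entry.
    static void checkStackGuard(std::uintptr_t SavedSlotValue) {
      std::uintptr_t Guard = IllustrativeStackGuard; // reload the reference value
      if (Guard != SavedSlotValue)                   // the subtract/setcc emitted below
        std::abort();                                // stand-in for the failure block
    }

    int main() { checkStackGuard(IllustrativeStackGuard); } // matches: no abort
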
@@ -2115,6 +2256,12 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
const BasicBlock *EHPadBB = I.getSuccessor(1);
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
+ "Cannot lower invokes with arbitrary operand bundles yet!");
+
const Value *Callee(I.getCalledValue());
const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
@@ -2134,8 +2281,15 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
break;
}
- } else
+ } else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
+ // Currently we do not lower any intrinsic calls with deopt operand bundles.
+ // Eventually we will support lowering the @llvm.experimental.deoptimize
+ // intrinsic, and right now there are no plans to support other intrinsics
+ // with deopt state.
+ LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
+ } else {
LowerCallTo(&I, getValue(Callee), false, EHPadBB);
+ }
// If the value of the invoke is used outside of its defining block, make it
// available as a virtual register.
@@ -2309,6 +2463,129 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
visitBinary(I, ISD::FSUB);
}
+/// Checks if the given instruction performs a vector reduction, in which case
+/// we have the freedom to alter the elements in the result as long as the
+/// reduction of them stays unchanged.
+static bool isVectorReductionOp(const User *I) {
+ const Instruction *Inst = dyn_cast<Instruction>(I);
+ if (!Inst || !Inst->getType()->isVectorTy())
+ return false;
+
+ auto OpCode = Inst->getOpcode();
+ switch (OpCode) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+ if (FPOp->getFastMathFlags().unsafeAlgebra())
+ break;
+ // Fall through.
+ default:
+ return false;
+ }
+
+ unsigned ElemNum = Inst->getType()->getVectorNumElements();
+ unsigned ElemNumToReduce = ElemNum;
+
+ // Do DFS search on the def-use chain from the given instruction. We only
+ // allow four kinds of operations during the search until we reach the
+ // instruction that extracts the first element from the vector:
+ //
+ // 1. The reduction operation of the same opcode as the given instruction.
+ //
+ // 2. PHI node.
+ //
+ // 3. ShuffleVector instruction together with a reduction operation that
+ // does a partial reduction.
+ //
+ // 4. ExtractElement that extracts the first element from the vector, and we
+ // stop searching the def-use chain here.
+ //
+ // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+ // from 1-3 to the stack to continue the DFS. The given instruction is not
+  // a reduction operation if we meet any instructions other than those
+ // listed above.
+
+ SmallVector<const User *, 16> UsersToVisit{Inst};
+ SmallPtrSet<const User *, 16> Visited;
+ bool ReduxExtracted = false;
+
+ while (!UsersToVisit.empty()) {
+ auto User = UsersToVisit.back();
+ UsersToVisit.pop_back();
+ if (!Visited.insert(User).second)
+ continue;
+
+ for (const auto &U : User->users()) {
+ auto Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ return false;
+
+ if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+ if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+ return false;
+ UsersToVisit.push_back(U);
+ } else if (const ShuffleVectorInst *ShufInst =
+ dyn_cast<ShuffleVectorInst>(U)) {
+ // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that does a partial reduction on the first and second
+ // ElemNumToReduce / 2 elements, and store the result in
+ // ElemNumToReduce / 2 elements in another vector.
+
+ unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+ if (ResultElements < ElemNum)
+ return false;
+
+ if (ElemNumToReduce == 1)
+ return false;
+ if (!isa<UndefValue>(U->getOperand(1)))
+ return false;
+ for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+ if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+ return false;
+ for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+ if (ShufInst->getMaskValue(i) != -1)
+ return false;
+
+ // There is only one user of this ShuffleVector instruction, which
+ // must be a reduction operation.
+ if (!U->hasOneUse())
+ return false;
+
+ auto U2 = dyn_cast<Instruction>(*U->user_begin());
+ if (!U2 || U2->getOpcode() != OpCode)
+ return false;
+
+ // Check operands of the reduction operation.
+ if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+ (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+ UsersToVisit.push_back(U2);
+ ElemNumToReduce /= 2;
+ } else
+ return false;
+ } else if (isa<ExtractElementInst>(U)) {
+ // At this moment we should have reduced all elements in the vector.
+ if (ElemNumToReduce != 1)
+ return false;
+
+ const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!Val || Val->getZExtValue() != 0)
+ return false;
+
+ ReduxExtracted = true;
+ } else
+ return false;
+ }
+ }
+ return ReduxExtracted;
+}
+
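Concretely, the pattern this accepts is the classic log2 shuffle reduction: each shufflevector moves the upper half of the live lanes down, the paired binary operation folds the halves together, ElemNumToReduce halves at every step, and only lane 0 of the final value is ever read. A scalar analogue (illustration only; pairwiseReduce is a made-up name, not anything in this file) makes the invariant clear: the other lanes may hold anything, as long as the lane-0 reduction is preserved.

    #include <array>
    #include <cassert>

    // Scalar model of a <4 x i32> add reduction: two shuffle+add rounds,
    // then an "extractelement" of lane 0.
    static int pairwiseReduce(std::array<int, 4> V) {
      V[0] += V[2];   // round 1: lanes {2,3} shuffled onto {0,1} and added
      V[1] += V[3];
      V[0] += V[1];   // round 2: lane 1 shuffled onto lane 0 and added
      return V[0];    // extractelement index 0 ends the def-use walk
    }

    int main() {
      const std::array<int, 4> V = {1, 2, 3, 4};
      assert(pairwiseReduce(V) == 10);
    }
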
void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
@@ -2316,6 +2593,7 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
bool nuw = false;
bool nsw = false;
bool exact = false;
+ bool vec_redux = false;
FastMathFlags FMF;
if (const OverflowingBinaryOperator *OFBinOp =
@@ -2329,10 +2607,16 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
FMF = FPOp->getFastMathFlags();
+ if (isVectorReductionOp(&I)) {
+ vec_redux = true;
+ DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+ }
+
SDNodeFlags Flags;
Flags.setExact(exact);
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
+ Flags.setVectorReduction(vec_redux);
if (EnableFMFInDAG) {
Flags.setAllowReciprocal(FMF.allowReciprocal());
Flags.setNoInfs(FMF.noInfs());
@@ -2433,7 +2717,7 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
ISD::CondCode Condition = getFCmpCondCode(predicate);
-
+
// FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
// FIXME: We should propagate the fast-math-flags to the DAG node itself for
// further optimization, but currently FMF is only applicable to binary nodes.
@@ -2444,6 +2728,14 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
}
+// Check if the condition of the select has one use or two users that are both
+// selects with the same condition.
+static bool hasOnlySelectUsers(const Value *Cond) {
+ return std::all_of(Cond->user_begin(), Cond->user_end(), [](const Value *V) {
+ return isa<SelectInst>(V);
+ });
+}
+
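The relaxed condition matters for the min/max conversion just below: a single compare commonly feeds two selects, one picking the smaller value and one the larger. Converting both turns the pair into min/max nodes and leaves the compare with no remaining users, which is the profitability condition the old hasOneUse() check was trying to capture. A source-level illustration (clampPair is a made-up example, and the min/max mapping glosses over NaN ordering subtleties):

    #include <cassert>

    static void clampPair(float A, float B, float &Lo, float &Hi) {
      bool C = A < B;   // one fcmp...
      Lo = C ? A : B;   // ...used by select #1 -> becomes a min
      Hi = C ? B : A;   // ...used by select #2 -> becomes a max
    }

    int main() {
      float Lo, Hi;
      clampPair(3.0f, 1.0f, Lo, Hi);
      assert(Lo == 1.0f && Hi == 3.0f);
    }
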
void SelectionDAGBuilder::visitSelect(const User &I) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
@@ -2529,7 +2821,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
// If the underlying comparison instruction is used by any other
// instruction, the consumed instructions won't be destroyed, so it is
// not profitable to convert to a min/max.
- cast<SelectInst>(&I)->getCondition()->hasOneUse()) {
+ hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
OpCode = Opc;
LHSVal = getValue(LHS);
RHSVal = getValue(RHS);
@@ -2703,17 +2995,6 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) {
InVec, InIdx));
}
-// Utility for visitShuffleVector - Return true if every element in Mask,
-// beginning from position Pos and ending in Pos+Size, falls within the
-// specified sequential range [L, L+Pos). or is undef.
-static bool isSequentialInRange(const SmallVectorImpl<int> &Mask,
- unsigned Pos, unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
- if (Mask[i] >= 0 && Mask[i] != Low)
- return false;
- return true;
-}
-
void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDValue Src1 = getValue(I.getOperand(0));
SDValue Src2 = getValue(I.getOperand(1));
@@ -2728,8 +3009,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
unsigned SrcNumElts = SrcVT.getVectorNumElements();
if (SrcNumElts == MaskNumElts) {
- setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
- &Mask[0]));
+ setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2, Mask));
return;
}
@@ -2738,29 +3018,46 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// Mask is longer than the source vectors and is a multiple of the source
// vectors. We can use concatenate vector to make the mask and vectors
// lengths match.
- if (SrcNumElts*2 == MaskNumElts) {
- // First check for Src1 in low and Src2 in high
- if (isSequentialInRange(Mask, 0, SrcNumElts, 0) &&
- isSequentialInRange(Mask, SrcNumElts, SrcNumElts, SrcNumElts)) {
- // The shuffle is concatenating two vectors together.
- setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
- VT, Src1, Src2));
- return;
+
+ unsigned NumConcat = MaskNumElts / SrcNumElts;
+
+ // Check if the shuffle is some kind of concatenation of the input vectors.
+ bool IsConcat = true;
+ SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx < 0)
+ continue;
+ // Ensure the indices in each SrcVT sized piece are sequential and that
+ // the same source is used for the whole piece.
+ if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
+ (ConcatSrcs[i / SrcNumElts] >= 0 &&
+ ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
+ IsConcat = false;
+ break;
}
- // Then check for Src2 in low and Src1 in high
- if (isSequentialInRange(Mask, 0, SrcNumElts, SrcNumElts) &&
- isSequentialInRange(Mask, SrcNumElts, SrcNumElts, 0)) {
- // The shuffle is concatenating two vectors together.
- setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
- VT, Src2, Src1));
- return;
+ // Remember which source this index came from.
+ ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
+ }
+
+ // The shuffle is concatenating multiple vectors together. Just emit
+ // a CONCAT_VECTORS operation.
+ if (IsConcat) {
+ SmallVector<SDValue, 8> ConcatOps;
+ for (auto Src : ConcatSrcs) {
+ if (Src < 0)
+ ConcatOps.push_back(DAG.getUNDEF(SrcVT));
+ else if (Src == 0)
+ ConcatOps.push_back(Src1);
+ else
+ ConcatOps.push_back(Src2);
}
+ setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
+ VT, ConcatOps));
+ return;
}
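A worked example helps here. With two 2-element sources and the 8-element mask <0, 1, 2, 3, 0, 1, undef, undef>, each 2-lane piece of the mask reads sequentially from a single source, so ConcatSrcs becomes {0, 1, 0, -1} and the shuffle is emitted as CONCAT_VECTORS(Src1, Src2, Src1, undef). The stand-alone restatement below (detectConcat is an illustrative name, not part of this file) mirrors the loop above so the example can be checked directly:

    #include <cassert>
    #include <vector>

    // Returns true and fills ConcatSrcs (-1 = undef piece, 0 = first source,
    // 1 = second source) when Mask is a concatenation of whole source vectors.
    static bool detectConcat(const std::vector<int> &Mask, int SrcNumElts,
                             std::vector<int> &ConcatSrcs) {
      const int MaskNumElts = static_cast<int>(Mask.size());
      ConcatSrcs.assign(MaskNumElts / SrcNumElts, -1);
      for (int i = 0; i != MaskNumElts; ++i) {
        int Idx = Mask[i];
        if (Idx < 0)
          continue;                       // undef lane matches anything
        int Piece = i / SrcNumElts;
        // Lanes must be sequential within the piece and come from one source.
        if (Idx % SrcNumElts != i % SrcNumElts ||
            (ConcatSrcs[Piece] >= 0 && ConcatSrcs[Piece] != Idx / SrcNumElts))
          return false;
        ConcatSrcs[Piece] = Idx / SrcNumElts;
      }
      return true;
    }

    int main() {
      const std::vector<int> Mask = {0, 1, 2, 3, 0, 1, -1, -1};
      const std::vector<int> Expected = {0, 1, 0, -1};
      std::vector<int> ConcatSrcs;
      assert(detectConcat(Mask, 2, ConcatSrcs));
      assert(ConcatSrcs == Expected);
    }
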
// Pad both vectors with undefs to make them the same length as the mask.
- unsigned NumConcat = MaskNumElts / SrcNumElts;
- bool Src1U = Src1.getOpcode() == ISD::UNDEF;
- bool Src2U = Src2.getOpcode() == ISD::UNDEF;
SDValue UndefVal = DAG.getUNDEF(SrcVT);
SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
@@ -2768,10 +3065,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
MOps1[0] = Src1;
MOps2[0] = Src2;
- Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
- getCurSDLoc(), VT, MOps1);
- Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
- getCurSDLoc(), VT, MOps2);
+ Src1 = Src1.isUndef() ? DAG.getUNDEF(VT)
+ : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurSDLoc(), VT, MOps1);
+ Src2 = Src2.isUndef() ? DAG.getUNDEF(VT)
+ : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurSDLoc(), VT, MOps2);
// Readjust mask for new input vector length.
SmallVector<int, 8> MappedOps;
@@ -2783,7 +3082,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
}
setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
- &MappedOps[0]));
+ MappedOps));
return;
}
@@ -2864,7 +3163,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
}
setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
- &MappedOps[0]));
+ MappedOps));
return;
}
}
@@ -2982,8 +3281,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
Value *Op0 = I.getOperand(0);
// Note that the pointer operand may be a vector of pointers. Take the scalar
// element which holds a pointer.
- Type *Ty = Op0->getType()->getScalarType();
- unsigned AS = Ty->getPointerAddressSpace();
+ unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
SDValue N = getValue(Op0);
SDLoc dl = getCurSDLoc();
@@ -2993,14 +3291,15 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
cast<VectorType>(I.getType())->getVectorNumElements() : 0;
if (VectorWidth && !N.getValueType().isVector()) {
- MVT VT = MVT::getVectorVT(N.getValueType().getSimpleVT(), VectorWidth);
+ LLVMContext &Context = *DAG.getContext();
+ EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
SmallVector<SDValue, 16> Ops(VectorWidth, N);
N = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
- for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
- OI != E; ++OI) {
- const Value *Idx = *OI;
- if (StructType *StTy = dyn_cast<StructType>(Ty)) {
+ for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (StructType *StTy = dyn_cast<StructType>(*GTI)) {
unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
if (Field) {
// N = N + Offset
@@ -3015,14 +3314,11 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
DAG.getConstant(Offset, dl, N.getValueType()), &Flags);
}
-
- Ty = StTy->getElementType(Field);
} else {
- Ty = cast<SequentialType>(Ty)->getElementType();
MVT PtrTy =
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout(), AS);
unsigned PtrSize = PtrTy.getSizeInBits();
- APInt ElementSize(PtrSize, DL->getTypeAllocSize(Ty));
+ APInt ElementSize(PtrSize, DL->getTypeAllocSize(GTI.getIndexedType()));
// If this is a scalar constant or a splat vector of constants,
// handle it quickly.
@@ -3055,7 +3351,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (!IdxN.getValueType().isVector() && VectorWidth) {
MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth);
SmallVector<SDValue, 16> Ops(VectorWidth, IdxN);
- IdxN = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ IdxN = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
// If the index is smaller or larger than intptr_t, truncate or extend
// it.
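The loop above now walks the indices with gep_type_iterator: a struct index contributes a constant byte offset taken from the struct layout, while an array/vector/pointer index contributes Idx scaled by the allocation size of the indexed type. A worked example with a hypothetical struct (offsets assume a typical 64-bit DataLayout; the exact numbers depend on the target):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct S {            // hypothetical %struct.S
      std::int32_t a;
      double b;
      float c;
    };

    int main() {
      //   getelementptr %struct.S, %struct.S* %base, i64 1, i32 2
      // lowers to  base + 1*sizeof(S) + offsetof(S, c):
      // one sequential step scaled by the element size, then one constant
      // struct-field offset, i.e. the two branches of the loop above.
      std::size_t Offset = 1 * sizeof(S) + offsetof(S, c);
      std::printf("byte offset = %zu\n", Offset); // 24 + 16 = 40 on common ABIs
    }
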
@@ -3144,7 +3440,22 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitLoadFromSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return visitLoadFromSwiftError(I);
+ }
+ }
+
SDValue Ptr = getValue(SV);
Type *Ty = I.getType();
@@ -3168,7 +3479,6 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &Offsets);
@@ -3223,10 +3533,17 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT),
&Flags);
- SDValue L = DAG.getLoad(ValueVTs[i], dl, Root,
- A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
- isNonTemporal, isInvariant, Alignment, AAInfo,
- Ranges);
+ auto MMOFlags = MachineMemOperand::MONone;
+ if (isVolatile)
+ MMOFlags |= MachineMemOperand::MOVolatile;
+ if (isNonTemporal)
+ MMOFlags |= MachineMemOperand::MONonTemporal;
+ if (isInvariant)
+ MMOFlags |= MachineMemOperand::MOInvariant;
+
+ SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A,
+ MachinePointerInfo(SV, Offsets[i]), Alignment,
+ MMOFlags, AAInfo, Ranges);
Values[i] = L;
Chains[ChainI] = L.getValue(1);
@@ -3245,6 +3562,64 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
DAG.getVTList(ValueVTs), Values));
}
+void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(TLI.supportSwiftError() &&
+ "call visitStoreToSwiftError when backend supports swifterror");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ const Value *SrcV = I.getOperand(0);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &Offsets);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ SDValue Src = getValue(SrcV);
+ // Create a virtual register, then update the virtual register.
+ auto &DL = DAG.getDataLayout();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
+ // Chain can be getRoot or getControlRoot.
+ SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
+ SDValue(Src.getNode(), Src.getResNo()));
+ DAG.setRoot(CopyNode);
+ FuncInfo.setSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg);
+}
+
+void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
+ assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
+ "call visitLoadFromSwiftError when backend supports swifterror");
+
+ assert(!I.isVolatile() &&
+ I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
+ I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
+ "Support volatile, non temporal, invariant for load_from_swift_error");
+
+ const Value *SV = I.getOperand(0);
+ Type *Ty = I.getType();
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+ assert(!AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ "load_from_swift_error should not be constant memory");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
+ ValueVTs, &Offsets);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
+ SDValue L = DAG.getCopyFromReg(getRoot(), getCurSDLoc(),
+ FuncInfo.findSwiftErrorVReg(FuncInfo.MBB, SV),
+ ValueVTs[0]);
+
+ setValue(&I, L);
+}
+
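These two helpers are the heart of the swifterror lowering: a store to a swifterror alloca or argument never touches memory, it copies the value into a fresh virtual register and records it for the current block, and a load simply reads back the register recorded for that (block, value) pair. That keeps the error value in a register (ultimately a dedicated physical register chosen by the target) across calls. A toy model of the bookkeeping, purely for illustration; the real interface is the FuncInfo.setSwiftErrorVReg/findSwiftErrorVReg calls used above:

    #include <cassert>
    #include <map>
    #include <utility>

    using BlockId = int;    // stands in for MachineBasicBlock*
    using ValueId = int;    // stands in for the swifterror Value*
    using VReg    = unsigned;

    struct SwiftErrorVRegs {
      std::map<std::pair<BlockId, ValueId>, VReg> Map;
      void set(BlockId BB, ValueId V, VReg R) { Map[{BB, V}] = R; }      // "store"
      VReg find(BlockId BB, ValueId V) const { return Map.at({BB, V}); } // "load"
    };

    int main() {
      SwiftErrorVRegs Info;
      Info.set(/*BB=*/0, /*V=*/1, /*R=*/1001); // visitStoreToSwiftError records a new vreg
      assert(Info.find(0, 1) == 1001);         // visitLoadFromSwiftError reuses it
    }
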
void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
@@ -3252,6 +3627,21 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitStoreToSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return visitStoreToSwiftError(I);
+ }
+ }
+
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
@@ -3268,15 +3658,18 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SDValue Root = getRoot();
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
+ SDLoc dl = getCurSDLoc();
EVT PtrVT = Ptr.getValueType();
- bool isVolatile = I.isVolatile();
- bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
unsigned Alignment = I.getAlignment();
- SDLoc dl = getCurSDLoc();
-
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
+ auto MMOFlags = MachineMemOperand::MONone;
+ if (I.isVolatile())
+ MMOFlags |= MachineMemOperand::MOVolatile;
+ if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
+ MMOFlags |= MachineMemOperand::MONonTemporal;
+
// An aggregate load cannot wrap around the address space, so offsets to its
// parts don't wrap either.
SDNodeFlags Flags;
@@ -3293,10 +3686,9 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
}
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT), &Flags);
- SDValue St = DAG.getStore(Root, dl,
- SDValue(Src.getNode(), Src.getResNo() + i),
- Add, MachinePointerInfo(PtrV, Offsets[i]),
- isVolatile, isNonTemporal, Alignment, AAInfo);
+ SDValue St = DAG.getStore(
+ Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add,
+ MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo);
Chains[ChainI] = St;
}
@@ -3447,13 +3839,10 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
- SDValue InChain = DAG.getRoot();
- if (AA->pointsToConstantMemory(MemoryLocation(
- PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()),
- AAInfo))) {
- // Do not serialize (non-volatile) loads of constant memory with anything.
- InChain = DAG.getEntryNode();
- }
+ // Do not serialize masked loads of constant memory with anything.
+ bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
MachineMemOperand *MMO =
DAG.getMachineFunction().
@@ -3463,8 +3852,10 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
ISD::NON_EXTLOAD);
- SDValue OutChain = Load.getValue(1);
- DAG.setRoot(OutChain);
+ if (AddToChain) {
+ SDValue OutChain = Load.getValue(1);
+ DAG.setRoot(OutChain);
+ }
setValue(&I, Load);
}
@@ -3585,7 +3976,7 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Ops[3];
Ops[0] = getRoot();
- Ops[1] = DAG.getConstant(I.getOrdering(), dl,
+ Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
Ops[2] = DAG.getConstant(I.getSynchScope(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
@@ -3724,7 +4115,8 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
- }
+ } else
+ Result = lowerRangeToAssertZExt(DAG, I, Result);
setValue(&I, Result);
}
@@ -3736,8 +4128,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
/// Op = (Op & 0x007fffff) | 0x3f800000;
///
/// where Op is the hexadecimal representation of floating point value.
-static SDValue
-GetSignificand(SelectionDAG &DAG, SDValue Op, SDLoc dl) {
+static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
DAG.getConstant(0x007fffff, dl, MVT::i32));
SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
@@ -3750,9 +4141,8 @@ GetSignificand(SelectionDAG &DAG, SDValue Op, SDLoc dl) {
/// (float)(int)(((Op & 0x7f800000) >> 23) - 127);
///
/// where Op is the hexadecimal representation of floating point value.
-static SDValue
-GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
- SDLoc dl) {
+static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
+ const TargetLowering &TLI, const SDLoc &dl) {
SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
DAG.getConstant(0x7f800000, dl, MVT::i32));
SDValue t1 = DAG.getNode(
@@ -3764,13 +4154,13 @@ GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
}
/// getF32Constant - Get 32-bit floating point constant.
-static SDValue
-getF32Constant(SelectionDAG &DAG, unsigned Flt, SDLoc dl) {
+static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
+ const SDLoc &dl) {
return DAG.getConstantFP(APFloat(APFloat::IEEEsingle, APInt(32, Flt)), dl,
MVT::f32);
}
-static SDValue getLimitedPrecisionExp2(SDValue t0, SDLoc dl,
+static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
SelectionDAG &DAG) {
// TODO: What fast-math-flags should be set on the floating-point nodes?
@@ -3862,7 +4252,7 @@ static SDValue getLimitedPrecisionExp2(SDValue t0, SDLoc dl,
/// expandExp - Lower an exp intrinsic. Handles the special sequences for
/// limited-precision mode.
-static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
if (Op.getValueType() == MVT::f32 &&
LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -3885,9 +4275,9 @@ static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG,
/// expandLog - Lower a log intrinsic. Handles the special sequences for
/// limited-precision mode.
-static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
-
+
// TODO: What fast-math-flags should be set on the floating-point nodes?
if (Op.getValueType() == MVT::f32 &&
@@ -3984,9 +4374,9 @@ static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG,
/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
/// limited-precision mode.
-static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
-
+
// TODO: What fast-math-flags should be set on the floating-point nodes?
if (Op.getValueType() == MVT::f32 &&
@@ -4082,7 +4472,7 @@ static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
/// limited-precision mode.
-static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
// TODO: What fast-math-flags should be set on the floating-point nodes?
@@ -4173,7 +4563,7 @@ static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG,
/// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
/// limited-precision mode.
-static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
if (Op.getValueType() == MVT::f32 &&
LimitFloatPrecision > 0 && LimitFloatPrecision <= 18)
@@ -4185,7 +4575,7 @@ static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
/// visitPow - Lower a pow intrinsic. Handles the special sequences for
/// limited-precision mode with x == 10.0f.
-static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS,
+static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const TargetLowering &TLI) {
bool IsExp10 = false;
if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
@@ -4214,7 +4604,7 @@ static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS,
/// ExpandPowI - Expand a llvm.powi intrinsic.
-static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS,
+static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
SelectionDAG &DAG) {
// If RHS is a constant, we can expand this out to a multiplication tree,
// otherwise we end up lowering to a call to __powidf2 (for example). When
@@ -4798,7 +5188,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
- case Intrinsic::round: {
+ case Intrinsic::round:
+ case Intrinsic::canonicalize: {
unsigned Opcode;
switch (Intrinsic) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -4812,6 +5203,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
}
setValue(&I, DAG.getNode(Opcode, sdl,
@@ -4819,18 +5211,28 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(0))));
return nullptr;
}
- case Intrinsic::minnum:
- setValue(&I, DAG.getNode(ISD::FMINNUM, sdl,
- getValue(I.getArgOperand(0)).getValueType(),
+ case Intrinsic::minnum: {
+ auto VT = getValue(I.getArgOperand(0)).getValueType();
+ unsigned Opc =
+ I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)
+ ? ISD::FMINNAN
+ : ISD::FMINNUM;
+ setValue(&I, DAG.getNode(Opc, sdl, VT,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return nullptr;
- case Intrinsic::maxnum:
- setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl,
- getValue(I.getArgOperand(0)).getValueType(),
+ }
+ case Intrinsic::maxnum: {
+ auto VT = getValue(I.getArgOperand(0)).getValueType();
+ unsigned Opc =
+ I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)
+ ? ISD::FMAXNAN
+ : ISD::FMAXNUM;
+ setValue(&I, DAG.getNode(Opc, sdl, VT,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return nullptr;
+ }
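The switch to FMINNAN/FMAXNAN is legal precisely because the call is marked nnan: IEEE-style minnum/maxnum return the numeric operand when the other one is a quiet NaN, while the NAN-flavoured nodes may propagate the NaN instead, and the two only disagree when a NaN is present. A small self-contained contrast (selectMin is an illustrative compare-and-select min, not the exact semantics any particular target gives FMINNAN):

    #include <cassert>
    #include <cmath>

    static double selectMin(double A, double B) { return A < B ? A : B; }

    int main() {
      double QNaN = std::nan("");
      assert(std::fmin(1.0, QNaN) == 1.0);        // IEEE minNum: NaN operand ignored
      assert(std::isnan(selectMin(1.0, QNaN)));   // compare-select: NaN leaks through
      assert(std::fmin(2.0, 1.0) == selectMin(2.0, 1.0)); // no NaNs: identical
    }
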
case Intrinsic::copysign:
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
getValue(I.getArgOperand(0)).getValueType(),
@@ -4954,47 +5356,35 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, Res);
return nullptr;
}
+ case Intrinsic::stackguard: {
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Module &M = *MF.getFunction()->getParent();
+ SDValue Chain = getRoot();
+ if (TLI.useLoadStackGuardNode()) {
+ Res = getLoadStackGuard(DAG, sdl, Chain);
+ } else {
+ const Value *Global = TLI.getSDagStackGuard(M);
+ unsigned Align = DL->getPrefTypeAlignment(Global->getType());
+ Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
+ MachinePointerInfo(Global, 0), Align,
+ MachineMemOperand::MOVolatile);
+ }
+ DAG.setRoot(Chain);
+ setValue(&I, Res);
+ return nullptr;
+ }
case Intrinsic::stackprotector: {
// Emit code into the DAG to store the stack guard onto the stack.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Src, Chain = getRoot();
- const Value *Ptr = cast<LoadInst>(I.getArgOperand(0))->getPointerOperand();
- const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
-
- // See if Ptr is a bitcast. If it is, look through it and see if we can get
- // global variable __stack_chk_guard.
- if (!GV)
- if (const Operator *BC = dyn_cast<Operator>(Ptr))
- if (BC->getOpcode() == Instruction::BitCast)
- GV = dyn_cast<GlobalVariable>(BC->getOperand(0));
-
- if (GV && TLI.useLoadStackGuardNode()) {
- // Emit a LOAD_STACK_GUARD node.
- MachineSDNode *Node = DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD,
- sdl, PtrTy, Chain);
- MachinePointerInfo MPInfo(GV);
- MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
- unsigned Flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOInvariant;
- *MemRefs = MF.getMachineMemOperand(MPInfo, Flags,
- PtrTy.getSizeInBits() / 8,
- DAG.getEVTAlignment(PtrTy));
- Node->setMemRefs(MemRefs, MemRefs + 1);
-
- // Copy the guard value to a virtual register so that it can be
- // retrieved in the epilogue.
- Src = SDValue(Node, 0);
- const TargetRegisterClass *RC =
- TLI.getRegClassFor(Src.getSimpleValueType());
- unsigned Reg = MF.getRegInfo().createVirtualRegister(RC);
-
- SPDescriptor.setGuardReg(Reg);
- Chain = DAG.getCopyToReg(Chain, sdl, Reg, Src);
- } else {
+
+ if (TLI.useLoadStackGuardNode())
+ Src = getLoadStackGuard(DAG, sdl, Chain);
+ else
Src = getValue(I.getArgOperand(0)); // The guard's value.
- }
AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
@@ -5006,7 +5396,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// Store the stack protector onto the stack.
Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FI),
- true, false, 0);
+ /* Alignment = */ 0, MachineMemOperand::MOVolatile);
setValue(&I, Res);
DAG.setRoot(Res);
return nullptr;
@@ -5060,15 +5450,20 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(0))));
return nullptr;
}
- case Intrinsic::gcroot:
- if (GFI) {
- const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
- const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
-
- FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
- GFI->addStackRoot(FI->getIndex(), TypeMap);
- }
+ case Intrinsic::gcroot: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ (void)F;
+ assert(F->hasGC() &&
+ "only valid in functions with gc specified, enforced by Verifier");
+ assert(GFI && "implied by previous");
+ const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
+ const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
+
+ FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
+ GFI->addStackRoot(FI->getIndex(), TypeMap);
return nullptr;
+ }
case Intrinsic::gcread:
case Intrinsic::gcwrite:
llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
@@ -5101,7 +5496,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
CallingConv::C, I.getType(),
DAG.getExternalSymbol(TrapFuncName.data(),
TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
DAG.setRoot(Result.second);
@@ -5193,18 +5588,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::invariant_end:
// Discard region information.
return nullptr;
- case Intrinsic::stackprotectorcheck: {
- // Do not actually emit anything for this basic block. Instead we initialize
- // the stack protector descriptor and export the guard variable so we can
- // access it in FinishBasicBlock.
- const BasicBlock *BB = I.getParent();
- SPDescriptor.initialize(BB, FuncInfo.MBBMap[BB], I);
- ExportFromCurrentBlock(SPDescriptor.getGuard());
-
- // Flush our exports since we are going to process a terminator.
- (void)getControlRoot();
- return nullptr;
- }
case Intrinsic::clear_cache:
return TLI.getClearCacheBuiltinName();
case Intrinsic::donothing:
@@ -5220,11 +5603,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
case Intrinsic::experimental_gc_statepoint: {
- visitStatepoint(I);
+ LowerStatepoint(ImmutableStatepoint(&I));
return nullptr;
}
case Intrinsic::experimental_gc_result: {
- visitGCResult(I);
+ visitGCResult(cast<GCResultInst>(I));
return nullptr;
}
case Intrinsic::experimental_gc_relocate: {
@@ -5303,6 +5686,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, N);
return nullptr;
}
+
+ case Intrinsic::experimental_deoptimize:
+ LowerDeoptimizeCall(&I);
+ return nullptr;
}
}
@@ -5378,14 +5765,16 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
bool isTailCall,
const BasicBlock *EHPadBB) {
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- Type *RetTy = FTy->getReturnType();
+ auto &DL = DAG.getDataLayout();
+ FunctionType *FTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Args.reserve(CS.arg_size());
+ const Value *SwiftErrorVal = nullptr;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
const Value *V = *i;
@@ -5399,6 +5788,17 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// Skip the first return-type Attribute to get to params.
Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+
+ // Use swifterror virtual register as input to the call.
+ if (Entry.isSwiftError && TLI.supportSwiftError()) {
+ SwiftErrorVal = V;
+      // Find the virtual register for the actual swifterror argument and use
+      // it, rather than the Value itself, as the call operand.
+ Entry.Node = DAG.getRegister(
+ FuncInfo.findSwiftErrorVReg(FuncInfo.MBB, V),
+ EVT(TLI.getPointerTy(DL)));
+ }
+
Args.push_back(Entry);
// If we have an explicit sret argument that is an Instruction, (i.e., it
@@ -5413,13 +5813,32 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
isTailCall = false;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
- .setTailCall(isTailCall);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
+ .setTailCall(isTailCall)
+ .setConvergent(CS.isConvergent());
std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
- if (Result.first.getNode())
- setValue(CS.getInstruction(), Result.first);
+ if (Result.first.getNode()) {
+ const Instruction *Inst = CS.getInstruction();
+ Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first);
+ setValue(Inst, Result.first);
+ }
+
+ // The last element of CLI.InVals has the SDValue for swifterror return.
+ // Here we copy it to a virtual register and update SwiftErrorMap for
+ // book-keeping.
+ if (SwiftErrorVal && TLI.supportSwiftError()) {
+ // Get the last element of InVals.
+ SDValue Src = CLI.InVals.back();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
+ // We update the virtual register for the actual swifterror argument.
+ FuncInfo.setSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg);
+ DAG.setRoot(CopyNode);
+ }
}
/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
@@ -5449,7 +5868,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
PointerType::getUnqual(LoadTy));
if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(
- const_cast<Constant *>(LoadInput), *Builder.DL))
+ const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL))
return Builder.getValue(LoadCst);
}
@@ -5470,9 +5889,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
SDValue Ptr = Builder.getValue(PtrVal);
SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
Ptr, MachinePointerInfo(PtrVal),
- false /*volatile*/,
- false /*nontemporal*/,
- false /*isinvariant*/, 1 /* align=1 */);
+ /* Alignment = */ 1);
if (!ConstantMemory)
Builder.PendingLoads.push_back(LoadVal.getValue(1));
@@ -5516,7 +5933,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
return true;
}
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(LHS), getValue(RHS), getValue(Size),
@@ -5613,7 +6030,7 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
!I.getType()->isPointerTy())
return false;
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Src), getValue(Char), getValue(Length),
@@ -5641,7 +6058,7 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
!I.getType()->isPointerTy())
return false;
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
getValue(Arg0), getValue(Arg1),
@@ -5670,7 +6087,7 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
!I.getType()->isIntegerTy())
return false;
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), getValue(Arg1),
@@ -5697,7 +6114,7 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
return false;
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), MachinePointerInfo(Arg0));
@@ -5724,7 +6141,7 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
!I.getType()->isIntegerTy())
return false;
- const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), getValue(Arg1),
@@ -5803,9 +6220,10 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
}
// Check for well-known libc/libm calls. If the function is internal, it
- // can't be a library call.
+ // can't be a library call. Don't do the check if marked as nobuiltin for
+ // some reason.
LibFunc::Func Func;
- if (!F->hasLocalLinkage() && F->hasName() &&
+ if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
LibInfo->getLibFunc(F->getName(), Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
switch (Func) {
@@ -5952,9 +6370,19 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
RenameFn,
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
- // Check if we can potentially perform a tail call. More detailed checking is
- // be done within LowerCallTo, after more information about the call is known.
- LowerCallTo(&I, Callee, I.isTailCall());
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
+ "Cannot lower calls with arbitrary operand bundles!");
+
+ if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+ LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
+ else
+ // Check if we can potentially perform a tail call. More detailed checking
+    // will be done within LowerCallTo, after more information about the call is
+ // known.
+ LowerCallTo(&I, Callee, I.isTailCall());
}
namespace {
@@ -6036,9 +6464,8 @@ typedef SmallVector<SDISelAsmOperandInfo,16> SDISelAsmOperandInfoVector;
///
/// OpInfo describes the operand.
///
-static void GetRegistersForValue(SelectionDAG &DAG,
- const TargetLowering &TLI,
- SDLoc DL,
+static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
+ const SDLoc &DL,
SDISelAsmOperandInfo &OpInfo) {
LLVMContext &Context = *DAG.getContext();
@@ -6301,8 +6728,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
DAG.getFrameIndex(SSFI, TLI.getPointerTy(DAG.getDataLayout()));
Chain = DAG.getStore(
Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
OpInfo.CallOperand = StackSlot;
}
@@ -6349,6 +6775,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
ExtraInfo |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ if (CS.isConvergent())
+ ExtraInfo |= InlineAsm::Extra_IsConvergent;
// Set the asm dialect.
ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
@@ -6413,10 +6841,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Copy the output from the appropriate register. Find a register that
// we can use.
if (OpInfo.AssignedRegs.Regs.empty()) {
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(),
- "couldn't allocate output register for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(
+ CS, "couldn't allocate output register for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
@@ -6469,10 +6896,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
if (OpInfo.isIndirect) {
// This happens on gcc/testsuite/gcc.dg/pr8788-1.c
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(), "inline asm not supported yet:"
- " don't know how to handle tied "
- "indirect register inputs");
+ emitInlineAsmError(CS, "inline asm not supported yet:"
+ " don't know how to handle tied "
+ "indirect register inputs");
return;
}
@@ -6486,10 +6912,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT))
MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC));
else {
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(),
- "inline asm error: This value"
- " type register class is not natively supported!");
+ emitInlineAsmError(
+ CS, "inline asm error: This value"
+ " type register class is not natively supported!");
return;
}
}
@@ -6527,10 +6952,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
Ops, DAG);
if (Ops.empty()) {
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(),
- "invalid operand for inline asm constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
@@ -6570,20 +6993,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// TODO: Support this.
if (OpInfo.isIndirect) {
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(),
- "Don't know how to handle indirect register inputs yet "
- "for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(
+ CS, "Don't know how to handle indirect register inputs yet "
+ "for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
// Copy the input into the appropriate registers.
if (OpInfo.AssignedRegs.Regs.empty()) {
- LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(),
- "couldn't allocate input reg for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
@@ -6667,11 +7087,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Emit the non-flagged stores from the physregs.
SmallVector<SDValue, 8> OutChains;
for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) {
- SDValue Val = DAG.getStore(Chain, getCurSDLoc(),
- StoresToEmit[i].first,
+ SDValue Val = DAG.getStore(Chain, getCurSDLoc(), StoresToEmit[i].first,
getValue(StoresToEmit[i].second),
- MachinePointerInfo(StoresToEmit[i].second),
- false, false, 0);
+ MachinePointerInfo(StoresToEmit[i].second));
OutChains.push_back(Val);
}
@@ -6681,6 +7099,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
DAG.setRoot(Chain);
}
+void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
+ const Twine &Message) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.emitError(CS.getInstruction(), Message);
+
+ // Make sure we leave the DAG in a valid state
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto VT = TLI.getValueType(DAG.getDataLayout(), CS.getType());
+ setValue(CS.getInstruction(), DAG.getUNDEF(VT));
+}
+
void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
MVT::Other, getRoot(),
@@ -6715,16 +7144,49 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
DAG.getSrcValue(I.getArgOperand(1))));
}
-/// \brief Lower an argument list according to the target calling convention.
-///
-/// \return A tuple of <return-value, token-chain>
+SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
+ const Instruction &I,
+ SDValue Op) {
+ const MDNode *Range = I.getMetadata(LLVMContext::MD_range);
+ if (!Range)
+ return Op;
+
+ Constant *Lo = cast<ConstantAsMetadata>(Range->getOperand(0))->getValue();
+ if (!Lo->isNullValue())
+ return Op;
+
+ Constant *Hi = cast<ConstantAsMetadata>(Range->getOperand(1))->getValue();
+ unsigned Bits = cast<ConstantInt>(Hi)->getValue().logBase2();
+
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+
+ SDLoc SL = getCurSDLoc();
+
+ SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(),
+ Op, DAG.getValueType(SmallVT));
+ unsigned NumVals = Op.getNode()->getNumValues();
+ if (NumVals == 1)
+ return ZExt;
+
+ SmallVector<SDValue, 4> Ops;
+
+ Ops.push_back(ZExt);
+ for (unsigned I = 1; I != NumVals; ++I)
+ Ops.push_back(Op.getValue(I));
+
+ return DAG.getMergeValues(Ops, SL);
+}
+
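A concrete instance of the transformation above, with illustrative numbers: a call annotated with !range !{i64 0, i64 256} has Lo == 0 and Hi == 256, logBase2(256) is 8, so the result is wrapped in an AssertZext with an i8 value type, telling later combines that only the low 8 bits can be non-zero. The helper bails out for ranges that do not start at 0, and both visitTargetIntrinsic above and LowerCallTo below now feed call results through it. A tiny stand-alone check of the bit-width computation (logBase2 here is a plain reimplementation for the example, mirroring APInt::logBase2 for powers of two):

    #include <cassert>
    #include <cstdint>

    static unsigned logBase2(std::uint64_t X) { // index of the highest set bit
      unsigned Bits = 0;
      while (X >>= 1)
        ++Bits;
      return Bits;
    }

    int main() {
      assert(logBase2(256) == 8); // [0, 256) -> AssertZext to i8
    }
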
+/// \brief Populate a CallLoweringInfo (into \p CLI) based on the properties of
+/// the call being lowered.
///
/// This is a helper for lowering intrinsics that follow a target calling
/// convention or require stack pointer adjustment. Only a subset of the
/// intrinsic's operands need to participate in the calling convention.
-std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerCallOperands(
- ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee,
- Type *ReturnTy, const BasicBlock *EHPadBB, bool IsPatchPoint) {
+void SelectionDAGBuilder::populateCallLoweringInfo(
+ TargetLowering::CallLoweringInfo &CLI, ImmutableCallSite CS,
+ unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
+ bool IsPatchPoint) {
TargetLowering::ArgListTy Args;
Args.reserve(NumArgs);
@@ -6743,12 +7205,11 @@ std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerCallOperands(
Args.push_back(Entry);
}
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args), NumArgs)
- .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint);
-
- return lowerInvokable(CLI, EHPadBB);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args))
+ .setDiscardResult(CS->use_empty())
+ .setIsPatchPoint(IsPatchPoint);
}
/// \brief Add a stack map intrinsic call's live variable operands to a stackmap
@@ -6769,7 +7230,7 @@ std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerCallOperands(
/// only available in a register, then the runtime would need to trap when
/// execution reaches the StackMap in order to read the alloca's location.
static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
- SDLoc DL, SmallVectorImpl<SDValue> &Ops,
+ const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
SelectionDAGBuilder &Builder) {
for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) {
SDValue OpVal = Builder.getValue(CS.getArgument(i));
@@ -6889,8 +7350,11 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
Type *ReturnTy =
IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();
- std::pair<SDValue, SDValue> Result = lowerCallOperands(
- CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, EHPadBB, true);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ populateCallLoweringInfo(CLI, CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy,
+ true);
+ std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
SDNode *CallEnd = Result.second.getNode();
if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
@@ -7057,6 +7521,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Entry.isNest = false;
Entry.isByVal = false;
Entry.isReturned = false;
+ Entry.isSwiftSelf = false;
+ Entry.isSwiftError = false;
Entry.Alignment = Align;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
@@ -7085,10 +7551,23 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
}
}
+ // We push in swifterror return as the last element of CLI.Ins.
+ ArgListTy &Args = CLI.getArgs();
+ if (supportSwiftError()) {
+ for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+ if (Args[i].isSwiftError) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = getPointerTy(DL);
+ MyFlags.ArgVT = EVT(getPointerTy(DL));
+ MyFlags.Flags.setSwiftError();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+ }
+
// Handle all of the outgoing arguments.
CLI.Outs.clear();
CLI.OutVals.clear();
- ArgListTy &Args = CLI.getArgs();
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
@@ -7114,6 +7593,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setInReg();
if (Args[i].isSRet)
Flags.setSRet();
+ if (Args[i].isSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Args[i].isSwiftError)
+ Flags.setSwiftError();
if (Args[i].isByVal)
Flags.setByVal();
if (Args[i].isInAlloca) {
@@ -7202,6 +7685,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SmallVector<SDValue, 4> InVals;
CLI.Chain = LowerCall(CLI, InVals);
+ // Update CLI.InVals to use outside of this function.
+ CLI.InVals = InVals;
+
// Verify that the target's LowerCall behaved as expected.
assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
"LowerCall didn't return a valid chain!");
@@ -7219,12 +7705,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
return std::make_pair(SDValue(), SDValue());
}
- DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
- assert(InVals[i].getNode() &&
- "LowerCall emitted a null value!");
- assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
- "LowerCall emitted a value with the wrong type!");
- });
+#ifndef NDEBUG
+ for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() && "LowerCall emitted a null value!");
+ assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerCall emitted a value with the wrong type!");
+ }
+#endif
SmallVector<SDValue, 4> ReturnValues;
if (!CanLowerReturn) {
@@ -7254,7 +7741,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
RetTys[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
DemoteStackIdx, Offsets[i]),
- false, false, false, 1);
+ /* Alignment = */ 1);
ReturnValues[i] = L;
Chains[i] = L.getValue(1);
}
@@ -7263,7 +7750,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
} else {
// Collect the legal value parts into potentially illegal values
// that correspond to the original function's return values.
- ISD::NodeType AssertOp = ISD::DELETED_NODE;
+ Optional<ISD::NodeType> AssertOp;
if (CLI.RetSExt)
AssertOp = ISD::AssertSext;
else if (CLI.RetZExt)
@@ -7295,8 +7782,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
void TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
- SDValue Res = LowerOperation(SDValue(N, 0), DAG);
- if (Res.getNode())
+ if (SDValue Res = LowerOperation(SDValue(N, 0), DAG))
Results.push_back(Res);
}
@@ -7394,6 +7880,10 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Flags.setInReg();
if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
Flags.setSRet();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
+ Flags.setSwiftSelf();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftError))
+ Flags.setSwiftError();
if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
Flags.setByVal();
if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) {
@@ -7483,7 +7973,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
PointerType::getUnqual(F.getReturnType()), ValueVTs);
MVT VT = ValueVTs[0].getSimpleVT();
MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
- ISD::NodeType AssertOp = ISD::DELETED_NODE;
+ Optional<ISD::NodeType> AssertOp = None;
SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
RegVT, VT, nullptr, AssertOp);
@@ -7524,7 +8014,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT);
if (!I->use_empty()) {
- ISD::NodeType AssertOp = ISD::DELETED_NODE;
+ Optional<ISD::NodeType> AssertOp;
if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
AssertOp = ISD::AssertSext;
else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
@@ -7559,6 +8049,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
}
+ // Update SwiftErrorMap.
+ if (Res.getOpcode() == ISD::CopyFromReg && TLI->supportSwiftError() &&
+ F.getAttributes().hasAttribute(Idx, Attribute::SwiftError)) {
+ unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB][0] = Reg;
+ }
+
// If this argument is live outside of the entry block, insert a copy from
// wherever we got it to the vreg that other BB's will reference it as.
if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
@@ -7656,7 +8154,8 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
EVT VT = ValueVTs[vti];
unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
for (unsigned i = 0, e = NumRegisters; i != e; ++i)
- FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
+ FuncInfo.PHINodesToUpdate.push_back(
+ std::make_pair(&*MBBI++, Reg + i));
Reg += NumRegisters;
}
}
@@ -7708,7 +8207,8 @@ void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,
unsigned *TotalCases, unsigned First,
- unsigned Last) {
+ unsigned Last,
+ unsigned Density) {
assert(Last >= First);
assert(TotalCases[Last] >= TotalCases[First]);
@@ -7729,10 +8229,15 @@ bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,
assert(NumCases < UINT64_MAX / 100);
assert(Range >= NumCases);
- return NumCases * 100 >= Range * MinJumpTableDensity;
+ return NumCases * 100 >= Range * Density;
}
-static inline bool areJTsAllowed(const TargetLowering &TLI) {
+static inline bool areJTsAllowed(const TargetLowering &TLI,
+ const SwitchInst *SI) {
+ const Function *Fn = SI->getParent()->getParent();
+ if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
+ return false;
+
return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
}
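A note on the density test used above: `NumCases * 100 >= Range * Density` is the divide-free way of asking whether the cases cover at least Density percent of the spanned value range (40 was the previous fixed MinJumpTableDensity, and a lower bar is used under optsize). A standalone sketch of the same predicate, with a hypothetical helper name that is not part of the patch:

  #include <cstdint>

  // Hypothetical standalone version of the density check; Density is a
  // percentage, e.g. 40 as before, or the smaller optsize value.
  bool denseEnough(uint64_t NumCases, uint64_t Range, unsigned Density) {
    // Equivalent to (NumCases / Range) >= (Density / 100) without division.
    return NumCases * 100 >= Range * Density;
  }

  // Example: 10 cases over a range of 20 values is 50% dense, so it passes a
  // 40% threshold (1000 >= 800) but would fail a 60% one (1000 < 1200).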
@@ -7826,7 +8331,7 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
#endif
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!areJTsAllowed(TLI))
+ if (!areJTsAllowed(TLI, SI))
return;
const int64_t N = Clusters.size();
@@ -7843,7 +8348,11 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
TotalCases[i] += TotalCases[i - 1];
}
- if (N >= MinJumpTableSize && isDense(Clusters, &TotalCases[0], 0, N - 1)) {
+ unsigned MinDensity = JumpTableDensity;
+ if (DefaultMBB->getParent()->getFunction()->optForSize())
+ MinDensity = OptsizeJumpTableDensity;
+ if (N >= MinJumpTableSize &&
+ isDense(Clusters, &TotalCases[0], 0, N - 1, MinDensity)) {
// Cheap case: the whole range might be suitable for jump table.
CaseCluster JTCluster;
if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
@@ -7888,7 +8397,7 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
// Search for a solution that results in fewer partitions.
for (int64_t j = N - 1; j > i; j--) {
// Try building a partition from Clusters[i..j].
- if (isDense(Clusters, &TotalCases[0], i, j)) {
+ if (isDense(Clusters, &TotalCases[0], i, j, MinDensity)) {
unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
bool IsTable = j - i + 1 >= MinJumpTableSize;
unsigned Tables = IsTable + (j == N - 1 ? 0 : NumTables[j + 1]);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 8fb85ff6ecc7..b9888ae87639 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -18,14 +18,14 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Statepoint.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLowering.h"
+#include <utility>
#include <vector>
namespace llvm {
@@ -101,8 +101,8 @@ class SelectionDAGBuilder {
unsigned SDNodeOrder;
public:
DanglingDebugInfo() : DI(nullptr), dl(DebugLoc()), SDNodeOrder(0) { }
- DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO) :
- DI(di), dl(DL), SDNodeOrder(SDNO) { }
+ DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO)
+ : DI(di), dl(std::move(DL)), SDNodeOrder(SDNO) {}
const DbgValueInst* getDI() { return DI; }
DebugLoc getdl() { return dl; }
unsigned getSDNodeOrder() { return SDNodeOrder; }
@@ -260,8 +260,9 @@ private:
};
struct JumpTableHeader {
JumpTableHeader(APInt F, APInt L, const Value *SV, MachineBasicBlock *H,
- bool E = false):
- First(F), Last(L), SValue(SV), HeaderBB(H), Emitted(E) {}
+ bool E = false)
+ : First(std::move(F)), Last(std::move(L)), SValue(SV), HeaderBB(H),
+ Emitted(E) {}
APInt First;
APInt Last;
const Value *SValue;
@@ -286,9 +287,9 @@ private:
BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT,
bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D,
BitTestInfo C, BranchProbability Pr)
- : First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E),
- ContiguousRange(CR), Parent(P), Default(D), Cases(std::move(C)),
- Prob(Pr) {}
+ : First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg),
+ RegVT(RgVT), Emitted(E), ContiguousRange(CR), Parent(P), Default(D),
+ Cases(std::move(C)), Prob(Pr) {}
APInt First;
APInt Range;
const Value *SValue;
@@ -303,12 +304,9 @@ private:
BranchProbability DefaultProb;
};
- /// Minimum jump table density, in percent.
- enum { MinJumpTableDensity = 40 };
-
/// Check whether a range of clusters is dense enough for a jump table.
bool isDense(const CaseClusterVector &Clusters, unsigned *TotalCases,
- unsigned First, unsigned Last);
+ unsigned First, unsigned Last, unsigned MinDensity);
/// Build a jump table cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.
@@ -457,7 +455,14 @@ private:
///
/// c. After we finish selecting the basic block, in FinishBasicBlock if
/// the StackProtectorDescriptor attached to the SelectionDAGBuilder is
- /// initialized, we first find a splice point in the parent basic block
+ /// initialized, we produce the validation code with one of these
+ /// techniques:
+ /// 1) with a call to a guard check function
+ /// 2) with inlined instrumentation
+ ///
+ /// 1) We insert a call to the check function before the terminator.
+ ///
+ /// 2) We first find a splice point in the parent basic block
/// before the terminator and then splice the terminator of said basic
/// block into the success basic block. Then we code-gen a new tail for
/// the parent basic block consisting of the two loads, the comparison,
@@ -467,29 +472,31 @@ private:
/// the same function, use the same failure basic block).
class StackProtectorDescriptor {
public:
- StackProtectorDescriptor() : ParentMBB(nullptr), SuccessMBB(nullptr),
- FailureMBB(nullptr), Guard(nullptr),
- GuardReg(0) { }
+ StackProtectorDescriptor()
+ : ParentMBB(nullptr), SuccessMBB(nullptr), FailureMBB(nullptr) {}
/// Returns true if all fields of the stack protector descriptor are
/// initialized implying that we should/are ready to emit a stack protector.
bool shouldEmitStackProtector() const {
- return ParentMBB && SuccessMBB && FailureMBB && Guard;
+ return ParentMBB && SuccessMBB && FailureMBB;
+ }
+
+ bool shouldEmitFunctionBasedCheckStackProtector() const {
+ return ParentMBB && !SuccessMBB && !FailureMBB;
}
/// Initialize the stack protector descriptor structure for a new basic
/// block.
- void initialize(const BasicBlock *BB,
- MachineBasicBlock *MBB,
- const CallInst &StackProtCheckCall) {
+ void initialize(const BasicBlock *BB, MachineBasicBlock *MBB,
+ bool FunctionBasedInstrumentation) {
// Make sure we are not initialized yet.
assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
"already initialized!");
ParentMBB = MBB;
- SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
- FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
- if (!Guard)
- Guard = StackProtCheckCall.getArgOperand(0);
+ if (!FunctionBasedInstrumentation) {
+ SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
+ FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
+ }
}
/// Reset state that changes when we handle different basic blocks.
@@ -518,17 +525,11 @@ private:
/// always the same.
void resetPerFunctionState() {
FailureMBB = nullptr;
- Guard = nullptr;
- GuardReg = 0;
}
MachineBasicBlock *getParentMBB() { return ParentMBB; }
MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
MachineBasicBlock *getFailureMBB() { return FailureMBB; }
- const Value *getGuard() { return Guard; }
-
- unsigned getGuardReg() const { return GuardReg; }
- void setGuardReg(unsigned R) { GuardReg = R; }
private:
/// The basic block for which we are generating the stack protector.
@@ -548,13 +549,6 @@ private:
/// contain a call to __stack_chk_fail().
MachineBasicBlock *FailureMBB;
- /// The guard variable which we will compare against the stored value in the
- /// stack protector stack slot.
- const Value *Guard;
-
- /// The virtual register holding the stack guard value.
- unsigned GuardReg;
-
/// Add a successor machine basic block to ParentMBB. If the successor mbb
/// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
/// block will be created. Assign a large weight if IsLikely is true.
@@ -708,28 +702,88 @@ public:
void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
const BasicBlock *EHPadBB = nullptr);
- std::pair<SDValue, SDValue> lowerCallOperands(
- ImmutableCallSite CS,
- unsigned ArgIdx,
- unsigned NumArgs,
- SDValue Callee,
- Type *ReturnTy,
- const BasicBlock *EHPadBB = nullptr,
- bool IsPatchPoint = false);
+ // Lower range metadata from 0 to N to assert zext to an integer of the
+ // nearest floor power of two.
+ SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
+ SDValue Op);
+
+ void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
+ ImmutableCallSite CS, unsigned ArgIdx,
+ unsigned NumArgs, SDValue Callee,
+ Type *ReturnTy, bool IsPatchPoint);
+
+ std::pair<SDValue, SDValue>
+ lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+ const BasicBlock *EHPadBB = nullptr);
/// UpdateSplitBlock - When an MBB was split during scheduling, update the
/// references that need to refer to the last resulting block.
void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
+ /// Describes a gc.statepoint or a gc.statepoint like thing for the purposes
+ /// of lowering into a STATEPOINT node.
+ struct StatepointLoweringInfo {
+ /// Bases[i] is the base pointer for Ptrs[i]. Together they denote the set
+ /// of gc pointers this STATEPOINT has to relocate.
+ SmallVector<const Value *, 16> Bases;
+ SmallVector<const Value *, 16> Ptrs;
+
+ /// The set of gc.relocate calls associated with this gc.statepoint.
+ SmallVector<const GCRelocateInst *, 16> GCRelocates;
+
+ /// The full list of gc arguments to the gc.statepoint being lowered.
+ ArrayRef<const Use> GCArgs;
+
+ /// The gc.statepoint instruction.
+ const Instruction *StatepointInstr = nullptr;
+
+ /// The list of gc transition arguments present in the gc.statepoint being
+ /// lowered.
+ ArrayRef<const Use> GCTransitionArgs;
+
+ /// The ID that the resulting STATEPOINT instruction has to report.
+ unsigned ID = -1;
+
+ /// Information regarding the underlying call instruction.
+ TargetLowering::CallLoweringInfo CLI;
+
+ /// The deoptimization state associated with this gc.statepoint call, if
+ /// any.
+ ArrayRef<const Use> DeoptState;
+
+ /// Flags associated with the meta arguments being lowered.
+ uint64_t StatepointFlags = -1;
+
+ /// The number of patchable bytes the call needs to get lowered into.
+ unsigned NumPatchBytes = -1;
+
+ /// The exception handling unwind destination, in case this represents an
+ /// invoke of gc.statepoint.
+ const BasicBlock *EHPadBB = nullptr;
+
+ explicit StatepointLoweringInfo(SelectionDAG &DAG) : CLI(DAG) {}
+ };
+
+ /// Lower \p SLI into a STATEPOINT instruction.
+ SDValue LowerAsSTATEPOINT(StatepointLoweringInfo &SLI);
+
// This function is responsible for the whole statepoint lowering process.
// It uniformly handles invoke and call statepoints.
void LowerStatepoint(ImmutableStatepoint Statepoint,
const BasicBlock *EHPadBB = nullptr);
-private:
- std::pair<SDValue, SDValue>
- lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
- const BasicBlock *EHPadBB = nullptr);
+ void LowerCallSiteWithDeoptBundle(ImmutableCallSite CS, SDValue Callee,
+ const BasicBlock *EHPadBB);
+
+ void LowerDeoptimizeCall(const CallInst *CI);
+ void LowerDeoptimizingReturn();
+
+ void LowerCallSiteWithDeoptBundleImpl(ImmutableCallSite CS, SDValue Callee,
+ const BasicBlock *EHPadBB,
+ bool VarArgDisallowed,
+ bool ForceVoidReturnTy);
+
+private:
// Terminator instructions.
void visitRet(const ReturnInst &I);
void visitBr(const BranchInst &I);
@@ -840,6 +894,8 @@ private:
bool visitBinaryFloatCall(const CallInst &I, unsigned Opcode);
void visitAtomicLoad(const LoadInst &I);
void visitAtomicStore(const StoreInst &I);
+ void visitLoadFromSwiftError(const LoadInst &I);
+ void visitStoreToSwiftError(const StoreInst &I);
void visitInlineAsm(ImmutableCallSite CS);
const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
@@ -853,10 +909,9 @@ private:
void visitPatchpoint(ImmutableCallSite CS,
const BasicBlock *EHPadBB = nullptr);
- // These three are implemented in StatepointLowering.cpp
- void visitStatepoint(const CallInst &I);
+ // These two are implemented in StatepointLowering.cpp
void visitGCRelocate(const GCRelocateInst &I);
- void visitGCResult(const CallInst &I);
+ void visitGCResult(const GCResultInst &I);
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -870,6 +925,8 @@ private:
void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+ void emitInlineAsmError(ImmutableCallSite CS, const Twine &Message);
+
/// EmitFuncArgumentDbgValue - If V is an function argument then create
/// corresponding DBG_VALUE machine instruction for it now. At the end of
/// instruction selection, they will be inserted to the entry BB.
@@ -937,8 +994,7 @@ struct RegsForValue {
/// Chain/Flag as the input and updates them for the output Chain/Flag.
/// If the Flag pointer is NULL, no flag is used.
SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
- SDLoc dl,
- SDValue &Chain, SDValue *Flag,
+ const SDLoc &dl, SDValue &Chain, SDValue *Flag,
const Value *V = nullptr) const;
/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified
@@ -946,18 +1002,16 @@ struct RegsForValue {
/// as the input and updates them for the output Chain/Flag. If the Flag
/// pointer is nullptr, no flag is used. If V is not nullptr, then it is used
/// in printing better diagnostic messages on error.
- void
- getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SDValue &Chain,
- SDValue *Flag, const Value *V = nullptr,
- ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
+ void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, SDValue *Flag, const Value *V = nullptr,
+ ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
/// AddInlineAsmOperands - Add this value to the specified inlineasm node
/// operand list. This adds the code marker, matching input operand index
/// (if applicable), and includes the number of values added into it.
- void AddInlineAsmOperands(unsigned Kind,
- bool HasMatching, unsigned MatchingIdx, SDLoc dl,
- SelectionDAG &DAG,
- std::vector<SDValue> &Ops) const;
+ void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
+ unsigned MatchingIdx, const SDLoc &dl,
+ SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
};
} // end namespace llvm
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index a1c6c4c1dd63..401da059dedc 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -101,7 +101,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
case ISD::RETURNADDR: return "RETURNADDR";
case ISD::FRAMEADDR: return "FRAMEADDR";
- case ISD::LOCAL_RECOVER: return "LOCAL_RECOVER";
+ case ISD::LOCAL_RECOVER: return "LOCAL_RECOVER";
case ISD::READ_REGISTER: return "READ_REGISTER";
case ISD::WRITE_REGISTER: return "WRITE_REGISTER";
case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
@@ -202,6 +202,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FREM: return "frem";
case ISD::FCOPYSIGN: return "fcopysign";
case ISD::FGETSIGN: return "fgetsign";
+ case ISD::FCANONICALIZE: return "fcanonicalize";
case ISD::FPOW: return "fpow";
case ISD::SMIN: return "smin";
case ISD::SMAX: return "smax";
@@ -378,7 +379,7 @@ static Printable PrintNodeId(const SDNode &Node) {
});
}
-void SDNode::dump() const { dump(nullptr); }
+LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
void SDNode::dump(const SelectionDAG *G) const {
print(dbgs(), G);
dbgs() << '\n';
@@ -590,7 +591,7 @@ static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
N->dump(G);
}
-void SelectionDAG::dump() const {
+LLVM_DUMP_METHOD void SelectionDAG::dump() const {
dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n";
for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end();
@@ -630,7 +631,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
}
}
-typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet;
+typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet;
static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
const SelectionDAG *G, VisitedSDNodeSet &once) {
if (!once.insert(N).second) // If we've been here before, return now.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c075da4738ad..1d61657194c5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -21,10 +21,10 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,8 +32,8 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -59,6 +59,7 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
+
using namespace llvm;
#define DEBUG_TYPE "isel"
@@ -317,7 +318,7 @@ namespace llvm {
"Unknown sched type!");
return createILPListDAGScheduler(IS, OptLevel);
}
-}
+} // end namespace llvm
// EmitInstrWithCustomInserter - This method should be implemented by targets
// that mark instructions with the 'usesCustomInserter' flag. These
@@ -329,7 +330,7 @@ namespace llvm {
// are modified, the method should insert pairs of <OldSucc, NewSucc> into the
// DenseMap.
MachineBasicBlock *
-TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
#ifndef NDEBUG
dbgs() << "If a target marks an instruction with "
@@ -339,9 +340,9 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
llvm_unreachable(nullptr);
}
-void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- assert(!MI->hasPostISelHook() &&
+ assert(!MI.hasPostISelHook() &&
"If a target marks an instruction with 'hasPostISelHook', "
"it must implement TargetLowering::AdjustInstrPostInstrSelection!");
}
@@ -376,6 +377,8 @@ SelectionDAGISel::~SelectionDAGISel() {
void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
+ AU.addRequired<StackProtector>();
+ AU.addPreserved<StackProtector>();
AU.addPreserved<GCModuleInfo>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -440,7 +443,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TM.resetTargetOptions(Fn);
// Reset OptLevel to None for optnone functions.
CodeGenOpt::Level NewOptLevel = OptLevel;
- if (Fn.hasFnAttribute(Attribute::OptimizeNone))
+ if (OptLevel != CodeGenOpt::None && skipFunction(Fn))
NewOptLevel = CodeGenOpt::None;
OptLevelChanger OLC(*this, NewOptLevel);
@@ -468,11 +471,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MF->setHasInlineAsm(false);
FuncInfo->SplitCSR = false;
- SmallVector<MachineBasicBlock*, 4> Returns;
// We split CSR if the target supports it for the given function
// and the function has only return exits.
- if (TLI->supportSplitCSR(MF)) {
+ if (OptLevel != CodeGenOpt::None && TLI->supportSplitCSR(MF)) {
FuncInfo->SplitCSR = true;
// Collect all the return blocks.
@@ -481,12 +483,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
continue;
const TerminatorInst *Term = BB.getTerminator();
- if (isa<UnreachableInst>(Term))
+ if (isa<UnreachableInst>(Term) || isa<ReturnInst>(Term))
continue;
- if (isa<ReturnInst>(Term)) {
- Returns.push_back(FuncInfo->MBBMap[&BB]);
- continue;
- }
// Bail out if the exit block is not Return nor Unreachable.
FuncInfo->SplitCSR = false;
@@ -508,8 +506,21 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII);
// Insert copies in the entry block and the return blocks.
- if (FuncInfo->SplitCSR)
+ if (FuncInfo->SplitCSR) {
+ SmallVector<MachineBasicBlock*, 4> Returns;
+ // Collect all the return blocks.
+ for (MachineBasicBlock &MBB : mf) {
+ if (!MBB.succ_empty())
+ continue;
+
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+ if (Term != MBB.end() && Term->isReturn()) {
+ Returns.push_back(&MBB);
+ continue;
+ }
+ }
TLI->insertCopiesSplitCSR(EntryMBB, Returns);
+ }
DenseMap<unsigned, unsigned> LiveInMap;
if (!FuncInfo->ArgDbgValues.empty())
@@ -669,7 +680,7 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
}
void SelectionDAGISel::ComputeLiveOutVRegInfo() {
- SmallPtrSet<SDNode*, 128> VisitedNodes;
+ SmallPtrSet<SDNode*, 16> VisitedNodes;
SmallVector<SDNode*, 128> Worklist;
Worklist.push_back(CurDAG->getRoot().getNode());
@@ -854,7 +865,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
Scheduler->Run(CurDAG, FuncInfo->MBB);
}
- if (ViewSUnitDAGs && MatchFilterBB) Scheduler->viewGraph();
+ if (ViewSUnitDAGs && MatchFilterBB)
+ Scheduler->viewGraph();
// Emit machine code to BB. This can change 'BB' to the last block being
// inserted into.
@@ -937,23 +949,7 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->use_empty())
continue;
- SDNode *ResNode = Select(Node);
-
- // FIXME: This is pretty gross. 'Select' should be changed to not return
- // anything at all and this code should be nuked with a tactical strike.
-
- // If node should not be replaced, continue with the next one.
- if (ResNode == Node || Node->getOpcode() == ISD::DELETED_NODE)
- continue;
- // Replace node.
- if (ResNode) {
- ReplaceUses(Node, ResNode);
- }
-
- // If after the replacement this node is not used any more,
- // remove this dead node.
- if (Node->use_empty()) // Don't delete EntryToken, etc.
- CurDAG->RemoveDeadNode(Node);
+ Select(Node);
}
CurDAG->setRoot(Dummy.getValue());
@@ -1147,7 +1143,125 @@ static void collectFailStats(const Instruction *I) {
case Instruction::LandingPad: NumFastIselFailLandingPad++; return;
}
}
-#endif
+#endif // NDEBUG
+
+/// Set up SwiftErrorVals by going through the function. If the function has a
+/// swifterror argument, it will be the first entry.
+static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
+ FunctionLoweringInfo *FuncInfo) {
+ if (!TLI->supportSwiftError())
+ return;
+
+ FuncInfo->SwiftErrorVals.clear();
+ FuncInfo->SwiftErrorMap.clear();
+ FuncInfo->SwiftErrorWorklist.clear();
+
+ // Check if function has a swifterror argument.
+ for (Function::const_arg_iterator AI = Fn.arg_begin(), AE = Fn.arg_end();
+ AI != AE; ++AI)
+ if (AI->hasSwiftErrorAttr())
+ FuncInfo->SwiftErrorVals.push_back(&*AI);
+
+ for (const auto &LLVMBB : Fn)
+ for (const auto &Inst : LLVMBB) {
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&Inst))
+ if (Alloca->isSwiftError())
+ FuncInfo->SwiftErrorVals.push_back(Alloca);
+ }
+}
+
+/// For each basic block, merge incoming swifterror values or simply propagate
+/// them. The merged results will be saved in SwiftErrorMap. For predecessors
+/// that are not yet visited, we create virtual registers to hold the swifterror
+/// values and save them in SwiftErrorWorklist.
+static void mergeIncomingSwiftErrors(FunctionLoweringInfo *FuncInfo,
+ const TargetLowering *TLI,
+ const TargetInstrInfo *TII,
+ const BasicBlock *LLVMBB,
+ SelectionDAGBuilder *SDB) {
+ if (!TLI->supportSwiftError())
+ return;
+
+ // We should only do this when we have a swifterror parameter or a
+ // swifterror alloca.
+ if (FuncInfo->SwiftErrorVals.empty())
+ return;
+
+ // At beginning of a basic block, insert PHI nodes or get the virtual
+ // register from the only predecessor, and update SwiftErrorMap; if one
+ // of the predecessors is not visited, update SwiftErrorWorklist.
+ // At end of a basic block, if a block is in SwiftErrorWorklist, insert copy
+ // to sync up the virtual register assignment.
+
+ // Always create a virtual register for each swifterror value in entry block.
+ auto &DL = SDB->DAG.getDataLayout();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ if (pred_begin(LLVMBB) == pred_end(LLVMBB)) {
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // Assign Undef to Vreg. We construct MI directly to make sure it works
+ // with FastISel.
+ BuildMI(*FuncInfo->MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+ }
+ return;
+ }
+
+ if (auto *UniquePred = LLVMBB->getUniquePredecessor()) {
+ auto *UniquePredMBB = FuncInfo->MBBMap[UniquePred];
+ if (!FuncInfo->SwiftErrorMap.count(UniquePredMBB)) {
+ // Update SwiftErrorWorklist with a new virtual register.
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ FuncInfo->SwiftErrorWorklist[UniquePredMBB].push_back(VReg);
+ // Propagate the information from the single predecessor.
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+ }
+ return;
+ }
+ // Propagate the information from the single predecessor.
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB] =
+ FuncInfo->SwiftErrorMap[UniquePredMBB];
+ return;
+ }
+
+ // For the case of multiple predecessors, update SwiftErrorWorklist.
+ // Handle the case where we have two or more predecessors being the same.
+ for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+ PI != PE; ++PI) {
+ auto *PredMBB = FuncInfo->MBBMap[*PI];
+ if (!FuncInfo->SwiftErrorMap.count(PredMBB) &&
+ !FuncInfo->SwiftErrorWorklist.count(PredMBB)) {
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // When we actually visit the basic block PredMBB, we will materialize
+ // the virtual register assignment in copySwiftErrorsToFinalVRegs.
+ FuncInfo->SwiftErrorWorklist[PredMBB].push_back(VReg);
+ }
+ }
+ }
+
+ // For the case of multiple predecessors, create a virtual register for
+ // each swifterror value and generate Phi node.
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+
+ MachineInstrBuilder SwiftErrorPHI = BuildMI(*FuncInfo->MBB,
+ FuncInfo->MBB->begin(), SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::PHI), VReg);
+ for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+ PI != PE; ++PI) {
+ auto *PredMBB = FuncInfo->MBBMap[*PI];
+ unsigned SwiftErrorReg = FuncInfo->SwiftErrorMap.count(PredMBB) ?
+ FuncInfo->SwiftErrorMap[PredMBB][I] :
+ FuncInfo->SwiftErrorWorklist[PredMBB][I];
+ SwiftErrorPHI.addReg(SwiftErrorReg)
+ .addMBB(PredMBB);
+ }
+ }
+}
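A simplified, self-contained model of the bookkeeping performed by the function above may help: one value per block instead of a vector, plain integers instead of virtual registers, and no IMPLICIT_DEF/PHI machine instructions. Names and types here are hypothetical, not the LLVM API.

  #include <map>
  #include <vector>

  struct Block { std::vector<Block *> Preds; };

  struct SwiftErrorModel {
    unsigned NextVReg = 1;
    std::map<Block *, unsigned> ValueInBlock;    // role of SwiftErrorMap
    std::map<Block *, unsigned> PendingFromPred; // role of SwiftErrorWorklist

    unsigned valueOnEntry(Block *BB) {
      if (BB->Preds.empty())                     // entry block: fresh definition
        return ValueInBlock[BB] = NextVReg++;
      if (BB->Preds.size() == 1) {               // unique predecessor
        Block *P = BB->Preds.front();
        if (ValueInBlock.count(P))               // already visited: propagate
          return ValueInBlock[BB] = ValueInBlock[P];
        return ValueInBlock[BB] = PendingFromPred[P] = NextVReg++;
      }
      // Multiple predecessors: reserve a register for each unvisited one and
      // merge everything with what the real code emits as a PHI.
      for (Block *P : BB->Preds)
        if (!ValueInBlock.count(P) && !PendingFromPred.count(P))
          PendingFromPred[P] = NextVReg++;
      return ValueInBlock[BB] = NextVReg++;      // the PHI's result register
    }
  };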
void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Initialize the Fast-ISel state, if needed.
@@ -1155,6 +1269,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
if (TM.Options.EnableFastISel)
FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
+ setupSwiftErrorVals(Fn, TLI, FuncInfo);
+
// Iterate over all basic blocks in the function.
ReversePostOrderTraversal<const Function*> RPOT(&Fn);
for (ReversePostOrderTraversal<const Function*>::rpo_iterator
@@ -1193,6 +1309,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
if (!FuncInfo->MBB)
continue; // Some blocks like catchpads have no code or MBB.
FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
+ mergeIncomingSwiftErrors(FuncInfo, TLI, TII, LLVMBB, SDB);
// Setup an EH landing-pad block.
FuncInfo->ExceptionPointerVirtReg = 0;
@@ -1228,7 +1345,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// where they are, so we can be sure to emit subsequent instructions
// after them.
if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
- FastIS->setLastLocalValue(std::prev(FuncInfo->InsertPt));
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
else
FastIS->setLastLocalValue(nullptr);
}
@@ -1345,6 +1462,12 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
LowerArguments(Fn);
}
}
+ if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
+ bool FunctionBasedInstrumentation =
+ TLI->getSSPStackGuardCheck(*Fn.getParent());
+ SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->MBBMap[LLVMBB],
+ FunctionBasedInstrumentation);
+ }
if (Begin != BI)
++NumDAGBlocks;
@@ -1376,15 +1499,15 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
/// terminator instructions so we can satisfy ABI constraints. A partial
/// terminator sequence is an improper subset of a terminator sequence (i.e. it
/// may be the whole terminator sequence).
-static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
+static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
// If we do not have a copy or an implicit def, we return true if and only if
// MI is a debug value.
- if (!MI->isCopy() && !MI->isImplicitDef())
+ if (!MI.isCopy() && !MI.isImplicitDef())
// Sometimes DBG_VALUE MI sneak in between the copies from the vregs to the
// physical registers if there is debug info associated with the terminator
// of our mbb. We want to include said debug info in our terminator
// sequence, so we return true in that case.
- return MI->isDebugValue();
+ return MI.isDebugValue();
// We have left the terminator sequence if we are not doing one of the
// following:
@@ -1394,18 +1517,18 @@ static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
// 3. Defining a register via an implicit def.
// OPI should always be a register definition...
- MachineInstr::const_mop_iterator OPI = MI->operands_begin();
+ MachineInstr::const_mop_iterator OPI = MI.operands_begin();
if (!OPI->isReg() || !OPI->isDef())
return false;
// Defining any register via an implicit def is always ok.
- if (MI->isImplicitDef())
+ if (MI.isImplicitDef())
return true;
// Grab the copy source...
MachineInstr::const_mop_iterator OPI2 = OPI;
++OPI2;
- assert(OPI2 != MI->operands_end()
+ assert(OPI2 != MI.operands_end()
&& "Should have a copy implying we should have 2 arguments.");
// Make sure that the copy dest is not a vreg when the copy source is a
@@ -1432,7 +1555,7 @@ static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
/// terminator, but additionally the copies that move the vregs into the
/// physical registers.
static MachineBasicBlock::iterator
-FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
+FindSplitPointForStackProtector(MachineBasicBlock *BB) {
MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
//
if (SplitPoint == BB->begin())
@@ -1442,7 +1565,7 @@ FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
MachineBasicBlock::iterator Previous = SplitPoint;
--Previous;
- while (MIIsInTerminatorSequence(Previous)) {
+ while (MIIsInTerminatorSequence(*Previous)) {
SplitPoint = Previous;
if (Previous == Start)
break;
@@ -1454,7 +1577,6 @@ FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
void
SelectionDAGISel::FinishBasicBlock() {
-
DEBUG(dbgs() << "Total amount of phi nodes to update: "
<< FuncInfo->PHINodesToUpdate.size() << "\n";
for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i)
@@ -1474,7 +1596,23 @@ SelectionDAGISel::FinishBasicBlock() {
}
// Handle stack protector.
- if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+ if (SDB->SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) {
+ // The target provides a guard check function. There is no need to
+ // generate error handling code or to split current basic block.
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+
+ // Add load and check to the basicblock.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt =
+ FindSplitPointForStackProtector(ParentMBB);
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ } else if (SDB->SPDescriptor.shouldEmitStackProtector()) {
MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
@@ -1485,7 +1623,7 @@ SelectionDAGISel::FinishBasicBlock() {
// register allocation issues caused by us splitting the parent mbb. The
// register allocator will clean up said virtual copies later on.
MachineBasicBlock::iterator SplitPoint =
- FindSplitPointForStackProtector(ParentMBB, SDB->getCurDebugLoc());
+ FindSplitPointForStackProtector(ParentMBB);
// Splice the terminator of ParentMBB into SuccessMBB.
SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
@@ -1502,7 +1640,7 @@ SelectionDAGISel::FinishBasicBlock() {
// CodeGen Failure MBB if we have not codegened it yet.
MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
- if (!FailureMBB->size()) {
+ if (FailureMBB->empty()) {
FuncInfo->MBB = FailureMBB;
FuncInfo->InsertPt = FailureMBB->end();
SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
@@ -1515,52 +1653,61 @@ SelectionDAGISel::FinishBasicBlock() {
SDB->SPDescriptor.resetPerBBState();
}
- for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
+ // Lower each BitTestBlock.
+ for (auto &BTB : SDB->BitTestCases) {
// Lower header first, if it wasn't already lowered
- if (!SDB->BitTestCases[i].Emitted) {
+ if (!BTB.Emitted) {
// Set the current basic block to the mbb we wish to insert the code into
- FuncInfo->MBB = SDB->BitTestCases[i].Parent;
+ FuncInfo->MBB = BTB.Parent;
FuncInfo->InsertPt = FuncInfo->MBB->end();
// Emit the code
- SDB->visitBitTestHeader(SDB->BitTestCases[i], FuncInfo->MBB);
+ SDB->visitBitTestHeader(BTB, FuncInfo->MBB);
CurDAG->setRoot(SDB->getRoot());
SDB->clear();
CodeGenAndEmitDAG();
}
- BranchProbability UnhandledProb = SDB->BitTestCases[i].Prob;
- for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) {
- UnhandledProb -= SDB->BitTestCases[i].Cases[j].ExtraProb;
+ BranchProbability UnhandledProb = BTB.Prob;
+ for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) {
+ UnhandledProb -= BTB.Cases[j].ExtraProb;
// Set the current basic block to the mbb we wish to insert the code into
- FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+ FuncInfo->MBB = BTB.Cases[j].ThisBB;
FuncInfo->InsertPt = FuncInfo->MBB->end();
// Emit the code
// If all cases cover a contiguous range, it is not necessary to jump to
// the default block after the last bit test fails. This is because the
// range check during bit test header creation has guaranteed that every
- // case here doesn't go outside the range.
+ // case here doesn't go outside the range. In this case, there is no need
+ // to perform the last bit test, as it will always be true. Instead, make
+ // the second-to-last bit-test fall through to the target of the last bit
+ // test, and delete the last bit test.
+
MachineBasicBlock *NextMBB;
- if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej)
- NextMBB = SDB->BitTestCases[i].Cases[j + 1].TargetBB;
- else if (j + 1 != ej)
- NextMBB = SDB->BitTestCases[i].Cases[j + 1].ThisBB;
- else
- NextMBB = SDB->BitTestCases[i].Default;
+ if (BTB.ContiguousRange && j + 2 == ej) {
+ // Second-to-last bit-test with contiguous range: fall through to the
+ // target of the final bit test.
+ NextMBB = BTB.Cases[j + 1].TargetBB;
+ } else if (j + 1 == ej) {
+ // For the last bit test, fall through to Default.
+ NextMBB = BTB.Default;
+ } else {
+ // Otherwise, fall through to the next bit test.
+ NextMBB = BTB.Cases[j + 1].ThisBB;
+ }
- SDB->visitBitTestCase(SDB->BitTestCases[i],
- NextMBB,
- UnhandledProb,
- SDB->BitTestCases[i].Reg,
- SDB->BitTestCases[i].Cases[j],
+ SDB->visitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j],
FuncInfo->MBB);
CurDAG->setRoot(SDB->getRoot());
SDB->clear();
CodeGenAndEmitDAG();
- if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej)
+ if (BTB.ContiguousRange && j + 2 == ej) {
+ // Since we're not going to use the final bit test, remove it.
+ BTB.Cases.pop_back();
break;
+ }
}
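To make the contiguous-range reasoning above concrete: once the bit-test header's range check has passed, the value is guaranteed to hit one of the case masks, so the final mask test always succeeds and can be dropped. A self-contained illustration with made-up case values and targets (not the emitted MachineIR):

  #include <cassert>
  #include <cstdint>

  // Cases {0,...,5} are contiguous and split into two bit-test clusters.
  int pickTarget(uint32_t X) {
    const uint32_t Range = 6;
    const uint32_t MaskA = 0b010101;     // cases {0,2,4} -> target 1
    const uint32_t MaskB = 0b101010;     // cases {1,3,5} -> target 2
    if (X >= Range)
      return 0;                          // default, reached only via the header
    if ((uint32_t(1) << X) & MaskA)
      return 1;
    assert(((uint32_t(1) << X) & MaskB) && "the last test can never fail here");
    return 2;                            // so lowering simply falls through
  }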
// Update PHI Nodes
@@ -1571,16 +1718,18 @@ SelectionDAGISel::FinishBasicBlock() {
assert(PHI->isPHI() &&
"This is not a machine PHI node that we are updating!");
// This is "default" BB. We have two jumps to it. From "header" BB and
- // from last "case" BB.
- if (PHIBB == SDB->BitTestCases[i].Default)
- PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
- .addMBB(SDB->BitTestCases[i].Parent)
- .addReg(FuncInfo->PHINodesToUpdate[pi].second)
- .addMBB(SDB->BitTestCases[i].Cases.back().ThisBB);
+ // from last "case" BB, unless the latter was skipped.
+ if (PHIBB == BTB.Default) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(BTB.Parent);
+ if (!BTB.ContiguousRange) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+ .addMBB(BTB.Cases.back().ThisBB);
+ }
+ }
// One of "cases" BB.
- for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size();
+ for (unsigned j = 0, ej = BTB.Cases.size();
j != ej; ++j) {
- MachineBasicBlock* cBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+ MachineBasicBlock* cBB = BTB.Cases[j].ThisBB;
if (cBB->isSuccessor(PHIBB))
PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB);
}
@@ -1685,7 +1834,6 @@ SelectionDAGISel::FinishBasicBlock() {
SDB->SwitchCases.clear();
}
-
/// Create the scheduler. If a specific scheduler was specified
/// via the SchedulerRegistry, use it, otherwise select the
/// one preferred by the target.
@@ -1764,8 +1912,8 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
/// by tblgen. Others should not call it.
-void SelectionDAGISel::
-SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops, SDLoc DL) {
+void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops,
+ const SDLoc &DL) {
std::vector<SDValue> InOps;
std::swap(InOps, Ops);
@@ -1802,15 +1950,15 @@ SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops, SDLoc DL) {
// Otherwise, this is a memory operand. Ask the target to select it.
std::vector<SDValue> SelOps;
- if (SelectInlineAsmMemoryOperand(InOps[i+1],
- InlineAsm::getMemoryConstraintID(Flags),
- SelOps))
+ unsigned ConstraintID = InlineAsm::getMemoryConstraintID(Flags);
+ if (SelectInlineAsmMemoryOperand(InOps[i+1], ConstraintID, SelOps))
report_fatal_error("Could not match memory address. Inline asm"
" failure!");
// Add this to the output node.
unsigned NewFlags =
InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size());
+ NewFlags = InlineAsm::getFlagWordForMem(NewFlags, ConstraintID);
Ops.push_back(CurDAG->getTargetConstant(NewFlags, DL, MVT::i32));
Ops.insert(Ops.end(), SelOps.begin(), SelOps.end());
i += 2;
@@ -1956,7 +2104,7 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
}
-SDNode *SelectionDAGISel::Select_INLINEASM(SDNode *N) {
+void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
SDLoc DL(N);
std::vector<SDValue> Ops(N->op_begin(), N->op_end());
@@ -1965,11 +2113,11 @@ SDNode *SelectionDAGISel::Select_INLINEASM(SDNode *N) {
const EVT VTs[] = {MVT::Other, MVT::Glue};
SDValue New = CurDAG->getNode(ISD::INLINEASM, DL, VTs, Ops);
New->setNodeId(-1);
- return New.getNode();
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
}
-SDNode
-*SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
+void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
SDLoc dl(Op);
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
@@ -1979,11 +2127,11 @@ SDNode
SDValue New = CurDAG->getCopyFromReg(
Op->getOperand(0), dl, Reg, Op->getValueType(0));
New->setNodeId(-1);
- return New.getNode();
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
}
-SDNode
-*SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
+void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
SDLoc dl(Op);
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
@@ -1993,13 +2141,12 @@ SDNode
SDValue New = CurDAG->getCopyToReg(
Op->getOperand(0), dl, Reg, Op->getOperand(2));
New->setNodeId(-1);
- return New.getNode();
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
}
-
-
-SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) {
- return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF,N->getValueType(0));
+void SelectionDAGISel::Select_UNDEF(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
}
/// GetVBR - decode a vbr encoding whose top bit is set.
@@ -2019,15 +2166,11 @@ GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
return Val;
}
-
-/// UpdateChainsAndGlue - When a match is complete, this method updates uses of
-/// interior glue and chain results to use the new glue and chain results.
-void SelectionDAGISel::
-UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
- const SmallVectorImpl<SDNode*> &ChainNodesMatched,
- SDValue InputGlue,
- const SmallVectorImpl<SDNode*> &GlueResultNodesMatched,
- bool isMorphNodeTo) {
+/// When a match is complete, this method updates uses of interior chain results
+/// to use the new results.
+void SelectionDAGISel::UpdateChains(
+ SDNode *NodeToMatch, SDValue InputChain,
+ const SmallVectorImpl<SDNode *> &ChainNodesMatched, bool isMorphNodeTo) {
SmallVector<SDNode*, 4> NowDeadNodes;
// Now that all the normal results are replaced, we replace the chain and
@@ -2039,10 +2182,8 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
// Replace all the chain results with the final chain we ended up with.
for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
SDNode *ChainNode = ChainNodesMatched[i];
-
- // If this node was already deleted, don't look at it.
- if (ChainNode->getOpcode() == ISD::DELETED_NODE)
- continue;
+ assert(ChainNode->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted node left in chain");
// Don't replace the results of the root node if we're doing a
// MorphNodeTo.
@@ -2056,35 +2197,12 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
// If the node became dead and we haven't already seen it, delete it.
- if (ChainNode->use_empty() &&
+ if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
!std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode))
NowDeadNodes.push_back(ChainNode);
}
}
- // If the result produces glue, update any glue results in the matched
- // pattern with the glue result.
- if (InputGlue.getNode()) {
- // Handle any interior nodes explicitly marked.
- for (unsigned i = 0, e = GlueResultNodesMatched.size(); i != e; ++i) {
- SDNode *FRN = GlueResultNodesMatched[i];
-
- // If this node was already deleted, don't look at it.
- if (FRN->getOpcode() == ISD::DELETED_NODE)
- continue;
-
- assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Glue &&
- "Doesn't have a glue result");
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1),
- InputGlue);
-
- // If the node became dead and we haven't already seen it, delete it.
- if (FRN->use_empty() &&
- !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), FRN))
- NowDeadNodes.push_back(FRN);
- }
- }
-
if (!NowDeadNodes.empty())
CurDAG->RemoveDeadNodes(NowDeadNodes);
@@ -2108,8 +2226,9 @@ enum ChainResult {
/// already selected nodes "below" us.
static ChainResult
WalkChainUsers(const SDNode *ChainedNode,
- SmallVectorImpl<SDNode*> &ChainedNodesInPattern,
- SmallVectorImpl<SDNode*> &InteriorChainedNodes) {
+ SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
+ DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
+ SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
ChainResult Result = CR_Simple;
for (SDNode::use_iterator UI = ChainedNode->use_begin(),
@@ -2190,7 +2309,15 @@ WalkChainUsers(const SDNode *ChainedNode,
// as a new TokenFactor.
//
// To distinguish these two cases, do a recursive walk down the uses.
- switch (WalkChainUsers(User, ChainedNodesInPattern, InteriorChainedNodes)) {
+ auto MemoizeResult = TokenFactorResult.find(User);
+ bool Visited = MemoizeResult != TokenFactorResult.end();
+ // Recursively walk chain users only if the result is not memoized.
+ if (!Visited) {
+ auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
+ InteriorChainedNodes);
+ MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
+ }
+ switch (MemoizeResult->second) {
case CR_Simple:
// If the uses of the TokenFactor are just already-selected nodes, ignore
// it, it is "below" our pattern.
@@ -2210,9 +2337,10 @@ WalkChainUsers(const SDNode *ChainedNode,
// ultimate chain result of the generated code. We will also add its chain
// inputs as inputs to the ultimate TokenFactor we create.
Result = CR_LeadsToInteriorNode;
- ChainedNodesInPattern.push_back(User);
- InteriorChainedNodes.push_back(User);
- continue;
+ if (!Visited) {
+ ChainedNodesInPattern.push_back(User);
+ InteriorChainedNodes.push_back(User);
+ }
}
return Result;
@@ -2227,12 +2355,16 @@ WalkChainUsers(const SDNode *ChainedNode,
static SDValue
HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
SelectionDAG *CurDAG) {
+ // Used for memoization. Without it WalkChainUsers could take exponential
+ // time to run.
+ DenseMap<const SDNode *, ChainResult> TokenFactorResult;
// Walk all of the chained nodes we've matched, recursively scanning down the
// users of the chain result. This adds any TokenFactor nodes that are caught
// in between chained nodes to the chained and interior nodes list.
SmallVector<SDNode*, 3> InteriorChainedNodes;
for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
+ TokenFactorResult,
InteriorChainedNodes) == CR_InducesCycle)
return SDValue(); // Would induce a cycle.
}
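The DenseMap added above caches the per-TokenFactor result of WalkChainUsers; on a DAG, memoizing per-node results is what keeps a recursive walk linear rather than exponential when many paths reach the same node. A minimal standalone sketch of the idea on a generic graph (not the SelectionDAG API):

  #include <algorithm>
  #include <map>
  #include <vector>

  struct Node { std::vector<Node *> Users; };

  // Without Memo, a node reachable through many paths is re-visited once per
  // path, which blows up exponentially on diamond-shaped graphs.
  int classify(Node *N, std::map<Node *, int> &Memo) {
    auto It = Memo.find(N);
    if (It != Memo.end())
      return It->second;                 // reuse the previously computed result
    int Result = 0;                      // stand-in for CR_Simple
    for (Node *U : N->Users)
      Result = std::max(Result, classify(U, Memo));
    return Memo[N] = Result;
  }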
@@ -2322,8 +2454,10 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
// Otherwise, no replacement happened because the node already exists. Replace
// Uses of the old node with the new one.
- if (Res != Node)
+ if (Res != Node) {
CurDAG->ReplaceAllUsesWith(Node, Res);
+ CurDAG->RemoveDeadNode(Node);
+ }
return Res;
}
@@ -2534,7 +2668,6 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
}
namespace {
-
struct MatchScope {
/// FailIndex - If this match fails, this is the index to continue with.
unsigned FailIndex;
@@ -2552,7 +2685,7 @@ struct MatchScope {
SDValue InputChain, InputGlue;
/// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
- bool HasChainNodesMatched, HasGlueResultNodesMatched;
+ bool HasChainNodesMatched;
};
/// \brief A DAG update listener to keep the matching state
@@ -2591,11 +2724,11 @@ public:
J.setNode(E);
}
};
-}
+} // end anonymous namespace
-SDNode *SelectionDAGISel::
-SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
- unsigned TableSize) {
+void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
+ const unsigned char *MatcherTable,
+ unsigned TableSize) {
// FIXME: Should these even be selected? Handle these cases in the caller?
switch (NodeToMatch->getOpcode()) {
default:
@@ -2623,16 +2756,25 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
NodeToMatch->setNodeId(-1); // Mark selected.
- return nullptr;
+ return;
case ISD::AssertSext:
case ISD::AssertZext:
CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
NodeToMatch->getOperand(0));
- return nullptr;
- case ISD::INLINEASM: return Select_INLINEASM(NodeToMatch);
- case ISD::READ_REGISTER: return Select_READ_REGISTER(NodeToMatch);
- case ISD::WRITE_REGISTER: return Select_WRITE_REGISTER(NodeToMatch);
- case ISD::UNDEF: return Select_UNDEF(NodeToMatch);
+ CurDAG->RemoveDeadNode(NodeToMatch);
+ return;
+ case ISD::INLINEASM:
+ Select_INLINEASM(NodeToMatch);
+ return;
+ case ISD::READ_REGISTER:
+ Select_READ_REGISTER(NodeToMatch);
+ return;
+ case ISD::WRITE_REGISTER:
+ Select_WRITE_REGISTER(NodeToMatch);
+ return;
+ case ISD::UNDEF:
+ Select_UNDEF(NodeToMatch);
+ return;
}
assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
@@ -2665,7 +2807,6 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
// which ones they are. The result is captured into this list so that we can
// update the chain results when the pattern is complete.
SmallVector<SDNode*, 3> ChainNodesMatched;
- SmallVector<SDNode*, 3> GlueResultNodesMatched;
DEBUG(dbgs() << "ISEL: Starting pattern match on root node: ";
NodeToMatch->dump(CurDAG);
@@ -2771,7 +2912,6 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
NewEntry.InputChain = InputChain;
NewEntry.InputGlue = InputGlue;
NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty();
- NewEntry.HasGlueResultNodesMatched = !GlueResultNodesMatched.empty();
MatchScopes.push_back(NewEntry);
continue;
}
@@ -2816,6 +2956,18 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
continue;
}
+ case OPC_MoveChild0: case OPC_MoveChild1:
+ case OPC_MoveChild2: case OPC_MoveChild3:
+ case OPC_MoveChild4: case OPC_MoveChild5:
+ case OPC_MoveChild6: case OPC_MoveChild7: {
+ unsigned ChildNo = Opcode-OPC_MoveChild0;
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+ N = N.getOperand(ChildNo);
+ NodeStack.push_back(N);
+ continue;
+ }
+
case OPC_MoveParent:
// Pop the current node off the NodeStack.
NodeStack.pop_back();
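
The OPC_MoveChild0..OPC_MoveChild7 cases added above (and, in the same spirit, the OPC_EmitMergeInputChains1_2 and OPC_EmitNode0-2/OPC_MorphNodeTo0-2 cases further down) fold an operand of the matcher into the opcode itself, saving one table byte per use. A small standalone sketch of the decoding idea, with made-up opcode values rather than the real generated matcher tables, is:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical opcode numbering for illustration only.
enum : uint8_t {
  OPC_MoveChild  = 1,                  // explicit form: child index follows
  OPC_MoveChild0 = 2,                  // compressed forms: index in the opcode
  OPC_MoveChild7 = OPC_MoveChild0 + 7,
};

static unsigned decodeChildNo(const std::vector<uint8_t> &Table, size_t &Idx) {
  uint8_t Opcode = Table[Idx++];
  if (Opcode >= OPC_MoveChild0 && Opcode <= OPC_MoveChild7)
    return Opcode - OPC_MoveChild0;    // no extra byte consumed
  assert(Opcode == OPC_MoveChild);
  return Table[Idx++];                 // explicit index byte
}

int main() {
  std::vector<uint8_t> Table = {OPC_MoveChild0 + 3, OPC_MoveChild, 9};
  size_t Idx = 0;
  assert(decodeChildNo(Table, Idx) == 3); // compressed: one byte total
  assert(decodeChildNo(Table, Idx) == 9); // explicit: two bytes total
  return 0;
}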
@@ -3028,12 +3180,12 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
if (Imm->getOpcode() == ISD::Constant) {
const ConstantInt *Val=cast<ConstantSDNode>(Imm)->getConstantIntValue();
- Imm = CurDAG->getConstant(*Val, SDLoc(NodeToMatch), Imm.getValueType(),
- true);
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
} else if (Imm->getOpcode() == ISD::ConstantFP) {
const ConstantFP *Val=cast<ConstantFPSDNode>(Imm)->getConstantFPValue();
- Imm = CurDAG->getConstantFP(*Val, SDLoc(NodeToMatch),
- Imm.getValueType(), true);
+ Imm = CurDAG->getTargetConstantFP(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
}
RecordedNodes.push_back(std::make_pair(Imm, RecordedNodes[RecNo].second));
@@ -3041,7 +3193,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
}
case OPC_EmitMergeInputChains1_0: // OPC_EmitMergeInputChains, 1, 0
- case OPC_EmitMergeInputChains1_1: { // OPC_EmitMergeInputChains, 1, 1
+ case OPC_EmitMergeInputChains1_1: // OPC_EmitMergeInputChains, 1, 1
+ case OPC_EmitMergeInputChains1_2: { // OPC_EmitMergeInputChains, 1, 2
// These are space-optimized forms of OPC_EmitMergeInputChains.
assert(!InputChain.getNode() &&
"EmitMergeInputChains should be the first chain producing node");
@@ -3049,7 +3202,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
"Should only have one EmitMergeInputChains per match");
// Read all of the chained nodes.
- unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
+ unsigned RecNo = Opcode - OPC_EmitMergeInputChains1_0;
assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
@@ -3137,13 +3290,22 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
continue;
}
- case OPC_EmitNode:
- case OPC_MorphNodeTo: {
+ case OPC_EmitNode: case OPC_MorphNodeTo:
+ case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2:
+ case OPC_MorphNodeTo0: case OPC_MorphNodeTo1: case OPC_MorphNodeTo2: {
uint16_t TargetOpc = MatcherTable[MatcherIndex++];
TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
unsigned EmitNodeInfo = MatcherTable[MatcherIndex++];
// Get the result VT list.
- unsigned NumVTs = MatcherTable[MatcherIndex++];
+ unsigned NumVTs;
+ // If this is one of the compressed forms, get the number of VTs based
+ // on the Opcode. Otherwise read the next byte from the table.
+ if (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2)
+ NumVTs = Opcode - OPC_MorphNodeTo0;
+ else if (Opcode >= OPC_EmitNode0 && Opcode <= OPC_EmitNode2)
+ NumVTs = Opcode - OPC_EmitNode0;
+ else
+ NumVTs = MatcherTable[MatcherIndex++];
SmallVector<EVT, 4> VTs;
for (unsigned i = 0; i != NumVTs; ++i) {
MVT::SimpleValueType VT =
@@ -3205,7 +3367,9 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
// Create the node.
SDNode *Res = nullptr;
- if (Opcode != OPC_MorphNodeTo) {
+ bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo ||
+ (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2);
+ if (!IsMorphNodeTo) {
// If this is a normal EmitNode command, just create the new node and
// add the results to the RecordedNodes list.
Res = CurDAG->getMachineNode(TargetOpc, SDLoc(NodeToMatch),
@@ -3218,13 +3382,17 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
nullptr));
}
- } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) {
- Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo);
} else {
- // NodeToMatch was eliminated by CSE when the target changed the DAG.
- // We will visit the equivalent node later.
- DEBUG(dbgs() << "Node was eliminated by CSE\n");
- return nullptr;
+ assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
+ "NodeToMatch was removed partway through selection");
+ SelectionDAG::DAGNodeDeletedListener NDL(*CurDAG, [&](SDNode *N,
+ SDNode *E) {
+ auto &Chain = ChainNodesMatched;
+ assert((!E || llvm::find(Chain, N) == Chain.end()) &&
+ "Chain node replaced during MorphNode");
+ Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end());
+ });
+ Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo);
}
// If the node had chain/glue results, update our notion of the current
@@ -3285,31 +3453,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
}
DEBUG(dbgs() << " "
- << (Opcode == OPC_MorphNodeTo ? "Morphed" : "Created")
+ << (IsMorphNodeTo ? "Morphed" : "Created")
<< " node: "; Res->dump(CurDAG); dbgs() << "\n");
// If this was a MorphNodeTo then we're completely done!
- if (Opcode == OPC_MorphNodeTo) {
- // Update chain and glue uses.
- UpdateChainsAndGlue(NodeToMatch, InputChain, ChainNodesMatched,
- InputGlue, GlueResultNodesMatched, true);
- return Res;
- }
-
- continue;
- }
-
- case OPC_MarkGlueResults: {
- unsigned NumNodes = MatcherTable[MatcherIndex++];
-
- // Read and remember all the glue-result nodes.
- for (unsigned i = 0; i != NumNodes; ++i) {
- unsigned RecNo = MatcherTable[MatcherIndex++];
- if (RecNo & 128)
- RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
-
- assert(RecNo < RecordedNodes.size() && "Invalid MarkGlueResults");
- GlueResultNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+ if (IsMorphNodeTo) {
+ // Update chain uses.
+ UpdateChains(Res, InputChain, ChainNodesMatched, true);
+ return;
}
continue;
}
@@ -3341,20 +3492,24 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
}
- // If the root node defines glue, add it to the glue nodes to update list.
- if (NodeToMatch->getValueType(NodeToMatch->getNumValues()-1) == MVT::Glue)
- GlueResultNodesMatched.push_back(NodeToMatch);
+ // Update chain uses.
+ UpdateChains(NodeToMatch, InputChain, ChainNodesMatched, false);
- // Update chain and glue uses.
- UpdateChainsAndGlue(NodeToMatch, InputChain, ChainNodesMatched,
- InputGlue, GlueResultNodesMatched, false);
+ // If the root node defines glue, we need to update it to the glue result.
+ // TODO: This never happens in our tests and I think it can be removed /
+    // replaced with an assert, but doing it this way keeps the change NFC.
+ if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
+ MVT::Glue &&
+ InputGlue.getNode())
+ CurDAG->ReplaceAllUsesOfValueWith(
+ SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
assert(NodeToMatch->use_empty() &&
"Didn't replace all uses of the node?");
+ CurDAG->RemoveDeadNode(NodeToMatch);
- // FIXME: We just return here, which interacts correctly with SelectRoot
- // above. We should fix this to not return an SDNode* anymore.
- return nullptr;
+ return;
}
}
@@ -3366,7 +3521,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
while (1) {
if (MatchScopes.empty()) {
CannotYetSelect(NodeToMatch);
- return nullptr;
+ return;
}
// Restore the interpreter state back to the point where the scope was
@@ -3387,8 +3542,6 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
InputGlue = LastScope.InputGlue;
if (!LastScope.HasChainNodesMatched)
ChainNodesMatched.clear();
- if (!LastScope.HasGlueResultNodesMatched)
- GlueResultNodesMatched.clear();
// Check to see what the offset is at the new MatcherIndex. If it is zero
// we have reached the end of this scope, otherwise we have another child
@@ -3411,8 +3564,6 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
}
}
-
-
void SelectionDAGISel::CannotYetSelect(SDNode *N) {
std::string msg;
raw_string_ostream Msg(msg);
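
One consequence of the void-returning SelectCodeCommon is that the matcher must keep its own bookkeeping consistent while it mutates the DAG, which is what the scoped DAGNodeDeletedListener installed around MorphNode earlier in this file's diff does for ChainNodesMatched. A toy, self-contained sketch of that listener pattern (plain std types, not the SelectionDAG API) could be:

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// While the callback is installed, the graph reports every deletion to it, so
// side tables never end up holding dangling pointers.
struct ToyGraph {
  std::function<void(int)> OnDelete;
  void deleteNode(int Id) {
    if (OnDelete)
      OnDelete(Id);
  }
};

int main() {
  ToyGraph G;
  std::vector<int> ChainNodesMatched = {1, 2, 3};

  // Mirrors the lambda wrapped around MorphNode above.
  G.OnDelete = [&](int Id) {
    ChainNodesMatched.erase(
        std::remove(ChainNodesMatched.begin(), ChainNodesMatched.end(), Id),
        ChainNodesMatched.end());
  };

  G.deleteNode(2);                         // CSE deletes a recorded chain node
  assert(ChainNodesMatched.size() == 2);   // ...and the side list stays in sync
  G.OnDelete = nullptr;
  return 0;
}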
diff --git a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
index 00db94256844..55f70f7d9fd3 100644
--- a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- TargetSelectionDAGInfo.cpp - SelectionDAG Info --------------------===//
+//===-- SelectionDAGTargetInfo.cpp - SelectionDAG Info --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,13 +7,11 @@
//
//===----------------------------------------------------------------------===//
//
-// This implements the TargetSelectionDAGInfo class.
+// This implements the SelectionDAGTargetInfo class.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
using namespace llvm;
-TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
-}
+SelectionDAGTargetInfo::~SelectionDAGTargetInfo() {}
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 02545a730656..90aaba247265 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -53,13 +53,10 @@ void StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
"Trying to visit statepoint before finished processing previous one");
Locations.clear();
NextSlotToAllocate = 0;
- // Need to resize this on each safepoint - we need the two to stay in
- // sync and the clear patterns of a SelectionDAGBuilder have no relation
- // to FunctionLoweringInfo.
+ // Need to resize this on each safepoint - we need the two to stay in sync and
+ // the clear patterns of a SelectionDAGBuilder have no relation to
+ // FunctionLoweringInfo. SmallBitVector::reset initializes all bits to false.
AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
- for (size_t i = 0; i < AllocatedStackSlots.size(); i++) {
- AllocatedStackSlots[i] = false;
- }
}
void StatepointLoweringState::clear() {
@@ -72,49 +69,46 @@ void StatepointLoweringState::clear() {
SDValue
StatepointLoweringState::allocateStackSlot(EVT ValueType,
SelectionDAGBuilder &Builder) {
-
NumSlotsAllocatedForStatepoints++;
+ auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo();
- // The basic scheme here is to first look for a previously created stack slot
- // which is not in use (accounting for the fact arbitrary slots may already
- // be reserved), or to create a new stack slot and use it.
-
- // If this doesn't succeed in 40000 iterations, something is seriously wrong
- for (int i = 0; i < 40000; i++) {
- assert(Builder.FuncInfo.StatepointStackSlots.size() ==
- AllocatedStackSlots.size() &&
- "broken invariant");
- const size_t NumSlots = AllocatedStackSlots.size();
- assert(NextSlotToAllocate <= NumSlots && "broken invariant");
-
- if (NextSlotToAllocate >= NumSlots) {
- assert(NextSlotToAllocate == NumSlots);
- // record stats
- if (NumSlots + 1 > StatepointMaxSlotsRequired) {
- StatepointMaxSlotsRequired = NumSlots + 1;
- }
+ unsigned SpillSize = ValueType.getSizeInBits() / 8;
+ assert((SpillSize * 8) == ValueType.getSizeInBits() && "Size not in bytes?");
- SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
- const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
- auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo();
- MFI->markAsStatepointSpillSlotObjectIndex(FI);
+  // First look for a previously created stack slot which is not in use
+  // (accounting for the fact that arbitrary slots may already be reserved);
+  // if none is found, create a new stack slot and use it.
- Builder.FuncInfo.StatepointStackSlots.push_back(FI);
- AllocatedStackSlots.push_back(true);
- return SpillSlot;
- }
- if (!AllocatedStackSlots[NextSlotToAllocate]) {
+ const size_t NumSlots = AllocatedStackSlots.size();
+ assert(NextSlotToAllocate <= NumSlots && "Broken invariant");
+
+ // The stack slots in StatepointStackSlots beyond the first NumSlots were
+ // added in this instance of StatepointLoweringState, and cannot be re-used.
+ assert(NumSlots <= Builder.FuncInfo.StatepointStackSlots.size() &&
+ "Broken invariant");
+
+ for (; NextSlotToAllocate < NumSlots; NextSlotToAllocate++) {
+ if (!AllocatedStackSlots.test(NextSlotToAllocate)) {
const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
- AllocatedStackSlots[NextSlotToAllocate] = true;
- return Builder.DAG.getFrameIndex(FI, ValueType);
+ if (MFI->getObjectSize(FI) == SpillSize) {
+ AllocatedStackSlots.set(NextSlotToAllocate);
+ return Builder.DAG.getFrameIndex(FI, ValueType);
+ }
}
- // Note: We deliberately choose to advance this only on the failing path.
- // Doing so on the succeeding path involves a bit of complexity that caused
- // a minor bug previously. Unless performance shows this matters, please
- // keep this code as simple as possible.
- NextSlotToAllocate++;
}
- llvm_unreachable("infinite loop?");
+
+ // Couldn't find a free slot, so create a new one:
+
+ SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+ const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ MFI->markAsStatepointSpillSlotObjectIndex(FI);
+
+ Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+
+ StatepointMaxSlotsRequired = std::max<unsigned long>(
+ StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size());
+
+ return SpillSlot;
}
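
A standalone sketch of the allocation scheme above, with hypothetical types standing in for FuncInfo.StatepointStackSlots, the SmallBitVector and the frame info: scan forward from NextSlotToAllocate for an unused slot of exactly the requested size, and append a fresh slot only if none is found.

#include <cassert>
#include <cstddef>
#include <vector>

struct SlotPool {
  std::vector<unsigned> SlotSizes;   // stands in for StatepointStackSlots
  std::vector<bool> Allocated;       // stands in for AllocatedStackSlots
  size_t NextSlotToAllocate = 0;

  size_t allocate(unsigned Size) {
    const size_t NumSlots = Allocated.size();
    for (; NextSlotToAllocate < NumSlots; ++NextSlotToAllocate) {
      if (!Allocated[NextSlotToAllocate] &&
          SlotSizes[NextSlotToAllocate] == Size) {
        Allocated[NextSlotToAllocate] = true;
        return NextSlotToAllocate;
      }
    }
    // No reusable slot: create a new one. (A real implementation would also
    // mark it as a statepoint spill slot in the machine frame info.)
    SlotSizes.push_back(Size);
    Allocated.push_back(true);
    return SlotSizes.size() - 1;
  }
};

int main() {
  SlotPool Pool;
  Pool.SlotSizes = {8, 16};          // slots left over from a prior statepoint
  Pool.Allocated = {false, false};
  assert(Pool.allocate(16) == 1);    // reuses the size-matching slot
  assert(Pool.allocate(16) == 2);    // no free 16-byte slot left: appends
  return 0;
}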
/// Utility function for reservePreviousStackSlotForValue. Tries to find
@@ -125,24 +119,23 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
int LookUpDepth) {
// Can not look any further - give up now
if (LookUpDepth <= 0)
- return Optional<int>();
+ return None;
// Spill location is known for gc relocates
if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
- FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
- Builder.FuncInfo.StatepointRelocatedValues[Relocate->getStatepoint()];
+ const auto &SpillMap =
+ Builder.FuncInfo.StatepointSpillMaps[Relocate->getStatepoint()];
auto It = SpillMap.find(Relocate->getDerivedPtr());
if (It == SpillMap.end())
- return Optional<int>();
+ return None;
return It->second;
}
// Look through bitcast instructions.
- if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val)) {
+ if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val))
return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1);
- }
// Look through phi nodes
// All incoming values should have same known stack slot, otherwise result
@@ -154,10 +147,10 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
Optional<int> SpillSlot =
findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1);
if (!SpillSlot.hasValue())
- return Optional<int>();
+ return None;
if (MergedResult.hasValue() && *MergedResult != *SpillSlot)
- return Optional<int>();
+ return None;
MergedResult = SpillSlot;
}
@@ -192,7 +185,7 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
// which we visit values is unspecified.
// Don't know any information about this instruction
- return Optional<int>();
+ return None;
}
/// Try to find existing copies of the incoming values in stack slots used for
@@ -213,7 +206,7 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
if (OldLocation.getNode())
- // duplicates in input
+ // Duplicates in input
return;
const int LookUpDepth = 6;
@@ -222,14 +215,14 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
if (!Index.hasValue())
return;
- auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
- Builder.FuncInfo.StatepointStackSlots.end(), *Index);
- assert(Itr != Builder.FuncInfo.StatepointStackSlots.end() &&
- "value spilled to the unknown stack slot");
+ const auto &StatepointSlots = Builder.FuncInfo.StatepointStackSlots;
+
+ auto SlotIt = find(StatepointSlots, *Index);
+ assert(SlotIt != StatepointSlots.end() &&
+ "Value spilled to the unknown stack slot");
// This is one of our dedicated lowering slots
- const int Offset =
- std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
+ const int Offset = std::distance(StatepointSlots.begin(), SlotIt);
if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
// stack slot already assigned to someone else, can't use it!
// TODO: currently we reserve space for gc arguments after doing
@@ -252,24 +245,30 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
/// is not required for correctness. Its purpose is to reduce the size of
/// StackMap section. It has no effect on the number of spill slots required
/// or the actual lowering.
-static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases,
- SmallVectorImpl<const Value *> &Ptrs,
- SmallVectorImpl<const Value *> &Relocs,
- SelectionDAGBuilder &Builder) {
-
- // This is horribly inefficient, but I don't care right now
- SmallSet<SDValue, 64> Seen;
-
- SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs;
- for (size_t i = 0; i < Ptrs.size(); i++) {
+static void
+removeDuplicateGCPtrs(SmallVectorImpl<const Value *> &Bases,
+ SmallVectorImpl<const Value *> &Ptrs,
+ SmallVectorImpl<const GCRelocateInst *> &Relocs,
+ SelectionDAGBuilder &Builder,
+ FunctionLoweringInfo::StatepointSpillMap &SSM) {
+ DenseMap<SDValue, const Value *> Seen;
+
+ SmallVector<const Value *, 64> NewBases, NewPtrs;
+ SmallVector<const GCRelocateInst *, 64> NewRelocs;
+ for (size_t i = 0, e = Ptrs.size(); i < e; i++) {
SDValue SD = Builder.getValue(Ptrs[i]);
- // Only add non-duplicates
- if (Seen.count(SD) == 0) {
+ auto SeenIt = Seen.find(SD);
+
+ if (SeenIt == Seen.end()) {
+ // Only add non-duplicates
NewBases.push_back(Bases[i]);
NewPtrs.push_back(Ptrs[i]);
NewRelocs.push_back(Relocs[i]);
+ Seen[SD] = Ptrs[i];
+ } else {
+ // Duplicate pointer found, note in SSM and move on:
+ SSM.DuplicateMap[Ptrs[i]] = SeenIt->second;
}
- Seen.insert(SD);
}
assert(Bases.size() >= NewBases.size());
assert(Ptrs.size() >= NewPtrs.size());
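
A standalone sketch of the de-duplication above, using plain std types in place of SDValue and DenseMap: pointers that lower to the same value keep only one entry, and each dropped duplicate is mapped to the surviving representative so later gc.relocate lookups can still be answered.

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Pretend these three IR pointers lower to only two distinct values.
  std::vector<std::string> Ptrs = {"p0", "p1", "p2"};
  std::vector<int> LoweredValue = {7, 7, 9};   // p0 and p1 collapse together

  std::unordered_map<int, std::string> Seen;   // lowered value -> first pointer
  std::unordered_map<std::string, std::string> DuplicateMap;
  std::vector<std::string> NewPtrs;

  for (size_t i = 0, e = Ptrs.size(); i != e; ++i) {
    auto It = Seen.find(LoweredValue[i]);
    if (It == Seen.end()) {
      NewPtrs.push_back(Ptrs[i]);              // only add non-duplicates
      Seen[LoweredValue[i]] = Ptrs[i];
    } else {
      DuplicateMap[Ptrs[i]] = It->second;      // note duplicate and move on
    }
  }

  assert(NewPtrs.size() == 2);
  assert(DuplicateMap.at("p1") == "p0");
  return 0;
}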
@@ -284,43 +283,13 @@ static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases,
/// Extract call from statepoint, lower it and return pointer to the
/// call node. Also update NodeMap so that getValue(statepoint) will
/// reference lowered call result
-static SDNode *
-lowerCallFromStatepoint(ImmutableStatepoint ISP, const BasicBlock *EHPadBB,
- SelectionDAGBuilder &Builder,
- SmallVectorImpl<SDValue> &PendingExports) {
-
- ImmutableCallSite CS(ISP.getCallSite());
-
- SDValue ActualCallee;
-
- if (ISP.getNumPatchBytes() > 0) {
- // If we've been asked to emit a nop sequence instead of a call instruction
- // for this statepoint then don't lower the call target, but use a constant
- // `null` instead. Not lowering the call target lets statepoint clients get
- // away without providing a physical address for the symbolic call target at
- // link time.
-
- const auto &TLI = Builder.DAG.getTargetLoweringInfo();
- const auto &DL = Builder.DAG.getDataLayout();
-
- unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace();
- ActualCallee = Builder.DAG.getConstant(0, Builder.getCurSDLoc(),
- TLI.getPointerTy(DL, AS));
- } else
- ActualCallee = Builder.getValue(ISP.getCalledValue());
-
- assert(CS.getCallingConv() != CallingConv::AnyReg &&
- "anyregcc is not supported on statepoints!");
-
- Type *DefTy = ISP.getActualReturnType();
- bool HasDef = !DefTy->isVoidTy();
+static std::pair<SDValue, SDNode *> lowerCallFromStatepointLoweringInfo(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder, SmallVectorImpl<SDValue> &PendingExports) {
SDValue ReturnValue, CallEndVal;
- std::tie(ReturnValue, CallEndVal) = Builder.lowerCallOperands(
- ISP.getCallSite(), ImmutableStatepoint::CallArgsBeginPos,
- ISP.getNumCallArgs(), ActualCallee, DefTy, EHPadBB,
- false /* IsPatchPoint */);
-
+ std::tie(ReturnValue, CallEndVal) =
+ Builder.lowerInvokable(SI.CLI, SI.EHPadBB);
SDNode *CallEnd = CallEndVal.getNode();
// Get a call instruction from the call sequence chain. Tail calls are not
@@ -339,6 +308,7 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, const BasicBlock *EHPadBB,
// to grab the return value from the return register(s), or it can be a LOAD
// to load a value returned by reference via a stack slot.
+ bool HasDef = !SI.CLI.RetTy->isVoidTy();
if (HasDef) {
if (CallEnd->getOpcode() == ISD::LOAD)
CallEnd = CallEnd->getOperand(0).getNode();
@@ -348,70 +318,7 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, const BasicBlock *EHPadBB,
}
assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "expected!");
-
- // Export the result value if needed
- const Instruction *GCResult = ISP.getGCResult();
- if (HasDef && GCResult) {
- if (GCResult->getParent() != CS.getParent()) {
- // Result value will be used in a different basic block so we need to
- // export it now.
- // Default exporting mechanism will not work here because statepoint call
- // has a different type than the actual call. It means that by default
- // llvm will create export register of the wrong type (always i32 in our
- // case). So instead we need to create export register with correct type
- // manually.
- // TODO: To eliminate this problem we can remove gc.result intrinsics
- // completely and make statepoint call to return a tuple.
- unsigned Reg = Builder.FuncInfo.CreateRegs(ISP.getActualReturnType());
- RegsForValue RFV(
- *Builder.DAG.getContext(), Builder.DAG.getTargetLoweringInfo(),
- Builder.DAG.getDataLayout(), Reg, ISP.getActualReturnType());
- SDValue Chain = Builder.DAG.getEntryNode();
-
- RFV.getCopyToRegs(ReturnValue, Builder.DAG, Builder.getCurSDLoc(), Chain,
- nullptr);
- PendingExports.push_back(Chain);
- Builder.FuncInfo.ValueMap[CS.getInstruction()] = Reg;
- } else {
- // Result value will be used in a same basic block. Don't export it or
- // perform any explicit register copies.
- // We'll replace the actuall call node shortly. gc_result will grab
- // this value.
- Builder.setValue(CS.getInstruction(), ReturnValue);
- }
- } else {
- // The token value is never used from here on, just generate a poison value
- Builder.setValue(CS.getInstruction(),
- Builder.DAG.getIntPtrConstant(-1, Builder.getCurSDLoc()));
- }
-
- return CallEnd->getOperand(0).getNode();
-}
-
-/// Callect all gc pointers coming into statepoint intrinsic, clean them up,
-/// and return two arrays:
-/// Bases - base pointers incoming to this statepoint
-/// Ptrs - derived pointers incoming to this statepoint
-/// Relocs - the gc_relocate corresponding to each base/ptr pair
-/// Elements of this arrays should be in one-to-one correspondence with each
-/// other i.e Bases[i], Ptrs[i] are from the same gcrelocate call
-static void getIncomingStatepointGCValues(
- SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Ptrs,
- SmallVectorImpl<const Value *> &Relocs, ImmutableStatepoint StatepointSite,
- SelectionDAGBuilder &Builder) {
- for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
- Relocs.push_back(Relocate);
- Bases.push_back(Relocate->getBasePtr());
- Ptrs.push_back(Relocate->getDerivedPtr());
- }
-
- // Remove any redundant llvm::Values which map to the same SDValue as another
- // input. Also has the effect of removing duplicates in the original
- // llvm::Value input list as well. This is a useful optimization for
- // reducing the size of the StackMap section. It has no other impact.
- removeDuplicatesGCPtrs(Bases, Ptrs, Relocs, Builder);
-
- assert(Bases.size() == Ptrs.size() && Ptrs.size() == Relocs.size());
+ return std::make_pair(ReturnValue, CallEnd->getOperand(0).getNode());
}
/// Spill a value incoming to the statepoint. It might be either part of
@@ -429,7 +336,6 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
if (!Loc.getNode()) {
Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
Builder);
- assert(isa<FrameIndexSDNode>(Loc));
int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
// We use TargetFrameIndex so that isel will not select it into LEA
Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
@@ -437,10 +343,22 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
// TODO: We can create TokenFactor node instead of
// chaining stores one after another, this may allow
// a bit more optimal scheduling for them
+
+#ifndef NDEBUG
+ // Right now we always allocate spill slots that are of the same
+ // size as the value we're about to spill (the size of spillee can
+ // vary since we spill vectors of pointers too). At some point we
+ // can consider allowing spills of smaller values to larger slots
+ // (i.e. change the '==' in the assert below to a '>=').
+ auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo();
+ assert((MFI->getObjectSize(Index) * 8) ==
+ Incoming.getValueType().getSizeInBits() &&
+ "Bad spill: stack slot does not match!");
+#endif
+
Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
MachinePointerInfo::getFixedStack(
- Builder.DAG.getMachineFunction(), Index),
- false, false, 0);
+ Builder.DAG.getMachineFunction(), Index));
Builder.StatepointLowering.setLocation(Incoming, Loc);
}
@@ -478,8 +396,7 @@ static void lowerIncomingStatepointValue(SDValue Incoming,
// spill location. This would be a useful optimization, but would
// need to be optional since it requires a lot of complexity on the
// runtime side which not all would support.
- std::pair<SDValue, SDValue> Res =
- spillIncomingStatepointValue(Incoming, Chain, Builder);
+ auto Res = spillIncomingStatepointValue(Incoming, Chain, Builder);
Ops.push_back(Res.first);
Chain = Res.second;
}
@@ -494,43 +411,37 @@ static void lowerIncomingStatepointValue(SDValue Incoming,
/// completion, 'Ops' will contain ready to use operands for machine code
/// statepoint. The chain nodes will have already been created and the DAG root
/// will be set to the last value spilled (if any were).
-static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
- ImmutableStatepoint StatepointSite,
- SelectionDAGBuilder &Builder) {
-
- // Lower the deopt and gc arguments for this statepoint. Layout will
- // be: deopt argument length, deopt arguments.., gc arguments...
-
- SmallVector<const Value *, 64> Bases, Ptrs, Relocations;
- getIncomingStatepointGCValues(Bases, Ptrs, Relocations, StatepointSite,
- Builder);
-
+static void
+lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder) {
+ // Lower the deopt and gc arguments for this statepoint. Layout will be:
+ // deopt argument length, deopt arguments.., gc arguments...
#ifndef NDEBUG
- // Check that each of the gc pointer and bases we've gotten out of the
- // safepoint is something the strategy thinks might be a pointer (or vector
- // of pointers) into the GC heap. This is basically just here to help catch
- // errors during statepoint insertion. TODO: This should actually be in the
- // Verifier, but we can't get to the GCStrategy from there (yet).
- GCStrategy &S = Builder.GFI->getStrategy();
- for (const Value *V : Bases) {
- auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
- if (Opt.hasValue()) {
- assert(Opt.getValue() &&
- "non gc managed base pointer found in statepoint");
- }
- }
- for (const Value *V : Ptrs) {
- auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
- if (Opt.hasValue()) {
- assert(Opt.getValue() &&
- "non gc managed derived pointer found in statepoint");
+ if (auto *GFI = Builder.GFI) {
+ // Check that each of the gc pointer and bases we've gotten out of the
+ // safepoint is something the strategy thinks might be a pointer (or vector
+ // of pointers) into the GC heap. This is basically just here to help catch
+ // errors during statepoint insertion. TODO: This should actually be in the
+ // Verifier, but we can't get to the GCStrategy from there (yet).
+ GCStrategy &S = GFI->getStrategy();
+ for (const Value *V : SI.Bases) {
+ auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
+ if (Opt.hasValue()) {
+ assert(Opt.getValue() &&
+ "non gc managed base pointer found in statepoint");
+ }
}
- }
- for (const Value *V : Relocations) {
- auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
- if (Opt.hasValue()) {
- assert(Opt.getValue() && "non gc managed pointer relocated");
+ for (const Value *V : SI.Ptrs) {
+ auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
+ if (Opt.hasValue()) {
+ assert(Opt.getValue() &&
+ "non gc managed derived pointer found in statepoint");
+ }
}
+ } else {
+ assert(SI.Bases.empty() && "No gc specified, so cannot relocate pointers!");
+ assert(SI.Ptrs.empty() && "No gc specified, so cannot relocate pointers!");
}
#endif
@@ -539,30 +450,23 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// particular value. This is purely an optimization over the code below and
// doesn't change semantics at all. It is important for performance that we
// reserve slots for both deopt and gc values before lowering either.
- for (const Value *V : StatepointSite.vm_state_args()) {
+ for (const Value *V : SI.DeoptState) {
reservePreviousStackSlotForValue(V, Builder);
}
- for (unsigned i = 0; i < Bases.size(); ++i) {
- reservePreviousStackSlotForValue(Bases[i], Builder);
- reservePreviousStackSlotForValue(Ptrs[i], Builder);
+ for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+ reservePreviousStackSlotForValue(SI.Bases[i], Builder);
+ reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
}
// First, prefix the list with the number of unique values to be
// lowered. Note that this is the number of *Values* not the
// number of SDValues required to lower them.
- const int NumVMSArgs = StatepointSite.getNumTotalVMSArgs();
+ const int NumVMSArgs = SI.DeoptState.size();
pushStackMapConstant(Ops, Builder, NumVMSArgs);
- assert(NumVMSArgs == std::distance(StatepointSite.vm_state_begin(),
- StatepointSite.vm_state_end()));
-
- // The vm state arguments are lowered in an opaque manner. We do
- // not know what type of values are contained within. We skip the
- // first one since that happens to be the total number we lowered
- // explicitly just above. We could have left it in the loop and
- // not done it explicitly, but it's far easier to understand this
- // way.
- for (const Value *V : StatepointSite.vm_state_args()) {
+ // The vm state arguments are lowered in an opaque manner. We do not know
+ // what type of values are contained within.
+ for (const Value *V : SI.DeoptState) {
SDValue Incoming = Builder.getValue(V);
lowerIncomingStatepointValue(Incoming, Ops, Builder);
}
@@ -572,11 +476,11 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// arrays interwoven with each (lowered) base pointer immediately followed by
// it's (lowered) derived pointer. i.e
// (base[0], ptr[0], base[1], ptr[1], ...)
- for (unsigned i = 0; i < Bases.size(); ++i) {
- const Value *Base = Bases[i];
+ for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+ const Value *Base = SI.Bases[i];
lowerIncomingStatepointValue(Builder.getValue(Base), Ops, Builder);
- const Value *Ptr = Ptrs[i];
+ const Value *Ptr = SI.Ptrs[i];
lowerIncomingStatepointValue(Builder.getValue(Ptr), Ops, Builder);
}
@@ -585,7 +489,7 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// allocas and give control over placement to the consumer. In this case,
// it is the contents of the slot which may get updated, not the pointer to
// the alloca
- for (Value *V : StatepointSite.gc_args()) {
+ for (Value *V : SI.GCArgs) {
SDValue Incoming = Builder.getValue(V);
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
// This handles allocas as arguments to the statepoint
@@ -597,18 +501,16 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// Record computed locations for all lowered values.
// This can not be embedded in lowering loops as we need to record *all*
// values, while previous loops account only values with unique SDValues.
- const Instruction *StatepointInstr =
- StatepointSite.getCallSite().getInstruction();
- FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
- Builder.FuncInfo.StatepointRelocatedValues[StatepointInstr];
+ const Instruction *StatepointInstr = SI.StatepointInstr;
+ auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr];
- for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
+ for (const GCRelocateInst *Relocate : SI.GCRelocates) {
const Value *V = Relocate->getDerivedPtr();
SDValue SDV = Builder.getValue(V);
SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
if (Loc.getNode()) {
- SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
+ SpillMap.SlotMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
} else {
// Record value as visited, but not spilled. This is case for allocas
// and constants. For this values we can avoid emitting spill load while
@@ -616,7 +518,7 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// Actually we do not need to record them in this map at all.
// We do this only to check that we are not relocating any unvisited
// value.
- SpillMap[V] = None;
+ SpillMap.SlotMap[V] = None;
// Default llvm mechanisms for exporting values which are used in
// different basic blocks does not work for gc relocates.
@@ -630,16 +532,8 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
}
}
-void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) {
- // Check some preconditions for sanity
- assert(isStatepoint(&CI) &&
- "function called must be the statepoint function");
-
- LowerStatepoint(ImmutableStatepoint(&CI));
-}
-
-void SelectionDAGBuilder::LowerStatepoint(
- ImmutableStatepoint ISP, const BasicBlock *EHPadBB /*= nullptr*/) {
+SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI) {
// The basic scheme here is that information about both the original call and
// the safepoint is encoded in the CallInst. We create a temporary call and
// lower it, then reverse engineer the calling sequence.
@@ -648,36 +542,36 @@ void SelectionDAGBuilder::LowerStatepoint(
// Clear state
StatepointLowering.startNewStatepoint(*this);
- ImmutableCallSite CS(ISP.getCallSite());
-
#ifndef NDEBUG
- // Consistency check. Check only relocates in the same basic block as thier
- // statepoint.
- for (const User *U : CS->users()) {
- const CallInst *Call = cast<CallInst>(U);
- if (isa<GCRelocateInst>(Call) && Call->getParent() == CS.getParent())
- StatepointLowering.scheduleRelocCall(*Call);
- }
+ // We schedule gc relocates before removeDuplicateGCPtrs since we _will_
+ // encounter the duplicate gc relocates we elide in removeDuplicateGCPtrs.
+ for (auto *Reloc : SI.GCRelocates)
+ if (Reloc->getParent() == SI.StatepointInstr->getParent())
+ StatepointLowering.scheduleRelocCall(*Reloc);
#endif
-#ifndef NDEBUG
- // If this is a malformed statepoint, report it early to simplify debugging.
- // This should catch any IR level mistake that's made when constructing or
- // transforming statepoints.
- ISP.verify();
-
- // Check that the associated GCStrategy expects to encounter statepoints.
- assert(GFI->getStrategy().useStatepoints() &&
- "GCStrategy does not expect to encounter statepoints");
-#endif
+ // Remove any redundant llvm::Values which map to the same SDValue as another
+ // input. Also has the effect of removing duplicates in the original
+ // llvm::Value input list as well. This is a useful optimization for
+ // reducing the size of the StackMap section. It has no other impact.
+ removeDuplicateGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this,
+ FuncInfo.StatepointSpillMaps[SI.StatepointInstr]);
+ assert(SI.Bases.size() == SI.Ptrs.size() &&
+ SI.Ptrs.size() == SI.GCRelocates.size());
// Lower statepoint vmstate and gcstate arguments
SmallVector<SDValue, 10> LoweredMetaArgs;
- lowerStatepointMetaArgs(LoweredMetaArgs, ISP, *this);
+ lowerStatepointMetaArgs(LoweredMetaArgs, SI, *this);
+
+ // Now that we've emitted the spills, we need to update the root so that the
+ // call sequence is ordered correctly.
+ SI.CLI.setChain(getRoot());
// Get call node, we will replace it later with statepoint
- SDNode *CallNode =
- lowerCallFromStatepoint(ISP, EHPadBB, *this, PendingExports);
+ SDValue ReturnVal;
+ SDNode *CallNode;
+ std::tie(ReturnVal, CallNode) =
+ lowerCallFromStatepointLoweringInfo(SI, *this, PendingExports);
// Construct the actual GC_TRANSITION_START, STATEPOINT, and GC_TRANSITION_END
// nodes with all the appropriate arguments and return values.
@@ -700,8 +594,8 @@ void SelectionDAGBuilder::LowerStatepoint(
// followed by a SRCVALUE for the pointer that may be used during lowering
// (e.g. to form MachinePointerInfo values for loads/stores).
const bool IsGCTransition =
- (ISP.getFlags() & (uint64_t)StatepointFlags::GCTransition) ==
- (uint64_t)StatepointFlags::GCTransition;
+ (SI.StatepointFlags & (uint64_t)StatepointFlags::GCTransition) ==
+ (uint64_t)StatepointFlags::GCTransition;
if (IsGCTransition) {
SmallVector<SDValue, 8> TSOps;
@@ -709,7 +603,7 @@ void SelectionDAGBuilder::LowerStatepoint(
TSOps.push_back(Chain);
// Add GC transition arguments
- for (const Value *V : ISP.gc_transition_args()) {
+ for (const Value *V : SI.GCTransitionArgs) {
TSOps.push_back(getValue(V));
if (V->getType()->isPointerTy())
TSOps.push_back(DAG.getSrcValue(V));
@@ -734,9 +628,9 @@ void SelectionDAGBuilder::LowerStatepoint(
SmallVector<SDValue, 40> Ops;
// Add the <id> and <numBytes> constants.
- Ops.push_back(DAG.getTargetConstant(ISP.getID(), getCurSDLoc(), MVT::i64));
+ Ops.push_back(DAG.getTargetConstant(SI.ID, getCurSDLoc(), MVT::i64));
Ops.push_back(
- DAG.getTargetConstant(ISP.getNumPatchBytes(), getCurSDLoc(), MVT::i32));
+ DAG.getTargetConstant(SI.NumPatchBytes, getCurSDLoc(), MVT::i32));
// Calculate and push starting position of vmstate arguments
// Get number of arguments incoming directly into call node
@@ -758,13 +652,12 @@ void SelectionDAGBuilder::LowerStatepoint(
Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);
// Add a constant argument for the calling convention
- pushStackMapConstant(Ops, *this, CS.getCallingConv());
+ pushStackMapConstant(Ops, *this, SI.CLI.CallConv);
// Add a constant argument for the flags
- uint64_t Flags = ISP.getFlags();
- assert(
- ((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0)
- && "unknown flag used");
+ uint64_t Flags = SI.StatepointFlags;
+ assert(((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0) &&
+ "Unknown flag used");
pushStackMapConstant(Ops, *this, Flags);
// Insert all vmstate and gcstate arguments
@@ -800,7 +693,7 @@ void SelectionDAGBuilder::LowerStatepoint(
TEOps.push_back(SDValue(StatepointMCNode, 0));
// Add GC transition arguments
- for (const Value *V : ISP.gc_transition_args()) {
+ for (const Value *V : SI.GCTransitionArgs) {
TEOps.push_back(getValue(V));
if (V->getType()->isPointerTy())
TEOps.push_back(DAG.getSrcValue(V));
@@ -830,19 +723,154 @@ void SelectionDAGBuilder::LowerStatepoint(
// return value of each gc.relocate to the respective output of the
// previously emitted STATEPOINT value. Unfortunately, this doesn't appear
// to actually be possible today.
+
+ return ReturnVal;
+}
+
+void
+SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
+ const BasicBlock *EHPadBB /*= nullptr*/) {
+ assert(ISP.getCallSite().getCallingConv() != CallingConv::AnyReg &&
+ "anyregcc is not supported on statepoints!");
+
+#ifndef NDEBUG
+ // If this is a malformed statepoint, report it early to simplify debugging.
+ // This should catch any IR level mistake that's made when constructing or
+ // transforming statepoints.
+ ISP.verify();
+
+ // Check that the associated GCStrategy expects to encounter statepoints.
+ assert(GFI->getStrategy().useStatepoints() &&
+ "GCStrategy does not expect to encounter statepoints");
+#endif
+
+ SDValue ActualCallee;
+
+ if (ISP.getNumPatchBytes() > 0) {
+ // If we've been asked to emit a nop sequence instead of a call instruction
+ // for this statepoint then don't lower the call target, but use a constant
+ // `null` instead. Not lowering the call target lets statepoint clients get
+ // away without providing a physical address for the symbolic call target at
+ // link time.
+
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &DL = DAG.getDataLayout();
+
+ unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace();
+ ActualCallee = DAG.getConstant(0, getCurSDLoc(), TLI.getPointerTy(DL, AS));
+ } else {
+ ActualCallee = getValue(ISP.getCalledValue());
+ }
+
+ StatepointLoweringInfo SI(DAG);
+ populateCallLoweringInfo(SI.CLI, ISP.getCallSite(),
+ ImmutableStatepoint::CallArgsBeginPos,
+ ISP.getNumCallArgs(), ActualCallee,
+ ISP.getActualReturnType(), false /* IsPatchPoint */);
+
+ for (const GCRelocateInst *Relocate : ISP.getRelocates()) {
+ SI.GCRelocates.push_back(Relocate);
+ SI.Bases.push_back(Relocate->getBasePtr());
+ SI.Ptrs.push_back(Relocate->getDerivedPtr());
+ }
+
+ SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
+ SI.StatepointInstr = ISP.getInstruction();
+ SI.GCTransitionArgs =
+ ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
+ SI.ID = ISP.getID();
+ SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end());
+ SI.StatepointFlags = ISP.getFlags();
+ SI.NumPatchBytes = ISP.getNumPatchBytes();
+ SI.EHPadBB = EHPadBB;
+
+ SDValue ReturnValue = LowerAsSTATEPOINT(SI);
+
+ // Export the result value if needed
+ const GCResultInst *GCResult = ISP.getGCResult();
+ Type *RetTy = ISP.getActualReturnType();
+ if (!RetTy->isVoidTy() && GCResult) {
+ if (GCResult->getParent() != ISP.getCallSite().getParent()) {
+ // Result value will be used in a different basic block so we need to
+ // export it now. Default exporting mechanism will not work here because
+ // statepoint call has a different type than the actual call. It means
+ // that by default llvm will create export register of the wrong type
+ // (always i32 in our case). So instead we need to create export register
+ // with correct type manually.
+ // TODO: To eliminate this problem we can remove gc.result intrinsics
+ // completely and make statepoint call to return a tuple.
+ unsigned Reg = FuncInfo.CreateRegs(RetTy);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), Reg, RetTy);
+ SDValue Chain = DAG.getEntryNode();
+
+ RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
+ PendingExports.push_back(Chain);
+ FuncInfo.ValueMap[ISP.getInstruction()] = Reg;
+ } else {
+ // Result value will be used in a same basic block. Don't export it or
+ // perform any explicit register copies.
+      // We'll replace the actual call node shortly. gc_result will grab
+ // this value.
+ setValue(ISP.getInstruction(), ReturnValue);
+ }
+ } else {
+ // The token value is never used from here on, just generate a poison value
+ setValue(ISP.getInstruction(), DAG.getIntPtrConstant(-1, getCurSDLoc()));
+ }
+}
+
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl(
+ ImmutableCallSite CS, SDValue Callee, const BasicBlock *EHPadBB,
+ bool VarArgDisallowed, bool ForceVoidReturnTy) {
+ StatepointLoweringInfo SI(DAG);
+ unsigned ArgBeginIndex = CS.arg_begin() - CS.getInstruction()->op_begin();
+ populateCallLoweringInfo(
+ SI.CLI, CS, ArgBeginIndex, CS.getNumArgOperands(), Callee,
+ ForceVoidReturnTy ? Type::getVoidTy(*DAG.getContext()) : CS.getType(),
+ false);
+ if (!VarArgDisallowed)
+ SI.CLI.IsVarArg = CS.getFunctionType()->isVarArg();
+
+ auto DeoptBundle = *CS.getOperandBundle(LLVMContext::OB_deopt);
+
+ unsigned DefaultID = StatepointDirectives::DeoptBundleStatepointID;
+
+ auto SD = parseStatepointDirectivesFromAttrs(CS.getAttributes());
+ SI.ID = SD.StatepointID.getValueOr(DefaultID);
+ SI.NumPatchBytes = SD.NumPatchBytes.getValueOr(0);
+
+ SI.DeoptState =
+ ArrayRef<const Use>(DeoptBundle.Inputs.begin(), DeoptBundle.Inputs.end());
+ SI.StatepointFlags = static_cast<uint64_t>(StatepointFlags::None);
+ SI.EHPadBB = EHPadBB;
+
+ // NB! The GC arguments are deliberately left empty.
+
+ if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) {
+ const Instruction *Inst = CS.getInstruction();
+ ReturnVal = lowerRangeToAssertZExt(DAG, *Inst, ReturnVal);
+ setValue(Inst, ReturnVal);
+ }
}
-void SelectionDAGBuilder::visitGCResult(const CallInst &CI) {
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
+ ImmutableCallSite CS, SDValue Callee, const BasicBlock *EHPadBB) {
+ LowerCallSiteWithDeoptBundleImpl(CS, Callee, EHPadBB,
+ /* VarArgDisallowed = */ false,
+ /* ForceVoidReturnTy = */ false);
+}
+
+void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
// The result value of the gc_result is simply the result of the actual
// call. We've already emitted this, so just grab the value.
- Instruction *I = cast<Instruction>(CI.getArgOperand(0));
- assert(isStatepoint(I) && "first argument must be a statepoint token");
+ const Instruction *I = CI.getStatepoint();
if (I->getParent() != CI.getParent()) {
// Statepoint is in different basic block so we should have stored call
// result in a virtual register.
// We can not use default getValue() functionality to copy value from this
- // register because statepoint and actuall call return types can be
+ // register because statepoint and actual call return types can be
// different, and getValue() will use CopyFromReg of the wrong type,
// which is always i32 in our case.
PointerType *CalleeType = cast<PointerType>(
@@ -864,20 +892,21 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
// We skip this check for relocates not in the same basic block as their
// statepoint. It would be too expensive to preserve validation info through
// different basic blocks.
- if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) {
+ if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
StatepointLowering.relocCallVisited(Relocate);
- }
+
+ auto *Ty = Relocate.getType()->getScalarType();
+ if (auto IsManaged = GFI->getStrategy().isGCManagedPointer(Ty))
+ assert(*IsManaged && "Non gc managed pointer relocated!");
#endif
const Value *DerivedPtr = Relocate.getDerivedPtr();
SDValue SD = getValue(DerivedPtr);
- FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
- FuncInfo.StatepointRelocatedValues[Relocate.getStatepoint()];
-
- // We should have recorded location for this pointer
- assert(SpillMap.count(DerivedPtr) && "Relocating not lowered gc value");
- Optional<int> DerivedPtrLocation = SpillMap[DerivedPtr];
+ auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()];
+ auto SlotIt = SpillMap.find(DerivedPtr);
+ assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
+ Optional<int> DerivedPtrLocation = SlotIt->second;
// We didn't need to spill these special cases (constants and allocas).
// See the handling in spillIncomingValueForStatepoint for detail.
@@ -897,8 +926,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
SDValue SpillLoad =
DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
- *DerivedPtrLocation),
- false, false, false, 0);
+ *DerivedPtrLocation));
// Again, be conservative, don't emit pending loads
DAG.setRoot(SpillLoad.getValue(1));
@@ -906,3 +934,25 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
assert(SpillLoad.getNode());
setValue(&Relocate, SpillLoad);
}
+
+void SelectionDAGBuilder::LowerDeoptimizeCall(const CallInst *CI) {
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::DEOPTIMIZE),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ // We don't lower calls to __llvm_deoptimize as varargs, but as a regular
+ // call. We also do not lower the return value to any virtual register, and
+ // change the immediately following return to a trap instruction.
+ LowerCallSiteWithDeoptBundleImpl(CI, Callee, /* EHPadBB = */ nullptr,
+ /* VarArgDisallowed = */ true,
+ /* ForceVoidReturnTy = */ true);
+}
+
+void SelectionDAGBuilder::LowerDeoptimizingReturn() {
+ // We do not lower the return value from llvm.deoptimize to any virtual
+ // register, and change the immediately following return to a trap
+ // instruction.
+ if (DAG.getTarget().Options.TrapUnreachable)
+ DAG.setRoot(
+ DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+}
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.h b/lib/CodeGen/SelectionDAG/StatepointLowering.h
index 82d0c62f1c30..b043184003a0 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.h
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -16,9 +16,9 @@
#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include <vector>
namespace llvm {
class SelectionDAGBuilder;
@@ -45,15 +45,17 @@ public:
/// statepoint. Will return SDValue() if this value hasn't been
/// spilled. Otherwise, the value has already been spilled and no
/// further action is required by the caller.
- SDValue getLocation(SDValue val) {
- if (!Locations.count(val))
+ SDValue getLocation(SDValue Val) {
+ auto I = Locations.find(Val);
+ if (I == Locations.end())
return SDValue();
- return Locations[val];
+ return I->second;
}
- void setLocation(SDValue val, SDValue Location) {
- assert(!Locations.count(val) &&
+
+ void setLocation(SDValue Val, SDValue Location) {
+ assert(!Locations.count(Val) &&
"Trying to allocate already allocated location");
- Locations[val] = Location;
+ Locations[Val] = Location;
}
/// Record the fact that we expect to encounter a given gc_relocate
@@ -62,16 +64,15 @@ public:
void scheduleRelocCall(const CallInst &RelocCall) {
PendingGCRelocateCalls.push_back(&RelocCall);
}
+
/// Remove this gc_relocate from the list we're expecting to see
/// before the next statepoint. If we weren't expecting to see
/// it, we'll report an assertion.
void relocCallVisited(const CallInst &RelocCall) {
- SmallVectorImpl<const CallInst *>::iterator itr =
- std::find(PendingGCRelocateCalls.begin(), PendingGCRelocateCalls.end(),
- &RelocCall);
- assert(itr != PendingGCRelocateCalls.end() &&
+ auto I = find(PendingGCRelocateCalls, &RelocCall);
+ assert(I != PendingGCRelocateCalls.end() &&
"Visited unexpected gcrelocate call");
- PendingGCRelocateCalls.erase(itr);
+ PendingGCRelocateCalls.erase(I);
}
// TODO: Should add consistency tracking to ensure we encounter
@@ -84,14 +85,15 @@ public:
void reserveStackSlot(int Offset) {
assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
"out of bounds");
- assert(!AllocatedStackSlots[Offset] && "already reserved!");
+ assert(!AllocatedStackSlots.test(Offset) && "already reserved!");
assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!");
- AllocatedStackSlots[Offset] = true;
+ AllocatedStackSlots.set(Offset);
}
+
bool isStackSlotAllocated(int Offset) {
assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
"out of bounds");
- return AllocatedStackSlots[Offset];
+ return AllocatedStackSlots.test(Offset);
}
private:
@@ -103,7 +105,7 @@ private:
/// whether it has been used in the current statepoint. Since we try to
/// preserve stack slots across safepoints, there can be gaps in which
/// slots have been allocated.
- SmallVector<bool, 50> AllocatedStackSlots;
+ SmallBitVector AllocatedStackSlots;
/// Points just beyond the last slot known to have been allocated
unsigned NextSlotToAllocate;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c64d882d69a4..f2bc88a98597 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -14,10 +14,11 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -25,7 +26,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -43,6 +43,10 @@ const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
+bool TargetLowering::isPositionIndependent() const {
+ return getTargetMachine().isPositionIndependent();
+}
+
/// Check whether a given call node is in tail position within its function. If
/// so, it sets Chain to the input chain of the tail call.
bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
@@ -65,6 +69,31 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
return isUsedByReturnOnly(Node, Chain);
}
+bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
+ const uint32_t *CallerPreservedMask,
+ const SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<SDValue> &OutVals) const {
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ const CCValAssign &ArgLoc = ArgLocs[I];
+ if (!ArgLoc.isRegLoc())
+ continue;
+ unsigned Reg = ArgLoc.getLocReg();
+ // Only look at callee saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+ continue;
+ // Check that we pass the value used for the caller.
+ // (We look for a CopyFromReg reading a virtual register that is used
+ // for the function live-in value of register Reg)
+ SDValue Value = OutVals[I];
+ if (Value->getOpcode() != ISD::CopyFromReg)
+ return false;
+ unsigned ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
+ if (MRI.getLiveInPhysReg(ArgReg) != Reg)
+ return false;
+ }
+ return true;
+}
+
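+
The new parametersInCSRMatch helper above encodes a tail-call safety rule: an argument passed in a callee-saved register is only acceptable if the outgoing value is a plain copy of the value the caller itself received in that same register. A toy, self-contained restatement of the check (illustrative types, not the CodeGen API) is:

#include <cassert>
#include <vector>

struct ToyArg {
  unsigned AssignedReg;    // physical register the argument is passed in
  bool IsCopyFromLiveIn;   // outgoing value is a copy of a live-in vreg
  unsigned SourceReg;      // ...and this is that live-in's physical register
};

static bool parametersInCSRMatch(const std::vector<ToyArg> &Args,
                                 const std::vector<unsigned> &CalleeSaved) {
  for (const ToyArg &A : Args) {
    bool IsCSR = false;
    for (unsigned R : CalleeSaved)
      IsCSR |= (R == A.AssignedReg);
    if (!IsCSR)
      continue;                      // only callee-saved registers matter
    if (!A.IsCopyFromLiveIn || A.SourceReg != A.AssignedReg)
      return false;                  // value was recomputed or shuffled
  }
  return true;
}

int main() {
  std::vector<unsigned> CSRs = {19, 20};
  assert(parametersInCSRMatch({{19, true, 19}, {0, false, 0}}, CSRs));
  assert(!parametersInCSRMatch({{20, true, 19}}, CSRs));
  return 0;
}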
/// \brief Set CallLoweringInfo attribute flags based on a call instruction
/// and called function attributes.
void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
@@ -77,17 +106,17 @@ void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+ isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
+ isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
Alignment = CS->getParamAlignment(AttrIdx);
}
/// Generate a libcall taking the given operands as arguments and returning a
/// result of type RetVT.
std::pair<SDValue, SDValue>
-TargetLowering::makeLibCall(SelectionDAG &DAG,
- RTLIB::Libcall LC, EVT RetVT,
- ArrayRef<SDValue> Ops,
- bool isSigned, SDLoc dl,
- bool doesNotReturn,
+TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
+ ArrayRef<SDValue> Ops, bool isSigned,
+ const SDLoc &dl, bool doesNotReturn,
bool isReturnValueUsed) const {
TargetLowering::ArgListTy Args;
Args.reserve(Ops.size());
@@ -110,7 +139,7 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
TargetLowering::CallLoweringInfo CLI(DAG);
bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
.setSExtResult(signExtend).setZExtResult(!signExtend);
return LowerCallTo(CLI);
@@ -121,8 +150,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode,
- SDLoc dl) const {
- assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
+ const SDLoc &dl) const {
+  assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 ||
+          VT == MVT::ppcf128)
&& "Unsupported setcc type!");
// Expand into one or more soft-fp libcall(s).
@@ -132,53 +161,65 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
case ISD::SETEQ:
case ISD::SETOEQ:
LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
- (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128;
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
break;
case ISD::SETNE:
case ISD::SETUNE:
LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 :
- (VT == MVT::f64) ? RTLIB::UNE_F64 : RTLIB::UNE_F128;
+ (VT == MVT::f64) ? RTLIB::UNE_F64 :
+ (VT == MVT::f128) ? RTLIB::UNE_F128 : RTLIB::UNE_PPCF128;
break;
case ISD::SETGE:
case ISD::SETOGE:
LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
- (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128;
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
break;
case ISD::SETLT:
case ISD::SETOLT:
LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
- (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
break;
case ISD::SETLE:
case ISD::SETOLE:
LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
- (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128;
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
break;
case ISD::SETGT:
case ISD::SETOGT:
LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
- (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128;
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
break;
case ISD::SETUO:
LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
- (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128;
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+ (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
break;
case ISD::SETO:
LC1 = (VT == MVT::f32) ? RTLIB::O_F32 :
- (VT == MVT::f64) ? RTLIB::O_F64 : RTLIB::O_F128;
+ (VT == MVT::f64) ? RTLIB::O_F64 :
+ (VT == MVT::f128) ? RTLIB::O_F128 : RTLIB::O_PPCF128;
break;
case ISD::SETONE:
// SETONE = SETOLT | SETOGT
LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
- (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
- (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128;
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
break;
case ISD::SETUEQ:
LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
- (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128;
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+ (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
- (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128;
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
break;
default:
// Invert CC for unordered comparisons
@@ -186,19 +227,23 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
switch (CCCode) {
case ISD::SETULT:
LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
- (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128;
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
break;
case ISD::SETULE:
LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
- (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128;
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
break;
case ISD::SETUGT:
LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
- (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128;
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
break;
case ISD::SETUGE:
LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
- (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
break;
default: llvm_unreachable("Do not know how to soften this setcc!");
}
@@ -235,7 +280,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
unsigned TargetLowering::getJumpTableEncoding() const {
// In non-pic modes, just use the address of a block.
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+ if (!isPositionIndependent())
return MachineJumpTableInfo::EK_BlockAddress;
// In PIC mode, if the target supports a GPRel32 directive, use it.
@@ -269,17 +314,20 @@ TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
bool
TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
- // Assume that everything is safe in static mode.
- if (getTargetMachine().getRelocationModel() == Reloc::Static)
- return true;
+ const TargetMachine &TM = getTargetMachine();
+ const GlobalValue *GV = GA->getGlobal();
- // In dynamic-no-pic mode, assume that known defined values are safe.
- if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
- GA && GA->getGlobal()->isStrongDefinitionForLinker())
- return true;
+ // If the address is not even local to this DSO, we will have to load it from
+ // a GOT and then add the offset.
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return false;
- // Otherwise assume nothing is safe.
- return false;
+ // If the code is position independent we will have to add a base register.
+ if (isPositionIndependent())
+ return false;
+
+ // Otherwise we can do it.
+ return true;
}
//===----------------------------------------------------------------------===//
@@ -326,11 +374,10 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
/// generalized for targets with other types of implicit widening casts.
-bool
-TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
- unsigned BitWidth,
- const APInt &Demanded,
- SDLoc dl) {
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+ unsigned BitWidth,
+ const APInt &Demanded,
+ const SDLoc &dl) {
assert(Op.getNumOperands() == 2 &&
"ShrinkDemandedOp only supports binary operators!");
assert(Op.getNode()->getNumValues() == 1 &&
@@ -407,7 +454,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
NewMask = APInt::getAllOnesValue(BitWidth);
} else if (DemandedMask == 0) {
// Not demanding any bits from Op.
- if (Op.getOpcode() != ISD::UNDEF)
+ if (!Op.isUndef())
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
return false;
} else if (Depth == 6) { // Limit search depth.
@@ -1157,37 +1204,6 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
-/// Test if the given value is known to have exactly one bit set. This differs
-/// from computeKnownBits in that it doesn't need to determine which bit is set.
-static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) {
- // A left-shift of a constant one will have exactly one bit set, because
- // shifting the bit off the end is undefined.
- if (Val.getOpcode() == ISD::SHL)
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
- if (C->getAPIntValue() == 1)
- return true;
-
- // Similarly, a right-shift of a constant sign-bit will have exactly
- // one bit set.
- if (Val.getOpcode() == ISD::SRL)
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
- if (C->getAPIntValue().isSignBit())
- return true;
-
- // More could be done here, though the above checks are enough
- // to handle some common cases.
-
- // Fall back to computeKnownBits to catch other known cases.
- EVT OpVT = Val.getValueType();
- unsigned BitWidth = OpVT.getScalarType().getSizeInBits();
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(Val, KnownZero, KnownOne);
- return (KnownZero.countPopulation() == BitWidth - 1) &&
- (KnownOne.countPopulation() == 1);
-}
-
bool TargetLowering::isConstTrueVal(const SDNode *N) const {
if (!N)
return false;
@@ -1242,12 +1258,91 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const {
return CN->isNullValue();
}
+bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
+ bool SExt) const {
+ if (VT == MVT::i1)
+ return N->isOne();
+
+ TargetLowering::BooleanContent Cnt = getBooleanContents(VT);
+ switch (Cnt) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ // An extended value of 1 is always true, unless its original type is i1,
+ // in which case it will be sign extended to -1.
+ return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
+ case TargetLowering::UndefinedBooleanContent:
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return N->isAllOnesValue() && SExt;
+ }
+ llvm_unreachable("Unexpected enumeration.");
+}
+
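For intuition, here is a minimal standalone sketch (illustrative only, not part of this patch) of the distinction the new helper draws: what an i1 "true" becomes under zero- versus sign-extension.

    #include <cassert>

    int main() {
      bool T = true;
      int Zext = static_cast<int>(T);  // zero/any-extended i1 true -> 1
      int Sext = -static_cast<int>(T); // sign-extended i1 true -> -1 (all bits set)
      assert(Zext == 1 && Sext == -1);
      return 0;
    }

Under ZeroOrOneBooleanContent an extended true is 1; under ZeroOrNegativeOneBooleanContent it is -1, which is the constant the helper compares against.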
+/// This helper function of SimplifySetCC tries to optimize the comparison when
+/// either operand of the SetCC node is a bitwise-and instruction.
+SDValue TargetLowering::simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ // Match these patterns in any of their permutations:
+ // (X & Y) == Y
+ // (X & Y) != Y
+ if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND)
+ std::swap(N0, N1);
+
+ EVT OpVT = N0.getValueType();
+ if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() ||
+ (Cond != ISD::SETEQ && Cond != ISD::SETNE))
+ return SDValue();
+
+ SDValue X, Y;
+ if (N0.getOperand(0) == N1) {
+ X = N0.getOperand(1);
+ Y = N0.getOperand(0);
+ } else if (N0.getOperand(1) == N1) {
+ X = N0.getOperand(0);
+ Y = N0.getOperand(1);
+ } else {
+ return SDValue();
+ }
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ if (DAG.isKnownToBeAPowerOfTwo(Y)) {
+ // Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set.
+ // Note that where Y is variable and is known to have at most one bit set
+ // (for example, if it is Z & 1) we cannot do this; the expressions are not
+ // equivalent when Y == 0.
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(Cond, N0.getSimpleValueType()))
+ return DAG.getSetCC(DL, VT, N0, Zero, Cond);
+ } else if (N0.hasOneUse() && hasAndNotCompare(Y)) {
+ // If the target supports an 'and-not' or 'and-complement' logic operation,
+ // try to use that to make a comparison operation more efficient.
+ // But don't do this transform if the mask is a single bit because there are
+ // more efficient ways to deal with that case (for example, 'bt' on x86 or
+ // 'rlwinm' on PPC).
+
+ // Bail out if the compare operand that we want to turn into a zero is
+ // already a zero (otherwise, infinite loop).
+ auto *YConst = dyn_cast<ConstantSDNode>(Y);
+ if (YConst && YConst->isNullValue())
+ return SDValue();
+
+ // Transform this into: ~X & Y == 0.
+ SDValue NotX = DAG.getNOT(SDLoc(X), X, OpVT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, NotX, Y);
+ return DAG.getSetCC(DL, VT, NewAnd, Zero, Cond);
+ }
+
+ return SDValue();
+}
+
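As a quick sanity check on the two rewrites performed above, the same equivalences can be verified exhaustively over small unsigned integers; the following is an illustrative sketch only (bounds and names are arbitrary).

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X < 64; ++X) {
        // 1) When Y is a power of two, (X & Y) == Y  <=>  (X & Y) != 0.
        const uint32_t Y1 = 8;
        assert(((X & Y1) == Y1) == ((X & Y1) != 0));

        // 2) For any Y, (X & Y) == Y  <=>  (~X & Y) == 0, i.e. Y's bits are a
        //    subset of X's, which an and-not instruction can test directly.
        for (uint32_t Y2 = 0; Y2 < 64; ++Y2)
          assert(((X & Y2) == Y2) == ((~X & Y2) == 0));
      }
      return 0;
    }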
/// Try to simplify a setcc built with the specified operands and cc. If it is
/// unable to simplify it, return a null SDValue.
-SDValue
-TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
- ISD::CondCode Cond, bool foldBooleans,
- DAGCombinerInfo &DCI, SDLoc dl) const {
+SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, bool foldBooleans,
+ DAGCombinerInfo &DCI,
+ const SDLoc &dl) const {
SelectionDAG &DAG = DCI.DAG;
// These setcc operations always fold.
@@ -1376,6 +1471,38 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT);
return DAG.getSetCC(dl, VT, Trunc, C, Cond);
}
+
+ // If truncating the setcc operands is not desirable, we can still
+ // simplify the expression in some cases:
+ // (setcc ([sz]ext (setcc x, y, cc)), 0, setne) -> (setcc x, y, cc)
+ // (setcc ([sz]ext (setcc x, y, cc)), 0, seteq) -> (setcc x, y, inv(cc))
+ // (setcc (zext (setcc x, y, cc)), 1, setne) -> (setcc x, y, inv(cc))
+ // (setcc (zext (setcc x, y, cc)), 1, seteq) -> (setcc x, y, cc)
+ // (setcc (sext (setcc x, y, cc)), -1, setne) -> (setcc x, y, inv(cc))
+ // (setcc (sext (setcc x, y, cc)), -1, seteq) -> (setcc x, y, cc)
+ SDValue TopSetCC = N0->getOperand(0);
+ unsigned N0Opc = N0->getOpcode();
+ bool SExt = (N0Opc == ISD::SIGN_EXTEND);
+ if (TopSetCC.getValueType() == MVT::i1 && VT == MVT::i1 &&
+ TopSetCC.getOpcode() == ISD::SETCC &&
+ (N0Opc == ISD::ZERO_EXTEND || N0Opc == ISD::SIGN_EXTEND) &&
+ (isConstFalseVal(N1C) ||
+ isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {
+
+ bool Inverse = (N1C->isNullValue() && Cond == ISD::SETEQ) ||
+ (!N1C->isNullValue() && Cond == ISD::SETNE);
+
+ if (!Inverse)
+ return TopSetCC;
+
+ ISD::CondCode InvCond = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(TopSetCC.getOperand(2))->get(),
+ TopSetCC.getOperand(0).getValueType().isInteger());
+ return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
+ TopSetCC.getOperand(1),
+ InvCond);
+
+ }
}
}
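The folds listed in the comment above can be checked over plain ints, where the inverse of "<" is ">="; a small illustrative sketch for the zext case follows (the sext case against -1 is analogous).

    #include <cassert>

    int main() {
      for (int a = -2; a <= 2; ++a)
        for (int b = -2; b <= 2; ++b) {
          int z = static_cast<int>(a < b); // zext of the inner i1 setcc: 0 or 1
          assert((z != 0) == (a < b));     // setne against 0 keeps cc
          assert((z == 0) == (a >= b));    // seteq against 0 inverts cc
          assert((z == 1) == (a < b));     // seteq against extended true keeps cc
          assert((z != 1) == (a >= b));    // setne against extended true inverts cc
        }
      return 0;
    }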
@@ -1426,9 +1553,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
DAG.getConstant(bestOffset, dl, PtrType));
unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
- SDValue NewLoad = DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
- Lod->getPointerInfo().getWithOffset(bestOffset),
- false, false, false, NewAlign);
+ SDValue NewLoad = DAG.getLoad(
+ newVT, dl, Lod->getChain(), Ptr,
+ Lod->getPointerInfo().getWithOffset(bestOffset), NewAlign);
return DAG.getSetCC(dl, VT,
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth),
@@ -1994,32 +2121,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
- // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
- // Note that where y is variable and is known to have at most
- // one bit set (for example, if it is z&1) we cannot do this;
- // the expressions are not equivalent when y==0.
- if (N0.getOpcode() == ISD::AND)
- if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
- if (ValueHasExactlyOneBitSet(N1, DAG)) {
- Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
- if (DCI.isBeforeLegalizeOps() ||
- isCondCodeLegal(Cond, N0.getSimpleValueType())) {
- SDValue Zero = DAG.getConstant(0, dl, N1.getValueType());
- return DAG.getSetCC(dl, VT, N0, Zero, Cond);
- }
- }
- }
- if (N1.getOpcode() == ISD::AND)
- if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
- if (ValueHasExactlyOneBitSet(N0, DAG)) {
- Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
- if (DCI.isBeforeLegalizeOps() ||
- isCondCodeLegal(Cond, N1.getSimpleValueType())) {
- SDValue Zero = DAG.getConstant(0, dl, N0.getValueType());
- return DAG.getSetCC(dl, VT, N1, Zero, Cond);
- }
- }
- }
+ if (SDValue V = simplifySetCCWithAnd(VT, N0, N1, Cond, DCI, dl))
+ return V;
}
// Fold away ALL boolean setcc's.
@@ -2202,8 +2305,10 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
}
- if (!C || !GA)
- C = nullptr, GA = nullptr;
+ if (!C || !GA) {
+ C = nullptr;
+ GA = nullptr;
+ }
}
// If we find a valid operand, map to the TargetXXX version so that the
@@ -2260,7 +2365,7 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
- if (RegName.equals_lower(RI->getName(*I))) {
+ if (RegName.equals_lower(RI->getRegAsmName(*I))) {
std::pair<unsigned, const TargetRegisterClass*> S =
std::make_pair(*I, RC);
@@ -2680,7 +2785,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
/// \brief Given an exact SDIV by a constant, create a multiplication
/// with the multiplicative inverse of the constant.
static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
std::vector<SDNode *> &Created) {
assert(d != 0 && "Division by zero!");
@@ -3039,6 +3144,370 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
return true;
}
+SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
+ SelectionDAG &DAG) const {
+ SDLoc SL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePTR = LD->getBasePtr();
+ EVT SrcVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ unsigned NumElem = SrcVT.getVectorNumElements();
+
+ EVT SrcEltVT = SrcVT.getScalarType();
+ EVT DstEltVT = LD->getValueType(0).getScalarType();
+
+ unsigned Stride = SrcEltVT.getSizeInBits() / 8;
+ assert(SrcEltVT.isByteSized());
+
+ EVT PtrVT = BasePTR.getValueType();
+
+ SmallVector<SDValue, 8> Vals;
+ SmallVector<SDValue, 8> LoadChains;
+
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ExtType, SL, DstEltVT, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Idx * Stride),
+ SrcEltVT, MinAlign(LD->getAlignment(), Idx * Stride),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+ BasePTR = DAG.getNode(ISD::ADD, SL, PtrVT, BasePTR,
+ DAG.getConstant(Stride, SL, PtrVT));
+
+ Vals.push_back(ScalarLoad.getValue(0));
+ LoadChains.push_back(ScalarLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoadChains);
+ SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, SL, LD->getValueType(0), Vals);
+
+ return DAG.getMergeValues({ Value, NewChain }, SL);
+}
+
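Conceptually, the expansion above replaces a single vector load with one scalar load per element and rebuilds the result vector from the widened elements. A rough scalar model in plain C++ (the element count and types are arbitrary, chosen only for illustration):

    #include <cstdint>
    #include <cstring>

    // Models a <4 x i16> extending load lowered to four scalar loads whose
    // results are widened to 32-bit elements. Names are hypothetical.
    void scalarizedLoad(const unsigned char *Base, uint32_t Out[4]) {
      const unsigned Stride = sizeof(uint16_t); // source element size in bytes
      for (unsigned Idx = 0; Idx < 4; ++Idx) {
        uint16_t Elt;
        std::memcpy(&Elt, Base + Idx * Stride, sizeof(Elt)); // one scalar load
        Out[Idx] = Elt; // zero-extend to the destination element type
      }
    }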
+// FIXME: This relies on each element having a byte size, otherwise the stride
+// is 0 and just overwrites the same location. ExpandStore currently expects
+// this broken behavior.
+SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ SDLoc SL(ST);
+
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ EVT StVT = ST->getMemoryVT();
+
+ // The type of the data we want to save
+ EVT RegVT = Value.getValueType();
+ EVT RegSclVT = RegVT.getScalarType();
+
+ // The type of data as saved in memory.
+ EVT MemSclVT = StVT.getScalarType();
+
+ EVT PtrVT = BasePtr.getValueType();
+
+ // Store Stride in bytes
+ unsigned Stride = MemSclVT.getSizeInBits() / 8;
+ EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
+ unsigned NumElem = StVT.getVectorNumElements();
+
+ // Extract each of the elements from the original vector and save them into
+ // memory individually.
+ SmallVector<SDValue, 8> Stores;
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
+ DAG.getConstant(Idx, SL, IdxVT));
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Idx * Stride, SL, PtrVT));
+
+ // This scalar TruncStore may be illegal, but we legalize it later.
+ SDValue Store = DAG.getTruncStore(
+ Chain, SL, Elt, Ptr, ST->getPointerInfo().getWithOffset(Idx * Stride),
+ MemSclVT, MinAlign(ST->getAlignment(), Idx * Stride),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+
+ Stores.push_back(Store);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Stores);
+}
+
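The store expansion is the mirror image: extract each element, truncate it to the memory element type, and emit one scalar store per element. A rough scalar model (again with arbitrary illustrative types):

    #include <cstdint>
    #include <cstring>

    // Models a <4 x i32> value stored as four truncating 16-bit scalar stores.
    // Names are hypothetical.
    void scalarizedStore(unsigned char *Base, const uint32_t In[4]) {
      const unsigned Stride = sizeof(uint16_t); // memory element size in bytes
      for (unsigned Idx = 0; Idx < 4; ++Idx) {
        uint16_t Elt = static_cast<uint16_t>(In[Idx]); // truncate to memory type
        std::memcpy(Base + Idx * Stride, &Elt, sizeof(Elt)); // one scalar store
      }
    }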
+std::pair<SDValue, SDValue>
+TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
+ assert(LD->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed loads not implemented!");
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ EVT VT = LD->getValueType(0);
+ EVT LoadedVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ if (VT.isFloatingPoint() || VT.isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
+ if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
+ if (!isOperationLegalOrCustom(ISD::LOAD, intVT)) {
+ // Scalarize the load and let the individual components be handled.
+ SDValue Scalarized = scalarizeVectorLoad(LD, DAG);
+ return std::make_pair(Scalarized.getValue(0), Scalarized.getValue(1));
+ }
+
+ // Expand to a (misaligned) integer load of the same size,
+ // then bitconvert to floating point or vector.
+ SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+ LD->getMemOperand());
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
+ if (LoadedVT != VT)
+ Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
+ ISD::ANY_EXTEND, dl, VT, Result);
+
+ return std::make_pair(Result, newLoad.getValue(1));
+ }
+
+ // Copy the value to an (aligned) stack slot using (unaligned) integer
+ // loads and stores, then do an (aligned) load from the stack slot.
+ MVT RegVT = getRegisterType(*DAG.getContext(), intVT);
+ unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+
+ SmallVector<SDValue, 8> Stores;
+ SDValue StackPtr = StackBase;
+ unsigned Offset = 0;
+
+ EVT PtrVT = Ptr.getValueType();
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+
+ // Do all but one copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the original location.
+ SDValue Load = DAG.getLoad(
+ RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo()));
+ // Increment the pointers.
+ Offset += RegBytes;
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, PtrIncrement);
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtrVT, StackPtr,
+ StackPtrIncrement);
+ }
+
+ // The last copy may be partial. Do an extending load.
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+ 8 * (LoadedBytes - Offset));
+ SDValue Load =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(Offset), MemVT,
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ // On big-endian machines this requires a truncating store to ensure
+ // that the bits end up in the right place.
+ Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo(), MemVT));
+
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+ // Finally, perform the original load only redirected to the stack slot.
+ Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
+ MachinePointerInfo(), LoadedVT);
+
+ // Callers expect a MERGE_VALUES node.
+ return std::make_pair(Load, TF);
+ }
+
+ assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+ "Unaligned load of unsupported type.");
+
+ // Compute the new VT that is half the size of the old one. This is an
+ // integer MVT.
+ unsigned NumBits = LoadedVT.getSizeInBits();
+ EVT NewLoadedVT;
+ NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
+ NumBits >>= 1;
+
+ unsigned Alignment = LD->getAlignment();
+ unsigned IncrementSize = NumBits / 8;
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ // Load the value in two parts
+ SDValue Lo, Hi;
+ if (DAG.getDataLayout().isLittleEndian()) {
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ } else {
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ }
+
+ // Aggregate the two parts.
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(),
+ DAG.getDataLayout()));
+ SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+ Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ return std::make_pair(Result, TF);
+}
+
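The integer fallback at the end of this function loads the two naturally more-aligned halves and recombines them with a shift and an OR. A little-endian C++ analogue, with sizes chosen only for illustration:

    #include <cstdint>
    #include <cstring>

    // One unaligned 32-bit load rebuilt from two 16-bit halves (little endian).
    uint32_t unalignedLoad32(const unsigned char *Ptr) {
      uint16_t Lo, Hi;
      std::memcpy(&Lo, Ptr, sizeof(Lo));              // zext load of the low half
      std::memcpy(&Hi, Ptr + sizeof(Lo), sizeof(Hi)); // load of the high half
      return (static_cast<uint32_t>(Hi) << 16) | Lo;  // the SHL/OR combine above
    }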
+SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ assert(ST->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed stores not implemented!");
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Val = ST->getValue();
+ EVT VT = Val.getValueType();
+ int Alignment = ST->getAlignment();
+
+ SDLoc dl(ST);
+ if (ST->getMemoryVT().isFloatingPoint() ||
+ ST->getMemoryVT().isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (isTypeLegal(intVT)) {
+ if (!isOperationLegalOrCustom(ISD::STORE, intVT)) {
+ // Scalarize the store and let the individual components be handled.
+ SDValue Result = scalarizeVectorStore(ST, DAG);
+
+ return Result;
+ }
+ // Expand to a bitconvert of the value to the integer type of the
+ // same size, then a (misaligned) int store.
+ // FIXME: Does not handle truncating floating point stores!
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
+ Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
+ Alignment, ST->getMemOperand()->getFlags());
+ return Result;
+ }
+ // Do an (aligned) store to a stack slot, then copy from the stack slot
+ // to the final destination using (unaligned) integer loads and stores.
+ EVT StoredVT = ST->getMemoryVT();
+ MVT RegVT =
+ getRegisterType(*DAG.getContext(),
+ EVT::getIntegerVT(*DAG.getContext(),
+ StoredVT.getSizeInBits()));
+ EVT PtrVT = Ptr.getValueType();
+ unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+
+ // Perform the original store, only redirected to the stack slot.
+ SDValue Store = DAG.getTruncStore(Chain, dl, Val, StackPtr,
+ MachinePointerInfo(), StoredVT);
+
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+ SmallVector<SDValue, 8> Stores;
+ unsigned Offset = 0;
+
+ // Do all but one copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the stack slot.
+ SDValue Load =
+ DAG.getLoad(RegVT, dl, Store, StackPtr, MachinePointerInfo());
+ // Store it to the final location. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(ST->getAlignment(), Offset),
+ ST->getMemOperand()->getFlags()));
+ // Increment the pointers.
+ Offset += RegBytes;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtrVT,
+ StackPtr, StackPtrIncrement);
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, PtrIncrement);
+ }
+
+ // The last store may be partial. Do a truncating store. On big-endian
+ // machines this requires an extending load from the stack slot to ensure
+ // that the bits are in the right place.
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+ 8 * (StoredBytes - Offset));
+
+ // Load from the stack slot.
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+ MachinePointerInfo(), MemVT);
+
+ Stores.push_back(
+ DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset), MemVT,
+ MinAlign(ST->getAlignment(), Offset),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo()));
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
+ assert(ST->getMemoryVT().isInteger() &&
+ !ST->getMemoryVT().isVector() &&
+ "Unaligned store of unknown type.");
+ // Get the half-size VT
+ EVT NewStoredVT = ST->getMemoryVT().getHalfSizedIntegerVT(*DAG.getContext());
+ int NumBits = NewStoredVT.getSizeInBits();
+ int IncrementSize = NumBits / 8;
+
+ // Divide the stored value into two parts.
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, getShiftAmountTy(Val.getValueType(),
+ DAG.getDataLayout()));
+ SDValue Lo = Val;
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+ // Store the two parts
+ SDValue Store1, Store2;
+ Store1 = DAG.getTruncStore(Chain, dl,
+ DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
+ Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags());
+
+ EVT PtrVT = Ptr.getValueType();
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
+ DAG.getConstant(IncrementSize, dl, PtrVT));
+ Alignment = MinAlign(Alignment, IncrementSize);
+ Store2 = DAG.getTruncStore(
+ Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+
+ SDValue Result =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ return Result;
+}
+
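The integer fallback here mirrors the load case: split the value with a right shift and emit two truncating half-width stores. A little-endian C++ analogue with illustrative sizes:

    #include <cstdint>
    #include <cstring>

    // One unaligned 32-bit store split into two 16-bit stores (little endian).
    void unalignedStore32(unsigned char *Ptr, uint32_t Val) {
      uint16_t Lo = static_cast<uint16_t>(Val);       // truncating store of Lo
      uint16_t Hi = static_cast<uint16_t>(Val >> 16); // SRL by NumBits, then store
      std::memcpy(Ptr, &Lo, sizeof(Lo));
      std::memcpy(Ptr + sizeof(Lo), &Hi, sizeof(Hi));
    }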
//===----------------------------------------------------------------------===//
// Implementation of Emulated TLS Model
//===----------------------------------------------------------------------===//
@@ -3057,9 +3526,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
Module *VariableModule = const_cast<Module*>(GA->getGlobal()->getParent());
StringRef EmuTlsVarName(NameString);
GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName);
- if (!EmuTlsVar)
- EmuTlsVar = dyn_cast_or_null<GlobalVariable>(
- VariableModule->getOrInsertGlobal(EmuTlsVarName, VoidPtrType));
+ assert(EmuTlsVar && "Cannot find EmuTlsVar");
Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
Entry.Ty = VoidPtrType;
Args.push_back(Entry);
@@ -3068,7 +3535,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
- CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args), 0);
+ CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
deleted file mode 100644
index b12e943eb35d..000000000000
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- ShadowStackGC.cpp - GC support for uncooperative targets ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements lowering for the llvm.gc* intrinsics for targets that do
-// not natively support them (which includes the C backend). Note that the code
-// generated is not quite as efficient as algorithms which generate stack maps
-// to identify roots.
-//
-// This pass implements the code transformation described in this paper:
-// "Accurate Garbage Collection in an Uncooperative Environment"
-// Fergus Henderson, ISMM, 2002
-//
-// In runtime/GC/SemiSpace.cpp is a prototype runtime which is compatible with
-// ShadowStackGC.
-//
-// In order to support this particular transformation, all stack roots are
-// coallocated in the stack. This allows a fully target-independent stack map
-// while introducing only minor runtime overhead.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/GCs.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "shadowstackgc"
-
-namespace {
-class ShadowStackGC : public GCStrategy {
-public:
- ShadowStackGC();
-};
-}
-
-static GCRegistry::Add<ShadowStackGC>
- X("shadow-stack", "Very portable GC for uncooperative code generators");
-
-void llvm::linkShadowStackGC() {}
-
-ShadowStackGC::ShadowStackGC() {
- InitRoots = true;
- CustomRoots = true;
-}
diff --git a/lib/CodeGen/ShadowStackGCLowering.cpp b/lib/CodeGen/ShadowStackGCLowering.cpp
index 878eeeed0f6a..1efc440cd701 100644
--- a/lib/CodeGen/ShadowStackGCLowering.cpp
+++ b/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -8,7 +8,11 @@
//===----------------------------------------------------------------------===//
//
// This file contains the custom lowering code required by the shadow-stack GC
-// strategy.
+// strategy.
+//
+// This pass implements the code transformation described in this paper:
+// "Accurate Garbage Collection in an Uncooperative Environment"
+// Fergus Henderson, ISMM, 2002
//
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index e1f242a08de1..ce01c5f23e57 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -24,18 +23,11 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <set>
using namespace llvm;
#define DEBUG_TYPE "sjljehprepare"
@@ -55,7 +47,6 @@ class SjLjEHPrepare : public FunctionPass {
Constant *StackAddrFn;
Constant *StackRestoreFn;
Constant *LSDAAddrFn;
- Value *PersonalityFn;
Constant *CallSiteFn;
Constant *FuncCtxFn;
AllocaInst *FuncCtx;
@@ -103,21 +94,6 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
VoidPtrTy, // __lsda
doubleUnderJBufTy, // __jbuf
nullptr);
- RegisterFn = M.getOrInsertFunction(
- "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), (Type *)nullptr);
- UnregisterFn = M.getOrInsertFunction(
- "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), (Type *)nullptr);
- FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
- StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
- StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
- BuiltinSetupDispatchFn =
- Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch);
- LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda);
- CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
- FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
- PersonalityFn = nullptr;
return true;
}
@@ -141,15 +117,15 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
}
-/// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
+/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
/// we reach blocks we've already seen.
static void MarkBlocksLiveIn(BasicBlock *BB,
SmallPtrSetImpl<BasicBlock *> &LiveBBs) {
if (!LiveBBs.insert(BB).second)
return; // already been here.
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
- MarkBlocksLiveIn(*PI, LiveBBs);
+ for (BasicBlock *PredBB : predecessors(BB))
+ MarkBlocksLiveIn(PredBB, LiveBBs);
}
/// substituteLPadValues - Substitute the values returned by the landingpad
@@ -159,7 +135,7 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
SmallVector<Value *, 8> UseWorkList(LPI->user_begin(), LPI->user_end());
while (!UseWorkList.empty()) {
Value *Val = UseWorkList.pop_back_val();
- ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val);
+ auto *EVI = dyn_cast<ExtractValueInst>(Val);
if (!EVI)
continue;
if (EVI->getNumIndices() != 1)
@@ -168,11 +144,11 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
EVI->replaceAllUsesWith(ExnVal);
else if (*EVI->idx_begin() == 1)
EVI->replaceAllUsesWith(SelVal);
- if (EVI->getNumUses() == 0)
+ if (EVI->use_empty())
EVI->eraseFromParent();
}
- if (LPI->getNumUses() == 0)
+ if (LPI->use_empty())
return;
// There are still some uses of LPI. Construct an aggregate with the exception
@@ -202,8 +178,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
&EntryBB->front());
// Fill in the function context structure.
- for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
- LandingPadInst *LPI = LPads[I];
+ for (LandingPadInst *LPI : LPads) {
IRBuilder<> Builder(LPI->getParent(),
LPI->getParent()->getFirstInsertionPt());
@@ -226,8 +201,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
// Personality function
IRBuilder<> Builder(EntryBB->getTerminator());
- if (!PersonalityFn)
- PersonalityFn = F.getPersonalityFn();
+ Value *PersonalityFn = F.getPersonalityFn();
Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(
FunctionContextTy, FuncCtx, 0, 3, "pers_fn_gep");
Builder.CreateStore(
@@ -250,7 +224,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin();
while (isa<AllocaInst>(AfterAllocaInsPt) &&
- isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
+ cast<AllocaInst>(AfterAllocaInsPt)->isStaticAlloca())
++AfterAllocaInsPt;
assert(AfterAllocaInsPt != F.front().end());
@@ -274,40 +248,37 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
ArrayRef<InvokeInst *> Invokes) {
// Finally, scan the code looking for instructions with bad live ranges.
- for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
- for (BasicBlock::iterator II = BB->begin(), IIE = BB->end(); II != IIE;
- ++II) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &Inst : BB) {
// Ignore obvious cases we don't have to handle. In particular, most
// instructions either have no uses or only have a single use inside the
// current block. Ignore them quickly.
- Instruction *Inst = &*II;
- if (Inst->use_empty())
+ if (Inst.use_empty())
continue;
- if (Inst->hasOneUse() &&
- cast<Instruction>(Inst->user_back())->getParent() == BB &&
- !isa<PHINode>(Inst->user_back()))
+ if (Inst.hasOneUse() &&
+ cast<Instruction>(Inst.user_back())->getParent() == &BB &&
+ !isa<PHINode>(Inst.user_back()))
continue;
// If this is an alloca in the entry block, it's not a real register
// value.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
- if (isa<ConstantInt>(AI->getArraySize()) && BB == F.begin())
+ if (auto *AI = dyn_cast<AllocaInst>(&Inst))
+ if (AI->isStaticAlloca())
continue;
// Avoid iterator invalidation by copying users to a temporary vector.
SmallVector<Instruction *, 16> Users;
- for (User *U : Inst->users()) {
+ for (User *U : Inst.users()) {
Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI))
+ if (UI->getParent() != &BB || isa<PHINode>(UI))
Users.push_back(UI);
}
// Find all of the blocks that this value is live in.
- SmallPtrSet<BasicBlock *, 64> LiveBBs;
- LiveBBs.insert(Inst->getParent());
+ SmallPtrSet<BasicBlock *, 32> LiveBBs;
+ LiveBBs.insert(&BB);
while (!Users.empty()) {
- Instruction *U = Users.back();
- Users.pop_back();
+ Instruction *U = Users.pop_back_val();
if (!isa<PHINode>(U)) {
MarkBlocksLiveIn(U->getParent(), LiveBBs);
@@ -315,7 +286,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
// Uses for a PHI node occur in their predecessor block.
PHINode *PN = cast<PHINode>(U);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == Inst)
+ if (PN->getIncomingValue(i) == &Inst)
MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
}
}
@@ -323,10 +294,10 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
// Now that we know all of the blocks that this thing is live in, see if
// it includes any of the unwind locations.
bool NeedsSpill = false;
- for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
- BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
- if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
- DEBUG(dbgs() << "SJLJ Spill: " << *Inst << " around "
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
+ if (UnwindBlock != &BB && LiveBBs.count(UnwindBlock)) {
+ DEBUG(dbgs() << "SJLJ Spill: " << Inst << " around "
<< UnwindBlock->getName() << "\n");
NeedsSpill = true;
break;
@@ -338,15 +309,15 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
// the value to be reloaded from the stack slot, even those that aren't
// in the unwind blocks. We should be more selective.
if (NeedsSpill) {
- DemoteRegToStack(*Inst, true);
+ DemoteRegToStack(Inst, true);
++NumSpilled;
}
}
}
// Go through the landing pads and remove any PHIs there.
- for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
- BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
// Place PHIs into a set to avoid invalidating the iterator.
@@ -374,11 +345,10 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
SmallSetVector<LandingPadInst *, 16> LPads;
// Look through the terminators of the basic blocks to find invokes.
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ for (BasicBlock &BB : F)
+ if (auto *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
if (Function *Callee = II->getCalledFunction())
- if (Callee->isIntrinsic() &&
- Callee->getIntrinsicID() == Intrinsic::donothing) {
+ if (Callee->getIntrinsicID() == Intrinsic::donothing) {
// Remove the NOP invoke.
BranchInst::Create(II->getNormalDest(), II);
II->eraseFromParent();
@@ -387,7 +357,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
Invokes.push_back(II);
LPads.insert(II->getUnwindDest()->getLandingPadInst());
- } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ } else if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
Returns.push_back(RI);
}
@@ -448,14 +418,13 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
// created for this function and any unexpected exceptions thrown will go
// directly to the caller's context, which is what we want anyway, so no need
// to do anything here.
- for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;)
- for (BasicBlock::iterator I = BB->begin(), end = BB->end(); I != end; ++I)
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- if (!CI->doesNotThrow())
- insertCallSiteStore(CI, -1);
- } else if (ResumeInst *RI = dyn_cast<ResumeInst>(I)) {
- insertCallSiteStore(RI, -1);
- }
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
+ continue;
+ for (Instruction &I : BB)
+ if (I.mayThrow())
+ insertCallSiteStore(&I, -1);
+ }
// Register the function context and make sure it's known to not throw
CallInst *Register =
@@ -464,18 +433,18 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
// Following any allocas not in the entry block, update the saved SP in the
// jmpbuf to the new value.
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (BB == F.begin())
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
continue;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ for (Instruction &I : BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
if (CI->getCalledFunction() != StackRestoreFn)
continue;
- } else if (!isa<AllocaInst>(I)) {
+ } else if (!isa<AllocaInst>(&I)) {
continue;
}
Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
- StackAddr->insertAfter(&*I);
+ StackAddr->insertAfter(&I);
Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true);
StoreStackAddr->insertAfter(StackAddr);
}
@@ -483,13 +452,29 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
// Finally, for any returns from this function, if this function contains an
// invoke, add a call to unregister the function context.
- for (unsigned I = 0, E = Returns.size(); I != E; ++I)
- CallInst::Create(UnregisterFn, FuncCtx, "", Returns[I]);
+ for (ReturnInst *Return : Returns)
+ CallInst::Create(UnregisterFn, FuncCtx, "", Return);
return true;
}
bool SjLjEHPrepare::runOnFunction(Function &F) {
+ Module &M = *F.getParent();
+ RegisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), nullptr);
+ UnregisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), nullptr);
+ FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
+ StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+ StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+ BuiltinSetupDispatchFn =
+ Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch);
+ LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda);
+ CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
+ FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
+
bool Res = setupEntryBlockAndCallSites(F);
return Res;
}
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index c9d23f67bdee..dba103e9bfb1 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -69,34 +69,29 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
indexList.push_back(createEntry(nullptr, index));
// Iterate over the function.
- for (MachineFunction::iterator mbbItr = mf->begin(), mbbEnd = mf->end();
- mbbItr != mbbEnd; ++mbbItr) {
- MachineBasicBlock *mbb = &*mbbItr;
-
+ for (MachineBasicBlock &MBB : *mf) {
// Insert an index for the MBB start.
SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block);
- for (MachineBasicBlock::iterator miItr = mbb->begin(), miEnd = mbb->end();
- miItr != miEnd; ++miItr) {
- MachineInstr *mi = miItr;
- if (mi->isDebugValue())
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
continue;
// Insert a store index for the instr.
- indexList.push_back(createEntry(mi, index += SlotIndex::InstrDist));
+ indexList.push_back(createEntry(&MI, index += SlotIndex::InstrDist));
// Save this base index in the maps.
- mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(),
- SlotIndex::Slot_Block)));
+ mi2iMap.insert(std::make_pair(
+ &MI, SlotIndex(&indexList.back(), SlotIndex::Slot_Block)));
}
// We insert one blank instruction between basic blocks.
indexList.push_back(createEntry(nullptr, index += SlotIndex::InstrDist));
- MBBRanges[mbb->getNumber()].first = blockStartIndex;
- MBBRanges[mbb->getNumber()].second = SlotIndex(&indexList.back(),
+ MBBRanges[MBB.getNumber()].first = blockStartIndex;
+ MBBRanges[MBB.getNumber()].second = SlotIndex(&indexList.back(),
SlotIndex::Slot_Block);
- idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, mbb));
+ idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, &MBB));
}
// Sort the Idx2MBBMap
@@ -150,9 +145,9 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
// does the same thing.
// Find anchor points, which are at the beginning/end of blocks or at
// instructions that already have indexes.
- while (Begin != MBB->begin() && !hasIndex(Begin))
+ while (Begin != MBB->begin() && !hasIndex(*Begin))
--Begin;
- while (End != MBB->end() && !hasIndex(End))
+ while (End != MBB->end() && !hasIndex(*End))
++End;
bool includeStart = (Begin == MBB->begin());
@@ -160,13 +155,13 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
if (includeStart)
startIdx = getMBBStartIdx(MBB);
else
- startIdx = getInstructionIndex(Begin);
+ startIdx = getInstructionIndex(*Begin);
SlotIndex endIdx;
if (End == MBB->end())
endIdx = getMBBEndIdx(MBB);
else
- endIdx = getInstructionIndex(End);
+ endIdx = getInstructionIndex(*End);
// FIXME: Conceptually, this code is implementing an iterator on MBB that
// optionally includes an additional position prior to MBB->begin(), indicated
@@ -182,7 +177,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
"Decremented past the beginning of region to repair.");
MachineInstr *SlotMI = ListI->getInstr();
- MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? MBBI : nullptr;
+ MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
if (SlotMI == MI && !MBBIAtBegin) {
@@ -199,7 +194,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
} else {
--ListI;
if (SlotMI)
- removeMachineInstrFromMaps(SlotMI);
+ removeMachineInstrFromMaps(*SlotMI);
}
}
@@ -207,14 +202,14 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
// to update the IndexList while we are iterating it.
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
- MachineInstr *MI = I;
- if (!MI->isDebugValue() && mi2iMap.find(MI) == mi2iMap.end())
+ MachineInstr &MI = *I;
+ if (!MI.isDebugValue() && mi2iMap.find(&MI) == mi2iMap.end())
insertMachineInstrInMaps(MI);
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SlotIndexes::dump() const {
+LLVM_DUMP_METHOD void SlotIndexes::dump() const {
for (IndexList::const_iterator itr = indexList.begin();
itr != indexList.end(); ++itr) {
dbgs() << itr->getIndex() << " ";
@@ -242,7 +237,7 @@ void SlotIndex::print(raw_ostream &os) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Dump a SlotIndex to stderr.
-void SlotIndex::dump() const {
+LLVM_DUMP_METHOD void SlotIndex::dump() const {
print(dbgs());
dbgs() << "\n";
}
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index d30cfc27bf4b..f10c98ef4e50 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -173,6 +173,17 @@ struct SpillPlacement::Node {
Value = 0;
return Before != preferReg();
}
+
+ void getDissentingNeighbors(SparseSet<unsigned> &List,
+ const Node nodes[]) const {
+ for (const auto &Elt : Links) {
+ unsigned n = Elt.second;
+ // Neighbors that already have the same value are not going to
+ // change because of this node changing.
+ if (Value != nodes[n].Value)
+ List.insert(n);
+ }
+ }
};
bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
@@ -182,6 +193,8 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
assert(!nodes && "Leaking node array");
nodes = new Node[bundles->getNumBundles()];
+ TodoList.clear();
+ TodoList.setUniverse(bundles->getNumBundles());
// Compute total ingoing and outgoing block frequencies for all bundles.
BlockFrequencies.resize(mf.getNumBlockIDs());
@@ -199,10 +212,12 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
void SpillPlacement::releaseMemory() {
delete[] nodes;
nodes = nullptr;
+ TodoList.clear();
}
/// activate - mark node n as active if it wasn't already.
void SpillPlacement::activate(unsigned n) {
+ TodoList.insert(n);
if (ActiveNodes->test(n))
return;
ActiveNodes->set(n);
@@ -287,10 +302,6 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
continue;
activate(ib);
activate(ob);
- if (nodes[ib].Links.empty() && !nodes[ib].mustSpill())
- Linked.push_back(ib);
- if (nodes[ob].Links.empty() && !nodes[ob].mustSpill())
- Linked.push_back(ob);
BlockFrequency Freq = BlockFrequencies[Number];
nodes[ib].addLink(ob, Freq);
nodes[ob].addLink(ib, Freq);
@@ -298,76 +309,50 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
}
bool SpillPlacement::scanActiveBundles() {
- Linked.clear();
RecentPositive.clear();
for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
- nodes[n].update(nodes, Threshold);
+ update(n);
// A node that must spill, or a node without any links is not going to
// change its value ever again, so exclude it from iterations.
if (nodes[n].mustSpill())
continue;
- if (!nodes[n].Links.empty())
- Linked.push_back(n);
if (nodes[n].preferReg())
RecentPositive.push_back(n);
}
return !RecentPositive.empty();
}
+bool SpillPlacement::update(unsigned n) {
+ if (!nodes[n].update(nodes, Threshold))
+ return false;
+ nodes[n].getDissentingNeighbors(TodoList, nodes);
+ return true;
+}
+
/// iterate - Repeatedly update the Hopfield nodes until stability or the
/// maximum number of iterations is reached.
-/// @param Linked - Numbers of linked nodes that need updating.
void SpillPlacement::iterate() {
- // First update the recently positive nodes. They have likely received new
- // negative bias that will turn them off.
- while (!RecentPositive.empty())
- nodes[RecentPositive.pop_back_val()].update(nodes, Threshold);
-
- if (Linked.empty())
- return;
+ // We do not need to push these nodes into the todolist.
+ // They have already been processed as part of the previous iteration.
+ RecentPositive.clear();
- // Run up to 10 iterations. The edge bundle numbering is closely related to
- // basic block numbering, so there is a strong tendency towards chains of
- // linked nodes with sequential numbers. By scanning the linked nodes
- // backwards and forwards, we make it very likely that a single node can
- // affect the entire network in a single iteration. That means very fast
- // convergence, usually in a single iteration.
- for (unsigned iteration = 0; iteration != 10; ++iteration) {
- // Scan backwards, skipping the last node when iteration is not zero. When
- // iteration is not zero, the last node was just updated.
- bool Changed = false;
- for (SmallVectorImpl<unsigned>::const_reverse_iterator I =
- iteration == 0 ? Linked.rbegin() : std::next(Linked.rbegin()),
- E = Linked.rend(); I != E; ++I) {
- unsigned n = *I;
- if (nodes[n].update(nodes, Threshold)) {
- Changed = true;
- if (nodes[n].preferReg())
- RecentPositive.push_back(n);
- }
- }
- if (!Changed || !RecentPositive.empty())
- return;
-
- // Scan forwards, skipping the first node which was just updated.
- Changed = false;
- for (SmallVectorImpl<unsigned>::const_iterator I =
- std::next(Linked.begin()), E = Linked.end(); I != E; ++I) {
- unsigned n = *I;
- if (nodes[n].update(nodes, Threshold)) {
- Changed = true;
- if (nodes[n].preferReg())
- RecentPositive.push_back(n);
- }
- }
- if (!Changed || !RecentPositive.empty())
- return;
+ // Since the last iteration, the todolist has been augmented by calls
+ // to addConstraints, addLinks, and related methods.
+ // Update the network energy starting at this new frontier.
+ // The call to ::update will add the nodes that changed into the todolist.
+ unsigned Limit = bundles->getNumBundles() * 10;
+ while (Limit-- > 0 && !TodoList.empty()) {
+ unsigned n = TodoList.pop_back_val();
+ if (!update(n))
+ continue;
+ if (nodes[n].preferReg())
+ RecentPositive.push_back(n);
}
}
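The rewritten iterate() is a standard worklist fixpoint: pop a node, recompute it, and requeue only the neighbors whose value may now disagree. A generic sketch of the same pattern, where Graph and its members are placeholders rather than LLVM types:

    #include <vector>

    template <typename Graph>
    void runToFixpoint(Graph &G, std::vector<unsigned> &Todo, unsigned Limit) {
      while (Limit-- > 0 && !Todo.empty()) {
        unsigned N = Todo.back();
        Todo.pop_back();
        if (!G.update(N)) // value unchanged, nothing to propagate
          continue;
        for (unsigned M : G.neighbors(N))
          if (G.value(M) != G.value(N)) // only dissenting neighbors need a revisit
            Todo.push_back(M);
      }
    }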
void SpillPlacement::prepare(BitVector &RegBundles) {
- Linked.clear();
RecentPositive.clear();
+ TodoList.clear();
// Reuse RegBundles as our ActiveNodes vector.
ActiveNodes = &RegBundles;
ActiveNodes->clear();
diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index 03dd58d6e9a9..9b9ecccf9049 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h
@@ -29,6 +29,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/BlockFrequency.h"
@@ -66,6 +67,9 @@ class SpillPlacement : public MachineFunctionPass {
/// its inputs falls in the open interval (-Threshold;Threshold).
BlockFrequency Threshold;
+ /// List of nodes that need to be updated in ::iterate.
+ SparseSet<unsigned> TodoList;
+
public:
static char ID; // Pass identification, replacement for typeid.
@@ -157,6 +161,8 @@ private:
void activate(unsigned);
void setThreshold(const BlockFrequency &Entry);
+
+ bool update(unsigned);
};
} // end namespace llvm
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index 08f99ec78adc..61ee508c8394 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@@ -16,6 +16,7 @@ namespace llvm {
class MachineFunction;
class MachineFunctionPass;
class VirtRegMap;
+ class LiveIntervals;
/// Spiller interface.
///
@@ -28,7 +29,7 @@ namespace llvm {
/// spill - Spill the LRE.getParent() live interval.
virtual void spill(LiveRangeEdit &LRE) = 0;
-
+ virtual void postOptimization(){};
};
/// Create and return a spiller that will insert spill code directly instead
@@ -36,7 +37,6 @@ namespace llvm {
Spiller *createInlineSpiller(MachineFunctionPass &pass,
MachineFunction &mf,
VirtRegMap &vrm);
-
}
#endif
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 51dddabed2d9..07be24b18dd5 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -37,82 +38,101 @@ STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
STATISTIC(NumRepairs, "Number of invalid live ranges repaired");
//===----------------------------------------------------------------------===//
-// Split Analysis
+// Last Insert Point Analysis
//===----------------------------------------------------------------------===//
-SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
- const MachineLoopInfo &mli)
- : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
- TII(*MF.getSubtarget().getInstrInfo()), CurLI(nullptr),
- LastSplitPoint(MF.getNumBlockIDs()) {}
+InsertPointAnalysis::InsertPointAnalysis(const LiveIntervals &lis,
+ unsigned BBNum)
+ : LIS(lis), LastInsertPoint(BBNum) {}
-void SplitAnalysis::clear() {
- UseSlots.clear();
- UseBlocks.clear();
- ThroughBlocks.clear();
- CurLI = nullptr;
- DidRepairRange = false;
-}
+SlotIndex
+InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
+ SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);
-SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) {
- const MachineBasicBlock *MBB = MF.getBlockNumbered(Num);
- // FIXME: Handle multiple EH pad successors.
- const MachineBasicBlock *LPad = MBB->getLandingPadSuccessor();
- std::pair<SlotIndex, SlotIndex> &LSP = LastSplitPoint[Num];
- SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB);
+ SmallVector<const MachineBasicBlock *, 1> EHPadSuccessors;
+ for (const MachineBasicBlock *SMBB : MBB.successors())
+ if (SMBB->isEHPad())
+ EHPadSuccessors.push_back(SMBB);
- // Compute split points on the first call. The pair is independent of the
+ // Compute insert points on the first call. The pair is independent of the
// current live interval.
- if (!LSP.first.isValid()) {
- MachineBasicBlock::const_iterator FirstTerm = MBB->getFirstTerminator();
- if (FirstTerm == MBB->end())
- LSP.first = MBBEnd;
+ if (!LIP.first.isValid()) {
+ MachineBasicBlock::const_iterator FirstTerm = MBB.getFirstTerminator();
+ if (FirstTerm == MBB.end())
+ LIP.first = MBBEnd;
else
- LSP.first = LIS.getInstructionIndex(FirstTerm);
+ LIP.first = LIS.getInstructionIndex(*FirstTerm);
// If there is a landing pad successor, also find the call instruction.
- if (!LPad)
- return LSP.first;
+ if (EHPadSuccessors.empty())
+ return LIP.first;
// There may not be a call instruction (?) in which case we ignore LPad.
- LSP.second = LSP.first;
- for (MachineBasicBlock::const_iterator I = MBB->end(), E = MBB->begin();
+ LIP.second = LIP.first;
+ for (MachineBasicBlock::const_iterator I = MBB.end(), E = MBB.begin();
I != E;) {
--I;
if (I->isCall()) {
- LSP.second = LIS.getInstructionIndex(I);
+ LIP.second = LIS.getInstructionIndex(*I);
break;
}
}
}
- // If CurLI is live into a landing pad successor, move the last split point
+ // If CurLI is live into a landing pad successor, move the last insert point
// back to the call that may throw.
- if (!LPad || !LSP.second || !LIS.isLiveInToMBB(*CurLI, LPad))
- return LSP.first;
+ if (!LIP.second)
+ return LIP.first;
+
+ if (none_of(EHPadSuccessors, [&](const MachineBasicBlock *EHPad) {
+ return LIS.isLiveInToMBB(CurLI, EHPad);
+ }))
+ return LIP.first;
// Find the value leaving MBB.
- const VNInfo *VNI = CurLI->getVNInfoBefore(MBBEnd);
+ const VNInfo *VNI = CurLI.getVNInfoBefore(MBBEnd);
if (!VNI)
- return LSP.first;
+ return LIP.first;
// If the value leaving MBB was defined after the call in MBB, it can't
// really be live-in to the landing pad. This can happen if the landing pad
// has a PHI, and this register is undef on the exceptional edge.
// <rdar://problem/10664933>
- if (!SlotIndex::isEarlierInstr(VNI->def, LSP.second) && VNI->def < MBBEnd)
- return LSP.first;
+ if (!SlotIndex::isEarlierInstr(VNI->def, LIP.second) && VNI->def < MBBEnd)
+ return LIP.first;
// Value is properly live-in to the landing pad.
- // Only allow splits before the call.
- return LSP.second;
+ // Only allow inserts before the call.
+ return LIP.second;
}
MachineBasicBlock::iterator
-SplitAnalysis::getLastSplitPointIter(MachineBasicBlock *MBB) {
- SlotIndex LSP = getLastSplitPoint(MBB->getNumber());
- if (LSP == LIS.getMBBEndIdx(MBB))
- return MBB->end();
- return LIS.getInstructionFromIndex(LSP);
+InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB) {
+ SlotIndex LIP = getLastInsertPoint(CurLI, MBB);
+ if (LIP == LIS.getMBBEndIdx(&MBB))
+ return MBB.end();
+ return LIS.getInstructionFromIndex(LIP);
+}
+
+//===----------------------------------------------------------------------===//
+// Split Analysis
+//===----------------------------------------------------------------------===//
+
+SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+ const MachineLoopInfo &mli)
+ : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
+ TII(*MF.getSubtarget().getInstrInfo()), CurLI(nullptr),
+ IPA(lis, MF.getNumBlockIDs()) {}
+
+void SplitAnalysis::clear() {
+ UseSlots.clear();
+ UseBlocks.clear();
+ ThroughBlocks.clear();
+ CurLI = nullptr;
+ DidRepairRange = false;
}
/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
@@ -129,7 +149,7 @@ void SplitAnalysis::analyzeUses() {
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg))
if (!MO.isUndef())
- UseSlots.push_back(LIS.getInstructionIndex(MO.getParent()).getRegSlot());
+ UseSlots.push_back(LIS.getInstructionIndex(*MO.getParent()).getRegSlot());
array_pod_sort(UseSlots.begin(), UseSlots.end());
@@ -318,11 +338,13 @@ void SplitAnalysis::analyze(const LiveInterval *li) {
//===----------------------------------------------------------------------===//
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
-SplitEditor::SplitEditor(SplitAnalysis &sa, LiveIntervals &lis, VirtRegMap &vrm,
+SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa,
+ LiveIntervals &lis, VirtRegMap &vrm,
MachineDominatorTree &mdt,
MachineBlockFrequencyInfo &mbfi)
- : SA(sa), LIS(lis), VRM(vrm), MRI(vrm.getMachineFunction().getRegInfo()),
- MDT(mdt), TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
+ : SA(sa), AA(aa), LIS(lis), VRM(vrm),
+ MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt),
+ TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()),
MBFI(mbfi), Edit(nullptr), OpenIdx(0), SpillMode(SM_Partition),
RegAssign(Allocator) {}
@@ -347,7 +369,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SplitEditor::dump() const {
+LLVM_DUMP_METHOD void SplitEditor::dump() const {
if (RegAssign.empty()) {
dbgs() << " empty\n";
return;
@@ -430,16 +452,22 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
bool Late = RegIdx != 0;
// Attempt cheap-as-a-copy rematerialization.
+ unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
LiveRangeEdit::Remat RM(ParentVNI);
- if (Edit->canRematerializeAt(RM, UseIdx, true)) {
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+ if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
++NumRemats;
} else {
// Can't remat, just insert a copy from parent.
CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
.addReg(Edit->getReg());
- Def = LIS.getSlotIndexes()->insertMachineInstrInMaps(CopyMI, Late)
- .getRegSlot();
+ Def = LIS.getSlotIndexes()
+ ->insertMachineInstrInMaps(*CopyMI, Late)
+ .getRegSlot();
++NumCopies;
}
@@ -638,7 +666,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
LIS.removeVRegDefAt(*LI, Def);
- LIS.RemoveMachineInstrFromMaps(MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
// Adjust RegAssign if a register assignment is killed at Def. We want to
@@ -654,7 +682,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
} else {
- SlotIndex Kill = LIS.getInstructionIndex(MBBI).getRegSlot();
+ SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
AssignI.setStop(Kill);
}
@@ -715,7 +743,62 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
}
}
-void SplitEditor::hoistCopiesForSize() {
+void SplitEditor::computeRedundantBackCopies(
+ DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ LiveInterval *Parent = &Edit->getParent();
+ SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
+ SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+ // Aggregate VNIs having the same value as ParentVNI.
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ EqualVNs[ParentVNI->id].insert(VNI);
+ }
+
+ // For VNI aggregation of each ParentVNI, collect dominated, i.e.,
+ // redundant VNIs to BackCopies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ if (!NotToHoistSet.count(ParentVNI->id))
+ continue;
+ SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
+ SmallPtrSetIterator<VNInfo *> It2 = It1;
+ for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
+ It2 = It1;
+ for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
+ if (DominatedVNIs.count(*It1) || DominatedVNIs.count(*It2))
+ continue;
+
+ MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
+ MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
+ if (MBB1 == MBB2) {
+ DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
+ } else if (MDT.dominates(MBB1, MBB2)) {
+ DominatedVNIs.insert(*It2);
+ } else if (MDT.dominates(MBB2, MBB1)) {
+ DominatedVNIs.insert(*It1);
+ }
+ }
+ }
+ if (!DominatedVNIs.empty()) {
+ forceRecompute(0, ParentVNI);
+ for (auto VNI : DominatedVNIs) {
+ BackCopies.push_back(VNI);
+ }
+ DominatedVNIs.clear();
+ }
+ }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies for
+/// the same ParentVNI and hoist the backcopies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not beneficial
+/// to do the hoisting, simply remove the dominated backcopies for the same
+/// ParentVNI.
+void SplitEditor::hoistCopies() {
// Get the complement interval, always RegIdx 0.
LiveInterval *LI = &LIS.getInterval(Edit->get(0));
LiveInterval *Parent = &Edit->getParent();
@@ -724,6 +807,11 @@ void SplitEditor::hoistCopiesForSize() {
// indexed by ParentVNI->id.
typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+ // The total cost of all the back-copies for each ParentVNI.
+ SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+ // The ParentVNI->id set for which hoisting back-copies are not beneficial
+ // for Speed.
+ DenseSet<unsigned> NotToHoistSet;
// Find the nearest common dominator for parent values with multiple
// back-copies. If a single back-copy dominates, put it in DomPair.second.
@@ -739,6 +827,7 @@ void SplitEditor::hoistCopiesForSize() {
continue;
MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+
DomPair &Dom = NearestDom[ParentVNI->id];
// Keep directly defined parent values. This is either a PHI or an
@@ -773,6 +862,7 @@ void SplitEditor::hoistCopiesForSize() {
else if (Near != Dom.first)
// None dominate. Hoist to common dominator, need new def.
Dom = DomPair(Near, SlotIndex());
+ Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
}
DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
@@ -791,6 +881,11 @@ void SplitEditor::hoistCopiesForSize() {
MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
// Get a less loopy dominator than Dom.first.
Dom.first = findShallowDominator(Dom.first, DefMBB);
+ if (SpillMode == SM_Speed &&
+ MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
+ NotToHoistSet.insert(ParentVNI->id);
+ continue;
+ }
SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
Dom.second =
defFromParent(0, ParentVNI, Last, *Dom.first,
@@ -805,11 +900,18 @@ void SplitEditor::hoistCopiesForSize() {
continue;
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
const DomPair &Dom = NearestDom[ParentVNI->id];
- if (!Dom.first || Dom.second == VNI->def)
+ if (!Dom.first || Dom.second == VNI->def ||
+ NotToHoistSet.count(ParentVNI->id))
continue;
BackCopies.push_back(VNI);
forceRecompute(0, ParentVNI);
}
+
+ // If it is not beneficial to hoist all the BackCopies, simply remove
+ // redundant BackCopies in speed mode.
+ if (SpillMode == SM_Speed && !NotToHoistSet.empty())
+ computeRedundantBackCopies(NotToHoistSet, BackCopies);
+
removeBackCopies(BackCopies);
}
@@ -924,12 +1026,22 @@ bool SplitEditor::transferValues() {
}
void SplitEditor::extendPHIKillRanges() {
- // Extend live ranges to be live-out for successor PHI values.
+ // Extend live ranges to be live-out for successor PHI values.
for (const VNInfo *PHIVNI : Edit->getParent().valnos) {
if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
continue;
unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
+
+ // Check whether PHI is dead.
+ const LiveRange::Segment *Segment = LR.getSegmentContaining(PHIVNI->def);
+ assert(Segment != nullptr && "Missing segment for VNI");
+ if (Segment->end == PHIVNI->def.getDeadSlot()) {
+ // This is a dead PHI. Remove it.
+ LR.removeSegment(*Segment, true);
+ continue;
+ }
+
LiveRangeCalc &LRC = getLRCalc(RegIdx);
MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
@@ -964,7 +1076,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
// <undef> operands don't really read the register, so it doesn't matter
// which register we choose. When the use operand is tied to a def, we must
// use the same register as the def, so just do that always.
- SlotIndex Idx = LIS.getInstructionIndex(MI);
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
if (MO.isDef() || MO.isUndef())
Idx = Idx.getRegSlot(MO.isEarlyClobber());
@@ -1003,6 +1115,8 @@ void SplitEditor::deleteRematVictims() {
// Dead defs end at the dead slot.
if (S.end != S.valno->def.getDeadSlot())
continue;
+ if (S.valno->isPHIDef())
+ continue;
MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
assert(MI && "Missing instruction for dead def");
MI->addRegisterDead(LI->reg, &TRI);
@@ -1018,7 +1132,7 @@ void SplitEditor::deleteRematVictims() {
if (Dead.empty())
return;
- Edit->eliminateDeadDefs(Dead);
+ Edit->eliminateDeadDefs(Dead, None, &AA);
}
void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
@@ -1047,22 +1161,22 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
// Leave all back-copies as is.
break;
case SM_Size:
- hoistCopiesForSize();
- break;
case SM_Speed:
- llvm_unreachable("Spill mode 'speed' not implemented yet");
+ // hoistCopies will behave differently between size and speed.
+ hoistCopies();
}
// Transfer the simply mapped values, check if any are skipped.
bool Skipped = transferValues();
+
+ // Rewrite virtual registers, possibly extending ranges.
+ rewriteAssigned(Skipped);
+
if (Skipped)
extendPHIKillRanges();
else
++NumSimple;
- // Rewrite virtual registers, possibly extending ranges.
- rewriteAssigned(Skipped);
-
// Delete defs that were rematted everywhere.
if (Skipped)
deleteRematVictims();
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 69c65ff3f61d..a9684942885e 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -18,6 +18,7 @@
#include "LiveRangeCalc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -37,6 +38,40 @@ class VirtRegMap;
class VNInfo;
class raw_ostream;
+/// Determines the latest safe point in a block in which we can insert a split,
+/// spill or other instruction related with CurLI.
+class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
+private:
+ const LiveIntervals &LIS;
+
+ /// Last legal insert point in each basic block in the current function.
+ /// The first entry is the first terminator, the second entry is the
+ /// last valid point to insert a split or spill for a variable that is
+ /// live into a landing pad successor.
+ SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastInsertPoint;
+
+ SlotIndex computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB);
+
+public:
+ InsertPointAnalysis(const LiveIntervals &lis, unsigned BBNum);
+
+ /// Return the base index of the last valid insert point for \p CurLI in \p MBB.
+ SlotIndex getLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ // Inline the common simple case.
+ if (LastInsertPoint[Num].first.isValid() &&
+ !LastInsertPoint[Num].second.isValid())
+ return LastInsertPoint[Num].first;
+ return computeLastInsertPoint(CurLI, MBB);
+ }
+
+ /// Returns the last insert point as an iterator for \p CurLI in \p MBB.
+ MachineBasicBlock::iterator getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB);
+};
+
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
/// opportunities.
class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
@@ -83,15 +118,12 @@ private:
// Current live interval.
const LiveInterval *CurLI;
+ /// Insert Point Analysis.
+ InsertPointAnalysis IPA;
+
// Sorted slot indexes of using instructions.
SmallVector<SlotIndex, 8> UseSlots;
- /// LastSplitPoint - Last legal split point in each basic block in the current
- /// function. The first entry is the first terminator, the second entry is the
- /// last valid split point for a variable that is live in to a landing pad
- /// successor.
- SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastSplitPoint;
-
/// UseBlocks - Blocks where CurLI has uses.
SmallVector<BlockInfo, 8> UseBlocks;
@@ -108,8 +140,6 @@ private:
/// DidRepairRange - analyze was forced to shrinkToUses().
bool DidRepairRange;
- SlotIndex computeLastSplitPoint(unsigned Num);
-
// Sumarize statistics by counting instructions using CurLI.
void analyzeUses();
@@ -136,19 +166,6 @@ public:
/// getParent - Return the last analyzed interval.
const LiveInterval &getParent() const { return *CurLI; }
- /// getLastSplitPoint - Return the base index of the last valid split point
- /// in the basic block numbered Num.
- SlotIndex getLastSplitPoint(unsigned Num) {
- // Inline the common simple case.
- if (LastSplitPoint[Num].first.isValid() &&
- !LastSplitPoint[Num].second.isValid())
- return LastSplitPoint[Num].first;
- return computeLastSplitPoint(Num);
- }
-
- /// getLastSplitPointIter - Returns the last split point as an iterator.
- MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock*);
-
/// isOriginalEndpoint - Return true if the original live range was killed or
/// (re-)defined at Idx. Idx should be the 'def' slot for a normal kill/def,
/// and 'use' for an early-clobber def.
@@ -194,6 +211,14 @@ public:
/// @param BI The block to be isolated.
/// @param SingleInstrs True when single instructions should be isolated.
bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;
+
+ SlotIndex getLastSplitPoint(unsigned Num) {
+ return IPA.getLastInsertPoint(*CurLI, *MF.getBlockNumbered(Num));
+ }
+
+ MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock *BB) {
+ return IPA.getLastInsertPointIter(*CurLI, *BB);
+ }
};
@@ -210,6 +235,7 @@ public:
///
class LLVM_LIBRARY_VISIBILITY SplitEditor {
SplitAnalysis &SA;
+ AliasAnalysis &AA;
LiveIntervals &LIS;
VirtRegMap &VRM;
MachineRegisterInfo &MRI;
@@ -329,9 +355,14 @@ private:
MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
MachineBasicBlock *DefMBB);
- /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
- /// way that minimizes code size. This implements the SM_Size spill mode.
- void hoistCopiesForSize();
+ /// Find out all the backCopies dominated by others.
+ void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
+ SmallVectorImpl<VNInfo *> &BackCopies);
+
+ /// Hoist back-copies to the complement interval. It tries to hoist all
+ /// the back-copies to one BB if it is beneficial, or else simply remove
+ /// redundant backcopies dominated by others.
+ void hoistCopies();
/// transferValues - Transfer values to the new ranges.
/// Return true if any ranges were skipped.
@@ -350,8 +381,9 @@ private:
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
/// Newly created intervals will be appended to newIntervals.
- SplitEditor(SplitAnalysis &SA, LiveIntervals&, VirtRegMap&,
- MachineDominatorTree&, MachineBlockFrequencyInfo &);
+ SplitEditor(SplitAnalysis &SA, AliasAnalysis &AA, LiveIntervals&,
+ VirtRegMap&, MachineDominatorTree&,
+ MachineBlockFrequencyInfo &);
/// reset - Prepare for a new split.
void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index 7b5203815172..87cd470d5690 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -21,33 +21,30 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -67,18 +64,180 @@ DisableColoring("no-stack-coloring",
/// The user may write code that uses allocas outside of the declared lifetime
/// zone. This can happen when the user returns a reference to a local
/// data-structure. We can detect these cases and decide not to optimize the
-/// code. If this flag is enabled, we try to save the user.
+/// code. If this flag is enabled, we try to save the user. This option
+/// is treated as overriding LifetimeStartOnFirstUse below.
static cl::opt<bool>
ProtectFromEscapedAllocas("protect-from-escaped-allocas",
cl::init(false), cl::Hidden,
cl::desc("Do not optimize lifetime zones that "
"are broken"));
+/// Enable enhanced dataflow scheme for lifetime analysis (treat first
+/// use of stack slot as start of slot lifetime, as opposed to looking
+/// for LIFETIME_START marker). See "Implementation notes" below for
+/// more info.
+static cl::opt<bool>
+LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
+ cl::init(true), cl::Hidden,
+ cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));
+
+
STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
STATISTIC(StackSlotMerged, "Number of stack slot merged.");
STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
+//
+// Implementation Notes:
+// ---------------------
+//
+// Consider the following motivating example:
+//
+// int foo() {
+// char b1[1024], b2[1024];
+// if (...) {
+// char b3[1024];
+// <uses of b1, b3>;
+// return x;
+// } else {
+// char b4[1024], b5[1024];
+// <uses of b2, b4, b5>;
+// return y;
+// }
+// }
+//
+// In the code above, "b3" and "b4" are declared in distinct lexical
+// scopes, meaning that it is easy to prove that they can share the
+// same stack slot. Variables "b1" and "b2" are declared in the same
+// scope, meaning that from a lexical point of view, their lifetimes
+// overlap. From a control flow point of view, however, the two
+// variables are accessed in disjoint regions of the CFG, thus it
+// should be possible for them to share the same stack slot. An ideal
+// stack allocation for the function above would look like:
+//
+// slot 0: b1, b2
+// slot 1: b3, b4
+// slot 2: b5
+//
+// Achieving this allocation is tricky, however, due to the way
+// lifetime markers are inserted. Here is a simplified view of the
+// control flow graph for the code above:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------ block 2 -------+
+// 2| LIFETIME_START b3 | 5| LIFETIME_START b4, b5 |
+// 3| <uses of b1, b3> | 6| <uses of b2, b4, b5> |
+// 4| LIFETIME_END b3 | 7| LIFETIME_END b4, b5 |
+// +-----------------------+ +-----------------------+
+// \. /.
+// +------ block 3 -------+
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +-----------------------+
+//
+// If we create live intervals for the variables above strictly based
+// on the lifetime markers, we'll get the set of intervals on the
+// left. If we ignore the lifetime start markers and instead treat a
+// variable's lifetime as beginning with the first reference to the
+// var, then we get the intervals on the right.
+//
+// LIFETIME_START First Use
+// b1: [0,9] [3,4] [8,9]
+// b2: [0,9] [6,9]
+// b3: [2,4] [3,4]
+// b4: [5,7] [6,7]
+// b5: [5,7] [6,7]
+//
+// For the intervals on the left, the best we can do is overlap two
+// variables (b3 and b4, for example); this gives us a stack size of
+// 4*1024 bytes, not ideal. When treating first-use as the start of a
+// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
+// byte stack (better).
+//
+// Relying entirely on first-use of stack slots is problematic,
+// however, due to the fact that optimizations can sometimes migrate
+// uses of a variable outside of its lifetime start/end region. Here
+// is an example:
+//
+// int bar() {
+// char b1[1024], b2[1024];
+// if (...) {
+// <uses of b2>
+// return y;
+// } else {
+// <uses of b1>
+// while (...) {
+// char b3[1024];
+// <uses of b3>
+// }
+// }
+// }
+//
+// Before optimization, the control flow graph for the code above
+// might look like the following:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------- block 2 -------+
+// 2| <uses of b2> | 3| <uses of b1> |
+// +-----------------------+ +-----------------------+
+// | |
+// | +------- block 3 -------+ <-\.
+// | 4| <while condition> | |
+// | +-----------------------+ |
+// | / | |
+// | / +------- block 4 -------+
+// \ / 5| LIFETIME_START b3 | |
+// \ / 6| <uses of b3> | |
+// \ / 7| LIFETIME_END b3 | |
+// \ | +------------------------+ |
+// \ | \ /
+// +------ block 5 -----+ \---------------
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +---------------------+
+//
+// During optimization, however, it can happen that an instruction
+// computing an address in "b3" (for example, a loop-invariant GEP) is
+// hoisted up out of the loop from block 4 to block 2. [Note that
+// this is not an actual load from the stack, only an instruction that
+// computes the address to be loaded]. If this happens, there is now a
+// path leading from the first use of b3 to the return instruction
+// that does not encounter the b3 LIFETIME_END, hence b3's lifetime is
+// now larger than if we were computing live intervals strictly based
+// on lifetime markers. In the example above, this lengthened lifetime
+// would mean that it would appear illegal to overlap b3 with b2.
+//
+// To deal with such cases, the code in ::collectMarkers() below
+// tries to identify "degenerate" slots -- those slots where on a single
+// forward pass through the CFG we encounter a first reference to slot
+// K before we hit the slot K lifetime start marker. For such slots,
+// we fall back on using the lifetime start marker as the beginning of
+// the variable's lifetime. NB: with this implementation, slots can
+// appear degenerate in cases where there is unstructured control flow:
+//
+// if (q) goto mid;
+// if (x > 9) {
+// int b[100];
+// memcpy(&b[0], ...);
+// mid: b[k] = ...;
+// abc(&b);
+// }
+//
+// If the RPO ordering chosen to walk the CFG happens to visit the b[k]
+// assignment before visiting the memcpy block (which contains the lifetime
+// start for "b"), then 'b' will appear to have a degenerate lifetime.
+//
+
//===----------------------------------------------------------------------===//
// StackColoring Pass
//===----------------------------------------------------------------------===//
@@ -126,6 +285,17 @@ class StackColoring : public MachineFunctionPass {
/// once the coloring is done.
SmallVector<MachineInstr*, 8> Markers;
+ /// Record the FI slots for which we have seen some sort of
+ /// lifetime marker (either start or end).
+ BitVector InterestingSlots;
+
+ /// FI slots that need to be handled conservatively (for these
+ /// slots lifetime-start-on-first-use is disabled).
+ BitVector ConservativeSlots;
+
+ /// Number of iterations taken during data flow analysis.
+ unsigned NumIterations;
+
public:
static char ID;
StackColoring() : MachineFunctionPass(ID) {
@@ -137,6 +307,9 @@ public:
private:
/// Debug.
void dump() const;
+ void dumpIntervals() const;
+ void dumpBB(MachineBasicBlock *MBB) const;
+ void dumpBV(const char *tag, const BitVector &BV) const;
/// Removes all of the lifetime marker instructions from the function.
/// \returns true if any markers were removed.
@@ -153,6 +326,25 @@ private:
/// in and out blocks.
void calculateLocalLiveness();
+ /// Returns TRUE if we're using the first-use-begins-lifetime method for
+ /// this slot (if FALSE, then the start marker is treated as start of lifetime).
+ bool applyFirstUse(int Slot) {
+ if (!LifetimeStartOnFirstUse || ProtectFromEscapedAllocas)
+ return false;
+ if (ConservativeSlots.test(Slot))
+ return false;
+ return true;
+ }
+
+ /// Examines the specified instruction and returns TRUE if the instruction
+ /// represents the start or end of an interesting lifetime. The slot or slots
+ /// starting or ending are added to the vector "slots" and "isStart" is set
+ /// accordingly.
+ /// \returns True if inst contains a lifetime start or end
+ bool isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart);
+
/// Construct the LiveIntervals for the slots.
void calculateLiveIntervals(unsigned NumSlots);
@@ -170,7 +362,10 @@ private:
/// Map entries which point to other entries to their destination.
/// A->B->C becomes A->C.
- void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+ void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+
+ /// Used in collectMarkers
+ typedef DenseMap<const MachineBasicBlock*, BitVector> BlockBitVecMap;
};
} // end anonymous namespace
@@ -179,55 +374,202 @@ char &llvm::StackColoringID = StackColoring::ID;
INITIALIZE_PASS_BEGIN(StackColoring,
"stack-coloring", "Merge disjoint stack slots", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
INITIALIZE_PASS_END(StackColoring,
"stack-coloring", "Merge disjoint stack slots", false, false)
void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
AU.addRequired<SlotIndexes>();
AU.addRequired<StackProtector>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-void StackColoring::dump() const {
- for (MachineBasicBlock *MBB : depth_first(MF)) {
- DEBUG(dbgs() << "Inspecting block #" << BasicBlocks.lookup(MBB) << " ["
- << MBB->getName() << "]\n");
+#ifndef NDEBUG
- LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
- assert(BI != BlockLiveness.end() && "Block not found");
- const BlockLifetimeInfo &BlockInfo = BI->second;
+LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
+ const BitVector &BV) const {
+ DEBUG(dbgs() << tag << " : { ");
+ for (unsigned I = 0, E = BV.size(); I != E; ++I)
+ DEBUG(dbgs() << BV.test(I) << " ");
+ DEBUG(dbgs() << "}\n");
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
+ LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
+ assert(BI != BlockLiveness.end() && "Block not found");
+ const BlockLifetimeInfo &BlockInfo = BI->second;
- DEBUG(dbgs()<<"BEGIN : {");
- for (unsigned i=0; i < BlockInfo.Begin.size(); ++i)
- DEBUG(dbgs()<<BlockInfo.Begin.test(i)<<" ");
- DEBUG(dbgs()<<"}\n");
+ dumpBV("BEGIN", BlockInfo.Begin);
+ dumpBV("END", BlockInfo.End);
+ dumpBV("LIVE_IN", BlockInfo.LiveIn);
+ dumpBV("LIVE_OUT", BlockInfo.LiveOut);
+}
- DEBUG(dbgs()<<"END : {");
- for (unsigned i=0; i < BlockInfo.End.size(); ++i)
- DEBUG(dbgs()<<BlockInfo.End.test(i)<<" ");
+LLVM_DUMP_METHOD void StackColoring::dump() const {
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+ DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
+ << MBB->getName() << "]\n");
+ DEBUG(dumpBB(MBB));
+ }
+}
- DEBUG(dbgs()<<"}\n");
+LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
+ for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
+ DEBUG(dbgs() << "Interval[" << I << "]:\n");
+ DEBUG(Intervals[I]->dump());
+ }
+}
- DEBUG(dbgs()<<"LIVE_IN: {");
- for (unsigned i=0; i < BlockInfo.LiveIn.size(); ++i)
- DEBUG(dbgs()<<BlockInfo.LiveIn.test(i)<<" ");
+#endif // not NDEBUG
+
+static inline int getStartOrEndSlot(const MachineInstr &MI)
+{
+ assert((MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) &&
+ "Expected LIFETIME_START or LIFETIME_END op");
+ const MachineOperand &MO = MI.getOperand(0);
+ int Slot = MO.getIndex();
+ if (Slot >= 0)
+ return Slot;
+ return -1;
+}
- DEBUG(dbgs()<<"}\n");
- DEBUG(dbgs()<<"LIVEOUT: {");
- for (unsigned i=0; i < BlockInfo.LiveOut.size(); ++i)
- DEBUG(dbgs()<<BlockInfo.LiveOut.test(i)<<" ");
- DEBUG(dbgs()<<"}\n");
+//
+// At the moment the only way to end a variable lifetime is with
+// a LIFETIME_END op (which can't contain a start). If things
+// change and the IR allows for a single inst that both begins
+// and ends lifetime(s), this interface will need to be reworked.
+//
+bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart)
+{
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ return false;
+ if (!InterestingSlots.test(Slot))
+ return false;
+ slots.push_back(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ isStart = false;
+ return true;
+ }
+ if (! applyFirstUse(Slot)) {
+ isStart = true;
+ return true;
+ }
+ } else if (LifetimeStartOnFirstUse && !ProtectFromEscapedAllocas) {
+ if (! MI.isDebugValue()) {
+ bool found = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot<0)
+ continue;
+ if (InterestingSlots.test(Slot) && applyFirstUse(Slot)) {
+ slots.push_back(Slot);
+ found = true;
+ }
+ }
+ if (found) {
+ isStart = true;
+ return true;
+ }
+ }
}
+ return false;
}
-unsigned StackColoring::collectMarkers(unsigned NumSlot) {
+unsigned StackColoring::collectMarkers(unsigned NumSlot)
+{
unsigned MarkersFound = 0;
- // Scan the function to find all lifetime markers.
+ BlockBitVecMap SeenStartMap;
+ InterestingSlots.clear();
+ InterestingSlots.resize(NumSlot);
+ ConservativeSlots.clear();
+ ConservativeSlots.resize(NumSlot);
+
+ // number of start and end lifetime ops for each slot
+ SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
+ SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);
+
+ // Step 1: collect markers and populate the "InterestingSlots"
+ // and "ConservativeSlots" sets.
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+
+ // Compute the set of slots for which we've seen a START marker but have
+ // not yet seen an END marker at this point in the walk (e.g. on entry
+ // to this bb).
+ BitVector BetweenStartEnd;
+ BetweenStartEnd.resize(NumSlot);
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ BlockBitVecMap::const_iterator I = SeenStartMap.find(*PI);
+ if (I != SeenStartMap.end()) {
+ BetweenStartEnd |= I->second;
+ }
+ }
+
+ // Walk the instructions in the block to look for start/end ops.
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ continue;
+ InterestingSlots.set(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START) {
+ BetweenStartEnd.set(Slot);
+ NumStartLifetimes[Slot] += 1;
+ } else {
+ BetweenStartEnd.reset(Slot);
+ NumEndLifetimes[Slot] += 1;
+ }
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs() << "Found a lifetime ");
+ DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
+ ? "start"
+ : "end"));
+ DEBUG(dbgs() << " marker for slot #" << Slot);
+ DEBUG(dbgs() << " with allocation: " << Allocation->getName()
+ << "\n");
+ }
+ Markers.push_back(&MI);
+ MarkersFound += 1;
+ } else {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot < 0)
+ continue;
+ if (! BetweenStartEnd.test(Slot)) {
+ ConservativeSlots.set(Slot);
+ }
+ }
+ }
+ }
+ BitVector &SeenStart = SeenStartMap[MBB];
+ SeenStart |= BetweenStartEnd;
+ }
+ if (!MarkersFound) {
+ return 0;
+ }
+
+ // PR27903: slots with multiple start or end lifetime ops are not
+ // safe to enable for "lifetime-start-on-first-use".
+ for (unsigned slot = 0; slot < NumSlot; ++slot)
+ if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1)
+ ConservativeSlots.set(slot);
+ DEBUG(dumpBV("Conservative slots", ConservativeSlots));
+
+ // Step 2: compute begin/end sets for each block
+
// NOTE: We use a reverse-post-order iteration to ensure that we obtain a
// deterministic numbering, and because we'll need a post-order iteration
// later for solving the liveness dataflow problem.
@@ -243,35 +585,33 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
BlockInfo.Begin.resize(NumSlot);
BlockInfo.End.resize(NumSlot);
+ SmallVector<int, 4> slots;
for (MachineInstr &MI : *MBB) {
- if (MI.getOpcode() != TargetOpcode::LIFETIME_START &&
- MI.getOpcode() != TargetOpcode::LIFETIME_END)
- continue;
-
- Markers.push_back(&MI);
-
- bool IsStart = MI.getOpcode() == TargetOpcode::LIFETIME_START;
- const MachineOperand &MO = MI.getOperand(0);
- unsigned Slot = MO.getIndex();
-
- MarkersFound++;
-
- const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
- if (Allocation) {
- DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<<
- " with allocation: "<< Allocation->getName()<<"\n");
- }
-
- if (IsStart) {
- BlockInfo.Begin.set(Slot);
- } else {
- if (BlockInfo.Begin.test(Slot)) {
- // Allocas that start and end within a single block are handled
- // specially when computing the LiveIntervals to avoid pessimizing
- // the liveness propagation.
- BlockInfo.Begin.reset(Slot);
- } else {
+ bool isStart = false;
+ slots.clear();
+ if (isLifetimeStartOrEnd(MI, slots, isStart)) {
+ if (!isStart) {
+ assert(slots.size() == 1 && "unexpected: MI ends multiple slots");
+ int Slot = slots[0];
+ if (BlockInfo.Begin.test(Slot)) {
+ BlockInfo.Begin.reset(Slot);
+ }
BlockInfo.End.set(Slot);
+ } else {
+ for (auto Slot : slots) {
+ DEBUG(dbgs() << "Found a use of slot #" << Slot);
+ DEBUG(dbgs() << " at BB#" << MBB->getNumber() << " index ");
+ DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs() << " with allocation: "<< Allocation->getName());
+ }
+ DEBUG(dbgs() << "\n");
+ if (BlockInfo.End.test(Slot)) {
+ BlockInfo.End.reset(Slot);
+ }
+ BlockInfo.Begin.set(Slot);
+ }
}
}
}
@@ -282,90 +622,56 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
return MarkersFound;
}
-void StackColoring::calculateLocalLiveness() {
- // Perform a standard reverse dataflow computation to solve for
- // global liveness. The BEGIN set here is equivalent to KILL in the standard
- // formulation, and END is equivalent to GEN. The result of this computation
- // is a map from blocks to bitvectors where the bitvectors represent which
- // allocas are live in/out of that block.
- SmallPtrSet<const MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(),
- BasicBlockNumbering.end());
- unsigned NumSSMIters = 0;
+void StackColoring::calculateLocalLiveness()
+{
+ unsigned NumIters = 0;
bool changed = true;
while (changed) {
changed = false;
- ++NumSSMIters;
-
- SmallPtrSet<const MachineBasicBlock*, 8> NextBBSet;
+ ++NumIters;
for (const MachineBasicBlock *BB : BasicBlockNumbering) {
- if (!BBSet.count(BB)) continue;
// Use an iterator to avoid repeated lookups.
LivenessMap::iterator BI = BlockLiveness.find(BB);
assert(BI != BlockLiveness.end() && "Block not found");
BlockLifetimeInfo &BlockInfo = BI->second;
+ // Compute LiveIn by unioning together the LiveOut sets of all preds.
BitVector LocalLiveIn;
- BitVector LocalLiveOut;
-
- // Forward propagation from begins to ends.
for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
PE = BB->pred_end(); PI != PE; ++PI) {
LivenessMap::const_iterator I = BlockLiveness.find(*PI);
assert(I != BlockLiveness.end() && "Predecessor not found");
LocalLiveIn |= I->second.LiveOut;
}
- LocalLiveIn |= BlockInfo.End;
- LocalLiveIn.reset(BlockInfo.Begin);
-
- // Reverse propagation from ends to begins.
- for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(),
- SE = BB->succ_end(); SI != SE; ++SI) {
- LivenessMap::const_iterator I = BlockLiveness.find(*SI);
- assert(I != BlockLiveness.end() && "Successor not found");
- LocalLiveOut |= I->second.LiveIn;
- }
- LocalLiveOut |= BlockInfo.Begin;
- LocalLiveOut.reset(BlockInfo.End);
-
- LocalLiveIn |= LocalLiveOut;
- LocalLiveOut |= LocalLiveIn;
- // After adopting the live bits, we need to turn-off the bits which
- // are de-activated in this block.
+ // Compute LiveOut by subtracting out lifetimes that end in this
+ // block, then adding in lifetimes that begin in this block. If
+ // we have both BEGIN and END markers in the same basic block
+ // then we know that the BEGIN marker comes after the END,
+ // because we already handle the case where the BEGIN comes
+ // before the END when collecting the markers (and building the
+ // BEGIN/END vectors).
+ BitVector LocalLiveOut = LocalLiveIn;
LocalLiveOut.reset(BlockInfo.End);
- LocalLiveIn.reset(BlockInfo.Begin);
-
- // If we have both BEGIN and END markers in the same basic block then
- // we know that the BEGIN marker comes after the END, because we already
- // handle the case where the BEGIN comes before the END when collecting
- // the markers (and building the BEGIN/END vectore).
- // Want to enable the LIVE_IN and LIVE_OUT of slots that have both
- // BEGIN and END because it means that the value lives before and after
- // this basic block.
- BitVector LocalEndBegin = BlockInfo.End;
- LocalEndBegin &= BlockInfo.Begin;
- LocalLiveIn |= LocalEndBegin;
- LocalLiveOut |= LocalEndBegin;
+ LocalLiveOut |= BlockInfo.Begin;
+ // Update block LiveIn set, noting whether it has changed.
if (LocalLiveIn.test(BlockInfo.LiveIn)) {
changed = true;
BlockInfo.LiveIn |= LocalLiveIn;
-
- NextBBSet.insert(BB->pred_begin(), BB->pred_end());
}
+ // Update block LiveOut set, noting whether it has changed.
if (LocalLiveOut.test(BlockInfo.LiveOut)) {
changed = true;
BlockInfo.LiveOut |= LocalLiveOut;
-
- NextBBSet.insert(BB->succ_begin(), BB->succ_end());
}
}
-
- BBSet = std::move(NextBBSet);
}// while changed.
+
+ NumIterations = NumIters;
}
void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
@@ -380,28 +686,22 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
Finishes.clear();
Finishes.resize(NumSlots);
- // Create the interval for the basic blocks with lifetime markers in them.
- for (const MachineInstr *MI : Markers) {
- if (MI->getParent() != &MBB)
- continue;
-
- assert((MI->getOpcode() == TargetOpcode::LIFETIME_START ||
- MI->getOpcode() == TargetOpcode::LIFETIME_END) &&
- "Invalid Lifetime marker");
-
- bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START;
- const MachineOperand &Mo = MI->getOperand(0);
- int Slot = Mo.getIndex();
- assert(Slot >= 0 && "Invalid slot");
+ // Create the interval for the basic blocks containing lifetime begin/end.
+ for (const MachineInstr &MI : MBB) {
+ SmallVector<int, 4> slots;
+ bool IsStart = false;
+ if (!isLifetimeStartOrEnd(MI, slots, IsStart))
+ continue;
SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
-
- if (IsStart) {
- if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
- Starts[Slot] = ThisIndex;
- } else {
- if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
- Finishes[Slot] = ThisIndex;
+ for (auto Slot : slots) {
+ if (IsStart) {
+ if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+ Starts[Slot] = ThisIndex;
+ } else {
+ if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
+ Finishes[Slot] = ThisIndex;
+ }
}
}
@@ -417,7 +717,29 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
}
for (unsigned i = 0; i < NumSlots; ++i) {
- assert(Starts[i].isValid() == Finishes[i].isValid() && "Unmatched range");
+ //
+ // When LifetimeStartOnFirstUse is turned on, data flow analysis
+ // is forward (from starts to ends), not bidirectional. A
+ // consequence of this is that we can wind up in situations
+ // where Starts[i] is invalid but Finishes[i] is valid and vice
+ // versa. Example:
+ //
+ // LIFETIME_START x
+ // if (...) {
+ // <use of x>
+ // throw ...;
+ // }
+ // LIFETIME_END x
+ // return 2;
+ //
+ //
+ // Here the slot for "x" will not be live into the block
+ // containing the "return 2" (since lifetimes start with first
+ // use, not at the dominating LIFETIME_START marker).
+ //
+ if (Starts[i].isValid() && !Finishes[i].isValid()) {
+ Finishes[i] = Indexes->getMBBEndIdx(&MBB);
+ }
if (!Starts[i].isValid())
continue;
@@ -495,10 +817,21 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// upcoming replacement.
SP->adjustForColoring(From, To);
+ // The new alloca might not be valid in a llvm.dbg.declare for this
+ // variable, so undef out the use to make the verifier happy.
+ AllocaInst *FromAI = const_cast<AllocaInst *>(From);
+ if (FromAI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(FromAI, UndefValue::get(FromAI->getType()));
+ for (auto &Use : FromAI->uses()) {
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Use.get()))
+ if (BCI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(BCI, UndefValue::get(BCI->getType()));
+ }
+
// Note that this will not replace uses in MMOs (which we'll update below),
// or anywhere else (which is why we won't delete the original
// instruction).
- const_cast<AllocaInst *>(From)->replaceAllUsesWith(Inst);
+ FromAI->replaceAllUsesWith(Inst);
}
// Remap all instructions to the new stack slots.
@@ -557,7 +890,7 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// If we *don't* protect the user from escaped allocas, don't bother
// validating the instructions.
if (!I.isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
- SlotIndex Index = Indexes->getInstructionIndex(&I);
+ SlotIndex Index = Indexes->getInstructionIndex(I);
const LiveInterval *Interval = &*Intervals[FromSlot];
assert(Interval->find(Index) != Interval->end() &&
"Found instruction usage outside of live range.");
@@ -616,7 +949,7 @@ void StackColoring::removeInvalidSlotRanges() {
// Check that the used slot is inside the calculated lifetime range.
// If it is not, warn about it and invalidate the range.
LiveInterval *Interval = &*Intervals[Slot];
- SlotIndex Index = Indexes->getInstructionIndex(&I);
+ SlotIndex Index = Indexes->getInstructionIndex(I);
if (Interval->find(Index) == Interval->end()) {
Interval->clear();
DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
@@ -643,9 +976,6 @@ void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
}
bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
- if (skipOptnoneFunction(*Func.getFunction()))
- return false;
-
DEBUG(dbgs() << "********** Stack Coloring **********\n"
<< "********** Function: "
<< ((const Value*)Func.getFunction())->getName() << '\n');
@@ -667,7 +997,6 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
return false;
SmallVector<int, 8> SortedSlots;
-
SortedSlots.reserve(NumSlots);
Intervals.reserve(NumSlots);
@@ -686,7 +1015,8 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Don't continue because there are not enough lifetime markers, or the
// stack is too small, or we are told not to optimize the slots.
- if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
+ if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
+ skipFunction(*Func.getFunction())) {
DEBUG(dbgs()<<"Will not try to merge slots.\n");
return removeAllMarkers();
}
@@ -700,9 +1030,12 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Calculate the liveness of each block.
calculateLocalLiveness();
+ DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
+ DEBUG(dump());
// Propagate the liveness information.
calculateLiveIntervals(NumSlots);
+ DEBUG(dumpIntervals());
// Search for allocas which are used outside of the declared lifetime
// markers.
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 855058358fe4..87e4eb66c9c9 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -62,6 +62,11 @@ public:
/// information we preserve.
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
/// \brief Calculate the liveness information for the given machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -122,7 +127,8 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
for (auto &MBB : MF) {
DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
LiveRegs.init(TRI);
- LiveRegs.addLiveOuts(&MBB);
+ // FIXME: This should probably be addLiveOuts().
+ LiveRegs.addLiveOutsNoPristines(MBB);
bool HasStackMap = false;
// Reverse iterate over all instructions and add the current live register
// set to an instruction if we encounter a patchpoint instruction.
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index b3cd8b3d80bb..d91bb8066aed 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -520,9 +520,9 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
void StackMaps::serializeToStackMapSection() {
(void)WSMP;
// Bail out if there's no stack map data.
- assert((!CSInfos.empty() || (CSInfos.empty() && ConstPool.empty())) &&
+ assert((!CSInfos.empty() || ConstPool.empty()) &&
"Expected empty constant pool too!");
- assert((!CSInfos.empty() || (CSInfos.empty() && FnStackSize.empty())) &&
+ assert((!CSInfos.empty() || FnStackSize.empty()) &&
"Expected empty function record too!");
if (CSInfos.empty())
return;
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index db3fef524b30..89868e43aba4 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -18,12 +18,13 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -89,15 +90,25 @@ bool StackProtector::runOnFunction(Function &Fn) {
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ HasPrologue = false;
+ HasIRCheck = false;
Attribute Attr = Fn.getFnAttribute("stack-protector-buffer-size");
if (Attr.isStringAttribute() &&
Attr.getValueAsString().getAsInteger(10, SSPBufferSize))
- return false; // Invalid integer string
+ return false; // Invalid integer string
if (!RequiresStackProtector())
return false;
+ // TODO(etienneb): Functions with funclets are not correctly supported now.
+ // Do nothing if this is a funclet-based personality.
+ if (Fn.hasPersonalityFn()) {
+ EHPersonality Personality = classifyEHPersonality(Fn.getPersonalityFn());
+ if (isFuncletEHPersonality(Personality))
+ return false;
+ }
+
++NumFunProtected;
return InsertStackProtectors();
}
@@ -200,11 +211,24 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
bool StackProtector::RequiresStackProtector() {
bool Strong = false;
bool NeedsProtector = false;
+ for (const BasicBlock &BB : *F)
+ for (const Instruction &I : BB)
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->getCalledFunction() ==
+ Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::stackprotector))
+ HasPrologue = true;
+
+ if (F->hasFnAttribute(Attribute::SafeStack))
+ return false;
+
if (F->hasFnAttribute(Attribute::StackProtectReq)) {
NeedsProtector = true;
Strong = true; // Use the same heuristic as strong to determine SSPLayout
} else if (F->hasFnAttribute(Attribute::StackProtectStrong))
Strong = true;
+ else if (HasPrologue)
+ NeedsProtector = true;
else if (!F->hasFnAttribute(Attribute::StackProtect))
return false;
@@ -256,106 +280,51 @@ bool StackProtector::RequiresStackProtector() {
return NeedsProtector;
}
-static bool InstructionWillNotHaveChain(const Instruction *I) {
- return !I->mayHaveSideEffects() && !I->mayReadFromMemory() &&
- isSafeToSpeculativelyExecute(I);
-}
-
-/// Identify if RI has a previous instruction in the "Tail Position" and return
-/// it. Otherwise return 0.
-///
-/// This is based off of the code in llvm::isInTailCallPosition. The difference
-/// is that it inverts the first part of llvm::isInTailCallPosition since
-/// isInTailCallPosition is checking if a call is in a tail call position, and
-/// we are searching for an unknown tail call that might be in the tail call
-/// position. Once we find the call though, the code uses the same refactored
-/// code, returnTypeIsEligibleForTailCall.
-static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
- const TargetLoweringBase *TLI) {
- // Establish a reasonable upper bound on the maximum amount of instructions we
- // will look through to find a tail call.
- unsigned SearchCounter = 0;
- const unsigned MaxSearch = 4;
- bool NoInterposingChain = true;
-
- for (BasicBlock::reverse_iterator I = std::next(BB->rbegin()), E = BB->rend();
- I != E && SearchCounter < MaxSearch; ++I) {
- Instruction *Inst = &*I;
-
- // Skip over debug intrinsics and do not allow them to affect our MaxSearch
- // counter.
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
-
- // If we find a call and the following conditions are satisifed, then we
- // have found a tail call that satisfies at least the target independent
- // requirements of a tail call:
- //
- // 1. The call site has the tail marker.
- //
- // 2. The call site either will not cause the creation of a chain or if a
- // chain is necessary there are no instructions in between the callsite and
- // the call which would create an interposing chain.
- //
- // 3. The return type of the function does not impede tail call
- // optimization.
- if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
- if (CI->isTailCall() &&
- (InstructionWillNotHaveChain(CI) || NoInterposingChain) &&
- returnTypeIsEligibleForTailCall(BB->getParent(), CI, RI, *TLI))
- return CI;
- }
-
- // If we did not find a call see if we have an instruction that may create
- // an interposing chain.
- NoInterposingChain =
- NoInterposingChain && InstructionWillNotHaveChain(Inst);
-
- // Increment max search.
- SearchCounter++;
- }
-
- return nullptr;
+/// Create a load of the stack guard value and report whether SelectionDAG SSP
+/// is supported.
+static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M,
+ IRBuilder<> &B,
+ bool *SupportsSelectionDAGSP = nullptr) {
+ if (Value *Guard = TLI->getIRStackGuard(B))
+ return B.CreateLoad(Guard, true, "StackGuard");
+
+ // Use SelectionDAG SSP handling, since there isn't an IR guard.
+ //
+ // This is somewhat awkward, since we optionally report here whether a
+ // SelectionDAG SSP check should be emitted. The reason is that the answer
+ // is strictly defined as !TLI->getIRStackGuard(B), and getIRStackGuard is
+ // also mutating; there is no way to get this bit without mutating the IR,
+ // so it has to be determined at this point.
+ //
+ // We could have defined a new function TLI::supportsSelectionDAGSP(), but
+ // that would put more burden on the backends' overriding work, especially
+ // since it conveys the same information getIRStackGuard() already gives.
+ if (SupportsSelectionDAGSP)
+ *SupportsSelectionDAGSP = true;
+ TLI->insertSSPDeclarations(*M);
+ return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard));
}
-/// Insert code into the entry block that stores the __stack_chk_guard
+/// Insert code into the entry block that stores the stack guard
/// variable onto the stack:
///
/// entry:
/// StackGuardSlot = alloca i8*
-/// StackGuard = load __stack_chk_guard
-/// call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
+/// StackGuard = <stack guard>
+/// call void @llvm.stackprotector(StackGuard, StackGuardSlot)
///
/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
/// node.
static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
- const TargetLoweringBase *TLI, const Triple &TT,
- AllocaInst *&AI, Value *&StackGuardVar) {
+ const TargetLoweringBase *TLI, AllocaInst *&AI) {
bool SupportsSelectionDAGSP = false;
- PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
- unsigned AddressSpace, Offset;
- if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
- Constant *OffsetVal =
- ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
-
- StackGuardVar =
- ConstantExpr::getIntToPtr(OffsetVal, PointerType::get(PtrTy,
- AddressSpace));
- } else if (TT.isOSOpenBSD()) {
- StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
- cast<GlobalValue>(StackGuardVar)
- ->setVisibility(GlobalValue::HiddenVisibility);
- } else {
- SupportsSelectionDAGSP = true;
- StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
- }
-
IRBuilder<> B(&F->getEntryBlock().front());
+ PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
- LoadInst *LI = B.CreateLoad(StackGuardVar, "StackGuard");
- B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
- {LI, AI});
+ Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP);
+ B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+ {GuardSlot, AI});
return SupportsSelectionDAGSP;
}
@@ -366,11 +335,9 @@ static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
/// - The epilogue checks the value stored in the prologue against the original
/// value. It calls __stack_chk_fail if they differ.
bool StackProtector::InsertStackProtectors() {
- bool HasPrologue = false;
bool SupportsSelectionDAGSP =
EnableSelectionDAGSP && !TM->Options.EnableFastISel;
AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
- Value *StackGuardVar = nullptr; // The stack guard variable.
for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
BasicBlock *BB = &*I++;
@@ -378,30 +345,36 @@ bool StackProtector::InsertStackProtectors() {
if (!RI)
continue;
+ // Generate prologue instrumentation if not already generated.
if (!HasPrologue) {
HasPrologue = true;
- SupportsSelectionDAGSP &=
- CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
+ SupportsSelectionDAGSP &= CreatePrologue(F, M, RI, TLI, AI);
}
- if (SupportsSelectionDAGSP) {
- // Since we have a potential tail call, insert the special stack check
- // intrinsic.
- Instruction *InsertionPt = nullptr;
- if (CallInst *CI = FindPotentialTailCall(BB, RI, TLI)) {
- InsertionPt = CI;
- } else {
- InsertionPt = RI;
- // At this point we know that BB has a return statement so it *DOES*
- // have a terminator.
- assert(InsertionPt != nullptr &&
- "BB must have a terminator instruction at this point.");
- }
-
- Function *Intrinsic =
- Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
- CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
+ // SelectionDAG based code generation. Nothing else needs to be done here.
+ // The epilogue instrumentation is postponed to SelectionDAG.
+ if (SupportsSelectionDAGSP)
+ break;
+
+ // Set HasIRCheck to true, so that SelectionDAG will not generate its own
+ // version. SelectionDAG calls 'shouldEmitSDCheck' to check whether
+ // instrumentation has already been generated.
+ HasIRCheck = true;
+
+ // Generate epilogue instrumentation. The epilogue instrumentation can be
+ // function-based or inlined depending on which mechanism the target is
+ // providing.
+ if (Value* GuardCheck = TLI->getSSPStackGuardCheck(*M)) {
+ // Generate the function-based epilogue instrumentation.
+ // The target provides a guard check function, generate a call to it.
+ IRBuilder<> B(RI);
+ LoadInst *Guard = B.CreateLoad(AI, true, "Guard");
+ CallInst *Call = B.CreateCall(GuardCheck, {Guard});
+ llvm::Function *Function = cast<llvm::Function>(GuardCheck);
+ Call->setAttributes(Function->getAttributes());
+ Call->setCallingConv(Function->getCallingConv());
} else {
+ // Generate the epilogue with inline instrumentation.
// If we do not support SelectionDAG based tail calls, generate IR level
// tail calls.
//
@@ -415,7 +388,7 @@ bool StackProtector::InsertStackProtectors() {
//
// return:
// ...
- // %1 = load __stack_chk_guard
+ // %1 = <stack guard>
// %2 = load StackGuardSlot
// %3 = cmp i1 %1, %2
// br i1 %3, label %SP_return, label %CallStackCheckFailBlk
@@ -450,9 +423,9 @@ bool StackProtector::InsertStackProtectors() {
// Generate the stack protector instructions in the old basic block.
IRBuilder<> B(BB);
- LoadInst *LI1 = B.CreateLoad(StackGuardVar);
- LoadInst *LI2 = B.CreateLoad(AI);
- Value *Cmp = B.CreateICmpEQ(LI1, LI2);
+ Value *Guard = getStackGuard(TLI, M, B);
+ LoadInst *LI2 = B.CreateLoad(AI, true);
+ Value *Cmp = B.CreateICmpEQ(Guard, LI2);
auto SuccessProb =
BranchProbabilityInfo::getBranchProbStackProtector(true);
auto FailureProb =
@@ -475,6 +448,7 @@ BasicBlock *StackProtector::CreateFailBB() {
LLVMContext &Context = F->getContext();
BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
IRBuilder<> B(FailBB);
+ B.SetCurrentDebugLocation(DebugLoc::get(0, 0, F->getSubprogram()));
if (Trip.isOSOpenBSD()) {
Constant *StackChkFail =
M->getOrInsertFunction("__stack_smash_handler",
@@ -491,3 +465,7 @@ BasicBlock *StackProtector::CreateFailBB() {
B.CreateUnreachable();
return FailBB;
}
+
+bool StackProtector::shouldEmitSDCheck(const BasicBlock &BB) const {
+ return HasPrologue && !HasIRCheck && dyn_cast<ReturnInst>(BB.getTerminator());
+}
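With this change, getStackGuard() prefers whatever IR-level guard the target publishes and only falls back to the llvm.stackguard intrinsic (and thus SelectionDAG SSP) when TLI->getIRStackGuard() returns null. As a rough sketch of that hook's contract, a target override might look like the following; MyTargetLowering and __my_guard_local are invented names used purely for illustration, loosely mirroring the OpenBSD handling removed above, and this is not code from the patch:

Value *MyTargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // Publish the guard as a hidden global; StackProtector::getStackGuard() then
  // emits a volatile load of it instead of calling llvm.stackguard.
  Module *M = IRB.GetInsertBlock()->getModule();
  PointerType *PtrTy = Type::getInt8PtrTy(M->getContext());
  Constant *Guard = M->getOrInsertGlobal("__my_guard_local", PtrTy);
  if (auto *GV = dyn_cast<GlobalValue>(Guard))
    GV->setVisibility(GlobalValue::HiddenVisibility);
  return Guard;
}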
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 51f4d0e68172..d996714a414a 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -107,7 +107,7 @@ namespace {
bool OverlapWithAssignments(LiveInterval *li, int Color) const;
int ColorSlot(LiveInterval *li);
bool ColorSlots(MachineFunction &MF);
- void RewriteInstruction(MachineInstr *MI, SmallVectorImpl<int> &SlotMapping,
+ void RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping,
MachineFunction &MF);
bool RemoveDeadStores(MachineBasicBlock* MBB);
};
@@ -145,9 +145,9 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
MachineBasicBlock *MBB = &*MBBI;
for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
MII != EE; ++MII) {
- MachineInstr *MI = &*MII;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ MachineInstr &MI = *MII;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isFI())
continue;
int FI = MO.getIndex();
@@ -156,11 +156,12 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
if (!LS->hasInterval(FI))
continue;
LiveInterval &li = LS->getInterval(FI);
- if (!MI->isDebugValue())
+ if (!MI.isDebugValue())
li.weight += LiveIntervals::getSpillWeight(false, true, MBFI, MI);
}
- for (MachineInstr::mmo_iterator MMOI = MI->memoperands_begin(),
- EE = MI->memoperands_end(); MMOI != EE; ++MMOI) {
+ for (MachineInstr::mmo_iterator MMOI = MI.memoperands_begin(),
+ EE = MI.memoperands_end();
+ MMOI != EE; ++MMOI) {
MachineMemOperand *MMO = *MMOI;
if (const FixedStackPseudoSourceValue *FSV =
dyn_cast_or_null<FixedStackPseudoSourceValue>(
@@ -325,13 +326,10 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
}
// Rewrite all MO_FrameIndex operands. Look for dead stores.
- for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = &*MBBI;
- for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
- MII != EE; ++MII)
- RewriteInstruction(MII, SlotMapping, MF);
- RemoveDeadStores(MBB);
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB)
+ RewriteInstruction(MI, SlotMapping, MF);
+ RemoveDeadStores(&MBB);
}
// Delete unused stack slots.
@@ -346,12 +344,12 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
/// RewriteInstruction - Rewrite specified instruction by replacing references
/// to old frame index with new one.
-void StackSlotColoring::RewriteInstruction(MachineInstr *MI,
+void StackSlotColoring::RewriteInstruction(MachineInstr &MI,
SmallVectorImpl<int> &SlotMapping,
MachineFunction &MF) {
// Update the operands.
- for (unsigned i = 0, ee = MI->getNumOperands(); i != ee; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, ee = MI.getNumOperands(); i != ee; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isFI())
continue;
int OldFI = MO.getIndex();
@@ -385,12 +383,11 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
break;
int FirstSS, SecondSS;
- if (TII->isStackSlotCopy(I, FirstSS, SecondSS) &&
- FirstSS == SecondSS &&
+ if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
FirstSS != -1) {
++NumDead;
changed = true;
- toErase.push_back(I);
+ toErase.push_back(&*I);
continue;
}
@@ -399,8 +396,10 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
unsigned LoadReg = 0;
unsigned StoreReg = 0;
- if (!(LoadReg = TII->isLoadFromStackSlot(I, FirstSS))) continue;
- if (!(StoreReg = TII->isStoreToStackSlot(NextMI, SecondSS))) continue;
+ if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+ continue;
+ if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+ continue;
if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
++NumDead;
@@ -408,10 +407,10 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
++NumDead;
- toErase.push_back(I);
+ toErase.push_back(&*I);
}
- toErase.push_back(NextMI);
+ toErase.push_back(&*NextMI);
++I;
}
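In the RemoveDeadStores() hunk above, the pair being erased is a reload from a stack slot immediately followed by a spill of the same register back into the same slot. Spelled out as a standalone predicate (isDeadReloadSpillPair is a hypothetical helper, written here only to make the condition explicit):

static bool isDeadReloadSpillPair(const TargetInstrInfo *TII,
                                  const MachineInstr &Load,
                                  const MachineInstr &Store) {
  int LoadFI = -1, StoreFI = -1;
  // isLoadFromStackSlot/isStoreToStackSlot return the transferred register,
  // or 0 when the instruction is not a simple reload/spill.
  unsigned LoadReg = TII->isLoadFromStackSlot(Load, LoadFI);
  unsigned StoreReg = TII->isStoreToStackSlot(Store, StoreFI);
  return LoadReg && StoreReg && LoadReg == StoreReg &&
         LoadFI == StoreFI && LoadFI != -1;
}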
diff --git a/lib/CodeGen/StatepointExampleGC.cpp b/lib/CodeGen/StatepointExampleGC.cpp
deleted file mode 100644
index 3f60e18fafa9..000000000000
--- a/lib/CodeGen/StatepointExampleGC.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- StatepointDefaultGC.cpp - The default statepoint GC strategy ------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a GCStrategy which serves as an example for the usage
-// of a statepoint based lowering strategy. This GCStrategy is intended to
-// suitable as a default implementation usable with any collector which can
-// consume the standard stackmap format generated by statepoints, uses the
-// default addrespace to distinguish between gc managed and non-gc managed
-// pointers, and has reasonable relocation semantics.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Value.h"
-
-using namespace llvm;
-
-namespace {
-class StatepointGC : public GCStrategy {
-public:
- StatepointGC() {
- UseStatepoints = true;
- // These options are all gc.root specific, we specify them so that the
- // gc.root lowering code doesn't run.
- InitRoots = false;
- NeededSafePoints = 0;
- UsesMetadata = false;
- CustomRoots = false;
- }
- Optional<bool> isGCManagedPointer(const Type *Ty) const override {
- // Method is only valid on pointer typed values.
- const PointerType *PT = cast<PointerType>(Ty);
- // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
- // GC managed heap. We know that a pointer into this heap needs to be
- // updated and that no other pointer does. Note that addrspace(1) is used
- // only as an example, it has no special meaning, and is not reserved for
- // GC usage.
- return (1 == PT->getAddressSpace());
- }
-};
-}
-
-static GCRegistry::Add<StatepointGC> X("statepoint-example",
- "an example strategy for statepoint");
-
-namespace llvm {
-void linkStatepointExampleGC() {}
-}
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index d2fbf533a787..2b1fb127497a 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -8,147 +8,52 @@
//===----------------------------------------------------------------------===//
//
// This pass duplicates basic blocks ending in unconditional branches into
-// the tails of their predecessors.
+// the tails of their predecessors, using the TailDuplicator utility class.
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineSSAUpdater.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TailDuplicator.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "tailduplication"
-STATISTIC(NumTails , "Number of tails duplicated");
-STATISTIC(NumTailDups , "Number of tail duplicated blocks");
-STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
-STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
-STATISTIC(NumAddedPHIs , "Number of phis added");
-
-// Heuristic for tail duplication.
-static cl::opt<unsigned>
-TailDuplicateSize("tail-dup-size",
- cl::desc("Maximum instructions to consider tail duplicating"),
- cl::init(2), cl::Hidden);
-
-static cl::opt<bool>
-TailDupVerify("tail-dup-verify",
- cl::desc("Verify sanity of PHI instructions during taildup"),
- cl::init(false), cl::Hidden);
-
-static cl::opt<unsigned>
-TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden);
-
-typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy;
-
namespace {
- /// Perform tail duplication.
- class TailDuplicatePass : public MachineFunctionPass {
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- const MachineBranchProbabilityInfo *MBPI;
- MachineModuleInfo *MMI;
- MachineRegisterInfo *MRI;
- std::unique_ptr<RegScavenger> RS;
- bool PreRegAlloc;
-
- // A list of virtual registers for which to update SSA form.
- SmallVector<unsigned, 16> SSAUpdateVRs;
-
- // For each virtual register in SSAUpdateVals keep a list of source virtual
- // registers.
- DenseMap<unsigned, AvailableValsTy> SSAUpdateVals;
+/// Perform tail duplication. Delegates to TailDuplicator.
+class TailDuplicatePass : public MachineFunctionPass {
+ TailDuplicator Duplicator;
- public:
- static char ID;
- explicit TailDuplicatePass() :
- MachineFunctionPass(ID), PreRegAlloc(false) {}
+public:
+ static char ID;
+ explicit TailDuplicatePass() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
- private:
- void AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
- MachineBasicBlock *BB);
- void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
- MachineBasicBlock *PredBB,
- DenseMap<unsigned, unsigned> &LocalVRMap,
- SmallVectorImpl<std::pair<unsigned,unsigned> > &Copies,
- const DenseSet<unsigned> &UsedByPhi,
- bool Remove);
- void DuplicateInstruction(MachineInstr *MI,
- MachineBasicBlock *TailBB,
- MachineBasicBlock *PredBB,
- MachineFunction &MF,
- DenseMap<unsigned, unsigned> &LocalVRMap,
- const DenseSet<unsigned> &UsedByPhi);
- void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- SmallSetVector<MachineBasicBlock*, 8> &Succs);
- bool TailDuplicateBlocks(MachineFunction &MF);
- bool shouldTailDuplicate(const MachineFunction &MF,
- bool IsSimple, MachineBasicBlock &TailBB);
- bool isSimpleBB(MachineBasicBlock *TailBB);
- bool canCompletelyDuplicateBB(MachineBasicBlock &BB);
- bool duplicateSimpleBB(MachineBasicBlock *TailBB,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- const DenseSet<unsigned> &RegsUsedByPhi,
- SmallVectorImpl<MachineInstr *> &Copies);
- bool TailDuplicate(MachineBasicBlock *TailBB,
- bool IsSimple,
- MachineFunction &MF,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- SmallVectorImpl<MachineInstr *> &Copies);
- bool TailDuplicateAndUpdate(MachineBasicBlock *MBB,
- bool IsSimple,
- MachineFunction &MF);
-
- void RemoveDeadBlock(MachineBasicBlock *MBB);
- };
-
- char TailDuplicatePass::ID = 0;
+char TailDuplicatePass::ID = 0;
}
char &llvm::TailDuplicateID = TailDuplicatePass::ID;
-INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication",
- false, false)
+INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false,
+ false)
bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
- if (skipOptnoneFunction(*MF.getFunction()))
+ if (skipFunction(*MF.getFunction()))
return false;
- TII = MF.getSubtarget().getInstrInfo();
- TRI = MF.getSubtarget().getRegisterInfo();
- MRI = &MF.getRegInfo();
- MMI = getAnalysisIfAvailable<MachineModuleInfo>();
- MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ auto MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- PreRegAlloc = MRI->isSSA();
- RS.reset();
- if (MRI->tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
- RS.reset(new RegScavenger());
+ Duplicator.initMF(MF, MMI, MBPI);
bool MadeChange = false;
- while (TailDuplicateBlocks(MF))
+ while (Duplicator.tailDuplicateBlocks(MF))
MadeChange = true;
return MadeChange;
@@ -158,831 +63,3 @@ void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-
-static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
- for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
- MachineBasicBlock *MBB = &*I;
- SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(),
- MBB->pred_end());
- MachineBasicBlock::iterator MI = MBB->begin();
- while (MI != MBB->end()) {
- if (!MI->isPHI())
- break;
- for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
- PE = Preds.end(); PI != PE; ++PI) {
- MachineBasicBlock *PredBB = *PI;
- bool Found = false;
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
- MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
- if (PHIBB == PredBB) {
- Found = true;
- break;
- }
- }
- if (!Found) {
- dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
- dbgs() << " missing input from predecessor BB#"
- << PredBB->getNumber() << '\n';
- llvm_unreachable(nullptr);
- }
- }
-
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
- MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
- if (CheckExtra && !Preds.count(PHIBB)) {
- dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
- << ": " << *MI;
- dbgs() << " extra input from predecessor BB#"
- << PHIBB->getNumber() << '\n';
- llvm_unreachable(nullptr);
- }
- if (PHIBB->getNumber() < 0) {
- dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
- dbgs() << " non-existing BB#" << PHIBB->getNumber() << '\n';
- llvm_unreachable(nullptr);
- }
- }
- ++MI;
- }
- }
-}
-
-/// Tail duplicate the block and cleanup.
-bool
-TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB,
- bool IsSimple,
- MachineFunction &MF) {
- // Save the successors list.
- SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
- MBB->succ_end());
-
- SmallVector<MachineBasicBlock*, 8> TDBBs;
- SmallVector<MachineInstr*, 16> Copies;
- if (!TailDuplicate(MBB, IsSimple, MF, TDBBs, Copies))
- return false;
-
- ++NumTails;
-
- SmallVector<MachineInstr*, 8> NewPHIs;
- MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
-
- // TailBB's immediate successors are now successors of those predecessors
- // which duplicated TailBB. Add the predecessors as sources to the PHI
- // instructions.
- bool isDead = MBB->pred_empty() && !MBB->hasAddressTaken();
- if (PreRegAlloc)
- UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
-
- // If it is dead, remove it.
- if (isDead) {
- NumInstrDups -= MBB->size();
- RemoveDeadBlock(MBB);
- ++NumDeadBlocks;
- }
-
- // Update SSA form.
- if (!SSAUpdateVRs.empty()) {
- for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
- unsigned VReg = SSAUpdateVRs[i];
- SSAUpdate.Initialize(VReg);
-
- // If the original definition is still around, add it as an available
- // value.
- MachineInstr *DefMI = MRI->getVRegDef(VReg);
- MachineBasicBlock *DefBB = nullptr;
- if (DefMI) {
- DefBB = DefMI->getParent();
- SSAUpdate.AddAvailableValue(DefBB, VReg);
- }
-
- // Add the new vregs as available values.
- DenseMap<unsigned, AvailableValsTy>::iterator LI =
- SSAUpdateVals.find(VReg);
- for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
- MachineBasicBlock *SrcBB = LI->second[j].first;
- unsigned SrcReg = LI->second[j].second;
- SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
- }
-
- // Rewrite uses that are outside of the original def's block.
- MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
- while (UI != MRI->use_end()) {
- MachineOperand &UseMO = *UI;
- MachineInstr *UseMI = UseMO.getParent();
- ++UI;
- if (UseMI->isDebugValue()) {
- // SSAUpdate can replace the use with an undef. That creates
- // a debug instruction that is a kill.
- // FIXME: Should it SSAUpdate job to delete debug instructions
- // instead of replacing the use with undef?
- UseMI->eraseFromParent();
- continue;
- }
- if (UseMI->getParent() == DefBB && !UseMI->isPHI())
- continue;
- SSAUpdate.RewriteUse(UseMO);
- }
- }
-
- SSAUpdateVRs.clear();
- SSAUpdateVals.clear();
- }
-
- // Eliminate some of the copies inserted by tail duplication to maintain
- // SSA form.
- for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
- MachineInstr *Copy = Copies[i];
- if (!Copy->isCopy())
- continue;
- unsigned Dst = Copy->getOperand(0).getReg();
- unsigned Src = Copy->getOperand(1).getReg();
- if (MRI->hasOneNonDBGUse(Src) &&
- MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
- // Copy is the only use. Do trivial copy propagation here.
- MRI->replaceRegWith(Dst, Src);
- Copy->eraseFromParent();
- }
- }
-
- if (NewPHIs.size())
- NumAddedPHIs += NewPHIs.size();
-
- return true;
-}
-
-/// Look for small blocks that are unconditionally branched to and do not fall
-/// through. Tail-duplicate their instructions into their predecessors to
-/// eliminate (dynamic) branches.
-bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
- bool MadeChange = false;
-
- if (PreRegAlloc && TailDupVerify) {
- DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
- VerifyPHIs(MF, true);
- }
-
- for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
- MachineBasicBlock *MBB = &*I++;
-
- if (NumTails == TailDupLimit)
- break;
-
- bool IsSimple = isSimpleBB(MBB);
-
- if (!shouldTailDuplicate(MF, IsSimple, *MBB))
- continue;
-
- MadeChange |= TailDuplicateAndUpdate(MBB, IsSimple, MF);
- }
-
- if (PreRegAlloc && TailDupVerify)
- VerifyPHIs(MF, false);
-
- return MadeChange;
-}
-
-static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
- const MachineRegisterInfo *MRI) {
- for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
- if (UseMI.isDebugValue())
- continue;
- if (UseMI.getParent() != BB)
- return true;
- }
- return false;
-}
-
-static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
- if (MI->getOperand(i+1).getMBB() == SrcBB)
- return i;
- return 0;
-}
-
-
-// Remember which registers are used by phis in this block. This is
-// used to determine which registers are liveout while modifying the
-// block (which is why we need to copy the information).
-static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
- DenseSet<unsigned> *UsedByPhi) {
- for (const auto &MI : BB) {
- if (!MI.isPHI())
- break;
- for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
- unsigned SrcReg = MI.getOperand(i).getReg();
- UsedByPhi->insert(SrcReg);
- }
- }
-}
-
-/// Add a definition and source virtual registers pair for SSA update.
-void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
- MachineBasicBlock *BB) {
- DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg);
- if (LI != SSAUpdateVals.end())
- LI->second.push_back(std::make_pair(BB, NewReg));
- else {
- AvailableValsTy Vals;
- Vals.push_back(std::make_pair(BB, NewReg));
- SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
- SSAUpdateVRs.push_back(OrigReg);
- }
-}
-
-/// Process PHI node in TailBB by turning it into a copy in PredBB. Remember the
-/// source register that's contributed by PredBB and update SSA update map.
-void TailDuplicatePass::ProcessPHI(
- MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
- DenseMap<unsigned, unsigned> &LocalVRMap,
- SmallVectorImpl<std::pair<unsigned, unsigned> > &Copies,
- const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
- unsigned DefReg = MI->getOperand(0).getReg();
- unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
- assert(SrcOpIdx && "Unable to find matching PHI source?");
- unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
- const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
- LocalVRMap.insert(std::make_pair(DefReg, SrcReg));
-
- // Insert a copy from source to the end of the block. The def register is the
- // available value liveout of the block.
- unsigned NewDef = MRI->createVirtualRegister(RC);
- Copies.push_back(std::make_pair(NewDef, SrcReg));
- if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
- AddSSAUpdateEntry(DefReg, NewDef, PredBB);
-
- if (!Remove)
- return;
-
- // Remove PredBB from the PHI node.
- MI->RemoveOperand(SrcOpIdx+1);
- MI->RemoveOperand(SrcOpIdx);
- if (MI->getNumOperands() == 1)
- MI->eraseFromParent();
-}
-
-/// Duplicate a TailBB instruction to PredBB and update
-/// the source operands due to earlier PHI translation.
-void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
- MachineBasicBlock *TailBB,
- MachineBasicBlock *PredBB,
- MachineFunction &MF,
- DenseMap<unsigned, unsigned> &LocalVRMap,
- const DenseSet<unsigned> &UsedByPhi) {
- MachineInstr *NewMI = TII->duplicate(MI, MF);
- for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = NewMI->getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
- if (MO.isDef()) {
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- unsigned NewReg = MRI->createVirtualRegister(RC);
- MO.setReg(NewReg);
- LocalVRMap.insert(std::make_pair(Reg, NewReg));
- if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
- AddSSAUpdateEntry(Reg, NewReg, PredBB);
- } else {
- DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
- if (VI != LocalVRMap.end()) {
- MO.setReg(VI->second);
- // Clear any kill flags from this operand. The new register could have
- // uses after this one, so kills are not valid here.
- MO.setIsKill(false);
- MRI->constrainRegClass(VI->second, MRI->getRegClass(Reg));
- }
- }
- }
- PredBB->insert(PredBB->instr_end(), NewMI);
-}
-
-/// After FromBB is tail duplicated into its predecessor blocks, the successors
-/// have gained new predecessors. Update the PHI instructions in them
-/// accordingly.
-void
-TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- SmallSetVector<MachineBasicBlock*,8> &Succs) {
- for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
- SE = Succs.end(); SI != SE; ++SI) {
- MachineBasicBlock *SuccBB = *SI;
- for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
- II != EE; ++II) {
- if (!II->isPHI())
- break;
- MachineInstrBuilder MIB(*FromBB->getParent(), II);
- unsigned Idx = 0;
- for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
- MachineOperand &MO = II->getOperand(i+1);
- if (MO.getMBB() == FromBB) {
- Idx = i;
- break;
- }
- }
-
- assert(Idx != 0);
- MachineOperand &MO0 = II->getOperand(Idx);
- unsigned Reg = MO0.getReg();
- if (isDead) {
- // Folded into the previous BB.
- // There could be duplicate phi source entries. FIXME: Should sdisel
- // or earlier pass fixed this?
- for (unsigned i = II->getNumOperands()-2; i != Idx; i -= 2) {
- MachineOperand &MO = II->getOperand(i+1);
- if (MO.getMBB() == FromBB) {
- II->RemoveOperand(i+1);
- II->RemoveOperand(i);
- }
- }
- } else
- Idx = 0;
-
- // If Idx is set, the operands at Idx and Idx+1 must be removed.
- // We reuse the location to avoid expensive RemoveOperand calls.
-
- DenseMap<unsigned,AvailableValsTy>::iterator LI=SSAUpdateVals.find(Reg);
- if (LI != SSAUpdateVals.end()) {
- // This register is defined in the tail block.
- for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
- MachineBasicBlock *SrcBB = LI->second[j].first;
- // If we didn't duplicate a bb into a particular predecessor, we
- // might still have added an entry to SSAUpdateVals to correcly
- // recompute SSA. If that case, avoid adding a dummy extra argument
- // this PHI.
- if (!SrcBB->isSuccessor(SuccBB))
- continue;
-
- unsigned SrcReg = LI->second[j].second;
- if (Idx != 0) {
- II->getOperand(Idx).setReg(SrcReg);
- II->getOperand(Idx+1).setMBB(SrcBB);
- Idx = 0;
- } else {
- MIB.addReg(SrcReg).addMBB(SrcBB);
- }
- }
- } else {
- // Live in tail block, must also be live in predecessors.
- for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
- MachineBasicBlock *SrcBB = TDBBs[j];
- if (Idx != 0) {
- II->getOperand(Idx).setReg(Reg);
- II->getOperand(Idx+1).setMBB(SrcBB);
- Idx = 0;
- } else {
- MIB.addReg(Reg).addMBB(SrcBB);
- }
- }
- }
- if (Idx != 0) {
- II->RemoveOperand(Idx+1);
- II->RemoveOperand(Idx);
- }
- }
- }
-}
-
-/// Determine if it is profitable to duplicate this block.
-bool
-TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
- bool IsSimple,
- MachineBasicBlock &TailBB) {
- // Only duplicate blocks that end with unconditional branches.
- if (TailBB.canFallThrough())
- return false;
-
- // Don't try to tail-duplicate single-block loops.
- if (TailBB.isSuccessor(&TailBB))
- return false;
-
- // Set the limit on the cost to duplicate. When optimizing for size,
- // duplicate only one, because one branch instruction can be eliminated to
- // compensate for the duplication.
- unsigned MaxDuplicateCount;
- if (TailDuplicateSize.getNumOccurrences() == 0 &&
- // FIXME: Use Function::optForSize().
- MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
- MaxDuplicateCount = 1;
- else
- MaxDuplicateCount = TailDuplicateSize;
-
- // If the target has hardware branch prediction that can handle indirect
- // branches, duplicating them can often make them predictable when there
- // are common paths through the code. The limit needs to be high enough
- // to allow undoing the effects of tail merging and other optimizations
- // that rearrange the predecessors of the indirect branch.
-
- bool HasIndirectbr = false;
- if (!TailBB.empty())
- HasIndirectbr = TailBB.back().isIndirectBranch();
-
- if (HasIndirectbr && PreRegAlloc)
- MaxDuplicateCount = 20;
-
- // Check the instructions in the block to determine whether tail-duplication
- // is invalid or unlikely to be profitable.
- unsigned InstrCount = 0;
- for (MachineInstr &MI : TailBB) {
- // Non-duplicable things shouldn't be tail-duplicated.
- if (MI.isNotDuplicable())
- return false;
-
- // Do not duplicate 'return' instructions if this is a pre-regalloc run.
- // A return may expand into a lot more instructions (e.g. reload of callee
- // saved registers) after PEI.
- if (PreRegAlloc && MI.isReturn())
- return false;
-
- // Avoid duplicating calls before register allocation. Calls presents a
- // barrier to register allocation so duplicating them may end up increasing
- // spills.
- if (PreRegAlloc && MI.isCall())
- return false;
-
- if (!MI.isPHI() && !MI.isDebugValue())
- InstrCount += 1;
-
- if (InstrCount > MaxDuplicateCount)
- return false;
- }
-
- // Check if any of the successors of TailBB has a PHI node in which the
- // value corresponding to TailBB uses a subregister.
- // If a phi node uses a register paired with a subregister, the actual
- // "value type" of the phi may differ from the type of the register without
- // any subregisters. Due to a bug, tail duplication may add a new operand
- // without a necessary subregister, producing an invalid code. This is
- // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll.
- // Disable tail duplication for this case for now, until the problem is
- // fixed.
- for (auto SB : TailBB.successors()) {
- for (auto &I : *SB) {
- if (!I.isPHI())
- break;
- unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB);
- assert(Idx != 0);
- MachineOperand &PU = I.getOperand(Idx);
- if (PU.getSubReg() != 0)
- return false;
- }
- }
-
- if (HasIndirectbr && PreRegAlloc)
- return true;
-
- if (IsSimple)
- return true;
-
- if (!PreRegAlloc)
- return true;
-
- return canCompletelyDuplicateBB(TailBB);
-}
-
-/// True if this BB has only one unconditional jump.
-bool
-TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) {
- if (TailBB->succ_size() != 1)
- return false;
- if (TailBB->pred_empty())
- return false;
- MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr();
- if (I == TailBB->end())
- return true;
- return I->isUnconditionalBranch();
-}
-
-static bool
-bothUsedInPHI(const MachineBasicBlock &A,
- SmallPtrSet<MachineBasicBlock*, 8> SuccsB) {
- for (MachineBasicBlock *BB : A.successors())
- if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI())
- return true;
-
- return false;
-}
-
-bool
-TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
- for (MachineBasicBlock *PredBB : BB.predecessors()) {
- if (PredBB->succ_size() > 1)
- return false;
-
- MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
- SmallVector<MachineOperand, 4> PredCond;
- if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
- return false;
-
- if (!PredCond.empty())
- return false;
- }
- return true;
-}
-
-bool
-TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- const DenseSet<unsigned> &UsedByPhi,
- SmallVectorImpl<MachineInstr *> &Copies) {
- SmallPtrSet<MachineBasicBlock*, 8> Succs(TailBB->succ_begin(),
- TailBB->succ_end());
- SmallVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
- TailBB->pred_end());
- bool Changed = false;
- for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
- PE = Preds.end(); PI != PE; ++PI) {
- MachineBasicBlock *PredBB = *PI;
-
- if (PredBB->hasEHPadSuccessor())
- continue;
-
- if (bothUsedInPHI(*PredBB, Succs))
- continue;
-
- MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
- SmallVector<MachineOperand, 4> PredCond;
- if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
- continue;
-
- Changed = true;
- DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
- << "From simple Succ: " << *TailBB);
-
- MachineBasicBlock *NewTarget = *TailBB->succ_begin();
- MachineBasicBlock *NextBB = &*std::next(PredBB->getIterator());
-
- // Make PredFBB explicit.
- if (PredCond.empty())
- PredFBB = PredTBB;
-
- // Make fall through explicit.
- if (!PredTBB)
- PredTBB = NextBB;
- if (!PredFBB)
- PredFBB = NextBB;
-
- // Redirect
- if (PredFBB == TailBB)
- PredFBB = NewTarget;
- if (PredTBB == TailBB)
- PredTBB = NewTarget;
-
- // Make the branch unconditional if possible
- if (PredTBB == PredFBB) {
- PredCond.clear();
- PredFBB = nullptr;
- }
-
- // Avoid adding fall through branches.
- if (PredFBB == NextBB)
- PredFBB = nullptr;
- if (PredTBB == NextBB && PredFBB == nullptr)
- PredTBB = nullptr;
-
- TII->RemoveBranch(*PredBB);
-
- if (PredTBB)
- TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
-
- if (!PredBB->isSuccessor(NewTarget))
- PredBB->replaceSuccessor(TailBB, NewTarget);
- else {
- PredBB->removeSuccessor(TailBB, true);
- assert(PredBB->succ_size() <= 1);
- }
-
- TDBBs.push_back(PredBB);
- }
- return Changed;
-}
-
-/// If it is profitable, duplicate TailBB's contents in each
-/// of its predecessors.
-bool
-TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
- bool IsSimple,
- MachineFunction &MF,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- SmallVectorImpl<MachineInstr *> &Copies) {
- DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
-
- DenseSet<unsigned> UsedByPhi;
- getRegsUsedByPHIs(*TailBB, &UsedByPhi);
-
- if (IsSimple)
- return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi, Copies);
-
- // Iterate through all the unique predecessors and tail-duplicate this
- // block into them, if possible. Copying the list ahead of time also
- // avoids trouble with the predecessor list reallocating.
- bool Changed = false;
- SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
- TailBB->pred_end());
- for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
- PE = Preds.end(); PI != PE; ++PI) {
- MachineBasicBlock *PredBB = *PI;
-
- assert(TailBB != PredBB &&
- "Single-block loop should have been rejected earlier!");
- // EH edges are ignored by AnalyzeBranch.
- if (PredBB->succ_size() > 1)
- continue;
-
- MachineBasicBlock *PredTBB, *PredFBB;
- SmallVector<MachineOperand, 4> PredCond;
- if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
- continue;
- if (!PredCond.empty())
- continue;
- // Don't duplicate into a fall-through predecessor (at least for now).
- if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
- continue;
-
- DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
- << "From Succ: " << *TailBB);
-
- TDBBs.push_back(PredBB);
-
- // Remove PredBB's unconditional branch.
- TII->RemoveBranch(*PredBB);
-
- if (RS && !TailBB->livein_empty()) {
- // Update PredBB livein.
- RS->enterBasicBlock(PredBB);
- if (!PredBB->empty())
- RS->forward(std::prev(PredBB->end()));
- for (const auto &LI : TailBB->liveins()) {
- if (!RS->isRegUsed(LI.PhysReg, false))
- // If a register is previously livein to the tail but it's not live
- // at the end of predecessor BB, then it should be added to its
- // livein list.
- PredBB->addLiveIn(LI);
- }
- }
-
- // Clone the contents of TailBB into PredBB.
- DenseMap<unsigned, unsigned> LocalVRMap;
- SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
- // Use instr_iterator here to properly handle bundles, e.g.
- // ARM Thumb2 IT block.
- MachineBasicBlock::instr_iterator I = TailBB->instr_begin();
- while (I != TailBB->instr_end()) {
- MachineInstr *MI = &*I;
- ++I;
- if (MI->isPHI()) {
- // Replace the uses of the def of the PHI with the register coming
- // from PredBB.
- ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
- } else {
- // Replace def of virtual registers with new registers, and update
- // uses with PHI source register or the new registers.
- DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
- }
- }
- MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
- for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
- Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
- TII->get(TargetOpcode::COPY),
- CopyInfos[i].first).addReg(CopyInfos[i].second));
- }
-
- // Simplify
- TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
-
- NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
-
- // Update the CFG.
- PredBB->removeSuccessor(PredBB->succ_begin());
- assert(PredBB->succ_empty() &&
- "TailDuplicate called on block with multiple successors!");
- for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
- E = TailBB->succ_end(); I != E; ++I)
- PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
-
- Changed = true;
- ++NumTailDups;
- }
-
- // If TailBB was duplicated into all its predecessors except for the prior
- // block, which falls through unconditionally, move the contents of this
- // block into the prior block.
- MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
- MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
- SmallVector<MachineOperand, 4> PriorCond;
- // This has to check PrevBB->succ_size() because EH edges are ignored by
- // AnalyzeBranch.
- if (PrevBB->succ_size() == 1 &&
- !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
- PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
- !TailBB->hasAddressTaken()) {
- DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
- << "From MBB: " << *TailBB);
- if (PreRegAlloc) {
- DenseMap<unsigned, unsigned> LocalVRMap;
- SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
- MachineBasicBlock::iterator I = TailBB->begin();
- // Process PHI instructions first.
- while (I != TailBB->end() && I->isPHI()) {
- // Replace the uses of the def of the PHI with the register coming
- // from PredBB.
- MachineInstr *MI = &*I++;
- ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
- if (MI->getParent())
- MI->eraseFromParent();
- }
-
- // Now copy the non-PHI instructions.
- while (I != TailBB->end()) {
- // Replace def of virtual registers with new registers, and update
- // uses with PHI source register or the new registers.
- MachineInstr *MI = &*I++;
- assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
- DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
- MI->eraseFromParent();
- }
- MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
- for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
- Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(),
- TII->get(TargetOpcode::COPY),
- CopyInfos[i].first)
- .addReg(CopyInfos[i].second));
- }
- } else {
- // No PHIs to worry about, just splice the instructions over.
- PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
- }
- PrevBB->removeSuccessor(PrevBB->succ_begin());
- assert(PrevBB->succ_empty());
- PrevBB->transferSuccessors(TailBB);
- TDBBs.push_back(PrevBB);
- Changed = true;
- }
-
- // If this is after register allocation, there are no phis to fix.
- if (!PreRegAlloc)
- return Changed;
-
- // If we made no changes so far, we are safe.
- if (!Changed)
- return Changed;
-
-
- // Handle the nasty case in that we duplicated a block that is part of a loop
- // into some but not all of its predecessors. For example:
- // 1 -> 2 <-> 3 |
- // \ |
- // \---> rest |
- // if we duplicate 2 into 1 but not into 3, we end up with
- // 12 -> 3 <-> 2 -> rest |
- // \ / |
- // \----->-----/ |
- // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
- // with a phi in 3 (which now dominates 2).
- // What we do here is introduce a copy in 3 of the register defined by the
- // phi, just like when we are duplicating 2 into 3, but we don't copy any
- // real instructions or remove the 3 -> 2 edge from the phi in 2.
- for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
- PE = Preds.end(); PI != PE; ++PI) {
- MachineBasicBlock *PredBB = *PI;
- if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
- continue;
-
- // EH edges
- if (PredBB->succ_size() != 1)
- continue;
-
- DenseMap<unsigned, unsigned> LocalVRMap;
- SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
- MachineBasicBlock::iterator I = TailBB->begin();
- // Process PHI instructions first.
- while (I != TailBB->end() && I->isPHI()) {
- // Replace the uses of the def of the PHI with the register coming
- // from PredBB.
- MachineInstr *MI = &*I++;
- ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
- }
- MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
- for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
- Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
- TII->get(TargetOpcode::COPY),
- CopyInfos[i].first).addReg(CopyInfos[i].second));
- }
- }
-
- return Changed;
-}
-
-/// Remove the specified dead machine basic block from the function, updating
-/// the CFG.
-void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) {
- assert(MBB->pred_empty() && "MBB must be dead!");
- DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
-
- // Remove all successors.
- while (!MBB->succ_empty())
- MBB->removeSuccessor(MBB->succ_end()-1);
-
- // Remove the block.
- MBB->eraseFromParent();
-}
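After the rewrite above, the pass is a thin wrapper: any other MachineFunctionPass that wants tail duplication can reuse the extracted utility with essentially the same three steps (a sketch of the usage already shown in runOnMachineFunction, assuming the analyses are requested via getAnalysisUsage):

TailDuplicator Duplicator;
Duplicator.initMF(MF, getAnalysisIfAvailable<MachineModuleInfo>(),
                  &getAnalysis<MachineBranchProbabilityInfo>());
bool Changed = false;
// tailDuplicateBlocks() returns true while it keeps finding candidates, so
// iterate it to a fixed point.
while (Duplicator.tailDuplicateBlocks(MF))
  Changed = true;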
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
new file mode 100644
index 000000000000..847a09349a59
--- /dev/null
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -0,0 +1,932 @@
+//===-- TailDuplicator.cpp - Duplicate blocks into predecessors' tails ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility class duplicates basic blocks ending in unconditional branches
+// into the tails of their predecessors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailduplication"
+
+STATISTIC(NumTails, "Number of tails duplicated");
+STATISTIC(NumTailDups, "Number of tail duplicated blocks");
+STATISTIC(NumTailDupAdded,
+ "Number of instructions added due to tail duplication");
+STATISTIC(NumTailDupRemoved,
+ "Number of instructions removed due to tail duplication");
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs, "Number of phis added");
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDuplicateSize(
+ "tail-dup-size",
+ cl::desc("Maximum instructions to consider tail duplicating"), cl::init(2),
+ cl::Hidden);
+
+static cl::opt<bool>
+ TailDupVerify("tail-dup-verify",
+ cl::desc("Verify sanity of PHI instructions during taildup"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> TailDupLimit("tail-dup-limit", cl::init(~0U),
+ cl::Hidden);
+
+namespace llvm {
+
+void TailDuplicator::initMF(MachineFunction &MF, const MachineModuleInfo *MMIin,
+ const MachineBranchProbabilityInfo *MBPIin) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MMI = MMIin;
+ MBPI = MBPIin;
+
+ assert(MBPI != nullptr && "Machine Branch Probability Info required");
+
+ PreRegAlloc = MRI->isSSA();
+}
+
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+ for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ SmallSetVector<MachineBasicBlock *, 8> Preds(MBB->pred_begin(),
+ MBB->pred_end());
+ MachineBasicBlock::iterator MI = MBB->begin();
+ while (MI != MBB->end()) {
+ if (!MI->isPHI())
+ break;
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+ bool Found = false;
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (PHIBB == PredBB) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+ dbgs() << " missing input from predecessor BB#"
+ << PredBB->getNumber() << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (CheckExtra && !Preds.count(PHIBB)) {
+ dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber() << ": "
+ << *MI;
+ dbgs() << " extra input from predecessor BB#" << PHIBB->getNumber()
+ << '\n';
+ llvm_unreachable(nullptr);
+ }
+ if (PHIBB->getNumber() < 0) {
+ dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+ dbgs() << " non-existing BB#" << PHIBB->getNumber() << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+ ++MI;
+ }
+ }
+}
+
+/// Tail duplicate the block and cleanup.
+bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple,
+ MachineBasicBlock *MBB) {
+ // Save the successors list.
+ SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
+ MBB->succ_end());
+
+ SmallVector<MachineBasicBlock *, 8> TDBBs;
+ SmallVector<MachineInstr *, 16> Copies;
+ if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies))
+ return false;
+
+ ++NumTails;
+
+ SmallVector<MachineInstr *, 8> NewPHIs;
+ MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
+ // TailBB's immediate successors are now successors of those predecessors
+ // which duplicated TailBB. Add the predecessors as sources to the PHI
+ // instructions.
+ bool isDead = MBB->pred_empty() && !MBB->hasAddressTaken();
+ if (PreRegAlloc)
+ updateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+ // If it is dead, remove it.
+ if (isDead) {
+ NumTailDupRemoved += MBB->size();
+ removeDeadBlock(MBB);
+ ++NumDeadBlocks;
+ }
+
+ // Update SSA form.
+ if (!SSAUpdateVRs.empty()) {
+ for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+ unsigned VReg = SSAUpdateVRs[i];
+ SSAUpdate.Initialize(VReg);
+
+ // If the original definition is still around, add it as an available
+ // value.
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+ MachineBasicBlock *DefBB = nullptr;
+ if (DefMI) {
+ DefBB = DefMI->getParent();
+ SSAUpdate.AddAvailableValue(DefBB, VReg);
+ }
+
+ // Add the new vregs as available values.
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(VReg);
+ for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = LI->second[j].first;
+ unsigned SrcReg = LI->second[j].second;
+ SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+ }
+
+ // Rewrite uses that are outside of the original def's block.
+ MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+ while (UI != MRI->use_end()) {
+ MachineOperand &UseMO = *UI;
+ MachineInstr *UseMI = UseMO.getParent();
+ ++UI;
+ if (UseMI->isDebugValue()) {
+ // SSAUpdate can replace the use with an undef. That creates
+ // a debug instruction that is a kill.
+ // FIXME: Should it be SSAUpdate's job to delete debug instructions
+ // instead of replacing the use with undef?
+ UseMI->eraseFromParent();
+ continue;
+ }
+ if (UseMI->getParent() == DefBB && !UseMI->isPHI())
+ continue;
+ SSAUpdate.RewriteUse(UseMO);
+ }
+ }
+
+ SSAUpdateVRs.clear();
+ SSAUpdateVals.clear();
+ }
+
+ // Eliminate some of the copies inserted by tail duplication to maintain
+ // SSA form.
+ for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+ MachineInstr *Copy = Copies[i];
+ if (!Copy->isCopy())
+ continue;
+ unsigned Dst = Copy->getOperand(0).getReg();
+ unsigned Src = Copy->getOperand(1).getReg();
+ if (MRI->hasOneNonDBGUse(Src) &&
+ MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
+ // Copy is the only use. Do trivial copy propagation here.
+ MRI->replaceRegWith(Dst, Src);
+ Copy->eraseFromParent();
+ }
+ }
+
+ if (NewPHIs.size())
+ NumAddedPHIs += NewPHIs.size();
+
+ return true;
+}
+
+/// Look for small blocks that are unconditionally branched to and do not fall
+/// through. Tail-duplicate their instructions into their predecessors to
+/// eliminate (dynamic) branches.
+bool TailDuplicator::tailDuplicateBlocks(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ if (PreRegAlloc && TailDupVerify) {
+ DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+ VerifyPHIs(MF, true);
+ }
+
+ for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E;) {
+ MachineBasicBlock *MBB = &*I++;
+
+ if (NumTails == TailDupLimit)
+ break;
+
+ bool IsSimple = isSimpleBB(MBB);
+
+ if (!shouldTailDuplicate(MF, IsSimple, *MBB))
+ continue;
+
+ MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB);
+ }
+
+ if (PreRegAlloc && TailDupVerify)
+ VerifyPHIs(MF, false);
+
+ return MadeChange;
+}
+
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+ const MachineRegisterInfo *MRI) {
+ for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
+ if (UseMI.isDebugValue())
+ continue;
+ if (UseMI.getParent() != BB)
+ return true;
+ }
+ return false;
+}
+
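+// Return the operand index of the PHI source register coming from SrcBB, or 0
+// if SrcBB is not an incoming block of the PHI.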
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+ if (MI->getOperand(i + 1).getMBB() == SrcBB)
+ return i;
+ return 0;
+}
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are liveout while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+ DenseSet<unsigned> *UsedByPhi) {
+ for (const auto &MI : BB) {
+ if (!MI.isPHI())
+ break;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ unsigned SrcReg = MI.getOperand(i).getReg();
+ UsedByPhi->insert(SrcReg);
+ }
+ }
+}
+
+/// Add a definition and source virtual register pair for SSA update.
+void TailDuplicator::addSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+ MachineBasicBlock *BB) {
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(OrigReg);
+ if (LI != SSAUpdateVals.end())
+ LI->second.push_back(std::make_pair(BB, NewReg));
+ else {
+ AvailableValsTy Vals;
+ Vals.push_back(std::make_pair(BB, NewReg));
+ SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+ SSAUpdateVRs.push_back(OrigReg);
+ }
+}
+
+/// Process a PHI node in TailBB by turning it into a copy in PredBB. Remember
+/// the source register that's contributed by PredBB and update the SSA update map.
+void TailDuplicator::processPHI(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
+ SmallVectorImpl<std::pair<unsigned, RegSubRegPair>> &Copies,
+ const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
+ unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+ assert(SrcOpIdx && "Unable to find matching PHI source?");
+ unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+ unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg)));
+
+ // Insert a copy from source to the end of the block. The def register is the
+ // available value liveout of the block.
+ unsigned NewDef = MRI->createVirtualRegister(RC);
+ Copies.push_back(std::make_pair(NewDef, RegSubRegPair(SrcReg, SrcSubReg)));
+ if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
+ addSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+ if (!Remove)
+ return;
+
+ // Remove PredBB from the PHI node.
+ MI->RemoveOperand(SrcOpIdx + 1);
+ MI->RemoveOperand(SrcOpIdx);
+ if (MI->getNumOperands() == 1)
+ MI->eraseFromParent();
+}
+
+/// Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicator::duplicateInstruction(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ MachineFunction &MF,
+ DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
+ const DenseSet<unsigned> &UsedByPhi) {
+ MachineInstr *NewMI = TII->duplicate(*MI, MF);
+ if (PreRegAlloc) {
+ for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (MO.isDef()) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
+ addSSAUpdateEntry(Reg, NewReg, PredBB);
+ } else {
+ auto VI = LocalVRMap.find(Reg);
+ if (VI != LocalVRMap.end()) {
+ // Need to make sure that the register class of the mapped register
+ // will satisfy the constraints of the class of the register being
+ // replaced.
+ auto *OrigRC = MRI->getRegClass(Reg);
+ auto *MappedRC = MRI->getRegClass(VI->second.Reg);
+ const TargetRegisterClass *ConstrRC;
+ if (VI->second.SubReg != 0) {
+ ConstrRC = TRI->getMatchingSuperRegClass(MappedRC, OrigRC,
+ VI->second.SubReg);
+ if (ConstrRC) {
+ // The actual constraining (as in "find appropriate new class")
+ // is done by getMatchingSuperRegClass, so now we only need to
+ // change the class of the mapped register.
+ MRI->setRegClass(VI->second.Reg, ConstrRC);
+ }
+ } else {
+ // For mapped registers that do not have sub-registers, simply
+ // restrict their class to match the original one.
+ ConstrRC = MRI->constrainRegClass(VI->second.Reg, OrigRC);
+ }
+
+ if (ConstrRC) {
+ // If the class constraining succeeded, we can simply replace
+ // the old register with the mapped one.
+ MO.setReg(VI->second.Reg);
+ // We have Reg -> VI.Reg:VI.SubReg, so if Reg is used with a
+ // sub-register, we need to compose the sub-register indices.
+ MO.setSubReg(TRI->composeSubRegIndices(MO.getSubReg(),
+ VI->second.SubReg));
+ } else {
+ // The direct replacement is not possible due to failing register
+ // class constraints. An explicit COPY is necessary. Create one
+ // that can be reused.
+ auto *NewRC = MI->getRegClassConstraint(i, TII, TRI);
+ if (NewRC == nullptr)
+ NewRC = OrigRC;
+ unsigned NewReg = MRI->createVirtualRegister(NewRC);
+ BuildMI(*PredBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VI->second.Reg, 0, VI->second.SubReg);
+ LocalVRMap.erase(VI);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ MO.setReg(NewReg);
+ // The composed VI.Reg:VI.SubReg is replaced with NewReg, which
+ // is equivalent to the whole register Reg. Hence, Reg:subreg
+ // is the same as NewReg:subreg, so keep the sub-register index
+ // unchanged.
+ }
+ // Clear any kill flags from this operand. The new register could
+ // have uses after this one, so kills are not valid here.
+ MO.setIsKill(false);
+ }
+ }
+ }
+ }
+ PredBB->insert(PredBB->instr_end(), NewMI);
+}
+
+/// After FromBB is tail duplicated into its predecessor blocks, the successors
+/// have gained new predecessors. Update the PHI instructions in them
+/// accordingly.
+void TailDuplicator::updateSuccessorsPHIs(
+ MachineBasicBlock *FromBB, bool isDead,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallSetVector<MachineBasicBlock *, 8> &Succs) {
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator SI = Succs.begin(),
+ SE = Succs.end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *SuccBB = *SI;
+ for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+ II != EE; ++II) {
+ if (!II->isPHI())
+ break;
+ MachineInstrBuilder MIB(*FromBB->getParent(), II);
+ unsigned Idx = 0;
+ for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+ MachineOperand &MO = II->getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ Idx = i;
+ break;
+ }
+ }
+
+ assert(Idx != 0);
+ MachineOperand &MO0 = II->getOperand(Idx);
+ unsigned Reg = MO0.getReg();
+ if (isDead) {
+ // Folded into the previous BB.
+ // There could be duplicate phi source entries. FIXME: Should sdisel
+ // or an earlier pass have fixed this?
+ for (unsigned i = II->getNumOperands() - 2; i != Idx; i -= 2) {
+ MachineOperand &MO = II->getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ II->RemoveOperand(i + 1);
+ II->RemoveOperand(i);
+ }
+ }
+ } else
+ Idx = 0;
+
+ // If Idx is set, the operands at Idx and Idx+1 must be removed.
+ // We reuse the location to avoid expensive RemoveOperand calls.
+
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(Reg);
+ if (LI != SSAUpdateVals.end()) {
+ // This register is defined in the tail block.
+ for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = LI->second[j].first;
+ // If we didn't duplicate a bb into a particular predecessor, we
+ // might still have added an entry to SSAUpdateVals to correctly
+ // recompute SSA. In that case, avoid adding a dummy extra argument
+ // to this PHI.
+ if (!SrcBB->isSuccessor(SuccBB))
+ continue;
+
+ unsigned SrcReg = LI->second[j].second;
+ if (Idx != 0) {
+ II->getOperand(Idx).setReg(SrcReg);
+ II->getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(SrcReg).addMBB(SrcBB);
+ }
+ }
+ } else {
+ // Live in tail block, must also be live in predecessors.
+ for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = TDBBs[j];
+ if (Idx != 0) {
+ II->getOperand(Idx).setReg(Reg);
+ II->getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(Reg).addMBB(SrcBB);
+ }
+ }
+ }
+ if (Idx != 0) {
+ II->RemoveOperand(Idx + 1);
+ II->RemoveOperand(Idx);
+ }
+ }
+ }
+}
+
+/// Determine if it is profitable to duplicate this block.
+bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF,
+ bool IsSimple,
+ MachineBasicBlock &TailBB) {
+ // Only duplicate blocks that end with unconditional branches.
+ if (TailBB.canFallThrough())
+ return false;
+
+ // Don't try to tail-duplicate single-block loops.
+ if (TailBB.isSuccessor(&TailBB))
+ return false;
+
+ // Set the limit on the cost to duplicate. When optimizing for size,
+ // duplicate only one instruction, because one branch instruction can be
+ // eliminated to compensate for the duplication.
+ unsigned MaxDuplicateCount;
+ if (TailDuplicateSize.getNumOccurrences() == 0 &&
+ // FIXME: Use Function::optForSize().
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
+ MaxDuplicateCount = 1;
+ else
+ MaxDuplicateCount = TailDuplicateSize;
+
+ // If the target has hardware branch prediction that can handle indirect
+ // branches, duplicating them can often make them predictable when there
+ // are common paths through the code. The limit needs to be high enough
+ // to allow undoing the effects of tail merging and other optimizations
+ // that rearrange the predecessors of the indirect branch.
+
+ bool HasIndirectbr = false;
+ if (!TailBB.empty())
+ HasIndirectbr = TailBB.back().isIndirectBranch();
+
+ if (HasIndirectbr && PreRegAlloc)
+ MaxDuplicateCount = 20;
+
+ // Check the instructions in the block to determine whether tail-duplication
+ // is invalid or unlikely to be profitable.
+ unsigned InstrCount = 0;
+ for (MachineInstr &MI : TailBB) {
+ // Non-duplicable things shouldn't be tail-duplicated.
+ if (MI.isNotDuplicable())
+ return false;
+
+ // Convergent instructions can be duplicated only if doing so doesn't add
+ // new control dependencies, which is what tail duplication would do here.
+ if (MI.isConvergent())
+ return false;
+
+ // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+ // A return may expand into a lot more instructions (e.g. reload of callee
+ // saved registers) after PEI.
+ if (PreRegAlloc && MI.isReturn())
+ return false;
+
+ // Avoid duplicating calls before register allocation. Calls present a
+ // barrier to register allocation, so duplicating them may end up increasing
+ // spills.
+ if (PreRegAlloc && MI.isCall())
+ return false;
+
+ if (!MI.isPHI() && !MI.isDebugValue())
+ InstrCount += 1;
+
+ if (InstrCount > MaxDuplicateCount)
+ return false;
+ }
+
+ // Check if any of the successors of TailBB has a PHI node in which the
+ // value corresponding to TailBB uses a subregister.
+ // If a phi node uses a register paired with a subregister, the actual
+ // "value type" of the phi may differ from the type of the register without
+ // any subregisters. Due to a bug, tail duplication may add a new operand
+ // without the necessary subregister, producing invalid code. This is
+ // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll.
+ // Disable tail duplication for this case for now, until the problem is
+ // fixed.
+ for (auto SB : TailBB.successors()) {
+ for (auto &I : *SB) {
+ if (!I.isPHI())
+ break;
+ unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB);
+ assert(Idx != 0);
+ MachineOperand &PU = I.getOperand(Idx);
+ if (PU.getSubReg() != 0)
+ return false;
+ }
+ }
+
+ if (HasIndirectbr && PreRegAlloc)
+ return true;
+
+ if (IsSimple)
+ return true;
+
+ if (!PreRegAlloc)
+ return true;
+
+ return canCompletelyDuplicateBB(TailBB);
+}
+
+/// True if this BB has only one unconditional jump.
+bool TailDuplicator::isSimpleBB(MachineBasicBlock *TailBB) {
+ if (TailBB->succ_size() != 1)
+ return false;
+ if (TailBB->pred_empty())
+ return false;
+ MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr();
+ if (I == TailBB->end())
+ return true;
+ return I->isUnconditionalBranch();
+}
+
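+// Return true if any successor of A is also in SuccsB and begins with a PHI.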
+static bool bothUsedInPHI(const MachineBasicBlock &A,
+ const SmallPtrSet<MachineBasicBlock *, 8> &SuccsB) {
+ for (MachineBasicBlock *BB : A.successors())
+ if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI())
+ return true;
+
+ return false;
+}
+
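+/// Return true if every predecessor of BB has a single successor and ends in an
+/// analyzable unconditional branch, so BB can be duplicated into all of them.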
+bool TailDuplicator::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
+ for (MachineBasicBlock *PredBB : BB.predecessors()) {
+ if (PredBB->succ_size() > 1)
+ return false;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ return false;
+
+ if (!PredCond.empty())
+ return false;
+ }
+ return true;
+}
+
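+/// Tail-duplicate a simple block by rewriting the branches of its predecessors
+/// to jump directly to its single successor.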
+bool TailDuplicator::duplicateSimpleBB(
+ MachineBasicBlock *TailBB, SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ const DenseSet<unsigned> &UsedByPhi,
+ SmallVectorImpl<MachineInstr *> &Copies) {
+ SmallPtrSet<MachineBasicBlock *, 8> Succs(TailBB->succ_begin(),
+ TailBB->succ_end());
+ SmallVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
+ TailBB->pred_end());
+ bool Changed = false;
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+
+ if (PredBB->hasEHPadSuccessor())
+ continue;
+
+ if (bothUsedInPHI(*PredBB, Succs))
+ continue;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ continue;
+
+ Changed = true;
+ DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From simple Succ: " << *TailBB);
+
+ MachineBasicBlock *NewTarget = *TailBB->succ_begin();
+ MachineBasicBlock *NextBB = &*std::next(PredBB->getIterator());
+
+ // Make PredFBB explicit.
+ if (PredCond.empty())
+ PredFBB = PredTBB;
+
+ // Make fall through explicit.
+ if (!PredTBB)
+ PredTBB = NextBB;
+ if (!PredFBB)
+ PredFBB = NextBB;
+
+ // Redirect edges that targeted TailBB to the new target.
+ if (PredFBB == TailBB)
+ PredFBB = NewTarget;
+ if (PredTBB == TailBB)
+ PredTBB = NewTarget;
+
+ // Make the branch unconditional if possible
+ if (PredTBB == PredFBB) {
+ PredCond.clear();
+ PredFBB = nullptr;
+ }
+
+ // Avoid adding fall through branches.
+ if (PredFBB == NextBB)
+ PredFBB = nullptr;
+ if (PredTBB == NextBB && PredFBB == nullptr)
+ PredTBB = nullptr;
+
+ TII->RemoveBranch(*PredBB);
+
+ if (!PredBB->isSuccessor(NewTarget))
+ PredBB->replaceSuccessor(TailBB, NewTarget);
+ else {
+ PredBB->removeSuccessor(TailBB, true);
+ assert(PredBB->succ_size() <= 1);
+ }
+
+ if (PredTBB)
+ TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+
+ TDBBs.push_back(PredBB);
+ }
+ return Changed;
+}
+
+/// If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple,
+ MachineBasicBlock *TailBB,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallVectorImpl<MachineInstr *> &Copies) {
+ DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
+ DenseSet<unsigned> UsedByPhi;
+ getRegsUsedByPHIs(*TailBB, &UsedByPhi);
+
+ if (IsSimple)
+ return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi, Copies);
+
+ // Iterate through all the unique predecessors and tail-duplicate this
+ // block into them, if possible. Copying the list ahead of time also
+ // avoids trouble with the predecessor list reallocating.
+ bool Changed = false;
+ SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
+ TailBB->pred_end());
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+
+ assert(TailBB != PredBB &&
+ "Single-block loop should have been rejected earlier!");
+ // EH edges are ignored by AnalyzeBranch.
+ if (PredBB->succ_size() > 1)
+ continue;
+
+ MachineBasicBlock *PredTBB, *PredFBB;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ continue;
+ if (!PredCond.empty())
+ continue;
+ // Don't duplicate into a fall-through predecessor (at least for now).
+ if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+ continue;
+
+ DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From Succ: " << *TailBB);
+
+ TDBBs.push_back(PredBB);
+
+ // Remove PredBB's unconditional branch.
+ TII->RemoveBranch(*PredBB);
+
+ // Clone the contents of TailBB into PredBB.
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ // Use instr_iterator here to properly handle bundles, e.g.
+ // ARM Thumb2 IT block.
+ MachineBasicBlock::instr_iterator I = TailBB->instr_begin();
+ while (I != TailBB->instr_end()) {
+ MachineInstr *MI = &*I;
+ ++I;
+ if (MI->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ } else {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ duplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
+ }
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+
+ // Simplify
+ TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
+
+ NumTailDupAdded += TailBB->size() - 1; // subtract one for removed branch
+
+ // Update the CFG.
+ PredBB->removeSuccessor(PredBB->succ_begin());
+ assert(PredBB->succ_empty() &&
+ "TailDuplicate called on block with multiple successors!");
+ for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
+ E = TailBB->succ_end();
+ I != E; ++I)
+ PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
+
+ Changed = true;
+ ++NumTailDups;
+ }
+
+ // If TailBB was duplicated into all its predecessors except for the prior
+ // block, which falls through unconditionally, move the contents of this
+ // block into the prior block.
+ MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
+ MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
+ SmallVector<MachineOperand, 4> PriorCond;
+ // This has to check PrevBB->succ_size() because EH edges are ignored by
+ // AnalyzeBranch.
+ if (PrevBB->succ_size() == 1 &&
+ !TII->analyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
+ PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
+ !TailBB->hasAddressTaken()) {
+ DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+ << "From MBB: " << *TailBB);
+ if (PreRegAlloc) {
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ }
+
+ // Now copy the non-PHI instructions.
+ while (I != TailBB->end()) {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ MachineInstr *MI = &*I++;
+ assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
+ duplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
+ MI->eraseFromParent();
+ }
+ appendCopies(PrevBB, CopyInfos, Copies);
+ } else {
+ // No PHIs to worry about, just splice the instructions over.
+ PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+ }
+ PrevBB->removeSuccessor(PrevBB->succ_begin());
+ assert(PrevBB->succ_empty());
+ PrevBB->transferSuccessors(TailBB);
+ TDBBs.push_back(PrevBB);
+ Changed = true;
+ }
+
+ // If this is after register allocation, there are no phis to fix.
+ if (!PreRegAlloc)
+ return Changed;
+
+ // If we made no changes so far, there is nothing left to fix up.
+ if (!Changed)
+ return Changed;
+
+ // Handle the nasty case in which we duplicated a block that is part of a loop
+ // into some but not all of its predecessors. For example:
+ // 1 -> 2 <-> 3 |
+ // \ |
+ // \---> rest |
+ // if we duplicate 2 into 1 but not into 3, we end up with
+ // 12 -> 3 <-> 2 -> rest |
+ // \ / |
+ // \----->-----/ |
+ // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+ // with a phi in 3 (which now dominates 2).
+ // What we do here is introduce a copy in 3 of the register defined by the
+ // phi, just like when we are duplicating 2 into 3, but we don't copy any
+ // real instructions or remove the 3 -> 2 edge from the phi in 2.
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+ if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
+ continue;
+
+ // EH edges
+ if (PredBB->succ_size() != 1)
+ continue;
+
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+ }
+
+ return Changed;
+}
+
+/// At the end of the block \p MBB generate COPY instructions between registers
+/// described by \p CopyInfos. Append resulting instructions to \p Copies.
+void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
+ SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
+ SmallVectorImpl<MachineInstr*> &Copies) {
+ MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+ const MCInstrDesc &CopyD = TII->get(TargetOpcode::COPY);
+ for (auto &CI : CopyInfos) {
+ auto C = BuildMI(*MBB, Loc, DebugLoc(), CopyD, CI.first)
+ .addReg(CI.second.Reg, 0, CI.second.SubReg);
+ Copies.push_back(C);
+ }
+}
+
+/// Remove the specified dead machine basic block from the function, updating
+/// the CFG.
+void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+ // Remove all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end() - 1);
+
+ // Remove the block.
+ MBB->eraseFromParent();
+}
+
+} // End llvm namespace
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 679ade185e1c..cac7e63af328 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -12,13 +12,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/BitVector.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cstdlib>
@@ -59,16 +60,25 @@ bool TargetFrameLowering::needsFrameIndexResolution(
void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
- // Get the callee saved register list...
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // Resize before the early returns. Some backends expect that
+ // SavedRegs.size() == TRI.getNumRegs() after this call even if there are no
+ // saved registers.
+ SavedRegs.resize(TRI.getNumRegs());
+
+ // When interprocedural register allocation is enabled, caller-saved registers
+ // are preferred over callee-saved registers.
+ if (MF.getTarget().Options.EnableIPRA && isSafeForNoCSROpt(MF.getFunction()))
+ return;
+
+ // Get the callee saved register list...
const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
// Early exit if there are no callee saved registers.
if (!CSRegs || CSRegs[0] == 0)
return;
- SavedRegs.resize(TRI.getNumRegs());
-
// In Naked functions we aren't going to save any registers.
if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
return;
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 6eaf991ac700..e7330c60ed23 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cctype>
+
using namespace llvm;
static cl::opt<bool> DisableHazardRecognizer(
@@ -76,25 +77,27 @@ void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
/// may be overloaded in the target code to do that.
unsigned TargetInstrInfo::getInlineAsmLength(const char *Str,
const MCAsmInfo &MAI) const {
-
-
// Count the number of instructions in the asm.
bool atInsnStart = true;
- unsigned Length = 0;
+ unsigned InstCount = 0;
for (; *Str; ++Str) {
if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
- strlen(MAI.getSeparatorString())) == 0)
+ strlen(MAI.getSeparatorString())) == 0) {
atInsnStart = true;
- if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
- Length += MAI.getMaxInstLength();
+ } else if (strncmp(Str, MAI.getCommentString(),
+ strlen(MAI.getCommentString())) == 0) {
+ // Stop counting as an instruction after a comment until the next
+ // separator.
atInsnStart = false;
}
- if (atInsnStart && strncmp(Str, MAI.getCommentString(),
- strlen(MAI.getCommentString())) == 0)
+
+ if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ ++InstCount;
atInsnStart = false;
+ }
}
- return Length;
+ return InstCount * MAI.getMaxInstLength();
}
/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
@@ -108,23 +111,24 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
while (!MBB->succ_empty())
MBB->removeSuccessor(MBB->succ_begin());
+ // Save off the debug loc before erasing the instruction.
+ DebugLoc DL = Tail->getDebugLoc();
+
// Remove all the dead instructions from the end of MBB.
MBB->erase(Tail, MBB->end());
// If MBB isn't immediately before MBB, insert a branch to it.
if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
- InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(),
- Tail->getDebugLoc());
+ InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(), DL);
MBB->addSuccessor(NewDest);
}
-MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
- unsigned Idx1,
+MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI, unsigned Idx1,
unsigned Idx2) const {
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
bool HasDef = MCID.getNumDefs();
- if (HasDef && !MI->getOperand(0).isReg())
+ if (HasDef && !MI.getOperand(0).isReg())
// No idea how to commute this instruction. Target should implement its own.
return nullptr;
@@ -133,60 +137,62 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr *MI,
assert(findCommutedOpIndices(MI, CommutableOpIdx1, CommutableOpIdx2) &&
CommutableOpIdx1 == Idx1 && CommutableOpIdx2 == Idx2 &&
"TargetInstrInfo::CommuteInstructionImpl(): not commutable operands.");
- assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
+ assert(MI.getOperand(Idx1).isReg() && MI.getOperand(Idx2).isReg() &&
"This only knows how to commute register operands so far");
- unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
- unsigned Reg1 = MI->getOperand(Idx1).getReg();
- unsigned Reg2 = MI->getOperand(Idx2).getReg();
- unsigned SubReg0 = HasDef ? MI->getOperand(0).getSubReg() : 0;
- unsigned SubReg1 = MI->getOperand(Idx1).getSubReg();
- unsigned SubReg2 = MI->getOperand(Idx2).getSubReg();
- bool Reg1IsKill = MI->getOperand(Idx1).isKill();
- bool Reg2IsKill = MI->getOperand(Idx2).isKill();
- bool Reg1IsUndef = MI->getOperand(Idx1).isUndef();
- bool Reg2IsUndef = MI->getOperand(Idx2).isUndef();
- bool Reg1IsInternal = MI->getOperand(Idx1).isInternalRead();
- bool Reg2IsInternal = MI->getOperand(Idx2).isInternalRead();
+ unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI.getOperand(Idx1).getReg();
+ unsigned Reg2 = MI.getOperand(Idx2).getReg();
+ unsigned SubReg0 = HasDef ? MI.getOperand(0).getSubReg() : 0;
+ unsigned SubReg1 = MI.getOperand(Idx1).getSubReg();
+ unsigned SubReg2 = MI.getOperand(Idx2).getSubReg();
+ bool Reg1IsKill = MI.getOperand(Idx1).isKill();
+ bool Reg2IsKill = MI.getOperand(Idx2).isKill();
+ bool Reg1IsUndef = MI.getOperand(Idx1).isUndef();
+ bool Reg2IsUndef = MI.getOperand(Idx2).isUndef();
+ bool Reg1IsInternal = MI.getOperand(Idx1).isInternalRead();
+ bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead();
// If destination is tied to either of the commuted source register, then
// it must be updated.
if (HasDef && Reg0 == Reg1 &&
- MI->getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
+ MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
Reg2IsKill = false;
Reg0 = Reg2;
SubReg0 = SubReg2;
} else if (HasDef && Reg0 == Reg2 &&
- MI->getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
+ MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
Reg1IsKill = false;
Reg0 = Reg1;
SubReg0 = SubReg1;
}
+ MachineInstr *CommutedMI = nullptr;
if (NewMI) {
// Create a new instruction.
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
+ MachineFunction &MF = *MI.getParent()->getParent();
+ CommutedMI = MF.CloneMachineInstr(&MI);
+ } else {
+ CommutedMI = &MI;
}
if (HasDef) {
- MI->getOperand(0).setReg(Reg0);
- MI->getOperand(0).setSubReg(SubReg0);
+ CommutedMI->getOperand(0).setReg(Reg0);
+ CommutedMI->getOperand(0).setSubReg(SubReg0);
}
- MI->getOperand(Idx2).setReg(Reg1);
- MI->getOperand(Idx1).setReg(Reg2);
- MI->getOperand(Idx2).setSubReg(SubReg1);
- MI->getOperand(Idx1).setSubReg(SubReg2);
- MI->getOperand(Idx2).setIsKill(Reg1IsKill);
- MI->getOperand(Idx1).setIsKill(Reg2IsKill);
- MI->getOperand(Idx2).setIsUndef(Reg1IsUndef);
- MI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
- MI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
- MI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
- return MI;
+ CommutedMI->getOperand(Idx2).setReg(Reg1);
+ CommutedMI->getOperand(Idx1).setReg(Reg2);
+ CommutedMI->getOperand(Idx2).setSubReg(SubReg1);
+ CommutedMI->getOperand(Idx1).setSubReg(SubReg2);
+ CommutedMI->getOperand(Idx2).setIsKill(Reg1IsKill);
+ CommutedMI->getOperand(Idx1).setIsKill(Reg2IsKill);
+ CommutedMI->getOperand(Idx2).setIsUndef(Reg1IsUndef);
+ CommutedMI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
+ CommutedMI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
+ CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
+ return CommutedMI;
}
-MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
- bool NewMI,
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
// If OpIdx1 or OpIdx2 is not specified, then this method is free to choose
@@ -194,7 +200,7 @@ MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
// called below.
if ((OpIdx1 == CommuteAnyOperandIndex || OpIdx2 == CommuteAnyOperandIndex) &&
!findCommutedOpIndices(MI, OpIdx1, OpIdx2)) {
- assert(MI->isCommutable() &&
+ assert(MI.isCommutable() &&
"Precondition violation: MI must be commutable.");
return nullptr;
}
@@ -232,13 +238,13 @@ bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
return true;
}
-bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
+bool TargetInstrInfo::findCommutedOpIndices(MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
- assert(!MI->isBundle() &&
+ assert(!MI.isBundle() &&
"TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (!MCID.isCommutable())
return false;
@@ -250,39 +256,37 @@ bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
CommutableOpIdx1, CommutableOpIdx2))
return false;
- if (!MI->getOperand(SrcOpIdx1).isReg() ||
- !MI->getOperand(SrcOpIdx2).isReg())
+ if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
// No idea.
return false;
return true;
}
-bool
-TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
- if (!MI->isTerminator()) return false;
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator()) return false;
// Conditional branch is a special case.
- if (MI->isBranch() && !MI->isBarrier())
+ if (MI.isBranch() && !MI.isBarrier())
return true;
- if (!MI->isPredicable())
+ if (!MI.isPredicable())
return true;
return !isPredicated(MI);
}
bool TargetInstrInfo::PredicateInstruction(
- MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
bool MadeChange = false;
- assert(!MI->isBundle() &&
+ assert(!MI.isBundle() &&
"TargetInstrInfo::PredicateInstruction() can't handle bundles");
- const MCInstrDesc &MCID = MI->getDesc();
- if (!MI->isPredicable())
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MI.isPredicable())
return false;
- for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned j = 0, i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (MCID.OpInfo[i].isPredicate()) {
- MachineOperand &MO = MI->getOperand(i);
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg()) {
MO.setReg(Pred[j].getReg());
MadeChange = true;
@@ -299,13 +303,12 @@ bool TargetInstrInfo::PredicateInstruction(
return MadeChange;
}
-bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr &MI,
const MachineMemOperand *&MMO,
int &FrameIndex) const {
- for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
- oe = MI->memoperands_end();
- o != oe;
- ++o) {
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
if ((*o)->isLoad()) {
if (const FixedStackPseudoSourceValue *Value =
dyn_cast_or_null<FixedStackPseudoSourceValue>(
@@ -319,13 +322,12 @@ bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
return false;
}
-bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
+bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr &MI,
const MachineMemOperand *&MMO,
int &FrameIndex) const {
- for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
- oe = MI->memoperands_end();
- o != oe;
- ++o) {
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
if ((*o)->isStore()) {
if (const FixedStackPseudoSourceValue *Value =
dyn_cast_or_null<FixedStackPseudoSourceValue>(
@@ -372,40 +374,37 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg,
- unsigned SubIdx,
- const MachineInstr *Orig,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
MBB.insert(I, MI);
}
-bool
-TargetInstrInfo::produceSameValue(const MachineInstr *MI0,
- const MachineInstr *MI1,
- const MachineRegisterInfo *MRI) const {
- return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+bool TargetInstrInfo::produceSameValue(const MachineInstr &MI0,
+ const MachineInstr &MI1,
+ const MachineRegisterInfo *MRI) const {
+ return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
}
-MachineInstr *TargetInstrInfo::duplicate(MachineInstr *Orig,
+MachineInstr *TargetInstrInfo::duplicate(MachineInstr &Orig,
MachineFunction &MF) const {
- assert(!Orig->isNotDuplicable() &&
- "Instruction cannot be duplicated");
- return MF.CloneMachineInstr(Orig);
+ assert(!Orig.isNotDuplicable() && "Instruction cannot be duplicated");
+ return MF.CloneMachineInstr(&Orig);
}
// If the COPY instruction in MI can be folded to a stack operation, return
// the register class to use.
-static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
+static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
unsigned FoldIdx) {
- assert(MI->isCopy() && "MI must be a COPY instruction");
- if (MI->getNumOperands() != 2)
+ assert(MI.isCopy() && "MI must be a COPY instruction");
+ if (MI.getNumOperands() != 2)
return nullptr;
assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand");
- const MachineOperand &FoldOp = MI->getOperand(FoldIdx);
- const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx);
+ const MachineOperand &FoldOp = MI.getOperand(FoldIdx);
+ const MachineOperand &LiveOp = MI.getOperand(1 - FoldIdx);
if (FoldOp.getSubReg() || LiveOp.getSubReg())
return nullptr;
@@ -416,7 +415,7 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
"Cannot fold physregs");
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
@@ -433,17 +432,17 @@ void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
llvm_unreachable("Not a MachO target");
}
-static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr *MI,
+static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops, int FrameIndex,
const TargetInstrInfo &TII) {
unsigned StartIdx = 0;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case TargetOpcode::STACKMAP:
StartIdx = 2; // Skip ID, nShadowBytes.
break;
case TargetOpcode::PATCHPOINT: {
// For PatchPoint, the call args are not foldable.
- PatchPointOpers opers(MI);
+ PatchPointOpers opers(&MI);
StartIdx = opers.getVarIdx();
break;
}
@@ -459,15 +458,15 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr *MI,
}
MachineInstr *NewMI =
- MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
+ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
// No need to fold return, the meta data, and function arguments
for (unsigned i = 0; i < StartIdx; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
- for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
unsigned SpillSize;
unsigned SpillOffset;
@@ -495,35 +494,35 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr *MI,
/// operand folded, otherwise NULL is returned. The client is responsible for
/// removing the old instruction and adding the new one in the instruction
/// stream.
-MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
- ArrayRef<unsigned> Ops,
- int FI) const {
- unsigned Flags = 0;
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
+ ArrayRef<unsigned> Ops, int FI,
+ LiveIntervals *LIS) const {
+ auto Flags = MachineMemOperand::MONone;
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (MI->getOperand(Ops[i]).isDef())
+ if (MI.getOperand(Ops[i]).isDef())
Flags |= MachineMemOperand::MOStore;
else
Flags |= MachineMemOperand::MOLoad;
- MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock *MBB = MI.getParent();
assert(MBB && "foldMemoryOperand needs an inserted instruction");
MachineFunction &MF = *MBB->getParent();
MachineInstr *NewMI = nullptr;
- if (MI->getOpcode() == TargetOpcode::STACKMAP ||
- MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
// Fold stackmap/patchpoint.
NewMI = foldPatchpoint(MF, MI, Ops, FI, *this);
if (NewMI)
MBB->insert(MI, NewMI);
} else {
// Ask the target to do the actual folding.
- NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI);
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS);
}
if (NewMI) {
- NewMI->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
// Add a memory operand, foldMemoryOperandImpl doesn't do that.
assert((!(Flags & MachineMemOperand::MOStore) ||
NewMI->mayStore()) &&
@@ -542,14 +541,14 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
}
// Straight COPY may fold as load/store.
- if (!MI->isCopy() || Ops.size() != 1)
+ if (!MI.isCopy() || Ops.size() != 1)
return nullptr;
const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
if (!RC)
return nullptr;
- const MachineOperand &MO = MI->getOperand(1-Ops[0]);
+ const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
MachineBasicBlock::iterator Pos = MI;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
@@ -557,7 +556,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
else
loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
- return --Pos;
+ return &*--Pos;
}
bool TargetInstrInfo::hasReassociableOperands(
@@ -637,7 +636,6 @@ bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
bool TargetInstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
-
bool Commute;
if (isReassociationCandidate(Root, Commute)) {
// We found a sequence of instructions that may be suitable for a
@@ -656,7 +654,11 @@ bool TargetInstrInfo::getMachineCombinerPatterns(
return false;
}
-
+/// Return true when a code sequence can improve loop throughput.
+bool
+TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ return false;
+}
/// Attempt the reassociation transformation to reduce critical path length.
/// See the above comments before getMachineCombinerPatterns().
void TargetInstrInfo::reassociateOps(
@@ -768,75 +770,73 @@ void TargetInstrInfo::genAlternativeCodeSequence(
assert(Prev && "Unknown pattern for machine combiner");
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
- return;
}
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
-MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
ArrayRef<unsigned> Ops,
- MachineInstr *LoadMI) const {
- assert(LoadMI->canFoldAsLoad() && "LoadMI isn't foldable!");
+ MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+ assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!");
#ifndef NDEBUG
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!");
+ assert(MI.getOperand(Ops[i]).isUse() && "Folding load into def!");
#endif
- MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
// Ask the target to do the actual folding.
MachineInstr *NewMI = nullptr;
int FrameIndex = 0;
- if ((MI->getOpcode() == TargetOpcode::STACKMAP ||
- MI->getOpcode() == TargetOpcode::PATCHPOINT) &&
+ if ((MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) &&
isLoadFromStackSlot(LoadMI, FrameIndex)) {
// Fold stackmap/patchpoint.
NewMI = foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
if (NewMI)
- NewMI = MBB.insert(MI, NewMI);
+ NewMI = &*MBB.insert(MI, NewMI);
} else {
// Ask the target to do the actual folding.
- NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI);
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS);
}
if (!NewMI) return nullptr;
// Copy the memoperands from the load to the folded instruction.
- if (MI->memoperands_empty()) {
- NewMI->setMemRefs(LoadMI->memoperands_begin(),
- LoadMI->memoperands_end());
+ if (MI.memoperands_empty()) {
+ NewMI->setMemRefs(LoadMI.memoperands_begin(), LoadMI.memoperands_end());
}
else {
// Handle the rare case of folding multiple loads.
- NewMI->setMemRefs(MI->memoperands_begin(),
- MI->memoperands_end());
- for (MachineInstr::mmo_iterator I = LoadMI->memoperands_begin(),
- E = LoadMI->memoperands_end(); I != E; ++I) {
+ NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ for (MachineInstr::mmo_iterator I = LoadMI.memoperands_begin(),
+ E = LoadMI.memoperands_end();
+ I != E; ++I) {
NewMI->addMemOperand(MF, *I);
}
}
return NewMI;
}
-bool TargetInstrInfo::
-isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
- AliasAnalysis *AA) const {
- const MachineFunction &MF = *MI->getParent()->getParent();
+bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
+ const MachineInstr &MI, AliasAnalysis *AA) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
// Remat clients assume operand 0 is the defined register.
- if (!MI->getNumOperands() || !MI->getOperand(0).isReg())
+ if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
return false;
- unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned DefReg = MI.getOperand(0).getReg();
// A sub-register definition can only be rematerialized if the instruction
// doesn't read the other parts of the register. Otherwise it is really a
// read-modify-write operation on the full virtual register which cannot be
// moved safely.
if (TargetRegisterInfo::isVirtualRegister(DefReg) &&
- MI->getOperand(0).getSubReg() && MI->readsVirtualRegister(DefReg))
+ MI.getOperand(0).getSubReg() && MI.readsVirtualRegister(DefReg))
return false;
// A load from a fixed stack slot can be rematerialized. This may be
@@ -848,23 +848,22 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
return true;
// Avoid instructions obviously unsafe for remat.
- if (MI->isNotDuplicable() || MI->mayStore() ||
- MI->hasUnmodeledSideEffects())
+ if (MI.isNotDuplicable() || MI.mayStore() || MI.hasUnmodeledSideEffects())
return false;
// Don't remat inline asm. We have no idea how expensive it is
// even if it's side effect free.
- if (MI->isInlineAsm())
+ if (MI.isInlineAsm())
return false;
// Avoid instructions which load from potentially varying memory.
- if (MI->mayLoad() && !MI->isInvariantLoad(AA))
+ if (MI.mayLoad() && !MI.isInvariantLoad(AA))
return false;
// If any of the registers accessed are non-constant, conservatively assume
// the instruction is not rematerializable.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (Reg == 0)
@@ -901,8 +900,8 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
return true;
}
-int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
+int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
@@ -910,15 +909,15 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const {
unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
- if (MI->getOpcode() != FrameSetupOpcode &&
- MI->getOpcode() != FrameDestroyOpcode)
+ if (MI.getOpcode() != FrameSetupOpcode &&
+ MI.getOpcode() != FrameDestroyOpcode)
return 0;
-
- int SPAdj = MI->getOperand(0).getImm();
+
+ int SPAdj = MI.getOperand(0).getImm();
SPAdj = TFI->alignSPAdjust(SPAdj);
- if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) ||
- (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode))
+ if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||
+ (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode))
SPAdj = -SPAdj;
return SPAdj;
@@ -927,11 +926,11 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const {
/// isSchedulingBoundary - Test if the given instruction should be
/// considered a scheduling boundary. This primarily includes labels
/// and terminators.
-bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
// Terminators and labels can't be scheduled around.
- if (MI->isTerminator() || MI->isPosition())
+ if (MI.isTerminator() || MI.isPosition())
return true;
// Don't attempt to schedule around any instruction that defines
@@ -941,7 +940,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
// modification.
const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- return MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI);
+ return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI);
}
// Provide a global flag for disabling the PreRA hazard recognizer that targets
@@ -1010,13 +1009,12 @@ int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// MachineInstr latency interface.
//===----------------------------------------------------------------------===//
-unsigned
-TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const {
+unsigned TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const {
if (!ItinData || ItinData->isEmpty())
return 1;
- unsigned Class = MI->getDesc().getSchedClass();
+ unsigned Class = MI.getDesc().getSchedClass();
int UOps = ItinData->Itineraries[Class].NumMicroOps;
if (UOps >= 0)
return UOps;
@@ -1028,60 +1026,59 @@ TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
/// Return the default expected latency for a def based on it's opcode.
unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel,
- const MachineInstr *DefMI) const {
- if (DefMI->isTransient())
+ const MachineInstr &DefMI) const {
+ if (DefMI.isTransient())
return 0;
- if (DefMI->mayLoad())
+ if (DefMI.mayLoad())
return SchedModel.LoadLatency;
- if (isHighLatencyDef(DefMI->getOpcode()))
+ if (isHighLatencyDef(DefMI.getOpcode()))
return SchedModel.HighLatency;
return 1;
}
-unsigned TargetInstrInfo::getPredicationCost(const MachineInstr *) const {
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr &) const {
return 0;
}
-unsigned TargetInstrInfo::
-getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
- unsigned *PredCost) const {
+unsigned TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost) const {
// Default to one cycle for no itinerary. However, an "empty" itinerary may
// still have a MinLatency property, which getStageLatency checks.
if (!ItinData)
- return MI->mayLoad() ? 2 : 1;
+ return MI.mayLoad() ? 2 : 1;
- return ItinData->getStageLatency(MI->getDesc().getSchedClass());
+ return ItinData->getStageLatency(MI.getDesc().getSchedClass());
}
bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI,
+ const MachineInstr &DefMI,
unsigned DefIdx) const {
const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
if (!ItinData || ItinData->isEmpty())
return false;
- unsigned DefClass = DefMI->getDesc().getSchedClass();
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
return (DefCycle != -1 && DefCycle <= 1);
}
/// Both DefMI and UseMI must be valid. By default, call directly to the
/// itinerary. This may be overriden by the target.
-int TargetInstrInfo::
-getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- unsigned UseClass = UseMI->getDesc().getSchedClass();
+int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ unsigned UseClass = UseMI.getDesc().getSchedClass();
return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
}
/// If we can determine the operand latency from the def only, without itinerary
/// lookup, do so. Otherwise return -1.
int TargetInstrInfo::computeDefOperandLatency(
- const InstrItineraryData *ItinData,
- const MachineInstr *DefMI) const {
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI) const {
// Let the target hook getInstrLatency handle missing itineraries.
if (!ItinData)
@@ -1094,21 +1091,9 @@ int TargetInstrInfo::computeDefOperandLatency(
return -1;
}
-/// computeOperandLatency - Compute and return the latency of the given data
-/// dependent def and use when the operand indices are already known. UseMI may
-/// be NULL for an unknown use.
-///
-/// FindMin may be set to get the minimum vs. expected latency. Minimum
-/// latency is used for scheduling groups, while expected latency is for
-/// instruction cost and critical path.
-///
-/// Depending on the subtarget's itinerary properties, this may or may not need
-/// to call getOperandLatency(). For most subtargets, we don't need DefIdx or
-/// UseIdx to compute min latency.
-unsigned TargetInstrInfo::
-computeOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
+unsigned TargetInstrInfo::computeOperandLatency(
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+ unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const {
int DefLatency = computeDefOperandLatency(ItinData, DefMI);
if (DefLatency >= 0)
@@ -1118,9 +1103,9 @@ computeOperandLatency(const InstrItineraryData *ItinData,
int OperLatency = 0;
if (UseMI)
- OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, *UseMI, UseIdx);
else {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
}
if (OperLatency >= 0)
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 36a31c9d6461..6d3fe8ca6473 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -28,6 +28,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -43,6 +44,17 @@ static cl::opt<bool> JumpIsExpensiveOverride(
cl::desc("Do not create extra branches to split comparison logic."),
cl::Hidden);
+// Although this default value is arbitrary, it is not random. It is assumed
+// that a condition that evaluates the same way more often than this percentage
+// is best represented as control flow. Therefore, the default value N should be
+// set such that the win from N% correct executions is greater than the loss
+// from (100 - N)% mispredicted executions for the majority of intended targets.
+static cl::opt<int> MinPercentageForPredictableBranch(
+ "min-predictable-branch", cl::init(99),
+ cl::desc("Minimum percentage (0-100) that a condition must be either true "
+ "or false to assume that the condition is predictable"),
+ cl::Hidden);
+
/// InitLibcallNames - Set default libcall names.
///
static void InitLibcallNames(const char **Names, const Triple &TT) {
@@ -87,18 +99,6 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::UREM_I64] = "__umoddi3";
Names[RTLIB::UREM_I128] = "__umodti3";
- // These are generally not available.
- Names[RTLIB::SDIVREM_I8] = nullptr;
- Names[RTLIB::SDIVREM_I16] = nullptr;
- Names[RTLIB::SDIVREM_I32] = nullptr;
- Names[RTLIB::SDIVREM_I64] = nullptr;
- Names[RTLIB::SDIVREM_I128] = nullptr;
- Names[RTLIB::UDIVREM_I8] = nullptr;
- Names[RTLIB::UDIVREM_I16] = nullptr;
- Names[RTLIB::UDIVREM_I32] = nullptr;
- Names[RTLIB::UDIVREM_I64] = nullptr;
- Names[RTLIB::UDIVREM_I128] = nullptr;
-
Names[RTLIB::NEG_I32] = "__negsi2";
Names[RTLIB::NEG_I64] = "__negdi2";
Names[RTLIB::ADD_F32] = "__addsf3";
@@ -231,11 +231,21 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::COPYSIGN_F80] = "copysignl";
Names[RTLIB::COPYSIGN_F128] = "copysignl";
Names[RTLIB::COPYSIGN_PPCF128] = "copysignl";
+ Names[RTLIB::FPEXT_F32_PPCF128] = "__gcc_stoq";
+ Names[RTLIB::FPEXT_F64_PPCF128] = "__gcc_dtoq";
Names[RTLIB::FPEXT_F64_F128] = "__extenddftf2";
Names[RTLIB::FPEXT_F32_F128] = "__extendsftf2";
Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
- Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
- Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+ if (TT.isOSDarwin()) {
+ // For f16/f32 conversions, Darwin uses the standard naming scheme, instead
+ // of the gnueabi-style __gnu_*_ieee.
+ // FIXME: What about other targets?
+ Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2";
+ Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2";
+ } else {
+ Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
+ Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+ }
Names[RTLIB::FPROUND_F64_F16] = "__truncdfhf2";
Names[RTLIB::FPROUND_F80_F16] = "__truncxfhf2";
Names[RTLIB::FPROUND_F128_F16] = "__trunctfhf2";
@@ -243,10 +253,10 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2";
- Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2";
+ Names[RTLIB::FPROUND_PPCF128_F32] = "__gcc_qtos";
Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2";
- Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2";
+ Names[RTLIB::FPROUND_PPCF128_F64] = "__gcc_qtod";
Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
@@ -259,7 +269,7 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::FPTOSINT_F128_I32] = "__fixtfsi";
Names[RTLIB::FPTOSINT_F128_I64] = "__fixtfdi";
Names[RTLIB::FPTOSINT_F128_I128] = "__fixtfti";
- Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi";
+ Names[RTLIB::FPTOSINT_PPCF128_I32] = "__gcc_qtou";
Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
@@ -281,7 +291,7 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
Names[RTLIB::SINTTOFP_I32_F128] = "__floatsitf";
- Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf";
+ Names[RTLIB::SINTTOFP_I32_PPCF128] = "__gcc_itoq";
Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
@@ -296,7 +306,7 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
Names[RTLIB::UINTTOFP_I32_F128] = "__floatunsitf";
- Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf";
+ Names[RTLIB::UINTTOFP_I32_PPCF128] = "__gcc_utoq";
Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
@@ -310,27 +320,35 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::OEQ_F32] = "__eqsf2";
Names[RTLIB::OEQ_F64] = "__eqdf2";
Names[RTLIB::OEQ_F128] = "__eqtf2";
+ Names[RTLIB::OEQ_PPCF128] = "__gcc_qeq";
Names[RTLIB::UNE_F32] = "__nesf2";
Names[RTLIB::UNE_F64] = "__nedf2";
Names[RTLIB::UNE_F128] = "__netf2";
+ Names[RTLIB::UNE_PPCF128] = "__gcc_qne";
Names[RTLIB::OGE_F32] = "__gesf2";
Names[RTLIB::OGE_F64] = "__gedf2";
Names[RTLIB::OGE_F128] = "__getf2";
+ Names[RTLIB::OGE_PPCF128] = "__gcc_qge";
Names[RTLIB::OLT_F32] = "__ltsf2";
Names[RTLIB::OLT_F64] = "__ltdf2";
Names[RTLIB::OLT_F128] = "__lttf2";
+ Names[RTLIB::OLT_PPCF128] = "__gcc_qlt";
Names[RTLIB::OLE_F32] = "__lesf2";
Names[RTLIB::OLE_F64] = "__ledf2";
Names[RTLIB::OLE_F128] = "__letf2";
+ Names[RTLIB::OLE_PPCF128] = "__gcc_qle";
Names[RTLIB::OGT_F32] = "__gtsf2";
Names[RTLIB::OGT_F64] = "__gtdf2";
Names[RTLIB::OGT_F128] = "__gttf2";
+ Names[RTLIB::OGT_PPCF128] = "__gcc_qgt";
Names[RTLIB::UO_F32] = "__unordsf2";
Names[RTLIB::UO_F64] = "__unorddf2";
Names[RTLIB::UO_F128] = "__unordtf2";
+ Names[RTLIB::UO_PPCF128] = "__gcc_qunord";
Names[RTLIB::O_F32] = "__unordsf2";
Names[RTLIB::O_F64] = "__unorddf2";
Names[RTLIB::O_F128] = "__unordtf2";
+ Names[RTLIB::O_PPCF128] = "__gcc_qunord";
Names[RTLIB::MEMCPY] = "memcpy";
Names[RTLIB::MEMMOVE] = "memmove";
Names[RTLIB::MEMSET] = "memset";
@@ -395,36 +413,79 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4";
Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
-
- if (TT.getEnvironment() == Triple::GNU) {
+
+ Names[RTLIB::ATOMIC_LOAD] = "__atomic_load";
+ Names[RTLIB::ATOMIC_LOAD_1] = "__atomic_load_1";
+ Names[RTLIB::ATOMIC_LOAD_2] = "__atomic_load_2";
+ Names[RTLIB::ATOMIC_LOAD_4] = "__atomic_load_4";
+ Names[RTLIB::ATOMIC_LOAD_8] = "__atomic_load_8";
+ Names[RTLIB::ATOMIC_LOAD_16] = "__atomic_load_16";
+
+ Names[RTLIB::ATOMIC_STORE] = "__atomic_store";
+ Names[RTLIB::ATOMIC_STORE_1] = "__atomic_store_1";
+ Names[RTLIB::ATOMIC_STORE_2] = "__atomic_store_2";
+ Names[RTLIB::ATOMIC_STORE_4] = "__atomic_store_4";
+ Names[RTLIB::ATOMIC_STORE_8] = "__atomic_store_8";
+ Names[RTLIB::ATOMIC_STORE_16] = "__atomic_store_16";
+
+ Names[RTLIB::ATOMIC_EXCHANGE] = "__atomic_exchange";
+ Names[RTLIB::ATOMIC_EXCHANGE_1] = "__atomic_exchange_1";
+ Names[RTLIB::ATOMIC_EXCHANGE_2] = "__atomic_exchange_2";
+ Names[RTLIB::ATOMIC_EXCHANGE_4] = "__atomic_exchange_4";
+ Names[RTLIB::ATOMIC_EXCHANGE_8] = "__atomic_exchange_8";
+ Names[RTLIB::ATOMIC_EXCHANGE_16] = "__atomic_exchange_16";
+
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE] = "__atomic_compare_exchange";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_1] = "__atomic_compare_exchange_1";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_2] = "__atomic_compare_exchange_2";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_4] = "__atomic_compare_exchange_4";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_8] = "__atomic_compare_exchange_8";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_16] = "__atomic_compare_exchange_16";
+
+ Names[RTLIB::ATOMIC_FETCH_ADD_1] = "__atomic_fetch_add_1";
+ Names[RTLIB::ATOMIC_FETCH_ADD_2] = "__atomic_fetch_add_2";
+ Names[RTLIB::ATOMIC_FETCH_ADD_4] = "__atomic_fetch_add_4";
+ Names[RTLIB::ATOMIC_FETCH_ADD_8] = "__atomic_fetch_add_8";
+ Names[RTLIB::ATOMIC_FETCH_ADD_16] = "__atomic_fetch_add_16";
+ Names[RTLIB::ATOMIC_FETCH_SUB_1] = "__atomic_fetch_sub_1";
+ Names[RTLIB::ATOMIC_FETCH_SUB_2] = "__atomic_fetch_sub_2";
+ Names[RTLIB::ATOMIC_FETCH_SUB_4] = "__atomic_fetch_sub_4";
+ Names[RTLIB::ATOMIC_FETCH_SUB_8] = "__atomic_fetch_sub_8";
+ Names[RTLIB::ATOMIC_FETCH_SUB_16] = "__atomic_fetch_sub_16";
+ Names[RTLIB::ATOMIC_FETCH_AND_1] = "__atomic_fetch_and_1";
+ Names[RTLIB::ATOMIC_FETCH_AND_2] = "__atomic_fetch_and_2";
+ Names[RTLIB::ATOMIC_FETCH_AND_4] = "__atomic_fetch_and_4";
+ Names[RTLIB::ATOMIC_FETCH_AND_8] = "__atomic_fetch_and_8";
+ Names[RTLIB::ATOMIC_FETCH_AND_16] = "__atomic_fetch_and_16";
+ Names[RTLIB::ATOMIC_FETCH_OR_1] = "__atomic_fetch_or_1";
+ Names[RTLIB::ATOMIC_FETCH_OR_2] = "__atomic_fetch_or_2";
+ Names[RTLIB::ATOMIC_FETCH_OR_4] = "__atomic_fetch_or_4";
+ Names[RTLIB::ATOMIC_FETCH_OR_8] = "__atomic_fetch_or_8";
+ Names[RTLIB::ATOMIC_FETCH_OR_16] = "__atomic_fetch_or_16";
+ Names[RTLIB::ATOMIC_FETCH_XOR_1] = "__atomic_fetch_xor_1";
+ Names[RTLIB::ATOMIC_FETCH_XOR_2] = "__atomic_fetch_xor_2";
+ Names[RTLIB::ATOMIC_FETCH_XOR_4] = "__atomic_fetch_xor_4";
+ Names[RTLIB::ATOMIC_FETCH_XOR_8] = "__atomic_fetch_xor_8";
+ Names[RTLIB::ATOMIC_FETCH_XOR_16] = "__atomic_fetch_xor_16";
+ Names[RTLIB::ATOMIC_FETCH_NAND_1] = "__atomic_fetch_nand_1";
+ Names[RTLIB::ATOMIC_FETCH_NAND_2] = "__atomic_fetch_nand_2";
+ Names[RTLIB::ATOMIC_FETCH_NAND_4] = "__atomic_fetch_nand_4";
+ Names[RTLIB::ATOMIC_FETCH_NAND_8] = "__atomic_fetch_nand_8";
+ Names[RTLIB::ATOMIC_FETCH_NAND_16] = "__atomic_fetch_nand_16";
+
+ if (TT.isGNUEnvironment()) {
Names[RTLIB::SINCOS_F32] = "sincosf";
Names[RTLIB::SINCOS_F64] = "sincos";
Names[RTLIB::SINCOS_F80] = "sincosl";
Names[RTLIB::SINCOS_F128] = "sincosl";
Names[RTLIB::SINCOS_PPCF128] = "sincosl";
- } else {
- // These are generally not available.
- Names[RTLIB::SINCOS_F32] = nullptr;
- Names[RTLIB::SINCOS_F64] = nullptr;
- Names[RTLIB::SINCOS_F80] = nullptr;
- Names[RTLIB::SINCOS_F128] = nullptr;
- Names[RTLIB::SINCOS_PPCF128] = nullptr;
}
if (!TT.isOSOpenBSD()) {
Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
- } else {
- // These are generally not available.
- Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = nullptr;
}
- // For f16/f32 conversions, Darwin uses the standard naming scheme, instead
- // of the gnueabi-style __gnu_*_ieee.
- // FIXME: What about other targets?
- if (TT.isOSDarwin()) {
- Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2";
- Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2";
- }
+ Names[RTLIB::DEOPTIMIZE] = "__llvm_deoptimize";
}
/// InitLibcallCallingConvs - Set default libcall CallingConvs.
@@ -446,9 +507,13 @@ RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
return FPEXT_F32_F64;
if (RetVT == MVT::f128)
return FPEXT_F32_F128;
+ if (RetVT == MVT::ppcf128)
+ return FPEXT_F32_PPCF128;
} else if (OpVT == MVT::f64) {
if (RetVT == MVT::f128)
return FPEXT_F64_F128;
+ else if (RetVT == MVT::ppcf128)
+ return FPEXT_F64_PPCF128;
}
return UNKNOWN_LIBCALL;
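
A hedged usage sketch for the two new FPEXT entries: assuming the usual includes (llvm/CodeGen/RuntimeLibcalls.h, llvm/Target/TargetLowering.h) and a TargetLoweringBase reference TLI in scope — neither is shown in this hunk — the query below should now resolve to the PowerPC __gcc_dtoq helper named earlier in InitLibcallNames instead of UNKNOWN_LIBCALL.

// Sketch only; TLI is an assumed in-scope TargetLoweringBase reference.
RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f64, MVT::ppcf128); // FPEXT_F64_PPCF128
const char *LibcallName = TLI.getLibcallName(LC);            // "__gcc_dtoq"
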
@@ -653,7 +718,7 @@ RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
return UNKNOWN_LIBCALL;
}
-RTLIB::Libcall RTLIB::getATOMIC(unsigned Opc, MVT VT) {
+RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) {
#define OP_TO_LIBCALL(Name, Enum) \
case Name: \
switch (VT.SimpleTy) { \
@@ -698,27 +763,35 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
CCs[RTLIB::UNE_F32] = ISD::SETNE;
CCs[RTLIB::UNE_F64] = ISD::SETNE;
CCs[RTLIB::UNE_F128] = ISD::SETNE;
+ CCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
CCs[RTLIB::OGE_F32] = ISD::SETGE;
CCs[RTLIB::OGE_F64] = ISD::SETGE;
CCs[RTLIB::OGE_F128] = ISD::SETGE;
+ CCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
CCs[RTLIB::OLT_F32] = ISD::SETLT;
CCs[RTLIB::OLT_F64] = ISD::SETLT;
CCs[RTLIB::OLT_F128] = ISD::SETLT;
+ CCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
CCs[RTLIB::OLE_F32] = ISD::SETLE;
CCs[RTLIB::OLE_F64] = ISD::SETLE;
CCs[RTLIB::OLE_F128] = ISD::SETLE;
+ CCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
CCs[RTLIB::OGT_F32] = ISD::SETGT;
CCs[RTLIB::OGT_F64] = ISD::SETGT;
CCs[RTLIB::OGT_F128] = ISD::SETGT;
+ CCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
CCs[RTLIB::UO_F32] = ISD::SETNE;
CCs[RTLIB::UO_F64] = ISD::SETNE;
CCs[RTLIB::UO_F128] = ISD::SETNE;
+ CCs[RTLIB::UO_PPCF128] = ISD::SETNE;
CCs[RTLIB::O_F32] = ISD::SETEQ;
CCs[RTLIB::O_F64] = ISD::SETEQ;
CCs[RTLIB::O_F128] = ISD::SETEQ;
+ CCs[RTLIB::O_PPCF128] = ISD::SETEQ;
}
/// NOTE: The TargetMachine owns TLOF.
@@ -752,8 +825,14 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
PrefLoopAlignment = 0;
GatherAllAliasesMaxDepth = 6;
MinStackArgumentAlignment = 1;
- InsertFencesForAtomic = false;
MinimumJumpTableEntries = 4;
+ // TODO: the default will be switched to 0 in the next commit, along
+ // with the Target-specific changes necessary.
+ MaxAtomicSizeInBitsSupported = 1024;
+
+ MinCmpXchgSizeInBits = 0;
+
+ std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
+           nullptr);
InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple());
InitCmpLibcallCCs(CmpLibcallCCs);
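
The two new limits above gate when atomics are lowered natively versus through the __atomic_* libcalls added earlier in this file. As a minimal sketch of the implied size check (assumed semantics, not the actual expansion logic):

// Minimal sketch, assuming the documented meaning of the new fields:
// operations wider than MaxAtomicSizeInBitsSupported cannot be lowered to
// native instructions and must go through an __atomic_* library call.
static bool needsAtomicLibcall(unsigned OpSizeInBits,
                               unsigned MaxAtomicSizeInBits) {
  return OpSizeInBits > MaxAtomicSizeInBits;
}
// With the temporary default of 1024 bits nothing is forced to a libcall yet;
// a target that lowered the limit to 64 would send a 128-bit compare-exchange
// to __atomic_compare_exchange_16 (the suffix counts bytes, not bits).
// needsAtomicLibcall(128, 64)   -> true
// needsAtomicLibcall(128, 1024) -> false
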
@@ -767,8 +846,9 @@ void TargetLoweringBase::initActions() {
memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
memset(CondCodeActions, 0, sizeof(CondCodeActions));
- memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*));
- memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
+ std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr);
+ std::fill(std::begin(TargetDAGCombineArray),
+ std::end(TargetDAGCombineArray), 0);
// Set default actions for various operations.
for (MVT VT : MVT::all_valuetypes()) {
@@ -803,6 +883,10 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SMULO, VT, Expand);
setOperationAction(ISD::UMULO, VT, Expand);
+ // These default to Expand so they will be expanded to CTLZ/CTTZ by default.
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+
setOperationAction(ISD::BITREVERSE, VT, Expand);
// These library functions default to expand.
@@ -816,7 +900,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
}
- // For most targets @llvm.get.dynamic.area.offest just returns 0.
+ // For most targets @llvm.get.dynamic.area.offset just returns 0.
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
}
@@ -843,8 +927,6 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::FEXP , VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
- setOperationAction(ISD::FMINNUM, VT, Expand);
- setOperationAction(ISD::FMAXNUM, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
@@ -1090,9 +1172,10 @@ bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const {
/// Replace/modify any TargetFrameIndex operands with a target-dependent
/// sequence of memory operands that is recognized by PrologEpilogInserter.
-MachineBasicBlock*
-TargetLoweringBase::emitPatchPoint(MachineInstr *MI,
+MachineBasicBlock *
+TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
MachineBasicBlock *MBB) const {
+ MachineInstr *MI = &InitialMI;
MachineFunction &MF = *MI->getParent()->getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
@@ -1151,7 +1234,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI,
// Add a new memory operand for this FI.
assert(MFI.getObjectOffset(FI) != -1);
- unsigned Flags = MachineMemOperand::MOLoad;
+ auto Flags = MachineMemOperand::MOLoad;
if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
Flags |= MachineMemOperand::MOStore;
Flags |= MachineMemOperand::MOVolatile;
@@ -1250,10 +1333,17 @@ void TargetLoweringBase::computeRegisterProperties(
// ppcf128 type is really two f64's.
if (!isTypeLegal(MVT::ppcf128)) {
- NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
- RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
- TransformToType[MVT::ppcf128] = MVT::f64;
- ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
+ if (isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+ RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+ TransformToType[MVT::ppcf128] = MVT::f64;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
+ } else {
+ NumRegistersForVT[MVT::ppcf128] = NumRegistersForVT[MVT::i128];
+ RegisterTypeForVT[MVT::ppcf128] = RegisterTypeForVT[MVT::i128];
+ TransformToType[MVT::ppcf128] = MVT::i128;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeSoftenFloat);
+ }
}
// Decide how to handle f128. If the target does not have native f128 support,
@@ -1308,13 +1398,12 @@ void TargetLoweringBase::computeRegisterProperties(
case TypePromoteInteger: {
// Try to promote the elements of integer vectors. If no legal
// promotion was found, fall through to the widen-vector method.
- for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++nVT) {
MVT SVT = (MVT::SimpleValueType) nVT;
// Promote vectors of integers to vectors with the same number
// of elements, with a wider element type.
- if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits()
- && SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)
- && SVT.getScalarType().isInteger()) {
+ if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() &&
+ SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1553,6 +1642,9 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Fast);
}
+BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
+ return BranchProbability(MinPercentageForPredictableBranch, 100);
+}
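
The new hook above simply wraps the MinPercentageForPredictableBranch option in a BranchProbability(N, 100). The standalone helper below (no LLVM types) illustrates the percentage test the option describes, where "either true or false" may reach the threshold; it assumes callers compare observed edge weights against this threshold and that the counts are small enough not to overflow unsigned arithmetic.

// Standalone sketch: a branch is treated as predictable if either outcome
// accounts for at least MinPercent of the executions.
static bool isPredictable(unsigned TakenCount, unsigned TotalCount,
                          unsigned MinPercent = 99) {
  unsigned NotTakenCount = TotalCount - TakenCount;
  return TakenCount * 100 >= MinPercent * TotalCount ||
         NotTakenCount * 100 >= MinPercent * TotalCount;
}
// isPredictable(990, 1000) -> true  (99% taken)
// isPredictable(900, 1000) -> false (90% taken, 10% not taken)
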
//===----------------------------------------------------------------------===//
// TargetTransformInfo Helpers
@@ -1715,3 +1807,36 @@ bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
return true;
}
+
+//===----------------------------------------------------------------------===//
+// Stack Protector
+//===----------------------------------------------------------------------===//
+
+// For OpenBSD, return its special guard variable. Otherwise return nullptr,
+// so that SelectionDAG handles SSP.
+Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const {
+ if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
+ Module &M = *IRB.GetInsertBlock()->getParent()->getParent();
+ PointerType *PtrTy = Type::getInt8PtrTy(M.getContext());
+ auto Guard = cast<GlobalValue>(M.getOrInsertGlobal("__guard_local", PtrTy));
+ Guard->setVisibility(GlobalValue::HiddenVisibility);
+ return Guard;
+ }
+ return nullptr;
+}
+
+// Currently we only support the "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
+ M.getOrInsertGlobal("__stack_chk_guard", Type::getInt8PtrTy(M.getContext()));
+}
+
+// Currently we only support the "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
+ return M.getGlobalVariable("__stack_chk_guard", true);
+}
+
+Value *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
+ return nullptr;
+}
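
The three hooks above establish the default stack-protector behavior: OpenBSD's hidden __guard_local emitted in IR, the plain __stack_chk_guard global otherwise, and no separate check function. As a hypothetical sketch of what overriding the last hook could look like in a target — the class name and the symbol are assumptions for illustration, loosely modeled on an MSVC-style cookie check, and are not part of this patch:

// Hypothetical override (sketch): return a checking function instead of
// relying on the default guard comparison. "MyTargetLowering" and the
// "__security_check_cookie" symbol are assumptions for illustration.
Value *MyTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  return M.getFunction("__security_check_cookie");
}
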
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 58ae9cc53bda..5f814c957e92 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ELF.h"
@@ -68,11 +69,9 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
cast<MCSymbolELF>(getContext().getOrCreateSymbol(NameData));
Streamer.EmitSymbolAttribute(Label, MCSA_Hidden);
Streamer.EmitSymbolAttribute(Label, MCSA_Weak);
- StringRef Prefix = ".data.";
- NameData.insert(NameData.begin(), Prefix.begin(), Prefix.end());
unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
- MCSection *Sec = getContext().getELFSection(NameData, ELF::SHT_PROGBITS,
- Flags, 0, Label->getName());
+ MCSection *Sec = getContext().getELFNamedSection(".data", Label->getName(),
+ ELF::SHT_PROGBITS, Flags, 0);
unsigned Size = DL.getPointerSize();
Streamer.SwitchSection(Sec);
Streamer.EmitValueToAlignment(DL.getPointerABIAlignment());
@@ -119,6 +118,10 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
// section(".eh_frame") gcc will produce:
//
// .section .eh_frame,"a",@progbits
+
+ if (Name == getInstrProfCoverageSectionName(false))
+ return SectionKind::getMetadata();
+
if (Name.empty() || Name[0] != '.') return K;
// Some lame default implementation based on some magic section names.
@@ -259,9 +262,11 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalValue *GV,
EntrySize = 4;
} else if (Kind.isMergeableConst8()) {
EntrySize = 8;
- } else {
- assert(Kind.isMergeableConst16() && "unknown data width");
+ } else if (Kind.isMergeableConst16()) {
EntrySize = 16;
+ } else {
+ assert(Kind.isMergeableConst32() && "unknown data width");
+ EntrySize = 32;
}
}
@@ -288,12 +293,14 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalValue *GV,
} else {
Name = getSectionPrefixForGlobal(Kind);
}
+ // FIXME: Extend the section prefix to include hotness categories such as .hot
+ // or .unlikely for functions.
if (EmitUniqueSection && UniqueSectionNames) {
Name.push_back('.');
TM.getNameWithPrefix(Name, GV, Mang, true);
}
- unsigned UniqueID = ~0;
+ unsigned UniqueID = MCContext::GenericSectionID;
if (EmitUniqueSection && !UniqueSectionNames) {
UniqueID = *NextUniqueID;
(*NextUniqueID)++;
@@ -346,13 +353,16 @@ bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
/// Given a mergeable constant with the specified size and relocation
/// information, return a section that it should be placed in.
MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
if (Kind.isMergeableConst4() && MergeableConst4Section)
return MergeableConst4Section;
if (Kind.isMergeableConst8() && MergeableConst8Section)
return MergeableConst8Section;
if (Kind.isMergeableConst16() && MergeableConst16Section)
return MergeableConst16Section;
+ if (Kind.isMergeableConst32() && MergeableConst32Section)
+ return MergeableConst32Section;
if (Kind.isReadOnly())
return ReadOnlySection;
@@ -412,6 +422,27 @@ MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
KeySym);
}
+const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS, Mangler &Mang,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic sanity checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS, Mang), PLTRelativeVariantKind,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS, Mang), getContext()),
+ getContext());
+}
+
void
TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
UseInitArray = UseInitArray_;
@@ -443,10 +474,7 @@ emitModuleFlags(MCStreamer &Streamer,
MDNode *LinkerOptions = nullptr;
StringRef SectionVal;
- for (ArrayRef<Module::ModuleFlagEntry>::iterator
- i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) {
- const Module::ModuleFlagEntry &MFE = *i;
-
+ for (const auto &MFE : ModuleFlags) {
// Ignore flags with 'Require' behavior.
if (MFE.Behavior == Module::Require)
continue;
@@ -459,6 +487,7 @@ emitModuleFlags(MCStreamer &Streamer,
} else if (Key == "Objective-C Garbage Collection" ||
Key == "Objective-C GC Only" ||
Key == "Objective-C Is Simulated" ||
+ Key == "Objective-C Class Properties" ||
Key == "Objective-C Image Swift Version") {
ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue();
} else if (Key == "Objective-C Image Info Section") {
@@ -470,16 +499,10 @@ emitModuleFlags(MCStreamer &Streamer,
// Emit the linker options if present.
if (LinkerOptions) {
- for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
- MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
+ for (const auto &Option : LinkerOptions->operands()) {
SmallVector<std::string, 4> StrOptions;
-
- // Convert to strings.
- for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
- MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
- StrOptions.push_back(MDOption->getString());
- }
-
+ for (const auto &Piece : cast<MDNode>(Option)->operands())
+ StrOptions.push_back(cast<MDString>(Piece)->getString());
Streamer.EmitLinkerOptions(StrOptions);
}
}
@@ -630,7 +653,8 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
}
MCSection *TargetLoweringObjectFileMachO::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
// If this constant requires a relocation, we have to put it in the data
// segment, not in the text segment.
if (Kind.isData() || Kind.isReadOnlyWithRel())
@@ -660,9 +684,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
// Add information about the stub reference to MachOMMI so that the stub
// gets emitted by the asmprinter.
- MachineModuleInfoImpl::StubValueTy &StubSym =
- GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
- MachOMMI.getGVStubEntry(SSym);
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
if (!StubSym.getPointer()) {
MCSymbol *Sym = TM.getSymbol(GV, Mang);
StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
@@ -785,8 +807,9 @@ void TargetLoweringObjectFileMachO::getNameWithPrefix(
//===----------------------------------------------------------------------===//
static unsigned
-getCOFFSectionFlags(SectionKind K) {
+getCOFFSectionFlags(SectionKind K, const TargetMachine &TM) {
unsigned Flags = 0;
+ bool isThumb = TM.getTargetTriple().getArch() == Triple::thumb;
if (K.isMetadata())
Flags |=
@@ -795,7 +818,8 @@ getCOFFSectionFlags(SectionKind K) {
Flags |=
COFF::IMAGE_SCN_MEM_EXECUTE |
COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_CNT_CODE;
+ COFF::IMAGE_SCN_CNT_CODE |
+ (isThumb ? COFF::IMAGE_SCN_MEM_16BIT : (COFF::SectionCharacteristics)0);
else if (K.isBSS())
Flags |=
COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
@@ -865,7 +889,7 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM) const {
int Selection = 0;
- unsigned Characteristics = getCOFFSectionFlags(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
StringRef Name = GV->getSection();
StringRef COMDATSymName = "";
if (GV->hasComdat()) {
@@ -884,10 +908,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
Selection = 0;
}
}
- return getContext().getCOFFSection(Name,
- Characteristics,
- Kind,
- COMDATSymName,
+
+ return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
Selection);
}
@@ -916,7 +938,7 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
if ((EmitUniquedSection && !Kind.isCommon()) || GV->hasComdat()) {
const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
- unsigned Characteristics = getCOFFSectionFlags(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
int Selection = getSelectionForCOFF(GV);
@@ -928,16 +950,20 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
else
ComdatGV = GV;
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniquedSection)
+ UniqueID = NextUniqueID++;
+
if (!ComdatGV->hasPrivateLinkage()) {
MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
StringRef COMDATSymName = Sym->getName();
return getContext().getCOFFSection(Name, Characteristics, Kind,
- COMDATSymName, Selection);
+ COMDATSymName, Selection, UniqueID);
} else {
SmallString<256> TmpData;
Mang.getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true);
return getContext().getCOFFSection(Name, Characteristics, Kind, TmpData,
- Selection);
+ Selection, UniqueID);
}
}
@@ -989,11 +1015,12 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable(
SectionKind Kind = SectionKind::getReadOnly();
const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
- unsigned Characteristics = getCOFFSectionFlags(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ unsigned UniqueID = NextUniqueID++;
return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
- COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
}
void TargetLoweringObjectFileCOFF::
@@ -1002,32 +1029,25 @@ emitModuleFlags(MCStreamer &Streamer,
Mangler &Mang, const TargetMachine &TM) const {
MDNode *LinkerOptions = nullptr;
- // Look for the "Linker Options" flag, since it's the only one we support.
- for (ArrayRef<Module::ModuleFlagEntry>::iterator
- i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) {
- const Module::ModuleFlagEntry &MFE = *i;
+ for (const auto &MFE : ModuleFlags) {
StringRef Key = MFE.Key->getString();
- Metadata *Val = MFE.Val;
- if (Key == "Linker Options") {
- LinkerOptions = cast<MDNode>(Val);
- break;
- }
+ if (Key == "Linker Options")
+ LinkerOptions = cast<MDNode>(MFE.Val);
}
- if (!LinkerOptions)
- return;
- // Emit the linker options to the linker .drectve section. According to the
- // spec, this section is a space-separated string containing flags for linker.
- MCSection *Sec = getDrectveSection();
- Streamer.SwitchSection(Sec);
- for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
- MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
- for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
- MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
- // Lead with a space for consistency with our dllexport implementation.
- std::string Directive(" ");
- Directive.append(MDOption->getString());
- Streamer.EmitBytes(Directive);
+ if (LinkerOptions) {
+ // Emit the linker options to the linker .drectve section. According to the
+ // spec, this section is a space-separated string containing flags for
+ // linker.
+ MCSection *Sec = getDrectveSection();
+ Streamer.SwitchSection(Sec);
+ for (const auto &Option : LinkerOptions->operands()) {
+ for (const auto &Piece : cast<MDNode>(Option)->operands()) {
+ // Lead with a space for consistency with our dllexport implementation.
+ std::string Directive(" ");
+ Directive.append(cast<MDString>(Piece)->getString());
+ Streamer.EmitBytes(Directive);
+ }
}
}
}
@@ -1035,13 +1055,13 @@ emitModuleFlags(MCStreamer &Streamer,
MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getContext().getAssociativeCOFFSection(
- cast<MCSectionCOFF>(StaticCtorSection), KeySym);
+ cast<MCSectionCOFF>(StaticCtorSection), KeySym, 0);
}
MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getContext().getAssociativeCOFFSection(
- cast<MCSectionCOFF>(StaticDtorSection), KeySym);
+ cast<MCSectionCOFF>(StaticDtorSection), KeySym, 0);
}
void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 873f7125b82a..b8c820942cb5 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -1,4 +1,4 @@
-//===-- Passes.cpp - Target independent code generation passes ------------===//
+//===-- TargetPassConfig.cpp - Target independent code generation passes --===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,22 +12,26 @@
//
//===---------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+
#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CFLAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
@@ -106,9 +110,19 @@ cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
cl::desc("Run live interval analysis earlier in the pipeline"));
-static cl::opt<bool> UseCFLAA("use-cfl-aa-in-codegen",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"));
+// Experimental option to use CFL-AA in codegen
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
+static cl::opt<CFLAAType> UseCFLAA(
+ "use-cfl-aa-in-codegen", cl::init(CFLAAType::None), cl::Hidden,
+ cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"),
+ cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
+ clEnumValN(CFLAAType::Steensgaard, "steens",
+ "Enable unification-based CFL-AA"),
+ clEnumValN(CFLAAType::Andersen, "anders",
+ "Enable inclusion-based CFL-AA"),
+ clEnumValN(CFLAAType::Both, "both",
+ "Enable both variants of CFL-AA"),
+ clEnumValEnd));
/// Allow standard passes to be disabled by command line options. This supports
/// simple binary flags that either suppress the pass or do nothing.
@@ -241,7 +255,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
: ImmutablePass(ID), PM(&pm), StartBefore(nullptr), StartAfter(nullptr),
StopAfter(nullptr), Started(true), Stopped(false),
AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
- DisableVerify(false), EnableTailMerge(true) {
+ DisableVerify(false), EnableTailMerge(true) {
Impl = new PassConfigImpl();
@@ -256,6 +270,13 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
// Substitute Pseudo Pass IDs for real ones.
substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
substitutePass(&PostRAMachineLICMID, &MachineLICMID);
+
+ if (StringRef(PrintMachineInstrs.getValue()).equals(""))
+ TM->Options.PrintMachineCode = true;
+}
+
+CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
+ return TM->getOptLevel();
}
/// Insert InsertedPassID pass after TargetPassID.
@@ -303,6 +324,13 @@ IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
return I->second;
}
+bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
+ IdentifyingPassPtr TargetID = getPassSubstitution(ID);
+ IdentifyingPassPtr FinalPtr = overridePass(ID, TargetID);
+ return !FinalPtr.isValid() || FinalPtr.isInstance() ||
+ FinalPtr.getID() != ID;
+}
+
/// Add a pass to the PassManager if that pass is supposed to be run. If the
/// Started/Stopped flags indicate either that the compilation should start at
/// a later pass or that it should stop after an earlier pass, then do not add
@@ -392,12 +420,25 @@ void TargetPassConfig::addVerifyPass(const std::string &Banner) {
/// Add common target configurable passes that perform LLVM IR to IR transforms
/// following machine independent optimization.
void TargetPassConfig::addIRPasses() {
+ switch (UseCFLAA) {
+ case CFLAAType::Steensgaard:
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ case CFLAAType::Andersen:
+ addPass(createCFLAndersAAWrapperPass());
+ break;
+ case CFLAAType::Both:
+ addPass(createCFLAndersAAWrapperPass());
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
// Basic AliasAnalysis support.
// Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
// BasicAliasAnalysis wins if they disagree. This is intended to help
// support "obvious" type-punning idioms.
- if (UseCFLAA)
- addPass(createCFLAAWrapperPass());
addPass(createTypeBasedAAWrapperPass());
addPass(createScopedNoAliasAAWrapperPass());
addPass(createBasicAAWrapperPass());
@@ -476,6 +517,10 @@ void TargetPassConfig::addCodeGenPrepare() {
void TargetPassConfig::addISelPrepare() {
addPreISel();
+ // Force codegen to run according to the callgraph.
+ if (TM->Options.EnableIPRA)
+ addPass(new DummyCGSCCPass);
+
// Add both the safe stack and the stack protection passes: each of them will
// only protect functions that have corresponding attributes.
addPass(createSafeStackPass(TM));
@@ -512,12 +557,12 @@ void TargetPassConfig::addISelPrepare() {
void TargetPassConfig::addMachinePasses() {
AddingMachinePasses = true;
+ if (TM->Options.EnableIPRA)
+ addPass(createRegUsageInfoPropPass());
+
// Insert a machine instr printer pass after the specified pass.
- // If -print-machineinstrs specified, print machineinstrs after all passes.
- if (StringRef(PrintMachineInstrs.getValue()).equals(""))
- TM->Options.PrintMachineCode = true;
- else if (!StringRef(PrintMachineInstrs.getValue())
- .equals("option-unspecified")) {
+ if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
+ !StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
const PassRegistry *PR = PassRegistry::getPassRegistry();
const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
@@ -556,10 +601,13 @@ void TargetPassConfig::addMachinePasses() {
addPostRegAlloc();
// Insert prolog/epilog code. Eliminate abstract frame index references...
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None)
addPass(&ShrinkWrapID);
- addPass(&PrologEpilogCodeInserterID);
+ // The Prolog/Epilog inserter needs a TargetMachine to be instantiated, but
+ // only add it if it hasn't been disabled, substituted, or overridden.
+ if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
+ addPass(createPrologEpilogInserterPass(TM));
/// Add passes that optimize machine instructions after register allocation.
if (getOptLevel() != CodeGenOpt::None)
@@ -597,11 +645,19 @@ void TargetPassConfig::addMachinePasses() {
addPreEmitPass();
+ if (TM->Options.EnableIPRA)
+ // Collect register usage information and produce a register mask of
+ // clobbered registers, to be used to optimize call sites.
+ addPass(createRegUsageInfoCollector());
+
addPass(&FuncletLayoutID, false);
addPass(&StackMapLivenessID, false);
addPass(&LiveDebugValuesID, false);
+ addPass(&XRayInstrumentationID, false);
+ addPass(&PatchableFunctionID, false);
+
AddingMachinePasses = false;
}
@@ -661,6 +717,7 @@ MachinePassRegistry RegisterRegAlloc::Registry;
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
+LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag);
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
static RegisterRegAlloc
defaultRegAlloc("default",
@@ -674,6 +731,15 @@ RegAlloc("regalloc",
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use"));
+static void initializeDefaultRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = RegAlloc;
+ RegisterRegAlloc::setDefault(RegAlloc);
+ }
+}
+
/// Instantiate the default register allocator pass for this target for either
/// the optimized or unoptimized allocation path. This will be added to the pass
@@ -700,13 +766,11 @@ FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
/// FIXME: When MachinePassRegistry register pass IDs instead of function ptrs,
/// this can be folded into addPass.
FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
- RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
-
// Initialize the global default.
- if (!Ctor) {
- Ctor = RegAlloc;
- RegisterRegAlloc::setDefault(RegAlloc);
- }
+ llvm::call_once(InitializeDefaultRegisterAllocatorFlag,
+ initializeDefaultRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();
@@ -734,6 +798,8 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
/// optimized register allocation, including coalescing, machine instruction
/// scheduling, and register allocation itself.
void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&DetectDeadLanesID, false);
+
addPass(&ProcessImplicitDefsID, false);
// LiveVariables currently requires pure SSA form.
@@ -755,6 +821,11 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
addPass(&TwoAddressInstructionPassID, false);
addPass(&RegisterCoalescerID);
+ // The machine scheduler may accidentally create disconnected components
+ // when moving subregister definitions around; avoid this by splitting them
+ // into separate vregs beforehand. Splitting can also improve register
+ // allocation quality.
+ addPass(&RenameIndependentSubregsID);
+
// PreRA instruction scheduling.
addPass(&MachineSchedulerID);
@@ -809,7 +880,7 @@ bool TargetPassConfig::addGCPasses() {
/// Add standard basic block placement passes.
void TargetPassConfig::addBlockPlacement() {
- if (addPass(&MachineBlockPlacementID, false)) {
+ if (addPass(&MachineBlockPlacementID)) {
// Run a separate pass to collect block placement statistics.
if (EnableBlockPlacementStats)
addPass(&MachineBlockPlacementStatsID);
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 0a7042ac3db5..e1d90cb913e5 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -112,18 +112,11 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
if (!RC || RC->isAllocatable())
return RC;
- const unsigned *SubClass = RC->getSubClassMask();
- for (unsigned Base = 0, BaseE = getNumRegClasses();
- Base < BaseE; Base += 32) {
- unsigned Idx = Base;
- for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) {
- unsigned Offset = countTrailingZeros(Mask);
- const TargetRegisterClass *SubRC = getRegClass(Idx + Offset);
- if (SubRC->isAllocatable())
- return SubRC;
- Mask >>= Offset;
- Idx += Offset + 1;
- }
+ for (BitMaskClassIterator It(RC->getSubClassMask(), *this); It.isValid();
+ ++It) {
+ const TargetRegisterClass *SubRC = getRegClass(It.getID());
+ if (SubRC->isAllocatable())
+ return SubRC;
}
return nullptr;
}
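
The removed loop above scanned the sub-class mask word by word for set bits; BitMaskClassIterator now encapsulates that walk. The standalone program below sketches the underlying idiom — visit the index of every set bit across an array of 32-bit words — mirroring the intent of the removed code rather than its exact shape.

#include <cstdint>
#include <iostream>

// Print the bit index of every set bit across Words 32-bit mask words.
static void forEachSetBit(const uint32_t *Mask, unsigned Words) {
  for (unsigned W = 0; W < Words; ++W) {
    uint32_t Word = Mask[W];
    while (Word) {
      unsigned Offset = __builtin_ctz(Word); // GCC/Clang builtin: trailing zeros
      std::cout << (W * 32 + Offset) << "\n";
      Word &= Word - 1; // clear the lowest set bit
    }
  }
}

int main() {
  const uint32_t Mask[] = {0xA /* bits 1 and 3 */, 0x1 /* bit 32 */};
  forEachSetBit(Mask, 2); // prints 1, 3, 32
  return 0;
}
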
@@ -388,6 +381,15 @@ bool TargetRegisterInfo::needsStackRealignment(
return false;
}
+bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
+ const uint32_t *mask1) const {
+ unsigned N = (getNumRegs()+31) / 32;
+ for (unsigned I = 0; I < N; ++I)
+ if ((mask0[I] & mask1[I]) != mask0[I])
+ return false;
+ return true;
+}
+
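
regmaskSubsetEqual above checks that every register set in mask0 is also set in mask1, one 32-bit word at a time. The standalone program below exercises the same word-wise subset test.

#include <cassert>
#include <cstdint>

static bool subsetEqual(const uint32_t *Mask0, const uint32_t *Mask1,
                        unsigned Words) {
  for (unsigned I = 0; I < Words; ++I)
    if ((Mask0[I] & Mask1[I]) != Mask0[I]) // a bit of Mask0 missing from Mask1
      return false;
  return true;
}

int main() {
  const uint32_t A[] = {0x0000000F, 0x1};
  const uint32_t B[] = {0xFFFFFFFF, 0x3};
  assert(subsetEqual(A, B, 2));  // A's registers are all present in B
  assert(!subsetEqual(B, A, 2)); // B has registers that A lacks
  return 0;
}
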
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void
TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 1c4558cea5f5..022e912aa84f 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -77,7 +77,7 @@ unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
const MCSchedClassDesc *SC) const {
if (hasInstrItineraries()) {
int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
- return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, MI);
+ return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, *MI);
}
if (hasInstrSchedModel()) {
if (!SC)
@@ -156,13 +156,13 @@ unsigned TargetSchedModel::computeOperandLatency(
const MachineInstr *UseMI, unsigned UseOperIdx) const {
if (!hasInstrSchedModel() && !hasInstrItineraries())
- return TII->defaultDefLatency(SchedModel, DefMI);
+ return TII->defaultDefLatency(SchedModel, *DefMI);
if (hasInstrItineraries()) {
int OperLatency = 0;
if (UseMI) {
- OperLatency = TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx,
- UseMI, UseOperIdx);
+ OperLatency = TII->getOperandLatency(&InstrItins, *DefMI, DefOperIdx,
+ *UseMI, UseOperIdx);
}
else {
unsigned DefClass = DefMI->getDesc().getSchedClass();
@@ -172,15 +172,15 @@ unsigned TargetSchedModel::computeOperandLatency(
return OperLatency;
// No operand latency was found.
- unsigned InstrLatency = TII->getInstrLatency(&InstrItins, DefMI);
+ unsigned InstrLatency = TII->getInstrLatency(&InstrItins, *DefMI);
// Expected latency is the max of the stage latency and itinerary props.
// Rather than directly querying InstrItins stage latency, we call a TII
// hook to allow subtargets to specialize latency. This hook is only
// applicable to the InstrItins model. InstrSchedModel should model all
// special cases without TII hooks.
- InstrLatency = std::max(InstrLatency,
- TII->defaultDefLatency(SchedModel, DefMI));
+ InstrLatency =
+ std::max(InstrLatency, TII->defaultDefLatency(SchedModel, *DefMI));
return InstrLatency;
}
// hasInstrSchedModel()
@@ -219,7 +219,7 @@ unsigned TargetSchedModel::computeOperandLatency(
// FIXME: Automatically giving all implicit defs defaultDefLatency is
// undesirable. We should only do it for defs that are known to the MC
// desc like flags. Truly implicit defs should get 1 cycle latency.
- return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, DefMI);
+ return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, *DefMI);
}
unsigned
@@ -254,24 +254,23 @@ TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
// Allow subtargets to compute Bundle latencies outside the machine model.
if (hasInstrItineraries() || MI->isBundle() ||
(!hasInstrSchedModel() && !UseDefaultDefLatency))
- return TII->getInstrLatency(&InstrItins, MI);
+ return TII->getInstrLatency(&InstrItins, *MI);
if (hasInstrSchedModel()) {
const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
if (SCDesc->isValid())
return computeInstrLatency(*SCDesc);
}
- return TII->defaultDefLatency(SchedModel, MI);
+ return TII->defaultDefLatency(SchedModel, *MI);
}
unsigned TargetSchedModel::
computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
const MachineInstr *DepMI) const {
- if (SchedModel.MicroOpBufferSize <= 1)
+ if (!SchedModel.isOutOfOrder())
return 1;
- // MicroOpBufferSize > 1 indicates an out-of-order processor that can dispatch
- // WAW dependencies in the same cycle.
+ // An out-of-order processor can dispatch WAW dependencies in the same cycle.
// Treat predication as a data dependency for out-of-order cpus. In-order
// cpus do not need to treat predicated writes specially.
@@ -282,7 +281,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
const MachineFunction &MF = *DefMI->getParent()->getParent();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(DepMI))
+ if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(*DepMI))
return computeInstrLatency(DefMI);
// If we have a per operand scheduling model, check if this def is writing
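
One note on the computeOutputLatency change above: the removed comment stated that MicroOpBufferSize > 1 indicates an out-of-order processor, so the early exit keeps its old meaning provided isOutOfOrder() encodes exactly that test. A minimal sketch of the assumed equivalence:

// Sketch of the equivalence the rewrite relies on (assumed from the removed
// comment): "out of order" means a micro-op buffer larger than one.
struct SchedModelLike {
  unsigned MicroOpBufferSize;
  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
};
// SchedModelLike{1}.isOutOfOrder() -> false: in-order, early-return latency 1.
// SchedModelLike{4}.isOutOfOrder() -> true:  out-of-order, WAW handled below.
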
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c6bae2434586..3d9a51864b6c 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -27,8 +27,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -40,6 +38,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
@@ -50,6 +49,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+
using namespace llvm;
#define DEBUG_TYPE "twoaddrinstr"
@@ -156,6 +156,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
@@ -245,7 +246,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
// appropriate location, we can try to sink the current instruction
// past it.
if (!KillMI || KillMI->getParent() != MBB || KillMI == MI ||
- KillMI == OldPos || KillMI->isTerminator())
+ MachineBasicBlock::iterator(KillMI) == OldPos || KillMI->isTerminator())
return false;
// If any of the definitions are used by another instruction between the
@@ -259,16 +260,15 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
++KillPos;
unsigned NumVisited = 0;
- for (MachineBasicBlock::iterator I = std::next(OldPos); I != KillPos; ++I) {
- MachineInstr *OtherMI = I;
+ for (MachineInstr &OtherMI : llvm::make_range(std::next(OldPos), KillPos)) {
// DBG_VALUE cannot be counted against the limit.
- if (OtherMI->isDebugValue())
+ if (OtherMI.isDebugValue())
continue;
if (NumVisited > 30) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
++NumVisited;
- for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = OtherMI->getOperand(i);
+ for (unsigned i = 0, e = OtherMI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = OtherMI.getOperand(i);
if (!MO.isReg())
continue;
unsigned MOReg = MO.getReg();
@@ -277,8 +277,8 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
if (DefReg == MOReg)
return false;
- if (MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS))) {
- if (OtherMI == KillMI && MOReg == SavedReg)
+ if (MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS))) {
+ if (&OtherMI == KillMI && MOReg == SavedReg)
// Save the operand that kills the register. We want to unset the kill
// marker if we can sink MI past it.
KillMO = &MO;
@@ -297,7 +297,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
KillMO->setIsKill(true);
if (LV)
- LV->replaceKillInstruction(SavedReg, KillMI, MI);
+ LV->replaceKillInstruction(SavedReg, *KillMI, *MI);
}
// Move instruction to its destination.
@@ -305,7 +305,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
MBB->insert(KillPos, MI);
if (LIS)
- LIS->handleMove(MI);
+ LIS->handleMove(*MI);
++Num3AddrSunk;
return true;
@@ -400,7 +400,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
LiveIntervals *LIS) {
if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) &&
- !LIS->isNotInMIMap(MI)) {
+ !LIS->isNotInMIMap(*MI)) {
// FIXME: Sometimes tryInstructionTransform() will add instructions and
// test whether they can be folded before keeping them. In this case it
// sets a kill before recursively calling tryInstructionTransform() again.
@@ -413,7 +413,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
if (!LI.hasAtLeastOneValue())
return false;
- SlotIndex useIdx = LIS->getInstructionIndex(MI);
+ SlotIndex useIdx = LIS->getInstructionIndex(*MI);
LiveInterval::const_iterator I = LI.find(useIdx);
assert(I != LI.end() && "Reg must be live-in to use.");
return !I->end.isBlock() && SlotIndex::isSameInstr(I->end, useIdx);
@@ -539,7 +539,6 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
return TRI->regsOverlap(RegA, RegB);
}
-
/// Return true if it's potentially profitable to commute the two-address
/// instruction that's being processed.
bool
@@ -647,7 +646,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
unsigned Dist) {
unsigned RegC = MI->getOperand(RegCIdx).getReg();
DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
- MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx);
+ MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
if (NewMI == nullptr) {
DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
@@ -695,7 +694,7 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
unsigned Dist) {
// FIXME: Why does convertToThreeAddress() need an iterator reference?
MachineFunction::iterator MFI = MBB->getIterator();
- MachineInstr *NewMI = TII->convertToThreeAddress(MFI, mi, LV);
+ MachineInstr *NewMI = TII->convertToThreeAddress(MFI, *mi, LV);
assert(MBB->getIterator() == MFI &&
"convertToThreeAddress changed iterator reference");
if (!NewMI)
@@ -706,7 +705,7 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
bool Sunk = false;
if (LIS)
- LIS->ReplaceMachineInstrInMaps(mi, NewMI);
+ LIS->ReplaceMachineInstrInMaps(*mi, *NewMI);
if (NewMI->findRegisterUseOperand(RegB, false, TRI))
// FIXME: Temporary workaround. If the new instruction doesn't
@@ -808,7 +807,6 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
}
Processed.insert(MI);
- return;
}
/// If there is one more local instruction that reads 'Reg' and it kills 'Reg,
@@ -862,7 +860,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
if (!MI->isSafeToMove(AA, SeenStore))
return false;
- if (TII->getInstrLatency(InstrItins, MI) > 1)
+ if (TII->getInstrLatency(InstrItins, *MI) > 1)
// FIXME: Needs more sophisticated heuristics.
return false;
@@ -899,19 +897,18 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
unsigned NumVisited = 0;
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
- for (MachineBasicBlock::iterator I = End; I != KillPos; ++I) {
- MachineInstr *OtherMI = I;
+ for (MachineInstr &OtherMI : llvm::make_range(End, KillPos)) {
// DBG_VALUE cannot be counted against the limit.
- if (OtherMI->isDebugValue())
+ if (OtherMI.isDebugValue())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
++NumVisited;
- if (OtherMI->hasUnmodeledSideEffects() || OtherMI->isCall() ||
- OtherMI->isBranch() || OtherMI->isTerminator())
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
// Don't move past calls, etc.
return false;
- for (const MachineOperand &MO : OtherMI->operands()) {
+ for (const MachineOperand &MO : OtherMI.operands()) {
if (!MO.isReg())
continue;
unsigned MOReg = MO.getReg();
@@ -929,8 +926,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
} else {
if (Defs.count(MOReg))
return false;
- bool isKill = MO.isKill() ||
- (LIS && isPlainlyKilled(OtherMI, MOReg, LIS));
+ bool isKill =
+ MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS));
if (MOReg != Reg &&
((isKill && Uses.count(MOReg)) || Kills.count(MOReg)))
// Don't want to extend other live ranges and update kills.
@@ -939,7 +936,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
// We can't schedule across a use of the register in question.
return false;
// Ensure that if this is the register in question, it's the kill we expect.
- assert((MOReg != Reg || OtherMI == KillMI) &&
+ assert((MOReg != Reg || &OtherMI == KillMI) &&
"Found multiple kills of a register in a basic block");
}
}
@@ -955,10 +952,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
// We have to move the copies first so that the MBB is still well-formed
// when calling handleMove().
for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) {
- MachineInstr *CopyMI = MBBI;
- ++MBBI;
+ auto CopyMI = MBBI++;
MBB->splice(InsertPos, MBB, CopyMI);
- LIS->handleMove(CopyMI);
+ LIS->handleMove(*CopyMI);
InsertPos = CopyMI;
}
End = std::next(MachineBasicBlock::iterator(MI));
@@ -970,10 +966,10 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
// Update live variables
if (LIS) {
- LIS->handleMove(MI);
+ LIS->handleMove(*MI);
} else {
- LV->removeVirtualRegisterKilled(Reg, KillMI);
- LV->addVirtualRegisterKilled(Reg, MI);
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
}
DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
@@ -994,7 +990,7 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
return true; // Below MI
unsigned DefDist = DDI->second;
assert(Dist > DefDist && "Visited def already?");
- if (TII->getInstrLatency(InstrItins, &DefMI) > (Dist - DefDist))
+ if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist))
return true;
}
return false;
@@ -1074,21 +1070,20 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
// Check that the reschedule will not break dependencies.
unsigned NumVisited = 0;
- MachineBasicBlock::iterator KillPos = KillMI;
- for (MachineBasicBlock::iterator I = mi; I != KillPos; ++I) {
- MachineInstr *OtherMI = I;
+ for (MachineInstr &OtherMI :
+ llvm::make_range(mi, MachineBasicBlock::iterator(KillMI))) {
// DBG_VALUE cannot be counted against the limit.
- if (OtherMI->isDebugValue())
+ if (OtherMI.isDebugValue())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
++NumVisited;
- if (OtherMI->hasUnmodeledSideEffects() || OtherMI->isCall() ||
- OtherMI->isBranch() || OtherMI->isTerminator())
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
// Don't move past calls, etc.
return false;
SmallVector<unsigned, 2> OtherDefs;
- for (const MachineOperand &MO : OtherMI->operands()) {
+ for (const MachineOperand &MO : OtherMI.operands()) {
if (!MO.isReg())
continue;
unsigned MOReg = MO.getReg();
@@ -1102,8 +1097,8 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
if (Kills.count(MOReg))
// Don't want to extend other live ranges and update kills.
return false;
- if (OtherMI != MI && MOReg == Reg &&
- !(MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS))))
+ if (&OtherMI != MI && MOReg == Reg &&
+ !(MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS))))
// We can't schedule across a use of the register in question.
return false;
} else {
@@ -1138,10 +1133,10 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
// Update live variables
if (LIS) {
- LIS->handleMove(KillMI);
+ LIS->handleMove(*KillMI);
} else {
- LV->removeVirtualRegisterKilled(Reg, KillMI);
- LV->addVirtualRegisterKilled(Reg, MI);
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
}
DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
@@ -1175,7 +1170,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
// other commutable operands and does not change the values of passed
// variables.
if (OtherOpIdx == BaseOpIdx ||
- !TII->findCommutedOpIndices(MI, BaseOpIdx, OtherOpIdx))
+ !TII->findCommutedOpIndices(*MI, BaseOpIdx, OtherOpIdx))
continue;
unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
@@ -1308,9 +1303,9 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
unsigned Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs;
- if (!TII->unfoldMemoryOperand(*MF, &MI, Reg,
- /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
- NewMIs)) {
+ if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs)) {
DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
return false;
}
@@ -1347,25 +1342,25 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
if (MO.isUse()) {
if (MO.isKill()) {
if (NewMIs[0]->killsRegister(MO.getReg()))
- LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[0]);
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[0]);
else {
assert(NewMIs[1]->killsRegister(MO.getReg()) &&
"Kill missing after load unfold!");
- LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[1]);
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[1]);
}
}
- } else if (LV->removeVirtualRegisterDead(MO.getReg(), &MI)) {
+ } else if (LV->removeVirtualRegisterDead(MO.getReg(), MI)) {
if (NewMIs[1]->registerDefIsDead(MO.getReg()))
- LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]);
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[1]);
else {
assert(NewMIs[0]->registerDefIsDead(MO.getReg()) &&
"Dead flag missing after load unfold!");
- LV->addVirtualRegisterDead(MO.getReg(), NewMIs[0]);
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[0]);
}
}
}
}
- LV->addVirtualRegisterKilled(Reg, NewMIs[1]);
+ LV->addVirtualRegisterKilled(Reg, *NewMIs[1]);
}
SmallVector<unsigned, 4> OrigRegs;
@@ -1518,17 +1513,17 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
// Update DistanceMap.
MachineBasicBlock::iterator PrevMI = MI;
--PrevMI;
- DistanceMap.insert(std::make_pair(PrevMI, Dist));
+ DistanceMap.insert(std::make_pair(&*PrevMI, Dist));
DistanceMap[MI] = ++Dist;
if (LIS) {
- LastCopyIdx = LIS->InsertMachineInstrInMaps(PrevMI).getRegSlot();
+ LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot();
if (TargetRegisterInfo::isVirtualRegister(RegA)) {
LiveInterval &LI = LIS->getInterval(RegA);
VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
SlotIndex endIdx =
- LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber);
+ LIS->getInstructionIndex(*MI).getRegSlot(IsEarlyClobber);
LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
}
}
@@ -1574,16 +1569,16 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
}
// Update live variables for regB.
- if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(MI)) {
+ if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(*MI)) {
MachineBasicBlock::iterator PrevMI = MI;
--PrevMI;
- LV->addVirtualRegisterKilled(RegB, PrevMI);
+ LV->addVirtualRegisterKilled(RegB, *PrevMI);
}
// Update LiveIntervals.
if (LIS) {
LiveInterval &LI = LIS->getInterval(RegB);
- SlotIndex MIIdx = LIS->getInstructionIndex(MI);
+ SlotIndex MIIdx = LIS->getInstructionIndex(*MI);
LiveInterval::const_iterator I = LI.find(MIIdx);
assert(I != LI.end() && "RegB must be live-in to use.");
@@ -1650,13 +1645,13 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
if (mi->isRegSequence())
eliminateRegSequence(mi);
- DistanceMap.insert(std::make_pair(mi, ++Dist));
+ DistanceMap.insert(std::make_pair(&*mi, ++Dist));
processCopy(&*mi);
// First scan through all the tied register uses in this instruction
// and record a list of pairs of tied operands for each register.
- if (!collectTiedOperands(mi, TiedOperands)) {
+ if (!collectTiedOperands(&*mi, TiedOperands)) {
mi = nmi;
continue;
}
@@ -1689,7 +1684,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
// Now iterate over the information collected above.
for (auto &TO : TiedOperands) {
- processTiedPairs(mi, TO.second, Dist);
+ processTiedPairs(&*mi, TO.second, Dist);
DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
}
@@ -1733,27 +1728,27 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
///
void TwoAddressInstructionPass::
eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
- MachineInstr *MI = MBBI;
- unsigned DstReg = MI->getOperand(0).getReg();
- if (MI->getOperand(0).getSubReg() ||
+ MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() ||
TargetRegisterInfo::isPhysicalRegister(DstReg) ||
- !(MI->getNumOperands() & 1)) {
- DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
+ !(MI.getNumOperands() & 1)) {
+ DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI);
llvm_unreachable(nullptr);
}
SmallVector<unsigned, 4> OrigRegs;
if (LIS) {
- OrigRegs.push_back(MI->getOperand(0).getReg());
- for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2)
- OrigRegs.push_back(MI->getOperand(i).getReg());
+ OrigRegs.push_back(MI.getOperand(0).getReg());
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
+ OrigRegs.push_back(MI.getOperand(i).getReg());
}
bool DefEmitted = false;
- for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
- MachineOperand &UseMO = MI->getOperand(i);
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
+ MachineOperand &UseMO = MI.getOperand(i);
unsigned SrcReg = UseMO.getReg();
- unsigned SubIdx = MI->getOperand(i+1).getImm();
+ unsigned SubIdx = MI.getOperand(i+1).getImm();
// Nothing needs to be inserted for <undef> operands.
if (UseMO.isUndef())
continue;
@@ -1763,18 +1758,18 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
bool isKill = UseMO.isKill();
if (isKill)
for (unsigned j = i + 2; j < e; j += 2)
- if (MI->getOperand(j).getReg() == SrcReg) {
- MI->getOperand(j).setIsKill();
+ if (MI.getOperand(j).getReg() == SrcReg) {
+ MI.getOperand(j).setIsKill();
UseMO.setIsKill(false);
isKill = false;
break;
}
// Insert the sub-register copy.
- MachineInstr *CopyMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(TargetOpcode::COPY))
- .addReg(DstReg, RegState::Define, SubIdx)
- .addOperand(UseMO);
+ .addReg(DstReg, RegState::Define, SubIdx)
+ .addOperand(UseMO);
// The first def needs an <undef> flag because there is no live register
// before it.
@@ -1787,7 +1782,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
// Update LiveVariables' kill info.
if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg))
- LV->replaceKillInstruction(SrcReg, MI, CopyMI);
+ LV->replaceKillInstruction(SrcReg, MI, *CopyMI);
DEBUG(dbgs() << "Inserted: " << *CopyMI);
}
@@ -1796,13 +1791,13 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
std::next(MachineBasicBlock::iterator(MI));
if (!DefEmitted) {
- DEBUG(dbgs() << "Turned: " << *MI << " into an IMPLICIT_DEF");
- MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
- for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
- MI->RemoveOperand(j);
+ DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF");
+ MI.setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
+ MI.RemoveOperand(j);
} else {
- DEBUG(dbgs() << "Eliminated: " << *MI);
- MI->eraseFromParent();
+ DEBUG(dbgs() << "Eliminated: " << MI);
+ MI.eraseFromParent();
}
// Update LiveIntervals.
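
The TwoAddressInstructionPass changes above all follow one pattern: LiveIntervals and LiveVariables entry points that used to take a MachineInstr pointer or a block iterator now take a MachineInstr reference, and hand-rolled iterator walks become range-based loops over llvm::make_range, taking an address only where a pointer is still required. The standalone sketch below mirrors that loop rewrite with plain standard-library types; SimpleRange and makeRange are stand-ins for the LLVM helpers, not the real API.

#include <iostream>
#include <iterator>
#include <list>

// Minimal stand-ins for llvm::iterator_range / llvm::make_range, used only to
// illustrate the loop rewrite in the hunks above; the pass itself works on
// MachineInstr with the LLVM helpers, not these toy types.
template <typename It> struct SimpleRange {
  It BeginIt, EndIt;
  It begin() const { return BeginIt; }
  It end() const { return EndIt; }
};
template <typename It> SimpleRange<It> makeRange(It B, It E) { return {B, E}; }

int main() {
  std::list<int> Instrs{10, 20, 30, 40, 50};
  auto First = std::next(Instrs.begin()); // plays the role of 'End'
  auto Last = std::prev(Instrs.end());    // plays the role of 'KillPos'

  // Old style: manual iterator walk, dereferencing to a pointer.
  for (auto I = First; I != Last; ++I) {
    int *OtherMI = &*I;
    std::cout << *OtherMI << ' ';
  }
  std::cout << '\n';

  // New style mirrored from the diff: range-based loop over references, with
  // &OtherMI used only where a pointer is still required.
  for (int &OtherMI : makeRange(First, Last))
    std::cout << OtherMI << ' ';
  std::cout << '\n';
}
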
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 8c9631e435bf..501e01c45a8b 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -20,7 +20,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/UnreachableBlockElim.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Dominators.h"
@@ -38,29 +39,7 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-namespace {
- class UnreachableBlockElim : public FunctionPass {
- bool runOnFunction(Function &F) override;
- public:
- static char ID; // Pass identification, replacement for typeid
- UnreachableBlockElim() : FunctionPass(ID) {
- initializeUnreachableBlockElimPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
- };
-}
-char UnreachableBlockElim::ID = 0;
-INITIALIZE_PASS(UnreachableBlockElim, "unreachableblockelim",
- "Remove unreachable blocks from the CFG", false, false)
-
-FunctionPass *llvm::createUnreachableBlockEliminationPass() {
- return new UnreachableBlockElim();
-}
-
-bool UnreachableBlockElim::runOnFunction(Function &F) {
+static bool eliminateUnreachableBlock(Function &F) {
SmallPtrSet<BasicBlock*, 8> Reachable;
// Mark all reachable blocks.
@@ -91,6 +70,41 @@ bool UnreachableBlockElim::runOnFunction(Function &F) {
return !DeadBlocks.empty();
}
+namespace {
+class UnreachableBlockElimLegacyPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ return eliminateUnreachableBlock(F);
+ }
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableBlockElimLegacyPass() : FunctionPass(ID) {
+ initializeUnreachableBlockElimLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+}
+char UnreachableBlockElimLegacyPass::ID = 0;
+INITIALIZE_PASS(UnreachableBlockElimLegacyPass, "unreachableblockelim",
+ "Remove unreachable blocks from the CFG", false, false)
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+ return new UnreachableBlockElimLegacyPass();
+}
+
+PreservedAnalyses UnreachableBlockElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = eliminateUnreachableBlock(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
namespace {
class UnreachableMachineBlockElim : public MachineFunctionPass {
@@ -184,9 +198,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
unsigned Input = phi->getOperand(1).getReg();
unsigned Output = phi->getOperand(0).getReg();
- MachineInstr* temp = phi;
- ++phi;
- temp->eraseFromParent();
+ phi++->eraseFromParent();
ModifiedPHI = true;
if (Input != Output) {
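
The UnreachableBlockElim rework above pulls the transformation into a static eliminateUnreachableBlock helper so that the legacy pass and the new pass manager's UnreachableBlockElimPass can share it, with the new-PM entry point reporting which analyses survive. The sketch below is a stripped-down, self-contained version of that dual-wrapper pattern; Function, PreservedAnalyses, and the wrapper names are simplified stand-ins, not the LLVM types.

#include <iostream>
#include <string>

// The core transform is a free function; both wrappers delegate to it.
struct Function {
  std::string Name;
  bool HasDeadBlocks;
};

static bool eliminateUnreachable(Function &F) {
  bool Changed = F.HasDeadBlocks; // pretend we deleted the dead blocks
  F.HasDeadBlocks = false;
  return Changed;
}

struct PreservedAnalyses {
  bool All;
  static PreservedAnalyses all() { return {true}; }
  static PreservedAnalyses some() { return {false}; }
};

// Legacy-style wrapper: reports only whether the IR changed.
struct LegacyPass {
  bool runOnFunction(Function &F) { return eliminateUnreachable(F); }
};

// New-PM-style wrapper: reports which analyses survive the transform (the
// real pass above keeps DominatorTreeAnalysis when it changes anything).
struct NewPMPass {
  PreservedAnalyses run(Function &F) {
    if (!eliminateUnreachable(F))
      return PreservedAnalyses::all();
    return PreservedAnalyses::some();
  }
};

int main() {
  Function F{"f", true};
  LegacyPass LP;
  std::cout << "legacy changed: " << LP.runOnFunction(F) << '\n';
  NewPMPass NP;
  std::cout << "new-PM preserved all: " << NP.run(F).All << '\n';
}
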
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index bf1c0dce9e56..8a3a0328870d 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -19,7 +19,6 @@
#include "llvm/CodeGen/VirtRegMap.h"
#include "LiveDebugVariables.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -29,7 +28,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -84,7 +82,7 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) {
unsigned Hint = MRI->getSimpleHint(VirtReg);
if (!Hint)
- return 0;
+ return false;
if (TargetRegisterInfo::isVirtualRegister(Hint))
Hint = getPhys(Hint);
return getPhys(VirtReg) == Hint;
@@ -139,7 +137,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VirtRegMap::dump() const {
+LLVM_DUMP_METHOD void VirtRegMap::dump() const {
print(dbgs());
}
#endif
@@ -168,6 +166,7 @@ class VirtRegRewriter : public MachineFunctionPass {
void addMBBLiveIns();
bool readsUndefSubreg(const MachineOperand &MO) const;
void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
+ void handleIdentityCopy(MachineInstr &MI) const;
public:
static char ID;
@@ -176,6 +175,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnMachineFunction(MachineFunction&) override;
+ MachineFunctionProperties getSetProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
} // end anonymous namespace
@@ -329,7 +332,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
unsigned Reg = MO.getReg();
const LiveInterval &LI = LIS->getInterval(Reg);
const MachineInstr &MI = *MO.getParent();
- SlotIndex BaseIndex = LIS->getInstructionIndex(&MI);
+ SlotIndex BaseIndex = LIS->getInstructionIndex(MI);
// This code is only meant to handle reading undefined subregisters which
// we couldn't properly detect before.
assert(LI.liveAt(BaseIndex) &&
@@ -344,6 +347,30 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
return true;
}
+void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
+ if (!MI.isIdentityCopy())
+ return;
+ DEBUG(dbgs() << "Identity copy: " << MI);
+ ++NumIdCopies;
+
+ // Copies like:
+ // %R0 = COPY %R0<undef>
+ // %AL = COPY %AL, %EAX<imp-def>
+ // give us additional liveness information: The target (super-)register
+ // must not be valid before this point. Replace the COPY with a KILL
+ // instruction to maintain this information.
+ if (MI.getOperand(0).isUndef() || MI.getNumOperands() > 2) {
+ MI.setDesc(TII->get(TargetOpcode::KILL));
+ DEBUG(dbgs() << " replace by: " << MI);
+ return;
+ }
+
+ if (Indexes)
+ Indexes->removeMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ DEBUG(dbgs() << " deleted.\n");
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<unsigned, 8> SuperDeads;
@@ -433,16 +460,8 @@ void VirtRegRewriter::rewrite() {
DEBUG(dbgs() << "> " << *MI);
- // Finally, remove any identity copies.
- if (MI->isIdentityCopy()) {
- ++NumIdCopies;
- DEBUG(dbgs() << "Deleting identity copy.\n");
- if (Indexes)
- Indexes->removeMachineInstrFromMaps(MI);
- // It's safe to erase MI because MII has already been incremented.
- MI->eraseFromParent();
- }
+ // We can remove identity copies right now.
+ handleIdentityCopy(*MI);
}
}
}
-
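
The new handleIdentityCopy above makes the identity-copy cleanup explicit: a COPY whose destination carries an undef flag, or which has extra implicit operands, still conveys liveness information and is therefore rewritten into a KILL rather than erased. Below is a standalone sketch of that three-way decision; the Instr and Operand types are toy stand-ins for MachineInstr and its operands.

#include <cstdio>
#include <vector>

// Only the fields that the decision above inspects are modeled here.
struct Operand {
  bool IsUndef;
};
struct Instr {
  bool IdentityCopy;
  std::vector<Operand> Ops; // operand 0 is the destination
};

enum class Action { Keep, ReplaceWithKill, Erase };

static Action classifyIdentityCopy(const Instr &MI) {
  if (!MI.IdentityCopy)
    return Action::Keep;
  // An <undef> destination or extra (implicit) operands mean the copy still
  // conveys liveness information, so it becomes a KILL instead of vanishing.
  if (MI.Ops[0].IsUndef || MI.Ops.size() > 2)
    return Action::ReplaceWithKill;
  return Action::Erase;
}

int main() {
  Instr Plain{true, {{false}, {false}}};               // %R0 = COPY %R0
  Instr WithImpDef{true, {{false}, {false}, {false}}}; // %AL = COPY %AL, imp-def
  std::printf("plain: %d, imp-def: %d\n",
              static_cast<int>(classifyIdentityCopy(Plain)),
              static_cast<int>(classifyIdentityCopy(WithImpDef)));
}
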
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index 14ec91159809..041fb7b912bf 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -254,9 +254,11 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow;
for (const User *U : CatchPad->users()) {
const auto *UserI = cast<Instruction>(U);
- if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI))
- if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest())
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
calculateCXXStateNumbers(FuncInfo, UserI, CatchLow);
+ }
if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
// If a nested cleanup pad reports a null unwind destination and the
@@ -361,9 +363,11 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
// outside the __try.
for (const User *U : CatchPad->users()) {
const auto *UserI = cast<Instruction>(U);
- if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI))
- if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest())
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
calculateSEHStateNumbers(FuncInfo, UserI, ParentState);
+ }
if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
// If a nested cleanup pad reports a null unwind destination and the
@@ -783,7 +787,7 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
// Loop over all instructions, fixing each one as we find it...
for (Instruction &I : *BB)
RemapInstruction(&I, VMap,
- RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+ RF_IgnoreMissingLocals | RF_NoModuleLevelChanges);
// Catchrets targeting cloned blocks need to be updated separately from
// the loop above because they are not in the current funclet.
@@ -795,7 +799,7 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
FixupCatchrets.clear();
for (BasicBlock *Pred : predecessors(OldBlock))
if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator()))
- if (CatchRet->getParentPad() == FuncletToken)
+ if (CatchRet->getCatchSwitchParentPad() == FuncletToken)
FixupCatchrets.push_back(CatchRet);
for (CatchReturnInst *CatchRet : FixupCatchrets)
@@ -810,7 +814,7 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
bool EdgeTargetsFunclet;
if (auto *CRI =
dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
- EdgeTargetsFunclet = (CRI->getParentPad() == FuncletToken);
+ EdgeTargetsFunclet = (CRI->getCatchSwitchParentPad() == FuncletToken);
} else {
ColorVector &IncomingColors = BlockColors[IncomingBlock];
assert(!IncomingColors.empty() && "Block not colored!");
@@ -944,10 +948,11 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) {
if (FuncletBundleOperand == FuncletPad)
continue;
- // Skip call sites which are nounwind intrinsics.
+ // Skip call sites which are nounwind intrinsics or inline asm.
auto *CalledFn =
dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
- if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow())
+ if (CalledFn && ((CalledFn->isIntrinsic() && CS.doesNotThrow()) ||
+ CS.isInlineAsm()))
continue;
// This call site was not part of this funclet, remove it.
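
Both WinEH state-numbering loops above now treat a null unwind destination (an inner catchswitch that unwinds to the caller) the same as one that matches the enclosing catchswitch's destination. A minimal sketch of that predicate, using a toy BasicBlock type rather than LLVM IR:

#include <iostream>

// Only pointer identity matters for this check.
struct BasicBlock {
  int Id;
};

// A null destination means "unwinds to caller" and is treated like a match
// with the enclosing catchswitch's unwind destination.
static bool sharesUnwindDest(const BasicBlock *Inner, const BasicBlock *Outer) {
  return !Inner || Inner == Outer;
}

int main() {
  BasicBlock A{1}, B{2};
  std::cout << sharesUnwindDest(nullptr, &A) << ' ' // unwind-to-caller
            << sharesUnwindDest(&A, &A) << ' '      // same destination
            << sharesUnwindDest(&A, &B) << '\n';    // different destination
}
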
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
new file mode 100644
index 000000000000..1f9570895f9d
--- /dev/null
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -0,0 +1,96 @@
+//===-- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a MachineFunctionPass that inserts the appropriate
+// XRay instrumentation instructions. We look for XRay-specific attributes
+// on the function to determine whether we should insert the replacement
+// operations.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct XRayInstrumentation : public MachineFunctionPass {
+ static char ID;
+
+ XRayInstrumentation() : MachineFunctionPass(ID) {
+ initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
+ auto &F = *MF.getFunction();
+ auto InstrAttr = F.getFnAttribute("function-instrument");
+ bool AlwaysInstrument = !InstrAttr.hasAttribute(Attribute::None) &&
+ InstrAttr.isStringAttribute() &&
+ InstrAttr.getValueAsString() == "xray-always";
+ Attribute Attr = F.getFnAttribute("xray-instruction-threshold");
+ unsigned XRayThreshold = 0;
+ if (!AlwaysInstrument) {
+ if (Attr.hasAttribute(Attribute::None) || !Attr.isStringAttribute())
+ return false; // XRay threshold attribute not found.
+ if (Attr.getValueAsString().getAsInteger(10, XRayThreshold))
+ return false; // Invalid value for threshold.
+ if (F.size() < XRayThreshold)
+ return false; // Function is too small.
+ }
+
+ // FIXME: Do the loop triviality analysis here or in an earlier pass.
+
+ // First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the
+ // MachineFunction.
+ auto &FirstMBB = *MF.begin();
+ auto &FirstMI = *FirstMBB.begin();
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
+
+ // Then we look for *all* terminators and returns, then replace those with
+ // PATCHABLE_RET instructions.
+ SmallVector<MachineInstr *, 4> Terminators;
+ for (auto &MBB : MF) {
+ for (auto &T : MBB.terminators()) {
+ // FIXME: Handle tail calls here too?
+ if (T.isReturn() && T.getOpcode() == TII->getReturnOpcode()) {
+ // Replace return instructions with:
+ // PATCHABLE_RET <Opcode>, <Operand>...
+ auto MIB = BuildMI(MBB, T, T.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_RET))
+ .addImm(T.getOpcode());
+ for (auto &MO : T.operands())
+ MIB.addOperand(MO);
+ Terminators.push_back(&T);
+ break;
+ }
+ }
+ }
+
+ for (auto &I : Terminators)
+ I->eraseFromParent();
+
+ return true;
+}
+
+char XRayInstrumentation::ID = 0;
+char &llvm::XRayInstrumentationID = XRayInstrumentation::ID;
+INITIALIZE_PASS(XRayInstrumentation, "xray-instrumentation", "Insert XRay ops",
+ false, false)
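
The new XRayInstrumentation pass decides whether to touch a function from two string attributes: "function-instrument"="xray-always" forces instrumentation, otherwise "xray-instruction-threshold" must parse as an integer and the function must be at least that large. The self-contained sketch below reproduces that decision with a plain attribute map standing in for LLVM's function attributes; exception-based parsing is used only for brevity.

#include <iostream>
#include <map>
#include <string>

// The attribute names match the ones read by runOnMachineFunction above;
// the map itself is a stand-in for LLVM's attribute API.
using AttrMap = std::map<std::string, std::string>;

static bool shouldInstrument(const AttrMap &Attrs, unsigned FnSize) {
  auto It = Attrs.find("function-instrument");
  if (It != Attrs.end() && It->second == "xray-always")
    return true;
  It = Attrs.find("xray-instruction-threshold");
  if (It == Attrs.end())
    return false; // threshold attribute not found
  unsigned Threshold = 0;
  try {
    Threshold = static_cast<unsigned>(std::stoul(It->second));
  } catch (...) {
    return false; // invalid value for the threshold
  }
  return FnSize >= Threshold; // too-small functions are skipped
}

int main() {
  std::cout << shouldInstrument({{"function-instrument", "xray-always"}}, 1)
            << shouldInstrument({{"xray-instruction-threshold", "10"}}, 5)
            << shouldInstrument({{"xray-instruction-threshold", "10"}}, 50)
            << '\n'; // prints 101
}
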
diff --git a/lib/CodeGen/module.modulemap b/lib/CodeGen/module.modulemap
deleted file mode 100644
index d4f68bcc6eed..000000000000
--- a/lib/CodeGen/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module CodeGen { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/DebugInfo/CodeView/ByteStream.cpp b/lib/DebugInfo/CodeView/ByteStream.cpp
new file mode 100644
index 000000000000..2c43bc6958d2
--- /dev/null
+++ b/lib/DebugInfo/CodeView/ByteStream.cpp
@@ -0,0 +1,79 @@
+//===- ByteStream.cpp - Reads stream data from a byte sequence ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include <cstring>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Src,
+ ArrayRef<uint8_t> Dest) {
+ return make_error<CodeViewError>(cv_error_code::operation_unsupported,
+ "ByteStream is immutable.");
+}
+
+static Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Src,
+ MutableArrayRef<uint8_t> Dest) {
+ if (Dest.size() < Src.size())
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ if (Offset > Dest.size() - Src.size())
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+
+ ::memcpy(Dest.data() + Offset, Src.data(), Src.size());
+ return Error::success();
+}
+
+template <bool Writable>
+Error ByteStream<Writable>::readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) const {
+ if (Offset > Data.size())
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ if (Data.size() < Size + Offset)
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ Buffer = Data.slice(Offset, Size);
+ return Error::success();
+}
+
+template <bool Writable>
+Error ByteStream<Writable>::readLongestContiguousChunk(
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ if (Offset >= Data.size())
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ Buffer = Data.slice(Offset);
+ return Error::success();
+}
+
+template <bool Writable>
+Error ByteStream<Writable>::writeBytes(uint32_t Offset,
+ ArrayRef<uint8_t> Buffer) const {
+ return ::writeBytes(Offset, Buffer, Data);
+}
+
+template <bool Writable> uint32_t ByteStream<Writable>::getLength() const {
+ return Data.size();
+}
+
+template <bool Writable> Error ByteStream<Writable>::commit() const {
+ return Error::success();
+}
+
+template <bool Writable> StringRef ByteStream<Writable>::str() const {
+ const char *CharData = reinterpret_cast<const char *>(Data.data());
+ return StringRef(CharData, Data.size());
+}
+
+namespace llvm {
+namespace codeview {
+template class ByteStream<true>;
+template class ByteStream<false>;
+}
+}
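
ByteStream::readBytes above never copies: it bounds-checks the request and hands back a slice of the underlying buffer. The standalone analogue below shows the same idea with std::vector standing in for ArrayRef and a bool standing in for llvm::Error; the second check is written in the overflow-safe form (remaining size versus requested size) for clarity.

#include <cstdint>
#include <iostream>
#include <vector>

// Hand out a view into the existing buffer after two bounds checks.
static bool readBytes(const std::vector<uint8_t> &Data, uint32_t Offset,
                      uint32_t Size, const uint8_t *&Buffer) {
  if (Offset > Data.size())
    return false; // offset starts past the end
  if (Data.size() - Offset < Size)
    return false; // not enough bytes remaining
  Buffer = Data.data() + Offset;
  return true;
}

int main() {
  std::vector<uint8_t> Data{1, 2, 3, 4, 5};
  const uint8_t *View = nullptr;
  std::cout << readBytes(Data, 2, 3, View)  // fits: bytes 3, 4, 5
            << readBytes(Data, 4, 3, View)  // would run past the end
            << '\n';                        // prints 10
}
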
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index cfa0e4d8b401..47297a9131ee 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -1,10 +1,23 @@
add_llvm_library(LLVMDebugInfoCodeView
+ ByteStream.cpp
+ CodeViewError.cpp
+ CVTypeVisitor.cpp
+ EnumTables.cpp
FieldListRecordBuilder.cpp
Line.cpp
ListRecordBuilder.cpp
MemoryTypeTableBuilder.cpp
MethodListRecordBuilder.cpp
+ ModuleSubstream.cpp
+ ModuleSubstreamVisitor.cpp
+ RecordSerialization.cpp
+ StreamReader.cpp
+ StreamWriter.cpp
+ SymbolDumper.cpp
+ TypeDumper.cpp
+ TypeRecord.cpp
TypeRecordBuilder.cpp
+ TypeStreamMerger.cpp
TypeTableBuilder.cpp
ADDITIONAL_HEADER_DIRS
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
new file mode 100644
index 000000000000..09f72214c52b
--- /dev/null
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -0,0 +1,123 @@
+//===- CVTypeVisitor.cpp ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+template <typename T>
+static Error takeObject(ArrayRef<uint8_t> &Data, const T *&Res) {
+ if (Data.size() < sizeof(*Res))
+ return llvm::make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ Res = reinterpret_cast<const T *>(Data.data());
+ Data = Data.drop_front(sizeof(*Res));
+ return Error::success();
+}
+
+CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
+ : Callbacks(Callbacks) {}
+
+Error CVTypeVisitor::visitTypeRecord(const CVRecord<TypeLeafKind> &Record) {
+ ArrayRef<uint8_t> LeafData = Record.Data;
+ if (auto EC = Callbacks.visitTypeBegin(Record))
+ return EC;
+ switch (Record.Type) {
+ default:
+ if (auto EC = Callbacks.visitUnknownType(Record))
+ return EC;
+ break;
+ case LF_FIELDLIST:
+ if (auto EC = Callbacks.visitFieldListBegin(Record))
+ return EC;
+ if (auto EC = visitFieldList(Record))
+ return EC;
+ if (auto EC = Callbacks.visitFieldListEnd(Record))
+ return EC;
+ break;
+#define TYPE_RECORD(EnumName, EnumVal, Name) \
+ case EnumName: { \
+ TypeRecordKind RK = static_cast<TypeRecordKind>(EnumName); \
+ auto Result = Name##Record::deserialize(RK, LeafData); \
+ if (Result.getError()) \
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record); \
+ if (auto EC = Callbacks.visit##Name(*Result)) \
+ return EC; \
+ break; \
+ }
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
+ TYPE_RECORD(EnumVal, EnumVal, AliasName)
+#define MEMBER_RECORD(EnumName, EnumVal, Name)
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+ }
+ if (auto EC = Callbacks.visitTypeEnd(Record))
+ return EC;
+ return Error::success();
+}
+
+/// Visits the type records in Types and returns an error on parse failures.
+Error CVTypeVisitor::visitTypeStream(const CVTypeArray &Types) {
+ for (const auto &I : Types) {
+ if (auto EC = visitTypeRecord(I))
+ return EC;
+ }
+ return Error::success();
+}
+
+Error CVTypeVisitor::skipPadding(ArrayRef<uint8_t> &Data) {
+ if (Data.empty())
+ return Error::success();
+ uint8_t Leaf = Data.front();
+ if (Leaf < LF_PAD0)
+ return Error::success();
+ // Leaf is 0xf0 or greater, so this is a padding marker; advance by the
+ // number of bytes encoded in its low 4 bits.
+ unsigned BytesToAdvance = Leaf & 0x0F;
+ if (Data.size() < BytesToAdvance) {
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record,
+ "Invalid padding bytes!");
+ }
+ Data = Data.drop_front(BytesToAdvance);
+ return Error::success();
+}
+
+/// Visits individual member records of a field list record. Member records do
+/// not describe their own length, and need special handling.
+Error CVTypeVisitor::visitFieldList(const CVRecord<TypeLeafKind> &Record) {
+ ArrayRef<uint8_t> RecordData = Record.Data;
+ while (!RecordData.empty()) {
+ const ulittle16_t *LeafPtr;
+ if (auto EC = takeObject(RecordData, LeafPtr))
+ return EC;
+ TypeLeafKind Leaf = TypeLeafKind(unsigned(*LeafPtr));
+ switch (Leaf) {
+ default:
+ // Field list records do not describe their own length, so we cannot
+ // continue parsing past an unknown member type.
+ if (auto EC = Callbacks.visitUnknownMember(Record))
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ case EnumName: { \
+ TypeRecordKind RK = static_cast<TypeRecordKind>(EnumName); \
+ auto Result = Name##Record::deserialize(RK, RecordData); \
+ if (Result.getError()) \
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record); \
+ if (auto EC = Callbacks.visit##Name(*Result)) \
+ return EC; \
+ break; \
+ }
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
+ MEMBER_RECORD(EnumVal, EnumVal, AliasName)
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+ }
+ if (auto EC = skipPadding(RecordData))
+ return EC;
+ }
+ return Error::success();
+}
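
Member records inside a field list are padded so the next record starts on a 4-byte boundary, and skipPadding above consumes that padding: any byte of 0xF0 or above is an LF_PAD marker whose low nibble gives the number of bytes to skip. A self-contained sketch of the same rule over a raw byte vector:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Bytes below 0xF0 are record data; 0xF0..0xFF are LF_PAD markers whose low
// nibble counts the bytes (marker included) to skip.
static std::size_t skipPadding(const std::vector<uint8_t> &Data,
                               std::size_t Pos) {
  if (Pos >= Data.size())
    return Pos;
  uint8_t Leaf = Data[Pos];
  if (Leaf < 0xF0)
    return Pos; // not padding
  unsigned BytesToAdvance = Leaf & 0x0F;
  return Pos + BytesToAdvance;
}

int main() {
  //                       data  PAD3  PAD2  PAD1  next record
  std::vector<uint8_t> Data{0x42, 0xF3, 0xF2, 0xF1, 0x07};
  std::cout << skipPadding(Data, 1) << '\n'; // prints 4: lands on 0x07
}
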
diff --git a/lib/DebugInfo/CodeView/CodeViewError.cpp b/lib/DebugInfo/CodeView/CodeViewError.cpp
new file mode 100644
index 000000000000..aad1d8b25cd0
--- /dev/null
+++ b/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -0,0 +1,67 @@
+//===- CodeViewError.cpp - Error extensions for CodeView --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class CodeViewErrorCategory : public std::error_category {
+public:
+ const char *name() const LLVM_NOEXCEPT override { return "llvm.codeview"; }
+
+ std::string message(int Condition) const override {
+ switch (static_cast<cv_error_code>(Condition)) {
+ case cv_error_code::unspecified:
+ return "An unknown error has occurred.";
+ case cv_error_code::insufficient_buffer:
+ return "The buffer is not large enough to read the requested number of "
+ "bytes.";
+ case cv_error_code::corrupt_record:
+ return "The CodeView record is corrupted.";
+ case cv_error_code::operation_unsupported:
+ return "The requested operation is not supported.";
+ }
+ llvm_unreachable("Unrecognized cv_error_code");
+ }
+};
+} // end anonymous namespace
+
+static ManagedStatic<CodeViewErrorCategory> Category;
+
+char CodeViewError::ID = 0;
+
+CodeViewError::CodeViewError(cv_error_code C) : CodeViewError(C, "") {}
+
+CodeViewError::CodeViewError(const std::string &Context)
+ : CodeViewError(cv_error_code::unspecified, Context) {}
+
+CodeViewError::CodeViewError(cv_error_code C, const std::string &Context)
+ : Code(C) {
+ ErrMsg = "CodeView Error: ";
+ std::error_code EC = convertToErrorCode();
+ if (Code != cv_error_code::unspecified)
+ ErrMsg += EC.message() + " ";
+ if (!Context.empty())
+ ErrMsg += Context;
+}
+
+void CodeViewError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+const std::string &CodeViewError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code CodeViewError::convertToErrorCode() const {
+ return std::error_code(static_cast<int>(Code), *Category);
+}
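
CodeViewError pairs an llvm::Error subclass with a std::error_category so that cv_error_code values can still be surfaced as std::error_code during the Error transition. The sketch below keeps only the standard-library half of that pattern, an enum plus a category with name() and message(); the llvm::Error plumbing and the ManagedStatic registration are omitted.

#include <iostream>
#include <string>
#include <system_error>

// The enumerators mirror cv_error_code; CodeViewError itself is not modeled.
enum class demo_error_code { unspecified = 1, insufficient_buffer, corrupt_record };

class DemoErrorCategory : public std::error_category {
public:
  const char *name() const noexcept override { return "demo.codeview"; }
  std::string message(int Condition) const override {
    switch (static_cast<demo_error_code>(Condition)) {
    case demo_error_code::unspecified:
      return "An unknown error has occurred.";
    case demo_error_code::insufficient_buffer:
      return "The buffer is not large enough.";
    case demo_error_code::corrupt_record:
      return "The record is corrupted.";
    }
    return "Unrecognized error code.";
  }
};

static DemoErrorCategory Category;

static std::error_code makeDemoError(demo_error_code C) {
  return std::error_code(static_cast<int>(C), Category);
}

int main() {
  std::error_code EC = makeDemoError(demo_error_code::corrupt_record);
  std::cout << EC.category().name() << ": " << EC.message() << '\n';
}
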
diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp
new file mode 100644
index 000000000000..d59271b2367e
--- /dev/null
+++ b/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -0,0 +1,375 @@
+//===- EnumTables.cpp - Enum to string conversion tables --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/EnumTables.h"
+
+using namespace llvm;
+using namespace codeview;
+
+#define CV_ENUM_CLASS_ENT(enum_class, enum) \
+ { #enum, std::underlying_type < enum_class > ::type(enum_class::enum) }
+
+#define CV_ENUM_ENT(ns, enum) \
+ { #enum, ns::enum }
+
+static const EnumEntry<SymbolKind> SymbolTypeNames[] = {
+#define CV_SYMBOL(enum, val) {#enum, enum},
+#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#undef CV_SYMBOL
+};
+
+static const EnumEntry<uint16_t> RegisterNames[] = {
+ CV_ENUM_CLASS_ENT(RegisterId, Unknown),
+ CV_ENUM_CLASS_ENT(RegisterId, VFrame),
+ CV_ENUM_CLASS_ENT(RegisterId, AL),
+ CV_ENUM_CLASS_ENT(RegisterId, CL),
+ CV_ENUM_CLASS_ENT(RegisterId, DL),
+ CV_ENUM_CLASS_ENT(RegisterId, BL),
+ CV_ENUM_CLASS_ENT(RegisterId, AH),
+ CV_ENUM_CLASS_ENT(RegisterId, CH),
+ CV_ENUM_CLASS_ENT(RegisterId, DH),
+ CV_ENUM_CLASS_ENT(RegisterId, BH),
+ CV_ENUM_CLASS_ENT(RegisterId, AX),
+ CV_ENUM_CLASS_ENT(RegisterId, CX),
+ CV_ENUM_CLASS_ENT(RegisterId, DX),
+ CV_ENUM_CLASS_ENT(RegisterId, BX),
+ CV_ENUM_CLASS_ENT(RegisterId, SP),
+ CV_ENUM_CLASS_ENT(RegisterId, BP),
+ CV_ENUM_CLASS_ENT(RegisterId, SI),
+ CV_ENUM_CLASS_ENT(RegisterId, DI),
+ CV_ENUM_CLASS_ENT(RegisterId, EAX),
+ CV_ENUM_CLASS_ENT(RegisterId, ECX),
+ CV_ENUM_CLASS_ENT(RegisterId, EDX),
+ CV_ENUM_CLASS_ENT(RegisterId, EBX),
+ CV_ENUM_CLASS_ENT(RegisterId, ESP),
+ CV_ENUM_CLASS_ENT(RegisterId, EBP),
+ CV_ENUM_CLASS_ENT(RegisterId, ESI),
+ CV_ENUM_CLASS_ENT(RegisterId, EDI),
+ CV_ENUM_CLASS_ENT(RegisterId, ES),
+ CV_ENUM_CLASS_ENT(RegisterId, CS),
+ CV_ENUM_CLASS_ENT(RegisterId, SS),
+ CV_ENUM_CLASS_ENT(RegisterId, DS),
+ CV_ENUM_CLASS_ENT(RegisterId, FS),
+ CV_ENUM_CLASS_ENT(RegisterId, GS),
+ CV_ENUM_CLASS_ENT(RegisterId, IP),
+ CV_ENUM_CLASS_ENT(RegisterId, RAX),
+ CV_ENUM_CLASS_ENT(RegisterId, RBX),
+ CV_ENUM_CLASS_ENT(RegisterId, RCX),
+ CV_ENUM_CLASS_ENT(RegisterId, RDX),
+ CV_ENUM_CLASS_ENT(RegisterId, RSI),
+ CV_ENUM_CLASS_ENT(RegisterId, RDI),
+ CV_ENUM_CLASS_ENT(RegisterId, RBP),
+ CV_ENUM_CLASS_ENT(RegisterId, RSP),
+ CV_ENUM_CLASS_ENT(RegisterId, R8),
+ CV_ENUM_CLASS_ENT(RegisterId, R9),
+ CV_ENUM_CLASS_ENT(RegisterId, R10),
+ CV_ENUM_CLASS_ENT(RegisterId, R11),
+ CV_ENUM_CLASS_ENT(RegisterId, R12),
+ CV_ENUM_CLASS_ENT(RegisterId, R13),
+ CV_ENUM_CLASS_ENT(RegisterId, R14),
+ CV_ENUM_CLASS_ENT(RegisterId, R15),
+};
+
+static const EnumEntry<uint8_t> ProcSymFlagNames[] = {
+ CV_ENUM_CLASS_ENT(ProcSymFlags, HasFP),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, HasIRET),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, HasFRET),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, IsNoReturn),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, IsUnreachable),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, HasCustomCallingConv),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, IsNoInline),
+ CV_ENUM_CLASS_ENT(ProcSymFlags, HasOptimizedDebugInfo),
+};
+
+static const EnumEntry<uint16_t> LocalFlags[] = {
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsParameter),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsAddressTaken),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsCompilerGenerated),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsAggregate),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsAggregated),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsAliased),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsAlias),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsReturnValue),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsOptimizedOut),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsEnregisteredGlobal),
+ CV_ENUM_CLASS_ENT(LocalSymFlags, IsEnregisteredStatic),
+};
+
+static const EnumEntry<uint8_t> FrameCookieKinds[] = {
+ CV_ENUM_CLASS_ENT(FrameCookieKind, Copy),
+ CV_ENUM_CLASS_ENT(FrameCookieKind, XorStackPointer),
+ CV_ENUM_CLASS_ENT(FrameCookieKind, XorFramePointer),
+ CV_ENUM_CLASS_ENT(FrameCookieKind, XorR13),
+};
+
+static const EnumEntry<codeview::SourceLanguage> SourceLanguages[] = {
+ CV_ENUM_ENT(SourceLanguage, C), CV_ENUM_ENT(SourceLanguage, Cpp),
+ CV_ENUM_ENT(SourceLanguage, Fortran), CV_ENUM_ENT(SourceLanguage, Masm),
+ CV_ENUM_ENT(SourceLanguage, Pascal), CV_ENUM_ENT(SourceLanguage, Basic),
+ CV_ENUM_ENT(SourceLanguage, Cobol), CV_ENUM_ENT(SourceLanguage, Link),
+ CV_ENUM_ENT(SourceLanguage, Cvtres), CV_ENUM_ENT(SourceLanguage, Cvtpgd),
+ CV_ENUM_ENT(SourceLanguage, CSharp), CV_ENUM_ENT(SourceLanguage, VB),
+ CV_ENUM_ENT(SourceLanguage, ILAsm), CV_ENUM_ENT(SourceLanguage, Java),
+ CV_ENUM_ENT(SourceLanguage, JScript), CV_ENUM_ENT(SourceLanguage, MSIL),
+ CV_ENUM_ENT(SourceLanguage, HLSL),
+};
+
+static const EnumEntry<uint32_t> CompileSym2FlagNames[] = {
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, EC),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, NoDbgInfo),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, LTCG),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, NoDataAlign),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, ManagedPresent),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, SecurityChecks),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, HotPatch),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, CVTCIL),
+ CV_ENUM_CLASS_ENT(CompileSym2Flags, MSILModule),
+};
+
+static const EnumEntry<uint32_t> CompileSym3FlagNames[] = {
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, EC),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, NoDbgInfo),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, LTCG),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, NoDataAlign),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, ManagedPresent),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, SecurityChecks),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, HotPatch),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, CVTCIL),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, MSILModule),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, Sdl),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, PGO),
+ CV_ENUM_CLASS_ENT(CompileSym3Flags, Exp),
+};
+
+static const EnumEntry<uint32_t> FileChecksumNames[] = {
+ CV_ENUM_CLASS_ENT(FileChecksumKind, None),
+ CV_ENUM_CLASS_ENT(FileChecksumKind, MD5),
+ CV_ENUM_CLASS_ENT(FileChecksumKind, SHA1),
+ CV_ENUM_CLASS_ENT(FileChecksumKind, SHA256),
+};
+
+static const EnumEntry<unsigned> CPUTypeNames[] = {
+ CV_ENUM_CLASS_ENT(CPUType, Intel8080),
+ CV_ENUM_CLASS_ENT(CPUType, Intel8086),
+ CV_ENUM_CLASS_ENT(CPUType, Intel80286),
+ CV_ENUM_CLASS_ENT(CPUType, Intel80386),
+ CV_ENUM_CLASS_ENT(CPUType, Intel80486),
+ CV_ENUM_CLASS_ENT(CPUType, Pentium),
+ CV_ENUM_CLASS_ENT(CPUType, PentiumPro),
+ CV_ENUM_CLASS_ENT(CPUType, Pentium3),
+ CV_ENUM_CLASS_ENT(CPUType, MIPS),
+ CV_ENUM_CLASS_ENT(CPUType, MIPS16),
+ CV_ENUM_CLASS_ENT(CPUType, MIPS32),
+ CV_ENUM_CLASS_ENT(CPUType, MIPS64),
+ CV_ENUM_CLASS_ENT(CPUType, MIPSI),
+ CV_ENUM_CLASS_ENT(CPUType, MIPSII),
+ CV_ENUM_CLASS_ENT(CPUType, MIPSIII),
+ CV_ENUM_CLASS_ENT(CPUType, MIPSIV),
+ CV_ENUM_CLASS_ENT(CPUType, MIPSV),
+ CV_ENUM_CLASS_ENT(CPUType, M68000),
+ CV_ENUM_CLASS_ENT(CPUType, M68010),
+ CV_ENUM_CLASS_ENT(CPUType, M68020),
+ CV_ENUM_CLASS_ENT(CPUType, M68030),
+ CV_ENUM_CLASS_ENT(CPUType, M68040),
+ CV_ENUM_CLASS_ENT(CPUType, Alpha),
+ CV_ENUM_CLASS_ENT(CPUType, Alpha21164),
+ CV_ENUM_CLASS_ENT(CPUType, Alpha21164A),
+ CV_ENUM_CLASS_ENT(CPUType, Alpha21264),
+ CV_ENUM_CLASS_ENT(CPUType, Alpha21364),
+ CV_ENUM_CLASS_ENT(CPUType, PPC601),
+ CV_ENUM_CLASS_ENT(CPUType, PPC603),
+ CV_ENUM_CLASS_ENT(CPUType, PPC604),
+ CV_ENUM_CLASS_ENT(CPUType, PPC620),
+ CV_ENUM_CLASS_ENT(CPUType, PPCFP),
+ CV_ENUM_CLASS_ENT(CPUType, PPCBE),
+ CV_ENUM_CLASS_ENT(CPUType, SH3),
+ CV_ENUM_CLASS_ENT(CPUType, SH3E),
+ CV_ENUM_CLASS_ENT(CPUType, SH3DSP),
+ CV_ENUM_CLASS_ENT(CPUType, SH4),
+ CV_ENUM_CLASS_ENT(CPUType, SHMedia),
+ CV_ENUM_CLASS_ENT(CPUType, ARM3),
+ CV_ENUM_CLASS_ENT(CPUType, ARM4),
+ CV_ENUM_CLASS_ENT(CPUType, ARM4T),
+ CV_ENUM_CLASS_ENT(CPUType, ARM5),
+ CV_ENUM_CLASS_ENT(CPUType, ARM5T),
+ CV_ENUM_CLASS_ENT(CPUType, ARM6),
+ CV_ENUM_CLASS_ENT(CPUType, ARM_XMAC),
+ CV_ENUM_CLASS_ENT(CPUType, ARM_WMMX),
+ CV_ENUM_CLASS_ENT(CPUType, ARM7),
+ CV_ENUM_CLASS_ENT(CPUType, Omni),
+ CV_ENUM_CLASS_ENT(CPUType, Ia64),
+ CV_ENUM_CLASS_ENT(CPUType, Ia64_2),
+ CV_ENUM_CLASS_ENT(CPUType, CEE),
+ CV_ENUM_CLASS_ENT(CPUType, AM33),
+ CV_ENUM_CLASS_ENT(CPUType, M32R),
+ CV_ENUM_CLASS_ENT(CPUType, TriCore),
+ CV_ENUM_CLASS_ENT(CPUType, X64),
+ CV_ENUM_CLASS_ENT(CPUType, EBC),
+ CV_ENUM_CLASS_ENT(CPUType, Thumb),
+ CV_ENUM_CLASS_ENT(CPUType, ARMNT),
+ CV_ENUM_CLASS_ENT(CPUType, D3D11_Shader),
+};
+
+static const EnumEntry<uint32_t> FrameProcSymFlagNames[] = {
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasAlloca),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasSetJmp),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasLongJmp),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasInlineAssembly),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasExceptionHandling),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, MarkedInline),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, HasStructuredExceptionHandling),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, Naked),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, SecurityChecks),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, AsynchronousExceptionHandling),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, NoStackOrderingForSecurityChecks),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, Inlined),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, StrictSecurityChecks),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, SafeBuffers),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, ProfileGuidedOptimization),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, ValidProfileCounts),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, OptimizedForSpeed),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, GuardCfg),
+ CV_ENUM_CLASS_ENT(FrameProcedureOptions, GuardCfw),
+};
+
+static const EnumEntry<uint32_t> ModuleSubstreamKindNames[] = {
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, None),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, Symbols),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, Lines),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, StringTable),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FileChecksums),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FrameData),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, InlineeLines),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CrossScopeImports),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CrossScopeExports),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, ILLines),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FuncMDTokenMap),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, TypeMDTokenMap),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, MergedAssemblyInput),
+ CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CoffSymbolRVA),
+};
+
+static const EnumEntry<uint16_t> ExportSymFlagNames[] = {
+ CV_ENUM_CLASS_ENT(ExportFlags, IsConstant),
+ CV_ENUM_CLASS_ENT(ExportFlags, IsData),
+ CV_ENUM_CLASS_ENT(ExportFlags, IsPrivate),
+ CV_ENUM_CLASS_ENT(ExportFlags, HasNoName),
+ CV_ENUM_CLASS_ENT(ExportFlags, HasExplicitOrdinal),
+ CV_ENUM_CLASS_ENT(ExportFlags, IsForwarder),
+};
+
+static const EnumEntry<uint8_t> ThunkOrdinalNames[] = {
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, Standard),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, ThisAdjustor),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, Vcall),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, Pcode),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, UnknownLoad),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, TrampIncremental),
+ CV_ENUM_CLASS_ENT(ThunkOrdinal, BranchIsland),
+};
+
+static const EnumEntry<uint16_t> TrampolineNames[] = {
+ CV_ENUM_CLASS_ENT(TrampolineType, TrampIncremental),
+ CV_ENUM_CLASS_ENT(TrampolineType, BranchIsland),
+};
+
+static const EnumEntry<COFF::SectionCharacteristics>
+ ImageSectionCharacteristicNames[] = {
+ CV_ENUM_ENT(COFF, IMAGE_SCN_TYPE_NOLOAD),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_TYPE_NO_PAD),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_CNT_CODE),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_CNT_INITIALIZED_DATA),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_CNT_UNINITIALIZED_DATA),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_LNK_OTHER),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_LNK_INFO),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_LNK_REMOVE),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_LNK_COMDAT),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_GPREL),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_PURGEABLE),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_16BIT),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_LOCKED),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_PRELOAD),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_1BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_2BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_4BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_8BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_16BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_32BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_64BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_128BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_256BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_512BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_1024BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_2048BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_4096BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_ALIGN_8192BYTES),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_LNK_NRELOC_OVFL),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_DISCARDABLE),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_NOT_CACHED),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_NOT_PAGED),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_SHARED),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_EXECUTE),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_READ),
+ CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_WRITE)};
+
+namespace llvm {
+namespace codeview {
+ArrayRef<EnumEntry<SymbolKind>> getSymbolTypeNames() {
+ return makeArrayRef(SymbolTypeNames);
+}
+
+ArrayRef<EnumEntry<uint16_t>> getRegisterNames() {
+ return makeArrayRef(RegisterNames);
+}
+
+ArrayRef<EnumEntry<uint8_t>> getProcSymFlagNames() {
+ return makeArrayRef(ProcSymFlagNames);
+}
+ArrayRef<EnumEntry<uint16_t>> getLocalFlagNames() {
+ return makeArrayRef(LocalFlags);
+}
+ArrayRef<EnumEntry<uint8_t>> getFrameCookieKindNames() {
+ return makeArrayRef(FrameCookieKinds);
+}
+ArrayRef<EnumEntry<SourceLanguage>> getSourceLanguageNames() {
+ return makeArrayRef(SourceLanguages);
+}
+ArrayRef<EnumEntry<uint32_t>> getCompileSym2FlagNames() {
+ return makeArrayRef(CompileSym2FlagNames);
+}
+ArrayRef<EnumEntry<uint32_t>> getCompileSym3FlagNames() {
+ return makeArrayRef(CompileSym3FlagNames);
+}
+ArrayRef<EnumEntry<uint32_t>> getFileChecksumNames() {
+ return makeArrayRef(FileChecksumNames);
+}
+ArrayRef<EnumEntry<unsigned>> getCPUTypeNames() {
+ return makeArrayRef(CPUTypeNames);
+}
+ArrayRef<EnumEntry<uint32_t>> getFrameProcSymFlagNames() {
+ return makeArrayRef(FrameProcSymFlagNames);
+}
+ArrayRef<EnumEntry<uint16_t>> getExportSymFlagNames() {
+ return makeArrayRef(ExportSymFlagNames);
+}
+ArrayRef<EnumEntry<uint32_t>> getModuleSubstreamKindNames() {
+ return makeArrayRef(ModuleSubstreamKindNames);
+}
+ArrayRef<EnumEntry<uint8_t>> getThunkOrdinalNames() {
+ return makeArrayRef(ThunkOrdinalNames);
+}
+ArrayRef<EnumEntry<uint16_t>> getTrampolineNames() {
+ return makeArrayRef(TrampolineNames);
+}
+ArrayRef<EnumEntry<COFF::SectionCharacteristics>>
+getImageSectionCharacteristicNames() {
+ return makeArrayRef(ImageSectionCharacteristicNames);
+}
+}
+}
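
EnumTables.cpp is pure data: macros stamp out {name, value} pairs for each CodeView enum so that dumpers can print symbolic names. The sketch below shows the pattern on a tiny, illustrative register enum; the EnumEntry type and the numeric values here are stand-ins rather than the CodeView definitions.

#include <cstdint>
#include <iostream>

// Simplified EnumEntry; the real one lives in the CodeView headers.
template <typename T> struct EnumEntry {
  const char *Name;
  T Value;
};

enum class DemoRegisterId : uint16_t { Unknown = 0, AL = 1, AX = 9, EAX = 17 };

// Stamps out a {"Name", value} pair, mirroring CV_ENUM_CLASS_ENT above.
#define DEMO_ENUM_ENT(enum_class, enum_name)                                   \
  { #enum_name, static_cast<uint16_t>(enum_class::enum_name) }

static const EnumEntry<uint16_t> DemoRegisterNames[] = {
    DEMO_ENUM_ENT(DemoRegisterId, Unknown),
    DEMO_ENUM_ENT(DemoRegisterId, AL),
    DEMO_ENUM_ENT(DemoRegisterId, AX),
    DEMO_ENUM_ENT(DemoRegisterId, EAX),
};

static const char *registerName(uint16_t Value) {
  for (const auto &E : DemoRegisterNames)
    if (E.Value == Value)
      return E.Name;
  return "<unknown register>";
}

int main() { std::cout << registerName(9) << '\n'; } // prints AX
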
diff --git a/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp b/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp
index 91b71cc4b119..5f229e3d9f94 100644
--- a/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp
+++ b/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp
@@ -15,151 +15,118 @@ using namespace codeview;
FieldListRecordBuilder::FieldListRecordBuilder()
: ListRecordBuilder(TypeRecordKind::FieldList) {}
-void FieldListRecordBuilder::writeBaseClass(MemberAccess Access, TypeIndex Type,
- uint64_t Offset) {
+void FieldListRecordBuilder::writeBaseClass(const BaseClassRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
Builder.writeTypeRecordKind(TypeRecordKind::BaseClass);
- Builder.writeUInt16(static_cast<uint16_t>(Access));
- Builder.writeTypeIndex(Type);
- Builder.writeEncodedUnsignedInteger(Offset);
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getAccess()));
+ Builder.writeTypeIndex(Record.getBaseType());
+ Builder.writeEncodedUnsignedInteger(Record.getBaseOffset());
finishSubRecord();
}
-void FieldListRecordBuilder::writeEnumerate(MemberAccess Access, uint64_t Value,
- StringRef Name) {
+void FieldListRecordBuilder::writeEnumerator(const EnumeratorRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::Enumerate);
- Builder.writeUInt16(static_cast<uint16_t>(Access));
- Builder.writeEncodedUnsignedInteger(Value);
- Builder.writeNullTerminatedString(Name);
+ Builder.writeTypeRecordKind(TypeRecordKind::Enumerator);
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getAccess()));
+ // FIXME: Handle full APInt such as __int128.
+ Builder.writeEncodedUnsignedInteger(Record.getValue().getZExtValue());
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeMember(MemberAccess Access, TypeIndex Type,
- uint64_t Offset, StringRef Name) {
+void FieldListRecordBuilder::writeDataMember(const DataMemberRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::Member);
- Builder.writeUInt16(static_cast<uint16_t>(Access));
- Builder.writeTypeIndex(Type);
- Builder.writeEncodedUnsignedInteger(Offset);
- Builder.writeNullTerminatedString(Name);
+ Builder.writeTypeRecordKind(Record.getKind());
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getAccess()));
+ Builder.writeTypeIndex(Record.getType());
+ Builder.writeEncodedUnsignedInteger(Record.getFieldOffset());
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeMethod(uint16_t OverloadCount,
- TypeIndex MethodList, StringRef Name) {
+void FieldListRecordBuilder::writeOverloadedMethod(
+ const OverloadedMethodRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::Method);
- Builder.writeUInt16(OverloadCount);
- Builder.writeTypeIndex(MethodList);
- Builder.writeNullTerminatedString(Name);
+ Builder.writeTypeRecordKind(TypeRecordKind::OverloadedMethod);
+ Builder.writeUInt16(Record.getNumOverloads());
+ Builder.writeTypeIndex(Record.getMethodList());
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeOneMethod(
- MemberAccess Access, MethodKind Kind, MethodOptions Options, TypeIndex Type,
- int32_t VTableSlotOffset, StringRef Name) {
+void FieldListRecordBuilder::writeOneMethod(const OneMethodRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- uint16_t Flags = static_cast<uint16_t>(Access);
- Flags |= static_cast<uint16_t>(Kind) << MethodKindShift;
- Flags |= static_cast<uint16_t>(Options);
+ uint16_t Flags = static_cast<uint16_t>(Record.getAccess());
+ Flags |= static_cast<uint16_t>(Record.getKind()) << MethodKindShift;
+ Flags |= static_cast<uint16_t>(Record.getOptions());
Builder.writeTypeRecordKind(TypeRecordKind::OneMethod);
Builder.writeUInt16(Flags);
- Builder.writeTypeIndex(Type);
- switch (Kind) {
- case MethodKind::IntroducingVirtual:
- case MethodKind::PureIntroducingVirtual:
- assert(VTableSlotOffset >= 0);
- Builder.writeInt32(VTableSlotOffset);
- break;
-
- default:
- assert(VTableSlotOffset == -1);
- break;
+ Builder.writeTypeIndex(Record.getType());
+ if (Record.isIntroducingVirtual()) {
+ assert(Record.getVFTableOffset() >= 0);
+ Builder.writeInt32(Record.getVFTableOffset());
+ } else {
+ assert(Record.getVFTableOffset() == -1);
}
- Builder.writeNullTerminatedString(Name);
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeOneMethod(const MethodInfo &Method,
- StringRef Name) {
- writeOneMethod(Method.getAccess(), Method.getKind(), Method.getOptions(),
- Method.getType(), Method.getVTableSlotOffset(), Name);
-}
-
-void FieldListRecordBuilder::writeNestedType(TypeIndex Type, StringRef Name) {
+void FieldListRecordBuilder::writeNestedType(const NestedTypeRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::NestedType);
+ Builder.writeTypeRecordKind(Record.getKind());
Builder.writeUInt16(0);
- Builder.writeTypeIndex(Type);
- Builder.writeNullTerminatedString(Name);
+ Builder.writeTypeIndex(Record.getNestedType());
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeStaticMember(MemberAccess Access,
- TypeIndex Type, StringRef Name) {
+void FieldListRecordBuilder::writeStaticDataMember(
+ const StaticDataMemberRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::StaticMember);
- Builder.writeUInt16(static_cast<uint16_t>(Access));
- Builder.writeTypeIndex(Type);
- Builder.writeNullTerminatedString(Name);
+ Builder.writeTypeRecordKind(Record.getKind());
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getAccess()));
+ Builder.writeTypeIndex(Record.getType());
+ Builder.writeNullTerminatedString(Record.getName());
finishSubRecord();
}
-void FieldListRecordBuilder::writeIndirectVirtualBaseClass(
- MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType,
- int64_t VirtualBasePointerOffset, uint64_t SlotIndex) {
- writeVirtualBaseClass(TypeRecordKind::IndirectVirtualBaseClass, Access, Type,
- VirtualBasePointerType, VirtualBasePointerOffset,
- SlotIndex);
-}
-
void FieldListRecordBuilder::writeVirtualBaseClass(
- MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType,
- int64_t VirtualBasePointerOffset, uint64_t SlotIndex) {
- writeVirtualBaseClass(TypeRecordKind::VirtualBaseClass, Access, Type,
- VirtualBasePointerType, VirtualBasePointerOffset,
- SlotIndex);
-}
-
-void FieldListRecordBuilder::writeVirtualBaseClass(
- TypeRecordKind Kind, MemberAccess Access, TypeIndex Type,
- TypeIndex VirtualBasePointerType, int64_t VirtualBasePointerOffset,
- uint64_t SlotIndex) {
+ const VirtualBaseClassRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(Kind);
- Builder.writeUInt16(static_cast<uint16_t>(Access));
- Builder.writeTypeIndex(Type);
- Builder.writeTypeIndex(VirtualBasePointerType);
- Builder.writeEncodedInteger(VirtualBasePointerOffset);
- Builder.writeEncodedUnsignedInteger(SlotIndex);
+ Builder.writeTypeRecordKind(Record.getKind());
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getAccess()));
+ Builder.writeTypeIndex(Record.getBaseType());
+ Builder.writeTypeIndex(Record.getVBPtrType());
+ Builder.writeEncodedInteger(Record.getVBPtrOffset());
+ Builder.writeEncodedUnsignedInteger(Record.getVTableIndex());
finishSubRecord();
}
-void FieldListRecordBuilder::writeVirtualFunctionTablePointer(TypeIndex Type) {
+void FieldListRecordBuilder::writeVFPtr(const VFPtrRecord &Record) {
TypeRecordBuilder &Builder = getBuilder();
- Builder.writeTypeRecordKind(TypeRecordKind::VirtualFunctionTablePointer);
+ Builder.writeTypeRecordKind(TypeRecordKind::VFPtr);
Builder.writeUInt16(0);
- Builder.writeTypeIndex(Type);
+ Builder.writeTypeIndex(Record.getType());
finishSubRecord();
-}
\ No newline at end of file
+}
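For reference, a minimal standalone sketch (not the LLVM API) of how the LF_ONEMETHOD attribute word above is assembled from access, kind, and option bits. The enum values and the shift are assumptions chosen to mirror the diff's MethodKindShift usage; the real constants live in the CodeView headers.

#include <cassert>
#include <cstdint>

enum class Access : uint16_t { Private = 1, Protected = 2, Public = 3 };
enum class Kind   : uint16_t { Vanilla = 0, Virtual = 1, IntroducingVirtual = 4 };

constexpr uint16_t KindShift = 2; // assumed value of MethodKindShift

uint16_t packMethodAttrs(Access A, Kind K, uint16_t Options) {
  uint16_t Flags = static_cast<uint16_t>(A);          // low bits: access
  Flags |= static_cast<uint16_t>(K) << KindShift;     // next bits: method kind
  Flags |= Options;                                   // remaining option bits
  return Flags;
}

int main() {
  // Public (3) introducing-virtual (4 << 2) method, no extra options.
  assert(packMethodAttrs(Access::Public, Kind::IntroducingVirtual, 0) ==
         (3 | (4 << 2)));
  return 0;
}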
diff --git a/lib/DebugInfo/CodeView/ListRecordBuilder.cpp b/lib/DebugInfo/CodeView/ListRecordBuilder.cpp
index 69c7e87330e6..eb79e8ac9a3f 100644
--- a/lib/DebugInfo/CodeView/ListRecordBuilder.cpp
+++ b/lib/DebugInfo/CodeView/ListRecordBuilder.cpp
@@ -7,25 +7,96 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/CodeView/ListRecordBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
using namespace llvm;
using namespace codeview;
-ListRecordBuilder::ListRecordBuilder(TypeRecordKind Kind) : Builder(Kind) {}
+ListRecordBuilder::ListRecordBuilder(TypeRecordKind Kind)
+ : Kind(Kind), Builder(Kind) {}
+
+void ListRecordBuilder::writeListContinuation(const ListContinuationRecord &R) {
+ TypeRecordBuilder &Builder = getBuilder();
+
+ assert(getLastContinuationSize() < 65535 - 8 && "continuation won't fit");
+
+ Builder.writeTypeRecordKind(TypeRecordKind::ListContinuation);
+ Builder.writeUInt16(0);
+ Builder.writeTypeIndex(R.getContinuationIndex());
+
+ // End the current segment manually so that nothing comes after the
+ // continuation.
+ ContinuationOffsets.push_back(Builder.size());
+ SubrecordStart = Builder.size();
+}
void ListRecordBuilder::finishSubRecord() {
- // The builder starts at offset 2 in the actual CodeView buffer, so add an
- // additional offset of 2 before computing the alignment.
- uint32_t Remainder = (Builder.size() + 2) % 4;
+ // The type table inserts a 16 bit size field before each list, so factor that
+ // into our alignment padding.
+ uint32_t Remainder =
+ (Builder.size() + 2 * (ContinuationOffsets.size() + 1)) % 4;
if (Remainder != 0) {
for (int32_t PaddingBytesLeft = 4 - Remainder; PaddingBytesLeft > 0;
--PaddingBytesLeft) {
- Builder.writeUInt8(0xf0 + PaddingBytesLeft);
+ Builder.writeUInt8(LF_PAD0 + PaddingBytesLeft);
}
}
- // TODO: Split the list into multiple records if it's longer than 64KB, using
- // a subrecord of TypeRecordKind::Index to chain the records together.
- assert(Builder.size() < 65536);
+ // Check if this subrecord makes the current segment not fit in 64K minus the
+ // space for a continuation record (8 bytes). If the segment does not fit,
+ // back up and insert a continuation record, sliding the current subrecord
+ // down.
+ if (getLastContinuationSize() > 65535 - 8) {
+ assert(SubrecordStart != 0 && "can't slide from the start!");
+ SmallString<128> SubrecordCopy(
+ Builder.str().slice(SubrecordStart, Builder.size()));
+ assert(SubrecordCopy.size() < 65530 && "subrecord is too large to slide!");
+ Builder.truncate(SubrecordStart);
+
+ // Write a placeholder continuation record.
+ Builder.writeTypeRecordKind(TypeRecordKind::ListContinuation);
+ Builder.writeUInt16(0);
+ Builder.writeUInt32(0);
+ ContinuationOffsets.push_back(Builder.size());
+ assert(Builder.size() == SubrecordStart + 8 && "wrong continuation size");
+ assert(getLastContinuationSize() < 65535 && "segment too big");
+
+ // Start a new list record of the appropriate kind, and copy the previous
+ // subrecord into place.
+ Builder.writeTypeRecordKind(Kind);
+ Builder.writeBytes(SubrecordCopy);
+ }
+
+ SubrecordStart = Builder.size();
+}
+
+TypeIndex ListRecordBuilder::writeListRecord(TypeTableBuilder &Table) {
+ // Get the continuation segments as a reversed vector of StringRefs for
+ // convenience.
+ SmallVector<StringRef, 1> Segments;
+ StringRef Data = str();
+ size_t LastEnd = 0;
+ for (size_t SegEnd : ContinuationOffsets) {
+ Segments.push_back(Data.slice(LastEnd, SegEnd));
+ LastEnd = SegEnd;
+ }
+ Segments.push_back(Data.slice(LastEnd, Builder.size()));
+
+ // Pop the last record off and emit it directly.
+ StringRef LastRec = Segments.pop_back_val();
+ TypeIndex ContinuationIndex = Table.writeRecord(LastRec);
+
+ // Emit each record with a continuation in reverse order, so that each one
+ // references the previous record.
+ for (StringRef Rec : reverse(Segments)) {
+ assert(*reinterpret_cast<const ulittle16_t *>(Rec.data()) ==
+ unsigned(Kind));
+ ulittle32_t *ContinuationPtr =
+ reinterpret_cast<ulittle32_t *>(const_cast<char *>(Rec.end())) - 1;
+ *ContinuationPtr = ContinuationIndex.getIndex();
+ ContinuationIndex = Table.writeRecord(Rec);
+ }
+ return ContinuationIndex;
}
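The segmentation logic above can be hard to follow from the diff alone, so here is a compressed standalone sketch of the same idea, using hypothetical names rather than the LLVM classes: subrecords accumulate in one buffer, and whenever the current segment would exceed 64K minus the 8 bytes needed for a continuation record, the just-written subrecord is slid down past a continuation placeholder that later receives the next segment's type index.

#include <cassert>
#include <string>
#include <vector>

class ListSegmenter {
  std::string Buf;
  std::vector<size_t> SegmentStarts{0};
  size_t SubrecordStart = 0;

public:
  void appendSubrecord(const std::string &Rec) {
    Buf += Rec;
    size_t SegSize = Buf.size() - SegmentStarts.back();
    if (SegSize > 65535 - 8) {
      // Slide the just-written subrecord into a fresh segment.
      std::string Copy = Buf.substr(SubrecordStart);
      Buf.resize(SubrecordStart);
      Buf.append(8, '\0');             // placeholder continuation record
      SegmentStarts.push_back(Buf.size());
      Buf += Copy;
    }
    SubrecordStart = Buf.size();
  }
  size_t numSegments() const { return SegmentStarts.size(); }
};

int main() {
  ListSegmenter S;
  for (int I = 0; I < 5000; ++I)
    S.appendSubrecord(std::string(20, 'x')); // ~100 KB of subrecords
  assert(S.numSegments() >= 2);              // forced to split across segments
  return 0;
}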
diff --git a/lib/DebugInfo/CodeView/Makefile b/lib/DebugInfo/CodeView/Makefile
deleted file mode 100644
index 535bc10b7442..000000000000
--- a/lib/DebugInfo/CodeView/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/DebugInfo/CodeView/Makefile ---------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMDebugInfoCodeView
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp b/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp
index 9afce92eeb1d..8b9e73b94ff5 100644
--- a/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp
+++ b/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp
@@ -13,23 +13,34 @@
using namespace llvm;
using namespace codeview;
-MemoryTypeTableBuilder::Record::Record(StringRef RData)
- : Size(RData.size()), Data(new char[RData.size()]) {
- memcpy(Data.get(), RData.data(), RData.size());
-}
-
TypeIndex MemoryTypeTableBuilder::writeRecord(StringRef Data) {
+ assert(Data.size() <= UINT16_MAX);
auto I = HashedRecords.find(Data);
if (I != HashedRecords.end()) {
return I->second;
}
- std::unique_ptr<Record> R(new Record(Data));
+ // The record provided by the user lacks the 2 byte size field prefix and is
+ // not padded to 4 bytes. Ultimately, that is what gets emitted in the object
+ // file, so pad it out now.
+ const int SizeOfRecLen = 2;
+ const int Align = 4;
+ int TotalSize = alignTo(Data.size() + SizeOfRecLen, Align);
+ assert(TotalSize - SizeOfRecLen <= UINT16_MAX);
+ char *Mem =
+ reinterpret_cast<char *>(RecordStorage.Allocate(TotalSize, Align));
+ *reinterpret_cast<ulittle16_t *>(Mem) = uint16_t(TotalSize - SizeOfRecLen);
+ memcpy(Mem + SizeOfRecLen, Data.data(), Data.size());
+ for (int I = Data.size() + SizeOfRecLen; I < TotalSize; ++I)
+ Mem[I] = LF_PAD0 + (TotalSize - I);
TypeIndex TI(static_cast<uint32_t>(Records.size()) +
TypeIndex::FirstNonSimpleIndex);
- HashedRecords.insert(std::make_pair(StringRef(R->data(), R->size()), TI));
- Records.push_back(std::move(R));
+
+ // Use only the data supplied by the user as a key to the hash table, so that
+ // future lookups will succeed.
+ HashedRecords.insert(std::make_pair(StringRef(Mem + SizeOfRecLen, Data.size()), TI));
+ Records.push_back(StringRef(Mem, TotalSize));
return TI;
}
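To make the framing in writeRecord above concrete, here is an illustrative sketch of the same layout computation on a plain byte vector. It assumes LF_PAD0 == 0xF0, 4-byte record alignment, a little-endian host, and a 16-bit length prefix that excludes its own two bytes, as described in the comments of the change.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> frameRecord(const std::vector<uint8_t> &Data) {
  const size_t SizeOfRecLen = 2, Align = 4, LF_PAD0 = 0xF0;
  size_t Total = (Data.size() + SizeOfRecLen + Align - 1) / Align * Align;
  std::vector<uint8_t> Out(Total);
  uint16_t Len = static_cast<uint16_t>(Total - SizeOfRecLen);
  std::memcpy(Out.data(), &Len, sizeof(Len));            // assumes little-endian host
  std::memcpy(Out.data() + SizeOfRecLen, Data.data(), Data.size());
  for (size_t I = Data.size() + SizeOfRecLen; I < Total; ++I)
    Out[I] = static_cast<uint8_t>(LF_PAD0 + (Total - I)); // LF_PAD3, LF_PAD2, ...
  return Out;
}

int main() {
  auto R = frameRecord({1, 2, 3, 4, 5});   // 5 payload bytes
  assert(R.size() == 8);                   // 2 + 5 rounded up to 8
  assert(R[0] == 6 && R[1] == 0);          // length excludes the prefix itself
  assert(R[7] == 0xF1);                    // one byte of LF_PAD1
  return 0;
}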
diff --git a/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp b/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp
index 889302556b2d..ae089a352081 100644
--- a/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp
+++ b/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp
@@ -14,7 +14,7 @@ using namespace llvm;
using namespace codeview;
MethodListRecordBuilder::MethodListRecordBuilder()
- : ListRecordBuilder(TypeRecordKind::MethodList) {}
+ : ListRecordBuilder(TypeRecordKind::MethodOverloadList) {}
void MethodListRecordBuilder::writeMethod(MemberAccess Access, MethodKind Kind,
MethodOptions Options, TypeIndex Type,
diff --git a/lib/DebugInfo/CodeView/ModuleSubstream.cpp b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
new file mode 100644
index 000000000000..2e31ed6b5b7f
--- /dev/null
+++ b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
@@ -0,0 +1,42 @@
+//===- ModuleSubstream.cpp --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
+
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+ModuleSubstream::ModuleSubstream() : Kind(ModuleSubstreamKind::None) {}
+
+ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind, StreamRef Data)
+ : Kind(Kind), Data(Data) {}
+
+Error ModuleSubstream::initialize(StreamRef Stream, ModuleSubstream &Info) {
+ const ModuleSubsectionHeader *Header;
+ StreamReader Reader(Stream);
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+
+ ModuleSubstreamKind Kind =
+ static_cast<ModuleSubstreamKind>(uint32_t(Header->Kind));
+ if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
+ return EC;
+ Info.Kind = Kind;
+ return Error::success();
+}
+
+uint32_t ModuleSubstream::getRecordLength() const {
+ return sizeof(ModuleSubsectionHeader) + Data.getLength();
+}
+
+ModuleSubstreamKind ModuleSubstream::getSubstreamKind() const { return Kind; }
+
+StreamRef ModuleSubstream::getRecordData() const { return Data; }
diff --git a/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
new file mode 100644
index 000000000000..6f237ee67fe4
--- /dev/null
+++ b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
@@ -0,0 +1,104 @@
+//===- ModuleSubstreamVisitor.cpp -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error IModuleSubstreamVisitor::visitSymbols(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::Symbols, Data);
+}
+Error IModuleSubstreamVisitor::visitLines(StreamRef Data,
+ const LineSubstreamHeader *Header,
+ const LineInfoArray &Lines) {
+ return visitUnknown(ModuleSubstreamKind::Lines, Data);
+}
+Error IModuleSubstreamVisitor::visitStringTable(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::StringTable, Data);
+}
+Error IModuleSubstreamVisitor::visitFileChecksums(
+ StreamRef Data, const FileChecksumArray &Checksums) {
+ return visitUnknown(ModuleSubstreamKind::FileChecksums, Data);
+}
+Error IModuleSubstreamVisitor::visitFrameData(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::FrameData, Data);
+}
+Error IModuleSubstreamVisitor::visitInlineeLines(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::InlineeLines, Data);
+}
+Error IModuleSubstreamVisitor::visitCrossScopeImports(StreamRef Data) {
+  return visitUnknown(ModuleSubstreamKind::CrossScopeImports, Data);
+}
+Error IModuleSubstreamVisitor::visitCrossScopeExports(StreamRef Data) {
+  return visitUnknown(ModuleSubstreamKind::CrossScopeExports, Data);
+}
+Error IModuleSubstreamVisitor::visitILLines(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::ILLines, Data);
+}
+Error IModuleSubstreamVisitor::visitFuncMDTokenMap(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::FuncMDTokenMap, Data);
+}
+Error IModuleSubstreamVisitor::visitTypeMDTokenMap(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::TypeMDTokenMap, Data);
+}
+Error IModuleSubstreamVisitor::visitMergedAssemblyInput(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::MergedAssemblyInput, Data);
+}
+Error IModuleSubstreamVisitor::visitCoffSymbolRVA(StreamRef Data) {
+ return visitUnknown(ModuleSubstreamKind::CoffSymbolRVA, Data);
+}
+
+Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
+ IModuleSubstreamVisitor &V) {
+ switch (R.getSubstreamKind()) {
+ case ModuleSubstreamKind::Symbols:
+ return V.visitSymbols(R.getRecordData());
+ case ModuleSubstreamKind::Lines: {
+ StreamReader Reader(R.getRecordData());
+ const LineSubstreamHeader *Header;
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+ VarStreamArrayExtractor<LineColumnEntry> E(Header);
+ LineInfoArray LineInfos(E);
+ if (auto EC = Reader.readArray(LineInfos, Reader.bytesRemaining()))
+ return EC;
+ return V.visitLines(R.getRecordData(), Header, LineInfos);
+ }
+ case ModuleSubstreamKind::StringTable:
+ return V.visitStringTable(R.getRecordData());
+ case ModuleSubstreamKind::FileChecksums: {
+ StreamReader Reader(R.getRecordData());
+ FileChecksumArray Checksums;
+ if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
+ return EC;
+ return V.visitFileChecksums(R.getRecordData(), Checksums);
+ }
+ case ModuleSubstreamKind::FrameData:
+ return V.visitFrameData(R.getRecordData());
+ case ModuleSubstreamKind::InlineeLines:
+ return V.visitInlineeLines(R.getRecordData());
+ case ModuleSubstreamKind::CrossScopeImports:
+ return V.visitCrossScopeImports(R.getRecordData());
+ case ModuleSubstreamKind::CrossScopeExports:
+ return V.visitCrossScopeExports(R.getRecordData());
+ case ModuleSubstreamKind::ILLines:
+ return V.visitILLines(R.getRecordData());
+ case ModuleSubstreamKind::FuncMDTokenMap:
+ return V.visitFuncMDTokenMap(R.getRecordData());
+ case ModuleSubstreamKind::TypeMDTokenMap:
+ return V.visitTypeMDTokenMap(R.getRecordData());
+ case ModuleSubstreamKind::MergedAssemblyInput:
+ return V.visitMergedAssemblyInput(R.getRecordData());
+ case ModuleSubstreamKind::CoffSymbolRVA:
+ return V.visitCoffSymbolRVA(R.getRecordData());
+ default:
+ return V.visitUnknown(R.getSubstreamKind(), R.getRecordData());
+ }
+}
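The pattern above (typed visit hooks that default to visitUnknown, plus a free dispatch function that switches on the kind) is easier to see in a compressed sketch with hypothetical names; this is not the CodeView API, just the shape of it.

#include <cstdint>
#include <iostream>
#include <string>

enum class Kind : uint32_t { Symbols, Lines, Unknown };

struct IVisitor {
  virtual ~IVisitor() = default;
  virtual void visitUnknown(Kind K, const std::string &Data) = 0;
  // Defaults funnel into visitUnknown so subclasses only override what they need.
  virtual void visitSymbols(const std::string &Data) { visitUnknown(Kind::Symbols, Data); }
  virtual void visitLines(const std::string &Data) { visitUnknown(Kind::Lines, Data); }
};

void dispatch(Kind K, const std::string &Data, IVisitor &V) {
  switch (K) {
  case Kind::Symbols: return V.visitSymbols(Data);
  case Kind::Lines:   return V.visitLines(Data);
  default:            return V.visitUnknown(K, Data);
  }
}

struct LinePrinter : IVisitor {
  void visitUnknown(Kind, const std::string &) override {} // ignore other kinds
  void visitLines(const std::string &Data) override {
    std::cout << "lines payload: " << Data.size() << " bytes\n";
  }
};

int main() {
  LinePrinter P;
  dispatch(Kind::Lines, std::string(16, '\0'), P);
  dispatch(Kind::Symbols, "abc", P);
  return 0;
}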
diff --git a/lib/DebugInfo/CodeView/RecordSerialization.cpp b/lib/DebugInfo/CodeView/RecordSerialization.cpp
new file mode 100644
index 000000000000..ab9206a33ec0
--- /dev/null
+++ b/lib/DebugInfo/CodeView/RecordSerialization.cpp
@@ -0,0 +1,171 @@
+//===-- RecordSerialization.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utilities for serializing and deserializing CodeView records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::support;
+
+/// Reinterpret a byte array as an array of characters. Does not interpret as
+/// a C string, as StringRef has several helpers (split) that make that easy.
+StringRef llvm::codeview::getBytesAsCharacters(ArrayRef<uint8_t> LeafData) {
+ return StringRef(reinterpret_cast<const char *>(LeafData.data()),
+ LeafData.size());
+}
+
+StringRef llvm::codeview::getBytesAsCString(ArrayRef<uint8_t> LeafData) {
+ return getBytesAsCharacters(LeafData).split('\0').first;
+}
+
+std::error_code llvm::codeview::consume(ArrayRef<uint8_t> &Data, APSInt &Num) {
+  // Used to avoid overload ambiguity on APInt constructor.
+ bool FalseVal = false;
+ if (Data.size() < 2)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ uint16_t Short = *reinterpret_cast<const ulittle16_t *>(Data.data());
+ Data = Data.drop_front(2);
+ if (Short < LF_NUMERIC) {
+ Num = APSInt(APInt(/*numBits=*/16, Short, /*isSigned=*/false),
+ /*isUnsigned=*/true);
+ return std::error_code();
+ }
+ switch (Short) {
+ case LF_CHAR:
+ if (Data.size() < 1)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/8,
+ *reinterpret_cast<const int8_t *>(Data.data()),
+ /*isSigned=*/true),
+ /*isUnsigned=*/false);
+ Data = Data.drop_front(1);
+ return std::error_code();
+ case LF_SHORT:
+ if (Data.size() < 2)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/16,
+ *reinterpret_cast<const little16_t *>(Data.data()),
+ /*isSigned=*/true),
+ /*isUnsigned=*/false);
+ Data = Data.drop_front(2);
+ return std::error_code();
+ case LF_USHORT:
+ if (Data.size() < 2)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/16,
+ *reinterpret_cast<const ulittle16_t *>(Data.data()),
+ /*isSigned=*/false),
+ /*isUnsigned=*/true);
+ Data = Data.drop_front(2);
+ return std::error_code();
+ case LF_LONG:
+ if (Data.size() < 4)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/32,
+ *reinterpret_cast<const little32_t *>(Data.data()),
+ /*isSigned=*/true),
+ /*isUnsigned=*/false);
+ Data = Data.drop_front(4);
+ return std::error_code();
+ case LF_ULONG:
+ if (Data.size() < 4)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/32,
+ *reinterpret_cast<const ulittle32_t *>(Data.data()),
+ /*isSigned=*/FalseVal),
+ /*isUnsigned=*/true);
+ Data = Data.drop_front(4);
+ return std::error_code();
+ case LF_QUADWORD:
+ if (Data.size() < 8)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/64,
+ *reinterpret_cast<const little64_t *>(Data.data()),
+ /*isSigned=*/true),
+ /*isUnsigned=*/false);
+ Data = Data.drop_front(8);
+ return std::error_code();
+ case LF_UQUADWORD:
+ if (Data.size() < 8)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = APSInt(APInt(/*numBits=*/64,
+ *reinterpret_cast<const ulittle64_t *>(Data.data()),
+ /*isSigned=*/false),
+ /*isUnsigned=*/true);
+ Data = Data.drop_front(8);
+ return std::error_code();
+ }
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+}
+
+std::error_code llvm::codeview::consume(StringRef &Data, APSInt &Num) {
+ ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
+ auto EC = consume(Bytes, Num);
+ Data = StringRef(reinterpret_cast<const char *>(Bytes.data()), Bytes.size());
+ return EC;
+}
+
+/// Decode a numeric leaf value that is known to be a uint64_t.
+std::error_code llvm::codeview::consume_numeric(ArrayRef<uint8_t> &Data,
+ uint64_t &Num) {
+ APSInt N;
+ if (auto EC = consume(Data, N))
+ return EC;
+ if (N.isSigned() || !N.isIntN(64))
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ Num = N.getLimitedValue();
+ return std::error_code();
+}
+
+std::error_code llvm::codeview::consume(ArrayRef<uint8_t> &Data,
+ uint32_t &Item) {
+ const support::ulittle32_t *IntPtr;
+ if (auto EC = consumeObject(Data, IntPtr))
+ return EC;
+ Item = *IntPtr;
+ return std::error_code();
+}
+
+std::error_code llvm::codeview::consume(StringRef &Data, uint32_t &Item) {
+ ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
+ auto EC = consume(Bytes, Item);
+ Data = StringRef(reinterpret_cast<const char *>(Bytes.data()), Bytes.size());
+ return EC;
+}
+
+std::error_code llvm::codeview::consume(ArrayRef<uint8_t> &Data,
+ int32_t &Item) {
+ const support::little32_t *IntPtr;
+ if (auto EC = consumeObject(Data, IntPtr))
+ return EC;
+ Item = *IntPtr;
+ return std::error_code();
+}
+
+std::error_code llvm::codeview::consume(ArrayRef<uint8_t> &Data,
+ StringRef &Item) {
+ if (Data.empty())
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+
+ StringRef Rest;
+ std::tie(Item, Rest) = getBytesAsCharacters(Data).split('\0');
+ // We expect this to be null terminated. If it was not, it is an error.
+ if (Data.size() == Item.size())
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+
+ Data = ArrayRef<uint8_t>(Rest.bytes_begin(), Rest.bytes_end());
+ return std::error_code();
+}
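A minimal standalone decoder for the "numeric leaf" convention that consume(ArrayRef<uint8_t>&, APSInt&) handles above: a 16-bit field either holds the value inline (when it is below LF_NUMERIC, 0x8000) or names a leaf kind that selects a wider payload. Only LF_ULONG is shown; the constants come from the CodeView specification and a little-endian host is assumed.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr uint16_t LF_NUMERIC = 0x8000;
constexpr uint16_t LF_ULONG   = 0x8004;

bool consumeNumeric(const uint8_t *&P, size_t &Len, uint64_t &Out) {
  if (Len < 2) return false;
  uint16_t Short;
  std::memcpy(&Short, P, 2);
  P += 2; Len -= 2;
  if (Short < LF_NUMERIC) { Out = Short; return true; }   // inline value
  if (Short == LF_ULONG) {
    if (Len < 4) return false;
    uint32_t V;
    std::memcpy(&V, P, 4);
    P += 4; Len -= 4;
    Out = V;
    return true;
  }
  return false; // the remaining leaf kinds are omitted in this sketch
}

int main() {
  std::vector<uint8_t> Buf = {0x2A, 0x00,                              // inline 42
                              0x04, 0x80, 0x78, 0x56, 0x34, 0x12};     // LF_ULONG 0x12345678
  const uint8_t *P = Buf.data();
  size_t Len = Buf.size();
  uint64_t V;
  assert(consumeNumeric(P, Len, V) && V == 42);
  assert(consumeNumeric(P, Len, V) && V == 0x12345678);
  return 0;
}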
diff --git a/lib/DebugInfo/CodeView/StreamReader.cpp b/lib/DebugInfo/CodeView/StreamReader.cpp
new file mode 100644
index 000000000000..64e45487322e
--- /dev/null
+++ b/lib/DebugInfo/CodeView/StreamReader.cpp
@@ -0,0 +1,93 @@
+//===- StreamReader.cpp - Reads bytes and objects from a stream -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/StreamRef.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+StreamReader::StreamReader(StreamRef S) : Stream(S), Offset(0) {}
+
+Error StreamReader::readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer) {
+ if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error StreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
+ if (auto EC = Stream.readBytes(Offset, Size, Buffer))
+ return EC;
+ Offset += Size;
+ return Error::success();
+}
+
+Error StreamReader::readInteger(uint16_t &Dest) {
+ const support::ulittle16_t *P;
+ if (auto EC = readObject(P))
+ return EC;
+ Dest = *P;
+ return Error::success();
+}
+
+Error StreamReader::readInteger(uint32_t &Dest) {
+ const support::ulittle32_t *P;
+ if (auto EC = readObject(P))
+ return EC;
+ Dest = *P;
+ return Error::success();
+}
+
+Error StreamReader::readZeroString(StringRef &Dest) {
+ uint32_t Length = 0;
+ // First compute the length of the string by reading 1 byte at a time.
+ uint32_t OriginalOffset = getOffset();
+ const char *C;
+ do {
+ if (auto EC = readObject(C))
+ return EC;
+ if (*C != '\0')
+ ++Length;
+ } while (*C != '\0');
+ // Now go back and request a reference for that many bytes.
+ uint32_t NewOffset = getOffset();
+ setOffset(OriginalOffset);
+
+ ArrayRef<uint8_t> Data;
+ if (auto EC = readBytes(Data, Length))
+ return EC;
+ Dest = StringRef(reinterpret_cast<const char *>(Data.begin()), Data.size());
+
+ // Now set the offset back to where it was after we calculated the length.
+ setOffset(NewOffset);
+ return Error::success();
+}
+
+Error StreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
+ ArrayRef<uint8_t> Bytes;
+ if (auto EC = readBytes(Bytes, Length))
+ return EC;
+ Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
+ return Error::success();
+}
+
+Error StreamReader::readStreamRef(StreamRef &Ref) {
+ return readStreamRef(Ref, bytesRemaining());
+}
+
+Error StreamReader::readStreamRef(StreamRef &Ref, uint32_t Length) {
+ if (bytesRemaining() < Length)
+ return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
+ Ref = Stream.slice(Offset, Length);
+ Offset += Length;
+ return Error::success();
+}
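The two-pass approach of readZeroString above (scan forward to find the terminator, then hand back exactly that many bytes and leave the cursor past the NUL) can be sketched against a plain byte buffer; the function and parameter names here are hypothetical, not the StreamReader API.

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

bool readZeroString(const std::vector<uint8_t> &Buf, size_t &Offset,
                    std::string &Dest) {
  size_t Start = Offset;
  while (Offset < Buf.size() && Buf[Offset] != '\0')
    ++Offset;
  if (Offset == Buf.size())
    return false;                  // ran off the end without a terminator
  Dest.assign(reinterpret_cast<const char *>(Buf.data()) + Start,
              Offset - Start);
  ++Offset;                        // leave the cursor just past the NUL
  return true;
}

int main() {
  std::vector<uint8_t> Buf = {'a', 'b', 'c', '\0', 'd', '\0'};
  size_t Off = 0;
  std::string S;
  assert(readZeroString(Buf, Off, S) && S == "abc" && Off == 4);
  assert(readZeroString(Buf, Off, S) && S == "d" && Off == 6);
  return 0;
}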
diff --git a/lib/DebugInfo/CodeView/StreamWriter.cpp b/lib/DebugInfo/CodeView/StreamWriter.cpp
new file mode 100644
index 000000000000..f61c6b522f57
--- /dev/null
+++ b/lib/DebugInfo/CodeView/StreamWriter.cpp
@@ -0,0 +1,77 @@
+//===- StreamWriter.cpp - Writes bytes and objects to a stream ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/StreamRef.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+StreamWriter::StreamWriter(StreamRef S) : Stream(S), Offset(0) {}
+
+Error StreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
+ if (auto EC = Stream.writeBytes(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error StreamWriter::writeInteger(uint16_t Int) {
+ return writeObject(support::ulittle16_t(Int));
+}
+
+Error StreamWriter::writeInteger(uint32_t Int) {
+ return writeObject(support::ulittle32_t(Int));
+}
+
+Error StreamWriter::writeZeroString(StringRef Str) {
+ if (auto EC = writeFixedString(Str))
+ return EC;
+ if (auto EC = writeObject('\0'))
+ return EC;
+
+ return Error::success();
+}
+
+Error StreamWriter::writeFixedString(StringRef Str) {
+ ArrayRef<uint8_t> Bytes(Str.bytes_begin(), Str.bytes_end());
+ if (auto EC = Stream.writeBytes(Offset, Bytes))
+ return EC;
+
+ Offset += Str.size();
+ return Error::success();
+}
+
+Error StreamWriter::writeStreamRef(StreamRef Ref) {
+ if (auto EC = writeStreamRef(Ref, Ref.getLength()))
+ return EC;
+ Offset += Ref.getLength();
+ return Error::success();
+}
+
+Error StreamWriter::writeStreamRef(StreamRef Ref, uint32_t Length) {
+ Ref = Ref.slice(0, Length);
+
+ StreamReader SrcReader(Ref);
+ // This is a bit tricky. If we just call readBytes, we are requiring that it
+ // return us the entire stream as a contiguous buffer. For large streams this
+ // will allocate a huge amount of space from the pool. Instead, iterate over
+ // each contiguous chunk until we've consumed the entire stream.
+ while (SrcReader.bytesRemaining() > 0) {
+ ArrayRef<uint8_t> Chunk;
+ if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
+ return EC;
+ if (auto EC = writeBytes(Chunk))
+ return EC;
+ }
+ return Error::success();
+}
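The chunk-at-a-time copy that writeStreamRef describes above avoids materializing the whole source as one contiguous buffer. Here is a standalone sketch of that idea with a hypothetical Source type standing in for StreamRef and its longest-contiguous-chunk query.

#include <cassert>
#include <cstdint>
#include <vector>

struct Source {
  // A discontiguous stream modeled as a list of blocks.
  std::vector<std::vector<uint8_t>> Blocks;
  size_t Block = 0, Off = 0;
  // Return a view of the longest contiguous run at the current position.
  bool nextChunk(const uint8_t *&Data, size_t &Size) {
    while (Block < Blocks.size() && Off == Blocks[Block].size()) { ++Block; Off = 0; }
    if (Block == Blocks.size()) return false;
    Data = Blocks[Block].data() + Off;
    Size = Blocks[Block].size() - Off;
    Off = Blocks[Block].size();
    return true;
  }
};

int main() {
  Source Src;
  Src.Blocks = {std::vector<uint8_t>(10, 1), std::vector<uint8_t>(7, 2)};
  std::vector<uint8_t> Sink;
  const uint8_t *Data; size_t Size;
  while (Src.nextChunk(Data, Size))          // copy one contiguous chunk at a time
    Sink.insert(Sink.end(), Data, Data + Size);
  assert(Sink.size() == 17);
  return 0;
}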
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
new file mode 100644
index 000000000000..6763c3d562d7
--- /dev/null
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -0,0 +1,642 @@
+//===-- SymbolDumper.cpp - CodeView symbol info dumper ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/SymbolDumper.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
+#include "llvm/DebugInfo/CodeView/EnumTables.h"
+#include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeDumper.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+#include <system_error>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+/// Use this private dumper implementation to keep implementation details about
+/// the visitor out of SymbolDumper.h.
+class CVSymbolDumperImpl : public CVSymbolVisitor<CVSymbolDumperImpl> {
+public:
+ CVSymbolDumperImpl(CVTypeDumper &CVTD, SymbolDumpDelegate *ObjDelegate,
+ ScopedPrinter &W, bool PrintRecordBytes)
+ : CVSymbolVisitor(ObjDelegate), CVTD(CVTD), ObjDelegate(ObjDelegate),
+ W(W), PrintRecordBytes(PrintRecordBytes), InFunctionScope(false) {}
+
+/// CVSymbolVisitor overrides.
+#define SYMBOL_RECORD(EnumName, EnumVal, Name) \
+ void visit##Name(SymbolKind Kind, Name &Record);
+#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+
+ void visitSymbolBegin(SymbolKind Kind, ArrayRef<uint8_t> Data);
+ void visitSymbolEnd(SymbolKind Kind, ArrayRef<uint8_t> OriginalSymData);
+ void visitUnknownSymbol(SymbolKind Kind, ArrayRef<uint8_t> Data);
+
+private:
+ void printLocalVariableAddrRange(const LocalVariableAddrRange &Range,
+ uint32_t RelocationOffset);
+ void printLocalVariableAddrGap(ArrayRef<LocalVariableAddrGap> Gaps);
+
+ CVTypeDumper &CVTD;
+ SymbolDumpDelegate *ObjDelegate;
+ ScopedPrinter &W;
+
+ bool PrintRecordBytes;
+ bool InFunctionScope;
+};
+}
+
+void CVSymbolDumperImpl::printLocalVariableAddrRange(
+ const LocalVariableAddrRange &Range, uint32_t RelocationOffset) {
+ DictScope S(W, "LocalVariableAddrRange");
+ if (ObjDelegate)
+ ObjDelegate->printRelocatedField("OffsetStart", RelocationOffset,
+ Range.OffsetStart);
+ W.printHex("ISectStart", Range.ISectStart);
+ W.printHex("Range", Range.Range);
+}
+
+void CVSymbolDumperImpl::printLocalVariableAddrGap(
+ ArrayRef<LocalVariableAddrGap> Gaps) {
+ for (auto &Gap : Gaps) {
+ ListScope S(W, "LocalVariableAddrGap");
+ W.printHex("GapStartOffset", Gap.GapStartOffset);
+ W.printHex("Range", Gap.Range);
+ }
+}
+
+void CVSymbolDumperImpl::visitSymbolBegin(SymbolKind Kind,
+ ArrayRef<uint8_t> Data) {}
+
+void CVSymbolDumperImpl::visitSymbolEnd(SymbolKind Kind,
+ ArrayRef<uint8_t> OriginalSymData) {
+ if (PrintRecordBytes && ObjDelegate)
+ ObjDelegate->printBinaryBlockWithRelocs("SymData", OriginalSymData);
+}
+
+void CVSymbolDumperImpl::visitBlockSym(SymbolKind Kind, BlockSym &Block) {
+ DictScope S(W, "BlockStart");
+
+ StringRef LinkageName;
+ W.printHex("PtrParent", Block.Header.PtrParent);
+ W.printHex("PtrEnd", Block.Header.PtrEnd);
+ W.printHex("CodeSize", Block.Header.CodeSize);
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField("CodeOffset", Block.getRelocationOffset(),
+ Block.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Segment", Block.Header.Segment);
+ W.printString("BlockName", Block.Name);
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitThunk32Sym(SymbolKind Kind, Thunk32Sym &Thunk) {
+ DictScope S(W, "Thunk32");
+ W.printNumber("Parent", Thunk.Header.Parent);
+ W.printNumber("End", Thunk.Header.End);
+ W.printNumber("Next", Thunk.Header.Next);
+ W.printNumber("Off", Thunk.Header.Off);
+ W.printNumber("Seg", Thunk.Header.Seg);
+ W.printNumber("Len", Thunk.Header.Len);
+ W.printEnum("Ordinal", Thunk.Header.Ord, getThunkOrdinalNames());
+}
+
+void CVSymbolDumperImpl::visitTrampolineSym(SymbolKind Kind,
+ TrampolineSym &Tramp) {
+ DictScope S(W, "Trampoline");
+ W.printEnum("Type", Tramp.Header.Type, getTrampolineNames());
+ W.printNumber("Size", Tramp.Header.Size);
+ W.printNumber("ThunkOff", Tramp.Header.ThunkOff);
+ W.printNumber("TargetOff", Tramp.Header.TargetOff);
+ W.printNumber("ThunkSection", Tramp.Header.ThunkSection);
+ W.printNumber("TargetSection", Tramp.Header.TargetSection);
+}
+
+void CVSymbolDumperImpl::visitSectionSym(SymbolKind Kind, SectionSym &Section) {
+ DictScope S(W, "Section");
+ W.printNumber("SectionNumber", Section.Header.SectionNumber);
+ W.printNumber("Alignment", Section.Header.Alignment);
+ W.printNumber("Reserved", Section.Header.Reserved);
+ W.printNumber("Rva", Section.Header.Rva);
+ W.printNumber("Length", Section.Header.Length);
+ W.printFlags("Characteristics", Section.Header.Characteristics,
+ getImageSectionCharacteristicNames(),
+ COFF::SectionCharacteristics(0x00F00000));
+
+ W.printString("Name", Section.Name);
+}
+
+void CVSymbolDumperImpl::visitCoffGroupSym(SymbolKind Kind,
+ CoffGroupSym &CoffGroup) {
+ DictScope S(W, "COFF Group");
+ W.printNumber("Size", CoffGroup.Header.Size);
+ W.printFlags("Characteristics", CoffGroup.Header.Characteristics,
+ getImageSectionCharacteristicNames(),
+ COFF::SectionCharacteristics(0x00F00000));
+ W.printNumber("Offset", CoffGroup.Header.Offset);
+ W.printNumber("Segment", CoffGroup.Header.Segment);
+ W.printString("Name", CoffGroup.Name);
+}
+
+void CVSymbolDumperImpl::visitBPRelativeSym(SymbolKind Kind,
+ BPRelativeSym &BPRel) {
+ DictScope S(W, "BPRelativeSym");
+
+ W.printNumber("Offset", BPRel.Header.Offset);
+ CVTD.printTypeIndex("Type", BPRel.Header.Type);
+ W.printString("VarName", BPRel.Name);
+}
+
+void CVSymbolDumperImpl::visitBuildInfoSym(SymbolKind Kind,
+ BuildInfoSym &BuildInfo) {
+ DictScope S(W, "BuildInfo");
+
+ W.printNumber("BuildId", BuildInfo.Header.BuildId);
+}
+
+void CVSymbolDumperImpl::visitCallSiteInfoSym(SymbolKind Kind,
+ CallSiteInfoSym &CallSiteInfo) {
+ DictScope S(W, "CallSiteInfo");
+
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField(
+ "CodeOffset", CallSiteInfo.getRelocationOffset(),
+ CallSiteInfo.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Segment", CallSiteInfo.Header.Segment);
+ W.printHex("Reserved", CallSiteInfo.Header.Reserved);
+ CVTD.printTypeIndex("Type", CallSiteInfo.Header.Type);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitEnvBlockSym(SymbolKind Kind,
+ EnvBlockSym &EnvBlock) {
+ DictScope S(W, "EnvBlock");
+
+ W.printNumber("Reserved", EnvBlock.Header.Reserved);
+ ListScope L(W, "Entries");
+ for (auto Entry : EnvBlock.Fields) {
+ W.printString(Entry);
+ }
+}
+
+void CVSymbolDumperImpl::visitFileStaticSym(SymbolKind Kind,
+ FileStaticSym &FileStatic) {
+ DictScope S(W, "FileStatic");
+ W.printNumber("Index", FileStatic.Header.Index);
+ W.printNumber("ModFilenameOffset", FileStatic.Header.ModFilenameOffset);
+ W.printFlags("Flags", uint16_t(FileStatic.Header.Flags), getLocalFlagNames());
+ W.printString("Name", FileStatic.Name);
+}
+
+void CVSymbolDumperImpl::visitExportSym(SymbolKind Kind, ExportSym &Export) {
+ DictScope S(W, "Export");
+ W.printNumber("Ordinal", Export.Header.Ordinal);
+ W.printFlags("Flags", Export.Header.Flags, getExportSymFlagNames());
+ W.printString("Name", Export.Name);
+}
+
+void CVSymbolDumperImpl::visitCompile2Sym(SymbolKind Kind,
+ Compile2Sym &Compile2) {
+ DictScope S(W, "CompilerFlags2");
+
+ W.printEnum("Language", Compile2.Header.getLanguage(),
+ getSourceLanguageNames());
+ W.printFlags("Flags", Compile2.Header.flags & ~0xff,
+ getCompileSym2FlagNames());
+ W.printEnum("Machine", unsigned(Compile2.Header.Machine), getCPUTypeNames());
+ std::string FrontendVersion;
+ {
+ raw_string_ostream Out(FrontendVersion);
+ Out << Compile2.Header.VersionFrontendMajor << '.'
+ << Compile2.Header.VersionFrontendMinor << '.'
+ << Compile2.Header.VersionFrontendBuild;
+ }
+ std::string BackendVersion;
+ {
+ raw_string_ostream Out(BackendVersion);
+ Out << Compile2.Header.VersionBackendMajor << '.'
+ << Compile2.Header.VersionBackendMinor << '.'
+ << Compile2.Header.VersionBackendBuild;
+ }
+ W.printString("FrontendVersion", FrontendVersion);
+ W.printString("BackendVersion", BackendVersion);
+ W.printString("VersionName", Compile2.Version);
+}
+
+void CVSymbolDumperImpl::visitCompile3Sym(SymbolKind Kind,
+ Compile3Sym &Compile3) {
+ DictScope S(W, "CompilerFlags3");
+
+ W.printEnum("Language", Compile3.Header.getLanguage(),
+ getSourceLanguageNames());
+ W.printFlags("Flags", Compile3.Header.flags & ~0xff,
+ getCompileSym3FlagNames());
+ W.printEnum("Machine", unsigned(Compile3.Header.Machine), getCPUTypeNames());
+ std::string FrontendVersion;
+ {
+ raw_string_ostream Out(FrontendVersion);
+ Out << Compile3.Header.VersionFrontendMajor << '.'
+ << Compile3.Header.VersionFrontendMinor << '.'
+ << Compile3.Header.VersionFrontendBuild << '.'
+ << Compile3.Header.VersionFrontendQFE;
+ }
+ std::string BackendVersion;
+ {
+ raw_string_ostream Out(BackendVersion);
+ Out << Compile3.Header.VersionBackendMajor << '.'
+ << Compile3.Header.VersionBackendMinor << '.'
+ << Compile3.Header.VersionBackendBuild << '.'
+ << Compile3.Header.VersionBackendQFE;
+ }
+ W.printString("FrontendVersion", FrontendVersion);
+ W.printString("BackendVersion", BackendVersion);
+ W.printString("VersionName", Compile3.Version);
+}
+
+void CVSymbolDumperImpl::visitConstantSym(SymbolKind Kind,
+ ConstantSym &Constant) {
+ DictScope S(W, "Constant");
+
+ CVTD.printTypeIndex("Type", Constant.Header.Type);
+ W.printNumber("Value", Constant.Value);
+ W.printString("Name", Constant.Name);
+}
+
+void CVSymbolDumperImpl::visitDataSym(SymbolKind Kind, DataSym &Data) {
+ DictScope S(W, "DataSym");
+
+ W.printEnum("Kind", uint16_t(Kind), getSymbolTypeNames());
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(),
+ Data.Header.DataOffset, &LinkageName);
+ }
+ CVTD.printTypeIndex("Type", Data.Header.Type);
+ W.printString("DisplayName", Data.Name);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitDefRangeFramePointerRelFullScopeSym(
+ SymbolKind Kind,
+ DefRangeFramePointerRelFullScopeSym &DefRangeFramePointerRelFullScope) {
+ DictScope S(W, "DefRangeFramePointerRelFullScope");
+ W.printNumber("Offset", DefRangeFramePointerRelFullScope.Header.Offset);
+}
+
+void CVSymbolDumperImpl::visitDefRangeFramePointerRelSym(
+ SymbolKind Kind, DefRangeFramePointerRelSym &DefRangeFramePointerRel) {
+ DictScope S(W, "DefRangeFramePointerRel");
+
+ W.printNumber("Offset", DefRangeFramePointerRel.Header.Offset);
+ printLocalVariableAddrRange(DefRangeFramePointerRel.Header.Range,
+ DefRangeFramePointerRel.getRelocationOffset());
+ printLocalVariableAddrGap(DefRangeFramePointerRel.Gaps);
+}
+
+void CVSymbolDumperImpl::visitDefRangeRegisterRelSym(
+ SymbolKind Kind, DefRangeRegisterRelSym &DefRangeRegisterRel) {
+ DictScope S(W, "DefRangeRegisterRel");
+
+ W.printNumber("BaseRegister", DefRangeRegisterRel.Header.BaseRegister);
+ W.printBoolean("HasSpilledUDTMember",
+ DefRangeRegisterRel.hasSpilledUDTMember());
+ W.printNumber("OffsetInParent", DefRangeRegisterRel.offsetInParent());
+ W.printNumber("BasePointerOffset",
+ DefRangeRegisterRel.Header.BasePointerOffset);
+ printLocalVariableAddrRange(DefRangeRegisterRel.Header.Range,
+ DefRangeRegisterRel.getRelocationOffset());
+ printLocalVariableAddrGap(DefRangeRegisterRel.Gaps);
+}
+
+void CVSymbolDumperImpl::visitDefRangeRegisterSym(
+ SymbolKind Kind, DefRangeRegisterSym &DefRangeRegister) {
+ DictScope S(W, "DefRangeRegister");
+
+ W.printNumber("Register", DefRangeRegister.Header.Register);
+ W.printNumber("MayHaveNoName", DefRangeRegister.Header.MayHaveNoName);
+ printLocalVariableAddrRange(DefRangeRegister.Header.Range,
+ DefRangeRegister.getRelocationOffset());
+ printLocalVariableAddrGap(DefRangeRegister.Gaps);
+}
+
+void CVSymbolDumperImpl::visitDefRangeSubfieldRegisterSym(
+ SymbolKind Kind, DefRangeSubfieldRegisterSym &DefRangeSubfieldRegister) {
+ DictScope S(W, "DefRangeSubfieldRegister");
+
+ W.printNumber("Register", DefRangeSubfieldRegister.Header.Register);
+ W.printNumber("MayHaveNoName", DefRangeSubfieldRegister.Header.MayHaveNoName);
+ W.printNumber("OffsetInParent",
+ DefRangeSubfieldRegister.Header.OffsetInParent);
+ printLocalVariableAddrRange(DefRangeSubfieldRegister.Header.Range,
+ DefRangeSubfieldRegister.getRelocationOffset());
+ printLocalVariableAddrGap(DefRangeSubfieldRegister.Gaps);
+}
+
+void CVSymbolDumperImpl::visitDefRangeSubfieldSym(
+ SymbolKind Kind, DefRangeSubfieldSym &DefRangeSubfield) {
+ DictScope S(W, "DefRangeSubfield");
+
+ if (ObjDelegate) {
+ StringRef StringTable = ObjDelegate->getStringTable();
+ auto ProgramStringTableOffset = DefRangeSubfield.Header.Program;
+ if (ProgramStringTableOffset >= StringTable.size())
+ return parseError();
+ StringRef Program =
+ StringTable.drop_front(ProgramStringTableOffset).split('\0').first;
+ W.printString("Program", Program);
+ }
+ W.printNumber("OffsetInParent", DefRangeSubfield.Header.OffsetInParent);
+ printLocalVariableAddrRange(DefRangeSubfield.Header.Range,
+ DefRangeSubfield.getRelocationOffset());
+ printLocalVariableAddrGap(DefRangeSubfield.Gaps);
+}
+
+void CVSymbolDumperImpl::visitDefRangeSym(SymbolKind Kind,
+ DefRangeSym &DefRange) {
+ DictScope S(W, "DefRange");
+
+ if (ObjDelegate) {
+ StringRef StringTable = ObjDelegate->getStringTable();
+ auto ProgramStringTableOffset = DefRange.Header.Program;
+ if (ProgramStringTableOffset >= StringTable.size())
+ return parseError();
+ StringRef Program =
+ StringTable.drop_front(ProgramStringTableOffset).split('\0').first;
+ W.printString("Program", Program);
+ }
+ printLocalVariableAddrRange(DefRange.Header.Range,
+ DefRange.getRelocationOffset());
+ printLocalVariableAddrGap(DefRange.Gaps);
+}
+
+void CVSymbolDumperImpl::visitFrameCookieSym(SymbolKind Kind,
+ FrameCookieSym &FrameCookie) {
+ DictScope S(W, "FrameCookie");
+
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField(
+ "CodeOffset", FrameCookie.getRelocationOffset(),
+ FrameCookie.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Register", FrameCookie.Header.Register);
+ W.printEnum("CookieKind", uint16_t(FrameCookie.Header.CookieKind),
+ getFrameCookieKindNames());
+ W.printHex("Flags", FrameCookie.Header.Flags);
+}
+
+void CVSymbolDumperImpl::visitFrameProcSym(SymbolKind Kind,
+ FrameProcSym &FrameProc) {
+ DictScope S(W, "FrameProc");
+
+ W.printHex("TotalFrameBytes", FrameProc.Header.TotalFrameBytes);
+ W.printHex("PaddingFrameBytes", FrameProc.Header.PaddingFrameBytes);
+ W.printHex("OffsetToPadding", FrameProc.Header.OffsetToPadding);
+ W.printHex("BytesOfCalleeSavedRegisters",
+ FrameProc.Header.BytesOfCalleeSavedRegisters);
+ W.printHex("OffsetOfExceptionHandler",
+ FrameProc.Header.OffsetOfExceptionHandler);
+ W.printHex("SectionIdOfExceptionHandler",
+ FrameProc.Header.SectionIdOfExceptionHandler);
+ W.printFlags("Flags", FrameProc.Header.Flags, getFrameProcSymFlagNames());
+}
+
+void CVSymbolDumperImpl::visitHeapAllocationSiteSym(
+ SymbolKind Kind, HeapAllocationSiteSym &HeapAllocSite) {
+ DictScope S(W, "HeapAllocationSite");
+
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField(
+ "CodeOffset", HeapAllocSite.getRelocationOffset(),
+ HeapAllocSite.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Segment", HeapAllocSite.Header.Segment);
+ W.printHex("CallInstructionSize", HeapAllocSite.Header.CallInstructionSize);
+ CVTD.printTypeIndex("Type", HeapAllocSite.Header.Type);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitInlineSiteSym(SymbolKind Kind,
+ InlineSiteSym &InlineSite) {
+ DictScope S(W, "InlineSite");
+
+ W.printHex("PtrParent", InlineSite.Header.PtrParent);
+ W.printHex("PtrEnd", InlineSite.Header.PtrEnd);
+ CVTD.printTypeIndex("Inlinee", InlineSite.Header.Inlinee);
+
+ ListScope BinaryAnnotations(W, "BinaryAnnotations");
+ for (auto &Annotation : InlineSite.annotations()) {
+ switch (Annotation.OpCode) {
+ case BinaryAnnotationsOpCode::Invalid:
+ return parseError();
+ case BinaryAnnotationsOpCode::CodeOffset:
+ case BinaryAnnotationsOpCode::ChangeCodeOffset:
+ case BinaryAnnotationsOpCode::ChangeCodeLength:
+ W.printHex(Annotation.Name, Annotation.U1);
+ break;
+ case BinaryAnnotationsOpCode::ChangeCodeOffsetBase:
+ case BinaryAnnotationsOpCode::ChangeLineEndDelta:
+ case BinaryAnnotationsOpCode::ChangeRangeKind:
+ case BinaryAnnotationsOpCode::ChangeColumnStart:
+ case BinaryAnnotationsOpCode::ChangeColumnEnd:
+ W.printNumber(Annotation.Name, Annotation.U1);
+ break;
+ case BinaryAnnotationsOpCode::ChangeLineOffset:
+ case BinaryAnnotationsOpCode::ChangeColumnEndDelta:
+ W.printNumber(Annotation.Name, Annotation.S1);
+ break;
+ case BinaryAnnotationsOpCode::ChangeFile:
+ if (ObjDelegate) {
+ W.printHex("ChangeFile",
+ ObjDelegate->getFileNameForFileOffset(Annotation.U1),
+ Annotation.U1);
+ } else {
+ W.printHex("ChangeFile", Annotation.U1);
+ }
+
+ break;
+ case BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset: {
+ W.startLine() << "ChangeCodeOffsetAndLineOffset: {CodeOffset: "
+ << W.hex(Annotation.U1) << ", LineOffset: " << Annotation.S1
+ << "}\n";
+ break;
+ }
+ case BinaryAnnotationsOpCode::ChangeCodeLengthAndCodeOffset: {
+ W.startLine() << "ChangeCodeLengthAndCodeOffset: {CodeOffset: "
+ << W.hex(Annotation.U2)
+ << ", Length: " << W.hex(Annotation.U1) << "}\n";
+ break;
+ }
+ }
+ }
+}
+
+void CVSymbolDumperImpl::visitRegisterSym(SymbolKind Kind,
+ RegisterSym &Register) {
+ DictScope S(W, "RegisterSym");
+ W.printNumber("Type", Register.Header.Index);
+ W.printEnum("Seg", uint16_t(Register.Header.Register), getRegisterNames());
+ W.printString("Name", Register.Name);
+}
+
+void CVSymbolDumperImpl::visitPublicSym32(SymbolKind Kind,
+ PublicSym32 &Public) {
+ DictScope S(W, "PublicSym");
+ W.printNumber("Type", Public.Header.Index);
+ W.printNumber("Seg", Public.Header.Seg);
+ W.printNumber("Off", Public.Header.Off);
+ W.printString("Name", Public.Name);
+}
+
+void CVSymbolDumperImpl::visitProcRefSym(SymbolKind Kind, ProcRefSym &ProcRef) {
+ DictScope S(W, "ProcRef");
+ W.printNumber("SumName", ProcRef.Header.SumName);
+ W.printNumber("SymOffset", ProcRef.Header.SymOffset);
+ W.printNumber("Mod", ProcRef.Header.Mod);
+ W.printString("Name", ProcRef.Name);
+}
+
+void CVSymbolDumperImpl::visitLabelSym(SymbolKind Kind, LabelSym &Label) {
+ DictScope S(W, "Label");
+
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField("CodeOffset", Label.getRelocationOffset(),
+ Label.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Segment", Label.Header.Segment);
+ W.printHex("Flags", Label.Header.Flags);
+ W.printFlags("Flags", Label.Header.Flags, getProcSymFlagNames());
+ W.printString("DisplayName", Label.Name);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitLocalSym(SymbolKind Kind, LocalSym &Local) {
+ DictScope S(W, "Local");
+
+ CVTD.printTypeIndex("Type", Local.Header.Type);
+ W.printFlags("Flags", uint16_t(Local.Header.Flags), getLocalFlagNames());
+ W.printString("VarName", Local.Name);
+}
+
+void CVSymbolDumperImpl::visitObjNameSym(SymbolKind Kind, ObjNameSym &ObjName) {
+ DictScope S(W, "ObjectName");
+
+ W.printHex("Signature", ObjName.Header.Signature);
+ W.printString("ObjectName", ObjName.Name);
+}
+
+void CVSymbolDumperImpl::visitProcSym(SymbolKind Kind, ProcSym &Proc) {
+ DictScope S(W, "ProcStart");
+
+ if (InFunctionScope)
+ return parseError();
+
+ InFunctionScope = true;
+
+ StringRef LinkageName;
+ W.printEnum("Kind", uint16_t(Kind), getSymbolTypeNames());
+ W.printHex("PtrParent", Proc.Header.PtrParent);
+ W.printHex("PtrEnd", Proc.Header.PtrEnd);
+ W.printHex("PtrNext", Proc.Header.PtrNext);
+ W.printHex("CodeSize", Proc.Header.CodeSize);
+ W.printHex("DbgStart", Proc.Header.DbgStart);
+ W.printHex("DbgEnd", Proc.Header.DbgEnd);
+ CVTD.printTypeIndex("FunctionType", Proc.Header.FunctionType);
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField("CodeOffset", Proc.getRelocationOffset(),
+ Proc.Header.CodeOffset, &LinkageName);
+ }
+ W.printHex("Segment", Proc.Header.Segment);
+ W.printFlags("Flags", static_cast<uint8_t>(Proc.Header.Flags),
+ getProcSymFlagNames());
+ W.printString("DisplayName", Proc.Name);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitScopeEndSym(SymbolKind Kind,
+ ScopeEndSym &ScopeEnd) {
+ if (Kind == SymbolKind::S_END)
+ DictScope S(W, "BlockEnd");
+ else if (Kind == SymbolKind::S_PROC_ID_END)
+ DictScope S(W, "ProcEnd");
+ else if (Kind == SymbolKind::S_INLINESITE_END)
+ DictScope S(W, "InlineSiteEnd");
+
+ InFunctionScope = false;
+}
+
+void CVSymbolDumperImpl::visitCallerSym(SymbolKind Kind, CallerSym &Caller) {
+ ListScope S(W, Kind == S_CALLEES ? "Callees" : "Callers");
+ for (auto FuncID : Caller.Indices)
+ CVTD.printTypeIndex("FuncID", FuncID);
+}
+
+void CVSymbolDumperImpl::visitRegRelativeSym(SymbolKind Kind,
+ RegRelativeSym &RegRel) {
+ DictScope S(W, "RegRelativeSym");
+
+ W.printHex("Offset", RegRel.Header.Offset);
+ CVTD.printTypeIndex("Type", RegRel.Header.Type);
+ W.printHex("Register", RegRel.Header.Register);
+ W.printString("VarName", RegRel.Name);
+}
+
+void CVSymbolDumperImpl::visitThreadLocalDataSym(SymbolKind Kind,
+ ThreadLocalDataSym &Data) {
+ DictScope S(W, "ThreadLocalDataSym");
+
+ StringRef LinkageName;
+ if (ObjDelegate) {
+ ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(),
+ Data.Header.DataOffset, &LinkageName);
+ }
+ CVTD.printTypeIndex("Type", Data.Header.Type);
+ W.printString("DisplayName", Data.Name);
+ if (!LinkageName.empty())
+ W.printString("LinkageName", LinkageName);
+}
+
+void CVSymbolDumperImpl::visitUDTSym(SymbolKind Kind, UDTSym &UDT) {
+ DictScope S(W, "UDT");
+ CVTD.printTypeIndex("Type", UDT.Header.Type);
+ W.printString("UDTName", UDT.Name);
+}
+
+void CVSymbolDumperImpl::visitUnknownSymbol(SymbolKind Kind,
+ ArrayRef<uint8_t> Data) {
+ DictScope S(W, "UnknownSym");
+ W.printEnum("Kind", uint16_t(Kind), getSymbolTypeNames());
+ W.printNumber("Length", uint32_t(Data.size()));
+}
+
+bool CVSymbolDumper::dump(const CVRecord<SymbolKind> &Record) {
+ CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes);
+ Dumper.visitSymbolRecord(Record);
+ return !Dumper.hadError();
+}
+
+bool CVSymbolDumper::dump(const CVSymbolArray &Symbols) {
+ CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes);
+ Dumper.visitSymbolStream(Symbols);
+ return !Dumper.hadError();
+}
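The private-impl arrangement used above (a public dumper in the header that constructs a per-call implementation object defined only in the .cpp) keeps the visitor machinery out of SymbolDumper.h. A condensed sketch with hypothetical names:

#include <iostream>
#include <string>

// --- what would live in the header ---
class Dumper {
public:
  explicit Dumper(std::ostream &OS) : OS(OS) {}
  bool dump(const std::string &Record);
private:
  std::ostream &OS;
};

// --- what would live in the .cpp ---
namespace {
class DumperImpl {
public:
  explicit DumperImpl(std::ostream &OS) : OS(OS) {}
  bool visit(const std::string &Record) {
    OS << "record: " << Record << "\n";
    return true;
  }
private:
  std::ostream &OS;
};
} // namespace

bool Dumper::dump(const std::string &Record) {
  DumperImpl Impl(OS);          // constructed per call, like CVSymbolDumperImpl
  return Impl.visit(Record);
}

int main() {
  Dumper D(std::cout);
  return D.dump("S_GPROC32") ? 0 : 1;
}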
diff --git a/lib/DebugInfo/CodeView/TypeDumper.cpp b/lib/DebugInfo/CodeView/TypeDumper.cpp
new file mode 100644
index 000000000000..345e2a49888c
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeDumper.cpp
@@ -0,0 +1,696 @@
+//===-- TypeDumper.cpp - CodeView type info dumper --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeDumper.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+/// The names here all end in "*". If the simple type is a pointer type, we
+/// return the whole name. Otherwise we lop off the last character in our
+/// StringRef.
+static const EnumEntry<SimpleTypeKind> SimpleTypeNames[] = {
+ {"void*", SimpleTypeKind::Void},
+ {"<not translated>*", SimpleTypeKind::NotTranslated},
+ {"HRESULT*", SimpleTypeKind::HResult},
+ {"signed char*", SimpleTypeKind::SignedCharacter},
+ {"unsigned char*", SimpleTypeKind::UnsignedCharacter},
+ {"char*", SimpleTypeKind::NarrowCharacter},
+ {"wchar_t*", SimpleTypeKind::WideCharacter},
+ {"char16_t*", SimpleTypeKind::Character16},
+ {"char32_t*", SimpleTypeKind::Character32},
+ {"__int8*", SimpleTypeKind::SByte},
+ {"unsigned __int8*", SimpleTypeKind::Byte},
+ {"short*", SimpleTypeKind::Int16Short},
+ {"unsigned short*", SimpleTypeKind::UInt16Short},
+ {"__int16*", SimpleTypeKind::Int16},
+ {"unsigned __int16*", SimpleTypeKind::UInt16},
+ {"long*", SimpleTypeKind::Int32Long},
+ {"unsigned long*", SimpleTypeKind::UInt32Long},
+ {"int*", SimpleTypeKind::Int32},
+ {"unsigned*", SimpleTypeKind::UInt32},
+ {"__int64*", SimpleTypeKind::Int64Quad},
+ {"unsigned __int64*", SimpleTypeKind::UInt64Quad},
+ {"__int64*", SimpleTypeKind::Int64},
+ {"unsigned __int64*", SimpleTypeKind::UInt64},
+ {"__int128*", SimpleTypeKind::Int128},
+ {"unsigned __int128*", SimpleTypeKind::UInt128},
+ {"__half*", SimpleTypeKind::Float16},
+ {"float*", SimpleTypeKind::Float32},
+ {"float*", SimpleTypeKind::Float32PartialPrecision},
+ {"__float48*", SimpleTypeKind::Float48},
+ {"double*", SimpleTypeKind::Float64},
+ {"long double*", SimpleTypeKind::Float80},
+ {"__float128*", SimpleTypeKind::Float128},
+ {"_Complex float*", SimpleTypeKind::Complex32},
+ {"_Complex double*", SimpleTypeKind::Complex64},
+ {"_Complex long double*", SimpleTypeKind::Complex80},
+ {"_Complex __float128*", SimpleTypeKind::Complex128},
+ {"bool*", SimpleTypeKind::Boolean8},
+ {"__bool16*", SimpleTypeKind::Boolean16},
+ {"__bool32*", SimpleTypeKind::Boolean32},
+ {"__bool64*", SimpleTypeKind::Boolean64},
+};
+
+static const EnumEntry<TypeLeafKind> LeafTypeNames[] = {
+#define CV_TYPE(enum, val) {#enum, enum},
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+};
+
+#define ENUM_ENTRY(enum_class, enum) \
+ { #enum, std::underlying_type < enum_class > ::type(enum_class::enum) }
+
+static const EnumEntry<uint16_t> ClassOptionNames[] = {
+ ENUM_ENTRY(ClassOptions, Packed),
+ ENUM_ENTRY(ClassOptions, HasConstructorOrDestructor),
+ ENUM_ENTRY(ClassOptions, HasOverloadedOperator),
+ ENUM_ENTRY(ClassOptions, Nested),
+ ENUM_ENTRY(ClassOptions, ContainsNestedClass),
+ ENUM_ENTRY(ClassOptions, HasOverloadedAssignmentOperator),
+ ENUM_ENTRY(ClassOptions, HasConversionOperator),
+ ENUM_ENTRY(ClassOptions, ForwardReference),
+ ENUM_ENTRY(ClassOptions, Scoped),
+ ENUM_ENTRY(ClassOptions, HasUniqueName),
+ ENUM_ENTRY(ClassOptions, Sealed),
+ ENUM_ENTRY(ClassOptions, Intrinsic),
+};
+
+static const EnumEntry<uint8_t> MemberAccessNames[] = {
+ ENUM_ENTRY(MemberAccess, None),
+ ENUM_ENTRY(MemberAccess, Private),
+ ENUM_ENTRY(MemberAccess, Protected),
+ ENUM_ENTRY(MemberAccess, Public),
+};
+
+static const EnumEntry<uint16_t> MethodOptionNames[] = {
+ ENUM_ENTRY(MethodOptions, Pseudo),
+ ENUM_ENTRY(MethodOptions, NoInherit),
+ ENUM_ENTRY(MethodOptions, NoConstruct),
+ ENUM_ENTRY(MethodOptions, CompilerGenerated),
+ ENUM_ENTRY(MethodOptions, Sealed),
+};
+
+static const EnumEntry<uint16_t> MemberKindNames[] = {
+ ENUM_ENTRY(MethodKind, Vanilla),
+ ENUM_ENTRY(MethodKind, Virtual),
+ ENUM_ENTRY(MethodKind, Static),
+ ENUM_ENTRY(MethodKind, Friend),
+ ENUM_ENTRY(MethodKind, IntroducingVirtual),
+ ENUM_ENTRY(MethodKind, PureVirtual),
+ ENUM_ENTRY(MethodKind, PureIntroducingVirtual),
+};
+
+static const EnumEntry<uint8_t> PtrKindNames[] = {
+ ENUM_ENTRY(PointerKind, Near16),
+ ENUM_ENTRY(PointerKind, Far16),
+ ENUM_ENTRY(PointerKind, Huge16),
+ ENUM_ENTRY(PointerKind, BasedOnSegment),
+ ENUM_ENTRY(PointerKind, BasedOnValue),
+ ENUM_ENTRY(PointerKind, BasedOnSegmentValue),
+ ENUM_ENTRY(PointerKind, BasedOnAddress),
+ ENUM_ENTRY(PointerKind, BasedOnSegmentAddress),
+ ENUM_ENTRY(PointerKind, BasedOnType),
+ ENUM_ENTRY(PointerKind, BasedOnSelf),
+ ENUM_ENTRY(PointerKind, Near32),
+ ENUM_ENTRY(PointerKind, Far32),
+ ENUM_ENTRY(PointerKind, Near64),
+};
+
+static const EnumEntry<uint8_t> PtrModeNames[] = {
+ ENUM_ENTRY(PointerMode, Pointer),
+ ENUM_ENTRY(PointerMode, LValueReference),
+ ENUM_ENTRY(PointerMode, PointerToDataMember),
+ ENUM_ENTRY(PointerMode, PointerToMemberFunction),
+ ENUM_ENTRY(PointerMode, RValueReference),
+};
+
+static const EnumEntry<uint16_t> PtrMemberRepNames[] = {
+ ENUM_ENTRY(PointerToMemberRepresentation, Unknown),
+ ENUM_ENTRY(PointerToMemberRepresentation, SingleInheritanceData),
+ ENUM_ENTRY(PointerToMemberRepresentation, MultipleInheritanceData),
+ ENUM_ENTRY(PointerToMemberRepresentation, VirtualInheritanceData),
+ ENUM_ENTRY(PointerToMemberRepresentation, GeneralData),
+ ENUM_ENTRY(PointerToMemberRepresentation, SingleInheritanceFunction),
+ ENUM_ENTRY(PointerToMemberRepresentation, MultipleInheritanceFunction),
+ ENUM_ENTRY(PointerToMemberRepresentation, VirtualInheritanceFunction),
+ ENUM_ENTRY(PointerToMemberRepresentation, GeneralFunction),
+};
+
+static const EnumEntry<uint16_t> TypeModifierNames[] = {
+ ENUM_ENTRY(ModifierOptions, Const),
+ ENUM_ENTRY(ModifierOptions, Volatile),
+ ENUM_ENTRY(ModifierOptions, Unaligned),
+};
+
+static const EnumEntry<uint8_t> CallingConventions[] = {
+ ENUM_ENTRY(CallingConvention, NearC),
+ ENUM_ENTRY(CallingConvention, FarC),
+ ENUM_ENTRY(CallingConvention, NearPascal),
+ ENUM_ENTRY(CallingConvention, FarPascal),
+ ENUM_ENTRY(CallingConvention, NearFast),
+ ENUM_ENTRY(CallingConvention, FarFast),
+ ENUM_ENTRY(CallingConvention, NearStdCall),
+ ENUM_ENTRY(CallingConvention, FarStdCall),
+ ENUM_ENTRY(CallingConvention, NearSysCall),
+ ENUM_ENTRY(CallingConvention, FarSysCall),
+ ENUM_ENTRY(CallingConvention, ThisCall),
+ ENUM_ENTRY(CallingConvention, MipsCall),
+ ENUM_ENTRY(CallingConvention, Generic),
+ ENUM_ENTRY(CallingConvention, AlphaCall),
+ ENUM_ENTRY(CallingConvention, PpcCall),
+ ENUM_ENTRY(CallingConvention, SHCall),
+ ENUM_ENTRY(CallingConvention, ArmCall),
+ ENUM_ENTRY(CallingConvention, AM33Call),
+ ENUM_ENTRY(CallingConvention, TriCall),
+ ENUM_ENTRY(CallingConvention, SH5Call),
+ ENUM_ENTRY(CallingConvention, M32RCall),
+ ENUM_ENTRY(CallingConvention, ClrCall),
+ ENUM_ENTRY(CallingConvention, Inline),
+ ENUM_ENTRY(CallingConvention, NearVector),
+};
+
+static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
+ ENUM_ENTRY(FunctionOptions, CxxReturnUdt),
+ ENUM_ENTRY(FunctionOptions, Constructor),
+ ENUM_ENTRY(FunctionOptions, ConstructorWithVirtualBases),
+};
+
+#undef ENUM_ENTRY
+
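+/// Return a human-readable name for the given leaf kind, falling back to
+/// "UnknownLeaf" for kinds not covered by TypeRecords.def.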
+static StringRef getLeafTypeName(TypeLeafKind LT) {
+ switch (LT) {
+#define TYPE_RECORD(ename, value, name) \
+ case ename: \
+ return #name;
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+ case LF_FIELDLIST:
+ return "FieldList";
+ default:
+ break;
+ }
+ return "UnknownLeaf";
+}
+
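+// visitTypeBegin/visitTypeEnd bracket every record: the former prints the
+// leaf name and type index and opens an indented scope, the latter records
+// the display name computed by the per-record visitor and closes the scope.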
+Error CVTypeDumper::visitTypeBegin(const CVRecord<TypeLeafKind> &Record) {
+ // Reset Name to the empty string so we can tell whether the visitor for
+ // this record set it; visitTypeEnd records whatever ends up here.
+ Name = "";
+
+ W->startLine() << getLeafTypeName(Record.Type) << " ("
+ << HexNumber(getNextTypeIndex()) << ") {\n";
+ W->indent();
+ W->printEnum("TypeLeafKind", unsigned(Record.Type),
+ makeArrayRef(LeafTypeNames));
+ return Error::success();
+}
+
+Error CVTypeDumper::visitTypeEnd(const CVRecord<TypeLeafKind> &Record) {
+ if (Record.Type == LF_FIELDLIST)
+ Name = "<field list>";
+
+ // Always record some name for every type, even if Name is empty. CVUDTNames
+ // is indexed by type index, and must have one entry for every type.
+ recordType(Name);
+
+ if (PrintRecordBytes)
+ W->printBinaryBlock("LeafData", getBytesAsCharacters(Record.Data));
+
+ W->unindent();
+ W->startLine() << "}\n";
+ return Error::success();
+}
+
+Error CVTypeDumper::visitStringId(StringIdRecord &String) {
+ printTypeIndex("Id", String.getId());
+ W->printString("StringData", String.getString());
+ // Put this in CVUDTNames so it gets printed with LF_UDT_SRC_LINE.
+ Name = String.getString();
+ return Error::success();
+}
+
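+// Besides dumping each argument index, build a "(T1, T2, ...)" display string;
+// visitProcedure and visitMemberFunction pick it up when they format function
+// type names.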
+Error CVTypeDumper::visitArgList(ArgListRecord &Args) {
+ auto Indices = Args.getIndices();
+ uint32_t Size = Indices.size();
+ W->printNumber("NumArgs", Size);
+ ListScope Arguments(*W, "Arguments");
+ SmallString<256> TypeName("(");
+ for (uint32_t I = 0; I < Size; ++I) {
+ printTypeIndex("ArgType", Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
+ TypeName.append(ArgTypeName);
+ if (I + 1 != Size)
+ TypeName.append(", ");
+ }
+ TypeName.push_back(')');
+ Name = saveName(TypeName);
+ return Error::success();
+}
+
+Error CVTypeDumper::visitClass(ClassRecord &Class) {
+ uint16_t Props = static_cast<uint16_t>(Class.getOptions());
+ W->printNumber("MemberCount", Class.getMemberCount());
+ W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames));
+ printTypeIndex("FieldList", Class.getFieldList());
+ printTypeIndex("DerivedFrom", Class.getDerivationList());
+ printTypeIndex("VShape", Class.getVTableShape());
+ W->printNumber("SizeOf", Class.getSize());
+ W->printString("Name", Class.getName());
+ if (Props & uint16_t(ClassOptions::HasUniqueName))
+ W->printString("LinkageName", Class.getUniqueName());
+ Name = Class.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitUnion(UnionRecord &Union) {
+ uint16_t Props = static_cast<uint16_t>(Union.getOptions());
+ W->printNumber("MemberCount", Union.getMemberCount());
+ W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames));
+ printTypeIndex("FieldList", Union.getFieldList());
+ W->printNumber("SizeOf", Union.getSize());
+ W->printString("Name", Union.getName());
+ if (Props & uint16_t(ClassOptions::HasUniqueName))
+ W->printString("LinkageName", Union.getUniqueName());
+ Name = Union.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitEnum(EnumRecord &Enum) {
+ uint16_t Props = static_cast<uint16_t>(Enum.getOptions());
+ W->printNumber("NumEnumerators", Enum.getMemberCount());
+ W->printFlags("Properties", uint16_t(Enum.getOptions()),
+ makeArrayRef(ClassOptionNames));
+ printTypeIndex("UnderlyingType", Enum.getUnderlyingType());
+ printTypeIndex("FieldListType", Enum.getFieldList());
+ W->printString("Name", Enum.getName());
+ if (Props & uint16_t(ClassOptions::HasUniqueName))
+ W->printString("LinkageName", Enum.getUniqueName());
+ Name = Enum.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitArray(ArrayRecord &AT) {
+ printTypeIndex("ElementType", AT.getElementType());
+ printTypeIndex("IndexType", AT.getIndexType());
+ W->printNumber("SizeOf", AT.getSize());
+ W->printString("Name", AT.getName());
+ Name = AT.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitVFTable(VFTableRecord &VFT) {
+ printTypeIndex("CompleteClass", VFT.getCompleteClass());
+ printTypeIndex("OverriddenVFTable", VFT.getOverriddenVTable());
+ W->printHex("VFPtrOffset", VFT.getVFPtrOffset());
+ W->printString("VFTableName", VFT.getName());
+ for (auto N : VFT.getMethodNames())
+ W->printString("MethodName", N);
+ Name = VFT.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitMemberFuncId(MemberFuncIdRecord &Id) {
+ printTypeIndex("ClassType", Id.getClassType());
+ printTypeIndex("FunctionType", Id.getFunctionType());
+ W->printString("Name", Id.getName());
+ Name = Id.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitProcedure(ProcedureRecord &Proc) {
+ printTypeIndex("ReturnType", Proc.getReturnType());
+ W->printEnum("CallingConvention", uint8_t(Proc.getCallConv()),
+ makeArrayRef(CallingConventions));
+ W->printFlags("FunctionOptions", uint8_t(Proc.getOptions()),
+ makeArrayRef(FunctionOptionEnum));
+ W->printNumber("NumParameters", Proc.getParameterCount());
+ printTypeIndex("ArgListType", Proc.getArgumentList());
+
+ StringRef ReturnTypeName = getTypeName(Proc.getReturnType());
+ StringRef ArgListTypeName = getTypeName(Proc.getArgumentList());
+ SmallString<256> TypeName(ReturnTypeName);
+ TypeName.push_back(' ');
+ TypeName.append(ArgListTypeName);
+ Name = saveName(TypeName);
+ return Error::success();
+}
+
+Error CVTypeDumper::visitMemberFunction(MemberFunctionRecord &MF) {
+ printTypeIndex("ReturnType", MF.getReturnType());
+ printTypeIndex("ClassType", MF.getClassType());
+ printTypeIndex("ThisType", MF.getThisType());
+ W->printEnum("CallingConvention", uint8_t(MF.getCallConv()),
+ makeArrayRef(CallingConventions));
+ W->printFlags("FunctionOptions", uint8_t(MF.getOptions()),
+ makeArrayRef(FunctionOptionEnum));
+ W->printNumber("NumParameters", MF.getParameterCount());
+ printTypeIndex("ArgListType", MF.getArgumentList());
+ W->printNumber("ThisAdjustment", MF.getThisPointerAdjustment());
+
+ StringRef ReturnTypeName = getTypeName(MF.getReturnType());
+ StringRef ClassTypeName = getTypeName(MF.getClassType());
+ StringRef ArgListTypeName = getTypeName(MF.getArgumentList());
+ SmallString<256> TypeName(ReturnTypeName);
+ TypeName.push_back(' ');
+ TypeName.append(ClassTypeName);
+ TypeName.append("::");
+ TypeName.append(ArgListTypeName);
+ Name = saveName(TypeName);
+ return Error::success();
+}
+
+Error CVTypeDumper::visitMethodOverloadList(
+ MethodOverloadListRecord &MethodList) {
+ for (auto &M : MethodList.getMethods()) {
+ ListScope S(*W, "Method");
+ printMemberAttributes(M.getAccess(), M.getKind(), M.getOptions());
+ printTypeIndex("Type", M.getType());
+ if (M.isIntroducingVirtual())
+ W->printHex("VFTableOffset", M.getVFTableOffset());
+ }
+ return Error::success();
+}
+
+Error CVTypeDumper::visitFuncId(FuncIdRecord &Func) {
+ printTypeIndex("ParentScope", Func.getParentScope());
+ printTypeIndex("FunctionType", Func.getFunctionType());
+ W->printString("Name", Func.getName());
+ Name = Func.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitTypeServer2(TypeServer2Record &TS) {
+ W->printBinary("Signature", TS.getGuid());
+ W->printNumber("Age", TS.getAge());
+ W->printString("Name", TS.getName());
+ Name = TS.getName();
+ return Error::success();
+}
+
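+// Dump the raw pointer attributes and synthesize a C-like display name:
+// member pointers become "Pointee Class::*", everything else becomes
+// "[const] [volatile] [__unaligned] Pointee" followed by "*", "&", or "&&".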
+Error CVTypeDumper::visitPointer(PointerRecord &Ptr) {
+ printTypeIndex("PointeeType", Ptr.getReferentType());
+ W->printHex("PointerAttributes", uint32_t(Ptr.getOptions()));
+ W->printEnum("PtrType", unsigned(Ptr.getPointerKind()),
+ makeArrayRef(PtrKindNames));
+ W->printEnum("PtrMode", unsigned(Ptr.getMode()), makeArrayRef(PtrModeNames));
+
+ W->printNumber("IsFlat", Ptr.isFlat());
+ W->printNumber("IsConst", Ptr.isConst());
+ W->printNumber("IsVolatile", Ptr.isVolatile());
+ W->printNumber("IsUnaligned", Ptr.isUnaligned());
+ W->printNumber("SizeOf", Ptr.getSize());
+
+ if (Ptr.isPointerToMember()) {
+ const MemberPointerInfo &MI = Ptr.getMemberInfo();
+
+ printTypeIndex("ClassType", MI.getContainingType());
+ W->printEnum("Representation", uint16_t(MI.getRepresentation()),
+ makeArrayRef(PtrMemberRepNames));
+
+ StringRef PointeeName = getTypeName(Ptr.getReferentType());
+ StringRef ClassName = getTypeName(MI.getContainingType());
+ SmallString<256> TypeName(PointeeName);
+ TypeName.push_back(' ');
+ TypeName.append(ClassName);
+ TypeName.append("::*");
+ Name = saveName(TypeName);
+ } else {
+ SmallString<256> TypeName;
+ if (Ptr.isConst())
+ TypeName.append("const ");
+ if (Ptr.isVolatile())
+ TypeName.append("volatile ");
+ if (Ptr.isUnaligned())
+ TypeName.append("__unaligned ");
+
+ TypeName.append(getTypeName(Ptr.getReferentType()));
+
+ if (Ptr.getMode() == PointerMode::LValueReference)
+ TypeName.append("&");
+ else if (Ptr.getMode() == PointerMode::RValueReference)
+ TypeName.append("&&");
+ else if (Ptr.getMode() == PointerMode::Pointer)
+ TypeName.append("*");
+
+ if (!TypeName.empty())
+ Name = saveName(TypeName);
+ }
+ return Error::success();
+}
+
+Error CVTypeDumper::visitModifier(ModifierRecord &Mod) {
+ uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
+ printTypeIndex("ModifiedType", Mod.getModifiedType());
+ W->printFlags("Modifiers", Mods, makeArrayRef(TypeModifierNames));
+
+ StringRef ModifiedName = getTypeName(Mod.getModifiedType());
+ SmallString<256> TypeName;
+ if (Mods & uint16_t(ModifierOptions::Const))
+ TypeName.append("const ");
+ if (Mods & uint16_t(ModifierOptions::Volatile))
+ TypeName.append("volatile ");
+ if (Mods & uint16_t(ModifierOptions::Unaligned))
+ TypeName.append("__unaligned ");
+ TypeName.append(ModifiedName);
+ Name = saveName(TypeName);
+ return Error::success();
+}
+
+Error CVTypeDumper::visitBitField(BitFieldRecord &BitField) {
+ printTypeIndex("Type", BitField.getType());
+ W->printNumber("BitSize", BitField.getBitSize());
+ W->printNumber("BitOffset", BitField.getBitOffset());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitVFTableShape(VFTableShapeRecord &Shape) {
+ W->printNumber("VFEntryCount", Shape.getEntryCount());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitUdtSourceLine(UdtSourceLineRecord &Line) {
+ printTypeIndex("UDT", Line.getUDT());
+ printTypeIndex("SourceFile", Line.getSourceFile());
+ W->printNumber("LineNumber", Line.getLineNumber());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitUdtModSourceLine(UdtModSourceLineRecord &Line) {
+ printTypeIndex("UDT", Line.getUDT());
+ printTypeIndex("SourceFile", Line.getSourceFile());
+ W->printNumber("LineNumber", Line.getLineNumber());
+ W->printNumber("Module", Line.getModule());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitBuildInfo(BuildInfoRecord &Args) {
+ W->printNumber("NumArgs", static_cast<uint32_t>(Args.getArgs().size()));
+
+ ListScope Arguments(*W, "Arguments");
+ for (auto Arg : Args.getArgs()) {
+ printTypeIndex("ArgType", Arg);
+ }
+ return Error::success();
+}
+
+void CVTypeDumper::printMemberAttributes(MemberAttributes Attrs) {
+ return printMemberAttributes(Attrs.getAccess(), Attrs.getMethodKind(),
+ Attrs.getFlags());
+}
+
+void CVTypeDumper::printMemberAttributes(MemberAccess Access, MethodKind Kind,
+ MethodOptions Options) {
+ W->printEnum("AccessSpecifier", uint8_t(Access),
+ makeArrayRef(MemberAccessNames));
+ // Data members will be vanilla. Don't try to print a method kind for them.
+ if (Kind != MethodKind::Vanilla)
+ W->printEnum("MethodKind", unsigned(Kind), makeArrayRef(MemberKindNames));
+ if (Options != MethodOptions::None) {
+ W->printFlags("MethodOptions", unsigned(Options),
+ makeArrayRef(MethodOptionNames));
+ }
+}
+
+Error CVTypeDumper::visitUnknownMember(const CVRecord<TypeLeafKind> &Record) {
+ W->printHex("UnknownMember", unsigned(Record.Type));
+ return Error::success();
+}
+
+Error CVTypeDumper::visitUnknownType(const CVRecord<TypeLeafKind> &Record) {
+ DictScope S(*W, "UnknownType");
+ W->printEnum("Kind", uint16_t(Record.Type), makeArrayRef(LeafTypeNames));
+ W->printNumber("Length", uint32_t(Record.Data.size()));
+ return Error::success();
+}
+
+Error CVTypeDumper::visitNestedType(NestedTypeRecord &Nested) {
+ DictScope S(*W, "NestedType");
+ printTypeIndex("Type", Nested.getNestedType());
+ W->printString("Name", Nested.getName());
+ Name = Nested.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitOneMethod(OneMethodRecord &Method) {
+ DictScope S(*W, "OneMethod");
+ MethodKind K = Method.getKind();
+ printMemberAttributes(Method.getAccess(), K, Method.getOptions());
+ printTypeIndex("Type", Method.getType());
+ // If virtual, then read the vftable offset.
+ if (Method.isIntroducingVirtual())
+ W->printHex("VFTableOffset", Method.getVFTableOffset());
+ W->printString("Name", Method.getName());
+ Name = Method.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitOverloadedMethod(OverloadedMethodRecord &Method) {
+ DictScope S(*W, "OverloadedMethod");
+ W->printHex("MethodCount", Method.getNumOverloads());
+ printTypeIndex("MethodListIndex", Method.getMethodList());
+ W->printString("Name", Method.getName());
+ Name = Method.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitDataMember(DataMemberRecord &Field) {
+ DictScope S(*W, "DataMember");
+ printMemberAttributes(Field.getAccess(), MethodKind::Vanilla,
+ MethodOptions::None);
+ printTypeIndex("Type", Field.getType());
+ W->printHex("FieldOffset", Field.getFieldOffset());
+ W->printString("Name", Field.getName());
+ Name = Field.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitStaticDataMember(StaticDataMemberRecord &Field) {
+ DictScope S(*W, "StaticDataMember");
+ printMemberAttributes(Field.getAccess(), MethodKind::Vanilla,
+ MethodOptions::None);
+ printTypeIndex("Type", Field.getType());
+ W->printString("Name", Field.getName());
+ Name = Field.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitVFPtr(VFPtrRecord &VFTable) {
+ DictScope S(*W, "VFPtr");
+ printTypeIndex("Type", VFTable.getType());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitEnumerator(EnumeratorRecord &Enum) {
+ DictScope S(*W, "Enumerator");
+ printMemberAttributes(Enum.getAccess(), MethodKind::Vanilla,
+ MethodOptions::None);
+ W->printNumber("EnumValue", Enum.getValue());
+ W->printString("Name", Enum.getName());
+ Name = Enum.getName();
+ return Error::success();
+}
+
+Error CVTypeDumper::visitBaseClass(BaseClassRecord &Base) {
+ DictScope S(*W, "BaseClass");
+ printMemberAttributes(Base.getAccess(), MethodKind::Vanilla,
+ MethodOptions::None);
+ printTypeIndex("BaseType", Base.getBaseType());
+ W->printHex("BaseOffset", Base.getBaseOffset());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitVirtualBaseClass(VirtualBaseClassRecord &Base) {
+ DictScope S(*W, "VirtualBaseClass");
+ printMemberAttributes(Base.getAccess(), MethodKind::Vanilla,
+ MethodOptions::None);
+ printTypeIndex("BaseType", Base.getBaseType());
+ printTypeIndex("VBPtrType", Base.getVBPtrType());
+ W->printHex("VBPtrOffset", Base.getVBPtrOffset());
+ W->printHex("VBTableIndex", Base.getVTableIndex());
+ return Error::success();
+}
+
+Error CVTypeDumper::visitListContinuation(ListContinuationRecord &Cont) {
+ DictScope S(*W, "ListContinuation");
+ printTypeIndex("ContinuationIndex", Cont.getContinuationIndex());
+ return Error::success();
+}
+
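+// Resolve a type index to a display name: simple indices (below 0x1000) are
+// looked up in SimpleTypeNames, while user-defined indices select the entry
+// that visitTypeEnd recorded in CVUDTNames for that record.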
+StringRef CVTypeDumper::getTypeName(TypeIndex TI) {
+ if (TI.isNoneType())
+ return "<no type>";
+
+ if (TI.isSimple()) {
+ // This is a simple type.
+ for (const auto &SimpleTypeName : SimpleTypeNames) {
+ if (SimpleTypeName.Value == TI.getSimpleKind()) {
+ if (TI.getSimpleMode() == SimpleTypeMode::Direct)
+ return SimpleTypeName.Name.drop_back(1);
+ // Otherwise, this is a pointer type. We gloss over the distinction
+ // between near, far, 64, 32, etc., and just give a pointer type.
+ return SimpleTypeName.Name;
+ }
+ }
+ return "<unknown simple type>";
+ }
+
+ // User-defined type.
+ unsigned UDTIndex = TI.getIndex() - 0x1000;
+ if (UDTIndex < CVUDTNames.size())
+ return CVUDTNames[UDTIndex];
+
+ return "<unknown UDT>";
+}
+
+void CVTypeDumper::printTypeIndex(StringRef FieldName, TypeIndex TI) {
+ StringRef TypeName;
+ if (!TI.isNoneType())
+ TypeName = getTypeName(TI);
+ if (!TypeName.empty())
+ W->printHex(FieldName, TypeName, TI.getIndex());
+ else
+ W->printHex(FieldName, TI.getIndex());
+}
+
+Error CVTypeDumper::dump(const CVRecord<TypeLeafKind> &Record) {
+ assert(W && "printer should not be null");
+ CVTypeVisitor Visitor(*this);
+
+ if (auto EC = Visitor.visitTypeRecord(Record))
+ return EC;
+ return Error::success();
+}
+
+Error CVTypeDumper::dump(const CVTypeArray &Types) {
+ assert(W && "printer should not be null");
+ CVTypeVisitor Visitor(*this);
+ if (auto EC = Visitor.visitTypeStream(Types))
+ return EC;
+ return Error::success();
+}
+
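+/// Parse \p Data as a raw stream of CVType records and dump each record.
+/// Illustrative call (a sketch; it assumes a dumper `CVTD` was already
+/// constructed with a non-null ScopedPrinter):
+/// \code
+///   if (auto EC = CVTD.dump(TypeSectionContents))
+///     return EC;
+/// \endcode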
+Error CVTypeDumper::dump(ArrayRef<uint8_t> Data) {
+ ByteStream<> Stream(Data);
+ CVTypeArray Types;
+ StreamReader Reader(Stream);
+ if (auto EC = Reader.readArray(Types, Reader.getLength()))
+ return EC;
+
+ return dump(Types);
+}
+
+void CVTypeDumper::setPrinter(ScopedPrinter *P) {
+ static ScopedPrinter NullP(llvm::nulls());
+ W = P ? P : &NullP;
+}
diff --git a/lib/DebugInfo/CodeView/TypeRecord.cpp b/lib/DebugInfo/CodeView/TypeRecord.cpp
new file mode 100644
index 000000000000..f63371e8c14f
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeRecord.cpp
@@ -0,0 +1,572 @@
+//===-- TypeRecord.cpp ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+//===----------------------------------------------------------------------===//
+// Type record deserialization
+//===----------------------------------------------------------------------===//
+
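+// Each deserialize() below consumes a fixed-layout header from the front of
+// Data (via consumeObject or CV_DESERIALIZE) followed by any trailing
+// variable-length fields such as encoded numerics, names, or index arrays.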
+ErrorOr<MemberPointerInfo>
+MemberPointerInfo::deserialize(ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+
+ TypeIndex T = L->ClassType;
+ uint16_t R = L->Representation;
+ PointerToMemberRepresentation PMR =
+ static_cast<PointerToMemberRepresentation>(R);
+ return MemberPointerInfo(T, PMR);
+}
+
+ErrorOr<ModifierRecord> ModifierRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+
+ TypeIndex M = L->ModifiedType;
+ uint16_t O = L->Modifiers;
+ ModifierOptions MO = static_cast<ModifierOptions>(O);
+ return ModifierRecord(M, MO);
+}
+
+ErrorOr<ProcedureRecord> ProcedureRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+ return ProcedureRecord(L->ReturnType, L->CallConv, L->Options,
+ L->NumParameters, L->ArgListType);
+}
+
+ErrorOr<MemberFunctionRecord>
+MemberFunctionRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ CV_DESERIALIZE(Data, L);
+ return MemberFunctionRecord(L->ReturnType, L->ClassType, L->ThisType,
+ L->CallConv, L->Options, L->NumParameters,
+ L->ArgListType, L->ThisAdjustment);
+}
+
+ErrorOr<MemberFuncIdRecord>
+MemberFuncIdRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return MemberFuncIdRecord(L->ClassType, L->FunctionType, Name);
+}
+
+ErrorOr<ArgListRecord> ArgListRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ if (Kind != TypeRecordKind::StringList && Kind != TypeRecordKind::ArgList)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+
+ const Layout *L = nullptr;
+ ArrayRef<TypeIndex> Indices;
+ CV_DESERIALIZE(Data, L, CV_ARRAY_FIELD_N(Indices, L->NumArgs));
+ return ArgListRecord(Kind, Indices);
+}
+
+ErrorOr<PointerRecord> PointerRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+
+ PointerKind PtrKind = L->getPtrKind();
+ PointerMode Mode = L->getPtrMode();
+ uint32_t Opts = L->Attrs;
+ PointerOptions Options = static_cast<PointerOptions>(Opts);
+ uint8_t Size = L->getPtrSize();
+
+ if (L->isPointerToMember()) {
+ auto E = MemberPointerInfo::deserialize(Data);
+ if (E.getError())
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ return PointerRecord(L->PointeeType, PtrKind, Mode, Options, Size, *E);
+ }
+
+ return PointerRecord(L->PointeeType, PtrKind, Mode, Options, Size);
+}
+
+ErrorOr<NestedTypeRecord>
+NestedTypeRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return NestedTypeRecord(L->Type, Name);
+}
+
+ErrorOr<ArrayRecord> ArrayRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ uint64_t Size;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Size), Name);
+ return ArrayRecord(L->ElementType, L->IndexType, Size, Name);
+}
+
+ErrorOr<ClassRecord> ClassRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ uint64_t Size = 0;
+ StringRef Name;
+ StringRef UniqueName;
+ uint16_t Props;
+ const Layout *L = nullptr;
+
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Size), Name,
+ CV_CONDITIONAL_FIELD(UniqueName, L->hasUniqueName()));
+
+ Props = L->Properties;
+ uint16_t WrtValue = (Props & WinRTKindMask) >> WinRTKindShift;
+ WindowsRTClassKind WRT = static_cast<WindowsRTClassKind>(WrtValue);
+ uint16_t HfaMask = (Props & HfaKindMask) >> HfaKindShift;
+ HfaKind Hfa = static_cast<HfaKind>(HfaMask);
+
+ ClassOptions Options = static_cast<ClassOptions>(Props);
+ return ClassRecord(Kind, L->MemberCount, Options, Hfa, WRT, L->FieldList,
+ L->DerivedFrom, L->VShape, Size, Name, UniqueName);
+}
+
+ErrorOr<UnionRecord> UnionRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ uint64_t Size = 0;
+ StringRef Name;
+ StringRef UniqueName;
+ uint16_t Props;
+
+ const Layout *L = nullptr;
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Size), Name,
+ CV_CONDITIONAL_FIELD(UniqueName, L->hasUniqueName()));
+
+ Props = L->Properties;
+
+ uint16_t HfaMask = (Props & HfaKindMask) >> HfaKindShift;
+ HfaKind Hfa = static_cast<HfaKind>(HfaMask);
+ ClassOptions Options = static_cast<ClassOptions>(Props);
+ return UnionRecord(L->MemberCount, Options, Hfa, L->FieldList, Size, Name,
+ UniqueName);
+}
+
+ErrorOr<EnumRecord> EnumRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ StringRef UniqueName;
+ CV_DESERIALIZE(Data, L, Name,
+ CV_CONDITIONAL_FIELD(UniqueName, L->hasUniqueName()));
+
+ uint16_t P = L->Properties;
+ ClassOptions Options = static_cast<ClassOptions>(P);
+ return EnumRecord(L->NumEnumerators, Options, L->FieldListType, Name,
+ UniqueName, L->UnderlyingType);
+}
+
+ErrorOr<BitFieldRecord> BitFieldRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ CV_DESERIALIZE(Data, L);
+ return BitFieldRecord(L->Type, L->BitSize, L->BitOffset);
+}
+
+ErrorOr<VFTableShapeRecord>
+VFTableShapeRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+
+ std::vector<VFTableSlotKind> Slots;
+ uint16_t Count = L->VFEntryCount;
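+ // Each byte packs up to two 4-bit VFTableSlotKind values, low nibble first.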
+ while (Count > 0) {
+ if (Data.empty())
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+
+ // Process up to 2 nibbles at a time (if there are at least 2 remaining)
+ uint8_t Value = Data[0] & 0x0F;
+ Slots.push_back(static_cast<VFTableSlotKind>(Value));
+ if (--Count > 0) {
+ Value = (Data[0] & 0xF0) >> 4;
+ Slots.push_back(static_cast<VFTableSlotKind>(Value));
+ --Count;
+ }
+ Data = Data.slice(1);
+ }
+
+ return VFTableShapeRecord(Slots);
+}
+
+ErrorOr<TypeServer2Record>
+TypeServer2Record::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return TypeServer2Record(StringRef(L->Guid, 16), L->Age, Name);
+}
+
+ErrorOr<StringIdRecord> StringIdRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return StringIdRecord(L->id, Name);
+}
+
+ErrorOr<FuncIdRecord> FuncIdRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return FuncIdRecord(L->ParentScope, L->FunctionType, Name);
+}
+
+ErrorOr<UdtSourceLineRecord>
+UdtSourceLineRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ CV_DESERIALIZE(Data, L);
+ return UdtSourceLineRecord(L->UDT, L->SourceFile, L->LineNumber);
+}
+
+ErrorOr<BuildInfoRecord> BuildInfoRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ ArrayRef<TypeIndex> Indices;
+ CV_DESERIALIZE(Data, L, CV_ARRAY_FIELD_N(Indices, L->NumArgs));
+ return BuildInfoRecord(Indices);
+}
+
+ErrorOr<VFTableRecord> VFTableRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ std::vector<StringRef> Names;
+ CV_DESERIALIZE(Data, L, Name, CV_ARRAY_FIELD_TAIL(Names));
+ return VFTableRecord(L->CompleteClass, L->OverriddenVFTable, L->VFPtrOffset,
+ Name, Names);
+}
+
+ErrorOr<OneMethodRecord> OneMethodRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ int32_t VFTableOffset = -1;
+
+ CV_DESERIALIZE(Data, L, CV_CONDITIONAL_FIELD(VFTableOffset,
+ L->Attrs.isIntroducedVirtual()),
+ Name);
+
+ MethodOptions Options = L->Attrs.getFlags();
+ MethodKind MethKind = L->Attrs.getMethodKind();
+ MemberAccess Access = L->Attrs.getAccess();
+ OneMethodRecord Method(L->Type, MethKind, Options, Access, VFTableOffset,
+ Name);
+ // Validate the vftable offset.
+ if (Method.isIntroducingVirtual() && Method.getVFTableOffset() < 0)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ return Method;
+}
+
+ErrorOr<MethodOverloadListRecord>
+MethodOverloadListRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ std::vector<OneMethodRecord> Methods;
+ while (!Data.empty()) {
+ const Layout *L = nullptr;
+ int32_t VFTableOffset = -1;
+ CV_DESERIALIZE(Data, L, CV_CONDITIONAL_FIELD(
+ VFTableOffset, L->Attrs.isIntroducedVirtual()));
+
+ MethodOptions Options = L->Attrs.getFlags();
+ MethodKind MethKind = L->Attrs.getMethodKind();
+ MemberAccess Access = L->Attrs.getAccess();
+
+ Methods.emplace_back(L->Type, MethKind, Options, Access, VFTableOffset,
+ StringRef());
+
+ // Validate the vftable offset.
+ auto &Method = Methods.back();
+ if (Method.isIntroducingVirtual() && Method.getVFTableOffset() < 0)
+ return std::make_error_code(std::errc::illegal_byte_sequence);
+ }
+ return MethodOverloadListRecord(Methods);
+}
+
+ErrorOr<OverloadedMethodRecord>
+OverloadedMethodRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return OverloadedMethodRecord(L->MethodCount, L->MethList, Name);
+}
+
+ErrorOr<DataMemberRecord>
+DataMemberRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ uint64_t Offset;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Offset), Name);
+ return DataMemberRecord(L->Attrs.getAccess(), L->Type, Offset, Name);
+}
+
+ErrorOr<StaticDataMemberRecord>
+StaticDataMemberRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Name);
+ return StaticDataMemberRecord(L->Attrs.getAccess(), L->Type, Name);
+}
+
+ErrorOr<EnumeratorRecord>
+EnumeratorRecord::deserialize(TypeRecordKind Kind, ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ APSInt Value;
+ StringRef Name;
+ CV_DESERIALIZE(Data, L, Value, Name);
+ return EnumeratorRecord(L->Attrs.getAccess(), Value, Name);
+}
+
+ErrorOr<VFPtrRecord> VFPtrRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ if (auto EC = consumeObject(Data, L))
+ return EC;
+ return VFPtrRecord(L->Type);
+}
+
+ErrorOr<BaseClassRecord> BaseClassRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ uint64_t Offset;
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Offset));
+ return BaseClassRecord(L->Attrs.getAccess(), L->BaseType, Offset);
+}
+
+ErrorOr<VirtualBaseClassRecord>
+VirtualBaseClassRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ uint64_t Offset;
+ uint64_t Index;
+ CV_DESERIALIZE(Data, L, CV_NUMERIC_FIELD(Offset), CV_NUMERIC_FIELD(Index));
+ return VirtualBaseClassRecord(L->Attrs.getAccess(), L->BaseType, L->VBPtrType,
+ Offset, Index);
+}
+
+ErrorOr<ListContinuationRecord>
+ListContinuationRecord::deserialize(TypeRecordKind Kind,
+ ArrayRef<uint8_t> &Data) {
+ const Layout *L = nullptr;
+ CV_DESERIALIZE(Data, L);
+ return ListContinuationRecord(L->ContinuationIndex);
+}
+
+//===----------------------------------------------------------------------===//
+// Type index remapping
+//===----------------------------------------------------------------------===//
+
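+// Rewrite Idx from the source stream's numbering into the destination
+// stream's numbering; an out-of-range index is replaced with NotTranslated
+// and reported back to the caller as a failure.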
+static bool remapIndex(ArrayRef<TypeIndex> IndexMap, TypeIndex &Idx) {
+ // Simple types are unchanged.
+ if (Idx.isSimple())
+ return true;
+ unsigned MapPos = Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
+ if (MapPos < IndexMap.size()) {
+ Idx = IndexMap[MapPos];
+ return true;
+ }
+
+ // This type index is invalid. Remap this to "not translated by cvpack",
+ // and return failure.
+ Idx = TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct);
+ return false;
+}
+
+bool ModifierRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, ModifiedType);
+}
+
+bool ProcedureRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ReturnType);
+ Success &= remapIndex(IndexMap, ArgumentList);
+ return Success;
+}
+
+bool MemberFunctionRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ReturnType);
+ Success &= remapIndex(IndexMap, ClassType);
+ Success &= remapIndex(IndexMap, ThisType);
+ Success &= remapIndex(IndexMap, ArgumentList);
+ return Success;
+}
+
+bool MemberFuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ClassType);
+ Success &= remapIndex(IndexMap, FunctionType);
+ return Success;
+}
+
+bool ArgListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ for (TypeIndex &Str : StringIndices)
+ Success &= remapIndex(IndexMap, Str);
+ return Success;
+}
+
+bool MemberPointerInfo::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, ContainingType);
+}
+
+bool PointerRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ReferentType);
+ if (isPointerToMember())
+ Success &= MemberInfo.remapTypeIndices(IndexMap);
+ return Success;
+}
+
+bool NestedTypeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool ArrayRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ElementType);
+ Success &= remapIndex(IndexMap, IndexType);
+ return Success;
+}
+
+bool TagRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, FieldList);
+}
+
+bool ClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= TagRecord::remapTypeIndices(IndexMap);
+ Success &= remapIndex(IndexMap, DerivationList);
+ Success &= remapIndex(IndexMap, VTableShape);
+ return Success;
+}
+
+bool EnumRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= TagRecord::remapTypeIndices(IndexMap);
+ Success &= remapIndex(IndexMap, UnderlyingType);
+ return Success;
+}
+
+bool BitFieldRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool VFTableShapeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return true;
+}
+
+bool TypeServer2Record::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return true;
+}
+
+bool StringIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Id);
+}
+
+bool FuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, ParentScope);
+ Success &= remapIndex(IndexMap, FunctionType);
+ return Success;
+}
+
+bool UdtSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, UDT);
+ Success &= remapIndex(IndexMap, SourceFile);
+ return Success;
+}
+
+bool UdtModSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, UDT);
+ Success &= remapIndex(IndexMap, SourceFile);
+ return Success;
+}
+
+bool BuildInfoRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ for (TypeIndex &Arg : ArgIndices)
+ Success &= remapIndex(IndexMap, Arg);
+ return Success;
+}
+
+bool VFTableRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, CompleteClass);
+ Success &= remapIndex(IndexMap, OverriddenVFTable);
+ return Success;
+}
+
+bool OneMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool MethodOverloadListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ for (OneMethodRecord &Meth : Methods)
+ Success &= Meth.remapTypeIndices(IndexMap);
+ return Success;
+}
+
+bool OverloadedMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, MethodList);
+}
+
+bool DataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool StaticDataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool EnumeratorRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return true;
+}
+
+bool VFPtrRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool BaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, Type);
+}
+
+bool VirtualBaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ bool Success = true;
+ Success &= remapIndex(IndexMap, BaseType);
+ Success &= remapIndex(IndexMap, VBPtrType);
+ return Success;
+}
+
+bool ListContinuationRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
+ return remapIndex(IndexMap, ContinuationIndex);
+}
diff --git a/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp b/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp
index cbf464fd7668..112612cc85ea 100644
--- a/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp
+++ b/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp
@@ -12,8 +12,8 @@
using namespace llvm;
using namespace codeview;
-TypeRecordBuilder::TypeRecordBuilder(TypeRecordKind Kind) : Stream(Buffer),
- Writer(Stream) {
+TypeRecordBuilder::TypeRecordBuilder(TypeRecordKind Kind)
+ : Stream(Buffer), Writer(Stream) {
writeTypeRecordKind(Kind);
}
@@ -60,50 +60,50 @@ void TypeRecordBuilder::writeEncodedInteger(int64_t Value) {
void TypeRecordBuilder::writeEncodedSignedInteger(int64_t Value) {
if (Value >= std::numeric_limits<int8_t>::min() &&
Value <= std::numeric_limits<int8_t>::max()) {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::SByte));
+ writeUInt16(LF_CHAR);
writeInt16(static_cast<int8_t>(Value));
} else if (Value >= std::numeric_limits<int16_t>::min() &&
Value <= std::numeric_limits<int16_t>::max()) {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int16));
+ writeUInt16(LF_SHORT);
writeInt16(static_cast<int16_t>(Value));
} else if (Value >= std::numeric_limits<int32_t>::min() &&
Value <= std::numeric_limits<int32_t>::max()) {
- writeUInt16(static_cast<uint32_t>(TypeRecordKind::Int32));
+ writeUInt16(LF_LONG);
writeInt32(static_cast<int32_t>(Value));
} else {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int64));
+ writeUInt16(LF_QUADWORD);
writeInt64(Value);
}
}
void TypeRecordBuilder::writeEncodedUnsignedInteger(uint64_t Value) {
- if (Value < static_cast<uint16_t>(TypeRecordKind::SByte)) {
+ if (Value < LF_CHAR) {
writeUInt16(static_cast<uint16_t>(Value));
} else if (Value <= std::numeric_limits<uint16_t>::max()) {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt16));
+ writeUInt16(LF_USHORT);
writeUInt16(static_cast<uint16_t>(Value));
} else if (Value <= std::numeric_limits<uint32_t>::max()) {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt32));
+ writeUInt16(LF_ULONG);
writeUInt32(static_cast<uint32_t>(Value));
} else {
- writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt64));
+ writeUInt16(LF_UQUADWORD);
writeUInt64(Value);
}
}
-void TypeRecordBuilder::writeNullTerminatedString(const char *Value) {
- assert(Value != nullptr);
-
- size_t Length = strlen(Value);
- Stream.write(Value, Length);
- writeUInt8(0);
-}
-
void TypeRecordBuilder::writeNullTerminatedString(StringRef Value) {
+ // Microsoft's linker seems to have trouble with symbol names longer than
+ // 0xffd8 bytes.
+ Value = Value.substr(0, 0xffd8);
Stream.write(Value.data(), Value.size());
writeUInt8(0);
}
+void TypeRecordBuilder::writeGuid(StringRef Guid) {
+ assert(Guid.size() == 16);
+ Stream.write(Guid.data(), 16);
+}
+
void TypeRecordBuilder::writeTypeIndex(TypeIndex TypeInd) {
writeUInt32(TypeInd.getIndex());
}
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
new file mode 100644
index 000000000000..ebfda2462be1
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -0,0 +1,149 @@
+//===-- TypeStreamMerger.cpp ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h"
+#include "llvm/DebugInfo/CodeView/StreamRef.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+
+/// Implementation of CodeView type stream merging.
+///
+/// A CodeView type stream is a series of records that reference each other
+/// through type indices. A type index is either "simple", meaning it is less
+/// than 0x1000 and refers to a builtin type, or it is complex, meaning it
+/// refers to a prior type record in the current stream. The type index of a
+/// record is equal to the number of records before it in the stream plus
+/// 0x1000.
+///
+/// Type records are only allowed to use type indices smaller than their own, so
+/// a type stream is effectively a topologically sorted DAG. Cycles occurring in
+/// the type graph of the source program are resolved with forward declarations
+/// of composite types. This class implements the following type stream merging
+/// algorithm, which relies on this DAG structure:
+///
+/// - Begin with a new empty stream, and a new empty hash table that maps from
+/// type record contents to new type index.
+/// - For each new type stream, maintain a map from source type index to
+/// destination type index.
+/// - For each record, copy it and rewrite its type indices to be valid in the
+/// destination type stream.
+/// - If the new type record is not already present in the destination stream
+/// hash table, append it to the destination type stream, assign it the next
+/// type index, and update the two hash tables.
+/// - If the type record already exists in the destination stream, discard it
+/// and update the type index map to forward the source type index to the
+/// existing destination type index.
+class TypeStreamMerger : public TypeVisitorCallbacks {
+public:
+ TypeStreamMerger(TypeTableBuilder &DestStream) : DestStream(DestStream) {
+ assert(!hadError());
+ }
+
+/// TypeVisitorCallbacks overrides.
+#define TYPE_RECORD(EnumName, EnumVal, Name) \
+ Error visit##Name(Name##Record &Record) override;
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ TYPE_RECORD(EnumName, EnumVal, Name)
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+
+ Error visitUnknownType(const CVRecord<TypeLeafKind> &Record) override;
+
+ Error visitTypeBegin(const CVRecord<TypeLeafKind> &Record) override;
+ Error visitTypeEnd(const CVRecord<TypeLeafKind> &Record) override;
+
+ Error visitFieldListEnd(const CVRecord<TypeLeafKind> &Record) override;
+
+ bool mergeStream(const CVTypeArray &Types);
+
+private:
+ bool hadError() { return FoundBadTypeIndex; }
+
+ bool FoundBadTypeIndex = false;
+
+ FieldListRecordBuilder FieldBuilder;
+
+ TypeTableBuilder &DestStream;
+
+ size_t BeginIndexMapSize = 0;
+
+ /// Map from source type index to destination type index. Indexed by source
+ /// type index minus 0x1000.
+ SmallVector<TypeIndex, 0> IndexMap;
+};
+
+} // end anonymous namespace
+
+Error TypeStreamMerger::visitTypeBegin(const CVRecord<TypeLeafKind> &Rec) {
+ BeginIndexMapSize = IndexMap.size();
+ return Error::success();
+}
+
+Error TypeStreamMerger::visitTypeEnd(const CVRecord<TypeLeafKind> &Rec) {
+ assert(IndexMap.size() == BeginIndexMapSize + 1);
+ return Error::success();
+}
+
+Error TypeStreamMerger::visitFieldListEnd(const CVRecord<TypeLeafKind> &Rec) {
+ IndexMap.push_back(DestStream.writeFieldList(FieldBuilder));
+ FieldBuilder.reset();
+ return Error::success();
+}
+
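+// Generate one visitor per record kind: type records are remapped and written
+// directly to DestStream, while member records are remapped and accumulated in
+// FieldBuilder until visitFieldListEnd emits the complete field list.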
+#define TYPE_RECORD(EnumName, EnumVal, Name) \
+ Error TypeStreamMerger::visit##Name(Name##Record &Record) { \
+ FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap); \
+ IndexMap.push_back(DestStream.write##Name(Record)); \
+ return Error::success(); \
+ }
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ Error TypeStreamMerger::visit##Name(Name##Record &Record) { \
+ FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap); \
+ FieldBuilder.write##Name(Record); \
+ return Error::success(); \
+ }
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+
+Error TypeStreamMerger::visitUnknownType(const CVRecord<TypeLeafKind> &Rec) {
+ // We failed to translate a type. Translate this index as "not translated".
+ IndexMap.push_back(
+ TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct));
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+}
+
+bool TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
+ assert(IndexMap.empty());
+ CVTypeVisitor Visitor(*this);
+ if (auto EC = Visitor.visitTypeStream(Types)) {
+ consumeError(std::move(EC));
+ return false;
+ }
+ IndexMap.clear();
+ return !hadError();
+}
+
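+/// Merge \p Types into \p DestStream, rewriting type indices as described in
+/// the class comment above. Returns false if the stream could not be visited
+/// or if any record referenced an unmappable type index. Illustrative usage
+/// (a sketch; producing `Types` from an object file is not shown):
+/// \code
+///   TypeTableBuilder DestStream;
+///   if (!mergeTypeStreams(DestStream, Types))
+///     errs() << "corrupt or out-of-order type records\n";
+/// \endcode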
+bool llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestStream,
+ const CVTypeArray &Types) {
+ return TypeStreamMerger(DestStream).mergeStream(Types);
+}
diff --git a/lib/DebugInfo/CodeView/TypeTableBuilder.cpp b/lib/DebugInfo/CodeView/TypeTableBuilder.cpp
index 4af5dcaf7228..647538ee8ceb 100644
--- a/lib/DebugInfo/CodeView/TypeTableBuilder.cpp
+++ b/lib/DebugInfo/CodeView/TypeTableBuilder.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h"
#include "llvm/DebugInfo/CodeView/MethodListRecordBuilder.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
@@ -18,42 +17,21 @@
using namespace llvm;
using namespace codeview;
-namespace {
-
-const int PointerKindShift = 0;
-const int PointerModeShift = 5;
-const int PointerSizeShift = 13;
-
-const int ClassHfaKindShift = 11;
-const int ClassWindowsRTClassKindShift = 14;
-
-void writePointerBase(TypeRecordBuilder &Builder,
- const PointerRecordBase &Record) {
- Builder.writeTypeIndex(Record.getReferentType());
- uint32_t flags =
- static_cast<uint32_t>(Record.getOptions()) |
- (Record.getSize() << PointerSizeShift) |
- (static_cast<uint32_t>(Record.getMode()) << PointerModeShift) |
- (static_cast<uint32_t>(Record.getPointerKind()) << PointerKindShift);
- Builder.writeUInt32(flags);
-}
-}
-
TypeTableBuilder::TypeTableBuilder() {}
TypeTableBuilder::~TypeTableBuilder() {}
TypeIndex TypeTableBuilder::writeModifier(const ModifierRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Modifier);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeTypeIndex(Record.getModifiedType());
- Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions()));
+ Builder.writeUInt16(static_cast<uint16_t>(Record.getModifiers()));
return writeRecord(Builder);
}
TypeIndex TypeTableBuilder::writeProcedure(const ProcedureRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Procedure);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeTypeIndex(Record.getReturnType());
Builder.writeUInt8(static_cast<uint8_t>(Record.getCallConv()));
@@ -66,7 +44,7 @@ TypeIndex TypeTableBuilder::writeProcedure(const ProcedureRecord &Record) {
TypeIndex
TypeTableBuilder::writeMemberFunction(const MemberFunctionRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::MemberFunction);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeTypeIndex(Record.getReturnType());
Builder.writeTypeIndex(Record.getClassType());
@@ -80,12 +58,11 @@ TypeTableBuilder::writeMemberFunction(const MemberFunctionRecord &Record) {
return writeRecord(Builder);
}
-TypeIndex
-TypeTableBuilder::writeArgumentList(const ArgumentListRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::ArgumentList);
+TypeIndex TypeTableBuilder::writeArgList(const ArgListRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
- Builder.writeUInt32(Record.getArgumentTypes().size());
- for (TypeIndex TI : Record.getArgumentTypes()) {
+ Builder.writeUInt32(Record.getIndices().size());
+ for (TypeIndex TI : Record.getIndices()) {
Builder.writeTypeIndex(TI);
}
@@ -93,27 +70,28 @@ TypeTableBuilder::writeArgumentList(const ArgumentListRecord &Record) {
}
TypeIndex TypeTableBuilder::writePointer(const PointerRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Pointer);
-
- writePointerBase(Builder, Record);
-
- return writeRecord(Builder);
-}
-
-TypeIndex
-TypeTableBuilder::writePointerToMember(const PointerToMemberRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Pointer);
+ TypeRecordBuilder Builder(Record.getKind());
- writePointerBase(Builder, Record);
+ Builder.writeTypeIndex(Record.getReferentType());
+ uint32_t flags = static_cast<uint32_t>(Record.getOptions()) |
+ (Record.getSize() << PointerRecord::PointerSizeShift) |
+ (static_cast<uint32_t>(Record.getMode())
+ << PointerRecord::PointerModeShift) |
+ (static_cast<uint32_t>(Record.getPointerKind())
+ << PointerRecord::PointerKindShift);
+ Builder.writeUInt32(flags);
- Builder.writeTypeIndex(Record.getContainingType());
- Builder.writeUInt16(static_cast<uint16_t>(Record.getRepresentation()));
+ if (Record.isPointerToMember()) {
+ const MemberPointerInfo &M = Record.getMemberInfo();
+ Builder.writeTypeIndex(M.getContainingType());
+ Builder.writeUInt16(static_cast<uint16_t>(M.getRepresentation()));
+ }
return writeRecord(Builder);
}
TypeIndex TypeTableBuilder::writeArray(const ArrayRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Array);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeTypeIndex(Record.getElementType());
Builder.writeTypeIndex(Record.getIndexType());
@@ -123,28 +101,23 @@ TypeIndex TypeTableBuilder::writeArray(const ArrayRecord &Record) {
return writeRecord(Builder);
}
-TypeIndex TypeTableBuilder::writeAggregate(const AggregateRecord &Record) {
- assert((Record.getKind() == TypeRecordKind::Structure) ||
+TypeIndex TypeTableBuilder::writeClass(const ClassRecord &Record) {
+ assert((Record.getKind() == TypeRecordKind::Struct) ||
(Record.getKind() == TypeRecordKind::Class) ||
- (Record.getKind() == TypeRecordKind::Union));
+ (Record.getKind() == TypeRecordKind::Interface));
TypeRecordBuilder Builder(Record.getKind());
Builder.writeUInt16(Record.getMemberCount());
uint16_t Flags =
static_cast<uint16_t>(Record.getOptions()) |
- (static_cast<uint16_t>(Record.getHfa()) << ClassHfaKindShift) |
+ (static_cast<uint16_t>(Record.getHfa()) << ClassRecord::HfaKindShift) |
(static_cast<uint16_t>(Record.getWinRTKind())
- << ClassWindowsRTClassKindShift);
+ << ClassRecord::WinRTKindShift);
Builder.writeUInt16(Flags);
Builder.writeTypeIndex(Record.getFieldList());
- if (Record.getKind() != TypeRecordKind::Union) {
- Builder.writeTypeIndex(Record.getDerivationList());
- Builder.writeTypeIndex(Record.getVTableShape());
- } else {
- assert(Record.getDerivationList() == TypeIndex());
- assert(Record.getVTableShape() == TypeIndex());
- }
+ Builder.writeTypeIndex(Record.getDerivationList());
+ Builder.writeTypeIndex(Record.getVTableShape());
Builder.writeEncodedUnsignedInteger(Record.getSize());
Builder.writeNullTerminatedString(Record.getName());
if ((Record.getOptions() & ClassOptions::HasUniqueName) !=
@@ -155,8 +128,25 @@ TypeIndex TypeTableBuilder::writeAggregate(const AggregateRecord &Record) {
return writeRecord(Builder);
}
+TypeIndex TypeTableBuilder::writeUnion(const UnionRecord &Record) {
+ TypeRecordBuilder Builder(TypeRecordKind::Union);
+ Builder.writeUInt16(Record.getMemberCount());
+ uint16_t Flags =
+ static_cast<uint16_t>(Record.getOptions()) |
+ (static_cast<uint16_t>(Record.getHfa()) << ClassRecord::HfaKindShift);
+ Builder.writeUInt16(Flags);
+ Builder.writeTypeIndex(Record.getFieldList());
+ Builder.writeEncodedUnsignedInteger(Record.getSize());
+ Builder.writeNullTerminatedString(Record.getName());
+ if ((Record.getOptions() & ClassOptions::HasUniqueName) !=
+ ClassOptions::None) {
+ Builder.writeNullTerminatedString(Record.getUniqueName());
+ }
+ return writeRecord(Builder);
+}
+
TypeIndex TypeTableBuilder::writeEnum(const EnumRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::Enum);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeUInt16(Record.getMemberCount());
Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions()));
@@ -172,7 +162,7 @@ TypeIndex TypeTableBuilder::writeEnum(const EnumRecord &Record) {
}
TypeIndex TypeTableBuilder::writeBitField(const BitFieldRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::BitField);
+ TypeRecordBuilder Builder(Record.getKind());
Builder.writeTypeIndex(Record.getType());
Builder.writeUInt8(Record.getBitSize());
@@ -181,11 +171,11 @@ TypeIndex TypeTableBuilder::writeBitField(const BitFieldRecord &Record) {
return writeRecord(Builder);
}
-TypeIndex TypeTableBuilder::writeVirtualTableShape(
- const VirtualTableShapeRecord &Record) {
- TypeRecordBuilder Builder(TypeRecordKind::VirtualTableShape);
+TypeIndex
+TypeTableBuilder::writeVFTableShape(const VFTableShapeRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
- ArrayRef<VirtualTableSlotKind> Slots = Record.getSlots();
+ ArrayRef<VFTableSlotKind> Slots = Record.getSlots();
Builder.writeUInt16(Slots.size());
for (size_t SlotIndex = 0; SlotIndex < Slots.size(); SlotIndex += 2) {
@@ -199,19 +189,115 @@ TypeIndex TypeTableBuilder::writeVirtualTableShape(
return writeRecord(Builder);
}
+TypeIndex
+TypeTableBuilder::writeVFTable(const VFTableRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeTypeIndex(Record.getCompleteClass());
+ Builder.writeTypeIndex(Record.getOverriddenVTable());
+ Builder.writeUInt32(Record.getVFPtrOffset());
+
+ // Sum up the lengths of the null-terminated names.
+ size_t NamesLen = Record.getName().size() + 1;
+ for (StringRef MethodName : Record.getMethodNames())
+ NamesLen += MethodName.size() + 1;
+
+ Builder.writeUInt32(NamesLen);
+ Builder.writeNullTerminatedString(Record.getName());
+ for (StringRef MethodName : Record.getMethodNames())
+ Builder.writeNullTerminatedString(MethodName);
+
+ return writeRecord(Builder);
+}
+
+TypeIndex TypeTableBuilder::writeStringId(const StringIdRecord &Record) {
+ TypeRecordBuilder Builder(TypeRecordKind::StringId);
+ Builder.writeTypeIndex(Record.getId());
+ Builder.writeNullTerminatedString(Record.getString());
+ return writeRecord(Builder);
+}
+
+TypeIndex
+TypeTableBuilder::writeUdtSourceLine(const UdtSourceLineRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeTypeIndex(Record.getUDT());
+ Builder.writeTypeIndex(Record.getSourceFile());
+ Builder.writeUInt32(Record.getLineNumber());
+ return writeRecord(Builder);
+}
+
+TypeIndex
+TypeTableBuilder::writeUdtModSourceLine(const UdtModSourceLineRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeTypeIndex(Record.getUDT());
+ Builder.writeTypeIndex(Record.getSourceFile());
+ Builder.writeUInt32(Record.getLineNumber());
+ Builder.writeUInt16(Record.getModule());
+ return writeRecord(Builder);
+}
+
+TypeIndex TypeTableBuilder::writeFuncId(const FuncIdRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeTypeIndex(Record.getParentScope());
+ Builder.writeTypeIndex(Record.getFunctionType());
+ Builder.writeNullTerminatedString(Record.getName());
+ return writeRecord(Builder);
+}
+
+TypeIndex
+TypeTableBuilder::writeMemberFuncId(const MemberFuncIdRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeTypeIndex(Record.getClassType());
+ Builder.writeTypeIndex(Record.getFunctionType());
+ Builder.writeNullTerminatedString(Record.getName());
+ return writeRecord(Builder);
+}
+
+TypeIndex
+TypeTableBuilder::writeBuildInfo(const BuildInfoRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ assert(Record.getArgs().size() <= UINT16_MAX);
+ Builder.writeUInt16(Record.getArgs().size());
+ for (TypeIndex Arg : Record.getArgs())
+ Builder.writeTypeIndex(Arg);
+ return writeRecord(Builder);
+}
+
TypeIndex TypeTableBuilder::writeRecord(TypeRecordBuilder &Builder) {
return writeRecord(Builder.str());
}
TypeIndex TypeTableBuilder::writeFieldList(FieldListRecordBuilder &FieldList) {
- // TODO: Split the list into multiple records if it's longer than 64KB, using
- // a subrecord of TypeRecordKind::Index to chain the records together.
- return writeRecord(FieldList.str());
+ return FieldList.writeListRecord(*this);
}
-TypeIndex
-TypeTableBuilder::writeMethodList(MethodListRecordBuilder &MethodList) {
+TypeIndex TypeTableBuilder::writeMethodOverloadList(
+ const MethodOverloadListRecord &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ for (const OneMethodRecord &Method : Record.getMethods()) {
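+ // Pack the access, method kind, and method options into one attribute word.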
+ uint16_t Flags = static_cast<uint16_t>(Method.getAccess());
+ Flags |= static_cast<uint16_t>(Method.getKind())
+ << MemberAttributes::MethodKindShift;
+ Flags |= static_cast<uint16_t>(Method.getOptions());
+ Builder.writeUInt16(Flags);
+ Builder.writeUInt16(0); // padding
+ Builder.writeTypeIndex(Method.getType());
+ if (Method.isIntroducingVirtual()) {
+ assert(Method.getVFTableOffset() >= 0);
+ Builder.writeInt32(Method.getVFTableOffset());
+ } else {
+ assert(Method.getVFTableOffset() == -1);
+ }
+ }
+
// TODO: Split the list into multiple records if it's longer than 64KB, using
// a subrecord of TypeRecordKind::Index to chain the records together.
- return writeRecord(MethodList.str());
+ return writeRecord(Builder);
+}
+
+TypeIndex TypeTableBuilder::writeTypeServer2(const TypeServer2Record &Record) {
+ TypeRecordBuilder Builder(Record.getKind());
+ Builder.writeGuid(Record.getGuid());
+ Builder.writeUInt32(Record.getAge());
+ Builder.writeNullTerminatedString(Record.getName());
+ return writeRecord(Builder);
}
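
As a standalone illustration of the name-blob accounting that writeVFTable performs above (each name contributes its length plus one byte for the terminating NUL, and the total length is written before the names themselves), here is a minimal C++ sketch. The byte-buffer helper is a hypothetical stand-in for TypeRecordBuilder, not the real API.

#include <cstdint>
#include <string>
#include <vector>

// Hypothetical stand-in for TypeRecordBuilder: append a string plus its NUL.
static void writeNullTerminated(std::vector<uint8_t> &Buf, const std::string &S) {
  Buf.insert(Buf.end(), S.begin(), S.end());
  Buf.push_back('\0');
}

int main() {
  std::string VFTableName = "??_7Foo@@6B@";
  std::vector<std::string> MethodNames = {"Foo::f", "Foo::g"};

  // Same accounting as writeVFTable: each name contributes its length plus
  // one byte for the terminating NUL.
  uint32_t NamesLen = static_cast<uint32_t>(VFTableName.size() + 1);
  for (const std::string &M : MethodNames)
    NamesLen += static_cast<uint32_t>(M.size() + 1);

  std::vector<uint8_t> Buf;
  // The record stores the total length first (little-endian here), then the
  // names back to back.
  for (int i = 0; i < 4; ++i)
    Buf.push_back(static_cast<uint8_t>(NamesLen >> (8 * i)));
  writeNullTerminated(Buf, VFTableName);
  for (const std::string &M : MethodNames)
    writeNullTerminated(Buf, M);

  return Buf.size() == 4 + NamesLen ? 0 : 1;
}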
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index a4195b75c47d..e8ea71b325ae 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -13,8 +13,11 @@
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/RelocVisitor.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
@@ -72,7 +75,7 @@ static void dumpAccelSection(raw_ostream &OS, StringRef Name,
Accel.dump(OS);
}
-void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
+void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH) {
if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
OS << ".debug_abbrev contents:\n";
getDebugAbbrev()->dump(OS);
@@ -125,6 +128,10 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
OS << "\n.debug_frame contents:\n";
getDebugFrame()->dump(OS);
+ if (DumpEH) {
+ OS << "\n.eh_frame contents:\n";
+ getEHFrame()->dump(OS);
+ }
}
if (DumpType == DIDT_All || DumpType == DIDT_Macro) {
@@ -355,7 +362,18 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
// http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(),
getAddressSize());
- DebugFrame.reset(new DWARFDebugFrame());
+ DebugFrame.reset(new DWARFDebugFrame(false /* IsEH */));
+ DebugFrame->parse(debugFrameData);
+ return DebugFrame.get();
+}
+
+const DWARFDebugFrame *DWARFContext::getEHFrame() {
+ if (EHFrame)
+ return EHFrame.get();
+
+ DataExtractor debugFrameData(getEHFrameSection(), isLittleEndian(),
+ getAddressSize());
+ DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */));
DebugFrame->parse(debugFrameData);
return DebugFrame.get();
}
@@ -575,8 +593,8 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
return InliningInfo;
}
-static bool consumeCompressedDebugSectionHeader(StringRef &data,
- uint64_t &OriginalSize) {
+static bool consumeCompressedGnuHeader(StringRef &data,
+ uint64_t &OriginalSize) {
// Consume "ZLIB" prefix.
if (!data.startswith("ZLIB"))
return false;
@@ -591,6 +609,50 @@ static bool consumeCompressedDebugSectionHeader(StringRef &data,
return true;
}
+static bool consumeCompressedZLibHeader(StringRef &Data, uint64_t &OriginalSize,
+ bool IsLE, bool Is64Bit) {
+ using namespace ELF;
+ uint64_t HdrSize = Is64Bit ? sizeof(Elf64_Chdr) : sizeof(Elf32_Chdr);
+ if (Data.size() < HdrSize)
+ return false;
+
+ DataExtractor Extractor(Data, IsLE, 0);
+ uint32_t Offset = 0;
+ if (Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Word)
+ : sizeof(Elf32_Word)) !=
+ ELFCOMPRESS_ZLIB)
+ return false;
+
+ // Skip Elf64_Chdr::ch_reserved field.
+ if (Is64Bit)
+ Offset += sizeof(Elf64_Word);
+
+ OriginalSize = Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Xword)
+ : sizeof(Elf32_Word));
+ Data = Data.substr(HdrSize);
+ return true;
+}
+
+static bool tryDecompress(StringRef &Name, StringRef &Data,
+ SmallString<32> &Out, bool ZLibStyle, bool IsLE,
+ bool Is64Bit) {
+ if (!zlib::isAvailable())
+ return false;
+
+ uint64_t OriginalSize;
+ bool Result =
+ ZLibStyle ? consumeCompressedZLibHeader(Data, OriginalSize, IsLE, Is64Bit)
+ : consumeCompressedGnuHeader(Data, OriginalSize);
+
+ if (!Result || zlib::uncompress(Data, Out, OriginalSize) != zlib::StatusOK)
+ return false;
+
+ // GNU-style section names start with "z"; consume that prefix.
+ if (!ZLibStyle)
+ Name = Name.substr(1);
+ return true;
+}
+
DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
const LoadedObjectInfo *L)
: IsLittleEndian(Obj.isLittleEndian()),
@@ -616,20 +678,13 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes.
- // Check if debug info section is compressed with zlib.
- if (name.startswith("zdebug_")) {
- uint64_t OriginalSize;
- if (!zlib::isAvailable() ||
- !consumeCompressedDebugSectionHeader(data, OriginalSize))
- continue;
- UncompressedSections.resize(UncompressedSections.size() + 1);
- if (zlib::uncompress(data, UncompressedSections.back(), OriginalSize) !=
- zlib::StatusOK) {
- UncompressedSections.pop_back();
+ bool ZLibStyleCompressed = Section.isCompressed();
+ if (ZLibStyleCompressed || name.startswith("zdebug_")) {
+ SmallString<32> Out;
+ if (!tryDecompress(name, data, Out, ZLibStyleCompressed, IsLittleEndian,
+ AddressSize == 8))
continue;
- }
- // Make data point to uncompressed section contents and save its contents.
- name = name.substr(1);
+ UncompressedSections.emplace_back(std::move(Out));
data = UncompressedSections.back();
}
@@ -641,6 +696,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
.Case("debug_line", &LineSection.Data)
.Case("debug_aranges", &ARangeSection)
.Case("debug_frame", &DebugFrameSection)
+ .Case("eh_frame", &EHFrameSection)
.Case("debug_str", &StringSection)
.Case("debug_ranges", &RangeSection)
.Case("debug_macinfo", &MacinfoSection)
@@ -739,15 +795,29 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
- ErrorOr<uint64_t> SymAddrOrErr = Sym->getAddress();
- if (std::error_code EC = SymAddrOrErr.getError()) {
+ Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
+ if (!SymAddrOrErr) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(SymAddrOrErr.takeError(), OS, "");
+ OS.flush();
errs() << "error: failed to compute symbol address: "
- << EC.message() << '\n';
+ << Buf << '\n';
continue;
}
SymAddr = *SymAddrOrErr;
// Also remember what section this symbol is in for later
- RSec = *Sym->getSection();
+ auto SectOrErr = Sym->getSection();
+ if (!SectOrErr) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(SectOrErr.takeError(), OS, "");
+ OS.flush();
+ errs() << "error: failed to get symbol section: "
+ << Buf << '\n';
+ continue;
+ }
+ RSec = *SectOrErr;
} else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
// MachO also has relocations that point to sections and
// scattered relocations.
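
For reference, the GNU-style header that consumeCompressedGnuHeader strips from .zdebug_* sections is the 4-byte "ZLIB" magic followed by a 64-bit big-endian uncompressed size. A minimal standalone sketch of that check, assuming only the standard library (the function name and the fabricated section contents are illustrative):

#include <cstdint>
#include <string>

// Returns true and sets OriginalSize if Data starts with a GNU-style
// "ZLIB" + 64-bit big-endian size header, then advances Data past it.
static bool consumeGnuHeader(std::string &Data, uint64_t &OriginalSize) {
  if (Data.size() < 12 || Data.compare(0, 4, "ZLIB") != 0)
    return false;
  OriginalSize = 0;
  for (int i = 4; i < 12; ++i) // the size is stored big-endian
    OriginalSize = (OriginalSize << 8) | static_cast<uint8_t>(Data[i]);
  Data.erase(0, 12);
  return true;
}

int main() {
  std::string Section("ZLIB", 4);
  Section.append(7, '\0');
  Section.push_back('\x05'); // uncompressed size = 5, big-endian
  Section += "zlib-compressed bytes would follow here";

  uint64_t OriginalSize = 0;
  if (consumeGnuHeader(Section, OriginalSize))
    return OriginalSize == 5 ? 0 : 1;
  return 1;
}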
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 1aa31be71fee..9b6a9a788230 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -10,7 +10,9 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Dwarf.h"
@@ -18,6 +20,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
+#include <utility>
#include <vector>
using namespace llvm;
@@ -160,18 +163,26 @@ void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset,
case DW_CFA_offset_extended:
case DW_CFA_register:
case DW_CFA_def_cfa:
- case DW_CFA_val_offset:
+ case DW_CFA_val_offset: {
// Operands: ULEB128, ULEB128
- addInstruction(Opcode, Data.getULEB128(Offset),
- Data.getULEB128(Offset));
+ // Note: We cannot embed the getULEB128 calls directly in the
+ // argument list: getULEB128 modifies Offset, and the order in which
+ // function arguments are evaluated is unspecified.
+ auto op1 = Data.getULEB128(Offset);
+ auto op2 = Data.getULEB128(Offset);
+ addInstruction(Opcode, op1, op2);
break;
+ }
case DW_CFA_offset_extended_sf:
case DW_CFA_def_cfa_sf:
- case DW_CFA_val_offset_sf:
+ case DW_CFA_val_offset_sf: {
// Operands: ULEB128, SLEB128
- addInstruction(Opcode, Data.getULEB128(Offset),
- Data.getSLEB128(Offset));
+ // Note: see comment for the previous case
+ auto op1 = Data.getULEB128(Offset);
+ auto op2 = (uint64_t)Data.getSLEB128(Offset);
+ addInstruction(Opcode, op1, op2);
break;
+ }
case DW_CFA_def_cfa_expression:
case DW_CFA_expression:
case DW_CFA_val_expression:
@@ -191,19 +202,30 @@ public:
CIE(uint64_t Offset, uint64_t Length, uint8_t Version,
SmallString<8> Augmentation, uint8_t AddressSize,
uint8_t SegmentDescriptorSize, uint64_t CodeAlignmentFactor,
- int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
+ int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister,
+ SmallString<8> AugmentationData, uint32_t FDEPointerEncoding,
+ uint32_t LSDAPointerEncoding)
: FrameEntry(FK_CIE, Offset, Length), Version(Version),
- Augmentation(std::move(Augmentation)),
- AddressSize(AddressSize),
+ Augmentation(std::move(Augmentation)), AddressSize(AddressSize),
SegmentDescriptorSize(SegmentDescriptorSize),
CodeAlignmentFactor(CodeAlignmentFactor),
DataAlignmentFactor(DataAlignmentFactor),
- ReturnAddressRegister(ReturnAddressRegister) {}
+ ReturnAddressRegister(ReturnAddressRegister),
+ AugmentationData(std::move(AugmentationData)),
+ FDEPointerEncoding(FDEPointerEncoding),
+ LSDAPointerEncoding(LSDAPointerEncoding) {}
~CIE() override {}
+ StringRef getAugmentationString() const { return Augmentation; }
uint64_t getCodeAlignmentFactor() const { return CodeAlignmentFactor; }
int64_t getDataAlignmentFactor() const { return DataAlignmentFactor; }
+ uint32_t getFDEPointerEncoding() const {
+ return FDEPointerEncoding;
+ }
+ uint32_t getLSDAPointerEncoding() const {
+ return LSDAPointerEncoding;
+ }
void dumpHeader(raw_ostream &OS) const override {
OS << format("%08x %08x %08x CIE",
@@ -223,6 +245,12 @@ public:
(int32_t)DataAlignmentFactor);
OS << format(" Return address column: %d\n",
(int32_t)ReturnAddressRegister);
+ if (!AugmentationData.empty()) {
+ OS << " Augmentation data: ";
+ for (uint8_t Byte : AugmentationData)
+ OS << ' ' << hexdigit(Byte >> 4) << hexdigit(Byte & 0xf);
+ OS << "\n";
+ }
OS << "\n";
}
@@ -239,6 +267,11 @@ private:
uint64_t CodeAlignmentFactor;
int64_t DataAlignmentFactor;
uint64_t ReturnAddressRegister;
+
+ // The following are used when the CIE represents an EH frame entry.
+ SmallString<8> AugmentationData;
+ uint32_t FDEPointerEncoding;
+ uint32_t LSDAPointerEncoding;
};
@@ -423,7 +456,7 @@ void FrameEntry::dumpInstructions(raw_ostream &OS) const {
}
}
-DWARFDebugFrame::DWARFDebugFrame() {
+DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {
}
DWARFDebugFrame::~DWARFDebugFrame() {
@@ -439,6 +472,39 @@ static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
errs() << "\n";
}
+static unsigned getSizeForEncoding(const DataExtractor &Data,
+ unsigned symbolEncoding) {
+ unsigned format = symbolEncoding & 0x0f;
+ switch (format) {
+ default: llvm_unreachable("Unknown Encoding");
+ case dwarf::DW_EH_PE_absptr:
+ case dwarf::DW_EH_PE_signed:
+ return Data.getAddressSize();
+ case dwarf::DW_EH_PE_udata2:
+ case dwarf::DW_EH_PE_sdata2:
+ return 2;
+ case dwarf::DW_EH_PE_udata4:
+ case dwarf::DW_EH_PE_sdata4:
+ return 4;
+ case dwarf::DW_EH_PE_udata8:
+ case dwarf::DW_EH_PE_sdata8:
+ return 8;
+ }
+}
+
+static uint64_t readPointer(const DataExtractor &Data, uint32_t &Offset,
+ unsigned Encoding) {
+ switch (getSizeForEncoding(Data, Encoding)) {
+ case 2:
+ return Data.getU16(&Offset);
+ case 4:
+ return Data.getU32(&Offset);
+ case 8:
+ return Data.getU64(&Offset);
+ default:
+ llvm_unreachable("Illegal data size");
+ }
+}
void DWARFDebugFrame::parse(DataExtractor Data) {
uint32_t Offset = 0;
@@ -447,6 +513,14 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
while (Data.isValidOffset(Offset)) {
uint32_t StartOffset = Offset;
+ auto ReportError = [StartOffset](const char *ErrorMsg) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << format(ErrorMsg, StartOffset);
+ OS.flush();
+ report_fatal_error(Str);
+ };
+
bool IsDWARF64 = false;
uint64_t Length = Data.getU32(&Offset);
uint64_t Id;
@@ -465,47 +539,132 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
// read).
// TODO: For honest DWARF64 support, DataExtractor will have to treat
// offset_ptr as uint64_t*
+ uint32_t StartStructureOffset = Offset;
uint32_t EndStructureOffset = Offset + static_cast<uint32_t>(Length);
// The Id field's size depends on the DWARF format
- Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4);
- bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID);
+ Id = Data.getUnsigned(&Offset, (IsDWARF64 && !IsEH) ? 8 : 4);
+ bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) ||
+ Id == DW_CIE_ID ||
+ (IsEH && !Id));
if (IsCIE) {
uint8_t Version = Data.getU8(&Offset);
const char *Augmentation = Data.getCStr(&Offset);
- uint8_t AddressSize = Version < 4 ? Data.getAddressSize() : Data.getU8(&Offset);
+ StringRef AugmentationString(Augmentation ? Augmentation : "");
+ uint8_t AddressSize = Version < 4 ? Data.getAddressSize() :
+ Data.getU8(&Offset);
Data.setAddressSize(AddressSize);
uint8_t SegmentDescriptorSize = Version < 4 ? 0 : Data.getU8(&Offset);
uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
+ // Parse the augmentation data for EH CIEs
+ StringRef AugmentationData("");
+ uint32_t FDEPointerEncoding = DW_EH_PE_omit;
+ uint32_t LSDAPointerEncoding = DW_EH_PE_omit;
+ if (IsEH) {
+ Optional<uint32_t> PersonalityEncoding;
+ Optional<uint64_t> Personality;
+
+ Optional<uint64_t> AugmentationLength;
+ uint32_t StartAugmentationOffset;
+ uint32_t EndAugmentationOffset;
+
+ // Walk the augmentation string to get all the augmentation data.
+ for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) {
+ switch (AugmentationString[i]) {
+ default:
+ ReportError("Unknown augmentation character in entry at %lx");
+ case 'L':
+ LSDAPointerEncoding = Data.getU8(&Offset);
+ break;
+ case 'P': {
+ if (Personality)
+ ReportError("Duplicate personality in entry at %lx");
+ PersonalityEncoding = Data.getU8(&Offset);
+ Personality = readPointer(Data, Offset, *PersonalityEncoding);
+ break;
+ }
+ case 'R':
+ FDEPointerEncoding = Data.getU8(&Offset);
+ break;
+ case 'z':
+ if (i)
+ ReportError("'z' must be the first character at %lx");
+ // Parse the augmentation length first. We only parse it if
+ // the string contains a 'z'.
+ AugmentationLength = Data.getULEB128(&Offset);
+ StartAugmentationOffset = Offset;
+ EndAugmentationOffset = Offset +
+ static_cast<uint32_t>(*AugmentationLength);
+ }
+ }
+
+ if (AugmentationLength.hasValue()) {
+ if (Offset != EndAugmentationOffset)
+ ReportError("Parsing augmentation data at %lx failed");
+
+ AugmentationData = Data.getData().slice(StartAugmentationOffset,
+ EndAugmentationOffset);
+ }
+ }
+
auto Cie = make_unique<CIE>(StartOffset, Length, Version,
- StringRef(Augmentation), AddressSize,
+ AugmentationString, AddressSize,
SegmentDescriptorSize, CodeAlignmentFactor,
- DataAlignmentFactor, ReturnAddressRegister);
+ DataAlignmentFactor, ReturnAddressRegister,
+ AugmentationData, FDEPointerEncoding,
+ LSDAPointerEncoding);
CIEs[StartOffset] = Cie.get();
Entries.emplace_back(std::move(Cie));
} else {
// FDE
uint64_t CIEPointer = Id;
- uint64_t InitialLocation = Data.getAddress(&Offset);
- uint64_t AddressRange = Data.getAddress(&Offset);
+ uint64_t InitialLocation = 0;
+ uint64_t AddressRange = 0;
+ CIE *Cie = CIEs[IsEH ? (StartStructureOffset - CIEPointer) : CIEPointer];
+
+ if (IsEH) {
+ // The sizes of the fields below are given by the pointer encoding
+ // stored in the CIE we reference.
+ if (!Cie)
+ ReportError("Parsing FDE data at %lx failed due to missing CIE");
+
+ InitialLocation = readPointer(Data, Offset,
+ Cie->getFDEPointerEncoding());
+ AddressRange = readPointer(Data, Offset,
+ Cie->getFDEPointerEncoding());
+
+ StringRef AugmentationString = Cie->getAugmentationString();
+ if (!AugmentationString.empty()) {
+ // Parse the augmentation length and data for this FDE.
+ uint64_t AugmentationLength = Data.getULEB128(&Offset);
+
+ uint32_t EndAugmentationOffset =
+ Offset + static_cast<uint32_t>(AugmentationLength);
+
+ // Decode the LSDA if the CIE augmentation string said we should.
+ if (Cie->getLSDAPointerEncoding() != DW_EH_PE_omit)
+ readPointer(Data, Offset, Cie->getLSDAPointerEncoding());
+
+ if (Offset != EndAugmentationOffset)
+ ReportError("Parsing augmentation data at %lx failed");
+ }
+ } else {
+ InitialLocation = Data.getAddress(&Offset);
+ AddressRange = Data.getAddress(&Offset);
+ }
Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
InitialLocation, AddressRange,
- CIEs[CIEPointer]));
+ Cie));
}
Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
- if (Offset != EndStructureOffset) {
- std::string Str;
- raw_string_ostream OS(Str);
- OS << format("Parsing entry instructions at %lx failed", StartOffset);
- report_fatal_error(Str);
- }
+ if (Offset != EndStructureOffset)
+ ReportError("Parsing entry instructions at %lx failed");
}
}
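
The evaluation-order note added to parseInstructions above is easy to reproduce in isolation. A small standalone sketch, with a hypothetical next() helper standing in for getULEB128:

#include <cstdio>

static int next(int *Counter) { return (*Counter)++; }

static void printPair(int A, int B) { std::printf("%d %d\n", A, B); }

int main() {
  int Counter = 0;
  // The order in which these two calls run is unspecified, so this may
  // print "0 1" or "1 0" depending on the compiler.
  printPair(next(&Counter), next(&Counter));

  // Sequencing the calls into named locals pins the order, which is what
  // the parseInstructions change does with getULEB128/getSLEB128.
  Counter = 0;
  int First = next(&Counter);
  int Second = next(&Counter);
  printPair(First, Second); // always "0 1"
  return 0;
}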
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index a0bee0da1765..30cb83398dd4 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -17,9 +17,7 @@ using namespace llvm;
using namespace dwarf;
typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
-DWARFDebugLine::Prologue::Prologue() {
- clear();
-}
+DWARFDebugLine::Prologue::Prologue() { clear(); }
void DWARFDebugLine::Prologue::clear() {
TotalLength = Version = PrologueLength = 0;
@@ -44,12 +42,12 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
<< format(" opcode_base: %u\n", OpcodeBase);
for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
- OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i+1),
+ OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i + 1),
StandardOpcodeLengths[i]);
if (!IncludeDirectories.empty())
for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
- OS << format("include_directories[%3u] = '", i+1)
+ OS << format("include_directories[%3u] = '", i + 1)
<< IncludeDirectories[i] << "'\n";
if (!FileNames.empty()) {
@@ -57,10 +55,10 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
<< " ---- ---------- ---------- -----------"
"----------------\n";
for (uint32_t i = 0; i < FileNames.size(); ++i) {
- const FileNameEntry& fileEntry = FileNames[i];
- OS << format("file_names[%3u] %4" PRIu64 " ", i+1, fileEntry.DirIdx)
- << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ",
- fileEntry.ModTime, fileEntry.Length)
+ const FileNameEntry &fileEntry = FileNames[i];
+ OS << format("file_names[%3u] %4" PRIu64 " ", i + 1, fileEntry.DirIdx)
+ << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ", fileEntry.ModTime,
+ fileEntry.Length)
<< fileEntry.Name << '\n';
}
}
@@ -82,8 +80,8 @@ bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data,
if (Version < 2)
return false;
- PrologueLength = debug_line_data.getUnsigned(offset_ptr,
- sizeofPrologueLength());
+ PrologueLength =
+ debug_line_data.getUnsigned(offset_ptr, sizeofPrologueLength());
const uint64_t end_prologue_offset = PrologueLength + *offset_ptr;
MinInstLength = debug_line_data.getU8(offset_ptr);
if (Version >= 4)
@@ -131,9 +129,7 @@ bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data,
return true;
}
-DWARFDebugLine::Row::Row(bool default_is_stmt) {
- reset(default_is_stmt);
-}
+DWARFDebugLine::Row::Row(bool default_is_stmt) { reset(default_is_stmt); }
void DWARFDebugLine::Row::postAppend() {
BasicBlock = false;
@@ -158,17 +154,13 @@ void DWARFDebugLine::Row::reset(bool default_is_stmt) {
void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
OS << format("0x%16.16" PRIx64 " %6u %6u", Address, Line, Column)
<< format(" %6u %3u %13u ", File, Isa, Discriminator)
- << (IsStmt ? " is_stmt" : "")
- << (BasicBlock ? " basic_block" : "")
+ << (IsStmt ? " is_stmt" : "") << (BasicBlock ? " basic_block" : "")
<< (PrologueEnd ? " prologue_end" : "")
<< (EpilogueBegin ? " epilogue_begin" : "")
- << (EndSequence ? " end_sequence" : "")
- << '\n';
+ << (EndSequence ? " end_sequence" : "") << '\n';
}
-DWARFDebugLine::Sequence::Sequence() {
- reset();
-}
+DWARFDebugLine::Sequence::Sequence() { reset(); }
void DWARFDebugLine::Sequence::reset() {
LowPC = 0;
@@ -178,9 +170,7 @@ void DWARFDebugLine::Sequence::reset() {
Empty = true;
}
-DWARFDebugLine::LineTable::LineTable() {
- clear();
-}
+DWARFDebugLine::LineTable::LineTable() { clear(); }
void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
Prologue.dump(OS);
@@ -244,7 +234,7 @@ const DWARFDebugLine::LineTable *
DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
uint32_t offset) {
std::pair<LineTableIter, bool> pos =
- LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
+ LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
LineTable *LT = &pos.first->second;
if (pos.second) {
if (!LT->parse(debug_line_data, RelocMap, &offset))
@@ -266,8 +256,8 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
return false;
}
- const uint32_t end_offset = debug_line_offset + Prologue.TotalLength +
- Prologue.sizeofTotalLength();
+ const uint32_t end_offset =
+ debug_line_offset + Prologue.TotalLength + Prologue.sizeofTotalLength();
ParsingState State(this);
@@ -307,9 +297,9 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// If this address is in our relocation map, apply the relocation.
RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
if (AI != RMap->end()) {
- const std::pair<uint8_t, int64_t> &R = AI->second;
- State.Row.Address =
- debug_line_data.getAddress(offset_ptr) + R.second;
+ const std::pair<uint8_t, int64_t> &R = AI->second;
+ State.Row.Address =
+ debug_line_data.getAddress(offset_ptr) + R.second;
} else
State.Row.Address = debug_line_data.getAddress(offset_ptr);
}
@@ -509,6 +499,8 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
State.Row.Line += line_offset;
State.Row.Address += addr_offset;
State.appendRowToMatrix(*offset_ptr);
+ // Reset discriminator to 0.
+ State.Row.Discriminator = 0;
}
}
@@ -566,8 +558,8 @@ uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
sequence.LowPC = address;
SequenceIter first_seq = Sequences.begin();
SequenceIter last_seq = Sequences.end();
- SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
- DWARFDebugLine::Sequence::orderByLowPC);
+ SequenceIter seq_pos = std::lower_bound(
+ first_seq, last_seq, sequence, DWARFDebugLine::Sequence::orderByLowPC);
DWARFDebugLine::Sequence found_seq;
if (seq_pos == last_seq) {
found_seq = Sequences.back();
@@ -591,8 +583,8 @@ bool DWARFDebugLine::LineTable::lookupAddressRange(
sequence.LowPC = address;
SequenceIter first_seq = Sequences.begin();
SequenceIter last_seq = Sequences.end();
- SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
- DWARFDebugLine::Sequence::orderByLowPC);
+ SequenceIter seq_pos = std::lower_bound(
+ first_seq, last_seq, sequence, DWARFDebugLine::Sequence::orderByLowPC);
if (seq_pos == last_seq || seq_pos->LowPC != address) {
if (seq_pos == first_seq)
return false;
@@ -632,11 +624,10 @@ bool DWARFDebugLine::LineTable::lookupAddressRange(
return true;
}
-bool
-DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
- const char *CompDir,
- FileLineInfoKind Kind,
- std::string &Result) const {
+bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+ const char *CompDir,
+ FileLineInfoKind Kind,
+ std::string &Result) const {
if (FileIndex == 0 || FileIndex > Prologue.FileNames.size() ||
Kind == FileLineInfoKind::None)
return false;
@@ -669,11 +660,9 @@ DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
return true;
}
-bool
-DWARFDebugLine::LineTable::getFileLineInfoForAddress(uint64_t Address,
- const char *CompDir,
- FileLineInfoKind Kind,
- DILineInfo &Result) const {
+bool DWARFDebugLine::LineTable::getFileLineInfoForAddress(
+ uint64_t Address, const char *CompDir, FileLineInfoKind Kind,
+ DILineInfo &Result) const {
// Get the index of row we're looking for in the line table.
uint32_t RowIndex = lookupAddress(Address);
if (RowIndex == -1U)
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index cd6fbefd05dd..a7b46b842fee 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index b6555fa6272e..375ff7c99a04 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "SyntaxHighlighting.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
-#include "llvm/Support/Compiler.h"
+#include "SyntaxHighlighting.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 92ca2d4c3ff0..13c2b508bfa3 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -20,7 +20,7 @@ using namespace dwarf;
void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(),
C.getStringSection(), StringRef(), C.getAddrSection(),
- C.getLineSection().Data, C.isLittleEndian());
+ C.getLineSection().Data, C.isLittleEndian(), false);
}
void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
@@ -28,13 +28,14 @@ void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
DWARFUnitIndex *Index) {
parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(),
C.getStringDWOSection(), C.getStringOffsetDWOSection(),
- C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian());
+ C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(),
+ true);
}
DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
StringRef SOS, StringRef AOS, StringRef LS, bool LE,
- const DWARFUnitSectionBase &UnitSection,
+ bool IsDWO, const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *IndexEntry)
: Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
LineSection(LS), StringSection(SS), StringOffsetSection([&]() {
@@ -43,8 +44,8 @@ DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
return SOS.slice(C->Offset, C->Offset + C->Length);
return SOS;
}()),
- AddrOffsetSection(AOS), isLittleEndian(LE), UnitSection(UnitSection),
- IndexEntry(IndexEntry) {
+ AddrOffsetSection(AOS), isLittleEndian(LE), isDWO(IsDWO),
+ UnitSection(UnitSection), IndexEntry(IndexEntry) {
clear();
}
@@ -268,8 +269,11 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
: DWOFile(), DWOContext(), DWOU(nullptr) {
auto Obj = object::ObjectFile::createObjectFile(DWOPath);
- if (!Obj)
+ if (!Obj) {
+ // TODO: Actually report errors helpfully.
+ consumeError(Obj.takeError());
return;
+ }
DWOFile = std::move(Obj.get());
DWOContext.reset(
cast<DWARFContext>(new DWARFContextInMemory(*DWOFile.getBinary())));
@@ -278,6 +282,8 @@ DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
}
bool DWARFUnit::parseDWO() {
+ if (isDWO)
+ return false;
if (DWO.get())
return false;
extractDIEsIfNeeded(true);
@@ -375,19 +381,14 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
// First, find a subprogram that contains the given address (the root
// of inlined chain).
const DWARFUnit *ChainCU = nullptr;
- const DWARFDebugInfoEntryMinimal *SubprogramDIE =
- getSubprogramForAddress(Address);
- if (SubprogramDIE) {
+ const DWARFDebugInfoEntryMinimal *SubprogramDIE;
+ // Try to look for subprogram DIEs in the DWO file.
+ parseDWO();
+ if (DWO) {
+ if ((SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address)))
+ ChainCU = DWO->getUnit();
+ } else if ((SubprogramDIE = getSubprogramForAddress(Address)))
ChainCU = this;
- } else {
- // Try to look for subprogram DIEs in the DWO file.
- parseDWO();
- if (DWO.get()) {
- SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
- if (SubprogramDIE)
- ChainCU = DWO->getUnit();
- }
- }
// Get inlined chain rooted at this subprogram DIE.
if (!SubprogramDIE)
diff --git a/lib/DebugInfo/DWARF/Makefile b/lib/DebugInfo/DWARF/Makefile
deleted file mode 100644
index 863337353d0a..000000000000
--- a/lib/DebugInfo/DWARF/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/DebugInfo/DWARF/Makefile ------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMDebugInfoDWARF
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/DWARF/module.modulemap b/lib/DebugInfo/DWARF/module.modulemap
deleted file mode 100644
index c2f624fd4b6c..000000000000
--- a/lib/DebugInfo/DWARF/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module DebugInfoDWARF { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/DebugInfo/Makefile b/lib/DebugInfo/Makefile
deleted file mode 100644
index 6072af314416..000000000000
--- a/lib/DebugInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/DebugInfo/Makefile ------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../..
-
-include $(LEVEL)/Makefile.config
-
-PARALLEL_DIRS := CodeView DWARF PDB Symbolize
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 1645a95aac36..b5a2bc1600fc 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -17,6 +17,7 @@ if(HAVE_DIA_SDK)
DIA/DIAEnumLineNumbers.cpp
DIA/DIAEnumSourceFiles.cpp
DIA/DIAEnumSymbols.cpp
+ DIA/DIAError.cpp
DIA/DIALineNumber.cpp
DIA/DIARawSymbol.cpp
DIA/DIASession.cpp
@@ -24,12 +25,37 @@ if(HAVE_DIA_SDK)
)
set(LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/DIA")
-
endif()
+add_pdb_impl_folder(Raw
+ Raw/DbiStream.cpp
+ Raw/DbiStreamBuilder.cpp
+ Raw/EnumTables.cpp
+ Raw/Hash.cpp
+ Raw/IndexedStreamData.cpp
+ Raw/InfoStream.cpp
+ Raw/InfoStreamBuilder.cpp
+ Raw/MappedBlockStream.cpp
+ Raw/ModInfo.cpp
+ Raw/ModStream.cpp
+ Raw/MsfBuilder.cpp
+ Raw/MsfCommon.cpp
+ Raw/NameHashTable.cpp
+ Raw/NameMap.cpp
+ Raw/NameMapBuilder.cpp
+ Raw/PDBFile.cpp
+ Raw/PDBFileBuilder.cpp
+ Raw/PublicsStream.cpp
+ Raw/RawError.cpp
+ Raw/RawSession.cpp
+ Raw/SymbolStream.cpp
+ Raw/TpiStream.cpp)
+
+list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB/Raw")
list(APPEND LIBPDB_ADDITIONAL_HEADER_DIRS "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/PDB")
add_llvm_library(LLVMDebugInfoPDB
+ GenericError.cpp
IPDBSourceFile.cpp
PDB.cpp
PDBContext.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIADataStream.cpp b/lib/DebugInfo/PDB/DIA/DIADataStream.cpp
index e0e1b2712467..7eabed8cad48 100644
--- a/lib/DebugInfo/PDB/DIA/DIADataStream.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIADataStream.cpp
@@ -8,9 +8,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/DIA/DIADataStream.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/ConvertUTF.h"
using namespace llvm;
+using namespace llvm::pdb;
DIADataStream::DIADataStream(CComPtr<IDiaEnumDebugStreamData> DiaStreamData)
: StreamData(DiaStreamData) {}
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
index 23c6489c7e18..cae817c1b367 100644
--- a/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
@@ -12,6 +12,7 @@
#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
using namespace llvm;
+using namespace llvm::pdb;
DIAEnumDebugStreams::DIAEnumDebugStreams(
CComPtr<IDiaEnumDebugStreams> DiaEnumerator)
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
index 32a9af214dc0..4741d9c9a849 100644
--- a/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
@@ -12,6 +12,7 @@
#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
using namespace llvm;
+using namespace llvm::pdb;
DIAEnumLineNumbers::DIAEnumLineNumbers(
CComPtr<IDiaEnumLineNumbers> DiaEnumerator)
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
index 1a946100fef2..ccf8c4e622cc 100644
--- a/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
@@ -12,6 +12,7 @@
#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
using namespace llvm;
+using namespace llvm::pdb;
DIAEnumSourceFiles::DIAEnumSourceFiles(
const DIASession &PDBSession, CComPtr<IDiaEnumSourceFiles> DiaEnumerator)
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
index 6754d9a97d70..3c211b569044 100644
--- a/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
@@ -13,6 +13,7 @@
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
using namespace llvm;
+using namespace llvm::pdb;
DIAEnumSymbols::DIAEnumSymbols(const DIASession &PDBSession,
CComPtr<IDiaEnumSymbols> DiaEnumerator)
diff --git a/lib/DebugInfo/PDB/DIA/DIAError.cpp b/lib/DebugInfo/PDB/DIA/DIAError.cpp
new file mode 100644
index 000000000000..1d72a92b5145
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAError.cpp
@@ -0,0 +1,59 @@
+#include "llvm/DebugInfo/PDB/DIA/DIAError.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class DIAErrorCategory : public std::error_category {
+public:
+ const char *name() const LLVM_NOEXCEPT override { return "llvm.pdb.dia"; }
+
+ std::string message(int Condition) const override {
+ switch (static_cast<dia_error_code>(Condition)) {
+ case dia_error_code::could_not_create_impl:
+ return "Failed to connect to DIA at runtime. Verify that Visual Studio "
+ "is properly installed, or that msdiaXX.dll is in your PATH.";
+ case dia_error_code::invalid_file_format:
+ return "Unable to load PDB. The file has an unrecognized format.";
+ case dia_error_code::invalid_parameter:
+ return "The parameter is incorrect.";
+ case dia_error_code::already_loaded:
+ return "Unable to load the PDB or EXE, because it is already loaded.";
+ case dia_error_code::debug_info_mismatch:
+ return "The PDB file and the EXE file do not match.";
+ case dia_error_code::unspecified:
+ return "An unknown error has occurred.";
+ }
+ llvm_unreachable("Unrecognized DIAErrorCode");
+ }
+};
+
+static ManagedStatic<DIAErrorCategory> Category;
+
+char DIAError::ID = 0;
+
+DIAError::DIAError(dia_error_code C) : DIAError(C, "") {}
+
+DIAError::DIAError(const std::string &Context)
+ : DIAError(dia_error_code::unspecified, Context) {}
+
+DIAError::DIAError(dia_error_code C, const std::string &Context) : Code(C) {
+ ErrMsg = "DIA Error: ";
+ std::error_code EC = convertToErrorCode();
+ if (Code != dia_error_code::unspecified)
+ ErrMsg += EC.message() + " ";
+ if (!Context.empty())
+ ErrMsg += Context;
+}
+
+void DIAError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+const std::string &DIAError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code DIAError::convertToErrorCode() const {
+ return std::error_code(static_cast<int>(Code), *Category);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp b/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp
index c5577f15eb8e..b19be6b595ab 100644
--- a/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIALineNumber.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
using namespace llvm;
+using namespace llvm::pdb;
DIALineNumber::DIALineNumber(CComPtr<IDiaLineNumber> DiaLineNumber)
: LineNumber(DiaLineNumber) {}
diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
index abe0ab55e56c..bba5b0f94dca 100644
--- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -7,64 +7,77 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
-#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::pdb;
namespace {
Variant VariantFromVARIANT(const VARIANT &V) {
Variant Result;
switch (V.vt) {
case VT_I1:
- Result.Int8 = V.cVal;
+ Result.Value.Int8 = V.cVal;
Result.Type = PDB_VariantType::Int8;
break;
case VT_I2:
- Result.Int16 = V.iVal;
+ Result.Value.Int16 = V.iVal;
Result.Type = PDB_VariantType::Int16;
break;
case VT_I4:
- Result.Int32 = V.intVal;
+ Result.Value.Int32 = V.intVal;
Result.Type = PDB_VariantType::Int32;
break;
case VT_I8:
- Result.Int64 = V.llVal;
+ Result.Value.Int64 = V.llVal;
Result.Type = PDB_VariantType::Int64;
break;
case VT_UI1:
- Result.UInt8 = V.bVal;
+ Result.Value.UInt8 = V.bVal;
Result.Type = PDB_VariantType::UInt8;
break;
case VT_UI2:
- Result.UInt16 = V.uiVal;
+ Result.Value.UInt16 = V.uiVal;
Result.Type = PDB_VariantType::UInt16;
break;
case VT_UI4:
- Result.UInt32 = V.uintVal;
+ Result.Value.UInt32 = V.uintVal;
Result.Type = PDB_VariantType::UInt32;
break;
case VT_UI8:
- Result.UInt64 = V.ullVal;
+ Result.Value.UInt64 = V.ullVal;
Result.Type = PDB_VariantType::UInt64;
break;
case VT_BOOL:
- Result.Bool = (V.boolVal == VARIANT_TRUE) ? true : false;
+ Result.Value.Bool = (V.boolVal == VARIANT_TRUE) ? true : false;
Result.Type = PDB_VariantType::Bool;
break;
case VT_R4:
- Result.Single = V.fltVal;
+ Result.Value.Single = V.fltVal;
Result.Type = PDB_VariantType::Single;
break;
case VT_R8:
- Result.Double = V.dblVal;
+ Result.Value.Double = V.dblVal;
Result.Type = PDB_VariantType::Double;
break;
+ case VT_BSTR: {
+ const char *SrcBytes = reinterpret_cast<const char *>(V.bstrVal);
+ llvm::ArrayRef<char> SrcByteArray(SrcBytes, SysStringByteLen(V.bstrVal));
+ std::string Result8;
+ if (!llvm::convertUTF16ToUTF8String(SrcByteArray, Result8)) {
+ Result.Value.String = nullptr;
+ break;
+ }
+ Result.Value.String = new char[Result8.length() + 1];
+ ::strcpy(Result.Value.String, Result8.c_str());
+ Result.Type = PDB_VariantType::String;
+ break;
+ }
default:
Result.Type = PDB_VariantType::Unknown;
break;
@@ -521,8 +534,8 @@ uint32_t DIARawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
Symbol, &IDiaSymbol::get_liveRangeStartRelativeVirtualAddress);
}
-PDB_RegisterId DIARawSymbol::getLocalBasePointerRegisterId() const {
- return PrivateGetDIAValue<DWORD, PDB_RegisterId>(
+codeview::RegisterId DIARawSymbol::getLocalBasePointerRegisterId() const {
+ return PrivateGetDIAValue<DWORD, codeview::RegisterId>(
Symbol, &IDiaSymbol::get_localBasePointerRegisterId);
}
@@ -583,9 +596,9 @@ uint32_t DIARawSymbol::getRank() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_rank);
}
-PDB_RegisterId DIARawSymbol::getRegisterId() const {
- return PrivateGetDIAValue<DWORD, PDB_RegisterId>(Symbol,
- &IDiaSymbol::get_registerId);
+codeview::RegisterId DIARawSymbol::getRegisterId() const {
+ return PrivateGetDIAValue<DWORD, codeview::RegisterId>(
+ Symbol, &IDiaSymbol::get_registerId);
}
uint32_t DIARawSymbol::getRegisterType() const {
@@ -738,8 +751,8 @@ PDB_Machine DIARawSymbol::getMachineType() const {
&IDiaSymbol::get_machineType);
}
-PDB_ThunkOrdinal DIARawSymbol::getThunkOrdinal() const {
- return PrivateGetDIAValue<DWORD, PDB_ThunkOrdinal>(
+codeview::ThunkOrdinal DIARawSymbol::getThunkOrdinal() const {
+ return PrivateGetDIAValue<DWORD, codeview::ThunkOrdinal>(
Symbol, &IDiaSymbol::get_thunkOrdinal);
}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 99fe750ebac6..fa224af8cb87 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -6,107 +6,125 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAError.h"
#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
-#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASupport.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/PDB.h"
#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/Support/ConvertUTF.h"
using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+
+Error ErrorFromHResult(HRESULT Result) {
+ switch (Result) {
+ case E_PDB_NOT_FOUND:
+ return make_error<GenericError>(generic_error_code::invalid_path);
+ case E_PDB_FORMAT:
+ return make_error<DIAError>(dia_error_code::invalid_file_format);
+ case E_INVALIDARG:
+ return make_error<DIAError>(dia_error_code::invalid_parameter);
+ case E_UNEXPECTED:
+ return make_error<DIAError>(dia_error_code::already_loaded);
+ case E_PDB_INVALID_SIG:
+ case E_PDB_INVALID_AGE:
+ return make_error<DIAError>(dia_error_code::debug_info_mismatch);
+ default:
+ return make_error<DIAError>(dia_error_code::unspecified);
+ }
+}
-namespace {}
+Error LoadDIA(CComPtr<IDiaDataSource> &DiaDataSource) {
+ if (SUCCEEDED(CoCreateInstance(CLSID_DiaSource, nullptr, CLSCTX_INPROC_SERVER,
+ IID_IDiaDataSource,
+ reinterpret_cast<LPVOID *>(&DiaDataSource))))
+ return Error::success();
+
+// If the CoCreateInstance call above failed, msdia*.dll is not registered.
+// Try loading the DLL corresponding to the #included DIA SDK.
+#if !defined(_MSC_VER)
+ return llvm::make_error<GenericError>(
+ "DIA is only supported when using MSVC.");
+#endif
+
+ const wchar_t *msdia_dll = nullptr;
+#if _MSC_VER == 1900
+ msdia_dll = L"msdia140.dll"; // VS2015
+#elif _MSC_VER == 1800
+ msdia_dll = L"msdia120.dll"; // VS2013
+#else
+#error "Unknown Visual Studio version."
+#endif
+
+ HRESULT HR;
+ if (FAILED(HR = NoRegCoCreate(msdia_dll, CLSID_DiaSource, IID_IDiaDataSource,
+ reinterpret_cast<LPVOID *>(&DiaDataSource))))
+ return ErrorFromHResult(HR);
+ return Error::success();
+}
+
+}
DIASession::DIASession(CComPtr<IDiaSession> DiaSession) : Session(DiaSession) {}
-PDB_ErrorCode DIASession::createFromPdb(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
+Error DIASession::createFromPdb(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
CComPtr<IDiaDataSource> DiaDataSource;
CComPtr<IDiaSession> DiaSession;
// We assume that CoInitializeEx has already been called by the executable.
- HRESULT Result = ::CoCreateInstance(
- CLSID_DiaSource, nullptr, CLSCTX_INPROC_SERVER, IID_IDiaDataSource,
- reinterpret_cast<LPVOID *>(&DiaDataSource));
- if (FAILED(Result))
- return PDB_ErrorCode::NoPdbImpl;
+ if (auto E = LoadDIA(DiaDataSource))
+ return E;
llvm::SmallVector<UTF16, 128> Path16;
if (!llvm::convertUTF8ToUTF16String(Path, Path16))
- return PDB_ErrorCode::InvalidPath;
+ return make_error<GenericError>(generic_error_code::invalid_path);
const wchar_t *Path16Str = reinterpret_cast<const wchar_t*>(Path16.data());
- if (FAILED(Result = DiaDataSource->loadDataFromPdb(Path16Str))) {
- if (Result == E_PDB_NOT_FOUND)
- return PDB_ErrorCode::InvalidPath;
- else if (Result == E_PDB_FORMAT)
- return PDB_ErrorCode::InvalidFileFormat;
- else if (Result == E_INVALIDARG)
- return PDB_ErrorCode::InvalidParameter;
- else if (Result == E_UNEXPECTED)
- return PDB_ErrorCode::AlreadyLoaded;
- else
- return PDB_ErrorCode::UnknownError;
- }
+ HRESULT HR;
+ if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str)))
+ return ErrorFromHResult(HR);
- if (FAILED(Result = DiaDataSource->openSession(&DiaSession))) {
- if (Result == E_OUTOFMEMORY)
- return PDB_ErrorCode::NoMemory;
- else
- return PDB_ErrorCode::UnknownError;
- }
+ if (FAILED(HR = DiaDataSource->openSession(&DiaSession)))
+ return ErrorFromHResult(HR);
Session.reset(new DIASession(DiaSession));
- return PDB_ErrorCode::Success;
+ return Error::success();
}
-PDB_ErrorCode DIASession::createFromExe(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
+Error DIASession::createFromExe(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
CComPtr<IDiaDataSource> DiaDataSource;
CComPtr<IDiaSession> DiaSession;
// We assume that CoInitializeEx has already been called by the executable.
- HRESULT Result = ::CoCreateInstance(
- CLSID_DiaSource, nullptr, CLSCTX_INPROC_SERVER, IID_IDiaDataSource,
- reinterpret_cast<LPVOID *>(&DiaDataSource));
- if (FAILED(Result))
- return PDB_ErrorCode::NoPdbImpl;
+ if (auto EC = LoadDIA(DiaDataSource))
+ return EC;
llvm::SmallVector<UTF16, 128> Path16;
if (!llvm::convertUTF8ToUTF16String(Path, Path16))
- return PDB_ErrorCode::InvalidPath;
+ return make_error<GenericError>(generic_error_code::invalid_path, Path);
const wchar_t *Path16Str = reinterpret_cast<const wchar_t *>(Path16.data());
- if (FAILED(Result =
- DiaDataSource->loadDataForExe(Path16Str, nullptr, nullptr))) {
- if (Result == E_PDB_NOT_FOUND)
- return PDB_ErrorCode::InvalidPath;
- else if (Result == E_PDB_FORMAT)
- return PDB_ErrorCode::InvalidFileFormat;
- else if (Result == E_PDB_INVALID_SIG || Result == E_PDB_INVALID_AGE)
- return PDB_ErrorCode::DebugInfoMismatch;
- else if (Result == E_INVALIDARG)
- return PDB_ErrorCode::InvalidParameter;
- else if (Result == E_UNEXPECTED)
- return PDB_ErrorCode::AlreadyLoaded;
- else
- return PDB_ErrorCode::UnknownError;
- }
+ HRESULT HR;
+ if (FAILED(HR = DiaDataSource->loadDataForExe(Path16Str, nullptr, nullptr)))
+ return ErrorFromHResult(HR);
- if (FAILED(Result = DiaDataSource->openSession(&DiaSession))) {
- if (Result == E_OUTOFMEMORY)
- return PDB_ErrorCode::NoMemory;
- else
- return PDB_ErrorCode::UnknownError;
- }
+ if (FAILED(HR = DiaDataSource->openSession(&DiaSession)))
+ return ErrorFromHResult(HR);
Session.reset(new DIASession(DiaSession));
- return PDB_ErrorCode::Success;
+ return Error::success();
}
uint64_t DIASession::getLoadAddress() const {
@@ -158,6 +176,22 @@ DIASession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
}
std::unique_ptr<IPDBEnumLineNumbers>
+DIASession::findLineNumbers(const PDBSymbolCompiland &Compiland,
+ const IPDBSourceFile &File) const {
+ const DIARawSymbol &RawCompiland =
+ static_cast<const DIARawSymbol &>(Compiland.getRawSymbol());
+ const DIASourceFile &RawFile = static_cast<const DIASourceFile &>(File);
+
+ CComPtr<IDiaEnumLineNumbers> LineNumbers;
+ if (S_OK !=
+ Session->findLines(RawCompiland.getDiaSymbol(), RawFile.getDiaFile(),
+ &LineNumbers))
+ return nullptr;
+
+ return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
DIASession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
CComPtr<IDiaEnumLineNumbers> LineNumbers;
if (S_OK != Session->findLinesByVA(Address, Length, &LineNumbers))
@@ -166,6 +200,56 @@ DIASession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
return llvm::make_unique<DIAEnumLineNumbers>(LineNumbers);
}
+std::unique_ptr<IPDBEnumSourceFiles>
+DIASession::findSourceFiles(const PDBSymbolCompiland *Compiland,
+ llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ IDiaSymbol *DiaCompiland = nullptr;
+ CComBSTR Utf16Pattern;
+ if (!Pattern.empty())
+ Utf16Pattern = CComBSTR(Pattern.data());
+
+ if (Compiland)
+ DiaCompiland = static_cast<const DIARawSymbol &>(Compiland->getRawSymbol())
+ .getDiaSymbol();
+
+ Flags = static_cast<PDB_NameSearchFlags>(
+ Flags | PDB_NameSearchFlags::NS_FileNameExtMatch);
+ CComPtr<IDiaEnumSourceFiles> SourceFiles;
+ if (S_OK !=
+ Session->findFile(DiaCompiland, Utf16Pattern.m_str, Flags, &SourceFiles))
+ return nullptr;
+ return llvm::make_unique<DIAEnumSourceFiles>(*this, SourceFiles);
+}
+
+std::unique_ptr<IPDBSourceFile>
+DIASession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
+ llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ auto SourceFiles = findSourceFiles(Compiland, Pattern, Flags);
+ if (!SourceFiles || SourceFiles->getChildCount() == 0)
+ return nullptr;
+ return SourceFiles->getNext();
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+DIASession::findCompilandsForSourceFile(llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ auto File = findOneSourceFile(nullptr, Pattern, Flags);
+ if (!File)
+ return nullptr;
+ return File->getCompilands();
+}
+
+std::unique_ptr<PDBSymbolCompiland>
+DIASession::findOneCompilandForSourceFile(llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ auto Compilands = findCompilandsForSourceFile(Pattern, Flags);
+ if (!Compilands || Compilands->getChildCount() == 0)
+ return nullptr;
+ return Compilands->getNext();
+}
+
std::unique_ptr<IPDBEnumSourceFiles> DIASession::getAllSourceFiles() const {
CComPtr<IDiaEnumSourceFiles> Files;
if (S_OK != Session->findFile(nullptr, nullptr, nsNone, &Files))
diff --git a/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp b/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp
index 0a9c444f5e6e..8605f55b402c 100644
--- a/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASourceFile.cpp
@@ -7,12 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
-#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
#include "llvm/Support/ConvertUTF.h"
using namespace llvm;
+using namespace llvm::pdb;
DIASourceFile::DIASourceFile(const DIASession &PDBSession,
CComPtr<IDiaSourceFile> DiaSourceFile)
@@ -56,12 +60,15 @@ PDB_Checksum DIASourceFile::getChecksumType() const {
return static_cast<PDB_Checksum>(Type);
}
-std::unique_ptr<IPDBEnumSymbols> DIASourceFile::getCompilands() const {
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+DIASourceFile::getCompilands() const {
CComPtr<IDiaEnumSymbols> DiaEnumerator;
HRESULT Result = SourceFile->get_compilands(&DiaEnumerator);
if (S_OK != Result)
return nullptr;
- return std::unique_ptr<IPDBEnumSymbols>(
+ auto Enumerator = std::unique_ptr<IPDBEnumSymbols>(
new DIAEnumSymbols(Session, DiaEnumerator));
+ return std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>(
+ new ConcreteSymbolEnumerator<PDBSymbolCompiland>(std::move(Enumerator)));
}
diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp
new file mode 100644
index 000000000000..34e179998021
--- /dev/null
+++ b/lib/DebugInfo/PDB/GenericError.cpp
@@ -0,0 +1,67 @@
+//===- GenericError.cpp - system_error extensions for PDB -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class GenericErrorCategory : public std::error_category {
+public:
+ const char *name() const LLVM_NOEXCEPT override { return "llvm.pdb"; }
+
+ std::string message(int Condition) const override {
+ switch (static_cast<generic_error_code>(Condition)) {
+ case generic_error_code::unspecified:
+ return "An unknown error has occurred.";
+ case generic_error_code::dia_sdk_not_present:
+ return "LLVM was not compiled with support for DIA. This usually means "
+ "that you are are not using MSVC, or your Visual Studio "
+ "installation "
+ "is corrupt.";
+ case generic_error_code::invalid_path:
+ return "Unable to load PDB. Make sure the file exists and is readable.";
+ }
+ llvm_unreachable("Unrecognized generic_error_code");
+ }
+};
+} // end anonymous namespace
+
+static ManagedStatic<GenericErrorCategory> Category;
+
+char GenericError::ID = 0;
+
+GenericError::GenericError(generic_error_code C) : GenericError(C, "") {}
+
+GenericError::GenericError(const std::string &Context)
+ : GenericError(generic_error_code::unspecified, Context) {}
+
+GenericError::GenericError(generic_error_code C, const std::string &Context)
+ : Code(C) {
+ ErrMsg = "PDB Error: ";
+ std::error_code EC = convertToErrorCode();
+ if (Code != generic_error_code::unspecified)
+ ErrMsg += EC.message() + " ";
+ if (!Context.empty())
+ ErrMsg += Context;
+}
+
+void GenericError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+const std::string &GenericError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code GenericError::convertToErrorCode() const {
+ return std::error_code(static_cast<int>(Code), *Category);
+}
diff --git a/lib/DebugInfo/PDB/IPDBSourceFile.cpp b/lib/DebugInfo/PDB/IPDBSourceFile.cpp
index 3abe59dba623..46b422f5a76a 100644
--- a/lib/DebugInfo/PDB/IPDBSourceFile.cpp
+++ b/lib/DebugInfo/PDB/IPDBSourceFile.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::pdb;
IPDBSourceFile::~IPDBSourceFile() {}
diff --git a/lib/DebugInfo/PDB/LLVMBuild.txt b/lib/DebugInfo/PDB/LLVMBuild.txt
index 9ee9f4067e99..76e537a57fc1 100644
--- a/lib/DebugInfo/PDB/LLVMBuild.txt
+++ b/lib/DebugInfo/PDB/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = DebugInfoPDB
parent = DebugInfo
-required_libraries = Object Support
+required_libraries = Object Support DebugInfoCodeView
diff --git a/lib/DebugInfo/PDB/Makefile b/lib/DebugInfo/PDB/Makefile
deleted file mode 100644
index 444019e5a184..000000000000
--- a/lib/DebugInfo/PDB/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/DebugInfo/PDB/Makefile --------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMDebugInfoPDB
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/DebugInfo/PDB/PDB.cpp b/lib/DebugInfo/PDB/PDB.cpp
index 613407eb1346..69a908eb341c 100644
--- a/lib/DebugInfo/PDB/PDB.cpp
+++ b/lib/DebugInfo/PDB/PDB.cpp
@@ -11,29 +11,41 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDB.h"
-
#if HAVE_DIA_SDK
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#endif
+#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
+using namespace llvm::pdb;
-PDB_ErrorCode llvm::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
+Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
+ if (Type == PDB_ReaderType::Raw)
+ return RawSession::createFromPdb(Path, Session);
+
#if HAVE_DIA_SDK
return DIASession::createFromPdb(Path, Session);
+#else
+ return llvm::make_error<GenericError>("DIA is not installed on the system");
#endif
- return PDB_ErrorCode::NoPdbImpl;
}
-PDB_ErrorCode llvm::loadDataForEXE(PDB_ReaderType Type, StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
+Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
+ if (Type == PDB_ReaderType::Raw)
+ return RawSession::createFromExe(Path, Session);
+
#if HAVE_DIA_SDK
return DIASession::createFromExe(Path, Session);
+#else
+ return llvm::make_error<GenericError>("DIA is not installed on the system");
#endif
- return PDB_ErrorCode::NoPdbImpl;
}
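
With loadDataForPDB and loadDataForEXE now returning llvm::Error instead of
PDB_ErrorCode, callers check the returned Error rather than switching on an
enum. A hedged sketch of such a caller (the wrapper name is an assumption; only
loadDataForPDB and PDB_ReaderType come from this patch):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/DebugInfo/PDB/IPDBSession.h"
    #include "llvm/DebugInfo/PDB/PDB.h"
    #include "llvm/Support/Error.h"
    #include <memory>

    static llvm::Error
    openNativeSession(llvm::StringRef Path,
                      std::unique_ptr<llvm::pdb::IPDBSession> &Session) {
      // PDB_ReaderType::Raw selects the new native reader; PDB_ReaderType::DIA
      // still routes through the Windows-only DIA SDK when available.
      return llvm::pdb::loadDataForPDB(llvm::pdb::PDB_ReaderType::Raw, Path,
                                       Session);
    }
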
diff --git a/lib/DebugInfo/PDB/PDBContext.cpp b/lib/DebugInfo/PDB/PDBContext.cpp
index ca2ae6665ce8..773230263da8 100644
--- a/lib/DebugInfo/PDB/PDBContext.cpp
+++ b/lib/DebugInfo/PDB/PDBContext.cpp
@@ -19,6 +19,7 @@
using namespace llvm;
using namespace llvm::object;
+using namespace llvm::pdb;
PDBContext::PDBContext(const COFFObjectFile &Object,
std::unique_ptr<IPDBSession> PDBSession)
@@ -28,7 +29,8 @@ PDBContext::PDBContext(const COFFObjectFile &Object,
Session->setLoadAddress(ImageBase.get());
}
-void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType) {}
+void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType,
+ bool DumpEH) {}
DILineInfo PDBContext::getLineInfoForAddress(uint64_t Address,
DILineInfoSpecifier Specifier) {
@@ -95,26 +97,24 @@ std::string PDBContext::getFunctionName(uint64_t Address,
if (NameKind == DINameKind::None)
return std::string();
+ std::unique_ptr<PDBSymbol> FuncSymbol =
+ Session->findSymbolByAddress(Address, PDB_SymType::Function);
+ auto *Func = dyn_cast_or_null<PDBSymbolFunc>(FuncSymbol.get());
+
if (NameKind == DINameKind::LinkageName) {
// It is not possible to get the mangled linkage name through a
// PDBSymbolFunc. For that we have to specifically request a
// PDBSymbolPublicSymbol.
auto PublicSym =
Session->findSymbolByAddress(Address, PDB_SymType::PublicSymbol);
- if (auto PS = dyn_cast_or_null<PDBSymbolPublicSymbol>(PublicSym.get()))
- return PS->getName();
+ if (auto *PS = dyn_cast_or_null<PDBSymbolPublicSymbol>(PublicSym.get())) {
+      // If we also have a function symbol, prefer the public symbol name only
+      // if it refers to the same address. The public symbol uses the linkage
+      // name while the function does not.
+ if (!Func || Func->getVirtualAddress() == PS->getVirtualAddress())
+ return PS->getName();
+ }
}
- auto FuncSymbol =
- Session->findSymbolByAddress(Address, PDB_SymType::Function);
-
- // This could happen either if there was no public symbol (e.g. not
- // external) or the user requested the short name. In the former case,
- // although they technically requested the linkage name, if the linkage
- // name is not available we fallback to at least returning a non-empty
- // string.
- if (auto Func = dyn_cast_or_null<PDBSymbolFunc>(FuncSymbol.get()))
- return Func->getName();
-
- return std::string();
+ return Func ? Func->getName() : std::string();
}
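
The rewritten getFunctionName only uses the public (linkage-name) symbol when
it shares an address with the function symbol, otherwise it falls back to the
function's short name. Callers usually reach this through the generic DIContext
interface; a sketch under that assumption (the helper name and address are
made up):

    #include "llvm/DebugInfo/PDB/PDBContext.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>

    static void printLineInfo(llvm::pdb::PDBContext &Ctx, uint64_t Address) {
      llvm::DILineInfoSpecifier Spec(
          llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
          llvm::DINameKind::LinkageName);
      llvm::DILineInfo Info = Ctx.getLineInfoForAddress(Address, Spec);
      llvm::outs() << Info.FunctionName << " at " << Info.FileName << ":"
                   << Info.Line << "\n";
    }
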
diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp
index 4b9437c58243..b7eee6e53941 100644
--- a/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
+using namespace llvm::pdb;
#define CASE_OUTPUT_ENUM_CLASS_STR(Class, Value, Str, Stream) \
case Class::Value: \
@@ -21,7 +22,8 @@ using namespace llvm;
#define CASE_OUTPUT_ENUM_CLASS_NAME(Class, Value, Stream) \
CASE_OUTPUT_ENUM_CLASS_STR(Class, Value, #Value, Stream)
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_VariantType &Type) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_VariantType &Type) {
switch (Type) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Bool, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_VariantType, Single, OS)
@@ -40,42 +42,39 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_VariantType &Type) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_CallingConv &Conv) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_CallingConv &Conv) {
OS << "__";
switch (Conv) {
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearCdecl, "cdecl", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarCdecl, "cdecl", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearPascal, "pascal", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarPascal, "pascal", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearFastcall, "fastcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarFastcall, "fastcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Skipped, "skippedcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearStdcall, "stdcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarStdcall, "stdcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearSyscall, "syscall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarSyscall, "syscall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Thiscall, "thiscall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, MipsCall, "mipscall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Generic, "genericcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Alphacall, "alphacall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Ppccall, "ppccall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, SuperHCall, "superhcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Armcall, "armcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, AM33call, "am33call", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Tricall, "tricall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Sh5call, "sh5call", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, M32R, "m32rcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Clrcall, "clrcall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Inline, "inlinecall", OS)
- CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearVectorcall, "vectorcall",
- OS)
- default:
- OS << "unknowncall";
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearC , "cdecl", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarC , "cdecl", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearPascal , "pascal", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarPascal , "pascal", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearFast , "fastcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarFast , "fastcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearStdCall, "stdcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarStdCall , "stdcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearSysCall, "syscall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, FarSysCall , "syscall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, ThisCall , "thiscall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, MipsCall , "mipscall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Generic , "genericcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, AlphaCall , "alphacall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, PpcCall , "ppccall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, SHCall , "superhcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, ArmCall , "armcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, AM33Call , "am33call", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, TriCall , "tricall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, SH5Call , "sh5call", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, M32RCall , "m32rcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, ClrCall , "clrcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Inline , "inlinecall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearVector , "vectorcall", OS)
}
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_DataKind &Data) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_DataKind &Data) {
switch (Data) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Unknown, "unknown", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_DataKind, Local, "local", OS)
@@ -91,62 +90,63 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_DataKind &Data) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_RegisterId &Reg) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const codeview::RegisterId &Reg) {
switch (Reg) {
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AL, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CL, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DL, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BL, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AH, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CH, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DH, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BH, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, AX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, BP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EAX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ECX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EDX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EBX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ESP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EBP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ESI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, EDI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, ES, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, CS, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, SS, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, DS, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, FS, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, GS, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, IP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RAX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RBX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RCX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RDX, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RSI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RDI, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RBP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, RSP, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R8, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R9, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R10, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R11, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R12, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R13, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R14, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_RegisterId, R15, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, AL, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, CL, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, DL, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, BL, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, AH, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, CH, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, DH, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, BH, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, AX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, CX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, DX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, BX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, SP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, BP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, SI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, DI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, EAX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, ECX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, EDX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, EBX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, ESP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, EBP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, ESI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, EDI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, ES, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, CS, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, SS, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, DS, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, FS, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, GS, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, IP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RAX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RBX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RCX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RDX, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RSI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RDI, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RBP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, RSP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R8, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R9, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R10, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R11, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R12, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R13, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R14, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::RegisterId, R15, OS)
default:
OS << static_cast<int>(Reg);
}
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_LocType &Loc) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_LocType &Loc) {
switch (Loc) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, Static, "static", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_LocType, TLS, "tls", OS)
@@ -164,20 +164,22 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_LocType &Loc) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_ThunkOrdinal &Thunk) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const codeview::ThunkOrdinal &Thunk) {
switch (Thunk) {
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, BranchIsland, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Pcode, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Standard, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, ThisAdjustor, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, TrampIncremental, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, UnknownLoad, OS)
- CASE_OUTPUT_ENUM_CLASS_NAME(PDB_ThunkOrdinal, Vcall, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, BranchIsland, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, Pcode, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, Standard, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, ThisAdjustor, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, TrampIncremental, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, UnknownLoad, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(codeview::ThunkOrdinal, Vcall, OS)
}
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Checksum &Checksum) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_Checksum &Checksum) {
switch (Checksum) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, None, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Checksum, MD5, OS)
@@ -186,7 +188,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Checksum &Checksum) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
switch (Lang) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, C, OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_Lang, Cpp, "C++", OS)
@@ -209,7 +211,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_SymType &Tag) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_SymType &Tag) {
switch (Tag) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Exe, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_SymType, Compiland, OS)
@@ -247,7 +249,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_SymType &Tag) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_MemberAccess &Access) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_MemberAccess &Access) {
switch (Access) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_MemberAccess, Public, "public", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_MemberAccess, Protected, "protected", OS)
@@ -256,7 +259,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_MemberAccess &Access) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
switch (Type) {
CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Struct, "struct", OS)
@@ -266,7 +269,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
static const char *Lookup = "0123456789ABCDEF";
static_assert(sizeof(PDB_UniqueId) == 16, "Expected 16-byte GUID");
@@ -285,40 +288,72 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const Variant &Value) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const PDB_Machine &Machine) {
+ switch (Machine) {
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Am33, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Amd64, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Arm, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, ArmNT, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Ebc, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, x86, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Ia64, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, M32R, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Mips16, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, MipsFpu, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, MipsFpu16, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, PowerPC, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, PowerPCFP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, R4000, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, SH3, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, SH3DSP, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, SH4, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, SH5, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, Thumb, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Machine, WceMipsV2, OS)
+ default:
+ OS << "Unknown";
+ }
+ return OS;
+}
+
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const Variant &Value) {
switch (Value.Type) {
case PDB_VariantType::Bool:
- OS << (Value.Bool ? "true" : "false");
+ OS << (Value.Value.Bool ? "true" : "false");
break;
case PDB_VariantType::Double:
- OS << Value.Double;
+ OS << Value.Value.Double;
break;
case PDB_VariantType::Int16:
- OS << Value.Int16;
+ OS << Value.Value.Int16;
break;
case PDB_VariantType::Int32:
- OS << Value.Int32;
+ OS << Value.Value.Int32;
break;
case PDB_VariantType::Int64:
- OS << Value.Int64;
+ OS << Value.Value.Int64;
break;
case PDB_VariantType::Int8:
- OS << static_cast<int>(Value.Int8);
+ OS << static_cast<int>(Value.Value.Int8);
break;
case PDB_VariantType::Single:
- OS << Value.Single;
+ OS << Value.Value.Single;
break;
case PDB_VariantType::UInt16:
- OS << Value.Double;
+    OS << Value.Value.UInt16;
break;
case PDB_VariantType::UInt32:
- OS << Value.UInt32;
+ OS << Value.Value.UInt32;
break;
case PDB_VariantType::UInt64:
- OS << Value.UInt64;
+ OS << Value.Value.UInt64;
break;
case PDB_VariantType::UInt8:
- OS << static_cast<unsigned>(Value.UInt8);
+ OS << static_cast<unsigned>(Value.Value.UInt8);
+ break;
+ case PDB_VariantType::String:
+ OS << Value.Value.String;
break;
default:
OS << Value.Type;
@@ -326,12 +361,13 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Variant &Value) {
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const VersionInfo &Version) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
+ const VersionInfo &Version) {
OS << Version.Major << "." << Version.Minor << "." << Version.Build;
return OS;
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const TagStats &Stats) {
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const TagStats &Stats) {
for (auto Tag : Stats) {
OS << Tag.first << ":" << Tag.second << " ";
}
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index 7b6268d8f655..a347c67ba8e1 100644
--- a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -18,6 +18,7 @@
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
using namespace llvm;
+using namespace llvm::pdb;
IPDBSession::~IPDBSession() {}
diff --git a/lib/DebugInfo/PDB/PDBSymDumper.cpp b/lib/DebugInfo/PDB/PDBSymDumper.cpp
index 121e2d13d0c1..9450a988dd6f 100644
--- a/lib/DebugInfo/PDB/PDBSymDumper.cpp
+++ b/lib/DebugInfo/PDB/PDBSymDumper.cpp
@@ -11,6 +11,7 @@
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
+using namespace llvm::pdb;
#define PDB_SYMDUMP_UNREACHABLE(Type) \
if (RequireImpl) \
diff --git a/lib/DebugInfo/PDB/PDBSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbol.cpp
index f9aaf3ae934d..78b3afc6079a 100644
--- a/lib/DebugInfo/PDB/PDBSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -50,6 +50,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbol::PDBSymbol(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
@@ -112,6 +113,7 @@ void PDBSymbol::defaultDump(raw_ostream &OS, int Indent) const {
}
PDB_SymType PDBSymbol::getSymTag() const { return RawSymbol->getSymTag(); }
+uint32_t PDBSymbol::getSymIndexId() const { return RawSymbol->getSymIndexId(); }
std::unique_ptr<IPDBEnumSymbols> PDBSymbol::findAllChildren() const {
return findAllChildren(PDB_SymType::None);
diff --git a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
index a782cad00ae9..cdb167b6191c 100644
--- a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolAnnotation::PDBSymbolAnnotation(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
index 46b0ea553e7b..fd5dc9427abf 100644
--- a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolBlock::PDBSymbolBlock(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index 74369148e26e..ebff08846cac 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -8,12 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
@@ -22,3 +24,21 @@ PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
void PDBSymbolCompiland::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+std::string PDBSymbolCompiland::getSourceFileName() const
+{
+ std::string Result = RawSymbol->getSourceFileName();
+ if (!Result.empty())
+ return Result;
+ auto Envs = findAllChildren<PDBSymbolCompilandEnv>();
+ if (!Envs)
+ return std::string();
+ while (auto Env = Envs->getNext()) {
+ std::string Var = Env->getName();
+ if (Var != "src")
+ continue;
+ std::string Value = Env->getValue();
+ return Value;
+ }
+ return std::string();
+}
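
getSourceFileName now falls back to the compiland's "src" environment entry
when the raw symbol carries no file name. A sketch of how a consumer might
exercise it, assuming an already-loaded IPDBSession (the helper name is an
assumption):

    #include "llvm/DebugInfo/PDB/IPDBSession.h"
    #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
    #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
    #include "llvm/Support/raw_ostream.h"

    static void printCompilandSources(llvm::pdb::IPDBSession &Session) {
      auto GlobalScope = Session.getGlobalScope(); // the Exe symbol
      auto Compilands =
          GlobalScope->findAllChildren<llvm::pdb::PDBSymbolCompiland>();
      if (!Compilands)
        return;
      while (auto Compiland = Compilands->getNext())
        llvm::outs() << Compiland->getSourceFileName() << "\n";
    }
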
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
index 7b351a042d05..6dbd5228f2cd 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolCompilandDetails::PDBSymbolCompilandDetails(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
index e863ccf1ffa3..9c7f0b1be56f 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -16,14 +16,17 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolCompilandEnv::PDBSymbolCompilandEnv(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
: PDBSymbol(PDBSession, std::move(Symbol)) {}
std::string PDBSymbolCompilandEnv::getValue() const {
- // call RawSymbol->getValue() and convert the result to an std::string.
- return std::string();
+ Variant Value = RawSymbol->getValue();
+ if (Value.Type != PDB_VariantType::String)
+ return std::string();
+ return std::string(Value.Value.String);
}
void PDBSymbolCompilandEnv::dump(PDBSymDumper &Dumper) const {
diff --git a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
index bd7d9cda2181..0ea387a0eabb 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -16,6 +16,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolCustom::PDBSymbolCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> CustomSymbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolData.cpp b/lib/DebugInfo/PDB/PDBSymbolData.cpp
index a948c2d08afb..62bb6f3f41e2 100644
--- a/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> DataSymbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
index c9e34ea501dd..60101c168a79 100644
--- a/lib/DebugInfo/PDB/PDBSymbolExe.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolExe::PDBSymbolExe(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index 0aff327366cb..35251c0cc1c1 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -23,6 +23,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::pdb;
namespace {
class FunctionArgEnumerator : public IPDBEnumChildren<PDBSymbolData> {
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
index 8e559b324059..77e996f651df 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolFuncDebugEnd::PDBSymbolFuncDebugEnd(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
index ff4254f11504..9c653879176b 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolFuncDebugStart::PDBSymbolFuncDebugStart(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
index f39dee8d949a..d2cfd11c35e4 100644
--- a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolLabel::PDBSymbolLabel(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
index bd6fe89ac325..97d668740818 100644
--- a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolPublicSymbol::PDBSymbolPublicSymbol(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
index 733eb5f75031..ef8897d12af4 100644
--- a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolThunk::PDBSymbolThunk(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
index 19809650361f..c010cc5d7678 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeArray::PDBSymbolTypeArray(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
index c1f0d2f6db11..382c397b24d2 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeBaseClass::PDBSymbolTypeBaseClass(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index b302b6629898..e5d65bf5d1fd 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeBuiltin::PDBSymbolTypeBuiltin(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
index cc391f1856c8..1d80c97f9ede 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeCustom::PDBSymbolTypeCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
index 1e19d0b00122..535d97dcd21e 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -16,6 +16,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeDimension::PDBSymbolTypeDimension(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
index 8dd26a342e73..788f2b732aaa 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -17,6 +17,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeEnum::PDBSymbolTypeEnum(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
index d33266094542..5831baebb993 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeFriend::PDBSymbolTypeFriend(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
index f8f71ead88b9..c6f586db9e57 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeFunctionArg::PDBSymbolTypeFunctionArg(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
index af3563f891f8..057ae260885f 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -19,6 +19,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
namespace {
class FunctionArgEnumerator : public IPDBEnumSymbols {
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
index a7fac3030e99..072d2cfd42fb 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeManaged::PDBSymbolTypeManaged(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
index 082ed83fcf4e..699771450a5d 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypePointer::PDBSymbolTypePointer(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
index 5a426993869d..0f283b9e21a4 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeTypedef::PDBSymbolTypeTypedef(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
index 2b5da295fde3..c71838cc7a6f 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeUDT::PDBSymbolTypeUDT(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
index b465d023c59e..6b76db5912ce 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -14,6 +14,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeVTable::PDBSymbolTypeVTable(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
index 16052f1e6810..ef509d64bf60 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolTypeVTableShape::PDBSymbolTypeVTableShape(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp b/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
index 48dc11558ccb..dbbea9c93e20 100644
--- a/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolUnknown::PDBSymbolUnknown(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
index 6cf13de08512..6a62d554f42c 100644
--- a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -15,6 +15,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::pdb;
PDBSymbolUsingNamespace::PDBSymbolUsingNamespace(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
diff --git a/lib/DebugInfo/PDB/Raw/DbiStream.cpp b/lib/DebugInfo/PDB/Raw/DbiStream.cpp
new file mode 100644
index 000000000000..3c0586c728f9
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/DbiStream.cpp
@@ -0,0 +1,462 @@
+//===- DbiStream.cpp - PDB Dbi Stream (Stream 3) Access -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
+
+#include "llvm/DebugInfo/CodeView/StreamArray.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/ISectionContribVisitor.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/Object/COFF.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+namespace {
+// Some of the values are stored in bitfields. Since this needs to be portable
+// across compilers and architectures (big / little endian in particular) we
+// can't use the actual structures below, but must instead do the shifting
+// and masking ourselves. The struct definitions are provided for reference.
+
+// struct DbiFlags {
+// uint16_t IncrementalLinking : 1; // True if linked incrementally
+// uint16_t IsStripped : 1; // True if private symbols were stripped.
+// uint16_t HasCTypes : 1; // True if linked with /debug:ctypes.
+// uint16_t Reserved : 13;
+//};
+const uint16_t FlagIncrementalMask = 0x0001;
+const uint16_t FlagStrippedMask = 0x0002;
+const uint16_t FlagHasCTypesMask = 0x0004;
+
+// struct DbiBuildNo {
+// uint16_t MinorVersion : 8;
+// uint16_t MajorVersion : 7;
+// uint16_t NewVersionFormat : 1;
+//};
+const uint16_t BuildMinorMask = 0x00FF;
+const uint16_t BuildMinorShift = 0;
+
+const uint16_t BuildMajorMask = 0x7F00;
+const uint16_t BuildMajorShift = 8;
+
+struct FileInfoSubstreamHeader {
+ ulittle16_t NumModules; // Total # of modules, should match number of
+ // records in the ModuleInfo substream.
+ ulittle16_t NumSourceFiles; // Total # of source files. This value is not
+ // accurate because PDB actually supports more
+ // than 64k source files, so we ignore it and
+ // compute the value from other stream fields.
+};
+}
+
+template <typename ContribType>
+static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
+ StreamReader &Reader) {
+ if (Reader.bytesRemaining() % sizeof(ContribType) != 0)
+ return make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "Invalid number of bytes of section contributions");
+
+ uint32_t Count = Reader.bytesRemaining() / sizeof(ContribType);
+ if (auto EC = Reader.readArray(Output, Count))
+ return EC;
+ return Error::success();
+}
+
+DbiStream::DbiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
+ : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {
+ static_assert(sizeof(HeaderInfo) == 64, "Invalid HeaderInfo size!");
+}
+
+DbiStream::~DbiStream() {}
+
+Error DbiStream::reload() {
+ StreamReader Reader(*Stream);
+
+ if (Stream->getLength() < sizeof(HeaderInfo))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI Stream does not contain a header.");
+ if (auto EC = Reader.readObject(Header))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI Stream does not contain a header.");
+
+ if (Header->VersionSignature != -1)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid DBI version signature.");
+
+ // Require at least version 7, which should be present in all PDBs
+ // produced in the last decade and allows us to avoid having to
+ // special case all kinds of complicated arcane formats.
+ if (Header->VersionHeader < PdbDbiV70)
+ return make_error<RawError>(raw_error_code::feature_unsupported,
+ "Unsupported DBI version.");
+
+ auto IS = Pdb.getPDBInfoStream();
+ if (!IS)
+ return IS.takeError();
+
+ if (Header->Age != IS->getAge())
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI Age does not match PDB Age.");
+
+ if (Stream->getLength() !=
+ sizeof(HeaderInfo) + Header->ModiSubstreamSize +
+ Header->SecContrSubstreamSize + Header->SectionMapSize +
+ Header->FileInfoSize + Header->TypeServerSize +
+ Header->OptionalDbgHdrSize + Header->ECSubstreamSize)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI Length does not equal sum of substreams.");
+
+ // Only certain substreams are guaranteed to be aligned. Validate
+ // them here.
+ if (Header->ModiSubstreamSize % sizeof(uint32_t) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI MODI substream not aligned.");
+ if (Header->SecContrSubstreamSize % sizeof(uint32_t) != 0)
+ return make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "DBI section contribution substream not aligned.");
+ if (Header->SectionMapSize % sizeof(uint32_t) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI section map substream not aligned.");
+ if (Header->FileInfoSize % sizeof(uint32_t) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI file info substream not aligned.");
+ if (Header->TypeServerSize % sizeof(uint32_t) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "DBI type server substream not aligned.");
+
+  // Since each ModInfo in the stream is of variable length, we have to iterate
+ // them to know how many there actually are.
+ VarStreamArray<ModInfo> ModInfoArray;
+ if (auto EC = Reader.readArray(ModInfoArray, Header->ModiSubstreamSize))
+ return EC;
+ for (auto &Info : ModInfoArray) {
+ ModuleInfos.emplace_back(Info);
+ }
+
+ if (auto EC = Reader.readStreamRef(SecContrSubstream,
+ Header->SecContrSubstreamSize))
+ return EC;
+ if (auto EC = Reader.readStreamRef(SecMapSubstream, Header->SectionMapSize))
+ return EC;
+ if (auto EC = Reader.readStreamRef(FileInfoSubstream, Header->FileInfoSize))
+ return EC;
+ if (auto EC =
+ Reader.readStreamRef(TypeServerMapSubstream, Header->TypeServerSize))
+ return EC;
+ if (auto EC = Reader.readStreamRef(ECSubstream, Header->ECSubstreamSize))
+ return EC;
+ if (auto EC = Reader.readArray(DbgStreams, Header->OptionalDbgHdrSize /
+ sizeof(ulittle16_t)))
+ return EC;
+
+ if (auto EC = initializeSectionContributionData())
+ return EC;
+ if (auto EC = initializeSectionHeadersData())
+ return EC;
+ if (auto EC = initializeSectionMapData())
+ return EC;
+ if (auto EC = initializeFileInfo())
+ return EC;
+ if (auto EC = initializeFpoRecords())
+ return EC;
+
+ if (Reader.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Found unexpected bytes in DBI Stream.");
+
+ if (ECSubstream.getLength() > 0) {
+ StreamReader ECReader(ECSubstream);
+ if (auto EC = ECNames.load(ECReader))
+ return EC;
+ }
+
+ return Error::success();
+}
+
+PdbRaw_DbiVer DbiStream::getDbiVersion() const {
+ uint32_t Value = Header->VersionHeader;
+ return static_cast<PdbRaw_DbiVer>(Value);
+}
+
+uint32_t DbiStream::getAge() const { return Header->Age; }
+
+uint16_t DbiStream::getPublicSymbolStreamIndex() const {
+ return Header->PublicSymbolStreamIndex;
+}
+
+uint16_t DbiStream::getGlobalSymbolStreamIndex() const {
+ return Header->GlobalSymbolStreamIndex;
+}
+
+uint16_t DbiStream::getFlags() const { return Header->Flags; }
+
+bool DbiStream::isIncrementallyLinked() const {
+ return (Header->Flags & FlagIncrementalMask) != 0;
+}
+
+bool DbiStream::hasCTypes() const {
+ return (Header->Flags & FlagHasCTypesMask) != 0;
+}
+
+bool DbiStream::isStripped() const {
+ return (Header->Flags & FlagStrippedMask) != 0;
+}
+
+uint16_t DbiStream::getBuildNumber() const { return Header->BuildNumber; }
+
+uint16_t DbiStream::getBuildMajorVersion() const {
+ return (Header->BuildNumber & BuildMajorMask) >> BuildMajorShift;
+}
+
+uint16_t DbiStream::getBuildMinorVersion() const {
+ return (Header->BuildNumber & BuildMinorMask) >> BuildMinorShift;
+}
+
+uint16_t DbiStream::getPdbDllRbld() const { return Header->PdbDllRbld; }
+
+uint32_t DbiStream::getPdbDllVersion() const { return Header->PdbDllVersion; }
+
+uint32_t DbiStream::getSymRecordStreamIndex() const {
+ return Header->SymRecordStreamIndex;
+}
+
+PDB_Machine DbiStream::getMachineType() const {
+ uint16_t Machine = Header->MachineType;
+ return static_cast<PDB_Machine>(Machine);
+}
+
+codeview::FixedStreamArray<object::coff_section>
+DbiStream::getSectionHeaders() {
+ return SectionHeaders;
+}
+
+codeview::FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
+ return FpoRecords;
+}
+
+ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
+codeview::FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
+ return SectionMap;
+}
+
+void llvm::pdb::DbiStream::visitSectionContributions(
+ ISectionContribVisitor &Visitor) const {
+ if (SectionContribVersion == DbiSecContribVer60) {
+ for (auto &SC : SectionContribs)
+ Visitor.visit(SC);
+ } else if (SectionContribVersion == DbiSecContribV2) {
+ for (auto &SC : SectionContribs2)
+ Visitor.visit(SC);
+ }
+}
+
+Error DbiStream::initializeSectionContributionData() {
+ if (SecContrSubstream.getLength() == 0)
+ return Error::success();
+
+ StreamReader SCReader(SecContrSubstream);
+ if (auto EC = SCReader.readEnum(SectionContribVersion))
+ return EC;
+
+ if (SectionContribVersion == DbiSecContribVer60)
+ return loadSectionContribs<SectionContrib>(SectionContribs, SCReader);
+ if (SectionContribVersion == DbiSecContribV2)
+ return loadSectionContribs<SectionContrib2>(SectionContribs2, SCReader);
+
+ return make_error<RawError>(raw_error_code::feature_unsupported,
+ "Unsupported DBI Section Contribution version");
+}
+
+// Initializes this->SectionHeaders.
+Error DbiStream::initializeSectionHeadersData() {
+ if (DbgStreams.size() == 0)
+ return Error::success();
+
+ uint32_t StreamNum = getDebugStreamIndex(DbgHeaderType::SectionHdr);
+ if (StreamNum >= Pdb.getNumStreams())
+ return make_error<RawError>(raw_error_code::no_stream);
+
+ auto SHS = MappedBlockStream::createIndexedStream(StreamNum, Pdb);
+ if (!SHS)
+ return SHS.takeError();
+
+ size_t StreamLen = (*SHS)->getLength();
+ if (StreamLen % sizeof(object::coff_section))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Corrupted section header stream.");
+
+ size_t NumSections = StreamLen / sizeof(object::coff_section);
+ codeview::StreamReader Reader(**SHS);
+ if (auto EC = Reader.readArray(SectionHeaders, NumSections))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read a bitmap.");
+
+ SectionHeaderStream = std::move(*SHS);
+ return Error::success();
+}
+
+// Initializes this->FpoRecords.
+Error DbiStream::initializeFpoRecords() {
+ if (DbgStreams.size() == 0)
+ return Error::success();
+
+ uint32_t StreamNum = getDebugStreamIndex(DbgHeaderType::NewFPO);
+
+ // This means there is no FPO data.
+ if (StreamNum == InvalidStreamIndex)
+ return Error::success();
+
+ if (StreamNum >= Pdb.getNumStreams())
+ return make_error<RawError>(raw_error_code::no_stream);
+
+ auto FS = MappedBlockStream::createIndexedStream(StreamNum, Pdb);
+ if (!FS)
+ return FS.takeError();
+
+ size_t StreamLen = (*FS)->getLength();
+ if (StreamLen % sizeof(object::FpoData))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Corrupted New FPO stream.");
+
+ size_t NumRecords = StreamLen / sizeof(object::FpoData);
+ codeview::StreamReader Reader(**FS);
+ if (auto EC = Reader.readArray(FpoRecords, NumRecords))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Corrupted New FPO stream.");
+ FpoStream = std::move(*FS);
+ return Error::success();
+}
+
+Error DbiStream::initializeSectionMapData() {
+ if (SecMapSubstream.getLength() == 0)
+ return Error::success();
+
+ StreamReader SMReader(SecMapSubstream);
+ const SecMapHeader *Header;
+ if (auto EC = SMReader.readObject(Header))
+ return EC;
+ if (auto EC = SMReader.readArray(SectionMap, Header->SecCount))
+ return EC;
+ return Error::success();
+}
+
+Error DbiStream::initializeFileInfo() {
+ // The layout of the FileInfoSubstream is like this:
+ // struct {
+ // ulittle16_t NumModules;
+ // ulittle16_t NumSourceFiles;
+ // ulittle16_t ModIndices[NumModules];
+ // ulittle16_t ModFileCounts[NumModules];
+ // ulittle32_t FileNameOffsets[NumSourceFiles];
+ // char Names[][NumSourceFiles];
+ // };
+ // with the caveat that `NumSourceFiles` cannot be trusted, so
+ // it is computed by summing `ModFileCounts`.
+ //
+ if (FileInfoSubstream.getLength() == 0)
+ return Error::success();
+
+ const FileInfoSubstreamHeader *FH;
+ StreamReader FISR(FileInfoSubstream);
+ if (auto EC = FISR.readObject(FH))
+ return EC;
+
+ // The number of modules in the stream should be the same as reported by
+ // the FileInfoSubstreamHeader.
+ if (FH->NumModules != ModuleInfos.size())
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "FileInfo substream count doesn't match DBI.");
+
+ FixedStreamArray<ulittle16_t> ModIndexArray;
+ FixedStreamArray<ulittle16_t> ModFileCountArray;
+
+ // First is an array of `NumModules` module indices. This is not used for the
+ // same reason that `NumSourceFiles` is not used. It's an array of uint16's,
+ // but it's possible there are more than 64k source files, which would imply
+ // more than 64k modules (e.g. object files) as well. So we ignore this
+ // field.
+ if (auto EC = FISR.readArray(ModIndexArray, ModuleInfos.size()))
+ return EC;
+ if (auto EC = FISR.readArray(ModFileCountArray, ModuleInfos.size()))
+ return EC;
+
+ // Compute the real number of source files.
+ uint32_t NumSourceFiles = 0;
+ for (auto Count : ModFileCountArray)
+ NumSourceFiles += Count;
+
+ // This is the array that in the reference implementation corresponds to
+ // `ModInfo::FileLayout::FileNameOffs`, which is commented there as being a
+ // pointer. Due to the mentioned problems of pointers causing difficulty
+ // when reading from the file on 64-bit systems, we continue to ignore that
+  // field in `ModInfo`, and instead build a vector of StringRefs and store
+  // them in `ModuleInfoEx`.  The value written to and read from the file is
+  // not used anyway; it is only there as a way to store the offsets for the
+ // purposes of later accessing the names at runtime.
+ if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
+ return EC;
+
+ if (auto EC = FISR.readStreamRef(NamesBuffer))
+ return EC;
+
+ // We go through each ModuleInfo, determine the number N of source files for
+ // that module, and then get the next N offsets from the Offsets array, using
+ // them to get the corresponding N names from the Names buffer and associating
+ // each one with the corresponding module.
+ uint32_t NextFileIndex = 0;
+ for (size_t I = 0; I < ModuleInfos.size(); ++I) {
+ uint32_t NumFiles = ModFileCountArray[I];
+ ModuleInfos[I].SourceFiles.resize(NumFiles);
+ for (size_t J = 0; J < NumFiles; ++J, ++NextFileIndex) {
+ auto ThisName = getFileNameForIndex(NextFileIndex);
+ if (!ThisName)
+ return ThisName.takeError();
+ ModuleInfos[I].SourceFiles[J] = *ThisName;
+ }
+ }
+
+ return Error::success();
+}
+
+uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
+ return DbgStreams[static_cast<uint16_t>(Type)];
+}
+
+Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
+ StreamReader Names(NamesBuffer);
+ if (Index >= FileNameOffsets.size())
+ return make_error<RawError>(raw_error_code::index_out_of_bounds);
+
+ uint32_t FileOffset = FileNameOffsets[Index];
+ Names.setOffset(FileOffset);
+ StringRef Name;
+ if (auto EC = Names.readZeroString(Name))
+ return std::move(EC);
+ return Name;
+}
+
+Error DbiStream::commit() {
+ StreamWriter Writer(*Stream);
+ if (auto EC = Writer.writeObject(*Header))
+ return EC;
+
+ return Error::success();
+}
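
The DbiFlags and DbiBuildNo comments near the top of this file explain why the
header bitfields are decoded with explicit masks and shifts instead of C
bitfields: the on-disk layout must not depend on the compiler or endianness. A
standalone sketch of that decoding, with made-up sample values:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t FlagIncrementalMask = 0x0001;
      const uint16_t BuildMinorMask = 0x00FF;
      const uint16_t BuildMajorMask = 0x7F00;

      uint16_t Flags = 0x0005;       // incremental link + /debug:ctypes (example)
      uint16_t BuildNumber = 0x8C0B; // new format, major 12, minor 11 (example)

      bool Incremental = (Flags & FlagIncrementalMask) != 0;
      uint16_t Major = (BuildNumber & BuildMajorMask) >> 8;
      uint16_t Minor = BuildNumber & BuildMinorMask;

      std::printf("incremental=%d build=%u.%u\n", Incremental ? 1 : 0,
                  unsigned(Major), unsigned(Minor));
      return 0;
    }
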
diff --git a/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp
new file mode 100644
index 000000000000..34ff8ae3a907
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp
@@ -0,0 +1,81 @@
+//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
+
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+DbiStreamBuilder::DbiStreamBuilder()
+ : Age(1), BuildNumber(0), PdbDllVersion(0), PdbDllRbld(0), Flags(0),
+ MachineType(PDB_Machine::x86) {}
+
+void DbiStreamBuilder::setVersionHeader(PdbRaw_DbiVer V) { VerHeader = V; }
+
+void DbiStreamBuilder::setAge(uint32_t A) { Age = A; }
+
+void DbiStreamBuilder::setBuildNumber(uint16_t B) { BuildNumber = B; }
+
+void DbiStreamBuilder::setPdbDllVersion(uint16_t V) { PdbDllVersion = V; }
+
+void DbiStreamBuilder::setPdbDllRbld(uint16_t R) { PdbDllRbld = R; }
+
+void DbiStreamBuilder::setFlags(uint16_t F) { Flags = F; }
+
+void DbiStreamBuilder::setMachineType(PDB_Machine M) { MachineType = M; }
+
+uint32_t DbiStreamBuilder::calculateSerializedLength() const {
+ // For now we only support serializing the header.
+ return sizeof(DbiStream::HeaderInfo);
+}
+
+Expected<std::unique_ptr<DbiStream>> DbiStreamBuilder::build(PDBFile &File) {
+ if (!VerHeader.hasValue())
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Missing DBI Stream Version");
+
+ auto DbiS = MappedBlockStream::createIndexedStream(StreamDBI, File);
+ if (!DbiS)
+ return DbiS.takeError();
+ auto DS = std::move(*DbiS);
+ DbiStream::HeaderInfo *H =
+ static_cast<DbiStream::HeaderInfo *>(DS->getAllocator().Allocate(
+ sizeof(DbiStream::HeaderInfo),
+ llvm::AlignOf<DbiStream::HeaderInfo>::Alignment));
+ H->VersionHeader = *VerHeader;
+ H->VersionSignature = -1;
+ H->Age = Age;
+ H->BuildNumber = BuildNumber;
+ H->Flags = Flags;
+ H->PdbDllRbld = PdbDllRbld;
+ H->PdbDllVersion = PdbDllVersion;
+ H->MachineType = static_cast<uint16_t>(MachineType);
+
+ H->ECSubstreamSize = 0;
+ H->FileInfoSize = 0;
+ H->ModiSubstreamSize = 0;
+ H->OptionalDbgHdrSize = 0;
+ H->SecContrSubstreamSize = 0;
+ H->SectionMapSize = 0;
+ H->TypeServerSize = 0;
+ H->SymRecordStreamIndex = DbiStream::InvalidStreamIndex;
+ H->PublicSymbolStreamIndex = DbiStream::InvalidStreamIndex;
+ H->MFCTypeServerIndex = DbiStream::InvalidStreamIndex;
+ H->GlobalSymbolStreamIndex = DbiStream::InvalidStreamIndex;
+
+ auto Dbi = llvm::make_unique<DbiStream>(File, std::move(DS));
+ Dbi->Header = H;
+ return std::move(Dbi);
+}
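+
+// Typical use (editor's sketch; SomeDbiVersion and File are placeholders, not
+// taken from this change):
+//   DbiStreamBuilder Builder;
+//   Builder.setVersionHeader(SomeDbiVersion); // a PdbRaw_DbiVer value
+//   auto DbiOrErr = Builder.build(File);      // File is a PDBFile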
diff --git a/lib/DebugInfo/PDB/Raw/EnumTables.cpp b/lib/DebugInfo/PDB/Raw/EnumTables.cpp
new file mode 100644
index 000000000000..fc9270c69947
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/EnumTables.cpp
@@ -0,0 +1,38 @@
+//===- EnumTables.cpp - Enum to string conversion tables --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/EnumTables.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+#define PDB_ENUM_CLASS_ENT(enum_class, enum) \
+ { #enum, std::underlying_type < enum_class > ::type(enum_class::enum) }
+
+#define PDB_ENUM_ENT(ns, enum) \
+ { #enum, ns::enum }
+
+static const EnumEntry<uint16_t> OMFSegMapDescFlagNames[] = {
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, Read),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, Write),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, Execute),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, AddressIs32Bit),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, IsSelector),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, IsAbsoluteAddress),
+ PDB_ENUM_CLASS_ENT(OMFSegDescFlags, IsGroup),
+};
+
+namespace llvm {
+namespace pdb {
+ArrayRef<EnumEntry<uint16_t>> getOMFSegMapDescFlagNames() {
+ return makeArrayRef(OMFSegMapDescFlagNames);
+}
+}
+} \ No newline at end of file
diff --git a/lib/DebugInfo/PDB/Raw/Hash.cpp b/lib/DebugInfo/PDB/Raw/Hash.cpp
new file mode 100644
index 000000000000..23cb55786d78
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/Hash.cpp
@@ -0,0 +1,131 @@
+//===- Hash.cpp - PDB Hash Functions --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/Hash.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support;
+
+// Corresponds to `Hasher::lhashPbCb` in PDB/include/misc.h.
+// Used for name hash table and TPI/IPI hashes.
+uint32_t pdb::hashStringV1(StringRef Str) {
+ uint32_t Result = 0;
+ uint32_t Size = Str.size();
+
+ ArrayRef<ulittle32_t> Longs(reinterpret_cast<const ulittle32_t *>(Str.data()),
+ Size / 4);
+
+ for (auto Value : Longs)
+ Result ^= Value;
+
+ const uint8_t *Remainder = reinterpret_cast<const uint8_t *>(Longs.end());
+ uint32_t RemainderSize = Size % 4;
+
+ // At most 3 bytes remain. Hash a 2-byte word if possible, then hash any
+ // remaining byte.
+ if (RemainderSize >= 2) {
+ uint16_t Value = *reinterpret_cast<const ulittle16_t *>(Remainder);
+ Result ^= static_cast<uint32_t>(Value);
+ Remainder += 2;
+ RemainderSize -= 2;
+ }
+
+ // Hash any remaining odd byte.
+ if (RemainderSize == 1) {
+ Result ^= *(Remainder++);
+ }
+
+ const uint32_t toLowerMask = 0x20202020;
+ Result |= toLowerMask;
+ Result ^= (Result >> 11);
+
+ return Result ^ (Result >> 16);
+}
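+
+// Illustrative use (editor's sketch; Name and BucketCount are placeholders,
+// not part of this change): the name hash table buckets a string by taking
+// this hash modulo its bucket count:
+//   uint32_t Bucket = pdb::hashStringV1(Name) % BucketCount;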
+
+// Corresponds to `HasherV2::HashULONG` in PDB/include/misc.h.
+// Used for name hash table.
+uint32_t pdb::hashStringV2(StringRef Str) {
+ uint32_t Hash = 0xb170a1bf;
+
+ ArrayRef<char> Buffer(Str.begin(), Str.end());
+
+ ArrayRef<ulittle32_t> Items(
+ reinterpret_cast<const ulittle32_t *>(Buffer.data()),
+ Buffer.size() / sizeof(ulittle32_t));
+ for (ulittle32_t Item : Items) {
+ Hash += Item;
+ Hash += (Hash << 10);
+ Hash ^= (Hash >> 6);
+ }
+ Buffer = Buffer.slice(Items.size() * sizeof(ulittle32_t));
+ for (uint8_t Item : Buffer) {
+ Hash += Item;
+ Hash += (Hash << 10);
+ Hash ^= (Hash >> 6);
+ }
+
+ return Hash * 1664525L + 1013904223L;
+}
+
+static const uint32_t V8HashTable[] = {
+ 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+ 0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+ 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+ 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+ 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
+ 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+ 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+ 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+ 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+ 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+ 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+ 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+ 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+ 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+ 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+ 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+ 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+ 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+ 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+ 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+ 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+ 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+ 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+ 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+ 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+ 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+ 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+ 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+ 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+ 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+ 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+ 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+ 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+ 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+ 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+ 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+ 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+ 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+ 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+ 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+ 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+ 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+ 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D,
+};
+
+// Corresponds to `SigForPbCb` in langapi/shared/crc32.h.
+uint32_t pdb::hashBufferV8(ArrayRef<uint8_t> Buf) {
+ uint32_t Hash = 0;
+ for (uint8_t Byte : Buf)
+ Hash = (Hash >> 8) ^ V8HashTable[(Hash & 0xff) ^ Byte];
+ return Hash;
+}
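+
+// Editor's note: the table above is the standard reflected CRC-32 lookup table
+// (polynomial 0xEDB88320), so this is a plain table-driven CRC-32, except that
+// the running value starts at 0 and is not inverted at the end, so the result
+// differs from, e.g., zlib's crc32().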
diff --git a/lib/DebugInfo/PDB/Raw/IndexedStreamData.cpp b/lib/DebugInfo/PDB/Raw/IndexedStreamData.cpp
new file mode 100644
index 000000000000..9bd16ea76efc
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/IndexedStreamData.cpp
@@ -0,0 +1,25 @@
+//===- IndexedStreamData.cpp - Standard PDB Stream Data ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/IPDBFile.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+IndexedStreamData::IndexedStreamData(uint32_t StreamIdx, const IPDBFile &File)
+ : StreamIdx(StreamIdx), File(File) {}
+
+uint32_t IndexedStreamData::getLength() {
+ return File.getStreamByteSize(StreamIdx);
+}
+
+ArrayRef<support::ulittle32_t> IndexedStreamData::getStreamBlocks() {
+ return File.getStreamBlockList(StreamIdx);
+}
diff --git a/lib/DebugInfo/PDB/Raw/InfoStream.cpp b/lib/DebugInfo/PDB/Raw/InfoStream.cpp
new file mode 100644
index 000000000000..c33a764587c7
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/InfoStream.cpp
@@ -0,0 +1,90 @@
+//===- InfoStream.cpp - PDB Info Stream (Stream 1) Access -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
+ : Stream(std::move(Stream)) {}
+
+Error InfoStream::reload() {
+ codeview::StreamReader Reader(*Stream);
+
+ const HeaderInfo *H;
+ if (auto EC = Reader.readObject(H))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "PDB Stream does not contain a header."));
+
+ switch (H->Version) {
+ case PdbImplVC70:
+ case PdbImplVC80:
+ case PdbImplVC110:
+ case PdbImplVC140:
+ break;
+ default:
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported PDB stream version.");
+ }
+
+ Version = H->Version;
+ Signature = H->Signature;
+ Age = H->Age;
+ Guid = H->Guid;
+
+ return NamedStreams.load(Reader);
+}
+
+uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
+ uint32_t Result;
+ if (!NamedStreams.tryGetValue(Name, Result))
+ return 0;
+ return Result;
+}
+
+iterator_range<StringMapConstIterator<uint32_t>>
+InfoStream::named_streams() const {
+ return NamedStreams.entries();
+}
+
+PdbRaw_ImplVer InfoStream::getVersion() const {
+ return static_cast<PdbRaw_ImplVer>(Version);
+}
+
+uint32_t InfoStream::getSignature() const { return Signature; }
+
+uint32_t InfoStream::getAge() const { return Age; }
+
+PDB_UniqueId InfoStream::getGuid() const { return Guid; }
+
+Error InfoStream::commit() {
+ StreamWriter Writer(*Stream);
+
+ HeaderInfo H;
+ H.Age = Age;
+ H.Signature = Signature;
+ H.Version = Version;
+ H.Guid = Guid;
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ return NamedStreams.commit(Writer);
+}
diff --git a/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp
new file mode 100644
index 000000000000..7be9cc32db96
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp
@@ -0,0 +1,67 @@
+//===- InfoStreamBuilder.cpp - PDB Info Stream Creation ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
+
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+InfoStreamBuilder::InfoStreamBuilder() {}
+
+void InfoStreamBuilder::setVersion(PdbRaw_ImplVer V) { Ver = V; }
+
+void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; }
+
+void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
+
+void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; }
+
+NameMapBuilder &InfoStreamBuilder::getNamedStreamsBuilder() {
+ return NamedStreams;
+}
+
+uint32_t InfoStreamBuilder::calculateSerializedLength() const {
+ return sizeof(InfoStream::HeaderInfo) +
+ NamedStreams.calculateSerializedLength();
+}
+
+Expected<std::unique_ptr<InfoStream>> InfoStreamBuilder::build(PDBFile &File) {
+ if (!Ver.hasValue())
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Missing PDB Stream Version");
+ if (!Sig.hasValue())
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Missing PDB Stream Signature");
+ if (!Age.hasValue())
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Missing PDB Stream Age");
+ if (!Guid.hasValue())
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Missing PDB Stream Guid");
+
+ auto InfoS = MappedBlockStream::createIndexedStream(StreamPDB, File);
+ if (!InfoS)
+ return InfoS.takeError();
+ auto Info = llvm::make_unique<InfoStream>(std::move(*InfoS));
+ Info->Version = *Ver;
+ Info->Signature = *Sig;
+ Info->Age = *Age;
+ Info->Guid = *Guid;
+ auto NS = NamedStreams.build();
+ if (!NS)
+ return NS.takeError();
+ Info->NamedStreams = **NS;
+ return std::move(Info);
+}
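+
+// Typical use (editor's sketch; V, Sig, Guid and File are placeholders). All
+// four fields must be set before build() succeeds:
+//   InfoStreamBuilder Builder;
+//   Builder.setVersion(V); Builder.setSignature(Sig);
+//   Builder.setAge(1);     Builder.setGuid(Guid);
+//   auto InfoOrErr = Builder.build(File);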
diff --git a/lib/DebugInfo/PDB/Raw/MappedBlockStream.cpp b/lib/DebugInfo/PDB/Raw/MappedBlockStream.cpp
new file mode 100644
index 000000000000..92b2048c3c22
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/MappedBlockStream.cpp
@@ -0,0 +1,310 @@
+//===- MappedBlockStream.cpp - Reads stream data from a PDBFile -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/DirectoryStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/IPDBStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+// This exists so that we can use make_unique while still keeping the
+// constructor of MappedBlockStream private, forcing users to go through
+// the `create` interface.
+class MappedBlockStreamImpl : public MappedBlockStream {
+public:
+ MappedBlockStreamImpl(std::unique_ptr<IPDBStreamData> Data,
+ const IPDBFile &File)
+ : MappedBlockStream(std::move(Data), File) {}
+};
+}
+
+typedef std::pair<uint32_t, uint32_t> Interval;
+static Interval intersect(const Interval &I1, const Interval &I2) {
+ return std::make_pair(std::max(I1.first, I2.first),
+ std::min(I1.second, I2.second));
+}
+
+MappedBlockStream::MappedBlockStream(std::unique_ptr<IPDBStreamData> Data,
+ const IPDBFile &Pdb)
+ : Pdb(Pdb), Data(std::move(Data)) {}
+
+Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) const {
+ // Make sure we aren't trying to read beyond the end of the stream.
+ if (Size > Data->getLength())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+ if (Offset > Data->getLength() - Size)
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+
+ if (tryReadContiguously(Offset, Size, Buffer))
+ return Error::success();
+
+ auto CacheIter = CacheMap.find(Offset);
+ if (CacheIter != CacheMap.end()) {
+ // Try to find an alloc that was large enough for this request.
+ for (auto &Entry : CacheIter->second) {
+ if (Entry.size() >= Size) {
+ Buffer = Entry.slice(0, Size);
+ return Error::success();
+ }
+ }
+ }
+
+ // We couldn't find a buffer that started at the correct offset (the most
+ // common scenario). Try to see if there is a buffer that starts at some
+ // other offset but overlaps the desired range.
+ for (auto &CacheItem : CacheMap) {
+ Interval RequestExtent = std::make_pair(Offset, Offset + Size);
+
+ // We already checked this one on the fast path above.
+ if (CacheItem.first == Offset)
+ continue;
+ // If the initial extent of the cached item is beyond the ending extent
+ // of the request, there is no overlap.
+ if (CacheItem.first >= Offset + Size)
+ continue;
+
+ // We really only have to check the last item in the list, since we append
+ // in order of increasing length.
+ if (CacheItem.second.empty())
+ continue;
+
+ auto CachedAlloc = CacheItem.second.back();
+ // If the initial extent of the request is beyond the ending extent of
+ // the cached item, there is no overlap.
+ Interval CachedExtent =
+ std::make_pair(CacheItem.first, CacheItem.first + CachedAlloc.size());
+ if (RequestExtent.first >= CachedExtent.first + CachedExtent.second)
+ continue;
+
+ Interval Intersection = intersect(CachedExtent, RequestExtent);
+ // Only use this if the entire request extent is contained in the cached
+ // extent.
+ if (Intersection != RequestExtent)
+ continue;
+
+ uint32_t CacheRangeOffset =
+ AbsoluteDifference(CachedExtent.first, Intersection.first);
+ Buffer = CachedAlloc.slice(CacheRangeOffset, Size);
+ return Error::success();
+ }
+
+ // Otherwise allocate a large enough buffer in the pool, memcpy the data
+ // into it, and return an ArrayRef to that. Do not touch existing pool
+ // allocations, as existing clients may be holding a pointer which must
+ // not be invalidated.
+ uint8_t *WriteBuffer = static_cast<uint8_t *>(Pool.Allocate(Size, 8));
+ if (auto EC = readBytes(Offset, MutableArrayRef<uint8_t>(WriteBuffer, Size)))
+ return EC;
+
+ if (CacheIter != CacheMap.end()) {
+ CacheIter->second.emplace_back(WriteBuffer, Size);
+ } else {
+ std::vector<CacheEntry> List;
+ List.emplace_back(WriteBuffer, Size);
+ CacheMap.insert(std::make_pair(Offset, List));
+ }
+ Buffer = ArrayRef<uint8_t>(WriteBuffer, Size);
+ return Error::success();
+}
+
+Error MappedBlockStream::readLongestContiguousChunk(
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ // Make sure we aren't trying to read beyond the end of the stream.
+ if (Offset >= Data->getLength())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+ uint32_t First = Offset / Pdb.getBlockSize();
+ uint32_t Last = First;
+
+ auto BlockList = Data->getStreamBlocks();
+ while (Last < Pdb.getBlockCount() - 1) {
+ if (BlockList[Last] != BlockList[Last + 1] - 1)
+ break;
+ ++Last;
+ }
+
+ uint32_t OffsetInFirstBlock = Offset % Pdb.getBlockSize();
+ uint32_t BytesFromFirstBlock = Pdb.getBlockSize() - OffsetInFirstBlock;
+ uint32_t BlockSpan = Last - First + 1;
+ uint32_t ByteSpan =
+ BytesFromFirstBlock + (BlockSpan - 1) * Pdb.getBlockSize();
+ auto Result = Pdb.getBlockData(BlockList[First], Pdb.getBlockSize());
+ if (!Result)
+ return Result.takeError();
+ Buffer = Result->drop_front(OffsetInFirstBlock);
+ Buffer = ArrayRef<uint8_t>(Buffer.data(), ByteSpan);
+ return Error::success();
+}
+
+uint32_t MappedBlockStream::getLength() const { return Data->getLength(); }
+
+Error MappedBlockStream::commit() const { return Error::success(); }
+
+bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) const {
+ // Attempt to fulfill the request with a reference directly into the stream.
+ // This can work even if the request crosses a block boundary, provided that
+ // all subsequent blocks are contiguous. For example, a 10k read with a 4k
+ // block size can be filled with a reference if, from the starting offset,
+ // 3 blocks in a row are contiguous.
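+ //
+ // Worked example (editor's note): with a 4096-byte block size, Offset = 6000
+ // and Size = 10000 give BlockNum = 1, OffsetInBlock = 1904,
+ // BytesFromFirstBlock = 2192 and NumAdditionalBlocks = 2, so stream blocks
+ // 1, 2 and 3 must map to physically consecutive file blocks.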
+ uint32_t BlockNum = Offset / Pdb.getBlockSize();
+ uint32_t OffsetInBlock = Offset % Pdb.getBlockSize();
+ uint32_t BytesFromFirstBlock =
+ std::min(Size, Pdb.getBlockSize() - OffsetInBlock);
+ uint32_t NumAdditionalBlocks =
+ llvm::alignTo(Size - BytesFromFirstBlock, Pdb.getBlockSize()) /
+ Pdb.getBlockSize();
+
+ auto BlockList = Data->getStreamBlocks();
+ uint32_t RequiredContiguousBlocks = NumAdditionalBlocks + 1;
+ uint32_t E = BlockList[BlockNum];
+ for (uint32_t I = 0; I < RequiredContiguousBlocks; ++I, ++E) {
+ if (BlockList[I + BlockNum] != E)
+ return false;
+ }
+
+ uint32_t FirstBlockAddr = BlockList[BlockNum];
+ auto Result = Pdb.getBlockData(FirstBlockAddr, Pdb.getBlockSize());
+ if (!Result) {
+ consumeError(Result.takeError());
+ return false;
+ }
+ auto Data = Result->drop_front(OffsetInBlock);
+ Buffer = ArrayRef<uint8_t>(Data.data(), Size);
+ return true;
+}
+
+Error MappedBlockStream::readBytes(uint32_t Offset,
+ MutableArrayRef<uint8_t> Buffer) const {
+ uint32_t BlockNum = Offset / Pdb.getBlockSize();
+ uint32_t OffsetInBlock = Offset % Pdb.getBlockSize();
+
+ // Make sure we aren't trying to read beyond the end of the stream.
+ if (Buffer.size() > Data->getLength())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+ if (Offset > Data->getLength() - Buffer.size())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+
+ uint32_t BytesLeft = Buffer.size();
+ uint32_t BytesWritten = 0;
+ uint8_t *WriteBuffer = Buffer.data();
+ auto BlockList = Data->getStreamBlocks();
+ while (BytesLeft > 0) {
+ uint32_t StreamBlockAddr = BlockList[BlockNum];
+
+ auto Result = Pdb.getBlockData(StreamBlockAddr, Pdb.getBlockSize());
+ if (!Result)
+ return Result.takeError();
+
+ auto Data = *Result;
+ const uint8_t *ChunkStart = Data.data() + OffsetInBlock;
+ uint32_t BytesInChunk =
+ std::min(BytesLeft, Pdb.getBlockSize() - OffsetInBlock);
+ ::memcpy(WriteBuffer + BytesWritten, ChunkStart, BytesInChunk);
+
+ BytesWritten += BytesInChunk;
+ BytesLeft -= BytesInChunk;
+ ++BlockNum;
+ OffsetInBlock = 0;
+ }
+
+ return Error::success();
+}
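+
+// Worked example (editor's note): with a 4096-byte block size, a 6000-byte
+// read starting at Offset = 1000 copies 3096 bytes from stream block 0
+// (starting at offset 1000 within it) and the remaining 2904 bytes from the
+// start of stream block 1.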
+
+Error MappedBlockStream::writeBytes(uint32_t Offset,
+ ArrayRef<uint8_t> Buffer) const {
+ // Make sure we aren't trying to write beyond the end of the stream.
+ if (Buffer.size() > Data->getLength())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+
+ if (Offset > Data->getLength() - Buffer.size())
+ return make_error<RawError>(raw_error_code::insufficient_buffer);
+
+ uint32_t BlockNum = Offset / Pdb.getBlockSize();
+ uint32_t OffsetInBlock = Offset % Pdb.getBlockSize();
+
+ uint32_t BytesLeft = Buffer.size();
+ auto BlockList = Data->getStreamBlocks();
+ uint32_t BytesWritten = 0;
+ while (BytesLeft > 0) {
+ uint32_t StreamBlockAddr = BlockList[BlockNum];
+ uint32_t BytesToWriteInChunk =
+ std::min(BytesLeft, Pdb.getBlockSize() - OffsetInBlock);
+
+ const uint8_t *Chunk = Buffer.data() + BytesWritten;
+ ArrayRef<uint8_t> ChunkData(Chunk, BytesToWriteInChunk);
+ if (auto EC = Pdb.setBlockData(StreamBlockAddr, OffsetInBlock, ChunkData))
+ return EC;
+
+ BytesLeft -= BytesToWriteInChunk;
+ BytesWritten += BytesToWriteInChunk;
+ ++BlockNum;
+ OffsetInBlock = 0;
+ }
+
+ // If this write overlapped a read which previously came from the pool,
+ // someone may still be holding a pointer to that allocation, and its
+ // contents are now stale. Compute the overlapping range and update the
+ // cached copy so that any outstanding buffers automatically see the new
+ // data.
+ for (const auto &MapEntry : CacheMap) {
+ // If the end of the written extent precedes the beginning of the cached
+ // extent, ignore this map entry.
+ if (Offset + BytesWritten < MapEntry.first)
+ continue;
+ for (const auto &Alloc : MapEntry.second) {
+ // If the end of the cached extent precedes the beginning of the written
+ // extent, ignore this alloc.
+ if (MapEntry.first + Alloc.size() < Offset)
+ continue;
+
+ // If we get here, they are guaranteed to overlap.
+ Interval WriteInterval = std::make_pair(Offset, Offset + BytesWritten);
+ Interval CachedInterval =
+ std::make_pair(MapEntry.first, MapEntry.first + Alloc.size());
+ // If they overlap, we need to write the new data into the overlapping
+ // range.
+ auto Intersection = intersect(WriteInterval, CachedInterval);
+ assert(Intersection.first <= Intersection.second);
+
+ uint32_t Length = Intersection.second - Intersection.first;
+ uint32_t SrcOffset =
+ AbsoluteDifference(WriteInterval.first, Intersection.first);
+ uint32_t DestOffset =
+ AbsoluteDifference(CachedInterval.first, Intersection.first);
+ ::memcpy(Alloc.data() + DestOffset, Buffer.data() + SrcOffset, Length);
+ }
+ }
+
+ return Error::success();
+}
+
+uint32_t MappedBlockStream::getNumBytesCopied() const {
+ return static_cast<uint32_t>(Pool.getBytesAllocated());
+}
+
+Expected<std::unique_ptr<MappedBlockStream>>
+MappedBlockStream::createIndexedStream(uint32_t StreamIdx,
+ const IPDBFile &File) {
+ if (StreamIdx >= File.getNumStreams())
+ return make_error<RawError>(raw_error_code::no_stream);
+
+ auto Data = llvm::make_unique<IndexedStreamData>(StreamIdx, File);
+ return llvm::make_unique<MappedBlockStreamImpl>(std::move(Data), File);
+}
+
+Expected<std::unique_ptr<MappedBlockStream>>
+MappedBlockStream::createDirectoryStream(const PDBFile &File) {
+ auto Data = llvm::make_unique<DirectoryStreamData>(File);
+ return llvm::make_unique<MappedBlockStreamImpl>(std::move(Data), File);
+}
diff --git a/lib/DebugInfo/PDB/Raw/ModInfo.cpp b/lib/DebugInfo/PDB/Raw/ModInfo.cpp
new file mode 100644
index 000000000000..bae135f77bc0
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/ModInfo.cpp
@@ -0,0 +1,127 @@
+//===- ModInfo.cpp - PDB module information -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
+
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+namespace {
+
+struct SCBytes {
+ ulittle16_t Section;
+ char Padding1[2];
+ little32_t Offset;
+ little32_t Size;
+ ulittle32_t Characteristics;
+ ulittle16_t ModuleIndex;
+ char Padding2[2];
+ ulittle32_t DataCrc;
+ ulittle32_t RelocCrc;
+};
+
+// struct Flags {
+// uint16_t fWritten : 1; // True if ModInfo is dirty
+// uint16_t fECEnabled : 1; // Is EC (edit-and-continue) symbolic info present?
+// uint16_t unused : 6; // Reserved
+// uint16_t iTSM : 8; // Type Server Index for this module
+//};
+const uint16_t HasECFlagMask = 0x2;
+
+const uint16_t TypeServerIndexMask = 0xFF00;
+const uint16_t TypeServerIndexShift = 8;
+}
+
+struct ModInfo::FileLayout {
+ ulittle32_t Mod; // Currently opened module. This field is a
+ // pointer in the reference implementation, but
+ // that won't work on 64-bit systems, and anyway
+ // it doesn't make sense to read a pointer from a
+ // file. For now it is unused, so just ignore it.
+ SCBytes SC; // First section contribution of this module.
+ ulittle16_t Flags; // See Flags definition.
+ ulittle16_t ModDiStream; // Stream Number of module debug info
+ ulittle32_t SymBytes; // Size of local symbol debug info in above stream
+ ulittle32_t LineBytes; // Size of line number debug info in above stream
+ ulittle32_t C13Bytes; // Size of C13 line number info in above stream
+ ulittle16_t NumFiles; // Number of files contributing to this module
+ char Padding1[2]; // Padding so the next field is 4-byte aligned.
+ ulittle32_t FileNameOffs; // array of [0..NumFiles) DBI name buffer offsets.
+ // This field is a pointer in the reference
+ // implementation, but as with `Mod`, we ignore it
+ // for now since it is unused.
+ ulittle32_t SrcFileNameNI; // Name Index for src file name
+ ulittle32_t PdbFilePathNI; // Name Index for path to compiler PDB
+ // Null terminated Module name
+ // Null terminated Obj File Name
+};
+
+ModInfo::ModInfo() : Layout(nullptr) {}
+
+ModInfo::ModInfo(const ModInfo &Info)
+ : ModuleName(Info.ModuleName), ObjFileName(Info.ObjFileName),
+ Layout(Info.Layout) {}
+
+ModInfo::~ModInfo() {}
+
+Error ModInfo::initialize(codeview::StreamRef Stream, ModInfo &Info) {
+ codeview::StreamReader Reader(Stream);
+ if (auto EC = Reader.readObject(Info.Layout))
+ return EC;
+
+ if (auto EC = Reader.readZeroString(Info.ModuleName))
+ return EC;
+
+ if (auto EC = Reader.readZeroString(Info.ObjFileName))
+ return EC;
+ return Error::success();
+}
+
+bool ModInfo::hasECInfo() const { return (Layout->Flags & HasECFlagMask) != 0; }
+
+uint16_t ModInfo::getTypeServerIndex() const {
+ return (Layout->Flags & TypeServerIndexMask) >> TypeServerIndexShift;
+}
+
+uint16_t ModInfo::getModuleStreamIndex() const { return Layout->ModDiStream; }
+
+uint32_t ModInfo::getSymbolDebugInfoByteSize() const {
+ return Layout->SymBytes;
+}
+
+uint32_t ModInfo::getLineInfoByteSize() const { return Layout->LineBytes; }
+
+uint32_t ModInfo::getC13LineInfoByteSize() const { return Layout->C13Bytes; }
+
+uint32_t ModInfo::getNumberOfFiles() const { return Layout->NumFiles; }
+
+uint32_t ModInfo::getSourceFileNameIndex() const {
+ return Layout->SrcFileNameNI;
+}
+
+uint32_t ModInfo::getPdbFilePathNameIndex() const {
+ return Layout->PdbFilePathNI;
+}
+
+StringRef ModInfo::getModuleName() const { return ModuleName; }
+
+StringRef ModInfo::getObjFileName() const { return ObjFileName; }
+
+uint32_t ModInfo::getRecordLength() const {
+ uint32_t M = ModuleName.str().size() + 1;
+ uint32_t O = ObjFileName.str().size() + 1;
+ uint32_t Size = sizeof(FileLayout) + M + O;
+ Size = llvm::alignTo(Size, 4);
+ return Size;
+}
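+
+// For example (editor's note): a 6-character module name and an 8-character
+// object file name contribute 7 + 9 = 16 bytes of string data on top of
+// sizeof(FileLayout), and the total is then rounded up to a multiple of 4.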
diff --git a/lib/DebugInfo/PDB/Raw/ModStream.cpp b/lib/DebugInfo/PDB/Raw/ModStream.cpp
new file mode 100644
index 000000000000..3415fcd47790
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/ModStream.cpp
@@ -0,0 +1,82 @@
+//===- ModStream.cpp - PDB Module Info Stream Access ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/ModStream.h"
+
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+ModStream::ModStream(const ModInfo &Module,
+ std::unique_ptr<MappedBlockStream> Stream)
+ : Mod(Module), Stream(std::move(Stream)) {}
+
+ModStream::~ModStream() {}
+
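+// The module debug stream, as parsed by reload() below, is laid out as
+// (editor's summary of the reads performed):
+//   ulittle32_t SymbolSubstreamSig
+//   uint8_t Symbols[SymbolDebugInfoByteSize - 4]
+//   uint8_t C11LineInfo[LineInfoByteSize]
+//   uint8_t C13LineInfo[C13LineInfoByteSize]
+//   ulittle32_t GlobalRefsSize
+//   uint8_t GlobalRefs[GlobalRefsSize]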
+Error ModStream::reload() {
+ codeview::StreamReader Reader(*Stream);
+
+ uint32_t SymbolSize = Mod.getSymbolDebugInfoByteSize();
+ uint32_t C11Size = Mod.getLineInfoByteSize();
+ uint32_t C13Size = Mod.getC13LineInfoByteSize();
+
+ if (C11Size > 0 && C13Size > 0)
+ return llvm::make_error<RawError>(raw_error_code::corrupt_file,
+ "Module has both C11 and C13 line info");
+
+ codeview::StreamRef S;
+
+ uint32_t SymbolSubstreamSig = 0;
+ if (auto EC = Reader.readInteger(SymbolSubstreamSig))
+ return EC;
+ if (auto EC = Reader.readArray(SymbolsSubstream, SymbolSize - 4))
+ return EC;
+
+ if (auto EC = Reader.readStreamRef(LinesSubstream, C11Size))
+ return EC;
+ if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size))
+ return EC;
+
+ codeview::StreamReader LineReader(C13LinesSubstream);
+ if (auto EC = LineReader.readArray(LineInfo, LineReader.bytesRemaining()))
+ return EC;
+
+ uint32_t GlobalRefsSize;
+ if (auto EC = Reader.readInteger(GlobalRefsSize))
+ return EC;
+ if (auto EC = Reader.readStreamRef(GlobalRefsSubstream, GlobalRefsSize))
+ return EC;
+ if (Reader.bytesRemaining() > 0)
+ return llvm::make_error<RawError>(raw_error_code::corrupt_file,
+ "Unexpected bytes in module stream.");
+
+ return Error::success();
+}
+
+iterator_range<codeview::CVSymbolArray::Iterator>
+ModStream::symbols(bool *HadError) const {
+ // It's OK if the stream is empty.
+ if (SymbolsSubstream.getUnderlyingStream().getLength() == 0)
+ return llvm::make_range(SymbolsSubstream.end(), SymbolsSubstream.end());
+ return llvm::make_range(SymbolsSubstream.begin(HadError),
+ SymbolsSubstream.end());
+}
+
+iterator_range<codeview::ModuleSubstreamArray::Iterator>
+ModStream::lines(bool *HadError) const {
+ return llvm::make_range(LineInfo.begin(HadError), LineInfo.end());
+}
+
+Error ModStream::commit() { return Error::success(); }
diff --git a/lib/DebugInfo/PDB/Raw/MsfBuilder.cpp b/lib/DebugInfo/PDB/Raw/MsfBuilder.cpp
new file mode 100644
index 000000000000..16b086bdaa98
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/MsfBuilder.cpp
@@ -0,0 +1,279 @@
+//===- MSFBuilder.cpp - MSF Directory & Metadata Builder --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/MsfBuilder.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+using namespace llvm::pdb::msf;
+using namespace llvm::support;
+
+namespace {
+const uint32_t kSuperBlockBlock = 0;
+const uint32_t kFreePageMap0Block = 1;
+const uint32_t kFreePageMap1Block = 2;
+const uint32_t kNumReservedPages = 3;
+
+const uint32_t kDefaultBlockMapAddr = kNumReservedPages;
+}
+
+MsfBuilder::MsfBuilder(uint32_t BlockSize, uint32_t MinBlockCount, bool CanGrow,
+ BumpPtrAllocator &Allocator)
+ : Allocator(Allocator), IsGrowable(CanGrow), BlockSize(BlockSize),
+ MininumBlocks(MinBlockCount), BlockMapAddr(kDefaultBlockMapAddr),
+ FreeBlocks(MinBlockCount, true) {
+ FreeBlocks[kSuperBlockBlock] = false;
+ FreeBlocks[kFreePageMap0Block] = false;
+ FreeBlocks[kFreePageMap1Block] = false;
+ FreeBlocks[BlockMapAddr] = false;
+}
+
+Expected<MsfBuilder> MsfBuilder::create(BumpPtrAllocator &Allocator,
+ uint32_t BlockSize,
+ uint32_t MinBlockCount, bool CanGrow) {
+ if (!msf::isValidBlockSize(BlockSize))
+ return make_error<RawError>(raw_error_code::unspecified,
+ "The requested block size is unsupported");
+
+ return MsfBuilder(BlockSize,
+ std::max(MinBlockCount, msf::getMinimumBlockCount()),
+ CanGrow, Allocator);
+}
+
+Error MsfBuilder::setBlockMapAddr(uint32_t Addr) {
+ if (Addr == BlockMapAddr)
+ return Error::success();
+
+ if (Addr >= FreeBlocks.size()) {
+ if (!IsGrowable)
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Cannot grow the number of blocks");
+ FreeBlocks.resize(Addr + 1);
+ }
+
+ if (!isBlockFree(Addr))
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Attempt to reuse an allocated block");
+ FreeBlocks[BlockMapAddr] = true;
+ FreeBlocks[Addr] = false;
+ BlockMapAddr = Addr;
+ return Error::success();
+}
+
+void MsfBuilder::setFreePageMap(uint32_t Fpm) { FreePageMap = Fpm; }
+
+void MsfBuilder::setUnknown1(uint32_t Unk1) { Unknown1 = Unk1; }
+
+Error MsfBuilder::setDirectoryBlocksHint(ArrayRef<uint32_t> DirBlocks) {
+ for (auto B : DirectoryBlocks)
+ FreeBlocks[B] = true;
+ for (auto B : DirBlocks) {
+ if (!isBlockFree(B)) {
+ return make_error<RawError>(raw_error_code::unspecified,
+ "Attempt to reuse an allocated block");
+ }
+ FreeBlocks[B] = false;
+ }
+
+ DirectoryBlocks = DirBlocks;
+ return Error::success();
+}
+
+Error MsfBuilder::allocateBlocks(uint32_t NumBlocks,
+ MutableArrayRef<uint32_t> Blocks) {
+ if (NumBlocks == 0)
+ return Error::success();
+
+ uint32_t NumFreeBlocks = FreeBlocks.count();
+ if (NumFreeBlocks < NumBlocks) {
+ if (!IsGrowable)
+ return make_error<RawError>(raw_error_code::unspecified,
+ "There are no free Blocks in the file");
+ uint32_t AllocBlocks = NumBlocks - NumFreeBlocks;
+ FreeBlocks.resize(AllocBlocks + FreeBlocks.size(), true);
+ }
+
+ int I = 0;
+ int Block = FreeBlocks.find_first();
+ do {
+ assert(Block != -1 && "We ran out of Blocks!");
+
+ uint32_t NextBlock = static_cast<uint32_t>(Block);
+ Blocks[I++] = NextBlock;
+ FreeBlocks.reset(NextBlock);
+ Block = FreeBlocks.find_next(Block);
+ } while (--NumBlocks > 0);
+ return Error::success();
+}
+
+uint32_t MsfBuilder::getNumUsedBlocks() const {
+ return getTotalBlockCount() - getNumFreeBlocks();
+}
+
+uint32_t MsfBuilder::getNumFreeBlocks() const { return FreeBlocks.count(); }
+
+uint32_t MsfBuilder::getTotalBlockCount() const { return FreeBlocks.size(); }
+
+bool MsfBuilder::isBlockFree(uint32_t Idx) const { return FreeBlocks[Idx]; }
+
+Error MsfBuilder::addStream(uint32_t Size, ArrayRef<uint32_t> Blocks) {
+ // Add a new stream mapped to the specified blocks. Verify that the specified
+ // blocks are both necessary and sufficient for holding the requested number
+ // of bytes, and verify that all requested blocks are free.
+ uint32_t ReqBlocks = bytesToBlocks(Size, BlockSize);
+ if (ReqBlocks != Blocks.size())
+ return make_error<RawError>(
+ raw_error_code::unspecified,
+ "Incorrect number of blocks for requested stream size");
+ for (auto Block : Blocks) {
+ if (Block >= FreeBlocks.size())
+ FreeBlocks.resize(Block + 1, true);
+
+ if (!FreeBlocks.test(Block))
+ return make_error<RawError>(
+ raw_error_code::unspecified,
+ "Attempt to re-use an already allocated block");
+ }
+ // Mark all the blocks occupied by the new stream as not free.
+ for (auto Block : Blocks) {
+ FreeBlocks.reset(Block);
+ }
+ StreamData.push_back(std::make_pair(Size, Blocks));
+ return Error::success();
+}
+
+Error MsfBuilder::addStream(uint32_t Size) {
+ uint32_t ReqBlocks = bytesToBlocks(Size, BlockSize);
+ std::vector<uint32_t> NewBlocks;
+ NewBlocks.resize(ReqBlocks);
+ if (auto EC = allocateBlocks(ReqBlocks, NewBlocks))
+ return EC;
+ StreamData.push_back(std::make_pair(Size, NewBlocks));
+ return Error::success();
+}
+
+Error MsfBuilder::setStreamSize(uint32_t Idx, uint32_t Size) {
+ uint32_t OldSize = getStreamSize(Idx);
+ if (OldSize == Size)
+ return Error::success();
+
+ uint32_t NewBlocks = bytesToBlocks(Size, BlockSize);
+ uint32_t OldBlocks = bytesToBlocks(OldSize, BlockSize);
+
+ if (NewBlocks > OldBlocks) {
+ uint32_t AddedBlocks = NewBlocks - OldBlocks;
+ // If we're growing, we have to allocate new Blocks.
+ std::vector<uint32_t> AddedBlockList;
+ AddedBlockList.resize(AddedBlocks);
+ if (auto EC = allocateBlocks(AddedBlocks, AddedBlockList))
+ return EC;
+ auto &CurrentBlocks = StreamData[Idx].second;
+ CurrentBlocks.insert(CurrentBlocks.end(), AddedBlockList.begin(),
+ AddedBlockList.end());
+ } else if (OldBlocks > NewBlocks) {
+ // For shrinking, free all the Blocks in the Block map, update the stream
+ // data, then shrink the directory.
+ uint32_t RemovedBlocks = OldBlocks - NewBlocks;
+ auto CurrentBlocks = ArrayRef<uint32_t>(StreamData[Idx].second);
+ auto RemovedBlockList = CurrentBlocks.drop_front(NewBlocks);
+ for (auto P : RemovedBlockList)
+ FreeBlocks[P] = true;
+ StreamData[Idx].second = CurrentBlocks.drop_back(RemovedBlocks);
+ }
+
+ StreamData[Idx].first = Size;
+ return Error::success();
+}
+
+uint32_t MsfBuilder::getNumStreams() const { return StreamData.size(); }
+
+uint32_t MsfBuilder::getStreamSize(uint32_t StreamIdx) const {
+ return StreamData[StreamIdx].first;
+}
+
+ArrayRef<uint32_t> MsfBuilder::getStreamBlocks(uint32_t StreamIdx) const {
+ return StreamData[StreamIdx].second;
+}
+
+uint32_t MsfBuilder::computeDirectoryByteSize() const {
+ // The directory has the following layout, where each item is a ulittle32_t:
+ // NumStreams
+ // StreamSizes[NumStreams]
+ // StreamBlocks[NumStreams][]
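+ // For example (editor's note), two streams occupying 3 and 1 blocks
+ // respectively serialize to (1 + 2 + 3 + 1) * sizeof(ulittle32_t) = 28 bytes.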
+ uint32_t Size = sizeof(ulittle32_t); // NumStreams
+ Size += StreamData.size() * sizeof(ulittle32_t); // StreamSizes
+ for (const auto &D : StreamData) {
+ uint32_t ExpectedNumBlocks = bytesToBlocks(D.first, BlockSize);
+ assert(ExpectedNumBlocks == D.second.size() &&
+ "Unexpected number of blocks");
+ Size += ExpectedNumBlocks * sizeof(ulittle32_t);
+ }
+ return Size;
+}
+
+Expected<Layout> MsfBuilder::build() {
+ Layout L;
+ L.SB = Allocator.Allocate<SuperBlock>();
+ std::memcpy(L.SB->MagicBytes, Magic, sizeof(Magic));
+ L.SB->BlockMapAddr = BlockMapAddr;
+ L.SB->BlockSize = BlockSize;
+ L.SB->NumDirectoryBytes = computeDirectoryByteSize();
+ L.SB->FreeBlockMapBlock = FreePageMap;
+ L.SB->Unknown1 = Unknown1;
+
+ uint32_t NumDirectoryBlocks =
+ bytesToBlocks(L.SB->NumDirectoryBytes, BlockSize);
+ if (NumDirectoryBlocks > DirectoryBlocks.size()) {
+ // Our hint wasn't enough to satisfy the entire directory. Allocate
+ // remaining pages.
+ std::vector<uint32_t> ExtraBlocks;
+ uint32_t NumExtraBlocks = NumDirectoryBlocks - DirectoryBlocks.size();
+ ExtraBlocks.resize(NumExtraBlocks);
+ if (auto EC = allocateBlocks(NumExtraBlocks, ExtraBlocks))
+ return std::move(EC);
+ DirectoryBlocks.insert(DirectoryBlocks.end(), ExtraBlocks.begin(),
+ ExtraBlocks.end());
+ } else if (NumDirectoryBlocks < DirectoryBlocks.size()) {
+ uint32_t NumUnnecessaryBlocks = DirectoryBlocks.size() - NumDirectoryBlocks;
+ for (auto B :
+ ArrayRef<uint32_t>(DirectoryBlocks).drop_back(NumUnnecessaryBlocks))
+ FreeBlocks[B] = true;
+ DirectoryBlocks.resize(NumDirectoryBlocks);
+ }
+
+ // Don't set the number of blocks in the file until after allocating blocks
+ // for the directory, since the allocation might cause the file to grow.
+ L.SB->NumBlocks = FreeBlocks.size();
+
+ ulittle32_t *DirBlocks = Allocator.Allocate<ulittle32_t>(NumDirectoryBlocks);
+ std::uninitialized_copy_n(DirectoryBlocks.begin(), NumDirectoryBlocks,
+ DirBlocks);
+ L.DirectoryBlocks = ArrayRef<ulittle32_t>(DirBlocks, NumDirectoryBlocks);
+
+ // The stream sizes should be re-allocated as a stable pointer and the stream
+ // map should have each of its entries allocated as a separate stable pointer.
+ if (StreamData.size() > 0) {
+ ulittle32_t *Sizes = Allocator.Allocate<ulittle32_t>(StreamData.size());
+ L.StreamSizes = ArrayRef<ulittle32_t>(Sizes, StreamData.size());
+ L.StreamMap.resize(StreamData.size());
+ for (uint32_t I = 0; I < StreamData.size(); ++I) {
+ Sizes[I] = StreamData[I].first;
+ ulittle32_t *BlockList =
+ Allocator.Allocate<ulittle32_t>(StreamData[I].second.size());
+ std::uninitialized_copy_n(StreamData[I].second.begin(),
+ StreamData[I].second.size(), BlockList);
+ L.StreamMap[I] =
+ ArrayRef<ulittle32_t>(BlockList, StreamData[I].second.size());
+ }
+ }
+
+ return L;
+}
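+
+// Typical use (editor's sketch; Alloc is a BumpPtrAllocator and the arguments
+// are illustrative):
+//   auto B = MsfBuilder::create(Alloc, /*BlockSize=*/4096,
+//                               msf::getMinimumBlockCount(), /*CanGrow=*/true);
+//   if (B && !B->addStream(100))
+//     auto LayoutOrErr = B->build();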
diff --git a/lib/DebugInfo/PDB/Raw/MsfCommon.cpp b/lib/DebugInfo/PDB/Raw/MsfCommon.cpp
new file mode 100644
index 000000000000..5d97f33e1103
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/MsfCommon.cpp
@@ -0,0 +1,48 @@
+//===- MsfCommon.cpp - Common types and functions for MSF files -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/MsfCommon.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::pdb::msf;
+
+Error llvm::pdb::msf::validateSuperBlock(const SuperBlock &SB) {
+ // Check the magic bytes.
+ if (std::memcmp(SB.MagicBytes, Magic, sizeof(Magic)) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "MSF magic header doesn't match");
+
+ if (!isValidBlockSize(SB.BlockSize))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported block size.");
+
+ // We don't support directories whose sizes aren't a multiple of four bytes.
+ if (SB.NumDirectoryBytes % sizeof(support::ulittle32_t) != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Directory size is not multiple of 4.");
+
+ // The number of blocks which comprise the directory is a simple function of
+ // the number of bytes it contains.
+ uint64_t NumDirectoryBlocks =
+ bytesToBlocks(SB.NumDirectoryBytes, SB.BlockSize);
+
+ // The directory, as we understand it, is a block which consists of a list of
+ // block numbers. It is unclear what would happen if the list of block
+ // numbers didn't fit in a single block.
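+ // For example (editor's note), with a 4096-byte block the directory can span
+ // at most 4096 / sizeof(ulittle32_t) = 1024 blocks, i.e. about 4 MiB.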
+ if (NumDirectoryBlocks > SB.BlockSize / sizeof(support::ulittle32_t))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Too many directory blocks.");
+
+ if (SB.BlockMapAddr == 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Block 0 is reserved");
+
+ return Error::success();
+}
diff --git a/lib/DebugInfo/PDB/Raw/NameHashTable.cpp b/lib/DebugInfo/PDB/Raw/NameHashTable.cpp
new file mode 100644
index 000000000000..ae4ebf27721e
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/NameHashTable.cpp
@@ -0,0 +1,104 @@
+//===- NameHashTable.cpp - PDB Name Hash Table ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/PDB/Raw/Hash.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support;
+using namespace llvm::pdb;
+
+NameHashTable::NameHashTable() : Signature(0), HashVersion(0), NameCount(0) {}
+
+Error NameHashTable::load(codeview::StreamReader &Stream) {
+ struct Header {
+ support::ulittle32_t Signature;
+ support::ulittle32_t HashVersion;
+ support::ulittle32_t ByteSize;
+ };
+
+ const Header *H;
+ if (auto EC = Stream.readObject(H))
+ return EC;
+
+ if (H->Signature != 0xEFFEEFFE)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid hash table signature");
+ if (H->HashVersion != 1 && H->HashVersion != 2)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported hash version");
+
+ Signature = H->Signature;
+ HashVersion = H->HashVersion;
+ if (auto EC = Stream.readStreamRef(NamesBuffer, H->ByteSize))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid hash table byte length"));
+
+ const support::ulittle32_t *HashCount;
+ if (auto EC = Stream.readObject(HashCount))
+ return EC;
+
+ if (auto EC = Stream.readArray(IDs, *HashCount))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read bucket array"));
+
+ if (Stream.bytesRemaining() < sizeof(support::ulittle32_t))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Missing name count");
+
+ if (auto EC = Stream.readInteger(NameCount))
+ return EC;
+ return Error::success();
+}
+
+StringRef NameHashTable::getStringForID(uint32_t ID) const {
+ if (ID == IDs[0])
+ return StringRef();
+
+ // NamesBuffer is a buffer of null-terminated strings laid out back to back.
+ // ID is the starting offset of the string we're looking for, so seek to that
+ // offset and read a null-terminated string from there.
+ StringRef Result;
+ codeview::StreamReader NameReader(NamesBuffer);
+ NameReader.setOffset(ID);
+ if (auto EC = NameReader.readZeroString(Result))
+ consumeError(std::move(EC));
+ return Result;
+}
+
+uint32_t NameHashTable::getIDForString(StringRef Str) const {
+ uint32_t Hash = (HashVersion == 1) ? hashStringV1(Str) : hashStringV2(Str);
+ size_t Count = IDs.size();
+ uint32_t Start = Hash % Count;
+ for (size_t I = 0; I < Count; ++I) {
+ // The hash is just a starting point for the search, but if it
+ // doesn't work we should find the string no matter what, because
+ // we iterate the entire array.
+ uint32_t Index = (Start + I) % Count;
+
+ uint32_t ID = IDs[Index];
+ StringRef S = getStringForID(ID);
+ if (S == Str)
+ return ID;
+ }
+ // IDs[0] contains the ID of the "invalid" entry.
+ return IDs[0];
+}
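+
+// Editor's note: for a string present in the table this gives the round trip
+// getStringForID(getIDForString(S)) == S; for an absent string the probe scans
+// every bucket and falls through to IDs[0], the "invalid" ID.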
+
+codeview::FixedStreamArray<support::ulittle32_t>
+NameHashTable::name_ids() const {
+ return IDs;
+}
diff --git a/lib/DebugInfo/PDB/Raw/NameMap.cpp b/lib/DebugInfo/PDB/Raw/NameMap.cpp
new file mode 100644
index 000000000000..b8a4eb79a48c
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/NameMap.cpp
@@ -0,0 +1,213 @@
+//===- NameMap.cpp - PDB Name Map -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+NameMap::NameMap() {}
+
+Error NameMap::load(codeview::StreamReader &Stream) {
+
+ // This is some sort of weird string-set/hash table encoded in the stream.
+ // It starts with the number of bytes in the table.
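+ // Roughly (editor's summary of the reads below):
+ //   ulittle32_t StringDataBytes
+ //   char Strings[StringDataBytes] // null-terminated, back to back
+ //   ulittle32_t HashSize
+ //   ulittle32_t MaxNumberOfStrings
+ //   ulittle32_t NumPresentWords, then NumPresentWords bitmask words
+ //   ulittle32_t NumDeletedWords, then NumDeletedWords bitmask words
+ //   { ulittle32_t NameOffset; ulittle32_t StreamIndex; } per present bit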
+ uint32_t NumberOfBytes;
+ if (auto EC = Stream.readInteger(NumberOfBytes))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map length"));
+ if (Stream.bytesRemaining() < NumberOfBytes)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid name map length");
+
+ // Following that field is the starting offset of strings in the name table.
+ uint32_t StringsOffset = Stream.getOffset();
+ Stream.setOffset(StringsOffset + NumberOfBytes);
+
+ // This appears to be equivalent to the total number of strings *actually*
+ // in the name table.
+ uint32_t HashSize;
+ if (auto EC = Stream.readInteger(HashSize))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map hash size"));
+
+ // This appears to be an upper bound on the number of strings in the name
+ // table.
+ uint32_t MaxNumberOfStrings;
+ if (auto EC = Stream.readInteger(MaxNumberOfStrings))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map max strings"));
+
+ if (MaxNumberOfStrings > (UINT32_MAX / sizeof(uint32_t)))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Implausible number of strings");
+
+ const uint32_t MaxNumberOfWords = UINT32_MAX / (sizeof(uint32_t) * 8);
+
+ // This appears to be a hash table which uses bitfields to determine whether
+ // or not a bucket is 'present'.
+ uint32_t NumPresentWords;
+ if (auto EC = Stream.readInteger(NumPresentWords))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map num words"));
+
+ if (NumPresentWords > MaxNumberOfWords)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Number of present words is too large");
+
+ SparseBitVector<> Present;
+ for (uint32_t I = 0; I != NumPresentWords; ++I) {
+ uint32_t Word;
+ if (auto EC = Stream.readInteger(Word))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map word"));
+ for (unsigned Idx = 0; Idx < 32; ++Idx)
+ if (Word & (1U << Idx))
+ Present.set((I * 32) + Idx);
+ }
+
+ // This appears to be a hash table which uses bitfields to determine whether
+ // or not a bucket is 'deleted'.
+ uint32_t NumDeletedWords;
+ if (auto EC = Stream.readInteger(NumDeletedWords))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map num deleted words"));
+
+ if (NumDeletedWords > MaxNumberOfWords)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Number of deleted words is too large");
+
+ SparseBitVector<> Deleted;
+ for (uint32_t I = 0; I != NumDeletedWords; ++I) {
+ uint32_t Word;
+ if (auto EC = Stream.readInteger(Word))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map word"));
+ for (unsigned Idx = 0; Idx < 32; ++Idx)
+ if (Word & (1U << Idx))
+ Deleted.set((I * 32) + Idx);
+ }
+
+ for (unsigned I : Present) {
+ // For all present entries, dump out their mapping.
+ (void)I;
+
+ // This appears to be an offset relative to the start of the strings.
+ // It tells us where the null-terminated string begins.
+ uint32_t NameOffset;
+ if (auto EC = Stream.readInteger(NameOffset))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map name offset"));
+
+ // This appears to be a stream number into the stream directory.
+ uint32_t NameIndex;
+ if (auto EC = Stream.readInteger(NameIndex))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map name index"));
+
+ // Compute the offset of the start of the string relative to the stream.
+ uint32_t StringOffset = StringsOffset + NameOffset;
+ uint32_t OldOffset = Stream.getOffset();
+ // Pump out our c-string from the stream.
+ StringRef Str;
+ Stream.setOffset(StringOffset);
+ if (auto EC = Stream.readZeroString(Str))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map name"));
+
+ Stream.setOffset(OldOffset);
+ // Add this to a string-map from name to stream number.
+ Mapping.insert({Str, NameIndex});
+ }
+
+ return Error::success();
+}
+
+Error NameMap::commit(codeview::StreamWriter &Writer) {
+ // The first field is the number of bytes of string data. So add
+ // up the length of all strings plus a null terminator for each
+ // one.
+ uint32_t NumBytes = 0;
+ for (auto B = Mapping.begin(), E = Mapping.end(); B != E; ++B) {
+ NumBytes += B->getKeyLength() + 1;
+ }
+
+ if (auto EC = Writer.writeInteger(NumBytes)) // Number of bytes of string data
+ return EC;
+ // Now all of the string data itself.
+ for (auto B = Mapping.begin(), E = Mapping.end(); B != E; ++B) {
+ if (auto EC = Writer.writeZeroString(B->getKey()))
+ return EC;
+ }
+
+ if (auto EC = Writer.writeInteger(Mapping.size())) // Hash Size
+ return EC;
+
+ if (auto EC = Writer.writeInteger(Mapping.size())) // Max Number of Strings
+ return EC;
+
+ if (auto EC = Writer.writeInteger(Mapping.size())) // Num Present Words
+ return EC;
+
+ // For each entry in the mapping, write a bit mask representing the bucket it
+ // would be stored in. We don't rely on the value when reading, so it isn't
+ // important to us; it just has to be there.
+ for (auto B = Mapping.begin(), E = Mapping.end(); B != E; ++B) {
+ if (auto EC = Writer.writeInteger(1U))
+ return EC;
+ }
+
+ if (auto EC = Writer.writeInteger(0U)) // Num Deleted Words
+ return EC;
+
+ // Mappings of each word.
+ uint32_t OffsetSoFar = 0;
+ for (auto B = Mapping.begin(), E = Mapping.end(); B != E; ++B) {
+ // This is a list of key value pairs where the key is the offset into the
+ // strings buffer, and the value is a stream number. Write each pair.
+ if (auto EC = Writer.writeInteger(OffsetSoFar))
+ return EC;
+
+ if (auto EC = Writer.writeInteger(B->second))
+ return EC;
+
+ OffsetSoFar += B->getKeyLength() + 1;
+ }
+
+ return Error::success();
+}
+
+iterator_range<StringMapConstIterator<uint32_t>> NameMap::entries() const {
+ return llvm::make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
+ Mapping.end());
+}
+
+bool NameMap::tryGetValue(StringRef Name, uint32_t &Value) const {
+ auto Iter = Mapping.find(Name);
+ if (Iter == Mapping.end())
+ return false;
+ Value = Iter->second;
+ return true;
+}
diff --git a/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp b/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
new file mode 100644
index 000000000000..41c6c2cd810a
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
@@ -0,0 +1,50 @@
+//===- NameMapBuilder.cpp - PDB Name Map Builder ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/NameMapBuilder.h"
+
+#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NameMapBuilder::NameMapBuilder() {}
+
+void NameMapBuilder::addMapping(StringRef Name, uint32_t Mapping) {
+ StringDataBytes += Name.size() + 1;
+ Map.insert({Name, Mapping});
+}
+
+Expected<std::unique_ptr<NameMap>> NameMapBuilder::build() {
+ auto Result = llvm::make_unique<NameMap>();
+ Result->Mapping = Map;
+ return std::move(Result);
+}
+
+uint32_t NameMapBuilder::calculateSerializedLength() const {
+ uint32_t TotalLength = 0;
+
+ TotalLength += sizeof(support::ulittle32_t); // StringDataBytes value
+ TotalLength += StringDataBytes; // actual string data
+
+ TotalLength += sizeof(support::ulittle32_t); // Hash Size
+ TotalLength += sizeof(support::ulittle32_t); // Max Number of Strings
+ TotalLength += sizeof(support::ulittle32_t); // Num Present Words
+ // One bitmask word for each present entry
+ TotalLength += Map.size() * sizeof(support::ulittle32_t);
+ TotalLength += sizeof(support::ulittle32_t); // Num Deleted Words
+
+  // We treat the number of present words as equal to the number of entries in
+  // the table, and for each entry we have a pair of integers: an offset into
+  // the string data and the corresponding stream number.
+ TotalLength += Map.size() * 2 * sizeof(support::ulittle32_t);
+
+ return TotalLength;
+}
diff --git a/lib/DebugInfo/PDB/Raw/PDBFile.cpp b/lib/DebugInfo/PDB/Raw/PDBFile.cpp
new file mode 100644
index 000000000000..95016753dc14
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/PDBFile.cpp
@@ -0,0 +1,365 @@
+//===- PDBFile.cpp - Low level interface to a PDB file ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/StreamArray.h"
+#include "llvm/DebugInfo/CodeView/StreamInterface.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Raw/DirectoryStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
+#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+namespace {
+typedef FixedStreamArray<support::ulittle32_t> ulittle_array;
+}
+
+PDBFile::PDBFile(std::unique_ptr<StreamInterface> PdbFileBuffer)
+ : Buffer(std::move(PdbFileBuffer)), SB(nullptr) {}
+
+PDBFile::~PDBFile() {}
+
+uint32_t PDBFile::getBlockSize() const { return SB->BlockSize; }
+
+uint32_t PDBFile::getFreeBlockMapBlock() const { return SB->FreeBlockMapBlock; }
+
+uint32_t PDBFile::getBlockCount() const { return SB->NumBlocks; }
+
+uint32_t PDBFile::getNumDirectoryBytes() const { return SB->NumDirectoryBytes; }
+
+uint32_t PDBFile::getBlockMapIndex() const { return SB->BlockMapAddr; }
+
+uint32_t PDBFile::getUnknown1() const { return SB->Unknown1; }
+
+uint32_t PDBFile::getNumDirectoryBlocks() const {
+ return msf::bytesToBlocks(SB->NumDirectoryBytes, SB->BlockSize);
+}
+
+uint64_t PDBFile::getBlockMapOffset() const {
+ return (uint64_t)SB->BlockMapAddr * SB->BlockSize;
+}
+
+uint32_t PDBFile::getNumStreams() const { return StreamSizes.size(); }
+
+uint32_t PDBFile::getStreamByteSize(uint32_t StreamIndex) const {
+ return StreamSizes[StreamIndex];
+}
+
+ArrayRef<support::ulittle32_t>
+PDBFile::getStreamBlockList(uint32_t StreamIndex) const {
+ return StreamMap[StreamIndex];
+}
+
+uint32_t PDBFile::getFileSize() const { return Buffer->getLength(); }
+
+Expected<ArrayRef<uint8_t>> PDBFile::getBlockData(uint32_t BlockIndex,
+ uint32_t NumBytes) const {
+ uint64_t StreamBlockOffset = msf::blockToOffset(BlockIndex, getBlockSize());
+
+ ArrayRef<uint8_t> Result;
+ if (auto EC = Buffer->readBytes(StreamBlockOffset, NumBytes, Result))
+ return std::move(EC);
+ return Result;
+}
+
+Error PDBFile::setBlockData(uint32_t BlockIndex, uint32_t Offset,
+ ArrayRef<uint8_t> Data) const {
+ if (Offset >= getBlockSize())
+ return make_error<RawError>(
+ raw_error_code::invalid_block_address,
+ "setBlockData attempted to write out of block bounds.");
+ if (Data.size() > getBlockSize() - Offset)
+ return make_error<RawError>(
+ raw_error_code::invalid_block_address,
+ "setBlockData attempted to write out of block bounds.");
+
+ uint64_t StreamBlockOffset = msf::blockToOffset(BlockIndex, getBlockSize());
+ StreamBlockOffset += Offset;
+ return Buffer->writeBytes(StreamBlockOffset, Data);
+}
+
+Error PDBFile::parseFileHeaders() {
+ StreamReader Reader(*Buffer);
+
+ if (auto EC = Reader.readObject(SB)) {
+ consumeError(std::move(EC));
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Does not contain superblock");
+ }
+
+ if (auto EC = setSuperBlock(SB))
+ return EC;
+
+ Reader.setOffset(getBlockMapOffset());
+ if (auto EC = Reader.readArray(DirectoryBlocks, getNumDirectoryBlocks()))
+ return EC;
+
+ return Error::success();
+}
+
+Error PDBFile::parseStreamData() {
+ assert(SB);
+ if (DirectoryStream)
+ return Error::success();
+
+ uint32_t NumStreams = 0;
+
+  // Normally you can't use a MappedBlockStream without having fully parsed the
+  // PDB file, because it accesses the stream directory and various other
+  // fields, which are exactly what we are attempting to parse here. By
+  // specifying a custom subclass of IPDBStreamData which only accesses the
+  // fields that have already been parsed, we can avoid this and reuse
+  // MappedBlockStream.
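+  //
+  // The stream directory itself is laid out as the number of streams, then one
+  // size per stream, then for each stream the list of blocks it occupies; that
+  // is exactly what the reads below walk through.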
+ auto DS = MappedBlockStream::createDirectoryStream(*this);
+ if (!DS)
+ return DS.takeError();
+ StreamReader Reader(**DS);
+ if (auto EC = Reader.readInteger(NumStreams))
+ return EC;
+
+ if (auto EC = Reader.readArray(StreamSizes, NumStreams))
+ return EC;
+ for (uint32_t I = 0; I < NumStreams; ++I) {
+ uint32_t StreamSize = getStreamByteSize(I);
+ // FIXME: What does StreamSize ~0U mean?
+ uint64_t NumExpectedStreamBlocks =
+ StreamSize == UINT32_MAX ? 0 : msf::bytesToBlocks(StreamSize,
+ SB->BlockSize);
+
+    // We store the block array contiguously so that a caller of setStreamMap()
+    // can pass an ArrayRef instead of setting up a StreamRef. Because the
+    // DirectoryStream is cached in the class and thus lives for the life of
+    // the class, readArray() is guaranteed to return a stable reference, even
+    // if it has to allocate from its internal pool.
+ ArrayRef<support::ulittle32_t> Blocks;
+ if (auto EC = Reader.readArray(Blocks, NumExpectedStreamBlocks))
+ return EC;
+ for (uint32_t Block : Blocks) {
+ uint64_t BlockEndOffset = (uint64_t)(Block + 1) * SB->BlockSize;
+ if (BlockEndOffset > getFileSize())
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Stream block map is corrupt.");
+ }
+ StreamMap.push_back(Blocks);
+ }
+
+ // We should have read exactly SB->NumDirectoryBytes bytes.
+ assert(Reader.bytesRemaining() == 0);
+ DirectoryStream = std::move(*DS);
+ return Error::success();
+}
+
+llvm::ArrayRef<support::ulittle32_t> PDBFile::getDirectoryBlockArray() const {
+ return DirectoryBlocks;
+}
+
+Expected<InfoStream &> PDBFile::getPDBInfoStream() {
+ if (!Info) {
+ auto InfoS = MappedBlockStream::createIndexedStream(StreamPDB, *this);
+ if (!InfoS)
+ return InfoS.takeError();
+ auto TempInfo = llvm::make_unique<InfoStream>(std::move(*InfoS));
+ if (auto EC = TempInfo->reload())
+ return std::move(EC);
+ Info = std::move(TempInfo);
+ }
+ return *Info;
+}
+
+Expected<DbiStream &> PDBFile::getPDBDbiStream() {
+ if (!Dbi) {
+ auto DbiS = MappedBlockStream::createIndexedStream(StreamDBI, *this);
+ if (!DbiS)
+ return DbiS.takeError();
+ auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(*DbiS));
+ if (auto EC = TempDbi->reload())
+ return std::move(EC);
+ Dbi = std::move(TempDbi);
+ }
+ return *Dbi;
+}
+
+Expected<TpiStream &> PDBFile::getPDBTpiStream() {
+ if (!Tpi) {
+ auto TpiS = MappedBlockStream::createIndexedStream(StreamTPI, *this);
+ if (!TpiS)
+ return TpiS.takeError();
+ auto TempTpi = llvm::make_unique<TpiStream>(*this, std::move(*TpiS));
+ if (auto EC = TempTpi->reload())
+ return std::move(EC);
+ Tpi = std::move(TempTpi);
+ }
+ return *Tpi;
+}
+
+Expected<TpiStream &> PDBFile::getPDBIpiStream() {
+ if (!Ipi) {
+ auto IpiS = MappedBlockStream::createIndexedStream(StreamIPI, *this);
+ if (!IpiS)
+ return IpiS.takeError();
+ auto TempIpi = llvm::make_unique<TpiStream>(*this, std::move(*IpiS));
+ if (auto EC = TempIpi->reload())
+ return std::move(EC);
+ Ipi = std::move(TempIpi);
+ }
+ return *Ipi;
+}
+
+Expected<PublicsStream &> PDBFile::getPDBPublicsStream() {
+ if (!Publics) {
+ auto DbiS = getPDBDbiStream();
+ if (!DbiS)
+ return DbiS.takeError();
+
+ uint32_t PublicsStreamNum = DbiS->getPublicSymbolStreamIndex();
+
+ auto PublicS =
+ MappedBlockStream::createIndexedStream(PublicsStreamNum, *this);
+ if (!PublicS)
+ return PublicS.takeError();
+ auto TempPublics =
+ llvm::make_unique<PublicsStream>(*this, std::move(*PublicS));
+ if (auto EC = TempPublics->reload())
+ return std::move(EC);
+ Publics = std::move(TempPublics);
+ }
+ return *Publics;
+}
+
+Expected<SymbolStream &> PDBFile::getPDBSymbolStream() {
+ if (!Symbols) {
+ auto DbiS = getPDBDbiStream();
+ if (!DbiS)
+ return DbiS.takeError();
+
+ uint32_t SymbolStreamNum = DbiS->getSymRecordStreamIndex();
+
+ auto SymbolS =
+ MappedBlockStream::createIndexedStream(SymbolStreamNum, *this);
+ if (!SymbolS)
+ return SymbolS.takeError();
+ auto TempSymbols = llvm::make_unique<SymbolStream>(std::move(*SymbolS));
+ if (auto EC = TempSymbols->reload())
+ return std::move(EC);
+ Symbols = std::move(TempSymbols);
+ }
+ return *Symbols;
+}
+
+Expected<NameHashTable &> PDBFile::getStringTable() {
+ if (!StringTable || !StringTableStream) {
+ auto IS = getPDBInfoStream();
+ if (!IS)
+ return IS.takeError();
+
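+    // The string table lives in the stream registered under the name "/names"
+    // in the info stream's named stream map.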
+ uint32_t NameStreamIndex = IS->getNamedStreamIndex("/names");
+
+ if (NameStreamIndex == 0)
+ return make_error<RawError>(raw_error_code::no_stream);
+ if (NameStreamIndex >= getNumStreams())
+ return make_error<RawError>(raw_error_code::no_stream);
+
+ auto NS = MappedBlockStream::createIndexedStream(NameStreamIndex, *this);
+ if (!NS)
+ return NS.takeError();
+
+ StreamReader Reader(**NS);
+ auto N = llvm::make_unique<NameHashTable>();
+ if (auto EC = N->load(Reader))
+ return std::move(EC);
+ StringTable = std::move(N);
+ StringTableStream = std::move(*NS);
+ }
+ return *StringTable;
+}
+
+Error PDBFile::setSuperBlock(const msf::SuperBlock *Block) {
+ if (auto EC = msf::validateSuperBlock(*Block))
+ return EC;
+
+  if (Buffer->getLength() % Block->BlockSize != 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "File size is not a multiple of block size");
+
+ SB = Block;
+ return Error::success();
+}
+
+Error PDBFile::commit() {
+ StreamWriter Writer(*Buffer);
+
+ if (auto EC = Writer.writeObject(*SB))
+ return EC;
+ Writer.setOffset(getBlockMapOffset());
+ if (auto EC = Writer.writeArray(DirectoryBlocks))
+ return EC;
+
+ auto DS = MappedBlockStream::createDirectoryStream(*this);
+ if (!DS)
+ return DS.takeError();
+ auto DirStream = std::move(*DS);
+ StreamWriter DW(*DirStream);
+ if (auto EC = DW.writeInteger(this->getNumStreams()))
+ return EC;
+
+ if (auto EC = DW.writeArray(StreamSizes))
+ return EC;
+
+ for (const auto &Blocks : StreamMap) {
+ if (auto EC = DW.writeArray(Blocks))
+ return EC;
+ }
+
+ if (Info) {
+ if (auto EC = Info->commit())
+ return EC;
+ }
+
+ if (Dbi) {
+ if (auto EC = Dbi->commit())
+ return EC;
+ }
+
+ if (Symbols) {
+ if (auto EC = Symbols->commit())
+ return EC;
+ }
+
+ if (Publics) {
+ if (auto EC = Publics->commit())
+ return EC;
+ }
+
+ if (Tpi) {
+ if (auto EC = Tpi->commit())
+ return EC;
+ }
+
+ if (Ipi) {
+ if (auto EC = Ipi->commit())
+ return EC;
+ }
+
+ return Buffer->commit();
+}
diff --git a/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp
new file mode 100644
index 000000000000..9063fd62d295
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp
@@ -0,0 +1,102 @@
+//===- PDBFileBuilder.cpp - PDB File Creation -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/PDBFileBuilder.h"
+
+#include "llvm/ADT/BitVector.h"
+
+#include "llvm/DebugInfo/CodeView/StreamInterface.h"
+#include "llvm/DebugInfo/CodeView/StreamWriter.h"
+#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+PDBFileBuilder::PDBFileBuilder(
+ std::unique_ptr<codeview::StreamInterface> FileBuffer)
+ : File(llvm::make_unique<PDBFile>(std::move(FileBuffer))) {}
+
+Error PDBFileBuilder::initialize(const msf::SuperBlock &Super) {
+ auto ExpectedMsf =
+ MsfBuilder::create(File->Allocator, Super.BlockSize, Super.NumBlocks);
+ if (!ExpectedMsf)
+ return ExpectedMsf.takeError();
+
+ auto &MsfResult = *ExpectedMsf;
+ if (auto EC = MsfResult.setBlockMapAddr(Super.BlockMapAddr))
+ return EC;
+ Msf = llvm::make_unique<MsfBuilder>(std::move(MsfResult));
+ Msf->setFreePageMap(Super.FreeBlockMapBlock);
+ Msf->setUnknown1(Super.Unknown1);
+ return Error::success();
+}
+
+MsfBuilder &PDBFileBuilder::getMsfBuilder() { return *Msf; }
+
+InfoStreamBuilder &PDBFileBuilder::getInfoBuilder() {
+ if (!Info)
+ Info = llvm::make_unique<InfoStreamBuilder>();
+ return *Info;
+}
+
+DbiStreamBuilder &PDBFileBuilder::getDbiBuilder() {
+ if (!Dbi)
+ Dbi = llvm::make_unique<DbiStreamBuilder>();
+ return *Dbi;
+}
+
+Expected<std::unique_ptr<PDBFile>> PDBFileBuilder::build() {
+ if (Info) {
+ uint32_t Length = Info->calculateSerializedLength();
+ if (auto EC = Msf->setStreamSize(StreamPDB, Length))
+ return std::move(EC);
+ }
+ if (Dbi) {
+ uint32_t Length = Dbi->calculateSerializedLength();
+ if (auto EC = Msf->setStreamSize(StreamDBI, Length))
+ return std::move(EC);
+ }
+
+ auto ExpectedLayout = Msf->build();
+ if (!ExpectedLayout)
+ return ExpectedLayout.takeError();
+
+ const msf::Layout &L = *ExpectedLayout;
+ File->StreamMap = L.StreamMap;
+ File->StreamSizes = L.StreamSizes;
+ File->DirectoryBlocks = L.DirectoryBlocks;
+ File->SB = L.SB;
+
+ if (Info) {
+ auto ExpectedInfo = Info->build(*File);
+ if (!ExpectedInfo)
+ return ExpectedInfo.takeError();
+ File->Info = std::move(*ExpectedInfo);
+ }
+
+ if (Dbi) {
+ auto ExpectedDbi = Dbi->build(*File);
+ if (!ExpectedDbi)
+ return ExpectedDbi.takeError();
+ File->Dbi = std::move(*ExpectedDbi);
+ }
+
+ if (File->Info && File->Dbi && File->Info->getAge() != File->Dbi->getAge())
+ return llvm::make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "PDB Stream Age doesn't match Dbi Stream Age!");
+
+ return std::move(File);
+}
diff --git a/lib/DebugInfo/PDB/Raw/PublicsStream.cpp b/lib/DebugInfo/PDB/Raw/PublicsStream.cpp
new file mode 100644
index 000000000000..af3d2d026b48
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/PublicsStream.cpp
@@ -0,0 +1,173 @@
+//===- PublicsStream.cpp - PDB Public Symbol Stream -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The data structures defined in this file are based on the reference
+// implementation which is available at
+// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.h
+//
+// When reading the reference source code, you will find the following
+// information useful.
+//
+// - ppdb1->m_fMinimalDbgInfo seems to always be true.
+// - The SMALLBUCKETS macro is defined.
+//
+// The reference implementation doesn't compile, so this was written purely by
+// reading its code. It's not guaranteed to be correct.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+using namespace llvm::support;
+using namespace llvm::pdb;
+
+
+static const unsigned IPHR_HASH = 4096;
+
+// This is PSGSIHDR struct defined in
+// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.h
+struct PublicsStream::HeaderInfo {
+ ulittle32_t SymHash;
+ ulittle32_t AddrMap;
+ ulittle32_t NumThunks;
+ ulittle32_t SizeOfThunk;
+ ulittle16_t ISectThunkTable;
+ char Padding[2];
+ ulittle32_t OffThunkTable;
+ ulittle32_t NumSections;
+};
+
+// This is GSIHashHdr.
+struct PublicsStream::GSIHashHeader {
+ enum : unsigned {
+ HdrSignature = ~0U,
+ HdrVersion = 0xeffe0000 + 19990810,
+ };
+ ulittle32_t VerSignature;
+ ulittle32_t VerHdr;
+ ulittle32_t HrSize;
+ ulittle32_t NumBuckets;
+};
+
+PublicsStream::PublicsStream(PDBFile &File,
+ std::unique_ptr<MappedBlockStream> Stream)
+ : Pdb(File), Stream(std::move(Stream)) {}
+
+PublicsStream::~PublicsStream() {}
+
+uint32_t PublicsStream::getSymHash() const { return Header->SymHash; }
+uint32_t PublicsStream::getAddrMap() const { return Header->AddrMap; }
+
+// The publics stream contains fixed-size headers and a serialized hash table.
+// This implementation is not complete yet. It reads to the end of the stream
+// so that we can at least verify that the stream is not corrupted. However, we
+// skip over the hash table, which we believe contains information about public
+// symbols.
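+//
+// The layout walked through below is a PSGSIHDR header, a GSIHashHdr header,
+// an array of hash records, a fixed-size bucket bitmap, the hash buckets
+// themselves, an address map, a thunk map, and a section offset map.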
+Error PublicsStream::reload() {
+ codeview::StreamReader Reader(*Stream);
+
+ // Check stream size.
+ if (Reader.bytesRemaining() < sizeof(HeaderInfo) + sizeof(GSIHashHeader))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Publics Stream does not contain a header.");
+
+ // Read PSGSIHDR and GSIHashHdr structs.
+ if (Reader.readObject(Header))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Publics Stream does not contain a header.");
+
+ if (Reader.readObject(HashHdr))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Publics Stream does not contain a header.");
+
+ // An array of HashRecord follows. Read them.
+ if (HashHdr->HrSize % sizeof(PSHashRecord))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid HR array size.");
+ uint32_t NumHashRecords = HashHdr->HrSize / sizeof(PSHashRecord);
+ if (auto EC = Reader.readArray(HashRecords, NumHashRecords))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read an HR array"));
+
+ // A bitmap of a fixed length follows.
+ size_t BitmapSizeInBits = alignTo(IPHR_HASH + 1, 32);
+ uint32_t NumBitmapEntries = BitmapSizeInBits / 8;
+ if (auto EC = Reader.readBytes(Bitmap, NumBitmapEntries))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read a bitmap."));
+ for (uint8_t B : Bitmap)
+ NumBuckets += countPopulation(B);
+
+  // We don't yet completely understand the following data structures, but we
+  // at least know their types and sizes. Here we read the stream to the end
+  // so that we can at least detect corrupted streams.
+
+ // Hash buckets follow.
+ if (auto EC = Reader.readArray(HashBuckets, NumBuckets))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Hash buckets corrupted."));
+
+ // Something called "address map" follows.
+ uint32_t NumAddressMapEntries = Header->AddrMap / sizeof(uint32_t);
+ if (auto EC = Reader.readArray(AddressMap, NumAddressMapEntries))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read an address map."));
+
+ // Something called "thunk map" follows.
+ if (auto EC = Reader.readArray(ThunkMap, Header->NumThunks))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read a thunk map."));
+
+ // Something called "section map" follows.
+ if (auto EC = Reader.readArray(SectionOffsets, Header->NumSections))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read a section map."));
+
+ if (Reader.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Corrupted publics stream.");
+ return Error::success();
+}
+
+iterator_range<codeview::CVSymbolArray::Iterator>
+PublicsStream::getSymbols(bool *HadError) const {
+ auto SymbolS = Pdb.getPDBSymbolStream();
+  if (!SymbolS) {
+    // Consume the error and return an empty symbol range.
+    consumeError(SymbolS.takeError());
+ codeview::CVSymbolArray::Iterator Iter;
+ return llvm::make_range(Iter, Iter);
+ }
+ SymbolStream &SS = SymbolS.get();
+
+ return SS.getSymbols(HadError);
+}
+
+Error PublicsStream::commit() { return Error::success(); }
diff --git a/lib/DebugInfo/PDB/Raw/RawError.cpp b/lib/DebugInfo/PDB/Raw/RawError.cpp
new file mode 100644
index 000000000000..eb169f70e11c
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/RawError.cpp
@@ -0,0 +1,67 @@
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class RawErrorCategory : public std::error_category {
+public:
+ const char *name() const LLVM_NOEXCEPT override { return "llvm.pdb.raw"; }
+
+ std::string message(int Condition) const override {
+ switch (static_cast<raw_error_code>(Condition)) {
+ case raw_error_code::unspecified:
+ return "An unknown error has occurred.";
+ case raw_error_code::feature_unsupported:
+ return "The feature is unsupported by the implementation.";
+ case raw_error_code::corrupt_file:
+ return "The PDB file is corrupt.";
+ case raw_error_code::insufficient_buffer:
+ return "The buffer is not large enough to read the requested number of "
+ "bytes.";
+ case raw_error_code::no_stream:
+ return "The specified stream could not be loaded.";
+ case raw_error_code::index_out_of_bounds:
+ return "The specified item does not exist in the array.";
+ case raw_error_code::invalid_block_address:
+ return "The specified block address is not valid.";
+ case raw_error_code::not_writable:
+ return "The PDB does not support writing.";
+ case raw_error_code::invalid_tpi_hash:
+ return "The Type record has an invalid hash value.";
+ }
+ llvm_unreachable("Unrecognized raw_error_code");
+ }
+};
+} // end anonymous namespace
+
+static ManagedStatic<RawErrorCategory> Category;
+
+char RawError::ID = 0;
+
+RawError::RawError(raw_error_code C) : RawError(C, "") {}
+
+RawError::RawError(const std::string &Context)
+ : RawError(raw_error_code::unspecified, Context) {}
+
+RawError::RawError(raw_error_code C, const std::string &Context) : Code(C) {
+ ErrMsg = "Native PDB Error: ";
+ std::error_code EC = convertToErrorCode();
+ if (Code != raw_error_code::unspecified)
+ ErrMsg += EC.message() + " ";
+ if (!Context.empty())
+ ErrMsg += Context;
+}
+
+void RawError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+const std::string &RawError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code RawError::convertToErrorCode() const {
+ return std::error_code(static_cast<int>(Code), *Category);
+}
diff --git a/lib/DebugInfo/PDB/Raw/RawSession.cpp b/lib/DebugInfo/PDB/Raw/RawSession.cpp
new file mode 100644
index 000000000000..455d33140dd4
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/RawSession.cpp
@@ -0,0 +1,146 @@
+//===- RawSession.cpp - Raw implementation of IPDBSession -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
+
+#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/DebugInfo/CodeView/StreamInterface.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+// We need a class which behaves like an immutable ByteStream, but whose data
+// is backed by an llvm::MemoryBuffer. It also needs to own the underlying
+// MemoryBuffer, so this simple adapter is a good way to achieve that.
+class InputByteStream : public codeview::ByteStream<false> {
+public:
+ explicit InputByteStream(std::unique_ptr<MemoryBuffer> Buffer)
+ : ByteStream(ArrayRef<uint8_t>(Buffer->getBuffer().bytes_begin(),
+ Buffer->getBuffer().bytes_end())),
+ MemBuffer(std::move(Buffer)) {}
+
+ std::unique_ptr<MemoryBuffer> MemBuffer;
+};
+}
+
+RawSession::RawSession(std::unique_ptr<PDBFile> PdbFile)
+ : Pdb(std::move(PdbFile)) {}
+
+RawSession::~RawSession() {}
+
+Error RawSession::createFromPdb(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
+ MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
+ /*RequiresNullTerminator=*/false);
+ if (!ErrorOrBuffer)
+ return llvm::make_error<GenericError>(generic_error_code::invalid_path);
+
+ std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
+ auto Stream = llvm::make_unique<InputByteStream>(std::move(Buffer));
+
+ std::unique_ptr<PDBFile> File(new PDBFile(std::move(Stream)));
+ if (auto EC = File->parseFileHeaders())
+ return EC;
+ if (auto EC = File->parseStreamData())
+ return EC;
+
+ Session.reset(new RawSession(std::move(File)));
+
+ return Error::success();
+}
+
+Error RawSession::createFromExe(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+ return llvm::make_error<RawError>(raw_error_code::feature_unsupported);
+}
+
+uint64_t RawSession::getLoadAddress() const { return 0; }
+
+void RawSession::setLoadAddress(uint64_t Address) {}
+
+std::unique_ptr<PDBSymbolExe> RawSession::getGlobalScope() const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbol> RawSession::getSymbolById(uint32_t SymbolId) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbol>
+RawSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+RawSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
+ const IPDBSourceFile &File) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+RawSession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles>
+RawSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
+ llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+RawSession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
+ llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+RawSession::findCompilandsForSourceFile(llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbolCompiland>
+RawSession::findOneCompilandForSourceFile(llvm::StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> RawSession::getAllSourceFiles() const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> RawSession::getSourceFilesForCompiland(
+ const PDBSymbolCompiland &Compiland) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+RawSession::getSourceFileById(uint32_t FileId) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumDataStreams> RawSession::getDebugStreams() const {
+ return nullptr;
+}
diff --git a/lib/DebugInfo/PDB/Raw/SymbolStream.cpp b/lib/DebugInfo/PDB/Raw/SymbolStream.cpp
new file mode 100644
index 000000000000..41b2a64bfb14
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/SymbolStream.cpp
@@ -0,0 +1,46 @@
+//===- SymbolStream.cpp - PDB Symbol Stream Access ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support;
+using namespace llvm::pdb;
+
+SymbolStream::SymbolStream(std::unique_ptr<MappedBlockStream> Stream)
+ : Stream(std::move(Stream)) {}
+
+SymbolStream::~SymbolStream() {}
+
+Error SymbolStream::reload() {
+ codeview::StreamReader Reader(*Stream);
+
+ if (auto EC = Reader.readArray(SymbolRecords, Stream->getLength()))
+ return EC;
+
+ return Error::success();
+}
+
+iterator_range<codeview::CVSymbolArray::Iterator>
+SymbolStream::getSymbols(bool *HadError) const {
+ return llvm::make_range(SymbolRecords.begin(HadError), SymbolRecords.end());
+}
+
+Error SymbolStream::commit() { return Error::success(); }
diff --git a/lib/DebugInfo/PDB/Raw/TpiStream.cpp b/lib/DebugInfo/PDB/Raw/TpiStream.cpp
new file mode 100644
index 000000000000..5617e57ccf67
--- /dev/null
+++ b/lib/DebugInfo/PDB/Raw/TpiStream.cpp
@@ -0,0 +1,273 @@
+//===- TpiStream.cpp - PDB Type Info (TPI) Stream 2 Access ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Raw/Hash.h"
+#include "llvm/DebugInfo/PDB/Raw/IndexedStreamData.h"
+#include "llvm/DebugInfo/PDB/Raw/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::support;
+using namespace llvm::pdb;
+
+namespace {
+const uint32_t MinHashBuckets = 0x1000;
+const uint32_t MaxHashBuckets = 0x40000;
+}
+
+// This corresponds to `HDR` in PDB/dbi/tpi.h.
+struct TpiStream::HeaderInfo {
+ struct EmbeddedBuf {
+ little32_t Off;
+ ulittle32_t Length;
+ };
+
+ ulittle32_t Version;
+ ulittle32_t HeaderSize;
+ ulittle32_t TypeIndexBegin;
+ ulittle32_t TypeIndexEnd;
+ ulittle32_t TypeRecordBytes;
+
+ // The following members correspond to `TpiHash` in PDB/dbi/tpi.h.
+ ulittle16_t HashStreamIndex;
+ ulittle16_t HashAuxStreamIndex;
+ ulittle32_t HashKeySize;
+ ulittle32_t NumHashBuckets;
+
+ EmbeddedBuf HashValueBuffer;
+ EmbeddedBuf IndexOffsetBuffer;
+ EmbeddedBuf HashAdjBuffer;
+};
+
+TpiStream::TpiStream(const PDBFile &File,
+ std::unique_ptr<MappedBlockStream> Stream)
+ : Pdb(File), Stream(std::move(Stream)) {}
+
+TpiStream::~TpiStream() {}
+
+// Corresponds to `fUDTAnon`.
+template <typename T> static bool isAnonymous(T &Rec) {
+ StringRef Name = Rec.getName();
+ return Name == "<unnamed-tag>" || Name == "__unnamed" ||
+ Name.endswith("::<unnamed-tag>") || Name.endswith("::__unnamed");
+}
+
+// Computes a hash for a given TPI record.
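+// Records that are not forward references, not scoped, and not anonymous are
+// hashed by name; scoped records with a unique name (that are neither forward
+// references nor anonymous) are hashed by that unique name; everything else
+// falls back to hashing the raw record bytes.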
+template <typename T>
+static uint32_t getTpiHash(T &Rec, const CVRecord<TypeLeafKind> &RawRec) {
+ auto Opts = static_cast<uint16_t>(Rec.getOptions());
+
+ bool ForwardRef =
+ Opts & static_cast<uint16_t>(ClassOptions::ForwardReference);
+ bool Scoped = Opts & static_cast<uint16_t>(ClassOptions::Scoped);
+ bool UniqueName = Opts & static_cast<uint16_t>(ClassOptions::HasUniqueName);
+ bool IsAnon = UniqueName && isAnonymous(Rec);
+
+ if (!ForwardRef && !Scoped && !IsAnon)
+ return hashStringV1(Rec.getName());
+ if (!ForwardRef && UniqueName && !IsAnon)
+ return hashStringV1(Rec.getUniqueName());
+ return hashBufferV8(RawRec.RawData);
+}
+
+namespace {
+class TpiHashVerifier : public TypeVisitorCallbacks {
+public:
+ TpiHashVerifier(FixedStreamArray<support::ulittle32_t> &HashValues,
+ uint32_t NumHashBuckets)
+ : HashValues(HashValues), NumHashBuckets(NumHashBuckets) {}
+
+ Error visitUdtSourceLine(UdtSourceLineRecord &Rec) override {
+ return verifySourceLine(Rec);
+ }
+
+ Error visitUdtModSourceLine(UdtModSourceLineRecord &Rec) override {
+ return verifySourceLine(Rec);
+ }
+
+ Error visitClass(ClassRecord &Rec) override { return verify(Rec); }
+ Error visitEnum(EnumRecord &Rec) override { return verify(Rec); }
+ Error visitUnion(UnionRecord &Rec) override { return verify(Rec); }
+
+ Error visitTypeBegin(const CVRecord<TypeLeafKind> &Rec) override {
+ ++Index;
+ RawRecord = &Rec;
+ return Error::success();
+ }
+
+private:
+ template <typename T> Error verify(T &Rec) {
+ uint32_t Hash = getTpiHash(Rec, *RawRecord);
+ if (Hash % NumHashBuckets != HashValues[Index])
+ return errorInvalidHash();
+ return Error::success();
+ }
+
+ template <typename T> Error verifySourceLine(T &Rec) {
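+    // SRC_LINE records are hashed over the little-endian bytes of the UDT's
+    // type index.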
+ char Buf[4];
+ support::endian::write32le(Buf, Rec.getUDT().getIndex());
+ uint32_t Hash = hashStringV1(StringRef(Buf, 4));
+ if (Hash % NumHashBuckets != HashValues[Index])
+ return errorInvalidHash();
+ return Error::success();
+ }
+
+ Error errorInvalidHash() {
+ return make_error<RawError>(
+ raw_error_code::invalid_tpi_hash,
+ "Type index is 0x" + utohexstr(TypeIndex::FirstNonSimpleIndex + Index));
+ }
+
+ FixedStreamArray<support::ulittle32_t> HashValues;
+ const CVRecord<TypeLeafKind> *RawRecord;
+ uint32_t NumHashBuckets;
+ uint32_t Index = -1;
+};
+}
+
+// Verifies that type records match their hash values. Currently we verify
+// class, enum, and union records as well as SRC_LINE records.
+Error TpiStream::verifyHashValues() {
+ TpiHashVerifier Verifier(HashValues, Header->NumHashBuckets);
+ CVTypeVisitor Visitor(Verifier);
+ return Visitor.visitTypeStream(TypeRecords);
+}
+
+Error TpiStream::reload() {
+ StreamReader Reader(*Stream);
+
+ if (Reader.bytesRemaining() < sizeof(HeaderInfo))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "TPI Stream does not contain a header.");
+
+ if (Reader.readObject(Header))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "TPI Stream does not contain a header.");
+
+ if (Header->Version != PdbTpiV80)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported TPI Version.");
+
+ if (Header->HeaderSize != sizeof(HeaderInfo))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Corrupt TPI Header size.");
+
+ if (Header->HashKeySize != sizeof(ulittle32_t))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "TPI Stream expected 4 byte hash key size.");
+
+ if (Header->NumHashBuckets < MinHashBuckets ||
+ Header->NumHashBuckets > MaxHashBuckets)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "TPI Stream Invalid number of hash buckets.");
+
+  // The type records themselves come from this stream.
+ if (auto EC = Reader.readArray(TypeRecords, Header->TypeRecordBytes))
+ return EC;
+
+  // Hash indices, hash values, etc. come from the hash stream.
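+  // The hash stream embeds three buffers, each located by an (offset, length)
+  // pair in the header: one hash value per type record, the type index
+  // offsets, and the hash adjustments.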
+ if (Header->HashStreamIndex >= Pdb.getNumStreams())
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid TPI hash stream index.");
+
+ auto HS =
+ MappedBlockStream::createIndexedStream(Header->HashStreamIndex, Pdb);
+ if (!HS)
+ return HS.takeError();
+ StreamReader HSR(**HS);
+
+ uint32_t NumHashValues = Header->HashValueBuffer.Length / sizeof(ulittle32_t);
+ if (NumHashValues != NumTypeRecords())
+ return make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "TPI hash count does not match with the number of type records.");
+ HSR.setOffset(Header->HashValueBuffer.Off);
+ if (auto EC = HSR.readArray(HashValues, NumHashValues))
+ return EC;
+
+ HSR.setOffset(Header->IndexOffsetBuffer.Off);
+ uint32_t NumTypeIndexOffsets =
+ Header->IndexOffsetBuffer.Length / sizeof(TypeIndexOffset);
+ if (auto EC = HSR.readArray(TypeIndexOffsets, NumTypeIndexOffsets))
+ return EC;
+
+ HSR.setOffset(Header->HashAdjBuffer.Off);
+ uint32_t NumHashAdjustments =
+ Header->HashAdjBuffer.Length / sizeof(TypeIndexOffset);
+ if (auto EC = HSR.readArray(HashAdjustments, NumHashAdjustments))
+ return EC;
+
+ HashStream = std::move(*HS);
+
+  // The TPI hash values form an array parallel to the type records.
+  // Verify that the hash values match the type records.
+ if (auto EC = verifyHashValues())
+ return EC;
+
+ return Error::success();
+}
+
+PdbRaw_TpiVer TpiStream::getTpiVersion() const {
+ uint32_t Value = Header->Version;
+ return static_cast<PdbRaw_TpiVer>(Value);
+}
+
+uint32_t TpiStream::TypeIndexBegin() const { return Header->TypeIndexBegin; }
+
+uint32_t TpiStream::TypeIndexEnd() const { return Header->TypeIndexEnd; }
+
+uint32_t TpiStream::NumTypeRecords() const {
+ return TypeIndexEnd() - TypeIndexBegin();
+}
+
+uint16_t TpiStream::getTypeHashStreamIndex() const {
+ return Header->HashStreamIndex;
+}
+
+uint16_t TpiStream::getTypeHashStreamAuxIndex() const {
+ return Header->HashAuxStreamIndex;
+}
+
+uint32_t TpiStream::NumHashBuckets() const { return Header->NumHashBuckets; }
+uint32_t TpiStream::getHashKeySize() const { return Header->HashKeySize; }
+
+FixedStreamArray<support::ulittle32_t>
+TpiStream::getHashValues() const {
+ return HashValues;
+}
+
+FixedStreamArray<TypeIndexOffset>
+TpiStream::getTypeIndexOffsets() const {
+ return TypeIndexOffsets;
+}
+
+FixedStreamArray<TypeIndexOffset>
+TpiStream::getHashAdjustments() const {
+ return HashAdjustments;
+}
+
+iterator_range<CVTypeArray::Iterator>
+TpiStream::types(bool *HadError) const {
+ return llvm::make_range(TypeRecords.begin(HadError), TypeRecords.end());
+}
+
+Error TpiStream::commit() { return Error::success(); }
diff --git a/lib/DebugInfo/Symbolize/DIPrinter.cpp b/lib/DebugInfo/Symbolize/DIPrinter.cpp
index a9dee7abeed1..be5c603a38ef 100644
--- a/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -13,9 +13,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/Symbolize/DIPrinter.h"
-
+#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
namespace llvm {
namespace symbolize {
@@ -26,7 +36,7 @@ static const char kDILineInfoBadString[] = "<invalid>";
static const char kBadString[] = "??";
// Prints source code around in the FileName the Line.
-void DIPrinter::printContext(std::string FileName, int64_t Line) {
+void DIPrinter::printContext(const std::string &FileName, int64_t Line) {
if (PrintSourceContext <= 0)
return;
@@ -61,7 +71,7 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
if (FunctionName == kDILineInfoBadString)
FunctionName = kBadString;
- StringRef Delimiter = (PrintPretty == true) ? " at " : "\n";
+ StringRef Delimiter = PrintPretty ? " at " : "\n";
StringRef Prefix = (PrintPretty && Inlined) ? " (inlined by) " : "";
OS << Prefix << FunctionName << Delimiter;
}
@@ -97,5 +107,5 @@ DIPrinter &DIPrinter::operator<<(const DIGlobal &Global) {
return *this;
}
-}
-}
+} // end namespace symbolize
+} // end namespace llvm
diff --git a/lib/DebugInfo/Symbolize/Makefile b/lib/DebugInfo/Symbolize/Makefile
deleted file mode 100644
index 17aac9396585..000000000000
--- a/lib/DebugInfo/Symbolize/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/DebugInfo/Symbolize/Makefile --------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMSymbolize
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index e31462459844..f6940080089f 100644
--- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "SymbolizableObjectFile.h"
+#include "llvm/Object/COFF.h"
#include "llvm/Object/SymbolSize.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
@@ -119,12 +120,15 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
uint64_t SymbolSize,
DataExtractor *OpdExtractor,
uint64_t OpdAddress) {
- SymbolRef::Type SymbolType = Symbol.getType();
+ Expected<SymbolRef::Type> SymbolTypeOrErr = Symbol.getType();
+ if (!SymbolTypeOrErr)
+ return errorToErrorCode(SymbolTypeOrErr.takeError());
+ SymbolRef::Type SymbolType = *SymbolTypeOrErr;
if (SymbolType != SymbolRef::ST_Function && SymbolType != SymbolRef::ST_Data)
return std::error_code();
- ErrorOr<uint64_t> SymbolAddressOrErr = Symbol.getAddress();
- if (auto EC = SymbolAddressOrErr.getError())
- return EC;
+ Expected<uint64_t> SymbolAddressOrErr = Symbol.getAddress();
+ if (!SymbolAddressOrErr)
+ return errorToErrorCode(SymbolAddressOrErr.takeError());
uint64_t SymbolAddress = *SymbolAddressOrErr;
if (OpdExtractor) {
// For big-endian PowerPC64 ELF, symbols in the .opd section refer to
@@ -138,9 +142,9 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
OpdExtractor->isValidOffsetForAddress(OpdOffset32))
SymbolAddress = OpdExtractor->getAddress(&OpdOffset32);
}
- ErrorOr<StringRef> SymbolNameOrErr = Symbol.getName();
- if (auto EC = SymbolNameOrErr.getError())
- return EC;
+ Expected<StringRef> SymbolNameOrErr = Symbol.getName();
+ if (!SymbolNameOrErr)
+ return errorToErrorCode(SymbolNameOrErr.takeError());
StringRef SymbolName = *SymbolNameOrErr;
// Mach-O symbol table names have leading underscore, skip it.
if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_')
diff --git a/lib/DebugInfo/Symbolize/Symbolize.cpp b/lib/DebugInfo/Symbolize/Symbolize.cpp
index 3da1963bb791..adbe0cb69edb 100644
--- a/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -20,6 +20,7 @@
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/PDB/PDB.h"
#include "llvm/DebugInfo/PDB/PDBContext.h"
+#include "llvm/Object/COFF.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOUniversal.h"
@@ -31,7 +32,10 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include <stdlib.h>
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
#if defined(_MSC_VER)
#include <Windows.h>
@@ -47,12 +51,18 @@
namespace llvm {
namespace symbolize {
-ErrorOr<DILineInfo> LLVMSymbolizer::symbolizeCode(const std::string &ModuleName,
+Expected<DILineInfo> LLVMSymbolizer::symbolizeCode(const std::string &ModuleName,
uint64_t ModuleOffset) {
- auto InfoOrErr = getOrCreateModuleInfo(ModuleName);
- if (auto EC = InfoOrErr.getError())
- return EC;
- SymbolizableModule *Info = InfoOrErr.get();
+ SymbolizableModule *Info;
+ if (auto InfoOrErr = getOrCreateModuleInfo(ModuleName))
+ Info = InfoOrErr.get();
+ else
+ return InfoOrErr.takeError();
+
+ // A null module means an error has already been reported. Return an empty
+ // result.
+ if (!Info)
+ return DILineInfo();
// If the user is giving us relative addresses, add the preferred base of the
// object to the offset before we do the query. It's what DIContext expects.
@@ -66,13 +76,19 @@ ErrorOr<DILineInfo> LLVMSymbolizer::symbolizeCode(const std::string &ModuleName,
return LineInfo;
}
-ErrorOr<DIInliningInfo>
+Expected<DIInliningInfo>
LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName,
uint64_t ModuleOffset) {
- auto InfoOrErr = getOrCreateModuleInfo(ModuleName);
- if (auto EC = InfoOrErr.getError())
- return EC;
- SymbolizableModule *Info = InfoOrErr.get();
+ SymbolizableModule *Info;
+ if (auto InfoOrErr = getOrCreateModuleInfo(ModuleName))
+ Info = InfoOrErr.get();
+ else
+ return InfoOrErr.takeError();
+
+ // A null module means an error has already been reported. Return an empty
+ // result.
+ if (!Info)
+ return DIInliningInfo();
// If the user is giving us relative addresses, add the preferred base of the
// object to the offset before we do the query. It's what DIContext expects.
@@ -90,12 +106,18 @@ LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName,
return InlinedContext;
}
-ErrorOr<DIGlobal> LLVMSymbolizer::symbolizeData(const std::string &ModuleName,
- uint64_t ModuleOffset) {
- auto InfoOrErr = getOrCreateModuleInfo(ModuleName);
- if (auto EC = InfoOrErr.getError())
- return EC;
- SymbolizableModule *Info = InfoOrErr.get();
+Expected<DIGlobal> LLVMSymbolizer::symbolizeData(const std::string &ModuleName,
+ uint64_t ModuleOffset) {
+ SymbolizableModule *Info;
+ if (auto InfoOrErr = getOrCreateModuleInfo(ModuleName))
+ Info = InfoOrErr.get();
+ else
+ return InfoOrErr.takeError();
+
+ // A null module means an error has already been reported. Return an empty
+ // result.
+ if (!Info)
+ return DIGlobal();
// If the user is giving us relative addresses, add the preferred base of
// the object to the offset before we do the query. It's what DIContext
@@ -116,11 +138,12 @@ void LLVMSymbolizer::flush() {
Modules.clear();
}
+namespace {
+
// For Path="/path/to/foo" and Basename="foo" assume that debug info is in
// /path/to/foo.dSYM/Contents/Resources/DWARF/foo.
// For Path="/path/to/bar.dSYM" and Basename="foo" assume that debug info is in
// /path/to/bar.dSYM/Contents/Resources/DWARF/foo.
-static
std::string getDarwinDWARFResourceForPath(
const std::string &Path, const std::string &Basename) {
SmallString<16> ResourceName = StringRef(Path);
@@ -132,7 +155,7 @@ std::string getDarwinDWARFResourceForPath(
return ResourceName.str();
}
-static bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
+bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Path);
if (!MB)
@@ -140,9 +163,9 @@ static bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
return !zlib::isAvailable() || CRCHash == zlib::crc32(MB.get()->getBuffer());
}
-static bool findDebugBinary(const std::string &OrigPath,
- const std::string &DebuglinkName, uint32_t CRCHash,
- std::string &Result) {
+bool findDebugBinary(const std::string &OrigPath,
+ const std::string &DebuglinkName, uint32_t CRCHash,
+ std::string &Result) {
std::string OrigRealPath = OrigPath;
#if defined(HAVE_REALPATH)
if (char *RP = realpath(OrigPath.c_str(), nullptr)) {
@@ -177,8 +200,8 @@ static bool findDebugBinary(const std::string &OrigPath,
return false;
}
-static bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName,
- uint32_t &CRCHash) {
+bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName,
+ uint32_t &CRCHash) {
if (!Obj)
return false;
for (const SectionRef &Section : Obj->sections()) {
@@ -205,7 +228,6 @@ static bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugNam
return false;
}
-static
bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj,
const MachOObjectFile *Obj) {
ArrayRef<uint8_t> dbg_uuid = DbgObj->getUuid();
@@ -215,6 +237,8 @@ bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj,
return !memcmp(dbg_uuid.data(), bin_uuid.data(), dbg_uuid.size());
}
+} // end anonymous namespace
+
ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath,
const MachOObjectFile *MachExeObj, const std::string &ArchName) {
// On Darwin we may find DWARF in separate object file in
@@ -227,9 +251,14 @@ ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath,
}
for (const auto &Path : DsymPaths) {
auto DbgObjOrErr = getOrCreateObject(Path, ArchName);
- if (!DbgObjOrErr)
+ if (!DbgObjOrErr) {
+      // Ignore errors; the file might not exist.
+ consumeError(DbgObjOrErr.takeError());
continue;
+ }
ObjectFile *DbgObj = DbgObjOrErr.get();
+ if (!DbgObj)
+ continue;
const MachOObjectFile *MachDbgObj = dyn_cast<const MachOObjectFile>(DbgObj);
if (!MachDbgObj)
continue;
@@ -250,23 +279,27 @@ ObjectFile *LLVMSymbolizer::lookUpDebuglinkObject(const std::string &Path,
if (!findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath))
return nullptr;
auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName);
- if (!DbgObjOrErr)
+ if (!DbgObjOrErr) {
+    // Ignore errors; the file might not exist.
+ consumeError(DbgObjOrErr.takeError());
return nullptr;
+ }
return DbgObjOrErr.get();
}
-ErrorOr<LLVMSymbolizer::ObjectPair>
+Expected<LLVMSymbolizer::ObjectPair>
LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path,
const std::string &ArchName) {
const auto &I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName));
- if (I != ObjectPairForPathArch.end())
+ if (I != ObjectPairForPathArch.end()) {
return I->second;
+ }
auto ObjOrErr = getOrCreateObject(Path, ArchName);
- if (auto EC = ObjOrErr.getError()) {
- ObjectPairForPathArch.insert(
- std::make_pair(std::make_pair(Path, ArchName), EC));
- return EC;
+ if (!ObjOrErr) {
+ ObjectPairForPathArch.insert(std::make_pair(std::make_pair(Path, ArchName),
+ ObjectPair(nullptr, nullptr)));
+ return ObjOrErr.takeError();
}
ObjectFile *Obj = ObjOrErr.get();
@@ -285,40 +318,37 @@ LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path,
return Res;
}
-ErrorOr<ObjectFile *>
+Expected<ObjectFile *>
LLVMSymbolizer::getOrCreateObject(const std::string &Path,
const std::string &ArchName) {
const auto &I = BinaryForPath.find(Path);
Binary *Bin = nullptr;
if (I == BinaryForPath.end()) {
- ErrorOr<OwningBinary<Binary>> BinOrErr = createBinary(Path);
- if (auto EC = BinOrErr.getError()) {
- BinaryForPath.insert(std::make_pair(Path, EC));
- return EC;
+ Expected<OwningBinary<Binary>> BinOrErr = createBinary(Path);
+ if (!BinOrErr) {
+ BinaryForPath.insert(std::make_pair(Path, OwningBinary<Binary>()));
+ return BinOrErr.takeError();
}
Bin = BinOrErr->getBinary();
BinaryForPath.insert(std::make_pair(Path, std::move(BinOrErr.get())));
- } else if (auto EC = I->second.getError()) {
- return EC;
} else {
- Bin = I->second->getBinary();
+ Bin = I->second.getBinary();
}
- assert(Bin != nullptr);
+ if (!Bin)
+ return static_cast<ObjectFile *>(nullptr);
- if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin)) {
+ if (MachOUniversalBinary *UB = dyn_cast_or_null<MachOUniversalBinary>(Bin)) {
const auto &I = ObjectForUBPathAndArch.find(std::make_pair(Path, ArchName));
if (I != ObjectForUBPathAndArch.end()) {
- if (auto EC = I->second.getError())
- return EC;
- return I->second->get();
+ return I->second.get();
}
- ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+ Expected<std::unique_ptr<ObjectFile>> ObjOrErr =
UB->getObjectForArch(ArchName);
- if (auto EC = ObjOrErr.getError()) {
- ObjectForUBPathAndArch.insert(
- std::make_pair(std::make_pair(Path, ArchName), EC));
- return EC;
+ if (!ObjOrErr) {
+ ObjectForUBPathAndArch.insert(std::make_pair(
+ std::make_pair(Path, ArchName), std::unique_ptr<ObjectFile>()));
+ return ObjOrErr.takeError();
}
ObjectFile *Res = ObjOrErr->get();
ObjectForUBPathAndArch.insert(std::make_pair(std::make_pair(Path, ArchName),
@@ -328,17 +358,14 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path,
if (Bin->isObject()) {
return cast<ObjectFile>(Bin);
}
- return object_error::arch_not_found;
+ return errorCodeToError(object_error::arch_not_found);
}
-ErrorOr<SymbolizableModule *>
+Expected<SymbolizableModule *>
LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
const auto &I = Modules.find(ModuleName);
if (I != Modules.end()) {
- auto &InfoOrErr = I->second;
- if (auto EC = InfoOrErr.getError())
- return EC;
- return InfoOrErr->get();
+ return I->second.get();
}
std::string BinaryName = ModuleName;
std::string ArchName = Opts.DefaultArch;
@@ -352,21 +379,30 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
}
}
auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName);
- if (auto EC = ObjectsOrErr.getError()) {
+ if (!ObjectsOrErr) {
// Failed to find valid object file.
- Modules.insert(std::make_pair(ModuleName, EC));
- return EC;
+ Modules.insert(
+ std::make_pair(ModuleName, std::unique_ptr<SymbolizableModule>()));
+ return ObjectsOrErr.takeError();
}
ObjectPair Objects = ObjectsOrErr.get();
std::unique_ptr<DIContext> Context;
+ // If this is a COFF object containing PDB info, use a PDBContext to
+ // symbolize. Otherwise, use DWARF.
if (auto CoffObject = dyn_cast<COFFObjectFile>(Objects.first)) {
- // If this is a COFF object, assume it contains PDB debug information. If
- // we don't find any we will fall back to the DWARF case.
- std::unique_ptr<IPDBSession> Session;
- PDB_ErrorCode Error = loadDataForEXE(PDB_ReaderType::DIA,
- Objects.first->getFileName(), Session);
- if (Error == PDB_ErrorCode::Success) {
+ const debug_pdb_info *PDBInfo;
+ StringRef PDBFileName;
+ auto EC = CoffObject->getDebugPDBInfo(PDBInfo, PDBFileName);
+ if (!EC && PDBInfo != nullptr) {
+ using namespace pdb;
+ std::unique_ptr<IPDBSession> Session;
+ if (auto Err = loadDataForEXE(PDB_ReaderType::DIA,
+ Objects.first->getFileName(), Session)) {
+ Modules.insert(
+ std::make_pair(ModuleName, std::unique_ptr<SymbolizableModule>()));
+ return std::move(Err);
+ }
Context.reset(new PDBContext(*CoffObject, std::move(Session)));
}
}
@@ -375,21 +411,26 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
assert(Context);
auto InfoOrErr =
SymbolizableObjectFile::create(Objects.first, std::move(Context));
+ std::unique_ptr<SymbolizableModule> SymMod;
+ if (InfoOrErr)
+ SymMod = std::move(InfoOrErr.get());
auto InsertResult =
- Modules.insert(std::make_pair(ModuleName, std::move(InfoOrErr)));
+ Modules.insert(std::make_pair(ModuleName, std::move(SymMod)));
assert(InsertResult.second);
- if (auto EC = InsertResult.first->second.getError())
- return EC;
- return InsertResult.first->second->get();
+ if (auto EC = InfoOrErr.getError())
+ return errorCodeToError(EC);
+ return InsertResult.first->second.get();
}
+namespace {
+
// Undo these various manglings for Win32 extern "C" functions:
// cdecl - _foo
// stdcall - _foo@12
// fastcall - @foo@12
// vectorcall - foo@@12
// These are all different linkage names for 'foo'.
-static StringRef demanglePE32ExternCFunc(StringRef SymbolName) {
+StringRef demanglePE32ExternCFunc(StringRef SymbolName) {
// Remove any '_' or '@' prefix.
char Front = SymbolName.empty() ? '\0' : SymbolName[0];
if (Front == '_' || Front == '@')
@@ -412,6 +453,8 @@ static StringRef demanglePE32ExternCFunc(StringRef SymbolName) {
return SymbolName;
}
+} // end anonymous namespace
+
#if !defined(_MSC_VER)
// Assume that __cxa_demangle is provided by libcxxabi (except for Windows).
extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer,
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 41c8da40346a..a8e68bf49abc 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -103,12 +103,10 @@ public:
/// \brief Returns the address the GlobalVariable should be written into. The
/// GVMemoryBlock object prefixes that.
static char *Create(const GlobalVariable *GV, const DataLayout& TD) {
- Type *ElTy = GV->getType()->getElementType();
+ Type *ElTy = GV->getValueType();
size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
void *RawMemory = ::operator new(
- RoundUpToAlignment(sizeof(GVMemoryBlock),
- TD.getPreferredAlignment(GV))
- + GVSize);
+ alignTo(sizeof(GVMemoryBlock), TD.getPreferredAlignment(GV)) + GVSize);
new(RawMemory) GVMemoryBlock(GV);
return static_cast<char*>(RawMemory) + sizeof(GVMemoryBlock);
}
@@ -237,10 +235,8 @@ void ExecutionEngine::clearAllGlobalMappings() {
void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) {
MutexGuard locked(lock);
- for (Function &FI : *M)
- EEState.RemoveMapping(getMangledName(&FI));
- for (GlobalVariable &GI : M->globals())
- EEState.RemoveMapping(getMangledName(&GI));
+ for (GlobalObject &GO : M->global_objects())
+ EEState.RemoveMapping(getMangledName(&GO));
}
uint64_t ExecutionEngine::updateGlobalMapping(const GlobalValue *GV,
@@ -476,8 +472,7 @@ EngineBuilder::EngineBuilder() : EngineBuilder(nullptr) {}
EngineBuilder::EngineBuilder(std::unique_ptr<Module> M)
: M(std::move(M)), WhichEngine(EngineKind::Either), ErrorStr(nullptr),
OptLevel(CodeGenOpt::Default), MemMgr(nullptr), Resolver(nullptr),
- RelocModel(Reloc::Default), CMModel(CodeModel::JITDefault),
- UseOrcMCJITReplacement(false) {
+ CMModel(CodeModel::JITDefault), UseOrcMCJITReplacement(false) {
// IR module verification is enabled by default in debug builds, and disabled
// by default in release builds.
#ifndef NDEBUG
@@ -1355,7 +1350,7 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
if (!GV->isThreadLocal())
InitializeMemory(GV->getInitializer(), GA);
- Type *ElTy = GV->getType()->getElementType();
+ Type *ElTy = GV->getValueType();
size_t GVSize = (size_t)getDataLayout().getTypeAllocSize(ElTy);
NumInitBytes += (unsigned)GVSize;
++NumGlobals;
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index ff7c4dce0d5d..d6b209a91d76 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -17,6 +17,7 @@
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/CodeGenCWrappers.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
#include <cstring>
@@ -215,10 +216,12 @@ void LLVMDisposeExecutionEngine(LLVMExecutionEngineRef EE) {
}
void LLVMRunStaticConstructors(LLVMExecutionEngineRef EE) {
+ unwrap(EE)->finalizeObject();
unwrap(EE)->runStaticConstructorsDestructors(false);
}
void LLVMRunStaticDestructors(LLVMExecutionEngineRef EE) {
+ unwrap(EE)->finalizeObject();
unwrap(EE)->runStaticConstructorsDestructors(true);
}
diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
index 65f2a2f51f9b..3b8c4b973e68 100644
--- a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
@@ -3,9 +3,9 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/.. )
if( HAVE_LIBDL )
set(LLVM_INTEL_JIT_LIBS ${CMAKE_DL_LIBS})
endif()
-if( HAVE_LIBPTHREAD )
- set(LLVM_INTEL_JIT_LIBS pthread ${LLVM_INTEL_JIT_LIBS})
-endif()
+
+set(LLVM_INTEL_JIT_LIBS ${PTHREAD_LIB} ${LLVM_INTEL_JIT_LIBS})
+
add_llvm_library(LLVMIntelJITEvents
IntelJITEventListener.cpp
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index a131763193c0..0051c69efb7d 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -113,16 +113,29 @@ void IntelJITEventListener::NotifyObjectEmitted(
std::vector<LineNumberInfo> LineInfo;
std::string SourceFileName;
- if (Sym.getType() != SymbolRef::ST_Function)
+ Expected<SymbolRef::Type> SymTypeOrErr = Sym.getType();
+ if (!SymTypeOrErr) {
+ // TODO: Actually report errors helpfully.
+ consumeError(SymTypeOrErr.takeError());
+ continue;
+ }
+ SymbolRef::Type SymType = *SymTypeOrErr;
+ if (SymType != SymbolRef::ST_Function)
continue;
- ErrorOr<StringRef> Name = Sym.getName();
- if (!Name)
+ Expected<StringRef> Name = Sym.getName();
+ if (!Name) {
+ // TODO: Actually report errors helpfully.
+ consumeError(Name.takeError());
continue;
+ }
- ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
- if (AddrOrErr.getError())
+ Expected<uint64_t> AddrOrErr = Sym.getAddress();
+ if (!AddrOrErr) {
+ // TODO: Actually report errors helpfully.
+ consumeError(AddrOrErr.takeError());
continue;
+ }
uint64_t Addr = *AddrOrErr;
uint64_t Size = P.second;
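The hunk above is one instance of the ErrorOr-to-Expected migration running through this patch: check the Expected value, and on failure either handle the Error or consume it explicitly before bailing out. A minimal sketch of the pattern, assuming the Object and Error APIs at this revision (printFunctionSymbol is a made-up helper, not part of the patch):

#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void printFunctionSymbol(const object::SymbolRef &Sym) {
  Expected<StringRef> NameOrErr = Sym.getName();
  if (!NameOrErr) {
    // An unhandled Error aborts in debug builds, so consume it explicitly.
    consumeError(NameOrErr.takeError());
    return;
  }
  Expected<uint64_t> AddrOrErr = Sym.getAddress();
  if (!AddrOrErr) {
    consumeError(AddrOrErr.takeError());
    return;
  }
  outs() << *NameOrErr << " @ 0x";
  outs().write_hex(*AddrOrErr) << "\n";
}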
diff --git a/lib/ExecutionEngine/IntelJITEvents/Makefile b/lib/ExecutionEngine/IntelJITEvents/Makefile
deleted file mode 100644
index dcf3126cc529..000000000000
--- a/lib/ExecutionEngine/IntelJITEvents/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- lib/ExecutionEngine/JITProfile/Makefile -------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../..
-LIBRARYNAME = LLVMIntelJITEvents
-
-include $(LEVEL)/Makefile.config
-
-SOURCES := IntelJITEventListener.cpp \
- jitprofiling.c
-CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/ExecutionEngine/Interpreter/Makefile b/lib/ExecutionEngine/Interpreter/Makefile
deleted file mode 100644
index 5def1365c61a..000000000000
--- a/lib/ExecutionEngine/Interpreter/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/ExecutionEngine/Interpreter/Makefile ------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMInterpreter
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 6cbebe98e7c9..7fb328babfe8 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -85,6 +85,9 @@ MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> TM,
std::unique_ptr<Module> First = std::move(Modules[0]);
Modules.clear();
+ if (First->getDataLayout().isDefault())
+ First->setDataLayout(getDataLayout());
+
OwnedModules.addModule(std::move(First));
RegisterJITEventListener(JITEventListener::createGDBRegistrationListener());
}
@@ -103,6 +106,10 @@ MCJIT::~MCJIT() {
void MCJIT::addModule(std::unique_ptr<Module> M) {
MutexGuard locked(lock);
+
+ if (M->getDataLayout().isDefault())
+ M->setDataLayout(getDataLayout());
+
OwnedModules.addModule(std::move(M));
}
@@ -192,11 +199,7 @@ void MCJIT::generateCodeForModule(Module *M) {
if (ObjCache)
ObjectToLoad = ObjCache->getObject(M);
- if (M->getDataLayout().isDefault()) {
- M->setDataLayout(getDataLayout());
- } else {
- assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch");
- }
+ assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch");
// If the cache did not contain a suitable object, compile the object
if (!ObjectToLoad) {
@@ -206,8 +209,15 @@ void MCJIT::generateCodeForModule(Module *M) {
// Load the object into the dynamic linker.
// MCJIT now owns the ObjectImage pointer (via its LoadedObjects list).
- ErrorOr<std::unique_ptr<object::ObjectFile>> LoadedObject =
+ Expected<std::unique_ptr<object::ObjectFile>> LoadedObject =
object::ObjectFile::createObjectFile(ObjectToLoad->getMemBufferRef());
+ if (!LoadedObject) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(LoadedObject.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
std::unique_ptr<RuntimeDyld::LoadedObjectInfo> L =
Dyld.loadObject(*LoadedObject.get());
@@ -317,15 +327,19 @@ RuntimeDyld::SymbolInfo MCJIT::findSymbol(const std::string &Name,
for (object::OwningBinary<object::Archive> &OB : Archives) {
object::Archive *A = OB.getBinary();
// Look for our symbols in each Archive
- object::Archive::child_iterator ChildIt = A->findSym(Name);
- if (std::error_code EC = ChildIt->getError())
- report_fatal_error(EC.message());
- if (ChildIt != A->child_end()) {
+ auto OptionalChildOrErr = A->findSym(Name);
+ if (!OptionalChildOrErr)
+ report_fatal_error(OptionalChildOrErr.takeError());
+ auto &OptionalChild = *OptionalChildOrErr;
+ if (OptionalChild) {
// FIXME: Support nested archives?
- ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr =
- (*ChildIt)->getAsBinary();
- if (ChildBinOrErr.getError())
+ Expected<std::unique_ptr<object::Binary>> ChildBinOrErr =
+ OptionalChild->getAsBinary();
+ if (!ChildBinOrErr) {
+ // TODO: Actually report errors helpfully.
+ consumeError(ChildBinOrErr.takeError());
continue;
+ }
std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
if (ChildBin->isObject()) {
std::unique_ptr<object::ObjectFile> OF(
@@ -480,6 +494,7 @@ GenericValue MCJIT::runFunction(Function *F, ArrayRef<GenericValue> ArgValues) {
assert(F && "Function *F was null at entry to run()");
void *FPtr = getPointerToFunction(F);
+ finalizeModule(F->getParent());
assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
FunctionType *FTy = F->getFunctionType();
Type *RetTy = FTy->getReturnType();
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 3c9d2fd50336..e25f76cd57f3 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -10,7 +10,6 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H
#define LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
diff --git a/lib/ExecutionEngine/MCJIT/Makefile b/lib/ExecutionEngine/MCJIT/Makefile
deleted file mode 100644
index 967efbc0efa4..000000000000
--- a/lib/ExecutionEngine/MCJIT/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/ExecutionEngine/MCJIT/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMCJIT
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/Makefile b/lib/ExecutionEngine/Makefile
deleted file mode 100644
index e9a5b79ddf62..000000000000
--- a/lib/ExecutionEngine/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/ExecutionEngine/Makefile ------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../..
-LIBRARYNAME = LLVMExecutionEngine
-
-include $(LEVEL)/Makefile.config
-
-PARALLEL_DIRS = Interpreter MCJIT Orc RuntimeDyld
-
-ifeq ($(USE_INTEL_JITEVENTS), 1)
-PARALLEL_DIRS += IntelJITEvents
-endif
-
-ifeq ($(USE_OPROFILE), 1)
-PARALLEL_DIRS += OProfileJIT
-endif
-
-include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/ExecutionEngine/OProfileJIT/Makefile b/lib/ExecutionEngine/OProfileJIT/Makefile
deleted file mode 100644
index fd3adce26c1f..000000000000
--- a/lib/ExecutionEngine/OProfileJIT/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- lib/ExecutionEngine/OProfileJIT/Makefile ------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../..
-LIBRARYNAME = LLVMOProfileJIT
-
-include $(LEVEL)/Makefile.config
-
-SOURCES += OProfileJITEventListener.cpp \
- OProfileWrapper.cpp
-CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
index 04edbd2a300e..d96278a8137b 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -23,7 +23,6 @@
#include <cstring>
#include <dirent.h>
#include <fcntl.h>
-#include <sstream>
#include <stddef.h>
#include <sys/stat.h>
#include <unistd.h>
diff --git a/lib/ExecutionEngine/Orc/CMakeLists.txt b/lib/ExecutionEngine/Orc/CMakeLists.txt
index d26f212e00c9..76720a7c52ec 100644
--- a/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -2,9 +2,8 @@ add_llvm_library(LLVMOrcJIT
ExecutionUtils.cpp
IndirectionUtils.cpp
NullResolver.cpp
- OrcArchitectureSupport.cpp
+ OrcABISupport.cpp
OrcCBindings.cpp
- OrcCBindingsStack.cpp
OrcError.cpp
OrcMCJITReplacement.cpp
OrcRemoteTargetRPCAPI.cpp
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 34564e42b10f..6f7c29feef0d 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -10,10 +10,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
+#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include <set>
#include <sstream>
namespace llvm {
@@ -22,6 +22,55 @@ namespace orc {
void JITCompileCallbackManager::anchor() {}
void IndirectStubsManager::anchor() {}
+std::unique_ptr<JITCompileCallbackManager>
+createLocalCompileCallbackManager(const Triple &T,
+ TargetAddress ErrorHandlerAddress) {
+ switch (T.getArch()) {
+ default: return nullptr;
+
+ case Triple::x86: {
+ typedef orc::LocalJITCompileCallbackManager<orc::OrcI386> CCMgrT;
+ return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ }
+
+ case Triple::x86_64: {
+ if (T.getOS() == Triple::OSType::Win32) {
+ typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64_Win32> CCMgrT;
+ return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ } else {
+ typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64_SysV> CCMgrT;
+ return llvm::make_unique<CCMgrT>(ErrorHandlerAddress);
+ }
+ }
+ }
+}
+
+std::function<std::unique_ptr<IndirectStubsManager>()>
+createLocalIndirectStubsManagerBuilder(const Triple &T) {
+ switch (T.getArch()) {
+ default: return nullptr;
+
+ case Triple::x86:
+ return [](){
+ return llvm::make_unique<
+ orc::LocalIndirectStubsManager<orc::OrcI386>>();
+ };
+
+ case Triple::x86_64:
+ if (T.getOS() == Triple::OSType::Win32) {
+ return [](){
+ return llvm::make_unique<
+ orc::LocalIndirectStubsManager<orc::OrcX86_64_Win32>>();
+ };
+ } else {
+ return [](){
+ return llvm::make_unique<
+ orc::LocalIndirectStubsManager<orc::OrcX86_64_SysV>>();
+ };
+ }
+ }
+}
+
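A hedged usage sketch for the two helpers added above, assuming the Orc headers at this revision; buildLazyJITPieces and its fallback comment are illustrative only. A null return from either helper means the target triple has no ABI support yet, so callers should be ready to fall back to eager compilation.

#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include <memory>
using namespace llvm;
using namespace llvm::orc;

static void buildLazyJITPieces(const Triple &T) {
  auto CCMgr = createLocalCompileCallbackManager(T, /*ErrorHandlerAddress=*/0);
  auto StubsMgrBuilder = createLocalIndirectStubsManagerBuilder(T);
  if (!CCMgr || !StubsMgrBuilder)
    return; // Unsupported target: fall back to eager compilation.
  std::unique_ptr<IndirectStubsManager> StubsMgr = StubsMgrBuilder();
  // Hand CCMgr and StubsMgrBuilder to a CompileOnDemandLayer from here.
  (void)StubsMgr;
}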
Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) {
Constant *AddrIntVal =
ConstantInt::get(Type::getInt64Ty(FT.getContext()), Addr);
@@ -95,7 +144,7 @@ static void raiseVisibilityOnValue(GlobalValue &V, GlobalRenamer &R) {
V.setLinkage(GlobalValue::ExternalLinkage);
V.setVisibility(GlobalValue::HiddenVisibility);
}
- V.setUnnamedAddr(false);
+ V.setUnnamedAddr(GlobalValue::UnnamedAddr::None);
assert(!R.needsRenaming(V) && "Invalid global name.");
}
@@ -116,7 +165,7 @@ Function* cloneFunctionDecl(Module &Dst, const Function &F,
ValueToValueMapTy *VMap) {
assert(F.getParent() != &Dst && "Can't copy decl over existing function.");
Function *NewF =
- Function::Create(cast<FunctionType>(F.getType()->getElementType()),
+ Function::Create(cast<FunctionType>(F.getValueType()),
F.getLinkage(), F.getName(), &Dst);
NewF->copyAttributesFrom(&F);
@@ -154,7 +203,7 @@ GlobalVariable* cloneGlobalVariableDecl(Module &Dst, const GlobalVariable &GV,
ValueToValueMapTy *VMap) {
assert(GV.getParent() != &Dst && "Can't copy decl over existing global var.");
GlobalVariable *NewGV = new GlobalVariable(
- Dst, GV.getType()->getElementType(), GV.isConstant(),
+ Dst, GV.getValueType(), GV.isConstant(),
GV.getLinkage(), nullptr, GV.getName(), nullptr,
GV.getThreadLocalMode(), GV.getType()->getAddressSpace());
NewGV->copyAttributesFrom(&GV);
diff --git a/lib/ExecutionEngine/Orc/Makefile b/lib/ExecutionEngine/Orc/Makefile
deleted file mode 100644
index ac302348ee7e..000000000000
--- a/lib/ExecutionEngine/Orc/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/ExecutionEngine/OrcJIT/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMOrcJIT
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/Orc/OrcABISupport.cpp b/lib/ExecutionEngine/Orc/OrcABISupport.cpp
new file mode 100644
index 000000000000..9869b6c7050c
--- /dev/null
+++ b/lib/ExecutionEngine/Orc/OrcABISupport.cpp
@@ -0,0 +1,542 @@
+//===------------- OrcABISupport.cpp - ABI specific support code ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Process.h"
+
+namespace llvm {
+namespace orc {
+
+void OrcAArch64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
+ void *CallbackMgr) {
+
+ const uint32_t ResolverCode[] = {
+ // resolver_entry:
+ 0xa9bf47fd, // 0x000: stp x29, x17, [sp, #-16]!
+ 0x910003fd, // 0x004: mov x29, sp
+ 0xa9bf73fb, // 0x008: stp x27, x28, [sp, #-16]!
+ 0xa9bf6bf9, // 0x00c: stp x25, x26, [sp, #-16]!
+ 0xa9bf63f7, // 0x010: stp x23, x24, [sp, #-16]!
+ 0xa9bf5bf5, // 0x014: stp x21, x22, [sp, #-16]!
+ 0xa9bf53f3, // 0x018: stp x19, x20, [sp, #-16]!
+ 0xa9bf3fee, // 0x01c: stp x14, x15, [sp, #-16]!
+ 0xa9bf37ec, // 0x020: stp x12, x13, [sp, #-16]!
+ 0xa9bf2fea, // 0x024: stp x10, x11, [sp, #-16]!
+ 0xa9bf27e8, // 0x028: stp x8, x9, [sp, #-16]!
+ 0xa9bf1fe6, // 0x02c: stp x6, x7, [sp, #-16]!
+ 0xa9bf17e4, // 0x030: stp x4, x5, [sp, #-16]!
+ 0xa9bf0fe2, // 0x034: stp x2, x3, [sp, #-16]!
+ 0xa9bf07e0, // 0x038: stp x0, x1, [sp, #-16]!
+ 0xadbf7ffe, // 0x03c: stp q30, q31, [sp, #-32]!
+ 0xadbf77fc, // 0x040: stp q28, q29, [sp, #-32]!
+ 0xadbf6ffa, // 0x044: stp q26, q27, [sp, #-32]!
+ 0xadbf67f8, // 0x048: stp q24, q25, [sp, #-32]!
+ 0xadbf5ff6, // 0x04c: stp q22, q23, [sp, #-32]!
+ 0xadbf57f4, // 0x050: stp q20, q21, [sp, #-32]!
+ 0xadbf4ff2, // 0x054: stp q18, q19, [sp, #-32]!
+ 0xadbf47f0, // 0x058: stp q16, q17, [sp, #-32]!
+ 0xadbf3fee, // 0x05c: stp q14, q15, [sp, #-32]!
+ 0xadbf37ec, // 0x060: stp q12, q13, [sp, #-32]!
+ 0xadbf2fea, // 0x064: stp q10, q11, [sp, #-32]!
+ 0xadbf27e8, // 0x068: stp q8, q9, [sp, #-32]!
+ 0xadbf1fe6, // 0x06c: stp q6, q7, [sp, #-32]!
+ 0xadbf17e4, // 0x070: stp q4, q5, [sp, #-32]!
+ 0xadbf0fe2, // 0x074: stp q2, q3, [sp, #-32]!
+ 0xadbf07e0, // 0x078: stp q0, q1, [sp, #-32]!
+ 0x580004e0, // 0x07c: ldr x0, Lcallbackmgr
+ 0xaa1e03e1, // 0x080: mov x1, x30
+ 0xd1003021, // 0x084: sub x1, x1, #12
+ 0x58000442, // 0x088: ldr x2, Lreentry_fn_ptr
+ 0xd63f0040, // 0x08c: blr x2
+ 0xaa0003f1, // 0x090: mov x17, x0
+ 0xacc107e0, // 0x094: ldp q0, q1, [sp], #32
+ 0xacc10fe2, // 0x098: ldp q2, q3, [sp], #32
+ 0xacc117e4, // 0x09c: ldp q4, q5, [sp], #32
+ 0xacc11fe6, // 0x0a0: ldp q6, q7, [sp], #32
+ 0xacc127e8, // 0x0a4: ldp q8, q9, [sp], #32
+ 0xacc12fea, // 0x0a8: ldp q10, q11, [sp], #32
+ 0xacc137ec, // 0x0ac: ldp q12, q13, [sp], #32
+ 0xacc13fee, // 0x0b0: ldp q14, q15, [sp], #32
+ 0xacc147f0, // 0x0b4: ldp q16, q17, [sp], #32
+ 0xacc14ff2, // 0x0b8: ldp q18, q19, [sp], #32
+ 0xacc157f4, // 0x0bc: ldp q20, q21, [sp], #32
+ 0xacc15ff6, // 0x0c0: ldp q22, q23, [sp], #32
+ 0xacc167f8, // 0x0c4: ldp q24, q25, [sp], #32
+ 0xacc16ffa, // 0x0c8: ldp q26, q27, [sp], #32
+ 0xacc177fc, // 0x0cc: ldp q28, q29, [sp], #32
+ 0xacc17ffe, // 0x0d0: ldp q30, q31, [sp], #32
+ 0xa8c107e0, // 0x0d4: ldp x0, x1, [sp], #16
+ 0xa8c10fe2, // 0x0d8: ldp x2, x3, [sp], #16
+ 0xa8c117e4, // 0x0dc: ldp x4, x5, [sp], #16
+ 0xa8c11fe6, // 0x0e0: ldp x6, x7, [sp], #16
+ 0xa8c127e8, // 0x0e4: ldp x8, x9, [sp], #16
+ 0xa8c12fea, // 0x0e8: ldp x10, x11, [sp], #16
+ 0xa8c137ec, // 0x0ec: ldp x12, x13, [sp], #16
+ 0xa8c13fee, // 0x0f0: ldp x14, x15, [sp], #16
+ 0xa8c153f3, // 0x0f4: ldp x19, x20, [sp], #16
+ 0xa8c15bf5, // 0x0f8: ldp x21, x22, [sp], #16
+ 0xa8c163f7, // 0x0fc: ldp x23, x24, [sp], #16
+ 0xa8c16bf9, // 0x100: ldp x25, x26, [sp], #16
+ 0xa8c173fb, // 0x104: ldp x27, x28, [sp], #16
+ 0xa8c17bfd, // 0x108: ldp x29, x30, [sp], #16
+ 0xd65f0220, // 0x10c: ret x17
+ 0x01234567, // 0x110: Lreentry_fn_ptr:
+ 0xdeadbeef, // 0x114: .quad 0
+ 0x98765432, // 0x118: Lcallbackmgr:
+ 0xcafef00d // 0x11c: .quad 0
+ };
+
+ const unsigned ReentryFnAddrOffset = 0x110;
+ const unsigned CallbackMgrAddrOffset = 0x118;
+
+ memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
+ memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
+ sizeof(CallbackMgr));
+}
+
+void OrcAArch64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
+ unsigned NumTrampolines) {
+
+ unsigned OffsetToPtr = alignTo(NumTrampolines * TrampolineSize, 8);
+
+ memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void *));
+
+ // OffsetToPtr is actually the offset from the PC of the 2nd instruction (the
+ // ldr in each trampoline), so subtract the size of one 32-bit instruction.
+ OffsetToPtr -= 4;
+
+ uint32_t *Trampolines = reinterpret_cast<uint32_t *>(TrampolineMem);
+
+ for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) {
+ Trampolines[3 * I + 0] = 0xaa1e03f1; // mov x17, x30
+ Trampolines[3 * I + 1] = 0x58000010 | (OffsetToPtr << 3); // ldr x16, Lptr
+ Trampolines[3 * I + 2] = 0xd63f0200; // blr x16
+ }
+
+}
+
+Error OrcAArch64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
+ unsigned MinStubs,
+ void *InitialPtrVal) {
+ // Stub format is:
+ //
+ // .section __orc_stubs
+ // stub1:
+ // ldr x0, ptr1 ; PC-rel load of ptr1
+ // br x0 ; Jump to resolver
+ // stub2:
+ // ldr x0, ptr2 ; PC-rel load of ptr2
+ // br x0 ; Jump to resolver
+ //
+ // ...
+ //
+ // .section __orc_ptrs
+ // ptr1:
+ // .quad 0x0
+ // ptr2:
+ // .quad 0x0
+ //
+ // ...
+
+ const unsigned StubSize = IndirectStubsInfo::StubSize;
+
+ // Emit at least MinStubs, rounded up to fill the pages allocated.
+ unsigned PageSize = sys::Process::getPageSize();
+ unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
+ unsigned NumStubs = (NumPages * PageSize) / StubSize;
+
+ // Allocate memory for stubs and pointers in one call.
+ std::error_code EC;
+ auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
+ 2 * NumPages * PageSize, nullptr,
+ sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
+
+ if (EC)
+ return errorCodeToError(EC);
+
+ // Create separate MemoryBlocks representing the stubs and pointers.
+ sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
+ sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
+ NumPages * PageSize,
+ NumPages * PageSize);
+
+ // Populate the stubs page and mark it executable.
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
+ uint64_t PtrOffsetField = static_cast<uint64_t>(NumPages * PageSize)
+ << 3;
+
+ for (unsigned I = 0; I < NumStubs; ++I)
+ Stub[I] = 0xd61f020058000010 | PtrOffsetField;
+
+ if (auto EC = sys::Memory::protectMappedMemory(
+ StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
+ return errorCodeToError(EC);
+
+ // Initialize all pointers to point at InitialPtrVal.
+ void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
+ for (unsigned I = 0; I < NumStubs; ++I)
+ Ptr[I] = InitialPtrVal;
+
+ StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
+
+ return Error::success();
+}
+
+void OrcX86_64_Base::writeTrampolines(uint8_t *TrampolineMem,
+ void *ResolverAddr,
+ unsigned NumTrampolines) {
+
+ unsigned OffsetToPtr = NumTrampolines * TrampolineSize;
+
+ memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void *));
+
+ uint64_t *Trampolines = reinterpret_cast<uint64_t *>(TrampolineMem);
+ uint64_t CallIndirPCRel = 0xf1c40000000015ff;
+
+ for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize)
+ Trampolines[I] = CallIndirPCRel | ((OffsetToPtr - 6) << 16);
+}
+
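The 64-bit constant above packs a 6-byte "callq *disp32(%rip)" plus two bytes of invalid-opcode padding (0xC4, 0xF1) into one little-endian word, with the RIP-relative displacement spliced in at bit 16; displacements are measured from the end of the call, hence the -6. A standalone sketch of that arithmetic (the trampoline count is a made-up example) prints the resulting bytes:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint64_t CallIndirPCRel = 0xf1c40000000015ffULL; // ff 15 <disp32> c4 f1
  const unsigned TrampolineSize = 8;
  unsigned NumTrampolines = 4;                            // hypothetical count
  unsigned OffsetToPtr = NumTrampolines * TrampolineSize; // distance to resolver ptr

  for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) {
    uint64_t T = CallIndirPCRel | (uint64_t)(OffsetToPtr - 6) << 16;
    uint8_t Bytes[8];
    std::memcpy(Bytes, &T, 8); // byte order matches the host; x86-64 is little-endian
    std::printf("trampoline %u:", I);
    for (unsigned J = 0; J < 8; ++J)
      std::printf(" %02x", Bytes[J]);
    std::printf("\n"); // e.g. "ff 15 1a 00 00 00 c4 f1" for OffsetToPtr == 32
  }
  return 0;
}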
+Error OrcX86_64_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
+ unsigned MinStubs,
+ void *InitialPtrVal) {
+ // Stub format is:
+ //
+ // .section __orc_stubs
+ // stub1:
+ // jmpq *ptr1(%rip)
+ // .byte 0xC4 ; <- Invalid opcode padding.
+ // .byte 0xF1
+ // stub2:
+ // jmpq *ptr2(%rip)
+ //
+ // ...
+ //
+ // .section __orc_ptrs
+ // ptr1:
+ // .quad 0x0
+ // ptr2:
+ // .quad 0x0
+ //
+ // ...
+
+ const unsigned StubSize = IndirectStubsInfo::StubSize;
+
+ // Emit at least MinStubs, rounded up to fill the pages allocated.
+ unsigned PageSize = sys::Process::getPageSize();
+ unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
+ unsigned NumStubs = (NumPages * PageSize) / StubSize;
+
+ // Allocate memory for stubs and pointers in one call.
+ std::error_code EC;
+ auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
+ 2 * NumPages * PageSize, nullptr,
+ sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
+
+ if (EC)
+ return errorCodeToError(EC);
+
+ // Create separate MemoryBlocks representing the stubs and pointers.
+ sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
+ sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
+ NumPages * PageSize,
+ NumPages * PageSize);
+
+ // Populate the stubs page and mark it executable.
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
+ uint64_t PtrOffsetField = static_cast<uint64_t>(NumPages * PageSize - 6)
+ << 16;
+ for (unsigned I = 0; I < NumStubs; ++I)
+ Stub[I] = 0xF1C40000000025ff | PtrOffsetField;
+
+ if (auto EC = sys::Memory::protectMappedMemory(
+ StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
+ return errorCodeToError(EC);
+
+ // Initialize all pointers to point at InitialPtrVal.
+ void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
+ for (unsigned I = 0; I < NumStubs; ++I)
+ Ptr[I] = InitialPtrVal;
+
+ StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
+
+ return Error::success();
+}
+
+void OrcX86_64_SysV::writeResolverCode(uint8_t *ResolverMem,
+ JITReentryFn ReentryFn,
+ void *CallbackMgr) {
+
+ const uint8_t ResolverCode[] = {
+ // resolver_entry:
+ 0x55, // 0x00: pushq %rbp
+ 0x48, 0x89, 0xe5, // 0x01: movq %rsp, %rbp
+ 0x50, // 0x04: pushq %rax
+ 0x53, // 0x05: pushq %rbx
+ 0x51, // 0x06: pushq %rcx
+ 0x52, // 0x07: pushq %rdx
+ 0x56, // 0x08: pushq %rsi
+ 0x57, // 0x09: pushq %rdi
+ 0x41, 0x50, // 0x0a: pushq %r8
+ 0x41, 0x51, // 0x0c: pushq %r9
+ 0x41, 0x52, // 0x0e: pushq %r10
+ 0x41, 0x53, // 0x10: pushq %r11
+ 0x41, 0x54, // 0x12: pushq %r12
+ 0x41, 0x55, // 0x14: pushq %r13
+ 0x41, 0x56, // 0x16: pushq %r14
+ 0x41, 0x57, // 0x18: pushq %r15
+ 0x48, 0x81, 0xec, 0x08, 0x02, 0x00, 0x00, // 0x1a: subq 0x208, %rsp
+ 0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp)
+ 0x48, 0xbf, // 0x26: movabsq <CBMgr>, %rdi
+
+ // 0x28: Callback manager addr.
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x48, 0x8b, 0x75, 0x08, // 0x30: movq 8(%rbp), %rsi
+ 0x48, 0x83, 0xee, 0x06, // 0x34: subq $6, %rsi
+ 0x48, 0xb8, // 0x38: movabsq <REntry>, %rax
+
+ // 0x3a: JIT re-entry fn addr:
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0xff, 0xd0, // 0x42: callq *%rax
+ 0x48, 0x89, 0x45, 0x08, // 0x44: movq %rax, 8(%rbp)
+ 0x48, 0x0f, 0xae, 0x0c, 0x24, // 0x48: fxrstor64 (%rsp)
+ 0x48, 0x81, 0xc4, 0x08, 0x02, 0x00, 0x00, // 0x4d: addq 0x208, %rsp
+ 0x41, 0x5f, // 0x54: popq %r15
+ 0x41, 0x5e, // 0x56: popq %r14
+ 0x41, 0x5d, // 0x58: popq %r13
+ 0x41, 0x5c, // 0x5a: popq %r12
+ 0x41, 0x5b, // 0x5c: popq %r11
+ 0x41, 0x5a, // 0x5e: popq %r10
+ 0x41, 0x59, // 0x60: popq %r9
+ 0x41, 0x58, // 0x62: popq %r8
+ 0x5f, // 0x64: popq %rdi
+ 0x5e, // 0x65: popq %rsi
+ 0x5a, // 0x66: popq %rdx
+ 0x59, // 0x67: popq %rcx
+ 0x5b, // 0x68: popq %rbx
+ 0x58, // 0x69: popq %rax
+ 0x5d, // 0x6a: popq %rbp
+ 0xc3, // 0x6b: retq
+ };
+
+ const unsigned ReentryFnAddrOffset = 0x3a;
+ const unsigned CallbackMgrAddrOffset = 0x28;
+
+ memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
+ memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
+ sizeof(CallbackMgr));
+}
+
+void OrcX86_64_Win32::writeResolverCode(uint8_t *ResolverMem,
+ JITReentryFn ReentryFn,
+ void *CallbackMgr) {
+
+ // ResolverCode is similar to the OrcX86_64_SysV resolver above, adjusted for
+ // the Windows x64 calling convention: the two arguments are passed in %rcx and
+ // %rdx (rather than %rdi and %rsi), and shadow space is allocated on the stack
+ // around the call.
+ const uint8_t ResolverCode[] = {
+ // resolver_entry:
+ 0x55, // 0x00: pushq %rbp
+ 0x48, 0x89, 0xe5, // 0x01: movq %rsp, %rbp
+ 0x50, // 0x04: pushq %rax
+ 0x53, // 0x05: pushq %rbx
+ 0x51, // 0x06: pushq %rcx
+ 0x52, // 0x07: pushq %rdx
+ 0x56, // 0x08: pushq %rsi
+ 0x57, // 0x09: pushq %rdi
+ 0x41, 0x50, // 0x0a: pushq %r8
+ 0x41, 0x51, // 0x0c: pushq %r9
+ 0x41, 0x52, // 0x0e: pushq %r10
+ 0x41, 0x53, // 0x10: pushq %r11
+ 0x41, 0x54, // 0x12: pushq %r12
+ 0x41, 0x55, // 0x14: pushq %r13
+ 0x41, 0x56, // 0x16: pushq %r14
+ 0x41, 0x57, // 0x18: pushq %r15
+ 0x48, 0x81, 0xec, 0x08, 0x02, 0x00, 0x00, // 0x1a: subq 0x208, %rsp
+ 0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp)
+
+ 0x48, 0xb9, // 0x26: movabsq <CBMgr>, %rcx
+ // 0x28: Callback manager addr.
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x48, 0x8B, 0x55, 0x08, // 0x30: mov rdx, [rbp+0x8]
+ 0x48, 0x83, 0xea, 0x06, // 0x34: sub rdx, 0x6
+
+ 0x48, 0xb8, // 0x38: movabsq <REntry>, %rax
+ // 0x3a: JIT re-entry fn addr:
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ // 0x42: sub rsp, 0x20 (Allocate shadow space)
+ 0x48, 0x83, 0xEC, 0x20,
+ 0xff, 0xd0, // 0x46: callq *%rax
+
+ // 0x48: add rsp, 0x20 (Free shadow space)
+ 0x48, 0x83, 0xC4, 0x20,
+
+ 0x48, 0x89, 0x45, 0x08, // 0x4C: movq %rax, 8(%rbp)
+ 0x48, 0x0f, 0xae, 0x0c, 0x24, // 0x50: fxrstor64 (%rsp)
+ 0x48, 0x81, 0xc4, 0x08, 0x02, 0x00, 0x00, // 0x55: addq 0x208, %rsp
+ 0x41, 0x5f, // 0x5C: popq %r15
+ 0x41, 0x5e, // 0x5E: popq %r14
+ 0x41, 0x5d, // 0x60: popq %r13
+ 0x41, 0x5c, // 0x62: popq %r12
+ 0x41, 0x5b, // 0x64: popq %r11
+ 0x41, 0x5a, // 0x66: popq %r10
+ 0x41, 0x59, // 0x68: popq %r9
+ 0x41, 0x58, // 0x6a: popq %r8
+ 0x5f, // 0x6c: popq %rdi
+ 0x5e, // 0x6d: popq %rsi
+ 0x5a, // 0x6e: popq %rdx
+ 0x59, // 0x6f: popq %rcx
+ 0x5b, // 0x70: popq %rbx
+ 0x58, // 0x71: popq %rax
+ 0x5d, // 0x72: popq %rbp
+ 0xc3, // 0x73: retq
+ };
+
+
+ const unsigned ReentryFnAddrOffset = 0x3a;
+ const unsigned CallbackMgrAddrOffset = 0x28;
+
+ memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
+ memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
+ sizeof(CallbackMgr));
+}
+
+void OrcI386::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
+ void *CallbackMgr) {
+
+ const uint8_t ResolverCode[] = {
+ // resolver_entry:
+ 0x55, // 0x00: pushl %ebp
+ 0x89, 0xe5, // 0x01: movl %esp, %ebp
+ 0x54, // 0x03: pushl %esp
+ 0x83, 0xe4, 0xf0, // 0x04: andl $-0x10, %esp
+ 0x50, // 0x07: pushl %eax
+ 0x53, // 0x08: pushl %ebx
+ 0x51, // 0x09: pushl %ecx
+ 0x52, // 0x0a: pushl %edx
+ 0x56, // 0x0b: pushl %esi
+ 0x57, // 0x0c: pushl %edi
+ 0x81, 0xec, 0x18, 0x02, 0x00, 0x00, // 0x0d: subl $0x218, %esp
+ 0x0f, 0xae, 0x44, 0x24, 0x10, // 0x13: fxsave 0x10(%esp)
+ 0x8b, 0x75, 0x04, // 0x18: movl 0x4(%ebp), %esi
+ 0x83, 0xee, 0x05, // 0x1b: subl $0x5, %esi
+ 0x89, 0x74, 0x24, 0x04, // 0x1e: movl %esi, 0x4(%esp)
+ 0xc7, 0x04, 0x24, 0x00, 0x00, 0x00,
+ 0x00, // 0x22: movl <cbmgr>, (%esp)
+ 0xb8, 0x00, 0x00, 0x00, 0x00, // 0x29: movl <reentry>, %eax
+ 0xff, 0xd0, // 0x2e: calll *%eax
+ 0x89, 0x45, 0x04, // 0x30: movl %eax, 0x4(%ebp)
+ 0x0f, 0xae, 0x4c, 0x24, 0x10, // 0x33: fxrstor 0x10(%esp)
+ 0x81, 0xc4, 0x18, 0x02, 0x00, 0x00, // 0x38: addl $0x218, %esp
+ 0x5f, // 0x3e: popl %edi
+ 0x5e, // 0x3f: popl %esi
+ 0x5a, // 0x40: popl %edx
+ 0x59, // 0x41: popl %ecx
+ 0x5b, // 0x42: popl %ebx
+ 0x58, // 0x43: popl %eax
+ 0x8b, 0x65, 0xfc, // 0x44: movl -0x4(%ebp), %esp
+ 0x5d, // 0x48: popl %ebp
+ 0xc3 // 0x49: retl
+ };
+
+ const unsigned ReentryFnAddrOffset = 0x2a;
+ const unsigned CallbackMgrAddrOffset = 0x25;
+
+ memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
+ memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
+ sizeof(CallbackMgr));
+}
+
+void OrcI386::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
+ unsigned NumTrampolines) {
+
+ uint64_t CallRelImm = 0xF1C4C400000000e8;
+ uint64_t Resolver = reinterpret_cast<uint64_t>(ResolverAddr);
+ uint64_t ResolverRel =
+ Resolver - reinterpret_cast<uint64_t>(TrampolineMem) - 5;
+
+ uint64_t *Trampolines = reinterpret_cast<uint64_t *>(TrampolineMem);
+ for (unsigned I = 0; I < NumTrampolines; ++I, ResolverRel -= TrampolineSize)
+ Trampolines[I] = CallRelImm | (ResolverRel << 8);
+}
+
+Error OrcI386::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
+ unsigned MinStubs, void *InitialPtrVal) {
+ // Stub format is:
+ //
+ // .section __orc_stubs
+ // stub1:
+ // jmpq *ptr1
+ // .byte 0xC4 ; <- Invalid opcode padding.
+ // .byte 0xF1
+ // stub2:
+ // jmpq *ptr2
+ //
+ // ...
+ //
+ // .section __orc_ptrs
+ // ptr1:
+ // .quad 0x0
+ // ptr2:
+ // .quad 0x0
+ //
+ // ...
+
+ const unsigned StubSize = IndirectStubsInfo::StubSize;
+
+ // Emit at least MinStubs, rounded up to fill the pages allocated.
+ unsigned PageSize = sys::Process::getPageSize();
+ unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
+ unsigned NumStubs = (NumPages * PageSize) / StubSize;
+
+ // Allocate memory for stubs and pointers in one call.
+ std::error_code EC;
+ auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
+ 2 * NumPages * PageSize, nullptr,
+ sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
+
+ if (EC)
+ return errorCodeToError(EC);
+
+ // Create separate MemoryBlocks representing the stubs and pointers.
+ sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
+ sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
+ NumPages * PageSize,
+ NumPages * PageSize);
+
+ // Populate the stubs page and mark it executable.
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
+ uint64_t PtrAddr = reinterpret_cast<uint64_t>(PtrsBlock.base());
+ for (unsigned I = 0; I < NumStubs; ++I, PtrAddr += 4)
+ Stub[I] = 0xF1C40000000025ff | (PtrAddr << 16);
+
+ if (auto EC = sys::Memory::protectMappedMemory(
+ StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
+ return errorCodeToError(EC);
+
+ // Initialize all pointers to point at InitialPtrVal.
+ void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
+ for (unsigned I = 0; I < NumStubs; ++I)
+ Ptr[I] = InitialPtrVal;
+
+ StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
+
+ return Error::success();
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp b/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp
deleted file mode 100644
index 01e829f7909e..000000000000
--- a/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-//===------ OrcArchSupport.cpp - Architecture specific support code -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h"
-#include "llvm/Support/Process.h"
-#include <array>
-
-namespace llvm {
-namespace orc {
-
-void OrcX86_64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
- void *CallbackMgr) {
-
- const uint8_t ResolverCode[] = {
- // resolver_entry:
- 0x55, // 0x00: pushq %rbp
- 0x48, 0x89, 0xe5, // 0x01: movq %rsp, %rbp
- 0x50, // 0x04: pushq %rax
- 0x53, // 0x05: pushq %rbx
- 0x51, // 0x06: pushq %rcx
- 0x52, // 0x07: pushq %rdx
- 0x56, // 0x08: pushq %rsi
- 0x57, // 0x09: pushq %rdi
- 0x41, 0x50, // 0x0a: pushq %r8
- 0x41, 0x51, // 0x0c: pushq %r9
- 0x41, 0x52, // 0x0e: pushq %r10
- 0x41, 0x53, // 0x10: pushq %r11
- 0x41, 0x54, // 0x12: pushq %r12
- 0x41, 0x55, // 0x14: pushq %r13
- 0x41, 0x56, // 0x16: pushq %r14
- 0x41, 0x57, // 0x18: pushq %r15
- 0x48, 0x81, 0xec, 0x08, 0x02, 0x00, 0x00, // 0x1a: subq 20, %rsp
- 0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp)
- 0x48, 0x8d, 0x3d, 0x43, 0x00, 0x00, 0x00, // 0x26: leaq 67(%rip), %rdi
- 0x48, 0x8b, 0x3f, // 0x2d: movq (%rdi), %rdi
- 0x48, 0x8b, 0x75, 0x08, // 0x30: movq 8(%rbp), %rsi
- 0x48, 0x83, 0xee, 0x06, // 0x34: subq $6, %rsi
- 0x48, 0xb8, // 0x38: movabsq $0, %rax
-
- // 0x3a: JIT re-entry fn addr:
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-
- 0xff, 0xd0, // 0x42: callq *%rax
- 0x48, 0x89, 0x45, 0x08, // 0x44: movq %rax, 8(%rbp)
- 0x48, 0x0f, 0xae, 0x0c, 0x24, // 0x48: fxrstor64 (%rsp)
- 0x48, 0x81, 0xc4, 0x08, 0x02, 0x00, 0x00, // 0x4d: addq 20, %rsp
- 0x41, 0x5f, // 0x54: popq %r15
- 0x41, 0x5e, // 0x56: popq %r14
- 0x41, 0x5d, // 0x58: popq %r13
- 0x41, 0x5c, // 0x5a: popq %r12
- 0x41, 0x5b, // 0x5c: popq %r11
- 0x41, 0x5a, // 0x5e: popq %r10
- 0x41, 0x59, // 0x60: popq %r9
- 0x41, 0x58, // 0x62: popq %r8
- 0x5f, // 0x64: popq %rdi
- 0x5e, // 0x65: popq %rsi
- 0x5a, // 0x66: popq %rdx
- 0x59, // 0x67: popq %rcx
- 0x5b, // 0x68: popq %rbx
- 0x58, // 0x69: popq %rax
- 0x5d, // 0x6a: popq %rbp
- 0xc3, // 0x6b: retq
- 0x00, 0x00, 0x00, 0x00, // 0x6c: <padding>
-
- // 0x70: Callback mgr address.
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- };
-
- const unsigned ReentryFnAddrOffset = 0x3a;
- const unsigned CallbackMgrAddrOffset = 0x70;
-
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
- sizeof(CallbackMgr));
-}
-
-void OrcX86_64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
- unsigned NumTrampolines) {
-
- unsigned OffsetToPtr = NumTrampolines * TrampolineSize;
-
- memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void*));
-
- uint64_t *Trampolines = reinterpret_cast<uint64_t*>(TrampolineMem);
- uint64_t CallIndirPCRel = 0xf1c40000000015ff;
-
- for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize)
- Trampolines[I] = CallIndirPCRel | ((OffsetToPtr - 6) << 16);
-}
-
-std::error_code OrcX86_64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs,
- void *InitialPtrVal) {
- // Stub format is:
- //
- // .section __orc_stubs
- // stub1:
- // jmpq *ptr1(%rip)
- // .byte 0xC4 ; <- Invalid opcode padding.
- // .byte 0xF1
- // stub2:
- // jmpq *ptr2(%rip)
- //
- // ...
- //
- // .section __orc_ptrs
- // ptr1:
- // .quad 0x0
- // ptr2:
- // .quad 0x0
- //
- // ...
-
- const unsigned StubSize = IndirectStubsInfo::StubSize;
-
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- unsigned PageSize = sys::Process::getPageSize();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
-
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem =
- sys::OwningMemoryBlock(
- sys::Memory::allocateMappedMemory(2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ |
- sys::Memory::MF_WRITE,
- EC));
-
- if (EC)
- return EC;
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char*>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
-
- // Populate the stubs page stubs and mark it executable.
- uint64_t *Stub = reinterpret_cast<uint64_t*>(StubsBlock.base());
- uint64_t PtrOffsetField =
- static_cast<uint64_t>(NumPages * PageSize - 6) << 16;
- for (unsigned I = 0; I < NumStubs; ++I)
- Stub[I] = 0xF1C40000000025ff | PtrOffsetField;
-
- if (auto EC = sys::Memory::protectMappedMemory(StubsBlock,
- sys::Memory::MF_READ |
- sys::Memory::MF_EXEC))
- return EC;
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void**>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo.NumStubs = NumStubs;
- StubsInfo.StubsMem = std::move(StubsMem);
-
- return std::error_code();
-}
-
-} // End namespace orc.
-} // End namespace llvm.
diff --git a/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/lib/ExecutionEngine/Orc/OrcCBindings.cpp
index d2379cd441d5..8dcd49aaab5b 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindings.cpp
+++ b/lib/ExecutionEngine/Orc/OrcCBindings.cpp
@@ -17,17 +17,21 @@ LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) {
Triple T(TM2->getTargetTriple());
- auto CompileCallbackMgr = OrcCBindingsStack::createCompileCallbackMgr(T);
+ auto CompileCallbackMgr = orc::createLocalCompileCallbackManager(T, 0);
auto IndirectStubsMgrBuilder =
- OrcCBindingsStack::createIndirectStubsMgrBuilder(T);
+ orc::createLocalIndirectStubsManagerBuilder(T);
- OrcCBindingsStack *JITStack =
- new OrcCBindingsStack(*TM2, std::move(CompileCallbackMgr),
- IndirectStubsMgrBuilder);
+ OrcCBindingsStack *JITStack = new OrcCBindingsStack(
+ *TM2, std::move(CompileCallbackMgr), IndirectStubsMgrBuilder);
return wrap(JITStack);
}
+const char *LLVMOrcGetErrorMsg(LLVMOrcJITStackRef JITStack) {
+ OrcCBindingsStack &J = *unwrap(JITStack);
+ return J.getErrorMessage().c_str();
+}
+
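A hedged sketch of how a client might use the new error-code returns together with LLVMOrcGetErrorMsg; the JIT stack is assumed to have been created elsewhere, and the stub name, address, and helper names are made up for illustration.

#include "llvm-c/OrcBindings.h"
#include <stdio.h>

static void reportIfFailed(LLVMOrcJITStackRef JIT, LLVMOrcErrorCode Err) {
  if (Err != LLVMOrcErrSuccess)
    fprintf(stderr, "orc error: %s\n", LLVMOrcGetErrorMsg(JIT));
}

void updateStub(LLVMOrcJITStackRef JIT, LLVMOrcTargetAddress ImplAddr) {
  reportIfFailed(JIT, LLVMOrcCreateIndirectStub(JIT, "foo_stub", ImplAddr));
  reportIfFailed(JIT, LLVMOrcSetIndirectStubPointer(JIT, "foo_stub", ImplAddr));
}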
void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledName,
const char *SymbolName) {
OrcCBindingsStack &J = *unwrap(JITStack);
@@ -36,9 +40,7 @@ void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledName,
strcpy(*MangledName, Mangled.c_str());
}
-void LLVMOrcDisposeMangledSymbol(char *MangledName) {
- delete[] MangledName;
-}
+void LLVMOrcDisposeMangledSymbol(char *MangledName) { delete[] MangledName; }
LLVMOrcTargetAddress
LLVMOrcCreateLazyCompileCallback(LLVMOrcJITStackRef JITStack,
@@ -48,18 +50,18 @@ LLVMOrcCreateLazyCompileCallback(LLVMOrcJITStackRef JITStack,
return J.createLazyCompileCallback(Callback, CallbackCtx);
}
-void LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack,
- const char *StubName,
- LLVMOrcTargetAddress InitAddr) {
+LLVMOrcErrorCode LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack,
+ const char *StubName,
+ LLVMOrcTargetAddress InitAddr) {
OrcCBindingsStack &J = *unwrap(JITStack);
- J.createIndirectStub(StubName, InitAddr);
+ return J.createIndirectStub(StubName, InitAddr);
}
-void LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
- const char *StubName,
- LLVMOrcTargetAddress NewAddr) {
+LLVMOrcErrorCode LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
+ const char *StubName,
+ LLVMOrcTargetAddress NewAddr) {
OrcCBindingsStack &J = *unwrap(JITStack);
- J.setIndirectStubPointer(StubName, NewAddr);
+ return J.setIndirectStubPointer(StubName, NewAddr);
}
LLVMOrcModuleHandle
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp
deleted file mode 100644
index 956daae372da..000000000000
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===-------- OrcCBindingsStack.cpp - Orc JIT stack for C bindings --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OrcCBindingsStack.h"
-
-#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include <cstdio>
-#include <system_error>
-
-using namespace llvm;
-
-std::unique_ptr<OrcCBindingsStack::CompileCallbackMgr>
-OrcCBindingsStack::createCompileCallbackMgr(Triple T) {
- switch (T.getArch()) {
- default: return nullptr;
-
- case Triple::x86_64: {
- typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64> CCMgrT;
- return llvm::make_unique<CCMgrT>(0);
- }
- }
-}
-
-OrcCBindingsStack::IndirectStubsManagerBuilder
-OrcCBindingsStack::createIndirectStubsMgrBuilder(Triple T) {
- switch (T.getArch()) {
- default: return nullptr;
-
- case Triple::x86_64:
- return [](){
- return llvm::make_unique<
- orc::LocalIndirectStubsManager<orc::OrcX86_64>>();
- };
- }
-}
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index aae6a99432bc..9ae9b20feb0a 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H
#define LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H
+#include "llvm-c/OrcBindings.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
@@ -17,7 +18,7 @@
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm-c/OrcBindings.h"
+#include "llvm/Support/Error.h"
namespace llvm {
@@ -28,19 +29,18 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
class OrcCBindingsStack {
public:
-
typedef orc::JITCompileCallbackManager CompileCallbackMgr;
typedef orc::ObjectLinkingLayer<> ObjLayerT;
typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
- typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr> CODLayerT;
+ typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>
+ CODLayerT;
typedef std::function<std::unique_ptr<CompileCallbackMgr>()>
- CallbackManagerBuilder;
+ CallbackManagerBuilder;
typedef CODLayerT::IndirectStubsManagerBuilderT IndirectStubsManagerBuilder;
private:
-
class GenericHandle {
public:
virtual ~GenericHandle() {}
@@ -49,20 +49,17 @@ private:
virtual void removeModule() = 0;
};
- template <typename LayerT>
- class GenericHandleImpl : public GenericHandle {
+ template <typename LayerT> class GenericHandleImpl : public GenericHandle {
public:
GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle)
- : Layer(Layer), Handle(std::move(Handle)) {}
+ : Layer(Layer), Handle(std::move(Handle)) {}
orc::JITSymbol findSymbolIn(const std::string &Name,
bool ExportedSymbolsOnly) override {
return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
}
- void removeModule() override {
- return Layer.removeModuleSet(Handle);
- }
+ void removeModule() override { return Layer.removeModuleSet(Handle); }
private:
LayerT &Layer;
@@ -77,26 +74,22 @@ private:
}
public:
-
// We need a 'ModuleSetHandleT' to conform to the layer concept.
typedef unsigned ModuleSetHandleT;
typedef unsigned ModuleHandleT;
- static std::unique_ptr<CompileCallbackMgr> createCompileCallbackMgr(Triple T);
- static IndirectStubsManagerBuilder createIndirectStubsMgrBuilder(Triple T);
-
OrcCBindingsStack(TargetMachine &TM,
- std::unique_ptr<CompileCallbackMgr> CCMgr,
+ std::unique_ptr<CompileCallbackMgr> CCMgr,
IndirectStubsManagerBuilder IndirectStubsMgrBuilder)
- : DL(TM.createDataLayout()), CCMgr(std::move(CCMgr)),
- ObjectLayer(),
- CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)),
- CODLayer(CompileLayer,
- [](Function &F) { std::set<Function*> S; S.insert(&F); return S; },
- *this->CCMgr, std::move(IndirectStubsMgrBuilder), false),
- IndirectStubsMgr(IndirectStubsMgrBuilder()),
- CXXRuntimeOverrides([this](const std::string &S) { return mangle(S); }) {}
+ : DL(TM.createDataLayout()), IndirectStubsMgr(IndirectStubsMgrBuilder()),
+ CCMgr(std::move(CCMgr)), ObjectLayer(),
+ CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)),
+ CODLayer(CompileLayer,
+ [](Function &F) { return std::set<Function *>({&F}); },
+ *this->CCMgr, std::move(IndirectStubsMgrBuilder), false),
+ CXXRuntimeOverrides(
+ [this](const std::string &S) { return mangle(S); }) {}
~OrcCBindingsStack() {
// Run any destructors registered with __cxa_atexit.
@@ -124,55 +117,52 @@ public:
createLazyCompileCallback(LLVMOrcLazyCompileCallbackFn Callback,
void *CallbackCtx) {
auto CCInfo = CCMgr->getCompileCallback();
- CCInfo.setCompileAction(
- [=]() -> orc::TargetAddress {
- return Callback(wrap(this), CallbackCtx);
- });
+ CCInfo.setCompileAction([=]() -> orc::TargetAddress {
+ return Callback(wrap(this), CallbackCtx);
+ });
return CCInfo.getAddress();
}
- void createIndirectStub(StringRef StubName, orc::TargetAddress Addr) {
- IndirectStubsMgr->createStub(StubName, Addr, JITSymbolFlags::Exported);
+ LLVMOrcErrorCode createIndirectStub(StringRef StubName,
+ orc::TargetAddress Addr) {
+ return mapError(
+ IndirectStubsMgr->createStub(StubName, Addr, JITSymbolFlags::Exported));
}
- void setIndirectStubPointer(StringRef Name, orc::TargetAddress Addr) {
- IndirectStubsMgr->updatePointer(Name, Addr);
+ LLVMOrcErrorCode setIndirectStubPointer(StringRef Name,
+ orc::TargetAddress Addr) {
+ return mapError(IndirectStubsMgr->updatePointer(Name, Addr));
}
- std::shared_ptr<RuntimeDyld::SymbolResolver>
+ std::unique_ptr<RuntimeDyld::SymbolResolver>
createResolver(LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
- auto Resolver = orc::createLambdaResolver(
- [this, ExternalResolver, ExternalResolverCtx](const std::string &Name) {
- // Search order:
- // 1. JIT'd symbols.
- // 2. Runtime overrides.
- // 3. External resolver (if present).
-
- if (auto Sym = CODLayer.findSymbol(Name, true))
- return RuntimeDyld::SymbolInfo(Sym.getAddress(),
- Sym.getFlags());
- if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name))
- return Sym;
-
- if (ExternalResolver)
- return RuntimeDyld::SymbolInfo(ExternalResolver(Name.c_str(),
- ExternalResolverCtx),
- llvm::JITSymbolFlags::Exported);
-
- return RuntimeDyld::SymbolInfo(nullptr);
- },
- [](const std::string &Name) {
- return RuntimeDyld::SymbolInfo(nullptr);
- }
- );
-
- return std::shared_ptr<RuntimeDyld::SymbolResolver>(std::move(Resolver));
+ return orc::createLambdaResolver(
+ [this, ExternalResolver, ExternalResolverCtx](const std::string &Name) {
+ // Search order:
+ // 1. JIT'd symbols.
+ // 2. Runtime overrides.
+ // 3. External resolver (if present).
+
+ if (auto Sym = CODLayer.findSymbol(Name, true))
+ return Sym.toRuntimeDyldSymbol();
+ if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name))
+ return Sym;
+
+ if (ExternalResolver)
+ return RuntimeDyld::SymbolInfo(
+ ExternalResolver(Name.c_str(), ExternalResolverCtx),
+ llvm::JITSymbolFlags::Exported);
+
+ return RuntimeDyld::SymbolInfo(nullptr);
+ },
+ [](const std::string &Name) {
+ return RuntimeDyld::SymbolInfo(nullptr);
+ });
}
template <typename LayerT>
- ModuleHandleT addIRModule(LayerT &Layer,
- Module *M,
+ ModuleHandleT addIRModule(LayerT &Layer, Module *M,
std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
@@ -193,7 +183,7 @@ public:
auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx);
// Add the module to the JIT.
- std::vector<Module*> S;
+ std::vector<Module *> S;
S.push_back(std::move(M));
auto LH = Layer.addModuleSet(std::move(S), std::move(MemMgr),
@@ -210,7 +200,7 @@ public:
return H;
}
- ModuleHandleT addIRModuleEager(Module* M,
+ ModuleHandleT addIRModuleEager(Module *M,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
return addIRModule(CompileLayer, std::move(M),
@@ -218,11 +208,11 @@ public:
std::move(ExternalResolver), ExternalResolverCtx);
}
- ModuleHandleT addIRModuleLazy(Module* M,
+ ModuleHandleT addIRModuleLazy(Module *M,
LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
return addIRModule(CODLayer, std::move(M),
- llvm::make_unique<SectionMemoryManager>(),
+ llvm::make_unique<SectionMemoryManager>(),
std::move(ExternalResolver), ExternalResolverCtx);
}
@@ -243,8 +233,9 @@ public:
return GenericHandles[H]->findSymbolIn(Name, ExportedSymbolsOnly);
}
-private:
+ const std::string &getErrorMessage() const { return ErrMsg; }
+private:
template <typename LayerT>
unsigned createHandle(LayerT &Layer,
typename LayerT::ModuleSetHandleT Handle) {
@@ -261,21 +252,34 @@ private:
return NewHandle;
}
+ LLVMOrcErrorCode mapError(Error Err) {
+ LLVMOrcErrorCode Result = LLVMOrcErrSuccess;
+ handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) {
+ // Handler of last resort.
+ Result = LLVMOrcErrGeneric;
+ ErrMsg = "";
+ raw_string_ostream ErrStream(ErrMsg);
+ EIB.log(ErrStream);
+ });
+ return Result;
+ }
+
DataLayout DL;
SectionMemoryManager CCMgrMemMgr;
+ std::unique_ptr<orc::IndirectStubsManager> IndirectStubsMgr;
+
std::unique_ptr<CompileCallbackMgr> CCMgr;
ObjLayerT ObjectLayer;
CompileLayerT CompileLayer;
CODLayerT CODLayer;
- std::unique_ptr<orc::IndirectStubsManager> IndirectStubsMgr;
-
std::vector<std::unique_ptr<GenericHandle>> GenericHandles;
std::vector<unsigned> FreeHandleIndexes;
orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
+ std::string ErrMsg;
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/Orc/OrcError.cpp b/lib/ExecutionEngine/Orc/OrcError.cpp
index e95115ec6fed..22f1303f4330 100644
--- a/lib/ExecutionEngine/Orc/OrcError.cpp
+++ b/lib/ExecutionEngine/Orc/OrcError.cpp
@@ -20,6 +20,9 @@ using namespace llvm::orc;
namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class OrcErrorCategory : public std::error_category {
public:
const char *name() const LLVM_NOEXCEPT override { return "orc"; }
@@ -38,6 +41,8 @@ public:
return "Remote indirect stubs owner Id already in use";
case OrcErrorCode::UnexpectedRPCCall:
return "Unexpected RPC call";
+ case OrcErrorCode::UnexpectedRPCResponse:
+ return "Unexpected RPC response";
}
llvm_unreachable("Unhandled error code");
}
@@ -49,9 +54,10 @@ static ManagedStatic<OrcErrorCategory> OrcErrCat;
namespace llvm {
namespace orc {
-std::error_code orcError(OrcErrorCode ErrCode) {
+Error orcError(OrcErrorCode ErrCode) {
typedef std::underlying_type<OrcErrorCode>::type UT;
- return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat);
+ return errorCodeToError(
+ std::error_code(static_cast<UT>(ErrCode), *OrcErrCat));
}
}
}
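Since orcError now hands back an llvm::Error rather than a std::error_code, callers consume it with the Error handlers, mirroring the handleAllErrors pattern used elsewhere in this patch. A minimal sketch, assuming the Orc and Support headers at this revision (reportUnexpectedCall is illustrative only):

#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::orc;

static void reportUnexpectedCall() {
  handleAllErrors(orcError(OrcErrorCode::UnexpectedRPCCall),
                  [](ErrorInfoBase &EIB) {
                    errs() << "orc error: " << EIB.message() << "\n";
                  });
}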
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 2ab70a9fee86..d1083072c98b 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -121,7 +121,7 @@ class OrcMCJITReplacement : public ExecutionEngine {
RuntimeDyld::SymbolInfo
findSymbolInLogicalDylib(const std::string &Name) override {
- return M.ClientResolver->findSymbolInLogicalDylib(Name);
+ return M.ClientResolver->findSymbol(Name);
}
private:
@@ -178,11 +178,10 @@ public:
}
void addObjectFile(object::OwningBinary<object::ObjectFile> O) override {
- std::unique_ptr<object::ObjectFile> Obj;
- std::unique_ptr<MemoryBuffer> Buf;
- std::tie(Obj, Buf) = O.takeBinary();
- std::vector<std::unique_ptr<object::ObjectFile>> Objs;
- Objs.push_back(std::move(Obj));
+ std::vector<std::unique_ptr<object::OwningBinary<object::ObjectFile>>> Objs;
+ Objs.push_back(
+ llvm::make_unique<object::OwningBinary<object::ObjectFile>>(
+ std::move(O)));
ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver);
}
@@ -246,11 +245,11 @@ private:
RuntimeDyld::SymbolInfo findMangledSymbol(StringRef Name) {
if (auto Sym = LazyEmitLayer.findSymbol(Name, false))
- return RuntimeDyld::SymbolInfo(Sym.getAddress(), Sym.getFlags());
+ return Sym.toRuntimeDyldSymbol();
if (auto Sym = ClientResolver->findSymbol(Name))
- return RuntimeDyld::SymbolInfo(Sym.getAddress(), Sym.getFlags());
+ return Sym;
if (auto Sym = scanArchives(Name))
- return RuntimeDyld::SymbolInfo(Sym.getAddress(), Sym.getFlags());
+ return Sym.toRuntimeDyldSymbol();
return nullptr;
}
@@ -259,15 +258,19 @@ private:
for (object::OwningBinary<object::Archive> &OB : Archives) {
object::Archive *A = OB.getBinary();
// Look for our symbols in each Archive
- object::Archive::child_iterator ChildIt = A->findSym(Name);
- if (std::error_code EC = ChildIt->getError())
- report_fatal_error(EC.message());
- if (ChildIt != A->child_end()) {
+ auto OptionalChildOrErr = A->findSym(Name);
+ if (!OptionalChildOrErr)
+ report_fatal_error(OptionalChildOrErr.takeError());
+ auto &OptionalChild = *OptionalChildOrErr;
+ if (OptionalChild) {
// FIXME: Support nested archives?
- ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr =
- (*ChildIt)->getAsBinary();
- if (ChildBinOrErr.getError())
+ Expected<std::unique_ptr<object::Binary>> ChildBinOrErr =
+ OptionalChild->getAsBinary();
+ if (!ChildBinOrErr) {
+ // TODO: Actually report errors helpfully.
+ consumeError(ChildBinOrErr.takeError());
continue;
+ }
std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
if (ChildBin->isObject()) {
std::vector<std::unique_ptr<object::ObjectFile>> ObjSet;
@@ -284,12 +287,12 @@ private:
class NotifyObjectLoadedT {
public:
- typedef std::vector<std::unique_ptr<object::ObjectFile>> ObjListT;
typedef std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>
LoadedObjInfoListT;
NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
+ template <typename ObjListT>
void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
const ObjListT &Objects,
const LoadedObjInfoListT &Infos) const {
@@ -298,10 +301,21 @@ private:
assert(Objects.size() == Infos.size() &&
"Incorrect number of Infos for Objects.");
for (unsigned I = 0; I < Objects.size(); ++I)
- M.MemMgr.notifyObjectLoaded(&M, *Objects[I]);
+ M.MemMgr.notifyObjectLoaded(&M, getObject(*Objects[I]));
}
private:
+
+ static const object::ObjectFile& getObject(const object::ObjectFile &Obj) {
+ return Obj;
+ }
+
+ template <typename ObjT>
+ static const object::ObjectFile&
+ getObject(const object::OwningBinary<ObjT> &Obj) {
+ return *Obj.getBinary();
+ }
+
OrcMCJITReplacement &M;
};
diff --git a/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp b/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp
index 064633b4e490..d1a021aee3ab 100644
--- a/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp
+++ b/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp
@@ -13,71 +13,41 @@ namespace llvm {
namespace orc {
namespace remote {
-const char *OrcRemoteTargetRPCAPI::getJITProcIdName(JITProcId Id) {
+#define FUNCNAME(X) \
+ case X ## Id: \
+ return #X
+
+const char *OrcRemoteTargetRPCAPI::getJITFuncIdName(JITFuncId Id) {
switch (Id) {
case InvalidId:
- return "*** Invalid JITProcId ***";
- case CallIntVoidId:
- return "CallIntVoid";
- case CallIntVoidResponseId:
- return "CallIntVoidResponse";
- case CallMainId:
- return "CallMain";
- case CallMainResponseId:
- return "CallMainResponse";
- case CallVoidVoidId:
- return "CallVoidVoid";
- case CallVoidVoidResponseId:
- return "CallVoidVoidResponse";
- case CreateRemoteAllocatorId:
- return "CreateRemoteAllocator";
- case CreateIndirectStubsOwnerId:
- return "CreateIndirectStubsOwner";
- case DestroyRemoteAllocatorId:
- return "DestroyRemoteAllocator";
- case DestroyIndirectStubsOwnerId:
- return "DestroyIndirectStubsOwner";
- case EmitIndirectStubsId:
- return "EmitIndirectStubs";
- case EmitIndirectStubsResponseId:
- return "EmitIndirectStubsResponse";
- case EmitResolverBlockId:
- return "EmitResolverBlock";
- case EmitTrampolineBlockId:
- return "EmitTrampolineBlock";
- case EmitTrampolineBlockResponseId:
- return "EmitTrampolineBlockResponse";
- case GetSymbolAddressId:
- return "GetSymbolAddress";
- case GetSymbolAddressResponseId:
- return "GetSymbolAddressResponse";
- case GetRemoteInfoId:
- return "GetRemoteInfo";
- case GetRemoteInfoResponseId:
- return "GetRemoteInfoResponse";
- case ReadMemId:
- return "ReadMem";
- case ReadMemResponseId:
- return "ReadMemResponse";
- case ReserveMemId:
- return "ReserveMem";
- case ReserveMemResponseId:
- return "ReserveMemResponse";
- case RequestCompileId:
- return "RequestCompile";
- case RequestCompileResponseId:
- return "RequestCompileResponse";
- case SetProtectionsId:
- return "SetProtections";
- case TerminateSessionId:
- return "TerminateSession";
- case WriteMemId:
- return "WriteMem";
- case WritePtrId:
- return "WritePtr";
+ return "*** Invalid JITFuncId ***";
+ FUNCNAME(CallIntVoid);
+ FUNCNAME(CallMain);
+ FUNCNAME(CallVoidVoid);
+ FUNCNAME(CreateRemoteAllocator);
+ FUNCNAME(CreateIndirectStubsOwner);
+ FUNCNAME(DeregisterEHFrames);
+ FUNCNAME(DestroyRemoteAllocator);
+ FUNCNAME(DestroyIndirectStubsOwner);
+ FUNCNAME(EmitIndirectStubs);
+ FUNCNAME(EmitResolverBlock);
+ FUNCNAME(EmitTrampolineBlock);
+ FUNCNAME(GetSymbolAddress);
+ FUNCNAME(GetRemoteInfo);
+ FUNCNAME(ReadMem);
+ FUNCNAME(RegisterEHFrames);
+ FUNCNAME(ReserveMem);
+ FUNCNAME(RequestCompile);
+ FUNCNAME(SetProtections);
+ FUNCNAME(TerminateSession);
+ FUNCNAME(WriteMem);
+ FUNCNAME(WritePtr);
};
return nullptr;
}
-}
-}
-}
+
+#undef FUNCNAME
+
+} // end namespace remote
+} // end namespace orc
+} // end namespace llvm
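For clarity, each FUNCNAME(X) use above pastes the Id suffix onto the enumerator and stringizes the name, so FUNCNAME(CallIntVoid) expands to:

    case CallIntVoidId:
      return "CallIntVoid";

which keeps the switch in sync with the JITFuncId list with far less repetition than the hand-written cases it replaces.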
diff --git a/lib/ExecutionEngine/RuntimeDyld/Makefile b/lib/ExecutionEngine/RuntimeDyld/Makefile
deleted file mode 100644
index 5d6f26d950fe..000000000000
--- a/lib/ExecutionEngine/RuntimeDyld/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-##===- lib/ExecutionEngine/MCJIT/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMRuntimeDyld
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index ecd99004bade..e39acc7ee144 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -94,9 +94,8 @@ static const char *processFDE(const char *Entry, bool isDeregister) {
// This implementation handles frame registration for local targets.
// Memory managers for remote targets should re-implement this function
// and use the LoadAddr parameter.
-void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
- uint64_t LoadAddr,
- size_t Size) {
+void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr,
+ size_t Size) {
  // On OS X __register_frame takes a single FDE as an argument.
// See http://lists.llvm.org/pipermail/llvm-dev/2013-April/061768.html
const char *P = (const char *)Addr;
@@ -106,9 +105,8 @@ void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
} while(P != End);
}
-void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
- uint64_t LoadAddr,
- size_t Size) {
+void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
+ size_t Size) {
const char *P = (const char *)Addr;
const char *End = P + Size;
do {
@@ -118,9 +116,8 @@ void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
#else
-void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
- uint64_t LoadAddr,
- size_t Size) {
+void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr,
+ size_t Size) {
// On Linux __register_frame takes a single argument:
// a pointer to the start of the .eh_frame section.
@@ -129,9 +126,8 @@ void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
__register_frame(Addr);
}
-void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
- uint64_t LoadAddr,
- size_t Size) {
+void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
+ size_t Size) {
__deregister_frame(Addr);
}
@@ -266,18 +262,15 @@ RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) {
// is called before ExecutionEngine::runFunctionAsMain() is called.
if (Name == "__main") return (uint64_t)&jit_noop;
- // Try to demangle Name before looking it up in the process, otherwise symbol
- // '_<Name>' (if present) will shadow '<Name>', and there will be no way to
- // refer to the latter.
-
const char *NameStr = Name.c_str();
+ // DynamicLibrary::SearchForAddressOfSymbol expects an unmangled 'C' symbol
+ // name, so if we're on Darwin, strip the leading '_' off.
+#ifdef __APPLE__
if (NameStr[0] == '_')
- if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1))
- return (uint64_t)Ptr;
+ ++NameStr;
+#endif
- // If we Name did not require demangling, or we failed to find the demangled
- // name, try again without demangling.
return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
}
@@ -288,6 +281,7 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
if (!Addr && AbortOnFailure)
report_fatal_error("Program used external function '" + Name +
"' which could not be resolved!");
+
return (void*)Addr;
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index d16b2db24e1a..1dfbe31f2717 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -19,6 +19,7 @@
#include "RuntimeDyldMachO.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MutexGuard.h"
@@ -27,6 +28,41 @@ using namespace llvm::object;
#define DEBUG_TYPE "dyld"
+namespace {
+
+enum RuntimeDyldErrorCode {
+ GenericRTDyldError = 1
+};
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class RuntimeDyldErrorCategory : public std::error_category {
+public:
+ const char *name() const LLVM_NOEXCEPT override { return "runtimedyld"; }
+
+ std::string message(int Condition) const override {
+ switch (static_cast<RuntimeDyldErrorCode>(Condition)) {
+ case GenericRTDyldError: return "Generic RuntimeDyld error";
+ }
+ llvm_unreachable("Unrecognized RuntimeDyldErrorCode");
+ }
+};
+
+static ManagedStatic<RuntimeDyldErrorCategory> RTDyldErrorCategory;
+
+}
+
+char RuntimeDyldError::ID = 0;
+
+void RuntimeDyldError::log(raw_ostream &OS) const {
+ OS << ErrMsg << "\n";
+}
+
+std::error_code RuntimeDyldError::convertToErrorCode() const {
+ return std::error_code(GenericRTDyldError, *RTDyldErrorCategory);
+}
+
// Empty out-of-line virtual destructor as the key function.
RuntimeDyldImpl::~RuntimeDyldImpl() {}
@@ -125,16 +161,16 @@ void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
llvm_unreachable("Attempting to remap address of unknown section!");
}
-static std::error_code getOffset(const SymbolRef &Sym, SectionRef Sec,
- uint64_t &Result) {
- ErrorOr<uint64_t> AddressOrErr = Sym.getAddress();
- if (std::error_code EC = AddressOrErr.getError())
- return EC;
+static Error getOffset(const SymbolRef &Sym, SectionRef Sec,
+ uint64_t &Result) {
+ Expected<uint64_t> AddressOrErr = Sym.getAddress();
+ if (!AddressOrErr)
+ return AddressOrErr.takeError();
Result = *AddressOrErr - Sec.getAddress();
- return std::error_code();
+ return Error::success();
}
-RuntimeDyldImpl::ObjSectionToIDMap
+Expected<RuntimeDyldImpl::ObjSectionToIDMap>
RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
MutexGuard locked(lock);
@@ -148,8 +184,11 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
if (MemMgr.needsToReserveAllocationSpace()) {
uint64_t CodeSize = 0, RODataSize = 0, RWDataSize = 0;
uint32_t CodeAlign = 1, RODataAlign = 1, RWDataAlign = 1;
- computeTotalAllocSize(Obj, CodeSize, CodeAlign, RODataSize, RODataAlign,
- RWDataSize, RWDataAlign);
+ if (auto Err = computeTotalAllocSize(Obj,
+ CodeSize, CodeAlign,
+ RODataSize, RODataAlign,
+ RWDataSize, RWDataAlign))
+ return std::move(Err);
MemMgr.reserveAllocationSpace(CodeSize, CodeAlign, RODataSize, RODataAlign,
RWDataSize, RWDataAlign);
}
@@ -169,13 +208,21 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
if (Flags & SymbolRef::SF_Common)
CommonSymbols.push_back(*I);
else {
- object::SymbolRef::Type SymType = I->getType();
+
+ // Get the symbol type.
+ object::SymbolRef::Type SymType;
+ if (auto SymTypeOrErr = I->getType())
+ SymType = *SymTypeOrErr;
+ else
+ return SymTypeOrErr.takeError();
// Get symbol name.
- ErrorOr<StringRef> NameOrErr = I->getName();
- Check(NameOrErr.getError());
- StringRef Name = *NameOrErr;
-
+ StringRef Name;
+ if (auto NameOrErr = I->getName())
+ Name = *NameOrErr;
+ else
+ return NameOrErr.takeError();
+
// Compute JIT symbol flags.
JITSymbolFlags RTDyldSymFlags = JITSymbolFlags::None;
if (Flags & SymbolRef::SF_Weak)
@@ -185,32 +232,46 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
if (Flags & SymbolRef::SF_Absolute &&
SymType != object::SymbolRef::ST_File) {
- auto Addr = I->getAddress();
- Check(Addr.getError());
- uint64_t SectOffset = *Addr;
+ uint64_t Addr = 0;
+ if (auto AddrOrErr = I->getAddress())
+ Addr = *AddrOrErr;
+ else
+ return AddrOrErr.takeError();
+
unsigned SectionID = AbsoluteSymbolSection;
DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name
<< " SID: " << SectionID << " Offset: "
- << format("%p", (uintptr_t)SectOffset)
+ << format("%p", (uintptr_t)Addr)
<< " flags: " << Flags << "\n");
GlobalSymbolTable[Name] =
- SymbolTableEntry(SectionID, SectOffset, RTDyldSymFlags);
+ SymbolTableEntry(SectionID, Addr, RTDyldSymFlags);
} else if (SymType == object::SymbolRef::ST_Function ||
SymType == object::SymbolRef::ST_Data ||
SymType == object::SymbolRef::ST_Unknown ||
SymType == object::SymbolRef::ST_Other) {
- ErrorOr<section_iterator> SIOrErr = I->getSection();
- Check(SIOrErr.getError());
- section_iterator SI = *SIOrErr;
+ section_iterator SI = Obj.section_end();
+ if (auto SIOrErr = I->getSection())
+ SI = *SIOrErr;
+ else
+ return SIOrErr.takeError();
+
if (SI == Obj.section_end())
continue;
+
// Get symbol offset.
uint64_t SectOffset;
- Check(getOffset(*I, *SI, SectOffset));
+ if (auto Err = getOffset(*I, *SI, SectOffset))
+ return std::move(Err);
+
bool IsCode = SI->isText();
- unsigned SectionID = findOrEmitSection(Obj, *SI, IsCode, LocalSections);
+ unsigned SectionID;
+ if (auto SectionIDOrErr = findOrEmitSection(Obj, *SI, IsCode,
+ LocalSections))
+ SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name
<< " SID: " << SectionID << " Offset: "
@@ -223,13 +284,13 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
}
// Allocate common symbols
- emitCommonSymbols(Obj, CommonSymbols);
+ if (auto Err = emitCommonSymbols(Obj, CommonSymbols))
+ return std::move(Err);
// Parse and process relocations
DEBUG(dbgs() << "Parse relocations:\n");
for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
SI != SE; ++SI) {
- unsigned SectionID = 0;
StubMap Stubs;
section_iterator RelocatedSection = SI->getRelocatedSection();
@@ -243,12 +304,20 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
continue;
bool IsCode = RelocatedSection->isText();
- SectionID =
- findOrEmitSection(Obj, *RelocatedSection, IsCode, LocalSections);
+ unsigned SectionID = 0;
+ if (auto SectionIDOrErr = findOrEmitSection(Obj, *RelocatedSection, IsCode,
+ LocalSections))
+ SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
+
DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
for (; I != E;)
- I = processRelocationRef(SectionID, I, Obj, LocalSections, Stubs);
+ if (auto IOrErr = processRelocationRef(SectionID, I, Obj, LocalSections, Stubs))
+ I = *IOrErr;
+ else
+ return IOrErr.takeError();
// If there is an attached checker, notify it about the stubs for this
// section so that they can be verified.
@@ -257,7 +326,8 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
}
// Give the subclasses a chance to tie-up any loose ends.
- finalizeLoad(Obj, LocalSections);
+ if (auto Err = finalizeLoad(Obj, LocalSections))
+ return std::move(Err);
// for (auto E : LocalSections)
// llvm::dbgs() << "Added: " << E.first.getRawDataRefImpl() << " -> " << E.second << "\n";
@@ -288,16 +358,17 @@ static bool isRequiredForExecution(const SectionRef Section) {
const coff_section *CoffSection = COFFObj->getCOFFSection(Section);
// Avoid loading zero-sized COFF sections.
// In PE files, VirtualSize gives the section size, and SizeOfRawData
- // may be zero for sections with content. In Obj files, SizeOfRawData
+ // may be zero for sections with content. In Obj files, SizeOfRawData
// gives the section size, and VirtualSize is always zero. Hence
// the need to check for both cases below.
- bool HasContent = (CoffSection->VirtualSize > 0)
- || (CoffSection->SizeOfRawData > 0);
- bool IsDiscardable = CoffSection->Characteristics &
- (COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_LNK_INFO);
+ bool HasContent =
+ (CoffSection->VirtualSize > 0) || (CoffSection->SizeOfRawData > 0);
+ bool IsDiscardable =
+ CoffSection->Characteristics &
+ (COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_LNK_INFO);
return HasContent && !IsDiscardable;
}
-
+
assert(isa<MachOObjectFile>(Obj));
return true;
}
@@ -336,13 +407,13 @@ static bool isZeroInit(const SectionRef Section) {
// Compute an upper bound of the memory size that is required to load all
// sections
-void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
- uint64_t &CodeSize,
- uint32_t &CodeAlign,
- uint64_t &RODataSize,
- uint32_t &RODataAlign,
- uint64_t &RWDataSize,
- uint32_t &RWDataAlign) {
+Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
+ uint64_t &CodeSize,
+ uint32_t &CodeAlign,
+ uint64_t &RODataSize,
+ uint32_t &RODataAlign,
+ uint64_t &RWDataSize,
+ uint32_t &RWDataAlign) {
// Compute the size of all sections required for execution
std::vector<uint64_t> CodeSectionSizes;
std::vector<uint64_t> ROSectionSizes;
@@ -358,13 +429,15 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
// Consider only the sections that are required to be loaded for execution
if (IsRequired) {
- StringRef Name;
uint64_t DataSize = Section.getSize();
uint64_t Alignment64 = Section.getAlignment();
+ unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
bool IsCode = Section.isText();
bool IsReadOnly = isReadOnlyData(Section);
- Check(Section.getName(Name));
- unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
+
+ StringRef Name;
+ if (auto EC = Section.getName(Name))
+ return errorCodeToError(EC);
uint64_t StubBufSize = computeSectionStubBufSize(Obj, Section);
uint64_t SectionSize = DataSize + StubBufSize;
@@ -395,17 +468,24 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
// Compute the size of all common symbols
uint64_t CommonSize = 0;
+ uint32_t CommonAlign = 1;
for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
++I) {
uint32_t Flags = I->getFlags();
if (Flags & SymbolRef::SF_Common) {
// Add the common symbols to a list. We'll allocate them all below.
uint64_t Size = I->getCommonSize();
- CommonSize += Size;
+ uint32_t Align = I->getAlignment();
+ // If this is the first common symbol, use its alignment as the alignment
+ // for the common symbols section.
+ if (CommonSize == 0)
+ CommonAlign = Align;
+ CommonSize = alignTo(CommonSize, Align) + Size;
}
}
if (CommonSize != 0) {
RWSectionSizes.push_back(CommonSize);
+ RWDataAlign = std::max(RWDataAlign, CommonAlign);
}
// Compute the required allocation space for each different type of sections
@@ -416,6 +496,8 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
CodeSize = computeAllocationSizeForSections(CodeSectionSizes, CodeAlign);
RODataSize = computeAllocationSizeForSections(ROSectionSizes, RODataAlign);
RWDataSize = computeAllocationSizeForSections(RWSectionSizes, RWDataAlign);
+
+ return Error::success();
}
// compute stub buffer size for the given section
@@ -483,20 +565,23 @@ void RuntimeDyldImpl::writeBytesUnaligned(uint64_t Value, uint8_t *Dst,
}
}
-void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
- CommonSymbolList &CommonSymbols) {
+Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
+ CommonSymbolList &CommonSymbols) {
if (CommonSymbols.empty())
- return;
+ return Error::success();
uint64_t CommonSize = 0;
+ uint32_t CommonAlign = CommonSymbols.begin()->getAlignment();
CommonSymbolList SymbolsToAllocate;
DEBUG(dbgs() << "Processing common symbols...\n");
for (const auto &Sym : CommonSymbols) {
- ErrorOr<StringRef> NameOrErr = Sym.getName();
- Check(NameOrErr.getError());
- StringRef Name = *NameOrErr;
+ StringRef Name;
+ if (auto NameOrErr = Sym.getName())
+ Name = *NameOrErr;
+ else
+ return NameOrErr.takeError();
// Skip common symbols already elsewhere.
if (GlobalSymbolTable.count(Name) ||
@@ -509,14 +594,15 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
uint32_t Align = Sym.getAlignment();
uint64_t Size = Sym.getCommonSize();
- CommonSize += Align + Size;
+ CommonSize = alignTo(CommonSize, Align) + Size;
+
SymbolsToAllocate.push_back(Sym);
}
// Allocate memory for the section
unsigned SectionID = Sections.size();
- uint8_t *Addr = MemMgr.allocateDataSection(CommonSize, sizeof(void *),
- SectionID, StringRef(), false);
+ uint8_t *Addr = MemMgr.allocateDataSection(CommonSize, CommonAlign, SectionID,
+ "<common symbols>", false);
if (!Addr)
report_fatal_error("Unable to allocate memory for common symbols!");
uint64_t Offset = 0;
@@ -531,9 +617,11 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
for (auto &Sym : SymbolsToAllocate) {
uint32_t Align = Sym.getAlignment();
uint64_t Size = Sym.getCommonSize();
- ErrorOr<StringRef> NameOrErr = Sym.getName();
- Check(NameOrErr.getError());
- StringRef Name = *NameOrErr;
+ StringRef Name;
+ if (auto NameOrErr = Sym.getName())
+ Name = *NameOrErr;
+ else
+ return NameOrErr.takeError();
if (Align) {
// This symbol has an alignment requirement.
uint64_t AlignOffset = OffsetToAlignment((uint64_t)Addr, Align);
@@ -556,24 +644,29 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj,
if (Checker)
Checker->registerSection(Obj.getFileName(), SectionID);
-}
-unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
- const SectionRef &Section, bool IsCode) {
+ return Error::success();
+}
+Expected<unsigned>
+RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
+ const SectionRef &Section,
+ bool IsCode) {
StringRef data;
uint64_t Alignment64 = Section.getAlignment();
unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
unsigned PaddingSize = 0;
unsigned StubBufSize = 0;
- StringRef Name;
bool IsRequired = isRequiredForExecution(Section);
bool IsVirtual = Section.isVirtual();
bool IsZeroInit = isZeroInit(Section);
bool IsReadOnly = isReadOnlyData(Section);
uint64_t DataSize = Section.getSize();
- Check(Section.getName(Name));
+
+ StringRef Name;
+ if (auto EC = Section.getName(Name))
+ return errorCodeToError(EC);
StubBufSize = computeSectionStubBufSize(Obj, Section);
@@ -593,7 +686,8 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
if (!IsVirtual && !IsZeroInit) {
// In either case, set the location of the unrelocated section in memory,
// since we still process relocations for it even if we're not applying them.
- Check(Section.getContents(data));
+ if (auto EC = Section.getContents(data))
+ return errorCodeToError(EC);
pData = data.data();
}
@@ -655,17 +749,21 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
return SectionID;
}
-unsigned RuntimeDyldImpl::findOrEmitSection(const ObjectFile &Obj,
- const SectionRef &Section,
- bool IsCode,
- ObjSectionToIDMap &LocalSections) {
+Expected<unsigned>
+RuntimeDyldImpl::findOrEmitSection(const ObjectFile &Obj,
+ const SectionRef &Section,
+ bool IsCode,
+ ObjSectionToIDMap &LocalSections) {
unsigned SectionID = 0;
ObjSectionToIDMap::iterator i = LocalSections.find(Section);
if (i != LocalSections.end())
SectionID = i->second;
else {
- SectionID = emitSection(Obj, Section, IsCode);
+ if (auto SectionIDOrErr = emitSection(Obj, Section, IsCode))
+ SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
LocalSections[Section] = SectionID;
}
return SectionID;
@@ -718,7 +816,10 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr,
// 8: 03200008 jr t9.
// c: 00000000 nop.
const unsigned LuiT9Instr = 0x3c190000, AdduiT9Instr = 0x27390000;
- const unsigned JrT9Instr = 0x03200008, NopInstr = 0x0;
+ const unsigned NopInstr = 0x0;
+ unsigned JrT9Instr = 0x03200008;
+ if ((AbiVariant & ELF::EF_MIPS_ARCH) == ELF::EF_MIPS_ARCH_32R6)
+ JrT9Instr = 0x03200009;
writeBytesUnaligned(LuiT9Instr, Addr, 4);
writeBytesUnaligned(AdduiT9Instr, Addr+4, 4);
@@ -818,7 +919,11 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
if (Loc == GlobalSymbolTable.end()) {
// This is an external symbol, try to get its address from the symbol
// resolver.
- Addr = Resolver.findSymbol(Name.data()).getAddress();
+ // First search for the symbol in this logical dylib.
+ Addr = Resolver.findSymbolInLogicalDylib(Name.data()).getAddress();
+ // If that fails, try searching for an external symbol.
+ if (!Addr)
+ Addr = Resolver.findSymbol(Name.data()).getAddress();
// The call to getSymbolAddress may have caused additional modules to
// be loaded, which may have added new entries to the
// ExternalSymbolRelocations map. Consequently, we need to update our
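A small worked example of the alignTo-based accumulation now used for common symbols in computeTotalAllocSize() and emitCommonSymbols() above; the sizes and alignments below are made up for illustration:

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>
    #include <utility>
    #include <vector>

    uint64_t commonSizeSketch() {
      // Hypothetical (Size, Align) pairs, not taken from any real object file.
      std::vector<std::pair<uint64_t, uint32_t>> Syms = {{6, 4}, {1, 1}, {8, 8}};
      uint64_t CommonSize = 0;
      for (const auto &S : Syms)
        CommonSize = llvm::alignTo(CommonSize, S.second) + S.first; // 6, then 7, then 16
      return CommonSize; // 16 bytes; the old "CommonSize += Align + Size" would reserve 28
    }

Aligning the running offset before adding each symbol guarantees every symbol lands on its required boundary, and avoids the blanket Align + Size over-reservation of the previous formula.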
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
index e5fab929ea29..24bd9a002c20 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
@@ -13,6 +13,7 @@
#include "RuntimeDyldCOFF.h"
#include "Targets/RuntimeDyldCOFFI386.h"
+#include "Targets/RuntimeDyldCOFFThumb.h"
#include "Targets/RuntimeDyldCOFFX86_64.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
@@ -45,11 +46,11 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch,
RuntimeDyld::MemoryManager &MemMgr,
RuntimeDyld::SymbolResolver &Resolver) {
switch (Arch) {
- default:
- llvm_unreachable("Unsupported target for RuntimeDyldCOFF.");
- break;
+ default: llvm_unreachable("Unsupported target for RuntimeDyldCOFF.");
case Triple::x86:
return make_unique<RuntimeDyldCOFFI386>(MemMgr, Resolver);
+ case Triple::thumb:
+ return make_unique<RuntimeDyldCOFFThumb>(MemMgr, Resolver);
case Triple::x86_64:
return make_unique<RuntimeDyldCOFFX86_64>(MemMgr, Resolver);
}
@@ -57,7 +58,14 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch,
std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) {
- return llvm::make_unique<LoadedCOFFObjectInfo>(*this, loadObjectImpl(O));
+ if (auto ObjSectionToIDOrErr = loadObjectImpl(O)) {
+ return llvm::make_unique<LoadedCOFFObjectInfo>(*this, *ObjSectionToIDOrErr);
+ } else {
+ HasError = true;
+ raw_string_ostream ErrStream(ErrorStr);
+ logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+ return nullptr;
+ }
}
uint64_t RuntimeDyldCOFF::getSymbolOffset(const SymbolRef &Sym) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
index 32b8fa269be0..03a91f6cf690 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
@@ -15,7 +15,6 @@
#define LLVM_RUNTIME_DYLD_COFF_H
#include "RuntimeDyldImpl.h"
-#include "llvm/ADT/DenseMap.h"
#define DEBUG_TYPE "dyld"
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 58ce88a68f23..090b9a3857e3 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -7,16 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
#include "RuntimeDyldCheckerImpl.h"
#include "RuntimeDyldImpl.h"
-#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Path.h"
#include <cctype>
#include <memory>
+#include <utility>
#define DEBUG_TYPE "rtdyld"
@@ -97,7 +98,8 @@ private:
public:
EvalResult() : Value(0), ErrorMsg("") {}
EvalResult(uint64_t Value) : Value(Value), ErrorMsg("") {}
- EvalResult(std::string ErrorMsg) : Value(0), ErrorMsg(ErrorMsg) {}
+ EvalResult(std::string ErrorMsg)
+ : Value(0), ErrorMsg(std::move(ErrorMsg)) {}
uint64_t getValue() const { return Value; }
bool hasError() const { return ErrorMsg != ""; }
const std::string &getErrorMsg() const { return ErrorMsg; }
@@ -582,7 +584,7 @@ private:
// Returns a pair containing the result of the slice operation, plus the
// expression remaining to be parsed.
std::pair<EvalResult, StringRef>
- evalSliceExpr(std::pair<EvalResult, StringRef> Ctx) const {
+ evalSliceExpr(const std::pair<EvalResult, StringRef> &Ctx) const {
EvalResult SubExprResult;
StringRef RemainingExpr;
std::tie(SubExprResult, RemainingExpr) = Ctx;
@@ -626,7 +628,7 @@ private:
// Returns a pair containing the ultimate result of evaluating the
// expression, plus the expression remaining to be evaluated.
std::pair<EvalResult, StringRef>
- evalComplexExpr(std::pair<EvalResult, StringRef> LHSAndRemaining,
+ evalComplexExpr(const std::pair<EvalResult, StringRef> &LHSAndRemaining,
ParseContext PCtx) const {
EvalResult LHSResult;
StringRef RemainingExpr;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
index 69d2a7d6b668..b7263be09934 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
@@ -11,7 +11,6 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDCHECKERIMPL_H
#include "RuntimeDyldImpl.h"
-#include <set>
namespace llvm {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index e09b71af18a5..9cbdb13a3572 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -30,13 +30,6 @@ using namespace llvm::object;
#define DEBUG_TYPE "dyld"
-static inline std::error_code check(std::error_code Err) {
- if (Err) {
- report_fatal_error(Err.message());
- }
- return Err;
-}
-
namespace {
template <class ELFT> class DyldELFObject : public ELFObjectFile<ELFT> {
@@ -220,7 +213,14 @@ void RuntimeDyldELF::deregisterEHFrames() {
std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
RuntimeDyldELF::loadObject(const object::ObjectFile &O) {
- return llvm::make_unique<LoadedELFObjectInfo>(*this, loadObjectImpl(O));
+ if (auto ObjSectionToIDOrErr = loadObjectImpl(O))
+ return llvm::make_unique<LoadedELFObjectInfo>(*this, *ObjSectionToIDOrErr);
+ else {
+ HasError = true;
+ raw_string_ostream ErrStream(ErrorStr);
+ logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+ return nullptr;
+ }
}
void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
@@ -781,9 +781,9 @@ void RuntimeDyldELF::applyMIPS64Relocation(uint8_t *TargetPtr,
}
// Return the .TOC. section and offset.
-void RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj,
- ObjSectionToIDMap &LocalSections,
- RelocationValueRef &Rel) {
+Error RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel) {
// Set a default SectionID in case we do not find a TOC section below.
// This may happen for references to TOC base (sym@toc, .opd
// relocation) without a .toc directive. In this case just use the
@@ -796,13 +796,18 @@ void RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj,
// order. The TOC starts where the first of these sections starts.
for (auto &Section: Obj.sections()) {
StringRef SectionName;
- check(Section.getName(SectionName));
+ if (auto EC = Section.getName(SectionName))
+ return errorCodeToError(EC);
if (SectionName == ".got"
|| SectionName == ".toc"
|| SectionName == ".tocbss"
|| SectionName == ".plt") {
- Rel.SectionID = findOrEmitSection(Obj, Section, false, LocalSections);
+ if (auto SectionIDOrErr =
+ findOrEmitSection(Obj, Section, false, LocalSections))
+ Rel.SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
break;
}
}
@@ -810,13 +815,15 @@ void RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj,
// Per the ppc64-elf-linux ABI, The TOC base is TOC value plus 0x8000
// thus permitting a full 64 Kbytes segment.
Rel.Addend = 0x8000;
+
+ return Error::success();
}
// Returns the sections and offset associated with the ODP entry referenced
// by Symbol.
-void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
- ObjSectionToIDMap &LocalSections,
- RelocationValueRef &Rel) {
+Error RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel) {
// Get the ELF symbol value (st_value) to compare with Relocation offset in
// .opd entries
for (section_iterator si = Obj.section_begin(), se = Obj.section_end();
@@ -826,7 +833,9 @@ void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
continue;
StringRef RelSectionName;
- check(RelSecI->getName(RelSectionName));
+ if (auto EC = RelSecI->getName(RelSectionName))
+ return errorCodeToError(EC);
+
if (RelSectionName != ".opd")
continue;
@@ -843,9 +852,11 @@ void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
uint64_t TargetSymbolOffset = i->getOffset();
symbol_iterator TargetSymbol = i->getSymbol();
- ErrorOr<int64_t> AddendOrErr = i->getAddend();
- Check(AddendOrErr.getError());
- int64_t Addend = *AddendOrErr;
+ int64_t Addend;
+ if (auto AddendOrErr = i->getAddend())
+ Addend = *AddendOrErr;
+ else
+ return errorCodeToError(AddendOrErr.getError());
++i;
if (i == e)
@@ -862,13 +873,21 @@ void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
if (Rel.Addend != (int64_t)TargetSymbolOffset)
continue;
- ErrorOr<section_iterator> TSIOrErr = TargetSymbol->getSection();
- check(TSIOrErr.getError());
- section_iterator tsi = *TSIOrErr;
- bool IsCode = tsi->isText();
- Rel.SectionID = findOrEmitSection(Obj, (*tsi), IsCode, LocalSections);
+ section_iterator TSI = Obj.section_end();
+ if (auto TSIOrErr = TargetSymbol->getSection())
+ TSI = *TSIOrErr;
+ else
+ return TSIOrErr.takeError();
+ assert(TSI != Obj.section_end() && "TSI should refer to a valid section");
+
+ bool IsCode = TSI->isText();
+ if (auto SectionIDOrErr = findOrEmitSection(Obj, *TSI, IsCode,
+ LocalSections))
+ Rel.SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
Rel.Addend = (intptr_t)Addend;
- return;
+ return Error::success();
}
}
llvm_unreachable("Attempting to get address of ODP entry!");
@@ -1047,6 +1066,11 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
case ELF::R_390_64:
writeInt64BE(LocalAddress, Value + Addend);
break;
+ case ELF::R_390_PC64: {
+ int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
+ writeInt64BE(LocalAddress, Delta);
+ break;
+ }
}
}
@@ -1163,7 +1187,8 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
return ELF::R_MIPS_NONE;
}
-relocation_iterator RuntimeDyldELF::processRelocationRef(
+Expected<relocation_iterator>
+RuntimeDyldELF::processRelocationRef(
unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
ObjSectionToIDMap &ObjSectionToID, StubMap &Stubs) {
const auto &Obj = cast<ELFObjectFileBase>(O);
@@ -1175,10 +1200,10 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// Obtain the symbol name which is referenced in the relocation
StringRef TargetName;
if (Symbol != Obj.symbol_end()) {
- ErrorOr<StringRef> TargetNameOrErr = Symbol->getName();
- if (std::error_code EC = TargetNameOrErr.getError())
- report_fatal_error(EC.message());
- TargetName = *TargetNameOrErr;
+ if (auto TargetNameOrErr = Symbol->getName())
+ TargetName = *TargetNameOrErr;
+ else
+ return TargetNameOrErr.takeError();
}
DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend
<< " TargetName: " << TargetName << "\n");
@@ -1190,7 +1215,15 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
RTDyldSymbolTable::const_iterator gsi = GlobalSymbolTable.end();
if (Symbol != Obj.symbol_end()) {
gsi = GlobalSymbolTable.find(TargetName.data());
- SymType = Symbol->getType();
+ Expected<SymbolRef::Type> SymTypeOrErr = Symbol->getType();
+ if (!SymTypeOrErr) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(SymTypeOrErr.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
+ SymType = *SymTypeOrErr;
}
if (gsi != GlobalSymbolTable.end()) {
const auto &SymInfo = gsi->second;
@@ -1203,12 +1236,24 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously
// and can be changed by another developers. Maybe best way is add
// a new symbol type ST_Section to SymbolRef and use it.
- section_iterator si = *Symbol->getSection();
+ auto SectionOrErr = Symbol->getSection();
+ if (!SectionOrErr) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(SectionOrErr.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
+ section_iterator si = *SectionOrErr;
if (si == Obj.section_end())
llvm_unreachable("Symbol section not found, bad object file format!");
DEBUG(dbgs() << "\t\tThis is section symbol\n");
bool isCode = si->isText();
- Value.SectionID = findOrEmitSection(Obj, (*si), isCode, ObjSectionToID);
+ if (auto SectionIDOrErr = findOrEmitSection(Obj, (*si), isCode,
+ ObjSectionToID))
+ Value.SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
Value.Addend = Addend;
break;
}
@@ -1289,7 +1334,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL ||
RelType == ELF::R_ARM_JUMP24) {
// This is an ARM branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.");
+ DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.\n");
SectionEntry &Section = Sections[SectionID];
// Look for an existing stub.
@@ -1357,8 +1402,12 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// Create a new stub function.
DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
+
+ unsigned AbiVariant;
+ O.getPlatformFlags(AbiVariant);
+
uint8_t *StubTargetAddr = createStubFunction(
- Section.getAddressWithOffset(Section.getStubOffset()));
+ Section.getAddressWithOffset(Section.getStubOffset()), AbiVariant);
// Creating Hi and Lo relocations for the filled stub instructions.
RelocationEntry REHi(SectionID, StubTargetAddr - Section.getAddress(),
@@ -1453,7 +1502,8 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// In the ELFv1 ABI, a function call may point to the .opd entry,
// so the final symbol value is calculated based on the relocation
// values in the .opd section.
- findOPDEntrySection(Obj, ObjSectionToID, Value);
+ if (auto Err = findOPDEntrySection(Obj, ObjSectionToID, Value))
+ return std::move(Err);
} else {
// In the ELFv2 ABI, a function symbol may provide a local entry
// point, which must be used for direct calls.
@@ -1565,7 +1615,8 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
}
RelocationValueRef TOCValue;
- findPPC64TOCSection(Obj, ObjSectionToID, TOCValue);
+ if (auto Err = findPPC64TOCSection(Obj, ObjSectionToID, TOCValue))
+ return std::move(Err);
if (Value.SymbolName || Value.SectionID != TOCValue.SectionID)
llvm_unreachable("Unsupported TOC relocation.");
Value.Addend -= TOCValue.Addend;
@@ -1577,9 +1628,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// symbols (in which case the addend is respected).
if (RelType == ELF::R_PPC64_TOC) {
RelType = ELF::R_PPC64_ADDR64;
- findPPC64TOCSection(Obj, ObjSectionToID, Value);
+ if (auto Err = findPPC64TOCSection(Obj, ObjSectionToID, Value))
+ return std::move(Err);
} else if (TargetName == ".TOC.") {
- findPPC64TOCSection(Obj, ObjSectionToID, Value);
+ if (auto Err = findPPC64TOCSection(Obj, ObjSectionToID, Value))
+ return std::move(Err);
Value.Addend += Addend;
}
@@ -1700,7 +1753,9 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
Value.Offset);
addRelocationForSection(RE, Value.SectionID);
}
- } else if (RelType == ELF::R_X86_64_GOTPCREL) {
+ } else if (RelType == ELF::R_X86_64_GOTPCREL ||
+ RelType == ELF::R_X86_64_GOTPCRELX ||
+ RelType == ELF::R_X86_64_REX_GOTPCRELX) {
uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend);
@@ -1791,11 +1846,11 @@ RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t
return RelocationEntry(GOTSectionID, GOTOffset, Type, SymbolOffset);
}
-void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
+Error RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) {
if (IsMipsO32ABI)
if (!PendingRelocs.empty())
- report_fatal_error("Can't find matching LO16 reloc");
+ return make_error<RuntimeDyldError>("Can't find matching LO16 reloc");
// If necessary, allocate the global offset table
if (GOTSectionID != 0) {
@@ -1804,7 +1859,7 @@ void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
uint8_t *Addr = MemMgr.allocateDataSection(TotalSize, getGOTEntrySize(),
GOTSectionID, ".got", false);
if (!Addr)
- report_fatal_error("Unable to allocate memory for GOT!");
+ return make_error<RuntimeDyldError>("Unable to allocate memory for GOT!");
Sections[GOTSectionID] =
SectionEntry(".got", Addr, TotalSize, TotalSize, 0);
@@ -1845,6 +1900,8 @@ void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj,
GOTSectionID = 0;
CurrentGOTIndex = 0;
+
+ return Error::success();
}
bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const {
@@ -1861,6 +1918,8 @@ bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const {
case ELF::R_X86_64_GOTPCREL:
+ case ELF::R_X86_64_GOTPCRELX:
+ case ELF::R_X86_64_REX_GOTPCRELX:
case ELF::R_X86_64_PC32:
case ELF::R_X86_64_PC64:
case ELF::R_X86_64_64:
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 041811d3e285..82931b9f45a6 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -20,6 +20,9 @@
using namespace llvm;
namespace llvm {
+namespace object {
+class ELFObjectFileBase;
+}
class RuntimeDyldELF : public RuntimeDyldImpl {
@@ -90,12 +93,12 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
void setMipsABI(const ObjectFile &Obj) override;
- void findPPC64TOCSection(const ELFObjectFileBase &Obj,
- ObjSectionToIDMap &LocalSections,
- RelocationValueRef &Rel);
- void findOPDEntrySection(const ELFObjectFileBase &Obj,
- ObjSectionToIDMap &LocalSections,
- RelocationValueRef &Rel);
+ Error findPPC64TOCSection(const ELFObjectFileBase &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel);
+ Error findOPDEntrySection(const ELFObjectFileBase &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel);
size_t getGOTEntrySize();
@@ -163,7 +166,7 @@ public:
loadObject(const object::ObjectFile &O) override;
void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override;
- relocation_iterator
+ Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &Obj,
ObjSectionToIDMap &ObjSectionToID,
@@ -171,8 +174,8 @@ public:
bool isCompatibleFile(const object::ObjectFile &Obj) const override;
void registerEHFrames() override;
void deregisterEHFrames() override;
- void finalizeLoad(const ObjectFile &Obj,
- ObjSectionToIDMap &SectionMap) override;
+ Error finalizeLoad(const ObjectFile &Obj,
+ ObjSectionToIDMap &SectionMap) override;
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index ab732c69ee2f..76bd3fc295b8 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -14,7 +14,6 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDIMPL_H
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDIMPL_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Triple.h"
@@ -28,7 +27,6 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/SwapByteOrder.h"
-#include "llvm/Support/raw_ostream.h"
#include <map>
#include <unordered_map>
#include <system_error>
@@ -38,16 +36,12 @@ using namespace llvm::object;
namespace llvm {
- // Helper for extensive error checking in debug builds.
-inline std::error_code Check(std::error_code Err) {
- if (Err) {
- report_fatal_error(Err.message());
- }
- return Err;
-}
-
class Twine;
+#define UNIMPLEMENTED_RELOC(RelType) \
+ case RelType: \
+ return make_error<RuntimeDyldError>("Unimplemented relocation: " #RelType)
+
/// SectionEntry - represents a section emitted into memory by the dynamic
/// linker.
class SectionEntry {
@@ -302,13 +296,6 @@ protected:
bool HasError;
std::string ErrorStr;
- // Set the error state and record an error string.
- bool Error(const Twine &Msg) {
- ErrorStr = Msg.str();
- HasError = true;
- return true;
- }
-
uint64_t getSectionLoadAddress(unsigned SectionID) const {
return Sections[SectionID].getLoadAddress();
}
@@ -361,22 +348,25 @@ protected:
/// \brief Given the common symbols discovered in the object file, emit a
/// new section for them and update the symbol mappings in the object and
/// symbol table.
- void emitCommonSymbols(const ObjectFile &Obj, CommonSymbolList &CommonSymbols);
+ Error emitCommonSymbols(const ObjectFile &Obj,
+ CommonSymbolList &CommonSymbols);
/// \brief Emits section data from the object file to the MemoryManager.
/// \param IsCode if it's true then allocateCodeSection() will be
/// used for emits, else allocateDataSection() will be used.
/// \return SectionID.
- unsigned emitSection(const ObjectFile &Obj, const SectionRef &Section,
- bool IsCode);
+ Expected<unsigned> emitSection(const ObjectFile &Obj,
+ const SectionRef &Section,
+ bool IsCode);
/// \brief Find Section in LocalSections. If the section is not found - emit
/// it and store in LocalSections.
/// \param IsCode if it's true then allocateCodeSection() will be
/// used for emits, else allocateDataSection() will be used.
/// \return SectionID.
- unsigned findOrEmitSection(const ObjectFile &Obj, const SectionRef &Section,
- bool IsCode, ObjSectionToIDMap &LocalSections);
+ Expected<unsigned> findOrEmitSection(const ObjectFile &Obj,
+ const SectionRef &Section, bool IsCode,
+ ObjSectionToIDMap &LocalSections);
// \brief Add a relocation entry that uses the given section.
void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID);
@@ -401,7 +391,7 @@ protected:
/// relocation pairs) and stores it to Relocations or SymbolRelocations
/// (this depends on the object file type).
/// \return Iterator to the next relocation that needs to be parsed.
- virtual relocation_iterator
+ virtual Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &Obj, ObjSectionToIDMap &ObjSectionToID,
StubMap &Stubs) = 0;
@@ -411,17 +401,17 @@ protected:
// \brief Compute an upper bound of the memory that is required to load all
// sections
- void computeTotalAllocSize(const ObjectFile &Obj,
- uint64_t &CodeSize, uint32_t &CodeAlign,
- uint64_t &RODataSize, uint32_t &RODataAlign,
- uint64_t &RWDataSize, uint32_t &RWDataAlign);
+ Error computeTotalAllocSize(const ObjectFile &Obj,
+ uint64_t &CodeSize, uint32_t &CodeAlign,
+ uint64_t &RODataSize, uint32_t &RODataAlign,
+ uint64_t &RWDataSize, uint32_t &RWDataAlign);
// \brief Compute the stub buffer size required for a section
unsigned computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section);
// \brief Implementation of the generic part of the loadObject algorithm.
- ObjSectionToIDMap loadObjectImpl(const object::ObjectFile &Obj);
+ Expected<ObjSectionToIDMap> loadObjectImpl(const object::ObjectFile &Obj);
// \brief Return true if the relocation R may require allocating a stub.
virtual bool relocationNeedsStub(const RelocationRef &R) const {
@@ -496,8 +486,10 @@ public:
virtual void deregisterEHFrames();
- virtual void finalizeLoad(const ObjectFile &ObjImg,
- ObjSectionToIDMap &SectionMap) {}
+ virtual Error finalizeLoad(const ObjectFile &ObjImg,
+ ObjSectionToIDMap &SectionMap) {
+ return Error::success();
+ }
};
} // end namespace llvm
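The new UNIMPLEMENTED_RELOC macro lets a target's Expected<relocation_iterator> processRelocationRef() switch turn an unsupported relocation type into a RuntimeDyldError the caller can handle. For example, UNIMPLEMENTED_RELOC(ELF::R_ARM_PREL31) (the relocation chosen here is only an example) expands to:

    case ELF::R_ARM_PREL31:
      return make_error<RuntimeDyldError>("Unimplemented relocation: ELF::R_ARM_PREL31");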
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 739e8d65dbf4..fd109aea91d4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -50,7 +50,8 @@ int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const {
return static_cast<int64_t>(readBytesUnaligned(Src, NumBytes));
}
-relocation_iterator RuntimeDyldMachO::processScatteredVANILLA(
+Expected<relocation_iterator>
+RuntimeDyldMachO::processScatteredVANILLA(
unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) {
@@ -74,8 +75,12 @@ relocation_iterator RuntimeDyldMachO::processScatteredVANILLA(
uint64_t SectionBaseAddr = TargetSI->getAddress();
SectionRef TargetSection = *TargetSI;
bool IsCode = TargetSection.isText();
- uint32_t TargetSectionID =
- findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID);
+ uint32_t TargetSectionID = ~0U;
+ if (auto TargetSectionIDOrErr =
+ findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
Addend -= SectionBaseAddr;
RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size);
@@ -86,7 +91,8 @@ relocation_iterator RuntimeDyldMachO::processScatteredVANILLA(
}
-RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
+Expected<RelocationValueRef>
+RuntimeDyldMachO::getRelocationValueRef(
const ObjectFile &BaseTObj, const relocation_iterator &RI,
const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID) {
@@ -99,10 +105,11 @@ RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
bool IsExternal = Obj.getPlainRelocationExternal(RelInfo);
if (IsExternal) {
symbol_iterator Symbol = RI->getSymbol();
- ErrorOr<StringRef> TargetNameOrErr = Symbol->getName();
- if (std::error_code EC = TargetNameOrErr.getError())
- report_fatal_error(EC.message());
- StringRef TargetName = *TargetNameOrErr;
+ StringRef TargetName;
+ if (auto TargetNameOrErr = Symbol->getName())
+ TargetName = *TargetNameOrErr;
+ else
+ return TargetNameOrErr.takeError();
RTDyldSymbolTable::const_iterator SI =
GlobalSymbolTable.find(TargetName.data());
if (SI != GlobalSymbolTable.end()) {
@@ -116,7 +123,11 @@ RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
} else {
SectionRef Sec = Obj.getAnyRelocationSection(RelInfo);
bool IsCode = Sec.isText();
- Value.SectionID = findOrEmitSection(Obj, Sec, IsCode, ObjSectionToID);
+ if (auto SectionIDOrErr = findOrEmitSection(Obj, Sec, IsCode,
+ ObjSectionToID))
+ Value.SectionID = *SectionIDOrErr;
+ else
+ return SectionIDOrErr.takeError();
uint64_t Addr = Sec.getAddress();
Value.Offset = RE.Addend - Addr;
}
@@ -164,7 +175,7 @@ RuntimeDyldMachO::getSectionByAddress(const MachOObjectFile &Obj,
// Populate __pointers section.
-void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
+Error RuntimeDyldMachO::populateIndirectSymbolPointersSection(
const MachOObjectFile &Obj,
const SectionRef &PTSection,
unsigned PTSectionID) {
@@ -191,10 +202,11 @@ void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
unsigned SymbolIndex =
Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i);
symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex);
- ErrorOr<StringRef> IndirectSymbolNameOrErr = SI->getName();
- if (std::error_code EC = IndirectSymbolNameOrErr.getError())
- report_fatal_error(EC.message());
- StringRef IndirectSymbolName = *IndirectSymbolNameOrErr;
+ StringRef IndirectSymbolName;
+ if (auto IndirectSymbolNameOrErr = SI->getName())
+ IndirectSymbolName = *IndirectSymbolNameOrErr;
+ else
+ return IndirectSymbolNameOrErr.takeError();
DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex
<< ", PT offset: " << PTEntryOffset << "\n");
RelocationEntry RE(PTSectionID, PTEntryOffset,
@@ -202,6 +214,7 @@ void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
addRelocationForSymbol(RE, IndirectSymbolName);
PTEntryOffset += PTEntrySize;
}
+ return Error::success();
}
bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile &Obj) const {
@@ -209,8 +222,9 @@ bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile &Obj) const {
}
template <typename Impl>
-void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj,
- ObjSectionToIDMap &SectionMap) {
+Error
+RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj,
+ ObjSectionToIDMap &SectionMap) {
unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
@@ -222,20 +236,34 @@ void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj,
// Force emission of the __text, __eh_frame, and __gcc_except_tab sections
// if they're present. Otherwise call down to the impl to handle other
// sections that have already been emitted.
- if (Name == "__text")
- TextSID = findOrEmitSection(Obj, Section, true, SectionMap);
- else if (Name == "__eh_frame")
- EHFrameSID = findOrEmitSection(Obj, Section, false, SectionMap);
- else if (Name == "__gcc_except_tab")
- ExceptTabSID = findOrEmitSection(Obj, Section, true, SectionMap);
- else {
+ if (Name == "__text") {
+ if (auto TextSIDOrErr = findOrEmitSection(Obj, Section, true, SectionMap))
+ TextSID = *TextSIDOrErr;
+ else
+ return TextSIDOrErr.takeError();
+ } else if (Name == "__eh_frame") {
+ if (auto EHFrameSIDOrErr = findOrEmitSection(Obj, Section, false,
+ SectionMap))
+ EHFrameSID = *EHFrameSIDOrErr;
+ else
+ return EHFrameSIDOrErr.takeError();
+ } else if (Name == "__gcc_except_tab") {
+ if (auto ExceptTabSIDOrErr = findOrEmitSection(Obj, Section, true,
+ SectionMap))
+ ExceptTabSID = *ExceptTabSIDOrErr;
+ else
+ return ExceptTabSIDOrErr.takeError();
+ } else {
auto I = SectionMap.find(Section);
if (I != SectionMap.end())
- impl().finalizeSection(Obj, I->second, Section);
+ if (auto Err = impl().finalizeSection(Obj, I->second, Section))
+ return Err;
}
}
UnregisteredEHFrameSections.push_back(
EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID));
+
+ return Error::success();
}
template <typename Impl>
@@ -302,9 +330,9 @@ void RuntimeDyldMachOCRTPBase<Impl>::registerEHFrames() {
uint8_t *P = EHFrame->getAddress();
uint8_t *End = P + EHFrame->getSize();
- do {
+ while (P != End) {
P = processFDE(P, DeltaForText, DeltaForEH);
- } while (P != End);
+ }
MemMgr.registerEHFrames(EHFrame->getAddress(), EHFrame->getLoadAddress(),
EHFrame->getSize());
@@ -333,7 +361,15 @@ RuntimeDyldMachO::create(Triple::ArchType Arch,
std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
RuntimeDyldMachO::loadObject(const object::ObjectFile &O) {
- return llvm::make_unique<LoadedMachOObjectInfo>(*this, loadObjectImpl(O));
+ if (auto ObjSectionToIDOrErr = loadObjectImpl(O))
+ return llvm::make_unique<LoadedMachOObjectInfo>(*this,
+ *ObjSectionToIDOrErr);
+ else {
+ HasError = true;
+ raw_string_ostream ErrStream(ErrorStr);
+ logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, "");
+ return nullptr;
+ }
}
} // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index c8ae47b0db22..30f3bb3bf07d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -80,10 +80,10 @@ protected:
}
/// Process a scattered vanilla relocation.
- relocation_iterator processScatteredVANILLA(
- unsigned SectionID, relocation_iterator RelI,
- const ObjectFile &BaseObjT,
- RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID);
+ Expected<relocation_iterator>
+ processScatteredVANILLA(unsigned SectionID, relocation_iterator RelI,
+ const ObjectFile &BaseObjT,
+ RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID);
/// Construct a RelocationValueRef representing the relocation target.
/// For Symbols in known sections, this will return a RelocationValueRef
@@ -94,10 +94,11 @@ protected:
/// In both cases the Addend field is *NOT* fixed up to be PC-relative. That
/// should be done by the caller where appropriate by calling makePCRel on
/// the RelocationValueRef.
- RelocationValueRef getRelocationValueRef(const ObjectFile &BaseTObj,
- const relocation_iterator &RI,
- const RelocationEntry &RE,
- ObjSectionToIDMap &ObjSectionToID);
+ Expected<RelocationValueRef>
+ getRelocationValueRef(const ObjectFile &BaseTObj,
+ const relocation_iterator &RI,
+ const RelocationEntry &RE,
+ ObjSectionToIDMap &ObjSectionToID);
/// Make the RelocationValueRef addend PC-relative.
void makeValueAddendPCRel(RelocationValueRef &Value,
@@ -113,9 +114,9 @@ protected:
// Populate __pointers section.
- void populateIndirectSymbolPointersSection(const MachOObjectFile &Obj,
- const SectionRef &PTSection,
- unsigned PTSectionID);
+ Error populateIndirectSymbolPointersSection(const MachOObjectFile &Obj,
+ const SectionRef &PTSection,
+ unsigned PTSectionID);
public:
@@ -154,8 +155,8 @@ public:
RuntimeDyld::SymbolResolver &Resolver)
: RuntimeDyldMachO(MemMgr, Resolver) {}
- void finalizeLoad(const ObjectFile &Obj,
- ObjSectionToIDMap &SectionMap) override;
+ Error finalizeLoad(const ObjectFile &Obj,
+ ObjSectionToIDMap &SectionMap) override;
void registerEHFrames() override;
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index fbfbb3285233..44fda87e0f94 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -34,40 +34,66 @@ public:
unsigned getStubAlignment() override { return 1; }
- relocation_iterator processRelocationRef(unsigned SectionID,
- relocation_iterator RelI,
- const ObjectFile &Obj,
- ObjSectionToIDMap &ObjSectionToID,
- StubMap &Stubs) override {
+ Expected<relocation_iterator>
+ processRelocationRef(unsigned SectionID,
+ relocation_iterator RelI,
+ const ObjectFile &Obj,
+ ObjSectionToIDMap &ObjSectionToID,
+ StubMap &Stubs) override {
+
auto Symbol = RelI->getSymbol();
if (Symbol == Obj.symbol_end())
report_fatal_error("Unknown symbol in relocation");
- ErrorOr<StringRef> TargetNameOrErr = Symbol->getName();
- if (auto EC = TargetNameOrErr.getError())
- report_fatal_error(EC.message());
+ Expected<StringRef> TargetNameOrErr = Symbol->getName();
+ if (!TargetNameOrErr)
+ return TargetNameOrErr.takeError();
StringRef TargetName = *TargetNameOrErr;
- auto Section = *Symbol->getSection();
+ auto SectionOrErr = Symbol->getSection();
+ if (!SectionOrErr)
+ return SectionOrErr.takeError();
+ auto Section = *SectionOrErr;
uint64_t RelType = RelI->getType();
uint64_t Offset = RelI->getOffset();
+ // Determine the Addend used to adjust the relocation value.
+ uint64_t Addend = 0;
+ SectionEntry &AddendSection = Sections[SectionID];
+ uintptr_t ObjTarget = AddendSection.getObjAddress() + Offset;
+ uint8_t *Displacement = (uint8_t *)ObjTarget;
+
+ switch (RelType) {
+ case COFF::IMAGE_REL_I386_DIR32:
+ case COFF::IMAGE_REL_I386_DIR32NB:
+ case COFF::IMAGE_REL_I386_SECREL:
+ case COFF::IMAGE_REL_I386_REL32: {
+ Addend = readBytesUnaligned(Displacement, 4);
+ break;
+ }
+ default:
+ break;
+ }
+
#if !defined(NDEBUG)
SmallString<32> RelTypeName;
RelI->getTypeName(RelTypeName);
#endif
DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
<< " RelType: " << RelTypeName << " TargetName: " << TargetName
- << "\n");
+ << " Addend " << Addend << "\n");
unsigned TargetSectionID = -1;
if (Section == Obj.section_end()) {
RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0);
addRelocationForSymbol(RE, TargetName);
} else {
- TargetSectionID =
- findOrEmitSection(Obj, *Section, Section->isText(), ObjSectionToID);
+ if (auto TargetSectionIDOrErr =
+ findOrEmitSection(Obj, *Section, Section->isText(), ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
switch (RelType) {
case COFF::IMAGE_REL_I386_ABSOLUTE:
@@ -77,7 +103,7 @@ public:
case COFF::IMAGE_REL_I386_DIR32NB:
case COFF::IMAGE_REL_I386_REL32: {
RelocationEntry RE =
- RelocationEntry(SectionID, Offset, RelType, 0, TargetSectionID,
+ RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
getSymbolOffset(*Symbol), 0, 0, false, 0);
addRelocationForSection(RE, TargetSectionID);
break;
@@ -90,7 +116,7 @@ public:
}
case COFF::IMAGE_REL_I386_SECREL: {
RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType,
- getSymbolOffset(*Symbol));
+ getSymbolOffset(*Symbol) + Addend);
addRelocationForSection(RE, TargetSectionID);
break;
}
@@ -148,8 +174,10 @@ public:
}
case COFF::IMAGE_REL_I386_REL32: {
// 32-bit relative displacement to the target.
- uint64_t Result = Sections[RE.Sections.SectionA].getLoadAddress() -
- Section.getLoadAddress() + RE.Addend - 4 - RE.Offset;
+ uint64_t Result = RE.Sections.SectionA == static_cast<uint32_t>(-1)
+ ? Value
+ : Sections[RE.Sections.SectionA].getLoadAddress();
+ Result = Result - Section.getLoadAddress() + RE.Addend - 4 - RE.Offset;
assert(static_cast<int32_t>(Result) <= INT32_MAX &&
"relocation overflow");
assert(static_cast<int32_t>(Result) >= INT32_MIN &&
@@ -190,9 +218,6 @@ public:
void registerEHFrames() override {}
void deregisterEHFrames() override {}
-
- void finalizeLoad(const ObjectFile &Obj,
- ObjSectionToIDMap &SectionMap) override {}
};
}
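
Note: the IMAGE_REL_I386_REL32 change above folds the addend read from the object file into the usual PC-relative formula. A standalone restatement of that arithmetic (helper name and example values are illustrative, not from the patch):

    #include <cstdint>

    // IMAGE_REL_I386_REL32: the stored displacement is measured from the end
    // of the 4-byte field being patched, i.e.
    //   displacement = TargetAddr + Addend - (SectionLoadAddr + Offset + 4)
    static int32_t computeRel32(uint64_t TargetAddr, uint64_t SectionLoadAddr,
                                uint64_t Offset, uint64_t Addend) {
      return static_cast<int32_t>(TargetAddr - SectionLoadAddr + Addend - 4 -
                                  Offset);
    }

    // Example: target 0x2000, section loaded at 0x1000, fixup at offset 0x10,
    // addend 0 -> 0x2000 - 0x1014 = 0xFEC.
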
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
new file mode 100644
index 000000000000..ff7d1d439252
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -0,0 +1,291 @@
+//===--- RuntimeDyldCOFFThumb.h --- COFF/Thumb specific code ---*- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// COFF thumb support for MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFTHUMB_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFTHUMB_H
+
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/COFF.h"
+#include "../RuntimeDyldCOFF.h"
+
+#define DEBUG_TYPE "dyld"
+
+namespace llvm {
+
+class RuntimeDyldCOFFThumb : public RuntimeDyldCOFF {
+public:
+ RuntimeDyldCOFFThumb(RuntimeDyld::MemoryManager &MM,
+ RuntimeDyld::SymbolResolver &Resolver)
+ : RuntimeDyldCOFF(MM, Resolver) {}
+
+ unsigned getMaxStubSize() override {
+ return 16; // 8-byte load instructions, 4-byte jump, 4-byte padding
+ }
+
+ unsigned getStubAlignment() override { return 1; }
+
+ Expected<relocation_iterator>
+ processRelocationRef(unsigned SectionID,
+ relocation_iterator RelI,
+ const ObjectFile &Obj,
+ ObjSectionToIDMap &ObjSectionToID,
+ StubMap &Stubs) override {
+ auto Symbol = RelI->getSymbol();
+ if (Symbol == Obj.symbol_end())
+ report_fatal_error("Unknown symbol in relocation");
+
+ Expected<StringRef> TargetNameOrErr = Symbol->getName();
+ if (!TargetNameOrErr)
+ return TargetNameOrErr.takeError();
+ StringRef TargetName = *TargetNameOrErr;
+
+ auto SectionOrErr = Symbol->getSection();
+ if (!SectionOrErr)
+ return SectionOrErr.takeError();
+ auto Section = *SectionOrErr;
+
+ uint64_t RelType = RelI->getType();
+ uint64_t Offset = RelI->getOffset();
+
+ // Determine the Addend used to adjust the relocation value.
+ uint64_t Addend = 0;
+ SectionEntry &AddendSection = Sections[SectionID];
+ uintptr_t ObjTarget = AddendSection.getObjAddress() + Offset;
+ uint8_t *Displacement = (uint8_t *)ObjTarget;
+
+ switch (RelType) {
+ case COFF::IMAGE_REL_ARM_ADDR32:
+ case COFF::IMAGE_REL_ARM_ADDR32NB:
+ case COFF::IMAGE_REL_ARM_SECREL:
+ Addend = readBytesUnaligned(Displacement, 4);
+ break;
+ default:
+ break;
+ }
+
+#if !defined(NDEBUG)
+ SmallString<32> RelTypeName;
+ RelI->getTypeName(RelTypeName);
+#endif
+ DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
+ << " RelType: " << RelTypeName << " TargetName: " << TargetName
+ << " Addend " << Addend << "\n");
+
+ unsigned TargetSectionID = -1;
+ if (Section == Obj.section_end()) {
+ RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0);
+ addRelocationForSymbol(RE, TargetName);
+ } else {
+ if (auto TargetSectionIDOrErr =
+ findOrEmitSection(Obj, *Section, Section->isText(), ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
+
+ switch (RelType) {
+ default: llvm_unreachable("unsupported relocation type");
+ case COFF::IMAGE_REL_ARM_ABSOLUTE:
+ // This relocation is ignored.
+ break;
+ case COFF::IMAGE_REL_ARM_ADDR32:
+ case COFF::IMAGE_REL_ARM_ADDR32NB: {
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
+ getSymbolOffset(*Symbol), 0, 0, false, 0);
+ addRelocationForSection(RE, TargetSectionID);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_SECTION: {
+ RelocationEntry RE =
+ RelocationEntry(TargetSectionID, Offset, RelType, 0);
+ addRelocationForSection(RE, TargetSectionID);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_SECREL: {
+ RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType,
+ getSymbolOffset(*Symbol) + Addend);
+ addRelocationForSection(RE, TargetSectionID);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_MOV32T: {
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
+ getSymbolOffset(*Symbol), 0, 0, false, 0);
+ addRelocationForSection(RE, TargetSectionID);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_BRANCH20T:
+ case COFF::IMAGE_REL_ARM_BRANCH24T:
+ case COFF::IMAGE_REL_ARM_BLX23T: {
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType,
+ getSymbolOffset(*Symbol) + Addend, true, 0);
+ addRelocationForSection(RE, TargetSectionID);
+ break;
+ }
+ }
+ }
+
+ return ++RelI;
+ }
+
+ void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
+ const auto Section = Sections[RE.SectionID];
+ uint8_t *Target = Section.getAddressWithOffset(RE.Offset);
+
+ switch (RE.RelType) {
+ default: llvm_unreachable("unsupported relocation type");
+ case COFF::IMAGE_REL_ARM_ABSOLUTE:
+ // This relocation is ignored.
+ break;
+ case COFF::IMAGE_REL_ARM_ADDR32: {
+ // The target's 32-bit VA.
+ uint64_t Result =
+ RE.Sections.SectionA == static_cast<uint32_t>(-1)
+ ? Value
+ : Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend);
+ assert(static_cast<int32_t>(Result) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(Result) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_ADDR32"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ writeBytesUnaligned(Result, Target, 4);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_ADDR32NB: {
+ // The target's 32-bit RVA.
+ // NOTE: use Section[0].getLoadAddress() as an approximation of ImageBase
+ uint64_t Result = Sections[RE.Sections.SectionA].getLoadAddress() -
+ Sections[0].getLoadAddress() + RE.Addend;
+ assert(static_cast<int32_t>(Result) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(Result) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_ADDR32NB"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+ writeBytesUnaligned(Result, Target, 4);
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_SECTION:
+ // 16-bit section index of the section that contains the target.
+ assert(static_cast<int32_t>(RE.SectionID) <= INT16_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(RE.SectionID) >= INT16_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_SECTION Value: " << RE.SectionID
+ << '\n');
+ writeBytesUnaligned(RE.SectionID, Target, 2);
+ break;
+ case COFF::IMAGE_REL_ARM_SECREL:
+ // 32-bit offset of the target from the beginning of its section.
+ assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_SECREL Value: " << RE.Addend
+ << '\n');
+ writeBytesUnaligned(RE.Addend, Target, 2);
+ break;
+ case COFF::IMAGE_REL_ARM_MOV32T: {
+ // 32-bit VA of the target applied to a contiguous MOVW+MOVT pair.
+ uint64_t Result =
+ Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend);
+ assert(static_cast<int32_t>(Result) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(Result) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_MOV32T"
+ << " TargetSection: " << RE.Sections.SectionA
+ << " Value: " << format("0x%08" PRIx32, Result) << '\n');
+
+ // MOVW(T3): |11110|i|10|0|1|0|0|imm4|0|imm3|Rd|imm8|
+ // imm32 = zext imm4:i:imm3:imm8
+ // MOVT(T1): |11110|i|10|1|1|0|0|imm4|0|imm3|Rd|imm8|
+ // imm16 = imm4:i:imm3:imm8
+
+ auto EncodeImmediate = [](uint8_t *Bytes, uint16_t Immediate) {
+ Bytes[0] |= ((Immediate & 0xf000) >> 12);
+ Bytes[1] |= ((Immediate & 0x0800) >> 11);
+ Bytes[2] |= ((Immediate & 0x00ff) >> 0);
+ Bytes[3] |= ((Immediate & 0x0700) >> 8);
+ };
+
+ EncodeImmediate(&Target[0], static_cast<uint32_t>(Result) >> 00);
+ EncodeImmediate(&Target[4], static_cast<uint32_t>(Result) >> 16);
+
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_BRANCH20T: {
+ // The most significant 20 bits of the signed 21-bit relative displacement
+ uint64_t Value =
+ RE.Addend - (Sections[RE.SectionID].getLoadAddress() + RE.Offset) - 4;
+ assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BRANCH20T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
+ static_cast<void>(Value);
+ llvm_unreachable("unimplemented relocation");
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_BRANCH24T: {
+ // The most significant 24 bits of the signed 25-bit relative displacement
+ uint64_t Value =
+ RE.Addend - (Sections[RE.SectionID].getLoadAddress() + RE.Offset) - 4;
+ assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BRANCH24T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
+ static_cast<void>(Value);
+ llvm_unreachable("unimplemented relocation");
+ break;
+ }
+ case COFF::IMAGE_REL_ARM_BLX23T: {
+ // The most significant 24 bits of the signed 25-bit relative displacement
+ uint64_t Value =
+ RE.Addend - (Sections[RE.SectionID].getLoadAddress() + RE.Offset) - 4;
+ assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX &&
+ "relocation overflow");
+ assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN &&
+ "relocation underflow");
+ DEBUG(dbgs() << "\t\tOffset: " << RE.Offset
+ << " RelType: IMAGE_REL_ARM_BLX23T"
+ << " Value: " << static_cast<int32_t>(Value) << '\n');
+ static_cast<void>(Value);
+ llvm_unreachable("unimplemented relocation");
+ break;
+ }
+ }
+ }
+
+ void registerEHFrames() override {}
+ void deregisterEHFrames() override {}
+};
+
+}
+
+#endif
+
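
Note: the IMAGE_REL_ARM_MOV32T case in the new file above scatters each 16-bit half of the target address into the split immediate fields of a MOVW/MOVT pair. A standalone restatement of that packing (same bit layout as the lambda in the hunk; patchMov32T is an illustrative wrapper):

    #include <cstdint>

    // Scatter a 16-bit immediate into the imm4:i:imm3:imm8 fields of a Thumb2
    // MOVW/MOVT encoding, stored as two little-endian halfwords.
    static void encodeImmediate(uint8_t *Bytes, uint16_t Immediate) {
      Bytes[0] |= ((Immediate & 0xf000) >> 12); // imm4
      Bytes[1] |= ((Immediate & 0x0800) >> 11); // i
      Bytes[2] |= ((Immediate & 0x00ff) >> 0);  // imm8
      Bytes[3] |= ((Immediate & 0x0700) >> 8);  // imm3
    }

    // Patch a contiguous MOVW+MOVT pair with the 32-bit value VA, low half
    // first, exactly as resolveRelocation() does above.
    static void patchMov32T(uint8_t *Target, uint32_t VA) {
      encodeImmediate(&Target[0], static_cast<uint16_t>(VA >> 0));  // MOVW
      encodeImmediate(&Target[4], static_cast<uint16_t>(VA >> 16)); // MOVT
    }
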
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 25f538d8f3da..df8681da24d1 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -106,17 +106,21 @@ public:
}
}
- relocation_iterator processRelocationRef(unsigned SectionID,
- relocation_iterator RelI,
- const ObjectFile &Obj,
- ObjSectionToIDMap &ObjSectionToID,
- StubMap &Stubs) override {
+ Expected<relocation_iterator>
+ processRelocationRef(unsigned SectionID,
+ relocation_iterator RelI,
+ const ObjectFile &Obj,
+ ObjSectionToIDMap &ObjSectionToID,
+ StubMap &Stubs) override {
// If possible, find the symbol referred to in the relocation,
// and the section that contains it.
symbol_iterator Symbol = RelI->getSymbol();
if (Symbol == Obj.symbol_end())
report_fatal_error("Unknown symbol in relocation");
- section_iterator SecI = *Symbol->getSection();
+ auto SectionOrError = Symbol->getSection();
+ if (!SectionOrError)
+ return SectionOrError.takeError();
+ section_iterator SecI = *SectionOrError;
// If there is no section, this must be an external reference.
const bool IsExtern = SecI == Obj.section_end();
@@ -151,9 +155,9 @@ public:
break;
}
- ErrorOr<StringRef> TargetNameOrErr = Symbol->getName();
- if (std::error_code EC = TargetNameOrErr.getError())
- report_fatal_error(EC.message());
+ Expected<StringRef> TargetNameOrErr = Symbol->getName();
+ if (!TargetNameOrErr)
+ return TargetNameOrErr.takeError();
StringRef TargetName = *TargetNameOrErr;
DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset
@@ -165,8 +169,12 @@ public:
addRelocationForSymbol(RE, TargetName);
} else {
bool IsCode = SecI->isText();
- unsigned TargetSectionID =
- findOrEmitSection(Obj, *SecI, IsCode, ObjSectionToID);
+ unsigned TargetSectionID;
+ if (auto TargetSectionIDOrErr =
+ findOrEmitSection(Obj, *SecI, IsCode, ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
uint64_t TargetOffset = getSymbolOffset(*Symbol);
RelocationEntry RE(SectionID, Offset, RelType, TargetOffset + Addend);
addRelocationForSection(RE, TargetSectionID);
@@ -189,19 +197,21 @@ public:
void deregisterEHFrames() override {
// Stub
}
- void finalizeLoad(const ObjectFile &Obj,
- ObjSectionToIDMap &SectionMap) override {
+ Error finalizeLoad(const ObjectFile &Obj,
+ ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
for (const auto &SectionPair : SectionMap) {
const SectionRef &Section = SectionPair.first;
StringRef Name;
- Check(Section.getName(Name));
+ if (auto EC = Section.getName(Name))
+ return errorCodeToError(EC);
// Note unwind info is split across .pdata and .xdata, so this
// may not be sufficiently general for all users.
if (Name == ".xdata") {
UnregisteredEHFrameSections.push_back(SectionPair.second);
}
}
+ return Error::success();
}
};
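
Note: the COFF X86_64 finalizeLoad() change above bridges the legacy std::error_code result of Section.getName() into the new Error-returning interface. A minimal sketch of that bridge (checkedCall is a hypothetical helper):

    #include "llvm/Support/Error.h"
    #include <system_error>
    using namespace llvm;

    // Convert a std::error_code from a legacy API into llvm::Error so it can
    // propagate through Error-returning callers, as finalizeLoad() does above.
    static Error checkedCall(std::error_code EC) {
      if (EC)
        return errorCodeToError(EC);
      return Error::success();
    }
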
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
index dbca37747ce8..63598f197070 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
@@ -242,7 +242,7 @@ public:
}
}
- relocation_iterator
+ Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
ObjSectionToIDMap &ObjSectionToID,
@@ -252,7 +252,9 @@ public:
MachO::any_relocation_info RelInfo =
Obj.getRelocation(RelI->getRawDataRefImpl());
- assert(!Obj.isRelocationScattered(RelInfo) && "");
+ if (Obj.isRelocationScattered(RelInfo))
+ return make_error<RuntimeDyldError>("Scattered relocations not supported "
+ "for MachO AArch64");
// ARM64 has an ARM64_RELOC_ADDEND relocation type that carries an explicit
// addend for the following relocation. If found: (1) store the associated
@@ -270,6 +272,9 @@ public:
RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl());
}
+ if (Obj.getAnyRelocationType(RelInfo) == MachO::ARM64_RELOC_SUBTRACTOR)
+ return processSubtractRelocation(SectionID, RelI, Obj, ObjSectionToID);
+
RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
RE.Addend = decodeAddend(RE);
@@ -278,8 +283,11 @@ public:
if (ExplicitAddend)
RE.Addend = ExplicitAddend;
- RelocationValueRef Value(
- getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
+ RelocationValueRef Value;
+ if (auto ValueOrErr = getRelocationValueRef(Obj, RelI, RE, ObjSectionToID))
+ Value = *ValueOrErr;
+ else
+ return ValueOrErr.takeError();
bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
if (!IsExtern && RE.IsPCRel)
@@ -349,7 +357,15 @@ public:
encodeAddend(LocalAddress, /*Size=*/4, RelType, Value);
break;
}
- case MachO::ARM64_RELOC_SUBTRACTOR:
+ case MachO::ARM64_RELOC_SUBTRACTOR: {
+ uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress();
+ uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress();
+ assert((Value == SectionABase || Value == SectionBBase) &&
+ "Unexpected SUBTRACTOR relocation value.");
+ Value = SectionABase - SectionBBase + RE.Addend;
+ writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size);
+ break;
+ }
case MachO::ARM64_RELOC_POINTER_TO_GOT:
case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
@@ -360,8 +376,10 @@ public:
}
}
- void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
- const SectionRef &Section) {}
+ Error finalizeSection(const ObjectFile &Obj, unsigned SectionID,
+ const SectionRef &Section) {
+ return Error::success();
+ }
private:
void processGOTRelocation(const RelocationEntry &RE,
@@ -398,6 +416,47 @@ private:
RE.IsPCRel, RE.Size);
addRelocationForSection(TargetRE, RE.SectionID);
}
+
+ Expected<relocation_iterator>
+ processSubtractRelocation(unsigned SectionID, relocation_iterator RelI,
+ const ObjectFile &BaseObjT,
+ ObjSectionToIDMap &ObjSectionToID) {
+ const MachOObjectFile &Obj =
+ static_cast<const MachOObjectFile&>(BaseObjT);
+ MachO::any_relocation_info RE =
+ Obj.getRelocation(RelI->getRawDataRefImpl());
+
+ unsigned Size = Obj.getAnyRelocationLength(RE);
+ uint64_t Offset = RelI->getOffset();
+ uint8_t *LocalAddress = Sections[SectionID].getAddressWithOffset(Offset);
+ unsigned NumBytes = 1 << Size;
+
+ Expected<StringRef> SubtrahendNameOrErr = RelI->getSymbol()->getName();
+ if (!SubtrahendNameOrErr)
+ return SubtrahendNameOrErr.takeError();
+ auto SubtrahendI = GlobalSymbolTable.find(*SubtrahendNameOrErr);
+ unsigned SectionBID = SubtrahendI->second.getSectionID();
+ uint64_t SectionBOffset = SubtrahendI->second.getOffset();
+ int64_t Addend =
+ SignExtend64(readBytesUnaligned(LocalAddress, NumBytes), NumBytes * 8);
+
+ ++RelI;
+ Expected<StringRef> MinuendNameOrErr = RelI->getSymbol()->getName();
+ if (!MinuendNameOrErr)
+ return MinuendNameOrErr.takeError();
+ auto MinuendI = GlobalSymbolTable.find(*MinuendNameOrErr);
+ unsigned SectionAID = MinuendI->second.getSectionID();
+ uint64_t SectionAOffset = MinuendI->second.getOffset();
+
+ RelocationEntry R(SectionID, Offset, MachO::ARM64_RELOC_SUBTRACTOR, (uint64_t)Addend,
+ SectionAID, SectionAOffset, SectionBID, SectionBOffset,
+ false, Size);
+
+ addRelocationForSection(R, SectionAID);
+
+ return ++RelI;
+ }
+
};
}
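
Note: the new ARM64_RELOC_SUBTRACTOR handling above writes the difference of two section load addresses plus the stored addend. A standalone restatement of that arithmetic (values illustrative only):

    #include <cstdint>

    // ARM64_RELOC_SUBTRACTOR: the value written back is
    //   SectionA load address - SectionB load address + addend,
    // stored in 1 << RE.Size bytes via writeBytesUnaligned() above.
    static uint64_t resolveSubtractor(uint64_t SectionALoadAddr,
                                      uint64_t SectionBLoadAddr,
                                      int64_t Addend) {
      return SectionALoadAddr - SectionBLoadAddr + static_cast<uint64_t>(Addend);
    }

    // Example: A at 0x5000, B at 0x4000, addend 8 -> 0x1008.
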
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index 7731df09bd21..0abf9daba505 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOARM_H
#include "../RuntimeDyldMachO.h"
+#include <string>
#define DEBUG_TYPE "dyld"
@@ -49,7 +50,7 @@ public:
}
}
- relocation_iterator
+ Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
ObjSectionToIDMap &ObjSectionToID,
@@ -70,10 +71,30 @@ public:
return ++RelI;
}
+ // Sanity check relocation type.
+ switch (RelType) {
+ UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_PAIR);
+ UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_SECTDIFF);
+ UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_LOCAL_SECTDIFF);
+ UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_PB_LA_PTR);
+ UNIMPLEMENTED_RELOC(MachO::ARM_THUMB_RELOC_BR22);
+ UNIMPLEMENTED_RELOC(MachO::ARM_THUMB_32BIT_BRANCH);
+ UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_HALF);
+ default:
+ if (RelType > MachO::ARM_RELOC_HALF_SECTDIFF)
+ return make_error<RuntimeDyldError>(("MachO ARM relocation type " +
+ Twine(RelType) +
+ " is out of range").str());
+ break;
+ }
+
RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
RE.Addend = decodeAddend(RE);
- RelocationValueRef Value(
- getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
+ RelocationValueRef Value;
+ if (auto ValueOrErr = getRelocationValueRef(Obj, RelI, RE, ObjSectionToID))
+ Value = *ValueOrErr;
+ else
+ return ValueOrErr.takeError();
if (RE.IsPCRel)
makeValueAddendPCRel(Value, RelI, 8);
@@ -108,8 +129,6 @@ public:
}
switch (RE.RelType) {
- default:
- llvm_unreachable("Invalid relocation type!");
case MachO::ARM_RELOC_VANILLA:
writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size);
break;
@@ -147,26 +166,20 @@ public:
break;
}
- case MachO::ARM_THUMB_RELOC_BR22:
- case MachO::ARM_THUMB_32BIT_BRANCH:
- case MachO::ARM_RELOC_HALF:
- case MachO::ARM_RELOC_PAIR:
- case MachO::ARM_RELOC_SECTDIFF:
- case MachO::ARM_RELOC_LOCAL_SECTDIFF:
- case MachO::ARM_RELOC_PB_LA_PTR:
- Error("Relocation type not implemented yet!");
- return;
+ default:
+ llvm_unreachable("Invalid relocation type");
}
}
- void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
+ Error finalizeSection(const ObjectFile &Obj, unsigned SectionID,
const SectionRef &Section) {
StringRef Name;
Section.getName(Name);
if (Name == "__nl_symbol_ptr")
- populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
- Section, SectionID);
+ return populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
+ Section, SectionID);
+ return Error::success();
}
private:
@@ -201,7 +214,7 @@ private:
resolveRelocation(TargetRE, (uint64_t)Addr);
}
- relocation_iterator
+ Expected<relocation_iterator>
processHALFSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseTObj,
ObjSectionToIDMap &ObjSectionToID) {
@@ -237,8 +250,12 @@ private:
uint64_t SectionAOffset = AddrA - SectionABase;
SectionRef SectionA = *SAI;
bool IsCode = SectionA.isText();
- uint32_t SectionAID =
- findOrEmitSection(MachO, SectionA, IsCode, ObjSectionToID);
+ uint32_t SectionAID = ~0U;
+ if (auto SectionAIDOrErr =
+ findOrEmitSection(MachO, SectionA, IsCode, ObjSectionToID))
+ SectionAID = *SectionAIDOrErr;
+ else
+ return SectionAIDOrErr.takeError();
uint32_t AddrB = MachO.getScatteredRelocationValue(RE2);
section_iterator SBI = getSectionByAddress(MachO, AddrB);
@@ -246,8 +263,12 @@ private:
uint64_t SectionBBase = SBI->getAddress();
uint64_t SectionBOffset = AddrB - SectionBBase;
SectionRef SectionB = *SBI;
- uint32_t SectionBID =
- findOrEmitSection(MachO, SectionB, IsCode, ObjSectionToID);
+ uint32_t SectionBID = ~0U;
+ if (auto SectionBIDOrErr =
+ findOrEmitSection(MachO, SectionB, IsCode, ObjSectionToID))
+ SectionBID = *SectionBIDOrErr;
+ else
+ return SectionBIDOrErr.takeError();
uint32_t OtherHalf = MachO.getAnyRelocationAddress(RE2) & 0xffff;
unsigned Shift = (HalfDiffKindBits & 0x1) ? 16 : 0;
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
index 85059d70a3eb..2c79b3f7c819 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOI386_H
#include "../RuntimeDyldMachO.h"
+#include <string>
#define DEBUG_TYPE "dyld"
@@ -30,7 +31,7 @@ public:
unsigned getStubAlignment() override { return 1; }
- relocation_iterator
+ Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
ObjSectionToIDMap &ObjSectionToID,
@@ -48,13 +49,29 @@ public:
ObjSectionToID);
else if (RelType == MachO::GENERIC_RELOC_VANILLA)
return processScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID);
- llvm_unreachable("Unhandled scattered relocation.");
+ return make_error<RuntimeDyldError>(("Unhandled I386 scattered relocation "
+ "type: " + Twine(RelType)).str());
+ }
+
+ switch (RelType) {
+ UNIMPLEMENTED_RELOC(MachO::GENERIC_RELOC_PAIR);
+ UNIMPLEMENTED_RELOC(MachO::GENERIC_RELOC_PB_LA_PTR);
+ UNIMPLEMENTED_RELOC(MachO::GENERIC_RELOC_TLV);
+ default:
+ if (RelType > MachO::GENERIC_RELOC_TLV)
+ return make_error<RuntimeDyldError>(("MachO I386 relocation type " +
+ Twine(RelType) +
+ " is out of range").str());
+ break;
}
RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
RE.Addend = memcpyAddend(RE);
- RelocationValueRef Value(
- getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
+ RelocationValueRef Value;
+ if (auto ValueOrErr = getRelocationValueRef(Obj, RelI, RE, ObjSectionToID))
+ Value = *ValueOrErr;
+ else
+ return ValueOrErr.takeError();
// Addends for external, PC-rel relocations on i386 point back to the zero
// offset. Calculate the final offset from the relocation target instead.
@@ -91,8 +108,6 @@ public:
}
switch (RE.RelType) {
- default:
- llvm_unreachable("Invalid relocation type!");
case MachO::GENERIC_RELOC_VANILLA:
writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size);
break;
@@ -106,25 +121,26 @@ public:
writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size);
break;
}
- case MachO::GENERIC_RELOC_PB_LA_PTR:
- Error("Relocation type not implemented yet!");
+ default:
+ llvm_unreachable("Invalid relocation type!");
}
}
- void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
+ Error finalizeSection(const ObjectFile &Obj, unsigned SectionID,
const SectionRef &Section) {
StringRef Name;
Section.getName(Name);
if (Name == "__jump_table")
- populateJumpTable(cast<MachOObjectFile>(Obj), Section, SectionID);
+ return populateJumpTable(cast<MachOObjectFile>(Obj), Section, SectionID);
else if (Name == "__pointers")
- populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
- Section, SectionID);
+ return populateIndirectSymbolPointersSection(cast<MachOObjectFile>(Obj),
+ Section, SectionID);
+ return Error::success();
}
private:
- relocation_iterator
+ Expected<relocation_iterator>
processSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
ObjSectionToIDMap &ObjSectionToID) {
@@ -153,8 +169,12 @@ private:
uint64_t SectionAOffset = AddrA - SectionABase;
SectionRef SectionA = *SAI;
bool IsCode = SectionA.isText();
- uint32_t SectionAID =
- findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID);
+ uint32_t SectionAID = ~0U;
+ if (auto SectionAIDOrErr =
+ findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID))
+ SectionAID = *SectionAIDOrErr;
+ else
+ return SectionAIDOrErr.takeError();
uint32_t AddrB = Obj.getScatteredRelocationValue(RE2);
section_iterator SBI = getSectionByAddress(Obj, AddrB);
@@ -162,8 +182,12 @@ private:
uint64_t SectionBBase = SBI->getAddress();
uint64_t SectionBOffset = AddrB - SectionBBase;
SectionRef SectionB = *SBI;
- uint32_t SectionBID =
- findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID);
+ uint32_t SectionBID = ~0U;
+ if (auto SectionBIDOrErr =
+ findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID))
+ SectionBID = *SectionBIDOrErr;
+ else
+ return SectionBIDOrErr.takeError();
// Compute the addend 'C' from the original expression 'A - B + C'.
Addend -= AddrA - AddrB;
@@ -183,11 +207,9 @@ private:
}
// Populate stubs in __jump_table section.
- void populateJumpTable(const MachOObjectFile &Obj, const SectionRef &JTSection,
+ Error populateJumpTable(const MachOObjectFile &Obj,
+ const SectionRef &JTSection,
unsigned JTSectionID) {
- assert(!Obj.is64Bit() &&
- "__jump_table section not supported in 64-bit MachO.");
-
MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand();
MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl());
uint32_t JTSectionSize = Sec32.size;
@@ -197,16 +219,17 @@ private:
uint8_t *JTSectionAddr = getSectionAddress(JTSectionID);
unsigned JTEntryOffset = 0;
- assert((JTSectionSize % JTEntrySize) == 0 &&
- "Jump-table section does not contain a whole number of stubs?");
+ if (JTSectionSize % JTEntrySize != 0)
+ return make_error<RuntimeDyldError>("Jump-table section does not contain "
+ "a whole number of stubs?");
for (unsigned i = 0; i < NumJTEntries; ++i) {
unsigned SymbolIndex =
Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i);
symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex);
- ErrorOr<StringRef> IndirectSymbolName = SI->getName();
- if (std::error_code EC = IndirectSymbolName.getError())
- report_fatal_error(EC.message());
+ Expected<StringRef> IndirectSymbolName = SI->getName();
+ if (!IndirectSymbolName)
+ return IndirectSymbolName.takeError();
uint8_t *JTEntryAddr = JTSectionAddr + JTEntryOffset;
createStubFunction(JTEntryAddr);
RelocationEntry RE(JTSectionID, JTEntryOffset + 1,
@@ -214,6 +237,8 @@ private:
addRelocationForSymbol(RE, *IndirectSymbolName);
JTEntryOffset += JTEntrySize;
}
+
+ return Error::success();
}
};
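
Note: the SECTDIFF path above recovers the addend C from the value A - B + C stored at the fixup. A one-function restatement of that recovery (illustrative only):

    #include <cstdint>

    // Scattered SECTDIFF: the word at the fixup holds A - B + C, so removing
    // (AddrA - AddrB) leaves the addend C carried into the RelocationEntry.
    static int64_t recoverSectDiffAddend(int64_t StoredValue,
                                         uint64_t AddrA, uint64_t AddrB) {
      return StoredValue - static_cast<int64_t>(AddrA - AddrB);
    }
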
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index 2242295bc1ee..bc4822983244 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOX86_64_H
#include "../RuntimeDyldMachO.h"
+#include <string>
#define DEBUG_TYPE "dyld"
@@ -30,7 +31,7 @@ public:
unsigned getStubAlignment() override { return 1; }
- relocation_iterator
+ Expected<relocation_iterator>
processRelocationRef(unsigned SectionID, relocation_iterator RelI,
const ObjectFile &BaseObjT,
ObjSectionToIDMap &ObjSectionToID,
@@ -49,13 +50,26 @@ public:
RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI));
RE.Addend = memcpyAddend(RE);
- RelocationValueRef Value(
- getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
+ RelocationValueRef Value;
+ if (auto ValueOrErr = getRelocationValueRef(Obj, RelI, RE, ObjSectionToID))
+ Value = *ValueOrErr;
+ else
+ return ValueOrErr.takeError();
bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
if (!IsExtern && RE.IsPCRel)
makeValueAddendPCRel(Value, RelI, 1 << RE.Size);
+ switch (RelType) {
+ UNIMPLEMENTED_RELOC(MachO::X86_64_RELOC_TLV);
+ default:
+ if (RelType > MachO::X86_64_RELOC_TLV)
+ return make_error<RuntimeDyldError>(("MachO X86_64 relocation type " +
+ Twine(RelType) +
+ " is out of range").str());
+ break;
+ }
+
if (RE.RelType == MachO::X86_64_RELOC_GOT ||
RE.RelType == MachO::X86_64_RELOC_GOT_LOAD)
processGOTRelocation(RE, Value, Stubs);
@@ -104,15 +118,13 @@ public:
writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size);
break;
}
- case MachO::X86_64_RELOC_GOT_LOAD:
- case MachO::X86_64_RELOC_GOT:
- case MachO::X86_64_RELOC_TLV:
- Error("Relocation type not implemented yet!");
}
}
- void finalizeSection(const ObjectFile &Obj, unsigned SectionID,
- const SectionRef &Section) {}
+ Error finalizeSection(const ObjectFile &Obj, unsigned SectionID,
+ const SectionRef &Section) {
+ return Error::success();
+ }
private:
void processGOTRelocation(const RelocationEntry &RE,
@@ -143,12 +155,12 @@ private:
resolveRelocation(TargetRE, (uint64_t)Addr);
}
- relocation_iterator
+ Expected<relocation_iterator>
processSubtractRelocation(unsigned SectionID, relocation_iterator RelI,
- const ObjectFile &BaseObjT,
+ const MachOObjectFile &BaseObj,
ObjSectionToIDMap &ObjSectionToID) {
const MachOObjectFile &Obj =
- static_cast<const MachOObjectFile&>(BaseObjT);
+ static_cast<const MachOObjectFile&>(BaseObj);
MachO::any_relocation_info RE =
Obj.getRelocation(RelI->getRawDataRefImpl());
@@ -156,23 +168,60 @@ private:
uint64_t Offset = RelI->getOffset();
uint8_t *LocalAddress = Sections[SectionID].getAddressWithOffset(Offset);
unsigned NumBytes = 1 << Size;
-
- ErrorOr<StringRef> SubtrahendNameOrErr = RelI->getSymbol()->getName();
- if (auto EC = SubtrahendNameOrErr.getError())
- report_fatal_error(EC.message());
- auto SubtrahendI = GlobalSymbolTable.find(*SubtrahendNameOrErr);
- unsigned SectionBID = SubtrahendI->second.getSectionID();
- uint64_t SectionBOffset = SubtrahendI->second.getOffset();
int64_t Addend =
SignExtend64(readBytesUnaligned(LocalAddress, NumBytes), NumBytes * 8);
+ unsigned SectionBID = ~0U;
+ uint64_t SectionBOffset = 0;
+
+ MachO::any_relocation_info RelInfo =
+ Obj.getRelocation(RelI->getRawDataRefImpl());
+
+ bool AIsExternal = BaseObj.getPlainRelocationExternal(RelInfo);
+
+ if (AIsExternal) {
+ Expected<StringRef> SubtrahendNameOrErr = RelI->getSymbol()->getName();
+ if (!SubtrahendNameOrErr)
+ return SubtrahendNameOrErr.takeError();
+ auto SubtrahendI = GlobalSymbolTable.find(*SubtrahendNameOrErr);
+ SectionBID = SubtrahendI->second.getSectionID();
+ SectionBOffset = SubtrahendI->second.getOffset();
+ } else {
+ SectionRef SecB = Obj.getAnyRelocationSection(RelInfo);
+ bool IsCode = SecB.isText();
+ Expected<unsigned> SectionBIDOrErr =
+ findOrEmitSection(Obj, SecB, IsCode, ObjSectionToID);
+ if (!SectionBIDOrErr)
+ return SectionBIDOrErr.takeError();
+ SectionBID = *SectionBIDOrErr;
+ Addend += SecB.getAddress();
+ }
+
++RelI;
- ErrorOr<StringRef> MinuendNameOrErr = RelI->getSymbol()->getName();
- if (auto EC = MinuendNameOrErr.getError())
- report_fatal_error(EC.message());
- auto MinuendI = GlobalSymbolTable.find(*MinuendNameOrErr);
- unsigned SectionAID = MinuendI->second.getSectionID();
- uint64_t SectionAOffset = MinuendI->second.getOffset();
+
+ unsigned SectionAID = ~0U;
+ uint64_t SectionAOffset = 0;
+
+ RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl());
+
+ bool BIsExternal = BaseObj.getPlainRelocationExternal(RelInfo);
+ if (BIsExternal) {
+ Expected<StringRef> MinuendNameOrErr = RelI->getSymbol()->getName();
+ if (!MinuendNameOrErr)
+ return MinuendNameOrErr.takeError();
+ auto MinuendI = GlobalSymbolTable.find(*MinuendNameOrErr);
+ SectionAID = MinuendI->second.getSectionID();
+ SectionAOffset = MinuendI->second.getOffset();
+ } else {
+ SectionRef SecA = Obj.getAnyRelocationSection(RelInfo);
+ bool IsCode = SecA.isText();
+ Expected<unsigned> SectionAIDOrErr =
+ findOrEmitSection(Obj, SecA, IsCode, ObjSectionToID);
+ if (!SectionAIDOrErr)
+ return SectionAIDOrErr.takeError();
+ SectionAID = *SectionAIDOrErr;
+ Addend -= SecA.getAddress();
+ }
RelocationEntry R(SectionID, Offset, MachO::X86_64_RELOC_SUBTRACTOR, (uint64_t)Addend,
SectionAID, SectionAOffset, SectionBID, SectionBOffset,
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 57f6e0804143..b45f0c89de8b 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -14,11 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt
index d4d85041d218..16a769fafb9b 100644
--- a/lib/Fuzzer/CMakeLists.txt
+++ b/lib/Fuzzer/CMakeLists.txt
@@ -1,32 +1,35 @@
-set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS_RELEASE}")
+set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS}")
# Disable the coverage and sanitizer instrumentation for the fuzzer itself.
-set(CMAKE_CXX_FLAGS_RELEASE "${LIBFUZZER_FLAGS_BASE} -O2 -fno-sanitize=all")
+set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fno-sanitize=all -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters -Werror")
if( LLVM_USE_SANITIZE_COVERAGE )
+ if(NOT "${LLVM_USE_SANITIZER}" STREQUAL "Address")
+ message(FATAL_ERROR
+ "LibFuzzer and its tests require LLVM_USE_SANITIZER=Address and "
+ "LLVM_USE_SANITIZE_COVERAGE=YES to be set."
+ )
+ endif()
add_library(LLVMFuzzerNoMainObjects OBJECT
FuzzerCrossOver.cpp
- FuzzerInterface.cpp
FuzzerTraceState.cpp
FuzzerDriver.cpp
+ FuzzerExtFunctionsDlsym.cpp
+ FuzzerExtFunctionsWeak.cpp
FuzzerIO.cpp
FuzzerLoop.cpp
FuzzerMutate.cpp
- FuzzerSanitizerOptions.cpp
FuzzerSHA1.cpp
+ FuzzerTracePC.cpp
FuzzerUtil.cpp
)
add_library(LLVMFuzzerNoMain STATIC
$<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
)
- if( HAVE_LIBPTHREAD )
- target_link_libraries(LLVMFuzzerNoMain pthread)
- endif()
+ target_link_libraries(LLVMFuzzerNoMain ${PTHREAD_LIB})
add_library(LLVMFuzzer STATIC
FuzzerMain.cpp
$<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
)
- if( HAVE_LIBPTHREAD )
- target_link_libraries(LLVMFuzzer pthread)
- endif()
+ target_link_libraries(LLVMFuzzer ${PTHREAD_LIB})
if( LLVM_INCLUDE_TESTS )
add_subdirectory(test)
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index 66e46dbf3aad..f520a5cfdc40 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -12,16 +12,18 @@
#include "FuzzerInterface.h"
#include "FuzzerInternal.h"
-#include <cstring>
-#include <chrono>
-#include <unistd.h>
-#include <thread>
+#include <algorithm>
#include <atomic>
+#include <chrono>
+#include <cstring>
#include <mutex>
#include <string>
-#include <sstream>
-#include <algorithm>
-#include <iterator>
+#include <thread>
+#include <unistd.h>
+
+// This function should be present in libFuzzer so that the client
+// binary can test for its existence.
+extern "C" __attribute__((used)) void __libfuzzer_is_present() {}
namespace fuzzer {
@@ -36,16 +38,20 @@ struct FlagDescription {
};
struct {
+#define FUZZER_DEPRECATED_FLAG(Name)
#define FUZZER_FLAG_INT(Name, Default, Description) int Name;
#define FUZZER_FLAG_UNSIGNED(Name, Default, Description) unsigned int Name;
#define FUZZER_FLAG_STRING(Name, Description) const char *Name;
#include "FuzzerFlags.def"
+#undef FUZZER_DEPRECATED_FLAG
#undef FUZZER_FLAG_INT
#undef FUZZER_FLAG_UNSIGNED
#undef FUZZER_FLAG_STRING
} Flags;
static const FlagDescription FlagDescriptions [] {
+#define FUZZER_DEPRECATED_FLAG(Name) \
+ {#Name, "Deprecated; don't use", 0, nullptr, nullptr, nullptr},
#define FUZZER_FLAG_INT(Name, Default, Description) \
{#Name, Description, Default, &Flags.Name, nullptr, nullptr},
#define FUZZER_FLAG_UNSIGNED(Name, Default, Description) \
@@ -54,6 +60,7 @@ static const FlagDescription FlagDescriptions [] {
#define FUZZER_FLAG_STRING(Name, Description) \
{#Name, Description, 0, nullptr, &Flags.Name, nullptr},
#include "FuzzerFlags.def"
+#undef FUZZER_DEPRECATED_FLAG
#undef FUZZER_FLAG_INT
#undef FUZZER_FLAG_UNSIGNED
#undef FUZZER_FLAG_STRING
@@ -66,8 +73,14 @@ static std::vector<std::string> *Inputs;
static std::string *ProgName;
static void PrintHelp() {
- Printf("Usage: %s [-flag1=val1 [-flag2=val2 ...] ] [dir1 [dir2 ...] ]\n",
- ProgName->c_str());
+ Printf("Usage:\n");
+ auto Prog = ProgName->c_str();
+ Printf("\nTo run fuzzing pass 0 or more directories.\n");
+ Printf("%s [-flag1=val1 [-flag2=val2 ...] ] [dir1 [dir2 ...] ]\n", Prog);
+
+ Printf("\nTo run individual tests without fuzzing pass 1 or more files:\n");
+ Printf("%s [-flag1=val1 [-flag2=val2 ...] ] file1 [file2 ...]\n", Prog);
+
Printf("\nFlags: (strictly in form -flag=value)\n");
size_t MaxFlagLen = 0;
for (size_t F = 0; F < kNumFlags; F++)
@@ -93,14 +106,34 @@ static const char *FlagValue(const char *Param, const char *Name) {
return nullptr;
}
+// Avoid calling stol as it triggers a bug in clang/glibc build.
+static long MyStol(const char *Str) {
+ long Res = 0;
+ long Sign = 1;
+ if (*Str == '-') {
+ Str++;
+ Sign = -1;
+ }
+ for (size_t i = 0; Str[i]; i++) {
+ char Ch = Str[i];
+ if (Ch < '0' || Ch > '9')
+ return Res;
+ Res = Res * 10 + (Ch - '0');
+ }
+ return Res * Sign;
+}
+
static bool ParseOneFlag(const char *Param) {
if (Param[0] != '-') return false;
if (Param[1] == '-') {
static bool PrintedWarning = false;
if (!PrintedWarning) {
PrintedWarning = true;
- Printf("WARNING: libFuzzer ignores flags that start with '--'\n");
+ Printf("INFO: libFuzzer ignores flags that start with '--'\n");
}
+ for (size_t F = 0; F < kNumFlags; F++)
+ if (FlagValue(Param + 1, FlagDescriptions[F].Name))
+ Printf("WARNING: did you mean '%s' (single dash)?\n", Param + 1);
return true;
}
for (size_t F = 0; F < kNumFlags; F++) {
@@ -108,7 +141,7 @@ static bool ParseOneFlag(const char *Param) {
const char *Str = FlagValue(Param, Name);
if (Str) {
if (FlagDescriptions[F].IntFlag) {
- int Val = std::stol(Str);
+ int Val = MyStol(Str);
*FlagDescriptions[F].IntFlag = Val;
if (Flags.verbosity >= 2)
Printf("Flag: %s %d\n", Name, Val);;
@@ -124,11 +157,15 @@ static bool ParseOneFlag(const char *Param) {
if (Flags.verbosity >= 2)
Printf("Flag: %s %s\n", Name, Str);
return true;
+ } else { // Deprecated flag.
+ Printf("Flag: %s: deprecated, don't use\n", Name);
+ return true;
}
}
}
- PrintHelp();
- exit(1);
+ Printf("\n\nWARNING: unrecognized flag '%s'; "
+ "use -help=1 to list all flags\n\n", Param);
+ return true;
}
// We don't use any library to minimize dependencies.
@@ -153,7 +190,7 @@ static std::mutex Mu;
static void PulseThread() {
while (true) {
- std::this_thread::sleep_for(std::chrono::seconds(600));
+ SleepSeconds(600);
std::lock_guard<std::mutex> Lock(Mu);
Printf("pulse...\n");
}
@@ -168,7 +205,7 @@ static void WorkerThread(const std::string &Cmd, std::atomic<int> *Counter,
std::string ToRun = Cmd + " > " + Log + " 2>&1\n";
if (Flags.verbosity)
Printf("%s", ToRun.c_str());
- int ExitCode = ExecuteCommand(ToRun.c_str());
+ int ExitCode = ExecuteCommand(ToRun);
if (ExitCode != 0)
*HasErrors = true;
std::lock_guard<std::mutex> Lock(Mu);
@@ -198,34 +235,44 @@ static int RunInMultipleProcesses(const std::vector<std::string> &Args,
return HasErrors ? 1 : 0;
}
+static void RssThread(Fuzzer *F, size_t RssLimitMb) {
+ while (true) {
+ SleepSeconds(1);
+ size_t Peak = GetPeakRSSMb();
+ if (Peak > RssLimitMb)
+ F->RssLimitCallback();
+ }
+}
+
+static void StartRssThread(Fuzzer *F, size_t RssLimitMb) {
+ if (!RssLimitMb) return;
+ std::thread T(RssThread, F, RssLimitMb);
+ T.detach();
+}
+
int RunOneTest(Fuzzer *F, const char *InputFilePath) {
Unit U = FileToVector(InputFilePath);
Unit PreciseSizedU(U);
assert(PreciseSizedU.size() == PreciseSizedU.capacity());
- F->ExecuteCallback(PreciseSizedU);
+ F->RunOne(PreciseSizedU.data(), PreciseSizedU.size());
return 0;
}
-int FuzzerDriver(int argc, char **argv, UserCallback Callback) {
- FuzzerRandomLibc Rand(0);
- SimpleUserSuppliedFuzzer SUSF(&Rand, Callback);
- return FuzzerDriver(argc, argv, SUSF);
-}
-
-int FuzzerDriver(int argc, char **argv, UserSuppliedFuzzer &USF) {
- std::vector<std::string> Args(argv, argv + argc);
- return FuzzerDriver(Args, USF);
+static bool AllInputsAreFiles() {
+ if (Inputs->empty()) return false;
+ for (auto &Path : *Inputs)
+ if (!IsFile(Path))
+ return false;
+ return true;
}
-int FuzzerDriver(const std::vector<std::string> &Args, UserCallback Callback) {
- FuzzerRandomLibc Rand(0);
- SimpleUserSuppliedFuzzer SUSF(&Rand, Callback);
- return FuzzerDriver(Args, SUSF);
-}
-
-int FuzzerDriver(const std::vector<std::string> &Args,
- UserSuppliedFuzzer &USF) {
+int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
using namespace fuzzer;
+ assert(argc && argv && "Argument pointers cannot be nullptr");
+ EF = new ExternalFunctions();
+ if (EF->LLVMFuzzerInitialize)
+ EF->LLVMFuzzerInitialize(argc, argv);
+ const std::vector<std::string> Args(*argv, *argv + *argc);
assert(!Args.empty());
ProgName = new std::string(Args[0]);
ParseFlags(Args);
@@ -234,6 +281,11 @@ int FuzzerDriver(const std::vector<std::string> &Args,
return 0;
}
+ if (Flags.close_fd_mask & 2)
+ DupAndCloseStderr();
+ if (Flags.close_fd_mask & 1)
+ CloseStdout();
+
if (Flags.jobs > 0 && Flags.workers == 0) {
Flags.workers = std::min(NumberOfCpuCores() / 2, Flags.jobs);
if (Flags.workers > 1)
@@ -243,30 +295,32 @@ int FuzzerDriver(const std::vector<std::string> &Args,
if (Flags.workers > 0 && Flags.jobs > 0)
return RunInMultipleProcesses(Args, Flags.workers, Flags.jobs);
- Fuzzer::FuzzingOptions Options;
+ const size_t kMaxSaneLen = 1 << 20;
+ const size_t kMinDefaultLen = 64;
+ FuzzingOptions Options;
Options.Verbosity = Flags.verbosity;
Options.MaxLen = Flags.max_len;
Options.UnitTimeoutSec = Flags.timeout;
+ Options.TimeoutExitCode = Flags.timeout_exitcode;
Options.MaxTotalTimeSec = Flags.max_total_time;
Options.DoCrossOver = Flags.cross_over;
Options.MutateDepth = Flags.mutate_depth;
- Options.ExitOnFirst = Flags.exit_on_first;
Options.UseCounters = Flags.use_counters;
Options.UseIndirCalls = Flags.use_indir_calls;
Options.UseTraces = Flags.use_traces;
+ Options.UseMemcmp = Flags.use_memcmp;
+ Options.UseMemmem = Flags.use_memmem;
Options.ShuffleAtStartUp = Flags.shuffle;
- Options.PreferSmallDuringInitialShuffle =
- Flags.prefer_small_during_initial_shuffle;
+ Options.PreferSmall = Flags.prefer_small;
Options.Reload = Flags.reload;
Options.OnlyASCII = Flags.only_ascii;
Options.OutputCSV = Flags.output_csv;
+ Options.DetectLeaks = Flags.detect_leaks;
+ Options.RssLimitMb = Flags.rss_limit_mb;
if (Flags.runs >= 0)
Options.MaxNumberOfRuns = Flags.runs;
if (!Inputs->empty())
Options.OutputCorpus = (*Inputs)[0];
- if (Flags.sync_command)
- Options.SyncCommand = Flags.sync_command;
- Options.SyncTimeout = Flags.sync_timeout;
Options.ReportSlowUnits = Flags.report_slow_units;
if (Flags.artifact_prefix)
Options.ArtifactPrefix = Flags.artifact_prefix;
@@ -278,48 +332,84 @@ int FuzzerDriver(const std::vector<std::string> &Args,
return 1;
if (Flags.verbosity > 0 && !Dictionary.empty())
Printf("Dictionary: %zd entries\n", Dictionary.size());
- Options.SaveArtifacts = !Flags.test_single_input;
+ bool DoPlainRun = AllInputsAreFiles();
+ Options.SaveArtifacts = !DoPlainRun;
Options.PrintNewCovPcs = Flags.print_new_cov_pcs;
+ Options.PrintFinalStats = Flags.print_final_stats;
+ Options.TruncateUnits = Flags.truncate_units;
+ Options.PruneCorpus = Flags.prune_corpus;
+
+ unsigned Seed = Flags.seed;
+ // Initialize Seed.
+ if (Seed == 0)
+ Seed = (std::chrono::system_clock::now().time_since_epoch().count() << 10) +
+ getpid();
+ if (Flags.verbosity)
+ Printf("INFO: Seed: %u\n", Seed);
- Fuzzer F(USF, Options);
+ Random Rand(Seed);
+ MutationDispatcher MD(Rand, Options);
+ Fuzzer F(Callback, MD, Options);
for (auto &U: Dictionary)
- USF.GetMD().AddWordToManualDictionary(U);
+ if (U.size() <= Word::GetMaxSize())
+ MD.AddWordToManualDictionary(Word(U.data(), U.size()));
+
+ StartRssThread(&F, Flags.rss_limit_mb);
// Timer
if (Flags.timeout > 0)
SetTimer(Flags.timeout / 2 + 1);
-
- if (Flags.test_single_input) {
- RunOneTest(&F, Flags.test_single_input);
+ if (Flags.handle_segv) SetSigSegvHandler();
+ if (Flags.handle_bus) SetSigBusHandler();
+ if (Flags.handle_abrt) SetSigAbrtHandler();
+ if (Flags.handle_ill) SetSigIllHandler();
+ if (Flags.handle_fpe) SetSigFpeHandler();
+ if (Flags.handle_int) SetSigIntHandler();
+ if (Flags.handle_term) SetSigTermHandler();
+
+ if (DoPlainRun) {
+ Options.SaveArtifacts = false;
+ int Runs = std::max(1, Flags.runs);
+ Printf("%s: Running %zd inputs %d time(s) each.\n", ProgName->c_str(),
+ Inputs->size(), Runs);
+ for (auto &Path : *Inputs) {
+ auto StartTime = system_clock::now();
+ Printf("Running: %s\n", Path.c_str());
+ for (int Iter = 0; Iter < Runs; Iter++)
+ RunOneTest(&F, Path.c_str());
+ auto StopTime = system_clock::now();
+ auto MS = duration_cast<milliseconds>(StopTime - StartTime).count();
+ Printf("Executed %s in %zd ms\n", Path.c_str(), (long)MS);
+ }
+ F.PrintFinalStats();
exit(0);
}
- if (Flags.save_minimized_corpus) {
- Printf("The flag -save_minimized_corpus is deprecated; use -merge=1\n");
- exit(1);
- }
if (Flags.merge) {
+ if (Options.MaxLen == 0)
+ F.SetMaxLen(kMaxSaneLen);
F.Merge(*Inputs);
exit(0);
}
- unsigned Seed = Flags.seed;
- // Initialize Seed.
- if (Seed == 0)
- Seed = time(0) * 10000 + getpid();
- if (Flags.verbosity)
- Printf("Seed: %u\n", Seed);
- USF.GetRand().ResetSeed(Seed);
+ size_t TemporaryMaxLen = Options.MaxLen ? Options.MaxLen : kMaxSaneLen;
- F.RereadOutputCorpus();
+ F.RereadOutputCorpus(TemporaryMaxLen);
for (auto &inp : *Inputs)
if (inp != Options.OutputCorpus)
- F.ReadDir(inp, nullptr);
+ F.ReadDir(inp, nullptr, TemporaryMaxLen);
+
+ if (Options.MaxLen == 0)
+ F.SetMaxLen(
+ std::min(std::max(kMinDefaultLen, F.MaxUnitSizeInCorpus()), kMaxSaneLen));
- if (F.CorpusSize() == 0)
+ if (F.CorpusSize() == 0) {
F.AddToCorpus(Unit()); // Can't fuzz empty corpus, so add an empty input.
+ if (Options.Verbosity)
+ Printf("INFO: A corpus is not provided, starting from an empty corpus\n");
+ }
F.ShuffleAndMinimize();
if (Flags.drill)
F.Drill();
@@ -329,8 +419,12 @@ int FuzzerDriver(const std::vector<std::string> &Args,
if (Flags.verbosity)
Printf("Done %d runs in %zd second(s)\n", F.getTotalNumberOfRuns(),
F.secondsSinceProcessStartUp());
+ F.PrintFinalStats();
exit(0); // Don't let F destroy itself.
}
+// Storage for global ExternalFunctions object.
+ExternalFunctions *EF = nullptr;
+
} // namespace fuzzer
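
Note: the flag handling above relies on the X-macro pattern: FuzzerFlags.def is included several times with different definitions of the FUZZER_FLAG_* macros to generate both the Flags struct and the FlagDescriptions table. A self-contained sketch of the same technique, with a hypothetical flag list inlined instead of a .def file:

    // Hypothetical flag list; FuzzerFlags.def plays this role in the patch.
    #define MY_FLAGS                                                         \
      MY_FLAG_INT(verbosity, 1, "Verbosity level.")                          \
      MY_FLAG_STRING(artifact_prefix, "Prefix for crash artifacts.")

    // First expansion: struct fields with defaults.
    struct MyFlags {
    #define MY_FLAG_INT(Name, Default, Description) int Name = Default;
    #define MY_FLAG_STRING(Name, Description) const char *Name = nullptr;
      MY_FLAGS
    #undef MY_FLAG_INT
    #undef MY_FLAG_STRING
    };

    // Second expansion: a name/description table for -help output.
    struct MyFlagDescription { const char *Name; const char *Description; };
    static const MyFlagDescription MyFlagDescriptions[] = {
    #define MY_FLAG_INT(Name, Default, Description) {#Name, Description},
    #define MY_FLAG_STRING(Name, Description) {#Name, Description},
      MY_FLAGS
    #undef MY_FLAG_INT
    #undef MY_FLAG_STRING
    };
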
diff --git a/lib/Fuzzer/FuzzerExtFunctions.def b/lib/Fuzzer/FuzzerExtFunctions.def
new file mode 100644
index 000000000000..f7dcf9b54fbc
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctions.def
@@ -0,0 +1,46 @@
+//===- FuzzerExtFunctions.def - External functions --------------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This defines the external function pointers that
+// ``fuzzer::ExternalFunctions`` should contain and try to initialize. The
+// EXT_FUNC macro must be defined at the point of inclusion. The signature of
+// the macro is:
+//
+// EXT_FUNC(<name>, <return_type>, <function_signature>, <warn_if_missing>)
+//===----------------------------------------------------------------------===//
+
+// Optional user functions
+EXT_FUNC(LLVMFuzzerInitialize, int, (int *argc, char ***argv), false);
+EXT_FUNC(LLVMFuzzerCustomMutator, size_t,
+ (uint8_t * Data, size_t Size, size_t MaxSize, unsigned int Seed),
+ false);
+EXT_FUNC(LLVMFuzzerCustomCrossOver, size_t,
+ (const uint8_t * Data1, size_t Size1,
+ const uint8_t * Data2, size_t Size2,
+ uint8_t * Out, size_t MaxOutSize, unsigned int Seed),
+ false);
+
+// Sanitizer functions
+EXT_FUNC(__lsan_enable, void, (), false);
+EXT_FUNC(__lsan_disable, void, (), false);
+EXT_FUNC(__lsan_do_recoverable_leak_check, int, (), false);
+EXT_FUNC(__sanitizer_get_coverage_pc_buffer, uintptr_t, (uintptr_t**), true);
+EXT_FUNC(__sanitizer_get_number_of_counters, size_t, (), false);
+EXT_FUNC(__sanitizer_install_malloc_and_free_hooks, int,
+ (void (*malloc_hook)(const volatile void *, size_t),
+ void (*free_hook)(const volatile void *)),
+ false);
+EXT_FUNC(__sanitizer_get_total_unique_caller_callee_pairs, size_t, (), false);
+EXT_FUNC(__sanitizer_get_total_unique_coverage, size_t, (), true);
+EXT_FUNC(__sanitizer_print_memory_profile, int, (size_t), false);
+EXT_FUNC(__sanitizer_print_stack_trace, void, (), true);
+EXT_FUNC(__sanitizer_reset_coverage, void, (), true);
+EXT_FUNC(__sanitizer_set_death_callback, void, (void (*)(void)), true);
+EXT_FUNC(__sanitizer_set_report_fd, void, (void*), false);
+EXT_FUNC(__sanitizer_update_counter_bitset_and_clear_counters, uintptr_t,
+ (uint8_t*), false);
diff --git a/lib/Fuzzer/FuzzerExtFunctions.h b/lib/Fuzzer/FuzzerExtFunctions.h
new file mode 100644
index 000000000000..2ec86cb92316
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctions.h
@@ -0,0 +1,33 @@
+//===- FuzzerExtFunctions.h - Interface to external functions ---*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Defines an interface to (possibly optional) functions.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FUZZER_EXT_FUNCTIONS_H
+#define LLVM_FUZZER_EXT_FUNCTIONS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace fuzzer {
+
+struct ExternalFunctions {
+ // Initialize function pointers. Functions that are not available will be set
+ // to nullptr. Do not call this constructor before ``main()`` has been
+ // entered.
+ ExternalFunctions();
+
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ RETURN_TYPE(*NAME) FUNC_SIG = nullptr
+
+#include "FuzzerExtFunctions.def"
+
+#undef EXT_FUNC
+};
+} // namespace fuzzer
+#endif
diff --git a/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp b/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp
new file mode 100644
index 000000000000..7b9681a61932
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp
@@ -0,0 +1,49 @@
+//===- FuzzerExtFunctionsDlsym.cpp - Interface to external functions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Implementation for operating systems that support dlsym(). We only use it on
+// Apple platforms for now. We don't use this approach on Linux because it
+// requires that clients of LibFuzzer pass ``--export-dynamic`` to the linker.
+// That is a complication we don't wish to expose to clients right now.
+//===----------------------------------------------------------------------===//
+#include "FuzzerInternal.h"
+#if LIBFUZZER_APPLE
+
+#include "FuzzerExtFunctions.h"
+#include <dlfcn.h>
+
+using namespace fuzzer;
+
+template <typename T>
+static T GetFnPtr(const char *FnName, bool WarnIfMissing) {
+ dlerror(); // Clear any previous errors.
+ void *Fn = dlsym(RTLD_DEFAULT, FnName);
+ if (Fn == nullptr) {
+ if (WarnIfMissing) {
+ const char *ErrorMsg = dlerror();
+ Printf("WARNING: Failed to find function \"%s\".", FnName);
+ if (ErrorMsg)
+ Printf(" Reason %s.", ErrorMsg);
+ Printf("\n");
+ }
+ }
+ return reinterpret_cast<T>(Fn);
+}
+
+namespace fuzzer {
+
+ExternalFunctions::ExternalFunctions() {
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ this->NAME = GetFnPtr<decltype(ExternalFunctions::NAME)>(#NAME, WARN)
+
+#include "FuzzerExtFunctions.def"
+
+#undef EXT_FUNC
+}
+} // namespace fuzzer
+#endif // LIBFUZZER_APPLE
diff --git a/lib/Fuzzer/FuzzerExtFunctionsWeak.cpp b/lib/Fuzzer/FuzzerExtFunctionsWeak.cpp
new file mode 100644
index 000000000000..75c0ed9f830f
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctionsWeak.cpp
@@ -0,0 +1,50 @@
+//===- FuzzerExtFunctionsWeak.cpp - Interface to external functions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Implementation for Linux. This relies on the linker's support for weak
+// symbols. We don't use this approach on Apple platforms because it requires
+// clients of LibFuzzer to pass ``-U _<symbol_name>`` to the linker to allow
+// weak symbols to be undefined. That is a complication we don't want to expose
+// to clients right now.
+//===----------------------------------------------------------------------===//
+#include "FuzzerInternal.h"
+#if LIBFUZZER_LINUX
+
+#include "FuzzerExtFunctions.h"
+
+extern "C" {
+// Declare these symbols as weak to allow them to be optionally defined.
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ __attribute__((weak)) RETURN_TYPE NAME FUNC_SIG
+
+#include "FuzzerExtFunctions.def"
+
+#undef EXT_FUNC
+}
+
+using namespace fuzzer;
+
+static void CheckFnPtr(void *FnPtr, const char *FnName, bool WarnIfMissing) {
+ if (FnPtr == nullptr && WarnIfMissing) {
+ Printf("WARNING: Failed to find function \"%s\".\n", FnName);
+ }
+}
+
+namespace fuzzer {
+
+ExternalFunctions::ExternalFunctions() {
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+ this->NAME = ::NAME; \
+ CheckFnPtr((void *)::NAME, #NAME, WARN);
+
+#include "FuzzerExtFunctions.def"
+
+#undef EXT_FUNC
+}
+} // namespace fuzzer
+#endif // LIBFUZZER_LINUX
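
Both lookup strategies above (dlsym on Apple, weak symbols on Linux) leave an unavailable function as a null pointer, so callers must guard every use. Below is a minimal sketch of that guarded-call pattern, assuming only the ExternalFunctions header and the global EF defined earlier in this patch; MaybePrintStackTrace is a hypothetical helper, and the real call sites appear later in FuzzerLoop.cpp.

#include "FuzzerExtFunctions.h"

namespace fuzzer {
extern ExternalFunctions *EF;  // Defined in FuzzerDriver.cpp (see above).

// Print a stack trace only if the sanitizer runtime provides the hook;
// otherwise the pointer is nullptr and we silently skip it.
static void MaybePrintStackTrace() {
  if (EF->__sanitizer_print_stack_trace)
    EF->__sanitizer_print_stack_trace();
}
} // namespace fuzzer
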
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 977efb76922b..599ac3686344 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -14,30 +14,36 @@ FUZZER_FLAG_INT(verbosity, 1, "Verbosity level.")
FUZZER_FLAG_UNSIGNED(seed, 0, "Random seed. If 0, seed is generated.")
FUZZER_FLAG_INT(runs, -1,
"Number of individual test runs (-1 for infinite runs).")
-FUZZER_FLAG_INT(max_len, 64, "Maximum length of the test input.")
+FUZZER_FLAG_INT(max_len, 0, "Maximum length of the test input. "
+ "If 0, libFuzzer tries to guess a good value based on the corpus "
+ "and reports it. ")
FUZZER_FLAG_INT(cross_over, 1, "If 1, cross over inputs.")
FUZZER_FLAG_INT(mutate_depth, 5,
"Apply this number of consecutive mutations to each input.")
FUZZER_FLAG_INT(shuffle, 1, "Shuffle inputs at startup")
-FUZZER_FLAG_INT(
- prefer_small_during_initial_shuffle, -1,
- "If 1, always prefer smaller inputs during the initial corpus shuffle."
- " If 0, never do that. If -1, do it sometimes.")
-FUZZER_FLAG_INT(exit_on_first, 0,
- "If 1, exit after the first new interesting input is found.")
+FUZZER_FLAG_INT(prefer_small, 1,
+ "If 1, always prefer smaller inputs during the corpus shuffle.")
FUZZER_FLAG_INT(
timeout, 1200,
"Timeout in seconds (if positive). "
"If one unit runs more than this number of seconds the process will abort.")
+FUZZER_FLAG_INT(timeout_exitcode, 77,
+ "Unless abort_on_timeout is set, use this exitcode on timeout.")
+FUZZER_FLAG_INT(error_exit_code, 77, "When libFuzzer's signal handlers are in "
+ "use exit with this exitcode after catching a deadly signal.")
FUZZER_FLAG_INT(max_total_time, 0, "If positive, indicates the maximal total "
"time in seconds to run the fuzzer.")
FUZZER_FLAG_INT(help, 0, "Print help.")
-FUZZER_FLAG_INT(save_minimized_corpus, 0, "Deprecated. Use -merge=1")
FUZZER_FLAG_INT(merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
- "merged into the 1-st corpus. Only interesting units will be taken.")
+ "merged into the 1-st corpus. Only interesting units will be taken. "
+ "This flag can be used to minimize a corpus.")
FUZZER_FLAG_INT(use_counters, 1, "Use coverage counters")
FUZZER_FLAG_INT(use_indir_calls, 1, "Use indirect caller-callee counters")
FUZZER_FLAG_INT(use_traces, 0, "Experimental: use instruction traces")
+FUZZER_FLAG_INT(use_memcmp, 1,
+ "Use hints from intercepting memcmp, strcmp, etc")
+FUZZER_FLAG_INT(use_memmem, 1,
+ "Use hints from intercepting memmem, strstr, etc")
FUZZER_FLAG_INT(jobs, 0, "Number of jobs to run. If jobs >= 1 we spawn"
" this number of jobs in separate worker processes"
" with stdout/stderr redirected to fuzz-JOB.log.")
@@ -47,16 +53,11 @@ FUZZER_FLAG_INT(workers, 0,
FUZZER_FLAG_INT(reload, 1,
"Reload the main corpus periodically to get new units"
" discovered by other processes.")
-FUZZER_FLAG_STRING(sync_command, "Execute an external command "
- "\"<sync_command> <test_corpus>\" "
- "to synchronize the test corpus.")
-FUZZER_FLAG_INT(sync_timeout, 600, "Minimum timeout between syncs.")
FUZZER_FLAG_INT(report_slow_units, 10,
"Report slowest units if they run for more than this number of seconds.")
FUZZER_FLAG_INT(only_ascii, 0,
"If 1, generate only ASCII (isprint+isspace) inputs.")
FUZZER_FLAG_STRING(dict, "Experimental. Use the dictionary file.")
-FUZZER_FLAG_STRING(test_single_input, "Use specified file as test input.")
FUZZER_FLAG_STRING(artifact_prefix, "Write fuzzing artifacts (crash, "
"timeout, or slow inputs) as "
"$(artifact_prefix)file")
@@ -69,4 +70,28 @@ FUZZER_FLAG_INT(drill, 0, "Experimental: fuzz using a single unit as the seed "
"corpus, then merge with the initial corpus")
FUZZER_FLAG_INT(output_csv, 0, "Enable pulse output in CSV format.")
FUZZER_FLAG_INT(print_new_cov_pcs, 0, "If 1, print out new covered pcs.")
+FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.")
+
+FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
+FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
+FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
+FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.")
+FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.")
+FUZZER_FLAG_INT(handle_int, 1, "If 1, try to intercept SIGINT.")
+FUZZER_FLAG_INT(handle_term, 1, "If 1, try to intercept SIGTERM.")
+FUZZER_FLAG_INT(close_fd_mask, 0, "If 1, close stdout at startup; "
+ "if 2, close stderr; if 3, close both. "
+ "Be careful, this will also close e.g. asan's stderr/stdout.")
+FUZZER_FLAG_INT(detect_leaks, 1, "If 1, and if LeakSanitizer is enabled "
+ "try to detect memory leaks during fuzzing (i.e. not only at shut down).")
+FUZZER_FLAG_INT(rss_limit_mb, 2048, "If non-zero, the fuzzer will exit upon "
+ "reaching this limit of RSS memory usage.")
+FUZZER_FLAG_INT(truncate_units, 0, "Try truncated units when loading corpus.")
+FUZZER_FLAG_INT(prune_corpus, 1, "Prune corpus items without new coverage when "
+ "loading corpus.")
+FUZZER_DEPRECATED_FLAG(exit_on_first)
+FUZZER_DEPRECATED_FLAG(save_minimized_corpus)
+FUZZER_DEPRECATED_FLAG(sync_command)
+FUZZER_DEPRECATED_FLAG(sync_timeout)
+FUZZER_DEPRECATED_FLAG(test_single_input)
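
FuzzerFlags.def is an X-macro file: a consumer defines the FUZZER_FLAG_* macros before including it. The driver code that does this is not part of this hunk, so the struct below is only a hedged sketch of the usual consumption pattern; the name FlagValues is illustrative, not the driver's actual type.

// Hypothetical consumer of FuzzerFlags.def; the field names come from the .def.
struct FlagValues {
#define FUZZER_FLAG_INT(Name, Default, Description) int Name = Default;
#define FUZZER_FLAG_UNSIGNED(Name, Default, Description) unsigned int Name = Default;
#define FUZZER_FLAG_STRING(Name, Description) const char *Name = nullptr;
#define FUZZER_DEPRECATED_FLAG(Name) /* recognized but generates nothing */
#include "FuzzerFlags.def"
#undef FUZZER_FLAG_INT
#undef FUZZER_FLAG_UNSIGNED
#undef FUZZER_FLAG_STRING
#undef FUZZER_DEPRECATED_FLAG
};
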
diff --git a/lib/Fuzzer/FuzzerFnAdapter.h b/lib/Fuzzer/FuzzerFnAdapter.h
new file mode 100644
index 000000000000..eb2c219b8703
--- /dev/null
+++ b/lib/Fuzzer/FuzzerFnAdapter.h
@@ -0,0 +1,187 @@
+//===- FuzzerFnAdapter.h - Arbitrary function Fuzzer adapter -----*- C++ -*===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// W A R N I N G : E X P E R I M E N T A L.
+//
+// Defines an adapter to fuzz functions with (almost) arbitrary signatures.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_ADAPTER_H
+#define LLVM_FUZZER_ADAPTER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace fuzzer {
+
+/// Unpacks bytes from \p Data according to \p F argument types
+/// and calls the function.
+/// Use to automatically adapt LLVMFuzzerTestOneInput interface to
+/// a specific function.
+/// Supported argument types: primitive types, std::vector<uint8_t>.
+template <typename Fn> bool Adapt(Fn F, const uint8_t *Data, size_t Size);
+
+// The implementation performs several steps:
+// - function argument types are obtained (Args...)
+// - data is unpacked into std::tuple<Args...> one by one
+// - function is called with std::tuple<Args...> containing arguments.
+namespace impl {
+
+// Single argument unpacking.
+
+template <typename T>
+size_t UnpackPrimitive(const uint8_t *Data, size_t Size, T *Value) {
+ if (Size < sizeof(T))
+ return Size;
+ *Value = *reinterpret_cast<const T *>(Data);
+ return Size - sizeof(T);
+}
+
+/// Unpacks into a given Value and returns the Size - num_consumed_bytes.
+/// Return value equal to Size signals inability to unpack the data (typically
+/// because there are not enough bytes).
+template <typename T>
+size_t UnpackSingle(const uint8_t *Data, size_t Size, T *Value);
+
+#define UNPACK_SINGLE_PRIMITIVE(Type) \
+ template <> \
+ size_t UnpackSingle<Type>(const uint8_t *Data, size_t Size, Type *Value) { \
+ return UnpackPrimitive(Data, Size, Value); \
+ }
+
+UNPACK_SINGLE_PRIMITIVE(char)
+UNPACK_SINGLE_PRIMITIVE(signed char)
+UNPACK_SINGLE_PRIMITIVE(unsigned char)
+
+UNPACK_SINGLE_PRIMITIVE(short int)
+UNPACK_SINGLE_PRIMITIVE(unsigned short int)
+
+UNPACK_SINGLE_PRIMITIVE(int)
+UNPACK_SINGLE_PRIMITIVE(unsigned int)
+
+UNPACK_SINGLE_PRIMITIVE(long int)
+UNPACK_SINGLE_PRIMITIVE(unsigned long int)
+
+UNPACK_SINGLE_PRIMITIVE(bool)
+UNPACK_SINGLE_PRIMITIVE(wchar_t)
+
+UNPACK_SINGLE_PRIMITIVE(float)
+UNPACK_SINGLE_PRIMITIVE(double)
+UNPACK_SINGLE_PRIMITIVE(long double)
+
+#undef UNPACK_SINGLE_PRIMITIVE
+
+template <>
+size_t UnpackSingle<std::vector<uint8_t>>(const uint8_t *Data, size_t Size,
+ std::vector<uint8_t> *Value) {
+ if (Size < 1)
+ return Size;
+ size_t Len = std::min(static_cast<size_t>(*Data), Size - 1);
+ std::vector<uint8_t> V(Data + 1, Data + 1 + Len);
+ Value->swap(V);
+ return Size - Len - 1;
+}
+
+template <>
+size_t UnpackSingle<std::string>(const uint8_t *Data, size_t Size,
+ std::string *Value) {
+ if (Size < 1)
+ return Size;
+ size_t Len = std::min(static_cast<size_t>(*Data), Size - 1);
+ std::string S(Data + 1, Data + 1 + Len);
+ Value->swap(S);
+ return Size - Len - 1;
+}
+
+// Unpacking into arbitrary tuple.
+
+// Recursion guard.
+template <int N, typename TupleT>
+typename std::enable_if<N == std::tuple_size<TupleT>::value, bool>::type
+UnpackImpl(const uint8_t *Data, size_t Size, TupleT *Tuple) {
+ return true;
+}
+
+// Unpack tuple elements starting from Nth.
+template <int N, typename TupleT>
+typename std::enable_if<N < std::tuple_size<TupleT>::value, bool>::type
+UnpackImpl(const uint8_t *Data, size_t Size, TupleT *Tuple) {
+ size_t NewSize = UnpackSingle(Data, Size, &std::get<N>(*Tuple));
+ if (NewSize == Size) {
+ return false;
+ }
+
+ return UnpackImpl<N + 1, TupleT>(Data + (Size - NewSize), NewSize, Tuple);
+}
+
+// Unpacks into arbitrary tuple and returns true if successful.
+template <typename... Args>
+bool Unpack(const uint8_t *Data, size_t Size, std::tuple<Args...> *Tuple) {
+ return UnpackImpl<0, std::tuple<Args...>>(Data, Size, Tuple);
+}
+
+// Helper integer sequence templates.
+
+template <int...> struct Seq {};
+
+template <int N, int... S> struct GenSeq : GenSeq<N - 1, N - 1, S...> {};
+
+// GenSeq<N>::type is Seq<0, 1, ..., N-1>
+template <int... S> struct GenSeq<0, S...> { typedef Seq<S...> type; };
+
+// Function signature introspection.
+
+template <typename T> struct FnTraits {};
+
+template <typename ReturnType, typename... Args>
+struct FnTraits<ReturnType (*)(Args...)> {
+ enum { Arity = sizeof...(Args) };
+ typedef std::tuple<Args...> ArgsTupleT;
+};
+
+// Calling a function with arguments in a tuple.
+
+template <typename Fn, int... S>
+void ApplyImpl(Fn F, const typename FnTraits<Fn>::ArgsTupleT &Params,
+ Seq<S...>) {
+ F(std::get<S>(Params)...);
+}
+
+template <typename Fn>
+void Apply(Fn F, const typename FnTraits<Fn>::ArgsTupleT &Params) {
+ // S is Seq<0, ..., Arity-1>
+ auto S = typename GenSeq<FnTraits<Fn>::Arity>::type();
+ ApplyImpl(F, Params, S);
+}
+
+// Unpacking data into arguments tuple of correct type and calling the function.
+template <typename Fn>
+bool UnpackAndApply(Fn F, const uint8_t *Data, size_t Size) {
+ typename FnTraits<Fn>::ArgsTupleT Tuple;
+ if (!Unpack(Data, Size, &Tuple))
+ return false;
+
+ Apply(F, Tuple);
+ return true;
+}
+
+} // namespace impl
+
+template <typename Fn> bool Adapt(Fn F, const uint8_t *Data, size_t Size) {
+ return impl::UnpackAndApply(F, Data, Size);
+}
+
+} // namespace fuzzer
+
+#endif
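
The adapter's entry point is the Adapt template declared at the top of the header: it unpacks primitive values and length-prefixed strings/vectors from the input bytes, then calls the target function, returning false when the data is too short. A minimal sketch of a fuzz target built on it, assuming a hypothetical function Parse(int, std::string) as the code under test:

#include "FuzzerFnAdapter.h"
#include <string>

// Hypothetical code under test.
static void Parse(int Version, std::string Payload) {
  (void)Version;
  (void)Payload;
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
  // Unpack an int and a string from Data, then call Parse(int, std::string).
  fuzzer::Adapt(Parse, Data, Size);
  return 0;
}
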
diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp
index 043fad396d51..0e0c4e989cc4 100644
--- a/lib/Fuzzer/FuzzerIO.cpp
+++ b/lib/Fuzzer/FuzzerIO.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
// IO functions.
//===----------------------------------------------------------------------===//
+#include "FuzzerExtFunctions.h"
#include "FuzzerInternal.h"
#include <iterator>
#include <fstream>
@@ -20,6 +21,15 @@
namespace fuzzer {
+static FILE *OutputFile = stderr;
+
+bool IsFile(const std::string &Path) {
+ struct stat St;
+ if (stat(Path.c_str(), &St))
+ return false;
+ return S_ISREG(St.st_mode);
+}
+
static long GetEpoch(const std::string &Path) {
struct stat St;
if (stat(Path.c_str(), &St))
@@ -27,35 +37,45 @@ static long GetEpoch(const std::string &Path) {
return St.st_mtime;
}
-static std::vector<std::string> ListFilesInDir(const std::string &Dir,
- long *Epoch) {
- std::vector<std::string> V;
- if (Epoch) {
- auto E = GetEpoch(Dir);
- if (*Epoch >= E) return V;
- *Epoch = E;
- }
+static void ListFilesInDirRecursive(const std::string &Dir, long *Epoch,
+ std::vector<std::string> *V, bool TopDir) {
+ auto E = GetEpoch(Dir);
+ if (Epoch)
+ if (E && *Epoch >= E) return;
+
DIR *D = opendir(Dir.c_str());
if (!D) {
Printf("No such directory: %s; exiting\n", Dir.c_str());
exit(1);
}
while (auto E = readdir(D)) {
+ std::string Path = DirPlusFile(Dir, E->d_name);
if (E->d_type == DT_REG || E->d_type == DT_LNK)
- V.push_back(E->d_name);
+ V->push_back(Path);
+ else if (E->d_type == DT_DIR && *E->d_name != '.')
+ ListFilesInDirRecursive(Path, Epoch, V, false);
}
closedir(D);
- return V;
+ if (Epoch && TopDir)
+ *Epoch = E;
}
-Unit FileToVector(const std::string &Path) {
+Unit FileToVector(const std::string &Path, size_t MaxSize) {
std::ifstream T(Path);
if (!T) {
Printf("No such directory: %s; exiting\n", Path.c_str());
exit(1);
}
- return Unit((std::istreambuf_iterator<char>(T)),
- std::istreambuf_iterator<char>());
+
+ T.seekg(0, T.end);
+ size_t FileLen = T.tellg();
+ if (MaxSize)
+ FileLen = std::min(FileLen, MaxSize);
+
+ T.seekg(0, T.beg);
+ Unit Res(FileLen);
+ T.read(reinterpret_cast<char *>(Res.data()), FileLen);
+ return Res;
}
std::string FileToString(const std::string &Path) {
@@ -77,12 +97,18 @@ void WriteToFile(const Unit &U, const std::string &Path) {
}
void ReadDirToVectorOfUnits(const char *Path, std::vector<Unit> *V,
- long *Epoch) {
+ long *Epoch, size_t MaxSize) {
long E = Epoch ? *Epoch : 0;
- for (auto &X : ListFilesInDir(Path, Epoch)) {
- auto FilePath = DirPlusFile(Path, X);
- if (Epoch && GetEpoch(FilePath) < E) continue;
- V->push_back(FileToVector(FilePath));
+ std::vector<std::string> Files;
+ ListFilesInDirRecursive(Path, Epoch, &Files, /*TopDir*/true);
+ size_t NumLoaded = 0;
+ for (size_t i = 0; i < Files.size(); i++) {
+ auto &X = Files[i];
+ if (Epoch && GetEpoch(X) < E) continue;
+ NumLoaded++;
+ if ((NumLoaded & (NumLoaded - 1)) == 0 && NumLoaded >= 1024)
+ Printf("Loaded %zd/%zd files from %s\n", NumLoaded, Files.size(), Path);
+ V->push_back(FileToVector(X, MaxSize));
}
}
@@ -91,11 +117,27 @@ std::string DirPlusFile(const std::string &DirPath,
return DirPath + "/" + FileName;
}
+void DupAndCloseStderr() {
+ int OutputFd = dup(2);
+ if (OutputFd > 0) {
+ FILE *NewOutputFile = fdopen(OutputFd, "w");
+ if (NewOutputFile) {
+ OutputFile = NewOutputFile;
+ if (EF->__sanitizer_set_report_fd)
+ EF->__sanitizer_set_report_fd(reinterpret_cast<void *>(OutputFd));
+ close(2);
+ }
+ }
+}
+
+void CloseStdout() { close(1); }
+
void Printf(const char *Fmt, ...) {
va_list ap;
va_start(ap, Fmt);
- vfprintf(stderr, Fmt, ap);
+ vfprintf(OutputFile, Fmt, ap);
va_end(ap);
+ fflush(OutputFile);
}
} // namespace fuzzer
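
ReadDirToVectorOfUnits above rate-limits its progress output with the check (NumLoaded & (NumLoaded - 1)) == 0, which holds exactly when NumLoaded is a power of two, so a large corpus logs a line at 1024, 2048, 4096, ... files instead of once per file. A tiny standalone illustration of the same test:

#include <cstddef>
#include <cstdio>

// A power of two has a single bit set; subtracting one flips every bit
// below it, so the bitwise AND of the two values is zero.
static bool IsPowerOfTwo(size_t N) { return N != 0 && (N & (N - 1)) == 0; }

int main() {
  for (size_t N = 1020; N <= 1030; N++)
    std::printf("%zu -> %s\n", N, IsPowerOfTwo(N) ? "log progress" : "skip");
  return 0;
}
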
diff --git a/lib/Fuzzer/FuzzerInterface.cpp b/lib/Fuzzer/FuzzerInterface.cpp
deleted file mode 100644
index bcd726fc08e4..000000000000
--- a/lib/Fuzzer/FuzzerInterface.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- FuzzerInterface.cpp - Mutate a test input --------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Parts of public interface for libFuzzer.
-//===----------------------------------------------------------------------===//
-
-
-#include "FuzzerInterface.h"
-#include "FuzzerInternal.h"
-
-namespace fuzzer {
-
-void FuzzerRandomLibc::ResetSeed(unsigned int seed) { srand(seed); }
-
-size_t FuzzerRandomLibc::Rand() { return rand(); }
-
-UserSuppliedFuzzer::UserSuppliedFuzzer(FuzzerRandomBase *Rand)
- : Rand(Rand), MD(*Rand) {}
-
-UserSuppliedFuzzer::~UserSuppliedFuzzer() {
- if (OwnRand)
- delete Rand;
-}
-
-} // namespace fuzzer.
diff --git a/lib/Fuzzer/FuzzerInterface.h b/lib/Fuzzer/FuzzerInterface.h
index e22b27a3dd2b..d47e20e3a2b9 100644
--- a/lib/Fuzzer/FuzzerInterface.h
+++ b/lib/Fuzzer/FuzzerInterface.h
@@ -6,193 +6,62 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-// Define the interface between the Fuzzer and the library being tested.
+// Define the interface between libFuzzer and the library being tested.
//===----------------------------------------------------------------------===//
-// WARNING: keep the interface free of STL or any other header-based C++ lib,
-// to avoid bad interactions between the code used in the fuzzer and
-// the code used in the target function.
+// NOTE: the libFuzzer interface is thin and in the majority of cases
+// you should not include this file into your target. In 95% of cases
+// all you need is to define the following function in your file:
+// extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+// WARNING: keep the interface in C.
#ifndef LLVM_FUZZER_INTERFACE_H
#define LLVM_FUZZER_INTERFACE_H
-#include <limits>
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-#include <string>
-
-namespace fuzzer {
-typedef std::vector<uint8_t> Unit;
-
-/// Returns an int 0. Values other than zero are reserved for future.
-typedef int (*UserCallback)(const uint8_t *Data, size_t Size);
-/** Simple C-like interface with a single user-supplied callback.
-
-Usage:
-
-#\code
-#include "FuzzerInterface.h"
-
-int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
- DoStuffWithData(Data, Size);
- return 0;
-}
-
-// Implement your own main() or use the one from FuzzerMain.cpp.
-int main(int argc, char **argv) {
- InitializeMeIfNeeded();
- return fuzzer::FuzzerDriver(argc, argv, LLVMFuzzerTestOneInput);
-}
-#\endcode
-*/
-int FuzzerDriver(int argc, char **argv, UserCallback Callback);
-
-class FuzzerRandomBase {
- public:
- FuzzerRandomBase(){}
- virtual ~FuzzerRandomBase(){};
- virtual void ResetSeed(unsigned int seed) = 0;
- // Return a random number.
- virtual size_t Rand() = 0;
- // Return a random number in range [0,n).
- size_t operator()(size_t n) { return n ? Rand() % n : 0; }
- bool RandBool() { return Rand() % 2; }
-};
-
-class FuzzerRandomLibc : public FuzzerRandomBase {
- public:
- FuzzerRandomLibc(unsigned int seed) { ResetSeed(seed); }
- void ResetSeed(unsigned int seed) override;
- ~FuzzerRandomLibc() override {}
- size_t Rand() override;
-};
-
-class MutationDispatcher {
- public:
- MutationDispatcher(FuzzerRandomBase &Rand);
- ~MutationDispatcher();
- /// Indicate that we are about to start a new sequence of mutations.
- void StartMutationSequence();
- /// Print the current sequence of mutations.
- void PrintMutationSequence();
- /// Mutates data by shuffling bytes.
- size_t Mutate_ShuffleBytes(uint8_t *Data, size_t Size, size_t MaxSize);
- /// Mutates data by erasing a byte.
- size_t Mutate_EraseByte(uint8_t *Data, size_t Size, size_t MaxSize);
- /// Mutates data by inserting a byte.
- size_t Mutate_InsertByte(uint8_t *Data, size_t Size, size_t MaxSize);
- /// Mutates data by chanding one byte.
- size_t Mutate_ChangeByte(uint8_t *Data, size_t Size, size_t MaxSize);
- /// Mutates data by chanding one bit.
- size_t Mutate_ChangeBit(uint8_t *Data, size_t Size, size_t MaxSize);
-
- /// Mutates data by adding a word from the manual dictionary.
- size_t Mutate_AddWordFromManualDictionary(uint8_t *Data, size_t Size,
- size_t MaxSize);
-
- /// Mutates data by adding a word from the automatic dictionary.
- size_t Mutate_AddWordFromAutoDictionary(uint8_t *Data, size_t Size,
- size_t MaxSize);
-
- /// Tries to find an ASCII integer in Data, changes it to another ASCII int.
- size_t Mutate_ChangeASCIIInteger(uint8_t *Data, size_t Size, size_t MaxSize);
-
- /// CrossOver Data with some other element of the corpus.
- size_t Mutate_CrossOver(uint8_t *Data, size_t Size, size_t MaxSize);
-
- /// Applies one of the above mutations.
- /// Returns the new size of data which could be up to MaxSize.
- size_t Mutate(uint8_t *Data, size_t Size, size_t MaxSize);
-
- /// Creates a cross-over of two pieces of Data, returns its size.
- size_t CrossOver(const uint8_t *Data1, size_t Size1, const uint8_t *Data2,
- size_t Size2, uint8_t *Out, size_t MaxOutSize);
-
- void AddWordToManualDictionary(const Unit &Word);
-
- void AddWordToAutoDictionary(const Unit &Word, size_t PositionHint);
- void ClearAutoDictionary();
-
- void SetCorpus(const std::vector<Unit> *Corpus);
-
- private:
- FuzzerRandomBase &Rand;
- struct Impl;
- Impl *MDImpl;
-};
-
-// For backward compatibility only, deprecated.
-static inline size_t Mutate(uint8_t *Data, size_t Size, size_t MaxSize,
- FuzzerRandomBase &Rand) {
- MutationDispatcher MD(Rand);
- return MD.Mutate(Data, Size, MaxSize);
-}
-
-/** An abstract class that allows to use user-supplied mutators with libFuzzer.
-
-Usage:
-
-#\code
-#include "FuzzerInterface.h"
-class MyFuzzer : public fuzzer::UserSuppliedFuzzer {
- public:
- MyFuzzer(fuzzer::FuzzerRandomBase *Rand);
- // Must define the target function.
- int TargetFunction(...) { ...; return 0; }
- // Optionally define the mutator.
- size_t Mutate(...) { ... }
- // Optionally define the CrossOver method.
- size_t CrossOver(...) { ... }
-};
-
-int main(int argc, char **argv) {
- MyFuzzer F;
- fuzzer::FuzzerDriver(argc, argv, F);
-}
-#\endcode
-*/
-class UserSuppliedFuzzer {
- public:
- UserSuppliedFuzzer(FuzzerRandomBase *Rand);
- /// Executes the target function on 'Size' bytes of 'Data'.
- virtual int TargetFunction(const uint8_t *Data, size_t Size) = 0;
- virtual void StartMutationSequence() { MD.StartMutationSequence(); }
- virtual void PrintMutationSequence() { MD.PrintMutationSequence(); }
- virtual void SetCorpus(const std::vector<Unit> *Corpus) {
- MD.SetCorpus(Corpus);
- }
- /// Mutates 'Size' bytes of data in 'Data' inplace into up to 'MaxSize' bytes,
- /// returns the new size of the data, which should be positive.
- virtual size_t Mutate(uint8_t *Data, size_t Size, size_t MaxSize) {
- return MD.Mutate(Data, Size, MaxSize);
- }
- /// Crosses 'Data1' and 'Data2', writes up to 'MaxOutSize' bytes into Out,
- /// returns the number of bytes written, which should be positive.
- virtual size_t CrossOver(const uint8_t *Data1, size_t Size1,
- const uint8_t *Data2, size_t Size2,
- uint8_t *Out, size_t MaxOutSize) {
- return MD.CrossOver(Data1, Size1, Data2, Size2, Out, MaxOutSize);
- }
- virtual ~UserSuppliedFuzzer();
-
- FuzzerRandomBase &GetRand() { return *Rand; }
-
- MutationDispatcher &GetMD() { return MD; }
-
- private:
- bool OwnRand = false;
- FuzzerRandomBase *Rand;
- MutationDispatcher MD;
-};
-
-/// Runs the fuzzing with the UserSuppliedFuzzer.
-int FuzzerDriver(int argc, char **argv, UserSuppliedFuzzer &USF);
-
-/// More C++-ish interface.
-int FuzzerDriver(const std::vector<std::string> &Args, UserSuppliedFuzzer &USF);
-int FuzzerDriver(const std::vector<std::string> &Args, UserCallback Callback);
-
-} // namespace fuzzer
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Mandatory user-provided target function.
+// Executes the code under test with [Data, Data+Size) as the input.
+// libFuzzer will invoke this function *many* times with different inputs.
+// Must return 0.
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+// Optional user-provided initialization function.
+// If provided, this function will be called by libFuzzer once at startup.
+// It may read and modify argc/argv.
+// Must return 0.
+int LLVMFuzzerInitialize(int *argc, char ***argv);
+
+// Optional user-provided custom mutator.
+// Mutates raw data in [Data, Data+Size) inplace.
+// Returns the new size, which is not greater than MaxSize.
+// Given the same Seed produces the same mutation.
+size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size, size_t MaxSize,
+ unsigned int Seed);
+
+// Optional user-provided custom cross-over function.
+// Combines pieces of Data1 & Data2 together into Out.
+// Returns the new size, which is not greater than MaxOutSize.
+// Should produce the same mutation given the same Seed.
+size_t LLVMFuzzerCustomCrossOver(const uint8_t *Data1, size_t Size1,
+ const uint8_t *Data2, size_t Size2,
+ uint8_t *Out, size_t MaxOutSize,
+ unsigned int Seed);
+
+// Experimental, may go away in future.
+// libFuzzer-provided function to be used inside LLVMFuzzerTestOneInput.
+// Mutates raw data in [Data, Data+Size) inplace.
+// Returns the new size, which is not greater than MaxSize.
+size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
#endif // LLVM_FUZZER_INTERFACE_H
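
With the interface reduced to plain C, a target normally defines only LLVMFuzzerTestOneInput; the remaining hooks are optional. Below is a minimal sketch of a target plus an optional custom mutator layered on the libFuzzer-provided LLVMFuzzerMutate; the "FUZZ" magic-prefix constraint is a hypothetical example of why a custom mutator can help, not anything required by the interface.

#include "FuzzerInterface.h"
#include <string.h>

// Mandatory hook: called many times with different inputs; must return 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
  if (Size >= 4 && Data[0] == 'F' && Data[1] == 'U' &&
      Data[2] == 'Z' && Data[3] == 'Z') {
    // ... feed the rest of Data into the code under test ...
  }
  return 0;
}

// Optional hook: apply the default mutation, then restore the (hypothetical)
// magic prefix so more mutants survive the early check above.
extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size,
                                          size_t MaxSize, unsigned int Seed) {
  (void)Seed;  // Unused in this sketch; LLVMFuzzerMutate uses libFuzzer's RNG.
  size_t NewSize = LLVMFuzzerMutate(Data, Size, MaxSize);
  if (NewSize >= 4)
    memcpy(Data, "FUZZ", 4);
  return NewSize;
}
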
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index c1e9daac9808..08f8801ac5fd 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -12,38 +12,108 @@
#ifndef LLVM_FUZZER_INTERNAL_H
#define LLVM_FUZZER_INTERNAL_H
+#include <algorithm>
+#include <atomic>
#include <cassert>
-#include <climits>
#include <chrono>
+#include <climits>
#include <cstddef>
#include <cstdlib>
+#include <limits>
+#include <random>
+#include <string.h>
#include <string>
-#include <vector>
#include <unordered_set>
+#include <vector>
+#include "FuzzerExtFunctions.h"
#include "FuzzerInterface.h"
+#include "FuzzerTracePC.h"
+
+// Platform detection.
+#ifdef __linux__
+#define LIBFUZZER_LINUX 1
+#define LIBFUZZER_APPLE 0
+#elif __APPLE__
+#define LIBFUZZER_LINUX 0
+#define LIBFUZZER_APPLE 1
+#else
+#error "Support for your platform has not been implemented"
+#endif
namespace fuzzer {
+
+typedef int (*UserCallback)(const uint8_t *Data, size_t Size);
+int FuzzerDriver(int *argc, char ***argv, UserCallback Callback);
+
using namespace std::chrono;
+typedef std::vector<uint8_t> Unit;
+typedef std::vector<Unit> UnitVector;
+
+// A simple POD sized array of bytes.
+template <size_t kMaxSize> class FixedWord {
+public:
+ FixedWord() {}
+ FixedWord(const uint8_t *B, uint8_t S) { Set(B, S); }
+
+ void Set(const uint8_t *B, uint8_t S) {
+ assert(S <= kMaxSize);
+ memcpy(Data, B, S);
+ Size = S;
+ }
+
+ bool operator==(const FixedWord<kMaxSize> &w) const {
+ return Size == w.Size && 0 == memcmp(Data, w.Data, Size);
+ }
+
+ bool operator<(const FixedWord<kMaxSize> &w) const {
+ if (Size != w.Size)
+ return Size < w.Size;
+ return memcmp(Data, w.Data, Size) < 0;
+ }
+
+ static size_t GetMaxSize() { return kMaxSize; }
+ const uint8_t *data() const { return Data; }
+ uint8_t size() const { return Size; }
+
+private:
+ uint8_t Size = 0;
+ uint8_t Data[kMaxSize];
+};
+typedef FixedWord<27> Word; // 28 bytes.
+
+bool IsFile(const std::string &Path);
std::string FileToString(const std::string &Path);
-Unit FileToVector(const std::string &Path);
+Unit FileToVector(const std::string &Path, size_t MaxSize = 0);
void ReadDirToVectorOfUnits(const char *Path, std::vector<Unit> *V,
- long *Epoch);
+ long *Epoch, size_t MaxSize);
void WriteToFile(const Unit &U, const std::string &Path);
void CopyFileToErr(const std::string &Path);
// Returns "Dir/FileName" or equivalent for the current OS.
std::string DirPlusFile(const std::string &DirPath,
const std::string &FileName);
+void DupAndCloseStderr();
+void CloseStdout();
void Printf(const char *Fmt, ...);
-void Print(const Unit &U, const char *PrintAfter = "");
+void PrintHexArray(const Unit &U, const char *PrintAfter = "");
+void PrintHexArray(const uint8_t *Data, size_t Size,
+ const char *PrintAfter = "");
void PrintASCII(const uint8_t *Data, size_t Size, const char *PrintAfter = "");
void PrintASCII(const Unit &U, const char *PrintAfter = "");
+void PrintASCII(const Word &W, const char *PrintAfter = "");
std::string Hash(const Unit &U);
void SetTimer(int Seconds);
+void SetSigSegvHandler();
+void SetSigBusHandler();
+void SetSigAbrtHandler();
+void SetSigIllHandler();
+void SetSigFpeHandler();
+void SetSigIntHandler();
+void SetSigTermHandler();
std::string Base64(const Unit &U);
int ExecuteCommand(const std::string &Command);
+size_t GetPeakRSSMb();
// Private copy of SHA1 implementation.
static const int kSHA1NumBytes = 20;
@@ -52,11 +122,24 @@ void ComputeSHA1(const uint8_t *Data, size_t Len, uint8_t *Out);
// Changes U to contain only ASCII (isprint+isspace) characters.
// Returns true iff U has been changed.
-bool ToASCII(Unit &U);
+bool ToASCII(uint8_t *Data, size_t Size);
bool IsASCII(const Unit &U);
+bool IsASCII(const uint8_t *Data, size_t Size);
int NumberOfCpuCores();
int GetPid();
+void SleepSeconds(int Seconds);
+
+class Random {
+ public:
+ Random(unsigned int seed) : R(seed) {}
+ size_t Rand() { return R(); }
+ size_t RandBool() { return Rand() % 2; }
+ size_t operator()(size_t n) { return n ? Rand() % n : 0; }
+ std::mt19937 &Get_mt19937() { return R; }
+ private:
+ std::mt19937 R;
+};
// Dictionary.
@@ -68,50 +151,240 @@ bool ParseOneDictionaryEntry(const std::string &Str, Unit *U);
// were parsed succesfully.
bool ParseDictionaryFile(const std::string &Text, std::vector<Unit> *Units);
-class Fuzzer {
+class DictionaryEntry {
+ public:
+ DictionaryEntry() {}
+ DictionaryEntry(Word W) : W(W) {}
+ DictionaryEntry(Word W, size_t PositionHint) : W(W), PositionHint(PositionHint) {}
+ const Word &GetW() const { return W; }
+
+ bool HasPositionHint() const { return PositionHint != std::numeric_limits<size_t>::max(); }
+ size_t GetPositionHint() const {
+ assert(HasPositionHint());
+ return PositionHint;
+ }
+ void IncUseCount() { UseCount++; }
+ void IncSuccessCount() { SuccessCount++; }
+ size_t GetUseCount() const { return UseCount; }
+ size_t GetSuccessCount() const { return SuccessCount; }
+
+private:
+ Word W;
+ size_t PositionHint = std::numeric_limits<size_t>::max();
+ size_t UseCount = 0;
+ size_t SuccessCount = 0;
+};
+
+class Dictionary {
public:
- struct FuzzingOptions {
- int Verbosity = 1;
- int MaxLen = 0;
- int UnitTimeoutSec = 300;
- int MaxTotalTimeSec = 0;
- bool DoCrossOver = true;
- int MutateDepth = 5;
- bool ExitOnFirst = false;
- bool UseCounters = false;
- bool UseIndirCalls = true;
- bool UseTraces = false;
- bool UseFullCoverageSet = false;
- bool Reload = true;
- bool ShuffleAtStartUp = true;
- int PreferSmallDuringInitialShuffle = -1;
- size_t MaxNumberOfRuns = ULONG_MAX;
- int SyncTimeout = 600;
- int ReportSlowUnits = 10;
- bool OnlyASCII = false;
- std::string OutputCorpus;
- std::string SyncCommand;
- std::string ArtifactPrefix = "./";
- std::string ExactArtifactPath;
- bool SaveArtifacts = true;
- bool PrintNEW = true; // Print a status line when new units are found;
- bool OutputCSV = false;
- bool PrintNewCovPcs = false;
+ static const size_t kMaxDictSize = 1 << 14;
+
+ bool ContainsWord(const Word &W) const {
+ return std::any_of(begin(), end(), [&](const DictionaryEntry &DE) {
+ return DE.GetW() == W;
+ });
+ }
+ const DictionaryEntry *begin() const { return &DE[0]; }
+ const DictionaryEntry *end() const { return begin() + Size; }
+ DictionaryEntry & operator[] (size_t Idx) {
+ assert(Idx < Size);
+ return DE[Idx];
+ }
+ void push_back(DictionaryEntry DE) {
+ if (Size < kMaxDictSize)
+ this->DE[Size++] = DE;
+ }
+ void clear() { Size = 0; }
+ bool empty() const { return Size == 0; }
+ size_t size() const { return Size; }
+
+private:
+ DictionaryEntry DE[kMaxDictSize];
+ size_t Size = 0;
+};
+
+struct FuzzingOptions {
+ int Verbosity = 1;
+ size_t MaxLen = 0;
+ int UnitTimeoutSec = 300;
+ int TimeoutExitCode = 77;
+ int ErrorExitCode = 77;
+ int MaxTotalTimeSec = 0;
+ int RssLimitMb = 0;
+ bool DoCrossOver = true;
+ int MutateDepth = 5;
+ bool UseCounters = false;
+ bool UseIndirCalls = true;
+ bool UseTraces = false;
+ bool UseMemcmp = true;
+ bool UseMemmem = true;
+ bool UseFullCoverageSet = false;
+ bool Reload = true;
+ bool ShuffleAtStartUp = true;
+ bool PreferSmall = true;
+ size_t MaxNumberOfRuns = ULONG_MAX;
+ int ReportSlowUnits = 10;
+ bool OnlyASCII = false;
+ std::string OutputCorpus;
+ std::string ArtifactPrefix = "./";
+ std::string ExactArtifactPath;
+ bool SaveArtifacts = true;
+ bool PrintNEW = true; // Print a status line when new units are found;
+ bool OutputCSV = false;
+ bool PrintNewCovPcs = false;
+ bool PrintFinalStats = false;
+ bool DetectLeaks = true;
+ bool TruncateUnits = false;
+ bool PruneCorpus = true;
+};
+
+class MutationDispatcher {
+public:
+ MutationDispatcher(Random &Rand, const FuzzingOptions &Options);
+ ~MutationDispatcher() {}
+ /// Indicate that we are about to start a new sequence of mutations.
+ void StartMutationSequence();
+ /// Print the current sequence of mutations.
+ void PrintMutationSequence();
+ /// Indicate that the current sequence of mutations was successful.
+ void RecordSuccessfulMutationSequence();
+ /// Mutates data by invoking user-provided mutator.
+ size_t Mutate_Custom(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by invoking user-provided crossover.
+ size_t Mutate_CustomCrossOver(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by shuffling bytes.
+ size_t Mutate_ShuffleBytes(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by erasing a byte.
+ size_t Mutate_EraseByte(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by inserting a byte.
+ size_t Mutate_InsertByte(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by changing one byte.
+ size_t Mutate_ChangeByte(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Mutates data by changing one bit.
+ size_t Mutate_ChangeBit(uint8_t *Data, size_t Size, size_t MaxSize);
+
+ /// Mutates data by adding a word from the manual dictionary.
+ size_t Mutate_AddWordFromManualDictionary(uint8_t *Data, size_t Size,
+ size_t MaxSize);
+
+ /// Mutates data by adding a word from the temporary automatic dictionary.
+ size_t Mutate_AddWordFromTemporaryAutoDictionary(uint8_t *Data, size_t Size,
+ size_t MaxSize);
+
+ /// Mutates data by adding a word from the persistent automatic dictionary.
+ size_t Mutate_AddWordFromPersistentAutoDictionary(uint8_t *Data, size_t Size,
+ size_t MaxSize);
+
+ /// Tries to find an ASCII integer in Data, changes it to another ASCII int.
+ size_t Mutate_ChangeASCIIInteger(uint8_t *Data, size_t Size, size_t MaxSize);
+
+ /// CrossOver Data with some other element of the corpus.
+ size_t Mutate_CrossOver(uint8_t *Data, size_t Size, size_t MaxSize);
+
+ /// Applies one of the configured mutations.
+ /// Returns the new size of data which could be up to MaxSize.
+ size_t Mutate(uint8_t *Data, size_t Size, size_t MaxSize);
+ /// Applies one of the default mutations. Provided as a service
+ /// to mutation authors.
+ size_t DefaultMutate(uint8_t *Data, size_t Size, size_t MaxSize);
+
+ /// Creates a cross-over of two pieces of Data, returns its size.
+ size_t CrossOver(const uint8_t *Data1, size_t Size1, const uint8_t *Data2,
+ size_t Size2, uint8_t *Out, size_t MaxOutSize);
+
+ void AddWordToManualDictionary(const Word &W);
+
+ void AddWordToAutoDictionary(DictionaryEntry DE);
+ void ClearAutoDictionary();
+ void PrintRecommendedDictionary();
+
+ void SetCorpus(const std::vector<Unit> *Corpus) { this->Corpus = Corpus; }
+
+ Random &GetRand() { return Rand; }
+
+private:
+
+ struct Mutator {
+ size_t (MutationDispatcher::*Fn)(uint8_t *Data, size_t Size, size_t Max);
+ const char *Name;
+ };
+
+ size_t AddWordFromDictionary(Dictionary &D, uint8_t *Data, size_t Size,
+ size_t MaxSize);
+ size_t MutateImpl(uint8_t *Data, size_t Size, size_t MaxSize,
+ const std::vector<Mutator> &Mutators);
+
+ Random &Rand;
+ const FuzzingOptions Options;
+
+ // Dictionary provided by the user via -dict=DICT_FILE.
+ Dictionary ManualDictionary;
+ // Temporary dictionary modified by the fuzzer itself,
+ // recreated periodically.
+ Dictionary TempAutoDictionary;
+ // Persistent dictionary modified by the fuzzer, consists of
+ // entries that led to successful discoveries in the past mutations.
+ Dictionary PersistentAutoDictionary;
+ std::vector<Mutator> CurrentMutatorSequence;
+ std::vector<DictionaryEntry *> CurrentDictionaryEntrySequence;
+ const std::vector<Unit> *Corpus = nullptr;
+ std::vector<uint8_t> MutateInPlaceHere;
+
+ std::vector<Mutator> Mutators;
+ std::vector<Mutator> DefaultMutators;
+};
+
+class Fuzzer {
+public:
+
+ // Aggregates all available coverage measurements.
+ struct Coverage {
+ Coverage() { Reset(); }
+
+ void Reset() {
+ BlockCoverage = 0;
+ CallerCalleeCoverage = 0;
+ PcMapBits = 0;
+ CounterBitmapBits = 0;
+ PcBufferLen = 0;
+ CounterBitmap.clear();
+ PCMap.Reset();
+ }
+
+ std::string DebugString() const;
+
+ size_t BlockCoverage;
+ size_t CallerCalleeCoverage;
+
+ size_t PcBufferLen;
+ // Precalculated number of bits in CounterBitmap.
+ size_t CounterBitmapBits;
+ std::vector<uint8_t> CounterBitmap;
+ // Precalculated number of bits in PCMap.
+ size_t PcMapBits;
+ PcCoverageMap PCMap;
};
- Fuzzer(UserSuppliedFuzzer &USF, FuzzingOptions Options);
- void AddToCorpus(const Unit &U) { Corpus.push_back(U); }
+
+ Fuzzer(UserCallback CB, MutationDispatcher &MD, FuzzingOptions Options);
+ void AddToCorpus(const Unit &U) {
+ Corpus.push_back(U);
+ UpdateCorpusDistribution();
+ }
size_t ChooseUnitIdxToMutate();
const Unit &ChooseUnitToMutate() { return Corpus[ChooseUnitIdxToMutate()]; };
+ void TruncateUnits(std::vector<Unit> *NewCorpus);
void Loop();
void Drill();
void ShuffleAndMinimize();
void InitializeTraceState();
+ void AssignTaintLabels(uint8_t *Data, size_t Size);
size_t CorpusSize() const { return Corpus.size(); }
- void ReadDir(const std::string &Path, long *Epoch) {
+ size_t MaxUnitSizeInCorpus() const;
+ void ReadDir(const std::string &Path, long *Epoch, size_t MaxSize) {
Printf("Loading corpus: %s\n", Path.c_str());
- ReadDirToVectorOfUnits(Path.c_str(), &Corpus, Epoch);
+ ReadDirToVectorOfUnits(Path.c_str(), &Corpus, Epoch, MaxSize);
}
- void RereadOutputCorpus();
+ void RereadOutputCorpus(size_t MaxSize);
// Save the current corpus to OutputCorpus.
void SaveCorpus();
@@ -119,35 +392,56 @@ class Fuzzer {
return duration_cast<seconds>(system_clock::now() - ProcessStartTime)
.count();
}
+ size_t execPerSec() {
+ size_t Seconds = secondsSinceProcessStartUp();
+ return Seconds ? TotalNumberOfRuns / Seconds : 0;
+ }
size_t getTotalNumberOfRuns() { return TotalNumberOfRuns; }
static void StaticAlarmCallback();
+ static void StaticCrashSignalCallback();
+ static void StaticInterruptCallback();
- void ExecuteCallback(const Unit &U);
+ void ExecuteCallback(const uint8_t *Data, size_t Size);
+ bool RunOne(const uint8_t *Data, size_t Size);
// Merge Corpora[1:] into Corpora[0].
void Merge(const std::vector<std::string> &Corpora);
+ // Returns a subset of 'Extra' that adds coverage to 'Initial'.
+ UnitVector FindExtraUnits(const UnitVector &Initial, const UnitVector &Extra);
+ MutationDispatcher &GetMD() { return MD; }
+ void PrintFinalStats();
+ void SetMaxLen(size_t MaxLen);
+ void RssLimitCallback();
- private:
+ // Public for tests.
+ void ResetCoverage();
+
+ bool InFuzzingThread() const { return IsMyThread; }
+ size_t GetCurrentUnitInFuzzingThead(const uint8_t **Data) const;
+
+private:
void AlarmCallback();
+ void CrashCallback();
+ void InterruptCallback();
void MutateAndTestOne();
void ReportNewCoverage(const Unit &U);
- bool RunOne(const Unit &U);
- void RunOneAndUpdateCorpus(Unit &U);
+ bool RunOne(const Unit &U) { return RunOne(U.data(), U.size()); }
+ void RunOneAndUpdateCorpus(const uint8_t *Data, size_t Size);
void WriteToOutputCorpus(const Unit &U);
void WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix);
void PrintStats(const char *Where, const char *End = "\n");
void PrintStatusForNewUnit(const Unit &U);
- void PrintUnitInASCII(const Unit &U, const char *PrintAfter = "");
+ void ShuffleCorpus(UnitVector *V);
+ void TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
+ bool DuringInitialCorpusExecution);
- void SyncCorpus();
-
- size_t RecordBlockCoverage();
- size_t RecordCallerCalleeCoverage();
- void PrepareCoverageBeforeRun();
- bool CheckCoverageAfterRun();
+ // Updates the probability distribution for the units in the corpus.
+ // Must be called whenever the corpus or unit weights are changed.
+ void UpdateCorpusDistribution();
+ bool UpdateMaxCoverage();
// Trace-based fuzzing: we run a unit with some kind of tracing
// enabled and record potentially useful mutations. Then
@@ -160,48 +454,41 @@ class Fuzzer {
void SetDeathCallback();
static void StaticDeathCallback();
+ void DumpCurrentUnit(const char *Prefix);
void DeathCallback();
- Unit CurrentUnit;
+
+ void LazyAllocateCurrentUnitData();
+ uint8_t *CurrentUnitData = nullptr;
+ std::atomic<size_t> CurrentUnitSize;
size_t TotalNumberOfRuns = 0;
- size_t TotalNumberOfExecutedTraceBasedMutations = 0;
+ size_t NumberOfNewUnitsAdded = 0;
+
+ bool HasMoreMallocsThanFrees = false;
+ size_t NumberOfLeakDetectionAttempts = 0;
std::vector<Unit> Corpus;
std::unordered_set<std::string> UnitHashesAddedToCorpus;
- // For UseCounters
- std::vector<uint8_t> CounterBitmap;
- size_t TotalBits() { // Slow. Call it only for printing stats.
- size_t Res = 0;
- for (auto x : CounterBitmap) Res += __builtin_popcount(x);
- return Res;
- }
-
- UserSuppliedFuzzer &USF;
+ std::piecewise_constant_distribution<double> CorpusDistribution;
+ UserCallback CB;
+ MutationDispatcher &MD;
FuzzingOptions Options;
system_clock::time_point ProcessStartTime = system_clock::now();
- system_clock::time_point LastExternalSync = system_clock::now();
system_clock::time_point UnitStartTime;
long TimeOfLongestUnitInSeconds = 0;
long EpochOfLastReadOfOutputCorpus = 0;
- size_t LastRecordedBlockCoverage = 0;
- size_t LastRecordedCallerCalleeCoverage = 0;
- size_t LastCoveragePcBufferLen = 0;
-};
-
-class SimpleUserSuppliedFuzzer: public UserSuppliedFuzzer {
- public:
- SimpleUserSuppliedFuzzer(FuzzerRandomBase *Rand, UserCallback Callback)
- : UserSuppliedFuzzer(Rand), Callback(Callback) {}
- virtual int TargetFunction(const uint8_t *Data, size_t Size) override {
- return Callback(Data, Size);
- }
+ // Maximum recorded coverage.
+ Coverage MaxCoverage;
- private:
- UserCallback Callback = nullptr;
+ // Need to know our own thread.
+ static thread_local bool IsMyThread;
};
-}; // namespace fuzzer
+// Global interface to functions that may or may not be available.
+extern ExternalFunctions *EF;
+
+}; // namespace fuzzer
#endif // LLVM_FUZZER_INTERNAL_H
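
The header above replaces the old UserSuppliedFuzzer-based interface with three pieces: a plain UserCallback, a MutationDispatcher driven by Random, and the Fuzzer itself. The following is a hedged sketch of how a driver could assemble them; the real wiring lives in FuzzerDriver.cpp and also handles flags, corpora, and signal handlers, and the option values below are placeholders.

#include "FuzzerInternal.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);

// Illustrative driver skeleton only (not the actual FuzzerDriver).
int RunFuzzerSketch(unsigned Seed) {
  fuzzer::FuzzingOptions Options;
  Options.MaxLen = 64;                      // placeholder value
  fuzzer::Random Rand(Seed);
  fuzzer::MutationDispatcher MD(Rand, Options);
  fuzzer::Fuzzer F(LLVMFuzzerTestOneInput, MD, Options);
  F.Loop();                                 // fuzz until a limit is reached
  return 0;
}
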
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index 5237682ff24d..89db5e0ac0a1 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -11,63 +11,159 @@
#include "FuzzerInternal.h"
#include <algorithm>
+#include <cstring>
+#include <memory>
#if defined(__has_include)
-# if __has_include(<sanitizer/coverage_interface.h>)
-# include <sanitizer/coverage_interface.h>
-# endif
+#if __has_include(<sanitizer/coverage_interface.h>)
+#include <sanitizer/coverage_interface.h>
+#endif
+#if __has_include(<sanitizer/lsan_interface.h>)
+#include <sanitizer/lsan_interface.h>
+#endif
#endif
-extern "C" {
-// Re-declare some of the sanitizer functions as "weak" so that
-// libFuzzer can be linked w/o the sanitizers and sanitizer-coverage
-// (in which case it will complain at start-up time).
-__attribute__((weak)) void __sanitizer_print_stack_trace();
-__attribute__((weak)) void __sanitizer_reset_coverage();
-__attribute__((weak)) size_t __sanitizer_get_total_unique_caller_callee_pairs();
-__attribute__((weak)) size_t __sanitizer_get_total_unique_coverage();
-__attribute__((weak))
-void __sanitizer_set_death_callback(void (*callback)(void));
-__attribute__((weak)) size_t __sanitizer_get_number_of_counters();
-__attribute__((weak))
-uintptr_t __sanitizer_update_counter_bitset_and_clear_counters(uint8_t *bitset);
-__attribute__((weak)) uintptr_t
-__sanitizer_get_coverage_pc_buffer(uintptr_t **data);
-}
+#define NO_SANITIZE_MEMORY
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#undef NO_SANITIZE_MEMORY
+#define NO_SANITIZE_MEMORY __attribute__((no_sanitize_memory))
+#endif
+#endif
namespace fuzzer {
static const size_t kMaxUnitSizeToPrint = 256;
+static const size_t TruncateMaxRuns = 1000;
+
+thread_local bool Fuzzer::IsMyThread;
-static void MissingWeakApiFunction(const char *FnName) {
+static void MissingExternalApiFunction(const char *FnName) {
Printf("ERROR: %s is not defined. Exiting.\n"
- "Did you use -fsanitize-coverage=... to build your code?\n", FnName);
+ "Did you use -fsanitize-coverage=... to build your code?\n",
+ FnName);
exit(1);
}
-#define CHECK_WEAK_API_FUNCTION(fn) \
+#define CHECK_EXTERNAL_FUNCTION(fn) \
do { \
- if (!fn) \
- MissingWeakApiFunction(#fn); \
+ if (!(EF->fn)) \
+ MissingExternalApiFunction(#fn); \
} while (false)
// Only one Fuzzer per process.
static Fuzzer *F;
-Fuzzer::Fuzzer(UserSuppliedFuzzer &USF, FuzzingOptions Options)
- : USF(USF), Options(Options) {
+struct CoverageController {
+ static void Reset() {
+ CHECK_EXTERNAL_FUNCTION(__sanitizer_reset_coverage);
+ EF->__sanitizer_reset_coverage();
+ PcMapResetCurrent();
+ }
+
+ static void ResetCounters(const FuzzingOptions &Options) {
+ if (Options.UseCounters) {
+ EF->__sanitizer_update_counter_bitset_and_clear_counters(0);
+ }
+ }
+
+ static void Prepare(const FuzzingOptions &Options, Fuzzer::Coverage *C) {
+ if (Options.UseCounters) {
+ size_t NumCounters = EF->__sanitizer_get_number_of_counters();
+ C->CounterBitmap.resize(NumCounters);
+ }
+ }
+
+ // Records data to a maximum coverage tracker. Returns true if additional
+ // coverage was discovered.
+ static bool RecordMax(const FuzzingOptions &Options, Fuzzer::Coverage *C) {
+ bool Res = false;
+
+ uint64_t NewBlockCoverage = EF->__sanitizer_get_total_unique_coverage();
+ if (NewBlockCoverage > C->BlockCoverage) {
+ Res = true;
+ C->BlockCoverage = NewBlockCoverage;
+ }
+
+ if (Options.UseIndirCalls &&
+ EF->__sanitizer_get_total_unique_caller_callee_pairs) {
+ uint64_t NewCallerCalleeCoverage =
+ EF->__sanitizer_get_total_unique_caller_callee_pairs();
+ if (NewCallerCalleeCoverage > C->CallerCalleeCoverage) {
+ Res = true;
+ C->CallerCalleeCoverage = NewCallerCalleeCoverage;
+ }
+ }
+
+ if (Options.UseCounters) {
+ uint64_t CounterDelta =
+ EF->__sanitizer_update_counter_bitset_and_clear_counters(
+ C->CounterBitmap.data());
+ if (CounterDelta > 0) {
+ Res = true;
+ C->CounterBitmapBits += CounterDelta;
+ }
+ }
+
+ uint64_t NewPcMapBits = PcMapMergeInto(&C->PCMap);
+ if (NewPcMapBits > C->PcMapBits) {
+ Res = true;
+ C->PcMapBits = NewPcMapBits;
+ }
+
+ uintptr_t *CoverageBuf;
+ uint64_t NewPcBufferLen =
+ EF->__sanitizer_get_coverage_pc_buffer(&CoverageBuf);
+ if (NewPcBufferLen > C->PcBufferLen) {
+ Res = true;
+ C->PcBufferLen = NewPcBufferLen;
+ }
+
+ return Res;
+ }
+};
+
+// Leak detection is expensive, so we first check if there were more mallocs
+// than frees (using the sanitizer malloc hooks) and only then try to call lsan.
+struct MallocFreeTracer {
+ void Start() {
+ Mallocs = 0;
+ Frees = 0;
+ }
+ // Returns true if there were more mallocs than frees.
+ bool Stop() { return Mallocs > Frees; }
+ std::atomic<size_t> Mallocs;
+ std::atomic<size_t> Frees;
+};
+
+static MallocFreeTracer AllocTracer;
+
+void MallocHook(const volatile void *ptr, size_t size) {
+ AllocTracer.Mallocs++;
+}
+void FreeHook(const volatile void *ptr) {
+ AllocTracer.Frees++;
+}
+
+Fuzzer::Fuzzer(UserCallback CB, MutationDispatcher &MD, FuzzingOptions Options)
+ : CB(CB), MD(MD), Options(Options) {
SetDeathCallback();
InitializeTraceState();
assert(!F);
F = this;
+ ResetCoverage();
+ IsMyThread = true;
+ if (Options.DetectLeaks && EF->__sanitizer_install_malloc_and_free_hooks)
+ EF->__sanitizer_install_malloc_and_free_hooks(MallocHook, FreeHook);
}
-void Fuzzer::SetDeathCallback() {
- CHECK_WEAK_API_FUNCTION(__sanitizer_set_death_callback);
- __sanitizer_set_death_callback(StaticDeathCallback);
+void Fuzzer::LazyAllocateCurrentUnitData() {
+ if (CurrentUnitData || Options.MaxLen == 0) return;
+ CurrentUnitData = new uint8_t[Options.MaxLen];
}
-void Fuzzer::PrintUnitInASCII(const Unit &U, const char *PrintAfter) {
- PrintASCII(U, PrintAfter);
+void Fuzzer::SetDeathCallback() {
+ CHECK_EXTERNAL_FUNCTION(__sanitizer_set_death_callback);
+ EF->__sanitizer_set_death_callback(StaticDeathCallback);
}
void Fuzzer::StaticDeathCallback() {
@@ -75,13 +171,21 @@ void Fuzzer::StaticDeathCallback() {
F->DeathCallback();
}
-void Fuzzer::DeathCallback() {
- Printf("DEATH:\n");
- if (CurrentUnit.size() <= kMaxUnitSizeToPrint) {
- Print(CurrentUnit, "\n");
- PrintUnitInASCII(CurrentUnit, "\n");
+void Fuzzer::DumpCurrentUnit(const char *Prefix) {
+ if (!CurrentUnitData) return; // Happens when running individual inputs.
+ size_t UnitSize = CurrentUnitSize;
+ if (UnitSize <= kMaxUnitSizeToPrint) {
+ PrintHexArray(CurrentUnitData, UnitSize, "\n");
+ PrintASCII(CurrentUnitData, UnitSize, "\n");
}
- WriteUnitToFileWithPrefix(CurrentUnit, "crash-");
+ WriteUnitToFileWithPrefix({CurrentUnitData, CurrentUnitData + UnitSize},
+ Prefix);
+}
+
+NO_SANITIZE_MEMORY
+void Fuzzer::DeathCallback() {
+ DumpCurrentUnit("crash-");
+ PrintFinalStats();
}
void Fuzzer::StaticAlarmCallback() {
@@ -89,131 +193,256 @@ void Fuzzer::StaticAlarmCallback() {
F->AlarmCallback();
}
+void Fuzzer::StaticCrashSignalCallback() {
+ assert(F);
+ F->CrashCallback();
+}
+
+void Fuzzer::StaticInterruptCallback() {
+ assert(F);
+ F->InterruptCallback();
+}
+
+void Fuzzer::CrashCallback() {
+ Printf("==%d== ERROR: libFuzzer: deadly signal\n", GetPid());
+ if (EF->__sanitizer_print_stack_trace)
+ EF->__sanitizer_print_stack_trace();
+ Printf("NOTE: libFuzzer has rudimentary signal handlers.\n"
+ " Combine libFuzzer with AddressSanitizer or similar for better "
+ "crash reports.\n");
+ Printf("SUMMARY: libFuzzer: deadly signal\n");
+ DumpCurrentUnit("crash-");
+ PrintFinalStats();
+ exit(Options.ErrorExitCode);
+}
+
+void Fuzzer::InterruptCallback() {
+ Printf("==%d== libFuzzer: run interrupted; exiting\n", GetPid());
+ PrintFinalStats();
+ _Exit(0); // Stop right now, don't perform any at-exit actions.
+}
+
+NO_SANITIZE_MEMORY
void Fuzzer::AlarmCallback() {
assert(Options.UnitTimeoutSec > 0);
+ if (!InFuzzingThread()) return;
+ if (!CurrentUnitSize)
+ return; // We have not started running units yet.
size_t Seconds =
duration_cast<seconds>(system_clock::now() - UnitStartTime).count();
- if (Seconds == 0) return;
+ if (Seconds == 0)
+ return;
if (Options.Verbosity >= 2)
Printf("AlarmCallback %zd\n", Seconds);
if (Seconds >= (size_t)Options.UnitTimeoutSec) {
Printf("ALARM: working on the last Unit for %zd seconds\n", Seconds);
Printf(" and the timeout value is %d (use -timeout=N to change)\n",
Options.UnitTimeoutSec);
- if (CurrentUnit.size() <= kMaxUnitSizeToPrint) {
- Print(CurrentUnit, "\n");
- PrintUnitInASCII(CurrentUnit, "\n");
- }
- WriteUnitToFileWithPrefix(CurrentUnit, "timeout-");
+ DumpCurrentUnit("timeout-");
Printf("==%d== ERROR: libFuzzer: timeout after %d seconds\n", GetPid(),
Seconds);
- if (__sanitizer_print_stack_trace)
- __sanitizer_print_stack_trace();
+ if (EF->__sanitizer_print_stack_trace)
+ EF->__sanitizer_print_stack_trace();
Printf("SUMMARY: libFuzzer: timeout\n");
- exit(1);
+ PrintFinalStats();
+ _Exit(Options.TimeoutExitCode); // Stop right now.
}
}
-void Fuzzer::PrintStats(const char *Where, const char *End) {
- size_t Seconds = secondsSinceProcessStartUp();
- size_t ExecPerSec = (Seconds ? TotalNumberOfRuns / Seconds : 0);
+void Fuzzer::RssLimitCallback() {
+ Printf(
+ "==%d== ERROR: libFuzzer: out-of-memory (used: %zdMb; limit: %zdMb)\n",
+ GetPid(), GetPeakRSSMb(), Options.RssLimitMb);
+ Printf(" To change the out-of-memory limit use -rss_limit_mb=<N>\n\n");
+ if (EF->__sanitizer_print_memory_profile)
+ EF->__sanitizer_print_memory_profile(50);
+ DumpCurrentUnit("oom-");
+ Printf("SUMMARY: libFuzzer: out-of-memory\n");
+ PrintFinalStats();
+ _Exit(Options.ErrorExitCode); // Stop right now.
+}
+void Fuzzer::PrintStats(const char *Where, const char *End) {
+ size_t ExecPerSec = execPerSec();
if (Options.OutputCSV) {
static bool csvHeaderPrinted = false;
if (!csvHeaderPrinted) {
csvHeaderPrinted = true;
Printf("runs,block_cov,bits,cc_cov,corpus,execs_per_sec,tbms,reason\n");
}
- Printf("%zd,%zd,%zd,%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
- LastRecordedBlockCoverage, TotalBits(),
- LastRecordedCallerCalleeCoverage, Corpus.size(), ExecPerSec,
- TotalNumberOfExecutedTraceBasedMutations, Where);
+ Printf("%zd,%zd,%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
+ MaxCoverage.BlockCoverage, MaxCoverage.CounterBitmapBits,
+ MaxCoverage.CallerCalleeCoverage, Corpus.size(), ExecPerSec, Where);
}
if (!Options.Verbosity)
return;
Printf("#%zd\t%s", TotalNumberOfRuns, Where);
- if (LastRecordedBlockCoverage)
- Printf(" cov: %zd", LastRecordedBlockCoverage);
- if (auto TB = TotalBits())
+ if (MaxCoverage.BlockCoverage)
+ Printf(" cov: %zd", MaxCoverage.BlockCoverage);
+ if (MaxCoverage.PcMapBits)
+ Printf(" path: %zd", MaxCoverage.PcMapBits);
+ if (auto TB = MaxCoverage.CounterBitmapBits)
Printf(" bits: %zd", TB);
- if (LastRecordedCallerCalleeCoverage)
- Printf(" indir: %zd", LastRecordedCallerCalleeCoverage);
+ if (MaxCoverage.CallerCalleeCoverage)
+ Printf(" indir: %zd", MaxCoverage.CallerCalleeCoverage);
Printf(" units: %zd exec/s: %zd", Corpus.size(), ExecPerSec);
- if (TotalNumberOfExecutedTraceBasedMutations)
- Printf(" tbm: %zd", TotalNumberOfExecutedTraceBasedMutations);
Printf("%s", End);
}
-void Fuzzer::RereadOutputCorpus() {
- if (Options.OutputCorpus.empty()) return;
+void Fuzzer::PrintFinalStats() {
+ if (!Options.PrintFinalStats) return;
+ size_t ExecPerSec = execPerSec();
+ Printf("stat::number_of_executed_units: %zd\n", TotalNumberOfRuns);
+ Printf("stat::average_exec_per_sec: %zd\n", ExecPerSec);
+ Printf("stat::new_units_added: %zd\n", NumberOfNewUnitsAdded);
+ Printf("stat::slowest_unit_time_sec: %zd\n", TimeOfLongestUnitInSeconds);
+ Printf("stat::peak_rss_mb: %zd\n", GetPeakRSSMb());
+}
+
+size_t Fuzzer::MaxUnitSizeInCorpus() const {
+ size_t Res = 0;
+ for (auto &X : Corpus)
+ Res = std::max(Res, X.size());
+ return Res;
+}
+
+void Fuzzer::SetMaxLen(size_t MaxLen) {
+ assert(Options.MaxLen == 0); // Can only reset MaxLen from 0 to non-0.
+ assert(MaxLen);
+ Options.MaxLen = MaxLen;
+ Printf("INFO: -max_len is not provided, using %zd\n", Options.MaxLen);
+}
+
+
+void Fuzzer::RereadOutputCorpus(size_t MaxSize) {
+ if (Options.OutputCorpus.empty())
+ return;
std::vector<Unit> AdditionalCorpus;
ReadDirToVectorOfUnits(Options.OutputCorpus.c_str(), &AdditionalCorpus,
- &EpochOfLastReadOfOutputCorpus);
+ &EpochOfLastReadOfOutputCorpus, MaxSize);
if (Corpus.empty()) {
Corpus = AdditionalCorpus;
return;
}
- if (!Options.Reload) return;
+ if (!Options.Reload)
+ return;
if (Options.Verbosity >= 2)
- Printf("Reload: read %zd new units.\n", AdditionalCorpus.size());
+ Printf("Reload: read %zd new units.\n", AdditionalCorpus.size());
for (auto &X : AdditionalCorpus) {
- if (X.size() > (size_t)Options.MaxLen)
- X.resize(Options.MaxLen);
+ if (X.size() > MaxSize)
+ X.resize(MaxSize);
if (UnitHashesAddedToCorpus.insert(Hash(X)).second) {
- CurrentUnit.clear();
- CurrentUnit.insert(CurrentUnit.begin(), X.begin(), X.end());
- if (RunOne(CurrentUnit)) {
+ if (RunOne(X)) {
Corpus.push_back(X);
+ UpdateCorpusDistribution();
PrintStats("RELOAD");
}
}
}
}
+void Fuzzer::ShuffleCorpus(UnitVector *V) {
+ std::random_shuffle(V->begin(), V->end(), MD.GetRand());
+ if (Options.PreferSmall)
+ std::stable_sort(V->begin(), V->end(), [](const Unit &A, const Unit &B) {
+ return A.size() < B.size();
+ });
+}
+
+// Tries random prefixes of corpus items.
+// Prefix length is chosen according to an exponential distribution
+// to sample short lengths much more heavily.
+void Fuzzer::TruncateUnits(std::vector<Unit> *NewCorpus) {
+ size_t MaxCorpusLen = 0;
+ for (const auto &U : Corpus)
+ MaxCorpusLen = std::max(MaxCorpusLen, U.size());
+
+ if (MaxCorpusLen <= 1)
+ return;
+
+ // The median of an exponential distribution is Log[2]/lambda.
+ // Choose lambda so that the median prefix length is MaxCorpusLen / 2.
+ double Lambda = 2.0 * log(2.0) / static_cast<double>(MaxCorpusLen);
+ std::exponential_distribution<> Dist(Lambda);
+ std::vector<double> Sizes;
+ size_t TruncatePoints = std::max(1ul, TruncateMaxRuns / Corpus.size());
+ Sizes.reserve(TruncatePoints);
+ for (size_t I = 0; I < TruncatePoints; ++I) {
+ Sizes.push_back(Dist(MD.GetRand().Get_mt19937()) + 1);
+ }
+ std::sort(Sizes.begin(), Sizes.end());
+
+ for (size_t S : Sizes) {
+ for (const auto &U : Corpus) {
+ if (S < U.size() && RunOne(U.data(), S)) {
+ Unit U1(U.begin(), U.begin() + S);
+ NewCorpus->push_back(U1);
+ WriteToOutputCorpus(U1);
+ PrintStatusForNewUnit(U1);
+ }
+ }
+ }
+ PrintStats("TRUNC ");
+}
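
A minimal standalone sketch of the prefix-length sampling used by TruncateUnits above (editorial addition, not part of the patch; the corpus length 4096 and the sample count are arbitrary stand-ins). With Lambda = 2*log(2)/MaxCorpusLen the median of the exponential distribution is MaxCorpusLen/2, so short prefixes are drawn far more often than long ones:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <random>

int main() {
  const double MaxCorpusLen = 4096;  // hypothetical longest unit in the corpus
  const double Lambda = 2.0 * std::log(2.0) / MaxCorpusLen;
  std::exponential_distribution<> Dist(Lambda);
  std::mt19937 Rng(0);
  size_t Total = 100000, BelowHalf = 0;
  for (size_t I = 0; I < Total; I++)
    if (Dist(Rng) + 1 <= MaxCorpusLen / 2)
      BelowHalf++;
  // About half of the sampled prefix lengths land below MaxCorpusLen / 2.
  std::printf("prefixes below MaxCorpusLen/2: %.1f%%\n",
              100.0 * BelowHalf / Total);
  return 0;
}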
+
void Fuzzer::ShuffleAndMinimize() {
- bool PreferSmall = (Options.PreferSmallDuringInitialShuffle == 1 ||
- (Options.PreferSmallDuringInitialShuffle == -1 &&
- USF.GetRand().RandBool()));
- if (Options.Verbosity)
- Printf("PreferSmall: %d\n", PreferSmall);
PrintStats("READ ");
std::vector<Unit> NewCorpus;
- if (Options.ShuffleAtStartUp) {
- std::random_shuffle(Corpus.begin(), Corpus.end(), USF.GetRand());
- if (PreferSmall)
- std::stable_sort(
- Corpus.begin(), Corpus.end(),
- [](const Unit &A, const Unit &B) { return A.size() < B.size(); });
+ if (Options.ShuffleAtStartUp)
+ ShuffleCorpus(&Corpus);
+
+ if (Options.TruncateUnits) {
+ ResetCoverage();
+ TruncateUnits(&NewCorpus);
+ ResetCoverage();
}
- Unit &U = CurrentUnit;
- for (const auto &C : Corpus) {
- for (size_t First = 0; First < 1; First++) {
- U.clear();
- size_t Last = std::min(First + Options.MaxLen, C.size());
- U.insert(U.begin(), C.begin() + First, C.begin() + Last);
- if (Options.OnlyASCII)
- ToASCII(U);
- if (RunOne(U)) {
- NewCorpus.push_back(U);
- if (Options.Verbosity >= 2)
- Printf("NEW0: %zd L %zd\n", LastRecordedBlockCoverage, U.size());
- }
+
+ for (const auto &U : Corpus) {
+ bool NewCoverage = RunOne(U);
+ if (!Options.PruneCorpus || NewCoverage) {
+ NewCorpus.push_back(U);
+ if (Options.Verbosity >= 2)
+ Printf("NEW0: %zd L %zd\n", MaxCoverage.BlockCoverage, U.size());
}
+ TryDetectingAMemoryLeak(U.data(), U.size(),
+ /*DuringInitialCorpusExecution*/ true);
}
Corpus = NewCorpus;
+ UpdateCorpusDistribution();
for (auto &X : Corpus)
UnitHashesAddedToCorpus.insert(Hash(X));
PrintStats("INITED");
+ if (Corpus.empty()) {
+ Printf("ERROR: no interesting inputs were found. "
+ "Is the code instrumented for coverage? Exiting.\n");
+ exit(1);
+ }
}
-bool Fuzzer::RunOne(const Unit &U) {
- UnitStartTime = system_clock::now();
+bool Fuzzer::UpdateMaxCoverage() {
+ uintptr_t PrevBufferLen = MaxCoverage.PcBufferLen;
+ bool Res = CoverageController::RecordMax(Options, &MaxCoverage);
+
+ if (Options.PrintNewCovPcs && PrevBufferLen != MaxCoverage.PcBufferLen) {
+ uintptr_t *CoverageBuf;
+ EF->__sanitizer_get_coverage_pc_buffer(&CoverageBuf);
+ assert(CoverageBuf);
+ for (size_t I = PrevBufferLen; I < MaxCoverage.PcBufferLen; ++I) {
+ Printf("%p\n", CoverageBuf[I]);
+ }
+ }
+
+ return Res;
+}
+
+bool Fuzzer::RunOne(const uint8_t *Data, size_t Size) {
TotalNumberOfRuns++;
- PrepareCoverageBeforeRun();
- ExecuteCallback(U);
- bool Res = CheckCoverageAfterRun();
+ // TODO(aizatsky): this Reset call seems to be not needed.
+ CoverageController::ResetCounters(Options);
+ ExecuteCallback(Data, Size);
+ bool Res = UpdateMaxCoverage();
auto UnitStopTime = system_clock::now();
auto TimeOfUnit =
@@ -225,88 +454,63 @@ bool Fuzzer::RunOne(const Unit &U) {
TimeOfUnit >= Options.ReportSlowUnits) {
TimeOfLongestUnitInSeconds = TimeOfUnit;
Printf("Slowest unit: %zd s:\n", TimeOfLongestUnitInSeconds);
- WriteUnitToFileWithPrefix(U, "slow-unit-");
+ WriteUnitToFileWithPrefix({Data, Data + Size}, "slow-unit-");
}
return Res;
}
-void Fuzzer::RunOneAndUpdateCorpus(Unit &U) {
+void Fuzzer::RunOneAndUpdateCorpus(const uint8_t *Data, size_t Size) {
if (TotalNumberOfRuns >= Options.MaxNumberOfRuns)
return;
- if (Options.OnlyASCII)
- ToASCII(U);
- if (RunOne(U))
- ReportNewCoverage(U);
-}
-
-void Fuzzer::ExecuteCallback(const Unit &U) {
- const uint8_t *Data = U.data();
- uint8_t EmptyData;
- if (!Data)
- Data = &EmptyData;
- int Res = USF.TargetFunction(Data, U.size());
- (void)Res;
- assert(Res == 0);
-}
-
-size_t Fuzzer::RecordBlockCoverage() {
- CHECK_WEAK_API_FUNCTION(__sanitizer_get_total_unique_coverage);
- uintptr_t PrevCoverage = LastRecordedBlockCoverage;
- LastRecordedBlockCoverage = __sanitizer_get_total_unique_coverage();
-
- if (PrevCoverage == LastRecordedBlockCoverage || !Options.PrintNewCovPcs)
- return LastRecordedBlockCoverage;
-
- uintptr_t PrevBufferLen = LastCoveragePcBufferLen;
- uintptr_t *CoverageBuf;
- LastCoveragePcBufferLen = __sanitizer_get_coverage_pc_buffer(&CoverageBuf);
- assert(CoverageBuf);
- for (size_t i = PrevBufferLen; i < LastCoveragePcBufferLen; ++i) {
- Printf("0x%x\n", CoverageBuf[i]);
- }
-
- return LastRecordedBlockCoverage;
+ if (RunOne(Data, Size))
+ ReportNewCoverage({Data, Data + Size});
}
-size_t Fuzzer::RecordCallerCalleeCoverage() {
- if (!Options.UseIndirCalls)
- return 0;
- if (!__sanitizer_get_total_unique_caller_callee_pairs)
- return 0;
- return LastRecordedCallerCalleeCoverage =
- __sanitizer_get_total_unique_caller_callee_pairs();
+size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
+ assert(InFuzzingThread());
+ *Data = CurrentUnitData;
+ return CurrentUnitSize;
}
-void Fuzzer::PrepareCoverageBeforeRun() {
- if (Options.UseCounters) {
- size_t NumCounters = __sanitizer_get_number_of_counters();
- CounterBitmap.resize(NumCounters);
- __sanitizer_update_counter_bitset_and_clear_counters(0);
- }
- RecordBlockCoverage();
- RecordCallerCalleeCoverage();
+void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
+ assert(InFuzzingThread());
+ LazyAllocateCurrentUnitData();
+ UnitStartTime = system_clock::now();
+ // We copy the contents of Unit into a separate heap buffer
+ // so that we reliably find buffer overflows in it.
+ std::unique_ptr<uint8_t[]> DataCopy(new uint8_t[Size]);
+ memcpy(DataCopy.get(), Data, Size);
+ if (CurrentUnitData && CurrentUnitData != Data)
+ memcpy(CurrentUnitData, Data, Size);
+ AssignTaintLabels(DataCopy.get(), Size);
+ CurrentUnitSize = Size;
+ AllocTracer.Start();
+ int Res = CB(DataCopy.get(), Size);
+ (void)Res;
+ HasMoreMallocsThanFrees = AllocTracer.Stop();
+ CurrentUnitSize = 0;
+ assert(Res == 0);
}
-bool Fuzzer::CheckCoverageAfterRun() {
- size_t OldCoverage = LastRecordedBlockCoverage;
- size_t NewCoverage = RecordBlockCoverage();
- size_t OldCallerCalleeCoverage = LastRecordedCallerCalleeCoverage;
- size_t NewCallerCalleeCoverage = RecordCallerCalleeCoverage();
- size_t NumNewBits = 0;
- if (Options.UseCounters)
- NumNewBits = __sanitizer_update_counter_bitset_and_clear_counters(
- CounterBitmap.data());
- return NewCoverage > OldCoverage ||
- NewCallerCalleeCoverage > OldCallerCalleeCoverage || NumNewBits;
+std::string Fuzzer::Coverage::DebugString() const {
+ std::string Result =
+ std::string("Coverage{") + "BlockCoverage=" +
+ std::to_string(BlockCoverage) + " CallerCalleeCoverage=" +
+ std::to_string(CallerCalleeCoverage) + " CounterBitmapBits=" +
+ std::to_string(CounterBitmapBits) + " PcMapBits=" +
+ std::to_string(PcMapBits) + "}";
+ return Result;
}
void Fuzzer::WriteToOutputCorpus(const Unit &U) {
- if (Options.OutputCorpus.empty()) return;
+ if (Options.OnlyASCII)
+ assert(IsASCII(U));
+ if (Options.OutputCorpus.empty())
+ return;
std::string Path = DirPlusFile(Options.OutputCorpus, Hash(U));
WriteToFile(U, Path);
if (Options.Verbosity >= 2)
Printf("Written to %s\n", Path.c_str());
- assert(!Options.OnlyASCII || IsASCII(U));
}
void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) {
@@ -314,7 +518,7 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) {
return;
std::string Path = Options.ArtifactPrefix + Prefix + Hash(U);
if (!Options.ExactArtifactPath.empty())
- Path = Options.ExactArtifactPath; // Overrides ArtifactPrefix.
+ Path = Options.ExactArtifactPath; // Overrides ArtifactPrefix.
WriteToFile(U, Path);
Printf("artifact_prefix='%s'; Test unit written to %s\n",
Options.ArtifactPrefix.c_str(), Path.c_str());
@@ -323,7 +527,8 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) {
}
void Fuzzer::SaveCorpus() {
- if (Options.OutputCorpus.empty()) return;
+ if (Options.OutputCorpus.empty())
+ return;
for (const auto &U : Corpus)
WriteToFile(U, DirPlusFile(Options.OutputCorpus, Hash(U)));
if (Options.Verbosity)
@@ -337,18 +542,54 @@ void Fuzzer::PrintStatusForNewUnit(const Unit &U) {
PrintStats("NEW ", "");
if (Options.Verbosity) {
Printf(" L: %zd ", U.size());
- USF.PrintMutationSequence();
+ MD.PrintMutationSequence();
Printf("\n");
}
}
void Fuzzer::ReportNewCoverage(const Unit &U) {
Corpus.push_back(U);
+ UpdateCorpusDistribution();
UnitHashesAddedToCorpus.insert(Hash(U));
+ MD.RecordSuccessfulMutationSequence();
PrintStatusForNewUnit(U);
WriteToOutputCorpus(U);
- if (Options.ExitOnFirst)
- exit(0);
+ NumberOfNewUnitsAdded++;
+}
+
+// Finds a minimal set of units in 'Extra' that add coverage to 'Initial'.
+// We do it by actually executing the units, sometimes more than once,
+// because we may be using different coverage-like signals and the only
+// common thing between them is that we can say "this unit found new stuff".
+UnitVector Fuzzer::FindExtraUnits(const UnitVector &Initial,
+ const UnitVector &Extra) {
+ UnitVector Res = Extra;
+ size_t OldSize = Res.size();
+ for (int Iter = 0; Iter < 10; Iter++) {
+ ShuffleCorpus(&Res);
+ ResetCoverage();
+
+ for (auto &U : Initial)
+ RunOne(U);
+
+ Corpus.clear();
+ for (auto &U : Res)
+ if (RunOne(U))
+ Corpus.push_back(U);
+
+ char Stat[7] = "MIN ";
+ Stat[3] = '0' + Iter;
+ PrintStats(Stat);
+
+ size_t NewSize = Corpus.size();
+ assert(NewSize <= OldSize);
+ Res.swap(Corpus);
+
+ if (NewSize + 5 >= OldSize)
+ break;
+ OldSize = NewSize;
+ }
+ return Res;
}
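
A self-contained sketch of the fixed-point loop that FindExtraUnits runs (editorial addition, not part of the patch). The character-set coverage oracle is a made-up stand-in for RunOne; the shuffle, replay, and shrink structure and the NewSize + 5 >= OldSize stopping rule mirror the code above:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <random>
#include <set>
#include <string>
#include <vector>

using Unit = std::string; // stand-in for fuzzer::Unit

// Hypothetical coverage oracle: a unit adds coverage if it contains a
// character we have not yet seen during this replay.
static bool AddsNewCoverage(const Unit &U, std::set<char> &Seen) {
  bool New = false;
  for (char C : U)
    if (Seen.insert(C).second)
      New = true;
  return New;
}

static std::vector<Unit> FindExtra(const std::vector<Unit> &Initial,
                                   std::vector<Unit> Res) {
  std::mt19937 Rng(0);
  size_t OldSize = Res.size();
  for (int Iter = 0; Iter < 10; Iter++) {
    std::shuffle(Res.begin(), Res.end(), Rng);
    std::set<char> Seen;               // plays the role of ResetCoverage()
    for (const auto &U : Initial)      // replay the units we must keep anyway
      AddsNewCoverage(U, Seen);
    std::vector<Unit> Kept;            // keep only units that still add something
    for (const auto &U : Res)
      if (AddsNewCoverage(U, Seen))
        Kept.push_back(U);
    size_t NewSize = Kept.size();
    Res.swap(Kept);
    if (NewSize + 5 >= OldSize)        // converged enough, stop iterating
      break;
    OldSize = NewSize;
  }
  return Res;
}

int main() {
  std::vector<Unit> Initial = {"abc"};
  std::vector<Unit> Extra = {"abd", "ab", "xyz", "xy", "q"};
  for (const auto &U : FindExtra(Initial, Extra))
    std::printf("kept: %s\n", U.c_str());
  return 0;
}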
void Fuzzer::Merge(const std::vector<std::string> &Corpora) {
@@ -356,72 +597,105 @@ void Fuzzer::Merge(const std::vector<std::string> &Corpora) {
Printf("Merge requires two or more corpus dirs\n");
return;
}
- auto InitialCorpusDir = Corpora[0];
- ReadDir(InitialCorpusDir, nullptr);
- Printf("Merge: running the initial corpus '%s' of %d units\n",
- InitialCorpusDir.c_str(), Corpus.size());
- for (auto &U : Corpus)
- RunOne(U);
-
std::vector<std::string> ExtraCorpora(Corpora.begin() + 1, Corpora.end());
- size_t NumTried = 0;
- size_t NumMerged = 0;
- for (auto &C : ExtraCorpora) {
- Corpus.clear();
- ReadDir(C, nullptr);
- Printf("Merge: merging the extra corpus '%s' of %zd units\n", C.c_str(),
- Corpus.size());
- for (auto &U : Corpus) {
- NumTried++;
- if (RunOne(U)) {
- WriteToOutputCorpus(U);
- NumMerged++;
- }
- }
+ assert(Options.MaxLen > 0);
+ UnitVector Initial, Extra;
+ ReadDirToVectorOfUnits(Corpora[0].c_str(), &Initial, nullptr, Options.MaxLen);
+ for (auto &C : ExtraCorpora)
+ ReadDirToVectorOfUnits(C.c_str(), &Extra, nullptr, Options.MaxLen);
+
+ if (!Initial.empty()) {
+ Printf("=== Minimizing the initial corpus of %zd units\n", Initial.size());
+ Initial = FindExtraUnits({}, Initial);
+ }
+
+ Printf("=== Merging extra %zd units\n", Extra.size());
+ auto Res = FindExtraUnits(Initial, Extra);
+
+ for (auto &U: Res)
+ WriteToOutputCorpus(U);
+
+ Printf("=== Merge: written %zd units\n", Res.size());
+}
+
+// Tries detecting a memory leak on the particular input that we have just
+// executed before calling this function.
+void Fuzzer::TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
+ bool DuringInitialCorpusExecution) {
+ if (!HasMoreMallocsThanFrees) return; // mallocs==frees, a leak is unlikely.
+ if (!Options.DetectLeaks) return;
+ if (!&(EF->__lsan_enable) || !&(EF->__lsan_disable) ||
+ !(EF->__lsan_do_recoverable_leak_check))
+ return; // No lsan.
+ // Run the target once again, but with lsan disabled so that if there is
+ // a real leak we do not report it twice.
+ EF->__lsan_disable();
+ RunOne(Data, Size);
+ EF->__lsan_enable();
+ if (!HasMoreMallocsThanFrees) return; // a leak is unlikely.
+ if (NumberOfLeakDetectionAttempts++ > 1000) {
+ Options.DetectLeaks = false;
+ Printf("INFO: libFuzzer disabled leak detection after every mutation.\n"
+ " Most likely the target function accumulates allocated\n"
+ " memory in a global state w/o actually leaking it.\n"
+ " If LeakSanitizer is enabled in this process it will still\n"
+ " run on the process shutdown.\n");
+ return;
+ }
+ // Now perform the actual lsan pass. This is expensive and we must ensure
+ // we don't call it too often.
+ if (EF->__lsan_do_recoverable_leak_check()) { // Leak is found, report it.
+ if (DuringInitialCorpusExecution)
+ Printf("\nINFO: a leak has been found in the initial corpus.\n\n");
+ Printf("INFO: to ignore leaks on libFuzzer side use -detect_leaks=0.\n\n");
+ CurrentUnitSize = Size;
+ DumpCurrentUnit("leak-");
+ PrintFinalStats();
+ _Exit(Options.ErrorExitCode); // not exit() to disable lsan further on.
}
- Printf("Merge: written %zd out of %zd units\n", NumMerged, NumTried);
}
void Fuzzer::MutateAndTestOne() {
- auto &U = CurrentUnit;
- USF.StartMutationSequence();
+ LazyAllocateCurrentUnitData();
+ MD.StartMutationSequence();
- U = ChooseUnitToMutate();
+ auto &U = ChooseUnitToMutate();
+ assert(CurrentUnitData);
+ size_t Size = U.size();
+ assert(Size <= Options.MaxLen && "Oversized Unit");
+ memcpy(CurrentUnitData, U.data(), Size);
for (int i = 0; i < Options.MutateDepth; i++) {
- size_t Size = U.size();
- U.resize(Options.MaxLen);
- size_t NewSize = USF.Mutate(U.data(), Size, U.size());
+ size_t NewSize = 0;
+ NewSize = MD.Mutate(CurrentUnitData, Size, Options.MaxLen);
assert(NewSize > 0 && "Mutator returned empty unit");
- assert(NewSize <= (size_t)Options.MaxLen &&
+ assert(NewSize <= Options.MaxLen &&
"Mutator return overisized unit");
- U.resize(NewSize);
+ Size = NewSize;
if (i == 0)
StartTraceRecording();
- RunOneAndUpdateCorpus(U);
+ RunOneAndUpdateCorpus(CurrentUnitData, Size);
StopTraceRecording();
+ TryDetectingAMemoryLeak(CurrentUnitData, Size,
+ /*DuringInitialCorpusExecution*/ false);
}
}
// Returns an index of random unit from the corpus to mutate.
// Hypothesis: units added to the corpus last are more likely to be interesting.
-// This function gives more wieght to the more recent units.
+// This function gives more weight to the more recent units.
size_t Fuzzer::ChooseUnitIdxToMutate() {
- size_t N = Corpus.size();
- size_t Total = (N + 1) * N / 2;
- size_t R = USF.GetRand()(Total);
- size_t IdxBeg = 0, IdxEnd = N;
- // Binary search.
- while (IdxEnd - IdxBeg >= 2) {
- size_t Idx = IdxBeg + (IdxEnd - IdxBeg) / 2;
- if (R > (Idx + 1) * Idx / 2)
- IdxBeg = Idx;
- else
- IdxEnd = Idx;
- }
- assert(IdxBeg < N);
- return IdxBeg;
+ size_t Idx =
+ static_cast<size_t>(CorpusDistribution(MD.GetRand().Get_mt19937()));
+ assert(Idx < Corpus.size());
+ return Idx;
+}
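
A standalone sketch of the corpus sampling that ChooseUnitIdxToMutate now relies on (editorial addition, not part of the patch). With intervals 0..N and weights 1..N, as UpdateCorpusDistribution sets up later in this file, index i is drawn with probability (i+1)/(N*(N+1)/2), so the newest unit is N times as likely to be picked as the oldest; this is the same triangular bias the removed hand-rolled binary search implemented:

#include <cstddef>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
  const size_t N = 5; // small stand-in corpus size
  std::vector<double> Intervals(N + 1), Weights(N);
  std::iota(Intervals.begin(), Intervals.end(), 0);
  std::iota(Weights.begin(), Weights.end(), 1);
  std::piecewise_constant_distribution<double> Dist(
      Intervals.begin(), Intervals.end(), Weights.begin());
  std::mt19937 Rng(0);
  std::vector<size_t> Hits(N, 0);
  for (size_t I = 0; I < 150000; I++)
    Hits[static_cast<size_t>(Dist(Rng))]++;
  for (size_t I = 0; I < N; I++)
    std::printf("index %zu picked %zu times\n", I, Hits[I]); // roughly 1:2:3:4:5
  return 0;
}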
+
+void Fuzzer::ResetCoverage() {
+ CoverageController::Reset();
+ MaxCoverage.Reset();
+ CoverageController::Prepare(Options, &MaxCoverage);
}
// Experimental search heuristic: drilling.
@@ -434,16 +708,16 @@ size_t Fuzzer::ChooseUnitIdxToMutate() {
void Fuzzer::Drill() {
// The corpus is already read, shuffled, and minimized.
assert(!Corpus.empty());
- Options.PrintNEW = false; // Don't print NEW status lines when drilling.
+ Options.PrintNEW = false; // Don't print NEW status lines when drilling.
Unit U = ChooseUnitToMutate();
- CHECK_WEAK_API_FUNCTION(__sanitizer_reset_coverage);
- __sanitizer_reset_coverage();
+ ResetCoverage();
std::vector<Unit> SavedCorpus;
SavedCorpus.swap(Corpus);
Corpus.push_back(U);
+ UpdateCorpusDistribution();
assert(Corpus.size() == 1);
RunOne(U);
PrintStats("DRILL ");
@@ -451,19 +725,16 @@ void Fuzzer::Drill() {
SavedOutputCorpusPath.swap(Options.OutputCorpus);
Loop();
- __sanitizer_reset_coverage();
+ ResetCoverage();
PrintStats("REINIT");
SavedOutputCorpusPath.swap(Options.OutputCorpus);
- for (auto &U : SavedCorpus) {
- CurrentUnit = U;
+ for (auto &U : SavedCorpus)
RunOne(U);
- }
PrintStats("MERGE ");
Options.PrintNEW = true;
size_t NumMerged = 0;
for (auto &U : Corpus) {
- CurrentUnit = U;
if (RunOne(U)) {
PrintStatusForNewUnit(U);
NumMerged++;
@@ -478,35 +749,43 @@ void Fuzzer::Drill() {
void Fuzzer::Loop() {
system_clock::time_point LastCorpusReload = system_clock::now();
if (Options.DoCrossOver)
- USF.SetCorpus(&Corpus);
+ MD.SetCorpus(&Corpus);
while (true) {
- SyncCorpus();
auto Now = system_clock::now();
if (duration_cast<seconds>(Now - LastCorpusReload).count()) {
- RereadOutputCorpus();
+ RereadOutputCorpus(Options.MaxLen);
LastCorpusReload = Now;
}
if (TotalNumberOfRuns >= Options.MaxNumberOfRuns)
break;
if (Options.MaxTotalTimeSec > 0 &&
secondsSinceProcessStartUp() >
- static_cast<size_t>(Options.MaxTotalTimeSec))
+ static_cast<size_t>(Options.MaxTotalTimeSec))
break;
// Perform several mutations and runs.
MutateAndTestOne();
}
PrintStats("DONE ", "\n");
+ MD.PrintRecommendedDictionary();
}
-void Fuzzer::SyncCorpus() {
- if (Options.SyncCommand.empty() || Options.OutputCorpus.empty()) return;
- auto Now = system_clock::now();
- if (duration_cast<seconds>(Now - LastExternalSync).count() <
- Options.SyncTimeout)
- return;
- LastExternalSync = Now;
- ExecuteCommand(Options.SyncCommand + " " + Options.OutputCorpus);
+void Fuzzer::UpdateCorpusDistribution() {
+ size_t N = Corpus.size();
+ std::vector<double> Intervals(N + 1);
+ std::vector<double> Weights(N);
+ std::iota(Intervals.begin(), Intervals.end(), 0);
+ std::iota(Weights.begin(), Weights.end(), 1);
+ CorpusDistribution = std::piecewise_constant_distribution<double>(
+ Intervals.begin(), Intervals.end(), Weights.begin());
}
-} // namespace fuzzer
+} // namespace fuzzer
+
+extern "C" {
+
+size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
+ assert(fuzzer::F);
+ return fuzzer::F->GetMD().DefaultMutate(Data, Size, MaxSize);
+}
+} // extern "C"
diff --git a/lib/Fuzzer/FuzzerMain.cpp b/lib/Fuzzer/FuzzerMain.cpp
index c5af5b059091..55f1687b5f0b 100644
--- a/lib/Fuzzer/FuzzerMain.cpp
+++ b/lib/Fuzzer/FuzzerMain.cpp
@@ -12,9 +12,11 @@
#include "FuzzerInterface.h"
#include "FuzzerInternal.h"
+extern "C" {
// This function should be defined by the user.
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+} // extern "C"
int main(int argc, char **argv) {
- return fuzzer::FuzzerDriver(argc, argv, LLVMFuzzerTestOneInput);
+ return fuzzer::FuzzerDriver(&argc, &argv, LLVMFuzzerTestOneInput);
}
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index 30e5b43c0839..65e1650bfeab 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -13,49 +13,43 @@
#include "FuzzerInternal.h"
-#include <algorithm>
namespace fuzzer {
-struct Mutator {
- size_t (MutationDispatcher::*Fn)(uint8_t *Data, size_t Size, size_t Max);
- const char *Name;
-};
-
-struct DictionaryEntry {
- Unit Word;
- size_t PositionHint;
-};
-
-struct MutationDispatcher::Impl {
- std::vector<DictionaryEntry> ManualDictionary;
- std::vector<DictionaryEntry> AutoDictionary;
- std::vector<Mutator> Mutators;
- std::vector<Mutator> CurrentMutatorSequence;
- std::vector<DictionaryEntry> CurrentDictionaryEntrySequence;
- const std::vector<Unit> *Corpus = nullptr;
- FuzzerRandomBase &Rand;
-
- void Add(Mutator M) { Mutators.push_back(M); }
- Impl(FuzzerRandomBase &Rand) : Rand(Rand) {
- Add({&MutationDispatcher::Mutate_EraseByte, "EraseByte"});
- Add({&MutationDispatcher::Mutate_InsertByte, "InsertByte"});
- Add({&MutationDispatcher::Mutate_ChangeByte, "ChangeByte"});
- Add({&MutationDispatcher::Mutate_ChangeBit, "ChangeBit"});
- Add({&MutationDispatcher::Mutate_ShuffleBytes, "ShuffleBytes"});
- Add({&MutationDispatcher::Mutate_ChangeASCIIInteger, "ChangeASCIIInt"});
- Add({&MutationDispatcher::Mutate_CrossOver, "CrossOver"});
- Add({&MutationDispatcher::Mutate_AddWordFromManualDictionary,
- "AddFromManualDict"});
- Add({&MutationDispatcher::Mutate_AddWordFromAutoDictionary,
- "AddFromAutoDict"});
- }
- void SetCorpus(const std::vector<Unit> *Corpus) { this->Corpus = Corpus; }
- size_t AddWordFromDictionary(const std::vector<DictionaryEntry> &D,
- uint8_t *Data, size_t Size, size_t MaxSize);
-};
+const size_t Dictionary::kMaxDictSize;
+
+MutationDispatcher::MutationDispatcher(Random &Rand,
+ const FuzzingOptions &Options)
+ : Rand(Rand), Options(Options) {
+ DefaultMutators.insert(
+ DefaultMutators.begin(),
+ {
+ {&MutationDispatcher::Mutate_EraseByte, "EraseByte"},
+ {&MutationDispatcher::Mutate_InsertByte, "InsertByte"},
+ {&MutationDispatcher::Mutate_ChangeByte, "ChangeByte"},
+ {&MutationDispatcher::Mutate_ChangeBit, "ChangeBit"},
+ {&MutationDispatcher::Mutate_ShuffleBytes, "ShuffleBytes"},
+ {&MutationDispatcher::Mutate_ChangeASCIIInteger, "ChangeASCIIInt"},
+ {&MutationDispatcher::Mutate_CrossOver, "CrossOver"},
+ {&MutationDispatcher::Mutate_AddWordFromManualDictionary,
+ "AddFromManualDict"},
+ {&MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary,
+ "AddFromTempAutoDict"},
+ {&MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary,
+ "AddFromPersAutoDict"},
+ });
-static char FlipRandomBit(char X, FuzzerRandomBase &Rand) {
+ if (EF->LLVMFuzzerCustomMutator)
+ Mutators.push_back({&MutationDispatcher::Mutate_Custom, "Custom"});
+ else
+ Mutators = DefaultMutators;
+
+ if (EF->LLVMFuzzerCustomCrossOver)
+ Mutators.push_back(
+ {&MutationDispatcher::Mutate_CustomCrossOver, "CustomCrossOver"});
+}
+
+static char FlipRandomBit(char X, Random &Rand) {
int Bit = Rand(8);
char Mask = 1 << Bit;
char R;
@@ -67,12 +61,36 @@ static char FlipRandomBit(char X, FuzzerRandomBase &Rand) {
return R;
}
-static char RandCh(FuzzerRandomBase &Rand) {
+static char RandCh(Random &Rand) {
if (Rand.RandBool()) return Rand(256);
const char *Special = "!*'();:@&=+$,/?%#[]123ABCxyz-`~.";
return Special[Rand(sizeof(Special) - 1)];
}
+size_t MutationDispatcher::Mutate_Custom(uint8_t *Data, size_t Size,
+ size_t MaxSize) {
+ return EF->LLVMFuzzerCustomMutator(Data, Size, MaxSize, Rand.Rand());
+}
+
+size_t MutationDispatcher::Mutate_CustomCrossOver(uint8_t *Data, size_t Size,
+ size_t MaxSize) {
+ if (!Corpus || Corpus->size() < 2 || Size == 0)
+ return 0;
+ size_t Idx = Rand(Corpus->size());
+ const Unit &Other = (*Corpus)[Idx];
+ if (Other.empty())
+ return 0;
+ MutateInPlaceHere.resize(MaxSize);
+ auto &U = MutateInPlaceHere;
+ size_t NewSize = EF->LLVMFuzzerCustomCrossOver(
+ Data, Size, Other.data(), Other.size(), U.data(), U.size(), Rand.Rand());
+ if (!NewSize)
+ return 0;
+ assert(NewSize <= MaxSize && "CustomCrossOver returned oversized unit");
+ memcpy(Data, U.data(), NewSize);
+ return NewSize;
+}
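
A hypothetical user-side example of the custom-mutator hook wired up above (editorial addition, not part of the patch). The 4-byte "FUZZ" header and the framing are invented for illustration; LLVMFuzzerMutate is the helper exported from FuzzerLoop.cpp earlier in this patch, and the hook signature matches how the dispatcher invokes EF->LLVMFuzzerCustomMutator:

#include <cstddef>
#include <cstdint>
#include <cstring>

extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize);

// Mutate only the payload after a fixed magic header, delegating the
// byte-level work back to libFuzzer's default mutators.
extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size,
                                          size_t MaxSize, unsigned Seed) {
  (void)Seed; // a real mutator would seed its own RNG with this
  const size_t kHeader = 4;
  if (MaxSize <= kHeader)
    return Size;
  if (Size < kHeader) {
    std::memcpy(Data, "FUZZ", kHeader);
    Size = kHeader;
  }
  size_t NewPayloadSize =
      LLVMFuzzerMutate(Data + kHeader, Size - kHeader, MaxSize - kHeader);
  return kHeader + NewPayloadSize;
}

Linking such a mutator into a fuzz target makes the dispatcher register Mutate_Custom instead of the default mutators, as the constructor above shows.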
+
size_t MutationDispatcher::Mutate_ShuffleBytes(uint8_t *Data, size_t Size,
size_t MaxSize) {
assert(Size);
@@ -122,38 +140,39 @@ size_t MutationDispatcher::Mutate_ChangeBit(uint8_t *Data, size_t Size,
size_t MutationDispatcher::Mutate_AddWordFromManualDictionary(uint8_t *Data,
size_t Size,
size_t MaxSize) {
- return MDImpl->AddWordFromDictionary(MDImpl->ManualDictionary, Data, Size,
- MaxSize);
+ return AddWordFromDictionary(ManualDictionary, Data, Size, MaxSize);
+}
+
+size_t MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary(
+ uint8_t *Data, size_t Size, size_t MaxSize) {
+ return AddWordFromDictionary(TempAutoDictionary, Data, Size, MaxSize);
}
-size_t MutationDispatcher::Mutate_AddWordFromAutoDictionary(uint8_t *Data,
- size_t Size,
- size_t MaxSize) {
- return MDImpl->AddWordFromDictionary(MDImpl->AutoDictionary, Data, Size,
- MaxSize);
+size_t MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary(
+ uint8_t *Data, size_t Size, size_t MaxSize) {
+ return AddWordFromDictionary(PersistentAutoDictionary, Data, Size, MaxSize);
}
-size_t MutationDispatcher::Impl::AddWordFromDictionary(
- const std::vector<DictionaryEntry> &D, uint8_t *Data, size_t Size,
- size_t MaxSize) {
+size_t MutationDispatcher::AddWordFromDictionary(Dictionary &D, uint8_t *Data,
+ size_t Size, size_t MaxSize) {
if (D.empty()) return 0;
- const DictionaryEntry &DE = D[Rand(D.size())];
- const Unit &Word = DE.Word;
- size_t PositionHint = DE.PositionHint;
- bool UsePositionHint = PositionHint != std::numeric_limits<size_t>::max() &&
- PositionHint + Word.size() < Size && Rand.RandBool();
- if (Rand.RandBool()) { // Insert Word.
- if (Size + Word.size() > MaxSize) return 0;
- size_t Idx = UsePositionHint ? PositionHint : Rand(Size + 1);
- memmove(Data + Idx + Word.size(), Data + Idx, Size - Idx);
- memcpy(Data + Idx, Word.data(), Word.size());
- Size += Word.size();
- } else { // Overwrite some bytes with Word.
- if (Word.size() > Size) return 0;
- size_t Idx = UsePositionHint ? PositionHint : Rand(Size - Word.size());
- memcpy(Data + Idx, Word.data(), Word.size());
+ DictionaryEntry &DE = D[Rand(D.size())];
+ const Word &W = DE.GetW();
+ bool UsePositionHint = DE.HasPositionHint() &&
+ DE.GetPositionHint() + W.size() < Size && Rand.RandBool();
+ if (Rand.RandBool()) { // Insert W.
+ if (Size + W.size() > MaxSize) return 0;
+ size_t Idx = UsePositionHint ? DE.GetPositionHint() : Rand(Size + 1);
+ memmove(Data + Idx + W.size(), Data + Idx, Size - Idx);
+ memcpy(Data + Idx, W.data(), W.size());
+ Size += W.size();
+ } else { // Overwrite some bytes with W.
+ if (W.size() > Size) return 0;
+ size_t Idx = UsePositionHint ? DE.GetPositionHint() : Rand(Size - W.size());
+ memcpy(Data + Idx, W.data(), W.size());
}
- CurrentDictionaryEntrySequence.push_back(DE);
+ DE.IncUseCount();
+ CurrentDictionaryEntrySequence.push_back(&DE);
return Size;
}
@@ -192,12 +211,12 @@ size_t MutationDispatcher::Mutate_ChangeASCIIInteger(uint8_t *Data, size_t Size,
size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size,
size_t MaxSize) {
- auto Corpus = MDImpl->Corpus;
if (!Corpus || Corpus->size() < 2 || Size == 0) return 0;
size_t Idx = Rand(Corpus->size());
const Unit &Other = (*Corpus)[Idx];
if (Other.empty()) return 0;
- Unit U(MaxSize);
+ MutateInPlaceHere.resize(MaxSize);
+ auto &U = MutateInPlaceHere;
size_t NewSize =
CrossOver(Data, Size, Other.data(), Other.size(), U.data(), U.size());
assert(NewSize > 0 && "CrossOver returned empty unit");
@@ -207,30 +226,69 @@ size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size,
}
void MutationDispatcher::StartMutationSequence() {
- MDImpl->CurrentMutatorSequence.clear();
- MDImpl->CurrentDictionaryEntrySequence.clear();
+ CurrentMutatorSequence.clear();
+ CurrentDictionaryEntrySequence.clear();
+}
+
+// Copy successful dictionary entries to PersistentAutoDictionary.
+void MutationDispatcher::RecordSuccessfulMutationSequence() {
+ for (auto DE : CurrentDictionaryEntrySequence) {
+ // PersistentAutoDictionary.AddWithSuccessCountOne(DE);
+ DE->IncSuccessCount();
+ // Linear search is fine here as this happens seldom.
+ if (!PersistentAutoDictionary.ContainsWord(DE->GetW()))
+ PersistentAutoDictionary.push_back({DE->GetW(), 1});
+ }
+}
+
+void MutationDispatcher::PrintRecommendedDictionary() {
+ std::vector<DictionaryEntry> V;
+ for (auto &DE : PersistentAutoDictionary)
+ if (!ManualDictionary.ContainsWord(DE.GetW()))
+ V.push_back(DE);
+ if (V.empty()) return;
+ Printf("###### Recommended dictionary. ######\n");
+ for (auto &DE: V) {
+ Printf("\"");
+ PrintASCII(DE.GetW(), "\"");
+ Printf(" # Uses: %zd\n", DE.GetUseCount());
+ }
+ Printf("###### End of recommended dictionary. ######\n");
}
void MutationDispatcher::PrintMutationSequence() {
- Printf("MS: %zd ", MDImpl->CurrentMutatorSequence.size());
- for (auto M : MDImpl->CurrentMutatorSequence)
+ Printf("MS: %zd ", CurrentMutatorSequence.size());
+ for (auto M : CurrentMutatorSequence)
Printf("%s-", M.Name);
- if (!MDImpl->CurrentDictionaryEntrySequence.empty()) {
+ if (!CurrentDictionaryEntrySequence.empty()) {
Printf(" DE: ");
- for (auto DE : MDImpl->CurrentDictionaryEntrySequence) {
+ for (auto DE : CurrentDictionaryEntrySequence) {
Printf("\"");
- PrintASCII(DE.Word, "\"-");
+ PrintASCII(DE->GetW(), "\"-");
}
}
}
-// Mutates Data in place, returns new size.
size_t MutationDispatcher::Mutate(uint8_t *Data, size_t Size, size_t MaxSize) {
+ return MutateImpl(Data, Size, MaxSize, Mutators);
+}
+
+size_t MutationDispatcher::DefaultMutate(uint8_t *Data, size_t Size,
+ size_t MaxSize) {
+ return MutateImpl(Data, Size, MaxSize, DefaultMutators);
+}
+
+// Mutates Data in place, returns new size.
+size_t MutationDispatcher::MutateImpl(uint8_t *Data, size_t Size,
+ size_t MaxSize,
+ const std::vector<Mutator> &Mutators) {
assert(MaxSize > 0);
assert(Size <= MaxSize);
if (Size == 0) {
for (size_t i = 0; i < MaxSize; i++)
Data[i] = RandCh(Rand);
+ if (Options.OnlyASCII)
+ ToASCII(Data, MaxSize);
return MaxSize;
}
assert(Size > 0);
@@ -238,41 +296,31 @@ size_t MutationDispatcher::Mutate(uint8_t *Data, size_t Size, size_t MaxSize) {
// in which case they will return 0.
// Try several times before returning un-mutated data.
for (int Iter = 0; Iter < 10; Iter++) {
- size_t MutatorIdx = Rand(MDImpl->Mutators.size());
- auto M = MDImpl->Mutators[MutatorIdx];
+ auto M = Mutators[Rand(Mutators.size())];
size_t NewSize = (this->*(M.Fn))(Data, Size, MaxSize);
if (NewSize) {
- MDImpl->CurrentMutatorSequence.push_back(M);
+ if (Options.OnlyASCII)
+ ToASCII(Data, NewSize);
+ CurrentMutatorSequence.push_back(M);
return NewSize;
}
}
return Size;
}
-void MutationDispatcher::SetCorpus(const std::vector<Unit> *Corpus) {
- MDImpl->SetCorpus(Corpus);
-}
-
-void MutationDispatcher::AddWordToManualDictionary(const Unit &Word) {
- MDImpl->ManualDictionary.push_back(
- {Word, std::numeric_limits<size_t>::max()});
+void MutationDispatcher::AddWordToManualDictionary(const Word &W) {
+ ManualDictionary.push_back(
+ {W, std::numeric_limits<size_t>::max()});
}
-void MutationDispatcher::AddWordToAutoDictionary(const Unit &Word,
- size_t PositionHint) {
+void MutationDispatcher::AddWordToAutoDictionary(DictionaryEntry DE) {
static const size_t kMaxAutoDictSize = 1 << 14;
- if (MDImpl->AutoDictionary.size() >= kMaxAutoDictSize) return;
- MDImpl->AutoDictionary.push_back({Word, PositionHint});
+ if (TempAutoDictionary.size() >= kMaxAutoDictSize) return;
+ TempAutoDictionary.push_back(DE);
}
void MutationDispatcher::ClearAutoDictionary() {
- MDImpl->AutoDictionary.clear();
-}
-
-MutationDispatcher::MutationDispatcher(FuzzerRandomBase &Rand) : Rand(Rand) {
- MDImpl = new Impl(Rand);
+ TempAutoDictionary.clear();
}
-MutationDispatcher::~MutationDispatcher() { delete MDImpl; }
-
} // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerSanitizerOptions.cpp b/lib/Fuzzer/FuzzerSanitizerOptions.cpp
deleted file mode 100644
index b2f20dd4ddf0..000000000000
--- a/lib/Fuzzer/FuzzerSanitizerOptions.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//===- FuzzerSanitizerOptions.cpp - default flags for sanitizers ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Set default options for sanitizers while running the fuzzer.
-// Options reside in a separate file, so if we don't want to set the default
-// options we simply do not link this file in.
-// ASAN options:
-// * don't dump the coverage to disk.
-// * enable coverage by default.
-// * enable handle_abort.
-//===----------------------------------------------------------------------===//
-extern "C" const char *__asan_default_options() {
- return "coverage_pcs=0:coverage=1:handle_abort=1";
-}
diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp
new file mode 100644
index 000000000000..46c43d0c17f0
--- /dev/null
+++ b/lib/Fuzzer/FuzzerTracePC.cpp
@@ -0,0 +1,71 @@
+//===- FuzzerTracePC.cpp - PC tracing--------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Trace PCs.
+// This module implements __sanitizer_cov_trace_pc, a callback required
+// for -fsanitize-coverage=trace-pc instrumentation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+
+namespace fuzzer {
+
+void PcCoverageMap::Reset() { memset(Map, 0, sizeof(Map)); }
+
+void PcCoverageMap::Update(uintptr_t Addr) {
+ uintptr_t Idx = Addr % kMapSizeInBits;
+ uintptr_t WordIdx = Idx / kBitsInWord;
+ uintptr_t BitIdx = Idx % kBitsInWord;
+ Map[WordIdx] |= 1UL << BitIdx;
+}
+
+size_t PcCoverageMap::MergeFrom(const PcCoverageMap &Other) {
+ uintptr_t Res = 0;
+ for (size_t i = 0; i < kMapSizeInWords; i++)
+ Res += __builtin_popcountl(Map[i] |= Other.Map[i]);
+ return Res;
+}
+
+static PcCoverageMap CurrentMap;
+static thread_local uintptr_t Prev;
+
+void PcMapResetCurrent() {
+ if (Prev) {
+ Prev = 0;
+ CurrentMap.Reset();
+ }
+}
+
+size_t PcMapMergeInto(PcCoverageMap *Map) {
+ if (!Prev)
+ return 0;
+ return Map->MergeFrom(CurrentMap);
+}
+
+static void HandlePC(uint32_t PC) {
+ // We take 12 bits of PC and mix it with the previous PCs.
+ uintptr_t Next = (Prev << 5) ^ (PC & 4095);
+ CurrentMap.Update(Next);
+ Prev = Next;
+}
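
A small standalone illustration of why HandlePC mixes the previous value into the hash (editorial addition, not part of the patch; the PC values are made up). The same callee PC reached from two different predecessors lands on two different map indices, so the map records short path fragments rather than plain block coverage:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Same mixing step as HandlePC above.
static uintptr_t Mix(uintptr_t Prev, uint32_t PC) {
  return (Prev << 5) ^ (PC & 4095);
}

int main() {
  // Same final PC (0x230) reached via two different predecessor PCs.
  uintptr_t ViaA = Mix(Mix(0, 0x100), 0x230);
  uintptr_t ViaB = Mix(Mix(0, 0x200), 0x230);
  std::printf("0x%zx vs 0x%zx\n", (size_t)ViaA, (size_t)ViaB); // distinct indices
  return 0;
}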
+
+} // namespace fuzzer
+
+extern "C" {
+void __sanitizer_cov_trace_pc() {
+ fuzzer::HandlePC(static_cast<uint32_t>(
+ reinterpret_cast<uintptr_t>(__builtin_return_address(0))));
+}
+
+void __sanitizer_cov_trace_pc_indir(int *) {
+ // Stub to allow linking with code built with
+ // -fsanitize=indirect-calls,trace-pc.
+ // This isn't used currently.
+}
+}
diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h
new file mode 100644
index 000000000000..47280ba7faa0
--- /dev/null
+++ b/lib/Fuzzer/FuzzerTracePC.h
@@ -0,0 +1,37 @@
+//===- FuzzerTracePC.h - INTERNAL - Path tracer. --------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Trace PCs.
+// This module implements __sanitizer_cov_trace_pc, a callback required
+// for -fsanitize-coverage=trace-pc instrumentation.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_TRACE_PC_H
+#define LLVM_FUZZER_TRACE_PC_H
+
+namespace fuzzer {
+struct PcCoverageMap {
+ static const size_t kMapSizeInBits = 65371; // Prime.
+ static const size_t kMapSizeInBitsAligned = 65536; // 2^16
+ static const size_t kBitsInWord = (sizeof(uintptr_t) * 8);
+ static const size_t kMapSizeInWords = kMapSizeInBitsAligned / kBitsInWord;
+
+ void Reset();
+ inline void Update(uintptr_t Addr);
+ size_t MergeFrom(const PcCoverageMap &Other);
+
+ uintptr_t Map[kMapSizeInWords] __attribute__((aligned(512)));
+};
+
+// Clears the current PC Map.
+void PcMapResetCurrent();
+// Merges the current PC Map into the combined one, and clears the former.
+size_t PcMapMergeInto(PcCoverageMap *Map);
+}
+
+#endif
diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp
index b2006fa3aa4d..d6e1f79791f5 100644
--- a/lib/Fuzzer/FuzzerTraceState.cpp
+++ b/lib/Fuzzer/FuzzerTraceState.cpp
@@ -41,8 +41,8 @@
// __dfsw_HOOK(a, b, label(a), label(b)) so that __dfsw_HOOK
// gets all the taint labels for the arguments.
// * At the Fuzzer startup we assign a unique DFSan label
-// to every byte of the input string (Fuzzer::CurrentUnit) so that for any
-// chunk of data we know which input bytes it has derived from.
+// to every byte of the input string (Fuzzer::CurrentUnitData) so that
+// for any chunk of data we know which input bytes it has derived from.
// * The __dfsw_* functions (implemented in this file) record the
// parameters (i.e. the application data and the corresponding taint labels)
// in a global state.
@@ -77,6 +77,7 @@
#include <cstring>
#include <thread>
#include <map>
+#include <set>
#if !LLVM_FUZZER_SUPPORTS_DFSAN
// Stubs for dfsan for platforms where dfsan does not exist and weak
@@ -164,24 +165,20 @@ struct LabelRange {
// For now, very simple: put Size bytes of Data at position Pos.
struct TraceBasedMutation {
- static const size_t kMaxSize = 28;
- uint32_t Pos : 24;
- uint32_t Size : 8;
- uint8_t Data[kMaxSize];
+ uint32_t Pos;
+ Word W;
};
-const size_t TraceBasedMutation::kMaxSize;
+// Declared as static globals for faster checks inside the hooks.
+static bool RecordingTraces = false;
+static bool RecordingMemcmp = false;
+static bool RecordingMemmem = false;
class TraceState {
- public:
- TraceState(UserSuppliedFuzzer &USF,
- const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
- : USF(USF), Options(Options), CurrentUnit(CurrentUnit) {
- // Current trace collection is not thread-friendly and it probably
- // does not have to be such, but at least we should not crash in presence
- // of threads. So, just ignore all traces coming from all threads but one.
- IsMyThread = true;
- }
+public:
+ TraceState(MutationDispatcher &MD, const FuzzingOptions &Options,
+ const Fuzzer *F)
+ : MD(MD), Options(Options), F(F) {}
LabelRange GetLabelRange(dfsan_label L);
void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
@@ -200,28 +197,33 @@ class TraceState {
void TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, uint64_t Val,
size_t NumCases, uint64_t *Cases);
int TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
- size_t DataSize);
+ size_t DataSize);
int TryToAddDesiredData(const uint8_t *PresentData,
const uint8_t *DesiredData, size_t DataSize);
void StartTraceRecording() {
- if (!Options.UseTraces) return;
- RecordingTraces = true;
+ if (!Options.UseTraces && !Options.UseMemcmp)
+ return;
+ RecordingTraces = Options.UseTraces;
+ RecordingMemcmp = Options.UseMemcmp;
+ RecordingMemmem = Options.UseMemmem;
NumMutations = 0;
- USF.GetMD().ClearAutoDictionary();
+ InterestingWords.clear();
+ MD.ClearAutoDictionary();
}
void StopTraceRecording() {
- if (!RecordingTraces) return;
+ if (!RecordingTraces && !RecordingMemcmp)
+ return;
RecordingTraces = false;
+ RecordingMemcmp = false;
for (size_t i = 0; i < NumMutations; i++) {
auto &M = Mutations[i];
- Unit U(M.Data, M.Data + M.Size);
if (Options.Verbosity >= 2) {
- AutoDictUnitCounts[U]++;
+ AutoDictUnitCounts[M.W]++;
AutoDictAdds++;
if ((AutoDictAdds & (AutoDictAdds - 1)) == 0) {
- typedef std::pair<size_t, Unit> CU;
+ typedef std::pair<size_t, Word> CU;
std::vector<CU> CountedUnits;
for (auto &I : AutoDictUnitCounts)
CountedUnits.push_back(std::make_pair(I.second, I.first));
@@ -235,17 +237,17 @@ class TraceState {
}
}
}
- USF.GetMD().AddWordToAutoDictionary(U, M.Pos);
+ MD.AddWordToAutoDictionary({M.W, M.Pos});
}
+ for (auto &W : InterestingWords)
+ MD.AddWordToAutoDictionary({W});
}
void AddMutation(uint32_t Pos, uint32_t Size, const uint8_t *Data) {
if (NumMutations >= kMaxMutations) return;
- assert(Size <= TraceBasedMutation::kMaxSize);
auto &M = Mutations[NumMutations++];
M.Pos = Pos;
- M.Size = Size;
- memcpy(M.Data, Data, Size);
+ M.W.Set(Data, Size);
}
void AddMutation(uint32_t Pos, uint32_t Size, uint64_t Data) {
@@ -253,26 +255,61 @@ class TraceState {
AddMutation(Pos, Size, reinterpret_cast<uint8_t*>(&Data));
}
+ void AddInterestingWord(const uint8_t *Data, size_t Size) {
+ if (!RecordingMemmem || !F->InFuzzingThread()) return;
+ if (Size <= 1) return;
+ Size = std::min(Size, Word::GetMaxSize());
+ Word W(Data, Size);
+ InterestingWords.insert(W);
+ }
+
+ void EnsureDfsanLabels(size_t Size) {
+ for (; LastDfsanLabel < Size; LastDfsanLabel++) {
+ dfsan_label L = dfsan_create_label("input", (void *)(LastDfsanLabel + 1));
+ // We assume that no one else has called dfsan_create_label before.
+ if (L != LastDfsanLabel + 1) {
+ Printf("DFSan labels are not starting from 1, exiting\n");
+ exit(1);
+ }
+ }
+ }
+
private:
bool IsTwoByteData(uint64_t Data) {
int64_t Signed = static_cast<int64_t>(Data);
Signed >>= 16;
return Signed == 0 || Signed == -1L;
}
- bool RecordingTraces = false;
+
+ // We don't want to create too many trace-based mutations as it is both
+ // expensive and useless. So after some number of mutations is collected,
+ // start rejecting some of them. The more mutations have been collected, the
+ // more we reject.
+ bool WantToHandleOneMoreMutation() {
+ const size_t FirstN = 64;
+ // Gladly handle first N mutations.
+ if (NumMutations <= FirstN) return true;
+ size_t Diff = NumMutations - FirstN;
+ size_t DiffLog = sizeof(long) * 8 - __builtin_clzl((long)Diff);
+ assert(DiffLog > 0 && DiffLog < 64);
+ bool WantThisOne = MD.GetRand()(1 << DiffLog) == 0; // 1 in 2^DiffLog.
+ return WantThisOne;
+ }
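
A quick standalone illustration of the throttling that WantToHandleOneMoreMutation applies (editorial addition, not part of the patch; the sample counts are arbitrary). Past the first 64 mutations a candidate is kept with probability 1/2^DiffLog, so the stream of recorded mutations thins out roughly geometrically:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t FirstN = 64;
  for (size_t NumMutations : {64u, 65u, 200u, 2000u, 60000u}) {
    if (NumMutations <= FirstN) {
      std::printf("%zu mutations -> always accept the next one\n",
                  NumMutations);
      continue;
    }
    size_t Diff = NumMutations - FirstN;
    // Same DiffLog computation as WantToHandleOneMoreMutation above.
    size_t DiffLog = sizeof(long) * 8 - __builtin_clzl((long)Diff);
    std::printf("%zu mutations -> accept the next one with probability 1/%lu\n",
                NumMutations, 1ul << DiffLog);
  }
  return 0;
}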
+
static const size_t kMaxMutations = 1 << 16;
size_t NumMutations;
TraceBasedMutation Mutations[kMaxMutations];
+ // TODO: std::set is too inefficient, need to have a custom DS here.
+ std::set<Word> InterestingWords;
LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)];
- UserSuppliedFuzzer &USF;
- const Fuzzer::FuzzingOptions &Options;
- const Unit &CurrentUnit;
- std::map<Unit, size_t> AutoDictUnitCounts;
+ size_t LastDfsanLabel = 0;
+ MutationDispatcher &MD;
+ const FuzzingOptions Options;
+ const Fuzzer *F;
+ std::map<Word, size_t> AutoDictUnitCounts;
size_t AutoDictAdds = 0;
- static thread_local bool IsMyThread;
};
-thread_local bool TraceState::IsMyThread;
LabelRange TraceState::GetLabelRange(dfsan_label L) {
LabelRange &LR = LabelRanges[L];
@@ -288,7 +325,7 @@ void TraceState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
dfsan_label L2) {
assert(ReallyHaveDFSan());
- if (!RecordingTraces || !IsMyThread) return;
+ if (!RecordingTraces || !F->InFuzzingThread()) return;
if (L1 == 0 && L2 == 0)
return; // Not actionable.
if (L1 != 0 && L2 != 0)
@@ -303,7 +340,7 @@ void TraceState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
AddMutation(Pos, CmpSize, Data - 1);
}
- if (CmpSize > LR.End - LR.Beg)
+ if (CmpSize > (size_t)(LR.End - LR.Beg))
AddMutation(LR.Beg, (unsigned)(LR.End - LR.Beg), Data);
@@ -318,7 +355,7 @@ void TraceState::DFSanMemcmpCallback(size_t CmpSize, const uint8_t *Data1,
dfsan_label L2) {
assert(ReallyHaveDFSan());
- if (!RecordingTraces || !IsMyThread) return;
+ if (!RecordingMemcmp || !F->InFuzzingThread()) return;
if (L1 == 0 && L2 == 0)
return; // Not actionable.
if (L1 != 0 && L2 != 0)
@@ -337,7 +374,7 @@ void TraceState::DFSanSwitchCallback(uint64_t PC, size_t ValSizeInBits,
uint64_t Val, size_t NumCases,
uint64_t *Cases, dfsan_label L) {
assert(ReallyHaveDFSan());
- if (!RecordingTraces || !IsMyThread) return;
+ if (!RecordingTraces || !F->InFuzzingThread()) return;
if (!L) return; // Not actionable.
LabelRange LR = GetLabelRange(L);
size_t ValSize = ValSizeInBits / 8;
@@ -362,15 +399,18 @@ void TraceState::DFSanSwitchCallback(uint64_t PC, size_t ValSizeInBits,
int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
size_t DataSize) {
+ if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0;
+ const uint8_t *UnitData;
+ auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData);
int Res = 0;
- const uint8_t *Beg = CurrentUnit.data();
- const uint8_t *End = Beg + CurrentUnit.size();
+ const uint8_t *Beg = UnitData;
+ const uint8_t *End = Beg + UnitSize;
for (const uint8_t *Cur = Beg; Cur < End; Cur++) {
Cur = (uint8_t *)memmem(Cur, End - Cur, &PresentData, DataSize);
if (!Cur)
break;
size_t Pos = Cur - Beg;
- assert(Pos < CurrentUnit.size());
+ assert(Pos < UnitSize);
AddMutation(Pos, DataSize, DesiredData);
AddMutation(Pos, DataSize, DesiredData + 1);
AddMutation(Pos, DataSize, DesiredData - 1);
@@ -382,15 +422,18 @@ int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
int TraceState::TryToAddDesiredData(const uint8_t *PresentData,
const uint8_t *DesiredData,
size_t DataSize) {
+ if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0;
+ const uint8_t *UnitData;
+ auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData);
int Res = 0;
- const uint8_t *Beg = CurrentUnit.data();
- const uint8_t *End = Beg + CurrentUnit.size();
+ const uint8_t *Beg = UnitData;
+ const uint8_t *End = Beg + UnitSize;
for (const uint8_t *Cur = Beg; Cur < End; Cur++) {
Cur = (uint8_t *)memmem(Cur, End - Cur, PresentData, DataSize);
if (!Cur)
break;
size_t Pos = Cur - Beg;
- assert(Pos < CurrentUnit.size());
+ assert(Pos < UnitSize);
AddMutation(Pos, DataSize, DesiredData);
Res++;
}
@@ -399,7 +442,7 @@ int TraceState::TryToAddDesiredData(const uint8_t *PresentData,
void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
uint64_t Arg1, uint64_t Arg2) {
- if (!RecordingTraces || !IsMyThread) return;
+ if (!RecordingTraces || !F->InFuzzingThread()) return;
if ((CmpType == ICMP_EQ || CmpType == ICMP_NE) && Arg1 == Arg2)
return; // No reason to mutate.
int Added = 0;
@@ -415,8 +458,8 @@ void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1,
const uint8_t *Data2) {
- if (!RecordingTraces || !IsMyThread) return;
- CmpSize = std::min(CmpSize, TraceBasedMutation::kMaxSize);
+ if (!RecordingMemcmp || !F->InFuzzingThread()) return;
+ CmpSize = std::min(CmpSize, Word::GetMaxSize());
int Added2 = TryToAddDesiredData(Data1, Data2, CmpSize);
int Added1 = TryToAddDesiredData(Data2, Data1, CmpSize);
if ((Added1 || Added2) && Options.Verbosity >= 3) {
@@ -430,7 +473,7 @@ void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1,
void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits,
uint64_t Val, size_t NumCases,
uint64_t *Cases) {
- if (!RecordingTraces || !IsMyThread) return;
+ if (!RecordingTraces || !F->InFuzzingThread()) return;
size_t ValSize = ValSizeInBits / 8;
bool TryShort = IsTwoByteData(Val);
for (size_t i = 0; i < NumCases; i++)
@@ -451,9 +494,6 @@ static TraceState *TS;
void Fuzzer::StartTraceRecording() {
if (!TS) return;
- if (ReallyHaveDFSan())
- for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++)
- dfsan_set_label(i + 1, &CurrentUnit[i], 1);
TS->StartTraceRecording();
}
@@ -462,20 +502,17 @@ void Fuzzer::StopTraceRecording() {
TS->StopTraceRecording();
}
-void Fuzzer::InitializeTraceState() {
- if (!Options.UseTraces) return;
- TS = new TraceState(USF, Options, CurrentUnit);
- CurrentUnit.resize(Options.MaxLen);
- // The rest really requires DFSan.
+void Fuzzer::AssignTaintLabels(uint8_t *Data, size_t Size) {
+ if (!Options.UseTraces && !Options.UseMemcmp) return;
if (!ReallyHaveDFSan()) return;
- for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++) {
- dfsan_label L = dfsan_create_label("input", (void*)(i + 1));
- // We assume that no one else has called dfsan_create_label before.
- if (L != i + 1) {
- Printf("DFSan labels are not starting from 1, exiting\n");
- exit(1);
- }
- }
+ TS->EnsureDfsanLabels(Size);
+ for (size_t i = 0; i < Size; i++)
+ dfsan_set_label(i + 1, &Data[i], 1);
+}
+
+void Fuzzer::InitializeTraceState() {
+ if (!Options.UseTraces && !Options.UseMemcmp) return;
+ TS = new TraceState(MD, Options, this);
}
static size_t InternalStrnlen(const char *S, size_t MaxLen) {
@@ -487,12 +524,14 @@ static size_t InternalStrnlen(const char *S, size_t MaxLen) {
} // namespace fuzzer
using fuzzer::TS;
+using fuzzer::RecordingTraces;
+using fuzzer::RecordingMemcmp;
extern "C" {
void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
uint64_t Arg2, dfsan_label L0,
dfsan_label L1, dfsan_label L2) {
- if (!TS) return;
+ if (!RecordingTraces) return;
assert(L0 == 0);
uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
uint64_t CmpSize = (SizeAndType >> 32) / 8;
@@ -502,7 +541,7 @@ void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
void __dfsw___sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases,
dfsan_label L1, dfsan_label L2) {
- if (!TS) return;
+ if (!RecordingTraces) return;
uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
TS->DFSanSwitchCallback(PC, Cases[1], Val, Cases[0], Cases+2, L1);
}
@@ -510,7 +549,7 @@ void __dfsw___sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases,
void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
size_t n, dfsan_label s1_label,
dfsan_label s2_label, dfsan_label n_label) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
dfsan_label L1 = dfsan_read_label(s1, n);
dfsan_label L2 = dfsan_read_label(s2, n);
TS->DFSanMemcmpCallback(n, reinterpret_cast<const uint8_t *>(s1),
@@ -520,7 +559,7 @@ void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2,
size_t n, dfsan_label s1_label,
dfsan_label s2_label, dfsan_label n_label) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
n = std::min(n, fuzzer::InternalStrnlen(s1, n));
n = std::min(n, fuzzer::InternalStrnlen(s2, n));
dfsan_label L1 = dfsan_read_label(s1, n);
@@ -531,7 +570,7 @@ void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2,
void dfsan_weak_hook_strcmp(void *caller_pc, const char *s1, const char *s2,
dfsan_label s1_label, dfsan_label s2_label) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
size_t Len1 = strlen(s1);
size_t Len2 = strlen(s2);
size_t N = std::min(Len1, Len2);
@@ -550,7 +589,7 @@ void dfsan_weak_hook_strcmp(void *caller_pc, const char *s1, const char *s2,
#if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS
void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1,
const void *s2, size_t n, int result) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
if (result == 0) return; // No reason to mutate.
if (n <= 1) return; // Not interesting.
TS->TraceMemcmpCallback(n, reinterpret_cast<const uint8_t *>(s1),
@@ -559,7 +598,7 @@ void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1,
void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1,
const char *s2, size_t n, int result) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
if (result == 0) return; // No reason to mutate.
size_t Len1 = fuzzer::InternalStrnlen(s1, n);
size_t Len2 = fuzzer::InternalStrnlen(s2, n);
@@ -572,7 +611,7 @@ void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1,
void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1,
const char *s2, int result) {
- if (!TS) return;
+ if (!RecordingMemcmp) return;
if (result == 0) return; // No reason to mutate.
size_t Len1 = strlen(s1);
size_t Len2 = strlen(s2);
@@ -582,12 +621,33 @@ void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1,
reinterpret_cast<const uint8_t *>(s2));
}
+void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
+ const char *s2, size_t n, int result) {
+ return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result);
+}
+void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
+ const char *s2, int result) {
+ return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result);
+}
+void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
+ const char *s2, char *result) {
+ TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
+}
+void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
+ const char *s2, char *result) {
+ TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
+}
+void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1,
+ const void *s2, size_t len2, void *result) {
+ // TODO: can't hook memmem since memmem is used by libFuzzer.
+}
+
#endif // LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS
__attribute__((visibility("default")))
void __sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
uint64_t Arg2) {
- if (!TS) return;
+ if (!RecordingTraces) return;
uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
uint64_t CmpSize = (SizeAndType >> 32) / 8;
uint64_t Type = (SizeAndType << 32) >> 32;
@@ -596,7 +656,7 @@ void __sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
__attribute__((visibility("default")))
void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {
- if (!TS) return;
+ if (!RecordingTraces) return;
uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
TS->TraceSwitchCallback(PC, Cases[1], Val, Cases[0], Cases + 2);
}
diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp
index d7226cfce966..c2ae94c4d3de 100644
--- a/lib/Fuzzer/FuzzerUtil.cpp
+++ b/lib/Fuzzer/FuzzerUtil.cpp
@@ -12,21 +12,32 @@
#include "FuzzerInternal.h"
#include <sstream>
#include <iomanip>
+#include <sys/resource.h>
#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
#include <cassert>
+#include <chrono>
#include <cstring>
#include <signal.h>
#include <sstream>
#include <unistd.h>
+#include <errno.h>
+#include <thread>
namespace fuzzer {
-void Print(const Unit &v, const char *PrintAfter) {
- for (auto x : v)
- Printf("0x%x,", (unsigned) x);
+void PrintHexArray(const uint8_t *Data, size_t Size,
+ const char *PrintAfter) {
+ for (size_t i = 0; i < Size; i++)
+ Printf("0x%x,", (unsigned)Data[i]);
Printf("%s", PrintAfter);
}
+void Print(const Unit &v, const char *PrintAfter) {
+ PrintHexArray(v.data(), v.size(), PrintAfter);
+}
+
void PrintASCIIByte(uint8_t Byte) {
if (Byte == '\\')
Printf("\\\\");
@@ -44,10 +55,12 @@ void PrintASCII(const uint8_t *Data, size_t Size, const char *PrintAfter) {
Printf("%s", PrintAfter);
}
+void PrintASCII(const Word &W, const char *PrintAfter) {
+ PrintASCII(W.data(), W.size(), PrintAfter);
+}
+
void PrintASCII(const Unit &U, const char *PrintAfter) {
- for (auto X : U)
- PrintASCIIByte(X);
- Printf("%s", PrintAfter);
+ PrintASCII(U.data(), U.size(), PrintAfter);
}
std::string Hash(const Unit &U) {
@@ -63,22 +76,73 @@ static void AlarmHandler(int, siginfo_t *, void *) {
Fuzzer::StaticAlarmCallback();
}
-void SetTimer(int Seconds) {
- struct itimerval T {{Seconds, 0}, {Seconds, 0}};
- int Res = setitimer(ITIMER_REAL, &T, nullptr);
- assert(Res == 0);
+static void CrashHandler(int, siginfo_t *, void *) {
+ Fuzzer::StaticCrashSignalCallback();
+}
+
+static void InterruptHandler(int, siginfo_t *, void *) {
+ Fuzzer::StaticInterruptCallback();
+}
+
+static void SetSigaction(int signum,
+ void (*callback)(int, siginfo_t *, void *)) {
struct sigaction sigact;
memset(&sigact, 0, sizeof(sigact));
- sigact.sa_sigaction = AlarmHandler;
- Res = sigaction(SIGALRM, &sigact, 0);
- assert(Res == 0);
+ sigact.sa_sigaction = callback;
+ if (sigaction(signum, &sigact, 0)) {
+ Printf("libFuzzer: sigaction failed with %d\n", errno);
+ exit(1);
+ }
}
+void SetTimer(int Seconds) {
+ struct itimerval T {{Seconds, 0}, {Seconds, 0}};
+ if (setitimer(ITIMER_REAL, &T, nullptr)) {
+ Printf("libFuzzer: setitimer failed with %d\n", errno);
+ exit(1);
+ }
+ SetSigaction(SIGALRM, AlarmHandler);
+}
+
+void SetSigSegvHandler() { SetSigaction(SIGSEGV, CrashHandler); }
+void SetSigBusHandler() { SetSigaction(SIGBUS, CrashHandler); }
+void SetSigAbrtHandler() { SetSigaction(SIGABRT, CrashHandler); }
+void SetSigIllHandler() { SetSigaction(SIGILL, CrashHandler); }
+void SetSigFpeHandler() { SetSigaction(SIGFPE, CrashHandler); }
+void SetSigIntHandler() { SetSigaction(SIGINT, InterruptHandler); }
+void SetSigTermHandler() { SetSigaction(SIGTERM, InterruptHandler); }
+
int NumberOfCpuCores() {
- FILE *F = popen("nproc", "r");
- int N = 0;
- fscanf(F, "%d", &N);
- fclose(F);
+ const char *CmdLine = nullptr;
+ if (LIBFUZZER_LINUX) {
+ CmdLine = "nproc";
+ } else if (LIBFUZZER_APPLE) {
+ CmdLine = "sysctl -n hw.ncpu";
+ } else {
+ assert(0 && "NumberOfCpuCores() is not implemented for your platform");
+ }
+
+ FILE *F = popen(CmdLine, "r");
+ int N = 1;
+ if (!F || fscanf(F, "%d", &N) != 1) {
+ Printf("WARNING: Failed to parse output of command \"%s\" in %s(). "
+ "Assuming CPU count of 1.\n",
+ CmdLine, __func__);
+ N = 1;
+ }
+
+ if (pclose(F)) {
+ Printf("WARNING: Executing command \"%s\" failed in %s(). "
+ "Assuming CPU count of 1.\n",
+ CmdLine, __func__);
+ N = 1;
+ }
+ if (N < 1) {
+ Printf("WARNING: Reported CPU count (%d) from command \"%s\" was invalid "
+ "in %s(). Assuming CPU count of 1.\n",
+ N, CmdLine, __func__);
+ N = 1;
+ }
return N;
}
@@ -86,9 +150,10 @@ int ExecuteCommand(const std::string &Command) {
return system(Command.c_str());
}
-bool ToASCII(Unit &U) {
+bool ToASCII(uint8_t *Data, size_t Size) {
bool Changed = false;
- for (auto &X : U) {
+ for (size_t i = 0; i < Size; i++) {
+ uint8_t &X = Data[i];
auto NewX = X;
NewX &= 127;
if (!isspace(NewX) && !isprint(NewX))
@@ -99,9 +164,11 @@ bool ToASCII(Unit &U) {
return Changed;
}
-bool IsASCII(const Unit &U) {
- for (auto X : U)
- if (!(isprint(X) || isspace(X))) return false;
+bool IsASCII(const Unit &U) { return IsASCII(U.data(), U.size()); }
+
+bool IsASCII(const uint8_t *Data, size_t Size) {
+ for (size_t i = 0; i < Size; i++)
+ if (!(isprint(Data[i]) || isspace(Data[i]))) return false;
return true;
}
@@ -178,8 +245,11 @@ bool ParseDictionaryFile(const std::string &Text, std::vector<Unit> *Units) {
return true;
}
-int GetPid() { return getpid(); }
+void SleepSeconds(int Seconds) {
+ std::this_thread::sleep_for(std::chrono::seconds(Seconds));
+}
+int GetPid() { return getpid(); }
std::string Base64(const Unit &U) {
static const char Table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -209,4 +279,19 @@ std::string Base64(const Unit &U) {
return Res;
}
+size_t GetPeakRSSMb() {
+ struct rusage usage;
+ if (getrusage(RUSAGE_SELF, &usage))
+ return 0;
+ if (LIBFUZZER_LINUX) {
+ // ru_maxrss is in KiB
+ return usage.ru_maxrss >> 10;
+ } else if (LIBFUZZER_APPLE) {
+ // ru_maxrss is in bytes
+ return usage.ru_maxrss >> 20;
+ }
+ assert(0 && "GetPeakRSSMb() is not implemented for your platform");
+ return 0;
+}
+
} // namespace fuzzer
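
The two shift amounts in GetPeakRSSMb() encode the unit difference noted in its comments: getrusage() reports ru_maxrss in KiB on Linux but in bytes on macOS. A compile-time check with assumed sample values (illustrative only, not part of the patch) shows both branches map the same physical peak to the same MB figure:

// Illustrative only: 512 MB of peak RSS expressed in each platform's unit.
#include <cstddef>
static_assert((static_cast<std::size_t>(524288) >> 10) == 512,
              "Linux: ru_maxrss of 524288 KiB is 512 MB");
static_assert((static_cast<std::size_t>(536870912) >> 20) == 512,
              "macOS: ru_maxrss of 536870912 bytes is 512 MB");
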
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
new file mode 100644
index 000000000000..0f789981d745
--- /dev/null
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -0,0 +1,287 @@
+//===- afl_driver.cpp - a glue between AFL and libFuzzer --------*- C++ -* ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+
+/* This file allows fuzzing libFuzzer-style target functions
+ (LLVMFuzzerTestOneInput) with AFL using AFL's persistent (in-process) mode.
+
+Usage:
+################################################################################
+cat << EOF > test_fuzzer.cc
+#include <stdint.h>
+#include <stddef.h>
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ if (size > 0 && data[0] == 'H')
+ if (size > 1 && data[1] == 'I')
+ if (size > 2 && data[2] == '!')
+ __builtin_trap();
+ return 0;
+}
+EOF
+# Build your target with -fsanitize-coverage=trace-pc using fresh clang.
+clang -g -fsanitize-coverage=trace-pc test_fuzzer.cc -c
+# Build afl-llvm-rt.o.c from the AFL distribution.
+clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c
+# Build this file, link it with afl-llvm-rt.o.o and the target code.
+clang++ afl_driver.cpp test_fuzzer.o afl-llvm-rt.o.o
+# Run AFL:
+rm -rf IN OUT; mkdir IN OUT; echo z > IN/z;
+$AFL_HOME/afl-fuzz -i IN -o OUT ./a.out
+################################################################################
+Environment Variables:
+There are a few environment variables that can be set to use features that
+afl-fuzz doesn't have.
+
+AFL_DRIVER_STDERR_DUPLICATE_FILENAME: Setting this *appends* stderr to the file
+specified. If the file does not exist, it is created. This is useful for getting
+stack traces (when using ASan, for example) or original error messages for
+hard-to-reproduce bugs.
+
+AFL_DRIVER_EXTRA_STATS_FILENAME: Setting this causes afl_driver to write extra
+statistics to the file specified. Currently these are peak_rss_mb
+(the peak resident set size, in MB) and slowest_unit_time_secs. If
+the file does not exist it is created. If the file does exist then
+afl_driver assumes it was restarted by afl-fuzz and will try to read old
+statistics from the file. If that fails then the process will quit.
+
+*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+// Platform detection. Copied from FuzzerInternal.h
+#ifdef __linux__
+#define LIBFUZZER_LINUX 1
+#define LIBFUZZER_APPLE 0
+#elif __APPLE__
+#define LIBFUZZER_LINUX 0
+#define LIBFUZZER_APPLE 1
+#else
+#error "Support for your platform has not been implemented"
+#endif
+
+// Used to avoid repeating error checking boilerplate. If cond is false, a
+// fatal error has occurred in the program. In this event print error_message
+// to stderr and abort(). Otherwise do nothing. Note that setting
+// AFL_DRIVER_STDERR_DUPLICATE_FILENAME may cause error_message to be appended
+// to the file as well, if the error occurs after the duplication is performed.
+#define CHECK_ERROR(cond, error_message) \
+ if (!(cond)) { \
+ fprintf(stderr, (error_message)); \
+ abort(); \
+ }
+
+// libFuzzer interface is thin, so we don't include any libFuzzer headers.
+extern "C" {
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+__attribute__((weak)) int LLVMFuzzerInitialize(int *argc, char ***argv);
+}
+
+// Notify AFL about persistent mode.
+static volatile char AFL_PERSISTENT[] = "##SIG_AFL_PERSISTENT##";
+extern "C" int __afl_persistent_loop(unsigned int);
+static volatile char suppress_warning2 = AFL_PERSISTENT[0];
+
+// Notify AFL about deferred forkserver.
+static volatile char AFL_DEFER_FORKSVR[] = "##SIG_AFL_DEFER_FORKSRV##";
+extern "C" void __afl_manual_init();
+static volatile char suppress_warning1 = AFL_DEFER_FORKSVR[0];
+
+// Input buffer.
+static const size_t kMaxAflInputSize = 1 << 20;
+static uint8_t AflInputBuf[kMaxAflInputSize];
+
+// Variables we need for writing to the extra stats file.
+static FILE *extra_stats_file = NULL;
+static uint32_t previous_peak_rss = 0;
+static time_t slowest_unit_time_secs = 0;
+static const int kNumExtraStats = 2;
+static const char *kExtraStatsFormatString = "peak_rss_mb : %u\n"
+ "slowest_unit_time_sec : %u\n";
+
+// Copied from FuzzerUtil.cpp.
+size_t GetPeakRSSMb() {
+ struct rusage usage;
+ if (getrusage(RUSAGE_SELF, &usage))
+ return 0;
+ if (LIBFUZZER_LINUX) {
+ // ru_maxrss is in KiB
+ return usage.ru_maxrss >> 10;
+ } else if (LIBFUZZER_APPLE) {
+ // ru_maxrss is in bytes
+ return usage.ru_maxrss >> 20;
+ }
+ assert(0 && "GetPeakRSSMb() is not implemented for your platform");
+ return 0;
+}
+
+// Based on SetSigaction in FuzzerUtil.cpp
+static void SetSigaction(int signum,
+ void (*callback)(int, siginfo_t *, void *)) {
+ struct sigaction sigact;
+ memset(&sigact, 0, sizeof(sigact));
+ sigact.sa_sigaction = callback;
+ if (sigaction(signum, &sigact, 0)) {
+ fprintf(stderr, "libFuzzer: sigaction failed with %d\n", errno);
+ exit(1);
+ }
+}
+
+// Write extra stats to the file specified by the user. If none is specified
+// this function will never be called.
+static void write_extra_stats() {
+ uint32_t peak_rss = GetPeakRSSMb();
+
+ if (peak_rss < previous_peak_rss)
+ peak_rss = previous_peak_rss;
+
+ int chars_printed = fprintf(extra_stats_file, kExtraStatsFormatString,
+ peak_rss, slowest_unit_time_secs);
+
+ CHECK_ERROR(chars_printed != 0, "Failed to write extra_stats_file");
+
+ CHECK_ERROR(fclose(extra_stats_file) == 0,
+ "Failed to close extra_stats_file");
+}
+
+// Call write_extra_stats before we exit.
+static void crash_handler(int, siginfo_t *, void *) {
+ // Make sure we don't try calling write_extra_stats again if we crashed while
+ // trying to call it.
+ static bool first_crash = true;
+ CHECK_ERROR(first_crash,
+ "Crashed in crash signal handler. This is a bug in the fuzzer.");
+
+ first_crash = false;
+ write_extra_stats();
+}
+
+// If the user has specified an extra_stats_file through the environment
+// variable AFL_DRIVER_EXTRA_STATS_FILENAME, perform the necessary setup to
+// write stats to it on exit; if no file is specified, do nothing. Otherwise,
+// install signal and exit handlers that write to the file when the process
+// exits. Then, if the file doesn't exist, create it and set the extra stats to
+// 0; if it does exist, read the initial values of the extra stats from the
+// file and check that the file is writable.
+static void maybe_initialize_extra_stats() {
+ // If AFL_DRIVER_EXTRA_STATS_FILENAME isn't set then we have nothing to do.
+ char *extra_stats_filename = getenv("AFL_DRIVER_EXTRA_STATS_FILENAME");
+ if (!extra_stats_filename)
+ return;
+
+ // Open the file and find the previous peak_rss_mb value.
+ // This is necessary because the fuzzing process is restarted after N
+ // iterations are completed. So we may need to get this value from a previous
+ // process to be accurate.
+ extra_stats_file = fopen(extra_stats_filename, "r");
+
+ // If extra_stats_file already exists: read old stats from it.
+ if (extra_stats_file) {
+ int matches = fscanf(extra_stats_file, kExtraStatsFormatString,
+ &previous_peak_rss, &slowest_unit_time_secs);
+
+ // Make sure we have read a real extra stats file and that we have used it
+ // to set slowest_unit_time_secs and previous_peak_rss.
+ CHECK_ERROR(matches == kNumExtraStats, "Extra stats file is corrupt");
+
+ CHECK_ERROR(fclose(extra_stats_file) == 0, "Failed to close file");
+
+ // Now open the file for writing.
+ extra_stats_file = fopen(extra_stats_filename, "w");
+ CHECK_ERROR(extra_stats_file,
+ "Failed to open extra stats file for writing");
+ } else {
+ // Looks like this is the first time this has been called in this fuzzing job.
+ extra_stats_file = fopen(extra_stats_filename, "w+");
+ CHECK_ERROR(extra_stats_file, "failed to create extra stats file");
+ }
+
+ // Make sure that crash_handler gets called on any kind of fatal error.
+ int crash_signals[] = {SIGSEGV, SIGBUS, SIGABRT, SIGILL, SIGFPE, SIGINT,
+ SIGTERM};
+
+ const size_t num_signals = sizeof(crash_signals) / sizeof(crash_signals[0]);
+
+ for (size_t idx = 0; idx < num_signals; idx++)
+ SetSigaction(crash_signals[idx], crash_handler);
+
+ // Make sure it gets called on other kinds of exits.
+ atexit(write_extra_stats);
+}
+
+// If the user asks us to duplicate stderr, then do it.
+static void maybe_duplicate_stderr() {
+ char* stderr_duplicate_filename =
+ getenv("AFL_DRIVER_STDERR_DUPLICATE_FILENAME");
+
+ if (!stderr_duplicate_filename)
+ return;
+
+ FILE* stderr_duplicate_stream =
+ freopen(stderr_duplicate_filename, "a+", stderr);
+
+ if (!stderr_duplicate_stream) {
+ fprintf(
+ stderr,
+ "Failed to duplicate stderr to AFL_DRIVER_STDERR_DUPLICATE_FILENAME");
+ abort();
+ }
+}
+
+int main(int argc, char **argv) {
+ fprintf(stderr, "Running in AFl-fuzz mode\nUsage:\n"
+ "afl-fuzz [afl-flags] %s [N] "
+ "-- run N fuzzing iterations before "
+ "re-spawning the process (default: 1000)\n",
+ argv[0]);
+ if (LLVMFuzzerInitialize)
+ LLVMFuzzerInitialize(&argc, &argv);
+ // Do any other expensive one-time initialization here.
+
+ maybe_duplicate_stderr();
+ maybe_initialize_extra_stats();
+
+ __afl_manual_init();
+
+ int N = 1000;
+ if (argc >= 2)
+ N = atoi(argv[1]);
+ assert(N > 0);
+ time_t unit_time_secs;
+ while (__afl_persistent_loop(N)) {
+ ssize_t n_read = read(0, AflInputBuf, kMaxAflInputSize);
+ if (n_read > 0) {
+ // Copy AflInputBuf into a separate buffer to let ASan find buffer
+ // overflows. Don't use unique_ptr/etc to avoid extra dependencies.
+ uint8_t *copy = new uint8_t[n_read];
+ memcpy(copy, AflInputBuf, n_read);
+
+ struct timeval unit_start_time;
+ CHECK_ERROR(gettimeofday(&unit_start_time, NULL) == 0,
+ "Calling gettimeofday failed");
+
+ LLVMFuzzerTestOneInput(copy, n_read);
+
+ struct timeval unit_stop_time;
+ CHECK_ERROR(gettimeofday(&unit_stop_time, NULL) == 0,
+ "Calling gettimeofday failed");
+
+ // Update slowest_unit_time_secs if we see a new max.
+ unit_time_secs = unit_stop_time.tv_sec - unit_start_time.tv_sec;
+ if (slowest_unit_time_secs < unit_time_secs)
+ slowest_unit_time_secs = unit_time_secs;
+
+ delete[] copy;
+ }
+ }
+}
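
The extra-stats handling above relies on kExtraStatsFormatString being usable symmetrically: write_extra_stats() prints with it and maybe_initialize_extra_stats() parses the same text back via fscanf(). A standalone round-trip sketch (the file name and values are assumptions for the example, not part of the driver):

// Round-trip sketch for the extra-stats file format used above.
#include <cstdio>

int main() {
  const char *kFmt = "peak_rss_mb : %u\n"
                     "slowest_unit_time_sec : %u\n";
  unsigned RssOut = 123, TimeOut = 7;
  FILE *F = fopen("extra_stats.tmp", "w+");
  if (!F) return 1;
  fprintf(F, kFmt, RssOut, TimeOut);
  rewind(F);
  unsigned RssIn = 0, TimeIn = 0;
  // Two matches mirrors the kNumExtraStats check in maybe_initialize_extra_stats().
  int Matches = fscanf(F, kFmt, &RssIn, &TimeIn);
  fclose(F);
  return (Matches == 2 && RssIn == RssOut && TimeIn == TimeOut) ? 0 : 1;
}
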
diff --git a/lib/Fuzzer/pull_and_push_fuzz_corpus.sh b/lib/Fuzzer/pull_and_push_fuzz_corpus.sh
deleted file mode 100755
index 05c322c6e5bf..000000000000
--- a/lib/Fuzzer/pull_and_push_fuzz_corpus.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-# A simple script to synchronise a fuzz test corpus
-# with an external git repository.
-# Usage:
-# pull_and_push_fuzz_corpus.sh DIR
-# It assumes that DIR is inside a git repo and push
-# can be done w/o typing a password.
-cd $1
-git add *
-git commit -m "fuzz test corpus"
-git pull --rebase --no-edit
-for((attempt=0; attempt<5; attempt++)); do
- echo GIT PUSH $1 ATTEMPT $attempt
- if $(git push); then break; fi
- git pull --rebase --no-edit
-done
-
diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp
new file mode 100644
index 000000000000..3dd0b6117305
--- /dev/null
+++ b/lib/Fuzzer/test/AFLDriverTest.cpp
@@ -0,0 +1,22 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Contains dummy functions used to avoid dependency on AFL.
+#include <stdint.h>
+#include <stdlib.h>
+
+extern "C" void __afl_manual_init() {}
+
+extern "C" int __afl_persistent_loop(unsigned int) {
+ return 0;
+}
+
+// This declaration exists to prevent the Darwin linker
+// from complaining about this being a missing weak symbol.
+extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ return 0;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ return 0;
+}
diff --git a/lib/Fuzzer/test/AccumulateAllocationsTest.cpp b/lib/Fuzzer/test/AccumulateAllocationsTest.cpp
new file mode 100644
index 000000000000..604d8fa299ae
--- /dev/null
+++ b/lib/Fuzzer/test/AccumulateAllocationsTest.cpp
@@ -0,0 +1,17 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test with more mallocs than frees, but no leak.
+#include <cstdint>
+#include <cstddef>
+
+const int kAllocatedPointersSize = 10000;
+int NumAllocatedPointers = 0;
+int *AllocatedPointers[kAllocatedPointersSize];
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (NumAllocatedPointers < kAllocatedPointersSize)
+ AllocatedPointers[NumAllocatedPointers++] = new int;
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/BufferOverflowOnInput.cpp b/lib/Fuzzer/test/BufferOverflowOnInput.cpp
new file mode 100644
index 000000000000..b9d14052aee4
--- /dev/null
+++ b/lib/Fuzzer/test/BufferOverflowOnInput.cpp
@@ -0,0 +1,23 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <assert.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile bool SeedLargeBuffer;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ assert(Data);
+ if (Size >= 4)
+ SeedLargeBuffer = true;
+ if (Size == 3 && SeedLargeBuffer && Data[3]) {
+ std::cout << "Woops, reading Data[3] w/o crashing\n";
+ exit(1);
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index cd0b167eb388..cbc983e9e417 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -1,115 +1,190 @@
# Build all these tests with -O0, otherwise optimizations may merge some
# basic blocks and we'll fail to discover the targets.
-# Also enable the coverage instrumentation back (it is disabled
-# for the Fuzzer lib)
-set(CMAKE_CXX_FLAGS_RELEASE "${LIBFUZZER_FLAGS_BASE} -O0 -fsanitize-coverage=edge,indirect-calls")
-
-set(DFSanTests
- MemcmpTest
- SimpleCmpTest
- StrcmpTest
- StrncmpTest
- SwitchTest
+# We change the flags for every build type because we might be doing
+# a multi-configuration build (e.g. Xcode) where CMAKE_BUILD_TYPE doesn't
+# mean anything.
+set(variables_to_filter
+ CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_CXX_FLAGS_DEBUG
+ CMAKE_CXX_FLAGS_RELWITHDEBINFO
+ CMAKE_CXX_FLAGS_MINSIZEREL
+ LIBFUZZER_FLAGS_BASE
)
+foreach (VARNAME ${variables_to_filter})
+ string(REPLACE " " ";" BUILD_FLAGS_AS_LIST "${${VARNAME}}")
+ set(new_flags "")
+ foreach (flag ${BUILD_FLAGS_AS_LIST})
+ # NOTE: Use of XX here is to avoid a CMake warning due to CMP0054
+ if (NOT ("XX${flag}" MATCHES "XX-O[0123s]"))
+ set(new_flags "${new_flags} ${flag}")
+ else()
+ set(new_flags "${new_flags} -O0")
+ endif()
+ endforeach()
+ set(${VARNAME} "${new_flags}")
+endforeach()
+
+# Enable the coverage instrumentation (it is disabled for the Fuzzer lib).
+set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=edge,indirect-calls")
+
+# add_libfuzzer_test(<name>
+# SOURCES source0.cpp [source1.cpp ...]
+# )
+#
+# Declares a LibFuzzer test executable with target name LLVMFuzzer-<name>.
+#
+# One or more source files to be compiled into the binary must be declared
+# after the SOURCES keyword.
+function(add_libfuzzer_test name)
+ set(multi_arg_options "SOURCES")
+ cmake_parse_arguments(
+ "add_libfuzzer_test" "" "" "${multi_arg_options}" ${ARGN})
+ if ("${add_libfuzzer_test_SOURCES}" STREQUAL "")
+ message(FATAL_ERROR "Source files must be specified")
+ endif()
+ add_executable(LLVMFuzzer-${name}
+ ${add_libfuzzer_test_SOURCES}
+ )
+ target_link_libraries(LLVMFuzzer-${name} LLVMFuzzer)
+ # Place binary where llvm-lit expects to find it
+ set_target_properties(LLVMFuzzer-${name}
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
+ )
+ set(TestBinaries ${TestBinaries} LLVMFuzzer-${name} PARENT_SCOPE)
+endfunction()
+
+# Variable to keep track of all test targets
+set(TestBinaries)
+
+###############################################################################
+# Basic tests
+###############################################################################
set(Tests
+ AccumulateAllocationsTest
+ BufferOverflowOnInput
CallerCalleeTest
CounterTest
+ CustomCrossOverTest
+ CustomMutatorTest
+ EmptyTest
FourIndependentBranchesTest
FullCoverageSetTest
+ InitializeTest
MemcmpTest
+ LeakTest
+ LeakTimeoutTest
NullDerefTest
+ NullDerefOnEmptyTest
+ NthRunCrashTest
+ OneHugeAllocTest
+ OutOfMemoryTest
+ RepeatedMemcmp
SimpleCmpTest
SimpleDictionaryTest
+ SimpleFnAdapterTest
SimpleHashTest
SimpleTest
+ SimpleThreadedTest
+ SpamyTest
StrcmpTest
StrncmpTest
+ StrstrTest
SwitchTest
+ ThreadedLeakTest
ThreadedTest
TimeoutTest
)
-set(CustomMainTests
- UserSuppliedFuzzerTest
- )
-
-set(UninstrumentedTests
- UninstrumentedTest
- )
-
-set(TraceBBTests
- SimpleTest
- )
-
-set(TestBinaries)
+if(APPLE)
+ # LeakSanitizer is not supported on OSX right now
+ set(HAS_LSAN 0)
+ message(WARNING "LeakSanitizer is not supported on Apple platforms."
+ " Building and running LibFuzzer LeakSanitizer tests is disabled."
+ )
+else()
+ set(HAS_LSAN 1)
+endif()
foreach(Test ${Tests})
- add_executable(LLVMFuzzer-${Test}
- ${Test}.cpp
- )
- target_link_libraries(LLVMFuzzer-${Test}
- LLVMFuzzer
- )
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test})
+ add_libfuzzer_test(${Test} SOURCES ${Test}.cpp)
endforeach()
-foreach(Test ${CustomMainTests})
- add_executable(LLVMFuzzer-${Test}
- ${Test}.cpp
- )
- target_link_libraries(LLVMFuzzer-${Test}
- LLVMFuzzerNoMain
- )
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test})
-endforeach()
+###############################################################################
+# AFL Driver test
+###############################################################################
+add_executable(AFLDriverTest
+ AFLDriverTest.cpp ../afl/afl_driver.cpp)
-configure_lit_site_cfg(
- ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
- ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
- )
-
-configure_lit_site_cfg(
- ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.in
- ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg
- )
+set_target_properties(AFLDriverTest
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
+ )
+set(TestBinaries ${TestBinaries} AFLDriverTest)
-include_directories(..)
-include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
+###############################################################################
+# Unit tests
+###############################################################################
add_executable(LLVMFuzzer-Unittest
FuzzerUnittest.cpp
- $<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
+ FuzzerFnAdapterUnittest.cpp
)
target_link_libraries(LLVMFuzzer-Unittest
gtest
gtest_main
+ LLVMFuzzerNoMain
+ )
+
+target_include_directories(LLVMFuzzer-Unittest PRIVATE
+ "${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include"
)
set(TestBinaries ${TestBinaries} LLVMFuzzer-Unittest)
+set_target_properties(LLVMFuzzer-Unittest
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${CMAKE_CURRENT_BINARY_DIR}"
+)
+###############################################################################
+# Additional tests
+###############################################################################
-add_subdirectory(dfsan)
+include_directories(..)
-foreach(Test ${DFSanTests})
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-DFSan)
-endforeach()
+if(APPLE)
+ message(WARNING "DataflowSanitizer is not supported on Apple platforms."
+ " Building and running LibFuzzer DataflowSanitizer tests is disabled."
+ )
+ set(HAS_DFSAN 0)
+else()
+ set(HAS_DFSAN 1)
+ add_subdirectory(dfsan)
+endif()
add_subdirectory(uninstrumented)
-
-foreach(Test ${UninstrumentedTests})
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-Uninstrumented)
-endforeach()
-
+add_subdirectory(no-coverage)
+add_subdirectory(ubsan)
add_subdirectory(trace-bb)
+add_subdirectory(trace-pc)
-foreach(Test ${TraceBBTests})
- set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-TraceBB)
-endforeach()
+###############################################################################
+# Configure lit to run the tests
+#
+# Note this is done after declaring all tests so we can inform lit if any tests
+# need to be disabled.
+###############################################################################
-set_target_properties(${TestBinaries}
- PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+ ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
+ )
+
+configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/unit/lit.site.cfg.in
+ ${CMAKE_CURRENT_BINARY_DIR}/unit/lit.site.cfg
)
add_lit_testsuite(check-fuzzer "Running Fuzzer tests"
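
For reference, each name in the Tests list above is expanded by the foreach loop into a call of the form add_libfuzzer_test(SimpleTest SOURCES SimpleTest.cpp), which produces an LLVMFuzzer-SimpleTest binary under lib/Fuzzer/test where llvm-lit expects to find it; SimpleTest here stands in for any of the names already in the list.
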
diff --git a/lib/Fuzzer/test/CallerCalleeTest.cpp b/lib/Fuzzer/test/CallerCalleeTest.cpp
index 150b2fc04058..3ec025d02301 100644
--- a/lib/Fuzzer/test/CallerCalleeTest.cpp
+++ b/lib/Fuzzer/test/CallerCalleeTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer.
// Try to find the target using the indirect caller-callee pairs.
#include <cstdint>
diff --git a/lib/Fuzzer/test/CounterTest.cpp b/lib/Fuzzer/test/CounterTest.cpp
index b61f419c4991..4917934c62e5 100644
--- a/lib/Fuzzer/test/CounterTest.cpp
+++ b/lib/Fuzzer/test/CounterTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Test for a fuzzer: must find the case where a particular basic block is
// executed many times.
#include <iostream>
diff --git a/lib/Fuzzer/test/CustomCrossOverTest.cpp b/lib/Fuzzer/test/CustomCrossOverTest.cpp
new file mode 100644
index 000000000000..2ab5781155f0
--- /dev/null
+++ b/lib/Fuzzer/test/CustomCrossOverTest.cpp
@@ -0,0 +1,57 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a custom cross-over.
+#include <assert.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <string.h>
+
+#include "FuzzerInterface.h"
+
+static const char *Separator = "-_^_-";
+static const char *Target = "012-_^_-abc";
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ assert(Data);
+ std::string Str(reinterpret_cast<const char *>(Data), Size);
+
+ if (Str.find(Target) != std::string::npos) {
+ std::cout << "BINGO; Found the target, exiting\n";
+ exit(1);
+ }
+ return 0;
+}
+
+extern "C" size_t LLVMFuzzerCustomCrossOver(const uint8_t *Data1, size_t Size1,
+ const uint8_t *Data2, size_t Size2,
+ uint8_t *Out, size_t MaxOutSize,
+ unsigned int Seed) {
+ static bool Printed;
+ static size_t SeparatorLen = strlen(Separator);
+
+ if (!Printed) {
+ std::cerr << "In LLVMFuzzerCustomCrossover\n";
+ Printed = true;
+ }
+
+ std::mt19937 R(Seed);
+
+ size_t Offset1 = 0;
+ size_t Len1 = R() % (Size1 - Offset1);
+ size_t Offset2 = 0;
+ size_t Len2 = R() % (Size2 - Offset2);
+ size_t Size = Len1 + Len2 + SeparatorLen;
+
+ if (Size > MaxOutSize)
+ return 0;
+
+ memcpy(Out, Data1 + Offset1, Len1);
+ memcpy(Out + Len1, Separator, SeparatorLen);
+ memcpy(Out + Len1 + SeparatorLen, Data2 + Offset2, Len2);
+
+ return Len1 + Len2 + SeparatorLen;
+}
diff --git a/lib/Fuzzer/test/CustomMutatorTest.cpp b/lib/Fuzzer/test/CustomMutatorTest.cpp
new file mode 100644
index 000000000000..4f84519a90e6
--- /dev/null
+++ b/lib/Fuzzer/test/CustomMutatorTest.cpp
@@ -0,0 +1,38 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a custom mutator.
+#include <assert.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+#include "FuzzerInterface.h"
+
+static volatile int Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ assert(Data);
+ if (Size > 0 && Data[0] == 'H') {
+ Sink = 1;
+ if (Size > 1 && Data[1] == 'i') {
+ Sink = 2;
+ if (Size > 2 && Data[2] == '!') {
+ std::cout << "BINGO; Found the target, exiting\n";
+ exit(1);
+ }
+ }
+ }
+ return 0;
+}
+
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size,
+ size_t MaxSize, unsigned int Seed) {
+ static bool Printed;
+ if (!Printed) {
+ std::cerr << "In LLVMFuzzerCustomMutator\n";
+ Printed = true;
+ }
+ return LLVMFuzzerMutate(Data, Size, MaxSize);
+}
diff --git a/lib/Fuzzer/test/EmptyTest.cpp b/lib/Fuzzer/test/EmptyTest.cpp
new file mode 100644
index 000000000000..5e843308fa57
--- /dev/null
+++ b/lib/Fuzzer/test/EmptyTest.cpp
@@ -0,0 +1,11 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// A fuzzer with an empty target function.
+
+#include <cstdint>
+#include <cstdlib>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ return 0;
+}
diff --git a/lib/Fuzzer/test/FourIndependentBranchesTest.cpp b/lib/Fuzzer/test/FourIndependentBranchesTest.cpp
index 6007dd4a027b..62b3be76e3aa 100644
--- a/lib/Fuzzer/test/FourIndependentBranchesTest.cpp
+++ b/lib/Fuzzer/test/FourIndependentBranchesTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the string "FUZZ".
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/FullCoverageSetTest.cpp b/lib/Fuzzer/test/FullCoverageSetTest.cpp
index a868084a0cee..415e0b4760c5 100644
--- a/lib/Fuzzer/test/FullCoverageSetTest.cpp
+++ b/lib/Fuzzer/test/FullCoverageSetTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the string "FUZZER".
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/FuzzerFnAdapterUnittest.cpp b/lib/Fuzzer/test/FuzzerFnAdapterUnittest.cpp
new file mode 100644
index 000000000000..11be18096bc3
--- /dev/null
+++ b/lib/Fuzzer/test/FuzzerFnAdapterUnittest.cpp
@@ -0,0 +1,110 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+#include "FuzzerFnAdapter.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
+
+namespace fuzzer {
+namespace impl {
+
+template <typename... Args>
+bool Unpack(std::tuple<Args...> *Tuple, std::initializer_list<uint8_t> data) {
+ std::vector<uint8_t> V(data);
+ return Unpack(V.data(), V.size(), Tuple);
+}
+
+TEST(Unpack, Bool) {
+ std::tuple<bool> T;
+ EXPECT_TRUE(Unpack(&T, {1}));
+ EXPECT_TRUE(std::get<0>(T));
+
+ EXPECT_TRUE(Unpack(&T, {0}));
+ EXPECT_FALSE(std::get<0>(T));
+
+ EXPECT_FALSE(Unpack(&T, {}));
+}
+
+TEST(Unpack, BoolBool) {
+ std::tuple<bool, bool> T;
+ EXPECT_TRUE(Unpack(&T, {1, 0}));
+ EXPECT_TRUE(std::get<0>(T));
+ EXPECT_FALSE(std::get<1>(T));
+
+ EXPECT_TRUE(Unpack(&T, {0, 1}));
+ EXPECT_FALSE(std::get<0>(T));
+ EXPECT_TRUE(std::get<1>(T));
+
+ EXPECT_FALSE(Unpack(&T, {}));
+ EXPECT_FALSE(Unpack(&T, {10}));
+}
+
+TEST(Unpack, BoolInt) {
+ std::tuple<bool, int> T;
+ EXPECT_TRUE(Unpack(&T, {1, 16, 2, 0, 0}));
+ EXPECT_TRUE(std::get<0>(T));
+ EXPECT_EQ(528, std::get<1>(T));
+
+ EXPECT_FALSE(Unpack(&T, {1, 2}));
+}
+
+TEST(Unpack, Vector) {
+ std::tuple<std::vector<uint8_t>> T;
+ const auto &V = std::get<0>(T);
+
+ EXPECT_FALSE(Unpack(&T, {}));
+
+ EXPECT_TRUE(Unpack(&T, {0}));
+ EXPECT_EQ(0ul, V.size());
+
+ EXPECT_TRUE(Unpack(&T, {0, 1, 2, 3}));
+ EXPECT_EQ(0ul, V.size());
+
+ EXPECT_TRUE(Unpack(&T, {2}));
+ EXPECT_EQ(0ul, V.size());
+
+ EXPECT_TRUE(Unpack(&T, {2, 3}));
+ EXPECT_EQ(1ul, V.size());
+ EXPECT_EQ(3, V[0]);
+
+ EXPECT_TRUE(Unpack(&T, {2, 9, 8}));
+ EXPECT_EQ(2ul, V.size());
+ EXPECT_EQ(9, V[0]);
+ EXPECT_EQ(8, V[1]);
+}
+
+TEST(Unpack, String) {
+ std::tuple<std::string> T;
+ const auto &S = std::get<0>(T);
+
+ EXPECT_TRUE(Unpack(&T, {2, 3}));
+ EXPECT_EQ(1ul, S.size());
+ EXPECT_EQ(3, S[0]);
+}
+
+template <typename Fn>
+bool UnpackAndApply(Fn F, std::initializer_list<uint8_t> Data) {
+ std::vector<uint8_t> V(Data);
+ return UnpackAndApply(F, V.data(), V.size());
+}
+
+static void fnBool(bool b) { EXPECT_TRUE(b); }
+
+TEST(Apply, Bool) {
+ EXPECT_FALSE(UnpackAndApply(fnBool, {}));
+ EXPECT_TRUE(UnpackAndApply(fnBool, {1}));
+ EXPECT_NONFATAL_FAILURE(UnpackAndApply(fnBool, {0}),
+ "Actual: false\nExpected: true");
+}
+
+static void fnInt(int i) { EXPECT_EQ(42, i); }
+
+TEST(Apply, Int) {
+ EXPECT_FALSE(UnpackAndApply(fnInt, {}));
+ EXPECT_TRUE(UnpackAndApply(fnInt, {42, 0, 0, 0}));
+ EXPECT_NONFATAL_FAILURE(UnpackAndApply(fnInt, {10, 0, 0, 0}),
+ "Actual: 10\nExpected: 42");
+}
+
+} // namespace impl
+} // namespace fuzzer
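
The BoolInt expectation above follows from the adapter unpacking integers in little-endian byte order: the first input byte (1) becomes the bool, and the next four bytes {16, 2, 0, 0} decode to 16 + 2 * 256 = 528. This reading of the encoding is inferred from the test values themselves rather than stated explicitly in this diff.
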
diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp
index b33e0c961455..3fd87e5b9e01 100644
--- a/lib/Fuzzer/test/FuzzerUnittest.cpp
+++ b/lib/Fuzzer/test/FuzzerUnittest.cpp
@@ -1,18 +1,28 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Avoid ODR violations (LibFuzzer is built without ASan and this test is built
+// with ASan) involving C++ standard library types when using libcxx.
+#define _LIBCPP_HAS_NO_ASAN
+
#include "FuzzerInternal.h"
#include "gtest/gtest.h"
+#include <memory>
#include <set>
using namespace fuzzer;
// For now, have LLVMFuzzerTestOneInput just to make it link.
// Later we may want to make unittests that actually call LLVMFuzzerTestOneInput.
-extern "C" void LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
abort();
}
TEST(Fuzzer, CrossOver) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
Unit A({0, 1, 2}), B({5, 6, 7});
Unit C;
Unit Expected[] = {
@@ -79,6 +89,8 @@ typedef size_t (MutationDispatcher::*Mutator)(uint8_t *Data, size_t Size,
size_t MaxSize);
void TestEraseByte(Mutator M, int NumIter) {
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
uint8_t REM0[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
uint8_t REM1[8] = {0x00, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
uint8_t REM2[8] = {0x00, 0x11, 0x33, 0x44, 0x55, 0x66, 0x77};
@@ -87,8 +99,8 @@ void TestEraseByte(Mutator M, int NumIter) {
uint8_t REM5[8] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x66, 0x77};
uint8_t REM6[8] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x77};
uint8_t REM7[8] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
int FoundMask = 0;
for (int i = 0; i < NumIter; i++) {
uint8_t T[8] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
@@ -113,8 +125,10 @@ TEST(FuzzerMutate, EraseByte2) {
}
void TestInsertByte(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
int FoundMask = 0;
uint8_t INS0[8] = {0xF1, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
uint8_t INS1[8] = {0x00, 0xF2, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
@@ -147,8 +161,10 @@ TEST(FuzzerMutate, InsertByte2) {
}
void TestChangeByte(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
int FoundMask = 0;
uint8_t CH0[8] = {0xF0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
uint8_t CH1[8] = {0x00, 0xF1, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
@@ -181,8 +197,10 @@ TEST(FuzzerMutate, ChangeByte2) {
}
void TestChangeBit(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
int FoundMask = 0;
uint8_t CH0[8] = {0x01, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
uint8_t CH1[8] = {0x00, 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
@@ -215,8 +233,10 @@ TEST(FuzzerMutate, ChangeBit2) {
}
void TestShuffleBytes(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
int FoundMask = 0;
uint8_t CH0[7] = {0x00, 0x22, 0x11, 0x33, 0x44, 0x55, 0x66};
uint8_t CH1[7] = {0x11, 0x00, 0x33, 0x22, 0x44, 0x55, 0x66};
@@ -236,19 +256,21 @@ void TestShuffleBytes(Mutator M, int NumIter) {
}
TEST(FuzzerMutate, ShuffleBytes1) {
- TestShuffleBytes(&MutationDispatcher::Mutate_ShuffleBytes, 1 << 15);
+ TestShuffleBytes(&MutationDispatcher::Mutate_ShuffleBytes, 1 << 16);
}
TEST(FuzzerMutate, ShuffleBytes2) {
- TestShuffleBytes(&MutationDispatcher::Mutate, 1 << 19);
+ TestShuffleBytes(&MutationDispatcher::Mutate, 1 << 20);
}
void TestAddWordFromDictionary(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
uint8_t Word1[4] = {0xAA, 0xBB, 0xCC, 0xDD};
uint8_t Word2[3] = {0xFF, 0xEE, 0xEF};
- MD.AddWordToManualDictionary(Unit(Word1, Word1 + sizeof(Word1)));
- MD.AddWordToManualDictionary(Unit(Word2, Word2 + sizeof(Word2)));
+ MD.AddWordToManualDictionary(Word(Word1, sizeof(Word1)));
+ MD.AddWordToManualDictionary(Word(Word2, sizeof(Word2)));
int FoundMask = 0;
uint8_t CH0[7] = {0x00, 0x11, 0x22, 0xAA, 0xBB, 0xCC, 0xDD};
uint8_t CH1[7] = {0x00, 0x11, 0xAA, 0xBB, 0xCC, 0xDD, 0x22};
@@ -283,18 +305,20 @@ TEST(FuzzerMutate, AddWordFromDictionary2) {
}
void TestAddWordFromDictionaryWithHint(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
- uint8_t Word[] = {0xAA, 0xBB, 0xCC, 0xDD, 0xFF, 0xEE, 0xEF};
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
+ uint8_t W[] = {0xAA, 0xBB, 0xCC, 0xDD, 0xFF, 0xEE, 0xEF};
size_t PosHint = 7777;
- MD.AddWordToAutoDictionary(Unit(Word, Word + sizeof(Word)), PosHint);
+ MD.AddWordToAutoDictionary({Word(W, sizeof(W)), PosHint});
int FoundMask = 0;
for (int i = 0; i < NumIter; i++) {
uint8_t T[10000];
memset(T, 0, sizeof(T));
size_t NewSize = (MD.*M)(T, 9000, 10000);
- if (NewSize >= PosHint + sizeof(Word) &&
- !memcmp(Word, T + PosHint, sizeof(Word)))
+ if (NewSize >= PosHint + sizeof(W) &&
+ !memcmp(W, T + PosHint, sizeof(W)))
FoundMask = 1;
}
EXPECT_EQ(FoundMask, 1);
@@ -302,7 +326,7 @@ void TestAddWordFromDictionaryWithHint(Mutator M, int NumIter) {
TEST(FuzzerMutate, AddWordFromDictionaryWithHint1) {
TestAddWordFromDictionaryWithHint(
- &MutationDispatcher::Mutate_AddWordFromAutoDictionary, 1 << 5);
+ &MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, 1 << 5);
}
TEST(FuzzerMutate, AddWordFromDictionaryWithHint2) {
@@ -310,8 +334,10 @@ TEST(FuzzerMutate, AddWordFromDictionaryWithHint2) {
}
void TestChangeASCIIInteger(Mutator M, int NumIter) {
- FuzzerRandomLibc Rand(0);
- MutationDispatcher MD(Rand);
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
uint8_t CH0[8] = {'1', '2', '3', '4', '5', '6', '7', '7'};
uint8_t CH1[8] = {'1', '2', '3', '4', '5', '6', '7', '9'};
@@ -400,3 +426,24 @@ TEST(FuzzerUtil, Base64) {
EXPECT_EQ("YWJjeHk=", Base64({'a', 'b', 'c', 'x', 'y'}));
EXPECT_EQ("YWJjeHl6", Base64({'a', 'b', 'c', 'x', 'y', 'z'}));
}
+
+TEST(Corpus, Distribution) {
+ std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
+ fuzzer::EF = t.get();
+ Random Rand(0);
+ MutationDispatcher MD(Rand, {});
+ Fuzzer Fuzz(LLVMFuzzerTestOneInput, MD, {});
+ size_t N = 10;
+ size_t TriesPerUnit = 1<<20;
+ for (size_t i = 0; i < N; i++) {
+ Fuzz.AddToCorpus(Unit{ static_cast<uint8_t>(i) });
+ }
+ std::vector<size_t> Hist(N);
+ for (size_t i = 0; i < N * TriesPerUnit; i++) {
+ Hist[Fuzz.ChooseUnitIdxToMutate()]++;
+ }
+ for (size_t i = 0; i < N; i++) {
+ // A weak sanity check that every unit gets invoked.
+ EXPECT_GT(Hist[i], TriesPerUnit / N / 3);
+ }
+}
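
For scale: with N = 10 and TriesPerUnit = 1 << 20, the loop draws roughly 10.5 million indices, a uniform choice would give each unit about 1.05 million picks, and the check only requires more than 2^20 / 10 / 3, about 34,952, picks per unit. The test therefore tolerates heavy skew and only guards against a unit being starved entirely.
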
diff --git a/lib/Fuzzer/test/InitializeTest.cpp b/lib/Fuzzer/test/InitializeTest.cpp
new file mode 100644
index 000000000000..d40ff2f79365
--- /dev/null
+++ b/lib/Fuzzer/test/InitializeTest.cpp
@@ -0,0 +1,26 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Make sure LLVMFuzzerInitialize is called.
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static char *argv0;
+
+extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ assert(argc > 0);
+ argv0 = **argv;
+ return 0;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (strncmp(reinterpret_cast<const char*>(Data), argv0, Size)) {
+ fprintf(stderr, "BINGO\n");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/lib/Fuzzer/test/LeakTest.cpp b/lib/Fuzzer/test/LeakTest.cpp
new file mode 100644
index 000000000000..22e5164050e5
--- /dev/null
+++ b/lib/Fuzzer/test/LeakTest.cpp
@@ -0,0 +1,17 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test with a leak.
+#include <cstdint>
+#include <cstddef>
+
+static volatile void *Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size > 0 && *Data == 'H') {
+ Sink = new int;
+ Sink = nullptr;
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/LeakTimeoutTest.cpp b/lib/Fuzzer/test/LeakTimeoutTest.cpp
new file mode 100644
index 000000000000..4f31b3e52c16
--- /dev/null
+++ b/lib/Fuzzer/test/LeakTimeoutTest.cpp
@@ -0,0 +1,17 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test with a leak and a timeout (infinite loop).
+#include <cstdint>
+#include <cstddef>
+
+static volatile int *Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (!Size) return 0;
+ Sink = new int;
+ Sink = new int;
+ while (Sink) *Sink = 0; // Infinite loop.
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/MemcmpTest.cpp b/lib/Fuzzer/test/MemcmpTest.cpp
index c19c95717bbb..fdbf94683f76 100644
--- a/lib/Fuzzer/test/MemcmpTest.cpp
+++ b/lib/Fuzzer/test/MemcmpTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find a particular string.
#include <cstring>
#include <cstdint>
@@ -9,7 +12,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
if (Size >= 8 && memcmp(Data, "01234567", 8) == 0) {
if (Size >= 12 && memcmp(Data + 8, "ABCD", 4) == 0) {
if (Size >= 14 && memcmp(Data + 12, "XY", 2) == 0) {
- if (Size >= 16 && memcmp(Data + 14, "KLM", 3) == 0) {
+ if (Size >= 17 && memcmp(Data + 14, "KLM", 3) == 0) {
if (Size >= 27 && memcmp(Data + 17, "ABCDE-GHIJ", 10) == 0){
fprintf(stderr, "BINGO %zd\n", Size);
for (size_t i = 0; i < Size; i++) {
diff --git a/lib/Fuzzer/test/NthRunCrashTest.cpp b/lib/Fuzzer/test/NthRunCrashTest.cpp
new file mode 100644
index 000000000000..b43e69e51b25
--- /dev/null
+++ b/lib/Fuzzer/test/NthRunCrashTest.cpp
@@ -0,0 +1,18 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Crash on the N-th execution.
+#include <cstdint>
+#include <cstddef>
+#include <iostream>
+
+static int Counter;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Counter++ == 1000) {
+ std::cout << "BINGO; Found the target, exiting\n";
+ exit(1);
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/NullDerefOnEmptyTest.cpp b/lib/Fuzzer/test/NullDerefOnEmptyTest.cpp
new file mode 100644
index 000000000000..153710920a5f
--- /dev/null
+++ b/lib/Fuzzer/test/NullDerefOnEmptyTest.cpp
@@ -0,0 +1,19 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. The fuzzer must find the empty string.
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int *Null = 0;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size == 0) {
+ std::cout << "Found the target, dereferencing NULL\n";
+ *Null = 1;
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/NullDerefTest.cpp b/lib/Fuzzer/test/NullDerefTest.cpp
index 200c56ccbbc9..3f03d2498197 100644
--- a/lib/Fuzzer/test/NullDerefTest.cpp
+++ b/lib/Fuzzer/test/NullDerefTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/OneHugeAllocTest.cpp b/lib/Fuzzer/test/OneHugeAllocTest.cpp
new file mode 100644
index 000000000000..617fa20fa2e0
--- /dev/null
+++ b/lib/Fuzzer/test/OneHugeAllocTest.cpp
@@ -0,0 +1,29 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Tests OOM handling when there is a single large allocation.
+#include <assert.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <unistd.h>
+
+static volatile char *SinkPtr;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size > 0 && Data[0] == 'H') {
+ if (Size > 1 && Data[1] == 'i') {
+ if (Size > 2 && Data[2] == '!') {
+ size_t kSize = (size_t)1 << 31;
+ char *p = new char[kSize];
+ memset(p, 0, kSize);
+ SinkPtr = p;
+ delete [] p;
+ }
+ }
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/OutOfMemoryTest.cpp b/lib/Fuzzer/test/OutOfMemoryTest.cpp
new file mode 100644
index 000000000000..e5c9f0a038f5
--- /dev/null
+++ b/lib/Fuzzer/test/OutOfMemoryTest.cpp
@@ -0,0 +1,31 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Tests OOM handling.
+#include <assert.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <unistd.h>
+
+static volatile char *SinkPtr;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size > 0 && Data[0] == 'H') {
+ if (Size > 1 && Data[1] == 'i') {
+ if (Size > 2 && Data[2] == '!') {
+ while (true) {
+ size_t kSize = 1 << 28;
+ char *p = new char[kSize];
+ memset(p, 0, kSize);
+ SinkPtr = p;
+ sleep(1);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/RepeatedMemcmp.cpp b/lib/Fuzzer/test/RepeatedMemcmp.cpp
new file mode 100644
index 000000000000..a327bbee7815
--- /dev/null
+++ b/lib/Fuzzer/test/RepeatedMemcmp.cpp
@@ -0,0 +1,22 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+
+#include <cstring>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ int Matches = 0;
+ for (size_t i = 0; i + 2 < Size; i += 3) {
+ const char *Pat = i % 2 ? "foo" : "bar";
+ if (!memcmp(Data + i, Pat, 3))
+ Matches++;
+ }
+ if (Matches > 20) {
+ fprintf(stderr, "BINGO!\n");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/lib/Fuzzer/test/SignedIntOverflowTest.cpp b/lib/Fuzzer/test/SignedIntOverflowTest.cpp
new file mode 100644
index 000000000000..7df32ad57933
--- /dev/null
+++ b/lib/Fuzzer/test/SignedIntOverflowTest.cpp
@@ -0,0 +1,28 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test for signed-integer-overflow.
+#include <assert.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+#include <climits>
+
+static volatile int Sink;
+static int Large = INT_MAX;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ assert(Data);
+ if (Size > 0 && Data[0] == 'H') {
+ Sink = 1;
+ if (Size > 1 && Data[1] == 'i') {
+ Sink = 2;
+ if (Size > 2 && Data[2] == '!') {
+ Large++; // int overflow.
+ }
+ }
+ }
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/SimpleCmpTest.cpp b/lib/Fuzzer/test/SimpleCmpTest.cpp
index 8568c737efb1..54dc016ce843 100644
--- a/lib/Fuzzer/test/SimpleCmpTest.cpp
+++ b/lib/Fuzzer/test/SimpleCmpTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find several narrow ranges.
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/SimpleDictionaryTest.cpp b/lib/Fuzzer/test/SimpleDictionaryTest.cpp
index b9cb2f0270a3..cd7292bd006c 100644
--- a/lib/Fuzzer/test/SimpleDictionaryTest.cpp
+++ b/lib/Fuzzer/test/SimpleDictionaryTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer.
// The fuzzer must find a string based on dictionary words:
// "Elvis"
diff --git a/lib/Fuzzer/test/SimpleFnAdapterTest.cpp b/lib/Fuzzer/test/SimpleFnAdapterTest.cpp
new file mode 100644
index 000000000000..d30c98b250ea
--- /dev/null
+++ b/lib/Fuzzer/test/SimpleFnAdapterTest.cpp
@@ -0,0 +1,24 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer Fn adapter. The fuzzer has to find two non-empty
+// vectors with the same content.
+
+#include <iostream>
+#include <vector>
+
+#include "FuzzerFnAdapter.h"
+
+static void TestFn(std::vector<uint8_t> V1, std::vector<uint8_t> V2) {
+ if (V1.size() > 0 && V1 == V2) {
+ std::cout << "BINGO; Found the target, exiting\n";
+ exit(0);
+ }
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ fuzzer::Adapt(TestFn, Data, Size);
+ return 0;
+}
+
+
diff --git a/lib/Fuzzer/test/SimpleHashTest.cpp b/lib/Fuzzer/test/SimpleHashTest.cpp
index 5bab3fa7f649..00599de78ebe 100644
--- a/lib/Fuzzer/test/SimpleHashTest.cpp
+++ b/lib/Fuzzer/test/SimpleHashTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// This test computes a checksum of the data (all but the last 4 bytes),
// and then compares the last 4 bytes with the computed value.
// A fuzzer with cmp traces is expected to defeat this check.
diff --git a/lib/Fuzzer/test/SimpleTest.cpp b/lib/Fuzzer/test/SimpleTest.cpp
index 04225a889f5d..e53ea160ed8f 100644
--- a/lib/Fuzzer/test/SimpleTest.cpp
+++ b/lib/Fuzzer/test/SimpleTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
#include <assert.h>
#include <cstdint>
diff --git a/lib/Fuzzer/test/SimpleThreadedTest.cpp b/lib/Fuzzer/test/SimpleThreadedTest.cpp
new file mode 100644
index 000000000000..5f02d3f8457c
--- /dev/null
+++ b/lib/Fuzzer/test/SimpleThreadedTest.cpp
@@ -0,0 +1,25 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Threaded test for a fuzzer. The fuzzer should find "H"
+#include <assert.h>
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <thread>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ auto C = [&] {
+ if (Size >= 2 && Data[0] == 'H') {
+ std::cout << "BINGO; Found the target, exiting\n";
+ abort();
+ }
+ };
+ std::thread T[] = {std::thread(C), std::thread(C), std::thread(C),
+ std::thread(C), std::thread(C), std::thread(C)};
+ for (auto &X : T)
+ X.join();
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/SpamyTest.cpp b/lib/Fuzzer/test/SpamyTest.cpp
new file mode 100644
index 000000000000..d294d4dc53e0
--- /dev/null
+++ b/lib/Fuzzer/test/SpamyTest.cpp
@@ -0,0 +1,21 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// The test spams to stderr and stdout.
+#include <assert.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstddef>
+#include <iostream>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ assert(Data);
+ printf("PRINTF_STDOUT\n");
+ fflush(stdout);
+ fprintf(stderr, "PRINTF_STDERR\n");
+ std::cout << "STREAM_COUT\n";
+ std::cout.flush();
+ std::cerr << "STREAM_CERR\n";
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/StrcmpTest.cpp b/lib/Fuzzer/test/StrcmpTest.cpp
index 835819ae2f45..5a1329904612 100644
--- a/lib/Fuzzer/test/StrcmpTest.cpp
+++ b/lib/Fuzzer/test/StrcmpTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Break through a series of strcmp.
#include <cstring>
#include <cstdint>
diff --git a/lib/Fuzzer/test/StrncmpTest.cpp b/lib/Fuzzer/test/StrncmpTest.cpp
index 55344d75e0b1..8575c2682f1a 100644
--- a/lib/Fuzzer/test/StrncmpTest.cpp
+++ b/lib/Fuzzer/test/StrncmpTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find a particular string.
#include <cstring>
#include <cstdint>
diff --git a/lib/Fuzzer/test/StrstrTest.cpp b/lib/Fuzzer/test/StrstrTest.cpp
new file mode 100644
index 000000000000..90d539b660a9
--- /dev/null
+++ b/lib/Fuzzer/test/StrstrTest.cpp
@@ -0,0 +1,18 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test strstr and strcasestr hooks.
+#include <string>
+#include <string.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ std::string s(reinterpret_cast<const char*>(Data), Size);
+ if (strstr(s.c_str(), "FUZZ") && strcasestr(s.c_str(), "aBcD")) {
+ fprintf(stderr, "BINGO\n");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/lib/Fuzzer/test/SwitchTest.cpp b/lib/Fuzzer/test/SwitchTest.cpp
index 5de7fff74525..3dc051ff7b5b 100644
--- a/lib/Fuzzer/test/SwitchTest.cpp
+++ b/lib/Fuzzer/test/SwitchTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the interesting switch value.
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/ThreadedLeakTest.cpp b/lib/Fuzzer/test/ThreadedLeakTest.cpp
new file mode 100644
index 000000000000..751107110871
--- /dev/null
+++ b/lib/Fuzzer/test/ThreadedLeakTest.cpp
@@ -0,0 +1,18 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// The fuzzer should find a leak in a non-main thread.
+#include <cstdint>
+#include <cstddef>
+#include <thread>
+
+static volatile int *Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size == 0) return 0;
+ if (Data[0] != 'F') return 0;
+ std::thread T([&] { Sink = new int; });
+ T.join();
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/ThreadedTest.cpp b/lib/Fuzzer/test/ThreadedTest.cpp
index 7aa114a41f36..09137a9a70c1 100644
--- a/lib/Fuzzer/test/ThreadedTest.cpp
+++ b/lib/Fuzzer/test/ThreadedTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Threaded test for a fuzzer. The fuzzer should not crash.
#include <assert.h>
#include <cstdint>
diff --git a/lib/Fuzzer/test/TimeoutTest.cpp b/lib/Fuzzer/test/TimeoutTest.cpp
index 71790ded95a2..f8107012c841 100644
--- a/lib/Fuzzer/test/TimeoutTest.cpp
+++ b/lib/Fuzzer/test/TimeoutTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// Simple test for a fuzzer. The fuzzer must find the string "Hi!".
#include <cstdint>
#include <cstdlib>
diff --git a/lib/Fuzzer/test/UninstrumentedTest.cpp b/lib/Fuzzer/test/UninstrumentedTest.cpp
index c1730198d83f..ffe952c749d2 100644
--- a/lib/Fuzzer/test/UninstrumentedTest.cpp
+++ b/lib/Fuzzer/test/UninstrumentedTest.cpp
@@ -1,3 +1,6 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
// This test should not be instrumented.
#include <cstdint>
#include <cstddef>
diff --git a/lib/Fuzzer/test/UserSuppliedFuzzerTest.cpp b/lib/Fuzzer/test/UserSuppliedFuzzerTest.cpp
deleted file mode 100644
index 59f83b57bfad..000000000000
--- a/lib/Fuzzer/test/UserSuppliedFuzzerTest.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// Simple test for a fuzzer.
-// The fuzzer must find the string "Hi!" preceded by a magic value.
-// Uses UserSuppliedFuzzer which ensures that the magic is present.
-#include <cstdint>
-#include <cassert>
-#include <cstdlib>
-#include <cstddef>
-#include <cstring>
-#include <iostream>
-
-#include "FuzzerInterface.h"
-
-static const uint64_t kMagic = 8860221463604ULL;
-
-class MyFuzzer : public fuzzer::UserSuppliedFuzzer {
- public:
- MyFuzzer(fuzzer::FuzzerRandomBase *Rand)
- : fuzzer::UserSuppliedFuzzer(Rand) {}
- int TargetFunction(const uint8_t *Data, size_t Size) {
- if (Size <= 10) return 0;
- if (memcmp(Data, &kMagic, sizeof(kMagic))) return 0;
- // It's hard to get here w/o advanced fuzzing techniques (e.g. cmp tracing).
- // So, we simply 'fix' the data in the custom mutator.
- if (Data[8] == 'H') {
- if (Data[9] == 'i') {
- if (Data[10] == '!') {
- std::cout << "BINGO; Found the target, exiting\n";
- exit(1);
- }
- }
- }
- return 0;
- }
- // Custom mutator.
- virtual size_t Mutate(uint8_t *Data, size_t Size, size_t MaxSize) {
- assert(MaxSize > sizeof(kMagic));
- if (Size < sizeof(kMagic))
- Size = sizeof(kMagic);
- // "Fix" the data, then mutate.
- memcpy(Data, &kMagic, std::min(MaxSize, sizeof(kMagic)));
- return fuzzer::UserSuppliedFuzzer::Mutate(
- Data + sizeof(kMagic), Size - sizeof(kMagic), MaxSize - sizeof(kMagic));
- }
- // No need to redefine CrossOver() here.
-};
-
-int main(int argc, char **argv) {
- fuzzer::FuzzerRandomLibc Rand(0);
- MyFuzzer F(&Rand);
- fuzzer::FuzzerDriver(argc, argv, F);
-}
diff --git a/lib/Fuzzer/test/afl-driver-extra-stats.test b/lib/Fuzzer/test/afl-driver-extra-stats.test
new file mode 100644
index 000000000000..81e384e7dad2
--- /dev/null
+++ b/lib/Fuzzer/test/afl-driver-extra-stats.test
@@ -0,0 +1,28 @@
+; Test that not specifying an extra stats file isn't broken.
+RUN: unset AFL_DRIVER_EXTRA_STATS_FILENAME
+RUN: AFLDriverTest
+
+; Test that specifying an invalid extra stats file causes a crash.
+RUN: ASAN_OPTIONS= AFL_DRIVER_EXTRA_STATS_FILENAME=%T not --crash AFLDriverTest
+
+; Test that specifying a corrupted stats file causes a crash.
+echo "peak_rss_mb :0" > %t
+ASAN_OPTIONS= AFL_DRIVER_EXTRA_STATS_FILENAME=%t not --crash AFLDriverTest
+
+; Test that specifying a valid nonexistent stats file works.
+RUN: rm -f %t
+RUN: AFL_DRIVER_EXTRA_STATS_FILENAME=%t AFLDriverTest
+RUN: [[ $(grep "peak_rss_mb\|slowest_unit_time_sec" %t | wc -l) -eq 2 ]]
+
+; Test that specifying a valid preexisting stats file works.
+RUN: printf "peak_rss_mb : 0\nslowest_unit_time_sec: 0\n" > %t
+RUN: AFL_DRIVER_EXTRA_STATS_FILENAME=%t AFLDriverTest
+; Check that both lines were printed.
+RUN: [[ $(grep "peak_rss_mb\|slowest_unit_time_sec" %t | wc -l) -eq 2 ]]
+
+; Test that peak_rss_mb and slowest_unit_time_sec are only updated when necessary.
+; Check that both lines have 9999 since there's no way we have exceeded that
+; amount of time or virtual memory.
+RUN: printf "peak_rss_mb : 9999\nslowest_unit_time_sec: 9999\n" > %t
+RUN: AFL_DRIVER_EXTRA_STATS_FILENAME=%t AFLDriverTest
+RUN: [[ $(grep "9999" %t | wc -l) -eq 2 ]]
diff --git a/lib/Fuzzer/test/afl-driver-stderr.test b/lib/Fuzzer/test/afl-driver-stderr.test
new file mode 100644
index 000000000000..c0f9c8398c2a
--- /dev/null
+++ b/lib/Fuzzer/test/afl-driver-stderr.test
@@ -0,0 +1,10 @@
+; Test that not specifying a stderr file isn't broken.
+RUN: unset AFL_DRIVER_STDERR_DUPLICATE_FILENAME
+RUN: AFLDriverTest
+
+; Test that specifying an invalid file causes a crash.
+RUN: ASAN_OPTIONS= AFL_DRIVER_STDERR_DUPLICATE_FILENAME="%T" not --crash AFLDriverTest
+
+; Test that a file is created when specified as the duplicate stderr.
+RUN: AFL_DRIVER_STDERR_DUPLICATE_FILENAME=%t AFLDriverTest
+RUN: stat %t
diff --git a/lib/Fuzzer/test/dfsan/CMakeLists.txt b/lib/Fuzzer/test/dfsan/CMakeLists.txt
index 2b49831fcdb8..2a4dc18bfeed 100644
--- a/lib/Fuzzer/test/dfsan/CMakeLists.txt
+++ b/lib/Fuzzer/test/dfsan/CMakeLists.txt
@@ -1,14 +1,19 @@
# These tests depend on both coverage and dfsan instrumentation.
-set(CMAKE_CXX_FLAGS_RELEASE
- "${LIBFUZZER_FLAGS_BASE} -O0 -fno-sanitize=all -fsanitize=dataflow")
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fno-sanitize=all -fsanitize=dataflow")
+
+set(DFSanTests
+ MemcmpTest
+ SimpleCmpTest
+ StrcmpTest
+ StrncmpTest
+ SwitchTest
+ )
foreach(Test ${DFSanTests})
- add_executable(LLVMFuzzer-${Test}-DFSan
- ../${Test}.cpp
- )
- target_link_libraries(LLVMFuzzer-${Test}-DFSan
- LLVMFuzzer
- )
+ add_libfuzzer_test(${Test}-DFSan SOURCES ../${Test}.cpp)
endforeach()
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/fuzzer-customcrossover.test b/lib/Fuzzer/test/fuzzer-customcrossover.test
new file mode 100644
index 000000000000..4be54d3f799e
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-customcrossover.test
@@ -0,0 +1,10 @@
+RUN: rm -rf %t/CustomCrossover
+RUN: mkdir -p %t/CustomCrossover
+RUN: echo "0123456789" > %t/CustomCrossover/digits
+RUN: echo "abcdefghij" > %t/CustomCrossover/chars
+RUN: not LLVMFuzzer-CustomCrossOverTest -seed=1 -use_memcmp=0 -runs=100000 -prune_corpus=0 %t/CustomCrossover 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomCrossover
+RUN: rm -rf %t/CustomCrossover
+
+LLVMFuzzerCustomCrossover: In LLVMFuzzerCustomCrossover
+LLVMFuzzerCustomCrossover: BINGO
+
diff --git a/lib/Fuzzer/test/fuzzer-custommutator.test b/lib/Fuzzer/test/fuzzer-custommutator.test
new file mode 100644
index 000000000000..fcd740bf5457
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-custommutator.test
@@ -0,0 +1,4 @@
+RUN: not LLVMFuzzer-CustomMutatorTest 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutator
+LLVMFuzzerCustomMutator: In LLVMFuzzerCustomMutator
+LLVMFuzzerCustomMutator: BINGO
+
diff --git a/lib/Fuzzer/test/fuzzer-dfsan.test b/lib/Fuzzer/test/fuzzer-dfsan.test
index 567086ed65af..5bd5c0f18d2f 100644
--- a/lib/Fuzzer/test/fuzzer-dfsan.test
+++ b/lib/Fuzzer/test/fuzzer-dfsan.test
@@ -1,3 +1,4 @@
+REQUIRES: dfsan
CHECK1: BINGO
CHECK2: BINGO
CHECK3: BINGO
@@ -7,10 +8,10 @@ CHECK_DFSanCmpCallback: DFSanCmpCallback: PC
CHECK_DFSanSwitchCallback: DFSanSwitchCallback: PC
CHECK_DFSanMemcmpCallback: DFSanMemcmpCallback: Pos
-RUN: not LLVMFuzzer-SimpleCmpTest-DFSan -use_traces=1 -seed=1 -runs=1000000 -timeout=5 2>&1 | FileCheck %s --check-prefix=CHECK1
+RUN: not LLVMFuzzer-SimpleCmpTest-DFSan -use_traces=1 -seed=1 -runs=10000000 -timeout=5 2>&1 | FileCheck %s --check-prefix=CHECK1
RUN: LLVMFuzzer-SimpleCmpTest-DFSan -use_traces=1 -seed=1 -runs=100 -timeout=5 -verbosity=3 2>&1 | FileCheck %s -check-prefix=CHECK_DFSanCmpCallback
-RUN: not LLVMFuzzer-MemcmpTest-DFSan -use_traces=1 -seed=1 -runs=10000 -timeout=5 2>&1 | FileCheck %s --check-prefix=CHECK2
+RUN: not LLVMFuzzer-MemcmpTest-DFSan -use_traces=1 -seed=1 -runs=100000 -timeout=5 2>&1 | FileCheck %s --check-prefix=CHECK2
RUN: LLVMFuzzer-MemcmpTest-DFSan -use_traces=1 -seed=1 -runs=2 -timeout=5 -verbosity=3 2>&1 | FileCheck %s -check-prefix=CHECK_DFSanMemcmpCallback
RUN: not LLVMFuzzer-StrncmpTest-DFSan -use_traces=1 -seed=1 -runs=10000 -timeout=5 2>&1 | FileCheck %s --check-prefix=CHECK3
diff --git a/lib/Fuzzer/test/fuzzer-dirs.test b/lib/Fuzzer/test/fuzzer-dirs.test
new file mode 100644
index 000000000000..3eaaf6b6bb5b
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-dirs.test
@@ -0,0 +1,12 @@
+RUN: rm -rf %t/SUB1
+RUN: mkdir -p %t/SUB1/SUB2/SUB3
+RUN: echo a > %t/SUB1/a
+RUN: echo b > %t/SUB1/SUB2/b
+RUN: echo c > %t/SUB1/SUB2/SUB3/c
+RUN: LLVMFuzzer-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=SUBDIRS
+SUBDIRS: READ units: 3
+RUN: rm -rf %t/SUB1
+
+RUN: not LLVMFuzzer-SimpleTest NONEXISTENT_DIR 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR
+NONEXISTENT_DIR: No such directory: NONEXISTENT_DIR; exiting
+
diff --git a/lib/Fuzzer/test/fuzzer-fdmask.test b/lib/Fuzzer/test/fuzzer-fdmask.test
new file mode 100644
index 000000000000..abbc4bd6412f
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-fdmask.test
@@ -0,0 +1,30 @@
+RUN: LLVMFuzzer-SpamyTest -runs=1 2>&1 | FileCheck %s --check-prefix=FD_MASK_0
+RUN: LLVMFuzzer-SpamyTest -runs=1 -close_fd_mask=0 2>&1 | FileCheck %s --check-prefix=FD_MASK_0
+RUN: LLVMFuzzer-SpamyTest -runs=1 -close_fd_mask=1 2>&1 | FileCheck %s --check-prefix=FD_MASK_1
+RUN: LLVMFuzzer-SpamyTest -runs=1 -close_fd_mask=2 2>&1 | FileCheck %s --check-prefix=FD_MASK_2
+RUN: LLVMFuzzer-SpamyTest -runs=1 -close_fd_mask=3 2>&1 | FileCheck %s --check-prefix=FD_MASK_3
+
+FD_MASK_0: PRINTF_STDOUT
+FD_MASK_0: PRINTF_STDERR
+FD_MASK_0: STREAM_COUT
+FD_MASK_0: STREAM_CERR
+FD_MASK_0: INITED
+
+FD_MASK_1-NOT: PRINTF_STDOUT
+FD_MASK_1: PRINTF_STDERR
+FD_MASK_1-NOT: STREAM_COUT
+FD_MASK_1: STREAM_CERR
+FD_MASK_1: INITED
+
+FD_MASK_2: PRINTF_STDOUT
+FD_MASK_2-NOT: PRINTF_STDERR
+FD_MASK_2: STREAM_COUT
+FD_MASK_2-NOT: STREAM_CERR
+FD_MASK_2: INITED
+
+FD_MASK_3-NOT: PRINTF_STDOUT
+FD_MASK_3-NOT: PRINTF_STDERR
+FD_MASK_3-NOT: STREAM_COUT
+FD_MASK_3-NOT: STREAM_CERR
+FD_MASK_3: INITED
+
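
For context on the FD_MASK checks above: -close_fd_mask=1 silences stdout, =2 silences stderr, and =3 silences both. The SpamyTest target itself is not part of this hunk; a minimal hypothetical sketch that would emit the four markers matched here could look roughly like this (the file name and exact behavior are assumptions, not the actual source):

    // Hypothetical sketch, not the actual SpamyTest.cpp: one marker per output channel.
    #include <cstdint>
    #include <cstddef>
    #include <cstdio>
    #include <iostream>

    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      printf("PRINTF_STDOUT\n");
      fflush(stdout);
      fprintf(stderr, "PRINTF_STDERR\n");
      std::cout << "STREAM_COUT" << std::endl;
      std::cerr << "STREAM_CERR" << std::endl;
      return 0;
    }
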
diff --git a/lib/Fuzzer/test/fuzzer-finalstats.test b/lib/Fuzzer/test/fuzzer-finalstats.test
new file mode 100644
index 000000000000..1cbcd10f0498
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-finalstats.test
@@ -0,0 +1,11 @@
+RUN: LLVMFuzzer-SimpleTest -seed=1 -runs=77 -print_final_stats=1 2>&1 | FileCheck %s --check-prefix=FINAL_STATS
+FINAL_STATS: stat::number_of_executed_units: 77
+FINAL_STATS: stat::average_exec_per_sec: 0
+FINAL_STATS: stat::new_units_added:
+FINAL_STATS: stat::slowest_unit_time_sec: 0
+FINAL_STATS: stat::peak_rss_mb:
+
+RUN: LLVMFuzzer-SimpleTest %S/dict1.txt -runs=33 -print_final_stats=1 2>&1 | FileCheck %s --check-prefix=FINAL_STATS1
+FINAL_STATS1: stat::number_of_executed_units: 33
+FINAL_STATS1: stat::peak_rss_mb:
+
diff --git a/lib/Fuzzer/test/fuzzer-flags.test b/lib/Fuzzer/test/fuzzer-flags.test
new file mode 100644
index 000000000000..a94faf20a586
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-flags.test
@@ -0,0 +1,8 @@
+RUN: LLVMFuzzer-SimpleTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR
+FOO_BAR: WARNING: unrecognized flag '-foo_bar=1'; use -help=1 to list all flags
+FOO_BAR: BINGO
+
+RUN: LLVMFuzzer-SimpleTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH
+DASH_DASH: WARNING: did you mean '-max_len=100' (single dash)?
+DASH_DASH: INFO: A corpus is not provided, starting from an empty corpus
+
diff --git a/lib/Fuzzer/test/fuzzer-fn-adapter.test b/lib/Fuzzer/test/fuzzer-fn-adapter.test
new file mode 100644
index 000000000000..0ea96f3f9f00
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-fn-adapter.test
@@ -0,0 +1,3 @@
+RUN: LLVMFuzzer-SimpleFnAdapterTest 2>&1 | FileCheck %s
+
+CHECK: BINGO
diff --git a/lib/Fuzzer/test/fuzzer-leak.test b/lib/Fuzzer/test/fuzzer-leak.test
new file mode 100644
index 000000000000..59ba02cd7d28
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-leak.test
@@ -0,0 +1,31 @@
+REQUIRES: lsan
+RUN: not LLVMFuzzer-LeakTest -runs=100000 -detect_leaks=1 2>&1 | FileCheck %s --check-prefix=LEAK_DURING
+LEAK_DURING: ERROR: LeakSanitizer: detected memory leaks
+LEAK_DURING: Direct leak of 4 byte(s) in 1 object(s) allocated from:
+LEAK_DURING: INFO: to ignore leaks on libFuzzer side use -detect_leaks=0
+LEAK_DURING: Test unit written to ./leak-
+LEAK_DURING-NOT: DONE
+LEAK_DURING-NOT: Done
+
+RUN: not LLVMFuzzer-LeakTest -runs=0 -detect_leaks=1 %S 2>&1 | FileCheck %s --check-prefix=LEAK_IN_CORPUS
+LEAK_IN_CORPUS: ERROR: LeakSanitizer: detected memory leaks
+LEAK_IN_CORPUS: INFO: a leak has been found in the initial corpus.
+
+
+RUN: not LLVMFuzzer-LeakTest -runs=100000 -detect_leaks=0 2>&1 | FileCheck %s --check-prefix=LEAK_AFTER
+RUN: not LLVMFuzzer-LeakTest -runs=100000 2>&1 | FileCheck %s --check-prefix=LEAK_DURING
+RUN: not LLVMFuzzer-ThreadedLeakTest -runs=100000 -detect_leaks=0 2>&1 | FileCheck %s --check-prefix=LEAK_AFTER
+RUN: not LLVMFuzzer-ThreadedLeakTest -runs=100000 2>&1 | FileCheck %s --check-prefix=LEAK_DURING
+LEAK_AFTER: Done 100000 runs in
+LEAK_AFTER: ERROR: LeakSanitizer: detected memory leaks
+
+RUN: not LLVMFuzzer-LeakTest -runs=100000 -max_len=1 2>&1 | FileCheck %s --check-prefix=MAX_LEN_1
+MAX_LEN_1: Test unit written to ./leak-7cf184f4c67ad58283ecb19349720b0cae756829
+
+RUN: not LLVMFuzzer-LeakTimeoutTest -timeout=1 2>&1 | FileCheck %s --check-prefix=LEAK_TIMEOUT
+LEAK_TIMEOUT: ERROR: libFuzzer: timeout after
+LEAK_TIMEOUT-NOT: LeakSanitizer
+
+RUN: LLVMFuzzer-AccumulateAllocationsTest -detect_leaks=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=ACCUMULATE_ALLOCS
+ACCUMULATE_ALLOCS: INFO: libFuzzer disabled leak detection after every mutation
+
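
The LEAK_DURING pattern ("Direct leak of 4 byte(s) in 1 object(s)") corresponds to a single leaked int. LeakTest.cpp is not shown in this patch; a hypothetical target of that shape, mirroring the ThreadedLeakTest.cpp added above but without the extra thread, might be:

    // Hypothetical sketch, not the actual LeakTest.cpp from this patch.
    #include <cstdint>
    #include <cstddef>

    static volatile int *Sink;

    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      if (Size > 0 && Data[0] == 'H')
        Sink = new int;  // Each new allocation orphans the previous one, which leaks.
      return 0;
    }
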
diff --git a/lib/Fuzzer/test/fuzzer-oom-with-profile.test b/lib/Fuzzer/test/fuzzer-oom-with-profile.test
new file mode 100644
index 000000000000..391fd4bb0ff6
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-oom-with-profile.test
@@ -0,0 +1,6 @@
+REQUIRES: linux
+RUN: not LLVMFuzzer-OutOfMemoryTest -rss_limit_mb=10 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: out-of-memory (used: {{.*}}; limit: 10Mb)
+CHECK: Live Heap Allocations
+CHECK: Test unit written to ./oom-
+CHECK: SUMMARY: libFuzzer: out-of-memory
diff --git a/lib/Fuzzer/test/fuzzer-oom.test b/lib/Fuzzer/test/fuzzer-oom.test
new file mode 100644
index 000000000000..4cdff2142fda
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-oom.test
@@ -0,0 +1,4 @@
+RUN: not LLVMFuzzer-OutOfMemoryTest -rss_limit_mb=10 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: out-of-memory (used: {{.*}}; limit: 10Mb)
+CHECK: Test unit written to ./oom-
+CHECK: SUMMARY: libFuzzer: out-of-memory
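
OutOfMemoryTest.cpp is likewise not part of this hunk. For the -rss_limit_mb=10 check to fire, the target has to keep live, touched allocations around so RSS keeps growing; a hypothetical shape (names and trigger byte are assumptions) is:

    // Hypothetical sketch, not the actual OutOfMemoryTest.cpp from this patch.
    #include <cstdint>
    #include <cstddef>
    #include <vector>

    static std::vector<char *> Live;  // Keep allocations reachable so RSS grows.

    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      if (Size > 0 && Data[0] == 'H') {
        char *P = new char[1 << 20];
        for (size_t I = 0; I < (1 << 20); I += 4096)
          P[I] = 1;  // Touch pages so they count toward RSS.
        Live.push_back(P);
      }
      return 0;
    }
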
diff --git a/lib/Fuzzer/test/fuzzer-printcovpcs.test b/lib/Fuzzer/test/fuzzer-printcovpcs.test
new file mode 100644
index 000000000000..70b22c7a54b9
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-printcovpcs.test
@@ -0,0 +1,5 @@
+RUN: LLVMFuzzer-SimpleTest -print_new_cov_pcs=1 2>&1 | FileCheck %s --check-prefix=PCS
+PCS:{{^0x[a-f0-9]+}}
+PCS:NEW
+PCS:BINGO
+
diff --git a/lib/Fuzzer/test/fuzzer-prunecorpus.test b/lib/Fuzzer/test/fuzzer-prunecorpus.test
new file mode 100644
index 000000000000..a8a660e91b9a
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-prunecorpus.test
@@ -0,0 +1,13 @@
+RUN: rm -rf %t/PruneCorpus
+RUN: mkdir -p %t/PruneCorpus
+RUN: echo a > %t/PruneCorpus/a
+RUN: echo b > %t/PruneCorpus/b
+RUN: LLVMFuzzer-EmptyTest %t/PruneCorpus -prune_corpus=1 -runs=0 2>&1 | FileCheck %s --check-prefix=PRUNE
+RUN: LLVMFuzzer-EmptyTest %t/PruneCorpus -prune_corpus=0 -runs=0 2>&1 | FileCheck %s --check-prefix=NOPRUNE
+RUN: rm -rf %t/PruneCorpus
+
+PRUNE: READ units: 2
+PRUNE: INITED{{.*}}units: 1
+NOPRUNE: READ units: 2
+NOPRUNE: INITED{{.*}}units: 2
+
diff --git a/lib/Fuzzer/test/fuzzer-runs.test b/lib/Fuzzer/test/fuzzer-runs.test
new file mode 100644
index 000000000000..056c44782a15
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-runs.test
@@ -0,0 +1,8 @@
+RUN: mkdir -p %t
+RUN: echo abcd > %t/NthRunCrashTest.in
+RUN: LLVMFuzzer-NthRunCrashTest %t/NthRunCrashTest.in
+RUN: LLVMFuzzer-NthRunCrashTest %t/NthRunCrashTest.in -runs=10
+RUN: not LLVMFuzzer-NthRunCrashTest %t/NthRunCrashTest.in -runs=10000 2>&1 | FileCheck %s
+RUN: rm %t/NthRunCrashTest.in
+CHECK: BINGO
+
diff --git a/lib/Fuzzer/test/fuzzer-seed.test b/lib/Fuzzer/test/fuzzer-seed.test
new file mode 100644
index 000000000000..f1bdf9e4ae94
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-seed.test
@@ -0,0 +1,3 @@
+RUN: LLVMFuzzer-SimpleCmpTest -seed=-1 -runs=0 2>&1 | FileCheck %s --check-prefix=CHECK_SEED_MINUS_ONE
+CHECK_SEED_MINUS_ONE: Seed: 4294967295
+
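
The expected "Seed: 4294967295" follows from -seed=-1 being stored as an unsigned 32-bit value (an assumption implied by the expected output, not spelled out in this patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Seed = static_cast<uint32_t>(-1);  // -1 wraps to 2^32 - 1.
      printf("Seed: %u\n", Seed);                 // Prints "Seed: 4294967295".
      return 0;
    }
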
diff --git a/lib/Fuzzer/test/fuzzer-segv.test b/lib/Fuzzer/test/fuzzer-segv.test
new file mode 100644
index 000000000000..330f03bcc494
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-segv.test
@@ -0,0 +1,5 @@
+RUN: ASAN_OPTIONS=handle_segv=0 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_OWN_SEGV_HANDLER
+LIBFUZZER_OWN_SEGV_HANDLER: == ERROR: libFuzzer: deadly signal
+LIBFUZZER_OWN_SEGV_HANDLER: SUMMARY: libFuzzer: deadly signal
+LIBFUZZER_OWN_SEGV_HANDLER: Test unit written to ./crash-
+
diff --git a/lib/Fuzzer/test/fuzzer-singleinputs.test b/lib/Fuzzer/test/fuzzer-singleinputs.test
new file mode 100644
index 000000000000..a4faf2cea50c
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-singleinputs.test
@@ -0,0 +1,13 @@
+RUN: not LLVMFuzzer-NullDerefTest %S/hi.txt 2>&1 | FileCheck %s --check-prefix=SingleInput
+SingleInput-NOT: Test unit written to ./crash-
+
+RUN: rm -rf %tmp/SINGLE_INPUTS
+RUN: mkdir -p %tmp/SINGLE_INPUTS
+RUN: echo aaa > %tmp/SINGLE_INPUTS/aaa
+RUN: echo bbb > %tmp/SINGLE_INPUTS/bbb
+RUN: LLVMFuzzer-SimpleTest %tmp/SINGLE_INPUTS/aaa %tmp/SINGLE_INPUTS/bbb 2>&1 | FileCheck %s --check-prefix=SINGLE_INPUTS
+RUN: rm -rf %tmp/SINGLE_INPUTS
+SINGLE_INPUTS: LLVMFuzzer-SimpleTest: Running 2 inputs 1 time(s) each.
+SINGLE_INPUTS: aaa in
+SINGLE_INPUTS: bbb in
+
diff --git a/lib/Fuzzer/test/fuzzer-timeout.test b/lib/Fuzzer/test/fuzzer-timeout.test
index c3a9e8a3a9e0..8e8b713fcd79 100644
--- a/lib/Fuzzer/test/fuzzer-timeout.test
+++ b/lib/Fuzzer/test/fuzzer-timeout.test
@@ -7,7 +7,8 @@ TimeoutTest: #1
TimeoutTest: #2
TimeoutTest: SUMMARY: libFuzzer: timeout
-RUN: not LLVMFuzzer-TimeoutTest -timeout=1 -test_single_input=%S/hi.txt 2>&1 | FileCheck %s --check-prefix=SingleInputTimeoutTest
-SingleInputTimeoutTest: ALARM: working on the last Unit for
+RUN: not LLVMFuzzer-TimeoutTest -timeout=1 %S/hi.txt 2>&1 | FileCheck %s --check-prefix=SingleInputTimeoutTest
+SingleInputTimeoutTest: ALARM: working on the last Unit for {{[1-3]}} seconds
SingleInputTimeoutTest-NOT: Test unit written to ./timeout-
+RUN: LLVMFuzzer-TimeoutTest -timeout=1 -timeout_exitcode=0
diff --git a/lib/Fuzzer/test/fuzzer-trace-pc.test b/lib/Fuzzer/test/fuzzer-trace-pc.test
new file mode 100644
index 000000000000..673249d04786
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-trace-pc.test
@@ -0,0 +1,7 @@
+CHECK: BINGO
+REQUIRES: linux
+RUN: not LLVMFuzzer-FourIndependentBranchesTest-TracePC -seed=1 -runs=1000000 2>&1 | FileCheck %s
+// FIXME: The test below uses a significant amount of memory on OSX and
+// sometimes hits the 2GiB memory limit. This needs to be investigated. For now
+// only run the test on Linux.
+RUN: not LLVMFuzzer-FullCoverageSetTest-TracePC -seed=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test
new file mode 100644
index 000000000000..71fe6f2daf11
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test
@@ -0,0 +1,25 @@
+// FIXME: Support for sanitizer hooks for memcmp and strcmp needs
+// to be implemented in the sanitizer runtime for platforms other
+// than Linux.
+REQUIRES: linux
+CHECK: BINGO
+Done1000000: Done 1000000 runs in
+
+RUN: not LLVMFuzzer-MemcmpTest -seed=4294967295 -runs=100000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-MemcmpTest -use_memcmp=0 -seed=4294967295 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+
+RUN: not LLVMFuzzer-StrncmpTest -seed=2 -runs=100000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-StrncmpTest -use_memcmp=0 -seed=3 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+
+RUN: not LLVMFuzzer-StrcmpTest -seed=4 -runs=200000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-StrcmpTest -use_memcmp=0 -seed=5 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+
+RUN: not LLVMFuzzer-StrstrTest -seed=6 -runs=200000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-StrstrTest -use_memmem=0 -seed=7 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+
+RUN: LLVMFuzzer-RepeatedMemcmp -seed=10 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT
+RECOMMENDED_DICT:###### Recommended dictionary. ######
+RECOMMENDED_DICT-DAG: "foo"
+RECOMMENDED_DICT-DAG: "bar"
+RECOMMENDED_DICT:###### End of recommended dictionary. ######
+
diff --git a/lib/Fuzzer/test/fuzzer-traces.test b/lib/Fuzzer/test/fuzzer-traces.test
index 3b8639b8e941..2d7729536643 100644
--- a/lib/Fuzzer/test/fuzzer-traces.test
+++ b/lib/Fuzzer/test/fuzzer-traces.test
@@ -1,20 +1,9 @@
CHECK: BINGO
Done1000000: Done 1000000 runs in
-Done10000000: Done 10000000 runs in
-
RUN: not LLVMFuzzer-SimpleCmpTest -use_traces=1 -seed=1 -runs=10000001 2>&1 | FileCheck %s
-RUN: not LLVMFuzzer-MemcmpTest -use_traces=1 -seed=4294967295 -runs=100000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-MemcmpTest -seed=4294967295 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: not LLVMFuzzer-StrncmpTest -use_traces=1 -seed=1 -runs=100000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-StrncmpTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: not LLVMFuzzer-StrcmpTest -use_traces=1 -seed=1 -runs=200000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-StrcmpTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: not LLVMFuzzer-SwitchTest -use_traces=1 -seed=1 -runs=1000002 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-SwitchTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
+RUN: not LLVMFuzzer-SwitchTest -use_traces=1 -seed=6 -runs=1000002 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-SwitchTest -seed=7 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-RUN: not LLVMFuzzer-SimpleHashTest -use_traces=1 -seed=1 -runs=10000000 2>&1 | FileCheck %s
-RUN: LLVMFuzzer-SimpleHashTest -seed=1 -runs=10000000 2>&1 | FileCheck %s --check-prefix=Done10000000
+RUN: not LLVMFuzzer-SimpleHashTest -use_traces=1 -seed=8 -runs=1000000 -max_len=16 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-SimpleHashTest -seed=9 -runs=1000000 -max_len=16 2>&1 | FileCheck %s --check-prefix=Done1000000
diff --git a/lib/Fuzzer/test/fuzzer-trunc.test b/lib/Fuzzer/test/fuzzer-trunc.test
new file mode 100644
index 000000000000..ebab7b863a0f
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-trunc.test
@@ -0,0 +1,10 @@
+# Test truncate_units option.
+RUN: rm -rf FuzzerTruncateTestCORPUS
+RUN: mkdir FuzzerTruncateTestCORPUS
+RUN: echo "01234567890123456789012345678901234567890" > FuzzerTruncateTestCORPUS/unit1
+# Simply running a fuzzer won't produce new results
+RUN: LLVMFuzzer-EmptyTest -seed=1 -runs=100 -truncate_units=0 ./FuzzerTruncateTestCORPUS
+# Truncating would create a new unit of length 1.
+RUN: LLVMFuzzer-EmptyTest -seed=1 -runs=0 -truncate_units=1 ./FuzzerTruncateTestCORPUS
+RUN: find FuzzerTruncateTestCORPUS/b6589fc6ab0dc82cf12099d1c2d40ab994e8410c
+RUN: rm -rf FuzzerTruncateTestCORPUS
diff --git a/lib/Fuzzer/test/fuzzer-ubsan.test b/lib/Fuzzer/test/fuzzer-ubsan.test
new file mode 100644
index 000000000000..0e8ad6c94a1b
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-ubsan.test
@@ -0,0 +1,4 @@
+RUN: not LLVMFuzzer-SignedIntOverflowTest-Ubsan 2>&1 | FileCheck %s
+CHECK: runtime error: signed integer overflow: 2147483647 + 1 cannot be represented in type 'int'
+CHECK: Test unit written to ./crash-
+
diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test
index c63014f59d62..11343ae3834a 100644
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@@ -2,19 +2,25 @@ CHECK: BINGO
Done1000000: Done 1000000 runs in
RUN: LLVMFuzzer-SimpleTest 2>&1 | FileCheck %s
-RUN: not LLVMFuzzer-NullDerefTest -test_single_input=%S/hi.txt 2>&1 | FileCheck %s --check-prefix=SingleInput
-SingleInput-NOT: Test unit written to ./crash-
+
+# only_ascii mode. Will perform some minimal self-validation.
+RUN: LLVMFuzzer-SimpleTest -only_ascii=1 2>&1
RUN: LLVMFuzzer-SimpleCmpTest -max_total_time=1 2>&1 | FileCheck %s --check-prefix=MaxTotalTime
MaxTotalTime: Done {{.*}} runs in {{.}} second(s)
-RUN: not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=NullDerefTest
+RUN: not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=NullDerefTest
+RUN: not LLVMFuzzer-NullDerefTest -close_fd_mask=3 2>&1 | FileCheck %s --check-prefix=NullDerefTest
+NullDerefTest: ERROR: AddressSanitizer: SEGV on unknown address
NullDerefTest: Test unit written to ./crash-
RUN: not LLVMFuzzer-NullDerefTest -artifact_prefix=ZZZ 2>&1 | FileCheck %s --check-prefix=NullDerefTestPrefix
NullDerefTestPrefix: Test unit written to ZZZcrash-
RUN: not LLVMFuzzer-NullDerefTest -artifact_prefix=ZZZ -exact_artifact_path=FOOBAR 2>&1 | FileCheck %s --check-prefix=NullDerefTestExactPath
NullDerefTestExactPath: Test unit written to FOOBAR
+RUN: not LLVMFuzzer-NullDerefOnEmptyTest -print_final_stats=1 2>&1 | FileCheck %s --check-prefix=NULL_DEREF_ON_EMPTY
+NULL_DEREF_ON_EMPTY: stat::number_of_executed_units:
+
#not LLVMFuzzer-FullCoverageSetTest -timeout=15 -seed=1 -mutate_depth=2 -use_full_coverage_set=1 2>&1 | FileCheck %s
RUN: not LLVMFuzzer-CounterTest -use_counters=1 -max_len=6 -seed=1 -timeout=15 2>&1 | FileCheck %s
@@ -23,14 +29,15 @@ RUN: not LLVMFuzzer-CallerCalleeTest -cross_over=0 -max_len=
# This one is flaky, may actually find the goal even w/o use_indir_calls.
# LLVMFuzzer-CallerCalleeTest -use_indir_calls=0 -cross_over=0 -max_len=6 -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=Done1000000
-
-RUN: not LLVMFuzzer-UserSuppliedFuzzerTest -seed=1 -timeout=15 2>&1 | FileCheck %s
-
RUN: not LLVMFuzzer-UninstrumentedTest-Uninstrumented 2>&1 | FileCheck %s --check-prefix=UNINSTRUMENTED
UNINSTRUMENTED: ERROR: __sanitizer_set_death_callback is not defined. Exiting.
-RUN: LLVMFuzzer-SimpleTest -print_new_cov_pcs=1 2>&1 | FileCheck %s --check-prefix=PCS
-PCS:{{^0x[a-f0-9]+}}
-PCS:NEW
-PCS:BINGO
+RUN: not LLVMFuzzer-UninstrumentedTest-NoCoverage 2>&1 | FileCheck %s --check-prefix=NO_COVERAGE
+NO_COVERAGE: ERROR: no interesting inputs were found. Is the code instrumented for coverage? Exiting
+
+RUN: not LLVMFuzzer-BufferOverflowOnInput 2>&1 | FileCheck %s --check-prefix=OOB
+OOB: AddressSanitizer: heap-buffer-overflow
+OOB: is located 0 bytes to the right of 3-byte region
+
+RUN: not LLVMFuzzer-InitializeTest 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/lit.cfg b/lib/Fuzzer/test/lit.cfg
index 2140a97668b3..e262e3c25850 100644
--- a/lib/Fuzzer/test/lit.cfg
+++ b/lib/Fuzzer/test/lit.cfg
@@ -1,4 +1,5 @@
import lit.formats
+import sys
config.name = "LLVMFuzzer"
config.test_format = lit.formats.ShTest(True)
@@ -13,3 +14,22 @@ path = os.path.pathsep.join((llvm_tools_dir, config.test_exec_root,
config.environment['PATH']))
config.environment['PATH'] = path
+if config.has_dfsan:
+ lit_config.note('dfsan feature available')
+ config.available_features.add('dfsan')
+else:
+ lit_config.note('dfsan feature unavailable')
+
+if config.has_lsan:
+ lit_config.note('lsan feature available')
+ config.available_features.add('lsan')
+else:
+ lit_config.note('lsan feature unavailable')
+
+if sys.platform.startswith('linux'):
+ # Note the value of ``sys.platform`` is not consistent
+ # between python 2 and 3, hence the use of ``.startswith()``.
+ lit_config.note('linux feature available')
+ config.available_features.add('linux')
+else:
+ lit_config.note('linux feature unavailable')
diff --git a/lib/Fuzzer/test/lit.site.cfg.in b/lib/Fuzzer/test/lit.site.cfg.in
index e520db8e881d..95ad6d0ab178 100644
--- a/lib/Fuzzer/test/lit.site.cfg.in
+++ b/lib/Fuzzer/test/lit.site.cfg.in
@@ -1,3 +1,5 @@
config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.has_dfsan = True if @HAS_DFSAN@ == 1 else False
+config.has_lsan = True if @HAS_LSAN@ == 1 else False
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test
index 57ecc141bbfe..6f19e21d132d 100644
--- a/lib/Fuzzer/test/merge.test
+++ b/lib/Fuzzer/test/merge.test
@@ -8,8 +8,8 @@ RUN: echo ..Z... > %tmp/T1/3
# T1 has 3 elements, T2 is empty.
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=CHECK1
-CHECK1: Merge: running the initial corpus {{.*}} of 3 units
-CHECK1: Merge: written 0 out of 0 units
+CHECK1: === Minimizing the initial corpus of 3 units
+CHECK1: === Merge: written 0 units
RUN: echo ...Z.. > %tmp/T2/1
RUN: echo ....E. > %tmp/T2/2
@@ -20,10 +20,11 @@ RUN: echo ..Z... > %tmp/T2/c
# T1 has 3 elements, T2 has 6 elements, only 3 are new.
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=CHECK2
-CHECK2: Merge: running the initial corpus {{.*}} of 3 units
-CHECK2: Merge: written 3 out of 6 units
+CHECK2: === Minimizing the initial corpus of 3 units
+CHECK2: === Merging extra 6 units
+CHECK2: === Merge: written 3 units
# Now, T1 has 6 units and T2 has no new interesting units.
RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=CHECK3
-CHECK3: Merge: running the initial corpus {{.*}} of 6 units
-CHECK3: Merge: written 0 out of 6 units
+CHECK3: === Minimizing the initial corpus of 6 units
+CHECK3: === Merge: written 0 units
diff --git a/lib/Fuzzer/test/no-coverage/CMakeLists.txt b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
new file mode 100644
index 000000000000..1dc7d15926ca
--- /dev/null
+++ b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
@@ -0,0 +1,16 @@
+# These tests are not instrumented with coverage,
+# but have coverage rt in the binary.
+
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters")
+
+set(NoCoverageTests
+ UninstrumentedTest
+ )
+
+foreach(Test ${NoCoverageTests})
+ add_libfuzzer_test(${Test}-NoCoverage SOURCES ../${Test}.cpp)
+endforeach()
+
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/trace-bb/CMakeLists.txt b/lib/Fuzzer/test/trace-bb/CMakeLists.txt
index 99af019565b5..fd168c4515b9 100644
--- a/lib/Fuzzer/test/trace-bb/CMakeLists.txt
+++ b/lib/Fuzzer/test/trace-bb/CMakeLists.txt
@@ -1,14 +1,15 @@
# These tests are not instrumented with coverage.
-set(CMAKE_CXX_FLAGS_RELEASE
+set(CMAKE_CXX_FLAGS
"${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=edge,trace-bb")
+set(TraceBBTests
+ SimpleTest
+ )
+
foreach(Test ${TraceBBTests})
- add_executable(LLVMFuzzer-${Test}-TraceBB
- ../${Test}.cpp
- )
- target_link_libraries(LLVMFuzzer-${Test}-TraceBB
- LLVMFuzzer
- )
+ add_libfuzzer_test(${Test}-TraceBB SOURCES ../${Test}.cpp)
endforeach()
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/trace-pc/CMakeLists.txt b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
new file mode 100644
index 000000000000..cf18278ac64b
--- /dev/null
+++ b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
@@ -0,0 +1,16 @@
+# These tests are not instrumented with coverage.
+
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=8bit-counters -fsanitize-coverage=trace-pc")
+
+set(TracePCTests
+ FourIndependentBranchesTest
+ FullCoverageSetTest
+ )
+
+foreach(Test ${TracePCTests})
+ add_libfuzzer_test(${Test}-TracePC SOURCES ../${Test}.cpp)
+endforeach()
+
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/ubsan/CMakeLists.txt b/lib/Fuzzer/test/ubsan/CMakeLists.txt
new file mode 100644
index 000000000000..7a9eacdbe7df
--- /dev/null
+++ b/lib/Fuzzer/test/ubsan/CMakeLists.txt
@@ -0,0 +1,15 @@
+# These tests are instrumented with ubsan in non-recovery mode.
+
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fsanitize=undefined -fno-sanitize-recover=all")
+
+set(UbsanTests
+ SignedIntOverflowTest
+ )
+
+foreach(Test ${UbsanTests})
+ add_libfuzzer_test(${Test}-Ubsan SOURCES ../${Test}.cpp)
+endforeach()
+
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
index 443ba3716f66..06e48985e7ef 100644
--- a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
+++ b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
@@ -1,14 +1,16 @@
-# These tests are not instrumented with coverage.
+# These tests are not instrumented with coverage and don't
+# have coverage rt in the binary.
-set(CMAKE_CXX_FLAGS_RELEASE
- "${LIBFUZZER_FLAGS_BASE} -O0 -fno-sanitize=all")
+set(CMAKE_CXX_FLAGS
+ "${LIBFUZZER_FLAGS_BASE} -fno-sanitize=all -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters")
+
+set(UninstrumentedTests
+ UninstrumentedTest
+ )
foreach(Test ${UninstrumentedTests})
- add_executable(LLVMFuzzer-${Test}-Uninstrumented
- ../${Test}.cpp
- )
- target_link_libraries(LLVMFuzzer-${Test}-Uninstrumented
- LLVMFuzzer
- )
+ add_libfuzzer_test(${Test}-Uninstrumented SOURCES ../${Test}.cpp)
endforeach()
+# Propagate value into parent directory
+set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 0ce44e105cc3..9b2399dd880c 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -102,6 +102,11 @@ static OrderMap orderModule(const Module *M) {
orderValue(A.getAliasee(), OM);
orderValue(&A, OM);
}
+ for (const GlobalIFunc &I : M->ifuncs()) {
+ if (!isa<GlobalValue>(I.getResolver()))
+ orderValue(I.getResolver(), OM);
+ orderValue(&I, OM);
+ }
for (const Function &F : *M) {
for (const Use &U : F.operands())
if (!isa<GlobalValue>(U.get()))
@@ -249,11 +254,15 @@ static UseListOrderStack predictUseListOrder(const Module *M) {
predictValueUseListOrder(&F, nullptr, OM, Stack);
for (const GlobalAlias &A : M->aliases())
predictValueUseListOrder(&A, nullptr, OM, Stack);
+ for (const GlobalIFunc &I : M->ifuncs())
+ predictValueUseListOrder(&I, nullptr, OM, Stack);
for (const GlobalVariable &G : M->globals())
if (G.hasInitializer())
predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
for (const GlobalAlias &A : M->aliases())
predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
+ for (const GlobalIFunc &I : M->ifuncs())
+ predictValueUseListOrder(I.getResolver(), nullptr, OM, Stack);
for (const Function &F : *M)
for (const Use &U : F.operands())
predictValueUseListOrder(U.get(), nullptr, OM, Stack);
@@ -307,15 +316,23 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
case CallingConv::MSP430_INTR: Out << "msp430_intrcc"; break;
+ case CallingConv::AVR_INTR: Out << "avr_intrcc "; break;
+ case CallingConv::AVR_SIGNAL: Out << "avr_signalcc "; break;
case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break;
case CallingConv::PTX_Device: Out << "ptx_device"; break;
case CallingConv::X86_64_SysV: Out << "x86_64_sysvcc"; break;
case CallingConv::X86_64_Win64: Out << "x86_64_win64cc"; break;
case CallingConv::SPIR_FUNC: Out << "spir_func"; break;
case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break;
+ case CallingConv::Swift: Out << "swiftcc"; break;
case CallingConv::X86_INTR: Out << "x86_intrcc"; break;
case CallingConv::HHVM: Out << "hhvmcc"; break;
case CallingConv::HHVM_C: Out << "hhvm_ccc"; break;
+ case CallingConv::AMDGPU_VS: Out << "amdgpu_vs"; break;
+ case CallingConv::AMDGPU_GS: Out << "amdgpu_gs"; break;
+ case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break;
+ case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break;
+ case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
}
}
@@ -664,6 +681,9 @@ private:
/// Add all of the functions arguments, basic blocks, and instructions.
void processFunction();
+ /// Add the metadata directly attached to a GlobalObject.
+ void processGlobalObjectMetadata(const GlobalObject &GO);
+
/// Add all of the metadata from a function.
void processFunctionMetadata(const Function &F);
@@ -681,14 +701,25 @@ ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
ModuleSlotTracker::ModuleSlotTracker(const Module *M,
bool ShouldInitializeAllMetadata)
- : MachineStorage(M ? new SlotTracker(M, ShouldInitializeAllMetadata)
- : nullptr),
- M(M), Machine(MachineStorage.get()) {}
+ : ShouldCreateStorage(M),
+ ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), M(M) {}
ModuleSlotTracker::~ModuleSlotTracker() {}
+SlotTracker *ModuleSlotTracker::getMachine() {
+ if (!ShouldCreateStorage)
+ return Machine;
+
+ ShouldCreateStorage = false;
+ MachineStorage =
+ llvm::make_unique<SlotTracker>(M, ShouldInitializeAllMetadata);
+ Machine = MachineStorage.get();
+ return Machine;
+}
+
void ModuleSlotTracker::incorporateFunction(const Function &F) {
- if (!Machine)
+ // Using getMachine() may lazily create the slot tracker.
+ if (!getMachine())
return;
// Nothing to do if this is the right function already.
@@ -722,6 +753,9 @@ static SlotTracker *createSlotTracker(const Value *V) {
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
return new SlotTracker(GA->getParent());
+ if (const GlobalIFunc *GIF = dyn_cast<GlobalIFunc>(V))
+ return new SlotTracker(GIF->getParent());
+
if (const Function *Func = dyn_cast<Function>(V))
return new SlotTracker(Func);
@@ -768,6 +802,7 @@ void SlotTracker::processModule() {
for (const GlobalVariable &Var : TheModule->globals()) {
if (!Var.hasName())
CreateModuleSlot(&Var);
+ processGlobalObjectMetadata(Var);
}
for (const GlobalAlias &A : TheModule->aliases()) {
@@ -775,6 +810,11 @@ void SlotTracker::processModule() {
CreateModuleSlot(&A);
}
+ for (const GlobalIFunc &I : TheModule->ifuncs()) {
+ if (!I.hasName())
+ CreateModuleSlot(&I);
+ }
+
// Add metadata used by named metadata.
for (const NamedMDNode &NMD : TheModule->named_metadata()) {
for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
@@ -846,12 +886,15 @@ void SlotTracker::processFunction() {
ST_DEBUG("end processFunction!\n");
}
-void SlotTracker::processFunctionMetadata(const Function &F) {
+void SlotTracker::processGlobalObjectMetadata(const GlobalObject &GO) {
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- F.getAllMetadata(MDs);
+ GO.getAllMetadata(MDs);
for (auto &MD : MDs)
CreateMetadataSlot(MD.second);
+}
+void SlotTracker::processFunctionMetadata(const Function &F) {
+ processGlobalObjectMetadata(F);
for (auto &BB : F) {
for (auto &I : BB)
processInstructionMetadata(I);
@@ -938,10 +981,11 @@ void SlotTracker::CreateModuleSlot(const GlobalValue *V) {
ST_DEBUG(" Inserting value [" << V->getType() << "] = " << V << " slot=" <<
DestSlot << " [");
- // G = Global, F = Function, A = Alias, o = other
+ // G = Global, F = Function, A = Alias, I = IFunc, o = other
ST_DEBUG((isa<GlobalVariable>(V) ? 'G' :
(isa<Function>(V) ? 'F' :
- (isa<GlobalAlias>(V) ? 'A' : 'o'))) << "]\n");
+ (isa<GlobalAlias>(V) ? 'A' :
+ (isa<GlobalIFunc>(V) ? 'I' : 'o')))) << "]\n");
}
/// CreateSlot - Create a new slot for the specified value if it has no name.
@@ -1401,6 +1445,7 @@ struct MDFieldPrinter {
template <class IntTy, class Stringifier>
void printDwarfEnum(StringRef Name, IntTy Value, Stringifier toString,
bool ShouldSkipZero = true);
+ void printEmissionKind(StringRef Name, DICompileUnit::DebugEmissionKind EK);
};
} // end namespace
@@ -1481,6 +1526,12 @@ void MDFieldPrinter::printDIFlags(StringRef Name, unsigned Flags) {
Out << FlagsFS << Extra;
}
+void MDFieldPrinter::printEmissionKind(StringRef Name,
+ DICompileUnit::DebugEmissionKind EK) {
+ Out << FS << Name << ": " << DICompileUnit::EmissionKindString(EK);
+}
+
+
template <class IntTy, class Stringifier>
void MDFieldPrinter::printDwarfEnum(StringRef Name, IntTy Value,
Stringifier toString, bool ShouldSkipZero) {
@@ -1608,6 +1659,7 @@ static void writeDISubroutineType(raw_ostream &Out, const DISubroutineType *N,
Out << "!DISubroutineType(";
MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
Printer.printDIFlags("flags", N->getFlags());
+ Printer.printDwarfEnum("cc", N->getCC(), dwarf::ConventionString);
Printer.printMetadata("types", N->getRawTypeArray(),
/* ShouldSkipNull */ false);
Out << ")";
@@ -1638,11 +1690,9 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
Printer.printInt("runtimeVersion", N->getRuntimeVersion(),
/* ShouldSkipZero */ false);
Printer.printString("splitDebugFilename", N->getSplitDebugFilename());
- Printer.printInt("emissionKind", N->getEmissionKind(),
- /* ShouldSkipZero */ false);
+ Printer.printEmissionKind("emissionKind", N->getEmissionKind());
Printer.printMetadata("enums", N->getRawEnumTypes());
Printer.printMetadata("retainedTypes", N->getRawRetainedTypes());
- Printer.printMetadata("subprograms", N->getRawSubprograms());
Printer.printMetadata("globals", N->getRawGlobalVariables());
Printer.printMetadata("imports", N->getRawImportedEntities());
Printer.printMetadata("macros", N->getRawMacros());
@@ -1667,9 +1717,13 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
Printer.printMetadata("containingType", N->getRawContainingType());
Printer.printDwarfEnum("virtuality", N->getVirtuality(),
dwarf::VirtualityString);
- Printer.printInt("virtualIndex", N->getVirtualIndex());
+ if (N->getVirtuality() != dwarf::DW_VIRTUALITY_none ||
+ N->getVirtualIndex() != 0)
+ Printer.printInt("virtualIndex", N->getVirtualIndex(), false);
+ Printer.printInt("thisAdjustment", N->getThisAdjustment());
Printer.printDIFlags("flags", N->getFlags());
Printer.printBool("isOptimized", N->isOptimized());
+ Printer.printMetadata("unit", N->getRawUnit());
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printMetadata("declaration", N->getRawDeclaration());
Printer.printMetadata("variables", N->getRawVariables());
@@ -2037,7 +2091,7 @@ public:
void printTypeIdentities();
void printGlobal(const GlobalVariable *GV);
- void printAlias(const GlobalAlias *GV);
+ void printIndirectSymbol(const GlobalIndirectSymbol *GIS);
void printComdat(const Comdat *C);
void printFunction(const Function *F);
void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
@@ -2073,11 +2127,8 @@ AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
if (!TheModule)
return;
TypePrinter.incorporateTypes(*TheModule);
- for (const Function &F : *TheModule)
- if (const Comdat *C = F.getComdat())
- Comdats.insert(C);
- for (const GlobalVariable &GV : TheModule->globals())
- if (const Comdat *C = GV.getComdat())
+ for (const GlobalObject &GO : TheModule->global_objects())
+ if (const Comdat *C = GO.getComdat())
Comdats.insert(C);
}
@@ -2095,7 +2146,7 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
void AssemblyWriter::writeAtomic(AtomicOrdering Ordering,
SynchronizationScope SynchScope) {
- if (Ordering == NotAtomic)
+ if (Ordering == AtomicOrdering::NotAtomic)
return;
switch (SynchScope) {
@@ -2103,46 +2154,22 @@ void AssemblyWriter::writeAtomic(AtomicOrdering Ordering,
case CrossThread: break;
}
- switch (Ordering) {
- default: Out << " <bad ordering " << int(Ordering) << ">"; break;
- case Unordered: Out << " unordered"; break;
- case Monotonic: Out << " monotonic"; break;
- case Acquire: Out << " acquire"; break;
- case Release: Out << " release"; break;
- case AcquireRelease: Out << " acq_rel"; break;
- case SequentiallyConsistent: Out << " seq_cst"; break;
- }
+ Out << " " << toIRString(Ordering);
}
void AssemblyWriter::writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
SynchronizationScope SynchScope) {
- assert(SuccessOrdering != NotAtomic && FailureOrdering != NotAtomic);
+ assert(SuccessOrdering != AtomicOrdering::NotAtomic &&
+ FailureOrdering != AtomicOrdering::NotAtomic);
switch (SynchScope) {
case SingleThread: Out << " singlethread"; break;
case CrossThread: break;
}
- switch (SuccessOrdering) {
- default: Out << " <bad ordering " << int(SuccessOrdering) << ">"; break;
- case Unordered: Out << " unordered"; break;
- case Monotonic: Out << " monotonic"; break;
- case Acquire: Out << " acquire"; break;
- case Release: Out << " release"; break;
- case AcquireRelease: Out << " acq_rel"; break;
- case SequentiallyConsistent: Out << " seq_cst"; break;
- }
-
- switch (FailureOrdering) {
- default: Out << " <bad ordering " << int(FailureOrdering) << ">"; break;
- case Unordered: Out << " unordered"; break;
- case Monotonic: Out << " monotonic"; break;
- case Acquire: Out << " acquire"; break;
- case Release: Out << " release"; break;
- case AcquireRelease: Out << " acq_rel"; break;
- case SequentiallyConsistent: Out << " seq_cst"; break;
- }
+ Out << " " << toIRString(SuccessOrdering);
+ Out << " " << toIRString(FailureOrdering);
}
void AssemblyWriter::writeParamOperand(const Value *Operand,
@@ -2211,6 +2238,12 @@ void AssemblyWriter::printModule(const Module *M) {
M->getModuleIdentifier().find('\n') == std::string::npos)
Out << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
+ if (!M->getSourceFileName().empty()) {
+ Out << "source_filename = \"";
+ PrintEscapedString(M->getSourceFileName(), Out);
+ Out << "\"\n";
+ }
+
const std::string &DL = M->getDataLayoutStr();
if (!DL.empty())
Out << "target datalayout = \"" << DL << "\"\n";
@@ -2254,7 +2287,12 @@ void AssemblyWriter::printModule(const Module *M) {
// Output all aliases.
if (!M->alias_empty()) Out << "\n";
for (const GlobalAlias &GA : M->aliases())
- printAlias(&GA);
+ printIndirectSymbol(&GA);
+
+ // Output all ifuncs.
+ if (!M->ifunc_empty()) Out << "\n";
+ for (const GlobalIFunc &GI : M->ifuncs())
+ printIndirectSymbol(&GI);
// Output global use-lists.
printUseLists(nullptr);
@@ -2320,23 +2358,32 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
Out << "}\n";
}
-static void PrintLinkage(GlobalValue::LinkageTypes LT,
- formatted_raw_ostream &Out) {
+static const char *getLinkagePrintName(GlobalValue::LinkageTypes LT) {
switch (LT) {
- case GlobalValue::ExternalLinkage: break;
- case GlobalValue::PrivateLinkage: Out << "private "; break;
- case GlobalValue::InternalLinkage: Out << "internal "; break;
- case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce "; break;
- case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break;
- case GlobalValue::WeakAnyLinkage: Out << "weak "; break;
- case GlobalValue::WeakODRLinkage: Out << "weak_odr "; break;
- case GlobalValue::CommonLinkage: Out << "common "; break;
- case GlobalValue::AppendingLinkage: Out << "appending "; break;
- case GlobalValue::ExternalWeakLinkage: Out << "extern_weak "; break;
+ case GlobalValue::ExternalLinkage:
+ return "";
+ case GlobalValue::PrivateLinkage:
+ return "private ";
+ case GlobalValue::InternalLinkage:
+ return "internal ";
+ case GlobalValue::LinkOnceAnyLinkage:
+ return "linkonce ";
+ case GlobalValue::LinkOnceODRLinkage:
+ return "linkonce_odr ";
+ case GlobalValue::WeakAnyLinkage:
+ return "weak ";
+ case GlobalValue::WeakODRLinkage:
+ return "weak_odr ";
+ case GlobalValue::CommonLinkage:
+ return "common ";
+ case GlobalValue::AppendingLinkage:
+ return "appending ";
+ case GlobalValue::ExternalWeakLinkage:
+ return "extern_weak ";
case GlobalValue::AvailableExternallyLinkage:
- Out << "available_externally ";
- break;
+ return "available_externally ";
}
+ llvm_unreachable("invalid linkage");
}
static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
@@ -2377,6 +2424,18 @@ static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM,
}
}
+static StringRef getUnnamedAddrEncoding(GlobalVariable::UnnamedAddr UA) {
+ switch (UA) {
+ case GlobalVariable::UnnamedAddr::None:
+ return "";
+ case GlobalVariable::UnnamedAddr::Local:
+ return "local_unnamed_addr";
+ case GlobalVariable::UnnamedAddr::Global:
+ return "unnamed_addr";
+ }
+ llvm_unreachable("Unknown UnnamedAddr");
+}
+
static void maybePrintComdat(formatted_raw_ostream &Out,
const GlobalObject &GO) {
const Comdat *C = GO.getComdat();
@@ -2405,18 +2464,19 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (!GV->hasInitializer() && GV->hasExternalLinkage())
Out << "external ";
- PrintLinkage(GV->getLinkage(), Out);
+ Out << getLinkagePrintName(GV->getLinkage());
PrintVisibility(GV->getVisibility(), Out);
PrintDLLStorageClass(GV->getDLLStorageClass(), Out);
PrintThreadLocalModel(GV->getThreadLocalMode(), Out);
- if (GV->hasUnnamedAddr())
- Out << "unnamed_addr ";
+ StringRef UA = getUnnamedAddrEncoding(GV->getUnnamedAddr());
+ if (!UA.empty())
+ Out << UA << ' ';
if (unsigned AddressSpace = GV->getType()->getAddressSpace())
Out << "addrspace(" << AddressSpace << ") ";
if (GV->isExternallyInitialized()) Out << "externally_initialized ";
Out << (GV->isConstant() ? "constant " : "global ");
- TypePrinter.print(GV->getType()->getElementType(), Out);
+ TypePrinter.print(GV->getValueType(), Out);
if (GV->hasInitializer()) {
Out << ' ';
@@ -2432,39 +2492,49 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (GV->getAlignment())
Out << ", align " << GV->getAlignment();
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ printMetadataAttachments(MDs, ", ");
+
printInfoComment(*GV);
}
-void AssemblyWriter::printAlias(const GlobalAlias *GA) {
- if (GA->isMaterializable())
+void AssemblyWriter::printIndirectSymbol(const GlobalIndirectSymbol *GIS) {
+ if (GIS->isMaterializable())
Out << "; Materializable\n";
- WriteAsOperandInternal(Out, GA, &TypePrinter, &Machine, GA->getParent());
+ WriteAsOperandInternal(Out, GIS, &TypePrinter, &Machine, GIS->getParent());
Out << " = ";
- PrintLinkage(GA->getLinkage(), Out);
- PrintVisibility(GA->getVisibility(), Out);
- PrintDLLStorageClass(GA->getDLLStorageClass(), Out);
- PrintThreadLocalModel(GA->getThreadLocalMode(), Out);
- if (GA->hasUnnamedAddr())
- Out << "unnamed_addr ";
-
- Out << "alias ";
+ Out << getLinkagePrintName(GIS->getLinkage());
+ PrintVisibility(GIS->getVisibility(), Out);
+ PrintDLLStorageClass(GIS->getDLLStorageClass(), Out);
+ PrintThreadLocalModel(GIS->getThreadLocalMode(), Out);
+ StringRef UA = getUnnamedAddrEncoding(GIS->getUnnamedAddr());
+ if (!UA.empty())
+ Out << UA << ' ';
+
+ if (isa<GlobalAlias>(GIS))
+ Out << "alias ";
+ else if (isa<GlobalIFunc>(GIS))
+ Out << "ifunc ";
+ else
+ llvm_unreachable("Not an alias or ifunc!");
- TypePrinter.print(GA->getValueType(), Out);
+ TypePrinter.print(GIS->getValueType(), Out);
Out << ", ";
- const Constant *Aliasee = GA->getAliasee();
+ const Constant *IS = GIS->getIndirectSymbol();
- if (!Aliasee) {
- TypePrinter.print(GA->getType(), Out);
+ if (!IS) {
+ TypePrinter.print(GIS->getType(), Out);
Out << " <<NULL ALIASEE>>";
} else {
- writeOperand(Aliasee, !isa<ConstantExpr>(Aliasee));
+ writeOperand(IS, !isa<ConstantExpr>(IS));
}
- printInfoComment(*GA);
+ printInfoComment(*GIS);
Out << '\n';
}
@@ -2544,12 +2614,18 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << "; Function Attrs: " << AttrStr << '\n';
}
- if (F->isDeclaration())
- Out << "declare ";
- else
+ Machine.incorporateFunction(F);
+
+ if (F->isDeclaration()) {
+ Out << "declare";
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ F->getAllMetadata(MDs);
+ printMetadataAttachments(MDs, " ");
+ Out << ' ';
+ } else
Out << "define ";
- PrintLinkage(F->getLinkage(), Out);
+ Out << getLinkagePrintName(F->getLinkage());
PrintVisibility(F->getVisibility(), Out);
PrintDLLStorageClass(F->getDLLStorageClass(), Out);
@@ -2566,7 +2642,6 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << ' ';
WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
Out << '(';
- Machine.incorporateFunction(F);
// Loop over the arguments, printing them...
if (F->isDeclaration() && !IsForDebug) {
@@ -2598,8 +2673,9 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << "..."; // Output varargs portion of signature!
}
Out << ')';
- if (F->hasUnnamedAddr())
- Out << " unnamed_addr";
+ StringRef UA = getUnnamedAddrEncoding(F->getUnnamedAddr());
+ if (!UA.empty())
+ Out << ' ' << UA;
if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
if (F->hasSection()) {
@@ -2625,17 +2701,17 @@ void AssemblyWriter::printFunction(const Function *F) {
writeOperand(F->getPersonalityFn(), /*PrintType=*/true);
}
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- F->getAllMetadata(MDs);
- printMetadataAttachments(MDs, " ");
-
if (F->isDeclaration()) {
Out << '\n';
} else {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ F->getAllMetadata(MDs);
+ printMetadataAttachments(MDs, " ");
+
Out << " {";
// Output all of the function's basic blocks.
- for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
- printBasicBlock(&*I);
+ for (const BasicBlock &BB : *F)
+ printBasicBlock(&BB);
// Output the function's use-lists.
printUseLists(F);
@@ -2676,7 +2752,7 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
Out << "\n; <label>:";
int Slot = Machine.getLocalSlot(BB);
if (Slot != -1)
- Out << Slot;
+ Out << Slot << ":";
else
Out << "<badref>";
}
@@ -2707,8 +2783,8 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
if (AnnotationWriter) AnnotationWriter->emitBasicBlockStartAnnot(BB, Out);
// Output all of the instructions in the basic block...
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- printInstructionLine(*I);
+ for (const Instruction &I : *BB) {
+ printInstructionLine(I);
}
if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out);
@@ -3012,6 +3088,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ';
if (AI->isUsedWithInAlloca())
Out << "inalloca ";
+ if (AI->isSwiftError())
+ Out << "swifterror ";
TypePrinter.print(AI->getAllocatedType(), Out);
// Explicitly write the array size if the code is broken, if it's an array
@@ -3166,10 +3244,9 @@ void AssemblyWriter::writeAllAttributeGroups() {
I != E; ++I)
asVec[I->second] = *I;
- for (std::vector<std::pair<AttributeSet, unsigned> >::iterator
- I = asVec.begin(), E = asVec.end(); I != E; ++I)
- Out << "attributes #" << I->second << " = { "
- << I->first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+ for (const auto &I : asVec)
+ Out << "attributes #" << I.second << " = { "
+ << I.first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
}
void AssemblyWriter::printUseListOrder(const UseListOrder &Order) {
@@ -3215,6 +3292,17 @@ void AssemblyWriter::printUseLists(const Function *F) {
// External Interface declarations
//===----------------------------------------------------------------------===//
+void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
+ bool ShouldPreserveUseListOrder,
+ bool IsForDebug) const {
+ SlotTracker SlotTable(this->getParent());
+ formatted_raw_ostream OS(ROS);
+ AssemblyWriter W(OS, SlotTable, this->getParent(), AAW,
+ IsForDebug,
+ ShouldPreserveUseListOrder);
+ W.printFunction(this);
+}
+
void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
bool ShouldPreserveUseListOrder, bool IsForDebug) const {
SlotTracker SlotTable(this);
@@ -3231,6 +3319,22 @@ void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const {
W.printNamedMDNode(this);
}
+void NamedMDNode::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+ bool IsForDebug) const {
+ Optional<SlotTracker> LocalST;
+ SlotTracker *SlotTable;
+ if (auto *ST = MST.getMachine())
+ SlotTable = ST;
+ else {
+ LocalST.emplace(getParent());
+ SlotTable = &*LocalST;
+ }
+
+ formatted_raw_ostream OS(ROS);
+ AssemblyWriter W(OS, *SlotTable, getParent(), nullptr, IsForDebug);
+ W.printNamedMDNode(this);
+}
+
void Comdat::print(raw_ostream &ROS, bool /*IsForDebug*/) const {
PrintLLVMName(ROS, getName(), ComdatPrefix);
ROS << " = comdat ";
@@ -3256,10 +3360,13 @@ void Comdat::print(raw_ostream &ROS, bool /*IsForDebug*/) const {
ROS << '\n';
}
-void Type::print(raw_ostream &OS, bool /*IsForDebug*/) const {
+void Type::print(raw_ostream &OS, bool /*IsForDebug*/, bool NoDetails) const {
TypePrinting TP;
TP.print(const_cast<Type*>(this), OS);
+ if (NoDetails)
+ return;
+
// If the type is a named struct type, print the body as well.
if (StructType *STy = dyn_cast<StructType>(const_cast<Type*>(this)))
if (!STy->isLiteral()) {
@@ -3316,7 +3423,7 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST,
else if (const Function *F = dyn_cast<Function>(GV))
W.printFunction(F);
else
- W.printAlias(cast<GlobalAlias>(GV));
+ W.printIndirectSymbol(cast<GlobalIndirectSymbol>(GV));
} else if (const MetadataAsValue *V = dyn_cast<MetadataAsValue>(this)) {
V->getMetadata()->print(ROS, MST, getModuleFromVal(V));
} else if (const Constant *C = dyn_cast<Constant>(this)) {
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 659f9568b7c6..267a0dab2f25 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -17,8 +17,11 @@
#define LLVM_LIB_IR_ATTRIBUTEIMPL_H
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/Support/DataTypes.h"
#include "llvm/Support/TrailingObjects.h"
+#include <climits>
#include <string>
namespace llvm {
@@ -118,7 +121,8 @@ public:
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
assert((Kind == Attribute::Alignment || Kind == Attribute::StackAlignment ||
Kind == Attribute::Dereferenceable ||
- Kind == Attribute::DereferenceableOrNull) &&
+ Kind == Attribute::DereferenceableOrNull ||
+ Kind == Attribute::AllocSize) &&
"Wrong kind for int attribute!");
}
@@ -148,19 +152,37 @@ class AttributeSetNode final
friend TrailingObjects;
unsigned NumAttrs; ///< Number of attributes in this node.
+ /// Bitset with a bit for each available attribute Attribute::AttrKind.
+ uint64_t AvailableAttrs;
- AttributeSetNode(ArrayRef<Attribute> Attrs) : NumAttrs(Attrs.size()) {
+ AttributeSetNode(ArrayRef<Attribute> Attrs)
+ : NumAttrs(Attrs.size()), AvailableAttrs(0) {
+ static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
+ "Too many attributes for AvailableAttrs");
// There's memory after the node where we can store the entries in.
std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
+
+ for (Attribute I : *this) {
+ if (!I.isStringAttribute()) {
+ AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+ }
+ }
}
// AttributesSetNode is uniqued, these should not be publicly available.
void operator=(const AttributeSetNode &) = delete;
AttributeSetNode(const AttributeSetNode &) = delete;
public:
+ void operator delete(void *p) { ::operator delete(p); }
+
static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
- bool hasAttribute(Attribute::AttrKind Kind) const;
+ /// \brief Return the number of attributes this AttributeSet contains.
+ unsigned getNumAttributes() const { return NumAttrs; }
+
+ bool hasAttribute(Attribute::AttrKind Kind) const {
+ return AvailableAttrs & ((uint64_t)1) << Kind;
+ }
bool hasAttribute(StringRef Kind) const;
bool hasAttributes() const { return NumAttrs != 0; }
@@ -171,6 +193,7 @@ public:
unsigned getStackAlignment() const;
uint64_t getDereferenceableBytes() const;
uint64_t getDereferenceableOrNullBytes() const;
+ std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
std::string getAsString(bool InAttrGrp) const;
typedef const Attribute *iterator;
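The AvailableAttrs field added above turns the enum-kind hasAttribute query into a single bit test instead of a scan over the trailing Attribute array. A minimal standalone sketch of the same summary-bitset idea, with an illustrative enum rather than the real Attribute::AttrKind list:

#include <cassert>
#include <climits>
#include <cstdint>
#include <initializer_list>

enum AttrKind : unsigned { Alignment, ReadOnly, NoUnwind, EndKinds };

class AttrSummary {
  uint64_t Available = 0;

public:
  AttrSummary(std::initializer_list<AttrKind> Kinds) {
    static_assert(EndKinds <= sizeof(uint64_t) * CHAR_BIT, "too many kinds");
    for (AttrKind K : Kinds)
      Available |= uint64_t(1) << K;   // one summary bit per kind
  }
  bool has(AttrKind K) const { return Available & (uint64_t(1) << K); }  // O(1) test
};

int main() {
  AttrSummary S{ReadOnly, NoUnwind};
  assert(S.has(ReadOnly) && !S.has(Alignment));
}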
@@ -200,10 +223,12 @@ class AttributeSetImpl final
private:
LLVMContext &Context;
- unsigned NumAttrs; ///< Number of entries in this set.
+ unsigned NumSlots; ///< Number of entries in this set.
+ /// Bitset with a bit for each available attribute Attribute::AttrKind.
+ uint64_t AvailableFunctionAttrs;
// Helper fn for TrailingObjects class.
- size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumAttrs; }
+ size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumSlots; }
/// \brief Return a pointer to the IndexAttrPair for the specified slot.
const IndexAttrPair *getNode(unsigned Slot) const {
@@ -215,27 +240,48 @@ private:
AttributeSetImpl(const AttributeSetImpl &) = delete;
public:
AttributeSetImpl(LLVMContext &C,
- ArrayRef<std::pair<unsigned, AttributeSetNode *> > Attrs)
- : Context(C), NumAttrs(Attrs.size()) {
+ ArrayRef<std::pair<unsigned, AttributeSetNode *> > Slots)
+ : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
+ static_assert(Attribute::EndAttrKinds <=
+ sizeof(AvailableFunctionAttrs) * CHAR_BIT,
+ "Too many attributes");
#ifndef NDEBUG
- if (Attrs.size() >= 2) {
- for (const std::pair<unsigned, AttributeSetNode *> *i = Attrs.begin() + 1,
- *e = Attrs.end();
+ if (Slots.size() >= 2) {
+ for (const std::pair<unsigned, AttributeSetNode *> *i = Slots.begin() + 1,
+ *e = Slots.end();
i != e; ++i) {
assert((i-1)->first <= i->first && "Attribute set not ordered!");
}
}
#endif
// There's memory after the node where we can store the entries in.
- std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<IndexAttrPair>());
+ std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
+
+ // Initialize AvailableFunctionAttrs summary bitset.
+ if (NumSlots > 0) {
+ static_assert(AttributeSet::FunctionIndex == ~0u,
+ "FunctionIndex should be biggest possible index");
+ const std::pair<unsigned, AttributeSetNode *> &Last = Slots.back();
+ if (Last.first == AttributeSet::FunctionIndex) {
+ const AttributeSetNode *Node = Last.second;
+ for (Attribute I : *Node) {
+ if (!I.isStringAttribute())
+ AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+ }
+ }
+ }
}
+ void operator delete(void *p) { ::operator delete(p); }
+
/// \brief Get the context that created this AttributeSetImpl.
LLVMContext &getContext() { return Context; }
- /// \brief Return the number of attributes this AttributeSet contains.
- unsigned getNumAttributes() const { return NumAttrs; }
+ /// \brief Return the number of slots used in this attribute list. This is
+ /// the number of arguments that have an attribute set on them (including the
+ /// function itself).
+ unsigned getNumSlots() const { return NumSlots; }
/// \brief Get the index of the given "slot" in the AttrNodes list. This index
/// is the index of the return, parameter, or function object that the
@@ -258,12 +304,18 @@ public:
return getNode(Slot)->second;
}
+ /// \brief Return true if the AttributeSetNode for the FunctionIndex has an
+ /// enum attribute of the given kind.
+ bool hasFnAttribute(Attribute::AttrKind Kind) const {
+ return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
+ }
+
typedef AttributeSetNode::iterator iterator;
iterator begin(unsigned Slot) const { return getSlotNode(Slot)->begin(); }
iterator end(unsigned Slot) const { return getSlotNode(Slot)->end(); }
void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, makeArrayRef(getNode(0), getNumAttributes()));
+ Profile(ID, makeArrayRef(getNode(0), getNumSlots()));
}
static void Profile(FoldingSetNodeID &ID,
ArrayRef<std::pair<unsigned, AttributeSetNode*> > Nodes) {
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 6c01bb645629..d774c1ae9dfb 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -32,6 +32,35 @@ using namespace llvm;
// Attribute Construction Methods
//===----------------------------------------------------------------------===//
+// allocsize has two integer arguments, but because they're both 32 bits, we can
+// pack them into one 64-bit value, at the cost of making said value
+// nonsensical.
+//
+// In order to do this, we need to reserve one value of the second (optional)
+// allocsize argument to signify "not present."
+LLVM_CONSTEXPR static unsigned AllocSizeNumElemsNotPresent = -1;
+
+static uint64_t packAllocSizeArgs(unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
+ assert((!NumElemsArg.hasValue() ||
+ *NumElemsArg != AllocSizeNumElemsNotPresent) &&
+ "Attempting to pack a reserved value");
+
+ return uint64_t(ElemSizeArg) << 32 |
+ NumElemsArg.getValueOr(AllocSizeNumElemsNotPresent);
+}
+
+static std::pair<unsigned, Optional<unsigned>>
+unpackAllocSizeArgs(uint64_t Num) {
+ unsigned NumElems = Num & std::numeric_limits<unsigned>::max();
+ unsigned ElemSizeArg = Num >> 32;
+
+ Optional<unsigned> NumElemsArg;
+ if (NumElems != AllocSizeNumElemsNotPresent)
+ NumElemsArg = NumElems;
+ return std::make_pair(ElemSizeArg, NumElemsArg);
+}
+
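For reference, a standalone sketch of the packing scheme implemented above, using std::optional in place of llvm::Optional and hypothetical helper names: the element-size argument lives in the high 32 bits, the optional element-count argument in the low 32 bits, and ~0u is the reserved "not present" value.

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

static const unsigned NotPresent = ~0U;   // reserved: "no NumElems argument"

static uint64_t packAllocSize(unsigned ElemSize, std::optional<unsigned> NumElems) {
  assert((!NumElems || *NumElems != NotPresent) && "reserved value");
  return (uint64_t(ElemSize) << 32) | NumElems.value_or(NotPresent);
}

static std::pair<unsigned, std::optional<unsigned>> unpackAllocSize(uint64_t Raw) {
  unsigned NumElems = unsigned(Raw & 0xFFFFFFFFu);  // low 32 bits
  unsigned ElemSize = unsigned(Raw >> 32);          // high 32 bits
  std::optional<unsigned> Opt;
  if (NumElems != NotPresent)
    Opt = NumElems;
  return {ElemSize, Opt};
}

int main() {
  auto P = unpackAllocSize(packAllocSize(8, 2));             // allocsize(8, 2)
  assert(P.first == 8 && P.second && *P.second == 2);
  auto Q = unpackAllocSize(packAllocSize(16, std::nullopt)); // allocsize(16)
  assert(Q.first == 16 && !Q.second);
}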
Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
uint64_t Val) {
LLVMContextImpl *pImpl = Context.pImpl;
@@ -101,6 +130,14 @@ Attribute Attribute::getWithDereferenceableOrNullBytes(LLVMContext &Context,
return get(Context, DereferenceableOrNull, Bytes);
}
+Attribute
+Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
+ assert(!(ElemSizeArg == 0 && NumElemsArg && *NumElemsArg == 0) &&
+ "Invalid allocsize arguments -- given allocsize(0, 0)");
+ return get(Context, AllocSize, packAllocSizeArgs(ElemSizeArg, NumElemsArg));
+}
+
//===----------------------------------------------------------------------===//
// Attribute Accessor Methods
//===----------------------------------------------------------------------===//
@@ -154,22 +191,18 @@ bool Attribute::hasAttribute(StringRef Kind) const {
return pImpl && pImpl->hasAttribute(Kind);
}
-/// This returns the alignment field of an attribute as a byte alignment value.
unsigned Attribute::getAlignment() const {
assert(hasAttribute(Attribute::Alignment) &&
"Trying to get alignment from non-alignment attribute!");
return pImpl->getValueAsInt();
}
-/// This returns the stack alignment field of an attribute as a byte alignment
-/// value.
unsigned Attribute::getStackAlignment() const {
assert(hasAttribute(Attribute::StackAlignment) &&
"Trying to get alignment from non-alignment attribute!");
return pImpl->getValueAsInt();
}
-/// This returns the number of dereferenceable bytes.
uint64_t Attribute::getDereferenceableBytes() const {
assert(hasAttribute(Attribute::Dereferenceable) &&
"Trying to get dereferenceable bytes from "
@@ -184,6 +217,12 @@ uint64_t Attribute::getDereferenceableOrNullBytes() const {
return pImpl->getValueAsInt();
}
+std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const {
+ assert(hasAttribute(Attribute::AllocSize) &&
+ "Trying to get allocsize args from non-allocsize attribute");
+ return unpackAllocSizeArgs(pImpl->getValueAsInt());
+}
+
std::string Attribute::getAsString(bool InAttrGrp) const {
if (!pImpl) return "";
@@ -199,6 +238,10 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "byval";
if (hasAttribute(Attribute::Convergent))
return "convergent";
+ if (hasAttribute(Attribute::SwiftError))
+ return "swifterror";
+ if (hasAttribute(Attribute::SwiftSelf))
+ return "swiftself";
if (hasAttribute(Attribute::InaccessibleMemOnly))
return "inaccessiblememonly";
if (hasAttribute(Attribute::InaccessibleMemOrArgMemOnly))
@@ -249,6 +292,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "readnone";
if (hasAttribute(Attribute::ReadOnly))
return "readonly";
+ if (hasAttribute(Attribute::WriteOnly))
+ return "writeonly";
if (hasAttribute(Attribute::Returned))
return "returned";
if (hasAttribute(Attribute::ReturnsTwice))
@@ -312,6 +357,21 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
if (hasAttribute(Attribute::DereferenceableOrNull))
return AttrWithBytesToString("dereferenceable_or_null");
+ if (hasAttribute(Attribute::AllocSize)) {
+ unsigned ElemSize;
+ Optional<unsigned> NumElems;
+ std::tie(ElemSize, NumElems) = getAllocSizeArgs();
+
+ std::string Result = "allocsize(";
+ Result += utostr(ElemSize);
+ if (NumElems.hasValue()) {
+ Result += ',';
+ Result += utostr(*NumElems);
+ }
+ Result += ')';
+ return Result;
+ }
+
// Convert target-dependent attributes to strings of the form:
//
// "kind"
@@ -389,7 +449,11 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const {
if (isIntAttribute()) {
if (AI.isEnumAttribute()) return false;
- if (AI.isIntAttribute()) return getValueAsInt() < AI.getValueAsInt();
+ if (AI.isIntAttribute()) {
+ if (getKindAsEnum() == AI.getKindAsEnum())
+ return getValueAsInt() < AI.getValueAsInt();
+ return getKindAsEnum() < AI.getKindAsEnum();
+ }
if (AI.isStringAttribute()) return true;
}
@@ -452,6 +516,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
case Attribute::NoRecurse: return 1ULL << 48;
case Attribute::InaccessibleMemOnly: return 1ULL << 49;
case Attribute::InaccessibleMemOrArgMemOnly: return 1ULL << 50;
+ case Attribute::SwiftSelf: return 1ULL << 51;
+ case Attribute::SwiftError: return 1ULL << 52;
+ case Attribute::WriteOnly: return 1ULL << 53;
case Attribute::Dereferenceable:
llvm_unreachable("dereferenceable attribute not supported in raw format");
break;
@@ -462,6 +529,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
case Attribute::ArgMemOnly:
llvm_unreachable("argmemonly attribute not supported in raw format");
break;
+ case Attribute::AllocSize:
+ llvm_unreachable("allocsize not supported in raw format");
+ break;
}
llvm_unreachable("Unsupported attribute type");
}
@@ -480,7 +550,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
FoldingSetNodeID ID;
SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
- array_pod_sort(SortedAttrs.begin(), SortedAttrs.end());
+ std::sort(SortedAttrs.begin(), SortedAttrs.end());
for (Attribute Attr : SortedAttrs)
Attr.Profile(ID);
@@ -502,62 +572,65 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
return PA;
}
-bool AttributeSetNode::hasAttribute(Attribute::AttrKind Kind) const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Kind))
- return true;
- return false;
-}
-
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Kind))
+ for (Attribute I : *this)
+ if (I.hasAttribute(Kind))
return true;
return false;
}
Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Kind))
- return *I;
+ if (hasAttribute(Kind)) {
+ for (Attribute I : *this)
+ if (I.hasAttribute(Kind))
+ return I;
+ }
return Attribute();
}
Attribute AttributeSetNode::getAttribute(StringRef Kind) const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Kind))
- return *I;
+ for (Attribute I : *this)
+ if (I.hasAttribute(Kind))
+ return I;
return Attribute();
}
unsigned AttributeSetNode::getAlignment() const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Attribute::Alignment))
- return I->getAlignment();
+ for (Attribute I : *this)
+ if (I.hasAttribute(Attribute::Alignment))
+ return I.getAlignment();
return 0;
}
unsigned AttributeSetNode::getStackAlignment() const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Attribute::StackAlignment))
- return I->getStackAlignment();
+ for (Attribute I : *this)
+ if (I.hasAttribute(Attribute::StackAlignment))
+ return I.getStackAlignment();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableBytes() const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Attribute::Dereferenceable))
- return I->getDereferenceableBytes();
+ for (Attribute I : *this)
+ if (I.hasAttribute(Attribute::Dereferenceable))
+ return I.getDereferenceableBytes();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableOrNullBytes() const {
- for (iterator I = begin(), E = end(); I != E; ++I)
- if (I->hasAttribute(Attribute::DereferenceableOrNull))
- return I->getDereferenceableOrNullBytes();
+ for (Attribute I : *this)
+ if (I.hasAttribute(Attribute::DereferenceableOrNull))
+ return I.getDereferenceableOrNullBytes();
return 0;
}
+std::pair<unsigned, Optional<unsigned>>
+AttributeSetNode::getAllocSizeArgs() const {
+ for (Attribute I : *this)
+ if (I.hasAttribute(Attribute::AllocSize))
+ return I.getAllocSizeArgs();
+ return std::make_pair(0, 0);
+}
+
std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
std::string Str;
for (iterator I = begin(), E = end(); I != E; ++I) {
@@ -573,7 +646,7 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
//===----------------------------------------------------------------------===//
uint64_t AttributeSetImpl::Raw(unsigned Index) const {
- for (unsigned I = 0, E = getNumAttributes(); I != E; ++I) {
+ for (unsigned I = 0, E = getNumSlots(); I != E; ++I) {
if (getSlotIndex(I) != Index) continue;
const AttributeSetNode *ASN = getSlotNode(I);
uint64_t Mask = 0;
@@ -593,6 +666,8 @@ uint64_t AttributeSetImpl::Raw(unsigned Index) const {
Mask |= (Log2_32(ASN->getStackAlignment()) + 1) << 26;
else if (Kind == Attribute::Dereferenceable)
llvm_unreachable("dereferenceable not supported in bit mask");
+ else if (Kind == Attribute::AllocSize)
+ llvm_unreachable("allocsize not supported in bit mask");
else
Mask |= AttributeImpl::getAttrMask(Kind);
}
@@ -603,7 +678,7 @@ uint64_t AttributeSetImpl::Raw(unsigned Index) const {
return 0;
}
-void AttributeSetImpl::dump() const {
+LLVM_DUMP_METHOD void AttributeSetImpl::dump() const {
AttributeSet(const_cast<AttributeSetImpl *>(this)).dump();
}
@@ -708,6 +783,11 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
Attr = Attribute::getWithDereferenceableOrNullBytes(
C, B.getDereferenceableOrNullBytes());
break;
+ case Attribute::AllocSize: {
+ auto A = B.getAllocSizeArgs();
+ Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
+ break;
+ }
default:
Attr = Attribute::get(C, Kind);
}
@@ -715,7 +795,7 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
}
// Add target-dependent (string) attributes.
- for (const AttrBuilder::td_type &TDA : B.td_attrs())
+ for (const auto &TDA : B.td_attrs())
Attrs.push_back(
std::make_pair(Index, Attribute::get(C, TDA.first, TDA.second)));
@@ -723,9 +803,17 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
}
AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- ArrayRef<Attribute::AttrKind> Kind) {
+ ArrayRef<Attribute::AttrKind> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
- for (Attribute::AttrKind K : Kind)
+ for (Attribute::AttrKind K : Kinds)
+ Attrs.push_back(std::make_pair(Index, Attribute::get(C, K)));
+ return get(C, Attrs);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
+ ArrayRef<StringRef> Kinds) {
+ SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
+ for (StringRef K : Kinds)
Attrs.push_back(std::make_pair(Index, Attribute::get(C, K)));
return get(C, Attrs);
}
@@ -737,7 +825,7 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
AttributeSetImpl *A0 = Attrs[0].pImpl;
if (A0)
- AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumAttributes()));
+ AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumSlots()));
// Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec
// ordered by index. Because we know that each list in Attrs is ordered by
// index we only need to merge each successive list in rather than doing a
@@ -748,7 +836,7 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
ANVI = AttrNodeVec.begin(), ANVE;
for (const IndexAttrPair *AI = AS->getNode(0),
- *AE = AS->getNode(AS->getNumAttributes());
+ *AE = AS->getNode(AS->getNumSlots());
AI != AE; ++AI) {
ANVE = AttrNodeVec.end();
while (ANVI != ANVE && ANVI->first <= AI->first)
@@ -761,16 +849,9 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
}
AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Attr) const {
- if (hasAttribute(Index, Attr)) return *this;
- return addAttributes(C, Index, AttributeSet::get(C, Index, Attr));
-}
-
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind) const {
- llvm::AttrBuilder B;
- B.addAttribute(Kind);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ Attribute::AttrKind Kind) const {
+ if (hasAttribute(Index, Kind)) return *this;
+ return addAttributes(C, Index, AttributeSet::get(C, Index, Kind));
}
AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
@@ -783,7 +864,7 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
AttributeSet AttributeSet::addAttribute(LLVMContext &C,
ArrayRef<unsigned> Indices,
Attribute A) const {
- unsigned I = 0, E = pImpl ? pImpl->getNumAttributes() : 0;
+ unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0;
auto IdxI = Indices.begin(), IdxE = Indices.end();
SmallVector<AttributeSet, 4> AttrSet;
@@ -826,7 +907,7 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
// Add the attribute slots before the one we're trying to add.
SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumAttributes();
+ uint64_t NumAttrs = pImpl->getNumSlots();
AttributeSet AS;
uint64_t LastIndex = 0;
for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
@@ -842,7 +923,7 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
// AttributeSet there.
AttrBuilder B(AS, Index);
- for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I)
+ for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
if (Attrs.getSlotIndex(I) == Index) {
for (AttributeSetImpl::iterator II = Attrs.pImpl->begin(I),
IE = Attrs.pImpl->end(I); II != IE; ++II)
@@ -860,9 +941,15 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
}
AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Attr) const {
- if (!hasAttribute(Index, Attr)) return *this;
- return removeAttributes(C, Index, AttributeSet::get(C, Index, Attr));
+ Attribute::AttrKind Kind) const {
+ if (!hasAttribute(Index, Kind)) return *this;
+ return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
+ StringRef Kind) const {
+ if (!hasAttribute(Index, Kind)) return *this;
+ return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
}
AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
@@ -877,7 +964,7 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
// Add the attribute slots before the one we're trying to add.
SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumAttributes();
+ uint64_t NumAttrs = pImpl->getNumSlots();
AttributeSet AS;
uint64_t LastIndex = 0;
for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
@@ -893,7 +980,7 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
// AttributeSet there.
AttrBuilder B(AS, Index);
- for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I)
+ for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
if (Attrs.getSlotIndex(I) == Index) {
B.removeAttributes(Attrs.pImpl->getSlotAttributes(I), Index);
break;
@@ -918,7 +1005,7 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
// Add the attribute slots before the one we're trying to add.
SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumAttributes();
+ uint64_t NumAttrs = pImpl->getNumSlots();
AttributeSet AS;
uint64_t LastIndex = 0;
for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
@@ -959,6 +1046,15 @@ AttributeSet AttributeSet::addDereferenceableOrNullAttr(LLVMContext &C,
return addAttributes(C, Index, AttributeSet::get(C, Index, B));
}
+AttributeSet
+AttributeSet::addAllocSizeAttr(LLVMContext &C, unsigned Index,
+ unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
+ llvm::AttrBuilder B;
+ B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
+ return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+}
+
//===----------------------------------------------------------------------===//
// AttributeSet Accessor Methods
//===----------------------------------------------------------------------===//
@@ -1008,16 +1104,21 @@ bool AttributeSet::hasAttributes(unsigned Index) const {
return ASN && ASN->hasAttributes();
}
-/// \brief Return true if the specified attribute is set for at least one
-/// parameter or for the return value.
-bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const {
+bool AttributeSet::hasFnAttribute(Attribute::AttrKind Kind) const {
+ return pImpl && pImpl->hasFnAttribute(Kind);
+}
+
+bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
+ unsigned *Index) const {
if (!pImpl) return false;
- for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I)
+ for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
for (AttributeSetImpl::iterator II = pImpl->begin(I),
IE = pImpl->end(I); II != IE; ++II)
- if (II->hasAttribute(Attr))
+ if (II->hasAttribute(Attr)) {
+ if (Index) *Index = pImpl->getSlotIndex(I);
return true;
+ }
return false;
}
@@ -1054,18 +1155,22 @@ uint64_t AttributeSet::getDereferenceableOrNullBytes(unsigned Index) const {
return ASN ? ASN->getDereferenceableOrNullBytes() : 0;
}
-std::string AttributeSet::getAsString(unsigned Index,
- bool InAttrGrp) const {
+std::pair<unsigned, Optional<unsigned>>
+AttributeSet::getAllocSizeArgs(unsigned Index) const {
+ AttributeSetNode *ASN = getAttributes(Index);
+ return ASN ? ASN->getAllocSizeArgs() : std::make_pair(0, 0);
+}
+
+std::string AttributeSet::getAsString(unsigned Index, bool InAttrGrp) const {
AttributeSetNode *ASN = getAttributes(Index);
return ASN ? ASN->getAsString(InAttrGrp) : std::string("");
}
-/// \brief The attributes for the specified index are returned.
AttributeSetNode *AttributeSet::getAttributes(unsigned Index) const {
if (!pImpl) return nullptr;
// Loop through to find the attribute node we want.
- for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I)
+ for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
if (pImpl->getSlotIndex(I) == Index)
return pImpl->getSlotNode(I);
@@ -1088,21 +1193,18 @@ AttributeSet::iterator AttributeSet::end(unsigned Slot) const {
// AttributeSet Introspection Methods
//===----------------------------------------------------------------------===//
-/// \brief Return the number of slots used in this attribute list. This is the
-/// number of arguments that have an attribute set on them (including the
-/// function itself).
unsigned AttributeSet::getNumSlots() const {
- return pImpl ? pImpl->getNumAttributes() : 0;
+ return pImpl ? pImpl->getNumSlots() : 0;
}
unsigned AttributeSet::getSlotIndex(unsigned Slot) const {
- assert(pImpl && Slot < pImpl->getNumAttributes() &&
+ assert(pImpl && Slot < pImpl->getNumSlots() &&
"Slot # out of range!");
return pImpl->getSlotIndex(Slot);
}
AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const {
- assert(pImpl && Slot < pImpl->getNumAttributes() &&
+ assert(pImpl && Slot < pImpl->getNumSlots() &&
"Slot # out of range!");
return pImpl->getSlotAttributes(Slot);
}
@@ -1112,7 +1214,7 @@ uint64_t AttributeSet::Raw(unsigned Index) const {
return pImpl ? pImpl->Raw(Index) : 0;
}
-void AttributeSet::dump() const {
+LLVM_DUMP_METHOD void AttributeSet::dump() const {
dbgs() << "PAL[\n";
for (unsigned i = 0, e = getNumSlots(); i < e; ++i) {
@@ -1134,11 +1236,11 @@ void AttributeSet::dump() const {
AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
: Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
- DerefOrNullBytes(0) {
+ DerefOrNullBytes(0), AllocSizeArgs(0) {
AttributeSetImpl *pImpl = AS.pImpl;
if (!pImpl) return;
- for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) {
+ for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
if (pImpl->getSlotIndex(I) != Index) continue;
for (AttributeSetImpl::iterator II = pImpl->begin(I),
@@ -1153,12 +1255,13 @@ void AttrBuilder::clear() {
Attrs.reset();
TargetDepAttrs.clear();
Alignment = StackAlignment = DerefBytes = DerefOrNullBytes = 0;
+ AllocSizeArgs = 0;
}
AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) {
assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment &&
- Val != Attribute::Dereferenceable &&
+ Val != Attribute::Dereferenceable && Val != Attribute::AllocSize &&
"Adding integer attribute without adding a value!");
Attrs[Val] = true;
return *this;
@@ -1181,6 +1284,8 @@ AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
DerefBytes = Attr.getDereferenceableBytes();
else if (Kind == Attribute::DereferenceableOrNull)
DerefOrNullBytes = Attr.getDereferenceableOrNullBytes();
+ else if (Kind == Attribute::AllocSize)
+ AllocSizeArgs = Attr.getValueAsInt();
return *this;
}
@@ -1201,6 +1306,8 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
DerefBytes = 0;
else if (Val == Attribute::DereferenceableOrNull)
DerefOrNullBytes = 0;
+ else if (Val == Attribute::AllocSize)
+ AllocSizeArgs = 0;
return *this;
}
@@ -1235,6 +1342,10 @@ AttrBuilder &AttrBuilder::removeAttribute(StringRef A) {
return *this;
}
+std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const {
+ return unpackAllocSizeArgs(AllocSizeArgs);
+}
+
AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) {
if (Align == 0) return *this;
@@ -1275,6 +1386,22 @@ AttrBuilder &AttrBuilder::addDereferenceableOrNullAttr(uint64_t Bytes) {
return *this;
}
+AttrBuilder &AttrBuilder::addAllocSizeAttr(unsigned ElemSize,
+ const Optional<unsigned> &NumElems) {
+ return addAllocSizeAttrFromRawRepr(packAllocSizeArgs(ElemSize, NumElems));
+}
+
+AttrBuilder &AttrBuilder::addAllocSizeAttrFromRawRepr(uint64_t RawArgs) {
+ // (0, 0) is our "not present" value, so we need to check for it here.
+ assert(RawArgs && "Invalid allocsize arguments -- given allocsize(0, 0)");
+
+ Attrs[Attribute::AllocSize] = true;
+ // Reuse existing machinery to store this as a single 64-bit integer so we can
+ // save a few bytes over using a pair<unsigned, Optional<unsigned>>.
+ AllocSizeArgs = RawArgs;
+ return *this;
+}
+
AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
// FIXME: What if both have alignments, but they don't match?!
if (!Alignment)
@@ -1289,6 +1416,9 @@ AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
if (!DerefOrNullBytes)
DerefOrNullBytes = B.DerefOrNullBytes;
+ if (!AllocSizeArgs)
+ AllocSizeArgs = B.AllocSizeArgs;
+
Attrs |= B.Attrs;
for (auto I : B.td_attrs())
@@ -1311,6 +1441,9 @@ AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) {
if (B.DerefOrNullBytes)
DerefOrNullBytes = 0;
+ if (B.AllocSizeArgs)
+ AllocSizeArgs = 0;
+
Attrs &= ~B.Attrs;
for (auto I : B.td_attrs())
@@ -1389,7 +1522,8 @@ AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
I = Attribute::AttrKind(I + 1)) {
if (I == Attribute::Dereferenceable ||
I == Attribute::DereferenceableOrNull ||
- I == Attribute::ArgMemOnly)
+ I == Attribute::ArgMemOnly ||
+ I == Attribute::AllocSize)
continue;
if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) {
Attrs[I] = true;
@@ -1478,20 +1612,14 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
AttributeSet::FunctionIndex,
B);
- if (Callee.hasFnAttribute(Attribute::SafeStack)) {
- Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
- Caller.addFnAttr(Attribute::SafeStack);
- } else if (Callee.hasFnAttribute(Attribute::StackProtectReq) &&
- !Caller.hasFnAttribute(Attribute::SafeStack)) {
+ if (Callee.hasFnAttribute(Attribute::StackProtectReq)) {
Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectReq);
} else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
- !Caller.hasFnAttribute(Attribute::SafeStack) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq)) {
Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectStrong);
} else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
- !Caller.hasFnAttribute(Attribute::SafeStack) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq) &&
!Caller.hasFnAttribute(Attribute::StackProtectStrong))
Caller.addFnAttr(Attribute::StackProtect);
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 12c354c89b20..431e51bb4562 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -126,6 +126,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
StoreLaneInts[fArgs.size() - 5], Tys);
return true;
}
+ if (Name == "aarch64.thread.pointer" || Name == "arm.thread.pointer") {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer);
+ return true;
+ }
break;
}
@@ -145,6 +149,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
break;
}
+ case 'm': {
+ if (Name.startswith("masked.load.")) {
+ Type *Tys[] = { F->getReturnType(), F->arg_begin()->getType() };
+ if (F->getName() != Intrinsic::getName(Intrinsic::masked_load, Tys)) {
+ F->setName(Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_load,
+ Tys);
+ return true;
+ }
+ }
+ if (Name.startswith("masked.store.")) {
+ auto Args = F->getFunctionType()->params();
+ Type *Tys[] = { Args[0], Args[1] };
+ if (F->getName() != Intrinsic::getName(Intrinsic::masked_store, Tys)) {
+ F->setName(Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_store,
+ Tys);
+ return true;
+ }
+ }
+ break;
+ }
+
case 'o':
// We only need to change the name to match the mangling including the
// address space.
@@ -159,101 +188,209 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
+ case 's':
+ if (Name == "stackprotectorcheck") {
+ NewFn = nullptr;
+ return true;
+ }
+
case 'x': {
- if (Name.startswith("x86.sse2.pcmpeq.") ||
- Name.startswith("x86.sse2.pcmpgt.") ||
- Name.startswith("x86.avx2.pcmpeq.") ||
- Name.startswith("x86.avx2.pcmpgt.") ||
- Name.startswith("x86.avx2.vbroadcast") ||
- Name.startswith("x86.avx2.pbroadcast") ||
- Name.startswith("x86.avx.vpermil.") ||
- Name.startswith("x86.sse41.pmovsx") ||
- Name == "x86.avx.vinsertf128.pd.256" ||
- Name == "x86.avx.vinsertf128.ps.256" ||
- Name == "x86.avx.vinsertf128.si.256" ||
- Name == "x86.avx2.vinserti128" ||
- Name == "x86.avx.vextractf128.pd.256" ||
- Name == "x86.avx.vextractf128.ps.256" ||
- Name == "x86.avx.vextractf128.si.256" ||
- Name == "x86.avx2.vextracti128" ||
- Name == "x86.avx.movnt.dq.256" ||
- Name == "x86.avx.movnt.pd.256" ||
- Name == "x86.avx.movnt.ps.256" ||
- Name == "x86.sse42.crc32.64.8" ||
- Name == "x86.avx.vbroadcast.ss" ||
- Name == "x86.avx.vbroadcast.ss.256" ||
- Name == "x86.avx.vbroadcast.sd.256" ||
- Name == "x86.sse2.psll.dq" ||
- Name == "x86.sse2.psrl.dq" ||
- Name == "x86.avx2.psll.dq" ||
- Name == "x86.avx2.psrl.dq" ||
- Name == "x86.sse2.psll.dq.bs" ||
- Name == "x86.sse2.psrl.dq.bs" ||
- Name == "x86.avx2.psll.dq.bs" ||
- Name == "x86.avx2.psrl.dq.bs" ||
- Name == "x86.sse41.pblendw" ||
- Name == "x86.sse41.blendpd" ||
- Name == "x86.sse41.blendps" ||
- Name == "x86.avx.blend.pd.256" ||
- Name == "x86.avx.blend.ps.256" ||
- Name == "x86.avx2.pblendw" ||
- Name == "x86.avx2.pblendd.128" ||
- Name == "x86.avx2.pblendd.256" ||
- Name == "x86.avx2.vbroadcasti128" ||
- Name == "x86.xop.vpcmov" ||
- (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
+ bool IsX86 = Name.startswith("x86.");
+ if (IsX86)
+ Name = Name.substr(4);
+
+ if (IsX86 &&
+ (Name.startswith("sse2.pcmpeq.") ||
+ Name.startswith("sse2.pcmpgt.") ||
+ Name.startswith("avx2.pcmpeq.") ||
+ Name.startswith("avx2.pcmpgt.") ||
+ Name.startswith("avx512.mask.pcmpeq.") ||
+ Name.startswith("avx512.mask.pcmpgt.") ||
+ Name == "sse41.pmaxsb" ||
+ Name == "sse2.pmaxs.w" ||
+ Name == "sse41.pmaxsd" ||
+ Name == "sse2.pmaxu.b" ||
+ Name == "sse41.pmaxuw" ||
+ Name == "sse41.pmaxud" ||
+ Name == "sse41.pminsb" ||
+ Name == "sse2.pmins.w" ||
+ Name == "sse41.pminsd" ||
+ Name == "sse2.pminu.b" ||
+ Name == "sse41.pminuw" ||
+ Name == "sse41.pminud" ||
+ Name.startswith("avx2.pmax") ||
+ Name.startswith("avx2.pmin") ||
+ Name.startswith("avx2.vbroadcast") ||
+ Name.startswith("avx2.pbroadcast") ||
+ Name.startswith("avx.vpermil.") ||
+ Name.startswith("sse2.pshuf") ||
+ Name.startswith("avx512.pbroadcast") ||
+ Name.startswith("avx512.mask.broadcast.s") ||
+ Name.startswith("avx512.mask.movddup") ||
+ Name.startswith("avx512.mask.movshdup") ||
+ Name.startswith("avx512.mask.movsldup") ||
+ Name.startswith("avx512.mask.pshuf.d.") ||
+ Name.startswith("avx512.mask.pshufl.w.") ||
+ Name.startswith("avx512.mask.pshufh.w.") ||
+ Name.startswith("avx512.mask.vpermil.p") ||
+ Name.startswith("avx512.mask.perm.df.") ||
+ Name.startswith("avx512.mask.perm.di.") ||
+ Name.startswith("avx512.mask.punpckl") ||
+ Name.startswith("avx512.mask.punpckh") ||
+ Name.startswith("avx512.mask.unpckl.") ||
+ Name.startswith("avx512.mask.unpckh.") ||
+ Name.startswith("avx512.mask.pand.") ||
+ Name.startswith("avx512.mask.pandn.") ||
+ Name.startswith("avx512.mask.por.") ||
+ Name.startswith("avx512.mask.pxor.") ||
+ Name.startswith("sse41.pmovsx") ||
+ Name.startswith("sse41.pmovzx") ||
+ Name.startswith("avx2.pmovsx") ||
+ Name.startswith("avx2.pmovzx") ||
+ Name == "sse2.cvtdq2pd" ||
+ Name == "sse2.cvtps2pd" ||
+ Name == "avx.cvtdq2.pd.256" ||
+ Name == "avx.cvt.ps2.pd.256" ||
+ Name == "sse2.cvttps2dq" ||
+ Name.startswith("avx.cvtt.") ||
+ Name.startswith("avx.vinsertf128.") ||
+ Name == "avx2.vinserti128" ||
+ Name.startswith("avx.vextractf128.") ||
+ Name == "avx2.vextracti128" ||
+ Name.startswith("sse4a.movnt.") ||
+ Name.startswith("avx.movnt.") ||
+ Name.startswith("avx512.storent.") ||
+ Name == "sse2.storel.dq" ||
+ Name.startswith("sse.storeu.") ||
+ Name.startswith("sse2.storeu.") ||
+ Name.startswith("avx.storeu.") ||
+ Name.startswith("avx512.mask.storeu.p") ||
+ Name.startswith("avx512.mask.storeu.b.") ||
+ Name.startswith("avx512.mask.storeu.w.") ||
+ Name.startswith("avx512.mask.storeu.d.") ||
+ Name.startswith("avx512.mask.storeu.q.") ||
+ Name.startswith("avx512.mask.store.p") ||
+ Name.startswith("avx512.mask.store.b.") ||
+ Name.startswith("avx512.mask.store.w.") ||
+ Name.startswith("avx512.mask.store.d.") ||
+ Name.startswith("avx512.mask.store.q.") ||
+ Name.startswith("avx512.mask.loadu.p") ||
+ Name.startswith("avx512.mask.loadu.b.") ||
+ Name.startswith("avx512.mask.loadu.w.") ||
+ Name.startswith("avx512.mask.loadu.d.") ||
+ Name.startswith("avx512.mask.loadu.q.") ||
+ Name.startswith("avx512.mask.load.p") ||
+ Name.startswith("avx512.mask.load.b.") ||
+ Name.startswith("avx512.mask.load.w.") ||
+ Name.startswith("avx512.mask.load.d.") ||
+ Name.startswith("avx512.mask.load.q.") ||
+ Name == "sse42.crc32.64.8" ||
+ Name.startswith("avx.vbroadcast.s") ||
+ Name.startswith("avx512.mask.palignr.") ||
+ Name.startswith("sse2.psll.dq") ||
+ Name.startswith("sse2.psrl.dq") ||
+ Name.startswith("avx2.psll.dq") ||
+ Name.startswith("avx2.psrl.dq") ||
+ Name.startswith("avx512.psll.dq") ||
+ Name.startswith("avx512.psrl.dq") ||
+ Name == "sse41.pblendw" ||
+ Name.startswith("sse41.blendp") ||
+ Name.startswith("avx.blend.p") ||
+ Name == "avx2.pblendw" ||
+ Name.startswith("avx2.pblendd.") ||
+ Name == "avx2.vbroadcasti128" ||
+ Name == "xop.vpcmov" ||
+ (Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
NewFn = nullptr;
return true;
}
// SSE4.1 ptest functions may have an old signature.
- if (Name.startswith("x86.sse41.ptest")) {
- if (Name == "x86.sse41.ptestc")
+ if (IsX86 && Name.startswith("sse41.ptest")) {
+ if (Name.substr(11) == "c")
return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
- if (Name == "x86.sse41.ptestz")
+ if (Name.substr(11) == "z")
return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
- if (Name == "x86.sse41.ptestnzc")
+ if (Name.substr(11) == "nzc")
return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
}
// Several blend and other instructions with masks used the wrong number of
// bits.
- if (Name == "x86.sse41.insertps")
+ if (IsX86 && Name == "sse41.insertps")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
NewFn);
- if (Name == "x86.sse41.dppd")
+ if (IsX86 && Name == "sse41.dppd")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
NewFn);
- if (Name == "x86.sse41.dpps")
+ if (IsX86 && Name == "sse41.dpps")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
NewFn);
- if (Name == "x86.sse41.mpsadbw")
+ if (IsX86 && Name == "sse41.mpsadbw")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
NewFn);
- if (Name == "x86.avx.dp.ps.256")
+ if (IsX86 && Name == "avx.dp.ps.256")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
NewFn);
- if (Name == "x86.avx2.mpsadbw")
+ if (IsX86 && Name == "avx2.mpsadbw")
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
NewFn);
// frcz.ss/sd may need to have an argument dropped
- if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) {
+ if (IsX86 && Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
F->setName(Name + ".old");
NewFn = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_xop_vfrcz_ss);
return true;
}
- if (Name.startswith("x86.xop.vfrcz.sd") && F->arg_size() == 2) {
+ if (IsX86 && Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
F->setName(Name + ".old");
NewFn = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_xop_vfrcz_sd);
return true;
}
+ if (IsX86 && (Name.startswith("avx512.mask.pslli.") ||
+ Name.startswith("avx512.mask.psrai.") ||
+ Name.startswith("avx512.mask.psrli."))) {
+ Intrinsic::ID ShiftID;
+ if (Name.slice(12, 16) == "psll")
+ ShiftID = Name[18] == 'd' ? Intrinsic::x86_avx512_mask_psll_di_512
+ : Intrinsic::x86_avx512_mask_psll_qi_512;
+ else if (Name.slice(12, 16) == "psra")
+ ShiftID = Name[18] == 'd' ? Intrinsic::x86_avx512_mask_psra_di_512
+ : Intrinsic::x86_avx512_mask_psra_qi_512;
+ else
+ ShiftID = Name[18] == 'd' ? Intrinsic::x86_avx512_mask_psrl_di_512
+ : Intrinsic::x86_avx512_mask_psrl_qi_512;
+ F->setName("llvm.x86." + Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(), ShiftID);
+ return true;
+ }
// Fix the FMA4 intrinsics to remove the 4
- if (Name.startswith("x86.fma4.")) {
- F->setName("llvm.x86.fma" + Name.substr(8));
+ if (IsX86 && Name.startswith("fma4.")) {
+ F->setName("llvm.x86.fma" + Name.substr(5));
NewFn = F;
return true;
}
+ // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
+ if (IsX86 && Name.startswith("xop.vpermil2")) {
+ auto Params = F->getFunctionType()->params();
+ auto Idx = Params[2];
+ if (Idx->getScalarType()->isFloatingPointTy()) {
+ F->setName("llvm.x86." + Name + ".old");
+ unsigned IdxSize = Idx->getPrimitiveSizeInBits();
+ unsigned EltSize = Idx->getScalarSizeInBits();
+ Intrinsic::ID Permil2ID;
+ if (EltSize == 64 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd;
+ else if (EltSize == 32 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps;
+ else if (EltSize == 64 && IdxSize == 256)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
+ else
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
+ return true;
+ }
+ }
break;
}
}
@@ -283,83 +420,219 @@ bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
return false;
}
-// Handles upgrading SSE2 and AVX2 PSLLDQ intrinsics by converting them
+// Handles upgrading SSE2/AVX2/AVX512BW PSLLDQ intrinsics by converting them
// to byte shuffles.
-static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
- Value *Op, unsigned NumLanes,
- unsigned Shift) {
- // Each lane is 16 bytes.
- unsigned NumElts = NumLanes * 16;
+static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder,
+ Value *Op, unsigned Shift) {
+ Type *ResultTy = Op->getType();
+ unsigned NumElts = ResultTy->getVectorNumElements() * 8;
// Bitcast from a 64-bit element type to a byte element type.
- Op = Builder.CreateBitCast(Op,
- VectorType::get(Type::getInt8Ty(C), NumElts),
- "cast");
+ Type *VecTy = VectorType::get(Builder.getInt8Ty(), NumElts);
+ Op = Builder.CreateBitCast(Op, VecTy, "cast");
+
// We'll be shuffling in zeroes.
- Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+ Value *Res = Constant::getNullValue(VecTy);
// If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
// we'll just return the zero vector.
if (Shift < 16) {
- SmallVector<Constant*, 32> Idxs;
- // 256-bit version is split into two 16-byte lanes.
+ uint32_t Idxs[64];
+ // 256/512-bit version is split into 2/4 16-byte lanes.
for (unsigned l = 0; l != NumElts; l += 16)
for (unsigned i = 0; i != 16; ++i) {
unsigned Idx = NumElts + i - Shift;
if (Idx < NumElts)
Idx -= NumElts - 16; // end of lane, switch operand.
- Idxs.push_back(Builder.getInt32(Idx + l));
+ Idxs[l + i] = Idx + l;
}
- Res = Builder.CreateShuffleVector(Res, Op, ConstantVector::get(Idxs));
+ Res = Builder.CreateShuffleVector(Res, Op, makeArrayRef(Idxs, NumElts));
}
// Bitcast back to a 64-bit element type.
- return Builder.CreateBitCast(Res,
- VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
- "cast");
+ return Builder.CreateBitCast(Res, ResultTy, "cast");
}
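A standalone scalar model of what the rewritten helper emits, with hypothetical names rather than IRBuilder calls: bytes move toward higher positions within each 16-byte lane, the vacated low bytes of the lane become zero, and a shift of 16 or more clears the lane.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> pslldqLanes(const std::vector<uint8_t> &In, unsigned Shift) {
  std::vector<uint8_t> Out(In.size(), 0);
  if (Shift >= 16)
    return Out;                                 // whole lane shifted out: all zeros
  for (size_t L = 0; L < In.size(); L += 16)    // one 16-byte lane at a time
    for (unsigned I = Shift; I < 16; ++I)
      Out[L + I] = In[L + I - Shift];           // low Shift bytes stay zero
  return Out;
}

int main() {
  std::vector<uint8_t> V(32);
  for (unsigned I = 0; I != 32; ++I)
    V[I] = uint8_t(I + 1);
  auto R = pslldqLanes(V, 3);
  assert(R[0] == 0 && R[3] == 1 && R[16] == 0 && R[19] == 17);
}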
-// Handles upgrading SSE2 and AVX2 PSRLDQ intrinsics by converting them
+// Handles upgrading SSE2/AVX2/AVX512BW PSRLDQ intrinsics by converting them
// to byte shuffles.
-static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, LLVMContext &C,
- Value *Op, unsigned NumLanes,
+static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op,
unsigned Shift) {
- // Each lane is 16 bytes.
- unsigned NumElts = NumLanes * 16;
+ Type *ResultTy = Op->getType();
+ unsigned NumElts = ResultTy->getVectorNumElements() * 8;
// Bitcast from a 64-bit element type to a byte element type.
- Op = Builder.CreateBitCast(Op,
- VectorType::get(Type::getInt8Ty(C), NumElts),
- "cast");
+ Type *VecTy = VectorType::get(Builder.getInt8Ty(), NumElts);
+ Op = Builder.CreateBitCast(Op, VecTy, "cast");
+
// We'll be shuffling in zeroes.
- Value *Res = ConstantVector::getSplat(NumElts, Builder.getInt8(0));
+ Value *Res = Constant::getNullValue(VecTy);
// If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
// we'll just return the zero vector.
if (Shift < 16) {
- SmallVector<Constant*, 32> Idxs;
- // 256-bit version is split into two 16-byte lanes.
+ uint32_t Idxs[64];
+ // 256/512-bit version is split into 2/4 16-byte lanes.
for (unsigned l = 0; l != NumElts; l += 16)
for (unsigned i = 0; i != 16; ++i) {
unsigned Idx = i + Shift;
if (Idx >= 16)
Idx += NumElts - 16; // end of lane, switch operand.
- Idxs.push_back(Builder.getInt32(Idx + l));
+ Idxs[l + i] = Idx + l;
}
- Res = Builder.CreateShuffleVector(Op, Res, ConstantVector::get(Idxs));
+ Res = Builder.CreateShuffleVector(Op, Res, makeArrayRef(Idxs, NumElts));
}
// Bitcast back to a 64-bit element type.
- return Builder.CreateBitCast(Res,
- VectorType::get(Type::getInt64Ty(C), 2*NumLanes),
- "cast");
+ return Builder.CreateBitCast(Res, ResultTy, "cast");
+}
+
+static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask,
+ unsigned NumElts) {
+ llvm::VectorType *MaskTy = llvm::VectorType::get(Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+
+ // If we have less than 8 elements, then the starting mask was an i8 and
+ // we need to extract down to the right number of elements.
+ if (NumElts < 8) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, NumElts),
+ "extract");
+ }
+
+ return Mask;
+}
+
+static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
+ Value *Op0, Value *Op1) {
+ // If the mask is all ones just emit the align operation.
+ if (const auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Op0;
+
+ Mask = getX86MaskVec(Builder, Mask, Op0->getType()->getVectorNumElements());
+ return Builder.CreateSelect(Mask, Op0, Op1);
+}
+
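A standalone scalar model of the mask-to-select pattern built by getX86MaskVec and EmitX86Select, in plain C++ with hypothetical names: bit i of the integer mask picks between the computed element and the passthru element.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int> maskedSelect(uint32_t Mask, const std::vector<int> &Op,
                                     const std::vector<int> &Passthru) {
  std::vector<int> Out(Op.size());
  for (size_t I = 0; I != Op.size(); ++I)
    Out[I] = ((Mask >> I) & 1) ? Op[I] : Passthru[I];  // bit i selects element i
  return Out;
}

int main() {
  auto R = maskedSelect(0b0101, {10, 11, 12, 13}, {0, 0, 0, 0});
  assert(R[0] == 10 && R[1] == 0 && R[2] == 12 && R[3] == 0);
}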
+static Value *UpgradeX86PALIGNRIntrinsics(IRBuilder<> &Builder,
+ Value *Op0, Value *Op1, Value *Shift,
+ Value *Passthru, Value *Mask) {
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Shift)->getZExtValue();
+
+ unsigned NumElts = Op0->getType()->getVectorNumElements();
+ assert(NumElts % 16 == 0);
+
+ // If palignr is shifting the pair of vectors more than the size of two
+ // lanes, emit zero.
+ if (ShiftVal >= 32)
+ return llvm::Constant::getNullValue(Op0->getType());
+
+ // If palignr is shifting the pair of input vectors more than one lane,
+ // but less than two lanes, convert to shifting in zeroes.
+ if (ShiftVal > 16) {
+ ShiftVal -= 16;
+ Op1 = Op0;
+ Op0 = llvm::Constant::getNullValue(Op0->getType());
+ }
+
+ uint32_t Indices[64];
+ // 256-bit palignr operates on 128-bit lanes, so handle each lane separately.
+ for (unsigned l = 0; l != NumElts; l += 16) {
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = ShiftVal + i;
+ if (Idx >= 16)
+ Idx += NumElts - 16; // End of lane, switch operand.
+ Indices[l + i] = Idx + l;
+ }
+ }
+
+ Value *Align = Builder.CreateShuffleVector(Op1, Op0,
+ makeArrayRef(Indices, NumElts),
+ "palignr");
+
+ return EmitX86Select(Builder, Mask, Align, Passthru);
+}
+
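A standalone scalar model of palignr on one 16-byte lane, matching the shuffle the helper above constructs, with illustrative names only: concatenate the low and high inputs, read a 16-byte window at the given byte offset, and shift in zeros once the window runs past the data.

#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint8_t, 16> palignrLane(const std::array<uint8_t, 16> &Hi,
                                           const std::array<uint8_t, 16> &Lo,
                                           unsigned Shift) {
  std::array<uint8_t, 32> Cat{};                 // concatenation [Lo | Hi]
  for (unsigned I = 0; I != 16; ++I) {
    Cat[I] = Lo[I];
    Cat[16 + I] = Hi[I];
  }
  std::array<uint8_t, 16> Out{};
  for (unsigned I = 0; I != 16; ++I) {
    unsigned Idx = I + Shift;
    Out[I] = Idx < 32 ? Cat[Idx] : 0;            // Shift >= 32 yields all zeros
  }
  return Out;
}

int main() {
  std::array<uint8_t, 16> Lo{}, Hi{};
  for (unsigned I = 0; I != 16; ++I) {
    Lo[I] = uint8_t(I);
    Hi[I] = uint8_t(100 + I);
  }
  auto R = palignrLane(Hi, Lo, 4);
  assert(R[0] == 4 && R[11] == 15 && R[12] == 100 && R[15] == 103);
}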
+static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
+ Value *Ptr, Value *Data, Value *Mask,
+ bool Aligned) {
+ // Cast the pointer to the right type.
+ Ptr = Builder.CreateBitCast(Ptr,
+ llvm::PointerType::getUnqual(Data->getType()));
+ unsigned Align =
+ Aligned ? cast<VectorType>(Data->getType())->getBitWidth() / 8 : 1;
+
+ // If the mask is all ones just emit a regular store.
+ if (const auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Builder.CreateAlignedStore(Data, Ptr, Align);
+
+ // Convert the mask from an integer type to a vector of i1.
+ unsigned NumElts = Data->getType()->getVectorNumElements();
+ Mask = getX86MaskVec(Builder, Mask, NumElts);
+ return Builder.CreateMaskedStore(Data, Ptr, Align, Mask);
+}
+
+static Value *UpgradeMaskedLoad(IRBuilder<> &Builder,
+ Value *Ptr, Value *Passthru, Value *Mask,
+ bool Aligned) {
+ // Cast the pointer to the right type.
+ Ptr = Builder.CreateBitCast(Ptr,
+ llvm::PointerType::getUnqual(Passthru->getType()));
+ unsigned Align =
+ Aligned ? cast<VectorType>(Passthru->getType())->getBitWidth() / 8 : 1;
+
+ // If the mask is all ones just emit a regular load.
+ if (const auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Builder.CreateAlignedLoad(Ptr, Align);
+
+ // Convert the mask from an integer type to a vector of i1.
+ unsigned NumElts = Passthru->getType()->getVectorNumElements();
+ Mask = getX86MaskVec(Builder, Mask, NumElts);
+ return Builder.CreateMaskedLoad(Ptr, Align, Mask, Passthru);
}
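A standalone scalar model of the masked store upgrade above (the load side is symmetric), with hypothetical names: bit i of the mask gates whether element i is written, and an all-ones mask degenerates to a plain store.

#include <cassert>
#include <cstdint>
#include <vector>

static void maskedStore(std::vector<int> &Mem, const std::vector<int> &Data,
                        uint32_t Mask) {
  for (size_t I = 0; I != Data.size(); ++I)
    if ((Mask >> I) & 1)
      Mem[I] = Data[I];       // unselected elements keep their previous value
}

int main() {
  std::vector<int> Mem = {7, 7, 7, 7};
  maskedStore(Mem, {1, 2, 3, 4}, 0b1010);
  assert(Mem[0] == 7 && Mem[1] == 2 && Mem[2] == 7 && Mem[3] == 4);
}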
-// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
-// upgraded intrinsic. All argument and return casting must be provided in
-// order to seamlessly integrate with existing context.
+static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI,
+ ICmpInst::Predicate Pred) {
+ Value *Op0 = CI.getArgOperand(0);
+ Value *Op1 = CI.getArgOperand(1);
+ Value *Cmp = Builder.CreateICmp(Pred, Op0, Op1);
+ return Builder.CreateSelect(Cmp, Op0, Op1);
+}
+
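A standalone scalar model of upgradeIntMinMax, with hypothetical names: each pmax/pmin intrinsic becomes an element-wise compare followed by a select.

#include <cassert>
#include <vector>

static std::vector<int> pmaxs(const std::vector<int> &A, const std::vector<int> &B) {
  std::vector<int> Out(A.size());
  for (size_t I = 0; I != A.size(); ++I)
    Out[I] = A[I] > B[I] ? A[I] : B[I];  // icmp sgt + select, element-wise
  return Out;
}

int main() {
  auto R = pmaxs({1, 5, -3}, {2, 4, -1});
  assert(R[0] == 2 && R[1] == 5 && R[2] == -1);
}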
+static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
+ ICmpInst::Predicate Pred) {
+ Value *Op0 = CI.getArgOperand(0);
+ unsigned NumElts = Op0->getType()->getVectorNumElements();
+ Value *Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
+
+ Value *Mask = CI.getArgOperand(2);
+ const auto *C = dyn_cast<Constant>(Mask);
+ if (!C || !C->isAllOnesValue())
+ Cmp = Builder.CreateAnd(Cmp, getX86MaskVec(Builder, Mask, NumElts));
+
+ if (NumElts < 8) {
+ uint32_t Indices[8];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+ for (unsigned i = NumElts; i != 8; ++i)
+ Indices[i] = NumElts + i % NumElts;
+ Cmp = Builder.CreateShuffleVector(Cmp,
+ Constant::getNullValue(Cmp->getType()),
+ Indices);
+ }
+ return Builder.CreateBitCast(Cmp, IntegerType::get(CI.getContext(),
+ std::max(NumElts, 8U)));
+}
+
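A standalone scalar model of upgradeMaskedCompare, with hypothetical names: compare element-wise, AND with the incoming mask, and pack the i1 results into an integer of at least 8 bits, leaving the unused high bits zero.

#include <cassert>
#include <cstdint>
#include <vector>

static uint8_t maskedCmpEq(const std::vector<int> &A, const std::vector<int> &B,
                           uint8_t Mask) {
  uint8_t Out = 0;
  for (size_t I = 0; I != A.size(); ++I) {
    bool Bit = A[I] == B[I] && ((Mask >> I) & 1);  // compare, then apply the mask
    Out |= uint8_t(Bit) << I;                      // unused high bits stay zero
  }
  return Out;
}

int main() {
  // Elements 0 and 2 compare equal, but element 2 is masked off.
  assert(maskedCmpEq({1, 2, 3, 4}, {1, 9, 3, 0}, 0b1011) == 0b0001);
}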
+/// Upgrade a call to an old intrinsic. All argument and return casting must be
+/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Function *F = CI->getCalledFunction();
LLVMContext &C = CI->getContext();
@@ -372,26 +645,105 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Get the Function's name.
StringRef Name = F->getName();
+ assert(Name.startswith("llvm.") && "Intrinsic doesn't start with 'llvm.'");
+ Name = Name.substr(5);
+
+ bool IsX86 = Name.startswith("x86.");
+ if (IsX86)
+ Name = Name.substr(4);
+
Value *Rep;
- // Upgrade packed integer vector compares intrinsics to compare instructions
- if (Name.startswith("llvm.x86.sse2.pcmpeq.") ||
- Name.startswith("llvm.x86.avx2.pcmpeq.")) {
+ // Upgrade packed integer vector compare intrinsics to compare instructions.
+ if (IsX86 && (Name.startswith("sse2.pcmpeq.") ||
+ Name.startswith("avx2.pcmpeq."))) {
Rep = Builder.CreateICmpEQ(CI->getArgOperand(0), CI->getArgOperand(1),
"pcmpeq");
- // need to sign extend since icmp returns vector of i1
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
- } else if (Name.startswith("llvm.x86.sse2.pcmpgt.") ||
- Name.startswith("llvm.x86.avx2.pcmpgt.")) {
+ } else if (IsX86 && (Name.startswith("sse2.pcmpgt.") ||
+ Name.startswith("avx2.pcmpgt."))) {
Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1),
"pcmpgt");
- // need to sign extend since icmp returns vector of i1
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
- } else if (Name == "llvm.x86.avx.movnt.dq.256" ||
- Name == "llvm.x86.avx.movnt.ps.256" ||
- Name == "llvm.x86.avx.movnt.pd.256") {
- IRBuilder<> Builder(C);
- Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
+ } else if (IsX86 && Name.startswith("avx512.mask.pcmpeq.")) {
+ Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_EQ);
+ } else if (IsX86 && Name.startswith("avx512.mask.pcmpgt.")) {
+ Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_SGT);
+ } else if (IsX86 && (Name == "sse41.pmaxsb" ||
+ Name == "sse2.pmaxs.w" ||
+ Name == "sse41.pmaxsd" ||
+ Name.startswith("avx2.pmaxs"))) {
+ Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SGT);
+ } else if (IsX86 && (Name == "sse2.pmaxu.b" ||
+ Name == "sse41.pmaxuw" ||
+ Name == "sse41.pmaxud" ||
+ Name.startswith("avx2.pmaxu"))) {
+ Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_UGT);
+ } else if (IsX86 && (Name == "sse41.pminsb" ||
+ Name == "sse2.pmins.w" ||
+ Name == "sse41.pminsd" ||
+ Name.startswith("avx2.pmins"))) {
+ Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SLT);
+ } else if (IsX86 && (Name == "sse2.pminu.b" ||
+ Name == "sse41.pminuw" ||
+ Name == "sse41.pminud" ||
+ Name.startswith("avx2.pminu"))) {
+ Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT);
+ } else if (IsX86 && (Name == "sse2.cvtdq2pd" ||
+ Name == "sse2.cvtps2pd" ||
+ Name == "avx.cvtdq2.pd.256" ||
+ Name == "avx.cvt.ps2.pd.256")) {
+ // Lossless i32/float to double conversion.
+ // Extract the bottom elements if necessary and convert to double vector.
+ Value *Src = CI->getArgOperand(0);
+ VectorType *SrcTy = cast<VectorType>(Src->getType());
+ VectorType *DstTy = cast<VectorType>(CI->getType());
+ Rep = CI->getArgOperand(0);
+
+ unsigned NumDstElts = DstTy->getNumElements();
+ if (NumDstElts < SrcTy->getNumElements()) {
+ assert(NumDstElts == 2 && "Unexpected vector size");
+ uint32_t ShuffleMask[2] = { 0, 1 };
+ Rep = Builder.CreateShuffleVector(Rep, UndefValue::get(SrcTy),
+ ShuffleMask);
+ }
+
+ bool Int2Double = (StringRef::npos != Name.find("cvtdq2"));
+ if (Int2Double)
+ Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
+ else
+ Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
+ } else if (IsX86 && (Name == "sse2.cvttps2dq" ||
+ Name.startswith("avx.cvtt."))) {
+ // Truncation (round to zero) float/double to i32 vector conversion.
+ Value *Src = CI->getArgOperand(0);
+ VectorType *DstTy = cast<VectorType>(CI->getType());
+ Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
+ } else if (IsX86 && Name.startswith("sse4a.movnt.")) {
+ Module *M = F->getParent();
+ SmallVector<Metadata *, 1> Elts;
+ Elts.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+ MDNode *Node = MDNode::get(C, Elts);
+
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ // Nontemporal (unaligned) store of the 0'th element of the float/double
+ // vector.
+ Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
+ PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
+ Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast");
+ Value *Extract =
+ Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
+
+ StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, 1);
+ SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (IsX86 && (Name.startswith("avx.movnt.") ||
+ Name.startswith("avx512.storent."))) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
Elts.push_back(
@@ -405,14 +757,82 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *BC = Builder.CreateBitCast(Arg0,
PointerType::getUnqual(Arg1->getType()),
"cast");
- StoreInst *SI = Builder.CreateStore(Arg1, BC);
+ VectorType *VTy = cast<VectorType>(Arg1->getType());
+ StoreInst *SI = Builder.CreateAlignedStore(Arg1, BC,
+ VTy->getBitWidth() / 8);
SI->setMetadata(M->getMDKindID("nontemporal"), Node);
- SI->setAlignment(32);
// Remove intrinsic.
CI->eraseFromParent();
return;
- } else if (Name.startswith("llvm.x86.xop.vpcom")) {
+ } else if (IsX86 && Name == "sse2.storel.dq") {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ Type *NewVecTy = VectorType::get(Type::getInt64Ty(C), 2);
+ Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
+ Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0);
+ Value *BC = Builder.CreateBitCast(Arg0,
+ PointerType::getUnqual(Elt->getType()),
+ "cast");
+ Builder.CreateAlignedStore(Elt, BC, 1);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (IsX86 && (Name.startswith("sse.storeu.") ||
+ Name.startswith("sse2.storeu.") ||
+ Name.startswith("avx.storeu."))) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ Arg0 = Builder.CreateBitCast(Arg0,
+ PointerType::getUnqual(Arg1->getType()),
+ "cast");
+ Builder.CreateAlignedStore(Arg1, Arg0, 1);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (IsX86 && (Name.startswith("avx512.mask.storeu.p") ||
+ Name.startswith("avx512.mask.storeu.b.") ||
+ Name.startswith("avx512.mask.storeu.w.") ||
+ Name.startswith("avx512.mask.storeu.d.") ||
+ Name.startswith("avx512.mask.storeu.q."))) {
+ UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), /*Aligned*/false);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (IsX86 && (Name.startswith("avx512.mask.store.p") ||
+ Name.startswith("avx512.mask.store.b.") ||
+ Name.startswith("avx512.mask.store.w.") ||
+ Name.startswith("avx512.mask.store.d.") ||
+ Name.startswith("avx512.mask.store.q."))) {
+ UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), /*Aligned*/true);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
+ return;
+ } else if (IsX86 && (Name.startswith("avx512.mask.loadu.p") ||
+ Name.startswith("avx512.mask.loadu.b.") ||
+ Name.startswith("avx512.mask.loadu.w.") ||
+ Name.startswith("avx512.mask.loadu.d.") ||
+ Name.startswith("avx512.mask.loadu.q."))) {
+ Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
+ CI->getArgOperand(1), CI->getArgOperand(2),
+ /*Aligned*/false);
+ } else if (IsX86 && (Name.startswith("avx512.mask.load.p") ||
+ Name.startswith("avx512.mask.load.b.") ||
+ Name.startswith("avx512.mask.load.w.") ||
+ Name.startswith("avx512.mask.load.d.") ||
+ Name.startswith("avx512.mask.load.q."))) {
+ Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
+ CI->getArgOperand(1),CI->getArgOperand(2),
+ /*Aligned*/true);
+ } else if (IsX86 && Name.startswith("xop.vpcom")) {
Intrinsic::ID intID;
if (Name.endswith("ub"))
intID = Intrinsic::x86_xop_vpcomub;
@@ -433,7 +853,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
else
llvm_unreachable("Unknown suffix");
- Name = Name.substr(18); // strip off "llvm.x86.xop.vpcom"
+ Name = Name.substr(9); // strip off "xop.vpcom"
unsigned Imm;
if (Name.startswith("lt"))
Imm = 0;
@@ -458,7 +878,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep =
Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
Builder.getInt8(Imm)});
- } else if (Name == "llvm.x86.xop.vpcmov") {
+ } else if (IsX86 && Name == "xop.vpcmov") {
Value *Arg0 = CI->getArgOperand(0);
Value *Arg1 = CI->getArgOperand(1);
Value *Sel = CI->getArgOperand(2);
@@ -468,13 +888,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
Rep = Builder.CreateOr(Sel0, Sel1);
- } else if (Name == "llvm.x86.sse42.crc32.64.8") {
+ } else if (IsX86 && Name == "sse42.crc32.64.8") {
Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_sse42_crc32_32_8);
Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
Rep = Builder.CreateZExt(Rep, CI->getType(), "");
- } else if (Name.startswith("llvm.x86.avx.vbroadcast")) {
+ } else if (IsX86 && Name.startswith("avx.vbroadcast")) {
// Replace broadcasts with a series of insertelements.
Type *VecTy = CI->getType();
Type *EltTy = VecTy->getVectorElementType();
@@ -487,101 +907,96 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
for (unsigned I = 0; I < EltNum; ++I)
Rep = Builder.CreateInsertElement(Rep, Load,
ConstantInt::get(I32Ty, I));
- } else if (Name.startswith("llvm.x86.sse41.pmovsx")) {
+ } else if (IsX86 && (Name.startswith("sse41.pmovsx") ||
+ Name.startswith("sse41.pmovzx") ||
+ Name.startswith("avx2.pmovsx") ||
+ Name.startswith("avx2.pmovzx"))) {
VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType());
VectorType *DstTy = cast<VectorType>(CI->getType());
unsigned NumDstElts = DstTy->getNumElements();
- // Extract a subvector of the first NumDstElts lanes and sign extend.
- SmallVector<int, 8> ShuffleMask;
- for (int i = 0; i != (int)NumDstElts; ++i)
- ShuffleMask.push_back(i);
+ // Extract a subvector of the first NumDstElts lanes and sign/zero extend.
+ SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
+ for (unsigned i = 0; i != NumDstElts; ++i)
+ ShuffleMask[i] = i;
Value *SV = Builder.CreateShuffleVector(
CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask);
- Rep = Builder.CreateSExt(SV, DstTy);
- } else if (Name == "llvm.x86.avx2.vbroadcasti128") {
+
+ bool DoSext = (StringRef::npos != Name.find("pmovsx"));
+ Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
+ : Builder.CreateZExt(SV, DstTy);
+ } else if (IsX86 && Name == "avx2.vbroadcasti128") {
// Replace vbroadcasts with a vector shuffle.
Type *VT = VectorType::get(Type::getInt64Ty(C), 2);
Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
PointerType::getUnqual(VT));
Value *Load = Builder.CreateLoad(VT, Op);
- const int Idxs[4] = { 0, 1, 0, 1 };
+ uint32_t Idxs[4] = { 0, 1, 0, 1 };
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
Idxs);
- } else if (Name.startswith("llvm.x86.avx2.pbroadcast") ||
- Name.startswith("llvm.x86.avx2.vbroadcast")) {
+ } else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
+ Name.startswith("avx2.vbroadcast") ||
+ Name.startswith("avx512.pbroadcast") ||
+ Name.startswith("avx512.mask.broadcast.s"))) {
// Replace vp?broadcasts with a vector shuffle.
Value *Op = CI->getArgOperand(0);
unsigned NumElts = CI->getType()->getVectorNumElements();
Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
Constant::getNullValue(MaskTy));
- } else if (Name == "llvm.x86.sse2.psll.dq") {
- // 128-bit shift left specified in bits.
- unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
- Shift / 8); // Shift is in bits.
- } else if (Name == "llvm.x86.sse2.psrl.dq") {
- // 128-bit shift right specified in bits.
- unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
- Shift / 8); // Shift is in bits.
- } else if (Name == "llvm.x86.avx2.psll.dq") {
- // 256-bit shift left specified in bits.
+
+ if (CI->getNumArgOperands() == 3)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
+ } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
+ Rep = UpgradeX86PALIGNRIntrinsics(Builder, CI->getArgOperand(0),
+ CI->getArgOperand(1),
+ CI->getArgOperand(2),
+ CI->getArgOperand(3),
+ CI->getArgOperand(4));
+ } else if (IsX86 && (Name == "sse2.psll.dq" ||
+ Name == "avx2.psll.dq")) {
+ // 128/256-bit shift left specified in bits.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0),
Shift / 8); // Shift is in bits.
- } else if (Name == "llvm.x86.avx2.psrl.dq") {
- // 256-bit shift right specified in bits.
+ } else if (IsX86 && (Name == "sse2.psrl.dq" ||
+ Name == "avx2.psrl.dq")) {
+ // 128/256-bit shift right specified in bits.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0),
Shift / 8); // Shift is in bits.
- } else if (Name == "llvm.x86.sse2.psll.dq.bs") {
- // 128-bit shift left specified in bytes.
- unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
- Shift);
- } else if (Name == "llvm.x86.sse2.psrl.dq.bs") {
- // 128-bit shift right specified in bytes.
- unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 1,
- Shift);
- } else if (Name == "llvm.x86.avx2.psll.dq.bs") {
- // 256-bit shift left specified in bytes.
+ } else if (IsX86 && (Name == "sse2.psll.dq.bs" ||
+ Name == "avx2.psll.dq.bs" ||
+ Name == "avx512.psll.dq.512")) {
+ // 128/256/512-bit shift left specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSLLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
- Shift);
- } else if (Name == "llvm.x86.avx2.psrl.dq.bs") {
- // 256-bit shift right specified in bytes.
+ Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift);
+ } else if (IsX86 && (Name == "sse2.psrl.dq.bs" ||
+ Name == "avx2.psrl.dq.bs" ||
+ Name == "avx512.psrl.dq.512")) {
+ // 128/256/512-bit shift right specified in bytes.
unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
- Shift);
- } else if (Name == "llvm.x86.sse41.pblendw" ||
- Name == "llvm.x86.sse41.blendpd" ||
- Name == "llvm.x86.sse41.blendps" ||
- Name == "llvm.x86.avx.blend.pd.256" ||
- Name == "llvm.x86.avx.blend.ps.256" ||
- Name == "llvm.x86.avx2.pblendw" ||
- Name == "llvm.x86.avx2.pblendd.128" ||
- Name == "llvm.x86.avx2.pblendd.256") {
+ Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift);
+ } else if (IsX86 && (Name == "sse41.pblendw" ||
+ Name.startswith("sse41.blendp") ||
+ Name.startswith("avx.blend.p") ||
+ Name == "avx2.pblendw" ||
+ Name.startswith("avx2.pblendd."))) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
unsigned Imm = cast <ConstantInt>(CI->getArgOperand(2))->getZExtValue();
VectorType *VecTy = cast<VectorType>(CI->getType());
unsigned NumElts = VecTy->getNumElements();
- SmallVector<Constant*, 16> Idxs;
- for (unsigned i = 0; i != NumElts; ++i) {
- unsigned Idx = ((Imm >> (i%8)) & 1) ? i + NumElts : i;
- Idxs.push_back(Builder.getInt32(Idx));
- }
+ SmallVector<uint32_t, 16> Idxs(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Idxs[i] = ((Imm >> (i%8)) & 1) ? i + NumElts : i;
- Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
- } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
- Name == "llvm.x86.avx.vinsertf128.ps.256" ||
- Name == "llvm.x86.avx.vinsertf128.si.256" ||
- Name == "llvm.x86.avx2.vinserti128") {
+ Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
+ } else if (IsX86 && (Name.startswith("avx.vinsertf128.") ||
+ Name == "avx2.vinserti128")) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
@@ -593,11 +1008,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Extend the second operand into a vector that is twice as big.
Value *UndefV = UndefValue::get(Op1->getType());
- SmallVector<Constant*, 8> Idxs;
- for (unsigned i = 0; i != NumElts; ++i) {
- Idxs.push_back(Builder.getInt32(i));
- }
- Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));
+ SmallVector<uint32_t, 8> Idxs(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Idxs[i] = i;
+ Rep = Builder.CreateShuffleVector(Op1, UndefV, Idxs);
// Insert the second operand into the first operand.
@@ -610,24 +1024,17 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Imm = 1 <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
// Imm = 0 <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7 >
- SmallVector<Constant*, 8> Idxs2;
// The low half of the result is either the low half of the 1st operand
// or the low half of the 2nd operand (the inserted vector).
- for (unsigned i = 0; i != NumElts / 2; ++i) {
- unsigned Idx = Imm ? i : (i + NumElts);
- Idxs2.push_back(Builder.getInt32(Idx));
- }
+ for (unsigned i = 0; i != NumElts / 2; ++i)
+ Idxs[i] = Imm ? i : (i + NumElts);
// The high half of the result is either the low half of the 2nd operand
// (the inserted vector) or the high half of the 1st operand.
- for (unsigned i = NumElts / 2; i != NumElts; ++i) {
- unsigned Idx = Imm ? (i + NumElts / 2) : i;
- Idxs2.push_back(Builder.getInt32(Idx));
- }
- Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
- } else if (Name == "llvm.x86.avx.vextractf128.pd.256" ||
- Name == "llvm.x86.avx.vextractf128.ps.256" ||
- Name == "llvm.x86.avx.vextractf128.si.256" ||
- Name == "llvm.x86.avx2.vextracti128") {
+ for (unsigned i = NumElts / 2; i != NumElts; ++i)
+ Idxs[i] = Imm ? (i + NumElts / 2) : i;
+ Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs);
+ } else if (IsX86 && (Name.startswith("avx.vextractf128.") ||
+ Name == "avx2.vextracti128")) {
Value *Op0 = CI->getArgOperand(0);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
VectorType *VecTy = cast<VectorType>(CI->getType());
@@ -637,54 +1044,170 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Imm = Imm & 1;
// Get indexes for either the high half or low half of the input vector.
- SmallVector<Constant*, 4> Idxs(NumElts);
+ SmallVector<uint32_t, 4> Idxs(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
- unsigned Idx = Imm ? (i + NumElts) : i;
- Idxs[i] = Builder.getInt32(Idx);
+ Idxs[i] = Imm ? (i + NumElts) : i;
}
Value *UndefV = UndefValue::get(Op0->getType());
- Rep = Builder.CreateShuffleVector(Op0, UndefV, ConstantVector::get(Idxs));
- } else {
- bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
- if (Name == "llvm.x86.avx.vpermil.pd.256")
- PD256 = true;
- else if (Name == "llvm.x86.avx.vpermil.pd")
- PD128 = true;
- else if (Name == "llvm.x86.avx.vpermil.ps.256")
- PS256 = true;
- else if (Name == "llvm.x86.avx.vpermil.ps")
- PS128 = true;
-
- if (PD256 || PD128 || PS256 || PS128) {
- Value *Op0 = CI->getArgOperand(0);
- unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- SmallVector<Constant*, 8> Idxs;
-
- if (PD128)
- for (unsigned i = 0; i != 2; ++i)
- Idxs.push_back(Builder.getInt32((Imm >> i) & 0x1));
- else if (PD256)
- for (unsigned l = 0; l != 4; l+=2)
- for (unsigned i = 0; i != 2; ++i)
- Idxs.push_back(Builder.getInt32(((Imm >> (l+i)) & 0x1) + l));
- else if (PS128)
- for (unsigned i = 0; i != 4; ++i)
- Idxs.push_back(Builder.getInt32((Imm >> (2 * i)) & 0x3));
- else if (PS256)
- for (unsigned l = 0; l != 8; l+=4)
- for (unsigned i = 0; i != 4; ++i)
- Idxs.push_back(Builder.getInt32(((Imm >> (2 * i)) & 0x3) + l));
- else
- llvm_unreachable("Unexpected function");
+ Rep = Builder.CreateShuffleVector(Op0, UndefV, Idxs);
+ } else if (!IsX86 && Name == "stackprotectorcheck") {
+ Rep = nullptr;
+ } else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") ||
+ Name.startswith("avx512.mask.perm.di."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ unsigned NumElts = VecTy->getNumElements();
- Rep = Builder.CreateShuffleVector(Op0, Op0, ConstantVector::get(Idxs));
- } else {
- llvm_unreachable("Unknown function for CallInst upgrade.");
+ SmallVector<uint32_t, 8> Idxs(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3);
+
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+ if (CI->getNumArgOperands() == 4)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && (Name.startswith("avx.vpermil.") ||
+ Name == "sse2.pshuf.d" ||
+ Name.startswith("avx512.mask.vpermil.p") ||
+ Name.startswith("avx512.mask.pshuf.d."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ unsigned NumElts = VecTy->getNumElements();
+ // Calculate the size of each index in the immediate.
+ unsigned IdxSize = 64 / VecTy->getScalarSizeInBits();
+ unsigned IdxMask = ((1 << IdxSize) - 1);
+
+ SmallVector<uint32_t, 8> Idxs(NumElts);
+ // Look up the bits for this element, wrapping around the immediate every
+ // 8 bits. Elements are grouped into sets of 2 or 4 elements, so we need
+ // to offset by the first index of each group.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Idxs[i] = ((Imm >> ((i * IdxSize) % 8)) & IdxMask) | (i & ~IdxMask);
+
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+ if (CI->getNumArgOperands() == 4)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && (Name == "sse2.pshufl.w" ||
+ Name.startswith("avx512.mask.pshufl.w."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ unsigned NumElts = CI->getType()->getVectorNumElements();
+
+ SmallVector<uint32_t, 16> Idxs(NumElts);
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ for (unsigned i = 0; i != 4; ++i)
+ Idxs[i + l] = ((Imm >> (2 * i)) & 0x3) + l;
+ for (unsigned i = 4; i != 8; ++i)
+ Idxs[i + l] = i + l;
+ }
+
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+ if (CI->getNumArgOperands() == 4)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && (Name == "sse2.pshufh.w" ||
+ Name.startswith("avx512.mask.pshufh.w."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ unsigned NumElts = CI->getType()->getVectorNumElements();
+
+ SmallVector<uint32_t, 16> Idxs(NumElts);
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ for (unsigned i = 0; i != 4; ++i)
+ Idxs[i + l] = i + l;
+ for (unsigned i = 0; i != 4; ++i)
+ Idxs[i + l + 4] = ((Imm >> (2 * i)) & 0x3) + 4 + l;
}
+
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+ if (CI->getNumArgOperands() == 4)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && (Name.startswith("avx512.mask.movddup") ||
+ Name.startswith("avx512.mask.movshdup") ||
+ Name.startswith("avx512.mask.movsldup"))) {
+ Value *Op0 = CI->getArgOperand(0);
+ unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
+
+ unsigned Offset = 0;
+ if (Name.startswith("avx512.mask.movshdup."))
+ Offset = 1;
+
+ SmallVector<uint32_t, 16> Idxs(NumElts);
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i != NumLaneElts; i += 2) {
+ Idxs[i + l + 0] = i + l + Offset;
+ Idxs[i + l + 1] = i + l + Offset;
+ }
+
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.mask.punpckl") ||
+ Name.startswith("avx512.mask.unpckl."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ Value *Op1 = CI->getArgOperand(1);
+ int NumElts = CI->getType()->getVectorNumElements();
+ int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
+
+ SmallVector<uint32_t, 64> Idxs(NumElts);
+ for (int l = 0; l != NumElts; l += NumLaneElts)
+ for (int i = 0; i != NumLaneElts; ++i)
+ Idxs[i + l] = l + (i / 2) + NumElts * (i % 2);
+
+ Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
+
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && (Name.startswith("avx512.mask.punpckh") ||
+ Name.startswith("avx512.mask.unpckh."))) {
+ Value *Op0 = CI->getArgOperand(0);
+ Value *Op1 = CI->getArgOperand(1);
+ int NumElts = CI->getType()->getVectorNumElements();
+ int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
+
+ SmallVector<uint32_t, 64> Idxs(NumElts);
+ for (int l = 0; l != NumElts; l += NumLaneElts)
+ for (int i = 0; i != NumLaneElts; ++i)
+ Idxs[i + l] = (NumLaneElts / 2) + l + (i / 2) + NumElts * (i % 2);
+
+ Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
+
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.pand.")) {
+ Rep = Builder.CreateAnd(CI->getArgOperand(0), CI->getArgOperand(1));
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.pandn.")) {
+ Rep = Builder.CreateAnd(Builder.CreateNot(CI->getArgOperand(0)),
+ CI->getArgOperand(1));
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.por.")) {
+ Rep = Builder.CreateOr(CI->getArgOperand(0), CI->getArgOperand(1));
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.pxor.")) {
+ Rep = Builder.CreateXor(CI->getArgOperand(0), CI->getArgOperand(1));
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
+ } else {
+ llvm_unreachable("Unknown function for CallInst upgrade.");
}
- CI->replaceAllUsesWith(Rep);
+ if (Rep)
+ CI->replaceAllUsesWith(Rep);
CI->eraseFromParent();
return;
}
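
The name-based upgrades above lower the legacy X86 blend intrinsics to a plain shufflevector whose mask is decoded from the immediate: a set bit selects the lane from the second operand (shuffle index >= NumElts), a clear bit keeps the lane from the first. Below is a minimal sketch of that decoding for an <8 x i16> pblendw; the helper name emitPBlendW, the operands A/B, and the 0xA5 immediate are illustrative assumptions, not part of this patch.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper mirroring the pblendw handling above: decode the 8-bit
// immediate into shuffle indices, where index i+8 selects lane i of B.
static Value *emitPBlendW(IRBuilder<> &Builder, Value *A, Value *B,
                          unsigned Imm) {
  uint32_t Idxs[8];
  for (unsigned i = 0; i != 8; ++i)
    Idxs[i] = ((Imm >> (i % 8)) & 1) ? i + 8 : i;
  // e.g. Imm = 0xA5 gives {8, 1, 10, 3, 4, 13, 6, 15}.
  return Builder.CreateShuffleVector(A, B, Idxs);
}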
@@ -697,6 +1220,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
default:
llvm_unreachable("Unknown function for CallInst upgrade.");
+ case Intrinsic::x86_avx512_mask_psll_di_512:
+ case Intrinsic::x86_avx512_mask_psra_di_512:
+ case Intrinsic::x86_avx512_mask_psrl_di_512:
+ case Intrinsic::x86_avx512_mask_psll_qi_512:
+ case Intrinsic::x86_avx512_mask_psra_qi_512:
+ case Intrinsic::x86_avx512_mask_psrl_qi_512:
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
@@ -746,6 +1275,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->eraseFromParent();
return;
+ case Intrinsic::x86_xop_vpermil2pd:
+ case Intrinsic::x86_xop_vpermil2ps:
+ case Intrinsic::x86_xop_vpermil2pd_256:
+ case Intrinsic::x86_xop_vpermil2ps_256: {
+ SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+ CI->arg_operands().end());
+ VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
+ VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
+ Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
+ CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
+ CI->eraseFromParent();
+ return;
+ }
+
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestnzc: {
@@ -790,24 +1333,37 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->eraseFromParent();
return;
}
+
+ case Intrinsic::thread_pointer: {
+ CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {}));
+ CI->eraseFromParent();
+ return;
+ }
+
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store: {
+ SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+ CI->arg_operands().end());
+ CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
+ CI->eraseFromParent();
+ return;
+ }
}
}
-// This tests each Function to determine if it needs upgrading. When we find
-// one we are interested in, we then upgrade all calls to reflect the new
-// function.
-void llvm::UpgradeCallsToIntrinsic(Function* F) {
+void llvm::UpgradeCallsToIntrinsic(Function *F) {
assert(F && "Illegal attempt to upgrade a non-existent intrinsic.");
- // Upgrade the function and check if it is a totaly new function.
+ // Check if this function should be upgraded and get the replacement function
+ // if there is one.
Function *NewFn;
if (UpgradeIntrinsicFunction(F, NewFn)) {
- // Replace all uses to the old function with the new one if necessary.
- for (Value::user_iterator UI = F->user_begin(), UE = F->user_end();
- UI != UE;) {
+ // Replace all users of the old function with the new function or new
+ // instructions. This is not a range loop because the call is deleted.
+ for (auto UI = F->user_begin(), UE = F->user_end(); UI != UE; )
if (CallInst *CI = dyn_cast<CallInst>(*UI++))
UpgradeIntrinsicCall(CI, NewFn);
- }
+
// Remove old function, no longer used, from the module.
F->eraseFromParent();
}
@@ -894,11 +1450,95 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return RetCode;
}
-void llvm::UpgradeMDStringConstant(std::string &String) {
- const std::string OldPrefix = "llvm.vectorizer.";
- if (String == "llvm.vectorizer.unroll") {
- String = "llvm.loop.interleave.count";
- } else if (String.find(OldPrefix) == 0) {
- String.replace(0, OldPrefix.size(), "llvm.loop.vectorize.");
+bool llvm::UpgradeModuleFlags(Module &M) {
+ const NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
+ if (!ModFlags)
+ return false;
+
+ bool HasObjCFlag = false, HasClassProperties = false;
+ for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
+ MDNode *Op = ModFlags->getOperand(I);
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
+ if (!ID)
+ continue;
+ if (ID->getString() == "Objective-C Image Info Version")
+ HasObjCFlag = true;
+ if (ID->getString() == "Objective-C Class Properties")
+ HasClassProperties = true;
}
+ // "Objective-C Class Properties" is recently added for Objective-C. We
+ // upgrade ObjC bitcodes to contain a "Objective-C Class Properties" module
+ // flag of value 0, so we can correclty report error when trying to link
+ // an ObjC bitcode without this module flag with an ObjC bitcode with this
+ // module flag.
+ if (HasObjCFlag && !HasClassProperties) {
+ M.addModuleFlag(llvm::Module::Error, "Objective-C Class Properties",
+ (uint32_t)0);
+ return true;
+ }
+ return false;
+}
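
A minimal sketch of how a bitcode reader might invoke this flag upgrade; the wrapper function and variable names are assumptions, only UpgradeModuleFlags itself comes from this patch.

#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/Module.h"

// Run after module-level metadata has been parsed. For older ObjC bitcode
// this adds {i32 1, !"Objective-C Class Properties", i32 0} with Error
// behavior so mismatched links are diagnosed.
static void upgradeObjCFlags(llvm::Module &M) {
  (void)llvm::UpgradeModuleFlags(M); // returns true if a flag was added
}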
+
+static bool isOldLoopArgument(Metadata *MD) {
+ auto *T = dyn_cast_or_null<MDTuple>(MD);
+ if (!T)
+ return false;
+ if (T->getNumOperands() < 1)
+ return false;
+ auto *S = dyn_cast_or_null<MDString>(T->getOperand(0));
+ if (!S)
+ return false;
+ return S->getString().startswith("llvm.vectorizer.");
+}
+
+static MDString *upgradeLoopTag(LLVMContext &C, StringRef OldTag) {
+ StringRef OldPrefix = "llvm.vectorizer.";
+ assert(OldTag.startswith(OldPrefix) && "Expected old prefix");
+
+ if (OldTag == "llvm.vectorizer.unroll")
+ return MDString::get(C, "llvm.loop.interleave.count");
+
+ return MDString::get(
+ C, (Twine("llvm.loop.vectorize.") + OldTag.drop_front(OldPrefix.size()))
+ .str());
+}
+
+static Metadata *upgradeLoopArgument(Metadata *MD) {
+ auto *T = dyn_cast_or_null<MDTuple>(MD);
+ if (!T)
+ return MD;
+ if (T->getNumOperands() < 1)
+ return MD;
+ auto *OldTag = dyn_cast_or_null<MDString>(T->getOperand(0));
+ if (!OldTag)
+ return MD;
+ if (!OldTag->getString().startswith("llvm.vectorizer."))
+ return MD;
+
+ // This has an old tag. Upgrade it.
+ SmallVector<Metadata *, 8> Ops;
+ Ops.reserve(T->getNumOperands());
+ Ops.push_back(upgradeLoopTag(T->getContext(), OldTag->getString()));
+ for (unsigned I = 1, E = T->getNumOperands(); I != E; ++I)
+ Ops.push_back(T->getOperand(I));
+
+ return MDTuple::get(T->getContext(), Ops);
+}
+
+MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) {
+ auto *T = dyn_cast<MDTuple>(&N);
+ if (!T)
+ return &N;
+
+ if (!llvm::any_of(T->operands(), isOldLoopArgument))
+ return &N;
+
+ SmallVector<Metadata *, 8> Ops;
+ Ops.reserve(T->getNumOperands());
+ for (Metadata *MD : T->operands())
+ Ops.push_back(upgradeLoopArgument(MD));
+
+ return MDTuple::get(T->getContext(), Ops);
}
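
The helpers above rewrite old "llvm.vectorizer.*" loop tags into the "llvm.loop.vectorize.*" namespace, with "llvm.vectorizer.unroll" mapped to "llvm.loop.interleave.count". A rough sketch of applying the upgrade to an instruction's !llvm.loop attachment follows; the wrapper function and the instruction pointer I are assumptions for illustration.

#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"

// No-op when the attachment already uses the new tag names.
static void upgradeLoopAttachment(llvm::Instruction *I) {
  if (llvm::MDNode *LoopMD = I->getMetadata("llvm.loop"))
    I->setMetadata("llvm.loop",
                   llvm::upgradeInstructionLoopAttachment(*LoopMD));
}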
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index f61276fd436b..4640b4f9d413 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -162,6 +162,21 @@ CallInst *BasicBlock::getTerminatingMustTailCall() {
return nullptr;
}
+CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
+ if (InstList.empty())
+ return nullptr;
+ auto *RI = dyn_cast<ReturnInst>(&InstList.back());
+ if (!RI || RI == &InstList.front())
+ return nullptr;
+
+ if (auto *CI = dyn_cast_or_null<CallInst>(RI->getPrevNode()))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize)
+ return CI;
+
+ return nullptr;
+}
+
Instruction* BasicBlock::getFirstNonPHI() {
for (Instruction &I : *this)
if (!isa<PHINode>(I))
@@ -202,8 +217,8 @@ BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
}
void BasicBlock::dropAllReferences() {
- for(iterator I = begin(), E = end(); I != E; ++I)
- I->dropAllReferences();
+ for (Instruction &I : *this)
+ I.dropAllReferences();
}
/// If this basic block has a single predecessor block,
@@ -361,10 +376,8 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
assert(I != InstList.end() &&
"Trying to get me to create degenerate basic block!");
- BasicBlock *InsertBefore = std::next(Function::iterator(this))
- .getNodePtrUnchecked();
- BasicBlock *New = BasicBlock::Create(getContext(), BBName,
- getParent(), InsertBefore);
+ BasicBlock *New = BasicBlock::Create(getContext(), BBName, getParent(),
+ this->getNextNode());
// Save DebugLoc of split point before invalidating iterator.
DebugLoc Loc = I->getDebugLoc();
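
The new BasicBlock::getTerminatingDeoptimizeCall above matches blocks that call @llvm.experimental.deoptimize and immediately return its result. A brief usage sketch; the helper function and its cold-block interpretation are assumptions, not part of this change.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

// Returns true if BB ends in `ret (... @llvm.experimental.deoptimize(...))`;
// such blocks are typically treated as cold by heuristics.
static bool endsInDeoptimize(llvm::BasicBlock &BB) {
  return BB.getTerminatingDeoptimizeCall() != nullptr;
}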
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index 40b4ec65e22b..07cec97084ee 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -37,12 +37,14 @@ add_llvm_library(LLVMCore
Mangler.cpp
Metadata.cpp
Module.cpp
+ ModuleSummaryIndex.cpp
Operator.cpp
+ OptBisect.cpp
Pass.cpp
PassManager.cpp
PassRegistry.cpp
+ ProfileSummary.cpp
Statepoint.cpp
- FunctionInfo.cpp
Type.cpp
TypeFinder.cpp
Use.cpp
@@ -56,12 +58,4 @@ add_llvm_library(LLVMCore
${LLVM_MAIN_INCLUDE_DIR}/llvm/IR
)
-# PR24785: Workaround for hanging compilation.
-if( MSVC_VERSION EQUAL 1800)
- set_property(
- SOURCE Function.cpp
- PROPERTY COMPILE_FLAGS "/Og-"
- )
-endif()
-
add_dependencies(LLVMCore intrinsics_gen)
diff --git a/lib/IR/Comdat.cpp b/lib/IR/Comdat.cpp
index 80715ff40ba9..fc1b48d1c190 100644
--- a/lib/IR/Comdat.cpp
+++ b/lib/IR/Comdat.cpp
@@ -15,9 +15,6 @@
#include "llvm/ADT/StringMap.h"
using namespace llvm;
-Comdat::Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name)
- : Name(Name), SK(SK) {}
-
Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {}
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index ce3fe03e2df7..c06a99c05363 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -28,11 +28,9 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
-#include <limits>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -40,9 +38,9 @@ using namespace llvm::PatternMatch;
// ConstantFold*Instruction Implementations
//===----------------------------------------------------------------------===//
-/// BitCastConstantVector - Convert the specified vector Constant node to the
-/// specified vector type. At this point, we know that the elements of the
-/// input vector constant are all simple integer or FP values.
+/// Convert the specified vector Constant node to the specified vector type.
+/// At this point, we know that the elements of the input vector constant are
+/// all simple integer or FP values.
static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
@@ -54,7 +52,7 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
unsigned NumElts = DstTy->getNumElements();
if (NumElts != CV->getType()->getVectorNumElements())
return nullptr;
-
+
Type *DstEltTy = DstTy->getElementType();
SmallVector<Constant*, 16> Result;
@@ -69,7 +67,7 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
return ConstantVector::get(Result);
}
-/// This function determines which opcode to use to fold two constant cast
+/// This function determines which opcode to use to fold two constant cast
/// expressions together. It uses CastInst::isEliminableCastPair to determine
/// the opcode. Consequently, it's just a wrapper around that function.
/// @brief Determine if it is valid to fold a cast of a cast
@@ -120,7 +118,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
if (STy->getNumElements() == 0) break;
ElTy = STy->getElementType(0);
IdxList.push_back(Zero);
- } else if (SequentialType *STy =
+ } else if (SequentialType *STy =
dyn_cast<SequentialType>(ElTy)) {
if (ElTy->isPointerTy()) break; // Can't index into pointers!
ElTy = STy->getElementType();
@@ -136,7 +134,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
V, IdxList);
}
- // Handle casts from one vector constant to another. We know that the src
+ // Handle casts from one vector constant to another. We know that the src
// and dest type have the same size (otherwise it's an illegal cast).
if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
@@ -191,6 +189,10 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
if (FP->getType()->isPPC_FP128Ty())
return nullptr;
+ // Make sure dest type is compatible with the folded integer constant.
+ if (!DestTy->isIntegerTy())
+ return nullptr;
+
return ConstantInt::get(FP->getContext(),
FP->getValueAPF().bitcastToAPInt());
}
@@ -199,15 +201,14 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
}
-/// ExtractConstantBytes - V is an integer constant which only has a subset of
-/// its bytes used. The bytes used are indicated by ByteStart (which is the
-/// first byte used, counting from the least significant byte) and ByteSize,
-/// which is the number of bytes used.
+/// V is an integer constant which only has a subset of its bytes used.
+/// The bytes used are indicated by ByteStart (which is the first byte used,
+/// counting from the least significant byte) and ByteSize, which is the number
+/// of bytes used.
///
/// This function analyzes the specified constant to see if the specified byte
/// range can be returned as a simplified constant. If so, the constant is
/// returned, otherwise null is returned.
-///
static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
unsigned ByteSize) {
assert(C->getType()->isIntegerTy() &&
@@ -217,7 +218,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
assert(ByteSize && "Must be accessing some piece");
assert(ByteStart+ByteSize <= CSize && "Extracting invalid piece from input");
assert(ByteSize != CSize && "Should not extract everything");
-
+
// Constant Integers are simple.
if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
APInt V = CI->getValue();
@@ -226,7 +227,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
V = V.trunc(ByteSize*8);
return ConstantInt::get(CI->getContext(), V);
}
-
+
// If the input is a constant expr, we might be able to recursively simplify.
// If not, we definitely can't do anything.
ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
@@ -238,12 +239,12 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
if (!RHS)
return nullptr;
-
+
// X | -1 -> -1.
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS))
if (RHSC->isAllOnesValue())
return RHSC;
-
+
Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
if (!LHS)
return nullptr;
@@ -253,11 +254,11 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
if (!RHS)
return nullptr;
-
+
// X & 0 -> 0.
if (RHS->isNullValue())
return RHS;
-
+
Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
if (!LHS)
return nullptr;
@@ -272,7 +273,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
if ((ShAmt & 7) != 0)
return nullptr;
ShAmt >>= 3;
-
+
// If the extract is known to be all zeros, return zero.
if (ByteStart >= CSize-ShAmt)
return Constant::getNullValue(IntegerType::get(CE->getContext(),
@@ -280,11 +281,11 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
// If the extract is known to be fully in the input, extract it.
if (ByteStart+ByteSize+ShAmt <= CSize)
return ExtractConstantBytes(CE->getOperand(0), ByteStart+ShAmt, ByteSize);
-
+
// TODO: Handle the 'partially zero' case.
return nullptr;
}
-
+
case Instruction::Shl: {
ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
if (!Amt)
@@ -294,7 +295,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
if ((ShAmt & 7) != 0)
return nullptr;
ShAmt >>= 3;
-
+
// If the extract is known to be all zeros, return zero.
if (ByteStart+ByteSize <= ShAmt)
return Constant::getNullValue(IntegerType::get(CE->getContext(),
@@ -302,15 +303,15 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
// If the extract is known to be fully in the input, extract it.
if (ByteStart >= ShAmt)
return ExtractConstantBytes(CE->getOperand(0), ByteStart-ShAmt, ByteSize);
-
+
// TODO: Handle the 'partially zero' case.
return nullptr;
}
-
+
case Instruction::ZExt: {
unsigned SrcBitSize =
cast<IntegerType>(CE->getOperand(0)->getType())->getBitWidth();
-
+
// If extracting something that is completely zero, return 0.
if (ByteStart*8 >= SrcBitSize)
return Constant::getNullValue(IntegerType::get(CE->getContext(),
@@ -319,35 +320,34 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
// If exactly extracting the input, return it.
if (ByteStart == 0 && ByteSize*8 == SrcBitSize)
return CE->getOperand(0);
-
+
// If extracting something completely in the input, and the input is a
// multiple of 8 bits, recurse.
if ((SrcBitSize&7) == 0 && (ByteStart+ByteSize)*8 <= SrcBitSize)
return ExtractConstantBytes(CE->getOperand(0), ByteStart, ByteSize);
-
+
// Otherwise, if extracting a subset of the input, which is not a multiple of
// 8 bits, do a shift and trunc to get the bits.
if ((ByteStart+ByteSize)*8 < SrcBitSize) {
assert((SrcBitSize&7) && "Shouldn't get byte sized case here");
Constant *Res = CE->getOperand(0);
if (ByteStart)
- Res = ConstantExpr::getLShr(Res,
+ Res = ConstantExpr::getLShr(Res,
ConstantInt::get(Res->getType(), ByteStart*8));
return ConstantExpr::getTrunc(Res, IntegerType::get(C->getContext(),
ByteSize*8));
}
-
+
// TODO: Handle the 'partially zero' case.
return nullptr;
}
}
}
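
A worked example of the byte extraction implemented above, with assumed values: taking ByteStart = 1, ByteSize = 2 out of the i32 constant 0xAABBCCDD (bytes counted from the least significant end) yields the i16 value 0xBBCC. The standalone APInt sketch below simply replays that arithmetic.

#include "llvm/ADT/APInt.h"

// Bytes [1, 3) of 0xAABBCCDD, counted from the least significant byte.
static llvm::APInt extractBytesExample() {
  llvm::APInt V(32, 0xAABBCCDDu);
  V = V.lshr(1 * 8);      // 0x00AABBCC
  return V.trunc(2 * 8);  // i16 value 0xBBCC
}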
-/// getFoldedSizeOf - Return a ConstantExpr with type DestTy for sizeof
-/// on Ty, with any known factors factored out. If Folded is false,
-/// return null if no factoring was possible, to avoid endlessly
-/// bouncing an unfoldable expression back into the top-level folder.
-///
+/// Return a ConstantExpr with type DestTy for sizeof on Ty, with any known
+/// factors factored out. If Folded is false, return null if no factoring was
+/// possible, to avoid endlessly bouncing an unfoldable expression back into the
+/// top-level folder.
static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
bool Folded) {
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
@@ -400,11 +400,10 @@ static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
return C;
}
-/// getFoldedAlignOf - Return a ConstantExpr with type DestTy for alignof
-/// on Ty, with any known factors factored out. If Folded is false,
-/// return null if no factoring was possible, to avoid endlessly
-/// bouncing an unfoldable expression back into the top-level folder.
-///
+/// Return a ConstantExpr with type DestTy for alignof on Ty, with any known
+/// factors factored out. If Folded is false, return null if no factoring was
+/// possible, to avoid endlessly bouncing an unfoldable expression back into the
+/// top-level folder.
static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
bool Folded) {
// The alignment of an array is equal to the alignment of the
@@ -466,11 +465,10 @@ static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
return C;
}
-/// getFoldedOffsetOf - Return a ConstantExpr with type DestTy for offsetof
-/// on Ty and FieldNo, with any known factors factored out. If Folded is false,
-/// return null if no factoring was possible, to avoid endlessly
-/// bouncing an unfoldable expression back into the top-level folder.
-///
+/// Return a ConstantExpr with type DestTy for offsetof on Ty and FieldNo, with
+/// any known factors factored out. If Folded is false, return null if no
+/// factoring was possible, to avoid endlessly bouncing an unfoldable expression
+/// back into the top-level folder.
static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo,
Type *DestTy,
bool Folded) {
@@ -533,7 +531,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return UndefValue::get(DestTy);
}
- if (V->isNullValue() && !DestTy->isX86_MMXTy())
+ if (V->isNullValue() && !DestTy->isX86_MMXTy() &&
+ opc != Instruction::AddrSpaceCast)
return Constant::getNullValue(DestTy);
// If the cast operand is a constant expression, there's a few things we can
@@ -600,12 +599,12 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return ConstantFP::get(V->getContext(), Val);
}
return nullptr; // Can't fold.
- case Instruction::FPToUI:
+ case Instruction::FPToUI:
case Instruction::FPToSI:
if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
const APFloat &V = FPC->getValueAPF();
bool ignored;
- uint64_t x[2];
+ uint64_t x[2];
uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
if (APFloat::opInvalidOp ==
V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
@@ -669,7 +668,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
case Instruction::UIToFP:
case Instruction::SIToFP:
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- APInt api = CI->getValue();
+ const APInt &api = CI->getValue();
APFloat apf(DestTy->getFltSemantics(),
APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
if (APFloat::opOverflow &
@@ -705,7 +704,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return ConstantInt::get(V->getContext(),
CI->getValue().trunc(DestBitWidth));
}
-
+
// The input must be a constantexpr. See if we can simplify this based on
// the bytes we are demanding. Only do this if the source and dest are an
// even multiple of a byte.
@@ -713,7 +712,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
(cast<IntegerType>(V->getType())->getBitWidth() & 7) == 0)
if (Constant *Res = ExtractConstantBytes(V, 0, DestBitWidth / 8))
return Res;
-
+
return nullptr;
}
case Instruction::BitCast:
@@ -750,7 +749,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
}
Result.push_back(V);
}
-
+
// If we were able to build the vector, return it.
if (Result.size() == V1->getType()->getVectorNumElements())
return ConstantVector::get(Result);
@@ -814,12 +813,12 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
Result.reserve(NumElts);
auto *Ty = Type::getInt32Ty(Val->getContext());
uint64_t IdxVal = CIdx->getZExtValue();
- for (unsigned i = 0; i != NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
if (i == IdxVal) {
Result.push_back(Elt);
continue;
}
-
+
Constant *C = ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i));
Result.push_back(C);
}
@@ -839,7 +838,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
// Don't break the bitcode reader hack.
if (isa<ConstantExpr>(Mask)) return nullptr;
-
+
unsigned SrcNumElts = V1->getType()->getVectorNumElements();
// Loop over the shuffle mask, evaluating each element.
@@ -902,10 +901,10 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
if (Idxs[0] == i)
C = ConstantFoldInsertValueInstruction(C, Val, Idxs.slice(1));
-
+
Result.push_back(C);
}
-
+
if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
return ConstantStruct::get(ST, Result);
if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType()))
@@ -916,9 +915,11 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
Constant *C1, Constant *C2) {
+ assert(Instruction::isBinaryOp(Opcode) && "Non-binary instruction detected");
+
// Handle UndefValue up front.
if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
- switch (Opcode) {
+ switch (static_cast<Instruction::BinaryOps>(Opcode)) {
case Instruction::Xor:
if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
// Handle undef ^ undef -> 0 special case. This is a common
@@ -948,7 +949,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
case Instruction::SDiv:
case Instruction::UDiv:
// X / undef -> undef
- if (match(C1, m_Zero()))
+ if (isa<UndefValue>(C2))
return C2;
// undef / 0 -> undef
// undef / 1 -> undef
@@ -998,9 +999,22 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
return C1;
// undef << X -> 0
return Constant::getNullValue(C1->getType());
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ // TODO: UNDEF handling for binary float instructions.
+ return nullptr;
+ case Instruction::BinaryOpsEnd:
+ llvm_unreachable("Invalid BinaryOp");
}
}
+ // At this point neither constant should be an UndefValue.
+ assert(!isa<UndefValue>(C1) && !isa<UndefValue>(C2) &&
+ "Unexpected UndefValue");
+
// Handle simplifications when the RHS is a constant int.
if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
switch (Opcode) {
@@ -1046,7 +1060,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
}
// If and'ing the address of a global with a constant, fold it.
- if (CE1->getOpcode() == Instruction::PtrToInt &&
+ if (CE1->getOpcode() == Instruction::PtrToInt &&
isa<GlobalValue>(CE1->getOperand(0))) {
GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));
@@ -1102,7 +1116,6 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
return ConstantExpr::get(Opcode, C2, C1);
}
- // At this point we know neither constant is an UndefValue.
if (ConstantInt *CI1 = dyn_cast<ConstantInt>(C1)) {
if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
const APInt &C1V = CI1->getValue();
@@ -1110,11 +1123,11 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
switch (Opcode) {
default:
break;
- case Instruction::Add:
+ case Instruction::Add:
return ConstantInt::get(CI1->getContext(), C1V + C2V);
- case Instruction::Sub:
+ case Instruction::Sub:
return ConstantInt::get(CI1->getContext(), C1V - C2V);
- case Instruction::Mul:
+ case Instruction::Mul:
return ConstantInt::get(CI1->getContext(), C1V * C2V);
case Instruction::UDiv:
assert(!CI2->isNullValue() && "Div by zero handled above");
@@ -1168,11 +1181,11 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
}
} else if (ConstantFP *CFP1 = dyn_cast<ConstantFP>(C1)) {
if (ConstantFP *CFP2 = dyn_cast<ConstantFP>(C2)) {
- APFloat C1V = CFP1->getValueAPF();
- APFloat C2V = CFP2->getValueAPF();
+ const APFloat &C1V = CFP1->getValueAPF();
+ const APFloat &C2V = CFP2->getValueAPF();
APFloat C3V = C1V; // copy for modification
switch (Opcode) {
- default:
+ default:
break;
case Instruction::FAdd:
(void)C3V.add(C2V, APFloat::rmNearestTiesToEven);
@@ -1200,10 +1213,10 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
Constant *RHS =
ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
-
+
Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
}
-
+
return ConstantVector::get(Result);
}
@@ -1259,8 +1272,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
return nullptr;
}
-/// isZeroSizedType - This type is zero sized if its an array or structure of
-/// zero sized types. The only leaf zero sized type is an empty structure.
+/// This type is zero-sized if it's an array or structure of zero-sized types.
+/// The only leaf zero-sized type is an empty structure.
static bool isMaybeZeroSizedType(Type *Ty) {
if (StructType *STy = dyn_cast<StructType>(Ty)) {
if (STy->isOpaque()) return true; // Can't say.
@@ -1276,8 +1289,8 @@ static bool isMaybeZeroSizedType(Type *Ty) {
return false;
}
-/// IdxCompare - Compare the two constants as though they were getelementptr
-/// indices. This allows coercion of the types to be the same thing.
+/// Compare the two constants as though they were getelementptr indices.
+/// This allows coercion of the types to be the same thing.
///
/// If the two constants are the "same" (after coercion), return 0. If the
/// first is less than the second, return -1, if the second is less than the
@@ -1316,11 +1329,11 @@ static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
return 1;
}
-/// evaluateFCmpRelation - This function determines if there is anything we can
-/// decide about the two constants provided. This doesn't need to handle simple
-/// things like ConstantFP comparisons, but should instead handle ConstantExprs.
-/// If we can determine that the two constants have a particular relation to
-/// each other, we should return the corresponding FCmpInst predicate,
+/// This function determines if there is anything we can decide about the two
+/// constants provided. This doesn't need to handle simple things like
+/// ConstantFP comparisons, but should instead handle ConstantExprs.
+/// If we can determine that the two constants have a particular relation to
+/// each other, we should return the corresponding FCmpInst predicate,
/// otherwise return FCmpInst::BAD_FCMP_PREDICATE. This is used below in
/// ConstantFoldCompareInstruction.
///
@@ -1340,15 +1353,15 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
ConstantInt *R = nullptr;
R = dyn_cast<ConstantInt>(
ConstantExpr::getFCmp(FCmpInst::FCMP_OEQ, V1, V2));
- if (R && !R->isZero())
+ if (R && !R->isZero())
return FCmpInst::FCMP_OEQ;
R = dyn_cast<ConstantInt>(
ConstantExpr::getFCmp(FCmpInst::FCMP_OLT, V1, V2));
- if (R && !R->isZero())
+ if (R && !R->isZero())
return FCmpInst::FCMP_OLT;
R = dyn_cast<ConstantInt>(
ConstantExpr::getFCmp(FCmpInst::FCMP_OGT, V1, V2));
- if (R && !R->isZero())
+ if (R && !R->isZero())
return FCmpInst::FCMP_OGT;
// Nothing more we can do
@@ -1403,12 +1416,12 @@ static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
return ICmpInst::BAD_ICMP_PREDICATE;
}
-/// evaluateICmpRelation - This function determines if there is anything we can
-/// decide about the two constants provided. This doesn't need to handle simple
-/// things like integer comparisons, but should instead handle ConstantExprs
-/// and GlobalValues. If we can determine that the two constants have a
-/// particular relation to each other, we should return the corresponding ICmp
-/// predicate, otherwise return ICmpInst::BAD_ICMP_PREDICATE.
+/// This function determines if there is anything we can decide about the two
+/// constants provided. This doesn't need to handle simple things like integer
+/// comparisons, but should instead handle ConstantExprs and GlobalValues.
+/// If we can determine that the two constants have a particular relation to
+/// each other, we should return the corresponding ICmp predicate, otherwise
+/// return ICmpInst::BAD_ICMP_PREDICATE.
///
/// To simplify this code we canonicalize the relation so that the first
/// operand is always the most "complex" of the two. We consider simple
@@ -1430,7 +1443,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
ConstantInt *R = nullptr;
ICmpInst::Predicate pred = ICmpInst::ICMP_EQ;
R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
- if (R && !R->isZero())
+ if (R && !R->isZero())
return pred;
pred = isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
@@ -1446,14 +1459,14 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
}
// If the first operand is simple, swap operands.
- ICmpInst::Predicate SwappedRelation =
+ ICmpInst::Predicate SwappedRelation =
evaluateICmpRelation(V2, V1, isSigned);
if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
return ICmpInst::getSwappedPredicate(SwappedRelation);
} else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
if (isa<ConstantExpr>(V2)) { // Swap as necessary.
- ICmpInst::Predicate SwappedRelation =
+ ICmpInst::Predicate SwappedRelation =
evaluateICmpRelation(V2, V1, isSigned);
if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
return ICmpInst::getSwappedPredicate(SwappedRelation);
@@ -1476,13 +1489,13 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
}
} else if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
if (isa<ConstantExpr>(V2)) { // Swap as necessary.
- ICmpInst::Predicate SwappedRelation =
+ ICmpInst::Predicate SwappedRelation =
evaluateICmpRelation(V2, V1, isSigned);
if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
return ICmpInst::getSwappedPredicate(SwappedRelation);
return ICmpInst::BAD_ICMP_PREDICATE;
}
-
+
// Now we know that the RHS is a GlobalValue, BlockAddress or simple
// constant (which, since the types must match, means that it is a
// ConstantPointerNull).
@@ -1517,6 +1530,10 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
case Instruction::BitCast:
case Instruction::ZExt:
case Instruction::SExt:
+ // We can't evaluate floating point casts or truncations.
+ if (CE1Op0->getType()->isFloatingPointTy())
+ break;
+
// If the cast is not actually changing bits, and the second operand is a
// null pointer, do the comparison with the pre-casted value.
if (V2->isNullValue() &&
@@ -1524,7 +1541,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
return evaluateICmpRelation(CE1Op0,
- Constant::getNullValue(CE1Op0->getType()),
+ Constant::getNullValue(CE1Op0->getType()),
isSigned);
}
break;
@@ -1541,7 +1558,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
// Weak linkage GVals could be zero or not. We're comparing that
// to a null pointer so it's greater-or-equal
return isSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
- else
+ else
// If it's not weak linkage, the GVal must have a non-zero address
// so the result is greater-than
return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
@@ -1656,7 +1673,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
return ICmpInst::BAD_ICMP_PREDICATE;
}
-Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
+Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
Constant *C1, Constant *C2) {
Type *ResultTy;
if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
@@ -1729,8 +1746,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
}
if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
- APInt V1 = cast<ConstantInt>(C1)->getValue();
- APInt V2 = cast<ConstantInt>(C2)->getValue();
+ const APInt &V1 = cast<ConstantInt>(C1)->getValue();
+ const APInt &V2 = cast<ConstantInt>(C2)->getValue();
switch (pred) {
default: llvm_unreachable("Invalid ICmp Predicate");
case ICmpInst::ICMP_EQ: return ConstantInt::get(ResultTy, V1 == V2);
@@ -1745,8 +1762,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
case ICmpInst::ICMP_UGE: return ConstantInt::get(ResultTy, V1.uge(V2));
}
} else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
- APFloat C1V = cast<ConstantFP>(C1)->getValueAPF();
- APFloat C2V = cast<ConstantFP>(C2)->getValueAPF();
+ const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF();
+ const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF();
APFloat::cmpResult R = C1V.compare(C2V);
switch (pred) {
default: llvm_unreachable("Invalid FCmp Predicate");
@@ -1759,17 +1776,17 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
case FCmpInst::FCMP_UEQ:
return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
R==APFloat::cmpEqual);
- case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_OEQ:
return ConstantInt::get(ResultTy, R==APFloat::cmpEqual);
case FCmpInst::FCMP_UNE:
return ConstantInt::get(ResultTy, R!=APFloat::cmpEqual);
- case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_ONE:
return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan ||
R==APFloat::cmpGreaterThan);
- case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_ULT:
return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
R==APFloat::cmpLessThan);
- case FCmpInst::FCMP_OLT:
+ case FCmpInst::FCMP_OLT:
return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan);
case FCmpInst::FCMP_UGT:
return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
@@ -1778,12 +1795,12 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan);
case FCmpInst::FCMP_ULE:
return ConstantInt::get(ResultTy, R!=APFloat::cmpGreaterThan);
- case FCmpInst::FCMP_OLE:
+ case FCmpInst::FCMP_OLE:
return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan ||
R==APFloat::cmpEqual);
case FCmpInst::FCMP_UGE:
return ConstantInt::get(ResultTy, R!=APFloat::cmpLessThan);
- case FCmpInst::FCMP_OGE:
+ case FCmpInst::FCMP_OGE:
return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan ||
R==APFloat::cmpEqual);
}
@@ -1798,10 +1815,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
Constant *C2E =
ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
-
+
ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E));
}
-
+
return ConstantVector::get(ResElts);
}
@@ -1841,23 +1858,23 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
break;
case FCmpInst::FCMP_OLE: // We know that C1 <= C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
Result = 0;
- else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
Result = 1;
break;
   case FCmpInst::FCMP_OGE: // We know that C1 >= C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
Result = 0;
- else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
Result = 1;
break;
case FCmpInst::FCMP_ONE: // We know that C1 != C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ)
+ if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ)
Result = 0;
- else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE)
+ else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE)
Result = 1;
break;
}
@@ -1979,8 +1996,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
return nullptr;
}
-/// isInBoundsIndices - Test whether the given sequence of *normalized* indices
-/// is "inbounds".
+/// Test whether the given sequence of *normalized* indices is "inbounds".
template<typename IndexTy>
static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
// No indices means nothing that could be out of bounds.
@@ -1999,7 +2015,7 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
return true;
}
-/// \brief Test whether a given ConstantInt is in-range for a SequentialType.
+/// Test whether a given ConstantInt is in-range for a SequentialType.
static bool isIndexInRangeOfSequentialType(SequentialType *STy,
const ConstantInt *CI) {
// And indices are valid when indexing along a pointer
@@ -2040,11 +2056,13 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
return C;
if (isa<UndefValue>(C)) {
- PointerType *Ptr = cast<PointerType>(C->getType());
- Type *Ty = GetElementPtrInst::getIndexedType(
- cast<PointerType>(Ptr->getScalarType())->getElementType(), Idxs);
+ PointerType *PtrTy = cast<PointerType>(C->getType()->getScalarType());
+ Type *Ty = GetElementPtrInst::getIndexedType(PointeeTy, Idxs);
assert(Ty && "Invalid indices for GEP!");
- return UndefValue::get(PointerType::get(Ty, Ptr->getAddressSpace()));
+ Type *GEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
+ if (VectorType *VT = dyn_cast<VectorType>(C->getType()))
+ GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+ return UndefValue::get(GEPTy);
}
if (C->isNullValue()) {
@@ -2055,12 +2073,14 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
break;
}
if (isNull) {
- PointerType *Ptr = cast<PointerType>(C->getType());
- Type *Ty = GetElementPtrInst::getIndexedType(
- cast<PointerType>(Ptr->getScalarType())->getElementType(), Idxs);
+ PointerType *PtrTy = cast<PointerType>(C->getType()->getScalarType());
+ Type *Ty = GetElementPtrInst::getIndexedType(PointeeTy, Idxs);
+
assert(Ty && "Invalid indices for GEP!");
- return ConstantPointerNull::get(PointerType::get(Ty,
- Ptr->getAddressSpace()));
+ Type *GEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
+ if (VectorType *VT = dyn_cast<VectorType>(C->getType()))
+ GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+ return Constant::getNullValue(GEPTy);
}
}
@@ -2172,52 +2192,68 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
bool Unknown = !isa<ConstantInt>(Idxs[0]);
for (unsigned i = 1, e = Idxs.size(); i != e;
Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
- if (isa<ArrayType>(Ty) || isa<VectorType>(Ty))
- if (CI->getSExtValue() > 0 &&
- !isIndexInRangeOfSequentialType(cast<SequentialType>(Ty), CI)) {
- if (isa<SequentialType>(Prev)) {
- // It's out of range, but we can factor it into the prior
- // dimension.
- NewIdxs.resize(Idxs.size());
- uint64_t NumElements = 0;
- if (auto *ATy = dyn_cast<ArrayType>(Ty))
- NumElements = ATy->getNumElements();
- else
- NumElements = cast<VectorType>(Ty)->getNumElements();
-
- ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
- NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
-
- Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
- Constant *Div = ConstantExpr::getSDiv(CI, Factor);
-
- unsigned CommonExtendedWidth =
- std::max(PrevIdx->getType()->getIntegerBitWidth(),
- Div->getType()->getIntegerBitWidth());
- CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);
-
- // Before adding, extend both operands to i64 to avoid
- // overflow trouble.
- if (!PrevIdx->getType()->isIntegerTy(CommonExtendedWidth))
- PrevIdx = ConstantExpr::getSExt(
- PrevIdx,
- Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
- if (!Div->getType()->isIntegerTy(CommonExtendedWidth))
- Div = ConstantExpr::getSExt(
- Div, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
-
- NewIdxs[i-1] = ConstantExpr::getAdd(PrevIdx, Div);
- } else {
- // It's out of range, but the prior dimension is a struct
- // so we can't do anything about it.
- Unknown = true;
- }
- }
- } else {
+ auto *CI = dyn_cast<ConstantInt>(Idxs[i]);
+ if (!CI) {
// We don't know if it's in range or not.
Unknown = true;
+ continue;
+ }
+ if (isa<StructType>(Ty)) {
+      // The verifier makes sure that GEPs into a struct are in range.
+ continue;
}
+ auto *STy = cast<SequentialType>(Ty);
+ if (isa<PointerType>(STy)) {
+ // We don't know if it's in range or not.
+ Unknown = true;
+ continue;
+ }
+ if (isa<VectorType>(STy)) {
+      // There can be awkward padding after a non-power-of-two vector.
+ Unknown = true;
+ continue;
+ }
+ if (isIndexInRangeOfSequentialType(STy, CI))
+ // It's in range, skip to the next index.
+ continue;
+ if (!isa<SequentialType>(Prev)) {
+ // It's out of range, but the prior dimension is a struct
+ // so we can't do anything about it.
+ Unknown = true;
+ continue;
+ }
+ if (CI->getSExtValue() < 0) {
+ // It's out of range and negative, don't try to factor it.
+ Unknown = true;
+ continue;
+ }
+ // It's out of range, but we can factor it into the prior
+ // dimension.
+ NewIdxs.resize(Idxs.size());
+ // Determine the number of elements in our sequential type.
+ uint64_t NumElements = STy->getArrayNumElements();
+
+ ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
+ NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
+
+ Constant *PrevIdx = cast<Constant>(Idxs[i - 1]);
+ Constant *Div = ConstantExpr::getSDiv(CI, Factor);
+
+ unsigned CommonExtendedWidth =
+ std::max(PrevIdx->getType()->getIntegerBitWidth(),
+ Div->getType()->getIntegerBitWidth());
+ CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);
+
+    // Before adding, extend both operands to at least 64 bits to avoid
+    // overflow trouble.
+ if (!PrevIdx->getType()->isIntegerTy(CommonExtendedWidth))
+ PrevIdx = ConstantExpr::getSExt(
+ PrevIdx, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
+ if (!Div->getType()->isIntegerTy(CommonExtendedWidth))
+ Div = ConstantExpr::getSExt(
+ Div, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
+
+ NewIdxs[i - 1] = ConstantExpr::getAdd(PrevIdx, Div);
}
// If we did any factoring, start over with the adjusted indices.
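
For reference, the factoring performed by this loop amounts to a divmod on the out-of-range index; a sketch of the arithmetic, assuming a 4-element inner array and constant indices {1, 9}:

    // Not part of the patch, just the normalization it performs for a GEP
    // into [4 x i32] with indices {1, 9} (9 is out of range).
    int64_t NumElements = 4;
    int64_t OuterIdx = 1, InnerIdx = 9;
    int64_t NewInner = InnerIdx % NumElements;             // getSRem: 1
    int64_t NewOuter = OuterIdx + InnerIdx / NumElements;  // getSDiv + getAdd: 3
    // The GEP is rebuilt with indices {3, 1}, which address the same element.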
@@ -2237,22 +2273,6 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
return nullptr;
}
-Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
- bool inBounds,
- ArrayRef<Constant *> Idxs) {
- return ConstantFoldGetElementPtrImpl(
- cast<PointerType>(C->getType()->getScalarType())->getElementType(), C,
- inBounds, Idxs);
-}
-
-Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
- bool inBounds,
- ArrayRef<Value *> Idxs) {
- return ConstantFoldGetElementPtrImpl(
- cast<PointerType>(C->getType()->getScalarType())->getElementType(), C,
- inBounds, Idxs);
-}
-
Constant *llvm::ConstantFoldGetElementPtr(Type *Ty, Constant *C,
bool inBounds,
ArrayRef<Constant *> Idxs) {
diff --git a/lib/IR/ConstantFold.h b/lib/IR/ConstantFold.h
index 42a9c6ba908a..9b0a937c84df 100644
--- a/lib/IR/ConstantFold.h
+++ b/lib/IR/ConstantFold.h
@@ -19,9 +19,8 @@
#ifndef LLVM_LIB_IR_CONSTANTFOLD_H
#define LLVM_LIB_IR_CONSTANTFOLD_H
-#include "llvm/ADT/ArrayRef.h"
-
namespace llvm {
+template <typename T> class ArrayRef;
class Value;
class Constant;
class Type;
@@ -45,12 +44,8 @@ namespace llvm {
ArrayRef<unsigned> Idxs);
Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1,
Constant *V2);
- Constant *ConstantFoldCompareInstruction(unsigned short predicate,
+ Constant *ConstantFoldCompareInstruction(unsigned short predicate,
Constant *C1, Constant *C2);
- Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
- ArrayRef<Constant *> Idxs);
- Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
- ArrayRef<Value *> Idxs);
Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool inBounds,
ArrayRef<Constant *> Idxs);
Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool inBounds,
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index 48f9b27a25ae..0f5c7128f3d0 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -60,60 +60,60 @@ ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred,
switch (Pred) {
default:
llvm_unreachable("Invalid ICmp predicate to makeAllowedICmpRegion()");
- case CmpInst::ICMP_EQ:
- return CR;
- case CmpInst::ICMP_NE:
- if (CR.isSingleElement())
- return ConstantRange(CR.getUpper(), CR.getLower());
+ case CmpInst::ICMP_EQ:
+ return CR;
+ case CmpInst::ICMP_NE:
+ if (CR.isSingleElement())
+ return ConstantRange(CR.getUpper(), CR.getLower());
+ return ConstantRange(W);
+ case CmpInst::ICMP_ULT: {
+ APInt UMax(CR.getUnsignedMax());
+ if (UMax.isMinValue())
+ return ConstantRange(W, /* empty */ false);
+ return ConstantRange(APInt::getMinValue(W), UMax);
+ }
+ case CmpInst::ICMP_SLT: {
+ APInt SMax(CR.getSignedMax());
+ if (SMax.isMinSignedValue())
+ return ConstantRange(W, /* empty */ false);
+ return ConstantRange(APInt::getSignedMinValue(W), SMax);
+ }
+ case CmpInst::ICMP_ULE: {
+ APInt UMax(CR.getUnsignedMax());
+ if (UMax.isMaxValue())
return ConstantRange(W);
- case CmpInst::ICMP_ULT: {
- APInt UMax(CR.getUnsignedMax());
- if (UMax.isMinValue())
- return ConstantRange(W, /* empty */ false);
- return ConstantRange(APInt::getMinValue(W), UMax);
- }
- case CmpInst::ICMP_SLT: {
- APInt SMax(CR.getSignedMax());
- if (SMax.isMinSignedValue())
- return ConstantRange(W, /* empty */ false);
- return ConstantRange(APInt::getSignedMinValue(W), SMax);
- }
- case CmpInst::ICMP_ULE: {
- APInt UMax(CR.getUnsignedMax());
- if (UMax.isMaxValue())
- return ConstantRange(W);
- return ConstantRange(APInt::getMinValue(W), UMax + 1);
- }
- case CmpInst::ICMP_SLE: {
- APInt SMax(CR.getSignedMax());
- if (SMax.isMaxSignedValue())
- return ConstantRange(W);
- return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
- }
- case CmpInst::ICMP_UGT: {
- APInt UMin(CR.getUnsignedMin());
- if (UMin.isMaxValue())
- return ConstantRange(W, /* empty */ false);
- return ConstantRange(UMin + 1, APInt::getNullValue(W));
- }
- case CmpInst::ICMP_SGT: {
- APInt SMin(CR.getSignedMin());
- if (SMin.isMaxSignedValue())
- return ConstantRange(W, /* empty */ false);
- return ConstantRange(SMin + 1, APInt::getSignedMinValue(W));
- }
- case CmpInst::ICMP_UGE: {
- APInt UMin(CR.getUnsignedMin());
- if (UMin.isMinValue())
- return ConstantRange(W);
- return ConstantRange(UMin, APInt::getNullValue(W));
- }
- case CmpInst::ICMP_SGE: {
- APInt SMin(CR.getSignedMin());
- if (SMin.isMinSignedValue())
- return ConstantRange(W);
- return ConstantRange(SMin, APInt::getSignedMinValue(W));
- }
+ return ConstantRange(APInt::getMinValue(W), UMax + 1);
+ }
+ case CmpInst::ICMP_SLE: {
+ APInt SMax(CR.getSignedMax());
+ if (SMax.isMaxSignedValue())
+ return ConstantRange(W);
+ return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
+ }
+ case CmpInst::ICMP_UGT: {
+ APInt UMin(CR.getUnsignedMin());
+ if (UMin.isMaxValue())
+ return ConstantRange(W, /* empty */ false);
+ return ConstantRange(UMin + 1, APInt::getNullValue(W));
+ }
+ case CmpInst::ICMP_SGT: {
+ APInt SMin(CR.getSignedMin());
+ if (SMin.isMaxSignedValue())
+ return ConstantRange(W, /* empty */ false);
+ return ConstantRange(SMin + 1, APInt::getSignedMinValue(W));
+ }
+ case CmpInst::ICMP_UGE: {
+ APInt UMin(CR.getUnsignedMin());
+ if (UMin.isMinValue())
+ return ConstantRange(W);
+ return ConstantRange(UMin, APInt::getNullValue(W));
+ }
+ case CmpInst::ICMP_SGE: {
+ APInt SMin(CR.getSignedMin());
+ if (SMin.isMinSignedValue())
+ return ConstantRange(W);
+ return ConstantRange(SMin, APInt::getSignedMinValue(W));
+ }
}
}
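
The reshuffled switch above only changes indentation; the semantics are unchanged: for each predicate it returns the set of LHS values for which the comparison can be true for some value in CR. A small usage sketch, assuming nothing beyond the ConstantRange/APInt APIs already used in this file:

    // "x ult y" with y known to be exactly 10 (8-bit): the allowed region
    // for x is [0, 10).
    ConstantRange Ten(APInt(8, 10));  // singleton range [10, 11)
    ConstantRange R =
        ConstantRange::makeAllowedICmpRegion(CmpInst::ICMP_ULT, Ten);
    // R.getLower() == 0, R.getUpper() == 10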
@@ -127,9 +127,48 @@ ConstantRange ConstantRange::makeSatisfyingICmpRegion(CmpInst::Predicate Pred,
.inverse();
}
-ConstantRange ConstantRange::makeNoWrapRegion(Instruction::BinaryOps BinOp,
- const APInt &C,
- unsigned NoWrapKind) {
+ConstantRange ConstantRange::makeExactICmpRegion(CmpInst::Predicate Pred,
+ const APInt &C) {
+  // Computes the exact range that is equal to both the constant ranges returned
+  // by makeAllowedICmpRegion and makeSatisfyingICmpRegion. This is always
+  // possible when RHS is a singleton such as an APInt, so the assert is valid.
+  // However, for a non-singleton RHS, for example ult [2,5),
+  // makeAllowedICmpRegion returns [0,4) but makeSatisfyingICmpRegion returns [0,2).
+ //
+ assert(makeAllowedICmpRegion(Pred, C) == makeSatisfyingICmpRegion(Pred, C));
+ return makeAllowedICmpRegion(Pred, C);
+}
+
+bool ConstantRange::getEquivalentICmp(CmpInst::Predicate &Pred,
+ APInt &RHS) const {
+ bool Success = false;
+
+ if (isFullSet() || isEmptySet()) {
+ Pred = isEmptySet() ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+ RHS = APInt(getBitWidth(), 0);
+ Success = true;
+ } else if (getLower().isMinSignedValue() || getLower().isMinValue()) {
+ Pred =
+ getLower().isMinSignedValue() ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+ RHS = getUpper();
+ Success = true;
+ } else if (getUpper().isMinSignedValue() || getUpper().isMinValue()) {
+ Pred =
+ getUpper().isMinSignedValue() ? CmpInst::ICMP_SGE : CmpInst::ICMP_UGE;
+ RHS = getLower();
+ Success = true;
+ }
+
+ assert((!Success || ConstantRange::makeExactICmpRegion(Pred, RHS) == *this) &&
+ "Bad result!");
+
+ return Success;
+}
+
+ConstantRange
+ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
+ const ConstantRange &Other,
+ unsigned NoWrapKind) {
typedef OverflowingBinaryOperator OBO;
// Computes the intersection of CR0 and CR1. It is different from
@@ -149,29 +188,36 @@ ConstantRange ConstantRange::makeNoWrapRegion(Instruction::BinaryOps BinOp,
NoWrapKind == (OBO::NoUnsignedWrap | OBO::NoSignedWrap)) &&
"NoWrapKind invalid!");
- unsigned BitWidth = C.getBitWidth();
+ unsigned BitWidth = Other.getBitWidth();
if (BinOp != Instruction::Add)
// Conservative answer: empty set
return ConstantRange(BitWidth, false);
- if (C.isMinValue())
- // Full set: nothing signed / unsigned wraps when added to 0.
- return ConstantRange(BitWidth);
+ if (auto *C = Other.getSingleElement())
+ if (C->isMinValue())
+ // Full set: nothing signed / unsigned wraps when added to 0.
+ return ConstantRange(BitWidth);
ConstantRange Result(BitWidth);
if (NoWrapKind & OBO::NoUnsignedWrap)
- Result = SubsetIntersect(Result,
- ConstantRange(APInt::getNullValue(BitWidth), -C));
+ Result =
+ SubsetIntersect(Result, ConstantRange(APInt::getNullValue(BitWidth),
+ -Other.getUnsignedMax()));
if (NoWrapKind & OBO::NoSignedWrap) {
- if (C.isStrictlyPositive())
+ APInt SignedMin = Other.getSignedMin();
+ APInt SignedMax = Other.getSignedMax();
+
+ if (SignedMax.isStrictlyPositive())
Result = SubsetIntersect(
- Result, ConstantRange(APInt::getSignedMinValue(BitWidth),
- APInt::getSignedMinValue(BitWidth) - C));
- else
+ Result,
+ ConstantRange(APInt::getSignedMinValue(BitWidth),
+ APInt::getSignedMinValue(BitWidth) - SignedMax));
+
+ if (SignedMin.isNegative())
Result = SubsetIntersect(
- Result, ConstantRange(APInt::getSignedMinValue(BitWidth) - C,
+ Result, ConstantRange(APInt::getSignedMinValue(BitWidth) - SignedMin,
APInt::getSignedMinValue(BitWidth)));
}
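
The new range-based overload generalizes the old single-constant makeNoWrapRegion; for a singleton RHS it is meant to compute the same region. A sketch, assuming only the APIs touched in this hunk:

    // Values of x for which "add nsw x, 1" cannot signed-wrap at i8:
    // everything except 127.
    ConstantRange One(APInt(8, 1));
    ConstantRange NSW = ConstantRange::makeGuaranteedNoWrapRegion(
        Instruction::Add, One, OverflowingBinaryOperator::NoSignedWrap);
    // NSW is the wrapped range [-128, 127), i.e. all i8 values but 127.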
@@ -544,7 +590,7 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
// We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and
// then we do the union with [MaxValue, Upper)
if (isWrappedSet()) {
- // if Upper is greater than Max Value, it covers the whole truncated range.
+ // If Upper is greater than Max Value, it covers the whole truncated range.
if (Upper.uge(MaxValue))
return ConstantRange(DstTySize, /*isFullSet=*/true);
@@ -568,7 +614,7 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
return ConstantRange(LowerDiv.trunc(DstTySize),
UpperDiv.trunc(DstTySize)).unionWith(Union);
- // The truncated value wrapps around. Check if we can do better than fullset.
+ // The truncated value wraps around. Check if we can do better than fullset.
APInt UpperModulo = UpperDiv - MaxBitValue;
if (UpperModulo.ult(LowerDiv))
return ConstantRange(LowerDiv.trunc(DstTySize),
@@ -667,6 +713,13 @@ ConstantRange::multiply(const ConstantRange &Other) const {
this_max * Other_max + 1);
ConstantRange UR = Result_zext.truncate(getBitWidth());
+  // If the unsigned range doesn't wrap and isn't negative, then it's a range
+  // from one non-negative number to another, which is as good as we can
+  // generate. In this case, skip the extra work of generating signed ranges,
+  // which aren't going to be better than this range.
+ if (!UR.isWrappedSet() && UR.getLower().isNonNegative())
+ return UR;
+
// Now the signed range. Because we could be dealing with negative numbers
// here, the lower bound is the smallest of the cartesian product of the
// lower and upper ranges; for example:
@@ -714,6 +767,32 @@ ConstantRange::umax(const ConstantRange &Other) const {
}
ConstantRange
+ConstantRange::smin(const ConstantRange &Other) const {
+ // X smin Y is: range(smin(X_smin, Y_smin),
+ // smin(X_smax, Y_smax))
+ if (isEmptySet() || Other.isEmptySet())
+ return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+ APInt NewL = APIntOps::smin(getSignedMin(), Other.getSignedMin());
+ APInt NewU = APIntOps::smin(getSignedMax(), Other.getSignedMax()) + 1;
+ if (NewU == NewL)
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+ return ConstantRange(NewL, NewU);
+}
+
+ConstantRange
+ConstantRange::umin(const ConstantRange &Other) const {
+ // X umin Y is: range(umin(X_umin, Y_umin),
+ // umin(X_umax, Y_umax))
+ if (isEmptySet() || Other.isEmptySet())
+ return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+ APInt NewL = APIntOps::umin(getUnsignedMin(), Other.getUnsignedMin());
+ APInt NewU = APIntOps::umin(getUnsignedMax(), Other.getUnsignedMax()) + 1;
+ if (NewU == NewL)
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+ return ConstantRange(NewL, NewU);
+}
+
+ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
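
The new smin/umin helpers follow the same shape as the existing umax above. A worked example, using only the constructors already present in this file:

    // [-5, 3) smin [0, 10) at i8: signed minima are -5 and 0, signed maxima
    // are 2 and 9, so the result is [smin(-5, 0), smin(2, 9) + 1) = [-5, 3).
    ConstantRange X(APInt(8, -5, /*isSigned=*/true), APInt(8, 3));
    ConstantRange Y(APInt(8, 0), APInt(8, 10));
    ConstantRange M = X.smin(Y);  // [-5, 3)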
@@ -819,6 +898,6 @@ void ConstantRange::print(raw_ostream &OS) const {
/// dump - Allow printing from a debugger easily...
///
-void ConstantRange::dump() const {
+LLVM_DUMP_METHOD void ConstantRange::dump() const {
print(dbgs());
}
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 0898bf645385..d8d55b472f32 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -14,8 +14,6 @@
#include "llvm/IR/Constants.h"
#include "ConstantFold.h"
#include "LLVMContextImpl.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -26,7 +24,6 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -42,6 +39,8 @@ using namespace llvm;
void Constant::anchor() { }
+void ConstantData::anchor() {}
+
bool Constant::isNegativeZeroValue() const {
// Floating point values have an explicit -0.0 value.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
@@ -192,7 +191,7 @@ bool Constant::isNotMinSignedValue() const {
return false;
}
-// Constructor to create a '0' constant of arbitrary type...
+/// Constructor to create a '0' constant of arbitrary type.
Constant *Constant::getNullValue(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::IntegerTyID:
@@ -263,19 +262,9 @@ Constant *Constant::getAllOnesValue(Type *Ty) {
getAllOnesValue(VTy->getElementType()));
}
-/// getAggregateElement - For aggregates (struct/array/vector) return the
-/// constant that corresponds to the specified element if possible, or null if
-/// not. This can return null if the element index is a ConstantExpr, or if
-/// 'this' is a constant expr.
Constant *Constant::getAggregateElement(unsigned Elt) const {
- if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(this))
- return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : nullptr;
-
- if (const ConstantArray *CA = dyn_cast<ConstantArray>(this))
- return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : nullptr;
-
- if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
- return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr;
+ if (const ConstantAggregate *CC = dyn_cast<ConstantAggregate>(this))
+ return Elt < CC->getNumOperands() ? CC->getOperand(Elt) : nullptr;
if (const ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(this))
return Elt < CAZ->getNumElements() ? CAZ->getElementValue(Elt) : nullptr;
@@ -369,8 +358,6 @@ static bool canTrapImpl(const Constant *C,
}
}
-/// canTrap - Return true if evaluation of this constant could trap. This is
-/// true for things like constant expressions that could divide by zero.
bool Constant::canTrap() const {
SmallPtrSet<const ConstantExpr *, 4> NonTrappingOps;
return canTrapImpl(this, NonTrappingOps);
@@ -401,7 +388,6 @@ ConstHasGlobalValuePredicate(const Constant *C,
return false;
}
-/// Return true if the value can vary between threads.
bool Constant::isThreadDependent() const {
auto DLLImportPredicate = [](const GlobalValue *GV) {
return GV->isThreadLocal();
@@ -416,8 +402,6 @@ bool Constant::isDLLImportDependent() const {
return ConstHasGlobalValuePredicate(this, DLLImportPredicate);
}
-/// Return true if the constant has users other than constant exprs and other
-/// dangling things.
bool Constant::isConstantUsed() const {
for (const User *U : users()) {
const Constant *UC = dyn_cast<Constant>(U);
@@ -461,9 +445,8 @@ bool Constant::needsRelocation() const {
return Result;
}
-/// removeDeadUsersOfConstant - If the specified constantexpr is dead, remove
-/// it. This involves recursively eliminating any dead users of the
-/// constantexpr.
+/// If the specified constantexpr is dead, remove it. This involves recursively
+/// eliminating any dead users of the constantexpr.
static bool removeDeadUsersOfConstant(const Constant *C) {
if (isa<GlobalValue>(C)) return false; // Cannot remove this
@@ -479,10 +462,6 @@ static bool removeDeadUsersOfConstant(const Constant *C) {
}
-/// removeDeadConstantUsers - If there are any dead constant users dangling
-/// off of this constant, remove them. This method is useful for clients
-/// that want to check to see if a global is unused, but don't want to deal
-/// with potentially dead constants hanging off of the globals.
void Constant::removeDeadConstantUsers() const {
Value::const_user_iterator I = user_begin(), E = user_end();
Value::const_user_iterator LastNonDeadUser = E;
@@ -521,8 +500,8 @@ void Constant::removeDeadConstantUsers() const {
void ConstantInt::anchor() { }
-ConstantInt::ConstantInt(IntegerType *Ty, const APInt& V)
- : Constant(Ty, ConstantIntVal, nullptr, 0), Val(V) {
+ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V)
+ : ConstantData(Ty, ConstantIntVal), Val(V) {
assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
}
@@ -588,8 +567,7 @@ Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
return C;
}
-ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V,
- bool isSigned) {
+ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, bool isSigned) {
return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned));
}
@@ -613,8 +591,7 @@ Constant *ConstantInt::get(Type *Ty, const APInt& V) {
return C;
}
-ConstantInt *ConstantInt::get(IntegerType* Ty, StringRef Str,
- uint8_t radix) {
+ConstantInt *ConstantInt::get(IntegerType* Ty, StringRef Str, uint8_t radix) {
return get(Ty->getContext(), APInt(Ty->getBitWidth(), Str, radix));
}
@@ -645,9 +622,6 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
void ConstantFP::anchor() { }
-/// get() - This returns a constant fp for the specified value in the
-/// specified type. This should only be used for simple constant values like
-/// 2.0/1.0 etc, that are known-valid both as double and as the target format.
Constant *ConstantFP::get(Type *Ty, double V) {
LLVMContext &Context = Ty->getContext();
@@ -748,8 +722,8 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
return C;
}
-ConstantFP::ConstantFP(Type *Ty, const APFloat& V)
- : Constant(Ty, ConstantFPVal, nullptr, 0), Val(V) {
+ConstantFP::ConstantFP(Type *Ty, const APFloat &V)
+ : ConstantData(Ty, ConstantFPVal), Val(V) {
assert(&V.getSemantics() == TypeToFloatSemantics(Ty) &&
"FP type Mismatch");
}
@@ -767,28 +741,20 @@ void ConstantFP::destroyConstantImpl() {
// ConstantAggregateZero Implementation
//===----------------------------------------------------------------------===//
-/// getSequentialElement - If this CAZ has array or vector type, return a zero
-/// with the right element type.
Constant *ConstantAggregateZero::getSequentialElement() const {
return Constant::getNullValue(getType()->getSequentialElementType());
}
-/// getStructElement - If this CAZ has struct type, return a zero with the
-/// right element type for the specified element.
Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const {
return Constant::getNullValue(getType()->getStructElementType(Elt));
}
-/// getElementValue - Return a zero of the right value for the specified GEP
-/// index if we can, otherwise return null (e.g. if C is a ConstantExpr).
Constant *ConstantAggregateZero::getElementValue(Constant *C) const {
if (isa<SequentialType>(getType()))
return getSequentialElement();
return getStructElement(cast<ConstantInt>(C)->getZExtValue());
}
-/// getElementValue - Return a zero of the right value for the specified GEP
-/// index.
Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
if (isa<SequentialType>(getType()))
return getSequentialElement();
@@ -808,28 +774,20 @@ unsigned ConstantAggregateZero::getNumElements() const {
// UndefValue Implementation
//===----------------------------------------------------------------------===//
-/// getSequentialElement - If this undef has array or vector type, return an
-/// undef with the right element type.
UndefValue *UndefValue::getSequentialElement() const {
return UndefValue::get(getType()->getSequentialElementType());
}
-/// getStructElement - If this undef has struct type, return a zero with the
-/// right element type for the specified element.
UndefValue *UndefValue::getStructElement(unsigned Elt) const {
return UndefValue::get(getType()->getStructElementType(Elt));
}
-/// getElementValue - Return an undef of the right value for the specified GEP
-/// index if we can, otherwise return null (e.g. if C is a ConstantExpr).
UndefValue *UndefValue::getElementValue(Constant *C) const {
if (isa<SequentialType>(getType()))
return getSequentialElement();
return getStructElement(cast<ConstantInt>(C)->getZExtValue());
}
-/// getElementValue - Return an undef of the right value for the specified GEP
-/// index.
UndefValue *UndefValue::getElementValue(unsigned Idx) const {
if (isa<SequentialType>(getType()))
return getSequentialElement();
@@ -910,16 +868,25 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
return nullptr;
}
+ConstantAggregate::ConstantAggregate(CompositeType *T, ValueTy VT,
+ ArrayRef<Constant *> V)
+ : Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(),
+ V.size()) {
+ std::copy(V.begin(), V.end(), op_begin());
+
+ // Check that types match, unless this is an opaque struct.
+ if (auto *ST = dyn_cast<StructType>(T))
+ if (ST->isOpaque())
+ return;
+ for (unsigned I = 0, E = V.size(); I != E; ++I)
+ assert(V[I]->getType() == T->getTypeAtIndex(I) &&
+ "Initializer for composite element doesn't match!");
+}
+
ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
- : Constant(T, ConstantArrayVal,
- OperandTraits<ConstantArray>::op_end(this) - V.size(),
- V.size()) {
+ : ConstantAggregate(T, ConstantArrayVal, V) {
assert(V.size() == T->getNumElements() &&
- "Invalid initializer vector for constant array");
- for (unsigned i = 0, e = V.size(); i != e; ++i)
- assert(V[i]->getType() == T->getElementType() &&
- "Initializer for array element doesn't match array element type!");
- std::copy(V.begin(), V.end(), op_begin());
+ "Invalid initializer for constant array");
}
Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
@@ -957,8 +924,6 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
return nullptr;
}
-/// getTypeForElements - Return an anonymous struct type to use for a constant
-/// with the specified set of elements. The list must not be empty.
StructType *ConstantStruct::getTypeForElements(LLVMContext &Context,
ArrayRef<Constant*> V,
bool Packed) {
@@ -978,17 +943,10 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V,
return getTypeForElements(V[0]->getContext(), V, Packed);
}
-
ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V)
- : Constant(T, ConstantStructVal,
- OperandTraits<ConstantStruct>::op_end(this) - V.size(),
- V.size()) {
- assert(V.size() == T->getNumElements() &&
- "Invalid initializer vector for constant structure");
- for (unsigned i = 0, e = V.size(); i != e; ++i)
- assert((T->isOpaque() || V[i]->getType() == T->getElementType(i)) &&
- "Initializer for struct element doesn't match struct element type!");
- std::copy(V.begin(), V.end(), op_begin());
+ : ConstantAggregate(T, ConstantStructVal, V) {
+ assert((T->isOpaque() || V.size() == T->getNumElements()) &&
+ "Invalid initializer for constant struct");
}
// ConstantStruct accessors.
@@ -1031,13 +989,9 @@ Constant *ConstantStruct::get(StructType *T, ...) {
}
ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
- : Constant(T, ConstantVectorVal,
- OperandTraits<ConstantVector>::op_end(this) - V.size(),
- V.size()) {
- for (size_t i = 0, e = V.size(); i != e; i++)
- assert(V[i]->getType() == T->getElementType() &&
- "Initializer for vector element doesn't match vector element type!");
- std::copy(V.begin(), V.end(), op_begin());
+ : ConstantAggregate(T, ConstantVectorVal, V) {
+ assert(V.size() == T->getNumElements() &&
+ "Invalid initializer for constant vector");
}
// ConstantVector accessors.
@@ -1157,8 +1111,6 @@ unsigned ConstantExpr::getPredicate() const {
return cast<CompareConstantExpr>(this)->predicate;
}
-/// getWithOperandReplaced - Return a constant expression identical to this
-/// one, but with the specified operand set to the specified value.
Constant *
ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
assert(Op->getType() == getOperand(OpNo)->getType() &&
@@ -1173,9 +1125,6 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
return getWithOperands(NewOps);
}
-/// getWithOperands - This returns the current constant expression with the
-/// operands replaced with the specified values. The specified array must
-/// have the same number of operands as our current one.
Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
bool OnlyIfReduced, Type *SrcTy) const {
assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
@@ -1320,14 +1269,12 @@ ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) {
return Entry;
}
-/// destroyConstant - Remove the constant from the constant table.
-///
+/// Remove the constant from the constant table.
void ConstantAggregateZero::destroyConstantImpl() {
getContext().pImpl->CAZConstants.erase(getType());
}
-/// destroyConstant - Remove the constant from the constant table...
-///
+/// Remove the constant from the constant table.
void ConstantArray::destroyConstantImpl() {
getType()->getContext().pImpl->ArrayConstants.remove(this);
}
@@ -1336,20 +1283,16 @@ void ConstantArray::destroyConstantImpl() {
//---- ConstantStruct::get() implementation...
//
-// destroyConstant - Remove the constant from the constant table...
-//
+/// Remove the constant from the constant table.
void ConstantStruct::destroyConstantImpl() {
getType()->getContext().pImpl->StructConstants.remove(this);
}
-// destroyConstant - Remove the constant from the constant table...
-//
+/// Remove the constant from the constant table.
void ConstantVector::destroyConstantImpl() {
getType()->getContext().pImpl->VectorConstants.remove(this);
}
-/// getSplatValue - If this is a splat vector constant, meaning that all of
-/// the elements have the same value, return that value. Otherwise return 0.
Constant *Constant::getSplatValue() const {
assert(this->getType()->isVectorTy() && "Only valid for vectors!");
if (isa<ConstantAggregateZero>(this))
@@ -1361,8 +1304,6 @@ Constant *Constant::getSplatValue() const {
return nullptr;
}
-/// getSplatValue - If this is a splat constant, where all of the
-/// elements have the same value, return that value. Otherwise return null.
Constant *ConstantVector::getSplatValue() const {
// Check out first element.
Constant *Elt = getOperand(0);
@@ -1373,8 +1314,6 @@ Constant *ConstantVector::getSplatValue() const {
return Elt;
}
-/// If C is a constant integer then return its value, otherwise C must be a
-/// vector of constant integers, all equal, and the common value is returned.
const APInt &Constant::getUniqueInteger() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->getValue();
@@ -1395,16 +1334,11 @@ ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) {
return Entry;
}
-// destroyConstant - Remove the constant from the constant table...
-//
+/// Remove the constant from the constant table.
void ConstantPointerNull::destroyConstantImpl() {
getContext().pImpl->CPNConstants.erase(getType());
}
-
-//---- UndefValue::get() implementation.
-//
-
UndefValue *UndefValue::get(Type *Ty) {
UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty];
if (!Entry)
@@ -1413,16 +1347,12 @@ UndefValue *UndefValue::get(Type *Ty) {
return Entry;
}
-// destroyConstant - Remove the constant from the constant table.
-//
+/// Remove the constant from the constant table.
void UndefValue::destroyConstantImpl() {
// Free the constant and any dangling references to it.
getContext().pImpl->UVConstants.erase(getType());
}
-//---- BlockAddress::get() implementation.
-//
-
BlockAddress *BlockAddress::get(BasicBlock *BB) {
assert(BB->getParent() && "Block must have a parent");
return get(BB->getParent(), BB);
@@ -1458,24 +1388,25 @@ BlockAddress *BlockAddress::lookup(const BasicBlock *BB) {
return BA;
}
-// destroyConstant - Remove the constant from the constant table.
-//
+/// Remove the constant from the constant table.
void BlockAddress::destroyConstantImpl() {
getFunction()->getType()->getContext().pImpl
->BlockAddresses.erase(std::make_pair(getFunction(), getBasicBlock()));
getBasicBlock()->AdjustBlockAddressRefCount(-1);
}
-Value *BlockAddress::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
+Value *BlockAddress::handleOperandChangeImpl(Value *From, Value *To) {
// This could be replacing either the Basic Block or the Function. In either
// case, we have to remove the map entry.
Function *NewF = getFunction();
BasicBlock *NewBB = getBasicBlock();
- if (U == &Op<0>())
+ if (From == NewF)
NewF = cast<Function>(To->stripPointerCasts());
- else
+ else {
+ assert(From == NewBB && "From does not match any operand");
NewBB = cast<BasicBlock>(To);
+ }
   // See if the 'new' entry already exists; if not, just update this in place
// and return early.
@@ -1606,8 +1537,7 @@ Constant *ConstantExpr::getPointerBitCastOrAddrSpaceCast(Constant *S,
return getBitCast(S, Ty);
}
-Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
- bool isSigned) {
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty, bool isSigned) {
assert(C->getType()->isIntOrIntVectorTy() &&
Ty->isIntOrIntVectorTy() && "Invalid cast");
unsigned SrcBits = C->getType()->getScalarSizeInBits();
@@ -1979,8 +1909,16 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
assert(DestTy && "GEP indices invalid!");
unsigned AS = C->getType()->getPointerAddressSpace();
Type *ReqTy = DestTy->getPointerTo(AS);
- if (VectorType *VecTy = dyn_cast<VectorType>(C->getType()))
- ReqTy = VectorType::get(ReqTy, VecTy->getNumElements());
+
+ unsigned NumVecElts = 0;
+ if (C->getType()->isVectorTy())
+ NumVecElts = C->getType()->getVectorNumElements();
+ else for (auto Idx : Idxs)
+ if (Idx->getType()->isVectorTy())
+ NumVecElts = Idx->getType()->getVectorNumElements();
+
+ if (NumVecElts)
+ ReqTy = VectorType::get(ReqTy, NumVecElts);
if (OnlyIfReducedTy == ReqTy)
return nullptr;
@@ -1990,13 +1928,14 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
ArgVec.reserve(1 + Idxs.size());
ArgVec.push_back(C);
for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
- assert(Idxs[i]->getType()->isVectorTy() == ReqTy->isVectorTy() &&
- "getelementptr index type missmatch");
assert((!Idxs[i]->getType()->isVectorTy() ||
- ReqTy->getVectorNumElements() ==
- Idxs[i]->getType()->getVectorNumElements()) &&
+ Idxs[i]->getType()->getVectorNumElements() == NumVecElts) &&
"getelementptr index type missmatch");
- ArgVec.push_back(cast<Constant>(Idxs[i]));
+
+ Constant *Idx = cast<Constant>(Idxs[i]);
+ if (NumVecElts && !Idxs[i]->getType()->isVectorTy())
+ Idx = ConstantVector::getSplat(NumVecElts, Idx);
+ ArgVec.push_back(Idx);
}
const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
InBounds ? GEPOperator::IsInBounds : 0, None,
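
With the splatting logic above, a constant GEP over a vector of pointers can now be formed with scalar constant indices; they are broadcast to the vector width before the expression is keyed. A hypothetical usage sketch (types and values invented for illustration; only standard Constant/Type APIs are assumed):

    LLVMContext Ctx;
    Type *I32 = Type::getInt32Ty(Ctx);
    Constant *Base = UndefValue::get(VectorType::get(I32->getPointerTo(), 4));
    Constant *Idx = ConstantInt::get(Type::getInt64Ty(Ctx), 1);
    // The scalar index is splatted to a <4 x i64> operand; the result type
    // is <4 x i32*>.
    Constant *GEP = ConstantExpr::getGetElementPtr(I32, Base, Idx);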
@@ -2278,9 +2217,6 @@ Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) {
isExact ? PossiblyExactOperator::IsExact : 0);
}
-/// getBinOpIdentity - Return the identity for the given binary operation,
-/// i.e. a constant C such that X op C = X and C op X = X for every X. It
-/// returns null if the operator doesn't have an identity.
Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) {
switch (Opcode) {
default:
@@ -2300,10 +2236,6 @@ Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) {
}
}
-/// getBinOpAbsorber - Return the absorbing element for the given binary
-/// operation, i.e. a constant C such that X op C = C and C op X = C for
-/// every X. For example, this returns zero for integer multiplication.
-/// It returns null if the operator doesn't have an absorbing element.
Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) {
switch (Opcode) {
default:
@@ -2319,8 +2251,7 @@ Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) {
}
}
-// destroyConstant - Remove the constant from the constant table...
-//
+/// Remove the constant from the constant table.
void ConstantExpr::destroyConstantImpl() {
getType()->getContext().pImpl->ExprConstants.remove(this);
}
@@ -2335,7 +2266,8 @@ GetElementPtrConstantExpr::GetElementPtrConstantExpr(
OperandTraits<GetElementPtrConstantExpr>::op_end(this) -
(IdxList.size() + 1),
IdxList.size() + 1),
- SrcElementTy(SrcElementTy) {
+ SrcElementTy(SrcElementTy),
+ ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)) {
Op<0>() = C;
Use *OperandList = getOperandList();
for (unsigned i = 0, E = IdxList.size(); i != E; ++i)
@@ -2346,13 +2278,16 @@ Type *GetElementPtrConstantExpr::getSourceElementType() const {
return SrcElementTy;
}
+Type *GetElementPtrConstantExpr::getResultElementType() const {
+ return ResElementTy;
+}
+
//===----------------------------------------------------------------------===//
// ConstantData* implementations
void ConstantDataArray::anchor() {}
void ConstantDataVector::anchor() {}
-/// getElementType - Return the element type of the array/vector.
Type *ConstantDataSequential::getElementType() const {
return getType()->getElementType();
}
@@ -2361,10 +2296,6 @@ StringRef ConstantDataSequential::getRawDataValues() const {
return StringRef(DataElements, getNumElements()*getElementByteSize());
}
-/// isElementTypeCompatible - Return true if a ConstantDataSequential can be
-/// formed with a vector or array of the specified element type.
-/// ConstantDataArray only works with normal float and int types that are
-/// stored densely in memory, not with things like i42 or x86_f80.
bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) {
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true;
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
@@ -2380,7 +2311,6 @@ bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) {
return false;
}
-/// getNumElements - Return the number of elements in the array or vector.
unsigned ConstantDataSequential::getNumElements() const {
if (ArrayType *AT = dyn_cast<ArrayType>(getType()))
return AT->getNumElements();
@@ -2388,27 +2318,26 @@ unsigned ConstantDataSequential::getNumElements() const {
}
-/// getElementByteSize - Return the size in bytes of the elements in the data.
uint64_t ConstantDataSequential::getElementByteSize() const {
return getElementType()->getPrimitiveSizeInBits()/8;
}
-/// getElementPointer - Return the start of the specified element.
+/// Return the start of the specified element.
const char *ConstantDataSequential::getElementPointer(unsigned Elt) const {
assert(Elt < getNumElements() && "Invalid Elt");
return DataElements+Elt*getElementByteSize();
}
-/// isAllZeros - return true if the array is empty or all zeros.
+/// Return true if the array is empty or all zeros.
static bool isAllZeros(StringRef Arr) {
- for (StringRef::iterator I = Arr.begin(), E = Arr.end(); I != E; ++I)
- if (*I != 0)
+ for (char I : Arr)
+ if (I != 0)
return false;
return true;
}
-/// getImpl - This is the underlying implementation of all of the
+/// This is the underlying implementation of all of the
/// ConstantDataSequential::get methods. They all thunk down to here, providing
/// the correct element type. We take the bytes in as a StringRef because
/// we *want* an underlying "char*" to avoid TBAA type punning violations.
@@ -2539,11 +2468,6 @@ Constant *ConstantDataArray::getFP(LLVMContext &Context,
return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
}
-/// getString - This method constructs a CDS and initializes it with a text
-/// string. The default behavior (AddNull==true) causes a null terminator to
-/// be placed at the end of the array (increasing the length of the string by
-/// one more than the StringRef would normally indicate. Pass AddNull=false
-/// to disable this behavior.
Constant *ConstantDataArray::getString(LLVMContext &Context,
StringRef Str, bool AddNull) {
if (!AddNull) {
@@ -2658,8 +2582,6 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
}
-/// getElementAsInteger - If this is a sequential container of integers (of
-/// any size), return the specified element in the low bits of a uint64_t.
uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const {
assert(isa<IntegerType>(getElementType()) &&
"Accessor can only be used when element is an integer");
@@ -2680,8 +2602,6 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const {
}
}
-/// getElementAsAPFloat - If this is a sequential container of floating point
-/// type, return the specified element as an APFloat.
APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
const char *EltPtr = getElementPointer(Elt);
@@ -2703,8 +2623,6 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
}
}
-/// getElementAsFloat - If this is an sequential container of floats, return
-/// the specified element as a float.
float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
assert(getElementType()->isFloatTy() &&
"Accessor can only be used when element is a 'float'");
@@ -2712,8 +2630,6 @@ float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
return *const_cast<float *>(EltPtr);
}
-/// getElementAsDouble - If this is an sequential container of doubles, return
-/// the specified element as a float.
double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
assert(getElementType()->isDoubleTy() &&
"Accessor can only be used when element is a 'float'");
@@ -2722,9 +2638,6 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
return *const_cast<double *>(EltPtr);
}
-/// getElementAsConstant - Return a Constant for a specified index's element.
-/// Note that this has to compute a new constant to return, so it isn't as
-/// efficient as getElementAsInteger/Float/Double.
Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
if (getElementType()->isHalfTy() || getElementType()->isFloatTy() ||
getElementType()->isDoubleTy())
@@ -2733,13 +2646,10 @@ Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
}
-/// isString - This method returns true if this is an array of i8.
bool ConstantDataSequential::isString() const {
return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(8);
}
-/// isCString - This method returns true if the array "isString", ends with a
-/// nul byte, and does not contains any other nul bytes.
bool ConstantDataSequential::isCString() const {
if (!isString())
return false;
@@ -2753,8 +2663,6 @@ bool ConstantDataSequential::isCString() const {
return Str.drop_back().find(0) == StringRef::npos;
}
-/// getSplatValue - If this is a splat constant, meaning that all of the
-/// elements have the same value, return that value. Otherwise return nullptr.
Constant *ConstantDataVector::getSplatValue() const {
const char *Base = getRawDataValues().data();
@@ -2782,14 +2690,14 @@ Constant *ConstantDataVector::getSplatValue() const {
/// work, but would be really slow because it would have to unique each updated
/// array instance.
///
-void Constant::handleOperandChange(Value *From, Value *To, Use *U) {
+void Constant::handleOperandChange(Value *From, Value *To) {
Value *Replacement = nullptr;
switch (getValueID()) {
default:
llvm_unreachable("Not a constant!");
#define HANDLE_CONSTANT(Name) \
case Value::Name##Val: \
- Replacement = cast<Name>(this)->handleOperandChangeImpl(From, To, U); \
+ Replacement = cast<Name>(this)->handleOperandChangeImpl(From, To); \
break;
#include "llvm/IR/Value.def"
}
@@ -2809,39 +2717,7 @@ void Constant::handleOperandChange(Value *From, Value *To, Use *U) {
destroyConstant();
}
-Value *ConstantInt::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantFP::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantTokenNone::handleOperandChangeImpl(Value *From, Value *To,
- Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *UndefValue::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantPointerNull::handleOperandChangeImpl(Value *From, Value *To,
- Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantAggregateZero::handleOperandChangeImpl(Value *From, Value *To,
- Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantDataSequential::handleOperandChangeImpl(Value *From, Value *To,
- Use *U) {
- llvm_unreachable("Unsupported class for handleOperandChange()!");
-}
-
-Value *ConstantArray::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
+Value *ConstantArray::handleOperandChangeImpl(Value *From, Value *To) {
assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
Constant *ToC = cast<Constant>(To);
@@ -2855,9 +2731,11 @@ Value *ConstantArray::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
// Keep track of whether all the values in the array are "ToC".
bool AllSame = true;
Use *OperandList = getOperandList();
+ unsigned OperandNo = 0;
for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
Constant *Val = cast<Constant>(O->get());
if (Val == From) {
+ OperandNo = (O - OperandList);
Val = ToC;
++NumUpdated;
}
@@ -2877,65 +2755,57 @@ Value *ConstantArray::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
// Update to the new value.
return getContext().pImpl->ArrayConstants.replaceOperandsInPlace(
- Values, this, From, ToC, NumUpdated, U - OperandList);
+ Values, this, From, ToC, NumUpdated, OperandNo);
}
-Value *ConstantStruct::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
+Value *ConstantStruct::handleOperandChangeImpl(Value *From, Value *To) {
assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
Constant *ToC = cast<Constant>(To);
Use *OperandList = getOperandList();
- unsigned OperandToUpdate = U-OperandList;
- assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!");
SmallVector<Constant*, 8> Values;
Values.reserve(getNumOperands()); // Build replacement struct.
// Fill values with the modified operands of the constant struct. Also,
// compute whether this turns into an all-zeros struct.
- bool isAllZeros = false;
- bool isAllUndef = false;
- if (ToC->isNullValue()) {
- isAllZeros = true;
- for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
- Constant *Val = cast<Constant>(O->get());
- Values.push_back(Val);
- if (isAllZeros) isAllZeros = Val->isNullValue();
- }
- } else if (isa<UndefValue>(ToC)) {
- isAllUndef = true;
- for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
- Constant *Val = cast<Constant>(O->get());
- Values.push_back(Val);
- if (isAllUndef) isAllUndef = isa<UndefValue>(Val);
+ unsigned NumUpdated = 0;
+ bool AllSame = true;
+ unsigned OperandNo = 0;
+ for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) {
+ Constant *Val = cast<Constant>(O->get());
+ if (Val == From) {
+ OperandNo = (O - OperandList);
+ Val = ToC;
+ ++NumUpdated;
}
- } else {
- for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O)
- Values.push_back(cast<Constant>(O->get()));
+ Values.push_back(Val);
+ AllSame &= Val == ToC;
}
- Values[OperandToUpdate] = ToC;
- if (isAllZeros)
+ if (AllSame && ToC->isNullValue())
return ConstantAggregateZero::get(getType());
- if (isAllUndef)
+ if (AllSame && isa<UndefValue>(ToC))
return UndefValue::get(getType());
// Update to the new value.
return getContext().pImpl->StructConstants.replaceOperandsInPlace(
- Values, this, From, ToC);
+ Values, this, From, ToC, NumUpdated, OperandNo);
}
-Value *ConstantVector::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
+Value *ConstantVector::handleOperandChangeImpl(Value *From, Value *To) {
assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
Constant *ToC = cast<Constant>(To);
SmallVector<Constant*, 8> Values;
Values.reserve(getNumOperands()); // Build replacement array...
unsigned NumUpdated = 0;
+ unsigned OperandNo = 0;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
Constant *Val = getOperand(i);
if (Val == From) {
+ OperandNo = i;
++NumUpdated;
Val = ToC;
}
@@ -2946,20 +2816,21 @@ Value *ConstantVector::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
return C;
// Update to the new value.
- Use *OperandList = getOperandList();
return getContext().pImpl->VectorConstants.replaceOperandsInPlace(
- Values, this, From, ToC, NumUpdated, U - OperandList);
+ Values, this, From, ToC, NumUpdated, OperandNo);
}
-Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV, Use *U) {
+Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV) {
assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
Constant *To = cast<Constant>(ToV);
SmallVector<Constant*, 8> NewOps;
unsigned NumUpdated = 0;
+ unsigned OperandNo = 0;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
Constant *Op = getOperand(i);
if (Op == From) {
+ OperandNo = i;
++NumUpdated;
Op = To;
}
@@ -2971,9 +2842,8 @@ Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV, Use *U) {
return C;
// Update to the new value.
- Use *OperandList = getOperandList();
return getContext().pImpl->ExprConstants.replaceOperandsInPlace(
- NewOps, this, From, To, NumUpdated, U - OperandList);
+ NewOps, this, From, To, NumUpdated, OperandNo);
}
Instruction *ConstantExpr::getAsInstruction() {
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index 13fcbd2ece10..7db87edff010 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -15,7 +15,7 @@
#ifndef LLVM_LIB_IR_CONSTANTSCONTEXT_H
#define LLVM_LIB_IR_CONSTANTSCONTEXT_H
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
@@ -23,8 +23,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include <map>
-#include <tuple>
#define DEBUG_TYPE "ir"
@@ -225,19 +223,12 @@ public:
/// used behind the scenes to implement getelementpr constant exprs.
class GetElementPtrConstantExpr : public ConstantExpr {
Type *SrcElementTy;
+ Type *ResElementTy;
void anchor() override;
GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList, Type *DestTy);
public:
- static GetElementPtrConstantExpr *Create(Constant *C,
- ArrayRef<Constant*> IdxList,
- Type *DestTy,
- unsigned Flags) {
- return Create(
- cast<PointerType>(C->getType()->getScalarType())->getElementType(), C,
- IdxList, DestTy, Flags);
- }
static GetElementPtrConstantExpr *Create(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList,
Type *DestTy, unsigned Flags) {
@@ -247,6 +238,7 @@ public:
return Result;
}
Type *getSourceElementType() const;
+ Type *getResultElementType() const;
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -552,6 +544,9 @@ public:
typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
typedef std::pair<TypeClass *, ValType> LookupKey;
+ /// Key and hash together, so that we compute the hash only once and reuse it.
+ typedef std::pair<unsigned, LookupKey> LookupKeyHashed;
+
private:
struct MapInfo {
typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
@@ -562,7 +557,7 @@ private:
return ConstantClassInfo::getTombstoneKey();
}
static unsigned getHashValue(const ConstantClass *CP) {
- SmallVector<Constant *, 8> Storage;
+ SmallVector<Constant *, 32> Storage;
return getHashValue(LookupKey(CP->getType(), ValType(CP, Storage)));
}
static bool isEqual(const ConstantClass *LHS, const ConstantClass *RHS) {
@@ -571,6 +566,9 @@ private:
static unsigned getHashValue(const LookupKey &Val) {
return hash_combine(Val.first, Val.second.getHash());
}
+ static unsigned getHashValue(const LookupKeyHashed &Val) {
+ return Val.first;
+ }
static bool isEqual(const LookupKey &LHS, const ConstantClass *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
@@ -578,30 +576,31 @@ private:
return false;
return LHS.second == RHS;
}
+ static bool isEqual(const LookupKeyHashed &LHS, const ConstantClass *RHS) {
+ return isEqual(LHS.second, RHS);
+ }
};
public:
- typedef DenseMap<ConstantClass *, char, MapInfo> MapTy;
+ typedef DenseSet<ConstantClass *, MapInfo> MapTy;
private:
MapTy Map;
public:
- typename MapTy::iterator map_begin() { return Map.begin(); }
- typename MapTy::iterator map_end() { return Map.end(); }
+ typename MapTy::iterator begin() { return Map.begin(); }
+ typename MapTy::iterator end() { return Map.end(); }
void freeConstants() {
for (auto &I : Map)
- // Asserts that use_empty().
- delete I.first;
+ delete I; // Asserts that use_empty().
}
-
private:
- ConstantClass *create(TypeClass *Ty, ValType V) {
+ ConstantClass *create(TypeClass *Ty, ValType V, LookupKeyHashed &HashKey) {
ConstantClass *Result = V.create(Ty);
assert(Result->getType() == Ty && "Type specified is not correct!");
- insert(Result);
+ Map.insert_as(Result, HashKey);
return Result;
}
@@ -609,32 +608,27 @@ private:
public:
/// Return the specified constant from the map, creating it if necessary.
ConstantClass *getOrCreate(TypeClass *Ty, ValType V) {
- LookupKey Lookup(Ty, V);
+ LookupKey Key(Ty, V);
+ /// Hash once, and reuse it for the lookup and the insertion if needed.
+ LookupKeyHashed Lookup(MapInfo::getHashValue(Key), Key);
+
ConstantClass *Result = nullptr;
- auto I = find(Lookup);
+ auto I = Map.find_as(Lookup);
if (I == Map.end())
- Result = create(Ty, V);
+ Result = create(Ty, V, Lookup);
else
- Result = I->first;
+ Result = *I;
assert(Result && "Unexpected nullptr");
return Result;
}
- /// Find the constant by lookup key.
- typename MapTy::iterator find(LookupKey Lookup) {
- return Map.find_as(Lookup);
- }
-
- /// Insert the constant into its proper slot.
- void insert(ConstantClass *CP) { Map[CP] = '\0'; }
-
/// Remove this constant from the map
void remove(ConstantClass *CP) {
typename MapTy::iterator I = Map.find(CP);
assert(I != Map.end() && "Constant not found in constant table!");
- assert(I->first == CP && "Didn't find correct element?");
+ assert(*I == CP && "Didn't find correct element?");
Map.erase(I);
}
@@ -642,10 +636,13 @@ public:
ConstantClass *CP, Value *From,
Constant *To, unsigned NumUpdated = 0,
unsigned OperandNo = ~0u) {
- LookupKey Lookup(CP->getType(), ValType(Operands, CP));
- auto I = find(Lookup);
+ LookupKey Key(CP->getType(), ValType(Operands, CP));
+ /// Hash once, and reuse it for the lookup and the insertion if needed.
+ LookupKeyHashed Lookup(MapInfo::getHashValue(Key), Key);
+
+ auto I = Map.find_as(Lookup);
if (I != Map.end())
- return I->first;
+ return *I;
// Update to the new value. Optimize for the case when we have a single
// operand that we're changing, but handle bulk updates efficiently.
@@ -659,7 +656,7 @@ public:
if (CP->getOperand(I) == From)
CP->setOperand(I, To);
}
- insert(CP);
+ Map.insert_as(CP, Lookup);
return nullptr;
}
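
The ConstantsContext.h hunks above switch the uniquing table from DenseMap<ConstantClass *, char> to DenseSet and thread a precomputed hash (LookupKeyHashed) through find_as/insert_as, so a constant's operands are hashed once per getOrCreate instead of once for the lookup and again for the insertion. A minimal standard-C++ sketch of the same hash-once pattern follows; the names and the std::unordered_map container are illustrative only and are not part of the patch.

// hash_once_sketch.cpp -- illustrative only, not LLVM code.
#include <cstddef>
#include <string>
#include <unordered_map>

// A key that carries its hash, mirroring LookupKeyHashed above, so callers
// can hash once and reuse the result for both the find and the insert.
struct HashedKey {
  std::size_t Hash;
  std::string Key;
};

struct UseStoredHash {
  std::size_t operator()(const HashedKey &K) const { return K.Hash; }
};
struct KeyEqual {
  bool operator()(const HashedKey &A, const HashedKey &B) const {
    return A.Key == B.Key;
  }
};

using UniqueMap = std::unordered_map<HashedKey, int, UseStoredHash, KeyEqual>;

int getOrCreate(UniqueMap &Map, const std::string &Key, int Value) {
  // Hash once, and reuse it for the lookup and the insertion if needed.
  HashedKey Lookup{std::hash<std::string>{}(Key), Key};
  auto It = Map.find(Lookup);
  if (It != Map.end())
    return It->second;
  return Map.emplace(Lookup, Value).first->second;
}

int main() {
  UniqueMap Map;
  getOrCreate(Map, "i32 7", 1);
  return getOrCreate(Map, "i32 7", 2) == 1 ? 0 : 1; // second call hits the cache
}

The patch gets the same effect inside LLVM via the DenseSet find_as/insert_as calls visible in the hunks above, which accept the already-hashed LookupKeyHashed directly.
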
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 591dafa22a4b..a55361489ada 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm-c/Core.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
@@ -73,22 +74,32 @@ void LLVMDisposeMessage(char *Message) {
/*===-- Operations on contexts --------------------------------------------===*/
+static ManagedStatic<LLVMContext> GlobalContext;
+
LLVMContextRef LLVMContextCreate() {
return wrap(new LLVMContext());
}
-LLVMContextRef LLVMGetGlobalContext() {
- return wrap(&getGlobalContext());
-}
+LLVMContextRef LLVMGetGlobalContext() { return wrap(&*GlobalContext); }
void LLVMContextSetDiagnosticHandler(LLVMContextRef C,
LLVMDiagnosticHandler Handler,
void *DiagnosticContext) {
unwrap(C)->setDiagnosticHandler(
- LLVM_EXTENSION reinterpret_cast<LLVMContext::DiagnosticHandlerTy>(Handler),
+ LLVM_EXTENSION reinterpret_cast<LLVMContext::DiagnosticHandlerTy>(
+ Handler),
DiagnosticContext);
}
+LLVMDiagnosticHandler LLVMContextGetDiagnosticHandler(LLVMContextRef C) {
+ return LLVM_EXTENSION reinterpret_cast<LLVMDiagnosticHandler>(
+ unwrap(C)->getDiagnosticHandler());
+}
+
+void *LLVMContextGetDiagnosticContext(LLVMContextRef C) {
+ return unwrap(C)->getDiagnosticContext();
+}
+
void LLVMContextSetYieldCallback(LLVMContextRef C, LLVMYieldCallback Callback,
void *OpaqueHandle) {
auto YieldCallback =
@@ -100,15 +111,72 @@ void LLVMContextDispose(LLVMContextRef C) {
delete unwrap(C);
}
-unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char* Name,
+unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char *Name,
unsigned SLen) {
return unwrap(C)->getMDKindID(StringRef(Name, SLen));
}
-unsigned LLVMGetMDKindID(const char* Name, unsigned SLen) {
+unsigned LLVMGetMDKindID(const char *Name, unsigned SLen) {
return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen);
}
+#define GET_ATTR_KIND_FROM_NAME
+#include "AttributesCompatFunc.inc"
+
+unsigned LLVMGetEnumAttributeKindForName(const char *Name, size_t SLen) {
+ return getAttrKindFromName(StringRef(Name, SLen));
+}
+
+unsigned LLVMGetLastEnumAttributeKind(void) {
+ return Attribute::AttrKind::EndAttrKinds;
+}
+
+LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID,
+ uint64_t Val) {
+ return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val));
+}
+
+unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) {
+ return unwrap(A).getKindAsEnum();
+}
+
+uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A) {
+ auto Attr = unwrap(A);
+ if (Attr.isEnumAttribute())
+ return 0;
+ return Attr.getValueAsInt();
+}
+
+LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C,
+ const char *K, unsigned KLength,
+ const char *V, unsigned VLength) {
+ return wrap(Attribute::get(*unwrap(C), StringRef(K, KLength),
+ StringRef(V, VLength)));
+}
+
+const char *LLVMGetStringAttributeKind(LLVMAttributeRef A,
+ unsigned *Length) {
+ auto S = unwrap(A).getKindAsString();
+ *Length = S.size();
+ return S.data();
+}
+
+const char *LLVMGetStringAttributeValue(LLVMAttributeRef A,
+ unsigned *Length) {
+ auto S = unwrap(A).getValueAsString();
+ *Length = S.size();
+ return S.data();
+}
+
+LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A) {
+ auto Attr = unwrap(A);
+ return Attr.isEnumAttribute() || Attr.isIntAttribute();
+}
+
+LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A) {
+ return unwrap(A).isStringAttribute();
+}
+
char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) {
std::string MsgStorage;
raw_string_ostream Stream(MsgStorage);
@@ -120,7 +188,7 @@ char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) {
return LLVMCreateMessage(MsgStorage.c_str());
}
-LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI){
+LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI) {
LLVMDiagnosticSeverity severity;
switch(unwrap(DI)->getSeverity()) {
@@ -141,13 +209,10 @@ LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI){
return severity;
}
-
-
-
/*===-- Operations on modules ---------------------------------------------===*/
LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
- return wrap(new Module(ModuleID, getGlobalContext()));
+ return wrap(new Module(ModuleID, *GlobalContext));
}
LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
@@ -159,13 +224,28 @@ void LLVMDisposeModule(LLVMModuleRef M) {
delete unwrap(M);
}
+const char *LLVMGetModuleIdentifier(LLVMModuleRef M, size_t *Len) {
+ auto &Str = unwrap(M)->getModuleIdentifier();
+ *Len = Str.length();
+ return Str.c_str();
+}
+
+void LLVMSetModuleIdentifier(LLVMModuleRef M, const char *Ident, size_t Len) {
+ unwrap(M)->setModuleIdentifier(StringRef(Ident, Len));
+}
+
+
/*--.. Data layout .........................................................--*/
-const char * LLVMGetDataLayout(LLVMModuleRef M) {
+const char *LLVMGetDataLayoutStr(LLVMModuleRef M) {
return unwrap(M)->getDataLayoutStr().c_str();
}
-void LLVMSetDataLayout(LLVMModuleRef M, const char *Triple) {
- unwrap(M)->setDataLayout(Triple);
+const char *LLVMGetDataLayout(LLVMModuleRef M) {
+ return LLVMGetDataLayoutStr(M);
+}
+
+void LLVMSetDataLayout(LLVMModuleRef M, const char *DataLayoutStr) {
+ unwrap(M)->setDataLayout(DataLayoutStr);
}
/*--.. Target triple .......................................................--*/
@@ -537,6 +617,17 @@ LLVMTypeRef LLVMTypeOf(LLVMValueRef Val) {
return wrap(unwrap(Val)->getType());
}
+LLVMValueKind LLVMGetValueKind(LLVMValueRef Val) {
+ switch(unwrap(Val)->getValueID()) {
+#define HANDLE_VALUE(Name) \
+ case Value::Name##Val: \
+ return LLVM##Name##ValueKind;
+#include "llvm/IR/Value.def"
+ default:
+ return LLVMInstructionValueKind;
+ }
+}
+
const char *LLVMGetValueName(LLVMValueRef Val) {
return unwrap(Val)->getName().data();
}
@@ -719,8 +810,7 @@ LLVMBool LLVMIsUndef(LLVMValueRef Val) {
}
LLVMValueRef LLVMConstPointerNull(LLVMTypeRef Ty) {
- return
- wrap(ConstantPointerNull::get(unwrap<PointerType>(Ty)));
+ return wrap(ConstantPointerNull::get(unwrap<PointerType>(Ty)));
}
/*--.. Operations on metadata nodes ........................................--*/
@@ -767,26 +857,24 @@ LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count);
}
-const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
+const char *LLVMGetMDString(LLVMValueRef V, unsigned *Length) {
if (const auto *MD = dyn_cast<MetadataAsValue>(unwrap(V)))
if (const MDString *S = dyn_cast<MDString>(MD->getMetadata())) {
- *Len = S->getString().size();
+ *Length = S->getString().size();
return S->getString().data();
}
- *Len = 0;
+ *Length = 0;
return nullptr;
}
-unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V)
-{
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V) {
auto *MD = cast<MetadataAsValue>(unwrap(V));
if (isa<ValueAsMetadata>(MD->getMetadata()))
return 1;
return cast<MDNode>(MD->getMetadata())->getNumOperands();
}
-void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
-{
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest) {
auto *MD = cast<MetadataAsValue>(unwrap(V));
if (auto *MDV = dyn_cast<ValueAsMetadata>(MD->getMetadata())) {
*Dest = wrap(MDV->getValue());
@@ -799,17 +887,16 @@ void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
Dest[i] = getMDNodeOperandImpl(Context, N, i);
}
-unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
-{
- if (NamedMDNode *N = unwrap(M)->getNamedMetadata(name)) {
+unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char *Name) {
+ if (NamedMDNode *N = unwrap(M)->getNamedMetadata(Name)) {
return N->getNumOperands();
}
return 0;
}
-void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char* name, LLVMValueRef *Dest)
-{
- NamedMDNode *N = unwrap(M)->getNamedMetadata(name);
+void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name,
+ LLVMValueRef *Dest) {
+ NamedMDNode *N = unwrap(M)->getNamedMetadata(Name);
if (!N)
return;
LLVMContext &Context = unwrap(M)->getContext();
@@ -817,10 +904,9 @@ void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char* name, LLVMValueRe
Dest[i] = wrap(MetadataAsValue::get(Context, N->getOperand(i)));
}
-void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name,
- LLVMValueRef Val)
-{
- NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(name);
+void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
+ LLVMValueRef Val) {
+ NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(Name);
if (!N)
return;
if (!Val)
@@ -908,13 +994,6 @@ LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
DontNullTerminate == 0));
}
-LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
- LLVMValueRef *ConstantVals,
- unsigned Count, LLVMBool Packed) {
- Constant **Elements = unwrap<Constant>(ConstantVals, Count);
- return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count),
- Packed != 0));
-}
LLVMValueRef LLVMConstString(const char *Str, unsigned Length,
LLVMBool DontNullTerminate) {
@@ -922,18 +1001,18 @@ LLVMValueRef LLVMConstString(const char *Str, unsigned Length,
DontNullTerminate);
}
-LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef c, unsigned idx) {
- return wrap(static_cast<ConstantDataSequential*>(unwrap(c))->getElementAsConstant(idx));
+LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx) {
+ return wrap(unwrap<ConstantDataSequential>(C)->getElementAsConstant(idx));
}
-LLVMBool LLVMIsConstantString(LLVMValueRef c) {
- return static_cast<ConstantDataSequential*>(unwrap(c))->isString();
+LLVMBool LLVMIsConstantString(LLVMValueRef C) {
+ return unwrap<ConstantDataSequential>(C)->isString();
}
-const char *LLVMGetAsString(LLVMValueRef c, size_t* Length) {
- StringRef str = static_cast<ConstantDataSequential*>(unwrap(c))->getAsString();
- *Length = str.size();
- return str.data();
+const char *LLVMGetAsString(LLVMValueRef C, size_t *Length) {
+ StringRef Str = unwrap<ConstantDataSequential>(C)->getAsString();
+ *Length = Str.size();
+ return Str.data();
}
LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
@@ -942,6 +1021,14 @@ LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length), V));
}
+LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
+ LLVMValueRef *ConstantVals,
+ unsigned Count, LLVMBool Packed) {
+ Constant **Elements = unwrap<Constant>(ConstantVals, Count);
+ return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count),
+ Packed != 0));
+}
+
LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count,
LLVMBool Packed) {
return LLVMConstStructInContext(LLVMGetGlobalContext(), ConstantVals, Count,
@@ -1441,7 +1528,9 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
}
const char *LLVMGetSection(LLVMValueRef Global) {
- return unwrap<GlobalValue>(Global)->getSection();
+ // Using .data() is safe because of how GlobalObject::setSection is
+ // implemented.
+ return unwrap<GlobalValue>(Global)->getSection().data();
}
void LLVMSetSection(LLVMValueRef Global, const char *Section) {
@@ -1469,11 +1558,13 @@ void LLVMSetDLLStorageClass(LLVMValueRef Global, LLVMDLLStorageClass Class) {
}
LLVMBool LLVMHasUnnamedAddr(LLVMValueRef Global) {
- return unwrap<GlobalValue>(Global)->hasUnnamedAddr();
+ return unwrap<GlobalValue>(Global)->hasGlobalUnnamedAddr();
}
void LLVMSetUnnamedAddr(LLVMValueRef Global, LLVMBool HasUnnamedAddr) {
- unwrap<GlobalValue>(Global)->setUnnamedAddr(HasUnnamedAddr);
+ unwrap<GlobalValue>(Global)->setUnnamedAddr(
+ HasUnnamedAddr ? GlobalValue::UnnamedAddr::Global
+ : GlobalValue::UnnamedAddr::None);
}
/*--.. Operations on global variables, load and store instructions .........--*/
@@ -1697,6 +1788,10 @@ void LLVMDeleteFunction(LLVMValueRef Fn) {
unwrap<Function>(Fn)->eraseFromParent();
}
+LLVMBool LLVMHasPersonalityFn(LLVMValueRef Fn) {
+ return unwrap<Function>(Fn)->hasPersonalityFn();
+}
+
LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn) {
return wrap(unwrap<Function>(Fn)->getPersonalityFn());
}
@@ -1744,6 +1839,34 @@ void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
Func->setAttributes(PALnew);
}
+void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
+ LLVMAttributeRef A) {
+ unwrap<Function>(F)->addAttribute(Idx, unwrap(A));
+}
+
+LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F,
+ LLVMAttributeIndex Idx,
+ unsigned KindID) {
+ return wrap(unwrap<Function>(F)->getAttribute(Idx,
+ (Attribute::AttrKind)KindID));
+}
+
+LLVMAttributeRef LLVMGetStringAttributeAtIndex(LLVMValueRef F,
+ LLVMAttributeIndex Idx,
+ const char *K, unsigned KLen) {
+ return wrap(unwrap<Function>(F)->getAttribute(Idx, StringRef(K, KLen)));
+}
+
+void LLVMRemoveEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
+ unsigned KindID) {
+ unwrap<Function>(F)->removeAttribute(Idx, (Attribute::AttrKind)KindID);
+}
+
+void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
+ const char *K, unsigned KLen) {
+ unwrap<Function>(F)->removeAttribute(Idx, StringRef(K, KLen));
+}
+
void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
const char *V) {
Function *Func = unwrap<Function>(Fn);
@@ -1849,7 +1972,6 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
Raw(A->getArgNo()+1);
}
-
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
Argument *A = unwrap<Argument>(Arg);
AttrBuilder B;
@@ -1871,6 +1993,10 @@ LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val) {
return wrap(unwrap<BasicBlock>(Val));
}
+const char *LLVMGetBasicBlockName(LLVMBasicBlockRef BB) {
+ return unwrap(BB)->getName().data();
+}
+
LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) {
return wrap(unwrap(BB)->getParent());
}
@@ -1885,8 +2011,8 @@ unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) {
void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){
Function *Fn = unwrap<Function>(FnRef);
- for (Function::iterator I = Fn->begin(), E = Fn->end(); I != E; I++)
- *BasicBlocksRefs++ = wrap(&*I);
+ for (BasicBlock &BB : *Fn)
+ *BasicBlocksRefs++ = wrap(&BB);
}
LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) {
@@ -2001,6 +2127,10 @@ LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) {
return wrap(&*--I);
}
+void LLVMInstructionRemoveFromParent(LLVMValueRef Inst) {
+ unwrap<Instruction>(Inst)->removeFromParent();
+}
+
void LLVMInstructionEraseFromParent(LLVMValueRef Inst) {
unwrap<Instruction>(Inst)->eraseFromParent();
}
@@ -2037,22 +2167,17 @@ LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
/*--.. Call and invoke instructions ........................................--*/
+unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
+ return CallSite(unwrap<Instruction>(Instr)).getNumArgOperands();
+}
+
unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
- Value *V = unwrap(Instr);
- if (CallInst *CI = dyn_cast<CallInst>(V))
- return CI->getCallingConv();
- if (InvokeInst *II = dyn_cast<InvokeInst>(V))
- return II->getCallingConv();
- llvm_unreachable("LLVMGetInstructionCallConv applies only to call and invoke!");
+ return CallSite(unwrap<Instruction>(Instr)).getCallingConv();
}
void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
- Value *V = unwrap(Instr);
- if (CallInst *CI = dyn_cast<CallInst>(V))
- return CI->setCallingConv(static_cast<CallingConv::ID>(CC));
- else if (InvokeInst *II = dyn_cast<InvokeInst>(V))
- return II->setCallingConv(static_cast<CallingConv::ID>(CC));
- llvm_unreachable("LLVMSetInstructionCallConv applies only to call and invoke!");
+ return CallSite(unwrap<Instruction>(Instr))
+ .setCallingConv(static_cast<CallingConv::ID>(CC));
}
void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
@@ -2086,6 +2211,40 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
index, B)));
}
+void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
+ LLVMAttributeRef A) {
+ CallSite(unwrap<Instruction>(C)).addAttribute(Idx, unwrap(A));
+}
+
+LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C,
+ LLVMAttributeIndex Idx,
+ unsigned KindID) {
+ return wrap(CallSite(unwrap<Instruction>(C))
+ .getAttribute(Idx, (Attribute::AttrKind)KindID));
+}
+
+LLVMAttributeRef LLVMGetCallSiteStringAttribute(LLVMValueRef C,
+ LLVMAttributeIndex Idx,
+ const char *K, unsigned KLen) {
+ return wrap(CallSite(unwrap<Instruction>(C))
+ .getAttribute(Idx, StringRef(K, KLen)));
+}
+
+void LLVMRemoveCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
+ unsigned KindID) {
+ CallSite(unwrap<Instruction>(C))
+ .removeAttribute(Idx, (Attribute::AttrKind)KindID);
+}
+
+void LLVMRemoveCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
+ const char *K, unsigned KLen) {
+ CallSite(unwrap<Instruction>(C)).removeAttribute(Idx, StringRef(K, KLen));
+}
+
+LLVMValueRef LLVMGetCalledValue(LLVMValueRef Instr) {
+ return wrap(CallSite(unwrap<Instruction>(Instr)).getCalledValue());
+}
+
/*--.. Operations on call instructions (only) ..............................--*/
LLVMBool LLVMIsTailCall(LLVMValueRef Call) {
@@ -2096,6 +2255,24 @@ void LLVMSetTailCall(LLVMValueRef Call, LLVMBool isTailCall) {
unwrap<CallInst>(Call)->setTailCall(isTailCall);
}
+/*--.. Operations on invoke instructions (only) ............................--*/
+
+LLVMBasicBlockRef LLVMGetNormalDest(LLVMValueRef Invoke) {
+ return wrap(unwrap<InvokeInst>(Invoke)->getNormalDest());
+}
+
+LLVMBasicBlockRef LLVMGetUnwindDest(LLVMValueRef Invoke) {
+ return wrap(unwrap<InvokeInst>(Invoke)->getUnwindDest());
+}
+
+void LLVMSetNormalDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
+ unwrap<InvokeInst>(Invoke)->setNormalDest(unwrap(B));
+}
+
+void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
+ unwrap<InvokeInst>(Invoke)->setUnwindDest(unwrap(B));
+}
+
/*--.. Operations on terminators ...........................................--*/
unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
@@ -2130,6 +2307,22 @@ LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) {
return wrap(unwrap<SwitchInst>(Switch)->getDefaultDest());
}
+/*--.. Operations on alloca instructions (only) ............................--*/
+
+LLVMTypeRef LLVMGetAllocatedType(LLVMValueRef Alloca) {
+ return wrap(unwrap<AllocaInst>(Alloca)->getAllocatedType());
+}
+
+/*--.. Operations on gep instructions (only) ...............................--*/
+
+LLVMBool LLVMIsInBounds(LLVMValueRef GEP) {
+ return unwrap<GetElementPtrInst>(GEP)->isInBounds();
+}
+
+void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds) {
+ return unwrap<GetElementPtrInst>(GEP)->setIsInBounds(InBounds);
+}
+
/*--.. Operations on phi nodes .............................................--*/
void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
@@ -2151,6 +2344,30 @@ LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) {
return wrap(unwrap<PHINode>(PhiNode)->getIncomingBlock(Index));
}
+/*--.. Operations on extractvalue and insertvalue nodes ....................--*/
+
+unsigned LLVMGetNumIndices(LLVMValueRef Inst) {
+ auto *I = unwrap(Inst);
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return GEP->getNumIndices();
+ if (auto *EV = dyn_cast<ExtractValueInst>(I))
+ return EV->getNumIndices();
+ if (auto *IV = dyn_cast<InsertValueInst>(I))
+ return IV->getNumIndices();
+ llvm_unreachable(
+ "LLVMGetNumIndices applies only to extractvalue and insertvalue!");
+}
+
+const unsigned *LLVMGetIndices(LLVMValueRef Inst) {
+ auto *I = unwrap(Inst);
+ if (auto *EV = dyn_cast<ExtractValueInst>(I))
+ return EV->getIndices().data();
+ if (auto *IV = dyn_cast<InsertValueInst>(I))
+ return IV->getIndices().data();
+ llvm_unreachable(
+ "LLVMGetIndices applies only to extractvalue and insertvalue!");
+}
+
/*===-- Instruction builders ----------------------------------------------===*/
@@ -2291,11 +2508,23 @@ void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest) {
unwrap<IndirectBrInst>(IndirectBr)->addDestination(unwrap(Dest));
}
+unsigned LLVMGetNumClauses(LLVMValueRef LandingPad) {
+ return unwrap<LandingPadInst>(LandingPad)->getNumClauses();
+}
+
+LLVMValueRef LLVMGetClause(LLVMValueRef LandingPad, unsigned Idx) {
+ return wrap(unwrap<LandingPadInst>(LandingPad)->getClause(Idx));
+}
+
void LLVMAddClause(LLVMValueRef LandingPad, LLVMValueRef ClauseVal) {
unwrap<LandingPadInst>(LandingPad)->
addClause(cast<Constant>(unwrap(ClauseVal)));
}
+LLVMBool LLVMIsCleanup(LLVMValueRef LandingPad) {
+ return unwrap<LandingPadInst>(LandingPad)->isCleanup();
+}
+
void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) {
unwrap<LandingPadInst>(LandingPad)->setCleanup(Val);
}
@@ -2507,14 +2736,15 @@ LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) {
switch (Ordering) {
- case LLVMAtomicOrderingNotAtomic: return NotAtomic;
- case LLVMAtomicOrderingUnordered: return Unordered;
- case LLVMAtomicOrderingMonotonic: return Monotonic;
- case LLVMAtomicOrderingAcquire: return Acquire;
- case LLVMAtomicOrderingRelease: return Release;
- case LLVMAtomicOrderingAcquireRelease: return AcquireRelease;
+ case LLVMAtomicOrderingNotAtomic: return AtomicOrdering::NotAtomic;
+ case LLVMAtomicOrderingUnordered: return AtomicOrdering::Unordered;
+ case LLVMAtomicOrderingMonotonic: return AtomicOrdering::Monotonic;
+ case LLVMAtomicOrderingAcquire: return AtomicOrdering::Acquire;
+ case LLVMAtomicOrderingRelease: return AtomicOrdering::Release;
+ case LLVMAtomicOrderingAcquireRelease:
+ return AtomicOrdering::AcquireRelease;
case LLVMAtomicOrderingSequentiallyConsistent:
- return SequentiallyConsistent;
+ return AtomicOrdering::SequentiallyConsistent;
}
llvm_unreachable("Invalid LLVMAtomicOrdering value!");
@@ -2522,13 +2752,14 @@ static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) {
static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) {
switch (Ordering) {
- case NotAtomic: return LLVMAtomicOrderingNotAtomic;
- case Unordered: return LLVMAtomicOrderingUnordered;
- case Monotonic: return LLVMAtomicOrderingMonotonic;
- case Acquire: return LLVMAtomicOrderingAcquire;
- case Release: return LLVMAtomicOrderingRelease;
- case AcquireRelease: return LLVMAtomicOrderingAcquireRelease;
- case SequentiallyConsistent:
+ case AtomicOrdering::NotAtomic: return LLVMAtomicOrderingNotAtomic;
+ case AtomicOrdering::Unordered: return LLVMAtomicOrderingUnordered;
+ case AtomicOrdering::Monotonic: return LLVMAtomicOrderingMonotonic;
+ case AtomicOrdering::Acquire: return LLVMAtomicOrderingAcquire;
+ case AtomicOrdering::Release: return LLVMAtomicOrderingRelease;
+ case AtomicOrdering::AcquireRelease:
+ return LLVMAtomicOrderingAcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
return LLVMAtomicOrderingSequentiallyConsistent;
}
@@ -2824,6 +3055,61 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
mapFromLLVMOrdering(ordering), singleThread ? SingleThread : CrossThread));
}
+LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
+ LLVMValueRef Cmp, LLVMValueRef New,
+ LLVMAtomicOrdering SuccessOrdering,
+ LLVMAtomicOrdering FailureOrdering,
+ LLVMBool singleThread) {
+
+ return wrap(unwrap(B)->CreateAtomicCmpXchg(unwrap(Ptr), unwrap(Cmp),
+ unwrap(New), mapFromLLVMOrdering(SuccessOrdering),
+ mapFromLLVMOrdering(FailureOrdering),
+ singleThread ? SingleThread : CrossThread));
+}
+
+
+LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) {
+ Value *P = unwrap<Value>(AtomicInst);
+
+ if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
+ return I->getSynchScope() == SingleThread;
+ return cast<AtomicCmpXchgInst>(P)->getSynchScope() == SingleThread;
+}
+
+void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool NewValue) {
+ Value *P = unwrap<Value>(AtomicInst);
+ SynchronizationScope Sync = NewValue ? SingleThread : CrossThread;
+
+ if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
+ return I->setSynchScope(Sync);
+ return cast<AtomicCmpXchgInst>(P)->setSynchScope(Sync);
+}
+
+LLVMAtomicOrdering LLVMGetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst) {
+ Value *P = unwrap<Value>(CmpXchgInst);
+ return mapToLLVMOrdering(cast<AtomicCmpXchgInst>(P)->getSuccessOrdering());
+}
+
+void LLVMSetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst,
+ LLVMAtomicOrdering Ordering) {
+ Value *P = unwrap<Value>(CmpXchgInst);
+ AtomicOrdering O = mapFromLLVMOrdering(Ordering);
+
+ return cast<AtomicCmpXchgInst>(P)->setSuccessOrdering(O);
+}
+
+LLVMAtomicOrdering LLVMGetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst) {
+ Value *P = unwrap<Value>(CmpXchgInst);
+ return mapToLLVMOrdering(cast<AtomicCmpXchgInst>(P)->getFailureOrdering());
+}
+
+void LLVMSetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst,
+ LLVMAtomicOrdering Ordering) {
+ Value *P = unwrap<Value>(CmpXchgInst);
+ AtomicOrdering O = mapFromLLVMOrdering(Ordering);
+
+ return cast<AtomicCmpXchgInst>(P)->setFailureOrdering(O);
+}
/*===-- Module providers --------------------------------------------------===*/
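
The Core.cpp changes above add the opaque attribute entry points (LLVMGetEnumAttributeKindForName, LLVMCreateEnumAttribute, LLVMAddAttributeAtIndex, LLVMGetEnumAttributeAtIndex, and the call-site variants). A hedged client-side sketch, compiled as C++ against llvm-c/Core.h, is below; LLVMAttributeFunctionIndex and the module/function helpers are assumed from the existing C API headers rather than defined in this diff.

// attr_sketch.cpp -- illustrative client of the new attribute C API.
#include "llvm-c/Core.h"
#include <cstdio>

int main() {
  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMModuleRef M = LLVMModuleCreateWithNameInContext("demo", Ctx);
  LLVMTypeRef FnTy = LLVMFunctionType(LLVMVoidTypeInContext(Ctx), nullptr, 0, 0);
  LLVMValueRef F = LLVMAddFunction(M, "f", FnTy);

  // Resolve the enum attribute kind by name, build it, and attach it to the
  // function itself (LLVMAttributeFunctionIndex targets the function).
  unsigned Kind = LLVMGetEnumAttributeKindForName("nounwind", 8);
  LLVMAttributeRef A = LLVMCreateEnumAttribute(Ctx, Kind, 0);
  LLVMAddAttributeAtIndex(F, LLVMAttributeFunctionIndex, A);

  // Read it back through the new accessors.
  LLVMAttributeRef Back = LLVMGetEnumAttributeAtIndex(F, LLVMAttributeFunctionIndex, Kind);
  std::printf("kind=%u value=%llu\n", LLVMGetEnumAttributeKind(Back),
              (unsigned long long)LLVMGetEnumAttributeValue(Back));

  LLVMDisposeModule(M);
  LLVMContextDispose(Ctx);
  return 0;
}
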
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index b7841fe2b85c..01b47f386e1c 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -19,44 +19,11 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
+#include "LLVMContextImpl.h"
using namespace llvm;
using namespace llvm::dwarf;
-namespace {
-class HeaderBuilder {
- /// \brief Whether there are any fields yet.
- ///
- /// Note that this is not equivalent to \c Chars.empty(), since \a concat()
- /// may have been called already with an empty string.
- bool IsEmpty;
- SmallVector<char, 256> Chars;
-
-public:
- HeaderBuilder() : IsEmpty(true) {}
- HeaderBuilder(const HeaderBuilder &X) : IsEmpty(X.IsEmpty), Chars(X.Chars) {}
- HeaderBuilder(HeaderBuilder &&X)
- : IsEmpty(X.IsEmpty), Chars(std::move(X.Chars)) {}
-
- template <class Twineable> HeaderBuilder &concat(Twineable &&X) {
- if (IsEmpty)
- IsEmpty = false;
- else
- Chars.push_back(0);
- Twine(X).toVector(Chars);
- return *this;
- }
-
- MDString *get(LLVMContext &Context) const {
- return MDString::get(Context, StringRef(Chars.begin(), Chars.size()));
- }
-
- static HeaderBuilder get(unsigned Tag) {
- return HeaderBuilder().concat("0x" + Twine::utohexstr(Tag));
- }
-};
-}
-
DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes)
: M(m), VMContext(M.getContext()), CUNode(nullptr),
DeclareFn(nullptr), ValueFn(nullptr),
@@ -95,17 +62,25 @@ void DIBuilder::finalize() {
CUNode->replaceRetainedTypes(MDTuple::get(VMContext, RetainValues));
DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms);
- if (!AllSubprograms.empty())
- CUNode->replaceSubprograms(SPs.get());
-
- for (auto *SP : SPs) {
- if (MDTuple *Temp = SP->getVariables().get()) {
- const auto &PV = PreservedVariables.lookup(SP);
- SmallVector<Metadata *, 4> Variables(PV.begin(), PV.end());
- DINodeArray AV = getOrCreateArray(Variables);
- TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
- }
- }
+ auto resolveVariables = [&](DISubprogram *SP) {
+ MDTuple *Temp = SP->getVariables().get();
+ if (!Temp)
+ return;
+
+ SmallVector<Metadata *, 4> Variables;
+
+ auto PV = PreservedVariables.find(SP);
+ if (PV != PreservedVariables.end())
+ Variables.append(PV->second.begin(), PV->second.end());
+
+ DINodeArray AV = getOrCreateArray(Variables);
+ TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
+ };
+ for (auto *SP : SPs)
+ resolveVariables(SP);
+ for (auto *N : RetainValues)
+ if (auto *SP = dyn_cast<DISubprogram>(N))
+ resolveVariables(SP);
if (!AllGVs.empty())
CUNode->replaceGlobalVariables(MDTuple::get(VMContext, AllGVs));
@@ -136,7 +111,7 @@ static DIScope *getNonCompileUnitScope(DIScope *N) {
DICompileUnit *DIBuilder::createCompileUnit(
unsigned Lang, StringRef Filename, StringRef Directory, StringRef Producer,
bool isOptimized, StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
- DebugEmissionKind Kind, uint64_t DWOId, bool EmitDebugInfo) {
+ DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId) {
assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
(Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -147,19 +122,12 @@ DICompileUnit *DIBuilder::createCompileUnit(
assert(!CUNode && "Can only make one compile unit per DIBuilder instance");
CUNode = DICompileUnit::getDistinct(
VMContext, Lang, DIFile::get(VMContext, Filename, Directory), Producer,
- isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr,
- nullptr, nullptr, nullptr, nullptr, nullptr, DWOId);
+ isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr, nullptr,
+ nullptr, nullptr, nullptr, DWOId);
// Create a named metadata so that it is easier to find cu in a module.
- // Note that we only generate this when the caller wants to actually
- // emit debug information. When we are only interested in tracking
- // source line locations throughout the backend, we prevent codegen from
- // emitting debug info in the final output by not generating llvm.dbg.cu.
- if (EmitDebugInfo) {
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
- NMD->addOperand(CUNode);
- }
-
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->addOperand(CUNode);
trackIfUnresolved(CUNode);
return CUNode;
}
@@ -168,8 +136,12 @@ static DIImportedEntity *
createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context,
Metadata *NS, unsigned Line, StringRef Name,
SmallVectorImpl<TrackingMDNodeRef> &AllImportedModules) {
+ unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size();
auto *M = DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), Line, Name);
- AllImportedModules.emplace_back(M);
+ if (EntitiesCount < C.pImpl->DIImportedEntitys.size())
+ // A new Imported Entity was just added to the context.
+ // Add it to the Imported Modules list.
+ AllImportedModules.emplace_back(M);
return M;
}
@@ -200,8 +172,7 @@ DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context,
// Make sure to use the unique identifier based metadata reference for
// types that have one.
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration,
- Context, DINodeRef::get(Decl), Line, Name,
- AllImportedModules);
+ Context, Decl, Line, Name, AllImportedModules);
}
DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory) {
@@ -231,8 +202,8 @@ DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
}
DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
- return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr,
- DITypeRef::get(FromTy), 0, 0, 0, 0);
+ return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
+ 0, 0, 0);
}
DIDerivedType *DIBuilder::createPointerType(DIType *PointeeTy,
@@ -241,43 +212,41 @@ DIDerivedType *DIBuilder::createPointerType(DIType *PointeeTy,
StringRef Name) {
// FIXME: Why is there a name here?
return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
- nullptr, 0, nullptr, DITypeRef::get(PointeeTy),
- SizeInBits, AlignInBits, 0, 0);
+ nullptr, 0, nullptr, PointeeTy, SizeInBits,
+ AlignInBits, 0, 0);
}
DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
DIType *Base,
uint64_t SizeInBits,
- uint64_t AlignInBits) {
+ uint64_t AlignInBits,
+ unsigned Flags) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
- nullptr, 0, nullptr, DITypeRef::get(PointeeTy),
- SizeInBits, AlignInBits, 0, 0,
- DITypeRef::get(Base));
+ nullptr, 0, nullptr, PointeeTy, SizeInBits,
+ AlignInBits, 0, Flags, Base);
}
DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy,
uint64_t SizeInBits,
uint64_t AlignInBits) {
assert(RTy && "Unable to create reference type");
- return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr,
- DITypeRef::get(RTy), SizeInBits, AlignInBits, 0, 0);
+ return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
+ SizeInBits, AlignInBits, 0, 0);
}
DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
DIFile *File, unsigned LineNo,
DIScope *Context) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
- LineNo,
- DIScopeRef::get(getNonCompileUnitScope(Context)),
- DITypeRef::get(Ty), 0, 0, 0, 0);
+ LineNo, getNonCompileUnitScope(Context), Ty, 0, 0,
+ 0, 0);
}
DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
assert(Ty && "Invalid type!");
assert(FriendTy && "Invalid friend type!");
- return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0,
- DITypeRef::get(Ty), DITypeRef::get(FriendTy), 0, 0,
- 0, 0);
+ return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
+ FriendTy, 0, 0, 0, 0);
}
DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -285,8 +254,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
unsigned Flags) {
assert(Ty && "Unable to create inheritance");
return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
- 0, DITypeRef::get(Ty), DITypeRef::get(BaseTy), 0, 0,
- BaseOffset, Flags);
+ 0, Ty, BaseTy, 0, 0, BaseOffset, Flags);
}
DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
@@ -295,10 +263,9 @@ DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
uint64_t AlignInBits,
uint64_t OffsetInBits,
unsigned Flags, DIType *Ty) {
- return DIDerivedType::get(
- VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Scope)), DITypeRef::get(Ty),
- SizeInBits, AlignInBits, OffsetInBits, Flags);
+ return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
+ LineNumber, getNonCompileUnitScope(Scope), Ty,
+ SizeInBits, AlignInBits, OffsetInBits, Flags);
}
static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -307,16 +274,27 @@ static ConstantAsMetadata *getConstantOrNull(Constant *C) {
return nullptr;
}
+DIDerivedType *DIBuilder::createBitFieldMemberType(
+ DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
+ uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+ uint64_t StorageOffsetInBits, unsigned Flags, DIType *Ty) {
+ Flags |= DINode::FlagBitField;
+ return DIDerivedType::get(
+ VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
+ getNonCompileUnitScope(Scope), Ty, SizeInBits, AlignInBits, OffsetInBits,
+ Flags, ConstantAsMetadata::get(ConstantInt::get(
+ IntegerType::get(VMContext, 64), StorageOffsetInBits)));
+}
+
DIDerivedType *DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name,
DIFile *File,
unsigned LineNumber,
DIType *Ty, unsigned Flags,
llvm::Constant *Val) {
Flags |= DINode::FlagStaticMember;
- return DIDerivedType::get(
- VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Scope)), DITypeRef::get(Ty), 0, 0,
- 0, Flags, getConstantOrNull(Val));
+ return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
+ LineNumber, getNonCompileUnitScope(Scope), Ty, 0, 0,
+ 0, Flags, getConstantOrNull(Val));
}
DIDerivedType *DIBuilder::createObjCIVar(StringRef Name, DIFile *File,
@@ -325,10 +303,10 @@ DIDerivedType *DIBuilder::createObjCIVar(StringRef Name, DIFile *File,
uint64_t AlignInBits,
uint64_t OffsetInBits, unsigned Flags,
DIType *Ty, MDNode *PropertyNode) {
- return DIDerivedType::get(
- VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(File)), DITypeRef::get(Ty),
- SizeInBits, AlignInBits, OffsetInBits, Flags, PropertyNode);
+ return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
+ LineNumber, getNonCompileUnitScope(File), Ty,
+ SizeInBits, AlignInBits, OffsetInBits, Flags,
+ PropertyNode);
}
DIObjCProperty *
@@ -336,15 +314,14 @@ DIBuilder::createObjCProperty(StringRef Name, DIFile *File, unsigned LineNumber,
StringRef GetterName, StringRef SetterName,
unsigned PropertyAttributes, DIType *Ty) {
return DIObjCProperty::get(VMContext, Name, File, LineNumber, GetterName,
- SetterName, PropertyAttributes,
- DITypeRef::get(Ty));
+ SetterName, PropertyAttributes, Ty);
}
DITemplateTypeParameter *
DIBuilder::createTemplateTypeParameter(DIScope *Context, StringRef Name,
DIType *Ty) {
assert((!Context || isa<DICompileUnit>(Context)) && "Expected compile unit");
- return DITemplateTypeParameter::get(VMContext, Name, DITypeRef::get(Ty));
+ return DITemplateTypeParameter::get(VMContext, Name, Ty);
}
static DITemplateValueParameter *
@@ -352,8 +329,7 @@ createTemplateValueParameterHelper(LLVMContext &VMContext, unsigned Tag,
DIScope *Context, StringRef Name, DIType *Ty,
Metadata *MD) {
assert((!Context || isa<DICompileUnit>(Context)) && "Expected compile unit");
- return DITemplateValueParameter::get(VMContext, Tag, Name, DITypeRef::get(Ty),
- MD);
+ return DITemplateValueParameter::get(VMContext, Tag, Name, Ty, MD);
}
DITemplateValueParameter *
@@ -390,12 +366,9 @@ DICompositeType *DIBuilder::createClassType(
auto *R = DICompositeType::get(
VMContext, dwarf::DW_TAG_structure_type, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Context)),
- DITypeRef::get(DerivedFrom), SizeInBits, AlignInBits, OffsetInBits, Flags,
- Elements, 0, DITypeRef::get(VTableHolder),
+ getNonCompileUnitScope(Context), DerivedFrom, SizeInBits, AlignInBits,
+ OffsetInBits, Flags, Elements, 0, VTableHolder,
cast_or_null<MDTuple>(TemplateParams), UniqueIdentifier);
- if (!UniqueIdentifier.empty())
- retainType(R);
trackIfUnresolved(R);
return R;
}
@@ -407,11 +380,8 @@ DICompositeType *DIBuilder::createStructType(
DIType *VTableHolder, StringRef UniqueIdentifier) {
auto *R = DICompositeType::get(
VMContext, dwarf::DW_TAG_structure_type, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Context)),
- DITypeRef::get(DerivedFrom), SizeInBits, AlignInBits, 0, Flags, Elements,
- RunTimeLang, DITypeRef::get(VTableHolder), nullptr, UniqueIdentifier);
- if (!UniqueIdentifier.empty())
- retainType(R);
+ getNonCompileUnitScope(Context), DerivedFrom, SizeInBits, AlignInBits, 0,
+ Flags, Elements, RunTimeLang, VTableHolder, nullptr, UniqueIdentifier);
trackIfUnresolved(R);
return R;
}
@@ -422,30 +392,23 @@ DICompositeType *DIBuilder::createUnionType(
DINodeArray Elements, unsigned RunTimeLang, StringRef UniqueIdentifier) {
auto *R = DICompositeType::get(
VMContext, dwarf::DW_TAG_union_type, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Scope)), nullptr, SizeInBits,
- AlignInBits, 0, Flags, Elements, RunTimeLang, nullptr, nullptr,
- UniqueIdentifier);
- if (!UniqueIdentifier.empty())
- retainType(R);
+ getNonCompileUnitScope(Scope), nullptr, SizeInBits, AlignInBits, 0, Flags,
+ Elements, RunTimeLang, nullptr, nullptr, UniqueIdentifier);
trackIfUnresolved(R);
return R;
}
DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
- unsigned Flags) {
- return DISubroutineType::get(VMContext, Flags, ParameterTypes);
+ unsigned Flags, unsigned CC) {
+ return DISubroutineType::get(VMContext, Flags, CC, ParameterTypes);
}
DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File,
StringRef UniqueIdentifier) {
assert(!UniqueIdentifier.empty() && "external type ref without uid");
- auto *CTy =
- DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr, 0,
- 0, 0, DINode::FlagExternalTypeRef, nullptr, 0,
- nullptr, nullptr, UniqueIdentifier);
- // Types with unique IDs need to be in the type map.
- retainType(CTy);
- return CTy;
+ return DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr,
+ 0, 0, 0, DINode::FlagExternalTypeRef, nullptr, 0,
+ nullptr, nullptr, UniqueIdentifier);
}
DICompositeType *DIBuilder::createEnumerationType(
@@ -454,12 +417,9 @@ DICompositeType *DIBuilder::createEnumerationType(
DIType *UnderlyingType, StringRef UniqueIdentifier) {
auto *CTy = DICompositeType::get(
VMContext, dwarf::DW_TAG_enumeration_type, Name, File, LineNumber,
- DIScopeRef::get(getNonCompileUnitScope(Scope)),
- DITypeRef::get(UnderlyingType), SizeInBits, AlignInBits, 0, 0, Elements,
- 0, nullptr, nullptr, UniqueIdentifier);
+ getNonCompileUnitScope(Scope), UnderlyingType, SizeInBits, AlignInBits, 0,
+ 0, Elements, 0, nullptr, nullptr, UniqueIdentifier);
AllEnumTypes.push_back(CTy);
- if (!UniqueIdentifier.empty())
- retainType(CTy);
trackIfUnresolved(CTy);
return CTy;
}
@@ -468,8 +428,8 @@ DICompositeType *DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
DIType *Ty,
DINodeArray Subscripts) {
auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "",
- nullptr, 0, nullptr, DITypeRef::get(Ty), Size,
- AlignInBits, 0, 0, Subscripts, 0, nullptr);
+ nullptr, 0, nullptr, Ty, Size, AlignInBits, 0,
+ 0, Subscripts, 0, nullptr);
trackIfUnresolved(R);
return R;
}
@@ -477,10 +437,9 @@ DICompositeType *DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
DICompositeType *DIBuilder::createVectorType(uint64_t Size,
uint64_t AlignInBits, DIType *Ty,
DINodeArray Subscripts) {
- auto *R =
- DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "", nullptr, 0,
- nullptr, DITypeRef::get(Ty), Size, AlignInBits, 0,
- DINode::FlagVector, Subscripts, 0, nullptr);
+ auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "",
+ nullptr, 0, nullptr, Ty, Size, AlignInBits, 0,
+ DINode::FlagVector, Subscripts, 0, nullptr);
trackIfUnresolved(R);
return R;
}
@@ -507,8 +466,11 @@ DIType *DIBuilder::createObjectPointerType(DIType *Ty) {
return createTypeWithFlags(VMContext, Ty, Flags);
}
-void DIBuilder::retainType(DIType *T) {
+void DIBuilder::retainType(DIScope *T) {
assert(T && "Expected non-null type");
+ assert((isa<DIType>(T) || (isa<DISubprogram>(T) &&
+ cast<DISubprogram>(T)->isDefinition() == false)) &&
+ "Expected type or subprogram declaration");
AllRetainTypes.emplace_back(T);
}
@@ -522,12 +484,9 @@ DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIScope *Scope,
// FIXME: Define in terms of createReplaceableForwardDecl() by calling
// replaceWithUniqued().
auto *RetTy = DICompositeType::get(
- VMContext, Tag, Name, F, Line,
- DIScopeRef::get(getNonCompileUnitScope(Scope)), nullptr, SizeInBits,
- AlignInBits, 0, DINode::FlagFwdDecl, nullptr, RuntimeLang, nullptr,
- nullptr, UniqueIdentifier);
- if (!UniqueIdentifier.empty())
- retainType(RetTy);
+ VMContext, Tag, Name, F, Line, getNonCompileUnitScope(Scope), nullptr,
+ SizeInBits, AlignInBits, 0, DINode::FlagFwdDecl, nullptr, RuntimeLang,
+ nullptr, nullptr, UniqueIdentifier);
trackIfUnresolved(RetTy);
return RetTy;
}
@@ -536,14 +495,12 @@ DICompositeType *DIBuilder::createReplaceableCompositeType(
unsigned Tag, StringRef Name, DIScope *Scope, DIFile *F, unsigned Line,
unsigned RuntimeLang, uint64_t SizeInBits, uint64_t AlignInBits,
unsigned Flags, StringRef UniqueIdentifier) {
- auto *RetTy = DICompositeType::getTemporary(
- VMContext, Tag, Name, F, Line,
- DIScopeRef::get(getNonCompileUnitScope(Scope)), nullptr,
- SizeInBits, AlignInBits, 0, Flags, nullptr, RuntimeLang,
- nullptr, nullptr, UniqueIdentifier)
- .release();
- if (!UniqueIdentifier.empty())
- retainType(RetTy);
+ auto *RetTy =
+ DICompositeType::getTemporary(
+ VMContext, Tag, Name, F, Line, getNonCompileUnitScope(Scope), nullptr,
+ SizeInBits, AlignInBits, 0, Flags, nullptr, RuntimeLang, nullptr,
+ nullptr, UniqueIdentifier)
+ .release();
trackIfUnresolved(RetTy);
return RetTy;
}
@@ -556,7 +513,7 @@ DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) {
SmallVector<llvm::Metadata *, 16> Elts;
for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
if (Elements[i] && isa<MDNode>(Elements[i]))
- Elts.push_back(DITypeRef::get(cast<DIType>(Elements[i])));
+ Elts.push_back(cast<DIType>(Elements[i]));
else
Elts.push_back(Elements[i]);
}
@@ -582,10 +539,10 @@ DIGlobalVariable *DIBuilder::createGlobalVariable(
MDNode *Decl) {
checkGlobalVariableScope(Context);
- auto *N = DIGlobalVariable::get(VMContext, cast_or_null<DIScope>(Context),
- Name, LinkageName, F, LineNumber,
- DITypeRef::get(Ty), isLocalToUnit, true, Val,
- cast_or_null<DIDerivedType>(Decl));
+ auto *N = DIGlobalVariable::getDistinct(
+ VMContext, cast_or_null<DIScope>(Context), Name, LinkageName, F,
+ LineNumber, Ty, isLocalToUnit, true, Val,
+ cast_or_null<DIDerivedType>(Decl));
AllGVs.push_back(N);
return N;
}
@@ -598,14 +555,14 @@ DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl(
return DIGlobalVariable::getTemporary(
VMContext, cast_or_null<DIScope>(Context), Name, LinkageName, F,
- LineNumber, DITypeRef::get(Ty), isLocalToUnit, false, Val,
+ LineNumber, Ty, isLocalToUnit, false, Val,
cast_or_null<DIDerivedType>(Decl))
.release();
}
static DILocalVariable *createLocalVariable(
LLVMContext &VMContext,
- DenseMap<MDNode *, std::vector<TrackingMDNodeRef>> &PreservedVariables,
+ DenseMap<MDNode *, SmallVector<TrackingMDNodeRef, 1>> &PreservedVariables,
DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File,
unsigned LineNo, DIType *Ty, bool AlwaysPreserve, unsigned Flags) {
// FIXME: Why getNonCompileUnitScope()?
@@ -616,7 +573,7 @@ static DILocalVariable *createLocalVariable(
auto *Node =
DILocalVariable::get(VMContext, cast_or_null<DILocalScope>(Context), Name,
- File, LineNo, DITypeRef::get(Ty), ArgNo, Flags);
+ File, LineNo, Ty, ArgNo, Flags);
if (AlwaysPreserve) {
// The optimizer may remove local variables. If there is an interest
// to preserve variable info in such situation then stash it in a
@@ -661,19 +618,6 @@ DIExpression *DIBuilder::createBitPieceExpression(unsigned OffsetInBytes,
return DIExpression::get(VMContext, Addr);
}
-DISubprogram *DIBuilder::createFunction(
- DIScopeRef Context, StringRef Name, StringRef LinkageName, DIFile *File,
- unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
- bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized,
- DITemplateParameterArray TParams, DISubprogram *Decl) {
- // dragonegg does not generate identifier for types, so using an empty map
- // to resolve the context should be fine.
- DITypeIdentifierMap EmptyMap;
- return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File,
- LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
- Flags, isOptimized, TParams, Decl);
-}
-
template <class... Ts>
static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... Args) {
if (IsDistinct)
@@ -686,12 +630,12 @@ DISubprogram *DIBuilder::createFunction(
unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized,
DITemplateParameterArray TParams, DISubprogram *Decl) {
- auto *Node =
- getSubprogram(/* IsDistinct = */ isDefinition, VMContext,
- DIScopeRef::get(getNonCompileUnitScope(Context)), Name,
- LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition,
- ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl,
- MDTuple::getTemporary(VMContext, None).release());
+ auto *Node = getSubprogram(
+ /* IsDistinct = */ isDefinition, VMContext,
+ getNonCompileUnitScope(Context), Name, LinkageName, File, LineNo, Ty,
+ isLocalToUnit, isDefinition, ScopeLine, nullptr, 0, 0, 0, Flags,
+ isOptimized, isDefinition ? CUNode : nullptr, TParams, Decl,
+ MDTuple::getTemporary(VMContext, None).release());
if (isDefinition)
AllSubprograms.push_back(Node);
@@ -705,10 +649,10 @@ DISubprogram *DIBuilder::createTempFunctionFwdDecl(
bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized,
DITemplateParameterArray TParams, DISubprogram *Decl) {
return DISubprogram::getTemporary(
- VMContext, DIScopeRef::get(getNonCompileUnitScope(Context)), Name,
- LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition,
- ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl,
- nullptr)
+ VMContext, getNonCompileUnitScope(Context), Name, LinkageName,
+ File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, nullptr,
+ 0, 0, 0, Flags, isOptimized, isDefinition ? CUNode : nullptr,
+ TParams, Decl, nullptr)
.release();
}
@@ -716,17 +660,18 @@ DISubprogram *
DIBuilder::createMethod(DIScope *Context, StringRef Name, StringRef LinkageName,
DIFile *F, unsigned LineNo, DISubroutineType *Ty,
bool isLocalToUnit, bool isDefinition, unsigned VK,
- unsigned VIndex, DIType *VTableHolder, unsigned Flags,
- bool isOptimized, DITemplateParameterArray TParams) {
+ unsigned VIndex, int ThisAdjustment,
+ DIType *VTableHolder, unsigned Flags, bool isOptimized,
+ DITemplateParameterArray TParams) {
assert(getNonCompileUnitScope(Context) &&
"Methods should have both a Context and a context that isn't "
"the compile unit.");
// FIXME: Do we want to use different scope/lines?
auto *SP = getSubprogram(
- /* IsDistinct = */ isDefinition, VMContext,
- DIScopeRef::get(cast<DIScope>(Context)), Name, LinkageName, F, LineNo, Ty,
- isLocalToUnit, isDefinition, LineNo, DITypeRef::get(VTableHolder), VK,
- VIndex, Flags, isOptimized, TParams, nullptr, nullptr);
+ /* IsDistinct = */ isDefinition, VMContext, cast<DIScope>(Context), Name,
+ LinkageName, F, LineNo, Ty, isLocalToUnit, isDefinition, LineNo,
+ VTableHolder, VK, VIndex, ThisAdjustment, Flags, isOptimized,
+ isDefinition ? CUNode : nullptr, TParams, nullptr, nullptr);
if (isDefinition)
AllSubprograms.push_back(SP);
@@ -866,7 +811,7 @@ void DIBuilder::replaceVTableHolder(DICompositeType *&T,
DICompositeType *VTableHolder) {
{
TypedTrackingMDRef<DICompositeType> N(T);
- N->replaceVTableHolder(DITypeRef::get(VTableHolder));
+ N->replaceVTableHolder(VTableHolder);
T = N.get();
}
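
In the DIBuilder.cpp changes above, createCompileUnit now takes DICompileUnit::DebugEmissionKind and drops the EmitDebugInfo flag, so llvm.dbg.cu is always emitted. A sketch of a call against the new signature follows; the header set and the FullDebug enumerator spelling are assumptions, since neither appears in this diff.

// dibuilder_sketch.cpp -- illustrative only; mirrors the definition shown above.
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Dwarf.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  DIBuilder DIB(M);

  // New signature: DebugEmissionKind instead of the old EmitDebugInfo bool.
  DICompileUnit *CU = DIB.createCompileUnit(
      dwarf::DW_LANG_C99, "demo.c", "/tmp", "editorial-sketch",
      /*isOptimized=*/false, /*Flags=*/"", /*RunTimeVer=*/0,
      /*SplitName=*/"", DICompileUnit::DebugEmissionKind::FullDebug,
      /*DWOId=*/0);
  (void)CU;
  DIB.finalize();
  return 0;
}
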
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index 5468f47bbfe6..20a15fb8831f 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -52,7 +52,7 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) {
// Add padding if necessary to align the data element properly.
if ((StructSize & (TyAlign-1)) != 0) {
IsPadded = true;
- StructSize = RoundUpToAlignment(StructSize, TyAlign);
+ StructSize = alignTo(StructSize, TyAlign);
}
// Keep track of maximum alignment constraint.
@@ -69,7 +69,7 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) {
// and all array elements would be aligned correctly.
if ((StructSize & (StructAlignment-1)) != 0) {
IsPadded = true;
- StructSize = RoundUpToAlignment(StructSize, StructAlignment);
+ StructSize = alignTo(StructSize, StructAlignment);
}
}
@@ -718,42 +718,36 @@ Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const
return nullptr;
}
-unsigned DataLayout::getLargestLegalIntTypeSize() const {
+unsigned DataLayout::getLargestLegalIntTypeSizeInBits() const {
auto Max = std::max_element(LegalIntWidths.begin(), LegalIntWidths.end());
return Max != LegalIntWidths.end() ? *Max : 0;
}
-uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
- ArrayRef<Value *> Indices) const {
- Type *Ty = ptrTy;
- assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
- uint64_t Result = 0;
+int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy,
+ ArrayRef<Value *> Indices) const {
+ int64_t Result = 0;
+ // We can use 0 as the address space as we don't need
+ // to get pointer types back from gep_type_iterator.
+ unsigned AS = 0;
generic_gep_type_iterator<Value* const*>
- TI = gep_type_begin(ptrTy, Indices);
- for (unsigned CurIDX = 0, EndIDX = Indices.size(); CurIDX != EndIDX;
- ++CurIDX, ++TI) {
- if (StructType *STy = dyn_cast<StructType>(*TI)) {
- assert(Indices[CurIDX]->getType() ==
- Type::getInt32Ty(ptrTy->getContext()) &&
- "Illegal struct idx");
- unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
+ GTI = gep_type_begin(ElemTy, AS, Indices),
+ GTE = gep_type_end(ElemTy, AS, Indices);
+ for (; GTI != GTE; ++GTI) {
+ Value *Idx = GTI.getOperand();
+ if (auto *STy = dyn_cast<StructType>(*GTI)) {
+ assert(Idx->getType()->isIntegerTy(32) && "Illegal struct idx");
+ unsigned FieldNo = cast<ConstantInt>(Idx)->getZExtValue();
// Get structure layout information...
const StructLayout *Layout = getStructLayout(STy);
// Add in the offset, as calculated by the structure layout info...
Result += Layout->getElementOffset(FieldNo);
-
- // Update Ty to refer to current element
- Ty = STy->getElementType(FieldNo);
} else {
- // Update Ty to refer to current element
- Ty = cast<SequentialType>(Ty)->getElementType();
-
// Get the array index and the size of each array element.
- if (int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue())
- Result += (uint64_t)arrayIdx * getTypeAllocSize(Ty);
+ if (int64_t arrayIdx = cast<ConstantInt>(Idx)->getSExtValue())
+ Result += arrayIdx * getTypeAllocSize(GTI.getIndexedType());
}
}
@@ -764,7 +758,7 @@ uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
/// global. This includes an explicitly requested alignment (if the global
/// has one).
unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
- Type *ElemType = GV->getType()->getElementType();
+ Type *ElemType = GV->getValueType();
unsigned Alignment = getPrefTypeAlignment(ElemType);
unsigned GVAlignment = GV->getAlignment();
if (GVAlignment >= Alignment) {
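
The DataLayout.cpp hunk above replaces getIndexedOffset, which took a pointer type, with getIndexedOffsetInType, which takes the GEP source element type and returns a signed offset. A small sketch under the new signature; the struct shape and index values are illustrative only.

// offset_sketch.cpp -- illustrative only.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

int64_t secondFieldOffset(LLVMContext &Ctx, const DataLayout &DL) {
  // struct { i8, i64 } -- the offset of element #1 depends on i64's alignment.
  StructType *STy =
      StructType::get(Type::getInt8Ty(Ctx), Type::getInt64Ty(Ctx));
  Value *Idx[] = {ConstantInt::get(Type::getInt64Ty(Ctx), 0),
                  ConstantInt::get(Type::getInt32Ty(Ctx), 1)};
  // New API: pass the element type directly instead of a pointer type.
  return DL.getIndexedOffsetInType(STy, Idx);
}
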
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index a2443becdd00..1d3c8299255b 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -16,15 +16,13 @@
#include "LLVMContextImpl.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
@@ -39,48 +37,6 @@ DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
return nullptr;
}
-DISubprogram *llvm::getDISubprogram(const Function *F) {
- // We look for the first instr that has a debug annotation leading back to F.
- for (auto &BB : *F) {
- auto Inst = std::find_if(BB.begin(), BB.end(), [](const Instruction &Inst) {
- return Inst.getDebugLoc();
- });
- if (Inst == BB.end())
- continue;
- DebugLoc DLoc = Inst->getDebugLoc();
- const MDNode *Scope = DLoc.getInlinedAtScope();
- auto *Subprogram = getDISubprogram(Scope);
- return Subprogram->describes(F) ? Subprogram : nullptr;
- }
-
- return nullptr;
-}
-
-DITypeIdentifierMap
-llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) {
- DITypeIdentifierMap Map;
- for (unsigned CUi = 0, CUe = CU_Nodes->getNumOperands(); CUi != CUe; ++CUi) {
- auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(CUi));
- DINodeArray Retain = CU->getRetainedTypes();
- for (unsigned Ti = 0, Te = Retain.size(); Ti != Te; ++Ti) {
- if (!isa<DICompositeType>(Retain[Ti]))
- continue;
- auto *Ty = cast<DICompositeType>(Retain[Ti]);
- if (MDString *TypeId = Ty->getRawIdentifier()) {
- // Definition has priority over declaration.
- // Try to insert (TypeId, Ty) to Map.
- std::pair<DITypeIdentifierMap::iterator, bool> P =
- Map.insert(std::make_pair(TypeId, Ty));
- // If TypeId already exists in Map and this is a definition, replace
- // whatever we had (declaration or definition) with the definition.
- if (!P.second && !Ty->isForwardDecl())
- P.first->second = Ty;
- }
- }
- }
- return Map;
-}
-
//===----------------------------------------------------------------------===//
// DebugInfoFinder implementations.
//===----------------------------------------------------------------------===//
@@ -92,55 +48,44 @@ void DebugInfoFinder::reset() {
TYs.clear();
Scopes.clear();
NodesSeen.clear();
- TypeIdentifierMap.clear();
- TypeMapInitialized = false;
-}
-
-void DebugInfoFinder::InitializeTypeMap(const Module &M) {
- if (!TypeMapInitialized)
- if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
- TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
- TypeMapInitialized = true;
- }
}
void DebugInfoFinder::processModule(const Module &M) {
- InitializeTypeMap(M);
- if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
- for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
- addCompileUnit(CU);
- for (auto *DIG : CU->getGlobalVariables()) {
- if (addGlobalVariable(DIG)) {
- processScope(DIG->getScope());
- processType(DIG->getType().resolve(TypeIdentifierMap));
- }
+ for (auto *CU : M.debug_compile_units()) {
+ addCompileUnit(CU);
+ for (auto *DIG : CU->getGlobalVariables()) {
+ if (addGlobalVariable(DIG)) {
+ processScope(DIG->getScope());
+ processType(DIG->getType().resolve());
}
- for (auto *SP : CU->getSubprograms())
+ }
+ for (auto *ET : CU->getEnumTypes())
+ processType(ET);
+ for (auto *RT : CU->getRetainedTypes())
+ if (auto *T = dyn_cast<DIType>(RT))
+ processType(T);
+ else
+ processSubprogram(cast<DISubprogram>(RT));
+ for (auto *Import : CU->getImportedEntities()) {
+ auto *Entity = Import->getEntity().resolve();
+ if (auto *T = dyn_cast<DIType>(Entity))
+ processType(T);
+ else if (auto *SP = dyn_cast<DISubprogram>(Entity))
processSubprogram(SP);
- for (auto *ET : CU->getEnumTypes())
- processType(ET);
- for (auto *RT : CU->getRetainedTypes())
- processType(RT);
- for (auto *Import : CU->getImportedEntities()) {
- auto *Entity = Import->getEntity().resolve(TypeIdentifierMap);
- if (auto *T = dyn_cast<DIType>(Entity))
- processType(T);
- else if (auto *SP = dyn_cast<DISubprogram>(Entity))
- processSubprogram(SP);
- else if (auto *NS = dyn_cast<DINamespace>(Entity))
- processScope(NS->getScope());
- else if (auto *M = dyn_cast<DIModule>(Entity))
- processScope(M->getScope());
- }
+ else if (auto *NS = dyn_cast<DINamespace>(Entity))
+ processScope(NS->getScope());
+ else if (auto *M = dyn_cast<DIModule>(Entity))
+ processScope(M->getScope());
}
}
+ for (auto &F : M.functions())
+ if (auto *SP = cast_or_null<DISubprogram>(F.getSubprogram()))
+ processSubprogram(SP);
}
void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
if (!Loc)
return;
- InitializeTypeMap(M);
processScope(Loc->getScope());
processLocation(M, Loc->getInlinedAt());
}
@@ -148,14 +93,14 @@ void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
void DebugInfoFinder::processType(DIType *DT) {
if (!addType(DT))
return;
- processScope(DT->getScope().resolve(TypeIdentifierMap));
+ processScope(DT->getScope().resolve());
if (auto *ST = dyn_cast<DISubroutineType>(DT)) {
for (DITypeRef Ref : ST->getTypeArray())
- processType(Ref.resolve(TypeIdentifierMap));
+ processType(Ref.resolve());
return;
}
if (auto *DCT = dyn_cast<DICompositeType>(DT)) {
- processType(DCT->getBaseType().resolve(TypeIdentifierMap));
+ processType(DCT->getBaseType().resolve());
for (Metadata *D : DCT->getElements()) {
if (auto *T = dyn_cast<DIType>(D))
processType(T);
@@ -165,7 +110,7 @@ void DebugInfoFinder::processType(DIType *DT) {
return;
}
if (auto *DDT = dyn_cast<DIDerivedType>(DT)) {
- processType(DDT->getBaseType().resolve(TypeIdentifierMap));
+ processType(DDT->getBaseType().resolve());
}
}
@@ -198,13 +143,13 @@ void DebugInfoFinder::processScope(DIScope *Scope) {
void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
if (!addSubprogram(SP))
return;
- processScope(SP->getScope().resolve(TypeIdentifierMap));
+ processScope(SP->getScope().resolve());
processType(SP->getType());
for (auto *Element : SP->getTemplateParams()) {
if (auto *TType = dyn_cast<DITemplateTypeParameter>(Element)) {
- processType(TType->getType().resolve(TypeIdentifierMap));
+ processType(TType->getType().resolve());
} else if (auto *TVal = dyn_cast<DITemplateValueParameter>(Element)) {
- processType(TVal->getType().resolve(TypeIdentifierMap));
+ processType(TVal->getType().resolve());
}
}
}
@@ -214,7 +159,6 @@ void DebugInfoFinder::processDeclare(const Module &M,
auto *N = dyn_cast<MDNode>(DDI->getVariable());
if (!N)
return;
- InitializeTypeMap(M);
auto *DV = dyn_cast<DILocalVariable>(N);
if (!DV)
@@ -223,14 +167,13 @@ void DebugInfoFinder::processDeclare(const Module &M,
if (!NodesSeen.insert(DV).second)
return;
processScope(DV->getScope());
- processType(DV->getType().resolve(TypeIdentifierMap));
+ processType(DV->getType().resolve());
}
void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
auto *N = dyn_cast<MDNode>(DVI->getVariable());
if (!N)
return;
- InitializeTypeMap(M);
auto *DV = dyn_cast<DILocalVariable>(N);
if (!DV)
@@ -239,7 +182,7 @@ void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
if (!NodesSeen.insert(DV).second)
return;
processScope(DV->getScope());
- processType(DV->getType().resolve(TypeIdentifierMap));
+ processType(DV->getType().resolve());
}
bool DebugInfoFinder::addType(DIType *DT) {
@@ -304,8 +247,15 @@ bool llvm::stripDebugInfo(Function &F) {
Changed = true;
F.setSubprogram(nullptr);
}
+
for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
+ for (auto II = BB.begin(), End = BB.end(); II != End;) {
+ Instruction &I = *II++; // We may delete the instruction, increment now.
+ if (isa<DbgInfoIntrinsic>(&I)) {
+ I.eraseFromParent();
+ Changed = true;
+ continue;
+ }
if (I.getDebugLoc()) {
Changed = true;
I.setDebugLoc(DebugLoc());
@@ -318,26 +268,6 @@ bool llvm::stripDebugInfo(Function &F) {
bool llvm::StripDebugInfo(Module &M) {
bool Changed = false;
- // Remove all of the calls to the debugger intrinsics, and remove them from
- // the module.
- if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
- while (!Declare->use_empty()) {
- CallInst *CI = cast<CallInst>(Declare->user_back());
- CI->eraseFromParent();
- }
- Declare->eraseFromParent();
- Changed = true;
- }
-
- if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
- while (!DbgVal->use_empty()) {
- CallInst *CI = cast<CallInst>(DbgVal->user_back());
- CI->eraseFromParent();
- }
- DbgVal->eraseFromParent();
- Changed = true;
- }
-
for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
NME = M.named_metadata_end(); NMI != NME;) {
NamedMDNode *NMD = &*NMI;
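
With the type-identifier map retired, DebugInfoFinder needs no per-module initialization: compile units come straight from Module::debug_compile_units(), subprograms from Function::getSubprogram(), and the resolve() calls no longer take a map. A hedged usage fragment of the updated finder; the printing is illustrative and not part of the patch:

#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void listDebugInfo(const Module &M) {
  DebugInfoFinder Finder;
  Finder.processModule(M); // walks CUs, retained types, and F.getSubprogram()
  for (const DICompileUnit *CU : Finder.compile_units())
    outs() << "CU: " << CU->getFilename() << "\n";
  for (const DISubprogram *SP : Finder.subprograms())
    outs() << "SP: " << SP->getName() << "\n";
}
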
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 58e0abdd577c..c58e3685b3c9 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -45,7 +45,6 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
// Fixup column.
adjustColumn(Column);
- assert(Scope && "Expected scope");
if (Storage == Uniqued) {
if (auto *N =
getUniqued(Context.pImpl->DILocations,
@@ -66,23 +65,6 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
Storage, Context.pImpl->DILocations);
}
-unsigned DILocation::computeNewDiscriminator() const {
- // FIXME: This seems completely wrong.
- //
- // 1. If two modules are generated in the same context, then the second
- // Module will get different discriminators than it would have if it were
- // generated in its own context.
- // 2. If this function is called after round-tripping to bitcode instead of
- // before, it will give a different (and potentially incorrect!) return.
- //
- // The discriminator should instead be calculated from local information
- // where it's actually needed. This logic should be moved to
- // AddDiscriminators::runOnFunction(), where it doesn't pollute the
- // LLVMContext.
- std::pair<const char *, unsigned> Key(getFilename().data(), getLine());
- return ++getContext().pImpl->DiscriminatorTable[Key];
-}
-
unsigned DINode::getFlag(StringRef Flag) {
return StringSwitch<unsigned>(Flag)
#define HANDLE_DI_FLAG(ID, NAME) .Case("DIFlag" #NAME, Flag##NAME)
@@ -103,8 +85,8 @@ const char *DINode::getFlagString(unsigned Flag) {
unsigned DINode::splitFlags(unsigned Flags,
SmallVectorImpl<unsigned> &SplitFlags) {
- // Accessibility flags need to be specially handled, since they're packed
- // together.
+ // Accessibility and member pointer flags need to be specially handled, since
+ // they're packed together.
if (unsigned A = Flags & FlagAccessibility) {
if (A == FlagPrivate)
SplitFlags.push_back(FlagPrivate);
@@ -114,6 +96,15 @@ unsigned DINode::splitFlags(unsigned Flags,
SplitFlags.push_back(FlagPublic);
Flags &= ~A;
}
+ if (unsigned R = Flags & FlagPtrToMemberRep) {
+ if (R == FlagSingleInheritance)
+ SplitFlags.push_back(FlagSingleInheritance);
+ else if (R == FlagMultipleInheritance)
+ SplitFlags.push_back(FlagMultipleInheritance);
+ else
+ SplitFlags.push_back(FlagVirtualInheritance);
+ Flags &= ~R;
+ }
#define HANDLE_DI_FLAG(ID, NAME) \
if (unsigned Bit = Flags & ID) { \
@@ -133,13 +124,13 @@ DIScopeRef DIScope::getScope() const {
return SP->getScope();
if (auto *LB = dyn_cast<DILexicalBlockBase>(this))
- return DIScopeRef(LB->getScope());
+ return LB->getScope();
if (auto *NS = dyn_cast<DINamespace>(this))
- return DIScopeRef(NS->getScope());
+ return NS->getScope();
if (auto *M = dyn_cast<DIModule>(this))
- return DIScopeRef(M->getScope());
+ return M->getScope();
assert((isa<DIFile>(this) || isa<DICompileUnit>(this)) &&
"Unhandled type of scope.");
@@ -161,12 +152,6 @@ StringRef DIScope::getName() const {
return "";
}
-static StringRef getString(const MDString *S) {
- if (S)
- return S->getString();
- return StringRef();
-}
-
#ifndef NDEBUG
static bool isCanonical(const MDString *S) {
return !S || !S->getString().empty();
@@ -179,7 +164,7 @@ GenericDINode *GenericDINode::getImpl(LLVMContext &Context, unsigned Tag,
StorageType Storage, bool ShouldCreate) {
unsigned Hash = 0;
if (Storage == Uniqued) {
- GenericDINodeInfo::KeyTy Key(Tag, getString(Header), DwarfOps);
+ GenericDINodeInfo::KeyTy Key(Tag, Header, DwarfOps);
if (auto *N = getUniqued(Context.pImpl->GenericDINodes, Key))
return N;
if (!ShouldCreate)
@@ -217,15 +202,14 @@ void GenericDINode::recalculateHash() {
} \
} while (false)
#define DEFINE_GETIMPL_STORE(CLASS, ARGS, OPS) \
- return storeImpl(new (ArrayRef<Metadata *>(OPS).size()) \
+ return storeImpl(new (array_lengthof(OPS)) \
CLASS(Context, Storage, UNWRAP_ARGS(ARGS), OPS), \
Storage, Context.pImpl->CLASS##s)
#define DEFINE_GETIMPL_STORE_NO_OPS(CLASS, ARGS) \
return storeImpl(new (0u) CLASS(Context, Storage, UNWRAP_ARGS(ARGS)), \
Storage, Context.pImpl->CLASS##s)
#define DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(CLASS, OPS) \
- return storeImpl(new (ArrayRef<Metadata *>(OPS).size()) \
- CLASS(Context, Storage, OPS), \
+ return storeImpl(new (array_lengthof(OPS)) CLASS(Context, Storage, OPS), \
Storage, Context.pImpl->CLASS##s)
DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
@@ -238,7 +222,7 @@ DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, int64_t Value,
MDString *Name, StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIEnumerator, (Value, getString(Name)));
+ DEFINE_GETIMPL_LOOKUP(DIEnumerator, (Value, Name));
Metadata *Ops[] = {Name};
DEFINE_GETIMPL_STORE(DIEnumerator, (Value), Ops);
}
@@ -248,8 +232,8 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
uint64_t AlignInBits, unsigned Encoding,
StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(
- DIBasicType, (Tag, getString(Name), SizeInBits, AlignInBits, Encoding));
+ DEFINE_GETIMPL_LOOKUP(DIBasicType,
+ (Tag, Name, SizeInBits, AlignInBits, Encoding));
Metadata *Ops[] = {nullptr, nullptr, Name};
DEFINE_GETIMPL_STORE(DIBasicType, (Tag, SizeInBits, AlignInBits, Encoding),
Ops);
@@ -261,9 +245,9 @@ DIDerivedType *DIDerivedType::getImpl(
uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
Metadata *ExtraData, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIDerivedType, (Tag, getString(Name), File, Line, Scope,
- BaseType, SizeInBits, AlignInBits,
- OffsetInBits, Flags, ExtraData));
+ DEFINE_GETIMPL_LOOKUP(DIDerivedType,
+ (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, ExtraData));
Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
DEFINE_GETIMPL_STORE(
DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, Flags),
@@ -278,11 +262,12 @@ DICompositeType *DICompositeType::getImpl(
Metadata *TemplateParams, MDString *Identifier, StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DICompositeType,
- (Tag, getString(Name), File, Line, Scope, BaseType,
- SizeInBits, AlignInBits, OffsetInBits, Flags, Elements,
- RuntimeLang, VTableHolder, TemplateParams,
- getString(Identifier)));
+
+ // Keep this in sync with buildODRType.
+ DEFINE_GETIMPL_LOOKUP(
+ DICompositeType, (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+ VTableHolder, TemplateParams, Identifier));
Metadata *Ops[] = {File, Scope, Name, BaseType,
Elements, VTableHolder, TemplateParams, Identifier};
DEFINE_GETIMPL_STORE(DICompositeType, (Tag, Line, RuntimeLang, SizeInBits,
@@ -290,13 +275,74 @@ DICompositeType *DICompositeType::getImpl(
Ops);
}
+DICompositeType *DICompositeType::buildODRType(
+ LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name,
+ Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
+ uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+ unsigned Flags, Metadata *Elements, unsigned RuntimeLang,
+ Metadata *VTableHolder, Metadata *TemplateParams) {
+ assert(!Identifier.getString().empty() && "Expected valid identifier");
+ if (!Context.isODRUniquingDebugTypes())
+ return nullptr;
+ auto *&CT = (*Context.pImpl->DITypeMap)[&Identifier];
+ if (!CT)
+ return CT = DICompositeType::getDistinct(
+ Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
+ VTableHolder, TemplateParams, &Identifier);
+
+ // Only mutate CT if it's a forward declaration and the new operands aren't.
+ assert(CT->getRawIdentifier() == &Identifier && "Wrong ODR identifier?");
+ if (!CT->isForwardDecl() || (Flags & DINode::FlagFwdDecl))
+ return CT;
+
+ // Mutate CT in place. Keep this in sync with getImpl.
+ CT->mutate(Tag, Line, RuntimeLang, SizeInBits, AlignInBits, OffsetInBits,
+ Flags);
+ Metadata *Ops[] = {File, Scope, Name, BaseType,
+ Elements, VTableHolder, TemplateParams, &Identifier};
+ assert((std::end(Ops) - std::begin(Ops)) == (int)CT->getNumOperands() &&
+ "Mismatched number of operands");
+ for (unsigned I = 0, E = CT->getNumOperands(); I != E; ++I)
+ if (Ops[I] != CT->getOperand(I))
+ CT->setOperand(I, Ops[I]);
+ return CT;
+}
+
+DICompositeType *DICompositeType::getODRType(
+ LLVMContext &Context, MDString &Identifier, unsigned Tag, MDString *Name,
+ Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
+ uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+ unsigned Flags, Metadata *Elements, unsigned RuntimeLang,
+ Metadata *VTableHolder, Metadata *TemplateParams) {
+ assert(!Identifier.getString().empty() && "Expected valid identifier");
+ if (!Context.isODRUniquingDebugTypes())
+ return nullptr;
+ auto *&CT = (*Context.pImpl->DITypeMap)[&Identifier];
+ if (!CT)
+ CT = DICompositeType::getDistinct(
+ Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder,
+ TemplateParams, &Identifier);
+ return CT;
+}
+
+DICompositeType *DICompositeType::getODRTypeIfExists(LLVMContext &Context,
+ MDString &Identifier) {
+ assert(!Identifier.getString().empty() && "Expected valid identifier");
+ if (!Context.isODRUniquingDebugTypes())
+ return nullptr;
+ return Context.pImpl->DITypeMap->lookup(&Identifier);
+}
+
DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context,
- unsigned Flags, Metadata *TypeArray,
+ unsigned Flags, uint8_t CC,
+ Metadata *TypeArray,
StorageType Storage,
bool ShouldCreate) {
- DEFINE_GETIMPL_LOOKUP(DISubroutineType, (Flags, TypeArray));
+ DEFINE_GETIMPL_LOOKUP(DISubroutineType, (Flags, CC, TypeArray));
Metadata *Ops[] = {nullptr, nullptr, nullptr, TypeArray};
- DEFINE_GETIMPL_STORE(DISubroutineType, (Flags), Ops);
+ DEFINE_GETIMPL_STORE(DISubroutineType, (Flags, CC), Ops);
}
DIFile *DIFile::getImpl(LLVMContext &Context, MDString *Filename,
@@ -304,7 +350,7 @@ DIFile *DIFile::getImpl(LLVMContext &Context, MDString *Filename,
bool ShouldCreate) {
assert(isCanonical(Filename) && "Expected canonical MDString");
assert(isCanonical(Directory) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIFile, (getString(Filename), getString(Directory)));
+ DEFINE_GETIMPL_LOOKUP(DIFile, (Filename, Directory));
Metadata *Ops[] = {Filename, Directory};
DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DIFile, Ops);
}
@@ -314,50 +360,74 @@ DICompileUnit *DICompileUnit::getImpl(
MDString *Producer, bool IsOptimized, MDString *Flags,
unsigned RuntimeVersion, MDString *SplitDebugFilename,
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
- Metadata *Subprograms, Metadata *GlobalVariables,
- Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId,
- StorageType Storage, bool ShouldCreate) {
+ Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
+ uint64_t DWOId, StorageType Storage, bool ShouldCreate) {
assert(Storage != Uniqued && "Cannot unique DICompileUnit");
assert(isCanonical(Producer) && "Expected canonical MDString");
assert(isCanonical(Flags) && "Expected canonical MDString");
assert(isCanonical(SplitDebugFilename) && "Expected canonical MDString");
- Metadata *Ops[] = {File, Producer, Flags, SplitDebugFilename, EnumTypes,
- RetainedTypes, Subprograms, GlobalVariables,
- ImportedEntities, Macros};
- return storeImpl(new (ArrayRef<Metadata *>(Ops).size()) DICompileUnit(
+ Metadata *Ops[] = {
+ File, Producer, Flags, SplitDebugFilename,
+ EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities,
+ Macros};
+ return storeImpl(new (array_lengthof(Ops)) DICompileUnit(
Context, Storage, SourceLanguage, IsOptimized,
RuntimeVersion, EmissionKind, DWOId, Ops),
Storage);
}
+Optional<DICompileUnit::DebugEmissionKind>
+DICompileUnit::getEmissionKind(StringRef Str) {
+ return StringSwitch<Optional<DebugEmissionKind>>(Str)
+ .Case("NoDebug", NoDebug)
+ .Case("FullDebug", FullDebug)
+ .Case("LineTablesOnly", LineTablesOnly)
+ .Default(None);
+}
+
+const char *DICompileUnit::EmissionKindString(DebugEmissionKind EK) {
+ switch (EK) {
+ case NoDebug: return "NoDebug";
+ case FullDebug: return "FullDebug";
+ case LineTablesOnly: return "LineTablesOnly";
+ }
+ return nullptr;
+}
+
DISubprogram *DILocalScope::getSubprogram() const {
if (auto *Block = dyn_cast<DILexicalBlockBase>(this))
return Block->getScope()->getSubprogram();
return const_cast<DISubprogram *>(cast<DISubprogram>(this));
}
+DILocalScope *DILocalScope::getNonLexicalBlockFileScope() const {
+ if (auto *File = dyn_cast<DILexicalBlockFile>(this))
+ return File->getScope()->getNonLexicalBlockFileScope();
+ return const_cast<DILocalScope *>(this);
+}
+
DISubprogram *DISubprogram::getImpl(
LLVMContext &Context, Metadata *Scope, MDString *Name,
MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
- unsigned Flags, bool IsOptimized, Metadata *TemplateParams,
- Metadata *Declaration, Metadata *Variables, StorageType Storage,
- bool ShouldCreate) {
+ int ThisAdjustment, unsigned Flags, bool IsOptimized, Metadata *Unit,
+ Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DISubprogram,
- (Scope, getString(Name), getString(LinkageName), File,
- Line, Type, IsLocalToUnit, IsDefinition, ScopeLine,
- ContainingType, Virtuality, VirtualIndex, Flags,
- IsOptimized, TemplateParams, Declaration, Variables));
- Metadata *Ops[] = {File, Scope, Name, Name,
- LinkageName, Type, ContainingType, TemplateParams,
- Declaration, Variables};
- DEFINE_GETIMPL_STORE(DISubprogram,
- (Line, ScopeLine, Virtuality, VirtualIndex, Flags,
- IsLocalToUnit, IsDefinition, IsOptimized),
+ DEFINE_GETIMPL_LOOKUP(
+ DISubprogram,
+ (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+ ScopeLine, ContainingType, Virtuality, VirtualIndex, ThisAdjustment,
+ Flags, IsOptimized, Unit, TemplateParams, Declaration, Variables));
+ Metadata *Ops[] = {File, Scope, Name, Name,
+ LinkageName, Type, ContainingType, Unit,
+ TemplateParams, Declaration, Variables};
+ DEFINE_GETIMPL_STORE(DISubprogram, (Line, ScopeLine, Virtuality, VirtualIndex,
+ ThisAdjustment, Flags, IsLocalToUnit,
+ IsDefinition, IsOptimized),
Ops);
}
@@ -399,7 +469,7 @@ DINamespace *DINamespace::getImpl(LLVMContext &Context, Metadata *Scope,
Metadata *File, MDString *Name, unsigned Line,
StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DINamespace, (Scope, File, getString(Name), Line));
+ DEFINE_GETIMPL_LOOKUP(DINamespace, (Scope, File, Name, Line));
Metadata *Ops[] = {File, Scope, Name};
DEFINE_GETIMPL_STORE(DINamespace, (Line), Ops);
}
@@ -409,9 +479,8 @@ DIModule *DIModule::getImpl(LLVMContext &Context, Metadata *Scope,
MDString *IncludePath, MDString *ISysRoot,
StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIModule,
- (Scope, getString(Name), getString(ConfigurationMacros),
- getString(IncludePath), getString(ISysRoot)));
+ DEFINE_GETIMPL_LOOKUP(
+ DIModule, (Scope, Name, ConfigurationMacros, IncludePath, ISysRoot));
Metadata *Ops[] = {Scope, Name, ConfigurationMacros, IncludePath, ISysRoot};
DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DIModule, Ops);
}
@@ -422,7 +491,7 @@ DITemplateTypeParameter *DITemplateTypeParameter::getImpl(LLVMContext &Context,
StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DITemplateTypeParameter, (getString(Name), Type));
+ DEFINE_GETIMPL_LOOKUP(DITemplateTypeParameter, (Name, Type));
Metadata *Ops[] = {Name, Type};
DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DITemplateTypeParameter, Ops);
}
@@ -431,8 +500,7 @@ DITemplateValueParameter *DITemplateValueParameter::getImpl(
LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *Type,
Metadata *Value, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DITemplateValueParameter,
- (Tag, getString(Name), Type, Value));
+ DEFINE_GETIMPL_LOOKUP(DITemplateValueParameter, (Tag, Name, Type, Value));
Metadata *Ops[] = {Name, Type, Value};
DEFINE_GETIMPL_STORE(DITemplateValueParameter, (Tag), Ops);
}
@@ -447,8 +515,8 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIGlobalVariable,
- (Scope, getString(Name), getString(LinkageName), File,
- Line, Type, IsLocalToUnit, IsDefinition, Variable,
+ (Scope, Name, LinkageName, File, Line, Type,
+ IsLocalToUnit, IsDefinition, Variable,
StaticDataMemberDeclaration));
Metadata *Ops[] = {Scope, Name, File, Type,
Name, LinkageName, Variable, StaticDataMemberDeclaration};
@@ -468,7 +536,7 @@ DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope,
assert(Scope && "Expected scope");
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DILocalVariable,
- (Scope, getString(Name), File, Line, Type, Arg, Flags));
+ (Scope, Name, File, Line, Type, Arg, Flags));
Metadata *Ops[] = {Scope, Name, File, Type};
DEFINE_GETIMPL_STORE(DILocalVariable, (Line, Arg, Flags), Ops);
}
@@ -539,9 +607,8 @@ DIObjCProperty *DIObjCProperty::getImpl(
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(GetterName) && "Expected canonical MDString");
assert(isCanonical(SetterName) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIObjCProperty,
- (getString(Name), File, Line, getString(GetterName),
- getString(SetterName), Attributes, Type));
+ DEFINE_GETIMPL_LOOKUP(DIObjCProperty, (Name, File, Line, GetterName,
+ SetterName, Attributes, Type));
Metadata *Ops[] = {Name, File, GetterName, SetterName, Type};
DEFINE_GETIMPL_STORE(DIObjCProperty, (Line, Attributes), Ops);
}
@@ -552,8 +619,7 @@ DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag,
StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIImportedEntity,
- (Tag, Scope, Entity, Line, getString(Name)));
+ DEFINE_GETIMPL_LOOKUP(DIImportedEntity, (Tag, Scope, Entity, Line, Name));
Metadata *Ops[] = {Scope, Entity, Name};
DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops);
}
@@ -562,8 +628,7 @@ DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType,
unsigned Line, MDString *Name, MDString *Value,
StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIMacro,
- (MIType, Line, getString(Name), getString(Value)));
+ DEFINE_GETIMPL_LOOKUP(DIMacro, (MIType, Line, Name, Value));
Metadata *Ops[] = { Name, Value };
DEFINE_GETIMPL_STORE(DIMacro, (MIType, Line), Ops);
}
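
Two themes run through the DebugInfoMetadata.cpp hunks: the getImpl lookups now key on MDString pointers directly instead of unwrapped strings, and composite types can be uniqued by their ODR identifier through getODRType/buildODRType when the context opts in. A minimal sketch of the ODR path, assuming the companion LLVMContext::enableDebugTypeODRUniquing() switch and an illustrative identifier:

#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Dwarf.h"
#include <cassert>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Ctx.enableDebugTypeODRUniquing(); // assumed switch that allocates DITypeMap
  MDString *Id = MDString::get(Ctx, "_ZTS3Foo"); // illustrative identifier
  DICompositeType *A = DICompositeType::getODRType(
      Ctx, *Id, dwarf::DW_TAG_structure_type, /*Name=*/nullptr,
      /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/nullptr,
      /*SizeInBits=*/64, /*AlignInBits=*/32, /*OffsetInBits=*/0, /*Flags=*/0,
      /*Elements=*/nullptr, /*RuntimeLang=*/0, /*VTableHolder=*/nullptr,
      /*TemplateParams=*/nullptr);
  // A second query with the same identifier yields the same distinct node.
  assert(A == DICompositeType::getODRTypeIfExists(Ctx, *Id));
  (void)A;
  return 0;
}
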
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index 72d5c0e61883..ffa7a6b40e2a 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -9,7 +9,6 @@
#include "llvm/IR/DebugLoc.h"
#include "LLVMContextImpl.h"
-#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;
@@ -67,7 +66,7 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
-void DebugLoc::dump() const {
+LLVM_DUMP_METHOD void DebugLoc::dump() const {
#ifndef NDEBUG
if (!Loc)
return;
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 6426f76bbaa6..ce67be328ab1 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -91,7 +91,7 @@ int llvm::getNextAvailablePluginDiagnosticKind() {
return ++PluginKindID;
}
-const char *DiagnosticInfo::AlwaysPrint = "";
+const char *DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint = "";
DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
const Twine &MsgStr,
@@ -112,9 +112,13 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const {
DP << " at line " << getLocCookie();
}
-void DiagnosticInfoStackSize::print(DiagnosticPrinter &DP) const {
- DP << "stack size limit exceeded (" << getStackSize() << ") in "
- << getFunction();
+void DiagnosticInfoResourceLimit::print(DiagnosticPrinter &DP) const {
+ DP << getResourceName() << " limit";
+
+ if (getResourceLimit() != 0)
+ DP << " of " << getResourceLimit();
+
+ DP << " exceeded (" << getResourceSize() << ") in " << getFunction();
}
void DiagnosticInfoDebugMetadataVersion::print(DiagnosticPrinter &DP) const {
@@ -122,6 +126,11 @@ void DiagnosticInfoDebugMetadataVersion::print(DiagnosticPrinter &DP) const {
<< ") in " << getModule();
}
+void DiagnosticInfoIgnoringInvalidDebugMetadata::print(
+ DiagnosticPrinter &DP) const {
+ DP << "ignoring invalid debug info in " << getModule().getModuleIdentifier();
+}
+
void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const {
if (!FileName.empty()) {
DP << getFileName();
@@ -138,11 +147,11 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const {
DP << getMsg();
}
-bool DiagnosticInfoOptimizationBase::isLocationAvailable() const {
+bool DiagnosticInfoWithDebugLocBase::isLocationAvailable() const {
return getDebugLoc();
}
-void DiagnosticInfoOptimizationBase::getLocation(StringRef *Filename,
+void DiagnosticInfoWithDebugLocBase::getLocation(StringRef *Filename,
unsigned *Line,
unsigned *Column) const {
DILocation *L = getDebugLoc();
@@ -152,7 +161,7 @@ void DiagnosticInfoOptimizationBase::getLocation(StringRef *Filename,
*Column = L->getColumn();
}
-const std::string DiagnosticInfoOptimizationBase::getLocationStr() const {
+const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
StringRef Filename("<unknown>");
unsigned Line = 0;
unsigned Column = 0;
@@ -163,6 +172,8 @@ const std::string DiagnosticInfoOptimizationBase::getLocationStr() const {
void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const {
DP << getLocationStr() << ": " << getMsg();
+ if (Hotness)
+ DP << " (hotness: " << *Hotness << ")";
}
bool DiagnosticInfoOptimizationRemark::isEnabled() const {
@@ -176,7 +187,7 @@ bool DiagnosticInfoOptimizationRemarkMissed::isEnabled() const {
}
bool DiagnosticInfoOptimizationRemarkAnalysis::isEnabled() const {
- return getPassName() == DiagnosticInfo::AlwaysPrint ||
+ return shouldAlwaysPrint() ||
(PassRemarksAnalysisOptLoc.Pattern &&
PassRemarksAnalysisOptLoc.Pattern->match(getPassName()));
}
@@ -230,6 +241,16 @@ bool DiagnosticInfoOptimizationFailure::isEnabled() const {
return getSeverity() == DS_Warning;
}
+void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
+ std::string Str;
+ raw_string_ostream OS(Str);
+
+ OS << getLocationStr() << ": in function " << getFunction().getName() << ' '
+ << *getFunction().getFunctionType() << ": " << Msg << '\n';
+ OS.flush();
+ DP << Str;
+}
+
void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
const DebugLoc &DLoc, const Twine &Msg) {
Ctx.diagnose(DiagnosticInfoOptimizationFailure(
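
The diagnostics hunks generalize the stack-size report into DiagnosticInfoResourceLimit, hang AlwaysPrint off the analysis-remark class, and let optimization remarks append an optional hotness count in print(). A hedged fragment showing how these print() overrides are normally reached; the callback-style setDiagnosticHandler registration is assumed from this release's C-style handler API:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void handleDiag(const DiagnosticInfo &DI, void *) {
  DiagnosticPrinterRawOStream DP(errs());
  DI.print(DP); // dispatches to the overrides above, hotness suffix included
  errs() << "\n";
}

void installHandler(LLVMContext &Ctx) {
  Ctx.setDiagnosticHandler(handleDiag); // DiagContext defaults to nullptr
}
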
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index b9d4fb7de881..57e3df76d023 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -17,12 +17,10 @@
#include "llvm/IR/Dominators.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GenericDomTreeConstruction.h"
#include "llvm/Support/raw_ostream.h"
@@ -30,7 +28,7 @@
using namespace llvm;
// Always verify dominfo if expensive checking is enabled.
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
static bool VerifyDomInfo = true;
#else
static bool VerifyDomInfo = false;
@@ -302,7 +300,8 @@ void DominatorTree::verifyDomTree() const {
//
//===----------------------------------------------------------------------===//
-DominatorTree DominatorTreeAnalysis::run(Function &F) {
+DominatorTree DominatorTreeAnalysis::run(Function &F,
+ AnalysisManager<Function> &) {
DominatorTree DT;
DT.recalculate(F);
return DT;
@@ -313,16 +312,16 @@ char DominatorTreeAnalysis::PassID;
DominatorTreePrinterPass::DominatorTreePrinterPass(raw_ostream &OS) : OS(OS) {}
PreservedAnalyses DominatorTreePrinterPass::run(Function &F,
- FunctionAnalysisManager *AM) {
+ FunctionAnalysisManager &AM) {
OS << "DominatorTree for function: " << F.getName() << "\n";
- AM->getResult<DominatorTreeAnalysis>(F).print(OS);
+ AM.getResult<DominatorTreeAnalysis>(F).print(OS);
return PreservedAnalyses::all();
}
PreservedAnalyses DominatorTreeVerifierPass::run(Function &F,
- FunctionAnalysisManager *AM) {
- AM->getResult<DominatorTreeAnalysis>(F).verifyDomTree();
+ FunctionAnalysisManager &AM) {
+ AM.getResult<DominatorTreeAnalysis>(F).verifyDomTree();
return PreservedAnalyses::all();
}
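
The Dominators.cpp hunks track the new pass manager convention of passing the analysis manager by reference, so run() and the printer/verifier passes take FunctionAnalysisManager& instead of a pointer. A short sketch of obtaining the tree under the new signatures; the registration pattern is the usual unit-test style and is assumed rather than taken from this patch:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

void computeDomTree(Function &F) {
  FunctionAnalysisManager FAM;
  FAM.registerPass([] { return DominatorTreeAnalysis(); });
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); // by reference
  (void)DT;
}
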
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index cfdfc40cd8aa..e1223d0d0337 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -14,7 +14,6 @@
#include "llvm/IR/Function.h"
#include "LLVMContextImpl.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -89,16 +88,24 @@ bool Argument::hasNonNullAttr() const {
/// in its containing function.
bool Argument::hasByValAttr() const {
if (!getType()->isPointerTy()) return false;
+ return hasAttribute(Attribute::ByVal);
+}
+
+bool Argument::hasSwiftSelfAttr() const {
+ return getParent()->getAttributes().
+ hasAttribute(getArgNo()+1, Attribute::SwiftSelf);
+}
+
+bool Argument::hasSwiftErrorAttr() const {
return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ByVal);
+ hasAttribute(getArgNo()+1, Attribute::SwiftError);
}
/// \brief Return true if this argument has the inalloca attribute on it in
/// its containing function.
bool Argument::hasInAllocaAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::InAlloca);
+ return hasAttribute(Attribute::InAlloca);
}
bool Argument::hasByValOrInAllocaAttr() const {
@@ -130,53 +137,46 @@ uint64_t Argument::getDereferenceableOrNullBytes() const {
/// it in its containing function.
bool Argument::hasNestAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::Nest);
+ return hasAttribute(Attribute::Nest);
}
/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
/// it in its containing function.
bool Argument::hasNoAliasAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::NoAlias);
+ return hasAttribute(Attribute::NoAlias);
}
/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
/// on it in its containing function.
bool Argument::hasNoCaptureAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::NoCapture);
+ return hasAttribute(Attribute::NoCapture);
}
/// hasSRetAttr - Return true if this argument has the sret attribute on
/// it in its containing function.
bool Argument::hasStructRetAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::StructRet);
+ return hasAttribute(Attribute::StructRet);
}
/// hasReturnedAttr - Return true if this argument has the returned attribute on
/// it in its containing function.
bool Argument::hasReturnedAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::Returned);
+ return hasAttribute(Attribute::Returned);
}
/// hasZExtAttr - Return true if this argument has the zext attribute on it in
/// its containing function.
bool Argument::hasZExtAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ZExt);
+ return hasAttribute(Attribute::ZExt);
}
/// hasSExtAttr Return true if this argument has the sext attribute on it in its
/// containing function.
bool Argument::hasSExtAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::SExt);
+ return hasAttribute(Attribute::SExt);
}
/// Return true if this argument has the readonly or readnone attribute on it
@@ -208,23 +208,32 @@ void Argument::removeAttr(AttributeSet AS) {
getArgNo() + 1, B));
}
+/// hasAttribute - Checks if an argument has a given attribute.
+bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
+ return getParent()->hasAttribute(getArgNo() + 1, Kind);
+}
+
//===----------------------------------------------------------------------===//
// Helper Methods in Function
//===----------------------------------------------------------------------===//
bool Function::isMaterializable() const {
- return getGlobalObjectSubClassData() & IsMaterializableBit;
+ return getGlobalObjectSubClassData() & (1 << IsMaterializableBit);
}
void Function::setIsMaterializable(bool V) {
- setGlobalObjectBit(IsMaterializableBit, V);
+ unsigned Mask = 1 << IsMaterializableBit;
+ setGlobalObjectSubClassData((~Mask & getGlobalObjectSubClassData()) |
+ (V ? Mask : 0u));
}
LLVMContext &Function::getContext() const {
return getType()->getContext();
}
-FunctionType *Function::getFunctionType() const { return Ty; }
+FunctionType *Function::getFunctionType() const {
+ return cast<FunctionType>(getValueType());
+}
bool Function::isVarArg() const {
return getFunctionType()->isVarArg();
@@ -249,8 +258,7 @@ void Function::eraseFromParent() {
Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
- OperandTraits<Function>::op_begin(this), 0, Linkage, name),
- Ty(Ty) {
+ OperandTraits<Function>::op_begin(this), 0, Linkage, name) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -295,6 +303,28 @@ void Function::BuildLazyArguments() const {
const_cast<Function*>(this)->setValueSubclassData(SDC &= ~(1<<0));
}
+void Function::stealArgumentListFrom(Function &Src) {
+ assert(isDeclaration() && "Expected no references to current arguments");
+
+ // Drop the current arguments, if any, and set the lazy argument bit.
+ if (!hasLazyArguments()) {
+ assert(llvm::all_of(ArgumentList,
+ [](const Argument &A) { return A.use_empty(); }) &&
+ "Expected arguments to be unused in declaration");
+ ArgumentList.clear();
+ setValueSubclassData(getSubclassDataFromValue() | (1 << 0));
+ }
+
+ // Nothing to steal if Src has lazy arguments.
+ if (Src.hasLazyArguments())
+ return;
+
+ // Steal arguments from Src, and fix the lazy argument bits.
+ ArgumentList.splice(ArgumentList.end(), Src.ArgumentList);
+ setValueSubclassData(getSubclassDataFromValue() & ~(1 << 0));
+ Src.setValueSubclassData(Src.getSubclassDataFromValue() | (1 << 0));
+}
+
size_t Function::arg_size() const {
return getFunctionType()->getNumParams();
}
@@ -317,8 +347,8 @@ void Function::setParent(Module *parent) {
void Function::dropAllReferences() {
setIsMaterializable(false);
- for (iterator I = begin(), E = end(); I != E; ++I)
- I->dropAllReferences();
+ for (BasicBlock &BB : *this)
+ BB.dropAllReferences();
// Delete all basic blocks. They are now unused, except possibly by
// blockaddresses, but BasicBlock's destructor takes care of those.
@@ -336,21 +366,39 @@ void Function::dropAllReferences() {
clearMetadata();
}
-void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
+void Function::addAttribute(unsigned i, Attribute::AttrKind Kind) {
AttributeSet PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, attr);
+ PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
-void Function::addAttributes(unsigned i, AttributeSet attrs) {
+void Function::addAttribute(unsigned i, Attribute Attr) {
AttributeSet PAL = getAttributes();
- PAL = PAL.addAttributes(getContext(), i, attrs);
+ PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
-void Function::removeAttributes(unsigned i, AttributeSet attrs) {
+void Function::addAttributes(unsigned i, AttributeSet Attrs) {
AttributeSet PAL = getAttributes();
- PAL = PAL.removeAttributes(getContext(), i, attrs);
+ PAL = PAL.addAttributes(getContext(), i, Attrs);
+ setAttributes(PAL);
+}
+
+void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+}
+
+void Function::removeAttribute(unsigned i, StringRef Kind) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+}
+
+void Function::removeAttributes(unsigned i, AttributeSet Attrs) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.removeAttributes(getContext(), i, Attrs);
setAttributes(PAL);
}
@@ -371,7 +419,7 @@ const std::string &Function::getGC() const {
return getContext().getGC(*this);
}
-void Function::setGC(const std::string Str) {
+void Function::setGC(std::string Str) {
setValueSubclassDataBit(14, !Str.empty());
getContext().setGC(*this, std::move(Str));
}
@@ -405,17 +453,58 @@ void Function::copyAttributesFrom(const GlobalValue *Src) {
setPrologueData(SrcF->getPrologueData());
}
+/// Table of string intrinsic names indexed by enum value.
+static const char * const IntrinsicNameTable[] = {
+ "not_intrinsic",
+#define GET_INTRINSIC_NAME_TABLE
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_NAME_TABLE
+};
+
+/// Table of per-target intrinsic name tables.
+#define GET_INTRINSIC_TARGET_DATA
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_TARGET_DATA
+
+/// Find the segment of \c IntrinsicNameTable for intrinsics with the same
+/// target as \c Name, or the generic table if \c Name is not target specific.
+///
+/// Returns the relevant slice of \c IntrinsicNameTable.
+static ArrayRef<const char *> findTargetSubtable(StringRef Name) {
+ assert(Name.startswith("llvm."));
+
+ ArrayRef<IntrinsicTargetInfo> Targets(TargetInfos);
+ // Drop "llvm." and take the first dotted component. That will be the target
+ // if this is target specific.
+ StringRef Target = Name.drop_front(5).split('.').first;
+ auto It = std::lower_bound(Targets.begin(), Targets.end(), Target,
+ [](const IntrinsicTargetInfo &TI,
+ StringRef Target) { return TI.Name < Target; });
+ // Either we have found the target or we fall back to the generic set, which
+ // is always first.
+ const auto &TI = It != Targets.end() && It->Name == Target ? *It : Targets[0];
+ return makeArrayRef(&IntrinsicNameTable[1] + TI.Offset, TI.Count);
+}
+
/// \brief This does the actual lookup of an intrinsic ID which
/// matches the given function name.
static Intrinsic::ID lookupIntrinsicID(const ValueName *ValName) {
- unsigned Len = ValName->getKeyLength();
- const char *Name = ValName->getKeyData();
+ StringRef Name = ValName->getKey();
-#define GET_FUNCTION_RECOGNIZER
-#include "llvm/IR/Intrinsics.gen"
-#undef GET_FUNCTION_RECOGNIZER
+ ArrayRef<const char *> NameTable = findTargetSubtable(Name);
+ int Idx = Intrinsic::lookupLLVMIntrinsicByName(NameTable, Name);
+ if (Idx == -1)
+ return Intrinsic::not_intrinsic;
+
+ // Intrinsic IDs correspond to the location in IntrinsicNameTable, but we have
+ // an index into a sub-table.
+ int Adjust = NameTable.data() - IntrinsicNameTable;
+ Intrinsic::ID ID = static_cast<Intrinsic::ID>(Idx + Adjust);
- return Intrinsic::not_intrinsic;
+ // If the intrinsic is not overloaded, require an exact match. If it is
+ // overloaded, require a prefix match.
+ bool IsPrefixMatch = Name.size() > strlen(NameTable[Idx]);
+ return IsPrefixMatch == isOverloaded(ID) ? ID : Intrinsic::not_intrinsic;
}
void Function::recalculateIntrinsicID() {
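
The GET_FUNCTION_RECOGNIZER matcher is gone; lookupIntrinsicID now binary-searches a sorted name table sliced per target, and a prefix match is accepted only for overloaded intrinsics. A minimal round-trip sketch of the behaviour this enables, with an illustrative module and type list:

#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

void roundTrip(Module &M) {
  LLVMContext &Ctx = M.getContext();
  // llvm.memcpy is overloaded, so its mangled name (llvm.memcpy.p0i8.p0i8.i64)
  // only prefix-matches the table entry, yet still resolves to the right ID.
  Type *Tys[] = {Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx),
                 Type::getInt64Ty(Ctx)};
  Function *Decl = Intrinsic::getDeclaration(&M, Intrinsic::memcpy, Tys);
  assert(Decl->getIntrinsicID() == Intrinsic::memcpy);
  (void)Decl;
}
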
@@ -470,17 +559,9 @@ static std::string getMangledTypeStr(Type* Ty) {
std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
assert(id < num_intrinsics && "Invalid intrinsic ID!");
- static const char * const Table[] = {
- "not_intrinsic",
-#define GET_INTRINSIC_NAME_TABLE
-#include "llvm/IR/Intrinsics.gen"
-#undef GET_INTRINSIC_NAME_TABLE
- };
- if (Tys.empty())
- return Table[id];
- std::string Result(Table[id]);
- for (unsigned i = 0; i < Tys.size(); ++i) {
- Result += "." + getMangledTypeStr(Tys[i]);
+ std::string Result(IntrinsicNameTable[id]);
+ for (Type *Ty : Tys) {
+ Result += "." + getMangledTypeStr(Ty);
}
return Result;
}
@@ -867,6 +948,198 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
#include "llvm/IR/Intrinsics.gen"
#undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
+bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
+ SmallVectorImpl<Type*> &ArgTys) {
+ using namespace Intrinsic;
+
+ // If we ran out of descriptors, there are too many arguments.
+ if (Infos.empty()) return true;
+ IITDescriptor D = Infos.front();
+ Infos = Infos.slice(1);
+
+ switch (D.Kind) {
+ case IITDescriptor::Void: return !Ty->isVoidTy();
+ case IITDescriptor::VarArg: return true;
+ case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
+ case IITDescriptor::Token: return !Ty->isTokenTy();
+ case IITDescriptor::Metadata: return !Ty->isMetadataTy();
+ case IITDescriptor::Half: return !Ty->isHalfTy();
+ case IITDescriptor::Float: return !Ty->isFloatTy();
+ case IITDescriptor::Double: return !Ty->isDoubleTy();
+ case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
+ case IITDescriptor::Vector: {
+ VectorType *VT = dyn_cast<VectorType>(Ty);
+ return !VT || VT->getNumElements() != D.Vector_Width ||
+ matchIntrinsicType(VT->getElementType(), Infos, ArgTys);
+ }
+ case IITDescriptor::Pointer: {
+ PointerType *PT = dyn_cast<PointerType>(Ty);
+ return !PT || PT->getAddressSpace() != D.Pointer_AddressSpace ||
+ matchIntrinsicType(PT->getElementType(), Infos, ArgTys);
+ }
+
+ case IITDescriptor::Struct: {
+ StructType *ST = dyn_cast<StructType>(Ty);
+ if (!ST || ST->getNumElements() != D.Struct_NumElements)
+ return true;
+
+ for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+ if (matchIntrinsicType(ST->getElementType(i), Infos, ArgTys))
+ return true;
+ return false;
+ }
+
+ case IITDescriptor::Argument:
+ // Two cases here: if this is the second occurrence of an argument, verify
+ // that the later instance matches the previous one.
+ if (D.getArgumentNumber() < ArgTys.size())
+ return Ty != ArgTys[D.getArgumentNumber()];
+
+ // Otherwise, if this is the first instance of an argument, record it and
+ // verify the "Any" kind.
+ assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error");
+ ArgTys.push_back(Ty);
+
+ switch (D.getArgumentKind()) {
+ case IITDescriptor::AK_Any: return false; // Success
+ case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
+ case IITDescriptor::AK_AnyFloat: return !Ty->isFPOrFPVectorTy();
+ case IITDescriptor::AK_AnyVector: return !isa<VectorType>(Ty);
+ case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty);
+ }
+ llvm_unreachable("all argument kinds not covered");
+
+ case IITDescriptor::ExtendArgument: {
+ // This may only be used when referring to a previous vector argument.
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return true;
+
+ Type *NewTy = ArgTys[D.getArgumentNumber()];
+ if (VectorType *VTy = dyn_cast<VectorType>(NewTy))
+ NewTy = VectorType::getExtendedElementVectorType(VTy);
+ else if (IntegerType *ITy = dyn_cast<IntegerType>(NewTy))
+ NewTy = IntegerType::get(ITy->getContext(), 2 * ITy->getBitWidth());
+ else
+ return true;
+
+ return Ty != NewTy;
+ }
+ case IITDescriptor::TruncArgument: {
+ // This may only be used when referring to a previous vector argument.
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return true;
+
+ Type *NewTy = ArgTys[D.getArgumentNumber()];
+ if (VectorType *VTy = dyn_cast<VectorType>(NewTy))
+ NewTy = VectorType::getTruncatedElementVectorType(VTy);
+ else if (IntegerType *ITy = dyn_cast<IntegerType>(NewTy))
+ NewTy = IntegerType::get(ITy->getContext(), ITy->getBitWidth() / 2);
+ else
+ return true;
+
+ return Ty != NewTy;
+ }
+ case IITDescriptor::HalfVecArgument:
+ // This may only be used when referring to a previous vector argument.
+ return D.getArgumentNumber() >= ArgTys.size() ||
+ !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getHalfElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::SameVecWidthArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return true;
+ VectorType * ReferenceType =
+ dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+ VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
+ if (!ThisArgType || !ReferenceType ||
+ (ReferenceType->getVectorNumElements() !=
+ ThisArgType->getVectorNumElements()))
+ return true;
+ return matchIntrinsicType(ThisArgType->getVectorElementType(),
+ Infos, ArgTys);
+ }
+ case IITDescriptor::PtrToArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return true;
+ Type * ReferenceType = ArgTys[D.getArgumentNumber()];
+ PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
+ return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
+ }
+ case IITDescriptor::VecOfPtrsToElt: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return true;
+ VectorType * ReferenceType =
+ dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+ VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
+ if (!ThisArgVecTy || !ReferenceType ||
+ (ReferenceType->getVectorNumElements() !=
+ ThisArgVecTy->getVectorNumElements()))
+ return true;
+ PointerType *ThisArgEltTy =
+ dyn_cast<PointerType>(ThisArgVecTy->getVectorElementType());
+ if (!ThisArgEltTy)
+ return true;
+ return ThisArgEltTy->getElementType() !=
+ ReferenceType->getVectorElementType();
+ }
+ }
+ llvm_unreachable("unhandled");
+}
+
+bool
+Intrinsic::matchIntrinsicVarArg(bool isVarArg,
+ ArrayRef<Intrinsic::IITDescriptor> &Infos) {
+ // If there are no descriptors left, then it can't be a vararg.
+ if (Infos.empty())
+ return isVarArg;
+
+ // There should be only one descriptor remaining at this point.
+ if (Infos.size() != 1)
+ return true;
+
+ // Check and verify the descriptor.
+ IITDescriptor D = Infos.front();
+ Infos = Infos.slice(1);
+ if (D.Kind == IITDescriptor::VarArg)
+ return !isVarArg;
+
+ return true;
+}
+
+Optional<Function*> Intrinsic::remangleIntrinsicFunction(Function *F) {
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (!ID)
+ return None;
+
+ FunctionType *FTy = F->getFunctionType();
+ // Accumulate an array of overloaded types for the given intrinsic
+ SmallVector<Type *, 4> ArgTys;
+ {
+ SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(ID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ // If we hit any problems matching the signature against the descriptor, just
+ // give up on remangling; it's up to the verifier to report the discrepancy.
+ if (Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, ArgTys))
+ return None;
+ for (auto Ty : FTy->params())
+ if (Intrinsic::matchIntrinsicType(Ty, TableRef, ArgTys))
+ return None;
+ if (Intrinsic::matchIntrinsicVarArg(FTy->isVarArg(), TableRef))
+ return None;
+ }
+
+ StringRef Name = F->getName();
+ if (Name == Intrinsic::getName(ID, ArgTys))
+ return None;
+
+ auto NewDecl = Intrinsic::getDeclaration(F->getParent(), ID, ArgTys);
+ NewDecl->setCallingConv(F->getCallingConv());
+ assert(NewDecl->getFunctionType() == FTy && "Shouldn't change the signature");
+ return NewDecl;
+}
+
/// hasAddressTaken - returns true if there are any uses of this function
/// other than direct calls or invokes to it.
bool Function::hasAddressTaken(const User* *PutOffender) const {
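
matchIntrinsicType and matchIntrinsicVarArg now live here so that remangleIntrinsicFunction can recover a declaration's overloaded parameter types and compare the expected mangled name against the actual one, returning a fresh declaration only when they differ. A hedged fragment of the intended call pattern; how callers forward uses of the stale declaration is an assumption, not part of the patch:

#include "llvm/ADT/Optional.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

void fixupIntrinsicName(Function *F) {
  if (Optional<Function *> Remangled = Intrinsic::remangleIntrinsicFunction(F)) {
    // Same signature, corrected name: forward uses and drop the stale decl.
    F->replaceAllUsesWith(*Remangled);
    F->eraseFromParent();
  }
}
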
@@ -874,11 +1147,17 @@ bool Function::hasAddressTaken(const User* *PutOffender) const {
const User *FU = U.getUser();
if (isa<BlockAddress>(FU))
continue;
- if (!isa<CallInst>(FU) && !isa<InvokeInst>(FU))
- return PutOffender ? (*PutOffender = FU, true) : true;
+ if (!isa<CallInst>(FU) && !isa<InvokeInst>(FU)) {
+ if (PutOffender)
+ *PutOffender = FU;
+ return true;
+ }
ImmutableCallSite CS(cast<Instruction>(FU));
- if (!CS.isCallee(&U))
- return PutOffender ? (*PutOffender = FU, true) : true;
+ if (!CS.isCallee(&U)) {
+ if (PutOffender)
+ *PutOffender = FU;
+ return true;
+ }
}
return false;
}
diff --git a/lib/IR/FunctionInfo.cpp b/lib/IR/FunctionInfo.cpp
deleted file mode 100644
index 17a67bcf0472..000000000000
--- a/lib/IR/FunctionInfo.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===-- FunctionInfo.cpp - Function Info Index ----------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the function info index and summary classes for the
-// IR library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/FunctionInfo.h"
-#include "llvm/ADT/StringMap.h"
-using namespace llvm;
-
-// Create the combined function index/summary from multiple
-// per-module instances.
-void FunctionInfoIndex::mergeFrom(std::unique_ptr<FunctionInfoIndex> Other,
- uint64_t NextModuleId) {
-
- StringRef ModPath;
- for (auto &OtherFuncInfoLists : *Other) {
- std::string FuncName = OtherFuncInfoLists.getKey();
- FunctionInfoList &List = OtherFuncInfoLists.second;
-
- // Assert that the func info list only has one entry, since we shouldn't
- // have duplicate names within a single per-module index.
- assert(List.size() == 1);
- std::unique_ptr<FunctionInfo> Info = std::move(List.front());
-
- // Skip if there was no function summary section.
- if (!Info->functionSummary())
- continue;
-
- // Add the module path string ref for this module if we haven't already
- // saved a reference to it.
- if (ModPath.empty())
- ModPath =
- addModulePath(Info->functionSummary()->modulePath(), NextModuleId);
- else
- assert(ModPath == Info->functionSummary()->modulePath() &&
- "Each module in the combined map should have a unique ID");
-
- // Note the module path string ref was copied above and is still owned by
- // the original per-module index. Reset it to the new module path
- // string reference owned by the combined index.
- Info->functionSummary()->setModulePath(ModPath);
-
- // If it is a local function, rename it.
- if (Info->functionSummary()->isLocalFunction()) {
- // Any local functions are virtually renamed when being added to the
- // combined index map, to disambiguate from other functions with
- // the same name. The symbol table created for the combined index
- // file should contain the renamed symbols.
- FuncName =
- FunctionInfoIndex::getGlobalNameForLocal(FuncName, NextModuleId);
- }
-
- // Add new function info to existing list. There may be duplicates when
- // combining FunctionMap entries, due to COMDAT functions. Any local
- // functions were virtually renamed above.
- addFunctionInfo(FuncName, std::move(Info));
- }
-}
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index 35b8157751b6..a9f7f45ee305 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <system_error>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -104,7 +105,7 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
}
/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
-void GCOVFile::dump() const {
+LLVM_DUMP_METHOD void GCOVFile::dump() const {
for (const auto &FPtr : Functions)
FPtr->dump();
}
@@ -247,10 +248,12 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
/// readGCDA - Read a function from the GCDA buffer. Return false if an error
/// occurs.
bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
- uint32_t Dummy;
- if (!Buff.readInt(Dummy))
+ uint32_t HeaderLength;
+ if (!Buff.readInt(HeaderLength))
return false; // Function header length
+ uint64_t EndPos = Buff.getCursor() + HeaderLength * sizeof(uint32_t);
+
uint32_t GCDAIdent;
if (!Buff.readInt(GCDAIdent))
return false;
@@ -280,13 +283,15 @@ bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
}
}
- StringRef GCDAName;
- if (!Buff.readString(GCDAName))
- return false;
- if (Name != GCDAName) {
- errs() << "Function names do not match: " << Name << " != " << GCDAName
- << ".\n";
- return false;
+ if (Buff.getCursor() < EndPos) {
+ StringRef GCDAName;
+ if (!Buff.readString(GCDAName))
+ return false;
+ if (Name != GCDAName) {
+ errs() << "Function names do not match: " << Name << " != " << GCDAName
+ << ".\n";
+ return false;
+ }
}
if (!Buff.readArcTag()) {
@@ -340,7 +345,7 @@ uint64_t GCOVFunction::getExitCount() const {
}
/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
-void GCOVFunction::dump() const {
+LLVM_DUMP_METHOD void GCOVFunction::dump() const {
dbgs() << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
<< LineNumber << "\n";
for (const auto &Block : Blocks)
@@ -397,7 +402,7 @@ void GCOVBlock::collectLineCounts(FileInfo &FI) {
}
/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
-void GCOVBlock::dump() const {
+LLVM_DUMP_METHOD void GCOVBlock::dump() const {
dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
if (!SrcEdges.empty()) {
dbgs() << "\tSource Edges : ";
@@ -496,7 +501,7 @@ public:
OS << format("%5u:", LineNum) << Line << "\n";
}
};
-}
+} // end anonymous namespace
/// Convert a path to a gcov filename. If PreservePaths is true, this
/// translates "/" to "#", ".." to "^", and drops ".", to match gcov.
@@ -683,7 +688,6 @@ void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
if (Options.FuncCoverage)
printFuncCoverage(InfoOS);
printFileCoverage(InfoOS);
- return;
}
/// printFunctionSummary - Print function and block summary.
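
The readGCDA change above stops assuming a fixed record layout: it reads the function header length, derives EndPos from it, and parses the optional function-name field only while the cursor is still inside the record. Below is a self-contained sketch of that length-bounded pattern; Reader, readUint32 and readRecord are illustrative names, not the GCOVBuffer API.

#include <cstdint>
#include <cstring>

struct Reader {
  const uint8_t *Data;
  std::size_t Size;
  std::size_t Cursor = 0;

  bool readUint32(uint32_t &Out) {
    if (Cursor + 4 > Size)
      return false;                 // would run past the buffer
    std::memcpy(&Out, Data + Cursor, 4);
    Cursor += 4;
    return true;
  }
};

bool readRecord(Reader &R) {
  uint32_t LenInWords;
  if (!R.readUint32(LenInWords))    // record header: length in 32-bit words
    return false;
  std::size_t EndPos = R.Cursor + LenInWords * sizeof(uint32_t);

  uint32_t Ident;
  if (!R.readUint32(Ident))         // mandatory field
    return false;

  if (R.Cursor < EndPos) {          // optional field, present only when the
    uint32_t Extra;                 // header says the record is long enough
    if (!R.readUint32(Extra))
      return false;
  }
  return true;
}
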
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index a61b62bd9687..6715484518a9 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -43,7 +43,7 @@ void GlobalValue::destroyConstantImpl() {
llvm_unreachable("You can't GV->destroyConstantImpl()!");
}
-Value *GlobalValue::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
+Value *GlobalValue::handleOperandChangeImpl(Value *From, Value *To) {
llvm_unreachable("Unsupported class for handleOperandChange()!");
}
@@ -51,7 +51,7 @@ Value *GlobalValue::handleOperandChangeImpl(Value *From, Value *To, Use *U) {
/// create a GlobalValue) from the GlobalValue Src to this one.
void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setVisibility(Src->getVisibility());
- setUnnamedAddr(Src->hasUnnamedAddr());
+ setUnnamedAddr(Src->getUnnamedAddr());
setDLLStorageClass(Src->getDLLStorageClass());
}
@@ -81,13 +81,13 @@ void GlobalObject::setAlignment(unsigned Align) {
unsigned GlobalObject::getGlobalObjectSubClassData() const {
unsigned ValueData = getGlobalValueSubClassData();
- return ValueData >> AlignmentBits;
+ return ValueData >> GlobalObjectBits;
}
void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
unsigned OldData = getGlobalValueSubClassData();
- setGlobalValueSubClassData((OldData & AlignmentMask) |
- (Val << AlignmentBits));
+ setGlobalValueSubClassData((OldData & GlobalObjectMask) |
+ (Val << GlobalObjectBits));
assert(getGlobalObjectSubClassData() == Val && "representation error");
}
@@ -99,7 +99,36 @@ void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
}
}
-const char *GlobalValue::getSection() const {
+std::string GlobalValue::getGlobalIdentifier(StringRef Name,
+ GlobalValue::LinkageTypes Linkage,
+ StringRef FileName) {
+
+ // Value names may be prefixed with a binary '1' to indicate
+ // that the backend should not modify the symbols due to any platform
+ // naming convention. Do not include that '1' in the PGO profile name.
+ if (Name[0] == '\1')
+ Name = Name.substr(1);
+
+ std::string NewName = Name;
+ if (llvm::GlobalValue::isLocalLinkage(Linkage)) {
+ // For local symbols, prepend the main file name to distinguish them.
+ // Do not include the full path in the file name since there's no guarantee
+ // that it will stay the same, e.g., if the files are checked out from
+ // version control in different locations.
+ if (FileName.empty())
+ NewName = NewName.insert(0, "<unknown>:");
+ else
+ NewName = NewName.insert(0, FileName.str() + ":");
+ }
+ return NewName;
+}
+
+std::string GlobalValue::getGlobalIdentifier() const {
+ return getGlobalIdentifier(getName(), getLinkage(),
+ getParent()->getSourceFileName());
+}
+
+StringRef GlobalValue::getSection() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
if (const GlobalObject *GO = GA->getBaseObject())
@@ -116,10 +145,18 @@ Comdat *GlobalValue::getComdat() {
return const_cast<GlobalObject *>(GO)->getComdat();
return nullptr;
}
+ // ifunc and its resolver are separate things so don't use resolver comdat.
+ if (isa<GlobalIFunc>(this))
+ return nullptr;
return cast<GlobalObject>(this)->getComdat();
}
-void GlobalObject::setSection(StringRef S) { Section = S; }
+void GlobalObject::setSection(StringRef S) {
+ Section = S;
+
+ // The C api requires this to be null terminated.
+ Section.c_str();
+}
bool GlobalValue::isDeclaration() const {
// Globals are definitions if they have an initializer.
@@ -130,8 +167,8 @@ bool GlobalValue::isDeclaration() const {
if (const Function *F = dyn_cast<Function>(this))
return F->empty() && !F->isMaterializable();
- // Aliases are always definitions.
- assert(isa<GlobalAlias>(this));
+ // Aliases and ifuncs are always definitions.
+ assert(isa<GlobalIndirectSymbol>(this));
return false;
}
@@ -242,7 +279,7 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
setGlobalVariableNumOperands(0);
}
} else {
- assert(InitVal->getType() == getType()->getElementType() &&
+ assert(InitVal->getType() == getValueType() &&
"Initializer type must match GlobalVariable type");
// Note, the num operands is used to compute the offset of the operand, so
// the order here matters. We need to set num operands to 1 first so that
@@ -263,6 +300,22 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
}
}
+void GlobalVariable::dropAllReferences() {
+ User::dropAllReferences();
+ clearMetadata();
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalIndirectSymbol Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalIndirectSymbol::GlobalIndirectSymbol(Type *Ty, ValueTy VTy,
+ unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name,
+ Constant *Symbol)
+ : GlobalValue(Ty, VTy, &Op<0>(), 1, Linkage, Name, AddressSpace) {
+ Op<0>() = Symbol;
+}
+
//===----------------------------------------------------------------------===//
// GlobalAlias Implementation
@@ -271,10 +324,8 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Aliasee,
Module *ParentModule)
- : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name,
- AddressSpace) {
- Op<0>() = Aliasee;
-
+ : GlobalIndirectSymbol(Ty, Value::GlobalAliasVal, AddressSpace, Link, Name,
+ Aliasee) {
if (ParentModule)
ParentModule->getAliasList().push_back(this);
}
@@ -323,5 +374,36 @@ void GlobalAlias::eraseFromParent() {
void GlobalAlias::setAliasee(Constant *Aliasee) {
assert((!Aliasee || Aliasee->getType() == getType()) &&
"Alias and aliasee types should match!");
- setOperand(0, Aliasee);
+ setIndirectSymbol(Aliasee);
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalIFunc Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
+ const Twine &Name, Constant *Resolver,
+ Module *ParentModule)
+ : GlobalIndirectSymbol(Ty, Value::GlobalIFuncVal, AddressSpace, Link, Name,
+ Resolver) {
+ if (ParentModule)
+ ParentModule->getIFuncList().push_back(this);
+}
+
+GlobalIFunc *GlobalIFunc::create(Type *Ty, unsigned AddressSpace,
+ LinkageTypes Link, const Twine &Name,
+ Constant *Resolver, Module *ParentModule) {
+ return new GlobalIFunc(Ty, AddressSpace, Link, Name, Resolver, ParentModule);
+}
+
+void GlobalIFunc::setParent(Module *parent) {
+ Parent = parent;
+}
+
+void GlobalIFunc::removeFromParent() {
+ getParent()->getIFuncList().remove(getIterator());
+}
+
+void GlobalIFunc::eraseFromParent() {
+ getParent()->getIFuncList().erase(getIterator());
}
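
getGlobalIdentifier above builds the name used for cross-module lookups such as PGO profiles: local-linkage symbols get the defining file name as a prefix so identically named statics from different translation units stay distinct. A short usage sketch with made-up symbol and file names:

#include "llvm/IR/GlobalValue.h"
#include <string>

std::string exampleIdentifiers() {
  using llvm::GlobalValue;
  // Internal linkage: yields "foo.cpp:bar", disambiguated by the file name.
  std::string LocalId = GlobalValue::getGlobalIdentifier(
      "bar", GlobalValue::InternalLinkage, "foo.cpp");
  // External linkage: yields plain "bar"; the name is already unique.
  std::string GlobalId = GlobalValue::getGlobalIdentifier(
      "bar", GlobalValue::ExternalLinkage, "foo.cpp");
  return LocalId + " and " + GlobalId;
}
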
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index 447412936335..298331d51c88 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -34,7 +34,7 @@ GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
StrConstant, Name, nullptr,
GlobalVariable::NotThreadLocal,
AddressSpace);
- GV->setUnnamedAddr(true);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
return GV;
}
@@ -201,52 +201,113 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
return createCallHelper(FnAssume, Ops, this);
}
-/// Create a call to a Masked Load intrinsic.
-/// Ptr - the base pointer for the load
-/// Align - alignment of the source location
-/// Mask - an vector of booleans which indicates what vector lanes should
-/// be accessed in memory
-/// PassThru - a pass-through value that is used to fill the masked-off lanes
-/// of the result
-/// Name - name of the result variable
+/// \brief Create a call to a Masked Load intrinsic.
+/// \p Ptr - base pointer for the load
+/// \p Align - alignment of the source location
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+/// \p PassThru - pass-through value that is used to fill the masked-off lanes
+/// of the result
+/// \p Name - name of the result variable
CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
Value *Mask, Value *PassThru,
const Twine &Name) {
- assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type");
- // DataTy is the overloaded type
- Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
+ PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+ Type *DataTy = PtrTy->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
if (!PassThru)
PassThru = UndefValue::get(DataTy);
+ Type *OverloadedTypes[] = { DataTy, PtrTy };
Value *Ops[] = { Ptr, getInt32(Align), Mask, PassThru};
- return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
+ return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops,
+ OverloadedTypes, Name);
}
-/// Create a call to a Masked Store intrinsic.
-/// Val - the data to be stored,
-/// Ptr - the base pointer for the store
-/// Align - alignment of the destination location
-/// Mask - an vector of booleans which indicates what vector lanes should
-/// be accessed in memory
+/// \brief Create a call to a Masked Store intrinsic.
+/// \p Val - data to be stored,
+/// \p Ptr - base pointer for the store
+/// \p Align - alignment of the destination location
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
unsigned Align, Value *Mask) {
+ PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+ Type *DataTy = PtrTy->getElementType();
+ assert(DataTy->isVectorTy() && "Ptr should point to a vector");
+ Type *OverloadedTypes[] = { DataTy, PtrTy };
Value *Ops[] = { Val, Ptr, getInt32(Align), Mask };
- // Type of the data to be stored - the only one overloaded type
- return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, Val->getType());
+ return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
}
/// Create a call to a Masked intrinsic, with given intrinsic Id,
-/// an array of operands - Ops, and one overloaded type - DataTy
+/// an array of operands - Ops, and an array of overloaded types -
+/// OverloadedTypes.
CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
ArrayRef<Value *> Ops,
- Type *DataTy,
+ ArrayRef<Type *> OverloadedTypes,
const Twine &Name) {
Module *M = BB->getParent()->getParent();
- Type *OverloadedTypes[] = { DataTy };
Value *TheFn = Intrinsic::getDeclaration(M, Id, OverloadedTypes);
return createCallHelper(TheFn, Ops, this, Name);
}
+/// \brief Create a call to a Masked Gather intrinsic.
+/// \p Ptrs - vector of pointers for loading
+/// \p Align - alignment for one element
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+/// \p PassThru - pass-through value that is used to fill the masked-off lanes
+/// of the result
+/// \p Name - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
+ Value *Mask, Value *PassThru,
+ const Twine& Name) {
+ auto PtrsTy = cast<VectorType>(Ptrs->getType());
+ auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
+ unsigned NumElts = PtrsTy->getVectorNumElements();
+ Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts);
+
+ if (!Mask)
+ Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
+ NumElts));
+
+ Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
+
+ // We specify only one type when we create this intrinsic. Types of other
+ // arguments are derived from this type.
+ return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name);
+}
+
+/// \brief Create a call to a Masked Scatter intrinsic.
+/// \p Data - data to be stored,
+/// \p Ptrs - the vector of pointers, where the \p Data elements should be
+/// stored
+/// \p Align - alignment for one element
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
+ unsigned Align, Value *Mask) {
+ auto PtrsTy = cast<VectorType>(Ptrs->getType());
+ auto DataTy = cast<VectorType>(Data->getType());
+ unsigned NumElts = PtrsTy->getVectorNumElements();
+
+#ifndef NDEBUG
+ auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
+ assert(NumElts == DataTy->getVectorNumElements() &&
+ PtrTy->getElementType() == DataTy->getElementType() &&
+ "Incompatible pointer and data types");
+#endif
+
+ if (!Mask)
+ Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
+ NumElts));
+ Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};
+
+ // We specify only one type when we create this intrinsic. Types of other
+ // arguments are derived from this type.
+ return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy });
+}
+
template <typename T0, typename T1, typename T2, typename T3>
static std::vector<Value *>
getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
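
The new gather/scatter builders above derive the memory type from the pointer-vector element type and fall back to an all-ones mask when none is given. A hedged usage sketch, not taken from the patch; it assumes Ptrs is a vector of pointers such as <4 x i32*> and Data a matching <4 x i32> value.

#include "llvm/IR/IRBuilder.h"

llvm::Value *emitGatherScatter(llvm::IRBuilderBase &B, llvm::Value *Ptrs,
                               llvm::Value *Data) {
  // Load one lane through each pointer; a null mask means "all lanes on".
  llvm::CallInst *Loaded =
      B.CreateMaskedGather(Ptrs, /*Align=*/4, /*Mask=*/nullptr,
                           /*PassThru=*/nullptr, "gathered");
  // Store Data back through the same pointers, again unmasked.
  B.CreateMaskedScatter(Data, Ptrs, /*Align=*/4, /*Mask=*/nullptr);
  return Loaded;
}
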
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index 822dbeb08b33..4d2f9b98911b 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -26,7 +26,7 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
: OS(OS), Banner(Banner),
ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {}
-PreservedAnalyses PrintModulePass::run(Module &M) {
+PreservedAnalyses PrintModulePass::run(Module &M, AnalysisManager<Module> &) {
OS << Banner;
if (llvm::isFunctionInPrintList("*"))
M.print(OS, nullptr, ShouldPreserveUseListOrder);
@@ -42,7 +42,8 @@ PrintFunctionPass::PrintFunctionPass() : OS(dbgs()) {}
PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner)
: OS(OS), Banner(Banner) {}
-PreservedAnalyses PrintFunctionPass::run(Function &F) {
+PreservedAnalyses PrintFunctionPass::run(Function &F,
+ AnalysisManager<Function> &) {
if (isFunctionInPrintList(F.getName()))
OS << Banner << static_cast<Value &>(F);
return PreservedAnalyses::all();
@@ -61,7 +62,8 @@ public:
: ModulePass(ID), P(OS, Banner, ShouldPreserveUseListOrder) {}
bool runOnModule(Module &M) override {
- P.run(M);
+ ModuleAnalysisManager DummyMAM;
+ P.run(M, DummyMAM);
return false;
}
@@ -81,7 +83,8 @@ public:
// This pass just prints a banner followed by the function as it's processed.
bool runOnFunction(Function &F) override {
- P.run(F);
+ FunctionAnalysisManager DummyFAM;
+ P.run(F, DummyFAM);
return false;
}
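
With run() now taking an analysis manager, the printing passes can be driven directly under the new pass manager; the legacy wrappers simply hand in a throwaway manager. A minimal sketch of doing the same by hand (banner text is illustrative):

#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

void printModule(llvm::Module &M) {
  // An empty manager suffices: the printing pass queries no analyses,
  // which is exactly what the legacy wrapper above relies on.
  llvm::ModuleAnalysisManager MAM;
  llvm::PrintModulePass Printer(llvm::errs(), "; module dump\n",
                                /*ShouldPreserveUseListOrder=*/false);
  Printer.run(M, MAM);
}
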
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 15d3b830b8fc..d6cf8c543dbd 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -142,14 +142,14 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
// Find the end of the register name.
StringRef::iterator ConstraintEnd = std::find(I+1, E, '}');
if (ConstraintEnd == E) return true; // "{foo"
- pCodes->push_back(std::string(I, ConstraintEnd+1));
+ pCodes->push_back(StringRef(I, ConstraintEnd+1 - I));
I = ConstraintEnd+1;
} else if (isdigit(static_cast<unsigned char>(*I))) { // Matching Constraint
// Maximal munch numbers.
StringRef::iterator NumStart = I;
while (I != E && isdigit(static_cast<unsigned char>(*I)))
++I;
- pCodes->push_back(std::string(NumStart, I));
+ pCodes->push_back(StringRef(NumStart, I - NumStart));
unsigned N = atoi(pCodes->back().c_str());
// Check that this is a valid matching constraint!
if (N >= ConstraintsSoFar.size() || ConstraintsSoFar[N].Type != isOutput||
@@ -183,11 +183,11 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
} else if (*I == '^') {
// Multi-letter constraint
// FIXME: For now assuming these are 2-character constraints.
- pCodes->push_back(std::string(I+1, I+3));
+ pCodes->push_back(StringRef(I+1, 2));
I += 3;
} else {
// Single letter constraint.
- pCodes->push_back(std::string(I, I+1));
+ pCodes->push_back(StringRef(I, 1));
++I;
}
}
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 4b33d2e66ea1..ed08f85c60b6 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -96,6 +96,30 @@ void Instruction::moveBefore(Instruction *MovePos) {
MovePos->getIterator(), getParent()->getInstList(), getIterator());
}
+void Instruction::setHasNoUnsignedWrap(bool b) {
+ cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(b);
+}
+
+void Instruction::setHasNoSignedWrap(bool b) {
+ cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(b);
+}
+
+void Instruction::setIsExact(bool b) {
+ cast<PossiblyExactOperator>(this)->setIsExact(b);
+}
+
+bool Instruction::hasNoUnsignedWrap() const {
+ return cast<OverflowingBinaryOperator>(this)->hasNoUnsignedWrap();
+}
+
+bool Instruction::hasNoSignedWrap() const {
+ return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
+}
+
+bool Instruction::isExact() const {
+ return cast<PossiblyExactOperator>(this)->isExact();
+}
+
/// Set or clear the unsafe-algebra flag on this instruction, which must be an
/// operator which supports this flag. See LangRef.html for the meaning of this
/// flag.
@@ -190,6 +214,54 @@ void Instruction::copyFastMathFlags(const Instruction *I) {
copyFastMathFlags(I->getFastMathFlags());
}
+void Instruction::copyIRFlags(const Value *V) {
+ // Copy the wrapping flags.
+ if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
+ if (isa<OverflowingBinaryOperator>(this)) {
+ setHasNoSignedWrap(OB->hasNoSignedWrap());
+ setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
+ }
+ }
+
+ // Copy the exact flag.
+ if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
+ if (isa<PossiblyExactOperator>(this))
+ setIsExact(PE->isExact());
+
+ // Copy the fast-math flags.
+ if (auto *FP = dyn_cast<FPMathOperator>(V))
+ if (isa<FPMathOperator>(this))
+ copyFastMathFlags(FP->getFastMathFlags());
+
+ if (auto *SrcGEP = dyn_cast<GetElementPtrInst>(V))
+ if (auto *DestGEP = dyn_cast<GetElementPtrInst>(this))
+ DestGEP->setIsInBounds(SrcGEP->isInBounds() | DestGEP->isInBounds());
+}
+
+void Instruction::andIRFlags(const Value *V) {
+ if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
+ if (isa<OverflowingBinaryOperator>(this)) {
+ setHasNoSignedWrap(hasNoSignedWrap() & OB->hasNoSignedWrap());
+ setHasNoUnsignedWrap(hasNoUnsignedWrap() & OB->hasNoUnsignedWrap());
+ }
+ }
+
+ if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
+ if (isa<PossiblyExactOperator>(this))
+ setIsExact(isExact() & PE->isExact());
+
+ if (auto *FP = dyn_cast<FPMathOperator>(V)) {
+ if (isa<FPMathOperator>(this)) {
+ FastMathFlags FM = getFastMathFlags();
+ FM &= FP->getFastMathFlags();
+ copyFastMathFlags(FM);
+ }
+ }
+
+ if (auto *SrcGEP = dyn_cast<GetElementPtrInst>(V))
+ if (auto *DestGEP = dyn_cast<GetElementPtrInst>(this))
+ DestGEP->setIsInBounds(SrcGEP->isInBounds() & DestGEP->isInBounds());
+}
const char *Instruction::getOpcodeName(unsigned OpCode) {
switch (OpCode) {
@@ -271,13 +343,18 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
}
}
-/// Return true if both instructions have the same special state
-/// This must be kept in sync with lib/Transforms/IPO/MergeFunctions.cpp.
+/// Return true if both instructions have the same special state. This must be
+/// kept in sync with FunctionComparator::cmpOperations in
+/// lib/Transforms/IPO/MergeFunctions.cpp.
static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
bool IgnoreAlignment = false) {
assert(I1->getOpcode() == I2->getOpcode() &&
"Can not compare special state of different instructions");
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(I1))
+ return AI->getAllocatedType() == cast<AllocaInst>(I2)->getAllocatedType() &&
+ (AI->getAlignment() == cast<AllocaInst>(I2)->getAlignment() ||
+ IgnoreAlignment);
if (const LoadInst *LI = dyn_cast<LoadInst>(I1))
return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() &&
(LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() ||
@@ -360,8 +437,7 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
return haveSameSpecialState(this, I);
}
-// isSameOperationAs
-// This should be kept in sync with isEquivalentOperation in
+// Keep this in sync with FunctionComparator::cmpOperations in
// lib/Transforms/IPO/MergeFunctions.cpp.
bool Instruction::isSameOperationAs(const Instruction *I,
unsigned flags) const {
@@ -461,9 +537,9 @@ bool Instruction::isAtomic() const {
case Instruction::Fence:
return true;
case Instruction::Load:
- return cast<LoadInst>(this)->getOrdering() != NotAtomic;
+ return cast<LoadInst>(this)->getOrdering() != AtomicOrdering::NotAtomic;
case Instruction::Store:
- return cast<StoreInst>(this)->getOrdering() != NotAtomic;
+ return cast<StoreInst>(this)->getOrdering() != AtomicOrdering::NotAtomic;
}
}
@@ -477,12 +553,6 @@ bool Instruction::mayThrow() const {
return isa<ResumeInst>(this);
}
-bool Instruction::mayReturn() const {
- if (const CallInst *CI = dyn_cast<CallInst>(this))
- return !CI->doesNotReturn();
- return true;
-}
-
/// isAssociative - Return true if the instruction is associative:
///
/// Associative operators satisfy: x op (y op z) === (x op y) op z
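
Hoisting copyIRFlags/andIRFlags from BinaryOperator up to Instruction (the later Instructions.cpp hunk deletes the old copies) lets passes combine wrap, exact, fast-math and inbounds flags without caring about the concrete opcode. A sketch of the intended use when two equivalent instructions are folded into one; the helper name is illustrative:

#include "llvm/IR/Instruction.h"

void mergeInto(llvm::Instruction *Kept, llvm::Instruction *Replaced) {
  // Intersect poison-generating flags: nuw/nsw, exact, fast-math and
  // inbounds survive only if both instructions carried them.
  Kept->andIRFlags(Replaced);
  Replaced->replaceAllUsesWith(Kept);
  Replaced->eraseFromParent();
}
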
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 7c64ca7b7275..b9c693ff19ad 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -154,6 +154,24 @@ Value *PHINode::hasConstantValue() const {
return ConstantValue;
}
+/// hasConstantOrUndefValue - Whether the specified PHI node always merges
+/// together the same value, assuming that undefs result in the same value as
+/// non-undefs.
+/// Unlike \ref hasConstantValue, this does not return a value because the
+/// unique non-undef incoming value need not dominate the PHI node.
+bool PHINode::hasConstantOrUndefValue() const {
+ Value *ConstantValue = nullptr;
+ for (unsigned i = 0, e = getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = getIncomingValue(i);
+ if (Incoming != this && !isa<UndefValue>(Incoming)) {
+ if (ConstantValue && ConstantValue != Incoming)
+ return false;
+ ConstantValue = Incoming;
+ }
+ }
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// LandingPadInst Implementation
//===----------------------------------------------------------------------===//
@@ -309,12 +327,26 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
NewCI->setCallingConv(CI->getCallingConv());
NewCI->SubclassOptionalData = CI->SubclassOptionalData;
NewCI->setAttributes(CI->getAttributes());
+ NewCI->setDebugLoc(CI->getDebugLoc());
return NewCI;
}
-void CallInst::addAttribute(unsigned i, Attribute::AttrKind attr) {
+Value *CallInst::getReturnedArgOperand() const {
+ unsigned Index;
+
+ if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ return getArgOperand(Index-1);
+ if (const Function *F = getCalledFunction())
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
+ Index)
+ return getArgOperand(Index-1);
+
+ return nullptr;
+}
+
+void CallInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
AttributeSet PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, attr);
+ PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
@@ -324,9 +356,27 @@ void CallInst::addAttribute(unsigned i, StringRef Kind, StringRef Value) {
setAttributes(PAL);
}
-void CallInst::removeAttribute(unsigned i, Attribute attr) {
+void CallInst::addAttribute(unsigned i, Attribute Attr) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.addAttribute(getContext(), i, Attr);
+ setAttributes(PAL);
+}
+
+void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+}
+
+void CallInst::removeAttribute(unsigned i, StringRef Kind) {
AttributeSet PAL = getAttributes();
- AttrBuilder B(attr);
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+}
+
+void CallInst::removeAttribute(unsigned i, Attribute Attr) {
+ AttributeSet PAL = getAttributes();
+ AttrBuilder B(Attr);
LLVMContext &Context = getContext();
PAL = PAL.removeAttributes(Context, i,
AttributeSet::get(Context, i, B));
@@ -345,19 +395,26 @@ void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
setAttributes(PAL);
}
-bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
+bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, A))
+ if (AttributeList.hasAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, A);
+ return F->getAttributes().hasAttribute(i, Kind);
return false;
}
-bool CallInst::dataOperandHasImpliedAttr(unsigned i,
- Attribute::AttrKind A) const {
+Attribute CallInst::getAttribute(unsigned i, Attribute::AttrKind Kind) const {
+ return getAttributes().getAttribute(i, Kind);
+}
+Attribute CallInst::getAttribute(unsigned i, StringRef Kind) const {
+ return getAttributes().getAttribute(i, Kind);
+}
+
+bool CallInst::dataOperandHasImpliedAttr(unsigned i,
+ Attribute::AttrKind Kind) const {
// There are getNumOperands() - 1 data operands. The last operand is the
// callee.
assert(i < getNumOperands() && "Data operand index out of bounds!");
@@ -367,11 +424,11 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i,
// containing operand bundle, if the operand is a bundle operand.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, A);
+ return paramHasAttr(i, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either a call argument or an operand bundle!");
- return bundleOperandHasAttr(i - 1, A);
+ return bundleOperandHasAttr(i - 1, Kind);
}
/// IsConstantOne - Return true only if val is constant int 1
@@ -383,16 +440,17 @@ static bool IsConstantOne(Value *val) {
static Instruction *createMalloc(Instruction *InsertBefore,
BasicBlock *InsertAtEnd, Type *IntPtrTy,
- Type *AllocTy, Value *AllocSize,
- Value *ArraySize, Function *MallocF,
- const Twine &Name) {
+ Type *AllocTy, Value *AllocSize,
+ Value *ArraySize,
+ ArrayRef<OperandBundleDef> OpB,
+ Function *MallocF, const Twine &Name) {
assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
"createMalloc needs either InsertBefore or InsertAtEnd");
// malloc(type) becomes:
// bitcast (i8* malloc(typeSize)) to type*
// malloc(type, arraySize) becomes:
- // bitcast (i8 *malloc(typeSize*arraySize)) to type*
+ // bitcast (i8* malloc(typeSize*arraySize)) to type*
if (!ArraySize)
ArraySize = ConstantInt::get(IntPtrTy, 1);
else if (ArraySize->getType() != IntPtrTy) {
@@ -425,8 +483,8 @@ static Instruction *createMalloc(Instruction *InsertBefore,
assert(AllocSize->getType() == IntPtrTy && "malloc arg is wrong size");
// Create the call to Malloc.
- BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
- Module* M = BB->getParent()->getParent();
+ BasicBlock *BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
+ Module *M = BB->getParent()->getParent();
Type *BPTy = Type::getInt8PtrTy(BB->getContext());
Value *MallocFunc = MallocF;
if (!MallocFunc)
@@ -436,13 +494,14 @@ static Instruction *createMalloc(Instruction *InsertBefore,
CallInst *MCall = nullptr;
Instruction *Result = nullptr;
if (InsertBefore) {
- MCall = CallInst::Create(MallocFunc, AllocSize, "malloccall", InsertBefore);
+ MCall = CallInst::Create(MallocFunc, AllocSize, OpB, "malloccall",
+ InsertBefore);
Result = MCall;
if (Result->getType() != AllocPtrType)
// Create a cast instruction to convert to the right type...
Result = new BitCastInst(MCall, AllocPtrType, Name, InsertBefore);
} else {
- MCall = CallInst::Create(MallocFunc, AllocSize, "malloccall");
+ MCall = CallInst::Create(MallocFunc, AllocSize, OpB, "malloccall");
Result = MCall;
if (Result->getType() != AllocPtrType) {
InsertAtEnd->getInstList().push_back(MCall);
@@ -469,11 +528,21 @@ static Instruction *createMalloc(Instruction *InsertBefore,
Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
Type *IntPtrTy, Type *AllocTy,
Value *AllocSize, Value *ArraySize,
- Function * MallocF,
+ Function *MallocF,
const Twine &Name) {
return createMalloc(InsertBefore, nullptr, IntPtrTy, AllocTy, AllocSize,
- ArraySize, MallocF, Name);
+ ArraySize, None, MallocF, Name);
}
+Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
+ Type *IntPtrTy, Type *AllocTy,
+ Value *AllocSize, Value *ArraySize,
+ ArrayRef<OperandBundleDef> OpB,
+ Function *MallocF,
+ const Twine &Name) {
+ return createMalloc(InsertBefore, nullptr, IntPtrTy, AllocTy, AllocSize,
+ ArraySize, OpB, MallocF, Name);
+}
+
/// CreateMalloc - Generate the IR for a call to malloc:
/// 1. Compute the malloc call's argument as the specified type's size,
@@ -488,33 +557,43 @@ Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
Value *AllocSize, Value *ArraySize,
Function *MallocF, const Twine &Name) {
return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
- ArraySize, MallocF, Name);
+ ArraySize, None, MallocF, Name);
+}
+Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
+ Type *IntPtrTy, Type *AllocTy,
+ Value *AllocSize, Value *ArraySize,
+ ArrayRef<OperandBundleDef> OpB,
+ Function *MallocF, const Twine &Name) {
+ return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
+ ArraySize, OpB, MallocF, Name);
}
-static Instruction* createFree(Value* Source, Instruction *InsertBefore,
+static Instruction *createFree(Value *Source,
+ ArrayRef<OperandBundleDef> Bundles,
+ Instruction *InsertBefore,
BasicBlock *InsertAtEnd) {
assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
"createFree needs either InsertBefore or InsertAtEnd");
assert(Source->getType()->isPointerTy() &&
"Can not free something of nonpointer type!");
- BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
- Module* M = BB->getParent()->getParent();
+ BasicBlock *BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
+ Module *M = BB->getParent()->getParent();
Type *VoidTy = Type::getVoidTy(M->getContext());
Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
// prototype free as "void free(void*)"
Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, nullptr);
- CallInst* Result = nullptr;
+ CallInst *Result = nullptr;
Value *PtrCast = Source;
if (InsertBefore) {
if (Source->getType() != IntPtrTy)
PtrCast = new BitCastInst(Source, IntPtrTy, "", InsertBefore);
- Result = CallInst::Create(FreeFunc, PtrCast, "", InsertBefore);
+ Result = CallInst::Create(FreeFunc, PtrCast, Bundles, "", InsertBefore);
} else {
if (Source->getType() != IntPtrTy)
PtrCast = new BitCastInst(Source, IntPtrTy, "", InsertAtEnd);
- Result = CallInst::Create(FreeFunc, PtrCast, "");
+ Result = CallInst::Create(FreeFunc, PtrCast, Bundles, "");
}
Result->setTailCall();
if (Function *F = dyn_cast<Function>(FreeFunc))
@@ -524,15 +603,27 @@ static Instruction* createFree(Value* Source, Instruction *InsertBefore,
}
/// CreateFree - Generate the IR for a call to the builtin free function.
-Instruction * CallInst::CreateFree(Value* Source, Instruction *InsertBefore) {
- return createFree(Source, InsertBefore, nullptr);
+Instruction *CallInst::CreateFree(Value *Source, Instruction *InsertBefore) {
+ return createFree(Source, None, InsertBefore, nullptr);
+}
+Instruction *CallInst::CreateFree(Value *Source,
+ ArrayRef<OperandBundleDef> Bundles,
+ Instruction *InsertBefore) {
+ return createFree(Source, Bundles, InsertBefore, nullptr);
}
/// CreateFree - Generate the IR for a call to the builtin free function.
/// Note: This function does not add the call to the basic block, that is the
/// responsibility of the caller.
-Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) {
- Instruction* FreeCall = createFree(Source, nullptr, InsertAtEnd);
+Instruction *CallInst::CreateFree(Value *Source, BasicBlock *InsertAtEnd) {
+ Instruction *FreeCall = createFree(Source, None, nullptr, InsertAtEnd);
+ assert(FreeCall && "CreateFree did not create a CallInst");
+ return FreeCall;
+}
+Instruction *CallInst::CreateFree(Value *Source,
+ ArrayRef<OperandBundleDef> Bundles,
+ BasicBlock *InsertAtEnd) {
+ Instruction *FreeCall = createFree(Source, Bundles, nullptr, InsertAtEnd);
assert(FreeCall && "CreateFree did not create a CallInst");
return FreeCall;
}
@@ -596,6 +687,7 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
NewII->setCallingConv(II->getCallingConv());
NewII->SubclassOptionalData = II->SubclassOptionalData;
NewII->setAttributes(II->getAttributes());
+ NewII->setDebugLoc(II->getDebugLoc());
return NewII;
}
@@ -609,18 +701,31 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
-bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
+Value *InvokeInst::getReturnedArgOperand() const {
+ unsigned Index;
+
+ if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ return getArgOperand(Index-1);
+ if (const Function *F = getCalledFunction())
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
+ Index)
+ return getArgOperand(Index-1);
+
+ return nullptr;
+}
+
+bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, A))
+ if (AttributeList.hasAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, A);
+ return F->getAttributes().hasAttribute(i, Kind);
return false;
}
bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
- Attribute::AttrKind A) const {
+ Attribute::AttrKind Kind) const {
// There are getNumOperands() - 3 data operands. The last three operands are
// the callee and the two successor basic blocks.
assert(i < (getNumOperands() - 2) && "Data operand index out of bounds!");
@@ -630,27 +735,54 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
// containing operand bundle, if the operand is a bundle operand.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, A);
+ return paramHasAttr(i, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either an invoke argument or an operand bundle!");
- return bundleOperandHasAttr(i - 1, A);
+ return bundleOperandHasAttr(i - 1, Kind);
+}
+
+void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.addAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
+}
+
+void InvokeInst::addAttribute(unsigned i, Attribute Attr) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.addAttribute(getContext(), i, Attr);
+ setAttributes(PAL);
+}
+
+void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
+ AttributeSet PAL = getAttributes();
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
+ setAttributes(PAL);
}
-void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) {
+void InvokeInst::removeAttribute(unsigned i, StringRef Kind) {
AttributeSet PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, attr);
+ PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
-void InvokeInst::removeAttribute(unsigned i, Attribute attr) {
+void InvokeInst::removeAttribute(unsigned i, Attribute Attr) {
AttributeSet PAL = getAttributes();
- AttrBuilder B(attr);
+ AttrBuilder B(Attr);
PAL = PAL.removeAttributes(getContext(), i,
AttributeSet::get(getContext(), i, B));
setAttributes(PAL);
}
+Attribute InvokeInst::getAttribute(unsigned i,
+ Attribute::AttrKind Kind) const {
+ return getAttributes().getAttribute(i, Kind);
+}
+
+Attribute InvokeInst::getAttribute(unsigned i, StringRef Kind) const {
+ return getAttributes().getAttribute(i, Kind);
+}
+
void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
AttributeSet PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
@@ -1207,13 +1339,13 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, Instruction *InsertBef)
- : LoadInst(Ty, Ptr, Name, isVolatile, Align, NotAtomic, CrossThread,
- InsertBef) {}
+ : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
+ CrossThread, InsertBef) {}
LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, BasicBlock *InsertAE)
- : LoadInst(Ptr, Name, isVolatile, Align, NotAtomic, CrossThread, InsertAE) {
-}
+ : LoadInst(Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
+ CrossThread, InsertAE) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, AtomicOrdering Order,
@@ -1245,7 +1377,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, Instruction *InsertBef)
Load, Ptr, InsertBef) {
setVolatile(false);
setAlignment(0);
- setAtomic(NotAtomic);
+ setAtomic(AtomicOrdering::NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -1255,7 +1387,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, BasicBlock *InsertAE)
Load, Ptr, InsertAE) {
setVolatile(false);
setAlignment(0);
- setAtomic(NotAtomic);
+ setAtomic(AtomicOrdering::NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -1266,7 +1398,7 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const char *Name, bool isVolatile,
assert(Ty == cast<PointerType>(Ptr->getType())->getElementType());
setVolatile(isVolatile);
setAlignment(0);
- setAtomic(NotAtomic);
+ setAtomic(AtomicOrdering::NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -1277,7 +1409,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
Load, Ptr, InsertAE) {
setVolatile(isVolatile);
setAlignment(0);
- setAtomic(NotAtomic);
+ setAtomic(AtomicOrdering::NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -1322,13 +1454,13 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align,
Instruction *InsertBefore)
- : StoreInst(val, addr, isVolatile, Align, NotAtomic, CrossThread,
- InsertBefore) {}
+ : StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
+ CrossThread, InsertBefore) {}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align,
BasicBlock *InsertAtEnd)
- : StoreInst(val, addr, isVolatile, Align, NotAtomic, CrossThread,
- InsertAtEnd) {}
+ : StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
+ CrossThread, InsertAtEnd) {}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
unsigned Align, AtomicOrdering Order,
@@ -1396,13 +1528,15 @@ void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
assert(getOperand(2)->getType() ==
cast<PointerType>(getOperand(0)->getType())->getElementType()
&& "Ptr must be a pointer to NewVal type!");
- assert(SuccessOrdering != NotAtomic &&
+ assert(SuccessOrdering != AtomicOrdering::NotAtomic &&
"AtomicCmpXchg instructions must be atomic!");
- assert(FailureOrdering != NotAtomic &&
+ assert(FailureOrdering != AtomicOrdering::NotAtomic &&
"AtomicCmpXchg instructions must be atomic!");
- assert(SuccessOrdering >= FailureOrdering &&
- "AtomicCmpXchg success ordering must be at least as strong as fail");
- assert(FailureOrdering != Release && FailureOrdering != AcquireRelease &&
+ assert(!isStrongerThan(FailureOrdering, SuccessOrdering) &&
+ "AtomicCmpXchg failure argument shall be no stronger than the success "
+ "argument");
+ assert(FailureOrdering != AtomicOrdering::Release &&
+ FailureOrdering != AtomicOrdering::AcquireRelease &&
"AtomicCmpXchg failure ordering cannot include release semantics");
}
@@ -1452,7 +1586,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
assert(getOperand(1)->getType() ==
cast<PointerType>(getOperand(0)->getType())->getElementType()
&& "Ptr must be a pointer to Val type!");
- assert(Ordering != NotAtomic &&
+ assert(Ordering != AtomicOrdering::NotAtomic &&
"AtomicRMW instructions must be atomic!");
}
@@ -2099,7 +2233,7 @@ static inline bool isConstantAllOnes(const Value *V) {
bool BinaryOperator::isNeg(const Value *V) {
if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
if (Bop->getOpcode() == Instruction::Sub)
- if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0)))
+ if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0)))
return C->isNegativeZeroValue();
return false;
}
@@ -2107,7 +2241,7 @@ bool BinaryOperator::isNeg(const Value *V) {
bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
if (Bop->getOpcode() == Instruction::FSub)
- if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0))) {
+ if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0))) {
if (!IgnoreZeroSign)
IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
@@ -2167,62 +2301,6 @@ bool BinaryOperator::swapOperands() {
return false;
}
-void BinaryOperator::setHasNoUnsignedWrap(bool b) {
- cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(b);
-}
-
-void BinaryOperator::setHasNoSignedWrap(bool b) {
- cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(b);
-}
-
-void BinaryOperator::setIsExact(bool b) {
- cast<PossiblyExactOperator>(this)->setIsExact(b);
-}
-
-bool BinaryOperator::hasNoUnsignedWrap() const {
- return cast<OverflowingBinaryOperator>(this)->hasNoUnsignedWrap();
-}
-
-bool BinaryOperator::hasNoSignedWrap() const {
- return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
-}
-
-bool BinaryOperator::isExact() const {
- return cast<PossiblyExactOperator>(this)->isExact();
-}
-
-void BinaryOperator::copyIRFlags(const Value *V) {
- // Copy the wrapping flags.
- if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
- setHasNoSignedWrap(OB->hasNoSignedWrap());
- setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
- }
-
- // Copy the exact flag.
- if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
- setIsExact(PE->isExact());
-
- // Copy the fast-math flags.
- if (auto *FP = dyn_cast<FPMathOperator>(V))
- copyFastMathFlags(FP->getFastMathFlags());
-}
-
-void BinaryOperator::andIRFlags(const Value *V) {
- if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
- setHasNoSignedWrap(hasNoSignedWrap() & OB->hasNoSignedWrap());
- setHasNoUnsignedWrap(hasNoUnsignedWrap() & OB->hasNoUnsignedWrap());
- }
-
- if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
- setIsExact(isExact() & PE->isExact());
-
- if (auto *FP = dyn_cast<FPMathOperator>(V)) {
- FastMathFlags FM = getFastMathFlags();
- FM &= FP->getFastMathFlags();
- copyFastMathFlags(FM);
- }
-}
-
//===----------------------------------------------------------------------===//
// FPMathOperator Class
@@ -2267,8 +2345,8 @@ bool CastInst::isLosslessCast() const {
return false;
// Identity cast is always lossless
- Type* SrcTy = getOperand(0)->getType();
- Type* DstTy = getType();
+ Type *SrcTy = getOperand(0)->getType();
+ Type *DstTy = getType();
if (SrcTy == DstTy)
return true;
@@ -3575,6 +3653,34 @@ bool CmpInst::isFalseWhenEqual(Predicate predicate) {
}
}
+bool CmpInst::isImpliedTrueByMatchingCmp(Predicate Pred1, Predicate Pred2) {
+ // If the predicates match, then we know the first condition implies the
+ // second is true.
+ if (Pred1 == Pred2)
+ return true;
+
+ switch (Pred1) {
+ default:
+ break;
+ case ICMP_EQ:
+ // A == B implies A >=u B, A <=u B, A >=s B, and A <=s B are true.
+ return Pred2 == ICMP_UGE || Pred2 == ICMP_ULE || Pred2 == ICMP_SGE ||
+ Pred2 == ICMP_SLE;
+ case ICMP_UGT: // A >u B implies A != B and A >=u B are true.
+ return Pred2 == ICMP_NE || Pred2 == ICMP_UGE;
+ case ICMP_ULT: // A <u B implies A != B and A <=u B are true.
+ return Pred2 == ICMP_NE || Pred2 == ICMP_ULE;
+ case ICMP_SGT: // A >s B implies A != B and A >=s B are true.
+ return Pred2 == ICMP_NE || Pred2 == ICMP_SGE;
+ case ICMP_SLT: // A <s B implies A != B and A <=s B are true.
+ return Pred2 == ICMP_NE || Pred2 == ICMP_SLE;
+ }
+ return false;
+}
+
+bool CmpInst::isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) {
+ return isImpliedTrueByMatchingCmp(Pred1, getInversePredicate(Pred2));
+}
//===----------------------------------------------------------------------===//
// SwitchInst Implementation
@@ -3809,6 +3915,7 @@ AllocaInst *AllocaInst::cloneImpl() const {
AllocaInst *Result = new AllocaInst(getAllocatedType(),
(Value *)getOperand(0), getAlignment());
Result->setUsedWithInAlloca(isUsedWithInAlloca());
+ Result->setSwiftError(isSwiftError());
return Result;
}
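
The new CmpInst helpers above encode which integer predicates follow from one another when both compares share the same operands. A few concrete cases, written as asserts over assumed operands A and B:

#include "llvm/IR/Instructions.h"
#include <cassert>

void predicateImplicationExamples() {
  using llvm::CmpInst;
  // A >u B implies A != B is true ...
  assert(CmpInst::isImpliedTrueByMatchingCmp(CmpInst::ICMP_UGT,
                                             CmpInst::ICMP_NE));
  // ... and implies A == B is false.
  assert(CmpInst::isImpliedFalseByMatchingCmp(CmpInst::ICMP_UGT,
                                              CmpInst::ICMP_EQ));
  // Equality implies every non-strict comparison.
  assert(CmpInst::isImpliedTrueByMatchingCmp(CmpInst::ICMP_EQ,
                                             CmpInst::ICMP_SLE));
}
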
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index b9b5a29091df..3f747117b728 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -25,31 +25,18 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
/// DbgInfoIntrinsic - This is the common base class for debug info intrinsics
///
-static Value *CastOperand(Value *C) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- if (CE->isCast())
- return CE->getOperand(0);
- return nullptr;
-}
-
-Value *DbgInfoIntrinsic::StripCast(Value *C) {
- if (Value *CO = CastOperand(C)) {
- C = StripCast(CO);
- } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (GV->hasInitializer())
- if (Value *CO = CastOperand(GV->getInitializer()))
- C = StripCast(CO);
- }
- return dyn_cast<GlobalVariable>(C);
-}
+Value *DbgInfoIntrinsic::getVariableLocation(bool AllowNullOp) const {
+ Value *Op = getArgOperand(0);
+ if (AllowNullOp && !Op)
+ return nullptr;
-static Value *getValueImpl(Value *Op) {
auto *MD = cast<MetadataAsValue>(Op)->getMetadata();
if (auto *V = dyn_cast<ValueAsMetadata>(MD))
return V->getValue();
@@ -59,23 +46,40 @@ static Value *getValueImpl(Value *Op) {
return nullptr;
}
-//===----------------------------------------------------------------------===//
-/// DbgDeclareInst - This represents the llvm.dbg.declare instruction.
-///
-
-Value *DbgDeclareInst::getAddress() const {
- if (!getArgOperand(0))
- return nullptr;
-
- return getValueImpl(getArgOperand(0));
-}
+int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
+ StringRef Name) {
+ assert(Name.startswith("llvm."));
-//===----------------------------------------------------------------------===//
-/// DbgValueInst - This represents the llvm.dbg.value instruction.
-///
+ // Do successive binary searches of the dotted name components. For
+ // "llvm.gc.experimental.statepoint.p1i8.p1i32", we will find the range of
+ // intrinsics starting with "llvm.gc", then "llvm.gc.experimental", then
+ // "llvm.gc.experimental.statepoint", and then we will stop as the range is
+ // size 1. During the search, we can skip the prefix that we already know is
+ // identical. By using strncmp we consider names with differing suffixes to
+ // be part of the equal range.
+ size_t CmpStart = 0;
+ size_t CmpEnd = 4; // Skip the "llvm" component.
+ const char *const *Low = NameTable.begin();
+ const char *const *High = NameTable.end();
+ const char *const *LastLow = Low;
+ while (CmpEnd < Name.size() && High - Low > 0) {
+ CmpStart = CmpEnd;
+ CmpEnd = Name.find('.', CmpStart + 1);
+ CmpEnd = CmpEnd == StringRef::npos ? Name.size() : CmpEnd;
+ auto Cmp = [CmpStart, CmpEnd](const char *LHS, const char *RHS) {
+ return strncmp(LHS + CmpStart, RHS + CmpStart, CmpEnd - CmpStart) < 0;
+ };
+ LastLow = Low;
+ std::tie(Low, High) = std::equal_range(Low, High, Name.data(), Cmp);
+ }
+ if (High - Low > 0)
+ LastLow = Low;
-const Value *DbgValueInst::getValue() const {
- return const_cast<DbgValueInst *>(this)->getValue();
+ if (LastLow == NameTable.end())
+ return -1;
+ StringRef NameFound = *LastLow;
+ if (Name == NameFound ||
+ (Name.startswith(NameFound) && Name[NameFound.size()] == '.'))
+ return LastLow - NameTable.begin();
+ return -1;
}
-
-Value *DbgValueInst::getValue() { return getValueImpl(getArgOperand(0)); }
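
lookupLLVMIntrinsicByName above binary-searches a sorted name table one dotted component at a time, then accepts the result only if the query is exact or continues past the table entry at a '.' boundary, as mangled overload suffixes do. A usage sketch with a toy, already-sorted table; it assumes the declaration is visible via llvm/IR/Intrinsics.h.

#include "llvm/IR/Intrinsics.h"

int demoLookup() {
  static const char *const Names[] = {
      "llvm.dbg.declare", "llvm.dbg.value", "llvm.memcpy", "llvm.memset"};
  // "llvm.memcpy" matches the mangled query on a '.' boundary, so the call
  // returns its index (2); an unknown name would return -1.
  return llvm::Intrinsic::lookupLLVMIntrinsicByName(
      Names, "llvm.memcpy.p0i8.p0i8.i64");
}
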
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index 48b53b0f532a..d27fcfb1b7a8 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -13,23 +13,24 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "LLVMContextImpl.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/SourceMgr.h"
-#include <cctype>
-using namespace llvm;
-
-static ManagedStatic<LLVMContext> GlobalContext;
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <utility>
-LLVMContext& llvm::getGlobalContext() {
- return *GlobalContext;
-}
+using namespace llvm;
LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
// Create the fixed metadata kinds. This is done in the same order as the
@@ -128,6 +129,15 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
assert(AlignID == MD_align && "align kind id drifted");
(void)AlignID;
+ // Create the 'llvm.loop' metadata kind.
+ unsigned LoopID = getMDKindID("llvm.loop");
+ assert(LoopID == MD_loop && "llvm.loop kind id drifted");
+ (void)LoopID;
+
+ unsigned TypeID = getMDKindID("type");
+ assert(TypeID == MD_type && "type kind id drifted");
+ (void)TypeID;
+
auto *DeoptEntry = pImpl->getOrInsertBundleTag("deopt");
assert(DeoptEntry->second == LLVMContext::OB_deopt &&
"deopt operand bundle id drifted!");
@@ -137,7 +147,13 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
assert(FuncletEntry->second == LLVMContext::OB_funclet &&
"funclet operand bundle id drifted!");
(void)FuncletEntry;
+
+ auto *GCTransitionEntry = pImpl->getOrInsertBundleTag("gc-transition");
+ assert(GCTransitionEntry->second == LLVMContext::OB_gc_transition &&
+ "gc-transition operand bundle id drifted!");
+ (void)GCTransitionEntry;
}
+
LLVMContext::~LLVMContext() { delete pImpl; }
void LLVMContext::addModule(Module *M) {
@@ -180,6 +196,13 @@ void LLVMContext::setDiagnosticHandler(DiagnosticHandlerTy DiagnosticHandler,
pImpl->RespectDiagnosticFilters = RespectFilters;
}
+void LLVMContext::setDiagnosticHotnessRequested(bool Requested) {
+ pImpl->DiagnosticHotnessRequested = Requested;
+}
+bool LLVMContext::getDiagnosticHotnessRequested() const {
+ return pImpl->DiagnosticHotnessRequested;
+}
+
LLVMContext::DiagnosticHandlerTy LLVMContext::getDiagnosticHandler() const {
return pImpl->DiagnosticHandler;
}
@@ -213,31 +236,14 @@ static bool isDiagnosticEnabled(const DiagnosticInfo &DI) {
// pattern, passed via one of the -pass-remarks* flags, matches the name of
// the pass that is emitting the diagnostic. If there is no match, ignore the
// diagnostic and return.
- switch (DI.getKind()) {
- case llvm::DK_OptimizationRemark:
- if (!cast<DiagnosticInfoOptimizationRemark>(DI).isEnabled())
- return false;
- break;
- case llvm::DK_OptimizationRemarkMissed:
- if (!cast<DiagnosticInfoOptimizationRemarkMissed>(DI).isEnabled())
- return false;
- break;
- case llvm::DK_OptimizationRemarkAnalysis:
- if (!cast<DiagnosticInfoOptimizationRemarkAnalysis>(DI).isEnabled())
- return false;
- break;
- case llvm::DK_OptimizationRemarkAnalysisFPCommute:
- if (!cast<DiagnosticInfoOptimizationRemarkAnalysisFPCommute>(DI)
- .isEnabled())
- return false;
- break;
- default:
- break;
- }
+ if (auto *Remark = dyn_cast<DiagnosticInfoOptimizationBase>(&DI))
+ return Remark->isEnabled();
+
return true;
}
-static const char *getDiagnosticMessagePrefix(DiagnosticSeverity Severity) {
+const char *
+LLVMContext::getDiagnosticMessagePrefix(DiagnosticSeverity Severity) {
switch (Severity) {
case DS_Error:
return "error";
@@ -314,9 +320,34 @@ void LLVMContext::setGC(const Function &Fn, std::string GCName) {
}
It->second = std::move(GCName);
}
+
const std::string &LLVMContext::getGC(const Function &Fn) {
return pImpl->GCNames[&Fn];
}
+
void LLVMContext::deleteGC(const Function &Fn) {
pImpl->GCNames.erase(&Fn);
}
+
+bool LLVMContext::shouldDiscardValueNames() const {
+ return pImpl->DiscardValueNames;
+}
+
+bool LLVMContext::isODRUniquingDebugTypes() const { return !!pImpl->DITypeMap; }
+
+void LLVMContext::enableDebugTypeODRUniquing() {
+ if (pImpl->DITypeMap)
+ return;
+
+ pImpl->DITypeMap.emplace();
+}
+
+void LLVMContext::disableDebugTypeODRUniquing() { pImpl->DITypeMap.reset(); }
+
+void LLVMContext::setDiscardValueNames(bool Discard) {
+ pImpl->DiscardValueNames = Discard;
+}
+
+OptBisect &LLVMContext::getOptBisect() {
+ return pImpl->getOptBisect();
+}
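
The LLVMContext hunks above replace global state with per-context switches: ODR uniquing of debug types, value-name discarding, and diagnostic hotness. A sketch of how a front end might wire them up; the flag names are illustrative.

#include "llvm/IR/LLVMContext.h"

void configureContext(llvm::LLVMContext &Ctx, bool IsLTO, bool IsRelease) {
  if (IsLTO)
    Ctx.enableDebugTypeODRUniquing();      // merge identical C++ debug types
  Ctx.setDiscardValueNames(IsRelease);     // drop IR value names to save memory
  Ctx.setDiagnosticHotnessRequested(true); // ask for hotness on opt remarks
}
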
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index 5239b4f7d84a..b0b2c61bdf1f 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -16,6 +16,8 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/OptBisect.h"
+#include "llvm/Support/ManagedStatic.h"
#include <algorithm>
using namespace llvm;
@@ -43,31 +45,12 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
DiagnosticHandler = nullptr;
DiagnosticContext = nullptr;
RespectDiagnosticFilters = false;
+ DiagnosticHotnessRequested = false;
YieldCallback = nullptr;
YieldOpaqueHandle = nullptr;
NamedStructTypesUniqueID = 0;
}
-namespace {
-struct DropReferences {
- // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second'
- // is a Constant*.
- template <typename PairT> void operator()(const PairT &P) {
- P.second->dropAllReferences();
- }
-};
-
-// Temporary - drops pair.first instead of second.
-struct DropFirst {
- // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second'
- // is a Constant*.
- template<typename PairT>
- void operator()(const PairT &P) {
- P.first->dropAllReferences();
- }
-};
-}
-
LLVMContextImpl::~LLVMContextImpl() {
// NOTE: We need to delete the contents of OwnedModules, but Module's dtor
// will call LLVMContextImpl::removeModule, thus invalidating iterators into
@@ -99,14 +82,14 @@ LLVMContextImpl::~LLVMContextImpl() {
#include "llvm/IR/Metadata.def"
// Free the constants.
- std::for_each(ExprConstants.map_begin(), ExprConstants.map_end(),
- DropFirst());
- std::for_each(ArrayConstants.map_begin(), ArrayConstants.map_end(),
- DropFirst());
- std::for_each(StructConstants.map_begin(), StructConstants.map_end(),
- DropFirst());
- std::for_each(VectorConstants.map_begin(), VectorConstants.map_end(),
- DropFirst());
+ for (auto *I : ExprConstants)
+ I->dropAllReferences();
+ for (auto *I : ArrayConstants)
+ I->dropAllReferences();
+ for (auto *I : StructConstants)
+ I->dropAllReferences();
+ for (auto *I : VectorConstants)
+ I->dropAllReferences();
ExprConstants.freeConstants();
ArrayConstants.freeConstants();
StructConstants.freeConstants();
@@ -117,10 +100,9 @@ LLVMContextImpl::~LLVMContextImpl() {
InlineAsms.freeConstants();
DeleteContainerSeconds(IntConstants);
DeleteContainerSeconds(FPConstants);
-
- for (StringMap<ConstantDataSequential*>::iterator I = CDSConstants.begin(),
- E = CDSConstants.end(); I != E; ++I)
- delete I->second;
+
+ for (auto &CDSConstant : CDSConstants)
+ delete CDSConstant.second;
CDSConstants.clear();
// Destroy attributes.
@@ -158,9 +140,6 @@ LLVMContextImpl::~LLVMContextImpl() {
// Destroy ValuesAsMetadata.
for (auto &Pair : ValuesAsMetadata)
delete Pair.second;
-
- // Destroy MDStrings.
- MDStringCache.clear();
}
void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
@@ -168,10 +147,8 @@ void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
do {
Changed = false;
- for (auto I = ArrayConstants.map_begin(), E = ArrayConstants.map_end();
- I != E; ) {
- auto *C = I->first;
- I++;
+ for (auto I = ArrayConstants.begin(), E = ArrayConstants.end(); I != E;) {
+ auto *C = *I++;
if (C->use_empty()) {
Changed = true;
C->destroyConstant();
@@ -257,3 +234,19 @@ void GetElementPtrConstantExpr::anchor() { }
void CompareConstantExpr::anchor() { }
+/// Singleton instance of the OptBisect class.
+///
+/// This singleton is accessed via the LLVMContext::getOptBisect() function. It
+/// provides a mechanism to disable passes and individual optimizations at
+/// compile time based on a command line option (-opt-bisect-limit) in order to
+/// perform a bisecting search for optimization-related problems.
+///
+/// Even if multiple LLVMContext objects are created, they will all return the
+/// same instance of OptBisect in order to provide a single bisect count. Any
+/// code that uses the OptBisect object should be serialized when bisection is
+/// enabled in order to ensure a consistent bisect count.
+static ManagedStatic<OptBisect> OptBisector;
+
+OptBisect &LLVMContextImpl::getOptBisect() {
+ return *OptBisector;
+}
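
For orientation only (none of this is part of the patch): the limit comes from the -opt-bisect-limit option mentioned above, and a pass reaches the shared bisector through its context. The gating call below is an assumption made for illustration; only LLVMContext::getOptBisect() is defined here, and the real query interface lives in OptBisect.h.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OptBisect.h"
#include "llvm/Pass.h"

static bool gatePass(llvm::Pass *P, llvm::Module &M) {
  llvm::OptBisect &Bisector = M.getContext().getOptBisect();
  return Bisector.shouldRunPass(P, M); // assumed signature, see OptBisect.h
}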
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index d42047d4e775..7820e2ab958d 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -32,6 +32,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Dwarf.h"
#include <vector>
namespace llvm {
@@ -211,6 +212,17 @@ public:
template <class NodeTy> struct MDNodeKeyImpl;
template <class NodeTy> struct MDNodeInfo;
+/// Configuration point for MDNodeInfo::isEqual().
+template <class NodeTy> struct MDNodeSubsetEqualImpl {
+ typedef MDNodeKeyImpl<NodeTy> KeyTy;
+ static bool isSubsetEqual(const KeyTy &LHS, const NodeTy *RHS) {
+ return false;
+ }
+ static bool isSubsetEqual(const NodeTy *LHS, const NodeTy *RHS) {
+ return false;
+ }
+};
+
/// \brief DenseMapInfo for MDTuple.
///
/// Note that we don't need the is-function-local bit, since that's implicit in
@@ -255,14 +267,14 @@ template <> struct MDNodeKeyImpl<DILocation> {
/// \brief DenseMapInfo for GenericDINode.
template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
unsigned Tag;
- StringRef Header;
- MDNodeKeyImpl(unsigned Tag, StringRef Header, ArrayRef<Metadata *> DwarfOps)
+ MDString *Header;
+ MDNodeKeyImpl(unsigned Tag, MDString *Header, ArrayRef<Metadata *> DwarfOps)
: MDNodeOpsKey(DwarfOps), Tag(Tag), Header(Header) {}
MDNodeKeyImpl(const GenericDINode *N)
- : MDNodeOpsKey(N, 1), Tag(N->getTag()), Header(N->getHeader()) {}
+ : MDNodeOpsKey(N, 1), Tag(N->getTag()), Header(N->getRawHeader()) {}
bool isKeyOf(const GenericDINode *RHS) const {
- return Tag == RHS->getTag() && Header == RHS->getHeader() &&
+ return Tag == RHS->getTag() && Header == RHS->getRawHeader() &&
compareOps(RHS, 1);
}
@@ -290,35 +302,35 @@ template <> struct MDNodeKeyImpl<DISubrange> {
template <> struct MDNodeKeyImpl<DIEnumerator> {
int64_t Value;
- StringRef Name;
+ MDString *Name;
- MDNodeKeyImpl(int64_t Value, StringRef Name) : Value(Value), Name(Name) {}
+ MDNodeKeyImpl(int64_t Value, MDString *Name) : Value(Value), Name(Name) {}
MDNodeKeyImpl(const DIEnumerator *N)
- : Value(N->getValue()), Name(N->getName()) {}
+ : Value(N->getValue()), Name(N->getRawName()) {}
bool isKeyOf(const DIEnumerator *RHS) const {
- return Value == RHS->getValue() && Name == RHS->getName();
+ return Value == RHS->getValue() && Name == RHS->getRawName();
}
unsigned getHashValue() const { return hash_combine(Value, Name); }
};
template <> struct MDNodeKeyImpl<DIBasicType> {
unsigned Tag;
- StringRef Name;
+ MDString *Name;
uint64_t SizeInBits;
uint64_t AlignInBits;
unsigned Encoding;
- MDNodeKeyImpl(unsigned Tag, StringRef Name, uint64_t SizeInBits,
+ MDNodeKeyImpl(unsigned Tag, MDString *Name, uint64_t SizeInBits,
uint64_t AlignInBits, unsigned Encoding)
: Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
Encoding(Encoding) {}
MDNodeKeyImpl(const DIBasicType *N)
- : Tag(N->getTag()), Name(N->getName()), SizeInBits(N->getSizeInBits()),
+ : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getSizeInBits()),
AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()) {}
bool isKeyOf(const DIBasicType *RHS) const {
- return Tag == RHS->getTag() && Name == RHS->getName() &&
+ return Tag == RHS->getTag() && Name == RHS->getRawName() &&
SizeInBits == RHS->getSizeInBits() &&
AlignInBits == RHS->getAlignInBits() &&
Encoding == RHS->getEncoding();
@@ -330,7 +342,7 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
template <> struct MDNodeKeyImpl<DIDerivedType> {
unsigned Tag;
- StringRef Name;
+ MDString *Name;
Metadata *File;
unsigned Line;
Metadata *Scope;
@@ -341,7 +353,7 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
unsigned Flags;
Metadata *ExtraData;
- MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *File, unsigned Line,
+ MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
Metadata *ExtraData)
@@ -349,14 +361,14 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
BaseType(BaseType), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
OffsetInBits(OffsetInBits), Flags(Flags), ExtraData(ExtraData) {}
MDNodeKeyImpl(const DIDerivedType *N)
- : Tag(N->getTag()), Name(N->getName()), File(N->getRawFile()),
+ : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
AlignInBits(N->getAlignInBits()), OffsetInBits(N->getOffsetInBits()),
Flags(N->getFlags()), ExtraData(N->getRawExtraData()) {}
bool isKeyOf(const DIDerivedType *RHS) const {
- return Tag == RHS->getTag() && Name == RHS->getName() &&
+ return Tag == RHS->getTag() && Name == RHS->getRawName() &&
File == RHS->getRawFile() && Line == RHS->getLine() &&
Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() &&
SizeInBits == RHS->getSizeInBits() &&
@@ -365,14 +377,53 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
ExtraData == RHS->getRawExtraData();
}
unsigned getHashValue() const {
- return hash_combine(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, Flags, ExtraData);
+ // If this is a member inside an ODR type, only hash the type and the name.
+ // Otherwise the hash will be stronger than
+ // MDNodeSubsetEqualImpl::isODRMember().
+ if (Tag == dwarf::DW_TAG_member && Name)
+ if (auto *CT = dyn_cast_or_null<DICompositeType>(Scope))
+ if (CT->getRawIdentifier())
+ return hash_combine(Name, Scope);
+
+ // Intentionally computes the hash on a subset of the operands for
+ // performance reasons. The subset has to be significant enough to avoid
+ // collision "most of the time". There is no correctness issue in case of
+ // collision because of the full check above.
+ return hash_combine(Tag, Name, File, Line, Scope, BaseType, Flags);
+ }
+};
+
+template <> struct MDNodeSubsetEqualImpl<DIDerivedType> {
+ typedef MDNodeKeyImpl<DIDerivedType> KeyTy;
+ static bool isSubsetEqual(const KeyTy &LHS, const DIDerivedType *RHS) {
+ return isODRMember(LHS.Tag, LHS.Scope, LHS.Name, RHS);
+ }
+ static bool isSubsetEqual(const DIDerivedType *LHS, const DIDerivedType *RHS) {
+ return isODRMember(LHS->getTag(), LHS->getRawScope(), LHS->getRawName(),
+ RHS);
+ }
+
+ /// Members compare equal if they belong to the same ODR-uniqued composite
+ /// type and have the same name.
+ static bool isODRMember(unsigned Tag, const Metadata *Scope,
+ const MDString *Name, const DIDerivedType *RHS) {
+ // Check whether the LHS is eligible.
+ if (Tag != dwarf::DW_TAG_member || !Name)
+ return false;
+
+ auto *CT = dyn_cast_or_null<DICompositeType>(Scope);
+ if (!CT || !CT->getRawIdentifier())
+ return false;
+
+ // Compare to the RHS.
+ return Tag == RHS->getTag() && Name == RHS->getRawName() &&
+ Scope == RHS->getRawScope();
}
};
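
The hashing comments above lean on the usual hash-table invariant: whenever two keys compare equal, including through the subset rule, they must hash identically, so the hash may only mix in fields that the equality also inspects. A self-contained toy illustration of that constraint, deliberately independent of LLVM's types:

#include <cassert>
#include <cstddef>
#include <functional>
#include <string>

// Toy key: a "member" is identified by (scope, name); other fields may differ
// between two descriptions of the same member.
struct MemberKey {
  std::string Scope;
  std::string Name;
  unsigned Line; // must NOT feed the hash, because equality ignores it
};

static bool subsetEqual(const MemberKey &A, const MemberKey &B) {
  return A.Scope == B.Scope && A.Name == B.Name;
}

static std::size_t hashKey(const MemberKey &K) {
  // Hash only what subsetEqual compares; adding Line would make the hash
  // "stronger" than the equality and break hash-table lookups.
  return std::hash<std::string>{}(K.Scope) ^
         (std::hash<std::string>{}(K.Name) << 1);
}

int main() {
  MemberKey Decl{"S", "foo", 0}, Def{"S", "foo", 42};
  assert(subsetEqual(Decl, Def) && hashKey(Decl) == hashKey(Def));
  return 0;
}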
template <> struct MDNodeKeyImpl<DICompositeType> {
unsigned Tag;
- StringRef Name;
+ MDString *Name;
Metadata *File;
unsigned Line;
Metadata *Scope;
@@ -385,31 +436,31 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
unsigned RuntimeLang;
Metadata *VTableHolder;
Metadata *TemplateParams;
- StringRef Identifier;
+ MDString *Identifier;
- MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *File, unsigned Line,
+ MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint64_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams,
- StringRef Identifier)
+ MDString *Identifier)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), AlignInBits(AlignInBits),
OffsetInBits(OffsetInBits), Flags(Flags), Elements(Elements),
RuntimeLang(RuntimeLang), VTableHolder(VTableHolder),
TemplateParams(TemplateParams), Identifier(Identifier) {}
MDNodeKeyImpl(const DICompositeType *N)
- : Tag(N->getTag()), Name(N->getName()), File(N->getRawFile()),
+ : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
AlignInBits(N->getAlignInBits()), OffsetInBits(N->getOffsetInBits()),
Flags(N->getFlags()), Elements(N->getRawElements()),
RuntimeLang(N->getRuntimeLang()), VTableHolder(N->getRawVTableHolder()),
TemplateParams(N->getRawTemplateParams()),
- Identifier(N->getIdentifier()) {}
+ Identifier(N->getRawIdentifier()) {}
bool isKeyOf(const DICompositeType *RHS) const {
- return Tag == RHS->getTag() && Name == RHS->getName() &&
+ return Tag == RHS->getTag() && Name == RHS->getRawName() &&
File == RHS->getRawFile() && Line == RHS->getLine() &&
Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() &&
SizeInBits == RHS->getSizeInBits() &&
@@ -419,49 +470,55 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
RuntimeLang == RHS->getRuntimeLang() &&
VTableHolder == RHS->getRawVTableHolder() &&
TemplateParams == RHS->getRawTemplateParams() &&
- Identifier == RHS->getIdentifier();
+ Identifier == RHS->getRawIdentifier();
}
unsigned getHashValue() const {
- return hash_combine(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Identifier);
+ // Intentionally computes the hash on a subset of the operands for
+ // performance reasons. The subset has to be significant enough to avoid
+ // collision "most of the time". There is no correctness issue in case of
+ // collision because of the full check above.
+ return hash_combine(Name, File, Line, BaseType, Scope, Elements,
+ TemplateParams);
}
};
template <> struct MDNodeKeyImpl<DISubroutineType> {
unsigned Flags;
+ uint8_t CC;
Metadata *TypeArray;
- MDNodeKeyImpl(int64_t Flags, Metadata *TypeArray)
- : Flags(Flags), TypeArray(TypeArray) {}
+ MDNodeKeyImpl(unsigned Flags, uint8_t CC, Metadata *TypeArray)
+ : Flags(Flags), CC(CC), TypeArray(TypeArray) {}
MDNodeKeyImpl(const DISubroutineType *N)
- : Flags(N->getFlags()), TypeArray(N->getRawTypeArray()) {}
+ : Flags(N->getFlags()), CC(N->getCC()), TypeArray(N->getRawTypeArray()) {}
bool isKeyOf(const DISubroutineType *RHS) const {
- return Flags == RHS->getFlags() && TypeArray == RHS->getRawTypeArray();
+ return Flags == RHS->getFlags() && CC == RHS->getCC() &&
+ TypeArray == RHS->getRawTypeArray();
}
- unsigned getHashValue() const { return hash_combine(Flags, TypeArray); }
+ unsigned getHashValue() const { return hash_combine(Flags, CC, TypeArray); }
};
template <> struct MDNodeKeyImpl<DIFile> {
- StringRef Filename;
- StringRef Directory;
+ MDString *Filename;
+ MDString *Directory;
- MDNodeKeyImpl(StringRef Filename, StringRef Directory)
+ MDNodeKeyImpl(MDString *Filename, MDString *Directory)
: Filename(Filename), Directory(Directory) {}
MDNodeKeyImpl(const DIFile *N)
- : Filename(N->getFilename()), Directory(N->getDirectory()) {}
+ : Filename(N->getRawFilename()), Directory(N->getRawDirectory()) {}
bool isKeyOf(const DIFile *RHS) const {
- return Filename == RHS->getFilename() && Directory == RHS->getDirectory();
+ return Filename == RHS->getRawFilename() &&
+ Directory == RHS->getRawDirectory();
}
unsigned getHashValue() const { return hash_combine(Filename, Directory); }
};
template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *Scope;
- StringRef Name;
- StringRef LinkageName;
+ MDString *Name;
+ MDString *LinkageName;
Metadata *File;
unsigned Line;
Metadata *Type;
@@ -471,57 +528,102 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *ContainingType;
unsigned Virtuality;
unsigned VirtualIndex;
+ int ThisAdjustment;
unsigned Flags;
bool IsOptimized;
+ Metadata *Unit;
Metadata *TemplateParams;
Metadata *Declaration;
Metadata *Variables;
- MDNodeKeyImpl(Metadata *Scope, StringRef Name, StringRef LinkageName,
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
Metadata *File, unsigned Line, Metadata *Type,
bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine,
Metadata *ContainingType, unsigned Virtuality,
- unsigned VirtualIndex, unsigned Flags, bool IsOptimized,
- Metadata *TemplateParams, Metadata *Declaration,
- Metadata *Variables)
+ unsigned VirtualIndex, int ThisAdjustment, unsigned Flags,
+ bool IsOptimized, Metadata *Unit, Metadata *TemplateParams,
+ Metadata *Declaration, Metadata *Variables)
: Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
IsDefinition(IsDefinition), ScopeLine(ScopeLine),
ContainingType(ContainingType), Virtuality(Virtuality),
- VirtualIndex(VirtualIndex), Flags(Flags), IsOptimized(IsOptimized),
+ VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment),
+ Flags(Flags), IsOptimized(IsOptimized), Unit(Unit),
TemplateParams(TemplateParams), Declaration(Declaration),
Variables(Variables) {}
MDNodeKeyImpl(const DISubprogram *N)
- : Scope(N->getRawScope()), Name(N->getName()),
- LinkageName(N->getLinkageName()), File(N->getRawFile()),
+ : Scope(N->getRawScope()), Name(N->getRawName()),
+ LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
Line(N->getLine()), Type(N->getRawType()),
IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
ScopeLine(N->getScopeLine()), ContainingType(N->getRawContainingType()),
Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()),
- Flags(N->getFlags()), IsOptimized(N->isOptimized()),
+ ThisAdjustment(N->getThisAdjustment()), Flags(N->getFlags()),
+ IsOptimized(N->isOptimized()), Unit(N->getRawUnit()),
TemplateParams(N->getRawTemplateParams()),
Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()) {}
bool isKeyOf(const DISubprogram *RHS) const {
- return Scope == RHS->getRawScope() && Name == RHS->getName() &&
- LinkageName == RHS->getLinkageName() && File == RHS->getRawFile() &&
- Line == RHS->getLine() && Type == RHS->getRawType() &&
- IsLocalToUnit == RHS->isLocalToUnit() &&
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
+ LinkageName == RHS->getRawLinkageName() &&
+ File == RHS->getRawFile() && Line == RHS->getLine() &&
+ Type == RHS->getRawType() && IsLocalToUnit == RHS->isLocalToUnit() &&
IsDefinition == RHS->isDefinition() &&
ScopeLine == RHS->getScopeLine() &&
ContainingType == RHS->getRawContainingType() &&
Virtuality == RHS->getVirtuality() &&
- VirtualIndex == RHS->getVirtualIndex() && Flags == RHS->getFlags() &&
- IsOptimized == RHS->isOptimized() &&
+ VirtualIndex == RHS->getVirtualIndex() &&
+ ThisAdjustment == RHS->getThisAdjustment() &&
+ Flags == RHS->getFlags() && IsOptimized == RHS->isOptimized() &&
+ Unit == RHS->getRawUnit() &&
TemplateParams == RHS->getRawTemplateParams() &&
Declaration == RHS->getRawDeclaration() &&
Variables == RHS->getRawVariables();
}
unsigned getHashValue() const {
- return hash_combine(Scope, Name, LinkageName, File, Line, Type,
- IsLocalToUnit, IsDefinition, ScopeLine, ContainingType,
- Virtuality, VirtualIndex, Flags, IsOptimized,
- TemplateParams, Declaration, Variables);
+ // If this is a declaration inside an ODR type, only hash the type and the
+ // linkage name. Otherwise the hash will be stronger than
+ // MDNodeSubsetEqualImpl::isDeclarationOfODRMember().
+ if (!IsDefinition && LinkageName)
+ if (auto *CT = dyn_cast_or_null<DICompositeType>(Scope))
+ if (CT->getRawIdentifier())
+ return hash_combine(LinkageName, Scope);
+
+ // Intentionally computes the hash on a subset of the operands for
+ // performance reasons. The subset has to be significant enough to avoid
+ // collision "most of the time". There is no correctness issue in case of
+ // collision because of the full check above.
+ return hash_combine(Name, Scope, File, Type, Line);
+ }
+};
+
+template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
+ typedef MDNodeKeyImpl<DISubprogram> KeyTy;
+ static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) {
+ return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope,
+ LHS.LinkageName, RHS);
+ }
+ static bool isSubsetEqual(const DISubprogram *LHS, const DISubprogram *RHS) {
+ return isDeclarationOfODRMember(LHS->isDefinition(), LHS->getRawScope(),
+ LHS->getRawLinkageName(), RHS);
+ }
+
+ /// Subprograms compare equal if they declare the same function in an ODR
+ /// type.
+ static bool isDeclarationOfODRMember(bool IsDefinition, const Metadata *Scope,
+ const MDString *LinkageName,
+ const DISubprogram *RHS) {
+ // Check whether the LHS is eligible.
+ if (IsDefinition || !Scope || !LinkageName)
+ return false;
+
+ auto *CT = dyn_cast_or_null<DICompositeType>(Scope);
+ if (!CT || !CT->getRawIdentifier())
+ return false;
+
+ // Compare to the RHS.
+ return IsDefinition == RHS->isDefinition() && Scope == RHS->getRawScope() &&
+ LinkageName == RHS->getRawLinkageName();
}
};
@@ -569,18 +671,18 @@ template <> struct MDNodeKeyImpl<DILexicalBlockFile> {
template <> struct MDNodeKeyImpl<DINamespace> {
Metadata *Scope;
Metadata *File;
- StringRef Name;
+ MDString *Name;
unsigned Line;
- MDNodeKeyImpl(Metadata *Scope, Metadata *File, StringRef Name, unsigned Line)
+ MDNodeKeyImpl(Metadata *Scope, Metadata *File, MDString *Name, unsigned Line)
: Scope(Scope), File(File), Name(Name), Line(Line) {}
MDNodeKeyImpl(const DINamespace *N)
- : Scope(N->getRawScope()), File(N->getRawFile()), Name(N->getName()),
+ : Scope(N->getRawScope()), File(N->getRawFile()), Name(N->getRawName()),
Line(N->getLine()) {}
bool isKeyOf(const DINamespace *RHS) const {
return Scope == RHS->getRawScope() && File == RHS->getRawFile() &&
- Name == RHS->getName() && Line == RHS->getLine();
+ Name == RHS->getRawName() && Line == RHS->getLine();
}
unsigned getHashValue() const {
return hash_combine(Scope, File, Name, Line);
@@ -589,26 +691,24 @@ template <> struct MDNodeKeyImpl<DINamespace> {
template <> struct MDNodeKeyImpl<DIModule> {
Metadata *Scope;
- StringRef Name;
- StringRef ConfigurationMacros;
- StringRef IncludePath;
- StringRef ISysRoot;
- MDNodeKeyImpl(Metadata *Scope, StringRef Name,
- StringRef ConfigurationMacros,
- StringRef IncludePath,
- StringRef ISysRoot)
- : Scope(Scope), Name(Name), ConfigurationMacros(ConfigurationMacros),
- IncludePath(IncludePath), ISysRoot(ISysRoot) {}
+ MDString *Name;
+ MDString *ConfigurationMacros;
+ MDString *IncludePath;
+ MDString *ISysRoot;
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *ConfigurationMacros,
+ MDString *IncludePath, MDString *ISysRoot)
+ : Scope(Scope), Name(Name), ConfigurationMacros(ConfigurationMacros),
+ IncludePath(IncludePath), ISysRoot(ISysRoot) {}
MDNodeKeyImpl(const DIModule *N)
- : Scope(N->getRawScope()), Name(N->getName()),
- ConfigurationMacros(N->getConfigurationMacros()),
- IncludePath(N->getIncludePath()), ISysRoot(N->getISysRoot()) {}
+ : Scope(N->getRawScope()), Name(N->getRawName()),
+ ConfigurationMacros(N->getRawConfigurationMacros()),
+ IncludePath(N->getRawIncludePath()), ISysRoot(N->getRawISysRoot()) {}
bool isKeyOf(const DIModule *RHS) const {
- return Scope == RHS->getRawScope() && Name == RHS->getName() &&
- ConfigurationMacros == RHS->getConfigurationMacros() &&
- IncludePath == RHS->getIncludePath() &&
- ISysRoot == RHS->getISysRoot();
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
+ ConfigurationMacros == RHS->getRawConfigurationMacros() &&
+ IncludePath == RHS->getRawIncludePath() &&
+ ISysRoot == RHS->getRawISysRoot();
}
unsigned getHashValue() const {
return hash_combine(Scope, Name,
@@ -617,33 +717,33 @@ template <> struct MDNodeKeyImpl<DIModule> {
};
template <> struct MDNodeKeyImpl<DITemplateTypeParameter> {
- StringRef Name;
+ MDString *Name;
Metadata *Type;
- MDNodeKeyImpl(StringRef Name, Metadata *Type) : Name(Name), Type(Type) {}
+ MDNodeKeyImpl(MDString *Name, Metadata *Type) : Name(Name), Type(Type) {}
MDNodeKeyImpl(const DITemplateTypeParameter *N)
- : Name(N->getName()), Type(N->getRawType()) {}
+ : Name(N->getRawName()), Type(N->getRawType()) {}
bool isKeyOf(const DITemplateTypeParameter *RHS) const {
- return Name == RHS->getName() && Type == RHS->getRawType();
+ return Name == RHS->getRawName() && Type == RHS->getRawType();
}
unsigned getHashValue() const { return hash_combine(Name, Type); }
};
template <> struct MDNodeKeyImpl<DITemplateValueParameter> {
unsigned Tag;
- StringRef Name;
+ MDString *Name;
Metadata *Type;
Metadata *Value;
- MDNodeKeyImpl(unsigned Tag, StringRef Name, Metadata *Type, Metadata *Value)
+ MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *Type, Metadata *Value)
: Tag(Tag), Name(Name), Type(Type), Value(Value) {}
MDNodeKeyImpl(const DITemplateValueParameter *N)
- : Tag(N->getTag()), Name(N->getName()), Type(N->getRawType()),
+ : Tag(N->getTag()), Name(N->getRawName()), Type(N->getRawType()),
Value(N->getValue()) {}
bool isKeyOf(const DITemplateValueParameter *RHS) const {
- return Tag == RHS->getTag() && Name == RHS->getName() &&
+ return Tag == RHS->getTag() && Name == RHS->getRawName() &&
Type == RHS->getRawType() && Value == RHS->getValue();
}
unsigned getHashValue() const { return hash_combine(Tag, Name, Type, Value); }
@@ -651,8 +751,8 @@ template <> struct MDNodeKeyImpl<DITemplateValueParameter> {
template <> struct MDNodeKeyImpl<DIGlobalVariable> {
Metadata *Scope;
- StringRef Name;
- StringRef LinkageName;
+ MDString *Name;
+ MDString *LinkageName;
Metadata *File;
unsigned Line;
Metadata *Type;
@@ -661,7 +761,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
Metadata *Variable;
Metadata *StaticDataMemberDeclaration;
- MDNodeKeyImpl(Metadata *Scope, StringRef Name, StringRef LinkageName,
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
Metadata *File, unsigned Line, Metadata *Type,
bool IsLocalToUnit, bool IsDefinition, Metadata *Variable,
Metadata *StaticDataMemberDeclaration)
@@ -670,18 +770,18 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
IsDefinition(IsDefinition), Variable(Variable),
StaticDataMemberDeclaration(StaticDataMemberDeclaration) {}
MDNodeKeyImpl(const DIGlobalVariable *N)
- : Scope(N->getRawScope()), Name(N->getName()),
- LinkageName(N->getLinkageName()), File(N->getRawFile()),
+ : Scope(N->getRawScope()), Name(N->getRawName()),
+ LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
Line(N->getLine()), Type(N->getRawType()),
IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
Variable(N->getRawVariable()),
StaticDataMemberDeclaration(N->getRawStaticDataMemberDeclaration()) {}
bool isKeyOf(const DIGlobalVariable *RHS) const {
- return Scope == RHS->getRawScope() && Name == RHS->getName() &&
- LinkageName == RHS->getLinkageName() && File == RHS->getRawFile() &&
- Line == RHS->getLine() && Type == RHS->getRawType() &&
- IsLocalToUnit == RHS->isLocalToUnit() &&
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
+ LinkageName == RHS->getRawLinkageName() &&
+ File == RHS->getRawFile() && Line == RHS->getLine() &&
+ Type == RHS->getRawType() && IsLocalToUnit == RHS->isLocalToUnit() &&
IsDefinition == RHS->isDefinition() &&
Variable == RHS->getRawVariable() &&
StaticDataMemberDeclaration ==
@@ -696,24 +796,24 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
template <> struct MDNodeKeyImpl<DILocalVariable> {
Metadata *Scope;
- StringRef Name;
+ MDString *Name;
Metadata *File;
unsigned Line;
Metadata *Type;
unsigned Arg;
unsigned Flags;
- MDNodeKeyImpl(Metadata *Scope, StringRef Name, Metadata *File, unsigned Line,
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, Metadata *File, unsigned Line,
Metadata *Type, unsigned Arg, unsigned Flags)
: Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), Arg(Arg),
Flags(Flags) {}
MDNodeKeyImpl(const DILocalVariable *N)
- : Scope(N->getRawScope()), Name(N->getName()), File(N->getRawFile()),
+ : Scope(N->getRawScope()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Type(N->getRawType()), Arg(N->getArg()),
Flags(N->getFlags()) {}
bool isKeyOf(const DILocalVariable *RHS) const {
- return Scope == RHS->getRawScope() && Name == RHS->getName() &&
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
File == RHS->getRawFile() && Line == RHS->getLine() &&
Type == RHS->getRawType() && Arg == RHS->getArg() &&
Flags == RHS->getFlags();
@@ -738,28 +838,28 @@ template <> struct MDNodeKeyImpl<DIExpression> {
};
template <> struct MDNodeKeyImpl<DIObjCProperty> {
- StringRef Name;
+ MDString *Name;
Metadata *File;
unsigned Line;
- StringRef GetterName;
- StringRef SetterName;
+ MDString *GetterName;
+ MDString *SetterName;
unsigned Attributes;
Metadata *Type;
- MDNodeKeyImpl(StringRef Name, Metadata *File, unsigned Line,
- StringRef GetterName, StringRef SetterName, unsigned Attributes,
+ MDNodeKeyImpl(MDString *Name, Metadata *File, unsigned Line,
+ MDString *GetterName, MDString *SetterName, unsigned Attributes,
Metadata *Type)
: Name(Name), File(File), Line(Line), GetterName(GetterName),
SetterName(SetterName), Attributes(Attributes), Type(Type) {}
MDNodeKeyImpl(const DIObjCProperty *N)
- : Name(N->getName()), File(N->getRawFile()), Line(N->getLine()),
- GetterName(N->getGetterName()), SetterName(N->getSetterName()),
+ : Name(N->getRawName()), File(N->getRawFile()), Line(N->getLine()),
+ GetterName(N->getRawGetterName()), SetterName(N->getRawSetterName()),
Attributes(N->getAttributes()), Type(N->getRawType()) {}
bool isKeyOf(const DIObjCProperty *RHS) const {
- return Name == RHS->getName() && File == RHS->getRawFile() &&
- Line == RHS->getLine() && GetterName == RHS->getGetterName() &&
- SetterName == RHS->getSetterName() &&
+ return Name == RHS->getRawName() && File == RHS->getRawFile() &&
+ Line == RHS->getLine() && GetterName == RHS->getRawGetterName() &&
+ SetterName == RHS->getRawSetterName() &&
Attributes == RHS->getAttributes() && Type == RHS->getRawType();
}
unsigned getHashValue() const {
@@ -773,19 +873,19 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> {
Metadata *Scope;
Metadata *Entity;
unsigned Line;
- StringRef Name;
+ MDString *Name;
MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line,
- StringRef Name)
+ MDString *Name)
: Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), Name(Name) {}
MDNodeKeyImpl(const DIImportedEntity *N)
: Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()),
- Line(N->getLine()), Name(N->getName()) {}
+ Line(N->getLine()), Name(N->getRawName()) {}
bool isKeyOf(const DIImportedEntity *RHS) const {
return Tag == RHS->getTag() && Scope == RHS->getRawScope() &&
Entity == RHS->getRawEntity() && Line == RHS->getLine() &&
- Name == RHS->getName();
+ Name == RHS->getRawName();
}
unsigned getHashValue() const {
return hash_combine(Tag, Scope, Entity, Line, Name);
@@ -795,18 +895,18 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> {
template <> struct MDNodeKeyImpl<DIMacro> {
unsigned MIType;
unsigned Line;
- StringRef Name;
- StringRef Value;
+ MDString *Name;
+ MDString *Value;
- MDNodeKeyImpl(unsigned MIType, unsigned Line, StringRef Name, StringRef Value)
+ MDNodeKeyImpl(unsigned MIType, unsigned Line, MDString *Name, MDString *Value)
: MIType(MIType), Line(Line), Name(Name), Value(Value) {}
MDNodeKeyImpl(const DIMacro *N)
- : MIType(N->getMacinfoType()), Line(N->getLine()), Name(N->getName()),
- Value(N->getValue()) {}
+ : MIType(N->getMacinfoType()), Line(N->getLine()), Name(N->getRawName()),
+ Value(N->getRawValue()) {}
bool isKeyOf(const DIMacro *RHS) const {
return MIType == RHS->getMacinfoType() && Line == RHS->getLine() &&
- Name == RHS->getName() && Value == RHS->getValue();
+ Name == RHS->getRawName() && Value == RHS->getRawValue();
}
unsigned getHashValue() const {
return hash_combine(MIType, Line, Name, Value);
@@ -838,6 +938,7 @@ template <> struct MDNodeKeyImpl<DIMacroFile> {
/// \brief DenseMapInfo for MDNode subclasses.
template <class NodeTy> struct MDNodeInfo {
typedef MDNodeKeyImpl<NodeTy> KeyTy;
+ typedef MDNodeSubsetEqualImpl<NodeTy> SubsetEqualTy;
static inline NodeTy *getEmptyKey() {
return DenseMapInfo<NodeTy *>::getEmptyKey();
}
@@ -851,10 +952,14 @@ template <class NodeTy> struct MDNodeInfo {
static bool isEqual(const KeyTy &LHS, const NodeTy *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
- return LHS.isKeyOf(RHS);
+ return SubsetEqualTy::isSubsetEqual(LHS, RHS) || LHS.isKeyOf(RHS);
}
static bool isEqual(const NodeTy *LHS, const NodeTy *RHS) {
- return LHS == RHS;
+ if (LHS == RHS)
+ return true;
+ if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return false;
+ return SubsetEqualTy::isSubsetEqual(LHS, RHS);
}
};
@@ -899,6 +1004,33 @@ public:
}
};
+/// Multimap-like storage for metadata attachments for globals. This differs
+/// from MDAttachmentMap in that it allows multiple attachments per metadata
+/// kind.
+class MDGlobalAttachmentMap {
+ struct Attachment {
+ unsigned MDKind;
+ TrackingMDNodeRef Node;
+ };
+ SmallVector<Attachment, 1> Attachments;
+
+public:
+ bool empty() const { return Attachments.empty(); }
+
+ /// Appends all attachments with the given ID to \c Result in insertion order.
+ /// If the global has no attachments with the given ID, or if ID is invalid,
+ /// leaves Result unchanged.
+ void get(unsigned ID, SmallVectorImpl<MDNode *> &Result);
+
+ void insert(unsigned ID, MDNode &MD);
+ void erase(unsigned ID);
+
+ /// Appends all attachments for the global to \c Result, sorting by attachment
+ /// ID. Attachments with the same ID appear in insertion order. This function
+ /// does \em not clear \c Result.
+ void getAll(SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const;
+};
+
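
A usage sketch for the class above (illustrative only; the type is internal to this header, and KindID, A and B stand for an arbitrary metadata kind and two nodes attached under it):

static void exampleAttachments(unsigned KindID, MDNode &A, MDNode &B) {
  MDGlobalAttachmentMap Map;
  Map.insert(KindID, A);
  Map.insert(KindID, B);   // unlike MDAttachmentMap, both attachments survive
  SmallVector<MDNode *, 2> Nodes;
  Map.get(KindID, Nodes);  // Nodes now holds {&A, &B}, in insertion order
  Map.erase(KindID);       // removes every attachment of that kind
  // Map.empty() is now true again.
}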
class LLVMContextImpl {
public:
/// OwnedModules - The set of modules instantiated in this context, and which
@@ -911,6 +1043,7 @@ public:
LLVMContext::DiagnosticHandlerTy DiagnosticHandler;
void *DiagnosticContext;
bool RespectDiagnosticFilters;
+ bool DiagnosticHotnessRequested;
LLVMContext::YieldCallbackTy YieldCallback;
void *YieldOpaqueHandle;
@@ -925,7 +1058,7 @@ public:
FoldingSet<AttributeSetImpl> AttrsLists;
FoldingSet<AttributeSetNode> AttrsSetNodes;
- StringMap<MDString> MDStringCache;
+ StringMap<MDString, BumpPtrAllocator> MDStringCache;
DenseMap<Value *, ValueAsMetadata *> ValuesAsMetadata;
DenseMap<Metadata *, MetadataAsValue *> MetadataAsValues;
@@ -935,11 +1068,14 @@ public:
DenseSet<CLASS *, CLASS##Info> CLASS##s;
#include "llvm/IR/Metadata.def"
+ // Optional map for looking up composite types by identifier.
+ Optional<DenseMap<const MDString *, DICompositeType *>> DITypeMap;
+
// MDNodes may be uniqued or not uniqued. When they're not uniqued, they
// aren't in the MDNodeSet, but they're still shared between objects, so no
- // one object can destroy them. This set allows us to at least destroy them
- // on Context destruction.
- SmallPtrSet<MDNode *, 1> DistinctMDNodes;
+ // one object can destroy them. Keep track of them here so we can delete
+ // them on context teardown.
+ std::vector<MDNode *> DistinctMDNodes;
DenseMap<Type*, ConstantAggregateZero*> CAZConstants;
@@ -1006,8 +1142,8 @@ public:
/// Collection of per-instruction metadata used in this context.
DenseMap<const Instruction *, MDAttachmentMap> InstructionMetadata;
- /// Collection of per-function metadata used in this context.
- DenseMap<const Function *, MDAttachmentMap> FunctionMetadata;
+ /// Collection of per-GlobalObject metadata used in this context.
+ DenseMap<const GlobalObject *, MDGlobalAttachmentMap> GlobalObjectMetadata;
/// DiscriminatorTable - This table maps file:line locations to an
/// integer representing the next DWARF path discriminator to assign to
@@ -1034,11 +1170,19 @@ public:
/// clients which do use GC.
DenseMap<const Function*, std::string> GCNames;
+ /// Flag indicating whether Values (other than GlobalValues) retain their
+ /// names.
+ bool DiscardValueNames = false;
+
LLVMContextImpl(LLVMContext &C);
~LLVMContextImpl();
/// Destroy the ConstantArrays if they are not used.
void dropTriviallyDeadConstantArrays();
+
+ /// \brief Access the object which manages optimization bisection for failure
+ /// analysis.
+ OptBisect &getOptBisect();
};
}
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 63d89f21b350..8f71d822d271 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -325,14 +325,6 @@ public:
using llvm::Pass::doInitialization;
using llvm::Pass::doFinalization;
- /// doInitialization - Run all of the initializers for the module passes.
- ///
- bool doInitialization();
-
- /// doFinalization - Run all of the finalizers for the module passes.
- ///
- bool doFinalization();
-
/// Pass Manager itself does not invalidate any analysis info.
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
@@ -423,14 +415,6 @@ public:
using llvm::Pass::doInitialization;
using llvm::Pass::doFinalization;
- /// doInitialization - Run all of the initializers for the module passes.
- ///
- bool doInitialization();
-
- /// doFinalization - Run all of the finalizers for the module passes.
- ///
- bool doFinalization();
-
/// Pass Manager itself does not invalidate any analysis info.
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
@@ -531,9 +515,8 @@ PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
SmallVector<Pass *, 12> LastUses;
SmallVector<Pass *, 12> LastPMUses;
- for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
- E = IDs.end(); I != E; ++I) {
- Pass *AnalysisPass = findAnalysisPass(*I);
+ for (AnalysisID ID : IDs) {
+ Pass *AnalysisPass = findAnalysisPass(ID);
assert(AnalysisPass && "Expected analysis pass to exist.");
AnalysisResolver *AR = AnalysisPass->getResolver();
assert(AR && "Expected analysis resolver to exist.");
@@ -791,29 +774,24 @@ void PMTopLevelManager::dumpArguments() const {
return;
dbgs() << "Pass Arguments: ";
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
- ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
- if (const PassInfo *PI = findAnalysisPassInfo((*I)->getPassID())) {
+ for (ImmutablePass *P : ImmutablePasses)
+ if (const PassInfo *PI = findAnalysisPassInfo(P->getPassID())) {
assert(PI && "Expected all immutable passes to be initialized");
if (!PI->isAnalysisGroup())
dbgs() << " -" << PI->getPassArgument();
}
- for (SmallVectorImpl<PMDataManager *>::const_iterator I =
- PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
- (*I)->dumpPassArguments();
+ for (PMDataManager *PM : PassManagers)
+ PM->dumpPassArguments();
dbgs() << "\n";
}
void PMTopLevelManager::initializeAllAnalysisInfo() {
- for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
- E = PassManagers.end(); I != E; ++I)
- (*I)->initializeAnalysisInfo();
+ for (PMDataManager *PM : PassManagers)
+ PM->initializeAnalysisInfo();
// Initialize other pass managers
- for (SmallVectorImpl<PMDataManager *>::iterator
- I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
- I != E; ++I)
- (*I)->initializeAnalysisInfo();
+ for (PMDataManager *IPM : IndirectPassManagers)
+ IPM->initializeAnalysisInfo();
for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
DME = LastUser.end(); DMI != DME; ++DMI) {
@@ -824,13 +802,11 @@ void PMTopLevelManager::initializeAllAnalysisInfo() {
/// Destructor
PMTopLevelManager::~PMTopLevelManager() {
- for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
- E = PassManagers.end(); I != E; ++I)
- delete *I;
+ for (PMDataManager *PM : PassManagers)
+ delete PM;
- for (SmallVectorImpl<ImmutablePass *>::iterator
- I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
- delete *I;
+ for (ImmutablePass *P : ImmutablePasses)
+ delete P;
}
//===----------------------------------------------------------------------===//
@@ -1827,7 +1803,7 @@ void PMStack::push(PMDataManager *PM) {
}
// Dump content of the pass manager stack.
-void PMStack::dump() const {
+LLVM_DUMP_METHOD void PMStack::dump() const {
for (PMDataManager *Manager : S)
dbgs() << Manager->getAsPass()->getPassName() << ' ';
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index 4ce3ea2e9c04..a5a4cd06db09 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -40,7 +40,7 @@ MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight,
}
MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) {
- assert(Weights.size() >= 2 && "Need at least two branch weights!");
+ assert(Weights.size() >= 1 && "Need at least one branch weight!");
SmallVector<Metadata *, 4> Vals(Weights.size() + 1);
Vals[0] = createString("branch_weights");
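
The relaxed assertion admits a branch_weights tuple with a single weight (e.g. a call-site count); the common two-weight producer path looks roughly like the sketch below, which uses only the MDBuilder and Instruction APIs visible in this patch and their public headers:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

static void addBranchWeights(llvm::BranchInst *BI) {
  llvm::MDBuilder MDB(BI->getContext());
  // Builds !{!"branch_weights", i32 90, i32 10} via createBranchWeights().
  BI->setMetadata(llvm::LLVMContext::MD_prof,
                  MDB.createBranchWeights(90, 10));
}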
diff --git a/lib/IR/Makefile b/lib/IR/Makefile
deleted file mode 100644
index 329cd6636e94..000000000000
--- a/lib/IR/Makefile
+++ /dev/null
@@ -1,61 +0,0 @@
-##===- lib/IR/Makefile -------------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../..
-LIBRARYNAME = LLVMCore
-BUILD_ARCHIVE = 1
-
-BUILT_SOURCES = $(PROJ_OBJ_ROOT)/include/llvm/IR/Intrinsics.gen \
- $(PROJ_OBJ_ROOT)/include/llvm/IR/Attributes.inc \
- $(PROJ_OBJ_ROOT)/lib/IR/AttributesCompatFunc.inc
-
-include $(LEVEL)/Makefile.common
-
-GENFILE:=$(PROJ_OBJ_ROOT)/include/llvm/IR/Intrinsics.gen
-ATTRINCFILE:=$(PROJ_OBJ_ROOT)/include/llvm/IR/Attributes.inc
-ATTRCOMPATFUNCINCFILE:=$(PROJ_OBJ_ROOT)/lib/IR/AttributesCompatFunc.inc
-
-INTRINSICTD := $(PROJ_SRC_ROOT)/include/llvm/IR/Intrinsics.td
-INTRINSICTDS := $(wildcard $(PROJ_SRC_ROOT)/include/llvm/IR/Intrinsics*.td)
-ATTRIBUTESTD := $(PROJ_SRC_ROOT)/include/llvm/IR/Attributes.td
-ATTRCOMPATFUNCTD := $(PROJ_SRC_ROOT)/lib/IR/AttributesCompatFunc.td
-
-$(ObjDir)/Intrinsics.gen.tmp: $(ObjDir)/.dir $(INTRINSICTDS) $(LLVM_TBLGEN)
- $(Echo) Building Intrinsics.gen.tmp from Intrinsics.td
- $(Verb) $(LLVMTableGen) $(call SYSPATH, $(INTRINSICTD)) -o $(call SYSPATH, $@) -gen-intrinsic
-
-$(GENFILE): $(ObjDir)/Intrinsics.gen.tmp $(PROJ_OBJ_ROOT)/include/llvm/IR/.dir
- $(Verb) $(CMP) -s $@ $< || ( $(CP) $< $@ && \
- $(EchoCmd) Updated Intrinsics.gen because Intrinsics.gen.tmp \
- changed significantly. )
-
-$(ObjDir)/Attributes.inc.tmp: $(ObjDir)/.dir $(ATTRIBUTESTD) $(LLVM_TBLGEN)
- $(Echo) Building Attributes.inc.tmp from $(ATTRIBUTESTD)
- $(Verb) $(LLVMTableGen) $(call SYSPATH, $(ATTRIBUTESTD)) -o $(call SYSPATH, $@) -gen-attrs
-
-$(ATTRINCFILE): $(ObjDir)/Attributes.inc.tmp $(PROJ_OBJ_ROOT)/include/llvm/IR/.dir
- $(Verb) $(CMP) -s $@ $< || ( $(CP) $< $@ && \
- $(EchoCmd) Updated Attributes.inc because Attributes.inc.tmp \
- changed significantly. )
-
-$(ObjDir)/AttributesCompatFunc.inc.tmp: $(ObjDir)/.dir $(ATTRCOMPATFUNCTD) $(LLVM_TBLGEN)
- $(Echo) Building AttributesCompatFunc.inc.tmp from $(ATTRCOMPATFUNCTD)
- $(Verb) $(LLVMTableGen) $(call SYSPATH, $(ATTRCOMPATFUNCTD)) -o $(call SYSPATH, $@) -gen-attrs
-
-$(ATTRCOMPATFUNCINCFILE): $(ObjDir)/AttributesCompatFunc.inc.tmp $(PROJ_OBJ_ROOT)/include/llvm/IR/.dir
- $(Verb) $(CMP) -s $@ $< || ( $(CP) $< $@ && \
- $(EchoCmd) Updated AttributesCompatFunc.inc because AttributesCompatFunc.inc.tmp \
- changed significantly. )
-
-install-local:: $(GENFILE)
- $(Echo) Installing $(DESTDIR)$(PROJ_includedir)/llvm/IR/Intrinsics.gen
- $(Verb) $(DataInstall) $(GENFILE) $(DESTDIR)$(PROJ_includedir)/llvm/IR/Intrinsics.gen
-
-install-local:: $(ATTRINCFILE)
- $(Echo) Installing $(DESTDIR)$(PROJ_includedir)/llvm/IR/Attributes.inc
- $(Verb) $(DataInstall) $(ATTRINCFILE) $(DESTDIR)$(PROJ_includedir)/llvm/IR/Attributes.inc
diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index 016cb9eb6892..ddf024df8ccc 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp
@@ -99,7 +99,7 @@ static void addByteCountSuffix(raw_ostream &OS, const Function *F,
Ty = cast<PointerType>(Ty)->getElementType();
// Size should be aligned to pointer size.
unsigned PtrSize = DL.getPointerSize();
- ArgWords += RoundUpToAlignment(DL.getTypeAllocSize(Ty), PtrSize);
+ ArgWords += alignTo(DL.getTypeAllocSize(Ty), PtrSize);
}
OS << '@' << ArgWords;
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 9a9a5017841c..5201c2ecce6a 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -15,10 +15,8 @@
#include "LLVMContextImpl.h"
#include "MetadataImpl.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -39,7 +37,7 @@ MetadataAsValue::~MetadataAsValue() {
untrack();
}
-/// \brief Canonicalize metadata arguments to intrinsics.
+/// Canonicalize metadata arguments to intrinsics.
///
/// To support bitcode upgrades (and assembly semantic sugar) for \a
/// MetadataAsValue, we need to canonicalize certain metadata.
@@ -124,32 +122,44 @@ bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) {
assert(Ref && "Expected live reference");
assert((Owner || *static_cast<Metadata **>(Ref) == &MD) &&
"Reference without owner must be direct");
- if (auto *R = ReplaceableMetadataImpl::get(MD)) {
+ if (auto *R = ReplaceableMetadataImpl::getOrCreate(MD)) {
R->addRef(Ref, Owner);
return true;
}
+ if (auto *PH = dyn_cast<DistinctMDOperandPlaceholder>(&MD)) {
+ assert(!PH->Use && "Placeholders can only be used once");
+ assert(!Owner && "Unexpected callback to owner");
+ PH->Use = static_cast<Metadata **>(Ref);
+ return true;
+ }
return false;
}
void MetadataTracking::untrack(void *Ref, Metadata &MD) {
assert(Ref && "Expected live reference");
- if (auto *R = ReplaceableMetadataImpl::get(MD))
+ if (auto *R = ReplaceableMetadataImpl::getIfExists(MD))
R->dropRef(Ref);
+ else if (auto *PH = dyn_cast<DistinctMDOperandPlaceholder>(&MD))
+ PH->Use = nullptr;
}
bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) {
assert(Ref && "Expected live reference");
assert(New && "Expected live reference");
assert(Ref != New && "Expected change");
- if (auto *R = ReplaceableMetadataImpl::get(MD)) {
+ if (auto *R = ReplaceableMetadataImpl::getIfExists(MD)) {
R->moveRef(Ref, New, MD);
return true;
}
+ assert(!isa<DistinctMDOperandPlaceholder>(MD) &&
+ "Unexpected move of an MDOperand");
+ assert(!isReplaceable(MD) &&
+ "Expected un-replaceable metadata, since we didn't move a reference");
return false;
}
bool MetadataTracking::isReplaceable(const Metadata &MD) {
- return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD));
+ return ReplaceableMetadataImpl::isReplaceable(MD);
}
void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) {
@@ -188,11 +198,6 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New,
}
void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
- assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) &&
- "Expected non-temp node");
- assert(CanReplace &&
- "Attempted to replace Metadata marked for no replacement");
-
if (UseMap.empty())
return;
@@ -273,9 +278,21 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) {
}
}
-ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) {
+ReplaceableMetadataImpl *ReplaceableMetadataImpl::getOrCreate(Metadata &MD) {
+ if (auto *N = dyn_cast<MDNode>(&MD))
+ return N->isResolved() ? nullptr : N->Context.getOrCreateReplaceableUses();
+ return dyn_cast<ValueAsMetadata>(&MD);
+}
+
+ReplaceableMetadataImpl *ReplaceableMetadataImpl::getIfExists(Metadata &MD) {
if (auto *N = dyn_cast<MDNode>(&MD))
- return N->Context.getReplaceableUses();
+ return N->isResolved() ? nullptr : N->Context.getReplaceableUses();
+ return dyn_cast<ValueAsMetadata>(&MD);
+}
+
+bool ReplaceableMetadataImpl::isReplaceable(const Metadata &MD) {
+ if (auto *N = dyn_cast<MDNode>(&MD))
+ return !N->isResolved();
return dyn_cast<ValueAsMetadata>(&MD);
}
@@ -399,17 +416,12 @@ void ValueAsMetadata::handleRAUW(Value *From, Value *To) {
MDString *MDString::get(LLVMContext &Context, StringRef Str) {
auto &Store = Context.pImpl->MDStringCache;
- auto I = Store.find(Str);
- if (I != Store.end())
- return &I->second;
-
- auto *Entry =
- StringMapEntry<MDString>::Create(Str, Store.getAllocator(), MDString());
- bool WasInserted = Store.insert(Entry);
- (void)WasInserted;
- assert(WasInserted && "Expected entry to be inserted");
- Entry->second.Entry = Entry;
- return &Entry->second;
+ auto I = Store.emplace_second(Str);
+ auto &MapEntry = I.first->getValue();
+ if (!I.second)
+ return &MapEntry;
+ MapEntry.Entry = &*I.first;
+ return &MapEntry;
}
StringRef MDString::getString() const {
@@ -433,7 +445,7 @@ void *MDNode::operator new(size_t Size, unsigned NumOps) {
size_t OpSize = NumOps * sizeof(MDOperand);
// uint64_t is the most aligned type we need support (ensured by static_assert
// above)
- OpSize = RoundUpToAlignment(OpSize, llvm::alignOf<uint64_t>());
+ OpSize = alignTo(OpSize, llvm::alignOf<uint64_t>());
void *Ptr = reinterpret_cast<char *>(::operator new(OpSize + Size)) + OpSize;
MDOperand *O = static_cast<MDOperand *>(Ptr);
for (MDOperand *E = O - NumOps; O != E; --O)
@@ -444,7 +456,7 @@ void *MDNode::operator new(size_t Size, unsigned NumOps) {
void MDNode::operator delete(void *Mem) {
MDNode *N = static_cast<MDNode *>(Mem);
size_t OpSize = N->NumOperands * sizeof(MDOperand);
- OpSize = RoundUpToAlignment(OpSize, llvm::alignOf<uint64_t>());
+ OpSize = alignTo(OpSize, llvm::alignOf<uint64_t>());
MDOperand *O = static_cast<MDOperand *>(Mem);
for (MDOperand *E = O - N->NumOperands; O != E; --O)
@@ -462,16 +474,12 @@ MDNode::MDNode(LLVMContext &Context, unsigned ID, StorageType Storage,
for (Metadata *MD : Ops2)
setOperand(Op++, MD);
- if (isDistinct())
+ if (!isUniqued())
return;
- if (isUniqued())
- // Check whether any operands are unresolved, requiring re-uniquing. If
- // not, don't support RAUW.
- if (!countUnresolvedOperands())
- return;
-
- this->Context.makeReplaceable(make_unique<ReplaceableMetadataImpl>(Context));
+ // Count the unresolved operands. If there are any, RAUW support will be
+ // added lazily on first reference.
+ countUnresolvedOperands();
}
TempMDNode MDNode::clone() const {
@@ -491,10 +499,10 @@ static bool isOperandUnresolved(Metadata *Op) {
return false;
}
-unsigned MDNode::countUnresolvedOperands() {
+void MDNode::countUnresolvedOperands() {
assert(NumUnresolved == 0 && "Expected unresolved ops to be uncounted");
- NumUnresolved = std::count_if(op_begin(), op_end(), isOperandUnresolved);
- return NumUnresolved;
+ assert(isUniqued() && "Expected this to be uniqued");
+ NumUnresolved = count_if(operands(), isOperandUnresolved);
}
void MDNode::makeUniqued() {
@@ -507,8 +515,11 @@ void MDNode::makeUniqued() {
// Make this 'uniqued'.
Storage = Uniqued;
- if (!countUnresolvedOperands())
- resolve();
+ countUnresolvedOperands();
+ if (!NumUnresolved) {
+ dropReplaceableUses();
+ assert(isResolved() && "Expected this to be resolved");
+ }
assert(isUniqued() && "Expected this to be uniqued");
}
@@ -517,9 +528,8 @@ void MDNode::makeDistinct() {
assert(isTemporary() && "Expected this to be temporary");
assert(!isResolved() && "Expected this to be unresolved");
- // Pretend to be uniqued, resolve the node, and then store in distinct table.
- Storage = Uniqued;
- resolve();
+ // Drop RAUW support and store as a distinct node.
+ dropReplaceableUses();
storeDistinctInContext();
assert(isDistinct() && "Expected this to be distinct");
@@ -530,16 +540,22 @@ void MDNode::resolve() {
assert(isUniqued() && "Expected this to be uniqued");
assert(!isResolved() && "Expected this to be unresolved");
- // Move the map, so that this immediately looks resolved.
- auto Uses = Context.takeReplaceableUses();
NumUnresolved = 0;
+ dropReplaceableUses();
+
assert(isResolved() && "Expected this to be resolved");
+}
- // Drop RAUW support.
- Uses->resolveAllUses();
+void MDNode::dropReplaceableUses() {
+ assert(!NumUnresolved && "Unexpected unresolved operand");
+
+ // Drop any RAUW support.
+ if (Context.hasReplaceableUses())
+ Context.takeReplaceableUses()->resolveAllUses();
}
void MDNode::resolveAfterOperandChange(Metadata *Old, Metadata *New) {
+ assert(isUniqued() && "Expected this to be uniqued");
assert(NumUnresolved != 0 && "Expected unresolved operands");
// Check if an operand was resolved.
@@ -552,12 +568,20 @@ void MDNode::resolveAfterOperandChange(Metadata *Old, Metadata *New) {
}
void MDNode::decrementUnresolvedOperandCount() {
- if (!--NumUnresolved)
- // Last unresolved operand has just been resolved.
- resolve();
+ assert(!isResolved() && "Expected this to be unresolved");
+ if (isTemporary())
+ return;
+
+ assert(isUniqued() && "Expected this to be uniqued");
+ if (--NumUnresolved)
+ return;
+
+ // Last unresolved operand has just been resolved.
+ dropReplaceableUses();
+ assert(isResolved() && "Expected this to become resolved");
}
-void MDNode::resolveRecursivelyImpl(bool AllowTemps) {
+void MDNode::resolveCycles() {
if (isResolved())
return;
@@ -570,8 +594,6 @@ void MDNode::resolveRecursivelyImpl(bool AllowTemps) {
if (!N)
continue;
- if (N->isTemporary() && AllowTemps)
- continue;
assert(!N->isTemporary() &&
"Expected all forward declarations to be resolved");
if (!N->isResolved())
@@ -631,7 +653,7 @@ void MDTuple::recalculateHash() {
void MDNode::dropAllReferences() {
for (unsigned I = 0, E = NumOperands; I != E; ++I)
setOperand(I, nullptr);
- if (!isResolved()) {
+ if (Context.hasReplaceableUses()) {
Context.getReplaceableUses()->resolveAllUses(/* ResolveUsers */ false);
(void)Context.takeReplaceableUses();
}
@@ -677,7 +699,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) {
// dropAllReferences(), but we still need the use-list).
for (unsigned O = 0, E = getNumOperands(); O != E; ++O)
setOperand(O, nullptr);
- Context.getReplaceableUses()->replaceAllUsesWith(Uniqued);
+ if (Context.hasReplaceableUses())
+ Context.getReplaceableUses()->replaceAllUsesWith(Uniqued);
deleteAsSubclass();
return;
}
@@ -775,8 +798,10 @@ void MDNode::deleteTemporary(MDNode *N) {
}
void MDNode::storeDistinctInContext() {
- assert(isResolved() && "Expected resolved nodes");
+ assert(!Context.hasReplaceableUses() && "Unexpected replaceable uses");
+ assert(!NumUnresolved && "Unexpected unresolved nodes");
Storage = Distinct;
+ assert(isResolved() && "Expected this to be resolved");
// Reset the hash.
switch (getMetadataID()) {
@@ -791,7 +816,7 @@ void MDNode::storeDistinctInContext() {
#include "llvm/IR/Metadata.def"
}
- getContext().pImpl->DistinctMDNodes.insert(this);
+ getContext().pImpl->DistinctMDNodes.push_back(this);
}
void MDNode::replaceOperandWith(unsigned I, Metadata *New) {
@@ -811,7 +836,7 @@ void MDNode::setOperand(unsigned I, Metadata *New) {
mutable_begin()[I].reset(New, isUniqued() ? this : nullptr);
}
-/// \brief Get a node, or a self-reference that looks like it.
+/// Get a node or a self-reference that looks like it.
///
/// Special handling for finding self-references, for use by \a
/// MDNode::concatenate() and \a MDNode::intersect() to maintain behaviour from
@@ -1100,6 +1125,43 @@ void MDAttachmentMap::getAll(
array_pod_sort(Result.begin(), Result.end());
}
+void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) {
+ Attachments.push_back({ID, TrackingMDNodeRef(&MD)});
+}
+
+void MDGlobalAttachmentMap::get(unsigned ID,
+ SmallVectorImpl<MDNode *> &Result) {
+ for (const auto &A : Attachments)
+ if (A.MDKind == ID)
+ Result.push_back(A.Node);
+}
+
+void MDGlobalAttachmentMap::erase(unsigned ID) {
+ auto Follower = Attachments.begin();
+ for (auto Leader = Attachments.begin(), E = Attachments.end(); Leader != E;
+ ++Leader) {
+ if (Leader->MDKind != ID) {
+ if (Follower != Leader)
+ *Follower = std::move(*Leader);
+ ++Follower;
+ }
+ }
+ Attachments.resize(Follower - Attachments.begin());
+}
+
+void MDGlobalAttachmentMap::getAll(
+ SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const {
+ for (auto &A : Attachments)
+ Result.emplace_back(A.MDKind, A.Node);
+
+ // Sort by attachment ID; use a stable sort so that attachments sharing an
+ // ID keep their original insertion order.
+ std::stable_sort(
+ Result.begin(), Result.end(),
+ [](const std::pair<unsigned, MDNode *> &A,
+ const std::pair<unsigned, MDNode *> &B) { return A.first < B.first; });
+}
+
void Instruction::setMetadata(StringRef Kind, MDNode *Node) {
if (!Node && !hasMetadata())
return;
@@ -1138,9 +1200,6 @@ void Instruction::dropUnknownNonDebugMetadata(ArrayRef<unsigned> KnownIDs) {
}
}
-/// setMetadata - Set the metadata of the specified kind to the specified
-/// node. This updates/replaces metadata if already present, or removes it if
-/// Node is null.
void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
if (!Node && !hasMetadata())
return;
@@ -1229,88 +1288,162 @@ void Instruction::getAllMetadataOtherThanDebugLocImpl(
Info.getAll(Result);
}
-/// clearMetadataHashEntries - Clear all hashtable-based metadata from
-/// this instruction.
+bool Instruction::extractProfMetadata(uint64_t &TrueVal, uint64_t &FalseVal) {
+ assert((getOpcode() == Instruction::Br ||
+ getOpcode() == Instruction::Select) &&
+ "Looking for branch weights on something besides branch or select");
+
+ auto *ProfileData = getMetadata(LLVMContext::MD_prof);
+ if (!ProfileData || ProfileData->getNumOperands() != 3)
+ return false;
+
+ auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
+ if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+ return false;
+
+ auto *CITrue = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+ auto *CIFalse = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
+ if (!CITrue || !CIFalse)
+ return false;
+
+ TrueVal = CITrue->getValue().getZExtValue();
+ FalseVal = CIFalse->getValue().getZExtValue();
+
+ return true;
+}
+
+bool Instruction::extractProfTotalWeight(uint64_t &TotalVal) {
+ assert((getOpcode() == Instruction::Br ||
+ getOpcode() == Instruction::Select ||
+ getOpcode() == Instruction::Call ||
+ getOpcode() == Instruction::Invoke) &&
+ "Looking for branch weights on something besides branch");
+
+ TotalVal = 0;
+ auto *ProfileData = getMetadata(LLVMContext::MD_prof);
+ if (!ProfileData)
+ return false;
+
+ auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
+ if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+ return false;
+
+ TotalVal = 0;
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+ auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
+ if (!V)
+ return false;
+ TotalVal += V->getValue().getZExtValue();
+ }
+ return true;
+}
+
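A minimal usage sketch of the Instruction::extractProfMetadata() helper added above, reading !prof branch_weights from a conditional branch; the helper name branchIsLikelyTaken and its caller are hypothetical.

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Returns true when the attached profile says the true edge is hotter.
    // BI may or may not carry !prof !{!"branch_weights", i32 TW, i32 FW}.
    static bool branchIsLikelyTaken(BranchInst *BI) {
      uint64_t TrueWeight, FalseWeight;
      if (!BI->extractProfMetadata(TrueWeight, FalseWeight))
        return false; // No (or malformed) branch weights attached.
      return TrueWeight > FalseWeight;
    }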
void Instruction::clearMetadataHashEntries() {
assert(hasMetadataHashEntry() && "Caller should check");
getContext().pImpl->InstructionMetadata.erase(this);
setHasMetadataHashEntry(false);
}
-MDNode *Function::getMetadata(unsigned KindID) const {
- if (!hasMetadata())
- return nullptr;
- return getContext().pImpl->FunctionMetadata[this].lookup(KindID);
+void GlobalObject::getMetadata(unsigned KindID,
+ SmallVectorImpl<MDNode *> &MDs) const {
+ if (hasMetadata())
+ getContext().pImpl->GlobalObjectMetadata[this].get(KindID, MDs);
}
-MDNode *Function::getMetadata(StringRef Kind) const {
- if (!hasMetadata())
- return nullptr;
- return getMetadata(getContext().getMDKindID(Kind));
+void GlobalObject::getMetadata(StringRef Kind,
+ SmallVectorImpl<MDNode *> &MDs) const {
+ if (hasMetadata())
+ getMetadata(getContext().getMDKindID(Kind), MDs);
}
-void Function::setMetadata(unsigned KindID, MDNode *MD) {
- if (MD) {
- if (!hasMetadata())
- setHasMetadataHashEntry(true);
+void GlobalObject::addMetadata(unsigned KindID, MDNode &MD) {
+ if (!hasMetadata())
+ setHasMetadataHashEntry(true);
- getContext().pImpl->FunctionMetadata[this].set(KindID, *MD);
- return;
- }
+ getContext().pImpl->GlobalObjectMetadata[this].insert(KindID, MD);
+}
+
+void GlobalObject::addMetadata(StringRef Kind, MDNode &MD) {
+ addMetadata(getContext().getMDKindID(Kind), MD);
+}
+void GlobalObject::eraseMetadata(unsigned KindID) {
// Nothing to unset.
if (!hasMetadata())
return;
- auto &Store = getContext().pImpl->FunctionMetadata[this];
+ auto &Store = getContext().pImpl->GlobalObjectMetadata[this];
Store.erase(KindID);
if (Store.empty())
clearMetadata();
}
-void Function::setMetadata(StringRef Kind, MDNode *MD) {
- if (!MD && !hasMetadata())
- return;
- setMetadata(getContext().getMDKindID(Kind), MD);
-}
-
-void Function::getAllMetadata(
+void GlobalObject::getAllMetadata(
SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs) const {
MDs.clear();
if (!hasMetadata())
return;
- getContext().pImpl->FunctionMetadata[this].getAll(MDs);
+ getContext().pImpl->GlobalObjectMetadata[this].getAll(MDs);
}
-void Function::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) {
+void GlobalObject::clearMetadata() {
if (!hasMetadata())
return;
- if (KnownIDs.empty()) {
- clearMetadata();
- return;
- }
+ getContext().pImpl->GlobalObjectMetadata.erase(this);
+ setHasMetadataHashEntry(false);
+}
- SmallSet<unsigned, 5> KnownSet;
- KnownSet.insert(KnownIDs.begin(), KnownIDs.end());
+void GlobalObject::setMetadata(unsigned KindID, MDNode *N) {
+ eraseMetadata(KindID);
+ if (N)
+ addMetadata(KindID, *N);
+}
- auto &Store = getContext().pImpl->FunctionMetadata[this];
- assert(!Store.empty());
+void GlobalObject::setMetadata(StringRef Kind, MDNode *N) {
+ setMetadata(getContext().getMDKindID(Kind), N);
+}
- Store.remove_if([&KnownSet](const std::pair<unsigned, TrackingMDNodeRef> &I) {
- return !KnownSet.count(I.first);
- });
+MDNode *GlobalObject::getMetadata(unsigned KindID) const {
+ SmallVector<MDNode *, 1> MDs;
+ getMetadata(KindID, MDs);
+ assert(MDs.size() <= 1 && "Expected at most one metadata attachment");
+ if (MDs.empty())
+ return nullptr;
+ return MDs[0];
+}
- if (Store.empty())
- clearMetadata();
+MDNode *GlobalObject::getMetadata(StringRef Kind) const {
+ return getMetadata(getContext().getMDKindID(Kind));
}
-void Function::clearMetadata() {
- if (!hasMetadata())
- return;
- getContext().pImpl->FunctionMetadata.erase(this);
- setHasMetadataHashEntry(false);
+void GlobalObject::copyMetadata(const GlobalObject *Other, unsigned Offset) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ Other->getAllMetadata(MDs);
+ for (auto &MD : MDs) {
+ // We need to adjust the type metadata offset.
+ if (Offset != 0 && MD.first == LLVMContext::MD_type) {
+ auto *OffsetConst = cast<ConstantInt>(
+ cast<ConstantAsMetadata>(MD.second->getOperand(0))->getValue());
+ Metadata *TypeId = MD.second->getOperand(1);
+ auto *NewOffsetMD = ConstantAsMetadata::get(ConstantInt::get(
+ OffsetConst->getType(), OffsetConst->getValue() + Offset));
+ addMetadata(LLVMContext::MD_type,
+ *MDNode::get(getContext(), {NewOffsetMD, TypeId}));
+ continue;
+ }
+ addMetadata(MD.first, *MD.second);
+ }
+}
+
+void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) {
+ addMetadata(
+ LLVMContext::MD_type,
+ *MDTuple::get(getContext(),
+ {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+ Type::getInt64Ty(getContext()), Offset)),
+ TypeID}));
}
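A small sketch of the new GlobalObject attachment interface as seen through Function; the "my.note" kind string and the tagFunction helper are made up for illustration.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Attach a custom node to a function and read it back.
    static bool tagFunction(Function &F) {
      LLVMContext &Ctx = F.getContext();
      MDNode *Note = MDNode::get(Ctx, {MDString::get(Ctx, "visited")});
      F.setMetadata("my.note", Note);
      // setMetadata() erased any previous "my.note" attachment first, so
      // getMetadata() sees at most one node of this kind.
      return F.getMetadata("my.note") != nullptr;
    }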
void Function::setSubprogram(DISubprogram *SP) {
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index ac578d6dba0f..ae81b25b9517 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -13,12 +13,13 @@
#include "llvm/IR/Module.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
@@ -41,13 +42,14 @@ using namespace llvm;
template class llvm::SymbolTableListTraits<Function>;
template class llvm::SymbolTableListTraits<GlobalVariable>;
template class llvm::SymbolTableListTraits<GlobalAlias>;
+template class llvm::SymbolTableListTraits<GlobalIFunc>;
//===----------------------------------------------------------------------===//
// Primitive Module methods.
//
Module::Module(StringRef MID, LLVMContext &C)
- : Context(C), Materializer(), ModuleID(MID), DL("") {
+ : Context(C), Materializer(), ModuleID(MID), SourceFileName(MID), DL("") {
ValSymTab = new ValueSymbolTable();
NamedMDSymTab = new StringMap<NamedMDNode *>();
Context.addModule(this);
@@ -59,6 +61,7 @@ Module::~Module() {
GlobalList.clear();
FunctionList.clear();
AliasList.clear();
+ IFuncList.clear();
NamedMDList.clear();
delete ValSymTab;
delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
@@ -250,6 +253,10 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const {
return dyn_cast_or_null<GlobalAlias>(getNamedValue(Name));
}
+GlobalIFunc *Module::getNamedIFunc(StringRef Name) const {
+ return dyn_cast_or_null<GlobalIFunc>(getNamedValue(Name));
+}
+
/// getNamedMetadata - Return the first NamedMDNode in the module with the
/// specified name. This method returns null if a NamedMDNode with the
/// specified name is not found.
@@ -374,6 +381,19 @@ void Module::setDataLayout(const DataLayout &Other) { DL = Other; }
const DataLayout &Module::getDataLayout() const { return DL; }
+DICompileUnit *Module::debug_compile_units_iterator::operator*() const {
+ return cast<DICompileUnit>(CUs->getOperand(Idx));
+}
+DICompileUnit *Module::debug_compile_units_iterator::operator->() const {
+ return cast<DICompileUnit>(CUs->getOperand(Idx));
+}
+
+void Module::debug_compile_units_iterator::SkipNoDebugCUs() {
+ while (CUs && (Idx < CUs->getNumOperands()) &&
+ ((*this)->getEmissionKind() == DICompileUnit::NoDebug))
+ ++Idx;
+}
+
//===----------------------------------------------------------------------===//
// Methods to control the materialization of GlobalValues in the Module.
//
@@ -438,6 +458,9 @@ void Module::dropAllReferences() {
for (GlobalAlias &GA : aliases())
GA.dropAllReferences();
+
+ for (GlobalIFunc &GIF : ifuncs())
+ GIF.dropAllReferences();
}
unsigned Module::getDwarfVersion() const {
@@ -464,7 +487,7 @@ PICLevel::Level Module::getPICLevel() const {
auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("PIC Level"));
if (!Val)
- return PICLevel::Default;
+ return PICLevel::NotPIC;
return static_cast<PICLevel::Level>(
cast<ConstantInt>(Val->getValue())->getZExtValue());
@@ -474,14 +497,39 @@ void Module::setPICLevel(PICLevel::Level PL) {
addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL);
}
-void Module::setMaximumFunctionCount(uint64_t Count) {
- addModuleFlag(ModFlagBehavior::Error, "MaxFunctionCount", Count);
-}
+PIELevel::Level Module::getPIELevel() const {
+ auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("PIE Level"));
-Optional<uint64_t> Module::getMaximumFunctionCount() {
- auto *Val =
- cast_or_null<ConstantAsMetadata>(getModuleFlag("MaxFunctionCount"));
if (!Val)
- return None;
- return cast<ConstantInt>(Val->getValue())->getZExtValue();
+ return PIELevel::Default;
+
+ return static_cast<PIELevel::Level>(
+ cast<ConstantInt>(Val->getValue())->getZExtValue());
+}
+
+void Module::setPIELevel(PIELevel::Level PL) {
+ addModuleFlag(ModFlagBehavior::Error, "PIE Level", PL);
+}
+
+void Module::setProfileSummary(Metadata *M) {
+ addModuleFlag(ModFlagBehavior::Error, "ProfileSummary", M);
+}
+
+Metadata *Module::getProfileSummary() {
+ return getModuleFlag("ProfileSummary");
+}
+
+GlobalVariable *llvm::collectUsedGlobalVariables(
+ const Module &M, SmallPtrSetImpl<GlobalValue *> &Set, bool CompilerUsed) {
+ const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
+ GlobalVariable *GV = M.getGlobalVariable(Name);
+ if (!GV || !GV->hasInitializer())
+ return GV;
+
+ const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+ for (Value *Op : Init->operands()) {
+ GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
+ Set.insert(G);
+ }
+ return GV;
}
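A sketch of how the new collectUsedGlobalVariables() entry point might be consumed; canDiscard is a hypothetical helper.

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // A global may only be discarded if it is named in neither llvm.used nor
    // llvm.compiler.used.
    static bool canDiscard(const Module &M, GlobalValue *GV) {
      SmallPtrSet<GlobalValue *, 8> Used;
      collectUsedGlobalVariables(M, Used, /*CompilerUsed=*/false); // llvm.used
      collectUsedGlobalVariables(M, Used, /*CompilerUsed=*/true);  // llvm.compiler.used
      return !Used.count(GV);
    }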
diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
new file mode 100644
index 000000000000..6107cf40a08d
--- /dev/null
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -0,0 +1,107 @@
+//===-- ModuleSummaryIndex.cpp - Module Summary Index ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the module index and summary classes for the
+// IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/ADT/StringMap.h"
+using namespace llvm;
+
+// Create the combined module index/summary from multiple
+// per-module instances.
+void ModuleSummaryIndex::mergeFrom(std::unique_ptr<ModuleSummaryIndex> Other,
+ uint64_t NextModuleId) {
+
+ StringRef ModPath;
+ for (auto &OtherGlobalValSummaryLists : *Other) {
+ GlobalValue::GUID ValueGUID = OtherGlobalValSummaryLists.first;
+ GlobalValueSummaryList &List = OtherGlobalValSummaryLists.second;
+
+ // Assert that the value summary list only has one entry, since we shouldn't
+ // have duplicate names within a single per-module index.
+ assert(List.size() == 1);
+ std::unique_ptr<GlobalValueSummary> Summary = std::move(List.front());
+
+ // Add the module path string ref for this module if we haven't already
+ // saved a reference to it.
+ if (ModPath.empty()) {
+ auto Path = Summary->modulePath();
+ ModPath = addModulePath(Path, NextModuleId, Other->getModuleHash(Path))
+ ->first();
+ } else
+ assert(ModPath == Summary->modulePath() &&
+ "Each module in the combined map should have a unique ID");
+
+ // Note the module path string ref was copied above and is still owned by
+ // the original per-module index. Reset it to the new module path
+ // string reference owned by the combined index.
+ Summary->setModulePath(ModPath);
+
+ // Add new value summary to existing list. There may be duplicates when
+ // combining GlobalValueMap entries, due to COMDAT values. Any local
+ // values were given unique global IDs.
+ addGlobalValueSummary(ValueGUID, std::move(Summary));
+ }
+}
+
+void ModuleSummaryIndex::removeEmptySummaryEntries() {
+ for (auto MI = begin(), MIE = end(); MI != MIE;) {
+ // Only expect this to be called on a per-module index, which has a single
+ // entry per value entry list.
+ assert(MI->second.size() == 1);
+ if (!MI->second[0])
+ MI = GlobalValueMap.erase(MI);
+ else
+ ++MI;
+ }
+}
+
+// Collect, for the given module, the list of functions it defines
+// (GUID -> Summary).
+void ModuleSummaryIndex::collectDefinedFunctionsForModule(
+ StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const {
+ for (auto &GlobalList : *this) {
+ auto GUID = GlobalList.first;
+ for (auto &GlobSummary : GlobalList.second) {
+ auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get());
+ if (!Summary)
+ // Ignore global variables, focus on functions
+ continue;
+ // Ignore summaries from other modules.
+ if (Summary->modulePath() != ModulePath)
+ continue;
+ GVSummaryMap[GUID] = Summary;
+ }
+ }
+}
+
+// Collect, for each module, the list of functions it defines (GUID -> Summary).
+void ModuleSummaryIndex::collectDefinedGVSummariesPerModule(
+ StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries) const {
+ for (auto &GlobalList : *this) {
+ auto GUID = GlobalList.first;
+ for (auto &Summary : GlobalList.second) {
+ ModuleToDefinedGVSummaries[Summary->modulePath()][GUID] = Summary.get();
+ }
+ }
+}
+
+GlobalValueSummary *
+ModuleSummaryIndex::getGlobalValueSummary(uint64_t ValueGUID,
+ bool PerModuleIndex) const {
+ auto SummaryList = findGlobalValueSummaryList(ValueGUID);
+ assert(SummaryList != end() && "GlobalValue not found in index");
+ assert((!PerModuleIndex || SummaryList->second.size() == 1) &&
+ "Expected a single entry per global value in per-module index");
+ auto &Summary = SummaryList->second[0];
+ return Summary.get();
+}
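A sketch of walking a combined index with collectDefinedGVSummariesPerModule(); the helper name is hypothetical and Index is assumed to be an already-built combined index (e.g. the result of repeated mergeFrom() calls).

    #include "llvm/ADT/StringMap.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Print how many global value summaries each module defines.
    static void printPerModuleCounts(const ModuleSummaryIndex &Index) {
      StringMap<GVSummaryMapTy> PerModule;
      Index.collectDefinedGVSummariesPerModule(PerModule);
      for (const auto &Entry : PerModule)
        errs() << Entry.getKey() << ": " << Entry.getValue().size()
               << " summaries\n";
    }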
diff --git a/lib/IR/Operator.cpp b/lib/IR/Operator.cpp
index 77dc680af110..8a94053a72c5 100644
--- a/lib/IR/Operator.cpp
+++ b/lib/IR/Operator.cpp
@@ -12,6 +12,12 @@ Type *GEPOperator::getSourceElementType() const {
return cast<GetElementPtrConstantExpr>(this)->getSourceElementType();
}
+Type *GEPOperator::getResultElementType() const {
+ if (auto *I = dyn_cast<GetElementPtrInst>(this))
+ return I->getResultElementType();
+ return cast<GetElementPtrConstantExpr>(this)->getResultElementType();
+}
+
bool GEPOperator::accumulateConstantOffset(const DataLayout &DL,
APInt &Offset) const {
assert(Offset.getBitWidth() ==
diff --git a/lib/IR/OptBisect.cpp b/lib/IR/OptBisect.cpp
new file mode 100644
index 000000000000..e9574ca81261
--- /dev/null
+++ b/lib/IR/OptBisect.cpp
@@ -0,0 +1,120 @@
+//===-- OptBisect.cpp - LLVM Bisect support ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements support for bisecting optimizations based on a
+/// command line option.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OptBisect.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden,
+ cl::init(INT_MAX), cl::Optional,
+ cl::desc("Maximum optimization to perform"));
+
+OptBisect::OptBisect() {
+ BisectEnabled = OptBisectLimit != INT_MAX;
+}
+
+static void printPassMessage(const StringRef &Name, int PassNum,
+ StringRef TargetDesc, bool Running) {
+ StringRef Status = Running ? "" : "NOT ";
+ errs() << "BISECT: " << Status << "running pass "
+ << "(" << PassNum << ") " << Name << " on " << TargetDesc << "\n";
+}
+
+static void printCaseMessage(int CaseNum, StringRef Msg, bool Running) {
+ if (Running)
+ errs() << "BISECT: running case (";
+ else
+ errs() << "BISECT: NOT running case (";
+ errs() << CaseNum << "): " << Msg << "\n";
+}
+
+static std::string getDescription(const Module &M) {
+ return "module (" + M.getName().str() + ")";
+}
+
+static std::string getDescription(const Function &F) {
+ return "function (" + F.getName().str() + ")";
+}
+
+static std::string getDescription(const BasicBlock &BB) {
+ return "basic block (" + BB.getName().str() + ") in function (" +
+ BB.getParent()->getName().str() + ")";
+}
+
+static std::string getDescription(const Loop &L) {
+ // FIXME: I'd like to be able to provide a better description here, but
+ // calling L->getHeader() would introduce a new dependency on the
+ // LLVMCore library.
+ return "loop";
+}
+
+static std::string getDescription(const CallGraphSCC &SCC) {
+ std::string Desc = "SCC (";
+ bool First = true;
+ for (CallGraphNode *CGN : SCC) {
+ if (First)
+ First = false;
+ else
+ Desc += ", ";
+ Function *F = CGN->getFunction();
+ if (F)
+ Desc += F->getName();
+ else
+ Desc += "<<null function>>";
+ }
+ Desc += ")";
+ return Desc;
+}
+
+// Force instantiations.
+template bool OptBisect::shouldRunPass(const Pass *, const Module &);
+template bool OptBisect::shouldRunPass(const Pass *, const Function &);
+template bool OptBisect::shouldRunPass(const Pass *, const BasicBlock &);
+template bool OptBisect::shouldRunPass(const Pass *, const Loop &);
+template bool OptBisect::shouldRunPass(const Pass *, const CallGraphSCC &);
+
+template <class UnitT>
+bool OptBisect::shouldRunPass(const Pass *P, const UnitT &U) {
+ if (!BisectEnabled)
+ return true;
+ return checkPass(P->getPassName(), getDescription(U));
+}
+
+bool OptBisect::checkPass(const StringRef PassName,
+ const StringRef TargetDesc) {
+ assert(BisectEnabled);
+
+ int CurBisectNum = ++LastBisectNum;
+ bool ShouldRun = (OptBisectLimit == -1 || CurBisectNum <= OptBisectLimit);
+ printPassMessage(PassName, CurBisectNum, TargetDesc, ShouldRun);
+ return ShouldRun;
+}
+
+bool OptBisect::shouldRunCase(const Twine &Msg) {
+ if (!BisectEnabled)
+ return true;
+ int CurFuelNum = ++LastBisectNum;
+ bool ShouldRun = (OptBisectLimit == -1 || CurFuelNum <= OptBisectLimit);
+ printCaseMessage(CurFuelNum, Msg.str(), ShouldRun);
+ return ShouldRun;
+}
+
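A sketch of gating an individual transformation ("case") on the -opt-bisect-limit counter via LLVMContext::getOptBisect(); the helper and the message string are hypothetical.

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/OptBisect.h"
    using namespace llvm;

    // The message is free-form and only appears in the BISECT trace output.
    static bool mayApplyPeephole(LLVMContext &Ctx) {
      return Ctx.getOptBisect().shouldRunCase("hypothetical peephole rewrite");
    }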
diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp
index df45460a6cca..69299fea7634 100644
--- a/lib/IR/Pass.cpp
+++ b/lib/IR/Pass.cpp
@@ -17,6 +17,8 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassNameParser.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OptBisect.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -45,6 +47,10 @@ PassManagerType ModulePass::getPotentialPassManagerType() const {
return PMT_ModulePassManager;
}
+bool ModulePass::skipModule(Module &M) const {
+ return !M.getContext().getOptBisect().shouldRunPass(this, M);
+}
+
bool Pass::mustPreserveAnalysisID(char &AID) const {
return Resolver->getAnalysisIfAvailable(&AID, true) != nullptr;
}
@@ -113,7 +119,7 @@ void Pass::print(raw_ostream &O,const Module*) const {
}
// dump - call print(cerr);
-void Pass::dump() const {
+LLVM_DUMP_METHOD void Pass::dump() const {
print(dbgs(), nullptr);
}
@@ -140,10 +146,13 @@ PassManagerType FunctionPass::getPotentialPassManagerType() const {
return PMT_FunctionPassManager;
}
-bool FunctionPass::skipOptnoneFunction(const Function &F) const {
+bool FunctionPass::skipFunction(const Function &F) const {
+ if (!F.getContext().getOptBisect().shouldRunPass(this, F))
+ return true;
+
if (F.hasFnAttribute(Attribute::OptimizeNone)) {
- DEBUG(dbgs() << "Skipping pass '" << getPassName()
- << "' on function " << F.getName() << "\n");
+ DEBUG(dbgs() << "Skipping pass '" << getPassName() << "' on function "
+ << F.getName() << "\n");
return true;
}
return false;
@@ -168,9 +177,13 @@ bool BasicBlockPass::doFinalization(Function &) {
return false;
}
-bool BasicBlockPass::skipOptnoneFunction(const BasicBlock &BB) const {
+bool BasicBlockPass::skipBasicBlock(const BasicBlock &BB) const {
const Function *F = BB.getParent();
- if (F && F->hasFnAttribute(Attribute::OptimizeNone)) {
+ if (!F)
+ return false;
+ if (!F->getContext().getOptBisect().shouldRunPass(this, BB))
+ return true;
+ if (F->hasFnAttribute(Attribute::OptimizeNone)) {
// Report this only once per function.
if (&BB == &F->getEntryBlock())
DEBUG(dbgs() << "Skipping pass '" << getPassName()
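A sketch of a legacy FunctionPass using the renamed skipFunction() hook so it honors both optnone and -opt-bisect-limit; the pass itself is made up and registration boilerplate is omitted.

    #include "llvm/IR/Function.h"
    #include "llvm/Pass.h"
    using namespace llvm;

    namespace {
    struct HypotheticalCleanup : public FunctionPass {
      static char ID;
      HypotheticalCleanup() : FunctionPass(ID) {}

      bool runOnFunction(Function &F) override {
        if (skipFunction(F)) // was skipOptnoneFunction(F) before this change
          return false;
        // ... the actual transformation would go here ...
        return false;
      }
    };
    char HypotheticalCleanup::ID = 0;
    } // end anonymous namespace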
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index a5f407c00e8e..8563a4019a2b 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -7,37 +7,18 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/IR/PassManager.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
using namespace llvm;
-char FunctionAnalysisManagerModuleProxy::PassID;
-
-FunctionAnalysisManagerModuleProxy::Result
-FunctionAnalysisManagerModuleProxy::run(Module &M) {
- assert(FAM->empty() && "Function analyses ran prior to the module proxy!");
- return Result(*FAM);
-}
-
-FunctionAnalysisManagerModuleProxy::Result::~Result() {
- // Clear out the analysis manager if we're being destroyed -- it means we
- // didn't even see an invalidate call when we got invalidated.
- FAM->clear();
+// Explicit template instantiations for core template typedefs.
+namespace llvm {
+template class PassManager<Module>;
+template class PassManager<Function>;
+template class AnalysisManager<Module>;
+template class AnalysisManager<Function>;
+template class InnerAnalysisManagerProxy<FunctionAnalysisManager, Module>;
+template class OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>;
}
-
-bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
- Module &M, const PreservedAnalyses &PA) {
- // If this proxy isn't marked as preserved, then we can't even invalidate
- // individual function analyses, there may be an invalid set of Function
- // objects in the cache making it impossible to incrementally preserve them.
- // Just clear the entire manager.
- if (!PA.preserved(ID()))
- FAM->clear();
-
- // Return false to indicate that this result is still a valid proxy.
- return false;
-}
-
-char ModuleAnalysisManagerFunctionProxy::PassID;
diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp
index b879fef3f4a7..09b17ba308dd 100644
--- a/lib/IR/PassRegistry.cpp
+++ b/lib/IR/PassRegistry.cpp
@@ -13,12 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/PassRegistry.h"
-#include "llvm/IR/Function.h"
#include "llvm/PassSupport.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/RWMutex.h"
-#include <vector>
using namespace llvm;
diff --git a/lib/IR/ProfileSummary.cpp b/lib/IR/ProfileSummary.cpp
new file mode 100644
index 000000000000..2b24d1251121
--- /dev/null
+++ b/lib/IR/ProfileSummary.cpp
@@ -0,0 +1,191 @@
+//===-- ProfileSummary.cpp - Profile summary support ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for converting profile summary data from/to
+// metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+const char *ProfileSummary::KindStr[2] = {"InstrProf", "SampleProfile"};
+
+// Return an MDTuple with two elements. The first element is a string Key and
+// the second is a uint64_t Value.
+static Metadata *getKeyValMD(LLVMContext &Context, const char *Key,
+ uint64_t Val) {
+ Type *Int64Ty = Type::getInt64Ty(Context);
+ Metadata *Ops[2] = {MDString::get(Context, Key),
+ ConstantAsMetadata::get(ConstantInt::get(Int64Ty, Val))};
+ return MDTuple::get(Context, Ops);
+}
+
+// Return an MDTuple with two elements. The first element is a string Key and
+// the second is a string Value.
+static Metadata *getKeyValMD(LLVMContext &Context, const char *Key,
+ const char *Val) {
+ Metadata *Ops[2] = {MDString::get(Context, Key), MDString::get(Context, Val)};
+ return MDTuple::get(Context, Ops);
+}
+
+// This returns an MDTuple representing the detailed summary. The tuple has two
+// elements: a string "DetailedSummary" and an MDTuple representing the value
+// of the detailed summary. Each element of this tuple is again an MDTuple whose
+// elements are the (Cutoff, MinCount, NumCounts) triplet of the
+// DetailedSummaryEntry.
+Metadata *ProfileSummary::getDetailedSummaryMD(LLVMContext &Context) {
+ std::vector<Metadata *> Entries;
+ Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int64Ty = Type::getInt64Ty(Context);
+ for (auto &Entry : DetailedSummary) {
+ Metadata *EntryMD[3] = {
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Entry.Cutoff)),
+ ConstantAsMetadata::get(ConstantInt::get(Int64Ty, Entry.MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Entry.NumCounts))};
+ Entries.push_back(MDTuple::get(Context, EntryMD));
+ }
+ Metadata *Ops[2] = {MDString::get(Context, "DetailedSummary"),
+ MDTuple::get(Context, Entries)};
+ return MDTuple::get(Context, Ops);
+}
+
+// This returns an MDTuple representing this ProfileSummary object. The first
+// element of the tuple is another MDTuple of two elements: a string
+// "ProfileFormat" and a string naming the format ("InstrProf" or
+// "SampleProfile"). The remaining elements are (Key, Value) pairs for the
+// summary fields, followed by the detailed summary tuple built by
+// getDetailedSummaryMD.
+Metadata *ProfileSummary::getMD(LLVMContext &Context) {
+ std::vector<Metadata *> Components;
+ Components.push_back(getKeyValMD(Context, "ProfileFormat", KindStr[PSK]));
+
+ Components.push_back(getKeyValMD(Context, "TotalCount", getTotalCount()));
+ Components.push_back(getKeyValMD(Context, "MaxCount", getMaxCount()));
+ Components.push_back(
+ getKeyValMD(Context, "MaxInternalCount", getMaxInternalCount()));
+ Components.push_back(
+ getKeyValMD(Context, "MaxFunctionCount", getMaxFunctionCount()));
+ Components.push_back(getKeyValMD(Context, "NumCounts", getNumCounts()));
+ Components.push_back(getKeyValMD(Context, "NumFunctions", getNumFunctions()));
+ Components.push_back(getDetailedSummaryMD(Context));
+ return MDTuple::get(Context, Components);
+}
+
+// Parse an MDTuple representing a (Key, Val) pair.
+static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val) {
+ if (!MD)
+ return false;
+ if (MD->getNumOperands() != 2)
+ return false;
+ MDString *KeyMD = dyn_cast<MDString>(MD->getOperand(0));
+ ConstantAsMetadata *ValMD = dyn_cast<ConstantAsMetadata>(MD->getOperand(1));
+ if (!KeyMD || !ValMD)
+ return false;
+ if (!KeyMD->getString().equals(Key))
+ return false;
+ Val = cast<ConstantInt>(ValMD->getValue())->getZExtValue();
+ return true;
+}
+
+// Check if an MDTuple represents a (Key, Val) pair.
+static bool isKeyValuePair(MDTuple *MD, const char *Key, const char *Val) {
+ if (!MD || MD->getNumOperands() != 2)
+ return false;
+ MDString *KeyMD = dyn_cast<MDString>(MD->getOperand(0));
+ MDString *ValMD = dyn_cast<MDString>(MD->getOperand(1));
+ if (!KeyMD || !ValMD)
+ return false;
+ if (!KeyMD->getString().equals(Key) || !ValMD->getString().equals(Val))
+ return false;
+ return true;
+}
+
+// Parse an MDTuple representing the detailed summary.
+static bool getSummaryFromMD(MDTuple *MD, SummaryEntryVector &Summary) {
+ if (!MD || MD->getNumOperands() != 2)
+ return false;
+ MDString *KeyMD = dyn_cast<MDString>(MD->getOperand(0));
+ if (!KeyMD || !KeyMD->getString().equals("DetailedSummary"))
+ return false;
+ MDTuple *EntriesMD = dyn_cast<MDTuple>(MD->getOperand(1));
+ if (!EntriesMD)
+ return false;
+ for (auto &&MDOp : EntriesMD->operands()) {
+ MDTuple *EntryMD = dyn_cast<MDTuple>(MDOp);
+ if (!EntryMD || EntryMD->getNumOperands() != 3)
+ return false;
+ ConstantAsMetadata *Op0 =
+ dyn_cast<ConstantAsMetadata>(EntryMD->getOperand(0));
+ ConstantAsMetadata *Op1 =
+ dyn_cast<ConstantAsMetadata>(EntryMD->getOperand(1));
+ ConstantAsMetadata *Op2 =
+ dyn_cast<ConstantAsMetadata>(EntryMD->getOperand(2));
+
+ if (!Op0 || !Op1 || !Op2)
+ return false;
+ Summary.emplace_back(cast<ConstantInt>(Op0->getValue())->getZExtValue(),
+ cast<ConstantInt>(Op1->getValue())->getZExtValue(),
+ cast<ConstantInt>(Op2->getValue())->getZExtValue());
+ }
+ return true;
+}
+
+ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
+ if (!MD)
+ return nullptr;
+ if (!isa<MDTuple>(MD))
+ return nullptr;
+ MDTuple *Tuple = cast<MDTuple>(MD);
+ if (Tuple->getNumOperands() != 8)
+ return nullptr;
+
+ auto &FormatMD = Tuple->getOperand(0);
+ ProfileSummary::Kind SummaryKind;
+ if (isKeyValuePair(dyn_cast_or_null<MDTuple>(FormatMD), "ProfileFormat",
+ "SampleProfile"))
+ SummaryKind = PSK_Sample;
+ else if (isKeyValuePair(dyn_cast_or_null<MDTuple>(FormatMD), "ProfileFormat",
+ "InstrProf"))
+ SummaryKind = PSK_Instr;
+ else
+ return nullptr;
+
+ uint64_t NumCounts, TotalCount, NumFunctions, MaxFunctionCount, MaxCount,
+ MaxInternalCount;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(1)), "TotalCount",
+ TotalCount))
+ return nullptr;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(2)), "MaxCount", MaxCount))
+ return nullptr;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(3)), "MaxInternalCount",
+ MaxInternalCount))
+ return nullptr;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(4)), "MaxFunctionCount",
+ MaxFunctionCount))
+ return nullptr;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(5)), "NumCounts", NumCounts))
+ return nullptr;
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(6)), "NumFunctions",
+ NumFunctions))
+ return nullptr;
+
+ SummaryEntryVector Summary;
+ if (!getSummaryFromMD(dyn_cast<MDTuple>(Tuple->getOperand(7)), Summary))
+ return nullptr;
+ return new ProfileSummary(SummaryKind, Summary, TotalCount, MaxCount,
+ MaxInternalCount, MaxFunctionCount, NumCounts,
+ NumFunctions);
+}
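A sketch of round-tripping a summary through module metadata with the new Module::setProfileSummary()/getProfileSummary() hooks; roundTrip is a hypothetical helper, PS is assumed to come from a profile reader, and the caller owns the heap-allocated result of getFromMD().

    #include "llvm/IR/Module.h"
    #include "llvm/IR/ProfileSummary.h"
    using namespace llvm;

    static ProfileSummary *roundTrip(Module &M, ProfileSummary &PS) {
      M.setProfileSummary(PS.getMD(M.getContext()));
      return ProfileSummary::getFromMD(M.getProfileSummary());
    }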
diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp
index 27a990eaff81..63be1e780d81 100644
--- a/lib/IR/Statepoint.cpp
+++ b/lib/IR/Statepoint.cpp
@@ -7,55 +7,68 @@
//
//===----------------------------------------------------------------------===//
//
-//
+// This file contains some utility functions to help recognize gc.statepoint
+// intrinsics.
+//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Statepoint.h"
-#include "llvm/Support/CommandLine.h"
-using namespace std;
+#include "llvm/IR/Function.h"
+
using namespace llvm;
-bool llvm::isStatepoint(const ImmutableCallSite &CS) {
- if (!CS.getInstruction()) {
- // This is not a call site
- return false;
- }
+static const Function *getCalledFunction(ImmutableCallSite CS) {
+ if (!CS.getInstruction())
+ return nullptr;
+ return CS.getCalledFunction();
+}
- const Function *F = CS.getCalledFunction();
- return (F && F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint);
+bool llvm::isStatepoint(ImmutableCallSite CS) {
+ if (auto *F = getCalledFunction(CS))
+ return F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint;
+ return false;
}
-bool llvm::isStatepoint(const Value *inst) {
- if (isa<InvokeInst>(inst) || isa<CallInst>(inst)) {
- ImmutableCallSite CS(inst);
+
+bool llvm::isStatepoint(const Value *V) {
+ if (auto CS = ImmutableCallSite(V))
return isStatepoint(CS);
- }
return false;
}
-bool llvm::isStatepoint(const Value &inst) {
- return isStatepoint(&inst);
+
+bool llvm::isStatepoint(const Value &V) {
+ return isStatepoint(&V);
}
-bool llvm::isGCRelocate(const ImmutableCallSite &CS) {
+bool llvm::isGCRelocate(ImmutableCallSite CS) {
return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction());
}
-bool llvm::isGCResult(const ImmutableCallSite &CS) {
- if (!CS.getInstruction()) {
- // This is not a call site
- return false;
- }
+bool llvm::isGCResult(ImmutableCallSite CS) {
+ return CS.getInstruction() && isa<GCResultInst>(CS.getInstruction());
+}
- return isGCResult(CS.getInstruction());
+bool llvm::isStatepointDirectiveAttr(Attribute Attr) {
+ return Attr.hasAttribute("statepoint-id") ||
+ Attr.hasAttribute("statepoint-num-patch-bytes");
}
-bool llvm::isGCResult(const Value *inst) {
- if (const CallInst *call = dyn_cast<CallInst>(inst)) {
- if (Function *F = call->getCalledFunction()) {
- return F->getIntrinsicID() == Intrinsic::experimental_gc_result;
- }
- }
- return false;
+
+StatepointDirectives llvm::parseStatepointDirectivesFromAttrs(AttributeSet AS) {
+ StatepointDirectives Result;
+
+ Attribute AttrID =
+ AS.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+ uint64_t StatepointID;
+ if (AttrID.isStringAttribute())
+ if (!AttrID.getValueAsString().getAsInteger(10, StatepointID))
+ Result.StatepointID = StatepointID;
+
+ uint32_t NumPatchBytes;
+ Attribute AttrNumPatchBytes = AS.getAttribute(AttributeSet::FunctionIndex,
+ "statepoint-num-patch-bytes");
+ if (AttrNumPatchBytes.isStringAttribute())
+ if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes))
+ Result.NumPatchBytes = NumPatchBytes;
+
+ return Result;
}
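A sketch of querying the new statepoint directive attributes; hasExplicitStatepointID is hypothetical, and it assumes the StatepointDirectives fields are llvm::Optional values as declared in Statepoint.h.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Statepoint.h"
    using namespace llvm;

    // AS would typically be CS.getAttributes() at a gc.statepoint call site.
    static bool hasExplicitStatepointID(AttributeSet AS) {
      StatepointDirectives SD = parseStatepointDirectivesFromAttrs(AS);
      return SD.StatepointID.hasValue();
    }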
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 4c1baf52a58f..5c97a4ea0705 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -41,22 +41,16 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
}
}
-/// getScalarType - If this is a vector type, return the element type,
-/// otherwise return this.
Type *Type::getScalarType() const {
if (auto *VTy = dyn_cast<VectorType>(this))
return VTy->getElementType();
return const_cast<Type*>(this);
}
-/// isIntegerTy - Return true if this is an IntegerType of the specified width.
bool Type::isIntegerTy(unsigned Bitwidth) const {
return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
}
-// canLosslesslyBitCastTo - Return true if this type can be converted to
-// 'Ty' without any reinterpretation of bits. For example, i8* to i32*.
-//
bool Type::canLosslesslyBitCastTo(Type *Ty) const {
// Identity cast means no change so return true
if (this == Ty)
@@ -126,16 +120,10 @@ unsigned Type::getPrimitiveSizeInBits() const {
}
}
-/// getScalarSizeInBits - If this is a vector type, return the
-/// getPrimitiveSizeInBits value for the element type. Otherwise return the
-/// getPrimitiveSizeInBits value for this type.
unsigned Type::getScalarSizeInBits() const {
return getScalarType()->getPrimitiveSizeInBits();
}
-/// getFPMantissaWidth - Return the width of the mantissa of this type. This
-/// is only valid on floating point types. If the FP type does not
-/// have a stable mantissa (e.g. ppc long double), this method returns -1.
int Type::getFPMantissaWidth() const {
if (auto *VTy = dyn_cast<VectorType>(this))
return VTy->getElementType()->getFPMantissaWidth();
@@ -149,9 +137,6 @@ int Type::getFPMantissaWidth() const {
return -1;
}
-/// isSizedDerivedType - Derived types like structures and arrays are sized
-/// iff all of the members of the type are sized as well. Since asking for
-/// their size is relatively uncommon, move this operation out of line.
bool Type::isSizedDerivedType(SmallPtrSetImpl<Type*> *Visited) const {
if (auto *ATy = dyn_cast<ArrayType>(this))
return ATy->getElementType()->isSized(Visited);
@@ -302,7 +287,7 @@ FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params,
NumContainedTys = Params.size() + 1; // + 1 for result type
}
-// FunctionType::get - The factory function for the FunctionType class.
+// This is the factory function for the FunctionType class.
FunctionType *FunctionType::get(Type *ReturnType,
ArrayRef<Type*> Params, bool isVarArg) {
LLVMContextImpl *pImpl = ReturnType->getContext().pImpl;
@@ -327,15 +312,11 @@ FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
return get(Result, None, isVarArg);
}
-/// isValidReturnType - Return true if the specified type is valid as a return
-/// type.
bool FunctionType::isValidReturnType(Type *RetTy) {
return !RetTy->isFunctionTy() && !RetTy->isLabelTy() &&
!RetTy->isMetadataTy();
}
-/// isValidArgumentType - Return true if the specified type is valid as an
-/// argument type.
bool FunctionType::isValidArgumentType(Type *ArgTy) {
return ArgTy->isFirstClassType();
}
@@ -552,8 +533,6 @@ bool StructType::isValidElementType(Type *ElemTy) {
!ElemTy->isTokenTy();
}
-/// isLayoutIdentical - Return true if this is layout identical to the
-/// specified struct.
bool StructType::isLayoutIdentical(StructType *Other) const {
if (this == Other) return true;
@@ -563,8 +542,6 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
return elements() == Other->elements();
}
-/// getTypeByName - Return the type with the specified name, or null if there
-/// is none by that name.
StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index b5bdab0865b6..dc4c1cffb20c 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -41,23 +41,19 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
// Get types from functions.
SmallVector<std::pair<unsigned, MDNode *>, 4> MDForInst;
- for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
- incorporateType(FI->getType());
+ for (const Function &FI : M) {
+ incorporateType(FI.getType());
- for (const Use &U : FI->operands())
+ for (const Use &U : FI.operands())
incorporateValue(U.get());
// First incorporate the arguments.
- for (Function::const_arg_iterator AI = FI->arg_begin(),
- AE = FI->arg_end(); AI != AE; ++AI)
+ for (Function::const_arg_iterator AI = FI.arg_begin(), AE = FI.arg_end();
+ AI != AE; ++AI)
incorporateValue(&*AI);
- for (Function::const_iterator BB = FI->begin(), E = FI->end();
- BB != E;++BB)
- for (BasicBlock::const_iterator II = BB->begin(),
- E = BB->end(); II != E; ++II) {
- const Instruction &I = *II;
-
+ for (const BasicBlock &BB : FI)
+ for (const Instruction &I : BB) {
// Incorporate the type of the instruction.
incorporateType(I.getType());
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 4d224a041349..6ae37fa6f15a 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -34,6 +34,7 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -58,6 +59,8 @@ Value::Value(Type *ty, unsigned scid)
(SubclassID < ConstantFirstVal || SubclassID > ConstantLastVal))
assert((VTy->isFirstClassType() || VTy->isVoidTy()) &&
"Cannot create non-first-class values except for constants!");
+ static_assert(sizeof(Value) == 3 * sizeof(void *) + 2 * sizeof(unsigned),
+ "Value too big");
}
Value::~Value() {
@@ -195,6 +198,10 @@ StringRef Value::getName() const {
}
void Value::setNameImpl(const Twine &NewName) {
+ // Fast-path: LLVMContext can be set to strip out non-GlobalValue names
+ if (getContext().shouldDiscardValueNames() && !isa<GlobalValue>(this))
+ return;
+
// Fast path for common IRBuilder case of setName("") when there is no name.
if (NewName.isTriviallyEmpty() && !hasName())
return;
@@ -358,7 +365,7 @@ static bool contains(Value *Expr, Value *V) {
SmallPtrSet<ConstantExpr *, 4> Cache;
return contains(Cache, CE, C);
}
-#endif
+#endif // NDEBUG
void Value::replaceAllUsesWith(Value *New) {
assert(New && "Value::replaceAllUsesWith(<null>) is invalid!");
@@ -379,7 +386,7 @@ void Value::replaceAllUsesWith(Value *New) {
// constant because they are uniqued.
if (auto *C = dyn_cast<Constant>(U.getUser())) {
if (!isa<GlobalValue>(C)) {
- C->handleOperandChange(this, New, &U);
+ C->handleOperandChange(this, New);
continue;
}
}
@@ -410,7 +417,6 @@ void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
continue;
U.set(New);
}
- return;
}
namespace {
@@ -454,10 +460,16 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
- if (StripKind == PSK_ZeroIndices || GA->mayBeOverridden())
+ if (StripKind == PSK_ZeroIndices || GA->isInterposable())
return V;
V = GA->getAliasee();
} else {
+ if (auto CS = CallSite(V))
+ if (Value *RV = CS.getReturnedArgOperand()) {
+ V = RV;
+ continue;
+ }
+
return V;
}
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
@@ -465,7 +477,7 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
return V;
}
-} // namespace
+} // end anonymous namespace
Value *Value::stripPointerCasts() {
return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliases>(this);
@@ -507,6 +519,12 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
V = GA->getAliasee();
} else {
+ if (auto CS = CallSite(V))
+ if (Value *RV = CS.getReturnedArgOperand()) {
+ V = RV;
+ continue;
+ }
+
return V;
}
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
@@ -519,6 +537,104 @@ Value *Value::stripInBoundsOffsets() {
return stripPointerCastsAndOffsets<PSK_InBounds>(this);
}
+unsigned Value::getPointerDereferenceableBytes(const DataLayout &DL,
+ bool &CanBeNull) const {
+ assert(getType()->isPointerTy() && "must be pointer");
+
+ unsigned DerefBytes = 0;
+ CanBeNull = false;
+ if (const Argument *A = dyn_cast<Argument>(this)) {
+ DerefBytes = A->getDereferenceableBytes();
+ if (DerefBytes == 0 && A->hasByValAttr() && A->getType()->isSized()) {
+ DerefBytes = DL.getTypeStoreSize(A->getType());
+ CanBeNull = false;
+ }
+ if (DerefBytes == 0) {
+ DerefBytes = A->getDereferenceableOrNullBytes();
+ CanBeNull = true;
+ }
+ } else if (auto CS = ImmutableCallSite(this)) {
+ DerefBytes = CS.getDereferenceableBytes(0);
+ if (DerefBytes == 0) {
+ DerefBytes = CS.getDereferenceableOrNullBytes(0);
+ CanBeNull = true;
+ }
+ } else if (const LoadInst *LI = dyn_cast<LoadInst>(this)) {
+ if (MDNode *MD = LI->getMetadata(LLVMContext::MD_dereferenceable)) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ DerefBytes = CI->getLimitedValue();
+ }
+ if (DerefBytes == 0) {
+ if (MDNode *MD =
+ LI->getMetadata(LLVMContext::MD_dereferenceable_or_null)) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ DerefBytes = CI->getLimitedValue();
+ }
+ CanBeNull = true;
+ }
+ } else if (auto *AI = dyn_cast<AllocaInst>(this)) {
+ if (AI->getAllocatedType()->isSized()) {
+ DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType());
+ CanBeNull = false;
+ }
+ } else if (auto *GV = dyn_cast<GlobalVariable>(this)) {
+ if (GV->getValueType()->isSized() && !GV->hasExternalWeakLinkage()) {
+ // TODO: Don't outright reject hasExternalWeakLinkage but set the
+ // CanBeNull flag.
+ DerefBytes = DL.getTypeStoreSize(GV->getValueType());
+ CanBeNull = false;
+ }
+ }
+ return DerefBytes;
+}
+
+unsigned Value::getPointerAlignment(const DataLayout &DL) const {
+ assert(getType()->isPointerTy() && "must be pointer");
+
+ unsigned Align = 0;
+ if (auto *GO = dyn_cast<GlobalObject>(this)) {
+ Align = GO->getAlignment();
+ if (Align == 0) {
+ if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
+ Type *ObjectType = GVar->getValueType();
+ if (ObjectType->isSized()) {
+ // If the object is defined in the current Module, we'll be giving
+ // it the preferred alignment. Otherwise, we have to assume that it
+ // may only have the minimum ABI alignment.
+ if (GVar->isStrongDefinitionForLinker())
+ Align = DL.getPreferredAlignment(GVar);
+ else
+ Align = DL.getABITypeAlignment(ObjectType);
+ }
+ }
+ }
+ } else if (const Argument *A = dyn_cast<Argument>(this)) {
+ Align = A->getParamAlignment();
+
+ if (!Align && A->hasStructRetAttr()) {
+ // An sret parameter has at least the ABI alignment of the return type.
+ Type *EltTy = cast<PointerType>(A->getType())->getElementType();
+ if (EltTy->isSized())
+ Align = DL.getABITypeAlignment(EltTy);
+ }
+ } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(this)) {
+ Align = AI->getAlignment();
+ if (Align == 0) {
+ Type *AllocatedType = AI->getAllocatedType();
+ if (AllocatedType->isSized())
+ Align = DL.getPrefTypeAlignment(AllocatedType);
+ }
+ } else if (auto CS = ImmutableCallSite(this))
+ Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
+ else if (const LoadInst *LI = dyn_cast<LoadInst>(this))
+ if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ Align = CI->getLimitedValue();
+ }
+
+ return Align;
+}
+
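A sketch of using the new Value-level dereferenceability query; knownDereferenceable is a hypothetical helper and Ptr is assumed to have pointer type (the query asserts this).

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Conservatively true only when Ptr is known dereferenceable for at least
    // Size bytes and known non-null.
    static bool knownDereferenceable(const Value *Ptr, const DataLayout &DL,
                                     uint64_t Size) {
      bool CanBeNull;
      uint64_t Bytes = Ptr->getPointerDereferenceableBytes(DL, CanBeNull);
      return Bytes >= Size && !CanBeNull;
    }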
Value *Value::DoPHITranslation(const BasicBlock *CurBB,
const BasicBlock *PredBB) {
PHINode *PN = dyn_cast<PHINode>(this);
@@ -644,7 +760,6 @@ void ValueHandleBase::RemoveFromUseList() {
}
}
-
void ValueHandleBase::ValueIsDeleted(Value *V) {
assert(V->HasValueHandle && "Should only be called if ValueHandles present");
@@ -701,7 +816,6 @@ void ValueHandleBase::ValueIsDeleted(Value *V) {
}
}
-
void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
assert(Old->HasValueHandle &&"Should only be called if ValueHandles present");
assert(Old != New && "Changing value into itself!");
diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp
index deb6e7573e72..f6f1dd984e9f 100644
--- a/lib/IR/ValueSymbolTable.cpp
+++ b/lib/IR/ValueSymbolTable.cpp
@@ -24,10 +24,10 @@ using namespace llvm;
// Class destructor
ValueSymbolTable::~ValueSymbolTable() {
#ifndef NDEBUG // Only do this in -g mode...
- for (iterator VI = vmap.begin(), VE = vmap.end(); VI != VE; ++VI)
+ for (const auto &VI : vmap)
dbgs() << "Value still in symbol table! Type = '"
- << *VI->getValue()->getType() << "' Name = '"
- << VI->getKeyData() << "'\n";
+ << *VI.getValue()->getType() << "' Name = '" << VI.getKeyData()
+ << "'\n";
assert(vmap.empty() && "Values remain in symbol table!");
#endif
}
@@ -97,11 +97,11 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
// dump - print out the symbol table
//
-void ValueSymbolTable::dump() const {
+LLVM_DUMP_METHOD void ValueSymbolTable::dump() const {
//DEBUG(dbgs() << "ValueSymbolTable:\n");
- for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ for (const auto &I : *this) {
//DEBUG(dbgs() << " '" << I->getKeyData() << "' = ");
- I->getValue()->dump();
+ I.getValue()->dump();
//DEBUG(dbgs() << "\n");
}
}
diff --git a/lib/IR/ValueTypes.cpp b/lib/IR/ValueTypes.cpp
index f2932302af2a..ff1e431c2e9a 100644
--- a/lib/IR/ValueTypes.cpp
+++ b/lib/IR/ValueTypes.cpp
@@ -55,6 +55,11 @@ bool EVT::isExtendedInteger() const {
return LLVMTy->isIntOrIntVectorTy();
}
+bool EVT::isExtendedScalarInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isIntegerTy();
+}
+
bool EVT::isExtendedVector() const {
assert(isExtended() && "Type is not extended!");
return LLVMTy->isVectorTy();
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 9198b0e1fb58..682e934d4b0f 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -59,6 +59,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstIterator.h"
@@ -67,6 +68,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/Pass.h"
@@ -82,14 +84,18 @@ static cl::opt<bool> VerifyDebugInfo("verify-debug-info", cl::init(true));
namespace {
struct VerifierSupport {
- raw_ostream &OS;
- const Module *M;
+ raw_ostream *OS;
+ const Module *M = nullptr;
+ Optional<ModuleSlotTracker> MST;
- /// \brief Track the brokenness of the module while recursively visiting.
- bool Broken;
+ /// Track the brokenness of the module while recursively visiting.
+ bool Broken = false;
+ /// Broken debug info can be "recovered" from by stripping the debug info.
+ bool BrokenDebugInfo = false;
+ /// Whether to treat broken debug info as an error.
+ bool TreatBrokenDebugInfoAsError = true;
- explicit VerifierSupport(raw_ostream &OS)
- : OS(OS), M(nullptr), Broken(false) {}
+ explicit VerifierSupport(raw_ostream *OS) : OS(OS) {}
private:
template <class NodeTy> void Write(const ilist_iterator<NodeTy> &I) {
@@ -99,17 +105,18 @@ private:
void Write(const Module *M) {
if (!M)
return;
- OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
+ *OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
}
void Write(const Value *V) {
if (!V)
return;
if (isa<Instruction>(V)) {
- OS << *V << '\n';
+ V->print(*OS, *MST);
+ *OS << '\n';
} else {
- V->printAsOperand(OS, true, M);
- OS << '\n';
+ V->printAsOperand(*OS, true, *MST);
+ *OS << '\n';
}
}
void Write(ImmutableCallSite CS) {
@@ -119,8 +126,8 @@ private:
void Write(const Metadata *MD) {
if (!MD)
return;
- MD->print(OS, M);
- OS << '\n';
+ MD->print(*OS, *MST, M);
+ *OS << '\n';
}
template <class T> void Write(const MDTupleTypedArrayWrapper<T> &MD) {
@@ -130,20 +137,20 @@ private:
void Write(const NamedMDNode *NMD) {
if (!NMD)
return;
- NMD->print(OS);
- OS << '\n';
+ NMD->print(*OS, *MST);
+ *OS << '\n';
}
void Write(Type *T) {
if (!T)
return;
- OS << ' ' << *T;
+ *OS << ' ' << *T;
}
void Write(const Comdat *C) {
if (!C)
return;
- OS << *C;
+ *OS << *C;
}
template <typename T> void Write(ArrayRef<T> Vs) {
@@ -165,7 +172,8 @@ public:
/// This provides a nice place to put a breakpoint if you want to see why
/// something is not correct.
void CheckFailed(const Twine &Message) {
- OS << Message << '\n';
+ if (OS)
+ *OS << Message << '\n';
Broken = true;
}
@@ -176,7 +184,25 @@ public:
template <typename T1, typename... Ts>
void CheckFailed(const Twine &Message, const T1 &V1, const Ts &... Vs) {
CheckFailed(Message);
- WriteTs(V1, Vs...);
+ if (OS)
+ WriteTs(V1, Vs...);
+ }
+
+ /// A debug info check failed.
+ void DebugInfoCheckFailed(const Twine &Message) {
+ if (OS)
+ *OS << Message << '\n';
+ Broken |= TreatBrokenDebugInfoAsError;
+ BrokenDebugInfo = true;
+ }
+
+ /// A debug info check failed (with values to print).
+ template <typename T1, typename... Ts>
+ void DebugInfoCheckFailed(const Twine &Message, const T1 &V1,
+ const Ts &... Vs) {
+ DebugInfoCheckFailed(Message);
+ if (OS)
+ WriteTs(V1, Vs...);
}
};
@@ -196,8 +222,8 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
- /// \brief Track unresolved string-based type references.
- SmallDenseMap<const MDString *, const MDNode *, 32> UnresolvedTypeRefs;
+ /// Track all DICompileUnits visited.
+ SmallPtrSet<const Metadata *, 2> CUVisited;
/// \brief The result type for a landingpad.
Type *LandingPadResultTy;
@@ -217,41 +243,60 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// Cache of constants visited in search of ConstantExprs.
SmallPtrSet<const Constant *, 32> ConstantExprVisited;
+ /// Cache of declarations of the llvm.experimental.deoptimize.<ty> intrinsic.
+ SmallVector<const Function *, 4> DeoptimizeDeclarations;
+
+ // Verify that a GlobalValue is only used in this module.
+ // This set is used to avoid visiting uses twice: we can arrive at a user
+ // more than once if it has multiple operands, and for very large constant
+ // expressions we can arrive at a particular user many times.
+ SmallPtrSet<const Value *, 32> GlobalValueVisited;
+
void checkAtomicMemAccessSize(const Module *M, Type *Ty,
const Instruction *I);
+
+ void updateModule(const Module *NewM) {
+ if (M == NewM)
+ return;
+ MST.emplace(NewM);
+ M = NewM;
+ }
+
public:
- explicit Verifier(raw_ostream &OS)
+ explicit Verifier(raw_ostream *OS, bool ShouldTreatBrokenDebugInfoAsError)
: VerifierSupport(OS), Context(nullptr), LandingPadResultTy(nullptr),
- SawFrameEscape(false) {}
+ SawFrameEscape(false) {
+ TreatBrokenDebugInfoAsError = ShouldTreatBrokenDebugInfoAsError;
+ }
+
+ bool hasBrokenDebugInfo() const { return BrokenDebugInfo; }
bool verify(const Function &F) {
- M = F.getParent();
+ updateModule(F.getParent());
Context = &M->getContext();
// First ensure the function is well-enough formed to compute dominance
- // information.
- if (F.empty()) {
- OS << "Function '" << F.getName()
- << "' does not contain an entry block!\n";
- return false;
- }
- for (Function::const_iterator I = F.begin(), E = F.end(); I != E; ++I) {
- if (I->empty() || !I->back().isTerminator()) {
- OS << "Basic Block in function '" << F.getName()
- << "' does not have terminator!\n";
- I->printAsOperand(OS, true);
- OS << "\n";
- return false;
+ // information, and directly compute a dominance tree. We don't rely on the
+ // pass manager to provide this as it isolates us from a potentially
+ // out-of-date dominator tree and makes it significantly more complex to run
+ // this code outside of a pass manager.
+ // FIXME: It's really gross that we have to cast away constness here.
+ if (!F.empty())
+ DT.recalculate(const_cast<Function &>(F));
+
+ for (const BasicBlock &BB : F) {
+ if (!BB.empty() && BB.back().isTerminator())
+ continue;
+
+ if (OS) {
+ *OS << "Basic Block in function '" << F.getName()
+ << "' does not have terminator!\n";
+ BB.printAsOperand(*OS, true, *MST);
+ *OS << "\n";
}
+ return false;
}
- // Now directly compute a dominance tree. We don't rely on the pass
- // manager to provide this as it isolates us from a potentially
- // out-of-date dominator tree and makes it significantly more complex to
- // run this code outside of a pass manager.
- // FIXME: It's really gross that we have to cast away constness here.
- DT.recalculate(const_cast<Function &>(F));
-
Broken = false;
// FIXME: We strip const here because the inst visitor strips const.
visit(const_cast<Function &>(F));
@@ -265,35 +310,26 @@ public:
}
bool verify(const Module &M) {
- this->M = &M;
+ updateModule(&M);
Context = &M.getContext();
Broken = false;
- // Scan through, checking all of the external function's linkage now...
- for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
- visitGlobalValue(*I);
-
- // Check to make sure function prototypes are okay.
- if (I->isDeclaration())
- visitFunction(*I);
- }
+ // Collect all declarations of the llvm.experimental.deoptimize intrinsic.
+ for (const Function &F : M)
+ if (F.getIntrinsicID() == Intrinsic::experimental_deoptimize)
+ DeoptimizeDeclarations.push_back(&F);
// Now that we've visited every function, verify that we never asked to
// recover a frame index that wasn't escaped.
verifyFrameRecoverIndices();
+ for (const GlobalVariable &GV : M.globals())
+ visitGlobalVariable(GV);
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- visitGlobalVariable(*I);
+ for (const GlobalAlias &GA : M.aliases())
+ visitGlobalAlias(GA);
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I)
- visitGlobalAlias(*I);
-
- for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end();
- I != E; ++I)
- visitNamedMDNode(*I);
+ for (const NamedMDNode &NMD : M.named_metadata())
+ visitNamedMDNode(NMD);
for (const StringMapEntry<Comdat> &SMEC : M.getComdatSymbolTable())
visitComdat(SMEC.getValue());
@@ -301,8 +337,9 @@ public:
visitModuleFlags(M);
visitModuleIdents(M);
- // Verify type referneces last.
- verifyTypeRefs();
+ verifyCompileUnits();
+
+ verifyDeoptimizeCallingConvs();
return !Broken;
}
@@ -340,27 +377,6 @@ private:
void visitTemplateParams(const MDNode &N, const Metadata &RawParams);
- /// \brief Check for a valid string-based type reference.
- ///
- /// Checks if \c MD is a string-based type reference. If it is, keeps track
- /// of it (and its user, \c N) for error messages later.
- bool isValidUUID(const MDNode &N, const Metadata *MD);
-
- /// \brief Check for a valid type reference.
- ///
- /// Checks for subclasses of \a DIType, or \a isValidUUID().
- bool isTypeRef(const MDNode &N, const Metadata *MD);
-
- /// \brief Check for a valid scope reference.
- ///
- /// Checks for subclasses of \a DIScope, or \a isValidUUID().
- bool isScopeRef(const MDNode &N, const Metadata *MD);
-
- /// \brief Check for a valid debug info reference.
- ///
- /// Checks for subclasses of \a DINode, or \a isValidUUID().
- bool isDIRef(const MDNode &N, const Metadata *MD);
-
// InstVisitor overrides...
using InstVisitor<Verifier>::visit;
void visit(Instruction &I);
@@ -419,53 +435,69 @@ private:
void visitCatchSwitchInst(CatchSwitchInst &CatchSwitch);
void visitCleanupReturnInst(CleanupReturnInst &CRI);
- void VerifyCallSite(CallSite CS);
+ void verifyCallSite(CallSite CS);
+ void verifySwiftErrorCallSite(CallSite CS, const Value *SwiftErrorVal);
+ void verifySwiftErrorValue(const Value *SwiftErrorVal);
void verifyMustTailCall(CallInst &CI);
- bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT,
+ bool performTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT,
unsigned ArgNo, std::string &Suffix);
- bool VerifyIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
- SmallVectorImpl<Type *> &ArgTys);
- bool VerifyIntrinsicIsVarArg(bool isVarArg,
- ArrayRef<Intrinsic::IITDescriptor> &Infos);
- bool VerifyAttributeCount(AttributeSet Attrs, unsigned Params);
- void VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx, bool isFunction,
+ bool verifyAttributeCount(AttributeSet Attrs, unsigned Params);
+ void verifyAttributeTypes(AttributeSet Attrs, unsigned Idx, bool isFunction,
const Value *V);
- void VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
+ void verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
bool isReturnValue, const Value *V);
- void VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+ void verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
const Value *V);
- void VerifyFunctionMetadata(
- const SmallVector<std::pair<unsigned, MDNode *>, 4> MDs);
+ void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
void visitConstantExprsRecursively(const Constant *EntryC);
void visitConstantExpr(const ConstantExpr *CE);
- void VerifyStatepoint(ImmutableCallSite CS);
+ void verifyStatepoint(ImmutableCallSite CS);
void verifyFrameRecoverIndices();
void verifySiblingFuncletUnwinds();
- // Module-level debug info verification...
- void verifyTypeRefs();
- template <class MapTy>
- void verifyBitPieceExpression(const DbgInfoIntrinsic &I,
- const MapTy &TypeRefs);
- void visitUnresolvedTypeRef(const MDString *S, const MDNode *N);
+ void verifyBitPieceExpression(const DbgInfoIntrinsic &I);
+
+ /// Module-level debug info verification...
+ void verifyCompileUnits();
+
+ /// Module-level verification that all @llvm.experimental.deoptimize
+ /// declarations share the same calling convention.
+ void verifyDeoptimizeCallingConvs();
};
} // End anonymous namespace
-// Assert - We know that cond should be true, if not print an error message.
+/// We know that cond should be true; if not, print an error message.
#define Assert(C, ...) \
do { if (!(C)) { CheckFailed(__VA_ARGS__); return; } } while (0)
+/// We know that a debug info condition should be true; if not, print
+/// an error message.
+#define AssertDI(C, ...) \
+ do { if (!(C)) { DebugInfoCheckFailed(__VA_ARGS__); return; } } while (0)
+
+
void Verifier::visit(Instruction &I) {
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
Assert(I.getOperand(i) != nullptr, "Operand is null", &I);
InstVisitor<Verifier>::visit(I);
}
+// Helper to recursively iterate over indirect users. By returning false, the
+// callback can ask to stop recursing further.
+static void forEachUser(const Value *User,
+ SmallPtrSet<const Value *, 32> &Visited,
+ llvm::function_ref<bool(const Value *)> Callback) {
+ if (!Visited.insert(User).second)
+ return;
+ for (const Value *TheNextUser : User->materialized_users())
+ if (Callback(TheNextUser))
+ forEachUser(TheNextUser, Visited, Callback);
+}
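A rough sketch, not part of the patch, of how forEachUser is meant to be driven: the callback fires for every user reached, and returning false stops the walk from descending into that user, which is how visitGlobalValue below stops at the first instruction or function it hits. The helper name and its use here are hypothetical.

    static void collectInstructionUsers(
        const Value *Root, SmallPtrSetImpl<const Instruction *> &Insts) {
      SmallPtrSet<const Value *, 32> Visited;
      forEachUser(Root, Visited, [&](const Value *U) -> bool {
        if (const auto *I = dyn_cast<Instruction>(U)) {
          Insts.insert(I);
          return false; // Don't recurse into the instruction's own users.
        }
        return true; // Look through constants and other indirect users.
      });
    }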
void Verifier::visitGlobalValue(const GlobalValue &GV) {
- Assert(!GV.isDeclaration() || GV.hasExternalLinkage() ||
- GV.hasExternalWeakLinkage(),
+ Assert(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(),
"Global is external, but doesn't have external or weak linkage!", &GV);
Assert(GV.getAlignment() <= Value::MaximumAlignment,
@@ -481,11 +513,30 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
if (GV.isDeclarationForLinker())
Assert(!GV.hasComdat(), "Declaration may not be in a Comdat!", &GV);
+
+ forEachUser(&GV, GlobalValueVisited, [&](const Value *V) -> bool {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ if (!I->getParent() || !I->getParent()->getParent())
+ CheckFailed("Global is referenced by parentless instruction!", &GV,
+ M, I);
+ else if (I->getParent()->getParent()->getParent() != M)
+ CheckFailed("Global is referenced in a different module!", &GV,
+ M, I, I->getParent()->getParent(),
+ I->getParent()->getParent()->getParent());
+ return false;
+ } else if (const Function *F = dyn_cast<Function>(V)) {
+ if (F->getParent() != M)
+ CheckFailed("Global is used by function in a different module", &GV,
+ M, F, F->getParent());
+ return false;
+ }
+ return true;
+ });
}
void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (GV.hasInitializer()) {
- Assert(GV.getInitializer()->getType() == GV.getType()->getElementType(),
+ Assert(GV.getInitializer()->getType() == GV.getValueType(),
"Global variable initializer type does not match global "
"variable type!",
&GV);
@@ -499,9 +550,6 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
&GV);
Assert(!GV.hasComdat(), "'common' global may not be in a Comdat!", &GV);
}
- } else {
- Assert(GV.hasExternalLinkage() || GV.hasExternalWeakLinkage(),
- "invalid linkage type for global declaration", &GV);
}
if (GV.hasName() && (GV.getName() == "llvm.global_ctors" ||
@@ -542,8 +590,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
const ConstantArray *InitArray = dyn_cast<ConstantArray>(Init);
Assert(InitArray, "wrong initalizer for intrinsic global variable",
Init);
- for (unsigned i = 0, e = InitArray->getNumOperands(); i != e; ++i) {
- Value *V = Init->getOperand(i)->stripPointerCastsNoFollowAliases();
+ for (Value *Op : InitArray->operands()) {
+ Value *V = Op->stripPointerCastsNoFollowAliases();
Assert(isa<GlobalVariable>(V) || isa<Function>(V) ||
isa<GlobalAlias>(V),
"invalid llvm.used member", V);
@@ -584,7 +632,7 @@ void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited,
if (const auto *GA2 = dyn_cast<GlobalAlias>(GV)) {
Assert(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA);
- Assert(!GA2->mayBeOverridden(), "Alias cannot point to a weak alias",
+ Assert(!GA2->isInterposable(), "Alias cannot point to an interposable alias",
&GA);
} else {
// Only continue verifying subexpressions of GlobalAliases.
@@ -624,11 +672,9 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) {
}
void Verifier::visitNamedMDNode(const NamedMDNode &NMD) {
- for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) {
- MDNode *MD = NMD.getOperand(i);
-
+ for (const MDNode *MD : NMD.operands()) {
if (NMD.getName() == "llvm.dbg.cu") {
- Assert(MD && isa<DICompileUnit>(MD), "invalid compile unit", &NMD, MD);
+ AssertDI(MD && isa<DICompileUnit>(MD), "invalid compile unit", &NMD, MD);
}
if (!MD)
@@ -656,8 +702,7 @@ void Verifier::visitMDNode(const MDNode &MD) {
#include "llvm/IR/Metadata.def"
}
- for (unsigned i = 0, e = MD.getNumOperands(); i != e; ++i) {
- Metadata *Op = MD.getOperand(i);
+ for (const Metadata *Op : MD.operands()) {
if (!Op)
continue;
Assert(!isa<LocalAsMetadata>(Op), "Invalid operand for global metadata!",
@@ -719,33 +764,9 @@ void Verifier::visitMetadataAsValue(const MetadataAsValue &MDV, Function *F) {
visitValueAsMetadata(*V, F);
}
-bool Verifier::isValidUUID(const MDNode &N, const Metadata *MD) {
- auto *S = dyn_cast<MDString>(MD);
- if (!S)
- return false;
- if (S->getString().empty())
- return false;
-
- // Keep track of names of types referenced via UUID so we can check that they
- // actually exist.
- UnresolvedTypeRefs.insert(std::make_pair(S, &N));
- return true;
-}
-
-/// \brief Check if a value can be a reference to a type.
-bool Verifier::isTypeRef(const MDNode &N, const Metadata *MD) {
- return !MD || isValidUUID(N, MD) || isa<DIType>(MD);
-}
-
-/// \brief Check if a value can be a ScopeRef.
-bool Verifier::isScopeRef(const MDNode &N, const Metadata *MD) {
- return !MD || isValidUUID(N, MD) || isa<DIScope>(MD);
-}
-
-/// \brief Check if a value can be a debug info ref.
-bool Verifier::isDIRef(const MDNode &N, const Metadata *MD) {
- return !MD || isValidUUID(N, MD) || isa<DINode>(MD);
-}
+static bool isType(const Metadata *MD) { return !MD || isa<DIType>(MD); }
+static bool isScope(const Metadata *MD) { return !MD || isa<DIScope>(MD); }
+static bool isDINode(const Metadata *MD) { return !MD || isa<DINode>(MD); }
template <class Ty>
bool isValidMetadataArrayImpl(const MDTuple &N, bool AllowNull) {
@@ -772,60 +793,60 @@ bool isValidMetadataNullArray(const MDTuple &N) {
}
void Verifier::visitDILocation(const DILocation &N) {
- Assert(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
- "location requires a valid scope", &N, N.getRawScope());
+ AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+ "location requires a valid scope", &N, N.getRawScope());
if (auto *IA = N.getRawInlinedAt())
- Assert(isa<DILocation>(IA), "inlined-at should be a location", &N, IA);
+ AssertDI(isa<DILocation>(IA), "inlined-at should be a location", &N, IA);
}
void Verifier::visitGenericDINode(const GenericDINode &N) {
- Assert(N.getTag(), "invalid tag", &N);
+ AssertDI(N.getTag(), "invalid tag", &N);
}
void Verifier::visitDIScope(const DIScope &N) {
if (auto *F = N.getRawFile())
- Assert(isa<DIFile>(F), "invalid file", &N, F);
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
}
void Verifier::visitDISubrange(const DISubrange &N) {
- Assert(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
- Assert(N.getCount() >= -1, "invalid subrange count", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
+ AssertDI(N.getCount() >= -1, "invalid subrange count", &N);
}
void Verifier::visitDIEnumerator(const DIEnumerator &N) {
- Assert(N.getTag() == dwarf::DW_TAG_enumerator, "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_enumerator, "invalid tag", &N);
}
void Verifier::visitDIBasicType(const DIBasicType &N) {
- Assert(N.getTag() == dwarf::DW_TAG_base_type ||
- N.getTag() == dwarf::DW_TAG_unspecified_type,
- "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_base_type ||
+ N.getTag() == dwarf::DW_TAG_unspecified_type,
+ "invalid tag", &N);
}
void Verifier::visitDIDerivedType(const DIDerivedType &N) {
// Common scope checks.
visitDIScope(N);
- Assert(N.getTag() == dwarf::DW_TAG_typedef ||
- N.getTag() == dwarf::DW_TAG_pointer_type ||
- N.getTag() == dwarf::DW_TAG_ptr_to_member_type ||
- N.getTag() == dwarf::DW_TAG_reference_type ||
- N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
- N.getTag() == dwarf::DW_TAG_const_type ||
- N.getTag() == dwarf::DW_TAG_volatile_type ||
- N.getTag() == dwarf::DW_TAG_restrict_type ||
- N.getTag() == dwarf::DW_TAG_member ||
- N.getTag() == dwarf::DW_TAG_inheritance ||
- N.getTag() == dwarf::DW_TAG_friend,
- "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_typedef ||
+ N.getTag() == dwarf::DW_TAG_pointer_type ||
+ N.getTag() == dwarf::DW_TAG_ptr_to_member_type ||
+ N.getTag() == dwarf::DW_TAG_reference_type ||
+ N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
+ N.getTag() == dwarf::DW_TAG_const_type ||
+ N.getTag() == dwarf::DW_TAG_volatile_type ||
+ N.getTag() == dwarf::DW_TAG_restrict_type ||
+ N.getTag() == dwarf::DW_TAG_member ||
+ N.getTag() == dwarf::DW_TAG_inheritance ||
+ N.getTag() == dwarf::DW_TAG_friend,
+ "invalid tag", &N);
if (N.getTag() == dwarf::DW_TAG_ptr_to_member_type) {
- Assert(isTypeRef(N, N.getExtraData()), "invalid pointer to member type", &N,
- N.getExtraData());
+ AssertDI(isType(N.getRawExtraData()), "invalid pointer to member type", &N,
+ N.getRawExtraData());
}
- Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope());
- Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N,
- N.getBaseType());
+ AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+ AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
+ N.getRawBaseType());
}
static bool hasConflictingReferenceFlags(unsigned Flags) {
@@ -835,10 +856,10 @@ static bool hasConflictingReferenceFlags(unsigned Flags) {
void Verifier::visitTemplateParams(const MDNode &N, const Metadata &RawParams) {
auto *Params = dyn_cast<MDTuple>(&RawParams);
- Assert(Params, "invalid template params", &N, &RawParams);
+ AssertDI(Params, "invalid template params", &N, &RawParams);
for (Metadata *Op : Params->operands()) {
- Assert(Op && isa<DITemplateParameter>(Op), "invalid template parameter", &N,
- Params, Op);
+ AssertDI(Op && isa<DITemplateParameter>(Op), "invalid template parameter",
+ &N, Params, Op);
}
}
@@ -846,141 +867,151 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
// Common scope checks.
visitDIScope(N);
- Assert(N.getTag() == dwarf::DW_TAG_array_type ||
- N.getTag() == dwarf::DW_TAG_structure_type ||
- N.getTag() == dwarf::DW_TAG_union_type ||
- N.getTag() == dwarf::DW_TAG_enumeration_type ||
- N.getTag() == dwarf::DW_TAG_class_type,
- "invalid tag", &N);
-
- Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope());
- Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N,
- N.getBaseType());
-
- Assert(!N.getRawElements() || isa<MDTuple>(N.getRawElements()),
- "invalid composite elements", &N, N.getRawElements());
- Assert(isTypeRef(N, N.getRawVTableHolder()), "invalid vtable holder", &N,
- N.getRawVTableHolder());
- Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags",
- &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_array_type ||
+ N.getTag() == dwarf::DW_TAG_structure_type ||
+ N.getTag() == dwarf::DW_TAG_union_type ||
+ N.getTag() == dwarf::DW_TAG_enumeration_type ||
+ N.getTag() == dwarf::DW_TAG_class_type,
+ "invalid tag", &N);
+
+ AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+ AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
+ N.getRawBaseType());
+
+ AssertDI(!N.getRawElements() || isa<MDTuple>(N.getRawElements()),
+ "invalid composite elements", &N, N.getRawElements());
+ AssertDI(isType(N.getRawVTableHolder()), "invalid vtable holder", &N,
+ N.getRawVTableHolder());
+ AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
+ "invalid reference flags", &N);
if (auto *Params = N.getRawTemplateParams())
visitTemplateParams(N, *Params);
if (N.getTag() == dwarf::DW_TAG_class_type ||
N.getTag() == dwarf::DW_TAG_union_type) {
- Assert(N.getFile() && !N.getFile()->getFilename().empty(),
- "class/union requires a filename", &N, N.getFile());
+ AssertDI(N.getFile() && !N.getFile()->getFilename().empty(),
+ "class/union requires a filename", &N, N.getFile());
}
}
void Verifier::visitDISubroutineType(const DISubroutineType &N) {
- Assert(N.getTag() == dwarf::DW_TAG_subroutine_type, "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_subroutine_type, "invalid tag", &N);
if (auto *Types = N.getRawTypeArray()) {
- Assert(isa<MDTuple>(Types), "invalid composite elements", &N, Types);
+ AssertDI(isa<MDTuple>(Types), "invalid composite elements", &N, Types);
for (Metadata *Ty : N.getTypeArray()->operands()) {
- Assert(isTypeRef(N, Ty), "invalid subroutine type ref", &N, Types, Ty);
+ AssertDI(isType(Ty), "invalid subroutine type ref", &N, Types, Ty);
}
}
- Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags",
- &N);
+ AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
+ "invalid reference flags", &N);
}
void Verifier::visitDIFile(const DIFile &N) {
- Assert(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_file_type, "invalid tag", &N);
}
void Verifier::visitDICompileUnit(const DICompileUnit &N) {
- Assert(N.isDistinct(), "compile units must be distinct", &N);
- Assert(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N);
+ AssertDI(N.isDistinct(), "compile units must be distinct", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N);
// Don't bother verifying the compilation directory or producer string
// as those could be empty.
- Assert(N.getRawFile() && isa<DIFile>(N.getRawFile()), "invalid file", &N,
- N.getRawFile());
- Assert(!N.getFile()->getFilename().empty(), "invalid filename", &N,
- N.getFile());
+ AssertDI(N.getRawFile() && isa<DIFile>(N.getRawFile()), "invalid file", &N,
+ N.getRawFile());
+ AssertDI(!N.getFile()->getFilename().empty(), "invalid filename", &N,
+ N.getFile());
+
+ AssertDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind),
+ "invalid emission kind", &N);
if (auto *Array = N.getRawEnumTypes()) {
- Assert(isa<MDTuple>(Array), "invalid enum list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid enum list", &N, Array);
for (Metadata *Op : N.getEnumTypes()->operands()) {
auto *Enum = dyn_cast_or_null<DICompositeType>(Op);
- Assert(Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type,
- "invalid enum type", &N, N.getEnumTypes(), Op);
+ AssertDI(Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type,
+ "invalid enum type", &N, N.getEnumTypes(), Op);
}
}
if (auto *Array = N.getRawRetainedTypes()) {
- Assert(isa<MDTuple>(Array), "invalid retained type list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid retained type list", &N, Array);
for (Metadata *Op : N.getRetainedTypes()->operands()) {
- Assert(Op && isa<DIType>(Op), "invalid retained type", &N, Op);
- }
- }
- if (auto *Array = N.getRawSubprograms()) {
- Assert(isa<MDTuple>(Array), "invalid subprogram list", &N, Array);
- for (Metadata *Op : N.getSubprograms()->operands()) {
- Assert(Op && isa<DISubprogram>(Op), "invalid subprogram ref", &N, Op);
+ AssertDI(Op && (isa<DIType>(Op) ||
+ (isa<DISubprogram>(Op) &&
+ cast<DISubprogram>(Op)->isDefinition() == false)),
+ "invalid retained type", &N, Op);
}
}
if (auto *Array = N.getRawGlobalVariables()) {
- Assert(isa<MDTuple>(Array), "invalid global variable list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid global variable list", &N, Array);
for (Metadata *Op : N.getGlobalVariables()->operands()) {
- Assert(Op && isa<DIGlobalVariable>(Op), "invalid global variable ref", &N,
- Op);
+ AssertDI(Op && isa<DIGlobalVariable>(Op), "invalid global variable ref",
+ &N, Op);
}
}
if (auto *Array = N.getRawImportedEntities()) {
- Assert(isa<MDTuple>(Array), "invalid imported entity list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid imported entity list", &N, Array);
for (Metadata *Op : N.getImportedEntities()->operands()) {
- Assert(Op && isa<DIImportedEntity>(Op), "invalid imported entity ref", &N,
- Op);
+ AssertDI(Op && isa<DIImportedEntity>(Op), "invalid imported entity ref",
+ &N, Op);
}
}
if (auto *Array = N.getRawMacros()) {
- Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
for (Metadata *Op : N.getMacros()->operands()) {
- Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
+ AssertDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
}
}
+ CUVisited.insert(&N);
}
void Verifier::visitDISubprogram(const DISubprogram &N) {
- Assert(N.getTag() == dwarf::DW_TAG_subprogram, "invalid tag", &N);
- Assert(isScopeRef(N, N.getRawScope()), "invalid scope", &N, N.getRawScope());
+ AssertDI(N.getTag() == dwarf::DW_TAG_subprogram, "invalid tag", &N);
+ AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
+ if (auto *F = N.getRawFile())
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
if (auto *T = N.getRawType())
- Assert(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
- Assert(isTypeRef(N, N.getRawContainingType()), "invalid containing type", &N,
- N.getRawContainingType());
+ AssertDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
+ AssertDI(isType(N.getRawContainingType()), "invalid containing type", &N,
+ N.getRawContainingType());
if (auto *Params = N.getRawTemplateParams())
visitTemplateParams(N, *Params);
- if (auto *S = N.getRawDeclaration()) {
- Assert(isa<DISubprogram>(S) && !cast<DISubprogram>(S)->isDefinition(),
- "invalid subprogram declaration", &N, S);
- }
+ if (auto *S = N.getRawDeclaration())
+ AssertDI(isa<DISubprogram>(S) && !cast<DISubprogram>(S)->isDefinition(),
+ "invalid subprogram declaration", &N, S);
if (auto *RawVars = N.getRawVariables()) {
auto *Vars = dyn_cast<MDTuple>(RawVars);
- Assert(Vars, "invalid variable list", &N, RawVars);
+ AssertDI(Vars, "invalid variable list", &N, RawVars);
for (Metadata *Op : Vars->operands()) {
- Assert(Op && isa<DILocalVariable>(Op), "invalid local variable", &N, Vars,
- Op);
+ AssertDI(Op && isa<DILocalVariable>(Op), "invalid local variable", &N,
+ Vars, Op);
}
}
- Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags",
- &N);
-
- if (N.isDefinition())
- Assert(N.isDistinct(), "subprogram definitions must be distinct", &N);
+ AssertDI(!hasConflictingReferenceFlags(N.getFlags()),
+ "invalid reference flags", &N);
+
+ auto *Unit = N.getRawUnit();
+ if (N.isDefinition()) {
+ // Subprogram definitions (not part of the type hierarchy).
+ AssertDI(N.isDistinct(), "subprogram definitions must be distinct", &N);
+ AssertDI(Unit, "subprogram definitions must have a compile unit", &N);
+ AssertDI(isa<DICompileUnit>(Unit), "invalid unit type", &N, Unit);
+ } else {
+ // Subprogram declarations (part of the type hierarchy).
+ AssertDI(!Unit, "subprogram declarations must not have a compile unit", &N);
+ }
}
void Verifier::visitDILexicalBlockBase(const DILexicalBlockBase &N) {
- Assert(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
- Assert(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
- "invalid local scope", &N, N.getRawScope());
+ AssertDI(N.getTag() == dwarf::DW_TAG_lexical_block, "invalid tag", &N);
+ AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+ "invalid local scope", &N, N.getRawScope());
}
void Verifier::visitDILexicalBlock(const DILexicalBlock &N) {
visitDILexicalBlockBase(N);
- Assert(N.getLine() || !N.getColumn(),
- "cannot have column info without line info", &N);
+ AssertDI(N.getLine() || !N.getColumn(),
+ "cannot have column info without line info", &N);
}
void Verifier::visitDILexicalBlockFile(const DILexicalBlockFile &N) {
@@ -988,83 +1019,84 @@ void Verifier::visitDILexicalBlockFile(const DILexicalBlockFile &N) {
}
void Verifier::visitDINamespace(const DINamespace &N) {
- Assert(N.getTag() == dwarf::DW_TAG_namespace, "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_namespace, "invalid tag", &N);
if (auto *S = N.getRawScope())
- Assert(isa<DIScope>(S), "invalid scope ref", &N, S);
+ AssertDI(isa<DIScope>(S), "invalid scope ref", &N, S);
}
void Verifier::visitDIMacro(const DIMacro &N) {
- Assert(N.getMacinfoType() == dwarf::DW_MACINFO_define ||
- N.getMacinfoType() == dwarf::DW_MACINFO_undef,
- "invalid macinfo type", &N);
- Assert(!N.getName().empty(), "anonymous macro", &N);
+ AssertDI(N.getMacinfoType() == dwarf::DW_MACINFO_define ||
+ N.getMacinfoType() == dwarf::DW_MACINFO_undef,
+ "invalid macinfo type", &N);
+ AssertDI(!N.getName().empty(), "anonymous macro", &N);
if (!N.getValue().empty()) {
assert(N.getValue().data()[0] != ' ' && "Macro value has a space prefix");
}
}
void Verifier::visitDIMacroFile(const DIMacroFile &N) {
- Assert(N.getMacinfoType() == dwarf::DW_MACINFO_start_file,
- "invalid macinfo type", &N);
+ AssertDI(N.getMacinfoType() == dwarf::DW_MACINFO_start_file,
+ "invalid macinfo type", &N);
if (auto *F = N.getRawFile())
- Assert(isa<DIFile>(F), "invalid file", &N, F);
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
if (auto *Array = N.getRawElements()) {
- Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array);
+ AssertDI(isa<MDTuple>(Array), "invalid macro list", &N, Array);
for (Metadata *Op : N.getElements()->operands()) {
- Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
+ AssertDI(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op);
}
}
}
void Verifier::visitDIModule(const DIModule &N) {
- Assert(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N);
- Assert(!N.getName().empty(), "anonymous module", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N);
+ AssertDI(!N.getName().empty(), "anonymous module", &N);
}
void Verifier::visitDITemplateParameter(const DITemplateParameter &N) {
- Assert(isTypeRef(N, N.getType()), "invalid type ref", &N, N.getType());
+ AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
}
void Verifier::visitDITemplateTypeParameter(const DITemplateTypeParameter &N) {
visitDITemplateParameter(N);
- Assert(N.getTag() == dwarf::DW_TAG_template_type_parameter, "invalid tag",
- &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_template_type_parameter, "invalid tag",
+ &N);
}
void Verifier::visitDITemplateValueParameter(
const DITemplateValueParameter &N) {
visitDITemplateParameter(N);
- Assert(N.getTag() == dwarf::DW_TAG_template_value_parameter ||
- N.getTag() == dwarf::DW_TAG_GNU_template_template_param ||
- N.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack,
- "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_template_value_parameter ||
+ N.getTag() == dwarf::DW_TAG_GNU_template_template_param ||
+ N.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack,
+ "invalid tag", &N);
}
void Verifier::visitDIVariable(const DIVariable &N) {
if (auto *S = N.getRawScope())
- Assert(isa<DIScope>(S), "invalid scope", &N, S);
- Assert(isTypeRef(N, N.getRawType()), "invalid type ref", &N, N.getRawType());
+ AssertDI(isa<DIScope>(S), "invalid scope", &N, S);
+ AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
if (auto *F = N.getRawFile())
- Assert(isa<DIFile>(F), "invalid file", &N, F);
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
}
void Verifier::visitDIGlobalVariable(const DIGlobalVariable &N) {
// Checks common to all variables.
visitDIVariable(N);
- Assert(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
- Assert(!N.getName().empty(), "missing global variable name", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
+ AssertDI(!N.getName().empty(), "missing global variable name", &N);
if (auto *V = N.getRawVariable()) {
- Assert(isa<ConstantAsMetadata>(V) &&
- !isa<Function>(cast<ConstantAsMetadata>(V)->getValue()),
- "invalid global varaible ref", &N, V);
+ AssertDI(isa<ConstantAsMetadata>(V) &&
+ !isa<Function>(cast<ConstantAsMetadata>(V)->getValue()),
+             "invalid global variable ref", &N, V);
+ visitConstantExprsRecursively(cast<ConstantAsMetadata>(V)->getValue());
}
if (auto *Member = N.getRawStaticDataMemberDeclaration()) {
- Assert(isa<DIDerivedType>(Member), "invalid static data member declaration",
- &N, Member);
+ AssertDI(isa<DIDerivedType>(Member),
+ "invalid static data member declaration", &N, Member);
}
}
@@ -1072,31 +1104,31 @@ void Verifier::visitDILocalVariable(const DILocalVariable &N) {
// Checks common to all variables.
visitDIVariable(N);
- Assert(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
- Assert(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
- "local variable requires a valid scope", &N, N.getRawScope());
+ AssertDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
+ AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
+ "local variable requires a valid scope", &N, N.getRawScope());
}
void Verifier::visitDIExpression(const DIExpression &N) {
- Assert(N.isValid(), "invalid expression", &N);
+ AssertDI(N.isValid(), "invalid expression", &N);
}
void Verifier::visitDIObjCProperty(const DIObjCProperty &N) {
- Assert(N.getTag() == dwarf::DW_TAG_APPLE_property, "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_APPLE_property, "invalid tag", &N);
if (auto *T = N.getRawType())
- Assert(isTypeRef(N, T), "invalid type ref", &N, T);
+ AssertDI(isType(T), "invalid type ref", &N, T);
if (auto *F = N.getRawFile())
- Assert(isa<DIFile>(F), "invalid file", &N, F);
+ AssertDI(isa<DIFile>(F), "invalid file", &N, F);
}
void Verifier::visitDIImportedEntity(const DIImportedEntity &N) {
- Assert(N.getTag() == dwarf::DW_TAG_imported_module ||
- N.getTag() == dwarf::DW_TAG_imported_declaration,
- "invalid tag", &N);
+ AssertDI(N.getTag() == dwarf::DW_TAG_imported_module ||
+ N.getTag() == dwarf::DW_TAG_imported_declaration,
+ "invalid tag", &N);
if (auto *S = N.getRawScope())
- Assert(isa<DIScope>(S), "invalid scope for imported entity", &N, S);
- Assert(isDIRef(N, N.getEntity()), "invalid imported entity", &N,
- N.getEntity());
+ AssertDI(isa<DIScope>(S), "invalid scope for imported entity", &N, S);
+ AssertDI(isDINode(N.getRawEntity()), "invalid imported entity", &N,
+ N.getRawEntity());
}
void Verifier::visitComdat(const Comdat &C) {
@@ -1114,8 +1146,7 @@ void Verifier::visitModuleIdents(const Module &M) {
// llvm.ident takes a list of metadata entry. Each entry has only one string.
// Scan each llvm.ident entry and make sure that this requirement is met.
- for (unsigned i = 0, e = Idents->getNumOperands(); i != e; ++i) {
- const MDNode *N = Idents->getOperand(i);
+ for (const MDNode *N : Idents->operands()) {
Assert(N->getNumOperands() == 1,
"incorrect number of operands in llvm.ident metadata", N);
Assert(dyn_cast_or_null<MDString>(N->getOperand(0)),
@@ -1132,13 +1163,11 @@ void Verifier::visitModuleFlags(const Module &M) {
// Scan each flag, and track the flags and requirements.
DenseMap<const MDString*, const MDNode*> SeenIDs;
SmallVector<const MDNode*, 16> Requirements;
- for (unsigned I = 0, E = Flags->getNumOperands(); I != E; ++I) {
- visitModuleFlag(Flags->getOperand(I), SeenIDs, Requirements);
- }
+ for (const MDNode *MDN : Flags->operands())
+ visitModuleFlag(MDN, SeenIDs, Requirements);
// Validate that the requirements in the module are valid.
- for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
- const MDNode *Requirement = Requirements[I];
+ for (const MDNode *Requirement : Requirements) {
const MDString *Flag = cast<MDString>(Requirement->getOperand(0));
const Metadata *ReqValue = Requirement->getOperand(1);
@@ -1225,7 +1254,7 @@ Verifier::visitModuleFlag(const MDNode *Op,
}
}
-void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
+void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
bool isFunction, const Value *V) {
unsigned Slot = ~0U;
for (unsigned I = 0, E = Attrs.getNumSlots(); I != E; ++I)
@@ -1272,13 +1301,15 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
I->getKindAsEnum() == Attribute::ArgMemOnly ||
I->getKindAsEnum() == Attribute::NoRecurse ||
I->getKindAsEnum() == Attribute::InaccessibleMemOnly ||
- I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly) {
+ I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly ||
+ I->getKindAsEnum() == Attribute::AllocSize) {
if (!isFunction) {
CheckFailed("Attribute '" + I->getAsString() +
"' only applies to functions!", V);
return;
}
} else if (I->getKindAsEnum() == Attribute::ReadOnly ||
+ I->getKindAsEnum() == Attribute::WriteOnly ||
I->getKindAsEnum() == Attribute::ReadNone) {
if (Idx == 0) {
CheckFailed("Attribute '" + I->getAsString() +
@@ -1295,12 +1326,12 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
// VerifyParameterAttrs - Check the given attributes for an argument or return
// value of the specified type. The value V is printed in error messages.
-void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
+void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
bool isReturnValue, const Value *V) {
if (!Attrs.hasAttributes(Idx))
return;
- VerifyAttributeTypes(Attrs, Idx, false, V);
+ verifyAttributeTypes(Attrs, Idx, false, V);
if (isReturnValue)
Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
@@ -1308,9 +1339,12 @@ void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
!Attrs.hasAttribute(Idx, Attribute::StructRet) &&
!Attrs.hasAttribute(Idx, Attribute::NoCapture) &&
!Attrs.hasAttribute(Idx, Attribute::Returned) &&
- !Attrs.hasAttribute(Idx, Attribute::InAlloca),
- "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', and "
- "'returned' do not apply to return values!",
+ !Attrs.hasAttribute(Idx, Attribute::InAlloca) &&
+ !Attrs.hasAttribute(Idx, Attribute::SwiftSelf) &&
+ !Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', "
+ "'returned', 'swiftself', and 'swifterror' do not apply to return "
+ "values!",
V);
// Check for mutually incompatible attributes. Only inreg is compatible with
@@ -1349,6 +1383,18 @@ void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
"'readnone and readonly' are incompatible!",
V);
+ Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
+ Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ "Attributes "
+ "'readnone and writeonly' are incompatible!",
+ V);
+
+ Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadOnly) &&
+ Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ "Attributes "
+ "'readonly and writeonly' are incompatible!",
+ V);
+
Assert(!(Attrs.hasAttribute(Idx, Attribute::NoInline) &&
Attrs.hasAttribute(Idx, Attribute::AlwaysInline)),
"Attributes "
@@ -1370,16 +1416,25 @@ void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
"Attributes 'byval' and 'inalloca' do not support unsized types!",
V);
}
+ if (!isa<PointerType>(PTy->getElementType()))
+ Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ "Attribute 'swifterror' only applies to parameters "
+ "with pointer to pointer type!",
+ V);
} else {
Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal),
"Attribute 'byval' only applies to parameters with pointer type!",
V);
+ Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ "Attribute 'swifterror' only applies to parameters "
+ "with pointer type!",
+ V);
}
}
-// VerifyFunctionAttrs - Check parameter attributes against a function type.
+// Check parameter attributes against a function type.
// The value V is printed in error messages.
-void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
const Value *V) {
if (Attrs.isEmpty())
return;
@@ -1387,6 +1442,8 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
bool SawNest = false;
bool SawReturned = false;
bool SawSRet = false;
+ bool SawSwiftSelf = false;
+ bool SawSwiftError = false;
for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
unsigned Idx = Attrs.getSlotIndex(i);
@@ -1399,7 +1456,7 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
else
break; // VarArgs attributes, verified elsewhere.
- VerifyParameterAttrs(Attrs, Idx, Ty, Idx == 0, V);
+ verifyParameterAttrs(Attrs, Idx, Ty, Idx == 0, V);
if (Idx == 0)
continue;
@@ -1426,6 +1483,17 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
SawSRet = true;
}
+ if (Attrs.hasAttribute(Idx, Attribute::SwiftSelf)) {
+ Assert(!SawSwiftSelf, "Cannot have multiple 'swiftself' parameters!", V);
+ SawSwiftSelf = true;
+ }
+
+ if (Attrs.hasAttribute(Idx, Attribute::SwiftError)) {
+ Assert(!SawSwiftError, "Cannot have multiple 'swifterror' parameters!",
+ V);
+ SawSwiftError = true;
+ }
+
if (Attrs.hasAttribute(Idx, Attribute::InAlloca)) {
Assert(Idx == FT->getNumParams(), "inalloca isn't on the last parameter!",
V);
@@ -1435,7 +1503,7 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
if (!Attrs.hasAttributes(AttributeSet::FunctionIndex))
return;
- VerifyAttributeTypes(Attrs, AttributeSet::FunctionIndex, true, V);
+ verifyAttributeTypes(Attrs, AttributeSet::FunctionIndex, true, V);
Assert(
!(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
@@ -1444,6 +1512,16 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
Assert(
!(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
+ Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
+ "Attributes 'readnone and writeonly' are incompatible!", V);
+
+ Assert(
+ !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly) &&
+ Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
+ "Attributes 'readonly and writeonly' are incompatible!", V);
+
+ Assert(
+ !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
Attrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::InaccessibleMemOrArgMemOnly)),
"Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V);
@@ -1476,19 +1554,43 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::JumpTable)) {
const GlobalValue *GV = cast<GlobalValue>(V);
- Assert(GV->hasUnnamedAddr(),
+ Assert(GV->hasGlobalUnnamedAddr(),
"Attribute 'jumptable' requires 'unnamed_addr'", V);
}
-}
-void Verifier::VerifyFunctionMetadata(
- const SmallVector<std::pair<unsigned, MDNode *>, 4> MDs) {
- if (MDs.empty())
- return;
+ if (Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AllocSize)) {
+ std::pair<unsigned, Optional<unsigned>> Args =
+ Attrs.getAllocSizeArgs(AttributeSet::FunctionIndex);
+
+ auto CheckParam = [&](StringRef Name, unsigned ParamNo) {
+ if (ParamNo >= FT->getNumParams()) {
+ CheckFailed("'allocsize' " + Name + " argument is out of bounds", V);
+ return false;
+ }
+
+ if (!FT->getParamType(ParamNo)->isIntegerTy()) {
+ CheckFailed("'allocsize' " + Name +
+ " argument must refer to an integer parameter",
+ V);
+ return false;
+ }
+
+ return true;
+ };
+
+ if (!CheckParam("element size", Args.first))
+ return;
- for (unsigned i = 0; i < MDs.size(); i++) {
- if (MDs[i].first == LLVMContext::MD_prof) {
- MDNode *MD = MDs[i].second;
+ if (Args.second && !CheckParam("number of elements", *Args.second))
+ return;
+ }
+}
+
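The allocsize handling above reduces to two tests per referenced index: it must be in range for the function type, and the parameter it names must be an integer. A condensed restatement, as a hedged sketch a frontend might run before attaching the attribute (the function and its argument names are hypothetical; Optional is llvm::Optional as used above):

    static bool allocSizeArgsWellFormed(const FunctionType *FT, unsigned ElemIdx,
                                        Optional<unsigned> NumElemsIdx) {
      auto IsIntParam = [&](unsigned Idx) {
        return Idx < FT->getNumParams() && FT->getParamType(Idx)->isIntegerTy();
      };
      // Mirrors CheckParam in verifyFunctionAttrs: bounds first, then type.
      return IsIntParam(ElemIdx) && (!NumElemsIdx || IsIntParam(*NumElemsIdx));
    }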
+void Verifier::verifyFunctionMetadata(
+ ArrayRef<std::pair<unsigned, MDNode *>> MDs) {
+ for (const auto &Pair : MDs) {
+ if (Pair.first == LLVMContext::MD_prof) {
+ MDNode *MD = Pair.second;
Assert(MD->getNumOperands() == 2,
"!prof annotations should have exactly 2 operands", MD);
@@ -1525,13 +1627,19 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
if (const auto *CE = dyn_cast<ConstantExpr>(C))
visitConstantExpr(CE);
+ if (const auto *GV = dyn_cast<GlobalValue>(C)) {
+ // Global Values get visited separately, but we do need to make sure
+ // that the global value is in the correct module
+ Assert(GV->getParent() == M, "Referencing global in another module!",
+ EntryC, M, GV, GV->getParent());
+ continue;
+ }
+
// Visit all sub-expressions.
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;
- if (isa<GlobalValue>(OpC))
- continue; // Global values get visited separately.
if (!ConstantExprVisited.insert(OpC).second)
continue;
Stack.push_back(OpC);
@@ -1548,7 +1656,7 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
"Invalid bitcast", CE);
}
-bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
+bool Verifier::verifyAttributeCount(AttributeSet Attrs, unsigned Params) {
if (Attrs.getNumSlots() == 0)
return true;
@@ -1562,8 +1670,8 @@ bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
return false;
}
-/// \brief Verify that statepoint intrinsic is well formed.
-void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
+/// Verify that statepoint intrinsic is well formed.
+void Verifier::verifyStatepoint(ImmutableCallSite CS) {
assert(CS.getCalledFunction() &&
CS.getCalledFunction()->getIntrinsicID() ==
Intrinsic::experimental_gc_statepoint);
@@ -1674,11 +1782,11 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
const CallInst *Call = dyn_cast<const CallInst>(U);
Assert(Call, "illegal use of statepoint token", &CI, U);
if (!Call) continue;
- Assert(isa<GCRelocateInst>(Call) || isGCResult(Call),
+ Assert(isa<GCRelocateInst>(Call) || isa<GCResultInst>(Call),
"gc.result or gc.relocate are the only value uses"
"of a gc.statepoint",
&CI, U);
- if (isGCResult(Call)) {
+ if (isa<GCResultInst>(Call)) {
Assert(Call->getArgOperand(0) == &CI,
"gc.result connected to wrong gc.statepoint", &CI, Call);
} else if (isa<GCRelocateInst>(Call)) {
@@ -1766,6 +1874,8 @@ void Verifier::verifySiblingFuncletUnwinds() {
// visitFunction - Verify that a function is ok.
//
void Verifier::visitFunction(const Function &F) {
+ visitGlobalValue(F);
+
// Check function arguments.
FunctionType *FT = F.getFunctionType();
unsigned NumArgs = F.arg_size();
@@ -1786,11 +1896,11 @@ void Verifier::visitFunction(const Function &F) {
AttributeSet Attrs = F.getAttributes();
- Assert(VerifyAttributeCount(Attrs, FT->getNumParams()),
+ Assert(verifyAttributeCount(Attrs, FT->getNumParams()),
"Attribute after last parameter!", &F);
// Check function attributes.
- VerifyFunctionAttrs(FT, Attrs, &F);
+ verifyFunctionAttrs(FT, Attrs, &F);
// On function declarations/definitions, we do not support the builtin
// attribute. We do not check this in VerifyFunctionAttrs since that is
@@ -1821,19 +1931,24 @@ void Verifier::visitFunction(const Function &F) {
// Check that the argument values match the function type for this function...
unsigned i = 0;
- for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
- ++I, ++i) {
- Assert(I->getType() == FT->getParamType(i),
- "Argument value does not match function argument type!", I,
+ for (const Argument &Arg : F.args()) {
+ Assert(Arg.getType() == FT->getParamType(i),
+ "Argument value does not match function argument type!", &Arg,
FT->getParamType(i));
- Assert(I->getType()->isFirstClassType(),
- "Function arguments must have first-class types!", I);
+ Assert(Arg.getType()->isFirstClassType(),
+ "Function arguments must have first-class types!", &Arg);
if (!isLLVMdotName) {
- Assert(!I->getType()->isMetadataTy(),
- "Function takes metadata but isn't an intrinsic", I, &F);
- Assert(!I->getType()->isTokenTy(),
- "Function takes token but isn't an intrinsic", I, &F);
+ Assert(!Arg.getType()->isMetadataTy(),
+ "Function takes metadata but isn't an intrinsic", &Arg, &F);
+ Assert(!Arg.getType()->isTokenTy(),
+ "Function takes token but isn't an intrinsic", &Arg, &F);
+ }
+
+ // Check that swifterror argument is only used by loads and stores.
+ if (Attrs.hasAttribute(i+1, Attribute::SwiftError)) {
+ verifySwiftErrorValue(&Arg);
}
+ ++i;
}
if (!isLLVMdotName)
@@ -1844,7 +1959,7 @@ void Verifier::visitFunction(const Function &F) {
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
F.getAllMetadata(MDs);
assert(F.hasMetadata() != MDs.empty() && "Bit out-of-sync");
- VerifyFunctionMetadata(MDs);
+ verifyFunctionMetadata(MDs);
// Check validity of the personality function
if (F.hasPersonalityFn()) {
@@ -1860,10 +1975,15 @@ void Verifier::visitFunction(const Function &F) {
Assert(MDs.empty(), "unmaterialized function cannot have metadata", &F,
MDs.empty() ? nullptr : MDs.front().second);
} else if (F.isDeclaration()) {
- Assert(F.hasExternalLinkage() || F.hasExternalWeakLinkage(),
- "invalid linkage type for function declaration", &F);
- Assert(MDs.empty(), "function without a body cannot have metadata", &F,
- MDs.empty() ? nullptr : MDs.front().second);
+ for (const auto &I : MDs) {
+ AssertDI(I.first != LLVMContext::MD_dbg,
+ "function declaration may not have a !dbg attachment", &F);
+ Assert(I.first != LLVMContext::MD_prof,
+ "function declaration may not have a !prof attachment", &F);
+
+ // Verify the metadata itself.
+ visitMDNode(*I.second);
+ }
Assert(!F.hasPersonalityFn(),
"Function declaration shouldn't have a personality routine", &F);
} else {
@@ -1882,6 +2002,7 @@ void Verifier::visitFunction(const Function &F) {
"blockaddress may not be used with the entry block!", Entry);
}
+ unsigned NumDebugAttachments = 0, NumProfAttachments = 0;
// Visit metadata attachments.
for (const auto &I : MDs) {
// Verify that the attachment is legal.
@@ -1889,8 +2010,16 @@ void Verifier::visitFunction(const Function &F) {
default:
break;
case LLVMContext::MD_dbg:
- Assert(isa<DISubprogram>(I.second),
- "function !dbg attachment must be a subprogram", &F, I.second);
+ ++NumDebugAttachments;
+ AssertDI(NumDebugAttachments == 1,
+ "function must have a single !dbg attachment", &F, I.second);
+ AssertDI(isa<DISubprogram>(I.second),
+ "function !dbg attachment must be a subprogram", &F, I.second);
+ break;
+ case LLVMContext::MD_prof:
+ ++NumProfAttachments;
+ Assert(NumProfAttachments == 1,
+ "function must have a single !prof attachment", &F, I.second);
break;
}
@@ -1918,6 +2047,8 @@ void Verifier::visitFunction(const Function &F) {
if (!N)
return;
+ visitDISubprogram(*N);
+
// Check that all !dbg attachments lead to back to N (or, at least, another
// subprogram that describes the same function).
//
@@ -2053,11 +2184,11 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
// have the same type as the switched-on value.
Type *SwitchTy = SI.getCondition()->getType();
SmallPtrSet<ConstantInt*, 32> Constants;
- for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
- Assert(i.getCaseValue()->getType() == SwitchTy,
+ for (auto &Case : SI.cases()) {
+ Assert(Case.getCaseValue()->getType() == SwitchTy,
"Switch constants must all be same type as switch value!", &SI);
- Assert(Constants.insert(i.getCaseValue()).second,
- "Duplicate integer as switch case", &SI, i.getCaseValue());
+ Assert(Constants.insert(Case.getCaseValue()).second,
+ "Duplicate integer as switch case", &SI, Case.getCaseValue());
}
visitTerminatorInst(SI);
@@ -2362,7 +2493,7 @@ void Verifier::visitPHINode(PHINode &PN) {
visitInstruction(PN);
}
-void Verifier::VerifyCallSite(CallSite CS) {
+void Verifier::verifyCallSite(CallSite CS) {
Instruction *I = CS.getInstruction();
Assert(CS.getCalledValue()->getType()->isPointerTy(),
@@ -2393,11 +2524,11 @@ void Verifier::VerifyCallSite(CallSite CS) {
AttributeSet Attrs = CS.getAttributes();
- Assert(VerifyAttributeCount(Attrs, CS.arg_size()),
+ Assert(verifyAttributeCount(Attrs, CS.arg_size()),
"Attribute after last parameter!", I);
// Verify call attributes.
- VerifyFunctionAttrs(FTy, Attrs, I);
+ verifyFunctionAttrs(FTy, Attrs, I);
// Conservatively check the inalloca argument.
// We have a bug if we can find that there is an underlying alloca without
@@ -2409,6 +2540,18 @@ void Verifier::VerifyCallSite(CallSite CS) {
"inalloca argument for call has mismatched alloca", AI, I);
}
+  // For each argument of the callsite, if it carries the swifterror
+  // attribute, make sure the underlying alloca is marked swifterror as well.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ if (CS.paramHasAttr(i+1, Attribute::SwiftError)) {
+ Value *SwiftErrorArg = CS.getArgument(i);
+ auto AI = dyn_cast<AllocaInst>(SwiftErrorArg->stripInBoundsOffsets());
+ Assert(AI, "swifterror argument should come from alloca", AI, I);
+ if (AI)
+ Assert(AI->isSwiftError(),
+ "swifterror argument for call has mismatched alloca", AI, I);
+ }
+
if (FTy->isVarArg()) {
// FIXME? is 'nest' even legal here?
bool SawNest = false;
@@ -2424,7 +2567,7 @@ void Verifier::VerifyCallSite(CallSite CS) {
// Check attributes on the varargs part.
for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
Type *Ty = CS.getArgument(Idx-1)->getType();
- VerifyParameterAttrs(Attrs, Idx, Ty, false, I);
+ verifyParameterAttrs(Attrs, Idx, Ty, false, I);
if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
Assert(!SawNest, "More than one parameter has attribute nest!", I);
@@ -2469,17 +2612,21 @@ void Verifier::VerifyCallSite(CallSite CS) {
if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
visitIntrinsicCallSite(ID, CS);
- // Verify that a callsite has at most one "deopt" and one "funclet" operand
- // bundle.
- bool FoundDeoptBundle = false, FoundFuncletBundle = false;
+ // Verify that a callsite has at most one "deopt", at most one "funclet" and
+ // at most one "gc-transition" operand bundle.
+ bool FoundDeoptBundle = false, FoundFuncletBundle = false,
+ FoundGCTransitionBundle = false;
for (unsigned i = 0, e = CS.getNumOperandBundles(); i < e; ++i) {
OperandBundleUse BU = CS.getOperandBundleAt(i);
uint32_t Tag = BU.getTagID();
if (Tag == LLVMContext::OB_deopt) {
Assert(!FoundDeoptBundle, "Multiple deopt operand bundles", I);
FoundDeoptBundle = true;
- }
- if (Tag == LLVMContext::OB_funclet) {
+ } else if (Tag == LLVMContext::OB_gc_transition) {
+ Assert(!FoundGCTransitionBundle, "Multiple gc-transition operand bundles",
+ I);
+ FoundGCTransitionBundle = true;
+ } else if (Tag == LLVMContext::OB_funclet) {
Assert(!FoundFuncletBundle, "Multiple funclet operand bundles", I);
FoundFuncletBundle = true;
Assert(BU.Inputs.size() == 1,
@@ -2490,6 +2637,15 @@ void Verifier::VerifyCallSite(CallSite CS) {
}
}
+ // Verify that each inlinable callsite of a debug-info-bearing function in a
+ // debug-info-bearing function has a debug location attached to it. Failure to
+ // do so causes assertion failures when the inliner sets up inline scope info.
+ if (I->getFunction()->getSubprogram() && CS.getCalledFunction() &&
+ CS.getCalledFunction()->getSubprogram())
+ Assert(I->getDebugLoc(), "inlinable function call in a function with debug "
+ "info must have a !dbg location",
+ I);
+
visitInstruction(*I);
}
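The !dbg requirement added above is easy to trip from a transform or frontend that synthesizes calls: if both the caller and the callee carry a DISubprogram, the new call must have a location. A hedged sketch of the usual remedy on the construction side, assuming the stock IRBuilder and DebugLoc APIs (none of which are part of this patch; InsertPt, Line, Scope, Callee and Args are placeholders):

    IRBuilder<> Builder(InsertPt);               // InsertPt: an Instruction* in the caller
    Builder.SetCurrentDebugLocation(
        DebugLoc::get(Line, /*Column=*/0, Scope)); // Scope: the caller's DISubprogram
    CallInst *NewCall = Builder.CreateCall(Callee, Args); // now carries a !dbg location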
@@ -2508,7 +2664,8 @@ static bool isTypeCongruent(Type *L, Type *R) {
static AttrBuilder getParameterABIAttributes(int I, AttributeSet Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
- Attribute::InReg, Attribute::Returned};
+ Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf,
+ Attribute::SwiftError};
AttrBuilder Copy;
for (auto AK : ABIAttrs) {
if (Attrs.hasAttribute(I + 1, AK))
@@ -2581,14 +2738,14 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
}
void Verifier::visitCallInst(CallInst &CI) {
- VerifyCallSite(&CI);
+ verifyCallSite(&CI);
if (CI.isMustTailCall())
verifyMustTailCall(CI);
}
void Verifier::visitInvokeInst(InvokeInst &II) {
- VerifyCallSite(&II);
+ verifyCallSite(&II);
// Verify that the first non-PHI instruction of the unwind destination is an
// exception handling instruction.
@@ -2741,8 +2898,8 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GEP.getPointerOperandType()->isVectorTy())
Assert(GEPWidth == GEP.getPointerOperandType()->getVectorNumElements(),
"Vector GEP result width doesn't match operand's", &GEP);
- for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
- Type *IndexTy = Idxs[i]->getType();
+ for (Value *Idx : Idxs) {
+ Type *IndexTy = Idx->getType();
if (IndexTy->isVectorTy()) {
unsigned IndexWidth = IndexTy->getVectorNumElements();
Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP);
@@ -2822,8 +2979,10 @@ void Verifier::visitLoadInst(LoadInst &LI) {
Type *ElTy = LI.getType();
Assert(LI.getAlignment() <= Value::MaximumAlignment,
"huge alignment values are unsupported", &LI);
+ Assert(ElTy->isSized(), "loading unsized types is not allowed", &LI);
if (LI.isAtomic()) {
- Assert(LI.getOrdering() != Release && LI.getOrdering() != AcquireRelease,
+ Assert(LI.getOrdering() != AtomicOrdering::Release &&
+ LI.getOrdering() != AtomicOrdering::AcquireRelease,
"Load cannot have Release ordering", &LI);
Assert(LI.getAlignment() != 0,
"Atomic load must specify explicit alignment", &LI);
@@ -2849,8 +3008,10 @@ void Verifier::visitStoreInst(StoreInst &SI) {
"Stored value type does not match pointer operand type!", &SI, ElTy);
Assert(SI.getAlignment() <= Value::MaximumAlignment,
"huge alignment values are unsupported", &SI);
+ Assert(ElTy->isSized(), "storing unsized types is not allowed", &SI);
if (SI.isAtomic()) {
- Assert(SI.getOrdering() != Acquire && SI.getOrdering() != AcquireRelease,
+ Assert(SI.getOrdering() != AtomicOrdering::Acquire &&
+ SI.getOrdering() != AtomicOrdering::AcquireRelease,
"Store cannot have Acquire ordering", &SI);
Assert(SI.getAlignment() != 0,
"Atomic store must specify explicit alignment", &SI);
@@ -2867,6 +3028,42 @@ void Verifier::visitStoreInst(StoreInst &SI) {
visitInstruction(SI);
}
+/// Check that SwiftErrorVal is used as a swifterror argument in CS.
+void Verifier::verifySwiftErrorCallSite(CallSite CS,
+ const Value *SwiftErrorVal) {
+ unsigned Idx = 0;
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ I != E; ++I, ++Idx) {
+ if (*I == SwiftErrorVal) {
+ Assert(CS.paramHasAttr(Idx+1, Attribute::SwiftError),
+ "swifterror value when used in a callsite should be marked "
+ "with swifterror attribute",
+ SwiftErrorVal, CS);
+ }
+ }
+}
+
+void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) {
+ // Check that swifterror value is only used by loads, stores, or as
+ // a swifterror argument.
+ for (const User *U : SwiftErrorVal->users()) {
+ Assert(isa<LoadInst>(U) || isa<StoreInst>(U) || isa<CallInst>(U) ||
+ isa<InvokeInst>(U),
+           "swifterror value can only be loaded from, stored to, or used "
+           "as a swifterror argument!",
+ SwiftErrorVal, U);
+ // If it is used by a store, check it is the second operand.
+ if (auto StoreI = dyn_cast<StoreInst>(U))
+ Assert(StoreI->getOperand(1) == SwiftErrorVal,
+ "swifterror value should be the second operand when used "
+ "by stores", SwiftErrorVal, U);
+ if (auto CallI = dyn_cast<CallInst>(U))
+ verifySwiftErrorCallSite(const_cast<CallInst*>(CallI), SwiftErrorVal);
+ if (auto II = dyn_cast<InvokeInst>(U))
+ verifySwiftErrorCallSite(const_cast<InvokeInst*>(II), SwiftErrorVal);
+ }
+}
+
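Taken together with the parameter-attribute checks earlier, the swifterror rules come down to: the value originates from an alloca (or argument) marked swifterror, and it is only ever loaded, stored to as the pointer operand, or passed to a call parameter that itself carries the swifterror attribute. A hedged sketch of the frontend-side pattern that satisfies this, assuming AllocaInst exposes a setSwiftError setter paired with the isSwiftError predicate used above (Builder, ErrorPtrTy and NullError are placeholders):

    AllocaInst *ErrSlot = Builder.CreateAlloca(ErrorPtrTy); // pointer-to-pointer slot
    ErrSlot->setSwiftError(true);             // assumption: mirrors AI->isSwiftError()
    Builder.CreateStore(NullError, ErrSlot);  // ErrSlot must be the pointer operand
    Value *Err = Builder.CreateLoad(ErrSlot); // loads are always permitted
    // Any call taking ErrSlot must mark that parameter swifterror, or
    // verifySwiftErrorCallSite will flag the mismatch.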
void Verifier::visitAllocaInst(AllocaInst &AI) {
SmallPtrSet<Type*, 4> Visited;
PointerType *PTy = AI.getType();
@@ -2880,32 +3077,38 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
Assert(AI.getAlignment() <= Value::MaximumAlignment,
"huge alignment values are unsupported", &AI);
+ if (AI.isSwiftError()) {
+ verifySwiftErrorValue(&AI);
+ }
+
visitInstruction(AI);
}
void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
// FIXME: more conditions???
- Assert(CXI.getSuccessOrdering() != NotAtomic,
+ Assert(CXI.getSuccessOrdering() != AtomicOrdering::NotAtomic,
"cmpxchg instructions must be atomic.", &CXI);
- Assert(CXI.getFailureOrdering() != NotAtomic,
+ Assert(CXI.getFailureOrdering() != AtomicOrdering::NotAtomic,
"cmpxchg instructions must be atomic.", &CXI);
- Assert(CXI.getSuccessOrdering() != Unordered,
+ Assert(CXI.getSuccessOrdering() != AtomicOrdering::Unordered,
"cmpxchg instructions cannot be unordered.", &CXI);
- Assert(CXI.getFailureOrdering() != Unordered,
+ Assert(CXI.getFailureOrdering() != AtomicOrdering::Unordered,
"cmpxchg instructions cannot be unordered.", &CXI);
- Assert(CXI.getSuccessOrdering() >= CXI.getFailureOrdering(),
- "cmpxchg instructions be at least as constrained on success as fail",
+ Assert(!isStrongerThan(CXI.getFailureOrdering(), CXI.getSuccessOrdering()),
+ "cmpxchg instructions failure argument shall be no stronger than the "
+ "success argument",
&CXI);
- Assert(CXI.getFailureOrdering() != Release &&
- CXI.getFailureOrdering() != AcquireRelease,
+ Assert(CXI.getFailureOrdering() != AtomicOrdering::Release &&
+ CXI.getFailureOrdering() != AtomicOrdering::AcquireRelease,
"cmpxchg failure ordering cannot include release semantics", &CXI);
PointerType *PTy = dyn_cast<PointerType>(CXI.getOperand(0)->getType());
Assert(PTy, "First cmpxchg operand must be a pointer.", &CXI);
Type *ElTy = PTy->getElementType();
- Assert(ElTy->isIntegerTy(), "cmpxchg operand must have integer type!", &CXI,
- ElTy);
+ Assert(ElTy->isIntegerTy() || ElTy->isPointerTy(),
+ "cmpxchg operand must have integer or pointer type",
+ ElTy, &CXI);
checkAtomicMemAccessSize(M, ElTy, &CXI);
Assert(ElTy == CXI.getOperand(1)->getType(),
"Expected value type does not match pointer operand type!", &CXI,
@@ -2916,9 +3119,9 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
}
void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
- Assert(RMWI.getOrdering() != NotAtomic,
+ Assert(RMWI.getOrdering() != AtomicOrdering::NotAtomic,
"atomicrmw instructions must be atomic.", &RMWI);
- Assert(RMWI.getOrdering() != Unordered,
+ Assert(RMWI.getOrdering() != AtomicOrdering::Unordered,
"atomicrmw instructions cannot be unordered.", &RMWI);
PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
Assert(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
@@ -2937,10 +3140,12 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
void Verifier::visitFenceInst(FenceInst &FI) {
const AtomicOrdering Ordering = FI.getOrdering();
- Assert(Ordering == Acquire || Ordering == Release ||
- Ordering == AcquireRelease || Ordering == SequentiallyConsistent,
- "fence instructions may only have "
- "acquire, release, acq_rel, or seq_cst ordering.",
+ Assert(Ordering == AtomicOrdering::Acquire ||
+ Ordering == AtomicOrdering::Release ||
+ Ordering == AtomicOrdering::AcquireRelease ||
+ Ordering == AtomicOrdering::SequentiallyConsistent,
+ "fence instructions may only have acquire, release, acq_rel, or "
+ "seq_cst ordering.",
&FI);
visitInstruction(FI);
}
@@ -3017,7 +3222,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
else
FromPad = ConstantTokenNone::get(II->getContext());
} else if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
- FromPad = CRI->getCleanupPad();
+ FromPad = CRI->getOperand(0);
Assert(FromPad != ToPadParent, "A cleanupret must exit its cleanup", CRI);
} else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
FromPad = CSI;
@@ -3026,6 +3231,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
}
// The edge may exit from zero or more nested pads.
+ SmallSet<Value *, 8> Seen;
for (;; FromPad = getParentPad(FromPad)) {
Assert(FromPad != ToPad,
"EH pad cannot handle exceptions raised within it", FromPad, TI);
@@ -3035,6 +3241,8 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
}
Assert(!isa<ConstantTokenNone>(FromPad),
"A single unwind edge may only enter one EH pad", TI);
+ Assert(Seen.insert(FromPad).second,
+ "EH pad jumps through a cycle of pads", FromPad);
}
}
}
@@ -3081,8 +3289,6 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
}
void Verifier::visitCatchPadInst(CatchPadInst &CPI) {
- visitEHPadPredecessors(CPI);
-
BasicBlock *BB = CPI.getParent();
Function *F = BB->getParent();
@@ -3098,6 +3304,7 @@ void Verifier::visitCatchPadInst(CatchPadInst &CPI) {
Assert(BB->getFirstNonPHI() == &CPI,
"CatchPadInst not the first non-PHI instruction in the block.", &CPI);
+ visitEHPadPredecessors(CPI);
visitFuncletPadInst(CPI);
}
@@ -3110,8 +3317,6 @@ void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) {
}
void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
- visitEHPadPredecessors(CPI);
-
BasicBlock *BB = CPI.getParent();
Function *F = BB->getParent();
@@ -3128,6 +3333,7 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad),
"CleanupPadInst has an invalid parent.", &CPI);
+ visitEHPadPredecessors(CPI);
visitFuncletPadInst(CPI);
}
@@ -3135,8 +3341,12 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
User *FirstUser = nullptr;
Value *FirstUnwindPad = nullptr;
SmallVector<FuncletPadInst *, 8> Worklist({&FPI});
+ SmallSet<FuncletPadInst *, 8> Seen;
+
while (!Worklist.empty()) {
FuncletPadInst *CurrentPad = Worklist.pop_back_val();
+ Assert(Seen.insert(CurrentPad).second,
+ "FuncletPadInst must not be nested within itself", CurrentPad);
Value *UnresolvedAncestorPad = nullptr;
for (User *U : CurrentPad->users()) {
BasicBlock *UnwindDest;
@@ -3172,6 +3382,8 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
bool ExitsFPI;
if (UnwindDest) {
UnwindPad = UnwindDest->getFirstNonPHI();
+ if (!cast<Instruction>(UnwindPad)->isEHPad())
+ continue;
Value *UnwindParent = getParentPad(UnwindPad);
// Ignore unwind edges that don't exit CurrentPad.
if (UnwindParent == CurrentPad)
@@ -3285,8 +3497,6 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
}
void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
- visitEHPadPredecessors(CatchSwitch);
-
BasicBlock *BB = CatchSwitch.getParent();
Function *F = BB->getParent();
@@ -3324,6 +3534,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
"CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler);
}
+ visitEHPadPredecessors(CatchSwitch);
visitTerminatorInst(CatchSwitch);
}
@@ -3353,8 +3564,18 @@ void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
return;
}
+ // Quick check whether the def has already been encountered in the same block.
+ // PHI nodes are not checked to prevent accepting preceding PHIs, because PHI
+ // uses are defined to happen on the incoming edge, not at the instruction.
+ //
+ // FIXME: If this operand is a MetadataAsValue (wrapping a LocalAsMetadata)
+ // wrapping an SSA value, assert that we've already encountered it. See
+ // related FIXME in Mapper::mapLocalAsMetadata in ValueMapper.cpp.
+ if (!isa<PHINode>(I) && InstsInThisBlock.count(Op))
+ return;
+
const Use &U = I.getOperandUse(i);
- Assert(InstsInThisBlock.count(Op) || DT.dominates(Op, U),
+ Assert(DT.dominates(Op, U),
"Instruction does not dominate all uses!", Op, &I);
}
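// A minimal sketch of the dominator-tree query verifyDominatesUse now falls
// back to for anything the same-block fast path above does not answer. The
// helper name is hypothetical; DominatorTree::dominates(Def, Use) is the API
// used in the hunk, and it evaluates PHI uses on the incoming edge, matching
// the comment above. The sketch assumes the operand is itself an instruction.
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
static bool defDominatesUse(llvm::DominatorTree &DT, llvm::Instruction &User,
                            unsigned OpNo) {
  const llvm::Use &U = User.getOperandUse(OpNo);
  auto *Def = llvm::cast<llvm::Instruction>(U.get());
  return DT.dominates(Def, U);
}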
@@ -3435,8 +3656,8 @@ void Verifier::visitInstruction(Instruction &I) {
F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void ||
F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64 ||
F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint,
- "Cannot invoke an intrinsinc other than"
- " donothing or patchpoint",
+ "Cannot invoke an intrinsic other than donothing, patchpoint or "
+ "statepoint",
&I);
Assert(F->getParent() == M, "Referencing function in another module!",
&I, M, F, F->getParent());
@@ -3469,7 +3690,9 @@ void Verifier::visitInstruction(Instruction &I) {
Assert(MD->getNumOperands() == 1, "fpmath takes one operand!", &I);
if (ConstantFP *CFP0 =
mdconst::dyn_extract_or_null<ConstantFP>(MD->getOperand(0))) {
- APFloat Accuracy = CFP0->getValueAPF();
+ const APFloat &Accuracy = CFP0->getValueAPF();
+ Assert(&Accuracy.getSemantics() == &APFloat::IEEEsingle,
+ "fpmath accuracy must have float type", &I);
Assert(Accuracy.isFiniteNonZero() && !Accuracy.isNegative(),
"fpmath accuracy not a positive number!", &I);
} else {
@@ -3515,182 +3738,14 @@ void Verifier::visitInstruction(Instruction &I) {
}
if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
- Assert(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
+ AssertDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
visitMDNode(*N);
}
- InstsInThisBlock.insert(&I);
-}
-
-/// VerifyIntrinsicType - Verify that the specified type (which comes from an
-/// intrinsic argument or return value) matches the type constraints specified
-/// by the .td file (e.g. an "any integer" argument really is an integer).
-///
-/// This return true on error but does not print a message.
-bool Verifier::VerifyIntrinsicType(Type *Ty,
- ArrayRef<Intrinsic::IITDescriptor> &Infos,
- SmallVectorImpl<Type*> &ArgTys) {
- using namespace Intrinsic;
-
- // If we ran out of descriptors, there are too many arguments.
- if (Infos.empty()) return true;
- IITDescriptor D = Infos.front();
- Infos = Infos.slice(1);
-
- switch (D.Kind) {
- case IITDescriptor::Void: return !Ty->isVoidTy();
- case IITDescriptor::VarArg: return true;
- case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
- case IITDescriptor::Token: return !Ty->isTokenTy();
- case IITDescriptor::Metadata: return !Ty->isMetadataTy();
- case IITDescriptor::Half: return !Ty->isHalfTy();
- case IITDescriptor::Float: return !Ty->isFloatTy();
- case IITDescriptor::Double: return !Ty->isDoubleTy();
- case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
- case IITDescriptor::Vector: {
- VectorType *VT = dyn_cast<VectorType>(Ty);
- return !VT || VT->getNumElements() != D.Vector_Width ||
- VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys);
- }
- case IITDescriptor::Pointer: {
- PointerType *PT = dyn_cast<PointerType>(Ty);
- return !PT || PT->getAddressSpace() != D.Pointer_AddressSpace ||
- VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys);
- }
-
- case IITDescriptor::Struct: {
- StructType *ST = dyn_cast<StructType>(Ty);
- if (!ST || ST->getNumElements() != D.Struct_NumElements)
- return true;
-
- for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
- if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys))
- return true;
- return false;
- }
-
- case IITDescriptor::Argument:
- // Two cases here - If this is the second occurrence of an argument, verify
- // that the later instance matches the previous instance.
- if (D.getArgumentNumber() < ArgTys.size())
- return Ty != ArgTys[D.getArgumentNumber()];
-
- // Otherwise, if this is the first instance of an argument, record it and
- // verify the "Any" kind.
- assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error");
- ArgTys.push_back(Ty);
-
- switch (D.getArgumentKind()) {
- case IITDescriptor::AK_Any: return false; // Success
- case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
- case IITDescriptor::AK_AnyFloat: return !Ty->isFPOrFPVectorTy();
- case IITDescriptor::AK_AnyVector: return !isa<VectorType>(Ty);
- case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty);
- }
- llvm_unreachable("all argument kinds not covered");
-
- case IITDescriptor::ExtendArgument: {
- // This may only be used when referring to a previous vector argument.
- if (D.getArgumentNumber() >= ArgTys.size())
- return true;
-
- Type *NewTy = ArgTys[D.getArgumentNumber()];
- if (VectorType *VTy = dyn_cast<VectorType>(NewTy))
- NewTy = VectorType::getExtendedElementVectorType(VTy);
- else if (IntegerType *ITy = dyn_cast<IntegerType>(NewTy))
- NewTy = IntegerType::get(ITy->getContext(), 2 * ITy->getBitWidth());
- else
- return true;
-
- return Ty != NewTy;
- }
- case IITDescriptor::TruncArgument: {
- // This may only be used when referring to a previous vector argument.
- if (D.getArgumentNumber() >= ArgTys.size())
- return true;
-
- Type *NewTy = ArgTys[D.getArgumentNumber()];
- if (VectorType *VTy = dyn_cast<VectorType>(NewTy))
- NewTy = VectorType::getTruncatedElementVectorType(VTy);
- else if (IntegerType *ITy = dyn_cast<IntegerType>(NewTy))
- NewTy = IntegerType::get(ITy->getContext(), ITy->getBitWidth() / 2);
- else
- return true;
-
- return Ty != NewTy;
- }
- case IITDescriptor::HalfVecArgument:
- // This may only be used when referring to a previous vector argument.
- return D.getArgumentNumber() >= ArgTys.size() ||
- !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
- VectorType::getHalfElementsVectorType(
- cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
- case IITDescriptor::SameVecWidthArgument: {
- if (D.getArgumentNumber() >= ArgTys.size())
- return true;
- VectorType * ReferenceType =
- dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
- VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
- if (!ThisArgType || !ReferenceType ||
- (ReferenceType->getVectorNumElements() !=
- ThisArgType->getVectorNumElements()))
- return true;
- return VerifyIntrinsicType(ThisArgType->getVectorElementType(),
- Infos, ArgTys);
- }
- case IITDescriptor::PtrToArgument: {
- if (D.getArgumentNumber() >= ArgTys.size())
- return true;
- Type * ReferenceType = ArgTys[D.getArgumentNumber()];
- PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
- return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
- }
- case IITDescriptor::VecOfPtrsToElt: {
- if (D.getArgumentNumber() >= ArgTys.size())
- return true;
- VectorType * ReferenceType =
- dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
- VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
- if (!ThisArgVecTy || !ReferenceType ||
- (ReferenceType->getVectorNumElements() !=
- ThisArgVecTy->getVectorNumElements()))
- return true;
- PointerType *ThisArgEltTy =
- dyn_cast<PointerType>(ThisArgVecTy->getVectorElementType());
- if (!ThisArgEltTy)
- return true;
- return ThisArgEltTy->getElementType() !=
- ReferenceType->getVectorElementType();
- }
- }
- llvm_unreachable("unhandled");
-}
-
-/// \brief Verify if the intrinsic has variable arguments.
-/// This method is intended to be called after all the fixed arguments have been
-/// verified first.
-///
-/// This method returns true on error and does not print an error message.
-bool
-Verifier::VerifyIntrinsicIsVarArg(bool isVarArg,
- ArrayRef<Intrinsic::IITDescriptor> &Infos) {
- using namespace Intrinsic;
-
- // If there are no descriptors left, then it can't be a vararg.
- if (Infos.empty())
- return isVarArg;
-
- // There should be only one descriptor remaining at this point.
- if (Infos.size() != 1)
- return true;
-
- // Check and verify the descriptor.
- IITDescriptor D = Infos.front();
- Infos = Infos.slice(1);
- if (D.Kind == IITDescriptor::VarArg)
- return !isVarArg;
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I))
+ verifyBitPieceExpression(*DII);
- return true;
+ InstsInThisBlock.insert(&I);
}
/// Allow intrinsics to be verified in different ways.
@@ -3709,18 +3764,20 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
SmallVector<Type *, 4> ArgTys;
- Assert(!VerifyIntrinsicType(IFTy->getReturnType(), TableRef, ArgTys),
+ Assert(!Intrinsic::matchIntrinsicType(IFTy->getReturnType(),
+ TableRef, ArgTys),
"Intrinsic has incorrect return type!", IF);
for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
- Assert(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
+ Assert(!Intrinsic::matchIntrinsicType(IFTy->getParamType(i),
+ TableRef, ArgTys),
"Intrinsic has incorrect argument type!", IF);
// Verify if the intrinsic call matches the vararg property.
if (IsVarArg)
- Assert(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+ Assert(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef),
"Intrinsic was not defined with variable arguments!", IF);
else
- Assert(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+ Assert(!Intrinsic::matchIntrinsicVarArg(IsVarArg, TableRef),
"Callsite was not defined with variable arguments!", IF);
// All descriptors should be absorbed by now.
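// A minimal sketch of reusing the signature checking that the verifier now
// reaches through the public Intrinsic::matchIntrinsicType /
// matchIntrinsicVarArg helpers (replacing the static VerifyIntrinsicType /
// VerifyIntrinsicIsVarArg removed above). The wrapper name is hypothetical.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
static bool hasWellFormedIntrinsicSignature(const llvm::Function &IF) {
  using namespace llvm;
  SmallVector<Intrinsic::IITDescriptor, 8> Table;
  Intrinsic::getIntrinsicInfoTableEntries(IF.getIntrinsicID(), Table);
  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
  FunctionType *FTy = IF.getFunctionType();
  SmallVector<Type *, 4> ArgTys;
  // matchIntrinsicType returns true on mismatch, mirroring the Assert(!...)
  // pattern in the hunk above.
  if (Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, ArgTys))
    return false;
  for (Type *ParamTy : FTy->params())
    if (Intrinsic::matchIntrinsicType(ParamTy, TableRef, ArgTys))
      return false;
  // Any remaining descriptor must be a VarArg marker consistent with FTy.
  return !Intrinsic::matchIntrinsicVarArg(FTy->isVarArg(), TableRef);
}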
@@ -3863,7 +3920,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
Assert(CS.getParent()->getParent()->hasGC(),
"Enclosing function does not use GC.", CS);
- VerifyStatepoint(CS);
+ verifyStatepoint(CS);
break;
case Intrinsic::experimental_gc_result: {
Assert(CS.getParent()->getParent()->hasGC(),
@@ -3913,18 +3970,18 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
else {
// In all other cases relocate should be tied to the statepoint directly.
// This covers relocates on a normal return path of invoke statepoint and
- // relocates of a call statepoint
+ // relocates of a call statepoint.
auto Token = CS.getArgOperand(0);
Assert(isa<Instruction>(Token) && isStatepoint(cast<Instruction>(Token)),
"gc relocate is incorrectly tied to the statepoint", CS, Token);
}
- // Verify rest of the relocate arguments
+ // Verify rest of the relocate arguments.
ImmutableCallSite StatepointCS(
cast<GCRelocateInst>(*CS.getInstruction()).getStatepoint());
- // Both the base and derived must be piped through the safepoint
+ // Both the base and derived must be piped through the safepoint.
Value* Base = CS.getArgOperand(1);
Assert(isa<ConstantInt>(Base),
"gc.relocate operand #2 must be integer offset", CS);
@@ -3942,7 +3999,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"gc.relocate: statepoint derived index out of bounds", CS);
// Check that BaseIndex and DerivedIndex fall within the 'gc parameters'
- // section of the statepoint's argument
+ // section of the statepoint's argument.
Assert(StatepointCS.arg_size() > 0,
"gc.statepoint: insufficient arguments");
Assert(isa<ConstantInt>(StatepointCS.getArgument(3)),
@@ -3962,7 +4019,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"gc.statepoint: number of deoptimization arguments must be "
"a constant integer");
const int NumDeoptArgs =
- cast<ConstantInt>(StatepointCS.getArgument(DeoptArgsStart))->getZExtValue();
+ cast<ConstantInt>(StatepointCS.getArgument(DeoptArgsStart))
+ ->getZExtValue();
const int GCParamArgsStart = DeoptArgsStart + 1 + NumDeoptArgs;
const int GCParamArgsEnd = StatepointCS.arg_size();
Assert(GCParamArgsStart <= BaseIndex && BaseIndex < GCParamArgsEnd,
@@ -3985,10 +4043,13 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
auto ResultType = CS.getType();
auto DerivedType = Relocate.getDerivedPtr()->getType();
Assert(ResultType->isVectorTy() == DerivedType->isVectorTy(),
- "gc.relocate: vector relocates to vector and pointer to pointer", CS);
- Assert(ResultType->getPointerAddressSpace() ==
- DerivedType->getPointerAddressSpace(),
- "gc.relocate: relocating a pointer shouldn't change its address space", CS);
+ "gc.relocate: vector relocates to vector and pointer to pointer",
+ CS);
+ Assert(
+ ResultType->getPointerAddressSpace() ==
+ DerivedType->getPointerAddressSpace(),
+ "gc.relocate: relocating a pointer shouldn't change its address space",
+ CS);
break;
}
case Intrinsic::eh_exceptioncode:
@@ -3997,6 +4058,75 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"eh.exceptionpointer argument must be a catchpad", CS);
break;
}
+ case Intrinsic::masked_load: {
+ Assert(CS.getType()->isVectorTy(), "masked_load: must return a vector", CS);
+
+ Value *Ptr = CS.getArgOperand(0);
+ //Value *Alignment = CS.getArgOperand(1);
+ Value *Mask = CS.getArgOperand(2);
+ Value *PassThru = CS.getArgOperand(3);
+ Assert(Mask->getType()->isVectorTy(),
+ "masked_load: mask must be vector", CS);
+
+ // DataTy is the overloaded type
+ Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
+ Assert(DataTy == CS.getType(),
+ "masked_load: return must match pointer type", CS);
+ Assert(PassThru->getType() == DataTy,
+ "masked_load: pass through and data type must match", CS);
+ Assert(Mask->getType()->getVectorNumElements() ==
+ DataTy->getVectorNumElements(),
+ "masked_load: vector mask must be same length as data", CS);
+ break;
+ }
+ case Intrinsic::masked_store: {
+ Value *Val = CS.getArgOperand(0);
+ Value *Ptr = CS.getArgOperand(1);
+ //Value *Alignment = CS.getArgOperand(2);
+ Value *Mask = CS.getArgOperand(3);
+ Assert(Mask->getType()->isVectorTy(),
+ "masked_store: mask must be vector", CS);
+
+ // DataTy is the overloaded type
+ Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
+ Assert(DataTy == Val->getType(),
+ "masked_store: storee must match pointer type", CS);
+ Assert(Mask->getType()->getVectorNumElements() ==
+ DataTy->getVectorNumElements(),
+ "masked_store: vector mask must be same length as data", CS);
+ break;
+ }
+
+ case Intrinsic::experimental_guard: {
+ Assert(CS.isCall(), "experimental_guard cannot be invoked", CS);
+ Assert(CS.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1,
+ "experimental_guard must have exactly one "
+ "\"deopt\" operand bundle");
+ break;
+ }
+
+ case Intrinsic::experimental_deoptimize: {
+ Assert(CS.isCall(), "experimental_deoptimize cannot be invoked", CS);
+ Assert(CS.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1,
+ "experimental_deoptimize must have exactly one "
+ "\"deopt\" operand bundle");
+ Assert(CS.getType() == CS.getInstruction()->getFunction()->getReturnType(),
+ "experimental_deoptimize return type must match caller return type");
+
+ if (CS.isCall()) {
+ auto *DeoptCI = CS.getInstruction();
+ auto *RI = dyn_cast<ReturnInst>(DeoptCI->getNextNode());
+ Assert(RI,
+ "calls to experimental_deoptimize must be followed by a return");
+
+ if (!CS.getType()->isVoidTy() && RI)
+ Assert(RI->getReturnValue() == DeoptCI,
+ "calls to experimental_deoptimize must be followed by a return "
+ "of the value computed by experimental_deoptimize");
+ }
+
+ break;
+ }
};
}
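// A minimal sketch of the call-site shape the new experimental_deoptimize
// checks above require: the call must be immediately followed by a return of
// its own result (or a bare return when the callee returns void). The helper
// name is hypothetical; the logic mirrors the verifier code in the hunk.
#include "llvm/IR/Instructions.h"
static bool isWellFormedDeoptimizeCall(const llvm::CallInst &CI) {
  const auto *RI = llvm::dyn_cast_or_null<llvm::ReturnInst>(CI.getNextNode());
  if (!RI)
    return false;
  return CI.getType()->isVoidTy() || RI->getReturnValue() == &CI;
}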
@@ -4022,13 +4152,13 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
template <class DbgIntrinsicTy>
void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
auto *MD = cast<MetadataAsValue>(DII.getArgOperand(0))->getMetadata();
- Assert(isa<ValueAsMetadata>(MD) ||
+ AssertDI(isa<ValueAsMetadata>(MD) ||
(isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands()),
"invalid llvm.dbg." + Kind + " intrinsic address/value", &DII, MD);
- Assert(isa<DILocalVariable>(DII.getRawVariable()),
+ AssertDI(isa<DILocalVariable>(DII.getRawVariable()),
"invalid llvm.dbg." + Kind + " intrinsic variable", &DII,
DII.getRawVariable());
- Assert(isa<DIExpression>(DII.getRawExpression()),
+ AssertDI(isa<DIExpression>(DII.getRawExpression()),
"invalid llvm.dbg." + Kind + " intrinsic expression", &DII,
DII.getRawExpression());
@@ -4057,8 +4187,7 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
Loc->getScope()->getSubprogram());
}
-template <class MapTy>
-static uint64_t getVariableSize(const DILocalVariable &V, const MapTy &Map) {
+static uint64_t getVariableSize(const DILocalVariable &V) {
// Be careful of broken types (checked elsewhere).
const Metadata *RawType = V.getRawType();
while (RawType) {
@@ -4073,12 +4202,6 @@ static uint64_t getVariableSize(const DILocalVariable &V, const MapTy &Map) {
continue;
}
- if (auto *S = dyn_cast<MDString>(RawType)) {
- // Don't error on missing types (checked elsewhere).
- RawType = Map.lookup(S);
- continue;
- }
-
// Missing type or size.
break;
}
@@ -4087,9 +4210,7 @@ static uint64_t getVariableSize(const DILocalVariable &V, const MapTy &Map) {
return 0;
}
-template <class MapTy>
-void Verifier::verifyBitPieceExpression(const DbgInfoIntrinsic &I,
- const MapTy &TypeRefs) {
+void Verifier::verifyBitPieceExpression(const DbgInfoIntrinsic &I) {
DILocalVariable *V;
DIExpression *E;
if (auto *DVI = dyn_cast<DbgValueInst>(&I)) {
@@ -4120,7 +4241,7 @@ void Verifier::verifyBitPieceExpression(const DbgInfoIntrinsic &I,
// If there's no size, the type is broken, but that should be checked
// elsewhere.
- uint64_t VarSize = getVariableSize(*V, TypeRefs);
+ uint64_t VarSize = getVariableSize(*V);
if (!VarSize)
return;
@@ -4131,54 +4252,29 @@ void Verifier::verifyBitPieceExpression(const DbgInfoIntrinsic &I,
Assert(PieceSize != VarSize, "piece covers entire variable", &I, V, E);
}
-void Verifier::visitUnresolvedTypeRef(const MDString *S, const MDNode *N) {
- // This is in its own function so we get an error for each bad type ref (not
- // just the first).
- Assert(false, "unresolved type ref", S, N);
-}
-
-void Verifier::verifyTypeRefs() {
+void Verifier::verifyCompileUnits() {
auto *CUs = M->getNamedMetadata("llvm.dbg.cu");
- if (!CUs)
- return;
-
- // Visit all the compile units again to map the type references.
- SmallDenseMap<const MDString *, const DIType *, 32> TypeRefs;
- for (auto *CU : CUs->operands())
- if (auto Ts = cast<DICompileUnit>(CU)->getRetainedTypes())
- for (DIType *Op : Ts)
- if (auto *T = dyn_cast_or_null<DICompositeType>(Op))
- if (auto *S = T->getRawIdentifier()) {
- UnresolvedTypeRefs.erase(S);
- TypeRefs.insert(std::make_pair(S, T));
- }
+ SmallPtrSet<const Metadata *, 2> Listed;
+ if (CUs)
+ Listed.insert(CUs->op_begin(), CUs->op_end());
+ Assert(
+ std::all_of(CUVisited.begin(), CUVisited.end(),
+ [&Listed](const Metadata *CU) { return Listed.count(CU); }),
+ "All DICompileUnits must be listed in llvm.dbg.cu");
+ CUVisited.clear();
+}
- // Verify debug info intrinsic bit piece expressions. This needs a second
- // pass through the intructions, since we haven't built TypeRefs yet when
- // verifying functions, and simply queuing the DbgInfoIntrinsics to evaluate
- // later/now would queue up some that could be later deleted.
- for (const Function &F : *M)
- for (const BasicBlock &BB : F)
- for (const Instruction &I : BB)
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I))
- verifyBitPieceExpression(*DII, TypeRefs);
-
- // Return early if all typerefs were resolved.
- if (UnresolvedTypeRefs.empty())
+void Verifier::verifyDeoptimizeCallingConvs() {
+ if (DeoptimizeDeclarations.empty())
return;
- // Sort the unresolved references by name so the output is deterministic.
- typedef std::pair<const MDString *, const MDNode *> TypeRef;
- SmallVector<TypeRef, 32> Unresolved(UnresolvedTypeRefs.begin(),
- UnresolvedTypeRefs.end());
- std::sort(Unresolved.begin(), Unresolved.end(),
- [](const TypeRef &LHS, const TypeRef &RHS) {
- return LHS.first->getString() < RHS.first->getString();
- });
-
- // Visit the unresolved refs (printing out the errors).
- for (const TypeRef &TR : Unresolved)
- visitUnresolvedTypeRef(TR.first, TR.second);
+ const Function *First = DeoptimizeDeclarations[0];
+ for (auto *F : makeArrayRef(DeoptimizeDeclarations).slice(1)) {
+ Assert(First->getCallingConv() == F->getCallingConv(),
+ "All llvm.experimental.deoptimize declarations must have the same "
+ "calling convention",
+ First, F);
+ }
}
//===----------------------------------------------------------------------===//
@@ -4187,28 +4283,30 @@ void Verifier::verifyTypeRefs() {
bool llvm::verifyFunction(const Function &f, raw_ostream *OS) {
Function &F = const_cast<Function &>(f);
- assert(!F.isDeclaration() && "Cannot verify external functions");
- raw_null_ostream NullStr;
- Verifier V(OS ? *OS : NullStr);
+ // Don't use a raw_null_ostream. Printing IR is expensive.
+ Verifier V(OS, /*ShouldTreatBrokenDebugInfoAsError=*/true);
// Note that this function's return value is inverted from what you would
// expect of a function called "verify".
return !V.verify(F);
}
-bool llvm::verifyModule(const Module &M, raw_ostream *OS) {
- raw_null_ostream NullStr;
- Verifier V(OS ? *OS : NullStr);
+bool llvm::verifyModule(const Module &M, raw_ostream *OS,
+ bool *BrokenDebugInfo) {
+ // Don't use a raw_null_ostream. Printing IR is expensive.
+ Verifier V(OS, /*ShouldTreatBrokenDebugInfoAsError=*/!BrokenDebugInfo);
bool Broken = false;
- for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
- if (!I->isDeclaration() && !I->isMaterializable())
- Broken |= !V.verify(*I);
+ for (const Function &F : M)
+ Broken |= !V.verify(F);
+ Broken |= !V.verify(M);
+ if (BrokenDebugInfo)
+ *BrokenDebugInfo = V.hasBrokenDebugInfo();
// Note that this function's return value is inverted from what you would
// expect of a function called "verify".
- return !V.verify(M) || Broken;
+ return Broken;
}
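// A minimal usage sketch for the new verifyModule() overload above: callers
// that want to recover from malformed debug info pass a bool out-parameter
// and strip the metadata themselves instead of rejecting the whole module.
// The wrapper name is hypothetical; StripDebugInfo() is the same entry point
// the legacy pass and the new pass manager path use further down.
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"
static bool verifyOrStripDebugInfo(llvm::Module &M) {
  bool BrokenDebugInfo = false;
  if (llvm::verifyModule(M, &llvm::errs(), &BrokenDebugInfo))
    return false; // The IR itself is broken; nothing to salvage.
  if (BrokenDebugInfo)
    llvm::StripDebugInfo(M); // Only the debug info is malformed; drop it.
  return true;
}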
namespace {
@@ -4216,13 +4314,17 @@ struct VerifierLegacyPass : public FunctionPass {
static char ID;
Verifier V;
- bool FatalErrors;
+ bool FatalErrors = true;
- VerifierLegacyPass() : FunctionPass(ID), V(dbgs()), FatalErrors(true) {
+ VerifierLegacyPass()
+ : FunctionPass(ID),
+ V(&dbgs(), /*ShouldTreatBrokenDebugInfoAsError=*/false) {
initializeVerifierLegacyPassPass(*PassRegistry::getPassRegistry());
}
explicit VerifierLegacyPass(bool FatalErrors)
- : FunctionPass(ID), V(dbgs()), FatalErrors(FatalErrors) {
+ : FunctionPass(ID),
+ V(&dbgs(), /*ShouldTreatBrokenDebugInfoAsError=*/false),
+ FatalErrors(FatalErrors) {
initializeVerifierLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -4234,9 +4336,25 @@ struct VerifierLegacyPass : public FunctionPass {
}
bool doFinalization(Module &M) override {
- if (!V.verify(M) && FatalErrors)
- report_fatal_error("Broken module found, compilation aborted!");
+ bool HasErrors = false;
+ for (Function &F : M)
+ if (F.isDeclaration())
+ HasErrors |= !V.verify(F);
+
+ HasErrors |= !V.verify(M);
+ if (FatalErrors) {
+ if (HasErrors)
+ report_fatal_error("Broken module found, compilation aborted!");
+ assert(!V.hasBrokenDebugInfo() && "Module contains invalid debug info");
+ }
+ // Strip broken debug info.
+ if (V.hasBrokenDebugInfo()) {
+ DiagnosticInfoIgnoringInvalidDebugMetadata DiagInvalid(M);
+ M.getContext().diagnose(DiagInvalid);
+ if (!StripDebugInfo(M))
+ report_fatal_error("Failed to strip malformed debug info");
+ }
return false;
}
@@ -4253,15 +4371,40 @@ FunctionPass *llvm::createVerifierPass(bool FatalErrors) {
return new VerifierLegacyPass(FatalErrors);
}
-PreservedAnalyses VerifierPass::run(Module &M) {
- if (verifyModule(M, &dbgs()) && FatalErrors)
- report_fatal_error("Broken module found, compilation aborted!");
+char VerifierAnalysis::PassID;
+VerifierAnalysis::Result VerifierAnalysis::run(Module &M,
+ ModuleAnalysisManager &) {
+ Result Res;
+ Res.IRBroken = llvm::verifyModule(M, &dbgs(), &Res.DebugInfoBroken);
+ return Res;
+}
+
+VerifierAnalysis::Result VerifierAnalysis::run(Function &F,
+ FunctionAnalysisManager &) {
+ return { llvm::verifyFunction(F, &dbgs()), false };
+}
+PreservedAnalyses VerifierPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto Res = AM.getResult<VerifierAnalysis>(M);
+ if (FatalErrors) {
+ if (Res.IRBroken)
+ report_fatal_error("Broken module found, compilation aborted!");
+ assert(!Res.DebugInfoBroken && "Module contains invalid debug info");
+ }
+
+ // Strip broken debug info.
+ if (Res.DebugInfoBroken) {
+ DiagnosticInfoIgnoringInvalidDebugMetadata DiagInvalid(M);
+ M.getContext().diagnose(DiagInvalid);
+ if (!StripDebugInfo(M))
+ report_fatal_error("Failed to strip malformed debug info");
+ }
return PreservedAnalyses::all();
}
-PreservedAnalyses VerifierPass::run(Function &F) {
- if (verifyFunction(F, &dbgs()) && FatalErrors)
+PreservedAnalyses VerifierPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto res = AM.getResult<VerifierAnalysis>(F);
+ if (res.IRBroken && FatalErrors)
report_fatal_error("Broken function found, compilation aborted!");
return PreservedAnalyses::all();
diff --git a/lib/IR/module.modulemap b/lib/IR/module.modulemap
deleted file mode 100644
index 9698e9178349..000000000000
--- a/lib/IR/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module IR { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/IRReader/Makefile b/lib/IRReader/Makefile
deleted file mode 100644
index cf6bc1135427..000000000000
--- a/lib/IRReader/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/IRReader/Makefile -------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME := LLVMIRReader
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 2edb66ae3ae7..50bf69089ed9 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -31,6 +31,7 @@ subdirectories =
LTO
MC
Object
+ ObjectYAML
Option
Passes
ProfileData
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
index 1c099bb029b4..2007ef21a614 100644
--- a/lib/LTO/CMakeLists.txt
+++ b/lib/LTO/CMakeLists.txt
@@ -1,9 +1,62 @@
+# Figure out if we can track VC revisions.
+function(find_first_existing_file out_var)
+ foreach(file ${ARGN})
+ if(EXISTS "${file}")
+ set(${out_var} "${file}" PARENT_SCOPE)
+ return()
+ endif()
+ endforeach()
+endfunction()
+
+macro(find_first_existing_vc_file out_var path)
+ find_first_existing_file(${out_var}
+ "${path}/.git/logs/HEAD" # Git
+ "${path}/.svn/wc.db" # SVN 1.7
+ "${path}/.svn/entries" # SVN 1.6
+ )
+endmacro()
+
+find_first_existing_vc_file(llvm_vc "${LLVM_MAIN_SRC_DIR}")
+
+# The VC revision include that we want to generate.
+set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/LLVMLTORevision.h")
+
+set(get_svn_script "${LLVM_MAIN_SRC_DIR}/cmake/modules/GenerateVersionFromCVS.cmake")
+
+if(DEFINED llvm_vc)
+ # Create custom target to generate the VC revision include.
+ add_custom_command(OUTPUT "${version_inc}"
+ DEPENDS "${llvm_vc}" "${get_svn_script}"
+ COMMAND
+ ${CMAKE_COMMAND} "-DSOURCE_DIR=${LLVM_MAIN_SRC_DIR}"
+ "-DNAME=LLVM_REVISION"
+ "-DHEADER_FILE=${version_inc}"
+ -P "${get_svn_script}")
+
+ # Mark the generated header as being generated.
+ set_source_files_properties("${version_inc}"
+ PROPERTIES GENERATED TRUE
+ HEADER_FILE_ONLY TRUE)
+
+ # Tell Version.cpp that it needs to build with -DHAVE_SVN_VERSION_INC.
+ set_source_files_properties(Version.cpp
+ PROPERTIES COMPILE_DEFINITIONS "HAVE_SVN_VERSION_INC")
+else()
+ # Not producing a VC revision include.
+ set(version_inc)
+endif()
+
+
add_llvm_library(LLVMLTO
+ LTO.cpp
LTOModule.cpp
LTOCodeGenerator.cpp
+ UpdateCompilerUsed.cpp
+ ThinLTOCodeGenerator.cpp
+ ${version_inc}
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/LTO
- )
+)
add_dependencies(LLVMLTO intrinsics_gen)
diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
index 7a0ad50fb94c..cf0158c8b10e 100644
--- a/lib/LTO/LLVMBuild.txt
+++ b/lib/LTO/LLVMBuild.txt
@@ -34,3 +34,4 @@ required_libraries =
Scalar
Support
Target
+ TransformUtils
\ No newline at end of file
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
new file mode 100644
index 000000000000..10226c4a3ff6
--- /dev/null
+++ b/lib/LTO/LTO.cpp
@@ -0,0 +1,119 @@
+//===-LTO.cpp - LLVM Link Time Optimizer ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions and classes used to support LTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTO.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+// Simple helper to load a module from bitcode
+std::unique_ptr<Module> loadModuleFromBuffer(const MemoryBufferRef &Buffer,
+ LLVMContext &Context, bool Lazy) {
+ SMDiagnostic Err;
+ ErrorOr<std::unique_ptr<Module>> ModuleOrErr(nullptr);
+ if (Lazy) {
+ ModuleOrErr =
+ getLazyBitcodeModule(MemoryBuffer::getMemBuffer(Buffer, false), Context,
+ /* ShouldLazyLoadMetadata */ Lazy);
+ } else {
+ ModuleOrErr = parseBitcodeFile(Buffer, Context);
+ }
+ if (std::error_code EC = ModuleOrErr.getError()) {
+ Err = SMDiagnostic(Buffer.getBufferIdentifier(), SourceMgr::DK_Error,
+ EC.message());
+ Err.print("ThinLTO", errs());
+ report_fatal_error("Can't load module, abort.");
+ }
+ return std::move(ModuleOrErr.get());
+}
+
+static void thinLTOResolveWeakForLinkerGUID(
+ GlobalValueSummaryList &GVSummaryList, GlobalValue::GUID GUID,
+ DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing,
+ function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
+ recordNewLinkage) {
+ for (auto &S : GVSummaryList) {
+ if (GlobalInvolvedWithAlias.count(S.get()))
+ continue;
+ GlobalValue::LinkageTypes OriginalLinkage = S->linkage();
+ if (!GlobalValue::isWeakForLinker(OriginalLinkage))
+ continue;
+ // We need to emit only one of these. The prevailing module will keep it,
+ // but turned into a weak symbol, while the others will drop it when possible.
+ if (isPrevailing(GUID, S.get())) {
+ if (GlobalValue::isLinkOnceLinkage(OriginalLinkage))
+ S->setLinkage(GlobalValue::getWeakLinkage(
+ GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
+ }
+ // Alias can't be turned into available_externally.
+ else if (!isa<AliasSummary>(S.get()) &&
+ (GlobalValue::isLinkOnceODRLinkage(OriginalLinkage) ||
+ GlobalValue::isWeakODRLinkage(OriginalLinkage)))
+ S->setLinkage(GlobalValue::AvailableExternallyLinkage);
+ if (S->linkage() != OriginalLinkage)
+ recordNewLinkage(S->modulePath(), GUID, S->linkage());
+ }
+}
+
+// Resolve Weak and LinkOnce values in the \p Index.
+//
+// We'd like to drop these functions if they are no longer referenced in the
+// current module. However there is a chance that another module is still
+// referencing them because of the import. We make sure we always emit at least
+// one copy.
+void thinLTOResolveWeakForLinkerInIndex(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing,
+ function_ref<void(StringRef, GlobalValue::GUID, GlobalValue::LinkageTypes)>
+ recordNewLinkage) {
+ // We won't optimize the globals that are referenced by an alias for now
+ // Ideally we should turn the alias into a global and duplicate the definition
+ // when needed.
+ DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias;
+ for (auto &I : Index)
+ for (auto &S : I.second)
+ if (auto AS = dyn_cast<AliasSummary>(S.get()))
+ GlobalInvolvedWithAlias.insert(&AS->getAliasee());
+
+ for (auto &I : Index)
+ thinLTOResolveWeakForLinkerGUID(I.second, I.first, GlobalInvolvedWithAlias,
+ isPrevailing, recordNewLinkage);
+}
+
+static void thinLTOInternalizeAndPromoteGUID(
+ GlobalValueSummaryList &GVSummaryList, GlobalValue::GUID GUID,
+ function_ref<bool(StringRef, GlobalValue::GUID)> isExported) {
+ for (auto &S : GVSummaryList) {
+ if (isExported(S->modulePath(), GUID)) {
+ if (GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::ExternalLinkage);
+ } else if (!GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::InternalLinkage);
+ }
+}
+
+// Update the linkages in the given \p Index to mark exported values
+// as external and non-exported values as internal.
+void thinLTOInternalizeAndPromoteInIndex(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(StringRef, GlobalValue::GUID)> isExported) {
+ for (auto &I : Index)
+ thinLTOInternalizeAndPromoteGUID(I.second, I.first, isExported);
+}
+}
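// A hypothetical caller sketch for the index-based helpers added in this new
// file: a ThinLTO driver resolving weak symbols in a combined summary index.
// Both callbacks are placeholders; a real linker supplies its own prevailing
// symbol resolution and records the demoted linkages for later use.
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/LTO/LTO.h"
static void
resolveWeakSymbolsForSingleModuleLink(llvm::ModuleSummaryIndex &Index) {
  using namespace llvm;
  auto IsPrevailing = [](GlobalValue::GUID, const GlobalValueSummary *) {
    return true; // Assume everything prevails in a single-module link.
  };
  auto RecordNewLinkage = [](StringRef /*ModulePath*/, GlobalValue::GUID,
                             GlobalValue::LinkageTypes) {
    // A real driver would remember the new linkage per module here.
  };
  thinLTOResolveWeakForLinkerInIndex(Index, IsPrevailing, RecordNewLinkage);
}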
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 66df23bab1b5..1da2d18b7d1c 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -12,7 +12,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/legacy/LTOCodeGenerator.h"
+
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -23,6 +25,7 @@
#include "llvm/Config/config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
@@ -32,7 +35,8 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
-#include "llvm/LTO/LTOModule.h"
+#include "llvm/LTO/legacy/LTOModule.h"
+#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -51,6 +55,7 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/ObjCARC.h"
#include <system_error>
@@ -64,9 +69,33 @@ const char* LTOCodeGenerator::getVersionString() {
#endif
}
+namespace llvm {
+cl::opt<bool> LTODiscardValueNames(
+ "lto-discard-value-names",
+ cl::desc("Strip names from Value during LTO (other than GlobalValue)."),
+#ifdef NDEBUG
+ cl::init(true),
+#else
+ cl::init(false),
+#endif
+ cl::Hidden);
+
+cl::opt<bool> LTOStripInvalidDebugInfo(
+ "lto-strip-invalid-debug-info",
+ cl::desc("Strip invalid debug info metadata during LTO instead of aborting."),
+#ifdef NDEBUG
+ cl::init(true),
+#else
+ cl::init(false),
+#endif
+ cl::Hidden);
+}
+
LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context)
: Context(Context), MergedModule(new Module("ld-temp.o", Context)),
TheLinker(new Linker(*MergedModule)) {
+ Context.setDiscardValueNames(LTODiscardValueNames);
+ Context.enableDebugTypeODRUniquing();
initializeLTOPasses();
}
@@ -78,28 +107,26 @@ LTOCodeGenerator::~LTOCodeGenerator() {}
void LTOCodeGenerator::initializeLTOPasses() {
PassRegistry &R = *PassRegistry::getPassRegistry();
- initializeInternalizePassPass(R);
- initializeIPSCCPPass(R);
- initializeGlobalOptPass(R);
- initializeConstantMergePass(R);
+ initializeInternalizeLegacyPassPass(R);
+ initializeIPSCCPLegacyPassPass(R);
+ initializeGlobalOptLegacyPassPass(R);
+ initializeConstantMergeLegacyPassPass(R);
initializeDAHPass(R);
initializeInstructionCombiningPassPass(R);
initializeSimpleInlinerPass(R);
initializePruneEHPass(R);
- initializeGlobalDCEPass(R);
+ initializeGlobalDCELegacyPassPass(R);
initializeArgPromotionPass(R);
initializeJumpThreadingPass(R);
initializeSROALegacyPassPass(R);
- initializeSROA_DTPass(R);
- initializeSROA_SSAUpPass(R);
- initializePostOrderFunctionAttrsPass(R);
- initializeReversePostOrderFunctionAttrsPass(R);
+ initializePostOrderFunctionAttrsLegacyPassPass(R);
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(R);
initializeGlobalsAAWrapperPassPass(R);
- initializeLICMPass(R);
- initializeMergedLoadStoreMotionPass(R);
- initializeGVNPass(R);
- initializeMemCpyOptPass(R);
- initializeDCEPass(R);
+ initializeLegacyLICMPassPass(R);
+ initializeMergedLoadStoreMotionLegacyPassPass(R);
+ initializeGVNLegacyPassPass(R);
+ initializeMemCpyOptLegacyPassPass(R);
+ initializeDCELegacyPassPass(R);
initializeCFGSimplifyPassPass(R);
}
@@ -113,6 +140,9 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) {
for (int i = 0, e = undefs.size(); i != e; ++i)
AsmUndefinedRefs[undefs[i]] = 1;
+ // We've just changed the input, so let's make sure we verify it.
+ HasVerifiedInput = false;
+
return !ret;
}
@@ -128,9 +158,12 @@ void LTOCodeGenerator::setModule(std::unique_ptr<LTOModule> Mod) {
const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs();
for (int I = 0, E = Undefs.size(); I != E; ++I)
AsmUndefinedRefs[Undefs[I]] = 1;
+
+ // We've just changed the input, so let's make sure we verify it.
+ HasVerifiedInput = false;
}
-void LTOCodeGenerator::setTargetOptions(TargetOptions Options) {
+void LTOCodeGenerator::setTargetOptions(const TargetOptions &Options) {
this->Options = Options;
}
@@ -169,6 +202,9 @@ bool LTOCodeGenerator::writeMergedModules(const char *Path) {
if (!determineTarget())
return false;
+ // We always run the verifier once on the merged module.
+ verifyMergedModuleOnce();
+
// mark which symbols can not be internalized
applyScopeRestrictions();
@@ -281,7 +317,7 @@ bool LTOCodeGenerator::determineTarget() {
if (TargetMach)
return true;
- std::string TripleStr = MergedModule->getTargetTriple();
+ TripleStr = MergedModule->getTargetTriple();
if (TripleStr.empty()) {
TripleStr = sys::getDefaultTargetTriple();
MergedModule->setTargetTriple(TripleStr);
@@ -290,8 +326,8 @@ bool LTOCodeGenerator::determineTarget() {
// create target machine from info for merged modules
std::string ErrMsg;
- const Target *march = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
- if (!march) {
+ MArch = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
+ if (!MArch) {
emitError(ErrMsg);
return false;
}
@@ -311,147 +347,167 @@ bool LTOCodeGenerator::determineTarget() {
MCpu = "cyclone";
}
- TargetMach.reset(march->createTargetMachine(TripleStr, MCpu, FeatureStr,
- Options, RelocModel,
- CodeModel::Default, CGOptLevel));
+ TargetMach = createTargetMachine();
return true;
}
-void LTOCodeGenerator::
-applyRestriction(GlobalValue &GV,
- ArrayRef<StringRef> Libcalls,
- std::vector<const char*> &MustPreserveList,
- SmallPtrSetImpl<GlobalValue*> &AsmUsed,
- Mangler &Mangler) {
- // There are no restrictions to apply to declarations.
- if (GV.isDeclaration())
- return;
+std::unique_ptr<TargetMachine> LTOCodeGenerator::createTargetMachine() {
+ return std::unique_ptr<TargetMachine>(
+ MArch->createTargetMachine(TripleStr, MCpu, FeatureStr, Options,
+ RelocModel, CodeModel::Default, CGOptLevel));
+}
- // There is nothing more restrictive than private linkage.
- if (GV.hasPrivateLinkage())
+// If a linkonce global is present in the MustPreserveSymbols, we need to make
+// sure we honor this. To force the compiler to not drop it, we add it to the
+// "llvm.compiler.used" global.
+void LTOCodeGenerator::preserveDiscardableGVs(
+ Module &TheModule,
+ llvm::function_ref<bool(const GlobalValue &)> mustPreserveGV) {
+ SetVector<Constant *> UsedValuesSet;
+ if (GlobalVariable *LLVMUsed =
+ TheModule.getGlobalVariable("llvm.compiler.used")) {
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+ for (auto &V : Inits->operands())
+ UsedValuesSet.insert(cast<Constant>(&V));
+ LLVMUsed->eraseFromParent();
+ }
+ llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(TheModule.getContext());
+ auto mayPreserveGlobal = [&](GlobalValue &GV) {
+ if (!GV.isDiscardableIfUnused() || GV.isDeclaration())
+ return;
+ if (!mustPreserveGV(GV))
+ return;
+ if (GV.hasAvailableExternallyLinkage()) {
+ emitWarning(
+ (Twine("Linker asked to preserve available_externally global: '") +
+ GV.getName() + "'").str());
+ return;
+ }
+ if (GV.hasInternalLinkage()) {
+ emitWarning((Twine("Linker asked to preserve internal global: '") +
+ GV.getName() + "'").str());
+ return;
+ }
+ UsedValuesSet.insert(ConstantExpr::getBitCast(&GV, i8PTy));
+ };
+ for (auto &GV : TheModule)
+ mayPreserveGlobal(GV);
+ for (auto &GV : TheModule.globals())
+ mayPreserveGlobal(GV);
+ for (auto &GV : TheModule.aliases())
+ mayPreserveGlobal(GV);
+
+ if (UsedValuesSet.empty())
return;
- SmallString<64> Buffer;
- TargetMach->getNameWithPrefix(Buffer, &GV, Mangler);
-
- if (MustPreserveSymbols.count(Buffer))
- MustPreserveList.push_back(GV.getName().data());
- if (AsmUndefinedRefs.count(Buffer))
- AsmUsed.insert(&GV);
-
- // Conservatively append user-supplied runtime library functions to
- // llvm.compiler.used. These could be internalized and deleted by
- // optimizations like -globalopt, causing problems when later optimizations
- // add new library calls (e.g., llvm.memset => memset and printf => puts).
- // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
- if (isa<Function>(GV) &&
- std::binary_search(Libcalls.begin(), Libcalls.end(), GV.getName()))
- AsmUsed.insert(&GV);
+ llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, UsedValuesSet.size());
+ auto *LLVMUsed = new llvm::GlobalVariable(
+ TheModule, ATy, false, llvm::GlobalValue::AppendingLinkage,
+ llvm::ConstantArray::get(ATy, UsedValuesSet.getArrayRef()),
+ "llvm.compiler.used");
+ LLVMUsed->setSection("llvm.metadata");
}
-static void findUsedValues(GlobalVariable *LLVMUsed,
- SmallPtrSetImpl<GlobalValue*> &UsedValues) {
- if (!LLVMUsed) return;
+void LTOCodeGenerator::applyScopeRestrictions() {
+ if (ScopeRestrictionsDone)
+ return;
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
- for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
- if (GlobalValue *GV =
- dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
- UsedValues.insert(GV);
-}
+ // Declare a callback for the internalize pass that will ask for every
+ // candidate GlobalValue if it can be internalized or not.
+ SmallString<64> MangledName;
+ auto mustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // Unnamed globals can't be mangled, but they can't be preserved either.
+ if (!GV.hasName())
+ return false;
+
+ // Need to mangle the GV as the "MustPreserveSymbols" StringSet is filled
+ // with the linker supplied name, which on Darwin includes a leading
+ // underscore.
+ MangledName.clear();
+ MangledName.reserve(GV.getName().size() + 1);
+ Mangler::getNameWithPrefix(MangledName, GV.getName(),
+ MergedModule->getDataLayout());
+ return MustPreserveSymbols.count(MangledName);
+ };
+
+ // Preserve linkonce value on linker request
+ preserveDiscardableGVs(*MergedModule, mustPreserveGV);
+
+ if (!ShouldInternalize)
+ return;
-// Collect names of runtime library functions. User-defined functions with the
-// same names are added to llvm.compiler.used to prevent them from being
-// deleted by optimizations.
-static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
- const TargetLibraryInfo& TLI,
- const Module &Mod,
- const TargetMachine &TM) {
- // TargetLibraryInfo has info on C runtime library calls on the current
- // target.
- for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
- I != E; ++I) {
- LibFunc::Func F = static_cast<LibFunc::Func>(I);
- if (TLI.has(F))
- Libcalls.push_back(TLI.getName(F));
+ if (ShouldRestoreGlobalsLinkage) {
+ // Record the linkage type of non-local symbols so they can be restored
+ // prior to module splitting.
+ auto RecordLinkage = [&](const GlobalValue &GV) {
+ if (!GV.hasAvailableExternallyLinkage() && !GV.hasLocalLinkage() &&
+ GV.hasName())
+ ExternalSymbols.insert(std::make_pair(GV.getName(), GV.getLinkage()));
+ };
+ for (auto &GV : *MergedModule)
+ RecordLinkage(GV);
+ for (auto &GV : MergedModule->globals())
+ RecordLinkage(GV);
+ for (auto &GV : MergedModule->aliases())
+ RecordLinkage(GV);
}
- SmallPtrSet<const TargetLowering *, 1> TLSet;
+ // Update the llvm.compiler.used global to force preserving libcalls and
+ // symbols referenced from asm.
+ updateCompilerUsed(*MergedModule, *TargetMach, AsmUndefinedRefs);
- for (const Function &F : Mod) {
- const TargetLowering *Lowering =
- TM.getSubtargetImpl(F)->getTargetLowering();
-
- if (Lowering && TLSet.insert(Lowering).second)
- // TargetLowering has info on library calls that CodeGen expects to be
- // available, both from the C runtime and compiler-rt.
- for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
- I != E; ++I)
- if (const char *Name =
- Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
- Libcalls.push_back(Name);
- }
+ internalizeModule(*MergedModule, mustPreserveGV);
- array_pod_sort(Libcalls.begin(), Libcalls.end());
- Libcalls.erase(std::unique(Libcalls.begin(), Libcalls.end()),
- Libcalls.end());
+ ScopeRestrictionsDone = true;
}
-void LTOCodeGenerator::applyScopeRestrictions() {
- if (ScopeRestrictionsDone || !ShouldInternalize)
+/// Restore original linkage for symbols that may have been internalized
+void LTOCodeGenerator::restoreLinkageForExternals() {
+ if (!ShouldInternalize || !ShouldRestoreGlobalsLinkage)
return;
- // Start off with a verification pass.
- legacy::PassManager passes;
- passes.add(createVerifierPass());
+ assert(ScopeRestrictionsDone &&
+ "Cannot externalize without internalization!");
- // mark which symbols can not be internalized
- Mangler Mangler;
- std::vector<const char*> MustPreserveList;
- SmallPtrSet<GlobalValue*, 8> AsmUsed;
- std::vector<StringRef> Libcalls;
- TargetLibraryInfoImpl TLII(Triple(TargetMach->getTargetTriple()));
- TargetLibraryInfo TLI(TLII);
-
- accumulateAndSortLibcalls(Libcalls, TLI, *MergedModule, *TargetMach);
-
- for (Function &f : *MergedModule)
- applyRestriction(f, Libcalls, MustPreserveList, AsmUsed, Mangler);
- for (GlobalVariable &v : MergedModule->globals())
- applyRestriction(v, Libcalls, MustPreserveList, AsmUsed, Mangler);
- for (GlobalAlias &a : MergedModule->aliases())
- applyRestriction(a, Libcalls, MustPreserveList, AsmUsed, Mangler);
-
- GlobalVariable *LLVMCompilerUsed =
- MergedModule->getGlobalVariable("llvm.compiler.used");
- findUsedValues(LLVMCompilerUsed, AsmUsed);
- if (LLVMCompilerUsed)
- LLVMCompilerUsed->eraseFromParent();
-
- if (!AsmUsed.empty()) {
- llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(Context);
- std::vector<Constant*> asmUsed2;
- for (auto *GV : AsmUsed) {
- Constant *c = ConstantExpr::getBitCast(GV, i8PTy);
- asmUsed2.push_back(c);
- }
+ if (ExternalSymbols.empty())
+ return;
- llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size());
- LLVMCompilerUsed =
- new llvm::GlobalVariable(*MergedModule, ATy, false,
- llvm::GlobalValue::AppendingLinkage,
- llvm::ConstantArray::get(ATy, asmUsed2),
- "llvm.compiler.used");
+ auto externalize = [this](GlobalValue &GV) {
+ if (!GV.hasLocalLinkage() || !GV.hasName())
+ return;
- LLVMCompilerUsed->setSection("llvm.metadata");
- }
+ auto I = ExternalSymbols.find(GV.getName());
+ if (I == ExternalSymbols.end())
+ return;
- passes.add(createInternalizePass(MustPreserveList));
+ GV.setLinkage(I->second);
+ };
- // apply scope restrictions
- passes.run(*MergedModule);
+ std::for_each(MergedModule->begin(), MergedModule->end(), externalize);
+ std::for_each(MergedModule->global_begin(), MergedModule->global_end(),
+ externalize);
+ std::for_each(MergedModule->alias_begin(), MergedModule->alias_end(),
+ externalize);
+}
- ScopeRestrictionsDone = true;
+void LTOCodeGenerator::verifyMergedModuleOnce() {
+ // Only run on the first call.
+ if (HasVerifiedInput)
+ return;
+ HasVerifiedInput = true;
+
+ if (LTOStripInvalidDebugInfo) {
+ bool BrokenDebugInfo = false;
+ if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
+ report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ emitWarning("Invalid debug info found, debug info will be stripped");
+ StripDebugInfo(*MergedModule);
+ }
+ }
+ if (verifyModule(*MergedModule, &dbgs()))
+ report_fatal_error("Broken module found, compilation aborted!");
}
/// Optimize merged modules using various IPO passes
@@ -461,6 +517,10 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
if (!this->determineTarget())
return false;
+ // We always run the verifier once on the merged module, the `DisableVerify`
+ // parameter only applies to subsequent verify.
+ verifyMergedModuleOnce();
+
// Mark which symbols can not be internalized
this->applyScopeRestrictions();
@@ -497,6 +557,10 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
if (!this->determineTarget())
return false;
+ // We always run the verifier once on the merged module. If it has already
+ // been called in optimize(), this call will return early.
+ verifyMergedModuleOnce();
+
legacy::PassManager preCodeGenPasses;
// If the bitcode files contain ARC code and were compiled with optimization,
@@ -504,14 +568,22 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
preCodeGenPasses.add(createObjCARCContractPass());
preCodeGenPasses.run(*MergedModule);
+ // Re-externalize globals that may have been internalized to increase scope
+ // for splitting
+ restoreLinkageForExternals();
+
// Do code generation. We need to preserve the module in case the client calls
// writeMergedModules() after compilation, but we only need to allow this at
// parallelism level 1. This is achieved by having splitCodeGen return the
// original module at parallelism level 1 which we then assign back to
// MergedModule.
- MergedModule =
- splitCodeGen(std::move(MergedModule), Out, MCpu, FeatureStr, Options,
- RelocModel, CodeModel::Default, CGOptLevel, FileType);
+ MergedModule = splitCodeGen(std::move(MergedModule), Out, {},
+ [&]() { return createTargetMachine(); }, FileType,
+ ShouldRestoreGlobalsLinkage);
+
+ // If statistics were requested, print them out after codegen.
+ if (llvm::AreStatisticsEnabled())
+ llvm::PrintStatistics();
return true;
}
@@ -599,3 +671,10 @@ void LTOCodeGenerator::emitError(const std::string &ErrMsg) {
else
Context.diagnose(LTODiagnosticInfo(ErrMsg));
}
+
+void LTOCodeGenerator::emitWarning(const std::string &ErrMsg) {
+ if (DiagHandler)
+ (*DiagHandler)(LTO_DS_WARNING, ErrMsg.c_str(), DiagContext);
+ else
+ Context.diagnose(LTODiagnosticInfo(ErrMsg, DS_Warning));
+}
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 409b94902332..a1d6f93f0875 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LTO/LTOModule.h"
+#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/CodeGen/Analysis.h"
@@ -26,14 +26,13 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -54,11 +53,6 @@ LTOModule::LTOModule(std::unique_ptr<object::IRObjectFile> Obj,
llvm::TargetMachine *TM)
: IRFile(std::move(Obj)), _target(TM) {}
-LTOModule::LTOModule(std::unique_ptr<object::IRObjectFile> Obj,
- llvm::TargetMachine *TM,
- std::unique_ptr<LLVMContext> Context)
- : OwnedContext(std::move(Context)), IRFile(std::move(Obj)), _target(TM) {}
-
LTOModule::~LTOModule() {}
/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
@@ -80,6 +74,18 @@ bool LTOModule::isBitcodeFile(const char *Path) {
return bool(BCData);
}
+bool LTOModule::isThinLTO() {
+ // Right now the detection is only based on the summary presence. We may want
+ // to add a dedicated flag at some point.
+ return hasGlobalValueSummary(IRFile->getMemoryBufferRef(),
+ [](const DiagnosticInfo &DI) {
+ DiagnosticPrinterRawOStream DP(errs());
+ DI.print(DP);
+ errs() << '\n';
+ return;
+ });
+}
+
bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer,
StringRef TriplePrefix) {
ErrorOr<MemoryBufferRef> BCOrErr =
@@ -102,53 +108,61 @@ std::string LTOModule::getProducerString(MemoryBuffer *Buffer) {
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::createFromFile(LLVMContext &Context, const char *path,
- TargetOptions options) {
+ const TargetOptions &options) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getFile(path);
- if (std::error_code EC = BufferOrErr.getError())
+ if (std::error_code EC = BufferOrErr.getError()) {
+ Context.emitError(EC.message());
return EC;
+ }
std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
- return makeLTOModule(Buffer->getMemBufferRef(), options, &Context);
+ return makeLTOModule(Buffer->getMemBufferRef(), options, Context,
+ /* ShouldBeLazy*/ false);
}
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::createFromOpenFile(LLVMContext &Context, int fd, const char *path,
- size_t size, TargetOptions options) {
+ size_t size, const TargetOptions &options) {
return createFromOpenFileSlice(Context, fd, path, size, 0, options);
}
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::createFromOpenFileSlice(LLVMContext &Context, int fd,
const char *path, size_t map_size,
- off_t offset, TargetOptions options) {
+ off_t offset, const TargetOptions &options) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset);
- if (std::error_code EC = BufferOrErr.getError())
+ if (std::error_code EC = BufferOrErr.getError()) {
+ Context.emitError(EC.message());
return EC;
+ }
std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
- return makeLTOModule(Buffer->getMemBufferRef(), options, &Context);
+ return makeLTOModule(Buffer->getMemBufferRef(), options, Context,
+ /* ShouldBeLazy */ false);
}
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::createFromBuffer(LLVMContext &Context, const void *mem,
- size_t length, TargetOptions options,
+ size_t length, const TargetOptions &options,
StringRef path) {
- return createInContext(mem, length, options, path, &Context);
-}
-
-ErrorOr<std::unique_ptr<LTOModule>>
-LTOModule::createInLocalContext(const void *mem, size_t length,
- TargetOptions options, StringRef path) {
- return createInContext(mem, length, options, path, nullptr);
+ StringRef Data((const char *)mem, length);
+ MemoryBufferRef Buffer(Data, path);
+ return makeLTOModule(Buffer, options, Context, /* ShouldBeLazy */ false);
}
ErrorOr<std::unique_ptr<LTOModule>>
-LTOModule::createInContext(const void *mem, size_t length,
- TargetOptions options, StringRef path,
- LLVMContext *Context) {
+LTOModule::createInLocalContext(std::unique_ptr<LLVMContext> Context,
+ const void *mem, size_t length,
+ const TargetOptions &options, StringRef path) {
StringRef Data((const char *)mem, length);
MemoryBufferRef Buffer(Data, path);
- return makeLTOModule(Buffer, options, Context);
+ // If we own a context, we know this is being used only for symbol extraction,
+ // not linking. Be lazy in that case.
+ ErrorOr<std::unique_ptr<LTOModule>> Ret =
+ makeLTOModule(Buffer, options, *Context, /* ShouldBeLazy */ true);
+ if (Ret)
+ (*Ret)->OwnedContext = std::move(Context);
+ return Ret;
}
static ErrorOr<std::unique_ptr<Module>>
@@ -158,8 +172,10 @@ parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context,
// Find the buffer.
ErrorOr<MemoryBufferRef> MBOrErr =
IRObjectFile::findBitcodeInMemBuffer(Buffer);
- if (std::error_code EC = MBOrErr.getError())
+ if (std::error_code EC = MBOrErr.getError()) {
+ Context.emitError(EC.message());
return EC;
+ }
if (!ShouldBeLazy) {
// Parse the full file.
@@ -180,19 +196,10 @@ parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context,
}
ErrorOr<std::unique_ptr<LTOModule>>
-LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options,
- LLVMContext *Context) {
- std::unique_ptr<LLVMContext> OwnedContext;
- if (!Context) {
- OwnedContext = llvm::make_unique<LLVMContext>();
- Context = OwnedContext.get();
- }
-
- // If we own a context, we know this is being used only for symbol
- // extraction, not linking. Be lazy in that case.
+LTOModule::makeLTOModule(MemoryBufferRef Buffer, const TargetOptions &options,
+ LLVMContext &Context, bool ShouldBeLazy) {
ErrorOr<std::unique_ptr<Module>> MOrErr =
- parseBitcodeFileImpl(Buffer, *Context,
- /* ShouldBeLazy */ static_cast<bool>(OwnedContext));
+ parseBitcodeFileImpl(Buffer, Context, ShouldBeLazy);
if (std::error_code EC = MOrErr.getError())
return EC;
std::unique_ptr<Module> &M = *MOrErr;
@@ -223,19 +230,14 @@ LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options,
CPU = "cyclone";
}
- TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
- options);
+ TargetMachine *target =
+ march->createTargetMachine(TripleStr, CPU, FeatureStr, options, None);
M->setDataLayout(target->createDataLayout());
std::unique_ptr<object::IRObjectFile> IRObj(
new object::IRObjectFile(Buffer, std::move(M)));
- std::unique_ptr<LTOModule> Ret;
- if (OwnedContext)
- Ret.reset(new LTOModule(std::move(IRObj), target, std::move(OwnedContext)));
- else
- Ret.reset(new LTOModule(std::move(IRObj), target));
-
+ std::unique_ptr<LTOModule> Ret(new LTOModule(std::move(IRObj), target));
Ret->parseSymbols();
Ret->parseMetadata();
diff --git a/lib/LTO/Makefile b/lib/LTO/Makefile
deleted file mode 100644
index 55e2a5ef8036..000000000000
--- a/lib/LTO/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/LTO/Makefile ------------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMLTO
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
new file mode 100644
index 000000000000..bfb0980bc165
--- /dev/null
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -0,0 +1,835 @@
+//===-ThinLTOCodeGenerator.cpp - LLVM Link Time Optimizer -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Thin Link Time Optimization library. This library
+// is intended to be used by linkers to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"
+
+#ifdef HAVE_LLVM_REVISION
+#include "LLVMLTORevision.h"
+#endif
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/LTO/LTO.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
+#include "llvm/Support/CachePruning.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SHA1.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/ObjCARC.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+
+#include <numeric>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "thinlto"
+
+namespace llvm {
+// The -discard-value-names flag, defined in LTOCodeGenerator.cpp
+extern cl::opt<bool> LTODiscardValueNames;
+}
+
+namespace {
+
+static cl::opt<int> ThreadCount("threads",
+ cl::init(std::thread::hardware_concurrency()));
+
+static void diagnosticHandler(const DiagnosticInfo &DI) {
+ DiagnosticPrinterRawOStream DP(errs());
+ DI.print(DP);
+ errs() << '\n';
+}
+
+// Simple helper to save temporary files for debugging.
+static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
+ unsigned count, StringRef Suffix) {
+ if (TempDir.empty())
+ return;
+  // The user asked to save temps; dump the bitcode file for this stage.
+ auto SaveTempPath = TempDir + llvm::utostr(count) + Suffix;
+ std::error_code EC;
+ raw_fd_ostream OS(SaveTempPath.str(), EC, sys::fs::F_None);
+ if (EC)
+ report_fatal_error(Twine("Failed to open ") + SaveTempPath +
+ " to save optimized bitcode\n");
+ WriteBitcodeToFile(&TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
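+// Given the summary list for a single GUID, pick the definition the linker
+// would keep. For example, if the list holds an available_externally copy, a
+// linkonce_odr copy and an external (strong) copy, the strong copy is
+// returned; if every copy is available_externally (e.g. an extern template),
+// nullptr is returned.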
+static const GlobalValueSummary *
+getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) {
+ // If there is any strong definition anywhere, get it.
+ auto StrongDefForLinker = llvm::find_if(
+ GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+ auto Linkage = Summary->linkage();
+ return !GlobalValue::isAvailableExternallyLinkage(Linkage) &&
+ !GlobalValue::isWeakForLinker(Linkage);
+ });
+ if (StrongDefForLinker != GVSummaryList.end())
+ return StrongDefForLinker->get();
+ // Get the first *linker visible* definition for this global in the summary
+ // list.
+ auto FirstDefForLinker = llvm::find_if(
+ GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+ auto Linkage = Summary->linkage();
+ return !GlobalValue::isAvailableExternallyLinkage(Linkage);
+ });
+ // Extern templates can be emitted as available_externally.
+ if (FirstDefForLinker == GVSummaryList.end())
+ return nullptr;
+ return FirstDefForLinker->get();
+}
+
+// Populate a map from GUID to the prevailing copy for any multiply defined
+// symbol. We currently assume the first copy is prevailing, unless there is a
+// strong definition; this can be refined with linker information in the future.
+static void computePrevailingCopies(
+ const ModuleSummaryIndex &Index,
+ DenseMap<GlobalValue::GUID, const GlobalValueSummary *> &PrevailingCopy) {
+ auto HasMultipleCopies = [&](const GlobalValueSummaryList &GVSummaryList) {
+ return GVSummaryList.size() > 1;
+ };
+
+ for (auto &I : Index) {
+ if (HasMultipleCopies(I.second))
+ PrevailingCopy[I.first] = getFirstDefinitionForLinker(I.second);
+ }
+}
+
+static StringMap<MemoryBufferRef>
+generateModuleMap(const std::vector<MemoryBufferRef> &Modules) {
+ StringMap<MemoryBufferRef> ModuleMap;
+ for (auto &ModuleBuffer : Modules) {
+ assert(ModuleMap.find(ModuleBuffer.getBufferIdentifier()) ==
+ ModuleMap.end() &&
+ "Expect unique Buffer Identifier");
+ ModuleMap[ModuleBuffer.getBufferIdentifier()] = ModuleBuffer;
+ }
+ return ModuleMap;
+}
+
+static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index) {
+ if (renameModuleForThinLTO(TheModule, Index))
+ report_fatal_error("renameModuleForThinLTO failed");
+}
+
+static void
+crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
+ StringMap<MemoryBufferRef> &ModuleMap,
+ const FunctionImporter::ImportMapTy &ImportList) {
+ ModuleLoader Loader(TheModule.getContext(), ModuleMap);
+ FunctionImporter Importer(Index, Loader);
+ Importer.importFunctions(TheModule, ImportList);
+}
+
+static void optimizeModule(Module &TheModule, TargetMachine &TM) {
+ // Populate the PassManager
+ PassManagerBuilder PMB;
+ PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple());
+ PMB.Inliner = createFunctionInliningPass();
+ // FIXME: should get it from the bitcode?
+ PMB.OptLevel = 3;
+ PMB.LoopVectorize = true;
+ PMB.SLPVectorize = true;
+ PMB.VerifyInput = true;
+ PMB.VerifyOutput = false;
+
+ legacy::PassManager PM;
+
+ // Add the TTI (required to inform the vectorizer about register size for
+ // instance)
+ PM.add(createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis()));
+
+ // Add optimizations
+ PMB.populateThinLTOPassManager(PM);
+
+ PM.run(TheModule);
+}
+
+// Convert the PreservedSymbols map from "Name" based to "GUID" based.
+static DenseSet<GlobalValue::GUID>
+computeGUIDPreservedSymbols(const StringSet<> &PreservedSymbols,
+ const Triple &TheTriple) {
+ DenseSet<GlobalValue::GUID> GUIDPreservedSymbols(PreservedSymbols.size());
+ for (auto &Entry : PreservedSymbols) {
+ StringRef Name = Entry.first();
+ if (TheTriple.isOSBinFormatMachO() && Name.size() > 0 && Name[0] == '_')
+ Name = Name.drop_front();
+ GUIDPreservedSymbols.insert(GlobalValue::getGUID(Name));
+ }
+ return GUIDPreservedSymbols;
+}
+
+std::unique_ptr<MemoryBuffer> codegenModule(Module &TheModule,
+ TargetMachine &TM) {
+ SmallVector<char, 128> OutputBuffer;
+
+ // CodeGen
+ {
+ raw_svector_ostream OS(OutputBuffer);
+ legacy::PassManager PM;
+
+ // If the bitcode files contain ARC code and were compiled with optimization,
+ // the ObjCARCContractPass must be run, so do it unconditionally here.
+ PM.add(createObjCARCContractPass());
+
+ // Setup the codegen now.
+ if (TM.addPassesToEmitFile(PM, OS, TargetMachine::CGFT_ObjectFile,
+ /* DisableVerify */ true))
+ report_fatal_error("Failed to setup codegen");
+
+    // Run codegen now. The resulting binary is in OutputBuffer.
+ PM.run(TheModule);
+ }
+ return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
+}
+
+/// Manage caching for a single Module.
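+///
+/// Rough usage, mirroring the driver loop in run() below:
+///   ModuleCacheEntry CacheEntry(CachePath, Index, ModuleID, ImportList,
+///                               ExportList, ResolvedODR, DefinedFunctions,
+///                               PreservedSymbols);
+///   if (auto Buffer = CacheEntry.tryLoadingBuffer())
+///     return std::move(*Buffer);                      // cache hit
+///   auto Output = ProcessThinLTOModule(...);          // cache miss
+///   Output = CacheEntry.write(std::move(Output));     // populate the cache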
+class ModuleCacheEntry {
+ SmallString<128> EntryPath;
+
+public:
+  // Create a cache entry. This computes a unique hash for the Module, taking
+  // into account the current export/import lists, and offers an interface to
+  // query and access the cached content.
+ ModuleCacheEntry(
+ StringRef CachePath, const ModuleSummaryIndex &Index, StringRef ModuleID,
+ const FunctionImporter::ImportMapTy &ImportList,
+ const FunctionImporter::ExportSetTy &ExportList,
+ const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+ const GVSummaryMapTy &DefinedFunctions,
+ const DenseSet<GlobalValue::GUID> &PreservedSymbols) {
+ if (CachePath.empty())
+ return;
+
+ // Compute the unique hash for this entry
+ // This is based on the current compiler version, the module itself, the
+ // export list, the hash for every single module in the import list, the
+ // list of ResolvedODR for the module, and the list of preserved symbols.
+
+ SHA1 Hasher;
+
+ // Start with the compiler revision
+ Hasher.update(LLVM_VERSION_STRING);
+#ifdef HAVE_LLVM_REVISION
+ Hasher.update(LLVM_REVISION);
+#endif
+
+ // Include the hash for the current module
+ auto ModHash = Index.getModuleHash(ModuleID);
+ Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+ for (auto F : ExportList)
+      // The export list can impact internalization; be conservative here.
+ Hasher.update(ArrayRef<uint8_t>((uint8_t *)&F, sizeof(F)));
+
+ // Include the hash for every module we import functions from
+ for (auto &Entry : ImportList) {
+ auto ModHash = Index.getModuleHash(Entry.first());
+ Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+ }
+
+ // Include the hash for the resolved ODR.
+ for (auto &Entry : ResolvedODR) {
+ Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.first,
+ sizeof(GlobalValue::GUID)));
+ Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&Entry.second,
+ sizeof(GlobalValue::LinkageTypes)));
+ }
+
+ // Include the hash for the preserved symbols.
+ for (auto &Entry : PreservedSymbols) {
+ if (DefinedFunctions.count(Entry))
+ Hasher.update(
+ ArrayRef<uint8_t>((const uint8_t *)&Entry, sizeof(GlobalValue::GUID)));
+ }
+
+ sys::path::append(EntryPath, CachePath, toHex(Hasher.result()));
+ }
+
+ // Access the path to this entry in the cache.
+ StringRef getEntryPath() { return EntryPath; }
+
+ // Try loading the buffer for this cache entry.
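+  // A failure here simply means "cache miss": either caching is disabled
+  // (empty EntryPath) or the entry has not been produced yet.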
+ ErrorOr<std::unique_ptr<MemoryBuffer>> tryLoadingBuffer() {
+ if (EntryPath.empty())
+ return std::error_code();
+ return MemoryBuffer::getFile(EntryPath);
+ }
+
+  // Cache the produced object file.
+ std::unique_ptr<MemoryBuffer>
+ write(std::unique_ptr<MemoryBuffer> OutputBuffer) {
+ if (EntryPath.empty())
+ return OutputBuffer;
+
+    // Write to a temporary file first to avoid a race condition.
+ SmallString<128> TempFilename;
+ int TempFD;
+ std::error_code EC =
+ sys::fs::createTemporaryFile("Thin", "tmp.o", TempFD, TempFilename);
+ if (EC) {
+ errs() << "Error: " << EC.message() << "\n";
+ report_fatal_error("ThinLTO: Can't get a temporary file");
+ }
+ {
+ raw_fd_ostream OS(TempFD, /* ShouldClose */ true);
+ OS << OutputBuffer->getBuffer();
+ }
+    // Rename to the final destination (hopefully any race here won't matter).
+ EC = sys::fs::rename(TempFilename, EntryPath);
+ if (EC) {
+ sys::fs::remove(TempFilename);
+ raw_fd_ostream OS(EntryPath, EC, sys::fs::F_None);
+ if (EC)
+ report_fatal_error(Twine("Failed to open ") + EntryPath +
+ " to save cached entry\n");
+ OS << OutputBuffer->getBuffer();
+ }
+ auto ReloadedBufferOrErr = MemoryBuffer::getFile(EntryPath);
+ if (auto EC = ReloadedBufferOrErr.getError()) {
+ // FIXME diagnose
+ errs() << "error: can't reload cached file '" << EntryPath
+ << "': " << EC.message() << "\n";
+ return OutputBuffer;
+ }
+ return std::move(*ReloadedBufferOrErr);
+ }
+};
+
+static std::unique_ptr<MemoryBuffer>
+ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
+ StringMap<MemoryBufferRef> &ModuleMap, TargetMachine &TM,
+ const FunctionImporter::ImportMapTy &ImportList,
+ const FunctionImporter::ExportSetTy &ExportList,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ const GVSummaryMapTy &DefinedGlobals,
+ const ThinLTOCodeGenerator::CachingOptions &CacheOptions,
+ bool DisableCodeGen, StringRef SaveTempsDir,
+ unsigned count) {
+
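+  // The per-module pipeline is: promotion and weak-symbol resolution,
+  // summary-based internalization, cross-module importing, the ThinLTO
+  // optimization pipeline, and finally codegen (or emission of the optimized
+  // bitcode when codegen is disabled).
+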
+ // "Benchmark"-like optimization: single-source case
+ bool SingleModule = (ModuleMap.size() == 1);
+
+ if (!SingleModule) {
+ promoteModule(TheModule, Index);
+
+ // Apply summary-based LinkOnce/Weak resolution decisions.
+ thinLTOResolveWeakForLinkerModule(TheModule, DefinedGlobals);
+
+ // Save temps: after promotion.
+ saveTempBitcode(TheModule, SaveTempsDir, count, ".1.promoted.bc");
+ }
+
+  // Be friendly and don't completely gut the module when the client didn't
+  // supply anything to preserve.
+ if (!ExportList.empty() || !GUIDPreservedSymbols.empty()) {
+ // Apply summary-based internalization decisions.
+ thinLTOInternalizeModule(TheModule, DefinedGlobals);
+ }
+
+ // Save internalized bitcode
+ saveTempBitcode(TheModule, SaveTempsDir, count, ".2.internalized.bc");
+
+ if (!SingleModule) {
+ crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
+
+ // Save temps: after cross-module import.
+ saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc");
+ }
+
+ optimizeModule(TheModule, TM);
+
+ saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc");
+
+ if (DisableCodeGen) {
+ // Configured to stop before CodeGen, serialize the bitcode and return.
+ SmallVector<char, 128> OutputBuffer;
+ {
+ raw_svector_ostream OS(OutputBuffer);
+ ModuleSummaryIndexBuilder IndexBuilder(&TheModule);
+ WriteBitcodeToFile(&TheModule, OS, true, &IndexBuilder.getIndex());
+ }
+ return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
+ }
+
+ return codegenModule(TheModule, TM);
+}
+
+/// Resolve LinkOnce/Weak symbols. Record resolutions in the \p ResolvedODR map
+/// for caching, and in the \p Index for application during the ThinLTO
+/// backends. This is needed for correctness for exported symbols (ensure
+/// at least one copy kept) and a compile-time optimization (to drop duplicate
+/// copies when possible).
+static void resolveWeakForLinkerInIndex(
+ ModuleSummaryIndex &Index,
+ StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>>
+ &ResolvedODR) {
+
+ DenseMap<GlobalValue::GUID, const GlobalValueSummary *> PrevailingCopy;
+ computePrevailingCopies(Index, PrevailingCopy);
+
+ auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
+ const auto &Prevailing = PrevailingCopy.find(GUID);
+    // Not being in the map means there was only one copy, which is prevailing.
+ if (Prevailing == PrevailingCopy.end())
+ return true;
+ return Prevailing->second == S;
+ };
+
+ auto recordNewLinkage = [&](StringRef ModuleIdentifier,
+ GlobalValue::GUID GUID,
+ GlobalValue::LinkageTypes NewLinkage) {
+ ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
+ };
+
+ thinLTOResolveWeakForLinkerInIndex(Index, isPrevailing, recordNewLinkage);
+}
+
+// Initialize the TargetMachine builder for a given Triple
+static void initTMBuilder(TargetMachineBuilder &TMBuilder,
+ const Triple &TheTriple) {
+ // Set a default CPU for Darwin triples (copied from LTOCodeGenerator).
+ // FIXME this looks pretty terrible...
+ if (TMBuilder.MCpu.empty() && TheTriple.isOSDarwin()) {
+ if (TheTriple.getArch() == llvm::Triple::x86_64)
+ TMBuilder.MCpu = "core2";
+ else if (TheTriple.getArch() == llvm::Triple::x86)
+ TMBuilder.MCpu = "yonah";
+ else if (TheTriple.getArch() == llvm::Triple::aarch64)
+ TMBuilder.MCpu = "cyclone";
+ }
+ TMBuilder.TheTriple = std::move(TheTriple);
+}
+
+} // end anonymous namespace
+
+void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
+ MemoryBufferRef Buffer(Data, Identifier);
+ if (Modules.empty()) {
+ // First module added, so initialize the triple and some options
+ LLVMContext Context;
+ Triple TheTriple(getBitcodeTargetTriple(Buffer, Context));
+ initTMBuilder(TMBuilder, Triple(TheTriple));
+ }
+#ifndef NDEBUG
+ else {
+ LLVMContext Context;
+ assert(TMBuilder.TheTriple.str() ==
+ getBitcodeTargetTriple(Buffer, Context) &&
+ "ThinLTO modules with different triple not supported");
+ }
+#endif
+ Modules.push_back(Buffer);
+}
+
+void ThinLTOCodeGenerator::preserveSymbol(StringRef Name) {
+ PreservedSymbols.insert(Name);
+}
+
+void ThinLTOCodeGenerator::crossReferenceSymbol(StringRef Name) {
+ // FIXME: At the moment, we don't take advantage of this extra information,
+ // we're conservatively considering cross-references as preserved.
+ // CrossReferencedSymbols.insert(Name);
+ PreservedSymbols.insert(Name);
+}
+
+// TargetMachine factory
+std::unique_ptr<TargetMachine> TargetMachineBuilder::create() const {
+ std::string ErrMsg;
+ const Target *TheTarget =
+ TargetRegistry::lookupTarget(TheTriple.str(), ErrMsg);
+ if (!TheTarget) {
+ report_fatal_error("Can't load target for this Triple: " + ErrMsg);
+ }
+
+ // Use MAttr as the default set of features.
+ SubtargetFeatures Features(MAttr);
+ Features.getDefaultSubtargetFeatures(TheTriple);
+ std::string FeatureStr = Features.getString();
+ return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
+ TheTriple.str(), MCpu, FeatureStr, Options, RelocModel,
+ CodeModel::Default, CGOptLevel));
+}
+
+/**
+ * Produce the combined summary index from all the bitcode files:
+ * "thin-link".
+ */
+std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
+ std::unique_ptr<ModuleSummaryIndex> CombinedIndex;
+ uint64_t NextModuleId = 0;
+ for (auto &ModuleBuffer : Modules) {
+ ErrorOr<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
+ object::ModuleSummaryIndexObjectFile::create(ModuleBuffer,
+ diagnosticHandler);
+ if (std::error_code EC = ObjOrErr.getError()) {
+ // FIXME diagnose
+ errs() << "error: can't create ModuleSummaryIndexObjectFile for buffer: "
+ << EC.message() << "\n";
+ return nullptr;
+ }
+ auto Index = (*ObjOrErr)->takeIndex();
+ if (CombinedIndex) {
+ CombinedIndex->mergeFrom(std::move(Index), ++NextModuleId);
+ } else {
+ CombinedIndex = std::move(Index);
+ }
+ }
+ return CombinedIndex;
+}
+
+/**
+ * Perform promotion and renaming of exported internal functions.
+ * Index is updated to reflect linkage changes from weak resolution.
+ */
+void ThinLTOCodeGenerator::promote(Module &TheModule,
+ ModuleSummaryIndex &Index) {
+ auto ModuleCount = Index.modulePaths().size();
+ auto ModuleIdentifier = TheModule.getModuleIdentifier();
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries;
+ Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Generate import/export list
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+
+ // Resolve LinkOnce/Weak symbols.
+ StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
+ resolveWeakForLinkerInIndex(Index, ResolvedODR);
+
+ thinLTOResolveWeakForLinkerModule(
+ TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]);
+
+ promoteModule(TheModule, Index);
+}
+
+/**
+ * Perform cross-module importing for the given module.
+ */
+void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
+ ModuleSummaryIndex &Index) {
+ auto ModuleMap = generateModuleMap(Modules);
+ auto ModuleCount = Index.modulePaths().size();
+
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
+ Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Generate import/export list
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+ auto &ImportList = ImportLists[TheModule.getModuleIdentifier()];
+
+ crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
+}
+
+/**
+ * Compute the list of summaries needed for importing into the given module.
+ */
+void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
+ StringRef ModulePath, ModuleSummaryIndex &Index,
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ auto ModuleCount = Index.modulePaths().size();
+
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
+ Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Generate import/export list
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+
+ llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
+ ImportLists,
+ ModuleToSummariesForIndex);
+}
+
+/**
+ * Emit the list of files needed for importing into the given module.
+ */
+void ThinLTOCodeGenerator::emitImports(StringRef ModulePath,
+ StringRef OutputName,
+ ModuleSummaryIndex &Index) {
+ auto ModuleCount = Index.modulePaths().size();
+
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
+ Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Generate import/export list
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+
+ std::error_code EC;
+ if ((EC = EmitImportsFiles(ModulePath, OutputName, ImportLists)))
+ report_fatal_error(Twine("Failed to open ") + OutputName +
+ " to save imports lists\n");
+}
+
+/**
+ * Perform internalization. Index is updated to reflect linkage changes.
+ */
+void ThinLTOCodeGenerator::internalize(Module &TheModule,
+ ModuleSummaryIndex &Index) {
+ initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
+ auto ModuleCount = Index.modulePaths().size();
+ auto ModuleIdentifier = TheModule.getModuleIdentifier();
+
+ // Convert the preserved symbols set from string to GUID
+ auto GUIDPreservedSymbols =
+ computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple);
+
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
+ Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Generate import/export list
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+ auto &ExportList = ExportLists[ModuleIdentifier];
+
+  // Be friendly and don't completely gut the module when the client didn't
+  // supply anything to preserve.
+ if (ExportList.empty() && GUIDPreservedSymbols.empty())
+ return;
+
+ // Internalization
+ auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
+ const auto &ExportList = ExportLists.find(ModuleIdentifier);
+ return (ExportList != ExportLists.end() &&
+ ExportList->second.count(GUID)) ||
+ GUIDPreservedSymbols.count(GUID);
+ };
+ thinLTOInternalizeAndPromoteInIndex(Index, isExported);
+ thinLTOInternalizeModule(TheModule,
+ ModuleToDefinedGVSummaries[ModuleIdentifier]);
+}
+
+/**
+ * Perform post-importing ThinLTO optimizations.
+ */
+void ThinLTOCodeGenerator::optimize(Module &TheModule) {
+ initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
+
+ // Optimize now
+ optimizeModule(TheModule, *TMBuilder.create());
+}
+
+/**
+ * Perform ThinLTO CodeGen.
+ */
+std::unique_ptr<MemoryBuffer> ThinLTOCodeGenerator::codegen(Module &TheModule) {
+ initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
+ return codegenModule(TheModule, *TMBuilder.create());
+}
+
+// Main entry point for ThinLTO processing.
+void ThinLTOCodeGenerator::run() {
+ if (CodeGenOnly) {
+ // Perform only parallel codegen and return.
+ ThreadPool Pool;
+ assert(ProducedBinaries.empty() && "The generator should not be reused");
+ ProducedBinaries.resize(Modules.size());
+ int count = 0;
+ for (auto &ModuleBuffer : Modules) {
+ Pool.async([&](int count) {
+ LLVMContext Context;
+ Context.setDiscardValueNames(LTODiscardValueNames);
+
+ // Parse module now
+ auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false);
+
+ // CodeGen
+ ProducedBinaries[count] = codegen(*TheModule);
+ }, count++);
+ }
+
+ return;
+ }
+
+ // Sequential linking phase
+ auto Index = linkCombinedIndex();
+
+ // Save temps: index.
+ if (!SaveTempsDir.empty()) {
+ auto SaveTempPath = SaveTempsDir + "index.bc";
+ std::error_code EC;
+ raw_fd_ostream OS(SaveTempPath, EC, sys::fs::F_None);
+ if (EC)
+ report_fatal_error(Twine("Failed to open ") + SaveTempPath +
+ " to save optimized bitcode\n");
+ WriteIndexToFile(*Index, OS);
+ }
+
+ // Prepare the resulting object vector
+ assert(ProducedBinaries.empty() && "The generator should not be reused");
+ ProducedBinaries.resize(Modules.size());
+
+ // Prepare the module map.
+ auto ModuleMap = generateModuleMap(Modules);
+ auto ModuleCount = Modules.size();
+
+  // Collect for each module the list of functions it defines (GUID -> Summary).
+ StringMap<GVSummaryMapTy> ModuleToDefinedGVSummaries(ModuleCount);
+ Index->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
+
+ // Collect the import/export lists for all modules from the call-graph in the
+ // combined index.
+ StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
+ StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
+ ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists,
+ ExportLists);
+
+  // Convert the preserved symbols set from strings to GUIDs; this is needed
+  // for computing the caching hash and for internalization.
+ auto GUIDPreservedSymbols =
+ computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple);
+
+ // We use a std::map here to be able to have a defined ordering when
+ // producing a hash for the cache entry.
+ // FIXME: we should be able to compute the caching hash for the entry based
+ // on the index, and nuke this map.
+ StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
+
+  // Resolve LinkOnce/Weak symbols; this has to be done early because it
+  // impacts caching.
+ resolveWeakForLinkerInIndex(*Index, ResolvedODR);
+
+ auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
+ const auto &ExportList = ExportLists.find(ModuleIdentifier);
+ return (ExportList != ExportLists.end() &&
+ ExportList->second.count(GUID)) ||
+ GUIDPreservedSymbols.count(GUID);
+ };
+
+ // Use global summary-based analysis to identify symbols that can be
+ // internalized (because they aren't exported or preserved as per callback).
+ // Changes are made in the index, consumed in the ThinLTO backends.
+ thinLTOInternalizeAndPromoteInIndex(*Index, isExported);
+
+ // Make sure that every module has an entry in the ExportLists and
+ // ResolvedODR maps to enable threaded access to these maps below.
+ for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
+ ExportLists[DefinedGVSummaries.first()];
+ ResolvedODR[DefinedGVSummaries.first()];
+ }
+
+  // Compute the order in which we will process the inputs: the rough heuristic
+  // here is to sort them by size so that the largest modules get scheduled as
+  // soon as possible. This is purely a compile-time optimization.
+ std::vector<int> ModulesOrdering;
+ ModulesOrdering.resize(Modules.size());
+ std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+ std::sort(ModulesOrdering.begin(), ModulesOrdering.end(),
+ [&](int LeftIndex, int RightIndex) {
+ auto LSize = Modules[LeftIndex].getBufferSize();
+ auto RSize = Modules[RightIndex].getBufferSize();
+ return LSize > RSize;
+ });
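+  // For example, with module buffer sizes of 1 MB, 4 MB and 2 MB, the workers
+  // pick up the modules in the order 1, 2, 0.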
+
+ // Parallel optimizer + codegen
+ {
+ ThreadPool Pool(ThreadCount);
+ for (auto IndexCount : ModulesOrdering) {
+ auto &ModuleBuffer = Modules[IndexCount];
+ Pool.async([&](int count) {
+ auto ModuleIdentifier = ModuleBuffer.getBufferIdentifier();
+ auto &ExportList = ExportLists[ModuleIdentifier];
+
+ auto &DefinedFunctions = ModuleToDefinedGVSummaries[ModuleIdentifier];
+
+        // The module may be cached; this entry helps handle that.
+ ModuleCacheEntry CacheEntry(CacheOptions.Path, *Index, ModuleIdentifier,
+ ImportLists[ModuleIdentifier], ExportList,
+ ResolvedODR[ModuleIdentifier],
+ DefinedFunctions, GUIDPreservedSymbols);
+
+ {
+ auto ErrOrBuffer = CacheEntry.tryLoadingBuffer();
+ DEBUG(dbgs() << "Cache " << (ErrOrBuffer ? "hit" : "miss") << " '"
+ << CacheEntry.getEntryPath() << "' for buffer " << count
+ << " " << ModuleIdentifier << "\n");
+
+ if (ErrOrBuffer) {
+ // Cache Hit!
+ ProducedBinaries[count] = std::move(ErrOrBuffer.get());
+ return;
+ }
+ }
+
+ LLVMContext Context;
+ Context.setDiscardValueNames(LTODiscardValueNames);
+ Context.enableDebugTypeODRUniquing();
+
+ // Parse module now
+ auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false);
+
+ // Save temps: original file.
+ saveTempBitcode(*TheModule, SaveTempsDir, count, ".0.original.bc");
+
+ auto &ImportList = ImportLists[ModuleIdentifier];
+        // Run the main process now, generating a binary.
+ auto OutputBuffer = ProcessThinLTOModule(
+ *TheModule, *Index, ModuleMap, *TMBuilder.create(), ImportList,
+ ExportList, GUIDPreservedSymbols,
+ ModuleToDefinedGVSummaries[ModuleIdentifier], CacheOptions,
+ DisableCodeGen, SaveTempsDir, count);
+
+ OutputBuffer = CacheEntry.write(std::move(OutputBuffer));
+ ProducedBinaries[count] = std::move(OutputBuffer);
+ }, IndexCount);
+ }
+ }
+
+ CachePruning(CacheOptions.Path)
+ .setPruningInterval(CacheOptions.PruningInterval)
+ .setEntryExpiration(CacheOptions.Expiration)
+ .setMaxSize(CacheOptions.MaxPercentageOfAvailableSpace)
+ .prune();
+
+ // If statistics were requested, print them out now.
+ if (llvm::AreStatisticsEnabled())
+ llvm::PrintStatistics();
+}
diff --git a/lib/LTO/UpdateCompilerUsed.cpp b/lib/LTO/UpdateCompilerUsed.cpp
new file mode 100644
index 000000000000..a574db6fb5aa
--- /dev/null
+++ b/lib/LTO/UpdateCompilerUsed.cpp
@@ -0,0 +1,146 @@
+//==-UpdateCompilerUsed.cpp - LLVM Link Time Optimizer Utility -------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a helper that updates llvm.compiler.used so that symbols
+// referenced from inline asm or required as runtime library calls survive LTO
+// internalization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+
+using namespace llvm;
+
+namespace {
+
+// Helper class that collects AsmUsed and user-supplied libcalls.
+class PreserveLibCallsAndAsmUsed {
+public:
+ PreserveLibCallsAndAsmUsed(const StringSet<> &AsmUndefinedRefs,
+ const TargetMachine &TM,
+ SmallPtrSetImpl<const GlobalValue *> &LLVMUsed)
+ : AsmUndefinedRefs(AsmUndefinedRefs), TM(TM), LLVMUsed(LLVMUsed) {}
+
+ void findInModule(const Module &TheModule) {
+ initializeLibCalls(TheModule);
+ for (const Function &F : TheModule)
+ findLibCallsAndAsm(F);
+ for (const GlobalVariable &GV : TheModule.globals())
+ findLibCallsAndAsm(GV);
+ for (const GlobalAlias &GA : TheModule.aliases())
+ findLibCallsAndAsm(GA);
+ }
+
+private:
+ // Inputs
+ const StringSet<> &AsmUndefinedRefs;
+ const TargetMachine &TM;
+
+ // Temps
+ llvm::Mangler Mangler;
+ StringSet<> Libcalls;
+
+ // Output
+ SmallPtrSetImpl<const GlobalValue *> &LLVMUsed;
+
+ // Collect names of runtime library functions. User-defined functions with the
+ // same names are added to llvm.compiler.used to prevent them from being
+ // deleted by optimizations.
+ void initializeLibCalls(const Module &TheModule) {
+ TargetLibraryInfoImpl TLII(Triple(TM.getTargetTriple()));
+ TargetLibraryInfo TLI(TLII);
+
+ // TargetLibraryInfo has info on C runtime library calls on the current
+ // target.
+ for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
+ I != E; ++I) {
+ LibFunc::Func F = static_cast<LibFunc::Func>(I);
+ if (TLI.has(F))
+ Libcalls.insert(TLI.getName(F));
+ }
+
+ SmallPtrSet<const TargetLowering *, 1> TLSet;
+
+ for (const Function &F : TheModule) {
+ const TargetLowering *Lowering =
+ TM.getSubtargetImpl(F)->getTargetLowering();
+
+ if (Lowering && TLSet.insert(Lowering).second)
+ // TargetLowering has info on library calls that CodeGen expects to be
+ // available, both from the C runtime and compiler-rt.
+ for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
+ I != E; ++I)
+ if (const char *Name =
+ Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
+ Libcalls.insert(Name);
+ }
+ }
+
+ void findLibCallsAndAsm(const GlobalValue &GV) {
+ // There are no restrictions to apply to declarations.
+ if (GV.isDeclaration())
+ return;
+
+ // There is nothing more restrictive than private linkage.
+ if (GV.hasPrivateLinkage())
+ return;
+
+ // Conservatively append user-supplied runtime library functions to
+ // llvm.compiler.used. These could be internalized and deleted by
+ // optimizations like -globalopt, causing problems when later optimizations
+ // add new library calls (e.g., llvm.memset => memset and printf => puts).
+ // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
+ if (isa<Function>(GV) && Libcalls.count(GV.getName()))
+ LLVMUsed.insert(&GV);
+
+ SmallString<64> Buffer;
+ TM.getNameWithPrefix(Buffer, &GV, Mangler);
+ if (AsmUndefinedRefs.count(Buffer))
+ LLVMUsed.insert(&GV);
+ }
+};
+
+} // end anonymous namespace
+
+void llvm::updateCompilerUsed(Module &TheModule, const TargetMachine &TM,
+ const StringSet<> &AsmUndefinedRefs) {
+ SmallPtrSet<const GlobalValue *, 8> UsedValues;
+ PreserveLibCallsAndAsmUsed(AsmUndefinedRefs, TM, UsedValues)
+ .findInModule(TheModule);
+
+ if (UsedValues.empty())
+ return;
+
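+  // llvm.compiler.used is an appending array of i8*, so each preserved global
+  // is bitcast to i8* before being added to the initializer list.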
+ llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(TheModule.getContext());
+ std::vector<Constant *> UsedValuesList;
+ for (const auto *GV : UsedValues) {
+ Constant *c =
+ ConstantExpr::getBitCast(const_cast<GlobalValue *>(GV), i8PTy);
+ UsedValuesList.push_back(c);
+ }
+
+ GlobalVariable *LLVMUsed = TheModule.getGlobalVariable("llvm.compiler.used");
+ if (LLVMUsed) {
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+ for (auto &V : Inits->operands())
+ UsedValuesList.push_back(cast<Constant>(&V));
+ LLVMUsed->eraseFromParent();
+ }
+
+ llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, UsedValuesList.size());
+ LLVMUsed = new llvm::GlobalVariable(
+ TheModule, ATy, false, llvm::GlobalValue::AppendingLinkage,
+ llvm::ConstantArray::get(ATy, UsedValuesList), "llvm.compiler.used");
+
+ LLVMUsed->setSection("llvm.metadata");
+}
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/LibDriver/LibDriver.cpp
index 3ae543460745..ea6d921d0f8e 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/LibDriver/LibDriver.cpp
@@ -57,10 +57,10 @@ public:
}
static std::string getOutputPath(llvm::opt::InputArgList *Args,
- const llvm::NewArchiveIterator &FirstMember) {
+ const llvm::NewArchiveMember &FirstMember) {
if (auto *Arg = Args->getLastArg(OPT_out))
return Arg->getValue();
- SmallString<128> Val = FirstMember.getNew();
+ SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier());
llvm::sys::path::replace_extension(Val, ".lib");
return Val.str();
}
@@ -122,20 +122,28 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) {
llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
if (Args.filtered_begin(OPT_INPUT) == Args.filtered_end()) {
- llvm::errs() << "no input files.\n";
- return 1;
+ // No input files. To match lib.exe, silently do nothing.
+ return 0;
}
std::vector<StringRef> SearchPaths = getSearchPaths(&Args, Saver);
- std::vector<llvm::NewArchiveIterator> Members;
+ std::vector<llvm::NewArchiveMember> Members;
for (auto *Arg : Args.filtered(OPT_INPUT)) {
Optional<std::string> Path = findInputFile(Arg->getValue(), SearchPaths);
if (!Path.hasValue()) {
llvm::errs() << Arg->getValue() << ": no such file or directory\n";
return 1;
}
- Members.emplace_back(Saver.save(*Path));
+ Expected<NewArchiveMember> MOrErr =
+ NewArchiveMember::getFile(Saver.save(*Path), /*Deterministic=*/true);
+ if (!MOrErr) {
+ handleAllErrors(MOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) {
+ llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n";
+ });
+ return 1;
+ }
+ Members.emplace_back(std::move(*MOrErr));
}
std::pair<StringRef, std::error_code> Result =
diff --git a/lib/LibDriver/Makefile b/lib/LibDriver/Makefile
deleted file mode 100644
index 1c62eac9093d..000000000000
--- a/lib/LibDriver/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- lib/LibDriver/Makefile ------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMLibDriver
-BUILD_ARCHIVE := 1
-BUILT_SOURCES = Options.inc
-TABLEGEN_INC_FILES_COMMON = 1
-
-include $(LEVEL)/Makefile.common
-
-$(ObjDir)/Options.inc.tmp : Options.td $(LLVM_TBLGEN) $(ObjDir)/.dir
- $(Echo) "Building lib Driver Option tables with tblgen"
- $(Verb) $(LLVMTableGen) -gen-opt-parser-defs -o $(call SYSPATH, $@) $<
diff --git a/lib/LineEditor/LineEditor.cpp b/lib/LineEditor/LineEditor.cpp
index a50ccc388f15..533a928b2dfd 100644
--- a/lib/LineEditor/LineEditor.cpp
+++ b/lib/LineEditor/LineEditor.cpp
@@ -12,7 +12,9 @@
#include "llvm/Config/config.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
#ifdef HAVE_LIBEDIT
#include <histedit.h>
#endif
@@ -106,7 +108,9 @@ struct LineEditor::InternalData {
FILE *Out;
};
-static const char *ElGetPromptFn(EditLine *EL) {
+namespace {
+
+const char *ElGetPromptFn(EditLine *EL) {
LineEditor::InternalData *Data;
if (el_get(EL, EL_CLIENTDATA, &Data) == 0)
return Data->LE->getPrompt().c_str();
@@ -117,7 +121,7 @@ static const char *ElGetPromptFn(EditLine *EL) {
//
// This function is really horrible. But since the alternative is to get into
// the line editor business, here we are.
-static unsigned char ElCompletionFn(EditLine *EL, int ch) {
+unsigned char ElCompletionFn(EditLine *EL, int ch) {
LineEditor::InternalData *Data;
if (el_get(EL, EL_CLIENTDATA, &Data) == 0) {
if (!Data->ContinuationOutput.empty()) {
@@ -190,6 +194,8 @@ static unsigned char ElCompletionFn(EditLine *EL, int ch) {
return CC_ERROR;
}
+} // end anonymous namespace
+
LineEditor::LineEditor(StringRef ProgName, StringRef HistoryPath, FILE *In,
FILE *Out, FILE *Err)
: Prompt((ProgName + "> ").str()), HistoryPath(HistoryPath),
@@ -269,7 +275,7 @@ Optional<std::string> LineEditor::readLine() const {
return std::string(Line, LineLen);
}
-#else
+#else // HAVE_LIBEDIT
// Simple fgets-based implementation.
@@ -316,4 +322,4 @@ Optional<std::string> LineEditor::readLine() const {
return Line;
}
-#endif
+#endif // HAVE_LIBEDIT
diff --git a/lib/LineEditor/Makefile b/lib/LineEditor/Makefile
deleted file mode 100644
index c7ff6d8eaae1..000000000000
--- a/lib/LineEditor/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/LineEditor/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMLineEditor
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 8dd59f9e0e3e..4935868c00f4 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -16,8 +16,11 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/GVMaterializer.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/TypeFinder.h"
+#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include <utility>
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -345,43 +348,40 @@ class IRLinker;
/// speeds up linking for modules with many lazily linked functions of which
/// few get used.
class GlobalValueMaterializer final : public ValueMaterializer {
- IRLinker *TheIRLinker;
+ IRLinker &TheIRLinker;
public:
- GlobalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
- Value *materializeDeclFor(Value *V) override;
- void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
- Metadata *mapTemporaryMetadata(Metadata *MD) override;
- void replaceTemporaryMetadata(const Metadata *OrigMD,
- Metadata *NewMD) override;
- bool isMetadataNeeded(Metadata *MD) override;
+ GlobalValueMaterializer(IRLinker &TheIRLinker) : TheIRLinker(TheIRLinker) {}
+ Value *materialize(Value *V) override;
};
class LocalValueMaterializer final : public ValueMaterializer {
- IRLinker *TheIRLinker;
+ IRLinker &TheIRLinker;
public:
- LocalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
- Value *materializeDeclFor(Value *V) override;
- void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
- Metadata *mapTemporaryMetadata(Metadata *MD) override;
- void replaceTemporaryMetadata(const Metadata *OrigMD,
- Metadata *NewMD) override;
- bool isMetadataNeeded(Metadata *MD) override;
+ LocalValueMaterializer(IRLinker &TheIRLinker) : TheIRLinker(TheIRLinker) {}
+ Value *materialize(Value *V) override;
};
+/// Type of the Metadata map in \a ValueToValueMapTy.
+typedef DenseMap<const Metadata *, TrackingMDRef> MDMapT;
+
/// This is responsible for keeping track of the state used for moving data
/// from SrcM to DstM.
class IRLinker {
Module &DstM;
- Module &SrcM;
+ std::unique_ptr<Module> SrcM;
+ /// See IRMover::move().
std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor;
TypeMapTy TypeMap;
GlobalValueMaterializer GValMaterializer;
LocalValueMaterializer LValMaterializer;
+ /// A metadata map that's shared between IRLinker instances.
+ MDMapT &SharedMDs;
+
/// Mapping of values from what they used to be in Src, to what they are now
/// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead
/// due to the use of Value handles which the Linker doesn't actually need,
@@ -402,52 +402,30 @@ class IRLinker {
/// references.
bool DoneLinkingBodies = false;
- bool HasError = false;
-
- /// Flag indicating that we are just linking metadata (after function
- /// importing).
- bool IsMetadataLinkingPostpass;
-
- /// Flags to pass to value mapper invocations.
- RemapFlags ValueMapperFlags = RF_MoveDistinctMDs;
-
- /// Association between metadata values created during bitcode parsing and
- /// the value id. Used to correlate temporary metadata created during
- /// function importing with the final metadata parsed during the subsequent
- /// metadata linking postpass.
- DenseMap<const Metadata *, unsigned> MetadataToIDs;
+ /// The Error encountered during materialization. We use an Optional here to
+ /// avoid needing to manage an unconsumed success value.
+ Optional<Error> FoundError;
+ void setError(Error E) {
+ if (E)
+ FoundError = std::move(E);
+ }
- /// Association between metadata value id and temporary metadata that
- /// remains unmapped after function importing. Saved during function
- /// importing and consumed during the metadata linking postpass.
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap;
+ /// Most of the errors produced by this module are inconvertible StringErrors.
+ /// This convenience function lets us return one of those more easily.
+ Error stringErr(const Twine &T) {
+ return make_error<StringError>(T, inconvertibleErrorCode());
+ }
- /// Set of subprogram metadata that does not need to be linked into the
- /// destination module, because the functions were not imported directly
- /// or via an inlined body in an imported function.
- SmallPtrSet<const Metadata *, 16> UnneededSubprograms;
+ /// Entry point for mapping values and alternate context for mapping aliases.
+ ValueMapper Mapper;
+ unsigned AliasMCID;
/// Handles cloning of global values from the source module into
/// the destination module, including setting the attributes and visibility.
GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, bool ForDefinition);
- /// Helper method for setting a message and returning an error code.
- bool emitError(const Twine &Message) {
- SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message));
- HasError = true;
- return true;
- }
-
void emitWarning(const Twine &Message) {
- SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Warning, Message));
- }
-
- /// Check whether we should be linking metadata from the source module.
- bool shouldLinkMetadata() {
- // ValIDToTempMDMap will be non-null when we are importing or otherwise want
- // to link metadata lazily, and then when linking the metadata.
- // We only want to return true for the former case.
- return ValIDToTempMDMap == nullptr || IsMetadataLinkingPostpass;
+ SrcM->getContext().diagnose(LinkDiagnosticInfo(DS_Warning, Message));
}
/// Given a global in the source module, return the global in the
@@ -474,18 +452,23 @@ class IRLinker {
void computeTypeMapping();
- Constant *linkAppendingVarProto(GlobalVariable *DstGV,
- const GlobalVariable *SrcGV);
+ Expected<Constant *> linkAppendingVarProto(GlobalVariable *DstGV,
+ const GlobalVariable *SrcGV);
+  /// Given the GlobalValue \p SGV in the source module, and the matching
+ /// GlobalValue \p DGV (if any), return true if the linker will pull \p SGV
+ /// into the destination module.
+ ///
+ /// Note this code may call the client-provided \p AddLazyFor.
bool shouldLink(GlobalValue *DGV, GlobalValue &SGV);
- Constant *linkGlobalValueProto(GlobalValue *GV, bool ForAlias);
+ Expected<Constant *> linkGlobalValueProto(GlobalValue *GV, bool ForAlias);
- bool linkModuleFlagsMetadata();
+ Error linkModuleFlagsMetadata();
void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
- bool linkFunctionBody(Function &Dst, Function &Src);
+ Error linkFunctionBody(Function &Dst, Function &Src);
void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
- bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
+ Error linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
/// Functions that take care of cloning a specific global value type
/// into the destination module.
@@ -495,70 +478,26 @@ class IRLinker {
void linkNamedMDNodes();
- /// Populate the UnneededSubprograms set with the DISubprogram metadata
- /// from the source module that we don't need to link into the dest module,
- /// because the functions were not imported directly or via an inlined body
- /// in an imported function.
- void findNeededSubprograms(ValueToValueMapTy &ValueMap);
-
- /// The value mapper leaves nulls in the list of subprograms for any
- /// in the UnneededSubprograms map. Strip those out after metadata linking.
- void stripNullSubprograms();
-
public:
- IRLinker(Module &DstM, IRMover::IdentifiedStructTypeSet &Set, Module &SrcM,
+ IRLinker(Module &DstM, MDMapT &SharedMDs,
+ IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
ArrayRef<GlobalValue *> ValuesToLink,
- std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor,
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr,
- bool IsMetadataLinkingPostpass = false)
- : DstM(DstM), SrcM(SrcM), AddLazyFor(AddLazyFor), TypeMap(Set),
- GValMaterializer(this), LValMaterializer(this),
- IsMetadataLinkingPostpass(IsMetadataLinkingPostpass),
- ValIDToTempMDMap(ValIDToTempMDMap) {
+ std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor)
+ : DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
+ TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this),
+ SharedMDs(SharedMDs),
+ Mapper(ValueMap, RF_MoveDistinctMDs | RF_IgnoreMissingLocals, &TypeMap,
+ &GValMaterializer),
+ AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap,
+ &LValMaterializer)) {
+ ValueMap.getMDMap() = std::move(SharedMDs);
for (GlobalValue *GV : ValuesToLink)
maybeAdd(GV);
-
- // If appropriate, tell the value mapper that it can expect to see
- // temporary metadata.
- if (!shouldLinkMetadata())
- ValueMapperFlags = ValueMapperFlags | RF_HaveUnmaterializedMetadata;
- }
-
- ~IRLinker() {
- // In the case where we are not linking metadata, we unset the CanReplace
- // flag on all temporary metadata in the MetadataToIDs map to ensure
- // none was replaced while being a map key. Now that we are destructing
- // the map, set the flag back to true, so that it is replaceable during
- // metadata linking.
- if (!shouldLinkMetadata()) {
- for (auto MDI : MetadataToIDs) {
- Metadata *MD = const_cast<Metadata *>(MDI.first);
- MDNode *Node = dyn_cast<MDNode>(MD);
- assert((Node && Node->isTemporary()) &&
- "Found non-temp metadata in map when not linking metadata");
- Node->setCanReplace(true);
- }
- }
}
+ ~IRLinker() { SharedMDs = std::move(*ValueMap.getMDMap()); }
- bool run();
- Value *materializeDeclFor(Value *V, bool ForAlias);
- void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias);
-
- /// Save the mapping between the given temporary metadata and its metadata
- /// value id. Used to support metadata linking as a postpass for function
- /// importing.
- Metadata *mapTemporaryMetadata(Metadata *MD);
-
- /// Replace any temporary metadata saved for the source metadata's id with
- /// the new non-temporary metadata. Used when metadata linking as a postpass
- /// for function importing.
- void replaceTemporaryMetadata(const Metadata *OrigMD, Metadata *NewMD);
-
- /// Indicates whether we need to map the given metadata into the destination
- /// module. Used to prevent linking of metadata only needed by functions not
- /// linked into the dest module.
- bool isMetadataNeeded(Metadata *MD);
+ Error run();
+ Value *materialize(Value *V, bool ForAlias);
};
}
@@ -583,133 +522,59 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) {
}
}
-Value *GlobalValueMaterializer::materializeDeclFor(Value *V) {
- return TheIRLinker->materializeDeclFor(V, false);
-}
-
-void GlobalValueMaterializer::materializeInitFor(GlobalValue *New,
- GlobalValue *Old) {
- TheIRLinker->materializeInitFor(New, Old, false);
-}
-
-Metadata *GlobalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
- return TheIRLinker->mapTemporaryMetadata(MD);
-}
-
-void GlobalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
- Metadata *NewMD) {
- TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
+Value *GlobalValueMaterializer::materialize(Value *SGV) {
+ return TheIRLinker.materialize(SGV, false);
}
-bool GlobalValueMaterializer::isMetadataNeeded(Metadata *MD) {
- return TheIRLinker->isMetadataNeeded(MD);
+Value *LocalValueMaterializer::materialize(Value *SGV) {
+ return TheIRLinker.materialize(SGV, true);
}
-Value *LocalValueMaterializer::materializeDeclFor(Value *V) {
- return TheIRLinker->materializeDeclFor(V, true);
-}
-
-void LocalValueMaterializer::materializeInitFor(GlobalValue *New,
- GlobalValue *Old) {
- TheIRLinker->materializeInitFor(New, Old, true);
-}
-
-Metadata *LocalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
- return TheIRLinker->mapTemporaryMetadata(MD);
-}
-
-void LocalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
- Metadata *NewMD) {
- TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
-}
-
-bool LocalValueMaterializer::isMetadataNeeded(Metadata *MD) {
- return TheIRLinker->isMetadataNeeded(MD);
-}
-
-Value *IRLinker::materializeDeclFor(Value *V, bool ForAlias) {
+Value *IRLinker::materialize(Value *V, bool ForAlias) {
auto *SGV = dyn_cast<GlobalValue>(V);
if (!SGV)
return nullptr;
- return linkGlobalValueProto(SGV, ForAlias);
-}
+ Expected<Constant *> NewProto = linkGlobalValueProto(SGV, ForAlias);
+ if (!NewProto) {
+ setError(NewProto.takeError());
+ return nullptr;
+ }
+ if (!*NewProto)
+ return nullptr;
+
+ GlobalValue *New = dyn_cast<GlobalValue>(*NewProto);
+ if (!New)
+ return *NewProto;
-void IRLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old,
- bool ForAlias) {
// If we already created the body, just return.
if (auto *F = dyn_cast<Function>(New)) {
if (!F->isDeclaration())
- return;
+ return New;
} else if (auto *V = dyn_cast<GlobalVariable>(New)) {
- if (V->hasInitializer())
- return;
+ if (V->hasInitializer() || V->hasAppendingLinkage())
+ return New;
} else {
auto *A = cast<GlobalAlias>(New);
if (A->getAliasee())
- return;
+ return New;
}
- if (ForAlias || shouldLink(New, *Old))
- linkGlobalValueBody(*New, *Old);
-}
-
-Metadata *IRLinker::mapTemporaryMetadata(Metadata *MD) {
- if (!ValIDToTempMDMap)
- return nullptr;
- // If this temporary metadata has a value id recorded during function
- // parsing, record that in the ValIDToTempMDMap if one was provided.
- if (MetadataToIDs.count(MD)) {
- unsigned Idx = MetadataToIDs[MD];
- // Check if we created a temp MD when importing a different function from
- // this module. If so, reuse it the same temporary metadata, otherwise
- // add this temporary metadata to the map.
- if (!ValIDToTempMDMap->count(Idx)) {
- MDNode *Node = cast<MDNode>(MD);
- assert(Node->isTemporary());
- (*ValIDToTempMDMap)[Idx] = Node;
- }
- return (*ValIDToTempMDMap)[Idx];
- }
- return nullptr;
-}
-
-void IRLinker::replaceTemporaryMetadata(const Metadata *OrigMD,
- Metadata *NewMD) {
- if (!ValIDToTempMDMap)
- return;
-#ifndef NDEBUG
- auto *N = dyn_cast_or_null<MDNode>(NewMD);
- assert(!N || !N->isTemporary());
-#endif
- // If a mapping between metadata value ids and temporary metadata
- // created during function importing was provided, and the source
- // metadata has a value id recorded during metadata parsing, replace
- // the temporary metadata with the final mapped metadata now.
- if (MetadataToIDs.count(OrigMD)) {
- unsigned Idx = MetadataToIDs[OrigMD];
- // Nothing to do if we didn't need to create a temporary metadata during
- // function importing.
- if (!ValIDToTempMDMap->count(Idx))
- return;
- MDNode *TempMD = (*ValIDToTempMDMap)[Idx];
- TempMD->replaceAllUsesWith(NewMD);
- MDNode::deleteTemporary(TempMD);
- ValIDToTempMDMap->erase(Idx);
- }
-}
-
-bool IRLinker::isMetadataNeeded(Metadata *MD) {
- // Currently only DISubprogram metadata is marked as being unneeded.
- if (UnneededSubprograms.empty())
- return true;
- MDNode *Node = dyn_cast<MDNode>(MD);
- if (!Node)
- return true;
- DISubprogram *SP = getDISubprogram(Node);
- if (!SP)
- return true;
- return !UnneededSubprograms.count(SP);
+  // When linking a global for an alias, it will always be linked. However, we
+  // need to check if it was not already scheduled to satisfy a reference from
+  // a regular global value initializer. We know it has been scheduled if the
+  // "New" GlobalValue that is mapped here for the alias is the same as the one
+  // already mapped. If there is an entry in the ValueMap but the value is
+  // different, it means that the value already had a definition in the
+  // destination module (linkonce for instance), but we need a new definition
+  // for the alias ("New" will be different).
+ if (ForAlias && ValueMap.lookup(SGV) == New)
+ return New;
+
+ if (ForAlias || shouldLink(New, *SGV))
+ setError(linkGlobalValueBody(*New, *SGV));
+
+ return New;
}
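// A minimal sketch (not part of this patch) of the Expected<T>/Error idiom
// that materialize() above relies on: a fallible step returns Expected<T>,
// and the caller either uses the value or forwards the Error, much as
// setError(NewProto.takeError()) does. The names fallibleStep/consumeStep
// are hypothetical.
#include "llvm/Support/Error.h"

static llvm::Expected<int> fallibleStep(bool Fail) {
  if (Fail)
    return llvm::make_error<llvm::StringError>("step failed",
                                               llvm::inconvertibleErrorCode());
  return 42;
}

static llvm::Error consumeStep(bool Fail) {
  llvm::Expected<int> V = fallibleStep(Fail);
  if (!V)
    return V.takeError(); // Forward the failure to the caller.
  (void)*V;               // Use the value on success.
  return llvm::Error::success();
}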
/// Loop through the global variables in the src module and merge them into the
@@ -719,7 +584,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
// identical version of the symbol over in the dest module... the
// initializer will be filled in later by LinkGlobalInits.
GlobalVariable *NewDGV =
- new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()),
+ new GlobalVariable(DstM, TypeMap.get(SGVar->getValueType()),
SGVar->isConstant(), GlobalValue::ExternalLinkage,
/*init*/ nullptr, SGVar->getName(),
/*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
@@ -759,7 +624,7 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
NewGV = copyGlobalAliasProto(cast<GlobalAlias>(SGV));
else
NewGV = new GlobalVariable(
- DstM, TypeMap.get(SGV->getType()->getElementType()),
+ DstM, TypeMap.get(SGV->getValueType()),
/*isConstant*/ false, GlobalValue::ExternalLinkage,
/*init*/ nullptr, SGV->getName(),
/*insertbefore*/ nullptr, SGV->getThreadLocalMode(),
@@ -768,12 +633,17 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
if (ForDefinition)
NewGV->setLinkage(SGV->getLinkage());
- else if (SGV->hasExternalWeakLinkage() || SGV->hasWeakLinkage() ||
- SGV->hasLinkOnceLinkage())
+ else if (SGV->hasExternalWeakLinkage())
NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
NewGV->copyAttributesFrom(SGV);
+ if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
+ // Metadata for global variables and function declarations is copied eagerly.
+ if (isa<GlobalVariable>(SGV) || SGV->isDeclaration())
+ NewGO->copyMetadata(cast<GlobalObject>(SGV), 0);
+ }
+
// Remove these copied constants in case this stays a declaration, since
// they point to the source module. If the def is linked the values will
// be mapped in during linkFunctionBody.
@@ -791,7 +661,7 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
/// types 'Foo' but one got renamed when the module was loaded into the same
/// LLVMContext.
void IRLinker::computeTypeMapping() {
- for (GlobalValue &SGV : SrcM.globals()) {
+ for (GlobalValue &SGV : SrcM->globals()) {
GlobalValue *DGV = getLinkedToGlobal(&SGV);
if (!DGV)
continue;
@@ -802,16 +672,16 @@ void IRLinker::computeTypeMapping() {
}
// Unify the element type of appending arrays.
- ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType());
- ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType());
+ ArrayType *DAT = cast<ArrayType>(DGV->getValueType());
+ ArrayType *SAT = cast<ArrayType>(SGV.getValueType());
TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType());
}
- for (GlobalValue &SGV : SrcM)
+ for (GlobalValue &SGV : *SrcM)
if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
- for (GlobalValue &SGV : SrcM.aliases())
+ for (GlobalValue &SGV : SrcM->aliases())
if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
@@ -819,7 +689,7 @@ void IRLinker::computeTypeMapping() {
// At this point, the destination module may have a type "%foo = { i32 }" for
// example. When the source module got loaded into the same LLVMContext, if
// it had the same type, it would have been renamed to "%foo.42 = { i32 }".
- std::vector<StructType *> Types = SrcM.getIdentifiedStructTypes();
+ std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes();
for (StructType *ST : Types) {
if (!ST->hasName())
continue;
@@ -871,12 +741,16 @@ static void getArrayElements(const Constant *C,
}
/// If there were any appending global variables, link them together now.
-/// Return true on error.
-Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
- const GlobalVariable *SrcGV) {
- Type *EltTy = cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()))
+Expected<Constant *>
+IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
+ const GlobalVariable *SrcGV) {
+ Type *EltTy = cast<ArrayType>(TypeMap.get(SrcGV->getValueType()))
->getElementType();
+ // FIXME: This upgrade is done during linking to support the C API. Once the
+ // old form is deprecated, we should move this upgrade to
+ // llvm::UpgradeGlobalVariable() and simplify the logic here and in
+ // Mapper::mapAppendingVariable() in ValueMapper.cpp.
StringRef Name = SrcGV->getName();
bool IsNewStructor = false;
bool IsOldStructor = false;
@@ -894,55 +768,40 @@ Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
EltTy = StructType::get(SrcGV->getContext(), Tys, false);
}
+ uint64_t DstNumElements = 0;
if (DstGV) {
- ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
+ ArrayType *DstTy = cast<ArrayType>(DstGV->getValueType());
+ DstNumElements = DstTy->getNumElements();
- if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) {
- emitError(
+ if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
+ return stringErr(
"Linking globals named '" + SrcGV->getName() +
- "': can only link appending global with another appending global!");
- return nullptr;
- }
+ "': can only link appending global with another appending "
+ "global!");
    // Check to see that the two arrays agree on type.
- if (EltTy != DstTy->getElementType()) {
- emitError("Appending variables with different element types!");
- return nullptr;
- }
- if (DstGV->isConstant() != SrcGV->isConstant()) {
- emitError("Appending variables linked with different const'ness!");
- return nullptr;
- }
+ if (EltTy != DstTy->getElementType())
+ return stringErr("Appending variables with different element types!");
+ if (DstGV->isConstant() != SrcGV->isConstant())
+ return stringErr("Appending variables linked with different const'ness!");
- if (DstGV->getAlignment() != SrcGV->getAlignment()) {
- emitError(
+ if (DstGV->getAlignment() != SrcGV->getAlignment())
+ return stringErr(
"Appending variables with different alignment need to be linked!");
- return nullptr;
- }
- if (DstGV->getVisibility() != SrcGV->getVisibility()) {
- emitError(
+ if (DstGV->getVisibility() != SrcGV->getVisibility())
+ return stringErr(
"Appending variables with different visibility need to be linked!");
- return nullptr;
- }
- if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) {
- emitError(
+ if (DstGV->hasGlobalUnnamedAddr() != SrcGV->hasGlobalUnnamedAddr())
+ return stringErr(
"Appending variables with different unnamed_addr need to be linked!");
- return nullptr;
- }
- if (StringRef(DstGV->getSection()) != SrcGV->getSection()) {
- emitError(
+ if (DstGV->getSection() != SrcGV->getSection())
+ return stringErr(
"Appending variables with different section name need to be linked!");
- return nullptr;
- }
}
- SmallVector<Constant *, 16> DstElements;
- if (DstGV)
- getArrayElements(DstGV->getInitializer(), DstElements);
-
SmallVector<Constant *, 16> SrcElements;
getArrayElements(SrcGV->getInitializer(), SrcElements);
@@ -958,7 +817,7 @@ Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
return !shouldLink(DGV, *Key);
}),
SrcElements.end());
- uint64_t NewSize = DstElements.size() + SrcElements.size();
+ uint64_t NewSize = DstNumElements + SrcElements.size();
ArrayType *NewType = ArrayType::get(EltTy, NewSize);
// Create the new global variable.
@@ -972,28 +831,9 @@ Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType()));
- // Stop recursion.
- ValueMap[SrcGV] = Ret;
-
- for (auto *V : SrcElements) {
- Constant *NewV;
- if (IsOldStructor) {
- auto *S = cast<ConstantStruct>(V);
- auto *E1 = MapValue(S->getOperand(0), ValueMap, ValueMapperFlags,
- &TypeMap, &GValMaterializer);
- auto *E2 = MapValue(S->getOperand(1), ValueMap, ValueMapperFlags,
- &TypeMap, &GValMaterializer);
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
- } else {
- NewV =
- MapValue(V, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer);
- }
- DstElements.push_back(NewV);
- }
-
- NG->setInitializer(ConstantArray::get(NewType, DstElements));
+ Mapper.scheduleMapAppendingVariable(*NG,
+ DstGV ? DstGV->getInitializer() : nullptr,
+ IsOldStructor, SrcElements);
// Replace any uses of the two global variables with uses of the new
// global.
@@ -1005,52 +845,31 @@ Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
return Ret;
}
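// A hedged sketch (not part of this patch) of the structor upgrade that the
// FIXME above refers to and that Mapper::mapAppendingVariable() now performs:
// an old two-field llvm.global_ctors entry {i32, void()*} is rebuilt as the
// three-field form {i32, void()*, i8*} with a null associated-data pointer,
// mirroring the loop removed above. The helper name is hypothetical.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

static llvm::Constant *upgradeStructorEntry(llvm::ConstantStruct *Old,
                                            llvm::StructType *NewEltTy,
                                            llvm::Type *VoidPtrTy) {
  llvm::Constant *Priority = Old->getAggregateElement(0u);
  llvm::Constant *Ctor = Old->getAggregateElement(1u);
  llvm::Constant *Data = llvm::Constant::getNullValue(VoidPtrTy);
  return llvm::ConstantStruct::get(NewEltTy, {Priority, Ctor, Data});
}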
-static bool useExistingDest(GlobalValue &SGV, GlobalValue *DGV,
- bool ShouldLink) {
- if (!DGV)
- return false;
-
- if (SGV.isDeclaration())
- return true;
-
- if (DGV->isDeclarationForLinker() && !SGV.isDeclarationForLinker())
- return false;
-
- if (ShouldLink)
- return false;
-
- return true;
-}
-
bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) {
- // Already imported all the values. Just map to the Dest value
- // in case it is referenced in the metadata.
- if (IsMetadataLinkingPostpass) {
- assert(!ValuesToLink.count(&SGV) &&
- "Source value unexpectedly requested for link during metadata link");
- return false;
- }
-
- if (ValuesToLink.count(&SGV))
- return true;
-
- if (SGV.hasLocalLinkage())
+ if (ValuesToLink.count(&SGV) || SGV.hasLocalLinkage())
return true;
- if (DGV && !DGV->isDeclaration())
+ if (DGV && !DGV->isDeclarationForLinker())
return false;
if (SGV.hasAvailableExternallyLinkage())
return true;
- if (DoneLinkingBodies)
+ if (SGV.isDeclaration() || DoneLinkingBodies)
return false;
- AddLazyFor(SGV, [this](GlobalValue &GV) { maybeAdd(&GV); });
- return ValuesToLink.count(&SGV);
+  // Callback to the client to give it a chance to lazily add the Global to
+  // the list of values to link.
+ bool LazilyAdded = false;
+ AddLazyFor(SGV, [this, &LazilyAdded](GlobalValue &GV) {
+ maybeAdd(&GV);
+ LazilyAdded = true;
+ });
+ return LazilyAdded;
}
-Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) {
+Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
+ bool ForAlias) {
GlobalValue *DGV = getLinkedToGlobal(SGV);
bool ShouldLink = shouldLink(DGV, *SGV);
@@ -1066,9 +885,8 @@ Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) {
return cast<Constant>(I->second);
}
- DGV = nullptr;
- if (ShouldLink || !ForAlias)
- DGV = getLinkedToGlobal(SGV);
+ if (!ShouldLink && ForAlias)
+ DGV = nullptr;
// Handle the ultra special appending linkage case first.
assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage());
@@ -1077,7 +895,7 @@ Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) {
cast<GlobalVariable>(SGV));
GlobalValue *NewGV;
- if (useExistingDest(*SGV, DGV, ShouldLink)) {
+ if (DGV && !ShouldLink) {
NewGV = DGV;
} else {
// If we are done linking global value bodies (i.e. we are performing
@@ -1087,9 +905,17 @@ Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) {
return nullptr;
NewGV = copyGlobalValueProto(SGV, ShouldLink);
- if (!ForAlias)
+ if (ShouldLink || !ForAlias)
forceRenaming(NewGV, SGV->getName());
}
+
+  // Overloaded intrinsics have the names of their overloaded types as part of
+  // their names. If we renamed overloaded types, we should rename the
+  // intrinsic as well.
+ if (Function *F = dyn_cast<Function>(NewGV))
+ if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F))
+ NewGV = Remangled.getValue();
+
if (ShouldLink || ForAlias) {
if (const Comdat *SC = SGV->getComdat()) {
if (auto *GO = dyn_cast<GlobalObject>(NewGV)) {
@@ -1119,196 +945,74 @@ Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) {
/// referenced are in Dest.
void IRLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) {
// Figure out what the initializer looks like in the dest module.
- Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, ValueMapperFlags,
- &TypeMap, &GValMaterializer));
+ Mapper.scheduleMapGlobalInitializer(Dst, *Src.getInitializer());
}
/// Copy the source function over into the dest function and fix up references
/// to values. At this point we know that Dest is an external function, and
/// that Src is not.
-bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
+Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
assert(Dst.isDeclaration() && !Src.isDeclaration());
// Materialize if needed.
if (std::error_code EC = Src.materialize())
- return emitError(EC.message());
-
- if (!shouldLinkMetadata())
- // This is only supported for lazy links. Do after materialization of
- // a function and before remapping metadata on instructions below
- // in RemapInstruction, as the saved mapping is used to handle
- // the temporary metadata hanging off instructions.
- SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
- /* OnlyTempMD = */ true);
+ return errorCodeToError(EC);
- // Link in the prefix data.
+ // Link in the operands without remapping.
if (Src.hasPrefixData())
- Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, ValueMapperFlags,
- &TypeMap, &GValMaterializer));
-
- // Link in the prologue data.
+ Dst.setPrefixData(Src.getPrefixData());
if (Src.hasPrologueData())
- Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap,
- ValueMapperFlags, &TypeMap,
- &GValMaterializer));
-
- // Link in the personality function.
+ Dst.setPrologueData(Src.getPrologueData());
if (Src.hasPersonalityFn())
- Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap,
- ValueMapperFlags, &TypeMap,
- &GValMaterializer));
-
- // Go through and convert function arguments over, remembering the mapping.
- Function::arg_iterator DI = Dst.arg_begin();
- for (Argument &Arg : Src.args()) {
- DI->setName(Arg.getName()); // Copy the name over.
-
- // Add a mapping to our mapping.
- ValueMap[&Arg] = &*DI;
- ++DI;
- }
+ Dst.setPersonalityFn(Src.getPersonalityFn());
- // Copy over the metadata attachments.
- SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
- Src.getAllMetadata(MDs);
- for (const auto &I : MDs)
- Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, ValueMapperFlags,
- &TypeMap, &GValMaterializer));
+ // Copy over the metadata attachments without remapping.
+ Dst.copyMetadata(&Src, 0);
- // Splice the body of the source function into the dest function.
+ // Steal arguments and splice the body of Src into Dst.
+ Dst.stealArgumentListFrom(Src);
Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList());
- // At this point, all of the instructions and values of the function are now
- // copied over. The only problem is that they are still referencing values in
- // the Source function as operands. Loop through all of the operands of the
- // functions and patch them up to point to the local versions.
- for (BasicBlock &BB : Dst)
- for (Instruction &I : BB)
- RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries | ValueMapperFlags,
- &TypeMap, &GValMaterializer);
-
- // There is no need to map the arguments anymore.
- for (Argument &Arg : Src.args())
- ValueMap.erase(&Arg);
-
- return false;
+ // Everything has been moved over. Remap it.
+ Mapper.scheduleRemapFunction(Dst);
+ return Error::success();
}
void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
- Constant *Aliasee = Src.getAliasee();
- Constant *Val = MapValue(Aliasee, AliasValueMap, ValueMapperFlags, &TypeMap,
- &LValMaterializer);
- Dst.setAliasee(Val);
+ Mapper.scheduleMapGlobalAliasee(Dst, *Src.getAliasee(), AliasMCID);
}
-bool IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
+Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
if (auto *F = dyn_cast<Function>(&Src))
return linkFunctionBody(cast<Function>(Dst), *F);
if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) {
linkGlobalInit(cast<GlobalVariable>(Dst), *GVar);
- return false;
+ return Error::success();
}
linkAliasBody(cast<GlobalAlias>(Dst), cast<GlobalAlias>(Src));
- return false;
-}
-
-void IRLinker::findNeededSubprograms(ValueToValueMapTy &ValueMap) {
- // Track unneeded nodes to make it simpler to handle the case
- // where we are checking if an already-mapped SP is needed.
- NamedMDNode *CompileUnits = SrcM.getNamedMetadata("llvm.dbg.cu");
- if (!CompileUnits)
- return;
- for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) {
- auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I));
- assert(CU && "Expected valid compile unit");
- // Ensure that we don't remove subprograms referenced by DIImportedEntity.
- // It is not legal to have a DIImportedEntity with a null entity or scope.
- // FIXME: The DISubprogram for functions not linked in but kept due to
- // being referenced by a DIImportedEntity should also get their
- // IsDefinition flag is unset.
- SmallPtrSet<DISubprogram *, 8> ImportedEntitySPs;
- for (auto *IE : CU->getImportedEntities()) {
- if (auto *SP = dyn_cast<DISubprogram>(IE->getEntity()))
- ImportedEntitySPs.insert(SP);
- if (auto *SP = dyn_cast<DISubprogram>(IE->getScope()))
- ImportedEntitySPs.insert(SP);
- }
- for (auto *Op : CU->getSubprograms()) {
- // Unless we were doing function importing and deferred metadata linking,
- // any needed SPs should have been mapped as they would be reached
- // from the function linked in (either on the function itself for linked
- // function bodies, or from DILocation on inlined instructions).
- assert(!(ValueMap.MD()[Op] && IsMetadataLinkingPostpass) &&
- "DISubprogram shouldn't be mapped yet");
- if (!ValueMap.MD()[Op] && !ImportedEntitySPs.count(Op))
- UnneededSubprograms.insert(Op);
- }
- }
- if (!IsMetadataLinkingPostpass)
- return;
- // In the case of metadata linking as a postpass (e.g. for function
- // importing), see which DISubprogram MD from the source has an associated
- // temporary metadata node, which means the SP was needed by an imported
- // function.
- for (auto MDI : MetadataToIDs) {
- const MDNode *Node = dyn_cast<MDNode>(MDI.first);
- if (!Node)
- continue;
- DISubprogram *SP = getDISubprogram(Node);
- if (!SP || !ValIDToTempMDMap->count(MDI.second))
- continue;
- UnneededSubprograms.erase(SP);
- }
-}
-
-// Squash null subprograms from compile unit subprogram lists.
-void IRLinker::stripNullSubprograms() {
- NamedMDNode *CompileUnits = DstM.getNamedMetadata("llvm.dbg.cu");
- if (!CompileUnits)
- return;
- for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) {
- auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I));
- assert(CU && "Expected valid compile unit");
-
- SmallVector<Metadata *, 16> NewSPs;
- NewSPs.reserve(CU->getSubprograms().size());
- bool FoundNull = false;
- for (DISubprogram *SP : CU->getSubprograms()) {
- if (!SP) {
- FoundNull = true;
- continue;
- }
- NewSPs.push_back(SP);
- }
- if (FoundNull)
- CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs));
- }
+ return Error::success();
}
/// Insert all of the named MDNodes in Src into the Dest module.
void IRLinker::linkNamedMDNodes() {
- findNeededSubprograms(ValueMap);
- const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata();
- for (const NamedMDNode &NMD : SrcM.named_metadata()) {
+ const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
+ for (const NamedMDNode &NMD : SrcM->named_metadata()) {
// Don't link module flags here. Do them separately.
if (&NMD == SrcModFlags)
continue;
NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName());
// Add Src elements into Dest node.
- for (const MDNode *op : NMD.operands())
- DestNMD->addOperand(MapMetadata(
- op, ValueMap, ValueMapperFlags | RF_NullMapMissingGlobalValues,
- &TypeMap, &GValMaterializer));
+ for (const MDNode *Op : NMD.operands())
+ DestNMD->addOperand(Mapper.mapMDNode(*Op));
}
- stripNullSubprograms();
}
/// Merge the linker flags in Src into the Dest module.
-bool IRLinker::linkModuleFlagsMetadata() {
+Error IRLinker::linkModuleFlagsMetadata() {
// If the source module has no module flags, we are done.
- const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata();
+ const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
if (!SrcModFlags)
- return false;
+ return Error::success();
// If the destination module doesn't have module flags yet, then just copy
// over the source module's flags.
@@ -1317,7 +1021,7 @@ bool IRLinker::linkModuleFlagsMetadata() {
for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I)
DstModFlags->addOperand(SrcModFlags->getOperand(I));
- return false;
+ return Error::success();
}
// First build a map of the existing module flags and requirements.
@@ -1373,10 +1077,9 @@ bool IRLinker::linkModuleFlagsMetadata() {
if (DstBehaviorValue == Module::Override) {
// Diagnose inconsistent flags which both have override behavior.
if (SrcBehaviorValue == Module::Override &&
- SrcOp->getOperand(2) != DstOp->getOperand(2)) {
- emitError("linking module flags '" + ID->getString() +
- "': IDs have conflicting override values");
- }
+ SrcOp->getOperand(2) != DstOp->getOperand(2))
+ return stringErr("linking module flags '" + ID->getString() +
+ "': IDs have conflicting override values");
continue;
} else if (SrcBehaviorValue == Module::Override) {
// Update the destination flag to that of the source.
@@ -1386,11 +1089,9 @@ bool IRLinker::linkModuleFlagsMetadata() {
}
// Diagnose inconsistent merge behavior types.
- if (SrcBehaviorValue != DstBehaviorValue) {
- emitError("linking module flags '" + ID->getString() +
- "': IDs have conflicting behaviors");
- continue;
- }
+ if (SrcBehaviorValue != DstBehaviorValue)
+ return stringErr("linking module flags '" + ID->getString() +
+ "': IDs have conflicting behaviors");
auto replaceDstValue = [&](MDNode *New) {
Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
@@ -1406,10 +1107,9 @@ bool IRLinker::linkModuleFlagsMetadata() {
llvm_unreachable("not possible");
case Module::Error: {
// Emit an error if the values differ.
- if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
- emitError("linking module flags '" + ID->getString() +
- "': IDs have conflicting values");
- }
+ if (SrcOp->getOperand(2) != DstOp->getOperand(2))
+ return stringErr("linking module flags '" + ID->getString() +
+ "': IDs have conflicting values");
continue;
}
case Module::Warning: {
@@ -1452,14 +1152,11 @@ bool IRLinker::linkModuleFlagsMetadata() {
Metadata *ReqValue = Requirement->getOperand(1);
MDNode *Op = Flags[Flag].first;
- if (!Op || Op->getOperand(2) != ReqValue) {
- emitError("linking module flags '" + Flag->getString() +
- "': does not have the required value");
- continue;
- }
+ if (!Op || Op->getOperand(2) != ReqValue)
+ return stringErr("linking module flags '" + Flag->getString() +
+ "': does not have the required value");
}
-
- return HasError;
+ return Error::success();
}
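// A hedged illustration (not from this patch) of a flag conflict that
// linkModuleFlagsMetadata() now reports as an Error rather than via
// emitError(): two modules that both use Module::Error behavior for the same
// flag but disagree on its value. Module names and the flag value are
// hypothetical.
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

static void makeConflictingModuleFlags(llvm::LLVMContext &Ctx) {
  llvm::Module A("a.ll", Ctx), B("b.ll", Ctx);
  A.addModuleFlag(llvm::Module::Error, "wchar_size", 2);
  B.addModuleFlag(llvm::Module::Error, "wchar_size", 4);
  // Linking B into A would now fail with
  // "linking module flags 'wchar_size': IDs have conflicting values".
}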
// This function returns true if the triples match.
@@ -1483,41 +1180,47 @@ static std::string mergeTriples(const Triple &SrcTriple,
return DstTriple.str();
}
-bool IRLinker::run() {
+Error IRLinker::run() {
+  // Ensure metadata is materialized before value mapping.
+ if (SrcM->getMaterializer())
+ if (std::error_code EC = SrcM->getMaterializer()->materializeMetadata())
+ return errorCodeToError(EC);
+
// Inherit the target data from the source module if the destination module
// doesn't have one already.
if (DstM.getDataLayout().isDefault())
- DstM.setDataLayout(SrcM.getDataLayout());
+ DstM.setDataLayout(SrcM->getDataLayout());
- if (SrcM.getDataLayout() != DstM.getDataLayout()) {
+ if (SrcM->getDataLayout() != DstM.getDataLayout()) {
emitWarning("Linking two modules of different data layouts: '" +
- SrcM.getModuleIdentifier() + "' is '" +
- SrcM.getDataLayoutStr() + "' whereas '" +
+ SrcM->getModuleIdentifier() + "' is '" +
+ SrcM->getDataLayoutStr() + "' whereas '" +
DstM.getModuleIdentifier() + "' is '" +
DstM.getDataLayoutStr() + "'\n");
}
// Copy the target triple from the source to dest if the dest's is empty.
- if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty())
- DstM.setTargetTriple(SrcM.getTargetTriple());
+ if (DstM.getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
+ DstM.setTargetTriple(SrcM->getTargetTriple());
- Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple());
+ Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());
- if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+ if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
emitWarning("Linking two modules of different target triples: " +
- SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() +
- "' whereas '" + DstM.getModuleIdentifier() + "' is '" +
- DstM.getTargetTriple() + "'\n");
+ SrcM->getModuleIdentifier() + "' is '" +
+ SrcM->getTargetTriple() + "' whereas '" +
+ DstM.getModuleIdentifier() + "' is '" + DstM.getTargetTriple() +
+ "'\n");
DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
// Append the module inline asm string.
- if (!SrcM.getModuleInlineAsm().empty()) {
+ if (!SrcM->getModuleInlineAsm().empty()) {
if (DstM.getModuleInlineAsm().empty())
- DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm());
+ DstM.setModuleInlineAsm(SrcM->getModuleInlineAsm());
else
DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
- SrcM.getModuleInlineAsm());
+ SrcM->getModuleInlineAsm());
}
// Loop over all of the linked values to compute type mappings.
@@ -1534,54 +1237,23 @@ bool IRLinker::run() {
continue;
assert(!GV->isDeclaration());
- MapValue(GV, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer);
- if (HasError)
- return true;
+ Mapper.mapValue(*GV);
+ if (FoundError)
+ return std::move(*FoundError);
}
// Note that we are done linking global value bodies. This prevents
// metadata linking from creating new references.
DoneLinkingBodies = true;
+ Mapper.addFlags(RF_NullMapMissingGlobalValues);
// Remap all of the named MDNodes in Src into the DstM module. We do this
// after linking GlobalValues so that MDNodes that reference GlobalValues
// are properly remapped.
- if (shouldLinkMetadata()) {
- // Even if just linking metadata we should link decls above in case
- // any are referenced by metadata. IRLinker::shouldLink ensures that
- // we don't actually link anything from source.
- if (IsMetadataLinkingPostpass) {
- // Ensure metadata materialized
- if (SrcM.getMaterializer()->materializeMetadata())
- return true;
- SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
- /* OnlyTempMD = */ false);
- }
-
- linkNamedMDNodes();
-
- if (IsMetadataLinkingPostpass) {
- // Handle anything left in the ValIDToTempMDMap, such as metadata nodes
- // not reached by the dbg.cu NamedMD (i.e. only reached from
- // instructions).
- // Walk the MetadataToIDs once to find the set of new (imported) MD
- // that still has corresponding temporary metadata, and invoke metadata
- // mapping on each one.
- for (auto MDI : MetadataToIDs) {
- if (!ValIDToTempMDMap->count(MDI.second))
- continue;
- MapMetadata(MDI.first, ValueMap, ValueMapperFlags, &TypeMap,
- &GValMaterializer);
- }
- assert(ValIDToTempMDMap->empty());
- }
-
- // Merge the module flags into the DstM module.
- if (linkModuleFlagsMetadata())
- return true;
- }
+ linkNamedMDNodes();
- return false;
+ // Merge the module flags into the DstM module.
+ return linkModuleFlagsMetadata();
}
IRMover::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P)
@@ -1591,11 +1263,7 @@ IRMover::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST)
: ETypes(ST->elements()), IsPacked(ST->isPacked()) {}
bool IRMover::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const {
- if (IsPacked != That.IsPacked)
- return false;
- if (ETypes != That.ETypes)
- return false;
- return true;
+ return IsPacked == That.IsPacked && ETypes == That.ETypes;
}
bool IRMover::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const {
@@ -1628,12 +1296,8 @@ bool IRMover::StructTypeKeyInfo::isEqual(const KeyTy &LHS,
bool IRMover::StructTypeKeyInfo::isEqual(const StructType *LHS,
const StructType *RHS) {
- if (RHS == getEmptyKey())
- return LHS == getEmptyKey();
-
- if (RHS == getTombstoneKey())
- return LHS == getTombstoneKey();
-
+ if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
return KeyTy(LHS) == KeyTy(RHS);
}
@@ -1660,18 +1324,14 @@ IRMover::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes,
bool IsPacked) {
IRMover::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked);
auto I = NonOpaqueStructTypes.find_as(Key);
- if (I == NonOpaqueStructTypes.end())
- return nullptr;
- return *I;
+ return I == NonOpaqueStructTypes.end() ? nullptr : *I;
}
bool IRMover::IdentifiedStructTypeSet::hasType(StructType *Ty) {
if (Ty->isOpaque())
return OpaqueStructTypes.count(Ty);
auto I = NonOpaqueStructTypes.find(Ty);
- if (I == NonOpaqueStructTypes.end())
- return false;
- return *I == Ty;
+ return I == NonOpaqueStructTypes.end() ? false : *I == Ty;
}
IRMover::IRMover(Module &M) : Composite(M) {
@@ -1685,14 +1345,12 @@ IRMover::IRMover(Module &M) : Composite(M) {
}
}
-bool IRMover::move(
- Module &Src, ArrayRef<GlobalValue *> ValuesToLink,
- std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor,
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap,
- bool IsMetadataLinkingPostpass) {
- IRLinker TheIRLinker(Composite, IdentifiedStructTypes, Src, ValuesToLink,
- AddLazyFor, ValIDToTempMDMap, IsMetadataLinkingPostpass);
- bool RetCode = TheIRLinker.run();
+Error IRMover::move(
+ std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
+ std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor) {
+ IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
+ std::move(Src), ValuesToLink, std::move(AddLazyFor));
+ Error E = TheIRLinker.run();
Composite.dropTriviallyDeadConstantArrays();
- return RetCode;
+ return E;
}
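// A minimal usage sketch (not part of this patch) of the Error-returning
// IRMover::move() defined above. The caller, module, and error handling shown
// here are hypothetical; only the signature comes from this patch.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Module.h"
#include "llvm/Linker/IRMover.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

static bool moveValues(llvm::IRMover &Mover, std::unique_ptr<llvm::Module> Src,
                       llvm::ArrayRef<llvm::GlobalValue *> ValuesToLink) {
  llvm::Error E = Mover.move(
      std::move(Src), ValuesToLink,
      [](llvm::GlobalValue &GV, llvm::IRMover::ValueAdder Add) {
        // Lazily pull in anything the mover asks about.
        Add(GV);
      });
  if (E) {
    llvm::logAllUnhandledErrors(std::move(E), llvm::errs(), "IRMover: ");
    return true; // Failure, mirroring the old bool return convention.
  }
  return false;
}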
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 6ffa71e14779..fae9c95ebe8f 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -11,13 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Linker/Linker.h"
#include "LinkDiagnosticInfo.h"
#include "llvm-c/Linker.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
using namespace llvm;
namespace {
@@ -26,7 +28,7 @@ namespace {
/// entrypoint for this file.
class ModuleLinker {
IRMover &Mover;
- Module &SrcM;
+ std::unique_ptr<Module> SrcM;
SetVector<GlobalValue *> ValuesToLink;
StringSet<> Internalize;
@@ -34,31 +36,19 @@ class ModuleLinker {
/// For symbol clashes, prefer those from Src.
unsigned Flags;
- /// Function index passed into ModuleLinker for using in function
- /// importing/exporting handling.
- const FunctionInfoIndex *ImportIndex;
-
/// Functions to import from source module, all other functions are
/// imported as declarations instead of definitions.
- DenseSet<const GlobalValue *> *FunctionsToImport;
-
- /// Set to true if the given FunctionInfoIndex contains any functions
- /// from this source module, in which case we must conservatively assume
- /// that any of its functions may be imported into another module
- /// as part of a different backend compilation process.
- bool HasExportedFunctions = false;
-
- /// Association between metadata value id and temporary metadata that
- /// remains unmapped after function importing. Saved during function
- /// importing and consumed during the metadata linking postpass.
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap;
+ DenseSet<const GlobalValue *> *GlobalsToImport;
/// Used as the callback for lazy linking.
/// The mover has just hit GV and we have to decide if it, and other members
/// of the same comdat, should be linked. Every member to be linked is passed
/// to Add.
- void addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add);
+ void addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add);
+ bool shouldLinkReferencedLinkOnce() {
+ return !(Flags & Linker::DontForceLinkLinkonceODR);
+ }
bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; }
bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; }
bool shouldInternalizeLinkedSymbols() {
@@ -70,7 +60,7 @@ class ModuleLinker {
/// Should we have mover and linker error diag info?
bool emitError(const Twine &Message) {
- SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message));
+ SrcM->getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message));
return true;
}
@@ -85,8 +75,8 @@ class ModuleLinker {
ComdatsChosen;
bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
bool &LinkFromSrc);
- // Keep track of the global value members of each comdat in source.
- DenseMap<const Comdat *, std::vector<GlobalValue *>> ComdatMembers;
+  // Keep track of the lazily linked global members of each comdat in source.
+ DenseMap<const Comdat *, std::vector<GlobalValue *>> LazyComdatMembers;
/// Given a global in the source module, return the global in the
/// destination module that is being linked to, if any.
@@ -111,289 +101,36 @@ class ModuleLinker {
return DGV;
}
+ /// Drop GV if it is a member of a comdat that we are dropping.
+ /// This can happen with COFF's largest selection kind.
+ void dropReplacedComdat(GlobalValue &GV,
+ const DenseSet<const Comdat *> &ReplacedDstComdats);
+
bool linkIfNeeded(GlobalValue &GV);
/// Helper method to check if we are importing from the current source
/// module.
- bool isPerformingImport() const { return FunctionsToImport != nullptr; }
+ bool isPerformingImport() const { return GlobalsToImport != nullptr; }
/// If we are importing from the source module, checks if we should
/// import SGV as a definition, otherwise import as a declaration.
bool doImportAsDefinition(const GlobalValue *SGV);
public:
- ModuleLinker(IRMover &Mover, Module &SrcM, unsigned Flags,
- const FunctionInfoIndex *Index = nullptr,
- DenseSet<const GlobalValue *> *FunctionsToImport = nullptr,
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr)
- : Mover(Mover), SrcM(SrcM), Flags(Flags), ImportIndex(Index),
- FunctionsToImport(FunctionsToImport),
- ValIDToTempMDMap(ValIDToTempMDMap) {
- assert((ImportIndex || !FunctionsToImport) &&
- "Expect a FunctionInfoIndex when importing");
- // If we have a FunctionInfoIndex but no function to import,
- // then this is the primary module being compiled in a ThinLTO
- // backend compilation, and we need to see if it has functions that
- // may be exported to another backend compilation.
- if (ImportIndex && !FunctionsToImport)
- HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM);
- assert((ValIDToTempMDMap || !FunctionsToImport) &&
- "Function importing must provide a ValIDToTempMDMap");
- }
+ ModuleLinker(IRMover &Mover, std::unique_ptr<Module> SrcM, unsigned Flags,
+ DenseSet<const GlobalValue *> *GlobalsToImport = nullptr)
+ : Mover(Mover), SrcM(std::move(SrcM)), Flags(Flags),
+ GlobalsToImport(GlobalsToImport) {}
bool run();
};
-
-/// Class to handle necessary GlobalValue changes required by ThinLTO including
-/// linkage changes and any necessary renaming.
-class ThinLTOGlobalProcessing {
- /// The Module which we are exporting or importing functions from.
- Module &M;
-
- /// Function index passed in for function importing/exporting handling.
- const FunctionInfoIndex *ImportIndex;
-
- /// Functions to import from this module, all other functions will be
- /// imported as declarations instead of definitions.
- DenseSet<const GlobalValue *> *FunctionsToImport;
-
- /// Set to true if the given FunctionInfoIndex contains any functions
- /// from this source module, in which case we must conservatively assume
- /// that any of its functions may be imported into another module
- /// as part of a different backend compilation process.
- bool HasExportedFunctions = false;
-
- /// Populated during ThinLTO global processing with locals promoted
- /// to global scope in an exporting module, which now need to be linked
- /// in if calling from the ModuleLinker.
- SetVector<GlobalValue *> NewExportedValues;
-
- /// Check if we should promote the given local value to global scope.
- bool doPromoteLocalToGlobal(const GlobalValue *SGV);
-
- /// Helper methods to check if we are importing from or potentially
- /// exporting from the current source module.
- bool isPerformingImport() const { return FunctionsToImport != nullptr; }
- bool isModuleExporting() const { return HasExportedFunctions; }
-
- /// If we are importing from the source module, checks if we should
- /// import SGV as a definition, otherwise import as a declaration.
- bool doImportAsDefinition(const GlobalValue *SGV);
-
- /// Get the name for SGV that should be used in the linked destination
- /// module. Specifically, this handles the case where we need to rename
- /// a local that is being promoted to global scope.
- std::string getName(const GlobalValue *SGV);
-
- /// Process globals so that they can be used in ThinLTO. This includes
- /// promoting local variables so that they can be reference externally by
- /// thin lto imported globals and converting strong external globals to
- /// available_externally.
- void processGlobalsForThinLTO();
- void processGlobalForThinLTO(GlobalValue &GV);
-
- /// Get the new linkage for SGV that should be used in the linked destination
- /// module. Specifically, for ThinLTO importing or exporting it may need
- /// to be adjusted.
- GlobalValue::LinkageTypes getLinkage(const GlobalValue *SGV);
-
-public:
- ThinLTOGlobalProcessing(
- Module &M, const FunctionInfoIndex *Index,
- DenseSet<const GlobalValue *> *FunctionsToImport = nullptr)
- : M(M), ImportIndex(Index), FunctionsToImport(FunctionsToImport) {
- // If we have a FunctionInfoIndex but no function to import,
- // then this is the primary module being compiled in a ThinLTO
- // backend compilation, and we need to see if it has functions that
- // may be exported to another backend compilation.
- if (!FunctionsToImport)
- HasExportedFunctions = ImportIndex->hasExportedFunctions(M);
- }
-
- bool run();
-
- /// Access the promoted globals that are now exported and need to be linked.
- SetVector<GlobalValue *> &getNewExportedValues() { return NewExportedValues; }
-};
-}
-
-/// Checks if we should import SGV as a definition, otherwise import as a
-/// declaration.
-static bool
-doImportAsDefinitionImpl(const GlobalValue *SGV,
- DenseSet<const GlobalValue *> *FunctionsToImport) {
- auto *GA = dyn_cast<GlobalAlias>(SGV);
- if (GA) {
- if (GA->hasWeakAnyLinkage())
- return false;
- const GlobalObject *GO = GA->getBaseObject();
- if (!GO->hasLinkOnceODRLinkage())
- return false;
- return doImportAsDefinitionImpl(GO, FunctionsToImport);
- }
- // Always import GlobalVariable definitions, except for the special
- // case of WeakAny which are imported as ExternalWeak declarations
- // (see comments in ModuleLinker::getLinkage). The linkage changes
- // described in ModuleLinker::getLinkage ensure the correct behavior (e.g.
- // global variables with external linkage are transformed to
- // available_externally definitions, which are ultimately turned into
- // declarations after the EliminateAvailableExternally pass).
- if (isa<GlobalVariable>(SGV) && !SGV->isDeclaration() &&
- !SGV->hasWeakAnyLinkage())
- return true;
- // Only import the function requested for importing.
- auto *SF = dyn_cast<Function>(SGV);
- if (SF && FunctionsToImport->count(SF))
- return true;
- // Otherwise no.
- return false;
-}
-
-bool ThinLTOGlobalProcessing::doImportAsDefinition(const GlobalValue *SGV) {
- if (!isPerformingImport())
- return false;
- return doImportAsDefinitionImpl(SGV, FunctionsToImport);
}
bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) {
if (!isPerformingImport())
return false;
- return doImportAsDefinitionImpl(SGV, FunctionsToImport);
-}
-
-bool ThinLTOGlobalProcessing::doPromoteLocalToGlobal(const GlobalValue *SGV) {
- assert(SGV->hasLocalLinkage());
- // Both the imported references and the original local variable must
- // be promoted.
- if (!isPerformingImport() && !isModuleExporting())
- return false;
-
- // Local const variables never need to be promoted unless they are address
- // taken. The imported uses can simply use the clone created in this module.
- // For now we are conservative in determining which variables are not
- // address taken by checking the unnamed addr flag. To be more aggressive,
- // the address taken information must be checked earlier during parsing
- // of the module and recorded in the function index for use when importing
- // from that module.
- auto *GVar = dyn_cast<GlobalVariable>(SGV);
- if (GVar && GVar->isConstant() && GVar->hasUnnamedAddr())
- return false;
-
- // Eventually we only need to promote functions in the exporting module that
- // are referenced by a potentially exported function (i.e. one that is in the
- // function index).
- return true;
-}
-
-std::string ThinLTOGlobalProcessing::getName(const GlobalValue *SGV) {
- // For locals that must be promoted to global scope, ensure that
- // the promoted name uniquely identifies the copy in the original module,
- // using the ID assigned during combined index creation. When importing,
- // we rename all locals (not just those that are promoted) in order to
- // avoid naming conflicts between locals imported from different modules.
- if (SGV->hasLocalLinkage() &&
- (doPromoteLocalToGlobal(SGV) || isPerformingImport()))
- return FunctionInfoIndex::getGlobalNameForLocal(
- SGV->getName(),
- ImportIndex->getModuleId(SGV->getParent()->getModuleIdentifier()));
- return SGV->getName();
-}
-
-GlobalValue::LinkageTypes
-ThinLTOGlobalProcessing::getLinkage(const GlobalValue *SGV) {
- // Any local variable that is referenced by an exported function needs
- // to be promoted to global scope. Since we don't currently know which
- // functions reference which local variables/functions, we must treat
- // all as potentially exported if this module is exporting anything.
- if (isModuleExporting()) {
- if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV))
- return GlobalValue::ExternalLinkage;
- return SGV->getLinkage();
- }
-
- // Otherwise, if we aren't importing, no linkage change is needed.
- if (!isPerformingImport())
- return SGV->getLinkage();
-
- switch (SGV->getLinkage()) {
- case GlobalValue::ExternalLinkage:
- // External defnitions are converted to available_externally
- // definitions upon import, so that they are available for inlining
- // and/or optimization, but are turned into declarations later
- // during the EliminateAvailableExternally pass.
- if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- // An imported external declaration stays external.
- return SGV->getLinkage();
-
- case GlobalValue::AvailableExternallyLinkage:
- // An imported available_externally definition converts
- // to external if imported as a declaration.
- if (!doImportAsDefinition(SGV))
- return GlobalValue::ExternalLinkage;
- // An imported available_externally declaration stays that way.
- return SGV->getLinkage();
-
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::LinkOnceODRLinkage:
- // These both stay the same when importing the definition.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
-
- case GlobalValue::WeakAnyLinkage:
- // Can't import weak_any definitions correctly, or we might change the
- // program semantics, since the linker will pick the first weak_any
- // definition and importing would change the order they are seen by the
- // linker. The module linking caller needs to enforce this.
- assert(!doImportAsDefinition(SGV));
- // If imported as a declaration, it becomes external_weak.
- return GlobalValue::ExternalWeakLinkage;
-
- case GlobalValue::WeakODRLinkage:
- // For weak_odr linkage, there is a guarantee that all copies will be
- // equivalent, so the issue described above for weak_any does not exist,
- // and the definition can be imported. It can be treated similarly
- // to an imported externally visible global value.
- if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
-
- case GlobalValue::AppendingLinkage:
- // It would be incorrect to import an appending linkage variable,
- // since it would cause global constructors/destructors to be
- // executed multiple times. This should have already been handled
- // by linkIfNeeded, and we will assert in shouldLinkFromSource
- // if we try to import, so we simply return AppendingLinkage.
- return GlobalValue::AppendingLinkage;
-
- case GlobalValue::InternalLinkage:
- case GlobalValue::PrivateLinkage:
- // If we are promoting the local to global scope, it is handled
- // similarly to a normal externally visible global.
- if (doPromoteLocalToGlobal(SGV)) {
- if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
- }
- // A non-promoted imported local definition stays local.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
-
- case GlobalValue::ExternalWeakLinkage:
- // External weak doesn't apply to definitions, must be a declaration.
- assert(!doImportAsDefinition(SGV));
- // Linkage stays external_weak.
- return SGV->getLinkage();
-
- case GlobalValue::CommonLinkage:
- // Linkage stays common on definitions.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
- }
-
- llvm_unreachable("unknown linkage type");
+ return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
+ GlobalsToImport);
}
static GlobalValue::VisibilityTypes
@@ -466,15 +203,13 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
const GlobalVariable *DstGV;
const GlobalVariable *SrcGV;
if (getComdatLeader(DstM, ComdatName, DstGV) ||
- getComdatLeader(SrcM, ComdatName, SrcGV))
+ getComdatLeader(*SrcM, ComdatName, SrcGV))
return true;
const DataLayout &DstDL = DstM.getDataLayout();
- const DataLayout &SrcDL = SrcM.getDataLayout();
- uint64_t DstSize =
- DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType());
- uint64_t SrcSize =
- SrcDL.getTypeAllocSize(SrcGV->getType()->getPointerElementType());
+ const DataLayout &SrcDL = SrcM->getDataLayout();
+ uint64_t DstSize = DstDL.getTypeAllocSize(DstGV->getValueType());
+ uint64_t SrcSize = SrcDL.getTypeAllocSize(SrcGV->getValueType());
if (Result == Comdat::SelectionKind::ExactMatch) {
if (SrcGV->getInitializer() != DstGV->getInitializer())
return emitError("Linking COMDATs named '" + ComdatName +
@@ -537,31 +272,15 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
return false;
}
- bool SrcIsDeclaration = Src.isDeclarationForLinker();
- bool DestIsDeclaration = Dest.isDeclarationForLinker();
-
if (isPerformingImport()) {
- if (isa<Function>(&Src)) {
- // For functions, LinkFromSrc iff this is a function requested
- // for importing. For variables, decide below normally.
- LinkFromSrc = FunctionsToImport->count(&Src);
- return false;
- }
-
- // Check if this is an alias with an already existing definition
- // in Dest, which must have come from a prior importing pass from
- // the same Src module. Unlike imported function and variable
- // definitions, which are imported as available_externally and are
- // not definitions for the linker, that is not a valid linkage for
- // imported aliases which must be definitions. Simply use the existing
- // Dest copy.
- if (isa<GlobalAlias>(&Src) && !DestIsDeclaration) {
- assert(isa<GlobalAlias>(&Dest));
- LinkFromSrc = false;
- return false;
- }
+ // LinkFromSrc iff this is a global requested for importing.
+ LinkFromSrc = GlobalsToImport->count(&Src);
+ return false;
}
+ bool SrcIsDeclaration = Src.isDeclarationForLinker();
+ bool DestIsDeclaration = Dest.isDeclarationForLinker();
+
if (SrcIsDeclaration) {
// If Src is external or if both Src & Dest are external.. Just link the
// external globals, we aren't adding anything.
@@ -598,8 +317,8 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
}
const DataLayout &DL = Dest.getParent()->getDataLayout();
- uint64_t DestSize = DL.getTypeAllocSize(Dest.getType()->getElementType());
- uint64_t SrcSize = DL.getTypeAllocSize(Src.getType()->getElementType());
+ uint64_t DestSize = DL.getTypeAllocSize(Dest.getValueType());
+ uint64_t SrcSize = DL.getTypeAllocSize(Src.getValueType());
LinkFromSrc = SrcSize > DestSize;
return false;
}
@@ -658,9 +377,10 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
DGV->setVisibility(Visibility);
GV.setVisibility(Visibility);
- bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr();
- DGV->setUnnamedAddr(HasUnnamedAddr);
- GV.setUnnamedAddr(HasUnnamedAddr);
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::getMinUnnamedAddr(
+ DGV->getUnnamedAddr(), GV.getUnnamedAddr());
+ DGV->setUnnamedAddr(UnnamedAddr);
+ GV.setUnnamedAddr(UnnamedAddr);
}
// Don't want to append to global_ctors list, for example, when we
@@ -670,12 +390,12 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
if (GV.hasAppendingLinkage() && isPerformingImport())
return false;
- if (isPerformingImport() && !doImportAsDefinition(&GV))
- return false;
-
- if (!DGV && !shouldOverrideFromSrc() &&
- (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
- GV.hasAvailableExternallyLinkage()))
+ if (isPerformingImport()) {
+ if (!doImportAsDefinition(&GV))
+ return false;
+ } else if (!DGV && !shouldOverrideFromSrc() &&
+ (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+ GV.hasAvailableExternallyLinkage()))
return false;
if (GV.isDeclaration())
@@ -685,9 +405,8 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
bool LinkFromSrc;
Comdat::SelectionKind SK;
std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
- if (LinkFromSrc)
- ValuesToLink.insert(&GV);
- return false;
+ if (!LinkFromSrc)
+ return false;
}
bool LinkFromSrc = true;
@@ -698,9 +417,15 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
return false;
}
-void ModuleLinker::addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add) {
+void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
+ if (!shouldLinkReferencedLinkOnce())
+ // For ThinLTO we don't import more than what was required.
+    // The client has to guarantee that the linkonce will be available at link
+ // time (by promoting it to weak for instance).
+ return;
+
// Add these to the internalize list
- if (!GV.hasLinkOnceLinkage())
+ if (!GV.hasLinkOnceLinkage() && !shouldLinkOnlyNeeded())
return;
if (shouldInternalizeLinkedSymbols())
@@ -710,43 +435,59 @@ void ModuleLinker::addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add) {
const Comdat *SC = GV.getComdat();
if (!SC)
return;
- for (GlobalValue *GV2 : ComdatMembers[SC]) {
- if (!GV2->hasLocalLinkage() && shouldInternalizeLinkedSymbols())
+ for (GlobalValue *GV2 : LazyComdatMembers[SC]) {
+ GlobalValue *DGV = getLinkedToGlobal(GV2);
+ bool LinkFromSrc = true;
+ if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, *GV2))
+ return;
+ if (!LinkFromSrc)
+ continue;
+ if (shouldInternalizeLinkedSymbols())
Internalize.insert(GV2->getName());
Add(*GV2);
}
}
-void ThinLTOGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
- if (GV.hasLocalLinkage() &&
- (doPromoteLocalToGlobal(&GV) || isPerformingImport())) {
- GV.setName(getName(&GV));
- GV.setLinkage(getLinkage(&GV));
- if (!GV.hasLocalLinkage())
- GV.setVisibility(GlobalValue::HiddenVisibility);
- if (isModuleExporting())
- NewExportedValues.insert(&GV);
+void ModuleLinker::dropReplacedComdat(
+ GlobalValue &GV, const DenseSet<const Comdat *> &ReplacedDstComdats) {
+ Comdat *C = GV.getComdat();
+ if (!C)
+ return;
+ if (!ReplacedDstComdats.count(C))
+ return;
+ if (GV.use_empty()) {
+ GV.eraseFromParent();
return;
}
- GV.setLinkage(getLinkage(&GV));
-}
-
-void ThinLTOGlobalProcessing::processGlobalsForThinLTO() {
- for (GlobalVariable &GV : M.globals())
- processGlobalForThinLTO(GV);
- for (Function &SF : M)
- processGlobalForThinLTO(SF);
- for (GlobalAlias &GA : M.aliases())
- processGlobalForThinLTO(GA);
-}
-bool ThinLTOGlobalProcessing::run() {
- processGlobalsForThinLTO();
- return false;
+ if (auto *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ } else if (auto *Var = dyn_cast<GlobalVariable>(&GV)) {
+ Var->setInitializer(nullptr);
+ } else {
+ auto &Alias = cast<GlobalAlias>(GV);
+ Module &M = *Alias.getParent();
+ PointerType &Ty = *cast<PointerType>(Alias.getType());
+ GlobalValue *Declaration;
+ if (auto *FTy = dyn_cast<FunctionType>(Alias.getValueType())) {
+ Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage, "", &M);
+ } else {
+ Declaration =
+ new GlobalVariable(M, Ty.getElementType(), /*isConstant*/ false,
+ GlobalValue::ExternalLinkage,
+ /*Initializer*/ nullptr);
+ }
+ Declaration->takeName(&Alias);
+ Alias.replaceAllUsesWith(Declaration);
+ Alias.eraseFromParent();
+ }
}
bool ModuleLinker::run() {
- for (const auto &SMEC : SrcM.getComdatSymbolTable()) {
+ Module &DstM = Mover.getModule();
+ DenseSet<const Comdat *> ReplacedDstComdats;
+
+ for (const auto &SMEC : SrcM->getComdatSymbolTable()) {
const Comdat &C = SMEC.getValue();
if (ComdatsChosen.count(&C))
continue;
@@ -755,50 +496,79 @@ bool ModuleLinker::run() {
if (getComdatResult(&C, SK, LinkFromSrc))
return true;
ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc);
+
+ if (!LinkFromSrc)
+ continue;
+
+ Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable();
+ Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(C.getName());
+ if (DstCI == ComdatSymTab.end())
+ continue;
+
+ // The source comdat is replacing the dest one.
+ const Comdat *DstC = &DstCI->second;
+ ReplacedDstComdats.insert(DstC);
}
- for (GlobalVariable &GV : SrcM.globals())
- if (const Comdat *SC = GV.getComdat())
- ComdatMembers[SC].push_back(&GV);
+ // Aliases have to go first, since we are not able to find their comdats
+ // otherwise.
+ for (auto I = DstM.alias_begin(), E = DstM.alias_end(); I != E;) {
+ GlobalAlias &GV = *I++;
+ dropReplacedComdat(GV, ReplacedDstComdats);
+ }
+
+ for (auto I = DstM.global_begin(), E = DstM.global_end(); I != E;) {
+ GlobalVariable &GV = *I++;
+ dropReplacedComdat(GV, ReplacedDstComdats);
+ }
+
+ for (auto I = DstM.begin(), E = DstM.end(); I != E;) {
+ Function &GV = *I++;
+ dropReplacedComdat(GV, ReplacedDstComdats);
+ }
+
+ for (GlobalVariable &GV : SrcM->globals())
+ if (GV.hasLinkOnceLinkage())
+ if (const Comdat *SC = GV.getComdat())
+ LazyComdatMembers[SC].push_back(&GV);
- for (Function &SF : SrcM)
- if (const Comdat *SC = SF.getComdat())
- ComdatMembers[SC].push_back(&SF);
+ for (Function &SF : *SrcM)
+ if (SF.hasLinkOnceLinkage())
+ if (const Comdat *SC = SF.getComdat())
+ LazyComdatMembers[SC].push_back(&SF);
- for (GlobalAlias &GA : SrcM.aliases())
- if (const Comdat *SC = GA.getComdat())
- ComdatMembers[SC].push_back(&GA);
+ for (GlobalAlias &GA : SrcM->aliases())
+ if (GA.hasLinkOnceLinkage())
+ if (const Comdat *SC = GA.getComdat())
+ LazyComdatMembers[SC].push_back(&GA);
// Insert all of the globals in src into the DstM module... without linking
// initializers (which could refer to functions not yet mapped over).
- for (GlobalVariable &GV : SrcM.globals())
+ for (GlobalVariable &GV : SrcM->globals())
if (linkIfNeeded(GV))
return true;
- for (Function &SF : SrcM)
+ for (Function &SF : *SrcM)
if (linkIfNeeded(SF))
return true;
- for (GlobalAlias &GA : SrcM.aliases())
+ for (GlobalAlias &GA : SrcM->aliases())
if (linkIfNeeded(GA))
return true;
- if (ImportIndex) {
- ThinLTOGlobalProcessing ThinLTOProcessing(SrcM, ImportIndex,
- FunctionsToImport);
- if (ThinLTOProcessing.run())
- return true;
- for (auto *GV : ThinLTOProcessing.getNewExportedValues())
- ValuesToLink.insert(GV);
- }
-
for (unsigned I = 0; I < ValuesToLink.size(); ++I) {
GlobalValue *GV = ValuesToLink[I];
const Comdat *SC = GV->getComdat();
if (!SC)
continue;
- for (GlobalValue *GV2 : ComdatMembers[SC])
- ValuesToLink.insert(GV2);
+ for (GlobalValue *GV2 : LazyComdatMembers[SC]) {
+ GlobalValue *DGV = getLinkedToGlobal(GV2);
+ bool LinkFromSrc = true;
+ if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, *GV2))
+ return true;
+ if (LinkFromSrc)
+ ValuesToLink.insert(GV2);
+ }
}
if (shouldInternalizeLinkedSymbols()) {
@@ -806,13 +576,21 @@ bool ModuleLinker::run() {
Internalize.insert(GV->getName());
}
- if (Mover.move(SrcM, ValuesToLink.getArrayRef(),
- [this](GlobalValue &GV, IRMover::ValueAdder Add) {
- addLazyFor(GV, Add);
- },
- ValIDToTempMDMap, false))
+ // FIXME: Propagate Errors through to the caller instead of emitting
+ // diagnostics.
+ bool HasErrors = false;
+ if (Error E = Mover.move(std::move(SrcM), ValuesToLink.getArrayRef(),
+ [this](GlobalValue &GV, IRMover::ValueAdder Add) {
+ addLazyFor(GV, Add);
+ })) {
+ handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
+ DstM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, EIB.message()));
+ HasErrors = true;
+ });
+ }
+ if (HasErrors)
return true;
- Module &DstM = Mover.getModule();
+
for (auto &P : Internalize) {
GlobalValue *GV = DstM.getNamedValue(P.first());
GV->setLinkage(GlobalValue::InternalLinkage);
@@ -824,30 +602,11 @@ bool ModuleLinker::run() {
Linker::Linker(Module &M) : Mover(M) {}
bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
- const FunctionInfoIndex *Index,
- DenseSet<const GlobalValue *> *FunctionsToImport,
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) {
- ModuleLinker ModLinker(Mover, *Src, Flags, Index, FunctionsToImport,
- ValIDToTempMDMap);
- return ModLinker.run();
-}
-
-bool Linker::linkInModuleForCAPI(Module &Src) {
- ModuleLinker ModLinker(Mover, Src, 0, nullptr, nullptr);
+ DenseSet<const GlobalValue *> *GlobalsToImport) {
+ ModuleLinker ModLinker(Mover, std::move(Src), Flags, GlobalsToImport);
return ModLinker.run();
}
-bool Linker::linkInMetadata(Module &Src,
- DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) {
- SetVector<GlobalValue *> ValuesToLink;
- if (Mover.move(
- Src, ValuesToLink.getArrayRef(),
- [this](GlobalValue &GV, IRMover::ValueAdder Add) { assert(false); },
- ValIDToTempMDMap, true))
- return true;
- return false;
-}
-
//===----------------------------------------------------------------------===//
// LinkModules entrypoint.
//===----------------------------------------------------------------------===//
@@ -863,44 +622,10 @@ bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src,
return L.linkInModule(std::move(Src), Flags);
}
-bool llvm::renameModuleForThinLTO(Module &M, const FunctionInfoIndex *Index) {
- ThinLTOGlobalProcessing ThinLTOProcessing(M, Index);
- return ThinLTOProcessing.run();
-}
-
//===----------------------------------------------------------------------===//
// C API.
//===----------------------------------------------------------------------===//
-static void diagnosticHandler(const DiagnosticInfo &DI, void *C) {
- auto *Message = reinterpret_cast<std::string *>(C);
- raw_string_ostream Stream(*Message);
- DiagnosticPrinterRawOStream DP(Stream);
- DI.print(DP);
-}
-
-LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
- LLVMLinkerMode Unused, char **OutMessages) {
- Module *D = unwrap(Dest);
- LLVMContext &Ctx = D->getContext();
-
- LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler =
- Ctx.getDiagnosticHandler();
- void *OldDiagnosticContext = Ctx.getDiagnosticContext();
- std::string Message;
- Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true);
-
- Linker L(*D);
- Module *M = unwrap(Src);
- LLVMBool Result = L.linkInModuleForCAPI(*M);
-
- Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true);
-
- if (OutMessages && Result)
- *OutMessages = strdup(Message.c_str());
- return Result;
-}
-
LLVMBool LLVMLinkModules2(LLVMModuleRef Dest, LLVMModuleRef Src) {
Module *D = unwrap(Dest);
std::unique_ptr<Module> M(unwrap(Src));
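The Mover.move() change above switches ModuleLinker::run() from a boolean failure flag to llvm::Error, turning any reported errors into LinkDiagnosticInfo diagnostics on the destination context. A minimal standalone sketch of that Error-to-diagnostic pattern, assuming only llvm/Support/Error.h (reportLinkErrors is an illustrative name, not part of the patch):

    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    // Consumes the Error and returns true if anything was reported, mirroring
    // the HasErrors flag used in ModuleLinker::run() above.
    static bool reportLinkErrors(llvm::Error E) {
      bool HasErrors = false;
      llvm::handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
        llvm::errs() << "link error: " << EIB.message() << '\n';
        HasErrors = true;
      });
      return HasErrors;
    }

handleAllErrors requires every error to be consumed, which is why the handler takes the generic ErrorInfoBase rather than a specific error class.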
diff --git a/lib/Linker/Makefile b/lib/Linker/Makefile
deleted file mode 100644
index 19e646b74830..000000000000
--- a/lib/Linker/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Linker/Makefile ---------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMLinker
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 8c015644d8ad..2f1b39e58e33 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -9,7 +9,7 @@ add_llvm_library(LLVMMC
MCAsmStreamer.cpp
MCAssembler.cpp
MCCodeEmitter.cpp
- MCCodeGenInfo.cpp
+ MCCodeView.cpp
MCContext.cpp
MCDwarf.cpp
MCELFObjectTargetWriter.cpp
@@ -38,7 +38,6 @@ add_llvm_library(LLVMMC
MCSubtargetInfo.cpp
MCSymbol.cpp
MCSymbolELF.cpp
- MCSymbolizer.cpp
MCTargetOptions.cpp
MCValue.cpp
MCWin64EH.cpp
@@ -48,7 +47,6 @@ add_llvm_library(LLVMMC
SubtargetFeature.cpp
WinCOFFObjectWriter.cpp
WinCOFFStreamer.cpp
- YAML.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/MC
diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
index 9643b7594682..17a23d063b7d 100644
--- a/lib/MC/ConstantPools.cpp
+++ b/lib/MC/ConstantPools.cpp
@@ -25,11 +25,10 @@ void ConstantPool::emitEntries(MCStreamer &Streamer) {
if (Entries.empty())
return;
Streamer.EmitDataRegion(MCDR_DataRegion);
- for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
- I != E; ++I) {
- Streamer.EmitCodeAlignment(I->Size); // align naturally
- Streamer.EmitLabel(I->Label);
- Streamer.EmitValue(I->Value, I->Size, I->Loc);
+ for (const ConstantPoolEntry &Entry : Entries) {
+ Streamer.EmitCodeAlignment(Entry.Size); // align naturally
+ Streamer.EmitLabel(Entry.Label);
+ Streamer.EmitValue(Entry.Value, Entry.Size, Entry.Loc);
}
Streamer.EmitDataRegion(MCDR_DataRegionEnd);
Entries.clear();
@@ -71,11 +70,9 @@ static void emitConstantPool(MCStreamer &Streamer, MCSection *Section,
void AssemblerConstantPools::emitAll(MCStreamer &Streamer) {
// Dump contents of assembler constant pools.
- for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(),
- CPE = ConstantPools.end();
- CPI != CPE; ++CPI) {
- MCSection *Section = CPI->first;
- ConstantPool &CP = CPI->second;
+ for (auto &CPI : ConstantPools) {
+ MCSection *Section = CPI.first;
+ ConstantPool &CP = CPI.second;
emitConstantPool(Streamer, Section, CP);
}
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index e6552beefd01..dc21b48ca6f6 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -35,13 +35,13 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/StringSaver.h"
#include <vector>
+
using namespace llvm;
#undef DEBUG_TYPE
#define DEBUG_TYPE "reloc-info"
namespace {
-
typedef DenseMap<const MCSectionELF *, uint32_t> SectionIndexMapTy;
class ELFObjectWriter;
@@ -70,169 +70,171 @@ public:
};
class ELFObjectWriter : public MCObjectWriter {
- static bool isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind);
- static uint64_t SymbolValue(const MCSymbol &Sym, const MCAsmLayout &Layout);
- static bool isInSymtab(const MCAsmLayout &Layout, const MCSymbolELF &Symbol,
- bool Used, bool Renamed);
-
- /// Helper struct for containing some precomputed information on symbols.
- struct ELFSymbolData {
- const MCSymbolELF *Symbol;
- uint32_t SectionIndex;
- StringRef Name;
-
- // Support lexicographic sorting.
- bool operator<(const ELFSymbolData &RHS) const {
- unsigned LHSType = Symbol->getType();
- unsigned RHSType = RHS.Symbol->getType();
- if (LHSType == ELF::STT_SECTION && RHSType != ELF::STT_SECTION)
- return false;
- if (LHSType != ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
- return true;
- if (LHSType == ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
- return SectionIndex < RHS.SectionIndex;
- return Name < RHS.Name;
- }
- };
+ static uint64_t SymbolValue(const MCSymbol &Sym, const MCAsmLayout &Layout);
+ static bool isInSymtab(const MCAsmLayout &Layout, const MCSymbolELF &Symbol,
+ bool Used, bool Renamed);
+
+ /// Helper struct for containing some precomputed information on symbols.
+ struct ELFSymbolData {
+ const MCSymbolELF *Symbol;
+ uint32_t SectionIndex;
+ StringRef Name;
+
+ // Support lexicographic sorting.
+ bool operator<(const ELFSymbolData &RHS) const {
+ unsigned LHSType = Symbol->getType();
+ unsigned RHSType = RHS.Symbol->getType();
+ if (LHSType == ELF::STT_SECTION && RHSType != ELF::STT_SECTION)
+ return false;
+ if (LHSType != ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
+ return true;
+ if (LHSType == ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
+ return SectionIndex < RHS.SectionIndex;
+ return Name < RHS.Name;
+ }
+ };
- /// The target specific ELF writer instance.
- std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
+ /// The target specific ELF writer instance.
+ std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
- DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
+ DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
- llvm::DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>>
- Relocations;
+ llvm::DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>>
+ Relocations;
- /// @}
- /// @name Symbol Table Data
- /// @{
+ /// @}
+ /// @name Symbol Table Data
+ /// @{
- BumpPtrAllocator Alloc;
- StringSaver VersionSymSaver{Alloc};
- StringTableBuilder StrTabBuilder{StringTableBuilder::ELF};
+ BumpPtrAllocator Alloc;
+ StringSaver VersionSymSaver{Alloc};
+ StringTableBuilder StrTabBuilder{StringTableBuilder::ELF};
- /// @}
+ /// @}
- // This holds the symbol table index of the last local symbol.
- unsigned LastLocalSymbolIndex;
- // This holds the .strtab section index.
- unsigned StringTableIndex;
- // This holds the .symtab section index.
- unsigned SymbolTableIndex;
+ // This holds the symbol table index of the last local symbol.
+ unsigned LastLocalSymbolIndex;
+ // This holds the .strtab section index.
+ unsigned StringTableIndex;
+ // This holds the .symtab section index.
+ unsigned SymbolTableIndex;
- // Sections in the order they are to be output in the section table.
- std::vector<const MCSectionELF *> SectionTable;
- unsigned addToSectionTable(const MCSectionELF *Sec);
+ // Sections in the order they are to be output in the section table.
+ std::vector<const MCSectionELF *> SectionTable;
+ unsigned addToSectionTable(const MCSectionELF *Sec);
- // TargetObjectWriter wrappers.
- bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
- bool hasRelocationAddend() const {
- return TargetObjectWriter->hasRelocationAddend();
- }
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const {
- return TargetObjectWriter->GetRelocType(Target, Fixup, IsPCRel);
- }
+ // TargetObjectWriter wrappers.
+ bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
+ bool hasRelocationAddend() const {
+ return TargetObjectWriter->hasRelocationAddend();
+ }
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const {
+ return TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel);
+ }
- void align(unsigned Alignment);
+ void align(unsigned Alignment);
- public:
- ELFObjectWriter(MCELFObjectTargetWriter *MOTW, raw_pwrite_stream &OS,
- bool IsLittleEndian)
- : MCObjectWriter(OS, IsLittleEndian), TargetObjectWriter(MOTW) {}
+ bool maybeWriteCompression(uint64_t Size,
+ SmallVectorImpl<char> &CompressedContents,
+ bool ZLibStyle, unsigned Alignment);
- void reset() override {
- Renames.clear();
- Relocations.clear();
- StrTabBuilder.clear();
- SectionTable.clear();
- MCObjectWriter::reset();
- }
+public:
+ ELFObjectWriter(MCELFObjectTargetWriter *MOTW, raw_pwrite_stream &OS,
+ bool IsLittleEndian)
+ : MCObjectWriter(OS, IsLittleEndian), TargetObjectWriter(MOTW) {}
+
+ void reset() override {
+ Renames.clear();
+ Relocations.clear();
+ StrTabBuilder.clear();
+ SectionTable.clear();
+ MCObjectWriter::reset();
+ }
- ~ELFObjectWriter() override;
+ ~ELFObjectWriter() override;
- void WriteWord(uint64_t W) {
- if (is64Bit())
- write64(W);
- else
- write32(W);
- }
+ void WriteWord(uint64_t W) {
+ if (is64Bit())
+ write64(W);
+ else
+ write32(W);
+ }
- template <typename T> void write(T Val) {
- if (IsLittleEndian)
- support::endian::Writer<support::little>(getStream()).write(Val);
- else
- support::endian::Writer<support::big>(getStream()).write(Val);
- }
+ template <typename T> void write(T Val) {
+ if (IsLittleEndian)
+ support::endian::Writer<support::little>(getStream()).write(Val);
+ else
+ support::endian::Writer<support::big>(getStream()).write(Val);
+ }
- void writeHeader(const MCAssembler &Asm);
+ void writeHeader(const MCAssembler &Asm);
- void writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
- ELFSymbolData &MSD, const MCAsmLayout &Layout);
+ void writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex,
+ ELFSymbolData &MSD, const MCAsmLayout &Layout);
- // Start and end offset of each section
- typedef std::map<const MCSectionELF *, std::pair<uint64_t, uint64_t>>
- SectionOffsetsTy;
+ // Start and end offset of each section
+ typedef std::map<const MCSectionELF *, std::pair<uint64_t, uint64_t>>
+ SectionOffsetsTy;
- bool shouldRelocateWithSymbol(const MCAssembler &Asm,
- const MCSymbolRefExpr *RefA,
- const MCSymbol *Sym, uint64_t C,
- unsigned Type) const;
+ bool shouldRelocateWithSymbol(const MCAssembler &Asm,
+ const MCSymbolRefExpr *RefA,
+ const MCSymbol *Sym, uint64_t C,
+ unsigned Type) const;
- void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, bool &IsPCRel,
- uint64_t &FixedValue) override;
+ void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, bool &IsPCRel,
+ uint64_t &FixedValue) override;
- // Map from a signature symbol to the group section index
- typedef DenseMap<const MCSymbol *, unsigned> RevGroupMapTy;
+ // Map from a signature symbol to the group section index
+ typedef DenseMap<const MCSymbol *, unsigned> RevGroupMapTy;
- /// Compute the symbol table data
- ///
- /// \param Asm - The assembler.
- /// \param SectionIndexMap - Maps a section to its index.
- /// \param RevGroupMap - Maps a signature symbol to the group section.
- void computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
- const SectionIndexMapTy &SectionIndexMap,
- const RevGroupMapTy &RevGroupMap,
- SectionOffsetsTy &SectionOffsets);
+ /// Compute the symbol table data
+ ///
+ /// \param Asm - The assembler.
+ /// \param SectionIndexMap - Maps a section to its index.
+ /// \param RevGroupMap - Maps a signature symbol to the group section.
+ void computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const SectionIndexMapTy &SectionIndexMap,
+ const RevGroupMapTy &RevGroupMap,
+ SectionOffsetsTy &SectionOffsets);
- MCSectionELF *createRelocationSection(MCContext &Ctx,
- const MCSectionELF &Sec);
+ MCSectionELF *createRelocationSection(MCContext &Ctx,
+ const MCSectionELF &Sec);
- const MCSectionELF *createStringTable(MCContext &Ctx);
+ const MCSectionELF *createStringTable(MCContext &Ctx);
- void executePostLayoutBinding(MCAssembler &Asm,
- const MCAsmLayout &Layout) override;
+ void executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) override;
- void writeSectionHeader(const MCAsmLayout &Layout,
- const SectionIndexMapTy &SectionIndexMap,
- const SectionOffsetsTy &SectionOffsets);
+ void writeSectionHeader(const MCAsmLayout &Layout,
+ const SectionIndexMapTy &SectionIndexMap,
+ const SectionOffsetsTy &SectionOffsets);
- void writeSectionData(const MCAssembler &Asm, MCSection &Sec,
- const MCAsmLayout &Layout);
+ void writeSectionData(const MCAssembler &Asm, MCSection &Sec,
+ const MCAsmLayout &Layout);
- void WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
- uint64_t Address, uint64_t Offset, uint64_t Size,
- uint32_t Link, uint32_t Info, uint64_t Alignment,
- uint64_t EntrySize);
+ void WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
+ uint64_t Address, uint64_t Offset, uint64_t Size,
+ uint32_t Link, uint32_t Info, uint64_t Alignment,
+ uint64_t EntrySize);
- void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
+ void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
- bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
- const MCSymbol &SymA,
- const MCFragment &FB,
- bool InSet,
- bool IsPCRel) const override;
+ bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+ const MCSymbol &SymA,
+ const MCFragment &FB, bool InSet,
+ bool IsPCRel) const override;
- bool isWeak(const MCSymbol &Sym) const override;
+ bool isWeak(const MCSymbol &Sym) const override;
- void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
- void writeSection(const SectionIndexMapTy &SectionIndexMap,
- uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
- const MCSectionELF &Section);
- };
-}
+ void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+ void writeSection(const SectionIndexMapTy &SectionIndexMap,
+ uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
+ const MCSectionELF &Section);
+};
+} // end anonymous namespace
void ELFObjectWriter::align(unsigned Alignment) {
uint64_t Padding = OffsetToAlignment(getStream().tell(), Alignment);
@@ -295,13 +297,6 @@ void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
++NumWritten;
}
-bool ELFObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) {
- const MCFixupKindInfo &FKI =
- Asm.getBackend().getFixupKindInfo((MCFixupKind) Kind);
-
- return FKI.Flags & MCFixupKindInfo::FKF_IsPCRel;
-}
-
ELFObjectWriter::~ELFObjectWriter()
{}
@@ -375,9 +370,24 @@ uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
+ // Section symbols are used as definitions for undefined symbols with matching
+ // names. If there are multiple sections with the same name, the first one is
+ // used.
+ for (const MCSection &Sec : Asm) {
+ const MCSymbol *Begin = Sec.getBeginSymbol();
+ if (!Begin)
+ continue;
+
+ const MCSymbol *Alias = Asm.getContext().lookupSymbol(Begin->getName());
+ if (!Alias || !Alias->isUndefined())
+ continue;
+
+ Renames.insert(
+ std::make_pair(cast<MCSymbolELF>(Alias), cast<MCSymbolELF>(Begin)));
+ }
+
// The presence of symbol versions causes undefined symbols and
// versions declared with @@@ to be renamed.
-
for (const MCSymbol &A : Asm.symbols()) {
const auto &Alias = cast<MCSymbolELF>(A);
// Not an alias.
@@ -522,7 +532,6 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
case MCSymbolRefExpr::VK_GOT:
case MCSymbolRefExpr::VK_PLT:
case MCSymbolRefExpr::VK_GOTPCREL:
- case MCSymbolRefExpr::VK_Mips_GOT:
case MCSymbolRefExpr::VK_PPC_GOT_LO:
case MCSymbolRefExpr::VK_PPC_GOT_HI:
case MCSymbolRefExpr::VK_PPC_GOT_HA:
@@ -618,6 +627,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
const MCSectionELF &FixupSection = cast<MCSectionELF>(*Fragment->getParent());
uint64_t C = Target.getConstant();
uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ MCContext &Ctx = Asm.getContext();
if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
assert(RefB->getKind() == MCSymbolRefExpr::VK_None &&
@@ -631,7 +641,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
// or (A + C - R). If B = R + K and the relocation is not pcrel, we can
// replace B to implement it: (A - R - K + C)
if (IsPCRel) {
- Asm.getContext().reportError(
+ Ctx.reportError(
Fixup.getLoc(),
"No relocation available to represent this relative expression");
return;
@@ -640,24 +650,17 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
const auto &SymB = cast<MCSymbolELF>(RefB->getSymbol());
if (SymB.isUndefined()) {
- Asm.getContext().reportError(
- Fixup.getLoc(),
- Twine("symbol '") + SymB.getName() +
- "' can not be undefined in a subtraction expression");
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("symbol '") + SymB.getName() +
+ "' can not be undefined in a subtraction expression");
return;
}
assert(!SymB.isAbsolute() && "Should have been folded");
const MCSection &SecB = SymB.getSection();
if (&SecB != &FixupSection) {
- Asm.getContext().reportError(
- Fixup.getLoc(), "Cannot represent a difference across sections");
- return;
- }
-
- if (::isWeak(SymB)) {
- Asm.getContext().reportError(
- Fixup.getLoc(), "Cannot represent a subtraction with a weak symbol");
+ Ctx.reportError(Fixup.getLoc(),
+ "Cannot represent a difference across sections");
return;
}
@@ -682,7 +685,8 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
}
}
- unsigned Type = GetRelocType(Target, Fixup, IsPCRel);
+ unsigned Type = getRelocType(Ctx, Target, Fixup, IsPCRel);
+ uint64_t OriginalC = C;
bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymA, C, Type);
if (!RelocateWithSymbol && SymA && !SymA->isUndefined())
C += Layout.getSymbolOffset(*SymA);
@@ -703,23 +707,25 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
ELFSec ? cast<MCSymbolELF>(ELFSec->getBeginSymbol()) : nullptr;
if (SectionSymbol)
SectionSymbol->setUsedInReloc();
- ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend);
+ ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA,
+ OriginalC);
Relocations[&FixupSection].push_back(Rec);
return;
}
+ const auto *RenamedSymA = SymA;
if (SymA) {
if (const MCSymbolELF *R = Renames.lookup(SymA))
- SymA = R;
+ RenamedSymA = R;
if (ViaWeakRef)
- SymA->setIsWeakrefUsedInReloc();
+ RenamedSymA->setIsWeakrefUsedInReloc();
else
- SymA->setUsedInReloc();
+ RenamedSymA->setUsedInReloc();
}
- ELFRelocationEntry Rec(FixupOffset, SymA, Type, Addend);
+ ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA,
+ OriginalC);
Relocations[&FixupSection].push_back(Rec);
- return;
}
bool ELFObjectWriter::isInSymtab(const MCAsmLayout &Layout,
@@ -969,23 +975,38 @@ ELFObjectWriter::createRelocationSection(MCContext &Ctx,
return RelaSection;
}
-// Include the debug info compression header:
-// "ZLIB" followed by 8 bytes representing the uncompressed size of the section,
-// useful for consumers to preallocate a buffer to decompress into.
-static bool
-prependCompressionHeader(uint64_t Size,
- SmallVectorImpl<char> &CompressedContents) {
+// Include the debug info compression header.
+bool ELFObjectWriter::maybeWriteCompression(
+ uint64_t Size, SmallVectorImpl<char> &CompressedContents, bool ZLibStyle,
+ unsigned Alignment) {
+ if (ZLibStyle) {
+ uint64_t HdrSize =
+ is64Bit() ? sizeof(ELF::Elf64_Chdr) : sizeof(ELF::Elf32_Chdr);
+ if (Size <= HdrSize + CompressedContents.size())
+ return false;
+ // Platform specific header is followed by compressed data.
+ if (is64Bit()) {
+ // Write Elf64_Chdr header.
+ write(static_cast<ELF::Elf64_Word>(ELF::ELFCOMPRESS_ZLIB));
+ write(static_cast<ELF::Elf64_Word>(0)); // ch_reserved field.
+ write(static_cast<ELF::Elf64_Xword>(Size));
+ write(static_cast<ELF::Elf64_Xword>(Alignment));
+ } else {
+ // Write Elf32_Chdr header otherwise.
+ write(static_cast<ELF::Elf32_Word>(ELF::ELFCOMPRESS_ZLIB));
+ write(static_cast<ELF::Elf32_Word>(Size));
+ write(static_cast<ELF::Elf32_Word>(Alignment));
+ }
+ return true;
+ }
+
+ // "ZLIB" followed by 8 bytes representing the uncompressed size of the section,
+ // useful for consumers to preallocate a buffer to decompress into.
const StringRef Magic = "ZLIB";
if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size())
return false;
- if (sys::IsLittleEndianHost)
- sys::swapByteOrder(Size);
- CompressedContents.insert(CompressedContents.begin(),
- Magic.size() + sizeof(Size), 0);
- std::copy(Magic.begin(), Magic.end(), CompressedContents.begin());
- std::copy(reinterpret_cast<char *>(&Size),
- reinterpret_cast<char *>(&Size + 1),
- CompressedContents.begin() + Magic.size());
+ write(ArrayRef<char>(Magic.begin(), Magic.size()));
+ writeBE64(Size);
return true;
}
@@ -997,8 +1018,11 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
// Compressing debug_frame requires handling alignment fragments which is
// more work (possibly generalizing MCAssembler.cpp:writeFragment to allow
// for writing to arbitrary buffers) for little benefit.
- if (!Asm.getContext().getAsmInfo()->compressDebugSections() ||
- !SectionName.startswith(".debug_") || SectionName == ".debug_frame") {
+ bool CompressionEnabled =
+ Asm.getContext().getAsmInfo()->compressDebugSections() !=
+ DebugCompressionType::DCT_None;
+ if (!CompressionEnabled || !SectionName.startswith(".debug_") ||
+ SectionName == ".debug_frame") {
Asm.writeSectionData(&Section, Layout);
return;
}
@@ -1019,12 +1043,21 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
return;
}
- if (!prependCompressionHeader(UncompressedData.size(), CompressedContents)) {
+ bool ZlibStyle = Asm.getContext().getAsmInfo()->compressDebugSections() ==
+ DebugCompressionType::DCT_Zlib;
+ if (!maybeWriteCompression(UncompressedData.size(), CompressedContents,
+ ZlibStyle, Sec.getAlignment())) {
getStream() << UncompressedData;
return;
}
- Asm.getContext().renameELFSection(&Section,
- (".z" + SectionName.drop_front(1)).str());
+
+ if (ZlibStyle)
+ // Set the compressed flag. That is zlib style.
+ Section.setFlags(Section.getFlags() | ELF::SHF_COMPRESSED);
+ else
+ // Add "z" prefix to section name. This is zlib-gnu style.
+ Asm.getContext().renameELFSection(&Section,
+ (".z" + SectionName.drop_front(1)).str());
getStream() << CompressedContents;
}
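Two compressed-debug-section formats are handled above: the zlib style keeps the section name, sets SHF_COMPRESSED, and starts the section with an Elf32_Chdr/Elf64_Chdr, while the zlib-gnu style renames the section to ".z..." and prefixes the deflated bytes with a "ZLIB" magic plus the uncompressed size as a big-endian 64-bit integer. A standalone sketch of the zlib-gnu prefix layout (buildZlibGnuHeader is an illustrative helper, not code from the patch):

    #include <cstdint>
    #include <vector>

    // Builds the 12-byte zlib-gnu prefix: 4-byte "ZLIB" magic followed by the
    // uncompressed section size encoded big-endian in 8 bytes.
    static std::vector<char> buildZlibGnuHeader(uint64_t UncompressedSize) {
      std::vector<char> Hdr = {'Z', 'L', 'I', 'B'};
      for (int Shift = 56; Shift >= 0; Shift -= 8)
        Hdr.push_back(static_cast<char>((UncompressedSize >> Shift) & 0xff));
      return Hdr;
    }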
@@ -1279,7 +1312,7 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
uint64_t NaturalAlignment = is64Bit() ? 8 : 4;
align(NaturalAlignment);
- const unsigned SectionHeaderOffset = getStream().tell();
+ const uint64_t SectionHeaderOffset = getStream().tell();
// ... then the section header table ...
writeSectionHeader(Layout, SectionIndexMap, SectionOffsets);
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index fcf139b72537..b868b9d48896 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -12,12 +12,12 @@
#include "llvm/MC/MCFixupKindInfo.h"
using namespace llvm;
-MCAsmBackend::MCAsmBackend() : HasDataInCodeSupport(false) {}
+MCAsmBackend::MCAsmBackend() {}
MCAsmBackend::~MCAsmBackend() {}
-bool MCAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const {
- return false;
+Optional<MCFixupKind> MCAsmBackend::getFixupKind(StringRef Name) const {
+ return None;
}
const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
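With getFixupKind now returning Optional<MCFixupKind> instead of a bool plus an out-parameter, a target override might look like the sketch below (MyAsmBackend and the fixup name are hypothetical; only the signature comes from the patch):

    Optional<MCFixupKind> MyAsmBackend::getFixupKind(StringRef Name) const {
      // MyAsmBackend is a hypothetical target backend; map a directive-level
      // fixup name to a target fixup kind, or return None for unknown names.
      if (Name == "fixup_mytarget_32")
        return MCFixupKind(FirstTargetFixupKind);
      return None;
    }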
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 36e10b3c6a07..4a05175fdec3 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -75,6 +75,7 @@ MCAsmInfo::MCAsmInfo() {
HasSingleParameterDotFile = true;
HasIdentDirective = false;
HasNoDeadStrip = false;
+ HasAltEntry = false;
WeakDirective = "\t.weak\t";
WeakRefDirective = nullptr;
HasWeakDefDirective = false;
@@ -106,8 +107,9 @@ MCAsmInfo::MCAsmInfo() {
// architecture basis.
// - The target subclasses for AArch64, ARM, and X86 handle these cases
UseIntegratedAssembler = false;
+ PreserveAsmComments = true;
- CompressDebugSections = false;
+ CompressDebugSections = DebugCompressionType::DCT_None;
}
MCAsmInfo::~MCAsmInfo() {
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index ae9486d3db4d..fc60313dd6b2 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -48,6 +48,7 @@ bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
case MachO::S_LITERAL_POINTERS:
case MachO::S_NON_LAZY_SYMBOL_POINTERS:
case MachO::S_LAZY_SYMBOL_POINTERS:
+ case MachO::S_THREAD_LOCAL_VARIABLE_POINTERS:
case MachO::S_MOD_INIT_FUNC_POINTERS:
case MachO::S_MOD_TERM_FUNC_POINTERS:
case MachO::S_INTERPOSING:
@@ -88,6 +89,7 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
+ HasAltEntry = true;
DwarfUsesRelocationsAcrossSections = false;
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
index 2bff6e059663..26e5608d8733 100644
--- a/lib/MC/MCAsmInfoELF.cpp
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -21,6 +21,8 @@ using namespace llvm;
void MCAsmInfoELF::anchor() { }
MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
+ if (!UsesNonexecutableStackSection)
+ return nullptr;
return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0);
}
@@ -29,4 +31,5 @@ MCAsmInfoELF::MCAsmInfoELF() {
WeakRefDirective = "\t.weak\t";
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
+ UsesNonexecutableStackSection = true;
}
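The new UsesNonexecutableStackSection flag lets a target opt out of emitting .note.GNU-stack; a hedged sketch of a target AsmInfo doing so (the class name is made up, and the flag is assumed to be visible to subclasses):

    class MyTargetELFMCAsmInfo : public MCAsmInfoELF {
    public:
      MyTargetELFMCAsmInfo() {
        // With this cleared, getNonexecutableStackSection() returns nullptr
        // and no .note.GNU-stack section is produced.
        UsesNonexecutableStackSection = false;
      }
    };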
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index c99ce7752b30..ef2f7810deaa 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -24,14 +23,15 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
#include <cctype>
using namespace llvm;
@@ -46,6 +46,7 @@ class MCAsmStreamer final : public MCStreamer {
std::unique_ptr<MCCodeEmitter> Emitter;
std::unique_ptr<MCAsmBackend> AsmBackend;
+ SmallString<128> ExplicitCommentToEmit;
SmallString<128> CommentToEmit;
raw_svector_ostream CommentStream;
@@ -73,6 +74,8 @@ public:
}
inline void EmitEOL() {
+ // Dump Explicit Comments here.
+ emitExplicitComments();
// If we don't have any comments, just emit a \n.
if (!IsVerboseAsm) {
OS << '\n';
@@ -112,6 +115,9 @@ public:
void emitRawComment(const Twine &T, bool TabPrefix = true) override;
+ void addExplicitComment(const Twine &T) override;
+ void emitExplicitComments() override;
+
/// AddBlankLine - Emit a blank line to a .s file to pretty it up.
void AddBlankLine() override {
EmitEOL();
@@ -162,6 +168,8 @@ public:
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
+ void EmitBinaryData(StringRef Data) override;
+
void EmitBytes(StringRef Data) override;
void EmitValueImpl(const MCExpr *Value, unsigned Size,
@@ -177,7 +185,15 @@ public:
void EmitGPRel32Value(const MCExpr *Value) override;
- void EmitFill(uint64_t NumBytes, uint8_t FillValue) override;
+ void emitFill(uint64_t NumBytes, uint8_t FillValue) override;
+
+ void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+ SMLoc Loc = SMLoc()) override;
+
+ void emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) override;
+
+ void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
+ SMLoc Loc = SMLoc()) override;
void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
unsigned ValueSize = 1,
@@ -199,6 +215,22 @@ public:
StringRef FileName) override;
MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
+ unsigned EmitCVFileDirective(unsigned FileNo, StringRef Filename) override;
+ void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line,
+ unsigned Column, bool PrologueEnd, bool IsStmt,
+ StringRef FileName) override;
+ void EmitCVLinetableDirective(unsigned FunctionId, const MCSymbol *FnStart,
+ const MCSymbol *FnEnd) override;
+ void EmitCVInlineLinetableDirective(
+ unsigned PrimaryFunctionId, unsigned SourceFileId, unsigned SourceLineNum,
+ const MCSymbol *FnStartSym, const MCSymbol *FnEndSym,
+ ArrayRef<unsigned> SecondaryFunctionIds) override;
+ void EmitCVDefRangeDirective(
+ ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
+ StringRef FixedSizePortion) override;
+ void EmitCVStringTableDirective() override;
+ void EmitCVFileChecksumsDirective() override;
+
void EmitIdent(StringRef IdentString) override;
void EmitCFISections(bool EH, bool Debug) override;
void EmitCFIDefCfa(int64_t Register, int64_t Offset) override;
@@ -288,7 +320,7 @@ void MCAsmStreamer::EmitCommentsAndEOL() {
}
static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
- assert(Bytes && "Invalid size!");
+ assert(Bytes > 0 && Bytes <= 8 && "Invalid size!");
return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
}
@@ -299,6 +331,49 @@ void MCAsmStreamer::emitRawComment(const Twine &T, bool TabPrefix) {
EmitEOL();
}
+void MCAsmStreamer::addExplicitComment(const Twine &T) {
+ StringRef c = T.getSingleStringRef();
+ if (c.equals(StringRef(MAI->getSeparatorString())))
+ return;
+ if (c.startswith(StringRef("//"))) {
+ ExplicitCommentToEmit.append("\t");
+ ExplicitCommentToEmit.append(MAI->getCommentString());
+ // drop //
+ ExplicitCommentToEmit.append(c.slice(2, c.size()).str());
+ } else if (c.startswith(StringRef("/*"))) {
+ size_t p = 2, len = c.size() - 2;
+ // Emit each line of the comment on its own line.
+ do {
+ size_t newp = std::min(len, c.find_first_of("\r\n", p));
+ ExplicitCommentToEmit.append("\t");
+ ExplicitCommentToEmit.append(MAI->getCommentString());
+ ExplicitCommentToEmit.append(c.slice(p, newp).str());
+ // If the comment has another line, emit a newline before it.
+ if (newp < len)
+ ExplicitCommentToEmit.append("\n");
+ p = newp + 1;
+ } while (p < len);
+ } else if (c.startswith(StringRef(MAI->getCommentString()))) {
+ ExplicitCommentToEmit.append("\t");
+ ExplicitCommentToEmit.append(c.str());
+ } else if (c.front() == '#') {
+ // A lone '#' is a comment when the comment string is "##"; output an extra '#'.
+ ExplicitCommentToEmit.append("\t#");
+ ExplicitCommentToEmit.append(c.str());
+ } else
+ assert(false && "Unexpected Assembly Comment");
+ // full line comments immediately output
+ if (c.back() == '\n')
+ emitExplicitComments();
+}
+
+void MCAsmStreamer::emitExplicitComments() {
+ StringRef Comments = ExplicitCommentToEmit;
+ if (!Comments.empty())
+ OS << Comments;
+ ExplicitCommentToEmit.clear();
+}
+
void MCAsmStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
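As a rough illustration of the rewriting above, on a target whose comment string is "//": an explicit comment "// save lr" is queued as "\t// save lr", a "/* ... */" block is split into one "//"-prefixed line per source line (with the closing "*/" dropped), and on a target whose comment string is "##" a lone "#" gets an extra "#" prepended. Comments ending in '\n' are flushed immediately; everything else is flushed by the next EmitEOL().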
@@ -326,12 +401,11 @@ void MCAsmStreamer::EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {
OS << "\t" << MCLOHDirectiveName() << " " << str << "\t";
bool IsFirst = true;
- for (MCLOHArgs::const_iterator It = Args.begin(), EndIt = Args.end();
- It != EndIt; ++It) {
+ for (const MCSymbol *Arg : Args) {
if (!IsFirst)
OS << ", ";
IsFirst = false;
- (*It)->print(OS, MAI);
+ Arg->print(OS, MAI);
}
EmitEOL();
}
@@ -354,7 +428,7 @@ void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
ie = Options.end(); it != ie; ++it) {
OS << ", " << '"' << *it << '"';
}
- OS << "\n";
+ EmitEOL();
}
void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) {
@@ -456,6 +530,7 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
OS << "\t.no_dead_strip\t";
break;
case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break;
+ case MCSA_AltEntry: OS << "\t.alt_entry\t"; break;
case MCSA_PrivateExtern:
OS << "\t.private_extern\t";
break;
@@ -484,8 +559,10 @@ void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
}
void MCAsmStreamer::EmitSyntaxDirective() {
- if (MAI->getAssemblerDialect() == 1)
- OS << "\t.intel_syntax noprefix\n";
+ if (MAI->getAssemblerDialect() == 1) {
+ OS << "\t.intel_syntax noprefix";
+ EmitEOL();
+ }
// FIXME: Currently emit unprefix'ed registers.
// The intel_syntax directive has one optional argument
// with may have a value of prefix or noprefix.
@@ -537,7 +614,7 @@ void MCAsmStreamer::emitELFSize(MCSymbolELF *Symbol, const MCExpr *Value) {
Symbol->print(OS, MAI);
OS << ", ";
Value->print(OS, MAI);
- OS << '\n';
+ EmitEOL();
}
void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -684,6 +761,20 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
EmitEOL();
}
+void MCAsmStreamer::EmitBinaryData(StringRef Data) {
+ // This is binary data. Print it in a grid of hex bytes for readability.
+ const size_t Cols = 4;
+ for (size_t I = 0, EI = alignTo(Data.size(), Cols); I < EI; I += Cols) {
+ size_t J = I, EJ = std::min(I + Cols, Data.size());
+ assert(EJ > 0);
+ OS << MAI->getData8bitsDirective();
+ for (; J < EJ - 1; ++J)
+ OS << format("0x%02x", uint8_t(Data[J])) << ", ";
+ OS << format("0x%02x", uint8_t(Data[J]));
+ EmitEOL();
+ }
+}
+
void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
EmitValue(MCConstantExpr::create(Value, getContext()), Size);
}
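For illustration, EmitBinaryData above prints four bytes per line using the target's 8-bit data directive (shown here as ".byte", a common MCAsmInfo default, so an assumption rather than a guarantee); the six bytes 01 02 03 04 05 06 would be emitted as:

    .byte 0x01, 0x02, 0x03, 0x04
    .byte 0x05, 0x06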
@@ -708,17 +799,15 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
report_fatal_error("Don't know how to emit this value.");
// We couldn't handle the requested integer size so we fallback by breaking
- // the request down into several, smaller, integers. Since sizes greater
- // than eight are invalid and size equivalent to eight should have been
- // handled earlier, we use four bytes as our largest piece of granularity.
+ // the request down into several, smaller, integers.
+ // Since sizes greater than or equal to "Size" are invalid, we use the greatest
+ // power of 2 that is less than "Size" as our largest piece of granularity.
bool IsLittleEndian = MAI->isLittleEndian();
for (unsigned Emitted = 0; Emitted != Size;) {
unsigned Remaining = Size - Emitted;
// The size of our partial emission must be a power of two less than
- // eight.
- unsigned EmissionSize = PowerOf2Floor(Remaining);
- if (EmissionSize > 4)
- EmissionSize = 4;
+ // Size.
+ unsigned EmissionSize = PowerOf2Floor(std::min(Remaining, Size - 1));
// Calculate the byte offset of our partial emission taking into account
// the endianness of the target.
unsigned ByteOffset =
@@ -780,21 +869,46 @@ void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
EmitEOL();
}
-/// EmitFill - Emit NumBytes bytes worth of the value specified by
+/// emitFill - Emit NumBytes bytes worth of the value specified by
/// FillValue. This implements directives such as '.space'.
-void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
+void MCAsmStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
if (NumBytes == 0) return;
+ const MCExpr *E = MCConstantExpr::create(NumBytes, getContext());
+ emitFill(*E, FillValue);
+}
+
+void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+ SMLoc Loc) {
if (const char *ZeroDirective = MAI->getZeroDirective()) {
- OS << ZeroDirective << NumBytes;
+ // FIXME: Emit location directives
+ OS << ZeroDirective;
+ NumBytes.print(OS, MAI);
if (FillValue != 0)
OS << ',' << (int)FillValue;
EmitEOL();
return;
}
- // Emit a byte at a time.
- MCStreamer::EmitFill(NumBytes, FillValue);
+ MCStreamer::emitFill(NumBytes, FillValue);
+}
+
+void MCAsmStreamer::emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) {
+ if (NumValues == 0)
+ return;
+
+ const MCExpr *E = MCConstantExpr::create(NumValues, getContext());
+ emitFill(*E, Size, Expr);
+}
+
+void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
+ int64_t Expr, SMLoc Loc) {
+ // FIXME: Emit location directives
+ OS << "\t.fill\t";
+ NumValues.print(OS, MAI);
+ OS << ", " << Size << ", 0x";
+ OS.write_hex(truncateToSize(Expr, 4));
+ EmitEOL();
}
void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
@@ -807,7 +921,7 @@ void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
default:
llvm_unreachable("Invalid size for machine code value!");
case 1:
- OS << "\t.align\t";
+ OS << "\t.p2align\t";
break;
case 2:
OS << ".p2alignw ";
@@ -819,10 +933,7 @@ void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
llvm_unreachable("Unsupported alignment size!");
}
- if (MAI->getAlignmentIsInBytes())
- OS << ByteAlignment;
- else
- OS << Log2_32(ByteAlignment);
+ OS << Log2_32(ByteAlignment);
if (Value || MaxBytesToEmit) {
OS << ", 0x";
@@ -957,6 +1068,105 @@ MCSymbol *MCAsmStreamer::getDwarfLineTableSymbol(unsigned CUID) {
return MCStreamer::getDwarfLineTableSymbol(0);
}
+unsigned MCAsmStreamer::EmitCVFileDirective(unsigned FileNo,
+ StringRef Filename) {
+ if (!getContext().getCVFile(Filename, FileNo))
+ return 0;
+
+ OS << "\t.cv_file\t" << FileNo << ' ';
+
+ PrintQuotedString(Filename, OS);
+ EmitEOL();
+
+ return FileNo;
+}
+
+void MCAsmStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+ unsigned Line, unsigned Column,
+ bool PrologueEnd, bool IsStmt,
+ StringRef FileName) {
+ OS << "\t.cv_loc\t" << FunctionId << " " << FileNo << " " << Line << " "
+ << Column;
+ if (PrologueEnd)
+ OS << " prologue_end";
+
+ unsigned OldIsStmt = getContext().getCurrentCVLoc().isStmt();
+ if (IsStmt != OldIsStmt) {
+ OS << " is_stmt ";
+
+ if (IsStmt)
+ OS << "1";
+ else
+ OS << "0";
+ }
+
+ if (IsVerboseAsm) {
+ OS.PadToColumn(MAI->getCommentColumn());
+ OS << MAI->getCommentString() << ' ' << FileName << ':'
+ << Line << ':' << Column;
+ }
+ EmitEOL();
+ this->MCStreamer::EmitCVLocDirective(FunctionId, FileNo, Line, Column,
+ PrologueEnd, IsStmt, FileName);
+}
+
+void MCAsmStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+ const MCSymbol *FnStart,
+ const MCSymbol *FnEnd) {
+ OS << "\t.cv_linetable\t" << FunctionId << ", ";
+ FnStart->print(OS, MAI);
+ OS << ", ";
+ FnEnd->print(OS, MAI);
+ EmitEOL();
+ this->MCStreamer::EmitCVLinetableDirective(FunctionId, FnStart, FnEnd);
+}
+
+void MCAsmStreamer::EmitCVInlineLinetableDirective(
+ unsigned PrimaryFunctionId, unsigned SourceFileId, unsigned SourceLineNum,
+ const MCSymbol *FnStartSym, const MCSymbol *FnEndSym,
+ ArrayRef<unsigned> SecondaryFunctionIds) {
+ OS << "\t.cv_inline_linetable\t" << PrimaryFunctionId << ' ' << SourceFileId
+ << ' ' << SourceLineNum << ' ';
+ FnStartSym->print(OS, MAI);
+ OS << ' ';
+ FnEndSym->print(OS, MAI);
+ if (!SecondaryFunctionIds.empty()) {
+ OS << " contains";
+ for (unsigned SecondaryFunctionId : SecondaryFunctionIds)
+ OS << ' ' << SecondaryFunctionId;
+ }
+ EmitEOL();
+ this->MCStreamer::EmitCVInlineLinetableDirective(
+ PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym,
+ SecondaryFunctionIds);
+}
+
+void MCAsmStreamer::EmitCVDefRangeDirective(
+ ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
+ StringRef FixedSizePortion) {
+ OS << "\t.cv_def_range\t";
+ for (std::pair<const MCSymbol *, const MCSymbol *> Range : Ranges) {
+ OS << ' ';
+ Range.first->print(OS, MAI);
+ OS << ' ';
+ Range.second->print(OS, MAI);
+ }
+ OS << ", ";
+ PrintQuotedString(FixedSizePortion, OS);
+ EmitEOL();
+ this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion);
+}
+
+void MCAsmStreamer::EmitCVStringTableDirective() {
+ OS << "\t.cv_stringtable";
+ EmitEOL();
+}
+
+void MCAsmStreamer::EmitCVFileChecksumsDirective() {
+ OS << "\t.cv_filechecksums";
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitIdent(StringRef IdentString) {
assert(MAI->hasIdentDirective() && ".ident directive not supported");
OS << "\t.ident\t";
@@ -1033,10 +1243,10 @@ void MCAsmStreamer::EmitCFIEscape(StringRef Values) {
void MCAsmStreamer::EmitCFIGnuArgsSize(int64_t Size) {
MCStreamer::EmitCFIGnuArgsSize(Size);
-
+
uint8_t Buffer[16] = { dwarf::DW_CFA_GNU_args_size };
unsigned Len = encodeULEB128(Size, Buffer + 1) + 1;
-
+
PrintCFIEscape(OS, StringRef((const char *)&Buffer[0], Len));
EmitEOL();
}
@@ -1178,8 +1388,8 @@ void MCAsmStreamer::EmitWinEHHandlerData() {
// We only do this so the section switch that terminates the handler
// data block is visible.
WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo();
- MCSection *XData =
- WinEH::UnwindEmitter::getXDataSection(CurFrame->Function, getContext());
+ MCSection *TextSec = &CurFrame->Function->getSection();
+ MCSection *XData = getAssociatedXDataSection(TextSec);
SwitchSectionNoChange(XData);
OS << "\t.seh_handlerdata";
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 15e82fa49388..7a42108ceaf3 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -15,6 +15,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
@@ -64,9 +65,9 @@ STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
/* *** */
-MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
- MCCodeEmitter &Emitter_, MCObjectWriter &Writer_)
- : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
+MCAssembler::MCAssembler(MCContext &Context, MCAsmBackend &Backend,
+ MCCodeEmitter &Emitter, MCObjectWriter &Writer)
+ : Context(Context), Backend(Backend), Emitter(Emitter), Writer(Writer),
BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false),
IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) {
VersionMinInfo.Major = 0; // Major version == 0 for "none specified"
@@ -300,6 +301,10 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
return cast<MCDwarfLineAddrFragment>(F).getContents().size();
case MCFragment::FT_DwarfFrame:
return cast<MCDwarfCallFrameFragment>(F).getContents().size();
+ case MCFragment::FT_CVInlineLines:
+ return cast<MCCVInlineLineTableFragment>(F).getContents().size();
+ case MCFragment::FT_CVDefRange:
+ return cast<MCCVDefRangeFragment>(F).getContents().size();
case MCFragment::FT_Dummy:
llvm_unreachable("Should not have been added");
}
@@ -488,17 +493,19 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
case MCFragment::FT_Fill: {
++stats::EmittedFillFragments;
const MCFillFragment &FF = cast<MCFillFragment>(F);
-
- assert(FF.getValueSize() && "Invalid virtual align in concrete fragment!");
-
- for (uint64_t i = 0, e = FF.getSize() / FF.getValueSize(); i != e; ++i) {
- switch (FF.getValueSize()) {
- default: llvm_unreachable("Invalid size!");
- case 1: OW->write8 (uint8_t (FF.getValue())); break;
- case 2: OW->write16(uint16_t(FF.getValue())); break;
- case 4: OW->write32(uint32_t(FF.getValue())); break;
- case 8: OW->write64(uint64_t(FF.getValue())); break;
- }
+ uint8_t V = FF.getValue();
+ const unsigned MaxChunkSize = 16;
+ char Data[MaxChunkSize];
+ memcpy(Data, &V, 1);
+ for (unsigned I = 1; I < MaxChunkSize; ++I)
+ Data[I] = Data[0];
+
+ uint64_t Size = FF.getSize();
+ for (unsigned ChunkSize = MaxChunkSize; ChunkSize; ChunkSize /= 2) {
+ StringRef Ref(Data, ChunkSize);
+ for (uint64_t I = 0, E = Size / ChunkSize; I != E; ++I)
+ OW->writeBytes(Ref);
+ Size = Size % ChunkSize;
}
break;
}
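The rewritten FT_Fill case above replaces the per-value switch with chunked writes of a repeated byte, halving the chunk size until the remainder is exhausted. A standalone worked example (not code from the patch): a 37-byte fill becomes two 16-byte writes, one 4-byte write, and one 1-byte write.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Size = 37;                      // bytes requested by the fill
      for (unsigned ChunkSize = 16; ChunkSize; ChunkSize /= 2) {
        uint64_t N = Size / ChunkSize;         // whole chunks at this size
        if (N)
          std::printf("%llu write(s) of %u bytes\n",
                      static_cast<unsigned long long>(N), ChunkSize);
        Size %= ChunkSize;                     // carry the remainder down
      }
      return 0;
    }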
@@ -535,6 +542,16 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
OW->writeBytes(CF.getContents());
break;
}
+ case MCFragment::FT_CVInlineLines: {
+ const auto &OF = cast<MCCVInlineLineTableFragment>(F);
+ OW->writeBytes(OF.getContents());
+ break;
+ }
+ case MCFragment::FT_CVDefRange: {
+ const auto &DRF = cast<MCCVDefRangeFragment>(F);
+ OW->writeBytes(DRF.getContents());
+ break;
+ }
case MCFragment::FT_Dummy:
llvm_unreachable("Should not have been added");
}
@@ -578,8 +595,7 @@ void MCAssembler::writeSectionData(const MCSection *Sec,
"Invalid align in virtual section!");
break;
case MCFragment::FT_Fill:
- assert((cast<MCFillFragment>(F).getValueSize() == 0 ||
- cast<MCFillFragment>(F).getValue() == 0) &&
+ assert((cast<MCFillFragment>(F).getValue() == 0) &&
"Invalid fill in virtual section!");
break;
}
@@ -664,19 +680,24 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
// Evaluate and apply the fixups, generating relocation entries as necessary.
for (MCSection &Sec : *this) {
for (MCFragment &Frag : Sec) {
- MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(&Frag);
// Data and relaxable fragments both have fixups. So only process
// those here.
// FIXME: Is there a better way to do this? MCEncodedFragmentWithFixups
// being templated makes this tricky.
- if (!F || isa<MCCompactEncodedInstFragment>(F))
+ if (isa<MCEncodedFragment>(&Frag) &&
+ isa<MCCompactEncodedInstFragment>(&Frag))
+ continue;
+ if (!isa<MCEncodedFragment>(&Frag) && !isa<MCCVDefRangeFragment>(&Frag))
continue;
ArrayRef<MCFixup> Fixups;
MutableArrayRef<char> Contents;
- if (auto *FragWithFixups = dyn_cast<MCDataFragment>(F)) {
+ if (auto *FragWithFixups = dyn_cast<MCDataFragment>(&Frag)) {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
- } else if (auto *FragWithFixups = dyn_cast<MCRelaxableFragment>(F)) {
+ } else if (auto *FragWithFixups = dyn_cast<MCRelaxableFragment>(&Frag)) {
+ Fixups = FragWithFixups->getFixups();
+ Contents = FragWithFixups->getContents();
+ } else if (auto *FragWithFixups = dyn_cast<MCCVDefRangeFragment>(&Frag)) {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
} else
@@ -684,7 +705,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
for (const MCFixup &Fixup : Fixups) {
uint64_t FixedValue;
bool IsPCRel;
- std::tie(FixedValue, IsPCRel) = handleFixup(Layout, *F, Fixup);
+ std::tie(FixedValue, IsPCRel) = handleFixup(Layout, Frag, Fixup);
getBackend().applyFixup(Fixup, Contents.data(),
Contents.size(), FixedValue, IsPCRel);
}
@@ -744,7 +765,7 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
// Relax the fragment.
MCInst Relaxed;
- getBackend().relaxInstruction(F.getInst(), Relaxed);
+ getBackend().relaxInstruction(F.getInst(), F.getSubtargetInfo(), Relaxed);
// Encode the new instruction.
//
@@ -812,6 +833,20 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout,
return OldSize != Data.size();
}
+bool MCAssembler::relaxCVInlineLineTable(MCAsmLayout &Layout,
+ MCCVInlineLineTableFragment &F) {
+ unsigned OldSize = F.getContents().size();
+ getContext().getCVContext().encodeInlineLineTable(Layout, F);
+ return OldSize != F.getContents().size();
+}
+
+bool MCAssembler::relaxCVDefRange(MCAsmLayout &Layout,
+ MCCVDefRangeFragment &F) {
+ unsigned OldSize = F.getContents().size();
+ getContext().getCVContext().encodeDefRange(Layout, F);
+ return OldSize != F.getContents().size();
+}
+
bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) {
// Holds the first fragment which needed relaxing during this layout. It will
// remain NULL if none were relaxed.
@@ -843,6 +878,13 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) {
case MCFragment::FT_LEB:
RelaxedFrag = relaxLEB(Layout, *cast<MCLEBFragment>(I));
break;
+ case MCFragment::FT_CVInlineLines:
+ RelaxedFrag =
+ relaxCVInlineLineTable(Layout, *cast<MCCVInlineLineTableFragment>(I));
+ break;
+ case MCFragment::FT_CVDefRange:
+ RelaxedFrag = relaxCVDefRange(Layout, *cast<MCCVDefRangeFragment>(I));
+ break;
}
if (RelaxedFrag && !FirstRelaxedFragment)
FirstRelaxedFragment = &*I;
@@ -872,4 +914,5 @@ void MCAssembler::finishLayout(MCAsmLayout &Layout) {
for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
Layout.getFragmentOffset(&*Layout.getSectionOrder()[i]->rbegin());
}
+ getBackend().finishLayout(*this, Layout);
}
diff --git a/lib/MC/MCCodeGenInfo.cpp b/lib/MC/MCCodeGenInfo.cpp
deleted file mode 100644
index 347ec2cd01e1..000000000000
--- a/lib/MC/MCCodeGenInfo.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- MCCodeGenInfo.cpp - Target CodeGen Info -----------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file tracks information about the target which can affect codegen,
-// asm parsing, and asm printing. For example, relocation model.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCCodeGenInfo.h"
-using namespace llvm;
-
-void MCCodeGenInfo::initMCCodeGenInfo(Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- RelocationModel = RM;
- CMModel = CM;
- OptLevel = OL;
-}
diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp
new file mode 100644
index 000000000000..65cff41abebe
--- /dev/null
+++ b/lib/MC/MCCodeView.cpp
@@ -0,0 +1,464 @@
+//===- MCCodeView.cpp - Machine Code CodeView support -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Holds state from .cv_file and .cv_loc directives for later emission.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/Line.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+CodeViewContext::CodeViewContext() {}
+
+CodeViewContext::~CodeViewContext() {
+ // If someone inserted strings into the string table but never actually
+ // emitted them somewhere, clean up the fragment.
+ if (!InsertedStrTabFragment)
+ delete StrTabFragment;
+}
+
+/// This is a valid number for use with .cv_loc if we've already seen a .cv_file
+/// for it.
+bool CodeViewContext::isValidFileNumber(unsigned FileNumber) const {
+ unsigned Idx = FileNumber - 1;
+ if (Idx < Filenames.size())
+ return !Filenames[Idx].empty();
+ return false;
+}
+
+bool CodeViewContext::addFile(unsigned FileNumber, StringRef Filename) {
+ assert(FileNumber > 0);
+ Filename = addToStringTable(Filename);
+ unsigned Idx = FileNumber - 1;
+ if (Idx >= Filenames.size())
+ Filenames.resize(Idx + 1);
+
+ if (Filename.empty())
+ Filename = "<stdin>";
+
+ if (!Filenames[Idx].empty())
+ return false;
+
+ // FIXME: We should store the string table offset of the filename, rather than
+ // the filename itself for efficiency.
+ Filename = addToStringTable(Filename);
+
+ Filenames[Idx] = Filename;
+ return true;
+}
+
+MCDataFragment *CodeViewContext::getStringTableFragment() {
+ if (!StrTabFragment) {
+ StrTabFragment = new MCDataFragment();
+ // Start a new string table with a null byte.
+ StrTabFragment->getContents().push_back('\0');
+ }
+ return StrTabFragment;
+}
+
+StringRef CodeViewContext::addToStringTable(StringRef S) {
+ SmallVectorImpl<char> &Contents = getStringTableFragment()->getContents();
+ auto Insertion =
+ StringTable.insert(std::make_pair(S, unsigned(Contents.size())));
+ // Return the string from the table, since it is stable.
+ S = Insertion.first->first();
+ if (Insertion.second) {
+ // The string map key is always null terminated.
+ Contents.append(S.begin(), S.end() + 1);
+ }
+ return S;
+}
+
+unsigned CodeViewContext::getStringTableOffset(StringRef S) {
+ // A string table offset of zero is always the empty string.
+ if (S.empty())
+ return 0;
+ auto I = StringTable.find(S);
+ assert(I != StringTable.end());
+ return I->second;
+}
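A minimal sketch of the offsets this scheme yields (the file names are hypothetical, not taken from the patch): the fragment starts with a single null byte, so the empty string maps to offset 0 and the first real entry lands at offset 1.

    void stringTableOffsetSketch() {
      CodeViewContext CVC;
      CVC.addFile(1, "a.c"); // table becomes "\0a.c\0", so "a.c" is at offset 1
      CVC.addFile(2, "b.h"); // "b.h" is appended at offset 5
      unsigned OffA = CVC.getStringTableOffset("a.c"); // 1
      unsigned OffE = CVC.getStringTableOffset("");    // 0, the shared leading null
    }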
+
+void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
+ MCContext &Ctx = OS.getContext();
+ MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
+ *StringEnd = Ctx.createTempSymbol("strtab_end", false);
+
+ OS.EmitIntValue(unsigned(ModuleSubstreamKind::StringTable), 4);
+ OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
+ OS.EmitLabel(StringBegin);
+
+ // Put the string table data fragment here, if we haven't already put it
+ // somewhere else. If somebody wants two string tables in their .s file, one
+ // will just be empty.
+ if (!InsertedStrTabFragment) {
+ OS.insert(getStringTableFragment());
+ InsertedStrTabFragment = true;
+ }
+
+ OS.EmitValueToAlignment(4, 0);
+
+ OS.EmitLabel(StringEnd);
+}
+
+void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
+ // Do nothing if there are no file checksums. Microsoft's linker rejects empty
+ // CodeView substreams.
+ if (Filenames.empty())
+ return;
+
+ MCContext &Ctx = OS.getContext();
+ MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
+ *FileEnd = Ctx.createTempSymbol("filechecksums_end", false);
+
+ OS.EmitIntValue(unsigned(ModuleSubstreamKind::FileChecksums), 4);
+ OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
+ OS.EmitLabel(FileBegin);
+
+ // Emit an array of FileChecksum entries. We index into this table using the
+ // user-provided file number. Each entry is currently 8 bytes, as we don't
+ // emit checksums.
+ for (StringRef Filename : Filenames) {
+ OS.EmitIntValue(getStringTableOffset(Filename), 4);
+ // Zero the next two fields and align back to 4 bytes. This indicates that
+ // no checksum is present.
+ OS.EmitIntValue(0, 4);
+ }
+
+ OS.EmitLabel(FileEnd);
+}
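The fixed 8-byte stride of this table is what the later line-table and inline-site code relies on; a small hypothetical helper restating the layout (not part of the patch):

    // Byte offset of the checksum entry for a 1-based file number: a 4-byte
    // string table offset followed by 4 zero bytes (no checksum), 8 bytes total.
    static unsigned fileChecksumEntryOffset(unsigned FileNumber) {
      return 8 * (FileNumber - 1); // e.g. file 3 starts at byte 16 of the substream
    }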
+
+void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
+ unsigned FuncId,
+ const MCSymbol *FuncBegin,
+ const MCSymbol *FuncEnd) {
+ MCContext &Ctx = OS.getContext();
+ MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
+ *LineEnd = Ctx.createTempSymbol("linetable_end", false);
+
+ OS.EmitIntValue(unsigned(ModuleSubstreamKind::Lines), 4);
+ OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
+ OS.EmitLabel(LineBegin);
+ OS.EmitCOFFSecRel32(FuncBegin);
+ OS.EmitCOFFSectionIndex(FuncBegin);
+
+ // Actual line info.
+ std::vector<MCCVLineEntry> Locs = getFunctionLineEntries(FuncId);
+ bool HaveColumns = any_of(Locs, [](const MCCVLineEntry &LineEntry) {
+ return LineEntry.getColumn() != 0;
+ });
+ OS.EmitIntValue(HaveColumns ? int(LineFlags::HaveColumns) : 0, 2);
+ OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);
+
+ for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
+ // Emit a file segment for the run of locations that share a file id.
+ unsigned CurFileNum = I->getFileNum();
+ auto FileSegEnd =
+ std::find_if(I, E, [CurFileNum](const MCCVLineEntry &Loc) {
+ return Loc.getFileNum() != CurFileNum;
+ });
+ unsigned EntryCount = FileSegEnd - I;
+ OS.AddComment("Segment for file '" + Twine(Filenames[CurFileNum - 1]) +
+ "' begins");
+ OS.EmitIntValue(8 * (CurFileNum - 1), 4);
+ OS.EmitIntValue(EntryCount, 4);
+ uint32_t SegmentSize = 12;
+ SegmentSize += 8 * EntryCount;
+ if (HaveColumns)
+ SegmentSize += 4 * EntryCount;
+ OS.EmitIntValue(SegmentSize, 4);
+
+ for (auto J = I; J != FileSegEnd; ++J) {
+ OS.emitAbsoluteSymbolDiff(J->getLabel(), FuncBegin, 4);
+ unsigned LineData = J->getLine();
+ if (J->isStmt())
+ LineData |= LineInfo::StatementFlag;
+ OS.EmitIntValue(LineData, 4);
+ }
+ if (HaveColumns) {
+ for (auto J = I; J != FileSegEnd; ++J) {
+ OS.EmitIntValue(J->getColumn(), 2);
+ OS.EmitIntValue(0, 2);
+ }
+ }
+ I = FileSegEnd;
+ }
+ OS.EmitLabel(LineEnd);
+}
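As a sanity check on the SegmentSize arithmetic above, a sketch (the helper name is made up) that reproduces the same formula:

    // 12-byte segment header, 8 bytes per line entry, plus 4 bytes per entry
    // when columns are present (a 2-byte column number and a 2-byte zero).
    static unsigned lineSegmentSize(unsigned EntryCount, bool HaveColumns) {
      unsigned Size = 12 + 8 * EntryCount;
      if (HaveColumns)
        Size += 4 * EntryCount;
      return Size; // e.g. lineSegmentSize(3, true) == 48
    }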
+
+static bool compressAnnotation(uint32_t Data, SmallVectorImpl<char> &Buffer) {
+ if (isUInt<7>(Data)) {
+ Buffer.push_back(Data);
+ return true;
+ }
+
+ if (isUInt<14>(Data)) {
+ Buffer.push_back((Data >> 8) | 0x80);
+ Buffer.push_back(Data & 0xff);
+ return true;
+ }
+
+ if (isUInt<29>(Data)) {
+ Buffer.push_back((Data >> 24) | 0xC0);
+ Buffer.push_back((Data >> 16) & 0xff);
+ Buffer.push_back((Data >> 8) & 0xff);
+ Buffer.push_back(Data & 0xff);
+ return true;
+ }
+
+ return false;
+}
+
+static bool compressAnnotation(BinaryAnnotationsOpCode Annotation,
+ SmallVectorImpl<char> &Buffer) {
+ return compressAnnotation(static_cast<uint32_t>(Annotation), Buffer);
+}
+
+static uint32_t encodeSignedNumber(uint32_t Data) {
+ if (Data >> 31)
+ return ((-Data) << 1) | 1;
+ return Data << 1;
+}
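A few expected values from the two helpers above, as an illustration (the sketch only makes sense inside this file, where the static helpers are visible):

    static void annotationEncodingSketch() {
      SmallVector<char, 4> Buf;
      compressAnnotation(10, Buf);    // one byte:  0x0A       (7-bit form)
      Buf.clear();
      compressAnnotation(0x123, Buf); // two bytes: 0x81 0x23  (14-bit form)
      unsigned Up = encodeSignedNumber(5);              // 10: delta << 1
      unsigned Down = encodeSignedNumber(uint32_t(-3)); // 7: ((-delta) << 1) | 1
    }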
+
+void CodeViewContext::emitInlineLineTableForFunction(
+ MCObjectStreamer &OS, unsigned PrimaryFunctionId, unsigned SourceFileId,
+ unsigned SourceLineNum, const MCSymbol *FnStartSym,
+ const MCSymbol *FnEndSym, ArrayRef<unsigned> SecondaryFunctionIds) {
+ // Create and insert a fragment into the current section that will be encoded
+ // later.
+ new MCCVInlineLineTableFragment(
+ PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym,
+ SecondaryFunctionIds, OS.getCurrentSectionOnly());
+}
+
+void CodeViewContext::emitDefRange(
+ MCObjectStreamer &OS,
+ ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
+ StringRef FixedSizePortion) {
+ // Create and insert a fragment into the current section that will be encoded
+ // later.
+ new MCCVDefRangeFragment(Ranges, FixedSizePortion,
+ OS.getCurrentSectionOnly());
+}
+
+static unsigned computeLabelDiff(MCAsmLayout &Layout, const MCSymbol *Begin,
+ const MCSymbol *End) {
+ MCContext &Ctx = Layout.getAssembler().getContext();
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *BeginRef = MCSymbolRefExpr::create(Begin, Variant, Ctx),
+ *EndRef = MCSymbolRefExpr::create(End, Variant, Ctx);
+ const MCExpr *AddrDelta =
+ MCBinaryExpr::create(MCBinaryExpr::Sub, EndRef, BeginRef, Ctx);
+ int64_t Result;
+ bool Success = AddrDelta->evaluateKnownAbsolute(Result, Layout);
+ assert(Success && "failed to evaluate label difference as absolute");
+ (void)Success;
+ assert(Result >= 0 && "negative label difference requested");
+ assert(Result < UINT_MAX && "label difference greater than 4GB");
+ return unsigned(Result);
+}
+
+void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
+ MCCVInlineLineTableFragment &Frag) {
+ size_t LocBegin;
+ size_t LocEnd;
+ std::tie(LocBegin, LocEnd) = getLineExtent(Frag.SiteFuncId);
+ for (unsigned SecondaryId : Frag.SecondaryFuncs) {
+ auto Extent = getLineExtent(SecondaryId);
+ LocBegin = std::min(LocBegin, Extent.first);
+ LocEnd = std::max(LocEnd, Extent.second);
+ }
+ if (LocBegin >= LocEnd)
+ return;
+ ArrayRef<MCCVLineEntry> Locs = getLinesForExtent(LocBegin, LocEnd);
+ if (Locs.empty())
+ return;
+
+ SmallSet<unsigned, 8> InlinedFuncIds;
+ InlinedFuncIds.insert(Frag.SiteFuncId);
+ InlinedFuncIds.insert(Frag.SecondaryFuncs.begin(), Frag.SecondaryFuncs.end());
+
+ // Make an artificial start location using the function start and the inlinee
+ // lines start location information. All deltas start relative to this
+ // location.
+ MCCVLineEntry StartLoc(Frag.getFnStartSym(), MCCVLoc(Locs.front()));
+ StartLoc.setFileNum(Frag.StartFileId);
+ StartLoc.setLine(Frag.StartLineNum);
+ const MCCVLineEntry *LastLoc = &StartLoc;
+ bool HaveOpenRange = false;
+
+ SmallVectorImpl<char> &Buffer = Frag.getContents();
+ Buffer.clear(); // Clear old contents if we went through relaxation.
+ for (const MCCVLineEntry &Loc : Locs) {
+ if (!InlinedFuncIds.count(Loc.getFunctionId())) {
+ // We've hit a cv_loc not attributed to this inline call site. Use this
+ // label to end the PC range.
+ if (HaveOpenRange) {
+ unsigned Length =
+ computeLabelDiff(Layout, LastLoc->getLabel(), Loc.getLabel());
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
+ compressAnnotation(Length, Buffer);
+ }
+ HaveOpenRange = false;
+ continue;
+ }
+
+ // If we've already opened the function and we're at an indirectly inlined
+ // location, continue until the next directly inlined location.
+ bool DirectlyInlined = Loc.getFunctionId() == Frag.SiteFuncId;
+ if (!DirectlyInlined && HaveOpenRange)
+ continue;
+ HaveOpenRange = true;
+
+ if (Loc.getFileNum() != LastLoc->getFileNum()) {
+ // File ids are 1 based, and each file checksum table entry is 8 bytes
+ // long. See emitFileChecksums above.
+ unsigned FileOffset = 8 * (Loc.getFileNum() - 1);
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeFile, Buffer);
+ compressAnnotation(FileOffset, Buffer);
+ }
+
+ int LineDelta = Loc.getLine() - LastLoc->getLine();
+ if (LineDelta == 0)
+ continue;
+
+ unsigned EncodedLineDelta = encodeSignedNumber(LineDelta);
+ unsigned CodeDelta =
+ computeLabelDiff(Layout, LastLoc->getLabel(), Loc.getLabel());
+ if (CodeDelta == 0) {
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
+ compressAnnotation(EncodedLineDelta, Buffer);
+ } else if (EncodedLineDelta < 0x8 && CodeDelta <= 0xf) {
+ // The ChangeCodeOffsetAndLineOffset combination opcode is used when the
+ // encoded line delta fits in 3 bits and the code offset fits in one
+ // nibble.
+ unsigned Operand = (EncodedLineDelta << 4) | CodeDelta;
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset,
+ Buffer);
+ compressAnnotation(Operand, Buffer);
+ } else {
+ // Otherwise use the separate line and code deltas.
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
+ compressAnnotation(EncodedLineDelta, Buffer);
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffset, Buffer);
+ compressAnnotation(CodeDelta, Buffer);
+ }
+
+ LastLoc = &Loc;
+ }
+
+ assert(HaveOpenRange);
+
+ unsigned EndSymLength =
+ computeLabelDiff(Layout, LastLoc->getLabel(), Frag.getFnEndSym());
+ unsigned LocAfterLength = ~0U;
+ ArrayRef<MCCVLineEntry> LocAfter = getLinesForExtent(LocEnd, LocEnd + 1);
+ if (!LocAfter.empty()) {
+ // Only try to compute this difference if we're in the same section.
+ const MCCVLineEntry &Loc = LocAfter[0];
+ if (&Loc.getLabel()->getSection(false) ==
+ &LastLoc->getLabel()->getSection(false)) {
+ LocAfterLength =
+ computeLabelDiff(Layout, LastLoc->getLabel(), Loc.getLabel());
+ }
+ }
+
+ compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
+ compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
+}
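A worked instance of the combined-opcode path above (values chosen purely for illustration): a line delta of +2 zigzag-encodes to 4 and a code delta of 0xC fits in a nibble, so the operand is (4 << 4) | 0xC = 0x4C, which compressAnnotation then emits as a single byte after the ChangeCodeOffsetAndLineOffset opcode.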
+
+void CodeViewContext::encodeDefRange(MCAsmLayout &Layout,
+ MCCVDefRangeFragment &Frag) {
+ MCContext &Ctx = Layout.getAssembler().getContext();
+ SmallVectorImpl<char> &Contents = Frag.getContents();
+ Contents.clear();
+ SmallVectorImpl<MCFixup> &Fixups = Frag.getFixups();
+ Fixups.clear();
+ raw_svector_ostream OS(Contents);
+
+ // Write down each range where the variable is defined.
+ for (std::pair<const MCSymbol *, const MCSymbol *> Range : Frag.getRanges()) {
+ unsigned RangeSize = computeLabelDiff(Layout, Range.first, Range.second);
+ unsigned Bias = 0;
+ // We must split the range into chunks of MaxDefRange; this is a fundamental
+ // limitation of the file format.
+ do {
+ uint16_t Chunk = std::min((uint32_t)MaxDefRange, RangeSize);
+
+ const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Range.first, Ctx);
+ const MCBinaryExpr *BE =
+ MCBinaryExpr::createAdd(SRE, MCConstantExpr::create(Bias, Ctx), Ctx);
+ MCValue Res;
+ BE->evaluateAsRelocatable(Res, &Layout, /*Fixup=*/nullptr);
+
+ // Each record begins with a 2-byte number indicating how large the record
+ // is.
+ StringRef FixedSizePortion = Frag.getFixedSizePortion();
+ // Our record is a fixed-size prefix and a LocalVariableAddrRange that we
+ // are artificially constructing.
+ size_t RecordSize =
+ FixedSizePortion.size() + sizeof(LocalVariableAddrRange);
+ // Write out the record size.
+ support::endian::Writer<support::little>(OS).write<uint16_t>(RecordSize);
+ // Write out the fixed size prefix.
+ OS << FixedSizePortion;
+ // Make space for a fixup that will eventually have a section relative
+ // relocation pointing at the offset where the variable becomes live.
+ Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_4));
+ Contents.resize(Contents.size() + 4); // Fixup for code start.
+ // Make space for a fixup that will record the section index for the code.
+ Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_2));
+ Contents.resize(Contents.size() + 2); // Fixup for section index.
+ // Write down the range's extent.
+ support::endian::Writer<support::little>(OS).write<uint16_t>(Chunk);
+
+ // Move on to the next range.
+ Bias += Chunk;
+ RangeSize -= Chunk;
+ } while (RangeSize > 0);
+ }
+}
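A minimal sketch of just the chunking loop above; 0xF000 stands in for MaxDefRange, whose real value is defined elsewhere and is not shown here:

    uint32_t RangeSize = 0x12345, Bias = 0;
    do {
      uint16_t Chunk = std::min<uint32_t>(0xF000, RangeSize); // assumed MaxDefRange
      // ... emit the 2-byte record size, the fixed prefix, the two fixups
      // biased by Bias, and the 2-byte Chunk length ...
      Bias += Chunk;      // first pass: Chunk = 0xF000, Bias becomes 0xF000
      RangeSize -= Chunk; // second pass: Chunk = 0x3345, then the loop exits
    } while (RangeSize > 0);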
+
+//
+// This is called when an instruction is assembled into the specified section
+// and, if there is information from the last .cv_loc directive that has yet
+// to have a line entry made for it, one is made.
+//
+void MCCVLineEntry::Make(MCObjectStreamer *MCOS) {
+ if (!MCOS->getContext().getCVLocSeen())
+ return;
+
+ // Create a symbol in the current section for use in the line entry.
+ MCSymbol *LineSym = MCOS->getContext().createTempSymbol();
+ // Set the value of the symbol to use for the MCCVLineEntry.
+ MCOS->EmitLabel(LineSym);
+
+ // Get the current .cv_loc info saved in the context.
+ const MCCVLoc &CVLoc = MCOS->getContext().getCurrentCVLoc();
+
+ // Create a (local) line entry with the symbol and the current .cv_loc info.
+ MCCVLineEntry LineEntry(LineSym, CVLoc);
+
+ // Clear CVLocSeen, saying the current .cv_loc info is now used.
+ MCOS->getContext().clearCVLocSeen();
+
+ // Add the line entry to this section's entries.
+ MCOS->getContext().getCVContext().addLineEntry(LineEntry);
+}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index b5ad518d0330..47ed1ca3add5 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -10,8 +10,9 @@
#include "llvm/MC/MCContext.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCLabel.h"
#include "llvm/MC/MCObjectFileInfo.h"
@@ -24,16 +25,22 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/Support/COFF.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
-#include <map>
using namespace llvm;
+static cl::opt<char*>
+AsSecureLogFileName("as-secure-log-file-name",
+ cl::desc("As secure log file name (initialized from "
+ "AS_SECURE_LOG_FILE env variable)"),
+ cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);
+
+
MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
const MCObjectFileInfo *mofi, const SourceMgr *mgr,
bool DoAutoReset)
@@ -43,12 +50,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4),
AllowTemporaryLabels(true), DwarfCompileUnitID(0),
AutoReset(DoAutoReset), HadError(false) {
-
- std::error_code EC = llvm::sys::fs::current_path(CompilationDir);
- if (EC)
- CompilationDir.clear();
-
- SecureLogFile = getenv("AS_SECURE_LOG_FILE");
+ SecureLogFile = AsSecureLogFileName;
SecureLog = nullptr;
SecureLogUsed = false;
@@ -90,6 +92,8 @@ void MCContext::reset() {
DwarfCompileUnitID = 0;
CurrentDwarfLoc = MCDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0);
+ CVContext.reset();
+
MachOUniquingMap.clear();
ELFUniquingMap.clear();
COFFUniquingMap.clear();
@@ -126,19 +130,9 @@ MCSymbolELF *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
return Sym;
StringRef Name = Section.getSectionName();
-
- MCSymbol *&OldSym = Symbols[Name];
- if (OldSym && OldSym->isUndefined()) {
- Sym = cast<MCSymbolELF>(OldSym);
- return Sym;
- }
-
- auto NameIter = UsedNames.insert(std::make_pair(Name, true)).first;
+ auto NameIter = UsedNames.insert(std::make_pair(Name, false)).first;
Sym = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
- if (!OldSym)
- OldSym = Sym;
-
return Sym;
}
@@ -194,9 +188,12 @@ MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
raw_svector_ostream(NewName) << NextUniqueID++;
}
auto NameEntry = UsedNames.insert(std::make_pair(NewName, true));
- if (NameEntry.second) {
- // Ok, we found a name. Have the MCSymbol object itself refer to the copy
- // of the string that is embedded in the UsedNames entry.
+ if (NameEntry.second || !NameEntry.first->second) {
+ // Ok, we found a name.
+ // Mark it as used for a non-section symbol.
+ NameEntry.first->second = true;
+ // Have the MCSymbol object itself refer to the copy of the string that is
+ // embedded in the UsedNames entry.
return createSymbolImpl(&*NameEntry.first, IsTemporary);
}
assert(IsTemporary && "Cannot rename non-temporary symbols");
@@ -312,32 +309,40 @@ void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
}
-MCSectionELF *MCContext::createELFRelSection(StringRef Name, unsigned Type,
+MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *Group,
const MCSectionELF *Associated) {
StringMap<bool>::iterator I;
bool Inserted;
- std::tie(I, Inserted) = ELFRelSecNames.insert(std::make_pair(Name, true));
+ std::tie(I, Inserted) =
+ ELFRelSecNames.insert(std::make_pair(Name.str(), true));
return new (ELFAllocator.Allocate())
MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
EntrySize, Group, true, nullptr, Associated);
}
-MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
+MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
+ const Twine &Suffix, unsigned Type,
+ unsigned Flags,
+ unsigned EntrySize) {
+ return getELFSection(Prefix + "." + Suffix, Type, Flags, EntrySize, Suffix);
+}
+
+MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
- StringRef Group, unsigned UniqueID,
+ const Twine &Group, unsigned UniqueID,
const char *BeginSymName) {
MCSymbolELF *GroupSym = nullptr;
- if (!Group.empty())
+ if (!Group.isTriviallyEmpty() && !Group.str().empty())
GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
BeginSymName, nullptr);
}
-MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
+MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *GroupSym,
unsigned UniqueID,
@@ -348,7 +353,7 @@ MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type,
Group = GroupSym->getName();
// Do the lookup, if we have a hit, return it.
auto IterBool = ELFUniquingMap.insert(
- std::make_pair(ELFSectionKey{Section, Group, UniqueID}, nullptr));
+ std::make_pair(ELFSectionKey{Section.str(), Group, UniqueID}, nullptr));
auto &Entry = *IterBool.first;
if (!IterBool.second)
return Entry.second;
@@ -383,6 +388,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
unsigned Characteristics,
SectionKind Kind,
StringRef COMDATSymName, int Selection,
+ unsigned UniqueID,
const char *BeginSymName) {
MCSymbol *COMDATSymbol = nullptr;
if (!COMDATSymName.empty()) {
@@ -390,8 +396,9 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
COMDATSymName = COMDATSymbol->getName();
}
+
// Do the lookup, if we have a hit, return it.
- COFFSectionKey T{Section, COMDATSymName, Selection};
+ COFFSectionKey T{Section, COMDATSymName, Selection, UniqueID};
auto IterBool = COFFUniquingMap.insert(std::make_pair(T, nullptr));
auto Iter = IterBool.first;
if (!IterBool.second)
@@ -413,11 +420,12 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
unsigned Characteristics,
SectionKind Kind,
const char *BeginSymName) {
- return getCOFFSection(Section, Characteristics, Kind, "", 0, BeginSymName);
+ return getCOFFSection(Section, Characteristics, Kind, "", 0, GenericSectionID,
+ BeginSymName);
}
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
- COFFSectionKey T{Section, "", 0};
+ COFFSectionKey T{Section, "", 0, GenericSectionID};
auto Iter = COFFUniquingMap.find(T);
if (Iter == COFFUniquingMap.end())
return nullptr;
@@ -425,18 +433,24 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
}
MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
- const MCSymbol *KeySym) {
- // Return the normal section if we don't have to be associative.
- if (!KeySym)
+ const MCSymbol *KeySym,
+ unsigned UniqueID) {
+ // Return the normal section if we don't have to be associative or unique.
+ if (!KeySym && UniqueID == GenericSectionID)
return Sec;
- // Make an associative section with the same name and kind as the normal
- // section.
- unsigned Characteristics =
- Sec->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
+ // If we have a key symbol, make an associative section with the same name and
+ // kind as the normal section.
+ unsigned Characteristics = Sec->getCharacteristics();
+ if (KeySym) {
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ return getCOFFSection(Sec->getSectionName(), Characteristics,
+ Sec->getKind(), KeySym->getName(),
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
+ }
+
return getCOFFSection(Sec->getSectionName(), Characteristics, Sec->getKind(),
- KeySym->getName(),
- COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
+ "", 0, UniqueID);
}
MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
@@ -474,6 +488,20 @@ void MCContext::finalizeDwarfSections(MCStreamer &MCOS) {
[&](MCSection *Sec) { return !MCOS.mayHaveInstructions(*Sec); });
}
+CodeViewContext &MCContext::getCVContext() {
+ if (!CVContext.get())
+ CVContext.reset(new CodeViewContext);
+ return *CVContext.get();
+}
+
+unsigned MCContext::getCVFile(StringRef FileName, unsigned FileNumber) {
+ return getCVContext().addFile(FileNumber, FileName) ? FileNumber : 0;
+}
+
+bool MCContext::isValidCVFileNumber(unsigned FileNumber) {
+ return getCVContext().isValidFileNumber(FileNumber);
+}
+
//===----------------------------------------------------------------------===//
// Error Reporting
//===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDisassembler/CMakeLists.txt b/lib/MC/MCDisassembler/CMakeLists.txt
index f266f8fcd301..e940afc56f5b 100644
--- a/lib/MC/MCDisassembler/CMakeLists.txt
+++ b/lib/MC/MCDisassembler/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMMCDisassembler
Disassembler.cpp
- MCRelocationInfo.cpp
- MCExternalSymbolizer.cpp
MCDisassembler.cpp
+ MCExternalSymbolizer.cpp
+ MCRelocationInfo.cpp
+ MCSymbolizer.cpp
)
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 82063fb74696..21e8748b797a 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -11,14 +11,14 @@
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCRelocationInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbolizer.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index 46d0c4c3d94c..25d17dafb576 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -19,17 +19,18 @@
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
+#include <utility>
namespace llvm {
-class MCContext;
-class MCAsmInfo;
-class MCDisassembler;
-class MCInstPrinter;
-class MCInstrInfo;
-class MCRegisterInfo;
-class MCSubtargetInfo;
class Target;
//
@@ -86,15 +87,12 @@ public:
LLVMOpInfoCallback getOpInfo,
LLVMSymbolLookupCallback symbolLookUp,
const Target *theTarget, const MCAsmInfo *mAI,
- const MCRegisterInfo *mRI,
- const MCSubtargetInfo *mSI,
- const MCInstrInfo *mII,
- llvm::MCContext *ctx, const MCDisassembler *disAsm,
- MCInstPrinter *iP) : TripleName(tripleName),
- DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
- SymbolLookUp(symbolLookUp), TheTarget(theTarget),
- Options(0),
- CommentStream(CommentsToEmit) {
+ const MCRegisterInfo *mRI, const MCSubtargetInfo *mSI,
+ const MCInstrInfo *mII, llvm::MCContext *ctx,
+ const MCDisassembler *disAsm, MCInstPrinter *iP)
+ : TripleName(std::move(tripleName)), DisInfo(disInfo), TagType(tagType),
+ GetOpInfo(getOpInfo), SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+ Options(0), CommentStream(CommentsToEmit) {
MAI.reset(mAI);
MRI.reset(mRI);
MSI.reset(mSI);
diff --git a/lib/MC/MCDisassembler/MCDisassembler.cpp b/lib/MC/MCDisassembler/MCDisassembler.cpp
index 1084e5ea7666..3a4f7382bd3c 100644
--- a/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
index 5fc2ca44f5d4..1969c5dc66ab 100644
--- a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
index 43005e7c740c..1612562497d9 100644
--- a/lib/MC/MCDisassembler/MCRelocationInfo.cpp
+++ b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
#include "llvm-c/Disassembler.h"
-#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -22,11 +21,6 @@ MCRelocationInfo::~MCRelocationInfo() {
}
const MCExpr *
-MCRelocationInfo::createExprForRelocation(object::RelocationRef Rel) {
- return nullptr;
-}
-
-const MCExpr *
MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
unsigned VariantKind) {
if (VariantKind != LLVMDisassembler_VariantKind_None)
diff --git a/lib/MC/MCSymbolizer.cpp b/lib/MC/MCDisassembler/MCSymbolizer.cpp
index 4080e40b3f10..c0f707d356c1 100644
--- a/lib/MC/MCSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCSymbolizer.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCSymbolizer.h"
+#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
using namespace llvm;
diff --git a/lib/MC/MCDisassembler/Makefile b/lib/MC/MCDisassembler/Makefile
deleted file mode 100644
index 7d71cd381a7c..000000000000
--- a/lib/MC/MCDisassembler/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/MC/MCDisassembler/Makefile ----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMCDisassembler
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index dafa7683b1ab..54b2c918c849 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Path.h"
@@ -46,20 +47,20 @@ static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) {
// and if there is information from the last .loc directive that has yet to have
// a line entry made for it is made.
//
-void MCLineEntry::Make(MCObjectStreamer *MCOS, MCSection *Section) {
+void MCDwarfLineEntry::Make(MCObjectStreamer *MCOS, MCSection *Section) {
if (!MCOS->getContext().getDwarfLocSeen())
return;
// Create a symbol at in the current section for use in the line entry.
MCSymbol *LineSym = MCOS->getContext().createTempSymbol();
- // Set the value of the symbol to use for the MCLineEntry.
+ // Set the value of the symbol to use for the MCDwarfLineEntry.
MCOS->EmitLabel(LineSym);
// Get the current .loc info saved in the context.
const MCDwarfLoc &DwarfLoc = MCOS->getContext().getCurrentDwarfLoc();
// Create a (local) line entry with the symbol and the current .loc info.
- MCLineEntry LineEntry(LineSym, DwarfLoc);
+ MCDwarfLineEntry LineEntry(LineSym, DwarfLoc);
// clear DwarfLocSeen saying the current .loc info is now used.
MCOS->getContext().clearDwarfLocSeen();
@@ -98,7 +99,7 @@ static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS,
//
static inline void
EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
- const MCLineSection::MCLineEntryCollection &LineEntries) {
+ const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
unsigned FileNum = 1;
unsigned LastLine = 1;
unsigned Column = 0;
@@ -107,47 +108,45 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
unsigned Discriminator = 0;
MCSymbol *LastLabel = nullptr;
- // Loop through each MCLineEntry and encode the dwarf line number table.
- for (auto it = LineEntries.begin(),
- ie = LineEntries.end();
- it != ie; ++it) {
+ // Loop through each MCDwarfLineEntry and encode the dwarf line number table.
+ for (const MCDwarfLineEntry &LineEntry : LineEntries) {
+ int64_t LineDelta = static_cast<int64_t>(LineEntry.getLine()) - LastLine;
- if (FileNum != it->getFileNum()) {
- FileNum = it->getFileNum();
+ if (FileNum != LineEntry.getFileNum()) {
+ FileNum = LineEntry.getFileNum();
MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
MCOS->EmitULEB128IntValue(FileNum);
}
- if (Column != it->getColumn()) {
- Column = it->getColumn();
+ if (Column != LineEntry.getColumn()) {
+ Column = LineEntry.getColumn();
MCOS->EmitIntValue(dwarf::DW_LNS_set_column, 1);
MCOS->EmitULEB128IntValue(Column);
}
- if (Discriminator != it->getDiscriminator()) {
- Discriminator = it->getDiscriminator();
+ if (Discriminator != LineEntry.getDiscriminator()) {
+ Discriminator = LineEntry.getDiscriminator();
unsigned Size = getULEB128Size(Discriminator);
MCOS->EmitIntValue(dwarf::DW_LNS_extended_op, 1);
MCOS->EmitULEB128IntValue(Size + 1);
MCOS->EmitIntValue(dwarf::DW_LNE_set_discriminator, 1);
MCOS->EmitULEB128IntValue(Discriminator);
}
- if (Isa != it->getIsa()) {
- Isa = it->getIsa();
+ if (Isa != LineEntry.getIsa()) {
+ Isa = LineEntry.getIsa();
MCOS->EmitIntValue(dwarf::DW_LNS_set_isa, 1);
MCOS->EmitULEB128IntValue(Isa);
}
- if ((it->getFlags() ^ Flags) & DWARF2_FLAG_IS_STMT) {
- Flags = it->getFlags();
+ if ((LineEntry.getFlags() ^ Flags) & DWARF2_FLAG_IS_STMT) {
+ Flags = LineEntry.getFlags();
MCOS->EmitIntValue(dwarf::DW_LNS_negate_stmt, 1);
}
- if (it->getFlags() & DWARF2_FLAG_BASIC_BLOCK)
+ if (LineEntry.getFlags() & DWARF2_FLAG_BASIC_BLOCK)
MCOS->EmitIntValue(dwarf::DW_LNS_set_basic_block, 1);
- if (it->getFlags() & DWARF2_FLAG_PROLOGUE_END)
+ if (LineEntry.getFlags() & DWARF2_FLAG_PROLOGUE_END)
MCOS->EmitIntValue(dwarf::DW_LNS_set_prologue_end, 1);
- if (it->getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
+ if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
- int64_t LineDelta = static_cast<int64_t>(it->getLine()) - LastLine;
- MCSymbol *Label = it->getLabel();
+ MCSymbol *Label = LineEntry.getLabel();
// At this point we want to emit/create the sequence to encode the delta in
// line numbers and the increment of the address from the previous Label
@@ -156,7 +155,8 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
asmInfo->getPointerSize());
- LastLine = it->getLine();
+ Discriminator = 0;
+ LastLine = LineEntry.getLine();
LastLabel = Label;
}
@@ -344,9 +344,9 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory,
}
assert(!FileName.empty());
if (FileNumber == 0) {
- FileNumber = SourceIdMap.size() + 1;
- assert((MCDwarfFiles.empty() || FileNumber == MCDwarfFiles.size()) &&
- "Don't mix autonumbered and explicit numbered line table usage");
+ // File numbers start with 1 and/or after any file numbers
+ // allocated by inline-assembler .file directives.
+ FileNumber = MCDwarfFiles.empty() ? 1 : MCDwarfFiles.size();
SmallString<256> Buffer;
auto IterBool = SourceIdMap.insert(
std::make_pair((Directory + Twine('\0') + FileName).toStringRef(Buffer),
@@ -452,7 +452,8 @@ void MCDwarfLineAddr::Encode(MCContext &Context, MCDwarfLineTableParams Params,
// If the line increment is out of range of a special opcode, we must encode
// it with DW_LNS_advance_line.
- if (Temp >= Params.DWARF2LineRange) {
+ if (Temp >= Params.DWARF2LineRange ||
+ Temp + Params.DWARF2LineOpcodeBase > 255) {
OS << char(dwarf::DW_LNS_advance_line);
encodeSLEB128(LineDelta, OS);
@@ -494,8 +495,10 @@ void MCDwarfLineAddr::Encode(MCContext &Context, MCDwarfLineTableParams Params,
if (NeedCopy)
OS << char(dwarf::DW_LNS_copy);
- else
+ else {
+ assert(Temp <= 255 && "Buggy special opcode encoding.");
OS << char(Temp);
+ }
}
// Utility function to write a tuple for .debug_abbrev.
@@ -815,7 +818,7 @@ static void EmitGenDwarfRanges(MCStreamer *MCOS) {
// Emit a base address selection entry for the start of this section
const MCExpr *SectionStartAddr = MCSymbolRefExpr::create(
StartSymbol, MCSymbolRefExpr::VK_None, context);
- MCOS->EmitFill(AddrSize, 0xFF);
+ MCOS->emitFill(AddrSize, 0xFF);
MCOS->EmitValue(SectionStartAddr, AddrSize);
// Emit a range list entry spanning this section
@@ -1156,8 +1159,7 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
/// Emit frame instructions to describe the layout of the frame.
void FrameEmitterImpl::EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
MCSymbol *BaseLabel) {
- for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
- const MCCFIInstruction &Instr = Instrs[i];
+ for (const MCCFIInstruction &Instr : Instrs) {
MCSymbol *Label = Instr.getLabel();
// Throw out move if the label is invalid.
if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
@@ -1494,8 +1496,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
bool NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame();
if (IsEH && MOFI->getCompactUnwindSection()) {
bool SectionEmitted = false;
- for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) {
- const MCDwarfFrameInfo &Frame = FrameArray[i];
+ for (const MCDwarfFrameInfo &Frame : FrameArray) {
if (Frame.CompactUnwindEncoding == 0) continue;
if (!SectionEmitted) {
Streamer.SwitchSection(MOFI->getCompactUnwindSection());
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 06d161bccab4..7d858c306d2e 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbolELF.h"
@@ -283,6 +284,9 @@ bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
case MCSA_Internal:
Symbol->setVisibility(ELF::STV_INTERNAL);
break;
+
+ case MCSA_AltEntry:
+ llvm_unreachable("ELF doesn't support the .alt_entry attribute");
}
return true;
@@ -406,13 +410,10 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
case MCSymbolRefExpr::VK_TLSLD:
case MCSymbolRefExpr::VK_TLSLDM:
case MCSymbolRefExpr::VK_TPOFF:
+ case MCSymbolRefExpr::VK_TPREL:
case MCSymbolRefExpr::VK_DTPOFF:
- case MCSymbolRefExpr::VK_Mips_TLSGD:
- case MCSymbolRefExpr::VK_Mips_GOTTPREL:
- case MCSymbolRefExpr::VK_Mips_TPREL_HI:
- case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+ case MCSymbolRefExpr::VK_DTPREL:
case MCSymbolRefExpr::VK_PPC_DTPMOD:
- case MCSymbolRefExpr::VK_PPC_TPREL:
case MCSymbolRefExpr::VK_PPC_TPREL_LO:
case MCSymbolRefExpr::VK_PPC_TPREL_HI:
case MCSymbolRefExpr::VK_PPC_TPREL_HA:
@@ -420,7 +421,6 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
- case MCSymbolRefExpr::VK_PPC_DTPREL:
case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 748644bd9c8f..6f90ff843bd0 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -30,7 +30,7 @@ STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations");
}
}
-void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
switch (getKind()) {
case MCExpr::Target:
return cast<MCTargetExpr>(this)->printImpl(OS, MAI);
@@ -43,7 +43,8 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
const MCSymbol &Sym = SRE.getSymbol();
// Parenthesize names that start with $ so that they don't look like
// absolute names.
- bool UseParens = Sym.getName().size() && Sym.getName()[0] == '$';
+ bool UseParens =
+ !InParens && Sym.getName().size() && Sym.getName()[0] == '$';
if (UseParens) {
OS << '(';
Sym.print(OS, MAI);
@@ -129,7 +130,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCExpr::dump() const {
+LLVM_DUMP_METHOD void MCExpr::dump() const {
dbgs() << *this;
dbgs() << '\n';
}
@@ -178,8 +179,11 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_Invalid: return "<<invalid>>";
case VK_None: return "<<none>>";
+ case VK_DTPOFF: return "DTPOFF";
+ case VK_DTPREL: return "DTPREL";
case VK_GOT: return "GOT";
case VK_GOTOFF: return "GOTOFF";
+ case VK_GOTREL: return "GOTREL";
case VK_GOTPCREL: return "GOTPCREL";
case VK_GOTTPOFF: return "GOTTPOFF";
case VK_INDNTPOFF: return "INDNTPOFF";
@@ -190,7 +194,9 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_TLSLD: return "TLSLD";
case VK_TLSLDM: return "TLSLDM";
case VK_TPOFF: return "TPOFF";
- case VK_DTPOFF: return "DTPOFF";
+ case VK_TPREL: return "TPREL";
+ case VK_TLSCALL: return "tlscall";
+ case VK_TLSDESC: return "tlsdesc";
case VK_TLVP: return "TLVP";
case VK_TLVPPAGE: return "TLVPPAGE";
case VK_TLVPPAGEOFF: return "TLVPPAGEOFF";
@@ -208,8 +214,6 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_ARM_PREL31: return "prel31";
case VK_ARM_SBREL: return "sbrel";
case VK_ARM_TLSLDO: return "tlsldo";
- case VK_ARM_TLSCALL: return "tlscall";
- case VK_ARM_TLSDESC: return "tlsdesc";
case VK_ARM_TLSDESCSEQ: return "tlsdescseq";
case VK_PPC_LO: return "l";
case VK_PPC_HI: return "h";
@@ -227,7 +231,6 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_TOC_HI: return "toc@h";
case VK_PPC_TOC_HA: return "toc@ha";
case VK_PPC_DTPMOD: return "dtpmod";
- case VK_PPC_TPREL: return "tprel";
case VK_PPC_TPREL_LO: return "tprel@l";
case VK_PPC_TPREL_HI: return "tprel@h";
case VK_PPC_TPREL_HA: return "tprel@ha";
@@ -235,7 +238,6 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_TPREL_HIGHERA: return "tprel@highera";
case VK_PPC_TPREL_HIGHEST: return "tprel@highest";
case VK_PPC_TPREL_HIGHESTA: return "tprel@highesta";
- case VK_PPC_DTPREL: return "dtprel";
case VK_PPC_DTPREL_LO: return "dtprel@l";
case VK_PPC_DTPREL_HI: return "dtprel@h";
case VK_PPC_DTPREL_HA: return "dtprel@ha";
@@ -263,32 +265,6 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_GOT_TLSLD_HA: return "got@tlsld@ha";
case VK_PPC_TLSLD: return "tlsld";
case VK_PPC_LOCAL: return "local";
- case VK_Mips_GPREL: return "GPREL";
- case VK_Mips_GOT_CALL: return "GOT_CALL";
- case VK_Mips_GOT16: return "GOT16";
- case VK_Mips_GOT: return "GOT";
- case VK_Mips_ABS_HI: return "ABS_HI";
- case VK_Mips_ABS_LO: return "ABS_LO";
- case VK_Mips_TLSGD: return "TLSGD";
- case VK_Mips_TLSLDM: return "TLSLDM";
- case VK_Mips_DTPREL_HI: return "DTPREL_HI";
- case VK_Mips_DTPREL_LO: return "DTPREL_LO";
- case VK_Mips_GOTTPREL: return "GOTTPREL";
- case VK_Mips_TPREL_HI: return "TPREL_HI";
- case VK_Mips_TPREL_LO: return "TPREL_LO";
- case VK_Mips_GPOFF_HI: return "GPOFF_HI";
- case VK_Mips_GPOFF_LO: return "GPOFF_LO";
- case VK_Mips_GOT_DISP: return "GOT_DISP";
- case VK_Mips_GOT_PAGE: return "GOT_PAGE";
- case VK_Mips_GOT_OFST: return "GOT_OFST";
- case VK_Mips_HIGHER: return "HIGHER";
- case VK_Mips_HIGHEST: return "HIGHEST";
- case VK_Mips_GOT_HI16: return "GOT_HI16";
- case VK_Mips_GOT_LO16: return "GOT_LO16";
- case VK_Mips_CALL_HI16: return "CALL_HI16";
- case VK_Mips_CALL_LO16: return "CALL_LO16";
- case VK_Mips_PCREL_HI16: return "PCREL_HI16";
- case VK_Mips_PCREL_LO16: return "PCREL_LO16";
case VK_COFF_IMGREL32: return "IMGREL";
case VK_Hexagon_PCREL: return "PCREL";
case VK_Hexagon_LO16: return "LO16";
@@ -301,8 +277,6 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_Hexagon_IE: return "IE";
case VK_Hexagon_IE_GOT: return "IEGOT";
case VK_WebAssembly_FUNCTION: return "FUNCTION";
- case VK_TPREL: return "tprel";
- case VK_DTPREL: return "dtprel";
}
llvm_unreachable("Invalid variant kind");
}
@@ -310,19 +284,24 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
MCSymbolRefExpr::VariantKind
MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
return StringSwitch<VariantKind>(Name.lower())
+ .Case("dtprel", VK_DTPREL)
+ .Case("dtpoff", VK_DTPOFF)
.Case("got", VK_GOT)
.Case("gotoff", VK_GOTOFF)
+ .Case("gotrel", VK_GOTREL)
.Case("gotpcrel", VK_GOTPCREL)
.Case("gottpoff", VK_GOTTPOFF)
.Case("indntpoff", VK_INDNTPOFF)
.Case("ntpoff", VK_NTPOFF)
.Case("gotntpoff", VK_GOTNTPOFF)
.Case("plt", VK_PLT)
+ .Case("tlscall", VK_TLSCALL)
+ .Case("tlsdesc", VK_TLSDESC)
.Case("tlsgd", VK_TLSGD)
.Case("tlsld", VK_TLSLD)
.Case("tlsldm", VK_TLSLDM)
.Case("tpoff", VK_TPOFF)
- .Case("dtpoff", VK_DTPOFF)
+ .Case("tprel", VK_TPREL)
.Case("tlvp", VK_TLVP)
.Case("tlvppage", VK_TLVPPAGE)
.Case("tlvppageoff", VK_TLVPPAGEOFF)
@@ -351,7 +330,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("toc@ha", VK_PPC_TOC_HA)
.Case("tls", VK_PPC_TLS)
.Case("dtpmod", VK_PPC_DTPMOD)
- .Case("tprel", VK_PPC_TPREL)
.Case("tprel@l", VK_PPC_TPREL_LO)
.Case("tprel@h", VK_PPC_TPREL_HI)
.Case("tprel@ha", VK_PPC_TPREL_HA)
@@ -359,7 +337,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("tprel@highera", VK_PPC_TPREL_HIGHERA)
.Case("tprel@highest", VK_PPC_TPREL_HIGHEST)
.Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA)
- .Case("dtprel", VK_PPC_DTPREL)
.Case("dtprel@l", VK_PPC_DTPREL_LO)
.Case("dtprel@h", VK_PPC_DTPREL_HI)
.Case("dtprel@ha", VK_PPC_DTPREL_HA)
@@ -397,8 +374,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("prel31", VK_ARM_PREL31)
.Case("sbrel", VK_ARM_SBREL)
.Case("tlsldo", VK_ARM_TLSLDO)
- .Case("tlscall", VK_ARM_TLSCALL)
- .Case("tlsdesc", VK_ARM_TLSDESC)
.Default(VK_Invalid);
}
@@ -688,8 +663,10 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
/// -(a - b + const) ==> (b - a - const)
if (Value.getSymA() && !Value.getSymB())
return false;
+
+ // The cast avoids undefined behavior if the constant is INT64_MIN.
Res = MCValue::get(Value.getSymB(), Value.getSymA(),
- -Value.getConstant());
+ -(uint64_t)Value.getConstant());
break;
case MCUnaryExpr::Not:
if (!Value.isAbsolute())
@@ -722,9 +699,10 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
return false;
case MCBinaryExpr::Sub:
// Negate RHS and add.
+ // The cast avoids undefined behavior if the constant is INT64_MIN.
return EvaluateSymbolicAdd(Asm, Layout, Addrs, InSet, LHSValue,
RHSValue.getSymB(), RHSValue.getSymA(),
- -RHSValue.getConstant(), Res);
+ -(uint64_t)RHSValue.getConstant(), Res);
case MCBinaryExpr::Add:
return EvaluateSymbolicAdd(Asm, Layout, Addrs, InSet, LHSValue,
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index efdb7049203a..1eb1d2996cb1 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -25,7 +25,6 @@
#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include <tuple>
using namespace llvm;
MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
@@ -289,6 +288,12 @@ void MCFragment::destroy() {
case FT_SafeSEH:
delete cast<MCSafeSEHFragment>(this);
return;
+ case FT_CVInlineLines:
+ delete cast<MCCVInlineLineTableFragment>(this);
+ return;
+ case FT_CVDefRange:
+ delete cast<MCCVDefRangeFragment>(this);
+ return;
case FT_Dummy:
delete cast<MCDummyFragment>(this);
return;
@@ -311,7 +316,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCFragment::dump() {
+LLVM_DUMP_METHOD void MCFragment::dump() {
raw_ostream &OS = llvm::errs();
OS << "<";
@@ -327,9 +332,9 @@ void MCFragment::dump() {
case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
case MCFragment::FT_LEB: OS << "MCLEBFragment"; break;
case MCFragment::FT_SafeSEH: OS << "MCSafeSEHFragment"; break;
- case MCFragment::FT_Dummy:
- OS << "MCDummyFragment";
- break;
+ case MCFragment::FT_CVInlineLines: OS << "MCCVInlineLineTableFragment"; break;
+ case MCFragment::FT_CVDefRange: OS << "MCCVDefRangeTableFragment"; break;
+ case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
}
OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder
@@ -386,8 +391,7 @@ void MCFragment::dump() {
}
case MCFragment::FT_Fill: {
const MCFillFragment *FF = cast<MCFillFragment>(this);
- OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize()
- << " Size:" << FF->getSize();
+ OS << " Value:" << FF->getValue() << " Size:" << FF->getSize();
break;
}
case MCFragment::FT_Relaxable: {
@@ -428,13 +432,29 @@ void MCFragment::dump() {
OS << " Sym:" << F->getSymbol();
break;
}
+ case MCFragment::FT_CVInlineLines: {
+ const auto *F = cast<MCCVInlineLineTableFragment>(this);
+ OS << "\n ";
+ OS << " Sym:" << *F->getFnStartSym();
+ break;
+ }
+ case MCFragment::FT_CVDefRange: {
+ const auto *F = cast<MCCVDefRangeFragment>(this);
+ OS << "\n ";
+ for (std::pair<const MCSymbol *, const MCSymbol *> RangeStartEnd :
+ F->getRanges()) {
+ OS << " RangeStart:" << RangeStartEnd.first;
+ OS << " RangeEnd:" << RangeStartEnd.second;
+ }
+ break;
+ }
case MCFragment::FT_Dummy:
break;
}
OS << ">";
}
-void MCAssembler::dump() {
+LLVM_DUMP_METHOD void MCAssembler::dump() {
raw_ostream &OS = llvm::errs();
OS << "<MCAssembler\n";
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index 5f829aeb339c..16bc597cf3a2 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -35,7 +35,7 @@ void MCOperand::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCOperand::dump() const {
+LLVM_DUMP_METHOD void MCOperand::dump() const {
print(dbgs());
dbgs() << "\n";
}
@@ -66,7 +66,7 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCInst::dump() const {
+LLVM_DUMP_METHOD void MCInst::dump() const {
print(dbgs());
dbgs() << "\n";
}
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index 1d3022a93e86..d973fc93b98c 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -17,7 +17,7 @@ void MCLabel::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCLabel::dump() const {
+LLVM_DUMP_METHOD void MCLabel::dump() const {
print(dbgs());
}
#endif
diff --git a/lib/MC/MCLinkerOptimizationHint.cpp b/lib/MC/MCLinkerOptimizationHint.cpp
index 5f6a57980ad4..f71fc7830129 100644
--- a/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/lib/MC/MCLinkerOptimizationHint.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/Support/LEB128.h"
using namespace llvm;
@@ -27,7 +28,31 @@ void MCLOHDirective::emit_impl(raw_ostream &OutStream,
const MCAsmLayout &Layout) const {
encodeULEB128(Kind, OutStream);
encodeULEB128(Args.size(), OutStream);
- for (LOHArgs::const_iterator It = Args.begin(), EndIt = Args.end();
- It != EndIt; ++It)
- encodeULEB128(ObjWriter.getSymbolAddress(**It, Layout), OutStream);
+ for (const MCSymbol *Arg : Args)
+ encodeULEB128(ObjWriter.getSymbolAddress(*Arg, Layout), OutStream);
+}
+
+void MCLOHDirective::emit(MachObjectWriter &ObjWriter,
+ const MCAsmLayout &Layout) const {
+ raw_ostream &OutStream = ObjWriter.getStream();
+ emit_impl(OutStream, ObjWriter, Layout);
+}
+
+uint64_t MCLOHDirective::getEmitSize(const MachObjectWriter &ObjWriter,
+ const MCAsmLayout &Layout) const {
+ class raw_counting_ostream : public raw_ostream {
+ uint64_t Count;
+
+ void write_impl(const char *, size_t size) override { Count += size; }
+
+ uint64_t current_pos() const override { return Count; }
+
+ public:
+ raw_counting_ostream() : Count(0) {}
+ ~raw_counting_ostream() override { flush(); }
+ };
+
+ raw_counting_ostream OutStream;
+ emit_impl(OutStream, ObjWriter, Layout);
+ return OutStream.tell();
}
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 21f7571eec4a..45a497240b4e 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -23,6 +23,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSymbolMachO.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -70,6 +71,7 @@ public:
void ChangeSection(MCSection *Sect, const MCExpr *Subsect) override;
void EmitLabel(MCSymbol *Symbol) override;
+ void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -198,9 +200,20 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
cast<MCSymbolMachO>(Symbol)->clearReferenceType();
}
+void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+ MCValue Res;
+
+ if (Value->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ if (const MCSymbolRefExpr *SymAExpr = Res.getSymA()) {
+ const MCSymbol &SymA = SymAExpr->getSymbol();
+ if (!Res.getSymB() && (SymA.getName() == "" || Res.getConstant() != 0))
+ cast<MCSymbolMachO>(Symbol)->setAltEntry();
+ }
+ }
+ MCObjectStreamer::EmitAssignment(Symbol, Value);
+}
+
void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
- if (!getAssembler().getBackend().hasDataInCodeSupport())
- return;
// Create a temporary label to mark the start of the data region.
MCSymbol *Start = getContext().createTempSymbol();
EmitLabel(Start);
@@ -211,8 +224,6 @@ void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
}
void MCMachOStreamer::EmitDataRegionEnd() {
- if (!getAssembler().getBackend().hasDataInCodeSupport())
- return;
std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
assert(!Regions.empty() && "Mismatched .end_data_region!");
DataRegionData &Data = Regions.back();
@@ -346,6 +357,10 @@ bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Sym,
Symbol->setSymbolResolver();
break;
+ case MCSA_AltEntry:
+ Symbol->setAltEntry();
+ break;
+
case MCSA_PrivateExtern:
Symbol->setExternal(true);
Symbol->setPrivateExtern(true);
@@ -414,7 +429,7 @@ void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
if (ByteAlignment != 1)
new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, Section);
- MCFragment *F = new MCFillFragment(0, 0, Size, Section);
+ MCFragment *F = new MCFillFragment(0, Size, Section);
Symbol->setFragment(F);
// Update the maximum alignment on the zero fill section if necessary.
@@ -427,7 +442,6 @@ void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
void MCMachOStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
EmitZerofill(Section, Symbol, Size, ByteAlignment);
- return;
}
void MCMachOStreamer::EmitInstToData(const MCInst &Inst,
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index f86f7e40acb4..d05bcea14c98 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -30,7 +30,7 @@ static bool useCompactUnwind(const Triple &T) {
return true;
// armv7k always has it.
- if (T.isWatchOS())
+ if (T.isWatchABI())
return true;
// Use it on newer version of OS X.
@@ -45,7 +45,7 @@ static bool useCompactUnwind(const Triple &T) {
return false;
}
-void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
+void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
// MachO
SupportsWeakOmittedEHFrame = false;
@@ -58,7 +58,7 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
if (T.isOSDarwin() && T.getArch() == Triple::aarch64)
SupportsCompactUnwindWithoutEHFrame = true;
- if (T.isWatchOS())
+ if (T.isWatchABI())
OmitDwarfIfHaveCompactUnwind = true;
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel
@@ -172,7 +172,12 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
MachO::S_NON_LAZY_SYMBOL_POINTERS,
SectionKind::getMetadata());
- if (RelocM == Reloc::Static) {
+ ThreadLocalPointerSection
+ = Ctx->getMachOSection("__DATA", "__thread_ptr",
+ MachO::S_THREAD_LOCAL_VARIABLE_POINTERS,
+ SectionKind::getMetadata());
+
+ if (!PositionIndependent) {
StaticCtorSection = Ctx->getMachOSection("__TEXT", "__constructor", 0,
SectionKind::getData());
StaticDtorSection = Ctx->getMachOSection("__TEXT", "__destructor", 0,
@@ -191,6 +196,7 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
SectionKind::getReadOnlyWithRel());
COFFDebugSymbolsSection = nullptr;
+ COFFDebugTypesSection = nullptr;
if (useCompactUnwind(T)) {
CompactUnwindSection =
@@ -258,7 +264,7 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
SectionKind::getMetadata(), "debug_range");
DwarfMacinfoSection =
Ctx->getMachOSection("__DWARF", "__debug_macinfo", MachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
+ SectionKind::getMetadata(), "debug_macinfo");
DwarfDebugInlineSection =
Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
@@ -277,7 +283,7 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
TLSExtraDataSection = TLSTLVSection;
}
-void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
+void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T) {
switch (T.getArch()) {
case Triple::mips:
case Triple::mipsel:
@@ -307,18 +313,21 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
// Fallthrough if not using EHABI
case Triple::ppc:
case Triple::x86:
- PersonalityEncoding = (RelocM == Reloc::PIC_)
- ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
- : dwarf::DW_EH_PE_absptr;
- LSDAEncoding = (RelocM == Reloc::PIC_)
- ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
- : dwarf::DW_EH_PE_absptr;
- TTypeEncoding = (RelocM == Reloc::PIC_)
- ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
- : dwarf::DW_EH_PE_absptr;
+ PersonalityEncoding = PositionIndependent
+ ? dwarf::DW_EH_PE_indirect |
+ dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = PositionIndependent
+ ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = PositionIndependent
+ ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
break;
case Triple::x86_64:
- if (RelocM == Reloc::PIC_) {
+ if (PositionIndependent) {
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
((CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
@@ -338,12 +347,24 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
}
break;
+ case Triple::hexagon:
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ FDECFIEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ if (PositionIndependent) {
+ PersonalityEncoding |= dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel;
+ LSDAEncoding |= dwarf::DW_EH_PE_pcrel;
+ FDECFIEncoding |= dwarf::DW_EH_PE_pcrel;
+ TTypeEncoding |= dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel;
+ }
+ break;
case Triple::aarch64:
case Triple::aarch64_be:
// The small model guarantees static code/data size < 4GB, but not where it
// will be in memory. Most of these could end up >2GB away so even a signed
// pc-relative 32-bit address is insufficient, theoretically.
- if (RelocM == Reloc::PIC_) {
+ if (PositionIndependent) {
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
dwarf::DW_EH_PE_sdata8;
LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8;
@@ -355,6 +376,11 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
TTypeEncoding = dwarf::DW_EH_PE_absptr;
}
break;
+ case Triple::lanai:
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ break;
case Triple::mips:
case Triple::mipsel:
case Triple::mips64:
@@ -380,7 +406,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
break;
case Triple::sparcel:
case Triple::sparc:
- if (RelocM == Reloc::PIC_) {
+ if (PositionIndependent) {
LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
dwarf::DW_EH_PE_sdata4;
@@ -394,7 +420,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
break;
case Triple::sparcv9:
LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
- if (RelocM == Reloc::PIC_) {
+ if (PositionIndependent) {
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
dwarf::DW_EH_PE_sdata4;
TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
@@ -407,7 +433,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
case Triple::systemz:
// All currently-defined code models guarantee that 4-byte PC-relative
// values will be in range.
- if (RelocM == Reloc::PIC_) {
+ if (PositionIndependent) {
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
dwarf::DW_EH_PE_sdata4;
LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
@@ -468,6 +494,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
Ctx->getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE, 16, "");
+ MergeableConst32Section =
+ Ctx->getELFSection(".rodata.cst32", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE, 32, "");
+
StaticCtorSection = Ctx->getELFSection(".ctors", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_WRITE);
@@ -484,6 +514,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
ELF::SHF_ALLOC);
COFFDebugSymbolsSection = nullptr;
+ COFFDebugTypesSection = nullptr;
// Debug Info Sections.
DwarfAbbrevSection = Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
@@ -508,8 +539,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0);
DwarfRangesSection =
Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range");
- DwarfMacinfoSection =
- Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0);
+ DwarfMacinfoSection = Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS,
+ 0, "debug_macinfo");
// DWARF5 Experimental Debug Info
@@ -558,13 +589,16 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags);
}
-void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
+void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
EHFrameSection = Ctx->getCOFFSection(
".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE,
SectionKind::getData());
- bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb;
+ // Set the `IMAGE_SCN_MEM_16BIT` flag when compiling for thumb mode. This is
+ // used to indicate to the linker that the text segment contains thumb instructions
+ // and to set the ISA selection bit for calls accordingly.
+ const bool IsThumb = T.getArch() == Triple::thumb;
CommDirectiveSupportsAlignment = true;
@@ -575,7 +609,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
SectionKind::getBSS());
TextSection = Ctx->getCOFFSection(
".text",
- (IsWoA ? COFF::IMAGE_SCN_MEM_16BIT : (COFF::SectionCharacteristics)0) |
+ (IsThumb ? COFF::IMAGE_SCN_MEM_16BIT : (COFF::SectionCharacteristics)0) |
COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getText());
@@ -623,9 +657,14 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
// Debug info.
COFFDebugSymbolsSection =
- Ctx->getCOFFSection(".debug$S", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ),
+ SectionKind::getMetadata());
+ COFFDebugTypesSection =
+ Ctx->getCOFFSection(".debug$T", (COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ),
SectionKind::getMetadata());
DwarfAbbrevSection = Ctx->getCOFFSection(
@@ -693,7 +732,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
".debug_macinfo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
+ SectionKind::getMetadata(), "debug_macinfo");
DwarfInfoDWOSection = Ctx->getCOFFSection(
".debug_info.dwo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -791,11 +830,10 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
SectionKind::getReadOnly());
}
-void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple,
- Reloc::Model relocm,
+void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
CodeModel::Model cm,
MCContext &ctx) {
- RelocM = relocm;
+ PositionIndependent = PIC;
CMModel = cm;
Ctx = &ctx;
@@ -842,12 +880,6 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple,
}
}
-void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model RM,
- CodeModel::Model CM,
- MCContext &ctx) {
- InitMCObjectFileInfo(Triple(TT), RM, CM, ctx);
-}
-
MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const {
return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP,
0, utostr(Hash));
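
With the Reloc::Model parameter replaced by a plain PIC flag, callers collapse their relocation model into a boolean before initializing the object-file info. A minimal sketch of an adapted call site, assuming TheTriple, RM, CM and Ctx are set up elsewhere:

    MCObjectFileInfo MOFI;
    const bool PIC = (RM == Reloc::PIC_); // RM: the caller's old Reloc::Model
    MOFI.InitMCObjectFileInfo(TheTriple, PIC, CM, Ctx);
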
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 972610ac8d6e..d2ac0f50261d 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -125,7 +126,8 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- MCLineEntry::Make(this, getCurrentSection().first);
+ MCCVLineEntry::Make(this);
+ MCDwarfLineEntry::Make(this, getCurrentSection().first);
// Avoid fixups when possible.
int64_t AbsValue;
@@ -232,7 +234,8 @@ void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
// Now that a machine instruction has been assembled into this section, make
// a line entry for any .loc directive that has been seen.
- MCLineEntry::Make(this, getCurrentSection().first);
+ MCCVLineEntry::Make(this);
+ MCDwarfLineEntry::Make(this, getCurrentSection().first);
// If this instruction doesn't need relaxation, just emit it as data.
MCAssembler &Assembler = getAssembler();
@@ -249,9 +252,9 @@ void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
if (Assembler.getRelaxAll() ||
(Assembler.isBundlingEnabled() && Sec->isBundleLocked())) {
MCInst Relaxed;
- getAssembler().getBackend().relaxInstruction(Inst, Relaxed);
+ getAssembler().getBackend().relaxInstruction(Inst, STI, Relaxed);
while (getAssembler().getBackend().mayNeedRelaxation(Relaxed))
- getAssembler().getBackend().relaxInstruction(Relaxed, Relaxed);
+ getAssembler().getBackend().relaxInstruction(Relaxed, STI, Relaxed);
EmitInstToData(Relaxed, STI);
return;
}
@@ -301,7 +304,7 @@ void MCObjectStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
StringRef FileName) {
// In case we see two .loc directives in a row, make sure the
// first one gets a line entry.
- MCLineEntry::Make(this, getCurrentSection().first);
+ MCDwarfLineEntry::Make(this, getCurrentSection().first);
this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
Isa, Discriminator, FileName);
@@ -362,8 +365,56 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
insert(new MCDwarfCallFrameFragment(*AddrDelta));
}
+void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+ unsigned Line, unsigned Column,
+ bool PrologueEnd, bool IsStmt,
+ StringRef FileName) {
+ // In case we see two .cv_loc directives in a row, make sure the
+ // first one gets a line entry.
+ MCCVLineEntry::Make(this);
+
+ this->MCStreamer::EmitCVLocDirective(FunctionId, FileNo, Line, Column,
+ PrologueEnd, IsStmt, FileName);
+}
+
+void MCObjectStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+ const MCSymbol *Begin,
+ const MCSymbol *End) {
+ getContext().getCVContext().emitLineTableForFunction(*this, FunctionId, Begin,
+ End);
+ this->MCStreamer::EmitCVLinetableDirective(FunctionId, Begin, End);
+}
+
+void MCObjectStreamer::EmitCVInlineLinetableDirective(
+ unsigned PrimaryFunctionId, unsigned SourceFileId, unsigned SourceLineNum,
+ const MCSymbol *FnStartSym, const MCSymbol *FnEndSym,
+ ArrayRef<unsigned> SecondaryFunctionIds) {
+ getContext().getCVContext().emitInlineLineTableForFunction(
+ *this, PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym,
+ FnEndSym, SecondaryFunctionIds);
+ this->MCStreamer::EmitCVInlineLinetableDirective(
+ PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym,
+ SecondaryFunctionIds);
+}
+
+void MCObjectStreamer::EmitCVDefRangeDirective(
+ ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
+ StringRef FixedSizePortion) {
+ getContext().getCVContext().emitDefRange(*this, Ranges, FixedSizePortion);
+ this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion);
+}
+
+void MCObjectStreamer::EmitCVStringTableDirective() {
+ getContext().getCVContext().emitStringTable(*this);
+}
+void MCObjectStreamer::EmitCVFileChecksumsDirective() {
+ getContext().getCVContext().emitFileChecksums(*this);
+}
+
+
void MCObjectStreamer::EmitBytes(StringRef Data) {
- MCLineEntry::Make(this, getCurrentSection().first);
+ MCCVLineEntry::Make(this);
+ MCDwarfLineEntry::Make(this, getCurrentSection().first);
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
DF->getContents().append(Data.begin(), Data.end());
@@ -420,13 +471,18 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
if (!Offset.evaluateAsAbsolute(OffsetValue))
llvm_unreachable("Offset is not absolute");
+ if (OffsetValue < 0)
+ llvm_unreachable("Offset is negative");
+
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- MCFixupKind Kind;
- if (!Assembler->getBackend().getFixupKind(Name, Kind))
+ Optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name);
+ if (!MaybeKind.hasValue())
return true;
+ MCFixupKind Kind = *MaybeKind;
+
if (Expr == nullptr)
Expr =
MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext());
@@ -434,11 +490,48 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
return false;
}
-void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
+void MCObjectStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
const MCSection *Sec = getCurrentSection().first;
+ (void)Sec;
assert(Sec && "need a section");
- unsigned ItemSize = Sec->isVirtualSection() ? 0 : 1;
- insert(new MCFillFragment(FillValue, ItemSize, NumBytes));
+ insert(new MCFillFragment(FillValue, NumBytes));
+}
+
+void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+ SMLoc Loc) {
+ MCDataFragment *DF = getOrCreateDataFragment();
+ flushPendingLabels(DF, DF->getContents().size());
+
+ int64_t IntNumBytes;
+ if (!NumBytes.evaluateAsAbsolute(IntNumBytes, getAssembler())) {
+ getContext().reportError(Loc, "expected absolute expression");
+ return;
+ }
+
+ if (IntNumBytes <= 0) {
+ getContext().reportError(Loc, "invalid number of bytes");
+ return;
+ }
+
+ emitFill(IntNumBytes, FillValue);
+}
+
+void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
+ int64_t Expr, SMLoc Loc) {
+ int64_t IntNumValues;
+ if (!NumValues.evaluateAsAbsolute(IntNumValues, getAssembler())) {
+ getContext().reportError(Loc, "expected absolute expression");
+ return;
+ }
+
+ if (IntNumValues < 0) {
+ getContext().getSourceManager()->PrintMessage(
+ Loc, SourceMgr::DK_Warning,
+ "'.fill' directive with negative repeat count has no effect");
+ return;
+ }
+
+ MCStreamer::emitFill(IntNumValues, Size, Expr);
}
void MCObjectStreamer::FinishImpl() {
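
The new emitFill overloads take the repeat count as an unevaluated MCExpr plus the directive's source location, so bad counts are reported through MCContext/SourceMgr rather than silently folded by the caller. A minimal sketch of the expression-based overload, assuming Ctx, Streamer and DirectiveLoc exist:

    // Emit 32 copies of the byte 0x90; a count that is not absolute or not
    // positive would instead be diagnosed at DirectiveLoc by the streamer.
    const MCExpr *Count = MCConstantExpr::create(32, Ctx);
    Streamer.emitFill(*Count, /*FillValue=*/0x90, DirectiveLoc);
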
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 36c192026856..d56071aea4df 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -23,7 +23,8 @@ using namespace llvm;
AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
CurPtr = nullptr;
- isAtStartOfLine = true;
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
}
@@ -46,24 +47,13 @@ void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
SetError(SMLoc::getFromPointer(Loc), Msg);
- return AsmToken(AsmToken::Error, StringRef(Loc, 0));
+ return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
}
int AsmLexer::getNextChar() {
- char CurChar = *CurPtr++;
- switch (CurChar) {
- default:
- return (unsigned char)CurChar;
- case 0:
- // A nul character in the stream is either the end of the current buffer or
- // a random nul in the file. Disambiguate that here.
- if (CurPtr - 1 != CurBuf.end())
- return 0; // Just whitespace.
-
- // Otherwise, return end of file.
- --CurPtr; // Another call to lex will return EOF again.
+ if (CurPtr == CurBuf.end())
return EOF;
- }
+ return (unsigned char)*CurPtr++;
}
/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
@@ -168,40 +158,53 @@ AsmToken AsmLexer::LexIdentifier() {
/// C-Style Comment: /* ... */
AsmToken AsmLexer::LexSlash() {
switch (*CurPtr) {
- case '*': break; // C style comment.
- case '/': return ++CurPtr, LexLineComment();
- default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
+ case '*':
+ IsAtStartOfStatement = false;
+ break; // C style comment.
+ case '/':
+ ++CurPtr;
+ return LexLineComment();
+ default:
+ IsAtStartOfStatement = false;
+ return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
}
// C Style comment.
++CurPtr; // skip the star.
- while (1) {
- int CurChar = getNextChar();
- switch (CurChar) {
- case EOF:
- return ReturnError(TokStart, "unterminated comment");
+ while (CurPtr != CurBuf.end()) {
+ switch (*CurPtr++) {
case '*':
// End of the comment?
- if (CurPtr[0] != '/') break;
-
+ if (*CurPtr != '/')
+ break;
++CurPtr; // End the */.
- return LexToken();
+ return AsmToken(AsmToken::Comment,
+ StringRef(TokStart, CurPtr - TokStart));
}
}
+ return ReturnError(TokStart, "unterminated comment");
}
/// LexLineComment: Comment: #[^\n]*
/// : //[^\n]*
AsmToken AsmLexer::LexLineComment() {
- // FIXME: This is broken if we happen to a comment at the end of a file, which
- // was .included, and which doesn't end with a newline.
+  // Mark this as an end of statement with a body of the comment. While it
+  // would be nicer to leave this as two tokens, backwards compatibility with
+  // TargetParsers makes keeping it in this form the better option.
int CurChar = getNextChar();
while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
CurChar = getNextChar();
- if (CurChar == EOF)
- return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
- return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
+ IsAtStartOfLine = true;
+  // If this is a whole-line comment, keep the newline in the returned token.
+ if (IsAtStartOfStatement)
+ return AsmToken(AsmToken::EndOfStatement,
+ StringRef(TokStart, CurPtr - TokStart));
+ IsAtStartOfStatement = true;
+
+ return AsmToken(AsmToken::EndOfStatement,
+ StringRef(TokStart, CurPtr - 1 - TokStart));
}
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
@@ -280,7 +283,7 @@ AsmToken AsmLexer::LexDigit() {
return intToken(Result, Value);
}
- if (*CurPtr == 'b') {
+ if ((*CurPtr == 'b') || (*CurPtr == 'B')) {
++CurPtr;
// See if we actually have "0b" as part of something like "jmp 0b\n"
if (!isdigit(CurPtr[0])) {
@@ -309,7 +312,7 @@ AsmToken AsmLexer::LexDigit() {
return intToken(Result, Value);
}
- if (*CurPtr == 'x') {
+ if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
++CurPtr;
const char *NumStart = CurPtr;
while (isxdigit(CurPtr[0]))
@@ -419,8 +422,7 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
while (!isAtStartOfComment(CurPtr) && // Start of line comment.
!isAtStatementSeparator(CurPtr) && // End of statement marker.
- *CurPtr != '\n' && *CurPtr != '\r' &&
- (*CurPtr != 0 || CurPtr != CurBuf.end())) {
+ *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
++CurPtr;
}
return StringRef(TokStart, CurPtr-TokStart);
@@ -429,8 +431,7 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
StringRef AsmLexer::LexUntilEndOfLine() {
TokStart = CurPtr;
- while (*CurPtr != '\n' && *CurPtr != '\r' &&
- (*CurPtr != 0 || CurPtr != CurBuf.end())) {
+ while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
++CurPtr;
}
return StringRef(TokStart, CurPtr-TokStart);
@@ -440,7 +441,8 @@ size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
bool ShouldSkipSpace) {
const char *SavedTokStart = TokStart;
const char *SavedCurPtr = CurPtr;
- bool SavedAtStartOfLine = isAtStartOfLine;
+ bool SavedAtStartOfLine = IsAtStartOfLine;
+ bool SavedAtStartOfStatement = IsAtStartOfStatement;
bool SavedSkipSpace = SkipSpace;
std::string SavedErr = getErr();
@@ -461,7 +463,8 @@ size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
SetError(SavedErrLoc, SavedErr);
SkipSpace = SavedSkipSpace;
- isAtStartOfLine = SavedAtStartOfLine;
+ IsAtStartOfLine = SavedAtStartOfLine;
+ IsAtStartOfStatement = SavedAtStartOfStatement;
CurPtr = SavedCurPtr;
TokStart = SavedTokStart;
@@ -491,29 +494,45 @@ AsmToken AsmLexer::LexToken() {
// This always consumes at least one character.
int CurChar = getNextChar();
- if (isAtStartOfComment(TokStart)) {
- // If this comment starts with a '#', then return the Hash token and let
- // the assembler parser see if it can be parsed as a cpp line filename
- // comment. We do this only if we are at the start of a line.
- if (CurChar == '#' && isAtStartOfLine)
- return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
- isAtStartOfLine = true;
+ if (CurChar == '#' && IsAtStartOfStatement) {
+    // If this starts with a '#', it may be a cpp hash directive;
+    // otherwise it is a line comment.
+ AsmToken TokenBuf[2];
+ MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
+ size_t num = peekTokens(Buf, true);
+    // There cannot be a space preceding this.
+ if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
+ TokenBuf[1].is(AsmToken::String)) {
+      CurPtr = TokStart; // reset CurPtr
+ StringRef s = LexUntilEndOfLine();
+ UnLex(TokenBuf[1]);
+ UnLex(TokenBuf[0]);
+ return AsmToken(AsmToken::HashDirective, s);
+ }
return LexLineComment();
}
+
+ if (isAtStartOfComment(TokStart))
+ return LexLineComment();
+
if (isAtStatementSeparator(TokStart)) {
CurPtr += strlen(MAI.getSeparatorString()) - 1;
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
return AsmToken(AsmToken::EndOfStatement,
StringRef(TokStart, strlen(MAI.getSeparatorString())));
}
// If we're missing a newline at EOF, make sure we still get an
// EndOfStatement token before the Eof token.
- if (CurChar == EOF && !isAtStartOfLine) {
- isAtStartOfLine = true;
+ if (CurChar == EOF && !IsAtStartOfStatement) {
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
}
-
- isAtStartOfLine = false;
+ IsAtStartOfLine = false;
+ bool OldIsAtStartOfStatement = IsAtStartOfStatement;
+ IsAtStartOfStatement = false;
switch (CurChar) {
default:
// Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
@@ -522,24 +541,24 @@ AsmToken AsmLexer::LexToken() {
// Unknown character, emit an error.
return ReturnError(TokStart, "invalid character in input");
- case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
+ case EOF:
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
+ return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
case 0:
case ' ':
case '\t':
- if (SkipSpace) {
- // Ignore whitespace.
- return LexToken();
- } else {
- int len = 1;
- while (*CurPtr==' ' || *CurPtr=='\t') {
- CurPtr++;
- len++;
- }
- return AsmToken(AsmToken::Space, StringRef(TokStart, len));
- }
- case '\n': // FALL THROUGH.
+ IsAtStartOfStatement = OldIsAtStartOfStatement;
+ while (*CurPtr == ' ' || *CurPtr == '\t')
+ CurPtr++;
+ if (SkipSpace)
+ return LexToken(); // Ignore whitespace.
+ else
+ return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
+ case '\n':
case '\r':
- isAtStartOfLine = true;
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
@@ -557,24 +576,34 @@ AsmToken AsmLexer::LexToken() {
case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
case '=':
- if (*CurPtr == '=')
- return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
+ if (*CurPtr == '=') {
+ ++CurPtr;
+ return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
+ }
return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
case '|':
- if (*CurPtr == '|')
- return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
+ if (*CurPtr == '|') {
+ ++CurPtr;
+ return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
+ }
return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
case '&':
- if (*CurPtr == '&')
- return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
+ if (*CurPtr == '&') {
+ ++CurPtr;
+ return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
+ }
return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
case '!':
- if (*CurPtr == '=')
- return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
+ if (*CurPtr == '=') {
+ ++CurPtr;
+ return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
+ }
return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
- case '/': return LexSlash();
+ case '/':
+ IsAtStartOfStatement = OldIsAtStartOfStatement;
+ return LexSlash();
case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
case '\'': return LexSingleQuote();
case '"': return LexQuote();
@@ -583,21 +612,28 @@ AsmToken AsmLexer::LexToken() {
return LexDigit();
case '<':
switch (*CurPtr) {
- case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
- StringRef(TokStart, 2));
- case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
- StringRef(TokStart, 2));
- case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
- StringRef(TokStart, 2));
- default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
+ case '<':
+ ++CurPtr;
+ return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
+ case '=':
+ ++CurPtr;
+ return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
+ case '>':
+ ++CurPtr;
+ return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
+ default:
+ return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
}
case '>':
switch (*CurPtr) {
- case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
- StringRef(TokStart, 2));
- case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
- StringRef(TokStart, 2));
- default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
+ case '>':
+ ++CurPtr;
+ return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
+ case '=':
+ ++CurPtr;
+ return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
+ default:
+ return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
}
// TODO: Quoted identifiers (objc methods etc)
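
Taken together, the lexer changes above surface comments and cpp line markers as real tokens (AsmToken::Comment, AsmToken::HashDirective) and track the start of a statement separately from the start of a line. A minimal sketch of a client observing the new token kinds, assuming MAI and Source are set up elsewhere:

    AsmLexer Lexer(MAI);
    Lexer.setBuffer(Source);
    for (AsmToken Tok = Lexer.Lex(); Tok.isNot(AsmToken::Eof); Tok = Lexer.Lex()) {
      if (Tok.is(AsmToken::Comment)) {
        // The body of a C-style comment, now returned instead of skipped.
      } else if (Tok.is(AsmToken::HashDirective)) {
        // A whole '# <line> "<file>"' marker, re-lexed later by the parser.
      }
    }
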
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 646cbb43cae8..1548aee84227 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -28,13 +28,12 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -42,7 +41,6 @@
#include "llvm/Support/raw_ostream.h"
#include <cctype>
#include <deque>
-#include <set>
#include <string>
#include <vector>
using namespace llvm;
@@ -156,10 +154,17 @@ private:
unsigned HadError : 1;
/// The values from the last parsed cpp hash file line comment if any.
- StringRef CppHashFilename;
- int64_t CppHashLineNumber;
- SMLoc CppHashLoc;
- unsigned CppHashBuf;
+ struct CppHashInfoTy {
+ StringRef Filename;
+ int64_t LineNumber = 0;
+ SMLoc Loc;
+ unsigned Buf = 0;
+ };
+ CppHashInfoTy CppHashInfo;
+
+ /// \brief List of forward directional labels for diagnosis at the end.
+ SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;
+
/// When generating dwarf for assembly source files we need to calculate the
/// logical line number based on the last parsed cpp hash file line comment
/// and current line. Since this is slow and messes up the SourceMgr's
@@ -245,13 +250,36 @@ public:
void eatToEndOfStatement() override;
void checkForValidSection() override;
+
+ bool getTokenLoc(SMLoc &Loc) {
+ Loc = getTok().getLoc();
+ return false;
+ }
+
+ /// parseToken - If current token has the specified kind, eat it and
+ /// return success. Otherwise, emit the specified error and return failure.
+ bool parseToken(AsmToken::TokenKind T, const Twine &ErrMsg) {
+ if (getTok().getKind() != T)
+ return TokError(ErrMsg);
+ Lex();
+ return false;
+ }
+
+ bool parseIntToken(int64_t &V, const Twine &ErrMsg) {
+ if (getTok().getKind() != AsmToken::Integer)
+ return TokError(ErrMsg);
+ V = getTok().getIntVal();
+ Lex();
+ return false;
+ }
+
/// }
private:
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
- void eatToEndOfLine();
+ bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
bool parseCppHashLineFilenameComment(SMLoc L);
void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
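
The parseToken, parseIntToken and check helpers introduced here all return true on failure, which lets a directive parser chain its checks with '||' and bail out once; that is the pattern used throughout the rewritten directives below. A minimal sketch of the idiom inside a hypothetical directive parser (the messages are illustrative):

    int64_t FileNumber;
    if (parseIntToken(FileNumber, "expected file number") ||
        check(getTok().isNot(AsmToken::String), "expected filename string") ||
        parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
      return true;
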
@@ -303,6 +331,18 @@ private:
}
static void DiagHandler(const SMDiagnostic &Diag, void *Context);
+ bool check(bool P, SMLoc Loc, const Twine &Msg) {
+ if (P)
+ return Error(Loc, Msg);
+ return false;
+ }
+
+ bool check(bool P, const Twine &Msg) {
+ if (P)
+ return TokError(Msg);
+ return false;
+ }
+
/// \brief Enter the specified file. This returns true on failure.
bool enterIncludeFile(const std::string &Filename);
@@ -349,14 +389,16 @@ private:
DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR,
DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK,
DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL,
- DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN,
- DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
+ DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER,
+ DK_PRIVATE_EXTERN, DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
DK_INCLUDE, DK_INCBIN, DK_CODE16, DK_CODE16GCC, DK_REPT, DK_IRP, DK_IRPC,
DK_IF, DK_IFEQ, DK_IFGE, DK_IFGT, DK_IFLE, DK_IFLT, DK_IFNE, DK_IFB,
DK_IFNB, DK_IFC, DK_IFEQS, DK_IFNC, DK_IFNES, DK_IFDEF, DK_IFNDEF,
DK_IFNOTDEF, DK_ELSEIF, DK_ELSE, DK_ENDIF,
DK_SPACE, DK_SKIP, DK_FILE, DK_LINE, DK_LOC, DK_STABS,
+ DK_CV_FILE, DK_CV_LOC, DK_CV_LINETABLE, DK_CV_INLINE_LINETABLE,
+ DK_CV_DEF_RANGE, DK_CV_STRINGTABLE, DK_CV_FILECHECKSUMS,
DK_CFI_SECTIONS, DK_CFI_STARTPROC, DK_CFI_ENDPROC, DK_CFI_DEF_CFA,
DK_CFI_DEF_CFA_OFFSET, DK_CFI_ADJUST_CFA_OFFSET, DK_CFI_DEF_CFA_REGISTER,
DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA,
@@ -394,6 +436,16 @@ private:
bool parseDirectiveLoc();
bool parseDirectiveStabs();
+ // ".cv_file", ".cv_loc", ".cv_linetable", "cv_inline_linetable",
+ // ".cv_def_range"
+ bool parseDirectiveCVFile();
+ bool parseDirectiveCVLoc();
+ bool parseDirectiveCVLinetable();
+ bool parseDirectiveCVInlineLinetable();
+ bool parseDirectiveCVDefRange();
+ bool parseDirectiveCVStringTable();
+ bool parseDirectiveCVFileChecksums();
+
// .cfi directives
bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
bool parseDirectiveCFIWindowSave();
@@ -506,7 +558,7 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI)
: Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
PlatformParser(nullptr), CurBuffer(SM.getMainFileID()),
- MacrosEnabledFlag(true), HadError(false), CppHashLineNumber(0),
+ MacrosEnabledFlag(true), HadError(false), CppHashInfo(),
AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
@@ -606,20 +658,36 @@ void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
}
const AsmToken &AsmParser::Lex() {
+ if (Lexer.getTok().is(AsmToken::Error))
+ Error(Lexer.getErrLoc(), Lexer.getErr());
+
+  // If the current token is an end of statement with a comment in it:
+ if (getTok().is(AsmToken::EndOfStatement)) {
+    // If this is a line comment, output it.
+ if (getTok().getString().front() != '\n' &&
+ getTok().getString().front() != '\r' && MAI.preserveAsmComments())
+ Out.addExplicitComment(Twine(getTok().getString()));
+ }
+
const AsmToken *tok = &Lexer.Lex();
+  // Consume comment tokens here; their output is deferred until the end of
+  // the next statement.
+ while (tok->is(AsmToken::Comment)) {
+ if (MAI.preserveAsmComments())
+ Out.addExplicitComment(Twine(tok->getString()));
+ tok = &Lexer.Lex();
+ }
+
if (tok->is(AsmToken::Eof)) {
// If this is the end of an included file, pop the parent file off the
// include stack.
SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
if (ParentIncludeLoc != SMLoc()) {
jumpToLoc(ParentIncludeLoc);
- tok = &Lexer.Lex();
+ return Lex();
}
}
- if (tok->is(AsmToken::Error))
- Error(Lexer.getErrLoc(), Lexer.getErr());
return *tok;
}
@@ -657,6 +725,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
if (!parseStatement(Info, nullptr))
continue;
+    // If we failed on an Error token but did not consume it in favor of a
+    // better message, emit it now.
+ if (Lexer.getTok().is(AsmToken::Error)) {
+ Lex();
+ }
+
// We had an error, validate that one was emitted and recover by skipping to
// the next line.
assert(HadError && "Parse statement returned an error, but none emitted!");
@@ -683,18 +757,32 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Targets that don't do subsections via symbols may not want this, though,
// so conservatively exclude them. Only do this if we're finalizing, though,
// as otherwise we won't necessarilly have seen everything yet.
- if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
- for (const auto &TableEntry : getContext().getSymbols()) {
- MCSymbol *Sym = TableEntry.getValue();
- // Variable symbols may not be marked as defined, so check those
- // explicitly. If we know it's a variable, we have a definition for
- // the purposes of this check.
- if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
- // FIXME: We would really like to refer back to where the symbol was
- // first referenced for a source location. We need to add something
- // to track that. Currently, we just point to the end of the file.
- return Error(getLexer().getLoc(), "assembler local symbol '" +
- Sym->getName() + "' not defined");
+ if (!NoFinalize) {
+ if (MAI.hasSubsectionsViaSymbols()) {
+ for (const auto &TableEntry : getContext().getSymbols()) {
+ MCSymbol *Sym = TableEntry.getValue();
+ // Variable symbols may not be marked as defined, so check those
+ // explicitly. If we know it's a variable, we have a definition for
+ // the purposes of this check.
+ if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
+ // FIXME: We would really like to refer back to where the symbol was
+ // first referenced for a source location. We need to add something
+ // to track that. Currently, we just point to the end of the file.
+ HadError |=
+ Error(getTok().getLoc(), "assembler local symbol '" +
+ Sym->getName() + "' not defined");
+ }
+ }
+
+ // Temporary symbols like the ones for directional jumps don't go in the
+ // symbol table. They also need to be diagnosed in all (final) cases.
+ for (std::tuple<SMLoc, CppHashInfoTy, MCSymbol *> &LocSym : DirLabels) {
+ if (std::get<2>(LocSym)->isUndefined()) {
+ // Reset the state of any "# line file" directives we've seen to the
+ // context as it was at the diagnostic site.
+ CppHashInfo = std::get<1>(LocSym);
+ HadError |= Error(std::get<0>(LocSym), "directional label undefined");
+ }
}
}
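
Directional labels are now recorded together with the cpp-hash context at their use site and checked after the whole input has been parsed, so an unresolved forward reference no longer slips through. A hypothetical input that the new end-of-parse walk over DirLabels diagnoses:

    // '1f' refers forward to a local label '1:' that is never defined, so the
    // walk above reports "directional label undefined" at the use site.
    const char *Asm = "  jmp 1f\n"; // no matching "1:" anywhere in the file
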
@@ -716,18 +804,18 @@ void AsmParser::checkForValidSection() {
/// \brief Throw away the rest of the line for testing purposes.
void AsmParser::eatToEndOfStatement() {
while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
- Lex();
+ Lexer.Lex();
// Eat EOL.
if (Lexer.is(AsmToken::EndOfStatement))
- Lex();
+ Lexer.Lex();
}
StringRef AsmParser::parseStringToEndOfStatement() {
const char *Start = getTok().getLoc().getPointer();
while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
- Lex();
+ Lexer.Lex();
const char *End = getTok().getLoc().getPointer();
return StringRef(Start, End - Start);
@@ -738,7 +826,7 @@ StringRef AsmParser::parseStringToComma() {
while (Lexer.isNot(AsmToken::EndOfStatement) &&
Lexer.isNot(AsmToken::Comma) && Lexer.isNot(AsmToken::Eof))
- Lex();
+ Lexer.Lex();
const char *End = getTok().getLoc().getPointer();
return StringRef(Start, End - Start);
@@ -767,10 +855,9 @@ bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
if (parseExpression(Res))
return true;
- if (Lexer.isNot(AsmToken::RBrac))
- return TokError("expected ']' in brackets expression");
- EndLoc = Lexer.getTok().getEndLoc();
- Lex();
+ EndLoc = getTok().getEndLoc();
+ if (parseToken(AsmToken::RBrac, "expected ']' in brackets expression"))
+ return true;
return false;
}
@@ -820,7 +907,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
if (!MAI.useParensForSymbolVariant()) {
if (FirstTokenKind == AsmToken::String) {
if (Lexer.is(AsmToken::At)) {
- Lexer.Lex(); // eat @
+ Lex(); // eat @
SMLoc AtLoc = getLexer().getLoc();
StringRef VName;
if (parseIdentifier(VName))
@@ -832,14 +919,13 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Split = Identifier.split('@');
}
} else if (Lexer.is(AsmToken::LParen)) {
- Lexer.Lex(); // eat (
+ Lex(); // eat '('.
StringRef VName;
parseIdentifier(VName);
- if (Lexer.isNot(AsmToken::RParen)) {
- return Error(Lexer.getTok().getLoc(),
- "unexpected token in variant, expected ')'");
- }
- Lexer.Lex(); // eat )
+ // eat ')'.
+ if (parseToken(AsmToken::RParen,
+ "unexpected token in variant, expected ')'"))
+ return true;
Split = std::make_pair(Identifier, VName);
}
@@ -904,7 +990,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b");
Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
if (IDVal == "b" && Sym->isUndefined())
- return Error(Loc, "invalid reference to undefined symbol");
+ return Error(Loc, "directional label undefined");
+ DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym));
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat identifier.
}
@@ -1082,10 +1169,10 @@ bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
// We don't Lex() the last RParen.
// This is the same behavior as parseParenExpression().
if (ParenDepth - 1 > 0) {
- if (Lexer.isNot(AsmToken::RParen))
- return TokError("expected ')' in parentheses expression");
- EndLoc = Lexer.getTok().getEndLoc();
- Lex();
+ EndLoc = getTok().getEndLoc();
+ if (parseToken(AsmToken::RParen,
+ "expected ')' in parentheses expression"))
+ return true;
}
}
return false;
@@ -1303,21 +1390,24 @@ bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
/// ::= Label* Identifier OperandList* EndOfStatement
bool AsmParser::parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI) {
+ // Eat initial spaces and comments
+ while (Lexer.is(AsmToken::Space))
+ Lex();
if (Lexer.is(AsmToken::EndOfStatement)) {
- Out.AddBlankLine();
+    // If this is a line comment, we can drop it safely.
+ if (getTok().getString().front() == '\r' ||
+ getTok().getString().front() == '\n')
+ Out.AddBlankLine();
Lex();
return false;
}
-
- // Statements always start with an identifier or are a full line comment.
+ // Statements always start with an identifier.
AsmToken ID = getTok();
SMLoc IDLoc = ID.getLoc();
StringRef IDVal;
int64_t LocalLabelVal = -1;
- // A full line comment is a '#' as the first token.
- if (Lexer.is(AsmToken::Hash))
+ if (Lexer.is(AsmToken::HashDirective))
return parseCppHashLineFilenameComment(IDLoc);
-
// Allow an integer followed by a ':' as a directional local label.
if (Lexer.is(AsmToken::Integer)) {
LocalLabelVal = getTok().getIntVal();
@@ -1444,6 +1534,12 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
if (!Sym->isUndefined() || Sym->isVariable())
return Error(IDLoc, "invalid symbol redefinition");
+ // Consume any end of statement token, if present, to avoid spurious
+  // AddBlankLine() calls.
+ if (getTok().is(AsmToken::EndOfStatement)) {
+ Lex();
+ }
+
// Emit the label.
if (!ParsingInlineAsm)
Out.EmitLabel(Sym);
@@ -1456,13 +1552,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
getTargetParser().onLabelParsed(Sym);
- // Consume any end of statement token, if present, to avoid spurious
- // AddBlankLine calls().
- if (Lexer.is(AsmToken::EndOfStatement)) {
- Lex();
- if (Lexer.is(AsmToken::Eof))
- return false;
- }
+
return false;
}
@@ -1608,7 +1698,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
return parseDirectiveIncbin();
case DK_CODE16:
case DK_CODE16GCC:
- return TokError(Twine(IDVal) + " not supported yet");
+ return TokError(Twine(IDVal) +
+ " not currently supported for this target");
case DK_REPT:
return parseDirectiveRept(IDLoc, IDVal);
case DK_IRP:
@@ -1638,6 +1729,20 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
return parseDirectiveLoc();
case DK_STABS:
return parseDirectiveStabs();
+ case DK_CV_FILE:
+ return parseDirectiveCVFile();
+ case DK_CV_LOC:
+ return parseDirectiveCVLoc();
+ case DK_CV_LINETABLE:
+ return parseDirectiveCVLinetable();
+ case DK_CV_INLINE_LINETABLE:
+ return parseDirectiveCVInlineLinetable();
+ case DK_CV_DEF_RANGE:
+ return parseDirectiveCVDefRange();
+ case DK_CV_STRINGTABLE:
+ return parseDirectiveCVStringTable();
+ case DK_CV_FILECHECKSUMS:
+ return parseDirectiveCVFileChecksums();
case DK_CFI_SECTIONS:
return parseDirectiveCFISections();
case DK_CFI_STARTPROC:
@@ -1755,24 +1860,26 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If we previously parsed a cpp hash file line comment then make sure the
// current Dwarf File is for the CppHashFilename if not then emit the
// Dwarf File table for it and adjust the line number for the .loc.
- if (CppHashFilename.size()) {
+ if (CppHashInfo.Filename.size()) {
unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
- 0, StringRef(), CppHashFilename);
+ 0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
// Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
// cache with the different Loc from the call above we save the last
// info we queried here with SrcMgr.FindLineNumber().
unsigned CppHashLocLineNo;
- if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
+ if (LastQueryIDLoc == CppHashInfo.Loc &&
+ LastQueryBuffer == CppHashInfo.Buf)
CppHashLocLineNo = LastQueryLine;
else {
- CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
+ CppHashLocLineNo =
+ SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
LastQueryLine = CppHashLocLineNo;
- LastQueryIDLoc = CppHashLoc;
- LastQueryBuffer = CppHashBuf;
+ LastQueryIDLoc = CppHashInfo.Loc;
+ LastQueryBuffer = CppHashInfo.Buf;
}
- Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+ Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
}
getStreamer().EmitDwarfLocDirective(
@@ -1794,48 +1901,46 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
return false;
}
-/// eatToEndOfLine uses the Lexer to eat the characters to the end of the line
-/// since they may not be able to be tokenized to get to the end of line token.
-void AsmParser::eatToEndOfLine() {
- if (!Lexer.is(AsmToken::EndOfStatement))
- Lexer.LexUntilEndOfLine();
- // Eat EOL.
- Lex();
+// Parse and erase curly braces marking block start/end
+bool
+AsmParser::parseCurlyBlockScope(SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
+ // Identify curly brace marking block start/end
+ if (Lexer.isNot(AsmToken::LCurly) && Lexer.isNot(AsmToken::RCurly))
+ return false;
+
+ SMLoc StartLoc = Lexer.getLoc();
+ Lex(); // Eat the brace
+ if (Lexer.is(AsmToken::EndOfStatement))
+ Lex(); // Eat EndOfStatement following the brace
+
+ // Erase the block start/end brace from the output asm string
+ AsmStrRewrites.emplace_back(AOK_Skip, StartLoc, Lexer.getLoc().getPointer() -
+ StartLoc.getPointer());
+ return true;
}
/// parseCppHashLineFilenameComment as this:
/// ::= # number "filename"
-/// or just as a full line comment if it doesn't have a number and a string.
bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
Lex(); // Eat the hash token.
-
- if (getLexer().isNot(AsmToken::Integer)) {
- // Consume the line since in cases it is not a well-formed line directive,
- // as if were simply a full line comment.
- eatToEndOfLine();
- return false;
- }
-
+  // The lexer only ever emits a HashDirective token when it is fully formed,
+  // and it has already done the checking, so anything else here is an
+  // internal error.
+ assert(getTok().is(AsmToken::Integer) &&
+ "Lexing Cpp line comment: Expected Integer");
int64_t LineNumber = getTok().getIntVal();
Lex();
-
- if (getLexer().isNot(AsmToken::String)) {
- eatToEndOfLine();
- return false;
- }
-
+ assert(getTok().is(AsmToken::String) &&
+ "Lexing Cpp line comment: Expected String");
StringRef Filename = getTok().getString();
+ Lex();
// Get rid of the enclosing quotes.
Filename = Filename.substr(1, Filename.size() - 2);
// Save the SMLoc, Filename and LineNumber for later use by diagnostics.
- CppHashLoc = L;
- CppHashFilename = Filename;
- CppHashLineNumber = LineNumber;
- CppHashBuf = CurBuffer;
-
- // Ignore any trailing characters, they're just comment.
- eatToEndOfLine();
+ CppHashInfo.Loc = L;
+ CppHashInfo.Filename = Filename;
+ CppHashInfo.LineNumber = LineNumber;
+ CppHashInfo.Buf = CurBuffer;
return false;
}
@@ -1849,7 +1954,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
SMLoc DiagLoc = Diag.getLoc();
unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
unsigned CppHashBuf =
- Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
+ Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashInfo.Loc);
// Like SourceMgr::printMessage() we need to print the include stack if any
// before printing the message.
@@ -1863,7 +1968,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
// If we have not parsed a cpp hash line filename comment or the source
// manager changed or buffer changed (like in a nested include) then just
// print the normal diagnostic using its Filename and LineNo.
- if (!Parser->CppHashLineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
+ if (!Parser->CppHashInfo.LineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
DiagBuf != CppHashBuf) {
if (Parser->SavedDiagHandler)
Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
@@ -1873,15 +1978,15 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
}
// Use the CppHashFilename and calculate a line number based on the
- // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
- // the diagnostic.
- const std::string &Filename = Parser->CppHashFilename;
+ // CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
+ // for the diagnostic.
+ const std::string &Filename = Parser->CppHashInfo.Filename;
int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
int CppHashLocLineNo =
- Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
+ Parser->SrcMgr.FindLineNumber(Parser->CppHashInfo.Loc, CppHashBuf);
int LineNo =
- Parser->CppHashLineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
+ Parser->CppHashInfo.LineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
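
A worked example of the remapping above, with illustrative numbers: if the '# 100 "foo.c"' marker was lexed at assembly-buffer line 10 and the diagnostic lands on buffer line 14, it is reported against foo.c line 103:

    // CppHashInfo.LineNumber = 100, CppHashLocLineNo = 10, DiagLocLineNo = 14
    int LineNo = 100 - 1 + (14 - 10); // == 103, reported as foo.c:103
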
@@ -2041,7 +2146,6 @@ static bool isOperator(AsmToken::TokenKind kind) {
case AsmToken::AmpAmp:
case AsmToken::Exclaim:
case AsmToken::ExclaimEqual:
- case AsmToken::Percent:
case AsmToken::Less:
case AsmToken::LessEqual:
case AsmToken::LessLess:
@@ -2080,37 +2184,44 @@ bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {
}
unsigned ParenLevel = 0;
- unsigned AddTokens = 0;
// Darwin doesn't use spaces to delmit arguments.
AsmLexerSkipSpaceRAII ScopedSkipSpace(Lexer, IsDarwin);
+ bool SpaceEaten;
+
for (;;) {
+ SpaceEaten = false;
if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
return TokError("unexpected token in macro instantiation");
- if (ParenLevel == 0 && Lexer.is(AsmToken::Comma))
- break;
+ if (ParenLevel == 0) {
+
+ if (Lexer.is(AsmToken::Comma))
+ break;
- if (Lexer.is(AsmToken::Space)) {
- Lex(); // Eat spaces
+ if (Lexer.is(AsmToken::Space)) {
+ SpaceEaten = true;
+ Lexer.Lex(); // Eat spaces
+ }
// Spaces can delimit parameters, but could also be part an expression.
// If the token after a space is an operator, add the token and the next
// one into this argument
if (!IsDarwin) {
if (isOperator(Lexer.getKind())) {
- // Check to see whether the token is used as an operator,
- // or part of an identifier
- const char *NextChar = getTok().getEndLoc().getPointer();
- if (*NextChar == ' ')
- AddTokens = 2;
- }
+ MA.push_back(getTok());
+ Lexer.Lex();
- if (!AddTokens && ParenLevel == 0) {
- break;
+ // Whitespace after an operator can be ignored.
+ if (Lexer.is(AsmToken::Space))
+ Lexer.Lex();
+
+ continue;
}
}
+ if (SpaceEaten)
+ break;
}
// handleMacroEntry relies on not advancing the lexer here
@@ -2126,9 +2237,7 @@ bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {
// Append the token to the current argument list.
MA.push_back(getTok());
- if (AddTokens)
- AddTokens--;
- Lex();
+ Lexer.Lex();
}
if (ParenLevel != 0)
@@ -2162,7 +2271,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
return true;
}
- if (!Lexer.is(AsmToken::Equal)) {
+ if (Lexer.isNot(AsmToken::Equal)) {
TokError("expected '=' after formal parameter identifier");
eatToEndOfStatement();
return true;
@@ -2190,7 +2299,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
break;
if (FAI >= NParameters) {
- assert(M && "expected macro to be defined");
+ assert(M && "expected macro to be defined");
Error(IDLoc,
"parameter named '" + FA.Name + "' does not exist for macro '" +
M->Name + "'");
@@ -2337,7 +2446,7 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
SMLoc PrefixLoc = getLexer().getLoc();
// Consume the prefix character, and check for a following identifier.
- Lex();
+ Lexer.Lex(); // Lexer's Lex guarantees consecutive token.
if (Lexer.isNot(AsmToken::Identifier))
return true;
@@ -2348,7 +2457,7 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
// Construct the joined identifier and consume the token.
Res =
StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
- Lex();
+ Lex(); // Parser Lex to maintain invariants.
return false;
}
@@ -2369,12 +2478,10 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
StringRef Name;
- if (parseIdentifier(Name))
- return TokError("expected identifier after '" + Twine(IDVal) + "'");
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '" + Twine(IDVal) + "'");
- Lex();
+ if (check(parseIdentifier(Name),
+ "expected identifier after '" + Twine(IDVal) + "'") ||
+ parseToken(AsmToken::Comma, "unexpected token in '" + Twine(IDVal) + "'"))
+ return true;
return parseAssignment(Name, allow_redef, true);
}
@@ -2434,6 +2541,7 @@ bool AsmParser::parseEscapedString(std::string &Data) {
}
}
+ Lex();
return false;
}
@@ -2444,25 +2552,22 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
checkForValidSection();
for (;;) {
- if (getLexer().isNot(AsmToken::String))
- return TokError("expected string in '" + Twine(IDVal) + "' directive");
-
std::string Data;
- if (parseEscapedString(Data))
+ if (check(getTok().isNot(AsmToken::String),
+ "expected string in '" + Twine(IDVal) + "' directive") ||
+ parseEscapedString(Data))
return true;
getStreamer().EmitBytes(Data);
if (ZeroTerminated)
getStreamer().EmitBytes(StringRef("\0", 1));
- Lex();
-
if (getLexer().is(AsmToken::EndOfStatement))
break;
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
- Lex();
+ if (parseToken(AsmToken::Comma,
+ "unexpected token in '" + Twine(IDVal) + "' directive"))
+ return true;
}
}
@@ -2482,21 +2587,19 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
// We can only deal with constant expressions at the moment.
int64_t OffsetValue;
- if (!Offset->evaluateAsAbsolute(OffsetValue))
- return Error(OffsetLoc, "expression is not a constant value");
-
- if (Lexer.isNot(AsmToken::Comma))
- return TokError("expected comma");
- Lexer.Lex();
+ if (check(!Offset->evaluateAsAbsolute(OffsetValue), OffsetLoc,
+ "expression is not a constant value") ||
+ check(OffsetValue < 0, OffsetLoc, "expression is negative") ||
+ parseToken(AsmToken::Comma, "expected comma") ||
+ check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
+ return true;
- if (Lexer.isNot(AsmToken::Identifier))
- return TokError("expected relocation name");
SMLoc NameLoc = Lexer.getTok().getLoc();
StringRef Name = Lexer.getTok().getIdentifier();
- Lexer.Lex();
+ Lex();
if (Lexer.is(AsmToken::Comma)) {
- Lexer.Lex();
+ Lex();
SMLoc ExprLoc = Lexer.getLoc();
if (parseExpression(Expr))
return true;
@@ -2506,12 +2609,11 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
return Error(ExprLoc, "expression must be relocatable");
}
- if (Lexer.isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in .reloc directive");
-
- if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc))
- return Error(NameLoc, "unknown relocation name");
-
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in .reloc directive") ||
+ check(getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc),
+ NameLoc, "unknown relocation name"))
+ return true;
return false;
}
@@ -2541,9 +2643,8 @@ bool AsmParser::parseDirectiveValue(unsigned Size) {
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
}
}
@@ -2558,10 +2659,9 @@ bool AsmParser::parseDirectiveOctaValue() {
checkForValidSection();
for (;;) {
- if (Lexer.getKind() == AsmToken::Error)
+ if (getTok().is(AsmToken::Error))
return true;
- if (Lexer.getKind() != AsmToken::Integer &&
- Lexer.getKind() != AsmToken::BigNum)
+ if (getTok().isNot(AsmToken::Integer) && getTok().isNot(AsmToken::BigNum))
return TokError("unknown token in expression");
SMLoc ExprLoc = getLexer().getLoc();
@@ -2591,9 +2691,8 @@ bool AsmParser::parseDirectiveOctaValue() {
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
}
}
@@ -2612,14 +2711,15 @@ bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
// have to manually parse unary prefixes.
bool IsNeg = false;
if (getLexer().is(AsmToken::Minus)) {
- Lex();
+ Lexer.Lex();
IsNeg = true;
} else if (getLexer().is(AsmToken::Plus))
- Lex();
+ Lexer.Lex();
- if (getLexer().isNot(AsmToken::Integer) &&
- getLexer().isNot(AsmToken::Real) &&
- getLexer().isNot(AsmToken::Identifier))
+ if (Lexer.is(AsmToken::Error))
+ return TokError(Lexer.getErr());
+ if (Lexer.isNot(AsmToken::Integer) && Lexer.isNot(AsmToken::Real) &&
+ Lexer.isNot(AsmToken::Identifier))
return TokError("unexpected token in directive");
// Convert to an APFloat.
@@ -2646,12 +2746,11 @@ bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
getStreamer().EmitIntValue(AsInt.getLimitedValue(),
AsInt.getBitWidth() / 8);
- if (getLexer().is(AsmToken::EndOfStatement))
+ if (Lexer.is(AsmToken::EndOfStatement))
break;
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
}
}
@@ -2664,8 +2763,9 @@ bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
bool AsmParser::parseDirectiveZero() {
checkForValidSection();
- int64_t NumBytes;
- if (parseAbsoluteExpression(NumBytes))
+ SMLoc NumBytesLoc = Lexer.getLoc();
+ const MCExpr *NumBytes;
+ if (parseExpression(NumBytes))
return true;
int64_t Val = 0;
@@ -2675,12 +2775,10 @@ bool AsmParser::parseDirectiveZero() {
return true;
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.zero' directive");
-
- Lex();
-
- getStreamer().EmitFill(NumBytes, Val);
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.zero' directive"))
+ return true;
+ getStreamer().emitFill(*NumBytes, Val, NumBytesLoc);
return false;
}
@@ -2690,49 +2788,34 @@ bool AsmParser::parseDirectiveZero() {
bool AsmParser::parseDirectiveFill() {
checkForValidSection();
- SMLoc RepeatLoc = getLexer().getLoc();
- int64_t NumValues;
- if (parseAbsoluteExpression(NumValues))
+ SMLoc NumValuesLoc = Lexer.getLoc();
+ const MCExpr *NumValues;
+ if (parseExpression(NumValues))
return true;
- if (NumValues < 0) {
- Warning(RepeatLoc,
- "'.fill' directive with negative repeat count has no effect");
- NumValues = 0;
- }
-
int64_t FillSize = 1;
int64_t FillExpr = 0;
SMLoc SizeLoc, ExprLoc;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.fill' directive");
- Lex();
- SizeLoc = getLexer().getLoc();
- if (parseAbsoluteExpression(FillSize))
+ if (parseToken(AsmToken::Comma, "unexpected token in '.fill' directive") ||
+ getTokenLoc(SizeLoc) || parseAbsoluteExpression(FillSize))
return true;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.fill' directive");
- Lex();
-
- ExprLoc = getLexer().getLoc();
- if (parseAbsoluteExpression(FillExpr))
+ if (parseToken(AsmToken::Comma,
+ "unexpected token in '.fill' directive") ||
+ getTokenLoc(ExprLoc) || parseAbsoluteExpression(FillExpr) ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.fill' directive"))
return true;
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.fill' directive");
-
- Lex();
}
}
if (FillSize < 0) {
Warning(SizeLoc, "'.fill' directive with negative size has no effect");
- NumValues = 0;
+ return false;
}
if (FillSize > 8) {
Warning(SizeLoc, "'.fill' directive with size greater than 8 has been truncated to 8");
@@ -2742,15 +2825,7 @@ bool AsmParser::parseDirectiveFill() {
if (!isUInt<32>(FillExpr) && FillSize > 4)
Warning(ExprLoc, "'.fill' directive pattern has been truncated to 32-bits");
- if (NumValues > 0) {
- int64_t NonZeroFillSize = FillSize > 4 ? 4 : FillSize;
- FillExpr &= ~0ULL >> (64 - NonZeroFillSize * 8);
- for (uint64_t i = 0, e = NumValues; i != e; ++i) {
- getStreamer().EmitIntValue(FillExpr, NonZeroFillSize);
- if (NonZeroFillSize < FillSize)
- getStreamer().EmitIntValue(0, FillSize - NonZeroFillSize);
- }
- }
+ getStreamer().emitFill(*NumValues, FillSize, FillExpr, NumValuesLoc);
return false;
}
@@ -2767,18 +2842,15 @@ bool AsmParser::parseDirectiveOrg() {
// Parse optional fill expression.
int64_t FillExpr = 0;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.org' directive");
- Lex();
-
- if (parseAbsoluteExpression(FillExpr))
+ if (parseToken(AsmToken::Comma, "unexpected token in '.org' directive") ||
+ parseAbsoluteExpression(FillExpr))
return true;
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.org' directive");
}
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.org' directive"))
+ return true;
+
getStreamer().emitValueToOffset(Offset, FillExpr);
return false;
}
@@ -2798,34 +2870,27 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
int64_t FillExpr = 0;
int64_t MaxBytesToFill = 0;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
// The fill expression can be omitted while specifying a maximum number of
// alignment bytes, e.g:
// .align 3,,4
- if (getLexer().isNot(AsmToken::Comma)) {
+ if (getTok().isNot(AsmToken::Comma)) {
HasFillExpr = true;
if (parseAbsoluteExpression(FillExpr))
return true;
}
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
- MaxBytesLoc = getLexer().getLoc();
- if (parseAbsoluteExpression(MaxBytesToFill))
+ if (getTok().isNot(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ getTokenLoc(MaxBytesLoc) || parseAbsoluteExpression(MaxBytesToFill))
return true;
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in directive");
}
}
- Lex();
+ if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ return true;
if (!HasFillExpr)
FillExpr = 0;
@@ -2896,43 +2961,41 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
return TokError("file number less than one");
}
- if (getLexer().isNot(AsmToken::String))
- return TokError("unexpected token in '.file' directive");
+ std::string Path = getTok().getString();
// Usually the directory and filename together, otherwise just the directory.
// Allow the strings to have escaped octal character sequence.
- std::string Path = getTok().getString();
- if (parseEscapedString(Path))
+ if (check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.file' directive") ||
+ parseEscapedString(Path))
return true;
- Lex();
StringRef Directory;
StringRef Filename;
std::string FilenameData;
if (getLexer().is(AsmToken::String)) {
- if (FileNumber == -1)
- return TokError("explicit path specified, but no file number");
- if (parseEscapedString(FilenameData))
+ if (check(FileNumber == -1,
+ "explicit path specified, but no file number") ||
+ parseEscapedString(FilenameData))
return true;
Filename = FilenameData;
Directory = Path;
- Lex();
} else {
Filename = Path;
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.file' directive");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.file' directive"))
+ return true;
if (FileNumber == -1)
getStreamer().EmitFileDirective(Filename);
else {
+ // If a -g option is given as well as debug info from this .file directive,
+ // turn off the -g option and directly use the existing debug info instead.
if (getContext().getGenDwarfForAssembly())
- Error(DirectiveLoc,
- "input can't have .file dwarf directives when -g is "
- "used to generate dwarf debug info for assembly code");
-
- if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename) ==
+ getContext().setGenDwarfForAssembly(false);
+ else if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename) ==
0)
Error(FileNumberLoc, "file number already allocated");
}
@@ -2943,19 +3006,16 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
/// parseDirectiveLine
/// ::= .line [number]
bool AsmParser::parseDirectiveLine() {
+ int64_t LineNumber;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Integer))
- return TokError("unexpected token in '.line' directive");
-
- int64_t LineNumber = getTok().getIntVal();
+ if (parseIntToken(LineNumber, "unexpected token in '.line' directive"))
+ return true;
(void)LineNumber;
- Lex();
-
// FIXME: Do something with the .line.
}
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.line' directive");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.line' directive"))
+ return true;
return false;
}
@@ -2968,16 +3028,16 @@ bool AsmParser::parseDirectiveLine() {
/// third number is a column position (zero if not specified). The remaining
/// optional items are .loc sub-directives.
bool AsmParser::parseDirectiveLoc() {
- if (getLexer().isNot(AsmToken::Integer))
- return TokError("unexpected token in '.loc' directive");
- int64_t FileNumber = getTok().getIntVal();
- if (FileNumber < 1)
- return TokError("file number less than one in '.loc' directive");
- if (!getContext().isValidDwarfFileNumber(FileNumber))
- return TokError("unassigned file number in '.loc' directive");
- Lex();
+ int64_t FileNumber = 0, LineNumber = 0;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") ||
+ check(FileNumber < 1, Loc,
+ "file number less than one in '.loc' directive") ||
+ check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
+ "unassigned file number in '.loc' directive"))
+ return true;
- int64_t LineNumber = 0;
+ // Optional line number.
if (getLexer().is(AsmToken::Integer)) {
LineNumber = getTok().getIntVal();
if (LineNumber < 0)
@@ -3054,6 +3114,7 @@ bool AsmParser::parseDirectiveLoc() {
break;
}
}
+ Lex();
getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
Isa, Discriminator, StringRef());
@@ -3067,6 +3128,231 @@ bool AsmParser::parseDirectiveStabs() {
return TokError("unsupported directive '.stabs'");
}
+/// parseDirectiveCVFile
+/// ::= .cv_file number filename
+bool AsmParser::parseDirectiveCVFile() {
+ SMLoc FileNumberLoc = getTok().getLoc();
+ int64_t FileNumber;
+ std::string Filename;
+
+ if (parseIntToken(FileNumber,
+ "expected file number in '.cv_file' directive") ||
+ check(FileNumber < 1, FileNumberLoc, "file number less than one") ||
+ check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.cv_file' directive") ||
+ // Usually directory and filename are together, otherwise just
+ // directory. Allow the strings to have escaped octal character sequence.
+ parseEscapedString(Filename) ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cv_file' directive") ||
+ check(getStreamer().EmitCVFileDirective(FileNumber, Filename) == 0,
+ FileNumberLoc, "file number already allocated"))
+ return true;
+
+ return false;
+}
+
+/// parseDirectiveCVLoc
+/// ::= .cv_loc FunctionId FileNumber [LineNumber] [ColumnPos] [prologue_end]
+/// [is_stmt VALUE]
+/// The first number is a file number, must have been previously assigned with
+/// a .file directive, the second number is the line number and optionally the
+/// third number is a column position (zero if not specified). The remaining
+/// optional items are .loc sub-directives.
+bool AsmParser::parseDirectiveCVLoc() {
+ SMLoc Loc;
+ int64_t FunctionId, FileNumber;
+ if (getTokenLoc(Loc) ||
+ parseIntToken(FunctionId, "unexpected token in '.cv_loc' directive") ||
+ check(FunctionId < 0, Loc,
+ "function id less than zero in '.cv_loc' directive") ||
+ getTokenLoc(Loc) ||
+ parseIntToken(FileNumber, "expected integer in '.cv_loc' directive") ||
+ check(FileNumber < 1, Loc,
+ "file number less than one in '.cv_loc' directive") ||
+ check(!getContext().isValidCVFileNumber(FileNumber), Loc,
+ "unassigned file number in '.cv_loc' directive"))
+ return true;
+
+ int64_t LineNumber = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ LineNumber = getTok().getIntVal();
+ if (LineNumber < 0)
+ return TokError("line number less than zero in '.cv_loc' directive");
+ Lex();
+ }
+
+ int64_t ColumnPos = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ ColumnPos = getTok().getIntVal();
+ if (ColumnPos < 0)
+ return TokError("column position less than zero in '.cv_loc' directive");
+ Lex();
+ }
+
+ bool PrologueEnd = false;
+ uint64_t IsStmt = 0;
+ while (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Name;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIdentifier(Name))
+ return TokError("unexpected token in '.cv_loc' directive");
+
+ if (Name == "prologue_end")
+ PrologueEnd = true;
+ else if (Name == "is_stmt") {
+ Loc = getTok().getLoc();
+ const MCExpr *Value;
+ if (parseExpression(Value))
+ return true;
+ // The expression must be the constant 0 or 1.
+ IsStmt = ~0ULL;
+ if (const auto *MCE = dyn_cast<MCConstantExpr>(Value))
+ IsStmt = MCE->getValue();
+
+ if (IsStmt > 1)
+ return Error(Loc, "is_stmt value not 0 or 1");
+ } else {
+ return Error(Loc, "unknown sub-directive in '.cv_loc' directive");
+ }
+ }
+ Lex();
+
+ getStreamer().EmitCVLocDirective(FunctionId, FileNumber, LineNumber,
+ ColumnPos, PrologueEnd, IsStmt, StringRef());
+ return false;
+}
+
+/// parseDirectiveCVLinetable
+/// ::= .cv_linetable FunctionId, FnStart, FnEnd
+bool AsmParser::parseDirectiveCVLinetable() {
+ int64_t FunctionId;
+ StringRef FnStartName, FnEndName;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIntToken(FunctionId,
+ "expected Integer in '.cv_linetable' directive") ||
+ check(FunctionId < 0, Loc,
+ "function id less than zero in '.cv_linetable' directive") ||
+ parseToken(AsmToken::Comma,
+ "unexpected token in '.cv_linetable' directive") ||
+ getTokenLoc(Loc) || check(parseIdentifier(FnStartName), Loc,
+ "expected identifier in directive") ||
+ parseToken(AsmToken::Comma,
+ "unexpected token in '.cv_linetable' directive") ||
+ getTokenLoc(Loc) || check(parseIdentifier(FnEndName), Loc,
+ "expected identifier in directive"))
+ return true;
+
+ MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
+ MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
+
+ getStreamer().EmitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
+ return false;
+}
+
+/// parseDirectiveCVInlineLinetable
+/// ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd
+/// ("contains" SecondaryFunctionId+)?
+bool AsmParser::parseDirectiveCVInlineLinetable() {
+ int64_t PrimaryFunctionId, SourceFileId, SourceLineNum;
+ StringRef FnStartName, FnEndName;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIntToken(
+ PrimaryFunctionId,
+ "expected PrimaryFunctionId in '.cv_inline_linetable' directive") ||
+ check(PrimaryFunctionId < 0, Loc,
+ "function id less than zero in '.cv_inline_linetable' directive") ||
+ getTokenLoc(Loc) ||
+ parseIntToken(
+ SourceFileId,
+ "expected SourceField in '.cv_inline_linetable' directive") ||
+ check(SourceFileId <= 0, Loc,
+ "file id less than one in '.cv_inline_linetable' directive") ||
+ getTokenLoc(Loc) ||
+ parseIntToken(
+ SourceLineNum,
+ "expected SourceLineNum in '.cv_inline_linetable' directive") ||
+ check(SourceLineNum < 0, Loc,
+ "Line number less than zero in '.cv_inline_linetable' directive") ||
+ getTokenLoc(Loc) || check(parseIdentifier(FnStartName), Loc,
+ "expected identifier in directive") ||
+ getTokenLoc(Loc) || check(parseIdentifier(FnEndName), Loc,
+ "expected identifier in directive"))
+ return true;
+
+ SmallVector<unsigned, 8> SecondaryFunctionIds;
+ if (getLexer().is(AsmToken::Identifier)) {
+ if (getTok().getIdentifier() != "contains")
+ return TokError(
+ "unexpected identifier in '.cv_inline_linetable' directive");
+ Lex();
+
+ while (getLexer().isNot(AsmToken::EndOfStatement)) {
+ int64_t SecondaryFunctionId = getTok().getIntVal();
+ if (SecondaryFunctionId < 0)
+ return TokError(
+ "function id less than zero in '.cv_inline_linetable' directive");
+ Lex();
+
+ SecondaryFunctionIds.push_back(SecondaryFunctionId);
+ }
+ }
+
+ if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+ return true;
+
+ MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
+ MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
+ getStreamer().EmitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
+ SourceLineNum, FnStartSym,
+ FnEndSym, SecondaryFunctionIds);
+ return false;
+}
+
+/// parseDirectiveCVDefRange
+/// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd)*, bytes*
+bool AsmParser::parseDirectiveCVDefRange() {
+ SMLoc Loc;
+ std::vector<std::pair<const MCSymbol *, const MCSymbol *>> Ranges;
+ while (getLexer().is(AsmToken::Identifier)) {
+ Loc = getLexer().getLoc();
+ StringRef GapStartName;
+ if (parseIdentifier(GapStartName))
+ return Error(Loc, "expected identifier in directive");
+ MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName);
+
+ Loc = getLexer().getLoc();
+ StringRef GapEndName;
+ if (parseIdentifier(GapEndName))
+ return Error(Loc, "expected identifier in directive");
+ MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName);
+
+ Ranges.push_back({GapStartSym, GapEndSym});
+ }
+
+ std::string FixedSizePortion;
+ if (parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseEscapedString(FixedSizePortion))
+ return true;
+
+ getStreamer().EmitCVDefRangeDirective(Ranges, FixedSizePortion);
+ return false;
+}
+
+/// parseDirectiveCVStringTable
+/// ::= .cv_stringtable
+bool AsmParser::parseDirectiveCVStringTable() {
+ getStreamer().EmitCVStringTableDirective();
+ return false;
+}
+
+/// parseDirectiveCVFileChecksums
+/// ::= .cv_filechecksums
+bool AsmParser::parseDirectiveCVFileChecksums() {
+ getStreamer().EmitCVFileChecksumsDirective();
+ return false;
+}
+
/// parseDirectiveCFISections
/// ::= .cfi_sections section [, section]
bool AsmParser::parseDirectiveCFISections() {
@@ -3106,6 +3392,9 @@ bool AsmParser::parseDirectiveCFIStartProc() {
if (parseIdentifier(Simple) || Simple != "simple")
return TokError("unexpected token in .cfi_startproc directive");
+ if (parseToken(AsmToken::EndOfStatement, "Expected end of statement"))
+ return true;
+
getStreamer().EmitCFIStartProc(!Simple.empty());
return false;
}
@@ -3135,16 +3424,10 @@ bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
/// parseDirectiveCFIDefCfa
/// ::= .cfi_def_cfa register, offset
bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
- int64_t Register = 0;
- if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
- return true;
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
- int64_t Offset = 0;
- if (parseAbsoluteExpression(Offset))
+ int64_t Register = 0, Offset = 0;
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
return true;
getStreamer().EmitCFIDefCfa(Register, Offset);
@@ -3165,16 +3448,10 @@ bool AsmParser::parseDirectiveCFIDefCfaOffset() {
/// parseDirectiveCFIRegister
/// ::= .cfi_register register, register
bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
- int64_t Register1 = 0;
- if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
- return true;
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
- int64_t Register2 = 0;
- if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+ int64_t Register1 = 0, Register2 = 0;
+ if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
return true;
getStreamer().EmitCFIRegister(Register1, Register2);
@@ -3216,14 +3493,9 @@ bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
int64_t Register = 0;
int64_t Offset = 0;
- if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
- return true;
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
- if (parseAbsoluteExpression(Offset))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
return true;
getStreamer().EmitCFIOffset(Register, Offset);
@@ -3233,17 +3505,11 @@ bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
/// parseDirectiveCFIRelOffset
/// ::= .cfi_rel_offset register, offset
bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
- int64_t Register = 0;
+ int64_t Register = 0, Offset = 0;
- if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
- return true;
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
- int64_t Offset = 0;
- if (parseAbsoluteExpression(Offset))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
return true;
getStreamer().EmitCFIRelOffset(Register, Offset);
@@ -3283,16 +3549,11 @@ bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
if (Encoding == dwarf::DW_EH_PE_omit)
return false;
- if (!isValidEncoding(Encoding))
- return TokError("unsupported encoding.");
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
-
StringRef Name;
- if (parseIdentifier(Name))
- return TokError("expected identifier in directive");
+ if (check(!isValidEncoding(Encoding), "unsupported encoding.") ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ check(parseIdentifier(Name), "expected identifier in directive"))
+ return true;
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
@@ -3366,9 +3627,9 @@ bool AsmParser::parseDirectiveCFIEscape() {
/// parseDirectiveCFISignalFrame
/// ::= .cfi_signal_frame
bool AsmParser::parseDirectiveCFISignalFrame() {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(getLexer().getLoc(),
- "unexpected token in '.cfi_signal_frame'");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cfi_signal_frame'"))
+ return true;
getStreamer().EmitCFISignalFrame();
return false;
@@ -3390,9 +3651,9 @@ bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
/// ::= .macros_on
/// ::= .macros_off
bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(getLexer().getLoc(),
- "unexpected token in '" + Directive + "' directive");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Directive + "' directive"))
+ return true;
setMacrosEnabled(Directive == ".macros_on");
return false;
@@ -3460,14 +3721,19 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
Lex();
}
- // Eat the end of statement.
- Lex();
+ // Eat just the end of statement.
+ Lexer.Lex();
+ // We are consuming deferred text, so use Lexer.Lex to ignore lexing errors.
AsmToken EndToken, StartToken = getTok();
unsigned MacroDepth = 0;
-
// Lex the macro definition.
for (;;) {
+ // Ignore Lexing errors in macros.
+ while (Lexer.is(AsmToken::Error)) {
+ Lexer.Lex();
+ }
+
// Check whether we have reached the end of the file.
if (getLexer().is(AsmToken::Eof))
return Error(DirectiveLoc, "no matching '.endmacro' in definition");
@@ -3478,7 +3744,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
getTok().getIdentifier() == ".endmacro") {
if (MacroDepth == 0) { // Outermost macro.
EndToken = getTok();
- Lex();
+ Lexer.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '" + EndToken.getIdentifier() +
"' directive");
@@ -3615,8 +3881,9 @@ void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
/// parseDirectiveExitMacro
/// ::= .exitm
bool AsmParser::parseDirectiveExitMacro(StringRef Directive) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '" + Directive + "' directive");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Directive + "' directive"))
+ return true;
if (!isInsideMacroInstantiation())
return TokError("unexpected '" + Directive + "' in file, "
@@ -3656,14 +3923,14 @@ bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
/// ::= .purgem
bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
StringRef Name;
- if (parseIdentifier(Name))
- return TokError("expected identifier in '.purgem' directive");
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.purgem' directive");
-
- if (!lookupMacro(Name))
- return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
+ SMLoc Loc;
+ if (getTokenLoc(Loc) || check(parseIdentifier(Name), Loc,
+ "expected identifier in '.purgem' directive") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.purgem' directive") ||
+ check(!lookupMacro(Name), DirectiveLoc,
+ "macro '" + Name + "' is not defined"))
+ return true;
undefineMacro(Name);
return false;
@@ -3678,16 +3945,13 @@ bool AsmParser::parseDirectiveBundleAlignMode() {
// in the inclusive range 0-30.
SMLoc ExprLoc = getLexer().getLoc();
int64_t AlignSizePow2;
- if (parseAbsoluteExpression(AlignSizePow2))
+ if (parseAbsoluteExpression(AlignSizePow2) ||
+ parseToken(AsmToken::EndOfStatement, "unexpected token after expression "
+ "in '.bundle_align_mode' "
+ "directive") ||
+ check(AlignSizePow2 < 0 || AlignSizePow2 > 30, ExprLoc,
+ "invalid bundle alignment size (expected between 0 and 30)"))
return true;
- else if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token after expression in"
- " '.bundle_align_mode' directive");
- else if (AlignSizePow2 < 0 || AlignSizePow2 > 30)
- return Error(ExprLoc,
- "invalid bundle alignment size (expected between 0 and 30)");
-
- Lex();
// Because of AlignSizePow2's verified range we can safely truncate it to
// unsigned.
@@ -3707,14 +3971,11 @@ bool AsmParser::parseDirectiveBundleLock() {
const char *kInvalidOptionError =
"invalid option for '.bundle_lock' directive";
- if (parseIdentifier(Option))
- return Error(Loc, kInvalidOptionError);
-
- if (Option != "align_to_end")
- return Error(Loc, kInvalidOptionError);
- else if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(Loc,
- "unexpected token after '.bundle_lock' directive option");
+ if (check(parseIdentifier(Option), Loc, kInvalidOptionError) ||
+ check(Option != "align_to_end", Loc, kInvalidOptionError) ||
+ check(getTok().isNot(AsmToken::EndOfStatement), Loc,
+ "unexpected token after '.bundle_lock' directive option"))
+ return true;
AlignToEnd = true;
}
@@ -3729,9 +3990,9 @@ bool AsmParser::parseDirectiveBundleLock() {
bool AsmParser::parseDirectiveBundleUnlock() {
checkForValidSection();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.bundle_unlock' directive");
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.bundle_unlock' directive"))
+ return true;
getStreamer().EmitBundleUnlock();
return false;
@@ -3742,31 +4003,26 @@ bool AsmParser::parseDirectiveBundleUnlock() {
bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
checkForValidSection();
- int64_t NumBytes;
- if (parseAbsoluteExpression(NumBytes))
+ SMLoc NumBytesLoc = Lexer.getLoc();
+ const MCExpr *NumBytes;
+ if (parseExpression(NumBytes))
return true;
int64_t FillExpr = 0;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
- Lex();
- if (parseAbsoluteExpression(FillExpr))
+ if (parseToken(AsmToken::Comma,
+ "unexpected token in '" + Twine(IDVal) + "' directive") ||
+ parseAbsoluteExpression(FillExpr))
return true;
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
}
- Lex();
-
- if (NumBytes <= 0)
- return TokError("invalid number of bytes in '" + Twine(IDVal) +
- "' directive");
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Twine(IDVal) + "' directive"))
+ return true;
// FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
- getStreamer().EmitFill(NumBytes, FillExpr);
+ getStreamer().emitFill(*NumBytes, FillExpr, NumBytesLoc);
return false;
}
@@ -3789,10 +4045,10 @@ bool AsmParser::parseDirectiveLEB128(bool Signed) {
if (getLexer().is(AsmToken::EndOfStatement))
break;
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
}
+ Lex();
return false;
}
@@ -3820,9 +4076,8 @@ bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
if (getLexer().is(AsmToken::EndOfStatement))
break;
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in directive");
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in directive"))
+ return true;
}
}
@@ -3911,10 +4166,9 @@ bool AsmParser::parseDirectiveAbort() {
SMLoc Loc = getLexer().getLoc();
StringRef Str = parseStringToEndOfStatement();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.abort' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.abort' directive"))
+ return true;
if (Str.empty())
Error(Loc, ".abort detected. Assembly stopping.");
@@ -3928,25 +4182,20 @@ bool AsmParser::parseDirectiveAbort() {
/// parseDirectiveInclude
/// ::= .include "filename"
bool AsmParser::parseDirectiveInclude() {
- if (getLexer().isNot(AsmToken::String))
- return TokError("expected string in '.include' directive");
-
// Allow the strings to have escaped octal character sequence.
std::string Filename;
- if (parseEscapedString(Filename))
- return true;
- SMLoc IncludeLoc = getLexer().getLoc();
- Lex();
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.include' directive");
-
- // Attempt to switch the lexer to the included file before consuming the end
- // of statement to avoid losing it when we switch.
- if (enterIncludeFile(Filename)) {
- Error(IncludeLoc, "Could not find include file '" + Filename + "'");
+ SMLoc IncludeLoc = getTok().getLoc();
+
+ if (check(getTok().isNot(AsmToken::String),
+ "expected string in '.include' directive") ||
+ parseEscapedString(Filename) ||
+ check(getTok().isNot(AsmToken::EndOfStatement),
+ "unexpected token in '.include' directive") ||
+ // Attempt to switch the lexer to the included file before consuming the
+ // end of statement to avoid losing it when we switch.
+ check(enterIncludeFile(Filename), IncludeLoc,
+ "Could not find include file '" + Filename + "'"))
return true;
- }
return false;
}
@@ -3954,25 +4203,18 @@ bool AsmParser::parseDirectiveInclude() {
/// parseDirectiveIncbin
/// ::= .incbin "filename"
bool AsmParser::parseDirectiveIncbin() {
- if (getLexer().isNot(AsmToken::String))
- return TokError("expected string in '.incbin' directive");
-
// Allow the strings to have escaped octal character sequence.
std::string Filename;
- if (parseEscapedString(Filename))
- return true;
- SMLoc IncbinLoc = getLexer().getLoc();
- Lex();
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.incbin' directive");
-
- // Attempt to process the included file.
- if (processIncbinFile(Filename)) {
- Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
+ SMLoc IncbinLoc = getTok().getLoc();
+ if (check(getTok().isNot(AsmToken::String),
+ "expected string in '.incbin' directive") ||
+ parseEscapedString(Filename) ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.incbin' directive") ||
+ // Attempt to process the included file.
+ check(processIncbinFile(Filename), IncbinLoc,
+ "Could not find incbin file '" + Filename + "'"))
return true;
- }
-
return false;
}
@@ -3985,14 +4227,11 @@ bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
eatToEndOfStatement();
} else {
int64_t ExprValue;
- if (parseAbsoluteExpression(ExprValue))
+ if (parseAbsoluteExpression(ExprValue) ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.if' directive"))
return true;
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.if' directive");
-
- Lex();
-
switch (DirKind) {
default:
llvm_unreachable("unsupported directive");
@@ -4034,10 +4273,9 @@ bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
} else {
StringRef Str = parseStringToEndOfStatement();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.ifb' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.ifb' directive"))
+ return true;
TheCondState.CondMet = ExpectBlank == Str.empty();
TheCondState.Ignore = !TheCondState.CondMet;
@@ -4058,17 +4296,14 @@ bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
} else {
StringRef Str1 = parseStringToComma();
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.ifc' directive");
-
- Lex();
+ if (parseToken(AsmToken::Comma, "unexpected token in '.ifc' directive"))
+ return true;
StringRef Str2 = parseStringToEndOfStatement();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.ifc' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.ifc' directive"))
+ return true;
TheCondState.CondMet = ExpectEqual == (Str1.trim() == Str2.trim());
TheCondState.Ignore = !TheCondState.CondMet;
@@ -4133,10 +4368,9 @@ bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
if (TheCondState.Ignore) {
eatToEndOfStatement();
} else {
- if (parseIdentifier(Name))
- return TokError("expected identifier after '.ifdef'");
-
- Lex();
+ if (check(parseIdentifier(Name), "expected identifier after '.ifdef'") ||
+ parseToken(AsmToken::EndOfStatement, "unexpected token in '.ifdef'"))
+ return true;
MCSymbol *Sym = getContext().lookupSymbol(Name);
@@ -4184,10 +4418,9 @@ bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
/// parseDirectiveElse
/// ::= .else
bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.else' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.else' directive"))
+ return true;
if (TheCondState.TheCond != AsmCond::IfCond &&
TheCondState.TheCond != AsmCond::ElseIfCond)
@@ -4208,10 +4441,9 @@ bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
/// parseDirectiveEnd
/// ::= .end
bool AsmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.end' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.end' directive"))
+ return true;
while (Lexer.isNot(AsmToken::Eof))
Lex();
@@ -4278,10 +4510,9 @@ bool AsmParser::parseDirectiveWarning(SMLoc L) {
/// parseDirectiveEndIf
/// ::= .endif
bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.endif' directive");
-
- Lex();
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.endif' directive"))
+ return true;
if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or "
@@ -4378,6 +4609,13 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".line"] = DK_LINE;
DirectiveKindMap[".loc"] = DK_LOC;
DirectiveKindMap[".stabs"] = DK_STABS;
+ DirectiveKindMap[".cv_file"] = DK_CV_FILE;
+ DirectiveKindMap[".cv_loc"] = DK_CV_LOC;
+ DirectiveKindMap[".cv_linetable"] = DK_CV_LINETABLE;
+ DirectiveKindMap[".cv_inline_linetable"] = DK_CV_INLINE_LINETABLE;
+ DirectiveKindMap[".cv_def_range"] = DK_CV_DEF_RANGE;
+ DirectiveKindMap[".cv_stringtable"] = DK_CV_STRINGTABLE;
+ DirectiveKindMap[".cv_filechecksums"] = DK_CV_FILECHECKSUMS;
DirectiveKindMap[".sleb128"] = DK_SLEB128;
DirectiveKindMap[".uleb128"] = DK_ULEB128;
DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
@@ -4425,7 +4663,9 @@ MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
}
if (Lexer.is(AsmToken::Identifier) &&
- (getTok().getIdentifier() == ".rept")) {
+ (getTok().getIdentifier() == ".rept" ||
+ getTok().getIdentifier() == ".irp" ||
+ getTok().getIdentifier() == ".irpc")) {
++NestLevel;
}
@@ -4489,14 +4729,10 @@ bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
return Error(CountLoc, "unexpected token in '" + Dir + "' directive");
}
- if (Count < 0)
- return Error(CountLoc, "Count is negative");
-
- if (Lexer.isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '" + Dir + "' directive");
-
- // Eat the end of statement.
- Lex();
+ if (check(Count < 0, CountLoc, "Count is negative") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Dir + "' directive"))
+ return true;
// Lex the rept definition.
MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
@@ -4521,22 +4757,14 @@ bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
/// ::= .irp symbol,values
bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
MCAsmMacroParameter Parameter;
-
- if (parseIdentifier(Parameter.Name))
- return TokError("expected identifier in '.irp' directive");
-
- if (Lexer.isNot(AsmToken::Comma))
- return TokError("expected comma in '.irp' directive");
-
- Lex();
-
MCAsmMacroArguments A;
- if (parseMacroArguments(nullptr, A))
+ if (check(parseIdentifier(Parameter.Name),
+ "expected identifier in '.irp' directive") ||
+ parseToken(AsmToken::Comma, "expected comma in '.irp' directive") ||
+ parseMacroArguments(nullptr, A) ||
+ parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
return true;
- // Eat the end of statement.
- Lex();
-
// Lex the irp definition.
MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
if (!M)
@@ -4563,24 +4791,20 @@ bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
/// ::= .irpc symbol,values
bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
MCAsmMacroParameter Parameter;
-
- if (parseIdentifier(Parameter.Name))
- return TokError("expected identifier in '.irpc' directive");
-
- if (Lexer.isNot(AsmToken::Comma))
- return TokError("expected comma in '.irpc' directive");
-
- Lex();
-
MCAsmMacroArguments A;
- if (parseMacroArguments(nullptr, A))
+
+ if (check(parseIdentifier(Parameter.Name),
+ "expected identifier in '.irpc' directive") ||
+ parseToken(AsmToken::Comma, "expected comma in '.irpc' directive") ||
+ parseMacroArguments(nullptr, A))
return true;
if (A.size() != 1 || A.front().size() != 1)
return TokError("unexpected token in '.irpc' directive");
// Eat the end of statement.
- Lex();
+ if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
+ return true;
// Lex the irpc definition.
MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
@@ -4699,6 +4923,10 @@ bool AsmParser::parseMSInlineAsm(
unsigned InputIdx = 0;
unsigned OutputIdx = 0;
while (getLexer().isNot(AsmToken::Eof)) {
+ // Parse curly braces marking block start/end
+ if (parseCurlyBlockScope(AsmStrRewrites))
+ continue;
+
ParseStatementInfo Info(&AsmStrRewrites);
if (parseStatement(Info, &SI))
return true;
@@ -4875,6 +5103,9 @@ bool AsmParser::parseMSInlineAsm(
OS << '.';
OS << AR.Val;
break;
+ case AOK_EndOfStatement:
+ OS << "\n\t";
+ break;
}
// Skip the original expression.
@@ -4922,10 +5153,9 @@ static bool isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *Value) {
bool parseAssignmentExpression(StringRef Name, bool allow_redef,
MCAsmParser &Parser, MCSymbol *&Sym,
const MCExpr *&Value) {
- MCAsmLexer &Lexer = Parser.getLexer();
// FIXME: Use better location, we should use proper tokens.
- SMLoc EqualLoc = Lexer.getLoc();
+ SMLoc EqualLoc = Parser.getTok().getLoc();
if (Parser.parseExpression(Value)) {
Parser.TokError("missing expression");
@@ -4937,7 +5167,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
// a = b
// b = c
- if (Lexer.isNot(AsmToken::EndOfStatement))
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Parser.TokError("unexpected token in assignment");
// Eat the end of statement marker.
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index a4b2b195f710..653627ad8dca 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -15,10 +15,10 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/COFF.h"
using namespace llvm;
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 73e068a34391..37515d9c074d 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -50,6 +50,7 @@ public:
// Call the base implementation.
this->MCAsmParserExtension::Initialize(Parser);
+ addDirectiveHandler<&DarwinAsmParser::parseDirectiveAltEntry>(".alt_entry");
addDirectiveHandler<&DarwinAsmParser::parseDirectiveDesc>(".desc");
addDirectiveHandler<&DarwinAsmParser::parseDirectiveIndirectSymbol>(
".indirect_symbol");
@@ -111,6 +112,9 @@ public:
addDirectiveHandler<
&DarwinAsmParser::parseSectionDirectiveNonLazySymbolPointers>(
".non_lazy_symbol_pointer");
+ addDirectiveHandler<
+ &DarwinAsmParser::parseSectionDirectiveThreadLocalVariablePointers>(
+ ".thread_local_variable_pointer");
addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCCatClsMeth>(
".objc_cat_cls_meth");
addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCCatInstMeth>(
@@ -179,6 +183,7 @@ public:
LastVersionMinDirective = SMLoc();
}
+ bool parseDirectiveAltEntry(StringRef, SMLoc);
bool parseDirectiveDesc(StringRef, SMLoc);
bool parseDirectiveIndirectSymbol(StringRef, SMLoc);
bool parseDirectiveDumpOrLoad(StringRef, SMLoc);
@@ -261,6 +266,10 @@ public:
return parseSectionSwitch("__DATA", "__la_symbol_ptr",
MachO::S_LAZY_SYMBOL_POINTERS, 4);
}
+ bool parseSectionDirectiveThreadLocalVariablePointers(StringRef, SMLoc) {
+ return parseSectionSwitch("__DATA", "__thread_ptr",
+ MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, 4);
+ }
bool parseSectionDirectiveDyld(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__dyld");
}
@@ -408,6 +417,26 @@ bool DarwinAsmParser::parseSectionSwitch(const char *Segment,
return false;
}
+/// parseDirectiveAltEntry
+/// ::= .alt_entry identifier
+bool DarwinAsmParser::parseDirectiveAltEntry(StringRef, SMLoc) {
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ // Look up symbol.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ if (Sym->isDefined())
+ return TokError(".alt_entry must preceed symbol definition");
+
+ if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_AltEntry))
+ return TokError("unable to emit symbol attribute");
+
+ Lex();
+ return false;
+}
+
/// parseDirectiveDesc
/// ::= .desc identifier , expression
bool DarwinAsmParser::parseDirectiveDesc(StringRef, SMLoc) {
@@ -445,6 +474,7 @@ bool DarwinAsmParser::parseDirectiveIndirectSymbol(StringRef, SMLoc Loc) {
MachO::SectionType SectionType = Current->getType();
if (SectionType != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
SectionType != MachO::S_LAZY_SYMBOL_POINTERS &&
+ SectionType != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS &&
SectionType != MachO::S_SYMBOL_STUBS)
return Error(Loc, "indirect symbol not in a symbol pointer or stub "
"section");
@@ -507,7 +537,6 @@ bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
Args.push_back(Data);
- Lex();
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -929,8 +958,8 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc) {
if (getLexer().isNot(AsmToken::Integer))
return TokError("invalid OS update number");
Update = getLexer().getTok().getIntVal();
- if (Update > 255 || Update < 0)
- return TokError("invalid OS update number");
+ if (Update > 255 || Update < 0)
+ return TokError("invalid OS update number");
Lex();
}
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 6cbcdec5e275..47d19a824d19 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -188,6 +188,7 @@ bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
if (getParser().parseExpression(Subsection))
return true;
}
+ Lex();
getStreamer().SwitchSection(getContext().getELFSection(Section, Type, Flags),
Subsection);
@@ -211,6 +212,7 @@ bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
+ Lex();
getStreamer().emitELFSize(Sym, Expr);
return false;
@@ -229,22 +231,23 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
}
for (;;) {
- unsigned CurSize;
-
+
SMLoc PrevLoc = getLexer().getLoc();
- if (getLexer().is(AsmToken::Minus)) {
- CurSize = 1;
- Lex(); // Consume the "-".
- } else if (getLexer().is(AsmToken::String)) {
+ if (getLexer().is(AsmToken::Comma) ||
+ getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ unsigned CurSize;
+ if (getLexer().is(AsmToken::String)) {
CurSize = getTok().getIdentifier().size() + 2;
Lex();
} else if (getLexer().is(AsmToken::Identifier)) {
CurSize = getTok().getIdentifier().size();
Lex();
} else {
- break;
+ CurSize = getTok().getString().size();
+ Lex();
}
-
Size += CurSize;
SectionName = StringRef(FirstLoc.getPointer(), Size);
@@ -261,8 +264,8 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
unsigned flags = 0;
- for (unsigned i = 0; i < flagsStr.size(); i++) {
- switch (flagsStr[i]) {
+ for (char i : flagsStr) {
+ switch (i) {
case 'a':
flags |= ELF::SHF_ALLOC;
break;
@@ -476,6 +479,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
EndStmt:
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
+ Lex();
unsigned Type = ELF::SHT_PROGBITS;
@@ -627,6 +631,10 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
Lex();
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.ident' directive");
+ Lex();
+
getStreamer().EmitIdent(Data);
return false;
}
@@ -725,6 +733,8 @@ bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
+ Lex();
+
getStreamer().SubSection(Subsection);
return false;
}
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index e891bd2c6240..d95cd12accbc 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -13,7 +13,7 @@
using namespace llvm;
MCAsmLexer::MCAsmLexer() : TokStart(nullptr), SkipSpace(true) {
- CurTok.emplace_back(AsmToken::Error, StringRef());
+ CurTok.emplace_back(AsmToken::Space, StringRef());
}
MCAsmLexer::~MCAsmLexer() {
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 290dcb297742..dc7a3f00840f 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -11,7 +11,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,7 +43,7 @@ bool MCAsmParser::parseExpression(const MCExpr *&Res) {
return parseExpression(Res, L);
}
-void MCParsedAsmOperand::dump() const {
+LLVM_DUMP_METHOD void MCParsedAsmOperand::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
#endif
diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp
index 4e4b47805cd8..14a22c6b8a2f 100644
--- a/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCTargetAsmParser.h"
using namespace llvm;
MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions,
diff --git a/lib/MC/MCParser/Makefile b/lib/MC/MCParser/Makefile
deleted file mode 100644
index 4477757657c7..000000000000
--- a/lib/MC/MCParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/MC/MCParser/Makefile ----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMCParser
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
index ce79cd5c2c6b..c76bb646c126 100644
--- a/lib/MC/MCRegisterInfo.cpp
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -84,3 +84,12 @@ int MCRegisterInfo::getSEHRegNum(unsigned RegNum) const {
if (I == L2SEHRegs.end()) return (int)RegNum;
return I->second;
}
+
+int MCRegisterInfo::getCodeViewRegNum(unsigned RegNum) const {
+ if (L2CVRegs.empty())
+ report_fatal_error("target does not implement codeview register mapping");
+ const DenseMap<unsigned, int>::const_iterator I = L2CVRegs.find(RegNum);
+ if (I == L2CVRegs.end())
+ report_fatal_error("unknown codeview register");
+ return I->second;
+}
diff --git a/lib/MC/MCSection.cpp b/lib/MC/MCSection.cpp
index dbd544a44ce3..32e4cce4f68c 100644
--- a/lib/MC/MCSection.cpp
+++ b/lib/MC/MCSection.cpp
@@ -86,7 +86,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCSection::dump() {
+LLVM_DUMP_METHOD void MCSection::dump() {
raw_ostream &OS = llvm::errs();
OS << "<MCSection";
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 836b40544642..6c8828f71ba2 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -19,8 +19,10 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCWin64EH.h"
+#include "llvm/Support/COFF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
@@ -68,6 +70,9 @@ raw_ostream &MCStreamer::GetCommentOS() {
void MCStreamer::emitRawComment(const Twine &T, bool TabPrefix) {}
+void MCStreamer::addExplicitComment(const Twine &T) {}
+void MCStreamer::emitExplicitComments() {}
+
void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
for (auto &FI : DwarfFrameInfos)
FI.CompactUnwindEncoding =
@@ -130,17 +135,26 @@ void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-/// EmitFill - Emit NumBytes bytes worth of the value specified by
-/// FillValue. This implements directives such as '.space'.
-void MCStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
- const MCExpr *E = MCConstantExpr::create(FillValue, getContext());
+/// Emit NumBytes bytes worth of the value specified by FillValue.
+/// This implements directives such as '.space'.
+void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
for (uint64_t i = 0, e = NumBytes; i != e; ++i)
- EmitValue(E, 1);
+ EmitIntValue(FillValue, 1);
+}
+
+void MCStreamer::emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) {
+ int64_t NonZeroSize = Size > 4 ? 4 : Size;
+ Expr &= ~0ULL >> (64 - NonZeroSize * 8);
+ for (uint64_t i = 0, e = NumValues; i != e; ++i) {
+ EmitIntValue(Expr, NonZeroSize);
+ if (NonZeroSize < Size)
+ EmitIntValue(0, Size - NonZeroSize);
+ }
}
-/// The implementation in this class just redirects to EmitFill.
+/// The implementation in this class just redirects to emitFill.
void MCStreamer::EmitZeros(uint64_t NumBytes) {
- EmitFill(NumBytes, 0);
+ emitFill(NumBytes, 0);
}
unsigned MCStreamer::EmitDwarfFileDirective(unsigned FileNo,
@@ -174,12 +188,42 @@ MCDwarfFrameInfo *MCStreamer::getCurrentDwarfFrameInfo() {
return &DwarfFrameInfos.back();
}
+bool MCStreamer::hasUnfinishedDwarfFrameInfo() {
+ MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
+ return CurFrame && !CurFrame->End;
+}
+
void MCStreamer::EnsureValidDwarfFrame() {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame || CurFrame->End)
report_fatal_error("No open frame");
}
+unsigned MCStreamer::EmitCVFileDirective(unsigned FileNo, StringRef Filename) {
+ return getContext().getCVFile(Filename, FileNo);
+}
+
+void MCStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+ unsigned Line, unsigned Column,
+ bool PrologueEnd, bool IsStmt,
+ StringRef FileName) {
+ getContext().setCurrentCVLoc(FunctionId, FileNo, Line, Column, PrologueEnd,
+ IsStmt);
+}
+
+void MCStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+ const MCSymbol *Begin,
+ const MCSymbol *End) {}
+
+void MCStreamer::EmitCVInlineLinetableDirective(
+ unsigned PrimaryFunctionId, unsigned SourceFileId, unsigned SourceLineNum,
+ const MCSymbol *FnStartSym, const MCSymbol *FnEndSym,
+ ArrayRef<unsigned> SecondaryFunctionIds) {}
+
+void MCStreamer::EmitCVDefRangeDirective(
+ ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
+ StringRef FixedSizePortion) {}
+
void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol) {
}
@@ -213,8 +257,7 @@ void MCStreamer::EmitCFISections(bool EH, bool Debug) {
}
void MCStreamer::EmitCFIStartProc(bool IsSimple) {
- MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
- if (CurFrame && !CurFrame->End)
+ if (hasUnfinishedDwarfFrameInfo())
report_fatal_error("Starting a frame before finishing the previous one!");
MCDwarfFrameInfo Frame;
@@ -417,6 +460,7 @@ void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) {
WinFrameInfos.push_back(new WinEH::FrameInfo(Symbol, StartProc));
CurrentWinFrameInfo = WinFrameInfos.back();
+ CurrentWinFrameInfo->TextSection = getCurrentSectionOnly();
}
void MCStreamer::EmitWinCFIEndProc() {
@@ -438,6 +482,7 @@ void MCStreamer::EmitWinCFIStartChained() {
WinFrameInfos.push_back(new WinEH::FrameInfo(CurrentWinFrameInfo->Function,
StartProc, CurrentWinFrameInfo));
CurrentWinFrameInfo = WinFrameInfos.back();
+ CurrentWinFrameInfo->TextSection = getCurrentSectionOnly();
}
void MCStreamer::EmitWinCFIEndChained() {
@@ -473,6 +518,38 @@ void MCStreamer::EmitWinEHHandlerData() {
report_fatal_error("Chained unwind areas can't have handlers!");
}
+static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
+ MCSection *MainCFISec,
+ const MCSection *TextSec) {
+ // If this is the main .text section, use the main unwind info section.
+ if (TextSec == Context.getObjectFileInfo()->getTextSection())
+ return MainCFISec;
+
+ const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec);
+ unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID);
+
+ // If this section is COMDAT, this unwind section should be COMDAT associative
+ // with its group.
+ const MCSymbol *KeySym = nullptr;
+ if (TextSecCOFF->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
+ KeySym = TextSecCOFF->getCOMDATSymbol();
+
+ return Context.getAssociativeCOFFSection(cast<MCSectionCOFF>(MainCFISec),
+ KeySym, UniqueID);
+}
+
+MCSection *MCStreamer::getAssociatedPDataSection(const MCSection *TextSec) {
+ return getWinCFISection(getContext(), &NextWinCFIID,
+ getContext().getObjectFileInfo()->getPDataSection(),
+ TextSec);
+}
+
+MCSection *MCStreamer::getAssociatedXDataSection(const MCSection *TextSec) {
+ return getWinCFISection(getContext(), &NextWinCFIID,
+ getContext().getObjectFileInfo()->getXDataSection(),
+ TextSec);
+}
+
void MCStreamer::EmitSyntaxDirective() {}
void MCStreamer::EmitWinCFIPushReg(unsigned Register) {
@@ -660,7 +737,7 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
MCSymbolRefExpr::create(Lo, Context), Context);
const MCAsmInfo *MAI = Context.getAsmInfo();
- if (!MAI->doesSetDirectiveSuppressesReloc()) {
+ if (!MAI->doesSetDirectiveSuppressReloc()) {
EmitValue(Diff, Size);
return;
}
@@ -687,11 +764,15 @@ void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
void MCStreamer::ChangeSection(MCSection *, const MCExpr *) {}
void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
void MCStreamer::EmitBytes(StringRef Data) {}
+void MCStreamer::EmitBinaryData(StringRef Data) { EmitBytes(Data); }
void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) {
visitUsedExpr(*Value);
}
void MCStreamer::EmitULEB128Value(const MCExpr *Value) {}
void MCStreamer::EmitSLEB128Value(const MCExpr *Value) {}
+void MCStreamer::emitFill(const MCExpr &NumBytes, uint64_t Value, SMLoc Loc) {}
+void MCStreamer::emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
+ SMLoc Loc) {}
void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index ab3b8eb68322..2ddece6bddce 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -77,5 +77,5 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCSymbol::dump() const { dbgs() << *this; }
+LLVM_DUMP_METHOD void MCSymbol::dump() const { dbgs() << *this; }
#endif
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index 495a2b6ea5bb..32a6adbf224e 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -38,7 +38,7 @@ void MCValue::print(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MCValue::dump() const {
+LLVM_DUMP_METHOD void MCValue::dump() const {
print(dbgs());
}
#endif
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 1b73b7afb6a0..fdc4c10cd6ce 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -17,7 +17,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Win64EH.h"
-namespace llvm {
+using namespace llvm;
// NOTE: All relocations generated here are 4-byte image-relative.
@@ -218,35 +218,29 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
}
}
-namespace Win64EH {
-void UnwindEmitter::Emit(MCStreamer &Streamer) const {
- MCContext &Context = Streamer.getContext();
-
+void llvm::Win64EH::UnwindEmitter::Emit(MCStreamer &Streamer) const {
// Emit the unwind info structs first.
- for (const auto &CFI : Streamer.getWinFrameInfos()) {
- MCSection *XData = getXDataSection(CFI->Function, Context);
+ for (WinEH::FrameInfo *CFI : Streamer.getWinFrameInfos()) {
+ MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection);
Streamer.SwitchSection(XData);
- EmitUnwindInfo(Streamer, CFI);
+ ::EmitUnwindInfo(Streamer, CFI);
}
// Now emit RUNTIME_FUNCTION entries.
- for (const auto &CFI : Streamer.getWinFrameInfos()) {
- MCSection *PData = getPDataSection(CFI->Function, Context);
+ for (WinEH::FrameInfo *CFI : Streamer.getWinFrameInfos()) {
+ MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection);
Streamer.SwitchSection(PData);
EmitRuntimeFunction(Streamer, CFI);
}
}
-void UnwindEmitter::EmitUnwindInfo(MCStreamer &Streamer,
- WinEH::FrameInfo *info) const {
+void llvm::Win64EH::UnwindEmitter::EmitUnwindInfo(
+ MCStreamer &Streamer, WinEH::FrameInfo *info) const {
// Switch sections (the static function above is meant to be called from
// here and from Emit()).
- MCContext &context = Streamer.getContext();
- MCSection *xdataSect = getXDataSection(info->Function, context);
- Streamer.SwitchSection(xdataSect);
+ MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection);
+ Streamer.SwitchSection(XData);
- llvm::EmitUnwindInfo(Streamer, info);
-}
+ ::EmitUnwindInfo(Streamer, info);
}
-} // End of namespace llvm
diff --git a/lib/MC/MCWinEH.cpp b/lib/MC/MCWinEH.cpp
index 83af203c7acb..21a913999f64 100644
--- a/lib/MC/MCWinEH.cpp
+++ b/lib/MC/MCWinEH.cpp
@@ -19,60 +19,7 @@
namespace llvm {
namespace WinEH {
-/// We can't have one section for all .pdata or .xdata because the Microsoft
-/// linker seems to want all code relocations to refer to the same object file
-/// section. If the code described is comdat, create a new comdat section
-/// associated with that comdat. If the code described is not in the main .text
-/// section, make a new section for it. Otherwise use the main unwind info
-/// section.
-static MCSection *getUnwindInfoSection(StringRef SecName,
- MCSectionCOFF *UnwindSec,
- const MCSymbol *Function,
- MCContext &Context) {
- if (Function && Function->isInSection()) {
- // If Function is in a COMDAT, get or create an unwind info section in that
- // COMDAT group.
- const MCSectionCOFF *FunctionSection =
- cast<MCSectionCOFF>(&Function->getSection());
- if (FunctionSection->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
- return Context.getAssociativeCOFFSection(
- UnwindSec, FunctionSection->getCOMDATSymbol());
- }
-
- // If Function is in a section other than .text, create a new .pdata section.
- // Otherwise use the plain .pdata section.
- if (const auto *Section = dyn_cast<MCSectionCOFF>(FunctionSection)) {
- StringRef CodeSecName = Section->getSectionName();
- if (CodeSecName == ".text")
- return UnwindSec;
-
- if (CodeSecName.startswith(".text$"))
- CodeSecName = CodeSecName.substr(6);
-
- return Context.getCOFFSection((SecName + Twine('$') + CodeSecName).str(),
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getData());
- }
- }
-
- return UnwindSec;
-
-}
-
-MCSection *UnwindEmitter::getPDataSection(const MCSymbol *Function,
- MCContext &Context) {
- MCSectionCOFF *PData =
- cast<MCSectionCOFF>(Context.getObjectFileInfo()->getPDataSection());
- return getUnwindInfoSection(".pdata", PData, Function, Context);
-}
-
-MCSection *UnwindEmitter::getXDataSection(const MCSymbol *Function,
- MCContext &Context) {
- MCSectionCOFF *XData =
- cast<MCSectionCOFF>(Context.getObjectFileInfo()->getXDataSection());
- return getUnwindInfoSection(".xdata", XData, Function, Context);
-}
+UnwindEmitter::~UnwindEmitter() {}
}
}
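
The helper removed above derived unwind-info section names from the code section name; the replacement in MCStreamer keys everything off the text section object and its COMDAT symbol instead. As a standalone illustration of the naming scheme the deleted getUnwindInfoSection() implemented, here is a minimal sketch using plain std::string in place of the MC section types (COMDAT and MCContext handling omitted):

#include <iostream>
#include <string>

// Naming rule the removed getUnwindInfoSection() implemented: code in ".text"
// uses the main unwind section, code in ".text$suffix" gets a matching
// ".pdata$suffix" / ".xdata$suffix" section, other code sections get
// "<unwind prefix>$<section name>".
static std::string unwindSectionFor(const std::string &UnwindPrefix,
                                    const std::string &CodeSecName) {
  if (CodeSecName == ".text")
    return UnwindPrefix;               // plain ".pdata" / ".xdata"
  std::string Suffix = CodeSecName;
  const std::string TextDollar = ".text$";
  if (Suffix.compare(0, TextDollar.size(), TextDollar) == 0)
    Suffix = Suffix.substr(TextDollar.size());
  return UnwindPrefix + "$" + Suffix;  // e.g. ".pdata$hot"
}

int main() {
  std::cout << unwindSectionFor(".pdata", ".text") << "\n";      // .pdata
  std::cout << unwindSectionFor(".pdata", ".text$hot") << "\n";  // .pdata$hot
  std::cout << unwindSectionFor(".xdata", ".mycode") << "\n";    // .xdata$.mycode
}

The new path in MCStreamer::getAssociatedPDataSection/getAssociatedXDataSection replaces this string matching with getAssociativeCOFFSection() plus a per-text-section unique ID, as the earlier MCStreamer hunk shows.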
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 324385fa132a..e39271949d94 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -334,7 +334,7 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD,
if (AliaseeInfo)
SectionIndex = AliaseeInfo->SectionIndex;
Symbol = AliasedSymbol;
- // FIXME: Should this update Data as well? Do we need OrigSymbol at all?
+ // FIXME: Should this update Data as well?
}
// Set the N_TYPE bits. See <mach-o/nlist.h>.
@@ -377,7 +377,9 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD,
// The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
// value.
- write16(cast<MCSymbolMachO>(Symbol)->getEncodedFlags());
+ bool EncodeAsAltEntry =
+ IsAlias && cast<MCSymbolMachO>(OrigSymbol).isAltEntry();
+ write16(cast<MCSymbolMachO>(Symbol)->getEncodedFlags(EncodeAsAltEntry));
if (is64Bit())
write64(Address);
else
@@ -404,7 +406,7 @@ static unsigned ComputeLinkerOptionsLoadCommandSize(
unsigned Size = sizeof(MachO::linker_option_command);
for (const std::string &Option : Options)
Size += Option.size() + 1;
- return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
+ return alignTo(Size, is64Bit ? 8 : 4);
}
void MachObjectWriter::writeLinkerOptionsLoadCommand(
@@ -455,6 +457,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS &&
+ Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS &&
Section.getType() != MachO::S_SYMBOL_STUBS) {
MCSymbol &Symbol = *it->Symbol;
report_fatal_error("indirect symbol '" + Symbol.getName() +
@@ -468,7 +471,8 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
ie = Asm.indirect_symbol_end(); it != ie; ++it, ++IndirectIndex) {
const MCSectionMachO &Section = cast<MCSectionMachO>(*it->Section);
- if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS)
+ if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
+ Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS)
continue;
// Initialize the section indirect symbol base, if necessary.
@@ -606,7 +610,7 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
const MCAsmLayout &Layout) {
uint64_t StartAddress = 0;
for (const MCSection *Sec : Layout.getSectionOrder()) {
- StartAddress = RoundUpToAlignment(StartAddress, Sec->getAlignment());
+ StartAddress = alignTo(StartAddress, Sec->getAlignment());
SectionAddress[Sec] = StartAddress;
StartAddress += Layout.getSectionAddressSize(Sec);
@@ -736,7 +740,7 @@ void MachObjectWriter::writeObject(MCAssembler &Asm,
// Add the loh load command size, if used.
uint64_t LOHRawSize = Asm.getLOHContainer().getEmitSize(*this, Layout);
- uint64_t LOHSize = RoundUpToAlignment(LOHRawSize, is64Bit() ? 8 : 4);
+ uint64_t LOHSize = alignTo(LOHRawSize, is64Bit() ? 8 : 4);
if (LOHSize) {
++NumLoadCommands;
LoadCommandsSize += sizeof(MachO::linkedit_data_command);
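
Several hunks in MachObjectWriter.cpp (and later in WinCOFFObjectWriter.cpp) replace RoundUpToAlignment() with alignTo(); both round a value up to the next multiple of an alignment. A small self-contained sketch of the arithmetic, not LLVM's implementation, just the same result:

#include <cassert>
#include <cstdint>
#include <iostream>

// Round Value up to the next multiple of Align (Align > 0).
static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && "alignment must be non-zero");
  return (Value + Align - 1) / Align * Align;
}

int main() {
  std::cout << alignToSketch(10, 8) << "\n";  // 16: load-command size padded to 8
  std::cout << alignToSketch(10, 4) << "\n";  // 12: raw section data padded to 4
  std::cout << alignToSketch(16, 8) << "\n";  // 16: already aligned, unchanged
}

In the hunks above it pads linker-option and LOH load-command sizes to 8 or 4 bytes and rounds section start addresses up to their declared alignment.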
diff --git a/lib/MC/Makefile b/lib/MC/Makefile
deleted file mode 100644
index bf8b7c0e7831..000000000000
--- a/lib/MC/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/MC/Makefile -------------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMMC
-BUILD_ARCHIVE := 1
-PARALLEL_DIRS := MCParser MCDisassembler
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/MC/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp
index 80e552287b3d..9d95952a6d30 100644
--- a/lib/MC/StringTableBuilder.cpp
+++ b/lib/MC/StringTableBuilder.cpp
@@ -16,13 +16,29 @@
using namespace llvm;
-StringTableBuilder::StringTableBuilder(Kind K) : K(K) {}
+StringTableBuilder::StringTableBuilder(Kind K, unsigned Alignment)
+ : K(K), Alignment(Alignment) {
+ // Account for leading bytes in table so that offsets returned from add are
+ // correct.
+ switch (K) {
+ case RAW:
+ Size = 0;
+ break;
+ case MachO:
+ case ELF:
+ Size = 1;
+ break;
+ case WinCOFF:
+ Size = 4;
+ break;
+ }
+}
-typedef std::pair<StringRef, size_t> StringPair;
+typedef std::pair<CachedHash<StringRef>, size_t> StringPair;
// Returns the character at Pos from end of a string.
static int charTailAt(StringPair *P, size_t Pos) {
- StringRef S = P->first;
+ StringRef S = P->first.Val;
if (Pos >= S.size())
return -1;
return (unsigned char)S[S.size() - Pos - 1];
@@ -62,13 +78,32 @@ tailcall:
}
void StringTableBuilder::finalize() {
- std::vector<std::pair<StringRef, size_t> *> Strings;
+ finalizeStringTable(/*Optimize=*/true);
+}
+
+void StringTableBuilder::finalizeInOrder() {
+ finalizeStringTable(/*Optimize=*/false);
+}
+
+void StringTableBuilder::finalizeStringTable(bool Optimize) {
+ typedef std::pair<CachedHash<StringRef>, size_t> StringOffsetPair;
+ std::vector<StringOffsetPair *> Strings;
Strings.reserve(StringIndexMap.size());
- for (std::pair<StringRef, size_t> &P : StringIndexMap)
+ for (StringOffsetPair &P : StringIndexMap)
Strings.push_back(&P);
- if (!Strings.empty())
- multikey_qsort(&Strings[0], &Strings[0] + Strings.size(), 0);
+ if (!Strings.empty()) {
+ // If we're optimizing, sort by name. If not, sort by previously assigned
+ // offset.
+ if (Optimize) {
+ multikey_qsort(&Strings[0], &Strings[0] + Strings.size(), 0);
+ } else {
+ std::sort(Strings.begin(), Strings.end(),
+ [](const StringOffsetPair *LHS, const StringOffsetPair *RHS) {
+ return LHS->second < RHS->second;
+ });
+ }
+ }
switch (K) {
case RAW:
@@ -85,17 +120,28 @@ void StringTableBuilder::finalize() {
}
StringRef Previous;
- for (std::pair<StringRef, size_t> *P : Strings) {
- StringRef S = P->first;
+ for (StringOffsetPair *P : Strings) {
+ StringRef S = P->first.Val;
if (K == WinCOFF)
assert(S.size() > COFF::NameSize && "Short string in COFF string table!");
- if (Previous.endswith(S)) {
- P->second = StringTable.size() - S.size() - (K != RAW);
- continue;
+ if (Optimize && Previous.endswith(S)) {
+ size_t Pos = StringTable.size() - S.size() - (K != RAW);
+ if (!(Pos & (Alignment - 1))) {
+ P->second = Pos;
+ continue;
+ }
+ }
+
+ if (Optimize) {
+ size_t Start = alignTo(StringTable.size(), Alignment);
+ P->second = Start;
+ StringTable.append(Start - StringTable.size(), '\0');
+ } else {
+ assert(P->second == StringTable.size() &&
+ "different strtab offset after finalization");
}
- P->second = StringTable.size();
StringTable += S;
if (K != RAW)
StringTable += '\x00';
@@ -137,8 +183,9 @@ size_t StringTableBuilder::getOffset(StringRef S) const {
size_t StringTableBuilder::add(StringRef S) {
assert(!isFinalized());
- auto P = StringIndexMap.insert(std::make_pair(S, Size));
+ size_t Start = alignTo(Size, Alignment);
+ auto P = StringIndexMap.insert(std::make_pair(S, Start));
if (P.second)
- Size += S.size() + (K != RAW);
+ Size = Start + S.size() + (K != RAW);
return P.first->second;
}
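
StringTableBuilder now takes an Alignment and still performs tail merging (a string that is a suffix of the previously emitted string reuses its tail), but only when the reused offset is aligned; otherwise the string starts at the next aligned offset. Below is a compact sketch of that placement logic; the hashing, the per-kind leading bytes, and the multikey radix sort are left out, and the suffix-clustering sort used here is an assumed simplification:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Build a NUL-terminated string table with tail merging and alignment,
// mirroring the placement rules in the hunk above. Assumes a power-of-two
// alignment so the mask tricks below are valid.
static std::string buildTable(std::vector<std::string> Strings, size_t Alignment,
                              std::map<std::string, size_t> &Offsets) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  // Cluster strings sharing a suffix, longest first (a simple stand-in for
  // the multikey radix sort used by StringTableBuilder).
  std::sort(Strings.begin(), Strings.end(),
            [](const std::string &A, const std::string &B) {
              return std::string(A.rbegin(), A.rend()) >
                     std::string(B.rbegin(), B.rend());
            });
  std::string Table;
  std::string Previous;
  for (const std::string &S : Strings) {
    // Tail merge: "bar" can reuse the tail of an already emitted "foobar",
    // but only if the reused offset satisfies the alignment.
    if (Previous.size() >= S.size() &&
        Previous.compare(Previous.size() - S.size(), S.size(), S) == 0) {
      size_t Pos = Table.size() - S.size() - 1; // -1 for the terminating NUL
      if ((Pos & (Alignment - 1)) == 0) {
        Offsets[S] = Pos;
        continue;
      }
    }
    // Otherwise start the string at the next aligned offset.
    size_t Start = (Table.size() + Alignment - 1) & ~(Alignment - 1);
    Table.append(Start - Table.size(), '\0');
    Offsets[S] = Start;
    Table += S;
    Table += '\0';
    Previous = S;
  }
  return Table;
}

int main() {
  std::map<std::string, size_t> Offsets;
  buildTable({"foobar", "bar"}, 1, Offsets);
  std::cout << "foobar at " << Offsets["foobar"] << ", bar at "
            << Offsets["bar"] << "\n";   // bar shares foobar's tail: 0 and 3
}

finalizeInOrder() corresponds to the Optimize=false path above: it keeps the offsets handed out by add() instead of re-sorting, and asserts that they are unchanged.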
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 7cce0fe756ef..a97cd1db6932 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
@@ -284,7 +285,7 @@ void SubtargetFeatures::print(raw_ostream &OS) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump feature info.
///
-void SubtargetFeatures::dump() const {
+LLVM_DUMP_METHOD void SubtargetFeatures::dump() const {
print(dbgs());
}
#endif
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index a76cbdbd5447..f316a5af387d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -109,7 +109,6 @@ public:
relocations Relocations;
COFFSection(StringRef name);
- static size_t size();
};
class WinCOFFObjectWriter : public MCObjectWriter {
@@ -155,6 +154,8 @@ public:
object_t *createCOFFEntity(StringRef Name, list_t &List);
void defineSection(MCSectionCOFF const &Sec);
+
+ COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
void DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler,
const MCAsmLayout &Layout);
@@ -222,8 +223,6 @@ COFFSection::COFFSection(StringRef name)
memset(&Header, 0, sizeof(Header));
}
-size_t COFFSection::size() { return COFF::SectionSize; }
-
//------------------------------------------------------------------------------
// WinCOFFObjectWriter class implementation
@@ -353,34 +352,52 @@ static uint64_t getSymbolValue(const MCSymbol &Symbol,
return Res;
}
+COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
+ if (!Symbol.isVariable())
+ return nullptr;
+
+ const MCSymbolRefExpr *SymRef =
+ dyn_cast<MCSymbolRefExpr>(Symbol.getVariableValue());
+ if (!SymRef)
+ return nullptr;
+
+ const MCSymbol &Aliasee = SymRef->getSymbol();
+ if (!Aliasee.isUndefined())
+ return nullptr;
+ return GetOrCreateCOFFSymbol(&Aliasee);
+}
+
/// This function takes a symbol data object from the assembler
/// and creates the associated COFF symbol staging object.
void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
MCAssembler &Assembler,
const MCAsmLayout &Layout) {
COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
+ const MCSymbol *Base = Layout.getBaseSymbol(Symbol);
+ COFFSection *Sec = nullptr;
+ if (Base && Base->getFragment()) {
+ Sec = SectionMap[Base->getFragment()->getParent()];
+ if (coff_symbol->Section && coff_symbol->Section != Sec)
+ report_fatal_error("conflicting sections for symbol");
+ }
+ COFFSymbol *Local = nullptr;
if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) {
coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
- if (Symbol.isVariable()) {
- const MCSymbolRefExpr *SymRef =
- dyn_cast<MCSymbolRefExpr>(Symbol.getVariableValue());
-
- if (!SymRef)
- report_fatal_error("Weak externals may only alias symbols");
-
- coff_symbol->Other = GetOrCreateCOFFSymbol(&SymRef->getSymbol());
- } else {
+ COFFSymbol *WeakDefault = getLinkedSymbol(Symbol);
+ if (!WeakDefault) {
std::string WeakName = (".weak." + Symbol.getName() + ".default").str();
- COFFSymbol *WeakDefault = createSymbol(WeakName);
- WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
- WeakDefault->Data.StorageClass = COFF::IMAGE_SYM_CLASS_EXTERNAL;
- WeakDefault->Data.Type = 0;
- WeakDefault->Data.Value = 0;
- coff_symbol->Other = WeakDefault;
+ WeakDefault = createSymbol(WeakName);
+ if (!Sec)
+ WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+ else
+ WeakDefault->Section = Sec;
+ Local = WeakDefault;
}
+ coff_symbol->Other = WeakDefault;
+
// Setup the Weak External auxiliary symbol.
coff_symbol->Aux.resize(1);
memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
@@ -388,47 +405,37 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0;
coff_symbol->Aux[0].Aux.WeakExternal.Characteristics =
COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY;
-
- coff_symbol->MC = &Symbol;
} else {
- const MCSymbol *Base = Layout.getBaseSymbol(Symbol);
- coff_symbol->Data.Value = getSymbolValue(Symbol, Layout);
+ if (!Base)
+ coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+ else
+ coff_symbol->Section = Sec;
+ Local = coff_symbol;
+ }
+
+ if (Local) {
+ Local->Data.Value = getSymbolValue(Symbol, Layout);
const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(Symbol);
- coff_symbol->Data.Type = SymbolCOFF.getType();
- coff_symbol->Data.StorageClass = SymbolCOFF.getClass();
+ Local->Data.Type = SymbolCOFF.getType();
+ Local->Data.StorageClass = SymbolCOFF.getClass();
// If no storage class was specified in the streamer, define it here.
- if (coff_symbol->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
+ if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
bool IsExternal = Symbol.isExternal() ||
(!Symbol.getFragment() && !Symbol.isVariable());
- coff_symbol->Data.StorageClass = IsExternal
- ? COFF::IMAGE_SYM_CLASS_EXTERNAL
- : COFF::IMAGE_SYM_CLASS_STATIC;
- }
-
- if (!Base) {
- coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
- } else {
- if (Base->getFragment()) {
- COFFSection *Sec = SectionMap[Base->getFragment()->getParent()];
-
- if (coff_symbol->Section && coff_symbol->Section != Sec)
- report_fatal_error("conflicting sections for symbol");
-
- coff_symbol->Section = Sec;
- }
+ Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
+ : COFF::IMAGE_SYM_CLASS_STATIC;
}
-
- coff_symbol->MC = &Symbol;
}
+
+ coff_symbol->MC = &Symbol;
}
// Maximum offsets for different string table entry encodings.
-static const unsigned Max6DecimalOffset = 999999;
-static const unsigned Max7DecimalOffset = 9999999;
-static const uint64_t MaxBase64Offset = 0xFFFFFFFFFULL; // 64^6, including 0
+enum : unsigned { Max7DecimalOffset = 9999999U };
+enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0
// Encode a string table entry offset in base 64, padded to 6 chars, and
// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ...
@@ -456,22 +463,21 @@ void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
if (S.Name.size() > COFF::NameSize) {
uint64_t StringTableEntry = Strings.getOffset(S.Name);
- if (StringTableEntry <= Max6DecimalOffset) {
- std::sprintf(S.Header.Name, "/%d", unsigned(StringTableEntry));
- } else if (StringTableEntry <= Max7DecimalOffset) {
- // With seven digits, we have to skip the terminating null. Because
- // sprintf always appends it, we use a larger temporary buffer.
- char buffer[9] = {};
- std::sprintf(buffer, "/%d", unsigned(StringTableEntry));
- std::memcpy(S.Header.Name, buffer, 8);
+ if (StringTableEntry <= Max7DecimalOffset) {
+ SmallVector<char, COFF::NameSize> Buffer;
+ Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
+ assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
+
+ std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
} else if (StringTableEntry <= MaxBase64Offset) {
// Starting with 10,000,000, offsets are encoded as base64.
encodeBase64StringEntry(S.Header.Name, StringTableEntry);
} else {
report_fatal_error("COFF string table is greater than 64 GB.");
}
- } else
+ } else {
std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
+ }
}
void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
@@ -530,48 +536,47 @@ void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
void WinCOFFObjectWriter::WriteAuxiliarySymbols(
const COFFSymbol::AuxiliarySymbols &S) {
- for (COFFSymbol::AuxiliarySymbols::const_iterator i = S.begin(), e = S.end();
- i != e; ++i) {
- switch (i->AuxType) {
+ for (const AuxSymbol &i : S) {
+ switch (i.AuxType) {
case ATFunctionDefinition:
- writeLE32(i->Aux.FunctionDefinition.TagIndex);
- writeLE32(i->Aux.FunctionDefinition.TotalSize);
- writeLE32(i->Aux.FunctionDefinition.PointerToLinenumber);
- writeLE32(i->Aux.FunctionDefinition.PointerToNextFunction);
- WriteZeros(sizeof(i->Aux.FunctionDefinition.unused));
+ writeLE32(i.Aux.FunctionDefinition.TagIndex);
+ writeLE32(i.Aux.FunctionDefinition.TotalSize);
+ writeLE32(i.Aux.FunctionDefinition.PointerToLinenumber);
+ writeLE32(i.Aux.FunctionDefinition.PointerToNextFunction);
+ WriteZeros(sizeof(i.Aux.FunctionDefinition.unused));
if (UseBigObj)
WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATbfAndefSymbol:
- WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused1));
- writeLE16(i->Aux.bfAndefSymbol.Linenumber);
- WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused2));
- writeLE32(i->Aux.bfAndefSymbol.PointerToNextFunction);
- WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused3));
+ WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused1));
+ writeLE16(i.Aux.bfAndefSymbol.Linenumber);
+ WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused2));
+ writeLE32(i.Aux.bfAndefSymbol.PointerToNextFunction);
+ WriteZeros(sizeof(i.Aux.bfAndefSymbol.unused3));
if (UseBigObj)
WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATWeakExternal:
- writeLE32(i->Aux.WeakExternal.TagIndex);
- writeLE32(i->Aux.WeakExternal.Characteristics);
- WriteZeros(sizeof(i->Aux.WeakExternal.unused));
+ writeLE32(i.Aux.WeakExternal.TagIndex);
+ writeLE32(i.Aux.WeakExternal.Characteristics);
+ WriteZeros(sizeof(i.Aux.WeakExternal.unused));
if (UseBigObj)
WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
case ATFile:
writeBytes(
- StringRef(reinterpret_cast<const char *>(&i->Aux),
+ StringRef(reinterpret_cast<const char *>(&i.Aux),
UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size));
break;
case ATSectionDefinition:
- writeLE32(i->Aux.SectionDefinition.Length);
- writeLE16(i->Aux.SectionDefinition.NumberOfRelocations);
- writeLE16(i->Aux.SectionDefinition.NumberOfLinenumbers);
- writeLE32(i->Aux.SectionDefinition.CheckSum);
- writeLE16(static_cast<int16_t>(i->Aux.SectionDefinition.Number));
- write8(i->Aux.SectionDefinition.Selection);
- WriteZeros(sizeof(i->Aux.SectionDefinition.unused));
- writeLE16(static_cast<int16_t>(i->Aux.SectionDefinition.Number >> 16));
+ writeLE32(i.Aux.SectionDefinition.Length);
+ writeLE16(i.Aux.SectionDefinition.NumberOfRelocations);
+ writeLE16(i.Aux.SectionDefinition.NumberOfLinenumbers);
+ writeLE32(i.Aux.SectionDefinition.CheckSum);
+ writeLE16(static_cast<int16_t>(i.Aux.SectionDefinition.Number));
+ write8(i.Aux.SectionDefinition.Selection);
+ WriteZeros(sizeof(i.Aux.SectionDefinition.unused));
+ writeLE16(static_cast<int16_t>(i.Aux.SectionDefinition.Number >> 16));
if (UseBigObj)
WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
break;
@@ -787,6 +792,10 @@ void WinCOFFObjectWriter::recordRelocation(
}
}
+ // The fixed value never makes sense for section indices; ignore it.
+ if (Fixup.getKind() == FK_SecRel_2)
+ FixedValue = 0;
+
if (TargetObjectWriter->recordRelocation(Fixup))
coff_section->Relocations.push_back(Reloc);
}
@@ -924,7 +933,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
if (IsPhysicalSection(Sec)) {
// Align the section data to a four byte boundary.
- offset = RoundUpToAlignment(offset, 4);
+ offset = alignTo(offset, 4);
Sec->Header.PointerToRawData = offset;
offset += Sec->Header.SizeOfRawData;
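
SetSectionName() above encodes long COFF section names (longer than the 8-byte COFF::NameSize field) as a reference into the string table: "/<decimal offset>" while the offset fits in 7 digits, then "//" plus six base64 digits up to MaxBase64Offset. A hedged sketch of that encoding; the base64 digit order below is the conventional one and an assumption of this sketch:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>

// Produce the characters stored in the 8-byte section-header Name field for a
// long name that actually lives in the string table at Offset.
static std::string encodeSectionNameField(uint64_t Offset) {
  const uint64_t Max7DecimalOffset = 9999999ULL;
  const uint64_t MaxBase64Offset = 0xFFFFFFFFFULL; // 64^6 values, including 0
  if (Offset <= Max7DecimalOffset)
    return "/" + std::to_string(Offset);          // at most 8 characters
  assert(Offset <= MaxBase64Offset && "string table is too large");
  const char Digits[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  std::string Name = "//AAAAAA";
  for (int I = 7; I >= 2; --I) {                  // fill digits right to left
    Name[I] = Digits[Offset % 64];
    Offset /= 64;
  }
  return Name;
}

int main() {
  std::cout << encodeSectionNameField(42) << "\n";        // "/42"
  std::cout << encodeSectionNameField(10000000) << "\n";  // "//" + base64 digits
}

The resulting characters are copied into the 8-byte Name field without a terminating NUL, which is why the hunk switches from sprintf (which always appends one) to a Twine rendered into a small buffer and then memcpy'd.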
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index a38b1a41a9b0..5c6407ef1e5c 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -75,7 +75,8 @@ void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
-void MCWinCOFFStreamer::EmitLabel(MCSymbol *Symbol) {
+void MCWinCOFFStreamer::EmitLabel(MCSymbol *S) {
+ auto *Symbol = cast<MCSymbolCOFF>(S);
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
MCObjectStreamer::EmitLabel(Symbol);
}
@@ -88,25 +89,23 @@ void MCWinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
llvm_unreachable("not implemented");
}
-bool MCWinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCWinCOFFStreamer::EmitSymbolAttribute(MCSymbol *S,
MCSymbolAttr Attribute) {
- assert(Symbol && "Symbol must be non-null!");
- assert((!Symbol->isInSection() ||
- Symbol->getSection().getVariant() == MCSection::SV_COFF) &&
- "Got non-COFF section in the COFF backend!");
-
+ auto *Symbol = cast<MCSymbolCOFF>(S);
getAssembler().registerSymbol(*Symbol);
switch (Attribute) {
default: return false;
case MCSA_WeakReference:
case MCSA_Weak:
- cast<MCSymbolCOFF>(Symbol)->setIsWeakExternal();
+ Symbol->setIsWeakExternal();
Symbol->setExternal(true);
break;
case MCSA_Global:
Symbol->setExternal(true);
break;
+ case MCSA_AltEntry:
+ llvm_unreachable("COFF doesn't support the .alt_entry attribute");
}
return true;
@@ -116,11 +115,8 @@ void MCWinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) {
- assert((!Symbol->isInSection() ||
- Symbol->getSection().getVariant() == MCSection::SV_COFF) &&
- "Got non-COFF section in the COFF backend!");
-
+void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *S) {
+ auto *Symbol = cast<MCSymbolCOFF>(S);
if (CurSymbol)
Error("starting a new symbol definition without completing the "
"previous one");
@@ -207,11 +203,9 @@ void MCWinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
-void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
- assert((!Symbol->isInSection() ||
- Symbol->getSection().getVariant() == MCSection::SV_COFF) &&
- "Got non-COFF section in the COFF backend!");
+ auto *Symbol = cast<MCSymbolCOFF>(S);
const Triple &T = getContext().getObjectFileInfo()->getTargetTriple();
if (T.isKnownWindowsMSVCEnvironment()) {
@@ -241,9 +235,9 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
}
}
-void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
- assert(!Symbol->isInSection() && "Symbol must not already have a section!");
+ auto *Symbol = cast<MCSymbolCOFF>(S);
MCSection *Section = getContext().getObjectFileInfo()->getBSSSection();
getAssembler().registerSection(*Section);
@@ -258,7 +252,7 @@ void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
ByteAlignment, Section);
MCFillFragment *Fragment = new MCFillFragment(
- /*Value=*/0, /*ValueSize=*/0, Size, Section);
+ /*Value=*/0, Size, Section);
Symbol->setFragment(Fragment);
}
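
The WinCOFFStreamer hunks above replace hand-written "Got non-COFF section" asserts with cast<MCSymbolCOFF>(S), which in builds with assertions checks the dynamic type through the usual LLVM RTTI scheme. A minimal sketch of that idiom with a hypothetical Symbol/COFFSymbol pair (not the MC classes):

#include "llvm/Support/Casting.h"
#include <iostream>

using namespace llvm;

// Hypothetical two-class hierarchy wired up for isa<>/cast<>/dyn_cast<> via
// a kind enum and a classof() predicate.
class Symbol {
public:
  enum SymbolKind { SK_Generic, SK_COFF };
  explicit Symbol(SymbolKind K) : Kind(K) {}
  SymbolKind getKind() const { return Kind; }

private:
  const SymbolKind Kind;
};

class COFFSymbol : public Symbol {
public:
  COFFSymbol() : Symbol(SK_COFF) {}
  bool isWeakExternal() const { return false; }
  static bool classof(const Symbol *S) { return S->getKind() == SK_COFF; }
};

int main() {
  COFFSymbol C;
  Symbol *S = &C;
  // isa<> queries the dynamic kind; cast<> asserts it and converts.
  if (isa<COFFSymbol>(S))
    std::cout << "weak external? " << cast<COFFSymbol>(S)->isWeakExternal()
              << "\n";
}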
diff --git a/lib/Makefile b/lib/Makefile
deleted file mode 100644
index 9b76126b80a9..000000000000
--- a/lib/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- lib/Makefile ----------------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ..
-
-include $(LEVEL)/Makefile.config
-
-PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target \
- ExecutionEngine Linker LTO MC Object Option DebugInfo \
- IRReader LineEditor ProfileData Passes LibDriver
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 99b0650c8b7e..daf301e2e7e4 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/Archive.h"
-#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Endian.h"
@@ -52,14 +51,14 @@ ErrorOr<uint32_t> ArchiveMemberHeader::getSize() const {
sys::fs::perms ArchiveMemberHeader::getAccessMode() const {
unsigned Ret;
- if (StringRef(AccessMode, sizeof(AccessMode)).rtrim(" ").getAsInteger(8, Ret))
+ if (StringRef(AccessMode, sizeof(AccessMode)).rtrim(' ').getAsInteger(8, Ret))
llvm_unreachable("Access mode is not an octal number.");
return static_cast<sys::fs::perms>(Ret);
}
sys::TimeValue ArchiveMemberHeader::getLastModified() const {
unsigned Seconds;
- if (StringRef(LastModified, sizeof(LastModified)).rtrim(" ")
+ if (StringRef(LastModified, sizeof(LastModified)).rtrim(' ')
.getAsInteger(10, Seconds))
llvm_unreachable("Last modified time not a decimal number.");
@@ -70,14 +69,20 @@ sys::TimeValue ArchiveMemberHeader::getLastModified() const {
unsigned ArchiveMemberHeader::getUID() const {
unsigned Ret;
- if (StringRef(UID, sizeof(UID)).rtrim(" ").getAsInteger(10, Ret))
+ StringRef User = StringRef(UID, sizeof(UID)).rtrim(' ');
+ if (User.empty())
+ return 0;
+ if (User.getAsInteger(10, Ret))
llvm_unreachable("UID time not a decimal number.");
return Ret;
}
unsigned ArchiveMemberHeader::getGID() const {
unsigned Ret;
- if (StringRef(GID, sizeof(GID)).rtrim(" ").getAsInteger(10, Ret))
+ StringRef Group = StringRef(GID, sizeof(GID)).rtrim(' ');
+ if (Group.empty())
+ return 0;
+ if (Group.getAsInteger(10, Ret))
llvm_unreachable("GID time not a decimal number.");
return Ret;
}
@@ -108,7 +113,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start,
StringRef Name = getRawName();
if (Name.startswith("#1/")) {
uint64_t NameSize;
- if (Name.substr(3).rtrim(" ").getAsInteger(10, NameSize))
+ if (Name.substr(3).rtrim(' ').getAsInteger(10, NameSize))
llvm_unreachable("Long name length is not an integer");
StartOfFile += NameSize;
}
@@ -136,6 +141,21 @@ bool Archive::Child::isThinMember() const {
return Parent->IsThin && Name != "/" && Name != "//";
}
+ErrorOr<std::string> Archive::Child::getFullName() const {
+ assert(isThinMember());
+ ErrorOr<StringRef> NameOrErr = getName();
+ if (std::error_code EC = NameOrErr.getError())
+ return EC;
+ StringRef Name = *NameOrErr;
+ if (sys::path::is_absolute(Name))
+ return Name;
+
+ SmallString<128> FullName = sys::path::parent_path(
+ Parent->getMemoryBufferRef().getBufferIdentifier());
+ sys::path::append(FullName, Name);
+ return StringRef(FullName);
+}
+
ErrorOr<StringRef> Archive::Child::getBuffer() const {
if (!isThinMember()) {
ErrorOr<uint32_t> Size = getSize();
@@ -143,12 +163,10 @@ ErrorOr<StringRef> Archive::Child::getBuffer() const {
return EC;
return StringRef(Data.data() + StartOfFile, Size.get());
}
- ErrorOr<StringRef> Name = getName();
- if (std::error_code EC = Name.getError())
+ ErrorOr<std::string> FullNameOrEr = getFullName();
+ if (std::error_code EC = FullNameOrEr.getError())
return EC;
- SmallString<128> FullName = sys::path::parent_path(
- Parent->getMemoryBufferRef().getBufferIdentifier());
- sys::path::append(FullName, *Name);
+ const std::string &FullName = *FullNameOrEr;
ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(FullName);
if (std::error_code EC = Buf.getError())
return EC;
@@ -197,7 +215,7 @@ ErrorOr<StringRef> Archive::Child::getName() const {
// It's a long name.
// Get the offset.
std::size_t offset;
- if (name.substr(1).rtrim(" ").getAsInteger(10, offset))
+ if (name.substr(1).rtrim(' ').getAsInteger(10, offset))
llvm_unreachable("Long name offset is not an integer");
// Verify it.
@@ -213,10 +231,14 @@ ErrorOr<StringRef> Archive::Child::getName() const {
return StringRef(addr);
} else if (name.startswith("#1/")) {
uint64_t name_size;
- if (name.substr(3).rtrim(" ").getAsInteger(10, name_size))
+ if (name.substr(3).rtrim(' ').getAsInteger(10, name_size))
llvm_unreachable("Long name length is not an ingeter");
- return Data.substr(sizeof(ArchiveMemberHeader), name_size)
- .rtrim(StringRef("\0", 1));
+ return Data.substr(sizeof(ArchiveMemberHeader), name_size).rtrim('\0');
+ } else {
+ // It is not a long name so trim the blanks at the end of the name.
+ if (name[name.size() - 1] != '/') {
+ return name.rtrim(' ');
+ }
}
// It's a simple name.
if (name[name.size() - 1] == '/')
@@ -235,20 +257,23 @@ ErrorOr<MemoryBufferRef> Archive::Child::getMemoryBufferRef() const {
return MemoryBufferRef(*Buf, Name);
}
-ErrorOr<std::unique_ptr<Binary>>
+Expected<std::unique_ptr<Binary>>
Archive::Child::getAsBinary(LLVMContext *Context) const {
ErrorOr<MemoryBufferRef> BuffOrErr = getMemoryBufferRef();
if (std::error_code EC = BuffOrErr.getError())
- return EC;
+ return errorCodeToError(EC);
- return createBinary(BuffOrErr.get(), Context);
+ auto BinaryOrErr = createBinary(BuffOrErr.get(), Context);
+ if (BinaryOrErr)
+ return std::move(*BinaryOrErr);
+ return BinaryOrErr.takeError();
}
-ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
- std::error_code EC;
- std::unique_ptr<Archive> Ret(new Archive(Source, EC));
- if (EC)
- return EC;
+Expected<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
+ Error Err;
+ std::unique_ptr<Archive> Ret(new Archive(Source, Err));
+ if (Err)
+ return std::move(Err);
return std::move(Ret);
}
@@ -257,8 +282,9 @@ void Archive::setFirstRegular(const Child &C) {
FirstRegularStartOfFile = C.StartOfFile;
}
-Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
+Archive::Archive(MemoryBufferRef Source, Error &Err)
: Binary(Binary::ID_Archive, Source) {
+ ErrorAsOutParameter ErrAsOutParam(Err);
StringRef Buffer = Data.getBuffer();
// Check for sufficient magic.
if (Buffer.startswith(ThinMagic)) {
@@ -266,27 +292,33 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
} else if (Buffer.startswith(Magic)) {
IsThin = false;
} else {
- ec = object_error::invalid_file_type;
+ Err = make_error<GenericBinaryError>("File too small to be an archive",
+ object_error::invalid_file_type);
return;
}
// Get the special members.
- child_iterator I = child_begin(false);
- if ((ec = I->getError()))
+ child_iterator I = child_begin(Err, false);
+ if (Err)
return;
child_iterator E = child_end();
+ // This is at least a valid empty archive. Since an empty archive is the
+ // same in all formats, just claim it to be gnu to make sure Format is
+ // initialized.
+ Format = K_GNU;
+
if (I == E) {
- ec = std::error_code();
+ Err = Error::success();
return;
}
- const Child *C = &**I;
+ const Child *C = &*I;
auto Increment = [&]() {
++I;
- if ((ec = I->getError()))
+ if (Err)
return true;
- C = &**I;
+ C = &*I;
return false;
};
@@ -311,8 +343,11 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
// seem to create the third member if there's no member whose filename
// exceeds 15 characters. So the third member is optional.
- if (Name == "__.SYMDEF") {
- Format = K_BSD;
+ if (Name == "__.SYMDEF" || Name == "__.SYMDEF_64") {
+ if (Name == "__.SYMDEF")
+ Format = K_BSD;
+ else // Name == "__.SYMDEF_64"
+ Format = K_DARWIN64;
// We know that the symbol table is not an external file, so we just assert
// there is no error.
SymbolTable = *C->getBuffer();
@@ -320,7 +355,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
return;
setFirstRegular(*C);
- ec = std::error_code();
+ Err = Error::success();
return;
}
@@ -328,9 +363,10 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
Format = K_BSD;
// We know this is BSD, so getName will work since there is no string table.
ErrorOr<StringRef> NameOrErr = C->getName();
- ec = NameOrErr.getError();
- if (ec)
+ if (auto ec = NameOrErr.getError()) {
+ Err = errorCodeToError(ec);
return;
+ }
Name = NameOrErr.get();
if (Name == "__.SYMDEF SORTED" || Name == "__.SYMDEF") {
// We know that the symbol table is not an external file, so we just
@@ -339,6 +375,14 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
if (Increment())
return;
}
+ else if (Name == "__.SYMDEF_64 SORTED" || Name == "__.SYMDEF_64") {
+ Format = K_DARWIN64;
+ // We know that the symbol table is not an external file, so we just
+ // assert there is no error.
+ SymbolTable = *C->getBuffer();
+ if (Increment())
+ return;
+ }
setFirstRegular(*C);
return;
}
@@ -359,7 +403,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
if (Increment())
return;
if (I == E) {
- ec = std::error_code();
+ Err = Error::success();
return;
}
Name = C->getRawName();
@@ -373,19 +417,19 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
if (Increment())
return;
setFirstRegular(*C);
- ec = std::error_code();
+ Err = Error::success();
return;
}
if (Name[0] != '/') {
Format = has64SymTable ? K_MIPS64 : K_GNU;
setFirstRegular(*C);
- ec = std::error_code();
+ Err = Error::success();
return;
}
if (Name != "/") {
- ec = object_error::parse_failed;
+ Err = errorCodeToError(object_error::parse_failed);
return;
}
@@ -399,7 +443,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
if (I == E) {
setFirstRegular(*C);
- ec = std::error_code();
+ Err = Error::success();
return;
}
@@ -414,26 +458,32 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
}
setFirstRegular(*C);
- ec = std::error_code();
+ Err = Error::success();
}
-Archive::child_iterator Archive::child_begin(bool SkipInternal) const {
+Archive::child_iterator Archive::child_begin(Error &Err,
+ bool SkipInternal) const {
if (Data.getBufferSize() == 8) // empty archive.
return child_end();
if (SkipInternal)
- return Child(this, FirstRegularData, FirstRegularStartOfFile);
+ return child_iterator(Child(this, FirstRegularData,
+ FirstRegularStartOfFile),
+ &Err);
const char *Loc = Data.getBufferStart() + strlen(Magic);
std::error_code EC;
- Child c(this, Loc, &EC);
- if (EC)
- return child_iterator(EC);
- return child_iterator(c);
+ Child C(this, Loc, &EC);
+ if (EC) {
+ ErrorAsOutParameter ErrAsOutParam(Err);
+ Err = errorCodeToError(EC);
+ return child_end();
+ }
+ return child_iterator(C, &Err);
}
Archive::child_iterator Archive::child_end() const {
- return Child(this, nullptr, nullptr);
+ return child_iterator(Child(this, nullptr, nullptr), nullptr);
}
StringRef Archive::Symbol::getName() const {
@@ -443,7 +493,7 @@ StringRef Archive::Symbol::getName() const {
ErrorOr<Archive::Child> Archive::Symbol::getMember() const {
const char *Buf = Parent->getSymbolTable().begin();
const char *Offsets = Buf;
- if (Parent->kind() == K_MIPS64)
+ if (Parent->kind() == K_MIPS64 || Parent->kind() == K_DARWIN64)
Offsets += sizeof(uint64_t);
else
Offsets += sizeof(uint32_t);
@@ -460,6 +510,14 @@ ErrorOr<Archive::Child> Archive::Symbol::getMember() const {
// the archive of the member that defines the symbol. Which is what
// is needed here.
Offset = read32le(Offsets + SymbolIndex * 8 + 4);
+ } else if (Parent->kind() == K_DARWIN64) {
+ // The SymbolIndex is an index into the ranlib_64 structs that start at
+ // Offsets (the first uint64_t is the number of bytes of the ranlib_64
+ // structs). The ranlib_64 structs are a pair of uint64_t's the first
+ // being a string table offset and the second being the offset into
+ // the archive of the member that defines the symbol. Which is what
+ // is needed here.
+ Offset = read64le(Offsets + SymbolIndex * 16 + 8);
} else {
// Skip offsets.
uint32_t MemberCount = read32le(Buf);
@@ -559,6 +617,22 @@ Archive::symbol_iterator Archive::symbol_begin() const {
// Skip the byte count of the string table.
buf += sizeof(uint32_t);
buf += ran_strx;
+ } else if (kind() == K_DARWIN64) {
+ // The __.SYMDEF_64 or "__.SYMDEF_64 SORTED" member starts with a uint64_t
+ // which is the number of bytes of ranlib_64 structs that follow. The
+ // ranlib_64 structs are a pair of uint64_t's the first being a string
+ // table offset and the second being the offset into the archive of the
+ // member that defines the symbol. After that the next uint64_t is the byte
+ // count of the string table followed by the string table.
+ uint64_t ranlib_count = 0;
+ ranlib_count = read64le(buf) / 16;
+ const char *ranlibs = buf + 8;
+ uint64_t ran_strx = 0;
+ ran_strx = read64le(ranlibs);
+ buf += sizeof(uint64_t) + (ranlib_count * (2 * (sizeof(uint64_t))));
+ // Skip the byte count of the string table.
+ buf += sizeof(uint64_t);
+ buf += ran_strx;
} else {
uint32_t member_count = 0;
uint32_t symbol_count = 0;
@@ -585,27 +659,28 @@ uint32_t Archive::getNumberOfSymbols() const {
return read64be(buf);
if (kind() == K_BSD)
return read32le(buf) / 8;
+ if (kind() == K_DARWIN64)
+ return read64le(buf) / 16;
uint32_t member_count = 0;
member_count = read32le(buf);
buf += 4 + (member_count * 4); // Skip offsets.
return read32le(buf);
}
-Archive::child_iterator Archive::findSym(StringRef name) const {
+Expected<Optional<Archive::Child>> Archive::findSym(StringRef name) const {
Archive::symbol_iterator bs = symbol_begin();
Archive::symbol_iterator es = symbol_end();
for (; bs != es; ++bs) {
StringRef SymName = bs->getName();
if (SymName == name) {
- ErrorOr<Archive::child_iterator> ResultOrErr = bs->getMember();
- // FIXME: Should we really eat the error?
- if (ResultOrErr.getError())
- return child_end();
- return ResultOrErr.get();
+ if (auto MemberOrErr = bs->getMember())
+ return Child(*MemberOrErr);
+ else
+ return errorCodeToError(MemberOrErr.getError());
}
}
- return child_end();
+ return Optional<Child>();
}
bool Archive::hasSymbolTable() const { return !SymbolTable.empty(); }
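
The new K_DARWIN64 handling reads the __.SYMDEF_64 (or "__.SYMDEF_64 SORTED") member: a uint64 byte count of ranlib_64 structs, the structs themselves (a string-table offset and a member offset, both uint64), then a uint64 string-table size followed by the strings. A standalone sketch of walking that layout; read64le below is a stand-in for the endian helper used above and assumes a little-endian host:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Read a little-endian uint64 at Buf + Off (stand-in for read64le above).
static uint64_t read64le(const uint8_t *Buf, size_t Off) {
  uint64_t V;
  std::memcpy(&V, Buf + Off, sizeof(V));   // assumes a little-endian host
  return V;
}

int main() {
  // Build a tiny __.SYMDEF_64-style table with one symbol, following the
  // layout described in the hunks above.
  std::vector<uint8_t> Table;
  auto Push64 = [&](uint64_t V) {
    for (int I = 0; I < 8; ++I)
      Table.push_back(uint8_t(V >> (8 * I)));
  };
  Push64(16);          // one ranlib_64 entry = 16 bytes
  Push64(0);           // entry 0: offset of the name in the string table
  Push64(1234);        // entry 0: offset of the defining member in the archive
  Push64(8);           // string table byte count
  const char Name[8] = "_main";
  Table.insert(Table.end(), Name, Name + 8);

  const uint8_t *Buf = Table.data();
  uint64_t NumSymbols = read64le(Buf, 0) / 16;   // as in getNumberOfSymbols()
  uint64_t StrTab = 8 + NumSymbols * 16 + 8;     // past both counts and entries
  for (uint64_t I = 0; I != NumSymbols; ++I) {
    uint64_t NameOff = read64le(Buf, 8 + I * 16);
    uint64_t MemberOff = read64le(Buf, 8 + I * 16 + 8);
    std::cout << (const char *)(Buf + StrTab + NameOff)
              << " defined by member at offset " << MemberOff << "\n";
  }
}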
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index c7343fdc171d..53573262104c 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -34,45 +34,61 @@
using namespace llvm;
-NewArchiveIterator::NewArchiveIterator(const object::Archive::Child &OldMember,
- StringRef Name)
- : IsNewMember(false), Name(Name), OldMember(OldMember) {}
-
-NewArchiveIterator::NewArchiveIterator(StringRef FileName)
- : IsNewMember(true), Name(FileName), OldMember(nullptr, nullptr, nullptr) {}
-
-StringRef NewArchiveIterator::getName() const { return Name; }
-
-bool NewArchiveIterator::isNewMember() const { return IsNewMember; }
-
-const object::Archive::Child &NewArchiveIterator::getOld() const {
- assert(!IsNewMember);
- return OldMember;
-}
-
-StringRef NewArchiveIterator::getNew() const {
- assert(IsNewMember);
- return Name;
+NewArchiveMember::NewArchiveMember(MemoryBufferRef BufRef)
+ : Buf(MemoryBuffer::getMemBuffer(BufRef, false)) {}
+
+Expected<NewArchiveMember>
+NewArchiveMember::getOldMember(const object::Archive::Child &OldMember,
+ bool Deterministic) {
+ ErrorOr<llvm::MemoryBufferRef> BufOrErr = OldMember.getMemoryBufferRef();
+ if (!BufOrErr)
+ return errorCodeToError(BufOrErr.getError());
+
+ NewArchiveMember M;
+ M.Buf = MemoryBuffer::getMemBuffer(*BufOrErr, false);
+ if (!Deterministic) {
+ M.ModTime = OldMember.getLastModified();
+ M.UID = OldMember.getUID();
+ M.GID = OldMember.getGID();
+ M.Perms = OldMember.getAccessMode();
+ }
+ return std::move(M);
}
-llvm::ErrorOr<int>
-NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const {
- assert(IsNewMember);
- int NewFD;
- if (auto EC = sys::fs::openFileForRead(Name, NewFD))
- return EC;
- assert(NewFD != -1);
+Expected<NewArchiveMember> NewArchiveMember::getFile(StringRef FileName,
+ bool Deterministic) {
+ sys::fs::file_status Status;
+ int FD;
+ if (auto EC = sys::fs::openFileForRead(FileName, FD))
+ return errorCodeToError(EC);
+ assert(FD != -1);
- if (auto EC = sys::fs::status(NewFD, NewStatus))
- return EC;
+ if (auto EC = sys::fs::status(FD, Status))
+ return errorCodeToError(EC);
// Opening a directory doesn't make sense. Let it fail.
// Linux cannot open directories with open(2), although
// cygwin and *bsd can.
- if (NewStatus.type() == sys::fs::file_type::directory_file)
- return make_error_code(errc::is_a_directory);
-
- return NewFD;
+ if (Status.type() == sys::fs::file_type::directory_file)
+ return errorCodeToError(make_error_code(errc::is_a_directory));
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
+ MemoryBuffer::getOpenFile(FD, FileName, Status.getSize(), false);
+ if (!MemberBufferOrErr)
+ return errorCodeToError(MemberBufferOrErr.getError());
+
+ if (close(FD) != 0)
+ return errorCodeToError(std::error_code(errno, std::generic_category()));
+
+ NewArchiveMember M;
+ M.Buf = std::move(*MemberBufferOrErr);
+ if (!Deterministic) {
+ M.ModTime = Status.getLastModificationTime();
+ M.UID = Status.getUser();
+ M.GID = Status.getGroup();
+ M.Perms = Status.permissions();
+ }
+ return std::move(M);
}
template <typename T>
@@ -178,12 +194,13 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
}
static void writeStringTable(raw_fd_ostream &Out, StringRef ArcName,
- ArrayRef<NewArchiveIterator> Members,
+ ArrayRef<NewArchiveMember> Members,
std::vector<unsigned> &StringMapIndexes,
bool Thin) {
unsigned StartOffset = 0;
- for (const NewArchiveIterator &I : Members) {
- StringRef Name = sys::path::filename(I.getName());
+ for (const NewArchiveMember &M : Members) {
+ StringRef Path = M.Buf->getBufferIdentifier();
+ StringRef Name = sys::path::filename(Path);
if (!useStringTable(Thin, Name))
continue;
if (StartOffset == 0) {
@@ -194,7 +211,7 @@ static void writeStringTable(raw_fd_ostream &Out, StringRef ArcName,
StringMapIndexes.push_back(Out.tell() - StartOffset);
if (Thin)
- Out << computeRelativePath(ArcName, I.getName());
+ Out << computeRelativePath(ArcName, Path);
else
Out << Name;
@@ -221,8 +238,7 @@ static sys::TimeValue now(bool Deterministic) {
// Returns the offset of the first reference to a member offset.
static ErrorOr<unsigned>
writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
- ArrayRef<NewArchiveIterator> Members,
- ArrayRef<MemoryBufferRef> Buffers,
+ ArrayRef<NewArchiveMember> Members,
std::vector<unsigned> &MemberOffsetRefs, bool Deterministic) {
unsigned HeaderStartOffset = 0;
unsigned BodyStartOffset = 0;
@@ -230,12 +246,15 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
raw_svector_ostream NameOS(NameBuf);
LLVMContext Context;
for (unsigned MemberNum = 0, N = Members.size(); MemberNum < N; ++MemberNum) {
- MemoryBufferRef MemberBuffer = Buffers[MemberNum];
- ErrorOr<std::unique_ptr<object::SymbolicFile>> ObjOrErr =
+ MemoryBufferRef MemberBuffer = Members[MemberNum].Buf->getMemBufferRef();
+ Expected<std::unique_ptr<object::SymbolicFile>> ObjOrErr =
object::SymbolicFile::createSymbolicFile(
MemberBuffer, sys::fs::file_magic::unknown, &Context);
- if (!ObjOrErr)
- continue; // FIXME: check only for "not an object file" errors.
+ if (!ObjOrErr) {
+ // FIXME: check only for "not an object file" errors.
+ consumeError(ObjOrErr.takeError());
+ continue;
+ }
object::SymbolicFile &Obj = *ObjOrErr.get();
if (!HeaderStartOffset) {
@@ -302,9 +321,12 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
std::pair<StringRef, std::error_code>
llvm::writeArchive(StringRef ArcName,
- std::vector<NewArchiveIterator> &NewMembers,
+ std::vector<NewArchiveMember> &NewMembers,
bool WriteSymtab, object::Archive::Kind Kind,
- bool Deterministic, bool Thin) {
+ bool Deterministic, bool Thin,
+ std::unique_ptr<MemoryBuffer> OldArchiveBuf) {
+ assert((!Thin || Kind == object::Archive::K_GNU) &&
+ "Only the gnu format has a thin mode");
SmallString<128> TmpArchive;
int TmpArchiveFD;
if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
@@ -324,44 +346,10 @@ llvm::writeArchive(StringRef ArcName,
std::vector<MemoryBufferRef> Members;
std::vector<sys::fs::file_status> NewMemberStatus;
- for (unsigned I = 0, N = NewMembers.size(); I < N; ++I) {
- NewArchiveIterator &Member = NewMembers[I];
- MemoryBufferRef MemberRef;
-
- if (Member.isNewMember()) {
- StringRef Filename = Member.getNew();
- NewMemberStatus.resize(NewMemberStatus.size() + 1);
- sys::fs::file_status &Status = NewMemberStatus.back();
- ErrorOr<int> FD = Member.getFD(Status);
- if (auto EC = FD.getError())
- return std::make_pair(Filename, EC);
- ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
- MemoryBuffer::getOpenFile(FD.get(), Filename, Status.getSize(),
- false);
- if (auto EC = MemberBufferOrErr.getError())
- return std::make_pair(Filename, EC);
- if (close(FD.get()) != 0)
- return std::make_pair(Filename,
- std::error_code(errno, std::generic_category()));
- Buffers.push_back(std::move(MemberBufferOrErr.get()));
- MemberRef = Buffers.back()->getMemBufferRef();
- } else {
- const object::Archive::Child &OldMember = Member.getOld();
- assert((!Thin || OldMember.getParent()->isThin()) &&
- "Thin archives cannot refers to member of other archives");
- ErrorOr<MemoryBufferRef> MemberBufferOrErr =
- OldMember.getMemoryBufferRef();
- if (auto EC = MemberBufferOrErr.getError())
- return std::make_pair("", EC);
- MemberRef = MemberBufferOrErr.get();
- }
- Members.push_back(MemberRef);
- }
-
unsigned MemberReferenceOffset = 0;
if (WriteSymtab) {
ErrorOr<unsigned> MemberReferenceOffsetOrErr = writeSymbolTable(
- Out, Kind, NewMembers, Members, MemberOffsetRefs, Deterministic);
+ Out, Kind, NewMembers, MemberOffsetRefs, Deterministic);
if (auto EC = MemberReferenceOffsetOrErr.getError())
return std::make_pair(ArcName, EC);
MemberReferenceOffset = MemberReferenceOffsetOrErr.get();
@@ -371,55 +359,18 @@ llvm::writeArchive(StringRef ArcName,
if (Kind != object::Archive::K_BSD)
writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin);
- unsigned MemberNum = 0;
- unsigned NewMemberNum = 0;
std::vector<unsigned>::iterator StringMapIndexIter = StringMapIndexes.begin();
std::vector<unsigned> MemberOffset;
- for (const NewArchiveIterator &I : NewMembers) {
- MemoryBufferRef File = Members[MemberNum++];
+ for (const NewArchiveMember &M : NewMembers) {
+ MemoryBufferRef File = M.Buf->getMemBufferRef();
unsigned Pos = Out.tell();
MemberOffset.push_back(Pos);
- sys::TimeValue ModTime;
- unsigned UID;
- unsigned GID;
- unsigned Perms;
- if (Deterministic) {
- ModTime.fromEpochTime(0);
- UID = 0;
- GID = 0;
- Perms = 0644;
- } else if (I.isNewMember()) {
- const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum];
- ModTime = Status.getLastModificationTime();
- UID = Status.getUser();
- GID = Status.getGroup();
- Perms = Status.permissions();
- } else {
- const object::Archive::Child &OldMember = I.getOld();
- ModTime = OldMember.getLastModified();
- UID = OldMember.getUID();
- GID = OldMember.getGID();
- Perms = OldMember.getAccessMode();
- }
-
- if (I.isNewMember()) {
- StringRef FileName = I.getNew();
- const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum++];
- printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName),
- StringMapIndexIter, ModTime, UID, GID, Perms,
- Status.getSize());
- } else {
- const object::Archive::Child &OldMember = I.getOld();
- ErrorOr<uint32_t> Size = OldMember.getSize();
- if (std::error_code EC = Size.getError())
- return std::make_pair("", EC);
- StringRef FileName = I.getName();
- printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName),
- StringMapIndexIter, ModTime, UID, GID, Perms,
- Size.get());
- }
+ printMemberHeader(Out, Kind, Thin,
+ sys::path::filename(M.Buf->getBufferIdentifier()),
+ StringMapIndexIter, M.ModTime, M.UID, M.GID, M.Perms,
+ M.Buf->getBufferSize());
if (!Thin)
Out << File.getBuffer();
@@ -439,6 +390,19 @@ llvm::writeArchive(StringRef ArcName,
Output.keep();
Out.close();
+
+ // At this point, we no longer need whatever backing memory
+ // was used to generate the NewMembers. On Windows, this buffer
+ // could be a mapped view of the file we want to replace (if
+ // we're updating an existing archive, say). In that case, the
+ // rename would still succeed, but it would leave behind a
+ // temporary file (actually the original file renamed) because
+ // a file cannot be deleted while there's a handle open on it,
+ // only renamed. So freeing this buffer here ensures that
+ // the last open handle on the destination file, if any, is
+ // closed before we attempt to rename.
+ OldArchiveBuf.reset();
+
sys::fs::rename(TmpArchive, ArcName);
return std::make_pair("", std::error_code());
}
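
NewArchiveMember now carries the per-member header metadata itself and fills it from the file or the old member only when a deterministic archive was not requested; the deterministic defaults (epoch mtime, uid/gid 0, perms 0644) match the branch deleted from writeArchive(). A small sketch of that choice, using plain POSIX stat() fields as a stand-in for sys::fs::file_status:

#include <ctime>
#include <iostream>
#include <sys/stat.h>

// Per-member header metadata, mirroring the fields NewArchiveMember carries.
struct MemberMeta {
  std::time_t ModTime = 0;  // deterministic default: the epoch
  unsigned UID = 0;
  unsigned GID = 0;
  unsigned Perms = 0644;    // deterministic default from the removed loop
};

// Fill metadata from a stat() result only when a reproducible archive was not
// requested; otherwise keep the fixed defaults above.
static MemberMeta metaForFile(const struct stat &St, bool Deterministic) {
  MemberMeta M;
  if (!Deterministic) {
    M.ModTime = St.st_mtime;
    M.UID = unsigned(St.st_uid);
    M.GID = unsigned(St.st_gid);
    M.Perms = unsigned(St.st_mode) & 07777;
  }
  return M;
}

int main() {
  struct stat St = {};
  St.st_mtime = 1700000000;
  St.st_uid = 1000;
  St.st_gid = 1000;
  St.st_mode = 0100755;
  MemberMeta D = metaForFile(St, /*Deterministic=*/true);
  MemberMeta N = metaForFile(St, /*Deterministic=*/false);
  std::cout << "deterministic perms " << std::oct << D.Perms
            << ", real perms " << N.Perms << "\n";   // 644 vs 755
}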
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index a2b167a665c5..ec051fec375b 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -36,7 +36,7 @@ StringRef Binary::getFileName() const { return Data.getBufferIdentifier(); }
MemoryBufferRef Binary::getMemoryBufferRef() const { return Data; }
-ErrorOr<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
+Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
LLVMContext *Context) {
sys::fs::file_magic Type = sys::fs::identify_magic(Buffer.getBuffer());
@@ -69,22 +69,22 @@ ErrorOr<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
case sys::fs::file_magic::unknown:
case sys::fs::file_magic::windows_resource:
// Unrecognized object file format.
- return object_error::invalid_file_type;
+ return errorCodeToError(object_error::invalid_file_type);
}
llvm_unreachable("Unexpected Binary File Type");
}
-ErrorOr<OwningBinary<Binary>> object::createBinary(StringRef Path) {
+Expected<OwningBinary<Binary>> object::createBinary(StringRef Path) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
if (std::error_code EC = FileOrErr.getError())
- return EC;
+ return errorCodeToError(EC);
std::unique_ptr<MemoryBuffer> &Buffer = FileOrErr.get();
- ErrorOr<std::unique_ptr<Binary>> BinOrErr =
+ Expected<std::unique_ptr<Binary>> BinOrErr =
createBinary(Buffer->getMemBufferRef());
- if (std::error_code EC = BinOrErr.getError())
- return EC;
+ if (!BinOrErr)
+ return BinOrErr.takeError();
std::unique_ptr<Binary> &Bin = BinOrErr.get();
return OwningBinary<Binary>(std::move(Bin), std::move(Buffer));
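
Binary.cpp and Archive.cpp above migrate from ErrorOr<T>/std::error_code to Expected<T>/Error: failures are wrapped with errorCodeToError(), propagated with takeError(), and must eventually be handled or consumed. A hedged sketch of the calling pattern; parseWidget/parseAndDouble are hypothetical, but the propagation calls are the ones used in the hunks above, and the snippet builds against LLVM's Support library:

#include "llvm/Support/Error.h"
#include <iostream>
#include <system_error>

using namespace llvm;

// A hypothetical producer that either returns a value or an Error built from
// a std::error_code, as createBinary() now does.
static Expected<int> parseWidget(bool Ok) {
  if (!Ok)
    return errorCodeToError(std::make_error_code(std::errc::invalid_argument));
  return 42;
}

// Propagate the failure to the caller, as Archive::Child::getAsBinary() does.
static Expected<int> parseAndDouble(bool Ok) {
  Expected<int> V = parseWidget(Ok);
  if (!V)
    return V.takeError();
  return *V * 2;
}

int main() {
  if (Expected<int> R = parseAndDouble(true))
    std::cout << "ok: " << *R << "\n";

  Expected<int> Bad = parseAndDouble(false);
  if (!Bad) {
    // Every Error must be handled or consumed before it is destroyed.
    consumeError(Bad.takeError());
    std::cout << "parse failed\n";
  }
}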
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 2ac2ee51dc23..0a37cc360fe9 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -3,20 +3,18 @@ add_llvm_library(LLVMObject
ArchiveWriter.cpp
Binary.cpp
COFFObjectFile.cpp
- COFFYAML.cpp
ELF.cpp
ELFObjectFile.cpp
- ELFYAML.cpp
Error.cpp
IRObjectFile.cpp
MachOObjectFile.cpp
MachOUniversal.cpp
+ ModuleSummaryIndexObjectFile.cpp
Object.cpp
ObjectFile.cpp
RecordStreamer.cpp
SymbolicFile.cpp
SymbolSize.cpp
- FunctionIndexObjectFile.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Object
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 4cd6aff5f17c..0f790086cfc5 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -13,7 +13,6 @@
#include "llvm/Object/COFF.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/iterator_range.h"
@@ -145,12 +144,12 @@ void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const {
}
}
-ErrorOr<StringRef> COFFObjectFile::getSymbolName(DataRefImpl Ref) const {
+Expected<StringRef> COFFObjectFile::getSymbolName(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
StringRef Result;
std::error_code EC = getSymbolName(Symb, Result);
if (EC)
- return EC;
+ return errorCodeToError(EC);
return Result;
}
@@ -158,7 +157,7 @@ uint64_t COFFObjectFile::getSymbolValueImpl(DataRefImpl Ref) const {
return getCOFFSymbol(Ref).getValue();
}
-ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
+Expected<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
uint64_t Result = getSymbolValue(Ref);
COFFSymbolRef Symb = getCOFFSymbol(Ref);
int32_t SectionNumber = Symb.getSectionNumber();
@@ -169,7 +168,7 @@ ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
const coff_section *Section = nullptr;
if (std::error_code EC = getSection(SectionNumber, Section))
- return EC;
+ return errorCodeToError(EC);
Result += Section->VirtualAddress;
// The section VirtualAddress does not include ImageBase, and we want to
@@ -179,7 +178,7 @@ ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
return Result;
}
-SymbolRef::Type COFFObjectFile::getSymbolType(DataRefImpl Ref) const {
+Expected<SymbolRef::Type> COFFObjectFile::getSymbolType(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
int32_t SectionNumber = Symb.getSectionNumber();
@@ -235,14 +234,14 @@ uint64_t COFFObjectFile::getCommonSymbolSizeImpl(DataRefImpl Ref) const {
return Symb.getValue();
}
-ErrorOr<section_iterator>
+Expected<section_iterator>
COFFObjectFile::getSymbolSection(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
if (COFF::isReservedSectionNumber(Symb.getSectionNumber()))
return section_end();
const coff_section *Sec = nullptr;
if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec))
- return EC;
+ return errorCodeToError(EC);
DataRefImpl Ret;
Ret.p = reinterpret_cast<uintptr_t>(Sec);
return section_iterator(SectionRef(Ret, this));
@@ -290,7 +289,11 @@ std::error_code COFFObjectFile::getSectionContents(DataRefImpl Ref,
uint64_t COFFObjectFile::getSectionAlignment(DataRefImpl Ref) const {
const coff_section *Sec = toSec(Ref);
- return uint64_t(1) << (((Sec->Characteristics & 0x00F00000) >> 20) - 1);
+ return Sec->getAlignment();
+}
+
+bool COFFObjectFile::isSectionCompressed(DataRefImpl Sec) const {
+ return false;
}
bool COFFObjectFile::isSectionText(DataRefImpl Ref) const {
@@ -450,6 +453,27 @@ std::error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
return object_error::parse_failed;
}
+std::error_code
+COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size,
+ ArrayRef<uint8_t> &Contents) const {
+ for (const SectionRef &S : sections()) {
+ const coff_section *Section = getCOFFSection(S);
+ uint32_t SectionStart = Section->VirtualAddress;
+ // Check if this RVA is within the section bounds. Be careful about integer
+ // overflow.
+ uint32_t OffsetIntoSection = RVA - SectionStart;
+ if (SectionStart <= RVA && OffsetIntoSection < Section->VirtualSize &&
+ Size <= Section->VirtualSize - OffsetIntoSection) {
+ uintptr_t Begin =
+ uintptr_t(base()) + Section->PointerToRawData + OffsetIntoSection;
+ Contents =
+ ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Begin), Size);
+ return std::error_code();
+ }
+ }
+ return object_error::parse_failed;
+}
+
// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
// table entry.
std::error_code COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint,
@@ -463,6 +487,35 @@ std::error_code COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint,
return std::error_code();
}
+std::error_code COFFObjectFile::getDebugPDBInfo(const debug_directory *DebugDir,
+ const debug_pdb_info *&PDBInfo,
+ StringRef &PDBFileName) const {
+ ArrayRef<uint8_t> InfoBytes;
+ if (std::error_code EC = getRvaAndSizeAsBytes(
+ DebugDir->AddressOfRawData, DebugDir->SizeOfData, InfoBytes))
+ return EC;
+ if (InfoBytes.size() < sizeof(debug_pdb_info) + 1)
+ return object_error::parse_failed;
+ PDBInfo = reinterpret_cast<const debug_pdb_info *>(InfoBytes.data());
+ InfoBytes = InfoBytes.drop_front(sizeof(debug_pdb_info));
+ PDBFileName = StringRef(reinterpret_cast<const char *>(InfoBytes.data()),
+ InfoBytes.size());
+ // Truncate the name at the first null byte. Ignore any padding.
+ PDBFileName = PDBFileName.split('\0').first;
+ return std::error_code();
+}
+
+std::error_code COFFObjectFile::getDebugPDBInfo(const debug_pdb_info *&PDBInfo,
+ StringRef &PDBFileName) const {
+ for (const debug_directory &D : debug_directories())
+ if (D.Type == COFF::IMAGE_DEBUG_TYPE_CODEVIEW)
+ return getDebugPDBInfo(&D, PDBInfo, PDBFileName);
+ // If we get here, there is no PDB info to return.
+ PDBInfo = nullptr;
+ PDBFileName = StringRef();
+ return std::error_code();
+}
+
// Find the import table.
std::error_code COFFObjectFile::initImportTablePtr() {
// First, we get the RVA of the import table. If the file lacks a pointer to
@@ -476,15 +529,14 @@ std::error_code COFFObjectFile::initImportTablePtr() {
return std::error_code();
uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
- // -1 because the last entry is the null entry.
- NumberOfImportDirectory = DataEntry->Size /
- sizeof(import_directory_table_entry) - 1;
// Find the section that contains the RVA. This is needed because the RVA is
// the import table's memory address which is different from its file offset.
uintptr_t IntPtr = 0;
if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr))
return EC;
+ if (std::error_code EC = checkOffset(Data, IntPtr, DataEntry->Size))
+ return EC;
ImportDirectory = reinterpret_cast<
const import_directory_table_entry *>(IntPtr);
return std::error_code();
@@ -548,15 +600,40 @@ std::error_code COFFObjectFile::initBaseRelocPtr() {
return std::error_code();
}
+std::error_code COFFObjectFile::initDebugDirectoryPtr() {
+ // Get the RVA of the debug directory. Do nothing if it does not exist.
+ const data_directory *DataEntry;
+ if (getDataDirectory(COFF::DEBUG_DIRECTORY, DataEntry))
+ return std::error_code();
+
+ // Do nothing if the RVA is NULL.
+ if (DataEntry->RelativeVirtualAddress == 0)
+ return std::error_code();
+
+ // Check that the size is a multiple of the entry size.
+ if (DataEntry->Size % sizeof(debug_directory) != 0)
+ return object_error::parse_failed;
+
+ uintptr_t IntPtr = 0;
+ if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+ return EC;
+ DebugDirectoryBegin = reinterpret_cast<const debug_directory *>(IntPtr);
+ if (std::error_code EC = getRvaPtr(
+ DataEntry->RelativeVirtualAddress + DataEntry->Size, IntPtr))
+ return EC;
+ DebugDirectoryEnd = reinterpret_cast<const debug_directory *>(IntPtr);
+ return std::error_code();
+}
+
COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
: ObjectFile(Binary::ID_COFF, Object), COFFHeader(nullptr),
COFFBigObjHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr),
DataDirectory(nullptr), SectionTable(nullptr), SymbolTable16(nullptr),
SymbolTable32(nullptr), StringTable(nullptr), StringTableSize(0),
- ImportDirectory(nullptr), NumberOfImportDirectory(0),
+ ImportDirectory(nullptr),
DelayImportDirectory(nullptr), NumberOfDelayImportDirectory(0),
- ExportDirectory(nullptr), BaseRelocHeader(nullptr),
- BaseRelocEnd(nullptr) {
+ ExportDirectory(nullptr), BaseRelocHeader(nullptr), BaseRelocEnd(nullptr),
+ DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr) {
// Check that we at least have enough room for a header.
if (!checkSize(Data, EC, sizeof(coff_file_header)))
return;
@@ -672,6 +749,10 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
if ((EC = initBaseRelocPtr()))
return;
+ // Initialize the pointer to the debug directory.
+ if ((EC = initDebugDirectoryPtr()))
+ return;
+
EC = std::error_code();
}
@@ -689,13 +770,17 @@ basic_symbol_iterator COFFObjectFile::symbol_end_impl() const {
}
import_directory_iterator COFFObjectFile::import_directory_begin() const {
+ if (!ImportDirectory)
+ return import_directory_end();
+ if (ImportDirectory[0].ImportLookupTableRVA == 0)
+ return import_directory_end();
return import_directory_iterator(
ImportDirectoryEntryRef(ImportDirectory, 0, this));
}
import_directory_iterator COFFObjectFile::import_directory_end() const {
return import_directory_iterator(
- ImportDirectoryEntryRef(ImportDirectory, NumberOfImportDirectory, this));
+ ImportDirectoryEntryRef(nullptr, -1, this));
}
delay_import_directory_iterator
@@ -947,10 +1032,10 @@ uint64_t COFFObjectFile::getSectionSize(const coff_section *Sec) const {
std::error_code
COFFObjectFile::getSectionContents(const coff_section *Sec,
ArrayRef<uint8_t> &Res) const {
- // PointerToRawData and SizeOfRawData won't make sense for BSS sections,
- // don't do anything interesting for them.
- assert((Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0 &&
- "BSS sections don't have contents!");
+ // In COFF, a virtual section won't have any in-file
+ // content, so the file pointer to the content will be zero.
+ if (Sec->PointerToRawData == 0)
+ return object_error::parse_failed;
// The only thing that we need to verify is that the contents is contained
// within the file bounds. We don't need to make sure it doesn't cover other
// data, as there's nothing that says that is not allowed.
@@ -1116,12 +1201,15 @@ operator==(const ImportDirectoryEntryRef &Other) const {
void ImportDirectoryEntryRef::moveNext() {
++Index;
+ if (ImportTable[Index].ImportLookupTableRVA == 0) {
+ Index = -1;
+ ImportTable = nullptr;
+ }
}
std::error_code ImportDirectoryEntryRef::getImportTableEntry(
const import_directory_table_entry *&Result) const {
- Result = ImportTable + Index;
- return std::error_code();
+ return getObject(Result, OwningObject->Data, ImportTable + Index);
}
static imported_symbol_iterator
@@ -1198,16 +1286,6 @@ ImportDirectoryEntryRef::getImportAddressTableRVA(uint32_t &Result) const {
return std::error_code();
}
-std::error_code ImportDirectoryEntryRef::getImportLookupEntry(
- const import_lookup_table_entry32 *&Result) const {
- uintptr_t IntPtr = 0;
- uint32_t RVA = ImportTable[Index].ImportLookupTableRVA;
- if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
- return EC;
- Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
- return std::error_code();
-}
-
bool DelayImportDirectoryEntryRef::
operator==(const DelayImportDirectoryEntryRef &Other) const {
return Table == Other.Table && Index == Other.Index;
@@ -1391,6 +1469,22 @@ ImportedSymbolRef::getSymbolName(StringRef &Result) const {
return std::error_code();
}
+std::error_code ImportedSymbolRef::isOrdinal(bool &Result) const {
+ if (Entry32)
+ Result = Entry32[Index].isOrdinal();
+ else
+ Result = Entry64[Index].isOrdinal();
+ return std::error_code();
+}
+
+std::error_code ImportedSymbolRef::getHintNameRVA(uint32_t &Result) const {
+ if (Entry32)
+ Result = Entry32[Index].getHintNameRVA();
+ else
+ Result = Entry64[Index].getHintNameRVA();
+ return std::error_code();
+}
+
std::error_code ImportedSymbolRef::getOrdinal(uint16_t &Result) const {
uint32_t RVA;
if (Entry32) {
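Not from the patch: a sketch of how a tool might call the getDebugPDBInfo() accessors added above. The helper printPDBPath is invented, and it assumes the matching declarations (debug_pdb_info and the two overloads) are exposed in llvm/Object/COFF.h.

#include "llvm/Object/COFF.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void printPDBPath(const COFFObjectFile &Obj) {
  const debug_pdb_info *PDBInfo = nullptr;
  StringRef PDBFileName;
  if (std::error_code EC = Obj.getDebugPDBInfo(PDBInfo, PDBFileName)) {
    errs() << "error: " << EC.message() << "\n";
    return;
  }
  if (!PDBInfo) {
    // No IMAGE_DEBUG_TYPE_CODEVIEW entry was present in the debug directory.
    outs() << "no PDB reference\n";
    return;
  }
  outs() << "PDB file: " << PDBFileName << "\n";
}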
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index 12b772d930ba..2dde18a24280 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -61,6 +61,13 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
break;
}
break;
+ case ELF::EM_LANAI:
+ switch (Type) {
+#include "llvm/Support/ELFRelocs/Lanai.def"
+ default:
+ break;
+ }
+ break;
case ELF::EM_PPC:
switch (Type) {
#include "llvm/Support/ELFRelocs/PowerPC.def"
@@ -98,6 +105,20 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
break;
}
break;
+ case ELF::EM_AMDGPU:
+ switch (Type) {
+#include "llvm/Support/ELFRelocs/AMDGPU.def"
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_BPF:
+ switch (Type) {
+#include "llvm/Support/ELFRelocs/BPF.def"
+ default:
+ break;
+ }
+ break;
default:
break;
}
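Illustration only (not in the patch): with the tables added above, getELFRelocationTypeName() can be queried for the new machines. The sample helper and the arbitrary Type value are invented.

#include "llvm/Object/ELF.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void printLanaiRelocName(uint32_t Type) {
  // Unknown type values fall out of the inner switch and come back as a
  // generic name rather than asserting.
  outs() << getELFRelocationTypeName(ELF::EM_LANAI, Type) << "\n";
}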
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index c7df30a59035..4bd69e34e3c3 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -55,4 +55,71 @@ ObjectFile::createELFObjectFile(MemoryBufferRef Obj) {
return std::move(R);
}
+SubtargetFeatures ELFObjectFileBase::getFeatures() const {
+ switch (getEMachine()) {
+ case ELF::EM_MIPS: {
+ SubtargetFeatures Features;
+ unsigned PlatformFlags;
+ getPlatformFlags(PlatformFlags);
+
+ switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
+ case ELF::EF_MIPS_ARCH_1:
+ break;
+ case ELF::EF_MIPS_ARCH_2:
+ Features.AddFeature("mips2");
+ break;
+ case ELF::EF_MIPS_ARCH_3:
+ Features.AddFeature("mips3");
+ break;
+ case ELF::EF_MIPS_ARCH_4:
+ Features.AddFeature("mips4");
+ break;
+ case ELF::EF_MIPS_ARCH_5:
+ Features.AddFeature("mips5");
+ break;
+ case ELF::EF_MIPS_ARCH_32:
+ Features.AddFeature("mips32");
+ break;
+ case ELF::EF_MIPS_ARCH_64:
+ Features.AddFeature("mips64");
+ break;
+ case ELF::EF_MIPS_ARCH_32R2:
+ Features.AddFeature("mips32r2");
+ break;
+ case ELF::EF_MIPS_ARCH_64R2:
+ Features.AddFeature("mips64r2");
+ break;
+ case ELF::EF_MIPS_ARCH_32R6:
+ Features.AddFeature("mips32r6");
+ break;
+ case ELF::EF_MIPS_ARCH_64R6:
+ Features.AddFeature("mips64r6");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
+
+ switch (PlatformFlags & ELF::EF_MIPS_MACH) {
+ case ELF::EF_MIPS_MACH_NONE:
+ // No feature associated with this value.
+ break;
+ case ELF::EF_MIPS_MACH_OCTEON:
+ Features.AddFeature("cnmips");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
+
+ if (PlatformFlags & ELF::EF_MIPS_ARCH_ASE_M16)
+ Features.AddFeature("mips16");
+ if (PlatformFlags & ELF::EF_MIPS_MICROMIPS)
+ Features.AddFeature("micromips");
+
+ return Features;
+ }
+ default:
+ return SubtargetFeatures();
+ }
+}
+
} // end namespace llvm
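Not part of the patch: a short sketch of consuming the new ELFObjectFileBase::getFeatures() hook; printFeatures is an invented helper.

#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void printFeatures(const ELFObjectFileBase &Obj) {
  // For MIPS objects this reflects e_flags (e.g. "+mips32r2,+micromips");
  // for other machines it is currently empty.
  SubtargetFeatures Features = Obj.getFeatures();
  outs() << Features.getString() << "\n";
}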
diff --git a/lib/Object/Error.cpp b/lib/Object/Error.cpp
index 7ecc3a19af9d..2357526b7894 100644
--- a/lib/Object/Error.cpp
+++ b/lib/Object/Error.cpp
@@ -19,6 +19,9 @@ using namespace llvm;
using namespace object;
namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class _object_error_category : public std::error_category {
public:
const char* name() const LLVM_NOEXCEPT override;
@@ -47,21 +50,45 @@ std::string _object_error_category::message(int EV) const {
return "Invalid section index";
case object_error::bitcode_section_not_found:
return "Bitcode section not found in object file";
- case object_error::elf_invalid_dynamic_table_size:
- return "Invalid dynamic table size";
- case object_error::macho_small_load_command:
- return "Mach-O load command with size < 8 bytes";
- case object_error::macho_load_segment_too_many_sections:
- return "Mach-O segment load command contains too many sections";
- case object_error::macho_load_segment_too_small:
- return "Mach-O segment load command size is too small";
}
llvm_unreachable("An enumerator of object_error does not have a message "
"defined.");
}
+char BinaryError::ID = 0;
+char GenericBinaryError::ID = 0;
+
+GenericBinaryError::GenericBinaryError(Twine Msg) : Msg(Msg.str()) {}
+
+GenericBinaryError::GenericBinaryError(Twine Msg, object_error ECOverride)
+ : Msg(Msg.str()) {
+ setErrorCode(make_error_code(ECOverride));
+}
+
+void GenericBinaryError::log(raw_ostream &OS) const {
+ OS << Msg;
+}
+
static ManagedStatic<_object_error_category> error_category;
const std::error_category &object::object_category() {
return *error_category;
}
+
+llvm::Error llvm::object::isNotObjectErrorInvalidFileType(llvm::Error Err) {
+ if (auto Err2 =
+ handleErrors(std::move(Err),
+ [](std::unique_ptr<ECError> M) {
+ // Try to handle 'M'. If successful, return a success value from
+ // the handler.
+ if (M->convertToErrorCode() == object_error::invalid_file_type)
+ return Error::success();
+
+ // We failed to handle 'M' - return it from the handler.
+ // This value will be passed back from handleErrors and
+ // wind up in Err2, where it will be returned from this function.
+ return Error(std::move(M));
+ }))
+ return Err2;
+ return Err;
+}
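Sketch of the intended call pattern for isNotObjectErrorInvalidFileType(); the surrounding probeBuffer function is invented, and it assumes the declaration is exposed in llvm/Object/Error.h. Errors other than object_error::invalid_file_type are propagated, while the "not an object file" case is silently skipped.

#include "llvm/Object/Error.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;
using namespace llvm::object;

static Error probeBuffer(MemoryBufferRef Buffer) {
  Expected<std::unique_ptr<ObjectFile>> ObjOrErr =
      ObjectFile::createObjectFile(Buffer);
  if (!ObjOrErr) {
    // Keep only errors that are *not* invalid_file_type.
    if (Error E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError()))
      return E;
    // Wrong format for this probe: not an error for the caller.
    return Error::success();
  }
  // ... use (*ObjOrErr) here ...
  return Error::success();
}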
diff --git a/lib/Object/FunctionIndexObjectFile.cpp b/lib/Object/FunctionIndexObjectFile.cpp
deleted file mode 100644
index fe111de1a9c8..000000000000
--- a/lib/Object/FunctionIndexObjectFile.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===- FunctionIndexObjectFile.cpp - Function index file implementation ---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Part of the FunctionIndexObjectFile class implementation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Object/FunctionIndexObjectFile.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/FunctionInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace object;
-
-FunctionIndexObjectFile::FunctionIndexObjectFile(
- MemoryBufferRef Object, std::unique_ptr<FunctionInfoIndex> I)
- : SymbolicFile(Binary::ID_FunctionIndex, Object), Index(std::move(I)) {}
-
-FunctionIndexObjectFile::~FunctionIndexObjectFile() {}
-
-std::unique_ptr<FunctionInfoIndex> FunctionIndexObjectFile::takeIndex() {
- return std::move(Index);
-}
-
-ErrorOr<MemoryBufferRef>
-FunctionIndexObjectFile::findBitcodeInObject(const ObjectFile &Obj) {
- for (const SectionRef &Sec : Obj.sections()) {
- StringRef SecName;
- if (std::error_code EC = Sec.getName(SecName))
- return EC;
- if (SecName == ".llvmbc") {
- StringRef SecContents;
- if (std::error_code EC = Sec.getContents(SecContents))
- return EC;
- return MemoryBufferRef(SecContents, Obj.getFileName());
- }
- }
-
- return object_error::bitcode_section_not_found;
-}
-
-ErrorOr<MemoryBufferRef>
-FunctionIndexObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
- sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer());
- switch (Type) {
- case sys::fs::file_magic::bitcode:
- return Object;
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::coff_object: {
- ErrorOr<std::unique_ptr<ObjectFile>> ObjFile =
- ObjectFile::createObjectFile(Object, Type);
- if (!ObjFile)
- return ObjFile.getError();
- return findBitcodeInObject(*ObjFile->get());
- }
- default:
- return object_error::invalid_file_type;
- }
-}
-
-// Looks for function index in the given memory buffer.
-// returns true if found, else false.
-bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer(
- MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler) {
- ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
- if (!BCOrErr)
- return false;
-
- return hasFunctionSummary(BCOrErr.get(), DiagnosticHandler);
-}
-
-// Parse function index in the given memory buffer.
-// Return new FunctionIndexObjectFile instance containing parsed
-// function summary/index.
-ErrorOr<std::unique_ptr<FunctionIndexObjectFile>>
-FunctionIndexObjectFile::create(MemoryBufferRef Object,
- DiagnosticHandlerFunction DiagnosticHandler,
- bool IsLazy) {
- std::unique_ptr<FunctionInfoIndex> Index;
-
- ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
- if (!BCOrErr)
- return BCOrErr.getError();
-
- ErrorOr<std::unique_ptr<FunctionInfoIndex>> IOrErr = getFunctionInfoIndex(
- BCOrErr.get(), DiagnosticHandler, IsLazy);
-
- if (std::error_code EC = IOrErr.getError())
- return EC;
-
- Index = std::move(IOrErr.get());
-
- return llvm::make_unique<FunctionIndexObjectFile>(Object, std::move(Index));
-}
-
-// Parse the function summary information for function with the
-// given name out of the given buffer. Parsed information is
-// stored on the index object saved in this object.
-std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer(
- MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler,
- StringRef FunctionName) {
- sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer());
- switch (Type) {
- case sys::fs::file_magic::bitcode: {
- return readFunctionSummary(Object, DiagnosticHandler, FunctionName,
- std::move(Index));
- }
- default:
- return object_error::invalid_file_type;
- }
-}
-
-// Parse the function index out of an IR file and return the function
-// index object if found, or nullptr if not.
-ErrorOr<std::unique_ptr<FunctionInfoIndex>>
-llvm::getFunctionIndexForFile(StringRef Path,
- DiagnosticHandlerFunction DiagnosticHandler) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
- MemoryBuffer::getFileOrSTDIN(Path);
- std::error_code EC = FileOrErr.getError();
- if (EC)
- return EC;
- MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
- ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr =
- object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler);
- EC = ObjOrErr.getError();
- if (EC)
- return EC;
-
- object::FunctionIndexObjectFile &Obj = **ObjOrErr;
- return Obj.takeIndex();
-}
diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp
index c35c413b3c3b..42c8ecd62da8 100644
--- a/lib/Object/IRObjectFile.cpp
+++ b/lib/Object/IRObjectFile.cpp
@@ -24,9 +24,9 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
@@ -38,12 +38,20 @@ using namespace object;
IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
: SymbolicFile(Binary::ID_IR, Object), M(std::move(Mod)) {
Mang.reset(new Mangler());
+ CollectAsmUndefinedRefs(Triple(M->getTargetTriple()), M->getModuleInlineAsm(),
+ [this](StringRef Name, BasicSymbolRef::Flags Flags) {
+ AsmSymbols.emplace_back(Name, std::move(Flags));
+ });
+}
- const std::string &InlineAsm = M->getModuleInlineAsm();
+// Parse inline ASM and collect the list of symbols that are not defined in
+// the current module. This logic was previously part of the IRObjectFile
+// constructor.
+void IRObjectFile::CollectAsmUndefinedRefs(
+ const Triple &TT, StringRef InlineAsm,
+ function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmUndefinedRefs) {
if (InlineAsm.empty())
return;
- Triple TT(M->getTargetTriple());
std::string Err;
const Target *T = TargetRegistry::lookupTarget(TT.str(), Err);
if (!T)
@@ -68,7 +76,7 @@ IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
MCObjectFileInfo MOFI;
MCContext MCCtx(MAI.get(), MRI.get(), &MOFI);
- MOFI.InitMCObjectFileInfo(TT, Reloc::Default, CodeModel::Default, MCCtx);
+ MOFI.InitMCObjectFileInfo(TT, /*PIC*/ false, CodeModel::Default, MCCtx);
std::unique_ptr<RecordStreamer> Streamer(new RecordStreamer(MCCtx));
T->createNullTargetStreamer(*Streamer);
@@ -105,9 +113,12 @@ IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
Res |= BasicSymbolRef::SF_Undefined;
Res |= BasicSymbolRef::SF_Global;
break;
+ case RecordStreamer::GlobalWeak:
+ Res |= BasicSymbolRef::SF_Weak;
+ Res |= BasicSymbolRef::SF_Global;
+ break;
}
- AsmSymbols.push_back(
- std::make_pair<std::string, uint32_t>(Key, std::move(Res)));
+ AsmUndefinedRefs(Key, BasicSymbolRef::Flags(Res));
}
}
@@ -231,13 +242,14 @@ uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
Res |= BasicSymbolRef::SF_Global;
if (GV->hasCommonLinkage())
Res |= BasicSymbolRef::SF_Common;
- if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage())
+ if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
+ GV->hasExternalWeakLinkage())
Res |= BasicSymbolRef::SF_Weak;
if (GV->getName().startswith("llvm."))
Res |= BasicSymbolRef::SF_FormatSpecific;
else if (auto *Var = dyn_cast<GlobalVariable>(GV)) {
- if (Var->getSection() == StringRef("llvm.metadata"))
+ if (Var->getSection() == "llvm.metadata")
Res |= BasicSymbolRef::SF_FormatSpecific;
}
@@ -265,10 +277,7 @@ basic_symbol_iterator IRObjectFile::symbol_end_impl() const {
ErrorOr<MemoryBufferRef> IRObjectFile::findBitcodeInObject(const ObjectFile &Obj) {
for (const SectionRef &Sec : Obj.sections()) {
- StringRef SecName;
- if (std::error_code EC = Sec.getName(SecName))
- return EC;
- if (SecName == ".llvmbc") {
+ if (Sec.isBitcode()) {
StringRef SecContents;
if (std::error_code EC = Sec.getContents(SecContents))
return EC;
@@ -287,10 +296,10 @@ ErrorOr<MemoryBufferRef> IRObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Ob
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::macho_object:
case sys::fs::file_magic::coff_object: {
- ErrorOr<std::unique_ptr<ObjectFile>> ObjFile =
+ Expected<std::unique_ptr<ObjectFile>> ObjFile =
ObjectFile::createObjectFile(Object, Type);
if (!ObjFile)
- return ObjFile.getError();
+ return errorToErrorCode(ObjFile.takeError());
return findBitcodeInObject(*ObjFile->get());
}
default:
@@ -305,8 +314,8 @@ llvm::object::IRObjectFile::create(MemoryBufferRef Object,
if (!BCOrErr)
return BCOrErr.getError();
- std::unique_ptr<MemoryBuffer> Buff(
- MemoryBuffer::getMemBuffer(BCOrErr.get(), false));
+ std::unique_ptr<MemoryBuffer> Buff =
+ MemoryBuffer::getMemBuffer(BCOrErr.get(), false);
ErrorOr<std::unique_ptr<Module>> MOrErr =
getLazyBitcodeModule(std::move(Buff), Context,
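Hypothetical use (not in the patch) of the newly extracted CollectAsmUndefinedRefs() helper; this assumes it is declared as a static member of IRObjectFile in llvm/Object/IRObjectFile.h, which the definition above does not show.

#include "llvm/ADT/Triple.h"
#include "llvm/IR/Module.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void printAsmUndefs(const Module &M) {
  IRObjectFile::CollectAsmUndefinedRefs(
      Triple(M.getTargetTriple()), M.getModuleInlineAsm(),
      [](StringRef Name, BasicSymbolRef::Flags Flags) {
        if (Flags & BasicSymbolRef::SF_Undefined)
          outs() << "undefined in inline asm: " << Name << "\n";
      });
}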
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index d1f79b225ee4..563236f95a5b 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -38,6 +38,13 @@ namespace {
};
}
+static Error
+malformedError(Twine Msg) {
+ std::string StringMsg = "truncated or malformed object (" + Msg.str() + ")";
+ return make_error<GenericBinaryError>(std::move(StringMsg),
+ object_error::parse_failed);
+}
+
// FIXME: Replace all uses of this function with getStructOrErr.
template <typename T>
static T getStruct(const MachOObjectFile *O, const char *P) {
@@ -53,10 +60,10 @@ static T getStruct(const MachOObjectFile *O, const char *P) {
}
template <typename T>
-static ErrorOr<T> getStructOrErr(const MachOObjectFile *O, const char *P) {
+static Expected<T> getStructOrErr(const MachOObjectFile *O, const char *P) {
// Don't read before the beginning or past the end of the file
if (P < O->getData().begin() || P + sizeof(T) > O->getData().end())
- return object_error::parse_failed;
+ return malformedError("Structure read out-of-range");
T Cmd;
memcpy(&Cmd, P, sizeof(T));
@@ -161,120 +168,180 @@ static uint32_t getSectionFlags(const MachOObjectFile *O,
return Sect.flags;
}
-static ErrorOr<MachOObjectFile::LoadCommandInfo>
-getLoadCommandInfo(const MachOObjectFile *Obj, const char *Ptr) {
- auto CmdOrErr = getStructOrErr<MachO::load_command>(Obj, Ptr);
- if (!CmdOrErr)
- return CmdOrErr.getError();
- if (CmdOrErr->cmdsize < 8)
- return object_error::macho_small_load_command;
- MachOObjectFile::LoadCommandInfo Load;
- Load.Ptr = Ptr;
- Load.C = CmdOrErr.get();
- return Load;
+static Expected<MachOObjectFile::LoadCommandInfo>
+getLoadCommandInfo(const MachOObjectFile *Obj, const char *Ptr,
+ uint32_t LoadCommandIndex) {
+ if (auto CmdOrErr = getStructOrErr<MachO::load_command>(Obj, Ptr)) {
+ if (CmdOrErr->cmdsize < 8)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " with size less than 8 bytes");
+ return MachOObjectFile::LoadCommandInfo({Ptr, *CmdOrErr});
+ } else
+ return CmdOrErr.takeError();
}
-static ErrorOr<MachOObjectFile::LoadCommandInfo>
+static Expected<MachOObjectFile::LoadCommandInfo>
getFirstLoadCommandInfo(const MachOObjectFile *Obj) {
unsigned HeaderSize = Obj->is64Bit() ? sizeof(MachO::mach_header_64)
: sizeof(MachO::mach_header);
- return getLoadCommandInfo(Obj, getPtr(Obj, HeaderSize));
+ if (sizeof(MachOObjectFile::LoadCommandInfo) > Obj->getHeader().sizeofcmds)
+ return malformedError("load command 0 extends past the end all load "
+ "commands in the file");
+ return getLoadCommandInfo(Obj, getPtr(Obj, HeaderSize), 0);
}
-static ErrorOr<MachOObjectFile::LoadCommandInfo>
-getNextLoadCommandInfo(const MachOObjectFile *Obj,
+static Expected<MachOObjectFile::LoadCommandInfo>
+getNextLoadCommandInfo(const MachOObjectFile *Obj, uint32_t LoadCommandIndex,
const MachOObjectFile::LoadCommandInfo &L) {
- return getLoadCommandInfo(Obj, L.Ptr + L.C.cmdsize);
+ unsigned HeaderSize = Obj->is64Bit() ? sizeof(MachO::mach_header_64)
+ : sizeof(MachO::mach_header);
+ if (L.Ptr + L.C.cmdsize + sizeof(MachOObjectFile::LoadCommandInfo) >
+ Obj->getData().data() + HeaderSize + Obj->getHeader().sizeofcmds)
+ return malformedError("load command " + Twine(LoadCommandIndex + 1) +
+ " extends past the end all load commands in the file");
+ return getLoadCommandInfo(Obj, L.Ptr + L.C.cmdsize, LoadCommandIndex + 1);
}
template <typename T>
static void parseHeader(const MachOObjectFile *Obj, T &Header,
- std::error_code &EC) {
- auto HeaderOrErr = getStructOrErr<T>(Obj, getPtr(Obj, 0));
- if (HeaderOrErr)
- Header = HeaderOrErr.get();
+ Error &Err) {
+ if (sizeof(T) > Obj->getData().size()) {
+ Err = malformedError("the mach header extends past the end of the "
+ "file");
+ return;
+ }
+ if (auto HeaderOrErr = getStructOrErr<T>(Obj, getPtr(Obj, 0)))
+ Header = *HeaderOrErr;
else
- EC = HeaderOrErr.getError();
+ Err = HeaderOrErr.takeError();
}
// Parses LC_SEGMENT or LC_SEGMENT_64 load command, adds addresses of all
// sections to \param Sections, and optionally sets
// \param IsPageZeroSegment to true.
template <typename SegmentCmd>
-static std::error_code parseSegmentLoadCommand(
+static Error parseSegmentLoadCommand(
const MachOObjectFile *Obj, const MachOObjectFile::LoadCommandInfo &Load,
- SmallVectorImpl<const char *> &Sections, bool &IsPageZeroSegment) {
+ SmallVectorImpl<const char *> &Sections, bool &IsPageZeroSegment,
+ uint32_t LoadCommandIndex, const char *CmdName) {
const unsigned SegmentLoadSize = sizeof(SegmentCmd);
if (Load.C.cmdsize < SegmentLoadSize)
- return object_error::macho_load_segment_too_small;
- auto SegOrErr = getStructOrErr<SegmentCmd>(Obj, Load.Ptr);
- if (!SegOrErr)
- return SegOrErr.getError();
- SegmentCmd S = SegOrErr.get();
- const unsigned SectionSize =
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " " + CmdName + " cmdsize too small");
+ if (auto SegOrErr = getStructOrErr<SegmentCmd>(Obj, Load.Ptr)) {
+ SegmentCmd S = SegOrErr.get();
+ const unsigned SectionSize =
Obj->is64Bit() ? sizeof(MachO::section_64) : sizeof(MachO::section);
- if (S.nsects > std::numeric_limits<uint32_t>::max() / SectionSize ||
- S.nsects * SectionSize > Load.C.cmdsize - SegmentLoadSize)
- return object_error::macho_load_segment_too_many_sections;
- for (unsigned J = 0; J < S.nsects; ++J) {
- const char *Sec = getSectionPtr(Obj, Load, J);
- Sections.push_back(Sec);
- }
- IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
- return std::error_code();
+ if (S.nsects > std::numeric_limits<uint32_t>::max() / SectionSize ||
+ S.nsects * SectionSize > Load.C.cmdsize - SegmentLoadSize)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " inconsistent cmdsize in " + CmdName +
+ " for the number of sections");
+ for (unsigned J = 0; J < S.nsects; ++J) {
+ const char *Sec = getSectionPtr(Obj, Load, J);
+ Sections.push_back(Sec);
+ }
+ IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
+ } else
+ return SegOrErr.takeError();
+
+ return Error::success();
+}
+
+Expected<std::unique_ptr<MachOObjectFile>>
+MachOObjectFile::create(MemoryBufferRef Object, bool IsLittleEndian,
+ bool Is64Bits) {
+ Error Err;
+ std::unique_ptr<MachOObjectFile> Obj(
+ new MachOObjectFile(std::move(Object), IsLittleEndian,
+ Is64Bits, Err));
+ if (Err)
+ return std::move(Err);
+ return std::move(Obj);
}
MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
- bool Is64bits, std::error_code &EC)
+ bool Is64bits, Error &Err)
: ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr),
DataInCodeLoadCmd(nullptr), LinkOptHintsLoadCmd(nullptr),
DyldInfoLoadCmd(nullptr), UuidLoadCmd(nullptr),
HasPageZeroSegment(false) {
- if (is64Bit())
- parseHeader(this, Header64, EC);
- else
- parseHeader(this, Header, EC);
- if (EC)
+ ErrorAsOutParameter ErrAsOutParam(Err);
+ uint64_t BigSize;
+ if (is64Bit()) {
+ parseHeader(this, Header64, Err);
+ BigSize = sizeof(MachO::mach_header_64);
+ } else {
+ parseHeader(this, Header, Err);
+ BigSize = sizeof(MachO::mach_header);
+ }
+ if (Err)
return;
+ BigSize += getHeader().sizeofcmds;
+ if (getData().data() + BigSize > getData().end()) {
+ Err = malformedError("load commands extend past the end of the file");
+ return;
+ }
uint32_t LoadCommandCount = getHeader().ncmds;
if (LoadCommandCount == 0)
return;
- auto LoadOrErr = getFirstLoadCommandInfo(this);
- if (!LoadOrErr) {
- EC = LoadOrErr.getError();
+ LoadCommandInfo Load;
+ if (auto LoadOrErr = getFirstLoadCommandInfo(this))
+ Load = *LoadOrErr;
+ else {
+ Err = LoadOrErr.takeError();
return;
}
- LoadCommandInfo Load = LoadOrErr.get();
+
for (unsigned I = 0; I < LoadCommandCount; ++I) {
+ if (is64Bit()) {
+ if (Load.C.cmdsize % 8 != 0) {
+ // This is a hack: allow 64-bit Mach-O core files to have LC_THREAD
+ // commands whose cmdsize is only a multiple of 4 rather than 8, since
+ // the macOS kernel produces them.
+ if (getHeader().filetype != MachO::MH_CORE ||
+ Load.C.cmd != MachO::LC_THREAD || Load.C.cmdsize % 4) {
+ Err = malformedError("load command " + Twine(I) + " cmdsize not a "
+ "multiple of 8");
+ return;
+ }
+ }
+ } else {
+ if (Load.C.cmdsize % 4 != 0) {
+ Err = malformedError("load command " + Twine(I) + " cmdsize not a "
+ "multiple of 4");
+ return;
+ }
+ }
LoadCommands.push_back(Load);
if (Load.C.cmd == MachO::LC_SYMTAB) {
// Multiple symbol tables
if (SymtabLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple symbol tables");
return;
}
SymtabLoadCmd = Load.Ptr;
} else if (Load.C.cmd == MachO::LC_DYSYMTAB) {
// Multiple dynamic symbol tables
if (DysymtabLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple dynamic symbol tables");
return;
}
DysymtabLoadCmd = Load.Ptr;
} else if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
// Multiple data in code tables
if (DataInCodeLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple data-in-code tables");
return;
}
DataInCodeLoadCmd = Load.Ptr;
} else if (Load.C.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT) {
// Multiple linker optimization hint tables
if (LinkOptHintsLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple linker optimization hint tables");
return;
}
LinkOptHintsLoadCmd = Load.Ptr;
@@ -282,24 +349,25 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
Load.C.cmd == MachO::LC_DYLD_INFO_ONLY) {
// Multiple dyldinfo load commands
if (DyldInfoLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple dyldinfo load commands");
return;
}
DyldInfoLoadCmd = Load.Ptr;
} else if (Load.C.cmd == MachO::LC_UUID) {
// Multiple UUID load commands
if (UuidLoadCmd) {
- EC = object_error::parse_failed;
+ Err = malformedError("Multiple UUID load commands");
return;
}
UuidLoadCmd = Load.Ptr;
} else if (Load.C.cmd == MachO::LC_SEGMENT_64) {
- if ((EC = parseSegmentLoadCommand<MachO::segment_command_64>(
- this, Load, Sections, HasPageZeroSegment)))
+ if ((Err = parseSegmentLoadCommand<MachO::segment_command_64>(
+ this, Load, Sections, HasPageZeroSegment, I,
+ "LC_SEGMENT_64")))
return;
} else if (Load.C.cmd == MachO::LC_SEGMENT) {
- if ((EC = parseSegmentLoadCommand<MachO::segment_command>(
- this, Load, Sections, HasPageZeroSegment)))
+ if ((Err = parseSegmentLoadCommand<MachO::segment_command>(
+ this, Load, Sections, HasPageZeroSegment, I, "LC_SEGMENT")))
return;
} else if (Load.C.cmd == MachO::LC_LOAD_DYLIB ||
Load.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
@@ -309,15 +377,66 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
Libraries.push_back(Load.Ptr);
}
if (I < LoadCommandCount - 1) {
- auto LoadOrErr = getNextLoadCommandInfo(this, Load);
- if (!LoadOrErr) {
- EC = LoadOrErr.getError();
+ if (auto LoadOrErr = getNextLoadCommandInfo(this, I, Load))
+ Load = *LoadOrErr;
+ else {
+ Err = LoadOrErr.takeError();
return;
}
- Load = LoadOrErr.get();
+ }
+ }
+ if (!SymtabLoadCmd) {
+ if (DysymtabLoadCmd) {
+ Err = malformedError("contains LC_DYSYMTAB load command without a "
+ "LC_SYMTAB load command");
+ return;
+ }
+ } else if (DysymtabLoadCmd) {
+ MachO::symtab_command Symtab =
+ getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
+ MachO::dysymtab_command Dysymtab =
+ getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
+ if (Dysymtab.nlocalsym != 0 && Dysymtab.ilocalsym > Symtab.nsyms) {
+ Err = malformedError("ilocalsym in LC_DYSYMTAB load command "
+ "extends past the end of the symbol table");
+ return;
+ }
+ uint64_t BigSize = Dysymtab.ilocalsym;
+ BigSize += Dysymtab.nlocalsym;
+ if (Dysymtab.nlocalsym != 0 && BigSize > Symtab.nsyms) {
+ Err = malformedError("ilocalsym plus nlocalsym in LC_DYSYMTAB load "
+ "command extends past the end of the symbol table");
+ return;
+ }
+ if (Dysymtab.nextdefsym != 0 && Dysymtab.ilocalsym > Symtab.nsyms) {
+ Err = malformedError("nextdefsym in LC_DYSYMTAB load command "
+ "extends past the end of the symbol table");
+ return;
+ }
+ BigSize = Dysymtab.iextdefsym;
+ BigSize += Dysymtab.nextdefsym;
+ if (Dysymtab.nextdefsym != 0 && BigSize > Symtab.nsyms) {
+ Err = malformedError("iextdefsym plus nextdefsym in LC_DYSYMTAB "
+ "load command extends past the end of the symbol "
+ "table");
+ return;
+ }
+ if (Dysymtab.nundefsym != 0 && Dysymtab.iundefsym > Symtab.nsyms) {
+ Err = malformedError("nundefsym in LC_DYSYMTAB load command "
+ "extends past the end of the symbol table");
+ return;
+ }
+ BigSize = Dysymtab.iundefsym;
+ BigSize += Dysymtab.nundefsym;
+ if (Dysymtab.nundefsym != 0 && BigSize > Symtab.nsyms) {
+ Err = malformedError("iundefsym plus nundefsym in LC_DYSYMTAB load "
+ " command extends past the end of the symbol table");
+ return;
}
}
assert(LoadCommands.size() == LoadCommandCount);
+
+ Err = Error::success();
}
void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
@@ -327,13 +446,14 @@ void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
Symb.p += SymbolTableEntrySize;
}
-ErrorOr<StringRef> MachOObjectFile::getSymbolName(DataRefImpl Symb) const {
+Expected<StringRef> MachOObjectFile::getSymbolName(DataRefImpl Symb) const {
StringRef StringTable = getStringTableData();
MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
const char *Start = &StringTable.data()[Entry.n_strx];
- if (Start < getData().begin() || Start >= getData().end())
- report_fatal_error(
- "Symbol name entry points before beginning or past end of file.");
+ if (Start < getData().begin() || Start >= getData().end()) {
+ return malformedError("bad string index: " + Twine(Entry.n_strx) +
+ " for symbol at index " + Twine(getSymbolIndex(Symb)));
+ }
return StringRef(Start);
}
@@ -372,7 +492,7 @@ uint64_t MachOObjectFile::getSymbolValueImpl(DataRefImpl Sym) const {
return getNValue(Sym);
}
-ErrorOr<uint64_t> MachOObjectFile::getSymbolAddress(DataRefImpl Sym) const {
+Expected<uint64_t> MachOObjectFile::getSymbolAddress(DataRefImpl Sym) const {
return getSymbolValue(Sym);
}
@@ -389,7 +509,8 @@ uint64_t MachOObjectFile::getCommonSymbolSizeImpl(DataRefImpl DRI) const {
return getNValue(DRI);
}
-SymbolRef::Type MachOObjectFile::getSymbolType(DataRefImpl Symb) const {
+Expected<SymbolRef::Type>
+MachOObjectFile::getSymbolType(DataRefImpl Symb) const {
MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
uint8_t n_type = Entry.n_type;
@@ -401,7 +522,10 @@ SymbolRef::Type MachOObjectFile::getSymbolType(DataRefImpl Symb) const {
case MachO::N_UNDF :
return SymbolRef::ST_Unknown;
case MachO::N_SECT :
- section_iterator Sec = *getSymbolSection(Symb);
+ Expected<section_iterator> SecOrError = getSymbolSection(Symb);
+ if (!SecOrError)
+ return SecOrError.takeError();
+ section_iterator Sec = *SecOrError;
if (Sec->isData() || Sec->isBSS())
return SymbolRef::ST_Data;
return SymbolRef::ST_Function;
@@ -448,7 +572,7 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
return Result;
}
-ErrorOr<section_iterator>
+Expected<section_iterator>
MachOObjectFile::getSymbolSection(DataRefImpl Symb) const {
MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
uint8_t index = Entry.n_sect;
@@ -457,8 +581,10 @@ MachOObjectFile::getSymbolSection(DataRefImpl Symb) const {
return section_end();
DataRefImpl DRI;
DRI.d.a = index - 1;
- if (DRI.d.a >= Sections.size())
- report_fatal_error("getSymbolSection: Invalid section index.");
+ if (DRI.d.a >= Sections.size()) {
+ return malformedError("bad section index: " + Twine((int)index) +
+ " for symbol at index " + Twine(getSymbolIndex(Symb)));
+ }
return section_iterator(SectionRef(DRI, this));
}
@@ -546,6 +672,10 @@ uint64_t MachOObjectFile::getSectionAlignment(DataRefImpl Sec) const {
return uint64_t(1) << Align;
}
+bool MachOObjectFile::isSectionCompressed(DataRefImpl Sec) const {
+ return false;
+}
+
bool MachOObjectFile::isSectionText(DataRefImpl Sec) const {
uint32_t Flags = getSectionFlags(this, Sec);
return Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
@@ -576,6 +706,14 @@ bool MachOObjectFile::isSectionVirtual(DataRefImpl Sec) const {
return false;
}
+bool MachOObjectFile::isSectionBitcode(DataRefImpl Sec) const {
+ StringRef SegmentName = getSectionFinalSegmentName(Sec);
+ StringRef SectName;
+ if (!getSectionName(Sec, SectName))
+ return (SegmentName == "__LLVM" && SectName == "__bitcode");
+ return false;
+}
+
relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const {
DataRefImpl Ret;
Ret.d.a = Sec.d.a;
@@ -942,15 +1080,20 @@ MachOObjectFile::getRelocationRelocatedSection(relocation_iterator Rel) const {
}
basic_symbol_iterator MachOObjectFile::symbol_begin_impl() const {
+ DataRefImpl DRI;
+ MachO::symtab_command Symtab = getSymtabLoadCommand();
+ if (!SymtabLoadCmd || Symtab.nsyms == 0)
+ return basic_symbol_iterator(SymbolRef(DRI, this));
+
return getSymbolByIndex(0);
}
basic_symbol_iterator MachOObjectFile::symbol_end_impl() const {
DataRefImpl DRI;
- if (!SymtabLoadCmd)
+ MachO::symtab_command Symtab = getSymtabLoadCommand();
+ if (!SymtabLoadCmd || Symtab.nsyms == 0)
return basic_symbol_iterator(SymbolRef(DRI, this));
- MachO::symtab_command Symtab = getSymtabLoadCommand();
unsigned SymbolTableEntrySize = is64Bit() ?
sizeof(MachO::nlist_64) :
sizeof(MachO::nlist);
@@ -961,20 +1104,29 @@ basic_symbol_iterator MachOObjectFile::symbol_end_impl() const {
}
basic_symbol_iterator MachOObjectFile::getSymbolByIndex(unsigned Index) const {
- DataRefImpl DRI;
- if (!SymtabLoadCmd)
- return basic_symbol_iterator(SymbolRef(DRI, this));
-
MachO::symtab_command Symtab = getSymtabLoadCommand();
- if (Index >= Symtab.nsyms)
+ if (!SymtabLoadCmd || Index >= Symtab.nsyms)
report_fatal_error("Requested symbol index is out of range.");
unsigned SymbolTableEntrySize =
is64Bit() ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist);
+ DataRefImpl DRI;
DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
DRI.p += Index * SymbolTableEntrySize;
return basic_symbol_iterator(SymbolRef(DRI, this));
}
+uint64_t MachOObjectFile::getSymbolIndex(DataRefImpl Symb) const {
+ MachO::symtab_command Symtab = getSymtabLoadCommand();
+ if (!SymtabLoadCmd)
+ report_fatal_error("getSymbolIndex() called with no symbol table symbol");
+ unsigned SymbolTableEntrySize =
+ is64Bit() ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist);
+ DataRefImpl DRIstart;
+ DRIstart.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
+ uint64_t Index = (Symb.p - DRIstart.p) / SymbolTableEntrySize;
+ return Index;
+}
+
section_iterator MachOObjectFile::section_begin() const {
DataRefImpl DRI;
return section_iterator(SectionRef(DRI, this));
@@ -1036,8 +1188,8 @@ Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
}
}
-Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
- const char **McpuDefault) {
+Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
+ const char **McpuDefault) {
if (McpuDefault)
*McpuDefault = nullptr;
@@ -1077,13 +1229,13 @@ Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
case MachO::CPU_SUBTYPE_ARM_V7EM:
if (McpuDefault)
*McpuDefault = "cortex-m4";
- return Triple("armv7em-apple-darwin");
+ return Triple("thumbv7em-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7K:
return Triple("armv7k-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7M:
if (McpuDefault)
*McpuDefault = "cortex-m3";
- return Triple("armv7m-apple-darwin");
+ return Triple("thumbv7m-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7S:
return Triple("armv7s-apple-darwin");
default:
@@ -1115,56 +1267,6 @@ Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
}
}
-Triple MachOObjectFile::getThumbArch(uint32_t CPUType, uint32_t CPUSubType,
- const char **McpuDefault) {
- if (McpuDefault)
- *McpuDefault = nullptr;
-
- switch (CPUType) {
- case MachO::CPU_TYPE_ARM:
- switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
- case MachO::CPU_SUBTYPE_ARM_V4T:
- return Triple("thumbv4t-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V5TEJ:
- return Triple("thumbv5e-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_XSCALE:
- return Triple("xscale-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V6:
- return Triple("thumbv6-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V6M:
- if (McpuDefault)
- *McpuDefault = "cortex-m0";
- return Triple("thumbv6m-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V7:
- return Triple("thumbv7-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V7EM:
- if (McpuDefault)
- *McpuDefault = "cortex-m4";
- return Triple("thumbv7em-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V7K:
- return Triple("thumbv7k-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V7M:
- if (McpuDefault)
- *McpuDefault = "cortex-m3";
- return Triple("thumbv7m-apple-darwin");
- case MachO::CPU_SUBTYPE_ARM_V7S:
- return Triple("thumbv7s-apple-darwin");
- default:
- return Triple();
- }
- default:
- return Triple();
- }
-}
-
-Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
- const char **McpuDefault, Triple *ThumbTriple) {
- Triple T = MachOObjectFile::getArch(CPUType, CPUSubType, McpuDefault);
- *ThumbTriple = MachOObjectFile::getThumbArch(CPUType, CPUSubType,
- McpuDefault);
- return T;
-}
-
Triple MachOObjectFile::getHostArch() {
return Triple(sys::getDefaultTargetTriple());
}
@@ -1194,10 +1296,8 @@ unsigned MachOObjectFile::getArch() const {
return getArch(getCPUType(this));
}
-Triple MachOObjectFile::getArch(const char **McpuDefault,
- Triple *ThumbTriple) const {
- *ThumbTriple = getThumbArch(Header.cputype, Header.cpusubtype, McpuDefault);
- return getArch(Header.cputype, Header.cpusubtype, McpuDefault);
+Triple MachOObjectFile::getArchTriple(const char **McpuDefault) const {
+ return getArchTriple(Header.cputype, Header.cpusubtype, McpuDefault);
}
relocation_iterator MachOObjectFile::section_rel_begin(unsigned Index) const {
@@ -2307,23 +2407,17 @@ bool MachOObjectFile::isRelocatableObject() const {
return getHeader().filetype == MachO::MH_OBJECT;
}
-ErrorOr<std::unique_ptr<MachOObjectFile>>
+Expected<std::unique_ptr<MachOObjectFile>>
ObjectFile::createMachOObjectFile(MemoryBufferRef Buffer) {
StringRef Magic = Buffer.getBuffer().slice(0, 4);
- std::error_code EC;
- std::unique_ptr<MachOObjectFile> Ret;
if (Magic == "\xFE\xED\xFA\xCE")
- Ret.reset(new MachOObjectFile(Buffer, false, false, EC));
- else if (Magic == "\xCE\xFA\xED\xFE")
- Ret.reset(new MachOObjectFile(Buffer, true, false, EC));
- else if (Magic == "\xFE\xED\xFA\xCF")
- Ret.reset(new MachOObjectFile(Buffer, false, true, EC));
- else if (Magic == "\xCF\xFA\xED\xFE")
- Ret.reset(new MachOObjectFile(Buffer, true, true, EC));
- else
- return object_error::parse_failed;
-
- if (EC)
- return EC;
- return std::move(Ret);
+ return MachOObjectFile::create(Buffer, false, false);
+ if (Magic == "\xCE\xFA\xED\xFE")
+ return MachOObjectFile::create(Buffer, true, false);
+ if (Magic == "\xFE\xED\xFA\xCF")
+ return MachOObjectFile::create(Buffer, false, true);
+ if (Magic == "\xCF\xFA\xED\xFE")
+ return MachOObjectFile::create(Buffer, true, true);
+ return make_error<GenericBinaryError>("Unrecognized MachO magic number",
+ object_error::invalid_file_type);
}
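Invented caller sketch: with these changes a Mach-O parse failure surfaces as an llvm::Error carrying the "truncated or malformed object (...)" text rather than a bare std::error_code.

#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void openMachO(MemoryBufferRef Buffer) {
  Expected<std::unique_ptr<MachOObjectFile>> ObjOrErr =
      ObjectFile::createMachOObjectFile(Buffer);
  if (!ObjOrErr) {
    // Prints the descriptive malformed-object message, if any.
    logAllUnhandledErrors(ObjOrErr.takeError(), errs(), "MachO: ");
    return;
  }
  outs() << (*ObjOrErr)->getFileName() << "\n";
}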
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index a1c83b9b7f86..66c9151eb69c 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp
@@ -22,22 +22,11 @@
using namespace llvm;
using namespace object;
-template<typename T>
-static void SwapStruct(T &Value);
-
-template<>
-void SwapStruct(MachO::fat_header &H) {
- sys::swapByteOrder(H.magic);
- sys::swapByteOrder(H.nfat_arch);
-}
-
-template<>
-void SwapStruct(MachO::fat_arch &H) {
- sys::swapByteOrder(H.cputype);
- sys::swapByteOrder(H.cpusubtype);
- sys::swapByteOrder(H.offset);
- sys::swapByteOrder(H.size);
- sys::swapByteOrder(H.align);
+static Error
+malformedError(Twine Msg) {
+ std::string StringMsg = "truncated or malformed fat file (" + Msg.str() + ")";
+ return make_error<GenericBinaryError>(std::move(StringMsg),
+ object_error::parse_failed);
}
template<typename T>
@@ -46,7 +35,7 @@ static T getUniversalBinaryStruct(const char *Ptr) {
memcpy(&Res, Ptr, sizeof(T));
// Universal binary headers have big-endian byte order.
if (sys::IsLittleEndianHost)
- SwapStruct(Res);
+ swapStruct(Res);
return Res;
}
@@ -58,34 +47,53 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch(
} else {
// Parse object header.
StringRef ParentData = Parent->getData();
- const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
- Index * sizeof(MachO::fat_arch);
- Header = getUniversalBinaryStruct<MachO::fat_arch>(HeaderPos);
- if (ParentData.size() < Header.offset + Header.size) {
- clear();
+ if (Parent->getMagic() == MachO::FAT_MAGIC) {
+ const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
+ Index * sizeof(MachO::fat_arch);
+ Header = getUniversalBinaryStruct<MachO::fat_arch>(HeaderPos);
+ if (ParentData.size() < Header.offset + Header.size) {
+ clear();
+ }
+ } else { // Parent->getMagic() == MachO::FAT_MAGIC_64
+ const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
+ Index * sizeof(MachO::fat_arch_64);
+ Header64 = getUniversalBinaryStruct<MachO::fat_arch_64>(HeaderPos);
+ if (ParentData.size() < Header64.offset + Header64.size) {
+ clear();
+ }
}
}
}
-ErrorOr<std::unique_ptr<MachOObjectFile>>
+Expected<std::unique_ptr<MachOObjectFile>>
MachOUniversalBinary::ObjectForArch::getAsObjectFile() const {
if (!Parent)
- return object_error::parse_failed;
+ report_fatal_error("MachOUniversalBinary::ObjectForArch::getAsObjectFile() "
+ "called when Parent is a nullptr");
StringRef ParentData = Parent->getData();
- StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
+ StringRef ObjectData;
+ if (Parent->getMagic() == MachO::FAT_MAGIC)
+ ObjectData = ParentData.substr(Header.offset, Header.size);
+ else // Parent->getMagic() == MachO::FAT_MAGIC_64
+ ObjectData = ParentData.substr(Header64.offset, Header64.size);
StringRef ObjectName = Parent->getFileName();
MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
return ObjectFile::createMachOObjectFile(ObjBuffer);
}
-ErrorOr<std::unique_ptr<Archive>>
+Expected<std::unique_ptr<Archive>>
MachOUniversalBinary::ObjectForArch::getAsArchive() const {
if (!Parent)
- return object_error::parse_failed;
+ report_fatal_error("MachOUniversalBinary::ObjectForArch::getAsArchive() "
+ "called when Parent is a nullptr");
StringRef ParentData = Parent->getData();
- StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
+ StringRef ObjectData;
+ if (Parent->getMagic() == MachO::FAT_MAGIC)
+ ObjectData = ParentData.substr(Header.offset, Header.size);
+ else // Parent->getMagic() == MachO::FAT_MAGIC_64
+ ObjectData = ParentData.substr(Header64.offset, Header64.size);
StringRef ObjectName = Parent->getFileName();
MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
return Archive::create(ObjBuffer);
@@ -93,44 +101,64 @@ MachOUniversalBinary::ObjectForArch::getAsArchive() const {
void MachOUniversalBinary::anchor() { }
-ErrorOr<std::unique_ptr<MachOUniversalBinary>>
+Expected<std::unique_ptr<MachOUniversalBinary>>
MachOUniversalBinary::create(MemoryBufferRef Source) {
- std::error_code EC;
+ Error Err;
std::unique_ptr<MachOUniversalBinary> Ret(
- new MachOUniversalBinary(Source, EC));
- if (EC)
- return EC;
+ new MachOUniversalBinary(Source, Err));
+ if (Err)
+ return std::move(Err);
return std::move(Ret);
}
-MachOUniversalBinary::MachOUniversalBinary(MemoryBufferRef Source,
- std::error_code &ec)
- : Binary(Binary::ID_MachOUniversalBinary, Source), NumberOfObjects(0) {
+MachOUniversalBinary::MachOUniversalBinary(MemoryBufferRef Source, Error &Err)
+ : Binary(Binary::ID_MachOUniversalBinary, Source), Magic(0),
+ NumberOfObjects(0) {
+ ErrorAsOutParameter ErrAsOutParam(Err);
if (Data.getBufferSize() < sizeof(MachO::fat_header)) {
- ec = object_error::invalid_file_type;
+ Err = make_error<GenericBinaryError>("File too small to be a Mach-O "
+ "universal file",
+ object_error::invalid_file_type);
return;
}
// Check for magic value and sufficient header size.
StringRef Buf = getData();
- MachO::fat_header H= getUniversalBinaryStruct<MachO::fat_header>(Buf.begin());
+ MachO::fat_header H =
+ getUniversalBinaryStruct<MachO::fat_header>(Buf.begin());
+ Magic = H.magic;
NumberOfObjects = H.nfat_arch;
- uint32_t MinSize = sizeof(MachO::fat_header) +
- sizeof(MachO::fat_arch) * NumberOfObjects;
- if (H.magic != MachO::FAT_MAGIC || Buf.size() < MinSize) {
- ec = object_error::parse_failed;
+ uint32_t MinSize = sizeof(MachO::fat_header);
+ if (Magic == MachO::FAT_MAGIC)
+ MinSize += sizeof(MachO::fat_arch) * NumberOfObjects;
+ else if (Magic == MachO::FAT_MAGIC_64)
+ MinSize += sizeof(MachO::fat_arch_64) * NumberOfObjects;
+ else {
+ Err = malformedError("bad magic number");
+ return;
+ }
+ if (Buf.size() < MinSize) {
+ Err = malformedError("fat_arch" +
+ Twine(Magic == MachO::FAT_MAGIC ? "" : "_64") +
+ " structs would extend past the end of the file");
return;
}
- ec = std::error_code();
+ Err = Error::success();
}
-ErrorOr<std::unique_ptr<MachOObjectFile>>
+Expected<std::unique_ptr<MachOObjectFile>>
MachOUniversalBinary::getObjectForArch(StringRef ArchName) const {
if (Triple(ArchName).getArch() == Triple::ArchType::UnknownArch)
- return object_error::arch_not_found;
+ return make_error<GenericBinaryError>("Unknown architecture "
+ "named: " +
+ ArchName,
+ object_error::arch_not_found);
for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) {
if (I->getArchTypeName() == ArchName)
return I->getAsObjectFile();
}
- return object_error::arch_not_found;
+ return make_error<GenericBinaryError>("fat file does not "
+ "contain " +
+ ArchName,
+ object_error::arch_not_found);
}
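Not in the patch: a sketch of extracting one slice from a fat file through the Expected-returning getObjectForArch(); sliceForArch is an invented helper.

#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void sliceForArch(const MachOUniversalBinary &Fat, StringRef Arch) {
  Expected<std::unique_ptr<MachOObjectFile>> SliceOrErr =
      Fat.getObjectForArch(Arch);
  if (!SliceOrErr) {
    // E.g. "fat file does not contain x86_64".
    logAllUnhandledErrors(SliceOrErr.takeError(), errs(), "");
    return;
  }
  outs() << (*SliceOrErr)->getFileName() << "\n";
}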
diff --git a/lib/Object/Makefile b/lib/Object/Makefile
deleted file mode 100644
index 79388dc97f1a..000000000000
--- a/lib/Object/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/Object/Makefile ---------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMObject
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp
new file mode 100644
index 000000000000..e6b1040d8f5d
--- /dev/null
+++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp
@@ -0,0 +1,126 @@
+//===- ModuleSummaryIndexObjectFile.cpp - Summary index file implementation ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Part of the ModuleSummaryIndexObjectFile class implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace object;
+
+ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile(
+ MemoryBufferRef Object, std::unique_ptr<ModuleSummaryIndex> I)
+ : SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) {
+}
+
+ModuleSummaryIndexObjectFile::~ModuleSummaryIndexObjectFile() {}
+
+std::unique_ptr<ModuleSummaryIndex> ModuleSummaryIndexObjectFile::takeIndex() {
+ return std::move(Index);
+}
+
+ErrorOr<MemoryBufferRef>
+ModuleSummaryIndexObjectFile::findBitcodeInObject(const ObjectFile &Obj) {
+ for (const SectionRef &Sec : Obj.sections()) {
+ if (Sec.isBitcode()) {
+ StringRef SecContents;
+ if (std::error_code EC = Sec.getContents(SecContents))
+ return EC;
+ return MemoryBufferRef(SecContents, Obj.getFileName());
+ }
+ }
+
+ return object_error::bitcode_section_not_found;
+}
+
+ErrorOr<MemoryBufferRef>
+ModuleSummaryIndexObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
+ sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer());
+ switch (Type) {
+ case sys::fs::file_magic::bitcode:
+ return Object;
+ case sys::fs::file_magic::elf_relocatable:
+ case sys::fs::file_magic::macho_object:
+ case sys::fs::file_magic::coff_object: {
+ Expected<std::unique_ptr<ObjectFile>> ObjFile =
+ ObjectFile::createObjectFile(Object, Type);
+ if (!ObjFile)
+ return errorToErrorCode(ObjFile.takeError());
+ return findBitcodeInObject(*ObjFile->get());
+ }
+ default:
+ return object_error::invalid_file_type;
+ }
+}
+
+// Looks for a module summary index in the given memory buffer.
+// Returns true if one is found, false otherwise.
+bool ModuleSummaryIndexObjectFile::hasGlobalValueSummaryInMemBuffer(
+ MemoryBufferRef Object,
+ const DiagnosticHandlerFunction &DiagnosticHandler) {
+ ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
+ if (!BCOrErr)
+ return false;
+
+ return hasGlobalValueSummary(BCOrErr.get(), DiagnosticHandler);
+}
+
+// Parse the module summary index in the given memory buffer and return a
+// new ModuleSummaryIndexObjectFile instance containing the parsed index.
+ErrorOr<std::unique_ptr<ModuleSummaryIndexObjectFile>>
+ModuleSummaryIndexObjectFile::create(
+ MemoryBufferRef Object,
+ const DiagnosticHandlerFunction &DiagnosticHandler) {
+ std::unique_ptr<ModuleSummaryIndex> Index;
+
+ ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
+ if (!BCOrErr)
+ return BCOrErr.getError();
+
+ ErrorOr<std::unique_ptr<ModuleSummaryIndex>> IOrErr =
+ getModuleSummaryIndex(BCOrErr.get(), DiagnosticHandler);
+
+ if (std::error_code EC = IOrErr.getError())
+ return EC;
+
+ Index = std::move(IOrErr.get());
+
+ return llvm::make_unique<ModuleSummaryIndexObjectFile>(Object,
+ std::move(Index));
+}
+
+// Parse the module summary index out of an IR file and return the summary
+// index object if found, or an error code otherwise.
+ErrorOr<std::unique_ptr<ModuleSummaryIndex>> llvm::getModuleSummaryIndexForFile(
+ StringRef Path, const DiagnosticHandlerFunction &DiagnosticHandler) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Path);
+ std::error_code EC = FileOrErr.getError();
+ if (EC)
+ return EC;
+ MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
+ ErrorOr<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
+ object::ModuleSummaryIndexObjectFile::create(BufferRef,
+ DiagnosticHandler);
+ EC = ObjOrErr.getError();
+ if (EC)
+ return EC;
+
+ object::ModuleSummaryIndexObjectFile &Obj = **ObjOrErr;
+ return Obj.takeIndex();
+}
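A hedged sketch of driving the new getModuleSummaryIndexForFile() entry point from a tool, assuming DiagnosticHandlerFunction accepts any callable taking a const DiagnosticInfo reference; the no-op handler, the helper name and the stderr reporting are placeholders rather than part of this patch:

    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/Object/ModuleSummaryIndexObjectFile.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static std::unique_ptr<ModuleSummaryIndex> loadSummary(StringRef Path) {
      auto DiagHandler = [](const DiagnosticInfo &DI) { /* ignore diagnostics */ };
      ErrorOr<std::unique_ptr<ModuleSummaryIndex>> IndexOrErr =
          getModuleSummaryIndexForFile(Path, DiagHandler);
      if (std::error_code EC = IndexOrErr.getError()) {
        errs() << Path << ": " << EC.message() << '\n';
        return nullptr;
      }
      return std::move(IndexOrErr.get());
    }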
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index b44c1a16fd08..6df481b060e1 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -61,11 +61,14 @@ wrap(const relocation_iterator *SI) {
// ObjectFile creation
LLVMObjectFileRef LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
std::unique_ptr<MemoryBuffer> Buf(unwrap(MemBuf));
- ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr(
+ Expected<std::unique_ptr<ObjectFile>> ObjOrErr(
ObjectFile::createObjectFile(Buf->getMemBufferRef()));
std::unique_ptr<ObjectFile> Obj;
- if (!ObjOrErr)
+ if (!ObjOrErr) {
+ // TODO: Actually report errors helpfully.
+ consumeError(ObjOrErr.takeError());
return nullptr;
+ }
auto *Ret = new OwningBinary<ObjectFile>(std::move(ObjOrErr.get()), std::move(Buf));
return wrap(Ret);
@@ -98,9 +101,14 @@ void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) {
void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect,
LLVMSymbolIteratorRef Sym) {
- ErrorOr<section_iterator> SecOrErr = (*unwrap(Sym))->getSection();
- if (std::error_code ec = SecOrErr.getError())
- report_fatal_error(ec.message());
+ Expected<section_iterator> SecOrErr = (*unwrap(Sym))->getSection();
+ if (!SecOrErr) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(SecOrErr.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
*unwrap(Sect) = *SecOrErr;
}
@@ -175,16 +183,26 @@ void LLVMMoveToNextRelocation(LLVMRelocationIteratorRef SI) {
// SymbolRef accessors
const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
- ErrorOr<StringRef> Ret = (*unwrap(SI))->getName();
- if (std::error_code EC = Ret.getError())
- report_fatal_error(EC.message());
+ Expected<StringRef> Ret = (*unwrap(SI))->getName();
+ if (!Ret) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(Ret.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
return Ret->data();
}
uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
- ErrorOr<uint64_t> Ret = (*unwrap(SI))->getAddress();
- if (std::error_code EC = Ret.getError())
- report_fatal_error(EC.message());
+ Expected<uint64_t> Ret = (*unwrap(SI))->getAddress();
+ if (!Ret) {
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ logAllUnhandledErrors(Ret.takeError(), OS, "");
+ OS.flush();
+ report_fatal_error(Buf);
+ }
return *Ret;
}
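The same five-line Error-to-string dance now appears in several of these C-API shims. It is not part of the patch, but a sketch of folding it into one helper built only from calls the hunks above already use:

    static void reportError(llvm::Error Err) {
      std::string Buf;
      llvm::raw_string_ostream OS(Buf);
      llvm::logAllUnhandledErrors(std::move(Err), OS, "");
      OS.flush();
      llvm::report_fatal_error(Buf);
    }

    // Usage inside the accessors above:
    //   if (!Ret)
    //     reportError(Ret.takeError());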
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index d12dc411361c..92f9c1f4f0a0 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -29,9 +29,12 @@ ObjectFile::ObjectFile(unsigned int Type, MemoryBufferRef Source)
: SymbolicFile(Type, Source) {}
bool SectionRef::containsSymbol(SymbolRef S) const {
- ErrorOr<section_iterator> SymSec = S.getSection();
- if (!SymSec)
+ Expected<section_iterator> SymSec = S.getSection();
+ if (!SymSec) {
+ // TODO: Actually report errors helpfully.
+ consumeError(SymSec.takeError());
return false;
+ }
return *this == **SymSec;
}
@@ -46,20 +49,27 @@ uint64_t ObjectFile::getSymbolValue(DataRefImpl Ref) const {
std::error_code ObjectFile::printSymbolName(raw_ostream &OS,
DataRefImpl Symb) const {
- ErrorOr<StringRef> Name = getSymbolName(Symb);
- if (std::error_code EC = Name.getError())
- return EC;
+ Expected<StringRef> Name = getSymbolName(Symb);
+ if (!Name)
+ return errorToErrorCode(Name.takeError());
OS << *Name;
return std::error_code();
}
uint32_t ObjectFile::getSymbolAlignment(DataRefImpl DRI) const { return 0; }
+bool ObjectFile::isSectionBitcode(DataRefImpl Sec) const {
+ StringRef SectName;
+ if (!getSectionName(Sec, SectName))
+ return SectName == ".llvmbc";
+ return false;
+}
+
section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
return section_iterator(SectionRef(Sec, this));
}
-ErrorOr<std::unique_ptr<ObjectFile>>
+Expected<std::unique_ptr<ObjectFile>>
ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
StringRef Data = Object.getBuffer();
if (Type == sys::fs::file_magic::unknown)
@@ -71,13 +81,13 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
case sys::fs::file_magic::archive:
case sys::fs::file_magic::macho_universal_binary:
case sys::fs::file_magic::windows_resource:
- return object_error::invalid_file_type;
+ return errorCodeToError(object_error::invalid_file_type);
case sys::fs::file_magic::elf:
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::elf_executable:
case sys::fs::file_magic::elf_shared_object:
case sys::fs::file_magic::elf_core:
- return createELFObjectFile(Object);
+ return errorOrToExpected(createELFObjectFile(Object));
case sys::fs::file_magic::macho_object:
case sys::fs::file_magic::macho_executable:
case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
@@ -93,23 +103,23 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
case sys::fs::file_magic::coff_object:
case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
- return createCOFFObjectFile(Object);
+ return errorOrToExpected(createCOFFObjectFile(Object));
}
llvm_unreachable("Unexpected Object File Type");
}
-ErrorOr<OwningBinary<ObjectFile>>
+Expected<OwningBinary<ObjectFile>>
ObjectFile::createObjectFile(StringRef ObjectPath) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFile(ObjectPath);
if (std::error_code EC = FileOrErr.getError())
- return EC;
+ return errorCodeToError(EC);
std::unique_ptr<MemoryBuffer> Buffer = std::move(FileOrErr.get());
- ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+ Expected<std::unique_ptr<ObjectFile>> ObjOrErr =
createObjectFile(Buffer->getMemBufferRef());
- if (std::error_code EC = ObjOrErr.getError())
- return EC;
+ if (!ObjOrErr)
+ return ObjOrErr.takeError();
std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
return OwningBinary<ObjectFile>(std::move(Obj), std::move(Buffer));
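This file leans on three small bridges between the old and new error types: errorCodeToError(), errorToErrorCode() and errorOrToExpected(). A self-contained sketch of the same round trips; the parser names are invented for illustration:

    #include "llvm/Support/Error.h"
    #include "llvm/Support/ErrorOr.h"
    #include <system_error>
    using namespace llvm;

    // Hypothetical legacy API that still reports std::error_code.
    static ErrorOr<int> legacyParse(bool Ok) {
      if (!Ok)
        return std::make_error_code(std::errc::invalid_argument);
      return 42;
    }

    // New-style wrapper, mirroring the errorOrToExpected() calls above.
    static Expected<int> modernParse(bool Ok) {
      return errorOrToExpected(legacyParse(Ok));
    }

    // And back again, as printSymbolName() does with errorToErrorCode().
    static std::error_code useAsErrorCode(bool Ok) {
      Expected<int> V = modernParse(Ok);
      if (!V)
        return errorToErrorCode(V.takeError());
      return std::error_code();
    }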
diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp
index 42dbd3e0c2d8..f03bd5e5fb9f 100644
--- a/lib/Object/RecordStreamer.cpp
+++ b/lib/Object/RecordStreamer.cpp
@@ -23,21 +23,26 @@ void RecordStreamer::markDefined(const MCSymbol &Symbol) {
case Used:
S = Defined;
break;
+ case GlobalWeak:
+ break;
}
}
-void RecordStreamer::markGlobal(const MCSymbol &Symbol) {
+void RecordStreamer::markGlobal(const MCSymbol &Symbol,
+ MCSymbolAttr Attribute) {
State &S = Symbols[Symbol.getName()];
switch (S) {
case DefinedGlobal:
case Defined:
- S = DefinedGlobal;
+ S = (Attribute == MCSA_Weak) ? GlobalWeak : DefinedGlobal;
break;
case NeverSeen:
case Global:
case Used:
- S = Global;
+ S = (Attribute == MCSA_Weak) ? GlobalWeak : Global;
+ break;
+ case GlobalWeak:
break;
}
}
@@ -48,6 +53,7 @@ void RecordStreamer::markUsed(const MCSymbol &Symbol) {
case DefinedGlobal:
case Defined:
case Global:
+ case GlobalWeak:
break;
case NeverSeen:
@@ -84,8 +90,8 @@ void RecordStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
- if (Attribute == MCSA_Global)
- markGlobal(*Symbol);
+ if (Attribute == MCSA_Global || Attribute == MCSA_Weak)
+ markGlobal(*Symbol, Attribute);
return true;
}
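A standalone restatement (not LLVM code) of the transition function that markGlobal() implements after this change. The new GlobalWeak state is sticky: once a symbol is marked weak it stays weak, and markDefined() above also leaves it untouched.

    enum State { NeverSeen, Global, GlobalWeak, Defined, DefinedGlobal, Used };

    State markGlobal(State S, bool IsWeak) {
      switch (S) {
      case Defined:
      case DefinedGlobal:
        return IsWeak ? GlobalWeak : DefinedGlobal;
      case NeverSeen:
      case Global:
      case Used:
        return IsWeak ? GlobalWeak : Global;
      case GlobalWeak:
        return GlobalWeak;   // once weak, stays weak
      }
      return S;
    }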
diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h
index d8610610c332..71337a60591b 100644
--- a/lib/Object/RecordStreamer.h
+++ b/lib/Object/RecordStreamer.h
@@ -15,12 +15,12 @@
namespace llvm {
class RecordStreamer : public MCStreamer {
public:
- enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
+ enum State { NeverSeen, Global, GlobalWeak, Defined, DefinedGlobal, Used };
private:
StringMap<State> Symbols;
void markDefined(const MCSymbol &Symbol);
- void markGlobal(const MCSymbol &Symbol);
+ void markGlobal(const MCSymbol &Symbol, MCSymbolAttr Attribute);
void markUsed(const MCSymbol &Symbol);
void visitUsedSymbol(const MCSymbol &Sym) override;
diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp
index bf79dfb8da62..1e8e31b6b22d 100644
--- a/lib/Object/SymbolicFile.cpp
+++ b/lib/Object/SymbolicFile.cpp
@@ -26,7 +26,7 @@ SymbolicFile::SymbolicFile(unsigned int Type, MemoryBufferRef Source)
SymbolicFile::~SymbolicFile() {}
-ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
+Expected<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
MemoryBufferRef Object, sys::fs::file_magic Type, LLVMContext *Context) {
StringRef Data = Object.getBuffer();
if (Type == sys::fs::file_magic::unknown)
@@ -35,13 +35,13 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
switch (Type) {
case sys::fs::file_magic::bitcode:
if (Context)
- return IRObjectFile::create(Object, *Context);
+ return errorOrToExpected(IRObjectFile::create(Object, *Context));
// Fallthrough
case sys::fs::file_magic::unknown:
case sys::fs::file_magic::archive:
case sys::fs::file_magic::macho_universal_binary:
case sys::fs::file_magic::windows_resource:
- return object_error::invalid_file_type;
+ return errorCodeToError(object_error::invalid_file_type);
case sys::fs::file_magic::elf:
case sys::fs::file_magic::elf_executable:
case sys::fs::file_magic::elf_shared_object:
@@ -63,7 +63,7 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::macho_object:
case sys::fs::file_magic::coff_object: {
- ErrorOr<std::unique_ptr<ObjectFile>> Obj =
+ Expected<std::unique_ptr<ObjectFile>> Obj =
ObjectFile::createObjectFile(Object, Type);
if (!Obj || !Context)
return std::move(Obj);
@@ -73,9 +73,9 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
if (!BCData)
return std::move(Obj);
- return IRObjectFile::create(
- MemoryBufferRef(BCData->getBuffer(), Object.getBufferIdentifier()),
- *Context);
+ return errorOrToExpected(IRObjectFile::create(
+ MemoryBufferRef(BCData->getBuffer(),
+ Object.getBufferIdentifier()), *Context));
}
}
llvm_unreachable("Unexpected Binary File Type");
diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt
new file mode 100644
index 000000000000..77370908046e
--- /dev/null
+++ b/lib/ObjectYAML/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_llvm_library(LLVMObjectYAML
+ YAML.cpp
+ COFFYAML.cpp
+ ELFYAML.cpp
+ MachOYAML.cpp
+ ObjectYAML.cpp
+ )
diff --git a/lib/Object/COFFYAML.cpp b/lib/ObjectYAML/COFFYAML.cpp
index 4c1fca19bf1b..7f9f4c1f8c2c 100644
--- a/lib/Object/COFFYAML.cpp
+++ b/lib/ObjectYAML/COFFYAML.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/COFFYAML.h"
+#include "llvm/ObjectYAML/COFFYAML.h"
#define ECase(X) IO.enumCase(Value, #X, COFF::X);
namespace llvm {
@@ -397,7 +397,7 @@ void MappingTraits<COFFYAML::PEHeader>::mapping(IO &IO,
IO.mapOptional("CertificateTable", PH.DataDirectories[COFF::CERTIFICATE_TABLE]);
IO.mapOptional("BaseRelocationTable",
PH.DataDirectories[COFF::BASE_RELOCATION_TABLE]);
- IO.mapOptional("Debug", PH.DataDirectories[COFF::DEBUG]);
+ IO.mapOptional("Debug", PH.DataDirectories[COFF::DEBUG_DIRECTORY]);
IO.mapOptional("Architecture", PH.DataDirectories[COFF::ARCHITECTURE]);
IO.mapOptional("GlobalPtr", PH.DataDirectories[COFF::GLOBAL_PTR]);
IO.mapOptional("TlsTable", PH.DataDirectories[COFF::TLS_TABLE]);
@@ -487,12 +487,13 @@ void MappingTraits<COFFYAML::Section>::mapping(IO &IO, COFFYAML::Section &Sec) {
IO.mapRequired("Characteristics", NC->Characteristics);
IO.mapOptional("VirtualAddress", Sec.Header.VirtualAddress, 0U);
IO.mapOptional("VirtualSize", Sec.Header.VirtualSize, 0U);
- IO.mapOptional("Alignment", Sec.Alignment);
+ IO.mapOptional("Alignment", Sec.Alignment, 0U);
IO.mapRequired("SectionData", Sec.SectionData);
IO.mapOptional("Relocations", Sec.Relocations);
}
void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) {
+ IO.mapTag("!COFF", true);
IO.mapOptional("OptionalHeader", Obj.OptionalHeader);
IO.mapRequired("header", Obj.Header);
IO.mapRequired("sections", Obj.Sections);
diff --git a/lib/Object/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index 4a4b2276f46b..2137eee4752f 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/ELFYAML.h"
+#include "llvm/ObjectYAML/ELFYAML.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MipsABIFlags.h"
@@ -194,6 +194,8 @@ ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO,
ECase(EM_78KOR)
ECase(EM_56800EX)
ECase(EM_AMDGPU)
+ ECase(EM_LANAI)
+ ECase(EM_BPF)
#undef ECase
}
@@ -239,6 +241,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_ELFOSABI>::enumeration(
ECase(ELFOSABI_FENIXOS)
ECase(ELFOSABI_CLOUDABI)
ECase(ELFOSABI_C6000_ELFABI)
+ ECase(ELFOSABI_AMDGPU_HSA)
ECase(ELFOSABI_C6000_LINUX)
ECase(ELFOSABI_ARM)
ECase(ELFOSABI_STANDALONE)
@@ -336,6 +339,9 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCase(EF_AVR_ARCH_XMEGA6)
BCase(EF_AVR_ARCH_XMEGA7)
break;
+ case ELF::EM_AMDGPU:
+ case ELF::EM_X86_64:
+ break;
default:
llvm_unreachable("Unsupported architecture");
}
@@ -422,6 +428,22 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
BCase(SHF_AMDGPU_HSA_CODE)
BCase(SHF_AMDGPU_HSA_AGENT)
break;
+ case ELF::EM_HEXAGON:
+ BCase(SHF_HEX_GPREL)
+ break;
+ case ELF::EM_MIPS:
+ BCase(SHF_MIPS_NODUPES)
+ BCase(SHF_MIPS_NAMES)
+ BCase(SHF_MIPS_LOCAL)
+ BCase(SHF_MIPS_NOSTRIP)
+ BCase(SHF_MIPS_GPREL)
+ BCase(SHF_MIPS_MERGE)
+ BCase(SHF_MIPS_ADDR)
+ BCase(SHF_MIPS_STRING)
+ break;
+ case ELF::EM_X86_64:
+ BCase(SHF_X86_64_LARGE)
+ break;
default:
// Nothing to do.
break;
@@ -507,6 +529,15 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
case ELF::EM_ARM:
#include "llvm/Support/ELFRelocs/ARM.def"
break;
+ case ELF::EM_LANAI:
+#include "llvm/Support/ELFRelocs/Lanai.def"
+ break;
+ case ELF::EM_AMDGPU:
+#include "llvm/Support/ELFRelocs/AMDGPU.def"
+ break;
+ case ELF::EM_BPF:
+#include "llvm/Support/ELFRelocs/BPF.def"
+ break;
default:
llvm_unreachable("Unsupported architecture");
}
@@ -793,6 +824,7 @@ void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO,
void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
assert(!IO.getContext() && "The IO context is initialized already");
IO.setContext(&Object);
+ IO.mapTag("!ELF", true);
IO.mapRequired("FileHeader", Object.Header);
IO.mapOptional("Sections", Object.Sections);
IO.mapOptional("Symbols", Object.Symbols);
diff --git a/lib/ObjectYAML/LLVMBuild.txt b/lib/ObjectYAML/LLVMBuild.txt
new file mode 100644
index 000000000000..b8d1d2f1779e
--- /dev/null
+++ b/lib/ObjectYAML/LLVMBuild.txt
@@ -0,0 +1,14 @@
+;===------------------------------------------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ObjectYAML
+parent = Libraries
+required_libraries = Support
diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp
new file mode 100644
index 000000000000..d819e80836ce
--- /dev/null
+++ b/lib/ObjectYAML/MachOYAML.cpp
@@ -0,0 +1,546 @@
+//===- MachOYAML.cpp - MachO YAMLIO implementation ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of MachO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/MachOYAML.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+#include <string.h> // For memcpy, memset and strnlen.
+
+namespace llvm {
+
+MachOYAML::LoadCommand::~LoadCommand() {}
+
+namespace yaml {
+
+void ScalarTraits<char_16>::output(const char_16 &Val, void *,
+ llvm::raw_ostream &Out) {
+ auto Len = strnlen(&Val[0], 16);
+ Out << StringRef(&Val[0], Len);
+}
+
+StringRef ScalarTraits<char_16>::input(StringRef Scalar, void *, char_16 &Val) {
+ size_t CopySize = Scalar.size() >= 16 ? 16 : Scalar.size();
+ memcpy((void *)Val, Scalar.data(), CopySize);
+
+ if (Scalar.size() < 16) {
+ memset((void *)&Val[Scalar.size()], 0, 16 - Scalar.size());
+ }
+
+ return StringRef();
+}
+
+bool ScalarTraits<char_16>::mustQuote(StringRef S) { return needsQuotes(S); }
+
+void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *,
+ llvm::raw_ostream &Out) {
+ for (int Idx = 0; Idx < 16; ++Idx) {
+ Out << format("%02" PRIX32, Val[Idx]);
+ if (Idx == 3 || Idx == 5 || Idx == 7 || Idx == 9)
+ Out << "-";
+ }
+}
+
+StringRef ScalarTraits<uuid_t>::input(StringRef Scalar, void *, uuid_t &Val) {
+ size_t OutIdx = 0;
+ for (size_t Idx = 0; Idx < Scalar.size(); ++Idx) {
+ if (Scalar[Idx] == '-' || OutIdx >= 16)
+ continue;
+ unsigned long long TempInt;
+ if (getAsUnsignedInteger(Scalar.slice(Idx, Idx + 2), 16, TempInt))
+ return "invalid number";
+ if (TempInt > 0xFF)
+ return "out of range number";
+ Val[OutIdx] = static_cast<uint8_t>(TempInt);
+ ++Idx; // increment idx an extra time because we're consuming 2 chars
+ ++OutIdx;
+ }
+ return StringRef();
+}
+
+bool ScalarTraits<uuid_t>::mustQuote(StringRef S) { return needsQuotes(S); }
+
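For reference, the text produced by ScalarTraits<uuid_t>::output above follows the usual 8-4-4-4-12 grouping: two uppercase hex digits per byte, with a dash after bytes 3, 5, 7 and 9. A plain-C++ restatement of the same layout, independent of the YAML machinery:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    std::string formatUUID(const uint8_t (&Val)[16]) {
      std::string Out;
      char Hex[3];
      for (int Idx = 0; Idx < 16; ++Idx) {
        std::snprintf(Hex, sizeof(Hex), "%02X", Val[Idx]);
        Out += Hex;
        if (Idx == 3 || Idx == 5 || Idx == 7 || Idx == 9)
          Out += '-';
      }
      return Out;  // 16 bytes -> "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
    }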
+void MappingTraits<MachOYAML::FileHeader>::mapping(
+ IO &IO, MachOYAML::FileHeader &FileHdr) {
+ IO.mapRequired("magic", FileHdr.magic);
+ IO.mapRequired("cputype", FileHdr.cputype);
+ IO.mapRequired("cpusubtype", FileHdr.cpusubtype);
+ IO.mapRequired("filetype", FileHdr.filetype);
+ IO.mapRequired("ncmds", FileHdr.ncmds);
+ IO.mapRequired("sizeofcmds", FileHdr.sizeofcmds);
+ IO.mapRequired("flags", FileHdr.flags);
+ if (FileHdr.magic == MachO::MH_MAGIC_64 ||
+ FileHdr.magic == MachO::MH_CIGAM_64)
+ IO.mapRequired("reserved", FileHdr.reserved);
+}
+
+void MappingTraits<MachOYAML::Object>::mapping(IO &IO,
+ MachOYAML::Object &Object) {
+ // Set the context only when it isn't already set, i.e. when this is the
+ // outermost document; the document itself is tagged !mach-o. Fat files
+ // carry a different top-level tag so the two can be distinguished.
+ if (!IO.getContext()) {
+ IO.setContext(&Object);
+ }
+ IO.mapTag("!mach-o", true);
+ IO.mapRequired("FileHeader", Object.Header);
+ IO.mapOptional("LoadCommands", Object.LoadCommands);
+ IO.mapOptional("LinkEditData", Object.LinkEdit);
+
+ if (IO.getContext() == &Object)
+ IO.setContext(nullptr);
+}
+
+void MappingTraits<MachOYAML::FatHeader>::mapping(
+ IO &IO, MachOYAML::FatHeader &FatHeader) {
+ IO.mapRequired("magic", FatHeader.magic);
+ IO.mapRequired("nfat_arch", FatHeader.nfat_arch);
+}
+
+void MappingTraits<MachOYAML::FatArch>::mapping(IO &IO,
+ MachOYAML::FatArch &FatArch) {
+ IO.mapRequired("cputype", FatArch.cputype);
+ IO.mapRequired("cpusubtype", FatArch.cpusubtype);
+ IO.mapRequired("offset", FatArch.offset);
+ IO.mapRequired("size", FatArch.size);
+ IO.mapRequired("align", FatArch.align);
+ IO.mapOptional("reserved", FatArch.reserved,
+ static_cast<llvm::yaml::Hex32>(0));
+}
+
+void MappingTraits<MachOYAML::UniversalBinary>::mapping(
+ IO &IO, MachOYAML::UniversalBinary &UniversalBinary) {
+ if (!IO.getContext()) {
+ IO.setContext(&UniversalBinary);
+ IO.mapTag("!fat-mach-o", true);
+ }
+ IO.mapRequired("FatHeader", UniversalBinary.Header);
+ IO.mapRequired("FatArchs", UniversalBinary.FatArchs);
+ IO.mapRequired("Slices", UniversalBinary.Slices);
+
+ if (IO.getContext() == &UniversalBinary)
+ IO.setContext(nullptr);
+}
+
+void MappingTraits<MachOYAML::LinkEditData>::mapping(
+ IO &IO, MachOYAML::LinkEditData &LinkEditData) {
+ IO.mapOptional("RebaseOpcodes", LinkEditData.RebaseOpcodes);
+ IO.mapOptional("BindOpcodes", LinkEditData.BindOpcodes);
+ IO.mapOptional("WeakBindOpcodes", LinkEditData.WeakBindOpcodes);
+ IO.mapOptional("LazyBindOpcodes", LinkEditData.LazyBindOpcodes);
+ IO.mapOptional("ExportTrie", LinkEditData.ExportTrie);
+ IO.mapOptional("NameList", LinkEditData.NameList);
+ IO.mapOptional("StringTable", LinkEditData.StringTable);
+}
+
+void MappingTraits<MachOYAML::RebaseOpcode>::mapping(
+ IO &IO, MachOYAML::RebaseOpcode &RebaseOpcode) {
+ IO.mapRequired("Opcode", RebaseOpcode.Opcode);
+ IO.mapRequired("Imm", RebaseOpcode.Imm);
+ IO.mapOptional("ExtraData", RebaseOpcode.ExtraData);
+}
+
+void MappingTraits<MachOYAML::BindOpcode>::mapping(
+ IO &IO, MachOYAML::BindOpcode &BindOpcode) {
+ IO.mapRequired("Opcode", BindOpcode.Opcode);
+ IO.mapRequired("Imm", BindOpcode.Imm);
+ IO.mapOptional("ULEBExtraData", BindOpcode.ULEBExtraData);
+ IO.mapOptional("SLEBExtraData", BindOpcode.SLEBExtraData);
+ IO.mapOptional("Symbol", BindOpcode.Symbol);
+}
+
+void MappingTraits<MachOYAML::ExportEntry>::mapping(
+ IO &IO, MachOYAML::ExportEntry &ExportEntry) {
+ IO.mapRequired("TerminalSize", ExportEntry.TerminalSize);
+ IO.mapOptional("NodeOffset", ExportEntry.NodeOffset);
+ IO.mapOptional("Name", ExportEntry.Name);
+ IO.mapOptional("Flags", ExportEntry.Flags);
+ IO.mapOptional("Address", ExportEntry.Address);
+ IO.mapOptional("Other", ExportEntry.Other);
+ IO.mapOptional("ImportName", ExportEntry.ImportName);
+ IO.mapOptional("Children", ExportEntry.Children);
+}
+
+void MappingTraits<MachOYAML::NListEntry>::mapping(
+ IO &IO, MachOYAML::NListEntry &NListEntry) {
+ IO.mapRequired("n_strx", NListEntry.n_strx);
+ IO.mapRequired("n_type", NListEntry.n_type);
+ IO.mapRequired("n_sect", NListEntry.n_sect);
+ IO.mapRequired("n_desc", NListEntry.n_desc);
+ IO.mapRequired("n_value", NListEntry.n_value);
+}
+
+template <typename StructType>
+void mapLoadCommandData(IO &IO, MachOYAML::LoadCommand &LoadCommand) {}
+
+template <>
+void mapLoadCommandData<MachO::segment_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Sections", LoadCommand.Sections);
+}
+
+template <>
+void mapLoadCommandData<MachO::segment_command_64>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Sections", LoadCommand.Sections);
+}
+
+template <>
+void mapLoadCommandData<MachO::dylib_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("PayloadString", LoadCommand.PayloadString);
+}
+
+template <>
+void mapLoadCommandData<MachO::rpath_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("PayloadString", LoadCommand.PayloadString);
+}
+
+template <>
+void mapLoadCommandData<MachO::dylinker_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("PayloadString", LoadCommand.PayloadString);
+}
+
+void MappingTraits<MachOYAML::LoadCommand>::mapping(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ MachO::LoadCommandType TempCmd = static_cast<MachO::LoadCommandType>(
+ LoadCommand.Data.load_command_data.cmd);
+ IO.mapRequired("cmd", TempCmd);
+ LoadCommand.Data.load_command_data.cmd = TempCmd;
+ IO.mapRequired("cmdsize", LoadCommand.Data.load_command_data.cmdsize);
+
+#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \
+ case MachO::LCName: \
+ MappingTraits<MachO::LCStruct>::mapping(IO, \
+ LoadCommand.Data.LCStruct##_data); \
+ mapLoadCommandData<MachO::LCStruct>(IO, LoadCommand); \
+ break;
+
+ switch (LoadCommand.Data.load_command_data.cmd) {
+#include "llvm/Support/MachO.def"
+ }
+ IO.mapOptional("PayloadBytes", LoadCommand.PayloadBytes);
+ IO.mapOptional("ZeroPadBytes", LoadCommand.ZeroPadBytes, (uint64_t)0ull);
+}
+
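The switch above is driven by an X-macro: HANDLE_LOAD_COMMAND is defined locally and then MachO.def expands it once per load command, so every command type gets a case without being listed by hand. A minimal standalone sketch of the same pattern, with the command list inlined instead of living in a .def file and the names LC_FOO / LC_BAR invented for illustration:

    // Stand-in for the contents of a .def file.
    #define MY_LOAD_COMMANDS(HANDLE) \
      HANDLE(LC_FOO, 0x1)            \
      HANDLE(LC_BAR, 0x2)

    static const char *commandName(unsigned Cmd) {
    #define HANDLE_COMMAND(Name, Value) \
      case Value:                       \
        return #Name;
      switch (Cmd) {
        MY_LOAD_COMMANDS(HANDLE_COMMAND)
      default:
        return "unknown";
      }
    #undef HANDLE_COMMAND
    }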
+void MappingTraits<MachO::dyld_info_command>::mapping(
+ IO &IO, MachO::dyld_info_command &LoadCommand) {
+ IO.mapRequired("rebase_off", LoadCommand.rebase_off);
+ IO.mapRequired("rebase_size", LoadCommand.rebase_size);
+ IO.mapRequired("bind_off", LoadCommand.bind_off);
+ IO.mapRequired("bind_size", LoadCommand.bind_size);
+ IO.mapRequired("weak_bind_off", LoadCommand.weak_bind_off);
+ IO.mapRequired("weak_bind_size", LoadCommand.weak_bind_size);
+ IO.mapRequired("lazy_bind_off", LoadCommand.lazy_bind_off);
+ IO.mapRequired("lazy_bind_size", LoadCommand.lazy_bind_size);
+ IO.mapRequired("export_off", LoadCommand.export_off);
+ IO.mapRequired("export_size", LoadCommand.export_size);
+}
+
+void MappingTraits<MachOYAML::Section>::mapping(IO &IO,
+ MachOYAML::Section &Section) {
+ IO.mapRequired("sectname", Section.sectname);
+ IO.mapRequired("segname", Section.segname);
+ IO.mapRequired("addr", Section.addr);
+ IO.mapRequired("size", Section.size);
+ IO.mapRequired("offset", Section.offset);
+ IO.mapRequired("align", Section.align);
+ IO.mapRequired("reloff", Section.reloff);
+ IO.mapRequired("nreloc", Section.nreloc);
+ IO.mapRequired("flags", Section.flags);
+ IO.mapRequired("reserved1", Section.reserved1);
+ IO.mapRequired("reserved2", Section.reserved2);
+ IO.mapOptional("reserved3", Section.reserved3);
+}
+
+void MappingTraits<MachO::dylib>::mapping(IO &IO, MachO::dylib &DylibStruct) {
+ IO.mapRequired("name", DylibStruct.name);
+ IO.mapRequired("timestamp", DylibStruct.timestamp);
+ IO.mapRequired("current_version", DylibStruct.current_version);
+ IO.mapRequired("compatibility_version", DylibStruct.compatibility_version);
+}
+
+void MappingTraits<MachO::dylib_command>::mapping(
+ IO &IO, MachO::dylib_command &LoadCommand) {
+ IO.mapRequired("dylib", LoadCommand.dylib);
+}
+
+void MappingTraits<MachO::dylinker_command>::mapping(
+ IO &IO, MachO::dylinker_command &LoadCommand) {
+
+ IO.mapRequired("name", LoadCommand.name);
+}
+
+void MappingTraits<MachO::dysymtab_command>::mapping(
+ IO &IO, MachO::dysymtab_command &LoadCommand) {
+
+ IO.mapRequired("ilocalsym", LoadCommand.ilocalsym);
+ IO.mapRequired("nlocalsym", LoadCommand.nlocalsym);
+ IO.mapRequired("iextdefsym", LoadCommand.iextdefsym);
+ IO.mapRequired("nextdefsym", LoadCommand.nextdefsym);
+ IO.mapRequired("iundefsym", LoadCommand.iundefsym);
+ IO.mapRequired("nundefsym", LoadCommand.nundefsym);
+ IO.mapRequired("tocoff", LoadCommand.tocoff);
+ IO.mapRequired("ntoc", LoadCommand.ntoc);
+ IO.mapRequired("modtaboff", LoadCommand.modtaboff);
+ IO.mapRequired("nmodtab", LoadCommand.nmodtab);
+ IO.mapRequired("extrefsymoff", LoadCommand.extrefsymoff);
+ IO.mapRequired("nextrefsyms", LoadCommand.nextrefsyms);
+ IO.mapRequired("indirectsymoff", LoadCommand.indirectsymoff);
+ IO.mapRequired("nindirectsyms", LoadCommand.nindirectsyms);
+ IO.mapRequired("extreloff", LoadCommand.extreloff);
+ IO.mapRequired("nextrel", LoadCommand.nextrel);
+ IO.mapRequired("locreloff", LoadCommand.locreloff);
+ IO.mapRequired("nlocrel", LoadCommand.nlocrel);
+}
+
+void MappingTraits<MachO::encryption_info_command>::mapping(
+ IO &IO, MachO::encryption_info_command &LoadCommand) {
+
+ IO.mapRequired("cryptoff", LoadCommand.cryptoff);
+ IO.mapRequired("cryptsize", LoadCommand.cryptsize);
+ IO.mapRequired("cryptid", LoadCommand.cryptid);
+}
+
+void MappingTraits<MachO::encryption_info_command_64>::mapping(
+ IO &IO, MachO::encryption_info_command_64 &LoadCommand) {
+
+ IO.mapRequired("cryptoff", LoadCommand.cryptoff);
+ IO.mapRequired("cryptsize", LoadCommand.cryptsize);
+ IO.mapRequired("cryptid", LoadCommand.cryptid);
+ IO.mapRequired("pad", LoadCommand.pad);
+}
+
+void MappingTraits<MachO::entry_point_command>::mapping(
+ IO &IO, MachO::entry_point_command &LoadCommand) {
+
+ IO.mapRequired("entryoff", LoadCommand.entryoff);
+ IO.mapRequired("stacksize", LoadCommand.stacksize);
+}
+
+void MappingTraits<MachO::fvmfile_command>::mapping(
+ IO &IO, MachO::fvmfile_command &LoadCommand) {
+
+ IO.mapRequired("name", LoadCommand.name);
+ IO.mapRequired("header_addr", LoadCommand.header_addr);
+}
+
+void MappingTraits<MachO::fvmlib>::mapping(IO &IO, MachO::fvmlib &FVMLib) {
+ IO.mapRequired("name", FVMLib.name);
+ IO.mapRequired("minor_version", FVMLib.minor_version);
+ IO.mapRequired("header_addr", FVMLib.header_addr);
+}
+
+void MappingTraits<MachO::fvmlib_command>::mapping(
+ IO &IO, MachO::fvmlib_command &LoadCommand) {
+
+ IO.mapRequired("fvmlib", LoadCommand.fvmlib);
+}
+
+void MappingTraits<MachO::ident_command>::mapping(
+ IO &IO, MachO::ident_command &LoadCommand) {}
+
+void MappingTraits<MachO::linkedit_data_command>::mapping(
+ IO &IO, MachO::linkedit_data_command &LoadCommand) {
+
+ IO.mapRequired("dataoff", LoadCommand.dataoff);
+ IO.mapRequired("datasize", LoadCommand.datasize);
+}
+
+void MappingTraits<MachO::linker_option_command>::mapping(
+ IO &IO, MachO::linker_option_command &LoadCommand) {
+
+ IO.mapRequired("count", LoadCommand.count);
+}
+
+void MappingTraits<MachO::prebind_cksum_command>::mapping(
+ IO &IO, MachO::prebind_cksum_command &LoadCommand) {
+
+ IO.mapRequired("cksum", LoadCommand.cksum);
+}
+
+void MappingTraits<MachO::load_command>::mapping(
+ IO &IO, MachO::load_command &LoadCommand) {}
+
+void MappingTraits<MachO::prebound_dylib_command>::mapping(
+ IO &IO, MachO::prebound_dylib_command &LoadCommand) {
+
+ IO.mapRequired("name", LoadCommand.name);
+ IO.mapRequired("nmodules", LoadCommand.nmodules);
+ IO.mapRequired("linked_modules", LoadCommand.linked_modules);
+}
+
+void MappingTraits<MachO::routines_command>::mapping(
+ IO &IO, MachO::routines_command &LoadCommand) {
+
+ IO.mapRequired("init_address", LoadCommand.init_address);
+ IO.mapRequired("init_module", LoadCommand.init_module);
+ IO.mapRequired("reserved1", LoadCommand.reserved1);
+ IO.mapRequired("reserved2", LoadCommand.reserved2);
+ IO.mapRequired("reserved3", LoadCommand.reserved3);
+ IO.mapRequired("reserved4", LoadCommand.reserved4);
+ IO.mapRequired("reserved5", LoadCommand.reserved5);
+ IO.mapRequired("reserved6", LoadCommand.reserved6);
+}
+
+void MappingTraits<MachO::routines_command_64>::mapping(
+ IO &IO, MachO::routines_command_64 &LoadCommand) {
+
+ IO.mapRequired("init_address", LoadCommand.init_address);
+ IO.mapRequired("init_module", LoadCommand.init_module);
+ IO.mapRequired("reserved1", LoadCommand.reserved1);
+ IO.mapRequired("reserved2", LoadCommand.reserved2);
+ IO.mapRequired("reserved3", LoadCommand.reserved3);
+ IO.mapRequired("reserved4", LoadCommand.reserved4);
+ IO.mapRequired("reserved5", LoadCommand.reserved5);
+ IO.mapRequired("reserved6", LoadCommand.reserved6);
+}
+
+void MappingTraits<MachO::rpath_command>::mapping(
+ IO &IO, MachO::rpath_command &LoadCommand) {
+
+ IO.mapRequired("path", LoadCommand.path);
+}
+
+void MappingTraits<MachO::section>::mapping(IO &IO, MachO::section &Section) {
+ IO.mapRequired("sectname", Section.sectname);
+ IO.mapRequired("segname", Section.segname);
+ IO.mapRequired("addr", Section.addr);
+ IO.mapRequired("size", Section.size);
+ IO.mapRequired("offset", Section.offset);
+ IO.mapRequired("align", Section.align);
+ IO.mapRequired("reloff", Section.reloff);
+ IO.mapRequired("nreloc", Section.nreloc);
+ IO.mapRequired("flags", Section.flags);
+ IO.mapRequired("reserved1", Section.reserved1);
+ IO.mapRequired("reserved2", Section.reserved2);
+}
+
+void MappingTraits<MachO::section_64>::mapping(IO &IO,
+ MachO::section_64 &Section) {
+ IO.mapRequired("sectname", Section.sectname);
+ IO.mapRequired("segname", Section.segname);
+ IO.mapRequired("addr", Section.addr);
+ IO.mapRequired("size", Section.size);
+ IO.mapRequired("offset", Section.offset);
+ IO.mapRequired("align", Section.align);
+ IO.mapRequired("reloff", Section.reloff);
+ IO.mapRequired("nreloc", Section.nreloc);
+ IO.mapRequired("flags", Section.flags);
+ IO.mapRequired("reserved1", Section.reserved1);
+ IO.mapRequired("reserved2", Section.reserved2);
+ IO.mapRequired("reserved3", Section.reserved3);
+}
+
+void MappingTraits<MachO::segment_command>::mapping(
+ IO &IO, MachO::segment_command &LoadCommand) {
+
+ IO.mapRequired("segname", LoadCommand.segname);
+ IO.mapRequired("vmaddr", LoadCommand.vmaddr);
+ IO.mapRequired("vmsize", LoadCommand.vmsize);
+ IO.mapRequired("fileoff", LoadCommand.fileoff);
+ IO.mapRequired("filesize", LoadCommand.filesize);
+ IO.mapRequired("maxprot", LoadCommand.maxprot);
+ IO.mapRequired("initprot", LoadCommand.initprot);
+ IO.mapRequired("nsects", LoadCommand.nsects);
+ IO.mapRequired("flags", LoadCommand.flags);
+}
+
+void MappingTraits<MachO::segment_command_64>::mapping(
+ IO &IO, MachO::segment_command_64 &LoadCommand) {
+
+ IO.mapRequired("segname", LoadCommand.segname);
+ IO.mapRequired("vmaddr", LoadCommand.vmaddr);
+ IO.mapRequired("vmsize", LoadCommand.vmsize);
+ IO.mapRequired("fileoff", LoadCommand.fileoff);
+ IO.mapRequired("filesize", LoadCommand.filesize);
+ IO.mapRequired("maxprot", LoadCommand.maxprot);
+ IO.mapRequired("initprot", LoadCommand.initprot);
+ IO.mapRequired("nsects", LoadCommand.nsects);
+ IO.mapRequired("flags", LoadCommand.flags);
+}
+
+void MappingTraits<MachO::source_version_command>::mapping(
+ IO &IO, MachO::source_version_command &LoadCommand) {
+
+ IO.mapRequired("version", LoadCommand.version);
+}
+
+void MappingTraits<MachO::sub_client_command>::mapping(
+ IO &IO, MachO::sub_client_command &LoadCommand) {
+
+ IO.mapRequired("client", LoadCommand.client);
+}
+
+void MappingTraits<MachO::sub_framework_command>::mapping(
+ IO &IO, MachO::sub_framework_command &LoadCommand) {
+
+ IO.mapRequired("umbrella", LoadCommand.umbrella);
+}
+
+void MappingTraits<MachO::sub_library_command>::mapping(
+ IO &IO, MachO::sub_library_command &LoadCommand) {
+
+ IO.mapRequired("sub_library", LoadCommand.sub_library);
+}
+
+void MappingTraits<MachO::sub_umbrella_command>::mapping(
+ IO &IO, MachO::sub_umbrella_command &LoadCommand) {
+
+ IO.mapRequired("sub_umbrella", LoadCommand.sub_umbrella);
+}
+
+void MappingTraits<MachO::symseg_command>::mapping(
+ IO &IO, MachO::symseg_command &LoadCommand) {
+
+ IO.mapRequired("offset", LoadCommand.offset);
+ IO.mapRequired("size", LoadCommand.size);
+}
+
+void MappingTraits<MachO::symtab_command>::mapping(
+ IO &IO, MachO::symtab_command &LoadCommand) {
+
+ IO.mapRequired("symoff", LoadCommand.symoff);
+ IO.mapRequired("nsyms", LoadCommand.nsyms);
+ IO.mapRequired("stroff", LoadCommand.stroff);
+ IO.mapRequired("strsize", LoadCommand.strsize);
+}
+
+void MappingTraits<MachO::thread_command>::mapping(
+ IO &IO, MachO::thread_command &LoadCommand) {}
+
+void MappingTraits<MachO::twolevel_hints_command>::mapping(
+ IO &IO, MachO::twolevel_hints_command &LoadCommand) {
+
+ IO.mapRequired("offset", LoadCommand.offset);
+ IO.mapRequired("nhints", LoadCommand.nhints);
+}
+
+void MappingTraits<MachO::uuid_command>::mapping(
+ IO &IO, MachO::uuid_command &LoadCommand) {
+
+ IO.mapRequired("uuid", LoadCommand.uuid);
+}
+
+void MappingTraits<MachO::version_min_command>::mapping(
+ IO &IO, MachO::version_min_command &LoadCommand) {
+
+ IO.mapRequired("version", LoadCommand.version);
+ IO.mapRequired("sdk", LoadCommand.sdk);
+}
+
+} // namespace llvm::yaml
+
+} // namespace llvm
diff --git a/lib/ObjectYAML/ObjectYAML.cpp b/lib/ObjectYAML/ObjectYAML.cpp
new file mode 100644
index 000000000000..97741b5ec8b7
--- /dev/null
+++ b/lib/ObjectYAML/ObjectYAML.cpp
@@ -0,0 +1,57 @@
+//===- ObjectYAML.cpp - YAML utilities for object files -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a wrapper class for handling tagged YAML input
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/ObjectYAML/ObjectYAML.h"
+
+using namespace llvm;
+using namespace yaml;
+
+void MappingTraits<YamlObjectFile>::mapping(IO &IO,
+ YamlObjectFile &ObjectFile) {
+ if (IO.outputting()) {
+ if (ObjectFile.Elf)
+ MappingTraits<ELFYAML::Object>::mapping(IO, *ObjectFile.Elf);
+ if (ObjectFile.Coff)
+ MappingTraits<COFFYAML::Object>::mapping(IO, *ObjectFile.Coff);
+ if (ObjectFile.MachO)
+ MappingTraits<MachOYAML::Object>::mapping(IO, *ObjectFile.MachO);
+ if (ObjectFile.FatMachO)
+ MappingTraits<MachOYAML::UniversalBinary>::mapping(IO,
+ *ObjectFile.FatMachO);
+ } else {
+ if (IO.mapTag("!ELF")) {
+ ObjectFile.Elf.reset(new ELFYAML::Object());
+ MappingTraits<ELFYAML::Object>::mapping(IO, *ObjectFile.Elf);
+ } else if (IO.mapTag("!COFF")) {
+ ObjectFile.Coff.reset(new COFFYAML::Object());
+ MappingTraits<COFFYAML::Object>::mapping(IO, *ObjectFile.Coff);
+ } else if (IO.mapTag("!mach-o")) {
+ ObjectFile.MachO.reset(new MachOYAML::Object());
+ MappingTraits<MachOYAML::Object>::mapping(IO, *ObjectFile.MachO);
+ } else if (IO.mapTag("!fat-mach-o")) {
+ ObjectFile.FatMachO.reset(new MachOYAML::UniversalBinary());
+ MappingTraits<MachOYAML::UniversalBinary>::mapping(IO,
+ *ObjectFile.FatMachO);
+ } else {
+ Input &In = (Input &)IO;
+ std::string Tag = In.getCurrentNode()->getRawTag();
+ if (Tag.empty())
+ IO.setError("YAML Object File missing document type tag!");
+ else
+ IO.setError(
+ llvm::Twine("YAML Object File unsupported document type tag '") +
+ llvm::Twine(Tag.c_str()) + llvm::Twine("'!"));
+ }
+ }
+}
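A hedged sketch of what a consumer of the new wrapper looks like, assuming the usual YAMLTraits plumbing (yaml::Input plus operator>>) and an in-memory buffer; a yaml2obj-style driver could dispatch on the document tag roughly like this:

    #include "llvm/ObjectYAML/ObjectYAML.h"
    #include "llvm/Support/YAMLTraits.h"
    using namespace llvm;

    // Returns true if Buf held a recognised, well-formed tagged document.
    static bool readTaggedObject(StringRef Buf) {
      yaml::Input YIn(Buf);
      YamlObjectFile Doc;
      YIn >> Doc;   // dispatches on !ELF, !COFF, !mach-o or !fat-mach-o
      if (YIn.error())
        return false;
      return Doc.Elf || Doc.Coff || Doc.MachO || Doc.FatMachO;
    }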
diff --git a/lib/MC/YAML.cpp b/lib/ObjectYAML/YAML.cpp
index 067e91a26d37..75cf1fbccc80 100644
--- a/lib/MC/YAML.cpp
+++ b/lib/ObjectYAML/YAML.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/YAML.h"
+#include "llvm/ObjectYAML/YAML.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
@@ -56,10 +56,6 @@ void yaml::BinaryRef::writeAsHex(raw_ostream &OS) const {
OS.write((const char *)Data.data(), Data.size());
return;
}
- for (ArrayRef<uint8_t>::iterator I = Data.begin(), E = Data.end(); I != E;
- ++I) {
- uint8_t Byte = *I;
- OS << hexdigit(Byte >> 4);
- OS << hexdigit(Byte & 0xf);
- }
+ for (uint8_t Byte : Data)
+ OS << hexdigit(Byte >> 4) << hexdigit(Byte & 0xf);
}
diff --git a/lib/Option/Makefile b/lib/Option/Makefile
deleted file mode 100644
index 255d0796e237..000000000000
--- a/lib/Option/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/Option/Makefile ---------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMOption
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp
index 09d4cebb83d0..13aa9667b5c2 100644
--- a/lib/Option/OptTable.cpp
+++ b/lib/Option/OptTable.cpp
@@ -315,7 +315,7 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
break;
case Option::SeparateClass: case Option::JoinedOrSeparateClass:
- case Option::RemainingArgsClass:
+ case Option::RemainingArgsClass: case Option::RemainingArgsJoinedClass:
Name += ' ';
// FALLTHROUGH
case Option::JoinedClass: case Option::CommaJoinedClass:
diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index ebf05aab764b..5eb179fbd257 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp
@@ -51,6 +51,7 @@ void Option::print(raw_ostream &O) const {
P(JoinedOrSeparateClass);
P(JoinedAndSeparateClass);
P(RemainingArgsClass);
+ P(RemainingArgsJoinedClass);
#undef P
}
@@ -82,7 +83,7 @@ void Option::print(raw_ostream &O) const {
O << ">\n";
}
-void Option::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void Option::dump() const { print(dbgs()); }
bool Option::matches(OptSpecifier Opt) const {
// Aliases are never considered in matching, look through them.
@@ -234,6 +235,19 @@ Arg *Option::accept(const ArgList &Args,
A->getValues().push_back(Args.getArgString(Index++));
return A;
}
+ case RemainingArgsJoinedClass: {
+ Arg *A = new Arg(UnaliasedOption, Spelling, Index);
+ if (ArgSize != strlen(Args.getArgString(Index))) {
+ // An inexact match means there is a joined arg.
+ A->getValues().push_back(Args.getArgString(Index) + ArgSize);
+ }
+ Index++;
+ while (Index < Args.getNumInputArgStrings() &&
+ Args.getArgString(Index) != nullptr)
+ A->getValues().push_back(Args.getArgString(Index++));
+ return A;
+ }
+
default:
llvm_unreachable("Invalid option kind!");
}
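RemainingArgsJoinedClass combines two existing behaviours: a value may be glued directly onto the option spelling, and every later argument is swallowed as an additional value. Illustration only, with a made-up option name:

    //   Declared kind: RemainingArgsJoinedClass, spelling "-run-args"
    //
    //     prog -run-args ls -l /tmp     ->  values: "ls", "-l", "/tmp"
    //     prog -run-argsls -l /tmp      ->  values: "ls", "-l", "/tmp"
    //                   ^^^ inexact match: the joined suffix "ls" becomes
    //                       the first value, then the rest are appended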
diff --git a/lib/Passes/LLVMBuild.txt b/lib/Passes/LLVMBuild.txt
index a752f42dcedd..4d8c7f85d3aa 100644
--- a/lib/Passes/LLVMBuild.txt
+++ b/lib/Passes/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = Passes
parent = Libraries
-required_libraries = Analysis Core IPO InstCombine Scalar Support TransformUtils Vectorize
+required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation
diff --git a/lib/Passes/Makefile b/lib/Passes/Makefile
deleted file mode 100644
index 413dc5cf485b..000000000000
--- a/lib/Passes/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/Passes/Makefile ---------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMPasses
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 8ba81f72a717..0e64df80f91f 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -16,112 +16,265 @@
//===----------------------------------------------------------------------===//
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasAnalysisEvaluator.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
+#include "llvm/CodeGen/UnreachableBlockElim.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Regex.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/GCOVProfiler.h"
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/SampleProfile.h"
#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/Transforms/Utils/LCSSA.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Transforms/Utils/SimplifyInstructions.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+
+#include <type_traits>
using namespace llvm;
+static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$");
+
namespace {
/// \brief No-op module pass which does nothing.
struct NoOpModulePass {
- PreservedAnalyses run(Module &M) { return PreservedAnalyses::all(); }
+ PreservedAnalyses run(Module &M, AnalysisManager<Module> &) {
+ return PreservedAnalyses::all();
+ }
static StringRef name() { return "NoOpModulePass"; }
};
/// \brief No-op module analysis.
-struct NoOpModuleAnalysis {
+class NoOpModuleAnalysis : public AnalysisInfoMixin<NoOpModuleAnalysis> {
+ friend AnalysisInfoMixin<NoOpModuleAnalysis>;
+ static char PassID;
+
+public:
struct Result {};
- Result run(Module &) { return Result(); }
+ Result run(Module &, AnalysisManager<Module> &) { return Result(); }
static StringRef name() { return "NoOpModuleAnalysis"; }
- static void *ID() { return (void *)&PassID; }
-private:
- static char PassID;
};
-char NoOpModuleAnalysis::PassID;
-
/// \brief No-op CGSCC pass which does nothing.
struct NoOpCGSCCPass {
- PreservedAnalyses run(LazyCallGraph::SCC &C) {
+ PreservedAnalyses run(LazyCallGraph::SCC &C,
+ AnalysisManager<LazyCallGraph::SCC> &) {
return PreservedAnalyses::all();
}
static StringRef name() { return "NoOpCGSCCPass"; }
};
/// \brief No-op CGSCC analysis.
-struct NoOpCGSCCAnalysis {
+class NoOpCGSCCAnalysis : public AnalysisInfoMixin<NoOpCGSCCAnalysis> {
+ friend AnalysisInfoMixin<NoOpCGSCCAnalysis>;
+ static char PassID;
+
+public:
struct Result {};
- Result run(LazyCallGraph::SCC &) { return Result(); }
+ Result run(LazyCallGraph::SCC &, AnalysisManager<LazyCallGraph::SCC> &) {
+ return Result();
+ }
static StringRef name() { return "NoOpCGSCCAnalysis"; }
- static void *ID() { return (void *)&PassID; }
-private:
- static char PassID;
};
-char NoOpCGSCCAnalysis::PassID;
-
/// \brief No-op function pass which does nothing.
struct NoOpFunctionPass {
- PreservedAnalyses run(Function &F) { return PreservedAnalyses::all(); }
+ PreservedAnalyses run(Function &F, AnalysisManager<Function> &) {
+ return PreservedAnalyses::all();
+ }
static StringRef name() { return "NoOpFunctionPass"; }
};
/// \brief No-op function analysis.
-struct NoOpFunctionAnalysis {
+class NoOpFunctionAnalysis : public AnalysisInfoMixin<NoOpFunctionAnalysis> {
+ friend AnalysisInfoMixin<NoOpFunctionAnalysis>;
+ static char PassID;
+
+public:
struct Result {};
- Result run(Function &) { return Result(); }
+ Result run(Function &, AnalysisManager<Function> &) { return Result(); }
static StringRef name() { return "NoOpFunctionAnalysis"; }
- static void *ID() { return (void *)&PassID; }
-private:
+};
+
+/// \brief No-op loop pass which does nothing.
+struct NoOpLoopPass {
+ PreservedAnalyses run(Loop &L, AnalysisManager<Loop> &) {
+ return PreservedAnalyses::all();
+ }
+ static StringRef name() { return "NoOpLoopPass"; }
+};
+
+/// \brief No-op loop analysis.
+class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
+ friend AnalysisInfoMixin<NoOpLoopAnalysis>;
static char PassID;
+
+public:
+ struct Result {};
+ Result run(Loop &, AnalysisManager<Loop> &) { return Result(); }
+ static StringRef name() { return "NoOpLoopAnalysis"; }
};
+char NoOpModuleAnalysis::PassID;
+char NoOpCGSCCAnalysis::PassID;
char NoOpFunctionAnalysis::PassID;
+char NoOpLoopAnalysis::PassID;
} // End anonymous namespace.
void PassBuilder::registerModuleAnalyses(ModuleAnalysisManager &MAM) {
-#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
- MAM.registerPass(CREATE_PASS);
+#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
+ MAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
}
void PassBuilder::registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM) {
-#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
- CGAM.registerPass(CREATE_PASS);
+#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
+ CGAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
}
void PassBuilder::registerFunctionAnalyses(FunctionAnalysisManager &FAM) {
-#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
- FAM.registerPass(CREATE_PASS);
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ FAM.registerPass([&] { return CREATE_PASS; });
+#include "PassRegistry.def"
+}
+
+void PassBuilder::registerLoopAnalyses(LoopAnalysisManager &LAM) {
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ LAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
}
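The registration macros now wrap each constructor in a lambda, so the analysis manager holds a factory that builds the pass on demand rather than a prebuilt object. Expanded by hand for two of the function analyses named in the includes above (a sketch, not the literal macro output):

    FunctionAnalysisManager FAM(/*DebugLogging*/ false);
    FAM.registerPass([&] { return TargetLibraryAnalysis(); });
    FAM.registerPass([&] { return LoopAnalysis(); });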
+void PassBuilder::addPerModuleDefaultPipeline(ModulePassManager &MPM,
+ OptimizationLevel Level,
+ bool DebugLogging) {
+ // FIXME: Finish fleshing this out to match the legacy pipelines.
+ FunctionPassManager EarlyFPM(DebugLogging);
+ EarlyFPM.addPass(SimplifyCFGPass());
+ EarlyFPM.addPass(SROA());
+ EarlyFPM.addPass(EarlyCSEPass());
+ EarlyFPM.addPass(LowerExpectIntrinsicPass());
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+}
+
+void PassBuilder::addLTOPreLinkDefaultPipeline(ModulePassManager &MPM,
+ OptimizationLevel Level,
+ bool DebugLogging) {
+ // FIXME: We should use a customized pre-link pipeline!
+ addPerModuleDefaultPipeline(MPM, Level, DebugLogging);
+}
+
+void PassBuilder::addLTODefaultPipeline(ModulePassManager &MPM,
+ OptimizationLevel Level,
+ bool DebugLogging) {
+ // FIXME: Finish fleshing this out to match the legacy LTO pipelines.
+ FunctionPassManager LateFPM(DebugLogging);
+ LateFPM.addPass(InstCombinePass());
+ LateFPM.addPass(SimplifyCFGPass());
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
+}
+
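A hedged sketch of wiring the new default pipeline into a tool. The proxy registrations follow the pattern opt's setup code uses and are assumed here; they matter because the pipeline above immediately drops into a module-to-function adaptor, which needs to reach the function analysis manager:

    static void runDefaultPipeline(Module &M) {
      bool DebugLogging = false;
      PassBuilder PB;
      FunctionAnalysisManager FAM(DebugLogging);
      ModuleAnalysisManager MAM(DebugLogging);
      PB.registerFunctionAnalyses(FAM);
      PB.registerModuleAnalyses(MAM);
      // Cross-register the proxies so function passes can reach MAM and the
      // adaptor can reach FAM.
      MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
      FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });

      ModulePassManager MPM(DebugLogging);
      PB.addPerModuleDefaultPipeline(MPM, PassBuilder::O2, DebugLogging);
      MPM.run(M, MAM);
    }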
#ifndef NDEBUG
static bool isModulePassName(StringRef Name) {
-#define MODULE_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+ // Manually handle aliases for pre-configured pipeline fragments.
+ if (Name.startswith("default") || Name.startswith("lto"))
+ return DefaultAliasRegex.match(Name);
+
+#define MODULE_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -132,7 +285,9 @@ static bool isModulePassName(StringRef Name) {
#endif
static bool isCGSCCPassName(StringRef Name) {
-#define CGSCC_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+#define CGSCC_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -142,7 +297,9 @@ static bool isCGSCCPassName(StringRef Name) {
}
static bool isFunctionPassName(StringRef Name) {
-#define FUNCTION_PASS(NAME, CREATE_PASS) if (Name == NAME) return true;
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -151,7 +308,46 @@ static bool isFunctionPassName(StringRef Name) {
return false;
}
-bool PassBuilder::parseModulePassName(ModulePassManager &MPM, StringRef Name) {
+static bool isLoopPassName(StringRef Name) {
+#define LOOP_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
+ return true;
+#include "PassRegistry.def"
+
+ return false;
+}
+
+bool PassBuilder::parseModulePassName(ModulePassManager &MPM, StringRef Name,
+ bool DebugLogging) {
+ // Manually handle aliases for pre-configured pipeline fragments.
+ if (Name.startswith("default") || Name.startswith("lto")) {
+ SmallVector<StringRef, 3> Matches;
+ if (!DefaultAliasRegex.match(Name, &Matches))
+ return false;
+ assert(Matches.size() == 3 && "Must capture two matched strings!");
+
+ auto L = StringSwitch<OptimizationLevel>(Matches[2])
+ .Case("O0", O0)
+ .Case("O1", O1)
+ .Case("O2", O2)
+ .Case("O3", O3)
+ .Case("Os", Os)
+ .Case("Oz", Oz);
+
+ if (Matches[1] == "default") {
+ addPerModuleDefaultPipeline(MPM, L, DebugLogging);
+ } else if (Matches[1] == "lto-pre-link") {
+ addLTOPreLinkDefaultPipeline(MPM, L, DebugLogging);
+ } else {
+ assert(Matches[1] == "lto" && "Not one of the matched options!");
+ addLTODefaultPipeline(MPM, L, DebugLogging);
+ }
+ return true;
+ }
+
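    // In practice these aliases surface in opt's -passes= syntax; for example
    // (other levels such as O0..O3, Os, Oz work the same way):
    //
    //   opt -passes='default<O2>'       per-module pipeline at O2
    //   opt -passes='lto-pre-link<O2>'  pre-link half of the LTO pipeline
    //   opt -passes='lto<O2>'           LTO back-end pipeline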
#define MODULE_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
MPM.addPass(CREATE_PASS); \
@@ -159,11 +355,13 @@ bool PassBuilder::parseModulePassName(ModulePassManager &MPM, StringRef Name) {
}
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
- MPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>()); \
+ MPM.addPass(RequireAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
return true; \
} \
if (Name == "invalidate<" NAME ">") { \
- MPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>()); \
+ MPM.addPass(InvalidateAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
return true; \
}
#include "PassRegistry.def"
@@ -179,11 +377,13 @@ bool PassBuilder::parseCGSCCPassName(CGSCCPassManager &CGPM, StringRef Name) {
}
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
- CGPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>()); \
+ CGPM.addPass(RequireAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
return true; \
} \
if (Name == "invalidate<" NAME ">") { \
- CGPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>()); \
+ CGPM.addPass(InvalidateAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
return true; \
}
#include "PassRegistry.def"
@@ -200,11 +400,53 @@ bool PassBuilder::parseFunctionPassName(FunctionPassManager &FPM,
}
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
- FPM.addPass(RequireAnalysisPass<decltype(CREATE_PASS)>()); \
+ FPM.addPass(RequireAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
+ return true; \
+ } \
+ if (Name == "invalidate<" NAME ">") { \
+ FPM.addPass(InvalidateAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
+ return true; \
+ }
+#include "PassRegistry.def"
+
+ return false;
+}
+
+bool PassBuilder::parseLoopPassName(LoopPassManager &FPM, StringRef Name) {
+#define LOOP_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ FPM.addPass(CREATE_PASS); \
+ return true; \
+ }
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == "require<" NAME ">") { \
+ FPM.addPass(RequireAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
return true; \
} \
if (Name == "invalidate<" NAME ">") { \
- FPM.addPass(InvalidateAnalysisPass<decltype(CREATE_PASS)>()); \
+ FPM.addPass(InvalidateAnalysisPass< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>()); \
+ return true; \
+ }
+#include "PassRegistry.def"
+
+ return false;
+}
+
+bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
+#define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ AA.registerModuleAnalysis< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>(); \
+ return true; \
+ }
+#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ AA.registerFunctionAnalysis< \
+ std::remove_reference<decltype(CREATE_PASS)>::type>(); \
return true; \
}
#include "PassRegistry.def"
@@ -212,6 +454,45 @@ bool PassBuilder::parseFunctionPassName(FunctionPassManager &FPM,
return false;
}
+bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
+ StringRef &PipelineText,
+ bool VerifyEachPass,
+ bool DebugLogging) {
+ for (;;) {
+ // Parse nested pass managers by recursing.
+ if (PipelineText.startswith("loop(")) {
+ LoopPassManager NestedLPM(DebugLogging);
+
+      // Parse the inner pipeline into the nested manager.
+ PipelineText = PipelineText.substr(strlen("loop("));
+ if (!parseLoopPassPipeline(NestedLPM, PipelineText, VerifyEachPass,
+ DebugLogging) ||
+ PipelineText.empty())
+ return false;
+ assert(PipelineText[0] == ')');
+ PipelineText = PipelineText.substr(1);
+
+ // Add the nested pass manager with the appropriate adaptor.
+ LPM.addPass(std::move(NestedLPM));
+ } else {
+ // Otherwise try to parse a pass name.
+ size_t End = PipelineText.find_first_of(",)");
+ if (!parseLoopPassName(LPM, PipelineText.substr(0, End)))
+ return false;
+ // TODO: Ideally, we would run a LoopVerifierPass() here in the
+ // VerifyEachPass case, but we don't have such a verifier yet.
+
+ PipelineText = PipelineText.substr(End);
+ }
+
+ if (PipelineText.empty() || PipelineText[0] == ')')
+ return true;
+
+ assert(PipelineText[0] == ',');
+ PipelineText = PipelineText.substr(1);
+ }
+}
+
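// Usage sketch (not part of this patch; assumes a PassBuilder `PB` constructed
// by the surrounding driver): the parser above is reached whenever a pipeline
// string contains a loop(...) fragment, so loop passes can now be spelled in
// the textual syntax. The pass names come from the LOOP_PASS entries added to
// PassRegistry.def further down in this patch.
ModulePassManager MPM(/*DebugLogging=*/false);
bool Parsed = PB.parsePassPipeline(MPM,
                                   "function(loop(licm,rotate,simplify-cfg))",
                                   /*VerifyEachPass=*/true,
                                   /*DebugLogging=*/false);
assert(Parsed && "nested loop pipeline should parse");
(void)Parsed;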
bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
StringRef &PipelineText,
bool VerifyEachPass,
@@ -232,6 +513,20 @@ bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
// Add the nested pass manager with the appropriate adaptor.
FPM.addPass(std::move(NestedFPM));
+ } else if (PipelineText.startswith("loop(")) {
+ LoopPassManager NestedLPM(DebugLogging);
+
+      // Parse the inner pipeline into the nested manager.
+ PipelineText = PipelineText.substr(strlen("loop("));
+ if (!parseLoopPassPipeline(NestedLPM, PipelineText, VerifyEachPass,
+ DebugLogging) ||
+ PipelineText.empty())
+ return false;
+ assert(PipelineText[0] == ')');
+ PipelineText = PipelineText.substr(1);
+
+ // Add the nested pass manager with the appropriate adaptor.
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(NestedLPM)));
} else {
// Otherwise try to parse a pass name.
size_t End = PipelineText.find_first_of(",)");
@@ -284,7 +579,8 @@ bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
PipelineText = PipelineText.substr(1);
// Add the nested pass manager with the appropriate adaptor.
- CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(NestedFPM)));
+ CGPM.addPass(
+ createCGSCCToFunctionPassAdaptor(std::move(NestedFPM), DebugLogging));
} else {
// Otherwise try to parse a pass name.
size_t End = PipelineText.find_first_of(",)");
@@ -303,6 +599,20 @@ bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
}
}
+void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
+ FunctionAnalysisManager &FAM,
+ CGSCCAnalysisManager &CGAM,
+ ModuleAnalysisManager &MAM) {
+ MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
+ MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); });
+ CGAM.registerPass([&] { return FunctionAnalysisManagerCGSCCProxy(FAM); });
+ CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); });
+ FAM.registerPass([&] { return CGSCCAnalysisManagerFunctionProxy(CGAM); });
+ FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
+ FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
+ LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
+}
+
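// Setup sketch (not part of this patch): how a driver such as opt is expected
// to combine crossRegisterProxies with the PassBuilder::register*Analyses
// helpers; those helpers are assumed here and are not shown in this diff.
PassBuilder PB;
LoopAnalysisManager LAM(/*DebugLogging=*/false);
FunctionAnalysisManager FAM(/*DebugLogging=*/false);
CGSCCAnalysisManager CGAM(/*DebugLogging=*/false);
ModuleAnalysisManager MAM(/*DebugLogging=*/false);
PB.registerModuleAnalyses(MAM);
PB.registerCGSCCAnalyses(CGAM);
PB.registerFunctionAnalyses(FAM);
PB.registerLoopAnalyses(LAM);
PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);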
bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
StringRef &PipelineText,
bool VerifyEachPass,
@@ -336,8 +646,8 @@ bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
PipelineText = PipelineText.substr(1);
// Add the nested pass manager with the appropriate adaptor.
- MPM.addPass(
- createModuleToPostOrderCGSCCPassAdaptor(std::move(NestedCGPM)));
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(NestedCGPM),
+ DebugLogging));
} else if (PipelineText.startswith("function(")) {
FunctionPassManager NestedFPM(DebugLogging);
@@ -355,7 +665,7 @@ bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
} else {
// Otherwise try to parse a pass name.
size_t End = PipelineText.find_first_of(",)");
- if (!parseModulePassName(MPM, PipelineText.substr(0, End)))
+ if (!parseModulePassName(MPM, PipelineText.substr(0, End), DebugLogging))
return false;
if (VerifyEachPass)
MPM.addPass(VerifierPass());
@@ -392,19 +702,20 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
// If this looks like a CGSCC pass, parse the whole thing as a CGSCC
// pipeline.
- if (isCGSCCPassName(FirstName)) {
+ if (PipelineText.startswith("cgscc(") || isCGSCCPassName(FirstName)) {
CGSCCPassManager CGPM(DebugLogging);
if (!parseCGSCCPassPipeline(CGPM, PipelineText, VerifyEachPass,
DebugLogging) ||
!PipelineText.empty())
return false;
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+ MPM.addPass(
+ createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM), DebugLogging));
return true;
}
// Similarly, if this looks like a Function pass, parse the whole thing as
  // a Function pipeline.
- if (isFunctionPassName(FirstName)) {
+ if (PipelineText.startswith("function(") || isFunctionPassName(FirstName)) {
FunctionPassManager FPM(DebugLogging);
if (!parseFunctionPassPipeline(FPM, PipelineText, VerifyEachPass,
DebugLogging) ||
@@ -414,5 +725,29 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
return true;
}
+ // If this looks like a Loop pass, parse the whole thing as a Loop pipeline.
+ if (PipelineText.startswith("loop(") || isLoopPassName(FirstName)) {
+ LoopPassManager LPM(DebugLogging);
+ if (!parseLoopPassPipeline(LPM, PipelineText, VerifyEachPass,
+ DebugLogging) ||
+ !PipelineText.empty())
+ return false;
+ FunctionPassManager FPM(DebugLogging);
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ return true;
+ }
+
return false;
}
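// Illustrative sketch (not part of this patch; `PB` and `MPM` as in the setup
// sketches above): the explicit prefixes recognized above mean all of these
// parse, even though the leading pass name alone does not identify the level.
bool CG = PB.parsePassPipeline(MPM, "cgscc(function-attrs)", true, false);
bool Fn = PB.parsePassPipeline(MPM, "function(instcombine,simplify-cfg)", true,
                               false);
bool Lp = PB.parsePassPipeline(MPM, "loop(licm)", true, false);
assert(CG && Fn && Lp && "explicit cgscc(/function(/loop( prefixes are parsed");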
+
+bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
+ while (!PipelineText.empty()) {
+ StringRef Name;
+ std::tie(Name, PipelineText) = PipelineText.split(',');
+ if (!parseAAPassName(AA, Name))
+ return false;
+ }
+
+ return true;
+}
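// Usage sketch (not part of this patch; `PB` and a FunctionAnalysisManager
// `FAM` are assumed from the surrounding driver): populate an AAManager from a
// textual AA pipeline and install it as the "aa" analysis. The names match the
// FUNCTION_ALIAS_ANALYSIS entries registered in PassRegistry.def below.
AAManager AA;
bool ValidAA =
    PB.parseAAPipeline(AA, "basic-aa,scoped-noalias-aa,type-based-aa");
assert(ValidAA && "malformed AA pipeline string");
(void)ValidAA;
FAM.registerPass([&] { return std::move(AA); });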
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 241a78927c77..b717057632b4 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -19,21 +19,53 @@
#ifndef MODULE_ANALYSIS
#define MODULE_ANALYSIS(NAME, CREATE_PASS)
#endif
+MODULE_ANALYSIS("callgraph", CallGraphAnalysis())
MODULE_ANALYSIS("lcg", LazyCallGraphAnalysis())
MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis())
+MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis())
MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
+MODULE_ANALYSIS("verify", VerifierAnalysis())
+
+#ifndef MODULE_ALIAS_ANALYSIS
+#define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ MODULE_ANALYSIS(NAME, CREATE_PASS)
+#endif
+MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA())
+#undef MODULE_ALIAS_ANALYSIS
#undef MODULE_ANALYSIS
#ifndef MODULE_PASS
#define MODULE_PASS(NAME, CREATE_PASS)
#endif
+MODULE_PASS("constmerge", ConstantMergePass())
+MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
+MODULE_PASS("deadargelim", DeadArgumentEliminationPass())
+MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
+MODULE_PASS("globaldce", GlobalDCEPass())
+MODULE_PASS("globalopt", GlobalOptPass())
MODULE_PASS("inferattrs", InferFunctionAttrsPass())
+MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass())
+MODULE_PASS("instrprof", InstrProfiling())
+MODULE_PASS("internalize", InternalizePass())
MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+MODULE_PASS("ipsccp", IPSCCPPass())
+MODULE_PASS("lowertypetests", LowerTypeTestsPass())
MODULE_PASS("no-op-module", NoOpModulePass())
+MODULE_PASS("partial-inliner", PartialInlinerPass())
+MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
+MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
+MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
+MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass())
+MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(dbgs()))
+MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs()))
MODULE_PASS("print", PrintModulePass(dbgs()))
-MODULE_PASS("print-cg", LazyCallGraphPrinterPass(dbgs()))
+MODULE_PASS("print-lcg", LazyCallGraphPrinterPass(dbgs()))
+MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs()))
+MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass())
+MODULE_PASS("sample-profile", SampleProfileLoaderPass())
MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass())
+MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
MODULE_PASS("verify", VerifierPass())
#undef MODULE_PASS
@@ -47,38 +79,127 @@ CGSCC_ANALYSIS("no-op-cgscc", NoOpCGSCCAnalysis())
#define CGSCC_PASS(NAME, CREATE_PASS)
#endif
CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass())
#undef CGSCC_PASS
#ifndef FUNCTION_ANALYSIS
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS)
#endif
+FUNCTION_ANALYSIS("aa", AAManager())
FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis())
+FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis())
+FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis())
FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis())
+FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis())
+FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis())
+FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis())
FUNCTION_ANALYSIS("loops", LoopAnalysis())
+FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
+FUNCTION_ANALYSIS("da", DependenceAnalysis())
+FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
+FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis())
+FUNCTION_ANALYSIS("regions", RegionInfoAnalysis())
FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
+FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
FUNCTION_ANALYSIS("targetir",
TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis())
+FUNCTION_ANALYSIS("verify", VerifierAnalysis())
+
+#ifndef FUNCTION_ALIAS_ANALYSIS
+#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ FUNCTION_ANALYSIS(NAME, CREATE_PASS)
+#endif
+FUNCTION_ALIAS_ANALYSIS("basic-aa", BasicAA())
+FUNCTION_ALIAS_ANALYSIS("cfl-anders-aa", CFLAndersAA())
+FUNCTION_ALIAS_ANALYSIS("cfl-steens-aa", CFLSteensAA())
+FUNCTION_ALIAS_ANALYSIS("scev-aa", SCEVAA())
+FUNCTION_ALIAS_ANALYSIS("scoped-noalias-aa", ScopedNoAliasAA())
+FUNCTION_ALIAS_ANALYSIS("type-based-aa", TypeBasedAA())
+#undef FUNCTION_ALIAS_ANALYSIS
#undef FUNCTION_ANALYSIS
#ifndef FUNCTION_PASS
#define FUNCTION_PASS(NAME, CREATE_PASS)
#endif
+FUNCTION_PASS("aa-eval", AAEvaluator())
FUNCTION_PASS("adce", ADCEPass())
+FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
+FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
+FUNCTION_PASS("bdce", BDCEPass())
+FUNCTION_PASS("consthoist", ConstantHoistingPass())
+FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
+FUNCTION_PASS("dce", DCEPass())
+FUNCTION_PASS("dse", DSEPass())
FUNCTION_PASS("early-cse", EarlyCSEPass())
+FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("instcombine", InstCombinePass())
+FUNCTION_PASS("instsimplify", InstSimplifierPass())
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+FUNCTION_PASS("float2int", Float2IntPass())
FUNCTION_PASS("no-op-function", NoOpFunctionPass())
+FUNCTION_PASS("loweratomic", LowerAtomicPass())
FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
+FUNCTION_PASS("guard-widening", GuardWideningPass())
+FUNCTION_PASS("gvn", GVN())
+FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
+FUNCTION_PASS("mem2reg", PromotePass())
+FUNCTION_PASS("memcpyopt", MemCpyOptPass())
+FUNCTION_PASS("mldst-motion", MergedLoadStoreMotionPass())
+FUNCTION_PASS("jump-threading", JumpThreadingPass())
+FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
+FUNCTION_PASS("lcssa", LCSSAPass())
+FUNCTION_PASS("loop-distribute", LoopDistributePass())
+FUNCTION_PASS("loop-vectorize", LoopVectorizePass())
FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
+FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs()))
+FUNCTION_PASS("print<branch-prob>", BranchProbabilityPrinterPass(dbgs()))
FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs()))
+FUNCTION_PASS("print<postdomtree>", PostDominatorTreePrinterPass(dbgs()))
+FUNCTION_PASS("print<demanded-bits>", DemandedBitsPrinterPass(dbgs()))
+FUNCTION_PASS("print<domfrontier>", DominanceFrontierPrinterPass(dbgs()))
FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs()))
+FUNCTION_PASS("print<memoryssa>", MemorySSAPrinterPass(dbgs()))
+FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(dbgs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs()))
+FUNCTION_PASS("reassociate", ReassociatePass())
+FUNCTION_PASS("sccp", SCCPPass())
FUNCTION_PASS("simplify-cfg", SimplifyCFGPass())
+FUNCTION_PASS("sink", SinkingPass())
+FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
FUNCTION_PASS("sroa", SROA())
+FUNCTION_PASS("tailcallelim", TailCallElimPass())
+FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
+FUNCTION_PASS("verify<memoryssa>", MemorySSAVerifierPass())
+FUNCTION_PASS("verify<regions>", RegionInfoVerifierPass())
#undef FUNCTION_PASS
+
+#ifndef LOOP_ANALYSIS
+#define LOOP_ANALYSIS(NAME, CREATE_PASS)
+#endif
+LOOP_ANALYSIS("no-op-loop", NoOpLoopAnalysis())
+LOOP_ANALYSIS("access-info", LoopAccessAnalysis())
+LOOP_ANALYSIS("ivusers", IVUsersAnalysis())
+#undef LOOP_ANALYSIS
+
+#ifndef LOOP_PASS
+#define LOOP_PASS(NAME, CREATE_PASS)
+#endif
+LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+LOOP_PASS("licm", LICMPass())
+LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
+LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
+LOOP_PASS("rotate", LoopRotatePass())
+LOOP_PASS("no-op-loop", NoOpLoopPass())
+LOOP_PASS("print", PrintLoopPass(dbgs()))
+LOOP_PASS("loop-deletion", LoopDeletionPass())
+LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
+LOOP_PASS("indvars", IndVarSimplifyPass())
+LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
+LOOP_PASS("print<ivusers>", IVUsersPrinterPass(dbgs()))
+#undef LOOP_PASS
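// Illustrative client (not part of this patch): PassRegistry.def is consumed
// through an X-macro pattern. A translation unit defines only the categories
// it cares about and then includes the .def; the #ifndef guards above make
// every other category expand to nothing. This hypothetical helper would live
// in a client .cpp (it assumes <vector> and llvm/ADT/StringRef.h):
static std::vector<llvm::StringRef> collectLoopPassNames() {
  std::vector<llvm::StringRef> Names;
#define LOOP_PASS(NAME, CREATE_PASS) Names.push_back(NAME);
#include "PassRegistry.def"
  return Names;
}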
diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt
index 22cca4b44df5..cd65762ae6a0 100644
--- a/lib/ProfileData/CMakeLists.txt
+++ b/lib/ProfileData/CMakeLists.txt
@@ -2,9 +2,7 @@ add_llvm_library(LLVMProfileData
InstrProf.cpp
InstrProfReader.cpp
InstrProfWriter.cpp
- CoverageMapping.cpp
- CoverageMappingWriter.cpp
- CoverageMappingReader.cpp
+ ProfileSummaryBuilder.cpp
SampleProf.cpp
SampleProfReader.cpp
SampleProfWriter.cpp
@@ -15,3 +13,5 @@ add_llvm_library(LLVMProfileData
DEPENDS
intrinsics_gen
)
+
+add_subdirectory(Coverage)
diff --git a/lib/ProfileData/Coverage/CMakeLists.txt b/lib/ProfileData/Coverage/CMakeLists.txt
new file mode 100644
index 000000000000..035b8fdb8b3d
--- /dev/null
+++ b/lib/ProfileData/Coverage/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_library(LLVMCoverage
+ CoverageMapping.cpp
+ CoverageMappingWriter.cpp
+ CoverageMappingReader.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/ProfileData/Coverage
+
+ DEPENDS
+ intrinsics_gen
+ )
diff --git a/lib/ProfileData/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp
index f5d477bd139a..fcd4e24bdfcb 100644
--- a/lib/ProfileData/CoverageMapping.cpp
+++ b/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -12,11 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/ProfileData/Coverage/CoverageMapping.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
@@ -143,28 +143,30 @@ void CounterMappingContext::dump(const Counter &C,
}
if (CounterValues.empty())
return;
- ErrorOr<int64_t> Value = evaluate(C);
- if (!Value)
+ Expected<int64_t> Value = evaluate(C);
+ if (auto E = Value.takeError()) {
+ llvm::consumeError(std::move(E));
return;
+ }
OS << '[' << *Value << ']';
}
-ErrorOr<int64_t> CounterMappingContext::evaluate(const Counter &C) const {
+Expected<int64_t> CounterMappingContext::evaluate(const Counter &C) const {
switch (C.getKind()) {
case Counter::Zero:
return 0;
case Counter::CounterValueReference:
if (C.getCounterID() >= CounterValues.size())
- return make_error_code(errc::argument_out_of_domain);
+ return errorCodeToError(errc::argument_out_of_domain);
return CounterValues[C.getCounterID()];
case Counter::Expression: {
if (C.getExpressionID() >= Expressions.size())
- return make_error_code(errc::argument_out_of_domain);
+ return errorCodeToError(errc::argument_out_of_domain);
const auto &E = Expressions[C.getExpressionID()];
- ErrorOr<int64_t> LHS = evaluate(E.LHS);
+ Expected<int64_t> LHS = evaluate(E.LHS);
if (!LHS)
return LHS;
- ErrorOr<int64_t> RHS = evaluate(E.RHS);
+ Expected<int64_t> RHS = evaluate(E.RHS);
if (!RHS)
return RHS;
return E.Kind == CounterExpression::Subtract ? *LHS - *RHS : *LHS + *RHS;
@@ -181,7 +183,7 @@ void FunctionRecordIterator::skipOtherFiles() {
*this = FunctionRecordIterator();
}
-ErrorOr<std::unique_ptr<CoverageMapping>>
+Expected<std::unique_ptr<CoverageMapping>>
CoverageMapping::load(CoverageMappingReader &CoverageReader,
IndexedInstrProfReader &ProfileReader) {
auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
@@ -191,13 +193,14 @@ CoverageMapping::load(CoverageMappingReader &CoverageReader,
CounterMappingContext Ctx(Record.Expressions);
Counts.clear();
- if (std::error_code EC = ProfileReader.getFunctionCounts(
+ if (Error E = ProfileReader.getFunctionCounts(
Record.FunctionName, Record.FunctionHash, Counts)) {
- if (EC == instrprof_error::hash_mismatch) {
+ instrprof_error IPE = InstrProfError::take(std::move(E));
+ if (IPE == instrprof_error::hash_mismatch) {
Coverage->MismatchedFunctionCount++;
continue;
- } else if (EC != instrprof_error::unknown_function)
- return EC;
+ } else if (IPE != instrprof_error::unknown_function)
+ return make_error<InstrProfError>(IPE);
Counts.assign(Record.MappingRegions.size(), 0);
}
Ctx.setCounts(Counts);
@@ -205,14 +208,18 @@ CoverageMapping::load(CoverageMappingReader &CoverageReader,
assert(!Record.MappingRegions.empty() && "Function has no regions");
StringRef OrigFuncName = Record.FunctionName;
- if (!Record.Filenames.empty())
+ if (Record.Filenames.empty())
+ OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName);
+ else
OrigFuncName =
getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]);
FunctionRecord Function(OrigFuncName, Record.Filenames);
for (const auto &Region : Record.MappingRegions) {
- ErrorOr<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
- if (!ExecutionCount)
+ Expected<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
+ if (auto E = ExecutionCount.takeError()) {
+ llvm::consumeError(std::move(E));
break;
+ }
Function.pushRegion(Region, *ExecutionCount);
}
if (Function.CountedRegions.size() != Record.MappingRegions.size()) {
@@ -226,20 +233,20 @@ CoverageMapping::load(CoverageMappingReader &CoverageReader,
return std::move(Coverage);
}
-ErrorOr<std::unique_ptr<CoverageMapping>>
+Expected<std::unique_ptr<CoverageMapping>>
CoverageMapping::load(StringRef ObjectFilename, StringRef ProfileFilename,
StringRef Arch) {
auto CounterMappingBuff = MemoryBuffer::getFileOrSTDIN(ObjectFilename);
if (std::error_code EC = CounterMappingBuff.getError())
- return EC;
+ return errorCodeToError(EC);
auto CoverageReaderOrErr =
BinaryCoverageReader::create(CounterMappingBuff.get(), Arch);
- if (std::error_code EC = CoverageReaderOrErr.getError())
- return EC;
+ if (Error E = CoverageReaderOrErr.takeError())
+ return std::move(E);
auto CoverageReader = std::move(CoverageReaderOrErr.get());
auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename);
- if (auto EC = ProfileReaderOrErr.getError())
- return EC;
+ if (Error E = ProfileReaderOrErr.takeError())
+ return std::move(E);
auto ProfileReader = std::move(ProfileReaderOrErr.get());
return load(*CoverageReader, *ProfileReader);
}
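// Usage sketch (not part of this patch; file names are placeholders and the
// llvm/Support/Error.h helpers are assumed): how a client consumes the new
// Expected<>-returning load().
static void loadCoverageSketch() {
  auto CoverageOrErr =
      coverage::CoverageMapping::load("a.out", "default.profdata", /*Arch=*/"");
  if (Error E = CoverageOrErr.takeError()) {
    logAllUnhandledErrors(std::move(E), errs(), "coverage load failed: ");
    return;
  }
  std::unique_ptr<coverage::CoverageMapping> Coverage =
      std::move(CoverageOrErr.get());
  // Query results from here, e.g. Coverage->getCoverageForFile("main.cpp").
}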
@@ -270,9 +277,11 @@ public:
};
class SegmentBuilder {
- std::vector<CoverageSegment> Segments;
+ std::vector<CoverageSegment> &Segments;
SmallVector<const CountedRegion *, 8> ActiveRegions;
+ SegmentBuilder(std::vector<CoverageSegment> &Segments) : Segments(Segments) {}
+
/// Start a segment with no count specified.
void startSegment(unsigned Line, unsigned Col) {
DEBUG(dbgs() << "Top level segment at " << Line << ":" << Col << "\n");
@@ -282,20 +291,17 @@ class SegmentBuilder {
/// Start a segment with the given Region's count.
void startSegment(unsigned Line, unsigned Col, bool IsRegionEntry,
const CountedRegion &Region) {
- if (Segments.empty())
- Segments.emplace_back(Line, Col, IsRegionEntry);
- CoverageSegment S = Segments.back();
// Avoid creating empty regions.
- if (S.Line != Line || S.Col != Col) {
- Segments.emplace_back(Line, Col, IsRegionEntry);
- S = Segments.back();
- }
+ if (!Segments.empty() && Segments.back().Line == Line &&
+ Segments.back().Col == Col)
+ Segments.pop_back();
DEBUG(dbgs() << "Segment at " << Line << ":" << Col);
// Set this region's count.
if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion) {
DEBUG(dbgs() << " with count " << Region.ExecutionCount);
- Segments.back().setCount(Region.ExecutionCount);
- }
+ Segments.emplace_back(Line, Col, Region.ExecutionCount, IsRegionEntry);
+ } else
+ Segments.emplace_back(Line, Col, IsRegionEntry);
DEBUG(dbgs() << "\n");
}
@@ -316,29 +322,89 @@ class SegmentBuilder {
startSegment(Line, Col, false, *ActiveRegions.back());
}
-public:
- /// Build a list of CoverageSegments from a sorted list of Regions.
- std::vector<CoverageSegment> buildSegments(ArrayRef<CountedRegion> Regions) {
- const CountedRegion *PrevRegion = nullptr;
+ void buildSegmentsImpl(ArrayRef<CountedRegion> Regions) {
for (const auto &Region : Regions) {
// Pop any regions that end before this one starts.
while (!ActiveRegions.empty() &&
ActiveRegions.back()->endLoc() <= Region.startLoc())
popRegion();
- if (PrevRegion && PrevRegion->startLoc() == Region.startLoc() &&
- PrevRegion->endLoc() == Region.endLoc()) {
- if (Region.Kind == coverage::CounterMappingRegion::CodeRegion)
- Segments.back().addCount(Region.ExecutionCount);
- } else {
- // Add this region to the stack.
- ActiveRegions.push_back(&Region);
- startSegment(Region);
- }
- PrevRegion = &Region;
+ // Add this region to the stack.
+ ActiveRegions.push_back(&Region);
+ startSegment(Region);
}
// Pop any regions that are left in the stack.
while (!ActiveRegions.empty())
popRegion();
+ }
+
+ /// Sort a nested sequence of regions from a single file.
+ static void sortNestedRegions(MutableArrayRef<CountedRegion> Regions) {
+ std::sort(Regions.begin(), Regions.end(), [](const CountedRegion &LHS,
+ const CountedRegion &RHS) {
+ if (LHS.startLoc() != RHS.startLoc())
+ return LHS.startLoc() < RHS.startLoc();
+ if (LHS.endLoc() != RHS.endLoc())
+ // When LHS completely contains RHS, we sort LHS first.
+ return RHS.endLoc() < LHS.endLoc();
+ // If LHS and RHS cover the same area, we need to sort them according
+ // to their kinds so that the most suitable region will become "active"
+ // in combineRegions(). Because we accumulate counter values only from
+ // regions of the same kind as the first region of the area, prefer
+ // CodeRegion to ExpansionRegion and ExpansionRegion to SkippedRegion.
+ static_assert(coverage::CounterMappingRegion::CodeRegion <
+ coverage::CounterMappingRegion::ExpansionRegion &&
+ coverage::CounterMappingRegion::ExpansionRegion <
+ coverage::CounterMappingRegion::SkippedRegion,
+ "Unexpected order of region kind values");
+ return LHS.Kind < RHS.Kind;
+ });
+ }
+
+ /// Combine counts of regions which cover the same area.
+ static ArrayRef<CountedRegion>
+ combineRegions(MutableArrayRef<CountedRegion> Regions) {
+ if (Regions.empty())
+ return Regions;
+ auto Active = Regions.begin();
+ auto End = Regions.end();
+ for (auto I = Regions.begin() + 1; I != End; ++I) {
+ if (Active->startLoc() != I->startLoc() ||
+ Active->endLoc() != I->endLoc()) {
+ // Shift to the next region.
+ ++Active;
+ if (Active != I)
+ *Active = *I;
+ continue;
+ }
+ // Merge duplicate region.
+ // If CodeRegions and ExpansionRegions cover the same area, it's probably
+ // a macro which is fully expanded to another macro. In that case, we need
+ // to accumulate counts only from CodeRegions, or else the area will be
+ // counted twice.
+ // On the other hand, a macro may have a nested macro in its body. If the
+ // outer macro is used several times, the ExpansionRegion for the nested
+ // macro will also be added several times. These ExpansionRegions cover
+ // the same source locations and have to be combined to reach the correct
+ // value for that area.
+ // We add counts of the regions of the same kind as the active region
+      // to handle both situations.
+ if (I->Kind == Active->Kind)
+ Active->ExecutionCount += I->ExecutionCount;
+ }
+ return Regions.drop_back(std::distance(++Active, End));
+ }
+
+public:
+ /// Build a list of CoverageSegments from a list of Regions.
+ static std::vector<CoverageSegment>
+ buildSegments(MutableArrayRef<CountedRegion> Regions) {
+ std::vector<CoverageSegment> Segments;
+ SegmentBuilder Builder(Segments);
+
+ sortNestedRegions(Regions);
+ ArrayRef<CountedRegion> CombinedRegions = combineRegions(Regions);
+
+ Builder.buildSegmentsImpl(CombinedRegions);
return Segments;
}
};
@@ -364,21 +430,7 @@ static SmallBitVector gatherFileIDs(StringRef SourceFile,
return FilenameEquivalence;
}
-static Optional<unsigned> findMainViewFileID(StringRef SourceFile,
- const FunctionRecord &Function) {
- SmallBitVector IsNotExpandedFile(Function.Filenames.size(), true);
- SmallBitVector FilenameEquivalence = gatherFileIDs(SourceFile, Function);
- for (const auto &CR : Function.CountedRegions)
- if (CR.Kind == CounterMappingRegion::ExpansionRegion &&
- FilenameEquivalence[CR.FileID])
- IsNotExpandedFile[CR.ExpandedFileID] = false;
- IsNotExpandedFile &= FilenameEquivalence;
- int I = IsNotExpandedFile.find_first();
- if (I == -1)
- return None;
- return I;
-}
-
+/// Return the ID of the file where the definition of the function is located.
static Optional<unsigned> findMainViewFileID(const FunctionRecord &Function) {
SmallBitVector IsNotExpandedFile(Function.Filenames.size(), true);
for (const auto &CR : Function.CountedRegions)
@@ -390,47 +442,43 @@ static Optional<unsigned> findMainViewFileID(const FunctionRecord &Function) {
return I;
}
-/// Sort a nested sequence of regions from a single file.
-template <class It> static void sortNestedRegions(It First, It Last) {
- std::sort(First, Last,
- [](const CountedRegion &LHS, const CountedRegion &RHS) {
- if (LHS.startLoc() == RHS.startLoc())
- // When LHS completely contains RHS, we sort LHS first.
- return RHS.endLoc() < LHS.endLoc();
- return LHS.startLoc() < RHS.startLoc();
- });
+/// Check if SourceFile is the file that contains the definition of
+/// the Function. Return the ID of the file in that case or None otherwise.
+static Optional<unsigned> findMainViewFileID(StringRef SourceFile,
+ const FunctionRecord &Function) {
+ Optional<unsigned> I = findMainViewFileID(Function);
+ if (I && SourceFile == Function.Filenames[*I])
+ return I;
+ return None;
}
static bool isExpansion(const CountedRegion &R, unsigned FileID) {
return R.Kind == CounterMappingRegion::ExpansionRegion && R.FileID == FileID;
}
-CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) {
+CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
CoverageData FileCoverage(Filename);
std::vector<coverage::CountedRegion> Regions;
for (const auto &Function : Functions) {
auto MainFileID = findMainViewFileID(Filename, Function);
- if (!MainFileID)
- continue;
auto FileIDs = gatherFileIDs(Filename, Function);
for (const auto &CR : Function.CountedRegions)
if (FileIDs.test(CR.FileID)) {
Regions.push_back(CR);
- if (isExpansion(CR, *MainFileID))
+ if (MainFileID && isExpansion(CR, *MainFileID))
FileCoverage.Expansions.emplace_back(CR, Function);
}
}
- sortNestedRegions(Regions.begin(), Regions.end());
DEBUG(dbgs() << "Emitting segments for file: " << Filename << "\n");
- FileCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+ FileCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return FileCoverage;
}
std::vector<const FunctionRecord *>
-CoverageMapping::getInstantiations(StringRef Filename) {
+CoverageMapping::getInstantiations(StringRef Filename) const {
FunctionInstantiationSetCollector InstantiationSetCollector;
for (const auto &Function : Functions) {
auto MainFileID = findMainViewFileID(Filename, Function);
@@ -450,7 +498,7 @@ CoverageMapping::getInstantiations(StringRef Filename) {
}
CoverageData
-CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) {
+CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
auto MainFileID = findMainViewFileID(Function);
if (!MainFileID)
return CoverageData();
@@ -464,15 +512,14 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) {
FunctionCoverage.Expansions.emplace_back(CR, Function);
}
- sortNestedRegions(Regions.begin(), Regions.end());
DEBUG(dbgs() << "Emitting segments for function: " << Function.Name << "\n");
- FunctionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+ FunctionCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return FunctionCoverage;
}
-CoverageData
-CoverageMapping::getCoverageForExpansion(const ExpansionRecord &Expansion) {
+CoverageData CoverageMapping::getCoverageForExpansion(
+ const ExpansionRecord &Expansion) const {
CoverageData ExpansionCoverage(
Expansion.Function.Filenames[Expansion.FileID]);
std::vector<coverage::CountedRegion> Regions;
@@ -483,36 +530,45 @@ CoverageMapping::getCoverageForExpansion(const ExpansionRecord &Expansion) {
ExpansionCoverage.Expansions.emplace_back(CR, Expansion.Function);
}
- sortNestedRegions(Regions.begin(), Regions.end());
DEBUG(dbgs() << "Emitting segments for expansion of file " << Expansion.FileID
<< "\n");
- ExpansionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+ ExpansionCoverage.Segments = SegmentBuilder::buildSegments(Regions);
return ExpansionCoverage;
}
namespace {
+std::string getCoverageMapErrString(coveragemap_error Err) {
+ switch (Err) {
+ case coveragemap_error::success:
+ return "Success";
+ case coveragemap_error::eof:
+ return "End of File";
+ case coveragemap_error::no_data_found:
+ return "No coverage data found";
+ case coveragemap_error::unsupported_version:
+ return "Unsupported coverage format version";
+ case coveragemap_error::truncated:
+ return "Truncated coverage data";
+ case coveragemap_error::malformed:
+ return "Malformed coverage data";
+ }
+ llvm_unreachable("A value of coveragemap_error has no message.");
+}
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class CoverageMappingErrorCategoryType : public std::error_category {
const char *name() const LLVM_NOEXCEPT override { return "llvm.coveragemap"; }
std::string message(int IE) const override {
- auto E = static_cast<coveragemap_error>(IE);
- switch (E) {
- case coveragemap_error::success:
- return "Success";
- case coveragemap_error::eof:
- return "End of File";
- case coveragemap_error::no_data_found:
- return "No coverage data found";
- case coveragemap_error::unsupported_version:
- return "Unsupported coverage format version";
- case coveragemap_error::truncated:
- return "Truncated coverage data";
- case coveragemap_error::malformed:
- return "Malformed coverage data";
- }
- llvm_unreachable("A value of coveragemap_error has no message.");
+ return getCoverageMapErrString(static_cast<coveragemap_error>(IE));
}
};
+} // end anonymous namespace
+
+std::string CoverageMapError::message() const {
+ return getCoverageMapErrString(Err);
}
static ManagedStatic<CoverageMappingErrorCategoryType> ErrorCategory;
@@ -520,3 +576,5 @@ static ManagedStatic<CoverageMappingErrorCategoryType> ErrorCategory;
const std::error_category &llvm::coverage::coveragemap_category() {
return *ErrorCategory;
}
+
+char CoverageMapError::ID = 0;
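// Illustrative sketch (not part of this patch): the out-of-line ID above is
// what lets CoverageMapError participate in llvm::Error's dynamic typing, so
// callers can match on it explicitly.
static void reportCoverageErrorSketch() {
  Error E = make_error<CoverageMapError>(coveragemap_error::truncated);
  handleAllErrors(std::move(E), [](const CoverageMapError &CME) {
    errs() << "coverage error: " << CME.message() << "\n";
  });
}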
diff --git a/lib/ProfileData/CoverageMappingReader.cpp b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 89e1cf42c577..1a4b4f590841 100644
--- a/lib/ProfileData/CoverageMappingReader.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/CoverageMappingReader.h"
-#include "llvm/ADT/DenseSet.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
@@ -31,49 +31,54 @@ using namespace object;
void CoverageMappingIterator::increment() {
// Check if all the records were read or if an error occurred while reading
// the next record.
- if (Reader->readNextRecord(Record))
- *this = CoverageMappingIterator();
+ if (auto E = Reader->readNextRecord(Record)) {
+ handleAllErrors(std::move(E), [&](const CoverageMapError &CME) {
+ if (CME.get() == coveragemap_error::eof)
+ *this = CoverageMappingIterator();
+ else
+ llvm_unreachable("Unexpected error in coverage mapping iterator");
+ });
+ }
}
-std::error_code RawCoverageReader::readULEB128(uint64_t &Result) {
+Error RawCoverageReader::readULEB128(uint64_t &Result) {
if (Data.size() < 1)
- return coveragemap_error::truncated;
+ return make_error<CoverageMapError>(coveragemap_error::truncated);
unsigned N = 0;
Result = decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N);
if (N > Data.size())
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
Data = Data.substr(N);
- return std::error_code();
+ return Error::success();
}
-std::error_code RawCoverageReader::readIntMax(uint64_t &Result,
- uint64_t MaxPlus1) {
+Error RawCoverageReader::readIntMax(uint64_t &Result, uint64_t MaxPlus1) {
if (auto Err = readULEB128(Result))
return Err;
if (Result >= MaxPlus1)
- return coveragemap_error::malformed;
- return std::error_code();
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ return Error::success();
}
-std::error_code RawCoverageReader::readSize(uint64_t &Result) {
+Error RawCoverageReader::readSize(uint64_t &Result) {
if (auto Err = readULEB128(Result))
return Err;
// Sanity check the number.
if (Result > Data.size())
- return coveragemap_error::malformed;
- return std::error_code();
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ return Error::success();
}
-std::error_code RawCoverageReader::readString(StringRef &Result) {
+Error RawCoverageReader::readString(StringRef &Result) {
uint64_t Length;
if (auto Err = readSize(Length))
return Err;
Result = Data.substr(0, Length);
Data = Data.substr(Length);
- return std::error_code();
+ return Error::success();
}
-std::error_code RawCoverageFilenamesReader::read() {
+Error RawCoverageFilenamesReader::read() {
uint64_t NumFilenames;
if (auto Err = readSize(NumFilenames))
return Err;
@@ -83,19 +88,18 @@ std::error_code RawCoverageFilenamesReader::read() {
return Err;
Filenames.push_back(Filename);
}
- return std::error_code();
+ return Error::success();
}
-std::error_code RawCoverageMappingReader::decodeCounter(unsigned Value,
- Counter &C) {
+Error RawCoverageMappingReader::decodeCounter(unsigned Value, Counter &C) {
auto Tag = Value & Counter::EncodingTagMask;
switch (Tag) {
case Counter::Zero:
C = Counter::getZero();
- return std::error_code();
+ return Error::success();
case Counter::CounterValueReference:
C = Counter::getCounter(Value >> Counter::EncodingTagBits);
- return std::error_code();
+ return Error::success();
default:
break;
}
@@ -105,25 +109,25 @@ std::error_code RawCoverageMappingReader::decodeCounter(unsigned Value,
case CounterExpression::Add: {
auto ID = Value >> Counter::EncodingTagBits;
if (ID >= Expressions.size())
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
Expressions[ID].Kind = CounterExpression::ExprKind(Tag);
C = Counter::getExpression(ID);
break;
}
default:
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
}
- return std::error_code();
+ return Error::success();
}
-std::error_code RawCoverageMappingReader::readCounter(Counter &C) {
+Error RawCoverageMappingReader::readCounter(Counter &C) {
uint64_t EncodedCounter;
if (auto Err =
readIntMax(EncodedCounter, std::numeric_limits<unsigned>::max()))
return Err;
if (auto Err = decodeCounter(EncodedCounter, C))
return Err;
- return std::error_code();
+ return Error::success();
}
static const unsigned EncodingExpansionRegionBit = 1
@@ -132,7 +136,7 @@ static const unsigned EncodingExpansionRegionBit = 1
/// \brief Read the sub-array of regions for the given inferred file id.
/// \param NumFileIDs the number of file ids that are defined for this
/// function.
-std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
+Error RawCoverageMappingReader::readMappingRegionsSubArray(
std::vector<CounterMappingRegion> &MappingRegions, unsigned InferredFileID,
size_t NumFileIDs) {
uint64_t NumRegions;
@@ -160,7 +164,7 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
ExpandedFileID = EncodedCounterAndRegion >>
Counter::EncodingCounterTagAndExpansionRegionTagBits;
if (ExpandedFileID >= NumFileIDs)
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
} else {
switch (EncodedCounterAndRegion >>
Counter::EncodingCounterTagAndExpansionRegionTagBits) {
@@ -171,7 +175,7 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
Kind = CounterMappingRegion::SkippedRegion;
break;
default:
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
}
}
}
@@ -184,7 +188,7 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
if (auto Err = readULEB128(ColumnStart))
return Err;
if (ColumnStart > std::numeric_limits<unsigned>::max())
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
if (auto Err = readIntMax(NumLines, std::numeric_limits<unsigned>::max()))
return Err;
if (auto Err = readIntMax(ColumnEnd, std::numeric_limits<unsigned>::max()))
@@ -218,10 +222,10 @@ std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
C, InferredFileID, ExpandedFileID, LineStart, ColumnStart,
LineStart + NumLines, ColumnEnd, Kind));
}
- return std::error_code();
+ return Error::success();
}
-std::error_code RawCoverageMappingReader::read() {
+Error RawCoverageMappingReader::read() {
// Read the virtual file mapping.
llvm::SmallVector<unsigned, 8> VirtualFileMapping;
@@ -287,14 +291,44 @@ std::error_code RawCoverageMappingReader::read() {
}
}
- return std::error_code();
+ return Error::success();
}
-std::error_code InstrProfSymtab::create(SectionRef &Section) {
- if (auto Err = Section.getContents(Data))
- return Err;
+Expected<bool> RawCoverageMappingDummyChecker::isDummy() {
+ // A dummy coverage mapping data consists of just one region with zero count.
+ uint64_t NumFileMappings;
+ if (Error Err = readSize(NumFileMappings))
+ return std::move(Err);
+ if (NumFileMappings != 1)
+ return false;
+ // We don't expect any specific value for the filename index, just skip it.
+ uint64_t FilenameIndex;
+ if (Error Err =
+ readIntMax(FilenameIndex, std::numeric_limits<unsigned>::max()))
+ return std::move(Err);
+ uint64_t NumExpressions;
+ if (Error Err = readSize(NumExpressions))
+ return std::move(Err);
+ if (NumExpressions != 0)
+ return false;
+ uint64_t NumRegions;
+ if (Error Err = readSize(NumRegions))
+ return std::move(Err);
+ if (NumRegions != 1)
+ return false;
+ uint64_t EncodedCounterAndRegion;
+ if (Error Err = readIntMax(EncodedCounterAndRegion,
+ std::numeric_limits<unsigned>::max()))
+ return std::move(Err);
+ unsigned Tag = EncodedCounterAndRegion & Counter::EncodingTagMask;
+ return Tag == Counter::Zero;
+}
+
+Error InstrProfSymtab::create(SectionRef &Section) {
+ if (auto EC = Section.getContents(Data))
+ return errorCodeToError(EC);
Address = Section.getAddress();
- return std::error_code();
+ return Error::success();
}
StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) {
@@ -306,40 +340,124 @@ StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) {
return Data.substr(Pointer - Address, Size);
}
-template <typename T, support::endianness Endian>
-static std::error_code readCoverageMappingData(
- InstrProfSymtab &ProfileNames, StringRef Data,
- std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
- std::vector<StringRef> &Filenames) {
- using namespace support;
- llvm::DenseSet<T> UniqueFunctionMappingData;
+// Check if the mapping data is a dummy, i.e. is emitted for an unused function.
+static Expected<bool> isCoverageMappingDummy(uint64_t Hash, StringRef Mapping) {
+ // The hash value of dummy mapping records is always zero.
+ if (Hash)
+ return false;
+ return RawCoverageMappingDummyChecker(Mapping).isDummy();
+}
- // Read the records in the coverage data section.
- for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) {
+namespace {
+struct CovMapFuncRecordReader {
+ // The interface to read coverage mapping function records for a module.
+ //
+ // \p Buf points to the buffer containing the \c CovHeader of the coverage
+ // mapping data associated with the module.
+ //
+ // Returns a pointer to the next \c CovHeader if it exists, or a pointer
+ // greater than \p End if not.
+ virtual Expected<const char *> readFunctionRecords(const char *Buf,
+ const char *End) = 0;
+ virtual ~CovMapFuncRecordReader() {}
+ template <class IntPtrT, support::endianness Endian>
+ static Expected<std::unique_ptr<CovMapFuncRecordReader>>
+ get(coverage::CovMapVersion Version, InstrProfSymtab &P,
+ std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
+ std::vector<StringRef> &F);
+};
+
+// A class for reading coverage mapping function records for a module.
+template <coverage::CovMapVersion Version, class IntPtrT,
+ support::endianness Endian>
+class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
+ typedef typename coverage::CovMapTraits<
+ Version, IntPtrT>::CovMapFuncRecordType FuncRecordType;
+ typedef typename coverage::CovMapTraits<Version, IntPtrT>::NameRefType
+ NameRefType;
+
+ // Maps function's name references to the indexes of their records
+ // in \c Records.
+ llvm::DenseMap<NameRefType, size_t> FunctionRecords;
+ InstrProfSymtab &ProfileNames;
+ std::vector<StringRef> &Filenames;
+ std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records;
+
+ // Add the record to the collection if we don't already have a record that
+ // points to the same function name. This is useful to ignore the redundant
+ // records for the functions with ODR linkage.
+ // In addition, prefer records with real coverage mapping data to dummy
+ // records, which were emitted for inline functions which were seen but
+ // not used in the corresponding translation unit.
+ Error insertFunctionRecordIfNeeded(const FuncRecordType *CFR,
+ StringRef Mapping, size_t FilenamesBegin) {
+ uint64_t FuncHash = CFR->template getFuncHash<Endian>();
+ NameRefType NameRef = CFR->template getFuncNameRef<Endian>();
+ auto InsertResult =
+ FunctionRecords.insert(std::make_pair(NameRef, Records.size()));
+ if (InsertResult.second) {
+ StringRef FuncName;
+ if (Error Err = CFR->template getFuncName<Endian>(ProfileNames, FuncName))
+ return Err;
+ Records.emplace_back(Version, FuncName, FuncHash, Mapping, FilenamesBegin,
+ Filenames.size() - FilenamesBegin);
+ return Error::success();
+ }
+ // Update the existing record if it's a dummy and the new record is real.
+ size_t OldRecordIndex = InsertResult.first->second;
+ BinaryCoverageReader::ProfileMappingRecord &OldRecord =
+ Records[OldRecordIndex];
+ Expected<bool> OldIsDummyExpected = isCoverageMappingDummy(
+ OldRecord.FunctionHash, OldRecord.CoverageMapping);
+ if (Error Err = OldIsDummyExpected.takeError())
+ return Err;
+ if (!*OldIsDummyExpected)
+ return Error::success();
+ Expected<bool> NewIsDummyExpected =
+ isCoverageMappingDummy(FuncHash, Mapping);
+ if (Error Err = NewIsDummyExpected.takeError())
+ return Err;
+ if (*NewIsDummyExpected)
+ return Error::success();
+ OldRecord.FunctionHash = FuncHash;
+ OldRecord.CoverageMapping = Mapping;
+ OldRecord.FilenamesBegin = FilenamesBegin;
+ OldRecord.FilenamesSize = Filenames.size() - FilenamesBegin;
+ return Error::success();
+ }
+
+public:
+ VersionedCovMapFuncRecordReader(
+ InstrProfSymtab &P,
+ std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
+ std::vector<StringRef> &F)
+ : ProfileNames(P), Filenames(F), Records(R) {}
+ ~VersionedCovMapFuncRecordReader() override {}
+
+ Expected<const char *> readFunctionRecords(const char *Buf,
+ const char *End) override {
+ using namespace support;
if (Buf + sizeof(CovMapHeader) > End)
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
uint32_t NRecords = CovHeader->getNRecords<Endian>();
uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>();
uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>();
- uint32_t Version = CovHeader->getVersion<Endian>();
- Buf = reinterpret_cast<const char *>(++CovHeader);
-
- if (Version > coverage::CoverageMappingCurrentVersion)
- return coveragemap_error::unsupported_version;
+ assert((CovMapVersion)CovHeader->getVersion<Endian>() == Version);
+ Buf = reinterpret_cast<const char *>(CovHeader + 1);
// Skip past the function records, saving the start and end for later.
const char *FunBuf = Buf;
- Buf += NRecords * sizeof(coverage::CovMapFunctionRecord<T>);
+ Buf += NRecords * sizeof(FuncRecordType);
const char *FunEnd = Buf;
// Get the filenames.
if (Buf + FilenamesSize > End)
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
size_t FilenamesBegin = Filenames.size();
RawCoverageFilenamesReader Reader(StringRef(Buf, FilenamesSize), Filenames);
if (auto Err = Reader.read())
- return Err;
+ return std::move(Err);
Buf += FilenamesSize;
// We'll read the coverage mapping records in the loop below.
@@ -348,115 +466,156 @@ static std::error_code readCoverageMappingData(
const char *CovEnd = Buf;
if (Buf > End)
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
// Each coverage map has an alignment of 8, so we need to adjust alignment
// before reading the next map.
Buf += alignmentAdjustment(Buf, 8);
- auto CFR =
- reinterpret_cast<const coverage::CovMapFunctionRecord<T> *>(FunBuf);
+ auto CFR = reinterpret_cast<const FuncRecordType *>(FunBuf);
while ((const char *)CFR < FunEnd) {
// Read the function information
uint32_t DataSize = CFR->template getDataSize<Endian>();
- uint64_t FuncHash = CFR->template getFuncHash<Endian>();
// Now use that to read the coverage data.
if (CovBuf + DataSize > CovEnd)
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
auto Mapping = StringRef(CovBuf, DataSize);
CovBuf += DataSize;
- // Ignore this record if we already have a record that points to the same
- // function name. This is useful to ignore the redundant records for the
- // functions with ODR linkage.
- T NameRef = CFR->template getFuncNameRef<Endian>();
- if (!UniqueFunctionMappingData.insert(NameRef).second)
- continue;
-
- StringRef FuncName;
- if (std::error_code EC =
- CFR->template getFuncName<Endian>(ProfileNames, FuncName))
- return EC;
- Records.push_back(BinaryCoverageReader::ProfileMappingRecord(
- CoverageMappingVersion(Version), FuncName, FuncHash, Mapping,
- FilenamesBegin, Filenames.size() - FilenamesBegin));
+ if (Error Err =
+ insertFunctionRecordIfNeeded(CFR, Mapping, FilenamesBegin))
+ return std::move(Err);
CFR++;
}
+ return Buf;
}
-
- return std::error_code();
+};
+} // end anonymous namespace
+
+template <class IntPtrT, support::endianness Endian>
+Expected<std::unique_ptr<CovMapFuncRecordReader>> CovMapFuncRecordReader::get(
+ coverage::CovMapVersion Version, InstrProfSymtab &P,
+ std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
+ std::vector<StringRef> &F) {
+ using namespace coverage;
+ switch (Version) {
+ case CovMapVersion::Version1:
+ return llvm::make_unique<VersionedCovMapFuncRecordReader<
+ CovMapVersion::Version1, IntPtrT, Endian>>(P, R, F);
+ case CovMapVersion::Version2:
+ // Decompress the name data.
+ if (Error E = P.create(P.getNameData()))
+ return std::move(E);
+ return llvm::make_unique<VersionedCovMapFuncRecordReader<
+ CovMapVersion::Version2, IntPtrT, Endian>>(P, R, F);
+ }
+ llvm_unreachable("Unsupported version");
}
+template <typename T, support::endianness Endian>
+static Error readCoverageMappingData(
+ InstrProfSymtab &ProfileNames, StringRef Data,
+ std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
+ std::vector<StringRef> &Filenames) {
+ using namespace coverage;
+ // Read the records in the coverage data section.
+ auto CovHeader =
+ reinterpret_cast<const coverage::CovMapHeader *>(Data.data());
+ CovMapVersion Version = (CovMapVersion)CovHeader->getVersion<Endian>();
+ if (Version > coverage::CovMapVersion::CurrentVersion)
+ return make_error<CoverageMapError>(coveragemap_error::unsupported_version);
+ Expected<std::unique_ptr<CovMapFuncRecordReader>> ReaderExpected =
+ CovMapFuncRecordReader::get<T, Endian>(Version, ProfileNames, Records,
+ Filenames);
+ if (Error E = ReaderExpected.takeError())
+ return E;
+ auto Reader = std::move(ReaderExpected.get());
+ for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) {
+ auto NextHeaderOrErr = Reader->readFunctionRecords(Buf, End);
+ if (auto E = NextHeaderOrErr.takeError())
+ return E;
+ Buf = NextHeaderOrErr.get();
+ }
+ return Error::success();
+}
static const char *TestingFormatMagic = "llvmcovmtestdata";
-static std::error_code loadTestingFormat(StringRef Data,
- InstrProfSymtab &ProfileNames,
- StringRef &CoverageMapping,
- uint8_t &BytesInAddress,
- support::endianness &Endian) {
+static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
+ StringRef &CoverageMapping,
+ uint8_t &BytesInAddress,
+ support::endianness &Endian) {
BytesInAddress = 8;
Endian = support::endianness::little;
Data = Data.substr(StringRef(TestingFormatMagic).size());
if (Data.size() < 1)
- return coveragemap_error::truncated;
+ return make_error<CoverageMapError>(coveragemap_error::truncated);
unsigned N = 0;
auto ProfileNamesSize =
decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N);
if (N > Data.size())
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
Data = Data.substr(N);
if (Data.size() < 1)
- return coveragemap_error::truncated;
+ return make_error<CoverageMapError>(coveragemap_error::truncated);
N = 0;
uint64_t Address =
decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N);
if (N > Data.size())
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
Data = Data.substr(N);
if (Data.size() < ProfileNamesSize)
- return coveragemap_error::malformed;
- ProfileNames.create(Data.substr(0, ProfileNamesSize), Address);
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ if (Error E = ProfileNames.create(Data.substr(0, ProfileNamesSize), Address))
+ return E;
CoverageMapping = Data.substr(ProfileNamesSize);
- return std::error_code();
+ // Skip the padding bytes because coverage map data has an alignment of 8.
+ if (CoverageMapping.size() < 1)
+ return make_error<CoverageMapError>(coveragemap_error::truncated);
+ size_t Pad = alignmentAdjustment(CoverageMapping.data(), 8);
+ if (CoverageMapping.size() < Pad)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ CoverageMapping = CoverageMapping.substr(Pad);
+ return Error::success();
}
-static ErrorOr<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) {
+static Expected<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) {
StringRef FoundName;
for (const auto &Section : OF.sections()) {
if (auto EC = Section.getName(FoundName))
- return EC;
+ return errorCodeToError(EC);
if (FoundName == Name)
return Section;
}
- return coveragemap_error::no_data_found;
+ return make_error<CoverageMapError>(coveragemap_error::no_data_found);
}
-static std::error_code
-loadBinaryFormat(MemoryBufferRef ObjectBuffer, InstrProfSymtab &ProfileNames,
- StringRef &CoverageMapping, uint8_t &BytesInAddress,
- support::endianness &Endian, StringRef Arch) {
+static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
+ InstrProfSymtab &ProfileNames,
+ StringRef &CoverageMapping,
+ uint8_t &BytesInAddress,
+ support::endianness &Endian, StringRef Arch) {
auto BinOrErr = object::createBinary(ObjectBuffer);
- if (std::error_code EC = BinOrErr.getError())
- return EC;
+ if (!BinOrErr)
+ return BinOrErr.takeError();
auto Bin = std::move(BinOrErr.get());
std::unique_ptr<ObjectFile> OF;
if (auto *Universal = dyn_cast<object::MachOUniversalBinary>(Bin.get())) {
// If we have a universal binary, try to look up the object for the
// appropriate architecture.
auto ObjectFileOrErr = Universal->getObjectForArch(Arch);
- if (std::error_code EC = ObjectFileOrErr.getError())
- return EC;
+ if (!ObjectFileOrErr)
+ return ObjectFileOrErr.takeError();
OF = std::move(ObjectFileOrErr.get());
} else if (isa<object::ObjectFile>(Bin.get())) {
// For any other object file, upcast and take ownership.
OF.reset(cast<object::ObjectFile>(Bin.release()));
// If we've asked for a particular arch, make sure they match.
if (!Arch.empty() && OF->getArch() != Triple(Arch).getArch())
- return object_error::arch_not_found;
+ return errorCodeToError(object_error::arch_not_found);
} else
// We can only handle object files.
- return coveragemap_error::malformed;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
// The coverage uses native pointer sizes for the object it's written in.
BytesInAddress = OF->getBytesInAddress();
@@ -465,65 +624,68 @@ loadBinaryFormat(MemoryBufferRef ObjectBuffer, InstrProfSymtab &ProfileNames,
// Look for the sections that we are interested in.
auto NamesSection = lookupSection(*OF, getInstrProfNameSectionName(false));
- if (auto EC = NamesSection.getError())
- return EC;
+ if (auto E = NamesSection.takeError())
+ return E;
auto CoverageSection =
lookupSection(*OF, getInstrProfCoverageSectionName(false));
- if (auto EC = CoverageSection.getError())
- return EC;
+ if (auto E = CoverageSection.takeError())
+ return E;
// Get the contents of the given sections.
- if (std::error_code EC = CoverageSection->getContents(CoverageMapping))
- return EC;
- if (std::error_code EC = ProfileNames.create(*NamesSection))
- return EC;
+ if (auto EC = CoverageSection->getContents(CoverageMapping))
+ return errorCodeToError(EC);
+ if (Error E = ProfileNames.create(*NamesSection))
+ return E;
- return std::error_code();
+ return Error::success();
}
-ErrorOr<std::unique_ptr<BinaryCoverageReader>>
+Expected<std::unique_ptr<BinaryCoverageReader>>
BinaryCoverageReader::create(std::unique_ptr<MemoryBuffer> &ObjectBuffer,
StringRef Arch) {
std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader());
- InstrProfSymtab ProfileNames;
StringRef Coverage;
uint8_t BytesInAddress;
support::endianness Endian;
- std::error_code EC;
+ Error E;
+ consumeError(std::move(E));
if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic))
// This is a special format used for testing.
- EC = loadTestingFormat(ObjectBuffer->getBuffer(), ProfileNames, Coverage,
- BytesInAddress, Endian);
+ E = loadTestingFormat(ObjectBuffer->getBuffer(), Reader->ProfileNames,
+ Coverage, BytesInAddress, Endian);
else
- EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), ProfileNames,
- Coverage, BytesInAddress, Endian, Arch);
- if (EC)
- return EC;
+ E = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), Reader->ProfileNames,
+ Coverage, BytesInAddress, Endian, Arch);
+ if (E)
+ return std::move(E);
if (BytesInAddress == 4 && Endian == support::endianness::little)
- EC = readCoverageMappingData<uint32_t, support::endianness::little>(
- ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames);
+ E = readCoverageMappingData<uint32_t, support::endianness::little>(
+ Reader->ProfileNames, Coverage, Reader->MappingRecords,
+ Reader->Filenames);
else if (BytesInAddress == 4 && Endian == support::endianness::big)
- EC = readCoverageMappingData<uint32_t, support::endianness::big>(
- ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames);
+ E = readCoverageMappingData<uint32_t, support::endianness::big>(
+ Reader->ProfileNames, Coverage, Reader->MappingRecords,
+ Reader->Filenames);
else if (BytesInAddress == 8 && Endian == support::endianness::little)
- EC = readCoverageMappingData<uint64_t, support::endianness::little>(
- ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames);
+ E = readCoverageMappingData<uint64_t, support::endianness::little>(
+ Reader->ProfileNames, Coverage, Reader->MappingRecords,
+ Reader->Filenames);
else if (BytesInAddress == 8 && Endian == support::endianness::big)
- EC = readCoverageMappingData<uint64_t, support::endianness::big>(
- ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames);
+ E = readCoverageMappingData<uint64_t, support::endianness::big>(
+ Reader->ProfileNames, Coverage, Reader->MappingRecords,
+ Reader->Filenames);
else
- return coveragemap_error::malformed;
- if (EC)
- return EC;
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ if (E)
+ return std::move(E);
return std::move(Reader);
}
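A hedged usage sketch for the Expected-returning factory above; the record-counting loop and the empty Arch string are illustrative, not part of the patch:

// Sketch only: counts the coverage mapping records in an object buffer.
static void countRecords(std::unique_ptr<llvm::MemoryBuffer> Buffer) {
  auto ReaderOrErr = llvm::coverage::BinaryCoverageReader::create(Buffer, "");
  if (!ReaderOrErr) {
    llvm::errs() << llvm::toString(ReaderOrErr.takeError()) << "\n";
    return;
  }
  auto Reader = std::move(*ReaderOrErr);
  llvm::coverage::CoverageMappingRecord Record;
  unsigned N = 0;
  while (true) {
    if (llvm::Error E = Reader->readNextRecord(Record)) {
      llvm::consumeError(std::move(E));   // normally coveragemap_error::eof
      break;
    }
    ++N;
  }
  llvm::errs() << N << " coverage mapping records\n";
}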
-std::error_code
-BinaryCoverageReader::readNextRecord(CoverageMappingRecord &Record) {
+Error BinaryCoverageReader::readNextRecord(CoverageMappingRecord &Record) {
if (CurrentRecord >= MappingRecords.size())
- return coveragemap_error::eof;
+ return make_error<CoverageMapError>(coveragemap_error::eof);
FunctionsFilenames.clear();
Expressions.clear();
@@ -543,5 +705,5 @@ BinaryCoverageReader::readNextRecord(CoverageMappingRecord &Record) {
Record.MappingRegions = MappingRegions;
++CurrentRecord;
- return std::error_code();
+ return Error::success();
}
diff --git a/lib/ProfileData/CoverageMappingWriter.cpp b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index d90d2f565155..8ff90d62cfdc 100644
--- a/lib/ProfileData/CoverageMappingWriter.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ProfileData/CoverageMappingWriter.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
#include "llvm/Support/LEB128.h"
using namespace llvm;
diff --git a/lib/Target/CppBackend/LLVMBuild.txt b/lib/ProfileData/Coverage/LLVMBuild.txt
index 122b5e7502fc..fc8284b0ef38 100644
--- a/lib/Target/CppBackend/LLVMBuild.txt
+++ b/lib/ProfileData/Coverage/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/CppBackend/LLVMBuild.txt --------------------*- Conf -*--===;
+;===- ./lib/ProfileData/Coverage/LLVMBuild.txt -----------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -15,17 +15,9 @@
;
;===------------------------------------------------------------------------===;
-[common]
-subdirectories = TargetInfo
-
[component_0]
-type = TargetGroup
-name = CppBackend
-parent = Target
-
-[component_1]
type = Library
-name = CppBackendCodeGen
-parent = CppBackend
-required_libraries = Core CppBackendInfo Support Target
-add_to_library_groups = CppBackend
+name = Coverage
+parent = ProfileData
+required_libraries = Core Object ProfileData Support
+
diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index d6777639abe7..6962f82a5ef5 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -17,55 +17,72 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Path.h"
using namespace llvm;
+static cl::opt<bool> StaticFuncFullModulePrefix(
+ "static-func-full-module-prefix", cl::init(false),
+ cl::desc("Use full module build paths in the profile counter names for "
+ "static functions."));
+
namespace {
+std::string getInstrProfErrString(instrprof_error Err) {
+ switch (Err) {
+ case instrprof_error::success:
+ return "Success";
+ case instrprof_error::eof:
+ return "End of File";
+ case instrprof_error::unrecognized_format:
+ return "Unrecognized instrumentation profile encoding format";
+ case instrprof_error::bad_magic:
+ return "Invalid instrumentation profile data (bad magic)";
+ case instrprof_error::bad_header:
+ return "Invalid instrumentation profile data (file header is corrupt)";
+ case instrprof_error::unsupported_version:
+ return "Unsupported instrumentation profile format version";
+ case instrprof_error::unsupported_hash_type:
+ return "Unsupported instrumentation profile hash type";
+ case instrprof_error::too_large:
+ return "Too much profile data";
+ case instrprof_error::truncated:
+ return "Truncated profile data";
+ case instrprof_error::malformed:
+ return "Malformed instrumentation profile data";
+ case instrprof_error::unknown_function:
+ return "No profile data available for function";
+ case instrprof_error::hash_mismatch:
+ return "Function control flow change detected (hash mismatch)";
+ case instrprof_error::count_mismatch:
+ return "Function basic block count change detected (counter mismatch)";
+ case instrprof_error::counter_overflow:
+ return "Counter overflow";
+ case instrprof_error::value_site_count_mismatch:
+ return "Function value site count change detected (counter mismatch)";
+ case instrprof_error::compress_failed:
+ return "Failed to compress data (zlib)";
+ case instrprof_error::uncompress_failed:
+ return "Failed to uncompress data (zlib)";
+ }
+ llvm_unreachable("A value of instrprof_error has no message.");
+}
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class InstrProfErrorCategoryType : public std::error_category {
const char *name() const LLVM_NOEXCEPT override { return "llvm.instrprof"; }
std::string message(int IE) const override {
- instrprof_error E = static_cast<instrprof_error>(IE);
- switch (E) {
- case instrprof_error::success:
- return "Success";
- case instrprof_error::eof:
- return "End of File";
- case instrprof_error::unrecognized_format:
- return "Unrecognized instrumentation profile encoding format";
- case instrprof_error::bad_magic:
- return "Invalid instrumentation profile data (bad magic)";
- case instrprof_error::bad_header:
- return "Invalid instrumentation profile data (file header is corrupt)";
- case instrprof_error::unsupported_version:
- return "Unsupported instrumentation profile format version";
- case instrprof_error::unsupported_hash_type:
- return "Unsupported instrumentation profile hash type";
- case instrprof_error::too_large:
- return "Too much profile data";
- case instrprof_error::truncated:
- return "Truncated profile data";
- case instrprof_error::malformed:
- return "Malformed instrumentation profile data";
- case instrprof_error::unknown_function:
- return "No profile data available for function";
- case instrprof_error::hash_mismatch:
- return "Function control flow change detected (hash mismatch)";
- case instrprof_error::count_mismatch:
- return "Function basic block count change detected (counter mismatch)";
- case instrprof_error::counter_overflow:
- return "Counter overflow";
- case instrprof_error::value_site_count_mismatch:
- return "Function value site count change detected (counter mismatch)";
- }
- llvm_unreachable("A value of instrprof_error has no message.");
+ return getInstrProfErrString(static_cast<instrprof_error>(IE));
}
};
-}
+} // end anonymous namespace
static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory;
@@ -75,34 +92,72 @@ const std::error_category &llvm::instrprof_category() {
namespace llvm {
+void SoftInstrProfErrors::addError(instrprof_error IE) {
+ if (IE == instrprof_error::success)
+ return;
+
+ if (FirstError == instrprof_error::success)
+ FirstError = IE;
+
+ switch (IE) {
+ case instrprof_error::hash_mismatch:
+ ++NumHashMismatches;
+ break;
+ case instrprof_error::count_mismatch:
+ ++NumCountMismatches;
+ break;
+ case instrprof_error::counter_overflow:
+ ++NumCounterOverflows;
+ break;
+ case instrprof_error::value_site_count_mismatch:
+ ++NumValueSiteCountMismatches;
+ break;
+ default:
+ llvm_unreachable("Not a soft error");
+ }
+}
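The accumulator above is what lets merge() and scale() keep going after a mismatch: the first failure is remembered, and later ones only bump the per-kind counters. A hedged sketch of the call pattern (the helper name is mine; the takeError() call is the same one InstrProfWriter::addRecord uses further down in this patch):

// Sketch only: merges Src into Dst and reports the first soft error, if any.
static llvm::Error mergeRecords(llvm::InstrProfRecord &Dst,
                                llvm::InstrProfRecord &Src) {
  Dst.merge(Src, /*Weight=*/1);   // soft errors are collected, not thrown
  return Dst.takeError();         // success, or e.g. counter_overflow
}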
+
+std::string InstrProfError::message() const {
+ return getInstrProfErrString(Err);
+}
+
+char InstrProfError::ID = 0;
+
std::string getPGOFuncName(StringRef RawFuncName,
GlobalValue::LinkageTypes Linkage,
StringRef FileName,
uint64_t Version LLVM_ATTRIBUTE_UNUSED) {
+ return GlobalValue::getGlobalIdentifier(RawFuncName, Linkage, FileName);
+}
+
+// Return the PGOFuncName. This function has special handling when called in
+// LTO optimization. The following only applies when \c InLTO is true: LTO's
+// internalization privatizes many symbols that had global linkage. This
+// happens after value profile annotation, but those newly internalized
+// functions should not get a source-file prefix.
+// To differentiate compiler-generated internal symbols from original ones,
+// PGOFuncName metadata is created and attached to the original internal
+// symbols in the value profile annotation step
+// (PGOUseFunc::annotateIndirectCallSites). If a symbol does not have this
+// metadata, its original linkage must have been non-internal.
+std::string getPGOFuncName(const Function &F, bool InLTO, uint64_t Version) {
+ if (!InLTO) {
+ StringRef FileName = (StaticFuncFullModulePrefix
+ ? F.getParent()->getName()
+ : sys::path::filename(F.getParent()->getName()));
+ return getPGOFuncName(F.getName(), F.getLinkage(), FileName, Version);
+ }
- // Function names may be prefixed with a binary '1' to indicate
- // that the backend should not modify the symbols due to any platform
- // naming convention. Do not include that '1' in the PGO profile name.
- if (RawFuncName[0] == '\1')
- RawFuncName = RawFuncName.substr(1);
-
- std::string FuncName = RawFuncName;
- if (llvm::GlobalValue::isLocalLinkage(Linkage)) {
- // For local symbols, prepend the main file name to distinguish them.
- // Do not include the full path in the file name since there's no guarantee
- // that it will stay the same, e.g., if the files are checked out from
- // version control in different locations.
- if (FileName.empty())
- FuncName = FuncName.insert(0, "<unknown>:");
- else
- FuncName = FuncName.insert(0, FileName.str() + ":");
+ // In LTO mode (when InLTO is true), first check for PGOFuncName metadata.
+ if (MDNode *MD = getPGOFuncNameMetadata(F)) {
+ StringRef S = cast<MDString>(MD->getOperand(0))->getString();
+ return S.str();
}
- return FuncName;
-}
-std::string getPGOFuncName(const Function &F, uint64_t Version) {
- return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName(),
- Version);
+ // If there is no metadata, the function must have been global before the
+ // value profile annotation pass. Its current linkage may be internal if it
+ // was internalized in LTO mode.
+ return getPGOFuncName(F.getName(), GlobalValue::ExternalLinkage, "");
}
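Concretely, only functions with local linkage get a file-name prefix from getGlobalIdentifier; the -static-func-full-module-prefix flag controls whether that prefix is the full module path or just its basename. An illustrative example (paths and names are made up):

// Sketch only, illustrative values:
//   static int foo();          // defined in /src/lib/bar.cpp
//     default prefix:                     "bar.cpp:foo"
//     -static-func-full-module-prefix:    "/src/lib/bar.cpp:foo"
//   int baz();                 // external linkage, no prefix: "baz"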
StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName) {
@@ -116,8 +171,8 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName) {
// \p FuncName is the string used as profile lookup key for the function. A
// symbol is created to hold the name. Return the legalized symbol name.
-static std::string getPGOFuncNameVarName(StringRef FuncName,
- GlobalValue::LinkageTypes Linkage) {
+std::string getPGOFuncNameVarName(StringRef FuncName,
+ GlobalValue::LinkageTypes Linkage) {
std::string VarName = getInstrProfNameVarPrefix();
VarName += FuncName;
@@ -125,7 +180,7 @@ static std::string getPGOFuncNameVarName(StringRef FuncName,
return VarName;
// Now fix up illegal chars in local VarName that may upset the assembler.
- const char *InvalidChars = "-:<>\"'";
+ const char *InvalidChars = "-:<>/\"'";
size_t found = VarName.find_first_of(InvalidChars);
while (found != std::string::npos) {
VarName[found] = '_';
@@ -136,7 +191,7 @@ static std::string getPGOFuncNameVarName(StringRef FuncName,
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
- StringRef FuncName) {
+ StringRef PGOFuncName) {
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
@@ -149,10 +204,11 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
Linkage == GlobalValue::ExternalLinkage)
Linkage = GlobalValue::PrivateLinkage;
- auto *Value = ConstantDataArray::getString(M.getContext(), FuncName, false);
+ auto *Value =
+ ConstantDataArray::getString(M.getContext(), PGOFuncName, false);
auto FuncNameVar =
new GlobalVariable(M, Value->getType(), true, Linkage, Value,
- getPGOFuncNameVarName(FuncName, Linkage));
+ getPGOFuncNameVarName(PGOFuncName, Linkage));
// Hide the symbol so that we correctly get a copy for each executable.
if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
@@ -161,63 +217,83 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
return FuncNameVar;
}
-GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) {
- return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName);
+GlobalVariable *createPGOFuncNameVar(Function &F, StringRef PGOFuncName) {
+ return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), PGOFuncName);
+}
+
+void InstrProfSymtab::create(Module &M, bool InLTO) {
+ for (Function &F : M) {
+ // A function may not have a name, e.g. when asm("") was used to override
+ // it. Skip such functions.
+ if (!F.hasName())
+ continue;
+ const std::string &PGOFuncName = getPGOFuncName(F, InLTO);
+ addFuncName(PGOFuncName);
+ MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+ }
+
+ finalizeSymtab();
}
-int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
- bool doCompression, std::string &Result) {
+Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+ bool doCompression, std::string &Result) {
+ assert(NameStrs.size() && "No name data to emit");
+
uint8_t Header[16], *P = Header;
std::string UncompressedNameStrings =
- join(NameStrs.begin(), NameStrs.end(), StringRef(" "));
+ join(NameStrs.begin(), NameStrs.end(), getInstrProfNameSeparator());
+
+ assert(StringRef(UncompressedNameStrings)
+ .count(getInstrProfNameSeparator()) == (NameStrs.size() - 1) &&
+ "PGO name is invalid (contains separator token)");
unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P);
P += EncLen;
- auto WriteStringToResult = [&](size_t CompressedLen,
- const std::string &InputStr) {
+ auto WriteStringToResult = [&](size_t CompressedLen, StringRef InputStr) {
EncLen = encodeULEB128(CompressedLen, P);
P += EncLen;
char *HeaderStr = reinterpret_cast<char *>(&Header[0]);
unsigned HeaderLen = P - &Header[0];
Result.append(HeaderStr, HeaderLen);
Result += InputStr;
- return 0;
+ return Error::success();
};
- if (!doCompression)
+ if (!doCompression) {
return WriteStringToResult(0, UncompressedNameStrings);
+ }
- SmallVector<char, 128> CompressedNameStrings;
+ SmallString<128> CompressedNameStrings;
zlib::Status Success =
zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
zlib::BestSizeCompression);
if (Success != zlib::StatusOK)
- return 1;
+ return make_error<InstrProfError>(instrprof_error::compress_failed);
- return WriteStringToResult(
- CompressedNameStrings.size(),
- std::string(CompressedNameStrings.data(), CompressedNameStrings.size()));
+ return WriteStringToResult(CompressedNameStrings.size(),
+ CompressedNameStrings);
}
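The emitted name blob therefore always starts with two ULEB128 fields. A sketch of the layout, matching the reader readPGOFuncNameStrings below:

// Sketch only:
//   [ULEB128 UncompressedSize][ULEB128 CompressedSize][Payload]
// CompressedSize == 0 means Payload is the uncompressed join of the names,
// separated by getInstrProfNameSeparator(); otherwise Payload is the
// zlib-compressed form, which inflates to exactly UncompressedSize bytes.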
-StringRef getPGOFuncNameInitializer(GlobalVariable *NameVar) {
+StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar) {
auto *Arr = cast<ConstantDataArray>(NameVar->getInitializer());
StringRef NameStr =
Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
return NameStr;
}
-int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
- std::string &Result) {
+Error collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+ std::string &Result, bool doCompression) {
std::vector<std::string> NameStrs;
for (auto *NameVar : NameVars) {
- NameStrs.push_back(getPGOFuncNameInitializer(NameVar));
+ NameStrs.push_back(getPGOFuncNameVarInitializer(NameVar));
}
- return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result);
+ return collectPGOFuncNameStrings(
+ NameStrs, zlib::isAvailable() && doCompression, Result);
}
-int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
+Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
const uint8_t *P = reinterpret_cast<const uint8_t *>(NameStrings.data());
const uint8_t *EndP = reinterpret_cast<const uint8_t *>(NameStrings.data() +
NameStrings.size());
@@ -235,7 +311,7 @@ int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
CompressedSize);
if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
UncompressedSize) != zlib::StatusOK)
- return 1;
+ return make_error<InstrProfError>(instrprof_error::uncompress_failed);
P += CompressedSize;
NameStrings = StringRef(UncompressedNameStrings.data(),
UncompressedNameStrings.size());
@@ -246,7 +322,7 @@ int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
}
// Now parse the name strings.
SmallVector<StringRef, 0> Names;
- NameStrings.split(Names, ' ');
+ NameStrings.split(Names, getInstrProfNameSeparator());
for (StringRef &Name : Names)
Symtab.addFuncName(Name);
@@ -254,16 +330,16 @@ int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
P++;
}
Symtab.finalizeSymtab();
- return 0;
+ return Error::success();
}
-instrprof_error InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input,
- uint64_t Weight) {
+void InstrProfValueSiteRecord::merge(SoftInstrProfErrors &SIPE,
+ InstrProfValueSiteRecord &Input,
+ uint64_t Weight) {
this->sortByTargetValues();
Input.sortByTargetValues();
auto I = ValueData.begin();
auto IE = ValueData.end();
- instrprof_error Result = instrprof_error::success;
for (auto J = Input.ValueData.begin(), JE = Input.ValueData.end(); J != JE;
++J) {
while (I != IE && I->Value < J->Value)
@@ -272,92 +348,80 @@ instrprof_error InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input,
bool Overflowed;
I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed);
if (Overflowed)
- Result = instrprof_error::counter_overflow;
+ SIPE.addError(instrprof_error::counter_overflow);
++I;
continue;
}
ValueData.insert(I, *J);
}
- return Result;
}
-instrprof_error InstrProfValueSiteRecord::scale(uint64_t Weight) {
- instrprof_error Result = instrprof_error::success;
+void InstrProfValueSiteRecord::scale(SoftInstrProfErrors &SIPE,
+ uint64_t Weight) {
for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) {
bool Overflowed;
I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed);
if (Overflowed)
- Result = instrprof_error::counter_overflow;
+ SIPE.addError(instrprof_error::counter_overflow);
}
- return Result;
}
// Merge Value Profile data from Src record to this record for ValueKind.
// Scale merged value counts by \p Weight.
-instrprof_error InstrProfRecord::mergeValueProfData(uint32_t ValueKind,
- InstrProfRecord &Src,
- uint64_t Weight) {
+void InstrProfRecord::mergeValueProfData(uint32_t ValueKind,
+ InstrProfRecord &Src,
+ uint64_t Weight) {
uint32_t ThisNumValueSites = getNumValueSites(ValueKind);
uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind);
- if (ThisNumValueSites != OtherNumValueSites)
- return instrprof_error::value_site_count_mismatch;
+ if (ThisNumValueSites != OtherNumValueSites) {
+ SIPE.addError(instrprof_error::value_site_count_mismatch);
+ return;
+ }
std::vector<InstrProfValueSiteRecord> &ThisSiteRecords =
getValueSitesForKind(ValueKind);
std::vector<InstrProfValueSiteRecord> &OtherSiteRecords =
Src.getValueSitesForKind(ValueKind);
- instrprof_error Result = instrprof_error::success;
for (uint32_t I = 0; I < ThisNumValueSites; I++)
- MergeResult(Result, ThisSiteRecords[I].merge(OtherSiteRecords[I], Weight));
- return Result;
+ ThisSiteRecords[I].merge(SIPE, OtherSiteRecords[I], Weight);
}
-instrprof_error InstrProfRecord::merge(InstrProfRecord &Other,
- uint64_t Weight) {
+void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight) {
// If the number of counters doesn't match we either have bad data
// or a hash collision.
- if (Counts.size() != Other.Counts.size())
- return instrprof_error::count_mismatch;
-
- instrprof_error Result = instrprof_error::success;
+ if (Counts.size() != Other.Counts.size()) {
+ SIPE.addError(instrprof_error::count_mismatch);
+ return;
+ }
for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) {
bool Overflowed;
Counts[I] =
SaturatingMultiplyAdd(Other.Counts[I], Weight, Counts[I], &Overflowed);
if (Overflowed)
- Result = instrprof_error::counter_overflow;
+ SIPE.addError(instrprof_error::counter_overflow);
}
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- MergeResult(Result, mergeValueProfData(Kind, Other, Weight));
-
- return Result;
+ mergeValueProfData(Kind, Other, Weight);
}
-instrprof_error InstrProfRecord::scaleValueProfData(uint32_t ValueKind,
- uint64_t Weight) {
+void InstrProfRecord::scaleValueProfData(uint32_t ValueKind, uint64_t Weight) {
uint32_t ThisNumValueSites = getNumValueSites(ValueKind);
std::vector<InstrProfValueSiteRecord> &ThisSiteRecords =
getValueSitesForKind(ValueKind);
- instrprof_error Result = instrprof_error::success;
for (uint32_t I = 0; I < ThisNumValueSites; I++)
- MergeResult(Result, ThisSiteRecords[I].scale(Weight));
- return Result;
+ ThisSiteRecords[I].scale(SIPE, Weight);
}
-instrprof_error InstrProfRecord::scale(uint64_t Weight) {
- instrprof_error Result = instrprof_error::success;
+void InstrProfRecord::scale(uint64_t Weight) {
for (auto &Count : this->Counts) {
bool Overflowed;
Count = SaturatingMultiply(Count, Weight, &Overflowed);
- if (Overflowed && Result == instrprof_error::success) {
- Result = instrprof_error::counter_overflow;
- }
+ if (Overflowed)
+ SIPE.addError(instrprof_error::counter_overflow);
}
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- MergeResult(Result, scaleValueProfData(Kind, Weight));
-
- return Result;
+ scaleValueProfData(Kind, Weight);
}
// Map indirect call target name hash to name string.
@@ -371,8 +435,14 @@ uint64_t InstrProfRecord::remapValue(uint64_t Value, uint32_t ValueKind,
std::lower_bound(ValueMap->begin(), ValueMap->end(), Value,
[](const std::pair<uint64_t, uint64_t> &LHS,
uint64_t RHS) { return LHS.first < RHS; });
- if (Result != ValueMap->end())
+ // A raw function pointer collected by the value profiler may come from an
+ // external function that is not instrumented. Such pointers have no
+ // mapping data for the deserializer to use, so force the value to 0 in
+ // that case.
+ if (Result != ValueMap->end() && Result->first == Value)
Value = (uint64_t)Result->second;
+ else
+ Value = 0;
break;
}
}
@@ -388,7 +458,7 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
std::vector<InstrProfValueSiteRecord> &ValueSites =
getValueSitesForKind(ValueKind);
if (N == 0)
- ValueSites.push_back(InstrProfValueSiteRecord());
+ ValueSites.emplace_back();
else
ValueSites.emplace_back(VData, VData + N);
}
@@ -422,10 +492,8 @@ uint32_t getNumValueDataForSiteInstrProf(const void *R, uint32_t VK,
}
void getValueForSiteInstrProf(const void *R, InstrProfValueData *Dst,
- uint32_t K, uint32_t S,
- uint64_t (*Mapper)(uint32_t, uint64_t)) {
- return reinterpret_cast<const InstrProfRecord *>(R)->getValueForSite(
- Dst, K, S, Mapper);
+ uint32_t K, uint32_t S) {
+ reinterpret_cast<const InstrProfRecord *>(R)->getValueForSite(Dst, K, S);
}
ValueProfData *allocValueProfDataInstrProf(size_t TotalSizeInBytes) {
@@ -436,12 +504,12 @@ ValueProfData *allocValueProfDataInstrProf(size_t TotalSizeInBytes) {
}
static ValueProfRecordClosure InstrProfRecordClosure = {
- 0,
+ nullptr,
getNumValueKindsInstrProf,
getNumValueSitesInstrProf,
getNumValueDataInstrProf,
getNumValueDataForSiteInstrProf,
- 0,
+ nullptr,
getValueForSiteInstrProf,
allocValueProfDataInstrProf};
@@ -526,45 +594,45 @@ static std::unique_ptr<ValueProfData> allocValueProfData(uint32_t TotalSize) {
ValueProfData());
}
-instrprof_error ValueProfData::checkIntegrity() {
+Error ValueProfData::checkIntegrity() {
if (NumValueKinds > IPVK_Last + 1)
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
// Total size needs to be a multiple of the quadword size.
if (TotalSize % sizeof(uint64_t))
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
ValueProfRecord *VR = getFirstValueProfRecord(this);
for (uint32_t K = 0; K < this->NumValueKinds; K++) {
if (VR->Kind > IPVK_Last)
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
VR = getValueProfRecordNext(VR);
if ((char *)VR - (char *)this > (ptrdiff_t)TotalSize)
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
}
- return instrprof_error::success;
+ return Error::success();
}
-ErrorOr<std::unique_ptr<ValueProfData>>
+Expected<std::unique_ptr<ValueProfData>>
ValueProfData::getValueProfData(const unsigned char *D,
const unsigned char *const BufferEnd,
support::endianness Endianness) {
using namespace support;
if (D + sizeof(ValueProfData) > BufferEnd)
- return instrprof_error::truncated;
+ return make_error<InstrProfError>(instrprof_error::truncated);
const unsigned char *Header = D;
uint32_t TotalSize = swapToHostOrder<uint32_t>(Header, Endianness);
if (D + TotalSize > BufferEnd)
- return instrprof_error::too_large;
+ return make_error<InstrProfError>(instrprof_error::too_large);
std::unique_ptr<ValueProfData> VPD = allocValueProfData(TotalSize);
memcpy(VPD.get(), D, TotalSize);
// Byte swap.
VPD->swapBytesToHost(Endianness);
- instrprof_error EC = VPD->checkIntegrity();
- if (EC != instrprof_error::success)
- return EC;
+ Error E = VPD->checkIntegrity();
+ if (E)
+ return std::move(E);
return std::move(VPD);
}
@@ -599,4 +667,117 @@ void ValueProfData::swapBytesFromHost(support::endianness Endianness) {
sys::swapByteOrder<uint32_t>(NumValueKinds);
}
+void annotateValueSite(Module &M, Instruction &Inst,
+ const InstrProfRecord &InstrProfR,
+ InstrProfValueKind ValueKind, uint32_t SiteIdx,
+ uint32_t MaxMDCount) {
+ uint32_t NV = InstrProfR.getNumValueDataForSite(ValueKind, SiteIdx);
+ if (!NV)
+ return;
+
+ uint64_t Sum = 0;
+ std::unique_ptr<InstrProfValueData[]> VD =
+ InstrProfR.getValueForSite(ValueKind, SiteIdx, &Sum);
+
+ ArrayRef<InstrProfValueData> VDs(VD.get(), NV);
+ annotateValueSite(M, Inst, VDs, Sum, ValueKind, MaxMDCount);
+}
+
+void annotateValueSite(Module &M, Instruction &Inst,
+ ArrayRef<InstrProfValueData> VDs,
+ uint64_t Sum, InstrProfValueKind ValueKind,
+ uint32_t MaxMDCount) {
+ LLVMContext &Ctx = M.getContext();
+ MDBuilder MDHelper(Ctx);
+ SmallVector<Metadata *, 3> Vals;
+ // Tag
+ Vals.push_back(MDHelper.createString("VP"));
+ // Value Kind
+ Vals.push_back(MDHelper.createConstant(
+ ConstantInt::get(Type::getInt32Ty(Ctx), ValueKind)));
+ // Total Count
+ Vals.push_back(
+ MDHelper.createConstant(ConstantInt::get(Type::getInt64Ty(Ctx), Sum)));
+
+ // Value Profile Data
+ uint32_t MDCount = MaxMDCount;
+ for (auto &VD : VDs) {
+ Vals.push_back(MDHelper.createConstant(
+ ConstantInt::get(Type::getInt64Ty(Ctx), VD.Value)));
+ Vals.push_back(MDHelper.createConstant(
+ ConstantInt::get(Type::getInt64Ty(Ctx), VD.Count)));
+ if (--MDCount == 0)
+ break;
+ }
+ Inst.setMetadata(LLVMContext::MD_prof, MDNode::get(Ctx, Vals));
+}
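The metadata written above has a fixed shape: a "VP" string tag, the value kind, the total count, then up to MaxMDCount value/count pairs. A hedged example of how such an annotation renders in textual IR (the hashes and counts are made up):

// Sketch only, illustrative numbers (kind 0 == IPVK_IndirectCallTarget):
//   call void %fp(), !prof !42
//   !42 = !{!"VP", i32 0, i64 1600,
//           i64 -4377547752858689819, i64 1200,
//           i64 2709792123250590174, i64 400}
// getValueProfDataFromInst below decodes exactly this layout.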
+
+bool getValueProfDataFromInst(const Instruction &Inst,
+ InstrProfValueKind ValueKind,
+ uint32_t MaxNumValueData,
+ InstrProfValueData ValueData[],
+ uint32_t &ActualNumValueData, uint64_t &TotalC) {
+ MDNode *MD = Inst.getMetadata(LLVMContext::MD_prof);
+ if (!MD)
+ return false;
+
+ unsigned NOps = MD->getNumOperands();
+
+ if (NOps < 5)
+ return false;
+
+ // Operand 0 is a string tag "VP":
+ MDString *Tag = dyn_cast<MDString>(MD->getOperand(0));
+ if (!Tag)
+ return false;
+
+ if (!Tag->getString().equals("VP"))
+ return false;
+
+ // Now check kind:
+ ConstantInt *KindInt = mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
+ if (!KindInt)
+ return false;
+ if (KindInt->getZExtValue() != ValueKind)
+ return false;
+
+ // Get total count
+ ConstantInt *TotalCInt = mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
+ if (!TotalCInt)
+ return false;
+ TotalC = TotalCInt->getZExtValue();
+
+ ActualNumValueData = 0;
+
+ for (unsigned I = 3; I < NOps; I += 2) {
+ if (ActualNumValueData >= MaxNumValueData)
+ break;
+ ConstantInt *Value = mdconst::dyn_extract<ConstantInt>(MD->getOperand(I));
+ ConstantInt *Count =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(I + 1));
+ if (!Value || !Count)
+ return false;
+ ValueData[ActualNumValueData].Value = Value->getZExtValue();
+ ValueData[ActualNumValueData].Count = Count->getZExtValue();
+ ActualNumValueData++;
+ }
+ return true;
+}
+
+MDNode *getPGOFuncNameMetadata(const Function &F) {
+ return F.getMetadata(getPGOFuncNameMetadataName());
+}
+
+void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName) {
+ // Only for internal linkage functions.
+ if (PGOFuncName == F.getName())
+ return;
+ // Don't create duplicate metadata.
+ if (getPGOFuncNameMetadata(F))
+ return;
+ LLVMContext &C = F.getContext();
+ MDNode *N = MDNode::get(C, MDString::get(C, PGOFuncName));
+ F.setMetadata(getPGOFuncNameMetadataName(), N);
}
+
+} // end namespace llvm
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 5e83456822fd..81c13b35ce30 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -18,33 +18,33 @@
using namespace llvm;
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
-setupMemoryBuffer(std::string Path) {
+static Expected<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(const Twine &Path) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
if (std::error_code EC = BufferOrErr.getError())
- return EC;
+ return errorCodeToError(EC);
return std::move(BufferOrErr.get());
}
-static std::error_code initializeReader(InstrProfReader &Reader) {
+static Error initializeReader(InstrProfReader &Reader) {
return Reader.readHeader();
}
-ErrorOr<std::unique_ptr<InstrProfReader>>
-InstrProfReader::create(std::string Path) {
+Expected<std::unique_ptr<InstrProfReader>>
+InstrProfReader::create(const Twine &Path) {
// Set up the buffer to read.
auto BufferOrError = setupMemoryBuffer(Path);
- if (std::error_code EC = BufferOrError.getError())
- return EC;
+ if (Error E = BufferOrError.takeError())
+ return std::move(E);
return InstrProfReader::create(std::move(BufferOrError.get()));
}
-ErrorOr<std::unique_ptr<InstrProfReader>>
+Expected<std::unique_ptr<InstrProfReader>>
InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
// Sanity check the buffer.
if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
- return instrprof_error::too_large;
+ return make_error<InstrProfError>(instrprof_error::too_large);
std::unique_ptr<InstrProfReader> Result;
// Create the reader.
@@ -57,46 +57,49 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
else if (TextInstrProfReader::hasFormat(*Buffer))
Result.reset(new TextInstrProfReader(std::move(Buffer)));
else
- return instrprof_error::unrecognized_format;
+ return make_error<InstrProfError>(instrprof_error::unrecognized_format);
// Initialize the reader and return the result.
- if (std::error_code EC = initializeReader(*Result))
- return EC;
+ if (Error E = initializeReader(*Result))
+ return std::move(E);
return std::move(Result);
}
-ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
-IndexedInstrProfReader::create(std::string Path) {
+Expected<std::unique_ptr<IndexedInstrProfReader>>
+IndexedInstrProfReader::create(const Twine &Path) {
// Set up the buffer to read.
auto BufferOrError = setupMemoryBuffer(Path);
- if (std::error_code EC = BufferOrError.getError())
- return EC;
+ if (Error E = BufferOrError.takeError())
+ return std::move(E);
return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
}
-ErrorOr<std::unique_ptr<IndexedInstrProfReader>>
+Expected<std::unique_ptr<IndexedInstrProfReader>>
IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
// Sanity check the buffer.
if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
- return instrprof_error::too_large;
+ return make_error<InstrProfError>(instrprof_error::too_large);
// Create the reader.
if (!IndexedInstrProfReader::hasFormat(*Buffer))
- return instrprof_error::bad_magic;
+ return make_error<InstrProfError>(instrprof_error::bad_magic);
auto Result = llvm::make_unique<IndexedInstrProfReader>(std::move(Buffer));
// Initialize the reader and return the result.
- if (std::error_code EC = initializeReader(*Result))
- return EC;
+ if (Error E = initializeReader(*Result))
+ return std::move(E);
return std::move(Result);
}
void InstrProfIterator::Increment() {
- if (Reader->readNextRecord(Record))
+ if (auto E = Reader->readNextRecord(Record)) {
+ // Handle errors in the reader.
+ InstrProfError::take(std::move(E));
*this = InstrProfIterator();
+ }
}
bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) {
@@ -109,12 +112,30 @@ bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) {
[](char c) { return ::isprint(c) || ::isspace(c); });
}
-std::error_code TextInstrProfReader::readHeader() {
+// Read the profile variant flag from the header: ":FE" means this is a
+// frontend-generated profile and ":IR" means this is an IR-level profile. Any
+// other string with a leading ':' is reported as a format error.
+Error TextInstrProfReader::readHeader() {
Symtab.reset(new InstrProfSymtab());
+ bool IsIRInstr = false;
+ if (!Line->startswith(":")) {
+ IsIRLevelProfile = false;
+ return success();
+ }
+ StringRef Str = (Line)->substr(1);
+ if (Str.equals_lower("ir"))
+ IsIRInstr = true;
+ else if (Str.equals_lower("fe"))
+ IsIRInstr = false;
+ else
+ return error(instrprof_error::bad_header);
+
+ ++Line;
+ IsIRLevelProfile = IsIRInstr;
return success();
}
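So a text profile may start with a variant line before any records. A hedged illustration of the accepted header (the record lines that would follow are omitted):

// Sketch only: first payload line of a text profile.
//   :ir      <- marks an IR-instrumentation profile
//   :fe      <- marks a frontend-instrumentation profile
// No leading ':' line at all is treated as a frontend profile; any other
// ':'-prefixed string is rejected with instrprof_error::bad_header.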
-std::error_code
+Error
TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
#define CHECK_LINE_END(Line) \
@@ -156,7 +177,7 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
std::vector<InstrProfValueData> CurrentValues;
for (uint32_t V = 0; V < NumValueData; V++) {
CHECK_LINE_END(Line);
- std::pair<StringRef, StringRef> VD = Line->split(':');
+ std::pair<StringRef, StringRef> VD = Line->rsplit(':');
uint64_t TakenCount, Value;
if (VK == IPVK_IndirectCallTarget) {
Symtab->addFuncName(VD.first);
@@ -178,7 +199,7 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
#undef VP_READ_ADVANCE
}
-std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
+Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
// Skip empty lines and comments.
while (!Line.is_at_end() && (Line->empty() || Line->startswith("#")))
++Line;
@@ -220,8 +241,8 @@ std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
}
// Check if value profile data exists and read it if so.
- if (std::error_code EC = readValueProfileData(Record))
- return EC;
+ if (Error E = readValueProfileData(Record))
+ return E;
// This is needed to avoid two pass parsing because llvm-profdata
// does dumping while reading.
@@ -240,7 +261,7 @@ bool RawInstrProfReader<IntPtrT>::hasFormat(const MemoryBuffer &DataBuffer) {
}
template <class IntPtrT>
-std::error_code RawInstrProfReader<IntPtrT>::readHeader() {
+Error RawInstrProfReader<IntPtrT>::readHeader() {
if (!hasFormat(*DataBuffer))
return error(instrprof_error::bad_magic);
if (DataBuffer->getBufferSize() < sizeof(RawInstrProf::Header))
@@ -252,26 +273,25 @@ std::error_code RawInstrProfReader<IntPtrT>::readHeader() {
}
template <class IntPtrT>
-std::error_code
-RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
+Error RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
const char *End = DataBuffer->getBufferEnd();
// Skip zero padding between profiles.
while (CurrentPos != End && *CurrentPos == 0)
++CurrentPos;
// If there's nothing left, we're done.
if (CurrentPos == End)
- return instrprof_error::eof;
+ return make_error<InstrProfError>(instrprof_error::eof);
// If there isn't enough space for another header, this is probably just
// garbage at the end of the file.
if (CurrentPos + sizeof(RawInstrProf::Header) > End)
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
// The writer ensures each profile is padded to start at an aligned address.
if (reinterpret_cast<size_t>(CurrentPos) % alignOf<uint64_t>())
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
// The magic should have the same byte order as in the previous header.
uint64_t Magic = *reinterpret_cast<const uint64_t *>(CurrentPos);
if (Magic != swap(RawInstrProf::getMagic<IntPtrT>()))
- return instrprof_error::bad_magic;
+ return make_error<InstrProfError>(instrprof_error::bad_magic);
// There's another profile to read, so we need to process the header.
auto *Header = reinterpret_cast<const RawInstrProf::Header *>(CurrentPos);
@@ -279,30 +299,31 @@ RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
}
template <class IntPtrT>
-void RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
+Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
+ if (Error E = Symtab.create(StringRef(NamesStart, NamesSize)))
+ return error(std::move(E));
for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) {
- StringRef FunctionName(getName(I->NamePtr), swap(I->NameSize));
- Symtab.addFuncName(FunctionName);
const IntPtrT FPtr = swap(I->FunctionPointer);
if (!FPtr)
continue;
- Symtab.mapAddress(FPtr, IndexedInstrProf::ComputeHash(FunctionName));
+ Symtab.mapAddress(FPtr, I->NameRef);
}
Symtab.finalizeSymtab();
+ return success();
}
template <class IntPtrT>
-std::error_code
-RawInstrProfReader<IntPtrT>::readHeader(const RawInstrProf::Header &Header) {
- if (swap(Header.Version) != RawInstrProf::Version)
+Error RawInstrProfReader<IntPtrT>::readHeader(
+ const RawInstrProf::Header &Header) {
+ Version = swap(Header.Version);
+ if (GET_VERSION(Version) != RawInstrProf::Version)
return error(instrprof_error::unsupported_version);
CountersDelta = swap(Header.CountersDelta);
NamesDelta = swap(Header.NamesDelta);
auto DataSize = swap(Header.DataSize);
auto CountersSize = swap(Header.CountersSize);
- auto NamesSize = swap(Header.NamesSize);
- auto ValueDataSize = swap(Header.ValueDataSize);
+ NamesSize = swap(Header.NamesSize);
ValueKindLast = swap(Header.ValueKindLast);
auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>);
@@ -312,10 +333,9 @@ RawInstrProfReader<IntPtrT>::readHeader(const RawInstrProf::Header &Header) {
ptrdiff_t CountersOffset = DataOffset + DataSizeInBytes;
ptrdiff_t NamesOffset = CountersOffset + sizeof(uint64_t) * CountersSize;
ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize;
- size_t ProfileSize = ValueDataOffset + ValueDataSize;
auto *Start = reinterpret_cast<const char *>(&Header);
- if (Start + ProfileSize > DataBuffer->getBufferEnd())
+ if (Start + ValueDataOffset > DataBuffer->getBufferEnd())
return error(instrprof_error::bad_header);
Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>(
@@ -324,33 +344,29 @@ RawInstrProfReader<IntPtrT>::readHeader(const RawInstrProf::Header &Header) {
CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset);
NamesStart = Start + NamesOffset;
ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset);
- ProfileEnd = Start + ProfileSize;
std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>();
- createSymtab(*NewSymtab.get());
+ if (Error E = createSymtab(*NewSymtab.get()))
+ return E;
+
Symtab = std::move(NewSymtab);
return success();
}
template <class IntPtrT>
-std::error_code RawInstrProfReader<IntPtrT>::readName(InstrProfRecord &Record) {
- Record.Name = StringRef(getName(Data->NamePtr), swap(Data->NameSize));
- if (Record.Name.data() < NamesStart ||
- Record.Name.data() + Record.Name.size() >
- reinterpret_cast<const char *>(ValueDataStart))
- return error(instrprof_error::malformed);
+Error RawInstrProfReader<IntPtrT>::readName(InstrProfRecord &Record) {
+ Record.Name = getName(Data->NameRef);
return success();
}
template <class IntPtrT>
-std::error_code RawInstrProfReader<IntPtrT>::readFuncHash(
- InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readFuncHash(InstrProfRecord &Record) {
Record.Hash = swap(Data->FuncHash);
return success();
}
template <class IntPtrT>
-std::error_code RawInstrProfReader<IntPtrT>::readRawCounts(
+Error RawInstrProfReader<IntPtrT>::readRawCounts(
InstrProfRecord &Record) {
uint32_t NumCounters = swap(Data->NumCounters);
IntPtrT CounterPtr = Data->CounterPtr;
@@ -377,8 +393,8 @@ std::error_code RawInstrProfReader<IntPtrT>::readRawCounts(
}
template <class IntPtrT>
-std::error_code
-RawInstrProfReader<IntPtrT>::readValueProfilingData(InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
+ InstrProfRecord &Record) {
Record.clearValueData();
CurValueDataSize = 0;
@@ -390,41 +406,44 @@ RawInstrProfReader<IntPtrT>::readValueProfilingData(InstrProfRecord &Record) {
if (!NumValueKinds)
return success();
- ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr =
- ValueProfData::getValueProfData(ValueDataStart,
- (const unsigned char *)ProfileEnd,
- getDataEndianness());
+ Expected<std::unique_ptr<ValueProfData>> VDataPtrOrErr =
+ ValueProfData::getValueProfData(
+ ValueDataStart, (const unsigned char *)DataBuffer->getBufferEnd(),
+ getDataEndianness());
- if (VDataPtrOrErr.getError())
- return VDataPtrOrErr.getError();
+ if (Error E = VDataPtrOrErr.takeError())
+ return E;
+ // Note that besides deserialization, this also performs the conversion for
+ // indirect call targets. The function pointers from the raw profile are
+ // remapped into function name hashes.
VDataPtrOrErr.get()->deserializeTo(Record, &Symtab->getAddrHashMap());
CurValueDataSize = VDataPtrOrErr.get()->getSize();
return success();
}
template <class IntPtrT>
-std::error_code
-RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
if (atEnd())
- if (std::error_code EC = readNextHeader(ProfileEnd))
- return EC;
+ // At this point, ValueDataStart field points to the next header.
+ if (Error E = readNextHeader(getNextHeaderPos()))
+ return E;
// Read the name and set it in Record.
- if (std::error_code EC = readName(Record))
- return EC;
+ if (Error E = readName(Record))
+ return E;
// Read FuncHash and set it in Record.
- if (std::error_code EC = readFuncHash(Record))
- return EC;
+ if (Error E = readFuncHash(Record))
+ return E;
// Read raw counts and set Record.
- if (std::error_code EC = readRawCounts(Record))
- return EC;
+ if (Error E = readRawCounts(Record))
+ return E;
// Read value data and set Record.
- if (std::error_code EC = readValueProfilingData(Record))
- return EC;
+ if (Error E = readValueProfilingData(Record))
+ return E;
// Iterate.
advanceData();
@@ -446,10 +465,10 @@ typedef InstrProfLookupTrait::offset_type offset_type;
bool InstrProfLookupTrait::readValueProfilingData(
const unsigned char *&D, const unsigned char *const End) {
- ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr =
+ Expected<std::unique_ptr<ValueProfData>> VDataPtrOrErr =
ValueProfData::getValueProfData(D, End, ValueProfDataEndianness);
- if (VDataPtrOrErr.getError())
+ if (VDataPtrOrErr.takeError())
return false;
VDataPtrOrErr.get()->deserializeTo(DataBuffer.back(), nullptr);
@@ -475,10 +494,10 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
return data_type();
uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D);
- // Initialize number of counters for FormatVersion == 1.
+ // Initialize number of counters for GET_VERSION(FormatVersion) == 1.
uint64_t CountsSize = N / sizeof(uint64_t) - 1;
// If format version is different then read the number of counters.
- if (FormatVersion != 1) {
+ if (GET_VERSION(FormatVersion) != IndexedInstrProf::ProfVersion::Version1) {
if (D + sizeof(uint64_t) > End)
return data_type();
CountsSize = endian::readNext<uint64_t, little, unaligned>(D);
@@ -495,7 +514,8 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer));
// Read value profiling data.
- if (FormatVersion > 2 && !readValueProfilingData(D, End)) {
+ if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version2 &&
+ !readValueProfilingData(D, End)) {
DataBuffer.clear();
return data_type();
}
@@ -504,31 +524,31 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
}
template <typename HashTableImpl>
-std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords(
+Error InstrProfReaderIndex<HashTableImpl>::getRecords(
StringRef FuncName, ArrayRef<InstrProfRecord> &Data) {
auto Iter = HashTable->find(FuncName);
if (Iter == HashTable->end())
- return instrprof_error::unknown_function;
+ return make_error<InstrProfError>(instrprof_error::unknown_function);
Data = (*Iter);
if (Data.empty())
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
- return instrprof_error::success;
+ return Error::success();
}
template <typename HashTableImpl>
-std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords(
+Error InstrProfReaderIndex<HashTableImpl>::getRecords(
ArrayRef<InstrProfRecord> &Data) {
if (atEnd())
- return instrprof_error::eof;
+ return make_error<InstrProfError>(instrprof_error::eof);
Data = *RecordIterator;
if (Data.empty())
- return instrprof_error::malformed;
+ return make_error<InstrProfError>(instrprof_error::malformed);
- return instrprof_error::success;
+ return Error::success();
}
template <typename HashTableImpl>
@@ -553,7 +573,56 @@ bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
return Magic == IndexedInstrProf::Magic;
}
-std::error_code IndexedInstrProfReader::readHeader() {
+const unsigned char *
+IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
+ const unsigned char *Cur) {
+ using namespace IndexedInstrProf;
+ using namespace support;
+ if (Version >= IndexedInstrProf::Version4) {
+ const IndexedInstrProf::Summary *SummaryInLE =
+ reinterpret_cast<const IndexedInstrProf::Summary *>(Cur);
+ uint64_t NFields =
+ endian::byte_swap<uint64_t, little>(SummaryInLE->NumSummaryFields);
+ uint64_t NEntries =
+ endian::byte_swap<uint64_t, little>(SummaryInLE->NumCutoffEntries);
+ uint32_t SummarySize =
+ IndexedInstrProf::Summary::getSize(NFields, NEntries);
+ std::unique_ptr<IndexedInstrProf::Summary> SummaryData =
+ IndexedInstrProf::allocSummary(SummarySize);
+
+ const uint64_t *Src = reinterpret_cast<const uint64_t *>(SummaryInLE);
+ uint64_t *Dst = reinterpret_cast<uint64_t *>(SummaryData.get());
+ for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++)
+ Dst[I] = endian::byte_swap<uint64_t, little>(Src[I]);
+
+ llvm::SummaryEntryVector DetailedSummary;
+ for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) {
+ const IndexedInstrProf::Summary::Entry &Ent = SummaryData->getEntry(I);
+ DetailedSummary.emplace_back((uint32_t)Ent.Cutoff, Ent.MinBlockCount,
+ Ent.NumBlocks);
+ }
+ // Initialize the profile summary using the SummaryData read from disk.
+ this->Summary = llvm::make_unique<ProfileSummary>(
+ ProfileSummary::PSK_Instr, DetailedSummary,
+ SummaryData->get(Summary::TotalBlockCount),
+ SummaryData->get(Summary::MaxBlockCount),
+ SummaryData->get(Summary::MaxInternalBlockCount),
+ SummaryData->get(Summary::MaxFunctionCount),
+ SummaryData->get(Summary::TotalNumBlocks),
+ SummaryData->get(Summary::TotalNumFunctions));
+ return Cur + SummarySize;
+ } else {
+ // For older versions of the profile data, compute the summary on the fly:
+ using namespace IndexedInstrProf;
+ InstrProfSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+ // FIXME: This only computes an empty summary. Need to call addRecord for
+ // all InstrProfRecords to get the correct summary.
+ this->Summary = Builder.getSummary();
+ return Cur;
+ }
+}
+
+Error IndexedInstrProfReader::readHeader() {
const unsigned char *Start =
(const unsigned char *)DataBuffer->getBufferStart();
const unsigned char *Cur = Start;
@@ -572,12 +641,11 @@ std::error_code IndexedInstrProfReader::readHeader() {
// Read the version.
uint64_t FormatVersion = endian::byte_swap<uint64_t, little>(Header->Version);
- if (FormatVersion > IndexedInstrProf::Version)
+ if (GET_VERSION(FormatVersion) >
+ IndexedInstrProf::ProfVersion::CurrentVersion)
return error(instrprof_error::unsupported_version);
- // Read the maximal function count.
- MaxFunctionCount =
- endian::byte_swap<uint64_t, little>(Header->MaxFunctionCount);
+ Cur = readSummary((IndexedInstrProf::ProfVersion)FormatVersion, Cur);
// Read the hash type and start offset.
IndexedInstrProf::HashT HashType = static_cast<IndexedInstrProf::HashT>(
@@ -606,13 +674,13 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
return *Symtab.get();
}
-ErrorOr<InstrProfRecord>
+Expected<InstrProfRecord>
IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
uint64_t FuncHash) {
ArrayRef<InstrProfRecord> Data;
- std::error_code EC = Index->getRecords(FuncName, Data);
- if (EC != instrprof_error::success)
- return EC;
+ Error Err = Index->getRecords(FuncName, Data);
+ if (Err)
+ return std::move(Err);
// Found it. Look for counters with the right hash.
for (unsigned I = 0, E = Data.size(); I < E; ++I) {
// Check for a match and fill the vector if there is one.
@@ -623,26 +691,25 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
return error(instrprof_error::hash_mismatch);
}
-std::error_code
-IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash,
- std::vector<uint64_t> &Counts) {
- ErrorOr<InstrProfRecord> Record = getInstrProfRecord(FuncName, FuncHash);
- if (std::error_code EC = Record.getError())
- return EC;
+Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName,
+ uint64_t FuncHash,
+ std::vector<uint64_t> &Counts) {
+ Expected<InstrProfRecord> Record = getInstrProfRecord(FuncName, FuncHash);
+ if (Error E = Record.takeError())
+ return error(std::move(E));
Counts = Record.get().Counts;
return success();
}
-std::error_code IndexedInstrProfReader::readNextRecord(
- InstrProfRecord &Record) {
+Error IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) {
static unsigned RecordIndex = 0;
ArrayRef<InstrProfRecord> Data;
- std::error_code EC = Index->getRecords(Data);
- if (EC != instrprof_error::success)
- return error(EC);
+ Error E = Index->getRecords(Data);
+ if (E)
+ return error(std::move(E));
Record = Data[RecordIndex++];
if (RecordIndex >= Data.size()) {
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index f5227248af20..e25299ef6706 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -20,10 +20,59 @@
using namespace llvm;
-namespace {
-static support::endianness ValueProfDataEndianness = support::little;
+// A struct that defines how the data stream should be patched. For indexed
+// profiling, only the uint64_t data type is needed.
+struct PatchItem {
+ uint64_t Pos; // Where to patch.
+ uint64_t *D; // Pointer to an array of source data.
+ int N; // Number of elements in \c D array.
+};
+
+namespace llvm {
+// A wrapper class that abstracts the writer stream and supports byte-level
+// back patching.
+class ProfOStream {
+
+public:
+ ProfOStream(llvm::raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
+ ProfOStream(llvm::raw_string_ostream &STR)
+ : IsFDOStream(false), OS(STR), LE(STR) {}
+
+ uint64_t tell() { return OS.tell(); }
+ void write(uint64_t V) { LE.write<uint64_t>(V); }
+ // \c patch can only be called when all data is written and flushed.
+ // For raw_string_ostream, the patch is done on the target string
+ // directly and it won't be reflected in the stream's internal buffer.
+ void patch(PatchItem *P, int NItems) {
+ using namespace support;
+ if (IsFDOStream) {
+ llvm::raw_fd_ostream &FDOStream = static_cast<llvm::raw_fd_ostream &>(OS);
+ for (int K = 0; K < NItems; K++) {
+ FDOStream.seek(P[K].Pos);
+ for (int I = 0; I < P[K].N; I++)
+ write(P[K].D[I]);
+ }
+ } else {
+ llvm::raw_string_ostream &SOStream =
+ static_cast<llvm::raw_string_ostream &>(OS);
+ std::string &Data = SOStream.str(); // with flush
+ for (int K = 0; K < NItems; K++) {
+ for (int I = 0; I < P[K].N; I++) {
+ uint64_t Bytes = endian::byte_swap<uint64_t, little>(P[K].D[I]);
+ Data.replace(P[K].Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+ (const char *)&Bytes, sizeof(uint64_t));
+ }
+ }
+ }
+ }
+ // If \c OS is an instance of \c raw_fd_ostream, this field will be
+ // true. Otherwise, \c OS will be a raw_string_ostream.
+ bool IsFDOStream;
+ raw_ostream &OS;
+ support::endian::Writer<support::little> LE;
+};
-class InstrProfRecordTrait {
+class InstrProfRecordWriterTrait {
public:
typedef StringRef key_type;
typedef StringRef key_type_ref;
@@ -34,6 +83,10 @@ public:
typedef uint64_t hash_value_type;
typedef uint64_t offset_type;
+ support::endianness ValueProfDataEndianness;
+ InstrProfSummaryBuilder *SummaryBuilder;
+
+ InstrProfRecordWriterTrait() : ValueProfDataEndianness(support::little) {}
static hash_value_type ComputeHash(key_type_ref K) {
return IndexedInstrProf::ComputeHash(K);
}
@@ -61,16 +114,16 @@ public:
return std::make_pair(N, M);
}
- static void EmitKey(raw_ostream &Out, key_type_ref K, offset_type N){
+ void EmitKey(raw_ostream &Out, key_type_ref K, offset_type N) {
Out.write(K.data(), N);
}
- static void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V,
- offset_type) {
+ void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
using namespace llvm::support;
endian::Writer<little> LE(Out);
for (const auto &ProfileData : *V) {
const InstrProfRecord &ProfRecord = ProfileData.second;
+ SummaryBuilder->addRecord(ProfRecord);
LE.write<uint64_t>(ProfileData.first); // Function hash
LE.write<uint64_t>(ProfRecord.Counts.size());
@@ -88,14 +141,22 @@ public:
};
}
+InstrProfWriter::InstrProfWriter(bool Sparse)
+ : Sparse(Sparse), FunctionData(), ProfileKind(PF_Unknown),
+ InfoObj(new InstrProfRecordWriterTrait()) {}
+
+InstrProfWriter::~InstrProfWriter() { delete InfoObj; }
+
// Internal interface for testing purposes only.
void InstrProfWriter::setValueProfDataEndianness(
support::endianness Endianness) {
- ValueProfDataEndianness = Endianness;
+ InfoObj->ValueProfDataEndianness = Endianness;
+}
+void InstrProfWriter::setOutputSparse(bool Sparse) {
+ this->Sparse = Sparse;
}
-std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I,
- uint64_t Weight) {
+Error InstrProfWriter::addRecord(InstrProfRecord &&I, uint64_t Weight) {
auto &ProfileDataMap = FunctionData[I.Name];
bool NewFunc;
@@ -104,73 +165,128 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I,
ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord()));
InstrProfRecord &Dest = Where->second;
- instrprof_error Result = instrprof_error::success;
if (NewFunc) {
// We've never seen a function with this name and hash, add it.
Dest = std::move(I);
// Fix up the name to avoid dangling reference.
Dest.Name = FunctionData.find(Dest.Name)->getKey();
if (Weight > 1)
- Result = Dest.scale(Weight);
+ Dest.scale(Weight);
} else {
// We're updating a function we've seen before.
- Result = Dest.merge(I, Weight);
+ Dest.merge(I, Weight);
}
Dest.sortValueData();
- // We keep track of the max function count as we go for simplicity.
- // Update this statistic no matter the result of the merge.
- if (Dest.Counts[0] > MaxFunctionCount)
- MaxFunctionCount = Dest.Counts[0];
+ return Dest.takeError();
+}
+
+bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
+ if (!Sparse)
+ return true;
+ for (const auto &Func : PD) {
+ const InstrProfRecord &IPR = Func.second;
+ if (std::any_of(IPR.Counts.begin(), IPR.Counts.end(),
+ [](uint64_t Count) { return Count > 0; }))
+ return true;
+ }
+ return false;
+}
- return Result;
+static void setSummary(IndexedInstrProf::Summary *TheSummary,
+ ProfileSummary &PS) {
+ using namespace IndexedInstrProf;
+ std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
+ TheSummary->NumSummaryFields = Summary::NumKinds;
+ TheSummary->NumCutoffEntries = Res.size();
+ TheSummary->set(Summary::MaxFunctionCount, PS.getMaxFunctionCount());
+ TheSummary->set(Summary::MaxBlockCount, PS.getMaxCount());
+ TheSummary->set(Summary::MaxInternalBlockCount, PS.getMaxInternalCount());
+ TheSummary->set(Summary::TotalBlockCount, PS.getTotalCount());
+ TheSummary->set(Summary::TotalNumBlocks, PS.getNumCounts());
+ TheSummary->set(Summary::TotalNumFunctions, PS.getNumFunctions());
+ for (unsigned I = 0; I < Res.size(); I++)
+ TheSummary->setEntry(I, Res[I]);
}
-std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) {
- OnDiskChainedHashTableGenerator<InstrProfRecordTrait> Generator;
+void InstrProfWriter::writeImpl(ProfOStream &OS) {
+ OnDiskChainedHashTableGenerator<InstrProfRecordWriterTrait> Generator;
+
+ using namespace IndexedInstrProf;
+ InstrProfSummaryBuilder ISB(ProfileSummaryBuilder::DefaultCutoffs);
+ InfoObj->SummaryBuilder = &ISB;
// Populate the hash table generator.
for (const auto &I : FunctionData)
- Generator.insert(I.getKey(), &I.getValue());
-
- using namespace llvm::support;
- endian::Writer<little> LE(OS);
-
+ if (shouldEncodeData(I.getValue()))
+ Generator.insert(I.getKey(), &I.getValue());
// Write the header.
IndexedInstrProf::Header Header;
Header.Magic = IndexedInstrProf::Magic;
- Header.Version = IndexedInstrProf::Version;
- Header.MaxFunctionCount = MaxFunctionCount;
+ Header.Version = IndexedInstrProf::ProfVersion::CurrentVersion;
+ if (ProfileKind == PF_IRLevel)
+ Header.Version |= VARIANT_MASK_IR_PROF;
+ Header.Unused = 0;
Header.HashType = static_cast<uint64_t>(IndexedInstrProf::HashType);
Header.HashOffset = 0;
int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t);
- // Only write out all the fields execpt 'HashOffset'. We need
+ // Only write out all the fields except 'HashOffset'. We need
// to remember the offset of that field to allow back patching
// later.
for (int I = 0; I < N - 1; I++)
- LE.write<uint64_t>(reinterpret_cast<uint64_t *>(&Header)[I]);
+ OS.write(reinterpret_cast<uint64_t *>(&Header)[I]);
- // Save a space to write the hash table start location.
- uint64_t HashTableStartLoc = OS.tell();
+ // Save the location of the Header.HashOffset field in \c OS.
+ uint64_t HashTableStartFieldOffset = OS.tell();
// Reserve the space for HashOffset field.
- LE.write<uint64_t>(0);
- // Write the hash table.
- uint64_t HashTableStart = Generator.Emit(OS);
+ OS.write(0);
+
+ // Reserve space to write profile summary data.
+ uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size();
+ uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries);
+ // Remember the summary offset.
+ uint64_t SummaryOffset = OS.tell();
+ for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++)
+ OS.write(0);
- return std::make_pair(HashTableStartLoc, HashTableStart);
+ // Write the hash table.
+ uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj);
+
+ // Allocate space for data to be serialized out.
+ std::unique_ptr<IndexedInstrProf::Summary> TheSummary =
+ IndexedInstrProf::allocSummary(SummarySize);
+ // Compute the Summary and copy the data to the data
+ // structure to be serialized out (to disk or buffer).
+ std::unique_ptr<ProfileSummary> PS = ISB.getSummary();
+ setSummary(TheSummary.get(), *PS);
+ InfoObj->SummaryBuilder = nullptr;
+
+ // Now do the final patch:
+ PatchItem PatchItems[] = {
+ // Patch the Header.HashOffset field.
+ {HashTableStartFieldOffset, &HashTableStart, 1},
+ // Patch the summary data.
+ {SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
+ (int)(SummarySize / sizeof(uint64_t))}};
+ OS.patch(PatchItems, sizeof(PatchItems) / sizeof(*PatchItems));
}
void InstrProfWriter::write(raw_fd_ostream &OS) {
// Write the hash table.
- auto TableStart = writeImpl(OS);
+ ProfOStream POS(OS);
+ writeImpl(POS);
+}
- // Go back and fill in the hash table start.
- using namespace support;
- OS.seek(TableStart.first);
- // Now patch the HashOffset field previously reserved.
- endian::Writer<little>(OS).write<uint64_t>(TableStart.second);
+std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
+ std::string Data;
+ llvm::raw_string_ostream OS(Data);
+ ProfOStream POS(OS);
+ // Write the hash table.
+ writeImpl(POS);
+ // Return this in an aligned memory buffer.
+ return MemoryBuffer::getMemBufferCopy(Data);
}
static const char *ValueProfKindStr[] = {
@@ -218,29 +334,16 @@ void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func,
}
void InstrProfWriter::writeText(raw_fd_ostream &OS) {
+ if (ProfileKind == PF_IRLevel)
+ OS << "# IR level Instrumentation Flag\n:ir\n";
InstrProfSymtab Symtab;
for (const auto &I : FunctionData)
- Symtab.addFuncName(I.getKey());
+ if (shouldEncodeData(I.getValue()))
+ Symtab.addFuncName(I.getKey());
Symtab.finalizeSymtab();
for (const auto &I : FunctionData)
- for (const auto &Func : I.getValue())
- writeRecordInText(Func.second, Symtab, OS);
-}
-
-std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
- std::string Data;
- llvm::raw_string_ostream OS(Data);
- // Write the hash table.
- auto TableStart = writeImpl(OS);
- OS.flush();
-
- // Go back and fill in the hash table start.
- using namespace support;
- uint64_t Bytes = endian::byte_swap<uint64_t, little>(TableStart.second);
- Data.replace(TableStart.first, sizeof(uint64_t), (const char *)&Bytes,
- sizeof(uint64_t));
-
- // Return this in an aligned memory buffer.
- return MemoryBuffer::getMemBufferCopy(Data);
+ if (shouldEncodeData(I.getValue()))
+ for (const auto &Func : I.getValue())
+ writeRecordInText(Func.second, Symtab, OS);
}
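The new ProfOStream/PatchItem machinery reserves fixed-size slots while streaming and patches them once the final values (hash table offset, summary) are known. A standalone sketch of the same back-patching idea over a plain std::string buffer, with simplified hypothetical names:

#include <cstdint>
#include <string>

// Write V at byte offset Pos in little-endian order (assumes Pos + 8 <= Buf.size()).
static void patchU64LE(std::string &Buf, size_t Pos, uint64_t V) {
  for (int I = 0; I < 8; ++I)
    Buf[Pos + I] = char((V >> (8 * I)) & 0xff);
}

static void appendU64LE(std::string &Buf, uint64_t V) {
  for (int I = 0; I < 8; ++I)
    Buf.push_back(char((V >> (8 * I)) & 0xff));
}

int main() {
  std::string Buf;
  appendU64LE(Buf, 0x1CC0FFEE);      // e.g. a magic number
  size_t OffsetSlot = Buf.size();    // remember where the placeholder lives
  appendU64LE(Buf, 0);               // reserve the HashOffset-style slot
  appendU64LE(Buf, 7);               // ... emit the variable-length payload ...
  uint64_t PayloadStart = OffsetSlot + 8;
  patchU64LE(Buf, OffsetSlot, PayloadStart); // back-patch once the value is known
  return 0;
}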
diff --git a/lib/ProfileData/LLVMBuild.txt b/lib/ProfileData/LLVMBuild.txt
index a7f471fc582e..b3d749feda61 100644
--- a/lib/ProfileData/LLVMBuild.txt
+++ b/lib/ProfileData/LLVMBuild.txt
@@ -15,8 +15,11 @@
;
;===------------------------------------------------------------------------===;
+[common]
+subdirectories = Coverage
+
[component_0]
type = Library
name = ProfileData
parent = Libraries
-required_libraries = Core Support Object
+required_libraries = Core Support
diff --git a/lib/ProfileData/Makefile b/lib/ProfileData/Makefile
deleted file mode 100644
index 26743612d3d7..000000000000
--- a/lib/ProfileData/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/ProfileData/Makefile ----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMProfileData
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/ProfileData/ProfileSummaryBuilder.cpp b/lib/ProfileData/ProfileSummaryBuilder.cpp
new file mode 100644
index 000000000000..f8c3717007b3
--- /dev/null
+++ b/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -0,0 +1,116 @@
+//=-- ProfileSummaryBuilder.cpp - Profile summary computation --------------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for computing profile summary data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+// A set of cutoff values. Each value, when divided by ProfileSummary::Scale
+// (which is 1000000) is a desired percentile of total counts.
+static const uint32_t DefaultCutoffsData[] = {
+ 10000, /* 1% */
+ 100000, /* 10% */
+ 200000, 300000, 400000, 500000, 600000, 700000,
+ 800000, 900000, 950000, 990000, 999000, 999900, 999990, 999999};
+const ArrayRef<uint32_t> ProfileSummaryBuilder::DefaultCutoffs =
+ DefaultCutoffsData;
+
+void InstrProfSummaryBuilder::addRecord(const InstrProfRecord &R) {
+ // The first counter is not necessarily an entry count for IR
+ // instrumentation profiles.
+ // Eventually MaxFunctionCount will become obsolete and this can be
+ // removed.
+ addEntryCount(R.Counts[0]);
+ for (size_t I = 1, E = R.Counts.size(); I < E; ++I)
+ addInternalCount(R.Counts[I]);
+}
+
+// To compute the detailed summary, we consider each line containing samples as
+// equivalent to a block with a count in the instrumented profile.
+void SampleProfileSummaryBuilder::addRecord(
+ const sampleprof::FunctionSamples &FS) {
+ NumFunctions++;
+ if (FS.getHeadSamples() > MaxFunctionCount)
+ MaxFunctionCount = FS.getHeadSamples();
+ for (const auto &I : FS.getBodySamples())
+ addCount(I.second.getSamples());
+}
+
+// Compute the detailed summary from the cutoff percentages stored in this
+// builder; the result is a vector of (Cutoff, MinCount, NumCounts) triplets.
+void ProfileSummaryBuilder::computeDetailedSummary() {
+ if (DetailedSummaryCutoffs.empty())
+ return;
+ auto Iter = CountFrequencies.begin();
+ auto End = CountFrequencies.end();
+ std::sort(DetailedSummaryCutoffs.begin(), DetailedSummaryCutoffs.end());
+
+ uint32_t CountsSeen = 0;
+ uint64_t CurrSum = 0, Count = 0;
+
+ for (uint32_t Cutoff : DetailedSummaryCutoffs) {
+ assert(Cutoff <= 999999);
+ APInt Temp(128, TotalCount);
+ APInt N(128, Cutoff);
+ APInt D(128, ProfileSummary::Scale);
+ Temp *= N;
+ Temp = Temp.sdiv(D);
+ uint64_t DesiredCount = Temp.getZExtValue();
+ assert(DesiredCount <= TotalCount);
+ while (CurrSum < DesiredCount && Iter != End) {
+ Count = Iter->first;
+ uint32_t Freq = Iter->second;
+ CurrSum += (Count * Freq);
+ CountsSeen += Freq;
+ Iter++;
+ }
+ assert(CurrSum >= DesiredCount);
+ ProfileSummaryEntry PSE = {Cutoff, Count, CountsSeen};
+ DetailedSummary.push_back(PSE);
+ }
+}
+
+std::unique_ptr<ProfileSummary> SampleProfileSummaryBuilder::getSummary() {
+ computeDetailedSummary();
+ return llvm::make_unique<ProfileSummary>(
+ ProfileSummary::PSK_Sample, DetailedSummary, TotalCount, MaxCount, 0,
+ MaxFunctionCount, NumCounts, NumFunctions);
+}
+
+std::unique_ptr<ProfileSummary> InstrProfSummaryBuilder::getSummary() {
+ computeDetailedSummary();
+ return llvm::make_unique<ProfileSummary>(
+ ProfileSummary::PSK_Instr, DetailedSummary, TotalCount, MaxCount,
+ MaxInternalBlockCount, MaxFunctionCount, NumCounts, NumFunctions);
+}
+
+void InstrProfSummaryBuilder::addEntryCount(uint64_t Count) {
+ addCount(Count);
+ NumFunctions++;
+ if (Count > MaxFunctionCount)
+ MaxFunctionCount = Count;
+}
+
+void InstrProfSummaryBuilder::addInternalCount(uint64_t Count) {
+ addCount(Count);
+ if (Count > MaxInternalBlockCount)
+ MaxInternalBlockCount = Count;
+}
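For reference, the cutoff math above (each cutoff is a fraction of TotalCount scaled by 10^6, walked over a descending count-to-frequency map) in a standalone sketch; the names and the __uint128_t stand-in for the 128-bit APInt arithmetic are illustrative only:

#include <cstdint>
#include <functional>
#include <map>
#include <vector>

struct SummaryEntry {
  uint32_t Cutoff;    // scaled by 10^6, e.g. 990000 == 99%
  uint64_t MinCount;  // smallest count needed to reach this cutoff
  uint64_t NumCounts; // how many counts were consumed to get there
};

std::vector<SummaryEntry> detailedSummary(
    const std::map<uint64_t, uint64_t, std::greater<uint64_t>> &CountFreq,
    const std::vector<uint32_t> &Cutoffs, // sorted ascending
    uint64_t TotalCount) {
  std::vector<SummaryEntry> Res;
  auto It = CountFreq.begin();
  uint64_t CurrSum = 0, CountsSeen = 0, Count = 0;
  for (uint32_t Cutoff : Cutoffs) {
    // The real code uses a 128-bit APInt here to avoid overflow.
    uint64_t Desired = uint64_t((__uint128_t)TotalCount * Cutoff / 1000000);
    while (CurrSum < Desired && It != CountFreq.end()) {
      Count = It->first;
      CurrSum += Count * It->second;
      CountsSeen += It->second;
      ++It;
    }
    Res.push_back({Cutoff, Count, CountsSeen});
  }
  return Res;
}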
diff --git a/lib/ProfileData/SampleProf.cpp b/lib/ProfileData/SampleProf.cpp
index 9ded757f2b28..cb0246113d80 100644
--- a/lib/ProfileData/SampleProf.cpp
+++ b/lib/ProfileData/SampleProf.cpp
@@ -20,6 +20,9 @@ using namespace llvm::sampleprof;
using namespace llvm;
namespace {
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
class SampleProfErrorCategoryType : public std::error_category {
const char *name() const LLVM_NOEXCEPT override { return "llvm.sampleprof"; }
std::string message(int IE) const override {
@@ -71,20 +74,7 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
-void LineLocation::dump() const { print(dbgs()); }
-
-void CallsiteLocation::print(raw_ostream &OS) const {
- LineLocation::print(OS);
- OS << ": inlined callee: " << CalleeName;
-}
-
-void CallsiteLocation::dump() const { print(dbgs()); }
-
-inline raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
- const CallsiteLocation &Loc) {
- Loc.print(OS);
- return OS;
-}
+LLVM_DUMP_METHOD void LineLocation::dump() const { print(dbgs()); }
/// \brief Print the sample record to the stream \p OS indented by \p Indent.
void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
@@ -97,7 +87,7 @@ void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
OS << "\n";
}
-void SampleRecord::dump() const { print(dbgs(), 0); }
+LLVM_DUMP_METHOD void SampleRecord::dump() const { print(dbgs(), 0); }
raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
const SampleRecord &Sample) {
@@ -127,11 +117,11 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
OS.indent(Indent);
if (CallsiteSamples.size() > 0) {
OS << "Samples collected in inlined callsites {\n";
- SampleSorter<CallsiteLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
CallsiteSamples);
for (const auto &CS : SortedCallsiteSamples.get()) {
OS.indent(Indent + 2);
- OS << CS->first << ": ";
+ OS << CS->first << ": inlined callee: " << CS->second.getName() << ": ";
CS->second.print(OS, Indent + 4);
}
OS << "}\n";
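The callsite map key shrinks from CallsiteLocation (line, discriminator, callee name) to LineLocation (line, discriminator); the callee name now lives on the nested FunctionSamples via setName/getName. A schematic sketch of the resulting shape, using simplified hypothetical types rather than the sampleprof classes:

#include <cstdint>
#include <map>
#include <string>
#include <tuple>

struct LineLoc {
  uint32_t LineOffset, Discriminator;
  bool operator<(const LineLoc &O) const {
    return std::tie(LineOffset, Discriminator) <
           std::tie(O.LineOffset, O.Discriminator);
  }
};

struct CalleeSamples {
  std::string Name;          // previously carried in the map key
  uint64_t TotalSamples = 0;
};

int main() {
  std::map<LineLoc, CalleeSamples> Callsites; // keyed by location only
  CalleeSamples &CS = Callsites[LineLoc{3, 0}];
  CS.Name = "inlined_callee"; // set explicitly, as setName() now does
  CS.TotalSamples = 100;
  return Callsites.size() == 1 ? 0 : 1;
}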
diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
index 93cd87bb82f8..af80b036a5bb 100644
--- a/lib/ProfileData/SampleProfReader.cpp
+++ b/lib/ProfileData/SampleProfReader.cpp
@@ -22,7 +22,7 @@
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/LEB128.h"
@@ -68,11 +68,8 @@ static bool ParseHead(const StringRef &Input, StringRef &FName,
return true;
}
-
/// \brief Returns true if line offset \p L is legal (only has 16 bits).
-static bool isOffsetLegal(unsigned L) {
- return (L & 0xffff) == L;
-}
+static bool isOffsetLegal(unsigned L) { return (L & 0xffff) == L; }
/// \brief Parse \p Input as line sample.
///
@@ -180,6 +177,7 @@ std::error_code SampleProfileReaderText::read() {
}
Profiles[FName] = FunctionSamples();
FunctionSamples &FProfile = Profiles[FName];
+ FProfile.setName(FName);
MergeResult(Result, FProfile.addTotalSamples(NumSamples));
MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples));
InlineStack.clear();
@@ -202,7 +200,8 @@ std::error_code SampleProfileReaderText::read() {
InlineStack.pop_back();
}
FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt(
- CallsiteLocation(LineOffset, Discriminator, FName));
+ LineLocation(LineOffset, Discriminator));
+ FSamples.setName(FName);
MergeResult(Result, FSamples.addTotalSamples(NumSamples));
InlineStack.push_back(&FSamples);
} else {
@@ -220,6 +219,8 @@ std::error_code SampleProfileReaderText::read() {
}
}
}
+ if (Result == sampleprof_error::success)
+ computeSummary();
return Result;
}
@@ -351,8 +352,9 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
if (std::error_code EC = FName.getError())
return EC;
- FunctionSamples &CalleeProfile = FProfile.functionSamplesAt(
- CallsiteLocation(*LineOffset, *Discriminator, *FName));
+ FunctionSamples &CalleeProfile =
+ FProfile.functionSamplesAt(LineLocation(*LineOffset, *Discriminator));
+ CalleeProfile.setName(*FName);
if (std::error_code EC = readProfile(CalleeProfile))
return EC;
}
@@ -372,6 +374,7 @@ std::error_code SampleProfileReaderBinary::read() {
Profiles[*FName] = FunctionSamples();
FunctionSamples &FProfile = Profiles[*FName];
+ FProfile.setName(*FName);
FProfile.addHeadSamples(*NumHeadSamples);
@@ -400,6 +403,9 @@ std::error_code SampleProfileReaderBinary::readHeader() {
else if (*Version != SPVersion())
return sampleprof_error::unsupported_version;
+ if (std::error_code EC = readSummary())
+ return EC;
+
// Read the name table.
auto Size = readNumber<uint32_t>();
if (std::error_code EC = Size.getError())
@@ -415,6 +421,62 @@ std::error_code SampleProfileReaderBinary::readHeader() {
return sampleprof_error::success;
}
+std::error_code SampleProfileReaderBinary::readSummaryEntry(
+ std::vector<ProfileSummaryEntry> &Entries) {
+ auto Cutoff = readNumber<uint64_t>();
+ if (std::error_code EC = Cutoff.getError())
+ return EC;
+
+ auto MinBlockCount = readNumber<uint64_t>();
+ if (std::error_code EC = MinBlockCount.getError())
+ return EC;
+
+ auto NumBlocks = readNumber<uint64_t>();
+ if (std::error_code EC = NumBlocks.getError())
+ return EC;
+
+ Entries.emplace_back(*Cutoff, *MinBlockCount, *NumBlocks);
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileReaderBinary::readSummary() {
+ auto TotalCount = readNumber<uint64_t>();
+ if (std::error_code EC = TotalCount.getError())
+ return EC;
+
+ auto MaxBlockCount = readNumber<uint64_t>();
+ if (std::error_code EC = MaxBlockCount.getError())
+ return EC;
+
+ auto MaxFunctionCount = readNumber<uint64_t>();
+ if (std::error_code EC = MaxFunctionCount.getError())
+ return EC;
+
+ auto NumBlocks = readNumber<uint64_t>();
+ if (std::error_code EC = NumBlocks.getError())
+ return EC;
+
+ auto NumFunctions = readNumber<uint64_t>();
+ if (std::error_code EC = NumFunctions.getError())
+ return EC;
+
+ auto NumSummaryEntries = readNumber<uint64_t>();
+ if (std::error_code EC = NumSummaryEntries.getError())
+ return EC;
+
+ std::vector<ProfileSummaryEntry> Entries;
+ for (unsigned i = 0; i < *NumSummaryEntries; i++) {
+ std::error_code EC = readSummaryEntry(Entries);
+ if (EC != sampleprof_error::success)
+ return EC;
+ }
+ Summary = llvm::make_unique<ProfileSummary>(
+ ProfileSummary::PSK_Sample, Entries, *TotalCount, *MaxBlockCount, 0,
+ *MaxFunctionCount, *NumBlocks, *NumFunctions);
+
+ return sampleprof_error::success;
+}
+
bool SampleProfileReaderBinary::hasFormat(const MemoryBuffer &Buffer) {
const uint8_t *Data =
reinterpret_cast<const uint8_t *>(Buffer.getBufferStart());
@@ -518,6 +580,7 @@ std::error_code SampleProfileReaderGCC::readFunctionProfiles() {
if (std::error_code EC = readOneFunctionProfile(Stack, true, 0))
return EC;
+ computeSummary();
return sampleprof_error::success;
}
@@ -562,8 +625,9 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
uint32_t LineOffset = Offset >> 16;
uint32_t Discriminator = Offset & 0xffff;
FProfile = &CallerProfile->functionSamplesAt(
- CallsiteLocation(LineOffset, Discriminator, Name));
+ LineLocation(LineOffset, Discriminator));
}
+ FProfile->setName(Name);
for (uint32_t I = 0; I < NumPosCounts; ++I) {
uint32_t Offset;
@@ -669,7 +733,7 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) {
///
/// \returns an error code indicating the status of the buffer.
static ErrorOr<std::unique_ptr<MemoryBuffer>>
-setupMemoryBuffer(std::string Filename) {
+setupMemoryBuffer(const Twine &Filename) {
auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = BufferOrErr.getError())
return EC;
@@ -692,7 +756,7 @@ setupMemoryBuffer(std::string Filename) {
///
/// \returns an error code indicating the status of the created reader.
ErrorOr<std::unique_ptr<SampleProfileReader>>
-SampleProfileReader::create(StringRef Filename, LLVMContext &C) {
+SampleProfileReader::create(const Twine &Filename, LLVMContext &C) {
auto BufferOrError = setupMemoryBuffer(Filename);
if (std::error_code EC = BufferOrError.getError())
return EC;
@@ -725,3 +789,14 @@ SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C) {
return std::move(Reader);
}
+
+// For text and GCC file formats, we compute the summary after reading the
+// profile. Binary format has the profile summary in its header.
+void SampleProfileReader::computeSummary() {
+ SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+ for (const auto &I : Profiles) {
+ const FunctionSamples &Profile = I.second;
+ Builder.addRecord(Profile);
+ }
+ Summary = Builder.getSummary();
+}
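readSummary() expects the binary profile header to carry the summary as a flat sequence of ULEB128-encoded integers: total count, max block count, max function count, number of blocks, number of functions, the entry count, then (Cutoff, MinCount, NumCounts) per entry. The writer side appears in SampleProfWriter.cpp below; a standalone sketch of ULEB128 encode/decode itself:

#include <cstdint>
#include <vector>

static void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V != 0)
      Byte |= 0x80;        // more bytes follow
    Out.push_back(Byte);
  } while (V != 0);
}

static uint64_t decodeULEB128(const uint8_t *&P) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = *P++;
    Value |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  return Value;
}

int main() {
  std::vector<uint8_t> Buf;
  encodeULEB128(624485, Buf);      // encodes as 0xE5 0x8E 0x26
  const uint8_t *P = Buf.data();
  return decodeULEB128(P) == 624485 ? 0 : 1;
}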
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index 51feee5ad7d1..4fa71288f8d9 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -37,11 +37,9 @@ using namespace llvm;
///
/// The format used here is more structured and deliberate because
/// it needs to be parsed by the SampleProfileReaderText class.
-std::error_code SampleProfileWriterText::write(StringRef FName,
- const FunctionSamples &S) {
+std::error_code SampleProfileWriterText::write(const FunctionSamples &S) {
auto &OS = *OutputStream;
-
- OS << FName << ":" << S.getTotalSamples();
+ OS << S.getName() << ":" << S.getTotalSamples();
if (Indent == 0)
OS << ":" << S.getHeadSamples();
OS << "\n";
@@ -63,18 +61,18 @@ std::error_code SampleProfileWriterText::write(StringRef FName,
OS << "\n";
}
- SampleSorter<CallsiteLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
S.getCallsiteSamples());
Indent += 1;
for (const auto &I : SortedCallsiteSamples.get()) {
- CallsiteLocation Loc = I->first;
+ LineLocation Loc = I->first;
const FunctionSamples &CalleeSamples = I->second;
OS.indent(Indent);
if (Loc.Discriminator == 0)
OS << Loc.LineOffset << ": ";
else
OS << Loc.LineOffset << "." << Loc.Discriminator << ": ";
- if (std::error_code EC = write(Loc.CalleeName, CalleeSamples))
+ if (std::error_code EC = write(CalleeSamples))
return EC;
}
Indent -= 1;
@@ -105,9 +103,8 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
// Recursively add all the names for inlined callsites.
for (const auto &J : S.getCallsiteSamples()) {
- CallsiteLocation Loc = J.first;
const FunctionSamples &CalleeSamples = J.second;
- addName(Loc.CalleeName);
+ addName(CalleeSamples.getName());
addNames(CalleeSamples);
}
}
@@ -120,6 +117,10 @@ std::error_code SampleProfileWriterBinary::writeHeader(
encodeULEB128(SPMagic(), OS);
encodeULEB128(SPVersion(), OS);
+ computeSummary(ProfileMap);
+ if (auto EC = writeSummary())
+ return EC;
+
// Generate the name table for all the functions referenced in the profile.
for (const auto &I : ProfileMap) {
addName(I.first());
@@ -132,15 +133,29 @@ std::error_code SampleProfileWriterBinary::writeHeader(
OS << N.first;
encodeULEB128(0, OS);
}
-
return sampleprof_error::success;
}
-std::error_code SampleProfileWriterBinary::writeBody(StringRef FName,
- const FunctionSamples &S) {
+std::error_code SampleProfileWriterBinary::writeSummary() {
+ auto &OS = *OutputStream;
+ encodeULEB128(Summary->getTotalCount(), OS);
+ encodeULEB128(Summary->getMaxCount(), OS);
+ encodeULEB128(Summary->getMaxFunctionCount(), OS);
+ encodeULEB128(Summary->getNumCounts(), OS);
+ encodeULEB128(Summary->getNumFunctions(), OS);
+ std::vector<ProfileSummaryEntry> &Entries = Summary->getDetailedSummary();
+ encodeULEB128(Entries.size(), OS);
+ for (auto Entry : Entries) {
+ encodeULEB128(Entry.Cutoff, OS);
+ encodeULEB128(Entry.MinCount, OS);
+ encodeULEB128(Entry.NumCounts, OS);
+ }
+ return sampleprof_error::success;
+}
+std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
auto &OS = *OutputStream;
- if (std::error_code EC = writeNameIdx(FName))
+ if (std::error_code EC = writeNameIdx(S.getName()))
return EC;
encodeULEB128(S.getTotalSamples(), OS);
@@ -166,11 +181,11 @@ std::error_code SampleProfileWriterBinary::writeBody(StringRef FName,
// Recursively emit all the callsite samples.
encodeULEB128(S.getCallsiteSamples().size(), OS);
for (const auto &J : S.getCallsiteSamples()) {
- CallsiteLocation Loc = J.first;
+ LineLocation Loc = J.first;
const FunctionSamples &CalleeSamples = J.second;
encodeULEB128(Loc.LineOffset, OS);
encodeULEB128(Loc.Discriminator, OS);
- if (std::error_code EC = writeBody(Loc.CalleeName, CalleeSamples))
+ if (std::error_code EC = writeBody(CalleeSamples))
return EC;
}
@@ -180,10 +195,9 @@ std::error_code SampleProfileWriterBinary::writeBody(StringRef FName,
/// \brief Write samples of a top-level function to a binary file.
///
/// \returns true if the samples were written successfully, false otherwise.
-std::error_code SampleProfileWriterBinary::write(StringRef FName,
- const FunctionSamples &S) {
+std::error_code SampleProfileWriterBinary::write(const FunctionSamples &S) {
encodeULEB128(S.getHeadSamples(), *OutputStream);
- return writeBody(FName, S);
+ return writeBody(S);
}
/// \brief Create a sample profile file writer based on the specified format.
@@ -238,3 +252,13 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
return std::move(Writer);
}
+
+void SampleProfileWriter::computeSummary(
+ const StringMap<FunctionSamples> &ProfileMap) {
+ SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
+ for (const auto &I : ProfileMap) {
+ const FunctionSamples &Profile = I.second;
+ Builder.addRecord(Profile);
+ }
+ Summary = Builder.getSummary();
+}
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 19b8221b60cb..f9370b85234f 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringExtras.h"
@@ -501,7 +502,9 @@ powerOf5(integerPart *dst, unsigned int power)
/* Now result is in p1 with partsCount parts and p2 is scratch
space. */
- tmp = p1, p1 = p2, p2 = tmp;
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
}
pow5 += pc;
@@ -3940,22 +3943,68 @@ APFloat::makeZero(bool Negative) {
category = fcZero;
sign = Negative;
exponent = semantics->minExponent-1;
- APInt::tcSet(significandParts(), 0, partCount());
+ APInt::tcSet(significandParts(), 0, partCount());
+}
+
+void APFloat::makeQuiet() {
+ assert(isNaN());
+ APInt::tcSetBit(significandParts(), semantics->precision - 2);
}
-APFloat llvm::scalbn(APFloat X, int Exp) {
- if (X.isInfinity() || X.isZero() || X.isNaN())
- return X;
+int llvm::ilogb(const APFloat &Arg) {
+ if (Arg.isNaN())
+ return APFloat::IEK_NaN;
+ if (Arg.isZero())
+ return APFloat::IEK_Zero;
+ if (Arg.isInfinity())
+ return APFloat::IEK_Inf;
+ if (!Arg.isDenormal())
+ return Arg.exponent;
+
+ APFloat Normalized(Arg);
+ int SignificandBits = Arg.getSemantics().precision - 1;
+ Normalized.exponent += SignificandBits;
+ Normalized.normalize(APFloat::rmNearestTiesToEven, lfExactlyZero);
+ return Normalized.exponent - SignificandBits;
+}
+
+APFloat llvm::scalbn(APFloat X, int Exp, APFloat::roundingMode RoundingMode) {
auto MaxExp = X.getSemantics().maxExponent;
auto MinExp = X.getSemantics().minExponent;
- if (Exp > (MaxExp - X.exponent))
- // Overflow saturates to infinity.
- return APFloat::getInf(X.getSemantics(), X.isNegative());
- if (Exp < (MinExp - X.exponent))
- // Underflow saturates to zero.
- return APFloat::getZero(X.getSemantics(), X.isNegative());
-
- X.exponent += Exp;
+
+ // If Exp is wildly out-of-scale, simply adding it to X.exponent will
+ // overflow; clamp it to a safe range before adding, but ensure that the range
+ // is large enough that the clamp does not change the result. The range we
+ // need to support is the difference between the largest possible exponent and
+ // the normalized exponent of half the smallest denormal.
+
+ int SignificandBits = X.getSemantics().precision - 1;
+ int MaxIncrement = MaxExp - (MinExp - SignificandBits) + 1;
+
+ // Clamp to one past the range ends to let normalize handle overflow.
+ X.exponent += std::min(std::max(Exp, -MaxIncrement - 1), MaxIncrement);
+ X.normalize(RoundingMode, lfExactlyZero);
+ if (X.isNaN())
+ X.makeQuiet();
return X;
}
+
+APFloat llvm::frexp(const APFloat &Val, int &Exp, APFloat::roundingMode RM) {
+ Exp = ilogb(Val);
+
+ // Quiet signaling NaNs.
+ if (Exp == APFloat::IEK_NaN) {
+ APFloat Quiet(Val);
+ Quiet.makeQuiet();
+ return Quiet;
+ }
+
+ if (Exp == APFloat::IEK_Inf)
+ return Val;
+
+ // 1 is added because frexp is defined to return a normalized fraction in
+ // +/-[0.5, 1.0), rather than the usual +/-[1.0, 2.0).
+ Exp = Exp == APFloat::IEK_Zero ? 0 : Exp + 1;
+ return scalbn(Val, -Exp, RM);
+}
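The new APFloat::frexp/scalbn/ilogb mirror the C library semantics: frexp returns a fraction in +/-[0.5, 1.0) and an exponent such that value = fraction * 2^Exp, scalbn multiplies by 2^Exp with rounding, and ilogb reports the unbiased exponent (one less than frexp's). A quick standard-library illustration of the invariants the APFloat versions aim to preserve:

#include <cassert>
#include <cmath>

int main() {
  double Val = 80.0;
  int Exp = 0;
  double Frac = std::frexp(Val, &Exp);   // Frac in [0.5, 1.0), Val == Frac * 2^Exp
  assert(Frac == 0.625 && Exp == 7);
  assert(std::ldexp(Frac, Exp) == Val);  // ldexp/scalbn inverts frexp
  assert(std::ilogb(Val) == 6);          // unbiased exponent; note the off-by-one
  return 0;
}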
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 23f89bb66f9e..66eee99c6ec2 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallString.h"
@@ -74,7 +75,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
}
-void APInt::initSlowCase(unsigned numBits, uint64_t val, bool isSigned) {
+void APInt::initSlowCase(uint64_t val, bool isSigned) {
pVal = getClearedMemory(getNumWords());
pVal[0] = val;
if (isSigned && int64_t(val) < 0)
@@ -479,6 +480,15 @@ APInt APInt::operator+(const APInt& RHS) const {
return Result;
}
+APInt APInt::operator+(uint64_t RHS) const {
+ if (isSingleWord())
+ return APInt(BitWidth, VAL + RHS);
+ APInt Result(*this);
+ add_1(Result.pVal, Result.pVal, getNumWords(), RHS);
+ Result.clearUnusedBits();
+ return Result;
+}
+
APInt APInt::operator-(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
@@ -489,24 +499,17 @@ APInt APInt::operator-(const APInt& RHS) const {
return Result;
}
-bool APInt::EqualSlowCase(const APInt& RHS) const {
- // Get some facts about the number of bits used in the two operands.
- unsigned n1 = getActiveBits();
- unsigned n2 = RHS.getActiveBits();
-
- // If the number of bits isn't the same, they aren't equal
- if (n1 != n2)
- return false;
-
- // If the number of bits fits in a word, we only need to compare the low word.
- if (n1 <= APINT_BITS_PER_WORD)
- return pVal[0] == RHS.pVal[0];
+APInt APInt::operator-(uint64_t RHS) const {
+ if (isSingleWord())
+ return APInt(BitWidth, VAL - RHS);
+ APInt Result(*this);
+ sub_1(Result.pVal, getNumWords(), RHS);
+ Result.clearUnusedBits();
+ return Result;
+}
- // Otherwise, compare everything
- for (int i = whichWord(n1 - 1); i >= 0; --i)
- if (pVal[i] != RHS.pVal[i])
- return false;
- return true;
+bool APInt::EqualSlowCase(const APInt& RHS) const {
+ return std::equal(pVal, pVal + getNumWords(), RHS.pVal);
}
bool APInt::EqualSlowCase(uint64_t Val) const {
@@ -552,37 +555,21 @@ bool APInt::ult(const APInt& RHS) const {
bool APInt::slt(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
if (isSingleWord()) {
- int64_t lhsSext = (int64_t(VAL) << (64-BitWidth)) >> (64-BitWidth);
- int64_t rhsSext = (int64_t(RHS.VAL) << (64-BitWidth)) >> (64-BitWidth);
+ int64_t lhsSext = SignExtend64(VAL, BitWidth);
+ int64_t rhsSext = SignExtend64(RHS.VAL, BitWidth);
return lhsSext < rhsSext;
}
- APInt lhs(*this);
- APInt rhs(RHS);
bool lhsNeg = isNegative();
- bool rhsNeg = rhs.isNegative();
- if (lhsNeg) {
- // Sign bit is set so perform two's complement to make it positive
- lhs.flipAllBits();
- ++lhs;
- }
- if (rhsNeg) {
- // Sign bit is set so perform two's complement to make it positive
- rhs.flipAllBits();
- ++rhs;
- }
-
- // Now we have unsigned values to compare so do the comparison if necessary
- // based on the negativeness of the values.
- if (lhsNeg)
- if (rhsNeg)
- return lhs.ugt(rhs);
- else
- return true;
- else if (rhsNeg)
- return false;
- else
- return lhs.ult(rhs);
+ bool rhsNeg = RHS.isNegative();
+
+ // If the sign bits don't match, then (LHS < RHS) if LHS is negative
+ if (lhsNeg != rhsNeg)
+ return lhsNeg;
+
+ // Otherwise we can just use an unsigned comparison, because even negative
+ // numbers compare correctly this way if both have the same signedness.
+ return ult(RHS);
}
void APInt::setBit(unsigned bitPosition) {
@@ -692,30 +679,19 @@ APInt APInt::getLoBits(unsigned numBits) const {
}
unsigned APInt::countLeadingZerosSlowCase() const {
- // Treat the most significand word differently because it might have
- // meaningless bits set beyond the precision.
- unsigned BitsInMSW = BitWidth % APINT_BITS_PER_WORD;
- integerPart MSWMask;
- if (BitsInMSW) MSWMask = (integerPart(1) << BitsInMSW) - 1;
- else {
- MSWMask = ~integerPart(0);
- BitsInMSW = APINT_BITS_PER_WORD;
- }
-
- unsigned i = getNumWords();
- integerPart MSW = pVal[i-1] & MSWMask;
- if (MSW)
- return llvm::countLeadingZeros(MSW) - (APINT_BITS_PER_WORD - BitsInMSW);
-
- unsigned Count = BitsInMSW;
- for (--i; i > 0u; --i) {
- if (pVal[i-1] == 0)
+ unsigned Count = 0;
+ for (int i = getNumWords()-1; i >= 0; --i) {
+ integerPart V = pVal[i];
+ if (V == 0)
Count += APINT_BITS_PER_WORD;
else {
- Count += llvm::countLeadingZeros(pVal[i-1]);
+ Count += llvm::countLeadingZeros(V);
break;
}
}
+ // Adjust for unused bits in the most significant word (they are zero).
+ unsigned Mod = BitWidth % APINT_BITS_PER_WORD;
+ Count -= Mod > 0 ? APINT_BITS_PER_WORD - Mod : 0;
return Count;
}
@@ -814,6 +790,36 @@ APInt APInt::byteSwap() const {
return Result;
}
+APInt APInt::reverseBits() const {
+ switch (BitWidth) {
+ case 64:
+ return APInt(BitWidth, llvm::reverseBits<uint64_t>(VAL));
+ case 32:
+ return APInt(BitWidth, llvm::reverseBits<uint32_t>(VAL));
+ case 16:
+ return APInt(BitWidth, llvm::reverseBits<uint16_t>(VAL));
+ case 8:
+ return APInt(BitWidth, llvm::reverseBits<uint8_t>(VAL));
+ default:
+ break;
+ }
+
+ APInt Val(*this);
+ APInt Reversed(*this);
+ int S = BitWidth - 1;
+
+ const APInt One(BitWidth, 1);
+
+ for ((Val = Val.lshr(1)); Val != 0; (Val = Val.lshr(1))) {
+ Reversed <<= 1;
+ Reversed |= (Val & One);
+ --S;
+ }
+
+ Reversed <<= S;
+ return Reversed;
+}
+
APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
const APInt& API2) {
APInt A = API1, B = API2;
@@ -874,7 +880,7 @@ double APInt::roundToDouble(bool isSigned) const {
// It is wrong to optimize getWord(0) to VAL; there might be more than one word.
if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) {
if (isSigned) {
- int64_t sext = (int64_t(getWord(0)) << (64-BitWidth)) >> (64-BitWidth);
+ int64_t sext = SignExtend64(getWord(0), BitWidth);
return double(sext);
} else
return double(getWord(0));
@@ -1658,10 +1664,8 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
DEBUG(dbgs() << '\n');
}
-void APInt::divide(const APInt LHS, unsigned lhsWords,
- const APInt &RHS, unsigned rhsWords,
- APInt *Quotient, APInt *Remainder)
-{
+void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
+ unsigned rhsWords, APInt *Quotient, APInt *Remainder) {
assert(lhsWords >= rhsWords && "Fractional result");
// First, compose the values into an array of 32-bit words instead of
@@ -2268,7 +2272,7 @@ std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
}
-void APInt::dump() const {
+LLVM_DUMP_METHOD void APInt::dump() const {
SmallString<40> S, U;
this->toStringUnsigned(U);
this->toStringSigned(S);
@@ -2725,8 +2729,10 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
break;
shiftCount--;
tcShiftRight(srhs, parts, 1);
- if ((mask >>= 1) == 0)
- mask = (integerPart) 1 << (integerPartWidth - 1), n--;
+ if ((mask >>= 1) == 0) {
+ mask = (integerPart) 1 << (integerPartWidth - 1);
+ n--;
+ }
}
return false;
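The rewritten APInt::slt relies on a standard two's-complement fact: when two values have the same sign, their unsigned comparison also gives the signed order; when the signs differ, the negative one is smaller. A small standalone check of that property on 64-bit words, with a sign-extension helper like the one used for the single-word path:

#include <cassert>
#include <cstdint>

// Sign-extend the low B bits of X to 64 bits (B in [1, 64]).
static int64_t signExtend64(uint64_t X, unsigned B) {
  return int64_t(X << (64 - B)) >> (64 - B);
}

static bool sltViaUnsigned(uint64_t L, uint64_t R) {
  bool LNeg = int64_t(L) < 0, RNeg = int64_t(R) < 0;
  if (LNeg != RNeg)
    return LNeg;           // mixed signs: the negative side is smaller
  return L < R;            // same sign: unsigned order == signed order
}

int main() {
  assert(sltViaUnsigned(uint64_t(-5), uint64_t(3)));
  assert(sltViaUnsigned(uint64_t(-5), uint64_t(-3)));
  assert(!sltViaUnsigned(7, 3));
  assert(signExtend64(0xFFull, 8) == -1);
  return 0;
}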
diff --git a/lib/Support/APSInt.cpp b/lib/Support/APSInt.cpp
index 975457c070a5..46c0f70ff66b 100644
--- a/lib/Support/APSInt.cpp
+++ b/lib/Support/APSInt.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringRef.h"
using namespace llvm;
diff --git a/lib/Support/ARMBuildAttrs.cpp b/lib/Support/ARMBuildAttrs.cpp
index 960a0f13c674..6d34f76f0c21 100644
--- a/lib/Support/ARMBuildAttrs.cpp
+++ b/lib/Support/ARMBuildAttrs.cpp
@@ -54,6 +54,7 @@ const struct {
{ ARMBuildAttrs::ABI_FP_16bit_format, "Tag_ABI_FP_16bit_format" },
{ ARMBuildAttrs::MPextension_use, "Tag_MPextension_use" },
{ ARMBuildAttrs::DIV_use, "Tag_DIV_use" },
+ { ARMBuildAttrs::DSP_extension, "Tag_DSP_extension" },
{ ARMBuildAttrs::nodefaults, "Tag_nodefaults" },
{ ARMBuildAttrs::also_compatible_with, "Tag_also_compatible_with" },
{ ARMBuildAttrs::T2EE_use, "Tag_T2EE_use" },
diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp
index ac4ff3eb5c66..80550e2b46a7 100644
--- a/lib/Support/Atomic.cpp
+++ b/lib/Support/Atomic.cpp
@@ -56,62 +56,3 @@ sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
# error No compare-and-swap implementation for your platform!
#endif
}
-
-sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
-#if LLVM_HAS_ATOMICS == 0
- ++(*ptr);
- return *ptr;
-#elif defined(GNU_ATOMICS)
- return __sync_add_and_fetch(ptr, 1);
-#elif defined(_MSC_VER)
- return InterlockedIncrement(ptr);
-#else
-# error No atomic increment implementation for your platform!
-#endif
-}
-
-sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
-#if LLVM_HAS_ATOMICS == 0
- --(*ptr);
- return *ptr;
-#elif defined(GNU_ATOMICS)
- return __sync_sub_and_fetch(ptr, 1);
-#elif defined(_MSC_VER)
- return InterlockedDecrement(ptr);
-#else
-# error No atomic decrement implementation for your platform!
-#endif
-}
-
-sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
-#if LLVM_HAS_ATOMICS == 0
- *ptr += val;
- return *ptr;
-#elif defined(GNU_ATOMICS)
- return __sync_add_and_fetch(ptr, val);
-#elif defined(_MSC_VER)
- return InterlockedExchangeAdd(ptr, val) + val;
-#else
-# error No atomic add implementation for your platform!
-#endif
-}
-
-sys::cas_flag sys::AtomicMul(volatile sys::cas_flag* ptr, sys::cas_flag val) {
- sys::cas_flag original, result;
- do {
- original = *ptr;
- result = original * val;
- } while (sys::CompareAndSwap(ptr, result, original) != original);
-
- return result;
-}
-
-sys::cas_flag sys::AtomicDiv(volatile sys::cas_flag* ptr, sys::cas_flag val) {
- sys::cas_flag original, result;
- do {
- original = *ptr;
- result = original / val;
- } while (sys::CompareAndSwap(ptr, result, original) != original);
-
- return result;
-}
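The hand-rolled AtomicIncrement/AtomicDecrement/AtomicAdd/AtomicMul/AtomicDiv wrappers are deleted above; callers are presumably expected to use std::atomic directly (an assumption on my part, since the replacement is not shown in this excerpt). The equivalent operations in standard C++:

#include <atomic>
#include <cstdint>

int main() {
  std::atomic<uint32_t> Counter{0};
  Counter.fetch_add(1, std::memory_order_seq_cst);   // was AtomicIncrement
  Counter.fetch_sub(1, std::memory_order_seq_cst);   // was AtomicDecrement
  Counter.fetch_add(5, std::memory_order_seq_cst);   // was AtomicAdd
  // CAS loop for read-modify-write operations with no dedicated atomic form:
  uint32_t Old = Counter.load();
  while (!Counter.compare_exchange_weak(Old, Old * 3))
    ; // Old is reloaded on failure, so Old * 3 is recomputed each retry
  return Counter.load() == 15 ? 0 : 1;
}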
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 771d02c0aa3c..1c41659cf8df 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -32,7 +32,7 @@ raw_ostream &BranchProbability::print(raw_ostream &OS) const {
Percent);
}
-void BranchProbability::dump() const { print(dbgs()) << '\n'; }
+LLVM_DUMP_METHOD void BranchProbability::dump() const { print(dbgs()) << '\n'; }
BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
assert(Denominator > 0 && "Denominator cannot be 0!");
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 75b3e89f9167..3d718e6a11c4 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -1,32 +1,30 @@
set(system_libs)
-if( NOT MSVC )
- if( MINGW )
- # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc.
- set(system_libs ${system_libs} psapi shell32 ole32 uuid)
- elseif( CMAKE_HOST_UNIX )
- if( HAVE_LIBRT )
- set(system_libs ${system_libs} rt)
- endif()
- if( HAVE_LIBDL )
- set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
- endif()
- if(LLVM_ENABLE_TERMINFO)
- if(HAVE_TERMINFO)
- set(system_libs ${system_libs} ${TERMINFO_LIBS})
- endif()
- endif()
- if( LLVM_ENABLE_THREADS AND HAVE_LIBATOMIC )
- set(system_libs ${system_libs} atomic)
- endif()
- if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
- set(system_libs ${system_libs} pthread)
- endif()
- if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
- set(system_libs ${system_libs} z)
+if( MSVC OR MINGW )
+ # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc.
+ set(system_libs ${system_libs} psapi shell32 ole32 uuid)
+elseif( CMAKE_HOST_UNIX )
+ if( HAVE_LIBRT )
+ set(system_libs ${system_libs} rt)
+ endif()
+ if( HAVE_LIBDL )
+ set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
+ endif()
+ if(LLVM_ENABLE_TERMINFO)
+ if(HAVE_TERMINFO)
+ set(system_libs ${system_libs} ${TERMINFO_LIBS})
endif()
+ endif()
+ if( LLVM_ENABLE_THREADS AND HAVE_LIBATOMIC )
+ set(system_libs ${system_libs} atomic)
+ endif()
+ set(system_libs ${system_libs} ${PTHREAD_LIB})
+ if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
+ set(system_libs ${system_libs} z)
+ endif()
+ if( UNIX AND NOT (BEOS OR HAIKU) )
set(system_libs ${system_libs} m)
- endif( MINGW )
-endif( NOT MSVC )
+ endif()
+endif( MSVC OR MINGW )
add_llvm_library(LLVMSupport
APFloat.cpp
@@ -37,6 +35,7 @@ add_llvm_library(LLVMSupport
Allocator.cpp
BlockFrequency.cpp
BranchProbability.cpp
+ CachePruning.cpp
circular_raw_ostream.cpp
COM.cpp
CommandLine.cpp
@@ -50,6 +49,7 @@ add_llvm_library(LLVMSupport
DeltaAlgorithm.cpp
DAGDeltaAlgorithm.cpp
Dwarf.cpp
+ Error.cpp
ErrorHandling.cpp
FileUtilities.cpp
FileOutputBuffer.cpp
@@ -76,6 +76,8 @@ add_llvm_library(LLVMSupport
RandomNumberGenerator.cpp
Regex.cpp
ScaledNumber.cpp
+ ScopedPrinter.cpp
+ SHA1.cpp
SmallPtrSet.cpp
SmallVector.cpp
SourceMgr.cpp
diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp
new file mode 100644
index 000000000000..bd42befce386
--- /dev/null
+++ b/lib/Support/CachePruning.cpp
@@ -0,0 +1,159 @@
+//===-CachePruning.cpp - LLVM Cache Directory Pruning ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements pruning of a cache directory based on a
+// least-recently-used policy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CachePruning.h"
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "cache-pruning"
+
+#include <set>
+
+using namespace llvm;
+
+/// Write a new timestamp file with the given path. This is used for the pruning
+/// interval option.
+static void writeTimestampFile(StringRef TimestampFile) {
+ std::error_code EC;
+ raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::F_None);
+}
+
+/// Prune the cache of files that haven't been accessed in a long time.
+bool CachePruning::prune() {
+ if (Path.empty())
+ return false;
+
+ bool isPathDir;
+ if (sys::fs::is_directory(Path, isPathDir))
+ return false;
+
+ if (!isPathDir)
+ return false;
+
+ if (Expiration == 0 && PercentageOfAvailableSpace == 0) {
+ DEBUG(dbgs() << "No pruning settings set, exit early\n");
+ // Nothing will be pruned, early exit
+ return false;
+ }
+
+ // Try to stat() the timestamp file.
+ SmallString<128> TimestampFile(Path);
+ sys::path::append(TimestampFile, "llvmcache.timestamp");
+ sys::fs::file_status FileStatus;
+ sys::TimeValue CurrentTime = sys::TimeValue::now();
+ if (auto EC = sys::fs::status(TimestampFile, FileStatus)) {
+ if (EC == errc::no_such_file_or_directory) {
+ // If the timestamp file wasn't there, create one now.
+ writeTimestampFile(TimestampFile);
+ } else {
+ // Unknown error?
+ return false;
+ }
+ } else {
+ if (Interval) {
+ // Check whether the time stamp is older than our pruning interval.
+ // If not, do nothing.
+ sys::TimeValue TimeStampModTime = FileStatus.getLastModificationTime();
+ auto TimeInterval = sys::TimeValue(sys::TimeValue::SecondsType(Interval));
+ auto TimeStampAge = CurrentTime - TimeStampModTime;
+ if (TimeStampAge <= TimeInterval) {
+ DEBUG(dbgs() << "Timestamp file too recent (" << TimeStampAge.seconds()
+ << "s old), do not prune.\n");
+ return false;
+ }
+ }
+ // Write a new timestamp file so that nobody else attempts to prune.
+ // There is a benign race condition here, if two processes happen to
+ // notice at the same time that the timestamp is out-of-date.
+ writeTimestampFile(TimestampFile);
+ }
+
+ bool ShouldComputeSize = (PercentageOfAvailableSpace > 0);
+
+ // Keep track of space
+ std::set<std::pair<uint64_t, std::string>> FileSizes;
+ uint64_t TotalSize = 0;
+ // Helper to add a path to the set of files to consider for size-based
+ // pruning, sorted by size.
+ auto AddToFileListForSizePruning =
+ [&](StringRef Path) {
+ if (!ShouldComputeSize)
+ return;
+ TotalSize += FileStatus.getSize();
+ FileSizes.insert(
+ std::make_pair(FileStatus.getSize(), std::string(Path)));
+ };
+
+ // Walk the entire directory cache, looking for unused files.
+ std::error_code EC;
+ SmallString<128> CachePathNative;
+ sys::path::native(Path, CachePathNative);
+ auto TimeExpiration = sys::TimeValue(sys::TimeValue::SecondsType(Expiration));
+ // Walk all of the files within this directory.
+ for (sys::fs::directory_iterator File(CachePathNative, EC), FileEnd;
+ File != FileEnd && !EC; File.increment(EC)) {
+ // Do not touch the timestamp.
+ if (File->path() == TimestampFile)
+ continue;
+
+ // Look at this file. If we can't stat it, there's nothing interesting
+ // there.
+ if (sys::fs::status(File->path(), FileStatus)) {
+ DEBUG(dbgs() << "Ignore " << File->path() << " (can't stat)\n");
+ continue;
+ }
+
+ // If the file hasn't been used recently enough, delete it
+ sys::TimeValue FileAccessTime = FileStatus.getLastAccessedTime();
+ auto FileAge = CurrentTime - FileAccessTime;
+ if (FileAge > TimeExpiration) {
+ DEBUG(dbgs() << "Remove " << File->path() << " (" << FileAge.seconds()
+ << "s old)\n");
+ sys::fs::remove(File->path());
+ continue;
+ }
+
+ // Keep it for now, but record it as a candidate for size-based pruning.
+ AddToFileListForSizePruning(File->path());
+ }
+
+ // Prune for size now if needed
+ if (ShouldComputeSize) {
+ auto ErrOrSpaceInfo = sys::fs::disk_space(Path);
+ if (!ErrOrSpaceInfo) {
+ report_fatal_error("Can't get available size");
+ }
+ sys::fs::space_info SpaceInfo = ErrOrSpaceInfo.get();
+ auto AvailableSpace = TotalSize + SpaceInfo.free;
+ auto FileAndSize = FileSizes.rbegin();
+ DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
+ << "% target is: " << PercentageOfAvailableSpace << "\n");
+ // Remove the largest files first, until we get below the threshold.
+ while (((100 * TotalSize) / AvailableSpace) > PercentageOfAvailableSpace &&
+ FileAndSize != FileSizes.rend()) {
+ // Remove the file.
+ sys::fs::remove(FileAndSize->second);
+ // Update size
+ TotalSize -= FileAndSize->first;
+ DEBUG(dbgs() << " - Remove " << FileAndSize->second << " (size "
+ << FileAndSize->first << "), new total size is " << TotalSize
+ << " bytes\n");
+ ++FileAndSize;
+ }
+ }
+ return true;
+}
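The size-based half of prune() keeps a size-sorted set of cache entries and deletes from the large end until total size drops below a percentage of (used + free) space. A filesystem-free sketch of that loop with made-up inputs; the names are illustrative only:

#include <cstdint>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (size, path) pairs, as collected during the directory walk.
  std::set<std::pair<uint64_t, std::string>> FileSizes = {
      {400, "llvmcache-a"}, {100, "llvmcache-b"}, {250, "llvmcache-c"}};
  uint64_t TotalSize = 750;
  uint64_t FreeSpace = 250;            // stands in for disk_space().free
  uint64_t PercentageTarget = 40;      // prune down to 40% of available space
  uint64_t AvailableSpace = TotalSize + FreeSpace;   // 1000

  std::vector<std::string> Removed;
  auto It = FileSizes.rbegin();        // largest entries first
  while ((100 * TotalSize) / AvailableSpace > PercentageTarget &&
         It != FileSizes.rend()) {
    Removed.push_back(It->second);     // stands in for sys::fs::remove()
    TotalSize -= It->first;
    ++It;
  }
  // Removes "llvmcache-a" (750 -> 350 bytes, 35%) and then stops.
  return (Removed.size() == 1 && TotalSize == 350) ? 0 : 1;
}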
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index fdcdb03706de..a5d2ba2d6a2d 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm-c/Support.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
@@ -94,35 +95,56 @@ public:
// This collects additional help to be printed.
std::vector<const char *> MoreHelp;
- SmallVector<Option *, 4> PositionalOpts;
- SmallVector<Option *, 4> SinkOpts;
- StringMap<Option *> OptionsMap;
-
- Option *ConsumeAfterOpt; // The ConsumeAfter option if it exists.
-
// This collects the different option categories that have been registered.
SmallPtrSet<OptionCategory *, 16> RegisteredOptionCategories;
- CommandLineParser() : ProgramOverview(nullptr), ConsumeAfterOpt(nullptr) {}
+ // This collects the different subcommands that have been registered.
+ SmallPtrSet<SubCommand *, 4> RegisteredSubCommands;
- void ParseCommandLineOptions(int argc, const char *const *argv,
- const char *Overview);
+ CommandLineParser() : ProgramOverview(nullptr), ActiveSubCommand(nullptr) {
+ registerSubCommand(&*TopLevelSubCommand);
+ registerSubCommand(&*AllSubCommands);
+ }
- void addLiteralOption(Option &Opt, const char *Name) {
- if (!Opt.hasArgStr()) {
- if (!OptionsMap.insert(std::make_pair(Name, &Opt)).second) {
- errs() << ProgramName << ": CommandLine Error: Option '" << Name
- << "' registered more than once!\n";
- report_fatal_error("inconsistency in registered CommandLine options");
+ void ResetAllOptionOccurrences();
+
+ bool ParseCommandLineOptions(int argc, const char *const *argv,
+ const char *Overview, bool IgnoreErrors);
+
+ void addLiteralOption(Option &Opt, SubCommand *SC, const char *Name) {
+ if (Opt.hasArgStr())
+ return;
+ if (!SC->OptionsMap.insert(std::make_pair(Name, &Opt)).second) {
+ errs() << ProgramName << ": CommandLine Error: Option '" << Name
+ << "' registered more than once!\n";
+ report_fatal_error("inconsistency in registered CommandLine options");
+ }
+
+ // If we're adding this to all sub-commands, add it to the ones that have
+ // already been registered.
+ if (SC == &*AllSubCommands) {
+ for (const auto &Sub : RegisteredSubCommands) {
+ if (SC == Sub)
+ continue;
+ addLiteralOption(Opt, Sub, Name);
}
}
}
- void addOption(Option *O) {
+ void addLiteralOption(Option &Opt, const char *Name) {
+ if (Opt.Subs.empty())
+ addLiteralOption(Opt, &*TopLevelSubCommand, Name);
+ else {
+ for (auto SC : Opt.Subs)
+ addLiteralOption(Opt, SC, Name);
+ }
+ }
+
+ void addOption(Option *O, SubCommand *SC) {
bool HadErrors = false;
if (O->hasArgStr()) {
// Add argument to the argument map!
- if (!OptionsMap.insert(std::make_pair(O->ArgStr, O)).second) {
+ if (!SC->OptionsMap.insert(std::make_pair(O->ArgStr, O)).second) {
errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr
<< "' registered more than once!\n";
HadErrors = true;
@@ -131,15 +153,15 @@ public:
// Remember information about positional options.
if (O->getFormattingFlag() == cl::Positional)
- PositionalOpts.push_back(O);
+ SC->PositionalOpts.push_back(O);
else if (O->getMiscFlags() & cl::Sink) // Remember sink options
- SinkOpts.push_back(O);
+ SC->SinkOpts.push_back(O);
else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) {
- if (ConsumeAfterOpt) {
+ if (SC->ConsumeAfterOpt) {
O->error("Cannot specify more than one option with cl::ConsumeAfter!");
HadErrors = true;
}
- ConsumeAfterOpt = O;
+ SC->ConsumeAfterOpt = O;
}
// Fail hard if there were errors. These are strictly unrecoverable and
@@ -148,64 +170,165 @@ public:
// linked LLVM distribution.
if (HadErrors)
report_fatal_error("inconsistency in registered CommandLine options");
+
+ // If we're adding this to all sub-commands, add it to the ones that have
+ // already been registered.
+ if (SC == &*AllSubCommands) {
+ for (const auto &Sub : RegisteredSubCommands) {
+ if (SC == Sub)
+ continue;
+ addOption(O, Sub);
+ }
+ }
}
- void removeOption(Option *O) {
+ void addOption(Option *O) {
+ if (O->Subs.empty()) {
+ addOption(O, &*TopLevelSubCommand);
+ } else {
+ for (auto SC : O->Subs)
+ addOption(O, SC);
+ }
+ }
+
+ void removeOption(Option *O, SubCommand *SC) {
SmallVector<StringRef, 16> OptionNames;
O->getExtraOptionNames(OptionNames);
if (O->hasArgStr())
OptionNames.push_back(O->ArgStr);
+
+ SubCommand &Sub = *SC;
for (auto Name : OptionNames)
- OptionsMap.erase(Name);
+ Sub.OptionsMap.erase(Name);
if (O->getFormattingFlag() == cl::Positional)
- for (auto Opt = PositionalOpts.begin(); Opt != PositionalOpts.end();
- ++Opt) {
+ for (auto Opt = Sub.PositionalOpts.begin();
+ Opt != Sub.PositionalOpts.end(); ++Opt) {
if (*Opt == O) {
- PositionalOpts.erase(Opt);
+ Sub.PositionalOpts.erase(Opt);
break;
}
}
else if (O->getMiscFlags() & cl::Sink)
- for (auto Opt = SinkOpts.begin(); Opt != SinkOpts.end(); ++Opt) {
+ for (auto Opt = Sub.SinkOpts.begin(); Opt != Sub.SinkOpts.end(); ++Opt) {
if (*Opt == O) {
- SinkOpts.erase(Opt);
+ Sub.SinkOpts.erase(Opt);
break;
}
}
- else if (O == ConsumeAfterOpt)
- ConsumeAfterOpt = nullptr;
+ else if (O == Sub.ConsumeAfterOpt)
+ Sub.ConsumeAfterOpt = nullptr;
}
- bool hasOptions() {
- return (!OptionsMap.empty() || !PositionalOpts.empty() ||
- nullptr != ConsumeAfterOpt);
+ void removeOption(Option *O) {
+ if (O->Subs.empty())
+ removeOption(O, &*TopLevelSubCommand);
+ else {
+ if (O->isInAllSubCommands()) {
+ for (auto SC : RegisteredSubCommands)
+ removeOption(O, SC);
+ } else {
+ for (auto SC : O->Subs)
+ removeOption(O, SC);
+ }
+ }
}
- void updateArgStr(Option *O, StringRef NewName) {
- if (!OptionsMap.insert(std::make_pair(NewName, O)).second) {
+ bool hasOptions(const SubCommand &Sub) const {
+ return (!Sub.OptionsMap.empty() || !Sub.PositionalOpts.empty() ||
+ nullptr != Sub.ConsumeAfterOpt);
+ }
+
+ bool hasOptions() const {
+ for (const auto &S : RegisteredSubCommands) {
+ if (hasOptions(*S))
+ return true;
+ }
+ return false;
+ }
+
+ SubCommand *getActiveSubCommand() { return ActiveSubCommand; }
+
+ void updateArgStr(Option *O, StringRef NewName, SubCommand *SC) {
+ SubCommand &Sub = *SC;
+ if (!Sub.OptionsMap.insert(std::make_pair(NewName, O)).second) {
errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr
<< "' registered more than once!\n";
report_fatal_error("inconsistency in registered CommandLine options");
}
- OptionsMap.erase(O->ArgStr);
+ Sub.OptionsMap.erase(O->ArgStr);
+ }
+
+ void updateArgStr(Option *O, StringRef NewName) {
+ if (O->Subs.empty())
+ updateArgStr(O, NewName, &*TopLevelSubCommand);
+ else {
+ for (auto SC : O->Subs)
+ updateArgStr(O, NewName, SC);
+ }
}
void printOptionValues();
void registerCategory(OptionCategory *cat) {
- assert(std::count_if(RegisteredOptionCategories.begin(),
- RegisteredOptionCategories.end(),
- [cat](const OptionCategory *Category) {
- return cat->getName() == Category->getName();
- }) == 0 &&
+ assert(count_if(RegisteredOptionCategories,
+ [cat](const OptionCategory *Category) {
+ return cat->getName() == Category->getName();
+ }) == 0 &&
"Duplicate option categories");
RegisteredOptionCategories.insert(cat);
}
+ void registerSubCommand(SubCommand *sub) {
+ assert(count_if(RegisteredSubCommands,
+ [sub](const SubCommand *Sub) {
+ return (sub->getName() != nullptr) &&
+ (Sub->getName() == sub->getName());
+ }) == 0 &&
+ "Duplicate subcommands");
+ RegisteredSubCommands.insert(sub);
+
+ // For all options that have been registered for all subcommands, add the
+ // option to this subcommand now.
+ if (sub != &*AllSubCommands) {
+ for (auto &E : AllSubCommands->OptionsMap) {
+ Option *O = E.second;
+ if ((O->isPositional() || O->isSink() || O->isConsumeAfter()) ||
+ O->hasArgStr())
+ addOption(O, sub);
+ else
+ addLiteralOption(*O, sub, E.first().str().c_str());
+ }
+ }
+ }
+
+ void unregisterSubCommand(SubCommand *sub) {
+ RegisteredSubCommands.erase(sub);
+ }
+
+ void reset() {
+ ActiveSubCommand = nullptr;
+ ProgramName.clear();
+ ProgramOverview = nullptr;
+
+ MoreHelp.clear();
+ RegisteredOptionCategories.clear();
+
+ ResetAllOptionOccurrences();
+ RegisteredSubCommands.clear();
+
+ TopLevelSubCommand->reset();
+ AllSubCommands->reset();
+ registerSubCommand(&*TopLevelSubCommand);
+ registerSubCommand(&*AllSubCommands);
+ }
+
private:
- Option *LookupOption(StringRef &Arg, StringRef &Value);
+ SubCommand *ActiveSubCommand;
+
+ Option *LookupOption(SubCommand &Sub, StringRef &Arg, StringRef &Value);
+ SubCommand *LookupSubCommand(const char *Name);
};
} // namespace
@@ -240,6 +363,32 @@ void OptionCategory::registerCategory() {
GlobalParser->registerCategory(this);
}
+// A special subcommand representing no subcommand
+ManagedStatic<SubCommand> llvm::cl::TopLevelSubCommand;
+
+// A special subcommand that can be used to put an option into all subcommands.
+ManagedStatic<SubCommand> llvm::cl::AllSubCommands;
+
+void SubCommand::registerSubCommand() {
+ GlobalParser->registerSubCommand(this);
+}
+
+void SubCommand::unregisterSubCommand() {
+ GlobalParser->unregisterSubCommand(this);
+}
+
+void SubCommand::reset() {
+ PositionalOpts.clear();
+ SinkOpts.clear();
+ OptionsMap.clear();
+
+ ConsumeAfterOpt = nullptr;
+}
+
+SubCommand::operator bool() const {
+ return (GlobalParser->getActiveSubCommand() == this);
+}
+
//===----------------------------------------------------------------------===//
// Basic, shared command line option processing machinery.
//
@@ -247,25 +396,29 @@ void OptionCategory::registerCategory() {
/// LookupOption - Lookup the option specified by the specified option on the
/// command line. If there is a value specified (after an equal sign) return
/// that as well. This assumes that leading dashes have already been stripped.
-Option *CommandLineParser::LookupOption(StringRef &Arg, StringRef &Value) {
+Option *CommandLineParser::LookupOption(SubCommand &Sub, StringRef &Arg,
+ StringRef &Value) {
// Reject all dashes.
if (Arg.empty())
return nullptr;
+ assert(&Sub != &*AllSubCommands);
size_t EqualPos = Arg.find('=');
// If we have an equals sign, remember the value.
if (EqualPos == StringRef::npos) {
// Look up the option.
- StringMap<Option *>::const_iterator I = OptionsMap.find(Arg);
- return I != OptionsMap.end() ? I->second : nullptr;
+ auto I = Sub.OptionsMap.find(Arg);
+ if (I == Sub.OptionsMap.end())
+ return nullptr;
+
+ return I->second;
}
// If the argument before the = is a valid option name, we match. If not,
// return Arg unmolested.
- StringMap<Option *>::const_iterator I =
- OptionsMap.find(Arg.substr(0, EqualPos));
- if (I == OptionsMap.end())
+ auto I = Sub.OptionsMap.find(Arg.substr(0, EqualPos));
+ if (I == Sub.OptionsMap.end())
return nullptr;
Value = Arg.substr(EqualPos + 1);
@@ -273,6 +426,21 @@ Option *CommandLineParser::LookupOption(StringRef &Arg, StringRef &Value) {
return I->second;
}
+SubCommand *CommandLineParser::LookupSubCommand(const char *Name) {
+ if (Name == nullptr)
+ return &*TopLevelSubCommand;
+ for (auto S : RegisteredSubCommands) {
+ if (S == &*AllSubCommands)
+ continue;
+ if (S->getName() == nullptr)
+ continue;
+
+ if (StringRef(S->getName()) == StringRef(Name))
+ return S;
+ }
+ return &*TopLevelSubCommand;
+}
+
/// LookupNearestOption - Lookup the closest match to the option specified by
/// the specified option on the command line. If there is a value specified
/// (after an equal sign) return that as well. This assumes that leading dashes
@@ -515,8 +683,6 @@ static bool isWhitespace(char C) { return strchr(" \t\n\r\f\v", C); }
static bool isQuote(char C) { return C == '\"' || C == '\''; }
-static bool isGNUSpecial(char C) { return strchr("\\\"\' ", C); }
-
void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
SmallVectorImpl<const char *> &NewArgv,
bool MarkEOLs) {
@@ -534,9 +700,8 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
break;
}
- // Backslashes can escape backslashes, spaces, and other quotes. Otherwise
- // they are literal. This makes it much easier to read Windows file paths.
- if (I + 1 < E && Src[I] == '\\' && isGNUSpecial(Src[I + 1])) {
+ // Backslash escapes the next character.
+ if (I + 1 < E && Src[I] == '\\') {
++I; // Skip the escape.
Token.push_back(Src[I]);
continue;
@@ -546,8 +711,8 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
if (isQuote(Src[I])) {
char Quote = Src[I++];
while (I != E && Src[I] != Quote) {
- // Backslashes are literal, unless they escape a special character.
- if (Src[I] == '\\' && I + 1 != E && isGNUSpecial(Src[I + 1]))
+ // Backslash escapes the next character.
+ if (Src[I] == '\\' && I + 1 != E)
++I;
Token.push_back(Src[I]);
++I;
@@ -787,9 +952,28 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
assert(envVar && "Environment variable name missing");
// Get the environment variable they want us to parse options out of.
+#ifdef _WIN32
+ std::wstring wenvVar;
+ if (!llvm::ConvertUTF8toWide(envVar, wenvVar)) {
+ assert(false &&
+ "Unicode conversion of environment variable name failed");
+ return;
+ }
+ const wchar_t *wenvValue = _wgetenv(wenvVar.c_str());
+ if (!wenvValue)
+ return;
+ std::string envValueBuffer;
+ if (!llvm::convertWideToUTF8(wenvValue, envValueBuffer)) {
+ assert(false &&
+ "Unicode conversion of environment variable value failed");
+ return;
+ }
+ const char *envValue = envValueBuffer.c_str();
+#else
const char *envValue = getenv(envVar);
if (!envValue)
return;
+#endif
// Get program's "name", which we wouldn't know without the caller
// telling us.
@@ -805,14 +989,25 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
}
-void cl::ParseCommandLineOptions(int argc, const char *const *argv,
- const char *Overview) {
- GlobalParser->ParseCommandLineOptions(argc, argv, Overview);
+bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
+ const char *Overview, bool IgnoreErrors) {
+ return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
+ IgnoreErrors);
}
-void CommandLineParser::ParseCommandLineOptions(int argc,
+void CommandLineParser::ResetAllOptionOccurrences() {
+ // So that we can parse different command lines multiple times in succession,
+ // we reset all option values to look like they have never been seen before.
+ for (auto SC : RegisteredSubCommands) {
+ for (auto &O : SC->OptionsMap)
+ O.second->reset();
+ }
+}
+
+bool CommandLineParser::ParseCommandLineOptions(int argc,
const char *const *argv,
- const char *Overview) {
+ const char *Overview,
+ bool IgnoreErrors) {
assert(hasOptions() && "No options specified!");
// Expand response files.
@@ -835,6 +1030,23 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
// Determine whether or not there are an unlimited number of positionals
bool HasUnlimitedPositionals = false;
+ int FirstArg = 1;
+ SubCommand *ChosenSubCommand = &*TopLevelSubCommand;
+ if (argc >= 2 && argv[FirstArg][0] != '-') {
+ // If the first argument specifies a valid subcommand, start processing
+ // options from the second argument.
+ ChosenSubCommand = LookupSubCommand(argv[FirstArg]);
+ if (ChosenSubCommand != &*TopLevelSubCommand)
+ FirstArg = 2;
+ }
+ GlobalParser->ActiveSubCommand = ChosenSubCommand;
+
+ assert(ChosenSubCommand);
+ auto &ConsumeAfterOpt = ChosenSubCommand->ConsumeAfterOpt;
+ auto &PositionalOpts = ChosenSubCommand->PositionalOpts;
+ auto &SinkOpts = ChosenSubCommand->SinkOpts;
+ auto &OptionsMap = ChosenSubCommand->OptionsMap;
+
if (ConsumeAfterOpt) {
assert(PositionalOpts.size() > 0 &&
"Cannot specify cl::ConsumeAfter without a positional argument!");
@@ -850,23 +1062,28 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
else if (ConsumeAfterOpt) {
// ConsumeAfter cannot be combined with "optional" positional options
// unless there is only one positional argument...
- if (PositionalOpts.size() > 1)
- ErrorParsing |= Opt->error(
- "error - this positional option will never be matched, "
- "because it does not Require a value, and a "
- "cl::ConsumeAfter option is active!");
+ if (PositionalOpts.size() > 1) {
+ if (!IgnoreErrors)
+ Opt->error("error - this positional option will never be matched, "
+ "because it does not Require a value, and a "
+ "cl::ConsumeAfter option is active!");
+ ErrorParsing = true;
+ }
} else if (UnboundedFound && !Opt->hasArgStr()) {
// This option does not "require" a value... Make sure this option is
// not specified after an option that eats all extra arguments, or this
// one will never get any!
//
- ErrorParsing |= Opt->error("error - option can never match, because "
- "another positional argument will match an "
- "unbounded number of values, and this option"
- " does not require a value!");
- errs() << ProgramName << ": CommandLine Error: Option '" << Opt->ArgStr
- << "' is all messed up!\n";
- errs() << PositionalOpts.size();
+ if (!IgnoreErrors) {
+ Opt->error("error - option can never match, because "
+ "another positional argument will match an "
+ "unbounded number of values, and this option"
+ " does not require a value!");
+ errs() << ProgramName << ": CommandLine Error: Option '"
+ << Opt->ArgStr << "' is all messed up!\n";
+ errs() << PositionalOpts.size();
+ }
+ ErrorParsing = true;
}
UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
}
@@ -885,7 +1102,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
// Loop over all of the arguments... processing them.
bool DashDashFound = false; // Have we read '--'?
- for (int i = 1; i < argc; ++i) {
+ for (int i = FirstArg; i < argc; ++i) {
Option *Handler = nullptr;
Option *NearestHandler = nullptr;
std::string NearestHandlerString;
@@ -932,7 +1149,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
while (!ArgName.empty() && ArgName[0] == '-')
ArgName = ArgName.substr(1);
- Handler = LookupOption(ArgName, Value);
+ Handler = LookupOption(*ChosenSubCommand, ArgName, Value);
if (!Handler || Handler->getFormattingFlag() != cl::Positional) {
ProvidePositionalOption(ActivePositionalArg, argv[i], i);
continue; // We are done!
@@ -944,7 +1161,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
while (!ArgName.empty() && ArgName[0] == '-')
ArgName = ArgName.substr(1);
- Handler = LookupOption(ArgName, Value);
+ Handler = LookupOption(*ChosenSubCommand, ArgName, Value);
// Check to see if this "option" is really a prefixed or grouped argument.
if (!Handler)
@@ -960,13 +1177,15 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
if (!Handler) {
if (SinkOpts.empty()) {
- errs() << ProgramName << ": Unknown command line argument '" << argv[i]
- << "'. Try: '" << argv[0] << " -help'\n";
-
- if (NearestHandler) {
- // If we know a near match, report it as well.
- errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
- << "'?\n";
+ if (!IgnoreErrors) {
+ errs() << ProgramName << ": Unknown command line argument '"
+ << argv[i] << "'. Try: '" << argv[0] << " -help'\n";
+
+ if (NearestHandler) {
+ // If we know a near match, report it as well.
+ errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
+ << "'?\n";
+ }
}
ErrorParsing = true;
@@ -989,17 +1208,21 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
// Check and handle positional arguments now...
if (NumPositionalRequired > PositionalVals.size()) {
- errs() << ProgramName
- << ": Not enough positional command line arguments specified!\n"
- << "Must specify at least " << NumPositionalRequired
- << " positional arguments: See: " << argv[0] << " -help\n";
+ if (!IgnoreErrors) {
+ errs() << ProgramName
+ << ": Not enough positional command line arguments specified!\n"
+ << "Must specify at least " << NumPositionalRequired
+ << " positional arguments: See: " << argv[0] << " -help\n";
+ }
ErrorParsing = true;
} else if (!HasUnlimitedPositionals &&
PositionalVals.size() > PositionalOpts.size()) {
- errs() << ProgramName << ": Too many positional arguments specified!\n"
- << "Can specify at most " << PositionalOpts.size()
- << " positional arguments: See: " << argv[0] << " -help\n";
+ if (!IgnoreErrors) {
+ errs() << ProgramName << ": Too many positional arguments specified!\n"
+ << "Can specify at most " << PositionalOpts.size()
+ << " positional arguments: See: " << argv[0] << " -help\n";
+ }
ErrorParsing = true;
} else if (!ConsumeAfterOpt) {
@@ -1094,8 +1317,12 @@ void CommandLineParser::ParseCommandLineOptions(int argc,
MoreHelp.clear();
// If we had an error processing our arguments, don't let the program execute
- if (ErrorParsing)
- exit(1);
+ if (ErrorParsing) {
+ if (!IgnoreErrors)
+ exit(1);
+ return false;
+ }
+ return true;
}
//===----------------------------------------------------------------------===//
@@ -1416,7 +1643,7 @@ PRINT_OPT_DIFF(float)
PRINT_OPT_DIFF(char)
void parser<std::string>::printOptionDiff(const Option &O, StringRef V,
- OptionValue<std::string> D,
+ const OptionValue<std::string> &D,
size_t GlobalWidth) const {
printOptionName(O, GlobalWidth);
outs() << "= " << V;
@@ -1445,11 +1672,16 @@ static int OptNameCompare(const std::pair<const char *, Option *> *LHS,
return strcmp(LHS->first, RHS->first);
}
+static int SubNameCompare(const std::pair<const char *, SubCommand *> *LHS,
+ const std::pair<const char *, SubCommand *> *RHS) {
+ return strcmp(LHS->first, RHS->first);
+}
+
// Copy Options into a vector so we can sort them as we like.
static void sortOpts(StringMap<Option *> &OptMap,
SmallVectorImpl<std::pair<const char *, Option *>> &Opts,
bool ShowHidden) {
- SmallPtrSet<Option *, 128> OptionSet; // Duplicate option detection.
+ SmallPtrSet<Option *, 32> OptionSet; // Duplicate option detection.
for (StringMap<Option *>::iterator I = OptMap.begin(), E = OptMap.end();
I != E; ++I) {
@@ -1473,6 +1705,17 @@ static void sortOpts(StringMap<Option *> &OptMap,
array_pod_sort(Opts.begin(), Opts.end(), OptNameCompare);
}
+static void
+sortSubCommands(const SmallPtrSetImpl<SubCommand *> &SubMap,
+ SmallVectorImpl<std::pair<const char *, SubCommand *>> &Subs) {
+ for (const auto &S : SubMap) {
+ if (S->getName() == nullptr)
+ continue;
+ Subs.push_back(std::make_pair(S->getName(), S));
+ }
+ array_pod_sort(Subs.begin(), Subs.end(), SubNameCompare);
+}
+
namespace {
class HelpPrinter {
@@ -1480,12 +1723,25 @@ protected:
const bool ShowHidden;
typedef SmallVector<std::pair<const char *, Option *>, 128>
StrOptionPairVector;
+ typedef SmallVector<std::pair<const char *, SubCommand *>, 128>
+ StrSubCommandPairVector;
// Print the options. Opts is assumed to be alphabetically sorted.
virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) {
for (size_t i = 0, e = Opts.size(); i != e; ++i)
Opts[i].second->printOptionInfo(MaxArgLen);
}
+ void printSubCommands(StrSubCommandPairVector &Subs, size_t MaxSubLen) {
+ for (const auto &S : Subs) {
+ outs() << " " << S.first;
+ if (S.second->getDescription()) {
+ outs().indent(MaxSubLen - strlen(S.first));
+ outs() << " - " << S.second->getDescription();
+ }
+ outs() << "\n";
+ }
+ }
+
public:
explicit HelpPrinter(bool showHidden) : ShowHidden(showHidden) {}
virtual ~HelpPrinter() {}
@@ -1495,23 +1751,56 @@ public:
if (!Value)
return;
+ SubCommand *Sub = GlobalParser->getActiveSubCommand();
+ auto &OptionsMap = Sub->OptionsMap;
+ auto &PositionalOpts = Sub->PositionalOpts;
+ auto &ConsumeAfterOpt = Sub->ConsumeAfterOpt;
+
StrOptionPairVector Opts;
- sortOpts(GlobalParser->OptionsMap, Opts, ShowHidden);
+ sortOpts(OptionsMap, Opts, ShowHidden);
+
+ StrSubCommandPairVector Subs;
+ sortSubCommands(GlobalParser->RegisteredSubCommands, Subs);
if (GlobalParser->ProgramOverview)
outs() << "OVERVIEW: " << GlobalParser->ProgramOverview << "\n";
- outs() << "USAGE: " << GlobalParser->ProgramName << " [options]";
+ if (Sub == &*TopLevelSubCommand)
+ outs() << "USAGE: " << GlobalParser->ProgramName
+ << " [subcommand] [options]";
+ else {
+ if (Sub->getDescription() != nullptr) {
+ outs() << "SUBCOMMAND '" << Sub->getName()
+ << "': " << Sub->getDescription() << "\n\n";
+ }
+ outs() << "USAGE: " << GlobalParser->ProgramName << " " << Sub->getName()
+ << " [options]";
+ }
- for (auto Opt : GlobalParser->PositionalOpts) {
+ for (auto Opt : PositionalOpts) {
if (Opt->hasArgStr())
outs() << " --" << Opt->ArgStr;
outs() << " " << Opt->HelpStr;
}
// Print the consume after option info if it exists...
- if (GlobalParser->ConsumeAfterOpt)
- outs() << " " << GlobalParser->ConsumeAfterOpt->HelpStr;
+ if (ConsumeAfterOpt)
+ outs() << " " << ConsumeAfterOpt->HelpStr;
+
+ if (Sub == &*TopLevelSubCommand && Subs.size() > 2) {
+ // Compute the maximum subcommand length...
+ size_t MaxSubLen = 0;
+ for (size_t i = 0, e = Subs.size(); i != e; ++i)
+ MaxSubLen = std::max(MaxSubLen, strlen(Subs[i].first));
+
+ outs() << "\n\n";
+ outs() << "SUBCOMMANDS:\n\n";
+ printSubCommands(Subs, MaxSubLen);
+ outs() << "\n";
+ outs() << " Type \"" << GlobalParser->ProgramName
+ << " <subcommand> -help\" to get more help on a specific "
+ "subcommand";
+ }
outs() << "\n\n";
@@ -1589,7 +1878,8 @@ protected:
E = SortedCategories.end();
Category != E; ++Category) {
// Hide empty categories for -help, but show for -help-hidden.
- bool IsEmptyCategory = CategorizedOptions[*Category].size() == 0;
+ const auto &CategoryOptions = CategorizedOptions[*Category];
+ bool IsEmptyCategory = CategoryOptions.empty();
if (!ShowHidden && IsEmptyCategory)
continue;
@@ -1610,11 +1900,8 @@ protected:
continue;
}
// Loop over the options in the category and print.
- for (std::vector<Option *>::const_iterator
- Opt = CategorizedOptions[*Category].begin(),
- E = CategorizedOptions[*Category].end();
- Opt != E; ++Opt)
- (*Opt)->printOptionInfo(MaxArgLen);
+ for (const Option *Opt : CategoryOptions)
+ Opt->printOptionInfo(MaxArgLen);
}
}
};
@@ -1662,12 +1949,13 @@ static cl::opt<HelpPrinter, true, parser<bool>> HLOp(
"help-list",
cl::desc("Display list of available options (-help-list-hidden for more)"),
cl::location(UncategorizedNormalPrinter), cl::Hidden, cl::ValueDisallowed,
- cl::cat(GenericCategory));
+ cl::cat(GenericCategory), cl::sub(*AllSubCommands));
static cl::opt<HelpPrinter, true, parser<bool>>
HLHOp("help-list-hidden", cl::desc("Display list of all available options"),
cl::location(UncategorizedHiddenPrinter), cl::Hidden,
- cl::ValueDisallowed, cl::cat(GenericCategory));
+ cl::ValueDisallowed, cl::cat(GenericCategory),
+ cl::sub(*AllSubCommands));
// Define uncategorized/categorized help printers. These printers change their
// behaviour at runtime depending on whether one or more Option categories have
@@ -1675,22 +1963,23 @@ static cl::opt<HelpPrinter, true, parser<bool>>
static cl::opt<HelpPrinterWrapper, true, parser<bool>>
HOp("help", cl::desc("Display available options (-help-hidden for more)"),
cl::location(WrappedNormalPrinter), cl::ValueDisallowed,
- cl::cat(GenericCategory));
+ cl::cat(GenericCategory), cl::sub(*AllSubCommands));
static cl::opt<HelpPrinterWrapper, true, parser<bool>>
HHOp("help-hidden", cl::desc("Display all available options"),
cl::location(WrappedHiddenPrinter), cl::Hidden, cl::ValueDisallowed,
- cl::cat(GenericCategory));
+ cl::cat(GenericCategory), cl::sub(*AllSubCommands));
static cl::opt<bool> PrintOptions(
"print-options",
cl::desc("Print non-default options after command line parsing"),
- cl::Hidden, cl::init(false), cl::cat(GenericCategory));
+ cl::Hidden, cl::init(false), cl::cat(GenericCategory),
+ cl::sub(*AllSubCommands));
static cl::opt<bool> PrintAllOptions(
"print-all-options",
cl::desc("Print all option values after command line parsing"), cl::Hidden,
- cl::init(false), cl::cat(GenericCategory));
+ cl::init(false), cl::cat(GenericCategory), cl::sub(*AllSubCommands));
void HelpPrinterWrapper::operator=(bool Value) {
if (!Value)
@@ -1717,7 +2006,7 @@ void CommandLineParser::printOptionValues() {
return;
SmallVector<std::pair<const char *, Option *>, 128> Opts;
- sortOpts(OptionsMap, Opts, /*ShowHidden*/ true);
+ sortOpts(ActiveSubCommand->OptionsMap, Opts, /*ShowHidden*/ true);
// Compute the maximum argument length...
size_t MaxArgLen = 0;
@@ -1737,8 +2026,12 @@ class VersionPrinter {
public:
void print() {
raw_ostream &OS = outs();
- OS << "LLVM (http://llvm.org/):\n"
- << " " << PACKAGE_NAME << " version " << PACKAGE_VERSION;
+#ifdef PACKAGE_VENDOR
+ OS << PACKAGE_VENDOR << " ";
+#else
+ OS << "LLVM (http://llvm.org/):\n ";
+#endif
+ OS << PACKAGE_NAME << " version " << PACKAGE_VERSION;
#ifdef LLVM_VERSION_INFO
OS << " " << LLVM_VERSION_INFO;
#endif
@@ -1755,9 +2048,6 @@ public:
if (CPU == "generic")
CPU = "(unknown)";
OS << ".\n"
-#if (ENABLE_TIMESTAMPS == 1)
- << " Built " << __DATE__ << " (" << __TIME__ << ").\n"
-#endif
<< " Default target: " << sys::getDefaultTargetTriple() << '\n'
<< " Host CPU: " << CPU << '\n';
}
@@ -1825,22 +2115,26 @@ void cl::AddExtraVersionPrinter(void (*func)()) {
ExtraVersionPrinters->push_back(func);
}
-StringMap<Option *> &cl::getRegisteredOptions() {
- return GlobalParser->OptionsMap;
+StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) {
+ auto &Subs = GlobalParser->RegisteredSubCommands;
+ (void)Subs;
+ assert(std::find(Subs.begin(), Subs.end(), &Sub) != Subs.end());
+ return Sub.OptionsMap;
}
-void cl::HideUnrelatedOptions(cl::OptionCategory &Category) {
- for (auto &I : GlobalParser->OptionsMap) {
+void cl::HideUnrelatedOptions(cl::OptionCategory &Category, SubCommand &Sub) {
+ for (auto &I : Sub.OptionsMap) {
if (I.second->Category != &Category &&
I.second->Category != &GenericCategory)
I.second->setHiddenFlag(cl::ReallyHidden);
}
}
-void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories) {
+void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories,
+ SubCommand &Sub) {
auto CategoriesBegin = Categories.begin();
auto CategoriesEnd = Categories.end();
- for (auto &I : GlobalParser->OptionsMap) {
+ for (auto &I : Sub.OptionsMap) {
if (std::find(CategoriesBegin, CategoriesEnd, I.second->Category) ==
CategoriesEnd &&
I.second->Category != &GenericCategory)
@@ -1848,7 +2142,12 @@ void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories) {
}
}
+void cl::ResetCommandLineParser() { GlobalParser->reset(); }
+void cl::ResetAllOptionOccurrences() {
+ GlobalParser->ResetAllOptionOccurrences();
+}
+
void LLVMParseCommandLineOptions(int argc, const char *const *argv,
const char *Overview) {
- llvm::cl::ParseCommandLineOptions(argc, argv, Overview);
+ llvm::cl::ParseCommandLineOptions(argc, argv, Overview, true);
}
diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp
index 1bbef233b82f..217cedb24df6 100644
--- a/lib/Support/ConvertUTFWrapper.cpp
+++ b/lib/Support/ConvertUTFWrapper.cpp
@@ -8,6 +8,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ConvertUTF.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <string>
#include <vector>
@@ -36,7 +39,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
ConversionFlags flags = strictConversion;
result = ConvertUTF8toUTF16(
&sourceStart, sourceStart + Source.size(),
- &targetStart, targetStart + 2*Source.size(), flags);
+ &targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
else
@@ -49,7 +52,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
ConversionFlags flags = strictConversion;
result = ConvertUTF8toUTF32(
&sourceStart, sourceStart + Source.size(),
- &targetStart, targetStart + 4*Source.size(), flags);
+ &targetStart, targetStart + Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
else
@@ -130,6 +133,13 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
return true;
}
+bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
+{
+ return convertUTF16ToUTF8String(
+ llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
+ Src.size() * sizeof(UTF16)), Out);
+}
+
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
SmallVectorImpl<UTF16> &DstUTF16) {
assert(DstUTF16.empty());
@@ -168,5 +178,74 @@ bool convertUTF8ToUTF16String(StringRef SrcUTF8,
return true;
}
+static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
+ sizeof(wchar_t) == 4,
+ "Expected wchar_t to be 1, 2, or 4 bytes");
+
+template <typename TResult>
+static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
+ TResult &Result) {
+ // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
+ // at least as large as the number of elements in the resulting wide
+ // string, because surrogate pairs take at least 4 bytes in UTF-8.
+ Result.resize(Source.size() + 1);
+ char *ResultPtr = reinterpret_cast<char *>(&Result[0]);
+ const UTF8 *ErrorPtr;
+ if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
+ Result.clear();
+ return false;
+ }
+ Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);
+ return true;
+}
+
+bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
+ return ConvertUTF8toWideInternal(Source, Result);
+}
+
+bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
+ if (!Source) {
+ Result.clear();
+ return true;
+ }
+ return ConvertUTF8toWide(llvm::StringRef(Source), Result);
+}
+
+bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
+ if (sizeof(wchar_t) == 1) {
+ const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
+ const UTF8 *End =
+ reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
+ if (!isLegalUTF8String(&Start, End))
+ return false;
+ Result.resize(Source.size());
+ memcpy(&Result[0], Source.data(), Source.size());
+ return true;
+ } else if (sizeof(wchar_t) == 2) {
+ return convertUTF16ToUTF8String(
+ llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
+ Source.size()),
+ Result);
+ } else if (sizeof(wchar_t) == 4) {
+ const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
+ const UTF32 *End =
+ reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
+ Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
+ UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
+ UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
+ if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
+ strictConversion) == conversionOK) {
+ Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
+ return true;
+ } else {
+ Result.clear();
+ return false;
+ }
+ } else {
+ llvm_unreachable(
+ "Control should never reach this point; see static_assert further up");
+ }
+}
+
} // end namespace llvm
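
A short sketch of the wide-string helpers defined above, assuming their declarations are exposed from llvm/Support/ConvertUTF.h as part of this patch; roundTrip() is an invented helper, not part of the change:

    #include "llvm/Support/ConvertUTF.h"
    #include <string>

    // Returns true when a well-formed UTF-8 string survives the
    // UTF-8 -> wchar_t -> UTF-8 round trip unchanged.
    static bool roundTrip(const std::string &Utf8) {
      std::wstring Wide;
      if (!llvm::ConvertUTF8toWide(Utf8, Wide))
        return false; // malformed UTF-8 input
      std::string Back;
      if (!llvm::convertWideToUTF8(Wide, Back))
        return false; // conversion back to UTF-8 failed
      return Back == Utf8;
    }

This is the same pair of conversions the new _WIN32 branch of ParseEnvironmentOptions relies on for environment variable names and values.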
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index 3f4ef9da48f1..98865f5e065e 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -31,7 +31,6 @@ struct CrashRecoveryContextImpl {
const CrashRecoveryContextImpl *Next;
CrashRecoveryContext *CRC;
- std::string Backtrace;
::jmp_buf JumpBuffer;
volatile unsigned Failed : 1;
unsigned SwitchedThread : 1;
@@ -335,13 +334,6 @@ void CrashRecoveryContext::HandleCrash() {
CRCI->HandleCrash();
}
-const std::string &CrashRecoveryContext::getBacktrace() const {
- CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *) Impl;
- assert(CRC && "Crash recovery context never initialized!");
- assert(CRC->Failed && "No crash was detected!");
- return CRC->Backtrace;
-}
-
// FIXME: Portability.
static void setThreadBackgroundPriority() {
#ifdef __APPLE__
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 7d7225671737..7aea05d7701f 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -384,23 +384,22 @@ const char *llvm::dwarf::CaseString(unsigned Case) {
return nullptr;
}
-const char *llvm::dwarf::ConventionString(unsigned Convention) {
- switch (Convention) {
- case DW_CC_normal: return "DW_CC_normal";
- case DW_CC_program: return "DW_CC_program";
- case DW_CC_nocall: return "DW_CC_nocall";
- case DW_CC_lo_user: return "DW_CC_lo_user";
- case DW_CC_hi_user: return "DW_CC_hi_user";
- case DW_CC_GNU_borland_fastcall_i386: return "DW_CC_GNU_borland_fastcall_i386";
- case DW_CC_BORLAND_safecall: return "DW_CC_BORLAND_safecall";
- case DW_CC_BORLAND_stdcall: return "DW_CC_BORLAND_stdcall";
- case DW_CC_BORLAND_pascal: return "DW_CC_BORLAND_pascal";
- case DW_CC_BORLAND_msfastcall: return "DW_CC_BORLAND_msfastcall";
- case DW_CC_BORLAND_msreturn: return "DW_CC_BORLAND_msreturn";
- case DW_CC_BORLAND_thiscall: return "DW_CC_BORLAND_thiscall";
- case DW_CC_BORLAND_fastcall: return "DW_CC_BORLAND_fastcall";
+const char *llvm::dwarf::ConventionString(unsigned CC) {
+ switch (CC) {
+ default:
+ return nullptr;
+#define HANDLE_DW_CC(ID, NAME) \
+ case DW_CC_##NAME: \
+ return "DW_CC_" #NAME;
+#include "llvm/Support/Dwarf.def"
}
- return nullptr;
+}
+
+unsigned llvm::dwarf::getCallingConvention(StringRef CCString) {
+ return StringSwitch<unsigned>(CCString)
+#define HANDLE_DW_CC(ID, NAME) .Case("DW_CC_" #NAME, DW_CC_##NAME)
+#include "llvm/Support/Dwarf.def"
+ .Default(0);
}
const char *llvm::dwarf::InlineCodeString(unsigned Code) {
@@ -546,6 +545,12 @@ const char *llvm::dwarf::ApplePropertyString(unsigned Prop) {
return "DW_APPLE_PROPERTY_strong";
case DW_APPLE_PROPERTY_unsafe_unretained:
return "DW_APPLE_PROPERTY_unsafe_unretained";
+ case DW_APPLE_PROPERTY_nullability:
+ return "DW_APPLE_PROPERTY_nullability";
+ case DW_APPLE_PROPERTY_null_resettable:
+ return "DW_APPLE_PROPERTY_null_resettable";
+ case DW_APPLE_PROPERTY_class:
+ return "DW_APPLE_PROPERTY_class";
}
return nullptr;
}
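
The .def-generated helpers above can be exercised with a simple round trip; this is only a sketch and assumes the DW_CC_* constants and both helpers are visible via llvm/Support/Dwarf.h:

    #include "llvm/Support/Dwarf.h"
    #include <cassert>

    static void checkConventionRoundTrip() {
      // Known codes map to their "DW_CC_..." spelling; unknown codes yield nullptr.
      const char *Name = llvm::dwarf::ConventionString(llvm::dwarf::DW_CC_normal);
      assert(Name && "known codes have a DW_CC_* spelling");
      // Strings that are not a DW_CC_* name map back to 0.
      unsigned CC = llvm::dwarf::getCallingConvention(Name);
      assert(CC == llvm::dwarf::DW_CC_normal && "round trip failed");
      (void)Name;
      (void)CC;
    }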
diff --git a/lib/Support/Error.cpp b/lib/Support/Error.cpp
new file mode 100644
index 000000000000..6b22691eb1ad
--- /dev/null
+++ b/lib/Support/Error.cpp
@@ -0,0 +1,113 @@
+//===----- lib/Support/Error.cpp - Error and associated utilities ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Error.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+
+using namespace llvm;
+
+namespace {
+
+ enum class ErrorErrorCode : int {
+ MultipleErrors = 1,
+ InconvertibleError
+ };
+
+ // FIXME: This class is only here to support the transition to llvm::Error. It
+ // will be removed once this transition is complete. Clients should prefer to
+ // deal with the Error value directly, rather than converting to error_code.
+ class ErrorErrorCategory : public std::error_category {
+ public:
+ const char *name() const LLVM_NOEXCEPT override { return "Error"; }
+
+ std::string message(int condition) const override {
+ switch (static_cast<ErrorErrorCode>(condition)) {
+ case ErrorErrorCode::MultipleErrors:
+ return "Multiple errors";
+ case ErrorErrorCode::InconvertibleError:
+ return "Inconvertible error value. An error has occurred that could "
+ "not be converted to a known std::error_code. Please file a "
+ "bug.";
+ }
+ llvm_unreachable("Unhandled error code");
+ }
+ };
+
+}
+
+static ManagedStatic<ErrorErrorCategory> ErrorErrorCat;
+
+namespace llvm {
+
+void ErrorInfoBase::anchor() {}
+char ErrorInfoBase::ID = 0;
+char ErrorList::ID = 0;
+char ECError::ID = 0;
+char StringError::ID = 0;
+
+void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner) {
+ if (!E)
+ return;
+ OS << ErrorBanner;
+ handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+ EI.log(OS);
+ OS << "\n";
+ });
+}
+
+
+std::error_code ErrorList::convertToErrorCode() const {
+ return std::error_code(static_cast<int>(ErrorErrorCode::MultipleErrors),
+ *ErrorErrorCat);
+}
+
+std::error_code inconvertibleErrorCode() {
+ return std::error_code(static_cast<int>(ErrorErrorCode::InconvertibleError),
+ *ErrorErrorCat);
+}
+
+Error errorCodeToError(std::error_code EC) {
+ if (!EC)
+ return Error::success();
+ return Error(llvm::make_unique<ECError>(ECError(EC)));
+}
+
+std::error_code errorToErrorCode(Error Err) {
+ std::error_code EC;
+ handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
+ EC = EI.convertToErrorCode();
+ });
+ if (EC == inconvertibleErrorCode())
+ report_fatal_error(EC.message());
+ return EC;
+}
+
+StringError::StringError(const Twine &S, std::error_code EC)
+ : Msg(S.str()), EC(EC) {}
+
+void StringError::log(raw_ostream &OS) const { OS << Msg; }
+
+std::error_code StringError::convertToErrorCode() const {
+ return EC;
+}
+
+void report_fatal_error(Error Err, bool GenCrashDiag) {
+ assert(Err && "report_fatal_error called with success value");
+ std::string ErrMsg;
+ {
+ raw_string_ostream ErrStream(ErrMsg);
+ logAllUnhandledErrors(std::move(Err), ErrStream, "");
+ }
+ report_fatal_error(ErrMsg);
+}
+
+}
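
As a usage sketch for the error utilities defined in this new file, relying on the llvm::Error, StringError, and logAllUnhandledErrors declarations in llvm/Support/Error.h; doFallibleThing() is invented for illustration:

    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static Error doFallibleThing(bool Fail) {
      if (Fail)
        return make_error<StringError>("something went wrong",
                                       inconvertibleErrorCode());
      return Error::success();
    }

    static int runDemo() {
      if (Error E = doFallibleThing(true)) {
        // Prints "demo: something went wrong" and consumes the error.
        logAllUnhandledErrors(std::move(E), errs(), "demo: ");
        return 1;
      }
      return 0;
    }

Errors that are never handled or consumed abort at destruction time, which is why the failure path moves E into logAllUnhandledErrors.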
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 2808bd34af06..a7d3a18003ee 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -19,6 +19,7 @@
#include "llvm/Config/config.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"
diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
index 5316f049a38a..c6a58cc9d038 100644
--- a/lib/Support/FileUtilities.cpp
+++ b/lib/Support/FileUtilities.cpp
@@ -217,8 +217,10 @@ int llvm::DiffFilesWithTolerance(StringRef NameA,
bool CompareFailed = false;
while (1) {
// Scan for the end of file or next difference.
- while (F1P < File1End && F2P < File2End && *F1P == *F2P)
- ++F1P, ++F2P;
+ while (F1P < File1End && F2P < File2End && *F1P == *F2P) {
+ ++F1P;
+ ++F2P;
+ }
if (F1P >= File1End || F2P >= File2End) break;
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index bb0ec2defef9..52baf865d83e 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -266,12 +266,12 @@ void FoldingSetImpl::clear() {
NumNodes = 0;
}
-/// GrowHashTable - Double the size of the hash table and rehash everything.
-///
-void FoldingSetImpl::GrowHashTable() {
+void FoldingSetImpl::GrowBucketCount(unsigned NewBucketCount) {
+ assert((NewBucketCount > NumBuckets) && "Can't shrink a folding set with GrowBucketCount");
+ assert(isPowerOf2_32(NewBucketCount) && "Bad bucket count!");
void **OldBuckets = Buckets;
unsigned OldNumBuckets = NumBuckets;
- NumBuckets <<= 1;
+ NumBuckets = NewBucketCount;
// Clear out new buckets.
Buckets = AllocateBuckets(NumBuckets);
@@ -298,6 +298,21 @@ void FoldingSetImpl::GrowHashTable() {
free(OldBuckets);
}
+/// GrowHashTable - Double the size of the hash table and rehash everything.
+///
+void FoldingSetImpl::GrowHashTable() {
+ GrowBucketCount(NumBuckets * 2);
+}
+
+void FoldingSetImpl::reserve(unsigned EltCount) {
+ // This will give us somewhere between EltCount / 2 and
+ // EltCount buckets. This puts us in the load factor
+ // range of 1.0 - 2.0.
+ if (EltCount < capacity())
+ return;
+ GrowBucketCount(PowerOf2Floor(EltCount));
+}
+
/// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
/// return it. If not, return the insertion token that will make insertion
/// faster.
@@ -330,7 +345,7 @@ FoldingSetImpl::Node
void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) {
assert(!N->getNextInBucket());
// Do we need to grow the hashtable?
- if (NumNodes+1 > NumBuckets*2) {
+ if (NumNodes+1 > capacity()) {
GrowHashTable();
FoldingSetNodeID TempID;
InsertPos = GetBucketFor(ComputeNodeHash(N, TempID), Buckets, NumBuckets);
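
A brief caller-side sketch of the new reserve() entry point added above; KeyNode is an invented node type, and the exact bucket count chosen is whatever GrowBucketCount() decides:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/FoldingSet.h"
    using namespace llvm;

    struct KeyNode : FoldingSetNode {
      unsigned Key;
      explicit KeyNode(unsigned K) : Key(K) {}
      void Profile(FoldingSetNodeID &ID) const { ID.AddInteger(Key); }
    };

    static void insertAll(FoldingSet<KeyNode> &Set, ArrayRef<KeyNode *> Nodes) {
      // Grow the bucket array once up front instead of rehashing repeatedly
      // as InsertNode crosses the load-factor threshold.
      Set.reserve(static_cast<unsigned>(Nodes.size()));
      for (KeyNode *N : Nodes)
        Set.GetOrInsertNode(N);
    }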
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index c0f9e0744b5e..12ac19d3be8c 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -33,9 +33,9 @@
#include <intrin.h>
#endif
#if defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
+#include <mach/host_info.h>
#include <mach/mach.h>
#include <mach/mach_host.h>
-#include <mach/host_info.h>
#include <mach/machine.h>
#endif
@@ -69,40 +69,131 @@ static ssize_t LLVM_ATTRIBUTE_UNUSED readCpuInfo(void *Buf, size_t Size) {
}
#endif
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
- || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
-
-/// GetX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
-/// specified arguments. If we can't run cpuid on the host, return true.
-static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+#if defined(i386) || defined(__i386__) || defined(__x86__) || \
+ defined(_M_IX86) || defined(__x86_64__) || defined(_M_AMD64) || \
+ defined(_M_X64)
+
+enum VendorSignatures {
+ SIG_INTEL = 0x756e6547 /* Genu */,
+ SIG_AMD = 0x68747541 /* Auth */
+};
+
+enum ProcessorVendors {
+ VENDOR_INTEL = 1,
+ VENDOR_AMD,
+ VENDOR_OTHER,
+ VENDOR_MAX
+};
+
+enum ProcessorTypes {
+ INTEL_ATOM = 1,
+ INTEL_CORE2,
+ INTEL_COREI7,
+ AMDFAM10H,
+ AMDFAM15H,
+ INTEL_i386,
+ INTEL_i486,
+ INTEL_PENTIUM,
+ INTEL_PENTIUM_PRO,
+ INTEL_PENTIUM_II,
+ INTEL_PENTIUM_III,
+ INTEL_PENTIUM_IV,
+ INTEL_PENTIUM_M,
+ INTEL_CORE_DUO,
+ INTEL_XEONPHI,
+ INTEL_X86_64,
+ INTEL_NOCONA,
+ INTEL_PRESCOTT,
+ AMD_i486,
+ AMDPENTIUM,
+ AMDATHLON,
+ AMDFAM14H,
+ AMDFAM16H,
+ CPU_TYPE_MAX
+};
+
+enum ProcessorSubtypes {
+ INTEL_COREI7_NEHALEM = 1,
+ INTEL_COREI7_WESTMERE,
+ INTEL_COREI7_SANDYBRIDGE,
+ AMDFAM10H_BARCELONA,
+ AMDFAM10H_SHANGHAI,
+ AMDFAM10H_ISTANBUL,
+ AMDFAM15H_BDVER1,
+ AMDFAM15H_BDVER2,
+ INTEL_PENTIUM_MMX,
+ INTEL_CORE2_65,
+ INTEL_CORE2_45,
+ INTEL_COREI7_IVYBRIDGE,
+ INTEL_COREI7_HASWELL,
+ INTEL_COREI7_BROADWELL,
+ INTEL_COREI7_SKYLAKE,
+ INTEL_COREI7_SKYLAKE_AVX512,
+ INTEL_ATOM_BONNELL,
+ INTEL_ATOM_SILVERMONT,
+ INTEL_KNIGHTS_LANDING,
+ AMDPENTIUM_K6,
+ AMDPENTIUM_K62,
+ AMDPENTIUM_K63,
+ AMDPENTIUM_GEODE,
+ AMDATHLON_TBIRD,
+ AMDATHLON_MP,
+ AMDATHLON_XP,
+ AMDATHLON_K8SSE3,
+ AMDATHLON_OPTERON,
+ AMDATHLON_FX,
+ AMDATHLON_64,
+ AMD_BTVER1,
+ AMD_BTVER2,
+ AMDFAM15H_BDVER3,
+ AMDFAM15H_BDVER4,
+ CPU_SUBTYPE_MAX
+};
+
+enum ProcessorFeatures {
+ FEATURE_CMOV = 0,
+ FEATURE_MMX,
+ FEATURE_POPCNT,
+ FEATURE_SSE,
+ FEATURE_SSE2,
+ FEATURE_SSE3,
+ FEATURE_SSSE3,
+ FEATURE_SSE4_1,
+ FEATURE_SSE4_2,
+ FEATURE_AVX,
+ FEATURE_AVX2,
+ FEATURE_AVX512,
+ FEATURE_AVX512SAVE,
+ FEATURE_MOVBE,
+ FEATURE_ADX,
+ FEATURE_EM64T
+};
+
+/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+/// the specified arguments. If we can't run cpuid on the host, return true.
+static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
unsigned *rECX, unsigned *rEDX) {
#if defined(__GNUC__) || defined(__clang__)
- #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
- // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
- asm ("movq\t%%rbx, %%rsi\n\t"
- "cpuid\n\t"
- "xchgq\t%%rbx, %%rsi\n\t"
- : "=a" (*rEAX),
- "=S" (*rEBX),
- "=c" (*rECX),
- "=d" (*rEDX)
- : "a" (value));
- return false;
- #elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
- asm ("movl\t%%ebx, %%esi\n\t"
- "cpuid\n\t"
- "xchgl\t%%ebx, %%esi\n\t"
- : "=a" (*rEAX),
- "=S" (*rEBX),
- "=c" (*rECX),
- "=d" (*rEDX)
- : "a" (value));
- return false;
+#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+ asm("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+ : "a"(value));
+ return false;
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ asm("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+ : "a"(value));
+ return false;
// pedantic #else returns to appease -Wunreachable-code (so we don't generate
// postprocessed code that looks like "return true; return false;")
- #else
- return true;
- #endif
+#else
+ return true;
+#endif
#elif defined(_MSC_VER)
// The MSVC intrinsic is portable across x86 and x64.
int registers[4];
@@ -117,50 +208,42 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
#endif
}
-/// GetX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
-/// 4 values in the specified arguments. If we can't run cpuid on the host,
+/// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return
+/// the 4 values in the specified arguments. If we can't run cpuid on the host,
/// return true.
-static bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
+static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
unsigned *rEAX, unsigned *rEBX, unsigned *rECX,
unsigned *rEDX) {
-#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
- #if defined(__GNUC__)
- // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
- asm ("movq\t%%rbx, %%rsi\n\t"
- "cpuid\n\t"
- "xchgq\t%%rbx, %%rsi\n\t"
- : "=a" (*rEAX),
- "=S" (*rEBX),
- "=c" (*rECX),
- "=d" (*rEDX)
- : "a" (value),
- "c" (subleaf));
- return false;
- #elif defined(_MSC_VER)
- int registers[4];
- __cpuidex(registers, value, subleaf);
- *rEAX = registers[0];
- *rEBX = registers[1];
- *rECX = registers[2];
- *rEDX = registers[3];
- return false;
- #else
- return true;
- #endif
+#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+#if defined(__GNUC__)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+ asm("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+ : "a"(value), "c"(subleaf));
+ return false;
+#elif defined(_MSC_VER)
+ int registers[4];
+ __cpuidex(registers, value, subleaf);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+#else
+ return true;
+#endif
#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
- #if defined(__GNUC__)
- asm ("movl\t%%ebx, %%esi\n\t"
- "cpuid\n\t"
- "xchgl\t%%ebx, %%esi\n\t"
- : "=a" (*rEAX),
- "=S" (*rEBX),
- "=c" (*rECX),
- "=d" (*rEDX)
- : "a" (value),
- "c" (subleaf));
- return false;
- #elif defined(_MSC_VER)
- __asm {
+#if defined(__GNUC__)
+ asm("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+ : "a"(value), "c"(subleaf));
+ return false;
+#elif defined(_MSC_VER)
+ __asm {
mov eax,value
mov ecx,subleaf
cpuid
@@ -172,22 +255,22 @@ static bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
mov dword ptr [esi],ecx
mov esi,rEDX
mov dword ptr [esi],edx
- }
- return false;
- #else
- return true;
- #endif
+ }
+ return false;
+#else
+ return true;
+#endif
#else
return true;
#endif
}
-static bool GetX86XCR0(unsigned *rEAX, unsigned *rEDX) {
+static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) {
#if defined(__GNUC__)
// Check xgetbv; this uses a .byte sequence instead of the instruction
// directly because older assemblers do not include support for xgetbv and
// there is no easy way to conditionally compile based on the assembler used.
- __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (*rEAX), "=d" (*rEDX) : "c" (0));
+ __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(*rEAX), "=d"(*rEDX) : "c"(0));
return false;
#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
@@ -199,342 +282,669 @@ static bool GetX86XCR0(unsigned *rEAX, unsigned *rEDX) {
#endif
}
-static void DetectX86FamilyModel(unsigned EAX, unsigned &Family,
- unsigned &Model) {
- Family = (EAX >> 8) & 0xf; // Bits 8 - 11
- Model = (EAX >> 4) & 0xf; // Bits 4 - 7
- if (Family == 6 || Family == 0xf) {
- if (Family == 0xf)
+static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
+ unsigned *Model) {
+ *Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ *Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ if (*Family == 6 || *Family == 0xf) {
+ if (*Family == 0xf)
// Examine extended family ID if family ID is F.
- Family += (EAX >> 20) & 0xff; // Bits 20 - 27
+ *Family += (EAX >> 20) & 0xff; // Bits 20 - 27
// Examine extended model ID if family ID is 6 or F.
- Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+ *Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
}
}
-StringRef sys::getHostCPUName() {
- unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
- if (GetX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
- return "generic";
- unsigned Family = 0;
- unsigned Model = 0;
- DetectX86FamilyModel(EAX, Family, Model);
-
- union {
- unsigned u[3];
- char c[12];
- } text;
-
- unsigned MaxLeaf;
- GetX86CpuIDAndInfo(0, &MaxLeaf, text.u+0, text.u+2, text.u+1);
-
- bool HasMMX = (EDX >> 23) & 1;
- bool HasSSE = (EDX >> 25) & 1;
- bool HasSSE2 = (EDX >> 26) & 1;
- bool HasSSE3 = (ECX >> 0) & 1;
- bool HasSSSE3 = (ECX >> 9) & 1;
- bool HasSSE41 = (ECX >> 19) & 1;
- bool HasSSE42 = (ECX >> 20) & 1;
- bool HasMOVBE = (ECX >> 22) & 1;
- // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
- // indicates that the AVX registers will be saved and restored on context
- // switch, then we have full AVX support.
- const unsigned AVXBits = (1 << 27) | (1 << 28);
- bool HasAVX = ((ECX & AVXBits) == AVXBits) && !GetX86XCR0(&EAX, &EDX) &&
- ((EAX & 0x6) == 0x6);
- bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
- bool HasLeaf7 = MaxLeaf >= 0x7 &&
- !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
- bool HasADX = HasLeaf7 && ((EBX >> 19) & 1);
- bool HasAVX2 = HasAVX && HasLeaf7 && (EBX & 0x20);
- bool HasAVX512 = HasLeaf7 && HasAVX512Save && ((EBX >> 16) & 1);
-
- GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
- bool Em64T = (EDX >> 29) & 0x1;
- bool HasTBM = (ECX >> 21) & 0x1;
-
- if (memcmp(text.c, "GenuineIntel", 12) == 0) {
- switch (Family) {
- case 3:
- return "i386";
- case 4:
- switch (Model) {
- case 0: // Intel486 DX processors
- case 1: // Intel486 DX processors
- case 2: // Intel486 SX processors
- case 3: // Intel487 processors, IntelDX2 OverDrive processors,
- // IntelDX2 processors
- case 4: // Intel486 SL processor
- case 5: // IntelSX2 processors
- case 7: // Write-Back Enhanced IntelDX2 processors
- case 8: // IntelDX4 OverDrive processors, IntelDX4 processors
- default: return "i486";
- }
- case 5:
- switch (Model) {
- case 1: // Pentium OverDrive processor for Pentium processor (60, 66),
- // Pentium processors (60, 66)
- case 2: // Pentium OverDrive processor for Pentium processor (75, 90,
- // 100, 120, 133), Pentium processors (75, 90, 100, 120, 133,
- // 150, 166, 200)
- case 3: // Pentium OverDrive processors for Intel486 processor-based
- // systems
- return "pentium";
-
- case 4: // Pentium OverDrive processor with MMX technology for Pentium
- // processor (75, 90, 100, 120, 133), Pentium processor with
- // MMX technology (166, 200)
- return "pentium-mmx";
-
- default: return "pentium";
- }
- case 6:
- switch (Model) {
- case 1: // Pentium Pro processor
- return "pentiumpro";
-
- case 3: // Intel Pentium II OverDrive processor, Pentium II processor,
+static void
+getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
+ unsigned int Brand_id, unsigned int Features,
+ unsigned *Type, unsigned *Subtype) {
+ if (Brand_id != 0)
+ return;
+ switch (Family) {
+ case 3:
+ *Type = INTEL_i386;
+ break;
+ case 4:
+ switch (Model) {
+ case 0: // Intel486 DX processors
+ case 1: // Intel486 DX processors
+ case 2: // Intel486 SX processors
+ case 3: // Intel487 processors, IntelDX2 OverDrive processors,
+ // IntelDX2 processors
+ case 4: // Intel486 SL processor
+ case 5: // IntelSX2 processors
+ case 7: // Write-Back Enhanced IntelDX2 processors
+ case 8: // IntelDX4 OverDrive processors, IntelDX4 processors
+ default:
+ *Type = INTEL_i486;
+ break;
+ }
+ break;
+ case 5:
+ switch (Model) {
+ case 1: // Pentium OverDrive processor for Pentium processor (60, 66),
+ // Pentium processors (60, 66)
+ case 2: // Pentium OverDrive processor for Pentium processor (75, 90,
+ // 100, 120, 133), Pentium processors (75, 90, 100, 120, 133,
+ // 150, 166, 200)
+ case 3: // Pentium OverDrive processors for Intel486 processor-based
+ // systems
+ *Type = INTEL_PENTIUM;
+ break;
+ case 4: // Pentium OverDrive processor with MMX technology for Pentium
+ // processor (75, 90, 100, 120, 133), Pentium processor with
+ // MMX technology (166, 200)
+ *Type = INTEL_PENTIUM;
+ *Subtype = INTEL_PENTIUM_MMX;
+ break;
+ default:
+ *Type = INTEL_PENTIUM;
+ break;
+ }
+ break;
+ case 6:
+ switch (Model) {
+ case 0x01: // Pentium Pro processor
+ *Type = INTEL_PENTIUM_PRO;
+ break;
+ case 0x03: // Intel Pentium II OverDrive processor, Pentium II processor,
// model 03
- case 5: // Pentium II processor, model 05, Pentium II Xeon processor,
+ case 0x05: // Pentium II processor, model 05, Pentium II Xeon processor,
// model 05, and Intel Celeron processor, model 05
- case 6: // Celeron processor, model 06
- return "pentium2";
-
- case 7: // Pentium III processor, model 07, and Pentium III Xeon
+ case 0x06: // Celeron processor, model 06
+ *Type = INTEL_PENTIUM_II;
+ break;
+ case 0x07: // Pentium III processor, model 07, and Pentium III Xeon
// processor, model 07
- case 8: // Pentium III processor, model 08, Pentium III Xeon processor,
+ case 0x08: // Pentium III processor, model 08, Pentium III Xeon processor,
// model 08, and Celeron processor, model 08
- case 10: // Pentium III Xeon processor, model 0Ah
- case 11: // Pentium III processor, model 0Bh
- return "pentium3";
-
- case 9: // Intel Pentium M processor, Intel Celeron M processor model 09.
- case 13: // Intel Pentium M processor, Intel Celeron M processor, model
+ case 0x0a: // Pentium III Xeon processor, model 0Ah
+ case 0x0b: // Pentium III processor, model 0Bh
+ *Type = INTEL_PENTIUM_III;
+ break;
+ case 0x09: // Intel Pentium M processor, Intel Celeron M processor model 09.
+ case 0x0d: // Intel Pentium M processor, Intel Celeron M processor, model
// 0Dh. All processors are manufactured using the 90 nm process.
- case 21: // Intel EP80579 Integrated Processor and Intel EP80579
+ case 0x15: // Intel EP80579 Integrated Processor and Intel EP80579
// Integrated Processor with Intel QuickAssist Technology
- return "pentium-m";
-
- case 14: // Intel Core Duo processor, Intel Core Solo processor, model
+ *Type = INTEL_PENTIUM_M;
+ break;
+ case 0x0e: // Intel Core Duo processor, Intel Core Solo processor, model
// 0Eh. All processors are manufactured using the 65 nm process.
- return "yonah";
-
- case 15: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
+ *Type = INTEL_CORE_DUO;
+ break; // yonah
+ case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
// processor, Intel Core 2 Quad processor, Intel Core 2 Quad
// mobile processor, Intel Core 2 Extreme processor, Intel
// Pentium Dual-Core processor, Intel Xeon processor, model
// 0Fh. All processors are manufactured using the 65 nm process.
- case 22: // Intel Celeron processor model 16h. All processors are
+ case 0x16: // Intel Celeron processor model 16h. All processors are
// manufactured using the 65 nm process
- return "core2";
-
- case 23: // Intel Core 2 Extreme processor, Intel Xeon processor, model
+ *Type = INTEL_CORE2; // "core2"
+ *Subtype = INTEL_CORE2_65;
+ break;
+ case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model
// 17h. All processors are manufactured using the 45 nm process.
//
// 45nm: Penryn , Wolfdale, Yorkfield (XE)
- case 29: // Intel Xeon processor MP. All processors are manufactured using
+ case 0x1d: // Intel Xeon processor MP. All processors are manufactured using
// the 45 nm process.
- return "penryn";
-
- case 26: // Intel Core i7 processor and Intel Xeon processor. All
+ *Type = INTEL_CORE2; // "penryn"
+ *Subtype = INTEL_CORE2_45;
+ break;
+ case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All
// processors are manufactured using the 45 nm process.
- case 30: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz.
+ case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz.
// As found in a Summer 2010 model iMac.
- case 46: // Nehalem EX
- return "nehalem";
- case 37: // Intel Core i7, laptop version.
- case 44: // Intel Core i7 processor and Intel Xeon processor. All
+ case 0x1f:
+ case 0x2e: // Nehalem EX
+ *Type = INTEL_COREI7; // "nehalem"
+ *Subtype = INTEL_COREI7_NEHALEM;
+ break;
+ case 0x25: // Intel Core i7, laptop version.
+ case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All
// processors are manufactured using the 32 nm process.
- case 47: // Westmere EX
- return "westmere";
-
- // SandyBridge:
- case 42: // Intel Core i7 processor. All processors are manufactured
+ case 0x2f: // Westmere EX
+ *Type = INTEL_COREI7; // "westmere"
+ *Subtype = INTEL_COREI7_WESTMERE;
+ break;
+ case 0x2a: // Intel Core i7 processor. All processors are manufactured
// using the 32 nm process.
- case 45:
- return "sandybridge";
+ case 0x2d:
+ *Type = INTEL_COREI7; //"sandybridge"
+ *Subtype = INTEL_COREI7_SANDYBRIDGE;
+ break;
+ case 0x3a:
+ case 0x3e: // Ivy Bridge EP
+ *Type = INTEL_COREI7; // "ivybridge"
+ *Subtype = INTEL_COREI7_IVYBRIDGE;
+ break;
- // Ivy Bridge:
- case 58:
- case 62: // Ivy Bridge EP
- return "ivybridge";
+ // Haswell:
+ case 0x3c:
+ case 0x3f:
+ case 0x45:
+ case 0x46:
+ *Type = INTEL_COREI7; // "haswell"
+ *Subtype = INTEL_COREI7_HASWELL;
+ break;
- // Haswell:
- case 60:
- case 63:
- case 69:
- case 70:
- return "haswell";
+ // Broadwell:
+ case 0x3d:
+ case 0x47:
+ case 0x4f:
+ case 0x56:
+ *Type = INTEL_COREI7; // "broadwell"
+ *Subtype = INTEL_COREI7_BROADWELL;
+ break;
- // Broadwell:
- case 61:
- case 71:
- return "broadwell";
+ // Skylake:
+ case 0x4e:
+ *Type = INTEL_COREI7; // "skylake-avx512"
+ *Subtype = INTEL_COREI7_SKYLAKE_AVX512;
+ break;
+ case 0x5e:
+ *Type = INTEL_COREI7; // "skylake"
+ *Subtype = INTEL_COREI7_SKYLAKE;
+ break;
- // Skylake:
- case 78:
- case 94:
- return "skylake";
+ case 0x1c: // Most 45 nm Intel Atom processors
+ case 0x26: // 45 nm Atom Lincroft
+ case 0x27: // 32 nm Atom Medfield
+ case 0x35: // 32 nm Atom Midview
+ case 0x36: // 32 nm Atom Midview
+ *Type = INTEL_ATOM;
+ *Subtype = INTEL_ATOM_BONNELL;
+ break; // "bonnell"
+
+ // Atom Silvermont codes from the Intel software optimization guide.
+ case 0x37:
+ case 0x4a:
+ case 0x4d:
+ case 0x5a:
+ case 0x5d:
+ case 0x4c: // really airmont
+ *Type = INTEL_ATOM;
+ *Subtype = INTEL_ATOM_SILVERMONT;
+ break; // "silvermont"
+
+ case 0x57:
+ *Type = INTEL_XEONPHI; // knl
+ *Subtype = INTEL_KNIGHTS_LANDING;
+ break;
- case 28: // Most 45 nm Intel Atom processors
- case 38: // 45 nm Atom Lincroft
- case 39: // 32 nm Atom Medfield
- case 53: // 32 nm Atom Midview
- case 54: // 32 nm Atom Midview
- return "bonnell";
+ default: // Unknown family 6 CPU, try to guess.
+ if (Features & (1 << FEATURE_AVX512)) {
+ *Type = INTEL_XEONPHI; // knl
+ *Subtype = INTEL_KNIGHTS_LANDING;
+ break;
+ }
+ if (Features & (1 << FEATURE_ADX)) {
+ *Type = INTEL_COREI7;
+ *Subtype = INTEL_COREI7_BROADWELL;
+ break;
+ }
+ if (Features & (1 << FEATURE_AVX2)) {
+ *Type = INTEL_COREI7;
+ *Subtype = INTEL_COREI7_HASWELL;
+ break;
+ }
+ if (Features & (1 << FEATURE_AVX)) {
+ *Type = INTEL_COREI7;
+ *Subtype = INTEL_COREI7_SANDYBRIDGE;
+ break;
+ }
+ if (Features & (1 << FEATURE_SSE4_2)) {
+ if (Features & (1 << FEATURE_MOVBE)) {
+ *Type = INTEL_ATOM;
+ *Subtype = INTEL_ATOM_SILVERMONT;
+ } else {
+ *Type = INTEL_COREI7;
+ *Subtype = INTEL_COREI7_NEHALEM;
+ }
+ break;
+ }
+ if (Features & (1 << FEATURE_SSE4_1)) {
+ *Type = INTEL_CORE2; // "penryn"
+ *Subtype = INTEL_CORE2_45;
+ break;
+ }
+ if (Features & (1 << FEATURE_SSSE3)) {
+ if (Features & (1 << FEATURE_MOVBE)) {
+ *Type = INTEL_ATOM;
+ *Subtype = INTEL_ATOM_BONNELL; // "bonnell"
+ } else {
+ *Type = INTEL_CORE2; // "core2"
+ *Subtype = INTEL_CORE2_65;
+ }
+ break;
+ }
+ if (Features & (1 << FEATURE_EM64T)) {
+ *Type = INTEL_X86_64;
+ break; // x86-64
+ }
+ if (Features & (1 << FEATURE_SSE2)) {
+ *Type = INTEL_PENTIUM_M;
+ break;
+ }
+ if (Features & (1 << FEATURE_SSE)) {
+ *Type = INTEL_PENTIUM_III;
+ break;
+ }
+ if (Features & (1 << FEATURE_MMX)) {
+ *Type = INTEL_PENTIUM_II;
+ break;
+ }
+ *Type = INTEL_PENTIUM_PRO;
+ break;
+ }
+ break;
+ case 15: {
+ switch (Model) {
+ case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
+ // model 00h and manufactured using the 0.18 micron process.
+ case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
+ // processor MP, and Intel Celeron processor. All processors are
+ // model 01h and manufactured using the 0.18 micron process.
+ case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
+ // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
+ // processor, and Mobile Intel Celeron processor. All processors
+ // are model 02h and manufactured using the 0.13 micron process.
+ *Type =
+ ((Features & (1 << FEATURE_EM64T)) ? INTEL_X86_64 : INTEL_PENTIUM_IV);
+ break;
- // Atom Silvermont codes from the Intel software optimization guide.
- case 55:
- case 74:
- case 77:
- case 90:
- case 93:
- return "silvermont";
+ case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
+ // processor. All processors are model 03h and manufactured using
+ // the 90 nm process.
+ case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
+ // Pentium D processor, Intel Xeon processor, Intel Xeon
+ // processor MP, Intel Celeron D processor. All processors are
+ // model 04h and manufactured using the 90 nm process.
+ case 6: // Pentium 4 processor, Pentium D processor, Pentium processor
+ // Extreme Edition, Intel Xeon processor, Intel Xeon processor
+ // MP, Intel Celeron D processor. All processors are model 06h
+ // and manufactured using the 65 nm process.
+ *Type =
+ ((Features & (1 << FEATURE_EM64T)) ? INTEL_NOCONA : INTEL_PRESCOTT);
+ break;
- default: // Unknown family 6 CPU, try to guess.
- if (HasAVX512)
- return "knl";
- if (HasADX)
- return "broadwell";
- if (HasAVX2)
- return "haswell";
- if (HasAVX)
- return "sandybridge";
- if (HasSSE42)
- return HasMOVBE ? "silvermont" : "nehalem";
- if (HasSSE41)
- return "penryn";
- if (HasSSSE3)
- return HasMOVBE ? "bonnell" : "core2";
- if (Em64T)
- return "x86-64";
- if (HasSSE2)
- return "pentium-m";
- if (HasSSE)
- return "pentium3";
- if (HasMMX)
- return "pentium2";
- return "pentiumpro";
- }
- case 15: {
- switch (Model) {
- case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
- // model 00h and manufactured using the 0.18 micron process.
- case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
- // processor MP, and Intel Celeron processor. All processors are
- // model 01h and manufactured using the 0.18 micron process.
- case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
- // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
- // processor, and Mobile Intel Celeron processor. All processors
- // are model 02h and manufactured using the 0.13 micron process.
- return (Em64T) ? "x86-64" : "pentium4";
-
- case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
- // processor. All processors are model 03h and manufactured using
- // the 90 nm process.
- case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
- // Pentium D processor, Intel Xeon processor, Intel Xeon
- // processor MP, Intel Celeron D processor. All processors are
- // model 04h and manufactured using the 90 nm process.
- case 6: // Pentium 4 processor, Pentium D processor, Pentium processor
- // Extreme Edition, Intel Xeon processor, Intel Xeon processor
- // MP, Intel Celeron D processor. All processors are model 06h
- // and manufactured using the 65 nm process.
- return (Em64T) ? "nocona" : "prescott";
+ default:
+ *Type =
+ ((Features & (1 << FEATURE_EM64T)) ? INTEL_X86_64 : INTEL_PENTIUM_IV);
+ break;
+ }
+ break;
+ }
+ default:
+ break; /*"generic"*/
+ }
+}
- default:
- return (Em64T) ? "x86-64" : "pentium4";
- }
+static void getAMDProcessorTypeAndSubtype(unsigned int Family,
+ unsigned int Model,
+ unsigned int Features,
+ unsigned *Type,
+ unsigned *Subtype) {
+ // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
+ // appears to be no way to generate the wide variety of AMD-specific targets
+ // from the information returned from CPUID.
+ switch (Family) {
+ case 4:
+ *Type = AMD_i486;
+ break;
+ case 5:
+ *Type = AMDPENTIUM;
+ switch (Model) {
+ case 6:
+ case 7:
+ *Subtype = AMDPENTIUM_K6;
+ break; // "k6"
+ case 8:
+ *Subtype = AMDPENTIUM_K62;
+ break; // "k6-2"
+ case 9:
+ case 13:
+ *Subtype = AMDPENTIUM_K63;
+ break; // "k6-3"
+ case 10:
+ *Subtype = AMDPENTIUM_GEODE;
+ break; // "geode"
+ }
+ break;
+ case 6:
+ *Type = AMDATHLON;
+ switch (Model) {
+ case 4:
+ *Subtype = AMDATHLON_TBIRD;
+ break; // "athlon-tbird"
+ case 6:
+ case 7:
+ case 8:
+ *Subtype = AMDATHLON_MP;
+ break; // "athlon-mp"
+ case 10:
+ *Subtype = AMDATHLON_XP;
+ break; // "athlon-xp"
}
+ break;
+ case 15:
+ *Type = AMDATHLON;
+ if (Features & (1 << FEATURE_SSE3)) {
+ *Subtype = AMDATHLON_K8SSE3;
+ break; // "k8-sse3"
+ }
+ switch (Model) {
+ case 1:
+ *Subtype = AMDATHLON_OPTERON;
+ break; // "opteron"
+ case 5:
+ *Subtype = AMDATHLON_FX;
+ break; // "athlon-fx"; also opteron
+ default:
+ *Subtype = AMDATHLON_64;
+ break; // "athlon64"
+ }
+ break;
+ case 16:
+ *Type = AMDFAM10H; // "amdfam10"
+ switch (Model) {
+ case 2:
+ *Subtype = AMDFAM10H_BARCELONA;
+ break;
+ case 4:
+ *Subtype = AMDFAM10H_SHANGHAI;
+ break;
+ case 8:
+ *Subtype = AMDFAM10H_ISTANBUL;
+ break;
+ }
+ break;
+ case 20:
+ *Type = AMDFAM14H;
+ *Subtype = AMD_BTVER1;
+ break; // "btver1";
+ case 21:
+ *Type = AMDFAM15H;
+ if (!(Features &
+ (1 << FEATURE_AVX))) { // If no AVX support, provide a sane fallback.
+ *Subtype = AMD_BTVER1;
+ break; // "btver1"
+ }
+ if (Model >= 0x50 && Model <= 0x6f) {
+ *Subtype = AMDFAM15H_BDVER4;
+ break; // "bdver4"; 50h-6Fh: Excavator
+ }
+ if (Model >= 0x30 && Model <= 0x3f) {
+ *Subtype = AMDFAM15H_BDVER3;
+ break; // "bdver3"; 30h-3Fh: Steamroller
+ }
+ if (Model >= 0x10 && Model <= 0x1f) {
+ *Subtype = AMDFAM15H_BDVER2;
+ break; // "bdver2"; 10h-1Fh: Piledriver
+ }
+ if (Model <= 0x0f) {
+ *Subtype = AMDFAM15H_BDVER1;
+ break; // "bdver1"; 00h-0Fh: Bulldozer
+ }
+ break;
+ case 22:
+ *Type = AMDFAM16H;
+ if (!(Features &
+ (1 << FEATURE_AVX))) { // If no AVX support provide a sane fallback.
+ *Subtype = AMD_BTVER1;
+ break; // "btver1";
+ }
+ *Subtype = AMD_BTVER2;
+ break; // "btver2"
+ default:
+ break; // "generic"
+ }
+}
+
+static unsigned getAvailableFeatures(unsigned int ECX, unsigned int EDX,
+ unsigned MaxLeaf) {
+ unsigned Features = 0;
+ unsigned int EAX, EBX;
+ Features |= (((EDX >> 23) & 1) << FEATURE_MMX);
+ Features |= (((EDX >> 25) & 1) << FEATURE_SSE);
+ Features |= (((EDX >> 26) & 1) << FEATURE_SSE2);
+ Features |= (((ECX >> 0) & 1) << FEATURE_SSE3);
+ Features |= (((ECX >> 9) & 1) << FEATURE_SSSE3);
+ Features |= (((ECX >> 19) & 1) << FEATURE_SSE4_1);
+ Features |= (((ECX >> 20) & 1) << FEATURE_SSE4_2);
+ Features |= (((ECX >> 22) & 1) << FEATURE_MOVBE);
+
+ // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
+ // indicates that the AVX registers will be saved and restored on context
+ // switch, then we have full AVX support.
+ const unsigned AVXBits = (1 << 27) | (1 << 28);
+ bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) &&
+ ((EAX & 0x6) == 0x6);
+ bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
+ bool HasLeaf7 =
+ MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
+ bool HasADX = HasLeaf7 && ((EBX >> 19) & 1);
+ bool HasAVX2 = HasAVX && HasLeaf7 && (EBX & 0x20);
+ bool HasAVX512 = HasLeaf7 && HasAVX512Save && ((EBX >> 16) & 1);
+ Features |= (HasAVX << FEATURE_AVX);
+ Features |= (HasAVX2 << FEATURE_AVX2);
+ Features |= (HasAVX512 << FEATURE_AVX512);
+ Features |= (HasAVX512Save << FEATURE_AVX512SAVE);
+ Features |= (HasADX << FEATURE_ADX);
+
+ getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ Features |= (((EDX >> 29) & 0x1) << FEATURE_EM64T);
+ return Features;
+}
+StringRef sys::getHostCPUName() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ unsigned MaxLeaf, Vendor;
+
+ if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX))
+ return "generic";
+ if (getX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ return "generic";
+
+ unsigned Brand_id = EBX & 0xff;
+ unsigned Family = 0, Model = 0;
+ unsigned Features = 0;
+ detectX86FamilyModel(EAX, &Family, &Model);
+ Features = getAvailableFeatures(ECX, EDX, MaxLeaf);
+
+ unsigned Type;
+ unsigned Subtype;
+
+ if (Vendor == SIG_INTEL) {
+ getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features, &Type,
+ &Subtype);
+ switch (Type) {
+ case INTEL_i386:
+ return "i386";
+ case INTEL_i486:
+ return "i486";
+ case INTEL_PENTIUM:
+ if (Subtype == INTEL_PENTIUM_MMX)
+ return "pentium-mmx";
+ return "pentium";
+ case INTEL_PENTIUM_PRO:
+ return "pentiumpro";
+ case INTEL_PENTIUM_II:
+ return "pentium2";
+ case INTEL_PENTIUM_III:
+ return "pentium3";
+ case INTEL_PENTIUM_IV:
+ return "pentium4";
+ case INTEL_PENTIUM_M:
+ return "pentium-m";
+ case INTEL_CORE_DUO:
+ return "yonah";
+ case INTEL_CORE2:
+ switch (Subtype) {
+ case INTEL_CORE2_65:
+ return "core2";
+ case INTEL_CORE2_45:
+ return "penryn";
+ default:
+ return "core2";
+ }
+ case INTEL_COREI7:
+ switch (Subtype) {
+ case INTEL_COREI7_NEHALEM:
+ return "nehalem";
+ case INTEL_COREI7_WESTMERE:
+ return "westmere";
+ case INTEL_COREI7_SANDYBRIDGE:
+ return "sandybridge";
+ case INTEL_COREI7_IVYBRIDGE:
+ return "ivybridge";
+ case INTEL_COREI7_HASWELL:
+ return "haswell";
+ case INTEL_COREI7_BROADWELL:
+ return "broadwell";
+ case INTEL_COREI7_SKYLAKE:
+ return "skylake";
+ case INTEL_COREI7_SKYLAKE_AVX512:
+ return "skylake-avx512";
+ default:
+ return "corei7";
+ }
+ case INTEL_ATOM:
+ switch (Subtype) {
+ case INTEL_ATOM_BONNELL:
+ return "bonnell";
+ case INTEL_ATOM_SILVERMONT:
+ return "silvermont";
+ default:
+ return "atom";
+ }
+ case INTEL_XEONPHI:
+ return "knl"; /*update for more variants added*/
+ case INTEL_X86_64:
+ return "x86-64";
+ case INTEL_NOCONA:
+ return "nocona";
+ case INTEL_PRESCOTT:
+ return "prescott";
default:
return "generic";
}
- } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
- // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
- // appears to be no way to generate the wide variety of AMD-specific targets
- // from the information returned from CPUID.
- switch (Family) {
- case 4:
- return "i486";
- case 5:
- switch (Model) {
- case 6:
- case 7: return "k6";
- case 8: return "k6-2";
- case 9:
- case 13: return "k6-3";
- case 10: return "geode";
- default: return "pentium";
- }
- case 6:
- switch (Model) {
- case 4: return "athlon-tbird";
- case 6:
- case 7:
- case 8: return "athlon-mp";
- case 10: return "athlon-xp";
- default: return "athlon";
- }
- case 15:
- if (HasSSE3)
- return "k8-sse3";
- switch (Model) {
- case 1: return "opteron";
- case 5: return "athlon-fx"; // also opteron
- default: return "athlon64";
- }
- case 16:
- return "amdfam10";
- case 20:
+ } else if (Vendor == SIG_AMD) {
+ getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type, &Subtype);
+ switch (Type) {
+ case AMD_i486:
+ return "i486";
+ case AMDPENTIUM:
+ switch (Subtype) {
+ case AMDPENTIUM_K6:
+ return "k6";
+ case AMDPENTIUM_K62:
+ return "k6-2";
+ case AMDPENTIUM_K63:
+ return "k6-3";
+ case AMDPENTIUM_GEODE:
+ return "geode";
+ default:
+ return "pentium";
+ }
+ case AMDATHLON:
+ switch (Subtype) {
+ case AMDATHLON_TBIRD:
+ return "athlon-tbird";
+ case AMDATHLON_MP:
+ return "athlon-mp";
+ case AMDATHLON_XP:
+ return "athlon-xp";
+ case AMDATHLON_K8SSE3:
+ return "k8-sse3";
+ case AMDATHLON_OPTERON:
+ return "opteron";
+ case AMDATHLON_FX:
+ return "athlon-fx";
+ case AMDATHLON_64:
+ return "athlon64";
+ default:
+ return "athlon";
+ }
+ case AMDFAM10H:
+ if(Subtype == AMDFAM10H_BARCELONA)
+ return "barcelona";
+ return "amdfam10";
+ case AMDFAM14H:
+ return "btver1";
+ case AMDFAM15H:
+ switch (Subtype) {
+ case AMDFAM15H_BDVER1:
+ return "bdver1";
+ case AMDFAM15H_BDVER2:
+ return "bdver2";
+ case AMDFAM15H_BDVER3:
+ return "bdver3";
+ case AMDFAM15H_BDVER4:
+ return "bdver4";
+ case AMD_BTVER1:
return "btver1";
- case 21:
- if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
- return "btver1";
- if (Model >= 0x50)
- return "bdver4"; // 50h-6Fh: Excavator
- if (Model >= 0x30)
- return "bdver3"; // 30h-3Fh: Steamroller
- if (Model >= 0x10 || HasTBM)
- return "bdver2"; // 10h-1Fh: Piledriver
- return "bdver1"; // 00h-0Fh: Bulldozer
- case 22:
- if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
- return "btver1";
+ default:
+ return "amdfam15";
+ }
+ case AMDFAM16H:
+ switch (Subtype) {
+ case AMD_BTVER1:
+ return "btver1";
+ case AMD_BTVER2:
return "btver2";
+ default:
+ return "amdfam16";
+ }
default:
return "generic";
}
}
return "generic";
}
+
#elif defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
StringRef sys::getHostCPUName() {
host_basic_info_data_t hostInfo;
mach_msg_type_number_t infoCount;
infoCount = HOST_BASIC_INFO_COUNT;
- host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo,
+ host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo,
&infoCount);
-
- if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
-
- switch(hostInfo.cpu_subtype) {
- case CPU_SUBTYPE_POWERPC_601: return "601";
- case CPU_SUBTYPE_POWERPC_602: return "602";
- case CPU_SUBTYPE_POWERPC_603: return "603";
- case CPU_SUBTYPE_POWERPC_603e: return "603e";
- case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
- case CPU_SUBTYPE_POWERPC_604: return "604";
- case CPU_SUBTYPE_POWERPC_604e: return "604e";
- case CPU_SUBTYPE_POWERPC_620: return "620";
- case CPU_SUBTYPE_POWERPC_750: return "750";
- case CPU_SUBTYPE_POWERPC_7400: return "7400";
- case CPU_SUBTYPE_POWERPC_7450: return "7450";
- case CPU_SUBTYPE_POWERPC_970: return "970";
- default: ;
+
+ if (hostInfo.cpu_type != CPU_TYPE_POWERPC)
+ return "generic";
+
+ switch (hostInfo.cpu_subtype) {
+ case CPU_SUBTYPE_POWERPC_601:
+ return "601";
+ case CPU_SUBTYPE_POWERPC_602:
+ return "602";
+ case CPU_SUBTYPE_POWERPC_603:
+ return "603";
+ case CPU_SUBTYPE_POWERPC_603e:
+ return "603e";
+ case CPU_SUBTYPE_POWERPC_603ev:
+ return "603ev";
+ case CPU_SUBTYPE_POWERPC_604:
+ return "604";
+ case CPU_SUBTYPE_POWERPC_604e:
+ return "604e";
+ case CPU_SUBTYPE_POWERPC_620:
+ return "620";
+ case CPU_SUBTYPE_POWERPC_750:
+ return "750";
+ case CPU_SUBTYPE_POWERPC_7400:
+ return "7400";
+ case CPU_SUBTYPE_POWERPC_7450:
+ return "7450";
+ case CPU_SUBTYPE_POWERPC_970:
+ return "970";
+ default:;
}
-
+
return "generic";
}
#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
@@ -573,12 +983,12 @@ StringRef sys::getHostCPUName() {
++CIP;
while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
++CIP;
-
+
if (CIP < CPUInfoEnd && *CIP == ':') {
++CIP;
while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
++CIP;
-
+
if (CIP < CPUInfoEnd) {
CPUStart = CIP;
while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
@@ -600,24 +1010,25 @@ StringRef sys::getHostCPUName() {
return generic;
return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
- .Case("604e", "604e")
- .Case("604", "604")
- .Case("7400", "7400")
- .Case("7410", "7400")
- .Case("7447", "7400")
- .Case("7455", "7450")
- .Case("G4", "g4")
- .Case("POWER4", "970")
- .Case("PPC970FX", "970")
- .Case("PPC970MP", "970")
- .Case("G5", "g5")
- .Case("POWER5", "g5")
- .Case("A2", "a2")
- .Case("POWER6", "pwr6")
- .Case("POWER7", "pwr7")
- .Case("POWER8", "pwr8")
- .Case("POWER8E", "pwr8")
- .Default(generic);
+ .Case("604e", "604e")
+ .Case("604", "604")
+ .Case("7400", "7400")
+ .Case("7410", "7400")
+ .Case("7447", "7400")
+ .Case("7455", "7450")
+ .Case("G4", "g4")
+ .Case("POWER4", "970")
+ .Case("PPC970FX", "970")
+ .Case("PPC970MP", "970")
+ .Case("G5", "g5")
+ .Case("POWER5", "g5")
+ .Case("A2", "a2")
+ .Case("POWER6", "pwr6")
+ .Case("POWER7", "pwr7")
+ .Case("POWER8", "pwr8")
+ .Case("POWER8E", "pwr8")
+ .Case("POWER9", "pwr9")
+ .Default(generic);
}
#elif defined(__linux__) && defined(__arm__)
StringRef sys::getHostCPUName() {
@@ -650,18 +1061,18 @@ StringRef sys::getHostCPUName() {
// values correspond to the "Part number" in the CP15/c0 register. The
// contents are specified in the various processor manuals.
return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x926", "arm926ej-s")
- .Case("0xb02", "mpcore")
- .Case("0xb36", "arm1136j-s")
- .Case("0xb56", "arm1156t2-s")
- .Case("0xb76", "arm1176jz-s")
- .Case("0xc08", "cortex-a8")
- .Case("0xc09", "cortex-a9")
- .Case("0xc0f", "cortex-a15")
- .Case("0xc20", "cortex-m0")
- .Case("0xc23", "cortex-m3")
- .Case("0xc24", "cortex-m4")
- .Default("generic");
+ .Case("0x926", "arm926ej-s")
+ .Case("0xb02", "mpcore")
+ .Case("0xb36", "arm1136j-s")
+ .Case("0xb56", "arm1156t2-s")
+ .Case("0xb76", "arm1176jz-s")
+ .Case("0xc08", "cortex-a8")
+ .Case("0xc09", "cortex-a9")
+ .Case("0xc0f", "cortex-a15")
+ .Case("0xc20", "cortex-m0")
+ .Case("0xc23", "cortex-m3")
+ .Case("0xc24", "cortex-m4")
+ .Default("generic");
if (Implementer == "0x51") // Qualcomm Technologies, Inc.
// Look for the CPU part line.
@@ -671,8 +1082,8 @@ StringRef sys::getHostCPUName() {
// values correspond to the "Part number" in the CP15/c0 register. The
// contents are specified in the various processor manuals.
return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x06f", "krait") // APQ8064
- .Default("generic");
+ .Case("0x06f", "krait") // APQ8064
+ .Default("generic");
return "generic";
}
@@ -730,107 +1141,117 @@ StringRef sys::getHostCPUName() {
break;
}
}
-
+
return "generic";
}
#else
-StringRef sys::getHostCPUName() {
- return "generic";
-}
+StringRef sys::getHostCPUName() { return "generic"; }
#endif
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
- || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+#if defined(i386) || defined(__i386__) || defined(__x86__) || \
+ defined(_M_IX86) || defined(__x86_64__) || defined(_M_AMD64) || \
+ defined(_M_X64)
bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
union {
unsigned u[3];
- char c[12];
+ char c[12];
} text;
- if (GetX86CpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) ||
+ if (getX86CpuIDAndInfo(0, &MaxLevel, text.u + 0, text.u + 2, text.u + 1) ||
MaxLevel < 1)
return false;
- GetX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
+ getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
- Features["cmov"] = (EDX >> 15) & 1;
- Features["mmx"] = (EDX >> 23) & 1;
- Features["sse"] = (EDX >> 25) & 1;
- Features["sse2"] = (EDX >> 26) & 1;
- Features["sse3"] = (ECX >> 0) & 1;
- Features["ssse3"] = (ECX >> 9) & 1;
+ Features["cmov"] = (EDX >> 15) & 1;
+ Features["mmx"] = (EDX >> 23) & 1;
+ Features["sse"] = (EDX >> 25) & 1;
+ Features["sse2"] = (EDX >> 26) & 1;
+ Features["sse3"] = (ECX >> 0) & 1;
+ Features["ssse3"] = (ECX >> 9) & 1;
Features["sse4.1"] = (ECX >> 19) & 1;
Features["sse4.2"] = (ECX >> 20) & 1;
- Features["pclmul"] = (ECX >> 1) & 1;
- Features["cx16"] = (ECX >> 13) & 1;
- Features["movbe"] = (ECX >> 22) & 1;
+ Features["pclmul"] = (ECX >> 1) & 1;
+ Features["cx16"] = (ECX >> 13) & 1;
+ Features["movbe"] = (ECX >> 22) & 1;
Features["popcnt"] = (ECX >> 23) & 1;
- Features["aes"] = (ECX >> 25) & 1;
- Features["rdrnd"] = (ECX >> 30) & 1;
+ Features["aes"] = (ECX >> 25) & 1;
+ Features["rdrnd"] = (ECX >> 30) & 1;
// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
// indicates that the AVX registers will be saved and restored on context
// switch, then we have full AVX support.
bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) &&
- !GetX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6);
- Features["avx"] = HasAVXSave;
- Features["fma"] = HasAVXSave && (ECX >> 12) & 1;
- Features["f16c"] = HasAVXSave && (ECX >> 29) & 1;
+ !getX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6);
+ Features["avx"] = HasAVXSave;
+ Features["fma"] = HasAVXSave && (ECX >> 12) & 1;
+ Features["f16c"] = HasAVXSave && (ECX >> 29) & 1;
// Only enable XSAVE if OS has enabled support for saving YMM state.
- Features["xsave"] = HasAVXSave && (ECX >> 26) & 1;
+ Features["xsave"] = HasAVXSave && (ECX >> 26) & 1;
// AVX512 requires additional context to be saved by the OS.
bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0);
unsigned MaxExtLevel;
- GetX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
+ getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
- !GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
- Features["lzcnt"] = HasExtLeaf1 && ((ECX >> 5) & 1);
- Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1);
- Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1);
- Features["xop"] = HasExtLeaf1 && ((ECX >> 11) & 1) && HasAVXSave;
- Features["fma4"] = HasExtLeaf1 && ((ECX >> 16) & 1) && HasAVXSave;
- Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
-
- bool HasLeaf7 = MaxLevel >= 7 &&
- !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
+ !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ Features["lzcnt"] = HasExtLeaf1 && ((ECX >> 5) & 1);
+ Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1);
+ Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1);
+ Features["xop"] = HasExtLeaf1 && ((ECX >> 11) & 1) && HasAVXSave;
+ Features["fma4"] = HasExtLeaf1 && ((ECX >> 16) & 1) && HasAVXSave;
+ Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
+ Features["mwaitx"] = HasExtLeaf1 && ((ECX >> 29) & 1);
+
+ bool HasLeaf7 =
+ MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
// AVX2 is only supported if we have the OS save support from AVX.
- Features["avx2"] = HasAVXSave && HasLeaf7 && ((EBX >> 5) & 1);
-
- Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1);
- Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1);
- Features["hle"] = HasLeaf7 && ((EBX >> 4) & 1);
- Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
- Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
- Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
- Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
- Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
- // Enable protection keys
- Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
+ Features["avx2"] = HasAVXSave && HasLeaf7 && ((EBX >> 5) & 1);
+
+ Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1);
+ Features["sgx"] = HasLeaf7 && ((EBX >> 2) & 1);
+ Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1);
+ Features["hle"] = HasLeaf7 && ((EBX >> 4) & 1);
+ Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
+ Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1);
+ Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
+ Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
+ Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
+ Features["smap"] = HasLeaf7 && ((EBX >> 20) & 1);
+ Features["pcommit"] = HasLeaf7 && ((EBX >> 22) & 1);
+ Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
+ Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
+ Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
// AVX512 is only supported if the OS supports the context save for it.
- Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
+ Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
+ Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save;
Features["avx512pf"] = HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save;
Features["avx512er"] = HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save;
Features["avx512cd"] = HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save;
Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save;
Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save;
+ Features["prefetchwt1"] = HasLeaf7 && (ECX & 1);
+ Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
+ // Enable protection keys
+ Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
+
bool HasLeafD = MaxLevel >= 0xd &&
- !GetX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
+ !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
// Only enable XSAVE if OS has enabled support for saving YMM state.
Features["xsaveopt"] = HasAVXSave && HasLeafD && ((EAX >> 0) & 1);
- Features["xsavec"] = HasAVXSave && HasLeafD && ((EAX >> 1) & 1);
- Features["xsaves"] = HasAVXSave && HasLeafD && ((EAX >> 3) & 1);
+ Features["xsavec"] = HasAVXSave && HasLeafD && ((EAX >> 1) & 1);
+ Features["xsaves"] = HasAVXSave && HasLeafD && ((EAX >> 3) & 1);
return true;
}
@@ -859,31 +1280,26 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
#if defined(__aarch64__)
// Keep track of which crypto features we have seen
- enum {
- CAP_AES = 0x1,
- CAP_PMULL = 0x2,
- CAP_SHA1 = 0x4,
- CAP_SHA2 = 0x8
- };
+ enum { CAP_AES = 0x1, CAP_PMULL = 0x2, CAP_SHA1 = 0x4, CAP_SHA2 = 0x8 };
uint32_t crypto = 0;
#endif
for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
StringRef LLVMFeatureStr = StringSwitch<StringRef>(CPUFeatures[I])
#if defined(__aarch64__)
- .Case("asimd", "neon")
- .Case("fp", "fp-armv8")
- .Case("crc32", "crc")
+ .Case("asimd", "neon")
+ .Case("fp", "fp-armv8")
+ .Case("crc32", "crc")
#else
- .Case("half", "fp16")
- .Case("neon", "neon")
- .Case("vfpv3", "vfp3")
- .Case("vfpv3d16", "d16")
- .Case("vfpv4", "vfp4")
- .Case("idiva", "hwdiv-arm")
- .Case("idivt", "hwdiv")
+ .Case("half", "fp16")
+ .Case("neon", "neon")
+ .Case("vfpv3", "vfp3")
+ .Case("vfpv3d16", "d16")
+ .Case("vfpv4", "vfp4")
+ .Case("idiva", "hwdiv-arm")
+ .Case("idivt", "hwdiv")
#endif
- .Default("");
+ .Default("");
#if defined(__aarch64__)
// We need to check crypto separately since we need all of the crypto
@@ -911,9 +1327,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
return true;
}
#else
-bool sys::getHostCPUFeatures(StringMap<bool> &Features){
- return false;
-}
+bool sys::getHostCPUFeatures(StringMap<bool> &Features) { return false; }
#endif
std::string sys::getProcessTriple() {
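
The reworked x86 detection above folds the scattered Has* booleans into a single feature bitmask and only reports AVX when an XGETBV read of XCR0 confirms the OS actually saves the YMM state on context switch. A minimal standalone sketch of that gating logic, with the raw CPUID/XCR0 values passed in as plain integers (the register-reading helpers themselves are assumed and not shown here):

#include <cstdint>
#include <cstdio>

// Sketch of the AVX gating used above: CPUID leaf 1 ECX advertises
// OSXSAVE (bit 27) and AVX (bit 28); XCR0 bits 1-2 confirm the OS
// saves XMM and YMM register state on context switch.
static bool hasUsableAVX(uint32_t CpuidEcx, uint64_t Xcr0) {
  const uint32_t AVXBits = (1u << 27) | (1u << 28);
  return ((CpuidEcx & AVXBits) == AVXBits) && ((Xcr0 & 0x6) == 0x6);
}

int main() {
  // Hypothetical register values for a CPU with AVX enabled by the OS.
  uint32_t Ecx = (1u << 27) | (1u << 28) | (1u << 20); // OSXSAVE, AVX, SSE4.2
  uint64_t Xcr0 = 0x7;                                 // x87 | SSE | AVX state
  std::printf("AVX usable: %d\n", hasUsableAVX(Ecx, Xcr0));
}
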
diff --git a/lib/Support/IntEqClasses.cpp b/lib/Support/IntEqClasses.cpp
index ff213570807c..cb6e3a19e8d3 100644
--- a/lib/Support/IntEqClasses.cpp
+++ b/lib/Support/IntEqClasses.cpp
@@ -37,10 +37,15 @@ unsigned IntEqClasses::join(unsigned a, unsigned b) {
// incrementally. The larger leader will eventually be updated, joining the
// classes.
while (eca != ecb)
- if (eca < ecb)
- EC[b] = eca, b = ecb, ecb = EC[b];
- else
- EC[a] = ecb, a = eca, eca = EC[a];
+ if (eca < ecb) {
+ EC[b] = eca;
+ b = ecb;
+ ecb = EC[b];
+ } else {
+ EC[a] = ecb;
+ a = eca;
+ eca = EC[a];
+ }
return eca;
}
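
The IntEqClasses::join rewrite above only replaces comma expressions with explicit blocks; the leader-chasing loop itself is unchanged. As a reminder of what that loop does, here is a hedged, self-contained sketch of the same idea: each element stores a leader index, and join repeatedly redirects the larger leader toward the smaller one until both chains meet (the names and the std::vector storage are illustrative, not the LLVM class):

#include <cstdio>
#include <vector>

// Minimal equivalence-class join in the style of IntEqClasses::join:
// leaders are always <= their members, and the smaller leader wins.
static unsigned join(std::vector<unsigned> &EC, unsigned a, unsigned b) {
  unsigned eca = EC[a], ecb = EC[b];
  while (eca != ecb) {
    if (eca < ecb) {
      EC[b] = eca;   // point b's leader at the smaller class
      b = ecb;
      ecb = EC[b];
    } else {
      EC[a] = ecb;
      a = eca;
      eca = EC[a];
    }
  }
  return eca;
}

int main() {
  std::vector<unsigned> EC = {0, 1, 2, 3, 4}; // five singleton classes
  join(EC, 1, 3);
  join(EC, 3, 4);
  std::printf("leader of 4: %u\n", EC[4]); // prints 1
}
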
diff --git a/lib/Support/JamCRC.cpp b/lib/Support/JamCRC.cpp
index bc21c917d55b..17c55f565e08 100644
--- a/lib/Support/JamCRC.cpp
+++ b/lib/Support/JamCRC.cpp
@@ -18,6 +18,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/JamCRC.h"
+#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
diff --git a/lib/Support/Locale.cpp b/lib/Support/Locale.cpp
index 53bc0e36d830..e24a28be4306 100644
--- a/lib/Support/Locale.cpp
+++ b/lib/Support/Locale.cpp
@@ -1,5 +1,6 @@
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Locale.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Unicode.h"
namespace llvm {
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index fb81d60f1837..611f94a9709b 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -144,7 +144,9 @@ LockFileManager::LockFileManager(StringRef FileName)
{
this->FileName = FileName;
if (std::error_code EC = sys::fs::make_absolute(this->FileName)) {
- Error = EC;
+ std::string S("failed to obtain absolute path for ");
+ S.append(this->FileName.str());
+ setError(EC, S);
return;
}
LockFileName = this->FileName;
@@ -161,7 +163,9 @@ LockFileManager::LockFileManager(StringRef FileName)
int UniqueLockFileID;
if (std::error_code EC = sys::fs::createUniqueFile(
UniqueLockFileName, UniqueLockFileID, UniqueLockFileName)) {
- Error = EC;
+ std::string S("failed to create unique file ");
+ S.append(UniqueLockFileName.str());
+ setError(EC, S);
return;
}
@@ -169,7 +173,7 @@ LockFileManager::LockFileManager(StringRef FileName)
{
SmallString<256> HostID;
if (auto EC = getHostID(HostID)) {
- Error = EC;
+ setError(EC, "failed to get host id");
return;
}
@@ -185,7 +189,10 @@ LockFileManager::LockFileManager(StringRef FileName)
if (Out.has_error()) {
// We failed to write out PID, so make up an excuse, remove the
// unique lock file, and fail.
- Error = make_error_code(errc::no_space_on_device);
+ auto EC = make_error_code(errc::no_space_on_device);
+ std::string S("failed to write to ");
+ S.append(UniqueLockFileName.str());
+ setError(EC, S);
sys::fs::remove(UniqueLockFileName);
return;
}
@@ -205,7 +212,10 @@ LockFileManager::LockFileManager(StringRef FileName)
}
if (EC != errc::file_exists) {
- Error = EC;
+ std::string S("failed to create link ");
+ raw_string_ostream OSS(S);
+ OSS << LockFileName.str() << " to " << UniqueLockFileName.str();
+ setError(EC, OSS.str());
return;
}
@@ -226,7 +236,9 @@ LockFileManager::LockFileManager(StringRef FileName)
// There is a lock file that nobody owns; try to clean it up and get
// ownership.
if ((EC = sys::fs::remove(LockFileName))) {
- Error = EC;
+ std::string S("failed to remove lockfile ");
+ S.append(UniqueLockFileName.str());
+ setError(EC, S);
return;
}
}
@@ -242,6 +254,19 @@ LockFileManager::LockFileState LockFileManager::getState() const {
return LFS_Owned;
}
+std::string LockFileManager::getErrorMessage() const {
+ if (Error) {
+ std::string Str(ErrorDiagMsg);
+ std::string ErrCodeMsg = Error->message();
+ raw_string_ostream OSS(Str);
+ if (!ErrCodeMsg.empty())
+ OSS << ": " << Error->message();
+ OSS.flush();
+ return Str;
+ }
+ return "";
+}
+
LockFileManager::~LockFileManager() {
if (getState() != LFS_Owned)
return;
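
The LockFileManager changes above stop storing a bare std::error_code and instead pair it with a human-readable context string via setError, which getErrorMessage later joins into one diagnostic. A small sketch of that pattern with plain standard-library types (setError/getErrorMessage are the LLVM names; this standalone struct and the sample path are invented for illustration):

#include <cstdio>
#include <string>
#include <system_error>

// Pair an error code with the operation that failed, so callers can
// report "what we were doing" as well as "what went wrong".
struct ContextualError {
  std::error_code EC;
  std::string Context;

  std::string message() const {
    if (!EC)
      return "";
    std::string Msg = Context;
    if (!EC.message().empty())
      Msg += ": " + EC.message();
    return Msg;
  }
};

int main() {
  ContextualError E{std::make_error_code(std::errc::no_space_on_device),
                    "failed to write to example.lock-tmp"};
  std::printf("%s\n", E.message().c_str());
}
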
diff --git a/lib/Support/Makefile b/lib/Support/Makefile
deleted file mode 100644
index 39426aaaacee..000000000000
--- a/lib/Support/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Support/Makefile --------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMSupport
-BUILD_ARCHIVE = 1
-
-EXTRA_DIST = Unix Win32 README.txt
-
-include $(LEVEL)/Makefile.common
-
-CompileCommonOpts := $(filter-out -pedantic,$(CompileCommonOpts))
-CompileCommonOpts := $(filter-out -Wno-long-long,$(CompileCommonOpts))
-
-ifdef LLVM_VERSION_INFO
-CompileCommonOpts += -DLLVM_VERSION_INFO='"$(LLVM_VERSION_INFO)"'
-endif
diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp
index 9868207b14ff..7dd31315f90d 100644
--- a/lib/Support/ManagedStatic.cpp
+++ b/lib/Support/ManagedStatic.cpp
@@ -13,20 +13,25 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Config/config.h"
-#include "llvm/Support/Atomic.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Threading.h"
#include <cassert>
using namespace llvm;
static const ManagedStaticBase *StaticList = nullptr;
+static sys::Mutex *ManagedStaticMutex = nullptr;
+LLVM_DEFINE_ONCE_FLAG(mutex_init_flag);
-static sys::Mutex& getManagedStaticMutex() {
+static void initializeMutex() {
+ ManagedStaticMutex = new sys::Mutex();
+}
+
+static sys::Mutex* getManagedStaticMutex() {
// We need to use a function local static here, since this can get called
// during a static constructor and we need to guarantee that it's initialized
// correctly.
- static sys::Mutex ManagedStaticMutex;
+ llvm::call_once(mutex_init_flag, initializeMutex);
return ManagedStaticMutex;
}
@@ -34,20 +39,12 @@ void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(),
void (*Deleter)(void*)) const {
assert(Creator);
if (llvm_is_multithreaded()) {
- MutexGuard Lock(getManagedStaticMutex());
-
- if (!Ptr) {
- void* tmp = Creator();
+ MutexGuard Lock(*getManagedStaticMutex());
- TsanHappensBefore(this);
- sys::MemoryFence();
+ if (!Ptr.load(std::memory_order_relaxed)) {
+ void *Tmp = Creator();
- // This write is racy against the first read in the ManagedStatic
- // accessors. The race is benign because it does a second read after a
- // memory fence, at which point it isn't possible to get a partial value.
- TsanIgnoreWritesBegin();
- Ptr = tmp;
- TsanIgnoreWritesEnd();
+ Ptr.store(Tmp, std::memory_order_release);
DeleterFn = Deleter;
// Add to list of managed statics.
@@ -84,7 +81,7 @@ void ManagedStaticBase::destroy() const {
/// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
void llvm::llvm_shutdown() {
- MutexGuard Lock(getManagedStaticMutex());
+ MutexGuard Lock(*getManagedStaticMutex());
while (StaticList)
StaticList->destroy();
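
The ManagedStatic rework above replaces the function-local static mutex and the TSan-annotated racy publish with llvm::call_once plus an atomic pointer that is published with a release store under the lock. A hedged standard-library sketch of the general lazy-init-with-double-check pattern (std::once_flag and std::atomic stand in for the LLVM wrappers; the acquire-load fast path is my assumption and does not appear in the hunk):

#include <atomic>
#include <cstdio>
#include <mutex>

namespace {
std::once_flag MutexInitFlag;
std::mutex *LazyMutex = nullptr;

std::mutex *getLazyMutex() {
  // Initialize exactly once, even if first called from a static constructor.
  std::call_once(MutexInitFlag, [] { LazyMutex = new std::mutex(); });
  return LazyMutex;
}

std::atomic<int *> Ptr{nullptr};

int *getOrCreate() {
  // Fast path: the object has already been published.
  if (int *P = Ptr.load(std::memory_order_acquire))
    return P;
  std::lock_guard<std::mutex> Lock(*getLazyMutex());
  // Re-check under the lock; another thread may have won the race.
  if (!Ptr.load(std::memory_order_relaxed))
    Ptr.store(new int(42), std::memory_order_release);
  return Ptr.load(std::memory_order_relaxed);
}
} // namespace

int main() { std::printf("%d\n", *getOrCreate()); }
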
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index faee10bb07cf..b935cbf1ae7a 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -86,6 +86,10 @@ public:
init(InputData.begin(), InputData.end(), RequiresNullTerminator);
}
+ /// Disable sized deallocation for MemoryBufferMem, because it has
+ /// tail-allocated data.
+ void operator delete(void *p) { ::operator delete(p); }
+
const char *getBufferIdentifier() const override {
// The name is stored after the class itself.
return reinterpret_cast<const char*>(this + 1);
@@ -135,7 +139,7 @@ MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) {
SmallString<256> NameBuf;
StringRef NameRef = BufferName.toStringRef(NameBuf);
size_t AlignedStringLen =
- RoundUpToAlignment(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16);
+ alignTo(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16);
size_t RealLen = AlignedStringLen + Size + 1;
char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
if (!Mem)
@@ -213,6 +217,10 @@ public:
}
}
+ /// Disable sized deallocation for MemoryBufferMMapFile, because it has
+ /// tail-allocated data.
+ void operator delete(void *p) { ::operator delete(p); }
+
const char *getBufferIdentifier() const override {
// The name is stored after the class itself.
return reinterpret_cast<const char *>(this + 1);
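
Both MemoryBuffer subclasses above gain a plain operator delete(void *) because they carry tail-allocated data: each object is created by a single oversized operator new, so the C++14 sized-deallocation overload (which would pass sizeof(T), not the real allocation size) must never be selected. A small self-contained sketch of the pattern, with a made-up Named type standing in for the buffer classes:

#include <cstdio>
#include <cstring>
#include <new>
#include <string>

// A type whose name is stored immediately after the object itself.
struct Named {
  // Tail allocation means the allocation size is unknown to the compiler,
  // so force the unsized global operator delete.
  void operator delete(void *P) { ::operator delete(P); }

  const char *name() const {
    return reinterpret_cast<const char *>(this + 1);
  }

  static Named *create(const std::string &Name) {
    void *Mem = ::operator new(sizeof(Named) + Name.size() + 1);
    Named *N = new (Mem) Named();
    std::memcpy(reinterpret_cast<char *>(N + 1), Name.c_str(), Name.size() + 1);
    return N;
  }
};

int main() {
  Named *N = Named::create("tail-allocated identifier");
  std::printf("%s\n", N->name());
  delete N; // calls Named::operator delete, not a sized overload
}
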
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 4952f59fc24d..f6355d1df805 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/COFF.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
@@ -352,6 +353,10 @@ bool reverse_iterator::operator==(const reverse_iterator &RHS) const {
Position == RHS.Position;
}
+ptrdiff_t reverse_iterator::operator-(const reverse_iterator &RHS) const {
+ return Position - RHS.Position;
+}
+
StringRef root_path(StringRef path) {
const_iterator b = begin(path),
pos = b,
@@ -517,6 +522,29 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
path.append(ext.begin(), ext.end());
}
+void replace_path_prefix(SmallVectorImpl<char> &Path,
+ const StringRef &OldPrefix,
+ const StringRef &NewPrefix) {
+ if (OldPrefix.empty() && NewPrefix.empty())
+ return;
+
+ StringRef OrigPath(Path.begin(), Path.size());
+ if (!OrigPath.startswith(OldPrefix))
+ return;
+
+ // If prefixes have the same size we can simply copy the new one over.
+ if (OldPrefix.size() == NewPrefix.size()) {
+ std::copy(NewPrefix.begin(), NewPrefix.end(), Path.begin());
+ return;
+ }
+
+ StringRef RelPath = OrigPath.substr(OldPrefix.size());
+ SmallString<256> NewPath;
+ path::append(NewPath, NewPrefix);
+ path::append(NewPath, RelPath);
+ Path.swap(NewPath);
+}
+
void native(const Twine &path, SmallVectorImpl<char> &result) {
assert((!path.isSingleStringRef() ||
path.getSingleStringRef().data() != result.data()) &&
@@ -1021,7 +1049,7 @@ file_magic identify_magic(StringRef Magic) {
case 0xCA:
if (Magic[1] == char(0xFE) && Magic[2] == char(0xBA) &&
- Magic[3] == char(0xBE)) {
+ (Magic[3] == char(0xBE) || Magic[3] == char(0xBF))) {
// This is complicated by an overlap with Java class files.
// See the Mach-O section in /usr/share/file/magic for details.
if (Magic.size() >= 8 && Magic[7] < 43)
@@ -1040,12 +1068,24 @@ file_magic identify_magic(StringRef Magic) {
Magic[2] == char(0xFA) &&
(Magic[3] == char(0xCE) || Magic[3] == char(0xCF))) {
/* Native endian */
- if (Magic.size() >= 16) type = Magic[14] << 8 | Magic[15];
+ size_t MinSize;
+ if (Magic[3] == char(0xCE))
+ MinSize = sizeof(MachO::mach_header);
+ else
+ MinSize = sizeof(MachO::mach_header_64);
+ if (Magic.size() >= MinSize)
+        type = Magic[12] << 24 | Magic[13] << 16 | Magic[14] << 8 | Magic[15];
} else if ((Magic[0] == char(0xCE) || Magic[0] == char(0xCF)) &&
Magic[1] == char(0xFA) && Magic[2] == char(0xED) &&
Magic[3] == char(0xFE)) {
/* Reverse endian */
- if (Magic.size() >= 14) type = Magic[13] << 8 | Magic[12];
+ size_t MinSize;
+ if (Magic[0] == char(0xCE))
+ MinSize = sizeof(MachO::mach_header);
+ else
+ MinSize = sizeof(MachO::mach_header_64);
+ if (Magic.size() >= MinSize)
+        type = Magic[15] << 24 | Magic[14] << 16 | Magic[13] << 8 | Magic[12];
}
switch (type) {
default: break;
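
The identify_magic hunk above assembles the Mach-O CPU type from four header bytes once the buffer is known to be at least a full mach_header. The shift pattern for reading a 32-bit big-endian field is worth spelling out, since a wrong shift silently misclassifies the file; a purely illustrative helper:

#include <cstdint>
#include <cstdio>

// Read a 32-bit big-endian value from a byte buffer: shifts of 24, 16,
// 8 and 0, one per byte.
static uint32_t readBE32(const unsigned char *B) {
  return (uint32_t(B[0]) << 24) | (uint32_t(B[1]) << 16) |
         (uint32_t(B[2]) << 8) | uint32_t(B[3]);
}

int main() {
  const unsigned char Hdr[] = {0x01, 0x00, 0x00, 0x07}; // sample bytes
  std::printf("0x%08x\n", readBE32(Hdr));               // prints 0x01000007
}
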
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 05b3e31644bd..ebad67bb8dce 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -21,6 +21,8 @@
#include "llvm/Support/Watchdog.h"
#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+
#ifdef HAVE_CRASHREPORTERCLIENT_H
#include <CrashReporterClient.h>
#endif
@@ -36,20 +38,32 @@ using namespace llvm;
// objects, but we *really* cannot tolerate destructors running and do not want
// to pay any overhead of synchronizing. As a consequence, we use a raw
// thread-local variable.
-static LLVM_THREAD_LOCAL const PrettyStackTraceEntry *PrettyStackTraceHead =
- nullptr;
-
-static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
- unsigned NextID = 0;
- if (Entry->getNextEntry())
- NextID = PrintStack(Entry->getNextEntry(), OS);
- OS << NextID << ".\t";
- {
+static LLVM_THREAD_LOCAL PrettyStackTraceEntry *PrettyStackTraceHead = nullptr;
+
+namespace llvm {
+PrettyStackTraceEntry *ReverseStackTrace(PrettyStackTraceEntry *Head) {
+ PrettyStackTraceEntry *Prev = nullptr;
+ while (Head)
+ std::tie(Prev, Head, Head->NextEntry) =
+ std::make_tuple(Head, Head->NextEntry, Prev);
+ return Prev;
+}
+}
+
+static void PrintStack(raw_ostream &OS) {
+ // Print out the stack in reverse order. To avoid recursion (which is likely
+ // to fail if we crashed due to stack overflow), we do an up-front pass to
+ // reverse the stack, then print it, then reverse it again.
+ unsigned ID = 0;
+ PrettyStackTraceEntry *ReversedStack =
+ llvm::ReverseStackTrace(PrettyStackTraceHead);
+ for (const PrettyStackTraceEntry *Entry = ReversedStack; Entry;
+ Entry = Entry->getNextEntry()) {
+ OS << ID++ << ".\t";
sys::Watchdog W(5);
Entry->print(OS);
}
-
- return NextID+1;
+ llvm::ReverseStackTrace(ReversedStack);
}
/// PrintCurStackTrace - Print the current stack trace to the specified stream.
@@ -60,7 +74,7 @@ static void PrintCurStackTrace(raw_ostream &OS) {
// If there are pretty stack frames registered, walk and emit them.
OS << "Stack dump:\n";
- PrintStack(PrettyStackTraceHead, OS);
+ PrintStack(OS);
OS.flush();
}
@@ -123,7 +137,7 @@ PrettyStackTraceEntry::~PrettyStackTraceEntry() {
#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
assert(PrettyStackTraceHead == this &&
"Pretty stack trace entry destruction is out of order");
- PrettyStackTraceHead = getNextEntry();
+ PrettyStackTraceHead = NextEntry;
#endif
}
@@ -154,7 +168,7 @@ void llvm::EnablePrettyStackTrace() {
#endif
}
-const void* llvm::SavePrettyStackState() {
+const void *llvm::SavePrettyStackState() {
#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
return PrettyStackTraceHead;
#else
@@ -162,9 +176,10 @@ const void* llvm::SavePrettyStackState() {
#endif
}
-void llvm::RestorePrettyStackState(const void* Top) {
+void llvm::RestorePrettyStackState(const void *Top) {
#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
- PrettyStackTraceHead = (const PrettyStackTraceEntry*)Top;
+ PrettyStackTraceHead =
+ static_cast<PrettyStackTraceEntry *>(const_cast<void *>(Top));
#endif
}
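
The PrettyStackTrace change above replaces the recursive PrintStack with an explicit reverse-print-reverse pass over the intrusive list, since recursing is exactly the wrong move when the process may already have overflowed its stack. A minimal sketch of that three-step idea on a toy singly linked list (the node type and names are illustrative, not the LLVM entries):

#include <cstdio>
#include <tuple>

struct Entry {
  int ID;
  Entry *Next;
};

// In-place reversal, using the same std::tie rotation as the code above.
static Entry *reverseList(Entry *Head) {
  Entry *Prev = nullptr;
  while (Head)
    std::tie(Prev, Head, Head->Next) = std::make_tuple(Head, Head->Next, Prev);
  return Prev;
}

int main() {
  // Entries as pushed: 0 first, 2 most recent, so the list head is 2 -> 1 -> 0.
  Entry C{0, nullptr}, B{1, &C}, A{2, &B};
  Entry *Head = &A;

  // Reverse, print oldest-first without recursion, then restore the order.
  Entry *Reversed = reverseList(Head);
  for (Entry *E = Reversed; E; E = E->Next)
    std::printf("%d.\n", E->ID); // prints 0., 1., 2.
  reverseList(Reversed);
}
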
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 6dcbb47e87d0..290c30f4968f 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -73,6 +73,13 @@ static const char colorcodes[2][2][8][10] = {
{ ALLCOLORS("4",""), ALLCOLORS("4","1;") }
};
+// This is set to true when Process::PreventCoreFiles() is called.
+static bool coreFilesPrevented = false;
+
+bool Process::AreCoreFilesPrevented() {
+ return coreFilesPrevented;
+}
+
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Process.inc"
diff --git a/lib/Support/SHA1.cpp b/lib/Support/SHA1.cpp
new file mode 100644
index 000000000000..a461d1ebc93d
--- /dev/null
+++ b/lib/Support/SHA1.cpp
@@ -0,0 +1,170 @@
+//======- SHA1.h - Private copy of the SHA1 implementation ---*- C++ -* ======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This code is taken from public domain
+// (http://oauth.googlecode.com/svn/code/c/liboauth/src/sha1.c)
+// and modified by wrapping it in a C++ interface for LLVM,
+// and removing unnecessary code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SHA1.h"
+#include "llvm/ADT/ArrayRef.h"
+using namespace llvm;
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
+#define SHA_BIG_ENDIAN
+#endif
+
+/* code */
+#define SHA1_K0 0x5a827999
+#define SHA1_K20 0x6ed9eba1
+#define SHA1_K40 0x8f1bbcdc
+#define SHA1_K60 0xca62c1d6
+
+#define SEED_0 0x67452301
+#define SEED_1 0xefcdab89
+#define SEED_2 0x98badcfe
+#define SEED_3 0x10325476
+#define SEED_4 0xc3d2e1f0
+
+void SHA1::init() {
+ InternalState.State[0] = SEED_0;
+ InternalState.State[1] = SEED_1;
+ InternalState.State[2] = SEED_2;
+ InternalState.State[3] = SEED_3;
+ InternalState.State[4] = SEED_4;
+ InternalState.ByteCount = 0;
+ InternalState.BufferOffset = 0;
+}
+
+static uint32_t rol32(uint32_t number, uint8_t bits) {
+ return ((number << bits) | (number >> (32 - bits)));
+}
+
+void SHA1::hashBlock() {
+ uint8_t i;
+ uint32_t a, b, c, d, e, t;
+
+ a = InternalState.State[0];
+ b = InternalState.State[1];
+ c = InternalState.State[2];
+ d = InternalState.State[3];
+ e = InternalState.State[4];
+ for (i = 0; i < 80; i++) {
+ if (i >= 16) {
+ t = InternalState.Buffer[(i + 13) & 15] ^
+ InternalState.Buffer[(i + 8) & 15] ^
+ InternalState.Buffer[(i + 2) & 15] ^ InternalState.Buffer[i & 15];
+ InternalState.Buffer[i & 15] = rol32(t, 1);
+ }
+ if (i < 20) {
+ t = (d ^ (b & (c ^ d))) + SHA1_K0;
+ } else if (i < 40) {
+ t = (b ^ c ^ d) + SHA1_K20;
+ } else if (i < 60) {
+ t = ((b & c) | (d & (b | c))) + SHA1_K40;
+ } else {
+ t = (b ^ c ^ d) + SHA1_K60;
+ }
+ t += rol32(a, 5) + e + InternalState.Buffer[i & 15];
+ e = d;
+ d = c;
+ c = rol32(b, 30);
+ b = a;
+ a = t;
+ }
+ InternalState.State[0] += a;
+ InternalState.State[1] += b;
+ InternalState.State[2] += c;
+ InternalState.State[3] += d;
+ InternalState.State[4] += e;
+}
+
+void SHA1::addUncounted(uint8_t data) {
+ uint8_t *const b = (uint8_t *)InternalState.Buffer;
+#ifdef SHA_BIG_ENDIAN
+ b[InternalState.BufferOffset] = data;
+#else
+ b[InternalState.BufferOffset ^ 3] = data;
+#endif
+ InternalState.BufferOffset++;
+ if (InternalState.BufferOffset == BLOCK_LENGTH) {
+ hashBlock();
+ InternalState.BufferOffset = 0;
+ }
+}
+
+void SHA1::writebyte(uint8_t data) {
+ ++InternalState.ByteCount;
+ addUncounted(data);
+}
+
+void SHA1::update(ArrayRef<uint8_t> Data) {
+ for (auto &C : Data)
+ writebyte(C);
+}
+
+void SHA1::pad() {
+ // Implement SHA-1 padding (fips180-2 5.1.1)
+
+ // Pad with 0x80 followed by 0x00 until the end of the block
+ addUncounted(0x80);
+ while (InternalState.BufferOffset != 56)
+ addUncounted(0x00);
+
+ // Append length in the last 8 bytes
+ addUncounted(0); // We're only using 32 bit lengths
+ addUncounted(0); // But SHA-1 supports 64 bit lengths
+ addUncounted(0); // So zero pad the top bits
+ addUncounted(InternalState.ByteCount >> 29); // Shifting to multiply by 8
+ addUncounted(InternalState.ByteCount >>
+ 21); // as SHA-1 supports bitstreams as well as
+ addUncounted(InternalState.ByteCount >> 13); // byte.
+ addUncounted(InternalState.ByteCount >> 5);
+ addUncounted(InternalState.ByteCount << 3);
+}
+
+StringRef SHA1::final() {
+ // Pad to complete the last block
+ pad();
+
+#ifdef SHA_BIG_ENDIAN
+ // Just copy the current state
+ for (int i = 0; i < 5; i++) {
+ HashResult[i] = InternalState.State[i];
+ }
+#else
+ // Swap byte order back
+ for (int i = 0; i < 5; i++) {
+ HashResult[i] = (((InternalState.State[i]) << 24) & 0xff000000) |
+ (((InternalState.State[i]) << 8) & 0x00ff0000) |
+ (((InternalState.State[i]) >> 8) & 0x0000ff00) |
+ (((InternalState.State[i]) >> 24) & 0x000000ff);
+ }
+#endif
+
+ // Return pointer to hash (20 characters)
+ return StringRef((char *)HashResult, HASH_LENGTH);
+}
+
+StringRef SHA1::result() {
+ auto StateToRestore = InternalState;
+
+ auto Hash = final();
+
+ // Restore the state
+ InternalState = StateToRestore;
+
+ // Return pointer to hash (20 characters)
+ return Hash;
+}
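
SHA1::result() above computes an intermediate digest by snapshotting the internal state, running the ordinary pad()/final() path, and then restoring the snapshot so hashing can continue. The same snapshot-and-restore idea on a trivial running checksum, just to make the control flow concrete (the Checksum type and its arithmetic are invented for illustration):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Toy incremental checksum with a destructive finalization step,
// mirroring the snapshot/restore trick used by SHA1::result().
struct Checksum {
  uint32_t State = 0;
  uint64_t ByteCount = 0;

  void update(const uint8_t *Data, size_t Len) {
    for (size_t I = 0; I < Len; ++I) {
      State = (State * 31) + Data[I];
      ++ByteCount;
    }
  }

  // Destructive finalization: folds the length into the state.
  uint32_t final() {
    State ^= static_cast<uint32_t>(ByteCount);
    return State;
  }

  // Non-destructive intermediate result: snapshot, finalize, restore.
  uint32_t result() {
    Checksum Saved = *this;
    uint32_t Digest = final();
    *this = Saved;
    return Digest;
  }
};

int main() {
  const uint8_t Msg[] = {'a', 'b', 'c'};
  Checksum C;
  C.update(Msg, sizeof(Msg));
  std::printf("intermediate: %08x\n", C.result());
  C.update(Msg, sizeof(Msg)); // hashing can continue after result()
  std::printf("final:        %08x\n", C.final());
}
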
diff --git a/lib/Support/ScaledNumber.cpp b/lib/Support/ScaledNumber.cpp
index 987c2d803b7e..b9432d46f9bb 100644
--- a/lib/Support/ScaledNumber.cpp
+++ b/lib/Support/ScaledNumber.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/ScaledNumber.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Support/ScopedPrinter.cpp b/lib/Support/ScopedPrinter.cpp
new file mode 100644
index 000000000000..0225f01156a9
--- /dev/null
+++ b/lib/Support/ScopedPrinter.cpp
@@ -0,0 +1,72 @@
+#include "llvm/Support/ScopedPrinter.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include <cctype>
+
+using namespace llvm::support;
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const HexNumber &Value) {
+ OS << "0x" << to_hexString(Value.Value);
+ return OS;
+}
+
+const std::string to_hexString(uint64_t Value, bool UpperCase) {
+ std::string number;
+ llvm::raw_string_ostream stream(number);
+ stream << format_hex_no_prefix(Value, 1, UpperCase);
+ return stream.str();
+}
+
+void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str,
+ ArrayRef<uint8_t> Data, bool Block) {
+ if (Data.size() > 16)
+ Block = true;
+
+ if (Block) {
+ startLine() << Label;
+ if (Str.size() > 0)
+ OS << ": " << Str;
+ OS << " (\n";
+ for (size_t addr = 0, end = Data.size(); addr < end; addr += 16) {
+ startLine() << format(" %04" PRIX64 ": ", uint64_t(addr));
+ // Dump line of hex.
+ for (size_t i = 0; i < 16; ++i) {
+ if (i != 0 && i % 4 == 0)
+ OS << ' ';
+ if (addr + i < end)
+ OS << hexdigit((Data[addr + i] >> 4) & 0xF, false)
+ << hexdigit(Data[addr + i] & 0xF, false);
+ else
+ OS << " ";
+ }
+ // Print ascii.
+ OS << " |";
+ for (std::size_t i = 0; i < 16 && addr + i < end; ++i) {
+ if (std::isprint(Data[addr + i] & 0xFF))
+ OS << Data[addr + i];
+ else
+ OS << ".";
+ }
+ OS << "|\n";
+ }
+
+ startLine() << ")\n";
+ } else {
+ startLine() << Label << ":";
+ if (Str.size() > 0)
+ OS << " " << Str;
+ OS << " (";
+ for (size_t i = 0; i < Data.size(); ++i) {
+ if (i > 0)
+ OS << " ";
+
+ OS << format("%02X", static_cast<int>(Data[i]));
+ }
+ OS << ")\n";
+ }
+}
+
+} // namespace llvm
diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp
index 3dc6b7c99d01..e5e38f59c040 100644
--- a/lib/Support/Signals.cpp
+++ b/lib/Support/Signals.cpp
@@ -62,28 +62,40 @@ static FormattedNumber format_ptr(void *PC) {
return format_hex((uint64_t)PC, PtrWidth);
}
-static bool printSymbolizedStackTrace(void **StackTrace, int Depth,
+static bool printSymbolizedStackTrace(StringRef Argv0,
+ void **StackTrace, int Depth,
llvm::raw_ostream &OS)
LLVM_ATTRIBUTE_USED;
/// Helper that launches llvm-symbolizer and symbolizes a backtrace.
-static bool printSymbolizedStackTrace(void **StackTrace, int Depth,
+static bool printSymbolizedStackTrace(StringRef Argv0,
+ void **StackTrace, int Depth,
llvm::raw_ostream &OS) {
+ // Don't recursively invoke the llvm-symbolizer binary.
+ if (Argv0.find("llvm-symbolizer") != std::string::npos)
+ return false;
+
// FIXME: Subtract necessary number from StackTrace entries to turn return addresses
// into actual instruction addresses.
- // Use llvm-symbolizer tool to symbolize the stack traces.
- ErrorOr<std::string> LLVMSymbolizerPathOrErr =
- sys::findProgramByName("llvm-symbolizer");
+ // Use llvm-symbolizer tool to symbolize the stack traces. First look for it
+ // alongside our binary, then in $PATH.
+ ErrorOr<std::string> LLVMSymbolizerPathOrErr = std::error_code();
+ if (!Argv0.empty()) {
+ StringRef Parent = llvm::sys::path::parent_path(Argv0);
+ if (!Parent.empty())
+ LLVMSymbolizerPathOrErr = sys::findProgramByName("llvm-symbolizer", Parent);
+ }
+ if (!LLVMSymbolizerPathOrErr)
+ LLVMSymbolizerPathOrErr = sys::findProgramByName("llvm-symbolizer");
if (!LLVMSymbolizerPathOrErr)
return false;
const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr;
- // We don't know argv0 or the address of main() at this point, but try
- // to guess it anyway (it's possible on some platforms).
- std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr);
- if (MainExecutableName.empty() ||
- MainExecutableName.find("llvm-symbolizer") != std::string::npos)
- return false;
+ // If we don't know argv0 or the address of main() at this point, try
+ // to guess it anyway (it's possible on some platforms).
+ std::string MainExecutableName =
+ Argv0.empty() ? sys::fs::getMainExecutable(nullptr, nullptr)
+ : (std::string)Argv0;
BumpPtrAllocator Allocator;
StringSaver StrPool(Allocator);
std::vector<const char *> Modules(Depth, nullptr);
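
The Signals change above threads Argv0 through so the crash handler first looks for llvm-symbolizer next to the running binary, falls back to $PATH, and refuses to symbolize from inside llvm-symbolizer itself. A rough standard-library sketch of that search order (std::filesystem replaces the LLVM path/program utilities; only the tool name comes from the diff):

#include <cstdio>
#include <filesystem>
#include <optional>
#include <string>

namespace fs = std::filesystem;

// Prefer a helper tool shipped next to our own executable; otherwise
// return a bare name to be resolved against PATH by whoever launches it.
static std::optional<fs::path> findSymbolizer(const std::string &Argv0) {
  const std::string Tool = "llvm-symbolizer";

  // Don't let the symbolizer try to symbolize its own crashes.
  if (Argv0.find(Tool) != std::string::npos)
    return std::nullopt;

  if (!Argv0.empty()) {
    fs::path Beside = fs::path(Argv0).parent_path() / Tool;
    std::error_code EC;
    if (fs::exists(Beside, EC))
      return Beside;
  }
  return fs::path(Tool);
}

int main(int, char **argv) {
  if (auto P = findSymbolizer(argv[0]))
    std::printf("would use: %s\n", P->string().c_str());
}
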
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index 358c8e8abbe9..539b4eb34da1 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp
@@ -25,8 +25,9 @@ void SmallPtrSetImplBase::shrink_and_clear() {
free(CurArray);
// Reduce the number of buckets.
- CurArraySize = NumElements > 16 ? 1 << (Log2_32_Ceil(NumElements) + 1) : 32;
- NumElements = NumTombstones = 0;
+ unsigned Size = size();
+ CurArraySize = Size > 16 ? 1 << (Log2_32_Ceil(Size) + 1) : 32;
+ NumNonEmpty = NumTombstones = 0;
// Install the new array. Clear all the buckets to empty.
CurArray = (const void**)malloc(sizeof(void*) * CurArraySize);
@@ -35,32 +36,16 @@ void SmallPtrSetImplBase::shrink_and_clear() {
}
std::pair<const void *const *, bool>
-SmallPtrSetImplBase::insert_imp(const void *Ptr) {
- if (isSmall()) {
- // Check to see if it is already in the set.
- for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
- APtr != E; ++APtr)
- if (*APtr == Ptr)
- return std::make_pair(APtr, false);
-
- // Nope, there isn't. If we stay small, just 'pushback' now.
- if (NumElements < CurArraySize) {
- SmallArray[NumElements++] = Ptr;
- return std::make_pair(SmallArray + (NumElements - 1), true);
- }
- // Otherwise, hit the big set case, which will call grow.
- }
-
- if (LLVM_UNLIKELY(NumElements * 4 >= CurArraySize * 3)) {
+SmallPtrSetImplBase::insert_imp_big(const void *Ptr) {
+ if (LLVM_UNLIKELY(size() * 4 >= CurArraySize * 3)) {
// If more than 3/4 of the array is full, grow.
- Grow(CurArraySize < 64 ? 128 : CurArraySize*2);
- } else if (LLVM_UNLIKELY(CurArraySize - (NumElements + NumTombstones) <
- CurArraySize / 8)) {
+ Grow(CurArraySize < 64 ? 128 : CurArraySize * 2);
+ } else if (LLVM_UNLIKELY(CurArraySize - NumNonEmpty < CurArraySize / 8)) {
// If fewer of 1/8 of the array is empty (meaning that many are filled with
// tombstones), rehash.
Grow(CurArraySize);
}
-
+
// Okay, we know we have space. Find a hash bucket.
const void **Bucket = const_cast<const void**>(FindBucketFor(Ptr));
if (*Bucket == Ptr)
@@ -69,34 +54,33 @@ SmallPtrSetImplBase::insert_imp(const void *Ptr) {
// Otherwise, insert it!
if (*Bucket == getTombstoneMarker())
--NumTombstones;
+ else
+ ++NumNonEmpty; // Track density.
*Bucket = Ptr;
- ++NumElements; // Track density.
return std::make_pair(Bucket, true);
}
bool SmallPtrSetImplBase::erase_imp(const void * Ptr) {
if (isSmall()) {
// Check to see if it is in the set.
- for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
- APtr != E; ++APtr)
+ for (const void **APtr = CurArray, **E = CurArray + NumNonEmpty; APtr != E;
+ ++APtr)
if (*APtr == Ptr) {
// If it is in the set, replace this element.
- *APtr = E[-1];
- E[-1] = getEmptyMarker();
- --NumElements;
+ *APtr = getTombstoneMarker();
+ ++NumTombstones;
return true;
}
-
+
return false;
}
-
+
// Okay, we know we have space. Find a hash bucket.
void **Bucket = const_cast<void**>(FindBucketFor(Ptr));
if (*Bucket != Ptr) return false; // Not in the set?
// Set this as a tombstone.
*Bucket = getTombstoneMarker();
- --NumElements;
++NumTombstones;
return true;
}
@@ -122,7 +106,7 @@ const void * const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const {
// prefer to return it than something that would require more probing.
if (Array[Bucket] == getTombstoneMarker() && !Tombstone)
Tombstone = Array+Bucket; // Remember the first tombstone found.
-
+
// It's a hash collision or a tombstone. Reprobe.
Bucket = (Bucket + ProbeAmt++) & (ArraySize-1);
}
@@ -131,43 +115,32 @@ const void * const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const {
/// Grow - Allocate a larger backing store for the buckets and move it over.
///
void SmallPtrSetImplBase::Grow(unsigned NewSize) {
- // Allocate at twice as many buckets, but at least 128.
- unsigned OldSize = CurArraySize;
-
const void **OldBuckets = CurArray;
+ const void **OldEnd = EndPointer();
bool WasSmall = isSmall();
-
+
// Install the new array. Clear all the buckets to empty.
CurArray = (const void**)malloc(sizeof(void*) * NewSize);
assert(CurArray && "Failed to allocate memory?");
CurArraySize = NewSize;
memset(CurArray, -1, NewSize*sizeof(void*));
-
- // Copy over all the elements.
- if (WasSmall) {
- // Small sets store their elements in order.
- for (const void **BucketPtr = OldBuckets, **E = OldBuckets+NumElements;
- BucketPtr != E; ++BucketPtr) {
- const void *Elt = *BucketPtr;
+
+ // Copy over all valid entries.
+ for (const void **BucketPtr = OldBuckets; BucketPtr != OldEnd; ++BucketPtr) {
+ // Copy over the element if it is valid.
+ const void *Elt = *BucketPtr;
+ if (Elt != getTombstoneMarker() && Elt != getEmptyMarker())
*const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
- }
- } else {
- // Copy over all valid entries.
- for (const void **BucketPtr = OldBuckets, **E = OldBuckets+OldSize;
- BucketPtr != E; ++BucketPtr) {
- // Copy over the element if it is valid.
- const void *Elt = *BucketPtr;
- if (Elt != getTombstoneMarker() && Elt != getEmptyMarker())
- *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
- }
-
- free(OldBuckets);
- NumTombstones = 0;
}
+
+ if (!WasSmall)
+ free(OldBuckets);
+ NumNonEmpty -= NumTombstones;
+ NumTombstones = 0;
}
SmallPtrSetImplBase::SmallPtrSetImplBase(const void **SmallStorage,
- const SmallPtrSetImplBase& that) {
+ const SmallPtrSetImplBase &that) {
SmallArray = SmallStorage;
// If we're becoming small, prepare to insert into our stack space
@@ -178,46 +151,18 @@ SmallPtrSetImplBase::SmallPtrSetImplBase(const void **SmallStorage,
CurArray = (const void**)malloc(sizeof(void*) * that.CurArraySize);
assert(CurArray && "Failed to allocate memory?");
}
-
- // Copy over the new array size
- CurArraySize = that.CurArraySize;
- // Copy over the contents from the other set
- memcpy(CurArray, that.CurArray, sizeof(void*)*CurArraySize);
-
- NumElements = that.NumElements;
- NumTombstones = that.NumTombstones;
+  // Copy the array contents over from 'that'.
+ CopyHelper(that);
}
SmallPtrSetImplBase::SmallPtrSetImplBase(const void **SmallStorage,
unsigned SmallSize,
SmallPtrSetImplBase &&that) {
SmallArray = SmallStorage;
-
- // Copy over the basic members.
- CurArraySize = that.CurArraySize;
- NumElements = that.NumElements;
- NumTombstones = that.NumTombstones;
-
- // When small, just copy into our small buffer.
- if (that.isSmall()) {
- CurArray = SmallArray;
- memcpy(CurArray, that.CurArray, sizeof(void *) * CurArraySize);
- } else {
- // Otherwise, we steal the large memory allocation and no copy is needed.
- CurArray = that.CurArray;
- that.CurArray = that.SmallArray;
- }
-
- // Make the "that" object small and empty.
- that.CurArraySize = SmallSize;
- assert(that.CurArray == that.SmallArray);
- that.NumElements = 0;
- that.NumTombstones = 0;
+ MoveHelper(SmallSize, std::move(that));
}
-/// CopyFrom - implement operator= from a smallptrset that has the same pointer
-/// type, but may have a different small size.
void SmallPtrSetImplBase::CopyFrom(const SmallPtrSetImplBase &RHS) {
assert(&RHS != this && "Self-copy should be handled by the caller.");
@@ -243,28 +188,36 @@ void SmallPtrSetImplBase::CopyFrom(const SmallPtrSetImplBase &RHS) {
}
assert(CurArray && "Failed to allocate memory?");
}
-
+
+ CopyHelper(RHS);
+}
+
+void SmallPtrSetImplBase::CopyHelper(const SmallPtrSetImplBase &RHS) {
// Copy over the new array size
CurArraySize = RHS.CurArraySize;
// Copy over the contents from the other set
- memcpy(CurArray, RHS.CurArray, sizeof(void*)*CurArraySize);
-
- NumElements = RHS.NumElements;
+ std::copy(RHS.CurArray, RHS.EndPointer(), CurArray);
+
+ NumNonEmpty = RHS.NumNonEmpty;
NumTombstones = RHS.NumTombstones;
}
void SmallPtrSetImplBase::MoveFrom(unsigned SmallSize,
SmallPtrSetImplBase &&RHS) {
- assert(&RHS != this && "Self-move should be handled by the caller.");
-
if (!isSmall())
free(CurArray);
+ MoveHelper(SmallSize, std::move(RHS));
+}
+
+void SmallPtrSetImplBase::MoveHelper(unsigned SmallSize,
+ SmallPtrSetImplBase &&RHS) {
+ assert(&RHS != this && "Self-move should be handled by the caller.");
if (RHS.isSmall()) {
// Copy a small RHS rather than moving.
CurArray = SmallArray;
- memcpy(CurArray, RHS.CurArray, sizeof(void*)*RHS.CurArraySize);
+ std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, CurArray);
} else {
CurArray = RHS.CurArray;
RHS.CurArray = RHS.SmallArray;
@@ -272,13 +225,13 @@ void SmallPtrSetImplBase::MoveFrom(unsigned SmallSize,
// Copy the rest of the trivial members.
CurArraySize = RHS.CurArraySize;
- NumElements = RHS.NumElements;
+ NumNonEmpty = RHS.NumNonEmpty;
NumTombstones = RHS.NumTombstones;
// Make the RHS small and empty.
RHS.CurArraySize = SmallSize;
assert(RHS.CurArray == RHS.SmallArray);
- RHS.NumElements = 0;
+ RHS.NumNonEmpty = 0;
RHS.NumTombstones = 0;
}
@@ -289,7 +242,7 @@ void SmallPtrSetImplBase::swap(SmallPtrSetImplBase &RHS) {
if (!this->isSmall() && !RHS.isSmall()) {
std::swap(this->CurArray, RHS.CurArray);
std::swap(this->CurArraySize, RHS.CurArraySize);
- std::swap(this->NumElements, RHS.NumElements);
+ std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
std::swap(this->NumTombstones, RHS.NumTombstones);
return;
}
@@ -299,40 +252,44 @@ void SmallPtrSetImplBase::swap(SmallPtrSetImplBase &RHS) {
// If only RHS is small, copy the small elements into LHS and move the pointer
// from LHS to RHS.
if (!this->isSmall() && RHS.isSmall()) {
- std::copy(RHS.SmallArray, RHS.SmallArray+RHS.CurArraySize,
- this->SmallArray);
- std::swap(this->NumElements, RHS.NumElements);
- std::swap(this->CurArraySize, RHS.CurArraySize);
+ assert(RHS.CurArray == RHS.SmallArray);
+ std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, this->SmallArray);
+ std::swap(RHS.CurArraySize, this->CurArraySize);
+ std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
+ std::swap(this->NumTombstones, RHS.NumTombstones);
RHS.CurArray = this->CurArray;
- RHS.NumTombstones = this->NumTombstones;
this->CurArray = this->SmallArray;
- this->NumTombstones = 0;
return;
}
// If only LHS is small, copy the small elements into RHS and move the pointer
// from RHS to LHS.
if (this->isSmall() && !RHS.isSmall()) {
- std::copy(this->SmallArray, this->SmallArray+this->CurArraySize,
+ assert(this->CurArray == this->SmallArray);
+ std::copy(this->CurArray, this->CurArray + this->NumNonEmpty,
RHS.SmallArray);
- std::swap(RHS.NumElements, this->NumElements);
std::swap(RHS.CurArraySize, this->CurArraySize);
+ std::swap(RHS.NumNonEmpty, this->NumNonEmpty);
+ std::swap(RHS.NumTombstones, this->NumTombstones);
this->CurArray = RHS.CurArray;
- this->NumTombstones = RHS.NumTombstones;
RHS.CurArray = RHS.SmallArray;
- RHS.NumTombstones = 0;
return;
}
// Both are small, just swap the small elements.
assert(this->isSmall() && RHS.isSmall());
- assert(this->CurArraySize == RHS.CurArraySize);
- std::swap_ranges(this->SmallArray, this->SmallArray+this->CurArraySize,
+ unsigned MinNonEmpty = std::min(this->NumNonEmpty, RHS.NumNonEmpty);
+ std::swap_ranges(this->SmallArray, this->SmallArray + MinNonEmpty,
RHS.SmallArray);
- std::swap(this->NumElements, RHS.NumElements);
-}
-
-SmallPtrSetImplBase::~SmallPtrSetImplBase() {
- if (!isSmall())
- free(CurArray);
+ if (this->NumNonEmpty > MinNonEmpty) {
+ std::copy(this->SmallArray + MinNonEmpty,
+ this->SmallArray + this->NumNonEmpty,
+ RHS.SmallArray + MinNonEmpty);
+ } else {
+ std::copy(RHS.SmallArray + MinNonEmpty, RHS.SmallArray + RHS.NumNonEmpty,
+ this->SmallArray + MinNonEmpty);
+ }
+ assert(this->CurArraySize == RHS.CurArraySize);
+ std::swap(this->NumNonEmpty, RHS.NumNonEmpty);
+ std::swap(this->NumTombstones, RHS.NumTombstones);
}
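The SmallPtrSet changes above replace the NumElements counter with NumNonEmpty and make the small-representation erase leave a tombstone instead of compacting the inline array. A minimal sketch of the externally visible behaviour, using only the public llvm::SmallPtrSet API (the counters themselves stay internal):

  // Sketch only: the NumNonEmpty/NumTombstones bookkeeping in the diff is an
  // implementation detail; callers keep the usual set semantics.
  #include "llvm/ADT/SmallPtrSet.h"
  #include <cassert>

  void smallPtrSetSketch() {
    int A = 0, B = 1;
    llvm::SmallPtrSet<int *, 4> S;   // small representation: 4 inline slots
    assert(S.insert(&A).second);     // newly inserted
    assert(!S.insert(&A).second);    // already present
    S.erase(&A);                     // in small mode this now leaves a tombstone
    assert(!S.count(&A));
    S.insert(&B);                    // still works; slot reuse is internal
    assert(S.size() == 1);
  }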
diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index ea417c41c0a7..0ffe4444a179 100644
--- a/lib/Support/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp
@@ -50,7 +50,7 @@ std::unique_ptr<SpecialCaseList>
SpecialCaseList::create(const std::vector<std::string> &Paths,
std::string &Error) {
std::unique_ptr<SpecialCaseList> SCL(new SpecialCaseList());
- for (auto Path : Paths) {
+ for (const auto &Path : Paths) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFile(Path);
if (std::error_code EC = FileOrErr.getError()) {
diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp
index e49d1cbe0637..cdd36794507e 100644
--- a/lib/Support/Statistic.cpp
+++ b/lib/Support/Statistic.cpp
@@ -43,6 +43,9 @@ Enabled(
cl::desc("Enable statistics output from program (available with Asserts)"));
+static cl::opt<bool> StatsAsJSON("stats-json",
+ cl::desc("Display statistics as json data"));
+
namespace {
/// StatisticInfo - This class is used in a ManagedStatic so that it is created
/// on demand (when the first statistic is bumped) and destroyed only when
@@ -51,6 +54,10 @@ class StatisticInfo {
std::vector<const Statistic*> Stats;
friend void llvm::PrintStatistics();
friend void llvm::PrintStatistics(raw_ostream &OS);
+ friend void llvm::PrintStatisticsJSON(raw_ostream &OS);
+
+  /// Sort statistics by debug type, name, description.
+ void sort();
public:
~StatisticInfo();
@@ -95,27 +102,32 @@ bool llvm::AreStatisticsEnabled() {
return Enabled;
}
+void StatisticInfo::sort() {
+ std::stable_sort(Stats.begin(), Stats.end(),
+ [](const Statistic *LHS, const Statistic *RHS) {
+ if (int Cmp = std::strcmp(LHS->getDebugType(), RHS->getDebugType()))
+ return Cmp < 0;
+
+ if (int Cmp = std::strcmp(LHS->getName(), RHS->getName()))
+ return Cmp < 0;
+
+ return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0;
+ });
+}
+
void llvm::PrintStatistics(raw_ostream &OS) {
StatisticInfo &Stats = *StatInfo;
// Figure out how long the biggest Value and Name fields are.
- unsigned MaxNameLen = 0, MaxValLen = 0;
+ unsigned MaxDebugTypeLen = 0, MaxValLen = 0;
for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) {
MaxValLen = std::max(MaxValLen,
(unsigned)utostr(Stats.Stats[i]->getValue()).size());
- MaxNameLen = std::max(MaxNameLen,
- (unsigned)std::strlen(Stats.Stats[i]->getName()));
+ MaxDebugTypeLen = std::max(MaxDebugTypeLen,
+ (unsigned)std::strlen(Stats.Stats[i]->getDebugType()));
}
- // Sort the fields by name.
- std::stable_sort(Stats.Stats.begin(), Stats.Stats.end(),
- [](const Statistic *LHS, const Statistic *RHS) {
- if (int Cmp = std::strcmp(LHS->getName(), RHS->getName()))
- return Cmp < 0;
-
- // Secondary key is the description.
- return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0;
- });
+ Stats.sort();
// Print out the statistics header...
OS << "===" << std::string(73, '-') << "===\n"
@@ -126,12 +138,43 @@ void llvm::PrintStatistics(raw_ostream &OS) {
for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i)
OS << format("%*u %-*s - %s\n",
MaxValLen, Stats.Stats[i]->getValue(),
- MaxNameLen, Stats.Stats[i]->getName(),
+ MaxDebugTypeLen, Stats.Stats[i]->getDebugType(),
Stats.Stats[i]->getDesc());
OS << '\n'; // Flush the output stream.
OS.flush();
+}
+
+static void write_json_string_escaped(raw_ostream &OS, const char *string) {
+  // Our current usage should not need any escaping. Keep it simple and just
+  // check that the input is pure ASCII without special characters.
+#ifndef NDEBUG
+ for (const unsigned char *c = (const unsigned char*)string; *c != '\0'; ++c) {
+ assert(*c != '\\' && *c != '\"' && *c >= 0x20 && *c < 0x80);
+ }
+#endif
+ OS << string;
+}
+
+void llvm::PrintStatisticsJSON(raw_ostream &OS) {
+ StatisticInfo &Stats = *StatInfo;
+
+ Stats.sort();
+ // Print all of the statistics.
+ OS << "{\n";
+ const char *delim = "";
+ for (const Statistic *Stat : Stats.Stats) {
+ OS << delim;
+ OS << "\t\"";
+ write_json_string_escaped(OS, Stat->getDebugType());
+ OS << '.';
+ write_json_string_escaped(OS, Stat->getName());
+ OS << "\": " << Stat->getValue();
+ delim = ",\n";
+ }
+ OS << "\n}\n";
+ OS.flush();
}
void llvm::PrintStatistics() {
@@ -143,7 +186,10 @@ void llvm::PrintStatistics() {
// Get the stream to write to.
std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();
- PrintStatistics(*OutStream);
+ if (StatsAsJSON)
+ PrintStatisticsJSON(*OutStream);
+ else
+ PrintStatistics(*OutStream);
#else
// Check if the -stats option is set instead of checking
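With the new -stats-json option (and statistics enabled as before), the counters are printed as one JSON object keyed by "<debug type>.<name>". An illustrative fragment of the output shape; the two counter names below are made up for the example, not taken from the patch:

  {
          "instcount.NumInstructions": 421,
          "regalloc.NumSpills": 7
  }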
diff --git a/lib/Support/StreamingMemoryObject.cpp b/lib/Support/StreamingMemoryObject.cpp
index 5a44e624eb8c..fb566179486a 100644
--- a/lib/Support/StreamingMemoryObject.cpp
+++ b/lib/Support/StreamingMemoryObject.cpp
@@ -104,6 +104,12 @@ uint64_t StreamingMemoryObject::readBytes(uint8_t *Buf, uint64_t Size,
return Size;
}
+const uint8_t *StreamingMemoryObject::getPointer(uint64_t Address,
+ uint64_t Size) const {
+ fetchToPos(Address + Size - 1);
+ return &Bytes[Address + BytesSkipped];
+}
+
bool StreamingMemoryObject::dropLeadingBytes(size_t s) {
if (BytesRead < s) return true;
BytesSkipped = s;
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index 7be946642d90..7da9ccbd40c7 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -17,12 +17,26 @@
#include <cassert>
using namespace llvm;
+/// Returns the number of buckets to allocate to ensure that the StringMap can
+/// accommodate \p NumEntries without needing to grow().
+static unsigned getMinBucketToReserveForEntries(unsigned NumEntries) {
+ // Ensure that "NumEntries * 4 < NumBuckets * 3"
+ if (NumEntries == 0)
+ return 0;
+ // +1 is required because of the strict equality.
+  // For example, if NumEntries is 48 then 48 * 4 / 3 + 1 == 65 and we return 128.
+ return NextPowerOf2(NumEntries * 4 / 3 + 1);
+}
+
StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
ItemSize = itemSize;
// If a size is specified, initialize the table with that many buckets.
if (InitSize) {
- init(InitSize);
+    // The table will grow when the number of entries reaches 3/4 of the number of
+ // buckets. To guarantee that "InitSize" number of entries can be inserted
+ // in the table without growing, we allocate just what is needed here.
+ init(getMinBucketToReserveForEntries(InitSize));
return;
}
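Worked through, the reservation math keeps the load factor strictly below 3/4. A hedged sketch of the same computation as a free function (the helper name here is ours, mirroring getMinBucketToReserveForEntries; NextPowerOf2 is the existing helper from llvm/Support/MathExtras.h):

  // For NumEntries == 48: 48 * 4 / 3 == 64, +1 == 65, NextPowerOf2(65) == 128,
  // and 48 * 4 == 192 < 128 * 3 == 384, so no grow() is triggered.
  #include "llvm/Support/MathExtras.h"

  static unsigned minBucketsFor(unsigned NumEntries) {
    return NumEntries ? llvm::NextPowerOf2(NumEntries * 4 / 3 + 1) : 0;
  }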
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index 7ecff2964c51..8a9da5edca84 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -351,12 +351,12 @@ size_t StringRef::count(StringRef Str) const {
}
static unsigned GetAutoSenseRadix(StringRef &Str) {
- if (Str.startswith("0x")) {
+ if (Str.startswith("0x") || Str.startswith("0X")) {
Str = Str.substr(2);
return 16;
}
- if (Str.startswith("0b")) {
+ if (Str.startswith("0b") || Str.startswith("0B")) {
Str = Str.substr(2);
return 2;
}
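With the upper-case prefixes accepted, auto-sensed parsing (radix 0) now treats "0X..." and "0B..." like their lower-case forms. A small usage sketch, assuming the usual StringRef::getAsInteger contract (returns true on failure):

  #include "llvm/ADT/StringRef.h"
  #include <cassert>

  void radixSketch() {
    unsigned V = 0;
    assert(!llvm::StringRef("0XFF").getAsInteger(/*Radix=*/0, V) && V == 255);
    assert(!llvm::StringRef("0B101").getAsInteger(/*Radix=*/0, V) && V == 5);
  }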
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index 337532edbbc6..c3f86130b68c 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -21,6 +21,7 @@
using namespace llvm;
using namespace ARM;
+using namespace AArch64;
namespace {
@@ -75,6 +76,11 @@ static const struct {
{NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \
sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ID, ARCH_ATTR},
#include "llvm/Support/ARMTargetParser.def"
+},AArch64ARCHNames[] = {
+#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \
+ {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \
+ sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ID, ARCH_ATTR},
+#include "llvm/Support/AArch64TargetParser.def"
};
// List of Arch Extension names.
@@ -91,6 +97,10 @@ static const struct {
#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
{ NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE },
#include "llvm/Support/ARMTargetParser.def"
+},AArch64ARCHExtNames[] = {
+#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
+ { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE },
+#include "llvm/Support/AArch64TargetParser.def"
};
// List of HWDiv names (use getHWDivSynonym) and which architectural
@@ -124,6 +134,10 @@ static const struct {
#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
{ NAME, sizeof(NAME) - 1, ID, IS_DEFAULT, DEFAULT_EXT },
#include "llvm/Support/ARMTargetParser.def"
+},AArch64CPUNames[] = {
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
+ { NAME, sizeof(NAME) - 1, ID, IS_DEFAULT, DEFAULT_EXT },
+#include "llvm/Support/AArch64TargetParser.def"
};
} // namespace
@@ -305,13 +319,13 @@ StringRef llvm::ARM::getArchName(unsigned ArchKind) {
}
StringRef llvm::ARM::getCPUAttr(unsigned ArchKind) {
- if (ArchKind >= ARM::AK_LAST)
+ if (ArchKind == ARM::AK_INVALID || ArchKind >= ARM::AK_LAST)
return StringRef();
return ARCHNames[ArchKind].getCPUAttr();
}
StringRef llvm::ARM::getSubArch(unsigned ArchKind) {
- if (ArchKind >= ARM::AK_LAST)
+ if (ArchKind == ARM::AK_INVALID || ArchKind >= ARM::AK_LAST)
return StringRef();
return ARCHNames[ArchKind].getSubArch();
}
@@ -369,6 +383,156 @@ StringRef llvm::ARM::getDefaultCPU(StringRef Arch) {
return "generic";
}
+StringRef llvm::AArch64::getFPUName(unsigned FPUKind) {
+ return ARM::getFPUName(FPUKind);
+}
+
+unsigned llvm::AArch64::getFPUVersion(unsigned FPUKind) {
+ return ARM::getFPUVersion(FPUKind);
+}
+
+unsigned llvm::AArch64::getFPUNeonSupportLevel(unsigned FPUKind) {
+  return ARM::getFPUNeonSupportLevel(FPUKind);
+}
+
+unsigned llvm::AArch64::getFPURestriction(unsigned FPUKind) {
+ return ARM::getFPURestriction(FPUKind);
+}
+
+unsigned llvm::AArch64::getDefaultFPU(StringRef CPU, unsigned ArchKind) {
+ if (CPU == "generic")
+ return AArch64ARCHNames[ArchKind].DefaultFPU;
+
+ return StringSwitch<unsigned>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
+ .Case(NAME, DEFAULT_FPU)
+#include "llvm/Support/AArch64TargetParser.def"
+ .Default(ARM::FK_INVALID);
+}
+
+unsigned llvm::AArch64::getDefaultExtensions(StringRef CPU, unsigned ArchKind) {
+ if (CPU == "generic")
+ return AArch64ARCHNames[ArchKind].ArchBaseExtensions;
+
+ return StringSwitch<unsigned>(CPU)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
+ .Case(NAME, AArch64ARCHNames[ID].ArchBaseExtensions | DEFAULT_EXT)
+#include "llvm/Support/AArch64TargetParser.def"
+ .Default(AArch64::AEK_INVALID);
+}
+
+bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
+ std::vector<const char *> &Features) {
+
+ if (Extensions == AArch64::AEK_INVALID)
+ return false;
+
+ if (Extensions & AArch64::AEK_FP)
+ Features.push_back("+fp-armv8");
+ if (Extensions & AArch64::AEK_SIMD)
+ Features.push_back("+neon");
+ if (Extensions & AArch64::AEK_CRC)
+ Features.push_back("+crc");
+ if (Extensions & AArch64::AEK_CRYPTO)
+ Features.push_back("+crypto");
+ if (Extensions & AArch64::AEK_FP16)
+ Features.push_back("+fullfp16");
+ if (Extensions & AArch64::AEK_PROFILE)
+ Features.push_back("+spe");
+ if (Extensions & AArch64::AEK_RAS)
+ Features.push_back("+ras");
+
+ return true;
+}
+
+bool llvm::AArch64::getFPUFeatures(unsigned FPUKind,
+ std::vector<const char *> &Features) {
+ return ARM::getFPUFeatures(FPUKind, Features);
+}
+
+bool llvm::AArch64::getArchFeatures(unsigned ArchKind,
+ std::vector<const char *> &Features) {
+ if (ArchKind == ARM::AK_INVALID || ArchKind >= ARM::AK_LAST)
+ return false;
+
+ if (ArchKind == ARM::AK_ARMV8_1A)
+ Features.push_back("+v8.1a");
+ if (ArchKind == ARM::AK_ARMV8_2A)
+ Features.push_back("+v8.2a");
+
+ return true;
+}
+
+StringRef llvm::AArch64::getArchName(unsigned ArchKind) {
+ for (const auto &AI : AArch64ARCHNames)
+ if (AI.ID == ArchKind)
+ return AI.getName();
+ return StringRef();
+}
+
+StringRef llvm::AArch64::getCPUAttr(unsigned ArchKind) {
+ for (const auto &AI : AArch64ARCHNames)
+ if (AI.ID == ArchKind)
+ return AI.getCPUAttr();
+ return StringRef();
+}
+
+StringRef llvm::AArch64::getSubArch(unsigned ArchKind) {
+ for (const auto &AI : AArch64ARCHNames)
+ if (AI.ID == ArchKind)
+ return AI.getSubArch();
+ return StringRef();
+}
+
+unsigned llvm::AArch64::getArchAttr(unsigned ArchKind) {
+ for (const auto &AI : AArch64ARCHNames)
+ if (AI.ID == ArchKind)
+ return AI.ArchAttr;
+ return ARMBuildAttrs::CPUArch::v8_A;
+}
+
+StringRef llvm::AArch64::getArchExtName(unsigned AArchExtKind) {
+ for (const auto &AE : AArch64ARCHExtNames)
+ if (AArchExtKind == AE.ID)
+ return AE.getName();
+ return StringRef();
+}
+
+const char *llvm::AArch64::getArchExtFeature(StringRef ArchExt) {
+ if (ArchExt.startswith("no")) {
+ StringRef ArchExtBase(ArchExt.substr(2));
+ for (const auto &AE : AArch64ARCHExtNames) {
+ if (AE.NegFeature && ArchExtBase == AE.getName())
+ return AE.NegFeature;
+ }
+ }
+
+ for (const auto &AE : AArch64ARCHExtNames)
+ if (AE.Feature && ArchExt == AE.getName())
+ return AE.Feature;
+ return nullptr;
+}
+
+StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) {
+ unsigned AK = parseArch(Arch);
+ if (AK == ARM::AK_INVALID)
+ return StringRef();
+
+ // Look for multiple AKs to find the default for pair AK+Name.
+ for (const auto &CPU : AArch64CPUNames)
+ if (CPU.ArchID == AK && CPU.Default)
+ return CPU.getName();
+
+  // If we can't find a default, target the architecture instead.
+ return "generic";
+}
+
+unsigned llvm::AArch64::checkArchVersion(StringRef Arch) {
+ if (Arch[0] == 'v' && std::isdigit(Arch[1]))
+ return (Arch[1] - 48);
+ return 0;
+}
+
// ======================================================= //
// Parsers
// ======================================================= //
@@ -411,6 +575,8 @@ static StringRef getArchSynonym(StringRef Arch) {
.Cases("v8", "v8a", "aarch64", "arm64", "v8-a")
.Case("v8.1a", "v8.1-a")
.Case("v8.2a", "v8.2-a")
+ .Case("v8m.base", "v8-m.base")
+ .Case("v8m.main", "v8-m.main")
.Default(Arch);
}
@@ -548,6 +714,8 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) {
case ARM::AK_ARMV6M:
case ARM::AK_ARMV7M:
case ARM::AK_ARMV7EM:
+ case ARM::AK_ARMV8MMainline:
+ case ARM::AK_ARMV8MBaseline:
return ARM::PK_M;
case ARM::AK_ARMV7R:
return ARM::PK_R;
@@ -597,7 +765,67 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
case ARM::AK_ARMV8A:
case ARM::AK_ARMV8_1A:
case ARM::AK_ARMV8_2A:
+ case ARM::AK_ARMV8MBaseline:
+ case ARM::AK_ARMV8MMainline:
return 8;
}
return 0;
}
+
+StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) {
+ return ARM::getCanonicalArchName(Arch);
+}
+
+unsigned llvm::AArch64::parseFPU(StringRef FPU) {
+ return ARM::parseFPU(FPU);
+}
+
+// Allows partial match, e.g. "v8a" matches "armv8a".
+unsigned llvm::AArch64::parseArch(StringRef Arch) {
+ Arch = getCanonicalArchName(Arch);
+ if (checkArchVersion(Arch) < 8)
+ return ARM::AK_INVALID;
+
+ StringRef Syn = getArchSynonym(Arch);
+ for (const auto A : AArch64ARCHNames) {
+ if (A.getName().endswith(Syn))
+ return A.ID;
+ }
+ return ARM::AK_INVALID;
+}
+
+unsigned llvm::AArch64::parseArchExt(StringRef ArchExt) {
+ for (const auto A : AArch64ARCHExtNames) {
+ if (ArchExt == A.getName())
+ return A.ID;
+ }
+ return AArch64::AEK_INVALID;
+}
+
+unsigned llvm::AArch64::parseCPUArch(StringRef CPU) {
+ for (const auto C : AArch64CPUNames) {
+ if (CPU == C.getName())
+ return C.ArchID;
+ }
+ return ARM::AK_INVALID;
+}
+
+// ARM, Thumb, AArch64
+unsigned llvm::AArch64::parseArchISA(StringRef Arch) {
+ return ARM::parseArchISA(Arch);
+}
+
+// Little/Big endian
+unsigned llvm::AArch64::parseArchEndian(StringRef Arch) {
+ return ARM::parseArchEndian(Arch);
+}
+
+// Profile A/R/M
+unsigned llvm::AArch64::parseArchProfile(StringRef Arch) {
+ return ARM::parseArchProfile(Arch);
+}
+
+// Version number (ex. v8 = 8).
+unsigned llvm::AArch64::parseArchVersion(StringRef Arch) {
+ return ARM::parseArchVersion(Arch);
+}
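The llvm::AArch64 entry points added above mirror the existing ARM ones. A hedged sketch of how a driver might chain them; the architecture string is illustrative and error handling is elided:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/TargetParser.h"
  #include <vector>

  void aarch64ParseSketch() {
    unsigned AK = llvm::AArch64::parseArch("armv8.1-a");
    llvm::StringRef CPU = llvm::AArch64::getDefaultCPU("armv8.1-a");
    unsigned Ext = llvm::AArch64::getDefaultExtensions(CPU, AK);
    std::vector<const char *> Features;
    if (Ext != llvm::AArch64::AEK_INVALID)
      llvm::AArch64::getExtensionFeatures(Ext, Features); // e.g. "+crc", "+crypto"
  }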
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index eefef8ad8eaa..02a6d334b911 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -73,8 +73,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &TT,
auto I = std::find_if(targets().begin(), targets().end(), ArchMatch);
if (I == targets().end()) {
- Error = "No available targets are compatible with this triple, "
- "see -version for the available targets.";
+ Error = "No available targets are compatible with this triple.";
return nullptr;
}
diff --git a/lib/Support/ThreadPool.cpp b/lib/Support/ThreadPool.cpp
index d4dcb2ee96df..db03a4d6240d 100644
--- a/lib/Support/ThreadPool.cpp
+++ b/lib/Support/ThreadPool.cpp
@@ -75,8 +75,11 @@ ThreadPool::ThreadPool(unsigned ThreadCount)
void ThreadPool::wait() {
// Wait for all threads to complete and the queue to be empty
std::unique_lock<std::mutex> LockGuard(CompletionLock);
+ // The order of the checks for ActiveThreads and Tasks.empty() matters because
+ // any active threads might be modifying the Tasks queue, and this would be a
+ // race.
CompletionCondition.wait(LockGuard,
- [&] { return Tasks.empty() && !ActiveThreads; });
+ [&] { return !ActiveThreads && Tasks.empty(); });
}
std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) {
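The reordered predicate makes wait() check ActiveThreads before looking at the Tasks queue, since a still-active worker may be mutating the queue. A brief usage sketch of the pool itself (the task body is illustrative):

  #include "llvm/Support/ThreadPool.h"
  #include <atomic>

  void threadPoolSketch() {
    std::atomic<unsigned> Done(0);
    llvm::ThreadPool Pool(4);            // four worker threads
    for (unsigned I = 0; I != 16; ++I)
      Pool.async([&Done] { ++Done; });   // enqueue work
    Pool.wait();                         // returns once no thread is active and the queue is empty
  }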
diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp
index ca7f3f64aa37..e8f5622d0e5c 100644
--- a/lib/Support/Threading.cpp
+++ b/lib/Support/Threading.cpp
@@ -16,6 +16,7 @@
#include "llvm/Config/config.h"
#include "llvm/Support/Atomic.h"
#include "llvm/Support/Mutex.h"
+#include "llvm/Support/thread.h"
#include <cassert>
using namespace llvm;
@@ -71,6 +72,11 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
#include "Windows/WindowsSupport.h"
#include <process.h>
+// Windows will at times define MemoryFence.
+#ifdef MemoryFence
+#undef MemoryFence
+#endif
+
struct ThreadInfo {
void (*func)(void*);
void *param;
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 414f559f8f0e..49bd39e14e28 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -79,7 +79,7 @@ static TimerGroup *getDefaultTimerGroup() {
TimerGroup *tmp = DefaultTimerGroup;
sys::MemoryFence();
if (tmp) return tmp;
-
+
sys::SmartScopedLock<true> Lock(*TimerLock);
tmp = DefaultTimerGroup;
if (!tmp) {
@@ -120,7 +120,7 @@ static inline size_t getMemUsage() {
TimeRecord TimeRecord::getCurrentTime(bool Start) {
TimeRecord Result;
sys::TimeValue now(0,0), user(0,0), sys(0,0);
-
+
if (Start) {
Result.MemUsed = getMemUsage();
sys::Process::GetTimeUsage(now, user, sys);
@@ -168,9 +168,9 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const {
if (Total.getProcessTime())
printVal(getProcessTime(), Total.getProcessTime(), OS);
printVal(getWallTime(), Total.getWallTime(), OS);
-
+
OS << " ";
-
+
if (Total.getMemUsed())
OS << format("%9" PRId64 " ", (int64_t)getMemUsed());
}
@@ -192,15 +192,15 @@ public:
I = Map.begin(), E = Map.end(); I != E; ++I)
delete I->second.first;
}
-
+
Timer &get(StringRef Name, StringRef GroupName) {
sys::SmartScopedLock<true> L(*TimerLock);
-
+
std::pair<TimerGroup*, Name2TimerMap> &GroupEntry = Map[GroupName];
-
+
if (!GroupEntry.first)
GroupEntry.first = new TimerGroup(GroupName);
-
+
Timer &T = GroupEntry.second[Name];
if (!T.isInitialized())
T.init(Name, *GroupEntry.first);
@@ -215,7 +215,7 @@ static ManagedStatic<Name2PairMap> NamedGroupedTimers;
static Timer &getNamedRegionTimer(StringRef Name) {
sys::SmartScopedLock<true> L(*TimerLock);
-
+
Timer &T = (*NamedTimers)[Name];
if (!T.isInitialized())
T.init(Name);
@@ -240,7 +240,7 @@ static TimerGroup *TimerGroupList = nullptr;
TimerGroup::TimerGroup(StringRef name)
: Name(name.begin(), name.end()), FirstTimer(nullptr) {
-
+
// Add the group to TimerGroupList.
sys::SmartScopedLock<true> L(*TimerLock);
if (TimerGroupList)
@@ -255,7 +255,7 @@ TimerGroup::~TimerGroup() {
// print the timing data.
while (FirstTimer)
removeTimer(*FirstTimer);
-
+
// Remove the group from the TimerGroupList.
sys::SmartScopedLock<true> L(*TimerLock);
*Prev = Next;
@@ -266,18 +266,18 @@ TimerGroup::~TimerGroup() {
void TimerGroup::removeTimer(Timer &T) {
sys::SmartScopedLock<true> L(*TimerLock);
-
+
// If the timer was started, move its data to TimersToPrint.
if (T.hasTriggered())
TimersToPrint.emplace_back(T.Time, T.Name);
T.TG = nullptr;
-
+
// Unlink the timer from our list.
*T.Prev = T.Next;
if (T.Next)
T.Next->Prev = T.Prev;
-
+
// Print the report when all timers in this group are destroyed if some of
// them were started.
if (FirstTimer || TimersToPrint.empty())
@@ -289,7 +289,7 @@ void TimerGroup::removeTimer(Timer &T) {
void TimerGroup::addTimer(Timer &T) {
sys::SmartScopedLock<true> L(*TimerLock);
-
+
// Add the timer to our list.
if (FirstTimer)
FirstTimer->Prev = &T.Next;
@@ -301,11 +301,11 @@ void TimerGroup::addTimer(Timer &T) {
void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
// Sort the timers in descending order by amount of time taken.
std::sort(TimersToPrint.begin(), TimersToPrint.end());
-
+
TimeRecord Total;
for (auto &RecordNamePair : TimersToPrint)
Total += RecordNamePair.first;
-
+
// Print out timing header.
OS << "===" << std::string(73, '-') << "===\n";
// Figure out how many spaces to indent TimerGroup name.
@@ -313,7 +313,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
if (Padding > 80) Padding = 0; // Don't allow "negative" numbers
OS.indent(Padding) << Name << '\n';
OS << "===" << std::string(73, '-') << "===\n";
-
+
// If this is not a collection of ungrouped times, print the total time.
// Ungrouped timers don't really make sense to add up. We still print the
// TOTAL line to make the percentages make sense.
@@ -321,7 +321,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
OS << format(" Total Execution Time: %5.4f seconds (%5.4f wall clock)\n",
Total.getProcessTime(), Total.getWallTime());
OS << '\n';
-
+
if (Total.getUserTime())
OS << " ---User Time---";
if (Total.getSystemTime())
@@ -332,18 +332,18 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
if (Total.getMemUsed())
OS << " ---Mem---";
OS << " --- Name ---\n";
-
+
// Loop through all of the timing data, printing it out.
for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) {
const std::pair<TimeRecord, std::string> &Entry = TimersToPrint[e-i-1];
Entry.first.print(Total, OS);
OS << Entry.second << '\n';
}
-
+
Total.print(Total, OS);
OS << "Total\n\n";
OS.flush();
-
+
TimersToPrint.clear();
}
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 11afcf7f0adc..cfa12a9f0b27 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -19,50 +19,53 @@ using namespace llvm;
const char *Triple::getArchTypeName(ArchType Kind) {
switch (Kind) {
- case UnknownArch: return "unknown";
-
- case aarch64: return "aarch64";
- case aarch64_be: return "aarch64_be";
- case arm: return "arm";
- case armeb: return "armeb";
- case avr: return "avr";
- case bpfel: return "bpfel";
- case bpfeb: return "bpfeb";
- case hexagon: return "hexagon";
- case mips: return "mips";
- case mipsel: return "mipsel";
- case mips64: return "mips64";
- case mips64el: return "mips64el";
- case msp430: return "msp430";
- case ppc64: return "powerpc64";
- case ppc64le: return "powerpc64le";
- case ppc: return "powerpc";
- case r600: return "r600";
- case amdgcn: return "amdgcn";
- case sparc: return "sparc";
- case sparcv9: return "sparcv9";
- case sparcel: return "sparcel";
- case systemz: return "s390x";
- case tce: return "tce";
- case thumb: return "thumb";
- case thumbeb: return "thumbeb";
- case x86: return "i386";
- case x86_64: return "x86_64";
- case xcore: return "xcore";
- case nvptx: return "nvptx";
- case nvptx64: return "nvptx64";
- case le32: return "le32";
- case le64: return "le64";
- case amdil: return "amdil";
- case amdil64: return "amdil64";
- case hsail: return "hsail";
- case hsail64: return "hsail64";
- case spir: return "spir";
- case spir64: return "spir64";
- case kalimba: return "kalimba";
- case shave: return "shave";
- case wasm32: return "wasm32";
- case wasm64: return "wasm64";
+ case UnknownArch: return "unknown";
+
+ case aarch64: return "aarch64";
+ case aarch64_be: return "aarch64_be";
+ case arm: return "arm";
+ case armeb: return "armeb";
+ case avr: return "avr";
+ case bpfel: return "bpfel";
+ case bpfeb: return "bpfeb";
+ case hexagon: return "hexagon";
+ case mips: return "mips";
+ case mipsel: return "mipsel";
+ case mips64: return "mips64";
+ case mips64el: return "mips64el";
+ case msp430: return "msp430";
+ case ppc64: return "powerpc64";
+ case ppc64le: return "powerpc64le";
+ case ppc: return "powerpc";
+ case r600: return "r600";
+ case amdgcn: return "amdgcn";
+ case sparc: return "sparc";
+ case sparcv9: return "sparcv9";
+ case sparcel: return "sparcel";
+ case systemz: return "s390x";
+ case tce: return "tce";
+ case thumb: return "thumb";
+ case thumbeb: return "thumbeb";
+ case x86: return "i386";
+ case x86_64: return "x86_64";
+ case xcore: return "xcore";
+ case nvptx: return "nvptx";
+ case nvptx64: return "nvptx64";
+ case le32: return "le32";
+ case le64: return "le64";
+ case amdil: return "amdil";
+ case amdil64: return "amdil64";
+ case hsail: return "hsail";
+ case hsail64: return "hsail64";
+ case spir: return "spir";
+ case spir64: return "spir64";
+ case kalimba: return "kalimba";
+ case lanai: return "lanai";
+ case shave: return "shave";
+ case wasm32: return "wasm32";
+ case wasm64: return "wasm64";
+ case renderscript32: return "renderscript32";
+ case renderscript64: return "renderscript64";
}
llvm_unreachable("Invalid ArchType!");
@@ -94,8 +97,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case hexagon: return "hexagon";
- case amdgcn:
- case r600: return "amdgpu";
+ case amdgcn: return "amdgcn";
+ case r600: return "r600";
case bpfel:
case bpfeb: return "bpf";
@@ -111,8 +114,9 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case xcore: return "xcore";
- case nvptx: return "nvptx";
- case nvptx64: return "nvptx";
+ // NVPTX intrinsics are namespaced under nvvm.
+ case nvptx: return "nvvm";
+ case nvptx64: return "nvvm";
case le32: return "le32";
case le64: return "le64";
@@ -126,6 +130,7 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case spir:
case spir64: return "spir";
case kalimba: return "kalimba";
+ case lanai: return "lanai";
case shave: return "shave";
case wasm32:
case wasm64: return "wasm";
@@ -148,6 +153,8 @@ const char *Triple::getVendorTypeName(VendorType Kind) {
case NVIDIA: return "nvidia";
case CSR: return "csr";
case Myriad: return "myriad";
+ case AMD: return "amd";
+ case Mesa: return "mesa";
}
llvm_unreachable("Invalid VendorType!");
@@ -184,6 +191,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
case ELFIAMCU: return "elfiamcu";
case TvOS: return "tvos";
case WatchOS: return "watchos";
+ case Mesa3D: return "mesa3d";
}
llvm_unreachable("Invalid OSType");
@@ -200,6 +208,9 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case EABI: return "eabi";
case EABIHF: return "eabihf";
case Android: return "android";
+ case Musl: return "musl";
+ case MuslEABI: return "musleabi";
+ case MuslEABIHF: return "musleabihf";
case MSVC: return "msvc";
case Itanium: return "itanium";
case Cygnus: return "cygnus";
@@ -268,9 +279,12 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("spir", spir)
.Case("spir64", spir64)
.Case("kalimba", kalimba)
+ .Case("lanai", lanai)
.Case("shave", shave)
.Case("wasm32", wasm32)
.Case("wasm64", wasm64)
+ .Case("renderscript32", renderscript32)
+ .Case("renderscript64", renderscript64)
.Default(UnknownArch);
}
@@ -376,9 +390,12 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Case("spir", Triple::spir)
.Case("spir64", Triple::spir64)
.StartsWith("kalimba", Triple::kalimba)
+ .Case("lanai", Triple::lanai)
.Case("shave", Triple::shave)
.Case("wasm32", Triple::wasm32)
.Case("wasm64", Triple::wasm64)
+ .Case("renderscript32", Triple::renderscript32)
+ .Case("renderscript64", Triple::renderscript64)
.Default(Triple::UnknownArch);
// Some architectures require special parsing logic just to compute the
@@ -408,6 +425,8 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("nvidia", Triple::NVIDIA)
.Case("csr", Triple::CSR)
.Case("myriad", Triple::Myriad)
+ .Case("amd", Triple::AMD)
+ .Case("mesa", Triple::Mesa)
.Default(Triple::UnknownVendor);
}
@@ -441,6 +460,7 @@ static Triple::OSType parseOS(StringRef OSName) {
.StartsWith("elfiamcu", Triple::ELFIAMCU)
.StartsWith("tvos", Triple::TvOS)
.StartsWith("watchos", Triple::WatchOS)
+ .StartsWith("mesa3d", Triple::Mesa3D)
.Default(Triple::UnknownOS);
}
@@ -454,6 +474,9 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("code16", Triple::CODE16)
.StartsWith("gnu", Triple::GNU)
.StartsWith("android", Triple::Android)
+ .StartsWith("musleabihf", Triple::MuslEABIHF)
+ .StartsWith("musleabi", Triple::MuslEABI)
+ .StartsWith("musl", Triple::Musl)
.StartsWith("msvc", Triple::MSVC)
.StartsWith("itanium", Triple::Itanium)
.StartsWith("cygnus", Triple::Cygnus)
@@ -521,6 +544,10 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
return Triple::ARMSubArch_v8_1a;
case ARM::AK_ARMV8_2A:
return Triple::ARMSubArch_v8_2a;
+ case ARM::AK_ARMV8MBaseline:
+ return Triple::ARMSubArch_v8m_baseline;
+ case ARM::AK_ARMV8MMainline:
+ return Triple::ARMSubArch_v8m_mainline;
default:
return Triple::NoSubArch;
}
@@ -559,6 +586,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::bpfeb:
case Triple::bpfel:
case Triple::hexagon:
+ case Triple::lanai:
case Triple::hsail:
case Triple::hsail64:
case Triple::kalimba:
@@ -573,6 +601,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::nvptx64:
case Triple::ppc64le:
case Triple::r600:
+ case Triple::renderscript32:
+ case Triple::renderscript64:
case Triple::shave:
case Triple::sparc:
case Triple::sparcel:
@@ -1111,8 +1141,10 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::hsail:
case llvm::Triple::spir:
case llvm::Triple::kalimba:
+ case llvm::Triple::lanai:
case llvm::Triple::shave:
case llvm::Triple::wasm32:
+ case llvm::Triple::renderscript32:
return 32;
case llvm::Triple::aarch64:
@@ -1133,6 +1165,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::hsail64:
case llvm::Triple::spir64:
case llvm::Triple::wasm64:
+ case llvm::Triple::renderscript64:
return 64;
}
llvm_unreachable("Invalid architecture value");
@@ -1184,24 +1217,27 @@ Triple Triple::get32BitArchVariant() const {
case Triple::thumbeb:
case Triple::x86:
case Triple::xcore:
+ case Triple::lanai:
case Triple::shave:
case Triple::wasm32:
+ case Triple::renderscript32:
// Already 32-bit.
break;
- case Triple::aarch64: T.setArch(Triple::arm); break;
- case Triple::aarch64_be: T.setArch(Triple::armeb); break;
- case Triple::le64: T.setArch(Triple::le32); break;
- case Triple::mips64: T.setArch(Triple::mips); break;
- case Triple::mips64el: T.setArch(Triple::mipsel); break;
- case Triple::nvptx64: T.setArch(Triple::nvptx); break;
- case Triple::ppc64: T.setArch(Triple::ppc); break;
- case Triple::sparcv9: T.setArch(Triple::sparc); break;
- case Triple::x86_64: T.setArch(Triple::x86); break;
- case Triple::amdil64: T.setArch(Triple::amdil); break;
- case Triple::hsail64: T.setArch(Triple::hsail); break;
- case Triple::spir64: T.setArch(Triple::spir); break;
- case Triple::wasm64: T.setArch(Triple::wasm32); break;
+ case Triple::aarch64: T.setArch(Triple::arm); break;
+ case Triple::aarch64_be: T.setArch(Triple::armeb); break;
+ case Triple::le64: T.setArch(Triple::le32); break;
+ case Triple::mips64: T.setArch(Triple::mips); break;
+ case Triple::mips64el: T.setArch(Triple::mipsel); break;
+ case Triple::nvptx64: T.setArch(Triple::nvptx); break;
+ case Triple::ppc64: T.setArch(Triple::ppc); break;
+ case Triple::sparcv9: T.setArch(Triple::sparc); break;
+ case Triple::x86_64: T.setArch(Triple::x86); break;
+ case Triple::amdil64: T.setArch(Triple::amdil); break;
+ case Triple::hsail64: T.setArch(Triple::hsail); break;
+ case Triple::spir64: T.setArch(Triple::spir); break;
+ case Triple::wasm64: T.setArch(Triple::wasm32); break;
+ case Triple::renderscript64: T.setArch(Triple::renderscript32); break;
}
return T;
}
@@ -1213,6 +1249,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::avr:
case Triple::hexagon:
case Triple::kalimba:
+ case Triple::lanai:
case Triple::msp430:
case Triple::r600:
case Triple::tce:
@@ -1240,30 +1277,35 @@ Triple Triple::get64BitArchVariant() const {
case Triple::systemz:
case Triple::x86_64:
case Triple::wasm64:
+ case Triple::renderscript64:
// Already 64-bit.
break;
- case Triple::arm: T.setArch(Triple::aarch64); break;
- case Triple::armeb: T.setArch(Triple::aarch64_be); break;
- case Triple::le32: T.setArch(Triple::le64); break;
- case Triple::mips: T.setArch(Triple::mips64); break;
- case Triple::mipsel: T.setArch(Triple::mips64el); break;
- case Triple::nvptx: T.setArch(Triple::nvptx64); break;
- case Triple::ppc: T.setArch(Triple::ppc64); break;
- case Triple::sparc: T.setArch(Triple::sparcv9); break;
- case Triple::x86: T.setArch(Triple::x86_64); break;
- case Triple::amdil: T.setArch(Triple::amdil64); break;
- case Triple::hsail: T.setArch(Triple::hsail64); break;
- case Triple::spir: T.setArch(Triple::spir64); break;
- case Triple::thumb: T.setArch(Triple::aarch64); break;
- case Triple::thumbeb: T.setArch(Triple::aarch64_be); break;
- case Triple::wasm32: T.setArch(Triple::wasm64); break;
+ case Triple::arm: T.setArch(Triple::aarch64); break;
+ case Triple::armeb: T.setArch(Triple::aarch64_be); break;
+ case Triple::le32: T.setArch(Triple::le64); break;
+ case Triple::mips: T.setArch(Triple::mips64); break;
+ case Triple::mipsel: T.setArch(Triple::mips64el); break;
+ case Triple::nvptx: T.setArch(Triple::nvptx64); break;
+ case Triple::ppc: T.setArch(Triple::ppc64); break;
+ case Triple::sparc: T.setArch(Triple::sparcv9); break;
+ case Triple::x86: T.setArch(Triple::x86_64); break;
+ case Triple::amdil: T.setArch(Triple::amdil64); break;
+ case Triple::hsail: T.setArch(Triple::hsail64); break;
+ case Triple::spir: T.setArch(Triple::spir64); break;
+ case Triple::thumb: T.setArch(Triple::aarch64); break;
+ case Triple::thumbeb: T.setArch(Triple::aarch64_be); break;
+ case Triple::wasm32: T.setArch(Triple::wasm64); break;
+ case Triple::renderscript32: T.setArch(Triple::renderscript64); break;
}
return T;
}
Triple Triple::getBigEndianArchVariant() const {
Triple T(*this);
+ // Already big endian.
+ if (!isLittleEndian())
+ return T;
switch (getArch()) {
case Triple::UnknownArch:
case Triple::amdgcn:
@@ -1288,6 +1330,8 @@ Triple Triple::getBigEndianArchVariant() const {
case Triple::x86:
case Triple::x86_64:
case Triple::xcore:
+ case Triple::renderscript32:
+ case Triple::renderscript64:
// ARM is intentionally unsupported here, changing the architecture would
// drop any arch suffixes.
@@ -1296,35 +1340,26 @@ Triple Triple::getBigEndianArchVariant() const {
T.setArch(UnknownArch);
break;
- case Triple::aarch64_be:
- case Triple::armeb:
- case Triple::bpfeb:
- case Triple::mips64:
- case Triple::mips:
- case Triple::ppc64:
- case Triple::ppc:
- case Triple::sparc:
- case Triple::sparcv9:
- case Triple::systemz:
- case Triple::tce:
- case Triple::thumbeb:
- // Already big endian.
- break;
-
case Triple::aarch64: T.setArch(Triple::aarch64_be); break;
case Triple::bpfel: T.setArch(Triple::bpfeb); break;
case Triple::mips64el:T.setArch(Triple::mips64); break;
case Triple::mipsel: T.setArch(Triple::mips); break;
case Triple::ppc64le: T.setArch(Triple::ppc64); break;
case Triple::sparcel: T.setArch(Triple::sparc); break;
+ default:
+ llvm_unreachable("getBigEndianArchVariant: unknown triple.");
}
return T;
}
Triple Triple::getLittleEndianArchVariant() const {
Triple T(*this);
+ if (isLittleEndian())
+ return T;
+
switch (getArch()) {
case Triple::UnknownArch:
+ case Triple::lanai:
case Triple::ppc:
case Triple::sparcv9:
case Triple::systemz:
@@ -1337,6 +1372,20 @@ Triple Triple::getLittleEndianArchVariant() const {
T.setArch(UnknownArch);
break;
+ case Triple::aarch64_be: T.setArch(Triple::aarch64); break;
+ case Triple::bpfeb: T.setArch(Triple::bpfel); break;
+ case Triple::mips64: T.setArch(Triple::mips64el); break;
+ case Triple::mips: T.setArch(Triple::mipsel); break;
+ case Triple::ppc64: T.setArch(Triple::ppc64le); break;
+ case Triple::sparc: T.setArch(Triple::sparcel); break;
+ default:
+ llvm_unreachable("getLittleEndianArchVariant: unknown triple.");
+ }
+ return T;
+}
+
+bool Triple::isLittleEndian() const {
+ switch (getArch()) {
case Triple::aarch64:
case Triple::amdgcn:
case Triple::amdil64:
@@ -1367,17 +1416,12 @@ Triple Triple::getLittleEndianArchVariant() const {
case Triple::x86:
case Triple::x86_64:
case Triple::xcore:
- // Already little endian.
- break;
-
- case Triple::aarch64_be: T.setArch(Triple::aarch64); break;
- case Triple::bpfeb: T.setArch(Triple::bpfel); break;
- case Triple::mips64: T.setArch(Triple::mips64el); break;
- case Triple::mips: T.setArch(Triple::mipsel); break;
- case Triple::ppc64: T.setArch(Triple::ppc64le); break;
- case Triple::sparc: T.setArch(Triple::sparcel); break;
+ case Triple::renderscript32:
+ case Triple::renderscript64:
+ return true;
+ default:
+ return false;
}
- return T;
}
StringRef Triple::getARMCPUForArch(StringRef MArch) const {
@@ -1398,6 +1442,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
case llvm::Triple::MacOSX:
case llvm::Triple::IOS:
case llvm::Triple::WatchOS:
+ case llvm::Triple::TvOS:
if (MArch == "v7k")
return "cortex-a7";
break;
@@ -1431,6 +1476,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
switch (getEnvironment()) {
case llvm::Triple::EABIHF:
case llvm::Triple::GNUEABIHF:
+ case llvm::Triple::MuslEABIHF:
return "arm1176jzf-s";
default:
return "arm7tdmi";
diff --git a/lib/Support/Twine.cpp b/lib/Support/Twine.cpp
index 020dd9596d9c..5e989fb28b4d 100644
--- a/lib/Support/Twine.cpp
+++ b/lib/Support/Twine.cpp
@@ -161,7 +161,7 @@ void Twine::printRepr(raw_ostream &OS) const {
OS << ")";
}
-void Twine::dump() const {
+LLVM_DUMP_METHOD void Twine::dump() const {
print(dbgs());
}
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index d70319168b84..f3463e581735 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -73,7 +73,7 @@ int getPosixProtectionFlags(unsigned Flags) {
return PROT_NONE;
}
-} // namespace
+} // anonymous namespace
namespace llvm {
namespace sys {
@@ -265,7 +265,7 @@ bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
}
bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
- if (M.Address == 0 || M.Size == 0) return false;
+ if (M.Address == nullptr || M.Size == 0) return false;
Memory::InvalidateInstructionCache(M.Address, M.Size);
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index d85c37ab3bfa..84aafcb70d73 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -25,6 +25,9 @@
#if HAVE_FCNTL_H
#include <fcntl.h>
#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
@@ -47,6 +50,7 @@
#ifdef __APPLE__
#include <mach-o/dyld.h>
+#include <sys/attr.h>
#endif
// Both stdio.h and cstdio are included via different paths and
@@ -60,6 +64,25 @@
# define PATH_MAX 4096
#endif
+#include <sys/types.h>
+#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__ANDROID__)
+#include <sys/statvfs.h>
+#define STATVFS statvfs
+#define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
+#else
+#ifdef __OpenBSD__
+#include <sys/param.h>
+#include <sys/mount.h>
+#elif defined(__ANDROID__)
+#include <sys/vfs.h>
+#else
+#include <sys/mount.h>
+#endif
+#define STATVFS statfs
+#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
+#endif
+
+
using namespace llvm;
namespace llvm {
@@ -70,7 +93,7 @@ namespace fs {
defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__)
static int
test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
-{
+{
struct stat sb;
char fullpath[PATH_MAX];
@@ -174,6 +197,12 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
return "";
}
+TimeValue file_status::getLastAccessedTime() const {
+ TimeValue Ret;
+ Ret.fromEpochTime(fs_st_atime);
+ return Ret;
+}
+
TimeValue file_status::getLastModificationTime() const {
TimeValue Ret;
Ret.fromEpochTime(fs_st_mtime);
@@ -184,6 +213,18 @@ UniqueID file_status::getUniqueID() const {
return UniqueID(fs_st_dev, fs_st_ino);
}
+ErrorOr<space_info> disk_space(const Twine &Path) {
+ struct STATVFS Vfs;
+ if (::STATVFS(Path.str().c_str(), &Vfs))
+ return std::error_code(errno, std::generic_category());
+ auto FrSize = STATVFS_F_FRSIZE(Vfs);
+ space_info SpaceInfo;
+ SpaceInfo.capacity = static_cast<uint64_t>(Vfs.f_blocks) * FrSize;
+ SpaceInfo.free = static_cast<uint64_t>(Vfs.f_bfree) * FrSize;
+ SpaceInfo.available = static_cast<uint64_t>(Vfs.f_bavail) * FrSize;
+ return SpaceInfo;
+}
+
std::error_code current_path(SmallVectorImpl<char> &result) {
result.clear();
@@ -373,8 +414,9 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status,
perms Perms = static_cast<perms>(Status.st_mode);
Result =
- file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_mtime,
- Status.st_uid, Status.st_gid, Status.st_size);
+ file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_atime,
+ Status.st_mtime, Status.st_uid, Status.st_gid,
+ Status.st_size);
return std::error_code();
}
@@ -506,13 +548,47 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return std::error_code();
}
-std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
+#if !defined(F_GETPATH)
+static bool hasProcSelfFD() {
+ // If we have a /proc filesystem mounted, we can quickly establish the
+ // real name of the file with readlink
+ static const bool Result = (::access("/proc/self/fd", R_OK) == 0);
+ return Result;
+}
+#endif
+
+std::error_code openFileForRead(const Twine &Name, int &ResultFD,
+ SmallVectorImpl<char> *RealPath) {
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
if (errno != EINTR)
return std::error_code(errno, std::generic_category());
}
+ // Attempt to get the real name of the file, if the user asked
+  if (!RealPath)
+ return std::error_code();
+ RealPath->clear();
+#if defined(F_GETPATH)
+  // When F_GETPATH is available, it is the quickest way to get
+ // the real path name.
+ char Buffer[MAXPATHLEN];
+ if (::fcntl(ResultFD, F_GETPATH, Buffer) != -1)
+ RealPath->append(Buffer, Buffer + strlen(Buffer));
+#else
+ char Buffer[PATH_MAX];
+ if (hasProcSelfFD()) {
+ char ProcPath[64];
+ snprintf(ProcPath, sizeof(ProcPath), "/proc/self/fd/%d", ResultFD);
+ ssize_t CharCount = ::readlink(ProcPath, Buffer, sizeof(Buffer));
+ if (CharCount > 0)
+ RealPath->append(Buffer, Buffer + CharCount);
+ } else {
+ // Use ::realpath to get the real path name
+ if (::realpath(P.begin(), Buffer) != nullptr)
+ RealPath->append(Buffer, Buffer + strlen(Buffer));
+ }
+#endif
return std::error_code();
}
@@ -546,6 +622,53 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
return std::error_code();
}
+std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
+ if (FD < 0)
+ return make_error_code(errc::bad_file_descriptor);
+
+#if defined(F_GETPATH)
+  // When F_GETPATH is available, it is the quickest way to get
+ // the path from a file descriptor.
+ ResultPath.reserve(MAXPATHLEN);
+ if (::fcntl(FD, F_GETPATH, ResultPath.begin()) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ ResultPath.set_size(strlen(ResultPath.begin()));
+#else
+ // If we have a /proc filesystem mounted, we can quickly establish the
+ // real name of the file with readlink. Otherwise, we don't know how to
+ // get the filename from a file descriptor. Give up.
+ if (!fs::hasProcSelfFD())
+ return make_error_code(errc::function_not_supported);
+
+ ResultPath.reserve(PATH_MAX);
+ char ProcPath[64];
+ snprintf(ProcPath, sizeof(ProcPath), "/proc/self/fd/%d", FD);
+ ssize_t CharCount = ::readlink(ProcPath, ResultPath.begin(), ResultPath.capacity());
+ if (CharCount < 0)
+ return std::error_code(errno, std::generic_category());
+
+ // Was the filename truncated?
+ if (static_cast<size_t>(CharCount) == ResultPath.capacity()) {
+ // Use lstat to get the size of the filename
+ struct stat sb;
+ if (::lstat(ProcPath, &sb) < 0)
+ return std::error_code(errno, std::generic_category());
+
+ ResultPath.reserve(sb.st_size + 1);
+ CharCount = ::readlink(ProcPath, ResultPath.begin(), ResultPath.capacity());
+ if (CharCount < 0)
+ return std::error_code(errno, std::generic_category());
+
+ // Test for race condition: did the link size change?
+ if (CharCount > sb.st_size)
+ return std::error_code(ENAMETOOLONG, std::generic_category());
+ }
+ ResultPath.set_size(static_cast<size_t>(CharCount));
+#endif
+ return std::error_code();
+}
+
} // end namespace fs
namespace path {
@@ -586,12 +709,12 @@ static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
}
static bool getUserCacheDir(SmallVectorImpl<char> &Result) {
- // First try using XDS_CACHE_HOME env variable,
+ // First try using XDG_CACHE_HOME env variable,
// as specified in XDG Base Directory Specification at
// http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
- if (const char *XdsCacheDir = std::getenv("XDS_CACHE_HOME")) {
+ if (const char *XdgCacheDir = std::getenv("XDG_CACHE_HOME")) {
Result.clear();
- Result.append(XdsCacheDir, XdsCacheDir + strlen(XdsCacheDir));
+ Result.append(XdgCacheDir, XdgCacheDir + strlen(XdgCacheDir));
return true;
}
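Two of the additions above are caller-visible: sys::fs::disk_space() and the optional RealPath out-parameter on openFileForRead(), with getPathFromOpenFD() as the FD-only variant (assumed to be declared alongside them in FileSystem.h). A hedged sketch with an illustrative path and no error handling:

  #include "llvm/ADT/SmallString.h"
  #include "llvm/Support/ErrorOr.h"
  #include "llvm/Support/FileSystem.h"

  void fsSketch() {
    // Space on the filesystem holding /tmp, in bytes.
    llvm::ErrorOr<llvm::sys::fs::space_info> SI = llvm::sys::fs::disk_space("/tmp");
    if (SI)
      (void)SI->available;

    // Open a file and recover its resolved name when the platform allows it
    // (F_GETPATH, /proc/self/fd, or realpath as a fallback).
    int FD = -1;
    llvm::SmallString<128> Real;
    if (!llvm::sys::fs::openFileForRead("/tmp/input.txt", FD, &Real))
      llvm::sys::fs::getPathFromOpenFD(FD, Real); // same answer, derived from the FD
    // (FD left open here; a real caller would close it.)
  }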
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 27083eeb072d..d81836b92dae 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -169,6 +169,8 @@ void Process::PreventCoreFiles() {
signal(SIGSEGV, _exit);
signal(SIGBUS, _exit);
#endif
+
+ coreFilesPrevented = true;
}
Optional<std::string> Process::GetEnv(StringRef Name) {
@@ -456,7 +458,7 @@ unsigned llvm::sys::Process::GetRandomNumber() {
#if defined(HAVE_DECL_ARC4RANDOM) && HAVE_DECL_ARC4RANDOM
return arc4random();
#else
- static int x = (::srand(GetRandomNumberSeed()), 0);
+ static int x = (static_cast<void>(::srand(GetRandomNumberSeed())), 0);
(void)x;
return ::rand();
#endif
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 061cdb3da216..117d4e8bcb52 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -45,6 +45,17 @@
#if HAVE_LINK_H
#include <link.h>
#endif
+#if HAVE_UNWIND_BACKTRACE
+// FIXME: We should be able to use <unwind.h> for any target that has an
+// _Unwind_Backtrace function, but on FreeBSD the configure test passes
+// despite the function not existing, and on Android, <unwind.h> conflicts
+// with <link.h>.
+#ifdef __GLIBC__
+#include <unwind.h>
+#else
+#undef HAVE_UNWIND_BACKTRACE
+#endif
+#endif
using namespace llvm;
@@ -57,6 +68,8 @@ static void (*InterruptFunction)() = nullptr;
static ManagedStatic<std::vector<std::string>> FilesToRemove;
+static StringRef Argv0;
+
// IntSigs - Signals that represent requested termination. There's no bug
// or failure, or if there is, it's not our direct responsibility. For whatever
// reason, our continued execution is no longer desirable.
@@ -96,7 +109,7 @@ static void RegisterHandler(int Signal) {
struct sigaction NewHandler;
NewHandler.sa_handler = SignalHandler;
- NewHandler.sa_flags = SA_NODEFER|SA_RESETHAND;
+ NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK;
sigemptyset(&NewHandler.sa_mask);
// Install the new handler, save the old one in RegisteredSignalInfo.
@@ -106,6 +119,35 @@ static void RegisterHandler(int Signal) {
++NumRegisteredSignals;
}
+#if defined(HAVE_SIGALTSTACK)
+// Hold onto the old alternate signal stack so that it's not reported as a leak.
+// We don't make any attempt to remove our alt signal stack if we remove our
+// signal handlers; that can't be done reliably if someone else is also trying
+// to do the same thing.
+static stack_t OldAltStack;
+
+static void CreateSigAltStack() {
+ const size_t AltStackSize = MINSIGSTKSZ + 8192;
+
+ // If we're executing on the alternate stack, or we already have an alternate
+ // signal stack that we're happy with, there's nothing for us to do. Don't
+ // reduce the size, some other part of the process might need a larger stack
+ // than we do.
+ if (sigaltstack(nullptr, &OldAltStack) != 0 ||
+ OldAltStack.ss_flags & SS_ONSTACK ||
+ (OldAltStack.ss_sp && OldAltStack.ss_size >= AltStackSize))
+ return;
+
+ stack_t AltStack = {};
+ AltStack.ss_sp = reinterpret_cast<char *>(malloc(AltStackSize));
+ AltStack.ss_size = AltStackSize;
+ if (sigaltstack(&AltStack, &OldAltStack) != 0)
+ free(AltStack.ss_sp);
+}
+#else
+static void CreateSigAltStack() {}
+#endif
+
static void RegisterHandlers() {
// We need to dereference the signals mutex during handler registration so
// that we force its construction. This is to prevent the first use being
@@ -116,6 +158,10 @@ static void RegisterHandlers() {
// If the handlers are already registered, we're done.
if (NumRegisteredSignals != 0) return;
+ // Create an alternate stack for signal handling. This is necessary for us to
+ // be able to reliably handle signals due to stack overflow.
+ CreateSigAltStack();
+
for (auto S : IntSigs) RegisterHandler(S);
for (auto S : KillSigs) RegisterHandler(S);
}
@@ -309,18 +355,62 @@ static bool findModulesAndOffsets(void **StackTrace, int Depth,
}
#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) && ...
+#if defined(ENABLE_BACKTRACES) && defined(HAVE_UNWIND_BACKTRACE)
+static int unwindBacktrace(void **StackTrace, int MaxEntries) {
+ if (MaxEntries < 0)
+ return 0;
+
+ // Skip the first frame ('unwindBacktrace' itself).
+ int Entries = -1;
+
+ auto HandleFrame = [&](_Unwind_Context *Context) -> _Unwind_Reason_Code {
+ // Apparently we need to detect reaching the end of the stack ourselves.
+ void *IP = (void *)_Unwind_GetIP(Context);
+ if (!IP)
+ return _URC_END_OF_STACK;
+
+ assert(Entries < MaxEntries && "recursively called after END_OF_STACK?");
+ if (Entries >= 0)
+ StackTrace[Entries] = IP;
+
+ if (++Entries == MaxEntries)
+ return _URC_END_OF_STACK;
+ return _URC_NO_REASON;
+ };
+
+ _Unwind_Backtrace(
+ [](_Unwind_Context *Context, void *Handler) {
+ return (*static_cast<decltype(HandleFrame) *>(Handler))(Context);
+ },
+ static_cast<void *>(&HandleFrame));
+ return std::max(Entries, 0);
+}
+#endif
+
// PrintStackTrace - In the case of a program crash or fault, print out a stack
// trace so that the user has an indication of why and where we died.
//
// On glibc systems we have the 'backtrace' function, which works nicely, but
// doesn't demangle symbols.
void llvm::sys::PrintStackTrace(raw_ostream &OS) {
-#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
- static void* StackTrace[256];
+#if defined(ENABLE_BACKTRACES)
+ static void *StackTrace[256];
+ int depth = 0;
+#if defined(HAVE_BACKTRACE)
// Use backtrace() to output a backtrace on Linux systems with glibc.
- int depth = backtrace(StackTrace,
+ if (!depth)
+ depth = backtrace(StackTrace, static_cast<int>(array_lengthof(StackTrace)));
+#endif
+#if defined(HAVE_UNWIND_BACKTRACE)
+ // Try _Unwind_Backtrace() if backtrace() failed.
+ if (!depth)
+ depth = unwindBacktrace(StackTrace,
static_cast<int>(array_lengthof(StackTrace)));
- if (printSymbolizedStackTrace(StackTrace, depth, OS))
+#endif
+ if (!depth)
+ return;
+
+ if (printSymbolizedStackTrace(Argv0, StackTrace, depth, OS))
return;
#if HAVE_DLFCN_H && __GNUG__
int width = 0;
@@ -369,7 +459,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
}
OS << '\n';
}
-#else
+#elif defined(HAVE_BACKTRACE)
backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO);
#endif
#endif
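
unwindBacktrace() above shows a common pattern for C callback APIs: _Unwind_Backtrace() accepts only a plain function pointer plus a void* cookie, so the capturing lambda that does the work is smuggled through the cookie and invoked by a capture-less trampoline lambda, and a null instruction pointer is treated as end-of-stack. A stripped-down sketch of the same pattern, assuming a toolchain that ships <unwind.h>; the buffer size is illustrative:

// Sketch of the capture-less-lambda trampoline used with _Unwind_Backtrace().
#include <unwind.h>
#include <cstdio>

static int collectFrames(void **Out, int Max) {
  int Count = 0;
  // The capturing lambda does the real work...
  auto OnFrame = [&](_Unwind_Context *Ctx) -> _Unwind_Reason_Code {
    void *IP = reinterpret_cast<void *>(_Unwind_GetIP(Ctx));
    if (!IP || Count >= Max)
      return _URC_END_OF_STACK;
    Out[Count++] = IP;
    return _URC_NO_REASON;
  };
  // ...while a capture-less lambda (convertible to a C function pointer)
  // forwards each frame through the void* cookie.
  _Unwind_Backtrace(
      [](_Unwind_Context *Ctx, void *Cookie) {
        return (*static_cast<decltype(OnFrame) *>(Cookie))(Ctx);
      },
      &OnFrame);
  return Count;
}

int main() {
  void *Frames[64];
  int N = collectFrames(Frames, 64);
  for (int I = 0; I < N; ++I)
    std::printf("#%d %p\n", I, Frames[I]);
  return 0;
}
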
@@ -383,7 +473,10 @@ void llvm::sys::DisableSystemDialogsOnCrash() {}
/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
-void llvm::sys::PrintStackTraceOnErrorSignal(bool DisableCrashReporting) {
+void llvm::sys::PrintStackTraceOnErrorSignal(StringRef Argv0,
+ bool DisableCrashReporting) {
+ ::Argv0 = Argv0;
+
AddSignalHandler(PrintStackTraceSignalHandler, nullptr);
#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
@@ -402,42 +495,3 @@ void llvm::sys::PrintStackTraceOnErrorSignal(bool DisableCrashReporting) {
}
#endif
}
-
-
-/***/
-
-// On Darwin, raise sends a signal to the main thread instead of the current
-// thread. This has the unfortunate effect that assert() and abort() will end up
-// bypassing our crash recovery attempts. We work around this for anything in
-// the same linkage unit by just defining our own versions of the assert handler
-// and abort.
-
-#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
-
-#include <signal.h>
-#include <pthread.h>
-
-int raise(int sig) {
- return pthread_kill(pthread_self(), sig);
-}
-
-void __assert_rtn(const char *func,
- const char *file,
- int line,
- const char *expr) {
- if (func)
- fprintf(stderr, "Assertion failed: (%s), function %s, file %s, line %d.\n",
- expr, func, file, line);
- else
- fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n",
- expr, file, line);
- abort();
-}
-
-void abort() {
- raise(SIGABRT);
- usleep(1000);
- __builtin_trap();
-}
-
-#endif
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 17418b015c75..050689483deb 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -45,7 +45,7 @@ static bool loadDebugHelp(void) {
}
static BOOL CALLBACK
-ELM_Callback(WIN32_ELMCB_PCSTR ModuleName, DWORD64 ModuleBase,
+ELM_Callback(PCSTR ModuleName, DWORD64 ModuleBase,
ULONG ModuleSize, PVOID UserContext) {
OpenedHandles->insert((HMODULE)ModuleBase);
return TRUE;
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 5ef77b150ef0..fab6aeca54a9 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -151,6 +151,29 @@ UniqueID file_status::getUniqueID() const {
return UniqueID(VolumeSerialNumber, FileID);
}
+ErrorOr<space_info> disk_space(const Twine &Path) {
+ ULARGE_INTEGER Avail, Total, Free;
+ if (!::GetDiskFreeSpaceExA(Path.str().c_str(), &Avail, &Total, &Free))
+ return mapWindowsError(::GetLastError());
+ space_info SpaceInfo;
+ SpaceInfo.capacity =
+ (static_cast<uint64_t>(Total.HighPart) << 32) + Total.LowPart;
+ SpaceInfo.free = (static_cast<uint64_t>(Free.HighPart) << 32) + Free.LowPart;
+ SpaceInfo.available =
+ (static_cast<uint64_t>(Avail.HighPart) << 32) + Avail.LowPart;
+ return SpaceInfo;
+}
+
+TimeValue file_status::getLastAccessedTime() const {
+ ULARGE_INTEGER UI;
+ UI.LowPart = LastAccessedTimeLow;
+ UI.HighPart = LastAccessedTimeHigh;
+
+ TimeValue Ret;
+ Ret.fromWin32Time(UI.QuadPart);
+ return Ret;
+}
+
TimeValue file_status::getLastModificationTime() const {
ULARGE_INTEGER UI;
UI.LowPart = LastWriteTimeLow;
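
Both disk_space() and the new getLastAccessedTime() reassemble 64-bit values from the 32-bit halves Win32 hands back; the first shifts and combines explicitly, the second lets ULARGE_INTEGER do the layout. A small sketch showing that the two approaches agree (the high/low values are made up):

// Sketch: composing a 64-bit value from Win32 high/low 32-bit parts.
#include <windows.h>
#include <cstdint>
#include <cstdio>

int main() {
  DWORD High = 0x00000001, Low = 0x80000000; // illustrative halves

  // Option 1: explicit shift-and-or, as disk_space() does.
  uint64_t Combined1 = (static_cast<uint64_t>(High) << 32) | Low;

  // Option 2: let ULARGE_INTEGER do the layout, as getLastAccessedTime() does.
  ULARGE_INTEGER UI;
  UI.HighPart = High;
  UI.LowPart = Low;
  uint64_t Combined2 = UI.QuadPart;

  std::printf("%llu %llu\n", (unsigned long long)Combined1,
              (unsigned long long)Combined2);
  return Combined1 == Combined2 ? 0 : 1;
}
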
@@ -255,24 +278,43 @@ std::error_code rename(const Twine &from, const Twine &to) {
std::error_code ec = std::error_code();
- // Retry while we see ERROR_ACCESS_DENIED.
+ // Retry while we see recoverable errors.
// System scanners (e.g. an indexer) might open the source file when it is written
// and closed.
- for (int i = 0; i < 2000; i++) {
- // Try ReplaceFile first, as it is able to associate a new data stream with
- // the destination even if the destination file is currently open.
- if (::ReplaceFileW(wide_to.begin(), wide_from.begin(), NULL, 0, NULL, NULL))
- return std::error_code();
+ bool TryReplace = true;
- // We get ERROR_FILE_NOT_FOUND if the destination file is missing.
- // MoveFileEx can handle this case.
- DWORD ReplaceError = ::GetLastError();
- ec = mapWindowsError(ReplaceError);
- if (ReplaceError != ERROR_ACCESS_DENIED &&
- ReplaceError != ERROR_FILE_NOT_FOUND &&
- ReplaceError != ERROR_SHARING_VIOLATION)
- break;
+ for (int i = 0; i < 2000; i++) {
+ if (i > 0)
+ ::Sleep(1);
+
+ if (TryReplace) {
+ // Try ReplaceFile first, as it is able to associate a new data stream
+ // with the destination even if the destination file is currently open.
+ if (::ReplaceFileW(wide_to.data(), wide_from.data(), NULL, 0, NULL, NULL))
+ return std::error_code();
+
+ DWORD ReplaceError = ::GetLastError();
+ ec = mapWindowsError(ReplaceError);
+
+ // If ReplaceFileW returned ERROR_UNABLE_TO_MOVE_REPLACEMENT or
+ // ERROR_UNABLE_TO_MOVE_REPLACEMENT_2, retry but only use MoveFileExW().
+ if (ReplaceError == ERROR_UNABLE_TO_MOVE_REPLACEMENT ||
+ ReplaceError == ERROR_UNABLE_TO_MOVE_REPLACEMENT_2) {
+ TryReplace = false;
+ continue;
+ }
+ // If ReplaceFileW returned ERROR_UNABLE_TO_REMOVE_REPLACED, retry
+ // using ReplaceFileW().
+ if (ReplaceError == ERROR_UNABLE_TO_REMOVE_REPLACED)
+ continue;
+ // We get ERROR_FILE_NOT_FOUND if the destination file is missing.
+ // MoveFileEx can handle this case.
+ if (ReplaceError != ERROR_ACCESS_DENIED &&
+ ReplaceError != ERROR_FILE_NOT_FOUND &&
+ ReplaceError != ERROR_SHARING_VIOLATION)
+ break;
+ }
if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
@@ -281,8 +323,6 @@ std::error_code rename(const Twine &from, const Twine &to) {
DWORD MoveError = ::GetLastError();
ec = mapWindowsError(MoveError);
if (MoveError != ERROR_ACCESS_DENIED) break;
-
- ::Sleep(1);
}
return ec;
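
The reworked rename() is a bounded retry: at most 2000 attempts with a 1 ms Sleep before every retry (roughly two seconds total), falling back from ReplaceFileW to MoveFileExW when ReplaceFileW reports it cannot be used. A generic, hedged sketch of that shape; classify(), the error set, and the file names are placeholders, not LLVM code:

// Generic sketch of a bounded retry loop for transient Windows file errors.
#include <windows.h>
#include <system_error>

enum class Transient { Yes, No };

static Transient classify(DWORD Err) {
  // Placeholder policy: sharing/access errors are worth retrying.
  return (Err == ERROR_ACCESS_DENIED || Err == ERROR_SHARING_VIOLATION)
             ? Transient::Yes
             : Transient::No;
}

template <typename Op> static std::error_code retryForAboutTwoSeconds(Op Try) {
  std::error_code EC;
  for (int I = 0; I < 2000; ++I) {
    if (I > 0)
      ::Sleep(1); // back off before every retry, but not the first attempt
    if (Try())
      return std::error_code();
    DWORD Err = ::GetLastError();
    EC = std::error_code(Err, std::system_category());
    if (classify(Err) == Transient::No)
      break;
  }
  return EC;
}

int main() {
  // Placeholder file names; MoveFileExW returns nonzero on success.
  std::error_code EC = retryForAboutTwoSeconds([] {
    return ::MoveFileExW(L"from.tmp", L"to.tmp",
                         MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) != 0;
  });
  return EC ? 1 : 0;
}
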
@@ -327,13 +367,15 @@ bool can_execute(const Twine &Path) {
bool equivalent(file_status A, file_status B) {
assert(status_known(A) && status_known(B));
- return A.FileIndexHigh == B.FileIndexHigh &&
- A.FileIndexLow == B.FileIndexLow &&
- A.FileSizeHigh == B.FileSizeHigh &&
- A.FileSizeLow == B.FileSizeLow &&
- A.LastWriteTimeHigh == B.LastWriteTimeHigh &&
- A.LastWriteTimeLow == B.LastWriteTimeLow &&
- A.VolumeSerialNumber == B.VolumeSerialNumber;
+ return A.FileIndexHigh == B.FileIndexHigh &&
+ A.FileIndexLow == B.FileIndexLow &&
+ A.FileSizeHigh == B.FileSizeHigh &&
+ A.FileSizeLow == B.FileSizeLow &&
+ A.LastAccessedTimeHigh == B.LastAccessedTimeHigh &&
+ A.LastAccessedTimeLow == B.LastAccessedTimeLow &&
+ A.LastWriteTimeHigh == B.LastWriteTimeHigh &&
+ A.LastWriteTimeLow == B.LastWriteTimeLow &&
+ A.VolumeSerialNumber == B.VolumeSerialNumber;
}
std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
@@ -361,7 +403,7 @@ static bool isReservedName(StringRef path) {
if (path.startswith("\\\\.\\"))
return true;
- // Then compare against the list of ancient reserved names
+ // Then compare against the list of ancient reserved names.
for (size_t i = 0; i < array_lengthof(sReservedNames); ++i) {
if (path.equals_lower(sReservedNames[i]))
return true;
@@ -404,7 +446,9 @@ static std::error_code getStatus(HANDLE FileHandle, file_status &Result) {
? file_type::directory_file
: file_type::regular_file;
Result =
- file_status(Type, Info.ftLastWriteTime.dwHighDateTime,
+ file_status(Type, Info.ftLastAccessTime.dwHighDateTime,
+ Info.ftLastAccessTime.dwLowDateTime,
+ Info.ftLastWriteTime.dwHighDateTime,
Info.ftLastWriteTime.dwLowDateTime,
Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
@@ -663,7 +707,8 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return std::error_code();
}
-std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
+std::error_code openFileForRead(const Twine &Name, int &ResultFD,
+ SmallVectorImpl<char> *RealPath) {
SmallVector<wchar_t, 128> PathUTF16;
if (std::error_code EC = widenPath(Name, PathUTF16))
@@ -692,6 +737,22 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
return mapWindowsError(ERROR_INVALID_HANDLE);
}
+  // Fetch the real name of the file, if the user asked for it.
+ if (RealPath) {
+ RealPath->clear();
+ wchar_t RealPathUTF16[MAX_PATH];
+ DWORD CountChars =
+ ::GetFinalPathNameByHandleW(H, RealPathUTF16, MAX_PATH,
+ FILE_NAME_NORMALIZED);
+ if (CountChars > 0 && CountChars < MAX_PATH) {
+ // Convert the result from UTF-16 to UTF-8.
+ SmallString<MAX_PATH> RealPathUTF8;
+ if (!UTF16ToUTF8(RealPathUTF16, CountChars, RealPathUTF8))
+ RealPath->append(RealPathUTF8.data(),
+ RealPathUTF8.data() + strlen(RealPathUTF8.data()));
+ }
+ }
+
ResultFD = FD;
return std::error_code();
}
@@ -752,6 +813,42 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
ResultFD = FD;
return std::error_code();
}
+
+std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
+ HANDLE FileHandle = reinterpret_cast<HANDLE>(::_get_osfhandle(FD));
+ if (FileHandle == INVALID_HANDLE_VALUE)
+ return make_error_code(errc::bad_file_descriptor);
+
+ DWORD CharCount;
+ SmallVector<wchar_t, 1024> TempPath;
+ do {
+ CharCount = ::GetFinalPathNameByHandleW(FileHandle, TempPath.begin(),
+ TempPath.capacity(),
+ FILE_NAME_NORMALIZED);
+ if (CharCount < TempPath.capacity())
+ break;
+
+ // Reserve sufficient space for the path as well as the null character. Even
+ // though the API does not document that it is required, if we reserve just
+ // CharCount space, the function call will not store the resulting path and
+ // still report success.
+ TempPath.reserve(CharCount + 1);
+ } while (true);
+
+ if (CharCount == 0)
+ return mapWindowsError(::GetLastError());
+
+ TempPath.set_size(CharCount);
+
+ // On earlier Windows releases, the character count includes the terminating
+ // null.
+ if (TempPath.back() == L'\0') {
+ --CharCount;
+ TempPath.pop_back();
+ }
+
+ return windows::UTF16ToUTF8(TempPath.data(), CharCount, ResultPath);
+}
} // end namespace fs
namespace path {
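
getPathFromOpenFD() above uses the usual Win32 sizing loop around GetFinalPathNameByHandleW(): call, and if the return value reports the buffer was too small, grow by the reported size plus one for the terminator and call again, then strip the terminating null that older Windows releases include in the count (openFileForRead() takes the simpler fixed MAX_PATH route). A hedged standalone sketch of the same loop using std::vector instead of SmallVector:

// Sketch: sizing loop around GetFinalPathNameByHandleW(). Error handling is
// minimal; the handle comes from wherever the caller obtained it.
#include <windows.h>
#include <string>
#include <vector>

static bool finalPathForHandle(HANDLE H, std::wstring &Out) {
  std::vector<wchar_t> Buf(MAX_PATH);
  for (;;) {
    DWORD N = ::GetFinalPathNameByHandleW(H, Buf.data(),
                                          static_cast<DWORD>(Buf.size()),
                                          FILE_NAME_NORMALIZED);
    if (N == 0)
      return false;               // hard failure; details via GetLastError()
    if (N < Buf.size()) {         // success: N characters were written
      // Some Windows releases count the terminating null; drop it if present.
      if (N && Buf[N - 1] == L'\0')
        --N;
      Out.assign(Buf.data(), N);
      return true;
    }
    Buf.resize(N + 1);            // too small: N is the required size
  }
}
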
@@ -886,6 +983,7 @@ std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
llvm::SmallVectorImpl<char> &utf8) {
return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
}
+
} // end namespace windows
} // end namespace sys
} // end namespace llvm
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index dae35a88132b..b012991c2a54 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -123,6 +123,8 @@ void Process::PreventCoreFiles() {
SetErrorMode(SEM_FAILCRITICALERRORS |
SEM_NOGPFAULTERRORBOX |
SEM_NOOPENFILEERRORBOX);
+
+ coreFilesPrevented = true;
}
/// Returns the environment variable \arg Name's value as a string encoded in
@@ -199,6 +201,9 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
const int DirSize = Dir.size();
// Search for matching files.
+ // FIXME: This assumes the wildcard is only in the file name and not in the
+ // directory portion of the file path. For example, it doesn't handle
+ // "*\foo.c" nor "s?c\bar.cpp".
WIN32_FIND_DATAW FileData;
HANDLE FindHandle = FindFirstFileW(Arg, &FileData);
if (FindHandle == INVALID_HANDLE_VALUE) {
@@ -213,7 +218,7 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
if (ec)
break;
- // Push the filename onto Dir, and remove it afterwards.
+ // Append FileName to Dir, and remove it afterwards.
llvm::sys::path::append(Dir, StringRef(FileName.data(), FileName.size()));
AllocateAndPush(Dir, Args, Allocator);
Dir.resize(DirSize);
@@ -223,6 +228,23 @@ WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
return ec;
}
+static std::error_code
+ExpandShortFileName(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
+ SpecificBumpPtrAllocator<char> &Allocator) {
+ SmallVector<wchar_t, MAX_PATH> LongPath;
+ DWORD Length = GetLongPathNameW(Arg, LongPath.data(), LongPath.capacity());
+ if (Length == 0)
+ return mapWindowsError(GetLastError());
+ if (Length > LongPath.capacity()) {
+ // We're not going to try to deal with paths longer than MAX_PATH, so we'll
+ // treat this as an error. GetLastError() returns ERROR_SUCCESS, which
+ // isn't useful, so we'll hardcode an appropriate error value.
+ return mapWindowsError(ERROR_INSUFFICIENT_BUFFER);
+ }
+ LongPath.set_size(Length);
+ return ConvertAndPushArg(LongPath.data(), Args, Allocator);
+}
+
std::error_code
Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
ArrayRef<const char *>,
@@ -236,7 +258,19 @@ Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
Args.reserve(ArgCount);
std::error_code ec;
- for (int i = 0; i < ArgCount; ++i) {
+ // The first argument may contain just the name of the executable (e.g.,
+ // "clang") rather than the full path, so swap it with the full path.
+ wchar_t ModuleName[MAX_PATH];
+ int Length = ::GetModuleFileNameW(NULL, ModuleName, MAX_PATH);
+ if (0 < Length && Length < MAX_PATH)
+ UnicodeCommandLine[0] = ModuleName;
+
+ // If the first argument is a shortened (8.3) name (which is possible even
+ // if we got the module name), the driver will have trouble distinguishing it
+ // (e.g., clang.exe v. clang++.exe), so expand it now.
+ ec = ExpandShortFileName(UnicodeCommandLine[0], Args, ArgAllocator);
+
+ for (int i = 1; i < ArgCount && !ec; ++i) {
ec = WildcardExpand(UnicodeCommandLine[i], Args, ArgAllocator);
if (ec)
break;
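
The argv fix-ups above do two things before wildcard expansion: replace argv[0] with the full module path from GetModuleFileNameW(), then expand any 8.3 short name through GetLongPathNameW() so the driver can distinguish clang.exe from clang++.exe. A hedged sketch of just those two calls, accepting the same MAX_PATH limit the patch does:

// Sketch: expanding an 8.3 short path (e.g. "CLANG~1.EXE") to its long form.
#include <windows.h>
#include <cwchar>

static bool expandShortName(const wchar_t *Short, wchar_t *Long, DWORD Cap) {
  DWORD Len = ::GetLongPathNameW(Short, Long, Cap);
  // 0 means failure; a result >= Cap means the buffer was too small.
  return Len != 0 && Len < Cap;
}

int main() {
  // Full path of the running image; more reliable than whatever argv[0] holds.
  wchar_t Module[MAX_PATH];
  if (::GetModuleFileNameW(nullptr, Module, MAX_PATH) == 0)
    return 1;

  wchar_t Long[MAX_PATH];
  if (expandShortName(Module, Long, MAX_PATH))
    std::wprintf(L"%ls\n", Long);
  return 0;
}
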
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index f40ca72996a1..1e2fa4210df6 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -11,7 +11,11 @@
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/WindowsError.h"
#include <algorithm>
+#include <io.h>
#include <signal.h>
#include <stdio.h>
@@ -117,6 +121,12 @@ typedef DWORD64 (__stdcall *PGET_MODULE_BASE_ROUTINE64)(HANDLE hProcess,
typedef DWORD64 (__stdcall *PTRANSLATE_ADDRESS_ROUTINE64)(HANDLE hProcess,
HANDLE hThread, LPADDRESS64 lpaddr);
+typedef BOOL(WINAPI *fpMiniDumpWriteDump)(HANDLE, DWORD, HANDLE, MINIDUMP_TYPE,
+ PMINIDUMP_EXCEPTION_INFORMATION,
+ PMINIDUMP_USER_STREAM_INFORMATION,
+ PMINIDUMP_CALLBACK_INFORMATION);
+static fpMiniDumpWriteDump fMiniDumpWriteDump;
+
typedef BOOL (WINAPI *fpStackWalk64)(DWORD, HANDLE, HANDLE, LPSTACKFRAME64,
PVOID, PREAD_PROCESS_MEMORY_ROUTINE64,
PFUNCTION_TABLE_ACCESS_ROUTINE64,
@@ -154,6 +164,8 @@ static fpEnumerateLoadedModules fEnumerateLoadedModules;
static bool load64BitDebugHelp(void) {
HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
if (hLib) {
+ fMiniDumpWriteDump = (fpMiniDumpWriteDump)
+ ::GetProcAddress(hLib, "MiniDumpWriteDump");
fStackWalk64 = (fpStackWalk64)
::GetProcAddress(hLib, "StackWalk64");
fSymGetModuleBase64 = (fpSymGetModuleBase64)
@@ -171,7 +183,7 @@ static bool load64BitDebugHelp(void) {
fEnumerateLoadedModules = (fpEnumerateLoadedModules)
::GetProcAddress(hLib, "EnumerateLoadedModules64");
}
- return fStackWalk64 && fSymInitialize && fSymSetOptions;
+ return fStackWalk64 && fSymInitialize && fSymSetOptions && fMiniDumpWriteDump;
}
using namespace llvm;
@@ -194,6 +206,8 @@ static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL;
static CRITICAL_SECTION CriticalSection;
static bool CriticalSectionInitialized = false;
+static StringRef Argv0;
+
enum {
#if defined(_M_X64)
NativeMachineType = IMAGE_FILE_MACHINE_AMD64
@@ -228,7 +242,7 @@ static bool printStackTraceWithLLVMSymbolizer(llvm::raw_ostream &OS,
break;
}
- return printSymbolizedStackTrace(&StackTrace[0], Depth, OS);
+ return printSymbolizedStackTrace(Argv0, &StackTrace[0], Depth, OS);
}
namespace {
@@ -241,7 +255,7 @@ struct FindModuleData {
};
}
-static BOOL CALLBACK findModuleCallback(WIN32_ELMCB_PCSTR ModuleName,
+static BOOL CALLBACK findModuleCallback(PCSTR ModuleName,
DWORD64 ModuleBase, ULONG ModuleSize,
void *VoidData) {
FindModuleData *Data = (FindModuleData*)VoidData;
@@ -484,7 +498,13 @@ void sys::DisableSystemDialogsOnCrash() {
/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
-void sys::PrintStackTraceOnErrorSignal(bool DisableCrashReporting) {
+void sys::PrintStackTraceOnErrorSignal(StringRef Argv0,
+ bool DisableCrashReporting) {
+ ::Argv0 = Argv0;
+
+ if (DisableCrashReporting || getenv("LLVM_DISABLE_CRASH_REPORT"))
+ Process::PreventCoreFiles();
+
DisableSystemDialogsOnCrash();
RegisterHandler();
LeaveCriticalSection(&CriticalSection);
@@ -563,9 +583,209 @@ void llvm::sys::RunInterruptHandlers() {
Cleanup();
}
+/// \brief Find the Windows Registry Key for a given location.
+///
+/// \returns a valid HKEY if the location exists, else NULL.
+static HKEY FindWERKey(const llvm::Twine &RegistryLocation) {
+ HKEY Key;
+ if (ERROR_SUCCESS != ::RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+ RegistryLocation.str().c_str(), 0,
+ KEY_QUERY_VALUE | KEY_READ, &Key))
+ return NULL;
+
+ return Key;
+}
+
+/// \brief Populate ResultDirectory with the value for "DumpFolder" for a given
+/// Windows Registry key.
+///
+/// \returns true if a valid value for DumpFolder exists, false otherwise.
+static bool GetDumpFolder(HKEY Key,
+ llvm::SmallVectorImpl<char> &ResultDirectory) {
+ using llvm::sys::windows::UTF16ToUTF8;
+
+ if (!Key)
+ return false;
+
+ DWORD BufferLengthBytes = 0;
+
+ if (ERROR_SUCCESS != ::RegGetValueW(Key, 0, L"DumpFolder", REG_EXPAND_SZ,
+ NULL, NULL, &BufferLengthBytes))
+ return false;
+
+ SmallVector<wchar_t, MAX_PATH> Buffer(BufferLengthBytes);
+
+ if (ERROR_SUCCESS != ::RegGetValueW(Key, 0, L"DumpFolder", REG_EXPAND_SZ,
+ NULL, Buffer.data(), &BufferLengthBytes))
+ return false;
+
+ DWORD ExpandBufferSize = ::ExpandEnvironmentStringsW(Buffer.data(), NULL, 0);
+
+ if (!ExpandBufferSize)
+ return false;
+
+ SmallVector<wchar_t, MAX_PATH> ExpandBuffer(ExpandBufferSize);
+
+ if (ExpandBufferSize != ::ExpandEnvironmentStringsW(Buffer.data(),
+ ExpandBuffer.data(),
+ ExpandBufferSize))
+ return false;
+
+ if (UTF16ToUTF8(ExpandBuffer.data(), ExpandBufferSize - 1, ResultDirectory))
+ return false;
+
+ return true;
+}
+
+/// \brief Populate ResultType with a valid MINIDUMP_TYPE based on the value of
+/// "DumpType" for a given Windows Registry key.
+///
+/// According to
+/// https://msdn.microsoft.com/en-us/library/windows/desktop/bb787181(v=vs.85).aspx
+/// valid values for DumpType are:
+/// * 0: Custom dump
+/// * 1: Mini dump
+/// * 2: Full dump
+/// If "Custom dump" is specified then the "CustomDumpFlags" field is read
+/// containing a bitwise combination of MINIDUMP_TYPE values.
+///
+/// \returns true if a valid value for ResultType can be set, false otherwise.
+static bool GetDumpType(HKEY Key, MINIDUMP_TYPE &ResultType) {
+ if (!Key)
+ return false;
+
+ DWORD DumpType;
+ DWORD TypeSize = sizeof(DumpType);
+ if (ERROR_SUCCESS != ::RegGetValueW(Key, NULL, L"DumpType", RRF_RT_REG_DWORD,
+ NULL, &DumpType,
+ &TypeSize))
+ return false;
+
+ switch (DumpType) {
+ case 0: {
+ DWORD Flags = 0;
+ if (ERROR_SUCCESS != ::RegGetValueW(Key, NULL, L"CustomDumpFlags",
+ RRF_RT_REG_DWORD, NULL, &Flags,
+ &TypeSize))
+ return false;
+
+ ResultType = static_cast<MINIDUMP_TYPE>(Flags);
+ break;
+ }
+ case 1:
+ ResultType = MiniDumpNormal;
+ break;
+ case 2:
+ ResultType = MiniDumpWithFullMemory;
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
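
GetDumpFolder() and GetDumpType() follow the standard RegGetValueW pattern: query once to learn the size of a string value, query again into a buffer of that size, and push REG_EXPAND_SZ data through ExpandEnvironmentStringsW(); DWORD values are read in a single call. A hedged standalone sketch of both reads against the LocalDumps key used above (value handling is simplified; the program merely probes the key):

// Sketch: reading a REG_EXPAND_SZ string and a DWORD from the registry.
// Link against Advapi32.
#include <windows.h>
#include <string>
#include <vector>

static bool readExpandString(HKEY Key, const wchar_t *Name, std::wstring &Out) {
  DWORD Bytes = 0;
  // First call: learn the size. RRF_NOEXPAND is required with REG_EXPAND_SZ.
  if (::RegGetValueW(Key, nullptr, Name, RRF_RT_REG_EXPAND_SZ | RRF_NOEXPAND,
                     nullptr, nullptr, &Bytes) != ERROR_SUCCESS)
    return false;
  std::vector<wchar_t> Raw(Bytes / sizeof(wchar_t) + 1);
  if (::RegGetValueW(Key, nullptr, Name, RRF_RT_REG_EXPAND_SZ | RRF_NOEXPAND,
                     nullptr, Raw.data(), &Bytes) != ERROR_SUCCESS)
    return false;
  // Expand %VARIABLES% ourselves, as GetDumpFolder() does.
  DWORD Need = ::ExpandEnvironmentStringsW(Raw.data(), nullptr, 0);
  if (Need == 0)
    return false;
  std::vector<wchar_t> Expanded(Need);
  if (::ExpandEnvironmentStringsW(Raw.data(), Expanded.data(), Need) == 0)
    return false;
  Out.assign(Expanded.data());
  return true;
}

static bool readDword(HKEY Key, const wchar_t *Name, DWORD &Out) {
  DWORD Size = sizeof(Out);
  return ::RegGetValueW(Key, nullptr, Name, RRF_RT_REG_DWORD, nullptr, &Out,
                        &Size) == ERROR_SUCCESS;
}

int main() {
  HKEY Key;
  if (::RegOpenKeyExW(HKEY_LOCAL_MACHINE,
                      L"SOFTWARE\\Microsoft\\Windows\\"
                      L"Windows Error Reporting\\LocalDumps",
                      0, KEY_QUERY_VALUE | KEY_READ, &Key) != ERROR_SUCCESS)
    return 1;
  std::wstring Folder;
  DWORD Type = 0;
  bool HaveFolder = readExpandString(Key, L"DumpFolder", Folder);
  bool HaveType = readDword(Key, L"DumpType", Type);
  ::RegCloseKey(Key);
  return (HaveFolder || HaveType) ? 0 : 1;
}
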
+/// \brief Write a Windows dump file containing process information that can be
+/// used for post-mortem debugging.
+///
+/// \returns zero error code if a mini dump was created, actual error code
+/// otherwise.
+static std::error_code WINAPI
+WriteWindowsDumpFile(PMINIDUMP_EXCEPTION_INFORMATION ExceptionInfo) {
+ using namespace llvm;
+ using namespace llvm::sys;
+
+ std::string MainExecutableName = fs::getMainExecutable(nullptr, nullptr);
+ StringRef ProgramName;
+
+ if (MainExecutableName.empty()) {
+ // If we can't get the executable filename,
+ // things are in worse shape than we realize
+ // and we should just bail out.
+ return mapWindowsError(::GetLastError());
+ }
+
+ ProgramName = path::filename(MainExecutableName.c_str());
+
+ // The Windows Registry location as specified at
+ // https://msdn.microsoft.com/en-us/library/windows/desktop/bb787181%28v=vs.85%29.aspx
+ // "Collecting User-Mode Dumps" that may optionally be set to collect crash
+ // dumps in a specified location.
+ StringRef LocalDumpsRegistryLocation =
+ "SOFTWARE\\Microsoft\\Windows\\Windows Error Reporting\\LocalDumps";
+
+ // The key pointing to the Registry location that may contain global crash
+ // dump settings. This will be NULL if the location can not be found.
+ ScopedRegHandle DefaultLocalDumpsKey(FindWERKey(LocalDumpsRegistryLocation));
+
+ // The key pointing to the Registry location that may contain
+ // application-specific crash dump settings. This will be NULL if the
+ // location can not be found.
+ ScopedRegHandle AppSpecificKey(
+ FindWERKey(Twine(LocalDumpsRegistryLocation) + "\\" + ProgramName));
+
+ // Look to see if a dump type is specified in the registry; first with the
+ // app-specific key and failing that with the global key. If none are found
+ // default to a normal dump (GetDumpType will return false either if the key
+ // is NULL or if there is no valid DumpType value at its location).
+ MINIDUMP_TYPE DumpType;
+ if (!GetDumpType(AppSpecificKey, DumpType))
+ if (!GetDumpType(DefaultLocalDumpsKey, DumpType))
+ DumpType = MiniDumpNormal;
+
+ // Look to see if a dump location is specified in the registry; first with the
+ // app-specific key and failing that with the global key. If none are found
+ // we'll just create the dump file in the default temporary file location
+ // (GetDumpFolder will return false either if the key is NULL or if there is
+ // no valid DumpFolder value at its location).
+ bool ExplicitDumpDirectorySet = true;
+ SmallString<MAX_PATH> DumpDirectory;
+ if (!GetDumpFolder(AppSpecificKey, DumpDirectory))
+ if (!GetDumpFolder(DefaultLocalDumpsKey, DumpDirectory))
+ ExplicitDumpDirectorySet = false;
+
+ int FD;
+ SmallString<MAX_PATH> DumpPath;
+
+ if (ExplicitDumpDirectorySet) {
+ if (std::error_code EC = fs::create_directories(DumpDirectory))
+ return EC;
+ if (std::error_code EC = fs::createUniqueFile(
+ Twine(DumpDirectory) + "\\" + ProgramName + ".%%%%%%.dmp", FD,
+ DumpPath))
+ return EC;
+ } else if (std::error_code EC =
+ fs::createTemporaryFile(ProgramName, "dmp", FD, DumpPath))
+ return EC;
+
+ // Our support functions return a file descriptor but Windows wants a handle.
+ ScopedCommonHandle FileHandle(reinterpret_cast<HANDLE>(_get_osfhandle(FD)));
+
+ if (!fMiniDumpWriteDump(::GetCurrentProcess(), ::GetCurrentProcessId(),
+ FileHandle, DumpType, ExceptionInfo, NULL, NULL))
+ return mapWindowsError(::GetLastError());
+
+ llvm::errs() << "Wrote crash dump file \"" << DumpPath << "\"\n";
+ return std::error_code();
+}
+
static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
Cleanup();
+ // We'll automatically write a Minidump file here to help diagnose
+ // the nasty sorts of crashes that aren't 100% reproducible from a set of
+ // inputs (or in the event that the user is unable or unwilling to provide a
+ // reproducible case).
+ if (!llvm::Process::AreCoreFilesPrevented()) {
+ MINIDUMP_EXCEPTION_INFORMATION ExceptionInfo;
+ ExceptionInfo.ThreadId = ::GetCurrentThreadId();
+ ExceptionInfo.ExceptionPointers = ep;
+ ExceptionInfo.ClientPointers = FALSE;
+
+ if (std::error_code EC = WriteWindowsDumpFile(&ExceptionInfo))
+ llvm::errs() << "Could not write crash dump file: " << EC.message()
+ << "\n";
+ }
+
// Initialize the STACKFRAME structure.
STACKFRAME64 StackFrame = {};
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index 60490f266434..18ecdf4e73d9 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -45,7 +45,6 @@
#include <wincrypt.h>
#include <cassert>
#include <string>
-#include <vector>
/// Determines if the program is running on Windows 8 or newer. This
/// reimplements one of the helpers in the Windows 8.1 SDK, which are intended
@@ -69,15 +68,15 @@ inline bool RunningWindows8OrGreater() {
Mask) != FALSE;
}
-inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
+inline bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
if (!ErrMsg)
return true;
char *buffer = NULL;
DWORD LastError = GetLastError();
- DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
- FORMAT_MESSAGE_FROM_SYSTEM |
- FORMAT_MESSAGE_MAX_WIDTH_MASK,
- NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
+ DWORD R = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_MAX_WIDTH_MASK,
+ NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
if (R)
*ErrMsg = prefix + ": " + buffer;
else
@@ -168,6 +167,22 @@ struct CryptContextTraits : CommonHandleTraits {
}
};
+struct RegTraits : CommonHandleTraits {
+ typedef HKEY handle_type;
+
+ static handle_type GetInvalid() {
+ return NULL;
+ }
+
+ static void Close(handle_type h) {
+ ::RegCloseKey(h);
+ }
+
+ static bool IsValid(handle_type h) {
+ return h != GetInvalid();
+ }
+};
+
struct FindHandleTraits : CommonHandleTraits {
static void Close(handle_type h) {
::FindClose(h);
@@ -179,6 +194,7 @@ struct FileHandleTraits : CommonHandleTraits {};
typedef ScopedHandle<CommonHandleTraits> ScopedCommonHandle;
typedef ScopedHandle<FileHandleTraits> ScopedFileHandle;
typedef ScopedHandle<CryptContextTraits> ScopedCryptContext;
+typedef ScopedHandle<RegTraits> ScopedRegHandle;
typedef ScopedHandle<FindHandleTraits> ScopedFindHandle;
typedef ScopedHandle<JobHandleTraits> ScopedJobHandle;
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index c4384cafff62..620841c2d152 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -1911,7 +1911,7 @@ StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
return UnquotedValue;
}
// Plain or block.
- return Value.rtrim(" ");
+ return Value.rtrim(' ');
}
StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 2aa6e9b74683..75fac20a8edb 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -423,8 +423,29 @@ void Output::beginMapping() {
bool Output::mapTag(StringRef Tag, bool Use) {
if (Use) {
- this->output(" ");
+ // If this tag is being written inside a sequence we should write the start
+ // of the sequence before writing the tag, otherwise the tag won't be
+ // attached to the element in the sequence, but rather the sequence itself.
+ bool SequenceElement =
+ StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq ||
+ StateStack[StateStack.size() - 2] == inFlowSeq);
+ if (SequenceElement && StateStack.back() == inMapFirstKey) {
+ this->newLineCheck();
+ } else {
+ this->output(" ");
+ }
this->output(Tag);
+ if (SequenceElement) {
+ // If we're writing the tag during the first element of a map, the tag
+ // takes the place of the first element in the sequence.
+ if (StateStack.back() == inMapFirstKey) {
+ StateStack.pop_back();
+ StateStack.push_back(inMapOtherKey);
+ }
+ // Tags inside maps in sequences should act as keys in the map from a
+ // formatting perspective, so we always want a newline in a sequence.
+ NeedsNewLine = true;
+ }
}
return Use;
}
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 15813fd3e669..275fe1d49459 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -141,7 +141,7 @@ raw_ostream &raw_ostream::operator<<(unsigned long long N) {
return this->operator<<(static_cast<unsigned long>(N));
char NumberBuffer[20];
- char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *EndPtr = std::end(NumberBuffer);
char *CurPtr = EndPtr;
while (N) {
@@ -166,13 +166,13 @@ raw_ostream &raw_ostream::write_hex(unsigned long long N) {
if (N == 0)
return *this << '0';
- char NumberBuffer[20];
- char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char NumberBuffer[16];
+ char *EndPtr = std::end(NumberBuffer);
char *CurPtr = EndPtr;
while (N) {
- uintptr_t x = N % 16;
- *--CurPtr = (x < 10 ? '0' + x : 'a' + x - 10);
+ unsigned char x = static_cast<unsigned char>(N) % 16;
+ *--CurPtr = hexdigit(x, /*LowerCase*/true);
N /= 16;
}
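
Besides routing digit selection through hexdigit(), the write_hex() hunk shrinks the scratch buffer from 20 to 16 bytes, which is exactly enough for any 64-bit value in hex. A minimal standalone equivalent of the fill-from-the-end loop, using a local digit table instead of LLVM's helper:

// Sketch: format an unsigned 64-bit value as lowercase hex by filling a
// fixed 16-byte buffer from the end, as raw_ostream::write_hex() does.
#include <cstdint>
#include <cstdio>
#include <string>

static std::string toHex(uint64_t N) {
  if (N == 0)
    return "0";
  static const char Digits[] = "0123456789abcdef";
  char Buf[16];                       // 16 digits cover all 64-bit values
  char *End = Buf + sizeof(Buf);
  char *Cur = End;
  while (N) {
    *--Cur = Digits[N % 16];
    N /= 16;
  }
  return std::string(Cur, End);
}

int main() {
  std::printf("%s\n", toHex(0xDEADBEEFull).c_str()); // prints "deadbeef"
  return 0;
}
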
@@ -181,9 +181,7 @@ raw_ostream &raw_ostream::write_hex(unsigned long long N) {
raw_ostream &raw_ostream::write_escaped(StringRef Str,
bool UseHexEscapes) {
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- unsigned char c = Str[i];
-
+ for (unsigned char c : Str) {
switch (c) {
case '\\':
*this << '\\' << '\\';
@@ -232,7 +230,7 @@ raw_ostream &raw_ostream::operator<<(double N) {
// On MSVCRT and compatible, output of %e is incompatible to Posix
// by default. Number of exponent digits should be at least 2. "%+03d"
// FIXME: Implement our formatter to here or Support/Format.h!
-#if __cplusplus >= 201103L && defined(__MINGW32__)
+#if defined(__MINGW32__)
// FIXME: It should be generic to C++11.
if (N == 0.0 && std::signbit(N))
return *this << "-0.000000e+00";
@@ -422,11 +420,10 @@ raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) {
NumberBuffer[1] = '0';
char *EndPtr = NumberBuffer+Width;
char *CurPtr = EndPtr;
- const char A = FN.Upper ? 'A' : 'a';
unsigned long long N = FN.HexValue;
while (N) {
- uintptr_t x = N % 16;
- *--CurPtr = (x < 10 ? '0' + x : A + x - 10);
+ unsigned char x = static_cast<unsigned char>(N) % 16;
+ *--CurPtr = hexdigit(x, !FN.Upper);
N /= 16;
}
@@ -626,6 +623,7 @@ void raw_fd_ostream::close() {
}
uint64_t raw_fd_ostream::seek(uint64_t off) {
+ assert(SupportsSeeking && "Stream does not support seeking!");
flush();
pos = ::lseek(FD, off, SEEK_SET);
if (pos == (uint64_t)-1)
@@ -718,9 +716,10 @@ bool raw_fd_ostream::has_colors() const {
/// outs() - This returns a reference to a raw_ostream for standard output.
/// Use it like: outs() << "foo" << "bar";
raw_ostream &llvm::outs() {
- // Set buffer settings to model stdout behavior.
- // Delete the file descriptor when the program exits, forcing error
- // detection. If you don't want this behavior, don't use outs().
+ // Set buffer settings to model stdout behavior. Delete the file descriptor
+ // when the program exits, forcing error detection. This means that if you
+ // ever call outs(), you can't open another raw_fd_ostream on stdout, as we'll
+ // close stdout twice and print an error the second time.
std::error_code EC;
static raw_fd_ostream S("-", EC, sys::fs::F_None);
assert(!EC);
diff --git a/lib/TableGen/Makefile b/lib/TableGen/Makefile
deleted file mode 100644
index 345db3465cfe..000000000000
--- a/lib/TableGen/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-##===- lib/TableGen/Makefile -------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMTableGen
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 11e35b75375e..66fbc9a8c96b 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -32,8 +32,8 @@ using namespace llvm;
namespace llvm {
-/// TableGenStringKey - This is a wrapper for std::string suitable for
-/// using as a key to a DenseMap. Because there isn't a particularly
+/// This is a wrapper for std::string suitable for using as a key to a DenseMap.
+/// Because there isn't a particularly
/// good way to indicate tombstone or empty keys for strings, we want
/// to wrap std::string to indicate that this is a "special" string
/// not expected to take on certain values (those of the tombstone and
@@ -82,11 +82,12 @@ template<> struct DenseMapInfo<TableGenStringKey> {
//===----------------------------------------------------------------------===//
BitRecTy BitRecTy::Shared;
+CodeRecTy CodeRecTy::Shared;
IntRecTy IntRecTy::Shared;
StringRecTy StringRecTy::Shared;
DagRecTy DagRecTy::Shared;
-void RecTy::dump() const { print(errs()); }
+LLVM_DUMP_METHOD void RecTy::dump() const { print(errs()); }
ListRecTy *RecTy::getListTy() {
if (!ListTy)
@@ -167,16 +168,13 @@ bool RecordRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
if (RTy->getRecord() == Rec || Rec->isSubClassOf(RTy->getRecord()))
return true;
- for (Record *SC : RTy->getRecord()->getSuperClasses())
- if (Rec->isSubClassOf(SC))
+ for (const auto &SCPair : RTy->getRecord()->getSuperClasses())
+ if (Rec->isSubClassOf(SCPair.first))
return true;
return false;
}
-/// resolveTypes - Find a common type that T1 and T2 convert to.
-/// Return null if no such type exists.
-///
RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
if (T1->typeIsConvertibleTo(T2))
return T2;
@@ -186,8 +184,8 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
// If one is a Record type, check superclasses
if (RecordRecTy *RecTy1 = dyn_cast<RecordRecTy>(T1)) {
// See if T2 inherits from a type T1 also inherits from
- for (Record *SuperRec1 : RecTy1->getRecord()->getSuperClasses()) {
- RecordRecTy *SuperRecTy1 = RecordRecTy::get(SuperRec1);
+ for (const auto &SuperPair1 : RecTy1->getRecord()->getSuperClasses()) {
+ RecordRecTy *SuperRecTy1 = RecordRecTy::get(SuperPair1.first);
RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
if (NewType1)
return NewType1;
@@ -195,8 +193,8 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
}
if (RecordRecTy *RecTy2 = dyn_cast<RecordRecTy>(T2)) {
// See if T1 inherits from a type T2 also inherits from
- for (Record *SuperRec2 : RecTy2->getRecord()->getSuperClasses()) {
- RecordRecTy *SuperRecTy2 = RecordRecTy::get(SuperRec2);
+ for (const auto &SuperPair2 : RecTy2->getRecord()->getSuperClasses()) {
+ RecordRecTy *SuperRecTy2 = RecordRecTy::get(SuperPair2.first);
RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
if (NewType2)
return NewType2;
@@ -211,7 +209,7 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
//===----------------------------------------------------------------------===//
void Init::anchor() { }
-void Init::dump() const { return print(errs()); }
+LLVM_DUMP_METHOD void Init::dump() const { return print(errs()); }
UnsetInit *UnsetInit::get() {
static UnsetInit TheInit;
@@ -274,14 +272,17 @@ BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
if (BitsInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
return I;
- BitsInit *I = new BitsInit(Range);
+ void *Mem = ::operator new (totalSizeToAlloc<Init *>(Range.size()));
+ BitsInit *I = new (Mem) BitsInit(Range.size());
+ std::uninitialized_copy(Range.begin(), Range.end(),
+ I->getTrailingObjects<Init *>());
ThePool.InsertNode(I, IP);
TheActualPool.push_back(std::unique_ptr<BitsInit>(I));
return I;
}
void BitsInit::Profile(FoldingSetNodeID &ID) const {
- ProfileBitsInit(ID, Bits);
+ ProfileBitsInit(ID, makeArrayRef(getTrailingObjects<Init *>(), NumBits));
}
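
BitsInit (and ListInit below) now store their operands inline after the object rather than in a separate vector: one operator new sized for the header plus its elements, placement-new of the header, then uninitialized_copy into the trailing storage, with Profile() reading back through getTrailingObjects(). A stripped-down sketch of that layout without LLVM's TrailingObjects helper; the Node class is invented for illustration and skips the alignment and type bookkeeping TrailingObjects handles generally:

// Sketch: a header object with a trailing, inline-allocated pointer array.
#include <cstddef>
#include <memory>
#include <new>

struct Node {
  size_t NumElems; // a size_t header keeps the trailing pointers aligned

  explicit Node(size_t N) : NumElems(N) {}

  // The elements live immediately after the header in the same allocation.
  void **elements() { return reinterpret_cast<void **>(this + 1); }

  static Node *create(void *const *Src, size_t N) {
    // One allocation sized for the header plus N trailing pointers.
    void *Mem = ::operator new(sizeof(Node) + N * sizeof(void *));
    Node *Obj = new (Mem) Node(N);
    std::uninitialized_copy(Src, Src + N, Obj->elements());
    return Obj;
  }
};

int main() {
  int A = 1, B = 2;
  void *Src[] = {&A, &B};
  Node *N = Node::create(Src, 2);
  bool OK = N->NumElems == 2 && N->elements()[1] == &B;
  ::operator delete(N); // trivial destructor; one allocation to release
  return OK ? 0 : 1;
}
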
Init *BitsInit::convertInitializerTo(RecTy *Ty) const {
@@ -355,7 +356,7 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
bool CachedBitVarChanged = false;
for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
- Init *CurBit = Bits[i];
+ Init *CurBit = getBit(i);
Init *CurBitVar = CurBit->getBitVar();
NewBits[i] = CurBit;
@@ -403,8 +404,6 @@ std::string IntInit::getAsString() const {
return itostr(Value);
}
-/// canFitInBitfield - Return true if the number of bits is large enough to hold
-/// the integer value.
static bool canFitInBitfield(int64_t Value, unsigned NumBits) {
// For example, with NumBits == 4, we permit Values from [-7 .. 15].
return (NumBits >= sizeof(Value) * 8) ||
@@ -450,6 +449,14 @@ IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
return BitsInit::get(NewBits);
}
+CodeInit *CodeInit::get(StringRef V) {
+ static StringMap<std::unique_ptr<CodeInit>> ThePool;
+
+ std::unique_ptr<CodeInit> &I = ThePool[V];
+ if (!I) I.reset(new CodeInit(V));
+ return I.get();
+}
+
StringInit *StringInit::get(StringRef V) {
static StringMap<std::unique_ptr<StringInit>> ThePool;
@@ -465,6 +472,13 @@ Init *StringInit::convertInitializerTo(RecTy *Ty) const {
return nullptr;
}
+Init *CodeInit::convertInitializerTo(RecTy *Ty) const {
+ if (isa<CodeRecTy>(Ty))
+ return const_cast<CodeInit *>(this);
+
+ return nullptr;
+}
+
static void ProfileListInit(FoldingSetNodeID &ID,
ArrayRef<Init *> Range,
RecTy *EltTy) {
@@ -486,7 +500,10 @@ ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
return I;
- ListInit *I = new ListInit(Range, EltTy);
+ void *Mem = ::operator new (totalSizeToAlloc<Init *>(Range.size()));
+ ListInit *I = new (Mem) ListInit(Range.size(), EltTy);
+ std::uninitialized_copy(Range.begin(), Range.end(),
+ I->getTrailingObjects<Init *>());
ThePool.InsertNode(I, IP);
TheActualPool.push_back(std::unique_ptr<ListInit>(I));
return I;
@@ -495,7 +512,7 @@ ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
void ListInit::Profile(FoldingSetNodeID &ID) const {
RecTy *EltTy = cast<ListRecTy>(getType())->getElementType();
- ProfileListInit(ID, Values, EltTy);
+ ProfileListInit(ID, getValues(), EltTy);
}
Init *ListInit::convertInitializerTo(RecTy *Ty) const {
@@ -529,8 +546,8 @@ ListInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
}
Record *ListInit::getElementAsRecord(unsigned i) const {
- assert(i < Values.size() && "List element index out of range!");
- DefInit *DI = dyn_cast<DefInit>(Values[i]);
+ assert(i < NumValues && "List element index out of range!");
+ DefInit *DI = dyn_cast<DefInit>(getElement(i));
if (!DI)
PrintFatalError("Expected record in list!");
return DI->getDef();
@@ -572,9 +589,9 @@ Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV,
std::string ListInit::getAsString() const {
std::string Result = "[";
- for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ for (unsigned i = 0, e = NumValues; i != e; ++i) {
if (i) Result += ", ";
- Result += Values[i]->getAsString();
+ Result += getElement(i)->getAsString();
}
return Result + "]";
}
@@ -603,15 +620,32 @@ Init *OpInit::getBit(unsigned Bit) const {
return VarBitInit::get(const_cast<OpInit*>(this), Bit);
}
-UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
- typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
- static DenseMap<Key, std::unique_ptr<UnOpInit>> ThePool;
+static void
+ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, RecTy *Type) {
+ ID.AddInteger(Opcode);
+ ID.AddPointer(Op);
+ ID.AddPointer(Type);
+}
- Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
+UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) {
+ static FoldingSet<UnOpInit> ThePool;
+ static std::vector<std::unique_ptr<UnOpInit>> TheActualPool;
- std::unique_ptr<UnOpInit> &I = ThePool[TheKey];
- if (!I) I.reset(new UnOpInit(opc, lhs, Type));
- return I.get();
+ FoldingSetNodeID ID;
+ ProfileUnOpInit(ID, Opc, LHS, Type);
+
+ void *IP = nullptr;
+ if (UnOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ UnOpInit *I = new UnOpInit(Opc, LHS, Type);
+ ThePool.InsertNode(I, IP);
+ TheActualPool.push_back(std::unique_ptr<UnOpInit>(I));
+ return I;
+}
+
+void UnOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileUnOpInit(ID, getOpcode(), getOperand(), getType());
}
Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
@@ -628,7 +662,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
return StringInit::get(LHSi->getAsString());
} else {
if (StringInit *LHSs = dyn_cast<StringInit>(LHS)) {
- std::string Name = LHSs->getValue();
+ const std::string &Name = LHSs->getValue();
// From TGParser::ParseIDValue
if (CurRec) {
@@ -731,21 +765,35 @@ std::string UnOpInit::getAsString() const {
return Result + "(" + LHS->getAsString() + ")";
}
-BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
- Init *rhs, RecTy *Type) {
- typedef std::pair<
- std::pair<std::pair<unsigned, Init *>, Init *>,
- RecTy *
- > Key;
+static void
+ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *RHS,
+ RecTy *Type) {
+ ID.AddInteger(Opcode);
+ ID.AddPointer(LHS);
+ ID.AddPointer(RHS);
+ ID.AddPointer(Type);
+}
- static DenseMap<Key, std::unique_ptr<BinOpInit>> ThePool;
+BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS,
+ Init *RHS, RecTy *Type) {
+ static FoldingSet<BinOpInit> ThePool;
+ static std::vector<std::unique_ptr<BinOpInit>> TheActualPool;
- Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
- Type));
+ FoldingSetNodeID ID;
+ ProfileBinOpInit(ID, Opc, LHS, RHS, Type);
- std::unique_ptr<BinOpInit> &I = ThePool[TheKey];
- if (!I) I.reset(new BinOpInit(opc, lhs, rhs, Type));
- return I.get();
+ void *IP = nullptr;
+ if (BinOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ BinOpInit *I = new BinOpInit(Opc, LHS, RHS, Type);
+ ThePool.InsertNode(I, IP);
+ TheActualPool.push_back(std::unique_ptr<BinOpInit>(I));
+ return I;
+}
+
+void BinOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileBinOpInit(ID, getOpcode(), getLHS(), getRHS(), getType());
}
Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
@@ -864,27 +912,36 @@ std::string BinOpInit::getAsString() const {
return Result + "(" + LHS->getAsString() + ", " + RHS->getAsString() + ")";
}
-TernOpInit *TernOpInit::get(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs,
+static void
+ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *MHS,
+ Init *RHS, RecTy *Type) {
+ ID.AddInteger(Opcode);
+ ID.AddPointer(LHS);
+ ID.AddPointer(MHS);
+ ID.AddPointer(RHS);
+ ID.AddPointer(Type);
+}
+
+TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS,
RecTy *Type) {
- typedef std::pair<
- std::pair<
- std::pair<std::pair<unsigned, RecTy *>, Init *>,
- Init *
- >,
- Init *
- > Key;
-
- static DenseMap<Key, std::unique_ptr<TernOpInit>> ThePool;
-
- Key TheKey(std::make_pair(std::make_pair(std::make_pair(std::make_pair(opc,
- Type),
- lhs),
- mhs),
- rhs));
-
- std::unique_ptr<TernOpInit> &I = ThePool[TheKey];
- if (!I) I.reset(new TernOpInit(opc, lhs, mhs, rhs, Type));
- return I.get();
+ static FoldingSet<TernOpInit> ThePool;
+ static std::vector<std::unique_ptr<TernOpInit>> TheActualPool;
+
+ FoldingSetNodeID ID;
+ ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type);
+
+ void *IP = nullptr;
+ if (TernOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ TernOpInit *I = new TernOpInit(Opc, LHS, MHS, RHS, Type);
+ ThePool.InsertNode(I, IP);
+ TheActualPool.push_back(std::unique_ptr<TernOpInit>(I));
+ return I;
+}
+
+void TernOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileTernOpInit(ID, getOpcode(), getLHS(), getMHS(), getRHS(), getType());
}
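
UnOpInit, BinOpInit and TernOpInit now share the standard FoldingSet interning pattern: build a FoldingSetNodeID, probe with FindNodeOrInsertPos(), allocate only on a miss, and register the new node with InsertNode(); Profile() must hash exactly the fields get() keyed on. A minimal hedged example of the same pattern against llvm::FoldingSet (the Expr class and its fields are invented):

// Sketch: interning objects with llvm::FoldingSet, mirroring UnOpInit::get().
#include "llvm/ADT/FoldingSet.h"
#include <memory>
#include <vector>

using namespace llvm;

class Expr : public FoldingSetNode {
  unsigned Opcode;
  int Operand;

public:
  Expr(unsigned Opc, int Op) : Opcode(Opc), Operand(Op) {}

  // Must hash the same fields the factory keys on.
  void Profile(FoldingSetNodeID &ID) const {
    ID.AddInteger(Opcode);
    ID.AddInteger(Operand);
  }

  static Expr *get(unsigned Opc, int Op) {
    static FoldingSet<Expr> Pool;
    static std::vector<std::unique_ptr<Expr>> Storage; // owns the nodes

    FoldingSetNodeID ID;
    ID.AddInteger(Opc);
    ID.AddInteger(Op);

    void *InsertPos = nullptr;
    if (Expr *E = Pool.FindNodeOrInsertPos(ID, InsertPos))
      return E; // already interned

    Expr *E = new Expr(Opc, Op);
    Pool.InsertNode(E, InsertPos);
    Storage.push_back(std::unique_ptr<Expr>(E));
    return E;
  }
};

int main() {
  // Identical arguments yield the identical object.
  return Expr::get(1, 42) == Expr::get(1, 42) ? 0 : 1;
}
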
static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
@@ -1112,6 +1169,12 @@ TypedInit::convertInitializerTo(RecTy *Ty) const {
return nullptr;
}
+ if (isa<CodeRecTy>(Ty)) {
+ if (isa<CodeRecTy>(getType()))
+ return const_cast<TypedInit *>(this);
+ return nullptr;
+ }
+
if (isa<BitRecTy>(Ty)) {
// Accept variable if it is already of bit type!
if (isa<BitRecTy>(getType()))
@@ -1280,11 +1343,6 @@ Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
return nullptr;
}
-/// resolveReferences - This method is used by classes that refer to other
-/// variables which may not be defined at the time the expression is formed.
-/// If a value is set for the variable later, this method will be called on
-/// users of the value to allow the value to propagate out.
-///
Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
if (RecordVal *Val = R.getValue(VarName))
if (RV == Val || (!RV && !isa<UnsetInit>(Val->getValue())))
@@ -1551,7 +1609,7 @@ const std::string &RecordVal::getName() const {
return cast<StringInit>(getNameInit())->getValue();
}
-void RecordVal::dump() const { errs() << *this; }
+LLVM_DUMP_METHOD void RecordVal::dump() const { errs() << *this; }
void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
if (getPrefix()) OS << "field ";
@@ -1611,9 +1669,6 @@ void Record::setName(const std::string &Name) {
setName(StringInit::get(Name));
}
-/// resolveReferencesTo - If anything in this record refers to RV, replace the
-/// reference to RV with the RHS of RV. If RV is null, we resolve all possible
-/// references.
void Record::resolveReferencesTo(const RecordVal *RV) {
for (unsigned i = 0, e = Values.size(); i != e; ++i) {
if (RV == &Values[i]) // Skip resolve the same field as the given one
@@ -1636,7 +1691,7 @@ void Record::resolveReferencesTo(const RecordVal *RV) {
}
}
-void Record::dump() const { errs() << *this; }
+LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }
raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
OS << R.getNameInitAsString();
@@ -1656,11 +1711,11 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
}
OS << " {";
- ArrayRef<Record *> SC = R.getSuperClasses();
+ ArrayRef<std::pair<Record *, SMRange>> SC = R.getSuperClasses();
if (!SC.empty()) {
OS << "\t//";
- for (const Record *Super : SC)
- OS << " " << Super->getNameInitAsString();
+ for (const auto &SuperPair : SC)
+ OS << " " << SuperPair.first->getNameInitAsString();
}
OS << "\n";
@@ -1674,9 +1729,6 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
return OS << "}\n";
}
-/// getValueInit - Return the initializer for a value with the specified name,
-/// or abort if the field does not exist.
-///
Init *Record::getValueInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1686,10 +1738,6 @@ Init *Record::getValueInit(StringRef FieldName) const {
}
-/// getValueAsString - This method looks up the specified field and returns its
-/// value as a string, aborts if the field does not exist or if
-/// the value is not a string.
-///
std::string Record::getValueAsString(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1698,14 +1746,13 @@ std::string Record::getValueAsString(StringRef FieldName) const {
if (StringInit *SI = dyn_cast<StringInit>(R->getValue()))
return SI->getValue();
+ if (CodeInit *CI = dyn_cast<CodeInit>(R->getValue()))
+ return CI->getValue();
+
PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
FieldName + "' does not have a string initializer!");
}
-/// getValueAsBitsInit - This method looks up the specified field and returns
-/// its value as a BitsInit, aborts if the field does not exist or if
-/// the value is not the right type.
-///
BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1718,10 +1765,6 @@ BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
FieldName + "' does not have a BitsInit initializer!");
}
-/// getValueAsListInit - This method looks up the specified field and returns
-/// its value as a ListInit, aborting if the field does not exist or if
-/// the value is not the right type.
-///
ListInit *Record::getValueAsListInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1734,10 +1777,6 @@ ListInit *Record::getValueAsListInit(StringRef FieldName) const {
FieldName + "' does not have a list initializer!");
}
-/// getValueAsListOfDefs - This method looks up the specified field and returns
-/// its value as a vector of records, aborting if the field does not exist
-/// or if the value is not the right type.
-///
std::vector<Record*>
Record::getValueAsListOfDefs(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
@@ -1752,10 +1791,6 @@ Record::getValueAsListOfDefs(StringRef FieldName) const {
return Defs;
}
-/// getValueAsInt - This method looks up the specified field and returns its
-/// value as an int64_t, aborting if the field does not exist or if the value
-/// is not the right type.
-///
int64_t Record::getValueAsInt(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1768,10 +1803,6 @@ int64_t Record::getValueAsInt(StringRef FieldName) const {
FieldName + "' does not have an int initializer!");
}
-/// getValueAsListOfInts - This method looks up the specified field and returns
-/// its value as a vector of integers, aborting if the field does not exist or
-/// if the value is not the right type.
-///
std::vector<int64_t>
Record::getValueAsListOfInts(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
@@ -1786,10 +1817,6 @@ Record::getValueAsListOfInts(StringRef FieldName) const {
return Ints;
}
-/// getValueAsListOfStrings - This method looks up the specified field and
-/// returns its value as a vector of strings, aborting if the field does not
-/// exist or if the value is not the right type.
-///
std::vector<std::string>
Record::getValueAsListOfStrings(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
@@ -1804,10 +1831,6 @@ Record::getValueAsListOfStrings(StringRef FieldName) const {
return Strings;
}
-/// getValueAsDef - This method looks up the specified field and returns its
-/// value as a Record, aborting if the field does not exist or if the value
-/// is not the right type.
-///
Record *Record::getValueAsDef(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1820,10 +1843,6 @@ Record *Record::getValueAsDef(StringRef FieldName) const {
FieldName + "' does not have a def initializer!");
}
-/// getValueAsBit - This method looks up the specified field and returns its
-/// value as a bit, aborting if the field does not exist or if the value is
-/// not the right type.
-///
bool Record::getValueAsBit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1853,10 +1872,6 @@ bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
FieldName + "' does not have a bit initializer!");
}
-/// getValueAsDag - This method looks up the specified field and returns its
-/// value as an Dag, aborting if the field does not exist or if the value is
-/// not the right type.
-///
DagInit *Record::getValueAsDag(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
@@ -1870,7 +1885,7 @@ DagInit *Record::getValueAsDag(StringRef FieldName) const {
}
-void MultiClass::dump() const {
+LLVM_DUMP_METHOD void MultiClass::dump() const {
errs() << "Record:\n";
Rec.dump();
@@ -1880,7 +1895,7 @@ void MultiClass::dump() const {
}
-void RecordKeeper::dump() const { errs() << *this; }
+LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; }
raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
OS << "------------- Classes -----------------\n";
@@ -1893,11 +1908,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
return OS;
}
-
-/// getAllDerivedDefinitions - This method returns all concrete definitions
-/// that derive from the specified class name. If a class with the specified
-/// name does not exist, an error is printed and true is returned.
-std::vector<Record*>
+std::vector<Record *>
RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
Record *Class = getClass(ClassName);
if (!Class)
@@ -1911,8 +1922,6 @@ RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
return Defs;
}
-/// QualifyName - Return an Init with a qualifier prefix referring
-/// to CurRec's name.
Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
Init *Name, const std::string &Scoper) {
RecTy *Type = cast<TypedInit>(Name)->getType();
@@ -1940,8 +1949,6 @@ Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
return NewName->Fold(&CurRec, CurMultiClass);
}
-/// QualifyName - Return an Init with a qualifier prefix referring
-/// to CurRec's name.
Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
const std::string &Name,
const std::string &Scoper) {
diff --git a/lib/TableGen/SetTheory.cpp b/lib/TableGen/SetTheory.cpp
index f56b17acbfba..a4d33051b4f7 100644
--- a/lib/TableGen/SetTheory.cpp
+++ b/lib/TableGen/SetTheory.cpp
@@ -302,12 +302,12 @@ const RecVec *SetTheory::expand(Record *Set) {
return &I->second;
// This is the first time we see Set. Find a suitable expander.
- ArrayRef<Record *> SC = Set->getSuperClasses();
- for (unsigned i = 0, e = SC.size(); i != e; ++i) {
+ ArrayRef<std::pair<Record *, SMRange>> SC = Set->getSuperClasses();
+ for (const auto &SCPair : SC) {
// Skip unnamed superclasses.
- if (!dyn_cast<StringInit>(SC[i]->getNameInit()))
+ if (!isa<StringInit>(SCPair.first->getNameInit()))
continue;
- auto I = Expanders.find(SC[i]->getName());
+ auto I = Expanders.find(SCPair.first->getName());
if (I != Expanders.end()) {
// This breaks recursive definitions.
RecVec &EltVec = Expansions[Set];
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 1506a7171ac4..34e90925e929 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -15,10 +15,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/TableGen/Record.h"
#include <algorithm>
-#include <sstream>
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -45,7 +43,7 @@ struct SubMultiClassReference {
void dump() const;
};
-void SubMultiClassReference::dump() const {
+LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
errs() << "Multiclass:\n";
MC->dump();
@@ -185,13 +183,12 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
// Since everything went well, we can now set the "superclass" list for the
// current record.
- ArrayRef<Record *> SCs = SC->getSuperClasses();
- ArrayRef<SMRange> SCRanges = SC->getSuperClassRanges();
- for (unsigned i = 0, e = SCs.size(); i != e; ++i) {
- if (CurRec->isSubClassOf(SCs[i]))
+ ArrayRef<std::pair<Record *, SMRange>> SCs = SC->getSuperClasses();
+ for (const auto &SCPair : SCs) {
+ if (CurRec->isSubClassOf(SCPair.first))
return Error(SubClass.RefRange.Start,
- "Already subclass of '" + SCs[i]->getName() + "'!\n");
- CurRec->addSuperClass(SCs[i], SCRanges[i]);
+ "Already subclass of '" + SCPair.first->getName() + "'!\n");
+ CurRec->addSuperClass(SCPair.first, SCPair.second);
}
if (CurRec->isSubClassOf(SC))
@@ -663,7 +660,7 @@ RecTy *TGParser::ParseType() {
switch (Lex.getCode()) {
default: TokError("Unknown token when expecting a type"); return nullptr;
case tgtok::String: Lex.Lex(); return StringRecTy::get();
- case tgtok::Code: Lex.Lex(); return StringRecTy::get();
+ case tgtok::Code: Lex.Lex(); return CodeRecTy::get();
case tgtok::Bit: Lex.Lex(); return BitRecTy::get();
case tgtok::Int: Lex.Lex(); return IntRecTy::get();
case tgtok::Dag: Lex.Lex(); return DagRecTy::get();
@@ -1167,7 +1164,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
break;
}
case tgtok::CodeFragment:
- R = StringInit::get(Lex.getCurStrVal());
+ R = CodeInit::get(Lex.getCurStrVal());
Lex.Lex();
break;
case tgtok::question:
@@ -2543,7 +2540,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
// The record name construction goes as follow:
// - If the def name is a string, prepend the prefix.
// - If the def name is a more complex pattern, use that pattern.
- // As a result, the record is instanciated before resolving
+ // As a result, the record is instantiated before resolving
// arguments, as it would make its name a string.
Record *CurRec = InstantiateMulticlassDef(*MC, DefProto.get(), DefmPrefix,
SMRange(DefmLoc,
@@ -2552,7 +2549,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
if (!CurRec)
return true;
- // Now that the record is instanciated, we can resolve arguments.
+ // Now that the record is instantiated, we can resolve arguments.
if (ResolveMulticlassDefArgs(*MC, CurRec, DefmLoc, SubClassLoc,
TArgs, TemplateVals, true/*Delete args*/))
return Error(SubClassLoc, "could not instantiate def");
diff --git a/lib/TableGen/module.modulemap b/lib/TableGen/module.modulemap
deleted file mode 100644
index 8dac0a22c142..000000000000
--- a/lib/TableGen/module.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module TableGen { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 21106c9ad29a..c767c75fce57 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -27,6 +27,7 @@ class FunctionPass;
class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64RedundantCopyEliminationPass();
FunctionPass *createAArch64ConditionalCompares();
FunctionPass *createAArch64AdvSIMDScalar();
FunctionPass *createAArch64BranchRelaxation();
@@ -44,6 +45,8 @@ FunctionPass *createAArch64A53Fix835769();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+
+void initializeAArch64ExpandPseudoPass(PassRegistry&);
} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index cd3e84d38fe2..b1e881685b0c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
+// Target-independent interfaces which we are implementing.
//===----------------------------------------------------------------------===//
include "llvm/Target/Target.td"
@@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
+
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -58,6 +61,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
"Reserve X18, making it unavailable "
"as a GPR">;
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+ "MergeNarrowLoads", "true",
+ "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+ "true",
+ "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+ "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+ "CustomAsCheapAsMove", "true",
+ "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+ "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+ "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+ "AvoidQuadLdStPairs", "true",
+ "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+ "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+ "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+ "macroop-fusion", "HasMacroOpFusion", "true",
+ "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+ "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+ "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+ "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -66,7 +113,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions", [FeatureCRC]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
- "Support ARM v8.2a instructions", [HasV8_1aOps]>;
+ "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -85,67 +132,145 @@ include "AArch64InstrInfo.td"
def AArch64InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
+// Named operands for MRS/MSR/TLBI/...
+//===----------------------------------------------------------------------===//
+
+include "AArch64SystemOperands.td"
+
+//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedM1.td"
+include "AArch64SchedKryo.td"
+include "AArch64SchedVulcan.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
- "Cortex-A35 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A35 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A53 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseAA
+ ]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureFPARMv8,
+ "Cortex-A57 ARM processors", [
+ FeatureBalanceFPOps,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+ "Cortex-A72 ARM processors", [
+ FeatureCRC,
FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
+
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
- "Cyclone",
- [FeatureFPARMv8,
- FeatureNEON,
+ "Cyclone", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
FeatureCrypto,
- FeatureCRC,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
FeaturePerfMon,
- FeatureZCRegMove, FeatureZCZeroing]>;
+ FeatureSlowMisaligned128Store,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;
def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M1 processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Samsung Exynos-M1 processors", [
+ FeatureAvoidQuadLdStPairs,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseRSqrt
+ ]>;
+
+def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+ "Qualcomm Kryo processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing
+ ]>;
+
+def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
+ "Broadcom Vulcan processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ HasV8_1aOps]>;
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
- FeatureNEON,
- FeatureCRC,
- FeaturePerfMon]>;
+def : ProcessorModel<"generic", NoSchedModel, [
+ FeatureCRC,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler
+ ]>;
// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
-def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
+// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as a Cortex-A57.
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
+def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
+def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
+def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index d215d9e831c0..c2cca63f4977 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -87,6 +86,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Workaround A53 erratum 835769 pass";
}
@@ -133,8 +137,8 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
MachineBasicBlock *PrevBB = &*std::prev(MBBI);
for (MachineBasicBlock *S : MBB->predecessors())
- if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
- !TBB && !FBB)
+ if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB &&
+ !FBB)
return S;
return nullptr;
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 3d1ab4e3fc2b..0465e59dc54a 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -43,7 +43,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <list>
using namespace llvm;
#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
@@ -125,6 +124,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "A57 FP Anti-dependency breaker";
}
@@ -222,7 +226,7 @@ public:
}
/// Return true if MI is a member of the chain.
- bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }
+ bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; }
/// Return the number of instructions in the chain.
unsigned size() const {
@@ -248,9 +252,10 @@ public:
MachineInstr *getKill() const { return KillInst; }
/// Return an instruction that can be used as an iterator for the end
/// of the chain. This is the maximum of KillInst (if set) and LastInst.
- MachineBasicBlock::iterator getEnd() const {
+ MachineBasicBlock::iterator end() const {
return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
}
+ MachineBasicBlock::iterator begin() const { return getStart(); }
/// Can the Kill instruction (assuming one exists) be modified?
bool isKillImmutable() const { return KillIsImmutable; }
@@ -307,9 +312,10 @@ public:
//===----------------------------------------------------------------------===//
bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
- // Don't do anything if this isn't an A53 or A57.
- if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
- F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+ if (skipFunction(*F.getFunction()))
+ return false;
+
+ if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
return false;
bool Changed = false;
@@ -492,15 +498,14 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
MachineBasicBlock &MBB) {
RegScavenger RS;
- RS.enterBasicBlock(&MBB);
+ RS.enterBasicBlock(MBB);
RS.forward(MachineBasicBlock::iterator(G->getStart()));
// Can we find an appropriate register that is available throughout the life
// of the chain?
unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
- for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
- I != E; ++I) {
+ for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
RS.forward(I);
AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
@@ -530,8 +535,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
for (auto Reg : Ord) {
if (!AvailableRegs[Reg])
continue;
- if ((C == Color::Even && (Reg % 2) == 0) ||
- (C == Color::Odd && (Reg % 2) == 1))
+ if (C == getColor(Reg))
return Reg;
}
@@ -554,16 +558,14 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
std::map<unsigned, unsigned> Substs;
- for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
- I != E; ++I) {
- if (!G->contains(I) &&
- (&*I != G->getKill() || G->isKillImmutable()))
+ for (MachineInstr &I : *G) {
+ if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable()))
continue;
// I is a member of G, or I is a mutable instruction that kills G.
std::vector<unsigned> ToErase;
- for (auto &U : I->operands()) {
+ for (auto &U : I.operands()) {
if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
unsigned OrigReg = U.getReg();
U.setReg(Substs[OrigReg]);
@@ -583,11 +585,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
Substs.erase(J);
// Only change the def if this isn't the last instruction.
- if (&*I != G->getKill()) {
- MachineOperand &MO = I->getOperand(0);
+ if (&I != G->getKill()) {
+ MachineOperand &MO = I.getOperand(0);
bool Change = TransformAll || getColor(MO.getReg()) != C;
- if (G->requiresFixup() && &*I == G->getLast())
+ if (G->requiresFixup() && &I == G->getLast())
Change = false;
if (Change) {
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 3afcdfb8b930..4846ef08c983 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -20,10 +20,9 @@
// e = getelementptr ..., i64 a
//
// This is legal to do if the computations are marked with either nsw or nuw
-// markers.
-// Moreover, the current heuristic is simple: it does not create new sext
-// operations, i.e., it gives up when a sext would have forked (e.g., if
-// a = add i32 b, c, two sexts are required to promote the computation).
+// markers. Moreover, the current heuristic is simple: it does not create new
+// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
+// = add i32 b, c, two sexts are required to promote the computation).
//
// FIXME: This pass may be useful for other targets too.
// ===---------------------------------------------------------------------===//
@@ -207,9 +206,7 @@ bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
}
static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
- if (isa<SelectInst>(Inst) && OpIdx == 0)
- return false;
- return true;
+ return !(isa<SelectInst>(Inst) && OpIdx == 0);
}
bool
@@ -481,6 +478,9 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
}
bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
if (!EnableAddressTypePromotion || F.isDeclaration())
return false;
Func = &F;
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 1644d71d2821..d0a2dd3fa1fc 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -76,12 +76,12 @@ private:
// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
- bool isProfitableToTransform(const MachineInstr *MI) const;
+ bool isProfitableToTransform(const MachineInstr &MI) const;
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
- void transformInstruction(MachineInstr *MI);
+ void transformInstruction(MachineInstr &MI);
// processMachineBasicBlock - Main optimization loop.
bool processMachineBasicBlock(MachineBasicBlock *MBB);
@@ -132,19 +132,19 @@ static bool isFPR64(unsigned Reg, unsigned SubReg,
// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return zero_reg if the instruction is not a copy.
-static unsigned getSrcFromCopy(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- unsigned &SubReg) {
+static MachineOperand *getSrcFromCopy(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &SubReg) {
SubReg = 0;
// The "FMOV Xd, Dn" instruction is the typical form.
if (MI->getOpcode() == AArch64::FMOVDXr ||
MI->getOpcode() == AArch64::FMOVXDr)
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
// A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
// these at this stage, but it's easy to check for.
if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
SubReg = AArch64::dsub;
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
}
// Or just a plain COPY instruction. This can be directly to/from FPR64,
// or it can be a dsub subreg reference to an FPR128.
@@ -152,18 +152,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI,
if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
MRI) &&
isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
MRI) &&
isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
MRI)) {
SubReg = MI->getOperand(1).getSubReg();
- return MI->getOperand(1).getReg();
+ return &MI->getOperand(1);
}
}
// Otherwise, this is some other kind of instruction.
- return 0;
+ return nullptr;
}
// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
@@ -189,16 +189,16 @@ static unsigned getTransformOpcode(unsigned Opc) {
return Opc;
}
-static bool isTransformable(const MachineInstr *MI) {
- unsigned Opc = MI->getOpcode();
+static bool isTransformable(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
return Opc != getTransformOpcode(Opc);
}
// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
-bool
-AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+bool AArch64AdvSIMDScalar::isProfitableToTransform(
+ const MachineInstr &MI) const {
// If this instruction isn't eligible to be transformed (no SIMD equivalent),
// early exit since that's the common case.
if (!isTransformable(MI))
@@ -209,33 +209,33 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
unsigned NumNewCopies = 3;
unsigned NumRemovableCopies = 0;
- unsigned OrigSrc0 = MI->getOperand(1).getReg();
- unsigned OrigSrc1 = MI->getOperand(2).getReg();
- unsigned Src0 = 0, SubReg0;
- unsigned Src1 = 0, SubReg1;
+ unsigned OrigSrc0 = MI.getOperand(1).getReg();
+ unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ unsigned SubReg0;
+ unsigned SubReg1;
if (!MRI->def_empty(OrigSrc0)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc0);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
// If the source was from a copy, we don't need to insert a new copy.
- if (Src0)
+ if (MOSrc0)
--NumNewCopies;
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+ if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
++NumRemovableCopies;
}
if (!MRI->def_empty(OrigSrc1)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc1);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
- if (Src1)
+ MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ if (MOSrc1)
--NumNewCopies;
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+ if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
++NumRemovableCopies;
}
@@ -244,14 +244,14 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
// any of the uses is a transformable instruction, it's likely the transforms
// will chain, enabling us to save a copy there, too. This is an aggressive
// heuristic that approximates the graph based cost analysis described above.
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
bool AllUsesAreCopies = true;
for (MachineRegisterInfo::use_instr_nodbg_iterator
Use = MRI->use_instr_nodbg_begin(Dst),
E = MRI->use_instr_nodbg_end();
Use != E; ++Use) {
unsigned SubReg;
- if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+ if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
++NumRemovableCopies;
// If the use is an INSERT_SUBREG, that's still something that can
// directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
@@ -279,12 +279,11 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
return TransformAll;
}
-static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
+static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
unsigned Dst, unsigned Src, bool IsKill) {
- MachineInstrBuilder MIB =
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
- Dst)
- .addReg(Src, getKillRegState(IsKill));
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AArch64::COPY), Dst)
+ .addReg(Src, getKillRegState(IsKill));
DEBUG(dbgs() << " adding copy: " << *MIB);
++NumCopiesInserted;
return MIB;
@@ -293,43 +292,56 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
-void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
- DEBUG(dbgs() << "Scalar transform: " << *MI);
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
+ DEBUG(dbgs() << "Scalar transform: " << MI);
- MachineBasicBlock *MBB = MI->getParent();
- unsigned OldOpc = MI->getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+ unsigned OldOpc = MI.getOpcode();
unsigned NewOpc = getTransformOpcode(OldOpc);
assert(OldOpc != NewOpc && "transform an instruction to itself?!");
// Check if we need a copy for the source registers.
- unsigned OrigSrc0 = MI->getOperand(1).getReg();
- unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned OrigSrc0 = MI.getOperand(1).getReg();
+ unsigned OrigSrc1 = MI.getOperand(2).getReg();
unsigned Src0 = 0, SubReg0;
unsigned Src1 = 0, SubReg1;
+ bool KillSrc0 = false, KillSrc1 = false;
if (!MRI->def_empty(OrigSrc0)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc0);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
- assert(Src0 && "Can't delete copy w/o a valid original source!");
- Def->eraseFromParent();
- ++NumCopiesDeleted;
+ if (MOSrc0) {
+ Src0 = MOSrc0->getReg();
+ KillSrc0 = MOSrc0->isKill();
+ // Src0 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc0->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc0)) {
+ assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
}
}
if (!MRI->def_empty(OrigSrc1)) {
MachineRegisterInfo::def_instr_iterator Def =
MRI->def_instr_begin(OrigSrc1);
assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
- Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
// If there are no other users of the original source, we can delete
// that instruction.
- if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
- assert(Src1 && "Can't delete copy w/o a valid original source!");
- Def->eraseFromParent();
- ++NumCopiesDeleted;
+ if (MOSrc1) {
+ Src1 = MOSrc1->getReg();
+ KillSrc1 = MOSrc1->isKill();
+      // Src1 is going to be reused, thus, it cannot be killed anymore.
+ MOSrc1->setIsKill(false);
+ if (MRI->hasOneNonDBGUse(OrigSrc1)) {
+ assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
}
}
// If we weren't able to reference the original source directly, create a
@@ -337,12 +349,14 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
if (!Src0) {
SubReg0 = 0;
Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
- insertCopy(TII, MI, Src0, OrigSrc0, true);
+ insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
+ KillSrc0 = true;
}
if (!Src1) {
SubReg1 = 0;
Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
- insertCopy(TII, MI, Src1, OrigSrc1, true);
+ insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
+ KillSrc1 = true;
}
// Create a vreg for the destination.
@@ -353,17 +367,17 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
// For now, all of the new instructions have the same simple three-register
// form, so no need to special case based on what instruction we're
// building.
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
- .addReg(Src0, getKillRegState(true), SubReg0)
- .addReg(Src1, getKillRegState(true), SubReg1);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
+ .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
+ .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
// Now copy the result back out to a GPR.
// FIXME: Try to avoid this if all uses could actually just use the FPR64
// directly.
- insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+ insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
// Erase the old instruction.
- MI->eraseFromParent();
+ MI.eraseFromParent();
++NumScalarInsnsUsed;
}
@@ -372,8 +386,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
bool Changed = false;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
- MachineInstr *MI = I;
- ++I;
+ MachineInstr &MI = *I++;
if (isProfitableToTransform(MI)) {
transformInstruction(MI);
Changed = true;
@@ -387,6 +400,9 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
bool Changed = false;
DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+ if (skipFunction(*mf.getFunction()))
+ return false;
+
MRI = &mf.getRegInfo();
TII = mf.getSubtarget().getInstrInfo();
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index ada995bad37e..22374f754603 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -49,6 +49,7 @@ namespace {
class AArch64AsmPrinter : public AsmPrinter {
AArch64MCInstLower MCInstLowering;
StackMaps SM;
+ const AArch64Subtarget *STI;
public:
AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -83,11 +84,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override {
AArch64FI = F.getInfo<AArch64FunctionInfo>();
+ STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
return AsmPrinter::runOnMachineFunction(F);
}
private:
- MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
bool printAsmRegInClass(const MachineOperand &MO,
@@ -112,6 +113,9 @@ private:
/// \brief Emit the LOHs contained in AArch64FI.
void EmitLOHs();
+ /// Emit instruction to set float register to zero.
+ void EmitFMov0(const MachineInstr &MI);
+
typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
MInstToMCSymbol LOHInstToLabel;
};
@@ -133,19 +137,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
}
-MachineLocation
-AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
- MachineLocation Location;
- assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
- // Frame address. Currently handles register +- offset only.
- if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
- Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
- else {
- DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
- }
- return Location;
-}
-
void AArch64AsmPrinter::EmitLOHs() {
SmallVector<MCSymbol *, 3> MCArgs;
@@ -238,8 +229,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
bool isVector, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
- const AArch64RegisterInfo *RI =
- MF->getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const TargetRegisterInfo *RI = STI->getRegisterInfo();
unsigned Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
@@ -404,16 +394,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 16;
// Materialize the jump address:
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF)
.addImm(32));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF)
.addImm(16));
- EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF)
@@ -430,6 +420,40 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
+void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ if (STI->hasZeroCycleZeroing()) {
+ // Convert S/D register to corresponding Q register
+ if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) {
+ DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
+ } else {
+ assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
+ }
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
+ }
+ EmitToStreamer(*OutStreamer, FMov);
+ }
+}
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
@@ -535,6 +559,11 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ EmitFMov0(*MI);
+ return;
+
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*OutStreamer, SM, *MI);
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index a614f555a4e9..9ec6ae4118a4 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -177,7 +177,7 @@ void AArch64BranchRelaxation::scanFunction() {
void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
unsigned Size = 0;
for (const MachineInstr &MI : MBB)
- Size += TII->GetInstSizeInBytes(&MI);
+ Size += TII->GetInstSizeInBytes(MI);
BlockInfo[MBB.getNumber()].Size = Size;
}
@@ -195,7 +195,7 @@ unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->GetInstSizeInBytes(I);
+ Offset += TII->GetInstSizeInBytes(*I);
}
return Offset;
}
@@ -415,12 +415,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
// Analyze the branch so we know how to update the successor lists.
MachineBasicBlock *TBB, *FBB;
SmallVector<MachineOperand, 2> Cond;
- TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+ TII->analyzeBranch(*MBB, TBB, FBB, Cond, false);
MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
- int delta = TII->GetInstSizeInBytes(&MBB->back());
+ int delta = TII->GetInstSizeInBytes(MBB->back());
BlockInfo[MBB->getNumber()].Size -= delta;
MBB->back().eraseFromParent();
// BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
@@ -446,12 +446,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
if (MI->getOpcode() == AArch64::Bcc)
invertBccCondition(MIB);
MIB.addMBB(NextBB);
- BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
- BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
// Remove the old conditional branch. It may or may not still be in MBB.
- BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
// Finally, keep the block offsets up to date.
@@ -463,12 +463,13 @@ bool AArch64BranchRelaxation::relaxBranchInstructions() {
bool Changed = false;
// Relaxing branches involves creating new basic blocks, so re-eval
// end() for termination.
- for (auto &MBB : *MF) {
- MachineInstr *MI = MBB.getFirstTerminator();
- if (isConditionalBranch(MI->getOpcode()) &&
- !isBlockInRange(MI, getDestBlock(MI),
- getBranchDisplacementBits(MI->getOpcode()))) {
- fixupConditionalBranch(MI);
+ for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) {
+ MachineBasicBlock &MBB = *I;
+ MachineInstr &MI = *MBB.getFirstTerminator();
+ if (isConditionalBranch(MI.getOpcode()) &&
+ !isBlockInRange(&MI, getDestBlock(&MI),
+ getBranchDisplacementBits(MI.getOpcode()))) {
+ fixupConditionalBranch(&MI);
++NumRelaxed;
Changed = true;
}
@@ -513,8 +514,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
return MadeChange;
}
-/// createAArch64BranchRelaxation - returns an instance of the constpool
-/// island pass.
+/// Returns an instance of the AArch64 Branch Relaxation pass.
FunctionPass *llvm::createAArch64BranchRelaxation() {
return new AArch64BranchRelaxation();
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
new file mode 100644
index 000000000000..e3522e63c21c
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -0,0 +1,104 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64CallLowering.h"
+#include "AArch64ISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
+ : CallLowering(&TLI) {
+}
+
+bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ MachineInstr *Return = MIRBuilder.buildInstr(AArch64::RET_ReallyLR);
+ assert(Return && "Unable to build a return instruction?!");
+
+ assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+ if (VReg) {
+ assert(Val->getType()->isIntegerTy() && "Type not supported yet");
+ unsigned Size = Val->getType()->getPrimitiveSizeInBits();
+ assert((Size == 64 || Size == 32) && "Size not supported yet");
+ unsigned ResReg = (Size == 32) ? AArch64::W0 : AArch64::X0;
+ // Set the insertion point to be right before Return.
+ MIRBuilder.setInstr(*Return, /* Before */ true);
+ MachineInstr *Copy =
+ MIRBuilder.buildInstr(TargetOpcode::COPY, ResReg, VReg);
+ (void)Copy;
+ assert(Copy->getNextNode() == Return &&
+ "The insertion did not happen where we expected");
+ MachineInstrBuilder(MIRBuilder.getMF(), Return)
+ .addReg(ResReg, RegState::Implicit);
+ }
+ return true;
+}
+
+bool AArch64CallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = *MF.getFunction();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ unsigned NumArgs = Args.size();
+ Function::const_arg_iterator CurOrigArg = Args.begin();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
+ MVT ValVT = MVT::getVT(CurOrigArg->getType());
+ CCAssignFn *AssignFn =
+ TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, ISD::ArgFlagsTy(), CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ assert(ArgLocs.size() == Args.size() &&
+ "We have a different number of location and args?!");
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ assert(VA.isRegLoc() && "Not yet implemented");
+ // Transform the arguments in physical registers into virtual ones.
+ MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
+ MIRBuilder.buildInstr(TargetOpcode::COPY, VRegs[i], VA.getLocReg());
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ // We don't care about bitcast.
+ break;
+ case CCValAssign::AExt:
+ case CCValAssign::SExt:
+ case CCValAssign::ZExt:
+ // Zero/Sign extend the register.
+ assert(0 && "Not yet implemented");
+ break;
+ }
+ }
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
new file mode 100644
index 000000000000..411622803461
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -0,0 +1,36 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AArch64TargetLowering;
+
+class AArch64CallLowering: public CallLowering {
+ public:
+ AArch64CallLowering(const AArch64TargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+ unsigned VReg) const override;
+ bool
+ lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 388d64ec4e99..178e3971640e 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -45,6 +45,9 @@ def CC_AArch64_AAPCS : CallingConv<[
// supported there.
CCIfNest<CCAssignToReg<[X18]>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -86,6 +89,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
@@ -126,6 +131,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+ // A SwiftError is passed in X19.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -270,6 +281,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
// case)
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+def CSR_AArch64_AAPCS_SwiftError
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+
// The function used by Darwin to obtain the address of a thread-local variable
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
// fast path for calculation, but other registers except X0 (argument/return)
@@ -310,3 +324,7 @@ def CSR_AArch64_AllRegs
(sequence "Q%u", 0, 31))>;
def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sequence "X%u", 9, 15))>;
+
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 9310ac4a44a2..011a03622ba5 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -39,6 +39,9 @@ struct LDTLSCleanup : public MachineFunctionPass {
LDTLSCleanup() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
// No point folding accesses if there isn't at least two.
@@ -69,9 +72,9 @@ struct LDTLSCleanup : public MachineFunctionPass {
break;
if (TLSBaseAddrReg)
- I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
else
- I = setRegister(I, &TLSBaseAddrReg);
+ I = setRegister(*I, &TLSBaseAddrReg);
Changed = true;
break;
default:
@@ -89,27 +92,27 @@ struct LDTLSCleanup : public MachineFunctionPass {
// Replace the TLS_base_addr instruction I with a copy from
// TLSBaseAddrReg, returning the new instruction.
- MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+ MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I,
unsigned TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineFunction *MF = I.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
// code sequence assumes the address will be.
- MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- AArch64::X0).addReg(TLSBaseAddrReg);
+ MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), AArch64::X0)
+ .addReg(TLSBaseAddrReg);
// Erase the TLS_base_addr instruction.
- I->eraseFromParent();
+ I.eraseFromParent();
return Copy;
}
// Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
- MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
// Create a virtual register for the TLS base address.
@@ -118,7 +121,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
// Insert a copy from X0 to TLSBaseAddrReg for later.
MachineInstr *Copy =
- BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(),
+ BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(),
TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
.addReg(AArch64::X0);
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 78c239b11ef3..5eecb3a86856 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -179,6 +179,11 @@ struct AArch64CollectLOH : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return AARCH64_COLLECT_LOH_NAME;
}
@@ -623,10 +628,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs,
continue;
}
DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
- SmallVector<const MachineInstr *, 2> Args;
- Args.push_back(L2);
- Args.push_back(L1);
- AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+ AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1});
++NumADRPSimpleCandidate;
}
#ifdef DEBUG
@@ -760,13 +762,9 @@ static bool registerADRCandidate(const MachineInstr &Use,
"ADD already involved in LOH.");
DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
- SmallVector<const MachineInstr *, 2> Args;
- Args.push_back(&Def);
- Args.push_back(&Use);
-
- AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
- : MCLOH_AdrpLdrGot,
- Args);
+ AArch64FI.addLOHDirective(
+ Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot,
+ {&Def, &Use});
return true;
}
@@ -1036,6 +1034,9 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
}
bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index fc27bfee73d1..8fff381d391e 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -70,7 +70,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -144,10 +143,18 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
if (I->getOpcode() != AArch64::Bcc)
return nullptr;
+ // Since we may modify cmp of this MBB, make sure NZCV does not live out.
+ for (auto SuccBB : MBB->successors())
+ if (SuccBB->isLiveIn(AArch64::NZCV))
+ return nullptr;
+
// Now find the instruction controlling the terminator.
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
--I;
assert(!I->isTerminator() && "Spurious terminator");
+ // Check if there is any use of NZCV between CMP and Bcc.
+ if (I->readsRegister(AArch64::NZCV))
+ return nullptr;
switch (I->getOpcode()) {
// cmp is an alias for subs with a dead destination register.
case AArch64::SUBSWri:
@@ -166,7 +173,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
return nullptr;
}
- return I;
+ return &*I;
}
// Prevent false positive case like:
// cmp w19, #0
@@ -268,13 +275,13 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// The fact that this comparison was picked ensures that it's related to the
// first terminator instruction.
- MachineInstr *BrMI = MBB->getFirstTerminator();
+ MachineInstr &BrMI = *MBB->getFirstTerminator();
// Change condition in branch instruction.
- BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(Cmp)
- .addOperand(BrMI->getOperand(1));
- BrMI->eraseFromParent();
+ .addOperand(BrMI.getOperand(1));
+ BrMI.eraseFromParent();
MBB->updateTerminator();
@@ -311,6 +318,9 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget().getInstrInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
MRI = &MF.getRegInfo();
@@ -327,7 +337,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineOperand, 4> HeadCond;
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+ if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) {
continue;
}
@@ -338,7 +348,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineOperand, 4> TrueCond;
MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
- if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+ if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
continue;
}
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index df1320fbd4c9..e1b0dc724b39 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -18,13 +18,10 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -307,7 +304,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::CBNZW:
case AArch64::CBNZX:
// These can be converted into a ccmp against #0.
- return I;
+ return &*I;
}
++NumCmpTermRejs;
DEBUG(dbgs() << "Flags not used by terminator: " << *I);
@@ -338,7 +335,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::ADDSWrr:
case AArch64::ADDSXrr:
if (isDeadDef(I->getOperand(0).getReg()))
- return I;
+ return &*I;
DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
++NumLiveDstRejs;
return nullptr;
@@ -346,12 +343,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
case AArch64::FCMPDrr:
case AArch64::FCMPESrr:
case AArch64::FCMPEDrr:
- return I;
+ return &*I;
}
// Check for flag reads and clobbers.
MIOperands::PhysRegInfo PRI =
- MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
+ MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI);
if (PRI.Read) {
// The ccmp doesn't produce exactly the same flags as the original
@@ -496,7 +493,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// The branch we're looking to eliminate must be analyzable.
HeadCond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+ if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
DEBUG(dbgs() << "Head branch not analyzable.\n");
++NumHeadBranchRejs;
return false;
@@ -524,7 +521,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
CmpBBCond.clear();
TBB = FBB = nullptr;
- if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+ if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
++NumCmpBranchRejs;
return false;
@@ -759,7 +756,6 @@ void initializeAArch64ConditionalComparesPass(PassRegistry &);
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
@@ -770,7 +766,6 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
}
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -849,9 +844,9 @@ bool AArch64ConditionalCompares::shouldConvert() {
// Instruction depths can be computed for all trace instructions above CmpBB.
unsigned HeadDepth =
- Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
+ Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
unsigned CmpBBDepth =
- Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
+ Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
DEBUG(dbgs() << "Head depth: " << HeadDepth
<< "\nCmpBB depth: " << CmpBBDepth << '\n');
if (CmpBBDepth > HeadDepth + DelayLimit) {
@@ -891,6 +886,9 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
SchedModel = MF.getSubtarget().getSchedModel();
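The skipFunction() early exit added to runOnMachineFunction above is the standard way for a machine pass to honor 'optnone' and -opt-bisect-limit. A minimal sketch of the pattern in isolation (MyAArch64Pass is a hypothetical pass name, not part of this patch):

// Hypothetical pass showing the early-exit pattern; skipFunction() comes from
// FunctionPass and respects 'optnone' and -opt-bisect-limit.
bool MyAArch64Pass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;            // leave skipped functions untouched
  bool Changed = false;
  // ... the actual transformation over MF would go here ...
  return Changed;
}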
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 576cf4a74167..7a6f7669db5f 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -48,6 +48,11 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -88,6 +93,12 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
DEBUG(dbgs() << " Ignoring, operand is frame index\n");
continue;
}
+ if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
+ // It is not allowed to write to the same register (not even the zero
+ // register) twice in a single instruction.
+ DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
+ continue;
+ }
for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDead() && MO.isDef()) {
@@ -100,7 +111,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
continue;
}
// Don't change the register if there's an implicit def of a subreg or
- // supperreg.
+ // superreg.
if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) {
DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n");
continue;
@@ -123,6 +134,8 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
MO.setReg(NewReg);
DEBUG(MI.print(dbgs()));
++NumDeadDefsReplaced;
+ // Only replace one dead register, see check for zero register above.
+ break;
}
}
}
@@ -136,6 +149,9 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
for (auto &MBB : MF)
if (processMachineBasicBlock(MBB))
Changed = true;
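Both new pieces above (the XZR/WZR check and the break after the first replacement) protect the same invariant: an instruction may not define the same register twice, not even the zero register. A minimal sketch of the per-instruction logic, simplified to 64-bit GPRs (replaceFirstDeadDefWithXZR is a hypothetical helper; the real pass picks WZR or XZR from the operand's register class):

// Hypothetical helper, 64-bit registers only.
static bool replaceFirstDeadDefWithXZR(MachineInstr &MI) {
  // If the instruction already writes XZR or WZR, adding another zero-register
  // def would make it define the same register twice, which is not allowed.
  if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR))
    return false;
  for (MachineOperand &MO : MI.defs()) {
    if (MO.isReg() && MO.isDef() && MO.isDead()) {
      MO.setReg(AArch64::XZR);
      return true;   // replace only one def, mirroring the new 'break' above
    }
  }
  return false;
}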
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index d24e42a93763..5e477d39e074 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/MathExtras.h"
@@ -46,9 +47,18 @@ public:
private:
bool expandMBB(MachineBasicBlock &MBB);
- bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+
+ bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
+ unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
char AArch64ExpandPseudo::ID = 0;
}
@@ -403,9 +413,17 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned BitSize) {
MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
uint64_t Imm = MI.getOperand(1).getImm();
const unsigned Mask = 0xFFFF;
+ if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
+ // Useless def, and we don't want to risk creating an invalid ORR (which
+ // would really write to sp).
+ MI.eraseFromParent();
+ return true;
+ }
+
// Try a MOVI instruction (aka ORR-immediate with the zero register).
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
@@ -531,7 +549,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
LastShift = (TZ / 16) * 16;
}
unsigned Imm16 = (Imm >> Shift) & Mask;
- unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
@@ -572,10 +589,178 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MBB->addLiveIn(*I);
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
+ unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxr xDest, [xAddr]
+ // cmp xDest, xDesired
+ // b.ne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired)
+ .addImm(ExtendImm);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+ .addImm(AArch64CC::NE)
+ .addMBB(DoneBB)
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxr wStatus, xNew, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
+ .addOperand(New)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP_128(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &DestLo = MI.getOperand(0);
+ MachineOperand &DestHi = MI.getOperand(1);
+ unsigned StatusReg = MI.getOperand(2).getReg();
+ MachineOperand &Addr = MI.getOperand(3);
+ MachineOperand &DesiredLo = MI.getOperand(4);
+ MachineOperand &DesiredHi = MI.getOperand(5);
+ MachineOperand &NewLo = MI.getOperand(6);
+ MachineOperand &NewHi = MI.getOperand(7);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldaxp xDestLo, xDestHi, [xAddr]
+ // cmp xDestLo, xDesiredLo
+ // sbcs xDestHi, xDesiredHi
+ // b.ne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(DestLo.getReg());
+ LoadCmpBB->addLiveIn(DestHi.getReg());
+ LoadCmpBB->addLiveIn(DesiredLo.getReg());
+ LoadCmpBB->addLiveIn(DesiredHi.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+ .addReg(DestLo.getReg(), RegState::Define)
+ .addReg(DestHi.getReg(), RegState::Define)
+ .addReg(Addr.getReg());
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+ .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
+ .addOperand(DesiredLo)
+ .addImm(0);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR)
+ .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
+ .addOperand(DesiredHi);
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+ .addImm(AArch64CC::NE)
+ .addMBB(DoneBB)
+ .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // stlxp wStatus, xNewLo, xNewHi, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(NewLo.getReg());
+ StoreBB->addLiveIn(NewHi.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+ BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+ .addOperand(NewLo)
+ .addOperand(NewHi)
+ .addOperand(Addr);
+ BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, RegState::Kill)
+ .addMBB(LoadCmpBB);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
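For context on where these pseudos come from: broadly, an IR-level cmpxchg (for example from std::atomic) may be selected to a CMP_SWAP_* pseudo so that no spill or other code can be inserted between the load-exclusive and store-exclusive; the post-RA expansion above then emits the ldaxr/cmp/stlxr/cbnz retry loop shown in the block comments. A hedged C++ illustration of such a source-level operation (assuming a target without LSE atomics):

#include <atomic>

// Strong compare-exchange: on ARMv8.0 this lowers to an LL/SC retry loop of
// the shape the expansion above builds (load-exclusive, compare, branch,
// store-exclusive, retry on failure).
bool tryUpdate(std::atomic<long> &Val, long Expected, long Desired) {
  return Val.compare_exchange_strong(Expected, Desired);
}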
/// \brief If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -717,6 +902,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::CMP_SWAP_8:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_16:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
+ AArch64::SUBSWrx,
+ AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_32:
+ return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
+ AArch64::SUBSWrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::WZR, NextMBBI);
+ case AArch64::CMP_SWAP_64:
+ return expandCMP_SWAP(MBB, MBBI,
+ AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+ AArch64::XZR, NextMBBI);
+ case AArch64::CMP_SWAP_128:
+ return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
}
return false;
}
@@ -729,7 +936,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
- Modified |= expandMI(MBB, MBBI);
+ Modified |= expandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 0ac4b39b0357..e2ab7ab79be1 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -37,7 +37,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
namespace {
@@ -144,8 +143,8 @@ private:
bool computeCallAddress(const Value *V, Address &Addr);
bool simplifyAddress(Address &Addr, MVT VT);
void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
- unsigned Flags, unsigned ScaleFactor,
- MachineMemOperand *MMO);
+ MachineMemOperand::Flags Flags,
+ unsigned ScaleFactor, MachineMemOperand *MMO);
bool isMemCpySmall(uint64_t Len, unsigned Alignment);
bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
unsigned Alignment);
@@ -439,9 +438,6 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
.addReg(ADRPReg)
.addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
AArch64II::MO_NC);
- } else if (OpFlags & AArch64II::MO_CONSTPOOL) {
- // We can't handle addresses loaded from a constant pool quickly yet.
- return 0;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
@@ -555,10 +551,9 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
// Iterate through the GEP folding the constants into offsets where
// we can.
- gep_type_iterator GTI = gep_type_begin(U);
- for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
- ++i, ++GTI) {
- const Value *Op = *i;
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
@@ -947,10 +942,7 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const {
return true;
const auto *I = cast<Instruction>(V);
- if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
- return true;
-
- return false;
+ return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
}
bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
@@ -1048,7 +1040,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
void AArch64FastISel::addLoadStoreOperands(Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags,
+ MachineMemOperand::Flags Flags,
unsigned ScaleFactor,
MachineMemOperand *MMO) {
int64_t Offset = Addr.getOffset() / ScaleFactor;
@@ -1612,8 +1604,8 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
unsigned LHSReg, bool LHSIsKill,
uint64_t Imm) {
- assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
- "ISD nodes are not consecutive!");
+ static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+ "ISD nodes are not consecutive!");
static const unsigned OpcTable[3][2] = {
{ AArch64::ANDWri, AArch64::ANDXri },
{ AArch64::ORRWri, AArch64::ORRXri },
@@ -1659,8 +1651,8 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
unsigned LHSReg, bool LHSIsKill,
unsigned RHSReg, bool RHSIsKill,
uint64_t ShiftImm) {
- assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
- "ISD nodes are not consecutive!");
+ static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+ "ISD nodes are not consecutive!");
static const unsigned OpcTable[3][2] = {
{ AArch64::ANDWrs, AArch64::ANDXrs },
{ AArch64::ORRWrs, AArch64::ORRXrs },
@@ -1904,6 +1896,21 @@ bool AArch64FastISel::selectLoad(const Instruction *I) {
cast<LoadInst>(I)->isAtomic())
return false;
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// See if we can handle this address.
Address Addr;
if (!computeAddress(I->getOperand(0), Addr, I->getType()))
@@ -2068,6 +2075,21 @@ bool AArch64FastISel::selectStore(const Instruction *I) {
cast<StoreInst>(I)->isAtomic())
return false;
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Get the value to be stored into a register. Use the zero register directly
// when possible to avoid an unnecessary copy and a wasted register.
unsigned SrcReg = 0;
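The swifterror checks added to selectLoad and selectStore above have the same shape; a hypothetical helper capturing the bail-out condition (not part of the patch, using only the predicates the patch itself calls):

// Fast-isel gives up on loads/stores through a swifterror pointer and lets
// SelectionDAG handle them; the pointer can be a swifterror argument or a
// swifterror alloca.
static bool isSwiftErrorPointer(const Value *V) {
  if (const auto *Arg = dyn_cast<Argument>(V))
    return Arg->hasSwiftErrorAttr();
  if (const auto *Alloca = dyn_cast<AllocaInst>(V))
    return Alloca->isSwiftError();
  return false;
}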
@@ -2813,6 +2835,8 @@ bool AArch64FastISel::fastLowerArguments() {
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
@@ -3064,7 +3088,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
for (auto Flag : CLI.OutFlags)
- if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+ if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() ||
+ Flag.isSwiftSelf() || Flag.isSwiftError())
return false;
// Set up the argument vectors.
@@ -3646,6 +3671,10 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (F.isVarArg())
return false;
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
@@ -4814,18 +4843,18 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
// Keep a running tab of the total offset to coalesce multiple N = N + Offset
// into a single N = N + TotalOffset.
uint64_t TotalOffs = 0;
- Type *Ty = I->getOperand(0)->getType();
MVT VT = TLI.getPointerTy(DL);
- for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
- const Value *Idx = *OI;
- if (auto *StTy = dyn_cast<StructType>(Ty)) {
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (auto *StTy = dyn_cast<StructType>(*GTI)) {
unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
// N = N + Offset
if (Field)
TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
- Ty = StTy->getElementType(Field);
} else {
- Ty = cast<SequentialType>(Ty)->getElementType();
+ Type *Ty = GTI.getIndexedType();
+
// If this is a constant subscript, handle it quickly.
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero())
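Both GEP-walking hunks above switch to gep_type_iterator, which pairs each index operand with the type being indexed so the code no longer has to thread a separate Ty variable. An illustrative sketch under the same assumptions (hypothetical function, constant indices only, same APIs the patch uses):

// Accumulate the byte offset of a GEP whose indices are all constants.
static uint64_t constGEPOffset(const User *GEP, const DataLayout &DL) {
  uint64_t Offs = 0;
  for (gep_type_iterator GTI = gep_type_begin(GEP), E = gep_type_end(GEP);
       GTI != E; ++GTI) {
    const auto *CI = cast<ConstantInt>(GTI.getOperand());
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      // Struct field: offset comes from the struct layout.
      Offs += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
    } else {
      // Sequential type: the index scales by the element's allocation size.
      Offs += CI->getSExtValue() * DL.getTypeAllocSize(GTI.getIndexedType());
    }
  }
  return Offs;
}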
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3f63d049c34e..82111e5c7259 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -93,6 +93,7 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned NumBytes = AFI->getLocalStackSize();
- // Note: currently hasFP() is always true for hasCalls(), but that's an
- // implementation detail of the current code, not a strict requirement,
- // so stay safe here and check both.
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
- return false;
- return true;
+ return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128);
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
- MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF));
+ // Retain behavior of always omitting the FP for leaf functions when possible.
+ return (MFI->hasCalls() &&
+ MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+ MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+ MFI->hasStackMap() || MFI->hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF);
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo()->hasVarSizedObjects();
}
-void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
@@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
unsigned Align = getStackAlignment();
int64_t Amount = I->getOperand(0).getImm();
- Amount = RoundUpToAlignment(Amount, Align);
+ Amount = alignTo(Amount, Align);
if (!IsDestroy)
Amount = -Amount;
@@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
//
- // Mostly call frames will be allocated at the start of a function so
+ // Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
@@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
TII);
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout &TD = MF.getDataLayout();
- bool HasFP = hasFP(MF);
-
- // Calculate amount of bytes used for return address storing.
- int stackGrowth = -TD.getPointerSize(0);
-
- // Calculate offsets.
- int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
- unsigned TotalSkipped = 0;
for (const auto &Info : CSI) {
unsigned Reg = Info.getReg();
- int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
- getOffsetOfLocalArea() + saveAreaOffset;
-
- // Don't output a new CFI directive if we're re-saving the frame pointer or
- // link register. This happens when the PrologEpilogInserter has inserted an
- // extra "STP" of the frame pointer and link register -- the "emitPrologue"
- // method automatically generates the directives when frame pointers are
- // used. If we generate CFI directives for the extra "STP"s, the linker will
- // lose track of the correct values for the frame pointer and link register.
- if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
- TotalSkipped += stackGrowth;
- continue;
- }
-
+ int64_t Offset =
+ MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, DwarfReg, Offset - TotalSkipped));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
}
}
-/// Get FPOffset by analyzing the first instruction.
-static int getFPOffsetInPrologue(MachineInstr *MBBI) {
- // First instruction must a) allocate the stack and b) have an immediate
- // that is a multiple of -2.
- assert(((MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre) &&
- MBBI->getOperand(3).getReg() == AArch64::SP &&
- MBBI->getOperand(4).getImm() < 0 &&
- (MBBI->getOperand(4).getImm() & 1) == 0));
-
- // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
- // required for the callee saved register area we get the frame pointer
- // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
- int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
- assert(FPOffset >= 0 && "Bad Framepointer Offset");
- return FPOffset;
-}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+ MachineFunction *MF = MBB->getParent();
+
+ // If MBB is the entry block, use X9 as the scratch register.
+ if (&MF->front() == MBB)
+ return AArch64::X9;
+
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ LivePhysRegs LiveRegs(&TRI);
+ LiveRegs.addLiveIns(*MBB);
+
+ // Mark callee saved registers as used so we will not choose them.
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ // Prefer X9 since it was historically used for the prologue scratch reg.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ if (LiveRegs.available(MRI, AArch64::X9))
+ return AArch64::X9;
-static bool isCSSave(MachineInstr *MBBI) {
- return MBBI->getOpcode() == AArch64::STPXi ||
- MBBI->getOpcode() == AArch64::STPDi ||
- MBBI->getOpcode() == AArch64::STPXpre ||
- MBBI->getOpcode() == AArch64::STPDpre;
+ for (unsigned Reg : AArch64::GPR64RegClass) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ return AArch64::NoRegister;
}
bool AArch64FrameLowering::canUseAsPrologue(
const MachineBasicBlock &MBB) const {
const MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
- // Otherwise, we may need a scratch register to be available and we do not
- // support that for now.
- return !RegInfo->needsStackRealignment(*MF);
+ if (!RegInfo->needsStackRealignment(*MF))
+ return true;
+ // Otherwise, we can use any block as long as it has a scratch register
+ // available.
+ return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
+}
+
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+ MachineFunction &MF, unsigned StackBumpBytes) const {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+ if (AFI->getLocalStackSize() == 0)
+ return false;
+
+ // 512 is the maximum immediate for stp/ldp that will be used for
+ // callee-save save/restores
+ if (StackBumpBytes >= 512)
+ return false;
+
+ if (MFI->hasVarSizedObjects())
+ return false;
+
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+ // This isn't strictly necessary, but it simplifies things a bit since the
+ // current RedZone handling code assumes the SP is adjusted by the
+ // callee-save save/restore code.
+ if (canUseRedZone(MF))
+ return false;
+
+ return true;
+}
+
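shouldCombineCSRLocalStackBump above boils down to a handful of structural checks; a self-contained model of the decision (hypothetical, taking plain parameters instead of querying the MachineFunction):

// One SP bump is used only for small, simple frames, so the callee-save
// stp/ldp offsets stay encodable and the red-zone path keeps its assumptions.
static bool shouldCombineSPBumpModel(unsigned LocalStackSize,
                                     unsigned StackBumpBytes,
                                     bool HasVarSizedObjects,
                                     bool NeedsRealignment,
                                     bool CanUseRedZone) {
  if (LocalStackSize == 0)
    return false;                 // nothing to fold into the callee-save bump
  if (StackBumpBytes >= 512)
    return false;                 // keep the stp/ldp immediates in range
  if (HasVarSizedObjects || NeedsRealignment)
    return false;                 // layout is more involved in these cases
  return !CanUseRedZone;          // red-zone code assumes a separate CSR bump
}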
+// Convert a callee-save register save/restore instruction to do a stack
+// pointer decrement/increment to allocate/deallocate the callee-save stack
+// area, by converting the store/load to use its pre/post-increment version.
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+
+ unsigned NewOpc;
+ bool NewIsUnscaled = false;
+ switch (MBBI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected callee-save save/restore opcode!");
+ case AArch64::STPXi:
+ NewOpc = AArch64::STPXpre;
+ break;
+ case AArch64::STPDi:
+ NewOpc = AArch64::STPDpre;
+ break;
+ case AArch64::STRXui:
+ NewOpc = AArch64::STRXpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::STRDui:
+ NewOpc = AArch64::STRDpre;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDPXi:
+ NewOpc = AArch64::LDPXpost;
+ break;
+ case AArch64::LDPDi:
+ NewOpc = AArch64::LDPDpost;
+ break;
+ case AArch64::LDRXui:
+ NewOpc = AArch64::LDRXpost;
+ NewIsUnscaled = true;
+ break;
+ case AArch64::LDRDui:
+ NewOpc = AArch64::LDRDpost;
+ NewIsUnscaled = true;
+ break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ MIB.addReg(AArch64::SP, RegState::Define);
+
+ // Copy all operands other than the immediate offset.
+ unsigned OpndIdx = 0;
+ for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
+ ++OpndIdx)
+ MIB.addOperand(MBBI->getOperand(OpndIdx));
+
+ assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
+ "Unexpected immediate offset in first/last callee-save save/restore "
+ "instruction!");
+ assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ assert(CSStackSizeInc % 8 == 0);
+ int64_t CSStackSizeIncImm = CSStackSizeInc;
+ if (!NewIsUnscaled)
+ CSStackSizeIncImm /= 8;
+ MIB.addImm(CSStackSizeIncImm);
+
+ MIB.setMIFlags(MBBI->getFlags());
+ MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+ return std::prev(MBB.erase(MBBI));
+}
+
+// Fix up callee-save register save/restore instructions to take the combined
+// SP bump into account by adding the local stack size to the stack offsets.
+static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+ unsigned LocalStackSize) {
+ unsigned Opc = MI.getOpcode();
+ (void)Opc;
+ assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
+ Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
+ Opc == AArch64::LDPXi || Opc == AArch64::LDPDi ||
+ Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) &&
+ "Unexpected callee-save save/restore opcode!");
+
+ unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
+ assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
+ "Unexpected base register in callee-save save/restore instruction!");
+ // Last operand is immediate offset that needs fixing.
+ MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
+ // All generated opcodes have scaled offsets.
+ assert(LocalStackSize % 8 == 0);
+ OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
}
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
@@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
- // Label used to tie together the PROLOG_LABEL and the MachineMoves.
- MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-
+ if (!NumBytes)
+ return;
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
- if (NumBytes && !canUseRedZone(MF)) {
+ if (canUseRedZone(MF))
+ ++NumRedZoneFunctions;
+ else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
- } else if (NumBytes) {
- ++NumRedZoneFunctions;
}
-
return;
}
- // Only set up FP if we actually need to.
- int FPOffset = 0;
- if (HasFP)
- FPOffset = getFPOffsetInPrologue(MBBI);
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes - CSStackSize);
- // Move past the saves of the callee-saved registers.
- while (isCSSave(MBBI)) {
- ++MBBI;
- NumBytes -= 16;
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ NumBytes = 0;
+ } else if (CSStackSize != 0) {
+ MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
+ -CSStackSize);
+ NumBytes -= CSStackSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ // Move past the saves of the callee-saved registers, fixing up the offsets
+ // and pre-inc if we decided to combine the callee-save and local stack
+ // pointer bump above.
+ MachineBasicBlock::iterator End = MBB.end();
+ while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+ ++MBBI;
+ }
if (HasFP) {
+ // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
+ int FPOffset = CSStackSize - 16;
+ if (CombineSPBump)
+ FPOffset += AFI->getLocalStackSize();
+
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
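A quick numeric check of the FP setup above: the frame record (fp, lr) occupies the top 16 bytes of the callee-save area, so fp sits CSStackSize - 16 above the post-callee-save SP, plus the whole local area when the two bumps were combined. A tiny hypothetical model:

// Example: a 48-byte callee-save area gives fp = sp + 32; if a 64-byte local
// area was folded into the same SP bump, fp = sp + 96.
static int fpOffsetFromSP(int CSStackSize, int LocalStackSize,
                          bool CombineSPBump) {
  int FPOffset = CSStackSize - 16;   // frame record is the top pair of the area
  if (CombineSPBump)
    FPOffset += LocalStackSize;      // SP was moved once for CSRs and locals
  return FPOffset;
}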
@@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup);
}
- // All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes);
-
// Allocate space for the rest of the frame.
+ if (NumBytes) {
+ const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ unsigned scratchSPReg = AArch64::SP;
- const unsigned Alignment = MFI->getMaxAlignment();
- const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
- unsigned scratchSPReg = AArch64::SP;
- if (NumBytes && NeedsRealignment) {
- // Use the first callee-saved register as a scratch register.
- scratchSPReg = AArch64::X9;
- }
+ if (NeedsRealignment) {
+ scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(scratchSPReg != AArch64::NoRegister);
+ }
- // If we're a leaf function, try using the red zone.
- if (NumBytes && !canUseRedZone(MF))
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup);
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
- if (NumBytes && NeedsRealignment) {
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
- assert(NrBitsToZero > 1);
- assert(scratchSPReg != AArch64::SP);
-
- // SUB X9, SP, NumBytes
- // -- X9 is temporary register, so shouldn't contain any live data here,
- // -- free to use. This is already produced by emitFrameOffset above.
- // AND SP, X9, 0b11111...0000
- // The logical immediates have a non-trivial encoding. The following
- // formula computes the encoded immediate with all ones but
- // NrBitsToZero zero bits as least significant bits.
- uint32_t andMaskEncoded =
- (1 <<12) // = N
- | ((64-NrBitsToZero) << 6) // immr
- | ((64-NrBitsToZero-1) << 0) // imms
- ;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
- .addReg(scratchSPReg, RegState::Kill)
- .addImm(andMaskEncoded);
+ if (NeedsRealignment) {
+ const unsigned Alignment = MFI->getMaxAlignment();
+ const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ assert(NrBitsToZero > 1);
+ assert(scratchSPReg != AArch64::SP);
+
+ // SUB X9, SP, NumBytes
+ // -- X9 is temporary register, so shouldn't contain any live data here,
+ // -- free to use. This is already produced by emitFrameOffset above.
+ // AND SP, X9, 0b11111...0000
+ // The logical immediates have a non-trivial encoding. The following
+ // formula computes the encoded immediate with all ones but
+ // NrBitsToZero zero bits as least significant bits.
+ uint32_t andMaskEncoded = (1 << 12) // = N
+ | ((64 - NrBitsToZero) << 6) // immr
+ | ((64 - NrBitsToZero - 1) << 0); // imms
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+ .addReg(scratchSPReg, RegState::Kill)
+ .addImm(andMaskEncoded);
+ AFI->setStackRealigned(true);
+ }
}
// If we need a base pointer, set it up here. It's whatever the value of the
@@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored LR
- unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
-
- // Record the location of the stored FP
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
@@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
}
- // Now emit the moves for whatever callee saved regs we have.
- emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
- }
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-/// Checks whether the given instruction restores callee save registers
-/// and if so returns how many.
-static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) {
- unsigned RtIdx = 0;
- switch (MI.getOpcode()) {
- case AArch64::LDPXpost:
- case AArch64::LDPDpost:
- RtIdx = 1;
- // FALLTHROUGH
- case AArch64::LDPXi:
- case AArch64::LDPDi:
- if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) ||
- !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) ||
- MI.getOperand(RtIdx + 2).getReg() != AArch64::SP)
- return 0;
- return 2;
+ // Now emit the moves for whatever callee saved regs we have (including FP,
+ // LR if those are saved).
+ emitCalleeSavedFrameMoves(MBB, MBBI);
}
- return 0;
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool IsTailCallReturn = false;
@@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
- // | (NumRestores * 8) | | |
+ // | (CalleeSavedStackSize)| | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
@@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
//
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
- NumBytes += ArgumentPopSize;
- unsigned NumRestores = 0;
+ auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+
+ if (!CombineSPBump && CSStackSize != 0)
+ convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
--LastPopI;
- unsigned Restores = getNumCSRestores(*LastPopI, CSRegs);
- NumRestores += Restores;
- if (Restores == 0) {
+ if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
++LastPopI;
break;
- }
+ } else if (CombineSPBump)
+ fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+ }
+
+ // If there is a single SP update, insert it before the ret and we're done.
+ if (CombineSPBump) {
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ NumBytes + ArgumentPopSize, TII,
+ MachineInstr::FrameDestroy);
+ return;
}
- NumBytes -= NumRestores * 8;
+
+ NumBytes -= CSStackSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
+ bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
- // stack pointer.
- if (!canUseRedZone(MF))
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
- TII);
- return;
+ // stack pointer (but we may need to pop stack args for fastcc).
+ if (RedZone && ArgumentPopSize == 0)
+ return;
+
+ bool NoCalleeSaveRestore = CSStackSize == 0;
+ int StackRestoreBytes = RedZone ? 0 : NumBytes;
+ if (NoCalleeSaveRestore)
+ StackRestoreBytes += ArgumentPopSize;
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+ // If we were able to combine the local stack pop with the argument pop,
+ // then we're done.
+ if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+ return;
+ NumBytes = 0;
}
// Restore the original stack pointer.
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
- if (NumBytes || MFI->hasVarSizedObjects())
+ if (MFI->hasVarSizedObjects() || AFI->isStackRealigned())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags);
+ -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+ else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
+ MachineInstr::FrameDestroy);
+
+ // This must be placed after the callee-save restore code because that code
+ // assumes the SP is at the same location as it was after the callee-save save
+ // code in the prologue.
+ if (ArgumentPopSize)
+ emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ ArgumentPopSize, TII, MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
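In the epilogue's no-frame-pointer path above, the amount added back to SP depends on the red zone and on whether a callee-save restore exists to absorb the tail-call argument pop. A small hypothetical model of that arithmetic:

// With a red zone the locals need no restore; the argument pop is folded in
// only when there is no callee-save restore left to convert to a
// post-increment load.
static int spRestoreWithoutFP(int NumBytes, int ArgumentPopSize, bool RedZone,
                              bool HasCalleeSaveRestore) {
  int Restore = RedZone ? 0 : NumBytes;
  if (!HasCalleeSaveRestore)
    Restore += ArgumentPopSize;
  return Restore;
}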
@@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
- if (Reg != AArch64::LR)
- return getKillRegState(true);
+ // Do not set a kill flag on values that are also marked as live-in. This
+ // happens with the @llvm.returnaddress intrinsic and with arguments passed in
+ // callee-saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ return getKillRegState(!IsLiveIn);
+}
- // LR maybe referred to later by an @llvm.returnaddress intrinsic.
- bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
- bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
- return getKillRegState(LRKill);
+static bool produceCompactUnwindFrame(MachineFunction &MF) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ AttributeSet Attrs = MF.getFunction()->getAttributes();
+ return Subtarget.isTargetMachO() &&
+ !(Subtarget.getTargetLowering()->supportSwiftError() &&
+ Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
-bool AArch64FrameLowering::spillCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+struct RegPairInfo {
+ RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
+ unsigned Reg1;
+ unsigned Reg2;
+ int FrameIdx;
+ int Offset;
+ bool IsGPR;
+ bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+};
+
+static void computeCalleeSaveRegisterPairs(
+ MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+
+ if (CSI.empty())
+ return;
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
unsigned Count = CSI.size();
- DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ (void)CC;
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (Count & 1) == 0) &&
+ "Odd number of callee-saved regs to spill!");
+ unsigned Offset = AFI->getCalleeSavedStackSize();
+
+ for (unsigned i = 0; i < Count; ++i) {
+ RegPairInfo RPI;
+ RPI.Reg1 = CSI[i].getReg();
+
+ assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
+ AArch64::FPR64RegClass.contains(RPI.Reg1));
+ RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);
+
+ // Add the next reg to the pair if it is in the same register class.
+ if (i + 1 < Count) {
+ unsigned NextReg = CSI[i + 1].getReg();
+ if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
+ (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
+ RPI.Reg2 = NextReg;
+ }
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned idx = Count - i - 2;
- unsigned Reg1 = CSI[idx].getReg();
- unsigned Reg2 = CSI[idx + 1].getReg();
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
//
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
- assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+ assert((!RPI.isPaired() ||
+ (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
+
+ // MachO's compact unwind format relies on all registers being stored in
+ // adjacent register pairs.
+ assert((!produceCompactUnwindFrame(MF) ||
+ CC == CallingConv::PreserveMost ||
+ (RPI.isPaired() &&
+ ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
+ RPI.Reg1 + 1 == RPI.Reg2))) &&
+ "Callee-save registers not saved as adjacent register pair!");
+
+ RPI.FrameIdx = CSI[i].getFrameIdx();
+
+ if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
+ // Round up size of non-pair to pair size if we need to pad the
+ // callee-save area to ensure 16-byte alignment.
+ Offset -= 16;
+ assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16);
+ MFI->setObjectAlignment(RPI.FrameIdx, 16);
+ AFI->setCalleeSaveStackHasFreeSpace(true);
+ } else
+ Offset -= RPI.isPaired() ? 16 : 8;
+ assert(Offset % 8 == 0);
+ RPI.Offset = Offset / 8;
+ assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+ "Offset out of bounds for LDP/STP immediate");
+
+ RegPairs.push_back(RPI);
+ if (RPI.isPaired())
+ ++i;
+ }
+}
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ SmallVector<RegPairInfo, 8> RegPairs;
+
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- // Issue sequence of non-sp increment and pi sp spills for cs regs. The
- // first spill is a pre-increment that allocates the stack.
+
+ // Issue sequence of spills for cs regs. The first spill may be converted
+ // to a pre-decrement store later by emitPrologue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
- // stp x22, x21, [sp, #-48]! // addImm(-6)
+ // stp x22, x21, [sp, #0] // addImm(+0)
// stp x20, x19, [sp, #16] // addImm(+2)
// stp fp, lr, [sp, #32] // addImm(+4)
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
- // Note: Similar rational and sequence for restores in epilog.
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPXpre;
- else
- StrOpc = AArch64::STPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- // For first spill use pre-increment store.
- if (i == 0)
- StrOpc = AArch64::STPDpre;
- else
- StrOpc = AArch64::STPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
- << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
- // Compute offset: i = 0 => offset = -Count;
- // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
- const int Offset = (i == 0) ? -Count : i;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for STP immediate");
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
- MIB.addReg(AArch64::SP, RegState::Define);
+ // Note: Similar rationale and sequence for restores in epilog.
+ if (RPI.IsGPR)
+ StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
+ else
+ StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
+ DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
MBB.addLiveIn(Reg1);
- MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
- .addReg(Reg1, getPrologueDeath(MF, Reg1))
+ if (RPI.isPaired()) {
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOStore, 8, 8));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
- .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit
.setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOStore, 8, 8));
}
return true;
}
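computeCalleeSaveRegisterPairs starts Offset at the callee-save area size and walks it down by 16 per pair, so for a 48-byte area the pairs land at byte offsets 32, 16 and 0 — the same +4/+2/+0 scaled immediates as the stp sequence in the example comment above (which emits the pairs in reverse order). A hypothetical helper showing just that arithmetic for the all-paired case:

// Scaled stp/ldp immediate for the PairIdx-th pair in CSI order, assuming every
// callee-save register is paired. E.g. a 48-byte area gives 4, 2, 0.
static int csrPairScaledOffset(unsigned CalleeSavedStackSize, unsigned PairIdx) {
  unsigned ByteOffset = CalleeSavedStackSize - (PairIdx + 1) * 16;
  return ByteOffset / 8;   // stp/ldp scale the offset by 8 for X/D registers
}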
@@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- unsigned Count = CSI.size();
DebugLoc DL;
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ SmallVector<RegPairInfo, 8> RegPairs;
if (MI != MBB.end())
DL = MI->getDebugLoc();
- for (unsigned i = 0; i < Count; i += 2) {
- unsigned Reg1 = CSI[i].getReg();
- unsigned Reg2 = CSI[i + 1].getReg();
- // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
- // list to come in sorted by frame index so that we can issue the store
- // pair instructions directly. Assert if we see anything otherwise.
- assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
- "Out of order callee saved regs!");
- // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
- // the last load is sp-pi post-increment and de-allocates the stack:
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+ for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
+ ++RPII) {
+ RegPairInfo RPI = *RPII;
+ unsigned Reg1 = RPI.Reg1;
+ unsigned Reg2 = RPI.Reg2;
+
+ // Issue sequence of restores for cs regs. The last restore may be converted
+ // to a post-increment load later by emitEpilogue if the callee-save stack
+ // area allocation can't be combined with the local stack area allocation.
// For example:
// ldp fp, lr, [sp, #32] // addImm(+4)
// ldp x20, x19, [sp, #16] // addImm(+2)
- // ldp x22, x21, [sp], #48 // addImm(+6)
+ // ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
+ if (RPI.IsGPR)
+ LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
+ else
+ LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
+ DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1);
+ if (RPI.isPaired())
+ dbgs() << ", " << TRI->getName(Reg2);
+ dbgs() << ") -> fi#(" << RPI.FrameIdx;
+ if (RPI.isPaired())
+ dbgs() << ", " << RPI.FrameIdx+1;
+ dbgs() << ")\n");
- assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
- if (AArch64::GPR64RegClass.contains(Reg1)) {
- assert(AArch64::GPR64RegClass.contains(Reg2) &&
- "Expected GPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPXpost;
- else
- LdrOpc = AArch64::LDPXi;
- } else if (AArch64::FPR64RegClass.contains(Reg1)) {
- assert(AArch64::FPR64RegClass.contains(Reg2) &&
- "Expected FPR64 callee-saved register pair!");
- if (i == Count - 2)
- LdrOpc = AArch64::LDPDpost;
- else
- LdrOpc = AArch64::LDPDi;
- } else
- llvm_unreachable("Unexpected callee saved register!");
- DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
- << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
- << ", " << CSI[i + 1].getFrameIdx() << ")\n");
-
- // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
- // etc.
- const int Offset = (i == Count - 2) ? Count : Count - i - 2;
- assert((Offset >= -64 && Offset <= 63) &&
- "Offset out of bounds for LDP immediate");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
- if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
- MIB.addReg(AArch64::SP, RegState::Define);
-
- MIB.addReg(Reg2, getDefRegState(true))
- .addReg(Reg1, getDefRegState(true))
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+ MachineMemOperand::MOLoad, 8, 8));
+ }
+ MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
- .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
- // where the factor * 8 is implicit
+ .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+ MachineMemOperand::MOLoad, 8, 8));
}
return true;
}
@@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- SmallVector<unsigned, 4> UnspilledCSGPRs;
- SmallVector<unsigned, 4> UnspilledCSFPRs;
+ unsigned UnspilledCSGPR = AArch64::NoRegister;
+ unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
// The frame record needs to be created by saving the appropriate registers
if (hasFP(MF)) {
@@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(AArch64::LR);
}
- // Spill the BasePtr if it's used. Do this first thing so that the
- // getCalleeSavedRegs() below will get the right answer.
+ unsigned BasePointerReg = AArch64::NoRegister;
if (RegInfo->hasBasePointer(MF))
- SavedRegs.set(RegInfo->getBaseRegister());
-
- if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF))
- SavedRegs.set(AArch64::X9);
+ BasePointerReg = RegInfo->getBaseRegister();
- // If any callee-saved registers are used, the frame cannot be eliminated.
- unsigned NumGPRSpilled = 0;
- unsigned NumFPRSpilled = 0;
bool ExtraCSSpill = false;
- bool CanEliminateFrame = true;
- DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ // Figure out which callee-saved registers to save/restore.
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ const unsigned Reg = CSRegs[i];
+
+ // Add the base pointer register to SavedRegs if it is callee-save.
+ if (Reg == BasePointerReg)
+ SavedRegs.set(Reg);
- // Check pairs of consecutive callee-saved registers.
- for (unsigned i = 0; CSRegs[i]; i += 2) {
- assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
-
- const unsigned OddReg = CSRegs[i];
- const unsigned EvenReg = CSRegs[i + 1];
- assert((AArch64::GPR64RegClass.contains(OddReg) &&
- AArch64::GPR64RegClass.contains(EvenReg)) ^
- (AArch64::FPR64RegClass.contains(OddReg) &&
- AArch64::FPR64RegClass.contains(EvenReg)) &&
- "Register class mismatch!");
-
- const bool OddRegUsed = SavedRegs.test(OddReg);
- const bool EvenRegUsed = SavedRegs.test(EvenReg);
-
- // Early exit if none of the registers in the register pair is actually
- // used.
- if (!OddRegUsed && !EvenRegUsed) {
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- UnspilledCSGPRs.push_back(OddReg);
- UnspilledCSGPRs.push_back(EvenReg);
- } else {
- UnspilledCSFPRs.push_back(OddReg);
- UnspilledCSFPRs.push_back(EvenReg);
+ bool RegUsed = SavedRegs.test(Reg);
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (!RegUsed) {
+ if (AArch64::GPR64RegClass.contains(Reg) &&
+ !RegInfo->isReservedReg(MF, Reg)) {
+ UnspilledCSGPR = Reg;
+ UnspilledCSGPRPaired = PairedReg;
}
continue;
}
- unsigned Reg = AArch64::NoRegister;
- // If only one of the registers of the register pair is used, make sure to
- // mark the other one as used as well.
- if (OddRegUsed ^ EvenRegUsed) {
- // Find out which register is the additional spill.
- Reg = OddRegUsed ? EvenReg : OddReg;
- SavedRegs.set(Reg);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs.
+ // FIXME: the usual format is actually better if unwinding isn't needed.
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+ SavedRegs.set(PairedReg);
+ if (AArch64::GPR64RegClass.contains(PairedReg) &&
+ !RegInfo->isReservedReg(MF, PairedReg))
+ ExtraCSSpill = true;
}
+ }
- DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
- DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
-
- assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
- (RegInfo->getEncodingValue(OddReg) + 1 ==
- RegInfo->getEncodingValue(EvenReg))) &&
- "Register pair of non-adjacent registers!");
- if (AArch64::GPR64RegClass.contains(OddReg)) {
- NumGPRSpilled += 2;
- // If it's not a reserved register, we can use it in lieu of an
- // emergency spill slot for the register scavenger.
- // FIXME: It would be better to instead keep looking and choose another
- // unspilled register that isn't reserved, if there is one.
- if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- } else
- NumFPRSpilled += 2;
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ for (int Reg = SavedRegs.find_first(); Reg != -1;
+ Reg = SavedRegs.find_next(Reg))
+ dbgs() << ' ' << PrintReg(Reg, RegInfo);
+ dbgs() << "\n";);
- CanEliminateFrame = false;
- }
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumRegsSpilled = SavedRegs.count();
+ bool CanEliminateFrame = NumRegsSpilled == 0;
// FIXME: Set BigStack if any stack slot references may be out of range.
// For now, just conservatively guestimate based on unscaled indexing
@@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned CFSize =
- MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+ unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
bool BigStack = (CFSize >= 256);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
@@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// above to keep the number of spills even, we don't need to do anything else
// here.
if (BigStack && !ExtraCSSpill) {
-
- // If we're adding a register to spill here, we have to add two of them
- // to keep the number of regs to spill even.
- assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
- unsigned Count = 0;
- while (!UnspilledCSGPRs.empty() && Count < 2) {
- unsigned Reg = UnspilledCSGPRs.back();
- UnspilledCSGPRs.pop_back();
- DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
- << " to get a scratch register.\n");
- SavedRegs.set(Reg);
+ if (UnspilledCSGPR != AArch64::NoRegister) {
+ DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
+ << " to get a scratch register.\n");
+ SavedRegs.set(UnspilledCSGPR);
+ // MachO's compact unwind format relies on all registers being stored in
+ // pairs, so if we need to spill one extra for BigStack, then we need to
+ // store the pair.
+ if (produceCompactUnwindFrame(MF))
+ SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = true;
- ++Count;
+ NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
@@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
<< " as the emergency spill slot.\n");
}
}
+
+ // Round up to register pair alignment to avoid additional SP adjustment
+ // instructions.
+ AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+}
+
+bool AArch64FrameLowering::enableStackSlotScavenging(
+ const MachineFunction &MF) const {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasCalleeSaveStackFreeSpace();
}
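
determineCalleeSaves() above now records the callee-save area as alignTo(8 * NumRegsSpilled, 16): 8 bytes per saved register, rounded up to the 16-byte SP alignment so no extra SP adjustment is needed. When the register count is odd this leaves an unused 8-byte slot, which is presumably the free space that the new enableStackSlotScavenging() / hasCalleeSaveStackFreeSpace() hook exposes to the register scavenger. A minimal sketch of the arithmetic (illustrative only; alignTo here just mimics llvm::alignTo):

#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align must be non-zero).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  for (unsigned NumRegsSpilled = 0; NumRegsSpilled <= 5; ++NumRegsSpilled)
    std::printf("%u saved regs -> %llu-byte callee-save area\n",
                NumRegsSpilled,
                (unsigned long long)alignTo(8 * NumRegsSpilled, 16));
  return 0;
}
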
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 7d8354c38787..f254ea9b70aa 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -25,12 +25,11 @@ public:
true /*StackRealignable*/) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned FramePtr) const;
+ MachineBasicBlock::iterator MBBI) const;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -67,6 +66,12 @@ public:
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
+
+ bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+
+private:
+ bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
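
The AArch64ISelDAGToDAG.cpp changes below repeatedly translate a bitfield insert described as (#lsb, #width) into the underlying BFM immediates via ImmR = (BitWidth - LSB) % BitWidth and ImmS = Width - 1. A minimal standalone sketch of that translation (illustrative names, not LLVM code):

#include <cassert>
#include <cstdio>

struct BFMImms { unsigned ImmR, ImmS; };

// Translate "insert Width bits at bit LSB of a BitWidth-wide register"
// into BFM's rotate/size immediates.
static BFMImms toBFM(unsigned BitWidth, unsigned LSB, unsigned Width) {
  assert((BitWidth == 32 || BitWidth == 64) && Width >= 1 &&
         LSB + Width <= BitWidth && "bad bitfield parameters");
  return {(BitWidth - LSB) % BitWidth, Width - 1};
}

int main() {
  BFMImms BFI = toBFM(32, 8, 4);  // BFI w0, w1, #8, #4   -> BFM w0, w1, #24, #3
  BFMImms BFX = toBFM(64, 0, 16); // BFXIL x0, x1, #0, #16 -> BFM x0, x1, #0, #15
  std::printf("BFI:   ImmR=%u ImmS=%u\n", BFI.ImmR, BFI.ImmS);
  std::printf("BFXIL: ImmR=%u ImmS=%u\n", BFX.ImmR, BFX.ImmS);
  return 0;
}

The same translation shows up in tryBitfieldInsertOpFromOrAndImm and tryBitfieldInsertOpFromOr below; when LSB is 0 the result is the BFXIL form, otherwise BFI.
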
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6c868880bcac..8d649250f656 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -57,7 +57,7 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
- SDNode *Select(SDNode *Node) override;
+ void Select(SDNode *Node) override;
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -65,8 +65,8 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
- SDNode *SelectMLAV64LaneV128(SDNode *N);
- SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+ bool tryMLAV64LaneV128(SDNode *N);
+ bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
@@ -147,28 +147,29 @@ public:
SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
const unsigned SubRegs[]);
- SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+ void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
- SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+ bool tryIndexedLoad(SDNode *N);
- SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
- SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
- SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectBitfieldExtractOp(SDNode *N);
- SDNode *SelectBitfieldInsertOp(SDNode *N);
- SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);
+ bool tryBitfieldExtractOp(SDNode *N);
+ bool tryBitfieldExtractOpFromSExt(SDNode *N);
+ bool tryBitfieldInsertOp(SDNode *N);
+ bool tryBitfieldInsertInZeroOp(SDNode *N);
- SDNode *SelectReadRegister(SDNode *N);
- SDNode *SelectWriteRegister(SDNode *N);
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AArch64GenDAGISel.inc"
@@ -198,6 +199,9 @@ private:
}
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
+
+ void SelectCMP_SWAP(SDNode *N);
+
};
} // end anonymous namespace
@@ -328,9 +332,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// it hurts if the value is used at least twice, unless we are optimizing
// for code size.
- if (ForCodeSize || V.hasOneUse())
- return true;
- return false;
+ return ForCodeSize || V.hasOneUse();
}
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
@@ -452,7 +454,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
/// is a lane in the upper half of a 128-bit vector. Recognize and select this
/// so that we don't emit unnecessary lane extracts.
-SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -467,7 +469,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
LaneIdx))
- return nullptr;
+ return false;
}
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
@@ -493,10 +495,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
break;
}
- return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
+ return true;
}
-SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
SDLoc dl(N);
SDValue SMULLOp0;
SDValue SMULLOp1;
@@ -504,7 +507,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
LaneIdx))
- return nullptr;
+ return false;
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
@@ -537,7 +540,8 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
} else
llvm_unreachable("Unrecognized intrinsic.");
- return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
+ return true;
}
/// Instructions that accept extend modifiers like UXTW expect the register
@@ -610,7 +614,7 @@ static bool isWorthFoldingADDlow(SDValue N) {
// ldar and stlr have much more restrictive addressing modes (just a
// register).
- if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+ if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
return false;
}
@@ -687,7 +691,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
const GlobalValue *GV = GAN->getGlobal();
unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getType()->getElementType();
+ Type *Ty = GV->getValueType();
if (Alignment == 0 && Ty->isSized())
Alignment = DL.getABITypeAlignment(Ty);
@@ -797,10 +801,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- if (isWorthFolding(N))
- return true;
-
- return false;
+ return isWorthFolding(N);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1015,8 +1016,8 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
return SDValue(N, 0);
}
-SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
- unsigned Opc, bool isExt) {
+void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
+ bool isExt) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -1033,13 +1034,13 @@ SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
Ops.push_back(N->getOperand(1));
Ops.push_back(RegSeq);
Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
}
-SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isUnindexed())
- return nullptr;
+ return false;
EVT VT = LD->getMemoryVT();
EVT DstVT = N->getValueType(0);
ISD::MemIndexedMode AM = LD->getAddressingMode();
@@ -1101,7 +1102,7 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
} else if (VT.is128BitVector()) {
Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
} else
- return nullptr;
+ return false;
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
@@ -1112,7 +1113,6 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
// Either way, we're replacing the node, so tell the caller that.
- Done = true;
SDValue LoadedVal = SDValue(Res, 1);
if (InsertTo64) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
@@ -1127,12 +1127,12 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
ReplaceUses(SDValue(N, 0), LoadedVal);
ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
-
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return true;
}
-SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
- unsigned Opc, unsigned SubRegIdx) {
+void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
@@ -1149,11 +1149,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
- unsigned Opc, unsigned SubRegIdx) {
+void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+ unsigned Opc, unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
@@ -1181,11 +1181,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
// Update the chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
@@ -1197,11 +1197,11 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
- return St;
+ ReplaceNode(N, St);
}
-SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
@@ -1218,7 +1218,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
N->getOperand(0)}; // Chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- return St;
+ ReplaceNode(N, St);
}
namespace {
@@ -1256,8 +1256,8 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
V128Reg);
}
-SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1292,12 +1292,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
}
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
-
- return Ld;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1348,12 +1347,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
// Update the Chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
-
- return Ld;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1379,11 +1377,11 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
}
-SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
- unsigned Opc) {
+void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
@@ -1414,7 +1412,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
}
static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
@@ -1441,25 +1439,25 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// form these situations when matching bigger pattern (bitfield insert).
// For unsigned extracts, check for a shift right and mask
- uint64_t And_imm = 0;
- if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+ uint64_t AndImm = 0;
+ if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
return false;
const SDNode *Op0 = N->getOperand(0).getNode();
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
// simplified. Try to undo that
- And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+ AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
- if (And_imm & (And_imm + 1))
+ if (AndImm & (AndImm + 1))
return false;
bool ClampMSB = false;
- uint64_t Srl_imm = 0;
+ uint64_t SrlImm = 0;
// Handle the SRL + ANY_EXTEND case.
if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
- isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
// Extend the incoming operand of the SRL to 64-bit.
Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
// Make sure to clamp the MSB so that we preserve the semantics of the
@@ -1467,13 +1465,13 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
ClampMSB = true;
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
- Srl_imm)) {
+ SrlImm)) {
// If the shift result was truncated, we can still combine them.
Opd0 = Op0->getOperand(0).getOperand(0);
// Use the type of SRL node.
VT = Opd0->getValueType(0);
- } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+ } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
Opd0 = Op0->getOperand(0);
} else if (BiggerPattern) {
// Let's pretend a 0 shift right has been performed.
@@ -1487,15 +1485,15 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
- if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) {
+ if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
DEBUG((dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
- LSB = Srl_imm;
- MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm)
- : countTrailingOnes<uint64_t>(And_imm)) -
+ LSB = SrlImm;
+ MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
+ : countTrailingOnes<uint64_t>(AndImm)) -
1;
if (ClampMSB)
// Since we're moving the extend before the right shift operation, we need
@@ -1508,6 +1506,39 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
return true;
}
+static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &Immr,
+ unsigned &Imms) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT VT = N->getValueType(0);
+ unsigned BitWidth = VT.getSizeInBits();
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ SDValue Op = N->getOperand(0);
+ if (Op->getOpcode() == ISD::TRUNCATE) {
+ Op = Op->getOperand(0);
+ VT = Op->getValueType(0);
+ BitWidth = VT.getSizeInBits();
+ }
+
+ uint64_t ShiftImm;
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
+ !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ if (ShiftImm + Width > BitWidth)
+ return false;
+
+ Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
+ Opd0 = Op.getOperand(0);
+ Immr = ShiftImm;
+ Imms = ShiftImm + Width - 1;
+ return true;
+}
+
static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &LSB,
unsigned &MSB) {
@@ -1522,32 +1553,32 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
//
// This gets selected into a single UBFM:
//
- // UBFM Value, ShiftImm, BitWide + Srl_imm -1
+ // UBFM Value, ShiftImm, BitWide + SrlImm -1
//
if (N->getOpcode() != ISD::SRL)
return false;
- uint64_t And_mask = 0;
- if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+ uint64_t AndMask = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
return false;
Opd0 = N->getOperand(0).getOperand(0);
- uint64_t Srl_imm = 0;
- if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
// Check whether we really have several bits extract here.
- unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm));
- if (BitWide && isMask_64(And_mask >> Srl_imm)) {
+ unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
+ if (BitWide && isMask_64(AndMask >> SrlImm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
else
Opc = AArch64::UBFMXri;
- LSB = Srl_imm;
- MSB = BitWide + Srl_imm - 1;
+ LSB = SrlImm;
+ MSB = BitWide + SrlImm - 1;
return true;
}
@@ -1572,10 +1603,10 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
return true;
- // we're looking for a shift of a shift
- uint64_t Shl_imm = 0;
- uint64_t Trunc_bits = 0;
- if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ // We're looking for a shift of a shift.
+ uint64_t ShlImm = 0;
+ uint64_t TruncBits = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
Opd0 = N->getOperand(0).getOperand(0);
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
@@ -1584,7 +1615,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// always generate 64bit UBFM. This consistency will help the CSE pass
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
- Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+ TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
VT = Opd0->getValueType(0);
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
@@ -1597,21 +1628,21 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// Missing combines/constant folding may have left us with strange
// constants.
- if (Shl_imm >= VT.getSizeInBits()) {
+ if (ShlImm >= VT.getSizeInBits()) {
DEBUG((dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
}
- uint64_t Srl_imm = 0;
- if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ uint64_t SrlImm = 0;
+ if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
- assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+ assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
"bad amount in shift node!");
- int immr = Srl_imm - Shl_imm;
+ int immr = SrlImm - ShlImm;
Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
- Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1;
+ Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
// SRA requires a signed extraction
if (VT == MVT::i32)
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
@@ -1620,6 +1651,30 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
return true;
}
+bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND);
+
+ EVT VT = N->getValueType(0);
+ EVT NarrowVT = N->getOperand(0)->getValueType(0);
+ if (VT != MVT::i64 || NarrowVT != MVT::i32)
+ return false;
+
+ uint64_t ShiftImm;
+ SDValue Op = N->getOperand(0);
+ if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+ return false;
+
+ SDLoc dl(N);
+ // Extend the incoming operand of the shift to 64-bits.
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
+ unsigned Immr = ShiftImm;
+ unsigned Imms = NarrowVT.getSizeInBits() - 1;
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+ CurDAG->getTargetConstant(Imms, dl, VT)};
+ CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
+ return true;
+}
+
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
@@ -1638,6 +1693,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
case ISD::SRL:
case ISD::SRA:
return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
+
+ case ISD::SIGN_EXTEND_INREG:
+ return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
}
unsigned NOpc = N->getMachineOpcode();
@@ -1658,11 +1716,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
return false;
}
-SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
unsigned Opc, Immr, Imms;
SDValue Opd0;
if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
- return nullptr;
+ return false;
EVT VT = N->getValueType(0);
SDLoc dl(N);
@@ -1675,22 +1733,22 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
- MachineSDNode *Node =
- CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32,
- SDValue(BFM, 0), SubReg);
- return Node;
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+ MVT::i32, SDValue(BFM, 0), SubReg));
+ return true;
}
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
}
/// Does DstMask form a complementary pair with the mask provided by
/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
/// this asks whether DstMask zeroes precisely those bits that will be set by
/// the other half.
-static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
unsigned NumberOfIgnoredHighBits, EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) &&
"i32 or i64 mask type expected!");
@@ -1851,6 +1909,20 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
case AArch64::BFMWri:
case AArch64::BFMXri:
return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ if (UserNode->getOperand(0) != Orig)
+ return;
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
+ return;
+
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ if (UserNode->getOperand(0) != Orig)
+ return;
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
+ return;
}
}
@@ -1963,36 +2035,129 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
return true;
}
-// Given a OR operation, check if we have the following pattern
-// ubfm c, b, imm, imm2 (or something that does the same jobs, see
-// isBitfieldExtractOp)
-// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
-// countTrailingZeros(mask2) == imm2 - imm + 1
-// f = d | c
-// if yes, given reference arguments will be update so that one can replace
-// the OR instruction with:
-// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
-static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
- SDValue &Src, unsigned &ImmR,
- unsigned &ImmS, const APInt &UsefulBits,
- SelectionDAG *CurDAG) {
+static bool isShiftedMask(uint64_t Mask, EVT VT) {
+ assert(VT == MVT::i32 || VT == MVT::i64);
+ if (VT == MVT::i32)
+ return isShiftedMask_32(Mask);
+ return isShiftedMask_64(Mask);
+}
+
+// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
+// inserted only sets known zero bits.
+static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
- // Set Opc
EVT VT = N->getValueType(0);
- if (VT == MVT::i32)
- Opc = AArch64::BFMWri;
- else if (VT == MVT::i64)
- Opc = AArch64::BFMXri;
- else
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ unsigned BitWidth = VT.getSizeInBits();
+
+ uint64_t OrImm;
+ if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
+ return false;
+
+ // Skip this transformation if the ORR immediate can be encoded in the ORR.
+ // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
+ // performance neutral.
+ if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
return false;
+ uint64_t MaskImm;
+ SDValue And = N->getOperand(0);
+ // Must be a single use AND with an immediate operand.
+ if (!And.hasOneUse() ||
+ !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
+ return false;
+
+ // Compute the Known Zero for the AND as this allows us to catch more general
+ // cases than just looking for AND with imm.
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+
+ // Non-zero in the sense that they're not provably zero, which is the key
+ // point if we want to use this value.
+ uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+
+ // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
+ if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+ return false;
+
+ // The bits being inserted must only set those bits that are known to be zero.
+ if ((OrImm & NotKnownZero) != 0) {
+ // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
+ // currently handle this case.
+ return false;
+ }
+
+ // BFI/BFXIL dst, src, #lsb, #width.
+ int LSB = countTrailingOnes(NotKnownZero);
+ int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
+
+ // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
+ unsigned ImmR = (BitWidth - LSB) % BitWidth;
+ unsigned ImmS = Width - 1;
+
+ // If we're creating a BFI instruction, avoid cases where we need more
+ // instructions to materialize the BFI constant as compared to the original
+ // ORR. A BFXIL will use the same constant as the original ORR, so the code
+ // should be no worse in this case.
+ bool IsBFI = LSB != 0;
+ uint64_t BFIImm = OrImm >> LSB;
+ if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
+ // We have a BFI instruction, and we know the constant can't be materialized
+ // with an ORR-immediate whose source is the zero register.
+ unsigned OrChunks = 0, BFIChunks = 0;
+ for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
+ if (((OrImm >> Shift) & 0xFFFF) != 0)
+ ++OrChunks;
+ if (((BFIImm >> Shift) & 0xFFFF) != 0)
+ ++BFIChunks;
+ }
+ if (BFIChunks > OrChunks)
+ return false;
+ }
+
+ // Materialize the constant to be inserted.
+ SDLoc DL(N);
+ unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+ SDNode *MOVI = CurDAG->getMachineNode(
+ MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
+
+ // Create the BFI/BFXIL instruction.
+ SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
+ CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+}
+
+static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
+ SelectionDAG *CurDAG) {
+ assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ unsigned BitWidth = VT.getSizeInBits();
+
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+ // Given an OR operation, check if we have the following pattern:
+ // ubfm c, b, imm, imm2 (or something that does the same job, see
+ // isBitfieldExtractOp)
+ // d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
+ // countTrailingZeros(mask2) == imm2 - imm + 1
+ // f = d | c
+ // If so, replace the OR instruction with:
+ // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm and MSB = imm2
+
// OR is commutative, check all combinations of operand order and values of
// BiggerPattern, i.e.
// Opd0, Opd1, BiggerPattern=false
@@ -2004,8 +2169,11 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// and/or inserting fewer extra instructions.
for (int I = 0; I < 4; ++I) {
+ SDValue Dst, Src;
+ unsigned ImmR, ImmS;
bool BiggerPattern = I / 2;
- SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+ SDValue OrOpd0Val = N->getOperand(I % 2);
+ SDNode *OrOpd0 = OrOpd0Val.getNode();
SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
SDNode *OrOpd1 = OrOpd1Val.getNode();
@@ -2030,10 +2198,10 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
- } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+ } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
BiggerPattern,
Src, DstLSB, Width)) {
- ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+ ImmR = (BitWidth - DstLSB) % BitWidth;
ImmS = Width - 1;
} else
continue;
@@ -2069,60 +2237,98 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
Dst = OrOpd1Val;
// both parts match
+ SDLoc DL(N);
+ SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
+ }
+
+ // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
+ // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
+ // mask (e.g., 0x000ffff0).
+ uint64_t Mask0Imm, Mask1Imm;
+ SDValue And0 = N->getOperand(0);
+ SDValue And1 = N->getOperand(1);
+ if (And0.hasOneUse() && And1.hasOneUse() &&
+ isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
+ isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
+ APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
+ (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
+
+ // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
+ // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
+ // bits to be inserted.
+ if (isShiftedMask(Mask0Imm, VT)) {
+ std::swap(And0, And1);
+ std::swap(Mask0Imm, Mask1Imm);
+ }
+
+ SDValue Src = And1->getOperand(0);
+ SDValue Dst = And0->getOperand(0);
+ unsigned LSB = countTrailingZeros(Mask1Imm);
+ int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
+
+ // The BFXIL inserts the low-order bits from a source register, so right
+ // shift the needed bits into place.
+ SDLoc DL(N);
+ unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ SDNode *LSR = CurDAG->getMachineNode(
+ ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+ CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
+
+ // BFXIL is an alias of BFM, so translate to BFM operands.
+ unsigned ImmR = (BitWidth - LSB) % BitWidth;
+ unsigned ImmS = Width - 1;
+
+ // Create the BFXIL instruction.
+ SDValue Ops[] = {Dst, SDValue(LSR, 0),
+ CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
}
return false;
}
-SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
if (N->getOpcode() != ISD::OR)
- return nullptr;
+ return false;
- unsigned Opc;
- unsigned LSB, MSB;
- SDValue Opd0, Opd1;
- EVT VT = N->getValueType(0);
APInt NUsefulBits;
getUsefulBits(SDValue(N, 0), NUsefulBits);
// If all bits are not useful, just return UNDEF.
- if (!NUsefulBits)
- return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
+ if (!NUsefulBits) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+ return true;
+ }
- if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
- CurDAG))
- return nullptr;
+ if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
+ return true;
- SDLoc dl(N);
- SDValue Ops[] = { Opd0,
- Opd1,
- CurDAG->getTargetConstant(LSB, dl, VT),
- CurDAG->getTargetConstant(MSB, dl, VT) };
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
}
/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
/// equivalent of a left shift by a constant amount followed by an and masking
/// out a contiguous set of bits.
-SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
if (N->getOpcode() != ISD::AND)
- return nullptr;
+ return false;
EVT VT = N->getValueType(0);
- unsigned Opc;
- if (VT == MVT::i32)
- Opc = AArch64::UBFMWri;
- else if (VT == MVT::i64)
- Opc = AArch64::UBFMXri;
- else
- return nullptr;
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
SDValue Op0;
int DstLSB, Width;
if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
Op0, DstLSB, Width))
- return nullptr;
+ return false;
// ImmR is the rotate right amount.
unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
@@ -2132,7 +2338,9 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
SDLoc DL(N);
SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+ return true;
}
bool
@@ -2214,62 +2422,68 @@ static int getIntOperandFromRegisterString(StringRef RegString) {
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
-SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) {
+bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
- if (Reg != -1)
- return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
- MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(0));
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(0)));
+ return true;
+ }
// Use the sysreg mapper to map the remaining possible strings to the
// value for the register to be used for the instruction operand.
- AArch64SysReg::MRSMapper mapper;
- bool IsValidSpecialReg;
- Reg = mapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
- if (IsValidSpecialReg)
- return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
- MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(0));
+ auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+ if (TheReg && TheReg->Readable &&
+ TheReg->haveFeatures(Subtarget->getFeatureBits()))
+ Reg = TheReg->Encoding;
+ else
+ Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(0)));
+ return true;
+ }
- return nullptr;
+ return false;
}
// Lower the write_register intrinsic to an MSR instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
-SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
+bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
- if (Reg != -1)
- return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+ if (Reg != -1) {
+ ReplaceNode(
+ N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(2), N->getOperand(0));
+ N->getOperand(2), N->getOperand(0)));
+ return true;
+ }
// Check if the register was one of those allowed as the pstatefield value in
// the MSR (immediate) instruction. To accept the values allowed in the
// pstatefield for the MSR (immediate) instruction, we also require that an
// immediate value has been provided as an argument, we know that this is
// the case as it has been ensured by semantic checking.
- AArch64PState::PStateMapper PMapper;
- bool IsValidSpecialReg;
- Reg = PMapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
- if (IsValidSpecialReg) {
+ auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
+ if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
+ unsigned Reg = PMapper->Encoding;
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
unsigned State;
if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
@@ -2279,29 +2493,66 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
assert(Immed < 16 && "Bad imm");
State = AArch64::MSRpstateImm4;
}
- return CurDAG->getMachineNode(State, DL, MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- CurDAG->getTargetConstant(Immed, DL, MVT::i16),
- N->getOperand(0));
+ ReplaceNode(N, CurDAG->getMachineNode(
+ State, DL, MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+ N->getOperand(0)));
+ return true;
}
// Use the sysreg mapper to attempt to map the remaining possible strings
// to the value for the register to be used for the MSR (register)
// instruction operand.
- AArch64SysReg::MSRMapper Mapper;
- Reg = Mapper.fromString(RegString->getString(),
- Subtarget->getFeatureBits(),
- IsValidSpecialReg);
+ auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+ if (TheReg && TheReg->Writeable &&
+ TheReg->haveFeatures(Subtarget->getFeatureBits()))
+ Reg = TheReg->Encoding;
+ else
+ Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+ if (Reg != -1) {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::MSR, DL, MVT::Other,
+ CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+ N->getOperand(2), N->getOperand(0)));
+ return true;
+ }
- if (IsValidSpecialReg)
- return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
- CurDAG->getTargetConstant(Reg, DL, MVT::i32),
- N->getOperand(2), N->getOperand(0));
+ return false;
+}
+
+/// We've got special pseudo-instructions for these (ATOMIC_CMP_SWAP nodes).
+void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+ unsigned Opcode;
+ EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+ if (MemTy == MVT::i8)
+ Opcode = AArch64::CMP_SWAP_8;
+ else if (MemTy == MVT::i16)
+ Opcode = AArch64::CMP_SWAP_16;
+ else if (MemTy == MVT::i32)
+ Opcode = AArch64::CMP_SWAP_32;
+ else if (MemTy == MVT::i64)
+ Opcode = AArch64::CMP_SWAP_64;
+ else
+ llvm_unreachable("Unknown AtomicCmpSwap type");
- return nullptr;
+ MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
+ SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ N->getOperand(0)};
+ SDNode *CmpSwap = CurDAG->getMachineNode(
+ Opcode, SDLoc(N),
+ CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+void AArch64DAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(errs() << "Selecting: ");
DEBUG(Node->dump(CurDAG));
@@ -2311,54 +2562,61 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
// Few custom selection stuff.
- SDNode *ResNode = nullptr;
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
default:
break;
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(Node);
+ return;
+
case ISD::READ_REGISTER:
- if (SDNode *Res = SelectReadRegister(Node))
- return Res;
+ if (tryReadRegister(Node))
+ return;
break;
case ISD::WRITE_REGISTER:
- if (SDNode *Res = SelectWriteRegister(Node))
- return Res;
+ if (tryWriteRegister(Node))
+ return;
break;
case ISD::ADD:
- if (SDNode *I = SelectMLAV64LaneV128(Node))
- return I;
+ if (tryMLAV64LaneV128(Node))
+ return;
break;
case ISD::LOAD: {
// Try to select as an indexed load. Fall through to normal processing
// if we can't.
- bool Done = false;
- SDNode *I = SelectIndexedLoad(Node, Done);
- if (Done)
- return I;
+ if (tryIndexedLoad(Node))
+ return;
break;
}
case ISD::SRL:
case ISD::AND:
case ISD::SRA:
- if (SDNode *I = SelectBitfieldExtractOp(Node))
- return I;
- if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
- return I;
+ case ISD::SIGN_EXTEND_INREG:
+ if (tryBitfieldExtractOp(Node))
+ return;
+ if (tryBitfieldInsertInZeroOp(Node))
+ return;
+ break;
+
+ case ISD::SIGN_EXTEND:
+ if (tryBitfieldExtractOpFromSExt(Node))
+ return;
break;
case ISD::OR:
- if (SDNode *I = SelectBitfieldInsertOp(Node))
- return I;
+ if (tryBitfieldInsertOp(Node))
+ return;
break;
case ISD::EXTRACT_VECTOR_ELT: {
@@ -2401,19 +2659,25 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
DEBUG(Extract->dumpr(CurDAG));
DEBUG(dbgs() << "\n");
- return Extract.getNode();
+ ReplaceNode(Node, Extract.getNode());
+ return;
}
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
if (ConstNode->isNullValue()) {
- if (VT == MVT::i32)
- return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
- AArch64::WZR, MVT::i32).getNode();
- else if (VT == MVT::i64)
- return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
- AArch64::XZR, MVT::i64).getNode();
+ if (VT == MVT::i32) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
+ ReplaceNode(Node, New.getNode());
+ return;
+ } else if (VT == MVT::i64) {
+ SDValue New = CurDAG->getCopyFromReg(
+ CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
+ ReplaceNode(Node, New.getNode());
+ return;
+ }
}
break;
}
@@ -2428,7 +2692,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
SDLoc DL(Node);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
- return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+ return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
@@ -2450,7 +2715,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
- return Ld;
+ ReplaceNode(Node, Ld);
+ return;
}
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp: {
@@ -2471,208 +2737,305 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(Node, St);
+ return;
}
case Intrinsic::aarch64_neon_ld1x2:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld1x3:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld1x4:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4r:
- if (VT == MVT::v8i8)
- return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld2lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 2, AArch64::LD2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 2, AArch64::LD2i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 2, AArch64::LD2i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 2, AArch64::LD2i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 2, AArch64::LD2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 2, AArch64::LD2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 2, AArch64::LD2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 2, AArch64::LD2i64);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld3lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 3, AArch64::LD3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 3, AArch64::LD3i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 3, AArch64::LD3i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 3, AArch64::LD3i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 3, AArch64::LD3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 3, AArch64::LD3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 3, AArch64::LD3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 3, AArch64::LD3i64);
+ return;
+ }
break;
case Intrinsic::aarch64_neon_ld4lane:
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectLoadLane(Node, 4, AArch64::LD4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectLoadLane(Node, 4, AArch64::LD4i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectLoadLane(Node, 4, AArch64::LD4i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectLoadLane(Node, 4, AArch64::LD4i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectLoadLane(Node, 4, AArch64::LD4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectLoadLane(Node, 4, AArch64::LD4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectLoadLane(Node, 4, AArch64::LD4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectLoadLane(Node, 4, AArch64::LD4i64);
+ return;
+ }
break;
}
} break;
@@ -2682,33 +3045,39 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
default:
break;
case Intrinsic::aarch64_neon_tbl2:
- return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
- : AArch64::TBLv16i8Two,
- false);
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbl3:
- return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
- : AArch64::TBLv16i8Three,
- false);
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+ : AArch64::TBLv16i8Three,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbl4:
- return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
- : AArch64::TBLv16i8Four,
- false);
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+ : AArch64::TBLv16i8Four,
+ false);
+ return;
case Intrinsic::aarch64_neon_tbx2:
- return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
- : AArch64::TBXv16i8Two,
- true);
+ SelectTable(Node, 2,
+ VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
+ true);
+ return;
case Intrinsic::aarch64_neon_tbx3:
- return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
- : AArch64::TBXv16i8Three,
- true);
+ SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+ : AArch64::TBXv16i8Three,
+ true);
+ return;
case Intrinsic::aarch64_neon_tbx4:
- return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
- : AArch64::TBXv16i8Four,
- true);
+ SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+ : AArch64::TBXv16i8Four,
+ true);
+ return;
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
- if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
- return N;
+ if (tryMULLV64LaneV128(IntNo, Node))
+ return;
break;
}
break;
@@ -2721,588 +3090,827 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
default:
break;
case Intrinsic::aarch64_neon_st1x2: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 2, AArch64::ST1Twov8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 2, AArch64::ST1Twov16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 2, AArch64::ST1Twov4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 2, AArch64::ST1Twov8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 2, AArch64::ST1Twov2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 2, AArch64::ST1Twov4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 2, AArch64::ST1Twov2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST1Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST1Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST1Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st1x3: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 3, AArch64::ST1Threev8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 3, AArch64::ST1Threev16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 3, AArch64::ST1Threev4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 3, AArch64::ST1Threev8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 3, AArch64::ST1Threev2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 3, AArch64::ST1Threev4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 3, AArch64::ST1Threev2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST1Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST1Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST1Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st1x4: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 4, AArch64::ST1Fourv8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 4, AArch64::ST1Fourv16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 4, AArch64::ST1Fourv4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 4, AArch64::ST1Fourv8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 4, AArch64::ST1Fourv2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 4, AArch64::ST1Fourv4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST1Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST1Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST1Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st2: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 2, AArch64::ST2Twov8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 2, AArch64::ST2Twov16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 2, AArch64::ST2Twov4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 2, AArch64::ST2Twov8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 2, AArch64::ST2Twov2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 2, AArch64::ST2Twov4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 2, AArch64::ST2Twov2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 2, AArch64::ST1Twov1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 2, AArch64::ST2Twov16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 2, AArch64::ST2Twov8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 2, AArch64::ST2Twov4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 2, AArch64::ST2Twov2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 2, AArch64::ST1Twov1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st3: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 3, AArch64::ST3Threev8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 3, AArch64::ST3Threev16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 3, AArch64::ST3Threev4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 3, AArch64::ST3Threev8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 3, AArch64::ST3Threev2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 3, AArch64::ST3Threev4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 3, AArch64::ST3Threev2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 3, AArch64::ST1Threev1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 3, AArch64::ST3Threev16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 3, AArch64::ST3Threev8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 3, AArch64::ST3Threev4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 3, AArch64::ST3Threev2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 3, AArch64::ST1Threev1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st4: {
- if (VT == MVT::v8i8)
- return SelectStore(Node, 4, AArch64::ST4Fourv8b);
- else if (VT == MVT::v16i8)
- return SelectStore(Node, 4, AArch64::ST4Fourv16b);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectStore(Node, 4, AArch64::ST4Fourv4h);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectStore(Node, 4, AArch64::ST4Fourv8h);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectStore(Node, 4, AArch64::ST4Fourv2s);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectStore(Node, 4, AArch64::ST4Fourv4s);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectStore(Node, 4, AArch64::ST4Fourv2d);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ if (VT == MVT::v8i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8b);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectStore(Node, 4, AArch64::ST4Fourv16b);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4h);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectStore(Node, 4, AArch64::ST4Fourv8h);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2s);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectStore(Node, 4, AArch64::ST4Fourv4s);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectStore(Node, 4, AArch64::ST4Fourv2d);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectStore(Node, 4, AArch64::ST1Fourv1d);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st2lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 2, AArch64::ST2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 2, AArch64::ST2i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 2, AArch64::ST2i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 2, AArch64::ST2i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 2, AArch64::ST2i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 2, AArch64::ST2i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 2, AArch64::ST2i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 2, AArch64::ST2i64);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st3lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 3, AArch64::ST3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 3, AArch64::ST3i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 3, AArch64::ST3i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 3, AArch64::ST3i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 3, AArch64::ST3i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 3, AArch64::ST3i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 3, AArch64::ST3i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 3, AArch64::ST3i64);
+ return;
+ }
break;
}
case Intrinsic::aarch64_neon_st4lane: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectStoreLane(Node, 4, AArch64::ST4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectStoreLane(Node, 4, AArch64::ST4i16);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectStoreLane(Node, 4, AArch64::ST4i32);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectStoreLane(Node, 4, AArch64::ST4i64);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectStoreLane(Node, 4, AArch64::ST4i8);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectStoreLane(Node, 4, AArch64::ST4i16);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectStoreLane(Node, 4, AArch64::ST4i32);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectStoreLane(Node, 4, AArch64::ST4i64);
+ return;
+ }
break;
}
}
break;
}
case AArch64ISD::LD2post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD3post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD4post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x2post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x3post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1x4post: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD2DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD3DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD4DUPpost: {
- if (VT == MVT::v8i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
- else if (VT == MVT::v16i8)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ if (VT == MVT::v8i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+ return;
+ }
break;
}
case AArch64ISD::LD1LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD2LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD3LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::LD4LANEpost: {
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST2post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST3post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST4post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x2post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x3post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST1x4post: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v8i8)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
- else if (VT == MVT::v16i8)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
- else if (VT == MVT::v4i16 || VT == MVT::v4f16)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v8f16)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
- else if (VT == MVT::v2i32 || VT == MVT::v2f32)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
- else if (VT == MVT::v1i64 || VT == MVT::v1f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v2f64)
- return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ if (VT == MVT::v8i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+ return;
+ } else if (VT == MVT::v16i8) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+ return;
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+ return;
+ } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+ return;
+ } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST2LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST3LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+ return;
+ }
break;
}
case AArch64ISD::ST4LANEpost: {
VT = Node->getOperand(1).getValueType();
- if (VT == MVT::v16i8 || VT == MVT::v8i8)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
- else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
- VT == MVT::v2f32)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
- else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
- VT == MVT::v1f64)
- return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+ return;
+ } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+ return;
+ } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+ return;
+ } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64) {
+ SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+ return;
+ }
break;
}
}
// Select the default instruction
- ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
-
- return ResNode;
+ SelectCode(Node);
}
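The mechanical rewrites in the hunk above track SelectionDAGISel's switch from an SDNode*-returning Select to a void one: each helper now replaces the node itself, so every "return SelectPostLoadLane(...)" becomes a call plus a bare return, and the final SelectCode(Node) no longer yields a result to dump. Below is a minimal standalone sketch of the same shape, with invented helper names (selectLane, selectDefault) rather than the LLVM APIs.

#include <iostream>

// Toy stand-ins: in the real code these replace the node in the DAG and
// return nothing, which is exactly why the dispatcher below is void.
static void selectLane(int lanes) { std::cout << "lane select x" << lanes << '\n'; }
static void selectDefault() { std::cout << "default selection\n"; }

// New style: each matched case calls its helper and returns explicitly;
// unmatched kinds fall through to the default path, mirroring the break to
// SelectCode(Node) above.
static void select(int kind) {
  switch (kind) {
  case 2: selectLane(2); return;
  case 3: selectLane(3); return;
  case 4: selectLane(4); return;
  default: break;
  }
  selectDefault();
}

int main() {
  select(3); // prints "lane select x3"
  select(7); // prints "default selection"
  return 0;
}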
/// createAArch64ISelDag - This pass converts a legalized DAG into a
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 92cf1cd71970..d6f2a190d4c8 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,12 +40,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-// Place holder until extr generation is tested fully.
-static cl::opt<bool>
-EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
- cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
- cl::init(true));
-
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
@@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+// Disabled because it caused self-hosting failures once returned-attribute
+// inference was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden,
+ cl::desc("Directly forward this return"),
+ cl::init(false));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
- // counterparts, which AArch64 supports directly.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
@@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Also, try to fold ADD into CSINC/CSINV..
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
-
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
@@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
+ // Set required alignment.
setMinFunctionAlignment(2);
+ // Set preferred alignments.
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+ setPrefLoopAlignment(STI.getPrefLoopAlignment());
setHasExtractBitsInsn(true);
@@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
+
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
@@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // Prefer likely predicted branches to selects on out-of-order cores.
- if (Subtarget->isCortexA57())
- PredictableSelectIsExpensive = true;
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
-void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
} else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
- setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
- }
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
- setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
- setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
- setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
- setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
- if (!VT.isFloatingPoint() &&
- VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+ if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
ISD::FMINNUM, ISD::FMAXNUM})
- setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
- setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
}
}
}
@@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (Subtarget->requiresStrictAlign())
return false;
- // FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
- // FIXME: Define an attribute for slow unaligned accesses instead of
- // relying on the CPU type as a proxy.
- // On Cyclone, unaligned 128-bit stores are slow.
- *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
+ case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
+ case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
}
return nullptr;
}
MachineBasicBlock *
-AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
@@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned IfTrueReg = MI->getOperand(1).getReg();
- unsigned IfFalseReg = MI->getOperand(2).getReg();
- unsigned CondCode = MI->getOperand(3).getImm();
- bool NZCVKilled = MI->getOperand(4).isKill();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned IfTrueReg = MI.getOperand(1).getReg();
+ unsigned IfFalseReg = MI.getOperand(2).getReg();
+ unsigned CondCode = MI.getOperand(3).getImm();
+ bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
.addReg(IfFalseReg)
.addMBB(MBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return EndBB;
}
-MachineBasicBlock *
-AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
- MI->dump();
+ MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
@@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC,
}
}
+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+ assert(CondCode2 == AArch64CC::AL);
+ break;
+ case ISD::SETONE:
+ // (a one b)
+ // == ((a olt b) || (a ogt b))
+ // == ((a ord b) && (a une b))
+ CondCode = AArch64CC::VC;
+ CondCode2 = AArch64CC::NE;
+ break;
+ case ISD::SETUEQ:
+ // (a ueq b)
+ // == ((a uno b) || (a oeq b))
+ // == ((a ule b) && (a uge b))
+ CondCode = AArch64CC::PL;
+ CondCode2 = AArch64CC::LE;
+ break;
+ }
+}
+
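The condition-code splits introduced by changeFPCCToANDAArch64CC above rest on two identities: (a one b) == ((a ord b) && (a une b)) and (a ueq b) == ((a ule b) && (a uge b)). They can be sanity-checked outside LLVM; the snippet below is a throwaway test with ad-hoc predicate helpers (ord, one, une, ueq, ule, uge are not LLVM names), exercising a few values including NaN.

#include <cassert>
#include <cmath>
#include <limits>

// "Ordered" means neither operand is NaN; the u* predicates are
// "unordered or <cmp>", matching the IEEE/LLVM meaning of the SETxx codes.
static bool ord(double a, double b) { return !std::isnan(a) && !std::isnan(b); }
static bool one(double a, double b) { return ord(a, b) && a != b; }
static bool une(double a, double b) { return !ord(a, b) || a != b; }
static bool ueq(double a, double b) { return !ord(a, b) || a == b; }
static bool ule(double a, double b) { return !ord(a, b) || a <= b; }
static bool uge(double a, double b) { return !ord(a, b) || a >= b; }

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  const double vals[] = {0.0, 1.0, -1.0, nan};
  for (double a : vals)
    for (double b : vals) {
      assert(one(a, b) == (ord(a, b) && une(a, b)));  // SETONE as an AND
      assert(ueq(a, b) == (ule(a, b) && uge(a, b)));  // SETUEQ as an AND
    }
  return 0;
}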
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
@@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) {
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
- if (VT.isFloatingPoint())
+ if (VT.isFloatingPoint()) {
+ assert(VT != MVT::f128);
+ if (VT == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ VT = MVT::f32;
+ }
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+ }
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
@@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
- SDValue Condition, unsigned NZCV,
- SDLoc DL, SelectionDAG &DAG) {
+ AArch64CC::CondCode Predicate,
+ AArch64CC::CondCode OutCC,
+ const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
- if (LHS.getValueType().isFloatingPoint())
+ if (LHS.getValueType().isFloatingPoint()) {
+ assert(LHS.getValueType() != MVT::f128);
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+ }
Opcode = AArch64ISD::FCCMP;
- else if (RHS.getOpcode() == ISD::SUB) {
+ } else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- // See emitComparison() on why we can only do this for SETEQ and SETNE.
- Opcode = AArch64ISD::CCMN;
- RHS = RHS.getOperand(1);
- }
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
+ SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
@@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
- CanPushNegate = true;
+ if (Val->getOperand(0).getValueType() == MVT::f128)
+ return false;
+ CanNegate = true;
return true;
}
- // Protect against stack overflow.
- if (Depth > 15)
+ // Protect against exponential runtime and stack overflow.
+ if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ bool CanNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
return false;
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ bool CanNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
return false;
- // We cannot push a negate through an AND operation (it would become an OR),
- // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
- // push the negate through the x/y subtrees.
- CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+
+ if (Opcode == ISD::OR) {
+ // For an OR expression we need to be able to negate at least one side or
+ // we cannot do the transformation at all.
+ if (!CanNegateL && !CanNegateR)
+ return false;
+ // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+ // can negate the x and y subtrees.
+ CanNegate = CanNegateL && CanNegateR;
+ } else {
+ // If the operands are OR expressions then we finally need to negate their
+ // outputs; we can only do that for the operand emitted last, by negating
+ // OutCC, not for both operands.
+ bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return false;
+ // We cannot negate an AND operation (it would become an OR).
+ CanNegate = false;
+ }
return true;
}
return false;
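The rewritten isConjunctionDisjunctionTree above boils down to three rules: a SETCC leaf can always be negated by inverting its condition, an OR node only when both children can (De Morgan turns it into an AND of negated children), and an AND node never, and then only if at most one operand is an OR whose output flags would need negating. A toy sketch of that recursion follows; Expr and isCCTree are invented names, and the one-use and f128 checks of the real code are omitted.

#include <cassert>

struct Expr {
  enum Kind { Leaf, And, Or } kind;
  const Expr *lhs;
  const Expr *rhs;
};

static bool isCCTree(const Expr &E, bool &CanNegate, unsigned Depth = 0) {
  if (E.kind == Expr::Leaf) {
    CanNegate = true;               // a compare is negated by inverting its CC
    return true;
  }
  if (Depth > 6)                    // bound the recursion, as the real code does
    return false;
  bool CanL, CanR;
  if (!isCCTree(*E.lhs, CanL, Depth + 1) || !isCCTree(*E.rhs, CanR, Depth + 1))
    return false;
  if (E.kind == Expr::Or) {
    if (!CanL && !CanR)             // must be able to negate at least one side
      return false;
    CanNegate = CanL && CanR;       // not(x or y) == (not x) and (not y)
    return true;
  }
  // And: negating it would turn it into an OR, and at most one operand may be
  // an OR whose output flags need negating.
  if (E.lhs->kind == Expr::Or && E.rhs->kind == Expr::Or)
    return false;
  CanNegate = false;
  return true;
}

int main() {
  Expr a{Expr::Leaf, nullptr, nullptr}, b{Expr::Leaf, nullptr, nullptr},
       c{Expr::Leaf, nullptr, nullptr};
  Expr anyAB{Expr::Or, &a, &b};      // (a || b): negatable via De Morgan
  Expr both{Expr::And, &anyAB, &c};  // (a || b) && c: valid, but not negatable
  bool Can = false;
  assert(isCCTree(anyAB, Can) && Can);
  assert(isCCTree(both, Can) && !Can);
  return 0;
}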
@@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
- AArch64CC::CondCode &OutCC, bool PushNegate = false,
- SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
- unsigned Depth = 0) {
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+ AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
@@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
- if (PushNegate)
+ if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
@@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
- changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
- // Surpisingly some floating point conditions can't be tested with a
- // single condition code. Construct an additional comparison in this case.
- // See comment below on how we deal with OR conditions.
+ changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+ // Some floating point conditions can't be tested with a single condition
+ // code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
- else {
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- // Note that we want the inverse of ExtraCC, so NZCV is not inversed.
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
- ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
- NZCV, DL, DAG);
- }
+ else
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+ ExtraCC, DL, DAG);
CCOp = ExtraCmp;
- Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
- OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
- if (!CCOp.getNode())
+ if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
- SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
- AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
- unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
- return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
- } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
- return SDValue();
-
- assert((Opcode == ISD::OR || !PushNegate)
- && "Can only push negate through OR operation");
+ }
+ assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+ "Valid conjunction/disjunction tree");
// Check if both sides can be transformed.
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
- bool CanPushNegateL;
- if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
- return SDValue();
- bool CanPushNegateR;
- if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
- return SDValue();
- // Do we need to negate our operands?
- bool NegateOperands = Opcode == ISD::OR;
+ // In case of an OR we need to negate our operands and the result.
+ // (A v B) <=> not(not(A) ^ not(B))
+ bool NegateOpsAndResult = Opcode == ISD::OR;
// We can negate the results of all previous operations by inverting the
- // predicate flags giving us a free negation for one side. For the other side
- // we need to be able to push the negation to the leafs of the tree.
- if (NegateOperands) {
- if (!CanPushNegateL && !CanPushNegateR)
- return SDValue();
- // Order the side where we can push the negate through to LHS.
- if (!CanPushNegateL && CanPushNegateR)
+ // predicate flags giving us a free negation for one side. The other side
+ // must be negatable by itself.
+ if (NegateOpsAndResult) {
+ // See which side we can negate.
+ bool CanNegateL;
+ bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+ assert(isValidL && "Valid conjunction/disjunction tree");
+ (void)isValidL;
+
+#ifndef NDEBUG
+ bool CanNegateR;
+ bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+ assert(isValidR && "Valid conjunction/disjunction tree");
+ assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+ // Order the side which we cannot negate to RHS so we can emit it first.
+ if (!CanNegateL)
std::swap(LHS, RHS);
} else {
bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
- bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
- if (NeedsNegOutL && NeedsNegOutR)
- return SDValue();
+ assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+ "Valid conjunction/disjunction tree");
// Order the side where we need to negate the output flags to RHS so it
// gets emitted first.
if (NeedsNegOutL)
@@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
// through if we are already in a PushNegate case, otherwise we can negate
// the "flags to test" afterwards.
AArch64CC::CondCode RHSCC;
- SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
- CCOp, Predicate, Depth+1);
- if (NegateOperands && !PushNegate)
+ SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+ CCOp, Predicate);
+ if (NegateOpsAndResult && !Negate)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
- // Emit LHS. We must push the negate through if we need to negate it.
- SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
- CmpR, RHSCC, Depth+1);
+ // Emit LHS. We may need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+ NegateOpsAndResult, CmpR,
+ RHSCC);
// If we transformed an OR to an AND then we have to negate the result
- // (or absorb a PushNegate resulting in a double negation).
- if (Opcode == ISD::OR && !PushNegate)
+ // (or absorb the Negate parameter).
+ if (NegateOpsAndResult && !Negate)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC) {
+ bool CanNegate;
+ if (!isConjunctionDisjunctionTree(Val, CanNegate))
+ return SDValue();
+
+ return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
+ AArch64CC::AL);
+}
+
/// @}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ SDValue &AArch64cc, SelectionDAG &DAG,
+ const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::getVectorVT(TruncVT, NumElts), Ops);
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
@@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::aarch64_thread_pointer: {
+ case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
@@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::PreserveMost:
+ case CallingConv::CXX_FAST_TLS:
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
@@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- MemVT, false, false, false, 0);
+ MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin()) {
// The AAPCS variadic function ABI is identical to the non-variadic
@@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
- AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
}
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
- StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+ StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
- SelectionDAG &DAG, SDLoc DL,
+ SelectionDAG &DAG,
+ const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
- false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
@@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn) {
+ if (i == 0 && isThisReturn && EnableThisRetForwarding) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult(
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
@@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
return false;
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
@@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- if (IsTailCallConvention(CalleeCC) && CCMatch)
- return true;
- return false;
+ return IsTailCallConvention(CalleeCC) && CCMatch;
}
// Externally-defined functions with weak linkage should not be
@@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
+ LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
@@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
@@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForCall(CalleeCC, isVarArg),
+ CCAssignFnForCall(CallerCC, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
-
- SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
-
- if (RVLocs1.size() != RVLocs2.size())
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
// Nothing more to check if the callee is taking no arguments
@@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- // If the stack arguments for this call would fit into our own save area then
- // the call can be made tail.
- return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
+
+ return true;
}
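Among the new eligibility checks above, the regmaskSubsetEqual test requires the callee to preserve every register the caller's convention preserves. Conceptually that is a subset test on register masks; the sketch below uses a single 64-bit mask and an invented helper name (callerCSRsCovered) instead of LLVM's uint32_t mask arrays.

#include <cassert>
#include <cstdint>

// True if every register preserved for the caller is also preserved by the
// callee, i.e. the caller's preserved set is a subset of the callee's.
static bool callerCSRsCovered(uint64_t CallerPreserved, uint64_t CalleePreserved) {
  return (CallerPreserved & ~CalleePreserved) == 0;
}

int main() {
  // Callee preserving a superset of the caller's registers: tail call is fine.
  assert(callerCSRsCovered(0b0110, 0b1110));
  // Callee clobbering one of the caller's preserved registers: not eligible.
  assert(!callerCSRsCovered(0b0111, 0b0110));
  return 0;
}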
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
@@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
}
bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
- return CallCC == CallingConv::Fast;
+ return CallCC == CallingConv::Fast ||
+ CallCC == CallingConv::PreserveMost;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
@@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
- Callee, CallConv, IsVarArg, IsStructRet,
- MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
- NumBytes = RoundUpToAlignment(NumBytes, 16);
+ NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
@@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
- SDValue Store =
- DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
@@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
- uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
- ? RoundUpToAlignment(NumBytes, 16)
- : 0;
+ uint64_t CalleePopBytes =
+ DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
@@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
@@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
- if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
- "use of MO_CONSTPOOL only supported on small model");
- SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
- SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
- SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
- SDValue GlobalAddr = DAG.getLoad(
- PtrVT, DL, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*isVolatile=*/false,
- /*isNonTemporal=*/true,
- /*isInvariant=*/true, 8);
- if (GN->getOffset() != 0)
- return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
- DAG.getConstant(GN->getOffset(), DL, PtrVT));
- return GlobalAddr;
- }
-
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
@@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet =
DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
- true, true, 8);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ /* Alignment = */ 8, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
-SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+ const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(Chain);
- Ops.push_back(SymAddr);
-
- Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+ Chain =
+ DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
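
As a point of reference for the sequence described above, the kind of source that reaches LowerELFTLSDescCallSeq is an ELF thread_local access compiled as position-independent code under the general-dynamic TLS model. A minimal C++ trigger (nothing below is specific to this patch):

// Built with -fPIC for AArch64 ELF, the access to `counter` is lowered through
// the TLSDESC_CALLSEQ pseudo emitted by LowerELFTLSDescCallSeq.
thread_local int counter = 0;

int next() { return ++counter; }
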
@@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
- SDValue FVal, SDLoc dl,
+ SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
@@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
- MachinePointerInfo(SV), false, false, 8));
+ MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
@@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
- MachinePointerInfo(SV, 8), false, false, 8));
+ MachinePointerInfo(SV, 8),
+ /* Alignment = */ 8));
}
// void *__vr_top at offset 16
@@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
- MachinePointerInfo(SV, 16), false, false, 8));
+ MachinePointerInfo(SV, 16),
+ /* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-GPRSize, DL, MVT::i32),
- GROffsAddr, MachinePointerInfo(SV, 24), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
+ MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
- MemOps.push_back(DAG.getStore(Chain, DL,
- DAG.getConstant(-FPRSize, DL, MVT::i32),
- VROffsAddr, MachinePointerInfo(SV, 28), false,
- false, 4));
+ MemOps.push_back(DAG.getStore(
+ Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
+ MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
@@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
- false, false, false, 0);
+ SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
if (Align > 8) {
@@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
- SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
- false, false, 0);
+ SDValue APStore =
+ DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
- SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue WideFP =
+ DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
@@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, DL);
}
- return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
- false, false, 0);
+ return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
+/// getEstimate - Return a node computing the estimate for either the
+/// reciprocal or the reciprocal square root.
+static SDValue getEstimate(const AArch64Subtarget &ST,
+ const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
+ const SDValue &Operand, unsigned &ExtraSteps) {
+ if (!ST.hasNEON())
+ return SDValue();
+
+ EVT VT = Operand.getValueType();
+
+ std::string RecipOp;
+  RecipOp = (Opcode == AArch64ISD::FRECPE) ? "div" : "sqrt";
+  RecipOp = (VT.isVector() ? "vec-" : "") + RecipOp;
+  RecipOp += (VT.getScalarType() == MVT::f64) ? "d" : "f";
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ ExtraSteps = Recips.getRefinementSteps(RecipOp);
+ return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
+}
+
+SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
+ UseOneConst = true;
+ return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+}
+
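
The ExtraSteps value returned here tells the generic combiner how many Newton-Raphson iterations to wrap around the raw FRECPE/FRSQRTE estimate. A minimal scalar sketch of that refinement for the reciprocal case, with a placeholder frecpe_estimate() standing in for the hardware instruction (the helper name and its accuracy are illustrative only):

#include <cstdio>

// Crude stand-in for the initial FRECPE estimate (roughly 8 bits of accuracy).
static float frecpe_estimate(float x) { return 0.98f / x; }

// One Newton-Raphson step for the reciprocal: e' = e * (2 - x * e), which is
// roughly what the FRECPS step provides in the real expansion.
static float refine_recip(float x, float e) { return e * (2.0f - x * e); }

int main() {
  float x = 3.0f;
  float e = frecpe_estimate(x);
  for (int step = 0; step < 2; ++step) // "ExtraSteps" iterations
    e = refine_recip(x, e);
  std::printf("1/%f ~= %f\n", x, e);
  return 0;
}
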
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
+const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ // At this point, we have to lower this constraint to something else, so we
+ // lower it to an "r" or "w". However, by doing this we will force the result
+  // to be in a register, while the X constraint is much more permissive.
+ //
+ // Although we are correct (we are free to emit anything, without
+ // constraints), we might break use cases that would expect us to be more
+ // efficient and emit something else.
+ if (!Subtarget->hasFPARMv8())
+ return "r";
+
+ if (ConstraintVT.isFloatingPoint())
+ return "w";
+
+ if (ConstraintVT.isVector() &&
+ (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
+ return "w";
+
+ return "r";
+}
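
Rough source-level illustration of what this hook decides: an inline-asm operand constrained with "X" now lands in an FP/SIMD register (as if "w") for floating-point and 64/128-bit vector values when FP is available, and in a GPR (as if "r") otherwise. The asm body below is empty and exists only to exercise the constraint; treat it as a sketch, not part of the patch:

float touch(float x) {
  // With FPARMv8 available this behaves like a "w" constraint; with general
  // registers only, LowerXConstraint falls back to "r".
  asm volatile("" : : "X"(x));
  return x;
}
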
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
@@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
- // v0 - v31 are aliases of q0 - q31.
+ // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
- Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
- Res.second = &AArch64::FPR128RegClass;
+ if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+ Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR64RegClass;
+ } else {
+ Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+ Res.second = &AArch64::FPR128RegClass;
+ }
}
}
}
@@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
- else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(V.getOperand(1))) {
// A shuffle can only come from building a vector from various
- // elements of other vectors.
+ // elements of other vectors, provided their indices are constant.
return SDValue();
}
@@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF)
+ if (Entry.isUndef())
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
@@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], &Mask[0]);
+ ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
@@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
- if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+ if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
- // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- // &TBLMask[0], IndexLen));
+ // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
+ // IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
- DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
- V1Cst, V2Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
- makeArrayRef(TBLMask.data(), IndexLen)));
+ DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
+ V2Cst, DAG.getBuildVector(IndexVT, DL,
+ makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
@@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
- V1.getValueType().getSimpleVT())) {
+ if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
@@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
- } else if (V2->getOpcode() == ISD::UNDEF &&
- isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
@@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
- SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
- if (Concat.getNode())
+ if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
@@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (EnableAArch64SlrGeneration) {
- SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
- if (Res.getNode())
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
}
@@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op,
}
Ops.push_back(Lane);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
@@ -6217,7 +6311,7 @@ FailedModImm:
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -6273,7 +6367,7 @@ FailedModImm:
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -6328,7 +6422,7 @@ FailedModImm:
// value is already in an S or D register.
// Do not do this for UNDEF/LOAD nodes because we have better patterns
// for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD &&
+ if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
(ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
@@ -6339,7 +6433,7 @@ FailedModImm:
}
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
@@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
- SDLoc dl, SelectionDAG &DAG) {
+ const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
@@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
const DataLayout &DL = I->getModule()->getDataLayout();
EVT VT = getValueType(DL, User->getOperand(0)->getType());
- if (isFMAFasterThanFMulAndFAdd(VT) &&
- isOperationLegalOrCustom(ISD::FMA, VT) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
- return false;
-
- return true;
+ return !(isFMAFasterThanFMulAndFAdd(VT) &&
+ isOperationLegalOrCustom(ISD::FMA, VT) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
- if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
- return true;
- return false;
+ // Avoid UB for INT64_MIN.
+ if (Immed == std::numeric_limits<int64_t>::min())
+ return false;
+ // Same encoding for add/sub, just flip the sign.
+ Immed = std::abs(Immed);
+ return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
- if (Immed < 0)
- Immed *= -1;
return isLegalAddImmediate(Immed);
}
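
For reference, the rule above accepts any unsigned 12-bit immediate, optionally shifted left by 12, with negative values handled by the matching sub encoding. A standalone mirror of the predicate plus a few sample values (a sanity-check sketch, not an exported API):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Mirrors isLegalAddImmediate: uimm12, optionally shifted left by 12.
static bool isLegalAddImm(int64_t Imm) {
  if (Imm == INT64_MIN)
    return false;
  Imm = std::abs(Imm);
  return (Imm >> 12) == 0 || ((Imm & 0xfff) == 0 && (Imm >> 24) == 0);
}

int main() {
  assert(isLegalAddImm(4095));     // fits in 12 bits
  assert(isLegalAddImm(0x123000)); // 12 bits shifted left by 12
  assert(!isLegalAddImm(4097));    // needs more than one instruction
  assert(isLegalAddImm(-4095));    // folded onto the sub encoding
  return 0;
}
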
@@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
- if (!AM.Scale || AM.Scale == 1 ||
- (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
- return true;
- return false;
+ return !AM.Scale || AM.Scale == 1 ||
+ (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
@@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return Shift < 3;
}
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// cmge X, X, #0
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!Subtarget->hasNEON() || !VT.isVector())
+ return SDValue();
+
+ // There must be a shift right algebraic before the xor, and the xor must be a
+ // 'not' operation.
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+ if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ return SDValue();
+
+ return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+}
+
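
The lane-wise identity behind this fold, checked in scalar form: an arithmetic shift right by width-1 smears the sign bit, so NOT of that value is all-ones exactly for non-negative inputs, which is the per-element result CMGE #0 produces. A quick self-contained check (assumes the usual arithmetic right shift of negative ints, which AArch64 compilers provide):

#include <cassert>
#include <cstdint>

// ~(x >> 31) == (x >= 0 ? 0xffffffff : 0), i.e. one lane of "cmge v, v, #0".
static uint32_t viaXorShift(int32_t x) { return ~static_cast<uint32_t>(x >> 31); }
static uint32_t viaCmgeZero(int32_t x) { return x >= 0 ? 0xffffffffu : 0u; }

int main() {
  for (int32_t x : {-5, -1, 0, 1, 42})
    assert(viaXorShift(x) == viaCmgeZero(x));
  return 0;
}
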
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// performXorCombine - Attempts to handle integer ABS.
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
return performIntegerAbsCombine(N, DAG);
}
@@ -7376,6 +7496,10 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+    return SDValue(N, 0); // Keep the SDIV as-is instead of expanding it.
+
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
@@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- APInt Value = C->getAPIntValue();
+ const APInt &Value = C->getAPIntValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (Value.isNonNegative()) {
@@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->isVolatile(),
- LN0->isNonTemporal(), LN0->isInvariant(),
- LN0->getAlignment());
+ LN0->getPointerInfo(), LN0->getAlignment(),
+ LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
@@ -7567,7 +7690,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -7801,25 +7925,49 @@ static SDValue tryCombineToBSL(SDNode *N,
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
- if (!EnableAArch64ExtrGeneration)
- return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
- SDValue Res = tryCombineToEXTR(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
- Res = tryCombineToBSL(N, DCI);
- if (Res.getNode())
+ if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
+static SDValue performSRLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+  // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16) if the
+  // high 16 bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
+  // to (rotr (bswap i64 x), 32) if the high 32 bits of x are zero.
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() == ISD::BSWAP) {
+ SDLoc DL(N);
+ SDValue N1 = N->getOperand(1);
+ SDValue N00 = N0.getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+ if (VT == MVT::i32 && ShiftAmt == 16 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ if (VT == MVT::i64 && ShiftAmt == 32 &&
+ DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
+ return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+ }
+ }
+ return SDValue();
+}
+
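
Why the canonicalization is safe and useful: when the top half of x is known zero, shifting the byte-swapped value right and rotating it right by the same amount produce identical results, and the rotate form is what the later byte-reverse (REV16-style) patterns match (the pattern match itself is outside this hunk). A scalar demonstration of the i32 case, using the GCC/Clang bswap builtin to keep the sketch short:

#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t v, unsigned n) {
  return (v >> n) | (v << (32 - n)); // n must be nonzero here
}

int main() {
  uint32_t x = 0x0000abcd; // high 16 bits are zero
  uint32_t swapped = __builtin_bswap32(x);
  assert((swapped >> 16) == rotr32(swapped, 16)); // srl == rotr under the mask
  assert((swapped >> 16) == 0x0000cdabu);         // the REV16-style result
  return 0;
}
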
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -8575,15 +8723,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 =
DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+ St->getAlignment(), St->getMemOperand()->getFlags());
unsigned Offset = EltOffset;
while (--NumVecElts) {
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), Alignment);
+ St->getPointerInfo(), Alignment,
+ St->getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
@@ -8603,9 +8751,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
- // Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundaries. We want to split such stores.
- if (!Subtarget->isCyclone())
+ if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
@@ -8647,12 +8793,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
- S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+ S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
- S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
- S->getAlignment());
+ S->getPointerInfo(), S->getAlignment(),
+ S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
@@ -8741,9 +8887,10 @@ static SDValue performPostLD1Combine(SDNode *N,
LoadSDN->getMemOperand());
// Update the uses.
- SmallVector<SDValue, 2> NewResults;
- NewResults.push_back(SDValue(LD, 0)); // The result of load
- NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+ SDValue NewResults[] = {
+ SDValue(LD, 0), // The result of load
+ SDValue(UpdN.getNode(), 2) // Chain
+ };
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
@@ -8774,8 +8921,7 @@ static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
- if (Split.getNode())
+ if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
@@ -9215,10 +9361,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::Constant:
case ISD::TargetConstant: {
- if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
- 1LL << (width - 1))
- return true;
- return false;
+ return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+ 1LL << (width - 1);
}
}
@@ -9286,14 +9430,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
// isEquivalentMaskless() tests whether the AND can be removed; it is factored
// out of the DAG recognition because the DAG can take several forms.
-static
-bool isEquivalentMaskless(unsigned CC, unsigned width,
- ISD::LoadExtType ExtType, signed AddConstant,
- signed CompConstant) {
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, int AddConstant,
+ int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
  // make them generally applicable to all bit widths.
- signed MaxUInt = (1 << width);
+ int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
@@ -9441,8 +9584,7 @@ SDValue performCONDCombine(SDNode *N,
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
- if (NV.getNode())
+ if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
@@ -9678,7 +9820,7 @@ static SDValue performSelectCombine(SDNode *N,
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
- SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+ SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
@@ -9716,6 +9858,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
+ case ISD::SRL:
+ return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
@@ -9829,10 +9973,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
- return false;
-
- return true;
+ return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
@@ -9935,6 +10076,31 @@ static void ReplaceReductionResults(SDNode *N,
Results.push_back(SplitVal);
}
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ assert(N->getValueType(0) == MVT::i128 &&
+         "AtomicCmpSwap on types narrower than 128 bits should be legal");
+ SDValue Ops[] = {N->getOperand(1),
+ N->getOperand(2)->getOperand(0),
+ N->getOperand(2)->getOperand(1),
+ N->getOperand(3)->getOperand(0),
+ N->getOperand(3)->getOperand(1),
+ N->getOperand(0)};
+ SDNode *CmpSwap = DAG.getMachineNode(
+ AArch64::CMP_SWAP_128, SDLoc(N),
+ DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ Results.push_back(SDValue(CmpSwap, 0));
+ Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(SDValue(CmpSwap, 3));
+}
+
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -9966,11 +10132,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceCMP_SWAP_128Results(N, Results, DAG);
+ return;
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- return true;
+ if (!Subtarget->isTargetAndroid())
+ return true;
+ return TargetLowering::useLoadStackGuardNode();
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10017,14 +10188,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- return true;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement cmpxchg without spilling. If the address being exchanged is also
+ // on the stack and close enough to the spill slot, this can lead to a
+ // situation where the monitor always gets cleared and the atomic operation
+ // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire = isAtLeastAcquire(Ord);
+ bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
@@ -10066,7 +10242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isAtLeastRelease(Ord);
+ bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
@@ -10104,6 +10280,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getIRStackGuard(IRB);
+
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ const unsigned TlsOffset = 0x28;
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ return IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
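
The IR produced here boils down to a load at a fixed offset from the thread pointer. An AArch64-only sketch of the same access in plain C++ with inline asm, purely to illustrate the addressing (the 0x28 offset is the Bionic slot referenced above; the helper name is made up):

#include <cstdint>

// Roughly what the emitted stack-guard load does on Android/AArch64:
//   mrs  xN, TPIDR_EL0
//   ldr  xN, [xN, #0x28]
static uintptr_t read_stack_guard_cookie() {
  uintptr_t tp;
  asm("mrs %0, TPIDR_EL0" : "=r"(tp));
  return *reinterpret_cast<const uintptr_t *>(tp + 0x28);
}
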
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (!Subtarget->isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
@@ -10114,7 +10306,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons
const unsigned TlsOffset = 0x48;
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
@@ -10166,3 +10358,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
.addReg(NewVR);
}
}
+
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on AArch64 is expensive. However, when aggressively
+ // optimizing for code size, we prefer to use a div instruction, as it is
+ // usually smaller than the alternative sequence.
+ // The exception to this is vector division. Since AArch64 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize =
+ Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
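
Source-level illustration of the two cases the comment describes, using clang's minsize attribute and vector extensions (both are assumptions about the build setup, and the exact codegen depends on the rest of the pipeline):

// Marked minsize, the divide by 3 is expected to stay a single sdiv rather
// than the usual multiply-by-magic-constant and shift sequence.
__attribute__((minsize)) int div3(int x) { return x / 3; }

// There is no AArch64 vector integer divide, so even under minsize the
// division is not treated as cheap: keeping it would force scalarization.
typedef int v4i32 __attribute__((vector_size(16)));
v4i32 div3_vec(v4i32 x) {
  v4i32 three = {3, 3, 3, 3};
  return x / three;
}
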
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index e99616c94068..c87cfed1f892 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,6 +187,10 @@ enum NodeType : unsigned {
SMULL,
UMULL,
+ // Reciprocal estimates.
+ FRECPE,
+ FRSQRTE,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -272,11 +276,11 @@ public:
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
- MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
@@ -358,6 +362,10 @@ public:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
@@ -378,6 +386,8 @@ public:
return AArch64::X1;
}
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
bool isCheapToSpeculateCttz() const override {
return true;
}
@@ -385,6 +395,12 @@ public:
bool isCheapToSpeculateCtlz() const override {
return true;
}
+
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ // FIXME: Is this always true? It should be true for vectors at least.
+ return VT == MVT::f32 || VT == MVT::f64;
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
@@ -394,6 +410,10 @@ public:
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ bool supportSwiftError() const override {
+ return true;
+ }
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
@@ -401,30 +421,30 @@ private:
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
- void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+ void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
- SDValue
- LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const;
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
@@ -439,7 +459,7 @@ private:
bool IsTailCallConvention(CallingConv::ID CallCC) const;
- void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+ void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
SDValue &Chain) const;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -449,21 +469,21 @@ private:
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+ SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
- SDValue TVal, SDValue FVal, SDLoc dl,
+ SDValue TVal, SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
@@ -500,6 +520,11 @@ private:
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
+ SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+ SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -515,6 +540,9 @@ private:
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 4923a1161dfc..59de62ad2877 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -29,7 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastAcquire(Ordering);
+ return isAcquireOrStronger(Ordering);
}]>;
// An atomic load operation that does not need either acquire or release
@@ -37,7 +37,7 @@ class acquiring_load<PatFrag base>
class relaxed_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return !isAtLeastAcquire(Ordering);
+ return !isAcquireOrStronger(Ordering);
}]>;
// 8-bit loads
@@ -112,15 +112,16 @@ def : Pat<(relaxed_load<atomic_load_64>
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- assert(Ordering != AcquireRelease && "unexpected store ordering");
- return isAtLeastRelease(Ordering);
+ assert(Ordering != AtomicOrdering::AcquireRelease &&
+ "unexpected store ordering");
+ return isReleaseOrStronger(Ordering);
}]>;
// An atomic store operation that doesn't actually need to be atomic on AArch64.
class relaxed_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return !isAtLeastRelease(Ordering);
+ return !isReleaseOrStronger(Ordering);
}]>;
// 8-bit stores
@@ -361,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
// And clear exclusive.
def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDXR and an
+// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+ Sched<[WriteAtomic]>;
+
+def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status),
+ (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>,
+ Sched<[WriteAtomic]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in
+def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status),
+ (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+ GPR64:$newLo, GPR64:$newHi), []>,
+ Sched<[WriteAtomic]>;
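
For context, the kind of source that selects these pseudos (only when compiling at -O0, per shouldExpandAtomicCmpXchgInIR earlier in this patch); the 16-byte case additionally assumes the frontend inlines the atomic rather than calling libatomic:

#include <atomic>

// At -O0 this becomes CMP_SWAP_32, expanded after register allocation; at
// -O1 and above it goes through the usual LL/SC expansion in IR instead.
bool cas32(std::atomic<int> &a, int expected, int desired) {
  return a.compare_exchange_strong(expected, desired);
}

// At -O0 the resulting i128 ATOMIC_CMP_SWAP node is what
// ReplaceCMP_SWAP_128Results rewrites into CMP_SWAP_128.
bool cas128(__int128 *p, __int128 *expected, __int128 desired) {
  return __atomic_compare_exchange_n(p, expected, desired, /*weak=*/false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
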
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 6ac2175e5035..34d35e961210 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -496,7 +496,7 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
}]> {
let ParserMatchClass = Imm0_65535Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImmHex";
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
@@ -505,7 +505,7 @@ def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
let ParserMatchClass = Imm0_255Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
// imm0_127 predicate - True if the immediate is in the range [0,127]
@@ -514,7 +514,7 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
@@ -923,10 +923,7 @@ def psbhint_op : Operand<i32> {
// "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
- bool ValidNamed;
- (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(),
- STI.getFeatureBits(), ValidNamed);
- return ValidNamed;
+ return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
}];
}
@@ -1549,7 +1546,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
def movimm32_imm : Operand<i32> {
let ParserMatchClass = Imm0_65535Operand;
let EncoderMethod = "getMoveWideImmOpValue";
- let PrintMethod = "printHexImm";
+ let PrintMethod = "printImm";
}
def movimm32_shift : Operand<i32> {
let PrintMethod = "printShifter";
@@ -9377,7 +9374,8 @@ class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
class BaseCAS<string order, string size, RegisterClass RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"cas" # order # size, "\t$Rs, $Rt, [$Rn]",
- "$out = $Rs",[]> {
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
let NP = 1;
}
@@ -9391,7 +9389,8 @@ multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
class BaseCASP<string order, string size, RegisterOperand RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"casp" # order # size, "\t$Rs, $Rt, [$Rn]",
- "$out = $Rs",[]> {
+ "$out = $Rs",[]>,
+ Sched<[WriteAtomic]> {
let NP = 0;
}
@@ -9405,7 +9404,8 @@ multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
let Predicates = [HasV8_1a] in
class BaseSWP<string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
- "\t$Rs, $Rt, [$Rn]","",[]> {
+ "\t$Rs, $Rt, [$Rn]","",[]>,
+ Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
@@ -9436,7 +9436,8 @@ multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
- "\t$Rs, $Rt, [$Rn]","",[]> {
+ "\t$Rs, $Rt, [$Rn]","",[]>,
+ Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index f398117de953..0aa4708f35ac 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -22,27 +22,31 @@
#include "llvm/MC/MCInst.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
+static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair =
+ MachineMemOperand::MOTargetFlag1;
+
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
RI(STI.getTargetTriple()), Subtarget(STI) {}
/// GetInstSizeInBytes - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
-unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- if (MI->getOpcode() == AArch64::INLINEASM)
- return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+ if (MI.getOpcode() == AArch64::INLINEASM)
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
switch (Desc.getOpcode()) {
default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
@@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
}
// Branch analysis.
-bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr *LastInst = &*I;
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
@@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
+ MachineInstr *SecondLastInst = &*I;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If AllowModify is true and the block ends with two or more unconditional
@@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      // Return now; the only remaining terminator is an unconditional branch.
TBB = LastInst->getOperand(0).getMBB();
return false;
} else {
- SecondLastInst = I;
+ SecondLastInst = &*I;
SecondLastOpc = SecondLastInst->getOpcode();
}
}
}
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with a B and a Bcc, handle it.
@@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
}
void AArch64InstrInfo::instantiateCondBranch(
- MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+ MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
@@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch(
}
}
-unsigned AArch64InstrInfo::InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect(
}
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
}
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
-static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
- uint64_t Imm = MI->getOperand(1).getImm();
+static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
+ uint64_t Imm = MI.getOperand(1).getImm();
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
@@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
- if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
- return MI->isAsCheapAsAMove();
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ if (!Subtarget.hasCustomCheapAsMoveHandling())
+ return MI.isAsCheapAsAMove();
+
+ unsigned Imm;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return false;
@@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
- return (MI->getOperand(3).getImm() == 0);
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
+ MI.getOperand(3).getImm() == 0);
+
+ // add/sub on register with shift
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getArithShiftValue(Imm) < 4);
// logical ops on immediate
case AArch64::ANDWri:
@@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
+
+ // logical ops on register with shift
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ case AArch64::EONWrs:
+ case AArch64::EONXrs:
+ case AArch64::EORWrs:
+ case AArch64::EORXrs:
+ case AArch64::ORNWrs:
+ case AArch64::ORNXrs:
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ Imm = MI.getOperand(3).getImm();
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+ AArch64_AM::getShiftValue(Imm) < 4 &&
+ AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
+
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
// ORRXri, it is as cheap as MOV
case AArch64::MOVi32imm:
return canBeExpandedToORR(MI, 32);
case AArch64::MOVi64imm:
return canBeExpandedToORR(MI, 64);
+
+ // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing
+ // feature.
+ case AArch64::FMOVS0:
+ case AArch64::FMOVD0:
+ return Subtarget.hasZeroCycleZeroing();
+ case TargetOpcode::COPY:
+ return (Subtarget.hasZeroCycleZeroing() &&
+ (MI.getOperand(1).getReg() == AArch64::WZR ||
+ MI.getOperand(1).getReg() == AArch64::XZR));
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
@@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
}
-bool
-AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
- MachineInstr *MIb,
- AliasAnalysis *AA) const {
+bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
+ MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned BaseRegA = 0, BaseRegB = 0;
- int OffsetA = 0, OffsetB = 0;
- int WidthA = 0, WidthB = 0;
+ int64_t OffsetA = 0, OffsetB = 0;
+ unsigned WidthA = 0, WidthB = 0;
- assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store.");
- assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store.");
+ assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+ assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
- if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
- MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+ MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Retrieve the base register, offset from the base register and width. Width
@@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
-bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBSWrr:
@@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
case AArch64::ADDSXrs:
case AArch64::ADDSXrx:
// Replace SUBSWrr with SUBWrr if NZCV is not used.
- SrcReg = MI->getOperand(1).getReg();
- SrcReg2 = MI->getOperand(2).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
@@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
case AArch64::ADDSWri:
case AArch64::SUBSXri:
case AArch64::ADDSXri:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: In order to convert CmpValue to 0 or 1
- CmpValue = (MI->getOperand(2).getImm() != 0);
+ CmpValue = MI.getOperand(2).getImm() != 0;
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
// ANDS does not use the same encoding scheme as the others xxxS
// instructions.
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME:The return val type of decodeLogicalImmediate is uint64_t,
@@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
// the high 32 bits of uint64_t will be lost.
// In fact it causes a bug in spec2006-483.xalancbmk
// CmpValue is only used to compare with zero in OptimizeCompareInstr
- CmpValue = (AArch64_AM::decodeLogicalImmediate(
- MI->getOperand(2).getImm(),
- MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0);
+ CmpValue = AArch64_AM::decodeLogicalImmediate(
+ MI.getOperand(2).getImm(),
+ MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
return true;
}
return false;
}
-static bool UpdateOperandRegClass(MachineInstr *Instr) {
- MachineBasicBlock *MBB = Instr->getParent();
+static bool UpdateOperandRegClass(MachineInstr &Instr) {
+ MachineBasicBlock *MBB = Instr.getParent();
assert(MBB && "Can't get MachineBasicBlock here");
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
@@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+ for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
++OpIdx) {
- MachineOperand &MO = Instr->getOperand(OpIdx);
+ MachineOperand &MO = Instr.getOperand(OpIdx);
const TargetRegisterClass *OpRegCstraints =
- Instr->getRegClassConstraint(OpIdx, TII, TRI);
+ Instr.getRegClassConstraint(OpIdx, TII, TRI);
// If there's no constraint, there's nothing to do.
if (!OpRegCstraints)
@@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
/// \brief Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
-static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
+static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
- if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR))
+ if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
MIDefinesZeroReg = true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
- return MI->getOpcode();
+ return MI.getOpcode();
case AArch64::ADDSWrr:
return AArch64::ADDWrr;
case AArch64::ADDSWri:
@@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
}
}
-/// True when condition code could be modified on the instruction
-/// trace starting at from and ending at to.
-static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To,
- const bool CheckOnlyCCWrites,
- const TargetRegisterInfo *TRI) {
- // We iterate backward starting \p To until we hit \p From
- MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin();
+enum AccessKind {
+ AK_Write = 0x01,
+ AK_Read = 0x10,
+ AK_All = 0x11
+};
+/// True when condition flags are accessed (either by writing or reading)
+/// on the instruction trace starting at From and ending at To.
+///
+/// Note: If From and To are from different blocks it is assumed that the
+/// condition flags are accessed on the path.
+static bool areCFlagsAccessedBetweenInstrs(
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
// Early exit if To is at the beginning of the BB.
- if (I == B)
+ if (To == To->getParent()->begin())
return true;
- // Check whether the definition of SrcReg is in the same basic block as
- // Compare. If not, assume the condition code gets modified on some path.
+ // Check whether the instructions are in the same basic block
+ // If not, assume the condition flags might get modified somewhere.
if (To->getParent() != From->getParent())
return true;
- // Check that NZCV isn't set on the trace.
- for (--I; I != E; --I) {
- const MachineInstr &Instr = *I;
+ // From must be above To.
+ assert(std::find_if(MachineBasicBlock::reverse_iterator(To),
+ To->getParent()->rend(), [From](MachineInstr &MI) {
+ return MachineBasicBlock::iterator(MI) == From;
+ }) != To->getParent()->rend());
- if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
- (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI)))
- // This instruction modifies or uses NZCV after the one we want to
- // change.
- return true;
- if (I == B)
- // We currently don't allow the instruction trace to cross basic
- // block boundaries
+ // We iterate backward starting \p To until we hit \p From.
+ for (--To; To != From; --To) {
+ const MachineInstr &Instr = *To;
+
+ if (((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
+ ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
return true;
}
return false;
}
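
A standalone sketch of the AccessKind filter above (the MachineInstr queries are replaced by plain booleans; this is illustrative only): AK_Write and AK_Read are independent bits, so AK_All catches both kinds of access.

#include <cassert>

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

// Mirrors the per-instruction test in areCFlagsAccessedBetweenInstrs: the
// instruction matters if it performs an access of a kind the caller asked for.
static bool accessMatters(bool ModifiesNZCV, bool ReadsNZCV,
                          AccessKind AccessToCheck) {
  return ((AccessToCheck & AK_Write) && ModifiesNZCV) ||
         ((AccessToCheck & AK_Read) && ReadsNZCV);
}

int main() {
  assert(accessMatters(/*ModifiesNZCV=*/true, /*ReadsNZCV=*/false, AK_Write));
  assert(!accessMatters(false, true, AK_Write)); // reads ignored for AK_Write
  assert(accessMatters(false, true, AK_All));    // AK_All catches reads too
  return 0;
}
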
-/// optimizeCompareInstr - Convert the instruction supplying the argument to the
-/// comparison into one that sets the zero bit in the flags register.
+
+/// Try to optimize a compare instruction. A compare instruction is an
+/// instruction which produces AArch64::NZCV. It is a true compare instruction
+/// when there are no uses of its destination register.
+///
+/// The following steps are tried in order:
+/// 1. Convert CmpInstr into an unconditional version.
+/// 2. Remove CmpInstr if there is an earlier instruction that produces a needed
+/// condition code, or an instruction which can be converted into one that does.
+/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
- MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
+ assert(CmpInstr.getParent());
+ assert(MRI);
// Replace SUBSWrr with SUBWrr if NZCV is not used.
- int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
- if (Cmp_NZCV != -1) {
- if (CmpInstr->definesRegister(AArch64::WZR) ||
- CmpInstr->definesRegister(AArch64::XZR)) {
- CmpInstr->eraseFromParent();
+ int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
+ if (DeadNZCVIdx != -1) {
+ if (CmpInstr.definesRegister(AArch64::WZR) ||
+ CmpInstr.definesRegister(AArch64::XZR)) {
+ CmpInstr.eraseFromParent();
return true;
}
- unsigned Opc = CmpInstr->getOpcode();
+ unsigned Opc = CmpInstr.getOpcode();
unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
- CmpInstr->setDesc(MCID);
- CmpInstr->RemoveOperand(Cmp_NZCV);
+ CmpInstr.setDesc(MCID);
+ CmpInstr.RemoveOperand(DeadNZCVIdx);
bool succeeded = UpdateOperandRegClass(CmpInstr);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
@@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return false;
// CmpInstr is a Compare instruction if destination register is not used.
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
- return false;
-
- // Get the unique definition of SrcReg.
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
- if (!MI)
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
- bool CheckOnlyCCWrites = false;
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI))
- return false;
+ return substituteCmpToZero(CmpInstr, SrcReg, MRI);
+}
- unsigned NewOpc = MI->getOpcode();
- switch (MI->getOpcode()) {
+/// Get the opcode of the S (flag-setting) version of Instr.
+/// If Instr is already an S version, its opcode is returned.
+/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
+/// version or we are not interested in it.
+static unsigned sForm(MachineInstr &Instr) {
+ switch (Instr.getOpcode()) {
default:
- return false;
+ return AArch64::INSTRUCTION_LIST_END;
+
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
@@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr(
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
- break;
- case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break;
- case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break;
- case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break;
- case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break;
- case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break;
- case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break;
- case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break;
- case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break;
- case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break;
- case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break;
- case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break;
- case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break;
- case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break;
- case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break;
- }
-
- // Scan forward for the use of NZCV.
- // When checking against MI: if it's a conditional code requires
- // checking of V bit, then this is not safe to do.
- // It is safe to remove CmpInstr if NZCV is redefined or killed.
- // If we are done with the basic block, we need to check whether NZCV is
- // live-out.
- bool IsSafe = false;
- for (MachineBasicBlock::iterator I = CmpInstr,
- E = CmpInstr->getParent()->end();
- !IsSafe && ++I != E;) {
- const MachineInstr &Instr = *I;
- for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
- ++IO) {
- const MachineOperand &MO = Instr.getOperand(IO);
- if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) {
- IsSafe = true;
- break;
- }
- if (!MO.isReg() || MO.getReg() != AArch64::NZCV)
- continue;
- if (MO.isDef()) {
- IsSafe = true;
- break;
- }
+ return Instr.getOpcode();
+
+ case AArch64::ADDWrr: return AArch64::ADDSWrr;
+ case AArch64::ADDWri: return AArch64::ADDSWri;
+ case AArch64::ADDXrr: return AArch64::ADDSXrr;
+ case AArch64::ADDXri: return AArch64::ADDSXri;
+ case AArch64::ADCWr: return AArch64::ADCSWr;
+ case AArch64::ADCXr: return AArch64::ADCSXr;
+ case AArch64::SUBWrr: return AArch64::SUBSWrr;
+ case AArch64::SUBWri: return AArch64::SUBSWri;
+ case AArch64::SUBXrr: return AArch64::SUBSXrr;
+ case AArch64::SUBXri: return AArch64::SUBSXri;
+ case AArch64::SBCWr: return AArch64::SBCSWr;
+ case AArch64::SBCXr: return AArch64::SBCSXr;
+ case AArch64::ANDWri: return AArch64::ANDSWri;
+ case AArch64::ANDXri: return AArch64::ANDSXri;
+ }
+}
- // Decode the condition code.
- unsigned Opc = Instr.getOpcode();
- AArch64CC::CondCode CC;
- switch (Opc) {
- default:
- return false;
- case AArch64::Bcc:
- CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm();
- break;
- case AArch64::CSINVWr:
- case AArch64::CSINVXr:
- case AArch64::CSINCWr:
- case AArch64::CSINCXr:
- case AArch64::CSELWr:
- case AArch64::CSELXr:
- case AArch64::CSNEGWr:
- case AArch64::CSNEGXr:
- case AArch64::FCSELSrrr:
- case AArch64::FCSELDrrr:
- CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm();
- break;
- }
+/// Check if AArch64::NZCV should be alive in successors of MBB.
+static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
+ for (auto *BB : MBB->successors())
+ if (BB->isLiveIn(AArch64::NZCV))
+ return true;
+ return false;
+}
- // It is not safe to remove Compare instruction if Overflow(V) is used.
- switch (CC) {
- default:
- // NZCV can be used multiple times, we should continue.
- break;
- case AArch64CC::VS:
- case AArch64CC::VC:
- case AArch64CC::GE:
- case AArch64CC::LT:
- case AArch64CC::GT:
- case AArch64CC::LE:
- return false;
- }
+struct UsedNZCV {
+ bool N;
+ bool Z;
+ bool C;
+ bool V;
+ UsedNZCV(): N(false), Z(false), C(false), V(false) {}
+ UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
+ this->N |= UsedFlags.N;
+ this->Z |= UsedFlags.Z;
+ this->C |= UsedFlags.C;
+ this->V |= UsedFlags.V;
+ return *this;
+ }
+};
+
+/// Find a condition code used by the instruction.
+/// Returns AArch64CC::Invalid if either the instruction does not use condition
+/// codes or we don't optimize CmpInstr in the presence of such instructions.
+static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
+ switch (Instr.getOpcode()) {
+ default:
+ return AArch64CC::Invalid;
+
+ case AArch64::Bcc: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 2);
+ return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
}
+
+ case AArch64::CSINVWr:
+ case AArch64::CSINVXr:
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr:
+ case AArch64::CSELWr:
+ case AArch64::CSELXr:
+ case AArch64::CSNEGWr:
+ case AArch64::CSNEGXr:
+ case AArch64::FCSELSrrr:
+ case AArch64::FCSELDrrr: {
+ int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+ assert(Idx >= 1);
+ return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
+ }
+ }
+}
+
+static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
+ assert(CC != AArch64CC::Invalid);
+ UsedNZCV UsedFlags;
+ switch (CC) {
+ default:
+ break;
+
+ case AArch64CC::EQ: // Z set
+ case AArch64CC::NE: // Z clear
+ UsedFlags.Z = true;
+ break;
+
+ case AArch64CC::HI: // Z clear and C set
+ case AArch64CC::LS: // Z set or C clear
+ UsedFlags.Z = true;
+ case AArch64CC::HS: // C set
+ case AArch64CC::LO: // C clear
+ UsedFlags.C = true;
+ break;
+
+ case AArch64CC::MI: // N set
+ case AArch64CC::PL: // N clear
+ UsedFlags.N = true;
+ break;
+
+ case AArch64CC::VS: // V set
+ case AArch64CC::VC: // V clear
+ UsedFlags.V = true;
+ break;
+
+ case AArch64CC::GT: // Z clear, N and V the same
+ case AArch64CC::LE: // Z set, N and V differ
+ UsedFlags.Z = true;
+ case AArch64CC::GE: // N and V the same
+ case AArch64CC::LT: // N and V differ
+ UsedFlags.N = true;
+ UsedFlags.V = true;
+ break;
}
+ return UsedFlags;
+}
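
The fall-throughs in getUsedNZCV above are deliberate: HI/LS need C in addition to Z, and GT/LE need N and V in addition to Z. A standalone model of a few cases (the CondCode enum below is a stand-in, not the real AArch64CC):

#include <cassert>

struct UsedNZCV { bool N = false, Z = false, C = false, V = false; };

enum CondCode { EQ, HI, HS, GT, GE }; // illustrative subset

static UsedNZCV usedFlags(CondCode CC) {
  UsedNZCV U;
  switch (CC) {
  case EQ: // Z set
    U.Z = true;
    break;
  case HI: // Z clear and C set
    U.Z = true;
    // deliberate fall-through: HI also needs C
  case HS: // C set
    U.C = true;
    break;
  case GT: // Z clear, N and V the same
    U.Z = true;
    // deliberate fall-through: GT also needs N and V
  case GE: // N and V the same
    U.N = true;
    U.V = true;
    break;
  }
  return U;
}

int main() {
  assert(usedFlags(EQ).Z && !usedFlags(EQ).C);                   // EQ: only Z
  assert(usedFlags(HI).Z && usedFlags(HI).C);                    // HI: Z and C
  assert(usedFlags(GT).Z && usedFlags(GT).N && usedFlags(GT).V); // GT: Z, N, V
  return 0;
}
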
+
+static bool isADDSRegImm(unsigned Opcode) {
+ return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
+}
+
+static bool isSUBSRegImm(unsigned Opcode) {
+ return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
+}
+
+/// Check if CmpInstr can be substituted by MI.
+///
+/// CmpInstr can be substituted:
+/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
+/// - and, MI and CmpInstr are from the same MachineBB
+/// - and, condition flags are not alive in successors of the CmpInstr parent
+/// - and, if MI opcode is the S form there must be no defs of flags between
+/// MI and CmpInstr
+/// or if MI opcode is not the S form there must be neither defs of flags
+/// nor uses of flags between MI and CmpInstr.
+/// - and C/V flags are not used after CmpInstr
+static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
+ const TargetRegisterInfo *TRI) {
+ assert(MI);
+ assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
+ assert(CmpInstr);
+
+ const unsigned CmpOpcode = CmpInstr->getOpcode();
+ if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
+ return false;
- // If NZCV is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if (!IsSafe) {
- MachineBasicBlock *ParentBlock = CmpInstr->getParent();
- for (auto *MBB : ParentBlock->successors())
- if (MBB->isLiveIn(AArch64::NZCV))
+ if (MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
+ return false;
+
+ AccessKind AccessToCheck = AK_Write;
+ if (sForm(*MI) != MI->getOpcode())
+ AccessToCheck = AK_All;
+ if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
+ return false;
+
+ UsedNZCV NZCVUsedAfterCmp;
+ for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end();
+ I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ if (Instr.readsRegister(AArch64::NZCV, TRI)) {
+ AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
+ if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
return false;
+ NZCVUsedAfterCmp |= getUsedNZCV(CC);
+ }
+
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI))
+ break;
}
+
+ return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
+}
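
Why C and V must be unused: after an S-form instruction such as subs w8, w9, #4, a following cmp w8, #0 produces the same N and Z (both reflect the value left in w8) but generally different C and V, so the compare can only be removed when nothing after it reads C or V. A standalone model of that final check (the register names in this note are made up for illustration):

#include <cassert>

struct UsedNZCV { bool N = false, Z = false, C = false, V = false; };

// Mirrors the decision at the end of canInstrSubstituteCmpInstr.
static bool cmpToZeroIsRemovable(const UsedNZCV &NZCVUsedAfterCmp) {
  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}

int main() {
  UsedNZCV OnlyZ;
  OnlyZ.Z = true;  // e.g. the compare feeds a b.eq
  UsedNZCV NeedsC;
  NeedsC.C = true; // e.g. the compare feeds a b.hs
  assert(cmpToZeroIsRemovable(OnlyZ));
  assert(!cmpToZeroIsRemovable(NeedsC));
  return 0;
}
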
+
+/// Substitute an instruction comparing to zero with another instruction
+/// which produces needed condition flags.
+///
+/// Return true on success.
+bool AArch64InstrInfo::substituteCmpToZero(
+ MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const {
+ assert(MRI);
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ unsigned NewOpc = sForm(*MI);
+ if (NewOpc == AArch64::INSTRUCTION_LIST_END)
+ return false;
+
+ if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
+ return false;
// Update the instruction to set NZCV.
MI->setDesc(get(NewOpc));
- CmpInstr->eraseFromParent();
- bool succeeded = UpdateOperandRegClass(MI);
+ CmpInstr.eraseFromParent();
+ bool succeeded = UpdateOperandRegClass(*MI);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
MI->addRegisterDefined(AArch64::NZCV, TRI);
return true;
}
-bool
-AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
return false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
- cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
@@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_GOT);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill).addImm(0)
- .addMemOperand(*MI->memoperands_begin());
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
@@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill).addImm(0)
- .addMemOperand(*MI->memoperands_begin());
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
} else {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
@@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
- .addMemOperand(*MI->memoperands_begin());
+ .addMemOperand(*MI.memoperands_begin());
}
MBB.erase(MI);
@@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::ADDSWrs:
@@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
case AArch64::SUBSXrs:
case AArch64::SUBWrs:
case AArch64::SUBXrs:
- if (MI->getOperand(3).isImm()) {
- unsigned val = MI->getOperand(3).getImm();
+ if (MI.getOperand(3).isImm()) {
+ unsigned val = MI.getOperand(3).getImm();
return (val != 0);
}
break;
@@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
}
/// Return true if this instruction has a non-zero immediate
-bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::ADDSWrx:
@@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
case AArch64::SUBWrx:
case AArch64::SUBXrx:
case AArch64::SUBXrx64:
- if (MI->getOperand(3).isImm()) {
- unsigned val = MI->getOperand(3).getImm();
+ if (MI.getOperand(3).isImm()) {
+ unsigned val = MI.getOperand(3).getImm();
return (val != 0);
}
break;
@@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
-bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::MOVZWi:
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
- if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
- assert(MI->getDesc().getNumOperands() == 3 &&
- MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+ if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
+ assert(MI.getDesc().getNumOperands() == 3 &&
+ MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
return true;
}
break;
case AArch64::ANDWri: // and Rd, Rzr, #imm
- return MI->getOperand(1).getReg() == AArch64::WZR;
+ return MI.getOperand(1).getReg() == AArch64::WZR;
case AArch64::ANDXri:
- return MI->getOperand(1).getReg() == AArch64::XZR;
+ return MI.getOperand(1).getReg() == AArch64::XZR;
case TargetOpcode::COPY:
- return MI->getOperand(1).getReg() == AArch64::WZR;
+ return MI.getOperand(1).getReg() == AArch64::WZR;
}
return false;
}
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// GPR32 copies will be lowered to ORRXrs
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
- if (MI->getOperand(1).getReg() == AArch64::XZR) {
- assert(MI->getDesc().getNumOperands() == 4 &&
- MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+ if (MI.getOperand(1).getReg() == AArch64::XZR) {
+ assert(MI.getDesc().getNumOperands() == 4 &&
+ MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
- if (MI->getOperand(2).getImm() == 0) {
- assert(MI->getDesc().getNumOperands() == 4 &&
- MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+ if (MI.getOperand(2).getImm() == 0) {
+ assert(MI.getDesc().getNumOperands() == 4 &&
+ MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
break;
@@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
// Return true if this instruction simply renames a general register without
// modifying bits.
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// FPR64 copies will be lowered to ORR.16b
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
case AArch64::ORRv16i8:
- if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
- assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+ assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
"invalid ORRv16i8 operands");
return true;
}
@@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
return false;
}
-unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRWui:
@@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
- if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::STRWui:
@@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
- if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
/// Return true if this load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRBBroW:
@@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
case AArch64::STRWroX:
case AArch64::STRXroX:
- unsigned Val = MI->getOperand(3).getImm();
+ unsigned Val = MI.getOperand(3).getImm();
AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
}
@@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
}
/// Check all MachineMemOperands for a hint to suppress pairing.
-bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
- assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
- "Too many target MO flags");
- for (auto *MM : MI->memoperands()) {
- if (MM->getFlags() &
- (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
- return true;
- }
- }
- return false;
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+ return any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+ return MMO->getFlags() & MOSuppressPair;
+ });
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
-void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
- if (MI->memoperands_empty())
+void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+ if (MI.memoperands_empty())
return;
+ (*MI.memoperands_begin())->setFlags(MOSuppressPair);
+}
- assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
- "Too many target MO flags");
- (*MI->memoperands_begin())
- ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ return true;
+ }
}
-bool
-AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
- const TargetRegisterInfo *TRI) const {
- switch (LdSt->getOpcode()) {
+bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
+ return isUnscaledLdSt(MI.getOpcode());
+}
+
+// Is this a candidate for ld/st merging or pairing? For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI.hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+ if (!MI.getOperand(2).isImm())
+ return false;
+
+ // Can't merge/pair if the instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ unsigned BaseReg = MI.getOperand(1).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (MI.modifiesRegister(BaseReg, TRI))
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (isLdStPairSuppressed(MI))
+ return false;
+
+ // On some CPUs quad load/store pairs are slower than two single load/stores.
+ if (Subtarget.avoidQuadLdStPairs()) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+ const TargetRegisterInfo *TRI) const {
+ switch (LdSt.getOpcode()) {
default:
return false;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
@@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
- if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
- return false;
- BaseReg = LdSt->getOperand(1).getReg();
- MachineFunction &MF = *LdSt->getParent()->getParent();
- unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
- Offset = LdSt->getOperand(2).getImm() * Width;
- return true;
+ case AArch64::LDRSWui:
+ // Unscaled instructions.
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
+ case AArch64::STURXi:
+ case AArch64::STURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURDi:
+ case AArch64::LDURQi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ case AArch64::LDURSWi:
+ unsigned Width;
+ return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
};
}
bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
- MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width,
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
- if (LdSt->getNumOperands() != 3)
- return false;
- if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ if (LdSt.getNumExplicitOperands() == 3) {
+ // Non-paired instruction (e.g., ldr x1, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+ return false;
+ } else if (LdSt.getNumExplicitOperands() == 4) {
+ // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+ !LdSt.getOperand(3).isImm())
+ return false;
+ } else
return false;
// Offset is calculated as the immediate operand multiplied by the scaling factor.
// Unscaled instructions have scaling factor set to 1.
- int Scale = 0;
- switch (LdSt->getOpcode()) {
+ unsigned Scale = 0;
+ switch (LdSt.getOpcode()) {
default:
return false;
case AArch64::LDURQi:
@@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
Width = 1;
Scale = 1;
break;
+ case AArch64::LDPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STPQi:
+ case AArch64::STNPQi:
+ Scale = 16;
+ Width = 32;
+ break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
break;
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ Scale = 8;
+ Width = 16;
+ break;
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
break;
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ Scale = 4;
+ Width = 8;
+ break;
case AArch64::LDRWui:
case AArch64::LDRSui:
+ case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
@@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STRBBui:
Scale = Width = 1;
break;
- };
+ }
- BaseReg = LdSt->getOperand(1).getReg();
- Offset = LdSt->getOperand(2).getImm() * Scale;
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
return true;
}
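
For example (illustrative values): an LDPXi whose immediate operand is 3 (assembly ldp x1, x2, [x0, #24]) reports BaseReg = x0, Offset = 3 * 8 = 24 and Width = 16, i.e. the pair covers bytes [24, 40) from the base. A standalone sketch of that scaling step:

#include <cassert>
#include <cstdint>

// The encoded immediate of a scaled load/store is an element index; the byte
// offset reported by getMemOpBaseRegImmOfsWidth is that index times Scale.
static int64_t byteOffset(int64_t EncodedImm, unsigned Scale) {
  return EncodedImm * Scale;
}

int main() {
  assert(byteOffset(3, 8) == 24);   // LDPXi / LDRXui: 8-byte elements
  assert(byteOffset(3, 16) == 48);  // LDRQui: 16-byte elements
  assert(byteOffset(-4, 8) == -32); // paired offsets may be negative
  return 0;
}
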
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = 1;
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ OffsetStride = 16;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ case AArch64::STURXi:
+ case AArch64::STURDi:
+ OffsetStride = 8;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::STURSi:
+ OffsetStride = 4;
+ break;
+ }
+ // If the byte-offset isn't a multiple of the stride, we can't scale this
+ // offset.
+ if (Offset % OffsetStride != 0)
+ return false;
+
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ Offset /= OffsetStride;
+ return true;
+}
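
A standalone sketch of the conversion scaleOffset performs for unscaled instructions (the stride of 8 below is the one the table above assigns to LDURXi/STURXi and friends): the byte offset must be stride-aligned and is rewritten in place as an element offset.

#include <cassert>
#include <cstdint>

// Byte offset -> element offset, as scaleOffset does before pairing decisions.
static bool scaleUnscaledOffset(int64_t &Offset, unsigned OffsetStride) {
  if (Offset % OffsetStride != 0)
    return false;          // misaligned for this stride: can't be paired
  Offset /= OffsetStride;
  return true;
}

int main() {
  int64_t Off = 16;
  assert(scaleUnscaledOffset(Off, 8) && Off == 2); // ldur x1, [x0, #16] -> element 2
  Off = 12;
  assert(!scaleUnscaledOffset(Off, 8));            // 12 is not a multiple of 8
  return 0;
}
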
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+ if (FirstOpc == SecondOpc)
+ return true;
+ // We can also pair sign-ext and zero-ext instructions.
+ switch (FirstOpc) {
+ default:
+ return false;
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ }
+ // These instructions can't be paired based on their opcodes.
+ return false;
+}
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
-bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const {
+bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ MachineInstr &SecondLdSt,
+ unsigned NumLoads) const {
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
- if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+
+ // Can we pair these instructions based on their opcodes?
+ unsigned FirstOpc = FirstLdSt.getOpcode();
+ unsigned SecondOpc = SecondLdSt.getOpcode();
+ if (!canPairLdStOpc(FirstOpc, SecondOpc))
+ return false;
+
+ // Can't merge volatiles or load/stores that have a hint to avoid pair
+ // formation, for example.
+ if (!isCandidateToMergeOrPair(FirstLdSt) ||
+ !isCandidateToMergeOrPair(SecondLdSt))
+ return false;
+
+ // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+ int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
+ if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+ return false;
+
+ int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
+ if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
- // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
- // Allow 6 bits of positive range.
- if (Ofs1 > 64)
+
+ // Pairwise instructions have a 7-bit signed offset field.
+ if (Offset1 > 63 || Offset1 < -64)
return false;
+
// The caller should already have ordered First/SecondLdSt by offset.
- unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
- return Ofs1 + 1 == Ofs2;
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ return Offset1 + 1 == Offset2;
}
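
Putting it together (illustrative only): after any unscaled offsets have been converted to element offsets, two candidate memory operations cluster exactly when the first offset fits the 7-bit signed pairwise field and the second is the next element. A standalone model of that final check:

#include <cassert>
#include <cstdint>

// Mirrors the offset test at the end of shouldClusterMemOps; the caller is
// assumed to have already ordered the two operations by offset.
static bool offsetsAllowClustering(int64_t Offset1, int64_t Offset2) {
  if (Offset1 > 63 || Offset1 < -64) // pairwise instructions: 7-bit signed field
    return false;
  return Offset1 + 1 == Offset2;
}

int main() {
  assert(offsetsAllowClustering(4, 5));      // e.g. ldr x1,[x0,#32] + ldr x2,[x0,#40]
  assert(!offsetsAllowClustering(4, 6));     // not adjacent elements
  assert(!offsetsAllowClustering(100, 101)); // first offset out of ldp range
  return 0;
}
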
-bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
- MachineInstr *Second) const {
- if (Subtarget.isCyclone()) {
- // Cyclone can fuse CMN, CMP, TST followed by Bcc.
- unsigned SecondOpcode = Second->getOpcode();
+bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const {
+ if (Subtarget.hasMacroOpFusion()) {
+ // Fuse CMN, CMP, TST followed by Bcc.
+ unsigned SecondOpcode = Second.getOpcode();
if (SecondOpcode == AArch64::Bcc) {
- switch (First->getOpcode()) {
+ switch (First.getOpcode()) {
default:
return false;
case AArch64::SUBSWri:
@@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
return true;
}
}
- // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ // Fuse ALU operations followed by CBZ/CBNZ.
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
- switch (First->getOpcode()) {
+ switch (First.getOpcode()) {
default:
return false;
case AArch64::ADDWri:
@@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
- const MDNode *Expr, DebugLoc DL) const {
+ const MDNode *Expr, const DebugLoc &DL) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
.addFrameIndex(FrameIx)
.addImm(0)
@@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
}
void AArch64InstrInfo::copyPhysRegTuple(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const {
assert(Subtarget.hasNEON() &&
@@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple(
}
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (SrcReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
- BuildMI(MBB, I, DL, get(AArch64::MRS))
- .addReg(DestReg)
+ BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
.addImm(AArch64SysReg::NZCV)
.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
return;
@@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot(
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Twov1d, Offset = false;
+ Opc = AArch64::ST1Twov1d;
+ Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Threev1d, Offset = false;
+ Opc = AArch64::ST1Threev1d;
+ Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Fourv1d, Offset = false;
+ Opc = AArch64::ST1Fourv1d;
+ Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Twov2d, Offset = false;
+ Opc = AArch64::ST1Twov2d;
+ Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Threev2d, Offset = false;
+ Opc = AArch64::ST1Threev2d;
+ Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
- Opc = AArch64::ST1Fourv2d, Offset = false;
+ Opc = AArch64::ST1Fourv2d;
+ Offset = false;
}
break;
}
@@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot(
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Twov1d, Offset = false;
+ Opc = AArch64::LD1Twov1d;
+ Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Threev1d, Offset = false;
+ Opc = AArch64::LD1Threev1d;
+ Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Fourv1d, Offset = false;
+ Opc = AArch64::LD1Fourv1d;
+ Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Twov2d, Offset = false;
+ Opc = AArch64::LD1Twov2d;
+ Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Threev2d, Offset = false;
+ Opc = AArch64::LD1Threev2d;
+ Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
- Opc = AArch64::LD1Fourv2d, Offset = false;
+ Opc = AArch64::LD1Fourv2d;
+ Offset = false;
}
break;
}
@@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot(
}
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg, int Offset,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV) {
if (DestReg == SrcReg && Offset == 0)
return;
+ assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
+ "SP increment/decrement not 16-byte aligned");
+
bool isSub = Offset < 0;
if (isSub)
Offset = -Offset;
@@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
}
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS) const {
// This is a bit of a hack. Consider this instruction:
//
// %vreg0<def> = COPY %SP; GPR64all:%vreg0
@@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
//
// <rdar://problem/11522048>
//
- if (MI->isCopy()) {
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ if (MI.isCopy()) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AArch64::SP &&
TargetRegisterInfo::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
@@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
-/// useMachineCombiner - return true when a target supports MachineCombiner
+
+// AArch64 supports MachineCombiner.
bool AArch64InstrInfo::useMachineCombiner() const {
- // AArch64 supports the combiner
+
return true;
}
//
@@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
return false;
}
//
+// FP Opcodes that can be combined with a FMUL
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+ switch (Inst.getOpcode()) {
+ case AArch64::FADDSrr:
+ case AArch64::FADDDrr:
+ case AArch64::FADDv2f32:
+ case AArch64::FADDv2f64:
+ case AArch64::FADDv4f32:
+ case AArch64::FSUBSrr:
+ case AArch64::FSUBDrr:
+ case AArch64::FSUBv2f32:
+ case AArch64::FSUBv2f64:
+ case AArch64::FSUBv4f32:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ break;
+ }
+ return false;
+}
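
The UnsafeFPMath gate above matters because a fused multiply-add rounds once, while fmul followed by fadd rounds twice, so the combined form can give a different result. A standalone demonstration (the printed values assume standard IEEE double arithmetic):

#include <cmath>
#include <cstdio>

int main() {
  double eps = std::ldexp(1.0, -27); // 2^-27
  double a = 1.0 + eps;
  double b = 1.0 - eps;
  double c = -1.0;
  // a*b is exactly 1 - 2^-54, which rounds to 1.0 in double, so the
  // separately rounded form yields 0.0 ...
  double separate = a * b + c;
  // ... while the fused form keeps the -2^-54 term.
  double fused = std::fma(a, b, c);
  std::printf("separate = %a\nfused    = %a\n", separate, fused);
  return 0;
}
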
+//
// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}
-static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
- unsigned MulOpc, unsigned ZeroReg) {
+//
+// Utility routine that checks if \param MO is defined by an
+// \param CombineOpc instruction in the basic block \param MBB
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned CombineOpc, unsigned ZeroReg = 0,
+ bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
- // We need a virtual register definition.
+
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
- if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
- return false;
-
- assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
- MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
-
- // The third input reg must be zero.
- if (MI->getOperand(3).getReg() != ZeroReg)
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
-
// Must only used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
+ if (CheckZeroReg) {
+ assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+ // The third input reg must be zero.
+ if (MI->getOperand(3).getReg() != ZeroReg)
+ return false;
+ }
+
return true;
}
+//
+// Is \param MO defined by an integer multiply and can be combined?
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc, unsigned ZeroReg) {
+ return canCombine(MBB, MO, MulOpc, ZeroReg, true);
+}
+
+//
+// Is \param MO defined by a floating-point multiply and can be combined?
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc) {
+ return canCombine(MBB, MO, MulOpc);
+}
+
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
@@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root,
bool Found = false;
if (!isCombineInstrCandidate(Opc))
- return 0;
+ return false;
if (isCombineInstrSettingFlag(Opc)) {
int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
- return 0;
- unsigned NewOpc = convertFlagSettingOpcode(&Root);
+ return false;
+ unsigned NewOpc = convertFlagSettingOpcode(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
- return 0;
+ return false;
Opc = NewOpc;
}
@@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root,
}
return Found;
}
+/// Floating-Point Support
+
+/// Find instructions that can be turned into madd.
+static bool getFMAPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+
+ if (!isCombineInstrCandidateFP(Root))
+ return 0;
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ switch (Root.getOpcode()) {
+ default:
+ assert(false && "Unsupported FP instruction in combiner\n");
+ break;
+ case AArch64::FADDSrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "FADDWrr does not have register operands");
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
+ Found = true;
+ }
+ break;
+
+ case AArch64::FSUBSrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
+ Found = true;
+ }
+ break;
+ }
+ return Found;
+}
+
+/// Return true when a code sequence can improve throughput. It
+/// should be called only for instructions in loops.
+/// \param Pattern - combiner pattern
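+/// Only the floating-point fused multiply-add/subtract patterns produced by
+/// getFMAPatterns() are reported as throughput patterns here.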
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ switch (Pattern) {
+ default:
+ break;
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ return true;
+ } // end switch (Pattern)
+ return false;
+}
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root,
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ // Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
+ // Floating point patterns
+ if (getFMAPatterns(Root, Patterns))
+ return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
-/// genMadd - Generate madd instruction and combine mul and add.
-/// Example:
-/// MUL I=A,B,0
-/// ADD R,I,C
-/// ==> MADD R,A,B,C
-/// \param Root is the ADD instruction
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+/// F|MUL I=A,B,0
+/// F|ADD R,I,C
+/// ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
-/// the MUL. In the example above IdxMulOpd is 1.
-/// \param MaddOpc the opcode fo the madd instruction
-static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
- const TargetInstrInfo *TII, MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- unsigned IdxMulOpd, unsigned MaddOpc,
- const TargetRegisterClass *RC) {
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the f|madd instruction
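+/// \param RC Register class of the registers in the fused instruction
+/// \param kind selects the operand order of the generated instruction:
+///        Default emits the multiply operands first, while Indexed and
+///        Accumulator emit the accumulator operand first; Indexed also
+///        copies the lane immediate from the F|MUL.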
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+ unsigned MaddOpc, const TargetRegisterClass *RC,
+ FMAInstKind kind = FMAInstKind::Default) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
@@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
- MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
- ResultReg)
- .addReg(SrcReg0, getKillRegState(Src0IsKill))
- .addReg(SrcReg1, getKillRegState(Src1IsKill))
- .addReg(SrcReg2, getKillRegState(Src2IsKill));
- // Insert the MADD
+ MachineInstrBuilder MIB;
+ if (kind == FMAInstKind::Default)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(SrcReg2, getKillRegState(Src2IsKill));
+ else if (kind == FMAInstKind::Indexed)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addImm(MUL->getOperand(3).getImm());
+ else if (kind == FMAInstKind::Accumulator)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill));
+ else
+ assert(false && "Invalid FMA instruction kind \n");
+ // Insert the fused instruction (MADD, FMADD, FMSUB, FMLA, FMLS)
InsInstrs.push_back(MIB);
return MUL;
}
@@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
@@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
@@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
@@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
+ // Floating Point Support
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ // FMUL I=A,B,0
+ // FADD R,I,C
+ // ==> FMADD R,A,B,C
+ // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ // FMUL I=A,B,0
+ // FADD R,C,I
+ // ==> FMADD R,A,B,C
+ // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP1: {
+ // FMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMSUB R,A,B,C // = -C + A*B
+ // --- Create(FNMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP2: {
+ // FMUL I=A,B,0
+ // FSUB R,C,I
+ // ==> FMSUB R,A,B,C (computes C - A*B)
+ // --- Create(FMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ Opc = AArch64::FMLSv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ Opc = AArch64::FMLSv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+ Opc = AArch64::FMLSv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+ Opc = AArch64::FMLSv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+ Opc = AArch64::FMLSv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
@@ -2940,14 +3817,23 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
/// to
/// b.<condition code>
///
+/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
+/// branched-on register is defined by an AND with a power-of-2 immediate.
+///
+/// Examples:
+/// and w8, w8, #0x400
+/// cbnz w8, L1
+/// to
+/// tbnz w8, #10, L1
+///
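+/// The tested bit index is recovered from the AND immediate: for a
+/// power-of-2 mask the index is Log2_64(Mask), e.g. Log2_64(0x400) == 10
+/// in the example above.
+///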
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
-bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
bool IsNegativeBranch = false;
bool IsTestAndBranch = false;
unsigned TargetBBInMI = 0;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
@@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
// So we increment a zero register and test for bits other
// than bit 0? Conservatively bail out in case the verifier
// missed this case.
- if (IsTestAndBranch && MI->getOperand(1).getImm())
+ if (IsTestAndBranch && MI.getOperand(1).getImm())
return false;
// Find Definition.
- assert(MI->getParent() && "Incomplete machine instruciton\n");
- MachineBasicBlock *MBB = MI->getParent();
+ assert(MI.getParent() && "Incomplete machine instruction\n");
+ MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- unsigned VReg = MI->getOperand(0).getReg();
+ unsigned VReg = MI.getOperand(0).getReg();
if (!TargetRegisterInfo::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
- // Look for CSINC
- if (!(DefMI->getOpcode() == AArch64::CSINCWr &&
- DefMI->getOperand(1).getReg() == AArch64::WZR &&
- DefMI->getOperand(2).getReg() == AArch64::WZR) &&
- !(DefMI->getOpcode() == AArch64::CSINCXr &&
- DefMI->getOperand(1).getReg() == AArch64::XZR &&
- DefMI->getOperand(2).getReg() == AArch64::XZR))
- return false;
+ // Look through COPY instructions to find definition.
+ while (DefMI->isCopy()) {
+ unsigned CopyVReg = DefMI->getOperand(1).getReg();
+ if (!MRI->hasOneNonDBGUse(CopyVReg))
+ return false;
+ if (!MRI->hasOneDef(CopyVReg))
+ return false;
+ DefMI = MRI->getVRegDef(CopyVReg);
+ }
- if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ switch (DefMI->getOpcode()) {
+ default:
return false;
+ // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
+ case AArch64::ANDWri:
+ case AArch64::ANDXri: {
+ if (IsTestAndBranch)
+ return false;
+ if (DefMI->getParent() != MBB)
+ return false;
+ if (!MRI->hasOneNonDBGUse(VReg))
+ return false;
- AArch64CC::CondCode CC =
- (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
- bool CheckOnlyCCWrites = true;
- // Convert only when the condition code is not modified between
- // the CSINC and the branch. The CC may be used by other
- // instructions in between.
- if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo()))
- return false;
- MachineBasicBlock &RefToMBB = *MBB;
- MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB();
- DebugLoc DL = MI->getDebugLoc();
- if (IsNegativeBranch)
- CC = AArch64CC::getInvertedCondCode(CC);
- BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
- MI->eraseFromParent();
- return true;
+ bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
+ uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
+ DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
+ if (!isPowerOf2_64(Mask))
+ return false;
+
+ MachineOperand &MO = DefMI->getOperand(1);
+ unsigned NewReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+ return false;
+
+ assert(!MRI->def_empty(NewReg) && "Register must be defined.");
+
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Imm = Log2_64(Mask);
+ unsigned Opc = (Imm < 32)
+ ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
+ : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
+ MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
+ .addReg(NewReg)
+ .addImm(Imm)
+ .addMBB(TBB);
+ // Register lives on to the TBZ/TBNZ now.
+ MO.setIsKill(false);
+
+ // Bit positions smaller than 32 must use the 32-bit (W) variant in all
+ // cases; the 64-bit variant cannot encode them. Therefore, if the input
+ // register is 64-bit, take its 32-bit sub-register.
+ if (!Is32Bit && Imm < 32)
+ NewMI->getOperand(0).setSubReg(AArch64::sub_32);
+ MI.eraseFromParent();
+ return true;
+ }
+ // Look for CSINC
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr: {
+ if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
+ DefMI->getOperand(2).getReg() == AArch64::WZR) &&
+ !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
+ DefMI->getOperand(2).getReg() == AArch64::XZR))
+ return false;
+
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ return false;
+
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+ // Convert only when the condition code is not modified between
+ // the CSINC and the branch. The CC may be used by other
+ // instructions in between.
+ if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
+ return false;
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
+ if (IsNegativeBranch)
+ CC = AArch64CC::getInvertedCondCode(CC);
+ BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
}
std::pair<unsigned, unsigned>
@@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_GOT, "aarch64-got"},
{MO_NC, "aarch64-nc"},
- {MO_TLS, "aarch64-tls"},
- {MO_CONSTPOOL, "aarch64-constant-pool"}};
+ {MO_TLS, "aarch64-tls"}};
return makeArrayRef(TargetFlags);
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index b5bb446f8c16..24bc0e639747 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -28,12 +28,6 @@ class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64InstrInfo : public AArch64GenInstrInfo {
- // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
- // They will be shifted into MOTargetHintStart when accessed.
- enum TargetMemOperandFlags {
- MOSuppressPair = 1
- };
-
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
@@ -45,76 +39,88 @@ public:
/// always be able to get register info as well (through this method).
const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
- unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
- bool isAsCheapAsAMove(const MachineInstr *MI) const override;
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DstReg, unsigned &SubIdx) const override;
bool
- areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// Returns true if there is a shiftable register and that the shift value
/// is non-zero.
- bool hasShiftedReg(const MachineInstr *MI) const;
+ bool hasShiftedReg(const MachineInstr &MI) const;
/// Returns true if there is an extendable register and that the extending
/// value is non-zero.
- bool hasExtendedReg(const MachineInstr *MI) const;
+ bool hasExtendedReg(const MachineInstr &MI) const;
/// \brief Does this instruction set its full destination register to zero?
- bool isGPRZero(const MachineInstr *MI) const;
+ bool isGPRZero(const MachineInstr &MI) const;
/// \brief Does this instruction rename a GPR without modifying bits?
- bool isGPRCopy(const MachineInstr *MI) const;
+ bool isGPRCopy(const MachineInstr &MI) const;
/// \brief Does this instruction rename an FPR without modifying bits?
- bool isFPRCopy(const MachineInstr *MI) const;
+ bool isFPRCopy(const MachineInstr &MI) const;
/// Return true if this is load/store scales or extends its register offset.
/// This refers to scaling a dynamic index as opposed to scaled immediates.
/// MI should be a memory op that allows scaled addressing.
- bool isScaledAddr(const MachineInstr *MI) const;
+ bool isScaledAddr(const MachineInstr &MI) const;
/// Return true if pairing the given load or store is hinted to be
/// unprofitable.
- bool isLdStPairSuppressed(const MachineInstr *MI) const;
+ bool isLdStPairSuppressed(const MachineInstr &MI) const;
+
+ /// Return true if this is an unscaled load/store.
+ bool isUnscaledLdSt(unsigned Opc) const;
+
+ /// Return true if this is an unscaled load/store.
+ bool isUnscaledLdSt(MachineInstr &MI) const;
+
+ /// Return true if this is a load/store that can be potentially paired/merged.
+ bool isCandidateToMergeOrPair(MachineInstr &MI) const;
/// Hint that pairing the given load or store is unprofitable.
- void suppressLdStPair(MachineInstr *MI) const;
+ void suppressLdStPair(MachineInstr &MI) const;
- bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
- bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg,
- int &Offset, int &Width,
+ bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
bool enableClusterLoads() const override { return true; }
- bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
- unsigned NumLoads) const override;
+ bool enableClusterStores() const override { return true; }
+
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(MachineInstr *First,
- MachineInstr *Second) const override;
+ bool shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const override;
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *Var,
- const MDNode *Expr, DebugLoc DL) const;
+ const MDNode *Expr,
+ const DebugLoc &DL) const;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -129,40 +135,47 @@ public:
const TargetRegisterInfo *TRI) const override;
using TargetInstrInfo::foldMemoryOperandImpl;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int &, int &, int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
void getNoopForMachoTarget(MCInst &NopInst) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction supplying the argument to
/// the comparison into one that sets the zero bit in the flags register.
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
- bool optimizeCondBranch(MachineInstr *MI) const override;
+ bool optimizeCondBranch(MachineInstr &MI) const override;
+
+ /// Return true when a code sequence can improve throughput. It
+ /// should be called only for instructions in loops.
+ /// \param Pattern - combiner pattern
+ bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in <Root>. All potential patterns are
/// listed in the <Patterns> array.
@@ -179,10 +192,10 @@ public:
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
- /// useMachineCombiner - AArch64 supports MachineCombiner
+ /// AArch64 supports MachineCombiner.
bool useMachineCombiner() const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned TF) const override;
@@ -192,9 +205,11 @@ public:
getSerializableBitmaskMachineOperandTargetFlags() const override;
private:
- void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+ void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
+ bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg,
+ const MachineRegisterInfo *MRI) const;
};
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
@@ -202,8 +217,8 @@ private:
/// insertion (PEI) pass, where a virtual scratch register may be allocated
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
- const TargetInstrInfo *TII,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ int Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index d02bc9ff394d..af9ed812e6da 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -26,6 +26,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
@@ -34,7 +36,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -283,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -295,9 +301,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
-//
-def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
-def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
def ForCodeSize : Predicate<"ForCodeSize">;
@@ -312,10 +315,13 @@ include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+// We set Sched to empty list because we expect these instructions to simply get
+// removed in most cases.
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>;
+ [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+ [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
let isReMaterializable = 1, isCodeGenOnly = 1 in {
@@ -383,6 +389,7 @@ def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
@@ -528,6 +535,12 @@ def i64imm_32bit : ImmLeaf<i64, [{
return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;
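+
+// An i64 immediate that fits in a sign-extended 32-bit value; used by the
+// widening SMADDL/SMSUBL constant-multiply patterns.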
+def s64imm_32bit : ImmLeaf<i64, [{
+ int64_t Imm64 = static_cast<int64_t>(Imm);
+ return Imm64 >= std::numeric_limits<int32_t>::min() &&
+ Imm64 <= std::numeric_limits<int32_t>::max();
+}]>;
+
def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
@@ -679,10 +692,11 @@ def : InstAlias<"negs $dst, $src$shift",
// Unsigned/Signed divide
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
-let isCodeGenOnly = 1 in {
-defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
-defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
-}
+
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
// Variable shift
defm ASRV : Shift<0b10, "asr", sra>;
@@ -734,6 +748,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
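+// Fold a multiply by a constant that fits in 32 bits into the widening
+// multiply-accumulate instructions; the constant is materialized into a W
+// register with MOVi32imm after truncation.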
+def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
+ (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
+ (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
+ GPR64:$Ra)),
+ (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+ (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+ (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
+ (s64imm_32bit:$C)))),
+ (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+ (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
} // AddedComplexity = 5
def : MulAccumWAlias<"mul", MADDWrrr>;
@@ -1089,6 +1137,14 @@ def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
(CSINVWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
(CSINVXr XZR, XZR, (i32 imm:$cc))>;
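+// A csel with an all-ones operand selects to CSINV against WZR/XZR; when the
+// -1 is the true operand, the condition is inverted so the other value can
+// keep the register slot.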
+def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
+ (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
+ (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser all ready inverts the condition code
@@ -1158,7 +1214,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
// Create a separate pseudo-instruction for codegen to use so that we don't
// flag lr as used in every function. It'll be restored before the RET by the
// epilogue if it's legitimately used.
-def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
+ Sched<[WriteBrReg]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
@@ -1168,7 +1225,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
// (which in the usual case is a BLR).
let hasSideEffects = 1 in
-def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
let AsmString = ".tlsdesccall $sym";
}
@@ -1178,7 +1235,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
isCodeGenOnly = 1 in
def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
- [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>;
+ [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
+ Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
(TLSDESC_CALLSEQ texternalsym:$sym)>;
@@ -2444,13 +2502,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
-defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+
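+// Match the fcvtzs/fcvtzu intrinsics directly instead of the isCodeGenOnly
+// FCVTZ[SU]_Int duplicates removed above; the fmul-by-fixed-point forms map
+// to the scaled (fixed-point) conversions.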
+multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
+ def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
+ def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
+ def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
+ def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
+ def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
+ def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+
+ def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
+
multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
def : Pat<(i32 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
@@ -2485,13 +2562,11 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
defm FMOV : UnscaledConversion<"fmov">;
// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
-let isReMaterializable = 1, isCodeGenOnly = 1 in {
+let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
- PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>,
- Requires<[NoZCZ]>;
+ Sched<[WriteF]>;
}
//===----------------------------------------------------------------------===//
@@ -2617,6 +2692,7 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
(i32 imm:$cond), NZCV))]> {
let Uses = [NZCV];
let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
}
@@ -2742,12 +2818,19 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
- int_aarch64_neon_fcvtzs>;
-defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
- int_aarch64_neon_fcvtzu>;
-}
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
+
defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
@@ -3318,6 +3401,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
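+// Select the AArch64ISD::FRECPE estimate node to the FRECPE instructions for
+// each supported scalar and vector type.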
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+ (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+ (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+ (FRECPEv2f64 FPR128:$Rn)>;
+
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3330,6 +3426,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+ (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+ (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+ (FRSQRTEv2f64 FPR128:$Rn)>;
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
@@ -4319,18 +4428,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
"movi", ".2d",
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
-
-// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
-// Complexity is added to break a tie with a plain MOVI.
-let AddedComplexity = 1 in {
-def : Pat<(f32 fpimm0),
- (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
- Requires<[HasZCZ]>;
-def : Pat<(f64 fpimm0),
- (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
- Requires<[HasZCZ]>;
-}
-
def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
@@ -4845,7 +4942,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4898,7 +4996,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -5982,7 +6081,7 @@ def : NTStore64Pat<v8i8>;
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
- (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity=10
} // Predicates = [IsLE]
@@ -5990,8 +6089,10 @@ def : Pat<(nontemporalstore GPR64:$Rt,
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
- def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
}
def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 43664df3b861..dca13fc49414 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -33,9 +33,6 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
-/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
-/// load / store instructions to form ldp / stp instructions.
-
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
@@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
-static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
+// The LdStLimit limits how far we search for load/store pairs.
+static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
+// The UpdateLimit limits how far we search for update instructions when we form
+// pre-/post-index instructions.
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
+ cl::Hidden);
+
+static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable narrow load merge"));
+
namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}
@@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
+ // Track which registers have been modified and used.
+ BitVector ModifiedRegs, UsedRegs;
+
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,
- unsigned Limit);
+ unsigned Limit,
+ bool FindNarrowMerge);
// Scan the instructions looking for a store that writes to the address from
// which the current load instruction reads. Return true if one is found.
bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI);
+ // Merge the two instructions indicated into a wider instruction.
+ MachineBasicBlock::iterator
+ mergeNarrowInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
+ const LdStPairFlags &Flags);
+
// Merge the two instructions indicated into a single pair-wise instruction.
- // If MergeForward is true, erase the first instruction and fold its
- // operation into the second. If false, the reverse. Return the instruction
- // following the first instruction (which may change during processing).
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
- findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
- int UnscaledOffset);
+ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
+ int UnscaledOffset, unsigned Limit);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
@@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find an instruction that updates the base register of the ld/st
// instruction.
- bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
+ bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
// Merge a pre- or post-index base register update into a ld/st instruction.
@@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+ // Find and pair ldr/str instructions.
+ bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
+
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
- // Check if converting two narrow loads into a single wider load with
- // bitfield extracts could be enabled.
- bool enableNarrowLdMerge(MachineFunction &Fn);
-
bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
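+
+  // This pass runs after register allocation and expects no virtual registers.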
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return AARCH64_LOAD_STORE_OPT_NAME;
}
@@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0;
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
-static bool isUnscaledLdSt(unsigned Opc) {
- switch (Opc) {
- default:
- return false;
- case AArch64::STURSi:
- case AArch64::STURDi:
- case AArch64::STURQi:
- case AArch64::STURBBi:
- case AArch64::STURHHi:
- case AArch64::STURWi:
- case AArch64::STURXi:
- case AArch64::LDURSi:
- case AArch64::LDURDi:
- case AArch64::LDURQi:
- case AArch64::LDURWi:
- case AArch64::LDURXi:
- case AArch64::LDURSWi:
- case AArch64::LDURHHi:
- case AArch64::LDURBBi:
- case AArch64::LDURSBWi:
- case AArch64::LDURSHWi:
- return true;
- }
-}
-
-static bool isUnscaledLdSt(MachineInstr *MI) {
- return isUnscaledLdSt(MI->getOpcode());
-}
-
-static unsigned getBitExtrOpcode(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static unsigned getBitExtrOpcode(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode.");
case AArch64::LDRBBui:
@@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) {
}
}
-static bool isNarrowStore(MachineInstr *MI) {
- return isNarrowStore(MI->getOpcode());
-}
-
static bool isNarrowLoad(unsigned Opc) {
switch (Opc) {
default:
@@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) {
}
}
-static bool isNarrowLoad(MachineInstr *MI) {
- return isNarrowLoad(MI->getOpcode());
+static bool isNarrowLoad(MachineInstr &MI) {
+ return isNarrowLoad(MI.getOpcode());
+}
+
+static bool isNarrowLoadOrStore(unsigned Opc) {
+ return isNarrowLoad(Opc) || isNarrowStore(Opc);
}
// Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static int getMemScale(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode has unknown scale!");
case AArch64::LDRBBui:
@@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
}
}
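+// Map a narrow load/store opcode to the equivalent opcode of twice the width,
+// used when merging two adjacent narrow accesses into a single wider access.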
+static unsigned getMatchingWideOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no wide equivalent!");
+ case AArch64::STRBBui:
+ return AArch64::STRHHui;
+ case AArch64::STRHHui:
+ return AArch64::STRWui;
+ case AArch64::STURBBi:
+ return AArch64::STURHHi;
+ case AArch64::STURHHi:
+ return AArch64::STURWi;
+ case AArch64::STURWi:
+ return AArch64::STURXi;
+ case AArch64::STRWui:
+ return AArch64::STRXui;
+ case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRWui;
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHWi:
+ return AArch64::LDURWi;
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ return AArch64::LDURHHi;
+ }
+}
+
static unsigned getMatchingPairOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
- case AArch64::STRBBui:
- return AArch64::STRHHui;
- case AArch64::STRHHui:
- return AArch64::STRWui;
- case AArch64::STURBBi:
- return AArch64::STURHHi;
- case AArch64::STURHHi:
- return AArch64::STURWi;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
@@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
- case AArch64::LDRHHui:
- case AArch64::LDRSHWui:
- return AArch64::LDRWui;
- case AArch64::LDURHHi:
- case AArch64::LDURSHWi:
- return AArch64::LDURWi;
- case AArch64::LDRBBui:
- case AArch64::LDRSBWui:
- return AArch64::LDRHHui;
- case AArch64::LDURBBi:
- case AArch64::LDURSBWi:
- return AArch64::LDURHHi;
}
}
-static unsigned isMatchingStore(MachineInstr *LoadInst,
- MachineInstr *StoreInst) {
- unsigned LdOpc = LoadInst->getOpcode();
- unsigned StOpc = StoreInst->getOpcode();
+static unsigned isMatchingStore(MachineInstr &LoadInst,
+ MachineInstr &StoreInst) {
+ unsigned LdOpc = LoadInst.getOpcode();
+ unsigned StOpc = StoreInst.getOpcode();
switch (LdOpc) {
default:
llvm_unreachable("Unsupported load instruction!");
@@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
}
}
-static bool isPairedLdSt(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isPairedLdSt(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
return false;
case AArch64::LDPSi:
@@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) {
}
}
-static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
+static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
- return MI->getOperand(Idx);
+ return MI.getOperand(Idx);
}
-static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
- MachineInstr *StoreInst) {
+static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
+ MachineInstr &StoreInst,
+ const AArch64InstrInfo *TII) {
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
int LoadSize = getMemScale(LoadInst);
int StoreSize = getMemScale(StoreInst);
- int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+ int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
? getLdStOffsetOp(StoreInst).getImm()
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
- int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+ int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
? getLdStOffsetOp(LoadInst).getImm()
: getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
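
As a standalone illustration (not taken from the patch), the byte-range containment test that isLdOffsetInRangeOfSt performs can be checked with concrete numbers. The helper name loadCoveredByStore is invented for this sketch, and the offsets are already unscaled (bytes).

    // Sketch of the containment test: the load's bytes must lie entirely
    // inside the bytes written by the store.
    #include <cassert>

    static bool loadCoveredByStore(int LdOffset, int LdSize, int StOffset,
                                   int StSize) {
      return StOffset <= LdOffset && LdOffset + LdSize <= StOffset + StSize;
    }

    int main() {
      // str w1, [x0, #4] writes bytes [4, 8); ldrh w2, [x0, #6] reads [6, 8).
      assert(loadCoveredByStore(/*LdOffset=*/6, /*LdSize=*/2,
                                /*StOffset=*/4, /*StSize=*/4));
      // A 2-byte load at offset 7 would straddle the end of the store.
      assert(!loadCoveredByStore(7, 2, 4, 4));
      return 0;
    }
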
+static bool isPromotableZeroStoreOpcode(unsigned Opc) {
+ return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
+}
+
+static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
+ return isPromotableZeroStoreOpcode(MI.getOpcode());
+}
+
+static bool isPromotableZeroStoreInst(MachineInstr &MI) {
+ return (isPromotableZeroStoreOpcode(MI)) &&
+ getLdStRegOp(MI).getReg() == AArch64::WZR;
+}
+
MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
+AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator MergeMI,
const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;
++NextI;
@@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// to skip one further. Either way we merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
- if (NextI == Paired)
+ if (NextI == MergeMI)
++NextI;
- int SExtIdx = Flags.getSExtIdx();
- unsigned Opc =
- SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
- bool IsUnscaled = isUnscaledLdSt(Opc);
- int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
+ unsigned Opc = I->getOpcode();
+ bool IsScaled = !TII->isUnscaledLdSt(Opc);
+ int OffsetStride = IsScaled ? 1 : getMemScale(*I);
bool MergeForward = Flags.getMergeForward();
- unsigned NewOpc = getMatchingPairOpcode(Opc);
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
- MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
const MachineOperand &BaseRegOp =
- MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
+ MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
- if (getLdStOffsetOp(I).getImm() ==
- getLdStOffsetOp(Paired).getImm() + OffsetStride) {
- RtMI = Paired;
- Rt2MI = I;
- // Here we swapped the assumption made for SExtIdx.
- // I.e., we turn ldp I, Paired into ldp Paired, I.
- // Update the index accordingly.
- if (SExtIdx != -1)
- SExtIdx = (SExtIdx + 1) % 2;
+ if (getLdStOffsetOp(*I).getImm() ==
+ getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) {
+ RtMI = &*MergeMI;
+ Rt2MI = &*I;
} else {
- RtMI = I;
- Rt2MI = Paired;
+ RtMI = &*I;
+ Rt2MI = &*MergeMI;
}
- int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Change the scaled offset from small to large type.
+ if (IsScaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
if (isNarrowLoad(Opc)) {
- // Change the scaled offset from small to large type.
- if (!IsUnscaled) {
- assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
- OffsetImm /= 2;
- }
- MachineInstr *RtNewDest = MergeForward ? I : Paired;
+ MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
// When merging small (< 32 bit) loads for big-endian targets, the order of
// the component parts gets swapped.
if (!Subtarget->isLittleEndian())
std::swap(RtMI, Rt2MI);
// Construct the new load instruction.
MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
- NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(RtNewDest))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ NewMemMI =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+ .addOperand(getLdStRegOp(*RtNewDest))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ (void)NewMemMI;
DEBUG(
dbgs()
<< "Creating the new load and extract. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
- DEBUG(Paired->print(dbgs()));
+ DEBUG(MergeMI->print(dbgs()));
DEBUG(dbgs() << " with instructions:\n ");
DEBUG((NewMemMI)->print(dbgs()));
- int Width = getMemScale(I) == 1 ? 8 : 16;
+ int Width = getMemScale(*I) == 1 ? 8 : 16;
int LSBLow = 0;
int LSBHigh = Width;
int ImmsLow = LSBLow + Width - 1;
int ImmsHigh = LSBHigh + Width - 1;
- MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+ MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
// Create the bitfield extract for high bits.
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(Rt2MI)))
- .addOperand(getLdStRegOp(Rt2MI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBHigh)
- .addImm(ImmsHigh);
+ BitExtMI1 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
// Create the bitfield extract for low bits.
if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
// For unsigned, prefer to use AND for low bits.
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::ANDWri))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
+ BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
.addImm(ImmsLow);
} else {
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(RtMI)))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBLow)
- .addImm(ImmsLow);
+ BitExtMI2 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
}
} else {
// Create the bitfield extract for low bits.
if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
// For unsigned, prefer to use AND for low bits.
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::ANDWri))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
+ BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
.addImm(ImmsLow);
} else {
- BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(RtMI)))
- .addOperand(getLdStRegOp(RtMI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBLow)
- .addImm(ImmsLow);
+ BitExtMI1 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
}
// Create the bitfield extract for high bits.
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(getBitExtrOpcode(Rt2MI)))
- .addOperand(getLdStRegOp(Rt2MI))
- .addReg(getLdStRegOp(RtNewDest).getReg())
- .addImm(LSBHigh)
- .addImm(ImmsHigh);
+ BitExtMI2 =
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addReg(getLdStRegOp(*RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
}
+ (void)BitExtMI1;
+ (void)BitExtMI2;
+
DEBUG(dbgs() << " ");
DEBUG((BitExtMI1)->print(dbgs()));
DEBUG(dbgs() << " ");
@@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// Erase the old instructions.
I->eraseFromParent();
- Paired->eraseFromParent();
+ MergeMI->eraseFromParent();
return NextI;
}
+ assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+ "Expected promotable zero store");
// Construct the new instruction.
MachineInstrBuilder MIB;
- if (isNarrowStore(Opc)) {
- // Change the scaled offset from small to large type.
- if (!IsUnscaled) {
- assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
- OffsetImm /= 2;
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+ .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MergeMI->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ MergeMI->eraseFromParent();
+ return NextI;
+}
+
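
A minimal sketch (not from the patch) of the scaled-offset arithmetic mergeNarrowInsns relies on: two adjacent accesses of element width W become one access of width 2*W, so the scaled offset of the lower access is halved. The function name wideScaledOffset is invented for the sketch.

    #include <cassert>

    static int wideScaledOffset(int LowerScaledOffset) {
      // The lower access must start on a 2*W boundary, i.e. its scaled
      // offset must be even -- the same invariant asserted before dividing.
      assert((LowerScaledOffset & 1) == 0 && "Unexpected offset to merge");
      return LowerScaledOffset / 2;
    }

    int main() {
      // strh wzr, [x0, #4] + strh wzr, [x0, #6]   (halfword offsets 2 and 3)
      //   -> str wzr, [x0, #4]                    (word offset 1)
      assert(wideScaledOffset(2) == 1);
      return 0;
    }
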
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags) {
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+ // to skip one further. Either way we merge will invalidate the iterator,
+ // and we don't need to scan the new instruction, as it's a pairwise
+ // instruction, which we're not considering for further action anyway.
+ if (NextI == Paired)
+ ++NextI;
+
+ int SExtIdx = Flags.getSExtIdx();
+ unsigned Opc =
+ SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
+ bool IsUnscaled = TII->isUnscaledLdSt(Opc);
+ int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+
+ bool MergeForward = Flags.getMergeForward();
+ // Insert our new paired instruction after whichever of the paired
+ // instructions MergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ // Also based on MergeForward is from where we copy the base register operand
+ // so we get the flags compatible with the input code.
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+
+ int Offset = getLdStOffsetOp(*I).getImm();
+ int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+ bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
+ if (IsUnscaled != PairedIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled. If
+ // I is scaled then scale the offset of Paired accordingly. Otherwise, do
+ // the opposite (i.e., make Paired's offset unscaled).
+ int MemSize = getMemScale(*Paired);
+ if (PairedIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together.
+ assert(!(PairedOffset % getMemScale(*Paired)) &&
+ "Offset should be a multiple of the stride!");
+ PairedOffset /= MemSize;
+ } else {
+ PairedOffset *= MemSize;
}
- MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(I))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm)
- .setMemRefs(I->mergeMemRefsWith(*Paired));
+ }
+
+ // Which register is Rt and which is Rt2 depends on the offset order.
+ MachineInstr *RtMI, *Rt2MI;
+ if (Offset == PairedOffset + OffsetStride) {
+ RtMI = &*Paired;
+ Rt2MI = &*I;
+ // Here we swapped the assumption made for SExtIdx.
+ // I.e., we turn ldp I, Paired into ldp Paired, I.
+ // Update the index accordingly.
+ if (SExtIdx != -1)
+ SExtIdx = (SExtIdx + 1) % 2;
} else {
- // Handle Unscaled
- if (IsUnscaled)
- OffsetImm /= OffsetStride;
- MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(NewOpc))
- .addOperand(getLdStRegOp(RtMI))
- .addOperand(getLdStRegOp(Rt2MI))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm);
+ RtMI = &*I;
+ Rt2MI = &*Paired;
+ }
+ int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ // Scale the immediate offset, if necessary.
+ if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
+ assert(!(OffsetImm % getMemScale(*RtMI)) &&
+ "Unscaled offset cannot be scaled.");
+ OffsetImm /= getMemScale(*RtMI);
}
- (void)MIB;
+ // Construct the new instruction.
+ MachineInstrBuilder MIB;
+ DebugLoc DL = I->getDebugLoc();
+ MachineBasicBlock *MBB = I->getParent();
+ MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
+ .addOperand(getLdStRegOp(*RtMI))
+ .addOperand(getLdStRegOp(*Rt2MI))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*Paired));
- // FIXME: Do we need/want to copy the mem operands from the source
- // instructions? Probably. What uses them after this?
+ (void)MIB;
DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(Paired->print(dbgs()));
DEBUG(dbgs() << " with instruction:\n ");
-
if (SExtIdx != -1) {
// Generate the sign extension for the proper result of the ldp.
// I.e., with X1, that would be:
@@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// Insert this definition right after the generated LDP, i.e., before
// InsertionPoint.
MachineInstrBuilder MIBKill =
- BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(TargetOpcode::KILL), DstRegW)
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
.addReg(DstRegW)
.addReg(DstRegX, RegState::Define);
MIBKill->getOperand(2).setImplicit();
// Create the sign extension.
MachineInstrBuilder MIBSXTW =
- BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
- TII->get(AArch64::SBFMXri), DstRegX)
+ BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
.addReg(DstRegX)
.addImm(0)
.addImm(31);
(void)MIBSXTW;
DEBUG(dbgs() << " Extend operand:\n ");
DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
- DEBUG(dbgs() << "\n");
} else {
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
}
+ DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
@@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator NextI = LoadI;
++NextI;
- int LoadSize = getMemScale(LoadI);
- int StoreSize = getMemScale(StoreI);
- unsigned LdRt = getLdStRegOp(LoadI).getReg();
- unsigned StRt = getLdStRegOp(StoreI).getReg();
+ int LoadSize = getMemScale(*LoadI);
+ int StoreSize = getMemScale(*StoreI);
+ unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
assert((IsStoreXReg ||
@@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// performance and correctness are verified only in little-endian.
if (!Subtarget->isLittleEndian())
return NextI;
- bool IsUnscaled = isUnscaledLdSt(LoadI);
- assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+ bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
+ assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
+ "Unsupported ld/st match");
assert(LoadSize <= StoreSize && "Invalid load size");
int UnscaledLdOffset = IsUnscaled
- ? getLdStOffsetOp(LoadI).getImm()
- : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+ ? getLdStOffsetOp(*LoadI).getImm()
+ : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
int UnscaledStOffset = IsUnscaled
- ? getLdStOffsetOp(StoreI).getImm()
- : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+ ? getLdStOffsetOp(*StoreI).getImm()
+ : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
int Imms = Immr + Width - 1;
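
A worked instance (not part of the patch) of the Width/Immr/Imms computation just above, using the str/ldrh example from the pass comments; the concrete values are chosen for illustration only.

    #include <cassert>

    int main() {
      int LoadSize = 2, StoreSize = 4;                 // ldrh read out of a str
      int UnscaledLdOffset = 6, UnscaledStOffset = 4;  // byte offsets
      assert(LoadSize <= StoreSize);
      int Width = LoadSize * 8;                              // 16 bits
      int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);  // 16
      int Imms = Immr + Width - 1;                           // 31
      // UBFM w2, w1, #16, #31 is the canonical form of "lsr w2, w1, #16",
      // the replacement shown for this case in the pass header comment.
      assert(Width == 16 && Immr == 16 && Imms == 31);
      return 0;
    }
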
@@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
.addImm(Imms);
}
}
+ (void)BitExtMI;
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
@@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
-static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
+static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
BitVector &UsedRegs,
const TargetRegisterInfo *TRI) {
- for (const MachineOperand &MO : MI->operands()) {
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isRegMask())
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
if (MO.isDef()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
ModifiedRegs.set(*AI);
@@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
- if (IsUnscaled)
+ if (IsUnscaled) {
+ // If the byte-offset isn't a multiple of the stride, there's no point
+ // trying to match it.
+ if (Offset % OffsetStride)
+ return false;
Offset /= OffsetStride;
-
+ }
return Offset <= 63 && Offset >= -64;
}
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint_64t to int when
-// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
+// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
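
The two helpers above can be exercised standalone with a few concrete values. The sketch below is not the pass's code; the names inBoundsForPairSketch and alignToSketch are invented to make that clear.

    #include <cassert>

    static bool inBoundsForPairSketch(bool IsUnscaled, int Offset, int Stride) {
      if (IsUnscaled) {
        if (Offset % Stride)   // unscaled byte offset must divide evenly
          return false;
        Offset /= Stride;      // convert to element units
      }
      return Offset <= 63 && Offset >= -64; // 7-bit signed LDP/STP immediate
    }

    static int alignToSketch(int Num, int PowOf2) {
      return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
    }

    int main() {
      assert(inBoundsForPairSketch(true, 504, 8));  // 504 / 8 = 63: in range
      assert(!inBoundsForPairSketch(true, 512, 8)); // 64 is out of range
      assert(!inBoundsForPairSketch(true, 4, 8));   // not a stride multiple
      assert(alignToSketch(5, 4) == 8 && alignToSketch(8, 4) == 8);
      return 0;
    }
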
-static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
+static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
const AArch64InstrInfo *TII) {
// One of the instructions must modify memory.
- if (!MIa->mayStore() && !MIb->mayStore())
+ if (!MIa.mayStore() && !MIb.mayStore())
return false;
// Both instructions must be memory operations.
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
+ if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}
-static bool mayAlias(MachineInstr *MIa,
+static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
const AArch64InstrInfo *TII) {
- for (auto &MIb : MemInsns)
- if (mayAlias(MIa, MIb, TII))
+ for (MachineInstr *MIb : MemInsns)
+ if (mayAlias(MIa, *MIb, TII))
return true;
return false;
@@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa,
bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI) {
- MachineBasicBlock::iterator E = I->getParent()->begin();
+ MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
- MachineInstr *FirstMI = I;
- unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ MachineInstr &LoadMI = *I;
+ unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+
+ // If the load is the first instruction in the block, there's obviously
+ // not any matching store.
+ if (MBBI == B)
+ return false;
// Track which registers have been modified and used between the first insn
// and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
- for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+ unsigned Count = 0;
+ do {
--MBBI;
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
- continue;
- // Now that we know this is a real instruction, count it.
- ++Count;
+ MachineInstr &MI = *MBBI;
+
+ // Don't count DBG_VALUE instructions towards the search limit.
+ if (!MI.isDebugValue())
+ ++Count;
// If the load instruction reads directly from the address to which the
// store instruction writes and the stored value is not modified, we can
// promote the load. Since we do not handle stores with pre-/post-index,
// it's unnecessary to check if BaseReg is modified by the store itself.
- if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+ if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
BaseReg == getLdStBaseOp(MI).getReg() &&
- isLdOffsetInRangeOfSt(FirstMI, MI) &&
+ isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
!ModifiedRegs[getLdStRegOp(MI).getReg()]) {
StoreI = MBBI;
return true;
}
- if (MI->isCall())
+ if (MI.isCall())
return false;
// Update modified / uses register lists.
@@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+ if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
return false;
- }
+ } while (MBBI != B && Count < Limit);
return false;
}
-/// findMatchingInsn - Scan the instructions looking for a load/store that can
-/// be combined with the current instruction into a load/store pair.
+// Returns true if FirstMI and MI are candidates for merging or pairing.
+// Otherwise, returns false.
+static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
+ LdStPairFlags &Flags,
+ const AArch64InstrInfo *TII) {
+ // If this is volatile or if pairing is suppressed, not a candidate.
+ if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+ return false;
+
+ // We should have already checked FirstMI for pair suppression and volatility.
+ assert(!FirstMI.hasOrderedMemoryRef() &&
+ !TII->isLdStPairSuppressed(FirstMI) &&
+ "FirstMI shouldn't get here if either of these checks are true.");
+
+ unsigned OpcA = FirstMI.getOpcode();
+ unsigned OpcB = MI.getOpcode();
+
+ // Opcodes match: nothing more to check.
+ if (OpcA == OpcB)
+ return true;
+
+ // Try to match a sign-extended load/store with a zero-extended load/store.
+ bool IsValidLdStrOpc, PairIsValidLdStrOpc;
+ unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
+ assert(IsValidLdStrOpc &&
+ "Given Opc should be a Load or Store with an immediate");
+ // OpcA will be the first instruction in the pair.
+ if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
+ Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
+ return true;
+ }
+
+ // If the second instruction isn't even a load/store, bail out.
+ if (!PairIsValidLdStrOpc)
+ return false;
+
+ // FIXME: We don't support merging narrow loads/stores with mixed
+ // scaled/unscaled offsets.
+ if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
+ return false;
+
+ // Try to match an unscaled load/store with a scaled load/store.
+ return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
+ getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
+
+ // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
+}
+
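
Both mergePairedInsns above and findMatchingInsn below normalize offsets when one candidate uses a scaled immediate and the other an unscaled (byte) immediate. The following standalone sketch of that unit conversion is not from the patch; toElementUnits is an invented helper name.

    #include <cassert>

    // Returns false if the byte offset cannot be expressed in element units.
    static bool toElementUnits(int ByteOffset, int MemSize, int &ElementOffset) {
      if (ByteOffset % MemSize)
        return false;
      ElementOffset = ByteOffset / MemSize;
      return true;
    }

    int main() {
      // ldr x0, [x2, #24] (scaled offset 3) next to ldur x1, [x2, #32]:
      // 32 bytes is element offset 4, adjacent to 3, so the two can become
      // ldp x0, x1, [x2, #24].
      int Elt;
      assert(toElementUnits(32, 8, Elt) && Elt == 3 + 1);
      // A byte offset of 28 is not 8-byte aligned, so pairing is rejected.
      assert(!toElementUnits(28, 8, Elt));
      return 0;
    }
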
+/// Scan the instructions looking for a load/store that can be combined with the
+/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
- LdStPairFlags &Flags, unsigned Limit) {
+ LdStPairFlags &Flags, unsigned Limit,
+ bool FindNarrowMerge) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
- MachineInstr *FirstMI = I;
+ MachineInstr &FirstMI = *I;
++MBBI;
- unsigned Opc = FirstMI->getOpcode();
- bool MayLoad = FirstMI->mayLoad();
- bool IsUnscaled = isUnscaledLdSt(FirstMI);
+ bool MayLoad = FirstMI.mayLoad();
+ bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
unsigned Reg = getLdStRegOp(FirstMI).getReg();
unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
- bool IsNarrowStore = isNarrowStore(Opc);
-
- // For narrow stores, find only the case where the stored value is WZR.
- if (IsNarrowStore && Reg != AArch64::WZR)
- return E;
-
- // Early exit if the first instruction modifies the base register.
- // e.g., ldr x0, [x0]
- if (FirstMI->modifiesRegister(BaseReg, TRI))
- return E;
-
- // Early exit if the offset if not possible to match. (6 bits of positive
- // range, plus allow an extra one in case we find a later insn that matches
- // with Offset-1)
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
- if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
- !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
- return E;
+ bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// Skip DBG_VALUE instructions. Otherwise debug info can affect the
// optimization by changing how far we scan.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
continue;
// Now that we know this is a real instruction, count it.
++Count;
- bool CanMergeOpc = Opc == MI->getOpcode();
Flags.setSExtIdx(-1);
- if (!CanMergeOpc) {
- bool IsValidLdStrOpc;
- unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
- assert(IsValidLdStrOpc &&
- "Given Opc should be a Load or Store with an immediate");
- // Opc will be the first instruction in the pair.
- Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
- CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
- }
-
- if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
- assert(MI->mayLoadOrStore() && "Expected memory operation.");
+ if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
+ getLdStOffsetOp(MI).isImm()) {
+ assert(MI.mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
// These instructions all have scaled immediate operands, so we just
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
- //
- // Pairwise instructions have a 7-bit signed offset field. Single insns
- // have a 12-bit unsigned offset field. To be a valid combine, the
- // final offset must be in range.
unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
+ bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
+ if (IsUnscaled != MIIsUnscaled) {
+ // We're trying to pair instructions that differ in how they are scaled.
+ // If FirstMI is scaled then scale the offset of MI accordingly.
+ // Otherwise, do the opposite (i.e., make MI's offset unscaled).
+ int MemSize = getMemScale(MI);
+ if (MIIsUnscaled) {
+ // If the unscaled offset isn't a multiple of the MemSize, we can't
+ // pair the operations together: bail and keep looking.
+ if (MIOffset % MemSize)
+ continue;
+ MIOffset /= MemSize;
+ } else {
+ MIOffset *= MemSize;
+ }
+ }
+
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
(Offset + OffsetStride == MIOffset))) {
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
- // If this is a volatile load/store that otherwise matched, stop looking
- // as something is going on that we don't have enough information to
- // safely transform. Similarly, stop if we see a hint to avoid pairs.
- if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
- return E;
- // If the resultant immediate offset of merging these instructions
- // is out of range for a pairwise instruction, bail and keep looking.
- bool MIIsUnscaled = isUnscaledLdSt(MI);
- bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
- if (!IsNarrowLoad &&
- !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
- continue;
- }
-
- if (IsNarrowLoad || IsNarrowStore) {
+ if (FindNarrowMerge) {
// If the alignment requirements of the scaled wide load/store
- // instruction can't express the offset of the scaled narrow
- // input, bail and keep looking.
- if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+ // instruction can't express the offset of the scaled narrow input,
+ // bail and keep looking. For promotable zero stores, allow only when
+ // the stored value is the same (i.e., WZR).
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
+ (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
} else {
+ // Pairwise instructions have a 7-bit signed offset field. Single
+ // insns have a 12-bit unsigned offset field. If the resultant
+ // immediate offset of merging these instructions is out of range for
+ // a pairwise instruction, bail and keep looking.
+ if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(&MI);
+ continue;
+ }
// If the alignment requirements of the paired (scaled) instruction
// can't express the offset of the unscaled input, bail and keep
// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
}
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
- // For narrow stores, allow only when the stored value is the same
- // (i.e., WZR).
- if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
- (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
+ if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- MemInsns.push_back(MI);
+ MemInsns.push_back(&MI);
continue;
}
@@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// and first alias with the second, we can combine the second into the
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
- !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+ !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
!mayAlias(MI, MemInsns, TII)) {
Flags.setMergeForward(false);
return MBBI;
@@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the instruction wasn't a matching load or store. Stop searching if we
// encounter a call instruction that might modify memory.
- if (MI->isCall())
+ if (MI.isCall())
return E;
// Update modified / uses register lists.
@@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
return E;
// Update list of instructions that read/write memory.
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ if (MI.mayLoadOrStore())
+ MemInsns.push_back(&MI);
}
return E;
}
@@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
: getPostIndexedOpcode(I->getOpcode());
MachineInstrBuilder MIB;
- if (!isPairedLdSt(I)) {
+ if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(Update))
- .addOperand(getLdStRegOp(I))
- .addOperand(getLdStBaseOp(I))
- .addImm(Value);
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
- int Scale = getMemScale(I);
+ int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(Update))
- .addOperand(getLdStRegOp(I, 0))
- .addOperand(getLdStRegOp(I, 1))
- .addOperand(getLdStBaseOp(I))
- .addImm(Value / Scale);
+ .addOperand(getLdStRegOp(*Update))
+ .addOperand(getLdStRegOp(*I, 0))
+ .addOperand(getLdStRegOp(*I, 1))
+ .addOperand(getLdStBaseOp(*I))
+ .addImm(Value / Scale)
+ .setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
(void)MIB;
@@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
-bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
- MachineInstr *MI,
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
unsigned BaseReg, int Offset) {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBXri:
@@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
case AArch64::ADDXri:
// Make sure it's a vanilla immediate operand, not a relocation or
// anything else we can't handle.
- if (!MI->getOperand(2).isImm())
+ if (!MI.getOperand(2).isImm())
break;
// Watch out for 1 << 12 shifted value.
- if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
+ if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
break;
// The update instruction source and destination register must be the
// same as the load/store base register.
- if (MI->getOperand(0).getReg() != BaseReg ||
- MI->getOperand(1).getReg() != BaseReg)
+ if (MI.getOperand(0).getReg() != BaseReg ||
+ MI.getOperand(1).getReg() != BaseReg)
break;
bool IsPairedInsn = isPairedLdSt(MemMI);
- int UpdateOffset = MI->getOperand(2).getImm();
+ int UpdateOffset = MI.getOperand(2).getImm();
// For non-paired load/store instructions, the immediate must fit in a
// signed 9-bit integer.
if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
@@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
// If we have a non-zero Offset, we check that it matches the amount
// we're adding to the register.
- if (!Offset || Offset == MI->getOperand(2).getImm())
+ if (!Offset || Offset == MI.getOperand(2).getImm())
return true;
break;
}
@@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
}
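
For the non-paired case, the immediate constraints that isMatchingUpdateInsn places on a candidate ADDXri/SUBXri can be summarized in a standalone sketch (not from the patch; updateImmFoldable is an invented name):

    #include <cassert>

    static bool updateImmFoldable(int UpdateOffset, unsigned ShiftVal) {
      if (ShiftVal != 0)                 // reject "lsl #12" shifted immediates
        return false;
      return UpdateOffset >= -256 && UpdateOffset <= 255; // signed 9-bit range
    }

    int main() {
      assert(updateImmFoldable(16, 0));   // add x0, x0, #16 -> ldr x1, [x0], #16
      assert(!updateImmFoldable(16, 12)); // add x0, x0, #16, lsl #12
      assert(!updateImmFoldable(512, 0)); // outside the 9-bit signed range
      return 0;
    }
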
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
- MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
+ MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineInstr *MemMI = I;
+ MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
@@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
+ ModifiedRegs.reset();
+ UsedRegs.reset();
++MBBI;
- for (unsigned Count = 0; MBBI != E; ++MBBI) {
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ // Skip DBG_VALUE instructions.
+ if (MI.isDebugValue())
continue;
// Now that we know this is a real instruction, count it.
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator I, unsigned Limit) {
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineInstr *MemMI = I;
+ MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
@@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
- BitVector ModifiedRegs, UsedRegs;
- ModifiedRegs.resize(TRI->getNumRegs());
- UsedRegs.resize(TRI->getNumRegs());
- --MBBI;
- for (unsigned Count = 0; MBBI != B; --MBBI) {
- MachineInstr *MI = MBBI;
- // Skip DBG_VALUE instructions. Otherwise debug info can affect the
- // optimization by changing how far we scan.
- if (MI->isDebugValue())
- continue;
+ ModifiedRegs.reset();
+ UsedRegs.reset();
+ unsigned Count = 0;
+ do {
+ --MBBI;
+ MachineInstr &MI = *MBBI;
- // Now that we know this is a real instruction, count it.
- ++Count;
+ // Don't count DBG_VALUE instructions towards the search limit.
+ if (!MI.isDebugValue())
+ ++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
+ if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// return early.
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
return E;
- }
+ } while (MBBI != B && Count < Limit);
return E;
}
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// If this is a volatile load, don't mess with it.
- if (MI->hasOrderedMemoryRef())
+ if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg+imm.
@@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
if (!getLdStOffsetOp(MI).isImm())
return false;
- // Look backward up to ScanLimit instructions.
+ // Look backward up to LdStLimit instructions.
MachineBasicBlock::iterator StoreI;
- if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
+ if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
++NumLoadsFromStoresPromoted;
// Promote the load. Keeping the iterator straight is a
// pain, so we let the merge routine tell us what the next instruction
@@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
return false;
}
+// Find narrow loads that can be converted into a single wider load with
+// bitfield extract instructions. Also merge adjacent zero stores into a wider
+// store.
bool AArch64LoadStoreOpt::tryToMergeLdStInst(
MachineBasicBlock::iterator &MBBI) {
- MachineInstr *MI = MBBI;
- MachineBasicBlock::iterator E = MI->getParent()->end();
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef())
- return false;
+ assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
+ "Expected narrow op.");
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!getLdStOffsetOp(MI).isImm())
+ if (!TII->isCandidateToMergeOrPair(MI))
return false;
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI))
+ // For promotable zero stores, the stored value should be WZR.
+ if (isPromotableZeroStoreOpcode(MI) &&
+ getLdStRegOp(MI).getReg() != AArch64::WZR)
return false;
- // Look ahead up to ScanLimit instructions for a pairable instruction.
+ // Look ahead up to LdStLimit instructions for a mergeable instruction.
LdStPairFlags Flags;
- MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
- if (Paired != E) {
+ MachineBasicBlock::iterator MergeMI =
+ findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
+ if (MergeMI != E) {
if (isNarrowLoad(MI)) {
++NumNarrowLoadsPromoted;
- } else if (isNarrowStore(MI)) {
+ } else if (isPromotableZeroStoreInst(MI)) {
++NumZeroStoresPromoted;
- } else {
- ++NumPairCreated;
- if (isUnscaledLdSt(MI))
- ++NumUnscaledPairCreated;
}
+ // Keeping the iterator straight is a pain, so we let the merge routine tell
+ // us what the next instruction is after it's done mucking about.
+ MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
+ return true;
+ }
+ return false;
+}
- // Merge the loads into a pair. Keeping the iterator straight is a
- // pain, so we let the merge routine tell us what the next instruction
- // is after it's done mucking about.
+// Find loads and stores that can be merged into a single load or store pair
+// instruction.
+bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+
+ if (!TII->isCandidateToMergeOrPair(MI))
+ return false;
+
+ // Early exit if the offset is not possible to match. (6 bits of positive
+ // range, plus allow an extra one in case we find a later insn that matches
+ // with Offset-1)
+ bool IsUnscaled = TII->isUnscaledLdSt(MI);
+ int Offset = getLdStOffsetOp(MI).getImm();
+ int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+ if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+ return false;
+
+ // Look ahead up to LdStLimit instructions for a pairable instruction.
+ LdStPairFlags Flags;
+ MachineBasicBlock::iterator Paired =
+ findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
+ if (Paired != E) {
+ ++NumPairCreated;
+ if (TII->isUnscaledLdSt(MI))
+ ++NumUnscaledPairCreated;
+ // Keeping the iterator straight is a pain, so we let the merge routine tell
+ // us what the next instruction is after it's done mucking about.
MBBI = mergePairedInsns(MBBI, Paired, Flags);
return true;
}
@@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst(
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool enableNarrowLdOpt) {
bool Modified = false;
- // Three tranformations to do here:
+ // Four transformations to do here:
// 1) Find loads that directly read from stores and promote them by
// replacing with mov instructions. If the store is wider than the load,
// the load will be replaced with a bitfield extract.
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldrh w2, [x0, #6]
// ; becomes
// str w1, [x0, #4]
- // lsr w2, w1, #16
- // 2) Find narrow loads that can be converted into a single wider load
- // with bitfield extract instructions.
- // e.g.,
- // ldrh w0, [x2]
- // ldrh w1, [x2, #2]
- // ; becomes
- // ldr w0, [x2]
- // ubfx w1, w0, #16, #16
- // and w0, w0, #ffff
- // 3) Find loads and stores that can be merged into a single load or store
- // pair instruction.
- // e.g.,
- // ldr x0, [x2]
- // ldr x1, [x2, #8]
- // ; becomes
- // ldp x0, x1, [x2]
- // 4) Find base register updates that can be merged into the load or store
- // as a base-reg writeback.
- // e.g.,
- // ldr x0, [x2]
- // add x2, x2, #4
- // ; becomes
- // ldr x0, [x2], #4
-
+ // lsr w2, w1, #16
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
default:
// Just move on to the next instruction.
++MBBI;
@@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
-
+ // 2) Find narrow loads that can be converted into a single wider load
+ // with bitfield extract instructions.
+ // e.g.,
+ // ldrh w0, [x2]
+ // ldrh w1, [x2, #2]
+ // ; becomes
+ // ldr w0, [x2]
+ // ubfx w1, w0, #16, #16
+ // and w0, w0, #ffff
+ //
+ // Also merge adjacent zero stores into a wider store.
+ // e.g.,
+ // strh wzr, [x0]
+ // strh wzr, [x0, #2]
+ // ; becomes
+ // str wzr, [x0]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
enableNarrowLdOpt && MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
- default:
- // Just move on to the next instruction.
- ++MBBI;
- break;
- // Scaled instructions.
- case AArch64::LDRBBui:
- case AArch64::LDRHHui:
- case AArch64::LDRSBWui:
- case AArch64::LDRSHWui:
- case AArch64::STRBBui:
- case AArch64::STRHHui:
- // Unscaled instructions.
- case AArch64::LDURBBi:
- case AArch64::LDURHHi:
- case AArch64::LDURSBWi:
- case AArch64::LDURSHWi:
- case AArch64::STURBBi:
- case AArch64::STURHHi: {
+ MachineInstr &MI = *MBBI;
+ unsigned Opc = MI.getOpcode();
+ if (isPromotableZeroStoreOpcode(Opc) ||
+ (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
if (tryToMergeLdStInst(MBBI)) {
Modified = true;
- break;
- }
+ } else
+ ++MBBI;
+ } else
++MBBI;
- break;
- }
- // FIXME: Do the other instructions.
- }
}
+ // 3) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
- switch (MI->getOpcode()) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
default:
// Just move on to the next instruction.
++MBBI;
@@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi: {
- if (tryToMergeLdStInst(MBBI)) {
+ if (tryToPairLdStInst(MBBI)) {
Modified = true;
break;
}
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
-
+ // 4) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
// Do update merging. It's simpler to keep this separate from the above
- // switch, though not strictly necessary.
- unsigned Opc = MI->getOpcode();
+ // switches, though not strictly necessary.
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
// Just move on to the next instruction.
@@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// merged into:
// ldr x0, [x20], #32
MachineBasicBlock::iterator Update =
- findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
+ findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
@@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
}
// Don't know how to handle pre/post-index versions, so move to the next
// instruction.
- if (isUnscaledLdSt(Opc)) {
+ if (TII->isUnscaledLdSt(Opc)) {
++MBBI;
break;
}
@@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldr x1, [x0]
// merged into:
// ldr x1, [x0, #8]!
- Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
+ Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
@@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// add x0, x0, #64
// merged into:
// ldr x1, [x0, #64]!
- Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
+ Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
@@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- // FIXME: Do the other instructions.
}
}
return Modified;
}
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
- bool ProfitableArch = Subtarget->isCortexA57();
- // FIXME: The benefit from converting narrow loads into a wider load could be
- // microarchitectural as it assumes that a single load with two bitfield
- // extracts is cheaper than two narrow loads. Currently, this conversion is
- // enabled only in cortex-a57 on which performance benefits were verified.
- return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
+ // Resize the modified and used register bitfield trackers. We do this once
+ // per function and then clear the bitfield each time we optimize a load or
+ // store.
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+
bool Modified = false;
- bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+ bool enableNarrowLdOpt =
+ Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
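
The resize-once/reset-per-scan handling of ModifiedRegs and UsedRegs amounts to allocating the trackers at function scope and clearing them before each search. A rough standalone sketch of that pattern (RegTracker is an invented name, using std::vector<bool> in place of BitVector):

    #include <cassert>
    #include <vector>

    struct RegTracker {
      std::vector<bool> Modified, Used;
      void resize(unsigned NumRegs) {   // once per function
        Modified.resize(NumRegs);
        Used.resize(NumRegs);
      }
      void reset() {                    // before each scan
        Modified.assign(Modified.size(), false);
        Used.assign(Used.size(), false);
      }
    };

    int main() {
      RegTracker T;
      T.resize(128);         // e.g. TRI->getNumRegs() for the target
      T.Modified[7] = true;  // a scan records a clobber of register 7
      T.reset();             // the next scan starts from a clean slate
      assert(!T.Modified[7]);
      return 0;
    }
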
@@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?
+// FIXME: When pairing store instructions it's very possible for this pass to
+// hoist a store with a KILL marker above another use (without a KILL marker).
+// The resulting IR is invalid, but nothing uses the KILL markers after this
+// pass, so it's never caused a problem in practice.
+
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 318f83953505..49e7767741ea 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -48,6 +48,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// \brief Amount of stack frame size, not including callee-saved registers.
unsigned LocalStackSize;
+ /// \brief Amount of stack frame size used for saving callee-saved registers.
+ unsigned CalleeSavedStackSize;
+
/// \brief Number of TLS accesses using the special (combinable)
/// _TLS_MODULE_BASE_ symbol.
unsigned NumLocalDynamicTLSAccesses;
@@ -76,18 +79,28 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// copies.
bool IsSplitCSR;
+ /// True when the stack gets realigned dynamically because the size of the
+ /// stack frame is unknown at compile time, e.g., in the case of VLAs.
+ bool StackRealigned;
+
+ /// True when the callee-save stack area has unused gaps that may be used for
+ /// other stack allocations.
+ bool CalleeSaveStackHasFreeSpace;
+
public:
AArch64FunctionInfo()
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
- IsSplitCSR(false) {}
+ IsSplitCSR(false), StackRealigned(false),
+ CalleeSaveStackHasFreeSpace(false) {}
explicit AArch64FunctionInfo(MachineFunction &MF)
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
- IsSplitCSR(false) {
+ IsSplitCSR(false), StackRealigned(false),
+ CalleeSaveStackHasFreeSpace(false) {
(void)MF;
}
@@ -102,12 +115,25 @@ public:
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
+ bool isStackRealigned() const { return StackRealigned; }
+ void setStackRealigned(bool s) { StackRealigned = s; }
+
+ bool hasCalleeSaveStackFreeSpace() const {
+ return CalleeSaveStackHasFreeSpace;
+ }
+ void setCalleeSaveStackHasFreeSpace(bool s) {
+ CalleeSaveStackHasFreeSpace = s;
+ }
+
bool isSplitCSR() const { return IsSplitCSR; }
void setIsSplitCSR(bool s) { IsSplitCSR = s; }
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
unsigned getLocalStackSize() const { return LocalStackSize; }
+ void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+ unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
unsigned getNumLocalDynamicTLSAccesses() const {
return NumLocalDynamicTLSAccesses;
@@ -140,15 +166,15 @@ public:
SmallVector<const MachineInstr *, 3> Args;
public:
- typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+ typedef ArrayRef<const MachineInstr *> LOHArgs;
- MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+ MILOHDirective(MCLOHType Kind, LOHArgs Args)
: Kind(Kind), Args(Args.begin(), Args.end()) {
assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
}
MCLOHType getKind() const { return Kind; }
- const LOHArgs &getArgs() const { return Args; }
+ LOHArgs getArgs() const { return Args; }
};
typedef MILOHDirective::LOHArgs MILOHArgs;
@@ -157,7 +183,7 @@ public:
const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
/// Add a LOH directive of this @p Kind and this @p Args.
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+ void addLOHDirective(MCLOHType Kind, MILOHArgs Args) {
LOHContainerSet.push_back(MILOHDirective(Kind, Args));
LOHRelated.insert(Args.begin(), Args.end());
}
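The new AArch64FunctionInfo fields above are plain cached values: frame lowering is expected to record them once, and later passes simply query the accessors. A minimal sketch of that usage pattern, assuming a hypothetical helper (the function name, its caller, and the alignment heuristic are illustrative, not part of this patch):

  #include "AArch64MachineFunctionInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"

  // Hypothetical helper: record the layout that frame lowering computed so
  // that later passes can query it through AArch64FunctionInfo.
  static void recordFrameLayout(llvm::MachineFunction &MF,
                                unsigned CSStackSize, bool Realigned) {
    auto *AFI = MF.getInfo<llvm::AArch64FunctionInfo>();
    AFI->setCalleeSavedStackSize(CSStackSize); // bytes reserved for callee saves
    AFI->setStackRealigned(Realigned);         // e.g. when VLAs force realignment
    // Illustrative heuristic only: a CSR area padded for alignment leaves a
    // gap that other stack objects could reuse.
    if (CSStackSize % 16 != 0)
      AFI->setCalleeSaveStackHasFreeSpace(true);
  }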
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index 5394875a6bc1..038162c6f54a 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -320,7 +320,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
const MachineInstr &MI) {
const LiveInterval &LI = LIs.getInterval(reg);
- SlotIndex SI = LIs.getInstructionIndex(&MI);
+ SlotIndex SI = LIs.getInstructionIndex(MI);
return LI.expiredAt(SI);
}
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 79c09d9f058d..b1e40510b2ae 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -85,6 +85,21 @@ namespace {
class AArch64PromoteConstant : public ModulePass {
public:
+ struct PromotedConstant {
+ bool ShouldConvert = false;
+ GlobalVariable *GV = nullptr;
+ };
+ typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy;
+
+ struct UpdateRecord {
+ Constant *C;
+ Instruction *User;
+ unsigned Op;
+
+ UpdateRecord(Constant *C, Instruction *User, unsigned Op)
+ : C(C), User(User), Op(Op) {}
+ };
+
static char ID;
AArch64PromoteConstant() : ModulePass(ID) {}
@@ -94,9 +109,12 @@ public:
/// global variables with module scope.
bool runOnModule(Module &M) override {
DEBUG(dbgs() << getPassName() << '\n');
+ if (skipModule(M))
+ return false;
bool Changed = false;
+ PromotionCacheTy PromotionCache;
for (auto &MF : M) {
- Changed |= runOnFunction(MF);
+ Changed |= runOnFunction(MF, PromotionCache);
}
return Changed;
}
@@ -105,7 +123,7 @@ private:
/// Look for interesting constants used within the given function.
/// Promote them into global variables, load these global variables within
/// the related function, so that the number of inserted loads is minimal.
- bool runOnFunction(Function &F);
+ bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache);
// This transformation requires dominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -115,79 +133,72 @@ private:
}
/// Type to store a list of Uses.
- typedef SmallVector<Use *, 4> Uses;
+ typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses;
/// Map an insertion point to all the uses it dominates.
typedef DenseMap<Instruction *, Uses> InsertionPoints;
- /// Map a function to the required insertion point of load for a
- /// global variable.
- typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
/// Find the closest point that dominates the given Use.
- Instruction *findInsertionPoint(Use &Use);
+ Instruction *findInsertionPoint(Instruction &User, unsigned OpNo);
/// Check if the given insertion point is dominated by an existing
/// insertion point.
/// If true, the given use is added to the list of dominated uses for
/// the related existing point.
/// \param NewPt the insertion point to be checked
- /// \param Use the use to be added into the list of dominated uses
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
/// \param InsertPts existing insertion points
/// \pre NewPt and all instructions in InsertPts belong to the same function
/// \return true if one of the insertion points in InsertPts dominates NewPt,
/// false otherwise
- bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
+ bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Check if the given insertion point can be merged with an existing
/// insertion point in a common dominator.
/// If true, the given use is added to the list of the created insertion
/// point.
/// \param NewPt the insertion point to be checked
- /// \param Use the use to be added into the list of dominated uses
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the use
/// \param InsertPts existing insertion points
/// \pre NewPt and all instructions in InsertPts belong to the same function
/// \pre isDominated returns false for the exact same parameters.
/// \return true if there exists an insertion point in InsertPts that could
/// have been merged with NewPt in a common dominator,
/// false otherwise
- bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts);
+ bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Compute the minimal insertion points to dominate all the interesting
/// uses of value.
/// Insertion points are grouped per function and each insertion point
/// contains a list of all the uses it dominates within the related function
- /// \param Val constant to be examined
- /// \param[out] InsPtsPerFunc output storage of the analysis
- void computeInsertionPoints(Constant *Val,
- InsertionPointsPerFunc &InsPtsPerFunc);
+ /// \param User the user of the constant
+ /// \param OpNo the operand number of the constant
+ /// \param[out] InsertPts output storage of the analysis
+ void computeInsertionPoint(Instruction *User, unsigned OpNo,
+ InsertionPoints &InsertPts);
/// Insert a definition of a new global variable at each point contained in
/// InsPtsPerFunc and update the related uses (also contained in
/// InsPtsPerFunc).
- bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
-
- /// Compute the minimal insertion points to dominate all the interesting
- /// uses of Val and insert a definition of a new global variable
- /// at these points.
- /// Also update the uses of Val accordingly.
- /// Currently a use of Val is considered interesting if:
- /// - Val is not UndefValue
- /// - Val is not zeroinitialized
- /// - Replacing Val per a load of a global variable is valid.
- /// \see shouldConvert for more details
- bool computeAndInsertDefinitions(Constant *Val);
-
- /// Promote the given constant into a global variable if it is expected to
- /// be profitable.
- /// \return true if Cst has been promoted
- bool promoteConstant(Constant *Cst);
+ void insertDefinitions(Function &F, GlobalVariable &GV,
+ InsertionPoints &InsertPts);
+
+ /// Do the constant promotion indicated by the Updates records, keeping track
+ /// of globals in PromotionCache.
+ void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache);
/// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
/// Append Use to this list and delete the entry of IPI in InsertPts.
- static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use,
+ static void appendAndTransferDominatedUses(Instruction *NewPt,
+ Instruction *User, unsigned OpNo,
InsertionPoints::iterator &IPI,
InsertionPoints &InsertPts) {
// Record the dominated use.
- IPI->second.push_back(&Use);
+ IPI->second.emplace_back(User, OpNo);
// Transfer the dominated uses of IPI to NewPt
// Inserting into the DenseMap may invalidate existing iterator.
// Keep a copy of the key to find the iterator to erase. Keep a copy of the
@@ -285,10 +296,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
// Do not mess with inline asm.
const CallInst *CI = dyn_cast<const CallInst>(Instr);
- if (CI && isa<const InlineAsm>(CI->getCalledValue()))
- return false;
-
- return true;
+ return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
}
/// Check if the given Cst should be converted into
@@ -305,7 +313,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
/// for the regular approach, even for float).
/// Again, the simplest solution would be to promote every
/// constant and rematerialize them when they are actually cheap to create.
-static bool shouldConvert(const Constant *Cst) {
+static bool shouldConvertImpl(const Constant *Cst) {
if (isa<const UndefValue>(Cst))
return false;
@@ -328,18 +336,28 @@ static bool shouldConvert(const Constant *Cst) {
return isConstantUsingVectorTy(Cst->getType());
}
-Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) {
- Instruction *User = cast<Instruction>(Use.getUser());
+static bool
+shouldConvert(Constant &C,
+ AArch64PromoteConstant::PromotionCacheTy &PromotionCache) {
+ auto Converted = PromotionCache.insert(
+ std::make_pair(&C, AArch64PromoteConstant::PromotedConstant()));
+ if (Converted.second)
+ Converted.first->second.ShouldConvert = shouldConvertImpl(&C);
+ return Converted.first->second.ShouldConvert;
+}
+Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User,
+ unsigned OpNo) {
// If this user is a phi, the insertion point is in the related
// incoming basic block.
- if (PHINode *PhiInst = dyn_cast<PHINode>(User))
- return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+ if (PHINode *PhiInst = dyn_cast<PHINode>(&User))
+ return PhiInst->getIncomingBlock(OpNo)->getTerminator();
- return User;
+ return &User;
}
-bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use,
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
InsertionPoints &InsertPts) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
@@ -358,14 +376,15 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << "Insertion point dominated by:\n");
DEBUG(IPI.first->print(dbgs()));
DEBUG(dbgs() << '\n');
- IPI.second.push_back(&Use);
+ IPI.second.emplace_back(User, OpNo);
return true;
}
}
return false;
}
-bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
+ unsigned OpNo,
InsertionPoints &InsertPts) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
*NewPt->getParent()->getParent()).getDomTree();
@@ -385,7 +404,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << "Merge insertion point with:\n");
DEBUG(IPI->first->print(dbgs()));
DEBUG(dbgs() << "\nat considered insertion point.\n");
- appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
@@ -409,149 +428,141 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use,
DEBUG(dbgs() << '\n');
DEBUG(NewPt->print(dbgs()));
DEBUG(dbgs() << '\n');
- appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts);
+ appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
return true;
}
return false;
}
-void AArch64PromoteConstant::computeInsertionPoints(
- Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
- DEBUG(dbgs() << "** Compute insertion points **\n");
- for (Use &Use : Val->uses()) {
- Instruction *User = dyn_cast<Instruction>(Use.getUser());
-
- // If the user is not an Instruction, we cannot modify it.
- if (!User)
- continue;
-
- // Filter out uses that should not be converted.
- if (!shouldConvertUse(Val, User, Use.getOperandNo()))
- continue;
+void AArch64PromoteConstant::computeInsertionPoint(
+ Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
+ DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+ DEBUG(User->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n");
- DEBUG(User->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
- Instruction *InsertionPoint = findInsertionPoint(Use);
+ DEBUG(dbgs() << "Considered insertion point:\n");
+ DEBUG(InsertionPoint->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- DEBUG(dbgs() << "Considered insertion point:\n");
- DEBUG(InsertionPoint->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ if (isDominated(InsertionPoint, User, OpNo, InsertPts))
+ return;
+ // This insertion point is useful, check if we can merge some insertion
+ // point in a common dominator or if NewPt dominates an existing one.
+ if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
+ return;
- // Check if the current insertion point is useless, i.e., it is dominated
- // by another one.
- InsertionPoints &InsertPts =
- InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
- if (isDominated(InsertionPoint, Use, InsertPts))
- continue;
- // This insertion point is useful, check if we can merge some insertion
- // point in a common dominator or if NewPt dominates an existing one.
- if (tryAndMerge(InsertionPoint, Use, InsertPts))
- continue;
-
- DEBUG(dbgs() << "Keep considered insertion point\n");
+ DEBUG(dbgs() << "Keep considered insertion point\n");
- // It is definitely useful by its own
- InsertPts[InsertionPoint].push_back(&Use);
- }
+ // It is definitely useful by its own
+ InsertPts[InsertionPoint].emplace_back(User, OpNo);
}
-bool AArch64PromoteConstant::insertDefinitions(
- Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
- // We will create one global variable per Module.
- DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
- bool HasChanged = false;
+static void ensurePromotedGV(Function &F, Constant &C,
+ AArch64PromoteConstant::PromotedConstant &PC) {
+ assert(PC.ShouldConvert &&
+ "Expected that we should convert this to a global");
+ if (PC.GV)
+ return;
+ PC.GV = new GlobalVariable(
+ *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
+ "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+ PC.GV->setInitializer(&C);
+ DEBUG(dbgs() << "Global replacement: ");
+ DEBUG(PC.GV->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ ++NumPromoted;
+}
- // Traverse all insertion points in all the function.
- for (const auto &FctToInstPtsIt : InsPtsPerFunc) {
- const InsertionPoints &InsertPts = FctToInstPtsIt.second;
-// Do more checking for debug purposes.
+void AArch64PromoteConstant::insertDefinitions(Function &F,
+ GlobalVariable &PromotedGV,
+ InsertionPoints &InsertPts) {
#ifndef NDEBUG
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
- *FctToInstPtsIt.first).getDomTree();
+ // Do more checking for debug purposes.
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
#endif
- assert(!InsertPts.empty() && "Empty uses does not need a definition");
-
- Module *M = FctToInstPtsIt.first->getParent();
- GlobalVariable *&PromotedGV = ModuleToMergedGV[M];
- if (!PromotedGV) {
- PromotedGV = new GlobalVariable(
- *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
- "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
- PromotedGV->setInitializer(Cst);
- DEBUG(dbgs() << "Global replacement: ");
- DEBUG(PromotedGV->print(dbgs()));
- DEBUG(dbgs() << '\n');
- ++NumPromoted;
- HasChanged = true;
- }
-
- for (const auto &IPI : InsertPts) {
- // Create the load of the global variable.
- IRBuilder<> Builder(IPI.first);
- LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
- DEBUG(dbgs() << "**********\n");
- DEBUG(dbgs() << "New def: ");
- DEBUG(LoadedCst->print(dbgs()));
- DEBUG(dbgs() << '\n');
+ assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+ for (const auto &IPI : InsertPts) {
+ // Create the load of the global variable.
+ IRBuilder<> Builder(IPI.first);
+ LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
+ DEBUG(dbgs() << "**********\n");
+ DEBUG(dbgs() << "New def: ");
+ DEBUG(LoadedCst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
- // Update the dominated uses.
- for (Use *Use : IPI.second) {
+ // Update the dominated uses.
+ for (auto Use : IPI.second) {
#ifndef NDEBUG
- assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) &&
- "Inserted definition does not dominate all its uses!");
+ assert(DT.dominates(LoadedCst,
+ findInsertionPoint(*Use.first, Use.second)) &&
+ "Inserted definition does not dominate all its uses!");
#endif
- DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":");
- DEBUG(Use->getUser()->print(dbgs()));
- DEBUG(dbgs() << '\n');
- Use->set(LoadedCst);
- ++NumPromotedUses;
- }
+ DEBUG({
+ dbgs() << "Use to update " << Use.second << ":";
+ Use.first->print(dbgs());
+ dbgs() << '\n';
+ });
+ Use.first->setOperand(Use.second, LoadedCst);
+ ++NumPromotedUses;
}
}
- return HasChanged;
}
-bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
- InsertionPointsPerFunc InsertPtsPerFunc;
- computeInsertionPoints(Val, InsertPtsPerFunc);
- return insertDefinitions(Val, InsertPtsPerFunc);
-}
-
-bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
- assert(Cst && "Given variable is not a valid constant.");
-
- if (!shouldConvert(Cst))
- return false;
-
- DEBUG(dbgs() << "******************************\n");
- DEBUG(dbgs() << "Candidate constant: ");
- DEBUG(Cst->print(dbgs()));
- DEBUG(dbgs() << '\n');
-
- return computeAndInsertDefinitions(Cst);
+void AArch64PromoteConstant::promoteConstants(
+ Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+ PromotionCacheTy &PromotionCache) {
+ // Promote the constants.
+ for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
+ DEBUG(dbgs() << "** Compute insertion points **\n");
+ auto First = U;
+ Constant *C = First->C;
+ InsertionPoints InsertPts;
+ do {
+ computeInsertionPoint(U->User, U->Op, InsertPts);
+ } while (++U != E && U->C == C);
+
+ auto &Promotion = PromotionCache[C];
+ ensurePromotedGV(F, *C, Promotion);
+ insertDefinitions(F, *Promotion.GV, InsertPts);
+ }
}
-bool AArch64PromoteConstant::runOnFunction(Function &F) {
+bool AArch64PromoteConstant::runOnFunction(Function &F,
+ PromotionCacheTy &PromotionCache) {
// Look for instructions using constant vectors. Promote each such constant to a
// global variable. Create as few loads of this variable as possible and
// update the uses accordingly.
- bool LocalChange = false;
- SmallPtrSet<Constant *, 8> AlreadyChecked;
-
+ SmallVector<UpdateRecord, 64> Updates;
for (Instruction &I : instructions(&F)) {
// Traverse the operand, looking for constant vectors. Replace them by a
// load of a global variable of constant vector type.
- for (Value *Op : I.operand_values()) {
- Constant *Cst = dyn_cast<Constant>(Op);
+ for (Use &U : I.operands()) {
+ Constant *Cst = dyn_cast<Constant>(U);
// There is no point in promoting global values as they are already
// global. Do not promote constant expressions either, as they may
// require some code expansion.
- if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
- AlreadyChecked.insert(Cst).second)
- LocalChange |= promoteConstant(Cst);
+ if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+ continue;
+
+ // Check if this constant is worth promoting.
+ if (!shouldConvert(*Cst, PromotionCache))
+ continue;
+
+ // Check if this use should be promoted.
+ unsigned OpNo = &U - I.op_begin();
+ if (!shouldConvertUse(Cst, &I, OpNo))
+ continue;
+
+ Updates.emplace_back(Cst, &I, OpNo);
}
}
- return LocalChange;
+
+ if (Updates.empty())
+ return false;
+
+ promoteConstants(F, Updates, PromotionCache);
+ return true;
}
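The reworked pass threads a single PromotionCache through every function in the module, so the shouldConvert decision and the promoted global are computed at most once per constant; the cache lookup is the usual insert-then-fill DenseMap idiom seen in shouldConvert() above. A standalone sketch of just that idiom, with illustrative names rather than the pass's own types:

  #include "llvm/ADT/DenseMap.h"

  struct CachedDecision {
    bool ShouldConvert = false;
    void *Replacement = nullptr; // stands in for the lazily created global
  };

  // Compute the decision for Key only on the first query; later queries hit
  // the cache. Mirrors shouldConvert()/ensurePromotedGV() in the patch.
  // KeyT needs a DenseMapInfo specialization (pointer keys already have one).
  template <typename KeyT, typename ComputeFn>
  bool shouldConvertCached(llvm::DenseMap<KeyT, CachedDecision> &Cache,
                           KeyT Key, ComputeFn Compute) {
    auto Res = Cache.insert({Key, CachedDecision()});
    if (Res.second) // newly inserted, i.e. first time we see this key
      Res.first->second.ShouldConvert = Compute(Key);
    return Res.first->second.ShouldConvert;
  }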
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
new file mode 100644
index 000000000000..60d8bbd260bb
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -0,0 +1,182 @@
+//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass removes unnecessary zero copies in BBs that are targets of
+// cbz/cbnz instructions. For instance, the copy instruction in the code below
+// can be removed because the CBZW jumps to BB#2 when W0 is zero.
+// BB#1:
+// CBZW %W0, <BB#2>
+// BB#2:
+// %W0 = COPY %WZR
+// This pass should be run after register allocation.
+//
+// FIXME: This should be extended to handle any constant other than zero. E.g.,
+// cmp w0, #1
+// b.eq .BB1
+// BB1:
+// mov w0, #1
+//
+// FIXME: This could also be extended to check the whole dominance subtree below
+// the comparison if the compile time regression is acceptable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-copyelim"
+
+STATISTIC(NumCopiesRemoved, "Number of copies removed.");
+
+namespace llvm {
+void initializeAArch64RedundantCopyEliminationPass(PassRegistry &);
+}
+
+namespace {
+class AArch64RedundantCopyElimination : public MachineFunctionPass {
+ const MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+public:
+ static char ID;
+ AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {}
+ bool optimizeCopy(MachineBasicBlock *MBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+ const char *getPassName() const override {
+ return "AArch64 Redundant Copy Elimination";
+ }
+};
+char AArch64RedundantCopyElimination::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
+ "AArch64 redundant copy elimination pass", false, false)
+
+static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
+ unsigned Opc = MI.getOpcode();
+ // Check if the current basic block is the target block to which the
+ // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
+ if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == MI.getOperand(1).getMBB())
+ return true;
+ else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != MI.getOperand(1).getMBB())
+ return true;
+
+ return false;
+}
+
+bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
+ // Check if the current basic block has a single predecessor.
+ if (MBB->pred_size() != 1)
+ return false;
+
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
+ if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ return false;
+
+ ++CompBr;
+ do {
+ --CompBr;
+ if (guaranteesZeroRegInBlock(*CompBr, MBB))
+ break;
+ } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+
+ // We've not found a CBZ/CBNZ, time to bail out.
+ if (!guaranteesZeroRegInBlock(*CompBr, MBB))
+ return false;
+
+ unsigned TargetReg = CompBr->getOperand(0).getReg();
+ if (!TargetReg)
+ return false;
+ assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
+ "Expect physical register");
+
+ // Remember all registers aliasing with TargetReg.
+ SmallSetVector<unsigned, 8> TargetRegs;
+ for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
+ TargetRegs.insert(*AI);
+
+ bool Changed = false;
+ MachineBasicBlock::iterator LastChange = MBB->begin();
+ unsigned SmallestDef = TargetReg;
+ // Remove redundant Copy instructions unless TargetReg is modified.
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr *MI = &*I;
+ ++I;
+ if (MI->isCopy() && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg()) {
+
+ unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+
+ if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
+ !MRI->isReserved(DefReg) &&
+ (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
+ DEBUG(dbgs() << "Remove redundant Copy : ");
+ DEBUG((MI)->print(dbgs()));
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ SmallestDef =
+ TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
+ continue;
+ }
+ }
+
+ if (MI->modifiesRegister(TargetReg, TRI))
+ break;
+ }
+
+ if (!Changed)
+ return false;
+
+ // Otherwise, we have to fix up the use-def chain, starting with the
+ // CBZ/CBNZ. Conservatively mark as much as we can as live.
+ CompBr->clearRegisterKills(SmallestDef, TRI);
+
+ if (std::none_of(TargetRegs.begin(), TargetRegs.end(),
+ [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
+ MBB->addLiveIn(TargetReg);
+
+ // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ for (MachineInstr &MMI :
+ make_range(MBB->begin()->getIterator(), LastChange->getIterator()))
+ MMI.clearRegisterKills(SmallestDef, TRI);
+
+ return true;
+}
+
+bool AArch64RedundantCopyElimination::runOnMachineFunction(
+ MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF)
+ Changed |= optimizeCopy(&MBB);
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() {
+ return new AArch64RedundantCopyElimination();
+}
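The new pass is exposed only through createAArch64RedundantCopyEliminationPass(); the hook that actually schedules it lives in the target's pass configuration, which is outside this excerpt. A rough sketch of what that wiring usually looks like, with the guard shown here assumed rather than quoted from the patch:

  // Sketch only: run the cleanup after register allocation, skipping -O0.
  void AArch64PassConfig::addPostRegAlloc() {
    if (TM->getOptLevel() != CodeGenOpt::None)
      addPass(createAArch64RedundantCopyEliminationPass());
  }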
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
new file mode 100644
index 000000000000..0a1831bd9a8c
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -0,0 +1,168 @@
+//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : RegisterBankInfo(AArch64::NumRegisterBanks) {
+ // Initialize the GPR bank.
+ createRegisterBank(AArch64::GPRRegBankID, "GPR");
+ // The GPR register bank is fully defined by all the registers in
+ // GPR64all + its subclasses.
+ addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+ const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+ (void)RBGPR;
+ assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+ // Initialize the FPR bank.
+ createRegisterBank(AArch64::FPRRegBankID, "FPR");
+ // The FPR register bank is fully defined by all the registers in
+ // QQQQ + its subclasses.
+ addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.getSize() == 512 &&
+ "FPRs should hold up to 512-bit via QQQQ sequence");
+
+ // Initialize the CCR bank.
+ createRegisterBank(AArch64::CCRRegBankID, "CCR");
+ addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+ (void)RBCCR;
+ assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+ "Class not added?");
+ assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+ assert(verify(TRI) && "Invalid register bank information");
+}
+
+unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
+ const RegisterBank &B,
+ unsigned Size) const {
+ // What do we do with different sizes?
+ // Copies are the same size.
+ // Will introduce other hooks for different size:
+ // * extract cost.
+ // * build_sequence cost.
+ // TODO: Add more accurate cost for FPR to/from GPR.
+ return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+ switch (RC.getID()) {
+ case AArch64::FPR8RegClassID:
+ case AArch64::FPR16RegClassID:
+ case AArch64::FPR32RegClassID:
+ case AArch64::FPR64RegClassID:
+ case AArch64::FPR128RegClassID:
+ case AArch64::FPR128_loRegClassID:
+ case AArch64::DDRegClassID:
+ case AArch64::DDDRegClassID:
+ case AArch64::DDDDRegClassID:
+ case AArch64::QQRegClassID:
+ case AArch64::QQQRegClassID:
+ case AArch64::QQQQRegClassID:
+ return getRegBank(AArch64::FPRRegBankID);
+ case AArch64::GPR32commonRegClassID:
+ case AArch64::GPR32RegClassID:
+ case AArch64::GPR32spRegClassID:
+ case AArch64::GPR32sponlyRegClassID:
+ case AArch64::GPR32allRegClassID:
+ case AArch64::GPR64commonRegClassID:
+ case AArch64::GPR64RegClassID:
+ case AArch64::GPR64spRegClassID:
+ case AArch64::GPR64sponlyRegClassID:
+ case AArch64::GPR64allRegClassID:
+ case AArch64::tcGPR64RegClassID:
+ case AArch64::WSeqPairsClassRegClassID:
+ case AArch64::XSeqPairsClassRegClassID:
+ return getRegBank(AArch64::GPRRegBankID);
+ case AArch64::CCRRegClassID:
+ return getRegBank(AArch64::CCRRegBankID);
+ default:
+ llvm_unreachable("Register class not supported");
+ }
+}
+
+RegisterBankInfo::InstructionMappings
+AArch64RegisterBankInfo::getInstrAlternativeMappings(
+ const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_OR: {
+ // 32 and 64-bit or can be mapped on either FPR or
+ // GPR for the same cost.
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ if (Size != 32 && Size != 64)
+ break;
+
+ // If the instruction has any implicit-defs or uses,
+ // do not mess with it.
+ if (MI.getNumOperands() != 3)
+ break;
+ InstructionMappings AltMappings;
+ InstructionMapping GPRMapping(/*ID*/ 1, /*Cost*/ 1, /*NumOperands*/ 3);
+ InstructionMapping FPRMapping(/*ID*/ 2, /*Cost*/ 1, /*NumOperands*/ 3);
+ for (unsigned Idx = 0; Idx != 3; ++Idx) {
+ GPRMapping.setOperandMapping(Idx, Size,
+ getRegBank(AArch64::GPRRegBankID));
+ FPRMapping.setOperandMapping(Idx, Size,
+ getRegBank(AArch64::FPRRegBankID));
+ }
+ AltMappings.emplace_back(std::move(GPRMapping));
+ AltMappings.emplace_back(std::move(FPRMapping));
+ return AltMappings;
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AArch64RegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ switch (OpdMapper.getMI().getOpcode()) {
+ case TargetOpcode::G_OR: {
+ // These IDs must match getInstrAlternativeMappings.
+ assert((OpdMapper.getInstrMapping().getID() == 1 ||
+ OpdMapper.getInstrMapping().getID() == 2) &&
+ "Don't know how to handle that ID");
+ return applyDefaultMapping(OpdMapper);
+ }
+ default:
+ llvm_unreachable("Don't know how to handle that operation");
+ }
+}
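The register-bank table above is what instruction selection consults when it only has a register class in hand. A small illustration of querying it, assuming Subtarget is an AArch64Subtarget and that the bank accessors behave as the asserts in the constructor suggest:

  // Illustrative only; in-tree the subtarget owns the RegisterBankInfo object.
  const AArch64RegisterBankInfo RBI(*Subtarget.getRegisterInfo());
  const RegisterBank &Bank =
      RBI.getRegBankFromRegClass(AArch64::GPR64RegClass);
  assert(Bank.getSize() == 64 && "GPR bank covers registers up to 64 bits");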
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
new file mode 100644
index 000000000000..907bcfdea161
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -0,0 +1,69 @@
+//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace AArch64 {
+enum {
+ GPRRegBankID = 0, /// General Purpose Registers: W, X.
+ FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
+ CCRRegBankID = 2, /// Conditional register: NZCV.
+ NumRegisterBanks
+};
+} // End AArch64 namespace.
+
+/// This class provides the information for the target register banks.
+class AArch64RegisterBankInfo : public RegisterBankInfo {
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+public:
+ AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
+ /// Get the cost of a copy from \p B to \p A, or put differently,
+ /// get the cost of A = COPY B. Since register banks may cover
+ /// different sizes, \p Size specifies what will be the size in bits
+ /// that will be copied around.
+ ///
+ /// \note Since this is a copy, both registers have the same size.
+ unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+ unsigned Size) const override;
+
+ /// Get a register bank that covers \p RC.
+ ///
+ /// \pre \p RC is a user-defined register class (as opposed to one
+ /// generated by TableGen).
+ ///
+ /// \note The mapping RC -> RegBank could be built while adding the
+ /// coverage for the register banks. However, we do not do it, because,
+ /// at least for now, we only need this information for register classes
+ /// that are used in the description of instructions. In other words,
+ /// there are just a handful of them and we do not want to waste space.
+ ///
+ /// \todo This should be TableGen'ed.
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ /// Get the alternative mappings for \p MI.
+ /// Alternative in the sense different from getInstrMapping.
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 32b4888f2f64..af867da4823d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
@@ -51,6 +50,13 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
CSR_AArch64_CXX_TLS_Darwin_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost)
+ return CSR_AArch64_RT_MostRegs_SaveList;
else
return CSR_AArch64_AAPCS_SaveList;
}
@@ -74,6 +80,12 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_AArch64_AllRegs_RegMask;
if (CC == CallingConv::CXX_FAST_TLS)
return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+ if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::PreserveMost)
+ return CSR_AArch64_RT_MostRegs_RegMask;
else
return CSR_AArch64_AAPCS_RegMask;
}
@@ -190,9 +202,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// If it's wrong, we'll materialize the constant and still get to the
// object; it's just suboptimal. Negative offsets use the unscaled
// load/store instructions, which have a 9-bit signed immediate.
- if (MFI->getLocalFrameSize() < 256)
- return false;
- return true;
+ return MFI->getLocalFrameSize() >= 256;
}
return false;
@@ -231,9 +241,7 @@ bool AArch64RegisterInfo::requiresFrameIndexScavenging(
bool
AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- // Only consider eliminating leaf frames.
- if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
- MFI->adjustsStack()))
+ if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
return true;
return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
}
@@ -396,8 +404,6 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
-namespace llvm {
-
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -437,5 +443,3 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 16;
}
}
-
-} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a8c8b176efa9..5fbaff00a5e7 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index d709bee7b9eb..93ca079275c8 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -19,13 +19,13 @@
def CortexA53Model : SchedMachineModel {
let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
// This is overridden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
// Specification - Instruction Timings"
// v 1.0 Spreadsheet
+ let CompleteModel = 1;
}
@@ -109,6 +109,8 @@ def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
let ResourceCycles = [3]; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
// Branch
def : WriteRes<WriteBr, [A53UnitB]>;
def : WriteRes<WriteBrReg, [A53UnitB]>;
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index ca4457af8525..a266351f7ffc 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -30,6 +30,7 @@ def CortexA57Model : SchedMachineModel {
// Enable partial & runtime unrolling. The magic number is chosen based on
// experiments and benchmarking data.
let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
}
//===----------------------------------------------------------------------===//
@@ -96,6 +97,8 @@ def : SchedAlias<WriteV, A57Write_3cyc_1V>;
def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
def : SchedAlias<WriteVST, A57Write_1cyc_1S>;
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index a2a180237789..9fd3ae6818e5 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -1,4 +1,4 @@
-//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,7 @@ def CycloneModel : SchedMachineModel {
let MicroOpBufferSize = 192; // Based on the reorder buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 16; // 14-19 cycles are typical.
+ let CompleteModel = 1;
}
//===----------------------------------------------------------------------===//
@@ -107,7 +108,7 @@ def WriteX : SchedWriteRes<[]> { let Latency = 0; }
// The move is replaced by a single nop micro-op.
// MOVZ Rd, #0
// AND Rd, Rzr, #imm
-def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
SchedVar<WriteZPred, [WriteX]>,
SchedVar<NoSchedPred, [WriteImm]>]>;
@@ -116,8 +117,8 @@ def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
// Move GPR is a register rename and single nop micro-op.
// ORR Xd, XZR, Xm
// ADD Xd, Xn, #0
-def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
-def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
SchedVar<WriteIMovPred, [WriteX]>,
SchedVar<WriteVMovPred, [WriteX]>,
@@ -726,7 +727,7 @@ def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
(instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
- (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
+ (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
(instregex "LD4Fourv(8b|4h|2s)$")>;
@@ -851,6 +852,9 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+// Atomic operations are not supported.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
//---
// Unused SchedRead types
//---
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
new file mode 100644
index 000000000000..4e491a04c78d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -0,0 +1,133 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. Now, the latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue.
+
+def KryoModel : SchedMachineModel {
+ let IssueWidth = 5; // 5-wide issue for expanded uops
+ let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+ def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops
+ def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops
+ def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops
+ def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops
+ def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops
+ KryoUnitXB]>;
+ def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops
+ KryoUnitYB]>;
+ def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops
+ KryoUnitXB,
+ KryoUnitYA,
+ KryoUnitYB]>;
+ def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops
+ def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops
+ def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops
+ KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]>
+ { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 8; let NumMicroOps = 1; } // Fragment -1
+def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]>
+ { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]>
+ { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]>
+ { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1
+def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
new file mode 100644
index 000000000000..426ae6103e4b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -0,0 +1,2358 @@
+//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Kryo subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+def KryoWrite_3cyc_X_noRSV_138ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln],
+ (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>;
+
+def KryoWrite_3cyc_X_X_139ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_139ln],
+ (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>;
+
+def KryoWrite_4cyc_XY_XY_noRSV_172ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln],
+ (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_XY_XY_XY_XY_178ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln],
+ (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_XY_XY_XY_XY_177ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln],
+ (instregex "(S|U)ABALv.*")>;
+def KryoWrite_3cyc_XY_XY_166ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_166ln],
+ (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_159ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln],
+ (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_165ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_165ln],
+ (instregex "(S|U)ABDLv.*")>;
+def KryoWrite_3cyc_X_noRSV_154ln :
+ SchedWriteRes<[KryoUnitX]> {
+let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln],
+ (instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>;
+def KryoWrite_3cyc_X_X_155ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_155ln],
+ (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>;
+def KryoWrite_2cyc_XY_XY_151ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_151ln],
+ (instregex "(S|U)(ADD|SUB)Lv.*")>;
+def KryoWrite_2cyc_XY_noRSV_148ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>;
+def KryoWrite_2cyc_XY_XY_150ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_150ln],
+ (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>;
+def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln],
+ (instrs SADDLVv4i32v, UADDLVv4i32v)>;
+def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln],
+ (instrs SADDLVv8i16v, UADDLVv8i16v)>;
+def KryoWrite_6cyc_XY_XY_X_noRSV_181ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln],
+ (instrs SADDLVv16i8v, UADDLVv16i8v)>;
+def KryoWrite_3cyc_XY_noRSV_158ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln],
+ (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>;
+def KryoWrite_4cyc_X_noRSV_169ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln],
+ (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>;
+def KryoWrite_2cyc_XY_XY_XY_XY_176ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln],
+ (instregex "(S|U)(ADDW|SUBW)v.*")>;
+def KryoWrite_4cyc_X_noRSV_40ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln],
+ (instregex "(S|U)CVTFS(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_97ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln],
+ (instregex "(S|U)CVTFU(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_110ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln],
+ (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+def KryoWrite_4cyc_X_X_114ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_114ln],
+ (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+def KryoWrite_1cyc_XA_Y_98ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_98ln],
+ (instregex "(S|U)DIV(_Int)?(W|X)r")>;
+def KryoWrite_2cyc_XY_XY_152ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_152ln],
+ (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_149ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln],
+ (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_X_70ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_70ln],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def KryoWrite_4cyc_X_X_191ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_191ln],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def KryoWrite_1cyc_XY_195ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_195ln],
+ (instregex "(S|U)MOVv.*")>;
+def KryoWrite_5cyc_X_71ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_71ln],
+ (instrs SMULHrr, UMULHrr)>;
+def KryoWrite_3cyc_XY_noRSV_186ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln],
+ (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_187ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_187ln],
+ (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_69ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln],
+ (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_248ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln],
+ (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_3cyc_XY_XY_250ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_250ln],
+ (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_246ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>;
+def KryoWrite_3cyc_XY_XY_251ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_251ln],
+ (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_6cyc_XY_X_238ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_238ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_249ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>;
+def KryoWrite_6cyc_XY_X_noRSV_252ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln],
+ (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>;
+def KryoWrite_3cyc_XY_noRSV_161ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln],
+ (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>;
+def KryoWrite_3cyc_XY_noRSV_163ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln],
+ (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_162ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln],
+ (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>;
+def KryoWrite_3cyc_XY_noRSV_247ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln],
+ (instregex "(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_noRSV_239ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln],
+ (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_243ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_243ln],
+ (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_XY_241ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_241ln],
+ (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def KryoWrite_2cyc_XY_noRSV_240ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln],
+ (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_XY_242ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_242ln],
+ (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_2cyc_XY_XY_183ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_183ln],
+ (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_182ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln],
+ (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_184ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln],
+ (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>;
+def KryoWrite_4cyc_X_noRSV_185ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln],
+ (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>;
+def KryoWrite_2cyc_XY_noRSV_67ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln],
+ (instrs ABSv1i64)>;
+def KryoWrite_1cyc_XY_63ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI],
+ (instregex "ADC.*")>;
+def KryoWrite_1cyc_XY_63_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63_1ln],
+ (instregex "ADR.*")>;
+def KryoWrite_1cyc_XY_62ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI],
+ (instregex "ADDS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_64ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI],
+ (instregex "ADDS?(W|X)r(r|s|x)(64)?")>;
+def KryoWrite_1cyc_XY_noRSV_65ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln],
+ (instrs ADDv1i64)>;
+def KryoWrite_1cyc_XY_noRSV_144ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_146ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_146ln],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_XY_X_noRSV_171ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln],
+ (instregex "(ADD|SUB)HNv.*")>;
+def KryoWrite_1cyc_XY_noRSV_66ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln],
+ (instrs ADDPv2i64p)>;
+def KryoWrite_2cyc_XY_XY_153ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_153ln],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_XY_noRSV_170ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln],
+ (instrs ADDVv4i32v)>;
+def KryoWrite_4cyc_XY_XY_noRSV_173ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln],
+ (instrs ADDVv8i16v)>;
+def KryoWrite_5cyc_XY_X_noRSV_174ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln],
+ (instrs ADDVv16i8v)>;
+def KryoWrite_3cyc_XY_XY_X_X_27ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln],
+ (instrs AESDrr, AESErr)>;
+def KryoWrite_2cyc_X_X_22ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_22ln],
+ (instrs AESIMCrr, AESMCrr)>;
+def KryoWrite_1cyc_XY_noRSV_76ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>;
+def KryoWrite_1cyc_XY_XY_79ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_79ln],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def KryoWrite_1cyc_X_72ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_72ln],
+ (instregex "(S|U)?BFM.*")>;
+def KryoWrite_1cyc_XY_noRSV_77ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln],
+ (instregex "(BIC|ORR)S?Wri")>;
+def KryoWrite_1cyc_XY_XY_78ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_78ln],
+ (instregex "(BIC|ORR)S?Xri")>;
+def KryoWrite_1cyc_X_noRSV_74ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
+ (instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+def KryoWrite_1cyc_X_X_75ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_75ln],
+ (instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+def KryoWrite_0cyc_noRSV_11ln :
+ SchedWriteRes<[]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_noRSV_11ln],
+ (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>;
+def KryoWrite_0cyc_XY_16ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI],
+ (instregex "(CCMN|CCMP)(W|X)i")>;
+def KryoWrite_0cyc_XY_16_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI],
+ (instregex "(CCMN|CCMP)(W|X)r")>;
+def KryoWrite_2cyc_XY_3ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI],
+ (instregex "(CLS|CLZ)(W|X)r")>;
+def KryoWrite_2cyc_XY_noRSV_7ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_2cyc_XY_XY_8ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_8ln],
+ (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_2cyc_XY_noRSV_80ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_83ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_83ln],
+ (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_noRSV_81ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>;
+def KryoWrite_2cyc_XY_XY_82ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_82ln],
+ (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>;
+def KryoWrite_3cyc_XY_4ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg],
+ (instregex "CRC32.*")>;
+def KryoWrite_1cyc_XY_20ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI],
+ (instregex "CSEL(W|X)r")>;
+def KryoWrite_1cyc_X_17ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI],
+ (instregex "(CSINC|CSNEG)(W|X)r")>;
+def KryoWrite_1cyc_XY_18ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI],
+ (instregex "(CSINV)(W|X)r")>;
+def KryoWrite_3cyc_LS_X_13ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_X_13ln],
+ (instrs DRPS)>;
+def KryoWrite_0cyc_LS_10ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_10ln],
+ (instrs DSB, DMB, CLREX)>;
+def KryoWrite_1cyc_X_noRSV_196ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln],
+ (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>;
+def KryoWrite_1cyc_X_X_197ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_197ln],
+ (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>;
+def KryoWrite_3cyc_LS_LS_X_15ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln],
+ (instrs ERET)>;
+def KryoWrite_1cyc_X_noRSV_207ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_207ln],
+ (instrs EXTv8i8)>;
+def KryoWrite_1cyc_X_X_212ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_212ln],
+ (instrs EXTv16i8)>;
+def KryoWrite_2cyc_XY_X_136ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_X_136ln],
+ (instrs EXTRWrri, EXTRXrri)>;
+def KryoWrite_2cyc_XY_noRSV_35ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln],
+ (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>;
+def KryoWrite_2cyc_XY_XY_106ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_106ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_104ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln],
+ (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>;
+def KryoWrite_3cyc_XY_noRSV_107ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln],
+ (instregex "F(MAX|MIN)(NM)?Vv4i32v")>;
+def KryoWrite_3cyc_XY_noRSV_101ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln],
+ (instregex "FABD(32|64|v2f32)")>;
+def KryoWrite_3cyc_XY_XY_103ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_103ln],
+ (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>;
+def KryoWrite_1cyc_XY_noRSV_48ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln],
+ (instregex "F(ABS|NEG)(D|S)r")>;
+def KryoWrite_1cyc_XY_noRSV_124ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln],
+ (instregex "F(ABS|NEG)v2f32")>;
+def KryoWrite_1cyc_XY_XY_125ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_125ln],
+ (instregex "F(ABS|NEG)(v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_33ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln],
+ (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>;
+def KryoWrite_3cyc_XY_noRSV_30ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln],
+ (instregex "(FADD|FSUB)(D|S)rr")>;
+def KryoWrite_3cyc_XY_noRSV_100ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln],
+ (instregex "(FADD|FSUB|FADDP)v2f32")>;
+def KryoWrite_3cyc_XY_noRSV_29ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln],
+ (instregex "FADDP(v2i32p|v2i64p)")>;
+def KryoWrite_0cyc_XY_31ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_31ln],
+ (instregex "FCCMPE?(D|S)rr")>;
+def KryoWrite_2cyc_XY_noRSV_34ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>;
+def KryoWrite_2cyc_XY_XY_36ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_36ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>;
+def KryoWrite_2cyc_XY_noRSV_105ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln],
+ (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>;
+def KryoWrite_0cyc_XY_32ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_32ln],
+ (instregex "FCMPE?(D|S)r(r|i)")>;
+def KryoWrite_1cyc_XY_noRSV_49ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln],
+ (instrs FCSELDrrr, FCSELSrrr)>;
+def KryoWrite_4cyc_X_noRSV_41ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln],
+ (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>;
+def KryoWrite_4cyc_X_38ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_38ln],
+ (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>;
+def KryoWrite_4cyc_X_noRSV_113ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>;
+def KryoWrite_4cyc_X_X_117ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_117ln],
+ (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>;
+def KryoWrite_5cyc_X_X_XY_noRSV_119ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> {
+ let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln],
+ (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_X_116ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_116ln],
+ (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_noRSV_112ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln],
+ (instrs FCVTXNv1i64)>;
+def KryoWrite_4cyc_X_37ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_37ln],
+ (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def KryoWrite_4cyc_X_noRSV_111ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln],
+ (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>;
+def KryoWrite_4cyc_X_X_115ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_115ln],
+ (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
+def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr, FDIVSrr)>;
+def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+ (instrs FDIVv2f32)>;
+def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+ (instrs FDIVv2f64, FDIVv4f32)>;
+def KryoWrite_5cyc_X_noRSV_55ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln],
+ (instregex "FN?M(ADD|SUB)Srrr")>;
+def KryoWrite_6cyc_X_noRSV_57ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln],
+ (instregex "FN?M(ADD|SUB)Drrr")>;
+def KryoWrite_5cyc_X_noRSV_51ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln],
+ (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>;
+def KryoWrite_5cyc_X_X_56ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_56ln],
+ (instrs FMLAv4f32, FMLSv4f32)>;
+def KryoWrite_6cyc_X_X_61ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_61ln],
+ (instrs FMLAv2f64, FMLSv2f64)>;
+def KryoWrite_5cyc_X_noRSV_128ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln],
+ (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_131ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_131ln],
+ (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_134ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_134ln],
+ (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>;
+def KryoWrite_6cyc_X_noRSV_60ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln],
+ (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>;
+def KryoWrite_1cyc_XY_45ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_45ln],
+ (instregex "FMOV(XDHigh|DXHigh|DX)r")>;
+def KryoWrite_1cyc_XY_noRSV_47ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln],
+ (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>;
+def KryoWrite_5cyc_X_noRSV_53ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln],
+ (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>;
+def KryoWrite_5cyc_X_noRSV_127ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln],
+ (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_130ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_130ln],
+ (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_133ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_133ln],
+ (instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>;
+def KryoWrite_5cyc_X_noRSV_54ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln],
+ (instrs FMULSrr, FNMULSrr, FMULX32)>;
+def KryoWrite_6cyc_X_noRSV_59ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln],
+ (instrs FMULDrr, FNMULDrr, FMULX64)>;
+def KryoWrite_3cyc_XY_noRSV_28ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln],
+ (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64)>;
+def KryoWrite_3cyc_XY_noRSV_99ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln],
+ (instrs FRECPEv2f32, FRSQRTEv2f32)>;
+def KryoWrite_3cyc_XY_XY_102ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_102ln],
+ (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def KryoWrite_5cyc_X_noRSV_52ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln],
+ (instrs FRECPS32, FRSQRTS32)>;
+def KryoWrite_6cyc_X_noRSV_58ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln],
+ (instrs FRECPS64, FRSQRTS64)>;
+def KryoWrite_5cyc_X_noRSV_126ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln],
+ (instrs FRECPSv2f32, FRSQRTSv2f32)>;
+def KryoWrite_5cyc_X_X_129ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_129ln],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def KryoWrite_6cyc_X_X_132ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_132ln],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def KryoWrite_3cyc_XY_noRSV_50ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln],
+ (instrs FRECPXv1i32, FRECPXv1i64)>;
+def KryoWrite_2cyc_XY_noRSV_39ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>;
+def KryoWrite_2cyc_XY_noRSV_108ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>;
+def KryoWrite_2cyc_XY_XY_109ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
+ (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
+def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
+ (instregex "FSQRT(S|D)r")>;
+def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
+ (instregex "FSQRTv2f32")>;
+def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
+ (instregex "FSQRT(v2f64|v4f32)")>;
+def KryoWrite_1cyc_X_201ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_201ln],
+ (instregex "INSv.*")>;
+def KryoWrite_3cyc_LS_255ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_255ln],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>;
+def KryoWrite_4cyc_LS_X_270ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_270ln],
+ (instregex "LD1(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_noRSV_285ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_289ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr],
+ (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_298ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr],
+ (instregex "LD1(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_308ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_317ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr],
+ (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_328ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr],
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_332ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr],
+ (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln],
+ (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr],
+ (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr],
+ (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_281ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_281ln],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_311ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_313ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr],
+ (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr],
+ (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_256ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_256ln],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_286ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_290ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr],
+ (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_318ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr],
+ (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_257ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_257ln],
+ (instregex "LD2i64$")>;
+def KryoWrite_3cyc_LS_XY_291ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr],
+ (instregex "LD2i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_296ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln],
+ (instregex "LD2(i8|i16|i32)$")>;
+def KryoWrite_4cyc_LS_XY_X_X_321ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr],
+ (instregex "LD2(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_282ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_282ln],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_312ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_314ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr],
+ (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr],
+ (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_283ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_283ln],
+ (instregex "LD3i64$")>;
+def KryoWrite_3cyc_LS_LS_LS_309ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln],
+ (instregex "LD3Threev2d$")>;
+def KryoWrite_3cyc_LS_XY_LS_315ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr],
+ (instregex "LD3i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_X_320ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln],
+ (instregex "LD3(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_331ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr],
+ (instregex "LD3Threev2d_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_338ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr],
+ (instregex "LD3(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln],
+ (instregex "LD3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr],
+ (instregex "LD3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln],
+ (instregex "LD3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr],
+ (instregex "LD3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_310ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_333ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr],
+ (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr],
+ (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_284ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_284ln],
+ (instregex "LD4i64$")>;
+def KryoWrite_3cyc_LS_XY_LS_316ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr],
+ (instregex "LD4i64_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_329ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln],
+ (instregex "LD4Four(v2d)$")>;
+def KryoWrite_4cyc_LS_X_X_X_X_337ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln],
+ (instregex "LD4(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr],
+ (instregex "LD4Four(v2d)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr],
+ (instregex "LD4(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln],
+ (instregex "LD4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr],
+ (instregex "LD4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln],
+ (instregex "LD4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr],
+ (instregex "LD4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_330ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln],
+ (instregex "LD4R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr],
+ (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln],
+ (instregex "LD4R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr],
+ (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_400ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
+ (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+def KryoWrite_3cyc_LS_LS_401ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi],
+ (instrs LDNPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_408ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi],
+ (instrs LDNPDi, LDNPSi)>;
+def KryoWrite_3cyc_LS_394ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi],
+ (instrs LDNPWi, LDNPXi)>;
+def KryoWrite_3cyc_LS_LS_402ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi],
+ (instrs LDPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_409ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi],
+ (instrs LDPDi, LDPSi)>;
+def KryoWrite_3cyc_LS_XY_LS_410ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr],
+ (instregex "LDPQ(post|pre)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr],
+ (instregex "LDP(D|S)(post|pre)")>;
+def KryoWrite_3cyc_LS_393ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi],
+ (instrs LDPWi, LDPXi)>;
+def KryoWrite_3cyc_LS_XY_403ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr],
+ (instregex "LDP(W|X)(post|pre)")>;
+def KryoWrite_4cyc_LS_395ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi],
+ (instrs LDPSWi)>;
+def KryoWrite_4cyc_LS_XY_405ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr],
+ (instrs LDPSWpost, LDPSWpre)>;
+def KryoWrite_3cyc_LS_264ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_264ln],
+ (instrs LDRQui, LDRQl)>;
+def KryoWrite_4cyc_X_LS_271ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_271ln],
+ (instrs LDRQroW, LDRQroX)>;
+def KryoWrite_3cyc_LS_noRSV_287ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln],
+ (instregex "LDR((D|S)l|(D|S|H|B)ui)")>;
+def KryoWrite_3cyc_LS_XY_293ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr],
+ (instrs LDRQpost, LDRQpre)>;
+def KryoWrite_4cyc_X_LS_noRSV_297ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln],
+ (instregex "LDR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_319ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr],
+ (instregex "LDR(D|S|H|B)(post|pre)")>;
+def KryoWrite_3cyc_LS_261ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_261ln],
+ (instregex "LDR(BB|HH|W|X)ui")>;
+def KryoWrite_3cyc_LS_XY_292ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr],
+ (instregex "LDR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_4cyc_X_LS_272ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_272ln],
+ (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>;
+def KryoWrite_3cyc_LS_262ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_262ln],
+ (instrs LDRWl, LDRXl)>;
+def KryoWrite_4cyc_LS_268ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_268ln],
+ (instregex "LDRS(BW|BX|HW|HX|W)ui")>;
+def KryoWrite_5cyc_X_LS_273ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+ let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_LS_273ln],
+ (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>;
+def KryoWrite_4cyc_LS_XY_294ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr],
+ (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>;
+def KryoWrite_4cyc_LS_269ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_269ln],
+ (instrs LDRSWl)>;
+def KryoWrite_3cyc_LS_260ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_260ln],
+ (instregex "LDTR(B|H|W|X)i")>;
+def KryoWrite_4cyc_LS_267ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_267ln],
+ (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def KryoWrite_3cyc_LS_263ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_263ln],
+ (instrs LDURQi)>;
+def KryoWrite_3cyc_LS_noRSV_288ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln],
+ (instregex "LDUR(D|S|H|B)i")>;
+def KryoWrite_3cyc_LS_259ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_259ln],
+ (instregex "LDUR(BB|HH|W|X)i")>;
+def KryoWrite_4cyc_LS_266ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_266ln],
+ (instregex "LDURS(B|H)?(W|X)i")>;
+def KryoWrite_3cyc_LS_258ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258ln],
+ (instregex "LDXP(W|X)")>;
+def KryoWrite_3cyc_LS_258_1ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258_1ln],
+ (instregex "LDXR(B|H|W|X)")>;
+def KryoWrite_2cyc_XY_XY_137ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_137ln],
+ (instrs LSLVWr, LSLVXr)>;
+def KryoWrite_1cyc_XY_135ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_135ln],
+ (instregex "(LS|AS|RO)RV(W|X)r")>;
+def KryoWrite_4cyc_X_84ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_84ln],
+ (instrs MADDWrrr, MSUBWrrr)>;
+def KryoWrite_5cyc_X_85ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_85ln],
+ (instrs MADDXrrr, MSUBXrrr)>;
+def KryoWrite_4cyc_X_noRSV_188ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln],
+ (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>;
+def KryoWrite_4cyc_X_X_192ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_192ln],
+ (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>;
+def KryoWrite_1cyc_XY_noRSV_198ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln],
+ (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>;
+def KryoWrite_1cyc_XY_XY_199ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_199ln],
+ (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>;
+def KryoWrite_1cyc_X_89ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_89ln],
+ (instrs MOVKWi, MOVKXi)>;
+def KryoWrite_1cyc_XY_91ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_91ln],
+ (instrs MOVNWi, MOVNXi)>;
+def KryoWrite_1cyc_XY_90ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_90ln],
+ (instrs MOVZWi, MOVZXi)>;
+def KryoWrite_2cyc_XY_93ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_93ln],
+ (instrs MRS)>;
+def KryoWrite_0cyc_X_87ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_X_87ln],
+ (instrs MSRpstateImm4)>;
+def : InstRW<[KryoWrite_0cyc_X_87ln],
+ (instrs MSRpstateImm1)>;
+def KryoWrite_0cyc_XY_88ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_88ln],
+ (instrs MSR)>;
+def KryoWrite_1cyc_XY_noRSV_143ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln],
+ (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_145ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_145ln],
+ (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_noRSV_193ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln],
+ (instrs NOTv8i8)>;
+def KryoWrite_1cyc_XY_XY_194ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_194ln],
+ (instrs NOTv16i8)>;
+def KryoWrite_2cyc_XY_noRSV_234ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln],
+ (instrs PMULv8i8)>;
+def KryoWrite_2cyc_XY_XY_236ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_236ln],
+ (instrs PMULv16i8)>;
+def KryoWrite_2cyc_XY_XY_235ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_235ln],
+ (instrs PMULLv8i8, PMULLv16i8)>;
+def KryoWrite_3cyc_XY_XY_237ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_237ln],
+ (instrs PMULLv1i64, PMULLv2i64)>;
+def KryoWrite_0cyc_LS_254ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_254ln],
+ (instrs PRFMl, PRFMui)>;
+def KryoWrite_0cyc_LS_253ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_253ln],
+ (instrs PRFUMi)>;
+def KryoWrite_6cyc_XY_X_noRSV_175ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+ let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln],
+ (instregex "R(ADD|SUB)HNv.*")>;
+def KryoWrite_2cyc_XY_204ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_204ln],
+ (instrs RBITWr, RBITXr)>;
+def KryoWrite_2cyc_XY_noRSV_218ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln],
+ (instrs RBITv8i8)>;
+def KryoWrite_2cyc_XY_XY_219ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_219ln],
+ (instrs RBITv16i8)>;
+def KryoWrite_1cyc_X_202ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_202ln],
+ (instregex "REV(16|32)?(W|X)r")>;
+def KryoWrite_1cyc_XY_noRSV_214ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln],
+ (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_1cyc_XY_XY_216ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_216ln],
+ (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_X_noRSV_244ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln],
+ (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>;
+def KryoWrite_3cyc_X_X_245ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_245ln],
+ (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>;
+def KryoWrite_1cyc_XY_2ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI],
+ (instregex "SBCS?(W|X)r")>;
+def KryoWrite_2cyc_XA_XA_XA_24ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+ let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln],
+ (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>;
+def KryoWrite_1cyc_XY_noRSV_21ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln],
+ (instrs SHA1Hrr)>;
+def KryoWrite_2cyc_X_X_23ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_23ln],
+ (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def KryoWrite_4cyc_XA_XA_XA_25ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+ let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln],
+ (instrs SHA256Hrrr, SHA256H2rrr)>;
+def KryoWrite_3cyc_XY_XY_X_X_26ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln],
+ (instrs SHA256SU1rrr)>;
+def KryoWrite_4cyc_X_noRSV_189ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln],
+ (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>;
+def KryoWrite_3cyc_XY_noRSV_68ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln],
+ (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_157ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln],
+ (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_164ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_164ln],
+ (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_X_noRSV_190ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln],
+ (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>;
+def KryoWrite_0cyc_LS_Y_274ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_274ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_1cyc_LS_Y_X_301ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln],
+ (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_305ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln],
+ (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_323ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln],
+ (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln],
+ (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln],
+ (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_275ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_275ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_306ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln],
+ (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_322ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln],
+ (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_324ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln],
+ (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln],
+ (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln],
+ (instregex "ST3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln],
+ (instregex "ST3Threev2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln],
+ (instregex "ST3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln],
+ (instregex "ST3Threev2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln],
+ (instregex "ST3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+ KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 13;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln],
+ (instregex "ST3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_325ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln],
+ (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln],
+ (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln],
+ (instregex "ST4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+ KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln],
+ (instregex "ST4Fourv2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln],
+ (instregex "ST4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+ KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln],
+ (instregex "ST4Fourv2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 16;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln],
+ (instregex "ST4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+ KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX,
+ KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 17;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln],
+ (instregex "ST4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_LS_Y_299ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln],
+ (instregex "STLR(B|H|W|X)")>;
+def KryoWrite_3cyc_LS_LS_Y_307ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+ let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln],
+ (instregex "STLX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_0cyc_LS_Y_276ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_276ln],
+ (instrs STNPDi, STNPSi)>;
+def KryoWrite_0cyc_LS_Y_LS_Y_326ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln],
+ (instrs STNPQi)>;
+def KryoWrite_0cyc_LS_Y_280ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_280ln],
+ (instrs STNPWi, STNPXi)>;
+def KryoWrite_0cyc_LS_Y_277ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_277ln],
+ (instregex "STP(D|S)i")>;
+def KryoWrite_1cyc_LS_Y_X_303ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln],
+ (instregex "STP(D|S)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_327ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln],
+ (instrs STPQi)>;
+def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln],
+ (instrs STPQpost, STPQpre)>;
+def KryoWrite_0cyc_LS_Y_279ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_279ln],
+ (instregex "STP(W|X)i")>;
+def KryoWrite_1cyc_LS_X_Y_300ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln],
+ (instregex "STP(W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_278ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_278ln],
+ (instregex "STR(Q|D|S|H|B)ui")>;
+def KryoWrite_1cyc_X_LS_Y_295ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln],
+ (instregex "STR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_1cyc_LS_Y_X_304ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln],
+ (instregex "STR(Q|D|S|H|B)(post|pre)")>;
+def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS,
+ KryoUnitY]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln],
+ (instregex "STRQro(W|X)")>;
+def KryoWrite_0cyc_LS_Y_399ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_399ln],
+ (instregex "STR(BB|HH|W|X)ui")>;
+def KryoWrite_1cyc_X_LS_Y_406ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln],
+ (instregex "STR(BB|HH|W|X)ro(W|X)")>;
+def KryoWrite_1cyc_LS_X_Y_407ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+ let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln],
+ (instregex "STR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_398ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_398ln],
+ (instregex "STTR(B|H|W|X)i")>;
+def KryoWrite_0cyc_LS_Y_396ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_396ln],
+ (instregex "STUR(Q|D|S|H|B)i")>;
+def KryoWrite_0cyc_LS_Y_397ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_397ln],
+ (instregex "STUR(BB|HH|W|X)i")>;
+def KryoWrite_3cyc_LS_Y_404ln :
+ SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_Y_404ln],
+ (instregex "STX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_3cyc_XY_noRSV_160ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln],
+ (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_167ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_167ln],
+ (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_1ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI],
+ (instregex "SUBS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_5ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg],
+ (instregex "SUBS?(W|X)rx")>;
+def KryoWrite_2cyc_XY_XY_5_1ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg],
+ (instregex "SUBS?(W|X)rs")>;
+def KryoWrite_1cyc_XY_noRSV_6ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI],
+ (instregex "SUBS?(W|X)rr")>;
+def KryoWrite_0cyc_LS_9ln :
+ SchedWriteRes<[KryoUnitLS]> {
+ let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_9ln],
+ (instregex "SYSL?xt")>;
+def KryoWrite_1cyc_X_noRSV_205ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln],
+ (instrs TBLv8i8One)>;
+def KryoWrite_1cyc_X_X_208ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_208ln],
+ (instrs TBLv16i8One)>;
+def KryoWrite_2cyc_X_X_X_noRSV_222ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln],
+ (instrs TBLv8i8Two)>;
+def KryoWrite_2cyc_X_X_X_X_X_X_224ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln],
+ (instrs TBLv16i8Two)>;
+def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln],
+ (instrs TBLv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln],
+ (instrs TBLv8i8Four)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln],
+ (instrs TBLv16i8Three)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 15;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln],
+ (instrs TBLv16i8Four)>;
+def KryoWrite_2cyc_X_X_noRSV_220ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln],
+ (instrs TBXv8i8One)>;
+def KryoWrite_2cyc_X_X_X_X_221ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln],
+ (instrs TBXv16i8One)>;
+def KryoWrite_3cyc_X_X_X_X_noRSV_223ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln],
+ (instrs TBXv8i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln],
+ (instrs TBXv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln],
+ (instrs TBXv16i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln],
+ (instrs TBXv8i8Four)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY,
+ KryoUnitX, KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln],
+ (instrs TBXv16i8Three)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+ KryoUnitX, KryoUnitX]> {
+ let Latency = 5; let NumMicroOps = 17;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln],
+ (instrs TBXv16i8Four)>;
+def KryoWrite_1cyc_XY_XY_217ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_217ln],
+ (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>;
+def KryoWrite_1cyc_X_X_211ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_211ln],
+ (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_XY_213ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_XY_213ln],
+ (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_3cyc_XY_noRSV_156ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln],
+ (instrs URECPEv2i32, URSQRTEv2i32)>;
+def KryoWrite_3cyc_XY_XY_168ln :
+ SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+ let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_168ln],
+ (instrs URECPEv4i32, URSQRTEv4i32)>;
+def KryoWrite_1cyc_X_X_210ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_210ln],
+ (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_noRSV_206ln :
+ SchedWriteRes<[KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln],
+ (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_1cyc_XY_noRSV_215ln :
+ SchedWriteRes<[KryoUnitXY]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln],
+ (instregex "XTNv.*")>;
+def KryoWrite_1cyc_X_X_209ln :
+ SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+ let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_209ln],
+ (instregex "ZIP1(v4i32|v8i16|v16i8)")>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 6525628dbfd6..2288b8dfc223 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -19,9 +19,8 @@
def ExynosM1Model : SchedMachineModel {
let IssueWidth = 4; // Up to 4 uops per cycle.
- let MinLatency = 0; // OoO.
let MicroOpBufferSize = 96; // ROB size.
- let LoopMicroOpBufferSize = 32; // Instruction queue size.
+ let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
let CompleteModel = 0; // Use the default model otherwise.
@@ -142,12 +141,13 @@ def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
// Other miscellaneous instructions.
-def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
//===----------------------------------------------------------------------===//
-// Fast forwarding.
+// Generic fast forwarding.
// TODO: Add FP register forwarding rules.
@@ -187,6 +187,10 @@ def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
M1UnitFST]> { let Latency = 3; }
def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
M1UnitL]> { let Latency = 9; }
+def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 6; }
+def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC,
+ M1UnitFMAC]> { let Latency = 7; }
def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
// FIXME: This is the worst case, conditional branch and link.
@@ -305,8 +309,10 @@ def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
-def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>;
-def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>;
+def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>;
+def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>;
+def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>;
+def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>;
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
// ASIMD miscellaneous instructions.
@@ -337,16 +343,19 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
(instregex "^TB[LX]v16i8Four")>;
def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
-def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
-def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
-def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>;
+def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>;
+def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// ASIMD load instructions.
// ASIMD store instructions.
// Cryptography instructions.
-def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
+def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
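+// NOTE: The SchedReadAdvance above lets an AES instruction that consumes the
+// result of another AES instruction read that operand one cycle early,
+// approximating the forwarding between back-to-back AES ops (e.g. AESE
+// feeding AESMC).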
+
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedVulcan.td
new file mode 100644
index 000000000000..0aa2462eba83
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedVulcan.td
@@ -0,0 +1,855 @@
+//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// 1. Introduction
+//
+// This file defines the machine model for Broadcom Vulcan to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def VulcanModel : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 32;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def VulcanP0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def VulcanP1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def VulcanP2 : ProcResource<1>;
+
+// Port 3: Store data.
+def VulcanP3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def VulcanP4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def VulcanP5 : ProcResource<1>;
+
+let SchedModel = VulcanModel in {
+
+// Define groups for the functional units on each
+// issue port. Each group created will be used
+// by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member. This
+// is a way to create names for the various functional
+// units that share a single issue port. For example,
+// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for
+// FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def VulcanI1 : ProcResGroup<[VulcanP1]>;
+
+// Branch micro-ops only on port 2.
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60 entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+ VulcanP3, VulcanP4, VulcanP5]> {
+ let BufferSize=60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+ let Latency = 23;
+ let ResourceCycles = [11];
+}
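+
+// NOTE: For the two long-latency writes above, Latency is the cycle count
+// until the result is available, while ResourceCycles is how long the F0/F1
+// pipe stays busy; a shorter ResourceCycles models a partially pipelined unit.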
+
+// 1 cycle on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = VulcanModel in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+def : WriteRes<WriteIEReg, [VulcanI012]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Move immed
+def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
+
+// Variable shift
+def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23. Take the average.
+def : WriteRes<WriteID32, [VulcanI1]> {
+ let Latency = 18;
+ let ResourceCycles = [18];
+}
+
+// Divide, X-form
+// Latency range of 13-39. Take the average.
+def : WriteRes<WriteID64, [VulcanI1]> {
+ let Latency = 26;
+ let ResourceCycles = [26];
+}
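+
+// NOTE: Setting ResourceCycles equal to Latency keeps port I1 occupied for
+// the whole divide, so the model allows only one integer divide in flight at
+// a time.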
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
+
+// Bitfield move, basic
+// Bitfield move, insert
+// NOTE: Handled by WriteIS.
+
+// Count leading
+def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
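+
+// NOTE: The write variant above charges an extra I012 micro-op and one more
+// cycle when ScaledIdxPred holds (scaled register-offset loads); the read
+// variant resolves to ReadDefault either way, i.e. no extra forwarding is
+// modeled for the base register.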
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+}
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
+
+// FP arithmetic
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
+
+// FP divide, S-form
+// FP square root, S-form
+def : WriteRes<WriteFDiv, [VulcanF01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+}
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+
+// FP round to integral
+def : InstRW<[VulcanWrite_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP negate
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+
+// ASIMD FP round, D-form
+// ASIMD FP round, Q-form
+// NOTE: Handled by WriteV.
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+
+// ASIMD extract
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+// ASIMD extract narrow, saturating
+// NOTE: Handled by WriteV.
+
+// ASIMD insert, element to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[VulcanWrite_5Cyc_F01],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+//--
+// 3.17 Cryptography Extensions
+//--
+
+// Crypto AES ops
+def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 xor ops
+// Crypto SHA1 schedule acceleration ops
+// Crypto SHA256 schedule acceleration op (1 u-op)
+// Crypto SHA256 schedule acceleration op (2 u-ops)
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+
+//--
+// 3.18 CRC
+//--
+
+// CRC checksum ops
+def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+
+} // SchedModel = VulcanModel
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index eaa9110ab1bc..ce81f48acf71 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -51,15 +51,15 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
// Predicate for determining when a shiftable register is shifted.
-def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>;
// Predicate for determining when an extendable register is extended.
-def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>;
// ScaledIdxPred is true if a WriteLDIdx operand will be
// scaled. Subtargets can use this to dynamically select resources and
// latency for WriteLDIdx and ReadAdrBase.
-def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>;
// Serialized two-level address load.
// EXAMPLE: LOADGot
@@ -92,6 +92,8 @@ def WriteV : SchedWrite; // Vector ops.
def WriteVLD : SchedWrite; // Vector loads.
def WriteVST : SchedWrite; // Vector stores.
+def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP)
+
// Read the unwritten lanes of the VLD's destination registers.
def ReadVLD : SchedRead;
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index f40293021d74..66a8f332513a 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-selectiondag-info"
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
- SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
// Check to see if there is a specialized entry-point for memory zeroing.
@@ -44,10 +44,16 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
return SDValue();
}
+bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
+ CodeGenOpt::Level OptLevel) const {
+ if (OptLevel >= CodeGenOpt::Aggressive)
+ return true;
+ return false;
+}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 97421b45b122..7e4f11091226 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -7,24 +7,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the AArch64 subclass for TargetSelectionDAGInfo.
+// This file defines the AArch64 subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
- SDValue Dst, SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
+ bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 1c6b15790ea9..f904b2379416 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -115,6 +115,9 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
}
bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
const TargetSubtargetInfo &ST = MF.getSubtarget();
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
@@ -141,8 +144,8 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
if (!isNarrowFPStore(MI))
continue;
unsigned BaseReg;
- unsigned Offset;
- if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) {
+ int64_t Offset;
+ if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
if (PrevBaseReg == BaseReg) {
// If this block can take STPs, skip ahead to the next block.
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
@@ -150,7 +153,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
// Otherwise, continue unpairing the stores in this block.
DEBUG(dbgs() << "Unpairing store " << MI << "\n");
SuppressSTP = true;
- TII->suppressLdStPair(&MI);
+ TII->suppressLdStPair(MI);
}
PrevBaseReg = BaseReg;
} else
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index f6ee8cf47a6a..7dd8ccbe6c25 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -11,10 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64Subtarget.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
-#include "AArch64Subtarget.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetRegistry.h"
@@ -44,58 +43,83 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
CPUString = "generic";
ParseSubtargetFeatures(CPUString, FS);
+ initializeProperties();
+
return *this;
}
+void AArch64Subtarget::initializeProperties() {
+ // Initialize CPU specific properties. We should add a tablegen feature for
+ // this in the future so we can specify it together with the subtarget
+ // features.
+ switch (ARMProcFamily) {
+ case Cyclone:
+ CacheLineSize = 64;
+ PrefetchDistance = 280;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 3;
+ break;
+ case CortexA57:
+ MaxInterleaveFactor = 4;
+ break;
+ case ExynosM1:
+ PrefFunctionAlignment = 4;
+ PrefLoopAlignment = 3;
+ break;
+ case Kryo:
+ MaxInterleaveFactor = 4;
+ VectorInsertExtractBaseCost = 2;
+ CacheLineSize = 128;
+ PrefetchDistance = 740;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 11;
+ break;
+ case Vulcan:
+ MaxInterleaveFactor = 4;
+ break;
+ case CortexA35: break;
+ case CortexA53: break;
+ case CortexA72: break;
+ case CortexA73: break;
+ case Others: break;
+ }
+}
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false),
- HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false),
- HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
- StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian),
- CPUString(CPU), TargetTriple(TT), FrameLowering(),
+ : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+ IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
- TLInfo(TM, *this) {}
+ TLInfo(TM, *this), GISel() {}
+
+const CallLowering *AArch64Subtarget::getCallLowering() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
-/// ClassifyGlobalReference - Find the target operand flags that describe
-/// how a global value should be referenced for the current subtarget.
+/// Find the target operand flags that describe how a global value should be
+/// referenced for the current subtarget.
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
- const TargetMachine &TM) const {
- bool isDef = GV->isStrongDefinitionForLinker();
-
+ const TargetMachine &TM) const {
// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
return AArch64II::MO_GOT;
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return AArch64II::MO_GOT;
+
// The small code model's direct accesses use ADRP, which cannot necessarily
// produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) {
- // In PIC mode use the GOT, but in absolute mode use a constant pool load.
- if (TM.getRelocationModel() == Reloc::Static)
- return AArch64II::MO_CONSTPOOL;
- else
- return AArch64II::MO_GOT;
- }
-
- // If symbol visibility is hidden, the extra load is not needed if
- // the symbol is definitely defined in the current translation unit.
-
- // The handling of non-hidden symbols in PIC mode is rather target-dependent:
- // + On MachO, if the symbol is defined in this module the GOT can be
- // skipped.
- // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
- // defined could end up in unexpected places. Use a GOT.
- if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
- if (isTargetMachO())
- return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
- else
- // No need to go through the GOT for local symbols on ELF.
- return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
- }
+ if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+ return AArch64II::MO_GOT;
return AArch64II::MO_NO_FLAG;
}
@@ -114,8 +138,7 @@ const char *AArch64Subtarget::getBZeroEntry() const {
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin, MachineInstr *end,
- unsigned NumRegionInstrs) const {
+ unsigned NumRegionInstrs) const {
// An LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling, e.g. on 253.perlbmk.
Policy.OnlyTopDown = false;
@@ -123,8 +146,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// Enabling or disabling the latency heuristic is a close call: it seems to
// help nearly no benchmark on out-of-order architectures; on the other hand,
// it regresses register pressure on a few benchmarks.
- if (isCyclone())
- Policy.DisableLatencyHeuristic = true;
+ Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -146,8 +168,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- if (!isCortexA57())
- return nullptr;
-
- return llvm::make_unique<A57ChainingConstraint>();
+ return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}
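The AArch64Subtarget.cpp changes above move per-CPU tuning out of string comparisons (isCyclone(), isCortexA57(), ...) and into fields set once by initializeProperties(). Client code then reads named properties instead of matching CPU names; a minimal sketch using the accessors this patch adds, where the surrounding function is hypothetical:

    // Sketch only: tune an optimization from subtarget properties rather
    // than from the CPU string. getMaxInterleaveFactor() is added by this
    // patch; Cortex-A57, Kryo and Vulcan set it to 4 in
    // initializeProperties(), everything else keeps the default of 2.
    static unsigned pickInterleaveFactor(const AArch64Subtarget &ST) {
      return ST.getMaxInterleaveFactor();
    }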
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 151133b2f32c..16a35405c892 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -19,6 +19,7 @@
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -32,38 +33,64 @@ class StringRef;
class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
- enum ARMProcFamilyEnum {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
CortexA57,
+ CortexA72,
+ CortexA73,
Cyclone,
- ExynosM1
+ ExynosM1,
+ Kryo,
+ Vulcan
};
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
- ARMProcFamilyEnum ARMProcFamily;
+ ARMProcFamilyEnum ARMProcFamily = Others;
- bool HasV8_1aOps;
- bool HasV8_2aOps;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
- bool HasFPARMv8;
- bool HasNEON;
- bool HasCrypto;
- bool HasCRC;
- bool HasPerfMon;
- bool HasFullFP16;
- bool HasSPE;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
+ bool HasCrypto = false;
+ bool HasCRC = false;
+ bool HasRAS = false;
+ bool HasPerfMon = false;
+ bool HasFullFP16 = false;
+ bool HasSPE = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
- bool HasZeroCycleRegMove;
+ bool HasZeroCycleRegMove = false;
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
- bool HasZeroCycleZeroing;
+ bool HasZeroCycleZeroing = false;
// StrictAlign - Disallow unaligned memory accesses.
- bool StrictAlign;
+ bool StrictAlign = false;
+ bool MergeNarrowLoads = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasMacroOpFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;
+ unsigned PrefFunctionAlignment = 0;
+ unsigned PrefLoopAlignment = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -80,12 +107,20 @@ protected:
AArch64InstrInfo InstrInfo;
AArch64SelectionDAGInfo TSInfo;
AArch64TargetLowering TLInfo;
+ /// Accessor that gathers the GlobalISel-related APIs.
+ /// This is used to avoid spreading preprocessor guards around while GISel
+ /// is an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
+
private:
/// initializeSubtargetDependencies - Initializes using CPUString and the
/// passed in feature string so that we can use initializer lists for
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
public:
/// This constructor initializes the data members to match those
/// of the specified triple.
@@ -93,6 +128,11 @@ public:
const std::string &FS, const TargetMachine &TM,
bool LittleEndian);
+ /// This object will take ownership of \p GISelAccessor.
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
+ }
+
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
@@ -106,10 +146,20 @@ public:
const AArch64RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
+ const CallLowering *getCallLowering() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isGeneric() || isCortexA53() || isCortexA57();
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -126,6 +176,33 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
+ bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+ unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+ unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -146,13 +223,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- bool isGeneric() const { return CPUString == "generic"; }
- bool isCyclone() const { return CPUString == "cyclone"; }
- bool isCortexA57() const { return CPUString == "cortex-a57"; }
- bool isCortexA53() const { return CPUString == "cortex-a53"; }
- bool isExynosM1() const { return CPUString == "exynos-m1"; }
-
- bool useAA() const override { return isCortexA53(); }
+ bool useAA() const override { return UseAA; }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -174,8 +245,7 @@ public:
/// returns null.
const char *getBZeroEntry() const;
- void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
- MachineInstr *end,
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
bool enableEarlyIfConversion() const override;
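On the GlobalISel side, the subtarget now keeps a GISelAccessor behind a std::unique_ptr, and setGISelAccessor() takes ownership of a heap-allocated object passed by reference. A minimal sketch of the hand-off; the wiring function is hypothetical, and it assumes the default no-op GISelAccessor is what gets installed when the GlobalISel library is not linked in:

    // Sketch only: allocate an accessor and hand ownership to the
    // subtarget via setGISelAccessor().
    static void wireUpGISel(AArch64Subtarget &ST) {
      GISelAccessor *GISel = new GISelAccessor(); // default accessors return null
      ST.setGISelAccessor(*GISel);                // ST now owns the allocation
    }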
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
new file mode 100644
index 000000000000..a3736c0868fb
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -0,0 +1,1018 @@
+//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the symbolic operands permitted for various kinds of
+// AArch64 system instruction.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/TableGen/SearchableTable.td"
+
+//===----------------------------------------------------------------------===//
+// AT (address translate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
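+// Illustrative note (not part of the original tables): the let statements
+// above pack the operands as op0[15:14] op1[13:11] crn[10:7] crm[6:3]
+// op2[2:0], so S1E1R below (0b01, 0b000, 0b0111, 0b1000, 0b000) encodes
+// to 0x43C0.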
+
+def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
+def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
+
+
+//===----------------------------------------------------------------------===//
+// DMB/DSB (data barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DB<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding = encoding;
+}
+
+def : DB<"oshld", 0x1>;
+def : DB<"oshst", 0x2>;
+def : DB<"osh", 0x3>;
+def : DB<"nshld", 0x5>;
+def : DB<"nshst", 0x6>;
+def : DB<"nsh", 0x7>;
+def : DB<"ishld", 0x9>;
+def : DB<"ishst", 0xa>;
+def : DB<"ish", 0xb>;
+def : DB<"ld", 0xd>;
+def : DB<"st", 0xe>;
+def : DB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// DC (data cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+}
+
+def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+
+//===----------------------------------------------------------------------===//
+// IC (instruction cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
+ bit needsreg> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<14> Encoding;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
+def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
+def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>;
+
+//===----------------------------------------------------------------------===//
+// ISB (instruction-fetch barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class ISB<string name, bits<4> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding;
+ let Encoding = encoding;
+}
+
+def : ISB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// PRFM (prefetch) instruction options.
+//===----------------------------------------------------------------------===//
+
+class PRFM<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PRFM<"pldl1keep", 0x00>;
+def : PRFM<"pldl1strm", 0x01>;
+def : PRFM<"pldl2keep", 0x02>;
+def : PRFM<"pldl2strm", 0x03>;
+def : PRFM<"pldl3keep", 0x04>;
+def : PRFM<"pldl3strm", 0x05>;
+def : PRFM<"plil1keep", 0x08>;
+def : PRFM<"plil1strm", 0x09>;
+def : PRFM<"plil2keep", 0x0a>;
+def : PRFM<"plil2strm", 0x0b>;
+def : PRFM<"plil3keep", 0x0c>;
+def : PRFM<"plil3strm", 0x0d>;
+def : PRFM<"pstl1keep", 0x10>;
+def : PRFM<"pstl1strm", 0x11>;
+def : PRFM<"pstl2keep", 0x12>;
+def : PRFM<"pstl2strm", 0x13>;
+def : PRFM<"pstl3keep", 0x14>;
+def : PRFM<"pstl3strm", 0x15>;
+
+//===----------------------------------------------------------------------===//
+// PState instruction options.
+//===----------------------------------------------------------------------===//
+
+class PState<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+ code Requires = [{ {} }];
+}
+
+def : PState<"SPSel", 0b00101>;
+def : PState<"DAIFSet", 0b11110>;
+def : PState<"DAIFClr", 0b11111>;
+// v8.1a "Privileged Access Never" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : PState<"PAN", 0b00100>;
+// v8.2a "User Access Override" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : PState<"UAO", 0b00011>;
+
+
+//===----------------------------------------------------------------------===//
+// PSB instruction options.
+//===----------------------------------------------------------------------===//
+
+class PSB<string name, bits<5> encoding> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<5> Encoding;
+ let Encoding = encoding;
+}
+
+def : PSB<"csync", 0x11>;
+
+//===----------------------------------------------------------------------===//
+// TLBI (translation lookaside buffer invalidate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg = 1> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit NeedsReg = needsreg;
+}
+
+def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+
+
+//===----------------------------------------------------------------------===//
+// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+
+class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<16> Encoding;
+ let Encoding{15-14} = op0;
+ let Encoding{13-11} = op1;
+ let Encoding{10-7} = crn;
+ let Encoding{6-3} = crm;
+ let Encoding{2-0} = op2;
+ bit Readable = ?;
+ bit Writeable = ?;
+ code Requires = [{ {} }];
+}
+
+class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 1;
+}
+
+class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 1;
+ let Writeable = 0;
+}
+
+class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2>
+ : SysReg<name, op0, op1, crn, crm, op2> {
+ let Readable = 0;
+ let Writeable = 1;
+}
+
+//===----------------------
+// Read-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>;
+def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
+def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
+def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>;
+def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>;
+def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>;
+def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>;
+def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>;
+def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>;
+def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>;
+def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>;
+def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>;
+def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>;
+def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>;
+def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>;
+def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>;
+def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>;
+def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>;
+def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>;
+def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>;
+def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>;
+def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>;
+def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>;
+def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>;
+def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> {
+ let Requires = [{ {AArch64::HasV8_2aOps} }];
+}
+def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>;
+def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>;
+def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>;
+def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>;
+def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>;
+def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>;
+def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>;
+def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>;
+def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>;
+def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>;
+def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>;
+def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>;
+def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>;
+def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>;
+def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>;
+def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>;
+def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>;
+def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>;
+def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>;
+def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>;
+def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>;
+def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>;
+def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>;
+def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>;
+def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>;
+def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>;
+def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>;
+def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>;
+def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>;
+def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>;
+def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>;
+def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>;
+def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>;
+def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>;
+def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>;
+def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>;
+def : ROSysReg<"ICC_HPPIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b010>;
+def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>;
+def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>;
+def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system register
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>;
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>;
+def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>;
+}
+
+//===----------------------
+// Write-only regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>;
+
+// Trace Registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>;
+def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>;
+def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>;
+def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>;
+def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>;
+def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>;
+
+//===----------------------
+// Read-write regs
+//===----------------------
+
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>;
+def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>;
+def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>;
+def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>;
+def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>;
+def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>;
+def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>;
+def : RWSysReg<"DBGBVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b100>;
+def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>;
+def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>;
+def : RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>;
+def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>;
+def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>;
+def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>;
+def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>;
+def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>;
+def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>;
+def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>;
+def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>;
+def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>;
+def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>;
+def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>;
+def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>;
+def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>;
+def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>;
+def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>;
+def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>;
+def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>;
+def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>;
+def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>;
+def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>;
+def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>;
+def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>;
+def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>;
+def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>;
+def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>;
+def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>;
+def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>;
+def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"OSDLR_EL1", 0b10, 0b000, 0b0001, 0b0011, 0b100>;
+def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>;
+def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>;
+def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>;
+def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>;
+def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>;
+def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
+def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
+def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>;
+def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>;
+def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>;
+def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>;
+def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>;
+def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>;
+def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>;
+def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>;
+def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>;
+def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>;
+def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>;
+def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>;
+def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>;
+def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>;
+def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>;
+def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>;
+def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>;
+def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>;
+def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>;
+def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>;
+def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>;
+def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>;
+def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>;
+def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>;
+def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>;
+def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>;
+def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>;
+def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>;
+def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>;
+def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>;
+def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>;
+def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>;
+def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>;
+def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>;
+def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>;
+def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>;
+def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>;
+def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>;
+def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>;
+def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>;
+def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>;
+def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>;
+def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>;
+def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>;
+def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>;
+def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>;
+def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>;
+def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>;
+def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>;
+def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>;
+def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>;
+def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>;
+def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>;
+def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>;
+def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>;
+def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>;
+def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>;
+def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>;
+def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>;
+def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>;
+def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>;
+def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>;
+def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>;
+def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>;
+def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>;
+def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>;
+def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>;
+def : RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>;
+def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>;
+def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b100>;
+def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>;
+def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>;
+def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>;
+def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>;
+def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>;
+def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>;
+def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>;
+def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>;
+def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>;
+def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>;
+def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>;
+def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>;
+def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>;
+def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>;
+def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>;
+def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>;
+def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>;
+def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>;
+
+// Trace registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>;
+def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>;
+def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>;
+def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>;
+def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>;
+def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>;
+def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>;
+def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>;
+def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>;
+def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>;
+def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>;
+def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>;
+def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>;
+def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>;
+def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>;
+def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR3", 0b10, 0b001, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"TRCCNTCTLR2", 0b10, 0b001, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>;
+def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>;
+def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>;
+def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>;
+def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>;
+def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>;
+def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>;
+def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>;
+def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>;
+def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>;
+def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>;
+def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>;
+def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>;
+def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>;
+def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>;
+def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>;
+def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>;
+def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>;
+def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>;
+def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>;
+def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>;
+def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>;
+def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>;
+def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>;
+def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>;
+def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>;
+def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>;
+def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>;
+def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>;
+def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>;
+def : RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>;
+def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>;
+def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>;
+def : RWSysReg<"TRCSSCSR1", 0b10, 0b001, 0b0001, 0b1001, 0b010>;
+def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>;
+def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>;
+def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>;
+def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>;
+def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>;
+def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>;
+def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>;
+def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>;
+def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>;
+def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>;
+def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>;
+def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>;
+def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>;
+def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>;
+def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>;
+def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>;
+def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>;
+def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>;
+def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>;
+def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>;
+def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>;
+def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>;
+def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>;
+def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>;
+def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>;
+def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>;
+def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>;
+def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>;
+def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>;
+def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>;
+def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>;
+def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>;
+def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>;
+def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>;
+def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>;
+def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>;
+def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>;
+def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>;
+def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>;
+def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>;
+def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>;
+def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>;
+def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>;
+def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 0b1000, 0b100>;
+def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>;
+def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>;
+def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>;
+def : RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>;
+def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>;
+def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>;
+def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>;
+def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>;
+def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>;
+def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>;
+def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>;
+def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>;
+def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>;
+def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>;
+def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>;
+def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>;
+def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>;
+def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>;
+def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>;
+def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>;
+def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>;
+def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>;
+def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>;
+def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>;
+def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>;
+def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>;
+
+// GICv3 registers
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>;
+def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>;
+def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>;
+def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>;
+def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>;
+def : RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>;
+def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>;
+def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>;
+def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
+def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
+def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
+def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
+def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
+def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>;
+def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>;
+def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>;
+def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>;
+def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>;
+def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>;
+def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>;
+
+// v8.1a "Privileged Access Never" extension-specific system registers
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>;
+def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>;
+}
+
+// v8.1a "Virtualization hos extensions" system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>;
+}
+// v8.2a registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>;
+
+// v8.2a "Statistical Profiling extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureSPE} }] in {
+def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
+def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>;
+def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>;
+def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>;
+def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>;
+def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>;
+def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>;
+def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>;
+def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
+}
+
+// v8.2a "RAS extension" registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>;
+def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>;
+def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>;
+def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>;
+def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>;
+def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>;
+def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>;
+}
+
+// Cyclone specific system registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::ProcCyclone} }] in
+def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index c52c5544fc7e..0b6345ff8011 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,13 +11,19 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "AArch64CallLowering.h"
+#include "AArch64RegisterBankInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -58,6 +64,11 @@ EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden,
cl::init(true));
static cl::opt<bool>
+EnableRedundantCopyElimination("aarch64-redundant-copy-elim",
+ cl::desc("Enable the redundant copy elimination pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
" optimization pass"), cl::init(true), cl::Hidden);
@@ -92,11 +103,19 @@ static cl::opt<cl::boolOrDefault>
EnableGlobalMerge("aarch64-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
+static cl::opt<bool>
+ EnableLoopDataPrefetch("aarch64-loop-data-prefetch", cl::Hidden,
+ cl::desc("Enable the loop data prefetch pass"),
+ cl::init(true));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target);
+ auto PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
+ initializeAArch64ExpandPseudoPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -114,29 +133,79 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
if (TT.isOSBinFormatMachO())
return "e-m:o-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
- return "e-m:e-i64:64-i128:128-n32:64-S128";
- return "E-m:e-i64:64-i128:128-n32:64-S128";
+ return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+ return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
}
-/// TargetMachine ctor - Create an AArch64 architecture model.
+// Helper function to set up the defaults for reciprocals.
+static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
+{
+ // For the estimates, convergence is quadratic, so essentially the number of
+ // digits is doubled after each iteration. On ARMv8, the minimum architected
+ // accuracy of the initial estimate is 2^-8. Therefore, the number of extra
+ // steps to refine the result for float (23 mantissa bits) and for double
+ // (52 mantissa bits) are 2 and 3, respectively.
+ unsigned ExtraStepsF = 2,
+ ExtraStepsD = ExtraStepsF + 1;
+ bool UseRsqrt = ST.useRSqrt();
+
+ TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
+ TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD);
+
+ TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD);
+ TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF);
+ TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD);
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ // AArch64 Darwin is always PIC.
+ if (TT.isOSDarwin())
+ return Reloc::PIC_;
+ // On ELF platforms the default static relocation model has a smart enough
+ // linker to cope with referencing external symbols defined in a shared
+ // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+ if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+ return Reloc::Static;
+ return *RM;
+}
+
+/// Create an AArch64 architecture model.
///
-AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool LittleEndian)
+AArch64TargetMachine::AArch64TargetMachine(
+ const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian)
// This nested ternary is horrible, but DL needs to be properly
// initialized before TLInfo is constructed.
: LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
- Options, RM, CM, OL),
+ Options, getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
- isLittle(LittleEndian) {
+ Subtarget(TT, CPU, FS, *this, LittleEndian) {
+ initReciprocals(*this, Subtarget);
initAsmInfo();
}
AArch64TargetMachine::~AArch64TargetMachine() {}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct AArch64GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -156,7 +225,18 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
- isLittle);
+ Subtarget.isLittleEndian());
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ AArch64GISelActualAccessor *GISel =
+ new AArch64GISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AArch64CallLowering(*I->getTargetLowering()));
+ GISel->RegBankInfo.reset(
+ new AArch64RegisterBankInfo(*I->getRegisterInfo()));
+#endif
+ I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -165,16 +245,16 @@ void AArch64leTargetMachine::anchor() { }
AArch64leTargetMachine::AArch64leTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
void AArch64beTargetMachine::anchor() { }
AArch64beTargetMachine::AArch64beTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
@@ -194,6 +274,10 @@ public:
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addRegBankSelect() override;
+#endif
bool addILPOpts() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -223,6 +307,13 @@ void AArch64PassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass());
+ // Run LoopDataPrefetch
+ //
+ // Run this before LSR to remove the multiplies involved in computing the
+ // pointer values N iterations ahead.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
+ addPass(createLoopDataPrefetchPass());
+
TargetPassConfig::addIRPasses();
// Match interleaved memory accesses to ldN/stN intrinsics.
@@ -278,6 +369,17 @@ bool AArch64PassConfig::addInstSelector() {
return false;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool AArch64PassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+bool AArch64PassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+#endif
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
@@ -303,6 +405,10 @@ void AArch64PassConfig::addPreRegAlloc() {
}
void AArch64PassConfig::addPostRegAlloc() {
+ // Remove redundant copy instructions.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
+ addPass(createAArch64RedundantCopyEliminationPass());
+
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
addPass(createAArch64DeadRegisterDefinitions());
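
The initReciprocals comment above argues that, with quadratic convergence and an architected 2^-8 initial estimate, two refinement steps suffice for float and three for double. A minimal standalone sketch of that arithmetic (23/52 mantissa bits plus the implicit bit give 24/53-bit significands); extraSteps is an illustrative helper, not part of the patch.

// Sketch of the step-count reasoning in initReciprocals(): each refinement
// iteration roughly doubles the number of accurate bits, starting from the
// architected 2^-8 estimate (8 bits).
#include <cstdio>

static unsigned extraSteps(unsigned PrecisionBits, unsigned InitialBits = 8) {
  unsigned Steps = 0;
  unsigned Bits = InitialBits;
  while (Bits < PrecisionBits) { // double the accurate bits per iteration
    Bits *= 2;
    ++Steps;
  }
  return Steps;
}

int main() {
  std::printf("float (24-bit significand): %u steps\n", extraSteps(24));  // 2
  std::printf("double (53-bit significand): %u steps\n", extraSteps(53)); // 3
  return 0;
}
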
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 8d49a29386ac..b44107b065bd 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -29,7 +29,7 @@ protected:
public:
AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool IsLittleEndian);
~AArch64TargetMachine() override;
@@ -46,28 +46,28 @@ public:
}
private:
- bool isLittle;
+ AArch64Subtarget Subtarget;
};
-// AArch64leTargetMachine - AArch64 little endian target machine.
+// AArch64 little endian target machine.
//
class AArch64leTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-// AArch64beTargetMachine - AArch64 big endian target machine.
+// AArch64 big endian target machine.
//
class AArch64beTargetMachine : public AArch64TargetMachine {
virtual void anchor();
public:
AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9af0e6444789..ecf4d93068a4 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -291,6 +291,61 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
+int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+ VectorType *VecTy,
+ unsigned Index) {
+
+ // Make sure we were given a valid extend opcode.
+ assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+ "Invalid opcode");
+
+ // We are extending an element we extract from a vector, so the source type
+ // of the extend is the element type of the vector.
+ auto *Src = VecTy->getElementType();
+
+ // Sign- and zero-extends are for integer types only.
+ assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
+
+ // Get the cost for the extract. We compute the cost (if any) for the extend
+ // below.
+ auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+
+ // Legalize the types.
+ auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
+ auto DstVT = TLI->getValueType(DL, Dst);
+ auto SrcVT = TLI->getValueType(DL, Src);
+
+ // If the resulting type is still a vector and the destination type is legal,
+ // we may get the extension for free. If not, get the default cost for the
+ // extend.
+ if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ // The destination type should be larger than the element type. If not, get
+ // the default cost for the extend.
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Opcode should be either SExt or ZExt");
+
+ // For sign-extends, we only need a smov, which performs the extension
+ // automatically.
+ case Instruction::SExt:
+ return Cost;
+
+ // For zero-extends, the extend is performed automatically by a umov unless
+ // the destination type is i64 and the element type is i8 or i16.
+ case Instruction::ZExt:
+ if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
+ return Cost;
+ }
+
+ // If we are unable to perform the extend for free, get the default cost.
+ return Cost + getCastInstrCost(Opcode, Dst, Src);
+}
+
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -313,7 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// All other insert/extracts cost this much.
- return 3;
+ return ST->getVectorInsertExtractBaseCost();
}
int AArch64TTIImpl::getArithmeticInstrCost(
@@ -472,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
}
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
- if (ST->isCortexA57())
- return 4;
- return 2;
+ return ST->getMaxInterleaveFactor();
}
void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -571,3 +624,19 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
return true;
}
+
+unsigned AArch64TTIImpl::getCacheLineSize() {
+ return ST->getCacheLineSize();
+}
+
+unsigned AArch64TTIImpl::getPrefetchDistance() {
+ return ST->getPrefetchDistance();
+}
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+ return ST->getMinPrefetchStride();
+}
+
+unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
+ return ST->getMaxPrefetchIterationsAhead();
+}
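
getExtractWithExtendCost above encodes the rule that a sign-extending extract lowers to smov, which extends for free, while a zero-extending extract lowers to umov, which is free unless the destination is i64 and the lane is i8 or i16. A hedged restatement of that rule as a standalone predicate; extendIsFree is illustrative only, not the LLVM API.

// Standalone illustration of the free-extension rule modelled by
// getExtractWithExtendCost() above.
#include <cstdio>

// Returns true if extending an extracted lane of SrcBits to DstBits is folded
// into the extract: smov always extends, umov cannot zero-extend an 8- or
// 16-bit lane all the way to 64 bits.
static bool extendIsFree(bool IsSigned, unsigned SrcBits, unsigned DstBits) {
  if (IsSigned)
    return true;                          // smov
  return DstBits != 64 || SrcBits == 32;  // umov
}

int main() {
  std::printf("sext i8 -> i64:  %s\n", extendIsFree(true, 8, 64) ? "free" : "paid");
  std::printf("zext i8 -> i64:  %s\n", extendIsFree(false, 8, 64) ? "free" : "paid");
  std::printf("zext i32 -> i64: %s\n", extendIsFree(false, 32, 64) ? "free" : "paid");
  return 0;
}
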
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ec58c4fe309f..4f2e8310d769 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -99,6 +99,9 @@ public:
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
+ unsigned Index);
+
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getArithmeticInstrCost(
@@ -127,6 +130,14 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace);
+
+ unsigned getCacheLineSize();
+
+ unsigned getPrefetchDistance();
+
+ unsigned getMinPrefetchStride();
+
+ unsigned getMaxPrefetchIterationsAhead();
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 394c8e78581f..aebc370333e3 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -13,7 +13,6 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -24,13 +23,14 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdio>
@@ -70,6 +70,8 @@ private:
bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
bool showMatchError(SMLoc Loc, unsigned ErrCode);
+ bool parseDirectiveArch(SMLoc L);
+ bool parseDirectiveCPU(SMLoc L);
bool parseDirectiveWord(unsigned Size, SMLoc L);
bool parseDirectiveInst(SMLoc L);
@@ -866,14 +868,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
- if (RegWidth == 32)
- Value &= 0xffffffffULL;
-
- // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
- if (Value == 0 && Shift != 0)
- return false;
-
- return (Value & ~(0xffffULL << Shift)) == 0;
+ return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
}
template<int RegWidth, int Shift>
@@ -884,16 +879,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
- // MOVZ takes precedence over MOVN.
- for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
- if ((Value & ~(0xffffULL << MOVZShift)) == 0)
- return false;
-
- Value = ~Value;
- if (RegWidth == 32)
- Value &= 0xffffffffULL;
-
- return (Value & ~(0xffffULL << Shift)) == 0;
+ return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
}
bool isFPImm() const { return Kind == k_FPImm; }
@@ -2087,12 +2073,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PRFM::PRFMMapper();
- StringRef Name =
- Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name,
- S, getContext()));
+ auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue());
+ Operands.push_back(AArch64Operand::CreatePrefetch(
+ prfop, PRFM ? PRFM->Name : "", S, getContext()));
return MatchOperand_Success;
}
@@ -2101,18 +2084,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PRFM::PRFMMapper();
- unsigned prfop =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString());
+ if (!PRFM) {
TokError("pre-fetch hint expected");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(),
- S, getContext()));
+ Operands.push_back(AArch64Operand::CreatePrefetch(
+ PRFM->Encoding, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2127,18 +2107,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64PSBHint::PSBHintMapper();
- unsigned psbhint =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString());
+ if (!PSB) {
TokError("invalid operand for instruction");
return MatchOperand_ParseFail;
}
Parser.Lex(); // Eat identifier token.
- Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(),
- S, getContext()));
+ Operands.push_back(AArch64Operand::CreatePSBHint(
+ PSB->Encoding, Tok.getString(), S, getContext()));
return MatchOperand_Success;
}
@@ -2762,12 +2739,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
Error(ExprLoc, "barrier operand out of range");
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64DB::DBarrierMapper();
- StringRef Name =
- Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
- Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name,
- ExprLoc, getContext()));
+ auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue());
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
return MatchOperand_Success;
}
@@ -2776,23 +2750,20 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- bool Valid;
- auto Mapper = AArch64DB::DBarrierMapper();
- unsigned Opt =
- Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
- if (!Valid) {
+ auto DB = AArch64DB::lookupDBByName(Tok.getString());
+ if (!DB) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
}
// The only valid named option for ISB is 'sy'
- if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+ if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
}
- Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(),
- getLoc(), getContext()));
+ Operands.push_back(AArch64Operand::CreateBarrier(
+ DB->Encoding, Tok.getString(), getLoc(), getContext()));
Parser.Lex(); // Consume the option
return MatchOperand_Success;
@@ -2806,28 +2777,22 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- bool IsKnown;
- auto MRSMapper = AArch64SysReg::MRSMapper();
- uint32_t MRSReg = MRSMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (MRSReg != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- auto MSRMapper = AArch64SysReg::MSRMapper();
- uint32_t MSRReg = MSRMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (MSRReg != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- auto PStateMapper = AArch64PState::PStateMapper();
- uint32_t PStateField =
- PStateMapper.fromString(Tok.getString(),
- getSTI().getFeatureBits(), IsKnown);
- assert(IsKnown == (PStateField != -1U) &&
- "register should be -1 if and only if it's unknown");
-
- Operands.push_back(AArch64Operand::CreateSysReg(
- Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext()));
+ int MRSReg, MSRReg;
+ auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
+ if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) {
+ MRSReg = SysReg->Readable ? SysReg->Encoding : -1;
+ MSRReg = SysReg->Writeable ? SysReg->Encoding : -1;
+ } else
+ MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString());
+
+ auto PState = AArch64PState::lookupPStateByName(Tok.getString());
+ unsigned PStateImm = -1;
+ if (PState && PState->haveFeatures(getSTI().getFeatureBits()))
+ PStateImm = PState->Encoding;
+
+ Operands.push_back(
+ AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg,
+ PStateImm, getContext()));
Parser.Lex(); // Eat identifier
return MatchOperand_Success;
@@ -4195,6 +4160,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
SMLoc Loc = DirectiveID.getLoc();
+ if (IDVal == ".arch")
+ return parseDirectiveArch(Loc);
+ if (IDVal == ".cpu")
+ return parseDirectiveCPU(Loc);
if (IDVal == ".hword")
return parseDirectiveWord(2, Loc);
if (IDVal == ".word")
@@ -4216,6 +4185,99 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveLOH(IDVal, Loc);
}
+static const struct {
+ const char *Name;
+ const FeatureBitset Features;
+} ExtensionMap[] = {
+ { "crc", {AArch64::FeatureCRC} },
+ { "crypto", {AArch64::FeatureCrypto} },
+ { "fp", {AArch64::FeatureFPARMv8} },
+ { "simd", {AArch64::FeatureNEON} },
+
+ // FIXME: Unsupported extensions
+ { "lse", {} },
+ { "pan", {} },
+ { "lor", {} },
+ { "rdma", {} },
+ { "profile", {} },
+};
+
+/// parseDirectiveArch
+/// ::= .arch token
+bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
+ SMLoc ArchLoc = getLoc();
+
+ StringRef Arch, ExtensionString;
+ std::tie(Arch, ExtensionString) =
+ getParser().parseStringToEndOfStatement().trim().split('+');
+
+ unsigned ID = AArch64::parseArch(Arch);
+ if (ID == ARM::AK_INVALID) {
+ Error(ArchLoc, "unknown arch name");
+ return false;
+ }
+
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures("", "");
+ if (!ExtensionString.empty())
+ STI.setDefaultFeatures("", ("+" + ExtensionString).str());
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+ return false;
+}
+
+/// parseDirectiveCPU
+/// ::= .cpu id
+bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
+ SMLoc CPULoc = getLoc();
+
+ StringRef CPU, ExtensionString;
+ std::tie(CPU, ExtensionString) =
+ getParser().parseStringToEndOfStatement().trim().split('+');
+
+ SmallVector<StringRef, 4> RequestedExtensions;
+ if (!ExtensionString.empty())
+ ExtensionString.split(RequestedExtensions, '+');
+
+ // FIXME: This is using tablegen data, but should be moved to ARMTargetParser
+ // once that is tablegen'ed
+ if (!getSTI().isCPUStringValid(CPU)) {
+ Error(CPULoc, "unknown CPU name");
+ return false;
+ }
+
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures(CPU, "");
+
+ FeatureBitset Features = STI.getFeatureBits();
+ for (auto Name : RequestedExtensions) {
+ bool EnableFeature = true;
+
+ if (Name.startswith_lower("no")) {
+ EnableFeature = false;
+ Name = Name.substr(2);
+ }
+
+ for (const auto &Extension : ExtensionMap) {
+ if (Extension.Name != Name)
+ continue;
+
+ if (Extension.Features.none())
+ report_fatal_error("unsupported architectural extension: " + Name);
+
+ FeatureBitset ToggleFeatures = EnableFeature
+ ? (~Features & Extension.Features)
+ : ( Features & Extension.Features);
+ uint64_t Features =
+ ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+ setAvailableFeatures(Features);
+
+ break;
+ }
+ }
+ return false;
+}
+
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
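
parseDirectiveCPU above applies "+ext" and "+noext" suffixes by computing ToggleFeatures, the set of bits whose state actually differs from the request, and handing it to STI.ToggleFeature. A sketch of that bit logic using a plain 64-bit mask instead of FeatureBitset; toggleMask and the feature constants are illustrative assumptions.

// Mirror of the toggle computation in parseDirectiveCPU(): only bits that
// differ from the requested state are toggled, so already-enabled features
// stay enabled and already-disabled ones stay disabled.
#include <cstdint>
#include <cstdio>

static uint64_t toggleMask(uint64_t Current, uint64_t Extension, bool Enable) {
  return Enable ? (~Current & Extension)  // bits to turn on
                : ( Current & Extension); // bits to turn off
}

int main() {
  const uint64_t FeatureCRC = 1u << 0, FeatureCrypto = 1u << 1;
  uint64_t Features = FeatureCRC;                          // CPU default: +crc
  Features ^= toggleMask(Features, FeatureCrypto, true);   // ".cpu foo+crypto"
  Features ^= toggleMask(Features, FeatureCRC, false);     // "...+nocrc"
  std::printf("features = 0x%llx\n", (unsigned long long)Features); // 0x2
  return 0;
}
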
diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile
deleted file mode 100644
index 00268c76f8e8..000000000000
--- a/lib/Target/AArch64/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64AsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index f26327ff84ad..a79960ea9605 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -12,8 +12,25 @@ tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
+
add_public_tablegen_target(AArch64CommonTableGen)
+# List of all GlobalISel files.
+set(GLOBAL_ISEL_FILES
+ AArch64CallLowering.cpp
+ AArch64RegisterBankInfo.cpp
+ )
+
+# Add GlobalISel files to the dependencies if the user wants to build it.
+if(LLVM_BUILD_GLOBAL_ISEL)
+ set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
+else()
+ set(GLOBAL_ISEL_BUILD_FILES "")
+ set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+endif()
+
+
add_llvm_target(AArch64CodeGen
AArch64A57FPLoadBalancing.cpp
AArch64AddressTypePromotion.cpp
@@ -29,6 +46,7 @@ add_llvm_target(AArch64CodeGen
AArch64A53Fix835769.cpp
AArch64FrameLowering.cpp
AArch64ConditionOptimizer.cpp
+ AArch64RedundantCopyElimination.cpp
AArch64ISelDAGToDAG.cpp
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
@@ -43,6 +61,7 @@ add_llvm_target(AArch64CodeGen
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
+ ${GLOBAL_ISEL_BUILD_FILES}
)
add_dependencies(LLVMAArch64CodeGen intrinsics_gen)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index f1f968e73123..fe6ea31b9061 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1523,13 +1523,12 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
Inst.addOperand(MCOperand::createImm(pstate_field));
Inst.addOperand(MCOperand::createImm(crm));
- bool ValidNamed;
- const AArch64Disassembler *Dis =
+ const AArch64Disassembler *Dis =
static_cast<const AArch64Disassembler *>(Decoder);
- (void)AArch64PState::PStateMapper().toString(pstate_field,
- Dis->getSubtargetInfo().getFeatureBits(), ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ auto PState = AArch64PState::lookupPStateByEncoding(pstate_field);
+ if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+ return Success;
+ return Fail;
}
static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
@@ -1574,7 +1573,7 @@ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
- return DecodeGPRSeqPairsClassRegisterClass(Inst,
+ return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::WSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
@@ -1583,7 +1582,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
- return DecodeGPRSeqPairsClassRegisterClass(Inst,
+ return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::XSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 7fb57adfeeba..e475e505e7d1 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
namespace llvm {
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 82bc949927ce..19d0ba2e1c41 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -134,9 +134,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
CommentStream << "literal pool symbol address: " << ReferenceName;
else if (ReferenceType ==
- LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- CommentStream << "literal pool for: \"" << ReferenceName << "\"";
- else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
+ CommentStream << "literal pool for: \"";
+ CommentStream.write_escaped(ReferenceName);
+ CommentStream << "\"";
+ } else if (ReferenceType ==
LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
else if (ReferenceType ==
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 12b8450b13c6..49e844963797 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
-#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
namespace llvm {
diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile
deleted file mode 100644
index 741bb817a633..000000000000
--- a/lib/Target/AArch64/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Disassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index d8a810824370..b4f85204714f 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -219,6 +219,54 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
+ // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their
+ // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 >
+ // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction
+ // that can represent the move is the MOV alias, and the rest get printed
+ // normally.
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) &&
+ MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32;
+ int Shift = MI->getOperand(2).getImm();
+ uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift;
+
+ if (AArch64_AM::isMOVZMovAlias(Value, Shift,
+ Opcode == AArch64::MOVZXi ? 64 : 32)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
+ if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+ MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32;
+ int Shift = MI->getOperand(2).getImm();
+ uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift);
+ if (RegWidth == 32)
+ Value = Value & 0xffffffff;
+
+ if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
+ if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) &&
+ (MI->getOperand(1).getReg() == AArch64::XZR ||
+ MI->getOperand(1).getReg() == AArch64::WZR) &&
+ MI->getOperand(2).isImm()) {
+ int RegWidth = Opcode == AArch64::ORRXri ? 64 : 32;
+ uint64_t Value = AArch64_AM::decodeLogicalImmediate(
+ MI->getOperand(2).getImm(), RegWidth);
+ if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) {
+ O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << formatImm(SignExtend64(Value, RegWidth));
+ return;
+ }
+ }
+
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -928,14 +976,21 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
unsigned Reg = Op.getReg();
O << getRegisterName(Reg);
} else if (Op.isImm()) {
- O << '#' << Op.getImm();
+ printImm(MI, OpNo, STI, O);
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
Op.getExpr()->print(O, &MAI);
}
}
-void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo,
+void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ O << "#" << formatImm(Op.getImm());
+}
+
+void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -981,12 +1036,12 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
assert(Val == MO.getImm() && "Add/sub immediate out of range!");
unsigned Shift =
AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
- O << '#' << Val;
+ O << '#' << formatImm(Val);
if (Shift != 0)
printShifter(MI, OpNum + 1, STI, O);
if (CommentStream)
- *CommentStream << '=' << (Val << Shift) << '\n';
+ *CommentStream << '=' << formatImm(Val << Shift) << '\n';
} else {
assert(MO.isExpr() && "Unexpected operand type!");
MO.getExpr()->print(O, &MAI);
@@ -1104,14 +1159,14 @@ template<int Scale>
void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- O << '#' << Scale * MI->getOperand(OpNum).getImm();
+ O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm());
}
void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
unsigned Scale, raw_ostream &O) {
const MCOperand MO = MI->getOperand(OpNum);
if (MO.isImm()) {
- O << "#" << (MO.getImm() * Scale);
+ O << "#" << formatImm(MO.getImm() * Scale);
} else {
assert(MO.isExpr() && "Unexpected operand type!");
MO.getExpr()->print(O, &MAI);
@@ -1123,7 +1178,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
const MCOperand MO1 = MI->getOperand(OpNum + 1);
O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
if (MO1.isImm()) {
- O << ", #" << (MO1.getImm() * Scale);
+ O << ", #" << formatImm(MO1.getImm() * Scale);
} else {
assert(MO1.isExpr() && "Unexpected operand type!");
O << ", ";
@@ -1136,26 +1191,22 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned prfop = MI->getOperand(OpNum).getImm();
- bool Valid;
- StringRef Name =
- AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name;
+ auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
+ if (PRFM)
+ O << PRFM->Name;
else
- O << '#' << prfop;
+ O << '#' << formatImm(prfop);
}
void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned psbhintop = MI->getOperand(OpNum).getImm();
- bool Valid;
- StringRef Name =
- AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name;
+ auto PSB = AArch64PSBHint::lookupPSBByEncoding(psbhintop);
+ if (PSB)
+ O << PSB->Name;
else
- O << '#' << psbhintop;
+ O << '#' << formatImm(psbhintop);
}
void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
@@ -1310,7 +1361,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() * 4);
+ O << "#" << formatImm(Op.getImm() * 4);
return;
}
@@ -1335,7 +1386,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() * (1 << 12));
+ O << "#" << formatImm(Op.getImm() * (1 << 12));
return;
}
@@ -1349,15 +1400,15 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
unsigned Val = MI->getOperand(OpNo).getImm();
unsigned Opcode = MI->getOpcode();
- bool Valid;
StringRef Name;
- if (Opcode == AArch64::ISB)
- Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(),
- Valid);
- else
- Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(),
- Valid);
- if (Valid)
+ if (Opcode == AArch64::ISB) {
+ auto ISB = AArch64ISB::lookupISBByEncoding(Val);
+ Name = ISB ? ISB->Name : "";
+ } else {
+ auto DB = AArch64DB::lookupDBByEncoding(Val);
+ Name = DB ? DB->Name : "";
+ }
+ if (!Name.empty())
O << Name;
else
O << "#" << Val;
@@ -1368,10 +1419,19 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- auto Mapper = AArch64SysReg::MRSMapper();
- std::string Name = Mapper.toString(Val, STI.getFeatureBits());
+ // Horrible hack for the one register that has identical encodings but
+ // different names in MSR and MRS. Because of this, one of MRS and MSR is
+ // going to get the wrong entry
+ if (Val == AArch64SysReg::DBGDTRRX_EL0) {
+ O << "DBGDTRRX_EL0";
+ return;
+ }
- O << StringRef(Name).upper();
+ const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+ if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
+ O << Reg->Name;
+ else
+ O << AArch64SysReg::genericRegisterString(Val);
}
void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
@@ -1379,10 +1439,19 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- auto Mapper = AArch64SysReg::MSRMapper();
- std::string Name = Mapper.toString(Val, STI.getFeatureBits());
+ // Horrible hack for the one register that has identical encodings but
+ // different names in MSR and MRS. Because of this, one of MRS and MSR is
+ // going to get the wrong entry
+ if (Val == AArch64SysReg::DBGDTRTX_EL0) {
+ O << "DBGDTRTX_EL0";
+ return;
+ }
- O << StringRef(Name).upper();
+ const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+ if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
+ O << Reg->Name;
+ else
+ O << AArch64SysReg::genericRegisterString(Val);
}
void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
@@ -1390,13 +1459,11 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
- StringRef Name =
- AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid);
- if (Valid)
- O << Name.upper();
+ auto PState = AArch64PState::lookupPStateByEncoding(Val);
+ if (PState && PState->haveFeatures(STI.getFeatureBits()))
+ O << PState->Name;
else
- O << "#" << Val;
+ O << "#" << formatImm(Val);
}
void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index ea68d9848b42..65dca99ed04e 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -49,7 +49,9 @@ protected:
// Operand printers
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile
deleted file mode 100644
index b17e8d080119..000000000000
--- a/lib/Target/AArch64/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64AsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 642c18394a67..0196c505ba3c 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 648b1dfc8c5e..3e5ef4df4706 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -753,6 +753,49 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
return (EncVal << 32) | EncVal;
}
+inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) {
+ for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
+ if ((Value & ~(0xffffULL << Shift)) == 0)
+ return true;
+
+ return false;
+}
+
+inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) {
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+ if (Value == 0 && Shift != 0)
+ return false;
+
+ return (Value & ~(0xffffULL << Shift)) == 0;
+}
+
+inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) {
+ // MOVZ takes precedence over MOVN.
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return false;
+
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ return isMOVZMovAlias(Value, Shift, RegWidth);
+}
+
+inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
+ if (isAnyMOVZMovAlias(Value, RegWidth))
+ return true;
+
+ // It's not a MOVZ, but it might be a MOVN.
+ Value = ~Value;
+ if (RegWidth == 32)
+ Value &= 0xffffffffULL;
+
+ return isAnyMOVZMovAlias(Value, RegWidth);
+}
+
} // end namespace AArch64_AM
} // end namespace llvm
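
The four helpers added above decide when a plain MOV immediate can stand in for MOVZ or MOVN: MOVZ covers any value whose only non-zero bits sit in a single aligned 16-bit chunk, zero is an alias only with "lsl #0", and MOVN is considered only when MOVZ does not already apply. The snippet below is a self-contained sanity check of those rules, with the predicates restated locally so it compiles on its own; it is illustrative only, the authoritative definitions are the AArch64_AM functions in the hunk above.

  // Quick check of the MOVZ/MOVN alias rules, restated locally.
  #include <cassert>
  #include <cstdint>

  static bool anyMovzAlias(uint64_t V, int W) {
    for (int S = 0; S <= W - 16; S += 16)
      if ((V & ~(0xffffULL << S)) == 0)
        return true;
    return false;
  }
  static bool movzAlias(uint64_t V, int Shift, int W) {
    if (W == 32) V &= 0xffffffffULL;
    if (V == 0 && Shift != 0) return false;       // "#0, lsl #0" wins
    return (V & ~(0xffffULL << Shift)) == 0;
  }
  static bool movnAlias(uint64_t V, int Shift, int W) {
    if (anyMovzAlias(V, W)) return false;         // MOVZ takes precedence
    V = ~V;
    if (W == 32) V &= 0xffffffffULL;
    return movzAlias(V, Shift, W);
  }

  int main() {
    assert(movzAlias(0x0000123400000000ULL, 32, 64));     // one aligned chunk
    assert(!movzAlias(0, 16, 64) && movzAlias(0, 0, 64)); // zero: lsl #0 only
    assert(movnAlias(0xffffffffffffabcdULL, 0, 64));      // ~value fits chunk 0
    assert(!movnAlias(0x0000ffff00000000ULL, 32, 64));    // already MOVZ-able
    return 0;
  }
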
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 7624c7240d68..27993246eb07 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -28,9 +29,12 @@ namespace {
class AArch64AsmBackend : public MCAsmBackend {
static const unsigned PCRelFlagVal =
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+public:
+ bool IsLittleEndian;
public:
- AArch64AsmBackend(const Target &T) : MCAsmBackend() {}
+ AArch64AsmBackend(const Target &T, bool IsLittleEndian)
+ : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
unsigned getNumFixupKinds() const override {
return AArch64::NumTargetFixupKinds;
@@ -74,12 +78,15 @@ public:
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
unsigned getPointerSize() const { return 8; }
+
+ unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const;
};
} // end anonymous namespace
@@ -129,14 +136,16 @@ static unsigned AdrImmBits(unsigned Value) {
return (hi19 << 5) | (lo2 << 29);
}
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx) {
+ unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
- if (SignedValue > 2097151 || SignedValue < -2097152)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -144,54 +153,66 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
case AArch64::fixup_aarch64_pcrel_branch19:
// Signed 21-bit immediate
if (SignedValue > 2097151 || SignedValue < -2097152)
- report_fatal_error("fixup value out of range");
+ if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
// Low two bits are not encoded.
return (Value >> 2) & 0x7ffff;
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
// Unsigned 12-bit immediate
- if (Value >= 0x1000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && Value >= 0x1000)
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
return Value;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
// Unsigned 12-bit immediate which gets multiplied by 2
- if (Value & 1 || Value >= 0x2000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x2000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x1))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
return Value >> 1;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
// Unsigned 12-bit immediate which gets multiplied by 4
- if (Value & 3 || Value >= 0x4000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x4000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
return Value >> 2;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
// Unsigned 12-bit immediate which gets multiplied by 8
- if (Value & 7 || Value >= 0x8000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x8000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0x7))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
return Value >> 3;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
// Unsigned 12-bit immediate which gets multiplied by 16
- if (Value & 15 || Value >= 0x10000)
- report_fatal_error("invalid imm12 fixup value");
+ if (Ctx && (Value >= 0x10000))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Ctx && (Value & 0xf))
+ Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
return Value >> 4;
case AArch64::fixup_aarch64_movw:
- report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+ if (Ctx)
+ Ctx->reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
return Value;
case AArch64::fixup_aarch64_pcrel_branch14:
// Signed 16-bit immediate
- if (SignedValue > 32767 || SignedValue < -32768)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Value & 0x3)
- report_fatal_error("fixup not sufficiently aligned");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
// Signed 28-bit immediate
- if (SignedValue > 134217727 || SignedValue < -134217728)
- report_fatal_error("fixup value out of range");
+ if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
+ Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Value & 0x3)
- report_fatal_error("fixup not sufficiently aligned");
+ if (Ctx && (Value & 0x3))
+ Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
case FK_Data_1:
case FK_Data_2:
@@ -201,6 +222,45 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
}
}
+/// getFixupKindContainereSizeInBytes - The number of bytes of the container
+/// involved when targeting big-endian, or 0 if the item is little-endian.
+unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const {
+ if (IsLittleEndian)
+ return 0;
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4:
+ return 4;
+ case FK_Data_8:
+ return 8;
+
+ case AArch64::fixup_aarch64_tlsdesc_call:
+ case AArch64::fixup_aarch64_movw:
+ case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_add_imm12:
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+ case AArch64::fixup_aarch64_pcrel_branch19:
+ case AArch64::fixup_aarch64_pcrel_adr_imm21:
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ // Instructions are always little endian
+ return 0;
+ }
+}
+
void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
@@ -209,7 +269,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
// Apply any target-specific value adjustments.
- Value = adjustFixupValue(Fixup.getKind(), Value);
+ Value = adjustFixupValue(Fixup, Value, nullptr);
// Shift the value into position.
Value <<= Info.TargetOffset;
@@ -217,10 +277,25 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned Offset = Fixup.getOffset();
assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ // Used to point to big endian bytes.
+ unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
+
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ if (FulleSizeInBytes == 0) {
+ // Handle as little-endian
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
+ } else {
+ // Handle as big-endian
+ assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = FulleSizeInBytes - 1 - i;
+ Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
+ }
}
bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
@@ -239,6 +314,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
}
void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
MCInst &Res) const {
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
@@ -264,14 +340,14 @@ namespace CU {
enum CompactUnwindEncodings {
/// \brief A "frameless" leaf function, where no non-volatile registers are
/// saved. The return address remains in LR throughout the function.
- UNWIND_AArch64_MODE_FRAMELESS = 0x02000000,
+ UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
/// \brief No compact unwind encoding available. Instead the low 23 bits of
/// the compact unwind encoding are the offset of the DWARF FDE in the
/// __eh_frame section. This mode is never used in object files. It is only
/// generated by the linker in final linked images, which have only DWARF info
/// for a function.
- UNWIND_AArch64_MODE_DWARF = 0x03000000,
+ UNWIND_ARM64_MODE_DWARF = 0x03000000,
/// \brief This is a standard arm64 prologue where FP/LR are immediately
/// pushed on the stack, then SP is copied to FP. If there are any
@@ -279,18 +355,18 @@ enum CompactUnwindEncodings {
/// in a contiguous range right below the saved FP/LR pair. Any subset of the
/// five X pairs and four D pairs can be saved, but the memory layout must be
/// in register number order.
- UNWIND_AArch64_MODE_FRAME = 0x04000000,
+ UNWIND_ARM64_MODE_FRAME = 0x04000000,
/// \brief Frame register pair encodings.
- UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001,
- UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002,
- UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004,
- UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008,
- UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010,
- UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100,
- UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200,
- UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400,
- UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800
+ UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
+ UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
+ UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
+ UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
+ UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
+ UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
+ UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
+ UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
+ UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
};
} // end CU namespace
@@ -300,7 +376,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
/// \brief Encode compact unwind stack adjustment for frameless functions.
- /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+ /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
/// The stack size always needs to be 16 byte aligned.
uint32_t encodeStackAdjustment(uint32_t StackSize) const {
return (StackSize / 16) << 12;
@@ -308,7 +384,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
public:
DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
- : AArch64AsmBackend(T), MRI(MRI) {}
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
@@ -319,7 +395,7 @@ public:
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty())
- return CU::UNWIND_AArch64_MODE_FRAMELESS;
+ return CU::UNWIND_ARM64_MODE_FRAMELESS;
bool HasFP = false;
unsigned StackSize = 0;
@@ -331,7 +407,7 @@ public:
switch (Inst.getOperation()) {
default:
// Cannot handle this directive: bail out.
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
@@ -356,7 +432,7 @@ public:
"Pushing invalid registers for frame!");
// Indicate that the function has a frame.
- CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
HasFP = true;
break;
}
@@ -370,11 +446,11 @@ public:
// `.cfi_offset' instructions with the appropriate registers specified.
unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
if (i + 1 == e)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &Inst2 = Instrs[++i];
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
// N.B. The encodings must be in register number order, and the X
@@ -390,19 +466,19 @@ public:
if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
(CompactUnwindEncoding & 0xF1E) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
(CompactUnwindEncoding & 0xF1C) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
(CompactUnwindEncoding & 0xF18) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
(CompactUnwindEncoding & 0xF10) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
(CompactUnwindEncoding & 0xF00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
else {
Reg1 = getDRegFromBReg(Reg1);
Reg2 = getDRegFromBReg(Reg2);
@@ -413,18 +489,18 @@ public:
// D14/D15 pair = 0x00000800
if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
(CompactUnwindEncoding & 0xE00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
(CompactUnwindEncoding & 0xC00) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
(CompactUnwindEncoding & 0x800) == 0)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
- CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
else
// A pair was pushed which we cannot handle.
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
}
break;
@@ -436,9 +512,9 @@ public:
// With compact unwind info we can only represent stack adjustments of up
// to 65520 bytes.
if (StackSize > 65520)
- return CU::UNWIND_AArch64_MODE_DWARF;
+ return CU::UNWIND_ARM64_MODE_DWARF;
- CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS;
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
}
@@ -453,10 +529,9 @@ namespace {
class ELFAArch64AsmBackend : public AArch64AsmBackend {
public:
uint8_t OSABI;
- bool IsLittleEndian;
ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian)
- : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {}
+ : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian);
@@ -466,9 +541,6 @@ public:
const MCFixup &Fixup, const MCFragment *DF,
const MCValue &Target, uint64_t &Value,
bool &IsResolved) override;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
};
void ELFAArch64AsmBackend::processFixupValue(
@@ -489,34 +561,14 @@ void ELFAArch64AsmBackend::processFixupValue(
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
IsResolved = false;
-}
-
-// Returns whether this fixup is based on an address in the .eh_frame section,
-// and therefore should be byte swapped.
-// FIXME: Should be replaced with something more principled.
-static bool isByteSwappedFixup(const MCExpr *E) {
- MCValue Val;
- if (!E->evaluateAsRelocatable(Val, nullptr, nullptr))
- return false;
- if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined())
- return false;
-
- const MCSectionELF *SecELF =
- dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection());
- return SecELF->getSectionName() == ".eh_frame";
+ // Try to get the encoded value for the fixup as if we're mapping it into
+ // the instruction. This allows adjustFixupValue() to issue a diagnostic
+ // if the value is invalid.
+ if (IsResolved)
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
-void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
- // store fixups in .eh_frame section in big endian order
- if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
- if (isByteSwappedFixup(Fixup.getValue()))
- Value = ByteSwap_32(unsigned(Value));
- }
- AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
-}
}
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
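
Two behavioural changes in the asm backend above are worth calling out: adjustFixupValue() now reports range and alignment problems through MCContext, with the fixup's source location, instead of calling report_fatal_error, and applyFixup() writes data fixups for big-endian targets starting from the most significant byte of their container while instruction fixups stay little-endian. The following is a standalone sketch of that byte-placement rule only, not LLVM code; the buffer handling is simplified.

  // Sketch of the endian-aware OR-in loop: container size 0 means
  // little-endian placement, otherwise bytes are placed from the most
  // significant end of the container.
  #include <cassert>
  #include <cstdint>

  void patchBytes(uint8_t *Data, unsigned Offset, unsigned NumBytes,
                  uint64_t Value, unsigned ContainerSize /*0 = little-endian*/) {
    if (ContainerSize == 0) {
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
    } else {
      assert(NumBytes <= ContainerSize && "fixup wider than its container");
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + ContainerSize - 1 - i] |= uint8_t((Value >> (i * 8)) & 0xff);
    }
  }

  int main() {
    uint8_t LE[4] = {0}, BE[4] = {0};
    patchBytes(LE, 0, 4, 0x11223344, 0); // little-endian: 44 33 22 11
    patchBytes(BE, 0, 4, 0x11223344, 4); // big-endian:    11 22 33 44
    assert(LE[0] == 0x44 && BE[0] == 0x11);
    return 0;
  }
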
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 1f516d1db896..4b4c4097b97b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,8 +30,8 @@ public:
~AArch64ELFObjectWriter() override;
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
private:
};
@@ -43,9 +44,10 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
-unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
+unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
@@ -61,6 +63,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
case FK_Data_2:
return ELF::R_AARCH64_PREL16;
case FK_Data_4:
@@ -79,7 +84,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
return ELF::R_AARCH64_TLSDESC_ADR_PAGE21;
- llvm_unreachable("invalid symbol kind for ADRP relocation");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid symbol kind for ADRP relocation");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_pcrel_branch26:
return ELF::R_AARCH64_JUMP26;
case AArch64::fixup_aarch64_pcrel_call26:
@@ -93,10 +100,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
case AArch64::fixup_aarch64_pcrel_branch19:
return ELF::R_AARCH64_CONDBR19;
default:
- llvm_unreachable("Unsupported pc-relative fixup kind");
+ Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind");
+ return ELF::R_AARCH64_NONE;
}
} else {
switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_1:
+ Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+ return ELF::R_AARCH64_NONE;
case FK_Data_2:
return ELF::R_AARCH64_ABS16;
case FK_Data_4:
@@ -121,8 +132,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_ADD_ABS_LO12_NC;
- report_fatal_error("invalid fixup for add (uimm12) instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for add (uimm12) instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale1:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
@@ -135,8 +147,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 8-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 8-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
@@ -149,8 +162,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 16-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 16-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
@@ -163,8 +177,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
- report_fatal_error("invalid fixup for 32-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 32-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
@@ -183,14 +198,16 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
- report_fatal_error("invalid fixup for 64-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 64-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
- report_fatal_error("invalid fixup for 128-bit load/store instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 128-bit load/store instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_movw:
if (RefKind == AArch64MCExpr::VK_ABS_G3)
return ELF::R_AARCH64_MOVW_UABS_G3;
@@ -236,12 +253,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
- report_fatal_error("invalid fixup for movz/movk instruction");
- return 0;
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for movz/movk instruction");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_tlsdesc_call:
return ELF::R_AARCH64_TLSDESC_CALL;
default:
- llvm_unreachable("Unknown ELF relocation type");
+ Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type");
+ return ELF::R_AARCH64_NONE;
}
}
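
getRelocType() above follows the same diagnostic pattern: unsupported or mismatched fixups now call Ctx.reportError() and return a harmless R_AARCH64_NONE instead of aborting, so one assembly run can surface every bad relocation with its location. Below is a toy illustration of that shape, not LLVM code; the Diags class and the relocation enum are stand-ins for MCContext and the ELF constants.

  // Toy error-collecting relocation chooser: record a diagnostic and return
  // a "none" relocation so processing continues; failure is reported at the
  // end once all errors have been gathered.
  #include <cstdio>
  #include <string>
  #include <vector>

  enum RelocType { R_NONE, R_PREL16, R_PREL32 };

  struct Diags {
    std::vector<std::string> Errors;
    void reportError(unsigned Line, const std::string &Msg) {
      Errors.push_back("line " + std::to_string(Line) + ": " + Msg);
    }
  };

  RelocType pickPCRelDataReloc(Diags &Ctx, unsigned Line, unsigned SizeInBytes) {
    switch (SizeInBytes) {
    case 2: return R_PREL16;
    case 4: return R_PREL32;
    default:
      Ctx.reportError(Line, "unsupported pc-relative data relocation size");
      return R_NONE; // keep going; the caller fails the run at the end
    }
  }

  int main() {
    Diags Ctx;
    pickPCRelDataReloc(Ctx, 7, 1);
    std::printf("%zu error(s)\n", Ctx.Errors.size());
    return 0;
  }
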
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 7d8e79bc63c8..7b9ff8fa0503 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -154,24 +154,6 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getSIMDShift64OpValue - Return the encoded value for the
- // shift-by-immediate AdvSIMD instructions.
- uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
- uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
@@ -428,41 +410,6 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
llvm_unreachable("Invalid value for vector shift amount!");
}
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 64 - (MO.getImm());
-}
-
-uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
- const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 64 - (MO.getImm() | 32);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 32 - (MO.getImm() | 16);
-}
-
-uint32_t
-AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- assert(MO.isImm() && "Expected an immediate value for the shift amount!");
- return 16 - (MO.getImm() | 8);
-}
-
/// getFixedPointScaleOpValue - Return the encoded value for the
// FP-to-fixed-point scale factor.
uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 9f7bed0d3b12..702780621208 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
#include "InstPrinter/AArch64InstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -72,10 +71,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
"Only expect Darwin and ELF targets");
@@ -89,19 +86,6 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
else if (CM != CodeModel::Small && CM != CodeModel::Large)
report_fatal_error(
"Only small and large code models are allowed on AArch64");
-
- // AArch64 Darwin is always PIC.
- if (TT.isOSDarwin())
- RM = Reloc::PIC_;
- // On ELF platforms the default static relocation model has a smart enough
- // linker to cope with referencing external symbols defined in a shared
- // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
- else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
- RM = Reloc::Static;
-
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
@@ -140,7 +124,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo);
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 342384437c6a..39414cc0c6a5 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
-#include <string>
namespace llvm {
class formatted_raw_ostream;
diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile
deleted file mode 100644
index 5779ac5ac60a..000000000000
--- a/lib/Target/AArch64/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Desc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile
deleted file mode 100644
index f356c5850413..000000000000
--- a/lib/Target/AArch64/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAArch64CodeGen
-TARGET = AArch64
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \
- AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \
- AArch64GenDAGISel.inc \
- AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \
- AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \
- AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \
- AArch64GenMCPseudoLowering.inc
-
-DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile
deleted file mode 100644
index 9dc9aa4bccf7..000000000000
--- a/lib/Target/AArch64/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Info
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index cde1c6df2608..e65ba1f2401d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -11,858 +11,84 @@
//
//===----------------------------------------------------------------------===//
#include "AArch64BaseInfo.h"
-#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Regex.h"
using namespace llvm;
-StringRef AArch64NamedImmMapper::toString(uint32_t Value,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- for (unsigned i = 0; i < NumMappings; ++i) {
- if (Mappings[i].isValueEqual(Value, FeatureBits)) {
- Valid = true;
- return Mappings[i].Name;
- }
+namespace llvm {
+ namespace AArch64AT {
+#define GET_AT_IMPL
+#include "AArch64GenSystemOperands.inc"
}
-
- Valid = false;
- return StringRef();
}
-uint32_t AArch64NamedImmMapper::fromString(StringRef Name,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- std::string LowerCaseName = Name.lower();
- for (unsigned i = 0; i < NumMappings; ++i) {
- if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) {
- Valid = true;
- return Mappings[i].Value;
- }
- }
- Valid = false;
- return -1;
+namespace llvm {
+ namespace AArch64DB {
+#define GET_DB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-bool AArch64NamedImmMapper::validImm(uint32_t Value) const {
- return Value < TooBigImm;
+namespace llvm {
+ namespace AArch64DC {
+#define GET_DC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = {
- {"s1e1r", S1E1R, {}},
- {"s1e2r", S1E2R, {}},
- {"s1e3r", S1E3R, {}},
- {"s1e1w", S1E1W, {}},
- {"s1e2w", S1E2W, {}},
- {"s1e3w", S1E3W, {}},
- {"s1e0r", S1E0R, {}},
- {"s1e0w", S1E0W, {}},
- {"s12e1r", S12E1R, {}},
- {"s12e1w", S12E1W, {}},
- {"s12e0r", S12E0R, {}},
- {"s12e0w", S12E0W, {}},
-};
-
-AArch64AT::ATMapper::ATMapper()
- : AArch64NamedImmMapper(ATMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = {
- {"oshld", OSHLD, {}},
- {"oshst", OSHST, {}},
- {"osh", OSH, {}},
- {"nshld", NSHLD, {}},
- {"nshst", NSHST, {}},
- {"nsh", NSH, {}},
- {"ishld", ISHLD, {}},
- {"ishst", ISHST, {}},
- {"ish", ISH, {}},
- {"ld", LD, {}},
- {"st", ST, {}},
- {"sy", SY, {}}
-};
-
-AArch64DB::DBarrierMapper::DBarrierMapper()
- : AArch64NamedImmMapper(DBarrierMappings, 16u) {}
-
-const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = {
- {"zva", ZVA, {}},
- {"ivac", IVAC, {}},
- {"isw", ISW, {}},
- {"cvac", CVAC, {}},
- {"csw", CSW, {}},
- {"cvau", CVAU, {}},
- {"civac", CIVAC, {}},
- {"cisw", CISW, {}}
-};
-
-AArch64DC::DCMapper::DCMapper()
- : AArch64NamedImmMapper(DCMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = {
- {"ialluis", IALLUIS, {}},
- {"iallu", IALLU, {}},
- {"ivau", IVAU, {}}
-};
-
-AArch64IC::ICMapper::ICMapper()
- : AArch64NamedImmMapper(ICMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = {
- {"sy", SY, {}},
-};
-
-AArch64ISB::ISBMapper::ISBMapper()
- : AArch64NamedImmMapper(ISBMappings, 16) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = {
- {"pldl1keep", PLDL1KEEP, {}},
- {"pldl1strm", PLDL1STRM, {}},
- {"pldl2keep", PLDL2KEEP, {}},
- {"pldl2strm", PLDL2STRM, {}},
- {"pldl3keep", PLDL3KEEP, {}},
- {"pldl3strm", PLDL3STRM, {}},
- {"plil1keep", PLIL1KEEP, {}},
- {"plil1strm", PLIL1STRM, {}},
- {"plil2keep", PLIL2KEEP, {}},
- {"plil2strm", PLIL2STRM, {}},
- {"plil3keep", PLIL3KEEP, {}},
- {"plil3strm", PLIL3STRM, {}},
- {"pstl1keep", PSTL1KEEP, {}},
- {"pstl1strm", PSTL1STRM, {}},
- {"pstl2keep", PSTL2KEEP, {}},
- {"pstl2strm", PSTL2STRM, {}},
- {"pstl3keep", PSTL3KEEP, {}},
- {"pstl3strm", PSTL3STRM, {}}
-};
-
-AArch64PRFM::PRFMMapper::PRFMMapper()
- : AArch64NamedImmMapper(PRFMMappings, 32) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = {
- {"spsel", SPSel, {}},
- {"daifset", DAIFSet, {}},
- {"daifclr", DAIFClr, {}},
-
- // v8.1a "Privileged Access Never" extension-specific PStates
- {"pan", PAN, {AArch64::HasV8_1aOps}},
-
- // v8.2a
- {"uao", UAO, {AArch64::HasV8_2aOps}},
-};
-
-AArch64PState::PStateMapper::PStateMapper()
- : AArch64NamedImmMapper(PStateMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = {
- // v8.2a "Statistical Profiling" extension-specific PSB operand
- {"csync", CSync, {AArch64::FeatureSPE}},
-};
-
-AArch64PSBHint::PSBHintMapper::PSBHintMapper()
- : AArch64NamedImmMapper(PSBHintMappings, 0) {}
-
-const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
- {"mdccsr_el0", MDCCSR_EL0, {}},
- {"dbgdtrrx_el0", DBGDTRRX_EL0, {}},
- {"mdrar_el1", MDRAR_EL1, {}},
- {"oslsr_el1", OSLSR_EL1, {}},
- {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}},
- {"pmceid0_el0", PMCEID0_EL0, {}},
- {"pmceid1_el0", PMCEID1_EL0, {}},
- {"midr_el1", MIDR_EL1, {}},
- {"ccsidr_el1", CCSIDR_EL1, {}},
- {"clidr_el1", CLIDR_EL1, {}},
- {"ctr_el0", CTR_EL0, {}},
- {"mpidr_el1", MPIDR_EL1, {}},
- {"revidr_el1", REVIDR_EL1, {}},
- {"aidr_el1", AIDR_EL1, {}},
- {"dczid_el0", DCZID_EL0, {}},
- {"id_pfr0_el1", ID_PFR0_EL1, {}},
- {"id_pfr1_el1", ID_PFR1_EL1, {}},
- {"id_dfr0_el1", ID_DFR0_EL1, {}},
- {"id_afr0_el1", ID_AFR0_EL1, {}},
- {"id_mmfr0_el1", ID_MMFR0_EL1, {}},
- {"id_mmfr1_el1", ID_MMFR1_EL1, {}},
- {"id_mmfr2_el1", ID_MMFR2_EL1, {}},
- {"id_mmfr3_el1", ID_MMFR3_EL1, {}},
- {"id_mmfr4_el1", ID_MMFR4_EL1, {}},
- {"id_isar0_el1", ID_ISAR0_EL1, {}},
- {"id_isar1_el1", ID_ISAR1_EL1, {}},
- {"id_isar2_el1", ID_ISAR2_EL1, {}},
- {"id_isar3_el1", ID_ISAR3_EL1, {}},
- {"id_isar4_el1", ID_ISAR4_EL1, {}},
- {"id_isar5_el1", ID_ISAR5_EL1, {}},
- {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}},
- {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}},
- {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}},
- {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}},
- {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}},
- {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}},
- {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}},
- {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}},
- {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}},
- {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}},
- {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}},
- {"mvfr0_el1", MVFR0_EL1, {}},
- {"mvfr1_el1", MVFR1_EL1, {}},
- {"mvfr2_el1", MVFR2_EL1, {}},
- {"rvbar_el1", RVBAR_EL1, {}},
- {"rvbar_el2", RVBAR_EL2, {}},
- {"rvbar_el3", RVBAR_EL3, {}},
- {"isr_el1", ISR_EL1, {}},
- {"cntpct_el0", CNTPCT_EL0, {}},
- {"cntvct_el0", CNTVCT_EL0, {}},
-
- // Trace registers
- {"trcstatr", TRCSTATR, {}},
- {"trcidr8", TRCIDR8, {}},
- {"trcidr9", TRCIDR9, {}},
- {"trcidr10", TRCIDR10, {}},
- {"trcidr11", TRCIDR11, {}},
- {"trcidr12", TRCIDR12, {}},
- {"trcidr13", TRCIDR13, {}},
- {"trcidr0", TRCIDR0, {}},
- {"trcidr1", TRCIDR1, {}},
- {"trcidr2", TRCIDR2, {}},
- {"trcidr3", TRCIDR3, {}},
- {"trcidr4", TRCIDR4, {}},
- {"trcidr5", TRCIDR5, {}},
- {"trcidr6", TRCIDR6, {}},
- {"trcidr7", TRCIDR7, {}},
- {"trcoslsr", TRCOSLSR, {}},
- {"trcpdsr", TRCPDSR, {}},
- {"trcdevaff0", TRCDEVAFF0, {}},
- {"trcdevaff1", TRCDEVAFF1, {}},
- {"trclsr", TRCLSR, {}},
- {"trcauthstatus", TRCAUTHSTATUS, {}},
- {"trcdevarch", TRCDEVARCH, {}},
- {"trcdevid", TRCDEVID, {}},
- {"trcdevtype", TRCDEVTYPE, {}},
- {"trcpidr4", TRCPIDR4, {}},
- {"trcpidr5", TRCPIDR5, {}},
- {"trcpidr6", TRCPIDR6, {}},
- {"trcpidr7", TRCPIDR7, {}},
- {"trcpidr0", TRCPIDR0, {}},
- {"trcpidr1", TRCPIDR1, {}},
- {"trcpidr2", TRCPIDR2, {}},
- {"trcpidr3", TRCPIDR3, {}},
- {"trccidr0", TRCCIDR0, {}},
- {"trccidr1", TRCCIDR1, {}},
- {"trccidr2", TRCCIDR2, {}},
- {"trccidr3", TRCCIDR3, {}},
-
- // GICv3 registers
- {"icc_iar1_el1", ICC_IAR1_EL1, {}},
- {"icc_iar0_el1", ICC_IAR0_EL1, {}},
- {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}},
- {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}},
- {"icc_rpr_el1", ICC_RPR_EL1, {}},
- {"ich_vtr_el2", ICH_VTR_EL2, {}},
- {"ich_eisr_el2", ICH_EISR_EL2, {}},
- {"ich_elsr_el2", ICH_ELSR_EL2, {}},
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}},
-};
-
-AArch64SysReg::MRSMapper::MRSMapper() {
- InstMappings = &MRSMappings[0];
- NumInstMappings = llvm::array_lengthof(MRSMappings);
+namespace llvm {
+ namespace AArch64IC {
+#define GET_IC_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
-const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = {
- {"dbgdtrtx_el0", DBGDTRTX_EL0, {}},
- {"oslar_el1", OSLAR_EL1, {}},
- {"pmswinc_el0", PMSWINC_EL0, {}},
-
- // Trace registers
- {"trcoslar", TRCOSLAR, {}},
- {"trclar", TRCLAR, {}},
-
- // GICv3 registers
- {"icc_eoir1_el1", ICC_EOIR1_EL1, {}},
- {"icc_eoir0_el1", ICC_EOIR0_EL1, {}},
- {"icc_dir_el1", ICC_DIR_EL1, {}},
- {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}},
- {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}},
- {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}},
-};
-
-AArch64SysReg::MSRMapper::MSRMapper() {
- InstMappings = &MSRMappings[0];
- NumInstMappings = llvm::array_lengthof(MSRMappings);
+namespace llvm {
+ namespace AArch64ISB {
+#define GET_ISB_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+namespace llvm {
+ namespace AArch64PRFM {
+#define GET_PRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
}
+namespace llvm {
+ namespace AArch64PState {
+#define GET_PSTATE_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
-const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = {
- {"osdtrrx_el1", OSDTRRX_EL1, {}},
- {"osdtrtx_el1", OSDTRTX_EL1, {}},
- {"teecr32_el1", TEECR32_EL1, {}},
- {"mdccint_el1", MDCCINT_EL1, {}},
- {"mdscr_el1", MDSCR_EL1, {}},
- {"dbgdtr_el0", DBGDTR_EL0, {}},
- {"oseccr_el1", OSECCR_EL1, {}},
- {"dbgvcr32_el2", DBGVCR32_EL2, {}},
- {"dbgbvr0_el1", DBGBVR0_EL1, {}},
- {"dbgbvr1_el1", DBGBVR1_EL1, {}},
- {"dbgbvr2_el1", DBGBVR2_EL1, {}},
- {"dbgbvr3_el1", DBGBVR3_EL1, {}},
- {"dbgbvr4_el1", DBGBVR4_EL1, {}},
- {"dbgbvr5_el1", DBGBVR5_EL1, {}},
- {"dbgbvr6_el1", DBGBVR6_EL1, {}},
- {"dbgbvr7_el1", DBGBVR7_EL1, {}},
- {"dbgbvr8_el1", DBGBVR8_EL1, {}},
- {"dbgbvr9_el1", DBGBVR9_EL1, {}},
- {"dbgbvr10_el1", DBGBVR10_EL1, {}},
- {"dbgbvr11_el1", DBGBVR11_EL1, {}},
- {"dbgbvr12_el1", DBGBVR12_EL1, {}},
- {"dbgbvr13_el1", DBGBVR13_EL1, {}},
- {"dbgbvr14_el1", DBGBVR14_EL1, {}},
- {"dbgbvr15_el1", DBGBVR15_EL1, {}},
- {"dbgbcr0_el1", DBGBCR0_EL1, {}},
- {"dbgbcr1_el1", DBGBCR1_EL1, {}},
- {"dbgbcr2_el1", DBGBCR2_EL1, {}},
- {"dbgbcr3_el1", DBGBCR3_EL1, {}},
- {"dbgbcr4_el1", DBGBCR4_EL1, {}},
- {"dbgbcr5_el1", DBGBCR5_EL1, {}},
- {"dbgbcr6_el1", DBGBCR6_EL1, {}},
- {"dbgbcr7_el1", DBGBCR7_EL1, {}},
- {"dbgbcr8_el1", DBGBCR8_EL1, {}},
- {"dbgbcr9_el1", DBGBCR9_EL1, {}},
- {"dbgbcr10_el1", DBGBCR10_EL1, {}},
- {"dbgbcr11_el1", DBGBCR11_EL1, {}},
- {"dbgbcr12_el1", DBGBCR12_EL1, {}},
- {"dbgbcr13_el1", DBGBCR13_EL1, {}},
- {"dbgbcr14_el1", DBGBCR14_EL1, {}},
- {"dbgbcr15_el1", DBGBCR15_EL1, {}},
- {"dbgwvr0_el1", DBGWVR0_EL1, {}},
- {"dbgwvr1_el1", DBGWVR1_EL1, {}},
- {"dbgwvr2_el1", DBGWVR2_EL1, {}},
- {"dbgwvr3_el1", DBGWVR3_EL1, {}},
- {"dbgwvr4_el1", DBGWVR4_EL1, {}},
- {"dbgwvr5_el1", DBGWVR5_EL1, {}},
- {"dbgwvr6_el1", DBGWVR6_EL1, {}},
- {"dbgwvr7_el1", DBGWVR7_EL1, {}},
- {"dbgwvr8_el1", DBGWVR8_EL1, {}},
- {"dbgwvr9_el1", DBGWVR9_EL1, {}},
- {"dbgwvr10_el1", DBGWVR10_EL1, {}},
- {"dbgwvr11_el1", DBGWVR11_EL1, {}},
- {"dbgwvr12_el1", DBGWVR12_EL1, {}},
- {"dbgwvr13_el1", DBGWVR13_EL1, {}},
- {"dbgwvr14_el1", DBGWVR14_EL1, {}},
- {"dbgwvr15_el1", DBGWVR15_EL1, {}},
- {"dbgwcr0_el1", DBGWCR0_EL1, {}},
- {"dbgwcr1_el1", DBGWCR1_EL1, {}},
- {"dbgwcr2_el1", DBGWCR2_EL1, {}},
- {"dbgwcr3_el1", DBGWCR3_EL1, {}},
- {"dbgwcr4_el1", DBGWCR4_EL1, {}},
- {"dbgwcr5_el1", DBGWCR5_EL1, {}},
- {"dbgwcr6_el1", DBGWCR6_EL1, {}},
- {"dbgwcr7_el1", DBGWCR7_EL1, {}},
- {"dbgwcr8_el1", DBGWCR8_EL1, {}},
- {"dbgwcr9_el1", DBGWCR9_EL1, {}},
- {"dbgwcr10_el1", DBGWCR10_EL1, {}},
- {"dbgwcr11_el1", DBGWCR11_EL1, {}},
- {"dbgwcr12_el1", DBGWCR12_EL1, {}},
- {"dbgwcr13_el1", DBGWCR13_EL1, {}},
- {"dbgwcr14_el1", DBGWCR14_EL1, {}},
- {"dbgwcr15_el1", DBGWCR15_EL1, {}},
- {"teehbr32_el1", TEEHBR32_EL1, {}},
- {"osdlr_el1", OSDLR_EL1, {}},
- {"dbgprcr_el1", DBGPRCR_EL1, {}},
- {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}},
- {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}},
- {"csselr_el1", CSSELR_EL1, {}},
- {"vpidr_el2", VPIDR_EL2, {}},
- {"vmpidr_el2", VMPIDR_EL2, {}},
- {"sctlr_el1", SCTLR_EL1, {}},
- {"sctlr_el2", SCTLR_EL2, {}},
- {"sctlr_el3", SCTLR_EL3, {}},
- {"actlr_el1", ACTLR_EL1, {}},
- {"actlr_el2", ACTLR_EL2, {}},
- {"actlr_el3", ACTLR_EL3, {}},
- {"cpacr_el1", CPACR_EL1, {}},
- {"hcr_el2", HCR_EL2, {}},
- {"scr_el3", SCR_EL3, {}},
- {"mdcr_el2", MDCR_EL2, {}},
- {"sder32_el3", SDER32_EL3, {}},
- {"cptr_el2", CPTR_EL2, {}},
- {"cptr_el3", CPTR_EL3, {}},
- {"hstr_el2", HSTR_EL2, {}},
- {"hacr_el2", HACR_EL2, {}},
- {"mdcr_el3", MDCR_EL3, {}},
- {"ttbr0_el1", TTBR0_EL1, {}},
- {"ttbr0_el2", TTBR0_EL2, {}},
- {"ttbr0_el3", TTBR0_EL3, {}},
- {"ttbr1_el1", TTBR1_EL1, {}},
- {"tcr_el1", TCR_EL1, {}},
- {"tcr_el2", TCR_EL2, {}},
- {"tcr_el3", TCR_EL3, {}},
- {"vttbr_el2", VTTBR_EL2, {}},
- {"vtcr_el2", VTCR_EL2, {}},
- {"dacr32_el2", DACR32_EL2, {}},
- {"spsr_el1", SPSR_EL1, {}},
- {"spsr_el2", SPSR_EL2, {}},
- {"spsr_el3", SPSR_EL3, {}},
- {"elr_el1", ELR_EL1, {}},
- {"elr_el2", ELR_EL2, {}},
- {"elr_el3", ELR_EL3, {}},
- {"sp_el0", SP_EL0, {}},
- {"sp_el1", SP_EL1, {}},
- {"sp_el2", SP_EL2, {}},
- {"spsel", SPSel, {}},
- {"nzcv", NZCV, {}},
- {"daif", DAIF, {}},
- {"currentel", CurrentEL, {}},
- {"spsr_irq", SPSR_irq, {}},
- {"spsr_abt", SPSR_abt, {}},
- {"spsr_und", SPSR_und, {}},
- {"spsr_fiq", SPSR_fiq, {}},
- {"fpcr", FPCR, {}},
- {"fpsr", FPSR, {}},
- {"dspsr_el0", DSPSR_EL0, {}},
- {"dlr_el0", DLR_EL0, {}},
- {"ifsr32_el2", IFSR32_EL2, {}},
- {"afsr0_el1", AFSR0_EL1, {}},
- {"afsr0_el2", AFSR0_EL2, {}},
- {"afsr0_el3", AFSR0_EL3, {}},
- {"afsr1_el1", AFSR1_EL1, {}},
- {"afsr1_el2", AFSR1_EL2, {}},
- {"afsr1_el3", AFSR1_EL3, {}},
- {"esr_el1", ESR_EL1, {}},
- {"esr_el2", ESR_EL2, {}},
- {"esr_el3", ESR_EL3, {}},
- {"fpexc32_el2", FPEXC32_EL2, {}},
- {"far_el1", FAR_EL1, {}},
- {"far_el2", FAR_EL2, {}},
- {"far_el3", FAR_EL3, {}},
- {"hpfar_el2", HPFAR_EL2, {}},
- {"par_el1", PAR_EL1, {}},
- {"pmcr_el0", PMCR_EL0, {}},
- {"pmcntenset_el0", PMCNTENSET_EL0, {}},
- {"pmcntenclr_el0", PMCNTENCLR_EL0, {}},
- {"pmovsclr_el0", PMOVSCLR_EL0, {}},
- {"pmselr_el0", PMSELR_EL0, {}},
- {"pmccntr_el0", PMCCNTR_EL0, {}},
- {"pmxevtyper_el0", PMXEVTYPER_EL0, {}},
- {"pmxevcntr_el0", PMXEVCNTR_EL0, {}},
- {"pmuserenr_el0", PMUSERENR_EL0, {}},
- {"pmintenset_el1", PMINTENSET_EL1, {}},
- {"pmintenclr_el1", PMINTENCLR_EL1, {}},
- {"pmovsset_el0", PMOVSSET_EL0, {}},
- {"mair_el1", MAIR_EL1, {}},
- {"mair_el2", MAIR_EL2, {}},
- {"mair_el3", MAIR_EL3, {}},
- {"amair_el1", AMAIR_EL1, {}},
- {"amair_el2", AMAIR_EL2, {}},
- {"amair_el3", AMAIR_EL3, {}},
- {"vbar_el1", VBAR_EL1, {}},
- {"vbar_el2", VBAR_EL2, {}},
- {"vbar_el3", VBAR_EL3, {}},
- {"rmr_el1", RMR_EL1, {}},
- {"rmr_el2", RMR_EL2, {}},
- {"rmr_el3", RMR_EL3, {}},
- {"contextidr_el1", CONTEXTIDR_EL1, {}},
- {"tpidr_el0", TPIDR_EL0, {}},
- {"tpidr_el2", TPIDR_EL2, {}},
- {"tpidr_el3", TPIDR_EL3, {}},
- {"tpidrro_el0", TPIDRRO_EL0, {}},
- {"tpidr_el1", TPIDR_EL1, {}},
- {"cntfrq_el0", CNTFRQ_EL0, {}},
- {"cntvoff_el2", CNTVOFF_EL2, {}},
- {"cntkctl_el1", CNTKCTL_EL1, {}},
- {"cnthctl_el2", CNTHCTL_EL2, {}},
- {"cntp_tval_el0", CNTP_TVAL_EL0, {}},
- {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}},
- {"cntps_tval_el1", CNTPS_TVAL_EL1, {}},
- {"cntp_ctl_el0", CNTP_CTL_EL0, {}},
- {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}},
- {"cntps_ctl_el1", CNTPS_CTL_EL1, {}},
- {"cntp_cval_el0", CNTP_CVAL_EL0, {}},
- {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}},
- {"cntps_cval_el1", CNTPS_CVAL_EL1, {}},
- {"cntv_tval_el0", CNTV_TVAL_EL0, {}},
- {"cntv_ctl_el0", CNTV_CTL_EL0, {}},
- {"cntv_cval_el0", CNTV_CVAL_EL0, {}},
- {"pmevcntr0_el0", PMEVCNTR0_EL0, {}},
- {"pmevcntr1_el0", PMEVCNTR1_EL0, {}},
- {"pmevcntr2_el0", PMEVCNTR2_EL0, {}},
- {"pmevcntr3_el0", PMEVCNTR3_EL0, {}},
- {"pmevcntr4_el0", PMEVCNTR4_EL0, {}},
- {"pmevcntr5_el0", PMEVCNTR5_EL0, {}},
- {"pmevcntr6_el0", PMEVCNTR6_EL0, {}},
- {"pmevcntr7_el0", PMEVCNTR7_EL0, {}},
- {"pmevcntr8_el0", PMEVCNTR8_EL0, {}},
- {"pmevcntr9_el0", PMEVCNTR9_EL0, {}},
- {"pmevcntr10_el0", PMEVCNTR10_EL0, {}},
- {"pmevcntr11_el0", PMEVCNTR11_EL0, {}},
- {"pmevcntr12_el0", PMEVCNTR12_EL0, {}},
- {"pmevcntr13_el0", PMEVCNTR13_EL0, {}},
- {"pmevcntr14_el0", PMEVCNTR14_EL0, {}},
- {"pmevcntr15_el0", PMEVCNTR15_EL0, {}},
- {"pmevcntr16_el0", PMEVCNTR16_EL0, {}},
- {"pmevcntr17_el0", PMEVCNTR17_EL0, {}},
- {"pmevcntr18_el0", PMEVCNTR18_EL0, {}},
- {"pmevcntr19_el0", PMEVCNTR19_EL0, {}},
- {"pmevcntr20_el0", PMEVCNTR20_EL0, {}},
- {"pmevcntr21_el0", PMEVCNTR21_EL0, {}},
- {"pmevcntr22_el0", PMEVCNTR22_EL0, {}},
- {"pmevcntr23_el0", PMEVCNTR23_EL0, {}},
- {"pmevcntr24_el0", PMEVCNTR24_EL0, {}},
- {"pmevcntr25_el0", PMEVCNTR25_EL0, {}},
- {"pmevcntr26_el0", PMEVCNTR26_EL0, {}},
- {"pmevcntr27_el0", PMEVCNTR27_EL0, {}},
- {"pmevcntr28_el0", PMEVCNTR28_EL0, {}},
- {"pmevcntr29_el0", PMEVCNTR29_EL0, {}},
- {"pmevcntr30_el0", PMEVCNTR30_EL0, {}},
- {"pmccfiltr_el0", PMCCFILTR_EL0, {}},
- {"pmevtyper0_el0", PMEVTYPER0_EL0, {}},
- {"pmevtyper1_el0", PMEVTYPER1_EL0, {}},
- {"pmevtyper2_el0", PMEVTYPER2_EL0, {}},
- {"pmevtyper3_el0", PMEVTYPER3_EL0, {}},
- {"pmevtyper4_el0", PMEVTYPER4_EL0, {}},
- {"pmevtyper5_el0", PMEVTYPER5_EL0, {}},
- {"pmevtyper6_el0", PMEVTYPER6_EL0, {}},
- {"pmevtyper7_el0", PMEVTYPER7_EL0, {}},
- {"pmevtyper8_el0", PMEVTYPER8_EL0, {}},
- {"pmevtyper9_el0", PMEVTYPER9_EL0, {}},
- {"pmevtyper10_el0", PMEVTYPER10_EL0, {}},
- {"pmevtyper11_el0", PMEVTYPER11_EL0, {}},
- {"pmevtyper12_el0", PMEVTYPER12_EL0, {}},
- {"pmevtyper13_el0", PMEVTYPER13_EL0, {}},
- {"pmevtyper14_el0", PMEVTYPER14_EL0, {}},
- {"pmevtyper15_el0", PMEVTYPER15_EL0, {}},
- {"pmevtyper16_el0", PMEVTYPER16_EL0, {}},
- {"pmevtyper17_el0", PMEVTYPER17_EL0, {}},
- {"pmevtyper18_el0", PMEVTYPER18_EL0, {}},
- {"pmevtyper19_el0", PMEVTYPER19_EL0, {}},
- {"pmevtyper20_el0", PMEVTYPER20_EL0, {}},
- {"pmevtyper21_el0", PMEVTYPER21_EL0, {}},
- {"pmevtyper22_el0", PMEVTYPER22_EL0, {}},
- {"pmevtyper23_el0", PMEVTYPER23_EL0, {}},
- {"pmevtyper24_el0", PMEVTYPER24_EL0, {}},
- {"pmevtyper25_el0", PMEVTYPER25_EL0, {}},
- {"pmevtyper26_el0", PMEVTYPER26_EL0, {}},
- {"pmevtyper27_el0", PMEVTYPER27_EL0, {}},
- {"pmevtyper28_el0", PMEVTYPER28_EL0, {}},
- {"pmevtyper29_el0", PMEVTYPER29_EL0, {}},
- {"pmevtyper30_el0", PMEVTYPER30_EL0, {}},
-
- // Trace registers
- {"trcprgctlr", TRCPRGCTLR, {}},
- {"trcprocselr", TRCPROCSELR, {}},
- {"trcconfigr", TRCCONFIGR, {}},
- {"trcauxctlr", TRCAUXCTLR, {}},
- {"trceventctl0r", TRCEVENTCTL0R, {}},
- {"trceventctl1r", TRCEVENTCTL1R, {}},
- {"trcstallctlr", TRCSTALLCTLR, {}},
- {"trctsctlr", TRCTSCTLR, {}},
- {"trcsyncpr", TRCSYNCPR, {}},
- {"trcccctlr", TRCCCCTLR, {}},
- {"trcbbctlr", TRCBBCTLR, {}},
- {"trctraceidr", TRCTRACEIDR, {}},
- {"trcqctlr", TRCQCTLR, {}},
- {"trcvictlr", TRCVICTLR, {}},
- {"trcviiectlr", TRCVIIECTLR, {}},
- {"trcvissctlr", TRCVISSCTLR, {}},
- {"trcvipcssctlr", TRCVIPCSSCTLR, {}},
- {"trcvdctlr", TRCVDCTLR, {}},
- {"trcvdsacctlr", TRCVDSACCTLR, {}},
- {"trcvdarcctlr", TRCVDARCCTLR, {}},
- {"trcseqevr0", TRCSEQEVR0, {}},
- {"trcseqevr1", TRCSEQEVR1, {}},
- {"trcseqevr2", TRCSEQEVR2, {}},
- {"trcseqrstevr", TRCSEQRSTEVR, {}},
- {"trcseqstr", TRCSEQSTR, {}},
- {"trcextinselr", TRCEXTINSELR, {}},
- {"trccntrldvr0", TRCCNTRLDVR0, {}},
- {"trccntrldvr1", TRCCNTRLDVR1, {}},
- {"trccntrldvr2", TRCCNTRLDVR2, {}},
- {"trccntrldvr3", TRCCNTRLDVR3, {}},
- {"trccntctlr0", TRCCNTCTLR0, {}},
- {"trccntctlr1", TRCCNTCTLR1, {}},
- {"trccntctlr2", TRCCNTCTLR2, {}},
- {"trccntctlr3", TRCCNTCTLR3, {}},
- {"trccntvr0", TRCCNTVR0, {}},
- {"trccntvr1", TRCCNTVR1, {}},
- {"trccntvr2", TRCCNTVR2, {}},
- {"trccntvr3", TRCCNTVR3, {}},
- {"trcimspec0", TRCIMSPEC0, {}},
- {"trcimspec1", TRCIMSPEC1, {}},
- {"trcimspec2", TRCIMSPEC2, {}},
- {"trcimspec3", TRCIMSPEC3, {}},
- {"trcimspec4", TRCIMSPEC4, {}},
- {"trcimspec5", TRCIMSPEC5, {}},
- {"trcimspec6", TRCIMSPEC6, {}},
- {"trcimspec7", TRCIMSPEC7, {}},
- {"trcrsctlr2", TRCRSCTLR2, {}},
- {"trcrsctlr3", TRCRSCTLR3, {}},
- {"trcrsctlr4", TRCRSCTLR4, {}},
- {"trcrsctlr5", TRCRSCTLR5, {}},
- {"trcrsctlr6", TRCRSCTLR6, {}},
- {"trcrsctlr7", TRCRSCTLR7, {}},
- {"trcrsctlr8", TRCRSCTLR8, {}},
- {"trcrsctlr9", TRCRSCTLR9, {}},
- {"trcrsctlr10", TRCRSCTLR10, {}},
- {"trcrsctlr11", TRCRSCTLR11, {}},
- {"trcrsctlr12", TRCRSCTLR12, {}},
- {"trcrsctlr13", TRCRSCTLR13, {}},
- {"trcrsctlr14", TRCRSCTLR14, {}},
- {"trcrsctlr15", TRCRSCTLR15, {}},
- {"trcrsctlr16", TRCRSCTLR16, {}},
- {"trcrsctlr17", TRCRSCTLR17, {}},
- {"trcrsctlr18", TRCRSCTLR18, {}},
- {"trcrsctlr19", TRCRSCTLR19, {}},
- {"trcrsctlr20", TRCRSCTLR20, {}},
- {"trcrsctlr21", TRCRSCTLR21, {}},
- {"trcrsctlr22", TRCRSCTLR22, {}},
- {"trcrsctlr23", TRCRSCTLR23, {}},
- {"trcrsctlr24", TRCRSCTLR24, {}},
- {"trcrsctlr25", TRCRSCTLR25, {}},
- {"trcrsctlr26", TRCRSCTLR26, {}},
- {"trcrsctlr27", TRCRSCTLR27, {}},
- {"trcrsctlr28", TRCRSCTLR28, {}},
- {"trcrsctlr29", TRCRSCTLR29, {}},
- {"trcrsctlr30", TRCRSCTLR30, {}},
- {"trcrsctlr31", TRCRSCTLR31, {}},
- {"trcssccr0", TRCSSCCR0, {}},
- {"trcssccr1", TRCSSCCR1, {}},
- {"trcssccr2", TRCSSCCR2, {}},
- {"trcssccr3", TRCSSCCR3, {}},
- {"trcssccr4", TRCSSCCR4, {}},
- {"trcssccr5", TRCSSCCR5, {}},
- {"trcssccr6", TRCSSCCR6, {}},
- {"trcssccr7", TRCSSCCR7, {}},
- {"trcsscsr0", TRCSSCSR0, {}},
- {"trcsscsr1", TRCSSCSR1, {}},
- {"trcsscsr2", TRCSSCSR2, {}},
- {"trcsscsr3", TRCSSCSR3, {}},
- {"trcsscsr4", TRCSSCSR4, {}},
- {"trcsscsr5", TRCSSCSR5, {}},
- {"trcsscsr6", TRCSSCSR6, {}},
- {"trcsscsr7", TRCSSCSR7, {}},
- {"trcsspcicr0", TRCSSPCICR0, {}},
- {"trcsspcicr1", TRCSSPCICR1, {}},
- {"trcsspcicr2", TRCSSPCICR2, {}},
- {"trcsspcicr3", TRCSSPCICR3, {}},
- {"trcsspcicr4", TRCSSPCICR4, {}},
- {"trcsspcicr5", TRCSSPCICR5, {}},
- {"trcsspcicr6", TRCSSPCICR6, {}},
- {"trcsspcicr7", TRCSSPCICR7, {}},
- {"trcpdcr", TRCPDCR, {}},
- {"trcacvr0", TRCACVR0, {}},
- {"trcacvr1", TRCACVR1, {}},
- {"trcacvr2", TRCACVR2, {}},
- {"trcacvr3", TRCACVR3, {}},
- {"trcacvr4", TRCACVR4, {}},
- {"trcacvr5", TRCACVR5, {}},
- {"trcacvr6", TRCACVR6, {}},
- {"trcacvr7", TRCACVR7, {}},
- {"trcacvr8", TRCACVR8, {}},
- {"trcacvr9", TRCACVR9, {}},
- {"trcacvr10", TRCACVR10, {}},
- {"trcacvr11", TRCACVR11, {}},
- {"trcacvr12", TRCACVR12, {}},
- {"trcacvr13", TRCACVR13, {}},
- {"trcacvr14", TRCACVR14, {}},
- {"trcacvr15", TRCACVR15, {}},
- {"trcacatr0", TRCACATR0, {}},
- {"trcacatr1", TRCACATR1, {}},
- {"trcacatr2", TRCACATR2, {}},
- {"trcacatr3", TRCACATR3, {}},
- {"trcacatr4", TRCACATR4, {}},
- {"trcacatr5", TRCACATR5, {}},
- {"trcacatr6", TRCACATR6, {}},
- {"trcacatr7", TRCACATR7, {}},
- {"trcacatr8", TRCACATR8, {}},
- {"trcacatr9", TRCACATR9, {}},
- {"trcacatr10", TRCACATR10, {}},
- {"trcacatr11", TRCACATR11, {}},
- {"trcacatr12", TRCACATR12, {}},
- {"trcacatr13", TRCACATR13, {}},
- {"trcacatr14", TRCACATR14, {}},
- {"trcacatr15", TRCACATR15, {}},
- {"trcdvcvr0", TRCDVCVR0, {}},
- {"trcdvcvr1", TRCDVCVR1, {}},
- {"trcdvcvr2", TRCDVCVR2, {}},
- {"trcdvcvr3", TRCDVCVR3, {}},
- {"trcdvcvr4", TRCDVCVR4, {}},
- {"trcdvcvr5", TRCDVCVR5, {}},
- {"trcdvcvr6", TRCDVCVR6, {}},
- {"trcdvcvr7", TRCDVCVR7, {}},
- {"trcdvcmr0", TRCDVCMR0, {}},
- {"trcdvcmr1", TRCDVCMR1, {}},
- {"trcdvcmr2", TRCDVCMR2, {}},
- {"trcdvcmr3", TRCDVCMR3, {}},
- {"trcdvcmr4", TRCDVCMR4, {}},
- {"trcdvcmr5", TRCDVCMR5, {}},
- {"trcdvcmr6", TRCDVCMR6, {}},
- {"trcdvcmr7", TRCDVCMR7, {}},
- {"trccidcvr0", TRCCIDCVR0, {}},
- {"trccidcvr1", TRCCIDCVR1, {}},
- {"trccidcvr2", TRCCIDCVR2, {}},
- {"trccidcvr3", TRCCIDCVR3, {}},
- {"trccidcvr4", TRCCIDCVR4, {}},
- {"trccidcvr5", TRCCIDCVR5, {}},
- {"trccidcvr6", TRCCIDCVR6, {}},
- {"trccidcvr7", TRCCIDCVR7, {}},
- {"trcvmidcvr0", TRCVMIDCVR0, {}},
- {"trcvmidcvr1", TRCVMIDCVR1, {}},
- {"trcvmidcvr2", TRCVMIDCVR2, {}},
- {"trcvmidcvr3", TRCVMIDCVR3, {}},
- {"trcvmidcvr4", TRCVMIDCVR4, {}},
- {"trcvmidcvr5", TRCVMIDCVR5, {}},
- {"trcvmidcvr6", TRCVMIDCVR6, {}},
- {"trcvmidcvr7", TRCVMIDCVR7, {}},
- {"trccidcctlr0", TRCCIDCCTLR0, {}},
- {"trccidcctlr1", TRCCIDCCTLR1, {}},
- {"trcvmidcctlr0", TRCVMIDCCTLR0, {}},
- {"trcvmidcctlr1", TRCVMIDCCTLR1, {}},
- {"trcitctrl", TRCITCTRL, {}},
- {"trcclaimset", TRCCLAIMSET, {}},
- {"trcclaimclr", TRCCLAIMCLR, {}},
-
- // GICv3 registers
- {"icc_bpr1_el1", ICC_BPR1_EL1, {}},
- {"icc_bpr0_el1", ICC_BPR0_EL1, {}},
- {"icc_pmr_el1", ICC_PMR_EL1, {}},
- {"icc_ctlr_el1", ICC_CTLR_EL1, {}},
- {"icc_ctlr_el3", ICC_CTLR_EL3, {}},
- {"icc_sre_el1", ICC_SRE_EL1, {}},
- {"icc_sre_el2", ICC_SRE_EL2, {}},
- {"icc_sre_el3", ICC_SRE_EL3, {}},
- {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}},
- {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}},
- {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}},
- {"icc_seien_el1", ICC_SEIEN_EL1, {}},
- {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}},
- {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}},
- {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}},
- {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}},
- {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}},
- {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}},
- {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}},
- {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}},
- {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}},
- {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}},
- {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}},
- {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}},
- {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}},
- {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}},
- {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}},
- {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}},
- {"ich_hcr_el2", ICH_HCR_EL2, {}},
- {"ich_misr_el2", ICH_MISR_EL2, {}},
- {"ich_vmcr_el2", ICH_VMCR_EL2, {}},
- {"ich_vseir_el2", ICH_VSEIR_EL2, {}},
- {"ich_lr0_el2", ICH_LR0_EL2, {}},
- {"ich_lr1_el2", ICH_LR1_EL2, {}},
- {"ich_lr2_el2", ICH_LR2_EL2, {}},
- {"ich_lr3_el2", ICH_LR3_EL2, {}},
- {"ich_lr4_el2", ICH_LR4_EL2, {}},
- {"ich_lr5_el2", ICH_LR5_EL2, {}},
- {"ich_lr6_el2", ICH_LR6_EL2, {}},
- {"ich_lr7_el2", ICH_LR7_EL2, {}},
- {"ich_lr8_el2", ICH_LR8_EL2, {}},
- {"ich_lr9_el2", ICH_LR9_EL2, {}},
- {"ich_lr10_el2", ICH_LR10_EL2, {}},
- {"ich_lr11_el2", ICH_LR11_EL2, {}},
- {"ich_lr12_el2", ICH_LR12_EL2, {}},
- {"ich_lr13_el2", ICH_LR13_EL2, {}},
- {"ich_lr14_el2", ICH_LR14_EL2, {}},
- {"ich_lr15_el2", ICH_LR15_EL2, {}},
-
- // Cyclone registers
- {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}},
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- {"pan", PAN, {AArch64::HasV8_1aOps}},
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}},
- {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}},
- {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}},
- {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}},
-
- // v8.1a "Virtualization host extensions" system registers
- {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}},
- {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}},
- {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}},
- {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}},
- {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}},
- {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}},
- {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}},
- {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}},
- {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}},
- {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}},
- {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}},
- {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}},
- {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}},
- {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}},
- {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}},
- {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}},
- {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}},
- {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}},
- {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}},
- {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}},
- {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}},
- {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}},
-
- // v8.2a registers
- {"uao", UAO, {AArch64::HasV8_2aOps}},
-
- // v8.2a "Statistical Profiling extension" registers
- {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}},
- {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}},
- {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}},
- {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}},
- {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}},
- {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}},
- {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}},
- {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}},
- {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}},
- {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}},
- {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}},
- {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}},
- {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}},
-};
-
-uint32_t
-AArch64SysReg::SysRegMapper::fromString(StringRef Name,
- const FeatureBitset& FeatureBits, bool &Valid) const {
- std::string NameLower = Name.lower();
-
- // First search the registers shared by all
- for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
- if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) {
- Valid = true;
- return SysRegMappings[i].Value;
- }
+namespace llvm {
+ namespace AArch64PSBHint {
+#define GET_PSB_IMPL
+#include "AArch64GenSystemOperands.inc"
}
+}
- // Now try the instruction-specific registers (either read-only or
- // write-only).
- for (unsigned i = 0; i < NumInstMappings; ++i) {
- if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) {
- Valid = true;
- return InstMappings[i].Value;
- }
+namespace llvm {
+ namespace AArch64SysReg {
+#define GET_SYSREG_IMPL
+#include "AArch64GenSystemOperands.inc"
}
+}
+uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
// Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
- Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$");
+ Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+ std::string UpperName = Name.upper();
SmallVector<StringRef, 5> Ops;
- if (!GenericRegPattern.match(NameLower, &Ops)) {
- Valid = false;
+ if (!GenericRegPattern.match(UpperName, &Ops))
return -1;
- }
uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
uint32_t Bits;
@@ -873,28 +99,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name,
Ops[5].getAsInteger(10, Op2);
Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
- Valid = true;
return Bits;
}
-std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
- const FeatureBitset& FeatureBits) const {
- // First search the registers shared by all
- for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
- if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) {
- return SysRegMappings[i].Name;
- }
- }
-
- // Now try the instruction-specific registers (either read-only or
- // write-only).
- for (unsigned i = 0; i < NumInstMappings; ++i) {
- if (InstMappings[i].isValueEqual(Bits, FeatureBits)) {
- return InstMappings[i].Name;
- }
- }
-
+std::string AArch64SysReg::genericRegisterString(uint32_t Bits) {
assert(Bits < 0x10000);
uint32_t Op0 = (Bits >> 14) & 0x3;
uint32_t Op1 = (Bits >> 11) & 0x7;
@@ -902,44 +110,13 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
uint32_t CRm = (Bits >> 3) & 0xf;
uint32_t Op2 = Bits & 0x7;
- return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn)
- + "_c" + utostr(CRm) + "_" + utostr(Op2);
+ return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" +
+ utostr(CRm) + "_" + utostr(Op2);
}
-const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = {
- {"ipas2e1is", IPAS2E1IS, {}},
- {"ipas2le1is", IPAS2LE1IS, {}},
- {"vmalle1is", VMALLE1IS, {}},
- {"alle2is", ALLE2IS, {}},
- {"alle3is", ALLE3IS, {}},
- {"vae1is", VAE1IS, {}},
- {"vae2is", VAE2IS, {}},
- {"vae3is", VAE3IS, {}},
- {"aside1is", ASIDE1IS, {}},
- {"vaae1is", VAAE1IS, {}},
- {"alle1is", ALLE1IS, {}},
- {"vale1is", VALE1IS, {}},
- {"vale2is", VALE2IS, {}},
- {"vale3is", VALE3IS, {}},
- {"vmalls12e1is", VMALLS12E1IS, {}},
- {"vaale1is", VAALE1IS, {}},
- {"ipas2e1", IPAS2E1, {}},
- {"ipas2le1", IPAS2LE1, {}},
- {"vmalle1", VMALLE1, {}},
- {"alle2", ALLE2, {}},
- {"alle3", ALLE3, {}},
- {"vae1", VAE1, {}},
- {"vae2", VAE2, {}},
- {"vae3", VAE3, {}},
- {"aside1", ASIDE1, {}},
- {"vaae1", VAAE1, {}},
- {"alle1", ALLE1, {}},
- {"vale1", VALE1, {}},
- {"vale2", VALE2, {}},
- {"vale3", VALE3, {}},
- {"vmalls12e1", VMALLS12E1, {}},
- {"vaale1", VAALE1, {}}
-};
-
-AArch64TLBI::TLBIMapper::TLBIMapper()
- : AArch64NamedImmMapper(TLBIMappings, 0) {}
+namespace llvm {
+ namespace AArch64TLBI {
+#define GET_TLBI_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
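
A quick illustration of the generic-register path that replaces the SysRegMapper code deleted above: parseGenericRegister() packs the five S<op0>_<op1>_C<n>_C<m>_<op2> fields as (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2 and returns -1 when the name does not match, while genericRegisterString() reverses the packing and now prints the upper-case spelling. The sketch below is illustrative only; it assumes an in-tree caller that can include the target-private AArch64BaseInfo.h and is not part of this commit.

// Round-trip sketch for the generic system-register syntax (assumes the
// target-private header is reachable; not part of this patch).
#include "AArch64BaseInfo.h"
#include <cassert>
#include <cstdint>
#include <string>

void genericRegisterRoundTrip() {
  using namespace llvm;
  // S3_0_C4_C2_0 is the generic spelling of SPSel (encoding 0xc210 in the
  // table this commit deletes from AArch64BaseInfo.h).
  uint32_t Bits = AArch64SysReg::parseGenericRegister("S3_0_C4_C2_0");
  assert(Bits == ((3u << 14) | (0u << 11) | (4u << 7) | (2u << 3) | 0u)); // 0xc210
  // Parsing upper-cases the name before matching, so the lower-case spelling
  // is accepted too; printing always produces the upper-case form.
  assert(AArch64SysReg::parseGenericRegister("s3_0_c4_c2_0") == Bits);
  std::string Name = AArch64SysReg::genericRegisterString(Bits);
  assert(Name == "S3_0_C4_C2_0");
  // Anything that is not a valid S<op0>_<op1>_C<n>_C<m>_<op2> spelling
  // comes back as uint32_t(-1).
  assert(AArch64SysReg::parseGenericRegister("spsel") == uint32_t(-1));
}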
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e63627eae123..dcc39176031c 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,231 +266,85 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
}
} // end namespace AArch64CC
-/// Instances of this class can perform bidirectional mapping from random
-/// identifier strings to operand encodings. For example "MSR" takes a named
-/// system-register which must be encoded somehow and decoded for printing. This
-/// central location means that the information for those transformations is not
-/// duplicated and remains in sync.
-///
-/// FIXME: currently the algorithm is a completely unoptimised linear
-/// search. Obviously this could be improved, but we would probably want to work
-/// out just how often these instructions are emitted before working on it. It
-/// might even be optimal to just reorder the tables for the common instructions
-/// rather than changing the algorithm.
-struct AArch64NamedImmMapper {
- struct Mapping {
+namespace AArch64AT{
+ struct AT {
const char *Name;
- uint32_t Value;
- // Set of features this mapping is available for
- // Zero value of FeatureBitSet means the mapping is always available
- FeatureBitset FeatureBitSet;
-
- bool isNameEqual(std::string Other,
- const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
- (FeatureBitSet & FeatureBits).none())
- return false;
- return Name == Other;
- }
-
- bool isValueEqual(uint32_t Other,
- const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
- (FeatureBitSet & FeatureBits).none())
- return false;
- return Value == Other;
- }
- };
-
- template<int N>
- AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm)
- : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {}
-
- // Maps value to string, depending on availability for FeatureBits given
- StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits,
- bool &Valid) const;
- // Maps string to value, depending on availability for FeatureBits given
- uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
- bool &Valid) const;
-
- /// Many of the instructions allow an alternative assembly form consisting of
- /// a simple immediate. Currently the only valid forms are ranges [0, N) where
- /// N being 0 indicates no immediate syntax-form is allowed.
- bool validImm(uint32_t Value) const;
-protected:
- const Mapping *Mappings;
- size_t NumMappings;
- uint32_t TooBigImm;
-};
-
-namespace AArch64AT {
- enum ATValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- S1E1R = 0x43c0, // 01 000 0111 1000 000
- S1E2R = 0x63c0, // 01 100 0111 1000 000
- S1E3R = 0x73c0, // 01 110 0111 1000 000
- S1E1W = 0x43c1, // 01 000 0111 1000 001
- S1E2W = 0x63c1, // 01 100 0111 1000 001
- S1E3W = 0x73c1, // 01 110 0111 1000 001
- S1E0R = 0x43c2, // 01 000 0111 1000 010
- S1E0W = 0x43c3, // 01 000 0111 1000 011
- S12E1R = 0x63c4, // 01 100 0111 1000 100
- S12E1W = 0x63c5, // 01 100 0111 1000 101
- S12E0R = 0x63c6, // 01 100 0111 1000 110
- S12E0W = 0x63c7, // 01 100 0111 1000 111
- S1E1RP = 0x43c8, // 01 000 0111 1001 000
- S1E1WP = 0x43c9 // 01 000 0111 1001 001
+ uint16_t Encoding;
};
- struct ATMapper : AArch64NamedImmMapper {
- const static Mapping ATMappings[];
-
- ATMapper();
- };
+ #define GET_AT_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64DB {
- enum DBValues {
- Invalid = -1,
- OSHLD = 0x1,
- OSHST = 0x2,
- OSH = 0x3,
- NSHLD = 0x5,
- NSHST = 0x6,
- NSH = 0x7,
- ISHLD = 0x9,
- ISHST = 0xa,
- ISH = 0xb,
- LD = 0xd,
- ST = 0xe,
- SY = 0xf
+ struct DB {
+ const char *Name;
+ uint16_t Encoding;
};
- struct DBarrierMapper : AArch64NamedImmMapper {
- const static Mapping DBarrierMappings[];
-
- DBarrierMapper();
- };
+ #define GET_DB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64DC {
- enum DCValues {
- Invalid = -1, // Op1 CRn CRm Op2
- ZVA = 0x5ba1, // 01 011 0111 0100 001
- IVAC = 0x43b1, // 01 000 0111 0110 001
- ISW = 0x43b2, // 01 000 0111 0110 010
- CVAC = 0x5bd1, // 01 011 0111 1010 001
- CSW = 0x43d2, // 01 000 0111 1010 010
- CVAU = 0x5bd9, // 01 011 0111 1011 001
- CIVAC = 0x5bf1, // 01 011 0111 1110 001
- CISW = 0x43f2 // 01 000 0111 1110 010
- };
-
- struct DCMapper : AArch64NamedImmMapper {
- const static Mapping DCMappings[];
-
- DCMapper();
+ struct DC {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_DC_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64IC {
- enum ICValues {
- Invalid = -1, // Op1 CRn CRm Op2
- IALLUIS = 0x0388, // 000 0111 0001 000
- IALLU = 0x03a8, // 000 0111 0101 000
- IVAU = 0x1ba9 // 011 0111 0101 001
- };
-
-
- struct ICMapper : AArch64NamedImmMapper {
- const static Mapping ICMappings[];
-
- ICMapper();
+ struct IC {
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg;
};
-
- static inline bool NeedsRegister(ICValues Val) {
- return Val == IVAU;
- }
+ #define GET_IC_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64ISB {
- enum ISBValues {
- Invalid = -1,
- SY = 0xf
- };
- struct ISBMapper : AArch64NamedImmMapper {
- const static Mapping ISBMappings[];
-
- ISBMapper();
+ struct ISB {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_ISB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64PRFM {
- enum PRFMValues {
- Invalid = -1,
- PLDL1KEEP = 0x00,
- PLDL1STRM = 0x01,
- PLDL2KEEP = 0x02,
- PLDL2STRM = 0x03,
- PLDL3KEEP = 0x04,
- PLDL3STRM = 0x05,
- PLIL1KEEP = 0x08,
- PLIL1STRM = 0x09,
- PLIL2KEEP = 0x0a,
- PLIL2STRM = 0x0b,
- PLIL3KEEP = 0x0c,
- PLIL3STRM = 0x0d,
- PSTL1KEEP = 0x10,
- PSTL1STRM = 0x11,
- PSTL2KEEP = 0x12,
- PSTL2STRM = 0x13,
- PSTL3KEEP = 0x14,
- PSTL3STRM = 0x15
- };
-
- struct PRFMMapper : AArch64NamedImmMapper {
- const static Mapping PRFMMappings[];
-
- PRFMMapper();
+ struct PRFM {
+ const char *Name;
+ uint16_t Encoding;
};
+ #define GET_PRFM_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64PState {
- enum PStateValues {
- Invalid = -1,
- SPSel = 0x05,
- DAIFSet = 0x1e,
- DAIFClr = 0x1f,
-
- // v8.1a "Privileged Access Never" extension-specific PStates
- PAN = 0x04,
-
- // v8.2a "User Access Override" extension-specific PStates
- UAO = 0x03
- };
-
- struct PStateMapper : AArch64NamedImmMapper {
- const static Mapping PStateMappings[];
+ struct PState {
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
- PStateMapper();
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
};
-
+ #define GET_PSTATE_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64PSBHint {
- enum PSBHintValues {
- Invalid = -1,
- // v8.2a "Statistical Profiling" extension-specific PSB operands
- CSync = 0x11, // psb csync = hint #0x11
- };
-
- struct PSBHintMapper : AArch64NamedImmMapper {
- const static Mapping PSBHintMappings[];
-
- PSBHintMapper();
+ struct PSB {
+ const char *Name;
+ uint16_t Encoding;
};
-
+ #define GET_PSB_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64SE {
@@ -574,754 +428,36 @@ AArch64StringToVectorLayout(StringRef LayoutStr) {
}
namespace AArch64SysReg {
- enum SysRegROValues {
- MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
- DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000
- MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000
- OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100
- DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110
- PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110
- PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111
- MIDR_EL1 = 0xc000, // 11 000 0000 0000 000
- CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000
- CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001
- CTR_EL0 = 0xd801, // 11 011 0000 0000 001
- MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101
- REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110
- AIDR_EL1 = 0xc807, // 11 001 0000 0000 111
- DCZID_EL0 = 0xd807, // 11 011 0000 0000 111
- ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000
- ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001
- ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010
- ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011
- ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100
- ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101
- ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110
- ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111
- ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000
- ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001
- ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010
- ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011
- ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100
- ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101
- ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000
- ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001
- ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000
- ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001
- ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100
- ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101
- ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000
- ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
- ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
- ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
- ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010
- MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000
- MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001
- MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010
- RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001
- RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001
- RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001
- ISR_EL1 = 0xc608, // 11 000 1100 0001 000
- CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001
- CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010
- ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110
-
- // Trace registers
- TRCSTATR = 0x8818, // 10 001 0000 0011 000
- TRCIDR8 = 0x8806, // 10 001 0000 0000 110
- TRCIDR9 = 0x880e, // 10 001 0000 0001 110
- TRCIDR10 = 0x8816, // 10 001 0000 0010 110
- TRCIDR11 = 0x881e, // 10 001 0000 0011 110
- TRCIDR12 = 0x8826, // 10 001 0000 0100 110
- TRCIDR13 = 0x882e, // 10 001 0000 0101 110
- TRCIDR0 = 0x8847, // 10 001 0000 1000 111
- TRCIDR1 = 0x884f, // 10 001 0000 1001 111
- TRCIDR2 = 0x8857, // 10 001 0000 1010 111
- TRCIDR3 = 0x885f, // 10 001 0000 1011 111
- TRCIDR4 = 0x8867, // 10 001 0000 1100 111
- TRCIDR5 = 0x886f, // 10 001 0000 1101 111
- TRCIDR6 = 0x8877, // 10 001 0000 1110 111
- TRCIDR7 = 0x887f, // 10 001 0000 1111 111
- TRCOSLSR = 0x888c, // 10 001 0001 0001 100
- TRCPDSR = 0x88ac, // 10 001 0001 0101 100
- TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110
- TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110
- TRCLSR = 0x8bee, // 10 001 0111 1101 110
- TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110
- TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110
- TRCDEVID = 0x8b97, // 10 001 0111 0010 111
- TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111
- TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111
- TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111
- TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111
- TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111
- TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111
- TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111
- TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111
- TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111
- TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111
- TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111
- TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111
- TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111
-
- // GICv3 registers
- ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000
- ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000
- ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010
- ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010
- ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011
- ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001
- ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011
- ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101
- };
-
- enum SysRegWOValues {
- DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000
- OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100
- PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100
-
- // Trace Registers
- TRCOSLAR = 0x8884, // 10 001 0001 0000 100
- TRCLAR = 0x8be6, // 10 001 0111 1100 110
-
- // GICv3 registers
- ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001
- ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001
- ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001
- ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101
- ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110
- ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111
- };
-
- enum SysRegValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010
- OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010
- TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000
- MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000
- MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010
- DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000
- OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010
- DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000
- DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100
- DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100
- DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100
- DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100
- DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100
- DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100
- DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100
- DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100
- DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100
- DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100
- DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100
- DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100
- DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100
- DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100
- DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100
- DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100
- DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101
- DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101
- DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101
- DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101
- DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101
- DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101
- DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101
- DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101
- DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101
- DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101
- DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101
- DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101
- DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101
- DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101
- DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101
- DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101
- DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110
- DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110
- DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110
- DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110
- DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110
- DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110
- DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110
- DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110
- DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110
- DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110
- DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110
- DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110
- DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110
- DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110
- DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110
- DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110
- DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111
- DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111
- DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111
- DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111
- DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111
- DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111
- DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111
- DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111
- DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111
- DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111
- DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111
- DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111
- DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111
- DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111
- DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111
- DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111
- TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000
- OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100
- DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100
- DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110
- DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110
- CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000
- VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000
- VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101
- CPACR_EL1 = 0xc082, // 11 000 0001 0000 010
- SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000
- SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000
- SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000
- ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001
- ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001
- ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001
- HCR_EL2 = 0xe088, // 11 100 0001 0001 000
- SCR_EL3 = 0xf088, // 11 110 0001 0001 000
- MDCR_EL2 = 0xe089, // 11 100 0001 0001 001
- SDER32_EL3 = 0xf089, // 11 110 0001 0001 001
- CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010
- CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010
- HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011
- HACR_EL2 = 0xe08f, // 11 100 0001 0001 111
- MDCR_EL3 = 0xf099, // 11 110 0001 0011 001
- TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000
- TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000
- TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000
- TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001
- TCR_EL1 = 0xc102, // 11 000 0010 0000 010
- TCR_EL2 = 0xe102, // 11 100 0010 0000 010
- TCR_EL3 = 0xf102, // 11 110 0010 0000 010
- VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000
- VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010
- DACR32_EL2 = 0xe180, // 11 100 0011 0000 000
- SPSR_EL1 = 0xc200, // 11 000 0100 0000 000
- SPSR_EL2 = 0xe200, // 11 100 0100 0000 000
- SPSR_EL3 = 0xf200, // 11 110 0100 0000 000
- ELR_EL1 = 0xc201, // 11 000 0100 0000 001
- ELR_EL2 = 0xe201, // 11 100 0100 0000 001
- ELR_EL3 = 0xf201, // 11 110 0100 0000 001
- SP_EL0 = 0xc208, // 11 000 0100 0001 000
- SP_EL1 = 0xe208, // 11 100 0100 0001 000
- SP_EL2 = 0xf208, // 11 110 0100 0001 000
- SPSel = 0xc210, // 11 000 0100 0010 000
- NZCV = 0xda10, // 11 011 0100 0010 000
- DAIF = 0xda11, // 11 011 0100 0010 001
- CurrentEL = 0xc212, // 11 000 0100 0010 010
- SPSR_irq = 0xe218, // 11 100 0100 0011 000
- SPSR_abt = 0xe219, // 11 100 0100 0011 001
- SPSR_und = 0xe21a, // 11 100 0100 0011 010
- SPSR_fiq = 0xe21b, // 11 100 0100 0011 011
- FPCR = 0xda20, // 11 011 0100 0100 000
- FPSR = 0xda21, // 11 011 0100 0100 001
- DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000
- DLR_EL0 = 0xda29, // 11 011 0100 0101 001
- IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001
- AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000
- AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000
- AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000
- AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001
- AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001
- AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001
- ESR_EL1 = 0xc290, // 11 000 0101 0010 000
- ESR_EL2 = 0xe290, // 11 100 0101 0010 000
- ESR_EL3 = 0xf290, // 11 110 0101 0010 000
- FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000
- FAR_EL1 = 0xc300, // 11 000 0110 0000 000
- FAR_EL2 = 0xe300, // 11 100 0110 0000 000
- FAR_EL3 = 0xf300, // 11 110 0110 0000 000
- HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100
- PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000
- PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000
- PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001
- PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010
- PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011
- PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101
- PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000
- PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001
- PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010
- PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000
- PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001
- PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010
- PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011
- MAIR_EL1 = 0xc510, // 11 000 1010 0010 000
- MAIR_EL2 = 0xe510, // 11 100 1010 0010 000
- MAIR_EL3 = 0xf510, // 11 110 1010 0010 000
- AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000
- AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000
- AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000
- VBAR_EL1 = 0xc600, // 11 000 1100 0000 000
- VBAR_EL2 = 0xe600, // 11 100 1100 0000 000
- VBAR_EL3 = 0xf600, // 11 110 1100 0000 000
- RMR_EL1 = 0xc602, // 11 000 1100 0000 010
- RMR_EL2 = 0xe602, // 11 100 1100 0000 010
- RMR_EL3 = 0xf602, // 11 110 1100 0000 010
- CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001
- TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010
- TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010
- TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010
- TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011
- TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100
- CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000
- CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011
- CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000
- CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000
- CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000
- CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000
- CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000
- CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001
- CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001
- CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001
- CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010
- CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010
- CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010
- CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000
- CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001
- CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010
- PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000
- PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001
- PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010
- PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011
- PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100
- PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101
- PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110
- PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111
- PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000
- PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001
- PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010
- PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011
- PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100
- PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101
- PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110
- PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111
- PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000
- PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001
- PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010
- PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011
- PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100
- PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101
- PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110
- PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111
- PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000
- PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001
- PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010
- PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011
- PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100
- PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101
- PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110
- PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111
- PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000
- PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001
- PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010
- PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011
- PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100
- PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101
- PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110
- PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111
- PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000
- PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001
- PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010
- PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011
- PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100
- PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101
- PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110
- PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111
- PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000
- PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001
- PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010
- PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011
- PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100
- PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101
- PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110
- PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111
- PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000
- PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001
- PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010
- PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011
- PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100
- PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101
- PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110
-
- // Trace registers
- TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000
- TRCPROCSELR = 0x8810, // 10 001 0000 0010 000
- TRCCONFIGR = 0x8820, // 10 001 0000 0100 000
- TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000
- TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000
- TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000
- TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000
- TRCTSCTLR = 0x8860, // 10 001 0000 1100 000
- TRCSYNCPR = 0x8868, // 10 001 0000 1101 000
- TRCCCCTLR = 0x8870, // 10 001 0000 1110 000
- TRCBBCTLR = 0x8878, // 10 001 0000 1111 000
- TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001
- TRCQCTLR = 0x8809, // 10 001 0000 0001 001
- TRCVICTLR = 0x8802, // 10 001 0000 0000 010
- TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010
- TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010
- TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010
- TRCVDCTLR = 0x8842, // 10 001 0000 1000 010
- TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010
- TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010
- TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100
- TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100
- TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100
- TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100
- TRCSEQSTR = 0x883c, // 10 001 0000 0111 100
- TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100
- TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101
- TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101
- TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101
- TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101
- TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101
- TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101
- TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101
- TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101
- TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101
- TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101
- TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101
- TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101
- TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111
- TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111
- TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111
- TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111
- TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111
- TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111
- TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111
- TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111
- TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000
- TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000
- TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000
- TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000
- TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000
- TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000
- TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000
- TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000
- TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000
- TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000
- TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000
- TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000
- TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000
- TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000
- TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001
- TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001
- TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001
- TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001
- TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001
- TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001
- TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001
- TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001
- TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001
- TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001
- TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001
- TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001
- TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001
- TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001
- TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001
- TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001
- TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010
- TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010
- TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010
- TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010
- TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010
- TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010
- TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010
- TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010
- TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010
- TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010
- TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010
- TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010
- TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010
- TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010
- TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010
- TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010
- TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011
- TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011
- TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011
- TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011
- TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011
- TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011
- TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011
- TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011
- TRCPDCR = 0x88a4, // 10 001 0001 0100 100
- TRCACVR0 = 0x8900, // 10 001 0010 0000 000
- TRCACVR1 = 0x8910, // 10 001 0010 0010 000
- TRCACVR2 = 0x8920, // 10 001 0010 0100 000
- TRCACVR3 = 0x8930, // 10 001 0010 0110 000
- TRCACVR4 = 0x8940, // 10 001 0010 1000 000
- TRCACVR5 = 0x8950, // 10 001 0010 1010 000
- TRCACVR6 = 0x8960, // 10 001 0010 1100 000
- TRCACVR7 = 0x8970, // 10 001 0010 1110 000
- TRCACVR8 = 0x8901, // 10 001 0010 0000 001
- TRCACVR9 = 0x8911, // 10 001 0010 0010 001
- TRCACVR10 = 0x8921, // 10 001 0010 0100 001
- TRCACVR11 = 0x8931, // 10 001 0010 0110 001
- TRCACVR12 = 0x8941, // 10 001 0010 1000 001
- TRCACVR13 = 0x8951, // 10 001 0010 1010 001
- TRCACVR14 = 0x8961, // 10 001 0010 1100 001
- TRCACVR15 = 0x8971, // 10 001 0010 1110 001
- TRCACATR0 = 0x8902, // 10 001 0010 0000 010
- TRCACATR1 = 0x8912, // 10 001 0010 0010 010
- TRCACATR2 = 0x8922, // 10 001 0010 0100 010
- TRCACATR3 = 0x8932, // 10 001 0010 0110 010
- TRCACATR4 = 0x8942, // 10 001 0010 1000 010
- TRCACATR5 = 0x8952, // 10 001 0010 1010 010
- TRCACATR6 = 0x8962, // 10 001 0010 1100 010
- TRCACATR7 = 0x8972, // 10 001 0010 1110 010
- TRCACATR8 = 0x8903, // 10 001 0010 0000 011
- TRCACATR9 = 0x8913, // 10 001 0010 0010 011
- TRCACATR10 = 0x8923, // 10 001 0010 0100 011
- TRCACATR11 = 0x8933, // 10 001 0010 0110 011
- TRCACATR12 = 0x8943, // 10 001 0010 1000 011
- TRCACATR13 = 0x8953, // 10 001 0010 1010 011
- TRCACATR14 = 0x8963, // 10 001 0010 1100 011
- TRCACATR15 = 0x8973, // 10 001 0010 1110 011
- TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100
- TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100
- TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100
- TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100
- TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101
- TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101
- TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101
- TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101
- TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110
- TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110
- TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110
- TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110
- TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111
- TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111
- TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111
- TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111
- TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000
- TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000
- TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000
- TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000
- TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000
- TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000
- TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000
- TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000
- TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001
- TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001
- TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001
- TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001
- TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001
- TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001
- TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001
- TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001
- TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010
- TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010
- TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010
- TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010
- TRCITCTRL = 0x8b84, // 10 001 0111 0000 100
- TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110
- TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110
-
- // GICv3 registers
- ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011
- ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011
- ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000
- ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100
- ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100
- ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101
- ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101
- ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101
- ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110
- ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111
- ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111
- ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000
- ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100
- ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101
- ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110
- ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111
- ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000
- ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001
- ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010
- ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011
- ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000
- ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001
- ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010
- ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011
- ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000
- ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001
- ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010
- ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011
- ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000
- ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010
- ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111
- ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100
- ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000
- ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001
- ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010
- ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011
- ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100
- ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101
- ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110
- ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111
- ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000
- ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001
- ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010
- ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011
- ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100
- ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101
- ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110
- ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- PAN = 0xc213, // 11 000 0100 0010 011
-
- // v8.1a "Limited Ordering Regions" extension-specific system registers
- LORSA_EL1 = 0xc520, // 11 000 1010 0100 000
- LOREA_EL1 = 0xc521, // 11 000 1010 0100 001
- LORN_EL1 = 0xc522, // 11 000 1010 0100 010
- LORC_EL1 = 0xc523, // 11 000 1010 0100 011
- LORID_EL1 = 0xc527, // 11 000 1010 0100 111
-
- // v8.1a "Virtualization host extensions" system registers
- TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001
- CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001
- CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000
- CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010
- CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001
- SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000
- CPACR_EL12 = 0xe882, // 11 101 0001 0000 010
- TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000
- TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001
- TCR_EL12 = 0xe902, // 11 101 0010 0000 010
- AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000
- AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001
- ESR_EL12 = 0xea90, // 11 101 0101 0010 000
- FAR_EL12 = 0xeb00, // 11 101 0110 0000 000
- MAIR_EL12 = 0xed10, // 11 101 1010 0010 000
- AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000
- VBAR_EL12 = 0xee00, // 11 101 1100 0000 000
- CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001
- CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000
- CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000
- CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001
- CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010
- CNTV_TVAL_EL02 = 0xef18, // 11 101 1110 0011 000
- CNTV_CTL_EL02 = 0xef19, // 11 101 1110 0011 001
- CNTV_CVAL_EL02 = 0xef1a, // 11 101 1110 0011 010
- SPSR_EL12 = 0xea00, // 11 101 0100 0000 000
- ELR_EL12 = 0xea01, // 11 101 0100 0000 001
-
- // v8.2a registers
- UAO = 0xc214, // 11 000 0100 0010 100
-
- // v8.2a "Statistical Profiling extension" registers
- PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000
- PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001
- PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011
- PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111
- PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000
- PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000
- PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000
- PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010
- PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011
- PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100
- PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101
- PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110
- PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111
+ struct SysReg {
+ const char *Name;
+ unsigned Encoding;
+ bool Readable;
+ bool Writeable;
+ FeatureBitset FeaturesRequired;
- // Cyclone specific system registers
- CPM_IOACC_CTL_EL3 = 0xff90,
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
};
- // Note that these do not inherit from AArch64NamedImmMapper. This class is
- // sufficiently different in its behaviour that I don't believe it's worth
- // burdening the common AArch64NamedImmMapper with abstractions only needed in
- // this one case.
- struct SysRegMapper {
- static const AArch64NamedImmMapper::Mapping SysRegMappings[];
+ #define GET_SYSREG_DECL
+ #include "AArch64GenSystemOperands.inc"
- const AArch64NamedImmMapper::Mapping *InstMappings;
- size_t NumInstMappings;
+ const SysReg *lookupSysRegByName(StringRef);
+ const SysReg *lookupSysRegByEncoding(uint16_t);
- SysRegMapper() { }
- uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
- bool &Valid) const;
- std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const;
- };
-
- struct MSRMapper : SysRegMapper {
- static const AArch64NamedImmMapper::Mapping MSRMappings[];
- MSRMapper();
- };
-
- struct MRSMapper : SysRegMapper {
- static const AArch64NamedImmMapper::Mapping MRSMappings[];
- MRSMapper();
- };
-
- uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+ uint32_t parseGenericRegister(StringRef Name);
+ std::string genericRegisterString(uint32_t Bits);
}
namespace AArch64TLBI {
- enum TLBIValues {
- Invalid = -1, // Op0 Op1 CRn CRm Op2
- IPAS2E1IS = 0x6401, // 01 100 1000 0000 001
- IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101
- VMALLE1IS = 0x4418, // 01 000 1000 0011 000
- ALLE2IS = 0x6418, // 01 100 1000 0011 000
- ALLE3IS = 0x7418, // 01 110 1000 0011 000
- VAE1IS = 0x4419, // 01 000 1000 0011 001
- VAE2IS = 0x6419, // 01 100 1000 0011 001
- VAE3IS = 0x7419, // 01 110 1000 0011 001
- ASIDE1IS = 0x441a, // 01 000 1000 0011 010
- VAAE1IS = 0x441b, // 01 000 1000 0011 011
- ALLE1IS = 0x641c, // 01 100 1000 0011 100
- VALE1IS = 0x441d, // 01 000 1000 0011 101
- VALE2IS = 0x641d, // 01 100 1000 0011 101
- VALE3IS = 0x741d, // 01 110 1000 0011 101
- VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110
- VAALE1IS = 0x441f, // 01 000 1000 0011 111
- IPAS2E1 = 0x6421, // 01 100 1000 0100 001
- IPAS2LE1 = 0x6425, // 01 100 1000 0100 101
- VMALLE1 = 0x4438, // 01 000 1000 0111 000
- ALLE2 = 0x6438, // 01 100 1000 0111 000
- ALLE3 = 0x7438, // 01 110 1000 0111 000
- VAE1 = 0x4439, // 01 000 1000 0111 001
- VAE2 = 0x6439, // 01 100 1000 0111 001
- VAE3 = 0x7439, // 01 110 1000 0111 001
- ASIDE1 = 0x443a, // 01 000 1000 0111 010
- VAAE1 = 0x443b, // 01 000 1000 0111 011
- ALLE1 = 0x643c, // 01 100 1000 0111 100
- VALE1 = 0x443d, // 01 000 1000 0111 101
- VALE2 = 0x643d, // 01 100 1000 0111 101
- VALE3 = 0x743d, // 01 110 1000 0111 101
- VMALLS12E1 = 0x643e, // 01 100 1000 0111 110
- VAALE1 = 0x443f // 01 000 1000 0111 111
- };
-
- struct TLBIMapper : AArch64NamedImmMapper {
- const static Mapping TLBIMappings[];
-
- TLBIMapper();
+ struct TLBI {
+ const char *Name;
+ uint16_t Encoding;
+ bool NeedsReg;
};
-
- static inline bool NeedsRegister(TLBIValues Val) {
- switch (Val) {
- case VMALLE1IS:
- case ALLE2IS:
- case ALLE3IS:
- case ALLE1IS:
- case VMALLS12E1IS:
- case VMALLE1:
- case ALLE2:
- case ALLE3:
- case ALLE1:
- case VMALLS12E1:
- return false;
- default:
- return true;
- }
- }
+ #define GET_TLBI_DECL
+ #include "AArch64GenSystemOperands.inc"
}
namespace AArch64II {
@@ -1379,12 +515,7 @@ namespace AArch64II {
/// thread-local symbol. On Darwin, only one type of thread-local access
/// exists (pre linker-relaxation), but on ELF the TLSModel used for the
/// referee will affect interpretation.
- MO_TLS = 0x40,
-
- /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
- /// the address of a constant pool entry for the symbol, rather than the
- /// address of the symbol itself.
- MO_CONSTPOOL = 0x80
+ MO_TLS = 0x40
};
} // end namespace AArch64II
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
deleted file mode 100644
index 0b80f82f2b99..000000000000
--- a/lib/Target/AArch64/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Utils
-
-# Hack: we need to include 'main' AArch64 target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 4f718e1ca310..7e59710a427a 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
-#define LLVM_LIB_TARGET_R600_AMDGPU_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -29,7 +29,6 @@ class TargetMachine;
// R600 Passes
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600TextureIntrinsicsReplacer();
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers();
FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
@@ -44,12 +43,14 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
-FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
+FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
-FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
-FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIDebuggerInsertNopsPass();
+FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
@@ -60,6 +61,9 @@ extern char &AMDGPUAnnotateKernelFeaturesID;
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIShrinkInstructionsPass(PassRegistry&);
+extern char &SIShrinkInstructionsID;
+
void initializeSIFixSGPRCopiesPass(PassRegistry &);
extern char &SIFixSGPRCopiesID;
@@ -69,8 +73,19 @@ extern char &SILowerI1CopiesID;
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowPassID;
+
+
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
+FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
@@ -80,12 +95,21 @@ FunctionPass *createAMDGPUAnnotateUniformValues();
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
-void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
-extern char &SIFixSGPRLiveRangesID;
-
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
+void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
+extern char &AMDGPUCodeGenPrepareID;
+
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void initializeSIDebuggerInsertNopsPass(PassRegistry&);
+extern char &SIDebuggerInsertNopsID;
+
+void initializeSIInsertWaitsPass(PassRegistry&);
+extern char &SIInsertWaitsID;
+
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;
@@ -101,15 +125,6 @@ enum TargetIndex {
} // End namespace llvm
-namespace ShaderType {
- enum Type {
- PIXEL = 0,
- VERTEX = 1,
- GEOMETRY = 2,
- COMPUTE = 3
- };
-}
-
/// OpenCL uses address spaces to differentiate between
/// various memory regions on the hardware. On the CPU
/// all of the address spaces point to the same memory,
@@ -120,7 +135,7 @@ namespace AMDGPUAS {
enum AddressSpaces : unsigned {
PRIVATE_ADDRESS = 0, ///< Address space for private memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
FLAT_ADDRESS = 4, ///< Address space for flat memory.
REGION_ADDRESS = 5, ///< Address space for region memory.
@@ -148,8 +163,6 @@ enum AddressSpaces : unsigned {
CONSTANT_BUFFER_13 = 21,
CONSTANT_BUFFER_14 = 22,
CONSTANT_BUFFER_15 = 23,
- ADDRESS_NONE = 24, ///< Address space for unknown memory.
- LAST_ADDRESS = ADDRESS_NONE,
// Some places use this if the address space can't be determined.
UNKNOWN_ADDRESS_SPACE = ~0u
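
For orientation, the address-space enum above is what the backend consults when deciding which kind of memory access to select. A minimal standalone sketch of how such a table is typically queried (the enum below only mirrors the values listed in this hunk, and addrSpaceName is an illustrative helper, not an in-tree API):

#include <cstdio>

// Hypothetical mirror of the AMDGPUAS values shown in the hunk above.
enum AddressSpace : unsigned {
  PRIVATE_ADDRESS  = 0,  // per-thread scratch
  GLOBAL_ADDRESS   = 1,  // device-visible memory
  CONSTANT_ADDRESS = 2,  // read-only memory
  LOCAL_ADDRESS    = 3,  // LDS, shared within a workgroup
  FLAT_ADDRESS     = 4,  // generic pointers that may point into the other spaces
  REGION_ADDRESS   = 5,
  UNKNOWN_ADDRESS_SPACE = ~0u
};

// Map an address space to a printable name; unrecognized values fall through.
static const char *addrSpaceName(unsigned AS) {
  switch (AS) {
  case PRIVATE_ADDRESS:  return "private";
  case GLOBAL_ADDRESS:   return "global";
  case CONSTANT_ADDRESS: return "constant";
  case LOCAL_ADDRESS:    return "local";
  case FLAT_ADDRESS:     return "flat";
  case REGION_ADDRESS:   return "region";
  default:               return "unknown";
  }
}

int main() {
  std::printf("%s\n", addrSpaceName(GLOBAL_ADDRESS)); // prints "global"
  return 0;
}
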
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 844d89c737bf..72c455354411 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -1,182 +1,121 @@
-//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
+//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
include "llvm/Target/Target.td"
-//===----------------------------------------------------------------------===//
-// Subtarget Features
-//===----------------------------------------------------------------------===//
-
-// Debugging Features
-
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
- "EnableIRStructurizer",
- "false",
- "Disable IR Structurizer">;
-
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
- "EnablePromoteAlloca",
- "true",
- "Enable promote alloca pass">;
-
-// Target features
-
-def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
- "EnableIfCvt",
- "false",
- "Disable the if conversion pass">;
+//===------------------------------------------------------------===//
+// Subtarget Features (device properties)
+//===------------------------------------------------------------===//
def FeatureFP64 : SubtargetFeature<"fp64",
- "FP64",
- "true",
- "Enable double precision operations">;
-
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64Denormals",
- "true",
- "Enable double precision denormal handling",
- [FeatureFP64]>;
+ "FP64",
+ "true",
+ "Enable double precision operations"
+>;
def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
- "FastFMAF32",
- "true",
- "Assuming f32 fma is at least as fast as mul + add",
- []>;
-
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
- "FP32Denormals",
- "true",
- "Enable single precision denormal handling">;
+ "FastFMAF32",
+ "true",
+ "Assuming f32 fma is at least as fast as mul + add"
+>;
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
- "Is64bit",
- "true",
- "Specify if 64-bit addressing should be used">;
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+ "HalfRate64Ops",
+ "true",
+ "Most fp64 instructions are half rate instead of quarter"
+>;
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding">;
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding"
+>;
def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
- "true",
- "Specify use of dedicated vertex cache">;
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache"
+>;
def FeatureCaymanISA : SubtargetFeature<"caymanISA",
- "CaymanISA",
- "true",
- "Use Cayman ISA">;
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA"
+>;
def FeatureCFALUBug : SubtargetFeature<"cfalubug",
- "CFALUBug",
- "true",
- "GPU has CF_ALU bug">;
-
-// XXX - This should probably be removed once enabled by default
-def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
- "EnableLoadStoreOpt",
- "true",
- "Enable SI load/store optimizer pass">;
-
-// Performance debugging feature. Allow using DS instruction immediate
-// offsets even if the base pointer can't be proven to be base. On SI,
-// base pointer values that won't give the same result as a 16-bit add
-// are not safe to fold, but this will override the conservative test
-// for the base pointer.
-def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
- "EnableUnsafeDSOffsetFolding",
- "true",
- "Force using DS instruction immediate offsets on SI">;
-
-def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
- "FlatForGlobal",
- "true",
- "Force to generate flat instruction for global">;
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug"
+>;
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
- "FlatAddressSpace",
- "true",
- "Support flat address space">;
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space"
+>;
-def FeatureXNACK : SubtargetFeature<"xnack",
- "EnableXNACK",
- "true",
- "Enable XNACK support">;
+def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
+ "UnalignedBufferAccess",
+ "true",
+ "Support unaligned global loads and stores"
+>;
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
- "EnableVGPRSpilling",
- "true",
- "Enable spilling of VGPRs to scratch memory">;
+def FeatureXNACK : SubtargetFeature<"xnack",
+ "EnableXNACK",
+ "true",
+ "Enable XNACK support"
+>;
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
- "SGPRInitBug",
- "true",
- "VI SGPR initilization bug requiring a fixed SGPR allocation size">;
-
-def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
- "EnableHugeScratchBuffer",
- "true",
- "Enable scratch buffer sizes greater than 128 GB">;
-
-def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
- "EnableSIScheduler",
- "true",
- "Enable SI Machine Scheduler">;
+ "SGPRInitBug",
+ "true",
+ "VI SGPR initilization bug requiring a fixed SGPR allocation size"
+>;
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
- "TexVTXClauseSize",
- Value,
- "Limit the maximum number of fetches in a clause to "#Value>;
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value
+>;
def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
- "The number of threads per wavefront">;
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront"
+>;
def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
- "ldsbankcount"#Value,
- "LDSBankCount",
- !cast<string>(Value),
- "The number of LDS banks per compute unit.">;
+ "ldsbankcount"#Value,
+ "LDSBankCount",
+ !cast<string>(Value),
+ "The number of LDS banks per compute unit."
+>;
def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping>
: SubtargetFeature <
- "isaver"#Major#"."#Minor#"."#Stepping,
- "IsaVersion",
- "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
- "Instruction set version number"
+ "isaver"#Major#"."#Minor#"."#Stepping,
+ "IsaVersion",
+ "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
+ "Instruction set version number"
>;
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>;
@@ -186,36 +125,145 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
- "localmemorysize"#Value,
- "LocalMemorySize",
- !cast<string>(Value),
- "The size of local memory in bytes">;
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes"
+>;
def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU">;
+ "IsGCN",
+ "true",
+ "GCN or newer GPU"
+>;
def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
- "GCN1Encoding",
- "true",
- "Encoding format for SI and CI">;
+ "GCN1Encoding",
+ "true",
+ "Encoding format for SI and CI"
+>;
def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
- "GCN3Encoding",
- "true",
- "Encoding format for VI">;
+ "GCN3Encoding",
+ "true",
+ "Encoding format for VI"
+>;
def FeatureCIInsts : SubtargetFeature<"ci-insts",
- "CIInsts",
- "true",
- "Additional intstructions for CI+">;
+ "CIInsts",
+ "true",
+ "Additional intstructions for CI+"
+>;
+
+def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
+ "HasSMemRealTime",
+ "true",
+ "Has s_memrealtime instruction"
+>;
+
+def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
+ "Has16BitInsts",
+ "true",
+ "Has i16/f16 instructions"
+>;
+
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+ "FP32Denormals",
+ "true",
+ "Enable single precision denormal handling"
+>;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64Denormals",
+ "true",
+ "Enable double precision denormal handling",
+ [FeatureFP64]
+>;
+
+def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
+ "FPExceptions",
+ "true",
+ "Enable floating point exceptions"
+>;
+
+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+ "max-private-element-size-"#size,
+ "MaxPrivateElementSize",
+ !cast<string>(size),
+ "Maximum private access size may be "#size
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory"
+>;
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass"
+>;
+
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+ "EnableLoadStoreOpt",
+ "true",
+ "Enable SI load/store optimizer pass"
+>;
+
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <
+ "unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI"
+>;
+
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+ "EnableSIScheduler",
+ "true",
+ "Enable SI Machine Scheduler"
+>;
+
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+ "Force to generate flat instruction for global"
+>;
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
- "FeatureDisable","true",
- "Dummy feature to disable assembler"
- " instructions">;
+ "FeatureDisable","true",
+ "Dummy feature to disable assembler instructions"
+>;
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
@@ -227,33 +275,66 @@ def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64,
- FeatureLocalMemorySize32768]
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
- FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
- FeatureLDSBankCount32]>;
+ [FeatureFP64, FeatureLocalMemorySize32768,
+ FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+ FeatureLDSBankCount32]
+>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureGCN1Encoding, FeatureCIInsts]>;
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+ FeatureGCN1Encoding, FeatureCIInsts]
+>;
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts]>;
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime
+ ]
+>;
+
+//===----------------------------------------------------------------------===//
+// Debugger related subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureDebuggerInsertNops : SubtargetFeature<
+ "amdgpu-debugger-insert-nops",
+ "DebuggerInsertNops",
+ "true",
+ "Insert one nop instruction for each high level source statement"
+>;
+
+def FeatureDebuggerReserveRegs : SubtargetFeature<
+ "amdgpu-debugger-reserve-regs",
+ "DebuggerReserveRegs",
+ "true",
+ "Reserve registers for debugger usage"
+>;
+
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+ "amdgpu-debugger-emit-prologue",
+ "DebuggerEmitPrologue",
+ "true",
+ "Emit debugger prologue"
+>;
//===----------------------------------------------------------------------===//
@@ -283,6 +364,7 @@ def NullALU : InstrItinClass;
//===----------------------------------------------------------------------===//
def TruePredicate : Predicate<"true">;
+
def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
@@ -292,6 +374,13 @@ def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIVI : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>, AssemblerPredicate<"FeatureCIInsts">;
+
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index ad267d350850..63f5fb3cdf00 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -45,9 +45,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
for (Function *F : FuncsToClone) {
ValueToValueMapTy VMap;
- Function *NewFunc = CloneFunction(F, VMap, false);
+ Function *NewFunc = CloneFunction(F, VMap);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
- M.getFunctionList().push_back(NewFunc);
F->replaceAllUsesWith(NewFunc);
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 378183927242..0910b2877b09 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -24,6 +25,8 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
+ static bool hasAddrSpaceCast(const Function &F);
+
void addAttrToCallers(Function *Intrin, StringRef AttrName);
bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -40,6 +43,11 @@ public:
AU.setPreservesAll();
ModulePass::getAnalysisUsage(AU);
}
+
+ static bool visitConstantExpr(const ConstantExpr *CE);
+ static bool visitConstantExprsRecursively(
+ const Constant *EntryC,
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};
}
@@ -48,12 +56,87 @@ char AMDGPUAnnotateKernelFeatures::ID = 0;
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+
+// The queue ptr is only needed when casting to flat, not from it.
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+ return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+ return castRequiresQueuePtr(SrcAS);
+ }
+
+ return false;
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
+ const Constant *EntryC,
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
-INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
+ if (!ConstantExprVisited.insert(EntryC).second)
+ return false;
+ SmallVector<const Constant *, 16> Stack;
+ Stack.push_back(EntryC);
+
+ while (!Stack.empty()) {
+ const Constant *C = Stack.pop_back_val();
+
+ // Check this constant expression.
+ if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
+ if (visitConstantExpr(CE))
+ return true;
+ }
+
+ // Visit all sub-expressions.
+ for (const Use &U : C->operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ if (!ConstantExprVisited.insert(OpC).second)
+ continue;
+
+ Stack.push_back(OpC);
+ }
+ }
+
+ return false;
+}
+
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+ SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (castRequiresQueuePtr(ASC))
+ return true;
+ }
+
+ for (const Use &U : I.operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
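
The hasAddrSpaceCast walk above is a standard iterative worklist traversal with a visited set: constants form a DAG, so the same sub-expression can be reached through many operands and should only be examined once. A minimal standalone sketch of the same pattern under simplified assumptions (a toy Node type with a RequiresQueuePtr flag stands in for ConstantExpr and the addrspacecast test):

#include <unordered_set>
#include <vector>

struct Node {
  bool RequiresQueuePtr = false;       // stands in for "addrspacecast from local/private"
  std::vector<const Node *> Operands;  // stands in for constant operands
};

// Return true if any node reachable from Entry needs the queue pointer.
// Visited is shared across calls so shared sub-expressions are walked once.
static bool visitRecursively(const Node *Entry,
                             std::unordered_set<const Node *> &Visited) {
  if (!Visited.insert(Entry).second)
    return false;                      // already examined via another use

  std::vector<const Node *> Stack{Entry};
  while (!Stack.empty()) {
    const Node *N = Stack.back();
    Stack.pop_back();

    if (N->RequiresQueuePtr)
      return true;                     // early exit, as in the pass above

    for (const Node *Op : N->Operands)
      if (Visited.insert(Op).second)   // push each operand at most once
        Stack.push_back(Op);
  }
  return false;
}

int main() {
  Node A, B;
  B.RequiresQueuePtr = true;
  A.Operands.push_back(&B);
  std::unordered_set<const Node *> Visited;
  return visitRecursively(&A, Visited) ? 0 : 1; // exits 0: B needs the queue ptr
}
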
void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
StringRef AttrName) {
@@ -89,35 +172,46 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
static const StringRef IntrinsicToAttr[][2] = {
// .x omitted
+ { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
+ { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
+
+ { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
+ { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
+
{ "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
{ "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
// .x omitted
{ "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
{ "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-
};
static const StringRef HSAIntrinsicToAttr[][2] = {
- { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
-
- { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
- { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
- { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+ { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+ { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }
};
+ // TODO: We should not add the attributes if the known compile time workgroup
+ // size is 1 for y/z.
+
// TODO: Intrinsics that require queue ptr.
// We do not need to note the x workitem or workgroup id because they are
// always initialized.
bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA)
+ if (TT.getOS() == Triple::AMDHSA) {
Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+ for (Function &F : M) {
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ continue;
+
+ if (hasAddrSpaceCast(F))
+ F.addFnAttr("amdgpu-queue-ptr");
+ }
+ }
+
return Changed;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index dfddc345f286..2010cc952265 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -43,6 +43,7 @@ public:
AU.setPreservesAll();
}
+ void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
};
@@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
char AMDGPUAnnotateUniformValues::ID = 0;
+static void setUniformMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+ if (I.isUnconditional())
+ return;
+
+ Value *Cond = I.getCondition();
+ if (!DA->isUniform(Cond))
+ return;
+
+ setUniformMetadata(I.getParent()->getTerminator());
+}
+
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
- PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+ setUniformMetadata(PtrI);
}
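
The two visitors above only tag values that DivergenceAnalysis proves uniform, so later selection can prefer scalar branches and loads. A minimal standalone sketch of the same tagging idea under simplified assumptions (a toy Inst record and a plain string tag stand in for llvm::Instruction and the "amdgpu.uniform" metadata node):

#include <cassert>
#include <string>
#include <vector>

struct Inst {
  bool Uniform = false;               // what DivergenceAnalysis would report
  bool Conditional = true;            // only conditional branches carry a condition
  std::vector<std::string> Metadata;  // stands in for IR metadata
};

static void setUniformMetadata(Inst &I) {
  I.Metadata.push_back("amdgpu.uniform");
}

static void visitBranch(Inst &Br) {
  if (!Br.Conditional)
    return;                           // nothing to check on unconditional branches
  if (!Br.Uniform)
    return;                           // divergent conditions stay unannotated
  setUniformMetadata(Br);
}

int main() {
  Inst Br;
  Br.Uniform = true;
  visitBranch(Br);
  assert(Br.Metadata.size() == 1 && Br.Metadata[0] == "amdgpu.uniform");
  return 0;
}
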
@@ -72,6 +88,9 @@ bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
}
bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
DA = &getAnalysis<DivergenceAnalysis>();
visit(F);
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 1239dfb235ef..cfe6346fb6b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -28,8 +28,10 @@
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -37,7 +39,9 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "AMDGPURuntimeMetadata.h"
+using namespace ::AMDGPU;
using namespace llvm;
// TODO: This should get the default rounding mode from the kernel. We just set
@@ -61,7 +65,7 @@ using namespace llvm;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget& ST = F.getSubtarget<SISubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -104,10 +108,12 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->EmitDirectiveHSACodeObjectVersion(1, 0);
+ TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
"AMD", "AMDGPU");
+ emitStartOfRuntimeMetadata(M);
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
@@ -132,54 +138,13 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
AsmPrinter::EmitFunctionEntryLabel();
}
-static bool isModuleLinkage(const GlobalValue *GV) {
- switch (GV->getLinkage()) {
- case GlobalValue::InternalLinkage:
- case GlobalValue::CommonLinkage:
- return true;
- case GlobalValue::ExternalLinkage:
- return false;
- default: llvm_unreachable("unknown linkage type");
- }
-}
-
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
- AsmPrinter::EmitGlobalVariable(GV);
- return;
- }
-
- if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
- AsmPrinter::EmitGlobalVariable(GV);
- return;
- }
-
// Group segment variables aren't emitted in HSA.
if (AMDGPU::isGroupSegment(GV))
return;
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- if (isModuleLinkage(GV)) {
- TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
- } else {
- TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
- }
-
- MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV));
- const DataLayout &DL = getDataLayout();
-
- // Emit the size
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
- OutStreamer->PushSection();
- OutStreamer->SwitchSection(
- getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
- const Constant *C = GV->getInitializer();
- OutStreamer->EmitLabel(GVSym);
- EmitGlobalConstant(DL, C);
- OutStreamer->PopSection();
+ AsmPrinter::EmitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -230,6 +195,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);
+ OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
+ false);
+ OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
+ false);
+
+ if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
+ OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+ OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+ }
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
@@ -268,15 +247,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
+ emitRuntimeMetadata(*MF.getFunction());
+
return false;
}
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
unsigned MaxGPR = 0;
bool killPixel = false;
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
- const R600RegisterInfo *RI =
- static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
+ const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+ const R600RegisterInfo *RI = STM.getRegisterInfo();
const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
for (const MachineBasicBlock &MBB : MF) {
@@ -299,23 +279,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
unsigned RsrcReg;
- if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
// Evergreen / Northern Islands
- switch (MFI->getShaderType()) {
+ switch (MF.getFunction()->getCallingConv()) {
default: // Fall through
- case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
- case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
- case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
- case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
}
} else {
// R600 / R700
- switch (MFI->getShaderType()) {
+ switch (MF.getFunction()->getCallingConv()) {
default: // Fall through
- case ShaderType::GEOMETRY: // Fall through
- case ShaderType::COMPUTE: // Fall through
- case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
- case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ case CallingConv::AMDGPU_GS: // Fall through
+ case CallingConv::AMDGPU_CS: // Fall through
+ case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
}
}
@@ -325,23 +305,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
- if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+ OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
}
}
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) const {
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
uint64_t CodeSize = 0;
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
bool FlatUsed = false;
- const SIRegisterInfo *RI =
- static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+ const SIRegisterInfo *RI = STM.getRegisterInfo();
+ const SIInstrInfo *TII = STM.getInstrInfo();
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -351,8 +331,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (MI.isDebugValue())
continue;
- // FIXME: This is reporting 0 for many instructions.
- CodeSize += MI.getDesc().Size;
+ CodeSize += TII->getInstSizeInBytes(MI);
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
@@ -366,6 +345,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned reg = MO.getReg();
switch (reg) {
case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
continue;
@@ -382,17 +363,32 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
FlatUsed = true;
continue;
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("Trap Handler registers should not be used");
+ continue;
+
default:
break;
}
if (AMDGPU::SReg_32RegClass.contains(reg)) {
+ if (AMDGPU::TTMP_32RegClass.contains(reg)) {
+ llvm_unreachable("Trap Handler registers should not be used");
+ }
isSGPR = true;
width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
isSGPR = false;
width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+ if (AMDGPU::TTMP_64RegClass.contains(reg)) {
+ llvm_unreachable("Trap Handler registers should not be used");
+ }
isSGPR = true;
width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(reg)) {
@@ -438,7 +434,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (VCCUsed)
ExtraSGPRs = 2;
- if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
if (FlatUsed)
ExtraSGPRs = 4;
} else {
@@ -451,23 +447,54 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MaxSGPR += ExtraSGPRs;
+ // Record first reserved register and reserved register count fields, and
+ // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
+ // specified.
+ if (STM.debuggerReserveRegs()) {
+ ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
+ ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
+ MaxVGPR += MFI->getDebuggerReservedVGPRCount();
+ }
+
+ // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+ // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+ // attribute was specified.
+ if (STM.debuggerEmitPrologue()) {
+ ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+ ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+ RI->getHWRegIndex(MFI->getScratchRSrcReg());
+ }
+
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
if (STM.hasSGPRInitBug()) {
- if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
+ if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("too many SGPRs used with the SGPR init bug");
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+ "SGPRs with SGPR init bug",
+ ProgInfo.NumSGPR, DS_Error);
+ Ctx.diagnose(Diag);
}
- ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("too many user SGPRs used");
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
+ MFI->NumUserSGPRs, DS_Error);
+ Ctx.diagnose(Diag);
+ }
+
+ if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
+ MFI->LDSSize, DS_Error);
+ Ctx.diagnose(Diag);
}
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
@@ -476,21 +503,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// register.
ProgInfo.FloatMode = getFPMode(MF);
- // XXX: Not quite sure what this does, but sc seems to unset this.
ProgInfo.IEEEMode = 0;
- // Do not clamp NAN to 0.
- ProgInfo.DX10Clamp = 0;
+ // Make clamp modifier on NaN input returns 0.
+ ProgInfo.DX10Clamp = 1;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+ ProgInfo.ScratchSize = FrameInfo->getStackSize();
ProgInfo.FlatUsed = FlatUsed;
ProgInfo.VCCUsed = VCCUsed;
ProgInfo.CodeLen = CodeSize;
unsigned LDSAlignShift;
- if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
@@ -503,7 +529,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
ProgInfo.LDSBlocks =
- RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
@@ -511,8 +537,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
ProgInfo.ScratchBlocks =
- RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
- 1 << ScratchAlignShift) >> ScratchAlignShift;
+ alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
+ 1ULL << ScratchAlignShift) >>
+ ScratchAlignShift;
ProgInfo.ComputePGMRSrc1 =
S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
@@ -544,23 +571,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B84C_EXCP_EN(0);
}
-static unsigned getRsrcReg(unsigned ShaderType) {
- switch (ShaderType) {
+static unsigned getRsrcReg(CallingConv::ID CallConv) {
+ switch (CallConv) {
default: // Fall through
- case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1;
- case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
- case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
- case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+ case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+ case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
}
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
+ unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
- if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
@@ -577,13 +604,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
- if (STM.isVGPRSpillingEnabled(MFI)) {
+ if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
}
}
- if (MFI->getShaderType() == ShaderType::PIXEL) {
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
@@ -591,12 +618,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
+
+ OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
+ OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
+ OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
+ OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
+}
+
+// This is supposed to be log2(Size)
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMD_ELEMENT_4_BYTES;
+ case 8:
+ return AMD_ELEMENT_8_BYTES;
+ case 16:
+ return AMD_ELEMENT_16_BYTES;
+ default:
+ llvm_unreachable("invalid private_element_size");
+ }
}
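
As the comment notes, the switch above is essentially encoding log2 of the private element size for the three sizes the max-private-element-size feature allows. A minimal standalone sketch of that relationship (plain unsigned values rather than the amd_element_byte_size_t enumerators, whose exact numeric values are not shown here):

#include <cassert>

// Return log2(Size) for the supported private element sizes (4, 8 and 16 bytes).
static unsigned elementSizeLog2(unsigned Size) {
  assert((Size == 4 || Size == 8 || Size == 16) && "invalid private_element_size");
  unsigned Log2 = 0;
  while (Size > 1) {
    Size >>= 1;
    ++Log2;
  }
  return Log2;
}

int main() {
  assert(elementSizeLog2(4) == 2 && elementSizeLog2(8) == 3 && elementSizeLog2(16) == 4);
  return 0;
}
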
void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
const SIProgramInfo &KernelInfo) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
amd_kernel_code_t header;
AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
@@ -606,6 +652,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
(KernelInfo.ComputePGMRSrc2 << 32);
header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ AMD_HSA_BITS_SET(header.code_properties,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+ getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
if (MFI->hasPrivateSegmentBuffer()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
@@ -646,6 +697,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ if (STM.debuggerSupported())
+ header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
if (STM.isXNACKEnabled())
header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
@@ -654,9 +708,20 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.workitem_vgpr_count = KernelInfo.NumVGPR;
header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+ header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+ header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+
+ if (STM.debuggerEmitPrologue()) {
+ header.debug_wavefront_private_segment_offset_sgpr =
+ KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ header.debug_private_segment_buffer_sgpr =
+ KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+ }
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
TS->EmitAMDKernelCodeT(header);
}
@@ -680,3 +745,227 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
*TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
return false;
}
+
+// Emit a key and an integer value for runtime metadata.
+static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, uint64_t V,
+ unsigned Size) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(V, Size);
+}
+
+// Emit a key and a string value for runtime metadata.
+static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, StringRef S) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(S.size(), 4);
+ Streamer->EmitBytes(S);
+}
+
+// Emit a key and three integer values for runtime metadata.
+// The three integer values are obtained from MDNode \p Node.
+static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer,
+ RuntimeMD::Key K, MDNode *Node,
+ unsigned Size) {
+ Streamer->EmitIntValue(K, 1);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(0))->getZExtValue(), Size);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(1))->getZExtValue(), Size);
+ Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
+ Node->getOperand(2))->getZExtValue(), Size);
+}
+
+void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
+ OutStreamer->SwitchSection(getObjFileLowering().getContext()
+ .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
+
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
+ RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
+ if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
+ RuntimeMD::OpenCL_C, 1);
+ auto Node = MD->getOperand(0);
+ unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+ ->getZExtValue();
+ unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+ ->getZExtValue();
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
+ Major * 100 + Minor * 10, 2);
+ }
+}
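
The helpers above serialize runtime metadata as a flat byte stream: a one-byte key, then either a fixed-width integer or a four-byte length followed by the raw string bytes. A minimal standalone sketch of that layout under simplified assumptions (a std::vector<uint8_t> buffer stands in for the MC streamer, byte order is little-endian purely for illustration, and the key numbers are placeholders rather than the real RuntimeMD::Key values):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Append Key followed by Size bytes of V.
static void emitInt(std::vector<uint8_t> &Out, uint8_t Key, uint64_t V, unsigned Size) {
  Out.push_back(Key);
  for (unsigned I = 0; I < Size; ++I)
    Out.push_back(uint8_t(V >> (8 * I)));
}

// Append Key, a four-byte length prefix, then the raw string bytes.
static void emitString(std::vector<uint8_t> &Out, uint8_t Key, const std::string &S) {
  Out.push_back(Key);
  uint32_t Len = uint32_t(S.size());
  for (unsigned I = 0; I < 4; ++I)
    Out.push_back(uint8_t(Len >> (8 * I)));
  Out.insert(Out.end(), S.begin(), S.end());
}

int main() {
  std::vector<uint8_t> Blob;
  emitInt(Blob, /*placeholder key*/ 1, /*version word*/ 0x0100, 2);
  emitString(Blob, /*placeholder key*/ 6, "my_kernel");
  std::printf("blob size: %zu bytes\n", Blob.size()); // 1+2 + 1+4+9 = 17
  return 0;
}
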
+
+static std::string getOCLTypeName(Type *Ty, bool isSigned) {
+ if (VectorType* VecTy = dyn_cast<VectorType>(Ty)) {
+ Type* EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getVectorNumElements();
+ return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str();
+ }
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID: return "half";
+ case Type::FloatTyID: return "float";
+ case Type::DoubleTyID: return "double";
+ case Type::IntegerTyID: {
+ if (!isSigned)
+ return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str();
+ auto IntTy = cast<IntegerType>(Ty);
+ auto BW = IntTy->getIntegerBitWidth();
+ switch (BW) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BW)).str();
+ }
+ }
+ default:
+ llvm_unreachable("invalid type");
+ }
+}
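
The type-name mapping above reproduces OpenCL's spelling of argument types: scalar integers get their OpenCL names, unsigned variants gain a 'u' prefix, and vectors append their element count, so an unsigned i32 becomes "uint" and a <4 x float> becomes "float4". A minimal standalone sketch of the same scheme on a toy type descriptor (ToyType and oclTypeName are illustrative stand-ins covering only floats and integers):

#include <cstdio>
#include <string>

struct ToyType {
  unsigned BitWidth;  // 0 for float, otherwise integer width in bits
  unsigned VecElems;  // 0 for scalar types
};

static std::string oclTypeName(const ToyType &T, bool IsSigned) {
  std::string Base;
  if (T.BitWidth == 0)
    Base = "float";
  else {
    switch (T.BitWidth) {
    case 8:  Base = "char";  break;
    case 16: Base = "short"; break;
    case 32: Base = "int";   break;
    case 64: Base = "long";  break;
    default: Base = "i" + std::to_string(T.BitWidth); break;
    }
    if (!IsSigned)
      Base = "u" + Base;                   // unsigned types get a 'u' prefix
  }
  if (T.VecElems)
    Base += std::to_string(T.VecElems);    // vectors append the element count
  return Base;
}

int main() {
  ToyType UInt{32, 0};
  ToyType Float4{0, 4};
  // Prints "uint float4", mirroring the naming scheme above.
  std::printf("%s %s\n", oclTypeName(UInt, false).c_str(),
              oclTypeName(Float4, true).c_str());
  return 0;
}
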
+
+static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType(
+ Type *Ty, StringRef TypeName) {
+ if (auto VT = dyn_cast<VectorType>(Ty))
+ return getRuntimeMDValueType(VT->getElementType(), TypeName);
+ else if (auto PT = dyn_cast<PointerType>(Ty))
+ return getRuntimeMDValueType(PT->getElementType(), TypeName);
+ else if (Ty->isHalfTy())
+ return RuntimeMD::KernelArg::F16;
+ else if (Ty->isFloatTy())
+ return RuntimeMD::KernelArg::F32;
+ else if (Ty->isDoubleTy())
+ return RuntimeMD::KernelArg::F64;
+ else if (IntegerType* intTy = dyn_cast<IntegerType>(Ty)) {
+ bool Signed = !TypeName.startswith("u");
+ switch (intTy->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8;
+ case 16:
+ return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16;
+ case 32:
+ return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32;
+ case 64:
+ return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64;
+ default:
+ // Runtime does not recognize other integer types. Report as
+ // struct type.
+ return RuntimeMD::KernelArg::Struct;
+ }
+ } else
+ return RuntimeMD::KernelArg::Struct;
+}
+
+void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) {
+ if (!F.getMetadata("kernel_arg_type"))
+ return;
+
+ MCContext &Context = getObjFileLowering().getContext();
+ OutStreamer->SwitchSection(
+ Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
+ OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1);
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName());
+
+ for (auto &Arg:F.args()) {
+ // Emit KeyArgBegin.
+ unsigned I = Arg.getArgNo();
+ OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1);
+
+ // Emit KeyArgSize and KeyArgAlign.
+ auto T = Arg.getType();
+ auto DL = F.getParent()->getDataLayout();
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize,
+ DL.getTypeAllocSize(T), 4);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign,
+ DL.getABITypeAlignment(T), 4);
+
+ // Emit KeyArgTypeName.
+ auto TypeName = dyn_cast<MDString>(F.getMetadata(
+ "kernel_arg_type")->getOperand(I))->getString();
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName);
+
+ // Emit KeyArgName.
+ if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) {
+ auto ArgName = cast<MDString>(ArgNameMD->getOperand(
+ I))->getString();
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName);
+ }
+
+ // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe.
+ auto TypeQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_type_qual")->getOperand(I))->getString();
+ SmallVector<StringRef, 1> SplitQ;
+ TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/);
+ for (auto &I:SplitQ) {
+ auto Key = StringSwitch<RuntimeMD::Key>(I)
+ .Case("volatile", RuntimeMD::KeyArgIsVolatile)
+ .Case("restrict", RuntimeMD::KeyArgIsRestrict)
+ .Case("const", RuntimeMD::KeyArgIsConst)
+ .Case("pipe", RuntimeMD::KeyArgIsPipe)
+ .Default(RuntimeMD::KeyNull);
+ OutStreamer->EmitIntValue(Key, 1);
+ }
+
+ // Emit KeyArgTypeKind.
+ auto BaseTypeName = cast<MDString>(
+ F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString();
+ auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName)
+ .Case("sampler_t", RuntimeMD::KernelArg::Sampler)
+ .Case("queue_t", RuntimeMD::KernelArg::Queue)
+ .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
+ "image2d_t" , "image2d_array_t", RuntimeMD::KernelArg::Image)
+ .Cases("image2d_depth_t", "image2d_array_depth_t",
+ "image2d_msaa_t", "image2d_array_msaa_t",
+ "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image)
+ .Cases("image2d_array_msaa_depth_t", "image3d_t",
+ RuntimeMD::KernelArg::Image)
+ .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer :
+ RuntimeMD::KernelArg::Value);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1);
+
+ // Emit KeyArgValueType.
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType,
+ getRuntimeMDValueType(T, BaseTypeName), 2);
+
+ // Emit KeyArgAccQual.
+ auto AccQual = cast<MDString>(F.getMetadata(
+ "kernel_arg_access_qual")->getOperand(I))->getString();
+ auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual)
+ .Case("read_only", RuntimeMD::KernelArg::ReadOnly)
+ .Case("write_only", RuntimeMD::KernelArg::WriteOnly)
+ .Case("read_write", RuntimeMD::KernelArg::ReadWrite)
+ .Default(RuntimeMD::KernelArg::None);
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual,
+ AQ, 1);
+
+ // Emit KeyArgAddrQual.
+ if (isa<PointerType>(T))
+ emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual,
+ T->getPointerAddressSpace(), 1);
+
+ // Emit KeyArgEnd
+ OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1);
+ }
+
+ // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint.
+ if (auto RWGS = F.getMetadata("reqd_work_group_size"))
+ emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize,
+ RWGS, 4);
+ if (auto WGSH = F.getMetadata("work_group_size_hint"))
+ emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint,
+ WGSH, 4);
+ if (auto VTH = F.getMetadata("vec_type_hint")) {
+ auto TypeName = getOCLTypeName(cast<ValueAsMetadata>(
+ VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
+ VTH->getOperand(1))->getZExtValue());
+ emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint,
+ TypeName);
+ }
+
+ // Emit KeyKernelEnd
+ OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1);
+}
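
The argument loop above drives everything off the OpenCL kernel metadata the front end attaches; for instance, "kernel_arg_type_qual" holds a space-separated qualifier list such as "const volatile", and each recognized qualifier is emitted as its own one-byte key. A minimal standalone sketch of that qualifier split (plain standard-library parsing; the real key constants presumably come from the AMDGPURuntimeMetadata.h header included above and are not reproduced here):

#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Example of what the "kernel_arg_type_qual" metadata string may contain.
  std::string TypeQual = "const volatile";
  std::istringstream SS(TypeQual);
  std::string Q;
  while (SS >> Q) {                  // split on whitespace, dropping empty entries
    if (Q == "volatile" || Q == "restrict" || Q == "const" || Q == "pipe")
      std::cout << "emit key for qualifier: " << Q << "\n";
  }
  return 0;
}
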
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 99d4091670fe..7b04c539520d 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -12,15 +12,15 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
-#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#include "llvm/CodeGen/AsmPrinter.h"
#include <vector>
namespace llvm {
-class AMDGPUAsmPrinter : public AsmPrinter {
+class AMDGPUAsmPrinter final : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
@@ -40,6 +40,10 @@ private:
NumVGPR(0),
NumSGPR(0),
FlatUsed(false),
+ ReservedVGPRFirst(0),
+ ReservedVGPRCount(0),
+ DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+ DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
VCCUsed(false),
CodeLen(0) {}
@@ -67,6 +71,20 @@ private:
uint32_t LDSSize;
bool FlatUsed;
+ // If ReservedVGPRCount is 0, this must be 0. Otherwise, this is the first
+ // fixed VGPR number reserved.
+ uint16_t ReservedVGPRFirst;
+ // The number of consecutive VGPRs reserved.
+ uint16_t ReservedVGPRCount;
+
+ // Fixed SGPR number used to hold wave scratch offset for entire kernel
+ // execution, or uint16_t(-1) if the register is not used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+ // kernel execution, or uint16_t(-1) if the register is not used or not
+ // known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR;
+
// Bonus information for debugging.
bool VCCUsed;
uint64_t CodeLen;
@@ -109,6 +127,10 @@ public:
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) override;
+ void emitStartOfRuntimeMetadata(const Module &M);
+
+ void emitRuntimeMetadata(const Function &F);
+
protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
new file mode 100644
index 000000000000..1a1da8a254a7
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
+ : CallLowering(&TLI) {
+}
+
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ return true;
+}
+
+bool AMDGPUCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const {
+ // TODO: Implement once there are generic loads/stores.
+ return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
new file mode 100644
index 000000000000..61174bacdac3
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -0,0 +1,36 @@
+//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering;
+
+class AMDGPUCallLowering: public CallLowering {
+ public:
+ AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ unsigned VReg) const override;
+ bool
+ lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function::ArgumentListType &Args,
+ const SmallVectorImpl<unsigned> &VRegs) const override;
+};
+} // End of namespace llvm
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index b0db26124a0c..47dfa4992068 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -110,21 +110,19 @@ def CC_R600 : CallingConv<[
// Calling convention for compute kernels
def CC_AMDGPU_Kernel : CallingConv<[
- CCCustom<"allocateStack">
+ CCCustom<"allocateKernArg">
]>;
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
- "->getShaderType() == ShaderType::COMPUTE",
+ "!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
- "->getShaderType() == ShaderType::COMPUTE",
+ "!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..3b415774df49
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,82 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+ public InstVisitor<AMDGPUCodeGenPrepare> {
+ DivergenceAnalysis *DA;
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+ AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
+ FunctionPass(ID),
+ TM(TM) { }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ const char *getPassName() const override {
+ return "AMDGPU IR optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+};
+
+} // End anonymous namespace
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+ if (!TM || skipFunction(F))
+ return false;
+
+ DA = &getAnalysis<DivergenceAnalysis>();
+ visit(F);
+
+ return true;
+}
+
+INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+ return new AMDGPUCodeGenPrepare(TM);
+}
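
A hedged usage sketch (assumed, not taken from the patch): running the new IR pass over a module with the legacy pass manager. The declaration of createAMDGPUCodeGenPreparePass is assumed to live in AMDGPU.h.

    #include "AMDGPU.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"

    // Illustrative driver: schedule AMDGPUCodeGenPrepare on its own.
    // DivergenceAnalysis is pulled in automatically via getAnalysisUsage().
    static void runAMDGPUCodeGenPrepare(llvm::Module &M,
                                        const llvm::TargetMachine *TM) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createAMDGPUCodeGenPreparePass(TM));
      PM.run(M);
    }
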
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
deleted file mode 100644
index 2f6b3022dd6e..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUDiagnosticInfoUnsupported.h"
-
-using namespace llvm;
-
-DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
- const Function &Fn,
- const Twine &Desc,
- DiagnosticSeverity Severity)
- : DiagnosticInfo(getKindID(), Severity),
- Description(Desc),
- Fn(Fn) { }
-
-int DiagnosticInfoUnsupported::KindID = 0;
-
-void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
- DP << "unsupported " << getDescription() << " in " << Fn.getName();
-}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
deleted file mode 100644
index 0fd37e1ede6b..000000000000
--- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
-
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-
-namespace llvm {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
- const Twine &Description;
- const Function &Fn;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
- DiagnosticSeverity Severity = DS_Error);
-
- const Function &getFunction() const { return Fn; }
- const Twine &getDescription() const { return Description; }
-
- void print(DiagnosticPrinter &DP) const override;
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-}
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 4d84d281d998..bbc28b885721 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -7,12 +7,13 @@
//
//==-----------------------------------------------------------------------===//
//
-// Interface to describe a layout of a stack frame on a AMDIL target machine
+// Interface to describe the layout of a stack frame on an AMDGPU target machine.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
-#include "R600MachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Instructions.h"
@@ -57,7 +58,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
// T2.Y = stack[1].y
// T3.X = stack[1].z
// T3.Y = stack[1].w
- //
+ //
// StackWidth = 4:
// T0.X = stack[0].x
// T0.Y = stack[0].y
@@ -75,7 +76,8 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ const AMDGPURegisterInfo *RI
+ = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
// Fill in FrameReg output argument.
FrameReg = RI->getFrameRegister(MF);
@@ -87,32 +89,16 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+ OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i));
OffsetBytes += MFI->getObjectSize(i);
// Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
+ OffsetBytes = alignTo(OffsetBytes, 4);
}
if (FI != -1)
- OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+ OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI));
return OffsetBytes / (getStackWidth(MF) * 4);
}
-const TargetFrameLowering::SpillSlot *
-AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
- NumEntries = 0;
- return nullptr;
-}
-void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {}
-void
-AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
-}
-
-bool
-AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
- return false;
-}
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 257a3da40589..513848a1d887 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -32,13 +32,13 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
+
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- const SpillSlot *
- getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
- void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- bool hasFP(const MachineFunction &MF) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false;
+ }
};
} // namespace llvm
#endif
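
A compact, self-contained sketch (illustrative types, not the real MachineFrameInfo API) of the byte-offset-to-register-index arithmetic performed by getFrameIndexReference in the AMDGPUFrameLowering.cpp hunk above: align each preceding object, pad to 4 bytes so two objects never share a register, then divide by StackWidth * 4.

    #include <cassert>
    #include <cstdint>

    struct FrameObject { uint64_t Size, Align; }; // stand-in for MFI data

    static uint64_t alignUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

    // Mirrors the loop in AMDGPUFrameLowering::getFrameIndexReference for
    // non-negative frame indices.
    static uint64_t frameIndexToRegIndex(const FrameObject *Objs, int FI,
                                         unsigned StackWidth) {
      uint64_t OffsetBytes = 0;
      for (int I = 0; I < FI; ++I) {
        OffsetBytes = alignUp(OffsetBytes, Objs[I].Align);
        OffsetBytes += Objs[I].Size;
        OffsetBytes = alignUp(OffsetBytes, 4); // one object per 4-byte register
      }
      OffsetBytes = alignUp(OffsetBytes, Objs[FI].Align);
      return OffsetBytes / (StackWidth * 4);
    }

    int main() {
      FrameObject Objs[] = {{4, 4}, {8, 8}, {4, 4}};
      assert(frameIndexToRegIndex(Objs, 1, 1) == 2); // second object: 8 bytes in
      assert(frameIndexToRegIndex(Objs, 2, 1) == 4); // third object: 16 bytes in
      return 0;
    }
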
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b33040b4d06a..23c9352ce273 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,30 +12,44 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-#include "R600InstrInfo.h"
-#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/DiagnosticInfo.h"
using namespace llvm;
+namespace llvm {
+class R600InstrInfo;
+}
+
//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
namespace {
+
+static bool isCBranchSCC(const SDNode *N) {
+ assert(N->getOpcode() == ISD::BRCOND);
+ if (!N->hasOneUse())
+ return false;
+
+ SDValue Cond = N->getOperand(1);
+ if (Cond.getOpcode() == ISD::CopyToReg)
+ Cond = Cond.getOperand(2);
+ return Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse();
+}
+
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -47,7 +61,7 @@ public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
bool runOnMachineFunction(MachineFunction &MF) override;
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
const char *getPassName() const override;
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
@@ -59,28 +73,8 @@ private:
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
- // Complex pattern selectors
- bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
- bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
- bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
-
- static bool checkType(const Value *ptr, unsigned int addrspace);
- static bool checkPrivateAddress(const MachineMemOperand *Op);
-
- static bool isGlobalStore(const StoreSDNode *N);
- static bool isFlatStore(const StoreSDNode *N);
- static bool isPrivateStore(const StoreSDNode *N);
- static bool isLocalStore(const StoreSDNode *N);
- static bool isRegionStore(const StoreSDNode *N);
-
- bool isCPLoad(const LoadSDNode *N) const;
- bool isConstantLoad(const LoadSDNode *N, int cbID) const;
- bool isGlobalLoad(const LoadSDNode *N) const;
- bool isFlatLoad(const LoadSDNode *N) const;
- bool isParamLoad(const LoadSDNode *N) const;
- bool isPrivateLoad(const LoadSDNode *N) const;
- bool isLocalLoad(const LoadSDNode *N) const;
- bool isRegionLoad(const LoadSDNode *N) const;
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool isUniformBr(const SDNode *N) const;
SDNode *glueCopyToM0(SDNode *N) const;
@@ -111,7 +105,20 @@ private:
SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset, SDValue &GLC) const;
+ SDValue &Offset, SDValue &SLC) const;
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset) const;
+ bool SelectMUBUFConstant(SDValue Constant,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
+ SDValue &ImmOffset, SDValue &VOffset) const;
+
+ bool SelectFlat(SDValue Addr, SDValue &VAddr,
+ SDValue &SLC, SDValue &TFE) const;
+
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
@@ -122,7 +129,7 @@ private:
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
- SDNode *SelectAddrSpaceCast(SDNode *N);
+ bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -136,13 +143,15 @@ private:
SDValue &Clamp,
SDValue &Omod) const;
- SDNode *SelectADD_SUB_I64(SDNode *N);
- SDNode *SelectDIV_SCALE(SDNode *N);
+ void SelectADD_SUB_I64(SDNode *N);
+ void SelectDIV_SCALE(SDNode *N);
- SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
+ SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);
- SDNode *SelectS_BFEFromShifts(SDNode *N);
- SDNode *SelectS_BFE(SDNode *N);
+ void SelectS_BFEFromShifts(SDNode *N);
+ void SelectS_BFE(SDNode *N);
+ void SelectBRCOND(SDNode *N);
+ void SelectATOMIC_CMP_SWAP(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
@@ -159,7 +168,7 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
: SelectionDAGISel(TM) {}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+ Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -207,64 +216,9 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-bool AMDGPUDAGToDAGISel::SelectADDRParam(
- SDValue Addr, SDValue& R1, SDValue& R2) {
-
- if (Addr.getOpcode() == ISD::FrameIndex) {
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- }
- } else if (Addr.getOpcode() == ISD::ADD) {
- R1 = Addr.getOperand(0);
- R2 = Addr.getOperand(1);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- }
- return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress) {
- return false;
- }
- return SelectADDRParam(Addr, R1, R2);
-}
-
-
-bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
- if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress) {
- return false;
- }
-
- if (Addr.getOpcode() == ISD::FrameIndex) {
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- }
- } else if (Addr.getOpcode() == ISD::ADD) {
- R1 = Addr.getOperand(0);
- R2 = Addr.getOperand(1);
- } else {
- R1 = Addr;
- R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
- }
- return true;
-}
-
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
- AMDGPUAS::LOCAL_ADDRESS))
+ cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
return N;
const SITargetLowering& Lowering =
@@ -304,14 +258,15 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
llvm_unreachable("invalid vector size");
}
-SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
- if (isa<AtomicSDNode>(N))
+ if (isa<AtomicSDNode>(N) ||
+ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
N = glueCopyToM0(N);
switch (Opc) {
@@ -325,7 +280,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- return SelectADD_SUB_I64(N);
+ SelectADD_SUB_I64(N);
+ return;
}
case ISD::SCALAR_TO_VECTOR:
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
@@ -359,8 +315,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
if (NumVectorElts == 1) {
- return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
- N->getOperand(0), RegClass);
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+ RegClass);
+ return;
}
assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
@@ -400,8 +357,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (!IsRegSeq)
break;
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
- RegSeqArgs);
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+ return;
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
@@ -422,8 +379,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
N->getOperand(1), SubReg1 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- DL, N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
}
case ISD::Constant:
@@ -452,8 +410,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
};
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
- N->getValueType(0), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops));
+ return;
}
case ISD::LOAD:
case ISD::STORE: {
@@ -487,11 +446,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
- N->getOperand(0), OffsetVal, WidthVal);
+ ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ return;
}
case AMDGPUISD::DIV_SCALE: {
- return SelectDIV_SCALE(N);
+ SelectDIV_SCALE(N);
+ return;
}
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
@@ -499,139 +460,48 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
- case ISD::ADDRSPACECAST:
- return SelectAddrSpaceCast(N);
case ISD::AND:
case ISD::SRL:
case ISD::SRA:
+ case ISD::SIGN_EXTEND_INREG:
if (N->getValueType(0) != MVT::i32 ||
Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- return SelectS_BFE(N);
+ SelectS_BFE(N);
+ return;
+ case ISD::BRCOND:
+ SelectBRCOND(N);
+ return;
+
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ SelectATOMIC_CMP_SWAP(N);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
- assert(AS != 0 && "Use checkPrivateAddress instead.");
- if (!Ptr)
+bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
return false;
-
- return Ptr->getType()->getPointerAddressSpace() == AS;
-}
-
-bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
- if (Op->getPseudoValue())
- return true;
-
- if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
- return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
-
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
- const Value *MemVal = N->getMemOperand()->getValue();
- return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
-}
-
-bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
- const Value *MemVal = N->getMemOperand()->getValue();
if (CbId == -1)
- return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
- return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}
-bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
- if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- N->getMemoryVT().bitsLT(MVT::i32))
- return true;
-
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
- return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
- MachineMemOperand *MMO = N->getMemOperand();
- if (checkPrivateAddress(N->getMemOperand())) {
- if (MMO) {
- const PseudoSourceValue *PSV = MMO->getPseudoValue();
- if (PSV && PSV->isConstantPool()) {
- return true;
- }
- }
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
- if (checkPrivateAddress(N->getMemOperand())) {
- // Check to make sure we are not a constant pool load or a constant load
- // that is marked as a private load
- if (isCPLoad(N) || isConstantLoad(N, -1)) {
- return false;
- }
- }
-
- const Value *MemVal = N->getMemOperand()->getValue();
- if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
- return true;
- }
- return false;
+bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
+ const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+ const Instruction *Term = BB->getTerminator();
+ return Term->getMetadata("amdgpu.uniform") ||
+ Term->getMetadata("structurizecfg.uniform");
}
const char *AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
-#ifdef DEBUGTMP
-#undef INT64_C
-#endif
-#undef DEBUGTMP
-
//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//
@@ -705,7 +575,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
-SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -728,7 +598,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
-
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
@@ -745,12 +614,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDValue(AddHi,0),
Sub1,
};
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
}
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
-SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SDLoc SL(N);
EVT VT = N->getValueType(0);
@@ -766,7 +635,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
- return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+ CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
}
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
@@ -786,6 +655,7 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
+ SDLoc DL(Addr);
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
@@ -793,7 +663,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
// (add n0, c0)
Base = N0;
- Offset = N1;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
} else if (Addr.getOpcode() == ISD::SUB) {
@@ -801,7 +671,6 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
if (isUInt<16>(ByteOffset)) {
- SDLoc DL(Addr);
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
// XXX - This is kind of hacky. Create a dummy sub node so we can check
@@ -816,7 +685,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
Base = SDValue(MachineSub, 0);
- Offset = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
return true;
}
}
@@ -834,7 +703,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
- Offset = Addr;
+ Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
}
@@ -932,8 +801,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDLoc DL(Addr);
- GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ if (!GLC.getNode())
+ GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ if (!SLC.getNode())
+ SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -961,9 +832,11 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
}
if (isLegalMUBUFImmOffset(C1)) {
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- } else if (isUInt<32>(C1->getZExtValue())) {
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
+
+ if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
@@ -1045,14 +918,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
+
// Offsets in vaddr must be positive.
- if (CurDAG->SignBitIsZero(N0)) {
- ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isLegalMUBUFImmOffset(C1)) {
- VAddr = N0;
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- }
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ if (isLegalMUBUFImmOffset(C1)) {
+ VAddr = N0;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
}
}
@@ -1091,13 +963,118 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset,
+                                           SDValue &Offset) const {
+ SDValue GLC, SLC, TFE;
+
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+}
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
- SDValue &GLC) const {
- SDValue SLC, TFE;
+ SDValue &SLC) const {
+ SDValue GLC, TFE;
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}
+bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Constant);
+ uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
+ uint32_t Overflow = 0;
+
+ if (Imm >= 4096) {
+ if (Imm <= 4095 + 64) {
+ // Use an SOffset inline constant for 1..64
+ Overflow = Imm - 4095;
+ Imm = 4095;
+ } else {
+ // Try to keep the same value in SOffset for adjacent loads, so that
+ // the corresponding register contents can be re-used.
+ //
+ // Load values with all low-bits set into SOffset, so that a larger
+      // Put a value with all of its low bits set into SOffset, so that a
+      // larger range of values can be covered using s_movk_i32.
+ uint32_t Low = (Imm + 1) & 4095;
+ Imm = Low;
+ Overflow = High - 1;
+ }
+ }
+
+ // There is a hardware bug in SI and CI which prevents address clamping in
+ // MUBUF instructions from working correctly with SOffsets. The immediate
+ // offset is unaffected.
+ if (Overflow > 0 &&
+ Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+
+ if (Overflow <= 64)
+ SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
+ else
+ SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
+ 0);
+
+ return true;
+}
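
A standalone sketch of the SOffset/ImmOffset split that SelectMUBUFConstant performs above; this shows only the arithmetic, with illustrative names, and omits the SI/CI SOffset-clamping bug check and the S_MOV_B32 materialization.

    #include <cassert>
    #include <cstdint>

    struct MUBUFOffsets { uint32_t SOffset, ImmOffset; };

    // Same arithmetic as SelectMUBUFConstant: keep at most 4095 in the
    // immediate field and spill the rest into SOffset.
    static MUBUFOffsets splitMUBUFOffset(uint32_t Imm) {
      uint32_t Overflow = 0;
      if (Imm >= 4096) {
        if (Imm <= 4095 + 64) {
          // A small overflow (1..64) fits in an SOffset inline constant.
          Overflow = Imm - 4095;
          Imm = 4095;
        } else {
          // Leave all low bits set in SOffset so adjacent loads can share it.
          uint32_t High = (Imm + 1) & ~4095u;
          uint32_t Low = (Imm + 1) & 4095u;
          Imm = Low;
          Overflow = High - 1;
        }
      }
      return {Overflow, Imm};
    }

    int main() {
      MUBUFOffsets A = splitMUBUFOffset(4100); // SOffset = 5,    ImmOffset = 4095
      MUBUFOffsets B = splitMUBUFOffset(5000); // SOffset = 4095, ImmOffset = 905
      assert(A.SOffset == 5 && A.ImmOffset == 4095);
      assert(B.SOffset == 4095 && B.ImmOffset == 905);
      assert(A.SOffset + A.ImmOffset == 4100 && B.SOffset + B.ImmOffset == 5000);
      return 0;
    }
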
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
+ SDValue &SOffset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Offset);
+
+ if (!isa<ConstantSDNode>(Offset))
+ return false;
+
+ return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
+ SDValue &SOffset,
+ SDValue &ImmOffset,
+ SDValue &VOffset) const {
+ SDLoc DL(Offset);
+
+ // Don't generate an unnecessary voffset for constant offsets.
+ if (isa<ConstantSDNode>(Offset)) {
+ SDValue Tmp1, Tmp2;
+
+ // When necessary, use a voffset in <= CI anyway to work around a hardware
+ // bug.
+ if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
+ SelectMUBUFConstant(Offset, Tmp1, Tmp2))
+ return false;
+ }
+
+ if (CurDAG->isBaseWithConstantOffset(Offset)) {
+ SDValue N0 = Offset.getOperand(0);
+ SDValue N1 = Offset.getOperand(1);
+ if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
+ SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
+ VOffset = N0;
+ return true;
+ }
+ }
+
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ VOffset = Offset;
+
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &SLC,
+ SDValue &TFE) const {
+ VAddr = Addr;
+ TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ return true;
+}
+
///
/// \param EncodedOffset This is the immediate value that will be encoded
/// directly into the instruction. On SI/CI the \p EncodedOffset
@@ -1213,71 +1190,33 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
!isa<ConstantSDNode>(Offset);
}
-// FIXME: This is incorrect and only enough to be able to compile.
-SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
- AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
- SDLoc DL(N);
-
- const MachineFunction &MF = CurDAG->getMachineFunction();
- DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
- "addrspacecast not implemented");
- CurDAG->getContext()->diagnose(NotImplemented);
-
- assert(Subtarget->hasFlatAddressSpace() &&
- "addrspacecast only supported with flat address space!");
-
- assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
- "Can only cast to / from flat address space!");
-
- // The flat instructions read the address as the index of the VGPR holding the
- // address, so casting should just be reinterpreting the base VGPR, so just
- // insert trunc / bitcast / zext.
-
- SDValue Src = ASC->getOperand(0);
- EVT DestVT = ASC->getValueType(0);
- EVT SrcVT = Src.getValueType();
-
- unsigned SrcSize = SrcVT.getSizeInBits();
- unsigned DestSize = DestVT.getSizeInBits();
-
- if (SrcSize > DestSize) {
- assert(SrcSize == 64 && DestSize == 32);
- return CurDAG->getMachineNode(
- TargetOpcode::EXTRACT_SUBREG,
- DL,
- DestVT,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
- }
-
- if (DestSize > SrcSize) {
- assert(SrcSize == 32 && DestSize == 64);
-
- // FIXME: This is probably wrong, we should never be defining
- // a register class with both VGPRs and SGPRs
- SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
- MVT::i32);
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+ SDValue &Base,
+ SDValue &Offset) const {
+ SDLoc DL(Index);
- const SDValue Ops[] = {
- RC,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(0, DL, MVT::i32)), 0),
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
+ if (CurDAG->isBaseWithConstantOffset(Index)) {
+ SDValue N0 = Index.getOperand(0);
+ SDValue N1 = Index.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- DL, N->getValueType(0), Ops);
+ // (add n0, c0)
+ Base = N0;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+ return true;
}
- assert(SrcSize == 64 && DestSize == 64);
- return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+ if (isa<ConstantSDNode>(Index))
+ return false;
+
+ Base = Index;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ return true;
}
-SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
- uint32_t Offset, uint32_t Width) {
+SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+ SDValue Val, uint32_t Offset,
+ uint32_t Width) {
// Transformation function, pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
@@ -1287,7 +1226,7 @@ SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
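
The packing format described in the comment above, checked in a tiny standalone snippet (illustrative helper name): bits [5:0] of the second S_BFE source carry the offset and bits [22:16] carry the width.

    #include <cassert>
    #include <cstdint>

    // Same encoding getS_BFE builds for the second S_BFE_I32/S_BFE_U32 source.
    static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
      return (Offset & 0x3f) | ((Width & 0x7f) << 16);
    }

    int main() {
      // "sext_inreg (srl x, 16), i8" becomes "bfe_i32 x, 16, 8" further below,
      // so the packed operand is 16 | (8 << 16).
      assert(packBFEOperand(16, 8) == 0x00080010u);
      return 0;
    }
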
-SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
// "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
// "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
// Predicate: 0 < b <= c < 32
@@ -1304,14 +1243,15 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
bool Signed = N->getOpcode() == ISD::SRA;
unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
- return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0),
- CVal - BVal, 32 - CVal);
+ ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+ 32 - CVal));
+ return;
}
}
- return SelectCode(N);
+ SelectCode(N);
}
-SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
switch (N->getOpcode()) {
case ISD::AND:
if (N->getOperand(0).getOpcode() == ISD::SRL) {
@@ -1328,8 +1268,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
- return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0),
- ShiftVal, WidthVal);
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ Srl.getOperand(0), ShiftVal, WidthVal));
+ return;
}
}
}
@@ -1349,20 +1290,139 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
- return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0),
- ShiftVal, WidthVal);
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+ And.getOperand(0), ShiftVal, WidthVal));
+ return;
}
}
- } else if (N->getOperand(0).getOpcode() == ISD::SHL)
- return SelectS_BFEFromShifts(N);
+ } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
break;
case ISD::SRA:
- if (N->getOperand(0).getOpcode() == ISD::SHL)
- return SelectS_BFEFromShifts(N);
+ if (N->getOperand(0).getOpcode() == ISD::SHL) {
+ SelectS_BFEFromShifts(N);
+ return;
+ }
break;
+
+ case ISD::SIGN_EXTEND_INREG: {
+ // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
+ SDValue Src = N->getOperand(0);
+ if (Src.getOpcode() != ISD::SRL)
+ break;
+
+ const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
+ if (!Amt)
+ break;
+
+ unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+ ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+ Amt->getZExtValue(), Width));
+ return;
+ }
}
- return SelectCode(N);
+ SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
+ SDValue Cond = N->getOperand(1);
+
+ if (isCBranchSCC(N)) {
+ // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+ SelectCode(N);
+ return;
+ }
+
+  // The result of VOPC instructions is or'd against ~EXEC before it is
+  // written to vcc or another SGPR. This means that the value '1' is always
+  // written to the corresponding bit for results in masked-off lanes. To
+  // check correctly against vccz, we need to AND VCC with the EXEC register
+  // so that the bits for the masked lanes are cleared.
+
+ SDLoc SL(N);
+
+ SDNode *MaskedCond =
+ CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+ CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+ Cond);
+ SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
+ SDValue(MaskedCond, 0),
+ SDValue()); // Passing SDValue() adds a
+ // glue output.
+ CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+ N->getOperand(2), // Basic Block
+ VCC.getValue(0), // Chain
+ VCC.getValue(1)); // Glue
+ return;
+}
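
A small arithmetic illustration (not part of the patch) of why the S_AND_B64 with EXEC above is needed: as the comment says, a VOPC result carries 1s in every inactive lane, so testing "all bits zero" (vccz) is only meaningful after masking with EXEC.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Exec = 0x00000000000000ffULL; // only the low 8 lanes are active
      uint64_t Cmp  = 0;                     // compare is false in every active lane
      uint64_t Vcc  = Cmp | ~Exec;           // what a VOPC effectively writes
      assert(Vcc != 0);                      // raw VCC is non-zero, so vccz would not fire
      assert((Vcc & Exec) == 0);             // after ANDing with EXEC it is zero, as expected
      return 0;
    }
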
+
+// This is here because there isn't a way to use the generated sub0_sub1 as the
+// subreg index to EXTRACT_SUBREG in tablegen.
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
+ MemSDNode *Mem = cast<MemSDNode>(N);
+ unsigned AS = Mem->getAddressSpace();
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ SelectCode(N);
+ return;
+ }
+
+ MVT VT = N->getSimpleValueType(0);
+ bool Is32 = (VT == MVT::i32);
+ SDLoc SL(N);
+
+ MachineSDNode *CmpSwap = nullptr;
+ if (Subtarget->hasAddr64()) {
+ SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC;
+
+ if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+ SDValue CmpVal = Mem->getOperand(2);
+
+ // XXX - Do we care about glue operands?
+
+ SDValue Ops[] = {
+ CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ };
+
+ CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+ }
+ }
+
+ if (!CmpSwap) {
+ SDValue SRsrc, SOffset, Offset, SLC;
+ if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+ unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+
+ SDValue CmpVal = Mem->getOperand(2);
+ SDValue Ops[] = {
+ CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ };
+
+ CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+ }
+ }
+
+ if (!CmpSwap) {
+ SelectCode(N);
+ return;
+ }
+
+ MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
+ *MMOs = Mem->getMemOperand();
+ CmpSwap->setMemRefs(MMOs, MMOs + 1);
+
+ unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ SDValue Extract
+ = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
+
+ ReplaceUses(SDValue(N, 0), Extract);
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
+ CurDAG->RemoveDeadNode(N);
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
@@ -1432,62 +1492,59 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- bool Modified = false;
-
- // XXX - Other targets seem to be able to do this without a worklist.
- SmallVector<LoadSDNode *, 8> LoadsToReplace;
- SmallVector<StoreSDNode *, 8> StoresToReplace;
-
- for (SDNode &Node : CurDAG->allnodes()) {
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
- EVT VT = LD->getValueType(0);
- if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
- continue;
-
- // To simplify the TableGen patters, we replace all i64 loads with v2i32
- // loads. Alternatively, we could promote i64 loads to v2i32 during DAG
- // legalization, however, so places (ExpandUnalignedLoad) in the DAG
- // legalizer assume that if i64 is legal, so doing this promotion early
- // can cause problems.
- LoadsToReplace.push_back(LD);
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
- // Handle i64 stores here for the same reason mentioned above for loads.
- SDValue Value = ST->getValue();
- if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
- continue;
- StoresToReplace.push_back(ST);
+ MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
+
+ // Handle the perverse case where a frame index is being stored. We don't
+ // want to see multiple frame index operands on the same instruction since
+ // it complicates things and violates some assumptions about frame index
+ // lowering.
+ for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
+ I != E; ++I) {
+ SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
+
+ // It's possible that we have a frame index defined in the function that
+ // isn't used in this block.
+ if (FI.use_empty())
+ continue;
+
+ // Skip over the AssertZext inserted during lowering.
+ SDValue EffectiveFI = FI;
+ auto It = FI->use_begin();
+ if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
+ EffectiveFI = SDValue(*It, 0);
+ It = EffectiveFI->use_begin();
}
- }
-
- for (LoadSDNode *LD : LoadsToReplace) {
- SDLoc SL(LD);
-
- SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
- LD->getBasePtr(), LD->getMemOperand());
- SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
- MVT::i64, NewLoad);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
- Modified = true;
- }
- for (StoreSDNode *ST : StoresToReplace) {
- SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
- MVT::v2i32, ST->getValue());
- const SDValue StoreOps[] = {
- ST->getChain(),
- NewValue,
- ST->getBasePtr(),
- ST->getOffset()
- };
+ for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
+ SDUse &Use = It.getUse();
+ SDNode *User = Use.getUser();
+ unsigned OpIdx = It.getOperandNo();
+ ++It;
+
+ if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
+ unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
+ if (OpIdx == PtrIdx)
+ continue;
+
+ unsigned OpN = M->getNumOperands();
+ SDValue NewOps[8];
+
+ assert(OpN < array_lengthof(NewOps));
+ for (unsigned Op = 0; Op != OpN; ++Op) {
+ if (Op != OpIdx) {
+ NewOps[Op] = M->getOperand(Op);
+ continue;
+ }
+
+ MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SDLoc(M), MVT::i32, FI);
+ NewOps[Op] = SDValue(Mov, 0);
+ }
- CurDAG->UpdateNodeOperands(ST, StoreOps);
- Modified = true;
+ CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
+ }
+ }
}
-
- // XXX - Is this necessary?
- if (Modified)
- CurDAG->RemoveDeadNodes();
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1a59a460ee7d..352423ed3ad6 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -28,16 +27,19 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
-
+#include "llvm/IR/DiagnosticInfo.h"
+#include "SIInstrInfo.h"
using namespace llvm;
-static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ MachineFunction &MF = State.getMachineFunction();
+ AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+ ArgFlags.getOrigAlign());
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
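
A hedged sketch of the running-offset behavior assumed here for AMDGPUMachineFunction::allocateKernArg (the struct and method below are illustrative stand-ins, not the real API): align the current kernarg size up, hand out that offset, then bump the size.

    #include <cassert>
    #include <cstdint>

    struct KernArgAllocator {
      uint64_t KernArgSize = 0; // running size of the kernarg segment

      uint64_t allocate(uint64_t Size, uint64_t Align) {
        KernArgSize = (KernArgSize + Align - 1) / Align * Align;
        uint64_t Offset = KernArgSize;
        KernArgSize += Size;
        return Offset;
      }
    };

    int main() {
      KernArgAllocator A;
      assert(A.allocate(4, 4) == 0);  // i32 argument at offset 0
      assert(A.allocate(8, 8) == 8);  // i64 argument aligned up to offset 8
      assert(A.allocate(4, 4) == 16); // next i32 at offset 16
      return 0;
    }
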
@@ -53,60 +55,104 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-// Type for a vector that will be loaded to.
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
if (StoreSize <= 32)
- return EVT::getIntegerVT(Ctx, 32);
+ return EVT::getIntegerVT(Ctx, StoreSize);
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- setOperationAction(ISD::Constant, MVT::i32, Legal);
- setOperationAction(ISD::Constant, MVT::i64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ // Lower floating point store/load to integer store/load to reduce the number
+ // of patterns in tablegen.
+ setOperationAction(ISD::LOAD, MVT::f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- // This is totally unsupported, just custom lower to produce an error.
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
- // We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FEXP2, MVT::f32, Legal);
- setOperationAction(ISD::FPOW, MVT::f32, Legal);
- setOperationAction(ISD::FLOG2, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::FROUND, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
- setOperationAction(ISD::FREM, MVT::f32, Custom);
- setOperationAction(ISD::FREM, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
- // Expand to fneg + fadd.
- setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- // Lower floating point store/load to integer store/load to reduce the number
- // of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -122,51 +168,99 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
setOperationAction(ISD::STORE, MVT::f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+ AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+ AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- // Custom lowering of vector stores is required for local address space
- // stores.
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
- // XXX: This can be change to Custom, once ExpandVectorStores can
- // handle 64-bit stores.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+
setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
- setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::LOAD, MVT::f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ // This is totally unsupported; just custom lower to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ // Library functions. These default to Expand, but we have instructions
+ // for them.
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setOperationAction(ISD::FPOW, MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Custom);
+ setOperationAction(ISD::FREM, MVT::f64, Custom);
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
@@ -179,31 +273,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- // There are no 64-bit extloads. These should be done as a 32-bit extload and
- // an extension to 64-bit.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
- }
-
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
-
if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FCEIL, MVT::f64, Custom);
setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
@@ -219,28 +288,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
- setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
- setOperationAction(ISD::SREM, VT, Expand);
+ // These should use [SU]DIVREM, so set them to expand
setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
// GPU does not have divrem function for signed or unsigned.
setOperationAction(ISD::SDIVREM, VT, Custom);
@@ -284,17 +338,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
if (Subtarget->hasFFBH())
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
- else
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-
- if (!Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
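For reference, the scalar pattern setHasExtractBitsInsn() is meant to favor here is the usual shift-and-mask bit-field extract, which v_bfe_u32 / v_bfe_i32 can do in a single instruction. A minimal standalone C++ sketch (not part of this patch) of the unsigned 32-bit form, valid for Width < 32:

    #include <cstdint>

    // Extract Width bits of Src starting at bit Offset (unsigned form).
    static uint32_t bfe_u32(uint32_t Src, uint32_t Offset, uint32_t Width) {
      return (Src >> Offset) & ((1u << Width) - 1);   // assumes Width < 32
    }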
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -334,9 +395,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
@@ -366,24 +425,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
-
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::STORE);
+ // This causes us to use an unrolled select operation rather than expansion
+ // with bit operations. This is generally better, but the alternative of
+ // using BFI instructions may be better if the select sources are SGPRs.
+ setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -394,7 +449,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
// (Section 7.3)
- setHasFloatingPointExceptions(false);
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
@@ -415,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
MaxStoresPerMemset = 4096;
+
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
}
//===----------------------------------------------------------------------===//
@@ -467,15 +534,17 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
- if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
- return true;
- unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
- unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+ assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
- return ((LScalarSize <= CastScalarSize) ||
- (CastScalarSize >= 32) ||
- (LScalarSize < 32));
+ if (LoadTy.getScalarType() == MVT::i32)
+ return false;
+
+ unsigned LScalarSize = LoadTy.getScalarSizeInBits();
+ unsigned CastScalarSize = CastTy.getScalarSizeInBits();
+
+ return (LScalarSize < CastScalarSize) ||
+ (CastScalarSize >= 32);
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -578,14 +647,13 @@ void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
State.AnalyzeReturn(Outs, RetCC_SI);
}
-SDValue AMDGPUTargetLowering::LowerReturn(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+SDValue
+AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
@@ -606,32 +674,38 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
FuncName = G->getGlobal()->getName();
- DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+ DiagnosticInfoUnsupported NoCalls(
+ Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
- return SDValue();
+
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+
+ return DAG.getEntryNode();
}
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+ DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
+ SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(NoDynamicAlloca);
- return SDValue();
+ auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
+ return DAG.getMergeValues(Ops, SDLoc());
}
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
- Op.getNode()->dump();
+ Op->dump(&DAG);
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
@@ -666,24 +740,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
- case ISD::LOAD: {
- SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
- if (!Node)
- return;
-
- Results.push_back(SDValue(Node, 0));
- Results.push_back(SDValue(Node, 1));
- // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
- // function
- DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
- return;
- }
- case ISD::STORE: {
- SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
- if (Lowered.getNode())
- Results.push_back(Lowered);
- return;
- }
default:
return;
}
@@ -712,16 +768,16 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(CFP->getType()));
}
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
@@ -769,8 +825,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
Init->dump();
@@ -782,10 +838,7 @@ static bool hasDefinedInitializer(const GlobalValue *GV) {
if (!GVar || !GVar->hasInitializer())
return false;
- if (isa<UndefValue>(GVar->getInitializer()))
- return false;
-
- return true;
+ return !isa<UndefValue>(GVar->getInitializer());
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
@@ -797,6 +850,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
+ }
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
@@ -808,11 +866,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- Offset = MFI->LDSSize;
+ unsigned Align = GV->getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV->getValueType());
+
+ /// TODO: We should sort these to minimize wasted space due to alignment
+ /// padding. Currently the padding is decided by the first encountered use
+ /// during lowering.
+ Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
MFI->LocalMemoryObjects[GV] = Offset;
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
+ MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
} else {
Offset = MFI->LocalMemoryObjects[GV];
}
@@ -820,50 +883,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
return DAG.getConstant(Offset, SDLoc(Op),
getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
- case AMDGPUAS::CONSTANT_ADDRESS: {
- MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
- Type *EltType = GV->getType()->getElementType();
- unsigned Size = DL.getTypeAllocSize(EltType);
- unsigned Alignment = DL.getPrefTypeAlignment(EltType);
-
- MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
-
- int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
- SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
-
- const GlobalVariable *Var = cast<GlobalVariable>(GV);
- if (!Var->hasInitializer()) {
- // This has no use, but bugpoint will hit it.
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
-
- const Constant *Init = Var->getInitializer();
- SmallVector<SDNode*, 8> WorkList;
-
- for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
- E = DAG.getEntryNode()->use_end(); I != E; ++I) {
- if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
- continue;
- WorkList.push_back(*I);
- }
- SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
- for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
- E = WorkList.end(); I != E; ++I) {
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
- Ops.push_back((*I)->getOperand(i));
- }
- DAG.UpdateNodeOperands(*I, Ops);
- }
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
}
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported BadInit(Fn,
- "initializer for address space");
+ DiagnosticInfoUnsupported BadInit(
+ Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(BadInit);
return SDValue();
}
@@ -875,7 +899,7 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
@@ -887,23 +911,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
-}
-
-SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
- SelectionDAG &DAG) const {
-
- MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
-
- FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
-
- unsigned FrameIndex = FIN->getIndex();
- unsigned IgnoredFrameReg;
- unsigned Offset =
- TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
- Op.getValueType());
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -914,121 +922,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
default: return Op;
- case AMDGPUIntrinsic::AMDGPU_abs:
- case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
- return LowerIntrinsicIABS(Op, DAG);
- case AMDGPUIntrinsic::AMDGPU_lrp:
- return LowerIntrinsicLRP(Op, DAG);
-
- case AMDGPUIntrinsic::AMDGPU_clamp:
- case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+ case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::AMDGPU_div_scale: {
- // 3rd parameter required to be a constant.
- const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!Param)
- return DAG.getUNDEF(VT);
-
- // Translate to the operands expected by the machine instruction. The
- // first parameter must be the same as the first instruction.
- SDValue Numerator = Op.getOperand(1);
- SDValue Denominator = Op.getOperand(2);
-
- // Note this order is opposite of the machine instruction's operations,
- // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
- // intrinsic has the numerator as the first operand to match a normal
- // division operation.
-
- SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
-
- return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
- Denominator, Numerator);
- }
-
- case Intrinsic::AMDGPU_div_fmas:
- return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(4));
-
- case Intrinsic::AMDGPU_div_fixup:
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case Intrinsic::AMDGPU_trig_preop:
- return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::AMDGPU_rcp:
- return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq:
- return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq_clamped:
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- Type *Type = VT.getTypeForEVT(*DAG.getContext());
- APFloat Max = APFloat::getLargest(Type->getFltSemantics());
- APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
-
- SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
- SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
- DAG.getConstantFP(Max, DL, VT));
- return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
- DAG.getConstantFP(Min, DL, VT));
- } else {
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
- }
-
- case Intrinsic::AMDGPU_ldexp:
- return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imax:
- return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umax:
- return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_imin:
- return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umin:
- return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umul24:
- return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imul24:
- return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umad24:
- return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_imad24:
- return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
-
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -1040,69 +937,13 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfi:
- return DAG.getNode(AMDGPUISD::BFI, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfm:
- return DAG.getNode(AMDGPUISD::BFM, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2));
-
- case Intrinsic::AMDGPU_class:
- return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
- return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
- return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
- return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
- return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}
}
-///IABS(a) = SMAX(sub(0, a), a)
-SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(1));
-
- return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1));
-}
-
-/// Linear Interpolation
-/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
-SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- // TODO: Should this propagate fast-math-flags?
- SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
- DAG.getConstantFP(1.0f, DL, MVT::f32),
- Op.getOperand(1));
- SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
- Op.getOperand(3));
- return DAG.getNode(ISD::FADD, DL, VT,
- DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
- OneSubAC);
-}
-
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+ SDValue LHS, SDValue RHS,
+ SDValue True, SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -1176,56 +1017,48 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
return SDValue();
}
-SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
- SelectionDAG &DAG) const {
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- EVT MemEltVT = MemVT.getVectorElementType();
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- EVT LoadVT = Op.getValueType();
- EVT EltVT = LoadVT.getVectorElementType();
- EVT PtrVT = Load->getBasePtr().getValueType();
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
- unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
- SmallVector<SDValue, 8> Loads;
- SmallVector<SDValue, 8> Chains;
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- SDLoc SL(Op);
- unsigned MemEltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
- DAG.getConstant(i * MemEltSize, SL, PtrVT));
+ return std::make_pair(Lo, Hi);
+}
- SDValue NewLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- SrcValue.getWithOffset(i * MemEltSize),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
- Loads.push_back(NewLoad.getValue(0));
- Chains.push_back(NewLoad.getValue(1));
- }
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- SDValue Ops[] = {
- DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
- };
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+}
- return DAG.getMergeValues(Ops, SL);
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT VT = Op.getValueType();
+
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorLoad(Op, DAG);
+ return scalarizeVectorLoad(Load, DAG);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
SDValue BasePtr = Load->getBasePtr();
EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
@@ -1245,22 +1078,15 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
unsigned BaseAlign = Load->getAlignment();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
- Load->getChain(), BasePtr,
- SrcValue,
- LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), BaseAlign);
-
+ SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+ Load->getChain(), BasePtr, SrcValue, LoMemVT,
+ BaseAlign, Load->getMemOperand()->getFlags());
SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Size, SL, PtrVT));
-
- SDValue HiLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
- Load->getChain(), HiPtr,
- SrcValue.getWithOffset(LoMemVT.getStoreSize()),
- HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), HiAlign);
+ SDValue HiLoad =
+ DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
+ HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
@@ -1271,6 +1097,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
+// FIXME: This isn't doing anything for SI. This should be used in a target
+// combine during type legalization.
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1317,48 +1145,15 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
if (PackedSize < 32) {
EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(),
- PackedVT,
- Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
+ Store->getMemOperand()->getPointerInfo(), PackedVT,
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
Store->getMemOperand()->getPointerInfo(),
- Store->isVolatile(), Store->isNonTemporal(),
- Store->getAlignment());
-}
-
-SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
- SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
- EVT EltVT = Store->getValue().getValueType().getVectorElementType();
- EVT PtrVT = Store->getBasePtr().getValueType();
- unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
- SDLoc SL(Op);
-
- SmallVector<SDValue, 8> Chains;
-
- unsigned EltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
- for (unsigned i = 0, e = NumElts; i != e; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Store->getValue(),
- DAG.getConstant(i, SL, MVT::i32));
-
- SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
- SDValue NewStore =
- DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
- SrcValue.getWithOffset(i * EltSize),
- MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
- Chains.push_back(NewStore);
- }
-
- return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
@@ -1370,7 +1165,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorStore(Op, DAG);
+ return scalarizeVectorStore(Store, DAG);
EVT MemVT = Store->getMemoryVT();
SDValue Chain = Store->getChain();
@@ -1395,171 +1190,21 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
unsigned Size = LoMemVT.getStoreSize();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoStore
- = DAG.getTruncStore(Chain, SL, Lo,
- BasePtr,
- SrcValue,
- LoMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- BaseAlign);
- SDValue HiStore
- = DAG.getTruncStore(Chain, SL, Hi,
- HiPtr,
- SrcValue.getWithOffset(Size),
- HiMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- HiAlign);
+ SDValue LoStore =
+ DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
+ Store->getMemOperand()->getFlags());
+ SDValue HiStore =
+ DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
+ HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- ISD::LoadExtType ExtType = Load->getExtensionType();
- EVT VT = Op.getValueType();
- EVT MemVT = Load->getMemoryVT();
-
- if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
- assert(VT == MVT::i1 && "Only i1 non-extloads expected");
- // FIXME: Copied from PPC
- // First, load into 32 bits, then truncate to 1 bit.
-
- SDValue Chain = Load->getChain();
- SDValue BasePtr = Load->getBasePtr();
- MachineMemOperand *MMO = Load->getMemOperand();
-
- SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
- BasePtr, MVT::i8, MMO);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
- NewLD.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
- ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
- return SDValue();
-
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
-
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
-
- // Get offset within the register.
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
-
- // Bit offset of target byte (byteIdx * 8).
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- // Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
-
- // Eliminate the upper bits by setting them to ...
- EVT MemEltVT = MemVT.getScalarType();
-
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
- SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- // ... or zeros.
- SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
-}
-
-SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
- if (Result.getNode()) {
- return Result;
- }
-
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- SDValue Chain = Store->getChain();
- if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
- Store->getValue().getValueType().isVector()) {
- return SplitVectorStore(Op, DAG);
- }
-
- EVT MemVT = Store->getMemoryVT();
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
- MemVT.bitsLT(MVT::i32)) {
- unsigned Mask = 0;
- if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
- } else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
- }
- SDValue BasePtr = Store->getBasePtr();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
-
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
- DAG.getConstant(0x3, DL, MVT::i32));
-
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
- Store->getValue());
-
- SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
-
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
- MaskedValue, ShiftAmt);
-
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
- Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-
- SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
- }
- return SDValue();
-}
-
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
+// float is enough to accurately represent up to a 24-bit signed integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+ bool Sign) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -1567,20 +1212,26 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
MVT IntVT = MVT::i32;
MVT FltVT = MVT::f32;
- ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
- ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+ if (LHSSignBits < 9)
+ return SDValue();
- if (VT.isVector()) {
- unsigned NElts = VT.getVectorNumElements();
- IntVT = MVT::getVectorVT(MVT::i32, NElts);
- FltVT = MVT::getVectorVT(MVT::f32, NElts);
- }
+ unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+ if (RHSSignBits < 9)
+ return SDValue();
+
+ unsigned BitSize = VT.getSizeInBits();
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = BitSize - SignBits;
+ if (Sign)
+ ++DivBits;
- unsigned BitSize = VT.getScalarType().getSizeInBits();
+ ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
SDValue jq = DAG.getConstant(1, DL, IntVT);
- if (sign) {
+ if (Sign) {
// char|short jq = ia ^ ib;
jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
@@ -1590,18 +1241,13 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = jq | 0x1
jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
}
// int ia = (int)LHS;
- SDValue ia = sign ?
- DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
+ SDValue ia = LHS;
// int ib, (int)RHS;
- SDValue ib = sign ?
- DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
+ SDValue ib = RHS;
// float fa = (float)ia;
SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
@@ -1609,8 +1255,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// float fb = (float)ib;
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
- // TODO: Should this propagate fast-math-flags?
- // float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1621,8 +1265,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
- DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
+ SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1641,9 +1284,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = (cv ? jq : 0);
jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
- // dst = trunc/extend to legal type
- iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
-
// dst = iq + jq;
SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
@@ -1651,11 +1291,19 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
- SDValue Res[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Res, DL);
+ // Truncate to the number of bits this divide actually needs.
+ if (Sign) {
+ SDValue InRegSize
+ = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+ Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+ Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+ } else {
+ SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+ Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+ Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+ }
+
+ return DAG.getMergeValues({ Div, Rem }, DL);
}
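For reference, a standalone C++ sketch (not part of this patch) of the unsigned form of the shortcut built above: convert both operands to float, multiply by the reciprocal, truncate, and apply a single +1 correction when the remainder estimate reaches the divisor. It assumes both operands satisfy the known-bits check above (at most 24 significant bits), and the host 1.0f / fb stands in for the hardware v_rcp_f32:

    #include <cmath>
    #include <cstdint>
    #include <utility>

    // Unsigned divide/remainder for values known to fit in 24 bits.
    static std::pair<uint32_t, uint32_t> udivrem24(uint32_t a, uint32_t b) {
      float fa = static_cast<float>(a);
      float fb = static_cast<float>(b);
      float fq = fa * (1.0f / fb);                // quotient estimate
      uint32_t iq = static_cast<uint32_t>(fq);    // truncate to integer
      float fr = std::fabs(-fq * fb + fa);        // |fa - fq * fb|
      if (fr >= std::fabs(fb))                    // estimate was low by one
        ++iq;
      return {iq, a - iq * b};                    // (quotient, remainder)
    }

The signed path in the code above additionally folds the quotient sign into the correction term (jq) and sign-extends the results back to the divide's real width.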
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
@@ -1686,10 +1334,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
LHS_Lo, RHS_Lo);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
- Results.push_back(DIV);
- Results.push_back(REM);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
return;
}
@@ -1698,7 +1347,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+ REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
@@ -1718,7 +1368,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// Add LHS high bit
REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
- SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT);
+ SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
@@ -1728,7 +1378,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
}
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
+ DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
Results.push_back(DIV);
Results.push_back(REM);
}
@@ -1744,19 +1395,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Results, DL);
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
- DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, false);
- }
+ if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+ return Res;
}
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
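To make the comment above concrete, here is a standalone C++ sketch (not part of this patch) of the reciprocal idea: precompute roughly 2^32 / Den, form a quotient estimate with a 64-bit multiply, then repair the rounding error e with a short correction loop. Den > 1 is assumed to keep the sketch simple; the real lowering expresses this with AMDGPUISD::URECIP and DAG nodes:

    #include <cstdint>

    // Quotient/remainder from a precomputed reciprocal estimate of 2^32 / Den.
    static void udivrem32_by_recip(uint32_t Num, uint32_t Den,
                                   uint32_t &Quo, uint32_t &Rem) {
      uint32_t Rcp = static_cast<uint32_t>((uint64_t(1) << 32) / Den); // ~2^32 / Den
      Quo = static_cast<uint32_t>((uint64_t(Num) * Rcp) >> 32);        // estimate
      Rem = Num - Quo * Den;
      while (Rem >= Den) {   // fix the rounding error, at most a step or two
        ++Quo;
        Rem -= Den;
      }
    }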
@@ -1864,11 +1510,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue NegOne = DAG.getConstant(-1, DL, VT);
- if (VT == MVT::i32 &&
- DAG.ComputeNumSignBits(LHS) > 8 &&
- DAG.ComputeNumSignBits(RHS) > 8) {
- return LowerDIVREM24(Op, DAG, true);
+ if (VT == MVT::i32) {
+ if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+ return Res;
}
+
if (VT == MVT::i64 &&
DAG.ComputeNumSignBits(LHS) > 32 &&
DAG.ComputeNumSignBits(RHS) > 32) {
@@ -1954,7 +1600,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
+ SelectionDAG &DAG) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
@@ -1992,8 +1639,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
  // Extend back to 64 bits.
- SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- Zero, SignBit);
+ SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
@@ -2391,7 +2037,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);
+ SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
@@ -2437,7 +2083,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
for (unsigned I = 0; I < NElts; ++I)
Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+ return DAG.getBuildVector(VT, DL, Args);
}
//===----------------------------------------------------------------------===//
@@ -2476,8 +2122,8 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
}
template <typename IntTy>
-static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
- uint32_t Offset, uint32_t Width, SDLoc DL) {
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
+ uint32_t Width, const SDLoc &DL) {
if (Width + Offset < 32) {
uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
@@ -2487,55 +2133,175 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
-static bool usesAllNormalStores(SDNode *LoadVal) {
- for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
- if (!ISD::isNormalStore(*I))
- return false;
+static bool hasVolatileUser(SDNode *Val) {
+ for (SDNode *U : Val->uses()) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+ if (M->isVolatile())
+ return true;
+ }
}
+ return false;
+}
+
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
+ // i32 vectors are the canonical memory type.
+ if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+ return false;
+
+ if (!VT.isByteSized())
+ return false;
+
+ unsigned Size = VT.getStoreSize();
+
+ if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+ return false;
+
+ if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+ return false;
+
return true;
}
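For what getEquivalentMemType is expected to return here (its definition is elsewhere and not shown in this patch), a hypothetical stand-in might look like the sketch below: the same store size, but expressed as a plain integer or a vector of i32 that the backend handles natively. This is an assumption for illustration, not the actual helper, and it only expects sizes that already passed shouldCombineMemoryType:

    // Hypothetical stand-in for getEquivalentMemType (assumes the usual
    // AMDGPUISelLowering.cpp includes): same store size, i32-based shape.
    static MVT guessEquivalentMemType(unsigned StoreSizeInBytes) {
      if (StoreSizeInBytes <= 4)
        return MVT::getIntegerVT(StoreSizeInBytes * 8);        // i16, i32, ...
      return MVT::getVectorVT(MVT::i32, StoreSizeInBytes / 4); // v2i32, v4i32, ...
    }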
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ LoadSDNode *LN = cast<LoadSDNode>(N);
+ if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ return SDValue();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = LN->getMemoryVT();
+
+ unsigned Size = VT.getStoreSize();
+ unsigned Align = LN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = LN->getAddressSpace();
+
+ // Expand unaligned loads earlier than legalization. Due to visitation order
+ // problems during legalization, the emitted instructions to pack and unpack
+ // the bytes again are not eliminated in the case of an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(N));
+ }
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+ SDValue NewLoad
+ = DAG.getLoad(NewVT, SL, LN->getChain(),
+ LN->getBasePtr(), LN->getMemOperand());
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+ DCI.CombineTo(N, BC, NewLoad.getValue(1));
+ return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalize())
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- SDValue Value = SN->getValue();
- EVT VT = Value.getValueType();
+ if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ return SDValue();
- if (isTypeLegal(VT) || SN->isVolatile() ||
- !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
+ EVT VT = SN->getMemoryVT();
+ unsigned Size = VT.getStoreSize();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Align = SN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = SN->getAddressSpace();
+
+ // Expand unaligned stores earlier than legalization. Due to visitation
+ // order problems during legalization, the emitted instructions to pack and
+ // unpack the bytes again are not eliminated in the case of an unaligned
+ // copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ return expandUnalignedStore(SN, DAG);
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ SDValue Val = SN->getValue();
+
+ //DCI.AddToWorklist(Val.getNode());
+
+ bool OtherUses = !Val.hasOneUse();
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+ if (OtherUses) {
+ SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+ DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+ }
+
+ return DAG.getStore(SN->getChain(), SL, CastVal,
+ SN->getBasePtr(), SN->getMemOperand());
+}
+
+// TODO: Should repeat for other bit ops.
+SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
return SDValue();
- LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
- if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+ // Break up a 64-bit AND of a constant into two 32-bit ANDs. This will
+ // typically happen anyway for a VALU 64-bit AND. This exposes other 32-bit
+ // integer combine opportunities, since most 64-bit operations are decomposed
+ // this way.
+ // TODO: We won't want this for SALU, especially if it is an inline immediate.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
return SDValue();
- EVT MemVT = LoadVal->getMemoryVT();
+ uint64_t Val = RHS->getZExtValue();
+ if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
+ // If either half of the constant is 0, this is really a 32-bit AND, so
+ // split it. If we can re-use the full materialized constant, keep it.
+ return SDValue();
+ }
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
- SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
- LoadVT, SL,
- LoadVal->getChain(),
- LoadVal->getBasePtr(),
- LoadVal->getOffset(),
- LoadVT,
- LoadVal->getMemOperand());
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
- SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
- DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+ SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
- return DAG.getStore(SN->getChain(), SL, NewLoad,
- SN->getBasePtr(), SN->getMemOperand());
+ SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+
+ // Re-visit the ANDs. It's possible we eliminated one of them and it could
+ // simplify the vector.
+ DCI.AddToWorklist(Lo.getNode());
+ DCI.AddToWorklist(Hi.getNode());
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
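
A minimal standalone check (not part of the patch) of the identity performAndCombine relies on: AND-ing a 64-bit value with a constant is the same as AND-ing the two 32-bit halves independently and reassembling them, which is what the v2i32 build_vector plus bitcast expresses.

#include <cassert>
#include <cstdint>

static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  const uint64_t K = 0xffff0000a5a5ff00ULL;
  // Split AND: combine (lo(X) & lo(K)) and (hi(X) & hi(K)) back into an i64.
  const uint64_t Split =
      (static_cast<uint64_t>(hi32(X) & hi32(K)) << 32) | (lo32(X) & lo32(K));
  assert(Split == (X & K));
  return 0;
}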
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
@@ -2543,14 +2309,17 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (N->getValueType(0) != MVT::i64)
return SDValue();
- // i64 (shl x, 32) -> (build_pair 0, x)
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
- // Doing this with moves theoretically helps MI optimizations that understand
- // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
- // v_lshl_b64. In the SALU case, I think this is slightly worse since it
- // doubles the code size and I'm unsure about cycle count.
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS || RHS->getZExtValue() != 32)
+ if (!RHS)
+ return SDValue();
+
+ unsigned RHSVal = RHS->getZExtValue();
+ if (RHSVal < 32)
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -2558,11 +2327,85 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- // Extract low 32-bits.
+ SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
+
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ unsigned RHSVal = RHS->getZExtValue();
+
+ // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
+ if (RHSVal == 32) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+ if (RHSVal == 63) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ unsigned ShiftAmt = RHS->getZExtValue();
+ if (ShiftAmt < 32)
+ return SDValue();
+
+ // srl i64:x, C for C >= 32
+ // =>
+ // build_pair (srl hi_32(x), C - 32), 0
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ VecOp, One);
+
+ SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
+ SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+
+ SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
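
The three shift combines above all rest on the same half-splitting arithmetic. A standalone sketch (not LLVM code, and assuming the usual two's-complement sign-fill behaviour of >> on signed integers) that checks the rewritten forms against plain 64-bit shifts:

#include <cassert>
#include <cstdint>

static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
static uint64_t pack(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  const uint64_t X = 0x8234567001bcdef0ULL;

  // shl x, C (C >= 32)  ->  {lo = 0, hi = lo32(x) << (C - 32)}
  for (unsigned C = 32; C < 64; ++C)
    assert((X << C) == pack(0, lo32(X) << (C - 32)));

  // srl x, C (C >= 32)  ->  {lo = hi32(x) >> (C - 32), hi = 0}
  for (unsigned C = 32; C < 64; ++C)
    assert((X >> C) == pack(hi32(X) >> (C - 32), 0));

  // sra x, 32  ->  {lo = hi32(x), hi = hi32(x) >> 31 (sign fill)}
  const uint32_t SignFill =
      static_cast<uint32_t>(static_cast<int32_t>(hi32(X)) >> 31);
  assert(static_cast<uint64_t>(static_cast<int64_t>(X) >> 32) ==
         pack(hi32(X), SignFill));

  // sra x, 63  ->  both halves are the sign fill.
  assert(static_cast<uint64_t>(static_cast<int64_t>(X) >> 63) ==
         pack(SignFill, SignFill));
  return 0;
}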
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -2610,8 +2453,8 @@ static bool isCtlzOpc(unsigned Opc) {
// type VT.
// Need to match pre-legalized type because the generic legalization inserts the
// add/sub between the select and compare.
-static SDValue getFFBH_U32(const TargetLowering &TLI,
- SelectionDAG &DAG, SDLoc SL, SDValue Op) {
+static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Op) {
EVT VT = Op.getValueType();
EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
if (LegalVT != MVT::i32)
@@ -2634,10 +2477,8 @@ static SDValue getFFBH_U32(const TargetLowering &TLI,
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
-SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL,
- SDValue Cond,
- SDValue LHS,
- SDValue RHS,
+SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+ SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
if (!CmpRhs || !CmpRhs->isNullValue())
@@ -2680,8 +2521,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
- if (VT == MVT::f32 && Cond.hasOneUse())
- return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ if (VT == MVT::f32 && Cond.hasOneUse()) {
+ SDValue MinMax
+ = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ // Revisit this node so we can catch min3/max3/med3 patterns.
+ //DCI.AddToWorklist(MinMax.getNode());
+ return MinMax;
+ }
// There's no reason to not do this if the condition has other uses.
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
@@ -2695,12 +2541,62 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
switch(N->getOpcode()) {
default:
break;
+ case ISD::BITCAST: {
+ EVT DestVT = N->getValueType(0);
+ if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+ break;
+
+ // Fold bitcasts of constants.
+ //
+ // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+ // TODO: Generalize and move to DAGCombiner
+ SDValue Src = N->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+ assert(Src.getValueType() == MVT::i64);
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+ const APInt &Val = C->getValueAPF().bitcastToAPInt();
+ SDLoc SL(N);
+ uint64_t CVal = Val.getZExtValue();
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+ }
+
+ break;
+ }
case ISD::SHL: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
return performShlCombine(N, DCI);
}
+ case ISD::SRL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSrlCombine(N, DCI);
+ }
+ case ISD::SRA: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSraCombine(N, DCI);
+ }
+ case ISD::AND: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performAndCombine(N, DCI);
+ }
case ISD::MUL:
return performMulCombine(N, DCI);
case AMDGPUISD::MUL_I24:
@@ -2797,7 +2693,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
-
+ case ISD::LOAD:
+ return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
}
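
The new ISD::BITCAST case above folds bitcasts of 64-bit integer and FP constants into their 32-bit halves. A standalone sketch (not LLVM code, assuming IEEE-754 doubles; the patch's own FP64_ONE constant 0x3ff0000000000000 gives the same bit pattern) of what bitcastToAPInt plus the Lo_32/Hi_32 split amounts to:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const double D = 1.0;                 // f64 constant being bitcast
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits)); // reinterpret the raw bit pattern
  const uint32_t Lo = static_cast<uint32_t>(Bits);
  const uint32_t Hi = static_cast<uint32_t>(Bits >> 32);
  // IEEE-754: 1.0 is 0x3ff0000000000000, so the halves are {0, 0x3ff00000}.
  assert(Lo == 0u && Hi == 0x3ff00000u);
  // The combine emits build_vector Lo, Hi (as v2i32) instead of the bitcast.
  assert(((static_cast<uint64_t>(Hi) << 32) | Lo) == Bits);
  return 0;
}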
@@ -2840,20 +2737,6 @@ void AMDGPUTargetLowering::getOriginalFunctionArgs(
}
}
-bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->isExactlyValue(1.0);
- }
- return isAllOnesConstant(Op);
-}
-
-bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->getValueAPF().isZero();
- }
- return isNullConstant(Op);
-}
-
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2889,10 +2772,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(ENDPGM)
+ NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
@@ -2906,6 +2790,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMIN3)
NODE_NAME_CASE(SMIN3)
NODE_NAME_CASE(UMIN3)
+ NODE_NAME_CASE(FMED3)
+ NODE_NAME_CASE(SMED3)
+ NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -2914,7 +2801,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RSQ_LEGACY)
- NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
@@ -2934,7 +2821,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
@@ -2946,13 +2832,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
+ NODE_NAME_CASE(PC_ADD_REL_OFFSET)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(STORE_MSKOR)
+ NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(ATOMIC_CMP_SWAP)
+ NODE_NAME_CASE(ATOMIC_INC)
+ NODE_NAME_CASE(ATOMIC_DEC)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -2998,21 +2889,6 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}
-static void computeKnownBitsForMinMax(const SDValue Op0,
- const SDValue Op1,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) {
- APInt Op0Zero, Op0One;
- APInt Op1Zero, Op1One;
- DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
- DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
-
- KnownZero = Op0Zero & Op1Zero;
- KnownOne = Op0One & Op1One;
-}
-
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
@@ -3029,22 +2905,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
switch (Opc) {
default:
break;
- case ISD::INTRINSIC_WO_CHAIN: {
- // FIXME: The intrinsic should just use the node.
- switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
- case AMDGPUIntrinsic::AMDGPU_imax:
- case AMDGPUIntrinsic::AMDGPU_umax:
- case AMDGPUIntrinsic::AMDGPU_imin:
- case AMDGPUIntrinsic::AMDGPU_umin:
- computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
- KnownZero, KnownOne, DAG, Depth);
- break;
- default:
- break;
- }
-
- break;
- }
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW: {
KnownZero = APInt::getHighBitsSet(32, 31);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 37925416a9c4..c2c758592d1c 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#include "llvm/Target/TargetLowering.h"
@@ -28,12 +28,10 @@ class AMDGPUTargetLowering : public TargetLowering {
protected:
const AMDGPUSubtarget *Subtarget;
-private:
SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const;
- SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -67,42 +65,43 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+protected:
+ bool shouldCombineMemoryType(EVT VT) const;
+ SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS,
- DAGCombinerInfo &DCI) const;
+ SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+ SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-protected:
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
- static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
+ static EVT getEquivalentBitType(LLVMContext &Context, EVT VT);
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
- /// \brief Split a vector load into a scalar load of each component.
- SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Return 64-bit value Op as two 32-bit integers.
+ std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
/// \brief Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// \brief Split a vector store into a scalar store of each component.
- SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const;
-
/// \brief Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- bool isHWTrueValue(SDValue Op) const;
- bool isHWFalseValue(SDValue Op) const;
-
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
@@ -119,7 +118,7 @@ protected:
const SmallVectorImpl<ISD::OutputArg> &Outs) const;
public:
- AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
+ AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
@@ -141,7 +140,7 @@ public:
ISD::LoadExtType ExtType,
EVT ExtVT) const override;
- bool isLoadBitCastBeneficial(EVT, EVT) const override;
+ bool isLoadBitCastBeneficial(EVT, EVT) const final;
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
@@ -150,11 +149,10 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const override;
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -167,16 +165,9 @@ public:
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
- SDValue CombineFMinMaxLegacy(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- DAGCombinerInfo &DCI) const;
+ SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ SDValue CC, DAGCombinerInfo &DCI) const;
const char* getTargetNodeName(unsigned Opcode) const override;
@@ -189,9 +180,7 @@ public:
unsigned &RefinementSteps) const override;
virtual SDNode *PostISelFolding(MachineSDNode *N,
- SelectionDAG &DAG) const {
- return N;
- }
+ SelectionDAG &DAG) const = 0;
/// \brief Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
@@ -214,8 +203,9 @@ public:
unsigned Reg, EVT VT) const;
enum ImplicitParameter {
- GRID_DIM,
- GRID_OFFSET
+ FIRST_IMPLICIT,
+ GRID_DIM = FIRST_IMPLICIT,
+ GRID_OFFSET,
};
/// \brief Helper function that returns the byte offset of the given
@@ -231,9 +221,10 @@ enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
- RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
+ ENDPGM,
+ RETURN,
DWORDADDR,
FRACT,
CLAMP,
@@ -250,6 +241,9 @@ enum NodeType : unsigned {
FMIN3,
SMIN3,
UMIN3,
+ FMED3,
+ SMED3,
+ UMED3,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -261,7 +255,7 @@ enum NodeType : unsigned {
RCP,
RSQ,
RSQ_LEGACY,
- RSQ_CLAMPED,
+ RSQ_CLAMP,
LDEXP,
FP_CLASS,
DOT4,
@@ -307,10 +301,14 @@ enum NodeType : unsigned {
INTERP_MOV,
INTERP_P1,
INTERP_P2,
+ PC_ADD_REL_OFFSET,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
+ ATOMIC_CMP_SWAP,
+ ATOMIC_INC,
+ ATOMIC_DEC,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index a266e711af5b..9a00ecb24ebe 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,163 +30,8 @@ using namespace llvm;
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUGenInstrInfo(-1, -1), ST(st) {}
-
-const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
- return RI;
-}
-
-bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SubIdx) const {
-// TODO: Implement this function
- return false;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-
-bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
-// TODO: Implement this function
- return false;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
-// TODO: Implement this function
- return 0;
-}
-bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
-// TODO: Implement this function
- return false;
-}
-
-MachineInstr *
-AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
-// TODO: Implement this function
- return nullptr;
-}
-
-void
-AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
- int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("Not Implemented");
-}
-
-void
-AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
- llvm_unreachable("Not Implemented");
-}
-
-bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
- MachineBasicBlock *MBB = MI->getParent();
- int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::addr);
- // addr is a custom operand with multiple MI operands, and only the
- // first MI operand is given a name.
- int RegOpIdx = OffsetOpIdx + 1;
- int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::chan);
- if (isRegisterLoad(*MI)) {
- int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::dst);
- unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
- unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
- unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
- buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
- getIndirectAddrRegClass()->getRegister(Address));
- } else {
- buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
- Address, OffsetReg);
- }
- } else if (isRegisterStore(*MI)) {
- int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::val);
- unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
- unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
- unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
- if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
- buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
- MI->getOperand(ValOpIdx).getReg());
- } else {
- buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
- calculateIndirectAddress(RegIndex, Channel),
- OffsetReg);
- }
- } else {
- return false;
- }
-
- MBB->erase(MI);
- return true;
-}
-
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
-// TODO: Implement this function
- return nullptr;
-}
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
- // TODO: Implement this function
- return nullptr;
-}
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad,
- bool UnfoldStore,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- // TODO: Implement this function
- return false;
-}
-
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode*> &NewNodes) const {
- // TODO: Implement this function
- return false;
-}
-
-unsigned
-AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex) const {
- // TODO: Implement this function
- return 0;
-}
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
+ : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
bool AMDGPUInstrInfo::enableClusterLoads() const {
return true;
@@ -214,106 +59,6 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
-bool
-AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
- const {
- // TODO: Implement this function
- return true;
-}
-void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const {
- // TODO: Implement this function
-}
-
-bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- // TODO: Implement this function
- return false;
-}
-
-bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
- // TODO: Implement this function
- return MI->getDesc().isPredicable();
-}
-
-bool
-AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
- // TODO: Implement this function
- return true;
-}
-
-bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
-}
-
-bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- int Offset = -1;
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- if (MRI.livein_empty()) {
- return 0;
- }
-
- const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
- for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
- LE = MRI.livein_end();
- LI != LE; ++LI) {
- unsigned Reg = LI->first;
- if (TargetRegisterInfo::isVirtualRegister(Reg) ||
- !IndirectRC->contains(Reg))
- continue;
-
- unsigned RegIndex;
- unsigned RegEnd;
- for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
- ++RegIndex) {
- if (IndirectRC->getRegister(RegIndex) == Reg)
- break;
- }
- Offset = std::max(Offset, (int)RegIndex);
- }
-
- return Offset + 1;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
- int Offset = 0;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
-
- // Variable sized objects are not supported
- assert(!MFI->hasVarSizedObjects());
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- unsigned IgnoredFrameReg;
- Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference(
- MF, -1, IgnoredFrameReg);
-
- return getIndirectIndexBegin(MF) + Offset;
-}
-
int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
switch (Channels) {
default: return Opcode;
@@ -323,35 +68,44 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
}
}
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1
+};
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
namespace llvm {
namespace AMDGPU {
static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
+ return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
}
}
-// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
-enum SISubtarget {
- SI = 0,
- VI = 1
-};
-
-static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
- switch (Gen) {
- default:
- return SI;
+static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
+ switch (ST.getGeneration()) {
+ case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+ case AMDGPUSubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
- return VI;
+ return SIEncodingFamily::VI;
+
+ // FIXME: This should never be called for r600 GPUs.
+ case AMDGPUSubtarget::R600:
+ case AMDGPUSubtarget::R700:
+ case AMDGPUSubtarget::EVERGREEN:
+ case AMDGPUSubtarget::NORTHERN_ISLANDS:
+ return SIEncodingFamily::SI;
}
+
+ llvm_unreachable("Unknown subtarget generation!");
}
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- int MCOp = AMDGPU::getMCOpcode(
- Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
+ int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
@@ -364,14 +118,3 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
-
-ArrayRef<std::pair<int, const char *>>
-AMDGPUInstrInfo::getSerializableTargetIndices() const {
- static const std::pair<int, const char *> TargetIndices[] = {
- {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
- {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
- return makeArrayRef(TargetIndices);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 53e8b23b3d62..a59eafadeb93 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -13,12 +13,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
-#include "AMDGPURegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include <map>
#define GET_INSTRINFO_HEADER
#define GET_INSTRINFO_ENUM
@@ -39,78 +37,12 @@ class MachineInstrBuilder;
class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
private:
- const AMDGPURegisterInfo RI;
- virtual void anchor();
-protected:
const AMDGPUSubtarget &ST;
-public:
- explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
- virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
-
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
-
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const override;
- unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const override;
- bool hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const override;
- unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
- unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
- bool hasStoreFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const;
-
- MachineInstr *
- convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const override;
-
-
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
- void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const override;
-
-protected:
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- MachineInstr *LoadMI) const override;
+ virtual void anchor();
public:
- /// \returns the smallest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- int getIndirectIndexBegin(const MachineFunction &MF) const;
-
- /// \returns the largest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- int getIndirectIndexEnd(const MachineFunction &MF) const;
-
- bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr *> &NewMIs) const override;
- bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode *> &NewNodes) const override;
- unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = nullptr) const override;
+ explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
bool enableClusterLoads() const override;
@@ -118,81 +50,14 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool
- ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- void insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override;
- bool isPredicated(const MachineInstr *MI) const override;
- bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr *MI) const override;
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-
- // Helper functions that check the opcode for status information
- bool isRegisterStore(const MachineInstr &MI) const;
- bool isRegisterLoad(const MachineInstr &MI) const;
-
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
- /// \brief Return the descriptor of the target-specific machine instruction
- /// that corresponds to the specified pseudo or native opcode.
- const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
- return get(pseudoToMCOpcode(Opcode));
- }
-
- ArrayRef<std::pair<int, const char *>>
- getSerializableTargetIndices() const override;
-
-//===---------------------------------------------------------------------===//
-// Pure virtual funtions to be implemented by sub-classes.
-//===---------------------------------------------------------------------===//
-
- virtual bool isMov(unsigned opcode) const = 0;
-
- /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
- /// \p Channel
- ///
- /// We model indirect addressing using a virtual address space that can be
- /// accesed with loads and stores. The "Indirect Address" is the memory
- /// address in this virtual address space that maps to the given \p RegIndex
- /// and \p Channel.
- virtual unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const = 0;
-
- /// \returns The register class to be used for loading and storing values
- /// from an "Indirect Address" .
- virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
-
- /// \brief Build instruction(s) for an indirect register write.
- ///
- /// \returns The instruction that performs the indirect register write
- virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const = 0;
-
- /// \brief Build instruction(s) for an indirect register read.
- ///
- /// \returns The instruction that performs the indirect register read
- virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const = 0;
-
- /// \brief Build a MOV instruction.
- virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const = 0;
-
/// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
/// equivalent opcode that writes \p Channels Channels.
int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
-
};
namespace AMDGPU {
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 575dfe413658..2b13bb9079ea 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -44,6 +44,11 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
// AMDGPU DAG Nodes
//
+def AMDGPUconstdata_ptr : SDNode<
+ "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
+ SDTCisVT<0, iPTR>]>
+>;
+
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
@@ -63,7 +68,7 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
-def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
@@ -183,6 +188,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
SDTypeProfile<0, 2, []>,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
@@ -209,6 +219,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
[]
>;
+def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
+ []
+>;
+
+def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+
def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
@@ -241,5 +261,8 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
//===----------------------------------------------------------------------===//
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 2a7ce6a47176..6761b4b5df95 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -12,7 +12,8 @@
//
//===----------------------------------------------------------------------===//
-class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+class AMDGPUInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : Instruction {
field bit isRegisterLoad = 0;
field bit isRegisterStore = 0;
@@ -23,15 +24,22 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio
let Pattern = pattern;
let Itinerary = NullALU;
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<64> SoftFail = 0;
+
+ let DecoderNamespace = Namespace;
+
let TSFlags{63} = isRegisterLoad;
let TSFlags{62} = isRegisterStore;
}
-class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
- : AMDGPUInst<outs, ins, asm, pattern> {
+class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern> {
field bits<32> Inst = 0xffffffff;
-
}
def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
@@ -41,6 +49,13 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+// 32-bit VALU immediate operand that uses the constant bus.
+def u32kimm : Operand<i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_KIMM32";
+ let PrintMethod = "printU32ImmOperand";
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
@@ -146,6 +161,17 @@ def COND_NULL : PatLeaf <
[{(void)N; return false;}]
>;
+
+//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{ return N->hasOneUse(); }]
+>;
+
//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
@@ -168,21 +194,58 @@ def truncstorei8_private : PrivateStore <truncstorei8>;
def truncstorei16_private : PrivateStore <truncstorei16>;
def store_private : PrivateStore <store>;
-def global_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
+class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
// Global address space loads
-def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+class GlobalLoad <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def global_load : GlobalLoad <load>;
+
+// Global address space stores
+class GlobalStore <SDPatternOperator op> : GlobalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def global_store : GlobalStore <store>;
+def global_store_atomic : GlobalStore<atomic_store>;
+
+
+class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
// Constant address space loads
-def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+def constant_load : ConstantLoad<load>;
+
+class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+// Local address space loads
+class LocalLoad <SDPatternOperator op> : LocalMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class LocalStore <SDPatternOperator op> : LocalMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
}]>;
+class FlatLoad <SDPatternOperator op> : FlatMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
(ld_node node:$ptr), [{
LoadSDNode *L = cast<LoadSDNode>(N);
@@ -196,29 +259,14 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi8_global : GlobalLoad <az_extloadi8>;
+def sextloadi8_global : GlobalLoad <sextloadi8>;
-def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
+def sextloadi8_constant : ConstantLoad <sextloadi8>;
-def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def sextloadi8_local : LocalLoad <sextloadi8>;
def extloadi8_private : PrivateLoad <az_extloadi8>;
def sextloadi8_private : PrivateLoad <sextloadi8>;
@@ -227,29 +275,14 @@ def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
-def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi16_global : GlobalLoad <az_extloadi16>;
+def sextloadi16_global : GlobalLoad <sextloadi16>;
-def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
+def sextloadi16_constant : ConstantLoad <sextloadi16>;
-def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+def sextloadi16_local : LocalLoad <sextloadi16>;
def extloadi16_private : PrivateLoad <az_extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
@@ -258,49 +291,20 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
-def az_extloadi32_global : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi32_global : GlobalLoad <az_extloadi32>;
-def az_extloadi32_flat : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def az_extloadi32_flat : FlatLoad <az_extloadi32>;
-def az_extloadi32_constant : PatFrag<(ops node:$ptr),
- (az_extloadi32 node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
+def az_extloadi32_constant : ConstantLoad <az_extloadi32>;
-def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
+def truncstorei8_global : GlobalStore <truncstorei8>;
+def truncstorei16_global : GlobalStore <truncstorei16>;
-def local_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
+def local_store : LocalStore <store>;
+def truncstorei8_local : LocalStore <truncstorei8>;
+def truncstorei16_local : LocalStore <truncstorei16>;
-def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
+def local_load : LocalLoad <load>;
class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
@@ -370,6 +374,12 @@ class global_binary_atomic_op<SDNode atomic_op> : PatFrag<
[{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]
>;
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+>;
+
def atomic_swap_global : global_binary_atomic_op<atomic_swap>;
def atomic_add_global : global_binary_atomic_op<atomic_load_add>;
def atomic_and_global : global_binary_atomic_op<atomic_load_and>;
@@ -381,6 +391,26 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_global_nortn : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_cmp_swap_global node:$ptr, node:$value),
+ [{ return SDValue(N, 0).use_empty(); }]
+>;
+
+def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>;
+def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>;
+def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>;
+def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>;
+def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>;
+def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>;
+def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
@@ -392,6 +422,7 @@ int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP32_NEG_ONE = 0xbf800000;
int FP32_ONE = 0x3f800000;
+int FP64_ONE = 0x3ff0000000000000;
}
def CONST : Constants;
@@ -570,6 +601,25 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
(BIT_ALIGN $src0, $src0, $src1)
>;
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class IntMed3Pat<Instruction med3Inst,
+ SDPatternOperator max,
+ SDPatternOperator max_oneuse,
+ SDPatternOperator min_oneuse> : Pat<
+ (max (min_oneuse i32:$src0, i32:$src1),
+ (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ (med3Inst $src0, $src1, $src2)
+>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+
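
A standalone check (not part of the patch) that the expression IntMed3Pat matches, max(min(x, y), min(max(x, y), z)), really computes the median of its three operands, which is what the med3 instructions provide:

#include <algorithm>
#include <cassert>

int main() {
  for (int X = -3; X <= 3; ++X)
    for (int Y = -3; Y <= 3; ++Y)
      for (int Z = -3; Z <= 3; ++Z) {
        const int Expr =
            std::max(std::min(X, Y), std::min(std::max(X, Y), Z));
        int Vals[3] = {X, Y, Z};
        std::sort(Vals, Vals + 3);
        assert(Expr == Vals[1]); // middle value of the three
      }
  return 0;
}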
// 24-bit arithmetic patterns
def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
@@ -587,13 +637,6 @@ def cvt_flr_i32_f32 : PatFrag <
[{ (void)N; return TM.Options.NoNaNsFPMath; }]
>;
-/*
-class UMUL24Pattern <Instruction UMUL24> : Pat <
- (mul U24:$x, U24:$y),
- (UMUL24 $x, $y)
->;
-*/
-
class IMad24Pat<Instruction Inst> : Pat <
(add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
(Inst $src0, $src1, $src2)
@@ -604,30 +647,6 @@ class UMad24Pat<Instruction Inst> : Pat <
(Inst $src0, $src1, $src2)
>;
-multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> {
- def _expand_imad24 : Pat <
- (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
- (AddInst (MulInst $src0, $src1), $src2)
- >;
-
- def _expand_imul24 : Pat <
- (AMDGPUmul_i24 i32:$src0, i32:$src1),
- (MulInst $src0, $src1)
- >;
-}
-
-multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
- def _expand_umad24 : Pat <
- (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
- (AddInst (MulInst $src0, $src1), $src2)
- >;
-
- def _expand_umul24 : Pat <
- (AMDGPUmul_u24 i32:$src0, i32:$src1),
- (MulInst $src0, $src1)
- >;
-}
-
class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
(fdiv FP_ONE, vt:$src),
(RcpInst $src)
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index e94bb6013d83..791872a9db40 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -20,46 +20,44 @@
using namespace llvm;
-#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-#include "AMDGPUGenIntrinsics.inc"
-#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-
AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
: TargetIntrinsicInfo() {}
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned numTys) const {
- static const char *const names[] = {
+static const char *const IntrinsicNameTable[] = {
#define GET_INTRINSIC_NAME_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_NAME_TABLE
- };
+};
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned numTys) const {
if (IntrID < Intrinsic::num_intrinsics) {
return nullptr;
}
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
"Invalid intrinsic ID");
- std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+ std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
return Result;
}
-unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name,
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
unsigned Len) const {
- if (!StringRef(Name, Len).startswith("llvm."))
+ StringRef Name(NameData, Len);
+ if (!Name.startswith("llvm."))
return 0; // All intrinsics start with 'llvm.'
-#define GET_FUNCTION_RECOGNIZER
-#include "AMDGPUGenIntrinsics.inc"
-#undef GET_FUNCTION_RECOGNIZER
- AMDGPUIntrinsic::ID IntrinsicID =
- (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
- IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
-
- if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
- return IntrinsicID;
+ // Look for a name match in our table. If the intrinsic is not overloaded,
+ // require an exact match. If it is overloaded, require a prefix match. The
+ // AMDGPU enum starts at Intrinsic::num_intrinsics.
+ int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
+ if (Idx >= 0) {
+ bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
+ return IsPrefixMatch == isOverloaded(Idx + 1)
+ ? Intrinsic::num_intrinsics + Idx
+ : 0;
}
+
return 0;
}
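
A small standalone sketch (helper name invented for illustration) of the match rule described in the comment above: a non-overloaded intrinsic such as llvm.AMDGPU.kill (llvm_float_ty only) must match its table entry exactly, while an overloaded one such as llvm.AMDGPU.clamp (llvm_anyfloat_ty) carries a type suffix and therefore only matches as a strict prefix.

#include <cassert>
#include <cstring>
#include <string>

// Accept a query name against a table entry, mirroring the final filter in
// AMDGPUIntrinsicInfo::lookupName: exact match for non-overloaded
// intrinsics, strict-prefix match for overloaded ones.
static bool accepts(const std::string &Query, const char *TableEntry,
                    bool IsOverloaded) {
  const size_t Len = std::strlen(TableEntry);
  if (Query.compare(0, Len, TableEntry) != 0)
    return false;                                // not even a prefix
  const bool IsPrefixMatch = Query.size() > Len; // has a mangling suffix
  return IsPrefixMatch == IsOverloaded;
}

int main() {
  assert(accepts("llvm.AMDGPU.kill", "llvm.AMDGPU.kill", false));
  assert(!accepts("llvm.AMDGPU.kill.f32", "llvm.AMDGPU.kill", false));
  assert(accepts("llvm.AMDGPU.clamp.f32", "llvm.AMDGPU.clamp", true));
  assert(!accepts("llvm.AMDGPU.clamp", "llvm.AMDGPU.clamp", true));
  return 0;
}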
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index 4c95b5ec0974..f4173929259c 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -11,8 +11,8 @@
/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -31,7 +31,7 @@ enum ID {
} // end namespace AMDGPUIntrinsic
-class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
+class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo();
std::string getName(unsigned IntrId, Type **Tys = nullptr,
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index 1de3546485b1..2127391f18e7 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -12,79 +12,26 @@
//===----------------------------------------------------------------------===//
let TargetPrefix = "AMDGPU", isTarget = 1 in {
-
- def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
- def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- // This is named backwards (instead of rsq_legacy) so we don't have
- // to define it with the public builtins intrinsics. This is a
- // workaround for how intrinsic names are parsed. If the name is
- // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
- // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name.
- def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
-
- def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
- def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ // Deprecated in favor of expanded bit operations
def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
- def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
-}
-
-// Legacy names for compatibility.
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
-}
-let TargetPrefix = "TGSI", isTarget = 1 in {
+ // Deprecated in favor of llvm.amdgcn.rsq
+ def int_AMDGPU_rsq : Intrinsic<
+ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
+ >;
- def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
+ // Deprecated in favor of llvm.amdgcn.read.workdim
+ def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
}
include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index dfc652f31da5..ad8d3e4d3545 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -15,9 +15,9 @@
#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -37,8 +37,14 @@
using namespace llvm;
AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
- Ctx(ctx), ST(st)
-{ }
+ Ctx(ctx), ST(st) { }
+
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
+ switch (MOFlags) {
+ default: return MCSymbolRefExpr::VK_None;
+ case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL;
+ }
+}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -70,11 +76,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName()));
- MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
+ const MCExpr *SymExpr =
+ MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
+ const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
+ MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ MCOp = MCOperand::createExpr(Expr);
break;
}
case MachineOperand::MO_ExternalSymbol: {
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
+ Sym->setExternal(true);
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
MCOp = MCOperand::createExpr(Expr);
break;
@@ -88,13 +99,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI);
-#ifdef _DEBUG
StringRef Err;
- if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
- errs() << "Warning: Illegal instruction detected: " << Err << "\n";
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ C.emitError("Illegal instruction detected: " + Err);
MI->dump();
}
-#endif
+
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
@@ -103,6 +114,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
++I;
}
} else {
+ // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+ // terminator instructions and should only be printed as comments.
+ if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+ if (isVerbose()) {
+ SmallVector<char, 16> BBStr;
+ raw_svector_ostream Str(BBStr);
+
+ const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+ const MCSymbolRefExpr *Expr
+ = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ Expr->print(Str, MAI);
+ OutStreamer->emitRawComment(" mask branch " + BBStr);
+ }
+
+ return;
+ }
+
+ if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" return");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
@@ -114,10 +148,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
raw_string_ostream DisasmStream(DisasmLine);
AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
- *MF->getSubtarget().getInstrInfo(),
- *MF->getSubtarget().getRegisterInfo());
- InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(),
- MF->getSubtarget());
+ *STI.getInstrInfo(),
+ *STI.getRegisterInfo());
+ InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
// Disassemble instruction/operands to hex representation.
SmallVector<MCFixup, 4> Fixups;
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index d322fe072b2b..957dcd0de8ef 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
-#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 54137177e4c0..44516dab04f1 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,8 +1,5 @@
#include "AMDGPUMachineFunction.h"
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
+
using namespace llvm;
// Pin the vtable to this file.
@@ -10,11 +7,17 @@ void AMDGPUMachineFunction::anchor() {}
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
- ShaderType(ShaderType::COMPUTE),
+ KernArgSize(0),
+ MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
ScratchSize(0),
- IsKernel(true) {
+ IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL ||
+ MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+{
+}
- ShaderType = AMDGPU::getShaderType(*MF.getFunction());
+bool AMDGPUMachineFunction::isKernel() const
+{
+ return IsKernel;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 46fcee874887..6b31f63e1a9d 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -1,4 +1,4 @@
-//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -6,12 +6,9 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-/// \file
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
-#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
#include <map>
@@ -19,11 +16,25 @@
namespace llvm {
class AMDGPUMachineFunction : public MachineFunctionInfo {
+ uint64_t KernArgSize;
+ unsigned MaxKernArgAlign;
+
virtual void anchor();
- unsigned ShaderType;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
+
+ uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+ assert(isPowerOf2_32(Align));
+ KernArgSize = alignTo(KernArgSize, Align);
+
+ uint64_t Result = KernArgSize;
+ KernArgSize += Size;
+
+ MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
+ return Result;
+ }
+
/// A map to keep track of local memory objects and their offsets within
/// the local memory space.
std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
@@ -33,14 +44,7 @@ public:
/// Start of implicit kernel args
unsigned ABIArgOffset;
- unsigned getShaderType() const {
- return ShaderType;
- }
-
- bool isKernel() const {
- // FIXME: Assume everything is a kernel until function calls are supported.
- return true;
- }
+ bool isKernel() const;
unsigned ScratchSize;
bool IsKernel;
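As a worked example of the allocateKernArg bookkeeping added above, here is a standalone sketch of the same align-then-bump logic (plain C++ rather than the LLVM class; alignTo is spelled out as the usual power-of-two round-up):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Round the running size up to the argument's alignment, hand out that offset,
// then advance by the argument's size, as allocateKernArg does above.
struct KernArgAllocator {
  uint64_t KernArgSize = 0;
  unsigned MaxKernArgAlign = 0;

  uint64_t allocate(uint64_t Size, unsigned Align) {
    assert(Align != 0 && (Align & (Align - 1)) == 0 && "power of two");
    KernArgSize = (KernArgSize + Align - 1) & ~uint64_t(Align - 1);
    uint64_t Offset = KernArgSize;
    KernArgSize += Size;
    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
    return Offset;
  }
};

int main() {
  KernArgAllocator A;
  assert(A.allocate(4, 4) == 0); // i32 at offset 0
  assert(A.allocate(1, 1) == 4); // i8 packed right after it
  assert(A.allocate(8, 8) == 8); // i64 padded up to offset 8
  assert(A.KernArgSize == 16 && A.MaxKernArgAlign == 8);
}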
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
index 554bf1da81f5..8bc7b53435be 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -25,7 +25,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/Passes.h"
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 87d50d587059..775463809634 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -16,7 +16,8 @@
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,79 +27,317 @@ using namespace llvm;
namespace {
-class AMDGPUPromoteAlloca : public FunctionPass,
- public InstVisitor<AMDGPUPromoteAlloca> {
-
- static char ID;
+// FIXME: This can create globals so should be a module pass.
+class AMDGPUPromoteAlloca : public FunctionPass {
+private:
+ const TargetMachine *TM;
Module *Mod;
- const AMDGPUSubtarget &ST;
- int LocalMemAvailable;
+ const DataLayout *DL;
+ MDNode *MaxWorkGroupSizeRange;
+
+ // FIXME: This should be per-kernel.
+ uint32_t LocalMemLimit;
+ uint32_t CurrentLocalMemUsage;
+
+ bool IsAMDGCN;
+ bool IsAMDHSA;
+
+ std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
+ Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
+
+ /// BaseAlloca is the alloca root the search started from.
+ /// Val may be that alloca or a recursive user of it.
+ bool collectUsesWithPtrTypes(Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const;
+
+ /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
+ /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
+ /// Returns true if both operands are derived from the same alloca. Val should
+ /// be the same value as one of the input operands of UseInst.
+ bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+ Instruction *UseInst,
+ int OpIdx0, int OpIdx1) const;
public:
- AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
- LocalMemAvailable(0) { }
+ static char ID;
+
+ AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
+ FunctionPass(ID),
+ TM(TM_),
+ Mod(nullptr),
+ DL(nullptr),
+ MaxWorkGroupSizeRange(nullptr),
+ LocalMemLimit(0),
+ CurrentLocalMemUsage(0),
+ IsAMDGCN(false),
+ IsAMDHSA(false) { }
+
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
- const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
- void visitAlloca(AllocaInst &I);
+
+ const char *getPassName() const override {
+ return "AMDGPU Promote Alloca";
+ }
+
+ void handleAlloca(AllocaInst &I);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
+INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+
+char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+
+
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+ if (!TM)
+ return false;
+
Mod = &M;
+ DL = &Mod->getDataLayout();
+
+ // The maximum workitem id.
+ //
+ // FIXME: Should get as subtarget property. Usually runtime enforced max is
+ // 256.
+ MDBuilder MDB(Mod->getContext());
+ MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
+
+ const Triple &TT = TM->getTargetTriple();
+
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+ if (!TM || skipFunction(F))
+ return false;
- FunctionType *FTy = F.getFunctionType();
-
- LocalMemAvailable = ST.getLocalMemorySize();
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+ FunctionType *FTy = F.getFunctionType();
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
- for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
- Type *ParamTy = FTy->getParamType(i);
- if (ParamTy->isPointerTy() &&
- ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LocalMemAvailable = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
- break;
+ return false;
}
}
- if (LocalMemAvailable > 0) {
- // Check how much local memory is being used by global objects
- for (Module::global_iterator I = Mod->global_begin(),
- E = Mod->global_end(); I != E; ++I) {
- GlobalVariable *GV = &*I;
- PointerType *GVTy = GV->getType();
- if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use)
continue;
- for (Value::use_iterator U = GV->use_begin(),
- UE = GV->use_end(); U != UE; ++U) {
- Instruction *Use = dyn_cast<Instruction>(*U);
- if (!Use)
- continue;
- if (Use->getParent()->getParent() == &F)
- LocalMemAvailable -=
- Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break;
}
}
}
- LocalMemAvailable = std::max(0, LocalMemAvailable);
- DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
- visit(F);
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
- return false;
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors.
+ unsigned OccupancyHint
+ = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
+ if (OccupancyHint == 0)
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount;
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ BasicBlock &EntryBB = *F.begin();
+ for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I);
+
+ ++I;
+ if (AI)
+ handleAlloca(*AI);
+ }
+
+ return true;
+}
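To make the limit computation above concrete, here is a rough standalone walk-through of the same arithmetic (plain C++, not the pass itself). The tier values are copied from AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount later in this patch and appear to be 65536 / (4 * NWaves) rounded down; the 65536 entry used for the default case and the starting usage of 3000 bytes are assumptions of this sketch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// LDS byte budget for a given wave count (SI numbers from the patch below).
static unsigned maxLDSWithWaveCount(unsigned NWaves) {
  static const unsigned Tiers[11] = {0,    65536, 8192, 5461, 4096, 3276,
                                     2730, 2340,  2048, 1820, 1638};
  return (NWaves >= 1 && NWaves <= 10) ? Tiers[NWaves] : 65536;
}

// Inverse mapping, mirroring getOccupancyWithLocalMemSize.
static unsigned occupancyWithLDS(uint32_t Bytes) {
  for (unsigned Waves = 10; Waves > 1; --Waves)
    if (Bytes <= maxLDSWithWaveCount(Waves))
      return Waves;
  return 1;
}

int main() {
  uint32_t CurrentLocalMemUsage = 3000; // bytes already used by module globals
  unsigned OccupancyHint = 7;           // default when no attribute is present

  // Clamp the hint by what the existing usage already allows, then round the
  // limit up to that occupancy tier, exactly as runOnFunction does above.
  unsigned MaxOccupancy =
      std::min(OccupancyHint, occupancyWithLDS(CurrentLocalMemUsage)); // 5
  unsigned LocalMemLimit = maxLDSWithWaveCount(MaxOccupancy);          // 3276

  assert(MaxOccupancy == 5 && LocalMemLimit == 3276);
  assert(LocalMemLimit - CurrentLocalMemUsage == 276); // left for promotion
}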
+
+std::pair<Value *, Value *>
+AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+ if (!IsAMDHSA) {
+ Function *LocalSizeYFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
+ Function *LocalSizeZFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
+
+ CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
+ CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+
+ LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ return std::make_pair(LocalSizeY, LocalSizeZ);
+ }
+
+ // We must read the size out of the dispatch pointer.
+ assert(IsAMDGCN);
+
+ // We are indexing into this struct, and want to extract the workgroup_size_*
+ // fields.
+ //
+ // typedef struct hsa_kernel_dispatch_packet_s {
+ // uint16_t header;
+ // uint16_t setup;
+ // uint16_t workgroup_size_x ;
+ // uint16_t workgroup_size_y;
+ // uint16_t workgroup_size_z;
+ // uint16_t reserved0;
+ // uint32_t grid_size_x ;
+ // uint32_t grid_size_y ;
+ // uint32_t grid_size_z;
+ //
+ // uint32_t private_segment_size;
+ // uint32_t group_segment_size;
+ // uint64_t kernel_object;
+ //
+ // #ifdef HSA_LARGE_MODEL
+ // void *kernarg_address;
+ // #elif defined HSA_LITTLE_ENDIAN
+ // void *kernarg_address;
+ // uint32_t reserved1;
+ // #else
+ // uint32_t reserved1;
+ // void *kernarg_address;
+ // #endif
+ // uint64_t reserved2;
+ // hsa_signal_t completion_signal; // uint64_t wrapper
+ // } hsa_kernel_dispatch_packet_t
+ //
+ Function *DispatchPtrFn
+ = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+ CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+
+ // Size of the dispatch packet struct.
+ DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+
+ Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+ Value *CastDispatchPtr = Builder.CreateBitCast(
+ DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+
+ // We could do a single 64-bit load here, but it's likely that the basic
+ // 32-bit and extract sequence is already present, and it is probably easier
+ // to CSE this. The loads should be mergable later anyway.
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+
+ MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
+ LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ // Extract y component. Upper half of LoadZU should be zero already.
+ Value *Y = Builder.CreateLShr(LoadXY, 16);
+
+ return std::make_pair(Y, LoadZU);
+}
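The two 32-bit loads above depend on the field offsets implied by the hsa_kernel_dispatch_packet_s layout quoted in the comment: workgroup_size_x and workgroup_size_y share the i32 at byte offset 4, and workgroup_size_z sits in the low half of the i32 at byte offset 8. A standalone little-endian sketch of the same extraction from a raw packet follows; the packet contents in main are made up for illustration, and the explicit mask on z only makes the "reserved0 is zero" expectation visible.

#include <cassert>
#include <cstdint>
#include <cstring>

// Mirrors getLocalSizeYZ: one i32 load at byte offset 4 (x low, y high) and
// one i32 load at byte offset 8 (z low, reserved0 high).
static void readLocalSizeYZ(const uint8_t *Packet, uint32_t &Y, uint32_t &Z) {
  uint32_t XY, ZU;
  std::memcpy(&XY, Packet + 4, 4); // i32 element 1 of the packet
  std::memcpy(&ZU, Packet + 8, 4); // i32 element 2 of the packet
  Y = XY >> 16;     // upper half of the first load
  Z = ZU & 0xffffu; // the IR keeps the full load and relies on reserved0 == 0
}

int main() {
  // Hypothetical packet: header and setup left zero, then the uint16_t
  // workgroup sizes x=64, y=2, z=1 starting at byte offset 4 (little endian).
  uint8_t Packet[16] = {};
  const uint16_t Sizes[3] = {64, 2, 1};
  std::memcpy(Packet + 4, Sizes, sizeof(Sizes));

  uint32_t Y, Z;
  readLocalSizeYZ(Packet, Y, Z);
  assert(Y == 2 && Z == 1);
}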
+
+Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+ Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
+
+ switch (N) {
+ case 0:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
+ : Intrinsic::r600_read_tidig_x;
+ break;
+ case 1:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
+ : Intrinsic::r600_read_tidig_y;
+ break;
+
+ case 2:
+ IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
+ : Intrinsic::r600_read_tidig_z;
+ break;
+ default:
+ llvm_unreachable("invalid dimension");
+ }
+
+ Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
+ CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+ CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+ return CI;
}
static VectorType *arrayTypeToVecType(Type *ArrayTy) {
@@ -151,17 +390,16 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
- Type *AllocaTy = Alloca->getAllocatedType();
+ ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
- DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
+ DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
- if (!AllocaTy->isArrayTy() ||
- AllocaTy->getArrayElementType()->isVectorTy() ||
- AllocaTy->getArrayNumElements() > 4) {
-
- DEBUG(dbgs() << " Cannot convert type to vector");
+ if (!AllocaTy ||
+ AllocaTy->getElementType()->isVectorTy() ||
+ AllocaTy->getNumElements() > 4) {
+ DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
@@ -200,9 +438,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
DEBUG(dbgs() << " Converting alloca to vector "
<< *AllocaTy << " -> " << *VectorTy << '\n');
- for (std::vector<Value*>::iterator I = WorkList.begin(),
- E = WorkList.end(); I != E; ++I) {
- Instruction *Inst = cast<Instruction>(*I);
+ for (Value *V : WorkList) {
+ Instruction *Inst = cast<Instruction>(V);
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
@@ -239,44 +476,163 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
return true;
}
-static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
- bool Success = true;
+static bool isCallPromotable(CallInst *CI) {
+ // TODO: We might be able to handle some cases where the callee is a
+ // constantexpr bitcast of a function.
+ if (!CI->getCalledFunction())
+ return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ case Intrinsic::objectsize:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+ Value *Val,
+ Instruction *Inst,
+ int OpIdx0,
+ int OpIdx1) const {
+ // Figure out which operand is the one we might not be promoting.
+ Value *OtherOp = Inst->getOperand(OpIdx0);
+ if (Val == OtherOp)
+ OtherOp = Inst->getOperand(OpIdx1);
+
+ if (isa<ConstantPointerNull>(OtherOp))
+ return true;
+
+ Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+ if (!isa<AllocaInst>(OtherObj))
+ return false;
+
+ // TODO: We should be able to replace undefs with the right pointer type.
+
+ // TODO: If we know the other base object is another promotable
+ // alloca, not necessarily this alloca, we can do this. The
+ // important part is both must have the same address space at
+ // the end.
+ if (OtherObj != BaseAlloca) {
+ DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
+ Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const {
+
for (User *User : Val->users()) {
- if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+ if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
+
if (CallInst *CI = dyn_cast<CallInst>(User)) {
- // TODO: We might be able to handle some cases where the callee is a
- // constantexpr bitcast of a function.
- if (!CI->getCalledFunction())
+ if (!isCallPromotable(CI))
return false;
WorkList.push_back(User);
continue;
}
- // FIXME: Correctly handle ptrtoint instructions.
- Instruction *UseInst = dyn_cast<Instruction>(User);
- if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
+ Instruction *UseInst = cast<Instruction>(User);
+ if (UseInst->getOpcode() == Instruction::PtrToInt)
return false;
- if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
+ if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
+ if (LI->isVolatile())
+ return false;
+
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
+ if (SI->isVolatile())
+ return false;
+
// Reject if the stored value is not the pointer operand.
if (SI->getPointerOperand() != Val)
return false;
+ } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
+ if (RMW->isVolatile())
+ return false;
+ } else if (AtomicCmpXchgInst *CAS
+ = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
+ if (CAS->isVolatile())
+ return false;
+ }
+
+ // Only promote a select if we know that the other select operand
+ // is from another pointer that will also be promoted.
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+ return false;
+
+ // May need to rewrite constant operands.
+ WorkList.push_back(ICmp);
}
if (!User->getType()->isPointerTy())
continue;
- WorkList.push_back(User);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
+ // Be conservative if an address could be computed outside the bounds of
+ // the alloca.
+ if (!GEP->isInBounds())
+ return false;
+ }
- Success &= collectUsesWithPtrTypes(User, WorkList);
+ // Only promote a select if we know that the other select operand is from
+ // another pointer that will also be promoted.
+ if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
+ return false;
+ }
+
+ // Repeat for phis.
+ if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+ // TODO: Handle more complex cases. We should be able to replace loops
+ // over arrays.
+ switch (Phi->getNumIncomingValues()) {
+ case 1:
+ break;
+ case 2:
+ if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+ return false;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ WorkList.push_back(User);
+ if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
+ return false;
}
- return Success;
+
+ return true;
}
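The recursion above is all-or-nothing: a single unacceptable use anywhere in the chain (a volatile access, a ptrtoint, an out-of-bounds GEP, and so on) rejects promotion of the whole alloca. A toy standalone sketch of that worklist walk, using a made-up Node type in place of LLVM's use lists:

#include <algorithm>
#include <cassert>
#include <vector>

// Made-up stand-in for a Value and its users.
struct Node {
  bool Promotable; // false models e.g. a volatile use or a ptrtoint
  std::vector<Node *> Users;
};

// Mirrors collectUsesWithPtrTypes: either every transitive user is collected
// into WorkList and true is returned, or false is returned and the caller
// gives up on the whole alloca.
static bool collectUses(Node *Val, std::vector<Node *> &WorkList) {
  for (Node *User : Val->Users) {
    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
      continue;
    if (!User->Promotable)
      return false;
    WorkList.push_back(User);
    if (!collectUses(User, WorkList))
      return false;
  }
  return true;
}

int main() {
  Node Leaf{true, {}};
  Node Bad{false, {}}; // one bad use...
  Node Root{true, {&Leaf, &Bad}};
  std::vector<Node *> WL;
  assert(!collectUses(&Root, WL)); // ...rejects the entire promotion
}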
-void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
- if (!I.isStaticAlloca())
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
return;
IRBuilder<> Builder(&I);
@@ -286,95 +642,144 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I))
+ if (tryPromoteAllocaToVector(&I)) {
+    DEBUG(dbgs() << " alloca promoted to vector.\n");
+ return;
+ }
+
+ const Function &ContainingFunction = *I.getParent()->getParent();
+
+ // Don't promote the alloca to LDS for shader calling conventions as the work
+ // item ID intrinsics are not supported for these calling conventions.
+ // Furthermore not all LDS is available for some of the stages.
+ if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
return;
- DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+ // FIXME: We should also try to get this value from the reqd_work_group_size
+ // function attribute if it is available.
+ unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
- // FIXME: This is the maximum work group size. We should try to get
- // value from the reqd_work_group_size function attribute if it is
- // available.
- unsigned WorkGroupSize = 256;
- int AllocaSize =
- WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
+ const DataLayout &DL = Mod->getDataLayout();
- if (AllocaSize > LocalMemAvailable) {
- DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+ unsigned Align = I.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(I.getAllocatedType());
+
+ // FIXME: This computed padding is likely wrong since it depends on inverse
+ // usage order.
+ //
+ // FIXME: It is also possible that if we're allowed to use all of the memory
+  // we could end up using more than the maximum due to alignment padding.
+
+ uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+ uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+ NewSize += AllocSize;
+
+ if (NewSize > LocalMemLimit) {
+ DEBUG(dbgs() << " " << AllocSize
+ << " bytes of local memory not available to promote\n");
return;
}
+ CurrentLocalMemUsage = NewSize;
+
std::vector<Value*> WorkList;
- if (!collectUsesWithPtrTypes(&I, WorkList)) {
+ if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
- LocalMemAvailable -= AllocaSize;
- Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+ Function *F = I.getParent()->getParent();
+
+ Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
- *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
- GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
-
- FunctionType *FTy = FunctionType::get(
- Type::getInt32Ty(Mod->getContext()), false);
- AttributeSet AttrSet;
- AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
-
- Value *ReadLocalSizeY = Mod->getOrInsertFunction(
- "llvm.r600.read.local.size.y", FTy, AttrSet);
- Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
- "llvm.r600.read.local.size.z", FTy, AttrSet);
- Value *ReadTIDIGX = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.x", FTy, AttrSet);
- Value *ReadTIDIGY = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.y", FTy, AttrSet);
- Value *ReadTIDIGZ = Mod->getOrInsertFunction(
- "llvm.r600.read.tidig.z", FTy, AttrSet);
-
- Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {});
- Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {});
- Value *TIdX = Builder.CreateCall(ReadTIDIGX, {});
- Value *TIdY = Builder.CreateCall(ReadTIDIGY, {});
- Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {});
-
- Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+ *Mod, GVTy, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GVTy),
+ Twine(F->getName()) + Twine('.') + I.getName(),
+ nullptr,
+ GlobalVariable::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(I.getAlignment());
+
+ Value *TCntY, *TCntZ;
+
+ std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+ Value *TIdX = getWorkitemID(Builder, 0);
+ Value *TIdY = getWorkitemID(Builder, 1);
+ Value *TIdZ = getWorkitemID(Builder, 2);
+
+ Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
- Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+ Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
TID = Builder.CreateAdd(TID, TIdZ);
- std::vector<Value*> Indices;
- Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
- Indices.push_back(TID);
+ Value *Indices[] = {
+ Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+ TID
+ };
- Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
+ Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
I.mutateType(Offset->getType());
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
- for (std::vector<Value*>::iterator i = WorkList.begin(),
- e = WorkList.end(); i != e; ++i) {
- Value *V = *i;
+ for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
- Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
+ Value *Src0 = CI->getOperand(0);
+ Type *EltTy = Src0->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+ if (isa<ConstantPointerNull>(CI->getOperand(0)))
+ CI->setOperand(0, ConstantPointerNull::get(NewTy));
+
+ if (isa<ConstantPointerNull>(CI->getOperand(1)))
+ CI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+ continue;
+ }
// The operand's value should be corrected on its own.
if (isa<AddrSpaceCastInst>(V))
continue;
+ Type *EltTy = V->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
V->mutateType(NewTy);
+
+ // Adjust the types of any constant operands.
+ if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+ if (isa<ConstantPointerNull>(SI->getOperand(1)))
+ SI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+ if (isa<ConstantPointerNull>(SI->getOperand(2)))
+ SI->setOperand(2, ConstantPointerNull::get(NewTy));
+ } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
+ Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+ }
+ }
+
continue;
}
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
if (!Intr) {
+ // FIXME: What is this for? It doesn't make sense to promote arbitrary
+ // function calls. If the call is to a defined function that can also be
+ // promoted, we should be able to do this once that function is also
+ // rewritten.
+
std::vector<Type*> ArgTypes;
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
ArgIdx != ArgEnd; ++ArgIdx) {
@@ -405,6 +810,14 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::memmove: {
+ MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
+ MemMove->getLength(), MemMove->getAlignment(),
+ MemMove->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
@@ -413,6 +826,28 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ Intr->eraseFromParent();
+ // FIXME: I think the invariant marker should still theoretically apply,
+ // but the intrinsics need to be changed to accept pointers with any
+ // address space.
+ continue;
+ case Intrinsic::objectsize: {
+ Value *Src = Intr->getOperand(0);
+ Type *SrcTy = Src->getType()->getPointerElementType();
+ Function *ObjectSize = Intrinsic::getDeclaration(Mod,
+ Intrinsic::objectsize,
+ { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+ );
+
+ CallInst *NewCall
+ = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+ Intr->replaceAllUsesWith(NewCall);
+ Intr->eraseFromParent();
+ continue;
+ }
default:
Intr->dump();
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
@@ -420,6 +855,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
}
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
- return new AMDGPUPromoteAlloca(ST);
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
+ return new AMDGPUPromoteAlloca(TM);
}
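For the index computation in handleAlloca above, each workitem addresses its own element of the new [WorkGroupSize x T] LDS array using the flat index tid_x * (cnt_y * cnt_z) + tid_y * cnt_z + tid_z. A standalone check of that formula under an assumed 4x2x2 workgroup (the real counts come from the local-size and workitem-id intrinsics):

#include <cassert>

// Flat per-workitem slot index, matching the Tmp0/Tmp1/TID sequence above.
static unsigned flatWorkitemID(unsigned TIdX, unsigned TIdY, unsigned TIdZ,
                               unsigned TCntY, unsigned TCntZ) {
  return TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ;
}

int main() {
  // Assumed 4x2x2 workgroup (16 workitems): every workitem maps to a unique
  // slot in [0, 16), so no two lanes alias each other's promoted storage.
  bool Seen[16] = {};
  for (unsigned X = 0; X < 4; ++X)
    for (unsigned Y = 0; Y < 2; ++Y)
      for (unsigned Z = 0; Z < 2; ++Z) {
        unsigned ID = flatWorkitemID(X, Y, Z, 2, 2);
        assert(ID < 16 && !Seen[ID]);
        Seen[ID] = true;
      }
}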
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 3ca0eca3417f..941f2d8a468a 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -24,20 +24,14 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-const MCPhysReg*
-AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *) const {
return &CalleeSavedReg;
}
-void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj,
- unsigned FIOperandNum,
- RegScavenger *RS) const {
- llvm_unreachable("Subroutines not supported yet");
-}
-
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return AMDGPU::NoRegister;
}
@@ -54,10 +48,5 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
return SubRegs[Channel];
}
-unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
-
- return getSubRegFromChannel(IndirectIndex);
-}
-
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 0344834328f6..ef51aad95dce 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -13,10 +13,9 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#include "llvm/ADT/BitVector.h"
#include "llvm/Target/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -29,30 +28,14 @@ class AMDGPUSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- static const MCPhysReg CalleeSavedReg;
-
AMDGPURegisterInfo();
- BitVector getReservedRegs(const MachineFunction &MF) const override {
- assert(!"Unimplemented"); return BitVector();
- }
-
- virtual unsigned getHWRegIndex(unsigned Reg) const {
- assert(!"Unimplemented"); return 0;
- }
-
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
- unsigned FIOperandNum,
- RegScavenger *RS) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
-
- unsigned getIndirectSubReg(unsigned IndirectIndex) const;
-
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
new file mode 100644
index 000000000000..40f639434507
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -0,0 +1,138 @@
+//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and structure types used by runtime metadata.
+///
+/// The runtime requests certain information (metadata) about kernels in order
+/// to execute them and to answer queries about them.
+/// The metadata is represented as a byte stream in an ELF section of a
+/// binary (code object). The byte stream consists of key-value pairs.
+/// Each key is an 8 bit unsigned integer. Each value can be an integer,
+/// a string, or a stream of key-value pairs. There are 3 levels of key-value
+/// pair streams. At the beginning of the ELF section is the top level
+/// key-value pair stream. A kernel-level key-value pair stream starts after
+/// encountering KeyKernelBegin and ends immediately before encountering
+/// KeyKernelEnd. A kernel-argument-level key-value pair stream starts
+/// after encountering KeyArgBegin and ends immediately before encountering
+/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top
+/// level key-value pair stream. A kernel-argument-level key-value pair stream
+/// can only appear in a kernel-level key-value pair stream.
+///
+/// The format should be kept backward compatible. New enum values and bit
+/// fields should be appended at the end. It is suggested to bump up the
+/// revision number whenever the format changes and to document the change
+/// for that revision in this header.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+
+#include <stdint.h>
+
+namespace AMDGPU {
+
+namespace RuntimeMD {
+
+ // Version and revision of runtime metadata
+ const unsigned char MDVersion = 1;
+ const unsigned char MDRevision = 0;
+
+ // ELF section name containing runtime metadata
+ const char SectionName[] = ".AMDGPU.runtime_metadata";
+
+ // Enumeration values of keys in runtime metadata.
+ enum Key {
+ KeyNull = 0, // Place holder. Ignored when encountered
+ KeyMDVersion = 1, // Runtime metadata version
+ KeyLanguage = 2, // Language
+ KeyLanguageVersion = 3, // Language version
+ KeyKernelBegin = 4, // Beginning of kernel-level stream
+ KeyKernelEnd = 5, // End of kernel-level stream
+ KeyKernelName = 6, // Kernel name
+ KeyArgBegin = 7, // Beginning of kernel-arg-level stream
+ KeyArgEnd = 8, // End of kernel-arg-level stream
+ KeyArgSize = 9, // Kernel arg size
+ KeyArgAlign = 10, // Kernel arg alignment
+    KeyArgTypeName = 11, // Kernel argument type name
+    KeyArgName = 12, // Kernel argument name
+ KeyArgTypeKind = 13, // Kernel argument type kind
+ KeyArgValueType = 14, // Kernel argument value type
+ KeyArgAddrQual = 15, // Kernel argument address qualifier
+ KeyArgAccQual = 16, // Kernel argument access qualifier
+ KeyArgIsConst = 17, // Kernel argument is const qualified
+ KeyArgIsRestrict = 18, // Kernel argument is restrict qualified
+ KeyArgIsVolatile = 19, // Kernel argument is volatile qualified
+ KeyArgIsPipe = 20, // Kernel argument is pipe qualified
+ KeyReqdWorkGroupSize = 21, // Required work group size
+ KeyWorkGroupSizeHint = 22, // Work group size hint
+ KeyVecTypeHint = 23, // Vector type hint
+ KeyKernelIndex = 24, // Kernel index for device enqueue
+ KeySGPRs = 25, // Number of SGPRs
+ KeyVGPRs = 26, // Number of VGPRs
+ KeyMinWavesPerSIMD = 27, // Minimum number of waves per SIMD
+ KeyMaxWavesPerSIMD = 28, // Maximum number of waves per SIMD
+ KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits
+ KeyMaxWorkGroupSize = 30, // Maximum work group size
+ KeyNoPartialWorkGroups = 31, // No partial work groups
+ };
+
+ enum Language : uint8_t {
+ OpenCL_C = 0,
+ HCC = 1,
+ OpenMP = 2,
+ OpenCL_CPP = 3,
+};
+
+ enum LanguageVersion : uint16_t {
+ V100 = 100,
+ V110 = 110,
+ V120 = 120,
+ V200 = 200,
+ V210 = 210,
+ };
+
+ namespace KernelArg {
+ enum TypeKind : uint8_t {
+ Value = 0,
+ Pointer = 1,
+ Image = 2,
+ Sampler = 3,
+ Queue = 4,
+ };
+
+ enum ValueType : uint16_t {
+ Struct = 0,
+ I8 = 1,
+ U8 = 2,
+ I16 = 3,
+ U16 = 4,
+ F16 = 5,
+ I32 = 6,
+ U32 = 7,
+ F32 = 8,
+ I64 = 9,
+ U64 = 10,
+ F64 = 11,
+ };
+
+ enum AccessQualifer : uint8_t {
+ None = 0,
+ ReadOnly = 1,
+ WriteOnly = 2,
+ ReadWrite = 3,
+ };
+ } // namespace KernelArg
+} // namespace RuntimeMD
+} // namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
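The file comment above pins down the nesting rules for the three levels of key-value streams but not the value encodings, so a full reader cannot be written from this header alone. As a minimal consumer-side sketch, the following checks only the begin/end structure of a key sequence and treats values as opaque (an assumption of this sketch):

#include <cassert>
#include <cstdint>
#include <vector>

// Key values copied from the enum above; only the begin/end markers matter
// for the nesting check.
enum : uint8_t {
  KeyKernelBegin = 4,
  KeyKernelEnd = 5,
  KeyArgBegin = 7,
  KeyArgEnd = 8
};

// Enforces the structural rule from the comment: kernel-level streams may only
// appear at the top level, and kernel-argument-level streams only inside a
// kernel-level stream.
static bool nestsProperly(const std::vector<uint8_t> &Keys) {
  int Depth = 0; // 0 = top level, 1 = kernel, 2 = kernel argument
  for (uint8_t K : Keys) {
    switch (K) {
    case KeyKernelBegin: if (Depth != 0) return false; Depth = 1; break;
    case KeyKernelEnd:   if (Depth != 1) return false; Depth = 0; break;
    case KeyArgBegin:    if (Depth != 1) return false; Depth = 2; break;
    case KeyArgEnd:      if (Depth != 2) return false; Depth = 1; break;
    default: break; // an ordinary key-value pair at the current level
    }
  }
  return Depth == 0;
}

int main() {
  assert(nestsProperly({KeyKernelBegin, KeyArgBegin, KeyArgEnd, KeyKernelEnd}));
  assert(!nestsProperly({KeyArgBegin, KeyArgEnd})); // args outside a kernel
}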
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 7d70fa73da29..10fa9cf46737 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
-#include "R600MachineScheduler.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
@@ -32,6 +31,8 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
+AMDGPUSubtarget::~AMDGPUSubtarget() {}
+
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
@@ -44,14 +45,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+ SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,";
FullFS += FS;
- if (GPU == "" && TT.getArch() == Triple::amdgcn)
- GPU = "SI";
-
ParseSubtargetFeatures(GPU, FullFS);
// FIXME: I don't think think Evergreen has any useful support for
@@ -61,52 +59,142 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
FP32Denormals = false;
FP64Denormals = false;
}
+
+ // Set defaults if needed.
+ if (MaxPrivateElementSize == 0)
+ MaxPrivateElementSize = 4;
+
return *this;
}
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
- DumpCode(false), R600ALUInst(false), HasVertexCache(false),
- TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
- FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
- CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
- EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
- EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
- EnableXNACK(false),
- WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
- EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
- GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
- IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
- EnableSIScheduler(false), FrameLowering(nullptr),
- InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
-
+ const TargetMachine &TM)
+ : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ TargetTriple(TT),
+ Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+ IsaVersion(ISAVersion0_0_0),
+ WavefrontSize(64),
+ LocalMemorySize(0),
+ LDSBankCount(0),
+ MaxPrivateElementSize(0),
+
+ FastFMAF32(false),
+ HalfRate64Ops(false),
+
+ FP32Denormals(false),
+ FP64Denormals(false),
+ FPExceptions(false),
+ FlatForGlobal(false),
+ UnalignedBufferAccess(false),
+
+ EnableXNACK(false),
+ DebuggerInsertNops(false),
+ DebuggerReserveRegs(false),
+ DebuggerEmitPrologue(false),
+
+ EnableVGPRSpilling(false),
+ EnablePromoteAlloca(false),
+ EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
+ EnableSIScheduler(false),
+ DumpCode(false),
+
+ FP64(false),
+ IsGCN(false),
+ GCN1Encoding(false),
+ GCN3Encoding(false),
+ CIInsts(false),
+ SGPRInitBug(false),
+ HasSMemRealTime(false),
+ Has16BitInsts(false),
+ FlatAddressSpace(false),
+
+ R600ALUInst(false),
+ CaymanISA(false),
+ CFALUBug(false),
+ HasVertexCache(false),
+ TexVTXClauseSize(0),
+
+ FeatureDisable(false),
+ InstrItins(getInstrItineraryForCPU(GPU)) {
initializeSubtargetDependencies(TT, GPU, FS);
+}
- const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)
-
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- InstrInfo.reset(new R600InstrInfo(*this));
- TLInfo.reset(new R600TargetLowering(TM, *this));
-
- // FIXME: Should have R600 specific FrameLowering
- FrameLowering.reset(new AMDGPUFrameLowering(
- TargetFrameLowering::StackGrowsUp,
- MaxStackAlign,
- 0));
- } else {
- InstrInfo.reset(new SIInstrInfo(*this));
- TLInfo.reset(new SITargetLowering(TM, *this));
- FrameLowering.reset(new SIFrameLowering(
- TargetFrameLowering::StackGrowsUp,
- MaxStackAlign,
- 0));
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+ switch (NWaves) {
+ case 10:
+ return 1638;
+ case 9:
+ return 1820;
+ case 8:
+ return 2048;
+ case 7:
+ return 2340;
+ case 6:
+ return 2730;
+ case 5:
+ return 3276;
+ case 4:
+ return 4096;
+ case 3:
+ return 5461;
+ case 2:
+ return 8192;
+ default:
+ return getLocalMemorySize();
}
}
-unsigned AMDGPUSubtarget::getStackEntrySize() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- switch(getWavefrontSize()) {
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+ if (Bytes <= 1638)
+ return 10;
+
+ if (Bytes <= 1820)
+ return 9;
+
+ if (Bytes <= 2048)
+ return 8;
+
+ if (Bytes <= 2340)
+ return 7;
+
+ if (Bytes <= 2730)
+ return 6;
+
+ if (Bytes <= 3276)
+ return 5;
+
+ if (Bytes <= 4096)
+ return 4;
+
+ if (Bytes <= 5461)
+ return 3;
+
+ if (Bytes <= 8192)
+ return 2;
+
+ return 1;
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {}
+
+SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM) :
+ AMDGPUSubtarget(TT, GPU, FS, TM),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this),
+ GISel() {}
+
+unsigned R600Subtarget::getStackEntrySize() const {
+ switch (getWavefrontSize()) {
case 16:
return 8;
case 32:
@@ -118,37 +206,36 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
}
}
-unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
- switch(getGeneration()) {
- default: llvm_unreachable("ChipID unknown");
- case SEA_ISLANDS: return 12;
- }
-}
-
-AMDGPU::IsaVersion AMDGPUSubtarget::getIsaVersion() const {
- return AMDGPU::getIsaVersion(getFeatureBits());
+void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const {
+ // Track register pressure so the scheduler can try to decrease
+ // pressure once register usage is above the threshold defined by
+ // SIRegisterInfo::getRegPressureSetLimit()
+ Policy.ShouldTrackPressure = true;
+
+ // Enabling both top down and bottom up scheduling seems to give us less
+ // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+
+ // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+ if (!enableSIScheduler())
+ Policy.ShouldTrackLaneMasks = true;
}
-bool AMDGPUSubtarget::isVGPRSpillingEnabled(
- const SIMachineFunctionInfo *MFI) const {
- return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+ return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin,
- MachineInstr *end,
- unsigned NumRegionInstrs) const {
- if (getGeneration() >= SOUTHERN_ISLANDS) {
-
- // Track register pressure so the scheduler can try to decrease
- // pressure once register usage is above the threshold defined by
- // SIRegisterInfo::getRegPressureSetLimit()
- Policy.ShouldTrackPressure = true;
-
- // Enabling both top down and bottom up scheduling seems to give us less
- // register spills than just using one of these approaches on its own.
- Policy.OnlyTopDown = false;
- Policy.OnlyBottomUp = false;
+unsigned SISubtarget::getAmdKernelCodeChipID() const {
+ switch (getGeneration()) {
+ case SEA_ISLANDS:
+ return 12;
+ default:
+ llvm_unreachable("ChipID unknown");
}
}
+AMDGPU::IsaVersion SISubtarget::getIsaVersion() const {
+ return AMDGPU::getIsaVersion(getFeatureBits());
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 49c94f1eceb8..3fe61aa449e0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,12 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#include "AMDGPU.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUISelLowering.h"
-#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "R600ISelLowering.h"
+#include "R600FrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIFrameLowering.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#define GET_SUBTARGETINFO_HEADER
@@ -30,9 +32,9 @@
namespace llvm {
class SIMachineFunctionInfo;
+class StringRef;
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
-
public:
enum Generation {
R600 = 0,
@@ -45,10 +47,6 @@ public:
};
enum {
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
- };
-
- enum {
ISAVersion0_0_0,
ISAVersion7_0_0,
ISAVersion7_0_1,
@@ -57,114 +55,116 @@ public:
ISAVersion8_0_3
};
-private:
- std::string DevName;
- bool Is64bit;
- bool DumpCode;
- bool R600ALUInst;
- bool HasVertexCache;
- short TexVTXClauseSize;
+protected:
+ // Basic subtarget description.
+ Triple TargetTriple;
Generation Gen;
- bool FP64;
- bool FP64Denormals;
- bool FP32Denormals;
+ unsigned IsaVersion;
+ unsigned WavefrontSize;
+ int LocalMemorySize;
+ int LDSBankCount;
+ unsigned MaxPrivateElementSize;
+
+ // Possibly statically set by tablegen, but may want to be overridden.
bool FastFMAF32;
- bool CaymanISA;
- bool FlatAddressSpace;
+ bool HalfRate64Ops;
+
+ // Dynamically set bits that enable features.
+ bool FP32Denormals;
+ bool FP64Denormals;
+ bool FPExceptions;
bool FlatForGlobal;
- bool EnableIRStructurizer;
+ bool UnalignedBufferAccess;
+ bool EnableXNACK;
+ bool DebuggerInsertNops;
+ bool DebuggerReserveRegs;
+ bool DebuggerEmitPrologue;
+
+ // Used as options.
+ bool EnableVGPRSpilling;
bool EnablePromoteAlloca;
- bool EnableIfCvt;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
- bool EnableXNACK;
- unsigned WavefrontSize;
- bool CFALUBug;
- int LocalMemorySize;
- bool EnableVGPRSpilling;
- bool SGPRInitBug;
+ bool EnableSIScheduler;
+ bool DumpCode;
+
+ // Subtarget properties statically set by tablegen
+ bool FP64;
bool IsGCN;
bool GCN1Encoding;
bool GCN3Encoding;
bool CIInsts;
+ bool SGPRInitBug;
+ bool HasSMemRealTime;
+ bool Has16BitInsts;
+ bool FlatAddressSpace;
+ bool R600ALUInst;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool HasVertexCache;
+ short TexVTXClauseSize;
+
+ // Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
- int LDSBankCount;
- unsigned IsaVersion;
- bool EnableHugeScratchBuffer;
- bool EnableSIScheduler;
- std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
- std::unique_ptr<AMDGPUTargetLowering> TLInfo;
- std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
- Triple TargetTriple;
public:
- AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- TargetMachine &TM);
+ AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM);
+ virtual ~AMDGPUSubtarget();
AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
- const AMDGPUFrameLowering *getFrameLowering() const override {
- return FrameLowering.get();
- }
- const AMDGPUInstrInfo *getInstrInfo() const override {
- return InstrInfo.get();
- }
- const AMDGPURegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
- }
- AMDGPUTargetLowering *getTargetLowering() const override {
- return TLInfo.get();
- }
+ const AMDGPUInstrInfo *getInstrInfo() const override;
+ const AMDGPUFrameLowering *getFrameLowering() const override;
+ const AMDGPUTargetLowering *getTargetLowering() const override;
+ const AMDGPURegisterInfo *getRegisterInfo() const override;
+
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool is64bit() const {
- return Is64bit;
- }
-
- bool hasVertexCache() const {
- return HasVertexCache;
- }
-
- short getTexVTXClauseSize() const {
- return TexVTXClauseSize;
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
}
Generation getGeneration() const {
return Gen;
}
- bool hasHWFP64() const {
- return FP64;
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
}
- bool hasCaymanISA() const {
- return CaymanISA;
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
}
- bool hasFP32Denormals() const {
- return FP32Denormals;
+ int getLDSBankCount() const {
+ return LDSBankCount;
}
- bool hasFP64Denormals() const {
- return FP64Denormals;
+ unsigned getMaxPrivateElementSize() const {
+ return MaxPrivateElementSize;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
}
bool hasFastFMAF32() const {
return FastFMAF32;
}
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
+ bool hasHalfRate64Ops() const {
+ return HalfRate64Ops;
}
- bool useFlatForGlobal() const {
- return FlatForGlobal;
+ bool hasAddr64() const {
+ return (getGeneration() < VOLCANIC_ISLANDS);
}
bool hasBFE() const {
@@ -214,116 +214,249 @@ public:
return (getGeneration() >= EVERGREEN);
}
- bool IsIRStructurizerEnabled() const {
- return EnableIRStructurizer;
+ bool hasCaymanISA() const {
+ return CaymanISA;
}
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
- bool isIfCvtEnabled() const {
- return EnableIfCvt;
+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
}
- bool loadStoreOptEnabled() const {
- return EnableLoadStoreOpt;
+ bool dumpCode() const {
+ return DumpCode;
}
- bool unsafeDSOffsetFoldingEnabled() const {
- return EnableUnsafeDSOffsetFolding;
+ /// Return the amount of LDS that can be used without restricting the
+ /// occupancy below WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+ /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count if
+ /// the given LDS memory size is the only constraint.
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+ bool hasFP32Denormals() const {
+ return FP32Denormals;
}
- unsigned getWavefrontSize() const {
- return WavefrontSize;
+ bool hasFP64Denormals() const {
+ return FP64Denormals;
}
- unsigned getStackEntrySize() const;
+ bool hasFPExceptions() const {
+ return FPExceptions;
+ }
- bool hasCFAluBug() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- return CFALUBug;
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
}
- int getLocalMemorySize() const {
- return LocalMemorySize;
+ bool hasUnalignedBufferAccess() const {
+ return UnalignedBufferAccess;
}
- bool hasSGPRInitBug() const {
- return SGPRInitBug;
+ bool isXNACKEnabled() const {
+ return EnableXNACK;
}
- int getLDSBankCount() const {
- return LDSBankCount;
+ unsigned getMaxWavesPerCU() const {
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 10;
+
+ // FIXME: Not sure what this is for other subtargets.
+ return 8;
}
- unsigned getAmdKernelCodeChipID() const;
+ /// \brief Returns the offset in bytes from the start of the input buffer
+ /// of the first explicit kernel argument.
+ unsigned getExplicitKernelArgOffset() const {
+ return isAmdHsaOS() ? 0 : 36;
+ }
- AMDGPU::IsaVersion getIsaVersion() const;
+ unsigned getStackAlignment() const {
+ // Scratch is allocated in 256 dword per wave blocks.
+ return 4 * 256 / getWavefrontSize();
+ }
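As a worked example (editorial illustration, not patch text): with the usual 64-lane GCN wavefront this evaluates to 4 * 256 / 64 = 16 bytes of stack alignment, and with a 32-lane wavefront it would be 32 bytes.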
bool enableMachineScheduler() const override {
return true;
}
- void overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin, MachineInstr *end,
- unsigned NumRegionInstrs) const override;
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+};
- // Helper functions to simplify if statements
- bool isTargetELF() const {
- return false;
+class R600Subtarget final : public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ R600TargetLowering TLInfo;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
}
- StringRef getDeviceName() const {
- return DevName;
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
}
- bool enableHugeScratchBuffer() const {
- return EnableHugeScratchBuffer;
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- bool enableSIScheduler() const {
- return EnableSIScheduler;
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
}
- bool dumpCode() const {
- return DumpCode;
+ bool hasCFAluBug() const {
+ return CFALUBug;
}
- bool r600ALUEncoding() const {
- return R600ALUInst;
+
+ bool hasVertexCache() const {
+ return HasVertexCache;
}
- bool isAmdHsaOS() const {
- return TargetTriple.getOS() == Triple::AMDHSA;
+
+ short getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
}
- bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
- bool isXNACKEnabled() const {
- return EnableXNACK;
+ unsigned getStackEntrySize() const;
+};
+
+class SISubtarget final : public AMDGPUSubtarget {
+public:
+ enum {
+ FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
+ };
+
+private:
+ SIInstrInfo InstrInfo;
+ SIFrameLowering FrameLowering;
+ SITargetLowering TLInfo;
+ std::unique_ptr<GISelAccessor> GISel;
+
+public:
+ SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
}
- unsigned getMaxWavesPerCU() const {
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 10;
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
- // FIXME: Not sure what this is for other subtagets.
- llvm_unreachable("do not know max waves per CU for this subtarget.");
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- bool enableSubRegLiveness() const override {
- return true;
+ const CallLowering *getCallLowering() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
}
- /// \brief Returns the offset in bytes from the start of the input buffer
- /// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset() const {
- return isAmdHsaOS() ? 0 : 36;
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ void setGISelAccessor(GISelAccessor &GISel) {
+ this->GISel.reset(&GISel);
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+
+ bool isVGPRSpillingEnabled(const Function& F) const;
+
+ unsigned getAmdKernelCodeChipID() const;
+
+ AMDGPU::IsaVersion getIsaVersion() const;
+
unsigned getMaxNumUserSGPRs() const {
return 16;
}
+
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
+ bool hasSMemRealTime() const {
+ return HasSMemRealTime;
+ }
+
+ bool has16BitInsts() const {
+ return Has16BitInsts;
+ }
+
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
+ }
+
+ bool debuggerSupported() const {
+ return debuggerInsertNops() && debuggerReserveRegs() &&
+ debuggerEmitPrologue();
+ }
+
+ bool debuggerInsertNops() const {
+ return DebuggerInsertNops;
+ }
+
+ bool debuggerReserveRegs() const {
+ return DebuggerReserveRegs;
+ }
+
+ bool debuggerEmitPrologue() const {
+ return DebuggerEmitPrologue;
+ }
+
+ bool loadStoreOptEnabled() const {
+ return EnableLoadStoreOpt;
+ }
+
+ bool hasSGPRInitBug() const {
+ return SGPRInitBug;
+ }
};
+
+inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getInstrInfo();
+
+ return static_cast<const R600Subtarget *>(this)->getInstrInfo();
+}
+
+inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getFrameLowering();
+
+ return static_cast<const R600Subtarget *>(this)->getFrameLowering();
+}
+
+inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getTargetLowering();
+
+ return static_cast<const R600Subtarget *>(this)->getTargetLowering();
+}
+
+inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const {
+ if (getGeneration() >= SOUTHERN_ISLANDS)
+ return static_cast<const SISubtarget *>(this)->getRegisterInfo();
+
+ return static_cast<const R600Subtarget *>(this)->getRegisterInfo();
+}
+
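These inline dispatchers let generation-agnostic code keep calling the base accessors, while target passes can fetch the derived subtarget directly; a hypothetical usage sketch inside a GCN machine pass (MF is an assumed MachineFunction reference, not from the patch):

  // Covariant return types on the derived class avoid any further casting.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
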
} // End namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 519ae5cc748d..3e53f52c689f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,19 +14,23 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
-#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
+
#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -34,10 +38,35 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include <llvm/CodeGen/Passes.h>
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
+static cl::opt<bool> EnableR600StructurizeCFG(
+ "r600-ir-structurize",
+ cl::desc("Use StructurizeCFG IR pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableSROA(
+ "amdgpu-sroa",
+ cl::desc("Run SROA after promote alloca pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+ "r600-if-convert",
+ cl::desc("Use if conversion pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+// Option to disable vectorizer for tests.
+static cl::opt<bool> EnableLoadStoreVectorizer(
+ "amdgpu-load-store-vectorizer",
+ cl::desc("Enable load store vectorizer"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
@@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
- initializeSIFixSGPRLiveRangesPass(*PR);
+ initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIDebuggerInsertNopsPass(*PR);
+ initializeSIInsertWaitsPass(*PR);
+ initializeSIWholeQuadModePass(*PR);
+ initializeSILowerControlFlowPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- if (TT.getOS() == Triple::AMDHSA)
- return make_unique<AMDGPUHSATargetObjectFile>();
-
return make_unique<AMDGPUTargetObjectFile>();
}
@@ -73,60 +107,156 @@ static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
-static std::string computeDataLayout(const Triple &TT) {
- std::string Ret = "e-p:32:32";
-
- if (TT.getArch() == Triple::amdgcn) {
- // 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+static StringRef computeDataLayout(const Triple &TT) {
+ if (TT.getArch() == Triple::r600) {
+ // 32-bit pointers.
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
- Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
- "-v512:512-v1024:1024-v2048:2048-n32:64";
+ // 32-bit private, local, and region pointers. 64-bit global, constant and
+ // flat.
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+ if (!GPU.empty())
+ return GPU;
- return Ret;
+ // HSA only supports CI+, so change the default GPU to a CI for HSA.
+ if (TT.getArch() == Triple::amdgcn)
+ return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+ return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ // The AMDGPU toolchain only supports generating shared objects, so we
+ // must always use PIC.
+ return Reloc::PIC_;
}
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
- TargetOptions Options, Reloc::Model RM,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
- OptLevel),
- TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
- IntrinsicInfo() {
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
+ TLOF(createTLOF(getTargetTriple())),
+ IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
+
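For illustration (attribute values assumed, not taken from the patch): a function carrying "target-cpu"="fiji" plus a "target-features" string is compiled against that GPU and feature set, while a function without the attributes falls back to the -mcpu/-mattr values stored in the TargetMachine.
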
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const R600Subtarget *R600TargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ return I.get();
+}
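The cache is keyed on the concatenation of the GPU name and the feature string, so functions that agree on both share one SISubtarget; a small illustrative sketch (TM, F1 and F2 are assumed names, not from the patch):

  // Functions with different "target-cpu" attributes resolve to distinct
  // cached subtargets; a later lookup with a matching key reuses the object.
  const SISubtarget *A = TM.getSubtargetImpl(F1); // e.g. "tahiti"
  const SISubtarget *B = TM.getSubtargetImpl(F2); // e.g. "fiji"
  assert(A != B && "different GPU+feature keys give different subtargets");
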
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
namespace {
+
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
@@ -142,16 +272,8 @@ public:
return getTM<AMDGPUTargetMachine>();
}
- ScheduleDAGInstrs *
- createMachineScheduler(MachineSchedContext *C) const override {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- return createR600MachineScheduler(C);
- else if (ST.enableSIScheduler())
- return createSIMachineScheduler(C);
- return nullptr;
- }
-
+ void addEarlyCSEOrGVNPass();
+ void addStraightLineScalarOptimizationPasses();
void addIRPasses() override;
void addCodeGenPrepare() override;
bool addPreISel() override;
@@ -159,27 +281,44 @@ public:
bool addGCPasses() override;
};
-class R600PassConfig : public AMDGPUPassConfig {
+class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+ ScheduleDAGInstrs *createMachineScheduler(
+ MachineSchedContext *C) const override {
+ return createR600MachineScheduler(C);
+ }
+
bool addPreISel() override;
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
-class GCNPassConfig : public AMDGPUPassConfig {
+class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+
+ GCNTargetMachine &getGCNTargetMachine() const {
+ return getTM<GCNTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
bool addPreISel() override;
+ void addMachineSSAOptimization() override;
bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addRegBankSelect() override;
+#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
- void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
@@ -188,12 +327,39 @@ public:
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(
- AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
});
}
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
+ // ReassociateGEPs exposes more opportunities for SLSR. See
+ // the example in reassociate-geps-and-slsr.ll.
+ addPass(createStraightLineStrengthReducePass());
+ // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+ // EarlyCSE can reuse.
+ addEarlyCSEOrGVNPass();
+ // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+ addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
+}
+
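As a rough illustration of why this ordering helps (editorial note, not patch text): addresses such as a[i], a[i+1] and a[i+2] are split by SeparateConstOffsetFromGEP into one variable base plus constant offsets, StraightLineStrengthReduce then rewrites the related index computations incrementally, and the surrounding EarlyCSE/GVN runs remove the duplicate base expressions both passes expose.
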
void AMDGPUPassConfig::addIRPasses() {
+ // There is no reason to run these.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerPass());
@@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createAMDGPUPromoteAlloca(&TM));
+
+ if (EnableSROA)
+ addPass(createSROAPass());
+ }
+
+ addStraightLineScalarOptimizationPasses();
+
TargetPassConfig::addIRPasses();
+
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.isPromoteAllocaEnabled()) {
- addPass(createAMDGPUPromoteAlloca(ST));
- addPass(createSROAPass());
- }
TargetPassConfig::addCodeGenPrepare();
+
+ if (EnableLoadStoreVectorizer)
+ addPass(createLoadStoreVectorizerPass());
}
-bool
-AMDGPUPassConfig::addPreISel() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+bool AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
- if (ST.IsIRStructurizerEnabled())
- addPass(createStructurizeCFGPass());
return false;
}
@@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() {
bool R600PassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- addPass(createR600TextureIntrinsicsReplacer());
+
+ if (EnableR600StructurizeCFG)
+ addPass(createStructurizeCFGPass());
return false;
}
@@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() {
}
void R600PassConfig::addPreSched2() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
addPass(createR600EmitClauseMarkers(), false);
- if (ST.isIfCvtEnabled())
+ if (EnableR600IfConvert)
addPass(&IfConverterID, false);
addPass(createR600ClauseMergePass(*TM), false);
}
@@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
// GCN Pass Setup
//===----------------------------------------------------------------------===//
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+ MachineSchedContext *C) const {
+ const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ if (ST.enableSIScheduler())
+ return createSIMachineScheduler(C);
+ return nullptr;
+}
+
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(&AMDGPUAnnotateKernelFeaturesID);
-
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
- addPass(createSIAnnotateControlFlowPass());
addPass(createAMDGPUAnnotateUniformValues());
+ addPass(createSIAnnotateControlFlowPass());
return false;
}
+void GCNPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+
+ // We want to fold operands after PeepholeOptimizer has run (or as part of
+ // it), because it will eliminate extra copies making it easier to fold the
+ // real source operand. We want to eliminate dead instructions after, so that
+ // we see fewer uses of the copies. We then need to clean up the dead
+ // instructions leftover after the operands are folded as well.
+ //
+ // XXX - Can we get away without running DeadMachineInstructionElim again?
+ addPass(&SIFoldOperandsID);
+ addPass(&DeadMachineInstructionElimID);
+}
+
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);
- addPass(createSIFoldOperandsPass());
return false;
}
-void GCNPassConfig::addPreRegAlloc() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+bool GCNPassConfig::addRegBankSelect() {
+ return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() {
// This needs to be run directly before register allocation because
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
@@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
- if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+ if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
// This should be run after scheduling, but before register allocation. It
// also needs extra copies to the address operand to be eliminated.
+
+ // FIXME: Move pre-RA and remove extra reg coalescer run.
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
- addPass(createSIShrinkInstructionsPass(), false);
+
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- addPass(&SIFixSGPRLiveRangesID);
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- // We want to run this after LiveVariables is computed to avoid computing them
- // twice.
- // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
- // that needs to be fixed.
- insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
-void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIShrinkInstructionsPass(), false);
-}
-
void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
- addPass(createSIInsertWaits(*TM), false);
- addPass(createSILowerControlFlowPass(*TM), false);
+ // The hazard recognizer that runs as part of the post-ra scheduler does not
+ // guarantee to be able to handle all hazards correctly. This is because if there
+ // are multiple scheduling regions in a basic block, the regions are scheduled
+ // bottom up, so when we begin to schedule a region we don't know what
+ // instructions were emitted directly before it.
+ //
+ // Here we add a stand-alone hazard recognizer pass which can handle all
+ // cases.
+ addPass(&PostRAHazardRecognizerID);
+
+ addPass(createSIInsertWaitsPass());
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSILowerControlFlowPass());
+ addPass(createSIDebuggerInsertNopsPass());
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 236e3f824030..b0eb3a9a15f7 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -12,15 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
-#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "llvm/IR/DataLayout.h"
namespace llvm {
@@ -29,23 +25,23 @@ namespace llvm {
//===----------------------------------------------------------------------===//
class AMDGPUTargetMachine : public LLVMTargetMachine {
-private:
-
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUSubtarget Subtarget;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ StringRef getGPUName(const Function &F) const;
+ StringRef getFeatureString(const Function &F) const;
+
public:
- AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
~AMDGPUTargetMachine();
- const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
- const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const AMDGPUSubtarget *getSubtargetImpl() const;
+ const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override;
+
const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
}
@@ -60,30 +56,47 @@ public:
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
-class R600TargetMachine : public AMDGPUTargetMachine {
+class R600TargetMachine final : public AMDGPUTargetMachine {
+private:
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
public:
- R600TargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const R600Subtarget *getSubtargetImpl(const Function &) const override;
};
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
-class GCNTargetMachine : public AMDGPUTargetMachine {
+class GCNTargetMachine final : public AMDGPUTargetMachine {
+private:
+ mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
public:
- GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS,
- StringRef CPU, TargetOptions Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const SISubtarget *getSubtargetImpl(const Function &) const override;
};
+inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ if (getTargetTriple().getArch() == Triple::amdgcn)
+ return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F);
+ return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F);
+}
+
} // End namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e050f21091ba..03d1e2c764de 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,59 +29,3 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
}
-
-//===----------------------------------------------------------------------===//
-// HSA Object File
-//===----------------------------------------------------------------------===//
-
-
-void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM){
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-
- TextSection = AMDGPU::getHSATextSection(Ctx);
-
- DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
- DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
-
- RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
- const char *SectionName) const {
- return cast<MCSectionELF>(DataGlobalAgentSection)
- ->getSectionName()
- .equals(SectionName);
-}
-
-bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
- // Read-only segments can only have agent allocation.
- return AMDGPU::isReadOnlySegment(GV) ||
- (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
- isAgentAllocationSection(GV->getSection()));
-}
-
-bool AMDGPUHSATargetObjectFile::isProgramAllocation(
- const GlobalValue *GV) const {
- // The default for global segments is program allocation.
- return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV);
-}
-
-MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
- const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const {
- if (Kind.isText() && !GV->hasComdat())
- return getTextSection();
-
- if (AMDGPU::isGlobalSegment(GV)) {
- if (isAgentAllocation(GV))
- return DataGlobalAgentSection;
-
- if (isProgramAllocation(GV))
- return DataGlobalProgramSection;
- }
-
- return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index 921341ebb897..f530e0952a74 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -28,24 +28,6 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
const TargetMachine &TM) const override;
};
-class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile {
-private:
- MCSection *DataGlobalAgentSection;
- MCSection *DataGlobalProgramSection;
- MCSection *RodataReadonlyAgentSection;
-
- bool isAgentAllocationSection(const char *SectionName) const;
- bool isAgentAllocation(const GlobalValue *GV) const;
- bool isProgramAllocation(const GlobalValue *GV) const;
-
-public:
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-
- MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const override;
-};
-
} // end namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 54a003d6a9cf..3d630fe3ea9d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
@@ -28,6 +29,7 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
+
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
@@ -78,11 +80,127 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
return Vector ? 0 : 32;
}
+unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) {
+ switch (AddrSpace) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ return 128;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ case AMDGPUAS::REGION_ADDRESS:
+ return 64;
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return 8 * ST->getMaxPrivateElementSize();
+ default:
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+ (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+ AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+ (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+ AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+ }
+}
+
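Reading the table above (editorial note, not patch text): the 128-bit limit for global, constant and flat memory corresponds to a single dword x4 access such as <4 x i32>, LDS and region memory stop at 64 bits, and private (scratch) accesses are capped by the subtarget's maximum private element size, e.g. 8 * 4 = 32 bits when that size is 4 bytes.
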
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return 64;
}
+int AMDGPUTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+
+ EVT OrigTy = TLI->getValueType(DL, Ty);
+ if (!OrigTy.isSimple()) {
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Because we don't have any legal vector operations, only legal vector types,
+ // we need to account for split vectors.
+ unsigned NElts = LT.second.isVector() ?
+ LT.second.getVectorNumElements() : 1;
+
+ MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+ switch (ISD) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ if (SLT == MVT::i64)
+ return get64BitInstrCost() * LT.first * NElts;
+
+ // i32
+ return getFullRateInstrCost() * LT.first * NElts;
+ }
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ if (SLT == MVT::i64) {
+ // and, or and xor are typically split into 2 VALU instructions.
+ return 2 * getFullRateInstrCost() * LT.first * NElts;
+ }
+
+ return LT.first * NElts * getFullRateInstrCost();
+ }
+ case ISD::MUL: {
+ const int QuarterRateCost = getQuarterRateInstrCost();
+ if (SLT == MVT::i64) {
+ const int FullRateCost = getFullRateInstrCost();
+ return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
+ }
+
+ // i32
+ return QuarterRateCost * NElts * LT.first;
+ }
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ if (SLT == MVT::f64)
+ return LT.first * NElts * get64BitInstrCost();
+
+ if (SLT == MVT::f32 || SLT == MVT::f16)
+ return LT.first * NElts * getFullRateInstrCost();
+ break;
+
+ case ISD::FDIV:
+ case ISD::FREM:
+ // FIXME: frem should be handled separately. The fdiv in it is most of it,
+ // but the current lowering is also not entirely correct.
+ if (SLT == MVT::f64) {
+ int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+
+ // Add cost of workaround.
+ if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ Cost += 3 * getFullRateInstrCost();
+
+ return LT.first * Cost * NElts;
+ }
+
+ // Assuming no fp32 denormals lowering.
+ if (SLT == MVT::f32 || SLT == MVT::f16) {
+ assert(!ST->hasFP32Denormals() && "will change when supported");
+ int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+ return LT.first * NElts * Cost;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+}
+
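A worked example of the MUL arm above, assuming TCC_Basic is 1 (so the quarter-rate cost is 3 and the full-rate cost is 1; editorial illustration, not patch text): a scalar i64 multiply with LT.first = 1 and NElts = 1 comes out at 4 * 3 + 4 * 1 = 16 units, and a <2 x i64> multiply doubles that to 32 through the split-vector factor.
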
unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
@@ -98,6 +216,11 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
default:
@@ -115,6 +238,9 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
// IntrinsicsAMDGPU.td
break;
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_mbcnt_hi:
@@ -122,6 +248,31 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::r600_read_tidig_x:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_image_atomic_swap:
+ case Intrinsic::amdgcn_image_atomic_add:
+ case Intrinsic::amdgcn_image_atomic_sub:
+ case Intrinsic::amdgcn_image_atomic_smin:
+ case Intrinsic::amdgcn_image_atomic_umin:
+ case Intrinsic::amdgcn_image_atomic_smax:
+ case Intrinsic::amdgcn_image_atomic_umax:
+ case Intrinsic::amdgcn_image_atomic_and:
+ case Intrinsic::amdgcn_image_atomic_or:
+ case Intrinsic::amdgcn_image_atomic_xor:
+ case Intrinsic::amdgcn_image_atomic_inc:
+ case Intrinsic::amdgcn_image_atomic_dec:
+ case Intrinsic::amdgcn_image_atomic_cmpswap:
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_ps_live:
return true;
}
@@ -129,18 +280,17 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
default:
return false;
- case AMDGPUIntrinsic::SI_tid:
case AMDGPUIntrinsic::SI_fs_interp:
+ case AMDGPUIntrinsic::SI_fs_constant:
return true;
}
}
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
- unsigned ShaderType = AMDGPU::getShaderType(*F);
// Arguments to compute shaders are never a source of divergence.
- if (ShaderType == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F->getCallingConv()))
return true;
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
@@ -169,6 +319,13 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ // Atomics are divergent because they are executed sequentially: when an
+ // atomic operation refers to the same address in each thread, then each
+ // thread after the first sees the value written by the previous thread as
+ // original value.
+ if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
+ return true;
+
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
const TargetMachine &TM = getTLI()->getTargetMachine();
return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 976afb03443b..a82a07458086 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -14,18 +14,18 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
namespace llvm {
+class AMDGPUTargetLowering;
-class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
+class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
typedef TargetTransformInfo TTI;
friend BaseT;
@@ -36,10 +36,33 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ static inline int getFullRateInstrCost() {
+ return TargetTransformInfo::TCC_Basic;
+ }
+
+ static inline int getHalfRateInstrCost() {
+ return 2 * TargetTransformInfo::TCC_Basic;
+ }
+
+ // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
+ // should be 2 or 4.
+ static inline int getQuarterRateInstrCost() {
+ return 3 * TargetTransformInfo::TCC_Basic;
+ }
+
+ // On some parts, normal fp64 operations are half rate, and on others
+ // quarter rate. This also applies to some integer operations.
+ inline int get64BitInstrCost() const {
+ return ST->hasHalfRate64Ops() ?
+ getHalfRateInstrCost() : getQuarterRateInstrCost();
+ }
+
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
- : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
- TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -54,17 +77,27 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software;
+ return TTI::PSK_FastHardware;
}
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace);
unsigned getMaxInterleaveFactor(unsigned VF);
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
unsigned getCFInstrCost(unsigned Opcode);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
+
+ unsigned getVectorSplitCost() { return 0; }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 917efd149e00..21de76396b16 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -50,8 +50,6 @@ STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
"matched");
STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
"matched");
-STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue "
- "pattern matched");
STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
@@ -162,7 +160,7 @@ public:
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
OrderedBlks.clear();
@@ -213,7 +211,6 @@ protected:
int getSCCNum(MachineBasicBlock *MBB) const;
MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
bool hasBackEdge(MachineBasicBlock *MBB) const;
- static unsigned getLoopDepth(MachineLoop *LoopRep);
bool isRetiredBlock(MachineBasicBlock *MBB) const;
bool isActiveLoophead(MachineBasicBlock *MBB) const;
PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
@@ -229,16 +226,15 @@ protected:
// Function originally from CFGStructTraits
void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
- DebugLoc DL = DebugLoc());
+ const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
- DebugLoc DL = DebugLoc());
+ const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
- DebugLoc DL);
+ const DebugLoc &DL);
void insertCondBranchBefore(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
- DebugLoc DL);
- void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+ MachineBasicBlock::iterator I, int NewOpcode,
+ int RegNum, const DebugLoc &DL);
static int getBranchNzeroOpcode(int OldOpcode);
static int getBranchZeroOpcode(int OldOpcode);
static int getContinueNzeroOpcode(int OldOpcode);
@@ -257,7 +253,6 @@ protected:
/// instruction. Such move instruction "belong to" the loop backward-edge.
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
- static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
static bool isReturnBlock(MachineBasicBlock *MBB);
static void cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) ;
@@ -276,11 +271,7 @@ protected:
int ifPatternMatch(MachineBasicBlock *MBB);
int loopendPatternMatch();
int mergeLoop(MachineLoop *LoopRep);
- int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
- void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
- MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
- MachineLoop *ContLoop);
/// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
/// the same loop with LoopLandInfo without explicitly keeping track of
/// loopContBlks and loopBreakBlks, this is a method to get the information.
@@ -337,13 +328,7 @@ protected:
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
- void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
- MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
- /// This is work around solution for findNearestCommonDominator not available
- /// to post dom a proper fix should go to Dominators.h.
- MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
- MachineBasicBlock *MBB2);
private:
MBBInfoMap BlockInfoMap;
@@ -376,10 +361,6 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
return MBB->isSuccessor(LoopHeader);
}
-unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
- return LoopRep ? LoopRep->getLoopDepth() : 0;
-}
-
bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
@@ -442,7 +423,8 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
void AMDGPUCFGStructurizer::reversePredicateSetter(
MachineBasicBlock::iterator I) {
- while (I--) {
+ assert(static_cast<MachineInstr *>(I) && "Expected valid iterator");
+ for (;; --I) {
if (I->getOpcode() == AMDGPU::PRED_X) {
switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
case OPCODE_IS_ZERO_INT:
@@ -469,16 +451,17 @@ void AMDGPUCFGStructurizer::reversePredicateSetter(
}
void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
- int NewOpcode, DebugLoc DL) {
- MachineInstr *MI = MBB->getParent()
- ->CreateMachineInstr(TII->get(NewOpcode), DL);
+ int NewOpcode, const DebugLoc &DL) {
+ MachineInstr *MI =
+ MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
MBB->push_back(MI);
//assume the instruction doesn't take any reg operand ...
SHOWNEWINSTR(MI);
}
MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
- int NewOpcode, DebugLoc DL) {
+ int NewOpcode,
+ const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
if (MBB->begin() != MBB->end())
@@ -502,7 +485,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
}
void AMDGPUCFGStructurizer::insertCondBranchBefore(
- MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+ MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
MachineFunction *MF = MBB->getParent();
@@ -514,9 +497,9 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(
//erase later oldInstr->eraseFromParent();
}
-void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
- MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
- DebugLoc DL) {
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+ MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
+ int RegNum, const DebugLoc &DL) {
MachineFunction *MF = blk->getParent();
MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
//insert before
@@ -525,16 +508,6 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
SHOWNEWINSTR(NewInstr);
}
-void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
- int NewOpcode, int RegNum) {
- MachineFunction *MF = MBB->getParent();
- MachineInstr *NewInstr =
- MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
- MBB->push_back(NewInstr);
- MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
- SHOWNEWINSTR(NewInstr);
-}
-
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
@@ -664,16 +637,6 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
return nullptr;
}
-MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
- MachineBasicBlock::reverse_iterator It = MBB->rbegin();
- if (It != MBB->rend()) {
- MachineInstr *MI = &(*It);
- if (MI->getOpcode() == AMDGPU::CONTINUE)
- return MI;
- }
- return nullptr;
-}
-
bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
MachineInstr *MI = getReturnInstr(MBB);
bool IsReturn = (MBB->succ_size() == 0);
@@ -697,11 +660,8 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
MachineFunction *Func = MBB->getParent();
MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
Func->push_back(NewMBB); //insert to function
- for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
- It != E; ++It) {
- MachineInstr *MI = Func->CloneMachineInstr(It);
- NewMBB->push_back(MI);
- }
+ for (const MachineInstr &It : *MBB)
+ NewMBB->push_back(Func->CloneMachineInstr(&It));
return NewMBB;
}
@@ -727,7 +687,7 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
while (It != E) {
if (Pre->getOpcode() == AMDGPU::CONTINUE
&& It->getOpcode() == AMDGPU::ENDLOOP)
- ContInstr.push_back(Pre);
+ ContInstr.push_back(&*Pre);
Pre = It;
++It;
}
@@ -923,7 +883,7 @@ bool AMDGPUCFGStructurizer::run() {
if (!Finish) {
DEBUG(FuncRep->viewCFG());
- llvm_unreachable("IRREDUCIBLE_CFG");
+ report_fatal_error("IRREDUCIBLE_CFG");
}
return true;
@@ -1145,34 +1105,6 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
return 1;
}
-int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
- MachineBasicBlock *LoopHeader) {
- int NumCont = 0;
- SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
- typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
- GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
- E = GTIM::child_end(LoopHeader);
- for (; It != E; ++It) {
- MachineBasicBlock *MBB = *It;
- if (LoopRep->contains(MBB)) {
- handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
- LoopHeader, LoopRep);
- ContMBB.push_back(MBB);
- ++NumCont;
- }
- }
-
- for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
- E = ContMBB.end(); It != E; ++It) {
- (*It)->removeSuccessor(LoopHeader, true);
- }
-
- numLoopcontPatternMatch += NumCont;
-
- return NumCont;
-}
-
-
bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
if (Src1MBB->succ_size() == 0) {
@@ -1413,10 +1345,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
if (LandBlkHasOtherPred) {
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- llvm_unreachable("Extra compare instruction needed to handle CFG");
+ report_fatal_error("Extra compare instruction needed to handle CFG");
insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
@@ -1433,7 +1365,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// need to unconditionally insert the assignment to ensure a path from its
// predecessor rather than headBlk has a valid value in initReg if
// (initVal != 1).
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
insertInstrBefore(I, AMDGPU::ELSE);
@@ -1442,7 +1374,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// need to unconditionally insert the assignment to ensure a path from its
// predecessor rather than headBlk has a valid value in initReg if
// (initVal != 0)
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
if (LandBlkHasOtherPred) {
@@ -1454,7 +1386,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
PE = LandBlk->pred_end(); PI != PE; ++PI) {
MachineBasicBlock *MBB = *PI;
if (MBB != TrueMBB && MBB != FalseMBB)
- llvm_unreachable("Extra register needed to handle CFG");
+ report_fatal_error("Extra register needed to handle CFG");
}
}
DEBUG(
@@ -1468,17 +1400,6 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
return NumNewBlk;
}
-void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
- MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
- MachineLoop *ContLoop) {
- DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
- << " header = BB" << ContMBB->getNumber() << "\n";
- dbgs() << "Trying to continue loop-depth = "
- << getLoopDepth(ContLoop)
- << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
- settleLoopcontBlock(ContingMBB, ContMBB);
-}
-
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
DEBUG(
@@ -1809,76 +1730,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
&& "can't retire block yet");
}
-void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
- MachineBasicBlock *MBB) {
- MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
- if (!MBB) {
- MBB = FuncRep->CreateMachineBasicBlock();
- FuncRep->push_back(MBB); //insert to function
- SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
- }
- TheEntry = MBB;
- DEBUG(
- dbgs() << "setLoopLandBlock loop-header = BB"
- << loopRep->getHeader()->getNumber()
- << " landing-block = BB" << MBB->getNumber() << "\n";
- );
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
- MachineBasicBlock *MBB2) {
-
- if (PDT->dominates(MBB1, MBB2))
- return MBB1;
- if (PDT->dominates(MBB2, MBB1))
- return MBB2;
-
- MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
- MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
-
- // Handle newly cloned node.
- if (!Node1 && MBB1->succ_size() == 1)
- return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
- if (!Node2 && MBB2->succ_size() == 1)
- return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
-
- if (!Node1 || !Node2)
- return nullptr;
-
- Node1 = Node1->getIDom();
- while (Node1) {
- if (PDT->dominates(Node1, Node2))
- return Node1->getBlock();
- Node1 = Node1->getIDom();
- }
-
- return nullptr;
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(
- std::set<MachineBasicBlock *> &MBBs) {
- MachineBasicBlock *CommonDom;
- std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
- std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
- for (CommonDom = *It; It != E && CommonDom; ++It) {
- MachineBasicBlock *MBB = *It;
- if (MBB != CommonDom)
- CommonDom = findNearestCommonPostDom(MBB, CommonDom);
- }
-
- DEBUG(
- dbgs() << "Common post dominator for exit blocks is ";
- if (CommonDom)
- dbgs() << "BB" << CommonDom->getNumber() << "\n";
- else
- dbgs() << "NULL\n";
- );
-
- return CommonDom;
-}
-
char AMDGPUCFGStructurizer::ID = 0;
} // end anonymous namespace
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index a9ba60c8cbad..5d243e949fd3 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -44,6 +44,15 @@ enum amd_code_version_t {
AMD_CODE_VERSION_MINOR = 1
};
+// Sets the bits selected by the specified mask in the packed field dst to val.
+#define AMD_HSA_BITS_SET(dst, mask, val) \
+ dst &= (~(1 << mask ## _SHIFT) & ~mask); \
+ dst |= (((val) << mask ## _SHIFT) & mask)
+
+// Gets the bits selected by the specified mask from the packed field src.
+#define AMD_HSA_BITS_GET(src, mask) \
+ ((src & mask) >> mask ## _SHIFT) \
+
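Not part of the patch: a standalone sketch of the get/set pattern these helpers implement. The two macros are copied from above so the snippet compiles on its own; EXAMPLE_FIELD is a hypothetical mask, the real ones are the amd_code_property_mask_t values defined later in this header.

#include <cstdint>
#include <cstdio>

#define EXAMPLE_FIELD_SHIFT 17
#define EXAMPLE_FIELD_WIDTH 2
#define EXAMPLE_FIELD (((1 << EXAMPLE_FIELD_WIDTH) - 1) << EXAMPLE_FIELD_SHIFT)

#define AMD_HSA_BITS_SET(dst, mask, val) \
  dst &= (~(1 << mask ## _SHIFT) & ~mask); \
  dst |= (((val) << mask ## _SHIFT) & mask)

#define AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask ## _SHIFT)

int main() {
  uint32_t code_properties = 0;
  AMD_HSA_BITS_SET(code_properties, EXAMPLE_FIELD, 2); // write a 2-bit field value
  std::printf("field = %u\n", AMD_HSA_BITS_GET(code_properties, EXAMPLE_FIELD)); // prints "field = 2"
  return 0;
}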
/// The values used to define the number of bytes to use for the
/// swizzle element size.
enum amd_element_byte_size_t {
@@ -118,10 +127,14 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+ AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
+ AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+ AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
+
/// Control wave ID base counter for GDS ordered-append. Used to set
/// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
/// ORDERED_APPEND_MODE also needs to be settable)
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
@@ -146,7 +159,7 @@ enum amd_code_property_mask_t {
/// is generally DWORD.
///
/// Use values from the amd_element_byte_size_t enum.
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
@@ -155,7 +168,7 @@ enum amd_code_property_mask_t {
/// HSA_MACHINE_LARGE. Must also match
/// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
/// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
- AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
+ AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
@@ -167,18 +180,22 @@ enum amd_code_property_mask_t {
/// workitem_private_segment_byte_size only specifies the statically
/// know private segment size, and additional space must be added
/// for the call stack.
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
/// Indicate if code generated has support for debugging.
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT
+ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
+ AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
+ AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index d9f753f40133..efcf1b23adaa 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,15 +7,17 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "AMDKernelCodeT.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
@@ -25,16 +27,17 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -42,6 +45,8 @@ namespace {
struct OptionalOperand;
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+
class AMDGPUOperand : public MCParsedAsmOperand {
enum KindTy {
Token,
@@ -55,19 +60,74 @@ class AMDGPUOperand : public MCParsedAsmOperand {
public:
AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
- MCContext *Ctx;
+ typedef std::unique_ptr<AMDGPUOperand> Ptr;
+
+ struct Modifiers {
+ bool Abs;
+ bool Neg;
+ bool Sext;
+
+ bool hasFPModifiers() const { return Abs || Neg; }
+ bool hasIntModifiers() const { return Sext; }
+ bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); }
+
+ int64_t getFPModifiersOperand() const {
+ int64_t Operand = 0;
+ Operand |= Abs ? SISrcMods::ABS : 0;
+ Operand |= Neg ? SISrcMods::NEG : 0;
+ return Operand;
+ }
+
+ int64_t getIntModifiersOperand() const {
+ int64_t Operand = 0;
+ Operand |= Sext ? SISrcMods::SEXT : 0;
+ return Operand;
+ }
+
+ int64_t getModifiersOperand() const {
+ assert(!(hasFPModifiers() && hasIntModifiers())
+ && "fp and int modifiers should not be used simultaneously");
+ if (hasFPModifiers()) {
+ return getFPModifiersOperand();
+ } else if (hasIntModifiers()) {
+ return getIntModifiersOperand();
+ } else {
+ return 0;
+ }
+ }
+
+ friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods);
+ };
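A minimal sketch of how these flags fold into the single source-modifier operand; the MOD_* bit values are placeholders standing in for the SISrcMods constants used above, so only the folding logic is meant to be illustrative.

#include <cassert>
#include <cstdint>

enum : int64_t { MOD_NEG = 1, MOD_ABS = 2, MOD_SEXT = 4 }; // placeholders, not the real SISrcMods bits

struct ModifiersSketch {
  bool Abs = false, Neg = false, Sext = false;
  int64_t getModifiersOperand() const {
    assert(!((Abs || Neg) && Sext) && "fp and int modifiers should not be used simultaneously");
    if (Abs || Neg)
      return (Abs ? MOD_ABS : 0) | (Neg ? MOD_NEG : 0);
    return Sext ? MOD_SEXT : 0;
  }
};

int main() {
  ModifiersSketch M;
  M.Abs = M.Neg = true; // e.g. an operand written as -|v0|
  assert(M.getModifiersOperand() == (MOD_ABS | MOD_NEG));
  return 0;
}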
enum ImmTy {
ImmTyNone,
- ImmTyDSOffset0,
- ImmTyDSOffset1,
ImmTyGDS,
+ ImmTyOffen,
+ ImmTyIdxen,
+ ImmTyAddr64,
ImmTyOffset,
+ ImmTyOffset0,
+ ImmTyOffset1,
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
- ImmTyClamp,
- ImmTyOMod
+ ImmTyClampSI,
+ ImmTyOModSI,
+ ImmTyDppCtrl,
+ ImmTyDppRowMask,
+ ImmTyDppBankMask,
+ ImmTyDppBoundCtrl,
+ ImmTySdwaDstSel,
+ ImmTySdwaSrc0Sel,
+ ImmTySdwaSrc1Sel,
+ ImmTySdwaDstUnused,
+ ImmTyDMask,
+ ImmTyUNorm,
+ ImmTyDA,
+ ImmTyR128,
+ ImmTyLWE,
+ ImmTyHwreg,
+ ImmTySendMsg,
};
struct TokOp {
@@ -79,11 +139,12 @@ public:
bool IsFPImm;
ImmTy Type;
int64_t Val;
+ Modifiers Mods;
};
struct RegOp {
unsigned RegNo;
- int Modifiers;
+ Modifiers Mods;
const MCRegisterInfo *TRI;
const MCSubtargetInfo *STI;
bool IsForcedVOP3;
@@ -96,175 +157,323 @@ public:
const MCExpr *Expr;
};
- void addImmOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createImm(getImm()));
+ bool isToken() const override {
+ if (Kind == Token)
+ return true;
+
+ if (Kind != Expression || !Expr)
+ return false;
+
+ // When parsing operands, we can't always tell if something was meant to be
+ // a token, like 'gds', or an expression that references a global variable.
+ // In this case, we assume the string is an expression, and if we need to
+ // interpret it as a token, then we treat the symbol name as the token.
+ return isa<MCSymbolRefExpr>(Expr);
}
- StringRef getToken() const {
- return StringRef(Tok.Data, Tok.Length);
+ bool isImm() const override {
+ return Kind == Immediate;
}
- void addRegOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
+ bool isInlinableImm() const {
+ if (!isImmTy(ImmTyNone)) {
+ // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
+ return false;
+ }
+ // TODO: We should avoid using host float here. It would be better to
+ // check the float bit values, which is what a few other places do.
+ // We've had bot failures before due to weird NaN support on MIPS hosts.
+ const float F = BitsToFloat(Imm.Val);
+ // TODO: Add 1/(2*pi) for VI
+ return (Imm.Val <= 64 && Imm.Val >= -16) ||
+ (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 ||
+ F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0);
}
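Not part of the patch: a minimal sketch of the bit-pattern check the TODO above alludes to, using the IEEE-754 single-precision encodings of the inlinable FP constants; the -16..64 integer range test would stay as written in the code above.

#include <cstdint>

// True if Bits encodes one of the inlinable FP constants
// (+/-0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0) in single precision.
static bool isInlinableFPBits(uint32_t Bits) {
  switch (Bits & 0x7fffffffu) { // drop the sign bit; both signs are inlinable
  case 0x00000000u: // 0.0
  case 0x3f000000u: // 0.5
  case 0x3f800000u: // 1.0
  case 0x40000000u: // 2.0
  case 0x40800000u: // 4.0
    return true;
  default:
    return false;
  }
}

int main() { return isInlinableFPBits(0xbf800000u /* -1.0f */) ? 0 : 1; }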
- void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
- if (isReg())
- addRegOperands(Inst, N);
- else
- addImmOperands(Inst, N);
+ bool isRegKind() const {
+ return Kind == Register;
}
- void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createImm(
- Reg.Modifiers == -1 ? 0 : Reg.Modifiers));
- addRegOperands(Inst, N);
+ bool isReg() const override {
+ return isRegKind() && !Reg.Mods.hasModifiers();
+ }
+
+ bool isRegOrImmWithInputMods() const {
+ return isRegKind() || isInlinableImm();
+ }
+
+ bool isImmTy(ImmTy ImmT) const {
+ return isImm() && Imm.Type == ImmT;
+ }
+
+ bool isImmModifier() const {
+ return isImm() && Imm.Type != ImmTyNone;
+ }
+
+ bool isClampSI() const { return isImmTy(ImmTyClampSI); }
+ bool isOModSI() const { return isImmTy(ImmTyOModSI); }
+ bool isDMask() const { return isImmTy(ImmTyDMask); }
+ bool isUNorm() const { return isImmTy(ImmTyUNorm); }
+ bool isDA() const { return isImmTy(ImmTyDA); }
+ bool isR128() const { return isImmTy(ImmTyR128); }
+ bool isLWE() const { return isImmTy(ImmTyLWE); }
+ bool isOffen() const { return isImmTy(ImmTyOffen); }
+ bool isIdxen() const { return isImmTy(ImmTyIdxen); }
+ bool isAddr64() const { return isImmTy(ImmTyAddr64); }
+ bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
+ bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+ bool isGDS() const { return isImmTy(ImmTyGDS); }
+ bool isGLC() const { return isImmTy(ImmTyGLC); }
+ bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
+ bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
+ bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+ bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
+ bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
+ bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
+ bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
+
+ bool isMod() const {
+ return isClampSI() || isOModSI();
}
- void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
- if (isImm())
- addImmOperands(Inst, N);
- else {
- assert(isExpr());
- Inst.addOperand(MCOperand::createExpr(Expr));
- }
+ bool isRegOrImm() const {
+ return isReg() || isImm();
}
- bool defaultTokenHasSuffix() const {
- StringRef Token(Tok.Data, Tok.Length);
+ bool isRegClass(unsigned RCID) const {
+ return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg());
+ }
- return Token.endswith("_e32") || Token.endswith("_e64");
+ bool isSCSrc32() const {
+ return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID);
}
- bool isToken() const override {
- return Kind == Token;
+ bool isSCSrc64() const {
+ return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID);
}
- bool isImm() const override {
- return Kind == Immediate;
+ bool isSSrc32() const {
+ return isImm() || isSCSrc32() || isExpr();
}
- bool isInlineImm() const {
- float F = BitsToFloat(Imm.Val);
- // TODO: Add 0.5pi for VI
- return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) ||
- (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 ||
- F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0));
+ bool isSSrc64() const {
+ // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
+ // See isVSrc64().
+ return isImm() || isSCSrc64();
}
- bool isDSOffset0() const {
- assert(isImm());
- return Imm.Type == ImmTyDSOffset0;
+ bool isVCSrc32() const {
+ return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID);
}
- bool isDSOffset1() const {
- assert(isImm());
- return Imm.Type == ImmTyDSOffset1;
+ bool isVCSrc64() const {
+ return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID);
}
- int64_t getImm() const {
- return Imm.Val;
+ bool isVSrc32() const {
+ return isImm() || isVCSrc32();
}
- enum ImmTy getImmTy() const {
- assert(isImm());
- return Imm.Type;
+ bool isVSrc64() const {
+ // TODO: Check if the 64-bit value (coming from assembly source) can be
+ // narrowed to 32 bits (in the instruction stream). That requires knowledge
+ // of instruction type (unsigned/signed, floating or "untyped"/B64),
+ // see [AMD GCN3 ISA 6.3.1].
+ // TODO: How are 64-bit values formed from 32-bit literals in _B64 insns?
+ return isImm() || isVCSrc64();
}
- bool isRegKind() const {
- return Kind == Register;
+ bool isMem() const override {
+ return false;
}
- bool isReg() const override {
- return Kind == Register && Reg.Modifiers == -1;
+ bool isExpr() const {
+ return Kind == Expression;
}
- bool isRegWithInputMods() const {
- return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1);
+ bool isSoppBrTarget() const {
+ return isExpr() || isImm();
}
- void setModifiers(unsigned Mods) {
- assert(isReg());
- Reg.Modifiers = Mods;
+ bool isSWaitCnt() const;
+ bool isHwreg() const;
+ bool isSendMsg() const;
+ bool isSMRDOffset() const;
+ bool isSMRDLiteralOffset() const;
+ bool isDPPCtrl() const;
+
+ StringRef getExpressionAsToken() const {
+ assert(isExpr());
+ const MCSymbolRefExpr *S = cast<MCSymbolRefExpr>(Expr);
+ return S->getSymbol().getName();
}
- bool hasModifiers() const {
- assert(isRegKind());
- return Reg.Modifiers != -1;
+
+ StringRef getToken() const {
+ assert(isToken());
+
+ if (Kind == Expression)
+ return getExpressionAsToken();
+
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ int64_t getImm() const {
+ assert(isImm());
+ return Imm.Val;
+ }
+
+ enum ImmTy getImmTy() const {
+ assert(isImm());
+ return Imm.Type;
}
unsigned getReg() const override {
return Reg.RegNo;
}
- bool isRegOrImm() const {
- return isReg() || isImm();
+ SMLoc getStartLoc() const override {
+ return StartLoc;
}
- bool isRegClass(unsigned RCID) const {
- return Reg.TRI->getRegClass(RCID).contains(getReg());
+ SMLoc getEndLoc() const override {
+ return EndLoc;
}
- bool isSCSrc32() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID));
+ Modifiers getModifiers() const {
+ assert(isRegKind() || isImmTy(ImmTyNone));
+ return isRegKind() ? Reg.Mods : Imm.Mods;
}
- bool isSSrc32() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID));
+ void setModifiers(Modifiers Mods) {
+ assert(isRegKind() || isImmTy(ImmTyNone));
+ if (isRegKind())
+ Reg.Mods = Mods;
+ else
+ Imm.Mods = Mods;
}
- bool isSSrc64() const {
- return isImm() || isInlineImm() ||
- (isReg() && isRegClass(AMDGPU::SReg_64RegClassID));
+ bool hasModifiers() const {
+ return getModifiers().hasModifiers();
}
-
- bool isSCSrc64() const {
- return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm();
+
+ bool hasFPModifiers() const {
+ return getModifiers().hasFPModifiers();
}
- bool isVCSrc32() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
+ bool hasIntModifiers() const {
+ return getModifiers().hasIntModifiers();
}
- bool isVCSrc64() const {
- return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID));
+ void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const {
+ if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) {
+ // Apply modifiers to immediate value
+ int64_t Val = Imm.Val;
+ bool Negate = Imm.Mods.Neg; // Only negate can get here
+ if (Imm.IsFPImm) {
+ APFloat F(BitsToFloat(Val));
+ if (Negate) {
+ F.changeSign();
+ }
+ Val = F.bitcastToAPInt().getZExtValue();
+ } else {
+ Val = Negate ? -Val : Val;
+ }
+ Inst.addOperand(MCOperand::createImm(Val));
+ } else {
+ Inst.addOperand(MCOperand::createImm(getImm()));
+ }
}
- bool isVSrc32() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
}
- bool isVSrc64() const {
- return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID));
+ void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
+ if (isRegKind())
+ addRegOperands(Inst, N);
+ else if (isExpr())
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ else
+ addImmOperands(Inst, N);
}
- bool isMem() const override {
- return false;
+ void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const {
+ Modifiers Mods = getModifiers();
+ Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+ if (isRegKind()) {
+ addRegOperands(Inst, N);
+ } else {
+ addImmOperands(Inst, N, false);
+ }
}
- bool isExpr() const {
- return Kind == Expression;
+ void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
}
- bool isSoppBrTarget() const {
- return isExpr() || isImm();
+ void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegOrImmWithInputModsOperands(Inst, N);
}
- SMLoc getStartLoc() const override {
- return StartLoc;
+ void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
+ if (isImm())
+ addImmOperands(Inst, N);
+ else {
+ assert(isExpr());
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
}
- SMLoc getEndLoc() const override {
- return EndLoc;
+ void printImmTy(raw_ostream& OS, ImmTy Type) const {
+ switch (Type) {
+ case ImmTyNone: OS << "None"; break;
+ case ImmTyGDS: OS << "GDS"; break;
+ case ImmTyOffen: OS << "Offen"; break;
+ case ImmTyIdxen: OS << "Idxen"; break;
+ case ImmTyAddr64: OS << "Addr64"; break;
+ case ImmTyOffset: OS << "Offset"; break;
+ case ImmTyOffset0: OS << "Offset0"; break;
+ case ImmTyOffset1: OS << "Offset1"; break;
+ case ImmTyGLC: OS << "GLC"; break;
+ case ImmTySLC: OS << "SLC"; break;
+ case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyClampSI: OS << "ClampSI"; break;
+ case ImmTyOModSI: OS << "OModSI"; break;
+ case ImmTyDppCtrl: OS << "DppCtrl"; break;
+ case ImmTyDppRowMask: OS << "DppRowMask"; break;
+ case ImmTyDppBankMask: OS << "DppBankMask"; break;
+ case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+ case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
+ case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
+ case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
+ case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
+ case ImmTyDMask: OS << "DMask"; break;
+ case ImmTyUNorm: OS << "UNorm"; break;
+ case ImmTyDA: OS << "DA"; break;
+ case ImmTyR128: OS << "R128"; break;
+ case ImmTyLWE: OS << "LWE"; break;
+ case ImmTyHwreg: OS << "Hwreg"; break;
+ case ImmTySendMsg: OS << "SendMsg"; break;
+ }
}
void print(raw_ostream &OS) const override {
switch (Kind) {
case Register:
- OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>';
+ OS << "<register " << getReg() << " mods: " << Reg.Mods << '>';
break;
case Immediate:
- OS << getImm();
+ OS << '<' << getImm();
+ if (getImmTy() != ImmTyNone) {
+ OS << " type: "; printImmTy(OS, getImmTy());
+ }
+ OS << " mods: " << Imm.Mods << '>';
break;
case Token:
OS << '\'' << getToken() << '\'';
@@ -275,20 +484,21 @@ public:
}
}
- static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc,
- enum ImmTy Type = ImmTyNone,
- bool IsFPImm = false) {
+ static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc,
+ enum ImmTy Type = ImmTyNone,
+ bool IsFPImm = false) {
auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
Op->Imm.Type = Type;
+ Op->Imm.Mods = {false, false, false};
Op->StartLoc = Loc;
Op->EndLoc = Loc;
return Op;
}
- static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc,
- bool HasExplicitEncodingSize = true) {
+ static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc,
+ bool HasExplicitEncodingSize = true) {
auto Res = llvm::make_unique<AMDGPUOperand>(Token);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
@@ -297,43 +507,43 @@ public:
return Res;
}
- static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S,
- SMLoc E,
- const MCRegisterInfo *TRI,
- const MCSubtargetInfo *STI,
- bool ForceVOP3) {
+ static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S,
+ SMLoc E,
+ const MCRegisterInfo *TRI,
+ const MCSubtargetInfo *STI,
+ bool ForceVOP3) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register);
Op->Reg.RegNo = RegNo;
Op->Reg.TRI = TRI;
Op->Reg.STI = STI;
- Op->Reg.Modifiers = -1;
+ Op->Reg.Mods = {false, false, false};
Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) {
+ static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) {
auto Op = llvm::make_unique<AMDGPUOperand>(Expression);
Op->Expr = Expr;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
-
- bool isDSOffset() const;
- bool isDSOffset01() const;
- bool isSWaitCnt() const;
- bool isMubufOffset() const;
- bool isSMRDOffset() const;
- bool isSMRDLiteralOffset() const;
};
+raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
+ OS << "abs:" << Mods.Abs << " neg: " << Mods.Neg << " sext:" << Mods.Sext;
+ return OS;
+}
+
class AMDGPUAsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
MCAsmParser &Parser;
unsigned ForcedEncodingSize;
+ bool ForcedDPP;
+ bool ForcedSDWA;
bool isSI() const {
return AMDGPU::isSI(getSTI());
@@ -373,9 +583,11 @@ private:
bool ParseSectionDirectiveHSADataGlobalAgent();
bool ParseSectionDirectiveHSADataGlobalProgram();
bool ParseSectionDirectiveHSARodataReadonlyAgent();
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth);
+ void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
public:
-public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
@@ -384,7 +596,9 @@ public:
const MCInstrInfo &MII,
const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0) {
+ ForcedEncodingSize(0),
+ ForcedDPP(false),
+ ForcedSDWA(false) {
MCAsmParserExtension::Initialize(Parser);
if (getSTI().getFeatureBits().none()) {
@@ -393,6 +607,21 @@ public:
}
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ {
+ // TODO: make those pre-defined variables read-only.
+ // Currently there is no suitable machinery in core llvm-mc for this.
+ // MCSymbol::isRedefinable is intended for another purpose, and
+ // AsmParser::parseDirectiveSet() cannot be specialized for a specific target.
+ AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+ }
}
AMDGPUTargetStreamer &getTargetStreamer() {
@@ -400,84 +629,117 @@ public:
return static_cast<AMDGPUTargetStreamer &>(TS);
}
- unsigned getForcedEncodingSize() const {
- return ForcedEncodingSize;
- }
-
- void setForcedEncodingSize(unsigned Size) {
- ForcedEncodingSize = Size;
- }
+ void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
+ void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
+ void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
- bool isForcedVOP3() const {
- return ForcedEncodingSize == 64;
- }
+ unsigned getForcedEncodingSize() const { return ForcedEncodingSize; }
+ bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
+ bool isForcedDPP() const { return ForcedDPP; }
+ bool isForcedSDWA() const { return ForcedSDWA; }
+ std::unique_ptr<AMDGPUOperand> parseRegister();
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
- OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int,
- int64_t Default = 0);
+ OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
OperandMatchResultTy parseIntWithPrefix(const char *Prefix,
OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy =
- AMDGPUOperand::ImmTyNone);
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = 0);
OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy =
- AMDGPUOperand::ImmTyNone);
- OperandMatchResultTy parseOptionalOps(
- const ArrayRef<OptionalOperand> &OptionalOps,
- OperandVector &Operands);
+ enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value);
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
- OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ OperandMatchResultTy parseHwreg(OperandVector &Operands);
- OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands);
- void cvtFlat(MCInst &Inst, const OperandVector &Operands);
+private:
+ struct OperandInfoTy {
+ int64_t Id;
+ bool IsSymbolic;
+ OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { }
+ };
- void cvtMubuf(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseOffset(OperandVector &Operands);
- OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands);
- OperandMatchResultTy parseGLC(OperandVector &Operands);
- OperandMatchResultTy parseSLC(OperandVector &Operands);
- OperandMatchResultTy parseTFE(OperandVector &Operands);
+ bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
+ bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+public:
+ OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
- OperandMatchResultTy parseDMask(OperandVector &Operands);
- OperandMatchResultTy parseUNorm(OperandVector &Operands);
- OperandMatchResultTy parseR128(OperandVector &Operands);
+ OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
+ OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
+ void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
+ void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ AMDGPUOperand::Ptr defaultGLC() const;
+ AMDGPUOperand::Ptr defaultSLC() const;
+ AMDGPUOperand::Ptr defaultTFE() const;
+
+ AMDGPUOperand::Ptr defaultDMask() const;
+ AMDGPUOperand::Ptr defaultUNorm() const;
+ AMDGPUOperand::Ptr defaultDA() const;
+ AMDGPUOperand::Ptr defaultR128() const;
+ AMDGPUOperand::Ptr defaultLWE() const;
+ AMDGPUOperand::Ptr defaultSMRDOffset() const;
+ AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+
+ OperandMatchResultTy parseOModOperand(OperandVector &Operands);
+
+ void cvtId(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
- OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands);
+
+ void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+ void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultRowMask() const;
+ AMDGPUOperand::Ptr defaultBankMask() const;
+ AMDGPUOperand::Ptr defaultBoundCtrl() const;
+ void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+
+ OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type);
+ OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
+ void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
+ void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType);
};
struct OptionalOperand {
const char *Name;
AMDGPUOperand::ImmTy Type;
bool IsBit;
- int64_t Default;
bool (*ConvertResult)(int64_t&);
};
}
-static int getRegClass(bool IsVgpr, unsigned RegWidth) {
- if (IsVgpr) {
+static int getRegClass(RegisterKind Is, unsigned RegWidth) {
+ if (Is == IS_VGPR) {
switch (RegWidth) {
default: return -1;
case 1: return AMDGPU::VGPR_32RegClassID;
@@ -487,109 +749,379 @@ static int getRegClass(bool IsVgpr, unsigned RegWidth) {
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
}
+ } else if (Is == IS_TTMP) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::TTMP_32RegClassID;
+ case 2: return AMDGPU::TTMP_64RegClassID;
+ case 4: return AMDGPU::TTMP_128RegClassID;
+ }
+ } else if (Is == IS_SGPR) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::SGPR_32RegClassID;
+ case 2: return AMDGPU::SGPR_64RegClassID;
+ case 4: return AMDGPU::SGPR_128RegClassID;
+ case 8: return AMDGPU::SReg_256RegClassID;
+ case 16: return AMDGPU::SReg_512RegClassID;
+ }
}
-
- switch (RegWidth) {
- default: return -1;
- case 1: return AMDGPU::SGPR_32RegClassID;
- case 2: return AMDGPU::SGPR_64RegClassID;
- case 4: return AMDGPU::SReg_128RegClassID;
- case 8: return AMDGPU::SReg_256RegClassID;
- case 16: return AMDGPU::SReg_512RegClassID;
- }
+ return -1;
}
-static unsigned getRegForName(StringRef RegName) {
-
+static unsigned getSpecialRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
+ .Case("tba", AMDGPU::TBA)
+ .Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
.Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("tma_lo", AMDGPU::TMA_LO)
+ .Case("tma_hi", AMDGPU::TMA_HI)
+ .Case("tba_lo", AMDGPU::TBA_LO)
+ .Case("tba_hi", AMDGPU::TBA_HI)
.Default(0);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
- const AsmToken Tok = Parser.getTok();
- StartLoc = Tok.getLoc();
- EndLoc = Tok.getEndLoc();
+ auto R = parseRegister();
+ if (!R) return true;
+ assert(R->isReg());
+ RegNo = R->getReg();
+ StartLoc = R->getStartLoc();
+ EndLoc = R->getEndLoc();
+ return false;
+}
+
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
+{
+ switch (RegKind) {
+ case IS_SPECIAL:
+ if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; }
+ return false;
+ case IS_VGPR:
+ case IS_SGPR:
+ case IS_TTMP:
+ if (Reg1 != Reg + RegWidth) { return false; }
+ RegWidth++;
+ return true;
+ default:
+ assert(false); return false;
+ }
+}
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth)
+{
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef RegName = Parser.getTok().getString();
+ if ((Reg = getSpecialRegForName(RegName))) {
+ Parser.Lex();
+ RegKind = IS_SPECIAL;
+ } else {
+ unsigned RegNumIndex = 0;
+ if (RegName[0] == 'v') {
+ RegNumIndex = 1;
+ RegKind = IS_VGPR;
+ } else if (RegName[0] == 's') {
+ RegNumIndex = 1;
+ RegKind = IS_SGPR;
+ } else if (RegName.startswith("ttmp")) {
+ RegNumIndex = strlen("ttmp");
+ RegKind = IS_TTMP;
+ } else {
+ return false;
+ }
+ if (RegName.size() > RegNumIndex) {
+ // Single 32-bit register: vXX.
+ if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum))
+ return false;
+ Parser.Lex();
+ RegWidth = 1;
+ } else {
+ // Range of registers: v[XX:YY]. ":YY" is optional.
+ Parser.Lex();
+ int64_t RegLo, RegHi;
+ if (getLexer().isNot(AsmToken::LBrac))
+ return false;
+ Parser.Lex();
+
+ if (getParser().parseAbsoluteExpression(RegLo))
+ return false;
+
+ const bool isRBrace = getLexer().is(AsmToken::RBrac);
+ if (!isRBrace && getLexer().isNot(AsmToken::Colon))
+ return false;
+ Parser.Lex();
+
+ if (isRBrace) {
+ RegHi = RegLo;
+ } else {
+ if (getParser().parseAbsoluteExpression(RegHi))
+ return false;
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return false;
+ Parser.Lex();
+ }
+ RegNum = (unsigned) RegLo;
+ RegWidth = (RegHi - RegLo) + 1;
+ }
+ }
+ } else if (getLexer().is(AsmToken::LBrac)) {
+ // List of consecutive registers: [s0,s1,s2,s3]
+ Parser.Lex();
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
+ return false;
+ if (RegWidth != 1)
+ return false;
+ RegisterKind RegKind1;
+ unsigned Reg1, RegNum1, RegWidth1;
+ do {
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ } else if (getLexer().is(AsmToken::RBrac)) {
+ Parser.Lex();
+ break;
+ } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) {
+ if (RegWidth1 != 1) {
+ return false;
+ }
+ if (RegKind1 != RegKind) {
+ return false;
+ }
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } while (true);
+ } else {
+ return false;
+ }
+ switch (RegKind) {
+ case IS_SPECIAL:
+ RegNum = 0;
+ RegWidth = 1;
+ break;
+ case IS_VGPR:
+ case IS_SGPR:
+ case IS_TTMP:
+ {
+ unsigned Size = 1;
+ if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+ // SGPR and TTMP registers must be aligned. The maximum required alignment is 4 dwords.
+ Size = std::min(RegWidth, 4u);
+ }
+ if (RegNum % Size != 0)
+ return false;
+ RegNum = RegNum / Size;
+ int RCID = getRegClass(RegKind, RegWidth);
+ if (RCID == -1)
+ return false;
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegNum >= RC.getNumRegs())
+ return false;
+ Reg = RC.getRegister(RegNum);
+ break;
+ }
+
+ default:
+ assert(false); return false;
+ }
+
+ if (!subtargetHasRegister(*TRI, Reg))
+ return false;
+ return true;
+}
+
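A standalone sketch of the SGPR/TTMP alignment rule applied above: the start of a range must be aligned to min(width, 4) dwords, and the index inside the width-matched register class is the start divided by that alignment.

#include <algorithm>
#include <cassert>

// Mirrors the alignment/index computation in ParseAMDGPURegister;
// returns -1 where the parser would reject the range.
static int sgprRangeToClassIndex(unsigned RegLo, unsigned RegWidth) {
  unsigned Size = std::min(RegWidth, 4u);
  if (RegLo % Size != 0)
    return -1;
  return static_cast<int>(RegLo / Size);
}

int main() {
  assert(sgprRangeToClassIndex(4, 4) == 1);  // s[4:7] -> 128-bit class, index 1
  assert(sgprRangeToClassIndex(0, 8) == 0);  // s[0:7] -> 256-bit class, index 0
  assert(sgprRangeToClassIndex(3, 2) == -1); // s[3:4] rejected: not 2-dword aligned
  return 0;
}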
+std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+ const auto &Tok = Parser.getTok();
+ SMLoc StartLoc = Tok.getLoc();
+ SMLoc EndLoc = Tok.getEndLoc();
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- StringRef RegName = Tok.getString();
- RegNo = getRegForName(RegName);
+ RegisterKind RegKind;
+ unsigned Reg, RegNum, RegWidth;
- if (RegNo) {
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
+ return nullptr;
+ }
+ return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc,
+ TRI, &getSTI(), false);
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+ bool Minus = false;
+ if (getLexer().getKind() == AsmToken::Minus) {
+ Minus = true;
Parser.Lex();
- return !subtargetHasRegister(*TRI, RegNo);
}
- // Match vgprs and sgprs
- if (RegName[0] != 's' && RegName[0] != 'v')
- return true;
+ SMLoc S = Parser.getTok().getLoc();
+ switch(getLexer().getKind()) {
+ case AsmToken::Integer: {
+ int64_t IntVal;
+ if (getParser().parseAbsoluteExpression(IntVal))
+ return MatchOperand_ParseFail;
+ if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
+ Error(S, "invalid immediate: only 32-bit values are legal");
+ return MatchOperand_ParseFail;
+ }
- bool IsVgpr = RegName[0] == 'v';
- unsigned RegWidth;
- unsigned RegIndexInClass;
- if (RegName.size() > 1) {
- // We have a 32-bit register
- RegWidth = 1;
- if (RegName.substr(1).getAsInteger(10, RegIndexInClass))
- return true;
+ if (Minus)
+ IntVal *= -1;
+ Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
+ return MatchOperand_Success;
+ }
+ case AsmToken::Real: {
+ // FIXME: We should emit an error if a double precisions floating-point
+ // FIXME: We should emit an error if a double-precision floating-point
+ // value is used. I'm not sure of the best way to detect this.
+ if (getParser().parseAbsoluteExpression(IntVal))
+ return MatchOperand_ParseFail;
+
+ APFloat F((float)BitsToDouble(IntVal));
+ if (Minus)
+ F.changeSign();
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S,
+ AMDGPUOperand::ImmTyNone, true));
+ return MatchOperand_Success;
+ }
+ default:
+ return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+ }
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+ auto res = parseImm(Operands);
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+
+ if (auto R = parseRegister()) {
+ assert(R->isReg());
+ R->Reg.IsForcedVOP3 = isForcedVOP3();
+ Operands.push_back(std::move(R));
+ return MatchOperand_Success;
+ }
+ return MatchOperand_ParseFail;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+ // XXX: During parsing we can't determine whether a minus sign means a
+ // negate modifier or a negative immediate value.
+ // By default we assume it is a modifier.
+ bool Negate = false, Abs = false, Abs2 = false;
+
+ if (getLexer().getKind()== AsmToken::Minus) {
Parser.Lex();
- } else {
- // We have a register greater than 32-bits.
+ Negate = true;
+ }
- int64_t RegLo, RegHi;
+ if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") {
Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
- return true;
+ Abs2 = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after abs");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ }
+ if (getLexer().getKind() == AsmToken::Pipe) {
+ if (Abs2) {
+ Error(Parser.getTok().getLoc(), "expected register or immediate");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- if (getParser().parseAbsoluteExpression(RegLo))
- return true;
+ Abs = true;
+ }
- if (getLexer().isNot(AsmToken::Colon))
- return true;
+ auto Res = parseRegOrImm(Operands);
+ if (Res != MatchOperand_Success) {
+ return Res;
+ }
+ AMDGPUOperand::Modifiers Mods = {false, false, false};
+ if (Negate) {
+ Mods.Neg = true;
+ }
+ if (Abs) {
+ if (getLexer().getKind() != AsmToken::Pipe) {
+ Error(Parser.getTok().getLoc(), "expected vertical bar");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- if (getParser().parseAbsoluteExpression(RegHi))
- return true;
+ Mods.Abs = true;
+ }
+ if (Abs2) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "expected closing parentheses");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Abs = true;
+ }
- if (getLexer().isNot(AsmToken::RBrac))
- return true;
+ if (Mods.hasFPModifiers()) {
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
+ }
+ return MatchOperand_Success;
+}
- Parser.Lex();
- RegWidth = (RegHi - RegLo) + 1;
- if (IsVgpr) {
- // VGPR registers aren't aligned.
- RegIndexInClass = RegLo;
- } else {
- // SGPR registers are aligned. Max alignment is 4 dwords.
- unsigned Size = std::min(RegWidth, 4u);
- if (RegLo % Size != 0)
- return true;
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+ bool Sext = false;
- RegIndexInClass = RegLo / Size;
+ if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
+ Parser.Lex();
+ Sext = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after sext");
+ return MatchOperand_ParseFail;
}
+ Parser.Lex();
}
- int RCID = getRegClass(IsVgpr, RegWidth);
- if (RCID == -1)
- return true;
-
- const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegIndexInClass >= RC.getNumRegs())
- return true;
+ auto Res = parseRegOrImm(Operands);
+ if (Res != MatchOperand_Success) {
+ return Res;
+ }
- RegNo = RC.getRegister(RegIndexInClass);
- return !subtargetHasRegister(*TRI, RegNo);
+ AMDGPUOperand::Modifiers Mods = {false, false, false};
+ if (Sext) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "expected closing parentheses");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Sext = true;
+ }
+
+ if (Mods.hasIntModifiers()) {
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
+ }
+ return MatchOperand_Success;
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -597,7 +1129,9 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
- (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
+ (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)) ||
+ (isForcedDPP() && !(TSFlags & SIInstrFlags::DPP)) ||
+ (isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)) )
return Match_InvalidOperand;
if ((TSFlags & SIInstrFlags::VOP3) &&
@@ -608,7 +1142,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
-
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -632,31 +1165,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size()) {
- if (isForcedVOP3()) {
- // If 64-bit encoding has been forced we can end up with no
- // clamp or omod operands if none of the registers have modifiers,
- // so we need to add these to the operand list.
- AMDGPUOperand &LastOp =
- ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
- if (LastOp.isRegKind() ||
- (LastOp.isImm() &&
- LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) {
- SMLoc S = Parser.getTok().getLoc();
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyClamp));
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyOMod));
- bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands,
- Out, ErrorInfo,
- MatchingInlineAsm);
- if (!Res)
- return Res;
- }
-
- }
return Error(IDLoc, "too few operands for instruction");
}
-
ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
@@ -762,164 +1272,12 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
amd_kernel_code_t &Header) {
-
- if (getLexer().isNot(AsmToken::Equal))
- return TokError("expected '='");
- Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return TokError("amd_kernel_code_t values must be integers");
-
- uint64_t Value = getLexer().getTok().getIntVal();
+ SmallString<40> ErrStr;
+ raw_svector_ostream Err(ErrStr);
+ if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
+ return TokError(Err.str());
+ }
Lex();
-
- if (ID == "kernel_code_version_major")
- Header.amd_kernel_code_version_major = Value;
- else if (ID == "kernel_code_version_minor")
- Header.amd_kernel_code_version_minor = Value;
- else if (ID == "machine_kind")
- Header.amd_machine_kind = Value;
- else if (ID == "machine_version_major")
- Header.amd_machine_version_major = Value;
- else if (ID == "machine_version_minor")
- Header.amd_machine_version_minor = Value;
- else if (ID == "machine_version_stepping")
- Header.amd_machine_version_stepping = Value;
- else if (ID == "kernel_code_entry_byte_offset")
- Header.kernel_code_entry_byte_offset = Value;
- else if (ID == "kernel_code_prefetch_byte_size")
- Header.kernel_code_prefetch_byte_size = Value;
- else if (ID == "max_scratch_backing_memory_byte_size")
- Header.max_scratch_backing_memory_byte_size = Value;
- else if (ID == "compute_pgm_rsrc1_vgprs")
- Header.compute_pgm_resource_registers |= S_00B848_VGPRS(Value);
- else if (ID == "compute_pgm_rsrc1_sgprs")
- Header.compute_pgm_resource_registers |= S_00B848_SGPRS(Value);
- else if (ID == "compute_pgm_rsrc1_priority")
- Header.compute_pgm_resource_registers |= S_00B848_PRIORITY(Value);
- else if (ID == "compute_pgm_rsrc1_float_mode")
- Header.compute_pgm_resource_registers |= S_00B848_FLOAT_MODE(Value);
- else if (ID == "compute_pgm_rsrc1_priv")
- Header.compute_pgm_resource_registers |= S_00B848_PRIV(Value);
- else if (ID == "compute_pgm_rsrc1_dx10_clamp")
- Header.compute_pgm_resource_registers |= S_00B848_DX10_CLAMP(Value);
- else if (ID == "compute_pgm_rsrc1_debug_mode")
- Header.compute_pgm_resource_registers |= S_00B848_DEBUG_MODE(Value);
- else if (ID == "compute_pgm_rsrc1_ieee_mode")
- Header.compute_pgm_resource_registers |= S_00B848_IEEE_MODE(Value);
- else if (ID == "compute_pgm_rsrc2_scratch_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_SCRATCH_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_user_sgpr")
- Header.compute_pgm_resource_registers |= (S_00B84C_USER_SGPR(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_x_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_X_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_y_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Y_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tgid_z_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Z_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tg_size_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_TG_SIZE_EN(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_tidig_comp_cnt")
- Header.compute_pgm_resource_registers |=
- (S_00B84C_TIDIG_COMP_CNT(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_excp_en_msb")
- Header.compute_pgm_resource_registers |=
- (S_00B84C_EXCP_EN_MSB(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_lds_size")
- Header.compute_pgm_resource_registers |= (S_00B84C_LDS_SIZE(Value) << 32);
- else if (ID == "compute_pgm_rsrc2_excp_en")
- Header.compute_pgm_resource_registers |= (S_00B84C_EXCP_EN(Value) << 32);
- else if (ID == "compute_pgm_resource_registers")
- Header.compute_pgm_resource_registers = Value;
- else if (ID == "enable_sgpr_private_segment_buffer")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT);
- else if (ID == "enable_sgpr_dispatch_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT);
- else if (ID == "enable_sgpr_queue_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT);
- else if (ID == "enable_sgpr_kernarg_segment_ptr")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT);
- else if (ID == "enable_sgpr_dispatch_id")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT);
- else if (ID == "enable_sgpr_flat_scratch_init")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT);
- else if (ID == "enable_sgpr_private_segment_size")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_x")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_y")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT);
- else if (ID == "enable_sgpr_grid_workgroup_count_z")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT);
- else if (ID == "enable_ordered_append_gds")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT);
- else if (ID == "private_element_size")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT);
- else if (ID == "is_ptr64")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_PTR64_SHIFT);
- else if (ID == "is_dynamic_callstack")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT);
- else if (ID == "is_debug_enabled")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT);
- else if (ID == "is_xnack_enabled")
- Header.code_properties |=
- (Value << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT);
- else if (ID == "workitem_private_segment_byte_size")
- Header.workitem_private_segment_byte_size = Value;
- else if (ID == "workgroup_group_segment_byte_size")
- Header.workgroup_group_segment_byte_size = Value;
- else if (ID == "gds_segment_byte_size")
- Header.gds_segment_byte_size = Value;
- else if (ID == "kernarg_segment_byte_size")
- Header.kernarg_segment_byte_size = Value;
- else if (ID == "workgroup_fbarrier_count")
- Header.workgroup_fbarrier_count = Value;
- else if (ID == "wavefront_sgpr_count")
- Header.wavefront_sgpr_count = Value;
- else if (ID == "workitem_vgpr_count")
- Header.workitem_vgpr_count = Value;
- else if (ID == "reserved_vgpr_first")
- Header.reserved_vgpr_first = Value;
- else if (ID == "reserved_vgpr_count")
- Header.reserved_vgpr_count = Value;
- else if (ID == "reserved_sgpr_first")
- Header.reserved_sgpr_first = Value;
- else if (ID == "reserved_sgpr_count")
- Header.reserved_sgpr_count = Value;
- else if (ID == "debug_wavefront_private_segment_offset_sgpr")
- Header.debug_wavefront_private_segment_offset_sgpr = Value;
- else if (ID == "debug_private_segment_buffer_sgpr")
- Header.debug_private_segment_buffer_sgpr = Value;
- else if (ID == "kernarg_segment_alignment")
- Header.kernarg_segment_alignment = Value;
- else if (ID == "group_segment_alignment")
- Header.group_segment_alignment = Value;
- else if (ID == "private_segment_alignment")
- Header.private_segment_alignment = Value;
- else if (ID == "wavefront_size")
- Header.wavefront_size = Value;
- else if (ID == "call_convention")
- Header.call_convention = Value;
- else if (ID == "runtime_loader_kernel_symbol")
- Header.runtime_loader_kernel_symbol = Value;
- else
- return TokError("amd_kernel_code_t value not recognized.");
-
return false;
}
@@ -930,9 +1288,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
while (true) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("amd_kernel_code_t values must begin on a new line");
-
// Lex EndOfStatement. This is in a while loop, because lexing a comment
// will set the current token to EndOfStatement.
while(getLexer().is(AsmToken::EndOfStatement))
@@ -1026,7 +1381,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".hsatext" || IDVal == ".text")
+ if (IDVal == ".hsatext")
return ParseSectionDirectiveHSAText();
if (IDVal == ".amdgpu_hsa_kernel")
@@ -1078,19 +1433,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
return true;
}
-static bool operandsHaveModifiers(const OperandVector &Operands) {
-
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
- if (Op.isRegKind() && Op.hasModifiers())
- return true;
- if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod ||
- Op.getImmTy() == AMDGPUOperand::ImmTyClamp))
- return true;
- }
- return false;
-}
-
AMDGPUAsmParser::OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
@@ -1107,113 +1449,59 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
getLexer().is(AsmToken::EndOfStatement))
return ResTy;
- bool Negate = false, Abs = false;
- if (getLexer().getKind()== AsmToken::Minus) {
- Parser.Lex();
- Negate = true;
- }
+ ResTy = parseRegOrImm(Operands);
- if (getLexer().getKind() == AsmToken::Pipe) {
- Parser.Lex();
- Abs = true;
- }
-
- switch(getLexer().getKind()) {
- case AsmToken::Integer: {
- SMLoc S = Parser.getTok().getLoc();
- int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
- return MatchOperand_ParseFail;
- if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
- Error(S, "invalid immediate: only 32-bit values are legal");
- return MatchOperand_ParseFail;
- }
-
- if (Negate)
- IntVal *= -1;
- Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
- return MatchOperand_Success;
- }
- case AsmToken::Real: {
- // FIXME: We should emit an error if a double precisions floating-point
- // value is used. I'm not sure the best way to detect this.
- SMLoc S = Parser.getTok().getLoc();
- int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
- return MatchOperand_ParseFail;
+ if (ResTy == MatchOperand_Success)
+ return ResTy;
- APFloat F((float)BitsToDouble(IntVal));
- if (Negate)
- F.changeSign();
- Operands.push_back(
- AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S));
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ // If this identifier is a symbol, we want to create an expression for it.
+    // It is a little difficult to distinguish between a symbol name and
+    // an instruction flag like 'gds'. In order to do this, we parse
+    // all tokens as expressions and then treat the symbol name as the token
+    // string when we want to interpret the operand as a token.
+ const auto &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ const MCExpr *Expr = nullptr;
+ if (!Parser.parseExpression(Expr)) {
+ Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S));
return MatchOperand_Success;
}
- case AsmToken::Identifier: {
- SMLoc S, E;
- unsigned RegNo;
- if (!ParseRegister(RegNo, S, E)) {
-
- bool HasModifiers = operandsHaveModifiers(Operands);
- unsigned Modifiers = 0;
- if (Negate)
- Modifiers |= 0x1;
-
- if (Abs) {
- if (getLexer().getKind() != AsmToken::Pipe)
- return MatchOperand_ParseFail;
- Parser.Lex();
- Modifiers |= 0x2;
- }
-
- if (Modifiers && !HasModifiers) {
- // We are adding a modifier to src1 or src2 and previous sources
- // don't have modifiers, so we need to go back and empty modifers
- // for each previous source.
- for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1;
- --PrevRegIdx) {
-
- AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]);
- RegOp.setModifiers(0);
- }
- }
-
-
- Operands.push_back(AMDGPUOperand::CreateReg(
- RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
- isForcedVOP3()));
-
- if (HasModifiers || Modifiers) {
- AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]);
- RegOp.setModifiers(Modifiers);
-
- }
- } else {
- Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(),
- S));
- Parser.Lex();
- }
- return MatchOperand_Success;
- }
- default:
- return MatchOperand_NoMatch;
+ Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc()));
+ Parser.Lex();
+ return MatchOperand_Success;
}
+ return MatchOperand_NoMatch;
}
-bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name,
- SMLoc NameLoc, OperandVector &Operands) {
-
+StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
// Clear any forced encodings from the previous instruction.
setForcedEncodingSize(0);
+ setForcedDPP(false);
+ setForcedSDWA(false);
- if (Name.endswith("_e64"))
+ if (Name.endswith("_e64")) {
setForcedEncodingSize(64);
- else if (Name.endswith("_e32"))
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_e32")) {
setForcedEncodingSize(32);
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_dpp")) {
+ setForcedDPP(true);
+ return Name.substr(0, Name.size() - 4);
+ } else if (Name.endswith("_sdwa")) {
+ setForcedSDWA(true);
+ return Name.substr(0, Name.size() - 5);
+ }
+ return Name;
+}
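As a quick reader aid (not part of the patch), the suffix handling above amounts to: strip a recognized encoding suffix, remember which encoding was forced, and match on the bare mnemonic. The mnemonics below are only examples.
// Illustrative only -- what parseMnemonicSuffix does with a few inputs:
//   "v_add_f32_e64"  -> forced 64-bit (VOP3) encoding, returns "v_add_f32"
//   "v_add_f32_e32"  -> forced 32-bit encoding,        returns "v_add_f32"
//   "v_add_f32_dpp"  -> forced DPP encoding,           returns "v_add_f32"
//   "v_mov_b32_sdwa" -> forced SDWA encoding,          returns "v_mov_b32"
//   any other name   -> no forced encoding, returned unchanged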
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
// Add the instruction mnemonic
+ Name = parseMnemonicSuffix(Name);
Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
while (!getLexer().is(AsmToken::EndOfStatement)) {
@@ -1225,20 +1513,21 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
switch (Res) {
case MatchOperand_Success: break;
- case MatchOperand_ParseFail: return Error(getLexer().getLoc(),
- "failed parsing operand.");
- case MatchOperand_NoMatch: return Error(getLexer().getLoc(),
- "not a valid operand.");
+ case MatchOperand_ParseFail:
+ Error(getLexer().getLoc(), "failed parsing operand.");
+ while (!getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.Lex();
+ }
+ return true;
+ case MatchOperand_NoMatch:
+ Error(getLexer().getLoc(), "not a valid operand.");
+ while (!getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.Lex();
+ }
+ return true;
}
}
- // Once we reach end of statement, continue parsing so we can add default
- // values for optional arguments.
- AMDGPUAsmParser::OperandMatchResultTy Res;
- while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) {
- if (Res != MatchOperand_Success)
- return Error(getLexer().getLoc(), "failed parsing operand.");
- }
return false;
}
@@ -1247,22 +1536,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
//===----------------------------------------------------------------------===//
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int,
- int64_t Default) {
-
- // We are at the end of the statement, and this is a default argument, so
- // use a default value.
- if (getLexer().is(AsmToken::EndOfStatement)) {
- Int = Default;
- return MatchOperand_Success;
- }
-
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
switch(getLexer().getKind()) {
default: return MatchOperand_NoMatch;
case AsmToken::Identifier: {
- StringRef OffsetName = Parser.getTok().getString();
- if (!OffsetName.equals(Prefix))
+ StringRef Name = Parser.getTok().getString();
+ if (!Name.equals(Prefix)) {
return MatchOperand_NoMatch;
+ }
Parser.Lex();
if (getLexer().isNot(AsmToken::Colon))
@@ -1282,16 +1563,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int,
AMDGPUAsmParser::OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy) {
+ enum AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
SMLoc S = Parser.getTok().getLoc();
- int64_t Offset = 0;
+ int64_t Value = 0;
- AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset);
+ AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
if (Res != MatchOperand_Success)
return Res;
- Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy));
+ if (ConvertResult && !ConvertResult(Value)) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy));
return MatchOperand_Success;
}
@@ -1327,101 +1613,52 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
return MatchOperand_Success;
}
-static bool operandsHasOptionalOp(const OperandVector &Operands,
- const OptionalOperand &OOp) {
- for (unsigned i = 0; i < Operands.size(); i++) {
- const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]);
- if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) ||
- (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name))
- return true;
+typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
+ OptionalImmIndexMap& OptionalIdx,
+ enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+ auto i = OptionalIdx.find(ImmT);
+ if (i != OptionalIdx.end()) {
+ unsigned Idx = i->second;
+ ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
+ } else {
+ Inst.addOperand(MCOperand::createImm(Default));
}
- return false;
}
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps,
- OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- for (const OptionalOperand &Op : OptionalOps) {
- if (operandsHasOptionalOp(Operands, Op))
- continue;
- AMDGPUAsmParser::OperandMatchResultTy Res;
- int64_t Value;
- if (Op.IsBit) {
- Res = parseNamedBit(Op.Name, Operands, Op.Type);
- if (Res == MatchOperand_NoMatch)
- continue;
- return Res;
- }
-
- Res = parseIntWithPrefix(Op.Name, Value, Op.Default);
-
- if (Res == MatchOperand_NoMatch)
- continue;
-
- if (Res != MatchOperand_Success)
- return Res;
+AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ return MatchOperand_NoMatch;
+ }
+ StringRef Tok = Parser.getTok().getString();
+ if (Tok != Prefix) {
+ return MatchOperand_NoMatch;
+ }
- if (Op.ConvertResult && !Op.ConvertResult(Value)) {
- return MatchOperand_ParseFail;
- }
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon)) {
+ return MatchOperand_ParseFail;
+ }
- Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type));
- return MatchOperand_Success;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ return MatchOperand_ParseFail;
}
- return MatchOperand_NoMatch;
+
+ Value = Parser.getTok().getString();
+ return MatchOperand_Success;
}
//===----------------------------------------------------------------------===//
// ds
//===----------------------------------------------------------------------===//
-static const OptionalOperand DSOptionalOps [] = {
- {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr},
- {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr}
-};
-
-static const OptionalOperand DSOptionalOpsOff01 [] = {
- {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr},
- {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr},
- {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr}
-};
-
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(DSOptionalOps, Operands);
-}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) {
- return parseOptionalOps(DSOptionalOpsOff01, Operands);
-}
-
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- AMDGPUAsmParser::OperandMatchResultTy Res =
- parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset);
- if (Res == MatchOperand_NoMatch) {
- Operands.push_back(AMDGPUOperand::CreateImm(0, S,
- AMDGPUOperand::ImmTyOffset));
- Res = MatchOperand_Success;
- }
- return Res;
-}
-
-bool AMDGPUOperand::isDSOffset() const {
- return isImm() && isUInt<16>(getImm());
-}
-
-bool AMDGPUOperand::isDSOffset01() const {
- return isImm() && isUInt<8>(getImm());
-}
-
void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ OptionalImmIndexMap OptionalIdx;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -1436,13 +1673,10 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
OptionalIdx[Op.getImmTy()] = i;
}
- unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0];
- unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1];
- unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS];
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
- ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0
- ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1
- ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
@@ -1469,12 +1703,11 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
- unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset];
- ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
if (!GDSOnly) {
- unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS];
- ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
}
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
@@ -1516,7 +1749,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
CntMask = 0x7;
CntShift = 4;
} else if (CntName == "lgkmcnt") {
- CntMask = 0x7;
+ CntMask = 0xf;
CntShift = 8;
} else {
return true;
@@ -1532,8 +1765,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
// Disable all counters by default.
// vmcnt [3:0]
// expcnt [6:4]
- // lgkmcnt [10:8]
- int64_t CntVal = 0x77f;
+ // lgkmcnt [11:8]
+ int64_t CntVal = 0xf7f;
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
@@ -1555,10 +1788,298 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
return MatchOperand_Success;
}
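A note on the widened field above: with vmcnt in bits [3:0], expcnt in [6:4] and lgkmcnt now in [11:8] (default 0xf7f, all counters disabled), the immediate can be packed as in this minimal sketch -- illustrative only, not part of the patch.
// Sketch of the s_waitcnt immediate layout described in the comment above.
static unsigned encodeWaitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt) {
  unsigned Enc = 0xf7f;                                   // all counters disabled
  Enc = (Enc & ~(0xfu << 0)) | ((VmCnt   & 0xfu) << 0);   // vmcnt   [3:0]
  Enc = (Enc & ~(0x7u << 4)) | ((ExpCnt  & 0x7u) << 4);   // expcnt  [6:4]
  Enc = (Enc & ~(0xfu << 8)) | ((LgkmCnt & 0xfu) << 8);   // lgkmcnt [11:8]
  return Enc;
}
// e.g. encodeWaitcnt(0, 7, 0) == 0x70, i.e. "s_waitcnt vmcnt(0) lgkmcnt(0)".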
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ if (Parser.getTok().getString() != "hwreg")
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ HwReg.IsSymbolic = true;
+ HwReg.Id = ID_UNKNOWN_;
+ const StringRef tok = Parser.getTok().getString();
+ for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+ if (tok == IdSymbolic[i]) {
+ HwReg.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ HwReg.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(HwReg.Id))
+ return true;
+ }
+
+ if (getLexer().is(AsmToken::RParen)) {
+ Parser.Lex();
+ return false;
+ }
+
+ // optional params
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Offset))
+ return true;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Width))
+ return true;
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+
+ return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ int64_t Imm16Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ switch(getLexer().getKind()) {
+ default: return MatchOperand_NoMatch;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(Imm16Val))
+ return MatchOperand_NoMatch;
+ if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+ Error(S, "invalid immediate: only 16-bit values are legal");
+      // Do not return an error code; create an imm operand anyway and proceed
+      // to the next operand, if any. That avoids unnecessary error messages.
+ }
+ break;
+
+ case AsmToken::Identifier: {
+ OperandInfoTy HwReg(ID_UNKNOWN_);
+ int64_t Offset = OFFSET_DEFAULT_;
+ int64_t Width = WIDTH_M1_DEFAULT_ + 1;
+ if (parseHwregConstruct(HwReg, Offset, Width))
+ return MatchOperand_ParseFail;
+ if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) {
+ if (HwReg.IsSymbolic)
+ Error(S, "invalid symbolic name of hardware register");
+ else
+ Error(S, "invalid code of hardware register: only 6-bit values are legal");
+ }
+ if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset))
+ Error(S, "invalid bit offset: only 5-bit values are legal");
+ if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1))
+ Error(S, "invalid bitfield width: only values from 1 to 32 are legal");
+ Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
+ }
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
+ return MatchOperand_Success;
+}
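For orientation, the hwreg() operand collapses into one 16-bit immediate: register id in the low 6 bits, bit offset in the next 5, and width-1 in the top 5. The shift constants come from llvm::AMDGPU::Hwreg; the concrete values below are an assumption based on those field widths, shown only to illustrate the packing.
// Illustrative only; assumes ID_SHIFT_ == 0, OFFSET_SHIFT_ == 6, WIDTH_M1_SHIFT_ == 11.
//   hwreg(4, 8, 8) -> 4 | (8 << 6) | ((8 - 1) << 11)  == 0x3a04
//   hwreg(4)       -> 4 | (0 << 6) | ((32 - 1) << 11) == 0xf804  (defaults: offset 0, width 32)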
+
bool AMDGPUOperand::isSWaitCnt() const {
return isImm();
}
+bool AMDGPUOperand::isHwreg() const {
+ return isImmTy(ImmTyHwreg);
+}
+
+bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ if (Parser.getTok().getString() != "sendmsg")
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ Msg.IsSymbolic = true;
+ Msg.Id = ID_UNKNOWN_;
+ const std::string tok = Parser.getTok().getString();
+ for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+ switch(i) {
+ default: continue; // Omit gaps.
+ case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break;
+ }
+ if (tok == IdSymbolic[i]) {
+ Msg.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Msg.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ return true;
+ if (getLexer().is(AsmToken::Integer))
+ if (getParser().parseAbsoluteExpression(Msg.Id))
+ Msg.Id = ID_UNKNOWN_;
+ }
+ if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
+ return false;
+
+ if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
+ Operation.Id = ID_UNKNOWN_;
+ if (getLexer().is(AsmToken::Identifier)) {
+ Operation.IsSymbolic = true;
+ const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+ const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+ const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+ const StringRef Tok = Parser.getTok().getString();
+ for (int i = F; i < L; ++i) {
+ if (Tok == S[i]) {
+ Operation.Id = i;
+ break;
+ }
+ }
+ Parser.Lex();
+ } else {
+ Operation.IsSymbolic = false;
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(Operation.Id))
+ return true;
+ }
+
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ // Stream id is optional.
+ if (getLexer().is(AsmToken::RParen)) {
+ Parser.Lex();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ Parser.Lex();
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+ if (getParser().parseAbsoluteExpression(StreamId))
+ return true;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+ Parser.Lex();
+ return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ int64_t Imm16Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ switch(getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(Imm16Val))
+ return MatchOperand_NoMatch;
+ if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+ Error(S, "invalid immediate: only 16-bit values are legal");
+      // Do not return an error code; create an imm operand anyway and proceed
+      // to the next operand, if any. That avoids unnecessary error messages.
+ }
+ break;
+ case AsmToken::Identifier: {
+ OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Operation(OP_UNKNOWN_);
+ int64_t StreamId = STREAM_ID_DEFAULT_;
+ if (parseSendMsgConstruct(Msg, Operation, StreamId))
+ return MatchOperand_ParseFail;
+ do {
+ // Validate and encode message ID.
+ if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
+ || Msg.Id == ID_SYSMSG)) {
+ if (Msg.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of message");
+ else
+ Error(S, "invalid/unsupported code of message");
+ break;
+ }
+ Imm16Val = (Msg.Id << ID_SHIFT_);
+ // Validate and encode operation ID.
+ if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
+ if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid symbolic name of GS_OP");
+ else
+ Error(S, "invalid code of GS_OP: only 2-bit values are legal");
+ break;
+ }
+ if (Operation.Id == OP_GS_NOP
+ && Msg.Id != ID_GS_DONE) {
+ Error(S, "invalid GS_OP: NOP is for GS_DONE only");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ if (Msg.Id == ID_SYSMSG) {
+ if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
+ if (Operation.IsSymbolic)
+ Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
+ else
+ Error(S, "invalid/unsupported code of SYSMSG_OP");
+ break;
+ }
+ Imm16Val |= (Operation.Id << OP_SHIFT_);
+ }
+ // Validate and encode stream ID.
+ if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+ if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
+ Error(S, "invalid stream id: only 2-bit values are legal");
+ break;
+ }
+ Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
+ }
+ } while (0);
+ }
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isSendMsg() const {
+ return isImmTy(ImmTySendMsg);
+}
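The sendmsg() operand is packed the same way: message id in the low bits, then the operation, then the stream id. The example below assumes the llvm::AMDGPU::SendMsg layout of OP_SHIFT_ == 4 and STREAM_ID_SHIFT_ == 8 and the usual ids MSG_GS == 2, GS_OP_EMIT == 2; treat the concrete numbers as an illustration, not a reference.
//   s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
//     -> Imm16Val = 2 | (2 << 4) | (0 << 8) == 0x22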
+
//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
@@ -1587,33 +2108,26 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
-// flat
+// mubuf
//===----------------------------------------------------------------------===//
-static const OptionalOperand FlatOptionalOps [] = {
- {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
-
-static const OptionalOperand FlatAtomicOptionalOps [] = {
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC);
+}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(FlatOptionalOps, Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(FlatAtomicOptionalOps, Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE);
}
-void AMDGPUAsmParser::cvtFlat(MCInst &Inst,
- const OperandVector &Operands) {
- std::map<AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
+ const OperandVector &Operands,
+ bool IsAtomic, bool IsAtomicReturn) {
+ OptionalImmIndexMap OptionalIdx;
+ assert(IsAtomicReturn ? IsAtomic : true);
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -1624,129 +2138,128 @@ void AMDGPUAsmParser::cvtFlat(MCInst &Inst,
continue;
}
- // Handle 'glc' token which is sometimes hard-coded into the
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
- if (Op.isToken())
+ if (Op.isToken()) {
continue;
+ }
+ assert(Op.isImm());
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = i;
-
}
- // flat atomic instructions don't have a glc argument.
- if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) {
- unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC];
- ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1);
+ // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
+ if (IsAtomicReturn) {
+ MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
+ Inst.insert(I, *I);
}
- unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC];
- unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE];
-
- ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ if (!IsAtomic) { // glc is hard-coded.
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
//===----------------------------------------------------------------------===//
-// mubuf
+// mimg
//===----------------------------------------------------------------------===//
-static const OptionalOperand MubufOptionalOps [] = {
- {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr},
- {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr},
- {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr}
-};
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) {
- return parseOptionalOps(MubufOptionalOps, Operands);
-}
+ OptionalImmIndexMap OptionalIdx;
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseOffset(OperandVector &Operands) {
- return parseIntWithPrefix("offset", Operands);
-}
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseGLC(OperandVector &Operands) {
- return parseNamedBit("glc", Operands);
-}
+ // Add the register arguments
+ if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
+ continue;
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
+ }
+ }
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseSLC(OperandVector &Operands) {
- return parseNamedBit("slc", Operands);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseTFE(OperandVector &Operands) {
- return parseNamedBit("tfe", Operands);
-}
+void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
-bool AMDGPUOperand::isMubufOffset() const {
- return isImm() && isUInt<12>(getImm());
-}
+ // Add src, same as dst
+ ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
-void AMDGPUAsmParser::cvtMubuf(MCInst &Inst,
- const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ OptionalImmIndexMap OptionalIdx;
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- // Handle the case where soffset is an immediate
- if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
- Op.addImmOperands(Inst, 1);
- continue;
- }
-
- // Handle tokens like 'offen' which are sometimes hard-coded into the
- // asm string. There are no MCInst operands for these.
- if (Op.isToken()) {
+ if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
continue;
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
}
- assert(Op.isImm());
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
}
- assert(OptionalIdx.size() == 4);
-
- unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset];
- unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC];
- unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC];
- unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE];
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+}
- ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask);
}
-//===----------------------------------------------------------------------===//
-// mimg
-//===----------------------------------------------------------------------===//
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
+}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseDMask(OperandVector &Operands) {
- return parseIntWithPrefix("dmask", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseUNorm(OperandVector &Operands) {
- return parseNamedBit("unorm", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128);
}
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseR128(OperandVector &Operands) {
- return parseNamedBit("r128", Operands);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE);
}
//===----------------------------------------------------------------------===//
@@ -1766,6 +2279,14 @@ bool AMDGPUOperand::isSMRDLiteralOffset() const {
return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -1792,91 +2313,435 @@ static bool ConvertOmodDiv(int64_t &Div) {
return false;
}
-static const OptionalOperand VOP3OptionalOps [] = {
- {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr},
- {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul},
- {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv},
+static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
+ if (BoundCtrl == 0) {
+ BoundCtrl = 1;
+ return true;
+ } else if (BoundCtrl == -1) {
+ BoundCtrl = 0;
+ return true;
+ }
+ return false;
+}
+
+// Note: the order in this table matches the order of operands in AsmString.
+static const OptionalOperand AMDGPUOptionalOperandTable[] = {
+ {"offen", AMDGPUOperand::ImmTyOffen, true, nullptr},
+ {"idxen", AMDGPUOperand::ImmTyIdxen, true, nullptr},
+ {"addr64", AMDGPUOperand::ImmTyAddr64, true, nullptr},
+ {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
+ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
+ {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
+ {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
+ {"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
+ {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr},
+ {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
+ {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
+ {"da", AMDGPUOperand::ImmTyDA, true, nullptr},
+ {"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
+ {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
+ {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
+ {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
+ {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
+ {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
+ {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
+ {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
+ {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
};
-static bool isVOP3(OperandVector &Operands) {
- if (operandsHaveModifiers(Operands))
- return true;
+AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+ OperandMatchResultTy res;
+ for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
+ // try to parse any optional operand here
+ if (Op.IsBit) {
+ res = parseNamedBit(Op.Name, Operands, Op.Type);
+ } else if (Op.Type == AMDGPUOperand::ImmTyOModSI) {
+ res = parseOModOperand(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstSel ||
+ Op.Type == AMDGPUOperand::ImmTySdwaSrc0Sel ||
+ Op.Type == AMDGPUOperand::ImmTySdwaSrc1Sel) {
+ res = parseSDWASel(Operands, Op.Name, Op.Type);
+ } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
+ res = parseSDWADstUnused(Operands);
+ } else {
+ res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
+ }
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+ }
+ return MatchOperand_NoMatch;
+}
- AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]);
+AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
+{
+ StringRef Name = Parser.getTok().getString();
+ if (Name == "mul") {
+ return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
+ } else if (Name == "div") {
+ return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
+ } else {
+ return MatchOperand_NoMatch;
+ }
+}
- if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID))
- return true;
+void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+ for (unsigned E = Operands.size(); I != E; ++I)
+ ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1);
+}
- if (Operands.size() >= 5)
- return true;
+void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if (TSFlags & SIInstrFlags::VOP3) {
+ cvtVOP3(Inst, Operands);
+ } else {
+ cvtId(Inst, Operands);
+ }
+}
- if (Operands.size() > 3) {
- AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]);
- if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) ||
- Src1Op.isRegClass(AMDGPU::SReg_64RegClassID)))
- return true;
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isRegOrImmWithInputMods()) {
+ // only fp modifiers allowed in VOP3
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ assert(false);
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+}
+
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isDPPCtrl() const {
+ bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
+ if (result) {
+ int64_t Imm = getImm();
+ return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
+ ((Imm >= 0x101) && (Imm <= 0x10f)) ||
+ ((Imm >= 0x111) && (Imm <= 0x11f)) ||
+ ((Imm >= 0x121) && (Imm <= 0x12f)) ||
+ (Imm == 0x130) ||
+ (Imm == 0x134) ||
+ (Imm == 0x138) ||
+ (Imm == 0x13c) ||
+ (Imm == 0x140) ||
+ (Imm == 0x141) ||
+ (Imm == 0x142) ||
+ (Imm == 0x143);
}
return false;
}
AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
-
- // The value returned by this function may change after parsing
- // an operand so store the original value here.
- bool HasModifiers = operandsHaveModifiers(Operands);
-
- bool IsVOP3 = isVOP3(Operands);
- if (HasModifiers || IsVOP3 ||
- getLexer().isNot(AsmToken::EndOfStatement) ||
- getForcedEncodingSize() == 64) {
-
- AMDGPUAsmParser::OperandMatchResultTy Res =
- parseOptionalOps(VOP3OptionalOps, Operands);
-
- if (!HasModifiers && Res == MatchOperand_Success) {
- // We have added a modifier operation, so we need to make sure all
- // previous register operands have modifiers
- for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
- if (Op.isReg())
- Op.setModifiers(0);
+AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Prefix;
+ int64_t Int;
+
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ Prefix = Parser.getTok().getString();
+ } else {
+ return MatchOperand_NoMatch;
+ }
+
+ if (Prefix == "row_mirror") {
+ Int = 0x140;
+ } else if (Prefix == "row_half_mirror") {
+ Int = 0x141;
+ } else {
+ // Check to prevent parseDPPCtrlOps from eating invalid tokens
+ if (Prefix != "quad_perm"
+ && Prefix != "row_shl"
+ && Prefix != "row_shr"
+ && Prefix != "row_ror"
+ && Prefix != "wave_shl"
+ && Prefix != "wave_rol"
+ && Prefix != "wave_shr"
+ && Prefix != "wave_ror"
+ && Prefix != "row_bcast") {
+ return MatchOperand_NoMatch;
+ }
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ if (Prefix == "quad_perm") {
+ // quad_perm:[%d,%d,%d,%d]
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int = getLexer().getTok().getIntVal();
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 2);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 4);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int += (getLexer().getTok().getIntVal() << 6);
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ } else {
+ // sel:%d
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+ Int = getLexer().getTok().getIntVal();
+
+ if (Prefix == "row_shl") {
+ Int |= 0x100;
+ } else if (Prefix == "row_shr") {
+ Int |= 0x110;
+ } else if (Prefix == "row_ror") {
+ Int |= 0x120;
+ } else if (Prefix == "wave_shl") {
+ Int = 0x130;
+ } else if (Prefix == "wave_rol") {
+ Int = 0x134;
+ } else if (Prefix == "wave_shr") {
+ Int = 0x138;
+ } else if (Prefix == "wave_ror") {
+ Int = 0x13C;
+ } else if (Prefix == "row_bcast") {
+ if (Int == 15) {
+ Int = 0x142;
+ } else if (Int == 31) {
+ Int = 0x143;
+ } else {
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ return MatchOperand_ParseFail;
}
}
- return Res;
}
- return MatchOperand_NoMatch;
+ Parser.Lex(); // eat last token
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
+ AMDGPUOperand::ImmTyDppCtrl));
+ return MatchOperand_Success;
}
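The dpp_ctrl values built above fit a single 9-bit field: quad_perm packs four 2-bit lane selects, the row shifts/rotates OR the amount into a base code, and the wave operations, mirrors and broadcasts are fixed codes. A few worked examples (illustrative only, not part of the patch):
//   quad_perm:[0,1,2,3] -> 0 | (1 << 2) | (2 << 4) | (3 << 6) == 0xe4  (identity)
//   row_shl:1           -> 0x100 | 1 == 0x101
//   wave_rol            -> 0x134
//   row_bcast:15        -> 0x142
//   row_mirror          -> 0x140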
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
+ return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
+ return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
+ return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
- unsigned i = 1;
+ unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- if (Desc.getNumDefs() > 0) {
- ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (Op.isRegOrImmWithInputMods()) {
+ // Only float modifiers supported in DPP
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isDPPCtrl()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
- if (operandsHaveModifiers(Operands)) {
- for (unsigned e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+}
- if (Op.isRegWithInputMods()) {
- ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2);
- continue;
- }
- OptionalIdx[Op.getImmTy()] = i;
- }
+//===----------------------------------------------------------------------===//
+// sdwa
+//===----------------------------------------------------------------------===//
- unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp];
- unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod];
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Value;
+ AMDGPUAsmParser::OperandMatchResultTy res;
- ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1);
- ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1);
- } else {
- for (unsigned e = Operands.size(); i != e; ++i)
- ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1);
+ res = parseStringWithPrefix(Prefix, Value);
+ if (res != MatchOperand_Success) {
+ return res;
+ }
+
+ int64_t Int;
+ Int = StringSwitch<int64_t>(Value)
+ .Case("BYTE_0", 0)
+ .Case("BYTE_1", 1)
+ .Case("BYTE_2", 2)
+ .Case("BYTE_3", 3)
+ .Case("WORD_0", 4)
+ .Case("WORD_1", 5)
+ .Case("DWORD", 6)
+ .Default(0xffffffff);
+ Parser.Lex(); // eat last token
+
+ if (Int == 0xffffffff) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type));
+ return MatchOperand_Success;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Value;
+ AMDGPUAsmParser::OperandMatchResultTy res;
+
+ res = parseStringWithPrefix("dst_unused", Value);
+ if (res != MatchOperand_Success) {
+ return res;
+ }
+
+ int64_t Int;
+ Int = StringSwitch<int64_t>(Value)
+ .Case("UNUSED_PAD", 0)
+ .Case("UNUSED_SEXT", 1)
+ .Case("UNUSED_PRESERVE", 2)
+ .Default(0xffffffff);
+ Parser.Lex(); // eat last token
+
+ if (Int == 0xffffffff) {
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(Int, S,
+ AMDGPUOperand::ImmTySdwaDstUnused));
+ return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP1);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+}
+
+void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+ uint64_t BasicInstType) {
+ OptionalImmIndexMap OptionalIdx;
+
+ unsigned I = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+ }
+
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ // Add the register arguments
+ if (BasicInstType == SIInstrFlags::VOPC &&
+ Op.isReg() &&
+ Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOPC sdwa uses the "vcc" token as dst. Skip it.
+ continue;
+ } else if (Op.isRegOrImmWithInputMods()) {
+ Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+
+ if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) {
+ // V_NOP_sdwa has no optional sdwa arguments
+ return;
+ }
+ switch (BasicInstType) {
+ case SIInstrFlags::VOP1: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ break;
+ }
+ case SIInstrFlags::VOP2: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+ }
+ case SIInstrFlags::VOPC: {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
}
}
@@ -1890,3 +2755,37 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {
#define GET_MATCHER_IMPLEMENTATION
#include "AMDGPUGenAsmMatcher.inc"
+
+// This function should be defined after the auto-generated include so that the
+// MatchClassKind enum is defined.
+unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) {
+ // Tokens like "glc" would be parsed as immediate operands in ParseOperand().
+ // But MatchInstructionImpl() expects to meet token and fails to validate
+ // operand. This method checks if we are given immediate operand but expect to
+ // get corresponding token.
+ AMDGPUOperand &Operand = (AMDGPUOperand&)Op;
+ switch (Kind) {
+ case MCK_addr64:
+ return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
+ case MCK_gds:
+ return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+ case MCK_glc:
+ return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
+ case MCK_idxen:
+ return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
+ case MCK_offen:
+ return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
+ case MCK_SSrc32:
+ // When operands have expression values, they will return true for isToken,
+ // because it is not possible to distinguish between a token and an
+ // expression at parse time. MatchInstructionImpl() will always try to
+ // match an operand as a token, when isToken returns true, and when the
+ // name of the expression is not a valid token, the match will fail,
+ // so we need to handle it here.
+ return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand;
+ case MCK_SoppBrTarget:
+ return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
+ default: return Match_InvalidOperand;
+ }
+}
diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
index 21ddc4eb83d2..70be7bb6eb36 100644
--- a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
+++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUAsmParser
AMDGPUAsmParser.cpp
)
+
+add_dependencies(LLVMAMDGPUAsmParser LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile
deleted file mode 100644
index 5ad219028036..000000000000
--- a/lib/Target/AMDGPU/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AMDGPU/AsmParser/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUAsmParser
-
-# Hack: we need to include 'main' AMDGPU target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td
index c543814cae0d..f9a9f79126bd 100644
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ b/lib/Target/AMDGPU/CIInstructions.td
@@ -25,14 +25,6 @@
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-
-def isCIVI : Predicate <
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
- "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
-
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
-
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -108,9 +100,11 @@ defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>,
// MUBUF Instructions
//===----------------------------------------------------------------------===//
+let DisableSIDecoder = 1 in {
defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>,
"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol
>;
+}
//===----------------------------------------------------------------------===//
// Flat Instructions
@@ -159,129 +153,114 @@ defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
>;
defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
- flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
+ flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat
>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+ flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32,
+ atomic_cmp_swap_flat, v2i32, VReg_64
>;
defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
- flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
+ flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat
>;
defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
- flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32
+ flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat
>;
defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
- flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32
+ flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat
>;
defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
- flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32
+ flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat
>;
defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
- flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32
+ flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat
>;
defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
- flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32
+ flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat
>;
defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
- flat<0x39, 0x48>, "flat_atomic_and", VGPR_32
+ flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat
>;
defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
- flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32
+ flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat
>;
defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
- flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32
+ flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat
>;
defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
- flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32
+ flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat
>;
defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
- flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32
+ flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat
>;
defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
- flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
+ flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64, i64, atomic_swap_flat
>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+ flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64,
+ atomic_cmp_swap_flat, v2i64, VReg_128
>;
defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
- flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
+ flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat
>;
defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
- flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64
+ flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat
>;
defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
- flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64
+ flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat
>;
defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
- flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64
+ flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat
>;
defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
- flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64
+ flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat
>;
defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
- flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64
+ flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat
>;
defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
- flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64
+ flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat
>;
defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
- flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64
+ flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat
>;
defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
- flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64
+ flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat
>;
defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
- flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64
+ flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat
>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
- flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64
+ flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat
>;
} // End SubtargetPredicate = isCIVI
// CI Only flat instructions
-let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in {
+let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in {
defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+ flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32,
+ null_frag, v2f32, VReg_64
>;
defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
- flat<0x3f>, "flat_atomic_fmin", VGPR_32
+ flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32
>;
defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
- flat<0x40>, "flat_atomic_fmax", VGPR_32
+ flat<0x40>, "flat_atomic_fmax", VGPR_32, f32
>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+ flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64,
+ null_frag, v2f64, VReg_128
>;
defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
- flat<0x5f>, "flat_atomic_fmin_x2", VReg_64
+ flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64
>;
defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
- flat<0x60>, "flat_atomic_fmax_x2", VReg_64
+ flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64
>;
-} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
-
-let Predicates = [isCI] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : Pat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isCI]
-
+} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1
//===----------------------------------------------------------------------===//
// Flat Patterns
@@ -289,12 +268,17 @@ def : Pat <
let Predicates = [isCIVI] in {
-// Patterns for global loads with no offset
+// Patterns for global loads with no offset.
class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
(inst $addr, 0, 0, 0)
>;
+class FlatLoadAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 1, 0, 0)
+>;
+
def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
@@ -303,9 +287,20 @@ def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+
+
class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
- (inst $data, $addr, 0, 0, 0)
+ (inst $addr, $data, 0, 0, 0)
+>;
+
+class FlatStoreAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  // Atomic stores follow the atomic binop convention, so the address comes
+  // first.
+ (node i64:$addr, vt:$data),
+ (inst $addr, $data, 1, 0, 0)
>;
def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
@@ -314,20 +309,41 @@ def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr, vt:$data)),
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : Pat <
+ (vt (node i64:$addr, data_vt:$data)),
(inst $addr, $data, 0, 0)
>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+
} // End Predicates = [isCIVI]
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index b9ef0e821763..45825c9cc76a 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -10,15 +10,30 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(AMDGPUCommonTableGen)
+# List of all GlobalISel files.
+set(GLOBAL_ISEL_FILES
+ AMDGPUCallLowering.cpp
+ )
+
+# Add the GlobalISel files to the build if the user wants to build them.
+if(LLVM_BUILD_GLOBAL_ISEL)
+ set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
+else()
+  set(GLOBAL_ISEL_BUILD_FILES "")
+ set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+endif()
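+# Illustrative note: configuring with -DLLVM_BUILD_GLOBAL_ISEL=ON compiles
+# AMDGPUCallLowering.cpp into AMDGPUCodeGen; otherwise the file is only listed
+# in LLVM_OPTIONAL_SOURCES (presumably so the build system does not flag it as
+# an unaccounted-for source).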
+
+
add_llvm_target(AMDGPUCodeGen
AMDILCFGStructurizer.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
AMDGPUAnnotateUniformValues.cpp
AMDGPUAsmPrinter.cpp
- AMDGPUDiagnosticInfoUnsupported.cpp
+ AMDGPUCodeGenPrepare.cpp
AMDGPUFrameLowering.cpp
AMDGPUTargetObjectFile.cpp
AMDGPUIntrinsicInfo.cpp
@@ -33,10 +48,12 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInstrInfo.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
+ GCNHazardRecognizer.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
R600ExpandSpecialInstrs.cpp
+ R600FrameLowering.cpp
R600InstrInfo.cpp
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
@@ -44,11 +61,10 @@ add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
- R600TextureIntrinsicsReplacer.cpp
SIAnnotateControlFlow.cpp
+ SIDebuggerInsertNops.cpp
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
- SIFixSGPRLiveRanges.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertWaits.cpp
@@ -62,10 +78,13 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
+ SIWholeQuadMode.cpp
+ ${GLOBAL_ISEL_BUILD_FILES}
)
add_subdirectory(AsmParser)
add_subdirectory(InstPrinter)
+add_subdirectory(Disassembler)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
add_subdirectory(Utils)
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index a6c3785c815b..98bc6e856ea2 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -51,7 +51,6 @@ def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
-defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
// RECIP_UINT emulation for Cayman
// The multiplication scales from [0,1] to the unsigned integer range
@@ -203,27 +202,53 @@ def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
//===----------------------------------------------------------------------===//
// 8-bit reads
-def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+// 16-bit reads
+def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
>;
// 32-bit reads
-def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1,
+ [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
>;
// 64-bit reads
-def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1,
+ [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
>;
// 128-bit reads
-def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1,
+ [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 8-bit reads
+def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 16-bit reads
+def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2,
+ [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2,
+ [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_ID2_128_cm : VTX_READ_128_cm <2,
+ [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
} // End isCayman
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
new file mode 100644
index 000000000000..e11de855fe5f
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -0,0 +1,437 @@
+//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the definition of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
+
+#include "AMDGPUDisassembler.h"
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-disassembler"
+
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+
+inline static MCDisassembler::DecodeStatus
+addOperand(MCInst &Inst, const MCOperand& Opnd) {
+ Inst.addOperand(Opnd);
+ return Opnd.isValid() ?
+ MCDisassembler::Success :
+ MCDisassembler::SoftFail;
+}
+
+#define DECODE_OPERAND2(RegClass, DecName) \
+static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
+ unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const void *Decoder) { \
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
+ return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+}
+
+#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+
+DECODE_OPERAND(VGPR_32)
+DECODE_OPERAND(VS_32)
+DECODE_OPERAND(VS_64)
+
+DECODE_OPERAND(VReg_64)
+DECODE_OPERAND(VReg_96)
+DECODE_OPERAND(VReg_128)
+
+DECODE_OPERAND(SReg_32)
+DECODE_OPERAND(SReg_32_XM0)
+DECODE_OPERAND(SReg_64)
+DECODE_OPERAND(SReg_128)
+DECODE_OPERAND(SReg_256)
+DECODE_OPERAND(SReg_512)
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#include "AMDGPUGenDisassemblerTables.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
+ assert(Bytes.size() >= sizeof(T));
+ const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data());
+ Bytes = Bytes.slice(sizeof(T));
+ return Res;
+}
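+// Illustrative example (not part of the decoder): given raw bytes
+// { 0x01, 0x02, 0x03, 0x04, ... }, eatBytes<uint32_t>(Bytes) returns
+// 0x04030201 (little-endian read) and shrinks Bytes past the four consumed
+// bytes, so a subsequent call continues from the fifth byte.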
+
+DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const {
+ assert(MI.getOpcode() == 0);
+ assert(MI.getNumOperands() == 0);
+ MCInst TmpInst;
+ const auto SavedBytes = Bytes;
+ if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+ MI = TmpInst;
+ return MCDisassembler::Success;
+ }
+ Bytes = SavedBytes;
+ return MCDisassembler::Fail;
+}
+
+DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes_,
+ uint64_t Address,
+ raw_ostream &WS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ // ToDo: AMDGPUDisassembler supports only VI ISA.
+ assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+
+ const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ DecodeStatus Res = MCDisassembler::Fail;
+ do {
+    // ToDo: it would be better to select the encoding length from some bit
+    // predicate, but none is known yet, so try every table we have.
+
+ // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
+ // encodings
+ if (Bytes.size() >= 8) {
+ const uint64_t QW = eatBytes<uint64_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
+ if (Res) break;
+ }
+
+ // Reinitialize Bytes as DPP64 could have eaten too much
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+ // Try decode 32-bit instruction
+ if (Bytes.size() < 4) break;
+ const uint32_t DW = eatBytes<uint32_t>(Bytes);
+ Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
+ if (Res) break;
+
+ if (Bytes.size() < 4) break;
+ const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+ Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+ } while (false);
+
+ Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+ return Res;
+}
+
+const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
+ return getContext().getRegisterInfo()->
+ getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
+}
+
+inline
+MCOperand AMDGPUDisassembler::errOperand(unsigned V,
+ const Twine& ErrMsg) const {
+ *CommentStream << "Error: " + ErrMsg;
+
+ // ToDo: add support for error operands to MCInst.h
+ // return MCOperand::createError(V);
+ return MCOperand();
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
+ return MCOperand::createReg(RegId);
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
+ unsigned Val) const {
+ const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
+ if (Val >= RegCl.getNumRegs())
+ return errOperand(Val, Twine(getRegClassName(RegClassID)) +
+ ": unknown register " + Twine(Val));
+ return createRegOperand(RegCl.getRegister(Val));
+}
+
+inline
+MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
+ unsigned Val) const {
+  // ToDo: SI/CI have 104 SGPRs, VI has 102.
+  // Here we accept as much as we can and let the assembler sort it out.
+ int shift = 0;
+ switch (SRegClassID) {
+ case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::TTMP_32RegClassID:
+ break;
+ case AMDGPU::SGPR_64RegClassID:
+ case AMDGPU::TTMP_64RegClassID:
+ shift = 1;
+ break;
+ case AMDGPU::SGPR_128RegClassID:
+ case AMDGPU::TTMP_128RegClassID:
+ // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_256RegClassID:
+ // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ case AMDGPU::SReg_512RegClassID:
+ shift = 2;
+ break;
+ // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
+ // this bundle?
+ default:
+ assert(false);
+ break;
+ }
+ if (Val % (1 << shift))
+ *CommentStream << "Warning: " << getRegClassName(SRegClassID)
+ << ": scalar reg isn't aligned " << Val;
+ return createRegOperand(SRegClassID, Val >> shift);
+}
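+// Illustrative example (assuming the standard register-class numbering): for
+// AMDGPU::SGPR_64RegClassID the shift is 1, so a raw encoding of 6 becomes
+// index 3 in the 64-bit class, i.e. the pair s[6:7]; an odd encoding such as 7
+// would emit the alignment warning above and still decode as index 7 >> 1 = 3.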
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+ return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
+  // The TableGen-generated disassembler doesn't care about operand types, only
+  // about register classes, so an SSrc_32 operand turns into SReg_32 and we
+  // therefore accept immediates and literals here as well.
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const {
+ // SReg_32_XM0 is SReg_32 without M0
+ return decodeOperand_SReg_32(Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
+ // see decodeOperand_SReg_32 comment
+ return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
+ return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+  // For now all literal constants are assumed to be unsigned 32-bit integers.
+ // ToDo: deal with signed/unsigned 64-bit integer constants
+ // ToDo: deal with float/double constants
+ if (Bytes.size() < 4)
+ return errOperand(0, "cannot read literal, inst bytes left " +
+ Twine(Bytes.size()));
+ return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+}
+
+MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
+ using namespace AMDGPU::EncValues;
+ assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
+ return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
+ (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
+ (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
+ // Cast prevents negative overflow.
+}
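+// Illustrative mapping (assuming the usual EncValues ranges of 128..208):
+// encodings 128..192 decode to the inline integers 0..64 and encodings
+// 193..208 decode to -1..-16, e.g. Imm = 129 -> 1 and Imm = 193 -> -1.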
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
+ assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
+ && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+  // ToDo: case 248: 1/(2*PI) is allowed only on VI.
+  // ToDo: AMDGPUInstPrinter does not support 1/(2*PI); it treats 1/(2*PI) as a
+  // literal constant.
+ float V = 0.0f;
+ switch (Imm) {
+ case 240: V = 0.5f; break;
+ case 241: V = -0.5f; break;
+ case 242: V = 1.0f; break;
+ case 243: V = -1.0f; break;
+ case 244: V = 2.0f; break;
+ case 245: V = -2.0f; break;
+ case 246: V = 4.0f; break;
+ case 247: V = -4.0f; break;
+ case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI)
+ 0x3e22f983 :
+ 0x3fc45f306dc9c882);
+ default: break;
+ }
+ return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
+}
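+// Illustrative example: Imm = 242 selects 1.0, so the returned immediate is
+// FloatToBits(1.0f) == 0x3f800000 for 32-bit operands and
+// DoubleToBits(1.0) == 0x3ff0000000000000 for 64-bit operands.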
+
+unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+  default: // fall through to OPW32
+ case OPW32: return VGPR_32RegClassID;
+ case OPW64: return VReg_64RegClassID;
+ case OPW128: return VReg_128RegClassID;
+ }
+}
+
+unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+  default: // fall through to OPW32
+ case OPW32: return SGPR_32RegClassID;
+ case OPW64: return SGPR_64RegClassID;
+ case OPW128: return SGPR_128RegClassID;
+ }
+}
+
+unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+  default: // fall through to OPW32
+ case OPW32: return TTMP_32RegClassID;
+ case OPW64: return TTMP_64RegClassID;
+ case OPW128: return TTMP_128RegClassID;
+ }
+}
+
+MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+ assert(Val < 512); // enum9
+
+ if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+ }
+ if (Val <= SGPR_MAX) {
+ assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+ }
+ if (TTMP_MIN <= Val && Val <= TTMP_MAX) {
+ return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
+ }
+
+ assert(Width == OPW32 || Width == OPW64);
+ const bool Is32 = (Width == OPW32);
+
+ if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
+ return decodeIntImmed(Val);
+
+ if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
+ return decodeFPImmed(Is32, Val);
+
+ if (Val == LITERAL_CONST)
+ return decodeLiteralConstant();
+
+ return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
+}
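+// Rough summary of the 9-bit source-operand encoding handled above (assuming
+// the usual EncValues layout): 256..511 select VGPRs, 0..SGPR_MAX select
+// SGPRs, TTMP_MIN..TTMP_MAX select trap-temp registers, 128..208 are inline
+// integers, 240..248 are inline floats, LITERAL_CONST (255) means a trailing
+// 32-bit literal, and anything else falls through to decodeSpecialReg32/64.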
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
+ using namespace AMDGPU;
+ switch (Val) {
+ case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI));
+ case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI));
+ // ToDo: no support for xnack_mask_lo/_hi register
+ case 104:
+ case 105: break;
+ case 106: return createRegOperand(VCC_LO);
+ case 107: return createRegOperand(VCC_HI);
+ case 108: return createRegOperand(TBA_LO);
+ case 109: return createRegOperand(TBA_HI);
+ case 110: return createRegOperand(TMA_LO);
+ case 111: return createRegOperand(TMA_HI);
+ case 124: return createRegOperand(M0);
+ case 126: return createRegOperand(EXEC_LO);
+ case 127: return createRegOperand(EXEC_HI);
+ // ToDo: no support for vccz register
+ case 251: break;
+ // ToDo: no support for execz register
+ case 252: break;
+ case 253: return createRegOperand(SCC);
+ default: break;
+ }
+ return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
+ using namespace AMDGPU;
+ switch (Val) {
+ case 102: return createRegOperand(getMCReg(FLAT_SCR, STI));
+ case 106: return createRegOperand(VCC);
+ case 108: return createRegOperand(TBA);
+ case 110: return createRegOperand(TMA);
+ case 126: return createRegOperand(EXEC);
+ default: break;
+ }
+ return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+static MCDisassembler *createAMDGPUDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new AMDGPUDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeAMDGPUDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler);
+}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
new file mode 100644
index 000000000000..dff26a044bf5
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -0,0 +1,93 @@
+//===-- AMDGPUDisassembler.h - Disassembler for AMDGPU ISA -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains the declaration of the AMDGPU ISA disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MCSubtargetInfo;
+ class Twine;
+
+ class AMDGPUDisassembler : public MCDisassembler {
+ private:
+ mutable ArrayRef<uint8_t> Bytes;
+
+ public:
+ AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+ MCDisassembler(STI, Ctx) {}
+
+ ~AMDGPUDisassembler() {}
+
+ DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &WS, raw_ostream &CS) const override;
+
+ const char* getRegClassName(unsigned RegClassID) const;
+
+ MCOperand createRegOperand(unsigned int RegId) const;
+ MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
+ MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+
+ MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const;
+
+ DecodeStatus tryDecodeInst(const uint8_t* Table,
+ MCInst &MI,
+ uint64_t Inst,
+ uint64_t Address) const;
+
+ MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_32(unsigned Val) const;
+ MCOperand decodeOperand_VS_64(unsigned Val) const;
+
+ MCOperand decodeOperand_VReg_64(unsigned Val) const;
+ MCOperand decodeOperand_VReg_96(unsigned Val) const;
+ MCOperand decodeOperand_VReg_128(unsigned Val) const;
+
+ MCOperand decodeOperand_SReg_32(unsigned Val) const;
+ MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const;
+ MCOperand decodeOperand_SReg_64(unsigned Val) const;
+ MCOperand decodeOperand_SReg_128(unsigned Val) const;
+ MCOperand decodeOperand_SReg_256(unsigned Val) const;
+ MCOperand decodeOperand_SReg_512(unsigned Val) const;
+
+ enum OpWidthTy {
+ OPW32,
+ OPW64,
+ OPW128,
+ OPW_LAST_,
+ OPW_FIRST_ = OPW32
+ };
+ unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getSgprClassId(const OpWidthTy Width) const;
+ unsigned getTtmpClassId(const OpWidthTy Width) const;
+
+ static MCOperand decodeIntImmed(unsigned Imm);
+ static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
+ MCOperand decodeLiteralConstant() const;
+
+ MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSpecialReg32(unsigned Val) const;
+ MCOperand decodeSpecialReg64(unsigned Val) const;
+ };
+} // namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
diff --git a/lib/Target/AMDGPU/Disassembler/CMakeLists.txt b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt
new file mode 100644
index 000000000000..fb9231576919
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAMDGPUDisassembler
+ AMDGPUDisassembler.cpp
+ )
+
+add_dependencies(LLVMAMDGPUDisassembler AMDGPUCommonTableGen LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt
new file mode 100644
index 000000000000..c9005f8a7884
--- /dev/null
+++ b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AMDGPU/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AMDGPUDisassembler
+parent = AMDGPU
+required_libraries = AMDGPUDesc AMDGPUInfo AMDGPUUtils MC MCDisassembler Support
+add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 2245f1417e53..94f05cc41aff 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -85,8 +85,6 @@ def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
-defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>;
-
//===----------------------------------------------------------------------===//
// Memory read/write instructions
//===----------------------------------------------------------------------===//
@@ -212,23 +210,23 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
// VTX Read from parameter memory space
//===----------------------------------------------------------------------===//
-def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3,
[(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3,
[(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3,
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3,
[(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3,
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
@@ -237,27 +235,53 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
//===----------------------------------------------------------------------===//
// 8-bit reads
-def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 16-bit reads
+def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1,
+ [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_ID1_64_eg : VTX_READ_64_eg <1,
+ [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1,
+ [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 8-bit reads
+def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))]
>;
-def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+// 16-bit reads
+def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))]
>;
// 32-bit reads
-def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2,
+ [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
// 64-bit reads
-def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2,
+ [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
// 128-bit reads
-def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2,
+ [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))]
>;
} // End Predicates = [isEG]
@@ -356,8 +380,6 @@ let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
}
-def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
-
def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
let Pattern = [];
let Itinerary = AnyALU;
@@ -372,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
def GROUP_BARRIER : InstR600 <
- (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
+ (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
R600ALU_Word0,
R600ALU_Word1_OP2 <0x54> {
@@ -401,11 +423,6 @@ def GROUP_BARRIER : InstR600 <
let ALUInst = 1;
}
-def : Pat <
- (int_AMDGPU_barrier_global),
- (GROUP_BARRIER)
->;
-
//===----------------------------------------------------------------------===//
// LDS Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
new file mode 100644
index 000000000000..29b1f79187d5
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,264 @@
+//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+ CurrCycleInstr(nullptr),
+ MF(MF),
+ ST(MF.getSubtarget<SISubtarget>()) {
+ MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+ EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+ CurrCycleInstr = MI;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ MachineInstr *MI = SU->getInstr();
+
+ if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+ return NoopHazard;
+
+ if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+ return NoopHazard;
+
+ if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
+ return NoopHazard;
+
+ return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+ return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ if (SIInstrInfo::isSMRD(*MI))
+ return std::max(0, checkSMRDHazards(MI));
+
+ if (SIInstrInfo::isVMEM(*MI))
+ return std::max(0, checkVMEMHazards(MI));
+
+ if (SIInstrInfo::isDPP(*MI))
+ return std::max(0, checkDPPHazards(MI));
+
+ return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+ EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+
+ // When the scheduler detects a stall, it will call AdvanceCycle() without
+ // emitting any instructions.
+ if (!CurrCycleInstr)
+ return;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+ // Keep track of emitted instructions
+ EmittedInstrs.push_front(CurrCycleInstr);
+
+ // Add a nullptr for each additional wait state after the first. Make sure
+ // not to add more than getMaxLookAhead() items to the list, since we
+ // truncate the list to that size right after this loop.
+ for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+ i < e; ++i) {
+ EmittedInstrs.push_front(nullptr);
+ }
+
+  // getMaxLookAhead() is the largest number of wait states we will ever need
+ // to insert, so there is no point in keeping track of more than that many
+ // wait states.
+ EmittedInstrs.resize(getMaxLookAhead());
+
+ CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+ llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+ unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ int WaitStates = -1;
+ for (MachineInstr *MI : EmittedInstrs) {
+ ++WaitStates;
+ if (!MI || !IsHazardDef(MI))
+ continue;
+ if (MI->modifiesRegister(Reg, TRI))
+ return WaitStates;
+ }
+ return std::numeric_limits<int>::max();
+}
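+// Illustrative example (hypothetical instructions): if EmittedInstrs holds,
+// most recent first, { I0, I1, I2 } and only I2 both satisfies IsHazardDef and
+// writes Reg, the function returns 2, i.e. two wait states have already
+// elapsed since the hazardous definition. If nothing in the tracked window
+// matches, INT_MAX is returned so callers end up requesting no extra noops.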
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+ std::set<unsigned> &Set) {
+ for (const MachineOperand &Op : Ops) {
+ if (Op.isReg())
+ Set.insert(Op.getReg());
+ }
+}
+
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+  // SMEM soft clauses are only present on VI+.
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return 0;
+
+ // A soft-clause is any group of consecutive SMEM instructions. The
+ // instructions in this group may return out of order and/or may be
+ // replayed (i.e. the same instruction issued more than once).
+ //
+ // In order to handle these situations correctly we need to make sure
+ // that when a clause has more than one instruction, no instruction in the
+  // clause writes to a register that is read by another instruction in the
+  // clause (including itself). If we encounter this situation, we need to
+  // break the clause by inserting a non-SMEM instruction.
+
+ std::set<unsigned> ClauseDefs;
+ std::set<unsigned> ClauseUses;
+
+ for (MachineInstr *MI : EmittedInstrs) {
+
+ // When we hit a non-SMEM instruction then we have passed the start of the
+ // clause and we can stop.
+ if (!MI || !SIInstrInfo::isSMRD(*MI))
+ break;
+
+ addRegsToSet(MI->defs(), ClauseDefs);
+ addRegsToSet(MI->uses(), ClauseUses);
+ }
+
+ if (ClauseDefs.empty())
+ return 0;
+
+ // FIXME: When we support stores, we need to make sure not to put loads and
+ // stores in the same clause if they use the same address. For now, just
+ // start a new clause whenever we see a store.
+ if (SMEM->mayStore())
+ return 1;
+
+ addRegsToSet(SMEM->defs(), ClauseDefs);
+ addRegsToSet(SMEM->uses(), ClauseUses);
+
+ std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
+ std::vector<unsigned>::iterator End;
+
+ End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
+ ClauseUses.begin(), ClauseUses.end(), Result.begin());
+
+ // If the set of defs and uses intersect then we cannot add this instruction
+ // to the clause, so we have a hazard.
+ if (End != Result.begin())
+ return 1;
+
+ return 0;
+}
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ int WaitStatesNeeded = 0;
+
+ WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+
+ // This SMRD hazard only affects SI.
+ if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+ return WaitStatesNeeded;
+
+ // A read of an SGPR by SMRD instruction requires 4 wait states when the
+ // SGPR was written by a VALU instruction.
+ int SmrdSgprWaitStates = 4;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+ for (const MachineOperand &Use : SMRD->uses()) {
+ if (!Use.isReg())
+ continue;
+ int WaitStatesNeededForUse =
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+ return WaitStatesNeeded;
+}
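+// Illustrative example (hypothetical code): on SI, if a VALU instruction that
+// writes s0 is the most recently emitted instruction and the SMRD being
+// checked reads s0, getWaitStatesSinceDef() returns 0 and 4 - 0 = 4 noops are
+// requested; with one unrelated instruction already emitted in between, only
+// 4 - 1 = 3 would be needed.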
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return 0;
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+ // SGPR was written by a VALU Instruction.
+ int VmemSgprWaitStates = 5;
+ int WaitStatesNeeded = 0;
+ auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+ for (const MachineOperand &Use : VMEM->uses()) {
+ if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+
+ int WaitStatesNeededForUse =
+ VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // Check for DPP VGPR read after VALU VGPR write.
+ int DppVgprWaitStates = 2;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Use : DPP->uses()) {
+ if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+ int WaitStatesNeededForUse =
+ DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 000000000000..d82041c5f174
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,62 @@
+//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+class SISubtarget;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+ // This variable stores the instruction that has been emitted this cycle. It
+ // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
+ // called.
+ MachineInstr *CurrCycleInstr;
+ std::list<MachineInstr*> EmittedInstrs;
+ const MachineFunction &MF;
+ const SISubtarget &ST;
+
+ int getWaitStatesSinceDef(unsigned Reg,
+ function_ref<bool(MachineInstr *)> IsHazardDef =
+ [](MachineInstr *) { return true; });
+
+ int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+ int checkSMRDHazards(MachineInstr *SMRD);
+ int checkVMEMHazards(MachineInstr* VMEM);
+ int checkDPPHazards(MachineInstr *DPP);
+public:
+ GCNHazardRecognizer(const MachineFunction &MF);
+ // We can only issue one instruction per cycle.
+ bool atIssueLimit() const override { return true; }
+ void EmitInstruction(SUnit *SU) override;
+ void EmitInstruction(MachineInstr *MI) override;
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void EmitNoop() override;
+ unsigned PreEmitNoops(SUnit *SU) override;
+ unsigned PreEmitNoops(MachineInstr *) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index a187de88f639..2932d3bb1580 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -18,6 +19,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <string>
+
using namespace llvm;
void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
@@ -28,6 +31,11 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
@@ -43,6 +51,11 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
}
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
@@ -53,22 +66,26 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
+void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo,
+ raw_ostream& O, StringRef BitName) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << ' ' << BitName;
+ }
+}
+
void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " offen";
+ printNamedBit(MI, OpNo, O, "offen");
}
void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " idxen";
+ printNamedBit(MI, OpNo, O, "idxen");
}
void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " addr64";
+ printNamedBit(MI, OpNo, O, "addr64");
}
void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
@@ -79,7 +96,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
@@ -88,7 +105,7 @@ void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset0:";
@@ -96,7 +113,7 @@ void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset1:";
@@ -104,28 +121,62 @@ void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printU32ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printU32ImmOperand(MI, OpNo, O);
+}
+
void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " gds";
+ printNamedBit(MI, OpNo, O, "gds");
}
void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " glc";
+ printNamedBit(MI, OpNo, O, "glc");
}
void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " slc";
+ printNamedBit(MI, OpNo, O, "slc");
}
void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " tfe";
+ printNamedBit(MI, OpNo, O, "tfe");
+}
+
+void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " dmask:";
+ printU16ImmOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "unorm");
+}
+
+void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "da");
+}
+
+void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "r128");
+}
+
+void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "lwe");
}
void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
@@ -152,6 +203,18 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
case AMDGPU::VCC_HI:
O << "vcc_hi";
return;
+ case AMDGPU::TBA_LO:
+ O << "tba_lo";
+ return;
+ case AMDGPU::TBA_HI:
+ O << "tba_hi";
+ return;
+ case AMDGPU::TMA_LO:
+ O << "tma_lo";
+ return;
+ case AMDGPU::TMA_HI:
+ O << "tma_hi";
+ return;
case AMDGPU::EXEC_LO:
O << "exec_lo";
return;
@@ -168,62 +231,73 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O,
break;
}
- char Type;
- unsigned NumRegs;
+ // The low 8 bits of the encoding value is the register index, for both VGPRs
+ // and SGPRs.
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+ unsigned NumRegs;
if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 1;
} else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 1;
} else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
- Type = 'v';
O << 'v';
NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) {
- Type = 's';
+ } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) {
+ O << 's';
NumRegs = 2;
} else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) {
- Type = 's';
+ } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) {
+ O << 's';
NumRegs = 4;
} else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 3;
} else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 8;
} else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 8;
} else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
- Type = 'v';
+ O << 'v';
NumRegs = 16;
} else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
- Type = 's';
+ O << 's';
NumRegs = 16;
+ } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) {
+ O << "ttmp";
+ NumRegs = 2;
+ // Trap temps start at offset 112. TODO: Get this from tablegen.
+ RegIdx -= 112;
+ } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) {
+ O << "ttmp";
+ NumRegs = 4;
+ // Trap temps start at offset 112. TODO: Get this from tablegen.
+ RegIdx -= 112;
} else {
O << getRegisterName(reg);
return;
}
- // The low 8 bits of the encoding value is the register index, for both VGPRs
- // and SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
if (NumRegs == 1) {
- O << Type << RegIdx;
+ O << RegIdx;
return;
}
- O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+ O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
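+ // Pick the mnemonic suffix for the destination from the encoding flags:
+ // _e64 for VOP3, _dpp and _sdwa for those encodings, _e32 otherwise.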
if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
O << "_e64 ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
+ O << "_dpp ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ O << "_sdwa ";
else
O << "_e32 ";
@@ -345,12 +419,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCExpr *Exp = Op.getExpr();
Exp->print(O, &MAI);
} else {
- llvm_unreachable("unknown operand type in printOperand");
+ O << "/*INV_OP*/";
}
}
-void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
if (InputModifiers & SISrcMods::NEG)
O << '-';
@@ -361,6 +436,122 @@ void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
O << '|';
}
+void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
+ unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+ if (InputModifiers & SISrcMods::SEXT)
+ O << "sext(";
+ printOperand(MI, OpNo + 1, O);
+ if (InputModifiers & SISrcMods::SEXT)
+ O << ')';
+}
+
+void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
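+ // dpp_ctrl encodings handled below: 0x000-0x0ff is a quad_perm pattern,
+ // 0x101-0x12f are row shift/rotate amounts, 0x130-0x13c are the wave-wide
+ // shifts/rotates, and 0x140-0x143 are the mirror/broadcast modes.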
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm <= 0x0ff) {
+ O << " quad_perm:[";
+ O << formatDec(Imm & 0x3) << ',';
+ O << formatDec((Imm & 0xc) >> 2) << ',';
+ O << formatDec((Imm & 0x30) >> 4) << ',';
+ O << formatDec((Imm & 0xc0) >> 6) << ']';
+ } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+ O << " row_shl:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+ O << " row_shr:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+ O << " row_ror:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if (Imm == 0x130) {
+ O << " wave_shl:1";
+ } else if (Imm == 0x134) {
+ O << " wave_rol:1";
+ } else if (Imm == 0x138) {
+ O << " wave_shr:1";
+ } else if (Imm == 0x13c) {
+ O << " wave_ror:1";
+ } else if (Imm == 0x140) {
+ O << " row_mirror";
+ } else if (Imm == 0x141) {
+ O << " row_half_mirror";
+ } else if (Imm == 0x142) {
+ O << " row_bcast:15";
+ } else if (Imm == 0x143) {
+ O << " row_bcast:31";
+ } else {
+ llvm_unreachable("Invalid dpp_ctrl value");
+ }
+}
+
+void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " row_mask:";
+ printU4ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " bank_mask:";
+ printU4ImmOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm) {
+ O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+ }
+}
+
+void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
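+ // SDWA sub-dword select: values 0-3 pick a byte, 4-5 pick a word,
+ // and 6 selects the full dword.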
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ switch (Imm) {
+ case 0: O << "BYTE_0"; break;
+ case 1: O << "BYTE_1"; break;
+ case 2: O << "BYTE_2"; break;
+ case 3: O << "BYTE_3"; break;
+ case 4: O << "WORD_0"; break;
+ case 5: O << "WORD_1"; break;
+ case 6: O << "DWORD"; break;
+ default: llvm_unreachable("Invalid SDWA data select operand");
+ }
+}
+
+void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "dst_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "src0_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "src1_sel:";
+ printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << "dst_unused:";
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ switch (Imm) {
+ case 0: O << "UNUSED_PAD"; break;
+ case 1: O << "UNUSED_SEXT"; break;
+ case 2: O << "UNUSED_PRESERVE"; break;
+ default: llvm_unreachable("Invalid SDWA dest_unused operand");
+ }
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
@@ -395,9 +586,17 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, char Asm) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm());
+ if (Op.getImm() == 1)
+ O << Asm;
+}
+
void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "|");
+ printIfSet(MI, OpNo, O, '|');
}
void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
@@ -424,8 +623,15 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- int32_t Imm = MI->getOperand(OpNo).getImm();
- O << Imm << '(' << BitsToFloat(Imm) << ')';
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() || Op.isExpr());
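+ // Immediates are printed as the raw value followed by their float
+ // reinterpretation; unresolved expressions are printed with an '@' prefix.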
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
+ }
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O << '@', &MAI);
+ }
}
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
@@ -435,7 +641,7 @@ void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "-");
+ printIfSet(MI, OpNo, O, '-');
}
void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
@@ -456,7 +662,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, "+");
+ printIfSet(MI, OpNo, O, '+');
}
void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
@@ -585,43 +791,49 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- unsigned SImm16 = MI->getOperand(OpNo).getImm();
- unsigned Msg = SImm16 & 0xF;
- if (Msg == 2 || Msg == 3) {
- unsigned Op = (SImm16 >> 4) & 0xF;
- if (Msg == 3)
- O << "Gs_done(";
- else
- O << "Gs(";
- if (Op == 0) {
- O << "nop";
- } else {
- unsigned Stream = (SImm16 >> 8) & 0x3;
- if (Op == 1)
- O << "cut";
- else if (Op == 2)
- O << "emit";
- else if (Op == 3)
- O << "emit-cut";
- O << " stream " << Stream;
+ using namespace llvm::AMDGPU::SendMsg;
+
+ const unsigned SImm16 = MI->getOperand(OpNo).getImm();
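+ // simm16 fields: the message id lives in the low 4 bits, the GS op starts
+ // at bit 4, and the stream id starts at bit 8.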
+ const unsigned Id = SImm16 & ID_MASK_;
+ do {
+ if (Id == ID_INTERRUPT) {
+ if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ')';
+ return;
}
- O << "), [m0] ";
- } else if (Msg == 1)
- O << "interrupt ";
- else if (Msg == 15)
- O << "system ";
- else
- O << "unknown(" << Msg << ") ";
+ if (Id == ID_GS || Id == ID_GS_DONE) {
+ if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
+ const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+ if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
+ break;
+ if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
+ if (OpGs != OP_GS_NOP) { O << ", " << StreamId; }
+ O << ')';
+ return;
+ }
+ if (Id == ID_SYSMSG) {
+ if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
+ break;
+ const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
+ if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
+ break;
+ O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
+ return;
+ }
+ } while (0);
+ O << SImm16; // Unknown simm16 code.
}
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
- // SIInsertWaits.cpp bits usage does not match ISA docs description but it
- // works so it might be a misprint in docs.
unsigned SImm16 = MI->getOperand(OpNo).getImm();
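+ // s_waitcnt simm16 fields: vmcnt in bits [3:0], expcnt in bits [6:4] and
+ // lgkmcnt in bits [11:8]; a count left at its maximum is not printed.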
unsigned Vmcnt = SImm16 & 0xF;
- unsigned Expcnt = (SImm16 >> 4) & 0xF;
+ unsigned Expcnt = (SImm16 >> 4) & 0x7;
unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
bool NeedSpace = false;
@@ -638,11 +850,32 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
NeedSpace = true;
}
- if (Lgkmcnt != 0x7) {
+ if (Lgkmcnt != 0xF) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
}
}
+void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Hwreg;
+
+ unsigned SImm16 = MI->getOperand(OpNo).getImm();
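+ // simm16 packs the hardware register id, the starting bit offset and the
+ // field width minus one; known ids are printed with their symbolic name.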
+ const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
+ const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
+ const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+
+ O << "hwreg(";
+ if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+ O << IdSymbolic[Id];
+ } else {
+ O << Id;
+ }
+ if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
+ O << ", " << Offset << ", " << Width;
+ }
+ O << ')';
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 90541d86132d..f5a290f16045 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -10,8 +10,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
-#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
@@ -33,37 +33,60 @@ public:
const MCRegisterInfo &MRI);
private:
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef BitName);
void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printImmediate32(uint32_t I, raw_ostream &O);
void printImmediate64(uint64_t I, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef Asm, StringRef Default = "");
+ static void printIfSet(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O, char Asm);
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -79,6 +102,7 @@ private:
static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
index ce63bd553b9c..7191ff2c4577 100644
--- a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
+++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUAsmPrinter
AMDGPUInstPrinter.cpp
)
+
+add_dependencies(LLVMAMDGPUAsmPrinter LLVMAMDGPUUtils)
diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
index fdb43844dc63..30c2670316c8 100644
--- a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt
@@ -19,6 +19,6 @@
type = Library
name = AMDGPUAsmPrinter
parent = AMDGPU
-required_libraries = MC Support
+required_libraries = MC Support AMDGPUUtils
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile
deleted file mode 100644
index 4e48ac7e28a9..000000000000
--- a/lib/Target/AMDGPU/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUAsmPrinter
-
-# Hack: we need to include 'main' x86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index 38c5489586f1..bbdd17737cf0 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===;
+;===- ./lib/Target/AMDGPU/LLVMBuild.txt ------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo Utils
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils
[component_0]
type = TargetGroup
@@ -24,10 +24,11 @@ name = AMDGPU
parent = Target
has_asmparser = 1
has_asmprinter = 1
+has_disassembler = 1
[component_1]
type = Library
name = AMDGPUCodeGen
parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 60e8c8f3d303..1cb9d21408c6 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -53,7 +53,8 @@ public:
const MCAsmLayout &Layout) const override {
return false;
}
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
assert(!"Not implemented");
}
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
@@ -73,12 +74,17 @@ void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm,
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
+ case FK_SecRel_1:
case FK_Data_1:
return 1;
+ case FK_SecRel_2:
case FK_Data_2:
return 2;
+ case FK_SecRel_4:
case FK_Data_4:
+ case FK_PCRel_4:
return 4;
+ case FK_SecRel_8:
case FK_Data_8:
return 8;
default:
@@ -92,32 +98,15 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
switch ((unsigned)Fixup.getKind()) {
case AMDGPU::fixup_si_sopp_br: {
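+ // SOPP branch offsets are encoded as a signed dword count relative to the
+ // instruction following the branch, hence the -4 and /4 below.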
+ int64_t BrImm = ((int64_t)Value - 4) / 4;
+ if (!isInt<16>(BrImm))
+ report_fatal_error("branch size exceeds simm16");
+
uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
- *Dst = (Value - 4) / 4;
+ *Dst = BrImm;
break;
}
- case AMDGPU::fixup_si_rodata: {
- uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- // We emit constant data at the end of the text section and generate its
- // address using the following code sequence:
- // s_getpc_b64 s[0:1]
- // s_add_u32 s0, s0, $symbol
- // s_addc_u32 s1, s1, 0
- //
- // s_getpc_b64 returns the address of the s_add_u32 instruction and then
- // the fixup replaces $symbol with a literal constant, which is a
- // pc-relative offset from the encoding of the $symbol operand to the
- // constant data.
- //
- // What we want here is an offset from the start of the s_add_u32
- // instruction to the constant data, but since the encoding of $symbol
- // starts 4 bytes after the start of the add instruction, we end up
- // with an offset that is 4 bytes too small. This requires us to
- // add 4 to the fixup value before applying it.
- *Dst = Value + 4;
- break;
- }
default: {
// FIXME: Copied from AArch64
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
@@ -144,7 +133,6 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
};
if (Kind < FirstTargetFixupKind)
@@ -167,13 +155,15 @@ namespace {
class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
+ bool HasRelocationAddend;
public:
- ELFAMDGPUAsmBackend(const Target &T, bool Is64Bit) :
- AMDGPUAsmBackend(T), Is64Bit(Is64Bit) { }
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+ AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { }
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OS);
+ return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS);
}
};
@@ -182,8 +172,6 @@ public:
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU) {
- Triple TargetTriple(TT);
-
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, TargetTriple.getArch() == Triple::amdgcn);
+ return new ELFAMDGPUAsmBackend(T, TT);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 820f17df8960..b4e3b8e896bd 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -18,23 +18,56 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit);
+ AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override {
- return Fixup.getKind();
- }
-
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
} // End anonymous namespace
-AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit)
- : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA,
- ELF::EM_AMDGPU, false) { }
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend)
+ : MCELFObjectTargetWriter(Is64Bit,
+ ELF::ELFOSABI_AMDGPU_HSA,
+ ELF::EM_AMDGPU,
+ HasRelocationAddend) { }
+
+unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+ // the scratch buffer.
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+ return ELF::R_AMDGPU_ABS32_LO;
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+ return ELF::R_AMDGPU_ABS32_HI;
+
+ switch (Target.getAccessVariant()) {
+ default:
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ return ELF::R_AMDGPU_GOTPCREL;
+ }
+
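+ // Otherwise derive the relocation from the fixup kind: PC-relative data
+ // becomes REL32 and section-relative data becomes ABS32.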
+ switch (Fixup.getKind()) {
+ default: break;
+ case FK_PCRel_4:
+ return ELF::R_AMDGPU_REL32;
+ case FK_SecRel_4:
+ return ELF::R_AMDGPU_ABS32;
+ }
+
+ llvm_unreachable("unhandled relocation type");
+}
+
-MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, raw_pwrite_stream &OS) {
- MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(Is64Bit);
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend,
+ raw_pwrite_stream &OS) {
+ MCELFObjectTargetWriter *MOTW =
+ new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend);
return createELFObjectWriter(MOTW, OS, true);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 9ff9fe794d2b..43338a5bebd2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -12,11 +12,6 @@
using namespace llvm;
-void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
- // Start with the .hsatext section by default.
- SwitchSection(AMDGPU::getHSATextSection(getContext()));
-}
-
MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
MCAsmBackend &MAB,
raw_pwrite_stream &OS,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 488d7e74d741..5319b65d65f9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -1,4 +1,4 @@
-//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -29,7 +29,6 @@ public:
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, MAB, OS, Emitter) { }
- virtual void InitSections(bool NoExecStac) override;
};
MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 59a9178082f6..20c1adfbc6b9 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
@@ -18,9 +18,6 @@ enum Fixups {
/// 16-bit PC relative fixup for SOPP branch instructions.
fixup_si_sopp_br = FirstTargetFixupKind,
- /// fixup for global addresses with constant initializers
- fixup_si_rodata,
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 4bc80a028936..1655591abf39 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -9,12 +9,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
+
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
- MaxInstLength = 16;
+ MinInstAlignment = 4;
+ MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
SeparatorString = "\n";
CommentString = ";";
PrivateLabelPrefix = "";
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index a546961705d7..8cb33a3179cd 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index c95742762233..c942ea904085 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index f70409470276..a0d9aab114fc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "SIDefines.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -56,15 +55,6 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -99,7 +89,6 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
- TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo);
TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 5d1b86b8c0c2..9ab7940812ba 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -13,13 +13,13 @@
//===----------------------------------------------------------------------===//
//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
-#include "llvm/ADT/StringRef.h"
namespace llvm {
+class StringRef;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -47,6 +47,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU);
MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
+ bool HasRelocationAddend,
raw_pwrite_stream &OS);
} // End llvm namespace
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index b91134d2ee9b..83dcaacb738f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -312,10 +312,6 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- // The MCObjectFileInfo that is available to the assembler is a generic
- // implementation and not AMDGPUHSATargetObjectFile, so we can't use
- // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection.
- OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext()));
OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 83bb728f541c..b3d59e8f396e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -7,16 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+
namespace llvm {
class MCELFStreamer;
+class MCSymbol;
class AMDGPUTargetStreamer : public MCTargetStreamer {
public:
diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile
deleted file mode 100644
index 5ad68662d98c..000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 3c1142dd664b..5e8e6ceb7ca2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -51,12 +52,9 @@ public:
const MCSubtargetInfo &STI) const override;
private:
- void EmitByte(unsigned int byte, raw_ostream &OS) const;
-
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
- unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
};
@@ -142,10 +140,6 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
}
-void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
- OS.write((uint8_t) Byte & 0xff);
-}
-
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
support::endian::Writer<support::little>(OS).write(Value);
}
@@ -154,17 +148,13 @@ void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
support::endian::Writer<support::little>(OS).write(Value);
}
-unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
- return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
-}
-
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
}
uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixup,
+ SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
@@ -172,6 +162,18 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
return getHWReg(MO.getReg());
}
+ if (MO.isExpr()) {
+ // We put rodata at the end of the code section, then map the entire
+ // code section as a vtx buf. Thus the section-relative address is the
+ // correct one.
+ // Each R600 literal instruction has two operands. We can't easily get
+ // the order of the current one, so compare against the first one and
+ // adjust the offset.
+ const unsigned offset = (&MO == &MI.getOperand(0)) ? 0 : 4;
+ Fixups.push_back(MCFixup::create(offset, MO.getExpr(), FK_SecRel_4, MI.getLoc()));
+ return 0;
+ }
+
assert(MO.isImm());
return MO.getImm();
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 9eb3dadbc5e2..71b585c25ac5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -162,20 +162,30 @@ static uint32_t getLit64Encoding(uint64_t Val) {
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
unsigned OpSize) const {
- if (MO.isExpr())
- return 255;
- assert(!MO.isFPImm());
+ int64_t Imm;
+ if (MO.isExpr()) {
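+ // A constant expression can still be folded into an inline constant or
+ // literal encoding; any other expression has to take the literal slot.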
+ const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+ if (!C)
+ return 255;
+
+ Imm = C->getValue();
+ } else {
- if (!MO.isImm())
- return ~0;
+ assert(!MO.isFPImm());
+
+ if (!MO.isImm())
+ return ~0;
+
+ Imm = MO.getImm();
+ }
if (OpSize == 4)
- return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
+ return getLit32Encoding(static_cast<uint32_t>(Imm));
assert(OpSize == 8);
- return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
+ return getLit64Encoding(static_cast<uint64_t>(Imm));
}
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -213,7 +223,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (Op.isImm())
Imm = Op.getImm();
- else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+ else if (Op.isExpr()) {
+ if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+ Imm = C->getValue();
+
+ } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
llvm_unreachable("Must be immediate or expr");
for (unsigned j = 0; j < 4; j++) {
@@ -247,10 +261,14 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
if (MO.isReg())
return MRI.getEncodingValue(MO.getReg());
- if (MO.isExpr()) {
- const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
- MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
- Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
+ if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
+ const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ MCFixupKind Kind;
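+ // External symbols get an absolute 32-bit data fixup; everything else is
+ // resolved PC-relatively.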
+ if (Expr && Expr->getSymbol().isExternal())
+ Kind = FK_Data_4;
+ else
+ Kind = FK_PCRel_4;
+ Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
}
// Figure out the operand number, needed for isSrcOperand check
diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile
deleted file mode 100644
index 219f34daa24f..000000000000
--- a/lib/Target/AMDGPU/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAMDGPUCodeGen
-TARGET = AMDGPU
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
- AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \
- AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
- AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
- AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc
-
-DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc Utils
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 4300d972d46b..f5f1eb14e993 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -13,11 +13,8 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Featur
//===----------------------------------------------------------------------===//
// R600
//===----------------------------------------------------------------------===//
-def : Proc<"", R600_VLIW5_Itin,
- [FeatureR600, FeatureVertexCache]>;
-
def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>;
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
def : Proc<"r630", R600_VLIW5_Itin,
[FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
@@ -84,11 +81,11 @@ def : Proc<"cayman", R600_VLIW4_Itin,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"SI", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32]
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
>;
-def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32]
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
>;
def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
@@ -116,8 +113,8 @@ def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
>;
def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32,
- FeatureISAVersion7_0_1]
+ [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops,
+ FeatureLDSBankCount32, FeatureISAVersion7_0_1]
>;
def : ProcessorModel<"mullins", SIQuarterSpeedModel,
@@ -148,3 +145,11 @@ def : ProcessorModel<"fiji", SIQuarterSpeedModel,
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
>;
+
+def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
+>;
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 3cb90218a7d5..3ccde79e2df4 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -31,8 +31,8 @@ using namespace llvm;
namespace {
-static bool isCFAlu(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isCFAlu(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case AMDGPU::CF_ALU:
case AMDGPU::CF_ALU_PUSH_BEFORE:
return true;
@@ -47,19 +47,19 @@ private:
static char ID;
const R600InstrInfo *TII;
- unsigned getCFAluSize(const MachineInstr *MI) const;
- bool isCFAluEnabled(const MachineInstr *MI) const;
+ unsigned getCFAluSize(const MachineInstr &MI) const;
+ bool isCFAluEnabled(const MachineInstr &MI) const;
/// The IfCvt pass can generate "disabled" ALU clause markers that need to be
/// removed and their content merged into the previous ALU clause.
/// This function parses instructions after a CFAlu until it finds a disabled
/// CFAlu and merges its content, or an enabled CFAlu.
- void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+ void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const;
/// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
/// it is the case.
- bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
- const;
+ bool mergeIfPossible(MachineInstr &RootCFAlu,
+ const MachineInstr &LatrCFAlu) const;
public:
R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
@@ -71,38 +71,40 @@ public:
char R600ClauseMergePass::ID = 0;
-unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
assert(isCFAlu(MI));
- return MI->getOperand(
- TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+ return MI
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+ .getImm();
}
-bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
assert(isCFAlu(MI));
- return MI->getOperand(
- TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+ return MI
+ .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+ .getImm();
}
-void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
- const {
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
+ MachineInstr &CFAlu) const {
int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
- MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+ MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
I++;
do {
- while (I!= E && !isCFAlu(I))
+ while (I != E && !isCFAlu(*I))
I++;
if (I == E)
return;
- MachineInstr *MI = I++;
+ MachineInstr &MI = *I++;
if (isCFAluEnabled(MI))
break;
- CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
- MI->eraseFromParent();
+ CFAlu.getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+ MI.eraseFromParent();
} while (I != E);
}
-bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
- const MachineInstr *LatrCFAlu) const {
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
+ const MachineInstr &LatrCFAlu) const {
assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
unsigned RootInstCount = getCFAluSize(RootCFAlu),
@@ -112,7 +114,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
DEBUG(dbgs() << "Excess inst counts\n");
return false;
}
- if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
return false;
// Is KCache Bank 0 compatible ?
int Mode0Idx =
@@ -121,12 +123,12 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
int KBank0LineIdx =
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
- if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
- RootCFAlu->getOperand(Mode0Idx).getImm() &&
- (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
- RootCFAlu->getOperand(KBank0Idx).getImm() ||
- LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
- RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+ if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
+ RootCFAlu.getOperand(Mode0Idx).getImm() &&
+ (LatrCFAlu.getOperand(KBank0Idx).getImm() !=
+ RootCFAlu.getOperand(KBank0Idx).getImm() ||
+ LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
+ RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
@@ -137,56 +139,61 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
int KBank1LineIdx =
TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
- if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
- RootCFAlu->getOperand(Mode1Idx).getImm() &&
- (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
- RootCFAlu->getOperand(KBank1Idx).getImm() ||
- LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
- RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+ if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
+ RootCFAlu.getOperand(Mode1Idx).getImm() &&
+ (LatrCFAlu.getOperand(KBank1Idx).getImm() !=
+ RootCFAlu.getOperand(KBank1Idx).getImm() ||
+ LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
+ RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
DEBUG(dbgs() << "Wrong KC0\n");
return false;
}
- if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
- RootCFAlu->getOperand(Mode0Idx).setImm(
- LatrCFAlu->getOperand(Mode0Idx).getImm());
- RootCFAlu->getOperand(KBank0Idx).setImm(
- LatrCFAlu->getOperand(KBank0Idx).getImm());
- RootCFAlu->getOperand(KBank0LineIdx).setImm(
- LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+ if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
+ RootCFAlu.getOperand(Mode0Idx).setImm(
+ LatrCFAlu.getOperand(Mode0Idx).getImm());
+ RootCFAlu.getOperand(KBank0Idx).setImm(
+ LatrCFAlu.getOperand(KBank0Idx).getImm());
+ RootCFAlu.getOperand(KBank0LineIdx)
+ .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm());
}
- if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
- RootCFAlu->getOperand(Mode1Idx).setImm(
- LatrCFAlu->getOperand(Mode1Idx).getImm());
- RootCFAlu->getOperand(KBank1Idx).setImm(
- LatrCFAlu->getOperand(KBank1Idx).getImm());
- RootCFAlu->getOperand(KBank1LineIdx).setImm(
- LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+ if (LatrCFAlu.getOperand(Mode1Idx).getImm()) {
+ RootCFAlu.getOperand(Mode1Idx).setImm(
+ LatrCFAlu.getOperand(Mode1Idx).getImm());
+ RootCFAlu.getOperand(KBank1Idx).setImm(
+ LatrCFAlu.getOperand(KBank1Idx).getImm());
+ RootCFAlu.getOperand(KBank1LineIdx)
+ .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm());
}
- RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
- RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+ RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts);
+ RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode()));
return true;
}
bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
+
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
MachineBasicBlock::iterator LatestCFAlu = E;
while (I != E) {
- MachineInstr *MI = I++;
+ MachineInstr &MI = *I++;
if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
- TII->mustBeLastInClause(MI->getOpcode()))
+ TII->mustBeLastInClause(MI.getOpcode()))
LatestCFAlu = E;
if (!isCFAlu(MI))
continue;
cleanPotentialDisabledCFAlu(MI);
- if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
- MI->eraseFromParent();
+ if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) {
+ MI.eraseFromParent();
} else {
- assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+ assert(MI.getOperand(8).getImm() && "CF ALU instruction disabled");
LatestCFAlu = MI;
}
}
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index bd80bb211b4f..d5bda4a8303e 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -39,16 +39,16 @@ struct CFStack {
FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
};
- const AMDGPUSubtarget *ST;
+ const R600Subtarget *ST;
std::vector<StackItem> BranchStack;
std::vector<StackItem> LoopStack;
unsigned MaxStackSize;
unsigned CurrentEntries;
unsigned CurrentSubEntries;
- CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
+ CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
// We need to reserve a stack entry for CALL_FS in vertex shaders.
- MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+ MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
CurrentEntries(0), CurrentSubEntries(0) { }
unsigned getLoopDepth();
@@ -119,7 +119,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
assert(!ST->hasCaymanISA());
- if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
+ if (ST->getGeneration() <= R600Subtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -142,8 +142,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
}
void CFStack::updateMaxStackSize() {
- unsigned CurrentStackSize = CurrentEntries +
- (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+ unsigned CurrentStackSize =
+ CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
@@ -159,7 +159,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+ ST->getGeneration() > R600Subtarget::EVERGREEN &&
!ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
@@ -220,10 +220,10 @@ private:
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
unsigned MaxFetchInst;
- const AMDGPUSubtarget *ST;
+ const R600Subtarget *ST;
- bool IsTrivialInst(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ bool IsTrivialInst(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::KILL:
case AMDGPU::RETURN:
return true;
@@ -234,7 +234,7 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -278,11 +278,12 @@ private:
return TII->get(Opcode);
}
- bool isCompatibleWithClause(const MachineInstr *MI,
- std::set<unsigned> &DstRegs) const {
+ bool isCompatibleWithClause(const MachineInstr &MI,
+ std::set<unsigned> &DstRegs) const {
unsigned DstMI, SrcMI;
- for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
- E = MI->operands_end(); I != E; ++I) {
+ for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+ E = MI.operands_end();
+ I != E; ++I) {
const MachineOperand &MO = *I;
if (!MO.isReg())
continue;
@@ -318,20 +319,20 @@ private:
MachineBasicBlock::iterator ClauseHead = I;
std::vector<MachineInstr *> ClauseContent;
unsigned AluInstCount = 0;
- bool IsTex = TII->usesTextureCache(ClauseHead);
+ bool IsTex = TII->usesTextureCache(*ClauseHead);
std::set<unsigned> DstRegs;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
- if (IsTrivialInst(I))
+ if (IsTrivialInst(*I))
continue;
if (AluInstCount >= MaxFetchInst)
break;
- if ((IsTex && !TII->usesTextureCache(I)) ||
- (!IsTex && !TII->usesVertexCache(I)))
+ if ((IsTex && !TII->usesTextureCache(*I)) ||
+ (!IsTex && !TII->usesVertexCache(*I)))
break;
- if (!isCompatibleWithClause(I, DstRegs))
+ if (!isCompatibleWithClause(*I, DstRegs))
break;
AluInstCount ++;
- ClauseContent.push_back(I);
+ ClauseContent.push_back(&*I);
}
MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
getHWInstrDesc(IsTex?CF_TC:CF_VC))
@@ -340,28 +341,37 @@ private:
return ClauseFile(MIb, std::move(ClauseContent));
}
- void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
+ void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
static const unsigned LiteralRegs[] = {
AMDGPU::ALU_LITERAL_X,
AMDGPU::ALU_LITERAL_Y,
AMDGPU::ALU_LITERAL_Z,
AMDGPU::ALU_LITERAL_W
};
- const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
TII->getSrcs(MI);
- for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
- if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
+ for (const auto &Src:Srcs) {
+ if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
continue;
- int64_t Imm = Srcs[i].second;
- std::vector<int64_t>::iterator It =
- std::find(Lits.begin(), Lits.end(), Imm);
+ int64_t Imm = Src.second;
+ std::vector<MachineOperand*>::iterator It =
+ std::find_if(Lits.begin(), Lits.end(),
+ [&](MachineOperand* val)
+ { return val->isImm() && (val->getImm() == Imm);});
+
+ // Get corresponding Operand
+ MachineOperand &Operand = MI.getOperand(
+ TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+
if (It != Lits.end()) {
+ // Reuse existing literal reg
unsigned Index = It - Lits.begin();
- Srcs[i].first->setReg(LiteralRegs[Index]);
+ Src.first->setReg(LiteralRegs[Index]);
} else {
+ // Allocate new literal reg
assert(Lits.size() < 4 && "Too many literals in Instruction Group");
- Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
- Lits.push_back(Imm);
+ Src.first->setReg(LiteralRegs[Lits.size()]);
+ Lits.push_back(&Operand);
}
}
}
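getLiteral now records MachineOperand pointers rather than raw immediates so that global-address literals survive until MakeALUClause emits them. A sketch of the slot-reuse rule above, under the assumption that only immediate literals are ever shared (a global-address operand never satisfies the isImm() check in the find_if, so it always claims a fresh ALU_LITERAL slot):

  #include "llvm/CodeGen/MachineOperand.h"
  #include <cstdint>
  #include <vector>

  // Returns the index of an existing literal slot already holding Imm, or -1
  // if the caller must allocate LiteralRegs[Lits.size()] for it.
  static int findLiteralSlot(const std::vector<llvm::MachineOperand *> &Lits,
                             int64_t Imm) {
    for (unsigned i = 0, e = Lits.size(); i != e; ++i)
      if (Lits[i]->isImm() && Lits[i]->getImm() == Imm)
        return int(i);
    return -1;
  }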
@@ -384,56 +394,66 @@ private:
ClauseFile
MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
const {
- MachineBasicBlock::iterator ClauseHead = I;
+ MachineInstr &ClauseHead = *I;
std::vector<MachineInstr *> ClauseContent;
I++;
for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
- if (IsTrivialInst(I)) {
+ if (IsTrivialInst(*I)) {
++I;
continue;
}
if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
break;
- std::vector<int64_t> Literals;
+ std::vector<MachineOperand *>Literals;
if (I->isBundle()) {
- MachineInstr *DeleteMI = I;
+ MachineInstr &DeleteMI = *I;
MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
while (++BI != E && BI->isBundledWithPred()) {
BI->unbundleFromPred();
- for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = BI->getOperand(i);
+ for (MachineOperand &MO : BI->operands()) {
if (MO.isReg() && MO.isInternalRead())
MO.setIsInternalRead(false);
}
- getLiteral(&*BI, Literals);
+ getLiteral(*BI, Literals);
ClauseContent.push_back(&*BI);
}
I = BI;
- DeleteMI->eraseFromParent();
+ DeleteMI.eraseFromParent();
} else {
- getLiteral(I, Literals);
- ClauseContent.push_back(I);
+ getLiteral(*I, Literals);
+ ClauseContent.push_back(&*I);
I++;
}
- for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
- unsigned literal0 = Literals[i];
- unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
- MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
- TII->get(AMDGPU::LITERALS))
- .addImm(literal0)
- .addImm(literal2);
+ for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
+ MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS));
+ if (Literals[i]->isImm()) {
+ MILit.addImm(Literals[i]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i]->getGlobal(),
+ Literals[i]->getOffset());
+ }
+ if (i + 1 < e) {
+ if (Literals[i + 1]->isImm()) {
+ MILit.addImm(Literals[i + 1]->getImm());
+ } else {
+ MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
+ Literals[i + 1]->getOffset());
+ }
+ } else
+ MILit.addImm(0);
ClauseContent.push_back(MILit);
}
}
assert(ClauseContent.size() < 128 && "ALU clause is too big");
- ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
- return ClauseFile(ClauseHead, std::move(ClauseContent));
+ ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
+ return ClauseFile(&ClauseHead, std::move(ClauseContent));
}
void
EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
unsigned &CfCount) {
- CounterPropagateAddr(Clause.first, CfCount);
+ CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
.addImm(CfCount);
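In MakeALUClause above, the collected literal operands are emitted two per LITERALS pseudo-instruction, with an odd trailing slot padded by an immediate 0; after this patch each slot may carry either a plain immediate or a global address. A hypothetical helper (not part of the patch) capturing the per-slot choice:

  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineOperand.h"

  // Appends one literal slot to a LITERALS instruction being built.
  static void addLiteralSlot(llvm::MachineInstrBuilder &MILit,
                             const llvm::MachineOperand *Lit) {
    if (Lit->isImm())
      MILit.addImm(Lit->getImm());
    else
      MILit.addGlobalAddress(Lit->getGlobal(), Lit->getOffset());
  }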
@@ -447,7 +467,7 @@ private:
EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
unsigned &CfCount) {
Clause.first->getOperand(0).setImm(0);
- CounterPropagateAddr(Clause.first, CfCount);
+ CounterPropagateAddr(*Clause.first, CfCount);
MachineBasicBlock *BB = Clause.first->getParent();
BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
.addImm(CfCount);
@@ -457,13 +477,13 @@ private:
CfCount += Clause.second.size();
}
- void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
- MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
+ void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
+ MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
}
void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
unsigned Addr) const {
for (MachineInstr *MI : MIs) {
- CounterPropagateAddr(MI, Addr);
+ CounterPropagateAddr(*MI, Addr);
}
}
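CounterPropagateAddr above fixes up a deferred clause header when the clause is finally emitted: operand 0 carries an address that is offset by the running CF instruction counter. The same fix-up as a standalone sketch:

  #include "llvm/CodeGen/MachineInstr.h"

  // Adds the current CF instruction count to the clause header's address
  // operand, mirroring CounterPropagateAddr.
  static void rebaseClauseAddr(llvm::MachineInstr &ClauseHeader,
                               unsigned CfCount) {
    llvm::MachineOperand &Addr = ClauseHeader.getOperand(0);
    Addr.setImm(Addr.getImm() + CfCount);
  }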
@@ -472,20 +492,21 @@ public:
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- ST = &MF.getSubtarget<AMDGPUSubtarget>();
+ ST = &MF.getSubtarget<R600Subtarget>();
MaxFetchInst = ST->getTexVTXClauseSize();
- TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
- TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
+
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- CFStack CFStack(ST, MFI->getShaderType());
+ CFStack CFStack(ST, MF.getFunction()->getCallingConv());
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
- if (MFI->getShaderType() == ShaderType::VERTEX) {
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
@@ -493,10 +514,10 @@ public:
std::vector<ClauseFile> FetchClauses, AluClauses;
std::vector<MachineInstr *> LastAlu(1);
std::vector<MachineInstr *> ToPopAfter;
-
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
- if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
+ if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
@@ -508,7 +529,7 @@ public:
if (MI->getOpcode() != AMDGPU::ENDIF)
LastAlu.back() = nullptr;
if (MI->getOpcode() == AMDGPU::CF_ALU)
- LastAlu.back() = MI;
+ LastAlu.back() = &*MI;
I++;
bool RequiresWorkAround =
CFStack.requiresWorkAroundForInst(MI->getOpcode());
@@ -571,7 +592,7 @@ public:
case AMDGPU::ELSE: {
MachineInstr * JumpInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
- CounterPropagateAddr(JumpInst, CfCount);
+ CounterPropagateAddr(*JumpInst, CfCount);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_ELSE))
.addImm(0)
@@ -595,10 +616,10 @@ public:
DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
CfCount++;
}
-
+
MachineInstr *IfOrElseInst = IfThenElseStack.back();
IfThenElseStack.pop_back();
- CounterPropagateAddr(IfOrElseInst, CfCount);
+ CounterPropagateAddr(*IfOrElseInst, CfCount);
IfOrElseInst->getOperand(1).setImm(1);
LastAlu.pop_back();
MI->eraseFromParent();
@@ -625,15 +646,16 @@ public:
case AMDGPU::RETURN: {
BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
CfCount++;
- MI->eraseFromParent();
if (CfCount % 2) {
BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
CfCount++;
}
+ MI->eraseFromParent();
for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
EmitFetchClause(I, FetchClauses[i], CfCount);
for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
EmitALUClause(I, AluClauses[i], CfCount);
+ break;
}
default:
if (TII->isExport(MI->getOpcode())) {
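In the AMDGPU::RETURN case above, the erase of the RETURN is deferred until after the even-count check, and an explicit break is added so control no longer falls through into the export handling under default:. A trivial sketch of the padding rule, assuming the only requirement is that the CF program end on an even instruction count:

  // After CF_END is emitted, one PAD keeps the program length even.
  static unsigned padProgramEnd(unsigned CfCount) {
    if (CfCount % 2)
      ++CfCount; // an AMDGPU::PAD instruction is emitted at this point
    return CfCount;
  }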
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 51d87eda31d1..534461adc59f 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H
-#define LLVM_LIB_TARGET_R600_R600DEFINES_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index fdc20302f4a3..93ed5be94a54 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -38,8 +38,8 @@ private:
const R600InstrInfo *TII;
int Address;
- unsigned OccupiedDwords(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ unsigned OccupiedDwords(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
@@ -53,17 +53,17 @@ private:
// These will be expanded to two ALU instructions in the
// ExpandSpecialInstructions pass.
- if (TII->isLDSRetInstr(MI->getOpcode()))
+ if (TII->isLDSRetInstr(MI.getOpcode()))
return 2;
- if(TII->isVector(*MI) ||
- TII->isCubeOp(MI->getOpcode()) ||
- TII->isReductionOp(MI->getOpcode()))
+ if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) ||
+ TII->isReductionOp(MI.getOpcode()))
return 4;
unsigned NumLiteral = 0;
- for (MachineInstr::mop_iterator It = MI->operands_begin(),
- E = MI->operands_end(); It != E; ++It) {
+ for (MachineInstr::mop_iterator It = MI.operands_begin(),
+ E = MI.operands_end();
+ It != E; ++It) {
MachineOperand &MO = *It;
if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
++NumLiteral;
@@ -71,12 +71,12 @@ private:
return 1 + NumLiteral;
}
- bool isALU(const MachineInstr *MI) const {
- if (TII->isALUInstr(MI->getOpcode()))
+ bool isALU(const MachineInstr &MI) const {
+ if (TII->isALUInstr(MI.getOpcode()))
return true;
- if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
+ if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::PRED_X:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
@@ -89,8 +89,8 @@ private:
}
}
- bool IsTrivialInst(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+ bool IsTrivialInst(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case AMDGPU::KILL:
case AMDGPU::RETURN:
case AMDGPU::IMPLICIT_DEF:
@@ -114,18 +114,20 @@ private:
((((Sel >> 2) - 512) & 4095) >> 5) << 1);
}
- bool SubstituteKCacheBank(MachineInstr *MI,
- std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
- bool UpdateInstr = true) const {
+ bool
+ SubstituteKCacheBank(MachineInstr &MI,
+ std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
+ bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned> > UsedKCache;
- if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
return true;
- const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
+ const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
TII->getSrcs(MI);
- assert((TII->isALUInstr(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
+ assert(
+ (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+ "Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
continue;
@@ -194,9 +196,9 @@ private:
// in the clause.
unsigned LastUseCount = 0;
for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
- AluInstCount += OccupiedDwords(UseI);
+ AluInstCount += OccupiedDwords(*UseI);
// Make sure we won't need to end the clause due to KCache limitations.
- if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+ if (!SubstituteKCacheBank(*UseI, KCacheBanks, false))
return false;
// We have reached the maximum instruction limit before finding the
@@ -230,9 +232,9 @@ private:
bool PushBeforeModifier = false;
unsigned AluInstCount = 0;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
- if (IsTrivialInst(I))
+ if (IsTrivialInst(*I))
continue;
- if (!isALU(I))
+ if (!isALU(*I))
break;
if (AluInstCount > TII->getMaxAlusPerClause())
break;
@@ -245,7 +247,7 @@ private:
// clause as predicated alus).
if (AluInstCount > 0)
break;
- if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
+ if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH)
PushBeforeModifier = true;
AluInstCount ++;
continue;
@@ -267,16 +269,16 @@ private:
if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
break;
- if (!SubstituteKCacheBank(I, KCacheBanks))
+ if (!SubstituteKCacheBank(*I, KCacheBanks))
break;
- AluInstCount += OccupiedDwords(I);
+ AluInstCount += OccupiedDwords(*I);
}
unsigned Opcode = PushBeforeModifier ?
AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
// We don't use the ADDR field until R600ControlFlowFinalizer pass, where
// it is safe to assume it is 0. However if we always put 0 here, the ifcvt
- // pass may assume that identical ALU clause starter at the beginning of a
+ // pass may assume that identical ALU clause starter at the beginning of a
// true and false branch can be factorized which is not the case.
.addImm(Address++) // ADDR
.addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
@@ -298,7 +300,8 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
@@ -307,7 +310,7 @@ public:
if (I->getOpcode() == AMDGPU::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
- if (isALU(I))
+ if (isALU(*I))
I = MakeALUClause(MBB, I);
else
++I;
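The runOnMachineFunction change above is the pattern adopted throughout this patch: fetch the concrete R600Subtarget once and rely on its covariant accessors instead of static_casts on the generic subtarget interfaces. A self-contained illustration (ExamplePass is a hypothetical name; the declarations are those already pulled in by these passes):

  bool ExamplePass::runOnMachineFunction(llvm::MachineFunction &MF) {
    const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
    const R600InstrInfo *TII = ST.getInstrInfo();       // covariant, no cast
    const R600RegisterInfo *TRI = ST.getRegisterInfo(); // covariant, no cast
    (void)TII; (void)TRI;
    return false;
  }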
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 211d392e8fcc..0385b6283f37 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
namespace {
class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
-
private:
static char ID;
const R600InstrInfo *TII;
@@ -61,12 +60,13 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
int OpIdx = TII->getOperandIdx(*OldMI, Op);
if (OpIdx > -1) {
uint64_t Val = OldMI->getOperand(OpIdx).getImm();
- TII->setImmOperand(NewMI, Op, Val);
+ TII->setImmOperand(*NewMI, Op, Val);
}
}
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
const R600RegisterInfo &TRI = TII->getRegisterInfo();
@@ -107,11 +107,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.getOperand(0).getReg(), // dst
MI.getOperand(1).getReg(), // src0
AMDGPU::ZERO); // src1
- TII->addFlag(PredSet, 0, MO_FLAG_MASK);
+ TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
if (Flags & MO_FLAG_PUSH) {
- TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
} else {
- TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
+ TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
}
MI.eraseFromParent();
continue;
@@ -137,9 +137,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan >= 2)
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -166,9 +166,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan < 2)
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -189,7 +189,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
}
MI.eraseFromParent();
@@ -212,10 +212,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
BMI->bundleWithPred();
}
if (Mask) {
- TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*BMI, 0, MO_FLAG_MASK);
}
if (Chan != 3)
- TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
unsigned Opcode = BMI->getOpcode();
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
@@ -330,10 +330,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (Chan != 0)
NewMI->bundleWithPred();
if (Mask) {
- TII->addFlag(NewMI, 0, MO_FLAG_MASK);
+ TII->addFlag(*NewMI, 0, MO_FLAG_MASK);
}
if (NotLast) {
- TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+ TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
}
SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
new file mode 100644
index 000000000000..dd5681ff5e8b
--- /dev/null
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -0,0 +1,15 @@
+//===----------------------- R600FrameLowering.cpp ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "R600FrameLowering.h"
+
+using namespace llvm;
+
+R600FrameLowering::~R600FrameLowering() {
+}
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
new file mode 100644
index 000000000000..5fe4e0d201ac
--- /dev/null
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -0,0 +1,30 @@
+//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class R600FrameLowering : public AMDGPUFrameLowering {
+public:
+ R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ virtual ~R600FrameLowering();
+
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {}
+};
+
+}
+
+#endif
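The new header only declares the class; the arguments fed to the AMDGPUFrameLowering base are not part of this patch. A hypothetical instantiation with placeholder direction/alignment/offset values, purely to show the constructor shape:

  #include "R600FrameLowering.h"

  // Placeholder arguments; the real values are chosen in the R600 subtarget
  // setup, which lies outside this hunk.
  R600FrameLowering FL(llvm::TargetFrameLowering::StackGrowsUp,
                       /*StackAl=*/16, /*LAO=*/0);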
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 124a9c6e0f56..8f78edd76a51 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -30,18 +30,61 @@
using namespace llvm;
-R600TargetLowering::R600TargetLowering(TargetMachine &TM,
- const AMDGPUSubtarget &STI)
+R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
+ const R600Subtarget &STI)
: AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
- addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
computeRegisterProperties(STI.getRegisterInfo());
+ // Legalize loads and stores to the private address space.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+ // spaces, so it is custom lowered to handle those where it isn't.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+ }
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+
+
+ setOperationAction(ISD::STORE, MVT::i8, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+
+ // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
+ setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
+
// Set condition code actions
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
@@ -73,10 +116,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
-
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -122,37 +161,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
-
- // Legalize loads and stores to the private address space.
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-
- // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
- // spaces, so it is custom lowered to handle those where it isn't.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
- }
-
- setOperationAction(ISD::STORE, MVT::i8, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setTruncStoreAction(MVT::i32, MVT::i8, Custom);
- setTruncStoreAction(MVT::i32, MVT::i16, Custom);
-
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
@@ -165,12 +173,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setTargetDAGCombine(ISD::FP_ROUND);
- setTargetDAGCombine(ISD::FP_TO_SINT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
-
// We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
// to be Legal/Custom in order to avoid library calls.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
@@ -188,119 +190,138 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
}
setSchedulingPreference(Sched::Source);
+
+
+ setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+}
+
+const R600Subtarget *R600TargetLowering::getSubtarget() const {
+ return static_cast<const R600Subtarget *>(Subtarget);
}
static inline bool isEOP(MachineBasicBlock::iterator I) {
return std::next(I)->getOpcode() == AMDGPU::RETURN;
}
-MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
+MachineBasicBlock *
+R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
MachineFunction * MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- MachineBasicBlock::iterator I = *MI;
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
+ MachineBasicBlock::iterator I = MI;
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
// Replace LDS_*_RET instruction that don't have any uses with the
// equivalent LDS_*_NORET instruction.
- if (TII->isLDSRetInstr(MI->getOpcode())) {
- int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ if (TII->isLDSRetInstr(MI.getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
// LDS_1A2D support and remove this special case.
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
- MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
+ if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
+ MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
- for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
- NewMI.addOperand(MI->getOperand(i));
+ TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
+ NewMI.addOperand(MI.getOperand(i));
}
} else {
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
break;
case AMDGPU::CLAMP_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
break;
}
case AMDGPU::FABS_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_ABS);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
break;
}
case AMDGPU::FNEG_R600: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
- AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg());
- TII->addFlag(NewMI, 0, MO_FLAG_NEG);
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
break;
}
case AMDGPU::MASK_WRITE: {
- unsigned maskedRegister = MI->getOperand(0).getReg();
+ unsigned maskedRegister = MI.getOperand(0).getReg();
assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
- TII->addFlag(defInstr, 0, MO_FLAG_MASK);
+ TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
break;
}
case AMDGPU::MOV_IMM_F32:
- TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
- MI->getOperand(1).getFPImm()->getValueAPF()
- .bitcastToAPInt().getZExtValue());
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
+ .getFPImm()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
break;
case AMDGPU::MOV_IMM_I32:
- TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
- MI->getOperand(1).getImm());
+ TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
+ MI.getOperand(1).getImm());
break;
+ case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+ //TODO: Perhaps combine this instruction with the next if possible
+ auto MIB = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
+ int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+ //TODO: Ugh this is rather ugly
+ MIB->getOperand(Idx) = MI.getOperand(1);
+ break;
+ }
case AMDGPU::CONST_COPY: {
- MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
- MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
- MI->getOperand(1).getImm());
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
+ TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+ MI.getOperand(1).getImm());
break;
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(isEOP(I)); // Set End of program bit
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
case AMDGPU::RAT_STORE_TYPED_eg: {
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(2))
- .addImm(isEOP(I)); // Set End of program bit
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
case AMDGPU::TXD: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI->getOperand(4);
- MachineOperand &SID = MI->getOperand(5);
- unsigned TextureId = MI->getOperand(6).getImm();
+ MachineOperand &RID = MI.getOperand(4);
+ MachineOperand &SID = MI.getOperand(5);
+ unsigned TextureId = MI.getOperand(6).getImm();
unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
@@ -333,75 +354,77 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
CTZ = 0;
break;
}
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
- .addOperand(MI->getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
- .addOperand(MI->getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
+ T0)
+ .addOperand(MI.getOperand(3))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
+ T1)
+ .addOperand(MI.getOperand(2))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
+ .addReg(T0, RegState::Implicit)
+ .addReg(T1, RegState::Implicit);
break;
}
case AMDGPU::TXD_SHADOW: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
- MachineOperand &RID = MI->getOperand(4);
- MachineOperand &SID = MI->getOperand(5);
- unsigned TextureId = MI->getOperand(6).getImm();
+ MachineOperand &RID = MI.getOperand(4);
+ MachineOperand &SID = MI.getOperand(5);
+ unsigned TextureId = MI.getOperand(6).getImm();
unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
@@ -435,99 +458,101 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
- .addOperand(MI->getOperand(3))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
- .addOperand(MI->getOperand(2))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
+ T0)
+ .addOperand(MI.getOperand(3))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
+ T1)
+ .addOperand(MI.getOperand(2))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(SrcX)
- .addImm(SrcY)
- .addImm(SrcZ)
- .addImm(SrcW)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(1)
- .addImm(2)
- .addImm(3)
- .addOperand(RID)
- .addOperand(SID)
- .addImm(CTX)
- .addImm(CTY)
- .addImm(CTZ)
- .addImm(CTW)
- .addReg(T0, RegState::Implicit)
- .addReg(T1, RegState::Implicit);
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
+ .addReg(T0, RegState::Implicit)
+ .addReg(T1, RegState::Implicit);
break;
}
case AMDGPU::BRANCH:
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
- .addOperand(MI->getOperand(0));
- break;
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ .addOperand(MI.getOperand(0));
+ break;
case AMDGPU::BRANCH_COND_f32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
- .addOperand(MI->getOperand(1))
- .addImm(OPCODE_IS_NOT_ZERO)
- .addImm(0); // Flags
- TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
+ .addImm(OPCODE_IS_NOT_ZERO)
+ .addImm(0); // Flags
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI->getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
case AMDGPU::BRANCH_COND_i32: {
MachineInstr *NewMI =
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
- AMDGPU::PREDICATE_BIT)
- .addOperand(MI->getOperand(1))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+ AMDGPU::PREDICATE_BIT)
+ .addOperand(MI.getOperand(1))
.addImm(OPCODE_IS_NOT_ZERO_INT)
.addImm(0); // Flags
- TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+ TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI->getOperand(0))
- .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+ .addOperand(MI.getOperand(0))
+ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -535,7 +560,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::R600_ExportSwz: {
// Instruction is left unmodified if its not the last one of its type
bool isLastInstructionOfItsType = true;
- unsigned InstExportType = MI->getOperand(1).getImm();
+ unsigned InstExportType = MI.getOperand(1).getImm();
for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
NextExportInst = std::next(NextExportInst)) {
@@ -552,17 +577,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
- unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6))
- .addImm(CfInst)
- .addImm(EOP);
+ unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3))
+ .addOperand(MI.getOperand(4))
+ .addOperand(MI.getOperand(5))
+ .addOperand(MI.getOperand(6))
+ .addImm(CfInst)
+ .addImm(EOP);
break;
}
case AMDGPU::RETURN: {
@@ -576,7 +601,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
@@ -610,18 +635,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+ case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::AMDGPU_store_output: {
- int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
- MFI->LiveOuts.push_back(Reg);
- return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
- }
- case AMDGPUIntrinsic::R600_store_swizzle: {
+ case AMDGPUIntrinsic::r600_store_swizzle: {
SDLoc DL(Op);
const SDValue Args[8] = {
Chain,
@@ -649,114 +669,48 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
SDLoc DL(Op);
switch(IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case AMDGPUIntrinsic::R600_load_input: {
- int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(Reg);
- return DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), Reg, VT);
- }
-
- case AMDGPUIntrinsic::R600_interp_input: {
- int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
- MachineSDNode *interp;
- if (ijb < 0) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
- interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
- MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
- return DAG.getTargetExtractSubreg(
- TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
- DL, MVT::f32, SDValue(interp, 0));
- }
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
- unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
- MRI.addLiveIn(RegisterI);
- MRI.addLiveIn(RegisterJ);
- SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
- SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
- SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
-
- if (slot % 4 < 2)
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- else
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- return SDValue(interp, slot % 2);
- }
- case AMDGPUIntrinsic::R600_interp_xy:
- case AMDGPUIntrinsic::R600_interp_zw: {
- int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- MachineSDNode *interp;
- SDValue RegisterINode = Op.getOperand(2);
- SDValue RegisterJNode = Op.getOperand(3);
-
- if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- else
- interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
- MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
- RegisterJNode, RegisterINode);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
- SDValue(interp, 0), SDValue(interp, 1));
- }
- case AMDGPUIntrinsic::R600_tex:
- case AMDGPUIntrinsic::R600_texc:
- case AMDGPUIntrinsic::R600_txl:
- case AMDGPUIntrinsic::R600_txlc:
- case AMDGPUIntrinsic::R600_txb:
- case AMDGPUIntrinsic::R600_txbc:
- case AMDGPUIntrinsic::R600_txf:
- case AMDGPUIntrinsic::R600_txq:
- case AMDGPUIntrinsic::R600_ddx:
- case AMDGPUIntrinsic::R600_ddy:
- case AMDGPUIntrinsic::R600_ldptr: {
+ case AMDGPUIntrinsic::r600_tex:
+ case AMDGPUIntrinsic::r600_texc:
+ case AMDGPUIntrinsic::r600_txl:
+ case AMDGPUIntrinsic::r600_txlc:
+ case AMDGPUIntrinsic::r600_txb:
+ case AMDGPUIntrinsic::r600_txbc:
+ case AMDGPUIntrinsic::r600_txf:
+ case AMDGPUIntrinsic::r600_txq:
+ case AMDGPUIntrinsic::r600_ddx:
+ case AMDGPUIntrinsic::r600_ddy: {
unsigned TextureOp;
switch (IntrinsicID) {
- case AMDGPUIntrinsic::R600_tex:
+ case AMDGPUIntrinsic::r600_tex:
TextureOp = 0;
break;
- case AMDGPUIntrinsic::R600_texc:
+ case AMDGPUIntrinsic::r600_texc:
TextureOp = 1;
break;
- case AMDGPUIntrinsic::R600_txl:
+ case AMDGPUIntrinsic::r600_txl:
TextureOp = 2;
break;
- case AMDGPUIntrinsic::R600_txlc:
+ case AMDGPUIntrinsic::r600_txlc:
TextureOp = 3;
break;
- case AMDGPUIntrinsic::R600_txb:
+ case AMDGPUIntrinsic::r600_txb:
TextureOp = 4;
break;
- case AMDGPUIntrinsic::R600_txbc:
+ case AMDGPUIntrinsic::r600_txbc:
TextureOp = 5;
break;
- case AMDGPUIntrinsic::R600_txf:
+ case AMDGPUIntrinsic::r600_txf:
TextureOp = 6;
break;
- case AMDGPUIntrinsic::R600_txq:
+ case AMDGPUIntrinsic::r600_txq:
TextureOp = 7;
break;
- case AMDGPUIntrinsic::R600_ddx:
+ case AMDGPUIntrinsic::r600_ddx:
TextureOp = 8;
break;
- case AMDGPUIntrinsic::R600_ddy:
+ case AMDGPUIntrinsic::r600_ddy:
TextureOp = 9;
break;
- case AMDGPUIntrinsic::R600_ldptr:
- TextureOp = 10;
- break;
default:
llvm_unreachable("Unknow Texture Operation");
}
@@ -784,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
};
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
- case AMDGPUIntrinsic::AMDGPU_dp4: {
+ case AMDGPUIntrinsic::r600_dot4: {
SDValue Args[8] = {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
DAG.getConstant(0, DL, MVT::i32)),
@@ -806,6 +760,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
}
+ case Intrinsic::r600_implicitarg_ptr: {
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return DAG.getConstant(ByteOffset, DL, PtrVT);
+ }
case Intrinsic::r600_read_ngroups_x:
return LowerImplicitParameter(DAG, VT, DL, 0);
case Intrinsic::r600_read_ngroups_y:
@@ -825,7 +784,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
- case Intrinsic::AMDGPU_read_workdim: {
+ case Intrinsic::r600_read_workdim:
+ case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
}
@@ -848,14 +808,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
- case Intrinsic::AMDGPU_rsq:
- // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_fract:
- case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+ case Intrinsic::r600_recipsqrt_ieee:
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::r600_recipsqrt_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
}
+
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
}
@@ -950,6 +910,22 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return vectorToVerticalVector(DAG, Insert);
}
+SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+ SDValue Op,
+ SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+ const DataLayout &DL = DAG.getDataLayout();
+ const GlobalValue *GV = GSD->getGlobal();
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+
+ SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
+}
+
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
// On hw >= R700, COS/SIN input must be between -1. and 1.
// Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
@@ -977,7 +953,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
DAG.getNode(ISD::FADD, DL, VT, FractPart,
DAG.getConstantFP(-0.5, DL, MVT::f32)));
- if (Gen >= AMDGPUSubtarget::R700)
+ if (Gen >= R600Subtarget::R700)
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
@@ -1088,7 +1064,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- SDLoc DL,
+ const SDLoc &DL,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
@@ -1099,8 +1075,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
- MachinePointerInfo(ConstantPointerNull::get(PtrType)),
- false, false, false, 0);
+ MachinePointerInfo(ConstantPointerNull::get(PtrType)));
}
bool R600TargetLowering::isZero(SDValue Op) const {
@@ -1113,6 +1088,20 @@ bool R600TargetLowering::isZero(SDValue Op) const {
}
}
+bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ return CFP->isExactlyValue(1.0);
+ }
+ return isAllOnesConstant(Op);
+}
+
+bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
+ if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ return CFP->getValueAPF().isZero();
+ }
+ return isNullConstant(Op);
+}
+
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
@@ -1311,19 +1300,73 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
}
}
+SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Store);
+
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+
+ SDValue Chain = Store->getChain();
+ SDValue BasePtr = Store->getBasePtr();
+ EVT MemVT = Store->getMemoryVT();
+
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ DAG.getConstant(0x3, DL, MVT::i32));
+
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, DL, MVT::i32));
+
+ SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+ Store->getValue());
+
+ SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(Mask, DL, MVT::i32),
+ ShiftAmt);
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, DL, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+ SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+ return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Value, Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+}
+
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
+ if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
+ return Result;
+
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Value = Op.getOperand(1);
- SDValue Ptr = Op.getOperand(2);
+ unsigned AS = StoreNode->getAddressSpace();
+ SDValue Value = StoreNode->getValue();
+ EVT ValueVT = Value.getValueType();
- SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Result.getNode()) {
- return Result;
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ ValueVT.isVector()) {
+ return SplitVectorStore(Op, DAG);
}
- if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+ SDLoc DL(Op);
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
+
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (StoreNode->isTruncatingStore()) {
EVT VT = Value.getValueType();
assert(VT.bitsLE(MVT::i32));
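lowerPrivateTruncStore, added in the hunk above, expands a sub-dword private store into a read-modify-write of the containing 32-bit register. The same arithmetic in plain C++ (no DAG nodes), for an i8 store at byte address A:

  #include <cstdint>

  // Dst is the current 32-bit register contents, V the value being stored.
  static uint32_t storeByteRMW(uint32_t Dst, uint32_t A, uint32_t V) {
    uint32_t ByteIdx  = A & 0x3;                 // byte index within the dword
    uint32_t ShiftAmt = ByteIdx << 3;            // bit offset = ByteIdx * 8
    uint32_t Masked   = (V & 0xffu) << ShiftAmt; // value moved into place
    uint32_t DstMask  = ~(0xffu << ShiftAmt);    // clear the target byte
    return (Dst & DstMask) | Masked;             // merged dword written back
  }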
@@ -1352,13 +1395,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(0, DL, MVT::i32),
Mask
};
- SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
+ SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
SDValue Args[3] = { Chain, Input, DWordAddr };
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
} else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
- Value.getValueType().bitsGE(MVT::i32)) {
+ ValueVT.bitsGE(MVT::i32)) {
// Convert pointer from byte address to dword address.
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
@@ -1373,21 +1416,16 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
}
- EVT ValueVT = Value.getValueType();
-
- if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
- }
- SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Ret.getNode()) {
- return Ret;
- }
- // Lowering for indirect addressing
+ EVT MemVT = StoreNode->getMemoryVT();
+ if (MemVT.bitsLT(MVT::i32))
+ return lowerPrivateTruncStore(StoreNode, DAG);
+ // Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1465,37 +1503,81 @@ ConstantAddressBlock(unsigned AddressSpace) {
}
}
-SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
-{
- EVT VT = Op.getValueType();
+SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
- LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Ptr = Op.getOperand(1);
- SDValue LoweredLoad;
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
+
+  // Pre-SI hardware, private address space: an extending load narrower than
+  // 32 bits is lowered as a sub-dword extract from a 32-bit register.
+
+ // Get Register holding the target.
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+ DAG.getConstant(2, DL, MVT::i32));
+ // Load the Register.
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(),
+ Ptr,
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ Op.getOperand(2));
+
+ // Get offset within the register.
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+ Load->getBasePtr(),
+ DAG.getConstant(0x3, DL, MVT::i32));
+
+ // Bit offset of target byte (byteIdx * 8).
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, DL, MVT::i32));
+
+ // Shift to the right.
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+ // Eliminate the upper bits by setting them to ...
+ EVT MemEltVT = MemVT.getScalarType();
+
+ // ... ones.
+ if (ExtType == ISD::SEXTLOAD) {
+ SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+ Load->getChain()
+ };
- if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
- return Ret;
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ // ... or zeros.
+ SDValue Ops[] = {
+ DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+ Load->getChain()
+ };
- // Lower loads constant address space global variable loads
- if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- isa<GlobalVariable>(GetUnderlyingObject(
- LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
+ return DAG.getMergeValues(Ops, DL);
+}
- SDValue Ptr = DAG.getZExtOrTrunc(
- LoadNode->getBasePtr(), DL,
- getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
- Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
- DAG.getConstant(2, DL, MVT::i32));
- return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
- LoadNode->getChain(), Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+ unsigned AS = LoadNode->getAddressSpace();
+ EVT MemVT = LoadNode->getMemoryVT();
+ ISD::LoadExtType ExtType = LoadNode->getExtensionType();
+
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
+ return lowerPrivateExtLoad(Op, DAG);
}
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Chain = LoadNode->getChain();
+ SDValue Ptr = LoadNode->getBasePtr();
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
- ScalarizeVectorLoad(Op, DAG),
+ scalarizeVectorLoad(LoadNode, DAG),
Chain
};
return DAG.getMergeValues(MergedValues, DL);
@@ -1526,8 +1608,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
NewVT = VT;
NumElements = VT.getVectorNumElements();
}
- Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
- makeArrayRef(Slots, NumElements));
+ Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
} else {
// A non-constant ptr can't be folded; keep it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
@@ -1550,6 +1631,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
return DAG.getMergeValues(MergedValues, DL);
}
+ SDValue LoweredLoad;
+
// For most operations returning SDValue() will result in the node being
// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
// need to manually expand loads that may be legal in some address spaces and
@@ -1560,12 +1643,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
EVT MemVT = LoadNode->getMemoryVT();
assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
- SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
- LoadNode->getPointerInfo(), MemVT,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- LoadNode->getAlignment());
+ SDValue NewLoad = DAG.getExtLoad(
+ ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
+ LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
DAG.getValueType(MemVT));
@@ -1579,8 +1659,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1590,6 +1669,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
EVT ElemVT = VT.getVectorElementType();
SDValue Loads[4];
+ assert(NumElemVT <= 4);
assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
"vector width in load");
@@ -1603,11 +1683,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
DAG.getTargetConstant(Channel, DL, MVT::i32),
Op.getOperand(2));
}
- for (unsigned i = NumElemVT; i < 4; ++i) {
- Loads[i] = DAG.getUNDEF(ElemVT);
- }
- EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
- LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
+ EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
+ LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
} else {
LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
Chain, Ptr,
@@ -1632,16 +1709,28 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Chain, Jump, Cond);
}
+SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+
+ FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
+
+ unsigned FrameIndex = FIN->getIndex();
+ unsigned IgnoredFrameReg;
+ unsigned Offset =
+ TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+ return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
+ Op.getValueType());
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
@@ -1664,7 +1753,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
MemVT = MemVT.getVectorElementType();
}
- if (MFI->getShaderType() != ShaderType::COMPUTE) {
+ if (AMDGPU::isShader(CallConv)) {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
@@ -1699,11 +1788,11 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned Offset = 36 + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
- SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(Offset, DL, MVT::i32),
- DAG.getUNDEF(MVT::i32),
- PtrInfo,
- MemVT, false, true, true, 4);
+ SDValue Arg = DAG.getLoad(
+ ISD::UNINDEXED, Ext, VT, DL, Chain,
+ DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+ MemVT, /* Alignment = */ 4,
+ MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
@@ -1719,6 +1808,26 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *IsFast) const {
+ if (IsFast)
+ *IsFast = false;
+
+ if (!VT.isSimple() || VT == MVT::Other)
+ return false;
+
+ if (VT.bitsLT(MVT::i32))
+ return false;
+
+ // TODO: This is a rough estimate.
+ if (IsFast)
+ *IsFast = true;
+
+ return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+}
+
static SDValue CompactSwizzlableVector(
SelectionDAG &DAG, SDValue VectorEntry,
DenseMap<unsigned, unsigned> &RemapSwizzle) {
@@ -1732,7 +1841,7 @@ static SDValue CompactSwizzlableVector(
};
for (unsigned i = 0; i < 4; i++) {
- if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ if (NewBldVec[i].isUndef())
// We mask the write here to teach later passes that the ith element of this
// vector is undef. Thus we can use it to reduce 128-bit register usage,
// break false dependencies and additionally make the assembly easier to read.
@@ -1747,7 +1856,7 @@ static SDValue CompactSwizzlableVector(
}
}
- if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ if (NewBldVec[i].isUndef())
continue;
for (unsigned j = 0; j < i; j++) {
if (NewBldVec[i] == NewBldVec[j]) {
@@ -1758,8 +1867,8 @@ static SDValue CompactSwizzlableVector(
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec);
+ return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+ NewBldVec);
}
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
@@ -1796,14 +1905,13 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec);
+ return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+ NewBldVec);
}
-
-SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
- SDValue Swz[4], SelectionDAG &DAG,
- SDLoc DL) const {
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+ SelectionDAG &DAG,
+ const SDLoc &DL) const {
assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
// Old -> New swizzle values
DenseMap<unsigned, unsigned> SwizzleRemap;
@@ -1886,7 +1994,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
// If the inserted element is an UNDEF, just use the input vector.
- if (InVal.getOpcode() == ISD::UNDEF)
+ if (InVal.isUndef())
return InVec;
EVT VT = InVec.getValueType();
@@ -1907,7 +2015,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
Ops.append(InVec.getNode()->op_begin(),
InVec.getNode()->op_end());
- } else if (InVec.getOpcode() == ISD::UNDEF) {
+ } else if (InVec.isUndef()) {
unsigned NElts = VT.getVectorNumElements();
Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
} else {
@@ -1927,7 +2035,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
// Return the new vector
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
// Extract_vec (Build_vector) generated by custom lowering
@@ -1953,8 +2061,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT_CC: {
// Try common optimizations
- SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
return Ret;
// fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
@@ -2053,13 +2160,14 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-static bool
-FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
- SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
+ SDValue &Src, SDValue &Neg, SDValue &Abs,
+ SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const {
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
if (!Src.isMachineOpcode())
return false;
+
switch (Src.getMachineOpcode()) {
case AMDGPU::FNEG_R600:
if (!Neg.getNode())
@@ -2127,6 +2235,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
return true;
}
+ case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+    // Check if the Imm slot is already used (same check as the MOV_IMM_* case below).
+ if (cast<ConstantSDNode>(Imm)->getZExtValue())
+ return false;
+ Imm = Src.getOperand(0);
+ Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+ return true;
case AMDGPU::MOV_IMM_I32:
case AMDGPU::MOV_IMM_F32: {
unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
@@ -2177,14 +2292,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
}
}
-
/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
if (!Node->isMachineOpcode())
return Node;
+
unsigned Opcode = Node->getMachineOpcode();
SDValue FakeOp;
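
The private-memory paths added above (lowerPrivateTruncStore and lowerPrivateExtLoad) both emulate sub-dword accesses with a read-modify-write of a 32-bit private register: the byte address is split into a dword index and a byte offset, and the value is masked and shifted into place. The following is a minimal standalone sketch of that arithmetic only; it is not part of the patch, and the helper names (storeSubDword, loadSubDwordSExt) are purely illustrative.

#include <cstdint>

// Sketch: the same SRL/AND/SHL/XOR/OR chain the DAG nodes build, written
// against a plain array of dwords standing in for private registers.
static void storeSubDword(uint32_t *regs, uint32_t byteAddr, uint32_t value,
                          unsigned bits /* 8 or 16 */) {
  uint32_t dword = byteAddr >> 2;          // DWORDADDR: byte -> dword index
  uint32_t shift = (byteAddr & 3) * 8;     // bit offset of the target byte
  uint32_t mask = (bits == 8) ? 0xffu : 0xffffu;
  uint32_t dst = regs[dword];              // REGISTER_LOAD of the old dword
  dst &= ~(mask << shift);                 // clear the destination field
  dst |= (value & mask) << shift;          // insert the truncated value
  regs[dword] = dst;                       // REGISTER_STORE of the new dword
}

// Sign-extending counterpart of lowerPrivateExtLoad: shift the byte down,
// then sign-extend from the memory type (SIGN_EXTEND_INREG).
static int32_t loadSubDwordSExt(const uint32_t *regs, uint32_t byteAddr,
                                unsigned bits) {
  uint32_t word = regs[byteAddr >> 2] >> ((byteAddr & 3) * 8);
  unsigned sh = 32 - bits;
  return (int32_t)(word << sh) >> sh;      // replicate the sign bit
}
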
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 4dbac97af2a1..2fb6ee25caa9 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -12,55 +12,69 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
#include "AMDGPUISelLowering.h"
namespace llvm {
class R600InstrInfo;
+class R600Subtarget;
-class R600TargetLowering : public AMDGPUTargetLowering {
+class R600TargetLowering final : public AMDGPUTargetLowering {
public:
- R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
- MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock * BB) const override;
+ R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
+
+ const R600Subtarget *getSubtarget() const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const override;
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *IsFast) const override;
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
/// first nine dwords of a Vertex Buffer. These implicit parameters are
/// lowered to load instructions which retrieve the values from the Vertex
/// Buffer.
- SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- SDLoc DL, unsigned DwordOffset) const;
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL,
+ unsigned DwordOffset) const;
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
- SDLoc DL) const;
+ const SDLoc &DL) const;
SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
+ SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+ SelectionDAG &DAG) const override;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
@@ -74,6 +88,13 @@ private:
void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
unsigned &Channel, unsigned &PtrIncr) const;
bool isZero(SDValue Op) const;
+ bool isHWTrueValue(SDValue Op) const;
+ bool isHWFalseValue(SDValue Op) const;
+
+ bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+ SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const;
+
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
};
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 8b6eea17130b..1c5f7ec1b6ef 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -28,26 +28,17 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
-R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUInstrInfo(st), RI() {}
-
-const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
- return RI;
-}
-
-bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
-}
+R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
}
-void
-R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned VectorComponents = 0;
if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
@@ -91,10 +82,9 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
}
bool R600InstrInfo::isMov(unsigned Opcode) const {
-
-
switch(Opcode) {
- default: return false;
+ default:
+ return false;
case AMDGPU::MOV:
case AMDGPU::MOV_IMM_F32:
case AMDGPU::MOV_IMM_I32:
@@ -102,17 +92,6 @@ bool R600InstrInfo::isMov(unsigned Opcode) const {
}
}
-// Some instructions act as place holders to emulate operations that the GPU
-// hardware does automatically. This function can be used to check if
-// an opcode falls into this category.
-bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
- switch (Opcode) {
- default: return false;
- case AMDGPU::RETURN:
- return true;
- }
-}
-
bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
return false;
}
@@ -150,20 +129,16 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
(TargetFlags & R600_InstFlag::LDS_1A2D));
}
-bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
- return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
-}
-
bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
}
-bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
- if (isALUInstr(MI->getOpcode()))
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
+ if (isALUInstr(MI.getOpcode()))
return true;
- if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+ if (isVector(MI) || isCubeOp(MI.getOpcode()))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::PRED_X:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
@@ -182,16 +157,16 @@ bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
}
-bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
- return isTransOnly(MI->getOpcode());
+bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
+ return isTransOnly(MI.getOpcode());
}
bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
}
-bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
- return isVectorOnly(MI->getOpcode());
+bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
+ return isVectorOnly(MI.getOpcode());
}
bool R600InstrInfo::isExport(unsigned Opcode) const {
@@ -202,23 +177,21 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
return ST.hasVertexCache() && IS_VTX(get(Opcode));
}
-bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
- const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
- return MFI->getShaderType() != ShaderType::COMPUTE &&
- usesVertexCache(MI->getOpcode());
+bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ usesVertexCache(MI.getOpcode());
}
bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
}
-bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
- const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
- return (MFI->getShaderType() == ShaderType::COMPUTE &&
- usesVertexCache(MI->getOpcode())) ||
- usesTextureCache(MI->getOpcode());
+bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+ usesVertexCache(MI.getOpcode())) ||
+ usesTextureCache(MI.getOpcode());
}
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -231,20 +204,21 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
}
-bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
- return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
+ return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
}
-bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
- return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
+ return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
}
-bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
- if (!isALUInstr(MI->getOpcode())) {
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
+ if (!isALUInstr(MI.getOpcode())) {
return false;
}
- for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
- E = MI->operands_end(); I != E; ++I) {
+ for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+ E = MI.operands_end();
+ I != E; ++I) {
if (!I->isReg() || !I->isUse() ||
TargetRegisterInfo::isVirtualRegister(I->getReg()))
continue;
@@ -255,17 +229,6 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
return false;
}
-int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
- static const unsigned OpTable[] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- AMDGPU::OpName::src2
- };
-
- assert (SrcNum < 3);
- return getOperandIdx(Opcode, OpTable[SrcNum]);
-}
-
int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
static const unsigned SrcSelTable[][2] = {
{AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
@@ -290,10 +253,10 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
}
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
-R600InstrInfo::getSrcs(MachineInstr *MI) const {
+R600InstrInfo::getSrcs(MachineInstr &MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI->getOpcode() == AMDGPU::DOT_4) {
+ if (MI.getOpcode() == AMDGPU::DOT_4) {
static const unsigned OpTable[8][2] = {
{AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
{AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
@@ -306,13 +269,13 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
};
for (unsigned j = 0; j < 8; j++) {
- MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- OpTable[j][0]));
+ MachineOperand &MO =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
unsigned Reg = MO.getReg();
if (Reg == AMDGPU::ALU_CONST) {
- unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(),
- OpTable[j][1])).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ MachineOperand &Sel =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
@@ -327,30 +290,33 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
};
for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+ int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]);
if (SrcIdx < 0)
break;
- MachineOperand &MO = MI->getOperand(SrcIdx);
- unsigned Reg = MI->getOperand(SrcIdx).getReg();
+ MachineOperand &MO = MI.getOperand(SrcIdx);
+ unsigned Reg = MO.getReg();
if (Reg == AMDGPU::ALU_CONST) {
- unsigned Sel = MI->getOperand(
- getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ MachineOperand &Sel =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+ Result.push_back(std::make_pair(&MO, Sel.getImm()));
continue;
}
if (Reg == AMDGPU::ALU_LITERAL_X) {
- unsigned Imm = MI->getOperand(
- getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm();
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm));
- continue;
+ MachineOperand &Operand =
+ MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+ if (Operand.isImm()) {
+ Result.push_back(std::make_pair(&MO, Operand.getImm()));
+ continue;
+ }
+ assert(Operand.isGlobal());
}
- Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0));
+ Result.push_back(std::make_pair(&MO, 0));
}
return Result;
}
-std::vector<std::pair<int, unsigned> >
-R600InstrInfo::ExtractSrcs(MachineInstr *MI,
+std::vector<std::pair<int, unsigned>>
+R600InstrInfo::ExtractSrcs(MachineInstr &MI,
const DenseMap<unsigned, unsigned> &PV,
unsigned &ConstCount) const {
ConstCount = 0;
@@ -360,13 +326,13 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
unsigned i = 0;
for (unsigned n = Srcs.size(); i < n; ++i) {
unsigned Reg = Srcs[i].first->getReg();
- unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+ int Index = RI.getEncodingValue(Reg) & 0xff;
if (Reg == AMDGPU::OQAP) {
- Result.push_back(std::pair<int, unsigned>(Index, 0));
+ Result.push_back(std::make_pair(Index, 0U));
}
if (PV.find(Reg) != PV.end()) {
// 255 is used to tell later passes it is a PS/PV reg
- Result.push_back(std::pair<int, unsigned>(255, 0));
+ Result.push_back(std::make_pair(255, 0U));
continue;
}
if (Index > 127) {
@@ -375,7 +341,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
continue;
}
unsigned Chan = RI.getHWRegChan(Reg);
- Result.push_back(std::pair<int, unsigned>(Index, Chan));
+ Result.push_back(std::make_pair(Index, Chan));
}
for (; i < 3; ++i)
Result.push_back(DummyPair);
@@ -411,8 +377,7 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
return Src;
}
-static unsigned
-getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
switch (Swz) {
case R600InstrInfo::ALU_VEC_012_SCL_210: {
unsigned Cycles[3] = { 2, 1, 0};
@@ -432,7 +397,6 @@ getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
}
default:
llvm_unreachable("Wrong Swizzle for Trans Slot");
- return 0;
}
}
@@ -557,7 +521,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
- IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
+ IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
unsigned Op = getOperandIdx(IG[i]->getOpcode(),
AMDGPU::OpName::bank_swizzle);
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
@@ -624,14 +588,13 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
std::vector<unsigned> Consts;
SmallSet<int64_t, 4> Literals;
for (unsigned i = 0, n = MIs.size(); i < n; i++) {
- MachineInstr *MI = MIs[i];
- if (!isALUInstr(MI->getOpcode()))
+ MachineInstr &MI = *MIs[i];
+ if (!isALUInstr(MI.getOpcode()))
continue;
ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI);
- for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
- std::pair<MachineOperand *, unsigned> Src = Srcs[j];
+    for (const auto &Src : Srcs) {
if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
Literals.insert(Src.second);
if (Literals.size() > 4)
@@ -652,7 +615,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
DFAPacketizer *
R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
const InstrItineraryData *II = STI.getInstrItineraryData();
- return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II);
+ return static_cast<const R600Subtarget &>(STI).createDFAPacketizer(II);
}
static bool
@@ -670,9 +633,9 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
while (I != MBB.begin()) {
--I;
- MachineInstr *MI = I;
- if (isPredicateSetter(MI->getOpcode()))
- return MI;
+ MachineInstr &MI = *I;
+ if (isPredicateSetter(MI.getOpcode()))
+ return &MI;
}
return nullptr;
@@ -688,12 +651,11 @@ static bool isBranch(unsigned Opcode) {
Opcode == AMDGPU::BRANCH_COND_f32;
}
-bool
-R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// Most of the following comes from the ARM implementation of AnalyzeBranch
// If the block has no terminators, it just falls into the block after it.
@@ -716,21 +678,21 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
I->removeFromParent();
I = PriorI;
}
- MachineInstr *LastInst = I;
+ MachineInstr &LastInst = *I;
// If there is only one terminator instruction, process it.
- unsigned LastOpc = LastInst->getOpcode();
+ unsigned LastOpc = LastInst.getOpcode();
if (I == MBB.begin() ||
!isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
if (LastOpc == AMDGPU::JUMP) {
- TBB = LastInst->getOperand(0).getMBB();
+ TBB = LastInst.getOperand(0).getMBB();
return false;
} else if (LastOpc == AMDGPU::JUMP_COND) {
- MachineInstr *predSet = I;
+ auto predSet = I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
}
- TBB = LastInst->getOperand(0).getMBB();
+ TBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
@@ -740,17 +702,17 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
- unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ MachineInstr &SecondLastInst = *I;
+ unsigned SecondLastOpc = SecondLastInst.getOpcode();
// If the block ends with a B and a Bcc, handle it.
if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
- MachineInstr *predSet = --I;
+ auto predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
}
- TBB = SecondLastInst->getOperand(0).getMBB();
- FBB = LastInst->getOperand(0).getMBB();
+ TBB = SecondLastInst.getOperand(0).getMBB();
+ FBB = LastInst.getOperand(0).getMBB();
Cond.push_back(predSet->getOperand(1));
Cond.push_back(predSet->getOperand(2));
Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
@@ -772,12 +734,11 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
return MBB.end();
}
-unsigned
-R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
if (!FBB) {
@@ -787,7 +748,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
assert(PredSet && "No previous predicate !");
- addFlag(PredSet, 0, MO_FLAG_PUSH);
+ addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
@@ -803,7 +764,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
assert(PredSet && "No previous predicate !");
- addFlag(PredSet, 0, MO_FLAG_PUSH);
+ addFlag(*PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
.addMBB(TBB)
@@ -835,7 +796,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 0;
case AMDGPU::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
+ clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
@@ -860,7 +821,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 1;
case AMDGPU::JUMP_COND: {
MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
+ clearFlag(*predSet, 0, MO_FLAG_PUSH);
I->eraseFromParent();
MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
if (CfAlu == MBB.end())
@@ -876,13 +837,12 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-bool
-R600InstrInfo::isPredicated(const MachineInstr *MI) const {
- int idx = MI->findFirstPredOperandIdx();
+bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
+ int idx = MI.findFirstPredOperandIdx();
if (idx < 0)
return false;
- unsigned Reg = MI->getOperand(idx).getReg();
+ unsigned Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
case AMDGPU::PRED_SEL_ONE:
@@ -892,25 +852,22 @@ R600InstrInfo::isPredicated(const MachineInstr *MI) const {
}
}
-bool
-R600InstrInfo::isPredicable(MachineInstr *MI) const {
+bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
// XXX: KILL* instructions can be predicated, but they must be the last
// instruction in a clause, so this means any instructions after them cannot
// be predicated. Until we have proper support for instruction clauses in the
// backend, we will mark KILL* instructions as unpredicable.
- if (MI->getOpcode() == AMDGPU::KILLGT) {
+ if (MI.getOpcode() == AMDGPU::KILLGT) {
return false;
- } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
+ } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
// If the clause starts in the middle of the MBB then the MBB has more
// than a single clause, and we are unable to predicate several clauses.
- if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
+ if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
return false;
// TODO: We don't support KC merging atm
- if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
- return false;
- return true;
- } else if (isVector(*MI)) {
+ return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
+ } else if (isVector(MI)) {
return false;
} else {
return AMDGPUInstrInfo::isPredicable(MI);
@@ -986,48 +943,39 @@ R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
return false;
}
-bool
-R600InstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- return isPredicateSetter(MI->getOpcode());
+bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const {
+ return isPredicateSetter(MI.getOpcode());
}
-bool
-R600InstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const {
- return false;
-}
-
-
-bool
-R600InstrInfo::PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Pred) const {
- int PIdx = MI->findFirstPredOperandIdx();
+bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const {
+ int PIdx = MI.findFirstPredOperandIdx();
- if (MI->getOpcode() == AMDGPU::CF_ALU) {
- MI->getOperand(8).setImm(0);
+ if (MI.getOpcode() == AMDGPU::CF_ALU) {
+ MI.getOperand(8).setImm(0);
return true;
}
- if (MI->getOpcode() == AMDGPU::DOT_4) {
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+ if (MI.getOpcode() == AMDGPU::DOT_4) {
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
.setReg(Pred[2].getReg());
- MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+ MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
return true;
}
if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
+ MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
return true;
}
@@ -1035,45 +983,94 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
return false;
}
-unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const {
return 2;
}
unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &,
unsigned *PredCost) const {
if (PredCost)
*PredCost = 2;
return 2;
}
-bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+ unsigned Channel) const {
+ assert(Channel == 0);
+ return RegIndex;
+}
- switch(MI->getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: {
+ MachineBasicBlock *MBB = MI.getParent();
+ int OffsetOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+ // addr is a custom operand with multiple MI operands, and only the
+ // first MI operand is given a name.
+ int RegOpIdx = OffsetOpIdx + 1;
+ int ChanOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
+ if (isRegisterLoad(MI)) {
+ int DstOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
+ getIndirectAddrRegClass()->getRegister(Address));
+ } else {
+ buildIndirectRead(MBB, MI, MI.getOperand(DstOpIdx).getReg(), Address,
+ OffsetReg);
+ }
+ } else if (isRegisterStore(MI)) {
+ int ValOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+ unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+ MI.getOperand(ValOpIdx).getReg());
+ } else {
+ buildIndirectWrite(MBB, MI, MI.getOperand(ValOpIdx).getReg(),
+ calculateIndirectAddress(RegIndex, Channel),
+ OffsetReg);
+ }
+ } else {
+ return false;
+ }
+
+ MBB->erase(MI);
+ return true;
+ }
case AMDGPU::R600_EXTRACT_ELT_V2:
case AMDGPU::R600_EXTRACT_ELT_V4:
- buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
- RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
- MI->getOperand(2).getReg(),
- RI.getHWRegChan(MI->getOperand(1).getReg()));
+ buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
+ RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
+ MI.getOperand(2).getReg(),
+ RI.getHWRegChan(MI.getOperand(1).getReg()));
break;
case AMDGPU::R600_INSERT_ELT_V2:
case AMDGPU::R600_INSERT_ELT_V4:
- buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
- RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
- MI->getOperand(3).getReg(), // Offset
- RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel
+ buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
+ RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address
+ MI.getOperand(3).getReg(), // Offset
+ RI.getHWRegChan(MI.getOperand(1).getReg())); // Channel
break;
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return true;
}
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- MF.getSubtarget().getFrameLowering());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600FrameLowering *TFL = ST.getFrameLowering();
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
@@ -1091,13 +1088,6 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
}
}
-unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const {
- // XXX: Remove when we support a stack width > 2
- assert(Channel == 0);
- return RegIndex;
-}
-
const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::R600_TReg32_XRegClass;
}
@@ -1124,13 +1114,13 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
}
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X, OffsetReg);
- setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+ setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
AddrReg, ValueReg)
.addReg(AMDGPU::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1);
+ setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
return Mov;
}
@@ -1157,17 +1147,74 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X,
OffsetReg);
- setImmOperand(MOVA, AMDGPU::OpName::write, 0);
+ setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
ValueReg,
AddrReg)
.addReg(AMDGPU::AR_X,
RegState::Implicit | RegState::Kill);
- setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1);
+ setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
return Mov;
}
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = -1;
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ if (MRI.livein_empty()) {
+ return 0;
+ }
+
+ const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+ LE = MRI.livein_end();
+ LI != LE; ++LI) {
+ unsigned Reg = LI->first;
+ if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+ !IndirectRC->contains(Reg))
+ continue;
+
+ unsigned RegIndex;
+ unsigned RegEnd;
+ for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+ ++RegIndex) {
+ if (IndirectRC->getRegister(RegIndex) == Reg)
+ break;
+ }
+ Offset = std::max(Offset, (int)RegIndex);
+ }
+
+ return Offset + 1;
+}
+
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+ int Offset = 0;
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable-sized objects are not supported.
+ if (MFI->hasVarSizedObjects()) {
+ return -1;
+ }
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600FrameLowering *TFL = ST.getFrameLowering();
+
+ unsigned IgnoredFrameReg;
+ Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
+
+ return getIndirectIndexBegin(MF) + Offset;
+}
+
unsigned R600InstrInfo::getMaxAlusPerClause() const {
return 115;
}
@@ -1256,7 +1303,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
const {
assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
unsigned Opcode;
- if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ if (ST.getGeneration() <= R600Subtarget::R700)
Opcode = AMDGPU::DOT4_r600;
else
Opcode = AMDGPU::DOT4_eg;
@@ -1293,7 +1340,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineOperand &MO = MI->getOperand(
getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
assert (MO.isImm());
- setImmOperand(MIB, Operands[i], MO.getImm());
+ setImmOperand(*MIB, Operands[i], MO.getImm());
}
MIB->getOperand(20).setImm(0);
return MIB;
@@ -1305,7 +1352,7 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
uint64_t Imm) const {
MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
AMDGPU::ALU_LITERAL_X);
- setImmOperand(MovImm, AMDGPU::OpName::literal, Imm);
+ setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
return MovImm;
}
@@ -1323,25 +1370,21 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
return AMDGPU::getNamedOperandIdx(Opcode, Op);
}
-void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op,
+void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
int64_t Imm) const {
- int Idx = getOperandIdx(*MI, Op);
+ int Idx = getOperandIdx(MI, Op);
assert(Idx != -1 && "Operand not supported for this instruction.");
- assert(MI->getOperand(Idx).isImm());
- MI->getOperand(Idx).setImm(Imm);
+ assert(MI.getOperand(Idx).isImm());
+ MI.getOperand(Idx).setImm(Imm);
}
//===----------------------------------------------------------------------===//
// Instruction flag getters/setters
//===----------------------------------------------------------------------===//
-bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
- return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
-}
-
-MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
int FlagIndex = 0;
if (Flag != 0) {
// If we pass something other than the default value of Flag to this
@@ -1351,20 +1394,26 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
switch (Flag) {
case MO_FLAG_CLAMP:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
break;
case MO_FLAG_MASK:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
break;
case MO_FLAG_NOT_LAST:
case MO_FLAG_LAST:
- FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last);
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
break;
case MO_FLAG_NEG:
switch (SrcIdx) {
- case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break;
- case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break;
- case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break;
+ case 0:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+ break;
+ case 1:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+ break;
+ case 2:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+ break;
}
break;
@@ -1373,8 +1422,12 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
"instructions.");
(void)IsOP3;
switch (SrcIdx) {
- case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break;
- case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break;
+ case 0:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+ break;
+ case 1:
+ FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+ break;
}
break;
@@ -1389,14 +1442,14 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
"Instruction flags not supported for this instruction");
}
- MachineOperand &FlagOp = MI->getOperand(FlagIndex);
+ MachineOperand &FlagOp = MI.getOperand(FlagIndex);
assert(FlagOp.isImm());
return FlagOp;
}
-void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
+void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
if (Flag == 0) {
return;
}
@@ -1415,9 +1468,9 @@ void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
}
}
-void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
+void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
unsigned Flag) const {
- unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+ unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
if (HAS_NATIVE_OPERANDS(TargetFlags)) {
MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
FlagOp.setImm(0);
@@ -1428,3 +1481,11 @@ void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
FlagOp.setImm(InstFlags);
}
}
+
+bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
+}
+
+bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
+}
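
getIndirectIndexBegin/getIndirectIndexEnd, moved into R600InstrInfo above, bound the range of indirect-address registers a function may use: the begin index is one past the highest live-in register that belongs to the indirect register class. The snippet below models that scan with plain containers; it is a sketch only, and the names (indirectIndexBegin, liveIns, indirectClassRegs) are illustrative, not LLVM API.

#include <algorithm>
#include <vector>

// Returns -1 when there are no frame objects, 0 when there are no live-ins,
// otherwise one past the largest class index occupied by a live-in register.
static int indirectIndexBegin(const std::vector<unsigned> &liveIns,
                              const std::vector<unsigned> &indirectClassRegs,
                              unsigned numFrameObjects) {
  if (numFrameObjects == 0)
    return -1;
  if (liveIns.empty())
    return 0;
  int maxIndex = -1;
  for (unsigned reg : liveIns) {
    auto it = std::find(indirectClassRegs.begin(), indirectClassRegs.end(), reg);
    if (it == indirectClassRegs.end())
      continue;                              // not an indirect-class register
    maxIndex = std::max(maxIndex,
                        static_cast<int>(it - indirectClassRegs.begin()));
  }
  return maxIndex + 1;
}
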
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index e7251c31107b..feaca98def44 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -12,30 +12,28 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H
-#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
#include "AMDGPUInstrInfo.h"
-#include "R600Defines.h"
#include "R600RegisterInfo.h"
-#include <map>
namespace llvm {
-
- class AMDGPUTargetMachine;
- class DFAPacketizer;
- class ScheduleDAG;
- class MachineFunction;
- class MachineInstr;
- class MachineInstrBuilder;
-
- class R600InstrInfo : public AMDGPUInstrInfo {
- private:
+class AMDGPUTargetMachine;
+class DFAPacketizer;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+class R600Subtarget;
+
+class R600InstrInfo final : public AMDGPUInstrInfo {
+private:
const R600RegisterInfo RI;
+ const R600Subtarget &ST;
- std::vector<std::pair<int, unsigned> >
- ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
-
+ std::vector<std::pair<int, unsigned>>
+ ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV,
+ unsigned &ConstCount) const;
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
@@ -44,11 +42,11 @@ namespace llvm {
unsigned AddrChan) const;
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg,
- unsigned AddrChan) const;
- public:
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+public:
enum BankSwizzle {
ALU_VEC_012_SCL_210 = 0,
ALU_VEC_021_SCL_122,
@@ -58,18 +56,18 @@ namespace llvm {
ALU_VEC_210
};
- explicit R600InstrInfo(const AMDGPUSubtarget &st);
+ explicit R600InstrInfo(const R600Subtarget &);
- const R600RegisterInfo &getRegisterInfo() const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ const R600RegisterInfo &getRegisterInfo() const {
+ return RI;
+ }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
- bool isTrig(const MachineInstr &MI) const;
- bool isPlaceHolderOpcode(unsigned opcode) const;
bool isReductionOp(unsigned opcode) const;
bool isCubeOp(unsigned opcode) const;
@@ -77,32 +75,28 @@ namespace llvm {
bool isALUInstr(unsigned Opcode) const;
bool hasInstrModifiers(unsigned Opcode) const;
bool isLDSInstr(unsigned Opcode) const;
- bool isLDSNoRetInstr(unsigned Opcode) const;
bool isLDSRetInstr(unsigned Opcode) const;
/// \returns true if this \p Opcode represents an ALU instruction or an
/// instruction that will be lowered in ExpandSpecialInstrs Pass.
- bool canBeConsideredALU(const MachineInstr *MI) const;
+ bool canBeConsideredALU(const MachineInstr &MI) const;
bool isTransOnly(unsigned Opcode) const;
- bool isTransOnly(const MachineInstr *MI) const;
+ bool isTransOnly(const MachineInstr &MI) const;
bool isVectorOnly(unsigned Opcode) const;
- bool isVectorOnly(const MachineInstr *MI) const;
+ bool isVectorOnly(const MachineInstr &MI) const;
bool isExport(unsigned Opcode) const;
bool usesVertexCache(unsigned Opcode) const;
- bool usesVertexCache(const MachineInstr *MI) const;
+ bool usesVertexCache(const MachineInstr &MI) const;
bool usesTextureCache(unsigned Opcode) const;
- bool usesTextureCache(const MachineInstr *MI) const;
+ bool usesTextureCache(const MachineInstr &MI) const;
bool mustBeLastInClause(unsigned Opcode) const;
- bool usesAddressRegister(MachineInstr *MI) const;
- bool definesAddressRegister(MachineInstr *MI) const;
- bool readsLDSSrcReg(const MachineInstr *MI) const;
+ bool usesAddressRegister(MachineInstr &MI) const;
+ bool definesAddressRegister(MachineInstr &MI) const;
+ bool readsLDSSrcReg(const MachineInstr &MI) const;
- /// \returns The operand index for the given source number. Legal values
- /// for SrcNum are 0, 1, and 2.
- int getSrcIdx(unsigned Opcode, unsigned SrcNum) const;
/// \returns The operand Index for the Sel operand given an index to one
/// of the instruction's src operands.
int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
@@ -113,7 +107,7 @@ namespace llvm {
/// If register is ALU_LITERAL, second member is IMM.
/// Otherwise, second member value is undefined.
SmallVector<std::pair<MachineOperand *, int64_t>, 3>
- getSrcs(MachineInstr *MI) const;
+ getSrcs(MachineInstr &MI) const;
unsigned isLegalUpTo(
const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
@@ -152,89 +146,107 @@ namespace llvm {
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
- bool isMov(unsigned Opcode) const override;
+ bool isMov(unsigned Opcode) const;
DFAPacketizer *
CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
- bool
- isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
- BranchProbability Probability) const override;
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override ;
- bool
- isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumTCycles, unsigned ExtraTCycles,
- MachineBasicBlock &FMBB,
- unsigned NumFCycles, unsigned ExtraFCycles,
- BranchProbability Probability) const override;
-
- bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles,
+ MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles,
+ BranchProbability Probability) const override;
- bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const override;
+ bool DefinesPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred) const override;
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
- MachineBasicBlock &FMBB) const override;
+ MachineBasicBlock &FMBB) const override;
- bool PredicateInstruction(MachineInstr *MI,
+ bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
- unsigned int getPredicationCost(const MachineInstr *) const override;
+ unsigned int getPredicationCost(const MachineInstr &) const override;
unsigned int getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
- int getInstrLatency(const InstrItineraryData *ItinData,
- SDNode *Node) const override { return 1;}
-
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
/// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
- unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const override;
+ /// Calculate the "Indirect Address" for the given \p RegIndex and
+ /// \p Channel
+ ///
+ /// We model indirect addressing using a virtual address space that can be
+ /// accessed with loads and stores. The "Indirect Address" is the memory
+ /// address in this virtual address space that maps to the given \p RegIndex
+ /// and \p Channel.
+ unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const;
+
+
+ /// \returns The register class to be used for loading and storing values
+ /// from an "Indirect Address" .
+ const TargetRegisterClass *getIndirectAddrRegClass() const;
+
+ /// \returns the smallest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexBegin(const MachineFunction &MF) const;
- const TargetRegisterClass *getIndirectAddrRegClass() const override;
+ /// \returns the largest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ int getIndirectIndexEnd(const MachineFunction &MF) const;
+ /// \brief Build instruction(s) for an indirect register write.
+ ///
+ /// \returns The instruction that performs the indirect register write
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const override;
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const;
+ /// \brief Build instruction(s) for an indirect register read.
+ ///
+ /// \returns The instruction that performs the indirect register read
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const override;
+ unsigned OffsetReg) const;
unsigned getMaxAlusPerClause() const;
- ///buildDefaultInstruction - This function returns a MachineInstr with
- /// all the instruction modifiers initialized to their default values.
- /// You can use this function to avoid manually specifying each instruction
- /// modifier operand when building a new instruction.
+ /// buildDefaultInstruction - This function returns a MachineInstr with all
+ /// the instruction modifiers initialized to their default values. You can
+ /// use this function to avoid manually specifying each instruction modifier
+ /// operand when building a new instruction.
///
/// \returns a MachineInstr with all the instruction modifiers initialized
/// to their default values.
@@ -251,13 +263,13 @@ namespace llvm {
unsigned DstReg) const;
MachineInstr *buildMovImm(MachineBasicBlock &BB,
- MachineBasicBlock::iterator I,
- unsigned DstReg,
- uint64_t Imm) const;
+ MachineBasicBlock::iterator I,
+ unsigned DstReg,
+ uint64_t Imm) const;
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const override;
+ unsigned DstReg, unsigned SrcReg) const;
/// \brief Get the index of Op in the MachineInstr.
///
@@ -270,13 +282,10 @@ namespace llvm {
int getOperandIdx(unsigned Opcode, unsigned Op) const;
/// \brief Helper function for setting instruction flag values.
- void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const;
-
- /// \returns true if this instruction has an operand for storing target flags.
- bool hasFlagOperand(const MachineInstr &MI) const;
+ void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
- void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+ void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
///\brief Determine if the specified \p Flag is set on this \p Operand.
bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
@@ -285,11 +294,15 @@ namespace llvm {
/// \param Flag The flag being set.
///
/// \returns the operand containing the flags for this instruction.
- MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
+ MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
unsigned Flag = 0) const;
/// \brief Clear the specified flag on the instruction.
- void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+ void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+ // Helper functions that check the opcode for status information
+ bool isRegisterStore(const MachineInstr &MI) const;
+ bool isRegisterLoad(const MachineInstr &MI) const;
};
namespace AMDGPU {
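
The doc comment added to calculateIndirectAddress above treats indirect addressing as loads and stores into a flat virtual address space, where each (RegIndex, Channel) pair maps to one slot. A minimal sketch of such a mapping, for illustration only and assuming the usual four channels per R600 register (the actual implementation lives in R600InstrInfo.cpp):

#include <cassert>

// Illustrative only: map a register index and channel to a slot in the
// virtual "indirect address" space, four channels (X, Y, Z, W) per register.
static unsigned calculateIndirectAddressSketch(unsigned RegIndex,
                                               unsigned Channel) {
  assert(Channel < 4 && "R600 registers have four channels");
  return RegIndex * 4 + Channel;
}
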
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 33ef6a4e19ea..b6b576d95278 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -15,7 +15,7 @@
include "R600Intrinsics.td"
include "R600InstrFormats.td"
-class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
InstR600 <outs, ins, asm, pattern, NullALU> {
let Namespace = "AMDGPU";
@@ -160,7 +160,8 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
let Inst{63-32} = Word1;
}
-class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+class R600_2OP_Helper <bits<11> inst, string opName,
+ SDPatternOperator node = null_frag,
InstrItinClass itin = AnyALU> :
R600_2OP <inst, opName,
[(set R600_Reg32:$dst, (node R600_Reg32:$src0,
@@ -283,7 +284,7 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
}
class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>,
+ : InstR600ISA <outs, (ins MEMxi:$src_gpr), !strconcat(" ", name), pattern>,
VTX_WORD1_GPR {
// Static fields
@@ -328,18 +329,44 @@ class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
- [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }]
+ [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
>;
def load_param : LoadParamFrag<load>;
def load_param_exti8 : LoadParamFrag<az_extloadi8>;
def load_param_exti16 : LoadParamFrag<az_extloadi16>;
-def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
+class LoadVtxId1 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
+}]>;
+
+def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>;
+def vtx_id1_az_extloadi16 : LoadVtxId1 <az_extloadi16>;
+def vtx_id1_load : LoadVtxId1 <load>;
+
+class LoadVtxId2 <PatFrag load> : PatFrag <
+ (ops node:$ptr), (load node:$ptr), [{
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ isa<GlobalValue>(GetUnderlyingObject(
+ LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
+}]>;
+
+def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
+def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>;
+def vtx_id2_load : LoadVtxId2 <load>;
+
+def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
def isR600toCayman
: Predicate<
- "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
+ "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
//===----------------------------------------------------------------------===//
// R600 SDNodes
@@ -407,8 +434,7 @@ def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
- "INTERP_LOAD $src0 : $dst",
- [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
+ "INTERP_LOAD $src0 : $dst">;
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
@@ -474,28 +500,6 @@ class ExportBufWord1 {
}
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
- def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
- (ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
- 0, 61, 0, 7, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
- (ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
- 0, 61, 7, 0, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_dummy (i32 imm:$type)),
- (ExportInst
- (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
- >;
-
- def : Pat<(int_R600_store_dummy 1),
- (ExportInst
- (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
- >;
-
def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
(i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
(ExportInst R600_Reg128:$src, imm:$type, imm:$base,
@@ -507,22 +511,22 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
multiclass SteamOutputExportPattern<Instruction ExportInst,
bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
// Stream0
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
(ExportInst R600_Reg128:$src, 0, imm:$arraybase,
4095, imm:$mask, buf0inst, 0)>;
// Stream1
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf1inst, 0)>;
// Stream2
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf2inst, 0)>;
// Stream3
- def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+ def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
(ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf3inst, 0)>;
@@ -678,7 +682,7 @@ let Predicates = [isR600toCayman] in {
def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
// Non-IEEE MUL: 0 * anything = 0
-def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">;
def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
// TODO: Do these actually match the regular fmin/fmax behavior?
def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
@@ -733,6 +737,7 @@ def SETNE_DX10 : R600_2OP <
[(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
>;
+// FIXME: Need combine for AMDGPUfract
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
@@ -758,6 +763,13 @@ def : Pat <
(MOV_IMM_I32 imm:$val)
>;
+def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
+def : Pat <
+ (AMDGPUconstdata_ptr tglobaladdr:$addr),
+ (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
+>;
+
+
def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
def : Pat <
(fpimm:$val),
@@ -851,7 +863,7 @@ class R600_TEX <bits<11> inst, string opName> :
i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
CT:$COORD_TYPE_W),
- !strconcat(opName,
+ !strconcat(" ", opName,
" $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
"$SRC_GPR.$srcx$srcy$srcz$srcw "
"RID:$RESOURCE_ID SID:$SAMPLER_ID "
@@ -1099,14 +1111,13 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
// Clamped to maximum.
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp
> {
let Itinerary = TransALU;
}
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
-> {
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq> {
let Itinerary = TransALU;
}
@@ -1135,11 +1146,6 @@ def FNEG_R600 : FNEG<R600_Reg32>;
// FIXME: Should be predicated on unsafe fp math.
multiclass DIV_Common <InstR600 recip_ieee> {
def : Pat<
- (int_AMDGPU_div f32:$src0, f32:$src1),
- (MUL_IEEE $src0, (recip_ieee $src1))
->;
-
-def : Pat<
(fdiv f32:$src0, f32:$src1),
(MUL_IEEE $src0, (recip_ieee $src1))
>;
@@ -1147,12 +1153,6 @@ def : Pat<
def : RcpPat<recip_ieee, f32>;
}
-class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
- : Pat <
- (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w),
- (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
->;
-
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1191,7 +1191,6 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
@@ -1332,9 +1331,7 @@ def TXD: InstR600 <
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
- "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
- imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
+ "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
NullALU > {
let TEXInst = 1;
}
@@ -1344,10 +1341,7 @@ def TXD_SHADOW: InstR600 <
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
- imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
- NullALU
-> {
+ [], NullALU> {
let TEXInst = 1;
}
} // End isPseudo = 1
@@ -1426,8 +1420,7 @@ def TEX_VTX_CONSTBUF :
}
def TEX_VTX_TEXBUF:
- InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
- [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
VTX_WORD1_GPR, VTX_WORD0_eg {
let VC_INST = 0;
@@ -1542,8 +1535,9 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
//===---------------------------------------------------------------------===//
let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(IL_retflag)]>;
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
}
//===----------------------------------------------------------------------===//
@@ -1729,12 +1723,6 @@ def : DwordAddrPat <i32, R600_Reg32>;
} // End isR600toCayman Predicate
-let Predicates = [isR600] in {
-// Intrinsic patterns
-defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>;
-defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>;
-} // End isR600
-
def getLDSNoRetOp : InstrMapping {
let FilterClass = "R600_LDS_1A1D";
let RowFields = ["BaseOp"];
diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
index 9681747006d9..a5310e9fd6d0 100644
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/lib/Target/AMDGPU/R600Intrinsics.td
@@ -11,65 +11,57 @@
//
//===----------------------------------------------------------------------===//
-let TargetPrefix = "R600", isTarget = 1 in {
- class TextureIntrinsicFloatInput :
- Intrinsic<[llvm_v4f32_ty], [
- llvm_v4f32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty // coord_type_w
- ], [IntrNoMem]>;
- class TextureIntrinsicInt32Input :
- Intrinsic<[llvm_v4i32_ty], [
- llvm_v4i32_ty, // Coord
- llvm_i32_ty, // offset_x
- llvm_i32_ty, // offset_y,
- llvm_i32_ty, // offset_z,
- llvm_i32_ty, // resource_id
- llvm_i32_ty, // samplerid
- llvm_i32_ty, // coord_type_x
- llvm_i32_ty, // coord_type_y
- llvm_i32_ty, // coord_type_z
- llvm_i32_ty // coord_type_w
- ], [IntrNoMem]>;
+class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
+ llvm_v4f32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
- def int_R600_load_input :
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_R600_interp_input :
- Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_R600_interp_const :
- Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_R600_interp_xy :
- Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-def int_R600_interp_zw :
- Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_R600_load_texbuf :
- Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_R600_tex : TextureIntrinsicFloatInput;
- def int_R600_texc : TextureIntrinsicFloatInput;
- def int_R600_txl : TextureIntrinsicFloatInput;
- def int_R600_txlc : TextureIntrinsicFloatInput;
- def int_R600_txb : TextureIntrinsicFloatInput;
- def int_R600_txbc : TextureIntrinsicFloatInput;
- def int_R600_txf : TextureIntrinsicInt32Input;
- def int_R600_ldptr : TextureIntrinsicInt32Input;
- def int_R600_txq : TextureIntrinsicInt32Input;
- def int_R600_ddx : TextureIntrinsicFloatInput;
- def int_R600_ddy : TextureIntrinsicFloatInput;
- def int_R600_store_swizzle :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_R600_store_stream_output :
- Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_R600_store_pixel_depth :
- Intrinsic<[], [llvm_float_ty], []>;
- def int_R600_store_pixel_stencil :
- Intrinsic<[], [llvm_float_ty], []>;
- def int_R600_store_dummy :
- Intrinsic<[], [llvm_i32_ty], []>;
-}
+class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
+ llvm_v4i32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty], // coord_type_w
+ [IntrNoMem]
+>;
+
+let TargetPrefix = "r600", isTarget = 1 in {
+
+def int_r600_store_swizzle :
+ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_store_stream_output : Intrinsic<
+ [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_tex : TextureIntrinsicFloatInput;
+def int_r600_texc : TextureIntrinsicFloatInput;
+def int_r600_txl : TextureIntrinsicFloatInput;
+def int_r600_txlc : TextureIntrinsicFloatInput;
+def int_r600_txb : TextureIntrinsicFloatInput;
+def int_r600_txbc : TextureIntrinsicFloatInput;
+def int_r600_txf : TextureIntrinsicInt32Input;
+def int_r600_txq : TextureIntrinsicInt32Input;
+def int_r600_ddx : TextureIntrinsicFloatInput;
+def int_r600_ddy : TextureIntrinsicFloatInput;
+
+def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+>;
+
+} // End TargetPrefix = "r600", isTarget = 1
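
With the target prefix lowered from "R600" to "r600", these intrinsics are reached through lowercase names on the C++ side as well. A hedged usage sketch, assuming the standard TableGen mangling (int_r600_store_swizzle becomes Intrinsic::r600_store_swizzle, IR name llvm.r600.store.swizzle) and an IRBuilder already positioned at the desired insertion point:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Illustrative only: emit a call to the renamed store-swizzle intrinsic,
// which takes a <4 x float> vector and two i32 operands.
static CallInst *emitStoreSwizzle(IRBuilder<> &B, Module &M, Value *Vec,
                                  Value *Idx0, Value *Idx1) {
  Function *Fn = Intrinsic::getDeclaration(&M, Intrinsic::r600_store_swizzle);
  return B.CreateCall(Fn, {Vec, Idx0, Idx1});
}
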
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
index 263561edd30d..04a4436ebe03 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -10,17 +10,16 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
-#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include <vector>
namespace llvm {
-class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+class R600MachineFunctionInfo final : public AMDGPUMachineFunction {
void anchor() override;
public:
R600MachineFunctionInfo(const MachineFunction &MF);
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index bcde5fb50dac..db18e5bd1afa 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
+#include "R600InstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
@@ -26,7 +27,7 @@ using namespace llvm;
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
DAG = static_cast<ScheduleDAGMILive*>(dag);
- const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
+ const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
VLIW5 = !ST.hasCaymanISA();
@@ -48,8 +49,7 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
-static
-unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
assert (GPRCount && "GPRCount cannot be 0");
return 248 / GPRCount;
}
@@ -222,75 +222,74 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
MachineInstr *MI = SU->getInstr();
- if (TII->isTransOnly(MI))
+ if (TII->isTransOnly(*MI))
return AluTrans;
- switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
- return AluPredX;
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
- return AluT_XYZW;
- case AMDGPU::COPY:
- if (MI->getOperand(1).isUndef()) {
- // MI will become a KILL, don't considers it in scheduling
- return AluDiscarded;
- }
- default:
- break;
- }
-
- // Does the instruction take a whole IG ?
- // XXX: Is it possible to add a helper function in R600InstrInfo that can
- // be used here and in R600PacketizerList::isSoloInstruction() ?
- if(TII->isVector(*MI) ||
- TII->isCubeOp(MI->getOpcode()) ||
- TII->isReductionOp(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
- return AluT_XYZW;
+ switch (MI->getOpcode()) {
+ case AMDGPU::PRED_X:
+ return AluPredX;
+ case AMDGPU::INTERP_PAIR_XY:
+ case AMDGPU::INTERP_PAIR_ZW:
+ case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::DOT_4:
+ return AluT_XYZW;
+ case AMDGPU::COPY:
+ if (MI->getOperand(1).isUndef()) {
+ // MI will become a KILL, don't consider it in scheduling
+ return AluDiscarded;
}
+ default:
+ break;
+ }
- if (TII->isLDSInstr(MI->getOpcode())) {
- return AluT_X;
- }
+ // Does the instruction take a whole IG ?
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can
+ // be used here and in R600PacketizerList::isSoloInstruction() ?
+ if(TII->isVector(*MI) ||
+ TII->isCubeOp(MI->getOpcode()) ||
+ TII->isReductionOp(MI->getOpcode()) ||
+ MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+ return AluT_XYZW;
+ }
- // Is the result already assigned to a channel ?
- unsigned DestSubReg = MI->getOperand(0).getSubReg();
- switch (DestSubReg) {
- case AMDGPU::sub0:
- return AluT_X;
- case AMDGPU::sub1:
- return AluT_Y;
- case AMDGPU::sub2:
- return AluT_Z;
- case AMDGPU::sub3:
- return AluT_W;
- default:
- break;
- }
+ if (TII->isLDSInstr(MI->getOpcode())) {
+ return AluT_X;
+ }
- // Is the result already member of a X/Y/Z/W class ?
- unsigned DestReg = MI->getOperand(0).getReg();
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
- regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
- return AluT_X;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
- return AluT_Y;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
- return AluT_Z;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
- return AluT_W;
- if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
- return AluT_XYZW;
-
- // LDS src registers cannot be used in the Trans slot.
- if (TII->readsLDSSrcReg(MI))
- return AluT_XYZW;
-
- return AluAny;
+ // Is the result already assigned to a channel ?
+ unsigned DestSubReg = MI->getOperand(0).getSubReg();
+ switch (DestSubReg) {
+ case AMDGPU::sub0:
+ return AluT_X;
+ case AMDGPU::sub1:
+ return AluT_Y;
+ case AMDGPU::sub2:
+ return AluT_Z;
+ case AMDGPU::sub3:
+ return AluT_W;
+ default:
+ break;
+ }
+ // Is the result already a member of an X/Y/Z/W class?
+ unsigned DestReg = MI->getOperand(0).getReg();
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ return AluT_X;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ return AluT_Y;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ return AluT_Z;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ return AluT_W;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ return AluT_XYZW;
+
+ // LDS src registers cannot be used in the Trans slot.
+ if (TII->readsLDSSrcReg(*MI))
+ return AluT_XYZW;
+
+ return AluAny;
}
int R600SchedStrategy::getInstKind(SUnit* SU) {
@@ -324,9 +323,8 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
- if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
- && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
- ) {
+ if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
+ (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
InstructionsGroupCandidate.pop_back();
Q.erase((It + 1).base());
return SU;
@@ -350,7 +348,7 @@ void R600SchedStrategy::PrepareNextSlot() {
DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
-// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+// if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index fc5b95c28e71..16d5d939708c 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -12,20 +12,19 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
-#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
-#include "R600InstrInfo.h"
-#include "llvm/ADT/PriorityQueue.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
namespace llvm {
-class R600SchedStrategy : public MachineSchedStrategy {
+class R600InstrInfo;
+struct R600RegisterInfo;
+class R600SchedStrategy final : public MachineSchedStrategy {
const ScheduleDAGMILive *DAG;
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 5efb3b9fc20e..ecae27d2233d 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -29,6 +29,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -210,9 +211,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
(void)Tmp;
SrcVec = DstReg;
}
- Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
- .addReg(SrcVec);
- DEBUG(dbgs() << " ->"; Pos->dump(););
+ MachineInstr *NewMI =
+ BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
+ DEBUG(dbgs() << " ->"; NewMI->dump(););
DEBUG(dbgs() << " Updating Swizzle:\n");
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
@@ -224,11 +225,11 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
RSI->Instr->eraseFromParent();
// Update RSI
- RSI->Instr = Pos;
+ RSI->Instr = NewMI;
RSI->RegToChan = UpdatedRegToChan;
RSI->UndefReg = UpdatedUndef;
- return Pos;
+ return NewMI;
}
void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
@@ -314,8 +315,13 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
}
bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
- TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo());
- MRI = &(Fn.getRegInfo());
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+ TII = ST.getInstrInfo();
+ MRI = &Fn.getRegInfo();
+
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
MachineBasicBlock *MB = &*MBB;
@@ -325,10 +331,10 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
MII != MIIE; ++MII) {
- MachineInstr *MI = MII;
- if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
- if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
- unsigned Reg = MI->getOperand(1).getReg();
+ MachineInstr &MI = *MII;
+ if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+ unsigned Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
It != E; ++It) {
@@ -338,17 +344,17 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
continue;
}
-
- RegSeqInfo RSI(*MRI, MI);
+ RegSeqInfo RSI(*MRI, &MI);
// All uses of MI are swizzeable ?
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
if (!areAllUsesSwizzeable(Reg))
continue;
- DEBUG (dbgs() << "Trying to optimize ";
- MI->dump();
- );
+ DEBUG({
+ dbgs() << "Trying to optimize ";
+ MI.dump();
+ });
RegSeqInfo CandidateRSI;
std::vector<std::pair<unsigned, unsigned> > RemapChan;
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 21269613a305..c84866469ae8 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -56,15 +56,14 @@ public:
char R600Packetizer::ID = 0;
class R600PacketizerList : public VLIWPacketizerList {
-
private:
const R600InstrInfo *TII;
const R600RegisterInfo &TRI;
bool VLIW5;
bool ConsideredInstUsesAlreadyWrittenVectorElement;
- unsigned getSlot(const MachineInstr *MI) const {
- return TRI.getHWRegChan(MI->getOperand(0).getReg());
+ unsigned getSlot(const MachineInstr &MI) const {
+ return TRI.getHWRegChan(MI.getOperand(0).getReg());
}
/// \returns register to PV chan mapping for bundle/single instructions that
@@ -81,11 +80,11 @@ private:
int LastDstChan = -1;
do {
bool isTrans = false;
- int BISlot = getSlot(&*BI);
+ int BISlot = getSlot(*BI);
if (LastDstChan >= BISlot)
isTrans = true;
LastDstChan = BISlot;
- if (TII->isPredicated(&*BI))
+ if (TII->isPredicated(*BI))
continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
@@ -95,7 +94,7 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
- if (isTrans || TII->isTransOnly(&*BI)) {
+ if (isTrans || TII->isTransOnly(*BI)) {
Result[Dst] = AMDGPU::PS;
continue;
}
@@ -129,7 +128,7 @@ private:
return Result;
}
- void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
+ void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
const {
unsigned Ops[] = {
AMDGPU::OpName::src0,
@@ -137,23 +136,23 @@ private:
AMDGPU::OpName::src2
};
for (unsigned i = 0; i < 3; i++) {
- int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
+ int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
if (OperandIdx < 0)
continue;
- unsigned Src = MI->getOperand(OperandIdx).getReg();
+ unsigned Src = MI.getOperand(OperandIdx).getReg();
const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
if (It != PVs.end())
- MI->getOperand(OperandIdx).setReg(It->second);
+ MI.getOperand(OperandIdx).setReg(It->second);
}
}
public:
// Ctor.
- R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
+ R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST,
+ MachineLoopInfo &MLI)
: VLIWPacketizerList(MF, MLI, nullptr),
- TII(static_cast<const R600InstrInfo *>(
- MF.getSubtarget().getInstrInfo())),
+ TII(ST.getInstrInfo()),
TRI(TII->getRegisterInfo()) {
- VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+ VLIW5 = !ST.hasCaymanISA();
}
// initPacketizerState - initialize some internal flags.
@@ -162,32 +161,30 @@ public:
}
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(const MachineInstr *MI,
+ bool ignorePseudoInstruction(const MachineInstr &MI,
const MachineBasicBlock *MBB) override {
return false;
}
// isSoloInstruction - return true if instruction MI can not be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(const MachineInstr *MI) override {
- if (TII->isVector(*MI))
+ bool isSoloInstruction(const MachineInstr &MI) override {
+ if (TII->isVector(MI))
return true;
- if (!TII->isALUInstr(MI->getOpcode()))
+ if (!TII->isALUInstr(MI.getOpcode()))
return true;
- if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
+ if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
return true;
// XXX: This can be removed once the packetizer properly handles all the
// LDS instruction group restrictions.
- if (TII->isLDSInstr(MI->getOpcode()))
- return true;
- return false;
+ return TII->isLDSInstr(MI.getOpcode());
}
// isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
// together.
bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
- if (getSlot(MII) == getSlot(MIJ))
+ if (getSlot(*MII) == getSlot(*MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Does MII and MIJ share the same pred_sel ?
int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
@@ -210,14 +207,12 @@ public:
}
}
- bool ARDef = TII->definesAddressRegister(MII) ||
- TII->definesAddressRegister(MIJ);
- bool ARUse = TII->usesAddressRegister(MII) ||
- TII->usesAddressRegister(MIJ);
- if (ARDef && ARUse)
- return false;
+ bool ARDef =
+ TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ);
+ bool ARUse =
+ TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ);
- return true;
+ return !ARDef || !ARUse;
}
// isLegalToPruneDependencies - Is it legal to prune dependece between SUI
@@ -231,7 +226,7 @@ public:
MI->getOperand(LastOp).setImm(Bit);
}
- bool isBundlableWithCurrentPMI(MachineInstr *MI,
+ bool isBundlableWithCurrentPMI(MachineInstr &MI,
const DenseMap<unsigned, unsigned> &PV,
std::vector<R600InstrInfo::BankSwizzle> &BS,
bool &isTransSlot) {
@@ -240,11 +235,14 @@ public:
// Is the dst reg sequence legal ?
if (!isTransSlot && !CurrentPacketMIs.empty()) {
- if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
- if (ConsideredInstUsesAlreadyWrittenVectorElement &&
+ if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) {
+ if (ConsideredInstUsesAlreadyWrittenVectorElement &&
!TII->isVectorOnly(MI) && VLIW5) {
isTransSlot = true;
- DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+ DEBUG({
+ dbgs() << "Considering as Trans Inst :";
+ MI.dump();
+ });
}
else
return false;
@@ -252,18 +250,18 @@ public:
}
// Are the Constants limitations met ?
- CurrentPacketMIs.push_back(MI);
+ CurrentPacketMIs.push_back(&MI);
if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
- DEBUG(
+ DEBUG({
dbgs() << "Couldn't pack :\n";
- MI->dump();
+ MI.dump();
dbgs() << "with the following packets :\n";
for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
CurrentPacketMIs[i]->dump();
dbgs() << "\n";
}
dbgs() << "because of Consts read limitations\n";
- );
+ });
CurrentPacketMIs.pop_back();
return false;
}
@@ -271,16 +269,16 @@ public:
// Is there a BankSwizzle set that meet Read Port limitations ?
if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
PV, BS, isTransSlot)) {
- DEBUG(
+ DEBUG({
dbgs() << "Couldn't pack :\n";
- MI->dump();
+ MI.dump();
dbgs() << "with the following packets :\n";
for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
CurrentPacketMIs[i]->dump();
dbgs() << "\n";
}
dbgs() << "because of Read port limitations\n";
- );
+ });
CurrentPacketMIs.pop_back();
return false;
}
@@ -293,9 +291,9 @@ public:
return true;
}
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
+ MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override {
MachineBasicBlock::iterator FirstInBundle =
- CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+ CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front();
const DenseMap<unsigned, unsigned> &PV =
getPreviousVector(FirstInBundle);
std::vector<R600InstrInfo::BankSwizzle> BS;
@@ -308,9 +306,9 @@ public:
AMDGPU::OpName::bank_swizzle);
MI->getOperand(Op).setImm(BS[i]);
}
- unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::bank_swizzle);
- MI->getOperand(Op).setImm(BS.back());
+ unsigned Op =
+ TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+ MI.getOperand(Op).setImm(BS.back());
if (!CurrentPacketMIs.empty())
setIsLastBit(CurrentPacketMIs.back(), 0);
substitutePV(MI, PV);
@@ -320,7 +318,7 @@ public:
}
return It;
}
- endPacket(MI->getParent(), MI);
+ endPacket(MI.getParent(), MI);
if (TII->isTransOnly(MI))
return MI;
return VLIWPacketizerList::addToPacket(MI);
@@ -328,15 +326,20 @@ public:
};
bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+ const R600InstrInfo *TII = ST.getInstrInfo();
+
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
// Instantiate the packetizer.
- R600PacketizerList Packetizer(Fn, MLI);
+ R600PacketizerList Packetizer(Fn, ST, MLI);
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+ if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
+ return false;
+
//
// Loop over all basic blocks and remove KILL pseudo-instructions
// These instructions confuse the dependence analysis. Consider:
@@ -375,7 +378,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn))
+ if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn))
break;
}
I = MBB->begin();
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index fb0359cfc651..dfdc602b80cd 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -28,8 +28,8 @@ R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+ const R600InstrInfo *TII = ST.getInstrInfo();
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
@@ -89,3 +89,10 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
return true;
}
}
+
+void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ llvm_unreachable("Subroutines not supported yet");
+}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index 4f8a129ce4a6..9dfb3106c6cc 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
#include "AMDGPURegisterInfo.h"
@@ -21,7 +21,7 @@ namespace llvm {
class AMDGPUSubtarget;
-struct R600RegisterInfo : public AMDGPURegisterInfo {
+struct R600RegisterInfo final : public AMDGPURegisterInfo {
RegClassWeight RCW;
R600RegisterInfo();
@@ -31,7 +31,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
- unsigned getHWRegIndex(unsigned Reg) const override;
+ unsigned getHWRegIndex(unsigned Reg) const;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
@@ -40,8 +40,13 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
const RegClassWeight &
getRegClassWeight(const TargetRegisterClass *RC) const override;
- // \returns true if \p Reg can be defined in one ALU caluse and used in another.
+ // \returns true if \p Reg can be defined in one ALU clause and used in
+ // another.
bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td
index df62bf85c0ad..70fb46c1a7d6 100644
--- a/lib/Target/AMDGPU/R600Schedule.td
+++ b/lib/Target/AMDGPU/R600Schedule.td
@@ -9,7 +9,7 @@
//
// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction
// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS
-// slot has been removed.
+// slot has been removed.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
deleted file mode 100644
index 2fc7b02f673f..000000000000
--- a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass translates tgsi-like texture intrinsics into R600 texture
-/// closer to hardware intrinsics.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-
-namespace {
-class R600TextureIntrinsicsReplacer :
- public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
- static char ID;
-
- Module *Mod;
- Type *FloatType;
- Type *Int32Type;
- Type *V4f32Type;
- Type *V4i32Type;
- FunctionType *TexSign;
- FunctionType *TexQSign;
-
- void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
- unsigned SrcSelect[4], unsigned CT[4],
- bool &useShadowVariant) {
- enum TextureTypes {
- TEXTURE_1D = 1,
- TEXTURE_2D,
- TEXTURE_3D,
- TEXTURE_CUBE,
- TEXTURE_RECT,
- TEXTURE_SHADOW1D,
- TEXTURE_SHADOW2D,
- TEXTURE_SHADOWRECT,
- TEXTURE_1D_ARRAY,
- TEXTURE_2D_ARRAY,
- TEXTURE_SHADOW1D_ARRAY,
- TEXTURE_SHADOW2D_ARRAY,
- TEXTURE_SHADOWCUBE,
- TEXTURE_2D_MSAA,
- TEXTURE_2D_ARRAY_MSAA,
- TEXTURE_CUBE_ARRAY,
- TEXTURE_SHADOWCUBE_ARRAY
- };
-
- switch (TextureType) {
- case 0:
- useShadowVariant = false;
- return;
- case TEXTURE_RECT:
- case TEXTURE_1D:
- case TEXTURE_2D:
- case TEXTURE_3D:
- case TEXTURE_CUBE:
- case TEXTURE_1D_ARRAY:
- case TEXTURE_2D_ARRAY:
- case TEXTURE_CUBE_ARRAY:
- case TEXTURE_2D_MSAA:
- case TEXTURE_2D_ARRAY_MSAA:
- useShadowVariant = false;
- break;
- case TEXTURE_SHADOW1D:
- case TEXTURE_SHADOW2D:
- case TEXTURE_SHADOWRECT:
- case TEXTURE_SHADOW1D_ARRAY:
- case TEXTURE_SHADOW2D_ARRAY:
- case TEXTURE_SHADOWCUBE:
- case TEXTURE_SHADOWCUBE_ARRAY:
- useShadowVariant = true;
- break;
- default:
- llvm_unreachable("Unknow Texture Type");
- }
-
- if (TextureType == TEXTURE_RECT ||
- TextureType == TEXTURE_SHADOWRECT) {
- CT[0] = 0;
- CT[1] = 0;
- }
-
- if (TextureType == TEXTURE_CUBE_ARRAY ||
- TextureType == TEXTURE_SHADOWCUBE_ARRAY)
- CT[2] = 0;
-
- if (TextureType == TEXTURE_1D_ARRAY ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) {
- if (hasLOD && useShadowVariant) {
- CT[1] = 0;
- } else {
- CT[2] = 0;
- SrcSelect[2] = 1;
- }
- } else if (TextureType == TEXTURE_2D_ARRAY ||
- TextureType == TEXTURE_SHADOW2D_ARRAY) {
- CT[2] = 0;
- }
-
- if ((TextureType == TEXTURE_SHADOW1D ||
- TextureType == TEXTURE_SHADOW2D ||
- TextureType == TEXTURE_SHADOWRECT ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) &&
- !(hasLOD && useShadowVariant))
- SrcSelect[3] = 2;
- }
-
- void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
- unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
- Value *Sampler, unsigned CT[4], Value *Coord) {
- IRBuilder<> Builder(&I);
- Constant *Mask[] = {
- ConstantInt::get(Int32Type, SrcSelect[0]),
- ConstantInt::get(Int32Type, SrcSelect[1]),
- ConstantInt::get(Int32Type, SrcSelect[2]),
- ConstantInt::get(Int32Type, SrcSelect[3])
- };
- Value *SwizzleMask = ConstantVector::get(Mask);
- Value *SwizzledCoord =
- Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
-
- Value *Args[] = {
- SwizzledCoord,
- Offset[0],
- Offset[1],
- Offset[2],
- Resource,
- Sampler,
- ConstantInt::get(Int32Type, CT[0]),
- ConstantInt::get(Int32Type, CT[1]),
- ConstantInt::get(Int32Type, CT[2]),
- ConstantInt::get(Int32Type, CT[3])
- };
-
- Function *F = Mod->getFunction(Name);
- if (!F) {
- F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
- F->addFnAttr(Attribute::ReadNone);
- }
- I.replaceAllUsesWith(Builder.CreateCall(F, Args));
- I.eraseFromParent();
- }
-
- void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
- const char *VanillaInt,
- const char *ShadowInt) {
- Value *Coord = I.getArgOperand(0);
- Value *ResourceId = I.getArgOperand(1);
- Value *SamplerId = I.getArgOperand(2);
-
- unsigned TextureType =
- cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
-
- unsigned SrcSelect[4] = { 0, 1, 2, 3 };
- unsigned CT[4] = {1, 1, 1, 1};
- Value *Offset[3] = {
- ConstantInt::get(Int32Type, 0),
- ConstantInt::get(Int32Type, 0),
- ConstantInt::get(Int32Type, 0)
- };
- bool useShadowVariant;
-
- getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
- useShadowVariant);
-
- ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
- Offset, ResourceId, SamplerId, CT, Coord);
- }
-
- void ReplaceTXF(CallInst &I) {
- Value *Coord = I.getArgOperand(0);
- Value *ResourceId = I.getArgOperand(4);
- Value *SamplerId = I.getArgOperand(5);
-
- unsigned TextureType =
- cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
-
- unsigned SrcSelect[4] = { 0, 1, 2, 3 };
- unsigned CT[4] = {1, 1, 1, 1};
- Value *Offset[3] = {
- I.getArgOperand(1),
- I.getArgOperand(2),
- I.getArgOperand(3),
- };
- bool useShadowVariant;
-
- getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
- useShadowVariant);
-
- ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
- Offset, ResourceId, SamplerId, CT, Coord);
- }
-
-public:
- R600TextureIntrinsicsReplacer():
- FunctionPass(ID) {
- }
-
- bool doInitialization(Module &M) override {
- LLVMContext &Ctx = M.getContext();
- Mod = &M;
- FloatType = Type::getFloatTy(Ctx);
- Int32Type = Type::getInt32Ty(Ctx);
- V4f32Type = VectorType::get(FloatType, 4);
- V4i32Type = VectorType::get(Int32Type, 4);
- Type *ArgsType[] = {
- V4f32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- };
- TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
- Type *ArgsQType[] = {
- V4i32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- Int32Type,
- };
- TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
- return false;
- }
-
- bool runOnFunction(Function &F) override {
- visit(F);
- return false;
- }
-
- const char *getPassName() const override {
- return "R600 Texture Intrinsics Replacer";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- }
-
- void visitCallInst(CallInst &I) {
- if (!I.getCalledFunction())
- return;
-
- StringRef Name = I.getCalledFunction()->getName();
- if (Name == "llvm.AMDGPU.tex") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
- return;
- }
- if (Name == "llvm.AMDGPU.txl") {
- ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
- return;
- }
- if (Name == "llvm.AMDGPU.txb") {
- ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
- return;
- }
- if (Name == "llvm.AMDGPU.txf") {
- ReplaceTXF(I);
- return;
- }
- if (Name == "llvm.AMDGPU.txq") {
- ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
- return;
- }
- if (Name == "llvm.AMDGPU.ddx") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
- return;
- }
- if (Name == "llvm.AMDGPU.ddy") {
- ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
- return;
- }
- }
-
-};
-
-char R600TextureIntrinsicsReplacer::ID = 0;
-
-}
-
-FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
- return new R600TextureIntrinsicsReplacer();
-}
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index fa4d24a2f25a..5f182c5304c6 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -34,17 +35,16 @@ typedef std::pair<BasicBlock *, Value *> StackEntry;
typedef SmallVector<StackEntry, 16> StackVector;
// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.SI.if";
-static const char *const ElseIntrinsic = "llvm.SI.else";
-static const char *const BreakIntrinsic = "llvm.SI.break";
-static const char *const IfBreakIntrinsic = "llvm.SI.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.SI.else.break";
-static const char *const LoopIntrinsic = "llvm.SI.loop";
-static const char *const EndCfIntrinsic = "llvm.SI.end.cf";
+static const char *const IfIntrinsic = "llvm.amdgcn.if";
+static const char *const ElseIntrinsic = "llvm.amdgcn.else";
+static const char *const BreakIntrinsic = "llvm.amdgcn.break";
+static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
+static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
+static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
class SIAnnotateControlFlow : public FunctionPass {
-
- static char ID;
+ DivergenceAnalysis *DA;
Type *Boolean;
Type *Void;
@@ -69,6 +69,8 @@ class SIAnnotateControlFlow : public FunctionPass {
LoopInfo *LI;
+ bool isUniform(BranchInst *T);
+
bool isTopOfStack(BasicBlock *BB);
Value *popSaved();
@@ -83,13 +85,16 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L);
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term);
void handleLoop(BranchInst *Term);
void closeControlFlow(BasicBlock *BB);
public:
+ static char ID;
+
SIAnnotateControlFlow():
FunctionPass(ID) { }
@@ -104,6 +109,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DivergenceAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -112,6 +118,12 @@ public:
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+
char SIAnnotateControlFlow::ID = 0;
/// \brief Initialize all the types and constants used in the pass
@@ -152,6 +164,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
return false;
}
+/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// consider it as such?
+bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
+ return DA->isUniform(T->getCondition()) ||
+ T->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
/// \brief Is BB the last block saved on the stack ?
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
return !Stack.empty() && Stack.back().first == BB;
@@ -194,6 +213,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
/// \brief Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -201,6 +223,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
/// \brief Close the last "If" block and open a new "Else" block
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -208,7 +233,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
/// \brief Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L) {
+ llvm::Loop *L, BranchInst *Term) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -232,7 +257,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
}
Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
NewPhi->addIncoming(PhiArg, From);
}
@@ -246,7 +271,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
BasicBlock *From = Phi->getIncomingBlock(i);
if (From == IDom) {
+ // We're in the following situation:
+ // IDom/From
+ // | \
+ // | If-block
+ // | /
+ // Parent
+ // where we want to break out of the loop if the If-block is not taken.
+ // Due to the depth-first traversal, there should be an end.cf
+ // intrinsic in Parent, and we insert an else.break before it.
+ //
+ // Note that the end.cf need not be the first non-phi instruction
+ // of parent, particularly when we're dealing with a multi-level
+ // break, but it should occur within a group of intrinsic calls
+ // at the beginning of the block.
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+ while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+ OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
@@ -271,14 +312,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
+ // Insert IfBreak before TERM for constant COND.
+ } else if (isa<ConstantInt>(Cond)) {
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Term);
+
} else {
llvm_unreachable("Unhandled loop condition!");
}
- return 0;
+ return nullptr;
}
/// \brief Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+ if (isUniform(Term)) {
+ return;
+ }
+
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
BasicBlock *Target = Term->getSuccessor(1);
@@ -286,7 +336,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {
@@ -300,6 +350,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
+ assert(Stack.back().first == BB);
+
if (L && L->getHeader() == BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
@@ -315,14 +367,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
- CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
+ Value *Exec = popSaved();
+ if (!isa<UndefValue>(Exec))
+ CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
}
/// \brief Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<DivergenceAnalysis>();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
@@ -332,12 +388,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
if (!Term || Term->isUnconditional()) {
if (isTopOfStack(*I))
closeControlFlow(*I);
+
continue;
}
if (I.nodeVisited(Term->getSuccessor(1))) {
if (isTopOfStack(*I))
closeControlFlow(*I);
+
handleLoop(Term);
continue;
}
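
The isUniform() helper added above skips annotation for a branch when DivergenceAnalysis proves its condition uniform or when the terminator already carries StructurizeCFG's "structurizecfg.uniform" marker. A minimal C++ sketch of the second path, using only what the patch shows (the metadata kind string, and the fact that only its presence is checked); the helper name is hypothetical:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Tag a terminator the way the structurizer does for a branch it knows is
    // uniform; SIAnnotateControlFlow::isUniform() only tests that the node
    // exists, so an empty MDNode is enough.
    static void markBranchUniform(BranchInst *Term) {
      Term->setMetadata("structurizecfg.uniform",
                        MDNode::get(Term->getContext(), {}));
    }

With such a marker in place, openIf, insertElse and handleLoop all return early, so no llvm.amdgcn.if/else/loop/end.cf intrinsics are emitted for that branch.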
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
new file mode 100644
index 000000000000..65ceff3930ac
--- /dev/null
+++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -0,0 +1,96 @@
+//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Inserts one nop instruction for each high-level source statement for
+/// debugger usage.
+///
+/// Tools such as a debugger need to pause execution based on user input (e.g. a
+/// breakpoint). To do this, one nop instruction is inserted before the first ISA
+/// instruction of each high-level source statement. Further, the debugger may
+/// replace these nop instructions with trap instructions based on user input.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "si-debugger-insert-nops"
+#define PASS_NAME "SI Debugger Insert Nops"
+
+namespace {
+
+class SIDebuggerInsertNops : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
+ const char *getPassName() const override { return PASS_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
+
+char SIDebuggerInsertNops::ID = 0;
+char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
+
+FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
+ return new SIDebuggerInsertNops();
+}
+
+bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
+ // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
+ // specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (!ST.debuggerInsertNops())
+ return false;
+
+ // Skip machine functions without debug info.
+ if (!MF.getMMI().hasDebugInfo())
+ return false;
+
+ // Target instruction info.
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // Set of line numbers for which a nop has already been inserted.
+ DenseSet<unsigned> NopInserted;
+
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Skip DBG_VALUE instructions and instructions without location.
+ if (MI->isDebugValue() || !MI->getDebugLoc())
+ continue;
+
+ // Insert a nop if no nop has been inserted for this line number yet.
+ auto DL = MI->getDebugLoc();
+ if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
+ BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ NopInserted.insert(DL.getLine());
+ }
+ }
+ }
+
+ return true;
+}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index aa1e352ed748..54efdc0a0466 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -10,8 +10,8 @@
#include "llvm/MC/MCInstrDesc.h"
-#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
-#define LLVM_LIB_TARGET_R600_SIDEFINES_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
@@ -29,16 +29,19 @@ enum {
VOP2 = 1 << 11,
VOP3 = 1 << 12,
VOPC = 1 << 13,
+ SDWA = 1 << 14,
+ DPP = 1 << 15,
- MUBUF = 1 << 14,
- MTBUF = 1 << 15,
- SMRD = 1 << 16,
- DS = 1 << 17,
- MIMG = 1 << 18,
- FLAT = 1 << 19,
- WQM = 1 << 20,
- VGPRSpill = 1 << 21,
- VOPAsmPrefer32Bit = 1 << 22
+ MUBUF = 1 << 16,
+ MTBUF = 1 << 17,
+ SMRD = 1 << 18,
+ DS = 1 << 19,
+ MIMG = 1 << 20,
+ FLAT = 1 << 21,
+ WQM = 1 << 22,
+ VGPRSpill = 1 << 23,
+ VOPAsmPrefer32Bit = 1 << 24,
+ Gather4 = 1 << 25
};
}
@@ -46,9 +49,14 @@ namespace llvm {
namespace AMDGPU {
enum OperandType {
/// Operand with register or 32-bit immediate
- OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET,
/// Operand with register or inline constant
- OPERAND_REG_INLINE_C
+ OPERAND_REG_INLINE_C,
+
+ /// Operand with 32-bit immediate that uses the constant bus. The standard
+ /// OPERAND_IMMEDIATE should be used for special immediates such as source
+ /// modifiers.
+ OPERAND_KIMM32
};
}
}
@@ -77,10 +85,13 @@ namespace SIInstrFlags {
};
}
+// Input operand modifiers bit-masks
+// NEG and SEXT share the same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
enum {
- NEG = 1 << 0,
- ABS = 1 << 1
+ NEG = 1 << 0, // Floating-point negate modifier
+ ABS = 1 << 1, // Floating-point absolute modifier
+ SEXT = 1 << 0 // Integer sign-extend modifier
};
}
@@ -93,6 +104,109 @@ namespace SIOutMods {
};
}
+namespace llvm {
+namespace AMDGPU {
+namespace EncValues { // Encoding values of enum9/8/7 operands
+
+enum {
+ SGPR_MIN = 0,
+ SGPR_MAX = 101,
+ TTMP_MIN = 112,
+ TTMP_MAX = 123,
+ INLINE_INTEGER_C_MIN = 128,
+ INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
+ INLINE_INTEGER_C_MAX = 208,
+ INLINE_FLOATING_C_MIN = 240,
+ INLINE_FLOATING_C_MAX = 248,
+ LITERAL_CONST = 255,
+ VGPR_MIN = 256,
+ VGPR_MAX = 511
+};
+
+} // namespace EncValues
+} // namespace AMDGPU
+} // namespace llvm
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
+
+enum Id { // Message ID, width(4) [3:0].
+ ID_UNKNOWN_ = -1,
+ ID_INTERRUPT = 1,
+ ID_GS,
+ ID_GS_DONE,
+ ID_SYSMSG = 15,
+ ID_GAPS_LAST_, // Indicates that the sequence has gaps.
+ ID_GAPS_FIRST_ = ID_INTERRUPT,
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 4,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Op { // Both GS and SYS operation IDs.
+ OP_UNKNOWN_ = -1,
+ OP_SHIFT_ = 4,
+ // width(2) [5:4]
+ OP_GS_NOP = 0,
+ OP_GS_CUT,
+ OP_GS_EMIT,
+ OP_GS_EMIT_CUT,
+ OP_GS_LAST_,
+ OP_GS_FIRST_ = OP_GS_NOP,
+ OP_GS_WIDTH_ = 2,
+ OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
+ // width(3) [6:4]
+ OP_SYS_ECC_ERR_INTERRUPT = 1,
+ OP_SYS_REG_RD,
+ OP_SYS_HOST_TRAP_ACK,
+ OP_SYS_TTRACE_PC,
+ OP_SYS_LAST_,
+ OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
+ OP_SYS_WIDTH_ = 3,
+ OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
+};
+
+enum StreamId { // Stream ID, (2) [9:8].
+ STREAM_ID_DEFAULT_ = 0,
+ STREAM_ID_LAST_ = 4,
+ STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
+ STREAM_ID_SHIFT_ = 8,
+ STREAM_ID_WIDTH_ = 2,
+ STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_)
+};
+
+} // namespace SendMsg
+
+namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
+
+enum Id { // HwRegCode, (6) [5:0]
+ ID_UNKNOWN_ = -1,
+ ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
+ ID_SYMBOLIC_LAST_ = 8,
+ ID_SHIFT_ = 0,
+ ID_WIDTH_ = 6,
+ ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Offset { // Offset, (5) [10:6]
+ OFFSET_DEFAULT_ = 0,
+ OFFSET_SHIFT_ = 6,
+ OFFSET_WIDTH_ = 5,
+ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+};
+
+enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+ WIDTH_M1_DEFAULT_ = 31,
+ WIDTH_M1_SHIFT_ = 11,
+ WIDTH_M1_WIDTH_ = 5,
+ WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+};
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
@@ -134,7 +248,7 @@ namespace SIOutMods {
#define C_00B84C_LDS_SIZE 0xFF007FFF
#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24)
#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
-#define C_00B84C_EXCP_EN
+#define C_00B84C_EXCP_EN
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
@@ -194,5 +308,7 @@ namespace SIOutMods {
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_SPILLED_SGPRS 0x4
+#define R_SPILLED_VGPRS 0x8
#endif
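
The SendMsg and Hwreg blocks added to SIDefines.h above describe how the 16-bit immediate of s_sendmsg*/s_setreg*/s_getreg* is split into fields. A short sketch, assuming only the enum values above (the pack* helpers and the example operand values are hypothetical), of how those shifts compose an immediate:

    #include <cstdint>
    #include "SIDefines.h"

    using namespace llvm::AMDGPU;

    // s_sendmsg immediate: message id [3:0], GS op [5:4], stream id [9:8].
    static uint16_t packSendMsg(unsigned MsgId, unsigned GsOp, unsigned StreamId) {
      return (MsgId << SendMsg::ID_SHIFT_) |
             (GsOp << SendMsg::OP_SHIFT_) |
             (StreamId << SendMsg::STREAM_ID_SHIFT_);
    }

    // s_setreg/s_getreg immediate: hwreg id [5:0], offset [10:6], width-1 [15:11].
    static uint16_t packHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      return (Id << Hwreg::ID_SHIFT_) |
             (Offset << Hwreg::OFFSET_SHIFT_) |
             ((Width - 1) << Hwreg::WIDTH_M1_SHIFT_);
    }

    // packSendMsg(SendMsg::ID_GS, SendMsg::OP_GS_EMIT, 1) == 0x122
    // packHwreg(6, 0, 32)                                 == 0xf806

The *_MASK_ values above serve the inverse purpose: a field can be extracted from an immediate with (Imm & MASK_) >> SHIFT_.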
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f59d9948f98e..9e0086b79087 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -77,7 +77,7 @@
using namespace llvm;
-#define DEBUG_TYPE "sgpr-copies"
+#define DEBUG_TYPE "si-fix-sgpr-copies"
namespace {
@@ -237,11 +237,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
SmallVector<MachineInstr *, 16> Worklist;
diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
deleted file mode 100644
index 8bda283f0fca..000000000000
--- a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file SALU instructions ignore the execution mask, so we need to modify the
-/// live ranges of the registers they define in some cases.
-///
-/// The main case we need to handle is when a def is used in one side of a
-/// branch and not another. For example:
-///
-/// %def
-/// IF
-/// ...
-/// ...
-/// ELSE
-/// %use
-/// ...
-/// ENDIF
-///
-/// Here we need the register allocator to avoid assigning any of the defs
-/// inside of the IF to the same register as %def. In traditional live
-/// interval analysis %def is not live inside the IF branch, however, since
-/// SALU instructions inside of IF will be executed even if the branch is not
-/// taken, there is the chance that one of the instructions will overwrite the
-/// value of %def, so the use in ELSE will see the wrong value.
-///
-/// The strategy we use for solving this is to add an extra use after the ENDIF:
-///
-/// %def
-/// IF
-/// ...
-/// ...
-/// ELSE
-/// %use
-/// ...
-/// ENDIF
-/// %use
-///
-/// Adding this use will make the def live throughout the IF branch, which is
-/// what we want.
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
-
-namespace {
-
-class SIFixSGPRLiveRanges : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
- initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {
- return "SI Fix SGPR live ranges";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LiveVariables>();
- AU.addPreserved<LiveVariables>();
-
- AU.addRequired<MachinePostDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
- AU.setPreservesCFG();
-
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
- "SI Fix SGPR Live Ranges", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
- "SI Fix SGPR Live Ranges", false, false)
-
-char SIFixSGPRLiveRanges::ID = 0;
-
-char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
-
-FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
- return new SIFixSGPRLiveRanges();
-}
-
-bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- bool MadeChange = false;
-
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
- SmallVector<unsigned, 16> SGPRLiveRanges;
-
- LiveVariables *LV = &getAnalysis<LiveVariables>();
- MachineBasicBlock *Entry = &MF.front();
-
- // Use a depth first order so that in SSA, we encounter all defs before
- // uses. Once the defs of the block have been found, attempt to insert
- // SGPR_USE instructions in successor blocks if required.
- for (MachineBasicBlock *MBB : depth_first(Entry)) {
- for (const MachineInstr &MI : *MBB) {
- for (const MachineOperand &MO : MI.defs()) {
- // We should never see a live out def of a physical register, so we also
- // do not need to worry about implicit_defs().
- unsigned Def = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Def)) {
- if (TRI->isSGPRClass(MRI.getRegClass(Def))) {
- // Only consider defs that are live outs. We don't care about def /
- // use within the same block.
-
- // LiveVariables does not consider registers that are only used in a
- // phi in a sucessor block as live out, unlike LiveIntervals.
- //
- // This is OK because SIFixSGPRCopies replaced any SGPR phis with
- // VGPRs.
- if (LV->isLiveOut(Def, *MBB))
- SGPRLiveRanges.push_back(Def);
- }
- }
- }
- }
-
- if (MBB->succ_size() < 2)
- continue;
-
- // We have structured control flow, so the number of successors should be
- // two.
- assert(MBB->succ_size() == 2);
- MachineBasicBlock *SuccA = *MBB->succ_begin();
- MachineBasicBlock *SuccB = *(++MBB->succ_begin());
- MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
-
- if (!NCD)
- continue;
-
- MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator();
-
- if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) {
- assert(NCD->succ_size() == 2);
- // We want to make sure we insert the Use after the ENDIF, not after
- // the ELSE.
- NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
- *(++NCD->succ_begin()));
- }
-
- for (unsigned Reg : SGPRLiveRanges) {
- // FIXME: We could be smarter here. If the register is Live-In to one
- // block, but the other doesn't have any SGPR defs, then there won't be a
- // conflict. Also, if the branch condition is uniform then there will be
- // no conflict.
- bool LiveInToA = LV->isLiveIn(Reg, *SuccA);
- bool LiveInToB = LV->isLiveIn(Reg, *SuccB);
-
- if (!LiveInToA && !LiveInToB) {
- DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
- << " is live into neither successor\n");
- continue;
- }
-
- if (LiveInToA && LiveInToB) {
- DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
- << " is live into both successors\n");
- continue;
- }
-
- // This interval is live in to one successor, but not the other, so
- // we need to update its range so it is live in to both.
- DEBUG(dbgs() << "Possible SGPR conflict detected for "
- << PrintReg(Reg, TRI, 0)
- << " BB#" << SuccA->getNumber()
- << ", BB#" << SuccB->getNumber()
- << " with NCD = BB#" << NCD->getNumber() << '\n');
-
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "Not expecting to extend live range of physreg");
-
- // FIXME: Need to figure out how to update LiveRange here so this pass
- // will be able to preserve LiveInterval analysis.
- MachineInstr *NCDSGPRUse =
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::SGPR_USE))
- .addReg(Reg, RegState::Implicit);
-
- MadeChange = true;
- LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse);
-
- DEBUG(NCDSGPRUse->dump());
- }
- }
-
- return MadeChange;
-}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6230d1e28b74..4ecc0fcc6232 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,12 +13,9 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -44,8 +41,6 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -76,11 +71,8 @@ struct FoldCandidate {
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
- "SI Fold Operands", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
- "SI Fold Operands", false, false)
+INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
char SIFoldOperands::ID = 0;
@@ -140,7 +132,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
- if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
// Special case for v_mac_f32_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
@@ -167,7 +159,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
// see if this makes it possible to fold.
unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
- bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
+ bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
if (CanCommute) {
if (CommuteIdx0 == OpNo)
@@ -185,10 +177,10 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
return false;
if (!CanCommute ||
- !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
+ !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;
- if (!TII->isOperandLegal(MI, OpNo, OpToFold))
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
return false;
}
@@ -301,9 +293,13 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7d20509c464d..03b11f0fd38d 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -11,6 +11,8 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -21,24 +23,13 @@ using namespace llvm;
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
const MachineFrameInfo *FrameInfo) {
- if (!FuncInfo->hasSpilledSGPRs())
- return false;
-
- if (FuncInfo->hasSpilledVGPRs())
- return false;
-
- for (int I = FrameInfo->getObjectIndexBegin(),
- E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
- if (!FrameInfo->isSpillSlotObjectIndex(I))
- return false;
- }
-
- return true;
+ return FuncInfo->hasSpilledSGPRs() &&
+ (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
}
static ArrayRef<MCPhysReg> getAllSGPR128() {
- return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
- AMDGPU::SReg_128RegClass.getNumRegs());
+ return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
+ AMDGPU::SGPR_128RegClass.getNumRegs());
}
static ArrayRef<MCPhysReg> getAllSGPRs() {
@@ -48,6 +39,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs() {
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+ // specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (ST.debuggerEmitPrologue())
+ emitDebuggerPrologue(MF, MBB);
+
if (!MF.getFrameInfo()->hasStackObjects())
return;
@@ -63,10 +60,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
return;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock::iterator I = MBB.begin();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
@@ -84,6 +81,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
+ if (MFI->hasFlatScratchInit()) {
+ // We don't need this if we only have spills since there is no user facing
+ // scratch.
+
+ // TODO: If we know we don't have flat instructions earlier, we can omit
+ // this from the input registers.
+ //
+ // TODO: We only need to know if we access scratch space through a flat
+ // pointer. Because we only detect if flat instructions are used at all,
+ // this will be used more often than necessary on VI.
+
+ // Debug location must be unknown since the first debug location is used to
+ // determine the end of the prologue.
+ DebugLoc DL;
+
+ unsigned FlatScratchInitReg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ // Copy the size in bytes.
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
+ unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+ // Add wave offset in bytes to private base offset.
+ // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+
+ // Convert offset to 256-byte units.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitLo, RegState::Kill)
+ .addImm(8);
+ }
+
// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
@@ -96,7 +133,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
@@ -137,15 +173,28 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+
+ // We need to drop registers from the end of the list that we cannot use
+ // for the scratch wave offset:
+ // + 2 for s102 and s103, which do not exist on VI
+ // + 2 for vcc
+ // + 2 for xnack_mask
+ // + 2 for flat_scratch
+ // + 4 for the registers reserved for the scratch resource register
+ // + 1 for the register reserved for the scratch wave offset. (By excluding
+ //     this register from the list to consider, when it is being used for
+ //     the scratch wave offset and there are no other free SGPRs, the value
+ //     simply stays in this register.)
+ // ----
+ //  13
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven’t added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+ if (!MRI.isAllocatable(Reg) ||
+ TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
+ continue;
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
ScratchWaveOffsetReg = Reg;
@@ -160,7 +209,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;
if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
@@ -223,6 +271,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void SIFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+
+}
+
void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS) const {
@@ -243,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RS->addScavengingFrameIndex(ScavengeFI);
}
}
+
+void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Get work group ID SGPR, and make it live-in again.
+ unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
+ MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
+ MBB.addLiveIn(WorkGroupIDSGPR);
+
+ // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
+ // order to spill it to scratch.
+ unsigned WorkGroupIDVGPR =
+ MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
+ .addReg(WorkGroupIDSGPR);
+
+ // Spill work group ID.
+ int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
+ TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
+ WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+
+ // Get work item ID VGPR, and make it live-in again.
+ unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
+ MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
+ MBB.addLiveIn(WorkItemIDVGPR);
+
+ // Spill work item ID.
+ int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
+ TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
+ WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+ }
+}
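
The flat-scratch initialization added to emitPrologue above boils down to two values written into FLAT_SCR. A minimal arithmetic sketch, with hypothetical helper and value names, of what the S_MOV_B32 / S_ADD_U32 / S_LSHR_B32 sequence computes (per the in-code comments, the high half of FLAT_SCRATCH_INIT holds the size in bytes and the low half is the base to which the wave offset is added):

    #include <cstdint>

    struct FlatScr { uint32_t Lo, Hi; };

    // FLAT_SCR_LO <- FlatScrInitHi (size in bytes)
    // FLAT_SCR_HI <- (FlatScrInitLo + WaveByteOffset) >> 8, i.e. 256-byte units
    static FlatScr computeFlatScr(uint32_t FlatScrInitLo, uint32_t FlatScrInitHi,
                                  uint32_t WaveByteOffset) {
      return { FlatScrInitHi, (FlatScrInitLo + WaveByteOffset) >> 8 };
    }

    // Example: FlatScrInitLo = 0x2000, WaveByteOffset = 0x1000
    //   -> FLAT_SCR_HI = 0x3000 >> 8 = 0x30 (0x3000 bytes / 256)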
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index a9152fd8b2aa..37417d098f31 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -23,10 +23,16 @@ public:
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+
+private:
+ /// \brief Emits debugger prologue.
+ void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 544867513d9c..51241cf0a432 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18,33 +18,46 @@
#include <cmath>
#endif
-#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
using namespace llvm;
-SITargetLowering::SITargetLowering(TargetMachine &TM,
- const AMDGPUSubtarget &STI)
+// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+ "amdgpu-fast-fdiv",
+ cl::desc("Enable faster 2.5 ulp fdiv"),
+ cl::init(false));
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+ if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+ return AMDGPU::SGPR0 + Reg;
+ }
+ }
+ llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+ const SISubtarget &STI)
: AMDGPUTargetLowering(TM, STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
- addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
@@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
- setOperationAction(ISD::ADD, MVT::i32, Legal);
- setOperationAction(ISD::ADDC, MVT::i32, Legal);
- setOperationAction(ISD::ADDE, MVT::i32, Legal);
- setOperationAction(ISD::SUBC, MVT::i32, Legal);
- setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
// We need to custom lower vector stores from local memory
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-
setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
- for (MVT VT : MVT::integer_valuetypes()) {
- if (VT == MVT::i64)
- continue;
-
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
- }
-
- for (MVT VT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
-
- setTruncStoreAction(MVT::i64, MVT::i32, Expand);
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
-
- setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
-
- setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
- setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
-
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
-
- setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::STORE, MVT::v2i64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
-
- // These should use UDIVREM, so set them to expand
- setOperationAction(ISD::UDIV, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT, MVT::i1, Promote);
-
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
-
-
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
- switch(Op) {
+ switch (Op) {
case ISD::LOAD:
case ISD::STORE:
case ISD::BUILD_VECTOR:
@@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+ // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
+ // and output demarshalling
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ // We can't return success/failure, only the old value,
+ // so let LLVM add the comparison.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
+ if (getSubtarget()->hasFlatAddressSpace()) {
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+ // This is s_memtime on SI and s_memrealtime on VI.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ setOperationAction(ISD::TRAP, MVT::Other, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+ if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Custom);
+ setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
@@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FCANONICALIZE);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
}
+const SISubtarget *SITargetLowering::getSubtarget() const {
+ return static_cast<const SISubtarget *>(Subtarget);
+}
+
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &CI,
+ unsigned IntrID) const {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = true;
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
@@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume the we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
@@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (DL.getTypeStoreSize(Ty) < 4)
return isLegalMUBUFAddressingMode(AM);
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
return isLegalMUBUFAddressingMode(AM);
case AMDGPUAS::LOCAL_ADDRESS:
@@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ // For an unknown address space, this usually means that this is for some
+ // reason being used for pure arithmetic, and not based on some addressing
+ // computation. We don't have instructions that compute pointers with any
+ // addressing modes, so treat them as having no offset like flat
+ // instructions.
return isLegalFlatAddressingMode(AM);
default:
@@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple() || VT == MVT::Other)
return false;
- // TODO - CI+ supports unaligned memory accesses, but this requires driver
- // support.
-
- // XXX - The only mention I see of this in the ISA manual is for LDS direct
- // reads the "byte address and must be dword aligned". Is it also true for the
- // normal loads and stores?
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
bool AlignedBy4 = (Align % 4 == 0);
if (IsFast)
*IsFast = AlignedBy4;
+
return AlignedBy4;
}
+ if (Subtarget->hasUnalignedBufferAccess()) {
+ // If we have a uniform constant load, it still requires using a slow
+ // buffer instruction if unaligned.
+ if (IsFast) {
+ *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+ (Align % 4 == 0) : true;
+ }
+
+ return true;
+ }
+
// Smaller than dword value must be aligned.
- // FIXME: This should be allowed on CI+
if (VT.bitsLT(MVT::i32))
return false;
@@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
-
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers
- if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
- isa<GlobalValue>(Ptr))
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
- const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
return TII->isInlineConstant(Imm);
}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- SDLoc SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+
+ // SimplifySetCC uses this function to determine whether or not it should
+ // create setcc with i1 operands. We don't have instructions for i1 setcc.
+ if (VT == MVT::i1 && Op == ISD::SETCC)
+ return false;
+
+ return TargetLowering::isTypeDesirableForOp(Op, VT);
+}
+
+SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Offset, SL, PtrVT));
+}
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset, bool Signed) const {
+ const DataLayout &DL = DAG.getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
@@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
if (MemVT.isFloatingPoint())
ExtTy = ISD::EXTLOAD;
- return DAG.getLoad(ISD::UNINDEXED, ExtTy,
- VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
+ SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
+ return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
+ PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
const Function *Fn = MF.getFunction();
- DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
- return SDValue();
+ return DAG.getEntryNode();
}
- // FIXME: We currently assume all calling conventions are kernels.
+ // Create stack objects that are used for emitting debugger prologue if
+ // "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
@@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments(
const ISD::InputArg &Arg = Ins[i];
// First check if it's a PS input addr
- if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+ if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
!Arg.Flags.isByVal() && PSInputNum <= 15) {
if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
@@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments(
++PSInputNum;
}
- // Second split vertices into their elements
- if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
+ if (AMDGPU::isShader(CallConv)) {
+ // Second split vertices into their elements
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
+ }
+ } else {
+ Splits.push_back(Arg);
}
-
- } else if (Info->getShaderType() != ShaderType::COMPUTE) {
- Splits.push_back(Arg);
}
}
@@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (Info->getShaderType() == ShaderType::PIXEL &&
+ if (CallConv == CallingConv::AMDGPU_PS &&
((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11)))) {
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
Info->markPSInputAllocated(0);
Info->PSInputEna |= 1;
}
- if (Info->getShaderType() == ShaderType::COMPUTE) {
+ if (!AMDGPU::isShader(CallConv)) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
+
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+ } else {
+ assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(DispatchPtrReg);
}
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments(
NumElements = Arg.VT.getVectorNumElements() - NumElements;
Regs.append(NumElements, DAG.getUNDEF(VT));
- InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
continue;
}
@@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = Info->addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("work group id x is always enabled");
+ }
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
@@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Info->hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
- unsigned PrivateSegmentWaveByteOffsetReg
- = Info->addPrivateSegmentWaveByteOffset();
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ if (AMDGPU::isShader(CallConv)) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
@@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments(
  // Now that we've figured out where the scratch register inputs are, see if we
// should reserve the arguments and use them directly.
-
bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+ // Record that we know we have non-spill stack objects so we don't need to
+ // check all stack objects later.
+ if (HasStackObjects)
+ Info->setHasNonSpillStackObjects(true);
if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
@@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("workitem id x should always be enabled");
+ }
if (Info->hasWorkItemIDY()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
@@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
-SDValue SITargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Info->getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(CallConv))
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
@@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
-MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
-
- switch (MI->getOpcode()) {
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("m0", AMDGPU::M0)
+ .Case("exec", AMDGPU::EXEC)
+ .Case("exec_lo", AMDGPU::EXEC_LO)
+ .Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Default(AMDGPU::NoRegister);
+
+ if (Reg == AMDGPU::NoRegister) {
+ report_fatal_error(Twine("invalid register name \""
+ + StringRef(RegName) + "\"."));
+
+ }
+
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ report_fatal_error(Twine("invalid register \""
+ + StringRef(RegName) + "\" for subtarget."));
+ }
+
+ switch (Reg) {
+ case AMDGPU::M0:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ if (VT.getSizeInBits() == 32)
+ return Reg;
+ break;
+ case AMDGPU::EXEC:
+ case AMDGPU::FLAT_SCR:
+ if (VT.getSizeInBits() == 64)
+ return Reg;
+ break;
default:
- return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ llvm_unreachable("missing register type checking");
+ }
+
+ report_fatal_error(Twine("invalid type for register \""
+ + StringRef(RegName) + "\"."));
+}
+
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator SplitPoint(&MI);
+ ++SplitPoint;
+
+ if (SplitPoint == BB->end()) {
+ // Don't bother with a new block.
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return BB;
+ }
+
+ MachineFunction *MF = BB->getParent();
+ MachineBasicBlock *SplitBB
+ = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+ // Fix the block phi references to point to the new block for the defs in the
+ // second piece of the block.
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ for (MachineInstr &MI : *Succ) {
+ if (!MI.isPHI())
+ break;
+
+ for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &FromBB = MI.getOperand(I);
+ if (BB == FromBB.getMBB()) {
+ FromBB.setMBB(SplitBB);
+ break;
+ }
+ }
+ }
+ }
+
+ MF->insert(++MachineFunction::iterator(BB), SplitBB);
+ SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
+ SplitBB->transferSuccessors(BB);
+ BB->addSuccessor(SplitBB);
+
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return SplitBB;
+}
+
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_INIT_M0: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addOperand(MI.getOperand(0));
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::BRANCH:
return BB;
+ case AMDGPU::GET_GROUPSTATICSIZE: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
+ .addOperand(MI.getOperand(0))
+ .addImm(MFI->LDSSize);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_KILL:
+ return splitKillBlock(MI, BB);
+ default:
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
return BB;
}
@@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerTrig(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::GlobalAddress: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalAddress(MFI, Op, DAG);
}
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+ case ISD::TRAP: return lowerTRAP(Op, DAG);
}
return SDValue();
}
@@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- // A FrameIndex node represents a 32-bit offset into scratch memory. If
- // the high bit of a frame index offset were to be set, this would mean
- // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
- // scratch buffer, with 64 being the number of threads per wave.
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+ // high bit of a frame index offset were to be set, this would mean that it
+ // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+ // buffer, with 64 being the number of threads per wave.
//
- // If we know the machine uses less than 128GB of scratch, then we can
- // amrk the high bit of the FrameIndex node as known zero,
- // which is important, because it means in most situations we can
- // prove that values derived from FrameIndex nodes are non-negative.
- // This enables us to take advantage of more addressing modes when
- // accessing scratch buffers, since for scratch reads/writes, the register
- // offset must always be positive.
+ // The maximum private allocation for the entire GPU is 4G, and we are
+ // concerned with the largest the index could ever be for an individual
+  // workitem. This will occur with the minimum dispatch size. If a program
+ // requires more, the dispatch size will be reduced.
+ //
+ // With this limit, we can mark the high bit of the FrameIndex node as known
+ // zero, which is important, because it means in most situations we can prove
+ // that values derived from FrameIndex nodes are non-negative. This enables us
+ // to take advantage of more addressing modes when accessing scratch buffers,
+ // since for scratch reads/writes, the register offset must always be
+ // positive.
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- if (Subtarget->enableHugeScratchBuffer())
- return TFI;
+ uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+ // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+ // granularity. It is probably a full wave.
+ uint64_t MinGranularity = 32;
+
+ unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+ DAG.getValueType(ExtVT));
+}
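
A quick standalone check of the known-bits arithmetic above, illustration only: it assumes the same 4 GiB maximum allocation and 32-lane granularity used in the code, and log2u64 is a hypothetical stand-in for llvm::Log2_64.

#include <cassert>
#include <cstdint>

static unsigned log2u64(uint64_t V) {
  unsigned R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

int main() {
  const uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; // 4 GiB
  const uint64_t MinGranularity = 32;                            // half a wave
  const unsigned KnownBits = log2u64(MaxGPUAlloc / MinGranularity);
  assert(KnownBits == 27); // the FrameIndex is asserted zero-extended from i27
  return 0;
}
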
+
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+ if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case AMDGPUIntrinsic::amdgcn_if:
+ case AMDGPUIntrinsic::amdgcn_else:
+ case AMDGPUIntrinsic::amdgcn_break:
+ case AMDGPUIntrinsic::amdgcn_if_break:
+ case AMDGPUIntrinsic::amdgcn_else_break:
+ case AMDGPUIntrinsic::amdgcn_loop:
+ case AMDGPUIntrinsic::amdgcn_end_cf:
+ return true;
+ }
+}
+
+void SITargetLowering::createDebuggerPrologueStackObjects(
+ MachineFunction &MF) const {
+  // Create stack objects that are used for emitting the debugger prologue.
+ //
+ // Debugger prologue writes work group IDs and work item IDs to scratch memory
+  // at a fixed location in the following format:
+ // offset 0: work group ID x
+ // offset 4: work group ID y
+ // offset 8: work group ID z
+ // offset 16: work item ID x
+ // offset 20: work item ID y
+ // offset 24: work item ID z
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ int ObjectIdx = 0;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Create fixed stack object for work group ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+ Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+ // Create fixed stack object for work item ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+ Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+ }
}
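
The fixed layout created above can be pictured as the following struct; DebuggerPrologueScratch is a hypothetical name used only for this illustration, not a type the backend defines.

#include <cstddef>
#include <cstdint>

struct DebuggerPrologueScratch {
  uint32_t WorkGroupID[3]; // offsets 0, 4, 8
  uint32_t Unused;         // offset 12
  uint32_t WorkItemID[3];  // offsets 16, 20, 24
};

static_assert(offsetof(DebuggerPrologueScratch, WorkGroupID) == 0, "");
static_assert(offsetof(DebuggerPrologueScratch, WorkItemID) == 16, "");
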
/// This transforms the control flow intrinsics to get the branch destination as
@@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDNode *Intr = BRCOND.getOperand(1).getNode();
SDValue Target = BRCOND.getOperand(2);
SDNode *BR = nullptr;
+ SDNode *SetCC = nullptr;
if (Intr->getOpcode() == ISD::SETCC) {
// As long as we negate the condition everything is fine
- SDNode *SetCC = Intr;
- assert(SetCC->getConstantOperandVal(1) == 1);
- assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
- ISD::SETNE);
+ SetCC = Intr;
Intr = SetCC->getOperand(0).getNode();
} else {
@@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
Target = BR->getOperand(1);
}
- assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ if (!isCFIntrinsic(Intr)) {
+ // This is a uniform branch so we don't need to legalize.
+ return BRCOND;
+ }
+
+ assert(!SetCC ||
+ (SetCC->getConstantOperandVal(1) == 1 &&
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+ ISD::SETNE));
// Build the result and
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
@@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+ SelectionDAG &DAG) const {
+ SDLoc SL;
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, SL, MVT::i64));
+
+ // TODO: Use custom target PseudoSourceValue.
+ // TODO: We should use the value from the IR intrinsic call, but it might not
+  // be available, and it is unclear how we would get it here.
+ Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ MinAlign(64, StructOffset),
+ MachineMemOperand::MOInvariant);
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+ SDValue Src = ASC->getOperand(0);
+
+ // FIXME: Really support non-0 null pointers.
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+ // flat -> local/private
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ NonNull, Ptr, SegmentNullPtr);
+ }
+ }
+
+ // local/private -> flat
+ if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull
+ = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue CvtPtr
+ = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ FlatNullPtr);
+ }
+ }
+
+ // global <-> flat are no-ops and never emitted.
+
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+ *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+ return DAG.getUNDEF(ASC->getValueType(0));
+}
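
A scalar sketch of the conversions this lowering selects, under the same null conventions (0 for flat, -1 for local/private); the function names are hypothetical, and ApertureHi stands for the 32-bit value loaded by getSegmentAperture.

#include <cstdint>

// flat -> local/private: null check against 0, then truncate to 32 bits.
uint32_t flatToSegment(uint64_t Flat) {
  return Flat == 0 ? UINT32_MAX : static_cast<uint32_t>(Flat);
}

// local/private -> flat: null check against -1, then pair {Seg, ApertureHi}.
uint64_t segmentToFlat(uint32_t Seg, uint32_t ApertureHi) {
  return Seg == UINT32_MAX
             ? 0
             : (static_cast<uint64_t>(ApertureHi) << 32) | Seg;
}
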
+
+static bool shouldEmitGOTReloc(const GlobalValue *GV,
+ const TargetMachine &TM) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool
+SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // We can fold offsets for anything that doesn't require a GOT relocation.
+ return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
+}
+
+static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+ SDLoc DL, unsigned Offset, EVT PtrVT,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) {
+ // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
+ // lowered to the following code sequence:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // What we want here is an offset from the value returned by s_getpc
+ // (which is the address of the s_add_u32 instruction) to the global
+ // variable, but since the encoding of $symbol starts 4 bytes after the start
+ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order to
+ // compute the correct address.
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
+}
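
A worked example of the +4 adjustment above, using hypothetical addresses; only the relative layout matters.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t PC = 0x1000;          // s_getpc_b64 result: address of the s_add_u32
  const uint64_t SymbolField = PC + 4; // the $symbol literal starts 4 bytes later
  const uint64_t GV = 0x2000;          // hypothetical global variable address

  // The fixup writes (target - SymbolField). Making the target GV + 4 turns
  // the literal into exactly GV - PC, which is what s_add_u32/s_addc_u32 need.
  const uint64_t Literal = (GV + 4) - SymbolField;
  assert(PC + Literal == GV);
  return 0;
}
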
+
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
+ EVT PtrVT = Op.getValueType();
+
+ if (!shouldEmitGOTReloc(GV, getTargetMachine()))
+ return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+
+ SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+ SIInstrInfo::MO_GOTPCREL);
+
+ Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ MachineMemOperand::MOInvariant);
}
-SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
- SDValue V) const {
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "trap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ DAG.getContext()->diagnose(NoTrap);
+
+ // Emit s_endpgm.
+
+ // FIXME: This should really be selected to s_trap, but that requires
+  // setting up the trap handler for it to do anything.
+ return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+ const SDLoc &DL, SDValue V) const {
+ // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+ // the destination register.
+ //
// We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
// so we will end up with redundant moves to m0.
//
- // We can't use S_MOV_B32, because there is no way to specify m0 as the
- // destination register.
- //
- // We have to use them both. Machine cse will combine all the S_MOV_B32
- // instructions and the register coalescer eliminate the extra copies.
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
- return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
- SDValue(M0, 0), SDValue()); // Glue
- // A Null SDValue creates
- // a glue result.
+ // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+
+ // A Null SDValue creates a glue result.
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
+ V, Chain);
+ return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
@@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
DAG.getValueType(VT));
}
+static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "non-hsa intrinsic with hsa target",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
+static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_dispatch_ptr:
+ case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdHsaOS()) {
- DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(),
- "hsa intrinsic without hsa target");
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+ DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
}
+ auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
-
+ TRI->getPreloadedValue(MF, Reg), VT);
+ }
+ case Intrinsic::amdgcn_implicitarg_ptr: {
+ unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+ }
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Reg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
+ case Intrinsic::amdgcn_rcp:
+ return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq:
+ case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq_legacy: {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ return emitRemovedIntrinsicError(DAG, DL, VT);
+
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+ }
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, DL, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, DL, VT));
+ }
case Intrinsic::r600_read_ngroups_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
case Intrinsic::r600_read_ngroups_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Y, false);
case Intrinsic::r600_read_ngroups_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Z, false);
case Intrinsic::r600_read_global_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
case Intrinsic::r600_read_global_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
case Intrinsic::r600_read_global_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
- case Intrinsic::AMDGPU_read_workdim:
+ case Intrinsic::amdgcn_read_workdim:
+ case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
// Really only 2 bits.
return lowerImplicitZextParam(DAG, Op, MVT::i8,
getImplicitParameterOffset(MFI, GRID_DIM));
+ case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
+ case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
@@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
- case AMDGPUIntrinsic::SI_sample:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
- case AMDGPUIntrinsic::SI_sampleb:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
- case AMDGPUIntrinsic::SI_sampled:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
- case AMDGPUIntrinsic::SI_samplel:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
- case AMDGPUIntrinsic::AMDGPU_fract:
- case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
- DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
case AMDGPUIntrinsic::SI_fs_constant: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
SDValue Glue = M0.getValue(1);
@@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
Glue);
}
+ case Intrinsic::amdgcn_sin:
+ return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_cos:
+ return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_log_clamp: {
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
+
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+ }
+ case Intrinsic::amdgcn_ldexp:
+ return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::amdgcn_fract:
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_fmas:
+ return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
+
+ case Intrinsic::amdgcn_div_fixup:
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::amdgcn_trig_preop:
+ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_scale: {
+ // 3rd parameter required to be a constant.
+ const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ if (!Param)
+ return DAG.getUNDEF(VT);
+
+    // Translate to the operands expected by the machine instruction. The
+    // first operand must match either the numerator or the denominator, as
+    // selected by the constant third parameter.
+ SDValue Numerator = Op.getOperand(1);
+ SDValue Denominator = Op.getOperand(2);
+
+ // Note this order is opposite of the machine instruction's operations,
+ // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+ // intrinsic has the numerator as the first operand to match a normal
+ // division operation.
+
+ SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+ Denominator, Numerator);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
}
+SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case AMDGPUIntrinsic::AMDGPU_kill: {
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
+ if (!K->isNegative())
+ return Chain;
+ }
+
+ return Op;
+ }
default:
return SDValue();
}
@@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
- if (Op.getValueType().isVector()) {
- assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
- "Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned NumElements = Op.getValueType().getVectorNumElements();
- assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+ if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate to 1 bit.
- switch (Load->getAddressSpace()) {
- default: break;
- case AMDGPUAS::CONSTANT_ADDRESS:
- if (isMemOpUniform(Load))
- break;
- // Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
- // loads.
- //
- // Fall-through
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::PRIVATE_ADDRESS:
- if (NumElements >= 8)
- return SplitVectorLoad(Op, DAG);
-
- // v4 loads are supported for private and global memory.
- if (NumElements <= 4)
- break;
- // fall-through
- case AMDGPUAS::LOCAL_ADDRESS:
- // If properly aligned, if we split we might be able to use ds_read_b64.
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, MVT::i8, MMO);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ if (!MemVT.isVector())
+ return SDValue();
+
+ assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+ "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+ unsigned AS = Load->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ AS, Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ unsigned NumElements = MemVT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requires ments as global and private
+ // loads.
+ //
+ // Fall-through
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorLoad(Op, DAG);
+ // v4 loads are supported for private and global memory.
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ // Depending on the setting of the private_element_size field in the
+ // resource descriptor, we can only make private accesses up to a certain
+ // size.
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorLoad(Load, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ case 16:
+ // Same as global/flat
+ if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
}
}
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
- return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-}
+ if (NumElements == 2)
+ return SDValue();
-SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
- const SDValue &Op,
- SelectionDAG &DAG) const {
- return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4));
+    // If properly aligned, splitting might let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
+ }
+ default:
+ return SDValue();
+ }
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
- SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+ SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}
@@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}
}
- if (Unsafe) {
+ const SDNodeFlags *Flags = Op->getFlags();
+
+ if (Unsafe || Flags->hasAllowReciprocal()) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
SDNodeFlags Flags;
@@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDValue FastLowered = LowerFastFDIV(Op, DAG);
- if (FastLowered.getNode())
+ if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
return FastLowered;
- // This uses v_rcp_f32 which does not handle denormals. Let this hit a
- // selection error for now rather than do something incorrect.
- if (Subtarget->hasFP32Denormals())
- return SDValue();
-
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+  // Faster 2.5 ulp fdiv when the -amdgpu-fast-fdiv flag is used.
+ if (EnableAMDGPUFastFDIV) {
+ // This does not support denormals.
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+ const APFloat K0Val(BitsToFloat(0x6f800000));
+ const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+ const APFloat K1Val(BitsToFloat(0x2f800000));
+ const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+ SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+ // TODO: Should this propagate fast-math-flags?
- const APFloat K0Val(BitsToFloat(0x6f800000));
- const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
- const APFloat K1Val(BitsToFloat(0x2f800000));
- const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+ // rcp does not support denormals.
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ }
+
+ // Generates more precise fpdiv32.
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // Denominator is scaled to not be denormal, so using rcp is ok.
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
- // TODO: Should this propagate fast-math-flags?
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
+ SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
+ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
+ SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ SDValue Scale = NumeratorScaled.getValue(1);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
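
A scalar model of the EnableAMDGPUFastFDIV path in LowerFDIV32 above, illustration only: a plain division stands in for V_RCP_F32, and the two hex constants are 2^96 and 2^-32. (The non-fast path instead refines a scaled reciprocal with FMAs, div_fmas and div_fixup, which this sketch does not model.)

#include <cmath>

static float rcp(float X) { return 1.0f / X; } // stand-in for V_RCP_F32

float fastFDiv32(float LHS, float RHS) {
  const float K0 = 0x1.0p+96f; // 0x6f800000
  const float K1 = 0x1.0p-32f; // 0x2f800000

  // If |RHS| is huge, pre-scale it by 2^-32 so its reciprocal does not
  // underflow, then apply the same scale factor to the final product.
  const float Scale = std::fabs(RHS) > K0 ? K1 : 1.0f;
  return Scale * (LHS * rcp(RHS * Scale));
}
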
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
@@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
- // These stores are legal.
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (VT.isVector() && VT.getVectorNumElements() > 4)
- return ScalarizeVectorStore(Op, DAG);
- return SDValue();
+ if (VT == MVT::i1) {
+ return DAG.getTruncStore(Store->getChain(), DL,
+ DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+ Store->getBasePtr(), MVT::i1, Store->getMemOperand());
}
- SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Ret.getNode())
- return Ret;
+ assert(VT.isVector() &&
+ Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (VT.isVector() && VT.getVectorNumElements() >= 8)
+ unsigned AS = Store->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ AS, Store->getAlignment())) {
+ return expandUnalignedStore(Store, DAG);
+ }
+
+ unsigned NumElements = VT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorStore(Store, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case 16:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
+ }
+ }
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
return SplitVectorStore(Op, DAG);
- if (VT == MVT::i1)
- return DAG.getTruncStore(Store->getChain(), DL,
- DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
- Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+ if (NumElements == 2)
+ return Op;
- return SDValue();
+    // If properly aligned, splitting might let us use ds_write_b64.
+ return SplitVectorStore(Op, DAG);
+ }
+ default:
+ llvm_unreachable("unhandled address space");
+ }
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
}
}
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+ AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+ assert(AtomicNode->isCompareAndSwap());
+ unsigned AS = AtomicNode->getAddressSpace();
+
+ // No custom lowering required for local address space
+ if (!isFlatGlobalAddrSpace(AS))
+ return Op;
+
+  // Non-local address spaces require custom lowering for atomic compare and
+  // swap; the new and old values are packed into a v2i32 (or v2i64 for the
+  // _X2 variants).
+ SDLoc DL(Op);
+ SDValue ChainIn = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ SDValue Old = Op.getOperand(2);
+ SDValue New = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ MVT SimpleVT = VT.getSimpleVT();
+ MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
+ SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
+ SDValue Ops[] = { ChainIn, Addr, NewOld };
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
+ Ops, VT, AtomicNode->getMemOperand());
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
}
}
- // We are primarily trying to catch operations on illegal vector types
- // before they are expanded.
- // For scalars, we can use the more flexible method of checking masked bits
- // after legalization.
- if (!DCI.isBeforeLegalize() ||
- !SrcVT.isVector() ||
- SrcVT.getVectorElementType() != MVT::i8) {
- return SDValue();
- }
-
- assert(DCI.isBeforeLegalize() && "Unexpected legal type");
-
- // Weird sized vectors are a pain to handle, but we know 3 is really the same
- // size as 4.
- unsigned NElts = SrcVT.getVectorNumElements();
- if (!SrcVT.isSimple() && NElts != 3)
- return SDValue();
-
- // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
- // prevent a mess from expanding to v4i32 and repacking.
- if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
- EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
- EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
- LoadSDNode *Load = cast<LoadSDNode>(Src);
-
- unsigned AS = Load->getAddressSpace();
- unsigned Align = Load->getAlignment();
- Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
-
- // Don't try to replace the load if we have to expand it due to alignment
- // problems. Otherwise we will end up scalarizing the load, and trying to
- // repack into the vector for no real reason.
- if (Align < ABIAlignment &&
- !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
- return SDValue();
- }
-
- SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
- Load->getChain(),
- Load->getBasePtr(),
- LoadVT,
- Load->getMemOperand());
-
- // Make sure successors of the original load stay after it by updating
- // them to use the new Chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
-
- SmallVector<SDValue, 4> Elts;
- if (RegVT.isVector())
- DAG.ExtractVectorElements(NewLoad, Elts);
- else
- Elts.push_back(NewLoad);
-
- SmallVector<SDValue, 4> Ops;
-
- unsigned EltIdx = 0;
- for (SDValue Elt : Elts) {
- unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
- for (unsigned I = 0; I < ComponentsInElt; ++I) {
- unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
- SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
- DCI.AddToWorklist(Cvt.getNode());
- Ops.push_back(Cvt);
- }
-
- ++EltIdx;
- }
-
- assert(Ops.size() == NElts);
-
- return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
- }
-
return SDValue();
}
/// \brief Return true if the given offset Size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
- const AMDGPUSubtarget &STI) {
+ const SISubtarget &STI) {
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
  // MUBUF instructions have a 12-bit offset in bytes.
@@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
case AMDGPUAS::CONSTANT_ADDRESS: {
// SMRD instructions have an 8-bit offset in dwords on SI and
// a 20-bit offset in bytes on VI.
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return isUInt<20>(OffsetSize);
else
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
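
A quick check of the SMRD offset limits described above; the helper names are hypothetical and used only for this illustration.

#include <cassert>
#include <cstdint>

static bool fitsSMRDOffsetSI(uint64_t Bytes) {
  return Bytes % 4 == 0 && Bytes / 4 < 256; // 8-bit offset, counted in dwords
}
static bool fitsSMRDOffsetVI(uint64_t Bytes) {
  return Bytes < (1u << 20);                // 20-bit offset, counted in bytes
}

int main() {
  assert(fitsSMRDOffsetSI(1020) && !fitsSMRDOffsetSI(1022) && !fitsSMRDOffsetSI(1024));
  assert(fitsSMRDOffsetVI(1024) && !fitsSMRDOffsetVI(1u << 20));
  return 0;
}
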
@@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
- if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
+ if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
if (DCI.isBeforeLegalize())
return SDValue();
+ if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
+ return Base;
+
SelectionDAG &DAG = DCI.DAG;
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i64) {
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
+ }
+
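
A scalar sanity check of the (or x, zero_extend y) fold above, for illustration: only the low half of x participates in the OR, while the high half passes through unchanged.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  const uint32_t Y = 0x0f0f0f0f;

  const uint32_t Lo = static_cast<uint32_t>(X) | Y;   // or i32 y, lo_32(x)
  const uint32_t Hi = static_cast<uint32_t>(X >> 32); // hi_32(x), unchanged
  const uint64_t Folded = (static_cast<uint64_t>(Hi) << 32) | Lo;

  assert(Folded == (X | Y));
  return 0;
}
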
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
@@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return DAG.getConstant(0, SDLoc(N), MVT::i1);
}
+ if (N->getOperand(0).isUndef())
+ return DAG.getUNDEF(MVT::i1);
+
return SDValue();
}
+// Constant fold canonicalize.
+SDValue SITargetLowering::performFCanonicalizeCombine(
+ SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ const APFloat &C = CFP->getValueAPF();
+
+ // Flush denormals to 0 if not enabled.
+ if (C.isDenormal()) {
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+ }
+
+ if (C.isNaN()) {
+ EVT VT = N->getValueType(0);
+ APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+ if (C.isSignaling()) {
+ // Quiet a signaling NaN.
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ // Make sure it is the canonical NaN bitpattern.
+ //
+ // TODO: Can we use -1 as the canonical NaN value since it's an inline
+ // immediate?
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ return SDValue(CFP, 0);
+}
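
A bit-level sketch of the f32 cases above, simplified to a single denormal flag; illustration only, not the backend's code path.

#include <cassert>
#include <cstdint>

static uint32_t canonicalizeF32Bits(uint32_t Bits, bool HasFP32Denormals) {
  const uint32_t Exp = (Bits >> 23) & 0xff;
  const uint32_t Mant = Bits & 0x7fffff;
  if (Exp == 0 && Mant != 0 && !HasFP32Denormals)
    return 0;          // flush denormal to +0.0
  if (Exp == 0xff && Mant != 0)
    return 0x7fc00000; // canonical quiet NaN (also quiets signaling NaNs)
  return Bits;
}

int main() {
  assert(canonicalizeF32Bits(0x00000001, false) == 0);          // denormal
  assert(canonicalizeF32Bits(0x7fa00000, true) == 0x7fc00000);  // signaling NaN
  assert(canonicalizeF32Bits(0x3f800000, false) == 0x3f800000); // 1.0f unchanged
  return 0;
}
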
+
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) {
+ ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ if (Signed) {
+ if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ return SDValue();
+ } else {
+ if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ return SDValue();
+ }
+
+ EVT VT = K0->getValueType(0);
+ return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
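
A scalar illustration of the rewrite this helper enables: a clamp written as min(max(x, K0), K1) with K0 < K1 is exactly the median of the three values, which is what the med3 instruction computes.

#include <algorithm>
#include <cassert>
#include <initializer_list>

// Median of three, standing in for V_MED3_I32.
static int med3(int A, int B, int C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const int K0 = 0, K1 = 255; // K0 < K1, e.g. clamping to a byte range
  for (int X : {-5, 0, 17, 255, 300})
    assert(std::min(std::max(X, K0), K1) == med3(X, K0, K1));
  return 0;
}
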
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) {
+ ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ // Ordered >= (although NaN inputs should have folded away by now).
+ APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+ if (Cmp == APFloat::cmpGreaterThan)
+ return SDValue();
+
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+ // give the other result, which is different from med3 with a NaN input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
+
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
unsigned Opc = N->getOpcode();
@@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
  // Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- // max(max(a, b), c)
- if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0.getOperand(0),
- Op0.getOperand(1),
- Op1);
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+ // max(max(a, b), c) -> max3(a, b, c)
+ // min(min(a, b), c) -> min3(a, b, c)
+ if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0.getOperand(0),
+ Op0.getOperand(1),
+ Op1);
+ }
+
+ // Try commuted.
+ // max(a, max(b, c)) -> max3(a, b, c)
+ // min(a, min(b, c)) -> min3(a, b, c)
+ if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0,
+ Op1.getOperand(0),
+ Op1.getOperand(1));
+ }
}
- // max(a, max(b, c))
- if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0,
- Op1.getOperand(0),
- Op1.getOperand(1));
+ // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+ if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+ return Med3;
+ }
+
+ if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+ return Med3;
+ }
+
+ // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+ if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+ (Opc == AMDGPUISD::FMIN_LEGACY &&
+ Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
+ N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
+ return Res;
}
return SDValue();
@@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
- case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+ case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
- case ISD::UMIN: {
+ case ISD::UMIN:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return performMin3Max3Combine(N, DCI);
+ return performMinMaxCombine(N, DCI);
break;
}
@@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3: {
unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
-
SDValue Src = N->getOperand(0);
+
+ // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (Src.getOpcode() == ISD::SRL) {
+ // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
+ unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
+ if (SrcOffset < 32 && SrcOffset % 8 == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
+ MVT::f32, Src.getOperand(0));
+ }
+ }
+ }
+
APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
APInt KnownZero, KnownOne;
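
The srl fold above works because cvt_f32_ubyteN reads byte N of its source, so shifting the source right by a whole number of bytes is the same as selecting a higher byte index, provided the combined index still lands inside the 32-bit source. A standalone model of that byte-selection identity (helper names are made up):

#include <cassert>
#include <cstdint>

// Byte N of a 32-bit value, as cvt_f32_ubyteN would read it before converting.
static uint32_t selectByte(uint32_t X, unsigned N) { return (X >> (8 * N)) & 0xff; }

int main() {
  const uint32_t X = 0xAABBCCDDu;
  for (unsigned Offset = 0; Offset < 4; ++Offset) {
    for (unsigned Shift = 8; Shift < 32; Shift += 8) {
      unsigned SrcOffset = Shift / 8 + Offset;   // byte index after folding the srl
      if (SrcOffset < 4)                         // i.e. 8 * SrcOffset < 32 in the combine
        assert(selectByte(X >> Shift, Offset) == selectByte(X, SrcOffset));
    }
  }
}
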
@@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+ case ISD::ATOMIC_LOAD_UMAX:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
@@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performOrCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
+ case ISD::FCANONICALIZE:
+ return performFCanonicalizeCombine(N, DCI);
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RSQ_CLAMP:
+ case AMDGPUISD::LDEXP: {
+ SDValue Src = N->getOperand(0);
+ if (Src.isUndef())
+ return Src;
+ break;
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
-
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
if (TII->isInlineConstant(Node->getAPIntValue()))
@@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
SDNode *Users[4] = { };
unsigned Lane = 0;
- unsigned OldDmask = Node->getConstantOperandVal(0);
+ unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
// Try to figure out the used register components
@@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Adjust the writemask in the node
std::vector<SDValue> Ops;
+ Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
- Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
+ Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
// If we only got one lane, replace it with a copy
@@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ unsigned Opcode = Node->getMachineOpcode();
- if (TII->isMIMG(Node->getMachineOpcode()))
+ if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+ !TII->isGather4(Opcode))
adjustWritemask(Node, DAG);
- if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
- Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+ if (Opcode == AMDGPU::INSERT_SUBREG ||
+ Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
@@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
-void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- if (TII->isVOP3(MI->getOpcode())) {
+ if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
return;
}
- if (TII->isMIMG(*MI)) {
- unsigned VReg = MI->getOperand(0).getReg();
- unsigned Writemask = MI->getOperand(1).getImm();
+ if (TII->isMIMG(MI)) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
+ unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
unsigned BitsSet = 0;
for (unsigned i = 0; i < 4; ++i)
BitsSet += Writemask & (1 << i) ? 1 : 0;
@@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
- MI->setDesc(TII->get(NewOpcode));
+ unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
+ MI.setDesc(TII->get(NewOpcode));
MRI.setRegClass(VReg, RC);
return;
}
// Replace unused atomics with the no return version.
- int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+ int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- MI->setDesc(TII->get(NoRetAtomicOp));
- MI->RemoveOperand(0);
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+ return;
}
+ // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+ // instruction, because the return type of these instructions is a vec2 of
+ // the memory type, so it can be tied to the input operand.
+ // This means these instructions always have a use, so we need to add a
+ // special case to check if the atomic has only one extract_subreg use,
+ // which itself has no uses.
+ if ((Node->hasNUsesOfValue(1, 0) &&
+ Node->use_begin()->isMachineOpcode() &&
+ Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+ !Node->use_begin()->hasAnyUseOfValue(0))) {
+ unsigned Def = MI.getOperand(0).getReg();
+
+ // Change this into a noret atomic.
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+
+ // If we only remove the def operand from the atomic instruction, the
+ // extract_subreg will be left with a use of a vreg without a def.
+ // So we need to insert an implicit_def to avoid machine verifier
+ // errors.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Def);
+ }
return;
}
}
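
Both adjustWritemask and the MIMG handling above shrink the destination to the number of dmask bits that are actually used. A tiny standalone sketch of that mapping from a 4-bit dmask to a result width in 32-bit lanes (the register class names come from the hunk; the helpers are illustrative):

#include <cassert>

// Count the enabled lanes in a 4-bit MIMG dmask, as the loop above does.
static unsigned dmaskPopcount(unsigned Dmask) {
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += (Dmask & (1u << i)) ? 1 : 0;
  return BitsSet;
}

// Map the lane count to a register width: 1 -> VGPR_32, 2 -> VReg_64,
// 3 -> VReg_96, 4 -> VReg_128 (expressed here as a byte size for illustration).
static unsigned resultSizeInBytes(unsigned Dmask) { return 4 * dmaskPopcount(Dmask); }

int main() {
  assert(resultSizeInBytes(0x1) == 4);    // single channel
  assert(resultSizeInBytes(0x3) == 8);    // xy
  assert(resultSizeInBytes(0x7) == 12);   // xyz
  assert(resultSizeInBytes(0xf) == 16);   // xyzw
}
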
-static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
+ uint64_t Val) {
SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
- SDLoc DL,
+ const SDLoc &DL,
SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
// Build the half of the subregister with the constants before building the
// full 128-bit register. If we are building multiple resource descriptors,
@@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
-MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr,
- uint32_t RsrcDword1,
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr, uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const {
SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index f01b2c0d09f3..8e055eea58c2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -12,26 +12,26 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
#include "SIInstrInfo.h"
namespace llvm {
-class SITargetLowering : public AMDGPUTargetLowering {
- SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
+class SITargetLowering final : public AMDGPUTargetLowering {
+ SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const;
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
SDValue Chain, unsigned Offset, bool Signed) const;
- SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
- SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
-
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -43,8 +43,13 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,
@@ -55,14 +60,25 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
+
+ bool isCFIntrinsic(const SDNode *Intr) const;
+
+ void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
public:
- SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
+ SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+
+ const SISubtarget *getSubtarget() const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+ unsigned IntrinsicID) const override;
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;
@@ -89,21 +105,30 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const override;
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
+ MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
- MachineBasicBlock * BB) const override;
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -112,7 +137,7 @@ public:
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
- void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
int32_t analyzeImmediate(const SDNode *N) const;
@@ -120,17 +145,16 @@ public:
unsigned Reg, EVT VT) const override;
void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
- MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const;
- MachineSDNode *buildRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr,
- uint32_t RsrcDword1,
- uint64_t RsrcDword2And3) const;
+ MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr) const;
+ MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr,
+ uint32_t RsrcDword1, uint64_t RsrcDword2And3) const;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
+ SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
+ SDValue V) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 94e614750d2f..d24588d6c143 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -26,6 +26,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#define DEBUG_TYPE "si-insert-waits"
+
using namespace llvm;
namespace {
@@ -53,7 +55,7 @@ typedef std::pair<unsigned, unsigned> RegInterval;
class SIInsertWaits : public MachineFunctionPass {
private:
- static char ID;
+ const SISubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
@@ -67,6 +69,10 @@ private:
/// \brief Counter values we have already waited on.
Counters WaitedOn;
+ /// \brief Counter values that we must wait on before the next counter
+ /// increase.
+ Counters DelayedWaitOn;
+
/// \brief Counter values for last instruction issued.
Counters LastIssued;
@@ -87,6 +93,9 @@ private:
/// \brief Whether the machine function returns void
bool ReturnsVoid;
+ /// Whether the VCCZ bit is possibly corrupt
+ bool VCCZCorrupt;
+
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -99,13 +108,17 @@ private:
/// \brief Handle an instruction's async components
void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I);
+ MachineBasicBlock::iterator I,
+ const Counters& Increment);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Counts);
+ /// \brief Handle existing wait instructions (from intrinsics)
+ void handleExistingWait(MachineBasicBlock::iterator I);
+
/// \brief Do we need def2def checks?
bool unorderedDefines(MachineInstr &MI);
@@ -115,12 +128,20 @@ private:
/// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+ /// Return true if there are LGKM instructions that haven't been waited on
+ /// yet.
+ bool hasOutstandingLGKM() const;
+
public:
- SIInsertWaits(TargetMachine &tm) :
+ static char ID;
+
+ SIInsertWaits() :
MachineFunctionPass(ID),
+ ST(nullptr),
TII(nullptr),
TRI(nullptr),
- ExpInstrTypesSeen(0) { }
+ ExpInstrTypesSeen(0),
+ VCCZCorrupt(false) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -136,13 +157,28 @@ public:
} // End anonymous namespace
+INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+
char SIInsertWaits::ID = 0;
-const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
+
+FunctionPass *llvm::createSIInsertWaitsPass() {
+ return new SIInsertWaits();
+}
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
- return new SIInsertWaits(tm);
+static bool readsVCCZ(unsigned Opcode) {
+ return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+}
+
+bool SIInsertWaits::hasOutstandingLGKM() const {
+ return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
@@ -205,24 +241,23 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
return false;
// Check if this operand is the value being stored.
- // Special case for DS instructions, since the address
+ // Special case for DS/FLAT instructions, since the address
// operand comes before the value operand and it may have
// multiple data operands.
- if (TII->isDS(MI)) {
+ if (TII->isDS(MI) || TII->isFLAT(MI)) {
MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
if (Data && Op.isIdenticalTo(*Data))
return true;
+ }
+ if (TII->isDS(MI)) {
MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
if (Data0 && Op.isIdenticalTo(*Data0))
return true;
MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
- if (Data1 && Op.isIdenticalTo(*Data1))
- return true;
-
- return false;
+ return Data1 && Op.isIdenticalTo(*Data1);
}
// NOTE: This assumes that the value operand is before the
@@ -250,10 +285,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
}
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator I,
+ const Counters &Increment) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(*I);
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -270,8 +305,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
return;
}
- if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
- AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
// or SMEM clause, respectively.
//
@@ -281,8 +315,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// and destination registers don't overlap, e.g. this is illegal:
// r0 = load r2
// r2 = load r0
- if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
- (LastOpcodeType == VMEM && Increment.Named.VM)) {
+ if (LastOpcodeType == VMEM && Increment.Named.VM) {
// Insert a NOP to break the clause.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
.addImm(0);
@@ -379,7 +412,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm((Counts.Named.VM & 0xF) |
((Counts.Named.EXP & 0x7) << 4) |
- ((Counts.Named.LGKM & 0x7) << 8));
+ ((Counts.Named.LGKM & 0xF) << 8));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -393,16 +426,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
+/// \brief Check whether any of the counters is non-zero
+static bool countersNonZero(const Counters &Counter) {
+ for (unsigned i = 0; i < 3; ++i)
+ if (Counter.Array[i])
+ return true;
+ return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+ assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+ unsigned Imm = I->getOperand(0).getImm();
+ Counters Counts, WaitOn;
+
+ Counts.Named.VM = Imm & 0xF;
+ Counts.Named.EXP = (Imm >> 4) & 0x7;
+ Counts.Named.LGKM = (Imm >> 8) & 0xF;
+
+ for (unsigned i = 0; i < 3; ++i) {
+ if (Counts.Array[i] <= LastIssued.Array[i])
+ WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+ else
+ WaitOn.Array[i] = 0;
+ }
+
+ increaseCounters(DelayedWaitOn, WaitOn);
+}
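
handleExistingWait above (and the S_WAITCNT emitted in insertWait) pack the three counters into one immediate: VM in bits [3:0], EXP in bits [6:4], and LGKM in bits [11:8], with LGKM now widened to four bits to match the larger counter. A standalone round-trip of that packing (struct and helper names are illustrative):

#include <cassert>

struct WaitCounts { unsigned VM, EXP, LGKM; };

// Same field layout as the S_WAITCNT immediate built in insertWait above.
static unsigned encodeWaitcnt(const WaitCounts &C) {
  return (C.VM & 0xF) | ((C.EXP & 0x7) << 4) | ((C.LGKM & 0xF) << 8);
}

// Inverse mapping, as done in handleExistingWait.
static WaitCounts decodeWaitcnt(unsigned Imm) {
  return { Imm & 0xF, (Imm >> 4) & 0x7, (Imm >> 8) & 0xF };
}

int main() {
  const WaitCounts Max = { 15, 7, 15 };   // matches the updated WaitCounts table
  WaitCounts Back = decodeWaitcnt(encodeWaitcnt(Max));
  assert(Back.VM == 15 && Back.EXP == 7 && Back.LGKM == 15);
}
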
+
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if (MI.getOpcode() == AMDGPU::S_SENDMSG)
- return LastIssued;
-
// For each register affected by this instruction increase the result
// sequence.
//
@@ -432,8 +487,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
- if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <
- AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return;
// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
@@ -460,13 +514,13 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
bool Changes = false;
- TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
+ DelayedWaitOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -475,6 +529,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+ SmallVector<MachineInstr *, 4> RemoveMI;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -482,27 +538,81 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
+ // the vccz bit, so when we detect that an instruction may read from a
+ // corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
+ // complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+
+ if (TII->isSMRD(I->getOpcode())) {
+ VCCZCorrupt = true;
+ } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
+ // FIXME: We only care about SMRD instructions here, not LDS or GDS.
+ // Whenever we store a value in vcc, the correct value of vccz is
+ // restored.
+ VCCZCorrupt = false;
+ }
+
+ // Check if we need to apply the bug work-around
+ if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+ DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
+
+ // Wait on everything, not just LGKM. vccz reads usually come from
+ // terminators, and we always wait on everything at the end of the
+ // block, so if we only wait on LGKM here, we might end up with
+ // another s_waitcnt inserted right after this if there are non-LGKM
+ // instructions still outstanding.
+ insertWait(MBB, I, LastIssued);
+
+ // Restore the vccz bit. Any time a value is written to vcc, the vccz
+ // bit is updated, so we can restore the bit by reading the value of
+ // vcc and then writing it back to the register.
+ BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+ }
+ }
+
+ // Record pre-existing, explicitly requested waits
+ if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+ handleExistingWait(*I);
+ RemoveMI.push_back(&*I);
+ continue;
+ }
+
+ Counters Required;
+
// Wait for everything before a barrier.
- if (I->getOpcode() == AMDGPU::S_BARRIER)
- Changes |= insertWait(MBB, I, LastIssued);
+ //
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if (I->getOpcode() == AMDGPU::S_BARRIER ||
+ I->getOpcode() == AMDGPU::S_SENDMSG)
+ Required = LastIssued;
else
- Changes |= insertWait(MBB, I, handleOperands(*I));
+ Required = handleOperands(*I);
+
+ Counters Increment = getHwCounts(*I);
- pushInstruction(MBB, I);
+ if (countersNonZero(Required) || countersNonZero(Increment))
+ increaseCounters(Required, DelayedWaitOn);
+
+ Changes |= insertWait(MBB, I, Required);
+
+ pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
-
- // Functions returning something shouldn't contain S_ENDPGM, because other
- // bytecode will be appended after it.
- if (!ReturnsVoid) {
- MachineBasicBlock::iterator I = MBB.getFirstTerminator();
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
- I->eraseFromParent();
- }
}
+ for (MachineInstr *I : RemoveMI)
+ I->eraseFromParent();
+
return Changes;
}
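
The SI/CI vccz workaround above is essentially a small state machine: an SMRD marks vccz as possibly stale, a write to vcc with no outstanding LGKM transfers clears that state, and a branch that reads vccz while the state is set forces a full wait plus a vcc-to-vcc copy. A compact standalone model of that bookkeeping over an abstract instruction stream (all types and names here are invented for illustration, and the LGKM tracking is a simplified stand-in for the real counters):

#include <cassert>
#include <vector>

enum class Op { SMRD, WriteVCC, BranchOnVCCZ, Other };

struct Action { bool InsertWait; bool RestoreVCC; };

// For each instruction, report whether a full wait and an "s_mov_b64 vcc, vcc"
// restore would be inserted in front of it.
static std::vector<Action> simulate(const std::vector<Op> &Insts) {
  std::vector<Action> Out;
  bool VCCZCorrupt = false;
  unsigned OutstandingLGKM = 0;          // stand-in for the LGKM counter delta

  for (Op I : Insts) {
    Action A = { false, false };
    if (I == Op::SMRD) {
      VCCZCorrupt = true;
      ++OutstandingLGKM;                 // SMRD loads bump the LGKM counter
    } else if (I == Op::WriteVCC && OutstandingLGKM == 0) {
      VCCZCorrupt = false;               // a vcc write restores vccz
    } else if (I == Op::BranchOnVCCZ && VCCZCorrupt) {
      A.InsertWait = true;               // s_waitcnt on everything outstanding
      A.RestoreVCC = true;               // then "s_mov_b64 vcc, vcc"
      OutstandingLGKM = 0;               // the full wait drains the counters
    }
    Out.push_back(A);
  }
  return Out;
}

int main() {
  // The vcc write happens while the SMRD is still outstanding, so the branch
  // still needs the workaround.
  auto R = simulate({ Op::SMRD, Op::WriteVCC, Op::BranchOnVCCZ });
  assert(R[2].InsertWait && R[2].RestoreVCC);

  auto S = simulate({ Op::WriteVCC, Op::BranchOnVCCZ });
  assert(!S[1].InsertWait && !S[1].RestoreVCC);
}
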
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 0e883f64caa3..2f63d4ed13b3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
//
//===----------------------------------------------------------------------===//
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> :
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
@@ -31,6 +32,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> VOP2 = 0;
field bits<1> VOP3 = 0;
field bits<1> VOPC = 0;
+ field bits<1> SDWA = 0;
+ field bits<1> DPP = 0;
field bits<1> MUBUF = 0;
field bits<1> MTBUF = 0;
@@ -45,6 +48,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
// is unable to infer the encoding from the operands.
field bits<1> VOPAsmPrefer32Bit = 0;
+ field bits<1> Gather4 = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -63,18 +68,33 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
let TSFlags{11} = VOP2;
let TSFlags{12} = VOP3;
let TSFlags{13} = VOPC;
-
- let TSFlags{14} = MUBUF;
- let TSFlags{15} = MTBUF;
- let TSFlags{16} = SMRD;
- let TSFlags{17} = DS;
- let TSFlags{18} = MIMG;
- let TSFlags{19} = FLAT;
- let TSFlags{20} = WQM;
- let TSFlags{21} = VGPRSpill;
- let TSFlags{22} = VOPAsmPrefer32Bit;
+ let TSFlags{14} = SDWA;
+ let TSFlags{15} = DPP;
+
+ let TSFlags{16} = MUBUF;
+ let TSFlags{17} = MTBUF;
+ let TSFlags{18} = SMRD;
+ let TSFlags{19} = DS;
+ let TSFlags{20} = MIMG;
+ let TSFlags{21} = FLAT;
+ let TSFlags{22} = WQM;
+ let TSFlags{23} = VGPRSpill;
+ let TSFlags{24} = VOPAsmPrefer32Bit;
+ let TSFlags{25} = Gather4;
let SchedRW = [Write32Bit];
+
+ field bits<1> DisableSIDecoder = 0;
+ field bits<1> DisableVIDecoder = 0;
+ field bits<1> DisableDecoder = 0;
+
+ let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
+}
+
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+ : InstSI<outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class Enc32 {
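
The renumbered TSFlags above repack the per-instruction format bits (now including SDWA, DPP and Gather4) into a 64-bit field that the C++ side can test cheaply. A sketch of how such flags are typically defined and queried on the C++ side; the enumerators here are illustrative and only need to stay in sync with the TableGen bit positions shown in the hunk:

#include <cassert>
#include <cstdint>

// Illustrative flag layout mirroring the TSFlags bit assignments above.
namespace SIInstrFlagsSketch {
enum : uint64_t {
  SDWA    = UINT64_C(1) << 14,
  DPP     = UINT64_C(1) << 15,
  MUBUF   = UINT64_C(1) << 16,
  MTBUF   = UINT64_C(1) << 17,
  SMRD    = UINT64_C(1) << 18,
  DS      = UINT64_C(1) << 19,
  MIMG    = UINT64_C(1) << 20,
  FLAT    = UINT64_C(1) << 21,
  Gather4 = UINT64_C(1) << 25
};
}

// A predicate in the style of SIInstrInfo::isDS(): test one bit of TSFlags.
static bool isDS(uint64_t TSFlags) { return TSFlags & SIInstrFlagsSketch::DS; }

int main() {
  uint64_t Flags = SIInstrFlagsSketch::DS | SIInstrFlagsSketch::Gather4;
  assert(isDS(Flags));
  assert(!(Flags & SIInstrFlagsSketch::MIMG));
}
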
@@ -123,8 +143,10 @@ class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
let Size = 4;
}
-class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <outs, ins, asm, pattern> {
+class VOP3Common <dag outs, dag ins, string asm = "",
+ list<dag> pattern = [], bit HasMods = 0,
+ bit VOP3Only = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
// Using complex patterns gives VOP3 patterns a very high complexity rating,
  // but standalone patterns are almost always preferred, so we need to adjust the
@@ -135,7 +157,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let VOP3 = 1;
let VALU = 1;
- let AsmMatchConverter = "cvtVOP3";
+ let AsmMatchConverter =
+ !if(!eq(VOP3Only,1),
+ "cvtVOP3",
+ !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
+
let isCodeGenOnly = 0;
int Size = 8;
@@ -154,9 +180,9 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
class SOP1e <bits<8> op> : Enc32 {
bits<7> sdst;
- bits<8> ssrc0;
+ bits<8> src0;
- let Inst{7-0} = ssrc0;
+ let Inst{7-0} = src0;
let Inst{15-8} = op;
let Inst{22-16} = sdst;
let Inst{31-23} = 0x17d; //encoding;
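
The operand renames above don't change the bit layout: SOP1 still packs src0 into bits [7:0], the opcode into [15:8], sdst into [22:16], and the fixed encoding 0x17d into [31:23]. A standalone packer for that layout, useful as a sanity check (the function name and the field values in main are made up for illustration):

#include <cassert>
#include <cstdint>

// Pack a SOP1 instruction word following the field layout in SOP1e above.
static uint32_t encodeSOP1(uint8_t Op, uint8_t Src0, uint8_t SDst /* 7 bits */) {
  return (UINT32_C(0x17d) << 23) |
         (uint32_t(SDst & 0x7f) << 16) |
         (uint32_t(Op) << 8) |
         Src0;
}

int main() {
  uint32_t Word = encodeSOP1(/*Op=*/3, /*Src0=*/1, /*SDst=*/0);
  assert((Word >> 23) == 0x17d);           // fixed SOP1 encoding bits
  assert(((Word >> 8) & 0xff) == 3);       // opcode field
  assert((Word & 0xff) == 1);              // src0 field
}
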
@@ -164,22 +190,22 @@ class SOP1e <bits<8> op> : Enc32 {
class SOP2e <bits<7> op> : Enc32 {
bits<7> sdst;
- bits<8> ssrc0;
- bits<8> ssrc1;
+ bits<8> src0;
+ bits<8> src1;
- let Inst{7-0} = ssrc0;
- let Inst{15-8} = ssrc1;
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
let Inst{22-16} = sdst;
let Inst{29-23} = op;
let Inst{31-30} = 0x2; // encoding
}
class SOPCe <bits<7> op> : Enc32 {
- bits<8> ssrc0;
- bits<8> ssrc1;
+ bits<8> src0;
+ bits<8> src1;
- let Inst{7-0} = ssrc0;
- let Inst{15-8} = ssrc1;
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
let Inst{22-16} = op;
let Inst{31-23} = 0x17e;
}
@@ -218,9 +244,7 @@ class SOPPe <bits<7> op> : Enc32 {
class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
bits<7> sdst;
bits<7> sbase;
- bits<8> offset;
- let Inst{7-0} = offset;
let Inst{8} = imm;
let Inst{14-9} = sbase{6-1};
let Inst{21-15} = sdst;
@@ -228,6 +252,18 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
+class SMRD_IMMe <bits<5> op> : SMRDe<op, 1> {
+ bits<8> offset;
+ let Inst{7-0} = offset;
+}
+
+class SMRD_SOFFe <bits<5> op> : SMRDe<op, 0> {
+ bits<8> soff;
+ let Inst{7-0} = soff;
+}
+
+
class SMRD_IMMe_ci <bits<5> op> : Enc64 {
bits<7> sdst;
bits<7> sbase;
@@ -348,19 +384,18 @@ class VOP2_MADKe <bits<6> op> : Enc64 {
bits<8> vdst;
bits<9> src0;
- bits<8> vsrc1;
- bits<32> src2;
+ bits<8> src1;
+ bits<32> imm;
let Inst{8-0} = src0;
- let Inst{16-9} = vsrc1;
+ let Inst{16-9} = src1;
let Inst{24-17} = vdst;
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
- let Inst{63-32} = src2;
+ let Inst{63-32} = imm;
}
-class VOP3e <bits<9> op> : Enc64 {
- bits<8> vdst;
+class VOP3a <bits<9> op> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -370,7 +405,6 @@ class VOP3e <bits<9> op> : Enc64 {
bits<1> clamp;
bits<2> omod;
- let Inst{7-0} = vdst;
let Inst{8} = src0_modifiers{1};
let Inst{9} = src1_modifiers{1};
let Inst{10} = src2_modifiers{1};
@@ -386,6 +420,20 @@ class VOP3e <bits<9> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP3e <bits<9> op> : VOP3a <op> {
+ bits<8> vdst;
+
+ let Inst{7-0} = vdst;
+}
+
+// Encoding used for VOPC instructions encoded as VOP3
+// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+class VOP3ce <bits<9> op> : VOP3a <op> {
+ bits<8> sdst;
+
+ let Inst{7-0} = sdst;
+}
+
class VOP3be <bits<9> op> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -412,10 +460,10 @@ class VOP3be <bits<9> op> : Enc64 {
class VOPCe <bits<8> op> : Enc32 {
bits<9> src0;
- bits<8> vsrc1;
+ bits<8> src1;
let Inst{8-0} = src0;
- let Inst{16-9} = vsrc1;
+ let Inst{16-9} = src1;
let Inst{24-17} = op;
let Inst{31-25} = 0x3e;
}
@@ -675,17 +723,17 @@ class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
- let AsmMatchConverter = "cvtFlat";
let SchedRW = [WriteVMEM];
}
-class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, MIMGe <op> {
+class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let VM_CNT = 1;
let EXP_CNT = 1;
let MIMG = 1;
let Uses = [EXEC];
+ let UseNamedOperandTable = 1;
let hasSideEffects = 0; // XXX ????
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1e10d25e8fb7..d171e21c8a4f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -12,14 +12,15 @@
//
//===----------------------------------------------------------------------===//
-
#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -27,8 +28,8 @@
using namespace llvm;
-SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUInstrInfo(st), RI() {}
+SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
+ : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -74,12 +75,12 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const {
// TODO: The generic check fails for VALU instructions that should be
// rematerializable due to implicit reads of exec. We really want all of the
// generic logic for this except for this.
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -201,18 +202,18 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const {
- unsigned Opc = LdSt->getOpcode();
+ unsigned Opc = LdSt.getOpcode();
- if (isDS(*LdSt)) {
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ if (isDS(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (OffsetImm) {
// Normal, single offset LDS instruction.
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
BaseReg = AddrReg->getReg();
Offset = OffsetImm->getImm();
@@ -222,10 +223,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
// The 2 offset instructions use offset0 and offset1 instead. We can treat
// these as a load with a single offset if the 2 offsets are consecutive. We
// will use this for some partially aligned loads.
- const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset0);
- const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset1);
+ const MachineOperand *Offset0Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Imm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
uint8_t Offset0 = Offset0Imm->getImm();
uint8_t Offset1 = Offset1Imm->getImm();
@@ -235,19 +236,19 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
// to bytes of the individual reads.
unsigned EltSize;
- if (LdSt->mayLoad())
- EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
+ if (LdSt.mayLoad())
+ EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
else {
- assert(LdSt->mayStore());
+ assert(LdSt.mayStore());
int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
- EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
+ EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
}
if (isStride64(Opc))
EltSize *= 64;
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::addr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::addr);
BaseReg = AddrReg->getReg();
Offset = EltSize * Offset0;
return true;
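
For the two-offset DS forms the code above only reports a single base/offset pair when the two 8-bit element offsets are consecutive, and it scales the byte offset by the per-element size (half the destination register size for loads, the data0 operand size for stores, times 64 for the stride-64 variants). A standalone sketch of that computation (names are illustrative):

#include <cassert>
#include <cstdint>

// Model of the DS read2/write2 case above: report a single byte offset only
// when the two element offsets are consecutive.
static bool getTwoOffsetBase(uint8_t Offset0, uint8_t Offset1,
                             unsigned EltSizeInBytes, bool IsStride64,
                             int64_t &ByteOffset) {
  if (Offset1 != Offset0 + 1)
    return false;                       // not expressible as one offset
  if (IsStride64)
    EltSizeInBytes *= 64;
  ByteOffset = int64_t(EltSizeInBytes) * Offset0;
  return true;
}

int main() {
  int64_t Off;
  assert(getTwoOffsetBase(4, 5, /*EltSize=*/4, false, Off) && Off == 16);
  assert(getTwoOffsetBase(4, 5, /*EltSize=*/4, true, Off) && Off == 1024);
  assert(!getTwoOffsetBase(4, 6, /*EltSize=*/4, false, Off));
}
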
@@ -256,63 +257,91 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return false;
}
- if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
+ if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
return false;
- const MachineOperand *AddrReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::vaddr);
+ const MachineOperand *AddrReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
return false;
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
BaseReg = AddrReg->getReg();
Offset = OffsetImm->getImm();
return true;
}
- if (isSMRD(*LdSt)) {
- const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
- AMDGPU::OpName::offset);
+ if (isSMRD(LdSt)) {
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (!OffsetImm)
return false;
- const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
- AMDGPU::OpName::sbase);
+ const MachineOperand *SBaseReg =
+ getNamedOperand(LdSt, AMDGPU::OpName::sbase);
BaseReg = SBaseReg->getReg();
Offset = OffsetImm->getImm();
return true;
}
+ if (isFLAT(LdSt)) {
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ BaseReg = AddrReg->getReg();
+ Offset = 0;
+ return true;
+ }
+
return false;
}
-bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const {
- // TODO: This needs finer tuning
- if (NumLoads > 4)
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ MachineInstr &SecondLdSt,
+ unsigned NumLoads) const {
+ const MachineOperand *FirstDst = nullptr;
+ const MachineOperand *SecondDst = nullptr;
+
+ if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+ }
+
+ if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+ }
+
+ if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+ (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+ FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+ SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+ }
+
+ if (!FirstDst || !SecondDst)
return false;
- if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
- return true;
+ // Try to limit clustering based on the total number of bytes loaded
+ // rather than the number of instructions. This is done to help reduce
+ // register pressure. The method used is somewhat inexact, though,
+ // because it assumes that all loads in the cluster will load the
+ // same number of bytes as FirstLdSt.
- if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
- return true;
+ // The unit of this value is bytes.
+ // FIXME: This needs finer tuning.
+ unsigned LoadClusterThreshold = 16;
- if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
- (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt)))
- return true;
+ const MachineRegisterInfo &MRI =
+ FirstLdSt.getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
- return false;
+ return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}
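
The new heuristic above clusters memory operations by an estimated byte total instead of a fixed instruction count: NumLoads times the size of the first destination register class must stay within a 16-byte budget. A trivial standalone version showing what that threshold permits (the helper name is made up):

#include <cassert>

// Byte-budget clustering check mirroring shouldClusterMemOps above.
static bool shouldCluster(unsigned NumLoads, unsigned DstRegBytes,
                          unsigned ThresholdBytes = 16) {
  return NumLoads * DstRegBytes <= ThresholdBytes;
}

int main() {
  assert(shouldCluster(4, 4));    // four 32-bit loads fit the 16-byte budget
  assert(shouldCluster(2, 8));    // two 64-bit loads fit
  assert(!shouldCluster(2, 16));  // two 128-bit loads exceed it
}
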
-void
-SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// If we are trying to copy to or from SCC, there is a bug somewhere else in
// the backend. While it may be theoretically possible to do this, it should
@@ -361,7 +390,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
ArrayRef<int16_t> SubIndices;
- bool Forward;
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -445,10 +473,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Can't copy register!");
}
- if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
- Forward = true;
- else
- Forward = false;
+ bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
unsigned SubIdx;
@@ -463,10 +488,12 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
if (Idx == SubIndices.size() - 1)
- Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
+ Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+
+ Builder.addReg(SrcReg, RegState::Implicit);
}
}
@@ -525,6 +552,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
return AMDGPU::SI_SPILL_V64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
case 32:
@@ -558,19 +587,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+ // m0 may not be allowed for readlane.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling
// SGPRs.
unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg) // src
+ .addReg(SrcReg, getKillRegState(isKill)) // src
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
return;
}
- if (!ST.isVGPRSpillingEnabled(MFI)) {
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
@@ -585,10 +620,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg) // src
+ .addReg(SrcReg, getKillRegState(isKill)) // src
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
@@ -615,6 +651,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_V64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
case 32:
@@ -648,6 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
+
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+ // m0 may not be allowed for readlane.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // frame_idx
.addMemOperand(MMO);
@@ -655,7 +700,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(MFI)) {
+ if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
@@ -671,20 +716,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled
-unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- RegScavenger *RS, unsigned TmpReg,
- unsigned FrameOffset,
- unsigned Size) const {
+unsigned SIInstrInfo::calculateLDSSpillAddress(
+ MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
+ unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -699,8 +742,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
if (TIDReg == AMDGPU::NoRegister)
return TIDReg;
-
- if (MFI->getShaderType() == ShaderType::COMPUTE &&
+ if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
WorkGroupSize > WavefrontSize) {
unsigned TIDIGXReg
@@ -716,7 +758,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
Entry.addLiveIn(Reg);
}
- RS->enterBasicBlock(&Entry);
+ RS->enterBasicBlock(Entry);
// FIXME: Can we scavenge an SReg_64 and access the subregs?
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
@@ -773,8 +815,10 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
return TmpReg;
}
-void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
+void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
int Count) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
while (Count > 0) {
int Arg;
if (Count >= 8)
@@ -782,76 +826,87 @@ void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
else
Arg = Count - 1;
Count -= 8;
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
.addImm(Arg);
}
}
-bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineBasicBlock &MBB = *MI->getParent();
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: return 1; // FIXME: Do wait states equal cycles?
+
+ case AMDGPU::S_NOP:
+ return MI.getOperand(0).getImm() + 1;
+ }
+}
+
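
insertWaitStates above emits S_NOP instructions in batches: each S_NOP immediate N stalls for N+1 wait states and the immediate is capped at 7, so a request is split into chunks of at most eight. A standalone check that the batching covers the requested count exactly (the helper name is illustrative):

#include <cassert>
#include <vector>

// Reproduce the S_NOP batching loop: each emitted immediate N waits N+1 states.
static std::vector<int> nopImmediatesFor(int Count) {
  std::vector<int> Imms;
  while (Count > 0) {
    int Arg = Count >= 8 ? 7 : Count - 1;
    Imms.push_back(Arg);
    Count -= 8;
  }
  return Imms;
}

int main() {
  for (int C = 1; C <= 64; ++C) {
    int Total = 0;
    for (int Imm : nopImmediatesFor(C))
      Total += Imm + 1;                 // matches getNumWaitStates for S_NOP
    assert(Total == C);                 // the batches cover the request exactly
  }
}
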
+bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- case AMDGPU::SGPR_USE:
- // This is just a placeholder for register allocation.
- MI->eraseFromParent();
- break;
-
case AMDGPU::V_MOV_B64_PSEUDO: {
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- const MachineOperand &SrcOp = MI->getOperand(1);
+ const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addImm(Imm.getLoBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit);
+ .addImm(Imm.getLoBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addImm(Imm.getHiBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit);
+ .addImm(Imm.getHiBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
} else {
assert(SrcOp.isReg());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
- .addReg(Dst, RegState::Implicit);
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
- .addReg(Dst, RegState::Implicit);
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
break;
}
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
- unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- unsigned Src0 = MI->getOperand(1).getReg();
- unsigned Src1 = MI->getOperand(2).getReg();
- const MachineOperand &SrcCond = MI->getOperand(3);
+ unsigned Src0 = MI.getOperand(1).getReg();
+ unsigned Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &SrcCond = MI.getOperand(3);
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
- .addOperand(SrcCond);
+ .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
+ .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
+ .addReg(SrcCond.getReg())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
- .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
- .addOperand(SrcCond);
- MI->eraseFromParent();
+ .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
+ .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
+ .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ MI.eraseFromParent();
break;
}
- case AMDGPU::SI_CONSTDATA_PTR: {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
+ case AMDGPU::SI_PC_ADD_REL_OFFSET: {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
MachineFunction &MF = *MBB.getParent();
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
@@ -863,15 +918,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// Add 32-bit offset from this instruction to the start of the
// constant data.
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .addOperand(MI->getOperand(1)));
+ .addReg(RegLo)
+ .addOperand(MI.getOperand(1)));
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi)
.addImm(0));
llvm::finalizeBundle(MBB, Bundler.begin());
- MI->eraseFromParent();
+ MI.eraseFromParent();
break;
}
}
@@ -885,22 +940,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const {
- int CommutedOpcode = commuteOpcode(*MI);
+ int CommutedOpcode = commuteOpcode(MI);
if (CommutedOpcode == -1)
return nullptr;
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (!Src0.isReg())
return nullptr;
- int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src1);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
@@ -908,33 +962,32 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
OpIdx1 != static_cast<unsigned>(Src0Idx)))
return nullptr;
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
-
- if (isVOP2(*MI)) {
- const MCInstrDesc &InstrDesc = MI->getDesc();
- // For VOP2 instructions, any operand type is valid to use for src0. Make
- // sure we can use the src1 as src0.
+ if (isVOP2(MI) || isVOPC(MI)) {
+ const MCInstrDesc &InstrDesc = MI.getDesc();
+ // For VOP2 and VOPC instructions, any operand type is valid to use for
+ // src0. Make sure we can use src0 as src1.
//
// We could be stricter here and only allow commuting if there is a reason
// to do so. i.e. if both operands are VGPRs there is no real benefit,
// although MachineCSE attempts to find matches by commuting.
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
return nullptr;
}
+ MachineInstr *CommutedMI = &MI;
if (!Src1.isReg()) {
// Allow commuting instructions with Imm operands.
- if (NewMI || !Src1.isImm() ||
- (!isVOP2(*MI) && !isVOP3(*MI))) {
+ if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) {
return nullptr;
}
// Be sure to copy the source modifiers to the right place.
- if (MachineOperand *Src0Mods
- = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
- MachineOperand *Src1Mods
- = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
+ if (MachineOperand *Src0Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) {
+ MachineOperand *Src1Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
int Src0ModsVal = Src0Mods->getImm();
if (!Src1Mods && Src0ModsVal != 0)
@@ -959,26 +1012,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
Src1.ChangeToRegister(Reg, false);
Src1.setSubReg(SubReg);
} else {
- MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
+ CommutedMI =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
}
- if (MI)
- MI->setDesc(get(CommutedOpcode));
+ if (CommutedMI)
+ CommutedMI->setDesc(get(CommutedOpcode));
- return MI;
+ return CommutedMI;
}
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (!MCID.isCommutable())
return false;
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return false;
@@ -986,24 +1039,24 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
// FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
// immediate. Also, immediate src0 operand is not handled in
// SIInstrInfo::commuteInstruction();
- if (!MI->getOperand(Src0Idx).isReg())
+ if (!MI.getOperand(Src0Idx).isReg())
return false;
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src1.isImm()) {
// SIInstrInfo::commuteInstruction() does support commuting the immediate
// operand src1 in 2 and 3 operand instructions.
- if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+ if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode()))
return false;
} else if (Src1.isReg()) {
// If any source modifiers are set, the generic instruction commuting won't
// understand how to copy the source modifiers.
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
return false;
} else
return false;
@@ -1011,23 +1064,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
-MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg,
- unsigned SrcReg) const {
- return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
- DstReg) .addReg(SrcReg);
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+ switch (Cond) {
+ case SIInstrInfo::SCC_TRUE:
+ return AMDGPU::S_CBRANCH_SCC1;
+ case SIInstrInfo::SCC_FALSE:
+ return AMDGPU::S_CBRANCH_SCC0;
+ case SIInstrInfo::VCCNZ:
+ return AMDGPU::S_CBRANCH_VCCNZ;
+ case SIInstrInfo::VCCZ:
+ return AMDGPU::S_CBRANCH_VCCZ;
+ case SIInstrInfo::EXECNZ:
+ return AMDGPU::S_CBRANCH_EXECNZ;
+ case SIInstrInfo::EXECZ:
+ return AMDGPU::S_CBRANCH_EXECZ;
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_CBRANCH_SCC0:
+ return SCC_FALSE;
+ case AMDGPU::S_CBRANCH_SCC1:
+ return SCC_TRUE;
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ return VCCNZ;
+ case AMDGPU::S_CBRANCH_VCCZ:
+ return VCCZ;
+ case AMDGPU::S_CBRANCH_EXECNZ:
+ return EXECNZ;
+ case AMDGPU::S_CBRANCH_EXECZ:
+ return EXECZ;
+ default:
+ return INVALID_BR;
+ }
}
-bool SIInstrInfo::isMov(unsigned Opcode) const {
- switch(Opcode) {
- default: return false;
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+ if (I == MBB.end())
+ return false;
+
+ if (I->getOpcode() == AMDGPU::S_BRANCH) {
+ // Unconditional Branch
+ TBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
return true;
+
+ MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+
+ ++I;
+
+ if (I == MBB.end()) {
+ // Conditional branch followed by fall-through.
+ TBB = CondBB;
+ return false;
+ }
+
+ if (I->getOpcode() == AMDGPU::S_BRANCH) {
+ TBB = CondBB;
+ FBB = I->getOperand(0).getMBB();
+ return false;
+ }
+
+ return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+ unsigned Count = 0;
+ while (I != MBB.end()) {
+ MachineBasicBlock::iterator Next = std::next(I);
+ I->eraseFromParent();
+ ++Count;
+ I = Next;
+ }
+
+ return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
+
+ if (!FBB && Cond.empty()) {
+ BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+ .addMBB(TBB);
+ return 1;
+ }
+
+ assert(TBB && Cond[0].isImm());
+
+ unsigned Opcode
+ = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+ if (!FBB) {
+ BuildMI(&MBB, DL, get(Opcode))
+ .addMBB(TBB);
+ return 1;
}
+
+ assert(TBB && FBB);
+
+ BuildMI(&MBB, DL, get(Opcode))
+ .addMBB(TBB);
+ BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+ .addMBB(FBB);
+
+ return 2;
+}
+
+bool SIInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1);
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
}
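ReverseBranchCondition relies on each branch predicate and its inverse being encoded as a symmetric +/- pair, so negating the stored immediate flips the condition. A minimal standalone sketch of that scheme; the enumerator values below are illustrative assumptions, not taken from the target headers:

#include <cassert>

// Illustrative +/- pairing; the real enumerator values may differ.
enum BranchPredicate : int {
  INVALID_BR = 0,
  SCC_TRUE = 1,  SCC_FALSE = -1,
  VCCNZ = 2,     VCCZ = -2,
  EXECNZ = 3,    EXECZ = -3
};

// Mirror the reversal trick: flip the sign of the stored predicate.
static BranchPredicate reverseBranchPredicate(BranchPredicate P) {
  return static_cast<BranchPredicate>(-static_cast<int>(P));
}

int main() {
  assert(reverseBranchPredicate(SCC_TRUE) == SCC_FALSE);
  assert(reverseBranchPredicate(VCCZ) == VCCNZ);
  assert(reverseBranchPredicate(reverseBranchPredicate(EXECNZ)) == EXECNZ);
  return 0;
}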
static void removeModOperands(MachineInstr &MI) {
@@ -1044,81 +1209,76 @@ static void removeModOperands(MachineInstr &MI) {
MI.RemoveOperand(Src0ModIdx);
}
-bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+// TODO: Maybe this should be removed and we should custom fold everything in
+// SIFoldOperands?
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
- unsigned Opc = UseMI->getOpcode();
+ unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
- if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
- hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+ if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
return false;
}
- MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
- MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+ const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+ // If this is a free constant, there's no reason to do this.
+ // TODO: We could fold this here instead of letting SIFoldOperands do it
+ // later.
+ if (isInlineConstant(ImmOp, 4))
+ return false;
+
+ MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_f32
// We should only expect these to be on src0 due to canonicalizations.
if (Src0->isReg() && Src0->getReg() == Reg) {
- if (!Src1->isReg() ||
- (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
- if (!Src2->isReg() ||
- (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
+ if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
- // We need to do some weird looking operand shuffling since the madmk
- // operands are out of the normal expected order with the multiplied
- // constant as the last operand.
- //
- // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
- // src0 -> src2 K
- // src1 -> src0
- // src2 -> src1
+ // We need to swap operands 0 and 1 since madmk constant is at operand 1.
- const int64_t Imm = DefMI->getOperand(1).getImm();
+ const int64_t Imm = DefMI.getOperand(1).getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::clamp));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
unsigned Src1Reg = Src1->getReg();
unsigned Src1SubReg = Src1->getSubReg();
- unsigned Src2Reg = Src2->getReg();
- unsigned Src2SubReg = Src2->getSubReg();
Src0->setReg(Src1Reg);
Src0->setSubReg(Src1SubReg);
Src0->setIsKill(Src1->isKill());
- Src1->setReg(Src2Reg);
- Src1->setSubReg(Src2SubReg);
- Src1->setIsKill(Src2->isKill());
-
if (Opc == AMDGPU::V_MAC_F32_e64) {
- UseMI->untieRegOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
- Src2->ChangeToImmediate(Imm);
+ Src1->ChangeToImmediate(Imm);
- removeModOperands(*UseMI);
- UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
- DefMI->eraseFromParent();
+ DefMI.eraseFromParent();
return true;
}
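The branch above folds a constant K that was materialized into the multiplied operand into the madmk form, which carries K as a literal; only the remaining register operand has to move into the src0 slot because K*b + c == b*K + c. A simplified model of the preserved arithmetic, assuming the madmk form computes src0 * K + src1:

#include <cassert>

// Simplified models; madmk is assumed to compute src0 * K + src1.
static float mad(float a, float b, float c) { return a * b + c; }
static float madmk(float src0, float K, float src1) { return src0 * K + src1; }

int main() {
  const float K = 4.0f;   // constant that had been materialized into a register
  const float b = 3.0f, c = 1.0f;
  // mad dst, K, b, c  ->  madmk dst, b, K, c  (K becomes the literal operand)
  assert(mad(K, b, c) == madmk(b, K, c));
  return 0;
}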
@@ -1131,36 +1291,35 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
(Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
return false;
- if (!Src1->isReg() ||
- (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
return false;
- const int64_t Imm = DefMI->getOperand(1).getImm();
+ const int64_t Imm = DefMI.getOperand(1).getImm();
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::clamp));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+ UseMI.RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64) {
- UseMI->untieRegOperand(
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ UseMI.untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
// ChangeToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
// These come before src2.
- removeModOperands(*UseMI);
- UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
+ removeModOperands(UseMI);
+ UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
- DefMI->eraseFromParent();
+ DefMI.eraseFromParent();
return true;
}
@@ -1177,17 +1336,20 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
return LowOffset + LowWidth <= HighOffset;
}
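The check above treats two accesses as disjoint when the lower one ends at or before the higher one begins. A standalone restatement with a couple of worked cases (the widths and offsets are made-up byte values for illustration):

#include <algorithm>
#include <cassert>

// Two accesses [Offset, Offset + Width) are disjoint iff the lower one
// ends no later than the higher one starts.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) {
  int LowOffset = std::min(OffsetA, OffsetB);
  int HighOffset = std::max(OffsetA, OffsetB);
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(offsetsDoNotOverlap(4, 0, 4, 4));   // [0,4) and [4,8): disjoint
  assert(!offsetsDoNotOverlap(4, 0, 4, 2));  // [0,4) and [2,6): overlap
  assert(offsetsDoNotOverlap(8, 16, 4, 0));  // argument order does not matter
  return 0;
}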
-bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
- MachineInstr *MIb) const {
- unsigned BaseReg0, Offset0;
- unsigned BaseReg1, Offset1;
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
+ MachineInstr &MIb) const {
+ unsigned BaseReg0, BaseReg1;
+ int64_t Offset0, Offset1;
if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
- assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
- "read2 / write2 not expected here yet");
- unsigned Width0 = (*MIa->memoperands_begin())->getSize();
- unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+
+ if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+ // FIXME: Handle ds_read2 / ds_write2.
+ return false;
+ }
+ unsigned Width0 = (*MIa.memoperands_begin())->getSize();
+ unsigned Width1 = (*MIb.memoperands_begin())->getSize();
if (BaseReg0 == BaseReg1 &&
offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
return true;
@@ -1197,19 +1359,19 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
return false;
}
-bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
- MachineInstr *MIb,
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
+ MachineInstr &MIb,
AliasAnalysis *AA) const {
- assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+ assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
- assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+ assert((MIb.mayLoad() || MIb.mayStore()) &&
"MIb must load from or modify a memory location");
- if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
return false;
// XXX - Can we relax this between address spaces?
- if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// TODO: Should we check the address space from the MachineMemOperand? That
@@ -1217,29 +1379,29 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
// underlying address space, even if it was lowered to a different one,
// e.g. private accesses lowered to use MUBUF instructions on a scratch
// buffer.
- if (isDS(*MIa)) {
- if (isDS(*MIb))
+ if (isDS(MIa)) {
+ if (isDS(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb);
+ return !isFLAT(MIb);
}
- if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
- if (isMUBUF(*MIb) || isMTBUF(*MIb))
+ if (isMUBUF(MIa) || isMTBUF(MIa)) {
+ if (isMUBUF(MIb) || isMTBUF(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb) && !isSMRD(*MIb);
+ return !isFLAT(MIb) && !isSMRD(MIb);
}
- if (isSMRD(*MIa)) {
- if (isSMRD(*MIb))
+ if (isSMRD(MIa)) {
+ if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
+ return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
}
- if (isFLAT(*MIa)) {
- if (isFLAT(*MIb))
+ if (isFLAT(MIa)) {
+ if (isFLAT(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
return false;
@@ -1249,35 +1411,49 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
}
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineBasicBlock::iterator &MI,
- LiveVariables *LV) const {
-
- switch (MI->getOpcode()) {
- default: return nullptr;
- case AMDGPU::V_MAC_F32_e64: break;
- case AMDGPU::V_MAC_F32_e32: {
- const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
- if (Src0->isImm() && !isInlineConstant(*Src0, 4))
- return nullptr;
- break;
- }
+ MachineInstr &MI,
+ LiveVariables *LV) const {
+
+ switch (MI.getOpcode()) {
+ default:
+ return nullptr;
+ case AMDGPU::V_MAC_F32_e64:
+ break;
+ case AMDGPU::V_MAC_F32_e32: {
+ const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ return nullptr;
+ break;
+ }
}
- const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
- const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
- const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
- const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
- return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
- .addOperand(*Dst)
- .addImm(0) // Src0 mods
- .addOperand(*Src0)
- .addImm(0) // Src1 mods
- .addOperand(*Src1)
- .addImm(0) // Src mods
- .addOperand(*Src2)
- .addImm(0) // clamp
- .addImm(0); // omod
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ .addOperand(*Dst)
+ .addImm(0) // Src0 mods
+ .addOperand(*Src0)
+ .addImm(0) // Src1 mods
+ .addOperand(*Src1)
+ .addImm(0) // Src mods
+ .addOperand(*Src2)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+}
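convertToThreeAddress above rewrites the accumulating mac form, whose destination is tied to its third source, into the mad form with an independent destination and explicit (zeroed) modifier, clamp and omod operands. A simplified model of the two-address vs. three-address distinction, under the assumption that mac accumulates into its destination in place:

#include <cassert>

// Two-address form: the destination doubles as the accumulator input.
static void mac(float &dst, float a, float b) { dst = a * b + dst; }

// Three-address form: the accumulator is an explicit, separate source, so
// the previous value stays live after the operation.
static float mad(float a, float b, float c) { return a * b + c; }

int main() {
  float acc = 1.0f;
  float result = mad(2.0f, 3.0f, acc);  // acc is still 1.0f afterwards
  assert(result == 7.0f && acc == 1.0f);

  mac(acc, 2.0f, 3.0f);                 // mac clobbers the accumulator
  assert(acc == 7.0f);
  return 0;
}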
+
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // XXX - Do we want the SP check in the base implementation?
+
+ // Target-independent instructions do not have an implicit-use of EXEC, even
+ // when they operate on VGPRs. Treating EXEC modifications as scheduling
+ // boundaries prevents incorrect movements of such instructions.
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
+ MI.modifiesRegister(AMDGPU::EXEC, &RI);
}
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
@@ -1355,9 +1531,9 @@ static bool compareMachineOp(const MachineOperand &Op0,
}
}
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
- const MachineOperand &MO) const {
- const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+ const MachineOperand &MO) const {
+ const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
@@ -1418,14 +1594,10 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
return true;
// SGPRs use the constant bus
- if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
- (!MO.isImplicit() &&
- (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
- return true;
- }
-
- return false;
+ return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
+ (!MO.isImplicit() &&
+ (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
@@ -1448,10 +1620,33 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
return AMDGPU::NoRegister;
}
-bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+static bool shouldReadExec(const MachineInstr &MI) {
+ if (SIInstrInfo::isVALU(MI)) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READLANE_B32_si:
+ case AMDGPU::V_READLANE_B32_vi:
+ case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::V_WRITELANE_B32_si:
+ case AMDGPU::V_WRITELANE_B32_vi:
+ return false;
+ }
+
+ return true;
+ }
+
+ if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+ SIInstrInfo::isSALU(MI) ||
+ SIInstrInfo::isSMRD(MI))
+ return false;
+
+ return true;
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
- uint16_t Opcode = MI->getOpcode();
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ uint16_t Opcode = MI.getOpcode();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -1459,14 +1654,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure the number of operands is correct.
const MCInstrDesc &Desc = get(Opcode);
if (!Desc.isVariadic() &&
- Desc.getNumOperands() != MI->getNumExplicitOperands()) {
- ErrInfo = "Instruction has wrong number of operands.";
- return false;
+ Desc.getNumOperands() != MI.getNumExplicitOperands()) {
+ ErrInfo = "Instruction has wrong number of operands.";
+ return false;
}
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isFPImm()) {
+ if (MI.getOperand(i).isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
"all fp values to integers.";
return false;
@@ -1476,7 +1671,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
- if (MI->getOperand(i).isImm()) {
+ if (MI.getOperand(i).isImm()) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
@@ -1484,17 +1679,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM32:
break;
case AMDGPU::OPERAND_REG_INLINE_C:
- if (isLiteralConstant(MI->getOperand(i),
+ if (isLiteralConstant(MI.getOperand(i),
RI.getRegClass(RegClass)->getSize())) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
break;
case MCOI::OPERAND_IMMEDIATE:
+ case AMDGPU::OPERAND_KIMM32:
// Check if this operand is an immediate.
// FrameIndex operands will be replaced by immediates, so they are
// allowed.
- if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
+ if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
@@ -1503,12 +1699,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
continue;
}
- if (!MI->getOperand(i).isReg())
+ if (!MI.getOperand(i).isReg())
continue;
if (RegClass != -1) {
- unsigned Reg = MI->getOperand(i).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (Reg == AMDGPU::NoRegister ||
+ TargetRegisterInfo::isVirtualRegister(Reg))
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -1519,23 +1716,26 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
-
// Verify VOP*
- if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned SGPRUsed = findImplicitSGPRRead(*MI);
+
+ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+ ++ConstantBusCount;
+
+ unsigned SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister)
++ConstantBusCount;
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
break;
- const MachineOperand &MO = MI->getOperand(OpIdx);
+ const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
if (MO.isReg()) {
if (MO.getReg() != SGPRUsed)
@@ -1555,9 +1755,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
- const MachineOperand &Src0 = MI->getOperand(Src0Idx);
- const MachineOperand &Src1 = MI->getOperand(Src1Idx);
- const MachineOperand &Src2 = MI->getOperand(Src2Idx);
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ const MachineOperand &Src2 = MI.getOperand(Src2Idx);
if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
if (!compareMachineOp(Src0, Src1) &&
!compareMachineOp(Src0, Src2)) {
@@ -1569,9 +1769,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure we aren't losing exec uses in the td files. This mostly requires
// being careful when using let Uses to try to add other use registers.
- if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
- const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
- if (!Exec || !Exec->isImplicit()) {
+ if (shouldReadExec(MI)) {
+ if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
ErrInfo = "VALU instruction does not implicitly read exec mask";
return false;
}
@@ -1624,22 +1823,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
- case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR:
- case AMDGPU::S_LOAD_DWORD_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
- case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+ case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+ case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+ case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+ case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+ case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+ case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+ case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
}
}
@@ -1676,12 +1871,12 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
}
}
-void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI;
- MachineBasicBlock *MBB = MI->getParent();
- MachineOperand &MO = MI->getOperand(OpIdx);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+ unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
@@ -1689,7 +1884,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
else if (RI.isSGPRClass(RC))
Opcode = AMDGPU::S_MOV_B32;
-
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
VRC = &AMDGPU::VReg_64RegClass;
@@ -1698,8 +1892,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
unsigned Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
- BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
- .addOperand(MO);
+ BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
MO.ChangeToRegister(Reg, false);
}
@@ -1758,11 +1951,11 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
}
// Change the order of operands from (0, 1, 2) to (0, 2, 1)
-void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
- assert(Inst->getNumExplicitOperands() == 3);
- MachineOperand Op1 = Inst->getOperand(1);
- Inst->RemoveOperand(1);
- Inst->addOperand(Op1);
+void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
+ assert(Inst.getNumExplicitOperands() == 3);
+ MachineOperand Op1 = Inst.getOperand(1);
+ Inst.RemoveOperand(1);
+ Inst.addOperand(Op1);
}
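swapOperands implements the (0, 1, 2) -> (0, 2, 1) rotation by detaching explicit operand 1 and re-appending it; moveToVALU later uses it when replacing scalar shifts with the *REV VALU opcodes, which expect the shift amount as their first source. A container-level sketch of the same rotation:

#include <cassert>
#include <string>
#include <vector>

// Model the explicit operand list (dst, value, shift_amount) and rotate it by
// removing operand 1 and re-appending it at the end.
static void swapOperands(std::vector<std::string> &Ops) {
  assert(Ops.size() == 3);
  std::string Op1 = Ops[1];
  Ops.erase(Ops.begin() + 1);
  Ops.push_back(Op1);
}

int main() {
  std::vector<std::string> Ops = {"dst", "value", "shift_amount"};
  swapOperands(Ops);
  // The shift amount now comes first, matching a "reversed" opcode that
  // expects (dst, shift_amount, value).
  assert((Ops == std::vector<std::string>{"dst", "shift_amount", "value"}));
  return 0;
}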
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
@@ -1804,26 +1997,32 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return true;
}
-bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- const MCInstrDesc &InstDesc = get(MI->getOpcode());
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (!MO)
- MO = &MI->getOperand(OpIdx);
+ MO = &MI.getOperand(OpIdx);
+
+ if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
+
+ RegSubRegPair SGPRUsed;
+ if (MO->isReg())
+ SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
- if (isVALU(*MI) &&
- usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
- unsigned SGPRUsed =
- MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
- const MachineOperand &Op = MI->getOperand(i);
- if (Op.isReg() && Op.getReg() != SGPRUsed &&
- usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
+ const MachineOperand &Op = MI.getOperand(i);
+ if (Op.isReg()) {
+ if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+ usesConstantBus(MRI, Op, getOpSize(MI, i))) {
+ return false;
+ }
+ } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
return false;
}
}
@@ -1834,7 +2033,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
return isLegalRegOperand(MRI, OpInfo, *MO);
}
-
// Handle non-register types that are treated like immediates.
assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
@@ -1847,12 +2045,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
}
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
- MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+ MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
// we need to only have one constant bus use.
@@ -1860,10 +2058,10 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// Note we do not need to worry about literal constants here. They are
// disabled for the operand type of these instructions because they will always
// violate the one constant bus use rule.
- bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+ bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
if (HasImplicitSGPR) {
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);
@@ -1878,13 +2076,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste
// compile time pointlessly swapping and checking legality again.
- if (HasImplicitSGPR || !MI->isCommutable()) {
+ if (HasImplicitSGPR || !MI.isCommutable()) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
// If src0 can be used as src1, commuting will make the operands legal.
// Otherwise we have to give up and insert a move.
@@ -1897,13 +2095,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
- int CommutedOpc = commuteOpcode(*MI);
+ int CommutedOpc = commuteOpcode(MI);
if (CommutedOpc == -1) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
- MI->setDesc(get(CommutedOpc));
+ MI.setDesc(get(CommutedOpc));
unsigned Src0Reg = Src0.getReg();
unsigned Src0SubReg = Src0.getSubReg();
@@ -1925,10 +2123,9 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
-void SIInstrInfo::legalizeOperandsVOP3(
- MachineRegisterInfo &MRI,
- MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
+ MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
int VOP3Idx[3] = {
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
@@ -1943,7 +2140,7 @@ void SIInstrInfo::legalizeOperandsVOP3(
int Idx = VOP3Idx[i];
if (Idx == -1)
break;
- MachineOperand &MO = MI->getOperand(Idx);
+ MachineOperand &MO = MI.getOperand(Idx);
// We should never see a VOP3 instruction with an illegal immediate operand.
if (!MO.isReg())
@@ -1964,32 +2161,78 @@ void SIInstrInfo::legalizeOperandsVOP3(
}
}
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ MachineRegisterInfo &MRI) const {
+ const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+ unsigned DstReg = MRI.createVirtualRegister(SRC);
+ unsigned SubRegs = VRC->getSize() / 4;
+
+ SmallVector<unsigned, 8> SRegs;
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+ .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ SRegs.push_back(SGPR);
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::REG_SEQUENCE), DstReg);
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ MIB.addReg(SRegs[i]);
+ MIB.addImm(RI.getSubRegFromChannel(i));
+ }
+ return DstReg;
+}
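readlaneVGPRToSGPR copies a VGPR value into SGPRs one 32-bit sub-register at a time with v_readfirstlane and reassembles the pieces with REG_SEQUENCE; this only recovers the original value when every lane holds the same bits, which the caller below guarantees for SMRD pointers. A lane-level model of that assumption (the 8-lane wave and the 64-bit pointer are illustrative choices):

#include <array>
#include <cassert>
#include <cstdint>

// One 32-bit "VGPR": a value per lane of an illustrative 8-lane wave.
using Vgpr32 = std::array<uint32_t, 8>;

// v_readfirstlane returns the value of the first active lane; for a value
// that is uniform across lanes this recovers it exactly.
static uint32_t readFirstLane(const Vgpr32 &V) { return V[0]; }

int main() {
  // A 64-bit pointer held uniformly in two VGPR sub-registers (lo, hi).
  const uint64_t Ptr = 0x123456789abcdef0ULL;
  Vgpr32 Lo, Hi;
  Lo.fill(static_cast<uint32_t>(Ptr));
  Hi.fill(static_cast<uint32_t>(Ptr >> 32));

  // Read each dword, then reassemble -- the REG_SEQUENCE step.
  const uint32_t SLo = readFirstLane(Lo), SHi = readFirstLane(Hi);
  const uint64_t Rebuilt = SLo | (static_cast<uint64_t>(SHi) << 32);
  assert(Rebuilt == Ptr);
  return 0;
}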
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+ MachineInstr &MI) const {
+
+ // If the pointer is stored in VGPRs, then we need to move it to
+ // SGPRs using v_readfirstlane. This is safe because we only select
+ // loads with uniform pointers to SMRD instructions, so we know the
+ // pointer value is uniform.
+ MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+ if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
+ }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
// Legalize VOP2
- if (isVOP2(*MI)) {
+ if (isVOP2(MI) || isVOPC(MI)) {
legalizeOperandsVOP2(MRI, MI);
return;
}
// Legalize VOP3
- if (isVOP3(*MI)) {
+ if (isVOP3(MI)) {
legalizeOperandsVOP3(MRI, MI);
return;
}
+ // Legalize SMRD
+ if (isSMRD(MI)) {
+ legalizeOperandsSMRD(MRI, MI);
+ return;
+ }
+
// Legalize REG_SEQUENCE and PHI
// The register class of the operands must be the same type as the register
// class of the output.
- if (MI->getOpcode() == AMDGPU::PHI) {
+ if (MI.getOpcode() == AMDGPU::PHI) {
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
- if (!MI->getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ if (!MI.getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
const TargetRegisterClass *OpRC =
- MRI.getRegClass(MI->getOperand(i).getReg());
+ MRI.getRegClass(MI.getOperand(i).getReg());
if (RI.hasVGPRs(OpRC)) {
VRC = OpRC;
} else {
@@ -2000,7 +2243,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// If any of the operands are VGPR registers, then they all must be VGPRs,
// otherwise we will create illegal VGPR->SGPR copies when legalizing
// them.
- if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+ if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
VRC = RI.getEquivalentVGPRClass(SRC);
@@ -2011,18 +2254,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
// Update all the operands so they have the same type.
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- MachineOperand &Op = MI->getOperand(I);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
unsigned DstReg = MRI.createVirtualRegister(RC);
// MI is a PHI instruction.
- MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+ MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
- BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
Op.setReg(DstReg);
}
}
@@ -2030,15 +2273,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// REG_SEQUENCE doesn't really require operand legalization, but if one has a
// VGPR dest type and SGPR sources, insert copies so all operands are
// VGPRs. This seems to help operand folding / the register coalescer.
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- MachineBasicBlock *MBB = MI->getParent();
- const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+ if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
if (RI.hasVGPRs(DstRC)) {
// Update all the operands so they are VGPR register classes. These may
// not be the same register class because REG_SEQUENCE supports mixing
// subregister index types e.g. sub0_sub1 + sub2 + sub3
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- MachineOperand &Op = MI->getOperand(I);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
@@ -2049,8 +2292,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
unsigned DstReg = MRI.createVirtualRegister(VRC);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
Op.setReg(DstReg);
Op.setIsKill();
@@ -2062,17 +2305,33 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// Legalize INSERT_SUBREG
// src0 must have the same register class as dst
- if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
- unsigned Dst = MI->getOperand(0).getReg();
- unsigned Src0 = MI->getOperand(1).getReg();
+ if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src0 = MI.getOperand(1).getReg();
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
if (DstRC != Src0RC) {
- MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
- .addReg(Src0);
- MI->getOperand(1).setReg(NewSrc0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+ .addReg(Src0);
+ MI.getOperand(1).setReg(NewSrc0);
+ }
+ return;
+ }
+
+ // Legalize MIMG
+ if (isMIMG(MI)) {
+ MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
+ if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+ SRsrc->setReg(SGPR);
+ }
+
+ MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
+ if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+ SSamp->setReg(SGPR);
}
return;
}
@@ -2081,11 +2340,11 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
int SRsrcIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
if (SRsrcIdx != -1) {
// We have an MUBUF instruction
- MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
- unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
+ MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
+ unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
RI.getRegClass(SRsrcRC))) {
// The operands are legal.
@@ -2093,7 +2352,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
return;
}
- MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
// Extract the ptr from the resource descriptor.
unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
@@ -2107,30 +2366,27 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
// Zero64 = 0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
- Zero64)
- .addImm(0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
+ .addImm(0);
// SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatLo)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
// SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatHi)
- .addImm(RsrcDataFormat >> 32);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
-
- MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
if (VAddr) {
// This is already an ADDR64 instruction so we need to add the pointer
@@ -2139,7 +2395,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
.addReg(SRsrcPtr, 0, AMDGPU::sub0)
.addReg(VAddr->getReg(), 0, AMDGPU::sub0);
@@ -2150,82 +2406,82 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
.addReg(VAddr->getReg(), 0, AMDGPU::sub1);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
- .addReg(NewVAddrLo)
- .addImm(AMDGPU::sub0)
- .addReg(NewVAddrHi)
- .addImm(AMDGPU::sub1);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
} else {
// This instruction is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
- < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
+ < SISubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
- MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
- MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
- MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
- unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+ MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
+ MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+ MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
// Atomics with return have an additional tied operand and are
// missing some of the special bits.
- MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+ MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
MachineInstr *Addr64;
if (!VDataIn) {
// Regular buffer load / store.
- MachineInstrBuilder MIB
- = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset);
// Atomics do not have this operand.
- if (const MachineOperand *GLC
- = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+ if (const MachineOperand *GLC =
+ getNamedOperand(MI, AMDGPU::OpName::glc)) {
MIB.addImm(GLC->getImm());
}
- MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+ MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
- if (const MachineOperand *TFE
- = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+ if (const MachineOperand *TFE =
+ getNamedOperand(MI, AMDGPU::OpName::tfe)) {
MIB.addImm(TFE->getImm());
}
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
Addr64 = MIB;
} else {
// Atomics with return.
- Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addOperand(*VDataIn)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
- .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*VDataIn)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset)
+ .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
- MI->removeFromParent();
- MI = Addr64;
+ MI.removeFromParent();
// NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
- .addImm(AMDGPU::sub0)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
- .addImm(AMDGPU::sub1);
-
- VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
- SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+ BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewVAddr)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addImm(AMDGPU::sub1);
+
+ VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
+ SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
}
// Update the instruction to use NewVaddr
@@ -2235,300 +2491,85 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
}
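For the _OFFSET to ADDR64 rewrite above, the new 128-bit resource descriptor is assembled as {Zero64, SRsrcFormatLo, SRsrcFormatHi}: a zeroed 64-bit base in sub0_sub1 and the default data format split across sub2 and sub3, while the real pointer is routed through vaddr instead. A host-side sketch of that dword packing; the concrete format constant is a placeholder, since the real value comes from getDefaultRsrcDataFormat():

#include <array>
#include <cassert>
#include <cstdint>

// Pack a 128-bit buffer resource descriptor as four dwords:
// sub0_sub1 = 64-bit base (zero here), sub2 = format lo, sub3 = format hi.
static std::array<uint32_t, 4> buildRsrc(uint64_t Base, uint64_t RsrcDataFormat) {
  return {static_cast<uint32_t>(Base),
          static_cast<uint32_t>(Base >> 32),
          static_cast<uint32_t>(RsrcDataFormat & 0xFFFFFFFF),
          static_cast<uint32_t>(RsrcDataFormat >> 32)};
}

int main() {
  const uint64_t RsrcDataFormat = 0xf00000000000ULL;  // placeholder bits
  const auto Rsrc = buildRsrc(/*Base=*/0, RsrcDataFormat);
  assert(Rsrc[0] == 0 && Rsrc[1] == 0);               // Zero64
  assert(Rsrc[2] == (RsrcDataFormat & 0xFFFFFFFF));   // SRsrcFormatLo
  assert(Rsrc[3] == (RsrcDataFormat >> 32));          // SRsrcFormatHi
  return 0;
}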
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
- const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const {
-
- DebugLoc DL = MI->getDebugLoc();
- MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RegLo = MRI.createVirtualRegister(HalfRC);
- unsigned RegHi = MRI.createVirtualRegister(HalfRC);
- unsigned HalfSize = HalfRC->getSize();
- const MachineOperand *OffOp =
- getNamedOperand(*MI, AMDGPU::OpName::offset);
- const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
- // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
- // on VI.
-
- bool IsKill = SBase->isKill();
- if (OffOp) {
- bool isVI =
- MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
- AMDGPUSubtarget::VOLCANIC_ISLANDS;
- unsigned OffScale = isVI ? 1 : 4;
- // Handle the _IMM variant
- unsigned LoOffset = OffOp->getImm() * OffScale;
- unsigned HiOffset = LoOffset + HalfSize;
- Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
- // Use addReg instead of addOperand
- // to make sure kill flag is cleared.
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addImm(LoOffset / OffScale);
-
- if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
- unsigned OffsetSGPR =
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
- .addImm(HiOffset); // The offset in register is in bytes.
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- } else {
- Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addImm(HiOffset / OffScale);
- }
- } else {
- // Handle the _SGPR variant
- MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
- Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addOperand(*SOff);
- unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
- .addReg(SOff->getReg(), 0, SOff->getSubReg())
- .addImm(HalfSize);
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- }
-
- unsigned SubLo, SubHi;
- const TargetRegisterClass *NewDstRC;
- switch (HalfSize) {
- case 4:
- SubLo = AMDGPU::sub0;
- SubHi = AMDGPU::sub1;
- NewDstRC = &AMDGPU::VReg_64RegClass;
- break;
- case 8:
- SubLo = AMDGPU::sub0_sub1;
- SubHi = AMDGPU::sub2_sub3;
- NewDstRC = &AMDGPU::VReg_128RegClass;
- break;
- case 16:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3;
- SubHi = AMDGPU::sub4_sub5_sub6_sub7;
- NewDstRC = &AMDGPU::VReg_256RegClass;
- break;
- case 32:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
- NewDstRC = &AMDGPU::VReg_512RegClass;
- break;
- default:
- llvm_unreachable("Unhandled HalfSize");
- }
-
- unsigned OldDst = MI->getOperand(0).getReg();
- unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
- MRI.replaceRegWith(OldDst, NewDst);
-
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
- .addReg(RegLo)
- .addImm(SubLo)
- .addReg(RegHi)
- .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
- MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const {
- MachineBasicBlock *MBB = MI->getParent();
- int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
- assert(DstIdx != -1);
- unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
- switch(RI.getRegClass(DstRCID)->getSize()) {
- case 4:
- case 8:
- case 16: {
- unsigned NewOpcode = getVALUOp(*MI);
- unsigned RegOffset;
- unsigned ImmOffset;
-
- if (MI->getOperand(2).isReg()) {
- RegOffset = MI->getOperand(2).getReg();
- ImmOffset = 0;
- } else {
- assert(MI->getOperand(2).isImm());
- // SMRD instructions take a dword offsets on SI and byte offset on VI
- // and MUBUF instructions always take a byte offset.
- ImmOffset = MI->getOperand(2).getImm();
- if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
- AMDGPUSubtarget::SEA_ISLANDS)
- ImmOffset <<= 2;
- RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
- if (isUInt<12>(ImmOffset)) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(0);
- } else {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(ImmOffset);
- ImmOffset = 0;
- }
- }
-
- unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
- unsigned DWord0 = RegOffset;
- unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
- .addImm(0);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
- .addImm(RsrcDataFormat >> 32);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(DWord0)
- .addImm(AMDGPU::sub0)
- .addReg(DWord1)
- .addImm(AMDGPU::sub1)
- .addReg(DWord2)
- .addImm(AMDGPU::sub2)
- .addReg(DWord3)
- .addImm(AMDGPU::sub3);
-
- const MCInstrDesc &NewInstDesc = get(NewOpcode);
- const TargetRegisterClass *NewDstRC
- = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
- unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
- unsigned DstReg = MI->getOperand(0).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
-
- MachineInstr *NewInst =
- BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
- .addOperand(MI->getOperand(1)) // sbase
- .addReg(SRsrc)
- .addImm(0)
- .addImm(ImmOffset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- MI->eraseFromParent();
-
- legalizeOperands(NewInst);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- break;
- }
- case 32: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
- AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
-
- case 64: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
- AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
- }
-}
-
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
SmallVector<MachineInstr *, 128> Worklist;
Worklist.push_back(&TopInst);
while (!Worklist.empty()) {
- MachineInstr *Inst = Worklist.pop_back_val();
- MachineBasicBlock *MBB = Inst->getParent();
+ MachineInstr &Inst = *Worklist.pop_back_val();
+ MachineBasicBlock *MBB = Inst.getParent();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Opcode = Inst->getOpcode();
- unsigned NewOpcode = getVALUOp(*Inst);
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = getVALUOp(Inst);
// Handle some special cases
switch (Opcode) {
default:
- if (isSMRD(*Inst)) {
- moveSMRDToVALU(Inst, MRI, Worklist);
- continue;
- }
break;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_BCNT1_I32_B64:
splitScalar64BitBCNT(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
case AMDGPU::S_BFE_I64: {
splitScalar64BitBFE(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
}
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -2536,9 +2577,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_ABS_I32:
lowerScalarAbs(Worklist, Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
continue;
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1:
+ // Clear unused bits of vcc
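+ // so that the branch, which is about to be rewritten to test vcc, only
+ // sees bits for lanes enabled in the exec mask.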
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+ break;
+
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
@@ -2553,34 +2603,36 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Use the new VALU Opcode.
const MCInstrDesc &NewDesc = get(NewOpcode);
- Inst->setDesc(NewDesc);
+ Inst.setDesc(NewDesc);
// Remove any references to SCC. Vector instructions can't read from it, and
// We're just about to add the implicit use / defs of VCC, and we don't want
// both.
- for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
- MachineOperand &Op = Inst->getOperand(i);
- if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
- Inst->RemoveOperand(i);
+ for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
+ MachineOperand &Op = Inst.getOperand(i);
+ if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ Inst.RemoveOperand(i);
+ addSCCDefUsersToVALUWorklist(Inst, Worklist);
+ }
}
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
// We are converting these to a BFE, so we need to add the missing
// operands for the size and offset.
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- Inst->addOperand(MachineOperand::CreateImm(0));
- Inst->addOperand(MachineOperand::CreateImm(Size));
+ Inst.addOperand(MachineOperand::CreateImm(0));
+ Inst.addOperand(MachineOperand::CreateImm(Size));
} else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
// The VALU version adds the second operand to the result, so insert an
// extra 0 operand.
- Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst.addOperand(MachineOperand::CreateImm(0));
}
- Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
+ Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
// If we need to move this to VGPRs, we need to unpack the second operand
// back into the 2 separate ones for bit offset and width.
assert(OffsetWidthOp.isImm() &&
@@ -2589,50 +2641,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
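// i.e. the immediate is encoded as (BitWidth << 16) | Offset.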
- Inst->RemoveOperand(2); // Remove old immediate.
- Inst->addOperand(MachineOperand::CreateImm(Offset));
- Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+ Inst.RemoveOperand(2); // Remove old immediate.
+ Inst.addOperand(MachineOperand::CreateImm(Offset));
+ Inst.addOperand(MachineOperand::CreateImm(BitWidth));
}
- // Update the destination register class.
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
- if (!NewDstRC)
- continue;
+ bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
+ unsigned NewDstReg = AMDGPU::NoRegister;
+ if (HasDst) {
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ if (!NewDstRC)
+ continue;
- unsigned DstReg = Inst->getOperand(0).getReg();
- unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
// Legalize the operands
legalizeOperands(Inst);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ if (HasDst)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
}
-//===----------------------------------------------------------------------===//
-// Indirect addressing callbacks
-//===----------------------------------------------------------------------===//
-
-unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const {
- assert(Channel == 0);
- return RegIndex;
-}
-
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::VGPR_32RegClass;
-}
-
void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -2649,15 +2692,14 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
}
void SIInstrInfo::splitScalar64BitUnaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src0 = Inst->getOperand(1);
- DebugLoc DL = Inst->getDebugLoc();
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ DebugLoc DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -2703,16 +2745,15 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
}
void SIInstrInfo::splitScalar64BitBinaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src0 = Inst->getOperand(1);
- MachineOperand &Src1 = Inst->getOperand(2);
- DebugLoc DL = Inst->getDebugLoc();
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ DebugLoc DL = Inst.getDebugLoc();
MachineBasicBlock::iterator MII = Inst;
@@ -2738,9 +2779,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0)
- .addOperand(SrcReg1Sub0);
+ MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ .addOperand(SrcReg0Sub0)
+ .addOperand(SrcReg1Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
@@ -2748,9 +2789,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
AMDGPU::sub1, Src1SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1)
- .addOperand(SrcReg1Sub1);
+ MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ .addOperand(SrcReg0Sub1)
+ .addOperand(SrcReg1Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -2770,16 +2811,16 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+void SIInstrInfo::splitScalar64BitBCNT(
+ SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
const TargetRegisterClass *SrcRC = Src.isReg() ?
@@ -2812,24 +2853,22 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
}
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const {
- MachineBasicBlock &MBB = *Inst->getParent();
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst->getDebugLoc();
+ DebugLoc DL = Inst.getDebugLoc();
- MachineOperand &Dest = Inst->getOperand(0);
- uint32_t Imm = Inst->getOperand(2).getImm();
+ MachineOperand &Dest = Inst.getOperand(0);
+ uint32_t Imm = Inst.getOperand(2).getImm();
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
(void) Offset;
// Only sext_inreg cases handled.
- assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
- BitWidth <= 32 &&
- Offset == 0 &&
- "Not implemented");
+ assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
+ Offset == 0 && "Not implemented");
if (BitWidth < 32) {
unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -2837,9 +2876,9 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
- .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
- .addImm(0)
- .addImm(BitWidth);
+ .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
+ .addImm(0)
+ .addImm(BitWidth);
BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
.addImm(31)
@@ -2856,7 +2895,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
return;
}
- MachineOperand &Src = Inst->getOperand(1);
+ MachineOperand &Src = Inst.getOperand(1);
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
@@ -2887,6 +2926,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(
+ MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+ // This assumes that all the users of SCC are in the same block
+ // as the SCC def.
+ for (MachineInstr &MI :
+ llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
+ SCCDefInst.getParent()->end())) {
+ // Exit if we find another SCC def.
+ if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+ return;
+
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+ Worklist.push_back(&MI);
+ }
+}
+
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
const MachineInstr &Inst) const {
const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
@@ -2912,9 +2967,9 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
}
// Find the one SGPR operand we are allowed to use.
-unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
+unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
int OpIndices[3]) const {
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
// Find the one SGPR operand we are allowed to use.
//
@@ -2925,19 +2980,19 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
//
// If the operand's class is an SGPR, we can never move it.
- unsigned SGPRReg = findImplicitSGPRRead(*MI);
+ unsigned SGPRReg = findImplicitSGPRRead(MI);
if (SGPRReg != AMDGPU::NoRegister)
return SGPRReg;
unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
int Idx = OpIndices[i];
if (Idx == -1)
break;
- const MachineOperand &MO = MI->getOperand(Idx);
+ const MachineOperand &MO = MI.getOperand(Idx);
if (!MO.isReg())
continue;
@@ -2981,70 +3036,6 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
return SGPRReg;
}
-MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address, unsigned OffsetReg) const {
- const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
- getIndirectIndexBegin(*MBB->getParent()));
-
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
- .addReg(IndirectBaseReg, RegState::Define)
- .addOperand(I->getOperand(0))
- .addReg(IndirectBaseReg)
- .addReg(OffsetReg)
- .addImm(0)
- .addReg(ValueReg);
-}
-
-MachineInstrBuilder SIInstrInfo::buildIndirectRead(
- MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address, unsigned OffsetReg) const {
- const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
- getIndirectIndexBegin(*MBB->getParent()));
-
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addReg(IndirectBaseReg)
- .addReg(OffsetReg)
- .addImm(0);
-
-}
-
-void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const {
- int End = getIndirectIndexEnd(MF);
- int Begin = getIndirectIndexBegin(MF);
-
- if (End == -1)
- return;
-
-
- for (int Index = Begin; Index <= End; ++Index)
- Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
-
- for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
-}
-
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
unsigned OperandName) const {
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
@@ -3059,9 +3050,9 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.isAmdHsaOS()) {
RsrcDataFormat |= (1ULL << 56);
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- // Set MTYPE = 2
- RsrcDataFormat |= (2ULL << 59);
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ // Set MTYPE = 2
+ RsrcDataFormat |= (2ULL << 59);
}
return RsrcDataFormat;
@@ -3072,22 +3063,103 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size;
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
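+ // e.g. a MaxPrivateElementSize of 4 gives EltSizeValue = 1, 16 gives 3.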
+
+ Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
+ // IndexStride = 64
+ (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
}
-bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
return isSMRD(Opc);
}
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}
+
+unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
+ unsigned DescSize = Desc.getSize();
+
+ // If we have a definitive size, we can use it. Otherwise we need to inspect
+ // the operands to know the size.
+ if (DescSize == 8 || DescSize == 4)
+ return DescSize;
+
+ assert(DescSize == 0);
+
+ // 4-byte instructions may have a 32-bit literal encoded after them. Check
+ // operands that could ever be literals.
+ if (isVALU(MI) || isSALU(MI)) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return 4; // No operands.
+
+ if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
+ return 8;
+
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return 4;
+
+ if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
+ return 8;
+
+ return 4;
+ }
+
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::EH_LABEL:
+ return 0;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ default:
+ llvm_unreachable("unable to find instruction size");
+ }
+}
+
+ArrayRef<std::pair<int, const char *>>
+SIInstrInfo::getSerializableTargetIndices() const {
+ static const std::pair<int, const char *> TargetIndices[] = {
+ {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+ return makeArrayRef(TargetIndices);
+}
+
+/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+ return new GCNHazardRecognizer(MF);
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index cce1ae725611..227b817227c2 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#include "AMDGPUInstrInfo.h"
#include "SIDefines.h"
@@ -22,9 +22,24 @@
namespace llvm {
-class SIInstrInfo : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
+ const SISubtarget &ST;
+
+ // The inverse predicate should have the negative value.
+ enum BranchPredicate {
+ INVALID_BR = 0,
+ SCC_TRUE = 1,
+ SCC_FALSE = -1,
+ VCCNZ = 2,
+ VCCZ = -2,
+ EXECNZ = -3,
+ EXECZ = 3
+ };
+
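+ // Helpers that map between the BranchPredicate values above and the
+ // corresponding branch opcodes.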
+ static unsigned getBranchOpcode(BranchPredicate Cond);
+ static BranchPredicate getBranchPredicate(unsigned Opcode);
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
@@ -39,87 +54,89 @@ private:
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
- void swapOperands(MachineBasicBlock::iterator Inst) const;
+ void swapOperands(MachineInstr &Inst) const;
void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode) const;
void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode) const;
void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst) const;
+ MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &Worklist) const;
+ void
+ addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
- bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
- MachineInstr *MIb) const;
+ bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
- unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+ unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
- MachineInstr *commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
public:
- explicit SIInstrInfo(const AMDGPUSubtarget &st);
- const SIRegisterInfo &getRegisterInfo() const override {
+ enum TargetOperandFlags {
+ MO_NONE = 0,
+ MO_GOTPCREL = 1
+ };
+
+ explicit SIInstrInfo(const SISubtarget &);
+
+ const SIRegisterInfo &getRegisterInfo() const {
return RI;
}
- bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const override;
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterLoads(MachineInstr *FirstLdSt,
- MachineInstr *SecondLdSt,
- unsigned NumLoads) const final;
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ unsigned NumLoads) const final;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
- unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- RegScavenger *RS,
- unsigned TmpReg,
- unsigned Offset,
- unsigned Size) const;
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI,
+ RegScavenger *RS, unsigned TmpReg,
+ unsigned Offset, unsigned Size) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
// \brief Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
@@ -129,28 +146,40 @@ public:
LLVM_READONLY
int commuteOpcode(const MachineInstr &MI) const;
- bool findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- bool areMemAccessesTriviallyDisjoint(
- MachineInstr *MIa, MachineInstr *MIb,
- AliasAnalysis *AA = nullptr) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const override;
- MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const override;
- bool isMov(unsigned Opcode) const override;
+ bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const override;
- bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const final;
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const final;
unsigned getMachineCSELookAheadLimit() const override { return 500; }
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineBasicBlock::iterator &MI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
static bool isSALU(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
@@ -167,6 +196,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
+ static bool isVMEM(const MachineInstr &MI) {
+ return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+ }
+
+ bool isVMEM(uint16_t Opcode) const {
+ return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+ }
+
static bool isSOP1(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
}
@@ -279,6 +316,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
+ static bool isGather4(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
+ }
+
+ bool isGather4(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::Gather4;
+ }
+
static bool isFLAT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
}
@@ -303,11 +348,35 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
}
+ static bool isDPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+ }
+
+ bool isDPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DPP;
+ }
+
+ static bool isScalarUnit(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
+ }
+
+ static bool usesVM_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
+ }
+
+ bool isVGPRCopy(const MachineInstr &MI) const {
+ assert(MI.isCopy());
+ unsigned Dest = MI.getOperand(0).getReg();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return !RI.isSGPRReg(MRI, Dest);
+ }
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
- bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+ bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
@@ -326,7 +395,7 @@ public:
bool hasModifiersSet(const MachineInstr &MI,
unsigned OpName) const;
- bool verifyInstruction(const MachineInstr *MI,
+ bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
static unsigned getVALUOp(const MachineInstr &MI);
@@ -374,11 +443,11 @@ public:
///
/// If the operand being legalized is a register, then a COPY will be used
/// instead of MOV.
- void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+ void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
/// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
/// for \p MI.
- bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+ bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
/// \brief Check if \p MO would be a valid operand for the given operand
@@ -396,52 +465,38 @@ public:
/// \brief Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
- void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+ void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
/// \brief Fix operands in \p MI to satisfy constant bus requirements.
- void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+ void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- /// \brief Legalize all operands in this instruction. This function may
- /// create new instruction and insert them before \p MI.
- void legalizeOperands(MachineInstr *MI) const;
+ /// Copy a value from a VGPR (\p SrcReg) to an SGPR. This function can only
+ /// be used when it is known that the value in \p SrcReg is the same across
+ /// all threads in the wave.
+ /// \returns The SGPR register that \p SrcReg was copied to.
+ unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ MachineRegisterInfo &MRI) const;
- /// \brief Split an SMRD instruction into two smaller loads of half the
- // size storing the results in \p Lo and \p Hi.
- void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const;
+ void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
- void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ /// \brief Legalize all operands in this instruction. This function may
+ /// create new instructions and insert them before \p MI.
+ void legalizeOperands(MachineInstr &MI) const;
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
- unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const override;
-
- const TargetRegisterClass *getIndirectAddrRegClass() const override;
+ void insertWaitStates(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ int Count) const;
- MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const override;
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
- MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const override;
- void reserveIndirectRegisters(BitVector &Reserved,
- const MachineFunction &MF) const;
-
- void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
- unsigned SavReg, unsigned IndexReg) const;
-
- void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const;
+ /// \brief Return the number of wait states that result from executing this
+ /// instruction.
+ unsigned getNumWaitStates(const MachineInstr &MI) const;
/// \brief Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
@@ -463,8 +518,26 @@ public:
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
- bool isLowLatencyInstruction(const MachineInstr *MI) const;
- bool isHighLatencyInstruction(const MachineInstr *MI) const;
+ bool isLowLatencyInstruction(const MachineInstr &MI) const;
+ bool isHighLatencyInstruction(const MachineInstr &MI) const;
+
+ /// \brief Return the descriptor of the target-specific machine instruction
+ /// that corresponds to the specified pseudo or native opcode.
+ const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+ return get(pseudoToMCOpcode(Opcode));
+ }
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+
+ ArrayRef<std::pair<int, const char *>>
+ getSerializableTargetIndices() const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
};
namespace AMDGPU {
@@ -490,8 +563,9 @@ namespace AMDGPU {
int getAtomicNoRetOp(uint16_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
- const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
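+ // The following shifts are relative to the 64-bit resource words 2-3 value
+ // built by SIInstrInfo::getScratchRsrcWords23(), i.e. (32 + n) selects bit n
+ // of resource word 3.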
+ const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
+ const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
+ const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
} // End namespace AMDGPU
namespace SI {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8735277149a6..253cc32b27e4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SEA_ISLANDS">;
+ ">= SISubtarget::SEA_ISLANDS">;
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::SEA_ISLANDS">,
+ "SISubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -69,6 +69,11 @@ class sopk <bits<5> si, bits<5> vi = si> {
field bits<5> VI = vi;
}
+class dsop <bits<8> si, bits<8> vi = si> {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+}
+
// Specify an SMRD opcode for SI and SMEM opcode for VI
// FIXME: This should really be bits<5> si, Tablegen crashes if
@@ -78,9 +83,9 @@ class smrd<bits<8> si, bits<8> vi = si> {
field bits<8> VI = vi;
}
-// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUInstrInfo.cpp
-def SISubtarget {
+// Except for the NONE field, this must be kept in sync with the
+// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+def SIEncodingFamily {
int NONE = -1;
int SI = 0;
int VI = 1;
@@ -95,6 +100,14 @@ def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
[SDNPMayLoad, SDNPMemOperand]
>;
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
SDTypeProfile<0, 13,
[SDTCisVT<0, v4i32>, // rsrc(SGPR)
@@ -120,7 +133,7 @@ def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
>;
class SDSample<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+ SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
>;
@@ -129,9 +142,8 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
-def SIconstdata_ptr : SDNode<
- "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
- SDTCisVT<0, i64>]>
+def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
>;
//===----------------------------------------------------------------------===//
@@ -140,12 +152,14 @@ def SIconstdata_ptr : SDNode<
class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
(ld node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
- isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
- isConstantLoad(cast<LoadSDNode>(N), -1);
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
def flat_load : flat_ld <load>;
+def atomic_flat_load : flat_ld<atomic_load>;
def flat_az_extloadi8 : flat_ld <az_extloadi8>;
def flat_sextloadi8 : flat_ld <sextloadi8>;
def flat_az_extloadi16 : flat_ld <az_extloadi16>;
@@ -153,26 +167,50 @@ def flat_sextloadi16 : flat_ld <sextloadi16>;
class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
- isGlobalStore(dyn_cast<StoreSDNode>(N));
+ const MemSDNode *ST = cast<MemSDNode>(N);
+ return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
def flat_store: flat_st <store>;
+def atomic_flat_store: flat_st <atomic_store>;
def flat_truncstorei8 : flat_st <truncstorei8>;
def flat_truncstorei16 : flat_st <truncstorei16>;
+class MubufLoad <SDPatternOperator op> : PatFrag <
+ (ops node:$ptr), (op node:$ptr), [{
-def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(cast<LoadSDNode>(N)) ||
- isConstantLoad(cast<LoadSDNode>(N), -1);
+ const MemSDNode *LD = cast<MemSDNode>(N);
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
+def mubuf_load : MubufLoad <load>;
+def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
+
+def mubuf_load_atomic : MubufLoad <atomic_load>;
+
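+// SMRD loads: dword-aligned loads from the constant address space whose
+// memory operation is known to be uniform.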
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(cast<LoadSDNode>(N), -1) &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+ auto Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= 4 &&
+ Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
}]>;
//===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+
+def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>;
+
+//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
// to be glued to the memory instructions.
//===----------------------------------------------------------------------===//
@@ -182,7 +220,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
>;
def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
- return isLocalLoad(cast<LoadSDNode>(N));
+ return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
@@ -219,7 +257,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore,
def si_st_local : PatFrag <
(ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
- return isLocalStore(cast<StoreSDNode>(N));
+ return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def si_store_local : PatFrag <
@@ -247,9 +285,34 @@ def si_truncstore_local_i16 : PatFrag <
return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
-multiclass SIAtomicM0Glue2 <string op_name> {
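+// A setcc whose result is used only as a copy into SCC, i.e. a uniform
+// comparison that can stay on the scalar unit.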
+def si_setcc_uniform : PatFrag <
+ (ops node:$lhs, node:$rhs, node:$cond),
+ (setcc node:$lhs, node:$rhs, node:$cond), [{
+ for (SDNode *Use : N->uses()) {
+ if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
+ return false;
+
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+ if (Reg != AMDGPU::SCC)
+ return false;
+ }
+ return true;
+}]>;
+
+def si_uniform_br : PatFrag <
+ (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
+ return isUniformBr(N);
+}]>;
+
+def si_uniform_br_scc : PatFrag <
+ (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
+ return isCBranchSCC(N);
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
- def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
+ def _glue : SDNode <
+ !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
@@ -257,11 +320,13 @@ multiclass SIAtomicM0Glue2 <string op_name> {
}
defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
-defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
@@ -347,6 +412,10 @@ def IMM16bit : PatLeaf <(imm),
[{return isUInt<16>(N->getZExtValue());}]
>;
+def SIMM16bit : PatLeaf <(imm),
+ [{return isInt<16>(N->getSExtValue());}]
+>;
+
def IMM20bit : PatLeaf <(imm),
[{return isUInt<20>(N->getZExtValue());}]
>;
@@ -369,7 +438,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class SGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
@@ -402,188 +471,133 @@ def sopp_brtarget : Operand<OtherVT> {
let ParserMatchClass = SoppBrTarget;
}
-def const_ga : Operand<iPTR>;
-
-include "SIInstrFormats.td"
-include "VIInstrFormats.td"
+def si_ga : Operand<iPTR>;
-def MubufOffsetMatchClass : AsmOperandClass {
- let Name = "MubufOffset";
- let ParserMethod = "parseMubufOptionalOps";
- let RenderMethod = "addImmOperands";
+def InterpSlot : Operand<i32> {
+ let PrintMethod = "printInterpSlot";
}
-class DSOffsetBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "DSOffset"#parser;
- let ParserMethod = parser;
+def SendMsgMatchClass : AsmOperandClass {
+ let Name = "SendMsg";
+ let PredicateMethod = "isSendMsg";
+ let ParserMethod = "parseSendMsgOp";
let RenderMethod = "addImmOperands";
- let PredicateMethod = "isDSOffset";
}
-def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">;
-def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">;
-
-def DSOffset01MatchClass : AsmOperandClass {
- let Name = "DSOffset1";
- let ParserMethod = "parseDSOff01OptionalOps";
- let RenderMethod = "addImmOperands";
- let PredicateMethod = "isDSOffset01";
+def SendMsgImm : Operand<i32> {
+ let PrintMethod = "printSendMsg";
+ let ParserMatchClass = SendMsgMatchClass;
}
-class GDSBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "GDS"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
}
-def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">;
-def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">;
-
-class GLCBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "GLC"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
- let RenderMethod = "addImmOperands";
+def WAIT_FLAG : Operand <i32> {
+ let ParserMatchClass = SWaitMatchClass;
+ let PrintMethod = "printWaitFlag";
}
-def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">;
-def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">;
+include "SIInstrFormats.td"
+include "VIInstrFormats.td"
-class SLCBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "SLC"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
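+// A single parameterized match class replaces the per-operand *MatchClass
+// definitions above: the predicate, parser and default methods are all
+// derived from the operand name.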
+class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
+ let Name = "Imm"#CName;
+ let PredicateMethod = "is"#CName;
+ let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName);
let RenderMethod = "addImmOperands";
+ let IsOptional = Optional;
+ let DefaultMethod = !if(Optional, "default"#CName, ?);
}
-def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">;
-def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">;
-def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">;
-
-class TFEBaseMatchClass <string parser> : AsmOperandClass {
- let Name = "TFE"#parser;
- let PredicateMethod = "isImm";
- let ParserMethod = parser;
- let RenderMethod = "addImmOperands";
+class NamedOperandBit<string Name, AsmOperandClass MatchClass> : Operand<i1> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">;
-def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">;
-def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">;
-
-def OModMatchClass : AsmOperandClass {
- let Name = "OMod";
- let PredicateMethod = "isImm";
- let ParserMethod = "parseVOP3OptionalOps";
- let RenderMethod = "addImmOperands";
+class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def ClampMatchClass : AsmOperandClass {
- let Name = "Clamp";
- let PredicateMethod = "isImm";
- let ParserMethod = "parseVOP3OptionalOps";
- let RenderMethod = "addImmOperands";
+class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass {
- let Name = "SMRDOffset"#predicate;
- let PredicateMethod = predicate;
- let RenderMethod = "addImmOperands";
+class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
}
-def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">;
-def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass <
- "isSMRDLiteralOffset"
->;
-
let OperandType = "OPERAND_IMMEDIATE" in {
-def offen : Operand<i1> {
- let PrintMethod = "printOffen";
-}
-def idxen : Operand<i1> {
- let PrintMethod = "printIdxen";
-}
-def addr64 : Operand<i1> {
- let PrintMethod = "printAddr64";
-}
-def mbuf_offset : Operand<i16> {
- let PrintMethod = "printMBUFOffset";
- let ParserMatchClass = MubufOffsetMatchClass;
-}
-class ds_offset_base <AsmOperandClass mc> : Operand<i16> {
- let PrintMethod = "printDSOffset";
- let ParserMatchClass = mc;
-}
-def ds_offset : ds_offset_base <DSOffsetMatchClass>;
-def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>;
+def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
+def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
+def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
-def ds_offset0 : Operand<i8> {
- let PrintMethod = "printDSOffset0";
- let ParserMatchClass = DSOffset01MatchClass;
-}
-def ds_offset1 : Operand<i8> {
- let PrintMethod = "printDSOffset1";
- let ParserMatchClass = DSOffset01MatchClass;
-}
-class gds_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printGDS";
- let ParserMatchClass = mc;
-}
-def gds : gds_base <GDSMatchClass>;
+def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
+def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
+def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
-def gds01 : gds_base <GDS01MatchClass>;
+def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
-class glc_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printGLC";
- let ParserMatchClass = mc;
-}
+def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
+def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
-def glc : glc_base <GLCMubufMatchClass>;
-def glc_flat : glc_base <GLCFlatMatchClass>;
+def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>;
+def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>;
-class slc_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printSLC";
- let ParserMatchClass = mc;
-}
+def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
-def slc : slc_base <SLCMubufMatchClass>;
-def slc_flat : slc_base <SLCFlatMatchClass>;
-def slc_flat_atomic : slc_base <SLCFlatAtomicMatchClass>;
+def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
-class tfe_base <AsmOperandClass mc> : Operand <i1> {
- let PrintMethod = "printTFE";
- let ParserMatchClass = mc;
-}
+def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
+def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
+def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
-def tfe : tfe_base <TFEMubufMatchClass>;
-def tfe_flat : tfe_base <TFEFlatMatchClass>;
-def tfe_flat_atomic : tfe_base <TFEFlatAtomicMatchClass>;
+def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
+def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
+def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
+def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
-def omod : Operand <i32> {
- let PrintMethod = "printOModSI";
- let ParserMatchClass = OModMatchClass;
-}
+def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+
+} // End OperandType = "OPERAND_IMMEDIATE"
-def ClampMod : Operand <i1> {
- let PrintMethod = "printClampSI";
- let ParserMatchClass = ClampMatchClass;
-}
-def smrd_offset : Operand <i32> {
- let PrintMethod = "printU32ImmOperand";
- let ParserMatchClass = SMRDOffsetMatchClass;
+def VOPDstS64 : VOPDstOperand <SReg_64>;
+
+def FPInputModsMatchClass : AsmOperandClass {
+ let Name = "RegOrImmWithFPInputMods";
+ let ParserMethod = "parseRegOrImmWithFPInputMods";
+ let PredicateMethod = "isRegOrImmWithInputMods";
}
-def smrd_literal_offset : Operand <i32> {
- let PrintMethod = "printU32ImmOperand";
- let ParserMatchClass = SMRDLiteralOffsetMatchClass;
+def FPInputMods : Operand <i32> {
+ let PrintMethod = "printOperandAndFPInputMods";
+ let ParserMatchClass = FPInputModsMatchClass;
}
-} // End OperandType = "OPERAND_IMMEDIATE"
+def IntInputModsMatchClass : AsmOperandClass {
+ let Name = "RegOrImmWithIntInputMods";
+ let ParserMethod = "parseRegOrImmWithIntInputMods";
+ let PredicateMethod = "isRegOrImmWithInputMods";
+}
-def VOPDstS64 : VOPDstOperand <SReg_64>;
+def IntInputMods: Operand <i32> {
+ let PrintMethod = "printOperandAndIntInputMods";
+ let ParserMatchClass = IntInputModsMatchClass;
+}
//===----------------------------------------------------------------------===//
// Complex patterns
@@ -595,9 +609,13 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
+def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
@@ -606,6 +624,8 @@ def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
+
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
@@ -670,17 +690,24 @@ class EXPCommon : InstSI<
let EXP_CNT = 1;
let Uses = [EXEC];
+ let SchedRW = [WriteExport];
}
multiclass EXP_m {
let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;
+ def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ;
}
- def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;
+ def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe {
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
+ }
- def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi;
+ def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi {
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
+ }
}
//===----------------------------------------------------------------------===//
@@ -689,7 +716,7 @@ multiclass EXP_m {
class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SOP1 <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -697,17 +724,21 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> :
SOP1 <outs, ins, asm, []>,
SOP1e <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let isCodeGenOnly = 0;
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> :
SOP1 <outs, ins, asm, []>,
SOP1e <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let isCodeGenOnly = 0;
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
@@ -722,27 +753,27 @@ multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
}
multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0),
+ opName#" $sdst, $src0", pattern
>;
multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0),
+ opName#" $sdst, $src0", pattern
>;
// no input, 64-bit output.
multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
- def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>;
+ def "" : SOP1_Pseudo <opName, (outs SReg_64:$sdst), (ins), pattern>;
- def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins),
- opName#" $dst"> {
- let ssrc0 = 0;
+ def _si : SOP1_Real_si <op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst"> {
+ let src0 = 0;
}
- def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins),
- opName#" $dst"> {
- let ssrc0 = 0;
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst"> {
+ let src0 = 0;
}
}
@@ -763,13 +794,19 @@ multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> {
// 64-bit input, 32-bit output.
multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0),
+ opName#" $sdst, $src0", pattern
+>;
+
+// 32-bit input, 64-bit output.
+multiclass SOP1_64_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0),
+ opName#" $sdst, $src0", pattern
>;
class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
SOP2<outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let Size = 4;
@@ -784,15 +821,19 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> :
SOP2<outs, ins, asm, []>,
SOP2e<op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
SOP2<outs, ins, asm, []>,
SOP2e<op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
@@ -807,36 +848,49 @@ multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
}
multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
- op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
>;
-class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
- string opName, PatLeaf cond> : SOPC <
- op, (outs), (ins rc:$src0, rc:$src1),
- opName#" $src0, $src1", []> {
+multiclass SOP2_64_32_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+ op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1),
+ opName#" $sdst, $src0, $src1", pattern
+>;
+
+class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
+ string opName, list<dag> pattern = []> : SOPC <
+ op, (outs), (ins rc0:$src0, rc1:$src1),
+ opName#" $src0, $src1", pattern > {
let Defs = [SCC];
}
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
+ string opName, PatLeaf cond> : SOPC_Base <
+ op, rc, rc, opName,
+ [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
+}
-class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+class SOPC_CMP_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_32, i32, opName, cond>;
-class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
- : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
+class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_32, SSrc_32, opName, pattern>;
+
+class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
+ : SOPC_Base<op, SSrc_64, SSrc_32, opName, pattern>;
class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SOPK <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -844,16 +898,20 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> :
SOPK <outs, ins, asm, []>,
SOPKe <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
let isCodeGenOnly = 0;
}
class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> :
SOPK <outs, ins, asm, []>,
SOPKe <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
let isCodeGenOnly = 0;
}
@@ -868,14 +926,14 @@ multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm,
}
multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ def "" : SOPK_Pseudo <opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
pattern>;
- def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0">;
+ def _si : SOPK_Real_si <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
+ opName#" $sdst, $simm16">;
- def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0">;
+ def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16),
+ opName#" $sdst, $simm16">;
}
multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
@@ -908,15 +966,19 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins,
def _si : SOPK <outs, ins, asm, []>,
SOPK64e <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
let isCodeGenOnly = 0;
}
def _vi : SOPK <outs, ins, asm, []>,
SOPK64e <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
let isCodeGenOnly = 0;
}
}
@@ -926,86 +988,145 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins,
class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SMRD <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
- string asm> :
+class SMRD_IMM_Real_si <bits<5> op, string opName, dag outs, dag ins,
+ string asm> :
+ SMRD <outs, ins, asm, []>,
+ SMRD_IMMe <op>,
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class SMRD_SOFF_Real_si <bits<5> op, string opName, dag outs, dag ins,
+ string asm> :
SMRD <outs, ins, asm, []>,
- SMRDe <op, imm>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SMRD_SOFFe <op>,
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+
+class SMRD_IMM_Real_vi <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> :
+ SMRD <outs, ins, asm, pattern>,
+ SMEM_IMMe_vi <op>,
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
- string asm, list<dag> pattern = []> :
+class SMRD_SOFF_Real_vi <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern = []> :
SMRD <outs, ins, asm, pattern>,
- SMEMe_vi <op, imm>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SMEM_SOFFe_vi <op>,
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins,
+
+multiclass SMRD_IMM_m <smrd op, string opName, dag outs, dag ins,
string asm, list<dag> pattern> {
def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
- def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>;
+ def _si : SMRD_IMM_Real_si <op.SI, opName, outs, ins, asm>;
// glc is only applicable to scalar stores, which are not yet
// implemented.
let glc = 0 in {
- def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>;
+ def _vi : SMRD_IMM_Real_vi <op.VI, opName, outs, ins, asm>;
}
}
-multiclass SMRD_Inval <smrd op, string opName,
- SDPatternOperator node> {
- let hasSideEffects = 1, mayStore = 1 in {
- def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>;
+multiclass SMRD_SOFF_m <smrd op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern> {
- let sbase = 0, offset = 0 in {
- let sdst = 0 in {
- def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>;
- }
+ def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, ins, asm>;
+
+ // glc is only applicable to scalar stores, which are not yet
+ // implemented.
+ let glc = 0 in {
+ def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, ins, asm>;
+ }
+}
+
+multiclass SMRD_Special <smrd op, string opName, dag outs,
+ int sdst_ = ?,
+ string opStr = "",
+ list<dag> pattern = []> {
+ let hasSideEffects = 1 in {
+ def "" : SMRD_Pseudo <opName, outs, (ins), pattern>;
+
+ let sbase = 0, soff = 0, sdst = sdst_ in {
+ def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, (ins), opName#opStr>;
- let glc = 0, sdata = 0 in {
- def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>;
+ let glc = 0 in {
+ def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, (ins), opName#opStr>;
}
}
}
}
+multiclass SMRD_Inval <smrd op, string opName,
+ SDPatternOperator node> {
+ let mayStore = 1 in {
+ defm : SMRD_Special<op, opName, (outs), 0, "", [(node)]>;
+ }
+}
+
class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> :
- SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> {
+ SMRD_SOFF_Real_vi<op, opName, (outs), (ins), opName, [(node)]> {
let hasSideEffects = 1;
let mayStore = 1;
let sbase = 0;
- let sdata = 0;
+ let sdst = 0;
+ let glc = 0;
+ let soff = 0;
+}
+
+class SMEM_Ret <bits<8> op, string opName, SDPatternOperator node> :
+ SMRD_SOFF_Real_vi<op, opName, (outs SReg_64:$sdst), (ins),
+ opName#" $sdst", [(set i64:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ let mayStore = ?;
+ let mayLoad = ?;
+ let sbase = 0;
let glc = 0;
- let offset = 0;
+ let soff = 0;
}
multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
RegisterClass dstClass> {
- defm _IMM : SMRD_m <
- op, opName#"_IMM", 1, (outs dstClass:$dst),
+ defm _IMM : SMRD_IMM_m <
+ op, opName#"_IMM", (outs dstClass:$sdst),
(ins baseClass:$sbase, smrd_offset:$offset),
- opName#" $dst, $sbase, $offset", []
+ opName#" $sdst, $sbase, $offset", []
>;
def _IMM_ci : SMRD <
- (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
- opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
+ (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
+ opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
}
- defm _SGPR : SMRD_m <
- op, opName#"_SGPR", 0, (outs dstClass:$dst),
+ defm _SGPR : SMRD_SOFF_m <
+ op, opName#"_SGPR", (outs dstClass:$sdst),
(ins baseClass:$sbase, SReg_32:$soff),
- opName#" $dst, $sbase, $soff", []
+ opName#" $sdst, $sbase, $soff", []
>;
}
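+// Illustrative only (not part of this patch): SMRD_Helper is meant to be
+// instantiated from the instruction definitions, roughly like
+//   defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32>;
+// which expands into the _IMM, _IMM_ci and _SGPR variants declared above.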
@@ -1013,20 +1134,6 @@ multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
// Vector ALU classes
//===----------------------------------------------------------------------===//
-// This must always be right before the operand being input modified.
-def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
- let PrintMethod = "printOperandAndMods";
-}
-
-def InputModsMatchClass : AsmOperandClass {
- let Name = "RegWithInputMods";
-}
-
-def InputModsNoDefault : Operand <i32> {
- let PrintMethod = "printOperandAndMods";
- let ParserMatchClass = InputModsMatchClass;
-}
-
class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
!if (!eq(Src0.Value, untyped.Value), 0,
@@ -1050,12 +1157,12 @@ class getVOPSrc0ForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32);
}
-// Returns the register class to use for source 1 of VOP[12C] for the
-// given VT.
-class getVOPSrc1ForVT<ValueType VT> {
+// Returns the vreg (VGPR) register class to use for a source operand of the given VT.
+class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32);
}
+
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
@@ -1072,8 +1179,10 @@ class getVOP3SrcForVT<ValueType VT> {
// Returns 1 if the source arguments have modifiers, 0 if they do not.
// XXX - do f16 instructions?
class hasModifiers<ValueType SrcVT> {
- bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1, 0));
+ bit ret =
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ 0));
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1089,11 +1198,15 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
bit HasModifiers> {
dag ret =
+ !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP, V_CLREXCP)
+ (ins),
+ /* else */
!if (!eq(NumSrcArgs, 1),
!if (!eq(HasModifiers, 1),
// VOP1 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP1 without modifiers
(ins Src0RC:$src0)
@@ -1101,9 +1214,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
// VOP 2 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP2 without modifiers
(ins Src0RC:$src0, Src1RC:$src1)
@@ -1111,21 +1224,109 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
// VOP3 with modifiers
- (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
- InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
- InputModsNoDefault:$src2_modifiers, Src2RC:$src2,
- ClampMod:$clamp, omod:$omod)
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ FPInputMods:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod)
/* else */,
// VOP3 without modifiers
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
- /* endif */ )));
+ /* endif */ ))));
+}
+
+class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+ bit HasModifiers> {
+
+ dag ret = !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl),
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1_DPP with modifiers
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP2_DPP with modifiers
+ (ins FPInputMods:$src0_modifiers, Src0RC:$src0,
+ FPInputMods:$src1_modifiers, Src1RC:$src1,
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl)
+ /* endif */)));
+}
+
+class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+ bit HasFloatModifiers, ValueType DstVT> {
+
+ dag ret = !if(!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins),
+ !if(!eq(NumSrcArgs, 1),
+ !if(HasFloatModifiers,
+ // VOP1_SDWA with float modifiers
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)
+ /* else */,
+ // VOP1_SDWA with sext modifier
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if(HasFloatModifiers,
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with float modifiers
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA or VOPC_SDWA with float modifiers
+ (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)
+ ),
+ /* else */
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA with sext modifiers
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA or VOPC_SDWA with sext modifier
+ (ins IntInputMods:$src0_imodifiers, Src0RC:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC:$src1,
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel)
+ )
+ /* endif */)));
+}
+
+// Outs for DPP and SDWA
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+ dag ret = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+      (outs), // no dst for VOPC; the "vcc" token serves as dst in SDWA VOPC instructions
+ (outs DstRCDPP:$vdst)),
+ (outs)); // V_NOP
}
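+// For example, a VOPC profile (DstVT == i1) gets (outs) here because the compare
+// result is carried in VCC, while a 32-bit VOP1/VOP2 profile gets (outs DstRCDPP:$vdst).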
// Returns the assembly string for the inputs and outputs of a VOP[12C]
// instruction. This does not add the _e32 suffix, so it can be reused
// by getAsm64.
-class getAsm32 <bit HasDst, int NumSrcArgs> {
- string dst = "$dst";
+class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = ", $src0";
string src1 = ", $src1";
string src2 = ", $src2";
@@ -1137,7 +1338,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
@@ -1145,8 +1347,71 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string ret =
!if(!eq(HasModifiers, 0),
- getAsm32<HasDst, NumSrcArgs>.ret,
- "$dst, "#src0#src1#src2#"$clamp"#"$omod");
+ getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
+ dst#", "#src0#src1#src2#"$clamp"#"$omod");
+}
+
+class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+                   "$sdst", // use $sdst for VOPC
+                   "$vdst"),
+                   "");
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string args = !if(!eq(HasModifiers, 0),
+ getAsm32<0, NumSrcArgs, DstVT>.ret,
+ ", "#src0#src1);
+ string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+}
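+// Illustrative only: for a two-source profile with modifiers, getAsmDPP yields
+// "$vdst, $src0_modifiers, $src1_modifiers $dpp_ctrl$row_mask$bank_mask$bound_ctrl".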
+
+class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
+ ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+                     " vcc", // use the vcc token as dst for VOPC instructions
+ "$vdst"),
+ "");
+ string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers");
+ string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers");
+ string args = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ ", "#src0#"$clamp",
+ ", "#src0#", "#src1#"$clamp"
+ )
+ );
+ string sdwa = !if(!eq(NumSrcArgs, 0),
+ "",
+ !if(!eq(NumSrcArgs, 1),
+ " $dst_sel $dst_unused $src0_sel",
+ !if(!eq(DstVT.Size, 1),
+ " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC
+ " $dst_sel $dst_unused $src0_sel $src1_sel"
+ )
+ )
+ );
+ string ret = dst#args#sdwa;
+}
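+// Illustrative only: for a two-source VOP2 profile with float modifiers, getAsmSDWA
+// yields "$vdst, $src0_fmodifiers, $src1_fmodifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel".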
+
+// Returns 1 if the instruction supports DPP and SDWA, 0 otherwise.
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+ !if(!eq(DstVT.Size, 64),
+ 0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+ !if(!eq(Src0VT.Size, 64),
+ 0, // 64-bit src0
+                   !if(!eq(Src1VT.Size, 64),
+                      0, // 64-bit src1
+ 1
+ )
+ )
+ )
+ );
}
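+// For example, a two-source 32-bit profile gets HasExt = 1, while three-source
+// (VOP3) profiles and profiles with a 64-bit dst or source get HasExt = 0.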
class VOPProfile <list<ValueType> _ArgVT> {
@@ -1158,30 +1423,48 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src1VT = ArgVT[2];
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
- field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
+ field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+ field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
+ field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
+ field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
+ field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasModifiers = hasModifiers<Src0VT>.ret;
- field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs));
+ field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+
+ field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
// VOP3b instructions are a special case with a second explicit
// output. This is manually overridden for them.
field dag Outs32 = Outs;
field dag Outs64 = Outs;
+ field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers>.ret;
+ field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers>.ret;
+ field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, DstVT>.ret;
+
+ field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+}
- field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret;
- field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret;
+class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
+ let HasExt = 0;
}
// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
@@ -1194,6 +1477,9 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
@@ -1216,10 +1502,10 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
// Write out to vcc or arbitrary SGPR.
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
- let Asm32 = "$dst, vcc, $src0, $src1";
- let Asm64 = "$dst, $sdst, $src0, $src1";
- let Outs32 = (outs DstRC:$dst);
- let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+ let Asm32 = "$vdst, vcc, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
}
// Write out to vcc or arbitrary SGPR and read in from vcc or
@@ -1231,10 +1517,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
// restriction. SGPRs are still allowed because it should
// technically be possible to use VCC again as src0.
let Src0RC32 = VCSrc_32;
- let Asm32 = "$dst, vcc, $src0, $src1, vcc";
- let Asm64 = "$dst, $sdst, $src0, $src1, $src2";
- let Outs32 = (outs DstRC:$dst);
- let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+ let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+}
+
+// Read in from vcc or arbitrary SGPR
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
+ let Asm32 = "$vdst, $src0, $src1, vcc";
+ let Asm64 = "$vdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -1263,11 +1562,17 @@ class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, v
let Asm32 = "vcc, $src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
+ let Outs64 = (outs DstRC:$sdst);
}
class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> {
- let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- let Asm64 = "$dst, $src0_modifiers, $src1";
+ let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = "$sdst, $src0_modifiers, $src1";
+ let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0,
+ IntInputMods:$src1_imodifiers, Src1RC64:$src1,
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel";
+
}
def VOPC_I1_F32_F32 : VOPC_Profile<f32>;
@@ -1281,28 +1586,42 @@ def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
-def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
- let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2);
- let Asm64 = "$dst, $src0, $src1, $src2";
-}
def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
-def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
- field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
- field string Asm = "$dst, $src0, $vsrc1, $src2";
+def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> {
+ field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm);
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
+ field bit HasExt = 0;
+}
+def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> {
+ field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1);
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
+ field bit HasExt = 0;
}
def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers>.ret;
- let Asm32 = getAsm32<1, 2>.ret;
- let Asm64 = getAsm64<1, 2, HasModifiers>.ret;
+ let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0,
+ FPInputMods:$src1_modifiers, Src1RC32:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0,
+ FPInputMods:$src1_fmodifiers, Src1RC32:$src1,
+ VGPR_32:$src2, // stub argument
+ clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm32 = getAsm32<1, 2, f32>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret;
}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+// This class is used only with VOPC instructions. It uses $sdst for the output operand.
class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
InstAlias <asm, (inst)>, PredicateControl {
@@ -1313,13 +1632,13 @@ class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
!if (p.HasDst32,
!if (!eq(p.NumSrcArgs, 0),
// 1 dst, 0 src
- (inst p.DstRC:$dst),
+ (inst p.DstRC:$sdst),
!if (!eq(p.NumSrcArgs, 1),
// 1 dst, 1 src
- (inst p.DstRC:$dst, p.Src0RC32:$src0),
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0),
!if (!eq(p.NumSrcArgs, 2),
// 1 dst, 2 src
- (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
// else - unreachable
(inst)))),
// else
@@ -1368,7 +1687,7 @@ class AtomicNoRet <string noRetOp, bit isRet> {
class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP1Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_e32", SISubtarget.NONE>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1379,14 +1698,18 @@ class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> :
VOP1<op.SI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> :
VOP1<op.VI, outs, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
@@ -1399,6 +1722,49 @@ multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
}
+class VOP1_DPP <vop1 op, string opName, VOPProfile p> :
+ VOP1_DPPe <op.VI>,
+ VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "DPP";
+ let DisableDecoder = DisableVIDecoder;
+ let src0_modifiers = !if(p.HasModifiers, ?, 0);
+ let src1_modifiers = 0;
+}
+
+class SDWADisableFields <VOPProfile p> {
+ bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?);
+ bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?);
+ bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0),
+ 0,
+ !if(p.HasModifiers, ?, 0));
+ bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0),
+ 0,
+ !if(p.HasModifiers, 0, ?));
+ bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6,
+ !if(!eq(p.NumSrcArgs, 1), 6,
+ ?));
+ bits<2> src1_fmodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
+ !if(!eq(p.NumSrcArgs, 1), 0,
+ !if(p.HasModifiers, ?, 0)));
+ bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0,
+ !if(!eq(p.NumSrcArgs, 1), 0,
+ !if(p.HasModifiers, 0, ?)));
+ bits<3> dst_sel = !if(p.HasDst, ?, 6);
+ bits<2> dst_unused = !if(p.HasDst, ?, 2);
+ bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?);
+}
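+// Note (reader aid, not part of this patch): the sentinel values above follow the
+// SDWA encoding, where sel = 6 selects the full DWORD and dst_unused = 2 means
+// "preserve", so the corresponding field is effectively neutral when the operand
+// is absent.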
+
+class VOP1_SDWA <vop1 op, string opName, VOPProfile p> :
+ VOP1_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let AsmMatchConverter = "cvtSdwaVOP1";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
+
multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
string asm = opName#p.Asm32> {
@@ -1410,7 +1776,7 @@ multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP2Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE>,
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1418,14 +1784,18 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> :
VOP2 <op.SI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> :
VOP2 <op.VI, outs, ins, opName#asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern,
@@ -1449,6 +1819,26 @@ multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern,
}
+class VOP2_DPP <vop2 op, string opName, VOPProfile p> :
+ VOP2_DPPe <op.VI>,
+ VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> {
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "DPP";
+ let DisableDecoder = DisableVIDecoder;
+ let src0_modifiers = !if(p.HasModifiers, ?, 0);
+ let src1_modifiers = !if(p.HasModifiers, ?, 0);
+}
+
+class VOP2_SDWA <vop2 op, string opName, VOPProfile p> :
+ VOP2_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let AsmMatchConverter = "cvtSdwaVOP2";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
+
class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
@@ -1471,10 +1861,11 @@ class VOP3DisableModFields <bit HasSrc0Mods,
bits<1> clamp = !if(HasOutputMods, ?, 0);
}
-class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
- VOP3Common <outs, ins, "", pattern>,
+class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, "", pattern, HasMods, VOP3Only>,
VOP <opName>,
- SIMCInstr<opName#"_e64", SISubtarget.NONE>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e64", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -1483,44 +1874,96 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
field bit src0;
}
-class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3e <op>,
- SIMCInstr<opName#"_e64", SISubtarget.SI> {
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
-class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3e_vi <op>,
- SIMCInstr <opName#"_e64", SISubtarget.VI> {
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+class VOP3_C_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3ce <op>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class VOP3_C_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3ce_vi <op>,
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
-class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3be <op>,
- SIMCInstr<opName#"_e64", SISubtarget.SI> {
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
-class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
- VOP3Common <outs, ins, asm, []>,
+class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
VOP3be_vi <op>,
- SIMCInstr <opName#"_e64", SISubtarget.VI> {
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+class VOP3e_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3e <op>,
+ SIMCInstr<opName#"_e64", SIEncodingFamily.SI> {
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class VOP3e_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName,
+ bit HasMods = 0, bit VOP3Only = 0> :
+ VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>,
+ VOP3e_vi <op>,
+ SIMCInstr <opName#"_e64", SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, int NumSrcArgs, bit HasMods = 1> {
+ string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
!if(!eq(NumSrcArgs, 2), 0, 1),
HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
!if(!eq(NumSrcArgs, 2), 0, 1),
HasMods>;
@@ -1529,21 +1972,21 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
}
multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<0, 0, HasMods>;
// No VI instruction. This class is for SI only.
}
@@ -1552,13 +1995,13 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
}
@@ -1566,10 +2009,10 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
bit HasMods = 1> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods>;
// No VI instruction. This class is for SI only.
@@ -1579,13 +2022,26 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
// instead of an implicit VCC as in the VOP2b format.
multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit useSrc2Input = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+ bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
+
+ def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
+
+ def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
+}
- def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+// Same as VOP3b_2_3_m, but with no second destination (sdst), e.g. v_cndmask_b32.
+multiclass VOP3e_2_3_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> {
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>;
+
+ def _si : VOP3e_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<1, useSrc2Input, HasMods>;
- def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3e_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>,
VOP3DisableFields<1, useSrc2Input, HasMods>;
}
@@ -1594,19 +2050,19 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
bit HasMods, bit defExec,
string revOp, list<SchedReadWrite> sched> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
}
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ def _si : VOP3_C_Real_si <op.SI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
}
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ def _vi : VOP3_C_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
let SchedRW = sched;
@@ -1618,19 +2074,23 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
string asm, list<dag> pattern = []> {
let isPseudo = 1, isCodeGenOnly = 1 in {
def "" : VOPAnyCommon <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE>;
+ SIMCInstr<opName, SIEncodingFamily.NONE>;
}
def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
- SIMCInstr <opName, SISubtarget.SI> {
+ SIMCInstr <opName, SIEncodingFamily.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
def _vi : VOP3Common <outs, ins, asm, []>,
VOP3e_vi <op.VI3>,
VOP3DisableFields <1, 0, 0>,
- SIMCInstr <opName, SISubtarget.VI> {
+ SIMCInstr <opName, SIEncodingFamily.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
}
@@ -1641,15 +2101,19 @@ multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32,
defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
p.HasModifiers>;
+
+ def _dpp : VOP1_DPP <op, opName, p>;
+
+ def _sdwa : VOP1_SDWA <op, opName, p>;
}
multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP1_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))])
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))])
>;
multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
@@ -1659,9 +2123,9 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
- [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]),
opName, P.HasModifiers>;
}
@@ -1672,6 +2136,10 @@ multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32,
defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
revOp, p.HasModifiers>;
+
+ def _dpp : VOP2_DPP <op, opName, p>;
+
+ def _sdwa : VOP2_SDWA <op, opName, p>;
}
multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
@@ -1679,11 +2147,11 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
string revOp = opName> : VOP2_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp
>;
@@ -1695,14 +2163,41 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
opName, revOp, P.HasModifiers>;
}
+multiclass VOP2e_Helper <vop2 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64,
+ string revOp, bit useSGPRInput> {
+
+ let SchedRW = [Write32Bit] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+ }
+
+ defm _e64 : VOP3e_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
+ opName, revOp, p.HasModifiers, useSGPRInput>;
+ }
+}
+
+multiclass VOP2eInst <vop2 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName> : VOP2e_Helper <
+ op, opName, P, [],
+ !if(P.HasModifiers,
+ [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOp, !eq(P.NumSrcArgs, 3)
+>;
+
multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p,
list<dag> pat32, list<dag> pat64,
string revOp, bit useSGPRInput> {
@@ -1722,11 +2217,11 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
string revOp = opName> : VOP2b_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp, !eq(P.NumSrcArgs, 3)
>;
@@ -1746,31 +2241,35 @@ multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
: VOP2_VI3_Helper <
op, opName, P, [],
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
revOp
>;
-multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> {
+multiclass VOP2MADK <vop2 op, string opName, VOPProfile P, list<dag> pattern = []> {
- def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>;
+ def "" : VOP2_Pseudo <P.Outs, P.Ins32, pattern, opName>;
let isCodeGenOnly = 0 in {
- def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
- !strconcat(opName, VOP_MADK.Asm), []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI>,
+ def _si : VOP2Common <P.Outs, P.Ins32,
+ !strconcat(opName, P.Asm32), []>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI>,
VOP2_MADKe <op.SI> {
let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
- def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
- !strconcat(opName, VOP_MADK.Asm), []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI>,
+ def _vi : VOP2Common <P.Outs, P.Ins32,
+ !strconcat(opName, P.Asm32), []>,
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI>,
VOP2_MADKe <op.VI> {
let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
} // End isCodeGenOnly = 0
}
@@ -1778,37 +2277,55 @@ let isCodeGenOnly = 0 in {
class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> :
VOPCCommon <ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+ SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
+class VOPC_SDWA <vopc op, string opName, bit DefExec, VOPProfile p> :
+ VOPC_SDWAe <op.VI>,
+ VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>,
+ SDWADisableFields <p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let AsmMatchConverter = "cvtSdwaVOPC";
+ let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]);
+ let DecoderNamespace = "SDWA";
+ let DisableDecoder = DisableVIDecoder;
+}
+
multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
string opName, bit DefExec, VOPProfile p,
list<SchedReadWrite> sched,
string revOpName = "", string asm = opName#"_e32 "#op_asm,
string alias_asm = opName#" "#op_asm> {
- def "" : VOPC_Pseudo <ins, pattern, opName> {
+ def "" : VOPC_Pseudo <ins, pattern, opName>,
+ VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = sched;
+ let isConvergent = DefExec;
}
let AssemblerPredicates = [isSICI] in {
def _si : VOPC<op.SI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.SI> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let hasSideEffects = DefExec;
+ let isConvergent = DefExec;
let SchedRW = sched;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
}
} // End AssemblerPredicates = [isSICI]
let AssemblerPredicates = [isVI] in {
def _vi : VOPC<op.VI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ SIMCInstr <opName#"_e32", SIEncodingFamily.VI> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let hasSideEffects = DefExec;
+ let isConvergent = DefExec;
let SchedRW = sched;
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
}
} // End AssemblerPredicates = [isVI]
@@ -1819,10 +2336,13 @@ multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
list<dag> pat64, bit DefExec, string revOp,
VOPProfile p, list<SchedReadWrite> sched> {
- defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched,
+ revOp>;
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
opName, p.HasModifiers, DefExec, revOp, sched>;
+
+ def _sdwa : VOPC_SDWA <op, opName, DefExec, p>;
}
// Special case for class instructions which only have modifiers on
@@ -1832,9 +2352,14 @@ multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32,
VOPProfile p, list<SchedReadWrite> sched> {
defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
- defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64,
opName, p.HasModifiers, DefExec, revOp, sched>,
VOP3DisableModFields<1, 0, 0>;
+
+ def _sdwa : VOPC_SDWA <op, opName, DefExec, p> {
+ let src1_fmodifiers = 0;
+ let src1_imodifiers = ?;
+ }
}
multiclass VOPCInst <vopc op, string opName,
@@ -1845,12 +2370,12 @@ multiclass VOPCInst <vopc op, string opName,
VOPC_Helper <
op, opName, [],
!if(P.HasModifiers,
- [(set i1:$dst,
+ [(set i1:$sdst,
(setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
cond))],
- [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
DefExec, revOp, P, sched
>;
@@ -1859,9 +2384,9 @@ multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
list<SchedReadWrite> sched> : VOPC_Class_Helper <
op, opName, [],
!if(P.HasModifiers,
- [(set i1:$dst,
+ [(set i1:$sdst,
(AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
- [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
+ [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
DefExec, opName, P, sched
>;
@@ -1897,10 +2422,6 @@ multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>;
-multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
- list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
- op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods
->;
multiclass VOPC_CLASS_F32 <vopc op, string opName> :
VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>;
@@ -1914,32 +2435,40 @@ multiclass VOPC_CLASS_F64 <vopc op, string opName> :
multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>;
+
+multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat, int NumSrcArgs, bit HasMods,
+ bit VOP3Only = 0> : VOP3_m <
+ op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only
+>;
+
multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
- SDPatternOperator node = null_frag> : VOP3_Helper <
- op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64,
+ SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+ VOP3_Helper <
+ op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64,
!if(!eq(P.NumSrcArgs, 3),
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1,
P.Src2VT:$src2))]),
!if(!eq(P.NumSrcArgs, 2),
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
/* P.NumSrcArgs == 1 */,
!if(P.HasModifiers,
- [(set P.DstVT:$dst,
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
- P.NumSrcArgs, P.HasModifiers
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))),
+ P.NumSrcArgs, P.HasModifiers, VOP3Only
>;
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
@@ -1948,14 +2477,14 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
VOPProfile P,
SDPatternOperator node = null_frag> : VOP3_Helper <
op, opName,
- (outs P.DstRC.RegClass:$dst),
- (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0,
- InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1,
- InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2,
- ClampMod:$clamp,
+ (outs P.DstRC.RegClass:$vdst),
+ (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0,
+ FPInputMods:$src1_modifiers, P.Src1RC64:$src1,
+ FPInputMods:$src2_modifiers, P.Src2RC64:$src2,
+ clampmod:$clamp,
omod:$omod),
- "$dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
- [(set P.DstVT:$dst,
+ "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod",
+ [(set P.DstVT:$vdst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
@@ -1964,11 +2493,11 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
3, 1
>;
-multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> :
+multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> :
VOP3b_2_3_m <
op, P.Outs64, P.Ins64,
opName#" "#P.Asm64, pattern,
- opName, "", 1, 1
+ opName, "", 1, 1, VOP3Only
>;
class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
@@ -1987,7 +2516,7 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
VINTRPCommon <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1996,13 +2525,21 @@ class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
string asm> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe <op>,
- SIMCInstr<opName, SISubtarget.SI>;
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
string asm> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe_vi <op>,
- SIMCInstr<opName, SISubtarget.VI>;
+ SIMCInstr<opName, SIEncodingFamily.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
list<dag> pattern = []> {
@@ -2019,7 +2556,7 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
DS <outs, ins, "", pattern>,
- SIMCInstr <opName, SISubtarget.NONE> {
+ SIMCInstr <opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -2027,14 +2564,22 @@ class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS <outs, ins, asm, []>,
DSe <op>,
- SIMCInstr <opName, SISubtarget.SI> {
+ SIMCInstr <opName, SIEncodingFamily.SI> {
let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS <outs, ins, asm, []>,
DSe_vi <op>,
- SIMCInstr <opName, SISubtarget.VI>;
+ SIMCInstr <opName, SIEncodingFamily.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
+}
class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
DS_Real_si <op,opName, outs, ins, asm> {
@@ -2043,7 +2588,6 @@ class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm
bits<16> offset;
let offset0 = offset{7-0};
let offset1 = offset{15-8};
- let isCodeGenOnly = 0;
}
class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
@@ -2055,9 +2599,24 @@ class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm
let offset1 = offset{15-8};
}
+multiclass DS_1A_RET_ <dsop op, string opName, RegisterClass rc,
+ dag outs = (outs rc:$vdst),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ string asm = opName#" $vdst, $addr"#"$offset$gds"> {
+
+ def "" : DS_Pseudo <opName, outs, ins, []>;
+
+ let data0 = 0, data1 = 0 in {
+ def _si : DS_Off16_Real_si <op.SI, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op.VI, opName, outs, ins, asm>;
+ }
+}
+
+// TODO: DS_1A_RET can be inherited from DS_1A_RET_, but it's not working
+// for some reason. In fact, we can remove this class if we use dsop everywhere.
multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2070,8 +2629,8 @@ multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
- gds01:$gds),
+ dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1,
+ gds:$gds),
string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2084,7 +2643,7 @@ multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0"#"$offset$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -2096,11 +2655,25 @@ multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
}
}
-multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
+multiclass DS_1A_Off8_NORET <bits<8> op, string opName,
+ dag outs = (outs),
+ dag ins = (ins VGPR_32:$addr,
+ offset0:$offset0, offset1:$offset1, gds:$gds),
+ string asm = opName#" $addr $offset0"#"$offset1$gds"> {
+
+ def "" : DS_Pseudo <opName, outs, ins, []>;
+
+ let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in {
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+}
+
+multiclass DS_1A2D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
- string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
+ offset0:$offset0, offset1:$offset1, gds:$gds),
+ string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2113,7 +2686,7 @@ multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = "",
dag outs = (outs rc:$vdst),
- dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
let hasPostISelHook = 1 in {
@@ -2127,6 +2700,23 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
}
}
+multiclass DS_1A1D_PERMUTE <bits<8> op, string opName, RegisterClass rc,
+ SDPatternOperator node = null_frag,
+ dag outs = (outs rc:$vdst),
+ dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset),
+ string asm = opName#" $vdst, $addr, $data0"#"$offset"> {
+
+ let mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins,
+ [(set i32:$vdst,
+ (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))]>;
+
+ let data1 = 0, gds = 0 in {
+ def "_vi" : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
+}
+
multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
string noRetOp = "", dag ins,
dag outs = (outs rc:$vdst),
@@ -2145,14 +2735,14 @@ multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
string noRetOp = "", RegisterClass src = rc> :
DS_1A2D_RET_m <op, asm, rc, noRetOp,
(ins VGPR_32:$addr, src:$data0, src:$data1,
- ds_offset:$offset, gds:$gds)
+ offset:$offset, gds:$gds)
>;
multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
string noRetOp = opName,
dag outs = (outs),
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
- ds_offset:$offset, gds:$gds),
+ offset:$offset, gds:$gds),
string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -2166,7 +2756,7 @@ multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
multiclass DS_0A_RET <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins ds_offset:$offset, gds:$gds),
+ dag ins = (ins offset:$offset, gds:$gds),
string asm = opName#" $vdst"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
@@ -2181,7 +2771,7 @@ multiclass DS_0A_RET <bits<8> op, string opName,
multiclass DS_1A_RET_GDS <bits<8> op, string opName,
dag outs = (outs VGPR_32:$vdst),
- dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
+ dag ins = (ins VGPR_32:$addr, offset:$offset),
string asm = opName#" $vdst, $addr"#"$offset gds"> {
def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -2207,7 +2797,7 @@ multiclass DS_1A_GDS <bits<8> op, string opName,
multiclass DS_1A <bits<8> op, string opName,
dag outs = (outs),
- dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+ dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds),
string asm = opName#" $addr"#"$offset"#"$gds"> {
let mayLoad = 1, mayStore = 1 in {
@@ -2226,7 +2816,7 @@ multiclass DS_1A <bits<8> op, string opName,
class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
MTBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -2235,12 +2825,18 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
string asm> :
MTBUF <outs, ins, asm, []>,
MTBUFe <op>,
- SIMCInstr<opName, SISubtarget.SI>;
+ SIMCInstr<opName, SIEncodingFamily.SI> {
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> :
MTBUF <outs, ins, asm, []>,
MTBUFe_vi <op>,
- SIMCInstr <opName, SISubtarget.VI>;
+ SIMCInstr <opName, SIEncodingFamily.VI> {
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
+}
multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
list<dag> pattern> {
@@ -2311,7 +2907,7 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
MUBUF <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -2329,16 +2925,22 @@ class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins,
string asm> :
MUBUF <outs, ins, asm, []>,
MUBUFe <op.SI>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let lds = 0;
+ let AssemblerPredicate = SIAssemblerPredicate;
+ let DecoderNamespace="SICI";
+ let DisableDecoder = DisableSIDecoder;
}
class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins,
string asm> :
MUBUF <outs, ins, asm, []>,
MUBUFe_vi <op.VI>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let lds = 0;
+ let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
@@ -2399,38 +3001,82 @@ multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins,
// for VI appropriately.
}
+multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern, bit is_return> {
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ AtomicNoRet<opName, is_return>;
+
+ let tfe = 0 in {
+ let addr64 = 0 in {
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+ }
+}
+
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
ValueType vt, SDPatternOperator atomic> {
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
// No return variants
- let glc = 0 in {
+ let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
defm _ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_addr64", (outs),
(ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
+ SCSrc_32:$soffset, offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$slc", [], 0
>;
defm _OFFSET : MUBUFAtomicOffset_m <
op, name#"_offset", (outs),
- (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
+ (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset,
slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
+ name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0
>;
+
+ let offen = 1, idxen = 0 in {
+ defm _OFFEN : MUBUFAtomicOther_m <
+ op, name#"_offen", (outs),
+ (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0
+ >;
+ }
+
+ let offen = 0, idxen = 1 in {
+ defm _IDXEN : MUBUFAtomicOther_m <
+ op, name#"_idxen", (outs),
+ (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0
+ >;
+ }
+
+ let offen = 1, idxen = 1 in {
+ defm _BOTHEN : MUBUFAtomicOther_m <
+ op, name#"_bothen", (outs),
+ (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc",
+ [], 0
+ >;
+ }
} // glc = 0
// Variant that return values
let glc = 1, Constraints = "$vdata = $vdata_in",
+ AsmMatchConverter = "cvtMubufAtomicReturn",
DisableEncoding = "$vdata_in" in {
defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_rtn_addr64", (outs rc:$vdata),
(ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
+ SCSrc_32:$soffset, offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc",
[(set vt:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$vdata_in))], 1
@@ -2439,13 +3085,42 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _RTN_OFFSET : MUBUFAtomicOffset_m <
op, name#"_rtn_offset", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc",
+ offset:$offset, slc:$slc),
+ name#" $vdata, off, $srsrc, $soffset$offset glc$slc",
[(set vt:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
i1:$slc), vt:$vdata_in))], 1
>;
+ let offen = 1, idxen = 0 in {
+ defm _RTN_OFFEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_offen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc",
+ [], 1
+ >;
+ }
+
+ let offen = 0, idxen = 1 in {
+ defm _RTN_IDXEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_idxen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc",
+ [], 1
+ >;
+ }
+
+ let offen = 1, idxen = 1 in {
+ defm _RTN_BOTHEN : MUBUFAtomicOther_m <
+ op, name#"_rtn_bothen", (outs rc:$vdata),
+ (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+ offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc",
+ [], 1
+ >;
+ }
} // glc = 1
} // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
@@ -2461,8 +3136,8 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
let offen = 0, idxen = 0, vaddr = 0 in {
defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
(ins SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
[(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
i32:$soffset, i16:$offset,
i1:$glc, i1:$slc, i1:$tfe)))]>;
@@ -2471,33 +3146,32 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
let offen = 1, idxen = 0 in {
defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
(ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc,
tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 1 in {
defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
(ins VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
}
let offen = 1, idxen = 1 in {
defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
(ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 0 in {
defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
(ins VReg_64:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset,
+ SCSrc_32:$soffset, offset:$offset,
glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#
- "$glc"#"$slc"#"$tfe",
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$glc$slc$tfe",
[(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc,
@@ -2509,18 +3183,11 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
ValueType store_vt = i32, SDPatternOperator st = null_frag> {
let mayLoad = 0, mayStore = 1 in {
- defm : MUBUF_m <op, name, (outs),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
- tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
- "$glc"#"$slc"#"$tfe", []>;
-
let offen = 0, idxen = 0, vaddr = 0 in {
defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
(ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe",
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
} // offen = 0, idxen = 0, vaddr = 0
@@ -2528,35 +3195,35 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
let offen = 1, idxen = 0 in {
defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
(ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
- "$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset offen"#
+ "$offset$glc$slc$tfe", []>;
} // end offen = 1, idxen = 0
let offen = 0, idxen = 1 in {
defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs),
(ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc,
- SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+ SCSrc_32:$soffset, offset:$offset, glc:$glc,
slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>;
}
let offen = 1, idxen = 1 in {
defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs),
(ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>;
}
let offen = 0, idxen = 0 in {
defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
(ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc,
+ offset:$offset, glc:$glc, slc:$slc,
tfe:$tfe),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#
- "$offset"#"$glc"#"$slc"#"$tfe",
+ "$offset$glc$slc$tfe",
[(st store_vt:$vdata,
(MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
i32:$soffset, i16:$offset,
@@ -2593,21 +3260,24 @@ class flat <bits<7> ci, bits<7> vi = ci> {
class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
FLAT <0, outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.SI> {
+ SIMCInstr<opName, SIEncodingFamily.SI> {
let AssemblerPredicate = isCIOnly;
+ let DecoderNamespace="CI";
}
class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
FLAT <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.VI> {
+ SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicate = VIAssemblerPredicate;
+ let DecoderNamespace="VI";
+ let DisableDecoder = DisableVIDecoder;
}
multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
@@ -2623,8 +3293,8 @@ multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
multiclass FLAT_Load_Helper <flat op, string asm_name,
RegisterClass regClass,
dag outs = (outs regClass:$vdst),
- dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> {
+ dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe),
+ string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> {
let data = 0, mayLoad = 1 in {
@@ -2639,9 +3309,9 @@ multiclass FLAT_Load_Helper <flat op, string asm_name,
multiclass FLAT_Store_Helper <flat op, string asm_name,
RegisterClass vdataClass,
dag outs = (outs),
- dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc,
- slc_flat:$slc, tfe_flat:$tfe),
- string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> {
+ dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ string asm = asm_name#" $addr, $data$glc$slc$tfe"> {
let mayLoad = 0, mayStore = 1, vdst = 0 in {
@@ -2654,32 +3324,36 @@ multiclass FLAT_Store_Helper <flat op, string asm_name,
}
multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
+ ValueType vt, SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- dag outs_noret = (outs),
string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
- def "" : FLAT_Pseudo <NAME, outs_noret,
+ def "" : FLAT_Pseudo <NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>,
+ slc:$slc, tfe:$tfe), []>,
AtomicNoRet <NAME, 0>;
- def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret,
+ def _ci : FLAT_Real_ci <op.CI, NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ slc:$slc, tfe:$tfe),
asm_noret>;
- def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret,
+ def _vi : FLAT_Real_vi <op.VI, NAME, (outs),
(ins VReg_64:$addr, data_rc:$data,
- slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ slc:$slc, tfe:$tfe),
asm_noret>;
}
let glc = 1, hasPostISelHook = 1 in {
- defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>;
+ defm _RTN : FLAT_AtomicRet_m <
+ op, (outs vdst_rc:$vdst),
+ (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe),
+ asm_name#" $vdst, $addr, $data glc$slc$tfe",
+ [(set vt:$vdst,
+ (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))]
+ >;
}
}
@@ -2688,27 +3362,39 @@ class MIMG_Mask <string op, int channels> {
int Channels = channels;
}
+class mimg <bits<7> si, bits<7> vi = si> {
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class MIMG_Helper <dag outs, dag ins, string asm,
+ string dns=""> : MIMG<outs, ins, asm,[]> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+ let DecoderNamespace = dns;
+ let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
+ let AsmMatchConverter = "cvtMIMG";
+}
+
class MIMG_NoSampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc> : MIMG <
- op,
+ RegisterClass addr_rc,
+ string dns=""> : MIMG_Helper <
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc",
- []> {
+ (ins addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
let ssamp = 0;
- let mayLoad = 1;
- let mayStore = 0;
- let hasPostISelHook = 1;
}
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>,
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -2723,27 +3409,116 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> {
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
}
+class MIMG_Store_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ >, MIMGe<op> {
+ let ssamp = 0;
+ let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+ RegisterClass data_rc,
+ int channels> {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm> {
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
+ RegisterClass addr_rc> : MIMG_Helper <
+ (outs data_rc:$vdst),
+ (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ > {
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let hasPostISelHook = 0;
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+}
+
+class MIMG_Atomic_Real_si<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.SI>,
+ MIMGe<op.SI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isSICI];
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+}
+
+class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> :
+ MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.VI>,
+ MIMGe<op.VI> {
+ let isCodeGenOnly = 0;
+ let AssemblerPredicates = [isVI];
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
+ RegisterClass data_rc, RegisterClass addr_rc> {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+ SIMCInstr<name, SIEncodingFamily.NONE>;
+ }
+
+ let ssamp = 0 in {
+ def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+
+ def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+ }
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
+ defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
+ defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
+ defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V4", asm, data_rc, VReg_128>;
+}
+
class MIMG_Sampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc, int wqm> : MIMG <
- op,
+ RegisterClass src_rc,
+ int wqm,
+ string dns=""> : MIMG_Helper <
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc, SReg_128:$ssamp),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
- []> {
- let mayLoad = 1;
- let mayStore = 0;
- let hasPostISelHook = 1;
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ dns>, MIMGe<op> {
let WQM = wqm;
}
multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels, int wqm> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
+ !if(!eq(channels, 1), "AMDGPU", "")>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
MIMG_Mask<asm#"_V2", channels>;
@@ -2755,31 +3530,24 @@ multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
MIMG_Mask<asm#"_V16", channels>;
}
-multiclass MIMG_Sampler <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>;
+multiclass MIMG_Sampler <bits<7> op, string asm, int wqm=0> {
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
}
-multiclass MIMG_Sampler_WQM <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>;
-}
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
class MIMG_Gather_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
RegisterClass src_rc, int wqm> : MIMG <
- op,
(outs dst_rc:$vdata),
- (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
- SReg_256:$srsrc, SReg_128:$ssamp),
- asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
- #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
- []> {
+ (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc,
+ r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+ asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+ []>, MIMGe<op> {
let mayLoad = 1;
let mayStore = 0;
@@ -2789,10 +3557,12 @@ class MIMG_Gather_Helper <bits<7> op, string asm,
// 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
// (red,red,red,red) etc.) The ISA document doesn't mention
// this.
- // Therefore, disable all code which updates DMASK by setting these two:
- let MIMG = 0;
+ // Therefore, disable all code which updates DMASK by setting this:
+ let Gather4 = 1;
let hasPostISelHook = 0;
let WQM = wqm;
+
+ let isAsmParserOnly = 1; // TBD: fix it later
}
multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
@@ -2810,19 +3580,14 @@ multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
MIMG_Mask<asm#"_V16", channels>;
}
-multiclass MIMG_Gather <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>;
+multiclass MIMG_Gather <bits<7> op, string asm, int wqm=0> {
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
}
-multiclass MIMG_Gather_WQM <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>;
-}
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
//===----------------------------------------------------------------------===//
// Vector instruction mappings
@@ -2894,8 +3659,9 @@ def getMCOpcodeGen : InstrMapping {
let FilterClass = "SIMCInstr";
let RowFields = ["PseudoInstr"];
let ColFields = ["Subtarget"];
- let KeyCol = [!cast<string>(SISubtarget.NONE)];
- let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
+ let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
+ let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
+ [!cast<string>(SIEncodingFamily.VI)]];
}
def getAddr64Inst : InstrMapping {
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 89692ab71f4d..6427db87cd6f 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -18,35 +18,17 @@ int P20 = 1;
}
def INTERP : InterpSlots;
-def InterpSlot : Operand<i32> {
- let PrintMethod = "printInterpSlot";
-}
-
-def SendMsgImm : Operand<i32> {
- let PrintMethod = "printSendMsg";
-}
-
def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ ">= SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ "== SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def WAIT_FLAG : InstFlag<"printWaitFlag"> {
- let ParserMatchClass = SWaitMatchClass;
-}
-
let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
@@ -59,17 +41,17 @@ defm EXP : EXP_m;
// SMRD Instructions
//===----------------------------------------------------------------------===//
-// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SGPR_32 register class does not include M0
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32_XM0>;
defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32
+ smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -88,7 +70,15 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
-//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+let mayStore = ? in {
+// FIXME: mayStore = ? is a workaround for a TableGen bug: the mayStore flag
+// inferred from the instruction pattern differs from the one inferred from
+// the standalone Pat, and each considers the other contradictory.
+
+defm S_MEMTIME : SMRD_Special <smrd<0x1e, 0x24>, "s_memtime",
+ (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))]
+>;
+}
defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
int_amdgcn_s_dcache_inv>;
@@ -101,7 +91,7 @@ let isMoveImm = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
- } // let isRematerializeable = 1
+ } // End isReMaterializable = 1
let Uses = [SCC] in {
defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
@@ -111,11 +101,11 @@ let isMoveImm = 1 in {
let Defs = [SCC] in {
defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
- [(set i32:$dst, (not i32:$src0))]
+ [(set i32:$sdst, (not i32:$src0))]
>;
defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
- [(set i64:$dst, (not i64:$src0))]
+ [(set i64:$sdst, (not i64:$src0))]
>;
defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
@@ -123,7 +113,7 @@ let Defs = [SCC] in {
defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
- [(set i32:$dst, (bitreverse i32:$src0))]
+ [(set i32:$sdst, (bitreverse i32:$src0))]
>;
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
@@ -131,7 +121,7 @@ let Defs = [SCC] in {
defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
- [(set i32:$dst, (ctpop i32:$src0))]
+ [(set i32:$sdst, (ctpop i32:$src0))]
>;
defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
} // End Defs = [SCC]
@@ -139,34 +129,34 @@ let Defs = [SCC] in {
defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
- [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+ [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
>;
defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
- [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))]
+ [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
>;
defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
- [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))]
+ [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))]
>;
defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
- [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+ [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
>;
defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
- [(set i32:$dst, (sext_inreg i32:$src0, i16))]
+ [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
-defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
+defm S_BITSET0_B64 : SOP1_64_32 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
-defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
+defm S_BITSET1_B64 : SOP1_64_32 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
-defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
+defm S_SETPC_B64 : SOP1_1 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
-defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
+defm S_RFE_B64 : SOP1_1 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
@@ -206,36 +196,36 @@ let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>;
defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
- [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+ [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))]
>;
} // End isCommutable = 1
defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
- [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+ [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
- [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+ [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End isCommutable = 1
defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
- [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+ [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End Uses = [SCC]
defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
- [(set i32:$dst, (smin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
>;
defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
- [(set i32:$dst, (umin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
>;
defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
- [(set i32:$dst, (smax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
>;
defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
- [(set i32:$dst, (umax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
>;
} // End Defs = [SCC]
@@ -247,27 +237,27 @@ let Uses = [SCC] in {
let Defs = [SCC] in {
defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32",
- [(set i32:$dst, (and i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (and i32:$src0, i32:$src1))]
>;
defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
- [(set i64:$dst, (and i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (and i64:$src0, i64:$src1))]
>;
defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
- [(set i32:$dst, (or i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (or i32:$src0, i32:$src1))]
>;
defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
- [(set i64:$dst, (or i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (or i64:$src0, i64:$src1))]
>;
defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
- [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
>;
defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
- [(set i64:$dst, (xor i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
>;
defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
@@ -286,30 +276,30 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
- [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
>;
defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
- [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
>;
defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
- [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
>;
defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
- [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
>;
defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
- [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
>;
defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
- [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
>;
} // End Defs = [SCC]
defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32",
- [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
-defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
+ [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+defm S_BFM_B64 : SOP2_64_32_32 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
- [(set i32:$dst, (mul i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (mul i32:$src0, i32:$src1))]
>;
} // End AddedComplexity = 1
@@ -317,7 +307,7 @@ defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
let Defs = [SCC] in {
defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
-defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
+defm S_BFE_U64 : SOP2_64_32 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
} // End Defs = [SCC]
@@ -336,23 +326,23 @@ defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
// SOPC Instructions
//===----------------------------------------------------------------------===//
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>;
+def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>;
+def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>;
+def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>;
+def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
+def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">;
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -408,16 +398,23 @@ defm S_CBRANCH_I_FORK : SOPK_m <
sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs),
(ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16"
>;
-defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+
+let mayLoad = 1 in {
+defm S_GETREG_B32 : SOPK_m <
+ sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst),
+ (ins hwreg:$simm16), " $sdst, $simm16"
+>;
+}
+
defm S_SETREG_B32 : SOPK_m <
sopk<0x13, 0x12>, "s_setreg_b32", (outs),
- (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16"
+ (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst"
>;
// FIXME: Not on SI?
//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
defm S_SETREG_IMM32_B32 : SOPK_IMM32 <
sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs),
- (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16"
+ (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm"
>;
//===----------------------------------------------------------------------===//
@@ -429,10 +426,11 @@ def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
let isTerminator = 1 in {
def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(IL_retflag)]> {
+ [(AMDGPUendpgm)]> {
let simm16 = 0;
let isBarrier = 1;
let hasCtrlDep = 1;
+ let hasSideEffects = 1;
}
let isBranch = 1 in {
@@ -449,7 +447,8 @@ def S_CBRANCH_SCC0 : SOPP <
>;
def S_CBRANCH_SCC1 : SOPP <
0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16"
+ "s_cbranch_scc1 $simm16",
+ [(si_uniform_br_scc SCC, bb:$simm16)]
>;
} // End Uses = [SCC]
@@ -481,7 +480,7 @@ def S_CBRANCH_EXECNZ : SOPP <
let hasSideEffects = 1 in {
def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
- [(int_AMDGPU_barrier_local)]
+ [(int_amdgcn_s_barrier)]
> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
@@ -490,18 +489,31 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">;
+
+// On SI the documentation says sleep for approximately 64 * low 2
+// bits, consistent with the reported maximum of 448. On VI the
+// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
+// maximum really 15 on VI?
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+ "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC, M0] in {
+ // FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
[(AMDGPUsendmsg (i32 imm:$simm16))]
>;
} // End Uses = [EXEC, M0]
-def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">;
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
@@ -770,8 +782,8 @@ defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
let mayLoad = 0 in {
defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>;
-defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
-defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>;
}
defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
@@ -811,7 +823,11 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps
defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
-defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>;
+
+let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+defm DS_SWIZZLE_B32 : DS_1A_RET_ <dsop<0x35, 0x3d>, "ds_swizzle_b32", VGPR_32>;
+}
+
let mayStore = 0 in {
defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>;
defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>;
@@ -839,8 +855,8 @@ defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
let mayLoad = 0 in {
defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>;
-defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>;
-defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>;
+defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>;
}
defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
@@ -886,7 +902,7 @@ defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">;
defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">;
defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">;
defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">;
-defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">;
+defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">;
defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">;
defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">;
@@ -903,7 +919,7 @@ defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">;
defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">;
defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">;
defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">;
-defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">;
+defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">;
defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
@@ -937,16 +953,16 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper <
mubuf<0x07>, "buffer_store_format_xyzw", VReg_128
>;
defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
+ mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
+ mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
>;
defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
+ mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
+ mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
@@ -981,7 +997,9 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
>;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic <
+ mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+>;
defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
>;
@@ -1010,30 +1028,61 @@ defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
>;
-//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
+defm BUFFER_ATOMIC_INC : MUBUF_Atomic <
+ mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Atomic <
+ mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic <
+ mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+ mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic <
+ mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic <
+ mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic <
+ mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic <
+ mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic <
+ mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic <
+ mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic <
+ mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic <
+ mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic <
+ mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic <
+ mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic <
+ mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isSI, DisableVIDecoder = 1 in {
defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI
}
@@ -1062,28 +1111,28 @@ defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
-//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>;
-//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
-//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>;
-//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>;
-//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>;
-//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>;
-//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>;
-//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>;
-//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>;
-//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>;
-//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>;
-//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>;
-//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>;
-//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>;
-//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
@@ -1171,10 +1220,12 @@ let Uses = [EXEC] in {
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
- (ins VGPR_32:$src0),
+ (ins VS_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[]
->;
+> {
+ let isConvergent = 1;
+}
}
@@ -1234,7 +1285,7 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
VOP_F64_I32, uint_to_fp
>;
-} // let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
VOP_F32_F32, AMDGPUfract
@@ -1270,7 +1321,7 @@ defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
VOP_F32_F32, AMDGPUrsq
>;
-} //let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -1281,7 +1332,7 @@ defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
VOP_F64_F64, AMDGPUrsq
>;
-} // let SchedRW = [WriteDouble];
+} // End SchedRW = [WriteDouble];
defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
VOP_F32_F32, fsqrt
@@ -1312,34 +1363,34 @@ defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
- VOP_I32_F64
+ VOP_I32_F64, int_amdgcn_frexp_exp
>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
- VOP_F64_F64
+ VOP_F64_F64, int_amdgcn_frexp_mant
>;
defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64",
- VOP_F64_F64
+ VOP_F64_F64, AMDGPUfract
>;
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
- VOP_I32_F32
+ VOP_I32_F32, int_amdgcn_frexp_exp
>;
defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
- VOP_F32_F32
+ VOP_F32_F32, int_amdgcn_frexp_mant
>;
let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
-defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>;
+defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
}
let Uses = [M0, EXEC] in {
-defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
-defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
-defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_NO_EXT<VOP_I32_I32>>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_NO_EXT<VOP_I32_I32>>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
// These instructions only exist on SI and CI
@@ -1348,11 +1399,12 @@ let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>;
-defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32",
+ VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
- VOP_F32_F32, AMDGPUrsq_clamped
+ VOP_F32_F32, AMDGPUrsq_clamp
>;
defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
VOP_F32_F32, AMDGPUrsq_legacy
@@ -1364,7 +1416,7 @@ let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
- VOP_F64_F64, AMDGPUrsq_clamped
+ VOP_F64_F64, AMDGPUrsq_clamp
>;
} // End SchedRW = [WriteDouble]
@@ -1394,11 +1446,11 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS]
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in {
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly = 1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst"
+} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly = 1
let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in {
@@ -1426,15 +1478,9 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-multiclass V_CNDMASK <vop2 op, string name> {
- defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>;
-
- defm _e64 : VOP3_m <
- op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64,
- name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>;
-}
-
-defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">;
+defm V_CNDMASK_B32 : VOP2eInst <vop2<0x0, 0x0>, "v_cndmask_b32",
+ VOP2e_I32_I32_I32_I1
+>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
@@ -1450,7 +1496,7 @@ defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
let isCommutable = 1 in {
defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
- VOP_F32_F32_F32, int_AMDGPU_mul
+ VOP_F32_F32_F32
>;
defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
@@ -1501,16 +1547,16 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
-let Constraints = "$dst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
}
} // End isCommutable = 1
-defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
+defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32", VOP_MADMK>;
let isCommutable = 1 in {
-defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
+defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32", VOP_MADAK>;
} // End isCommutable = 1
let isCommutable = 1 in {
@@ -1540,11 +1586,14 @@ defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
} // End isCommutable = 1
+// These are special and do not read the exec mask.
+let isConvergent = 1, Uses = []<Register> in {
+
defm V_READLANE_B32 : VOP2SI_3VI_m <
vop3 <0x001, 0x289>,
"v_readlane_b32",
(outs SReg_32:$vdst),
- (ins VGPR_32:$src0, SCSrc_32:$src1),
+ (ins VS_32:$src0, SCSrc_32:$src1),
"v_readlane_b32 $vdst, $src0, $src1"
>;
@@ -1556,6 +1605,8 @@ defm V_WRITELANE_B32 : VOP2SI_3VI_m <
"v_writelane_b32 $vdst, $src0, $src1"
>;
+} // End isConvergent = 1
+
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1636,16 +1687,16 @@ defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
} // End isCommutable = 1
defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubeid
>;
defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubesc
>;
defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubetc
>;
defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, int_amdgcn_cubema
>;
defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
@@ -1666,6 +1717,10 @@ defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
VOP_F64_F64_F64_F64, fma
>;
+
+defm V_LERP_U8 : VOP3Inst <vop3<0x14d, 0x1cd>, "v_lerp_u8",
+ VOP_I32_I32_I32_I32, int_amdgcn_lerp
+>;
} // End isCommutable = 1
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
@@ -1695,13 +1750,13 @@ defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
VOP_I32_I32_I32_I32, AMDGPUumax3
>;
defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
- VOP_F32_F32_F32_F32
+ VOP_F32_F32_F32_F32, AMDGPUfmed3
>;
defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
- VOP_I32_I32_I32_I32
+ VOP_I32_I32_I32_I32, AMDGPUsmed3
>;
defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
- VOP_I32_I32_I32_I32
+ VOP_I32_I32_I32_I32, AMDGPUumed3
>;
//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
@@ -1710,7 +1765,7 @@ defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
VOP_I32_I32_I32_I32
>;
-////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
+//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
@@ -1727,26 +1782,26 @@ let SchedRW = [WriteDoubleAdd] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
- VOP_F64_F64_F64, fadd
+ VOP_F64_F64_F64, fadd, 1
>;
defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
- VOP_F64_F64_F64, fmul
+ VOP_F64_F64_F64, fmul, 1
>;
defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
- VOP_F64_F64_F64, fminnum
+ VOP_F64_F64_F64, fminnum, 1
>;
defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
- VOP_F64_F64_F64, fmaxnum
+ VOP_F64_F64_F64, fmaxnum, 1
>;
-} // isCommutable = 1
+} // End isCommutable = 1
defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
- VOP_F64_F64_I32, AMDGPUldexp
+ VOP_F64_F64_I32, AMDGPUldexp, 1
>;
-} // let SchedRW = [WriteDoubleAdd]
+} // End let SchedRW = [WriteDoubleAdd]
let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
@@ -1754,30 +1809,33 @@ defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
VOP_I32_I32_I32
>;
defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, mulhu
>;
+let DisableVIDecoder = 1 in { // Removed from VI as identical to V_MUL_LO_U32.
defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
VOP_I32_I32_I32
>;
+}
+
defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, mulhs
>;
-} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
+} // End isCommutable = 1, SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteFloatFMA, WriteSALU] in {
defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32",
- VOP3b_F32_I1_F32_F32_F32
+ VOP3b_F32_I1_F32_F32_F32, [], 1
>;
}
let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64",
- VOP3b_F64_I1_F64_F64_F64
+ VOP3b_F64_I1_F64_F64_F64, [], 1
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
let isCommutable = 1, Uses = [VCC, EXEC] in {
@@ -1814,7 +1872,7 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1828,7 +1886,7 @@ defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
} // End SubtargetPredicate = isSICI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isVI, DisableSIDecoder = 1 in {
defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
VOP_I64_I32_I64
@@ -1845,113 +1903,145 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
-def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
- (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
->;
+def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
+ (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
-} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
-
-let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> {
+ let VALU = 1;
}
+} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+
+let usesCustomInserter = 1, SALU = 1 in {
+def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
+ [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
+} // End let usesCustomInserter = 1, SALU = 1
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-let Uses = [EXEC], Defs = [EXEC] in {
+let hasSideEffects = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions have
+// been replaced with exec mask operations.
+def SI_MASK_BRANCH : PseudoInstSI <
+ (outs), (ins brtarget:$target, SReg_64:$dst)> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let SALU = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC, SCC] in {
let isBranch = 1, isTerminator = 1 in {
-def SI_IF: InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
->;
+def SI_IF: PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
+ [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> {
+ let Constraints = "";
+}
-def SI_ELSE : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
-> {
+def SI_ELSE : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target),
+ [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
let Constraints = "$src = $dst";
}
-def SI_LOOP : InstSI <
- (outs),
- (ins SReg_64:$saved, brtarget:$target),
- "si_loop $saved, $target",
- [(int_SI_loop i64:$saved, bb:$target)]
+def SI_LOOP : PseudoInstSI <
+ (outs), (ins SReg_64:$saved, brtarget:$target),
+ [(int_amdgcn_loop i64:$saved, bb:$target)]
>;
-} // end isBranch = 1, isTerminator = 1
+} // End isBranch = 1, isTerminator = 1
-def SI_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src),
- "si_else $dst, $src",
- [(set i64:$dst, (int_SI_break i64:$src))]
+
+def SI_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_break i64:$src))]
>;
-def SI_IF_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, SReg_64:$src),
- "si_if_break $dst, $vcc, $src",
- [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
+def SI_IF_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
+ [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
>;
-def SI_ELSE_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src0, SReg_64:$src1),
- "si_else_break $dst, $src0, $src1",
- [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
+def SI_ELSE_BREAK : PseudoInstSI <
+ (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
+ [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
>;
-def SI_END_CF : InstSI <
- (outs),
- (ins SReg_64:$saved),
- "si_end_cf $saved",
- [(int_SI_end_cf i64:$saved)]
+def SI_END_CF : PseudoInstSI <
+ (outs), (ins SReg_64:$saved),
+ [(int_amdgcn_end_cf i64:$saved)]
>;
-} // End Uses = [EXEC], Defs = [EXEC]
+} // End Uses = [EXEC], Defs = [EXEC, SCC]
let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : InstSI <
- (outs),
- (ins VSrc_32:$src),
- "si_kill $src",
- [(int_AMDGPU_kill f32:$src)]
->;
+def SI_KILL : PseudoInstSI <
+ (outs), (ins VSrc_32:$src),
+ [(int_AMDGPU_kill f32:$src)]> {
+ let isConvergent = 1;
+ let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : PseudoInstSI <
+ (outs), (ins VSrc_32:$src)> {
+ let isTerminator = 1;
+}
+
} // End Uses = [EXEC], Defs = [EXEC,VCC]
-} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
+} // End hasSideEffects = 1
-let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+def SI_PS_LIVE : PseudoInstSI <
+ (outs SReg_64:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_ps_live))]> {
+ let SALU = 1;
+}
-class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins rc:$src, VSrc_32:$idx, i32imm:$off),
- "si_indirect_src $dst, $temp, $src, $idx, $off",
- []
->;
+// Used as an isel pseudo to directly emit initialization with an
+// s_mov_b32 rather than a copy of another initialized
+// register. MachineCSE skips copies, and we don't want to have to
+// fold operands before it runs.
+def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> {
+ let Defs = [M0];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let SALU = 1;
+ let isReMaterializable = 1;
+}
-class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
- (outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
- "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
- []
-> {
- let Constraints = "$src = $dst";
+def SI_RETURN : PseudoInstSI <
+ (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let hasSideEffects = 1;
+ let SALU = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC, VCC, M0],
+ UseNamedOperandTable = 1 in {
+
+class SI_INDIRECT_SRC<RegisterClass rc> : PseudoInstSI <
+ (outs VGPR_32:$vdst, SReg_64:$sdst),
+ (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : PseudoInstSI <
+ (outs rc:$vdst, SReg_64:$sdst),
+ (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+ let Constraints = "$src = $vdst";
}
// TODO: We can support indirect SGPR access.
@@ -1967,25 +2057,20 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
-} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+} // End Uses = [EXEC], Defs = [EXEC,VCC,M0]
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
-
let UseNamedOperandTable = 1, Uses = [EXEC] in {
- def _SAVE : InstSI <
+ def _SAVE : PseudoInstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx),
- "", []
- > {
+ (ins sgpr_class:$src, i32imm:$frame_idx)> {
let mayStore = 1;
let mayLoad = 0;
}
- def _RESTORE : InstSI <
+ def _RESTORE : PseudoInstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx),
- "", []
- > {
+ (ins i32imm:$frame_idx)> {
let mayStore = 0;
let mayLoad = 1;
}
@@ -1993,9 +2078,9 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
}
// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use SGPR_32 register class for spills to prevent
+// instructions, so use SReg_32_XM0 register class for spills to prevent
// this from happening.
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>;
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
@@ -2003,21 +2088,18 @@ defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
- def _SAVE : InstSI <
+ def _SAVE : PseudoInstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
- "", []
- > {
+ SReg_32:$scratch_offset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
}
- def _RESTORE : InstSI <
+ def _RESTORE : PseudoInstSI <
(outs vgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
- "", []
- > {
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
+ i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
}
@@ -2033,29 +2115,19 @@ defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
let Defs = [SCC] in {
-def SI_CONSTDATA_PTR : InstSI <
+def SI_PC_ADD_REL_OFFSET : PseudoInstSI <
(outs SReg_64:$dst),
- (ins const_ga:$ptr),
- "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))]
-> {
+ (ins si_ga:$ptr),
+ [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> {
let SALU = 1;
}
} // End Defs = [SCC]
-} // end IsCodeGenOnly, isPseudo
-
-} // end SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
-def : Pat<
- (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
- (V_CNDMASK_B32_e64 $src2, $src1,
- (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0,
- DSTCLAMP.NONE, DSTOMOD.NONE))
->;
-
def : Pat <
(int_AMDGPU_kilp),
(SI_KILL 0xbf800000)
@@ -2067,7 +2139,6 @@ def : Pat<
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
>;
-/* int_SI_export */
def : Pat <
(int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
f32:$src0, f32:$src1, f32:$src2, f32:$src3),
@@ -2076,6 +2147,217 @@ def : Pat <
>;
//===----------------------------------------------------------------------===//
+// buffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc)),
+ (!cast<MUBUF>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$glc, imm:$slc),
+ (!cast<MUBUF>(opcode # _BOTHEN)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+
+//===----------------------------------------------------------------------===//
+// buffer_atomic patterns
+//===----------------------------------------------------------------------===//
+multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i1imm $slc))
+ >;
+
+ def : Pat<
+ (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (!cast<MUBUF>(opcode # _RTN_BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+ >;
+}
+
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+def : Pat<
+ (int_amdgcn_buffer_atomic_cmpswap
+ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+ (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+ imm:$slc),
+ (EXTRACT_SUBREG
+ (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+ (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ sub0)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// S_GETREG_B32 Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_s_getreg imm:$simm16),
+ (S_GETREG_B32 (as_i16imm $simm16))
+>;
+
+//===----------------------------------------------------------------------===//
+// DS_SWIZZLE Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
+>;
+
+//===----------------------------------------------------------------------===//
// SMRD Patterns
//===----------------------------------------------------------------------===//
@@ -2109,7 +2391,6 @@ let AddedComplexity = 100 in {
defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
@@ -2143,7 +2424,7 @@ def : Pat <
def : Pat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
- (S_BCNT1_I32_B64 $src), sub0,
+ (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 0), sub1))
>;
@@ -2168,8 +2449,8 @@ def : Pat <
//===----------------------------------------------------------------------===//
def : Pat <
- (int_AMDGPU_barrier_global),
- (S_BARRIER)
+ (int_amdgcn_s_waitcnt i32:$simm16),
+ (S_WAITCNT (as_i16imm $simm16))
>;
//===----------------------------------------------------------------------===//
@@ -2184,7 +2465,22 @@ let Predicates = [UnsafeFPMath] in {
def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
-}
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+ (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+ (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
+ (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+ (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+ (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [UnsafeFPMath]
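Restated as a worked equation, the two patterns above match the same identity,
$x - \lfloor x \rfloor = x + (-\lfloor x \rfloor) = \operatorname{fract}(x)$,
with the f32 form written as an fsub and the f64 form as an fadd of the
negated floor.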
//===----------------------------------------------------------------------===//
// VOP2 Patterns
@@ -2217,9 +2513,9 @@ def : Pat <
class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
(name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
- $addr, $rsrc, $sampler)
+ (opcode $addr, $rsrc, $sampler,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
>;
multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
@@ -2232,11 +2528,11 @@ multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
// Image only
class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm,
- i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
- (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
- (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
- $addr, $rsrc)
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
+ imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
>;
multiclass ImagePatterns<SDPatternOperator name, string opcode> {
@@ -2245,6 +2541,54 @@ multiclass ImagePatterns<SDPatternOperator name, string opcode> {
def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
}
+class ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc,
+ imm:$slc),
+ (opcode $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+class ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da,
+ imm:$glc, imm:$slc),
+ (opcode $data, $addr, $rsrc,
+ (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+ def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
+ (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
+ def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
+}
+
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
+ (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
+ imm:$r128, imm:$da, imm:$slc),
+ (EXTRACT_SUBREG
+ (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
+ $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
+ sub0)
+>;
+
// Basic sample
defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
@@ -2341,38 +2685,57 @@ def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
+defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
/* SIsample for simple 1D texture lookup */
def : Pat <
- (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
- (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT),
- (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
+ (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
>;
class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
class SampleShadowPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
>;
class SampleShadowArrayPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+ (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
>;
/* SIsample* for texture lookups consuming more address parameters */
@@ -2422,68 +2785,10 @@ defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
v16i32>;
-/* int_SI_imageload for texture fetches consuming varying address parameters */
-class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, imm),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
- (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
- (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
- (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
->;
-
-multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
- def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
- def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
-}
-
-multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
- def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
- def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
-}
-
-defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
-defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
-
-defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
-defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
-
-/* Image resource information */
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
-def : Pat <
- (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
- (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
->;
-
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
-//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
-//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
-//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
-//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
-
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2548,50 +2853,47 @@ foreach Index = 0-15 in {
>;
}
-def : BitConvert <i32, f32, SReg_32>;
+// FIXME: Why are only some of these type combinations defined for SReg and
+// VReg?
+// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
-
-def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <f32, i32, VGPR_32>;
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <f32, i32, SReg_32>;
+// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
-
def : BitConvert <f64, i64, VReg_64>;
-
-def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
-def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
-def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
-def : BitConvert <v2f32, f64, VReg_64>;
-def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
-def : BitConvert <v4f32, v4i32, VReg_128>;
+def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4f32, v4i32, VReg_128>;
-
+// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
-
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
+def : BitConvert <v2i64, v2f64, VReg_128>;
+def : BitConvert <v2f64, v2i64, VReg_128>;
-
-
-
-def : BitConvert <v8f32, v8i32, SReg_256>;
+// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
-def : BitConvert <v8i32, v32i8, SReg_256>;
-def : BitConvert <v32i8, v8i32, SReg_256>;
-def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
-def : BitConvert <v32i8, v8i32, VReg_256>;
+// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
@@ -2613,7 +2915,7 @@ def : Pat <
def : Pat <
(fneg (fabs f32:$src)),
- (S_OR_B32 $src, 0x80000000) /* Set sign bit */
+ (S_OR_B32 $src, 0x80000000) // Set sign bit
>;
// FIXME: Should use S_OR_B32
@@ -2703,15 +3005,9 @@ def : Pat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
-/* llvm.AMDGPU.pow */
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
- (int_AMDGPU_div f32:$src0, f32:$src1),
- (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
->;
-
-def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
(V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
@@ -2745,7 +3041,7 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// Offset in an 32Bit VGPR
+// Offset in a 32-bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
@@ -2759,12 +3055,6 @@ def : Pat <
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
-def : Pat <
- (int_SI_tid),
- (V_MBCNT_HI_U32_B32_e64 0xffffffff,
- (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
->;
-
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
@@ -2772,16 +3062,6 @@ def : Pat <
def : IMad24Pat<V_MAD_I32_I24>;
def : UMad24Pat<V_MAD_U32_U24>;
-def : Pat <
- (mulhu i32:$src0, i32:$src1),
- (V_MUL_HI_U32 $src0, $src1)
->;
-
-def : Pat <
- (mulhs i32:$src0, i32:$src1),
- (V_MUL_HI_I32 $src0, $src1)
->;
-
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -2839,19 +3119,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
-//
-// We need to use something for the data0, so we set a register to
-// -1. For the non-rtn variants, the manual says it does
-// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
-// will always do the increment so I'm assuming it's the same.
-class DSAtomicIncRetPat<DS inst, ValueType vt,
- Instruction LoadImm, PatFrag frag> : Pat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
- (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
->;
-
-
class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
@@ -2859,14 +3126,11 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
// 32-bit atomics.
-def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- V_MOV_B32_e32, si_atomic_load_add_local>;
-def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- V_MOV_B32_e32, si_atomic_load_sub_local>;
-
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
@@ -2874,18 +3138,14 @@ def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-
def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
// 64-bit atomics.
-def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- V_MOV_B64_PSEUDO, si_atomic_load_add_local>;
-def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- V_MOV_B64_PSEUDO, si_atomic_load_sub_local>;
-
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
@@ -2901,20 +3161,35 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
// MUBUF Patterns
//===----------------------------------------------------------------------===//
-multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
- PatFrag constant_ld> {
- def : Pat <
+class MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag constant_ld> : Pat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
(Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
>;
+
+multiclass MUBUFLoad_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
+ ValueType vt, PatFrag atomic_ld> {
+ def : Pat <
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
}
let Predicates = [isSICI] in {
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+
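+// Note: the atomic load patterns above hardcode glc = 1, so these buffer
+// loads are issued as globally coherent accesses (slc comes from the pattern,
+// tfe is 0).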
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
@@ -2975,6 +3250,25 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+multiclass MUBUFStore_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET,
+ ValueType vt, PatFrag atomic_st> {
+    // Stores follow the atomic op convention, so the address operand comes first.
+ def : Pat <
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+ >;
+
+ def : Pat <
+ (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+ >;
+}
+let Predicates = [isSICI] in {
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
+} // End Predicates = [isSICI]
+
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
u16imm:$offset)),
@@ -2987,22 +3281,6 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
-/*
-class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
- (Instr $value, $srsrc, $vaddr, $offset)
->;
-
-let Predicates = [isSICI] in {
-def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
-def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
-} // End Predicates = [isSICI]
-
-*/
-
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
@@ -3029,29 +3307,16 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
- // 1. Extract with offset
+ // Extract with offset
def : Pat<
- (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
+ (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
- // 2. Extract without offset
+ // Insert with offset
def : Pat<
- (eltvt (extractelt vt:$vec, i32:$idx)),
- (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
- >;
-
- // 3. Insert with offset
- def : Pat<
- (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
- >;
-
- // 4. Insert without offset
- def : Pat<
- (insertelt vt:$vec, eltvt:$val, i32:$idx),
- (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+ (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
}
@@ -3111,10 +3376,12 @@ def : ZExt_i64_i32_Pat<anyext>;
def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;
+// FIXME: We need to use COPY_TO_REGCLASS to work around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
- (S_ASHR_I32 $src, 31), sub1)
+ (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
>;
def : Pat <
@@ -3214,6 +3481,23 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+let Predicates = [isSICI] in {
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMTIME)
+>;
+}
+
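+// llvm.canonicalize may be lowered as a multiply by 1.0, so fcanonicalize is
+// selected to a V_MUL with 1.0 (CONST.FP32_ONE / CONST.FP64_ONE) as the first
+// source operand.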
+def : Pat<
+ (fcanonicalize f32:$src),
+ (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
+>;
+
+def : Pat<
+ (fcanonicalize f64:$src),
+ (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -3226,21 +3510,6 @@ let Predicates = [isSI] in {
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
-// Convert (x + (-floor(x)) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_CNDMASK_B64_PSEUDO
- (V_MIN_F64
- SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
- SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
- $x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
->;
-
// Convert floor(x) to (x - fract(x))
def : Pat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
@@ -3268,6 +3537,9 @@ def : Pat <
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
+def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+
//============================================================================//
// Assembler aliases
//============================================================================//
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 027a0a2f5167..a9b7c39096e7 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -13,8 +13,6 @@
let TargetPrefix = "SI", isTarget = 1 in {
-
- def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -50,9 +48,9 @@ let TargetPrefix = "SI", isTarget = 1 in {
llvm_i32_ty, // glc(imm)
llvm_i32_ty, // slc(imm)
llvm_i32_ty], // tfe(imm)
- [IntrReadArgMem]>;
+ [IntrReadMem, IntrArgMemOnly]>;
- def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
// Fully-flexible SAMPLE instruction.
class SampleRaw : Intrinsic <
@@ -172,28 +170,20 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_image_load_mip : Image;
def int_SI_getresinfo : Image;
- // Deprecated image and sample intrinsics.
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-
- def int_SI_sample : Sample;
- def int_SI_sampleb : Sample;
- def int_SI_sampled : Sample;
- def int_SI_samplel : Sample;
- def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-
/* Interpolation Intrinsics */
def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+} // End TargetPrefix = "SI", isTarget = 1
+let TargetPrefix = "amdgcn", isTarget = 1 in {
/* Control flow Intrinsics */
- def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
- def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
- def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
- def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
- def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
- def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+ def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+ def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+ def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+ def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+ def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+ def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
}
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 1bdb1f0ee9f9..9e972a569a0f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -26,7 +26,7 @@
//
// - Live interval recomputing seems inefficient. This currently only matches
// one pair, and recomputes live intervals and moves on to the next pair. It
-// would be better to compute a list of all merges that need to occur
+// would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
@@ -36,6 +36,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -61,7 +62,6 @@ private:
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
-
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
@@ -69,10 +69,6 @@ private:
MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
unsigned EltSize);
- void updateRegDefsUses(unsigned SrcReg,
- unsigned DstReg,
- unsigned SubIdx);
-
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -193,17 +189,6 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
return E;
}
-void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg,
- unsigned DstReg,
- unsigned SubIdx) {
- for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg),
- E = MRI->reg_end(); I != E; ) {
- MachineOperand &O = *I;
- ++I;
- O.substVirtReg(DstReg, SubIdx, *TRI);
- }
-}
-
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -268,19 +253,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(Read2);
+ LIS->InsertMachineInstrInMaps(*Read2);
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
// The new write to the original destination register is now the copy. Steal
// the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(I, Copy0);
- LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
+ LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
+ LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
I->eraseFromParent();
Paired->eraseFromParent();
@@ -291,7 +276,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
LIS->createAndComputeVirtRegInterval(DestReg);
if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+ SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
M0Segment->end = Read2Index.getRegSlot();
}
@@ -340,7 +325,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
@@ -359,8 +344,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// XXX - How do we express subregisters here?
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
- LIS->RemoveMachineInstrFromMaps(I);
- LIS->RemoveMachineInstrFromMaps(Paired);
+ LIS->RemoveMachineInstrFromMaps(*I);
+ LIS->RemoveMachineInstrFromMaps(*Paired);
I->eraseFromParent();
Paired->eraseFromParent();
@@ -368,7 +353,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
+ SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
M0Segment->end = Write2Index.getRegSlot();
}
@@ -423,9 +408,16 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
- const TargetSubtargetInfo &STM = MF.getSubtarget();
- TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
- TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
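+  // Bail out for optnone functions and for subtargets that have the
+  // load/store optimizer disabled.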
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ if (!STM.loadStoreOptEnabled())
+ return false;
+
+ TII = STM.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 126f6245dfc0..ee1d5dae70b7 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -52,6 +52,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -61,24 +62,24 @@
using namespace llvm;
-namespace {
+#define DEBUG_TYPE "si-lower-control-flow"
-class SILowerControlFlowPass : public MachineFunctionPass {
+namespace {
+class SILowerControlFlow : public MachineFunctionPass {
private:
static const unsigned SkipThreshold = 12;
- static char ID;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void Skip(MachineInstr &From, MachineOperand &To);
- void SkipIfDead(MachineInstr &MI);
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
void If(MachineInstr &MI);
- void Else(MachineInstr &MI);
+ void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
@@ -88,56 +89,118 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
- void IndirectSrc(MachineInstr &MI);
- void IndirectDst(MachineInstr &MI);
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>
+ splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+ void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg);
+
+ void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset);
+
+ bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+ std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
+ int Offset) const;
+ bool indirectSrc(MachineInstr &MI);
+ bool indirectDst(MachineInstr &MI);
public:
- SILowerControlFlowPass(TargetMachine &tm) :
+ static char ID;
+
+ SILowerControlFlow() :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI Lower control flow instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
+ return "SI Lower control flow pseudo instructions";
}
};
} // End anonymous namespace
-char SILowerControlFlowPass::ID = 0;
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+ "SI lower control flow", false, false)
-FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
- return new SILowerControlFlowPass(tm);
+char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+
+
+FunctionPass *llvm::createSILowerControlFlowPass() {
+ return new SILowerControlFlow();
}
-bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ if (From->succ_empty())
+ return false;
unsigned NumInstr = 0;
+ MachineFunction *MF = From->getParent();
- for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
- MBB = *MBB->succ_begin()) {
+ for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ MachineBasicBlock &MBB = *MBBI;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(I->getOpcode()))
+ continue;
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it become infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (I->isInlineAsm()) {
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ const char *AsmStr = I->getOperand(0).getSymbolName();
+
+        // The inline asm length estimate is the number of bytes assuming the
+        // longest instruction.
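+        // For example, a 24-byte estimate with an 8-byte maximum instruction
+        // length adds three to NumInstr (the numbers are only illustrative).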
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+ NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+ } else {
+ ++NumInstr;
+ }
- if (I->isBundle() || !I->isBundled())
- if (++NumInstr >= SkipThreshold)
- return true;
+ if (NumInstr >= SkipThreshold)
+ return true;
}
}
return false;
}
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return;
@@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
.addOperand(To);
}
-void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
-
+bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
- if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
- ShaderType::PIXEL ||
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
- return;
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+ MBB.addSuccessor(SkipBB);
- MachineBasicBlock::iterator Insert = &MI;
- ++Insert;
+ const DebugLoc &DL = MI.getDebugLoc();
// If the exec mask is non-zero, skip the next two instructions
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(3);
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
// Exec mask is zero: Export to NULL target...
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0);
-
- // ... and terminate wavefront
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+ .addImm(0)
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addImm(0)
+ .addImm(1)
+ .addImm(1)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
}
-void SILowerControlFlowPass::If(MachineInstr &MI) {
+void SILowerControlFlow::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) {
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Reg);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
@@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
+ if (ExecModified) {
+ // Adjust the saved exec to account for the modifications during the flow
+ // block that contains the ELSE. This can happen when WQM mode is switched
+ // off.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .addReg(Dst);
+ }
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Dst);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Break(MachineInstr &MI) {
+void SILowerControlFlow::Break(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Src);
@@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Vcc)
.addReg(Src);
@@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Saved = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Saved)
.addReg(Src);
@@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
@@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Branch(MachineInstr &MI) {
- if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+void SILowerControlFlow::Branch(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ if (MBB == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
}
-void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+void SILowerControlFlow::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
#ifndef NDEBUG
- const SIMachineFunctionInfo *MFI
- = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
// Kill is only allowed in pixel / geometry shaders.
- assert(MFI->getShaderType() == ShaderType::PIXEL ||
- MFI->getShaderType() == ShaderType::GEOMETRY);
+ assert(CallConv == CallingConv::AMDGPU_PS ||
+ CallConv == CallingConv::AMDGPU_GS);
#endif
// Clear this thread from the exec mask if the operand is negative
@@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+// All currently live registers must remain so in the remainder block.
+void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg) {
+ // Add reg defined in loop body.
+ RemainderLiveRegs.addReg(SaveReg);
+
+ if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
+ if (!Val->isUndef()) {
+ RemainderLiveRegs.addReg(Val->getReg());
+ LoopBB.addLiveIn(Val->getReg());
+ }
+ }
+
+ for (unsigned Reg : RemainderLiveRegs) {
+ if (MRI.isAllocatable(Reg))
+ RemainderBB.addLiveIn(Reg);
+ }
+
+ const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ if (!Src->isUndef())
+ LoopBB.addLiveIn(Src->getReg());
+
+ if (!IdxReg.isUndef())
+ LoopBB.addLiveIn(IdxReg.getReg());
+ LoopBB.sortUniqueLiveIns();
+}
+
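+// Emit the body of the waterfall loop: read one lane's index with
+// V_READFIRSTLANE, restrict EXEC to the lanes that share that index, run the
+// movrel, then clear those lanes from EXEC and loop until EXEC is zero.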
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+ DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Move index from VCC into M0
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
+
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+
+ if (Offset != 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(AMDGPU::M0)
+ .addImm(Offset);
+ }
+
+ // Do the actual move
+ LoopBB.insert(I, MovRel);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+}
+
+MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+
+ return SkipBB;
+}
+
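+// Split MBB at I: everything from I onward moves into a new RemainderBB, and
+// an empty LoopBB is inserted between the two; the caller wires up LoopBB's
+// successor edges.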
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineFunction *MF = MBB.getParent();
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessors(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
- MachineBasicBlock::iterator I = MI;
+ MachineBasicBlock::iterator I(&MI);
- unsigned Save = MI.getOperand(1).getReg();
- unsigned Idx = MI.getOperand(3).getReg();
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (AMDGPU::SReg_32RegClass.contains(Idx)) {
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx)
- .addImm(Offset);
+ if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
+ if (Offset != 0) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
+ .addImm(Offset);
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
}
+
MBB.insert(I, MovRel);
- } else {
+ MI.eraseFromParent();
+ return false;
+ }
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VGPR_32RegClass.contains(Idx));
+ MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ SaveOp->setIsDead(false);
+ unsigned Save = SaveOp->getReg();
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
+ // Reading from a VGPR requires looping over all workitems in the wavefront.
+ assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+ AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
+ LivePhysRegs RemainderLiveRegs(TRI);
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ RemainderLiveRegs.addLiveOuts(MBB);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
- // Do the actual move
- MBB.insert(I, MovRel);
+ std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ for (const MachineInstr &Inst : reverse(*RemainderBB))
+ RemainderLiveRegs.stepBackward(Inst);
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7);
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LoopBB->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(LoopBB);
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
+ *RemainderBB, Save, *Idx);
+
+ emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
+
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
- }
MI.eraseFromParent();
+ return true;
}
-/// \param @VecReg The register which holds element zero of the vector
-/// being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset]
-// As an output, this is a constant value that needs
-// to be added to the value stored in M0.
-void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
- unsigned &Reg,
- int &Offset) {
+/// \param VecReg The register which holds element zero of the vector being
+/// addressed into.
+///
+/// \param Offset The constant offset part of the indirect index,
+/// e.g. v0 = v[VecReg + Offset].
+///
+/// \returns A pair of the base register to use in the indirect addressing
+/// instruction and the constant offset that still needs to be added to the
+/// value stored in M0.
+std::pair<unsigned, int>
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
+ const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
+ int NumElts = SuperRC->getSize() / RC->getSize();
+
+ int BaseRegIdx = TRI->getHWRegIndex(SubReg);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (Offset >= NumElts)
+ return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
+ int RegIdx = BaseRegIdx + Offset;
if (RegIdx < 0) {
Offset = RegIdx;
RegIdx = 0;
@@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
Offset = 0;
}
- Reg = RC->getRegister(RegIdx);
+ unsigned Reg = RC->getRegister(RegIdx);
+ return std::make_pair(Reg, Offset);
}
-void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vec = MI.getOperand(2).getReg();
- int Off = MI.getOperand(4).getImm();
+ const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Vec, Reg, Off);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+ MI.eraseFromParent();
+ return false;
+ }
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg)
- .addReg(Vec, RegState::Implicit);
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
+ .addReg(SrcVec->getReg(), RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
-void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- int Off = MI.getOperand(4).getImm();
- unsigned Val = MI.getOperand(5).getReg();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Dst, Reg, Off);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
- .addReg(Reg, RegState::Define)
- .addReg(Val)
- .addReg(Dst, RegState::Implicit);
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+ .addOperand(*Val);
+ MI.eraseFromParent();
+ return false;
+ }
+
+ MachineInstr *MovRel =
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
+ .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
+ .addReg(Dst, RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
-bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ MachineFunction::iterator NextBB;
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
+ bool ExecModified = false;
+
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI) || TII->isDS(MI))
- NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;
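+      // Track whether anything in this block writes EXEC so SI_ELSE can
+      // re-AND its saved mask with the current EXEC value.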
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ ExecModified = true;
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
@@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::SI_ELSE:
- Else(MI);
+ Else(MI, ExecModified);
break;
case AMDGPU::SI_BREAK:
@@ -521,16 +747,20 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
- SkipIfDead(MI);
HaveKill = false;
+ // TODO: Insert skip if exec is 0?
}
+
EndCf(MI);
break;
- case AMDGPU::SI_KILL:
- if (Depth == 0)
- SkipIfDead(MI);
- else
+ case AMDGPU::SI_KILL_TERMINATOR:
+ if (Depth == 0) {
+ if (skipIfDead(MI, *NextBB)) {
+ NextBB = std::next(BI);
+ BE = MF.end();
+ }
+ } else
HaveKill = true;
Kill(MI);
break;
@@ -544,7 +774,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
- IndirectSrc(MI);
+ if (indirectSrc(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
case AMDGPU::SI_INDIRECT_DST_V1:
@@ -552,55 +790,46 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
- IndirectDst(MI);
+ if (indirectDst(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
+
+ case AMDGPU::SI_RETURN: {
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ break;
+ }
}
}
}
- if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
- MachineBasicBlock &MBB = MF.front();
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC).addReg(AMDGPU::EXEC);
- }
-
- // FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
- // Insert the prologue initializing the SGPRs pointing to the scratch space
- // for flat accesses.
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
// TODO: What to use with function calls?
-
- // FIXME: This is reporting stack size that is used in a scratch buffer
- // rather than registers as well.
- uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
- int IndirectBegin
- = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
- // Convert register index to 256-byte unit.
- uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
- assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
- "Stack limits should be smaller than 16-bits");
-
- // Initialize the flat scratch register pair.
- // TODO: Can we use one s_mov_b64 here?
-
- // Offset is in units of 256-bytes.
- MachineBasicBlock &MBB = MF.front();
- DebugLoc NoDL;
- MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
- const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
- assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
- .addImm(StackOffset);
-
- // Documentation says size is "per-thread scratch size in bytes"
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
- .addImm(StackSizeBytes);
+    // We will need to initialize the flat scratch register pair.
+ if (NeedFlat)
+ MFI->setHasFlatInstructions(true);
}
return true;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a2fa5fd93aad..dc1d20ddb274 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -47,8 +46,6 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -56,11 +53,8 @@ public:
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
+ "SI Lower i1 Copies", false, false)
char SILowerI1Copies::ID = 0;
@@ -72,9 +66,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 49677fc2b0a3..4d12a1ef9a93 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -1,19 +1,17 @@
-//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//===----------------------------------------------------------------------===//
-
#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -22,6 +20,11 @@
using namespace llvm;
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+ "amdgpu-spill-sgpr-to-vgpr",
+  cl::desc("Enable spilling SGPRs to VGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
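+// Note: passing -amdgpu-spill-sgpr-to-vgpr=false (e.g. to llc) makes
+// getSpilledReg() return an invalid SpilledReg, disabling the SGPR-to-VGPR
+// spilling path.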
// Pin the vtable to this file.
void SIMachineFunctionInfo::anchor() {}
@@ -48,12 +51,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
ReturnsVoid(true),
+ MaximumWorkGroupSize(0),
+ DebuggerReservedVGPRCount(0),
+ DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
+ DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),
PSInputEna(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
+ HasNonSpillStackObjects(false),
+ HasFlatInstructions(false),
+ NumSpilledSGPRs(0),
+ NumSpilledVGPRs(0),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -63,37 +74,45 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
GridWorkgroupCountX(false),
GridWorkgroupCountY(false),
GridWorkgroupCountZ(false),
- WorkGroupIDX(true),
+ WorkGroupIDX(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
PrivateSegmentWaveByteOffset(false),
- WorkItemIDX(true),
+ WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false) {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- if (getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F->getCallingConv())) {
KernargSegmentPtr = true;
+ WorkGroupIDX = true;
+ WorkItemIDX = true;
+ }
- if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
WorkGroupIDY = true;
- if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
WorkGroupIDZ = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
WorkItemIDY = true;
- if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
WorkItemIDZ = true;
- bool MaySpill = ST.isVGPRSpillingEnabled(this);
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(*F);
bool HasStackObjects = FrameInfo->hasStackObjects();
if (HasStackObjects || MaySpill)
@@ -105,12 +124,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
+
+ if (F->hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
}
- // X, XY, and XYZ are the only supported combinations, so make sure Y is
- // enabled if Z is.
- if (WorkItemIDZ)
- WorkItemIDY = true;
+ // We don't need to worry about accessing spills with flat instructions.
+ // TODO: On VI where we must use flat for global, we should be able to omit
+ // this if it is never used for generic access.
+ if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
+ ST.isAmdHsaOS())
+ FlatScratchInit = true;
+
+ if (AMDGPU::isCompute(F->getCallingConv()))
+ MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+ else
+ MaximumWorkGroupSize = ST.getWavefrontSize();
+
+ if (ST.debuggerReserveRegs())
+ DebuggerReservedVGPRCount = 4;
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -142,13 +174,24 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return KernargSegmentPtrUserSGPR;
}
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+ FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return FlatScratchInitUserSGPR;
+}
+
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
unsigned FrameIndex,
unsigned SubIdx) {
- const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+ if (!EnableSpillSGPRToVGPR)
+ return SpilledReg();
+
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
Offset += SubIdx * 4;
@@ -157,19 +200,14 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned Lane = (Offset / 4) % 64;
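// For example, a byte offset of 260 in the spill area occupies lane
// (260 / 4) % 64 == 1 of the corresponding lane VGPR.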
struct SpilledReg Spill;
+ Spill.Lane = Lane;
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
- if (LaneVGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
-
- // When compiling from inside Mesa, the compilation continues.
- // Select an arbitrary register to avoid triggering assertions
- // during subsequent passes.
- LaneVGPR = AMDGPU::VGPR0;
- }
+ if (LaneVGPR == AMDGPU::NoRegister)
+ // We have no VGPRs left for spilling SGPRs.
+ return Spill;
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
@@ -182,14 +220,10 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
}
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
- Spill.Lane = Lane;
return Spill;
}
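// Callers of getSpilledReg are expected to check hasReg() on the result and
// fall back to a real scratch-memory spill when no lane VGPR is available; a
// rough sketch of the pattern used in SIRegisterInfo::eliminateFrameIndex:
//   SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i);
//   if (Spill.hasReg()) {
//     // V_WRITELANE_B32 the SGPR into Spill.VGPR at Spill.Lane.
//   } else {
//     // Store through a scratch slot with SI_SPILL_V32_SAVE instead.
//   }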
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- // FIXME: We should get this information from kernel attributes if it
- // is available.
- return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
+ return MaximumWorkGroupSize;
}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 846ee5de057d..f5bd6366c717 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
-#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
#include "SIRegisterInfo.h"
+#include <array>
#include <map>
namespace llvm {
@@ -25,7 +25,7 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
-class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// FIXME: This should be removed and getPreloadedValue moved here.
friend struct SIRegisterInfo;
void anchor() override;
@@ -61,6 +61,15 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
unsigned PSInputAddr;
bool ReturnsVoid;
+ unsigned MaximumWorkGroupSize;
+
+ // Number of reserved VGPRs for debugger usage.
+ unsigned DebuggerReservedVGPRCount;
+ // Stack object indices for work group IDs.
+ std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
+ // Stack object indices for work item IDs.
+ std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
@@ -73,6 +82,11 @@ public:
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
+ bool HasNonSpillStackObjects;
+ bool HasFlatInstructions;
+
+ unsigned NumSpilledSGPRs;
+ unsigned NumSpilledVGPRs;
// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
@@ -96,7 +110,6 @@ private:
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
-
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -111,8 +124,9 @@ public:
unsigned VGPR;
int Lane;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
- SpilledReg() : VGPR(0), Lane(-1) { }
+ SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
bool hasLane() { return Lane != -1;}
+ bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
// SIMachineFunctionInfo definition
@@ -129,6 +143,7 @@ public:
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -161,6 +176,10 @@ public:
return PrivateSegmentWaveByteOffsetSystemSGPR;
}
+ void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+ PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+ }
+
bool hasPrivateSegmentBuffer() const {
return PrivateSegmentBuffer;
}
@@ -261,6 +280,10 @@ public:
ScratchWaveOffsetReg = Reg;
}
+ unsigned getQueuePtrUserSGPR() const {
+ return QueuePtrUserSGPR;
+ }
+
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
@@ -277,6 +300,38 @@ public:
HasSpilledVGPRs = Spill;
}
+ bool hasNonSpillStackObjects() const {
+ return HasNonSpillStackObjects;
+ }
+
+ void setHasNonSpillStackObjects(bool StackObject = true) {
+ HasNonSpillStackObjects = StackObject;
+ }
+
+ bool hasFlatInstructions() const {
+ return HasFlatInstructions;
+ }
+
+ void setHasFlatInstructions(bool UseFlat = true) {
+ HasFlatInstructions = UseFlat;
+ }
+
+ unsigned getNumSpilledSGPRs() const {
+ return NumSpilledSGPRs;
+ }
+
+ unsigned getNumSpilledVGPRs() const {
+ return NumSpilledVGPRs;
+ }
+
+ void addToSpilledSGPRs(unsigned num) {
+ NumSpilledSGPRs += num;
+ }
+
+ void addToSpilledVGPRs(unsigned num) {
+ NumSpilledVGPRs += num;
+ }
+
unsigned getPSInputAddr() const {
return PSInputAddr;
}
@@ -297,10 +352,70 @@ public:
ReturnsVoid = Value;
}
+ /// \returns Number of reserved VGPRs for debugger usage.
+ unsigned getDebuggerReservedVGPRCount() const {
+ return DebuggerReservedVGPRCount;
+ }
+
+ /// \returns Stack object index for \p Dim's work group ID.
+ int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+ assert(Dim < 3);
+ return DebuggerWorkGroupIDStackObjectIndices[Dim];
+ }
+
+ /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+ void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+ assert(Dim < 3);
+ DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+ }
+
+ /// \returns Stack object index for \p Dim's work item ID.
+ int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+ assert(Dim < 3);
+ return DebuggerWorkItemIDStackObjectIndices[Dim];
+ }
+
+ /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+ void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+ assert(Dim < 3);
+ DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+ }
+
+ /// \returns SGPR used for \p Dim's work group ID.
+ unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkGroupIDX());
+ return WorkGroupIDXSystemSGPR;
+ case 1:
+ assert(hasWorkGroupIDY());
+ return WorkGroupIDYSystemSGPR;
+ case 2:
+ assert(hasWorkGroupIDZ());
+ return WorkGroupIDZSystemSGPR;
+ }
+ llvm_unreachable("unexpected dimension");
+ }
+
+ /// \returns VGPR used for \p Dim's work item ID.
+ unsigned getWorkItemIDVGPR(unsigned Dim) const {
+ switch (Dim) {
+ case 0:
+ assert(hasWorkItemIDX());
+ return AMDGPU::VGPR0;
+ case 1:
+ assert(hasWorkItemIDY());
+ return AMDGPU::VGPR1;
+ case 2:
+ assert(hasWorkItemIDZ());
+ return AMDGPU::VGPR2;
+ }
+ llvm_unreachable("unexpected dimension");
+ }
+
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
} // End namespace llvm
-
#endif
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 1cfa98430020..7125b411c603 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "SIMachineScheduler.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -295,7 +295,7 @@ static bool isDefBetween(unsigned Reg,
const MachineInstr* MI = &*UI;
if (MI->isDebugValue())
continue;
- SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
if (InstSlot >= First && InstSlot <= Last)
return true;
}
@@ -327,9 +327,9 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
// Do not track physical registers, because it messes up the tracking.
- for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
- LiveInRegs.insert(Reg);
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ LiveInRegs.insert(RegMaskPair.RegUnit);
}
LiveOutRegs.clear();
// There are several possibilities to distinguish:
@@ -354,11 +354,12 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
// Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of isDefBetween removes case 4.
- for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = RegMaskPair.RegUnit;
if (TargetRegisterInfo::isVirtualRegister(Reg) &&
- isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
- LIS->getInstructionIndex(EndBlock).getRegSlot(),
- MRI, LIS)) {
+ isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
+ LIS)) {
LiveOutRegs.insert(Reg);
}
}
@@ -463,6 +464,9 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
for (SDep& Succ : SU->Succs) {
SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->NodeNum >= DAG->SUnits.size())
+ continue;
+
if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
continue;
@@ -521,12 +525,9 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
}
Preds.push_back(Pred);
-#ifndef NDEBUG
- for (SIScheduleBlock* S : Succs) {
- if (PredID == S->getID())
- assert(!"Loop in the Block Graph!\n");
- }
-#endif
+ assert(none_of(Succs,
+ [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+ "Loop in the Block Graph!");
}
void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
@@ -540,12 +541,9 @@ void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
if (Succ->isHighLatencyBlock())
++NumHighLatencySuccessors;
Succs.push_back(Succ);
-#ifndef NDEBUG
- for (SIScheduleBlock* P : Preds) {
- if (SuccID == P->getID())
- assert("Loop in the Block Graph!\n");
- }
-#endif
+ assert(none_of(Preds,
+ [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
+ "Loop in the Block Graph!");
}
#ifndef NDEBUG
@@ -712,8 +710,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() {
// Traverse top-down and give different colors to SUs depending
// on which combination of high-latency SUs they depend on.
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
// Already given.
@@ -754,8 +752,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() {
// Same as before, but BottomUp.
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
// Already given.
@@ -826,8 +824,8 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
unsigned DAGSize = DAG->SUnits.size();
std::vector<int> PendingColoring = CurrentColoring;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
std::set<unsigned> SUColorsPending;
@@ -893,8 +891,8 @@ void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -919,8 +917,8 @@ void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -940,8 +938,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
unsigned DAGSize = DAG->SUnits.size();
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -962,8 +960,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
unsigned DAGSize = DAG->SUnits.size();
std::map<unsigned, unsigned> ColorCount;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
if (Pos != ColorCount.end()) {
@@ -973,8 +971,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
}
}
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
std::set<unsigned> SUColors;
@@ -1006,8 +1004,8 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() {
unsigned DAGSize = DAG->SUnits.size();
int GroupID = NextNonReservedID++;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
bool hasSuccessor = false;
if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
@@ -1223,7 +1221,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
// is the most CPU-intensive operation of the scheduler.
// It would gain a lot if there were a way to recompute the
// LiveIntervals for the entire scheduling region.
- DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+ DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
PosNew.push_back(CurrentTopFastSched);
}
}
@@ -1249,7 +1247,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() {
DAG->getBB()->splice(POld, DAG->getBB(), PNew);
// Update LiveIntervals.
- DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+ DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
}
}
@@ -1675,70 +1673,10 @@ ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
// Does a topological sort over the SUs.
// Both TopDown and BottomUp
void SIScheduleDAGMI::topologicalSort() {
- std::vector<int> TopDownSU2Index;
- unsigned DAGSize = SUnits.size();
- std::vector<SUnit*> WorkList;
-
- DEBUG(dbgs() << "Topological Sort\n");
- WorkList.reserve(DAGSize);
-
- TopDownIndex2SU.resize(DAGSize);
- TopDownSU2Index.resize(DAGSize);
- BottomUpIndex2SU.resize(DAGSize);
-
- WorkList.push_back(&getExitSU());
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- int NodeNum = SU->NodeNum;
- unsigned Degree = SU->Succs.size();
- TopDownSU2Index[NodeNum] = Degree;
- if (Degree == 0) {
- assert(SU->Succs.empty() && "SUnit should have no successors");
- WorkList.push_back(SU);
- }
- }
-
- int Id = DAGSize;
- while (!WorkList.empty()) {
- SUnit *SU = WorkList.back();
- WorkList.pop_back();
- if (SU->NodeNum < DAGSize) {
- TopDownSU2Index[SU->NodeNum] = --Id;
- TopDownIndex2SU[Id] = SU->NodeNum;
- }
- for (SDep& Pred : SU->Preds) {
- SUnit *SU = Pred.getSUnit();
- if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum])
- WorkList.push_back(SU);
- }
- }
-
- BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
- TopDownIndex2SU.rend());
+ Topo.InitDAGTopologicalSorting();
-#ifndef NDEBUG
- // Check correctness of the ordering
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SDep& Pred : SU->Preds) {
- if (Pred.getSUnit()->NodeNum >= DAGSize)
- continue;
- assert(TopDownSU2Index[SU->NodeNum] >
- TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
- "Wrong Top Down topological sorting");
- }
- }
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SDep& Succ : SU->Succs) {
- if (Succ.getSUnit()->NodeNum >= DAGSize)
- continue;
- assert(TopDownSU2Index[SU->NodeNum] <
- TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
- "Wrong Bottom Up topological sorting");
- }
- }
-#endif
+ TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
+ BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}
// Move low latencies further from their user without
@@ -1759,7 +1697,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
for (SDep& PredDep : SU->Preds) {
SUnit *Pred = PredDep.getSUnit();
- if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
IsLowLatencyUser = true;
}
if (Pred->NodeNum >= DAGSize)
@@ -1769,7 +1707,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
MinPos = PredPos + 1;
}
- if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
unsigned BestPos = LastLowLatencyUser + 1;
if ((int)BestPos <= LastLowLatencyPos)
BestPos = LastLowLatencyPos + 1;
@@ -1794,7 +1732,7 @@ void SIScheduleDAGMI::moveLowLatencies() {
bool CopyForLowLat = false;
for (SDep& SuccDep : SU->Succs) {
SUnit *Succ = SuccDep.getSUnit();
- if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+ if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
CopyForLowLat = true;
}
}
@@ -1855,7 +1793,6 @@ void SIScheduleDAGMI::schedule()
SU.dumpAll(this)
);
- Topo.InitDAGTopologicalSorting();
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
// We reuse several ScheduleDAGMI and ScheduleDAGMILive
@@ -1878,20 +1815,21 @@ void SIScheduleDAGMI::schedule()
for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
SUnit *SU = &SUnits[i];
- unsigned BaseLatReg, OffLatReg;
- if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ unsigned BaseLatReg;
+ int64_t OffLatReg;
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
- if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
- OffLatReg, TRI))
+ if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
+ TRI))
LowLatencyOffset[i] = OffLatReg;
- } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
+ } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
IsHighLatencySU[i] = 1;
}
SIScheduler Scheduler(this);
Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
-#if 0 // To enable when handleMove fix lands
+
// if VGPR usage is extremely high, try other good performing variants
// which could lead to lower VGPR usage
if (Best.MaxVGPRUsage > 180) {
@@ -1930,7 +1868,7 @@ void SIScheduleDAGMI::schedule()
Best = Temp;
}
}
-#endif
+
ScheduledSUnits = Best.SUs;
ScheduledSUnitsInv.resize(SUnits.size());
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index b270136811c6..117aed497cc2 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -418,7 +418,7 @@ public:
SISchedulerBlockSchedulerVariant ScheduleVariant);
};
-class SIScheduleDAGMI : public ScheduleDAGMILive {
+class SIScheduleDAGMI final : public ScheduleDAGMILive {
const SIInstrInfo *SITII;
const SIRegisterInfo *SITRI;
@@ -441,7 +441,7 @@ public:
// To init Block's RPTracker.
void initRPTracker(RegPressureTracker &RPTracker) {
- RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
}
MachineBasicBlock *getBB() { return BB; }
@@ -460,8 +460,10 @@ public:
unsigned &VgprUsage,
unsigned &SgprUsage);
std::set<unsigned> getInRegs() {
- std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(),
- RPTracker.getPressure().LiveInRegs.end());
+ std::set<unsigned> InRegs;
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ InRegs.insert(RegMaskPair.RegUnit);
+ }
return InRegs;
};
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 025ed2b5b76b..0dd88ee45c58 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -23,7 +24,75 @@
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned SIMDPerCU = 4;
+
+ unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+ return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+ MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+ unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+ unsigned ReservedSGPRCount;
+
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ TotalSGPRCountPerSIMD = 800;
+ AddressableSGPRCount = 102;
+ SGPRUsageAlignment = 16;
+ ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+ } else {
+ TotalSGPRCountPerSIMD = 512;
+ AddressableSGPRCount = 104;
+ SGPRUsageAlignment = 8;
+ ReservedSGPRCount = 2; // VCC
+ }
+
+ unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+ MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+ if (ST.hasSGPRInitBug())
+ MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+ unsigned TotalVGPRCountPerSIMD = 256;
+ unsigned VGPRUsageAlignment = 4;
+
+ return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+ VGPRUsageAlignment);
+}
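+// A rough worked example of the register budgets computed above, assuming a
+// VI target, a wavefront size of 64, and a maximum work group size of 2560:
+//   waves/SIMD  = alignTo(2560, 4 * 64) / (4 * 64)      = 10
+//   SGPR budget = min(alignDown(800 / 10, 16) - 6, 102) = 74
+//   VGPR budget = alignDown(256 / 10, 4)                = 24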
+
+static bool hasPressureSet(const int *PSets, unsigned PSetID) {
+ for (unsigned i = 0; PSets[i] != -1; ++i) {
+ if (PSets[i] == (int)PSetID)
+ return true;
+ }
+ return false;
+}
+
+void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
+ BitVector &PressureSets) const {
+ for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
+ const int *PSets = getRegUnitPressureSets(*U);
+ if (hasPressureSet(PSets, PSetID)) {
+ PressureSets.set(PSetID);
+ break;
+ }
+ }
+}
+
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
+ SGPRPressureSets(getNumRegPressureSets()),
+ VGPRPressureSets(getNumRegPressureSets()) {
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPR32SetID = NumRegPressureSets;
@@ -33,6 +102,9 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
SGPR32SetID = i;
else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
VGPR32SetID = i;
+
+ classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
+ classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
}
assert(SGPR32SetID < NumRegPressureSets &&
VGPR32SetID < NumRegPressureSets);
@@ -47,38 +119,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- // Leave space for flat_scr, xnack_mask, vcc, and alignment
- unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
- unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
- // 100/101 for vcc. This is the next sgpr128 down.
- return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
- }
-
- return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+ unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
- return AMDGPU::SGPR_32RegClass.getRegister(Idx);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Next register before reservations for flat_scr, xnack_mask, vcc,
- // and scratch resource.
- return AMDGPU::SGPR91;
+ unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned Reg;
+
+ // Try to place it in a hole after PrivateSegmentBufferReg.
+ if (RegCount & 3) {
+ // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+ // alignment constraints, so we have a hole where we can put the wave offset.
+ Reg = RegCount - 1;
+ } else {
+ // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+ // wave offset before it.
+ Reg = RegCount - 5;
}
-
- return AMDGPU::SGPR95;
+ return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
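// For illustration of the two reservation helpers above: with a budget of 102
// SGPRs from getMaxWorkGroupSGPRCount, the private segment buffer resource
// lands in SGPR96..SGPR99 (alignDown(102, 4) - 4 == 96), and since 102 is not
// a multiple of 4 the wave byte offset is placed in the hole at SGPR101.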
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -90,35 +151,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
- // Reserve the last 2 registers so we will always have at least 2 more that
- // will physically contain VCC.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
-
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
- // for VCC/XNACK_MASK/FLAT_SCR.
- //
- // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
- // SGPRs when the XNACK feature is not used. This is currently not done
- // because the code that counts SGPRs cannot account for such holes.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ // Reserve Trap Handler registers - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::TBA);
+ reserveRegisterTuples(Reserved, AMDGPU::TMA);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
+ reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
+
+ unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
+
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+ unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
- // Tonga and Iceland can only allocate a fixed number of SGPRs due
- // to a hw bug.
- if (ST.hasSGPRInitBug()) {
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
- unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
- for (unsigned i = Limit; i < NumSGPRs; ++i) {
- unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
+ for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -138,48 +194,182 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ // Reserve registers for debugger usage if the "amdgpu-debugger-reserve-regs"
+ // subtarget feature was specified.
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ if (ST.debuggerReserveRegs()) {
+ unsigned ReservedVGPRFirst =
+ MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
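+ // For example, with a VGPR budget of 24 and DebuggerReservedVGPRCount == 4,
+ // ReservedVGPRFirst is 20 and VGPR20..VGPR23 end up reserved.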
+ for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ }
+
return Reserved;
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
// FIXME: We should adjust the max number of waves based on LDS size.
- unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
- STI.getMaxWavesPerCU());
+ unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
unsigned VSLimit = SGPRLimit + VGPRLimit;
- for (regclass_iterator I = regclass_begin(), E = regclass_end();
- I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
+ // FIXME: This is a hack. We should never be considering the pressure of
+ // these since no virtual register should ever have this class.
+ return VSLimit;
+ }
- unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1);
- unsigned Limit;
+ if (SGPRPressureSets.test(Idx))
+ return SGPRLimit;
- if (isPseudoRegClass(RC)) {
- // FIXME: This is a hack. We should never be considering the pressure of
- // these since no virtual register should ever have this class.
- Limit = VSLimit;
- } else if (isSGPRClass(RC)) {
- Limit = SGPRLimit / NumSubRegs;
- } else {
- Limit = VGPRLimit / NumSubRegs;
- }
+ return VGPRLimit;
+}
+
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+ return Fn.getFrameInfo()->hasStackObjects();
+}
+
+bool
+SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects();
+}
+
+bool SIRegisterInfo::requiresVirtualBaseRegisters(
+ const MachineFunction &) const {
+ // There are no special dedicated stack or frame pointers.
+ return true;
+}
+
+bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // This helps catch bugs as verifier errors.
+ return true;
+}
+
+int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const {
+ if (!SIInstrInfo::isMUBUF(*MI))
+ return 0;
+
+ assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr) &&
+ "Should never see frame index on non-address operand");
+
+ int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::offset);
+ return MI->getOperand(OffIdx).getImm();
+}
+
+bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+ return MI->mayLoadOrStore();
+}
- const int *Sets = getRegClassPressureSets(RC);
- assert(Sets);
- for (unsigned i = 0; Sets[i] != -1; ++i) {
- if (Sets[i] == (int)Idx)
- return Limit;
+void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
+ MachineBasicBlock::iterator Ins = MBB->begin();
+ DebugLoc DL; // Defaults to "unknown"
+
+ if (Ins != MBB->end())
+ DL = Ins->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+ if (Offset == 0) {
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+ .addFrameIndex(FrameIdx);
+ return;
+ }
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ .addReg(OffsetReg, RegState::Kill)
+ .addFrameIndex(FrameIdx);
+}
+
+void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
+
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+#ifndef NDEBUG
+ // FIXME: Is it possible to be storing a frame index to itself?
+ bool SeenFI = false;
+ for (const MachineOperand &MO: MI.operands()) {
+ if (MO.isFI()) {
+ if (SeenFI)
+ llvm_unreachable("should not see multiple frame indices");
+
+ SeenFI = true;
}
}
- return 256;
+#endif
+
+ MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ assert(FIOp && FIOp->isFI() && "frame index must be address operand");
+
+ assert(TII->isMUBUF(MI));
+
+ MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+ int64_t NewOffset = OffsetOp->getImm() + Offset;
+ if (isUInt<12>(NewOffset)) {
+ // If we have a legal offset, fold it directly into the instruction.
+ FIOp->ChangeToRegister(BaseReg, false);
+ OffsetOp->setImm(NewOffset);
+ return;
+ }
+
+ // The offset is not legal, so we must insert an add of the offset.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ DebugLoc DL = MI.getDebugLoc();
+
+ assert(Offset != 0 && "Non-zero offset expected");
+
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ // If the instruction already had an immediate offset, only the requested new
+ // offset is added here, because we are leaving the original immediate in
+ // place.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(BaseReg);
+
+ FIOp->ChangeToRegister(NewReg, false);
}
-bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo()->hasStackObjects();
+bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ unsigned BaseReg,
+ int64_t Offset) const {
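+ // i.e. only MUBUF scratch accesses whose offset fits in the 12-bit unsigned
+ // immediate field (0..4095) are considered legal here.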
+ return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind) const {
+ // This is inaccurate. It depends on the instruction and address space. The
+ // only place where we should hit this is for dealing with frame indexes /
+ // private accesses, so this is correct in that case.
+ return &AMDGPU::VGPR_32RegClass;
}
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
@@ -219,32 +409,48 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
- unsigned Value,
+ const MachineOperand *SrcDst,
unsigned ScratchRsrcReg,
unsigned ScratchOffset,
int64_t Offset,
RegScavenger *RS) const {
+ unsigned Value = SrcDst->getReg();
+ bool IsKill = SrcDst->isKill();
MachineBasicBlock *MBB = MI->getParent();
- const MachineFunction *MF = MI->getParent()->getParent();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
- LLVMContext &Ctx = MF->getFunction()->getContext();
+ MachineFunction *MF = MI->getParent()->getParent();
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
DebugLoc DL = MI->getDebugLoc();
- bool IsLoad = TII->get(LoadStoreOp).mayLoad();
+ bool IsStore = MI->mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffset;
+ unsigned OriginalImmOffset = Offset;
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned Size = NumSubRegs * 4;
if (!isUInt<12>(Offset + Size)) {
- SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ SOffset = AMDGPU::NoRegister;
+
+ // We don't have access to the register scavenger if this function is called
+ // during PEI::scavengeFrameVirtualRegs().
+ if (RS)
+ SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+
if (SOffset == AMDGPU::NoRegister) {
+ // There are no free SGPRs, and we are already in the process of spilling
+ // VGPRs. Since we need a VGPR in order to spill SGPRs (this is true on
+ // SI/CI, and on VI it remains true until we implement spilling using scalar
+ // stores), we have no way to free up an SGPR. Our solution here is to
+ // add the offset directly to the ScratchOffset register, and then
+ // subtract the offset after the spill to return ScratchOffset to its
+ // original value.
RanOutOfSGPRs = true;
- SOffset = AMDGPU::SGPR0;
+ SOffset = ScratchOffset;
} else {
Scavenged = true;
}
@@ -254,40 +460,48 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- if (RanOutOfSGPRs)
- Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
unsigned SOffsetRegState = 0;
- if (i + 1 == e && Scavenged)
- SOffsetRegState |= RegState::Kill;
+ unsigned SrcDstRegState = getDefRegState(!IsStore);
+ if (i + 1 == e) {
+ SOffsetRegState |= getKillRegState(Scavenged);
+ // The last implicit use carries the "Kill" flag.
+ SrcDstRegState |= getKillRegState(IsKill);
+ }
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(SubReg, getDefRegState(!IsStore))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
+ .addReg(Value, RegState::Implicit | SrcDstRegState)
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
+ if (RanOutOfSGPRs) {
+ // Subtract the offset we added to the ScratchOffset register.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
+ .addReg(ScratchOffset)
+ .addImm(OriginalImmOffset);
+ }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
@@ -301,24 +515,65 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ bool IsKill = MI->getOperand(0).isKill();
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+ unsigned SubReg = getPhysRegSubReg(SuperReg,
&AMDGPU::SGPR_32RegClass, i);
+
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
- .addReg(SubReg)
- .addImm(Spill.Lane);
-
- // FIXME: Since this spills to another register instead of an actual
- // frame index, we should delete the frame index when all references to
- // it are fixed.
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(IsKill))
+ .addImm(Spill.Lane);
+
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
+ } else {
+ // Spill SGPR to a frame index.
+ // FIXME we should use S_STORE_DWORD here for VI.
+ MachineInstrBuilder Mov
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(SubReg, SubKillState);
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == e)
+ SuperKillState |= getKillRegState(IsKill);
+ Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+
+ unsigned Size = FrameInfo->getObjectSize(Index);
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ }
}
MI->eraseFromParent();
+ MFI->addToSpilledSGPRs(NumSubRegs);
break;
}
@@ -329,6 +584,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
@@ -336,28 +592,37 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
- }
-
- // TODO: only do this when it is needed
- switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
- case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
- // ("S_NOP 3") on SI
- TII->insertWaitStates(MI, 4);
- break;
- case AMDGPUSubtarget::SEA_ISLANDS:
- break;
- default: // VOLCANIC_ISLANDS and later
- // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
- // ("S_NOP 4") on VI and later. This also applies to VALUs which write
- // VCC, but we're unlikely to see VMEM use VCC.
- TII->insertWaitStates(MI, 5);
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ } else {
+ // Restore SGPR from a stack slot.
+ // FIXME: We should use S_LOAD_DWORD here for VI.
+
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ unsigned Size = FrameInfo->getObjectSize(Index);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ BuildMI(*MBB, MI, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
}
MI->eraseFromParent();
@@ -372,11 +637,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE:
buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
MI->eraseFromParent();
+ MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
break;
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
@@ -385,10 +652,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
MI->eraseFromParent();
break;
}
@@ -396,8 +664,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
default: {
int64_t Offset = FrameInfo->getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
- if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
+ if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
@@ -407,10 +675,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
- return getEncodingValue(Reg) & 0xff;
-}
-
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
@@ -427,7 +691,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
&AMDGPU::VReg_512RegClass,
- &AMDGPU::SReg_512RegClass
+ &AMDGPU::SReg_512RegClass,
+ &AMDGPU::SCC_CLASSRegClass,
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -442,6 +707,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
switch (RC->getSize()) {
+ case 0: return false;
+ case 1: return false;
case 4:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
case 8:
@@ -479,6 +746,24 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
}
}
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const {
+ switch (VRC->getSize()) {
+ case 4:
+ return &AMDGPU::SGPR_32RegClass;
+ case 8:
+ return &AMDGPU::SReg_64RegClass;
+ case 16:
+ return &AMDGPU::SReg_128RegClass;
+ case 32:
+ return &AMDGPU::SReg_256RegClass;
+ case 64:
+ return &AMDGPU::SReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
const TargetRegisterClass *RC, unsigned SubIdx) const {
if (SubIdx == AMDGPU::NoSubRegister)
@@ -552,7 +837,21 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
switch(Channel) {
case 0: return AMDGPU::VCC_LO;
case 1: return AMDGPU::VCC_HI;
- default: llvm_unreachable("Invalid SubIdx for VCC");
+ default: llvm_unreachable("Invalid SubIdx for VCC"); break;
+ }
+
+ case AMDGPU::TBA:
+ switch(Channel) {
+ case 0: return AMDGPU::TBA_LO;
+ case 1: return AMDGPU::TBA_HI;
+ default: llvm_unreachable("Invalid SubIdx for TBA"); break;
+ }
+
+ case AMDGPU::TMA:
+ switch(Channel) {
+ case 0: return AMDGPU::TMA_LO;
+ case 1: return AMDGPU::TMA_HI;
+ default: llvm_unreachable("Invalid SubIdx for TMA"); break;
}
case AMDGPU::FLAT_SCR:
@@ -610,7 +909,7 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
(void)ST;
switch (Value) {
case SIRegisterInfo::WORKGROUP_ID_X:
@@ -631,11 +930,17 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_ID:
+ llvm_unreachable("unimplemented");
+ case SIRegisterInfo::FLAT_SCRATCH_INIT:
+ assert(MFI->hasFlatScratchInit());
+ return MFI->FlatScratchInitUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
return MFI->DispatchPtrUserSGPR;
case SIRegisterInfo::QUEUE_PTR:
- llvm_unreachable("not implemented");
+ assert(MFI->hasQueuePtr());
+ return MFI->QueuePtrUserSGPR;
case SIRegisterInfo::WORKITEM_ID_X:
assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
@@ -675,9 +980,9 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
}
}
-unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
+unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,
unsigned WaveCount) const {
- if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
switch (WaveCount) {
case 10: return 80;
case 9: return 80;
@@ -696,3 +1001,14 @@ unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
}
}
}
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ const TargetRegisterClass *RC;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+
+ return hasVGPRs(RC);
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 9410e2049cba..6e97b1b910a9 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -12,23 +12,27 @@
//
//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
namespace llvm {
-struct SIRegisterInfo : public AMDGPURegisterInfo {
+class SISubtarget;
+class MachineRegisterInfo;
+
+struct SIRegisterInfo final : public AMDGPURegisterInfo {
private:
unsigned SGPR32SetID;
unsigned VGPR32SetID;
+ BitVector SGPRPressureSets;
+ BitVector VGPRPressureSets;
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
+ void classifyPressureSet(unsigned PSetID, unsigned Reg,
+ BitVector &PressureSets) const;
public:
SIRegisterInfo();
@@ -47,13 +51,39 @@ public:
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+ bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const override;
+
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg, int FrameIdx,
+ int64_t Offset) const override;
+
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ int64_t Offset) const override;
+
+ const TargetRegisterClass *getPointerRegClass(
+ const MachineFunction &MF, unsigned Kind = 0) const override;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
- unsigned getHWRegIndex(unsigned Reg) const override;
+ unsigned getHWRegIndex(unsigned Reg) const {
+ return getEncodingValue(Reg) & 0xff;
+ }
/// \brief Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
@@ -70,9 +100,12 @@ public:
}
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ const TargetRegisterClass *RC;
if (TargetRegisterInfo::isVirtualRegister(Reg))
- return isSGPRClass(MRI.getRegClass(Reg));
- return getPhysRegClass(Reg);
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+ return isSGPRClass(RC);
}
/// \returns true if this class contains VGPR registers.
@@ -89,6 +122,10 @@ public:
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
+ /// \returns A SGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const;
+
/// \returns The register class that is used for a sub-register of \p RC for
/// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
/// be returned.
@@ -117,10 +154,12 @@ public:
enum PreloadedValue {
// SGPRS:
- PRIVATE_SEGMENT_BUFFER = 0,
+ PRIVATE_SEGMENT_BUFFER = 0,
DISPATCH_PTR = 1,
QUEUE_PTR = 2,
KERNARG_SEGMENT_PTR = 3,
+ DISPATCH_ID = 4,
+ FLAT_SCRATCH_INIT = 5,
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
@@ -143,8 +182,7 @@ public:
/// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
/// concurrent waves.
- unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
- unsigned WaveCount) const;
+ unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
@@ -152,11 +190,14 @@ public:
unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
+ bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
private:
void buildScratchLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp, unsigned Value,
+ unsigned LoadStoreOp, const MachineOperand *SrcDst,
unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset, RegScavenger *RS) const;
+ int64_t Offset,
+ RegScavenger *RS) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index bfaf93709d8c..c427874d467a 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -44,6 +44,40 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
+// Trap handler registers
+def TBA_LO : SIReg<"tba_lo", 108>;
+def TBA_HI : SIReg<"tba_hi", 109>;
+
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+ DwarfRegAlias<TBA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 108;
+}
+
+def TMA_LO : SIReg<"tma_lo", 110>;
+def TMA_HI : SIReg<"tma_hi", 111>;
+
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+ DwarfRegAlias<TMA_LO> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 110;
+}
+
+def TTMP0 : SIReg <"ttmp0", 112>;
+def TTMP1 : SIReg <"ttmp1", 113>;
+def TTMP2 : SIReg <"ttmp2", 114>;
+def TTMP3 : SIReg <"ttmp3", 115>;
+def TTMP4 : SIReg <"ttmp4", 116>;
+def TTMP5 : SIReg <"ttmp5", 117>;
+def TTMP6 : SIReg <"ttmp6", 118>;
+def TTMP7 : SIReg <"ttmp7", 119>;
+def TTMP8 : SIReg <"ttmp8", 120>;
+def TTMP9 : SIReg <"ttmp9", 121>;
+def TTMP10 : SIReg <"ttmp10", 122>;
+def TTMP11 : SIReg <"ttmp11", 123>;
+
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
def _ci : SIReg<n, ci_e>;
def _vi : SIReg<n, vi_e>;
@@ -81,11 +115,18 @@ foreach Index = 0-255 in {
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//
+def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
+ let CopyCost = -1;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "SGPR%u", 0, 103))>;
+ (add (sequence "SGPR%u", 0, 103))> {
+ let AllocationPriority = 1;
+}
// SGPR 64-bit registers
def SGPR_64Regs : RegisterTuples<[sub0, sub1],
@@ -93,7 +134,7 @@ def SGPR_64Regs : RegisterTuples<[sub0, sub1],
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 128-bit registers
-def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
[(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
@@ -130,9 +171,29 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
(add (decimate (shl SGPR_32, 14), 4)),
(add (decimate (shl SGPR_32, 15), 4))]>;
+// Trap handler TMP 32-bit registers
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+ (add (sequence "TTMP%u", 0, 11))> {
+ let isAllocatable = 0;
+}
+
+// Trap handler TMP 64-bit registers
+def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+ [(add (decimate TTMP_32, 2)),
+ (add (decimate (shl TTMP_32, 1), 2))]>;
+
+// Trap handler TMP 128-bit registers
+def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+ [(add (decimate TTMP_32, 4)),
+ (add (decimate (shl TTMP_32, 1), 4)),
+ (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4))]>;
+
// VGPR 32-bit registers
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "VGPR%u", 0, 255))>;
+ (add (sequence "VGPR%u", 0, 255))> {
+ let AllocationPriority = 1;
+}
// VGPR 64-bit registers
def VGPR_64 : RegisterTuples<[sub0, sub1],
@@ -192,36 +253,67 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
+// Subset of SReg_32 without M0 for SMRD instructions and the like.
+// See comments in SIInstructions.td for more info.
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
+ (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+ TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+ let AllocationPriority = 1;
+}
+
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
->;
+ (add SReg_32_XM0, M0)> {
+ let AllocationPriority = 1;
+}
+
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+ let AllocationPriority = 2;
+}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+ let isAllocatable = 0;
+}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
- (add SGPR_64, VCC, EXEC, FLAT_SCR)
->;
+ (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+ let AllocationPriority = 2;
+}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
- // Requires 2 s_mov_b64 to copy
- let CopyCost = 2;
+// Requires 2 s_mov_b64 to copy
+let CopyCost = 2 in {
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+ let AllocationPriority = 4;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+ let AllocationPriority = 4;
}
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+} // End CopyCost = 2
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
+ let AllocationPriority = 5;
}
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
+ let AllocationPriority = 6;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
// Requires 2 v_mov_b32 to copy
let CopyCost = 2;
+ let AllocationPriority = 2;
}
def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
@@ -229,19 +321,23 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
// Requires 3 v_mov_b32 to copy
let CopyCost = 3;
+ let AllocationPriority = 3;
}
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
// Requires 4 v_mov_b32 to copy
let CopyCost = 4;
+ let AllocationPriority = 4;
}
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
let CopyCost = 8;
+ let AllocationPriority = 5;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
let CopyCost = 16;
+ let AllocationPriority = 6;
}
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index cd77e519abb2..ed19217226b8 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -11,6 +11,12 @@
//
//===----------------------------------------------------------------------===//
+def : PredicateProlog<[{
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
def WriteBranch : SchedWrite;
def WriteExport : SchedWrite;
def WriteLDS : SchedWrite;
@@ -39,20 +45,33 @@ def Write64Bit : SchedWrite;
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
-def SIFullSpeedModel : SchedMachineModel;
-def SIQuarterSpeedModel : SchedMachineModel;
+class SISchedMachineModel : SchedMachineModel {
+ let CompleteModel = 0;
+ let IssueWidth = 1;
+ let PostRAScheduler = 1;
+}
-// BufferSize = 0 means the processors are in-order.
-let BufferSize = 0 in {
+def SIFullSpeedModel : SISchedMachineModel;
+def SIQuarterSpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
-def HWBranch : ProcResource<1>;
-def HWExport : ProcResource<7>; // Taken from S_WAITCNT
-def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
-def HWSALU : ProcResource<1>;
-def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
-def HWVALU : ProcResource<1>;
-
+def HWBranch : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWExport : ProcResource<1> {
+ let BufferSize = 7; // Taken from S_WAITCNT
+}
+def HWLGKM : ProcResource<1> {
+ let BufferSize = 31; // Taken from S_WAITCNT
+}
+def HWSALU : ProcResource<1> {
+ let BufferSize = 1;
+}
+def HWVMEM : ProcResource<1> {
+ let BufferSize = 15; // Taken from S_WAITCNT
+}
+def HWVALU : ProcResource<1> {
+ let BufferSize = 1;
}
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
@@ -70,12 +89,12 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
- def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
- def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
- def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
- def : HWWriteRes<WriteSALU, [HWSALU], 1>;
- def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
- def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+ def : HWWriteRes<WriteBranch, [HWBranch], 8>;
+ def : HWWriteRes<WriteExport, [HWExport], 4>;
+ def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 5>;
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 80>;
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
@@ -83,6 +102,12 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
}
+def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
+def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
+def WriteCopy : SchedWriteVariant<[
+ SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
+ SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
+ SchedVar<NoSchedPred, [WriteSALU]>]>;
let SchedModel = SIFullSpeedModel in {
@@ -92,6 +117,8 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
} // End SchedModel = SIFullSpeedModel
let SchedModel = SIQuarterSpeedModel in {
@@ -102,4 +129,6 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
} // End SchedModel = SIQuarterSpeedModel
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4f0913fe62f2..6cba55300a8c 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk,
STATISTIC(NumLiteralConstantsFolded,
"Number of literal constants folded into 32-bit instructions.");
-namespace llvm {
- void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
using namespace llvm;
namespace {
@@ -61,10 +57,8 @@ public:
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
char SIShrinkInstructions::ID = 0;
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
- return false;
-
- return true;
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
/// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
// We have failed to fold src0, so commute the instruction and try again.
- if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
foldImmediates(MI, TII, MRI, false);
}
// Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
- assert(!Orig.isImplicit());
- return MachineOperand::CreateReg(Orig.getReg(),
- Orig.isDef(),
- true,
- Orig.isKill(),
- Orig.isDead(),
- Orig.isUndef(),
- Orig.isEarlyClobber(),
- Orig.getSubReg(),
- Orig.isDebug(),
- Orig.isInternalRead());
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+ const MachineOperand &Orig) {
+
+ for (MachineOperand &Use : MI.implicit_operands()) {
+ if (Use.getReg() == AMDGPU::VCC) {
+ Use.setIsUndef(Orig.isUndef());
+ Use.setIsKill(Orig.isKill());
+ return;
+ }
+ }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
+ if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+ // If this has a literal constant source that is the same as the
+ // reversed bits of an inline immediate, replace with a bitreverse of
+ // that constant. This saves 4 bytes in the common case of materializing
+ // sign bits.
+
+ // Test if we are after regalloc. We only want to do this after any
+ // optimizations happen because this will confuse them.
+ // XXX - not exactly a check for post-regalloc run.
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ int64_t Imm = Src.getImm();
+ if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+ int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
+ if (ReverseImm >= -16 && ReverseImm <= 64) {
+ MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+ Src.setImm(ReverseImm);
+ continue;
+ }
+ }
+ }
+ }
+
+ // Combine adjacent s_nops to use the immediate operand encoding how long
+ // to wait.
+ //
+ // s_nop N
+ // s_nop M
+ // =>
+ // s_nop (N + M)
+ if (MI.getOpcode() == AMDGPU::S_NOP &&
+ Next != MBB.end() &&
+ (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+ MachineInstr &NextMI = *Next;
+ // The instruction encodes the amount to wait with an offset of 1,
+ // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+ // after adding.
+ uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+ uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+ // Make sure we don't overflow the bounds.
+ if (Nop0 + Nop1 <= 8) {
+ NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+ MI.eraseFromParent();
+ }
+
+ continue;
+ }
+
+ // FIXME: We also need to consider movs of constant operands since
+ // immediate operands are not folded if they have more than one use, and
+ // the operand folding pass is unaware if the immediate will be free since
+ // it won't know if the src == dest constraint will end up being
+ // satisfied.
+ if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+ MI.getOpcode() == AMDGPU::S_MUL_I32) {
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
+
+ // FIXME: This could work better if hints worked with subregisters. If
+ // we have a vector add of a constant, we usually don't get the correct
+ // allocation due to the subregister usage.
+ if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+ Src0.isReg()) {
+ MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+ continue;
+ }
+
+ if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+ if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+ AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+ MI.setDesc(TII->get(Opc));
+ MI.tieOperands(0, 1);
+ }
+ }
+ }
+
// Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm()) {
- if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
- MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- }
+ if (Src.isImm() && isKImmOperand(TII, Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
continue;
}
@@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (!canShrink(MI, TII, TRI, MRI)) {
// Try commuting the instruction and see if that enables us to shrink
// it.
- if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+ if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
!canShrink(MI, TII, TRI, MRI))
continue;
}
@@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
- // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+ // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
if (Op32DstIdx != -1) {
// dst
Inst32.addOperand(MI.getOperand(0));
@@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Inst32.addOperand(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
- // replaced with an implicit read of vcc.
- assert(Src2->getReg() == AMDGPU::VCC &&
- "Unexpected missing register operand");
- Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+ // replaced with an implicit read of vcc. This was already added
+ // during the initial BuildMI, so find it to preserve the flags.
+ copyFlagsToImplicitVCC(*Inst32, *Src2);
}
}
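To make the two peepholes above concrete (the V_MOV_B32 bit-reverse trick and the s_nop merge), here is a standalone sketch of the arithmetic they rely on; it only assumes llvm::reverseBits from llvm/Support/MathExtras.h, which the pass itself uses:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      // v_mov_b32 0x80000000 needs a 32-bit literal, but the bit-reverse of
      // that constant is 1, an inline immediate, so the pass emits
      // v_bfrev_b32 1 instead and saves 4 bytes.
      int32_t Imm = INT32_MIN; // 0x80000000
      int32_t ReverseImm = llvm::reverseBits<int32_t>(Imm);
      assert(ReverseImm == 1 && ReverseImm >= -16 && ReverseImm <= 64);

      // s_nop N waits N + 1 cycles, so "s_nop 3; s_nop 2" waits 4 + 3 = 7
      // cycles and can be merged into the single instruction "s_nop 6",
      // provided the combined wait still fits the 8-cycle limit checked above.
      unsigned Nop0 = 3 + 1, Nop1 = 2 + 1;
      assert(Nop0 + Nop1 <= 8);
      unsigned MergedImm = Nop0 + Nop1 - 1; // == 6
      (void)ReverseImm;
      (void)MergedImm;
      return 0;
    }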
diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp
index d36c5d29b127..facc0c7df1dc 100644
--- a/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -62,7 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
}
bool SITypeRewriter::runOnFunction(Function &F) {
- if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(F.getCallingConv()))
return false;
visit(F);
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 000000000000..c1a237ea5f51
--- /dev/null
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,509 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run on the
+/// scheduled machine IR but before register coalescing, so that machine SSA is
+/// available for analysis. It ensures that WQM is enabled when necessary, but
+/// disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+/// S_MOV_B64 LiveMask, EXEC
+/// S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// Basic blocks are always exited in WQM as long as some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+/// (1) at the top level (outside of control flow statements, and as long as
+/// kill hasn't been used), one SGPR can be saved by recovering WQM from
+/// the LiveMask (this is implemented for the entry block).
+///
+/// (2) when entire regions (e.g. if-else blocks or entire loops) only
+/// consist of exact and don't-care instructions, the switch only has to
+/// be done at the entry and exit points rather than potentially in each
+/// block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum {
+ StateWQM = 0x1,
+ StateExact = 0x2,
+};
+
+struct InstrInfo {
+ char Needs = 0;
+ char OutNeeds = 0;
+};
+
+struct BlockInfo {
+ char Needs = 0;
+ char InNeeds = 0;
+ char OutNeeds = 0;
+};
+
+struct WorkItem {
+ MachineBasicBlock *MBB = nullptr;
+ MachineInstr *MI = nullptr;
+
+ WorkItem() {}
+ WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
+ WorkItem(MachineInstr *MI) : MI(MI) {}
+};
+
+class SIWholeQuadMode : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ DenseMap<const MachineInstr *, InstrInfo> Instructions;
+ DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+ SmallVector<const MachineInstr *, 2> ExecExports;
+ SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+ char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+ void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
+ void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
+ char analyzeFunction(MachineFunction &MF);
+
+ void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg);
+ void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedWQM);
+ void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+ void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
+public:
+ static char ID;
+
+ SIWholeQuadMode() :
+ MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Whole Quad Mode";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
+ "SI Whole Quad Mode", false, false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+ return new SIWholeQuadMode;
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements.
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
+ std::vector<WorkItem> &Worklist) {
+ char GlobalFlags = 0;
+ bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+
+ for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+ MachineBasicBlock &MBB = *BI;
+
+ for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+ MachineInstr &MI = *II;
+ unsigned Opcode = MI.getOpcode();
+ char Flags = 0;
+
+ if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ Flags = StateWQM;
+ } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+ Flags = StateExact;
+ } else {
+ // Handle export instructions with the exec mask valid flag set
+ if (Opcode == AMDGPU::EXP) {
+ if (MI.getOperand(4).getImm() != 0)
+ ExecExports.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+ LiveMaskQueries.push_back(&MI);
+ } else if (WQMOutputs) {
+ // The function is in machine SSA form, which means that physical
+ // VGPRs correspond to shader inputs and outputs. Inputs are
+ // only used, outputs are only defined.
+ for (const MachineOperand &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVirtualRegister(Reg) &&
+ TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ Flags = StateWQM;
+ break;
+ }
+ }
+ }
+
+ if (!Flags)
+ continue;
+ }
+
+ Instructions[&MI].Needs = Flags;
+ Worklist.push_back(&MI);
+ GlobalFlags |= Flags;
+ }
+
+ if (WQMOutputs && MBB.succ_empty()) {
+ // This is a prolog shader. Make sure we go back to exact mode at the end.
+ Blocks[&MBB].OutNeeds = StateExact;
+ Worklist.push_back(&MBB);
+ GlobalFlags |= StateExact;
+ }
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+ std::vector<WorkItem>& Worklist) {
+ MachineBasicBlock *MBB = MI.getParent();
+ InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
+ BlockInfo &BI = Blocks[MBB];
+
+ // Control flow-type instructions that are followed by WQM computations
+ // must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+ Instructions[&MI].Needs = StateWQM;
+ II.Needs = StateWQM;
+ }
+
+ // Propagate to block level
+ BI.Needs |= II.Needs;
+ if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
+ BI.InNeeds |= II.Needs;
+ Worklist.push_back(MBB);
+ }
+
+ // Propagate backwards within block
+ if (MachineInstr *PrevMI = MI.getPrevNode()) {
+ char InNeeds = II.Needs | II.OutNeeds;
+ if (!PrevMI->isPHI()) {
+ InstrInfo &PrevII = Instructions[PrevMI];
+ if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
+ PrevII.OutNeeds |= InNeeds;
+ Worklist.push_back(PrevMI);
+ }
+ }
+ }
+
+ // Propagate WQM flag to instruction inputs
+ assert(II.Needs != (StateWQM | StateExact));
+ if (II.Needs != StateWQM)
+ return;
+
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
+ continue;
+
+ // At this point, physical registers appear as inputs or outputs
+ // and following them makes no sense (and would in fact be incorrect
+ // when the same VGPR is used as both an output and an input that leads
+ // to a NeedsWQM instruction).
+ //
+ // Note: VCC appears e.g. in 64-bit addition with carry; theoretically we
+ // would have to trace it, but in practice this only happens for 64-bit
+ // computations like pointers, where both dwords are followed already anyway.
+ if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+ continue;
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
+ InstrInfo &DefII = Instructions[&DefMI];
+
+ // Obviously skip if DefMI is already flagged as NeedWQM.
+ //
+ // The instruction might also be flagged as NeedExact. This happens when
+ // the result of an atomic is used in a WQM computation. In this case,
+ // the atomic must not run for helper pixels and the WQM result is
+ // undefined.
+ if (DefII.Needs != 0)
+ continue;
+
+ DefII.Needs = StateWQM;
+ Worklist.push_back(&DefMI);
+ }
+ }
+}
+
+void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
+ std::vector<WorkItem>& Worklist) {
+ BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
+
+ // Propagate through instructions
+ if (!MBB.empty()) {
+ MachineInstr *LastMI = &*MBB.rbegin();
+ InstrInfo &LastII = Instructions[LastMI];
+ if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
+ LastII.OutNeeds |= BI.OutNeeds;
+ Worklist.push_back(LastMI);
+ }
+ }
+
+ // Predecessor blocks must provide for our WQM/Exact needs.
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ BlockInfo &PredBI = Blocks[Pred];
+ if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
+ continue;
+
+ PredBI.OutNeeds |= BI.InNeeds;
+ PredBI.InNeeds |= BI.InNeeds;
+ Worklist.push_back(Pred);
+ }
+
+ // All successors must be prepared to accept the same set of WQM/Exact data.
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ BlockInfo &SuccBI = Blocks[Succ];
+ if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
+ continue;
+
+ SuccBI.InNeeds |= BI.OutNeeds;
+ Worklist.push_back(Succ);
+ }
+}
+
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
+ std::vector<WorkItem> Worklist;
+ char GlobalFlags = scanInstructions(MF, Worklist);
+
+ while (!Worklist.empty()) {
+ WorkItem WI = Worklist.back();
+ Worklist.pop_back();
+
+ if (WI.MI)
+ propagateInstruction(*WI.MI, Worklist);
+ else
+ propagateBlock(*WI.MBB, Worklist);
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg) {
+ if (SaveWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ SaveWQM)
+ .addReg(LiveMaskReg);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(LiveMaskReg);
+ }
+}
+
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedWQM) {
+ if (SavedWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ .addReg(SavedWQM);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ }
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+ bool isEntry) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+
+ if (!(BI.InNeeds & StateWQM))
+ return;
+
+ // This is a non-entry block that is WQM throughout, so no need to do
+ // anything.
+ if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ return;
+
+ unsigned SavedWQMReg = 0;
+ bool WQMFromExec = isEntry;
+ char State = isEntry ? StateExact : StateWQM;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ MachineInstr &MI = *II;
+ ++II;
+
+ // Skip instructions that are not affected by EXEC
+ if (TII->isScalarUnit(MI) && !MI.isTerminator())
+ continue;
+
+ // Generic instructions such as COPY will either disappear by register
+ // coalescing or be lowered to SALU or VALU instructions.
+ if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
+ if (MI.getNumExplicitOperands() >= 1) {
+ const MachineOperand &Op = MI.getOperand(0);
+ if (Op.isReg()) {
+ if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+ // SGPR instructions are not affected by EXEC
+ continue;
+ }
+ }
+ }
+ }
+
+ char Needs = 0;
+ char OutNeeds = 0;
+ auto InstrInfoIt = Instructions.find(&MI);
+ if (InstrInfoIt != Instructions.end()) {
+ Needs = InstrInfoIt->second.Needs;
+ OutNeeds = InstrInfoIt->second.OutNeeds;
+
+ // Make sure to switch to Exact mode before the end of the block when
+ // Exact and only Exact is needed further downstream.
+ if (OutNeeds == StateExact && MI.isTerminator()) {
+ assert(Needs == 0);
+ Needs = StateExact;
+ }
+ }
+
+ // State switching
+ if (Needs && State != Needs) {
+ if (Needs == StateExact) {
+ assert(!SavedWQMReg);
+
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+ } else {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, &MI, SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+
+ State = Needs;
+ }
+ }
+
+ if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, MBB.end(), SavedWQMReg);
+ } else if (BI.OutNeeds == StateExact && State != StateExact) {
+ toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ }
+}
+
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+ for (MachineInstr *MI : LiveMaskQueries) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ unsigned Dest = MI->getOperand(0).getReg();
+ BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+ .addReg(LiveMaskReg);
+ MI->eraseFromParent();
+ }
+}
+
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
+ return false;
+
+ Instructions.clear();
+ Blocks.clear();
+ ExecExports.clear();
+ LiveMaskQueries.clear();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ char GlobalFlags = analyzeFunction(MF);
+ if (!(GlobalFlags & StateWQM)) {
+ lowerLiveMaskQueries(AMDGPU::EXEC);
+ return !LiveMaskQueries.empty();
+ }
+
+ // Store a copy of the original live mask when required
+ unsigned LiveMaskReg = 0;
+ {
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+ if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+ LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+ }
+
+ if (GlobalFlags == StateWQM) {
+ // For a shader that needs only WQM, we can just set it once.
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+
+ lowerLiveMaskQueries(LiveMaskReg);
+ // EntryMI may become invalid here
+ return true;
+ }
+ }
+
+ lowerLiveMaskQueries(LiveMaskReg);
+
+ // Handle the general case
+ for (auto BII : Blocks)
+ processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+
+ return true;
+}
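The worklist propagation described in the file header can be hard to picture from the code alone. Below is a condensed, self-contained model (plain C++, no LLVM types) of just the predecessor rule from propagateBlock(); the real pass also propagates through successors and through individual instructions:

    #include <cstdio>
    #include <vector>

    enum { StateWQM = 0x1, StateExact = 0x2 };

    struct BlockInfo {
      char Needs = 0, InNeeds = 0, OutNeeds = 0;
      std::vector<int> Preds;
    };

    // Push each block's needs into its predecessors until a fixed point is
    // reached: predecessors must provide for our WQM/Exact needs.
    static void propagate(std::vector<BlockInfo> &Blocks) {
      std::vector<int> Worklist;
      for (int I = 0, E = (int)Blocks.size(); I != E; ++I) {
        Blocks[I].InNeeds |= Blocks[I].Needs; // seeded by scanInstructions()
        Worklist.push_back(I);
      }
      while (!Worklist.empty()) {
        int Cur = Worklist.back();
        Worklist.pop_back();
        for (int Pred : Blocks[Cur].Preds) {
          char New = (char)(Blocks[Pred].OutNeeds | Blocks[Cur].InNeeds);
          if (New == Blocks[Pred].OutNeeds)
            continue;
          Blocks[Pred].OutNeeds = New;
          Blocks[Pred].InNeeds |= Blocks[Cur].InNeeds;
          Worklist.push_back(Pred);
        }
      }
    }

    int main() {
      // entry -> sample (needs WQM) -> store (needs Exact)
      std::vector<BlockInfo> Blocks(3);
      Blocks[1].Preds = {0};
      Blocks[2].Preds = {1};
      Blocks[1].Needs = StateWQM;
      Blocks[2].Needs = StateExact;
      propagate(Blocks);
      // The entry block must already be in WQM on exit so the sampling block
      // computes correct derivatives.
      std::printf("entry OutNeeds WQM: %d\n",
                  (Blocks[0].OutNeeds & StateWQM) != 0);
      return 0;
    }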
diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile
deleted file mode 100644
index 1b232871bd62..000000000000
--- a/lib/Target/AMDGPU/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
new file mode 100644
index 000000000000..b6868de6a74e
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -0,0 +1,69 @@
+//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUAsmUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg {
+
+// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ nullptr,
+ "MSG_INTERRUPT",
+ "MSG_GS",
+ "MSG_GS_DONE",
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ "MSG_SYSMSG"
+};
+
+// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
+const char* const OpSysSymbolic[] = {
+ nullptr,
+ "SYSMSG_OP_ECC_ERR_INTERRUPT",
+ "SYSMSG_OP_REG_RD",
+ "SYSMSG_OP_HOST_TRAP_ACK",
+ "SYSMSG_OP_TTRACE_PC"
+};
+
+const char* const OpGsSymbolic[] = {
+ "GS_OP_NOP",
+ "GS_OP_CUT",
+ "GS_OP_EMIT",
+ "GS_OP_EMIT_CUT"
+};
+
+} // namespace SendMsg
+
+namespace Hwreg {
+
+// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ nullptr,
+ "HW_REG_MODE",
+ "HW_REG_STATUS",
+ "HW_REG_TRAPSTS",
+ "HW_REG_HW_ID",
+ "HW_REG_GPR_ALLOC",
+ "HW_REG_LDS_ALLOC",
+ "HW_REG_IB_STS"
+};
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
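A hypothetical lookup helper (not part of this patch) showing how an InstPrinter or AsmParser is expected to consume these tables; the 16-entry bound matches the IdSymbolic array above, and reserved IDs stay numeric because their entries are null:

    #include "AMDGPUAsmUtils.h"
    #include "llvm/ADT/StringRef.h"

    // Map a sendmsg message ID to its symbolic name; returns an empty
    // StringRef for reserved IDs (nullptr entries) or out-of-range values.
    static llvm::StringRef getSendMsgIdName(unsigned Id) {
      using namespace llvm::AMDGPU::SendMsg;
      if (Id > 15 || !IdSymbolic[Id])
        return llvm::StringRef();
      return llvm::StringRef(IdSymbolic[Id]);
    }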
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
new file mode 100644
index 000000000000..b2dc2c0e364c
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -0,0 +1,31 @@
+//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
+
+extern const char* const IdSymbolic[];
+extern const char* const OpSysSymbolic[];
+extern const char* const OpGsSymbolic[];
+
+} // namespace SendMsg
+
+namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1f5deaef9d3b..c6f9142c0aa5 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -109,29 +109,45 @@ bool isReadOnlySegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}
-static unsigned getIntegerAttribute(const Function &F, const char *Name,
- unsigned Default) {
+int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
Attribute A = F.getFnAttribute(Name);
- unsigned Result = Default;
+ int Result = Default;
if (A.isStringAttribute()) {
StringRef Str = A.getValueAsString();
if (Str.getAsInteger(0, Result)) {
LLVMContext &Ctx = F.getContext();
- Ctx.emitError("can't parse shader type");
+ Ctx.emitError("can't parse integer attribute " + Name);
}
}
+
return Result;
}
-unsigned getShaderType(const Function &F) {
- return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
+unsigned getMaximumWorkGroupSize(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
}
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool isShader(CallingConv::ID cc) {
+ switch(cc) {
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isCompute(CallingConv::ID cc) {
+ return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
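A usage sketch for the new attribute helper (assumed caller code, not part of this diff): the work-group bound lives as a string function attribute in the IR, is parsed with base auto-detection, and falls back to the default of 256 when absent:

    #include "Utils/AMDGPUBaseInfo.h"
    #include "llvm/IR/Function.h"

    unsigned exampleWorkGroupSize(llvm::Function &F) {
      unsigned Default = llvm::AMDGPU::getMaximumWorkGroupSize(F); // 256 if unset
      (void)Default;
      // Frontends communicate the bound as a string attribute on the function.
      F.addFnAttr("amdgpu-max-work-group-size", "64");
      return llvm::AMDGPU::getMaximumWorkGroupSize(F); // now 64
    }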
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 57cbe1b58f98..995a9041fb36 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#include "AMDKernelCodeT.h"
+#include "llvm/IR/CallingConv.h"
namespace llvm {
@@ -44,9 +45,13 @@ bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
-unsigned getShaderType(const Function &F);
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
+unsigned getMaximumWorkGroupSize(const Function &F);
unsigned getInitialPSInputAddr(const Function &F);
+bool isShader(CallingConv::ID cc);
+bool isCompute(CallingConv::ID cc);
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
new file mode 100644
index 000000000000..3a5ff60601d0
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -0,0 +1,165 @@
+//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - specifies tables for amd_kernel_code_t structure parsing/printing
+//
+//===----------------------------------------------------------------------===//
+
+#define QNAME(name) amd_kernel_code_t::name
+#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
+
+#define FIELD2(sname, name) \
+ RECORD(sname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+
+#define FIELD(name) FIELD2(name, name)
+
+
+#define PRINTCODEPROP(name) \
+ printBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define PARSECODEPROP(name) \
+ parseBitField<FLD_T(code_properties),\
+ AMD_CODE_PROPERTY_##name##_SHIFT,\
+ AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define CODEPROP(name, shift) \
+ RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+
+// have to define these lambdas because of Set/GetMacro
+#define PRINTCOMP(GetMacro, Shift) \
+[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
+ printName(OS, Name) << \
+ (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
+}
+#define PARSECOMP(SetMacro, Shift) \
+[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
+ int64_t Value = 0; \
+ if (!expectAbsExpression(MCParser, Value, Err)) \
+ return false; \
+ C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
+ return true; \
+}
+
+#define COMPPGM(name, GetMacro, SetMacro, Shift) \
+ RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+
+#define COMPPGM1(name, AccMacro) \
+ COMPPGM(compute_pgm_rsrc1_##name, \
+ G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+
+#define COMPPGM2(name, AccMacro) \
+ COMPPGM(compute_pgm_rsrc2_##name, \
+ G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+
+///////////////////////////////////////////////////////////////////////////////
+// Beginning of the table
+// Define RECORD(name, print, parse) in your code and include this file
+// to get the field definitions.
+
+FIELD2(kernel_code_version_major, amd_kernel_code_version_major),
+FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor),
+FIELD2(machine_kind, amd_machine_kind),
+FIELD2(machine_version_major, amd_machine_version_major),
+FIELD2(machine_version_minor, amd_machine_version_minor),
+FIELD2(machine_version_stepping, amd_machine_version_stepping),
+FIELD(kernel_code_entry_byte_offset),
+FIELD(kernel_code_prefetch_byte_size),
+FIELD(max_scratch_backing_memory_byte_size),
+FIELD(compute_pgm_resource_registers),
+FIELD(workitem_private_segment_byte_size),
+FIELD(workgroup_group_segment_byte_size),
+FIELD(gds_segment_byte_size),
+FIELD(kernarg_segment_byte_size),
+FIELD(workgroup_fbarrier_count),
+FIELD(wavefront_sgpr_count),
+FIELD(workitem_vgpr_count),
+FIELD(reserved_vgpr_first),
+FIELD(reserved_vgpr_count),
+FIELD(reserved_sgpr_first),
+FIELD(reserved_sgpr_count),
+FIELD(debug_wavefront_private_segment_offset_sgpr),
+FIELD(debug_private_segment_buffer_sgpr),
+FIELD(kernarg_segment_alignment),
+FIELD(group_segment_alignment),
+FIELD(private_segment_alignment),
+FIELD(wavefront_size),
+FIELD(call_convention),
+FIELD(runtime_loader_kernel_symbol),
+
+COMPPGM1(vgprs, VGPRS),
+COMPPGM1(sgprs, SGPRS),
+COMPPGM1(priority, PRIORITY),
+COMPPGM1(float_mode, FLOAT_MODE),
+COMPPGM1(priv, PRIV),
+COMPPGM1(dx10_clamp, DX10_CLAMP),
+COMPPGM1(debug_mode, DEBUG_MODE),
+COMPPGM1(ieee_mode, IEEE_MODE),
+COMPPGM2(scratch_en, SCRATCH_EN),
+COMPPGM2(user_sgpr, USER_SGPR),
+COMPPGM2(tgid_x_en, TGID_X_EN),
+COMPPGM2(tgid_y_en, TGID_Y_EN),
+COMPPGM2(tgid_z_en, TGID_Z_EN),
+COMPPGM2(tg_size_en, TG_SIZE_EN),
+COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT),
+COMPPGM2(excp_en_msb, EXCP_EN_MSB),
+COMPPGM2(lds_size, LDS_SIZE),
+COMPPGM2(excp_en, EXCP_EN),
+
+CODEPROP(enable_sgpr_private_segment_buffer,
+ ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr,
+ ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr,
+ ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr,
+ ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id,
+ ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init,
+ ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size,
+ ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z,
+ ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds,
+ ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size,
+ PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64,
+ IS_PTR64),
+CODEPROP(is_dynamic_callstack,
+ IS_DYNAMIC_CALLSTACK),
+CODEPROP(is_debug_enabled,
+ IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled,
+ IS_XNACK_SUPPORTED)
+
+// end of the table
+///////////////////////////////////////////////////////////////////////////////
+
+#undef QNAME
+#undef FLD_T
+#undef FIELD2
+#undef FIELD
+#undef PRINTCODEPROP
+#undef PARSECODEPROP
+#undef CODEPROP
+#undef PRINTCOMP
+#undef PARSECOMP
+#undef COMPPGM
+#undef COMPPGM1
+#undef COMPPGM2
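A minimal sketch of the X-macro usage the comment above describes; AMDKernelCodeTUtils.cpp (next in this patch) builds its name, printer, and parser tables in exactly this way:

    // Define RECORD to pick the piece of each entry you need, then include the
    // table; here we keep only the stringified field names.
    static const char *const FieldNames[] = {
    #define RECORD(name, print, parse) #name
    #include "AMDKernelCodeTInfo.h"
    #undef RECORD
    };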
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
new file mode 100644
index 000000000000..f64973afa44f
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -0,0 +1,166 @@
+//===------------------- AMDKernelCodeTUtils.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - utility functions to parse/print amd_kernel_code_t structure
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeTUtils.h"
+#include "SIDefines.h"
+#include <llvm/MC/MCParser/MCAsmLexer.h>
+#include <llvm/MC/MCParser/MCAsmParser.h>
+#include <llvm/Support/raw_ostream.h>
+
+using namespace llvm;
+
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
+ static StringRef const Table[] = {
+ "", // not found placeholder
+#define RECORD(name, print, parse) #name
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(const ArrayRef<StringRef> &a) {
+ StringMap<int> map;
+ for (auto Name : a)
+ map.insert(std::make_pair(Name, map.size()));
+ return map;
+}
+
+static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
+ static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames());
+ return map.lookup(name) - 1; // returns -1 if not found
+}
+
+static StringRef get_amd_kernel_code_t_FieldName(int index) {
+ return get_amd_kernel_code_t_FldNames()[index + 1];
+}
+
+
+// Field printing
+
+static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
+ return OS << Name << " = ";
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static void printField(StringRef Name, const amd_kernel_code_t &C,
+ raw_ostream &OS) {
+ printName(OS, Name) << (int)(C.*ptr);
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const amd_kernel_code_t &c,
+ raw_ostream &OS) {
+ const auto Mask = (static_cast<T>(1) << width) - 1;
+ printName(OS, Name) << (int)((c.*ptr >> shift) & Mask);
+}
+
+typedef void(*PrintFx)(StringRef,
+ const amd_kernel_code_t &,
+ raw_ostream &);
+
+static ArrayRef<PrintFx> getPrinterTable() {
+ static const PrintFx Table[] = {
+#define RECORD(name, print, parse) print
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
+ int FldIndex,
+ raw_ostream &OS) {
+ auto Printer = getPrinterTable()[FldIndex];
+ if (Printer)
+ Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
+}
+
+void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
+ raw_ostream &OS,
+ const char *tab) {
+ const int Size = getPrinterTable().size();
+ for (int i = 0; i < Size; ++i) {
+ OS << tab;
+ printAmdKernelCodeField(*C, i, OS);
+ OS << '\n';
+ }
+}
+
+
+// Field parsing
+
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
+
+ if (MCParser.getLexer().isNot(AsmToken::Equal)) {
+ Err << "expected '='";
+ return false;
+ }
+ MCParser.getLexer().Lex();
+
+ if (MCParser.parseAbsoluteExpression(Value)) {
+ Err << "integer absolute expression expected";
+ return false;
+ }
+ return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+ raw_ostream &Err) {
+ int64_t Value = 0;
+ if (!expectAbsExpression(MCParser, Value, Err))
+ return false;
+ C.*ptr = (T)Value;
+ return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+ raw_ostream &Err) {
+ int64_t Value = 0;
+ if (!expectAbsExpression(MCParser, Value, Err))
+ return false;
+ const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
+ C.*ptr &= (T)~Mask;
+ C.*ptr |= (T)((Value << shift) & Mask);
+ return true;
+}
+
+typedef bool(*ParseFx)(amd_kernel_code_t &,
+ MCAsmParser &MCParser,
+ raw_ostream &Err);
+
+static ArrayRef<ParseFx> getParserTable() {
+ static const ParseFx Table[] = {
+#define RECORD(name, print, parse) parse
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return makeArrayRef(Table);
+}
+
+bool llvm::parseAmdKernelCodeField(StringRef ID,
+ MCAsmParser &MCParser,
+ amd_kernel_code_t &C,
+ raw_ostream &Err) {
+ const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
+ if (Idx < 0) {
+ Err << "unexpected amd_kernel_code_t field name " << ID;
+ return false;
+ }
+ auto Parser = getParserTable()[Idx];
+ return Parser ? Parser(C, MCParser, Err) : false;
+}
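A short usage sketch (assumed caller; the real consumers are the AMDGPU assembler and target streamer, which are outside this part of the diff):

    #include "AMDKernelCodeTUtils.h"
    #include "llvm/Support/raw_ostream.h"

    // Print every known amd_kernel_code_t field of a zero-initialized header,
    // one "name = value" line per field, indented by two spaces.
    void dumpDefaultKernelCodeHeader() {
      amd_kernel_code_t Header = {};
      llvm::dumpAmdKernelCode(&Header, llvm::errs(), "  ");
    }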
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
new file mode 100644
index 000000000000..d9edca7a82ac
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -0,0 +1,39 @@
+//===- AMDKernelCodeTUtils.h - helpers for amd_kernel_code_t ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeTUtils.h
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+
+#include "AMDKernelCodeT.h"
+
+namespace llvm {
+
+class MCAsmLexer;
+class MCAsmParser;
+class raw_ostream;
+class StringRef;
+
+void printAmdKernelCodeField(const amd_kernel_code_t &C,
+ int FldIndex,
+ raw_ostream &OS);
+
+void dumpAmdKernelCode(const amd_kernel_code_t *C,
+ raw_ostream &OS,
+ const char *tab);
+
+bool parseAmdKernelCodeField(StringRef ID,
+ MCAsmParser &Parser,
+ amd_kernel_code_t &C,
+ raw_ostream &Err);
+
+}
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 2c07aeab7dd3..01b80ebe8d3d 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,3 +1,5 @@
add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
+ AMDKernelCodeTUtils.cpp
+ AMDGPUAsmUtils.cpp
)
diff --git a/lib/Target/AMDGPU/Utils/Makefile b/lib/Target/AMDGPU/Utils/Makefile
deleted file mode 100644
index 1019e726d50e..000000000000
--- a/lib/Target/AMDGPU/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AMDGPU/Utils/Makefile --------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAMDGPUUtils
-
-# Hack: we need to include 'main' AMDGPU target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td
index d8738f992630..912ed5329bfe 100644
--- a/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/lib/Target/AMDGPU/VIInstrFormats.td
@@ -91,21 +91,28 @@ class MTBUFe_vi <bits<4> op> : Enc64 {
class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
bits<7> sbase;
- bits<7> sdata;
+ bits<7> sdst;
bits<1> glc;
- bits<20> offset;
let Inst{5-0} = sbase{6-1};
- let Inst{12-6} = sdata;
+ let Inst{12-6} = sdst;
let Inst{16} = glc;
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
+}
+
+class SMEM_IMMe_vi <bits<8> op> : SMEMe_vi<op, 1> {
+ bits<20> offset;
let Inst{51-32} = offset;
}
-class VOP3e_vi <bits<10> op> : Enc64 {
- bits<8> vdst;
+class SMEM_SOFFe_vi <bits<8> op> : SMEMe_vi<op, 0> {
+ bits<20> soff;
+ let Inst{51-32} = soff;
+}
+
+class VOP3a_vi <bits<10> op> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -115,7 +122,6 @@ class VOP3e_vi <bits<10> op> : Enc64 {
bits<1> clamp;
bits<2> omod;
- let Inst{7-0} = vdst;
let Inst{8} = src0_modifiers{1};
let Inst{9} = src1_modifiers{1};
let Inst{10} = src2_modifiers{1};
@@ -131,6 +137,20 @@ class VOP3e_vi <bits<10> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP3e_vi <bits<10> op> : VOP3a_vi <op> {
+ bits<8> vdst;
+
+ let Inst{7-0} = vdst;
+}
+
+// Encoding used for VOPC instructions encoded as VOP3
+// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+class VOP3ce_vi <bits<10> op> : VOP3a_vi <op> {
+ bits<8> sdst;
+
+ let Inst{7-0} = sdst;
+}
+
class VOP3be_vi <bits<10> op> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -157,6 +177,117 @@ class VOP3be_vi <bits<10> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
+class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", "");
+}
+
+class VOP_DPPe : Enc64 {
+ bits<2> src0_modifiers;
+ bits<8> src0;
+ bits<2> src1_modifiers;
+ bits<9> dpp_ctrl;
+ bits<1> bound_ctrl;
+ bits<4> bank_mask;
+ bits<4> row_mask;
+
+ let Inst{39-32} = src0;
+ let Inst{48-40} = dpp_ctrl;
+ let Inst{51} = bound_ctrl;
+ let Inst{52} = src0_modifiers{0}; // src0_neg
+ let Inst{53} = src0_modifiers{1}; // src0_abs
+ let Inst{54} = src1_modifiers{0}; // src1_neg
+ let Inst{55} = src1_modifiers{1}; // src1_abs
+ let Inst{59-56} = bank_mask;
+ let Inst{63-60} = row_mask;
+}
+
+class VOP1_DPPe <bits<8> op> : VOP_DPPe {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xfa; // dpp
+ let Inst{16-9} = op;
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; //encoding
+}
+
+class VOP2_DPPe <bits<6> op> : VOP_DPPe {
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xfa; //dpp
+ let Inst{16-9} = src1;
+ let Inst{24-17} = vdst;
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; //encoding
+}
+
+class VOP_SDWA <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+ let SDWA = 1;
+ let Size = 8;
+}
+
+class VOP_SDWAe : Enc64 {
+ bits<8> src0;
+ bits<3> src0_sel;
+ bits<2> src0_fmodifiers; // {abs,neg}
+ bits<1> src0_imodifiers; // sext
+ bits<3> src1_sel;
+ bits<2> src1_fmodifiers;
+ bits<1> src1_imodifiers;
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+
+ let Inst{39-32} = src0;
+ let Inst{42-40} = dst_sel;
+ let Inst{44-43} = dst_unused;
+ let Inst{45} = clamp;
+ let Inst{50-48} = src0_sel;
+ let Inst{53-52} = src0_fmodifiers;
+ let Inst{51} = src0_imodifiers;
+ let Inst{58-56} = src1_sel;
+ let Inst{61-60} = src1_fmodifiers;
+ let Inst{59} = src1_imodifiers;
+}
+
+class VOP1_SDWAe <bits<8> op> : VOP_SDWAe {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = op;
+ let Inst{24-17} = vdst;
+ let Inst{31-25} = 0x3f; // encoding
+}
+
+class VOP2_SDWAe <bits<6> op> : VOP_SDWAe {
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = src1;
+ let Inst{24-17} = vdst;
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+}
+
+class VOPC_SDWAe <bits<8> op> : VOP_SDWAe {
+ bits<8> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = src1;
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+
+ // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+ let Inst{42-40} = 0x6;
+ let Inst{44-43} = 0x2;
+}
+
class EXPe_vi : EXPe {
let Inst{31-26} = 0x31; //encoding
}
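
The VOP_DPPe class above pins the DPP control fields to fixed positions in the upper half of the 64-bit word (src0 in bits 39-32, dpp_ctrl in 48-40, bound_ctrl in 51, bank_mask in 59-56, row_mask in 63-60). As a rough illustration of that layout only, a small C++ sketch that unpacks those fields from an already-fetched encoding word; the struct and helper names are invented for the example and are not part of the patch (the modifier bits 52-55 are omitted for brevity):

#include <cstdint>

struct DPPFields {
  uint8_t  Src0;      // Inst{39-32}
  uint16_t DppCtrl;   // Inst{48-40}, 9 bits
  bool     BoundCtrl; // Inst{51}
  uint8_t  BankMask;  // Inst{59-56}
  uint8_t  RowMask;   // Inst{63-60}
};

// Unpack the DPP-specific fields from a 64-bit VOP_DPPe word.
static DPPFields decodeDPP(uint64_t Inst) {
  DPPFields F;
  F.Src0      = (Inst >> 32) & 0xff;
  F.DppCtrl   = (Inst >> 40) & 0x1ff;
  F.BoundCtrl = (Inst >> 51) & 0x1;
  F.BankMask  = (Inst >> 56) & 0xf;
  F.RowMask   = (Inst >> 60) & 0xf;
  return F;
}
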
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index 1a7801c92bd7..5c490ab900f2 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -11,6 +11,8 @@
let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
+let DisableSIDecoder = 1 in {
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -52,9 +54,9 @@ defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16,
defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>;
defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>;
} // End isCommutable = 1
-defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">;
+defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16", VOP_MADMK>;
let isCommutable = 1 in {
-defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">;
+defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16", VOP_MADAK>;
defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>;
@@ -73,6 +75,16 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
} // End isCommutable = 1
defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+let isCommutable = 1 in {
+ defm V_MAD_F16 : VOP3Inst <vop3<0, 0x1ea>, "v_mad_f16", VOP_F16_F16_F16_F16>;
+ defm V_MAD_U16 : VOP3Inst <vop3<0, 0x1eb>, "v_mad_u16", VOP_I16_I16_I16_I16>;
+ defm V_MAD_I16 : VOP3Inst <vop3<0, 0x1ec>, "v_mad_i16", VOP_I16_I16_I16_I16>;
+}
+} // let DisableSIDecoder = 1
+
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
@@ -99,6 +111,9 @@ def S_DCACHE_WB : SMEM_Inval <0x21,
def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+def S_MEMREALTIME : SMEM_Ret<0x25,
+ "s_memrealtime", int_amdgcn_s_memrealtime>;
+
} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
let Predicates = [isVI] in {
@@ -109,4 +124,35 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
+//===----------------------------------------------------------------------===//
+// DPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl),
+ (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+ (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
+//===----------------------------------------------------------------------===//
+// Misc Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i64 (readcyclecounter)),
+ (S_MEMREALTIME)
+>;
+
+//===----------------------------------------------------------------------===//
+// DS_PERMUTE/DS_BPERMUTE Instructions.
+//===----------------------------------------------------------------------===//
+
+let Uses = [EXEC] in {
+defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32,
+ int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32,
+ int_amdgcn_ds_bpermute>;
+}
+
} // End Predicates = [isVI]
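
The new Misc pattern selects an i64 readcyclecounter down to S_MEMREALTIME on VI. For reference, Clang's __builtin_readcyclecounter() lowers to that intrinsic, so C/C++ along the following lines is the kind of source that would exercise the pattern; whether the builtin is exposed for a given AMDGPU toolchain is an assumption of this sketch, not something the patch states:

// Hypothetical helper: reads the free-running counter that s_memrealtime
// exposes once the pattern above is selected.
unsigned long long read_realtime_counter() {
  return __builtin_readcyclecounter();
}
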
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 7a1865ce5fd6..9228cc2d7a9c 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -68,34 +68,31 @@ namespace {
//
unsigned createDupLane(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg, unsigned Lane,
- bool QPR=false);
+ const DebugLoc &DL, unsigned Reg, unsigned Lane,
+ bool QPR = false);
unsigned createExtractSubreg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned DReg, unsigned Lane,
- const TargetRegisterClass *TRC);
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, const TargetRegisterClass *TRC);
unsigned createVExt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Ssub0, unsigned Ssub1);
+ const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1);
unsigned createRegSequence(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg1, unsigned Reg2);
+ const DebugLoc &DL, unsigned Reg1,
+ unsigned Reg2);
unsigned createInsertSubreg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL, unsigned DReg, unsigned Lane,
- unsigned ToInsert);
+ const DebugLoc &DL, unsigned DReg,
+ unsigned Lane, unsigned ToInsert);
unsigned createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL);
+ const DebugLoc &DL);
//
// Various property checkers
@@ -426,11 +423,10 @@ SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
}
// Creates a DPR register from an SPR one by using a VDUP.
-unsigned
-A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg, unsigned Lane, bool QPR) {
+unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg,
+ unsigned Lane, bool QPR) {
unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
&ARM::DPRRegClass);
AddDefaultPred(BuildMI(MBB,
@@ -445,12 +441,10 @@ A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
}
// Creates a SPR register from a DPR by copying the value in lane 0.
-unsigned
-A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned DReg, unsigned Lane,
- const TargetRegisterClass *TRC) {
+unsigned A15SDOptimizer::createExtractSubreg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg, unsigned Lane,
+ const TargetRegisterClass *TRC) {
unsigned Out = MRI->createVirtualRegister(TRC);
BuildMI(MBB,
InsertBefore,
@@ -462,11 +456,9 @@ A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
}
// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
-unsigned
-A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Reg1, unsigned Reg2) {
+unsigned A15SDOptimizer::createRegSequence(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
BuildMI(MBB,
InsertBefore,
@@ -481,11 +473,10 @@ A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
// and merges them into one DPR register.
-unsigned
-A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL,
- unsigned Ssub0, unsigned Ssub1) {
+unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned Ssub0,
+ unsigned Ssub1) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
AddDefaultPred(BuildMI(MBB,
InsertBefore,
@@ -497,11 +488,9 @@ A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
return Out;
}
-unsigned
-A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL, unsigned DReg, unsigned Lane,
- unsigned ToInsert) {
+unsigned A15SDOptimizer::createInsertSubreg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
BuildMI(MBB,
InsertBefore,
@@ -517,7 +506,7 @@ A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
unsigned
A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
- DebugLoc DL) {
+ const DebugLoc &DL) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB,
InsertBefore,
@@ -681,6 +670,9 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
}
bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
// Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
// enabled when NEON is available.
@@ -701,7 +693,7 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
MI != ME;) {
- Modified |= runOnInstruction(MI++);
+ Modified |= runOnInstruction(&*MI++);
}
}
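
The skipFunction guard added to runOnMachineFunction above is the standard opt-out hook for optional machine passes: it makes the pass respect 'optnone' and -opt-bisect-limit. A minimal sketch of the pattern in isolation (the pass name is illustrative, not part of the patch):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/Function.h"
using namespace llvm;

namespace {
struct ExampleMachinePass : MachineFunctionPass {
  static char ID;
  ExampleMachinePass() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Let the base class decide whether this function should be skipped
    // (optnone, or bisection via -opt-bisect-limit).
    if (skipFunction(*MF.getFunction()))
      return false;
    bool Modified = false;
    // ... transform MF, setting Modified whenever code changes ...
    return Modified;
  }
};
char ExampleMachinePass::ID = 0;
} // end anonymous namespace
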
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index cd7540e52410..690ff86a0c86 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -27,6 +27,7 @@ class FunctionPass;
class ImmutablePass;
class MachineInstr;
class MCInst;
+class PassRegistry;
class TargetLowering;
class TargetMachine;
@@ -45,6 +46,9 @@ FunctionPass *createThumb2SizeReductionPass(
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
+void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+
} // end namespace llvm;
#endif
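
The two initialize* declarations added to ARM.h are normally paired with INITIALIZE_PASS(...) in the corresponding .cpp files and a call from the target's initialization hook, so the passes are registered by name. A sketch of the consuming side only; the wrapper function is illustrative and assumes ARM.h is visible from the translation unit:

#include "ARM.h"                 // brings the initialize* declarations in
#include "llvm/PassRegistry.h"
using namespace llvm;

// Illustrative: register the load/store optimization passes eagerly so
// tools can refer to them by name (e.g. for pass printing/debugging).
static void registerARMLoadStorePasses() {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeARMLoadStoreOptPass(Registry);
  initializeARMPreAllocLoadStoreOptPass(Registry);
}
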
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c171656b48ab..ef626b66a1e7 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -76,6 +76,11 @@ def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
"Enable Thumb2 extract and pack instructions">;
def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
"Has data barrier (dmb / dsb) instructions">;
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+ "Has v7 clrex instruction">;
+def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
+ "HasAcquireRelease", "true",
+ "Has v8 acquire/release (lda/ldaex etc) instructions">;
def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
"FP compare + branch is slow">;
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
@@ -84,17 +89,98 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable support for Performance Monitor extensions">;
def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
"Enable support for TrustZone security extensions">;
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+ "Enable support for ARMv8-M Security Extensions">;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable support for Cryptography extensions",
[FeatureNEON]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable support for CRC instructions">;
+// Not to be confused with FeatureHasRetAddrStack (return address stack)
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable Reliability, Availability and Serviceability extensions">;
+
// Cyclone has preferred instructions for zeroing VFP registers, which can
// execute in 0 cycles.
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;
+// Whether or not it may be profitable to unpredicate certain instructions
+// during if conversion.
+def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
+ "IsProfitableToUnpredicate",
+ "true",
+ "Is profitable to unpredicate">;
+
+// Some targets (e.g. Swift) have microcoded VGETLNi32.
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+ "HasSlowVGETLNi32", "true",
+ "Has slow VGETLNi32 - prefer VMOV">;
+
+// Some targets (e.g. Swift) have microcoded VDUP32.
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
+ "Has slow VDUP32 - prefer VMOV">;
+
+// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
+// for scalar FP, as this allows more effective execution domain optimization.
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+ "true", "Prefer VMOVSR">;
+
+// Swift has ISHST barriers compatible with Atomic Release semantics but weaker
+// than ISH
+def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
+ "true", "Prefer ISHST barriers">;
+
+// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
+ "Has muxed AGU and NEON/FPU">;
+
+// On some targets, a VLDM/VSTM starting with an odd register number needs more
+// microops than single VLDRS instructions.
+def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
+ "true", "VLDM/VSTM starting with an odd register is slow">;
+
+// Some targets have a renaming dependency when loading into D subregisters.
+def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
+ "SlowLoadDSubregister", "true",
+ "Loading into D subregs is slow">;
+// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
+def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
+ "DontWidenVMOVS", "true",
+ "Don't widen VMOVS to VMOVD">;
+
+// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
+ "Expand VFP/NEON MLA/MLS instructions">;
+
+// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
+def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
+ "true", "Has VMLx hazards">;
+
+// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
+// VFP to NEON, as an execution domain optimization.
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
+ "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations. This affects instruction selection and should
+// only be enabled if the handling of denormals is not important.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+ "true",
+ "Use NEON for single precision FP">;
+
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+ "true",
+ "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+ "NonpipelinedVFP", "true",
+ "VFP instructions are not pipelined">;
+
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
// to just not use them.
@@ -106,12 +192,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
"HasVMLxForwarding", "true",
"Has multiplier accumulator forwarding">;
-// Some processors benefit from using NEON instructions for scalar
-// single-precision FP operations.
-def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
- "true",
- "Use NEON for single precision FP">;
-
// Disable 32-bit to 16-bit narrowing for experimentation.
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
@@ -130,7 +210,7 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
// Some processors perform return stack prediction. CodeGen should avoid issuing
// "normal" call instructions to callees which do not return.
-def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true",
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
/// DSP extension.
@@ -200,24 +280,31 @@ def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
"Support ARM v6M instructions",
[HasV6Ops]>;
+def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
+ "Support ARM v8M Baseline instructions",
+ [HasV6MOps]>;
def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true",
"Support ARM v6k instructions",
[HasV6Ops]>;
def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
"Support ARM v6t2 instructions",
- [HasV6MOps, HasV6KOps, FeatureThumb2]>;
+ [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
- [HasV6T2Ops, FeaturePerfMon]>;
+ [HasV6T2Ops, FeaturePerfMon,
+ FeatureV7Clrex]>;
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops]>;
+ [HasV7Ops, FeatureAcquireRelease]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
[HasV8Ops]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions",
[HasV8_1aOps]>;
+def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+ "Support ARM v8M Mainline instructions",
+ [HasV7Ops]>;
//===----------------------------------------------------------------------===//
@@ -238,6 +325,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
"Cortex-A15 ARM processors", []>;
def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
"Cortex-A17 ARM processors", []>;
+def ProcA32 : SubtargetFeature<"a32", "ARMProcFamily", "CortexA32",
+ "Cortex-A32 ARM processors", []>;
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", []>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
@@ -246,6 +335,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", []>;
def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", []>;
+def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+ "Cortex-A73 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm ARM processors", []>;
@@ -256,12 +347,14 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M1 processors", []>;
def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
- "Cortex-R4 ARM processors", []>;
+ "Cortex-R4 ARM processors", []>;
def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
"Cortex-R5 ARM processors", []>;
def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7",
"Cortex-R7 ARM processors", []>;
+def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
+ "Cortex-M3 ARM processors", []>;
//===----------------------------------------------------------------------===//
// ARM schedules.
@@ -374,7 +467,27 @@ def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
FeatureMP,
FeatureVirtualization,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeatureRAS]>;
+
+def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
+ [HasV8MBaselineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureV7Clrex,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
+
+def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
+ [HasV8MMainlineOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass]>;
// Aliases
def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>;
@@ -452,7 +565,7 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
// FIXME: A5 has currently the same Schedule model as A8
def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
@@ -462,9 +575,10 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
FeatureVFP4]>;
def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
FeatureVMLxForwarding,
FeatureT2XtPk,
@@ -475,25 +589,33 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureVirtualization]>;
def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureNonpipelinedVFP,
FeatureTrustZone,
FeatureSlowFPBrcc,
+ FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
FeatureVMLxForwarding,
FeatureT2XtPk]>;
def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
+ FeatureHasVMLxHazards,
FeatureVMLxForwarding,
FeatureT2XtPk,
FeatureFP16,
FeatureAvoidPartialCPSR,
+ FeatureExpandMLx,
+ FeaturePreferVMOVSR,
+ FeatureMuxedUnits,
+ FeatureNEONForFPMovs,
+ FeatureCheckVLDnAlign,
FeatureMP]>;
// FIXME: A12 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureVMLxForwarding,
FeatureT2XtPk,
@@ -506,11 +628,14 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
// FIXME: A15 has currently the same Schedule model as A9.
def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
- FeatureHasRAS,
+ FeatureDontWidenVMOVS,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
FeatureTrustZone,
FeatureT2XtPk,
FeatureVFP4,
FeatureMP,
+ FeatureCheckVLDnAlign,
FeatureHWDiv,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
@@ -518,7 +643,7 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
// FIXME: A17 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureMP,
FeatureVMLxForwarding,
@@ -533,7 +658,9 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
// division features.
def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureMuxedUnits,
+ FeatureCheckVLDnAlign,
FeatureVMLxForwarding,
FeatureT2XtPk,
FeatureFP16,
@@ -543,7 +670,7 @@ def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
FeatureHWDivARM]>;
def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureT2XtPk,
FeatureVFP4,
@@ -552,17 +679,24 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
- FeatureHasSlowFPVMLx]>;
+ FeatureHasSlowFPVMLx,
+ FeatureHasVMLxHazards,
+ FeatureProfUnpredicate,
+ FeaturePrefISHSTBarrier,
+ FeatureSlowOddRegister,
+ FeatureSlowLoadDSubreg,
+ FeatureSlowVGETLNi32,
+ FeatureSlowVDUP32]>;
// FIXME: R4 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
// FIXME: R4F has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
FeatureVFP3,
@@ -572,7 +706,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
// FIXME: R5 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureVFP3,
FeatureD16,
FeatureSlowFPBrcc,
@@ -583,9 +717,20 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureFP16,
+ FeatureMP,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
+
+def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
+ FeatureHasRetAddrStack,
FeatureVFP3,
- FeatureVFPOnlySP,
FeatureD16,
FeatureFP16,
FeatureMP,
@@ -595,8 +740,8 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m]>;
-def : ProcNoItin<"sc300", [ARMv7m]>;
+def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
+def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
def : ProcNoItin<"cortex-m4", [ARMv7em,
FeatureVFP4,
@@ -607,6 +752,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
FeatureD16]>;
+def : ProcNoItin<"cortex-a32", [ARMv8a,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35,
FeatureHWDiv,
@@ -636,9 +787,16 @@ def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
FeatureCrypto,
FeatureCRC]>;
+def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
+
// Cyclone is very similar to swift
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
- FeatureHasRAS,
+ FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureT2XtPk,
FeatureVFP4,
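
The block of new tuning features above follows the usual SubtargetFeature recipe: the first string is the -mattr name, the second names the ARMSubtarget field the feature sets, and call sites then query the subtarget instead of a global cl::opt knob (the WidenVMOVS cl::opt dropped from ARMBaseInstrInfo.cpp later in this patch appears to be superseded by the dont-widen-vmovs feature, for example). A hedged sketch of the consuming side; the accessor name is inferred from the HasSlowVDUP32 field and is an assumption of this example:

#include "ARMSubtarget.h"   // ARM-internal header; assumed visible here

// Illustrative only: branch on the per-CPU tuning bit rather than a
// command-line flag when deciding how to materialize a lane move.
static bool preferVMOVOverVDUP32(const llvm::ARMSubtarget &ST) {
  return ST.hasSlowVDUP32();
}
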
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 206db9619a2f..04863a7ecf8f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -43,12 +43,11 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ARMBuildAttributes.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -213,8 +212,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
GetARMGVSymbol(GV, TF)->print(O, MAI);
printOffset(MO.getOffset(), O);
- if (TF == ARMII::MO_PLT)
- O << "(PLT)";
break;
}
case MachineOperand::MO_ConstantPoolIndex:
@@ -516,9 +513,10 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->AddBlankLine();
}
- Stubs = MMIMacho.GetHiddenGVStubList();
+ Stubs = MMIMacho.GetThreadLocalGVStubList();
if (!Stubs.empty()) {
- OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+ // Switch with ".non_lazy_symbol_pointer" directive.
+ OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
EmitAlignment(2);
for (auto &Stub : Stubs)
@@ -536,18 +534,48 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
+ if (TT.isOSBinFormatCOFF()) {
+ const auto &TLOF =
+ static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
+
+ std::string Flags;
+ raw_string_ostream OS(Flags);
+
+ for (const auto &Function : M)
+ TLOF.emitLinkerFlagsForGlobal(OS, &Function, *Mang);
+ for (const auto &Global : M.globals())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Global, *Mang);
+ for (const auto &Alias : M.aliases())
+ TLOF.emitLinkerFlagsForGlobal(OS, &Alias, *Mang);
+
+ OS.flush();
+
+ // Output collected flags
+ if (!Flags.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDrectveSection());
+ OutStreamer->EmitBytes(Flags);
+ }
+ }
+
// The last attribute to be emitted is ABI_optimization_goals
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
if (OptimizationGoals > 0 &&
- (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI()))
+ (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+ Subtarget->isTargetMuslAEABI()))
ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals);
OptimizationGoals = -1;
ATS.finishAttributeSection();
}
+static bool isV8M(const ARMSubtarget *Subtarget) {
+ // Note that v8M Baseline is a subset of v6T2!
+ return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) ||
+ Subtarget->hasV8MMainlineOps();
+}
+
//===----------------------------------------------------------------------===//
// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
// FIXME:
@@ -561,13 +589,17 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
return ARMBuildAttrs::v5TEJ;
if (Subtarget->hasV8Ops())
- return ARMBuildAttrs::v8;
+ return ARMBuildAttrs::v8_A;
+ else if (Subtarget->hasV8MMainlineOps())
+ return ARMBuildAttrs::v8_M_Main;
else if (Subtarget->hasV7Ops()) {
if (Subtarget->isMClass() && Subtarget->hasDSP())
return ARMBuildAttrs::v7E_M;
return ARMBuildAttrs::v7;
} else if (Subtarget->hasV6T2Ops())
return ARMBuildAttrs::v6T2;
+ else if (Subtarget->hasV8MBaselineOps())
+ return ARMBuildAttrs::v8_M_Base;
else if (Subtarget->hasV6MOps())
return ARMBuildAttrs::v6S_M;
else if (Subtarget->hasV6Ops())
@@ -609,9 +641,9 @@ void ARMAsmPrinter::emitAttributes() {
static_cast<const ARMBaseTargetMachine &>(TM);
const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
- std::string CPUString = STI.getCPUString();
+ const std::string &CPUString = STI.getCPUString();
- if (CPUString.find("generic") != 0) { //CPUString doesn't start with "generic"
+ if (!StringRef(CPUString).startswith("generic")) {
// FIXME: remove krait check when GNU tools support krait cpu
if (STI.isKrait()) {
ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
@@ -627,7 +659,7 @@ void ARMAsmPrinter::emitAttributes() {
// Tag_CPU_arch_profile must have the default value of 0 when "Architecture
// profile is not applicable (e.g. pre v7, or cross-profile code)".
- if (STI.hasV7Ops()) {
+ if (STI.hasV7Ops() || isV8M(&STI)) {
if (STI.isAClass()) {
ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
ARMBuildAttrs::ApplicationProfile);
@@ -643,7 +675,10 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
STI.hasARMOps() ? ARMBuildAttrs::Allowed
: ARMBuildAttrs::Not_Allowed);
- if (STI.isThumb1Only()) {
+ if (isV8M(&STI)) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumbDerived);
+ } else if (STI.isThumb1Only()) {
ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
} else if (STI.hasThumb2()) {
ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
@@ -690,7 +725,7 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitFPU(ARM::FK_VFPV2);
}
- if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (isPositionIndependent()) {
// PIC specific attributes.
ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
ARMBuildAttrs::AddressRWPCRel);
@@ -794,6 +829,9 @@ void ARMAsmPrinter::emitAttributes() {
if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+ if (STI.hasDSP() && isV8M(&STI))
+ ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
+
if (MMI) {
if (const Module *SourceModule = MMI->getModule()) {
// ABI_PCS_wchar_t to indicate wchar_t width
@@ -853,11 +891,18 @@ static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
static MCSymbolRefExpr::VariantKind
getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
switch (Modifier) {
- case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
- case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
- case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF;
- case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF;
- case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+ case ARMCP::no_modifier:
+ return MCSymbolRefExpr::VK_None;
+ case ARMCP::TLSGD:
+ return MCSymbolRefExpr::VK_TLSGD;
+ case ARMCP::TPOFF:
+ return MCSymbolRefExpr::VK_TPOFF;
+ case ARMCP::GOTTPOFF:
+ return MCSymbolRefExpr::VK_GOTTPOFF;
+ case ARMCP::GOT_PREL:
+ return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+ case ARMCP::SECREL:
+ return MCSymbolRefExpr::VK_SECREL;
}
llvm_unreachable("Invalid ARMCPModifier!");
}
@@ -865,8 +910,8 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
unsigned char TargetFlags) {
if (Subtarget->isTargetMachO()) {
- bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) &&
- Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+ bool IsIndirect =
+ (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV);
if (!IsIndirect)
return getSymbol(GV);
@@ -876,8 +921,9 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
MachineModuleInfoMachO &MMIMachO =
MMI->getObjFileInfo<MachineModuleInfoMachO>();
MachineModuleInfoImpl::StubValueTy &StubSym =
- GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym)
- : MMIMachO.getGVStubEntry(MCSym);
+ GV->isThreadLocal() ? MMIMachO.getThreadLocalGVStubEntry(MCSym)
+ : MMIMachO.getGVStubEntry(MCSym);
+
if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
!GV->hasInternalLinkage());
@@ -991,7 +1037,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
// .word (LBB1 - LJTI_0_0)
const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- if (TM.getRelocationModel() == Reloc::PIC_)
+ if (isPositionIndependent())
Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol,
OutContext),
OutContext);
@@ -1227,6 +1273,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const DataLayout &DL = getDataLayout();
+ MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -1643,29 +1691,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
if (!Subtarget->isTargetMachO()) {
- //.long 0xe7ffdefe @ trap
uint32_t Val = 0xe7ffdefeUL;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 4);
+ ATS.emitInst(Val);
return;
}
break;
}
case ARM::TRAPNaCl: {
- //.long 0xe7fedef0 @ trap
uint32_t Val = 0xe7fedef0UL;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 4);
+ ATS.emitInst(Val);
return;
}
case ARM::tTRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
if (!Subtarget->isTargetMachO()) {
- //.short 57086 @ trap
uint16_t Val = 0xdefe;
OutStreamer->AddComment("trap");
- OutStreamer->EmitIntValue(Val, 2);
+ ATS.emitInst(Val, 'n');
return;
}
break;
@@ -1845,6 +1890,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// bx $scratch
unsigned SrcReg = MI->getOperand(0).getReg();
unsigned ScratchReg = MI->getOperand(1).getReg();
+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
.addReg(SrcReg)
@@ -1885,6 +1931,36 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(0));
return;
}
+ case ARM::tInt_WIN_eh_sjlj_longjmp: {
+ // ldr.w r11, [$src, #0]
+ // ldr.w sp, [$src, #8]
+ // ldr.w pc, [$src, #4]
+
+ unsigned SrcReg = MI->getOperand(0).getReg();
+
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::R11)
+ .addReg(SrcReg)
+ .addImm(0)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::SP)
+ .addReg(SrcReg)
+ .addImm(8)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+ .addReg(ARM::PC)
+ .addReg(SrcReg)
+ .addImm(4)
+ // Predicate
+ .addImm(ARMCC::AL)
+ .addReg(0));
+ return;
+ }
}
MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index ed7be2de51ca..97f5ca0ecbc2 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -95,6 +95,7 @@ public:
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
private:
+
// Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
void emitAttributes();
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 49f328852667..693f16499717 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -51,15 +51,6 @@ static cl::opt<bool>
EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
cl::desc("Enable ARM 2-addr to 3-addr conv"));
-static cl::opt<bool>
-WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
- cl::desc("Widen ARM vmovs to vmovd when possible"));
-
-static cl::opt<unsigned>
-SwiftPartialUpdateClearance("swift-partial-update-clearance",
- cl::Hidden, cl::init(12),
- cl::desc("Clearance before partial register updates"));
-
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
uint16_t MLxOpc; // MLA / MLS opcode
@@ -124,18 +115,15 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
-MachineInstr *
-ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
+MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
// FIXME: Thumb2 support.
if (!EnableARM3Addr)
return nullptr;
- MachineInstr *MI = MBBI;
- MachineFunction &MF = *MI->getParent()->getParent();
- uint64_t TSFlags = MI->getDesc().TSFlags;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ uint64_t TSFlags = MI.getDesc().TSFlags;
bool isPre = false;
switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
default: return nullptr;
@@ -148,24 +136,24 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Try splitting an indexed load/store to an un-indexed one plus an add/sub
// operation.
- unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+ unsigned MemOpc = getUnindexedOpcode(MI.getOpcode());
if (MemOpc == 0)
return nullptr;
MachineInstr *UpdateMI = nullptr;
MachineInstr *MemMI = nullptr;
unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
unsigned NumOps = MCID.getNumOperands();
- bool isLoad = !MI->mayStore();
- const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
- const MachineOperand &Base = MI->getOperand(2);
- const MachineOperand &Offset = MI->getOperand(NumOps-3);
+ bool isLoad = !MI.mayStore();
+ const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(2);
+ const MachineOperand &Offset = MI.getOperand(NumOps - 3);
unsigned WBReg = WB.getReg();
unsigned BaseReg = Base.getReg();
unsigned OffReg = Offset.getReg();
- unsigned OffImm = MI->getOperand(NumOps-2).getImm();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+ unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
switch (AddrMode) {
default: llvm_unreachable("Unknown indexed op!");
case ARMII::AddrMode2: {
@@ -176,22 +164,33 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Can't encode it in a so_imm operand. This transformation will
// add more than 1 instruction. Abandon!
return nullptr;
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
} else if (Amt != 0) {
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
- .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addReg(0)
+ .addImm(SOOpc)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
} else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
break;
}
case ARMII::AddrMode3 : {
@@ -199,15 +198,21 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Amt = ARM_AM::getAM3Offset(OffImm);
if (OffReg == 0)
// Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addImm(Amt)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
+ .addReg(BaseReg)
+ .addReg(OffReg)
+ .addImm(Pred)
+ .addReg(0)
+ .addReg(0);
break;
}
}
@@ -215,24 +220,34 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
std::vector<MachineInstr*> NewMIs;
if (isPre) {
if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(WBReg).addImm(0).addImm(Pred);
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(WBReg)
+ .addImm(0)
+ .addImm(Pred);
else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(WBReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
NewMIs.push_back(MemMI);
NewMIs.push_back(UpdateMI);
} else {
if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(BaseReg).addImm(0).addImm(Pred);
+ MemMI =
+ BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+ .addReg(BaseReg)
+ .addImm(0)
+ .addImm(Pred);
else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(BaseReg)
+ .addReg(0)
+ .addImm(0)
+ .addImm(Pred);
if (WB.isDead())
UpdateMI->getOperand(0).setIsDead();
NewMIs.push_back(UpdateMI);
@@ -241,8 +256,8 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Transfer LiveVariables states, kill / dead info.
if (LV) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
unsigned Reg = MO.getReg();
@@ -250,7 +265,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (MO.isDef()) {
MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
if (MO.isDead())
- LV->addVirtualRegisterDead(Reg, NewMI);
+ LV->addVirtualRegisterDead(Reg, *NewMI);
}
if (MO.isUse() && MO.isKill()) {
for (unsigned j = 0; j < 2; ++j) {
@@ -258,7 +273,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr *NewMI = NewMIs[j];
if (!NewMI->readsRegister(Reg))
continue;
- LV->addVirtualRegisterKilled(Reg, NewMI);
+ LV->addVirtualRegisterKilled(Reg, *NewMI);
if (VI.removeKill(MI))
VI.Kills.push_back(NewMI);
break;
@@ -268,17 +283,18 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
}
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
MFI->insert(MBBI, NewMIs[1]);
MFI->insert(MBBI, NewMIs[0]);
return NewMIs[0];
}
// Branch analysis.
-bool
-ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
TBB = nullptr;
FBB = nullptr;
@@ -289,7 +305,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// Walk backwards from the end of the basic block until the branch is
// analyzed or we give up.
- while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
+ while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
// Flag to be raised on unanalyzeable instructions. This is useful in cases
// where we want to clean up on the end of the basic block before we bail
@@ -322,7 +338,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(I->getOperand(2));
} else if (I->isReturn()) {
// Returns can't be analyzed, but we should run cleanup.
- CantAnalyze = !isPredicated(I);
+ CantAnalyze = !isPredicated(*I);
} else {
// We encountered other unrecognized terminator. Bail out immediately.
return true;
@@ -330,7 +346,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// Cleanup code - to be run for unpredicated unconditional branches and
// returns.
- if (!isPredicated(I) &&
+ if (!isPredicated(*I) &&
(isUncondBranchOpcode(I->getOpcode()) ||
isIndirectBranchOpcode(I->getOpcode()) ||
isJumpTableBranchOpcode(I->getOpcode()) ||
@@ -344,9 +360,9 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
if (AllowModify) {
MachineBasicBlock::iterator DI = std::next(I);
while (DI != MBB.end()) {
- MachineInstr *InstToDelete = DI;
+ MachineInstr &InstToDelete = *DI;
++DI;
- InstToDelete->eraseFromParent();
+ InstToDelete.eraseFromParent();
}
}
}
@@ -390,11 +406,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-unsigned
-ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
int BOpc = !AFI->isThumbFunction()
? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
@@ -438,10 +454,10 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
- if (MI->isBundle()) {
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
+ if (MI.isBundle()) {
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
int PIdx = I->findFirstPredOperandIdx();
if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
@@ -450,26 +466,26 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
return false;
}
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+ int PIdx = MI.findFirstPredOperandIdx();
+ return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
}
-bool ARMBaseInstrInfo::
-PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
- unsigned Opc = MI->getOpcode();
+bool ARMBaseInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+ unsigned Opc = MI.getOpcode();
if (isUncondBranchOpcode(Opc)) {
- MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ MI.setDesc(get(getMatchingCondBranchOpcode(Opc)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.addReg(Pred[1].getReg());
return true;
}
- int PIdx = MI->findFirstPredOperandIdx();
+ int PIdx = MI.findFirstPredOperandIdx();
if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
+ MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setImm(Pred[0].getImm());
- MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+ MI.getOperand(PIdx+1).setReg(Pred[1].getReg());
return true;
}
return false;
@@ -501,11 +517,11 @@ bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
}
}
-bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
+bool ARMBaseInstrInfo::DefinesPredicate(
+ MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
bool Found = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
(MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
Pred.push_back(MO);
@@ -555,21 +571,21 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
/// isPredicable - Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
- if (!MI->isPredicable())
+bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+ if (!MI.isPredicable())
return false;
- if (!isEligibleForITBlock(MI))
+ if (!isEligibleForITBlock(&MI))
return false;
ARMFunctionInfo *AFI =
- MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+ MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
if (AFI->isThumb2Function()) {
if (getSubtarget().restrictIT())
- return isV8EligibleForIT(MI);
+ return isV8EligibleForIT(&MI);
} else { // non-Thumb
- if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+ if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
return false;
}
@@ -594,19 +610,19 @@ template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
/// GetInstSize - Return the size of the specified MachineInstr.
///
-unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (MCID.getSize())
return MCID.getSize();
// If this machine instr is an inline asm, measure it.
- if (MI->getOpcode() == ARM::INLINEASM)
- return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
- unsigned Opc = MI->getOpcode();
+ if (MI.getOpcode() == ARM::INLINEASM)
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
// pseudo-instruction sizes are zero.
@@ -628,11 +644,13 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case ARM::JUMPTABLE_TBH:
// If this machine instr is a constant pool entry, its size is recorded as
// operand #2.
- return MI->getOperand(2).getImm();
+ return MI.getOperand(2).getImm();
case ARM::Int_eh_sjlj_longjmp:
return 16;
case ARM::tInt_eh_sjlj_longjmp:
return 10;
+ case ARM::tInt_WIN_eh_sjlj_longjmp:
+ return 12;
case ARM::Int_eh_sjlj_setjmp:
case ARM::Int_eh_sjlj_setjmp_nofp:
return 20;
@@ -641,17 +659,17 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case ARM::t2Int_eh_sjlj_setjmp_nofp:
return 12;
case ARM::SPACE:
- return MI->getOperand(1).getImm();
+ return MI.getOperand(1).getImm();
}
}
-unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const {
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const {
unsigned Size = 0;
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
assert(!I->isBundle() && "No nested bundle!");
- Size += GetInstSizeInBytes(&*I);
+ Size += GetInstSizeInBytes(*I);
}
return Size;
}
@@ -700,9 +718,9 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
}
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
bool GPRDest = ARM::GPRRegClass.contains(DestReg);
bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
@@ -976,20 +994,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-unsigned
-ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
+unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::STRrs:
case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+ MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+ MI.getOperand(3).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::STRi12:
@@ -997,27 +1012,24 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case ARM::tSTRspi:
case ARM::VSTRD:
case ARM::VSTRS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VST1q64:
case ARM::VST1d64TPseudo:
case ARM::VST1d64QPseudo:
- if (MI->getOperand(0).isFI() &&
- MI->getOperand(2).getSubReg() == 0) {
- FrameIndex = MI->getOperand(0).getIndex();
- return MI->getOperand(2).getReg();
+ if (MI.getOperand(0).isFI() && MI.getOperand(2).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
}
break;
case ARM::VSTMQIA:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1025,10 +1037,10 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
const MachineMemOperand *Dummy;
- return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
}
void ARMBaseInstrInfo::
@@ -1164,20 +1176,17 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-unsigned
-ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
+unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame.
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+ MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+ MI.getOperand(3).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::LDRi12:
@@ -1185,27 +1194,24 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case ARM::tLDRspi:
case ARM::VLDRD:
case ARM::VLDRS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VLD1q64:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64QPseudo:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
case ARM::VLDMQIA:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(0).getSubReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
@@ -1213,20 +1219,19 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return 0;
}
-unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const {
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
const MachineMemOperand *Dummy;
- return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
/// depending on whether the result is used.
-void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
bool isThumb1 = Subtarget.isThumb1Only();
bool isThumb2 = Subtarget.isThumb2();
const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
- MachineInstr *MI = MBBI;
DebugLoc dl = MI->getDebugLoc();
MachineBasicBlock *BB = MI->getParent();
@@ -1269,24 +1274,20 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
STM.addReg(Reg, RegState::Kill);
}
- BB->erase(MBBI);
+ BB->erase(MI);
}
-bool
-ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineFunction &MF = *MI->getParent()->getParent();
- Reloc::Model RM = MF.getTarget().getRelocationModel();
-
- if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
"LOAD_STACK_GUARD currently supported only for MachO.");
- expandLoadStackGuard(MI, RM);
- MI->getParent()->erase(MI);
+ expandLoadStackGuard(MI);
+ MI.getParent()->erase(MI);
return true;
}
- if (MI->getOpcode() == ARM::MEMCPY) {
+ if (MI.getOpcode() == ARM::MEMCPY) {
expandMEMCPY(MI);
return true;
}
@@ -1295,14 +1296,13 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// copyPhysReg() calls. Look for VMOVS instructions that can legally be
// widened to VMOVD. We prefer the VMOVD when possible because it may be
// changed into a VORR that can go down the NEON pipeline.
- if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15() ||
- Subtarget.isFPOnlySP())
+ if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP())
return false;
// Look for a copy between even S-registers. That is where we keep floats
// when using NEON v2f32 instructions for f32 arithmetic.
- unsigned DstRegS = MI->getOperand(0).getReg();
- unsigned SrcRegS = MI->getOperand(1).getReg();
+ unsigned DstRegS = MI.getOperand(0).getReg();
+ unsigned SrcRegS = MI.getOperand(1).getReg();
if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
return false;
@@ -1317,44 +1317,44 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only
// legal if the COPY already defines the full DstRegD, and it isn't a
// sub-register insertion.
- if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+ if (!MI.definesRegister(DstRegD, TRI) || MI.readsRegister(DstRegD, TRI))
return false;
// A dead copy shouldn't show up here, but reject it just in case.
- if (MI->getOperand(0).isDead())
+ if (MI.getOperand(0).isDead())
return false;
// All clear, widen the COPY.
- DEBUG(dbgs() << "widening: " << *MI);
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ DEBUG(dbgs() << "widening: " << MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
// Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg
// or some other super-register.
- int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+ int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD);
if (ImpDefIdx != -1)
- MI->RemoveOperand(ImpDefIdx);
+ MI.RemoveOperand(ImpDefIdx);
// Change the opcode and operands.
- MI->setDesc(get(ARM::VMOVD));
- MI->getOperand(0).setReg(DstRegD);
- MI->getOperand(1).setReg(SrcRegD);
+ MI.setDesc(get(ARM::VMOVD));
+ MI.getOperand(0).setReg(DstRegD);
+ MI.getOperand(1).setReg(SrcRegD);
AddDefaultPred(MIB);
// We are now reading SrcRegD instead of SrcRegS. This may upset the
// register scavenger and machine verifier, so we need to indicate that we
// are reading an undefined value from SrcRegD, but a proper value from
// SrcRegS.
- MI->getOperand(1).setIsUndef();
+ MI.getOperand(1).setIsUndef();
MIB.addReg(SrcRegS, RegState::Implicit);
// SrcRegD may actually contain an unrelated value in the ssub_1
// sub-register. Don't kill it. Only kill the ssub_0 sub-register.
- if (MI->getOperand(1).isKill()) {
- MI->getOperand(1).setIsKill(false);
- MI->addRegisterKilled(SrcRegS, TRI, true);
+ if (MI.getOperand(1).isKill()) {
+ MI.getOperand(1).setIsKill(false);
+ MI.addRegisterKilled(SrcRegS, TRI, true);
}
- DEBUG(dbgs() << "replaced by: " << *MI);
+ DEBUG(dbgs() << "replaced by: " << MI);
return true;
}
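The widening above relies on the VFP register file layout: each D-register is the concatenation of two consecutive S-registers, with the even one in ssub_0 and the odd one in ssub_1. A standalone sketch of the index arithmetic behind the even-S-register restriction (plain C++, no LLVM dependencies; the helper names are made up for illustration):

#include <cstdio>
#include <initializer_list>

// S(2k) and S(2k+1) are the ssub_0 / ssub_1 halves of D(k), so only a copy
// whose destination is an even S-register corresponds to the low half of a
// D-register and can be widened to a full VMOVD without clobbering the
// neighbouring lane.
static unsigned containingDReg(unsigned SRegIdx) { return SRegIdx / 2; }
static bool isSsub0(unsigned SRegIdx) { return (SRegIdx % 2) == 0; }

int main() {
  for (unsigned S : {0u, 1u, 4u, 7u})
    std::printf("S%u is ssub_%u of D%u%s\n", S, S % 2, containingDReg(S),
                isSsub0(S) ? " (widenable destination)" : "");
  return 0;
}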
@@ -1403,54 +1403,54 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
return PCLabelId;
}
-void ARMBaseInstrInfo::
-reMaterialize(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
- const TargetRegisterInfo &TRI) const {
- unsigned Opcode = Orig->getOpcode();
+void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const {
+ unsigned Opcode = Orig.getOpcode();
switch (Opcode) {
default: {
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
- MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
MBB.insert(I, MI);
break;
}
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
MachineFunction &MF = *MBB.getParent();
- unsigned CPI = Orig->getOperand(1).getIndex();
+ unsigned CPI = Orig.getOperand(1).getIndex();
unsigned PCLabelId = duplicateCPV(MF, CPI);
- MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
- DestReg)
- .addConstantPoolIndex(CPI).addImm(PCLabelId);
- MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg)
+ .addConstantPoolIndex(CPI)
+ .addImm(PCLabelId);
+ MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end());
break;
}
}
}
-MachineInstr *
-ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
+MachineInstr *ARMBaseInstrInfo::duplicate(MachineInstr &Orig,
+ MachineFunction &MF) const {
MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF);
- switch(Orig->getOpcode()) {
+ switch (Orig.getOpcode()) {
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
- unsigned CPI = Orig->getOperand(1).getIndex();
+ unsigned CPI = Orig.getOperand(1).getIndex();
unsigned PCLabelId = duplicateCPV(MF, CPI);
- Orig->getOperand(1).setIndex(CPI);
- Orig->getOperand(2).setImm(PCLabelId);
+ Orig.getOperand(1).setIndex(CPI);
+ Orig.getOperand(2).setImm(PCLabelId);
break;
}
}
return MI;
}
-bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
- const MachineInstr *MI1,
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
+ const MachineInstr &MI1,
const MachineRegisterInfo *MRI) const {
- unsigned Opcode = MI0->getOpcode();
+ unsigned Opcode = MI0.getOpcode();
if (Opcode == ARM::t2LDRpci ||
Opcode == ARM::t2LDRpci_pic ||
Opcode == ARM::tLDRpci ||
@@ -1461,13 +1461,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
Opcode == ARM::MOV_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel_ldr ||
Opcode == ARM::t2MOV_ga_pcrel) {
- if (MI1->getOpcode() != Opcode)
+ if (MI1.getOpcode() != Opcode)
return false;
- if (MI0->getNumOperands() != MI1->getNumOperands())
+ if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- const MachineOperand &MO0 = MI0->getOperand(1);
- const MachineOperand &MO1 = MI1->getOperand(1);
+ const MachineOperand &MO0 = MI0.getOperand(1);
+ const MachineOperand &MO1 = MI1.getOperand(1);
if (MO0.getOffset() != MO1.getOffset())
return false;
@@ -1480,7 +1480,7 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
// Ignore the PC labels.
return MO0.getGlobal() == MO1.getGlobal();
- const MachineFunction *MF = MI0->getParent()->getParent();
+ const MachineFunction *MF = MI0.getParent()->getParent();
const MachineConstantPool *MCP = MF->getConstantPool();
int CPI0 = MO0.getIndex();
int CPI1 = MO1.getIndex();
@@ -1499,13 +1499,13 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
}
return false;
} else if (Opcode == ARM::PICLDR) {
- if (MI1->getOpcode() != Opcode)
+ if (MI1.getOpcode() != Opcode)
return false;
- if (MI0->getNumOperands() != MI1->getNumOperands())
+ if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- unsigned Addr0 = MI0->getOperand(1).getReg();
- unsigned Addr1 = MI1->getOperand(1).getReg();
+ unsigned Addr0 = MI0.getOperand(1).getReg();
+ unsigned Addr1 = MI1.getOperand(1).getReg();
if (Addr0 != Addr1) {
if (!MRI ||
!TargetRegisterInfo::isVirtualRegister(Addr0) ||
@@ -1517,21 +1517,21 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
MachineInstr *Def1 = MRI->getVRegDef(Addr1);
    // Check if the loaded values, e.g. a constant pool entry or a global address, are
    // the same.
- if (!produceSameValue(Def0, Def1, MRI))
+ if (!produceSameValue(*Def0, *Def1, MRI))
return false;
}
- for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
// %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
- const MachineOperand &MO0 = MI0->getOperand(i);
- const MachineOperand &MO1 = MI1->getOperand(i);
+ const MachineOperand &MO0 = MI0.getOperand(i);
+ const MachineOperand &MO1 = MI1.getOperand(i);
if (!MO0.isIdenticalTo(MO1))
return false;
}
return true;
}
- return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+ return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
}
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
@@ -1653,7 +1653,7 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
// Debug info is never a scheduling boundary. It's necessary to be explicit
@@ -1662,11 +1662,11 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return false;
// Terminators and labels can't be scheduled around.
- if (MI->isTerminator() || MI->isPosition())
+ if (MI.isTerminator() || MI.isPosition())
return true;
// Treat the start of the IT block as a scheduling boundary, but schedule
@@ -1690,7 +1690,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
// Calls don't actually change the stack pointer, even if they have imp-defs.
// No ARM calling conventions change the stack pointer. (X86 calling
// conventions sometimes do).
- if (!MI->isCall() && MI->definesRegister(ARM::SP))
+ if (!MI.isCall() && MI.definesRegister(ARM::SP))
return true;
return false;
@@ -1718,7 +1718,7 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
CmpMI->getOpcode() == ARM::t2CMPri) {
unsigned Reg = CmpMI->getOperand(0).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes P = getInstrPredicate(CmpMI, PredReg);
+ ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg);
if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg))
return false;
@@ -1765,24 +1765,24 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
bool
ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const {
- // Reduce false anti-dependencies to let Swift's out-of-order execution
+ // Reduce false anti-dependencies to let the target's out-of-order execution
// engine do its thing.
- return Subtarget.isSwift();
+ return Subtarget.isProfitableToUnpredicate();
}
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes
-llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
- int PIdx = MI->findFirstPredOperandIdx();
+ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ int PIdx = MI.findFirstPredOperandIdx();
if (PIdx == -1) {
PredReg = 0;
return ARMCC::AL;
}
- PredReg = MI->getOperand(PIdx+1).getReg();
- return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+ PredReg = MI.getOperand(PIdx+1).getReg();
+ return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
}
@@ -1797,11 +1797,11 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
llvm_unreachable("Unknown unconditional branch opcode!");
}
-MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case ARM::MOVCCr:
case ARM::t2MOVCCr: {
// MOVCC can be commuted by inverting the condition.
@@ -1810,13 +1810,14 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL || PredReg != ARM::CPSR)
return nullptr;
- MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
- if (!MI)
+ MachineInstr *CommutedMI =
+ TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ if (!CommutedMI)
return nullptr;
// After swapping the MOVCC operands, also invert the condition.
- MI->getOperand(MI->findFirstPredOperandIdx())
- .setImm(ARMCC::getOppositeCondition(CC));
- return MI;
+ CommutedMI->getOperand(CommutedMI->findFirstPredOperandIdx())
+ .setImm(ARMCC::getOppositeCondition(CC));
+ return CommutedMI;
}
}
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
@@ -1860,11 +1861,11 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
return MI;
}
-bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr &MI,
SmallVectorImpl<MachineOperand> &Cond,
unsigned &TrueOp, unsigned &FalseOp,
bool &Optimizable) const {
- assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
// MOVCC operands:
// 0: Def.
@@ -1874,38 +1875,38 @@ bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
// 4: CPSR use.
TrueOp = 1;
FalseOp = 2;
- Cond.push_back(MI->getOperand(3));
- Cond.push_back(MI->getOperand(4));
+ Cond.push_back(MI.getOperand(3));
+ Cond.push_back(MI.getOperand(4));
// We can always fold a def.
Optimizable = true;
return false;
}
MachineInstr *
-ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
+ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &SeenMIs,
bool PreferFalse) const {
- assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+ assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = canFoldIntoMOVCC(MI.getOperand(2).getReg(), MRI, this);
bool Invert = !DefMI;
if (!DefMI)
- DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+ DefMI = canFoldIntoMOVCC(MI.getOperand(1).getReg(), MRI, this);
if (!DefMI)
return nullptr;
// Find new register class to use.
- MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
- unsigned DestReg = MI->getOperand(0).getReg();
+ MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
+ unsigned DestReg = MI.getOperand(0).getReg();
const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
if (!MRI.constrainRegClass(DestReg, PreviousClass))
return nullptr;
// Create a new predicated version of DefMI.
// Rfalse is the first use.
- MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- DefMI->getDesc(), DestReg);
+ MachineInstrBuilder NewMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
// Copy all the DefMI operands, excluding its (null) predicate.
const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1913,12 +1914,12 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
NewMI.addOperand(DefMI->getOperand(i));
- unsigned CondCode = MI->getOperand(3).getImm();
+ unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
else
NewMI.addImm(CondCode);
- NewMI.addOperand(MI->getOperand(4));
+ NewMI.addOperand(MI.getOperand(4));
// DefMI is not the -S version that sets CPSR, so add an optional %noreg.
if (NewMI->hasOptionalDef())
@@ -1940,7 +1941,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
  // DefMI would be invalid when transferred inside the loop. Checking for a
// loop is expensive, but at least remove kill flags if they are in different
// BBs.
- if (DefMI->getParent() != MI->getParent())
+ if (DefMI->getParent() != MI.getParent())
NewMI->clearKillInfo();
// The caller will erase MI, but not DefMI.
@@ -1994,10 +1995,12 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
}
void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
.addReg(BaseReg, RegState::Kill)
@@ -2281,30 +2284,30 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool ARMBaseInstrInfo::
-analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
- int &CmpMask, int &CmpValue) const {
- switch (MI->getOpcode()) {
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
default: break;
case ARM::CMPri:
case ARM::t2CMPri:
- SrcReg = MI->getOperand(0).getReg();
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(1).getImm();
+ CmpValue = MI.getOperand(1).getImm();
return true;
case ARM::CMPrr:
case ARM::t2CMPrr:
- SrcReg = MI->getOperand(0).getReg();
- SrcReg2 = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case ARM::TSTri:
case ARM::t2TSTri:
- SrcReg = MI->getOperand(0).getReg();
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
- CmpMask = MI->getOperand(1).getImm();
+ CmpMask = MI.getOperand(1).getImm();
CmpValue = 0;
return true;
}
@@ -2385,25 +2388,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
/// condition code of instructions which use the flags.
-bool ARMBaseInstrInfo::
-optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
- int CmpMask, int CmpValue,
- const MachineRegisterInfo *MRI) const {
+bool ARMBaseInstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
// Masked compares sometimes use the same register as the corresponding 'and'.
if (CmpMask != ~0) {
- if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
+ if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(*MI)) {
MI = nullptr;
for (MachineRegisterInfo::use_instr_iterator
UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
UI != UE; ++UI) {
- if (UI->getParent() != CmpInstr->getParent()) continue;
+ if (UI->getParent() != CmpInstr.getParent())
+ continue;
MachineInstr *PotentialAND = &*UI;
if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
- isPredicated(PotentialAND))
+ isPredicated(*PotentialAND))
continue;
MI = PotentialAND;
break;
@@ -2414,7 +2417,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Get ready to iterate backward from CmpInstr.
MachineBasicBlock::iterator I = CmpInstr, E = MI,
- B = CmpInstr->getParent()->begin();
+ B = CmpInstr.getParent()->begin();
// Early exit if CmpInstr is at the beginning of the BB.
if (I == B) return false;
@@ -2427,13 +2430,13 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (SrcReg2 != 0)
// MI is not a candidate for CMPrr.
MI = nullptr;
- else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
+ else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
// Conservatively refuse to convert an instruction which isn't in the same
// BB as the comparison.
// For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
// Thus we cannot return here.
- if (CmpInstr->getOpcode() == ARM::CMPri ||
- CmpInstr->getOpcode() == ARM::t2CMPri)
+ if (CmpInstr.getOpcode() == ARM::CMPri ||
+ CmpInstr.getOpcode() == ARM::t2CMPri)
MI = nullptr;
else
return false;
@@ -2453,7 +2456,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return false;
// Check whether CmpInstr can be made redundant by the current instruction.
- if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
Sub = &*I;
break;
}
@@ -2471,7 +2474,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (!MI) MI = Sub;
// We can't use a predicated instruction - it doesn't always write the flags.
- if (isPredicated(MI))
+ if (isPredicated(*MI))
return false;
switch (MI->getOpcode()) {
@@ -2519,7 +2522,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
OperandsToUpdate;
bool isSafe = false;
I = CmpInstr;
- E = CmpInstr->getParent()->end();
+ E = CmpInstr.getParent()->end();
while (!isSafe && ++I != E) {
const MachineInstr &Instr = *I;
for (unsigned IO = 0, EO = Instr.getNumOperands();
@@ -2608,7 +2611,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
  // If CPSR is neither killed nor re-defined, we should check whether it is
// live-out. If it is live-out, do not optimize.
if (!isSafe) {
- MachineBasicBlock *MBB = CmpInstr->getParent();
+ MachineBasicBlock *MBB = CmpInstr.getParent();
for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
SE = MBB->succ_end(); SI != SE; ++SI)
if ((*SI)->isLiveIn(ARM::CPSR))
@@ -2618,8 +2621,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Toggle the optional operand to CPSR.
MI->getOperand(5).setReg(ARM::CPSR);
MI->getOperand(5).setIsDef(true);
- assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
- CmpInstr->eraseFromParent();
+ assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+ CmpInstr.eraseFromParent();
// Modify the condition code of operands in OperandsToUpdate.
// Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
@@ -2633,42 +2636,42 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return false;
}
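When the flag-setting SUB and the CMP have their operands in opposite order, the CPSR users are rewritten with the swapped condition (GE<->LE, GT<->LT, HI<->LO, HS<->LS; EQ and NE are symmetric). A standalone sketch of that mapping, assuming only the C++ standard library (the in-tree helper is ARMCC::getSwappedCondition; this sketch is illustrative only):

#include <cstdio>

enum Cond { EQ, NE, HS, LO, HI, LS, GE, LT, GT, LE };

// Condition to use once the compared operands are swapped: "x OP y" holds
// exactly when "y swapped(OP) x" holds.
static Cond swappedCond(Cond C) {
  switch (C) {
  case GE: return LE;
  case LE: return GE;
  case GT: return LT;
  case LT: return GT;
  case HI: return LO;
  case LO: return HI;
  case HS: return LS;
  case LS: return HS;
  default: return C; // EQ and NE are unaffected by operand order.
  }
}

int main() {
  // e.g. "sub r0, r1, r2 ; cmp r2, r1 ; movge ..." can drop the cmp by
  // using "subs r0, r1, r2" and predicating the user with LE instead.
  std::printf("swapped(GE) = %s\n", swappedCond(GE) == LE ? "LE" : "?");
  return 0;
}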
-bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
- MachineInstr *DefMI, unsigned Reg,
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ unsigned Reg,
MachineRegisterInfo *MRI) const {
// Fold large immediates into add, sub, or, xor.
- unsigned DefOpc = DefMI->getOpcode();
+ unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
return false;
- if (!DefMI->getOperand(1).isImm())
+ if (!DefMI.getOperand(1).isImm())
// Could be t2MOVi32imm <ga:xx>
return false;
if (!MRI->hasOneNonDBGUse(Reg))
return false;
- const MCInstrDesc &DefMCID = DefMI->getDesc();
+ const MCInstrDesc &DefMCID = DefMI.getDesc();
if (DefMCID.hasOptionalDef()) {
unsigned NumOps = DefMCID.getNumOperands();
- const MachineOperand &MO = DefMI->getOperand(NumOps-1);
+ const MachineOperand &MO = DefMI.getOperand(NumOps - 1);
if (MO.getReg() == ARM::CPSR && !MO.isDead())
// If DefMI defines CPSR and it is not dead, it's obviously not safe
// to delete DefMI.
return false;
}
- const MCInstrDesc &UseMCID = UseMI->getDesc();
+ const MCInstrDesc &UseMCID = UseMI.getDesc();
if (UseMCID.hasOptionalDef()) {
unsigned NumOps = UseMCID.getNumOperands();
- if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR)
+ if (UseMI.getOperand(NumOps - 1).getReg() == ARM::CPSR)
// If the instruction sets the flag, do not attempt this optimization
// since it may change the semantics of the code.
return false;
}
- unsigned UseOpc = UseMI->getOpcode();
+ unsigned UseOpc = UseMI.getOpcode();
unsigned NewUseOpc = 0;
- uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm();
+ uint32_t ImmVal = (uint32_t)DefMI.getOperand(1).getImm();
uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
bool Commute = false;
switch (UseOpc) {
@@ -2681,17 +2684,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
case ARM::t2ADDrr:
case ARM::t2ORRrr:
case ARM::t2EORrr: {
- Commute = UseMI->getOperand(2).getReg() != Reg;
+ Commute = UseMI.getOperand(2).getReg() != Reg;
switch (UseOpc) {
default: break;
+ case ARM::ADDrr:
case ARM::SUBrr: {
- if (Commute)
+ if (UseOpc == ARM::SUBrr && Commute)
+ return false;
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isSOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::ADDri : ARM::SUBri;
+ else if (ARM_AM::isSOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal;
+ NewUseOpc = UseOpc == ARM::ADDrr ? ARM::SUBri : ARM::ADDri;
+ } else
return false;
- ImmVal = -ImmVal;
- NewUseOpc = ARM::SUBri;
- // Fallthrough
+ SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ break;
}
- case ARM::ADDrr:
case ARM::ORRrr:
case ARM::EORrr: {
if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
@@ -2700,20 +2713,29 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
switch (UseOpc) {
default: break;
- case ARM::ADDrr: NewUseOpc = ARM::ADDri; break;
case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
case ARM::EORrr: NewUseOpc = ARM::EORri; break;
}
break;
}
+ case ARM::t2ADDrr:
case ARM::t2SUBrr: {
- if (Commute)
+ if (UseOpc == ARM::t2SUBrr && Commute)
return false;
- ImmVal = -ImmVal;
- NewUseOpc = ARM::t2SUBri;
- // Fallthrough
+
+ // ADD/SUB are special because they're essentially the same operation, so
+ // we can handle a larger range of immediates.
+ if (ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri;
+ else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) {
+ ImmVal = -ImmVal;
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri;
+ } else
+ return false;
+ SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+ break;
}
- case ARM::t2ADDrr:
case ARM::t2ORRrr:
case ARM::t2EORrr: {
if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
@@ -2722,7 +2744,6 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
switch (UseOpc) {
default: break;
- case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break;
case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
}
@@ -2733,27 +2754,27 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
}
unsigned OpIdx = Commute ? 2 : 1;
- unsigned Reg1 = UseMI->getOperand(OpIdx).getReg();
- bool isKill = UseMI->getOperand(OpIdx).isKill();
+ unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+ bool isKill = UseMI.getOperand(OpIdx).isKill();
unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
- AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(),
- UseMI, UseMI->getDebugLoc(),
- get(NewUseOpc), NewReg)
- .addReg(Reg1, getKillRegState(isKill))
- .addImm(SOImmValV1)));
- UseMI->setDesc(get(NewUseOpc));
- UseMI->getOperand(1).setReg(NewReg);
- UseMI->getOperand(1).setIsKill();
- UseMI->getOperand(2).ChangeToImmediate(SOImmValV2);
- DefMI->eraseFromParent();
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(NewUseOpc), NewReg)
+ .addReg(Reg1, getKillRegState(isKill))
+ .addImm(SOImmValV1)));
+ UseMI.setDesc(get(NewUseOpc));
+ UseMI.getOperand(1).setReg(NewReg);
+ UseMI.getOperand(1).setIsKill();
+ UseMI.getOperand(2).ChangeToImmediate(SOImmValV2);
+ DefMI.eraseFromParent();
return true;
}
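The ADD/SUB folding above hinges on ARM's so_imm encoding: an 8-bit value rotated right by an even amount, with "two-part" values being the sum of two such immediates (tried on ImmVal and, failing that, on -ImmVal with the opcode flipped). A standalone, simplified sketch of that check, assuming nothing beyond the C++ standard library and deliberately more conservative than the real ARM_AM helpers:

#include <cstdint>
#include <cstdio>

static uint32_t rotl32(uint32_t V, unsigned R) {
  R &= 31;
  return R ? (V << R) | (V >> (32 - R)) : V;
}

// A value is a valid so_imm if some even right-rotation of an 8-bit
// constant produces it, i.e. some even left-rotation of it fits in 8 bits.
static bool isSOImm(uint32_t V) {
  for (unsigned R = 0; R < 32; R += 2)
    if (rotl32(V, R) <= 0xFFu)
      return true;
  return false;
}

// Greedy two-part split: peel off an 8-bit chunk starting at an even bit
// position and check both halves. When it succeeds, Lo + Hi == V, so the
// fold can emit "op rT, rN, #Lo" followed by "op rD, rT, #Hi".
static bool splitSOImmTwoPart(uint32_t V, uint32_t &Lo, uint32_t &Hi) {
  if (isSOImm(V))
    return false; // a single immediate already suffices
  unsigned Shift = 0;
  while (!(V & (1u << Shift)))
    ++Shift;
  Shift &= ~1u;
  Lo = V & (0xFFu << Shift);
  Hi = V & ~(0xFFu << Shift);
  return isSOImm(Lo) && isSOImm(Hi);
}

int main() {
  uint32_t Lo, Hi;
  if (splitSOImmTwoPart(0x00FF00FFu, Lo, Hi))
    std::printf("0x00FF00FF = 0x%08X + 0x%08X\n", Lo, Hi);
  return 0;
}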
static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
- const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+ const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: {
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
assert(UOps >= 0 && "bad # UOps");
return UOps;
@@ -2763,7 +2784,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRBrs:
case ARM::STRrs:
case ARM::STRBrs: {
- unsigned ShOpVal = MI->getOperand(3).getImm();
+ unsigned ShOpVal = MI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2776,10 +2797,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH:
case ARM::STRH: {
- if (!MI->getOperand(2).getReg())
+ if (!MI.getOperand(2).getReg())
return 1;
- unsigned ShOpVal = MI->getOperand(3).getImm();
+ unsigned ShOpVal = MI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2792,22 +2813,22 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB:
case ARM::LDRSH:
- return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+ return (ARM_AM::getAM3Op(MI.getOperand(3).getImm()) == ARM_AM::sub) ? 3 : 2;
case ARM::LDRSB_POST:
case ARM::LDRSH_POST: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 4 : 3;
}
case ARM::LDR_PRE_REG:
case ARM::LDRB_PRE_REG: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rt == Rm)
return 3;
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2820,7 +2841,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::STR_PRE_REG:
case ARM::STRB_PRE_REG: {
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2833,21 +2854,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH_PRE:
case ARM::STRH_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (!Rm)
return 2;
if (Rt == Rm)
return 3;
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
- ? 3 : 2;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 3 : 2;
}
case ARM::LDR_POST_REG:
case ARM::LDRB_POST_REG:
case ARM::LDRH_POST: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 3 : 2;
}
@@ -2866,13 +2886,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_PRE:
case ARM::LDRSH_PRE: {
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm == 0)
return 3;
- unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
if (Rt == Rm)
return 4;
- unsigned ShOpVal = MI->getOperand(4).getImm();
+ unsigned ShOpVal = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -2884,18 +2904,20 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::LDRD: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(2).getReg();
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
return (Rt == Rn) ? 3 : 2;
}
case ARM::STRD: {
- unsigned Rm = MI->getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(3).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+ : 3;
return 2;
}
@@ -2908,24 +2930,26 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 4;
case ARM::LDRD_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(3).getReg();
- unsigned Rm = MI->getOperand(4).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
+ unsigned Rm = MI.getOperand(4).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
return (Rt == Rn) ? 4 : 3;
}
case ARM::t2LDRD_PRE: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(3).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(3).getReg();
return (Rt == Rn) ? 4 : 3;
}
case ARM::STRD_PRE: {
- unsigned Rm = MI->getOperand(4).getReg();
+ unsigned Rm = MI.getOperand(4).getReg();
if (Rm)
- return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+ : 4;
return 3;
}
@@ -2953,8 +2977,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 2;
case ARM::t2LDRDi8: {
- unsigned Rt = MI->getOperand(0).getReg();
- unsigned Rn = MI->getOperand(2).getReg();
+ unsigned Rt = MI.getOperand(0).getReg();
+ unsigned Rn = MI.getOperand(2).getReg();
return (Rt == Rn) ? 3 : 2;
}
@@ -2994,22 +3018,61 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
// sizes during MC lowering. That target hook should be local to MC lowering
// because we can't ensure that it is aware of other MI forms. Doing this will
// ensure that MachineMemOperands are correctly propagated through all passes.
-unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
unsigned Size = 0;
- for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
- E = MI->memoperands_end(); I != E; ++I) {
+ for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
+ E = MI.memoperands_end();
+ I != E; ++I) {
Size += (*I)->getSize();
}
return Size / 4;
}
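getNumLDMAddresses above simply sums the bytes covered by the instruction's memory operands and divides by four, so an LDM described by a single 12-byte memoperand counts as three 32-bit addresses. A minimal standalone sketch of that arithmetic (names are illustrative, not LLVM's):

#include <cstdio>
#include <vector>

// Total memory-operand size in bytes, divided by the 4-byte word size,
// gives the number of addresses an LDM/STM touches.
static unsigned numLDMAddresses(const std::vector<unsigned> &MemOpSizesInBytes) {
  unsigned Size = 0;
  for (unsigned S : MemOpSizesInBytes)
    Size += S;
  return Size / 4;
}

int main() {
  // e.g. an LDMIA of {r4, r5, r6} typically carries one 12-byte memoperand.
  std::printf("%u addresses\n", numLDMAddresses({12})); // prints 3
  return 0;
}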
-unsigned
-ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const {
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+ unsigned NumRegs) {
+ unsigned UOps = 1 + NumRegs; // 1 for address computation.
+ switch (Opc) {
+ default:
+ break;
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ ++UOps; // One for base register writeback.
+ break;
+ case ARM::LDMIA_RET:
+ case ARM::tPOP_RET:
+ case ARM::t2LDMIA_RET:
+ UOps += 2; // One for base reg wb, one for write to pc.
+ break;
+ }
+ return UOps;
+}
+
+unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const {
if (!ItinData || ItinData->isEmpty())
return 1;
- const MCInstrDesc &Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI.getDesc();
unsigned Class = Desc.getSchedClass();
int ItinUOps = ItinData->getNumMicroOps(Class);
if (ItinUOps >= 0) {
@@ -3019,7 +3082,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
return ItinUOps;
}
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
llvm_unreachable("Unexpected multi-uops instruction!");
@@ -3049,7 +3112,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::VSTMSIA:
case ARM::VSTMSIA_UPD:
case ARM::VSTMSDB_UPD: {
- unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands();
return (NumRegs / 2) + (NumRegs % 2) + 1;
}
@@ -3085,66 +3148,36 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::t2STMDB:
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
- unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
- if (Subtarget.isSwift()) {
- int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
- switch (Opc) {
- default: break;
- case ARM::VLDMDIA_UPD:
- case ARM::VLDMDDB_UPD:
- case ARM::VLDMSIA_UPD:
- case ARM::VLDMSDB_UPD:
- case ARM::VSTMDIA_UPD:
- case ARM::VSTMDDB_UPD:
- case ARM::VSTMSIA_UPD:
- case ARM::VSTMSDB_UPD:
- case ARM::LDMIA_UPD:
- case ARM::LDMDA_UPD:
- case ARM::LDMDB_UPD:
- case ARM::LDMIB_UPD:
- case ARM::STMIA_UPD:
- case ARM::STMDA_UPD:
- case ARM::STMDB_UPD:
- case ARM::STMIB_UPD:
- case ARM::tLDMIA_UPD:
- case ARM::tSTMIA_UPD:
- case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB_UPD:
- case ARM::t2STMIA_UPD:
- case ARM::t2STMDB_UPD:
- ++UOps; // One for base register writeback.
- break;
- case ARM::LDMIA_RET:
- case ARM::tPOP_RET:
- case ARM::t2LDMIA_RET:
- UOps += 2; // One for base reg wb, one for write to pc.
- break;
- }
- return UOps;
- } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+ unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands() + 1;
+ switch (Subtarget.getLdStMultipleTiming()) {
+ case ARMSubtarget::SingleIssuePlusExtras:
+ return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+ case ARMSubtarget::SingleIssue:
+ // Assume the worst.
+ return NumRegs;
+ case ARMSubtarget::DoubleIssue: {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
// 5 registers would be issued: 2, 2, 1.
- int A8UOps = (NumRegs / 2);
+ unsigned UOps = (NumRegs / 2);
if (NumRegs % 2)
- ++A8UOps;
- return A8UOps;
- } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
- int A9UOps = (NumRegs / 2);
+ ++UOps;
+ return UOps;
+ }
+ case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+ unsigned UOps = (NumRegs / 2);
      // If there is an odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
- if ((NumRegs % 2) ||
- !MI->hasOneMemOperand() ||
- (*MI->memoperands_begin())->getAlignment() < 8)
- ++A9UOps;
- return A9UOps;
- } else {
- // Assume the worst.
- return NumRegs;
+ if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
+ (*MI.memoperands_begin())->getAlignment() < 8)
+ ++UOps;
+ return UOps;
+ }
}
}
}
+ llvm_unreachable("Didn't find the number of microops");
}
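The rewritten block above replaces the per-CPU checks with a subtarget-reported timing model (ARMSubtarget::getLdStMultipleTiming) for load/store-multiple instructions. A standalone sketch of the four cases, in plain C++, with the writeback and pop-return extras folded into two boolean parameters instead of an opcode switch:

#include <cstdio>

enum class LdStMultipleTiming {
  SingleIssuePlusExtras,          // one uop per register plus address/writeback uops
  SingleIssue,                    // assume the worst: one uop per register
  DoubleIssue,                    // two registers per issue slot
  DoubleIssueCheckUnalignedAccess // double issue, extra AGU cycle when odd/unaligned
};

static unsigned ldStMultipleUOps(LdStMultipleTiming Timing, unsigned NumRegs,
                                 bool BaseWriteback, bool WritesPC,
                                 bool Aligned64) {
  switch (Timing) {
  case LdStMultipleTiming::SingleIssuePlusExtras: {
    unsigned UOps = 1 + NumRegs; // one for address computation
    if (BaseWriteback)
      ++UOps;                    // one for base register writeback
    if (WritesPC)
      ++UOps;                    // one more for the write to pc (pop-return forms)
    return UOps;
  }
  case LdStMultipleTiming::SingleIssue:
    return NumRegs;
  case LdStMultipleTiming::DoubleIssue:
    return NumRegs < 4 ? 2 : NumRegs / 2 + NumRegs % 2;
  case LdStMultipleTiming::DoubleIssueCheckUnalignedAccess:
    return NumRegs / 2 + ((NumRegs % 2 || !Aligned64) ? 1 : 0);
  }
  return NumRegs;
}

int main() {
  // e.g. "ldmia r0!, {r1-r5}": five registers with base writeback.
  std::printf("double issue: %u uops\n",
              ldStMultipleUOps(LdStMultipleTiming::DoubleIssue, 5, true, false,
                               true)); // prints 3
  return 0;
}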
int
@@ -3428,13 +3461,13 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
}
static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
- const MachineInstr *MI, unsigned Reg,
+ const MachineInstr &MI, unsigned Reg,
unsigned &UseIdx, unsigned &Dist) {
Dist = 0;
- MachineBasicBlock::const_instr_iterator II = ++MI->getIterator();
+ MachineBasicBlock::const_instr_iterator II = ++MI.getIterator();
assert(II->isInsideBundle() && "Empty bundle?");
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
// FIXME: This doesn't properly handle multiple uses.
int Idx = -1;
@@ -3460,17 +3493,17 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
/// itinerary based on the def opcode and alignment. The caller will ensure that
/// adjusted latency is at least one cycle.
static int adjustDefLatency(const ARMSubtarget &Subtarget,
- const MachineInstr *DefMI,
- const MCInstrDesc *DefMCID, unsigned DefAlign) {
+ const MachineInstr &DefMI,
+ const MCInstrDesc &DefMCID, unsigned DefAlign) {
int Adjust = 0;
if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
- switch (DefMCID->getOpcode()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (ShImm == 0 ||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
@@ -3482,7 +3515,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
case ARM::t2LDRHs:
case ARM::t2LDRSHs: {
// Thumb2 mode: lsl only.
- unsigned ShAmt = DefMI->getOperand(3).getImm();
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
if (ShAmt == 0 || ShAmt == 2)
--Adjust;
break;
@@ -3491,11 +3524,11 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
} else if (Subtarget.isSwift()) {
// FIXME: Properly handle all of the latency adjustments for address
// writeback.
- switch (DefMCID->getOpcode()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ unsigned ShOpVal = DefMI.getOperand(3).getImm();
bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (!isSub &&
@@ -3513,7 +3546,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
case ARM::t2LDRHs:
case ARM::t2LDRSHs: {
// Thumb2 mode: lsl only.
- unsigned ShAmt = DefMI->getOperand(3).getImm();
+ unsigned ShAmt = DefMI.getOperand(3).getImm();
if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
Adjust -= 2;
break;
@@ -3521,8 +3554,8 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9()) {
- switch (DefMCID->getOpcode()) {
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
+ switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
case ARM::VLD1q16:
@@ -3637,53 +3670,55 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
return Adjust;
}
-
-
-int
-ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
- unsigned UseIdx) const {
+int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
// No operand latency. The caller may fall back to getInstrLatency.
if (!ItinData || ItinData->isEmpty())
return -1;
- const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
unsigned Reg = DefMO.getReg();
- const MCInstrDesc *DefMCID = &DefMI->getDesc();
- const MCInstrDesc *UseMCID = &UseMI->getDesc();
+ const MachineInstr *ResolvedDefMI = &DefMI;
unsigned DefAdj = 0;
- if (DefMI->isBundle()) {
- DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
- DefMCID = &DefMI->getDesc();
- }
- if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
- DefMI->isRegSequence() || DefMI->isImplicitDef()) {
+ if (DefMI.isBundle())
+ ResolvedDefMI =
+ getBundledDefMI(&getRegisterInfo(), &DefMI, Reg, DefIdx, DefAdj);
+ if (ResolvedDefMI->isCopyLike() || ResolvedDefMI->isInsertSubreg() ||
+ ResolvedDefMI->isRegSequence() || ResolvedDefMI->isImplicitDef()) {
return 1;
}
+ const MachineInstr *ResolvedUseMI = &UseMI;
unsigned UseAdj = 0;
- if (UseMI->isBundle()) {
- unsigned NewUseIdx;
- const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
- Reg, NewUseIdx, UseAdj);
- if (!NewUseMI)
+ if (UseMI.isBundle()) {
+ ResolvedUseMI =
+ getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj);
+ if (!ResolvedUseMI)
return -1;
-
- UseMI = NewUseMI;
- UseIdx = NewUseIdx;
- UseMCID = &UseMI->getDesc();
}
+ return getOperandLatencyImpl(
+ ItinData, *ResolvedDefMI, DefIdx, ResolvedDefMI->getDesc(), DefAdj, DefMO,
+ Reg, *ResolvedUseMI, UseIdx, ResolvedUseMI->getDesc(), UseAdj);
+}
+
+int ARMBaseInstrInfo::getOperandLatencyImpl(
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+ unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
+ const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
+ unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const {
if (Reg == ARM::CPSR) {
- if (DefMI->getOpcode() == ARM::FMSTAT) {
+ if (DefMI.getOpcode() == ARM::FMSTAT) {
// fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
return Subtarget.isLikeA9() ? 1 : 20;
}
// CPSR set and branch can be paired in the same cycle.
- if (UseMI->isBranch())
+ if (UseMI.isBranch())
return 0;
// Otherwise it takes the instruction latency (generally one).
@@ -3694,7 +3729,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// incur a code size penalty (not able to use the CPSR setting 16-bit
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
- const MachineFunction *MF = DefMI->getParent()->getParent();
+ const MachineFunction *MF = DefMI.getParent()->getParent();
// FIXME: Use Function::optForSize().
if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
--Latency;
@@ -3702,17 +3737,19 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
- if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit())
+ if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit())
return -1;
- unsigned DefAlign = DefMI->hasOneMemOperand()
- ? (*DefMI->memoperands_begin())->getAlignment() : 0;
- unsigned UseAlign = UseMI->hasOneMemOperand()
- ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+ unsigned DefAlign = DefMI.hasOneMemOperand()
+ ? (*DefMI.memoperands_begin())->getAlignment()
+ : 0;
+ unsigned UseAlign = UseMI.hasOneMemOperand()
+ ? (*UseMI.memoperands_begin())->getAlignment()
+ : 0;
// Get the itinerary's latency if possible, and handle variable_ops.
- int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
- *UseMCID, UseIdx, UseAlign);
+ int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID,
+ UseIdx, UseAlign);
// Unable to find operand latency. The caller may resort to getInstrLatency.
if (Latency < 0)
return Latency;
@@ -3746,10 +3783,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isLikeA9() || Subtarget.isSwift())
- return Latency <= 2 ? 1 : Latency - 1;
- else
- return Latency <= 3 ? 1 : Latency - 2;
+ int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+ int Threshold = 1 + Adj;
+ return Latency <= Threshold ? 1 : Latency - Adj;
}
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
@@ -3820,7 +3856,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9())
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -3946,15 +3982,15 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
-unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
- if (MI->isCopyLike() || MI->isInsertSubreg() ||
- MI->isRegSequence() || MI->isImplicitDef())
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
+ if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+ MI.isImplicitDef())
return 0;
- if (MI->isBundle())
+ if (MI.isBundle())
return 0;
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
// When predicated, CPSR is an additional source operand for CPSR updating
@@ -3965,26 +4001,26 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
}
unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost) const {
- if (MI->isCopyLike() || MI->isInsertSubreg() ||
- MI->isRegSequence() || MI->isImplicitDef())
+ if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+ MI.isImplicitDef())
return 1;
  // An instruction scheduler typically runs on unbundled instructions; however,
  // other passes may query the latency of a bundled instruction.
- if (MI->isBundle()) {
+ if (MI.isBundle()) {
unsigned Latency = 0;
- MachineBasicBlock::const_instr_iterator I = MI->getIterator();
- MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
if (I->getOpcode() != ARM::t2IT)
- Latency += getInstrLatency(ItinData, &*I, PredCost);
+ Latency += getInstrLatency(ItinData, *I, PredCost);
}
return Latency;
}
- const MCInstrDesc &MCID = MI->getDesc();
+ const MCInstrDesc &MCID = MI.getDesc();
if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
@@ -3993,7 +4029,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// Be sure to call getStageLatency for an empty itinerary in case it has a
// valid MinLatency property.
if (!ItinData)
- return MI->mayLoad() ? 3 : 1;
+ return MI.mayLoad() ? 3 : 1;
unsigned Class = MCID.getSchedClass();
@@ -4005,9 +4041,9 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
unsigned Latency = ItinData->getStageLatency(Class);
// Adjust for dynamic def-side opcode variants not captured by the itinerary.
- unsigned DefAlign = MI->hasOneMemOperand()
- ? (*MI->memoperands_begin())->getAlignment() : 0;
- int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+ unsigned DefAlign =
+ MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0;
+ int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign);
if (Adj >= 0 || (int)Latency > -Adj) {
return Latency + Adj;
}
@@ -4032,46 +4068,46 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
}
-bool ARMBaseInstrInfo::
-hasHighOperandLatency(const TargetSchedModel &SchedModel,
- const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
- unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
- if (Subtarget.isCortexA8() &&
+bool ARMBaseInstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
+ unsigned UDomain = UseMI.getDesc().TSFlags & ARMII::DomainMask;
+ if (Subtarget.nonpipelinedVFP() &&
(DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
- // CortexA8 VFP instructions are not pipelined.
return true;
// Hoist VFP / NEON instructions with 4 or higher latency.
- unsigned Latency
- = SchedModel.computeOperandLatency(DefMI, DefIdx, UseMI, UseIdx);
+ unsigned Latency =
+ SchedModel.computeOperandLatency(&DefMI, DefIdx, &UseMI, UseIdx);
if (Latency <= 3)
return false;
return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
}
-bool ARMBaseInstrInfo::
-hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI, unsigned DefIdx) const {
+bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const {
const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
if (!ItinData || ItinData->isEmpty())
return false;
- unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+ unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
if (DDomain == ARMII::DomainGeneral) {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
return (DefCycle != -1 && DefCycle <= 2);
}
return false;
}
-bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
- if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+ if (convertAddSubFlagsOpcode(MI.getOpcode())) {
ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
return false;
}
@@ -4082,8 +4118,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
// sequence is needed for other targets.
void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
unsigned LoadImmOpc,
- unsigned LoadOpc,
- Reloc::Model RM) const {
+ unsigned LoadOpc) const {
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
unsigned Reg = MI->getOperand(0).getReg();
@@ -4094,12 +4129,12 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
.addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
- if (Subtarget.GVIsIndirectSymbol(GV, RM)) {
+ if (Subtarget.isGVIndirectSymbol(GV)) {
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
- unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
AddDefaultPred(MIB);
}
@@ -4146,24 +4181,24 @@ enum ARMExeDomain {
// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
//
std::pair<uint16_t, uint16_t>
-ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const {
// If we don't have access to NEON instructions then we won't be able
// to swizzle anything to the NEON domain. Check to make sure.
if (Subtarget.hasNEON()) {
// VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON
// if they are not predicated.
- if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+ if (MI.getOpcode() == ARM::VMOVD && !isPredicated(MI))
return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
// CortexA9 is particularly picky about mixing the two and wants these
// converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) &&
- (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR ||
- MI->getOpcode() == ARM::VMOVS))
+ if (Subtarget.useNEONForFPMovs() && !isPredicated(MI) &&
+ (MI.getOpcode() == ARM::VMOVRS || MI.getOpcode() == ARM::VMOVSR ||
+ MI.getOpcode() == ARM::VMOVS))
return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
}
// No other instructions can be swizzled, so just determine their domain.
- unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+ unsigned Domain = MI.getDesc().TSFlags & ARMII::DomainMask;
if (Domain & ARMII::DomainNEON)
return std::make_pair(ExeNEON, 0);
@@ -4210,12 +4245,11 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
/// (including the case where the DPR itself is defined), it should not.
///
static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
- MachineInstr *MI,
- unsigned DReg, unsigned Lane,
- unsigned &ImplicitSReg) {
+ MachineInstr &MI, unsigned DReg,
+ unsigned Lane, unsigned &ImplicitSReg) {
// If the DPR is defined or used already, the other SPR lane will be chained
// correctly, so there is nothing to be done.
- if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+ if (MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI)) {
ImplicitSReg = 0;
return true;
}
@@ -4224,7 +4258,7 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
ImplicitSReg = TRI->getSubReg(DReg,
(Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
MachineBasicBlock::LivenessQueryResult LQR =
- MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+ MI.getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
if (LQR == MachineBasicBlock::LQR_Live)
return true;
@@ -4237,106 +4271,105 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
return true;
}
-void
-ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
+ unsigned Domain) const {
unsigned DstReg, SrcReg, DReg;
unsigned Lane;
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
const TargetRegisterInfo *TRI = &getRegisterInfo();
- switch (MI->getOpcode()) {
- default:
- llvm_unreachable("cannot handle opcode!");
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("cannot handle opcode!");
+ break;
+ case ARM::VMOVD:
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVD:
- if (Domain != ExeNEON)
- break;
- // Zap the predicate operands.
- assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+ // Zap the predicate operands.
+ assert(!isPredicated(MI) && "Cannot predicate a VORRd");
- // Make sure we've got NEON instructions.
- assert(Subtarget.hasNEON() && "VORRd requires NEON");
+ // Make sure we've got NEON instructions.
+ assert(Subtarget.hasNEON() && "VORRd requires NEON");
- // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
- MI->setDesc(get(ARM::VORRd));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(SrcReg)
- .addReg(SrcReg));
+ // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+ MI.setDesc(get(ARM::VORRd));
+ AddDefaultPred(
+ MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+ break;
+ case ARM::VMOVRS:
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVRS:
- if (Domain != ExeNEON)
- break;
- assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+ assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
- // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+ DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
- // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
- // Note that DSrc has been widened and the other lane may be undef, which
- // contaminates the entire register.
- MI->setDesc(get(ARM::VGETLNi32));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(DReg, RegState::Undef)
- .addImm(Lane));
+ // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+ // Note that DSrc has been widened and the other lane may be undef, which
+ // contaminates the entire register.
+ MI.setDesc(get(ARM::VGETLNi32));
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane));
- // The old source should be an implicit use, otherwise we might think it
- // was dead before here.
- MIB.addReg(SrcReg, RegState::Implicit);
+ // The old source should be an implicit use, otherwise we might think it
+ // was dead before here.
+ MIB.addReg(SrcReg, RegState::Implicit);
+ break;
+ case ARM::VMOVSR: {
+ if (Domain != ExeNEON)
break;
- case ARM::VMOVSR: {
- if (Domain != ExeNEON)
- break;
- assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+ assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
- // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
- DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+ DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
- unsigned ImplicitSReg;
- if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break;
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
- // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
- // Again DDst may be undefined at the beginning of this instruction.
- MI->setDesc(get(ARM::VSETLNi32));
- MIB.addReg(DReg, RegState::Define)
- .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
- .addReg(SrcReg)
- .addImm(Lane);
- AddDefaultPred(MIB);
+ // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+ // Again DDst may be undefined at the beginning of this instruction.
+ MI.setDesc(get(ARM::VSETLNi32));
+ MIB.addReg(DReg, RegState::Define)
+ .addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
+ .addReg(SrcReg)
+ .addImm(Lane);
+ AddDefaultPred(MIB);
- // The narrower destination must be marked as set to keep previous chains
- // in place.
- MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
- if (ImplicitSReg != 0)
- MIB.addReg(ImplicitSReg, RegState::Implicit);
- break;
+ // The narrower destination must be marked as set to keep previous chains
+ // in place.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
}
case ARM::VMOVS: {
if (Domain != ExeNEON)
break;
// Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
@@ -4346,16 +4379,16 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
break;
- for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
- MI->RemoveOperand(i-1);
+ for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+ MI.RemoveOperand(i - 1);
if (DSrc == DDst) {
// Destination can be:
// %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
- MI->setDesc(get(ARM::VDUPLN32d));
+ MI.setDesc(get(ARM::VDUPLN32d));
MIB.addReg(DDst, RegState::Define)
- .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
- .addImm(SrcLane);
+ .addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
+ .addImm(SrcLane);
AddDefaultPred(MIB);
// Neither the source nor the destination are naturally represented any
@@ -4380,18 +4413,18 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// Pattern of the MachineInstrs is:
// %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
MachineInstrBuilder NewMIB;
- NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(ARM::VEXTd32), DDst);
+ NewMIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::VEXTd32),
+ DDst);
// On the first instruction, both DSrc and DDst may be <undef> if present.
// Specifically when the original instruction didn't have them as an
// <imp-use>.
unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
- bool CurUndef = !MI->readsRegister(CurReg, TRI);
+ bool CurUndef = !MI.readsRegister(CurReg, TRI);
NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
- CurUndef = !MI->readsRegister(CurReg, TRI);
+ CurUndef = !MI.readsRegister(CurReg, TRI);
NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
NewMIB.addImm(1);
@@ -4400,17 +4433,17 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
if (SrcLane == DstLane)
NewMIB.addReg(SrcReg, RegState::Implicit);
- MI->setDesc(get(ARM::VEXTd32));
+ MI.setDesc(get(ARM::VEXTd32));
MIB.addReg(DDst, RegState::Define);
// On the second instruction, DDst has definitely been defined above, so
// it is not <undef>. DSrc, if present, can be <undef> as above.
CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
- CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
MIB.addReg(CurReg, getUndefRegState(CurUndef));
CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
- CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
MIB.addReg(CurReg, getUndefRegState(CurUndef));
MIB.addImm(1);
@@ -4446,24 +4479,23 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
//
// FCONSTD can be used as a dependency-breaking instruction.
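// Returns the preferred clearance, in instructions, before an unwanted partial
// D-register update; returning zero means no dependency breaking is needed.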
-unsigned ARMBaseInstrInfo::
-getPartialRegUpdateClearance(const MachineInstr *MI,
- unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (!SwiftPartialUpdateClearance ||
- !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ auto PartialUpdateClearance = Subtarget.getPartialUpdateClearance();
+ if (!PartialUpdateClearance)
return 0;
assert(TRI && "Need TRI instance");
- const MachineOperand &MO = MI->getOperand(OpNum);
+ const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.readsReg())
return 0;
unsigned Reg = MO.getReg();
int UseOp = -1;
- switch(MI->getOpcode()) {
- // Normal instructions writing only an S-register.
+ switch (MI.getOpcode()) {
+ // Normal instructions writing only an S-register.
case ARM::VLDRS:
case ARM::FCONSTS:
case ARM::VMOVSR:
@@ -4472,7 +4504,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
case ARM::VMOVv2i32:
case ARM::VMOVv2f32:
case ARM::VMOVv1i64:
- UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+ UseOp = MI.findRegisterUseOperandIdx(Reg, false, TRI);
break;
// Explicitly reads the dependency.
@@ -4485,37 +4517,35 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
// If this instruction actually reads a value from Reg, there is no unwanted
// dependency.
- if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+ if (UseOp != -1 && MI.getOperand(UseOp).readsReg())
return 0;
// We must be able to clobber the whole D-reg.
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
// Virtual register must be a foo:ssub_0<def,undef> operand.
- if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+ if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
return 0;
} else if (ARM::SPRRegClass.contains(Reg)) {
// Physical register: MI must define the full D-reg.
unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
&ARM::DPRRegClass);
- if (!DReg || !MI->definesRegister(DReg, TRI))
+ if (!DReg || !MI.definesRegister(DReg, TRI))
return 0;
}
// MI has an unwanted D-register dependency.
// Avoid defs in the previous N instructions.
- return SwiftPartialUpdateClearance;
+ return PartialUpdateClearance;
}
// Break a partial register dependency after getPartialRegUpdateClearance
// returned non-zero.
-void ARMBaseInstrInfo::
-breakPartialRegDependency(MachineBasicBlock::iterator MI,
- unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+void ARMBaseInstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ assert(OpNum < MI.getDesc().getNumDefs() && "OpNum is not a def");
assert(TRI && "Need TRI instance");
- const MachineOperand &MO = MI->getOperand(OpNum);
+ const MachineOperand &MO = MI.getOperand(OpNum);
unsigned Reg = MO.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
"Can't break virtual register dependencies.");
@@ -4528,7 +4558,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
}
assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
- assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+ assert(MI.definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
// FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
// the full D-register by loading the same value to both lanes. The
@@ -4538,9 +4568,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
// Insert the dependency-breaking FCONSTD before MI.
// 96 is the encoding of 0.5, but the actual value doesn't matter here.
- AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(ARM::FCONSTD), DReg).addImm(96));
- MI->addRegisterKilled(DReg, TRI, true);
+ AddDefaultPred(
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+ .addImm(96));
+ MI.addRegisterKilled(DReg, TRI, true);
}
bool ARMBaseInstrInfo::hasNOP() const {
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index d80c49494c77..52b0ff17dea2 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -36,8 +36,7 @@ protected:
explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
- unsigned LoadImmOpc, unsigned LoadOpc,
- Reloc::Model RM) const;
+ unsigned LoadImmOpc, unsigned LoadOpc) const;
/// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
/// and \p DefIdx.
@@ -93,8 +92,7 @@ protected:
/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
- MachineInstr *commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const override;
@@ -107,7 +105,7 @@ public:
virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
@@ -122,49 +120,49 @@ public:
const ScheduleDAG *DAG) const override;
// Branch analysis.
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
// Predication support.
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
- ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+ ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
+ int PIdx = MI.findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
: ARMCC::AL;
}
- bool PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Pred) const override;
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr *MI,
+ bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
- virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+ virtual unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -175,7 +173,7 @@ public:
const ARMSubtarget &Subtarget) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -190,21 +188,21 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
+ const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
- MachineInstr *duplicate(MachineInstr *Orig,
+ MachineInstr *duplicate(MachineInstr &Orig,
MachineFunction &MF) const override;
const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
const TargetRegisterInfo *TRI) const;
- bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1,
+ bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
const MachineRegisterInfo *MRI) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
@@ -227,7 +225,7 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool isSchedulingBoundary(const MachineInstr *MI,
+ bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const override;
@@ -252,7 +250,7 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
@@ -260,30 +258,29 @@ public:
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
- bool analyzeSelect(const MachineInstr *MI,
- SmallVectorImpl<MachineOperand> &Cond,
- unsigned &TrueOp, unsigned &FalseOp,
- bool &Optimizable) const override;
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
- MachineInstr *optimizeSelect(MachineInstr *MI,
+ MachineInstr *optimizeSelect(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &SeenMIs,
bool) const override;
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
- bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const override;
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
unsigned getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const override;
+ const MachineInstr &MI) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
@@ -291,19 +288,20 @@ public:
/// VFP/NEON execution domains.
std::pair<uint16_t, uint16_t>
- getExecutionDomain(const MachineInstr *MI) const override;
- void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+ getExecutionDomain(const MachineInstr &MI) const override;
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
- unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
- const TargetRegisterInfo*) const override;
- void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr &, unsigned,
+ const TargetRegisterInfo *) const override;
+ void breakPartialRegDependency(MachineInstr &, unsigned,
const TargetRegisterInfo *TRI) const override;
/// Get the number of addresses by LDM or VLDM or zero for unknown.
- unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+ unsigned getNumLDMAddresses(const MachineInstr &MI) const;
private:
- unsigned getInstBundleLength(const MachineInstr *MI) const;
+ unsigned getInstBundleLength(const MachineInstr &MI) const;
int getVLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -327,10 +325,17 @@ private:
const MCInstrDesc &UseMCID,
unsigned UseIdx, unsigned UseAlign) const;
- unsigned getPredicationCost(const MachineInstr *MI) const override;
+ int getOperandLatencyImpl(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MCInstrDesc &DefMCID, unsigned DefAdj,
+ const MachineOperand &DefMO, unsigned Reg,
+ const MachineInstr &UseMI, unsigned UseIdx,
+ const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+
+ unsigned getPredicationCost(const MachineInstr &MI) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
int getInstrLatency(const InstrItineraryData *ItinData,
@@ -338,19 +343,18 @@ private:
bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
bool hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI,
+ const MachineInstr &DefMI,
unsigned DefIdx) const override;
/// verifyInstruction - Perform target specific instruction verification.
- bool verifyInstruction(const MachineInstr *MI,
+ bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
- virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const = 0;
+ virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0;
void expandMEMCPY(MachineBasicBlock::iterator) const;
@@ -447,7 +451,7 @@ static inline bool isPushOpcode(int Opc) {
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
unsigned getMatchingCondBranchOpcode(unsigned Opc);
@@ -466,21 +470,24 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
/// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
unsigned MIFlags = 0);
/// Tries to add registers to the reglist of a given base-updating
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index a5207705fc69..aa968efc37d4 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -49,12 +49,9 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo()
: ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
static unsigned getFramePointerReg(const ARMSubtarget &STI) {
- if (STI.isTargetMachO()) {
- if (STI.isTargetDarwin() || STI.isThumb1Only())
- return ARM::R7;
- else
- return ARM::R11;
- } else if (STI.isTargetWindows())
+ if (STI.isTargetMachO())
+ return ARM::R7;
+ else if (STI.isTargetWindows())
return ARM::R11;
else // ARM EABI
return STI.isThumb() ? ARM::R7 : ARM::R11;
@@ -63,8 +60,11 @@ static unsigned getFramePointerReg(const ARMSubtarget &STI) {
const MCPhysReg*
ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
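+ // When R7 is the frame pointer the prologue pushes the callee-saved registers
+ // in two blocks, so pick the save list whose order matches that split push
+ // (see CSR_AAPCS_SplitPush in ARMCallingConv.td).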
+ bool UseSplitPush = STI.splitFramePushPop();
const MCPhysReg *RegList =
- STI.isTargetDarwin() ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+ STI.isTargetDarwin()
+ ? CSR_iOS_SaveList
+ : (UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList);
const Function *F = MF->getFunction();
if (F->getCallingConv() == CallingConv::GHC) {
@@ -75,7 +75,7 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (STI.isMClass()) {
// M-class CPUs have hardware which saves the registers needed to allow a
// function conforming to the AAPCS to function as a handler.
- return CSR_AAPCS_SaveList;
+ return UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList;
} else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
// Fast interrupt mode gives the handler a private copy of R8-R14, so less
// need to be saved to restore user-mode state.
@@ -87,6 +87,10 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
}
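+ // swifterror lives in R6 (see ARMCallingConv.td), so functions that use it
+ // need a save list that does not treat R6 as callee-saved.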
+ if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+ F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_iOS_SwiftError_SaveList;
+
if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<ARMFunctionInfo>()->isSplitCSR()
? CSR_iOS_CXX_TLS_PE_SaveList
@@ -110,6 +114,11 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
return CSR_NoRegs_RegMask;
+
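+ // Likewise, drop R6 from the preserved-register mask when the call carries a
+ // swifterror value.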
+ if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_iOS_SwiftError_RegMask;
+
if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS)
return CSR_iOS_CXX_TLS_RegMask;
return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
@@ -167,9 +176,8 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(ARM::R9);
// Reserve D16-D31 if the subtarget doesn't support them.
if (!STI.hasVFP3() || STI.hasD16()) {
- assert(ARM::D31 == ARM::D16 + 15);
- for (unsigned i = 0; i != 16; ++i)
- Reserved.set(ARM::D16 + i);
+ static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!");
+ Reserved.set(ARM::D16, ARM::D31 + 1);
}
const TargetRegisterClass *RC = &ARM::GPRPairRegClass;
for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
@@ -400,13 +408,10 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
-void ARMBaseRegisterInfo::
-emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred,
- unsigned PredReg, unsigned MIFlags) const {
+void ARMBaseRegisterInfo::emitLoadConstPool(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 6a9a45a65687..1eee94857e05 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -166,12 +166,12 @@ public:
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- virtual void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg, unsigned SubIdx,
- int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
- unsigned MIFlags = MachineInstr::NoFlags)const;
+ virtual void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const;
/// Code Generation virtual methods...
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index a731d00883a1..71b819362404 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -211,7 +211,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
// First consume all registers that would give an unaligned object. Whether
// we go on stack or in regs, no-one will be using them in future.
- unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+ unsigned RegAlign = alignTo(Align, 4) / 4;
while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
State.AllocateReg(RegList[RegIdx++]);
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 847ef87c1b26..edb69581b9d3 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -23,6 +23,12 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -42,6 +48,12 @@ def RetCC_ARM_APCS : CallingConv<[
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
@@ -151,6 +163,12 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
@@ -161,6 +179,12 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
@@ -179,6 +203,12 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is passed in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
// HFAs are passed in a contiguous block of registers, or on the stack
CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
@@ -194,6 +224,12 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+ // A SwiftError is returned in R6.
+ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
@@ -210,6 +246,14 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
+// The order of callee-saved registers needs to match the order we actually push
+// them in FrameLowering, because this order is what's used by
+// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
+// pointer, we use this AAPCS alternative.
+def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+ R11, R10, R9, R8,
+ (sequence "D%u", 15, 8))>;
+
// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
// and the pointer return value are both passed in R0 in these cases, this can
// be partially modelled by treating R0 as a callee-saved register
@@ -222,6 +266,9 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
// Also save R7-R4 first to match the stack frame fixed spill areas.
def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+// R6 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
(sub CSR_AAPCS_ThisReturn, R9))>;
@@ -235,10 +282,11 @@ def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1),
(sequence "D%u", 31, 0))>;
// CSRs that are handled by prologue, epilogue.
-def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>;
+def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR, R12, R11, R7, R5, R4)>;
// CSRs that are handled explicitly via copies.
-def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>;
+def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS,
+ CSR_iOS_CXX_TLS_PE)>;
// The "interrupt" attribute is used to generate code that is acceptable in
// exception-handlers of various kinds. It makes us use a different return
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 55c1684028c2..8511f67dccd5 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -53,6 +53,11 @@ static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
cl::desc("Adjust basic block layout to better use TB[BH]"));
+static cl::opt<unsigned>
+CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
+ cl::desc("The max number of iterations for convergence"));
+
+
/// UnknownPadding - Return the worst case padding that could result from
/// unknown offset bits. This does not include alignment padding caused by
/// known offset bits.
@@ -274,6 +279,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "ARM constant island placement and branch shortening pass";
}
@@ -293,10 +303,10 @@ namespace {
unsigned getCombinedIndex(const MachineInstr *CPEMI);
int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
bool findAvailableWater(CPUser&U, unsigned UserOffset,
- water_iterator &WaterIter);
+ water_iterator &WaterIter, bool CloserWater);
void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
MachineBasicBlock *&NewMBB);
- bool handleConstantPoolUser(unsigned CPUserIndex);
+ bool handleConstantPoolUser(unsigned CPUserIndex, bool CloserWater);
void removeDeadCPEMI(MachineInstr *CPEMI);
bool removeUnusedCPEntries();
bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
@@ -456,8 +466,11 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
- CPChange |= handleConstantPoolUser(i);
- if (CPChange && ++NoCPIters > 30)
+ // For most inputs, it converges in no more than 5 iterations.
+ // If it doesn't converge within 10, the input may have a huge BB or many CPEs.
+ // In this case, we will try different heuristics.
+ CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
+ if (CPChange && ++NoCPIters > CPMaxIteration)
report_fatal_error("Constant Island pass failed to converge!");
DEBUG(dumpBBs());
@@ -478,10 +491,18 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MadeChange = true;
}
- // Shrink 32-bit Thumb2 branch, load, and store instructions.
+ // Shrink 32-bit Thumb2 load and store instructions.
if (isThumb2 && !STI->prefers32BitThumb())
MadeChange |= optimizeThumb2Instructions();
+ // Shrink 32-bit branch instructions.
+ if (isThumb && STI->hasV8MBaselineOps())
+ MadeChange |= optimizeThumb2Branches();
+
+ // Optimize jump tables using TBB / TBH.
+ if (isThumb2)
+ MadeChange |= optimizeThumb2JumpTables();
+
// After a while, this might be made debug-only, but it is not expensive.
verify();
@@ -654,7 +675,7 @@ bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
// have an unconditional branch for whatever reason.
MachineBasicBlock *TBB, *FBB;
SmallVector<MachineOperand, 4> Cond;
- bool TooDifficult = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond);
+ bool TooDifficult = TII->analyzeBranch(*MBB, TBB, FBB, Cond);
return TooDifficult || FBB == nullptr;
}
@@ -701,14 +722,10 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
/// information about the sizes of each block and the locations of all
/// the jump tables.
void ARMConstantIslands::scanFunctionJumpTables() {
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I)
- if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT)
- T2JumpTables.push_back(I);
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &I : MBB)
+ if (I.isBranch() && I.getOpcode() == ARM::t2BR_JT)
+ T2JumpTables.push_back(&I);
}
}
@@ -735,22 +752,18 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
+ for (MachineBasicBlock &MBB : *MF) {
// If this block doesn't fall through into the next MBB, then this is
// 'water' where a constant pool island could be placed.
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (I->isDebugValue())
+ for (MachineInstr &I : MBB) {
+ if (I.isDebugValue())
continue;
- unsigned Opc = I->getOpcode();
- if (I->isBranch()) {
+ unsigned Opc = I.getOpcode();
+ if (I.isBranch()) {
bool isCond = false;
unsigned Bits = 0;
unsigned Scale = 1;
@@ -759,7 +772,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
default:
continue; // Ignore other JT branches
case ARM::t2BR_JT:
- T2JumpTables.push_back(I);
+ T2JumpTables.push_back(&I);
continue; // Does not get an entry in ImmBranches
case ARM::Bcc:
isCond = true;
@@ -793,11 +806,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Record this immediate branch.
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
- ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+ ImmBranches.push_back(ImmBranch(&I, MaxOffs, isCond, UOpc));
}
if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
- PushPopMIs.push_back(I);
+ PushPopMIs.push_back(&I);
if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS ||
Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB ||
@@ -805,8 +818,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
continue;
// Scan the instructions for constant pool operands.
- for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
- if (I->getOperand(op).isCPI() || I->getOperand(op).isJTI()) {
+ for (unsigned op = 0, e = I.getNumOperands(); op != e; ++op)
+ if (I.getOperand(op).isCPI() || I.getOperand(op).isJTI()) {
// We found one. The addressing mode tells us the max displacement
// from the PC that this instruction permits.
@@ -865,15 +878,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
}
// Remember that this is a user of a CP entry.
- unsigned CPI = I->getOperand(op).getIndex();
- if (I->getOperand(op).isJTI()) {
+ unsigned CPI = I.getOperand(op).getIndex();
+ if (I.getOperand(op).isJTI()) {
JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
CPI = JumpTableEntryIndices[CPI];
}
MachineInstr *CPEMI = CPEMIs[CPI];
unsigned MaxOffs = ((1 << Bits)-1) * Scale;
- CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
+ CPUsers.push_back(CPUser(&I, CPEMI, MaxOffs, NegOk, IsSoImm));
// Increment corresponding CPEntry reference count.
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
@@ -896,15 +909,14 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
BBI.Unalign = 0;
BBI.PostAlign = 0;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
- ++I) {
+ for (MachineInstr &I : *MBB) {
BBI.Size += TII->GetInstSizeInBytes(I);
// For inline asm, GetInstSizeInBytes returns a conservative estimate.
// The actual size may be smaller, but still a multiple of the instr size.
- if (I->isInlineAsm())
+ if (I.isInlineAsm())
BBI.Unalign = isThumb ? 1 : 2;
// Also consider instructions that may be shrunk later.
- else if (isThumb && mayOptimizeThumb2Instruction(I))
+ else if (isThumb && mayOptimizeThumb2Instruction(&I))
BBI.Unalign = 1;
}
@@ -929,7 +941,7 @@ unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->GetInstSizeInBytes(I);
+ Offset += TII->GetInstSizeInBytes(*I);
}
return Offset;
}
@@ -1108,7 +1120,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+ Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
@@ -1285,11 +1297,27 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
/// move to a lower address, so search backward from the end of the list and
/// prefer the first water that is in range.
bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
- water_iterator &WaterIter) {
+ water_iterator &WaterIter,
+ bool CloserWater) {
if (WaterList.empty())
return false;
unsigned BestGrowth = ~0u;
+ // The nearest water without splitting the UserBB is right after it.
+ // If the distance is still large (we have a big BB), then we need to split it
+ // if we don't converge after a certain number of iterations. This helps the
+ // following situation to converge:
+ // BB0:
+ // Big BB
+ // BB1:
+ // Constant Pool
+ // When a CP access is out of range, BB0 may be used as water. However,
+ // inserting islands between BB0 and BB1 makes other accesses out of range.
+ MachineBasicBlock *UserBB = U.MI->getParent();
+ unsigned MinNoSplitDisp =
+ BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
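+ // When looking for closer water, give up if even the placement right after
+ // the user's block would need more than half of the maximum displacement;
+ // the caller will then create new water by splitting the block.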
+ if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
+ return false;
for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
--IP) {
MachineBasicBlock* WaterBB = *IP;
@@ -1301,6 +1329,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// should be relatively uncommon and when it does happen, we want to be
// sure to take advantage of it for all the CPEs near that block, so that
// we don't insert more branches than necessary.
+ // When CloserWater is true, we try to find the lowest address after (or
+ // equal to) the user MI's BB, regardless of padding growth.
unsigned Growth;
if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
(WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
@@ -1312,8 +1342,11 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
<< " Growth=" << Growth << '\n');
- // Keep looking unless it is perfect.
- if (BestGrowth == 0)
+ if (CloserWater && WaterBB == U.MI->getParent())
+ return true;
+ // Keep looking unless it is perfect and we're not looking for the lowest
+ // possible address.
+ if (!CloserWater && BestGrowth == 0)
return true;
}
if (IP == B)
@@ -1416,7 +1449,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// iterates at least once.
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
- UserOffset + TII->GetInstSizeInBytes(UserMI) + 1);
+ UserOffset + TII->GetInstSizeInBytes(*UserMI) + 1);
DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1426,11 +1459,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPUIndex = CPUserIndex+1;
unsigned NumCPUsers = CPUsers.size();
MachineInstr *LastIT = nullptr;
- for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+ for (unsigned Offset = UserOffset + TII->GetInstSizeInBytes(*UserMI);
Offset < BaseInsertOffset;
- Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+ Offset += TII->GetInstSizeInBytes(*MI), MI = std::next(MI)) {
assert(MI != UserMBB->end() && "Fell off end of block");
- if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == &*MI) {
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
// Shift insertion point by one unit of alignment so it is within reach.
@@ -1447,7 +1480,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Remember the last IT instruction.
if (MI->getOpcode() == ARM::t2IT)
- LastIT = MI;
+ LastIT = &*MI;
}
--MI;
@@ -1455,23 +1488,24 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Avoid splitting an IT block.
if (LastIT) {
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC != ARMCC::AL)
MI = LastIT;
}
// We really must not split an IT block.
DEBUG(unsigned PredReg;
- assert(!isThumb || getITInstrPredicate(MI, PredReg) == ARMCC::AL));
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
- NewMBB = splitBlockBeforeInstr(MI);
+ NewMBB = splitBlockBeforeInstr(&*MI);
}
/// handleConstantPoolUser - Analyze the specified user, checking to see if it
/// is out-of-range. If so, pick up the constant pool value and move it some
/// place in-range. Return true if we changed any addresses (thus must run
/// another pass of branch lengthening), false otherwise.
-bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
+ bool CloserWater) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
@@ -1494,7 +1528,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
MachineBasicBlock *NewMBB;
water_iterator IP;
- if (findAvailableWater(U, UserOffset, IP)) {
+ if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
DEBUG(dbgs() << "Found water in range\n");
MachineBasicBlock *WaterBB = *IP;
@@ -1584,7 +1618,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
CPEBB->setAlignment(0);
} else
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+ CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1728,7 +1762,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
splitBlockBeforeInstr(MI);
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
- int delta = TII->GetInstSizeInBytes(&MBB->back());
+ int delta = TII->GetInstSizeInBytes(MBB->back());
BBInfo[MBB->getNumber()].Size -= delta;
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
@@ -1744,18 +1778,18 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
.addMBB(NextBB).addImm(CC).addReg(CCReg);
Br.MI = &MBB->back();
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
if (isThumb)
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
.addImm(ARMCC::AL).addReg(0);
else
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
// Remove the old conditional branch. It may or may not still be in MBB.
- BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
adjustBBOffsetsAfter(MBB);
return true;
@@ -1852,8 +1886,6 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
}
}
- MadeChange |= optimizeThumb2Branches();
- MadeChange |= optimizeThumb2JumpTables();
return MadeChange;
}
@@ -1910,7 +1942,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
NewOpc = 0;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
NewOpc = ARM::tCBZ;
else if (Pred == ARMCC::NE)
@@ -1928,7 +1960,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
--CmpMI;
if (CmpMI->getOpcode() == ARM::tCMPi8) {
unsigned Reg = CmpMI->getOperand(0).getReg();
- Pred = getInstrPredicate(CmpMI, PredReg);
+ Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred == ARMCC::AL &&
CmpMI->getOperand(1).getImm() == 0 &&
isARMLowRegister(Reg)) {
@@ -2170,8 +2202,8 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
}
}
- unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
- unsigned OrigSize = TII->GetInstSizeInBytes(MI);
+ unsigned NewSize = TII->GetInstSizeInBytes(*NewJTMI);
+ unsigned OrigSize = TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
int Delta = OrigSize - NewSize + DeadSize;
@@ -2240,13 +2272,13 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
MachineFunction::iterator OldPrior = std::prev(BBi);
// If the block terminator isn't analyzable, don't try to move the block
- bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
+ bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond);
// If the block ends in an unconditional branch, move it. The prior block
// has to have an analyzable terminator for us to move this one. Be paranoid
// and make sure we're not trying to move the entry block of the function.
- if (!B && Cond.empty() && BB != MF->begin() &&
- !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+ if (!B && Cond.empty() && BB != &MF->front() &&
+ !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
BB->moveAfter(JTBB);
OldPrior->updateTerminator();
BB->updateTerminator();
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index c9849b2605ea..c0db001cb6f1 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -50,11 +50,18 @@ const char *ARMConstantPoolValue::getModifierText() const {
switch (Modifier) {
// FIXME: Are these case sensitive? It'd be nice to lower-case all the
// strings if that's legal.
- case ARMCP::no_modifier: return "none";
- case ARMCP::TLSGD: return "tlsgd";
- case ARMCP::GOT_PREL: return "GOT_PREL";
- case ARMCP::GOTTPOFF: return "gottpoff";
- case ARMCP::TPOFF: return "tpoff";
+ case ARMCP::no_modifier:
+ return "none";
+ case ARMCP::TLSGD:
+ return "tlsgd";
+ case ARMCP::GOT_PREL:
+ return "GOT_PREL";
+ case ARMCP::GOTTPOFF:
+ return "gottpoff";
+ case ARMCP::TPOFF:
+ return "tpoff";
+ case ARMCP::SECREL:
+ return "secrel32";
}
llvm_unreachable("Unknown modifier!");
}
@@ -74,9 +81,9 @@ bool
ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
if (ACPV->Kind == Kind &&
ACPV->PCAdjust == PCAdjust &&
- ACPV->Modifier == Modifier) {
- if (ACPV->LabelId == LabelId)
- return true;
+ ACPV->Modifier == Modifier &&
+ ACPV->LabelId == LabelId &&
+ ACPV->AddCurrentAddress == AddCurrentAddress) {
// Two PC relative constpool entries containing the same GV address or
// external symbols. FIXME: What about blockaddress?
if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
@@ -85,7 +92,7 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
return false;
}
-void ARMConstantPoolValue::dump() const {
+LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
errs() << " " << *this;
}
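The hasSameValue change tightens structural equality: two constant-pool values may only share an entry when the modifier, label id, and AddCurrentAddress flag all match in addition to the kind and PC adjustment. A minimal sketch of the same check, with a hypothetical struct standing in for ARMConstantPoolValue:

#include <iostream>

struct CPEntry {
  int Kind, PCAdjust, Modifier, LabelId;
  bool AddCurrentAddress;
  // Every identifying field must agree before two entries may be merged.
  bool hasSameValue(const CPEntry &O) const {
    return Kind == O.Kind && PCAdjust == O.PCAdjust &&
           Modifier == O.Modifier && LabelId == O.LabelId &&
           AddCurrentAddress == O.AddCurrentAddress;
  }
};

int main() {
  CPEntry A{0, 8, 0, 1, false}, B{0, 8, 0, 1, true};
  std::cout << std::boolalpha << A.hasSameValue(B) << "\n"; // false: flag differs
}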
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 6b18a4e52878..c07331d71dad 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -37,11 +37,12 @@ namespace ARMCP {
};
enum ARMCPModifier {
- no_modifier,
- TLSGD,
- GOT_PREL,
- GOTTPOFF,
- TPOFF
+ no_modifier, /// None
+ TLSGD, /// Thread Local Storage (General Dynamic Mode)
+ GOT_PREL, /// Global Offset Table, PC Relative
+ GOTTPOFF, /// Global Offset Table, Thread Pointer Offset
+ TPOFF, /// Thread Pointer Offset
+ SECREL, /// Section Relative (Windows TLS)
};
}
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 56f3498e1204..56f5728ecfb8 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -20,6 +20,7 @@
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -50,6 +51,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "ARM pseudo instruction expansion pass";
}
@@ -58,7 +64,8 @@ namespace {
void TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
bool ExpandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
void ExpandVLD(MachineBasicBlock::iterator &MBBI);
void ExpandVST(MachineBasicBlock::iterator &MBBI);
@@ -67,6 +74,14 @@ namespace {
unsigned Opc, bool IsExt);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
+ unsigned StrexOp, unsigned UxtOp,
+ MachineBasicBlock::iterator &NextMBBI);
+
+ bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
char ARMExpandPseudo::ID = 0;
}
@@ -651,7 +666,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
@@ -737,8 +752,242 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MI.eraseFromParent();
}
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MBB->addLiveIn(*I);
+}
+
+/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
+/// possible. This only gets used at -O0 so we don't care about efficiency of the
+/// generated code.
+bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned LdrexOp, unsigned StrexOp,
+ unsigned UxtOp,
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ if (UxtOp) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
+ .addReg(Desired.getReg(), RegState::Kill);
+ if (!IsThumb)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+ }
+
+ // .Lloadcmp:
+ // ldrex rDest, [rAddr]
+ // cmp rDest, rDesired
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
+ MIB.addReg(Addr.getReg());
+ if (LdrexOp == ARM::t2LDREX)
+ MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addOperand(Desired));
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strex rStatus, rNew, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
+ MIB.addOperand(New);
+ MIB.addOperand(Addr);
+ if (StrexOp == ARM::t2STREX)
+ MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
+/// single GPRPair register), Thumb's take two separate registers so we need to
+/// extract the subregs from the pair.
+static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
+ unsigned Flags, bool IsThumb,
+ const TargetRegisterInfo *TRI) {
+ if (IsThumb) {
+ unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+ unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+ MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
+ MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+ } else
+ MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+}
+
+/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
+bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ bool IsThumb = STI->isThumb();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ unsigned StatusReg = MI.getOperand(1).getReg();
+ MachineOperand &Addr = MI.getOperand(2);
+ MachineOperand &Desired = MI.getOperand(3);
+ MachineOperand &New = MI.getOperand(4);
+
+ unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
+ unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+ unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
+ unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
+
+ LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoadCmpBB);
+ MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+ MF->insert(++StoreBB->getIterator(), DoneBB);
+
+ // .Lloadcmp:
+ // ldrexd rDestLo, rDestHi, [rAddr]
+ // cmp rDestLo, rDesiredLo
+ // sbcs rStatus<dead>, rDestHi, rDesiredHi
+ // bne .Ldone
+ LoadCmpBB->addLiveIn(Addr.getReg());
+ LoadCmpBB->addLiveIn(Dest.getReg());
+ LoadCmpBB->addLiveIn(Desired.getReg());
+ addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+ unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
+ addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
+ MIB.addReg(Addr.getReg());
+ AddDefaultPred(MIB);
+
+ unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+ AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(DestLo, getKillRegState(Dest.isDead()))
+ .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+
+ unsigned SBCrr = IsThumb ? ARM::t2SBCrr : ARM::SBCrr;
+ MIB = BuildMI(LoadCmpBB, DL, TII->get(SBCrr))
+ .addReg(StatusReg, RegState::Define | RegState::Dead)
+ .addReg(DestHi, getKillRegState(Dest.isDead()))
+ .addReg(DesiredHi, getKillRegState(Desired.isDead()));
+ AddDefaultPred(MIB);
+ MIB.addReg(ARM::CPSR, RegState::Kill);
+
+ unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ LoadCmpBB->addSuccessor(DoneBB);
+ LoadCmpBB->addSuccessor(StoreBB);
+
+ // .Lstore:
+ // strexd rStatus, rNewLo, rNewHi, [rAddr]
+ // cmp rStatus, #0
+ // bne .Lloadcmp
+ StoreBB->addLiveIn(Addr.getReg());
+ StoreBB->addLiveIn(New.getReg());
+ addPostLoopLiveIns(StoreBB, LiveRegs);
+
+ unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
+ MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+ addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+ MIB.addOperand(Addr);
+ AddDefaultPred(MIB);
+
+ unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(StatusReg, RegState::Kill)
+ .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(Bcc))
+ .addMBB(LoadCmpBB)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR, RegState::Kill);
+ StoreBB->addSuccessor(LoadCmpBB);
+ StoreBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ addPostLoopLiveIns(DoneBB, LiveRegs);
+
+ MBB.addSuccessor(LoadCmpBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+
bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -784,7 +1033,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addReg(JumpTarget.getReg(), RegState::Kill);
}
- MachineInstr *NewMI = std::prev(MBBI);
+ auto NewMI = std::prev(MBBI);
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
@@ -1375,6 +1624,30 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+
+ case ARM::CMP_SWAP_8:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
+ ARM::tUXTB, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
+ ARM::UXTB, NextMBBI);
+ case ARM::CMP_SWAP_16:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
+ ARM::tUXTH, NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
+ ARM::UXTH, NextMBBI);
+ case ARM::CMP_SWAP_32:
+ if (STI->isThumb())
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
+ NextMBBI);
+ else
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+
+ case ARM::CMP_SWAP_64:
+ return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
}
}
@@ -1384,7 +1657,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
- Modified |= ExpandMI(MBB, MBBI);
+ Modified |= ExpandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
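The new ExpandCMP_SWAP and ExpandCMP_SWAP_64 hooks above lower the CMP_SWAP pseudo-instructions into a load-exclusive/store-exclusive retry loop (the .Lloadcmp, .Lstore and .Ldone blocks), and ExpandMI now reports a NextMBBI iterator so ExpandMBB resumes scanning after the freshly inserted blocks. The control flow matches a C++11 compare-exchange loop; the sketch below uses std::atomic as an analogy for the ldrex/strex pair and is not the code the pass emits.

#include <atomic>
#include <iostream>

// Same shape as the expanded pseudo: a weak CAS bundles the ldrex/cmp/strex
// sequence, and a spurious failure (value still equal to Desired) is the
// "strex lost its reservation" back edge to .Lloadcmp.
static bool cmpSwap(std::atomic<int> &Addr, int Desired, int New) {
  int Expected = Desired;
  while (!Addr.compare_exchange_weak(Expected, New,
                                     std::memory_order_acq_rel)) {
    if (Expected != Desired)
      return false;            // value mismatch: the bne .Ldone exit
    // otherwise retry, as the strex failure branch does
  }
  return true;                 // status == 0: store succeeded
}

int main() {
  std::atomic<int> V{42};
  std::cout << cmpSwap(V, 42, 7) << " " << V.load() << "\n"; // 1 7
  std::cout << cmpSwap(V, 42, 9) << " " << V.load() << "\n"; // 0 7
}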
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index ff2fcfa349dc..13724da5d4f7 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -22,7 +22,6 @@
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -41,7 +40,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -110,11 +108,6 @@ class ARMFastISel final : public FastISel {
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
unsigned Op1, bool Op1IsKill);
- unsigned fastEmitInst_rrr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill);
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
@@ -164,6 +157,7 @@ class ARMFastISel final : public FastISel {
// Utility routines.
private:
+ bool isPositionIndependent() const;
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -215,7 +209,7 @@ class ARMFastISel final : public FastISel {
const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
void AddLoadStoreOperands(MVT VT, Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool useAM3);
+ MachineMemOperand::Flags Flags, bool useAM3);
};
} // end anonymous namespace
@@ -331,38 +325,6 @@ unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned ARMFastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill) {
- unsigned ResultReg = createResultReg(RC);
- const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
- // Make sure the input operands are sufficiently constrained to be legal
- // for this instruction.
- Op0 = constrainOperandRegClass(II, Op0, 1);
- Op1 = constrainOperandRegClass(II, Op1, 2);
- Op2 = constrainOperandRegClass(II, Op1, 3);
-
- if (II.getNumDefs() >= 1) {
- AddOptionalDefs(
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addReg(Op2, Op2IsKill * RegState::Kill));
- } else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addReg(Op2, Op2IsKill * RegState::Kill));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(II.ImplicitDefs[0]));
- }
- return ResultReg;
-}
-
unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
@@ -576,12 +538,15 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
return ResultReg;
}
+bool ARMFastISel::isPositionIndependent() const {
+ return TLI.isPositionIndependent();
+}
+
unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
// For now 32-bit only.
if (VT != MVT::i32 || GV->isThreadLocal()) return 0;
- Reloc::Model RelocM = TM.getRelocationModel();
- bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+ bool IsIndirect = Subtarget->isGVIndirectSymbol(GV);
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
: &ARM::GPRRegClass;
unsigned DestReg = createResultReg(RC);
@@ -591,23 +556,20 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
bool IsThreadLocal = GVar && GVar->isThreadLocal();
if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
+ bool IsPositionIndependent = isPositionIndependent();
// Use movw+movt when possible, it avoids constant pool entries.
// Non-darwin targets only support static movt relocations in FastISel.
if (Subtarget->useMovt(*FuncInfo.MF) &&
- (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
+ (Subtarget->isTargetMachO() || !IsPositionIndependent)) {
unsigned Opc;
unsigned char TF = 0;
if (Subtarget->isTargetMachO())
TF = ARMII::MO_NONLAZY;
- switch (RelocM) {
- case Reloc::PIC_:
+ if (IsPositionIndependent)
Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
- break;
- default:
+ else
Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
- break;
- }
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
} else {
@@ -618,12 +580,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
Align = DL.getTypeAllocSize(GV->getType());
}
- if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
+ if (Subtarget->isTargetELF() && IsPositionIndependent)
return ARMLowerPICELF(GV, Align, VT);
// Grab index.
- unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 :
- (Subtarget->isThumb() ? 4 : 8);
+ unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
unsigned Id = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
ARMCP::CPValue,
@@ -633,10 +594,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
// Load value.
MachineInstrBuilder MIB;
if (isThumb2) {
- unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
+ unsigned Opc = IsPositionIndependent ? ARM::t2LDRpci_pic : ARM::t2LDRpci;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
DestReg).addConstantPoolIndex(Idx);
- if (RelocM == Reloc::PIC_)
+ if (IsPositionIndependent)
MIB.addImm(Id);
AddOptionalDefs(MIB);
} else {
@@ -648,7 +609,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
.addImm(0);
AddOptionalDefs(MIB);
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -912,7 +873,8 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool useAM3) {
+ MachineMemOperand::Flags Flags,
+ bool useAM3) {
// addrmode5 output depends on the selection dag addressing dividing the
// offset by 4 that it then later multiplies. Do this here as well.
if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64)
@@ -931,7 +893,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
// ARM halfword load/stores and signed byte loads need an additional
// operand.
if (useAM3) {
- signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+ int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
MIB.addReg(0);
MIB.addImm(Imm);
} else {
@@ -945,7 +907,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
// ARM halfword load/stores and signed byte loads need an additional
// operand.
if (useAM3) {
- signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+ int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
MIB.addReg(0);
MIB.addImm(Imm);
} else {
@@ -1062,6 +1024,21 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
if (cast<LoadInst>(I)->isAtomic())
return false;
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getType(), VT))
@@ -1177,6 +1154,21 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
if (cast<StoreInst>(I)->isAtomic())
return false;
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
@@ -1726,6 +1718,13 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
if (!isTypeLegal(Ty, VT))
return false;
+ // Many ABIs do not provide a libcall for standalone remainder, so we need to
+ // use divrem (see the RTABI 4.3.1). Since FastISel can't handle non-double
+ // multi-reg returns, we'll have to bail out.
+ if (!TLI.hasStandaloneRem(VT)) {
+ return false;
+ }
+
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (VT == MVT::i8)
LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
@@ -1847,6 +1846,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
}
// Fallthrough
case CallingConv::C:
+ case CallingConv::CXX_FAST_TLS:
// Use target triple & subtarget features to do actual dispatch.
if (Subtarget->isAAPCS_ABI()) {
if (Subtarget->hasVFP2() &&
@@ -1858,6 +1858,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
}
case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
if (!isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
// Fall through to soft float variant, variadic functions don't
@@ -2083,6 +2084,10 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
@@ -2295,8 +2300,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// TODO: Avoid some calling conventions?
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ FunctionType *FTy = CS.getFunctionType();
bool isVarArg = FTy->isVarArg();
// Handle *simple* calls for now.
@@ -2345,6 +2349,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// FIXME: Only handle *easy* calls for now.
if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
+ CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
CS.paramHasAttr(AttrInd, Attribute::Nest) ||
CS.paramHasAttr(AttrInd, Attribute::ByVal))
return false;
@@ -2394,22 +2400,15 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DbgLoc, TII.get(CallOpc));
- unsigned char OpFlags = 0;
-
- // Add MO_PLT for global address or external symbol in the PIC relocation
- // model.
- if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
-
// ARM calls don't take a predicate, but tBL / tBLX do.
if(isThumb2)
AddDefaultPred(MIB);
if (UseReg)
MIB.addReg(CalleeReg);
else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, OpFlags);
+ MIB.addGlobalAddress(GV, 0, 0);
else
- MIB.addExternalSymbol(IntrMemName, OpFlags);
+ MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -2942,8 +2941,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
unsigned Align, MVT VT) {
- bool UseGOT_PREL =
- !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+ bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
LLVMContext *Context = &MF->getFunction()->getContext();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
@@ -3006,6 +3004,7 @@ bool ARMFastISel::fastLowerArguments() {
case CallingConv::ARM_AAPCS_VFP:
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
+ case CallingConv::Swift:
break;
}
@@ -3019,6 +3018,8 @@ bool ARMFastISel::fastLowerArguments() {
if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
return false;
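ARMFastISel no longer consults Reloc::Model directly; a single isPositionIndependent() predicate now drives opcode selection and the PIC label adjustment, turning the earlier switch on the relocation model into plain conditionals. A self-contained sketch of that selection pattern follows; the enum and helper names are illustrative, not LLVM API.

#include <iostream>

// Stand-ins for the Thumb2 materialization opcodes chosen in ARMMaterializeGV.
enum class Opc { t2MOV_ga_pcrel, t2MOVi32imm };

static Opc pickMovOpc(bool IsPositionIndependent) {
  // PIC needs the pc-relative form; static code can use a plain 32-bit move.
  return IsPositionIndependent ? Opc::t2MOV_ga_pcrel : Opc::t2MOVi32imm;
}

static unsigned picLabelAdjust(bool IsPositionIndependent, bool IsThumb) {
  // Mirrors: PCAdj = IsPositionIndependent ? (Thumb ? 4 : 8) : 0.
  return IsPositionIndependent ? (IsThumb ? 4u : 8u) : 0u;
}

int main() {
  std::cout << (pickMovOpc(true) == Opc::t2MOV_ga_pcrel) << "\n"; // 1
  std::cout << picLabelAdjust(true, /*IsThumb=*/true) << "\n";    // 4
}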
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c5990bb7d1fb..e8c9f610ea64 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -98,35 +98,32 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
}
-static bool isCSRestore(MachineInstr *MI,
- const ARMBaseInstrInfo &TII,
+static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
const MCPhysReg *CSRegs) {
// Integer spill area is handled with "pop".
- if (isPopOpcode(MI->getOpcode())) {
+ if (isPopOpcode(MI.getOpcode())) {
// The first two operands are predicates. The last two are
// imp-def and imp-use of SP. Check everything in between.
- for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
- if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+ for (int i = 5, e = MI.getNumOperands(); i != e; ++i)
+ if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
return false;
return true;
}
- if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
- MI->getOpcode() == ARM::LDR_POST_REG ||
- MI->getOpcode() == ARM::t2LDR_POST) &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
- MI->getOperand(1).getReg() == ARM::SP)
+ if ((MI.getOpcode() == ARM::LDR_POST_IMM ||
+ MI.getOpcode() == ARM::LDR_POST_REG ||
+ MI.getOpcode() == ARM::t2LDR_POST) &&
+ isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
+ MI.getOperand(1).getReg() == ARM::SP)
return true;
return false;
}
-static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- const ARMBaseInstrInfo &TII, unsigned DestReg,
- unsigned SrcReg, int NumBytes,
- unsigned MIFlags = MachineInstr::NoFlags,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0) {
+static void emitRegPlusImmediate(
+ bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
+ unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
if (isARM)
emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
Pred, PredReg, TII, MIFlags);
@@ -136,7 +133,7 @@ static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
}
static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ MachineBasicBlock::iterator &MBBI, const DebugLoc &dl,
const ARMBaseInstrInfo &TII, int NumBytes,
unsigned MIFlags = MachineInstr::NoFlags,
ARMCC::CondCodes Pred = ARMCC::AL,
@@ -145,9 +142,9 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
MIFlags, Pred, PredReg);
}
-static int sizeOfSPAdjustment(const MachineInstr *MI) {
+static int sizeOfSPAdjustment(const MachineInstr &MI) {
int RegSize;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case ARM::VSTMDDB_UPD:
RegSize = 8;
break;
@@ -165,7 +162,7 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) {
int count = 0;
// ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
// pred) so the list starts at 4.
- for (int i = MI->getNumOperands() - 1; i >= 4; --i)
+ for (int i = MI.getNumOperands() - 1; i >= 4; --i)
count += RegSize;
return count;
}
@@ -206,7 +203,8 @@ struct StackAdjustingInsts {
}
void emitDefCFAOffsets(MachineModuleInfo &MMI, MachineBasicBlock &MBB,
- DebugLoc dl, const ARMBaseInstrInfo &TII, bool HasFP) {
+ const DebugLoc &dl, const ARMBaseInstrInfo &TII,
+ bool HasFP) {
unsigned CFAOffset = 0;
for (auto &Info : Insts) {
if (HasFP && !Info.BeforeFPSet)
@@ -235,7 +233,7 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
const TargetInstrInfo &TII,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, const unsigned Reg,
+ const DebugLoc &DL, const unsigned Reg,
const unsigned Alignment,
const bool MustBeSingleInstruction) {
const ARMSubtarget &AST =
@@ -355,7 +353,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin()) {
+ if (STI.splitFramePushPop()) {
GPRCS2Size += 4;
break;
}
@@ -416,7 +414,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// .cfi_offset operations will reflect that.
if (DPRGapSize) {
assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
- if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize))
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize))
DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
@@ -430,7 +428,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Since vpush register list cannot have gaps, there may be multiple vpush
// instructions in the prologue.
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
- DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(MBBI));
+ DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
LastPush = MBBI++;
}
}
@@ -485,7 +483,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
ARM::SP)
- .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP, RegState::Kill)
.addReg(ARM::R4, RegState::Kill)
.setMIFlags(MachineInstr::FrameSetup)));
NumBytes = 0;
@@ -494,7 +492,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes) {
// Adjust SP after all the callee-save spills.
if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
- tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes))
+ tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
@@ -522,7 +520,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// that push.
if (HasFP) {
MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
- unsigned PushSize = sizeOfSPAdjustment(GPRCS1Push);
+ unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
dl, TII, FramePtr, ARM::SP,
PushSize + FramePtrOffsetInPush,
@@ -559,7 +557,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin())
+ if (STI.splitFramePushPop())
break;
// fallthrough
case ARM::R0:
@@ -592,7 +590,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetDarwin()) {
+ if (STI.splitFramePushPop()) {
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned Offset = MFI->getObjectOffset(FI);
unsigned CFIIndex = MMI.addFrameInst(
@@ -727,8 +725,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.begin()) {
do {
--MBBI;
- } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
- if (!isCSRestore(MBBI, TII, CSRegs))
+ } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
+ if (!isCSRestore(*MBBI, TII, CSRegs))
++MBBI;
}
@@ -774,8 +772,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FramePtr));
}
} else if (NumBytes &&
- !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
// Increment past our save areas.
if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -904,33 +902,27 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
unsigned LastReg = 0;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
// D-registers in the aligned area DPRCS2 are NOT spilled here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
continue;
- // Add the callee-saved register as live-in unless it's LR and
- // @llvm.returnaddress is called. If LR is returned for
- // @llvm.returnaddress then it's already added to the function and
- // entry block live-in sets.
- bool isKill = true;
- if (Reg == ARM::LR) {
- if (MF.getFrameInfo()->isReturnAddressTaken() &&
- MF.getRegInfo().isLiveIn(Reg))
- isKill = false;
- }
-
- if (isKill)
+ bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
+ if (!isLiveIn)
MBB.addLiveIn(Reg);
-
// If NoGap is true, push consecutive registers and then leave the rest
// for other instructions. e.g.
// vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
if (NoGap && LastReg && LastReg != Reg-1)
break;
LastReg = Reg;
- Regs.push_back(std::make_pair(Reg, isKill));
+ // Do not set a kill flag on values that are also marked as live-in. This
+    // happens with the @llvm.returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
}
if (Regs.empty())
@@ -991,7 +983,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool DeleteRet = false;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+ if (!(Func)(Reg, STI.splitFramePushPop())) continue;
// The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -1027,7 +1019,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet && MI != MBB.end()) {
- MIB.copyImplicitOps(&*MI);
+ MIB.copyImplicitOps(*MI);
MI->eraseFromParent();
}
MI = MIB;
@@ -1367,7 +1359,7 @@ static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
unsigned FnSize = 0;
for (auto &MBB : MF) {
for (auto &MI : MBB)
- FnSize += TII.GetInstSizeInBytes(&MI);
+ FnSize += TII.GetInstSizeInBytes(MI);
}
return FnSize;
}
@@ -1485,6 +1477,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
bool CS1Spilled = false;
bool LRSpilled = false;
unsigned NumGPRSpills = 0;
+ unsigned NumFPRSpills = 0;
SmallVector<unsigned, 4> UnspilledCS1GPRs;
SmallVector<unsigned, 4> UnspilledCS2GPRs;
const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
@@ -1539,13 +1532,22 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
CanEliminateFrame = false;
}
- if (!ARM::GPRRegClass.contains(Reg))
+ if (!ARM::GPRRegClass.contains(Reg)) {
+ if (Spilled) {
+ if (ARM::SPRRegClass.contains(Reg))
+ NumFPRSpills++;
+ else if (ARM::DPRRegClass.contains(Reg))
+ NumFPRSpills += 2;
+ else if (ARM::QPRRegClass.contains(Reg))
+ NumFPRSpills += 4;
+ }
continue;
+ }
if (Spilled) {
NumGPRSpills++;
- if (!STI.isTargetDarwin()) {
+ if (!STI.splitFramePushPop()) {
if (Reg == ARM::LR)
LRSpilled = true;
CS1Spilled = true;
@@ -1567,7 +1569,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
break;
}
} else {
- if (!STI.isTargetDarwin()) {
+ if (!STI.splitFramePushPop()) {
UnspilledCS1GPRs.push_back(Reg);
continue;
}
@@ -1613,12 +1615,21 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// FIXME: We could add logic to be more precise about negative offsets
// and which instructions will need a scratch register for them. Is it
// worth the effort and added fragility?
- bool BigStack = (RS && (MFI->estimateStackSize(MF) +
- ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >=
- estimateRSStackSizeLimit(MF, this))) ||
+ unsigned EstimatedStackSize =
+ MFI->estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
+ if (hasFP(MF)) {
+ if (AFI->hasStackFrame())
+ EstimatedStackSize += 4;
+ } else {
+ // If FP is not used, SP will be used to access arguments, so count the
+ // size of arguments into the estimation.
+ EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+ }
+ EstimatedStackSize += 16; // For possible paddings.
+
+ bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) ||
MFI->hasVarSizedObjects() ||
(MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
-
bool ExtraCSSpill = false;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
@@ -1712,6 +1723,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
} else if (!AFI->isThumb1OnlyFunction()) {
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
// closest to SP or frame pointer.
+ assert(RS && "Register scavenging not provided");
const TargetRegisterClass *RC = &ARM::GPRRegClass;
RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
RC->getAlignment(),
@@ -1726,19 +1738,18 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
-
-void ARMFrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
if (!hasReservedCallFrame(MF)) {
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
+ MachineInstr &Old = *I;
+ DebugLoc dl = Old.getDebugLoc();
+ unsigned Amount = Old.getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -1751,25 +1762,26 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
bool isARM = !AFI->isThumbFunction();
// Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- int PIdx = Old->findFirstPredOperandIdx();
- ARMCC::CondCodes Pred = (PIdx == -1)
- ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm();
+ unsigned Opc = Old.getOpcode();
+ int PIdx = Old.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred =
+ (PIdx == -1) ? ARMCC::AL
+ : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
// Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
- unsigned PredReg = Old->getOperand(2).getReg();
+ unsigned PredReg = Old.getOperand(2).getReg();
emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
Pred, PredReg);
} else {
// Note: PredReg is operand 3 for ADJCALLSTACKUP.
- unsigned PredReg = Old->getOperand(3).getReg();
+ unsigned PredReg = Old.getOperand(3).getReg();
assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
Pred, PredReg);
}
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
/// Get the minimum constant for ARM that is greater than or equal to the
@@ -2162,7 +2174,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
PrevStackMBB->addSuccessor(McrMBB);
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
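determineCalleeSaves now feeds a fuller estimate into the BigStack decision: the MachineFrameInfo estimate plus 4 bytes per spilled GPR or FPR slot (SPR = 1 slot, DPR = 2, QPR = 4), plus either 4 bytes when a frame pointer and stack frame are present or the incoming argument area when SP-relative access is used, plus 16 bytes of padding. A standalone sketch of the arithmetic; the function name and parameters are illustrative, not LLVM API.

#include <iostream>

// Mirrors the EstimatedStackSize computation in determineCalleeSaves.
static unsigned estimateStack(unsigned FrameEstimate, unsigned NumGPRSpills,
                              unsigned NumFPRSpills, bool HasFP,
                              bool HasStackFrame, unsigned ArgStackSize) {
  unsigned Size = FrameEstimate + 4 * (NumGPRSpills + NumFPRSpills);
  if (HasFP) {
    if (HasStackFrame)
      Size += 4;             // saved frame pointer
  } else {
    Size += ArgStackSize;    // arguments are reached through SP
  }
  return Size + 16;          // possible paddings
}

int main() {
  // e.g. a 96-byte frame, 6 GPR spills, 2 FPR slots, FP-based frame:
  std::cout << estimateStack(96, 6, 2, true, true, 0) << "\n"; // 148
}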
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 66f4dfb6ef52..21cd78da395c 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -74,7 +74,7 @@ public:
bool(*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
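Returning MachineBasicBlock::iterator from eliminateCallFramePseudoInstr lets callers continue from the instruction after the erased pseudo instead of holding an invalidated iterator; the implementation above simply returns MBB.erase(I). This is the usual erase-and-continue idiom, shown here with std::list:

#include <iostream>
#include <list>

int main() {
  std::list<int> L{1, 2, 3, 4};
  for (auto It = L.begin(); It != L.end();) {
    if (*It % 2 == 0)
      It = L.erase(It);   // erase returns the next valid iterator
    else
      ++It;
  }
  for (int V : L)
    std::cout << V << ' '; // 1 3
  std::cout << '\n';
}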
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 0157c0a35286..0d904ecb6296 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -50,8 +50,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
- // On A9, AGU and NEON/FPU are muxed.
- !(TII.getSubtarget().isLikeA9() && LastMI->mayLoadOrStore()) &&
+ !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6e7edbf9fb15..20db3d39bcae 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -29,7 +29,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLowering.h"
@@ -44,11 +43,6 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
cl::desc("Disable isel of shifter-op"),
cl::init(false));
-static cl::opt<bool>
-CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
- cl::desc("Check fp vmla / vmls hazard at isel time"),
- cl::init(true));
-
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@@ -84,12 +78,11 @@ public:
/// getI32Imm - Return a target constant of type i32 with the specified
/// value.
- inline SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
}
- SDNode *Select(SDNode *N) override;
-
+ void Select(SDNode *N) override;
bool hasNoVMLxHazardUse(SDNode *N) const;
bool isShifterOpProfitable(const SDValue &Shift,
@@ -200,57 +193,61 @@ public:
#include "ARMGenDAGISel.inc"
private:
- /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
- /// ARM.
- SDNode *SelectARMIndexedLoad(SDNode *N);
- SDNode *SelectT2IndexedLoad(SDNode *N);
+ /// Indexed (pre/post inc/dec) load matching code for ARM.
+ bool tryARMIndexedLoad(SDNode *N);
+ bool tryT1IndexedLoad(SDNode *N);
+ bool tryT2IndexedLoad(SDNode *N);
/// SelectVLD - Select NEON load intrinsics. NumVecs should be
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// loads of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+ void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
/// SelectVST - Select NEON store intrinsics. NumVecs should
/// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// stores of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+ void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1);
/// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
/// be 2, 3 or 4. The opcode arrays specify the instructions used for
/// load/store of D registers and Q registers.
- SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
- bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes, const uint16_t *QOpcodes);
+ void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs, const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers. (Q registers are not supported.)
- SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *Opcodes);
+ void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes);
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
/// generated to force the table registers to be consecutive.
- SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+ void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
- /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
- SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+ /// Try to select SBFX/UBFX instructions for ARM.
+ bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
// Select special operations if node forms integer ABS pattern
- SDNode *SelectABSOp(SDNode *N);
+ bool tryABSOp(SDNode *N);
+
+ bool tryReadRegister(SDNode *N);
+ bool tryWriteRegister(SDNode *N);
- SDNode *SelectReadRegister(SDNode *N);
- SDNode *SelectWriteRegister(SDNode *N);
+ bool tryInlineAsm(SDNode *N);
- SDNode *SelectInlineAsm(SDNode *N);
+ void SelectConcatVector(SDNode *N);
- SDNode *SelectConcatVector(SDNode *N);
+ bool trySMLAWSMULW(SDNode *N);
+
+ void SelectCMP_SWAP(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -269,7 +266,7 @@ private:
SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
// Get the alignment operand for a NEON VLD or VST instruction.
- SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
+ SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
bool is64BitVector);
/// Returns the number of instructions required to materialize the given
@@ -426,11 +423,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (OptLevel == CodeGenOpt::None)
return true;
- if (!CheckVMLxHazard)
- return true;
-
- if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
- !Subtarget->isCortexA9() && !Subtarget->isSwift())
+ if (!Subtarget->hasVMLxHazards())
return true;
if (!N->hasOneUse())
@@ -484,6 +477,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
if (Subtarget->isThumb()) {
if (Val <= 255) return 1; // MOV
if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+ if (Val <= 510) return 2; // MOV + ADDi8
if (~Val <= 255) return 2; // MOV + MVN
if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
} else {
@@ -548,11 +542,9 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
- BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32,
- N.getOperand(0), NewMulConst)
- .getNode()),
- 0);
+ HandleSDNode Handle(N);
replaceDAGValue(N.getOperand(1), NewMulConst);
+ BaseReg = Handle.getValue();
Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
PowerOfTwo),
SDLoc(N), MVT::i32);
@@ -623,6 +615,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
} else
@@ -803,6 +796,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
}
@@ -1070,6 +1064,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
}
@@ -1190,6 +1185,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
return false; // We want to select register offset instead
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
} else {
@@ -1297,6 +1293,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+ N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::TargetConstantPool)
@@ -1468,15 +1465,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
//===--------------------------------------------------------------------===//
/// getAL - Returns a ARMCC::AL immediate node.
-static inline SDValue getAL(SelectionDAG *CurDAG, SDLoc dl) {
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
}
-SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
EVT LoadedVT = LD->getMemoryVT();
SDValue Offset, AMOpc;
@@ -1530,26 +1527,53 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
} else {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ MVT::i32, MVT::Other, Ops));
+ return true;
}
}
- return nullptr;
+ return false;
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ AM != ISD::POST_INC || LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+ return false;
+
+ auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+ if (!COffs || COffs->getZExtValue() != 4)
+ return false;
+
+ // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+  // The encoding of LDM is not how the rest of ISel expects a post-inc load
+  // to look, however, so we use a pseudo here and switch it for a tLDMIA_UPD
+  // after ISel.
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+  ReplaceNode(N, CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32,
+                                        MVT::i32, MVT::Other, Ops));
+ return true;
}
-SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
if (AM == ISD::UNINDEXED)
- return nullptr;
+ return false;
EVT LoadedVT = LD->getMemoryVT();
bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1576,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
break;
default:
- return nullptr;
+ return false;
}
Match = true;
}
@@ -1586,11 +1610,12 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
- MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+ MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
@@ -1685,7 +1710,7 @@ SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
/// of a NEON VLD or VST instruction. The supported values depend on the
/// number of registers being loaded.
-SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, SDLoc dl,
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
unsigned NumVecs, bool is64BitVector) {
unsigned NumRegs = NumVecs;
if (!is64BitVector && NumVecs < 3)
@@ -1806,17 +1831,17 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
return Opc; // If not one we handle, return it unchanged.
}
-SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0,
- const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
unsigned AddrOpIdx = isUpdating ? 1 : 2;
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -1922,13 +1947,16 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
- if (NumVecs == 1)
- return VLd;
+ if (NumVecs == 1) {
+ ReplaceNode(N, VLd);
+ return;
+ }
// Extract out the subregisters.
SDValue SuperReg = SDValue(VLd, 0);
- assert(ARM::dsub_7 == ARM::dsub_0+7 &&
- ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -1936,13 +1964,13 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes0,
- const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes0,
+ const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
SDLoc dl(N);
@@ -1950,7 +1978,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
unsigned AddrOpIdx = isUpdating ? 1 : 2;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2042,7 +2070,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Transfer memoperands.
cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
- return VSt;
+ ReplaceNode(N, VSt);
+ return;
}
// Otherwise, quad registers are stored with two separate instructions,
@@ -2083,13 +2112,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
Ops);
cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
- return VStB;
+ ReplaceNode(N, VStB);
}
-SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
- bool isUpdating, unsigned NumVecs,
- const uint16_t *DOpcodes,
- const uint16_t *QOpcodes) {
+void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *DOpcodes,
+ const uint16_t *QOpcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
SDLoc dl(N);
@@ -2097,7 +2126,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
unsigned AddrOpIdx = isUpdating ? 1 : 2;
unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2188,13 +2217,16 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
QOpcodes[OpcodeIndex]);
SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
- if (!IsLoad)
- return VLdLn;
+ if (!IsLoad) {
+ ReplaceNode(N, VLdLn);
+ return;
+ }
// Extract the subregisters.
SuperReg = SDValue(VLdLn, 0);
- assert(ARM::dsub_7 == ARM::dsub_0+7 &&
- ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+ ARM::qsub_3 == ARM::qsub_0 + 3,
+ "Unexpected subreg numbering");
unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -2202,18 +2234,17 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
- unsigned NumVecs,
- const uint16_t *Opcodes) {
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
SDValue MemAddr, Align;
if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
- return nullptr;
+ return;
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2277,7 +2308,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
SuperReg = SDValue(VLdDup, 0);
// Extract the subregisters.
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
unsigned SubIdx = ARM::dsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
@@ -2285,11 +2316,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
if (isUpdating)
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
}
-SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
- unsigned Opc) {
+void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+ unsigned Opc) {
assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -2318,13 +2349,12 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
Ops.push_back(getAL(CurDAG, dl)); // predicate
Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
}
-SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
- bool isSigned) {
+bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (!Subtarget->hasV6T2Ops())
- return nullptr;
+ return false;
unsigned Opc = isSigned
? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
@@ -2338,7 +2368,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (And_imm & (And_imm + 1))
- return nullptr;
+ return false;
unsigned Srl_imm = 0;
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
@@ -2358,7 +2388,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
// ARM models shift instructions as MOVsi with shifter operand.
@@ -2368,17 +2399,19 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+ return true;
}
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
}
- return nullptr;
+ return false;
}
// Otherwise, we're looking for a shift of a shift
@@ -2392,13 +2425,35 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
unsigned Width = 32 - Srl_imm - 1;
int LSB = Srl_imm - Shl_imm;
if (LSB < 0)
- return nullptr;
+ return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ }
+
+  // Or we are looking for a shift of an AND with a mask operand.
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) &&
+ isShiftedMask_32(And_imm)) {
+ unsigned Srl_imm = 0;
+ unsigned LSB = countTrailingZeros(And_imm);
+      // The shift amount must be the same as the AND's LSB.
+ if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ unsigned MSB = 31 - countLeadingZeros(And_imm);
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = MSB - LSB;
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { N->getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
+ CurDAG->getTargetConstant(Width, dl, MVT::i32),
+ getAL(CurDAG, dl), Reg0 };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
}
@@ -2407,20 +2462,21 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
unsigned LSB = 0;
if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
- return nullptr;
+ return false;
if (LSB + Width > 32)
- return nullptr;
+ return false;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, dl, MVT::i32),
CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
getAL(CurDAG, dl), Reg0 };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
}
- return nullptr;
+ return false;
}
/// Target-specific DAG combining for ISD::XOR.
@@ -2433,16 +2489,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
/// ARM instruction selection detects the latter and matches it to
/// ARM::ABS or ARM::t2ABS machine node.
-SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
SDValue XORSrc0 = N->getOperand(0);
SDValue XORSrc1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (Subtarget->isThumb1Only())
- return nullptr;
+ return false;
if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
- return nullptr;
+ return false;
SDValue ADDSrc0 = XORSrc0.getOperand(0);
SDValue ADDSrc1 = XORSrc0.getOperand(1);
@@ -2456,57 +2512,214 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
XType.isInteger() && SRAConstant != nullptr &&
Size == SRAConstant->getZExtValue()) {
unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
- return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ return true;
+ }
+
+ return false;
+}
+
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+ bool Accumulate) {
+  // For SM*WB, we need some form of sext.
+  // For SM*WT, we need to search for (sra X, 16).
+  // Src1 then gets set to X.
+ if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+ SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ SignExt.getOpcode() == ISD::AssertSext) &&
+ SignExt.getValueType() == MVT::i32) {
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = SignExt.getOperand(0);
+ return true;
}
- return nullptr;
+ if (SignExt.getOpcode() != ISD::SRA)
+ return false;
+
+ ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+ if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+ return false;
+
+ SDValue Op0 = SignExt.getOperand(0);
+
+ // The sign extend operand for SM*WB could be generated by a shl and ashr.
+ if (Op0.getOpcode() == ISD::SHL) {
+ SDValue SHL = Op0;
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+ Src1 = Op0.getOperand(0);
+ return true;
+ }
+ *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+ Src1 = SignExt.getOperand(0);
+ return true;
}
-SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+ SDValue &Src1, bool Accumulate) {
+ // First we look for:
+ // (add (or (srl ?, 16), (shl ?, 16)))
+ if (OR.getOpcode() != ISD::OR)
+ return false;
+
+ SDValue SRL = OR.getOperand(0);
+ SDValue SHL = OR.getOperand(1);
+
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR.getOperand(1);
+ SHL = OR.getOperand(0);
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+ return false;
+ }
+
+ ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+ ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+ if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+ SHLSrc1->getZExtValue() != 16)
+ return false;
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return false;
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return false;
+
+ // Now we have:
+ // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
+  // For SMLAW[B|T], smul_lohi takes a 32-bit and a 16-bit argument.
+  // For SMLAWB the 16-bit value will be sign extended somehow.
+ // For SMLAWT only the SRA is required.
+
+ // Check both sides of SMUL_LOHI
+ if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+ Src0 = SMULLOHI->getOperand(1);
+ } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+ Accumulate)) {
+ Src0 = SMULLOHI->getOperand(0);
+ } else {
+ return false;
+ }
+ return true;
+}
+
+bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue A, B;
+ unsigned Opc = 0;
+
+ if (N->getOpcode() == ISD::ADD) {
+ if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+ return false;
+
+ SDValue Acc;
+ if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+ Acc = Src1;
+ } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+ Acc = Src0;
+ } else {
+ return false;
+ }
+ if (Opc == 0)
+ return false;
+
+ SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+ return true;
+ } else if (N->getOpcode() == ISD::OR &&
+ SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
+ if (Opc == 0)
+ return false;
+
+ SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return true;
+ }
+ return false;
+}
+
+/// We've got special pseudo-instructions for these
+void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+ unsigned Opcode;
+ EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+ if (MemTy == MVT::i8)
+ Opcode = ARM::CMP_SWAP_8;
+ else if (MemTy == MVT::i16)
+ Opcode = ARM::CMP_SWAP_16;
+ else if (MemTy == MVT::i32)
+ Opcode = ARM::CMP_SWAP_32;
+ else
+ llvm_unreachable("Unknown AtomicCmpSwap type");
+
+ SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ N->getOperand(0)};
+ SDNode *CmpSwap = CurDAG->getMachineNode(
+ Opcode, SDLoc(N),
+ CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
// The only time a CONCAT_VECTORS operation can have legal types is when
// two 64-bit vectors are concatenated to a 128-bit vector.
EVT VT = N->getValueType(0);
if (!VT.is128BitVector() || N->getNumOperands() != 2)
llvm_unreachable("unexpected CONCAT_VECTORS");
- return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
+ ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
}
-SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+void ARMDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
switch (N->getOpcode()) {
default: break;
- case ISD::WRITE_REGISTER: {
- SDNode *ResNode = SelectWriteRegister(N);
- if (ResNode)
- return ResNode;
+ case ISD::ADD:
+ case ISD::OR:
+ if (trySMLAWSMULW(N))
+ return;
break;
- }
- case ISD::READ_REGISTER: {
- SDNode *ResNode = SelectReadRegister(N);
- if (ResNode)
- return ResNode;
+ case ISD::WRITE_REGISTER:
+ if (tryWriteRegister(N))
+ return;
break;
- }
- case ISD::INLINEASM: {
- SDNode *ResNode = SelectInlineAsm(N);
- if (ResNode)
- return ResNode;
+ case ISD::READ_REGISTER:
+ if (tryReadRegister(N))
+ return;
break;
- }
- case ISD::XOR: {
+ case ISD::INLINEASM:
+ if (tryInlineAsm(N))
+ return;
+ break;
+ case ISD::XOR:
// Select special operations if XOR node forms integer ABS pattern
- SDNode *ResNode = SelectABSOp(N);
- if (ResNode)
- return ResNode;
+ if (tryABSOp(N))
+ return;
// Other cases are autogenerated.
break;
- }
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
// If we can't materialize the constant we need to use a literal pool
@@ -2530,11 +2743,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
CurDAG->getRegister(0, MVT::i32),
CurDAG->getEntryNode()
};
- ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
- Ops);
+ ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops);
}
- ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
- return nullptr;
+ ReplaceNode(N, ResNode);
+ return;
}
// Other cases are autogenerated.
@@ -2551,25 +2764,27 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
MachineFrameInfo *MFI = MF->getFrameInfo();
if (MFI->getObjectAlignment(FI) < 4)
MFI->setObjectAlignment(FI, 4);
- return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
- CurDAG->getTargetConstant(0, dl, MVT::i32));
+ CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i32));
+ return;
} else {
unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
ARM::t2ADDri : ARM::ADDri);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+ return;
}
}
case ISD::SRL:
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
break;
case ISD::SIGN_EXTEND_INREG:
case ISD::SRA:
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, true))
+ return;
break;
case ISD::MUL:
if (Subtarget->isThumb1Only())
@@ -2587,11 +2802,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
if (Subtarget->isThumb()) {
SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+ return;
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+ return;
}
}
if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
@@ -2604,19 +2821,63 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
if (Subtarget->isThumb()) {
SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+ return;
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+ return;
}
}
}
break;
case ISD::AND: {
// Check for unsigned bitfield extract
- if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
- return I;
+ if (tryV6T2BitfieldExtractOp(N, false))
+ return;
+
+    // If an immediate is used in an AND node, it is possible that the
+    // immediate can be materialized more cheaply when negated. If this is
+    // the case, we can negate the immediate and use a BIC instead.
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+ uint32_t Imm = (uint32_t) N1C->getZExtValue();
+
+ // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+ // immediate can be negated and fit in the immediate operand of
+ // a t2BIC, don't do any manual transform here as this can be
+ // handled by the generic ISel machinery.
+ bool PreferImmediateEncoding =
+ Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
+ if (!PreferImmediateEncoding &&
+ ConstantMaterializationCost(Imm) >
+ ConstantMaterializationCost(~Imm)) {
+ // The current immediate costs more to materialize than a negated
+ // immediate, so negate the immediate and use a BIC.
+ SDValue NewImm =
+ CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32);
+        // If the new constant didn't exist before, reposition it in the
+        // topological ordering so it is just before N. Otherwise, don't
+        // touch its location.
+ if (NewImm->getNodeId() == -1)
+ CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+ if (!Subtarget->hasThumb2()) {
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+ N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+ return;
+ } else {
+ SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32)};
+ ReplaceNode(N,
+ CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+ return;
+ }
+ }
+ }
// (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
// of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
@@ -2632,7 +2893,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
if (!Opc)
break;
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ N1C = dyn_cast<ConstantSDNode>(N1);
if (!N1C)
break;
if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
@@ -2649,29 +2910,34 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
dl, MVT::i32);
SDValue Ops[] = { N0.getOperand(0), Imm16,
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+ return;
}
}
break;
}
case ARMISD::VMOVRRD:
- return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
- N->getOperand(0), getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32));
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+ N->getOperand(0), getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32)));
+ return;
case ISD::UMUL_LOHI: {
if (Subtarget->isThumb1Only())
break;
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
} else {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::UMULL : ARM::UMULLv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ISD::SMUL_LOHI: {
@@ -2680,30 +2946,76 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
+ return;
} else {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::SMULL : ARM::SMULLv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
+ case ARMISD::UMAAL: {
+ unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops));
+ return;
+ }
case ARMISD::UMLAL:{
+ // UMAAL is similar to UMLAL but it adds two 32-bit values to the
+ // 64-bit multiplication result.
+ if (Subtarget->hasV6Ops() && N->getOperand(2).getOpcode() == ARMISD::ADDC &&
+ N->getOperand(3).getOpcode() == ARMISD::ADDE) {
+
+ SDValue Addc = N->getOperand(2);
+ SDValue Adde = N->getOperand(3);
+
+ if (Adde.getOperand(2).getNode() == Addc.getNode()) {
+
+ ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
+ ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
+
+        if (Op0 && Op1 && Op0->getZExtValue() == 0 &&
+            Op1->getZExtValue() == 0) {
+ // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
+ // RdLo = one operand to be added, lower 32-bits of res
+ // RdHi = other operand to be added, upper 32-bits of res
+ // Rn = first multiply operand
+ // Rm = second multiply operand
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+ Addc.getOperand(0), Addc.getOperand(1),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32) };
+ unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+ CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
+ return;
+ }
+ }
+ }
+
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32)};
- return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
}else{
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::UMLAL : ARM::UMLALv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ARMISD::SMLAL:{
@@ -2711,25 +3023,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32)};
- return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops));
+ return;
}else{
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
- ARM::SMLAL : ARM::SMLALv5,
- dl, MVT::i32, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(
+ Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl,
+ MVT::i32, MVT::i32, Ops));
+ return;
}
}
case ISD::LOAD: {
- SDNode *ResNode = nullptr;
- if (Subtarget->isThumb() && Subtarget->hasThumb2())
- ResNode = SelectT2IndexedLoad(N);
- else
- ResNode = SelectARMIndexedLoad(N);
- if (ResNode)
- return ResNode;
+ if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
+ if (tryT2IndexedLoad(N))
+ return;
+ } else if (Subtarget->isThumb()) {
+ if (tryT1IndexedLoad(N))
+ return;
+ } else if (tryARMIndexedLoad(N))
+ return;
// Other cases are autogenerated.
break;
}
@@ -2770,13 +3086,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
ReplaceUses(SDValue(N, 0),
SDValue(Chain.getNode(), Chain.getResNo()));
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return;
}
case ARMISD::VZIP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VZIPd8; break;
case MVT::v4i16: Opc = ARM::VZIPd16; break;
case MVT::v2f32:
@@ -2790,13 +3107,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::VUZP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VUZPd8; break;
case MVT::v4i16: Opc = ARM::VUZPd16; break;
case MVT::v2f32:
@@ -2810,13 +3128,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::VTRN: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
- default: return nullptr;
+ default: return;
case MVT::v8i8: Opc = ARM::VTRNd8; break;
case MVT::v4i16: Opc = ARM::VTRNd16; break;
case MVT::v2f32:
@@ -2829,7 +3148,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getAL(CurDAG, dl);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
- return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+ return;
}
case ARMISD::BUILD_VECTOR: {
EVT VecVT = N->getValueType(0);
@@ -2837,55 +3157,68 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned NumElts = VecVT.getVectorNumElements();
if (EltVT == MVT::f64) {
assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
- return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+ ReplaceNode(
+ N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
}
assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
- if (NumElts == 2)
- return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+ if (NumElts == 2) {
+ ReplaceNode(
+ N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+ return;
+ }
assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
- return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
- N->getOperand(2), N->getOperand(3));
+ ReplaceNode(N,
+ createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3)));
+ return;
}
case ARMISD::VLD2DUP: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
ARM::VLD2DUPd32 };
- return SelectVLDDup(N, false, 2, Opcodes);
+ SelectVLDDup(N, false, 2, Opcodes);
+ return;
}
case ARMISD::VLD3DUP: {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
ARM::VLD3DUPd16Pseudo,
ARM::VLD3DUPd32Pseudo };
- return SelectVLDDup(N, false, 3, Opcodes);
+ SelectVLDDup(N, false, 3, Opcodes);
+ return;
}
case ARMISD::VLD4DUP: {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
ARM::VLD4DUPd16Pseudo,
ARM::VLD4DUPd32Pseudo };
- return SelectVLDDup(N, false, 4, Opcodes);
+ SelectVLDDup(N, false, 4, Opcodes);
+ return;
}
case ARMISD::VLD2DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
ARM::VLD2DUPd16wb_fixed,
ARM::VLD2DUPd32wb_fixed };
- return SelectVLDDup(N, true, 2, Opcodes);
+ SelectVLDDup(N, true, 2, Opcodes);
+ return;
}
case ARMISD::VLD3DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
ARM::VLD3DUPd16Pseudo_UPD,
ARM::VLD3DUPd32Pseudo_UPD };
- return SelectVLDDup(N, true, 3, Opcodes);
+ SelectVLDDup(N, true, 3, Opcodes);
+ return;
}
case ARMISD::VLD4DUP_UPD: {
static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
ARM::VLD4DUPd16Pseudo_UPD,
ARM::VLD4DUPd32Pseudo_UPD };
- return SelectVLDDup(N, true, 4, Opcodes);
+ SelectVLDDup(N, true, 4, Opcodes);
+ return;
}
case ARMISD::VLD1_UPD: {
@@ -2897,7 +3230,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1q16wb_fixed,
ARM::VLD1q32wb_fixed,
ARM::VLD1q64wb_fixed };
- return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VLD2_UPD: {
@@ -2908,7 +3242,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
ARM::VLD2q16PseudoWB_fixed,
ARM::VLD2q32PseudoWB_fixed };
- return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VLD3_UPD: {
@@ -2922,7 +3257,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
ARM::VLD3q16oddPseudo_UPD,
ARM::VLD3q32oddPseudo_UPD };
- return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VLD4_UPD: {
@@ -2936,7 +3272,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
ARM::VLD4q16oddPseudo_UPD,
ARM::VLD4q32oddPseudo_UPD };
- return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VLD2LN_UPD: {
@@ -2945,7 +3282,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
ARM::VLD2LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VLD3LN_UPD: {
@@ -2954,7 +3292,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD3LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
ARM::VLD3LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VLD4LN_UPD: {
@@ -2963,7 +3302,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD4LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
ARM::VLD4LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST1_UPD: {
@@ -2975,7 +3315,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST1q16wb_fixed,
ARM::VST1q32wb_fixed,
ARM::VST1q64wb_fixed };
- return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VST2_UPD: {
@@ -2986,7 +3327,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
ARM::VST2q16PseudoWB_fixed,
ARM::VST2q32PseudoWB_fixed };
- return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case ARMISD::VST3_UPD: {
@@ -3000,7 +3342,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
ARM::VST3q16oddPseudo_UPD,
ARM::VST3q32oddPseudo_UPD };
- return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VST4_UPD: {
@@ -3014,7 +3357,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
ARM::VST4q16oddPseudo_UPD,
ARM::VST4q32oddPseudo_UPD };
- return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case ARMISD::VST2LN_UPD: {
@@ -3023,7 +3367,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST2LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
ARM::VST2LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST3LN_UPD: {
@@ -3032,7 +3377,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST3LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
ARM::VST3LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ return;
}
case ARMISD::VST4LN_UPD: {
@@ -3041,7 +3387,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST4LNd32Pseudo_UPD };
static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
ARM::VST4LNq32Pseudo_UPD };
- return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+ return;
}
case ISD::INTRINSIC_VOID:
@@ -3051,12 +3398,44 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
default:
break;
+ case Intrinsic::arm_mrrc:
+ case Intrinsic::arm_mrrc2: {
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ unsigned Opc;
+
+ if (Subtarget->isThumb())
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2);
+ else
+ Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
+
+ SmallVector<SDValue, 5> Ops;
+      Ops.push_back(getI32Imm( /* coproc */
+          cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl));
+      Ops.push_back(getI32Imm( /* opc */
+          cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl));
+      Ops.push_back(getI32Imm( /* CRm */
+          cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl));
+
+      // The mrrc2 instruction does not allow predicates; the top 4 bits of the
+      // encoded instruction are always '1111'. It is possible in assembly to
+      // specify AL as a predicate to mrrc2, but it makes no difference to the
+      // encoded instruction.
+ if (Opc != ARM::MRRC2) {
+ Ops.push_back(getAL(CurDAG, dl));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ }
+
+ Ops.push_back(Chain);
+
+ // Writes to two registers.
+ const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other};
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops));
+ return;
+ }
case Intrinsic::arm_ldaexd:
case Intrinsic::arm_ldrexd: {
SDLoc dl(N);
SDValue Chain = N->getOperand(0);
SDValue MemAddr = N->getOperand(2);
- bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+ bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps();
bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
@@ -3072,11 +3451,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ResTys.push_back(MVT::Other);
// Place arguments in the right order.
- SmallVector<SDValue, 7> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(getAL(CurDAG, dl));
- Ops.push_back(CurDAG->getRegister(0, MVT::i32));
- Ops.push_back(Chain);
+ SDValue Ops[] = {MemAddr, getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32), Chain};
SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -3112,7 +3488,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ReplaceUses(SDValue(N, 1), Result);
}
ReplaceUses(SDValue(N, 2), OutChain);
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return;
}
case Intrinsic::arm_stlexd:
case Intrinsic::arm_strexd: {
@@ -3150,7 +3527,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
- return St;
+ ReplaceNode(N, St);
+ return;
}
case Intrinsic::arm_neon_vld1: {
@@ -3158,7 +3536,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1d32, ARM::VLD1d64 };
static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
ARM::VLD1q32, ARM::VLD1q64};
- return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vld2: {
@@ -3166,7 +3545,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2d32, ARM::VLD1q64 };
static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
ARM::VLD2q32Pseudo };
- return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vld3: {
@@ -3180,7 +3560,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
ARM::VLD3q16oddPseudo,
ARM::VLD3q32oddPseudo };
- return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vld4: {
@@ -3194,7 +3575,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
ARM::VLD4q16oddPseudo,
ARM::VLD4q32oddPseudo };
- return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vld2lane: {
@@ -3203,7 +3585,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
ARM::VLD2LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vld3lane: {
@@ -3212,7 +3595,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD3LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
ARM::VLD3LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vld4lane: {
@@ -3221,7 +3605,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD4LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
ARM::VLD4LNq32Pseudo };
- return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst1: {
@@ -3229,15 +3614,17 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST1d32, ARM::VST1d64 };
static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
ARM::VST1q32, ARM::VST1q64 };
- return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vst2: {
static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
ARM::VST2d32, ARM::VST1q64 };
- static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
- ARM::VST2q32Pseudo };
- return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+ ARM::VST2q32Pseudo };
+ SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+ return;
}
case Intrinsic::arm_neon_vst3: {
@@ -3251,7 +3638,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
ARM::VST3q16oddPseudo,
ARM::VST3q32oddPseudo };
- return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vst4: {
@@ -3265,7 +3653,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
ARM::VST4q16oddPseudo,
ARM::VST4q32oddPseudo };
- return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
}
case Intrinsic::arm_neon_vst2lane: {
@@ -3274,7 +3663,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST2LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
ARM::VST2LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst3lane: {
@@ -3283,7 +3673,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST3LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
ARM::VST3LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+ return;
}
case Intrinsic::arm_neon_vst4lane: {
@@ -3292,7 +3683,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST4LNd32Pseudo };
static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
ARM::VST4LNq32Pseudo };
- return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+ SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+ return;
}
}
break;
@@ -3305,18 +3697,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
break;
case Intrinsic::arm_neon_vtbl2:
- return SelectVTBL(N, false, 2, ARM::VTBL2);
+ SelectVTBL(N, false, 2, ARM::VTBL2);
+ return;
case Intrinsic::arm_neon_vtbl3:
- return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+ SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+ return;
case Intrinsic::arm_neon_vtbl4:
- return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+ SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+ return;
case Intrinsic::arm_neon_vtbx2:
- return SelectVTBL(N, true, 2, ARM::VTBX2);
+ SelectVTBL(N, true, 2, ARM::VTBX2);
+ return;
case Intrinsic::arm_neon_vtbx3:
- return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+ SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+ return;
case Intrinsic::arm_neon_vtbx4:
- return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+ SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+ return;
}
break;
}
@@ -3324,13 +3722,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ARMISD::VTBL1: {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- SmallVector<SDValue, 6> Ops;
-
- Ops.push_back(N->getOperand(0));
- Ops.push_back(N->getOperand(1));
- Ops.push_back(getAL(CurDAG, dl)); // Predicate
- Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
- return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+ getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
+ return;
}
case ARMISD::VTBL2: {
SDLoc dl(N);
@@ -3341,19 +3737,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue V1 = N->getOperand(1);
SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(RegSeq);
- Ops.push_back(N->getOperand(2));
- Ops.push_back(getAL(CurDAG, dl)); // Predicate
- Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
- return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops);
+ SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
+ CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
+ return;
}
case ISD::CONCAT_VECTORS:
- return SelectConcatVector(N);
+ SelectConcatVector(N);
+ return;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ SelectCMP_SWAP(N);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
// Inspect a register string of the form
@@ -3362,8 +3761,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
// and obtain the integer operands from them, adding these operands to the
// provided vector.
static void getIntOperandsFromRegisterString(StringRef RegString,
- SelectionDAG *CurDAG, SDLoc DL,
- std::vector<SDValue>& Ops) {
+ SelectionDAG *CurDAG,
+ const SDLoc &DL,
+ std::vector<SDValue> &Ops) {
SmallVector<StringRef, 5> Fields;
RegString.split(Fields, ':');
@@ -3444,6 +3844,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
.Case("basepri_max", 0x12)
.Case("faultmask", 0x13)
.Case("control", 0x14)
+ .Case("msplim", 0x0a)
+ .Case("psplim", 0x0b)
+ .Case("sp", 0x18)
.Default(-1);
}
@@ -3473,11 +3876,27 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13)
return -1;
+ if (Subtarget->has8MSecExt() && Flags.lower() == "ns") {
+ Flags = "";
+ SYSmvalue |= 0x80;
+ }
+
+ if (!Subtarget->has8MSecExt() &&
+ (SYSmvalue == 0xa || SYSmvalue == 0xb || SYSmvalue > 0x14))
+ return -1;
+
+ if (!Subtarget->hasV8MMainlineOps() &&
+ (SYSmvalue == 0x8a || SYSmvalue == 0x8b || SYSmvalue == 0x91 ||
+ SYSmvalue == 0x93))
+ return -1;
+
// If it was a read then we won't be expecting flags and so at this point
// we can return the mask.
if (IsRead) {
- assert (Flags.empty() && "Unexpected flags for reading M class register.");
- return SYSmvalue;
+ if (Flags.empty())
+ return SYSmvalue;
+ else
+ return -1;
}
// We know we are now handling a write so need to get the mask for the flags.
@@ -3563,7 +3982,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// Lower the read_register intrinsic to ARM specific DAG nodes
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to construct as operands for the node.
-SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
@@ -3592,7 +4011,8 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
Ops.push_back(getAL(CurDAG, DL));
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
Ops.push_back(N->getOperand(0));
- return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops));
+ return true;
}
std::string SpecialReg = RegString->getString().lower();
@@ -3602,8 +4022,10 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
- DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+ DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// The VFP registers are read by creating SelectionDAG nodes with opcodes
@@ -3623,27 +4045,37 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
// If an opcode was found then we can lower the read to a VFP instruction.
if (Opcode) {
if (!Subtarget->hasVFP2())
- return nullptr;
+ return false;
if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
- return nullptr;
+ return false;
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N,
+ CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// If the target is M Class then need to validate that the register string
// is an acceptable value, so check that a mask can be constructed from the
// string.
if (Subtarget->isMClass()) {
- int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget);
+ StringRef Flags = "", Reg = SpecialReg;
+ if (Reg.endswith("_ns")) {
+ Flags = "ns";
+ Reg = Reg.drop_back(3);
+ }
+
+ int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget);
if (SYSmValue == -1)
- return nullptr;
+ return false;
SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
// Here we know the target is not M Class so we need to check if it is one
@@ -3651,24 +4083,27 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL,
- MVT::i32, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS,
+ DL, MVT::i32, MVT::Other, Ops));
+ return true;
}
if (SpecialReg == "spsr") {
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys,
- DL, MVT::i32, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL,
+ MVT::i32, MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
// Lower the write_register intrinsic to ARM specific DAG nodes
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to use in the nodes
-SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
@@ -3698,7 +4133,8 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
Ops.push_back(N->getOperand(0));
- return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
}
std::string SpecialReg = RegString->getString().lower();
@@ -3707,8 +4143,10 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
- DL, MVT::Other, Ops);
+ ReplaceNode(
+ N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+ DL, MVT::Other, Ops));
+ return true;
}
// The VFP registers are written to by creating SelectionDAG nodes with
@@ -3724,16 +4162,17 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
if (Opcode) {
if (!Subtarget->hasVFP2())
- return nullptr;
+ return false;
Ops = { N->getOperand(2), getAL(CurDAG, DL),
CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
- return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
}
- SmallVector<StringRef, 5> Fields;
- StringRef(SpecialReg).split(Fields, '_', 1, false);
- std::string Reg = Fields[0].str();
- StringRef Flags = Fields.size() == 2 ? Fields[1] : "";
+ std::pair<StringRef, StringRef> Fields;
+ Fields = StringRef(SpecialReg).rsplit('_');
+ std::string Reg = Fields.first.str();
+ StringRef Flags = Fields.second;
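// Editorial aside, not part of this patch: rsplit splits on the *last*
// occurrence of the separator, so with illustrative register names
//   StringRef("basepri_ns").rsplit('_') yields {"basepri", "ns"}, while
//   StringRef("apsr").rsplit('_')       yields {"apsr", ""} and Flags stays empty.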
// If the target was M Class then we need to validate the special register value
// and retrieve the mask for use in the instruction node.
@@ -3745,12 +4184,13 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
}
int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
if (SYSmValue == -1)
- return nullptr;
+ return false;
SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
N->getOperand(2), getAL(CurDAG, DL),
CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
- return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops));
+ return true;
}
// We then check to see if a valid mask can be constructed for one of the
@@ -3761,14 +4201,15 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
N->getOperand(0) };
- return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
- DL, MVT::Other, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+ DL, MVT::Other, Ops));
+ return true;
}
- return nullptr;
+ return false;
}
-SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
std::vector<SDValue> AsmNodeOperands;
unsigned Flag, Kind;
bool Changed = false;
@@ -3823,6 +4264,17 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
IsTiedToChangedOp = OpChanged[DefIdx];
+ // Memory operands to inline asm in the SelectionDAG are modeled with two
+ // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+ // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+ // it doesn't get misinterpreted), and continue. We do this here because
+ // it's important to update the OpChanged array correctly before moving on.
+ if (Kind == InlineAsm::Kind_Mem) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
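// Editorial note, not part of this patch: for an inline-asm memory operand the
// operand stream looks roughly like
//   ..., <flag word with kind Kind_Mem>, <address SDValue>, ...
// so the address operand is forwarded verbatim here and the register
// classification code below never sees it.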
if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
&& Kind != InlineAsm::Kind_RegDefEarlyClobber)
continue;
@@ -3912,12 +4364,13 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (Glue.getNode())
AsmNodeOperands.push_back(Glue);
if (!Changed)
- return nullptr;
+ return false;
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
- return New.getNode();
+ ReplaceNode(N, New.getNode());
+ return true;
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 978e99cf511e..d6e7caf98a80 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -65,6 +65,13 @@ ARMInterworking("arm-interworking", cl::Hidden,
cl::desc("Enable / disable ARM interworking (for debugging only)"),
cl::init(true));
+// Disabled because it caused self-hosting failures once returned-attribute
+// inference was enabled.
+static cl::opt<bool>
+EnableThisRetForwarding("arm-this-return-forwarding", cl::Hidden,
+ cl::desc("Directly forward this return"),
+ cl::init(false));
+
namespace {
class ARMCCState : public CCState {
public:
@@ -240,7 +247,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Set the correct calling convention for ARMv7k WatchOS. It's just
// AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchOS()) {
+ if (Subtarget->isTargetWatchABI()) {
for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
}
@@ -254,7 +261,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// RTLIB
if (Subtarget->isAAPCS_ABI() &&
(Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
- Subtarget->isTargetAndroid())) {
+ Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
@@ -390,10 +397,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
- { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
};
for (const auto &LC : LibraryCalls) {
@@ -410,17 +413,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
- // The half <-> float conversion functions are always soft-float, but are
- // needed for some targets which use a hard-float calling convention by
- // default.
- if (Subtarget->isAAPCS_ABI()) {
- setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
- } else {
- setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
- setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
- setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+ // The half <-> float conversion functions are always soft-float on
+ // non-watchos platforms, but are needed for some targets which use a
+ // hard-float calling convention by default.
+ if (!Subtarget->isTargetWatchABI()) {
+ if (Subtarget->isAAPCS_ABI()) {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+ } else {
+ setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+ setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+ }
}
// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
@@ -581,6 +586,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// NEON does not have single instruction CTTZ for vectors.
setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
@@ -712,6 +722,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
}
+ } else {
+ // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
+ setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
}
setOperationAction(ISD::SADDO, MVT::i32, Custom);
@@ -758,10 +772,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
- // These just redirect to CTTZ and CTLZ on ARM.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
-
// @llvm.readcyclecounter requires the Performance Monitors extension.
// Default to the 0 expansion on unsupported platforms.
// FIXME: Technically there are older ARM CPUs that have
@@ -773,19 +783,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
- !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+ bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
+ : Subtarget->hasDivideInARMMode();
+ if (!hasDivide) {
// These are expanded into libcalls if the cpu doesn't have HW divider.
setOperationAction(ISD::SDIV, MVT::i32, LibCall);
setOperationAction(ISD::UDIV, MVT::i32, LibCall);
}
+ if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+ setOperationAction(ISD::SDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::i64, Custom);
+ setOperationAction(ISD::UDIV, MVT::i64, Custom);
+ }
+
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
// Register based DivRem for AEABI (RTABI 4.2)
- if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) {
+ if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) {
setOperationAction(ISD::SREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
+ HasStandaloneRem = false;
setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod");
setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
@@ -807,6 +828,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
} else {
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
@@ -833,21 +856,21 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
- // the default expansion. If we are targeting a single threaded system,
- // then set them all for expand so we can lower them later into their
- // non-atomic form.
- if (TM.Options.ThreadModel == ThreadModel::Single)
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
- else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+ // the default expansion.
+ InsertFencesForAtomic = false;
+ if (Subtarget->hasAnyDataBarrier() &&
+ (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
// ATOMIC_FENCE needs custom lowering; the others should have been expanded
// to ldrex/strex loops already.
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ if (!Subtarget->isThumb() || !Subtarget->isMClass())
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
// On v8, we have particularly efficient implementations of atomic fences
// if they can be combined with nearby atomic loads and stores.
- if (!Subtarget->hasV8Ops()) {
+ if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
- setInsertFencesForAtomic(true);
+ InsertFencesForAtomic = true;
}
} else {
// If there's anything we can use as a barrier, go through custom lowering
@@ -909,6 +932,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ // Thumb-1 cannot currently select ARMISD::SUBE.
+ if (!Subtarget->isThumb1Only())
+ setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
@@ -956,7 +983,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->isTargetWatchOS()) {
+ if (Subtarget->isTargetWatchABI()) {
setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
}
@@ -1039,7 +1066,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setMinStackArgumentAlignment(4);
// Prefer likely predicted branches to selects on out-of-order cores.
- PredictableSelectIsExpensive = Subtarget->isLikeA9();
+ PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -1106,7 +1133,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CALL: return "ARMISD::CALL";
case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
- case ARMISD::tCALL: return "ARMISD::tCALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
@@ -1123,6 +1149,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CMOV: return "ARMISD::CMOV";
+ case ARMISD::SSAT: return "ARMISD::SSAT";
+
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
@@ -1199,6 +1227,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
@@ -1373,7 +1402,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::ARM_APCS:
case CallingConv::GHC:
return CC;
+ case CallingConv::PreserveMost:
+ return CallingConv::PreserveMost;
case CallingConv::ARM_AAPCS_VFP:
+ case CallingConv::Swift:
return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
case CallingConv::C:
if (!Subtarget->isAAPCS_ABI())
@@ -1415,18 +1447,18 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
case CallingConv::GHC:
return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
+ case CallingConv::PreserveMost:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
}
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
-SDValue
-ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const {
+SDValue ARMTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1442,7 +1474,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
- if (i == 0 && isThisReturn) {
+ if (i == 0 && isThisReturn && EnableThisRetForwarding) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
@@ -1506,23 +1538,21 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
}
/// LowerMemOpCallTo - Store the argument to the stack.
-SDValue
-ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
- SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
-void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -1704,7 +1734,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
MachinePointerInfo(),
- false, false, false,
DAG.InferPtrAlignment(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
@@ -1780,20 +1809,27 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
bool isDirect = false;
- bool isARMFunc = false;
+
+ const TargetMachine &TM = getTargetMachine();
+ const Module *Mod = MF.getFunction()->getParent();
+ const GlobalValue *GV = nullptr;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ GV = G->getGlobal();
+ bool isStub =
+ !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+
+ bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
auto PtrVt = getPointerTy(DAG.getDataLayout());
if (Subtarget->genLongCalls()) {
- assert((Subtarget->isTargetWindows() ||
- getTargetMachine().getRelocationModel() == Reloc::Static) &&
- "long-calls with non-static relocation model!");
+ assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
+ "long-calls codegen is not position independent!");
// Handle a global address or an external symbol. If it's not one of
// those, the target's already in a register, so we don't need to do
// anything extra.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
+ if (isa<GlobalAddressSDNode>(Callee)) {
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
@@ -1804,8 +1840,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
@@ -1819,54 +1854,55 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
- }
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
- isDirect = true;
- bool isDef = GV->isStrongDefinitionForLinker();
- bool isStub = (!isDef && Subtarget->isTargetMachO()) &&
- getTargetMachine().getRelocationModel() != Reloc::Static;
- isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
- // ARM call to a local ARM function is predicable.
- isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
- // tBX takes a register source operand.
- if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
- assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
- Callee = DAG.getNode(
- ARMISD::WrapperPIC, dl, PtrVt,
- DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
- Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, true, 0);
- } else if (Subtarget->isTargetCOFF()) {
- assert(Subtarget->isTargetWindows() &&
- "Windows is the only supported COFF target");
- unsigned TargetFlags = GV->hasDLLImportStorageClass()
- ? ARMII::MO_DLLIMPORT
- : ARMII::MO_NO_FLAG;
- Callee =
- DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags);
- if (GV->hasDLLImportStorageClass())
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
+ } else if (isa<GlobalAddressSDNode>(Callee)) {
+ // If we're optimizing for minimum size and the function is called three or
+ // more times in this block, we can improve codesize by calling indirectly
+ // as BLXr has a 16-bit encoding.
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ auto *BB = CLI.CS->getParent();
+ bool PreferIndirect =
+ Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
+ std::count_if(GV->user_begin(), GV->user_end(), [&BB](const User *U) {
+ return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
+ }) > 2;
+
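// Editorial illustration, not from the patch: with minsize in Thumb mode, a
// block such as  f(x); f(y); f(z);  now prefers calling f through a register,
// since BLX (register) has a 16-bit encoding while a direct BL is 32-bit, and
// the callee address can be materialized once and reused across the calls.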
+ if (!PreferIndirect) {
+ isDirect = true;
+ bool isDef = GV->isStrongDefinitionForLinker();
+
+ // ARM call to a local ARM function is predicable.
+ isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
+ // tBX takes a register source operand.
+ if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+ Callee = DAG.getNode(
+ ARMISD::WrapperPIC, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee =
- DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
- DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
- } else {
- // On ELF targets for PIC code, direct calls should go through the PLT
- unsigned OpFlags = 0;
- if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags);
+ /* Alignment = */ 0, MachineMemOperand::MOInvariant);
+ } else if (Subtarget->isTargetCOFF()) {
+ assert(Subtarget->isTargetWindows() &&
+ "Windows is the only supported COFF target");
+ unsigned TargetFlags = GV->hasDLLImportStorageClass()
+ ? ARMII::MO_DLLIMPORT
+ : ARMII::MO_NO_FLAG;
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+ TargetFlags);
+ if (GV->hasDLLImportStorageClass())
+ Callee =
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
+ DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ } else {
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+ }
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
isDirect = true;
- bool isStub = Subtarget->isTargetMachO() &&
- getTargetMachine().getRelocationModel() != Reloc::Static;
- isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
@@ -1878,17 +1914,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
} else {
- unsigned OpFlags = 0;
- // On ELF targets for PIC code, direct calls should go through the PLT
- if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_)
- OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
}
}
@@ -1898,11 +1928,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
- CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+ CallOpc = ARMISD::CALL;
} else {
if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+ else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
// Emit regular call when code size is the priority
!MF.getFunction()->optForMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
@@ -2042,7 +2072,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Def)
return false;
if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(Def, FI))
+ if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
return false;
@@ -2082,9 +2112,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
- const Function *CallerF = DAG.getMachineFunction().getFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
- bool CCMatch = CallerCC == CalleeCC;
assert(Subtarget->supportsTailCall());
@@ -2122,41 +2152,25 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
- if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext(), Call);
- CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
-
- SmallVector<CCValAssign, 16> RVLocs2;
- ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext(), Call);
- CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
-
- if (RVLocs1.size() != RVLocs2.size())
+ // Check that the call results are passed in the same way.
+ LLVMContext &C = *DAG.getContext();
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ CCAssignFnForNode(CalleeCC, true, isVarArg),
+ CCAssignFnForNode(CallerCC, true, isVarArg)))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ if (CalleeCC != CallerCC) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
// If Caller's vararg or byval argument has been split between registers and
// stack, do not perform tail call, since part of the argument is in caller's
// local frame.
- const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
- getInfo<ARMFunctionInfo>();
+ const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
if (AFI_Caller->getArgRegsSaveSize())
return false;
@@ -2166,13 +2180,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Call);
+ ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
CCInfo.AnalyzeCallOperands(Outs,
CCAssignFnForNode(CalleeCC, false, isVarArg));
if (CCInfo.getNextStackOffset()) {
- MachineFunction &MF = DAG.getMachineFunction();
-
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2209,6 +2220,10 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
}
}
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
}
return true;
@@ -2226,7 +2241,7 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
}
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
- SDLoc DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG) {
const MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
@@ -2259,11 +2274,11 @@ static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
}
SDValue
-ARMTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -2521,9 +2536,9 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
SDLoc DL(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
- if (RelocM == Reloc::Static) {
+ bool IsPositionIndependent = isPositionIndependent();
+ if (!IsPositionIndependent) {
CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
@@ -2534,11 +2549,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
- SDValue Result =
- DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 0);
- if (RelocM == Reloc::Static)
+ SDValue Result = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ if (!IsPositionIndependent)
return Result;
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
@@ -2584,7 +2598,8 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
SDValue FuncTLVGet =
DAG.getLoad(MVT::i32, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, true, true, 4);
+ /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
MachineFunction &F = DAG.getMachineFunction();
@@ -2610,6 +2625,61 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+ SDValue Chain = DAG.getEntryNode();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+
+ // Load the current TEB (thread environment block)
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getConstant(15, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(13, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32)};
+ SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+
+ SDValue TEB = CurrentTEB.getValue(0);
+ Chain = CurrentTEB.getValue(1);
+
+ // Load the ThreadLocalStoragePointer from the TEB
+ // A pointer to the TLS array is located at offset 0x2c from the TEB.
+ SDValue TLSArray =
+ DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
+ TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+
+  // The pointer to the thread's TLS data area for this module lives in the
+  // TLSArray, at the offset given by the TLS index scaled by 4.
+
+ // Load the TLS index from the C runtime
+ SDValue TLSIndex =
+ DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
+ TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
+ TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
+
+ SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+ DAG.getConstant(2, DL, MVT::i32));
+ SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+ DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+ MachinePointerInfo());
+
+ // Get the offset of the start of the .tls section (section base)
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
+ SDValue Offset = DAG.getLoad(
+ PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+ DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
+}
+
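// Editorial sketch of the address computation above, assuming the usual
// Windows-on-ARM conventions (TEB read via MRC p15, 0, Rt, c13, c0, 2; TLS
// array at TEB+0x2c; module slot chosen by _tls_index; variable at a SECREL
// offset loaded from the constant pool):
//   char **TLSArray = *(char ***)(TEB + 0x2c);
//   char  *TLSBase  = TLSArray[_tls_index];
//   return TLSBase + SECREL(var);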
// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
@@ -2625,10 +2695,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
- Argument =
- DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 0);
+ Argument = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), Argument,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue Chain = Argument.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2645,8 +2714,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
- DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
- 0);
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2680,8 +2748,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
Chain = Offset.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2689,8 +2756,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
} else {
// local exec model
assert(model == TLSModel::LocalExec);
@@ -2700,8 +2766,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
@@ -2714,6 +2779,9 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerGlobalTLSAddressDarwin(Op, DAG);
+ if (Subtarget->isTargetWindows())
+ return LowerGlobalTLSAddressWindows(Op, DAG);
+
// TODO: implement the "local dynamic" model
assert(Subtarget->isTargetELF() && "Only ELF implemented here");
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
@@ -2738,9 +2806,9 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- bool UseGOT_PREL =
- !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+ const TargetMachine &TM = getTargetMachine();
+ if (isPositionIndependent()) {
+ bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2756,15 +2824,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue Chain = Result.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
if (UseGOT_PREL)
- Result = DAG.getLoad(PtrVT, dl, Chain, Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ Result =
+ DAG.getLoad(PtrVT, dl, Chain, Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2781,8 +2848,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
return DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
}
@@ -2791,7 +2857,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
if (Subtarget->useMovt(DAG.getMachineFunction()))
++NumMovwMovt;
@@ -2799,15 +2864,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into multiple nodes
unsigned Wrapper =
- RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
+ isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
- if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+ if (Subtarget->isGVIndirectSymbol(GV))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2833,8 +2897,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
TargetFlags));
if (GV->hasDLLImportStorageClass())
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -2873,7 +2936,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
"RBIT intrinsic must have i32 type!");
return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
}
- case Intrinsic::arm_thread_pointer: {
+ case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
@@ -2882,10 +2945,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
- unsigned PCAdj = (RelocM != Reloc::PIC_)
- ? 0 : (Subtarget->isThumb() ? 4 : 8);
+ bool IsPositionIndependent = isPositionIndependent();
+ unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
ARMCP::CPLSDA, PCAdj);
@@ -2893,10 +2955,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
@@ -2962,7 +3023,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
if (Subtarget->isMClass()) {
// Only a full system barrier exists in the M-class architectures.
Domain = ARM_MB::SY;
- } else if (Subtarget->isSwift() && Ord == Release) {
+ } else if (Subtarget->preferISHSTBarriers() &&
+ Ord == AtomicOrdering::Release) {
// Swift happens to implement ISHST barriers in a way that's compatible with
// Release semantics but weaker than ISH so we'd be fools not to use
// it. Beware: other processors probably don't!
@@ -3012,13 +3074,14 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
-SDValue
-ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
- SDValue &Root, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
+ CCValAssign &NextVA,
+ SDValue &Root,
+ SelectionDAG &DAG,
+ const SDLoc &dl) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -3041,8 +3104,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
ArgValue2 = DAG.getLoad(
MVT::i32, dl, Root, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
@@ -3060,13 +3122,11 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index registers were stored into.
-int
-ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- const Value *OrigArg,
- unsigned InRegsParamRecordIdx,
- int ArgOffset,
- unsigned ArgSize) const {
+int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain,
+ const Value *OrigArg,
+ unsigned InRegsParamRecordIdx,
+ int ArgOffset, unsigned ArgSize) const {
// Currently, two use-cases possible:
// Case #1. Non-var-args function, and we meet first byval parameter.
// Setup first unallocated register as first byval register;
@@ -3104,9 +3164,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
unsigned VReg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(OrigArg, 4 * i));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
}
@@ -3117,17 +3176,16 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
}
// Setup stack frame, the va_list pointer will start from.
-void
-ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- unsigned ArgOffset,
- unsigned TotalArgRegsSaveSize,
- bool ForceMutable) const {
+void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain,
+ unsigned ArgOffset,
+ unsigned TotalArgRegsSaveSize,
+ bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Try to store any remaining integer argument regs
- // to their spots on the stack so that they may be loaded by deferencing
+ // to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
// If there are no regs to be stored, just point the address after the last
// argument passed via stack.
@@ -3137,14 +3195,10 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
AFI->setVarArgsFrameIndex(FrameIndex);
}
-SDValue
-ARMTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue ARMTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -3226,10 +3280,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
if (VA.isMemLoc()) {
int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValue2 = DAG.getLoad(
- MVT::f64, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0);
+ ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI));
} else {
ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
Chain, DAG, dl);
@@ -3322,10 +3375,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(DAG.getLoad(
- VA.getValVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI)));
}
lastInsIndex = index;
}
@@ -3369,10 +3421,9 @@ static bool isFloatingPointZero(SDValue Op) {
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
-SDValue
-ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMcc, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &ARMcc, SelectionDAG &DAG,
+ const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
if (!isLegalICmpImmediate(C)) {
@@ -3428,9 +3479,8 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
-SDValue
-ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const SDLoc &dl) const {
assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
if (!isFloatingPointZero(RHS))
@@ -3647,7 +3697,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
}
}
-SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
+SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
SDValue TrueVal, SDValue ARMcc, SDValue CCR,
SDValue Cmp, SelectionDAG &DAG) const {
if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
@@ -3673,14 +3723,149 @@ SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
}
}
+static bool isGTorGE(ISD::CondCode CC) {
+ return CC == ISD::SETGT || CC == ISD::SETGE;
+}
+
+static bool isLTorLE(ISD::CondCode CC) {
+ return CC == ISD::SETLT || CC == ISD::SETLE;
+}
+
+// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
+// All of these conditions (and their <= and >= counterparts) will do:
+// x < k ? k : x
+// x > k ? x : k
+// k < x ? x : k
+// k > x ? k : x
+static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
+ const SDValue TrueVal, const SDValue FalseVal,
+ const ISD::CondCode CC, const SDValue K) {
+ return (isGTorGE(CC) &&
+ ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
+ (isLTorLE(CC) &&
+ ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
+}
+
+// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
+static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
+ const SDValue TrueVal, const SDValue FalseVal,
+ const ISD::CondCode CC, const SDValue K) {
+ return (isGTorGE(CC) &&
+ ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
+ (isLTorLE(CC) &&
+ ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
+}
+
+// Check if two chained conditionals could be converted into SSAT.
+//
+// SSAT can replace a set of two conditional selectors that bound a number to an
+// interval of the form [~k, k] when k + 1 is a power of 2. Here are some examples:
+//
+// x < -k ? -k : (x > k ? k : x)
+// x < -k ? -k : (x < k ? x : k)
+// x > -k ? (x > k ? k : x) : -k
+// x < k ? (x < -k ? -k : x) : k
+// etc.
+//
+// It returns true if the conversion can be done, false otherwise.
+// Additionally, the variable is returned in parameter V and the constant in K.
+static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
+ uint64_t &K) {
+
+ SDValue LHS1 = Op.getOperand(0);
+ SDValue RHS1 = Op.getOperand(1);
+ SDValue TrueVal1 = Op.getOperand(2);
+ SDValue FalseVal1 = Op.getOperand(3);
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
+ if (Op2.getOpcode() != ISD::SELECT_CC)
+ return false;
+
+ SDValue LHS2 = Op2.getOperand(0);
+ SDValue RHS2 = Op2.getOperand(1);
+ SDValue TrueVal2 = Op2.getOperand(2);
+ SDValue FalseVal2 = Op2.getOperand(3);
+ ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
+
+ // Find out which are the constants and which are the variables
+ // in each conditional
+ SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
+ ? &RHS1
+ : NULL;
+ SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
+ ? &RHS2
+ : NULL;
+ SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
+ SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
+ SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
+ SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
+
+ // We must detect cases where the original operations worked with 16- or
+  // 8-bit values. In such a case, V2Tmp != V2 because the comparison operations
+ // must work with sign-extended values but the select operations return
+ // the original non-extended value.
+ SDValue V2TmpReg = V2Tmp;
+ if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
+ V2TmpReg = V2Tmp->getOperand(0);
+
+ // Check that the registers and the constants have the correct values
+ // in both conditionals
+ if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
+ V2TmpReg != V2)
+ return false;
+
+ // Figure out which conditional is saturating the lower/upper bound.
+ const SDValue *LowerCheckOp =
+ isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+ ? &Op
+ : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+ : NULL;
+ const SDValue *UpperCheckOp =
+ isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+ ? &Op
+ : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+ : NULL;
+
+ if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
+ return false;
+
+ // Check that the constant in the lower-bound check is
+ // the opposite of the constant in the upper-bound check
+ // in 1's complement.
+ int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
+ int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
+ int64_t PosVal = std::max(Val1, Val2);
+
+ if (((Val1 > Val2 && UpperCheckOp == &Op) ||
+ (Val1 < Val2 && UpperCheckOp == &Op2)) &&
+ Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+
+ V = V2;
+ K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+ return true;
+ }
+
+ return false;
+}
+
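// Editorial example, not from the patch: with k = 127 (so ~k = -128 and
// k + 1 = 128 is a power of two), a select_cc chain equivalent to
//   x > 127 ? 127 : (x < -128 ? -128 : x)
// is recognized by isSaturatingConditional and lowered to a single SSAT
// saturating to the signed 8-bit range [-128, 127] instead of two conditional
// moves.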
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+
EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+
+ // Try to convert two saturating conditional selects into a single SSAT
+ SDValue SatValue;
+ uint64_t SatConstant;
+ if (isSaturatingConditional(Op, SatValue, SatConstant))
+ return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+ DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- SDLoc dl(Op);
if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
@@ -3781,10 +3966,9 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
return DAG.getConstant(0, SDLoc(Op), MVT::i32);
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
- return DAG.getLoad(MVT::i32, SDLoc(Op),
- Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
llvm_unreachable("Unknown VFP cmp argument!");
}
@@ -3801,21 +3985,17 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
SDValue Ptr = Ld->getBasePtr();
- RetVal1 = DAG.getLoad(MVT::i32, dl,
- Ld->getChain(), Ptr,
- Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ RetVal1 =
+ DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ Ld->getAlignment(), Ld->getMemOperand()->getFlags());
EVT PtrType = Ptr.getValueType();
unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
- RetVal2 = DAG.getLoad(MVT::i32, dl,
- Ld->getChain(), NewPtr,
- Ld->getPointerInfo().getWithOffset(4),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), NewAlign);
+ RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
+ Ld->getPointerInfo().getWithOffset(4), NewAlign,
+ Ld->getMemOperand()->getFlags());
return;
}
@@ -3908,8 +4088,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
- SDValue Result = OptimizeVFPBrcond(Op, DAG);
- if (Result.getNode())
+ if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
return Result;
}
@@ -3950,19 +4129,17 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
Addr, Op.getOperand(2), JTI);
}
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ if (isPositionIndependent()) {
Addr =
DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
Chain = Addr.getValue(1);
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
} else {
Addr =
DAG.getLoad(PTy, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
Chain = Addr.getValue(1);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
}
@@ -4156,7 +4333,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
return DAG.getLoad(VT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -4178,8 +4355,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -4322,7 +4498,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed. Regardless, use a canonical VMOV to create the
/// zero vector.
-static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(VT.isVector() && "Expected a vector type");
// The canonical modified immediate encoding of a zero vector is....0!
SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
@@ -4826,12 +5002,36 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
+static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+
+ assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+ SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
+
+ SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
+ SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
+ SDValue ARMcc = DAG.getConstant(
+ IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
+ Cmp.getValue(1), SDValue());
+ return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
+ CCR, Chain.getValue(1));
+}
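+
+// Editor's note (illustrative, not from the original patch): SETCCE is the
+// carry-consuming variant of SETCC used when wide compares are expanded;
+// roughly, it evaluates Cond on (LHS - RHS - Carry). The lowering above
+// materializes that as an ARMISD::SUBE whose flags feed a CMOV that selects
+// 1 or 0.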
+
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON instruction with a "modified immediate"
/// operand (e.g., VMOV). If so, return the encoded value.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
- SDLoc dl, EVT &VT, bool is128Bits,
+ const SDLoc &dl, EVT &VT, bool is128Bits,
NEONModImmType type) {
unsigned OpCmode, Imm;
@@ -4979,7 +5179,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
return SDValue();
// Try splatting with a VMOV.f32...
- APFloat FPVal = CFP->getValueAPF();
+ const APFloat &FPVal = CFP->getValueAPF();
int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
if (ImmVal != -1) {
@@ -5421,7 +5621,7 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
- const ARMSubtarget *ST, SDLoc dl) {
+ const ARMSubtarget *ST, const SDLoc &dl) {
uint64_t Val;
if (!isa<ConstantSDNode>(N))
return SDValue();
@@ -5502,7 +5702,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
@@ -5585,7 +5785,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+ SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -5635,7 +5835,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDValue Vec = DAG.getUNDEF(VT);
for (unsigned i = 0 ; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
@@ -5681,7 +5881,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
// A shuffle can only come from building a vector from various
@@ -5808,7 +6008,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF)
+ if (Entry.isUndef())
continue;
auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
@@ -5845,7 +6045,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], &Mask[0]);
+ ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -5895,7 +6095,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
@@ -5982,12 +6182,12 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
- if (V2.getNode()->getOpcode() == ISD::UNDEF)
+ if (V2.getNode()->isUndef())
return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+ DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
+ DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
@@ -6024,7 +6224,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (EltSize <= 32) {
- if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+ if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1) Lane = 0;
@@ -6040,7 +6240,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
!isa<ConstantSDNode>(V1.getOperand(0))) {
bool IsScalarToVector = true;
for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
- if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+ if (!V1.getOperand(i).isUndef()) {
IsScalarToVector = false;
break;
}
@@ -6067,8 +6267,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
- if (V2->getOpcode() == ISD::UNDEF &&
- isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
@@ -6103,8 +6302,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// ->
// concat(VZIP(v1, v2):0, :1)
//
- if (V1->getOpcode() == ISD::CONCAT_VECTORS &&
- V2->getOpcode() == ISD::UNDEF) {
+ if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
@@ -6175,11 +6373,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
- if (VT == MVT::v8i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
- if (NewOp.getNode())
+ if (VT == MVT::v8i8)
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
- }
return SDValue();
}
@@ -6218,11 +6414,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue Val = DAG.getUNDEF(MVT::v2f64);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (Op0.getOpcode() != ISD::UNDEF)
+ if (!Op0.isUndef())
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
DAG.getIntPtrConstant(0, dl));
- if (Op1.getOpcode() != ISD::UNDEF)
+ if (!Op1.isUndef())
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
DAG.getIntPtrConstant(1, dl));
@@ -6351,17 +6547,16 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
// The load already has the right type.
if (ExtendedTy == LD->getMemoryVT())
return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
- LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getAlignment(), LD->getMemOperand()->getFlags());
// We need to create a zextload/sextload. We cannot just create a load
// followed by a zext/zext node because LowerMUL is also run during normal
// operation legalization where we can't create illegal types.
return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
- LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
- LD->isNonTemporal(), LD->getAlignment());
+ LD->getMemoryVT(), LD->getAlignment(),
+ LD->getMemOperand()->getFlags());
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
@@ -6387,8 +6582,9 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
- BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+ return DAG.getBuildVector(
+ MVT::v2i32, SDLoc(N),
+ {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
}
// Construct a new BUILD_VECTOR with elements truncated to half the size.
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
@@ -6405,8 +6601,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::getVectorVT(TruncVT, NumElts), Ops);
+ return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -6506,8 +6701,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
-static SDValue
-LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
+ SelectionDAG &DAG) {
// TODO: Should this propagate fast-math-flags?
// Convert to float
@@ -6528,8 +6723,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 0xb000);
X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
- Y = DAG.getConstant(0xb000, dl, MVT::i32);
- Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+ Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
// Convert back to short.
@@ -6538,8 +6732,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
return X;
}
-static SDValue
-LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
+ SelectionDAG &DAG) {
// TODO: Should this propagate fast-math-flags?
SDValue N2;
@@ -6567,8 +6761,7 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 0x89);
N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
- N1 = DAG.getConstant(0x89, dl, MVT::i32);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
// Convert back to integer and return.
@@ -6679,8 +6872,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
// float4 result = as_float4(as_int4(xf*recip) + 2);
N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
- N1 = DAG.getConstant(2, dl, MVT::i32);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+ N1 = DAG.getConstant(2, dl, MVT::v4i32);
N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
// Convert back to integer and return.
@@ -6766,21 +6958,21 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
- .setCallee(CC, RetTy, Callee, std::move(Args), 0)
+ .setCallee(CC, RetTy, Callee, std::move(Args))
.setDiscardResult(ShouldUseSRet);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
if (!ShouldUseSRet)
return CallResult.first;
- SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue LoadSin =
+ DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
// Address of cos field.
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
- SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue LoadCos =
+ DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
@@ -6819,7 +7011,7 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
CLI.setDebugLoc(dl)
.setChain(Chain)
.setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
- ES, std::move(Args), 0);
+ ES, std::move(Args));
return LowerCallTo(CLI).first;
}
@@ -6867,13 +7059,13 @@ void ARMTargetLowering::ExpandDIV_Windows(
}
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
- // Monotonic load/store is legal for all targets
- if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
- return Op;
+ if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+ // Acquire/Release load/store is not legal for targets without a dmb or
+ // equivalent available.
+ return SDValue();
- // Acquire/Release load/store is not legal for targets without a
- // dmb or equivalent available.
- return SDValue();
+ // Monotonic load/store is legal for all targets.
+ return Op;
}
static void ReplaceREADCYCLECOUNTER(SDNode *N,
@@ -6899,6 +7091,46 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
Results.push_back(Cycles32.getValue(1));
}
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+ SDLoc dl(V.getNode());
+ SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
+ SDValue VHi = DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
+ dl, MVT::i32);
+ SDValue RegClass =
+ DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
+ SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+ SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+ const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ return SDValue(
+ DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
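+
+// Note added for exposition: the REG_SEQUENCE above packs the low 32 bits of
+// the i64 value into the gsub_0 subregister and the high 32 bits into gsub_1
+// of a GPRPair, the paired-register form consumed by CMP_SWAP_64 below.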
+
+static void ReplaceCMP_SWAP_64Results(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ assert(N->getValueType(0) == MVT::i64 &&
+ "AtomicCmpSwap on types less than 64 should be legal");
+ SDValue Ops[] = {N->getOperand(1),
+ createGPRPairNode(DAG, N->getOperand(2)),
+ createGPRPairNode(DAG, N->getOperand(3)),
+ N->getOperand(0)};
+ SDNode *CmpSwap = DAG.getMachineNode(
+ ARM::CMP_SWAP_64, SDLoc(N),
+ DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+ Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
+ SDValue(CmpSwap, 0)));
+ Results.push_back(SDValue(CmpSwap, 2));
+}
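+
+// Editor's sketch (assumption, not stated in this patch): the CMP_SWAP_64
+// pseudo is presumably expanded after isel into an exclusive load/store
+// (LDREXD/STREXD) retry loop; here we only rebuild the two i32 result halves
+// from the GPRPair via gsub_0/gsub_1 and forward the chain.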
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
@@ -6948,6 +7180,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SETCCE: return LowerSETCCE(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -6956,8 +7189,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::UDIV: return LowerUDIV(Op, DAG);
+ case ISD::SDIV:
+ if (Subtarget->isTargetWindows())
+ return LowerDIV_Windows(Op, DAG, /* Signed */ true);
+ return LowerSDIV(Op, DAG);
+ case ISD::UDIV:
+ if (Subtarget->isTargetWindows())
+ return LowerDIV_Windows(Op, DAG, /* Signed */ false);
+ return LowerUDIV(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
@@ -7005,6 +7244,13 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::UREM:
Res = LowerREM(N, DAG);
break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ Res = LowerDivRem(SDValue(N, 0), DAG);
+ assert(Res.getNumOperands() == 2 && "DivRem needs two values");
+ Results.push_back(Res.getValue(0));
+ Results.push_back(Res.getValue(1));
+ return;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
@@ -7013,6 +7259,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
Results);
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceCMP_SWAP_64Results(N, Results, DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -7024,11 +7273,12 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
-void ARMTargetLowering::
-SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
- MachineBasicBlock *DispatchBB, int FI) const {
+void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineConstantPool *MCP = MF->getConstantPool();
@@ -7139,10 +7389,10 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
}
}
-void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
+void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineFrameInfo *MFI = MF->getFrameInfo();
@@ -7182,7 +7432,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock*> LPadList;
- SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+ SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned I = 1; I <= MaxCSNum; ++I) {
SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
@@ -7200,7 +7450,6 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
MachineJumpTableInfo *JTI =
MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- Reloc::Model RelocM = getTargetMachine().getRelocationModel();
// Create the MBBs for the dispatch code.
@@ -7244,6 +7493,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// registers being marked as clobbered.
MIB.addRegMask(RI.getNoPreservedMask());
+ bool IsPositionIndependent = isPositionIndependent();
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
@@ -7357,7 +7607,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
.addMemOperand(JTMMOLd));
unsigned NewVReg6 = NewVReg5;
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
NewVReg6 = MRI->createVirtualRegister(TRC);
AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
.addReg(ARM::CPSR, RegState::Define)
@@ -7440,7 +7690,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
.addImm(0)
.addMemOperand(JTMMOLd));
- if (RelocM == Reloc::PIC_) {
+ if (IsPositionIndependent) {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
.addReg(NewVReg5, RegState::Kill)
.addReg(NewVReg4)
@@ -7524,7 +7774,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
(*I)->setIsEHPad(false);
// The instruction is gone now.
- MI->eraseFromParent();
+ MI.eraseFromParent();
}
static
@@ -7576,8 +7826,8 @@ static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
-static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
- const TargetInstrInfo *TII, DebugLoc dl,
+static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo *TII, const DebugLoc &dl,
unsigned LdSize, unsigned Data, unsigned AddrIn,
unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
@@ -7608,8 +7858,8 @@ static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
-static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
- const TargetInstrInfo *TII, DebugLoc dl,
+static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+ const TargetInstrInfo *TII, const DebugLoc &dl,
unsigned StSize, unsigned Data, unsigned AddrIn,
unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
@@ -7637,7 +7887,7 @@ static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
}
MachineBasicBlock *
-ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+ARMTargetLowering::EmitStructByval(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This pseudo instruction has 3 operands: dst, src, size
// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
@@ -7646,11 +7896,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI->getOperand(0).getReg();
- unsigned src = MI->getOperand(1).getReg();
- unsigned SizeVal = MI->getOperand(2).getImm();
- unsigned Align = MI->getOperand(3).getImm();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned src = MI.getOperand(1).getReg();
+ unsigned SizeVal = MI.getOperand(2).getImm();
+ unsigned Align = MI.getOperand(3).getImm();
+ DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -7722,7 +7972,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
srcIn = srcOut;
destIn = destOut;
}
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
@@ -7848,7 +8098,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
// Add epilogue to handle BytesLeft.
BB = exitMBB;
- MachineInstr *StartOfExit = exitMBB->begin();
+ auto StartOfExit = exitMBB->begin();
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
// [destOut] = STRB_POST(scratch, destLoop, 1)
@@ -7866,16 +8116,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
destIn = destOut;
}
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const TargetMachine &TM = getTargetMachine();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
assert(Subtarget->isTargetWindows() &&
"__chkstk is only supported on Windows");
@@ -7930,24 +8180,26 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
ARM::SP)
- .addReg(ARM::SP).addReg(ARM::R4)));
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)));
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
- MF->push_back(ContBB);
+ MF->insert(++MBB->getIterator(), ContBB);
ContBB->splice(ContBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- MBB->addSuccessor(ContBB);
+ ContBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
MF->push_back(TrapBB);
@@ -7955,74 +8207,89 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
MBB->addSuccessor(TrapBB);
BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
- .addReg(MI->getOperand(0).getReg())
+ .addReg(MI.getOperand(0).getReg())
.addMBB(TrapBB);
+ AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB));
+ MBB->addSuccessor(ContBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return ContBB;
}
MachineBasicBlock *
-ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: {
- MI->dump();
+ MI.dump();
llvm_unreachable("Unexpected instr type to insert");
}
+
+ // Thumb1 post-indexed loads are really just single-register LDMs.
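+ // For example (illustrative only): a post-indexed "ldr r0, [r1], #4" can be
+ // emitted as the equivalent "ldm r1!, {r0}", which is what the tLDMIA_UPD
+ // built below encodes.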
+ case ARM::tLDR_postidx: {
+ BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
+ .addOperand(MI.getOperand(1)) // Rn_wb
+ .addOperand(MI.getOperand(2)) // Rn
+ .addOperand(MI.getOperand(3)) // PredImm
+ .addOperand(MI.getOperand(4)) // PredReg
+ .addOperand(MI.getOperand(0)); // Rt
+ MI.eraseFromParent();
+ return BB;
+ }
+
// The Thumb2 pre-indexed stores have the same MI operands, they just
// define them differently in the .td files from the isel patterns, so
// they need pseudos.
case ARM::t2STR_preidx:
- MI->setDesc(TII->get(ARM::t2STR_PRE));
+ MI.setDesc(TII->get(ARM::t2STR_PRE));
return BB;
case ARM::t2STRB_preidx:
- MI->setDesc(TII->get(ARM::t2STRB_PRE));
+ MI.setDesc(TII->get(ARM::t2STRB_PRE));
return BB;
case ARM::t2STRH_preidx:
- MI->setDesc(TII->get(ARM::t2STRH_PRE));
+ MI.setDesc(TII->get(ARM::t2STRH_PRE));
return BB;
case ARM::STRi_preidx:
case ARM::STRBi_preidx: {
- unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
- ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+ unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
+ : ARM::STRB_PRE_IMM;
// Decode the offset.
- unsigned Offset = MI->getOperand(4).getImm();
+ unsigned Offset = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
Offset = ARM_AM::getAM2Offset(Offset);
if (isSub)
Offset = -Offset;
- MachineMemOperand *MMO = *MI->memoperands_begin();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
BuildMI(*BB, MI, dl, TII->get(NewOpc))
- .addOperand(MI->getOperand(0)) // Rn_wb
- .addOperand(MI->getOperand(1)) // Rt
- .addOperand(MI->getOperand(2)) // Rn
- .addImm(Offset) // offset (skip GPR==zero_reg)
- .addOperand(MI->getOperand(5)) // pred
- .addOperand(MI->getOperand(6))
- .addMemOperand(MMO);
- MI->eraseFromParent();
+ .addOperand(MI.getOperand(0)) // Rn_wb
+ .addOperand(MI.getOperand(1)) // Rt
+ .addOperand(MI.getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .addOperand(MI.getOperand(5)) // pred
+ .addOperand(MI.getOperand(6))
+ .addMemOperand(MMO);
+ MI.eraseFromParent();
return BB;
}
case ARM::STRr_preidx:
case ARM::STRBr_preidx:
case ARM::STRH_preidx: {
unsigned NewOpc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected opcode!");
case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
- for (unsigned i = 0; i < MI->getNumOperands(); ++i)
- MIB.addOperand(MI->getOperand(i));
- MI->eraseFromParent();
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i)
+ MIB.addOperand(MI.getOperand(i));
+ MI.eraseFromParent();
return BB;
}
@@ -8055,8 +8322,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
- .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+ BuildMI(BB, dl, TII->get(ARM::tBcc))
+ .addMBB(sinkMBB)
+ .addImm(MI.getOperand(3).getImm())
+ .addReg(MI.getOperand(4).getReg());
// copy0MBB:
// %FalseValue = ...
@@ -8070,12 +8339,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), dl,
- TII->get(ARM::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -8086,10 +8356,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Compare both parts that make up the double comparison separately for
// equality.
- bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
+ bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
- unsigned LHS1 = MI->getOperand(1).getReg();
- unsigned LHS2 = MI->getOperand(2).getReg();
+ unsigned LHS1 = MI.getOperand(1).getReg();
+ unsigned LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
AddDefaultPred(BuildMI(BB, dl,
TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
@@ -8098,8 +8368,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
- unsigned RHS1 = MI->getOperand(3).getReg();
- unsigned RHS2 = MI->getOperand(4).getReg();
+ unsigned RHS1 = MI.getOperand(3).getReg();
+ unsigned RHS2 = MI.getOperand(4).getReg();
AddDefaultPred(BuildMI(BB, dl,
TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS1).addReg(RHS1));
@@ -8108,9 +8378,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
}
- MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
+ MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
- if (MI->getOperand(0).getImm() == ARMCC::NE)
+ if (MI.getOperand(0).getImm() == ARMCC::NE)
std::swap(destMBB, exitMBB);
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -8120,7 +8390,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
else
BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -8157,9 +8427,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
Fn->insert(BBI, RSBBB);
Fn->insert(BBI, SinkBB);
- unsigned int ABSSrcReg = MI->getOperand(1).getReg();
- unsigned int ABSDstReg = MI->getOperand(0).getReg();
- bool ABSSrcKIll = MI->getOperand(1).isKill();
+ unsigned int ABSSrcReg = MI.getOperand(1).getReg();
+ unsigned int ABSDstReg = MI.getOperand(0).getReg();
+ bool ABSSrcKIll = MI.getOperand(1).isKill();
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
@@ -8204,7 +8474,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(ABSSrcReg).addMBB(BB);
// remove ABS instruction
- MI->eraseFromParent();
+ MI.eraseFromParent();
// return last added BB
return SinkBB;
@@ -8223,38 +8493,38 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
- MachineInstr *MI, const SDNode *Node) {
+ MachineInstr &MI, const SDNode *Node) {
bool isThumb1 = Subtarget->isThumb1Only();
- DebugLoc DL = MI->getDebugLoc();
- MachineFunction *MF = MI->getParent()->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB(*MF, MI);
// If the new dst/src is unused mark it as dead.
if (!Node->hasAnyUseOfValue(0)) {
- MI->getOperand(0).setIsDead(true);
+ MI.getOperand(0).setIsDead(true);
}
if (!Node->hasAnyUseOfValue(1)) {
- MI->getOperand(1).setIsDead(true);
+ MI.getOperand(1).setIsDead(true);
}
// The MEMCPY both defines and kills the scratch registers.
- for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
+ for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass);
MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
}
}
-void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- if (MI->getOpcode() == ARM::MEMCPY) {
+ if (MI.getOpcode() == ARM::MEMCPY) {
attachMEMCPYScratchRegs(Subtarget, MI, Node);
return;
}
- const MCInstrDesc *MCID = &MI->getDesc();
+ const MCInstrDesc *MCID = &MI.getDesc();
// Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
// RSC. Coming out of isel, they have an implicit CPSR def, but the optional
// operand is still set to noreg. If needed, set the optional operand's
@@ -8263,24 +8533,24 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
// Rename pseudo opcodes.
- unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+ unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
if (NewOpc) {
const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
MCID = &TII->get(NewOpc);
- assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
+ assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
"converted opcode should be the same except for cc_out");
- MI->setDesc(*MCID);
+ MI.setDesc(*MCID);
// Add the optional cc_out operand
- MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+ MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
}
unsigned ccOutIdx = MCID->getNumOperands() - 1;
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
- if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+ if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
assert(!NewOpc && "Optional cc_out operand required");
return;
}
@@ -8288,14 +8558,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// since we already have an optional CPSR def.
bool definesCPSR = false;
bool deadCPSR = false;
- for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
- i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
+ ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
definesCPSR = true;
if (MO.isDead())
deadCPSR = true;
- MI->RemoveOperand(i);
+ MI.RemoveOperand(i);
break;
}
}
@@ -8305,14 +8575,14 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
}
assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
if (deadCPSR) {
- assert(!MI->getOperand(ccOutIdx).getReg() &&
+ assert(!MI.getOperand(ccOutIdx).getReg() &&
"expect uninitialized optional cc_out operand");
return;
}
// If this instruction was defined with an optional CPSR def and its dag node
// had a live implicit CPSR def, then activate the optional CPSR def.
- MachineOperand &MO = MI->getOperand(ccOutIdx);
+ MachineOperand &MO = MI.getOperand(ccOutIdx);
MO.setReg(ARM::CPSR);
MO.setIsDef(true);
}
@@ -8442,16 +8712,12 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N0.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
- if (Result.getNode())
+ if (N0.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
return Result;
- }
- if (N1.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
- if (Result.getNode())
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
return Result;
- }
return SDValue();
}
@@ -8533,7 +8799,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
// Get widened type and narrowed type.
MVT widenType;
unsigned numElem = VT.getVectorNumElements();
-
+
EVT inputLaneType = Vec.getValueType().getVectorElementType();
switch (inputLaneType.getSimpleVT().SimpleTy) {
case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
@@ -8559,11 +8825,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- if (Subtarget->isThumb1Only()) return SDValue();
-
- // Only perform the checks after legalize when the pattern is available.
- if (DCI.isBeforeLegalize()) return SDValue();
-
// Look for multiply add opportunities.
// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
// each add nodes consumes a value from ISD::UMUL_LOHI and there is
@@ -8691,14 +8952,97 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
return resNode;
}
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // UMAAL is similar to UMLAL except that it adds two unsigned values.
+ // While trying to combine for the other MLAL nodes, first search for the
+ // chance to use UMAAL. Check whether Addc uses another addc node that can
+ // first be combined into a UMLAL. The other pattern, in which AddcNode is
+ // combined into a UMLAL and then consumed by another addc, is handled in
+ // ISelDAGToDAG.
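+ // For reference (editor's note): UMAAL RdLo, RdHi, Rn, Rm computes the
+ // 64-bit value RdHi:RdLo = Rn * Rm + RdLo + RdHi, i.e. an unsigned multiply
+ // that accumulates two additional 32-bit addends.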
+
+ if (!Subtarget->hasV6Ops())
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ SDNode *PrevAddc = nullptr;
+ if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
+ PrevAddc = AddcNode->getOperand(0).getNode();
+ else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
+ PrevAddc = AddcNode->getOperand(1).getNode();
+
+ // If there is no addc chain, just fall back to searching for any MLAL.
+ if (PrevAddc == nullptr)
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ // Try to convert the addc operand to an MLAL and, if that fails, try to
+ // combine AddcNode itself.
+ SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
+ if (MLAL != SDValue(PrevAddc, 0))
+ return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+ // Find the converted UMAAL or quit if it doesn't exist.
+ SDNode *UmlalNode = nullptr;
+ SDValue AddHi;
+ if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
+ UmlalNode = AddcNode->getOperand(0).getNode();
+ AddHi = AddcNode->getOperand(1);
+ } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
+ UmlalNode = AddcNode->getOperand(1).getNode();
+ AddHi = AddcNode->getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+ // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
+ // the ADDC as well as Zero.
+ auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
+
+ if (!Zero || Zero->getZExtValue() != 0)
+ return SDValue();
+
+ // Check that we have a glued ADDC node.
+ if (AddcNode->getValueType(1) != MVT::Glue)
+ return SDValue();
+
+ // Look for the glued ADDE.
+ SDNode *AddeNode = AddcNode->getGluedUser();
+ if (!AddeNode)
+ return SDValue();
+
+ if ((AddeNode->getOperand(0).getNode() == Zero &&
+ AddeNode->getOperand(1).getNode() == UmlalNode) ||
+ (AddeNode->getOperand(0).getNode() == UmlalNode &&
+ AddeNode->getOperand(1).getNode() == Zero)) {
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
+ UmlalNode->getOperand(2), AddHi };
+ SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
+ DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+ // Replace the uses of the ADD nodes with the UMAAL node's values.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
+
+ // Return original node to notify the driver to stop replacing.
+ return SDValue(AddcNode, 0);
+ }
+ return SDValue();
+}
+
/// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ISD::ADDC, ISD::ADDE, and ARMISD::UMLAL to ARMISD::UMAAL.
static SDValue PerformADDCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+ if (Subtarget->isThumb1Only()) return SDValue();
+ // Only perform the checks after legalize when the pattern is available.
+ if (DCI.isBeforeLegalize()) return SDValue();
+
+ return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
@@ -8710,15 +9054,13 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
const ARMSubtarget *Subtarget){
// Attempt to create vpaddl for this add.
- SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
- if (Result.getNode())
+ if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
return Result;
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
- if (N0.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
- if (Result.getNode()) return Result;
- }
+ if (N0.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
+ return Result;
return SDValue();
}
@@ -8731,8 +9073,7 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// First try with the default operand order.
- SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
- if (Result.getNode())
+ if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
// If that didn't work, try again with the operands commuted.
@@ -8747,10 +9088,9 @@ static SDValue PerformSUBCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
- if (N1.getNode()->hasOneUse()) {
- SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
- if (Result.getNode()) return Result;
- }
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
+ return Result;
return SDValue();
}
@@ -8920,8 +9260,7 @@ static SDValue PerformANDCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
return Result;
}
@@ -8963,8 +9302,7 @@ static SDValue PerformORCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
}
@@ -9137,8 +9475,7 @@ static SDValue PerformXORCombine(SDNode *N,
if (!Subtarget->isThumb1Only()) {
// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
- SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
- if (Result.getNode())
+ if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
}
@@ -9300,17 +9637,15 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(LD);
SDValue BasePtr = LD->getBasePtr();
- SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
- LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ SDValue NewLD1 =
+ DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
+ LD->getAlignment(), LD->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
- SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
- LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- std::min(4U, LD->getAlignment() / 2));
+ SDValue NewLD2 = DAG.getLoad(
+ MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
+ std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
if (DCI.DAG.getDataLayout().isBigEndian())
@@ -9364,11 +9699,9 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
// into a pair of GPRs, which is fine when the value is used as a scalar,
// but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
SelectionDAG &DAG = DCI.DAG;
- if (N->getNumOperands() == 2) {
- SDValue RV = PerformVMOVDRRCombine(N, DAG);
- if (RV.getNode())
+ if (N->getNumOperands() == 2)
+ if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
return RV;
- }
// Load i64 elements as f64 values so that type legalization does not split
// them up into i32 values.
@@ -9385,7 +9718,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
DCI.AddToWorklist(V.getNode());
}
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
+ SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
@@ -9434,7 +9767,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// Assume only bit cast to i32 will go away.
if (Elt->getOperand(0).getValueType() == MVT::i32)
++NumOfBitCastedElts;
- } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+ } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
// Constants are statically casted, thus do not count them as
// relevant operands.
--NumOfRelevantElts;
@@ -9461,7 +9794,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
SDValue V = N->getOperand(Idx);
- if (V.getOpcode() == ISD::UNDEF)
+ if (V.isUndef())
continue;
if (V.getOpcode() == ISD::BITCAST &&
V->getOperand(0).getValueType() == MVT::i32)
@@ -9529,8 +9862,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue Concat0Op1 = Op0.getOperand(1);
SDValue Concat1Op1 = Op1.getOperand(1);
- if (Concat0Op1.getOpcode() != ISD::UNDEF ||
- Concat1Op1.getOpcode() != ISD::UNDEF)
+ if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
return SDValue();
// Skip the transformation if any of the types are illegal.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -9557,7 +9889,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
NewMask.push_back(NewElt);
}
return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
- DAG.getUNDEF(VT), NewMask.data());
+ DAG.getUNDEF(VT), NewMask);
}
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
@@ -9953,7 +10285,7 @@ static SDValue PerformSTORECombine(SDNode *N,
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec.data());
+ ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -9984,8 +10316,8 @@ static SDValue PerformSTORECombine(SDNode *N,
StoreType, ShuffWide,
DAG.getIntPtrConstant(I, DL));
SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
Increment);
Chains.push_back(Ch);
@@ -10004,18 +10336,18 @@ static SDValue PerformSTORECombine(SDNode *N,
bool isBigEndian = DAG.getDataLayout().isBigEndian();
SDLoc DL(St);
SDValue BasePtr = St->getBasePtr();
- SDValue NewST1 = DAG.getStore(St->getChain(), DL,
- StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
- BasePtr, St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ SDValue NewST1 = DAG.getStore(
+ St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
+ BasePtr, St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
return DAG.getStore(NewST1.getValue(0), DL,
StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
- OffsetPtr, St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(),
- std::min(4U, St->getAlignment() / 2));
+ OffsetPtr, St->getPointerInfo(),
+ std::min(4U, St->getAlignment() / 2),
+ St->getMemOperand()->getFlags());
}
if (StVal.getValueType() == MVT::i64 &&
@@ -10038,9 +10370,8 @@ static SDValue PerformSTORECombine(SDNode *N,
DCI.AddToWorklist(ExtElt.getNode());
DCI.AddToWorklist(V.getNode());
return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment(),
- St->getAAInfo());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
}
// If this is a legal vector store, try to combine it into a VST1_UPD.
@@ -10066,7 +10397,8 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+ Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -10123,7 +10455,7 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op = N->getOperand(0);
unsigned OpOpcode = Op.getNode()->getOpcode();
- if (!N->getValueType(0).isVector() ||
+ if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
(OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
return SDValue();
@@ -10464,7 +10796,7 @@ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
// The operand to BFI is already a mask suitable for removing the bits it
// sets.
ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
- APInt Mask = CI->getAPIntValue();
+ const APInt &Mask = CI->getAPIntValue();
KnownZero &= Mask;
KnownOne &= Mask;
return;
@@ -10522,7 +10854,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
} else {
assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
}
-
+
if (Op1->getOpcode() != ISD::OR)
return SDValue();
@@ -10552,7 +10884,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
SDLoc dl(X);
EVT VT = X.getValueType();
unsigned BitInX = AndC->getAPIntValue().logBase2();
-
+
if (BitInX != 0) {
// We must shift X first.
X = DAG.getNode(ISD::SRL, dl, VT, X,
@@ -10573,6 +10905,46 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
+SDValue
+ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
+ SDValue Cmp = N->getOperand(4);
+ if (Cmp.getOpcode() != ARMISD::CMPZ)
+ // Only looking at NE cases.
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue LHS = Cmp.getOperand(0);
+ SDValue RHS = Cmp.getOperand(1);
+ SDValue Chain = N->getOperand(0);
+ SDValue BB = N->getOperand(1);
+ SDValue ARMcc = N->getOperand(2);
+ ARMCC::CondCodes CC =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+ // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
+ // -> (brcond Chain BB CC CPSR Cmp)
+ if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
+ LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
+ LHS->getOperand(0)->hasOneUse()) {
+ auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
+ auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
+ auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if ((LHS00C && LHS00C->getZExtValue() == 0) &&
+ (LHS01C && LHS01C->getZExtValue() == 1) &&
+ (LHS1C && LHS1C->getZExtValue() == 1) &&
+ (RHSC && RHSC->getZExtValue() == 0)) {
+ return DAG.getNode(
+ ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
+ LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
+ }
+ }
+
+ return SDValue();
+}
+
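The combine added above removes a materialised condition before a branch. As a standalone sketch of the identity it relies on (plain C++, no SelectionDAG types, helper name invented for illustration): selecting 1-or-0 with a conditional move, masking with 1 and branching on the result being non-zero is the same as branching on the original condition.

// Minimal sketch of the fold (brcond ... (cmpz (and (cmov 0 1 CC ...) 1) 0))
// -> (brcond ... CC ...); illustrative only, not LLVM code.
constexpr bool materialiseMaskAndTest(bool CC) {
  int Materialised = CC ? 1 : 0;   // (cmov 0 1 CC CPSR Cmp)
  return (Materialised & 1) != 0;  // (cmpz (and ... 1) 0), branch taken on NE
}

static_assert(materialiseMaskAndTest(true), "equivalent to CC being true");
static_assert(!materialiseMaskAndTest(false), "equivalent to CC being false");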
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -10626,6 +10998,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
N->getOperand(3), NewCmp);
}
+ // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
+ // -> (cmov F T CC CPSR Cmp)
+ if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
+ auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
+ auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if ((LHS0C && LHS0C->getZExtValue() == 0) &&
+ (LHS1C && LHS1C->getZExtValue() == 1) &&
+ (RHSC && RHSC->getZExtValue() == 0)) {
+ return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+ LHS->getOperand(2), LHS->getOperand(3),
+ LHS->getOperand(4));
+ }
+ }
+
if (Res.getNode()) {
APInt KnownZero, KnownOne;
DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
@@ -10676,6 +11063,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+ case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:
@@ -11198,22 +11586,37 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
- if (Subtarget->isThumb1Only())
- return false;
-
EVT VT;
SDValue Ptr;
- bool isSEXTLoad = false;
+ bool isSEXTLoad = false, isNonExt;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
+ isNonExt = !ST->isTruncatingStore();
} else
return false;
+ if (Subtarget->isThumb1Only()) {
+ // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+ // must be non-extending/truncating, i32, with an offset of 4.
+ assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
+ if (Op->getOpcode() != ISD::ADD || !isNonExt)
+ return false;
+ auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 4)
+ return false;
+
+ Offset = Op->getOperand(1);
+ Base = Op->getOperand(0);
+ AM = ISD::POST_INC;
+ return true;
+ }
+
bool isInc;
bool isLegal = false;
if (Subtarget->isThumb2())
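A rough standalone illustration of the new Thumb-1 special case above (the struct and helper are hypothetical, not the SelectionDAG API): only a non-extending, non-truncating i32 access whose pointer is advanced by an add of exactly 4 is accepted as post-incrementing, matching what a Thumb-1 updating LDM/STM can express.

#include <cstdint>
#include <optional>

struct MemAccess {
  unsigned BitWidth;   // access width in bits
  bool IsExtOrTrunc;   // extending load or truncating store
  bool OffsetIsAdd;    // pointer update is an ADD (not a SUB)
  int64_t Offset;      // constant offset of the update
};

// Returns the post-increment amount if Thumb-1 can fold it, else nullopt.
std::optional<int64_t> thumb1PostIncOffset(const MemAccess &A) {
  if (A.BitWidth != 32 || A.IsExtOrTrunc || !A.OffsetIsAdd || A.Offset != 4)
    return std::nullopt;
  return A.Offset;
}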
@@ -11322,6 +11725,26 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
+const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ // At this point, we have to lower this constraint to something else, so we
+ // lower it to an "r" or "w". However, by doing this we will force the result
+ // to be in register, while the X constraint is much more permissive.
+ //
+ // Although we are correct (we are free to emit anything, without
+ // constraints), we might break use cases that would expect us to be more
+ // efficient and emit something else.
+ if (!Subtarget->hasVFP2())
+ return "r";
+ if (ConstraintVT.isFloatingPoint())
+ return "w";
+ if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
+ (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
+ return "w";
+
+ return "r";
+}
+
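Restating the decision the new LowerXConstraint override encodes as a freestanding sketch (the Features struct and function name are made up for illustration): the fully permissive "X" constraint is narrowed to a VFP/NEON register class only when the value can actually live there, and to a core register otherwise.

struct Features { bool HasVFP2; bool HasNEON; };

// Pick the constraint that "X" is lowered to for an operand of the given kind.
const char *lowerXConstraint(Features F, bool IsFloat, bool IsVector,
                             unsigned SizeInBits) {
  if (!F.HasVFP2)
    return "r";                 // no FP registers available at all
  if (IsFloat)
    return "w";                 // scalar FP goes in a VFP register
  if (IsVector && F.HasNEON && (SizeInBits == 64 || SizeInBits == 128))
    return "w";                 // 64/128-bit vectors fit NEON D/Q registers
  return "r";
}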
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
@@ -11640,7 +12063,8 @@ static TargetLowering::ArgListTy getDivRemArgList(
}
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
- assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) &&
+ assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI()) &&
"Register-based DivRem lowering only");
unsigned Opcode = Op->getOpcode();
assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
@@ -11664,7 +12088,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -11702,7 +12126,7 @@ SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
// Lower call
CallLoweringInfo CLI(DAG);
CLI.setChain(InChain)
- .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0)
+ .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
.setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
@@ -11950,23 +12374,20 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (!getInsertFencesForAtomic())
- return nullptr;
-
switch (Ord) {
- case NotAtomic:
- case Unordered:
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
llvm_unreachable("Invalid fence: unordered/non-atomic");
- case Monotonic:
- case Acquire:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
- case SequentiallyConsistent:
+ case AtomicOrdering::SequentiallyConsistent:
if (!IsStore)
return nullptr; // Nothing to do
/*FALLTHROUGH*/
- case Release:
- case AcquireRelease:
- if (Subtarget->isSwift())
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ if (Subtarget->preferISHSTBarriers())
return makeDMB(Builder, ARM_MB::ISHST);
// FIXME: add a comment with a link to documentation justifying this.
else
@@ -11978,19 +12399,16 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (!getInsertFencesForAtomic())
- return nullptr;
-
switch (Ord) {
- case NotAtomic:
- case Unordered:
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
llvm_unreachable("Invalid fence: unordered/not-atomic");
- case Monotonic:
- case Release:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
return nullptr; // Nothing to do
- case Acquire:
- case AcquireRelease:
- case SequentiallyConsistent:
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
return makeDMB(Builder, ARM_MB::ISH);
}
llvm_unreachable("Unknown fence ordering in emitTrailingFence");
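Taken together, the two rewritten switches amount to a small barrier table. The sketch below restates it in plain C++ under the assumption that fences are being inserted explicitly; the enumerators and helper names are illustrative, not the LLVM API.

#include <string>

enum class Ord { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

// Barrier emitted before the access ("" means none).
std::string leadingFence(Ord O, bool IsStore, bool PreferISHST) {
  switch (O) {
  case Ord::Monotonic:
  case Ord::Acquire:
    return "";
  case Ord::SeqCst:
    if (!IsStore)
      return "";                      // seq_cst loads are fenced afterwards
    [[fallthrough]];
  case Ord::Release:
  case Ord::AcquireRelease:
    return PreferISHST ? "dmb ishst" : "dmb ish";
  }
  return "";
}

// Barrier emitted after the access ("" means none).
std::string trailingFence(Ord O) {
  switch (O) {
  case Ord::Monotonic:
  case Ord::Release:
    return "";
  case Ord::Acquire:
  case Ord::AcquireRelease:
  case Ord::SeqCst:
    return "dmb ish";
  }
  return "";
}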
@@ -12031,7 +12449,17 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- return true;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement cmpxchg without spilling. If the address being exchanged is also
+ // on the stack and close enough to the spill slot, this can lead to a
+ // situation where the monitor always gets cleared and the atomic operation
+ // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+ return getTargetMachine().getOptLevel() != 0;
+}
+
+bool ARMTargetLowering::shouldInsertFencesForAtomic(
+ const Instruction *I) const {
+ return InsertFencesForAtomic;
}
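A compressed restatement of the cmpxchg decision above (plain C++, illustrative names): IR-level expansion to an LDREX/STREX loop is only used when the register allocator can keep the loop free of spills, because a spill store near the exchanged address clears the exclusive monitor and the STREX then never succeeds.

enum class OptLevel { O0, O1, O2, O3 };

// True when expanding cmpxchg to an LDREX/STREX loop in IR is acceptable; at
// -O0 a late-expanded pseudo-instruction is used instead so no spill can land
// between the load-exclusive and the store-exclusive.
bool expandCmpXchgInIR(OptLevel L) {
  return L != OptLevel::O0;
}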
// This has so far only been implemented for MachO.
@@ -12080,7 +12508,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire = isAtLeastAcquire(Ord);
+ bool IsAcquire = isAcquireOrStronger(Ord);
// Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i32, i32} and we have to recombine them into a
@@ -12124,7 +12552,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isAtLeastRelease(Ord);
+ bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i64 intrinsics take two
// parameters: "i32, i32". We must marshal Val into the appropriate form
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 96b56c3ec330..4906686616bc 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -43,7 +43,6 @@ namespace llvm {
CALL, // Function call.
CALL_PRED, // Function call that's predicable.
CALL_NOLINK, // Function call with branch not branch-and-link.
- tCALL, // Thumb function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
@@ -61,6 +60,8 @@ namespace llvm {
CMOV, // ARM conditional move instructions.
+ SSAT, // Signed saturation
+
BCC_i64,
SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
@@ -164,6 +165,7 @@ namespace llvm {
UMLAL, // 64bit Unsigned Accumulate Multiply
SMLAL, // 64bit Signed Accumulate Multiply
+ UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -251,13 +253,14 @@ namespace llvm {
EVT VT) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
- void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -335,6 +338,8 @@ namespace llvm {
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
+
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
/// true it means one of the asm constraint of the inline asm instruction
@@ -453,6 +458,7 @@ namespace llvm {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@@ -468,6 +474,14 @@ namespace llvm {
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool supportSwiftError() const override {
+ return true;
+ }
+
+ bool hasStandaloneRem(EVT VT) const override {
+ return HasStandaloneRem;
+ }
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -486,29 +500,34 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
+ // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
+ // check.
+ bool InsertFencesForAtomic;
+
+ bool HasStandaloneRem = true;
+
void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
- void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
- SDValue Chain, SDValue &Arg,
- RegsToPassVector &RegsToPass,
+ void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
+ SDValue &Arg, RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
ISD::ArgFlagsTy Flags) const;
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
- SDLoc dl) const;
+ const SDLoc &dl) const;
CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
bool isVarArg) const;
CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
bool isVarArg) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -527,6 +546,7 @@ namespace llvm {
SelectionDAG &DAG,
TLSModel::Model model) const;
SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
@@ -576,9 +596,9 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- bool isThisReturn, SDValue ThisVal) const;
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -590,23 +610,19 @@ namespace llvm {
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
-
- int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- const Value *OrigArg,
- unsigned InRegsParamRecordIdx,
- int ArgOffset,
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, const Value *OrigArg,
+ unsigned InRegsParamRecordIdx, int ArgOffset,
unsigned ArgSize) const;
void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc dl, SDValue &Chain,
- unsigned ArgOffset,
- unsigned TotalArgRegsSaveSize,
+ const SDLoc &dl, SDValue &Chain,
+ unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
bool ForceMutable = false) const;
SDValue
@@ -634,42 +650,39 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
- SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
- SDValue getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
+ SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
SDValue ARMcc, SDValue CCR, SDValue Cmp,
SelectionDAG &DAG) const;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
- SDValue getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, SDLoc dl) const;
+ SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
+ SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+ const SDLoc &dl) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
- void SetupEntryBlockForSjLj(MachineInstr *MI,
- MachineBasicBlock *MBB,
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
- void EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const;
+ void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;
- bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
+ bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitStructByval(MachineInstr *MI,
+ MachineBasicBlock *EmitStructByval(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI,
+ MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI,
+ MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
};
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index e79608d360ca..37a83f70a1fb 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -246,23 +246,33 @@ def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
let ParserMatchClass = shr_imm64_asm_operand;
}
+
+// ARM Assembler operand for ldr Rd, =expression which generates an offset
+// to a constant pool entry or a MOV depending on the value of expression
+def const_pool_asm_operand : AsmOperandClass { let Name = "ConstPoolAsmImm"; }
+def const_pool_asm_imm : Operand<i32> {
+ let ParserMatchClass = const_pool_asm_operand;
+}
+
+
//===----------------------------------------------------------------------===//
// ARM Assembler alias templates.
//
-class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
-class tInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
-class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
-class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
-class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
-class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
-class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
- : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2,HasDPVFP]>;
+class VFP3InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasNEON]>;
class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
@@ -563,12 +573,12 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]>;
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]>;
class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]> {
+ Requires<[IsARM, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rd;
let Inst{15-12} = Rd;
}
@@ -593,12 +603,12 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]>;
+ Requires<[IsARM, HasAcquireRelease]>;
class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
- Requires<[IsARM, HasV8]> {
+ Requires<[IsARM, HasAcquireRelease]> {
let Inst{15-12} = 0b1111;
}
@@ -1379,11 +1389,6 @@ class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
let DecoderMethod = "DecodeT2LdStPre";
}
-// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
-class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T];
-}
-
// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
@@ -1495,6 +1500,32 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
let D = VFPNeonDomain;
}
+class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<13> addr;
+
+ // Encode instruction operands.
+ let Inst{23} = addr{8}; // U (add = (U == '1'))
+ let Inst{22} = Sd{0};
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{15-12} = Sd{4-1};
+ let Inst{7-0} = addr{7-0}; // imm8
+
+ let Inst{27-24} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+
+ // Loads & stores operate on both NEON and VFP pipelines.
+ let D = VFPNeonDomain;
+}
+
// VFP Load / store multiple pseudo instructions.
class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
list<dag> pattern>
@@ -1817,6 +1848,114 @@ class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
let Inst{22} = Sd{0};
}
+// Half precision, unary, predicated
+class AHuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+ string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
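The 'let Inst{...}' assignments in AHuI can be cross-checked with a small encoder sketch (plain C++, hypothetical helper, opcode fields omitted): each S-register number is split into a 4-bit field plus its low bit, and bits 11-8 carry the half-precision marker.

#include <cstdint>

// Packs only the operand and precision fields named by the AHuI class above.
uint32_t packAHuIOperands(unsigned Sd, unsigned Sm) {
  uint32_t Inst = 0;
  Inst |= (Sm >> 1) & 0xFu;          // Inst{3-0}   = Sm{4-1}
  Inst |= (Sm & 0x1u) << 5;          // Inst{5}     = Sm{0}
  Inst |= ((Sd >> 1) & 0xFu) << 12;  // Inst{15-12} = Sd{4-1}
  Inst |= (Sd & 0x1u) << 22;         // Inst{22}    = Sd{0}
  Inst |= 0x9u << 8;                 // Inst{11-8}  = 0b1001 (half precision)
  return Inst;
}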
+// Half precision, unary, non-predicated
+class AHuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+ bit opcod5, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPUnaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{19-16} = opcod3;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-6} = opcod4;
+ let Inst{4} = opcod5;
+}
+
+// Half precision, binary
+class AHbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// Half precision, binary, not predicated
+class AHbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+ InstrItinClass itin, string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+ VFPBinaryFrm, itin, asm, "", pattern> {
+ list<Predicate> Predicates = [HasFullFP16];
+
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sn;
+ bits<5> Sm;
+
+ let Inst{31-28} = 0b1111;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = opcod1;
+ let Inst{21-20} = opcod2;
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{6} = opcod3;
+ let Inst{4} = 0;
+}
+
// VFP conversion instructions
class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
dag oops, dag iops, InstrItinClass itin, string opc, string asm,
@@ -2321,22 +2460,25 @@ class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
}
// VFP/NEON Instruction aliases for type suffices.
-class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> :
- InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result, bit EmitPriority = 0> :
+ InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasVFP2]>;
-multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> {
- def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
}
-multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result> {
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
let Predicates = [HasNEON] in {
- def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
- def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+ def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+ def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
}
}
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index cf973d68085f..98b1b4ca4272 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -90,29 +90,29 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
return 0;
}
-void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
+void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+ const TargetMachine &TM = MF.getTarget();
if (!Subtarget.useMovt(MF)) {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM);
+ if (TM.isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
else
- expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM);
+ expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
return;
}
- if (RM != Reloc::PIC_) {
- expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM);
+ if (!TM.isPositionIndependent()) {
+ expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
return;
}
const GlobalValue *GV =
cast<GlobalValue>((*MI->memoperands_begin())->getValue());
- if (!Subtarget.GVIsIndirectSymbol(GV, RM)) {
- expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM);
+ if (!Subtarget.isGVIndirectSymbol(GV)) {
+ expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
return;
}
@@ -123,9 +123,9 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
.addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
- unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 90f34ea08401..4b1b7097b18d 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -39,8 +39,7 @@ public:
const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c446ba3109e4..060376b0a273 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -90,12 +90,6 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
-def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>,
- SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
-def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
-def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
-
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
@@ -128,6 +122,8 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
+def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+
def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
@@ -201,6 +197,12 @@ def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
AssemblerPredicate<"HasV6MOps",
"armv6m or armv6t2">;
+def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
+ AssemblerPredicate<"HasV8MBaselineOps",
+ "armv8m.base">;
+def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
+ AssemblerPredicate<"HasV8MMainlineOps",
+ "armv8m.main">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
@@ -235,6 +237,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
@@ -251,6 +255,12 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">,
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
AssemblerPredicate<"FeatureDB",
"data-barriers">;
+def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
+ AssemblerPredicate<"FeatureV7Clrex",
+ "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+ AssemblerPredicate<"FeatureAcquireRelease",
+ "acquire/release">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
AssemblerPredicate<"FeatureMP",
"mp-extensions">;
@@ -260,6 +270,9 @@ def HasVirtualization: Predicate<"false">,
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
AssemblerPredicate<"FeatureTrustZone",
"TrustZone">;
+def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
+ AssemblerPredicate<"Feature8MSecExt",
+ "ARMv8-M Security Extensions">;
def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
@@ -279,6 +292,8 @@ def IsARM : Predicate<"!Subtarget->isThumb()">,
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
@@ -301,19 +316,16 @@ def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
" Subtarget->hasVFP4()) || "
"Subtarget->isTargetDarwin()">;
-// VGETLNi32 is microcoded on Swift - prefer VMOV.
-def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
-def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
-// VDUP.32 is microcoded on Swift - prefer VMOV.
-def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
-def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
-// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
-// this allows more effective execution domain optimization. See
-// setExecutionDomain().
-def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
-def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
+ "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
+ "Subtarget->useNEONForSinglePrecisionFP()">;
def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
@@ -360,8 +372,6 @@ def lo16AllZero : PatLeaf<(i32 imm), [{
return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
}], hi16>;
-class BinOpWithFlagFrag<dag res> :
- PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
@@ -408,34 +418,35 @@ def brtarget : Operand<OtherVT> {
let DecoderMethod = "DecodeT2BROperand";
}
-// FIXME: get rid of this one?
-def uncondbrtarget : Operand<OtherVT> {
- let EncoderMethod = "getUnconditionalBranchTargetOpValue";
- let OperandType = "OPERAND_PCREL";
+// Branches targeting ARM-mode must be divisible by 4 if they're a raw
+// immediate.
+def ARMBranchTarget : AsmOperandClass {
+ let Name = "ARMBranchTarget";
}
-// Branch target for ARM. Handles conditional/unconditional
-def br_target : Operand<OtherVT> {
- let EncoderMethod = "getARMBranchTargetOpValue";
- let OperandType = "OPERAND_PCREL";
+// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
+// immediate.
+def ThumbBranchTarget : AsmOperandClass {
+ let Name = "ThumbBranchTarget";
}
-// Call target.
-// FIXME: rename bltarget to t2_bl_target?
-def bltarget : Operand<i32> {
- // Encoded the same as branch targets.
- let EncoderMethod = "getBranchTargetOpValue";
+def arm_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ARMBranchTarget;
+ let EncoderMethod = "getARMBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// Call target for ARM. Handles conditional/unconditional
// FIXME: rename bl_target to t2_bltarget?
-def bl_target : Operand<i32> {
+def arm_bl_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getARMBLTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
-def blx_target : Operand<i32> {
+// Target for BLX *from* ARM mode.
+def arm_blx_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getARMBLXTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
@@ -981,6 +992,21 @@ def addrmode5_pre : AddrMode5 {
let PrintMethod = "printAddrMode5Operand<true>";
}
+// addrmode5fp16 := reg +/- imm8*2
+//
+def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
+class AddrMode5FP16 : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
+ let EncoderMethod = "getAddrMode5FP16OpValue";
+ let DecoderMethod = "DecodeAddrMode5FP16Operand";
+ let ParserMatchClass = AddrMode5FP16AsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+def addrmode5fp16 : AddrMode5FP16 {
+ let PrintMethod = "printAddrMode5FP16Operand<false>";
+}
+
// addrmode6 := reg with optional alignment
//
def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
@@ -1224,7 +1250,7 @@ include "ARMInstrFormats.td"
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> {
+ SDPatternOperator opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1297,7 +1323,7 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> {
+ SDNode opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1369,7 +1395,7 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
@@ -1402,7 +1428,7 @@ multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
@@ -1431,8 +1457,8 @@ multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
let isCompare = 1, Defs = [CPSR] in {
multiclass AI1_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0,
- string rrDecoderMethod = ""> {
+ SDPatternOperator opnode, bit Commutable = 0,
+ string rrDecoderMethod = ""> {
def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
opc, "\t$Rn, $imm",
[(opnode GPR:$Rn, mod_imm:$imm)]>,
@@ -1561,7 +1587,7 @@ class AI_exta_rrot_np<bits<8> opcod, string opc>
/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
-multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
bit Commutable = 0> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
@@ -1632,7 +1658,7 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
/// AI1_rsc_irs - Define instructions and patterns for rsc
let TwoOperandAliasConstraint = "$Rn = $Rd" in
-multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, SDNode opnode> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
@@ -1880,6 +1906,7 @@ def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
bits<8> imm;
let Inst{27-8} = 0b00110010000011110000;
let Inst{7-0} = imm;
+ let DecoderMethod = "DecodeHINTInstruction";
}
def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
@@ -1888,6 +1915,7 @@ def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1915,7 +1943,7 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
let Inst{7-4} = 0b0111;
}
// default immediate for breakpoint mnemonic
-def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>;
+def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>;
def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
@@ -2181,7 +2209,7 @@ let isCall = 1,
// at least be a pseudo instruction expanding to the predicated version
// at MC lowering time.
Defs = [LR], Uses = [SP] in {
- def BL : ABXI<0b1011, (outs), (ins bl_target:$func),
+ def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
@@ -2191,7 +2219,7 @@ let isCall = 1,
let DecoderMethod = "DecodeBranchImmInstruction";
}
- def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
+ def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl", "\t$func",
[(ARMcall_pred tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
@@ -2232,7 +2260,7 @@ let isCall = 1,
// mov lr, pc; b if callee is marked noreturn to avoid confusing the
// return stack predictor.
- def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
+ def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBr]>;
}
@@ -2240,7 +2268,7 @@ let isCall = 1,
let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
- def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
+ def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target),
IIC_Br, "b", "\t$target",
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
Sched<[WriteBr]> {
@@ -2255,8 +2283,9 @@ let isBranch = 1, isTerminator = 1 in {
// FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
// should be sufficient.
// FIXME: Is B really a Barrier? That doesn't seem right.
- def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
- [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
+ def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br,
+ [(br bb:$target)], (Bcc arm_br_target:$target,
+ (ops 14, zero_reg))>,
Sched<[WriteBr]>;
let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
@@ -2283,7 +2312,7 @@ let isBranch = 1, isTerminator = 1 in {
}
// BLX (immediate)
-def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary,
"blx\t$target", []>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
let Inst{31-25} = 0b1111101;
@@ -2313,9 +2342,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
Sched<[WriteBr]>;
- def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
+ def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
4, IIC_Br, [],
- (Bcc br_target:$dst, (ops 14, zero_reg))>,
+ (Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
Requires<[IsARM]>, Sched<[WriteBr]>;
def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
@@ -2467,14 +2496,12 @@ def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
// Load
-defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
- UnOpFrag<(load node:$Src)>>;
+defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
- UnOpFrag<(zextloadi8 node:$Src)>>;
-defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+ zextloadi8>;
+defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
- BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ truncstorei8>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
@@ -2764,6 +2791,12 @@ def LDRBT_POST
: ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
(outs GPR:$Rt)>;
+// Pseudo instruction ldr Rt, =immediate
+def LDRConstPool
+ : ARMAsmPseudo<"ldr${q} $Rt, $immediate",
+ (ins const_pool_asm_imm:$immediate, pred:$q),
+ (outs GPR:$Rt)>;
+
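A simplified sketch of the choice the 'ldr Rt, =immediate' pseudo makes, per the const_pool_asm_operand comment earlier in this diff (plain C++; the real assembler also considers MVN and, on newer cores, MOVW): constants that fit an ARM modified immediate become a MOV, everything else a literal-pool load.

#include <cstdint>

// True if Imm is expressible as an 8-bit value rotated right by an even
// amount (the ARM modified-immediate form usable by MOV).
bool isARMModifiedImm(uint32_t Imm) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    uint32_t Rotated = Rot ? ((Imm << Rot) | (Imm >> (32 - Rot))) : Imm;
    if (Rotated <= 0xFFu)
      return true;
  }
  return false;
}

// "mov" when the constant can be encoded directly, otherwise a load from a
// constant-pool entry.
const char *expandLdrEquals(uint32_t Imm) {
  return isARMModifiedImm(Imm) ? "mov" : "constant-pool load";
}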
// Store
// Stores with truncate
@@ -3299,8 +3332,8 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
}
def : InstAlias<"mov${p} $Rd, $imm",
- (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
- Requires<[IsARM]>;
+ (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>,
+ Requires<[IsARM, HasV6T2]>;
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
@@ -3439,11 +3472,9 @@ def UBFX : I<(outs GPRnopc:$Rd),
//
defm ADD : AsI1_bin_irs<0b0100, "add",
- IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>;
defm SUB : AsI1_bin_irs<0b0010, "sub",
- IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>;
// ADD and SUB with 's' bit set.
//
@@ -3455,27 +3486,21 @@ defm SUB : AsI1_bin_irs<0b0010, "sub",
// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
// support for an optional CPSR definition that corresponds to the DAG
// node's second value. We can then eliminate the implicit def of CPSR.
-defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
-defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
-defm ADC : AI1_adde_sube_irs<0b0101, "adc",
- BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
-defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
defm RSB : AsI1_rbin_irs<0b0011, "rsb",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+ sub>;
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
-defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
-defm RSC : AI1_rsc_irs<0b0111, "rsc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -3685,20 +3710,19 @@ def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
(SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
+def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
//
defm AND : AsI1_bin_irs<0b0000, "and",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
defm ORR : AsI1_bin_irs<0b1100, "orr",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>;
defm EOR : AsI1_bin_irs<0b0001, "eor",
- IIC_iBITi, IIC_iBITr, IIC_iBITsr,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>;
defm BIC : AsI1_bin_irs<0b1110, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
@@ -3923,9 +3947,10 @@ def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
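The operand change above ties $RLo and $RHi to the outputs because UMAAL accumulates both 32-bit inputs into the 64-bit product. A small reference model of that semantics (plain C++, not generated from the .td):

#include <cstdint>

// UMAAL: {RdHi,RdLo} = Rn * Rm + RdLo + RdHi; the sum cannot exceed 64 bits.
void umaal(uint32_t &RdLo, uint32_t &RdHi, uint32_t Rn, uint32_t Rm) {
  uint64_t Result = (uint64_t)Rn * Rm + RdLo + RdHi;
  RdLo = (uint32_t)Result;
  RdHi = (uint32_t)(Result >> 32);
}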
@@ -3989,28 +4014,28 @@ def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>;
-multiclass AI_smul<string opc, PatFrag opnode> {
+multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+ [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+ [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
@@ -4026,13 +4051,13 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
-multiclass AI_smla<string opc, PatFrag opnode> {
+multiclass AI_smla<string opc> {
let DecoderMethod = "DecodeSMLAInstruction" in {
def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add GPR:$Ra,
- (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (mul (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4040,7 +4065,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4048,7 +4073,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4056,7 +4081,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
- (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>;
@@ -4074,8 +4099,8 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
}
-defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMUL : AI_smul<"smul">;
+defm SMLA : AI_smla<"smla">;
// Halfword multiply accumulate long: SMLAL<x><y>.
def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
@@ -4336,8 +4361,7 @@ def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan",
//
defm CMP : AI1_cmp_irs<0b1010, "cmp",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>;
// ARMcmpZ can re-use the above instruction definitions.
def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
@@ -4745,7 +4769,7 @@ def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastAcquire(Ordering);
+ return isAcquireOrStronger(Ordering);
}]>;
def atomic_load_acquire_8 : acquiring_load<atomic_load_8>;
@@ -4755,7 +4779,7 @@ def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return isAtLeastRelease(Ordering);
+ return isReleaseOrStronger(Ordering);
}]>;
def atomic_store_release_8 : releasing_store<atomic_store_8>;
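Illustrative aside, not part of the patch: the two PatFrag predicates above only test the memory ordering of the atomic node. A minimal C++ sketch of what the renamed helpers are assumed to check (the enum and functions below are stand-ins for illustration, not LLVM's actual declarations):

// Sketch only: assumed meaning of isAcquireOrStronger / isReleaseOrStronger.
enum class Ordering { NotAtomic, Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

bool isAcquireOrStronger(Ordering O) {
  return O == Ordering::Acquire || O == Ordering::AcquireRelease ||
         O == Ordering::SequentiallyConsistent;
}

bool isReleaseOrStronger(Ordering O) {
  return O == Ordering::Release || O == Ordering::AcquireRelease ||
         O == Ordering::SequentiallyConsistent;
}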
@@ -4831,21 +4855,21 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
}
class ACI<dag oops, dag iops, string opc, string asm,
- IndexMode im = IndexModeNone>
+ list<dag> pattern, IndexMode im = IndexModeNone>
: I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
- opc, asm, "", []> {
+ opc, asm, "", pattern> {
let Inst{27-25} = 0b110;
}
class ACInoP<dag oops, dag iops, string opc, string asm,
- IndexMode im = IndexModeNone>
+ list<dag> pattern, IndexMode im = IndexModeNone>
: InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
- opc, asm, "", []> {
+ opc, asm, "", pattern> {
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b110;
}
-multiclass LdStCop<bit load, bit Dbit, string asm> {
+multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4861,7 +4885,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4878,7 +4902,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
}
def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -4897,7 +4921,7 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
def _OPTION : ACI<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -4914,9 +4938,9 @@ multiclass LdStCop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
}
-multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4932,7 +4956,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4949,7 +4973,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
}
def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -4968,7 +4992,7 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
def _OPTION : ACInoP<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -4986,14 +5010,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
}
}
-defm LDC : LdStCop <1, 0, "ldc">;
-defm LDCL : LdStCop <1, 1, "ldcl">;
-defm STC : LdStCop <0, 0, "stc">;
-defm STCL : LdStCop <0, 1, "stcl">;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
+defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+
+defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
@@ -5118,9 +5143,9 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
-class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
- : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
- GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
+class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
+ list<dag> pattern = []>
+ : ABXI<0b1100, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
Requires<[PreV8]> {
let Inst{31-28} = 0b1111;
@@ -5139,13 +5164,18 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
let Inst{7-4} = opc1;
let Inst{3-0} = CRm;
- let DecoderMethod = "DecodeMRRC2";
+ let DecoderMethod = "DecoderForMRRC2AndMCRR2";
}
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+ (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, c_imm:$CRm),
[(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, imm:$CRm)]>;
-def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
+ (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+ (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
//===----------------------------------------------------------------------===//
// Move between special register and ARM core register
@@ -5164,7 +5194,7 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
let Unpredictable{11-0} = 0b110100001111;
}
-def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>,
Requires<[IsARM]>;
// The MRSsys instruction is the MRS instruction from the ARM ARM,
@@ -5206,6 +5236,7 @@ def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
// to distinguish between them. The mask operand contains the special register
// (R Bit) in bit 4 and bits 3-0 contain the mask with the fields to be
// accessed in the special register.
+let Defs = [CPSR] in
def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
"msr", "\t$mask, $Rn", []> {
bits<5> mask;
@@ -5220,6 +5251,7 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
let Inst{3-0} = Rn;
}
+let Defs = [CPSR] in
def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary,
"msr", "\t$mask, $imm", []> {
bits<5> mask;
@@ -5268,8 +5300,8 @@ let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
let usesCustomInserter = 1, Defs = [CPSR] in
- def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary,
- [(win__dbzchk GPR:$divisor)]>;
+ def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
+ [(win__dbzchk tGPR:$divisor)]>;
//===----------------------------------------------------------------------===//
// TLS Instructions
@@ -5423,6 +5455,8 @@ def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)),
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>,
+ Requires<[IsARM, UseMovt]>;
def : ARMPat<(ARMWrapperJT tjumptable:$dst),
(LEApcrelJT tjumptable:$dst)>;
@@ -5568,9 +5602,9 @@ include "ARMInstrNEON.td"
//
// Memory barriers
-def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>;
-def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>;
-def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
// System instructions
def : MnemonicAlias<"swi", "svc">;
@@ -5583,13 +5617,13 @@ def : MnemonicAlias<"stmfd", "stmdb">;
def : MnemonicAlias<"stmia", "stm">;
def : MnemonicAlias<"stmea", "stm">;
-// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
-// shift amount is zero (i.e., unspecified).
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
- (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
- (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
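Illustrative aside, not part of the patch: why the zero-shift pkhtb alias now swaps $Rn and $Rm. A small C++ sketch of the packing each mnemonic performs (function names are ours, for illustration):

#include <cstdint>

// pkhbt rd, rn, rm (lsl #0): bottom halfword from rn, top halfword from rm.
uint32_t pkhbt(uint32_t rn, uint32_t rm) {
  return (rn & 0x0000ffffu) | (rm & 0xffff0000u);
}

// pkhtb rd, rn, rm with no shift: top halfword from rn, bottom halfword from
// rm -- exactly pkhbt with the register operands swapped, which is what the
// alias above encodes.
uint32_t pkhtb(uint32_t rn, uint32_t rm) {
  return pkhbt(rm, rn);
}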
// PUSH/POP aliases for STM/LDM
@@ -5747,23 +5781,23 @@ def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
// the instruction definitions need different constraints pre-v6.
// Use these aliases for the assembly parsing on pre-v6.
def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
- (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
- pred:$p, cc_out:$s)>,
+ pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
- (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
// 'it' blocks in ARM mode just validate the predicates. The IT itself
@@ -5775,3 +5809,36 @@ let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
[(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDREX and an
+// STREX it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+ mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+ (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
+ NoItinerary, []>, Sched<[]>;
+}
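Illustrative aside, not part of the patch: a minimal C++ example of the source-level operation that reaches the backend as an atomic cmpxchg. The conservative assumptions noted above (seq_cst, strong) correspond to the strictest form of compare_exchange:

#include <atomic>
#include <cstdint>

// Assumed lowering at -O0: this goes through the CMP_SWAP_32 pseudo above and
// is expanded into an exclusive-load/store loop only after register
// allocation, so no spill can land between the paired exclusive accesses.
bool try_update(std::atomic<uint32_t> &slot, uint32_t expected,
                uint32_t desired) {
  return slot.compare_exchange_strong(expected, desired,
                                      std::memory_order_seq_cst);
}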
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 5b1f9a06442e..93a174f3678a 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -15,10 +15,6 @@
// Thumb specific DAG Nodes.
//
-def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPVariadic]>;
-
def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
@@ -70,6 +66,14 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
}]>;
+def imm256_510 : ImmLeaf<i32, [{
+ return Imm >= 256 && Imm < 511;
+}]>;
+
+def thumb_imm256_510_addend : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32);
+}]>;
+
// Scaled 4 immediate.
def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
def t_imm0_1020s4 : Operand<i32> {
@@ -121,26 +125,38 @@ def t_adrlabel : Operand<i32> {
let ParserMatchClass = UnsignedOffset_b8s2;
}
-def t_bcctarget : Operand<i32> {
- let EncoderMethod = "getThumbBCCTargetOpValue";
- let DecoderMethod = "DecodeThumbBCCTargetOperand";
-}
-def t_cbtarget : Operand<i32> {
- let EncoderMethod = "getThumbCBTargetOpValue";
- let DecoderMethod = "DecodeThumbCmpBROperand";
+def thumb_br_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBranchTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
}
-def t_bltarget : Operand<i32> {
+def thumb_bl_target : Operand<i32> {
+ let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbBLTargetOpValue";
let DecoderMethod = "DecodeThumbBLTargetOperand";
}
-def t_blxtarget : Operand<i32> {
+// Target for BLX *from* thumb mode.
+def thumb_blx_target : Operand<i32> {
+ let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getThumbBLXTargetOpValue";
let DecoderMethod = "DecodeThumbBLXOffset";
}
+def thumb_bcc_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbBCCTargetOpValue";
+ let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def thumb_cb_target : Operand<OtherVT> {
+ let ParserMatchClass = ThumbBranchTarget;
+ let EncoderMethod = "getThumbCBTargetOpValue";
+ let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
// t_addrmode_pc := <label> => pc + imm8 * 4
//
def t_addrmode_pc : MemOperand {
@@ -278,16 +294,17 @@ def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
let Inst{7-4} = imm;
}
-class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+// Note: When EmitPriority == 1, the alias will be used for printing
+class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
let Predicates = [IsThumb, HasV6M];
}
-def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
-def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
-def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
-def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
-def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
-def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
let Predicates = [IsThumb2, HasV8];
}
@@ -302,7 +319,7 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
let Inst{7-0} = val;
}
// default immediate for breakpoint mnemonic
-def : InstAlias<"bkpt", (tBKPT 0)>, Requires<[IsThumb]>;
+def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;
def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
[]>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
@@ -439,6 +456,14 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
let Inst{2-0} = 0b000;
let Unpredictable{2-0} = 0b111;
}
+ def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+ bits<4> Rm;
+ let Inst{6-3} = Rm;
+ let Inst{2-0} = 0b100;
+ let Unpredictable{1-0} = 0b11;
+ }
}
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
@@ -458,9 +483,9 @@ let isCall = 1,
Defs = [LR], Uses = [SP] in {
// Also used for Thumb2
def tBL : TIx2<0b11110, 0b11, 1,
- (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
+ (outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
"bl${p}\t$func",
- [(ARMtcall tglobaladdr:$func)]>,
+ [(ARMcall tglobaladdr:$func)]>,
Requires<[IsThumb]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
@@ -472,9 +497,8 @@ let isCall = 1,
// ARMv5T and above, also used for Thumb2
def tBLXi : TIx2<0b11110, 0b11, 0,
- (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
- "blx${p}\t$func",
- [(ARMcall tglobaladdr:$func)]>,
+ (outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
+ "blx${p}\t$func", []>,
Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
@@ -488,7 +512,7 @@ let isCall = 1,
// Also used for Thumb2
def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
"blx${p}\t$func",
- [(ARMtcall GPR:$func)]>,
+ [(ARMcall GPR:$func)]>,
Requires<[IsThumb, HasV5T]>,
T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
bits<4> func;
@@ -496,6 +520,17 @@ let isCall = 1,
let Inst{2-0} = 0b000;
}
+ // ARMv8-M Security Extensions
+ def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
+ "blxns${p}\t$func", []>,
+ Requires<[IsThumb, Has8MSecExt]>,
+ T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
+ bits<4> func;
+ let Inst{6-3} = func;
+ let Inst{2-0} = 0b100;
+ let Unpredictable{1-0} = 0b11;
+ }
+
// ARMv4T
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
@@ -517,8 +552,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// Just a pseudo for a tBL instruction. Needed to let regalloc know about
// the clobber of LR.
let Defs = [LR] in
- def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
- 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>,
+ def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
+ 4, IIC_Br, [],
+ (tBL pred:$p, thumb_bl_target:$target)>,
Sched<[WriteBrTbl]>;
def tBR_JTr : tPseudoInst<(outs),
@@ -534,7 +570,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
- def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
+ def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
"b${p}\t$target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>,
T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
@@ -663,19 +699,19 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iLoad_r, IIC_iLoad_i, "ldr",
- UnOpFrag<(load node:$Src)>>;
+ load>;
// A8.6.64 & A8.6.61
defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
- UnOpFrag<(zextloadi8 node:$Src)>>;
+ zextloadi8>;
// A8.6.76 & A8.6.73
defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
- UnOpFrag<(zextloadi16 node:$Src)>>;
+ zextloadi16>;
let AddedComplexity = 10 in
def tLDRSB : // A8.6.80
@@ -706,19 +742,19 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iStore_r, IIC_iStore_i, "str",
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+ store>;
// A8.6.197 & A8.6.195
defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
- BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ truncstorei8>;
// A8.6.207 & A8.6.205
defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
- BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+ truncstorei16>;
//===----------------------------------------------------------------------===//
@@ -770,7 +806,7 @@ def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
} // hasSideEffects
def : InstAlias<"ldm${p} $Rn!, $regs",
- (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+ (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
Requires<[IsThumb, IsThumb1Only]>;
let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
@@ -1310,7 +1346,14 @@ def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "",
[(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
- Requires<[IsThumb]>;
+ Requires<[IsThumb,IsNotWindows]>;
+
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+ Defs = [ R11, LR, SP ] in
+def tInt_WIN_eh_sjlj_longjmp
+ : XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
+ Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+ Requires<[IsThumb,IsWindows]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -1380,16 +1423,9 @@ def : T1Pat<(ARMWrapperJT tjumptable:$dst),
(tLEApcrelJT tjumptable:$dst)>;
// Direct calls
-def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
Requires<[IsThumb]>;
-def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
- Requires<[IsThumb, HasV5T, IsNotMClass]>;
-
-// Indirect calls to ARM routines
-def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
- Requires<[IsThumb, HasV5T]>;
-
// zextload i1 -> zextload i8
def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
(tLDRBi t_addrmode_is1:$addr)>;
@@ -1415,6 +1451,24 @@ def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
+// post-inc loads and stores
+
+// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
+// different to how ISel expects them for a post-inc load, so use a pseudo
+// and expand it just after ISel.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
+ def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
+ (ins rGPR:$Rn, pred:$p),
+ 4, IIC_iStore_ru,
+ []>;
+
+// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
+// multiple registers) is the same in ISel as MachineInstr, so there's no need
+// for a pseudo.
+def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4),
+ (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>;
+
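Illustrative aside, not part of the patch: the shape of code these post-increment patterns target. Whether a particular build selects tLDR_postidx / tSTMIA_UPD here is an assumption, but the loop shows the intended access pattern:

#include <cstdint>

// Each iteration loads a word and bumps the source pointer by 4 (post-inc
// LDR, handled via the tLDR_postidx pseudo), then stores a word and bumps the
// destination pointer by 4 (matched directly as tSTMIA_UPD).
void copy_words(const uint32_t *src, uint32_t *dst, unsigned n) {
  while (n--)
    *dst++ = *src++;
}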
// If it's impossible to use [r,r] address mode for sextload, select to
// ldr{b|h} + sxt{b|h} instead.
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
@@ -1474,6 +1528,10 @@ def : T1Pat<(i32 thumb_immshifted:$src),
def : T1Pat<(i32 imm0_255_comp:$src),
(tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+def : T1Pat<(i32 imm256_510:$src),
+ (tADDi8 (tMOVi8 255),
+ (thumb_imm256_510_addend imm:$src))>;
+
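Illustrative aside, not part of the patch: a standalone check of the arithmetic behind this pattern. Every immediate accepted by imm256_510 is 255 plus an addend that still fits an 8-bit immediate, so tMOVi8 255 followed by tADDi8 reproduces it:

#include <cassert>

int main() {
  for (unsigned imm = 256; imm <= 510; ++imm) { // range accepted by imm256_510
    unsigned addend = imm - 255;                // thumb_imm256_510_addend
    assert(addend >= 1 && addend <= 255);       // fits tADDi8's 8-bit immediate
    assert(255u + addend == imm);               // tMOVi8 255 ; tADDi8 #addend
  }
  return 0;
}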
// Pseudo instruction that combines ldr from constpool and add pc. This should
// be expanded into two instructions late to allow if-conversion and
// scheduling.
@@ -1502,7 +1560,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
// encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
// For round-trip assembly/disassembly, we have to handle a CPS instruction
@@ -1524,3 +1582,8 @@ def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
(tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
def : tInstAlias<"asr${s}${p} $Rdm, $imm",
(tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def tLDRConstPool
+ : tAsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index f42f4569b2f8..55e5308be40e 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -270,7 +270,7 @@ def t2addrmode_so_reg : MemOperand,
let EncoderMethod = "getT2AddrModeSORegOpValue";
let DecoderMethod = "DecodeT2AddrModeSOReg";
let ParserMatchClass = t2addrmode_so_reg_asmoperand;
- let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
+ let MIOperandInfo = (ops GPRnopc:$base, rGPR:$offsreg, i32imm:$offsimm);
}
// Addresses for the TBB/TBH instructions.
@@ -576,8 +576,8 @@ class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
/// changed to modify CPSR.
multiclass T2I_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0,
- string wide = ""> {
+ SDPatternOperator opnode, bit Commutable = 0,
+ string wide = ""> {
// shifted imm
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
@@ -632,7 +632,7 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
// the ".w" suffix to indicate that they are wide.
multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> :
+ SDPatternOperator opnode, bit Commutable = 0> :
T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
// Assembler aliases w/ the ".w" suffix.
def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
@@ -661,7 +661,7 @@ multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
/// reversed. The 'rr' form is only defined for the disassembler; for codegen
/// it is equivalent to the T2I_bin_irs counterpart.
-multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, SDNode opnode> {
// shifted imm
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
@@ -705,7 +705,7 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, PatFrag opnode,
+ InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
// shifted imm
def ri : t2PseudoInst<(outs rGPR:$Rd),
@@ -735,7 +735,7 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
/// T2I_rbin_s_is - Same as T2I_bin_s_irs, except selection DAG
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
-multiclass T2I_rbin_s_is<PatFrag opnode> {
+multiclass T2I_rbin_s_is<SDNode opnode> {
// shifted imm
def ri : t2PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
@@ -755,7 +755,7 @@ multiclass T2I_rbin_s_is<PatFrag opnode> {
/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
/// patterns for a binary operation that produces a value.
-multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
bit Commutable = 0> {
// shifted imm
// The register-immediate version is re-materializable. This is useful
@@ -824,7 +824,7 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
/// for a binary operation that produces a value and use the carry
/// bit. It's not predicable.
let Defs = [CPSR], Uses = [CPSR] in {
-multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
bit Commutable = 0> {
// shifted imm
def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
@@ -864,7 +864,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
// rotate operation that produces a value.
-multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
// 5-bit imm
def ri : T2sTwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
@@ -919,7 +919,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
/// an explicit result, only implicitly set CPSR.
multiclass T2I_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode> {
+ SDPatternOperator opnode> {
let isCompare = 1, Defs = [CPSR] in {
// shifted imm
def ri : T2OneRegCmpImm<
@@ -1260,20 +1260,19 @@ def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
// Load
let canFoldAsLoad = 1, isReMaterializable = 1 in
-defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
- UnOpFrag<(load node:$Src)>>;
+defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, load>;
// Loads with zero extension
defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(zextloadi16 node:$Src)>>;
+ GPRnopc, zextloadi16>;
defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(zextloadi8 node:$Src)>>;
+ GPRnopc, zextloadi8>;
// Loads with sign extension
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(sextloadi16 node:$Src)>>;
+ GPRnopc, sextloadi16>;
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- GPRnopc, UnOpFrag<(sextloadi8 node:$Src)>>;
+ GPRnopc, sextloadi8>;
let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
@@ -1414,7 +1413,7 @@ def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
- opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ opc, asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
bits<4> Rt;
bits<4> addr;
@@ -1438,12 +1437,11 @@ def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
(ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
// Store
-defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
- BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
- rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ rGPR, truncstorei8>;
defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
- rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+ rGPR, truncstorei16>;
// Store doubleword
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
@@ -1586,7 +1584,7 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
class T2Istrrel<bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
- asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
bits<4> Rt;
bits<4> addr;
@@ -1906,7 +1904,8 @@ def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
"movw", "\t$Rd, $imm",
- [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> {
+ [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0010;
@@ -1924,8 +1923,9 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
let DecoderMethod = "DecodeT2MOVTWInstruction";
}
-def : t2InstAlias<"mov${p} $Rd, $imm",
- (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+def : InstAlias<"mov${p} $Rd, $imm",
+ (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
@@ -1936,7 +1936,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
"movt", "\t$Rd, $imm",
[(set rGPR:$Rd,
(or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
- Sched<[WriteALU]> {
+ Sched<[WriteALU]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0110;
@@ -1956,7 +1957,7 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
- Sched<[WriteALU]>;
+ Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>;
} // Constraints
def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
@@ -1997,7 +1998,7 @@ def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
def t2UXTH : T2I_ext_rrot<0b001, "uxth",
UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
- UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+ UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
@@ -2029,10 +2030,8 @@ def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
// Arithmetic Instructions.
//
-defm t2ADD : T2I_bin_ii12rs<0b000, "add",
- BinOpFrag<(add node:$LHS, node:$RHS)>, 1>;
-defm t2SUB : T2I_bin_ii12rs<0b101, "sub",
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+defm t2ADD : T2I_bin_ii12rs<0b000, "add", add, 1>;
+defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>;
// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
//
@@ -2044,25 +2043,20 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub",
// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
// support for an optional CPSR definition that corresponds to the DAG
// node's second value. We can then eliminate the implicit def of CPSR.
-defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
-defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
let hasPostISelHook = 1 in {
-defm t2ADC : T2I_adde_sube_irs<0b1010, "adc",
- BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
-defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc",
- BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
+defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
}
// RSB
-defm t2RSB : T2I_rbin_irs <0b1110, "rsb",
- BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
-defm t2RSBS : T2I_rbin_s_is <BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+defm t2RSBS : T2I_rbin_s_is <ARMsubc>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -2293,19 +2287,17 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
+def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
//===----------------------------------------------------------------------===//
// Shift and rotate Instructions.
//
-defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31,
- BinOpFrag<(shl node:$LHS, node:$RHS)>>;
-defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr,
- BinOpFrag<(srl node:$LHS, node:$RHS)>>;
-defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr,
- BinOpFrag<(sra node:$LHS, node:$RHS)>>;
-defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31,
- BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
+defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
+defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
@@ -2362,14 +2354,11 @@ def t2MOVsra_flag : T2TwoRegShiftImm<
//
defm t2AND : T2I_bin_w_irs<0b0000, "and",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, and, 1>;
defm t2ORR : T2I_bin_w_irs<0b0010, "orr",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, or, 1>;
defm t2EOR : T2I_bin_w_irs<0b0100, "eor",
- IIC_iBITi, IIC_iBITr, IIC_iBITsi,
- BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+ IIC_iBITi, IIC_iBITr, IIC_iBITsi, xor, 1>;
defm t2BIC : T2I_bin_w_irs<0b0001, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsi,
@@ -2516,7 +2505,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <0b0011, "mvn",
IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
- UnOpFrag<(not node:$Src)>, 1, 1, 1>;
+ not, 1, 1, 1>;
let AddedComplexity = 1 in
def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
@@ -2606,8 +2595,9 @@ def t2UMLAL : T2MlaLong<0b110, 0b0000,
def t2UMAAL : T2MulLong<0b110, 0b0110,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
Requires<[IsThumb2, HasDSP]>;
} // hasSideEffects
@@ -2677,7 +2667,7 @@ def t2SMMLSR:T2FourReg<
let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
}
-multiclass T2I_smul<string opc, PatFrag opnode> {
+multiclass T2I_smul<string opc, SDNode opnode> {
def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
!strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
@@ -2756,7 +2746,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
}
-multiclass T2I_smla<string opc, PatFrag opnode> {
+multiclass T2I_smla<string opc, SDNode opnode> {
def BB : T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
!strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
@@ -2835,8 +2825,8 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
}
}
-defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMUL : T2I_smul<"smul", mul>;
+defm t2SMLA : T2I_smla<"smla", mul>;
// Halfword multiple accumulate long: SMLAL<x><y>
def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
@@ -2923,7 +2913,7 @@ def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb2]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011100;
let Inst{20} = 0b1;
@@ -2934,7 +2924,7 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb2]> {
+ Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011101;
let Inst{20} = 0b1;
@@ -3080,8 +3070,7 @@ def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
// Comparison Instructions...
//
defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
- IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+ IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
(t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
@@ -3288,15 +3277,18 @@ let mayLoad = 1 in {
def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexb", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+ [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexh", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
+ [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"ldrex", "\t$Rt, $addr", "",
- [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> {
+ [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
bits<4> Rt;
bits<12> addr;
let Inst{31-27} = 0b11101;
@@ -3320,17 +3312,17 @@ def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexb", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexh", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaex", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rt;
bits<4> addr;
let Inst{31-27} = 0b11101;
@@ -3345,7 +3337,8 @@ def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
(ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexd", "\t$Rt, $Rt2, $addr", "",
- [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ [], {?, ?, ?, ?}>, Requires<[IsThumb,
+ HasAcquireRelease, HasV7Clrex, IsNotMClass]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
@@ -3359,20 +3352,23 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
AddrModeNone, 4, NoItinerary,
"strexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>;
+ (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>;
+ (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"strex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
- (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]> {
+ (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
bits<4> Rd;
bits<4> Rt;
bits<12> addr;
@@ -3399,7 +3395,8 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
"stlexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
@@ -3407,7 +3404,8 @@ def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
"stlexh", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]>;
+ Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex]>;
def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
addr_offset_none:$addr),
@@ -3415,7 +3413,7 @@ def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
"stlex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
bits<4> Rd;
bits<4> Rt;
bits<4> addr;
@@ -3431,14 +3429,15 @@ def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
- {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease,
+ HasV7Clrex, IsNotMClass]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
}
def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
- Requires<[IsThumb2, HasV7]> {
+ Requires<[IsThumb, HasV7Clrex]> {
let Inst{31-16} = 0xf3bf;
let Inst{15-14} = 0b10;
let Inst{13} = 0;
@@ -3449,22 +3448,30 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
}
def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
- (t2LDREXB addr_offset_none:$addr)>;
+ (t2LDREXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
- (t2LDREXH addr_offset_none:$addr)>;
+ (t2LDREXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
- (t2STREXB GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STREXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
- (t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STREXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasV8MBaseline]>;
def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
- (t2LDAEXB addr_offset_none:$addr)>;
+ (t2LDAEXB addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
- (t2LDAEXH addr_offset_none:$addr)>;
+ (t2LDAEXH addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
- (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
- (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+ (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>,
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
@@ -3517,9 +3524,10 @@ def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
"b", ".w\t$target",
- [(br bb:$target)]>, Sched<[WriteBr]> {
+ [(br bb:$target)]>, Sched<[WriteBr]>,
+ Requires<[IsThumb, HasV8MBaseline]> {
let Inst{31-27} = 0b11110;
let Inst{15-14} = 0b10;
let Inst{12} = 1;
@@ -3609,9 +3617,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS version.
let Uses = [SP] in
def tTAILJMPd: tPseudoExpand<(outs),
- (ins uncondbrtarget:$dst, pred:$p),
+ (ins thumb_br_target:$dst, pred:$p),
4, IIC_Br, [],
- (t2B uncondbrtarget:$dst, pred:$p)>,
+ (t2B thumb_br_target:$dst, pred:$p)>,
Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
}
@@ -3647,10 +3655,10 @@ def t2BXJ : T2I<(outs), (ins GPRnopc:$func), NoItinerary, "bxj", "\t$func", []>,
// Compare and branch on zero / non-zero
let isBranch = 1, isTerminator = 1 in {
- def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ def tCBZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
"cbz\t$Rn, $target", []>,
T1Misc<{0,0,?,1,?,?,?}>,
- Requires<[IsThumb2]>, Sched<[WriteBr]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3659,10 +3667,10 @@ let isBranch = 1, isTerminator = 1 in {
let Inst{2-0} = Rn;
}
- def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ def tCBNZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
"cbnz\t$Rn, $target", []>,
T1Misc<{1,0,?,1,?,?,?}>,
- Requires<[IsThumb2]>, Sched<[WriteBr]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3715,15 +3723,21 @@ def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
let Inst{7-0} = imm;
}
-def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
-def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
-def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
-def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
-def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
-def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
-def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p), 1>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p), 1>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p), 1>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p), 1>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p), 1>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p), 1> {
let Predicates = [IsThumb2, HasV8];
}
+def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
+ let Predicates = [IsThumb2, HasRAS];
+}
+def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
+ let Predicates = [IsThumb2, HasRAS];
+}
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
[(int_arm_dbg imm0_15:$opt)]> {
@@ -3848,7 +3862,7 @@ def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
// ERET - Return from exception in Hypervisor mode.
// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
// includes virtualization extensions.
-def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p)>,
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
Requires<[IsThumb2, HasVirtualization]>;
//===----------------------------------------------------------------------===//
@@ -3871,7 +3885,7 @@ let isReMaterializable = 1 in {
def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
IIC_iMOVix2addpc,
[(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
- Requires<[IsThumb2, UseMovt]>;
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
}
@@ -3883,12 +3897,13 @@ def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst),
Requires<[IsThumb2, UseMovt]>;
// ConstantPool, GlobalAddress, and JumpTable
-def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
-def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
- Requires<[IsThumb2, UseMovt]>;
+def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
+def : T2Pat<(ARMWrapper texternalsym :$dst), (t2MOVi32imm texternalsym :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+ Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
-def : T2Pat<(ARMWrapperJT tjumptable:$dst),
- (t2LEApcrelJT tjumptable:$dst)>;
+def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>;
// Pseudo instruction that combines ldr from constpool and add pc. This should
// be expanded into two instructions late to allow if-conversion and
@@ -3910,16 +3925,16 @@ def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
//===----------------------------------------------------------------------===//
// Coprocessor load/store -- for disassembly only
//
-class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
- : T2I<oops, iops, NoItinerary, opc, asm, []> {
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, NoItinerary, opc, asm, pattern> {
let Inst{31-28} = op31_28;
let Inst{27-25} = 0b110;
}
-multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr"> {
+ asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -3936,7 +3951,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
}
def _PRE : T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
- asm, "\t$cop, $CRd, $addr!"> {
+ asm, "\t$cop, $CRd, $addr!", []> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -3954,7 +3969,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
def _POST: T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
- asm, "\t$cop, $CRd, $addr, $offset"> {
+ asm, "\t$cop, $CRd, $addr, $offset", []> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
@@ -3973,7 +3988,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
def _OPTION : T2CI<op31_28, (outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
- asm, "\t$cop, $CRd, $addr, $option"> {
+ asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
@@ -3991,14 +4006,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
}
}
-defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc">;
-defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl">;
-defm t2STC : t2LdStCop<0b1110, 0, 0, "stc">;
-defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8,IsThumb2]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8,IsThumb2]>;
-defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8,IsThumb2]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8,IsThumb2]>;
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
//===----------------------------------------------------------------------===//
@@ -4070,6 +4086,7 @@ def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
// same and the assembly parser has no way to distinguish between them. The mask
// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
// the mask with the fields to be accessed in the special register.
+let Defs = [CPSR] in
def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
NoItinerary, "msr", "\t$mask, $Rn", []>,
Requires<[IsThumb2,IsNotMClass]> {
@@ -4105,6 +4122,7 @@ def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
// M class MSR.
//
// Move from ARM core register to Special Register
+let Defs = [CPSR] in
def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
NoItinerary, "msr", "\t$SYSm, $Rn", []>,
Requires<[IsThumb,IsMClass]> {
@@ -4314,6 +4332,37 @@ def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>,
}
//===----------------------------------------------------------------------===//
+// ARMv8-M Security Extensions instructions
+//
+
+let hasSideEffects = 1 in
+def t2SG : T2I<(outs), (ins), NoItinerary, "sg", "", []>,
+ Requires<[Has8MSecExt]> {
+ let Inst = 0xe97fe97f;
+}
+
+class T2TT<bits<2> at, string asm, list<dag> pattern>
+ : T2I<(outs rGPR:$Rt), (ins GPRnopc:$Rn), NoItinerary, asm, "\t$Rt, $Rn",
+ pattern> {
+ bits<4> Rn;
+ bits<4> Rt;
+
+ let Inst{31-20} = 0b111010000100;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rt;
+ let Inst{7-6} = at;
+ let Inst{5-0} = 0b000000;
+
+ let Unpredictable{5-0} = 0b111111;
+}
+
+def t2TT : T2TT<0b00, "tt", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTT : T2TT<0b01, "ttt", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTA : T2TT<0b10, "tta", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>;
+
+//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
@@ -4488,9 +4537,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
-def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
-def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
-def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
@@ -4535,13 +4584,13 @@ def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
(t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
-// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
-// shift amount is zero (i.e., unspecified).
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
- (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
- (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
// PUSH/POP aliases for STM/LDM
@@ -4620,16 +4669,16 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
// Extend instruction optional rotate operand.
def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
- (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
- (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
- (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"sxtb16${p} $Rd, $Rm",
- (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"sxtb${p} $Rd, $Rm",
@@ -4642,16 +4691,16 @@ def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
(t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
- (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
- (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
- (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : InstAlias<"uxtb16${p} $Rd, $Rm",
- (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>,
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"uxtb${p} $Rd, $Rm",
@@ -4667,7 +4716,7 @@ def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
(t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
- (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
(t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4675,7 +4724,7 @@ def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
(t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
- (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>,
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
(t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4764,9 +4813,14 @@ def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
def : t2InstAlias<"add${p} $Rd, pc, $imm",
(t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+// Pseudo instruction ldr Rt, =immediate
+def t2LDRConstPool
+ : t2AsmPseudo<"ldr${p} $Rt, $immediate",
+ (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+
// PLD/PLDW/PLI with alternate literal form.
def : t2InstAlias<"pld${p} $addr",
(t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
def : InstAlias<"pli${p} $addr",
- (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p)>,
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
Requires<[IsThumb2,HasV7]>;
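A note on a pattern that recurs throughout the ARMInstrThumb2.td hunks above: many InstAlias records gain a trailing ", 0>" operand after the result dag (dmb/dsb/isb, pkhbt/pkhtb, the sxt*/uxt* forms, pli). Assuming the stock InstAlias class in include/llvm/Target/Target.td, whose optional third operand is the emit bit (default 1), that 0 keeps the alias usable by the assembly parser while telling the auto-generated instruction printer not to use it, presumably so output keeps the canonical spelling. A minimal annotated sketch, reusing the dmb alias from the hunk above (the comments are mine, not part of the patch):

// Third InstAlias operand = emit bit: 0 makes this a parse-only alias.
// The parser still accepts "dmb", but the printer keeps the canonical
// form with an explicit barrier option, e.g. "dmb sy" (option 0xf).
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>,
      Requires<[HasDB]>;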
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 63e7940bb14e..e29d265ae3d1 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -30,6 +30,18 @@ def FPImmOperand : AsmOperandClass {
let ParserMethod = "parseFPImm";
}
+def vfp_f16imm : Operand<f16>,
+ PatLeaf<(f16 fpimm), [{
+ return ARM_AM::getFP16Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP16Imm(InVal);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
+}
+
def vfp_f32imm : Operand<f32>,
PatLeaf<(f32 fpimm), [{
return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
@@ -98,6 +110,11 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let D = VFPNeonDomain;
}
+def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+ IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
+ []>,
+ Requires<[HasFullFP16]>;
+
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
@@ -112,6 +129,11 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let D = VFPNeonDomain;
}
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+ IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
+ []>,
+ Requires<[HasFullFP16]>;
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -200,6 +222,37 @@ defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
def : MnemonicAlias<"vldm", "vldmia">;
def : MnemonicAlias<"vstm", "vstmia">;
+
+//===----------------------------------------------------------------------===//
+// Lazy load / store multiple Instructions
+//
+let mayLoad = 1 in
+def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+ IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 1;
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayLoad = 1;
+}
+
+let mayStore = 1 in
+def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+ IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>,
+ Requires<[HasV8MMainline, Has8MSecExt]> {
+ let Inst{24-23} = 0b00;
+ let Inst{22} = 0;
+ let Inst{21} = 1;
+ let Inst{20} = 0;
+ let Inst{15-12} = 0;
+ let Inst{7-0} = 0;
+ let mayStore = 1;
+}
+
+
// FLDM/FSTM - Load / Store multiple single / double precision registers for
// pre-ARMv6 cores.
// These instructions are deprecated!
@@ -221,13 +274,13 @@ def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
def : VFP2MnemonicAlias<"fstmead", "vstmia">;
def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
-def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>,
+def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>,
Requires<[HasVFP2]>;
-def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>,
+def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>,
Requires<[HasVFP2]>;
defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
(VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
@@ -295,6 +348,12 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDH : AHbI<0b11100, 0b11, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -311,6 +370,12 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBH : AHbI<0b11100, 0b11, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -323,6 +388,12 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVH : AHbI<0b11101, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
@@ -339,6 +410,12 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let D = VFPNeonA8Domain;
}
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULH : AHbI<0b11100, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
@@ -353,9 +430,20 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
let D = VFPNeonA8Domain;
}
+def VNMULH : AHbI<0b11100, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
+ []>;
+
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
Uses = [CPSR], AddedComplexity = 4 in {
+ def H : AHbInp<0b11100, opc, 0,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
def S : ASbInp<0b11100, opc, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
@@ -378,6 +466,12 @@ defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+ def H : AHbInp<0b11101, 0b00, opc,
+ (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+ NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
+ []>,
+ Requires<[HasFullFP16]>;
+
def S : ASbInp<0b11101, 0b00, opc,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
@@ -418,6 +512,12 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
+ []>;
+
+
// FIXME: Verify encoding after integrated assembler is working.
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -432,6 +532,11 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
}
+
+def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
+ (outs), (ins SPR:$Sd, SPR:$Sm),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
+ []>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -452,6 +557,11 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
+ []>;
+
let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
@@ -473,6 +583,14 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
let D = VFPNeonA8Domain;
}
+def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
+
// FIXME: Verify encoding after integrated assembler is working.
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
@@ -493,6 +611,14 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
}
+
+def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
+ (outs), (ins SPR:$Sd),
+ IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
+ []> {
+ let Inst{3-0} = 0b0000;
+ let Inst{5} = 0;
+}
} // Defs = [FPSCR_NZCV]
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
@@ -627,6 +753,22 @@ def : Pat<(f64 (f16_to_fp GPR:$a)),
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
+ def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
+
def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
@@ -715,7 +857,21 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
let D = VFPNeonA8Domain;
}
+def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
+ []>;
+
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
+ def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{7} = op2;
+ let Inst{16} = op;
+ }
+
def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
@@ -733,11 +889,14 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
let Inst{16} = op;
}
+ def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+ Requires<[HasFullFP16]>;
def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
- (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
Requires<[HasFPARMv8]>;
def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p), 0>,
Requires<[HasFPARMv8,HasDPVFP]>;
}
@@ -748,6 +907,13 @@ defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
multiclass vrint_inst_anpm<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"),
+ []>,
+ Requires<[HasFullFP16]> {
+ let Inst{17-16} = rm;
+ }
def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
@@ -765,10 +931,10 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm,
}
def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
- (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>,
Requires<[HasFPARMv8]>;
def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm), 0>,
Requires<[HasFPARMv8,HasDPVFP]>;
}
@@ -787,6 +953,11 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
[(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
+ []>;
+
let hasSideEffects = 0 in {
def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
@@ -795,6 +966,18 @@ def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+
+let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+
+def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
+ Requires<[HasFullFP16]>;
+} // PostEncoderMethod
} // hasSideEffects
//===----------------------------------------------------------------------===//
@@ -966,6 +1149,44 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
let DecoderMethod = "DecodeVMOVSRR";
}
+// Move H->R, clearing top 16 bits
+def VMOVRH : AVConv2I<0b11100001, 0b1001,
+ (outs GPR:$Rt), (ins SPR:$Sn),
+ IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<4> Rt;
+ bits<5> Sn;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
+// Move R->H, clearing top 16 bits
+def VMOVHR : AVConv4I<0b11100000, 0b1001,
+ (outs SPR:$Sn), (ins GPR:$Rt),
+ IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
+ []>,
+ Requires<[HasFullFP16]> {
+ // Instruction operands.
+ bits<5> Sn;
+ bits<4> Rt;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Sn{4-1};
+ let Inst{7} = Sn{0};
+ let Inst{15-12} = Rt;
+
+ let Inst{6-5} = 0b00;
+ let Inst{3-0} = 0b0000;
+}
+
// FMRDH: SPR -> GPR
// FMRDL: SPR -> GPR
// FMRRS: SPR -> GPR
@@ -1011,6 +1232,25 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
}
+class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
@@ -1043,6 +1283,13 @@ def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
+def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // s32
+}
+
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1075,6 +1322,13 @@ def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
+def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // u32
+}
+
// FP -> Int:
class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1113,6 +1367,25 @@ class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
}
+class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+ bits<4> opcod4, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm,
+ list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+ pattern> {
+ // Instruction operands.
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Predicates = [HasFullFP16];
+}
+
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -1147,6 +1420,13 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1180,6 +1460,13 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 1; // Z bit
+}
+
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
// FIXME: Verify encoding after integrated assembler is working.
@@ -1197,6 +1484,13 @@ def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
let Inst{7} = 0; // Z bit
}
+def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
+
def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
@@ -1210,6 +1504,13 @@ def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
[(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
let Inst{7} = 0; // Z bit
}
+
+def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+ (outs SPR:$Sd), (ins SPR:$Sm),
+ IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
+ []> {
+ let Inst{7} = 0; // Z bit
+}
}
// Convert between floating-point and fixed-point
@@ -1249,6 +1550,26 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
let Predicates = [HasVFP2, HasDPVFP];
}
+def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
@@ -1299,6 +1620,26 @@ def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
// Fixed-Point to FP:
+def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
+ (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
+def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
+ (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+ IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
+ Requires<[HasFullFP16]>;
+
def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
@@ -1373,6 +1714,13 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
let D = VFPNeonA8Domain;
}
+def VMLAH : AHbI<0b11100, 0b00, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1400,6 +1748,13 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
let D = VFPNeonA8Domain;
}
+def VMLSH : AHbI<0b11100, 0b00, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1427,6 +1782,13 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
let D = VFPNeonA8Domain;
}
+def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1453,6 +1815,13 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
let D = VFPNeonA8Domain;
}
+def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1482,6 +1851,13 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
// VFP pipelines.
}
+def VFMAH : AHbI<0b11101, 0b10, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1517,6 +1893,13 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
// VFP pipelines.
}
+def VFMSH : AHbI<0b11101, 0b10, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1559,6 +1942,13 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
// VFP pipelines.
}
+def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1600,6 +1990,13 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
// VFP pipelines.
}
+def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
+ (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+ IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
+ []>,
+ RegConstraint<"$Sdin = $Sd">,
+ Requires<[HasFullFP16,UseFusedMAC]>;
+
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
@@ -1780,6 +2177,23 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
let Inst{7-4} = 0b0000;
let Inst{3-0} = imm{3-0};
}
+
+def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+ VFPMiscFrm, IIC_fpUNA16,
+ "vmov", ".f16\t$Sd, $imm",
+ []>, Requires<[HasFullFP16]> {
+ bits<5> Sd;
+ bits<8> imm;
+
+ let Inst{27-23} = 0b11101;
+ let Inst{22} = Sd{0};
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{11-8} = 0b1001; // Half precision
+ let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
+}
}
//===----------------------------------------------------------------------===//
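As context for the FCONSTH definition just above, which splits its 8-bit immediate as Inst{19-16} = imm{7-4} and Inst{3-0} = imm{3-0}, here is a worked example of the half-precision VFP modified-immediate form. This is my own sketch from the architectural VFPExpandImm() description, on the assumption that ARM_AM::getFP16Imm mirrors the existing getFP32Imm helper:

// imm8 = a:b:c:d:e:f:g:h expands to an f16 value with
//   sign      = a
//   exp<4:0>  = NOT(b):b:b:c:d
//   frac<9:0> = e:f:g:h:000000
// Example: 1.0 (IEEE half 0x3C00) has sign 0, exp 01111, frac 0,
// so a=0, b=1, c=d=1, e..h=0, giving imm8 = 0b01110000 = 0x70,
// i.e. Inst{19-16} = 0b0111 and Inst{3-0} = 0b0000.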
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 6e7e47b8706a..62d57f3f4986 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,9 +60,14 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
-namespace llvm {
-void initializeARMLoadStoreOptPass(PassRegistry &);
-}
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+ cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
@@ -71,9 +76,7 @@ namespace {
/// form ldm / stm instructions.
struct ARMLoadStoreOpt : public MachineFunctionPass {
static char ID;
- ARMLoadStoreOpt() : MachineFunctionPass(ID) {
- initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
- }
+ ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
const MachineFunction *MF;
const TargetInstrInfo *TII;
@@ -90,6 +93,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return ARM_LOAD_STORE_OPT_NAME;
}
@@ -101,8 +109,8 @@ namespace {
MachineInstr *MI;
int Offset; ///< Load/Store offset.
unsigned Position; ///< Position as counted from end of basic block.
- MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
- : MI(MI), Offset(Offset), Position(Position) {}
+ MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Position)
+ : MI(&MI), Offset(Offset), Position(Position) {}
};
typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
@@ -131,17 +139,19 @@ namespace {
MachineBasicBlock::const_iterator Before);
unsigned findFreeReg(const TargetRegisterClass &RegClass);
void UpdateBaseRegUses(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned Base, unsigned WordOffset,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned Base, unsigned WordOffset,
ARMCC::CondCodes Pred, unsigned PredReg);
- MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
- MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ MachineInstr *CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs);
+ MachineInstr *CreateLoadStoreDouble(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const;
void FormCandidates(const MemOpQueue &MemOps);
MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -156,10 +166,11 @@ namespace {
char ARMLoadStoreOpt::ID = 0;
}
-INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS(ARMLoadStoreOpt, "arm-ldst-opt", ARM_LOAD_STORE_OPT_NAME, false,
+ false)
-static bool definesCPSR(const MachineInstr *MI) {
- for (const auto &MO : MI->operands()) {
+static bool definesCPSR(const MachineInstr &MI) {
+ for (const auto &MO : MI.operands()) {
if (!MO.isReg())
continue;
if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
@@ -171,11 +182,11 @@ static bool definesCPSR(const MachineInstr *MI) {
return false;
}
-static int getMemoryOpOffset(const MachineInstr *MI) {
- unsigned Opcode = MI->getOpcode();
+static int getMemoryOpOffset(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
- unsigned NumOperands = MI->getDesc().getNumOperands();
- unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+ unsigned OffField = MI.getOperand(NumOperands - 3).getImm();
if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
@@ -436,12 +447,12 @@ static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
-void
-ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned Base,
- unsigned WordOffset,
- ARMCC::CondCodes Pred, unsigned PredReg) {
+void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Base,
+ unsigned WordOffset,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) {
assert(isThumb1 && "Can only update base register uses for Thumb1!");
// Start updating any instructions with immediate offsets. Insert a SUB before
// the first non-updateable instruction (if any).
@@ -475,7 +486,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
InsertSub = true;
} else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
- !definesCPSR(MBBI)) {
+ !definesCPSR(*MBBI)) {
// SUBS/ADDS using this register, with a dead def of the CPSR.
// Merge it with the update; if the merged offset is too large,
// insert a new sub instead.
@@ -499,7 +510,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
InsertSub = true;
}
- } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
+ } else if (definesCPSR(*MBBI) || MBBI->isCall() || MBBI->isBranch()) {
// Since SUBS sets the condition flags, we can't place the base reset
// after an instruction that has a live CPSR def.
// The base register might also contain an argument for a function call.
@@ -552,7 +563,7 @@ void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
// Initialize if we never queried in this block.
if (!LiveRegsValid) {
LiveRegs.init(TRI);
- LiveRegs.addLiveOuts(&MBB, true);
+ LiveRegs.addLiveOuts(MBB);
LiveRegPos = MBB.end();
LiveRegsValid = true;
}
@@ -574,10 +585,11 @@ static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored. It returns
/// true if the transformation is done.
-MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) {
unsigned NumRegs = Regs.size();
assert(NumRegs > 1);
@@ -770,10 +782,11 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
return MIB.getInstr();
}
-MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
- bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
- DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+ ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+ ArrayRef<std::pair<unsigned, bool>> Regs) const {
bool IsLoad = isi32Load(Opcode);
assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
@@ -836,11 +849,11 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
iterator InsertBefore = std::next(iterator(LatestMI));
MachineBasicBlock &MBB = *LatestMI->getParent();
- unsigned Offset = getMemoryOpOffset(First);
+ unsigned Offset = getMemoryOpOffset(*First);
unsigned Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
DebugLoc DL = First->getDebugLoc();
MachineInstr *Merged = nullptr;
if (Cand.CanMergeToLSDouble)
@@ -916,6 +929,24 @@ static bool isValidLSDoubleOffset(int Offset) {
return (Value % 4) == 0 && Value < 1024;
}
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+ const MachineInstr &MI) {
+ // vldr/vstr trap on misaligned pointers anyway, so forming vldm makes no
+ // difference.
+ unsigned Opcode = MI.getOpcode();
+ if (!isi32Load(Opcode) && !isi32Store(Opcode))
+ return true;
+
+ // Stack pointer alignment is out of the programmer's control, so we can trust
+ // SP-relative loads/stores.
+ if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+ STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+ return true;
+ return false;
+}
+
/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
const MachineInstr *FirstMI = MemOps[0].MI;
@@ -946,7 +977,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
bool CanMergeToLSMulti = true;
// On Swift, avoid vldm/vstm starting with an odd register number, as that
// needs more uops than single vldrs.
- if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+ if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
CanMergeToLSMulti = false;
// LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
@@ -954,6 +985,10 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (PReg == ARM::SP || PReg == ARM::PC)
CanMergeToLSMulti = CanMergeToLSDouble = false;
+ // Should we be conservative?
+ if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+
// Merge following instructions where possible.
for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
int NewOffset = MemOps[I].Offset;
@@ -1102,11 +1137,11 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
unsigned MIPredReg;
if (MI.getOperand(0).getReg() != Reg ||
MI.getOperand(1).getReg() != Reg ||
- getInstrPredicate(&MI, MIPredReg) != Pred ||
+ getInstrPredicate(MI, MIPredReg) != Pred ||
MIPredReg != PredReg)
return 0;
- if (CheckCPSRDef && definesCPSR(&MI))
+ if (CheckCPSRDef && definesCPSR(MI))
return 0;
return MI.getOperand(2).getImm() * Scale;
}
@@ -1169,7 +1204,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
unsigned Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1193,10 +1228,30 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
} else {
MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
- ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
- return false;
+ ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {
+
+ // We couldn't find an inc/dec to merge. But if the base is dead, we
+ // can still change to a writeback form as that will save us 2 bytes
+ // of code size. It can create WAW hazards though, so only do it if
+ // we're minimizing code size.
+ if (!MBB.getParent()->getFunction()->optForMinSize() || !BaseKill)
+ return false;
+
+ bool HighRegsUsed = false;
+ for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+ if (MI->getOperand(i).getReg() >= ARM::R8) {
+ HighRegsUsed = true;
+ break;
+ }
+
+ if (!HighRegsUsed)
+ MergeInstr = MBB.end();
+ else
+ return false;
+ }
}
- MBB.erase(MergeInstr);
+ if (MergeInstr != MBB.end())
+ MBB.erase(MergeInstr);
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@@ -1291,7 +1346,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
return false;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator MBBI(MI);
@@ -1388,7 +1443,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
return false;
unsigned PredReg;
- ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
MachineBasicBlock::iterator MBBI(MI);
MachineBasicBlock &MBB = *MI.getParent();
int Offset;
@@ -1487,14 +1542,13 @@ static bool isMemoryOp(const MachineInstr &MI) {
}
static void InsertLDR_STR(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- int Offset, bool isDef,
- DebugLoc DL, unsigned NewOpc,
+ MachineBasicBlock::iterator &MBBI, int Offset,
+ bool isDef, const DebugLoc &DL, unsigned NewOpc,
unsigned Reg, bool RegDeadKill, bool RegUndef,
unsigned BaseReg, bool BaseKill, bool BaseUndef,
- bool OffKill, bool OffUndef,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo *TII, bool isT2) {
+ bool OffKill, bool OffUndef, ARMCC::CondCodes Pred,
+ unsigned PredReg, const TargetInstrInfo *TII,
+ bool isT2) {
if (isDef) {
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
TII->get(NewOpc))
@@ -1547,9 +1601,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
bool BaseUndef = BaseOp.isUndef();
bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
- int OffImm = getMemoryOpOffset(MI);
+ int OffImm = getMemoryOpOffset(*MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
if (OddRegNum > EvenRegNum && OffImm == 0) {
// Ascending register numbers and no offset. It's safe to change it to a
@@ -1655,14 +1709,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
unsigned Reg = MO.getReg();
unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
- int Offset = getMemoryOpOffset(MBBI);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
+ int Offset = getMemoryOpOffset(*MBBI);
if (CurrBase == 0) {
// Start of a new chain.
CurrBase = Base;
CurrOpc = Opcode;
CurrPred = Pred;
- MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
continue;
}
// Note: No need to match PredReg in the next if.
@@ -1690,7 +1744,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (!Overlap) {
// Check offset and sort memory operation into the current chain.
if (Offset > MemOps.back().Offset) {
- MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
continue;
} else {
MemOpQueue::iterator MI, ME;
@@ -1706,7 +1760,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
}
}
if (MI != MemOps.end()) {
- MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
+ MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Position));
continue;
}
}
@@ -1723,7 +1777,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI->getOpcode() == ARM::t2STRDi8) {
// ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
// remember them because we may still be able to merge add/sub into them.
- MergeBaseCandidates.push_back(MBBI);
+ MergeBaseCandidates.push_back(&*MBBI);
}
@@ -1805,20 +1859,20 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
// Ignore any DBG_VALUE instructions.
while (PrevI->isDebugValue() && PrevI != MBB.begin())
--PrevI;
- MachineInstr *PrevMI = PrevI;
- unsigned Opcode = PrevMI->getOpcode();
+ MachineInstr &PrevMI = *PrevI;
+ unsigned Opcode = PrevMI.getOpcode();
if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
- MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+ MachineOperand &MO = PrevMI.getOperand(PrevMI.getNumOperands() - 1);
if (MO.getReg() != ARM::LR)
return false;
unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
- PrevMI->setDesc(TII->get(NewOpc));
+ PrevMI.setDesc(TII->get(NewOpc));
MO.setReg(ARM::PC);
- PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
+ PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI);
MBB.erase(MBBI);
return true;
}
@@ -1840,8 +1894,8 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
for (auto Use : Prev->uses())
if (Use.isKill()) {
AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
- .addReg(Use.getReg(), RegState::Kill))
- .copyImplicitOps(&*MBBI);
+ .addReg(Use.getReg(), RegState::Kill))
+ .copyImplicitOps(*MBBI);
MBB.erase(MBBI);
MBB.erase(Prev);
return true;
@@ -1851,6 +1905,9 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
}
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TL = STI->getTargetLowering();
@@ -1877,10 +1934,6 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
return Modified;
}
-namespace llvm {
-void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
-}
-
#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \
"ARM pre- register allocation load / store optimization pass"
@@ -1889,9 +1942,7 @@ namespace {
/// locations close to make it more likely they will be combined later.
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
static char ID;
- ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
- initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
- }
+ ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
const DataLayout *TD;
const TargetInstrInfo *TII;
@@ -1922,10 +1973,13 @@ namespace {
char ARMPreAllocLoadStoreOpt::ID = 0;
}
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
+INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ if (AssumeMisalignedLoadStores || skipFunction(*Fn.getFunction()))
+ return false;
+
TD = &Fn.getDataLayout();
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TII = STI->getInstrInfo();
@@ -2034,7 +2088,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
return false;
// Then make sure the immediate offset fits.
- int OffImm = getMemoryOpOffset(Op0);
+ int OffImm = getMemoryOpOffset(*Op0);
if (isT2) {
int Limit = (1 << 8) * Scale;
if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
@@ -2056,7 +2110,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
if (FirstReg == SecondReg)
return false;
BaseReg = Op0->getOperand(1).getReg();
- Pred = getInstrPredicate(Op0, PredReg);
+ Pred = getInstrPredicate(*Op0, PredReg);
dl = Op0->getDebugLoc();
return true;
}
@@ -2070,11 +2124,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// Sort by offset (in reverse order).
std::sort(Ops.begin(), Ops.end(),
[](const MachineInstr *LHS, const MachineInstr *RHS) {
- int LOffset = getMemoryOpOffset(LHS);
- int ROffset = getMemoryOpOffset(RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- });
+ int LOffset = getMemoryOpOffset(*LHS);
+ int ROffset = getMemoryOpOffset(*RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
@@ -2106,7 +2160,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (LastOpcode && LSMOpcode != LastOpcode)
break;
- int Offset = getMemoryOpOffset(Op);
+ int Offset = getMemoryOpOffset(*Op);
unsigned Bytes = getLSMultipleTransferSize(Op);
if (LastBytes) {
if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
@@ -2141,8 +2195,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
} else {
// This is the new location for the loads / stores.
MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
- while (InsertPos != MBB->end()
- && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
+ while (InsertPos != MBB->end() &&
+ (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
++InsertPos;
// If we are moving a pair of loads / stores, see if it makes sense
@@ -2237,25 +2291,25 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator E = MBB->end();
while (MBBI != E) {
for (; MBBI != E; ++MBBI) {
- MachineInstr *MI = MBBI;
- if (MI->isCall() || MI->isTerminator()) {
+ MachineInstr &MI = *MBBI;
+ if (MI.isCall() || MI.isTerminator()) {
// Stop at barriers.
++MBBI;
break;
}
- if (!MI->isDebugValue())
- MI2LocMap[MI] = ++Loc;
+ if (!MI.isDebugValue())
+ MI2LocMap[&MI] = ++Loc;
- if (!isMemoryOp(*MI))
+ if (!isMemoryOp(MI))
continue;
unsigned PredReg = 0;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
- int Opc = MI->getOpcode();
+ int Opc = MI.getOpcode();
bool isLd = isLoadSingle(Opc);
- unsigned Base = MI->getOperand(1).getReg();
+ unsigned Base = MI.getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
bool StopHere = false;
@@ -2264,15 +2318,15 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
Base2LdsMap.find(Base);
if (BI != Base2LdsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(BI->second[i])) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
- BI->second.push_back(MI);
+ BI->second.push_back(&MI);
} else {
- Base2LdsMap[Base].push_back(MI);
+ Base2LdsMap[Base].push_back(&MI);
LdBases.push_back(Base);
}
} else {
@@ -2280,15 +2334,15 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
Base2StsMap.find(Base);
if (BI != Base2StsMap.end()) {
for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(BI->second[i])) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
StopHere = true;
break;
}
}
if (!StopHere)
- BI->second.push_back(MI);
+ BI->second.push_back(&MI);
} else {
- Base2StsMap[Base].push_back(MI);
+ Base2StsMap[Base].push_back(&MI);
StBases.push_back(Base);
}
}
@@ -2335,4 +2389,3 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
return new ARMPreAllocLoadStoreOpt();
return new ARMLoadStoreOpt();
}
-
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index a2aca2d1a69e..7429acdb09ad 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -26,33 +26,22 @@ using namespace llvm;
MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
const MCSymbol *Symbol) {
- const MCExpr *Expr;
- unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK;
- switch (Option) {
- default: {
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- switch (Option) {
- default: llvm_unreachable("Unknown target flag on symbol operand");
- case ARMII::MO_NO_FLAG:
- break;
- case ARMII::MO_LO16:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- Expr = ARMMCExpr::createLower16(Expr, OutContext);
- break;
- case ARMII::MO_HI16:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
- OutContext);
- Expr = ARMMCExpr::createUpper16(Expr, OutContext);
- break;
- }
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
+ default:
+ llvm_unreachable("Unknown target flag on symbol operand");
+ case ARMII::MO_NO_FLAG:
break;
- }
-
- case ARMII::MO_PLT:
- Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_PLT,
- OutContext);
+ case ARMII::MO_LO16:
+ Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ Expr = ARMMCExpr::createLower16(Expr, OutContext);
+ break;
+ case ARMII::MO_HI16:
+ Expr =
+ MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ Expr = ARMMCExpr::createUpper16(Expr, OutContext);
break;
}
@@ -89,7 +78,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
break;
}
case MachineOperand::MO_ExternalSymbol:
- MCOp = GetSymbolRef(MO,
+ MCOp = GetSymbolRef(MO,
GetExternalSymbolSymbol(MO.getSymbolName()));
break;
case MachineOperand::MO_JumpTableIndex:
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 71ad7a4a732a..b6dee9ff8385 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -21,4 +21,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false),
- IsSplitCSR(false) {}
+ ArgumentStackSize(0), IsSplitCSR(false) {}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 68f9aec8cae5..f71497240ff3 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
#include "ARMSubtarget.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 30baf4263c11..73dcb9641b61 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -27,6 +27,11 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "optimise barriers pass";
}
@@ -46,6 +51,9 @@ static bool CanMovePastDMB(const MachineInstr *MI) {
}
bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// Vector to store the DMBs we will remove after the first iteration
std::vector<MachineInstr *> ToRemove;
// DMBType is the Imm value of the first operand. It determines whether it's a
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 528c4ec73781..47a99313025c 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -94,7 +94,7 @@ def : PredicateProlog<[{
(void)TII;
}]>;
-def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>;
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for ARM
@@ -186,38 +186,50 @@ def IIC_iStore_mu : InstrItinClass;
def IIC_Preload : InstrItinClass;
def IIC_Br : InstrItinClass;
def IIC_fpSTAT : InstrItinClass;
+def IIC_fpUNA16 : InstrItinClass;
def IIC_fpUNA32 : InstrItinClass;
def IIC_fpUNA64 : InstrItinClass;
+def IIC_fpCMP16 : InstrItinClass;
def IIC_fpCMP32 : InstrItinClass;
def IIC_fpCMP64 : InstrItinClass;
def IIC_fpCVTSD : InstrItinClass;
def IIC_fpCVTDS : InstrItinClass;
def IIC_fpCVTSH : InstrItinClass;
def IIC_fpCVTHS : InstrItinClass;
+def IIC_fpCVTIH : InstrItinClass;
def IIC_fpCVTIS : InstrItinClass;
def IIC_fpCVTID : InstrItinClass;
+def IIC_fpCVTHI : InstrItinClass;
def IIC_fpCVTSI : InstrItinClass;
def IIC_fpCVTDI : InstrItinClass;
def IIC_fpMOVIS : InstrItinClass;
def IIC_fpMOVID : InstrItinClass;
def IIC_fpMOVSI : InstrItinClass;
def IIC_fpMOVDI : InstrItinClass;
+def IIC_fpALU16 : InstrItinClass;
def IIC_fpALU32 : InstrItinClass;
def IIC_fpALU64 : InstrItinClass;
+def IIC_fpMUL16 : InstrItinClass;
def IIC_fpMUL32 : InstrItinClass;
def IIC_fpMUL64 : InstrItinClass;
+def IIC_fpMAC16 : InstrItinClass;
def IIC_fpMAC32 : InstrItinClass;
def IIC_fpMAC64 : InstrItinClass;
+def IIC_fpFMAC16 : InstrItinClass;
def IIC_fpFMAC32 : InstrItinClass;
def IIC_fpFMAC64 : InstrItinClass;
+def IIC_fpDIV16 : InstrItinClass;
def IIC_fpDIV32 : InstrItinClass;
def IIC_fpDIV64 : InstrItinClass;
+def IIC_fpSQRT16 : InstrItinClass;
def IIC_fpSQRT32 : InstrItinClass;
def IIC_fpSQRT64 : InstrItinClass;
+def IIC_fpLoad16 : InstrItinClass;
def IIC_fpLoad32 : InstrItinClass;
def IIC_fpLoad64 : InstrItinClass;
def IIC_fpLoad_m : InstrItinClass;
def IIC_fpLoad_mu : InstrItinClass;
+def IIC_fpStore16 : InstrItinClass;
def IIC_fpStore32 : InstrItinClass;
def IIC_fpStore64 : InstrItinClass;
def IIC_fpStore_m : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 2c6382542ab9..ba380cba100f 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -1065,11 +1065,11 @@ def CortexA8Itineraries : ProcessorItineraries<
// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
def CortexA8Model : SchedMachineModel {
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 2; // Optimistic load latency assuming bypass.
// This is overridden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+ let CompleteModel = 0;
let Itineraries = CortexA8Itineraries;
}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 9a1d22275646..519e595bd184 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2025,12 +2025,12 @@ def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
// Define a predicate to select the LDM based on number of memory addresses.
def A9LMAdr#NumAddr#Pred :
- SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
+ SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
} // foreach NumAddr
// Fall-back for unknown LDMs.
-def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
// LDM/VLDM/VLDn address generation latency & resources.
// Dynamically select the A9WriteAdrN sequence using a predicate.
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 3ad7730228e5..ea2bf4b578f0 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -374,7 +374,7 @@ let SchedModel = SwiftModel in {
}
// Predicate.
foreach NumAddr = 1-16 in {
- def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+ def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NumAddr>;
}
def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 6fded9c8ab73..3b99762f7157 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -21,12 +21,9 @@ using namespace llvm;
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
-SDValue ARMSelectionDAGInfo::
-EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- RTLIB::Libcall LC) const {
+SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
@@ -121,21 +118,17 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
TLI->getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
-SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
@@ -176,6 +169,12 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
// emit.
unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+ // Code size optimisation: do not inline memcpy if expansion results in
+ // more instructions than the library call.
+ if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction()->optForMinSize()) {
+ return SDValue();
+ }
+
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
for (unsigned I = 0; I != NumMEMCPYs; ++I) {
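// A hedged, stand-alone sketch (not in-tree code) of the -Oz check added in
// the hunk above: the inline LDM/STM expansion is abandoned whenever it would
// need more than one MEMCPY pseudo, since that is already larger than a call
// to __aeabi_memcpy. Helper names below are hypothetical.
#include <cstdio>

static unsigned numMemcpyPseudos(unsigned NumMemOps, unsigned MaxLoadsInLDM) {
  // Round up: each pseudo covers at most MaxLoadsInLDM word-sized operations.
  return (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
}

static bool fallBackToLibcall(unsigned NumMemOps, unsigned MaxLoadsInLDM,
                              bool OptForMinSize) {
  return OptForMinSize && numMemcpyPseudos(NumMemOps, MaxLoadsInLDM) > 1;
}

int main() {
  // A 48-byte copy is 12 word-sized ops; with 6 loads per LDM that is two
  // pseudos, so under minsize the library call wins (prints 1).
  std::printf("%d\n", fallBackToLibcall(12, 6, true) ? 1 : 0);
}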
@@ -213,8 +212,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
- SrcPtrInfo.getWithOffset(SrcOff),
- false, false, false, 0);
+ SrcPtrInfo.getWithOffset(SrcOff));
TFOps[i] = Loads[i].getValue(1);
++i;
SrcOff += VTSize;
@@ -237,7 +235,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
- DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+ DstPtrInfo.getWithOffset(DstOff));
++i;
DstOff += VTSize;
BytesLeft -= VTSize;
@@ -246,26 +244,18 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
makeArrayRef(TFOps, i));
}
-
-SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMMOVE);
}
-
-SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo) const {
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
RTLIB::MEMSET);
}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 289879ee1d7e..2ddb42c95397 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the ARM subclass for TargetSelectionDAGInfo.
+// This file defines the ARM subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
@@ -15,7 +15,8 @@
#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
#include "MCTargetDesc/ARMAddressingModes.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
@@ -35,35 +36,30 @@ namespace ARM_AM {
}
} // end namespace ARM_AM
-class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
+class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
- SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const override;
+ SDValue
+ EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
// Adjust parameters for memset, see RTABI section 4.3.4
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align,
- bool isVolatile,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
- SDValue EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
+ SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
RTLIB::Libcall LC) const;
};
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index bb6ae28065bd..1d7eef9ddcfd 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -88,10 +88,9 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle)
- : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU),
- IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
- FrameLowering(initializeFrameLowering(CPU, FS)),
+ : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
+ CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options),
+ TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
InstrInfo(isThumb1Only()
@@ -102,63 +101,10 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
TLInfo(TM, *this) {}
void ARMSubtarget::initializeEnvironment() {
- HasV4TOps = false;
- HasV5TOps = false;
- HasV5TEOps = false;
- HasV6Ops = false;
- HasV6MOps = false;
- HasV6KOps = false;
- HasV6T2Ops = false;
- HasV7Ops = false;
- HasV8Ops = false;
- HasV8_1aOps = false;
- HasV8_2aOps = false;
- HasVFPv2 = false;
- HasVFPv3 = false;
- HasVFPv4 = false;
- HasFPARMv8 = false;
- HasNEON = false;
- UseNEONForSinglePrecisionFP = false;
- UseMulOps = UseFusedMulOps;
- SlowFPVMLx = false;
- HasVMLxForwarding = false;
- SlowFPBrcc = false;
- InThumbMode = false;
- UseSoftFloat = false;
- HasThumb2 = false;
- NoARM = false;
- ReserveR9 = false;
- NoMovt = false;
- SupportsTailCall = false;
- HasFP16 = false;
- HasFullFP16 = false;
- HasD16 = false;
- HasHardwareDivide = false;
- HasHardwareDivideInARM = false;
- HasT2ExtractPack = false;
- HasDataBarrier = false;
- Pref32BitThumb = false;
- AvoidCPSRPartialUpdate = false;
- AvoidMOVsShifterOperand = false;
- HasRAS = false;
- HasMPExtension = false;
- HasVirtualization = false;
- FPOnlySP = false;
- HasPerfMon = false;
- HasTrustZone = false;
- HasCrypto = false;
- HasCRC = false;
- HasZeroCycleZeroing = false;
- StrictAlign = false;
- HasDSP = false;
- UseNaClTrap = false;
- GenLongCalls = false;
- UnsafeFPMath = false;
-
// MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this
// directly from it, but we can try to make sure they're consistent when both
// available.
- UseSjLjEH = isTargetDarwin() && !isTargetWatchOS();
+ UseSjLjEH = isTargetDarwin() && !isTargetWatchABI();
assert((!TM.getMCAsmInfo() ||
(TM.getMCAsmInfo()->getExceptionHandlingType() ==
ExceptionHandling::SjLj) == UseSjLjEH) &&
@@ -230,7 +176,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// registers are the 4 used for parameters. We don't currently do this
// case.
- SupportsTailCall = !isThumb1Only();
+ SupportsTailCall = !isThumb() || hasV8MBaselineOps();
if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
SupportsTailCall = false;
@@ -252,6 +198,53 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
(Options.UnsafeFPMath || isTargetDarwin()))
UseNEONForSinglePrecisionFP = true;
+
+ // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+ switch (ARMProcFamily) {
+ case Others:
+ case CortexA5:
+ break;
+ case CortexA7:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA8:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA9:
+ LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case CortexA12:
+ break;
+ case CortexA15:
+ MaxInterleaveFactor = 2;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ case CortexA17:
+ case CortexA32:
+ case CortexA35:
+ case CortexA53:
+ case CortexA57:
+ case CortexA72:
+ case CortexA73:
+ case CortexR4:
+ case CortexR4F:
+ case CortexR5:
+ case CortexR7:
+ case CortexM3:
+ case ExynosM1:
+ break;
+ case Krait:
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case Swift:
+ MaxInterleaveFactor = 2;
+ LdStMultipleTiming = SingleIssuePlusExtras;
+ PreISelOperandLatencyAdjustment = 1;
+ PartialUpdateClearance = 12;
+ break;
+ }
}
bool ARMSubtarget::isAPCS_ABI() const {
@@ -268,40 +261,16 @@ bool ARMSubtarget::isAAPCS16_ABI() const {
return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
}
+bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return true;
-/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
-bool
-ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
- Reloc::Model RelocM) const {
- if (RelocM == Reloc::Static)
- return false;
-
- bool isDef = GV->isStrongDefinitionForLinker();
-
- if (!isTargetMachO()) {
- // Extra load is needed for all externally visible.
- if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
- return false;
+ // 32-bit MachO has no relocation for a-b if a is undefined, even if b is in
+ // the section that is being relocated. This means we have to use a load even
+ // for GVs that are known to be local to the DSO.
+ if (isTargetDarwin() && TM.isPositionIndependent() &&
+ (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
return true;
- } else {
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (isDef)
- return false;
-
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return true;
-
- if (RelocM == Reloc::PIC_) {
- // If symbol visibility is hidden, we have a stub for common symbol
- // references and external declarations.
- if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
- // Hidden $non_lazy_ptr reference.
- return true;
- }
- }
return false;
}
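// A hedged mock (not the LLVM API) of the decision implemented above: a
// global needs an indirect access when it may resolve outside this DSO, or,
// under 32-bit Darwin PIC, when it is only a declaration or common symbol and
// the a-b relocation needed for a direct reference does not exist.
#include <iostream>

struct MockGV {
  bool DSOLocal;            // TM.shouldAssumeDSOLocal(...) in the real code
  bool DeclarationOrCommon; // isDeclarationForLinker() || hasCommonLinkage()
};

static bool needsIndirectAccess(const MockGV &GV, bool DarwinPIC) {
  if (!GV.DSOLocal)
    return true;
  if (DarwinPIC && GV.DeclarationOrCommon)
    return true;
  return false;
}

int main() {
  std::cout << needsIndirectAccess({true, true}, true) << '\n';  // 1: Darwin PIC declaration
  std::cout << needsIndirectAccess({true, false}, true) << '\n'; // 0: local definition
}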
@@ -332,21 +301,21 @@ bool ARMSubtarget::enablePostRAScheduler() const {
}
bool ARMSubtarget::enableAtomicExpand() const {
- return hasAnyDataBarrier() && !isThumb1Only();
+ return hasAnyDataBarrier() && (!isThumb() || hasV8MBaselineOps());
}
bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
// For general targets, the prologue can grow when VFPs are allocated with
// stride 4 (more vpush instructions). But WatchOS uses a compact unwind
// format which it's more important to get right.
- return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize());
+ return isTargetWatchABI() || (isSwift() && !MF.getFunction()->optForMinSize());
}
bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
// NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
// immediates as it is inherently position independent, and may be out of
// range otherwise.
- return !NoMovt && hasV6T2Ops() &&
+ return !NoMovt && hasV8MBaselineOps() &&
(isTargetWindows() || !MF.getFunction()->optForMinSize());
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 4d54e5751473..910de0e1e72d 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -43,8 +43,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
- CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
- CortexA57, CortexA72, Krait, Swift, ExynosM1
+ CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexM3,
+ CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
+ Krait, Swift, ExynosM1
};
enum ARMProcClassEnum {
None, AClass, RClass, MClass
@@ -52,188 +53,275 @@ protected:
enum ARMArchEnum {
ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
- ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a
+ ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline
};
+public:
+ /// What kind of timing do load multiple/store multiple instructions have.
+ enum ARMLdStMultipleTiming {
+ /// Can load/store 2 registers/cycle.
+ DoubleIssue,
+ /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+ /// is not 64-bit aligned.
+ DoubleIssueCheckUnalignedAccess,
+ /// Can load/store 1 register/cycle.
+ SingleIssue,
+ /// Can load/store 1 register/cycle, but needs an extra cycle for address
+ /// computation and potentially also for register writeback.
+ SingleIssuePlusExtras,
+ };
+
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
- ARMProcFamilyEnum ARMProcFamily;
+ ARMProcFamilyEnum ARMProcFamily = Others;
/// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
- ARMProcClassEnum ARMProcClass;
+ ARMProcClassEnum ARMProcClass = None;
/// ARMArch - ARM architecture
- ARMArchEnum ARMArch;
+ ARMArchEnum ARMArch = ARMv4t;
/// HasV4TOps, HasV5TOps, HasV5TEOps,
/// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
/// Specify whether target support specific ARM ISA variants.
- bool HasV4TOps;
- bool HasV5TOps;
- bool HasV5TEOps;
- bool HasV6Ops;
- bool HasV6MOps;
- bool HasV6KOps;
- bool HasV6T2Ops;
- bool HasV7Ops;
- bool HasV8Ops;
- bool HasV8_1aOps;
- bool HasV8_2aOps;
+ bool HasV4TOps = false;
+ bool HasV5TOps = false;
+ bool HasV5TEOps = false;
+ bool HasV6Ops = false;
+ bool HasV6MOps = false;
+ bool HasV6KOps = false;
+ bool HasV6T2Ops = false;
+ bool HasV7Ops = false;
+ bool HasV8Ops = false;
+ bool HasV8_1aOps = false;
+ bool HasV8_2aOps = false;
+ bool HasV8MBaselineOps = false;
+ bool HasV8MMainlineOps = false;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
- bool HasVFPv2;
- bool HasVFPv3;
- bool HasVFPv4;
- bool HasFPARMv8;
- bool HasNEON;
+ bool HasVFPv2 = false;
+ bool HasVFPv3 = false;
+ bool HasVFPv4 = false;
+ bool HasFPARMv8 = false;
+ bool HasNEON = false;
/// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
/// specified. Use the method useNEONForSinglePrecisionFP() to
/// determine if NEON should actually be used.
- bool UseNEONForSinglePrecisionFP;
+ bool UseNEONForSinglePrecisionFP = false;
/// UseMulOps - True if non-microcoded fused integer multiply-add and
/// multiply-subtract instructions should be used.
- bool UseMulOps;
+ bool UseMulOps = false;
/// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
- bool SlowFPVMLx;
+ bool SlowFPVMLx = false;
/// HasVMLxForwarding - If true, NEON has special multiplier accumulator
/// forwarding to allow mul + mla being issued back to back.
- bool HasVMLxForwarding;
+ bool HasVMLxForwarding = false;
/// SlowFPBrcc - True if floating point compare + branch is slow.
- bool SlowFPBrcc;
+ bool SlowFPBrcc = false;
/// InThumbMode - True if compiling for Thumb, false for ARM.
- bool InThumbMode;
+ bool InThumbMode = false;
/// UseSoftFloat - True if we're using software floating point features.
- bool UseSoftFloat;
+ bool UseSoftFloat = false;
/// HasThumb2 - True if Thumb2 instructions are supported.
- bool HasThumb2;
+ bool HasThumb2 = false;
/// NoARM - True if subtarget does not support ARM mode execution.
- bool NoARM;
+ bool NoARM = false;
/// ReserveR9 - True if R9 is not available as a general purpose register.
- bool ReserveR9;
+ bool ReserveR9 = false;
/// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
/// 32-bit imms (including global addresses).
- bool NoMovt;
+ bool NoMovt = false;
/// SupportsTailCall - True if the OS supports tail call. The dynamic linker
/// must be able to synthesize call stubs for interworking between ARM and
/// Thumb.
- bool SupportsTailCall;
+ bool SupportsTailCall = false;
/// HasFP16 - True if subtarget supports half-precision FP conversions
- bool HasFP16;
+ bool HasFP16 = false;
/// HasFullFP16 - True if subtarget supports half-precision FP operations
- bool HasFullFP16;
+ bool HasFullFP16 = false;
/// HasD16 - True if subtarget is limited to 16 double precision
/// FP registers for VFPv3.
- bool HasD16;
+ bool HasD16 = false;
/// HasHardwareDivide - True if subtarget supports [su]div
- bool HasHardwareDivide;
+ bool HasHardwareDivide = false;
/// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
- bool HasHardwareDivideInARM;
+ bool HasHardwareDivideInARM = false;
/// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
/// instructions.
- bool HasT2ExtractPack;
+ bool HasT2ExtractPack = false;
/// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
/// instructions.
- bool HasDataBarrier;
+ bool HasDataBarrier = false;
+
+ /// HasV7Clrex - True if the subtarget supports CLREX instructions
+ bool HasV7Clrex = false;
+
+ /// HasAcquireRelease - True if the subtarget supports v8 atomics (LDA/LDAEX etc)
+ /// instructions
+ bool HasAcquireRelease = false;
/// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
/// over 16-bit ones.
- bool Pref32BitThumb;
+ bool Pref32BitThumb = false;
/// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
/// that partially update CPSR and add false dependency on the previous
/// CPSR setting instruction.
- bool AvoidCPSRPartialUpdate;
+ bool AvoidCPSRPartialUpdate = false;
/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
- bool AvoidMOVsShifterOperand;
+ bool AvoidMOVsShifterOperand = false;
- /// HasRAS - Some processors perform return stack prediction. CodeGen should
+ /// HasRetAddrStack - Some processors perform return stack prediction. CodeGen should
/// avoid issue "normal" call instructions to callees which do not return.
- bool HasRAS;
+ bool HasRetAddrStack = false;
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
- bool HasMPExtension;
+ bool HasMPExtension = false;
/// HasVirtualization - True if the subtarget supports the Virtualization
/// extension.
- bool HasVirtualization;
+ bool HasVirtualization = false;
/// FPOnlySP - If true, the floating point unit only supports single
/// precision.
- bool FPOnlySP;
+ bool FPOnlySP = false;
/// If true, the processor supports the Performance Monitor Extensions. These
/// include a generic cycle-counter as well as more fine-grained (often
/// implementation-specific) events.
- bool HasPerfMon;
+ bool HasPerfMon = false;
/// HasTrustZone - if true, processor supports TrustZone security extensions
- bool HasTrustZone;
+ bool HasTrustZone = false;
+
+ /// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
+ bool Has8MSecExt = false;
/// HasCrypto - if true, processor supports Cryptography extensions
- bool HasCrypto;
+ bool HasCrypto = false;
/// HasCRC - if true, processor supports CRC instructions
- bool HasCRC;
+ bool HasCRC = false;
+
+ /// HasRAS - if true, the processor supports RAS extensions
+ bool HasRAS = false;
/// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
/// particularly effective at zeroing a VFP register.
- bool HasZeroCycleZeroing;
+ bool HasZeroCycleZeroing = false;
+
+ /// If true, if conversion may decide to leave some instructions unpredicated.
+ bool IsProfitableToUnpredicate = false;
+
+ /// If true, VMOV will be favored over VGETLNi32.
+ bool HasSlowVGETLNi32 = false;
+
+ /// If true, VMOV will be favored over VDUP.
+ bool HasSlowVDUP32 = false;
+
+ /// If true, VMOVSR will be favored over VMOVDRR.
+ bool PreferVMOVSR = false;
+
+ /// If true, ISHST barriers will be used for Release semantics.
+ bool PreferISHST = false;
+
+ /// If true, a VLDM/VSTM starting with an odd register number is considered to
+ /// take more microops than single VLDRS/VSTRS.
+ bool SlowOddRegister = false;
+
+ /// If true, loading into a D subregister will be penalized.
+ bool SlowLoadDSubregister = false;
+
+ /// If true, the AGU and NEON/FPU units are multiplexed.
+ bool HasMuxedUnits = false;
+
+ /// If true, VMOVS will never be widened to VMOVD
+ bool DontWidenVMOVS = false;
+
+ /// If true, run the MLx expansion pass.
+ bool ExpandMLx = false;
+
+ /// If true, VFP/NEON VMLA/VMLS have special RAW hazards.
+ bool HasVMLxHazards = false;
+
+ /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
+ bool UseNEONForFPMovs = false;
+
+ /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+ bool CheckVLDnAlign = false;
+
+ /// If true, VFP instructions are not pipelined.
+ bool NonpipelinedVFP = false;
/// StrictAlign - If true, the subtarget disallows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsMisalignedMemoryAccesses().
- bool StrictAlign;
+ bool StrictAlign = false;
/// RestrictIT - If true, the subtarget disallows generation of deprecated IT
/// blocks to conform to ARMv8 rule.
- bool RestrictIT;
+ bool RestrictIT = false;
/// HasDSP - If true, the subtarget supports the DSP (saturating arith
/// and such) instructions.
- bool HasDSP;
+ bool HasDSP = false;
/// NaCl TRAP instruction is generated instead of the regular TRAP.
- bool UseNaClTrap;
+ bool UseNaClTrap = false;
/// Generate calls via indirect call instructions.
- bool GenLongCalls;
+ bool GenLongCalls = false;
/// Target machine allowed unsafe FP math (such as use of NEON fp)
- bool UnsafeFPMath;
+ bool UnsafeFPMath = false;
/// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
- bool UseSjLjEH;
+ bool UseSjLjEH = false;
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment;
+ unsigned stackAlignment = 4;
/// CPUString - String name of used CPU.
std::string CPUString;
+ unsigned MaxInterleaveFactor = 1;
+
+ /// Clearance before partial register updates (in number of instructions)
+ unsigned PartialUpdateClearance = 0;
+
+ /// What kind of timing do load multiple/store multiple have (double issue,
+ /// single issue etc).
+ ARMLdStMultipleTiming LdStMultipleTiming = SingleIssue;
+
+ /// The adjustment that we need to apply to get the operand latency from the
+ /// operand cycle returned by the itinerary data for pre-ISel operands.
+ int PreISelOperandLatencyAdjustment = 2;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
@@ -313,17 +401,23 @@ public:
bool hasV8Ops() const { return HasV8Ops; }
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
+ bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
+ bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
+ /// @{
+ /// These functions are obsolete, please consider adding subtarget features
+ /// or properties instead of calling them.
bool isCortexA5() const { return ARMProcFamily == CortexA5; }
bool isCortexA7() const { return ARMProcFamily == CortexA7; }
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isSwift() const { return ARMProcFamily == Swift; }
- bool isCortexM3() const { return CPUString == "cortex-m3"; }
+ bool isCortexM3() const { return ARMProcFamily == CortexM3; }
bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
bool isCortexR5() const { return ARMProcFamily == CortexR5; }
bool isKrait() const { return ARMProcFamily == Krait; }
+ /// @}
bool hasARMOps() const { return !NoARM; }
@@ -334,6 +428,7 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool hasRAS() const { return HasRAS; }
bool hasVirtualization() const { return HasVirtualization; }
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP;
@@ -343,6 +438,8 @@ public:
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool hasV7Clrex() const { return HasV7Clrex; }
+ bool hasAcquireRelease() const { return HasAcquireRelease; }
bool hasAnyDataBarrier() const {
return HasDataBarrier || (hasV6Ops() && !isThumb());
}
@@ -353,11 +450,26 @@ public:
bool isFPOnlySP() const { return FPOnlySP; }
bool hasPerfMon() const { return HasPerfMon; }
bool hasTrustZone() const { return HasTrustZone; }
+ bool has8MSecExt() const { return Has8MSecExt; }
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+ bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; }
+ bool hasSlowVGETLNi32() const { return HasSlowVGETLNi32; }
+ bool hasSlowVDUP32() const { return HasSlowVDUP32; }
+ bool preferVMOVSR() const { return PreferVMOVSR; }
+ bool preferISHSTBarriers() const { return PreferISHST; }
+ bool expandMLx() const { return ExpandMLx; }
+ bool hasVMLxHazards() const { return HasVMLxHazards; }
+ bool hasSlowOddRegister() const { return SlowOddRegister; }
+ bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+ bool hasMuxedUnits() const { return HasMuxedUnits; }
+ bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+ bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+ bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+ bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
- bool hasRAS() const { return HasRAS; }
+ bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
@@ -373,6 +485,7 @@ public:
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
+ bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
@@ -399,14 +512,21 @@ public:
TargetTriple.getEnvironment() == Triple::GNUEABIHF) &&
!isTargetDarwin() && !isTargetWindows();
}
+ bool isTargetMuslAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
// ARM Targets that support EHABI exception handling standard
// Darwin uses SjLj. Other targets might need more checks.
bool isTargetEHABICompatible() const {
return (TargetTriple.getEnvironment() == Triple::EABI ||
TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABI ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
isTargetAndroid()) &&
!isTargetDarwin() && !isTargetWindows();
}
@@ -414,6 +534,7 @@ public:
bool isTargetHardFloat() const {
// FIXME: this is invalid for WindowsCE
return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
isTargetWindows() || isAAPCS16_ABI();
}
@@ -436,6 +557,13 @@ public:
return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
}
+ /// Returns true if the frame setup is split into two separate pushes (first
+ /// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
+ /// to lr.
+ bool splitFramePushPop() const {
+ return isTargetMachO();
+ }
+
bool useStride4VFPs(const MachineFunction &MF) const;
bool useMovt(const MachineFunction &MF) const;
@@ -476,9 +604,20 @@ public:
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
- /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
- /// symbol.
- bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+ unsigned getPartialUpdateClearance() const { return PartialUpdateClearance; }
+
+ ARMLdStMultipleTiming getLdStMultipleTiming() const {
+ return LdStMultipleTiming;
+ }
+
+ int getPreISelOperandLatencyAdjustment() const {
+ return PreISelOperandLatencyAdjustment;
+ }
+
+ /// True if the GV will be accessed via an indirect symbol.
+ bool isGVIndirectSymbol(const GlobalValue *GV) const;
/// True if fast-isel is used.
bool useFastISel() const;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index fca1901dc57c..dc730a675bef 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -53,6 +54,10 @@ extern "C" void LLVMInitializeARMTarget() {
RegisterTargetMachine<ARMBETargetMachine> Y(TheARMBETarget);
RegisterTargetMachine<ThumbLETargetMachine> A(TheThumbLETarget);
RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget);
+
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeARMLoadStoreOptPass(Registry);
+ initializeARMPreAllocLoadStoreOptPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -85,7 +90,7 @@ computeTargetABI(const Triple &TT, StringRef CPU,
(TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
CPU.startswith("cortex-m")) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else if (TT.isWatchOS()) {
+ } else if (TT.isWatchABI()) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
} else {
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
@@ -99,6 +104,8 @@ computeTargetABI(const Triple &TT, StringRef CPU,
case llvm::Triple::Android:
case llvm::Triple::GNUEABI:
case llvm::Triple::GNUEABIHF:
+ case llvm::Triple::MuslEABI:
+ case llvm::Triple::MuslEABIHF:
case llvm::Triple::EABIHF:
case llvm::Triple::EABI:
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
@@ -171,15 +178,30 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
return Ret;
}
-/// TargetMachine ctor - Create an ARM architecture model.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ // Default relocation model on Darwin is PIC.
+ return TT.isOSBinFormatMachO() ? Reloc::PIC_ : Reloc::Static;
+
+ // DynamicNoPIC is only used on darwin.
+ if (*RM == Reloc::DynamicNoPIC && !TT.isOSDarwin())
+ return Reloc::Static;
+
+ return *RM;
+}
+
+/// Create an ARM architecture model.
///
ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
- CPU, FS, Options, RM, CM, OL),
+ CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
+ OL),
TargetABI(computeTargetABI(TT, CPU, Options)),
TLOF(createTLOF(getTargetTriple())),
Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
@@ -192,7 +214,8 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
// Default to triple-appropriate EABI
if (Options.EABIVersion == EABI::Default ||
Options.EABIVersion == EABI::Unknown) {
- if (Subtarget.isTargetGNUAEABI())
+ // musl is compatible with glibc with regard to EABI version
+ if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
this->Options.EABIVersion = EABI::GNU;
else
this->Options.EABIVersion = EABI::EABI5;
@@ -219,7 +242,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// it as a key for the subtarget since that can be the only difference
// between two functions.
bool SoftFloat =
- F.hasFnAttribute("use-soft-float") &&
F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
@@ -248,8 +270,9 @@ void ARMTargetMachine::anchor() {}
ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle)
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL,
+ bool isLittle)
: ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
initAsmInfo();
if (!Subtarget.hasARMOps())
@@ -262,7 +285,8 @@ void ARMLETargetMachine::anchor() {}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -271,7 +295,8 @@ void ARMBETargetMachine::anchor() {}
ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
@@ -280,7 +305,8 @@ void ThumbTargetMachine::anchor() {}
ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle)
: ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
initAsmInfo();
@@ -291,7 +317,8 @@ void ThumbLETargetMachine::anchor() {}
ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -300,7 +327,8 @@ void ThumbBETargetMachine::anchor() {}
ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 8ad1f3dc2c34..c6b70b953162 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -39,7 +39,7 @@ protected:
public:
ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool isLittle);
~ARMBaseTargetMachine() override;
@@ -58,39 +58,40 @@ public:
}
};
-/// ARMTargetMachine - ARM target machine.
+/// ARM target machine.
///
class ARMTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
public:
ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
};
-/// ARMLETargetMachine - ARM little endian target machine.
+/// ARM little endian target machine.
///
class ARMLETargetMachine : public ARMTargetMachine {
void anchor() override;
public:
ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ARMBETargetMachine - ARM big endian target machine.
+/// ARM big endian target machine.
///
class ARMBETargetMachine : public ARMTargetMachine {
void anchor() override;
public:
ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ThumbTargetMachine - Thumb target machine.
+/// Thumb target machine.
/// Due to the way architectures are handled, this represents both
/// Thumb-1 and Thumb-2.
///
@@ -99,29 +100,29 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
public:
ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL,
- bool isLittle);
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
};
-/// ThumbLETargetMachine - Thumb little endian target machine.
+/// Thumb little endian target machine.
///
class ThumbLETargetMachine : public ThumbTargetMachine {
void anchor() override;
public:
ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// ThumbBETargetMachine - Thumb big endian target machine.
+/// Thumb big endian target machine.
///
class ThumbBETargetMachine : public ThumbTargetMachine {
void anchor() override;
public:
ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 98e8763c4705..b1db201cb30d 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -21,10 +21,10 @@ class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
protected:
const MCSection *AttributesSection;
public:
- ARMElfTargetObjectFile() :
- TargetLoweringObjectFileELF(),
- AttributesSection(nullptr)
- {}
+ ARMElfTargetObjectFile()
+ : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
+ }
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c1520119ef21..13c5dc61acd9 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -18,12 +18,12 @@ using namespace llvm;
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
- unsigned Bits = Ty->getPrimitiveSizeInBits();
- if (Bits == 0 || Bits > 32)
- return 4;
+ unsigned Bits = Ty->getPrimitiveSizeInBits();
+ if (Bits == 0 || Imm.getActiveBits() >= 64)
+ return 4;
- int32_t SImmVal = Imm.getSExtValue();
- uint32_t ZImmVal = Imm.getZExtValue();
+ int64_t SImmVal = Imm.getSExtValue();
+ uint64_t ZImmVal = Imm.getZExtValue();
if (!ST->isThumb()) {
if ((SImmVal >= 0 && SImmVal < 65536) ||
(ARM_AM::getSOImmVal(ZImmVal) != -1) ||
@@ -47,6 +47,32 @@ int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 3;
}
+
+// Constants smaller than 256 fit in the immediate field of
+// Thumb1 instructions, so we return a cost of zero; otherwise we return 1.
+int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
+ if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
+ return 0;
+
+ return 1;
+}
+
+int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ // Division by a constant can be turned into multiplication, but only if we
+ // know it's constant. So it's not so much that the immediate is cheap (it's
+ // not), but that the alternative is worse.
+ // FIXME: this is probably unneeded with GlobalISel.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+ Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+ Idx == 1)
+ return 0;
+
+ return getIntImmCost(Imm, Ty);
+}
+
+
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
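// A hedged illustration (not LLVM code) of why the divisor immediate above is
// costed as free: an unsigned divide by a known constant such as 10 is
// strength-reduced to a multiply-high plus shift, so the constant never has to
// be materialised as a divide operand at all.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t divideBy10(uint32_t X) {
  // 0xCCCCCCCD == ceil(2^35 / 10); the identity holds for every 32-bit input.
  return static_cast<uint32_t>((static_cast<uint64_t>(X) * 0xCCCCCCCDu) >> 35);
}

int main() {
  for (uint32_t X : {0u, 9u, 10u, 12345u, 0xFFFFFFFFu})
    assert(divideBy10(X) == X / 10);
}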
@@ -244,10 +270,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
// Penalize inserting into an D-subregister. We end up with a three times
// lower estimated throughput on swift.
- if (ST->isSwift() &&
- Opcode == Instruction::InsertElement &&
- ValTy->isVectorTy() &&
- ValTy->getScalarSizeInBits() <= 32)
+ if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
+ ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
return 3;
if ((Opcode == Instruction::InsertElement ||
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7d8d2381c983..a0ca9e648002 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,12 +54,24 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
+ /// Floating-point computation using ARMv8 AArch32 Advanced
+ /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
+ /// is IEEE-754 compliant, but it's not covered in this target.
+ bool isFPVectorizationPotentiallyUnsafe() {
+ return !ST->isTargetDarwin();
+ }
+
/// \name Scalar TTI Implementations
/// @{
+ int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+
using BaseT::getIntImmCost;
int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+
/// @}
/// \name Vector TTI Implementations
@@ -88,10 +100,7 @@ public:
}
unsigned getMaxInterleaveFactor(unsigned VF) {
- // These are out of order CPUs:
- if (ST->isCortexA15() || ST->isSwift())
- return 2;
- return 1;
+ return ST->getMaxInterleaveFactor();
}
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
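// A hedged mock (not the LLVM classes) of the pattern used in the hunk above:
// the per-CPU test isCortexA15() || isSwift() becomes a tunable subtarget
// property that the cost model simply reads back.
#include <iostream>

struct MockSubtarget {
  unsigned MaxInterleaveFactor = 1; // set once when the CPU family is known
  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
};

static unsigned pickInterleaveFactor(const MockSubtarget &ST) {
  // Previously: return (isCortexA15() || isSwift()) ? 2 : 1;
  return ST.getMaxInterleaveFactor();
}

int main() {
  MockSubtarget Swift;
  Swift.MaxInterleaveFactor = 2; // as initSubtargetFeatures() does for Swift
  std::cout << pickInterleaveFactor(Swift) << '\n'; // prints 2
}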
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c69a741244cf..7d49302f9a96 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -20,7 +20,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -31,20 +31,20 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -257,9 +257,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasThumb() const {
return getSTI().getFeatureBits()[ARM::HasV4TOps];
}
+ bool hasThumb2() const {
+ return getSTI().getFeatureBits()[ARM::FeatureThumb2];
+ }
bool hasV6Ops() const {
return getSTI().getFeatureBits()[ARM::HasV6Ops];
}
+ bool hasV6T2Ops() const {
+ return getSTI().getFeatureBits()[ARM::HasV6T2Ops];
+ }
bool hasV6MOps() const {
return getSTI().getFeatureBits()[ARM::HasV6MOps];
}
@@ -269,6 +275,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV8Ops() const {
return getSTI().getFeatureBits()[ARM::HasV8Ops];
}
+ bool hasV8MBaseline() const {
+ return getSTI().getFeatureBits()[ARM::HasV8MBaselineOps];
+ }
+ bool hasV8MMainline() const {
+ return getSTI().getFeatureBits()[ARM::HasV8MMainlineOps];
+ }
+ bool has8MSecExt() const {
+ return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
+ }
bool hasARM() const {
return !getSTI().getFeatureBits()[ARM::FeatureNoARM];
}
@@ -281,12 +296,16 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV8_1aOps() const {
return getSTI().getFeatureBits()[ARM::HasV8_1aOps];
}
+ bool hasRAS() const {
+ return getSTI().getFeatureBits()[ARM::FeatureRAS];
+ }
void SwitchMode() {
MCSubtargetInfo &STI = copySTI();
uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
}
+ void FixModeAfterArchChange(bool WasThumb, SMLoc Loc);
bool isMClass() const {
return getSTI().getFeatureBits()[ARM::FeatureMClass];
}
@@ -417,8 +436,9 @@ class ARMOperand : public MCParsedAsmOperand {
k_ShifterImmediate,
k_RotateImmediate,
k_ModifiedImmediate,
+ k_ConstantPoolImmediate,
k_BitfieldDescriptor,
- k_Token
+ k_Token,
} Kind;
SMLoc StartLoc, EndLoc, AlignmentLoc;
@@ -611,6 +631,11 @@ public:
return Imm.Val;
}
+ const MCExpr *getConstantPoolImm() const {
+ assert(isConstantPoolImm() && "Invalid access!");
+ return Imm.Val;
+ }
+
unsigned getVectorIndex() const {
assert(Kind == k_VectorIndex && "Invalid access!");
return VectorIndex.Val;
@@ -648,7 +673,27 @@ public:
bool isCCOut() const { return Kind == k_CCOut; }
bool isITMask() const { return Kind == k_ITCondMask; }
bool isITCondCode() const { return Kind == k_CondCode; }
- bool isImm() const override { return Kind == k_Immediate; }
+ bool isImm() const override {
+ return Kind == k_Immediate;
+ }
+
+ bool isARMBranchTarget() const {
+ if (!isImm()) return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+ return CE->getValue() % 4 == 0;
+ return true;
+ }
+
+
+ bool isThumbBranchTarget() const {
+ if (!isImm()) return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+ return CE->getValue() % 2 == 0;
+ return true;
+ }
+
// checks whether this operand is an unsigned offset which fits in a field
// of specified width and scaled by a specific number of bits
template<unsigned width, unsigned scale>
@@ -1036,6 +1081,7 @@ public:
return ARM_AM::getSOImmVal(Value) == -1 &&
ARM_AM::getSOImmVal(-Value) != -1;
}
+ bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
bool isPostIdxReg() const {
@@ -1183,6 +1229,20 @@ public:
return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
Val == INT32_MIN;
}
+ bool isAddrMode5FP16() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm() && !isa<MCConstantExpr>(getImm()))
+ return true;
+ if (!isMem() || Memory.Alignment != 0) return false;
+ // Check for register offset.
+ if (Memory.OffsetRegNum) return false;
+ // Immediate offset in range [-510, 510] and a multiple of 2.
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val >= -510 && Val <= 510 && ((Val & 1) == 0)) || Val == INT32_MIN;
+ }
bool isMemTBB() const {
if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
@@ -1203,7 +1263,7 @@ public:
}
bool isT2MemRegOffset() const {
if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
- Memory.Alignment != 0)
+ Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
return false;
// Only lsl #{0, 1, 2, 3} allowed.
if (Memory.ShiftType == ARM_AM::no_shift)
@@ -1319,6 +1379,7 @@ public:
// If we have an immediate that's not a constant, treat it as a label
// reference needing a fixup. If it is a constant, it's something else
// and we reject it.
+
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
@@ -1329,6 +1390,11 @@ public:
int64_t Val = Memory.OffsetImm->getValue();
return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
}
+ bool isConstPoolAsmImm() const {
+ // Delay processing of the constant pool immediate; this will turn into
+ // a constant later. Match no other operand.
+ return (isConstantPoolImm());
+ }
bool isPostIdxImm8() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -1665,7 +1731,7 @@ public:
if (!CE) return false;
uint64_t Value = CE->getValue();
// i64 value with each byte being either 0 or 0xff.
- for (unsigned i = 0; i < 8; ++i)
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8)
if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false;
return true;
}
@@ -1680,6 +1746,16 @@ public:
Inst.addOperand(MCOperand::createExpr(Expr));
}
+ void addARMBranchTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addThumbBranchTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
@@ -1941,6 +2017,7 @@ public:
}
const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+
assert(SR && "Unknown value type!");
Inst.addOperand(MCOperand::createExpr(SR));
return;
@@ -2145,6 +2222,28 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
+ void addAddrMode5FP16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm()) {
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ Inst.addOperand(MCOperand::createImm(0));
+ return;
+ }
+
+ // The lower bit is always zero and as such is not encoded.
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 2 : 0;
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
// If we have an immediate that's not a constant, treat it as a label
@@ -2214,6 +2313,14 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
+ void addConstPoolAsmImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // This is a container for the immediate that we will create the
+ // constant pool from.
+ addExpr(Inst, getConstantPoolImm());
+ return;
+ }
+
void addMemTBBOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
@@ -2594,6 +2701,15 @@ public:
}
static std::unique_ptr<ARMOperand>
+ CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<ARMOperand>
CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
Op->Bitfield.LSB = LSB;
@@ -2850,6 +2966,9 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << "<mod_imm #" << ModImm.Bits << ", #"
<< ModImm.Rot << ")>";
break;
+ case k_ConstantPoolImmediate:
+ OS << "<constant_pool_imm #" << *getConstantPoolImm();
+ break;
case k_BitfieldDescriptor:
OS << "<bitfield " << "lsb: " << Bitfield.LSB
<< ", width: " << Bitfield.Width << ">";
@@ -3969,6 +4088,18 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
.Case("basepri_max", 0x812)
.Case("faultmask", 0x813)
.Case("control", 0x814)
+ .Case("msplim", 0x80a)
+ .Case("psplim", 0x80b)
+ .Case("msp_ns", 0x888)
+ .Case("psp_ns", 0x889)
+ .Case("msplim_ns", 0x88a)
+ .Case("psplim_ns", 0x88b)
+ .Case("primask_ns", 0x890)
+ .Case("basepri_ns", 0x891)
+ .Case("basepri_max_ns", 0x892)
+ .Case("faultmask_ns", 0x893)
+ .Case("control_ns", 0x894)
+ .Case("sp_ns", 0x898)
.Default(~0U);
if (FlagsVal == ~0U)
@@ -3983,6 +4114,14 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
// basepri, basepri_max and faultmask only valid for V7m.
return MatchOperand_NoMatch;
+ if (!has8MSecExt() && (FlagsVal == 0x80a || FlagsVal == 0x80b ||
+ (FlagsVal > 0x814 && FlagsVal < 0xc00)))
+ return MatchOperand_NoMatch;
+
+ if (!hasV8MMainline() && (FlagsVal == 0x88a || FlagsVal == 0x88b ||
+ (FlagsVal > 0x890 && FlagsVal <= 0x893)))
+ return MatchOperand_NoMatch;
+
Parser.Lex(); // Eat identifier token.
Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
return MatchOperand_Success;
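The two feature checks added above gate the new v8-M special registers: msplim/psplim and all of the banked *_ns encodings require the 8-M Security Extension, and a few of the banked ones additionally require v8-M Mainline. Below is a minimal standalone sketch of that gating with the ranges copied from the checks above; isAllowedMSRMask is a hypothetical helper used only for illustration, not part of the patch.

  #include <cstdio>

  // Sketch of the feature gating added to parseMSRMaskOperand: encodings
  // 0x80a/0x80b and the banked 0x815-0xbff range need the 8-M Security
  // Extension, and 0x88a/0x88b plus 0x891-0x893 also need v8-M Mainline.
  static bool isAllowedMSRMask(unsigned FlagsVal, bool Has8MSecExt,
                               bool HasV8MMainline) {
    if (!Has8MSecExt && (FlagsVal == 0x80a || FlagsVal == 0x80b ||
                         (FlagsVal > 0x814 && FlagsVal < 0xc00)))
      return false;
    if (!HasV8MMainline && (FlagsVal == 0x88a || FlagsVal == 0x88b ||
                            (FlagsVal > 0x890 && FlagsVal <= 0x893)))
      return false;
    return true;
  }

  int main() {
    std::printf("%d\n", isAllowedMSRMask(0x80a, false, false)); // 0: msplim needs SecExt
    std::printf("%d\n", isAllowedMSRMask(0x888, true, false));  // 1: msp_ns OK with SecExt
    std::printf("%d\n", isAllowedMSRMask(0x88a, true, false));  // 0: msplim_ns needs Mainline
  }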
@@ -4673,14 +4812,14 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
// classify tB as either t2B or t1B based on range of immediate operand
case ARM::tB: {
ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
- if (!op.isSignedOffset<11, 1>() && isThumbTwo())
+ if (!op.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
Inst.setOpcode(ARM::t2B);
break;
}
// classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
case ARM::tBcc: {
ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
- if (!op.isSignedOffset<8, 1>() && isThumbTwo())
+ if (!op.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
Inst.setOpcode(ARM::t2Bcc);
break;
}
@@ -4973,7 +5112,8 @@ ARMAsmParser::parseFPImm(OperandVector &Operands) {
// vmov.i{8|16|32|64} <dreg|qreg>, #imm
ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
bool isVmovf = TyOp.isToken() &&
- (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64");
+ (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
+ TyOp.getToken() == ".f16");
ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
Mnemonic.getToken() == "fconsts");
@@ -5144,16 +5284,12 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
S = Parser.getTok().getLoc();
if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
return Error(S, "unexpected token in operand");
-
Parser.Lex(); // Eat '='
const MCExpr *SubExprVal;
if (getParser().parseExpression(SubExprVal))
return true;
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-
- const MCExpr *CPLoc =
- getTargetStreamer().addConstantPoolEntry(SubExprVal, S);
- Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E));
+ Operands.push_back(ARMOperand::CreateConstantPoolImm(SubExprVal, S, E));
return false;
}
}
@@ -5265,7 +5401,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "vcvta" || Mnemonic == "vcvtn" || Mnemonic == "vcvtp" ||
Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
- Mnemonic.startswith("vsel"))
+ Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns")
return Mnemonic;
// First, split out any predication code. Ignore mnemonics we know aren't
@@ -5311,6 +5448,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
@@ -5369,7 +5507,8 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" ||
Mnemonic.startswith("aes") || Mnemonic == "hvc" || Mnemonic == "setpan" ||
Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
- (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
+ (FullInst.startswith("vmull") && FullInst.endswith(".p64")) ||
+ Mnemonic == "vmovx" || Mnemonic == "vins") {
// These mnemonics are never predicable
CanAcceptPredicationCode = false;
} else if (!isThumb()) {
@@ -6405,6 +6544,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"immediate expression for mov requires :lower16: or :upper16");
break;
}
+ case ARM::HINT:
+ case ARM::t2HINT: {
+ if (hasRAS()) {
+ // ESB is not predicable (pred must be AL)
+ unsigned Imm8 = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+ if (Imm8 == 0x10 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+ "predicable, but condition "
+ "code specified");
+ }
+ // Without the RAS extension, this behaves as any other unallocated hint.
+ break;
+ }
}
return false;
@@ -6766,6 +6919,90 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
case ARM::t2LDRSHpcrel:
Inst.setOpcode(ARM::t2LDRSHpci);
return true;
+ case ARM::LDRConstPool:
+ case ARM::tLDRConstPool:
+ case ARM::t2LDRConstPool: {
+ // The pseudo instruction ldr rt, =immediate is converted to a
+ // MOV rt, immediate if the immediate is known and representable;
+ // otherwise we create a constant pool entry that we load from.
+ MCInst TmpInst;
+ if (Inst.getOpcode() == ARM::LDRConstPool)
+ TmpInst.setOpcode(ARM::LDRi12);
+ else if (Inst.getOpcode() == ARM::tLDRConstPool)
+ TmpInst.setOpcode(ARM::tLDRpci);
+ else if (Inst.getOpcode() == ARM::t2LDRConstPool)
+ TmpInst.setOpcode(ARM::t2LDRpci);
+ const ARMOperand &PoolOperand =
+ static_cast<ARMOperand &>(*Operands[3]);
+ const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
+ // If SubExprVal is a constant we may be able to use a MOV
+ if (isa<MCConstantExpr>(SubExprVal) &&
+ Inst.getOperand(0).getReg() != ARM::PC &&
+ Inst.getOperand(0).getReg() != ARM::SP) {
+ int64_t Value =
+ (int64_t) (cast<MCConstantExpr>(SubExprVal))->getValue();
+ bool UseMov = true;
+ bool MovHasS = true;
+ if (Inst.getOpcode() == ARM::LDRConstPool) {
+ // ARM Constant
+ if (ARM_AM::getSOImmVal(Value) != -1) {
+ Value = ARM_AM::getSOImmVal(Value);
+ TmpInst.setOpcode(ARM::MOVi);
+ }
+ else if (ARM_AM::getSOImmVal(~Value) != -1) {
+ Value = ARM_AM::getSOImmVal(~Value);
+ TmpInst.setOpcode(ARM::MVNi);
+ }
+ else if (hasV6T2Ops() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ else {
+ // Thumb/Thumb2 Constant
+ if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(Value) != -1)
+ TmpInst.setOpcode(ARM::t2MOVi);
+ else if (hasThumb2() &&
+ ARM_AM::getT2SOImmVal(~Value) != -1) {
+ TmpInst.setOpcode(ARM::t2MVNi);
+ Value = ~Value;
+ }
+ else if (hasV8MBaseline() &&
+ Value >=0 && Value < 65536) {
+ TmpInst.setOpcode(ARM::t2MOVi16);
+ MovHasS = false;
+ }
+ else
+ UseMov = false;
+ }
+ if (UseMov) {
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createImm(Value)); // Immediate
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ if (MovHasS)
+ TmpInst.addOperand(MCOperand::createReg(0)); // S
+ Inst = TmpInst;
+ return true;
+ }
+ }
+ // No opportunity to use MOV/MVN; create a constant pool entry.
+ const MCExpr *CPLoc =
+ getTargetStreamer().addConstantPoolEntry(SubExprVal,
+ PoolOperand.getStartLoc());
+ TmpInst.addOperand(Inst.getOperand(0)); // Rt
+ TmpInst.addOperand(MCOperand::createExpr(CPLoc)); // offset to constpool
+ if (TmpInst.getOpcode() == ARM::LDRi12)
+ TmpInst.addOperand(MCOperand::createImm(0)); // unused offset
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ Inst = TmpInst;
+ return true;
+ }
// Handle NEON VST complex aliases.
case ARM::VST1LNdWB_register_Asm_8:
case ARM::VST1LNdWB_register_Asm_16:
@@ -9031,6 +9268,31 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
return false;
}
+// After changing arch/CPU, try to put the ARM/Thumb mode back to what it was
+// before, if supported by the new target, or emit mapping symbols for the mode
+// switch.
+void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
+ if (WasThumb != isThumb()) {
+ if (WasThumb && hasThumb()) {
+ // Stay in Thumb mode
+ SwitchMode();
+ } else if (!WasThumb && hasARM()) {
+ // Stay in ARM mode
+ SwitchMode();
+ } else {
+ // Mode switch forced, because the new arch doesn't support the old mode.
+ getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+ : MCAF_Code32);
+ // Warn about the implicit mode switch. GAS does not switch modes here,
+ // but instead stays in the old mode, reporting an error on any following
+ // instructions as the mode does not exist on the target.
+ Warning(Loc, Twine("new target does not support ") +
+ (WasThumb ? "thumb" : "arm") + " mode, switching to " +
+ (!WasThumb ? "thumb" : "arm") + " mode");
+ }
+ }
+}
+
/// parseDirectiveArch
/// ::= .arch token
bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
@@ -9043,10 +9305,12 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+ bool WasThumb = isThumb();
Triple T;
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ FixModeAfterArchChange(WasThumb, L);
getTargetStreamer().emitArch(ID);
return false;
@@ -9177,9 +9441,11 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
return false;
}
+ bool WasThumb = isThumb();
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ FixModeAfterArchChange(WasThumb, L);
return false;
}
@@ -9834,7 +10100,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
StringRef Arch = Parser.getTok().getString();
SMLoc ArchLoc = Parser.getTok().getLoc();
- getLexer().Lex();
+ Lex();
unsigned ID = ARM::parseArch(Arch);
@@ -9863,7 +10129,9 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
return true;
// '.align' is target specifically handled to mean 2**2 byte alignment.
- if (getStreamer().getCurrentSection().first->UseCodeAlign())
+ const MCSection *Section = getStreamer().getCurrentSection().first;
+ assert(Section && "must have section to emit alignment");
+ if (Section->UseCodeAlign())
getStreamer().EmitCodeAlignment(4, 0);
else
getStreamer().EmitValueToAlignment(4, 0, 1, 0);
@@ -9933,6 +10201,7 @@ static const struct {
// FIXME: Only available in A-class, isel not predicated
{ ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
{ ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
+ { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} },
// FIXME: Unsupported extensions.
{ ARM::AEK_OS, Feature_None, {} },
{ ARM::AEK_IWMMXT, Feature_None, {} },
@@ -9954,7 +10223,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
StringRef Name = Parser.getTok().getString();
SMLoc ExtLoc = Parser.getTok().getLoc();
- getLexer().Lex();
+ Lex();
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile
deleted file mode 100644
index 841516fffbd5..000000000000
--- a/lib/Target/ARM/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index e63defed2288..3196a57ccc3e 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
@@ -210,6 +210,8 @@ static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
@@ -222,6 +224,8 @@ static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
@@ -391,8 +395,8 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -590,6 +594,8 @@ MCDisassembler::DecodeStatus
ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
MCDisassembler::DecodeStatus S = Success;
+ const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
// A few instructions actually have predicates encoded in them. Don't
// try to overwrite it if we're seeing one of those.
switch (MI.getOpcode()) {
@@ -610,6 +616,10 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
else
return Success;
break;
+ case ARM::t2HINT:
+ if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
+ S = SoftFail;
+ break;
case ARM::tB:
case ARM::t2B:
case ARM::t2TBB:
@@ -1941,6 +1951,29 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
return S;
}
+// Check for UNPREDICTABLE predicated ESB instruction
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned imm8 = fieldFromInstruction(Insn, 0, 8);
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+
+ DecodeStatus S = MCDisassembler::Success;
+
+ Inst.addOperand(MCOperand::createImm(imm8));
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // ESB is unpredictable if pred != AL. Without the RAS extension, it is a NOP,
+ // so all predicates should be allowed.
+ if (imm8 == 0x10 && pred != 0xe && ((FeatureBits[ARM::FeatureRAS]) != 0))
+ S = MCDisassembler::SoftFail;
+
+ return S;
+}
+
static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned imod = fieldFromInstruction(Insn, 18, 2);
@@ -2183,6 +2216,7 @@ static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
DecodeStatus S = MCDisassembler::Success;
unsigned Rn = fieldFromInstruction(Val, 9, 4);
+ // U == 1 to add imm, 0 to subtract it.
unsigned U = fieldFromInstruction(Val, 8, 1);
unsigned imm = fieldFromInstruction(Val, 0, 8);
@@ -2197,6 +2231,26 @@ static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
return S;
}
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 9, 4);
+ // U == 1 to add imm, 0 to subtract it.
+ unsigned U = fieldFromInstruction(Val, 8, 1);
+ unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (U)
+ Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::add, imm)));
+ else
+ Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::sub, imm)));
+
+ return S;
+}
+
static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
@@ -4096,6 +4150,24 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
// Values basepri, basepri_max and faultmask are only valid for v7m.
return MCDisassembler::Fail;
break;
+ case 0x8a: // msplim_ns
+ case 0x8b: // psplim_ns
+ case 0x91: // basepri_ns
+ case 0x92: // basepri_max_ns
+ case 0x93: // faultmask_ns
+ if (!(FeatureBits[ARM::HasV8MMainlineOps]))
+ return MCDisassembler::Fail;
+ // fall through
+ case 10: // msplim
+ case 11: // psplim
+ case 0x88: // msp_ns
+ case 0x89: // psp_ns
+ case 0x90: // primask_ns
+ case 0x94: // control_ns
+ case 0x98: // sp_ns
+ if (!(FeatureBits[ARM::Feature8MSecExt]))
+ return MCDisassembler::Fail;
+ break;
default:
return MCDisassembler::Fail;
}
@@ -5193,8 +5265,8 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
return S;
}
-static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5210,12 +5282,30 @@ static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
if (Rt == Rt2)
S = MCDisassembler::SoftFail;
+ // We have to check whether the instruction is MRRC2 or MCRR2 when
+ // constructing the operands for Inst. MRRC2 stores to two registers, so
+ // its TableGen description has two outputs, whereas MCRR2 doesn't store
+ // to any registers, so all of its operands are listed as inputs. The
+ // operand order for MRRC2 therefore needs to be
+ // [Rt, Rt2, cop, opc1, CRm], while the MCRR2 operand order is
+ // [cop, opc1, Rt, Rt2, CRm].
+
+ if (Inst.getOpcode() == ARM::MRRC2) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
Inst.addOperand(MCOperand::createImm(cop));
Inst.addOperand(MCOperand::createImm(opc1));
- if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
- return MCDisassembler::Fail;
+ if (Inst.getOpcode() == ARM::MCRR2) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
Inst.addOperand(MCOperand::createImm(CRm));
return S;
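DecodeHINTInstruction and AddThumbPredicate above apply the same rule: hint #16 (ESB) decodes as SoftFail when it carries a predicate other than AL and the RAS extension is present; without RAS it is just another unallocated hint and any predicate is accepted. A small standalone sketch of that rule follows; classifyHint is a hypothetical helper for illustration, not part of the patch.

  #include <cstdio>

  enum DecodeStatus { Fail, SoftFail, Success };

  // Mirrors the check in DecodeHINTInstruction: imm8 == 0x10 is ESB, the
  // predicate value 0xe is AL, and the rule only fires when RAS is available.
  static DecodeStatus classifyHint(unsigned Imm8, unsigned Pred, bool HasRAS) {
    if (Imm8 == 0x10 && Pred != 0xe && HasRAS)
      return SoftFail; // UNPREDICTABLE: predicated ESB
    return Success;    // plain hint, unpredicated ESB, or no RAS extension
  }

  int main() {
    std::printf("%d\n", classifyHint(0x10, 0x0, true));  // 1: SoftFail
    std::printf("%d\n", classifyHint(0x10, 0xe, true));  // 2: Success
    std::printf("%d\n", classifyHint(0x10, 0x0, false)); // 2: Success
  }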
diff --git a/lib/Target/ARM/Disassembler/Makefile b/lib/Target/ARM/Disassembler/Makefile
deleted file mode 100644
index 031b6aca5a48..000000000000
--- a/lib/Target/ARM/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/Disassembler/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDisassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 33fc85af9b19..e81bb77dbdfc 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -25,6 +25,7 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
+#define PRINT_ALIAS_INSTR
#include "ARMGenAsmWriter.inc"
/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
@@ -73,43 +74,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
switch (Opcode) {
- // Check for HINT instructions w/ canonical names.
- case ARM::HINT:
- case ARM::tHINT:
- case ARM::t2HINT:
- switch (MI->getOperand(0).getImm()) {
- case 0:
- O << "\tnop";
- break;
- case 1:
- O << "\tyield";
- break;
- case 2:
- O << "\twfe";
- break;
- case 3:
- O << "\twfi";
- break;
- case 4:
- O << "\tsev";
- break;
- case 5:
- if (STI.getFeatureBits()[ARM::HasV8Ops]) {
- O << "\tsevl";
- break;
- } // Fallthrough for non-v8
- default:
- // Anything else should just print normally.
- printInstruction(MI, STI, O);
- printAnnotation(O, Annot);
- return;
- }
- printPredicateOperand(MI, 1, STI, O);
- if (Opcode == ARM::t2HINT)
- O << ".w";
- printAnnotation(O, Annot);
- return;
-
// Check for MOVs and print canonical forms, instead.
case ARM::MOVsr: {
// FIXME: Thumb variants?
@@ -297,23 +261,11 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
break;
}
- // B9.3.3 ERET (Thumb)
- // For a target that has Virtualization Extensions, ERET is the preferred
- // disassembly of SUBS PC, LR, #0
- case ARM::t2SUBS_PC_LR: {
- if (MI->getNumOperands() == 3 && MI->getOperand(0).isImm() &&
- MI->getOperand(0).getImm() == 0 &&
- STI.getFeatureBits()[ARM::FeatureVirtualization]) {
- O << "\teret";
- printPredicateOperand(MI, 1, STI, O);
- printAnnotation(O, Annot);
- return;
- }
- break;
- }
}
- printInstruction(MI, STI, O);
+ if (!printAliasInstr(MI, STI, O))
+ printInstruction(MI, STI, O);
+
printAnnotation(O, Annot);
}
@@ -645,6 +597,34 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
O << "]" << markup(">");
}
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+ if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+ printOperand(MI, OpNum, STI, O);
+ return;
+ }
+
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+
+ unsigned ImmOffs = ARM_AM::getAM5FP16Offset(MO2.getImm());
+ unsigned Op = ARM_AM::getAM5FP16Op(MO2.getImm());
+ if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+ O << ", "
+ << markup("<imm:")
+ << "#"
+ << ARM_AM::getAddrOpcStr(ARM_AM::getAM5FP16Op(MO2.getImm()))
+ << ImmOffs * 2
+ << markup(">");
+ }
+ O << "]" << markup(">");
+}
+
void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -901,6 +881,42 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
case 20:
O << "control";
return;
+ case 10:
+ O << "msplim";
+ return;
+ case 11:
+ O << "psplim";
+ return;
+ case 0x88:
+ O << "msp_ns";
+ return;
+ case 0x89:
+ O << "psp_ns";
+ return;
+ case 0x8a:
+ O << "msplim_ns";
+ return;
+ case 0x8b:
+ O << "psplim_ns";
+ return;
+ case 0x90:
+ O << "primask_ns";
+ return;
+ case 0x91:
+ O << "basepri_ns";
+ return;
+ case 0x92:
+ O << "basepri_max_ns";
+ return;
+ case 0x93:
+ O << "faultmask_ns";
+ return;
+ case 0x94:
+ O << "control_ns";
+ return;
+ case 0x98:
+ O << "sp_ns";
+ return;
}
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 52f7115f0558..9d80eed84dc2 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -30,6 +30,12 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -71,6 +77,9 @@ public:
template <bool AlwaysPrintImm0>
void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <bool AlwaysPrintImm0>
+ void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/InstPrinter/Makefile b/lib/Target/ARM/InstPrinter/Makefile
deleted file mode 100644
index 65d372e44b88..000000000000
--- a/lib/Target/ARM/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index b03cada9a641..3959eab966a8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -486,7 +486,7 @@ namespace ARM_AM {
// addrmode5 := reg +/- imm8*4
//
// The first operand is always a Reg. The second operand encodes the
- // operation in bit 8 and the immediate in bits 0-7.
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
/// getAM5Opc - This function encodes the addrmode5 opc field.
static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
@@ -501,6 +501,29 @@ namespace ARM_AM {
}
//===--------------------------------------------------------------------===//
+ // Addressing Mode #5 FP16
+ //===--------------------------------------------------------------------===//
+ //
+ // This is used for coprocessor instructions, such as 16-bit FP load/stores.
+ //
+ // addrmode5fp16 := reg +/- imm8*2
+ //
+ // The first operand is always a Reg. The second operand encodes the
+ // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+ /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
+ static inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
+ bool isSub = Opc == sub;
+ return ((int)isSub << 8) | Offset;
+ }
+ static inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
+ return AM5Opc & 0xFF;
+ }
+ static inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
+ return ((AM5Opc >> 8) & 1) ? sub : add;
+ }
+
+ //===--------------------------------------------------------------------===//
// Addressing Mode #6
//===--------------------------------------------------------------------===//
//
@@ -650,6 +673,32 @@ namespace ARM_AM {
return FPUnion.F;
}
+ /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1.
+ static inline int getFP16Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
+ int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x3f)
+ return -1;
+ Mantissa >>= 6;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP16Imm(const APFloat &FPImm) {
+ return getFP16Imm(FPImm.bitcastToAPInt());
+ }
+
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
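The new addrmode5fp16 helpers pack the add/sub choice into bit 8 and the halfword-scaled immediate into bits 0-7. The round-trip below re-implements that layout locally as a standalone sketch of the encoding the helpers above describe; it is an illustration, not the LLVM API.

  #include <cassert>
  #include <cstdio>

  enum AddrOpc { add, sub };

  // Same bit layout the helpers above describe: bit 8 is 1 for subtract and
  // bits 0-7 hold imm8, i.e. the byte offset divided by 2.
  static unsigned encodeAM5FP16(AddrOpc Opc, unsigned char Imm8) {
    return ((Opc == sub ? 1u : 0u) << 8) | Imm8;
  }
  static unsigned char am5FP16Offset(unsigned Enc) { return Enc & 0xFF; }
  static AddrOpc am5FP16Op(unsigned Enc) { return ((Enc >> 8) & 1) ? sub : add; }

  int main() {
    // A byte offset of -20 is stored as a subtract with imm8 = 20 / 2 = 10.
    unsigned Enc = encodeAM5FP16(sub, 10);
    assert(am5FP16Op(Enc) == sub && am5FP16Offset(Enc) == 10);
    std::printf("encoding 0x%x -> %s%u bytes\n", Enc,
                am5FP16Op(Enc) == sub ? "-" : "+", am5FP16Offset(Enc) * 2u);
  }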
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index fa52c9354c17..0fc758201d47 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -46,6 +46,7 @@ public:
: MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
/*HasRelocationAddend*/ false) {}
};
+} // end anonymous namespace
const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
@@ -62,6 +63,10 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 0, 8,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
@@ -78,7 +83,9 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_cp", 0, 8,
MCFixupKindInfo::FKF_IsPCRel |
@@ -90,6 +97,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_movw_lo16", 0, 20, 0},
{"fixup_t2_movt_hi16", 0, 20, 0},
{"fixup_t2_movw_lo16", 0, 20, 0},
+ {"fixup_arm_mod_imm", 0, 12, 0},
};
const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -105,6 +113,10 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_t2_pcrel_9", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 8, 8,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
@@ -121,7 +133,9 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_thumb_blx", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_thumb_cp", 8, 8,
MCFixupKindInfo::FKF_IsPCRel |
@@ -133,6 +147,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_arm_movw_lo16", 12, 20, 0},
{"fixup_t2_movt_hi16", 12, 20, 0},
{"fixup_t2_movw_lo16", 12, 20, 0},
+ {"fixup_arm_mod_imm", 20, 12, 0},
};
if (Kind < FirstTargetFixupKind)
@@ -155,10 +170,10 @@ void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
break;
}
}
-} // end anonymous namespace
unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2];
+ bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps];
switch (Op) {
default:
@@ -170,7 +185,7 @@ unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
case ARM::tADR:
return HasThumb2 ? (unsigned)ARM::t2ADR : Op;
case ARM::tB:
- return HasThumb2 ? (unsigned)ARM::t2B : Op;
+ return HasV8MBaselineOps ? (unsigned)ARM::t2B : Op;
case ARM::tCBZ:
return ARM::tHINT;
case ARM::tCBNZ:
@@ -243,7 +258,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
return reasonForFixupRelaxation(Fixup, Value);
}
-void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
@@ -449,7 +466,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Offset by 8 just as above.
if (const MCSymbolRefExpr *SRE =
dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
return 0;
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
@@ -524,10 +541,15 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
//
// Note that the halfwords are stored high first, low second; so we need
// to transpose the fixup value here to map properly.
- uint32_t offset = (Value - 2) >> 2;
+ if (Ctx && Value % 4 != 0) {
+ Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+ return 0;
+ }
+
+ uint32_t offset = (Value - 4) >> 2;
if (const MCSymbolRefExpr *SRE =
dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
- if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
offset = 0;
uint32_t signBit = (offset & 0x400000) >> 22;
uint32_t I1Bit = (offset & 0x200000) >> 21;
@@ -563,7 +585,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
@@ -624,6 +647,44 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
+ case ARM::fixup_arm_pcrel_9:
+ Value = Value - 4; // ARM fixups offset by an additional word and don't
+ // need to adjust for the half-word ordering.
+ // Fall through.
+ case ARM::fixup_t2_pcrel_9: {
+ // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+ Value = Value - 4;
+ bool isAdd = true;
+ if ((int64_t)Value < 0) {
+ Value = -Value;
+ isAdd = false;
+ }
+ // These values don't encode the low bit since it's always zero.
+ if (Ctx && (Value & 1)) {
+ Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+ return 0;
+ }
+ Value >>= 1;
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
+ Value |= isAdd << 23;
+
+ // Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
+ // swapped.
+ if (Kind == ARM::fixup_t2_pcrel_9)
+ return swapHalfWords(Value, IsLittleEndian);
+
+ return Value;
+ }
+ case ARM::fixup_arm_mod_imm:
+ Value = ARM_AM::getSOImmVal(Value);
+ if (Ctx && Value >> 12) {
+ Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ return 0;
+ }
+ return Value;
}
}
@@ -690,11 +751,13 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case FK_Data_2:
case ARM::fixup_arm_thumb_br:
case ARM::fixup_arm_thumb_cb:
+ case ARM::fixup_arm_mod_imm:
return 2;
case ARM::fixup_arm_pcrel_10_unscaled:
case ARM::fixup_arm_ldst_pcrel_12:
case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_pcrel_9:
case ARM::fixup_arm_adr_pcrel_12:
case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_condbl:
@@ -708,6 +771,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case ARM::fixup_t2_condbranch:
case ARM::fixup_t2_uncondbranch:
case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_pcrel_9:
case ARM::fixup_t2_adr_pcrel_12:
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
@@ -766,6 +830,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
+ case ARM::fixup_arm_mod_imm:
// Instruction size is 4 bytes.
return 4;
}
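For the new fixup_arm_pcrel_9 and fixup_t2_pcrel_9 kinds, adjustFixupValue subtracts the pipeline offset (an extra word for the ARM variant), folds the sign into the add/sub bit at position 23, drops the always-zero low bit, and range-checks the remaining 8 bits. The sketch below reproduces just that value computation as a standalone function; encodePCRel9 is hypothetical, and the Thumb2 halfword swap and the Ctx-gated error reporting are omitted.

  #include <cstdint>
  #include <cstdio>
  #include <optional>

  // Value computation shared by fixup_arm_pcrel_9 and fixup_t2_pcrel_9:
  // offset by 4 (the ARM variant by an extra word first), record add/sub in
  // bit 23, and encode the offset divided by 2 in the low 8 bits. Returns
  // nullopt for the cases the backend reports as errors.
  static std::optional<uint32_t> encodePCRel9(int64_t Value, bool IsARM) {
    if (IsARM)
      Value -= 4;                      // extra word of pipeline offset
    Value -= 4;
    bool IsAdd = true;
    if (Value < 0) {
      Value = -Value;
      IsAdd = false;
    }
    if (Value & 1)
      return std::nullopt;             // low bit must be zero
    Value >>= 1;
    if (Value >= 256)
      return std::nullopt;             // only 8 bits of offset available
    return uint32_t(Value) | (uint32_t(IsAdd) << 23);
  }

  int main() {
    if (auto Enc = encodePCRel9(12, /*IsARM=*/false)) // imm8 = 4, U = 1
      std::printf("0x%08x\n", *Enc);                  // prints 0x00800004
    if (!encodePCRel9(3, /*IsARM=*/false))            // odd offset
      std::printf("rejected\n");
  }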
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 28a62132a419..84caaacc47d3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -11,12 +11,12 @@
#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-namespace {
+namespace llvm {
class ARMAsmBackend : public MCAsmBackend {
const MCSubtargetInfo *STI;
@@ -63,7 +63,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
@@ -74,6 +75,6 @@ public:
void setIsThumb(bool it) { isThumbMode = it; }
bool isLittle() const { return IsLittleEndian; }
};
-} // end anonymous namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 995dd0fe08ee..09dc0173ade6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -10,11 +10,10 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+#include "ARMAsmBackend.h"
#include "llvm/Support/MachO.h"
-using namespace llvm;
-
-namespace {
+namespace llvm {
class ARMAsmBackendDarwin : public ARMAsmBackend {
const MCRegisterInfo &MRI;
public:
@@ -22,7 +21,6 @@ public:
ARMAsmBackendDarwin(const Target &T, const Triple &TT,
const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
: ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
- HasDataInCodeSupport = true;
}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
@@ -33,6 +31,6 @@ public:
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override;
};
-}
+} // end namespace llvm
#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 68b12edd089e..748f915be17b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -10,7 +10,10 @@
#ifndef LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
#define LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
using namespace llvm;
+
namespace {
class ARMAsmBackendELF : public ARMAsmBackend {
public:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 170f59a4c905..2a375be49a83 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+#include "ARMAsmBackend.h"
using namespace llvm;
namespace {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 4289a73e9d6b..088b4205ed62 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -289,18 +289,20 @@ namespace ARMII {
/// higher 16 bit of the address. Used only via movt instruction.
MO_HI16 = 0x2,
- /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
- /// call operand.
- MO_PLT = 0x3,
-
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
- MO_OPTION_MASK = 0x3f,
+ MO_OPTION_MASK = 0x1f,
/// MO_DLLIMPORT - On a symbol operand, this represents that the reference
/// to the symbol is for an import stub. This is used for DLL import
/// storage class indication on Windows.
- MO_DLLIMPORT = 0x40,
+ MO_DLLIMPORT = 0x20,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL = 0x40,
/// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
/// represents a symbol which, if indirect, will get special Darwin mangling
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 52eba8be288f..4118fe8e8cdb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -34,8 +34,8 @@ namespace {
~ARMELFObjectWriter() override;
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
@@ -67,7 +67,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
// Need to examine the Fixup when determining whether to
// emit the relocation as an explicit symbol or as a section relative
// offset
-unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
return GetRelocTypeInner(Target, Fixup, IsPCRel);
@@ -98,6 +98,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_GOT_PREL:
Type = ELF::R_ARM_GOT_PREL;
break;
+ case MCSymbolRefExpr::VK_ARM_PREL31:
+ Type = ELF::R_ARM_PREL31;
+ break;
}
break;
case ARM::fixup_arm_blx:
@@ -106,7 +109,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_PLT:
Type = ELF::R_ARM_CALL;
break;
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_TLS_CALL;
break;
default:
@@ -120,6 +123,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
Type = ELF::R_ARM_JUMP24;
break;
case ARM::fixup_t2_condbranch:
+ Type = ELF::R_ARM_THM_JUMP19;
+ break;
case ARM::fixup_t2_uncondbranch:
Type = ELF::R_ARM_THM_JUMP24;
break;
@@ -138,7 +143,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
switch (Modifier) {
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_THM_TLS_CALL;
break;
default:
@@ -210,10 +215,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_TLSLDO:
Type = ELF::R_ARM_TLS_LDO32;
break;
- case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ case MCSymbolRefExpr::VK_TLSCALL:
Type = ELF::R_ARM_TLS_CALL;
break;
- case MCSymbolRefExpr::VK_ARM_TLSDESC:
+ case MCSymbolRefExpr::VK_TLSDESC:
Type = ELF::R_ARM_TLS_GOTDESC;
break;
case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 57577dc834b7..36cb74765f3b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -763,6 +763,12 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
break;
+ case ARM::AK_ARMV8MBaseline:
+ case ARM::AK_ARMV8MMainline:
+ setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
+ setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ break;
+
case ARM::AK_IWMMXT:
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, Allowed, false);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 46ba57170db5..3fe2302bdd37 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -33,6 +33,13 @@ enum Fixups {
// fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
// the short-swapped encoding of Thumb2 instructions.
fixup_t2_pcrel_10,
+ // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
+ // used in VFP instructions where bit 0 is not encoded (so it's encoded as
+ // an 8-bit immediate).
+ fixup_arm_pcrel_9,
+ // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
+ // the short-swapped encoding of Thumb2 instructions.
+ fixup_t2_pcrel_9,
// fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
// addresses where the lower 2 bits are not encoded (so it's encoded as an
// 8-bit immediate).
@@ -100,6 +107,9 @@ enum Fixups {
fixup_t2_movt_hi16, // :upper16:
fixup_t2_movw_lo16, // :lower16:
+ // fixup_arm_mod_imm - Fixup for a modified-immediate (mod_imm) operand.
+ fixup_arm_mod_imm,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index bda37f6616a8..53cd29a6061e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -13,7 +13,6 @@
#include "ARMMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -33,7 +32,7 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
SupportsDebugInformation = true;
// Exceptions handling
- ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS()
+ ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
? ExceptionHandling::SjLj
: ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index b88578309f08..9fca13eeea93 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -120,11 +120,11 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+ /// getThumbBranchTargetOpValue - Return encoding info for 24-bit
/// immediate Thumb2 direct branch target.
- uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ uint32_t getThumbBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
/// branch target.
@@ -214,11 +214,6 @@ public:
llvm_unreachable("Invalid ShiftOpc!");
}
- /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
- uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-
/// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -255,11 +250,16 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+ /// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ /// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+ uint32_t getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// getCCOutOpValue - Return encoding of the 's' bit.
unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
@@ -312,12 +312,8 @@ public:
// Support for fixups (MCFixup)
if (MO.isExpr()) {
const MCExpr *Expr = MO.getExpr();
- // In instruction code this value always encoded as lowest 12 bits,
- // so we don't have to perform any specific adjustments.
- // Due to requirements of relocatable records we have to use FK_Data_4.
- // See ARMELFObjectWriter::ExplicitRelSym and
- // ARMELFObjectWriter::GetRelocTypeInner for more details.
- MCFixupKind Kind = MCFixupKind(FK_Data_4);
+ // Fixups resolve to plain values that need to be encoded.
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_mod_imm);
Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
return 0;
}
@@ -345,9 +341,6 @@ public:
unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
/// getSORegOpValue - Return an encoded so_reg shifted register value.
unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
@@ -757,10 +750,9 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
/// immediate branch target.
-uint32_t ARMMCCodeEmitter::
-getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+uint32_t ARMMCCodeEmitter::getThumbBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Val = 0;
const MCOperand MO = MI.getOperand(OpIdx);
@@ -1049,12 +1041,12 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
switch (ARM16Expr->getKind()) {
default: llvm_unreachable("Unsupported ARMFixup");
case ARMMCExpr::VK_ARM_HI16:
- Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16
- : ARM::fixup_arm_movt_hi16);
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
+ : ARM::fixup_arm_movt_hi16);
break;
case ARMMCExpr::VK_ARM_LO16:
- Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
- : ARM::fixup_arm_movw_lo16);
+ Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
break;
}
@@ -1105,21 +1097,6 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
}
uint32_t ARMMCCodeEmitter::
-getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- // {17-14} Rn
- // {13} 1 == imm12, 0 == Rm
- // {12} isAdd
- // {11-0} imm12/Rm
- const MCOperand &MO = MI.getOperand(OpIdx);
- unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
- uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups, STI);
- Binary |= Rn << 14;
- return Binary;
-}
-
-uint32_t ARMMCCodeEmitter::
getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -1252,7 +1229,7 @@ getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
return (MO.getImm() >> 2);
}
-/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
uint32_t ARMMCCodeEmitter::
getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -1292,6 +1269,46 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
+/// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {12-9} = reg
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ unsigned Reg, Imm8;
+ bool isAdd;
+ // If the first operand isn't a register, we have a label reference.
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC.
+ Imm8 = 0;
+ isAdd = false; // 'U' bit is handled as part of the fixup.
+
+ assert(MO.isExpr() && "Unexpected machine operand type!");
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind;
+ if (isThumb2(STI))
+ Kind = MCFixupKind(ARM::fixup_t2_pcrel_9);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_pcrel_9);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumCPRelocations;
+ } else {
+ EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+ isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+ }
+
+ uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ Binary |= (Reg << 9);
+ return Binary;
+}
+
unsigned ARMMCCodeEmitter::
getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
@@ -1446,23 +1463,6 @@ getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
}
unsigned ARMMCCodeEmitter::
-getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO1 = MI.getOperand(OpNum);
-
- // FIXME: Needs fixup support.
- unsigned Value = 0;
- int32_t tmp = (int32_t)MO1.getImm();
- if (tmp < 0)
- tmp = abs(tmp);
- else
- Value |= 4096; // Set the ADD bit
- Value |= tmp & 4095;
- return Value;
-}
-
-unsigned ARMMCCodeEmitter::
getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
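As a side note on the addressing-mode comments above: getAddrMode5FP16OpValue packs a base register, a U (add/subtract) bit, and a halfword-scaled 8-bit immediate into one operand value. The following is a minimal standalone sketch of that packing with illustrative names (packAddrMode5FP16 is not an LLVM API), assuming the byte offset is already in range:

    #include <cassert>
    #include <cstdint>

    // Sketch only: {12-9} = base register encoding, {8} = U bit (add == 1),
    // {7-0} = imm8, with the byte offset scaled down by 2 as described in the
    // getAddrMode5FP16OpValue comment ('reg +/- (imm8 << 1)').
    static uint32_t packAddrMode5FP16(uint32_t RegEncoding, int ByteOffset) {
      assert((ByteOffset & 1) == 0 && "FP16 offsets are halfword-aligned");
      bool IsAdd = ByteOffset >= 0;
      uint32_t Imm8 = static_cast<uint32_t>(IsAdd ? ByteOffset : -ByteOffset) >> 1;
      assert(Imm8 < 256 && "offset out of range for an 8-bit immediate");
      uint32_t Binary = Imm8;            // {7-0}  always-positive immediate
      if (IsAdd)
        Binary |= 1u << 8;               // {8}    U bit selects add vs. subtract
      Binary |= RegEncoding << 9;        // {12-9} base register
      return Binary;
    }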
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 8c8c249addb5..afb089ab0286 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "ARMMCTargetDesc.h"
#include "InstPrinter/ARMInstPrinter.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -201,18 +200,6 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (RM == Reloc::Default) {
- // Default relocation model on Darwin is PIC, not DynamicNoPIC.
- RM = TT.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
- }
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
MCAsmBackend &MAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool RelaxAll) {
@@ -291,9 +278,6 @@ extern "C" void LLVMInitializeARMTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createARMMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 4468132588cf..482bcf902518 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -12,7 +12,7 @@
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
using namespace llvm;
using namespace object;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index cfd504e533af..cfa6ce7da65e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -389,7 +389,8 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
uint32_t Offset = Target.getConstant();
if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
Offset += 1 << Log2Size;
- if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A))
+ if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+ RelocType != MachO::ARM_RELOC_HALF)
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
Target, RelocType, Log2Size,
FixedValue);
@@ -447,8 +448,10 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
// Even when it's not a scattered relocation, movw/movt always uses
// a PAIR relocation.
if (Type == MachO::ARM_RELOC_HALF) {
- // The other-half value only gets populated for the movt and movw
- // relocation entries.
+ // The entire addend is needed to correctly apply a relocation. One half is
+ // extracted from the instruction itself, the other comes from this
+ // PAIR. I.e. it's correct that we insert the high bits of the addend in the
+ // MOVW case here.
uint32_t Value = 0;
switch ((unsigned)Fixup.getKind()) {
default: break;
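For context on the ARM_RELOC_HALF comment above: a movw/movt pair can only carry 16 bits of the addend each, so the full 32-bit addend is reassembled from the half stored in the instruction and the half carried by the ARM_RELOC_PAIR entry. A minimal sketch of that split and recombination (illustrative helpers, not the MachO writer's interface):

    #include <cstdint>

    // Sketch only: movw materializes the low half of a 32-bit value, movt the
    // high half, so a relocated addend has to travel in two 16-bit pieces.
    static uint16_t movwHalf(uint32_t Addend) { return Addend & 0xffff; }
    static uint16_t movtHalf(uint32_t Addend) { return (Addend >> 16) & 0xffff; }

    // The relocation consumer puts the two halves back together: one half comes
    // from the instruction being relocated, the other from its PAIR entry.
    static uint32_t recombineAddend(uint16_t LoHalf, uint16_t HiHalf) {
      return (static_cast<uint32_t>(HiHalf) << 16) | LoHalf;
    }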
diff --git a/lib/Target/ARM/MCTargetDesc/Makefile b/lib/Target/ARM/MCTargetDesc/Makefile
deleted file mode 100644
index 448ed9df2bff..000000000000
--- a/lib/Target/ARM/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/TargetDesc/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index ed2deeaa24c0..7f2124033982 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -378,12 +378,14 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
}
bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();
- // Only run this for CortexA9.
- if (!STI->isCortexA9())
+ if (!STI->expandMLx())
return false;
isLikeA9 = STI->isLikeA9() || STI->isSwift();
isSwift = STI->isSwift();
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
deleted file mode 100644
index c1601a3f29dd..000000000000
--- a/lib/Target/ARM/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMARMCodeGen
-TARGET = ARM
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = ARMGenRegisterInfo.inc ARMGenInstrInfo.inc \
- ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
- ARMGenDAGISel.inc ARMGenSubtargetInfo.inc \
- ARMGenCallingConv.inc \
- ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
- ARMGenMCPseudoLowering.inc ARMGenDisassemblerTables.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 5acb2d46f3e7..549af00fcc99 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -115,7 +115,7 @@ L6:
.long -858993459
.long 1074318540
-3) struct copies appear to be done field by field
+3) struct copies appear to be done field by field
instead of by words, at least sometimes:
struct foo { int x; short s; char c1; char c2; };
@@ -142,7 +142,7 @@ a good way to measure on ARM).
* Consider this silly example:
-double bar(double x) {
+double bar(double x) {
double r = foo(3.1);
return x+r;
}
@@ -162,7 +162,7 @@ _bar:
fmrrd r0, r1, d0
ldmfd sp!, {r4, r5, r7, pc}
-Ignore the prologue and epilogue stuff for a second. Note
+Ignore the prologue and epilogue stuff for a second. Note
mov r4, r0
mov r5, r1
the copys to callee-save registers and the fact they are only being used by the
@@ -269,7 +269,7 @@ LBB4:
b LBB2
If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
-then eliminate beq and and turn the unconditional branch to LBB2 to a bne.
+then eliminate beq and turn the unconditional branch to LBB2 into a bne.
See McCat/18-imp/ComputeBoundingBoxes for an example.
@@ -391,10 +391,10 @@ void foo(signed char* p) {
}
llvm decides it's a good idea to turn the repeated if...else into a
-binary tree, as if it were a switch; the resulting code requires -1
+binary tree, as if it were a switch; the resulting code requires -1
compare-and-branches when *p<=2 or *p==5, the same number if *p==4
or *p>6, and +1 if *p==3. So it should be a speed win
-(on balance). However, the revised code is larger, with 4 conditional
+(on balance). However, the revised code is larger, with 4 conditional
branches instead of 3.
More seriously, there is a byte->word extend before
@@ -421,8 +421,8 @@ int foo(int a, int b, int c, int d) {
return (int)(acc >> 32);
}
-Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
-two signed 32-bit values to produce a 64-bit value, and accumulates this with
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
+two signed 32-bit values to produce a 64-bit value, and accumulates this with
a 64-bit value.
We currently get this with both v4 and v6:
@@ -513,7 +513,7 @@ Be careful though as the last attempt caused infinite looping on lencod.
//===---------------------------------------------------------------------===//
-Predication issue. This function:
+Predication issue. This function:
extern unsigned array[ 128 ];
int foo( int x ) {
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index df73554372d8..3f88eb818062 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile
deleted file mode 100644
index 6292ab14b346..000000000000
--- a/lib/Target/ARM/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 93e0ac4aa320..c0732e4b750a 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -38,18 +38,17 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
return !MF.getFrameInfo()->hasVarSizedObjects();
}
-static void
-emitSPUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- const ThumbRegisterInfo &MRI,
- int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) {
+static void emitSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const TargetInstrInfo &TII, const DebugLoc &dl,
+ const ThumbRegisterInfo &MRI, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags) {
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
MRI, MIFlags);
}
-void Thumb1FrameLowering::
+MachineBasicBlock::iterator Thumb1FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const Thumb1InstrInfo &TII =
@@ -60,9 +59,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
+ MachineInstr &Old = *I;
+ DebugLoc dl = Old.getDebugLoc();
+ unsigned Amount = Old.getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -71,7 +70,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Amount = (Amount+Align-1)/Align*Align;
// Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
+ unsigned Opc = Old.getOpcode();
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
} else {
@@ -80,7 +79,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
}
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
@@ -151,7 +150,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R9:
case ARM::R10:
case ARM::R11:
- if (STI.isTargetMachO()) {
+ if (STI.splitFramePushPop()) {
GPRCS2Size += 4;
break;
}
@@ -189,7 +188,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
int FramePtrOffsetInBlock = 0;
unsigned adjustedGPRCS1Size = GPRCS1Size;
- if (tryFoldSPUpdateIntoPushPop(STI, MF, std::prev(MBBI), NumBytes)) {
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, &*std::prev(MBBI), NumBytes)) {
FramePtrOffsetInBlock = NumBytes;
adjustedGPRCS1Size += NumBytes;
NumBytes = 0;
@@ -213,7 +212,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetMachO())
+ if (STI.splitFramePushPop())
break;
// fallthough
case ARM::R0:
@@ -304,16 +303,15 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setShouldRestoreSPFromFP(true);
}
-static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
- if (MI->getOpcode() == ARM::tLDRspi &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
+ if (MI.getOpcode() == ARM::tLDRspi && MI.getOperand(1).isFI() &&
+ isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs))
return true;
- else if (MI->getOpcode() == ARM::tPOP) {
+ else if (MI.getOpcode() == ARM::tPOP) {
// The first two operands are predicates. The last two are
// imp-def and imp-use of SP. Check everything in between.
- for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
- if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+ for (int i = 2, e = MI.getNumOperands() - 2; i != e; ++i)
+ if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
return false;
return true;
}
@@ -346,8 +344,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.begin()) {
do
--MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
+ while (MBBI != MBB.begin() && isCSRestore(*MBBI, CSRegs));
+ if (!isCSRestore(*MBBI, CSRegs))
++MBBI;
}
@@ -376,11 +374,11 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FramePtr));
} else {
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
- &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+ &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
- if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes))
+ if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
- } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
+ } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
}
}
@@ -467,7 +465,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// Look for a temporary register to use.
// First, compute the liveness information.
LivePhysRegs UsedRegs(STI.getRegisterInfo());
- UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true);
+ UsedRegs.addLiveOuts(MBB);
// The semantic of pristines changed recently and now,
// the callee-saved registers that are touched in the function
// are not part of the pristines set anymore.
@@ -637,7 +635,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
if (MI != MBB.end())
- MIB.copyImplicitOps(&*MI);
+ MIB.copyImplicitOps(*MI);
MI = MBB.erase(MI);
} else
// LR may only be popped into PC, as part of return sequence.
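One note on the call-frame code touched above: eliminateCallFramePseudoInstr keeps the stack aligned by rounding the outgoing-argument size up to the next multiple of the stack alignment with the usual (Amount + Align - 1) / Align * Align idiom. A tiny standalone sketch of that arithmetic (illustrative name, not the pass's code):

    #include <cassert>

    // Sketch only: rounds Amount up to the next multiple of Align; Align just
    // needs to be non-zero for this form.
    static unsigned roundUpToStackAlignment(unsigned Amount, unsigned Align) {
      assert(Align != 0 && "alignment must be non-zero");
      return (Amount + Align - 1) / Align * Align;
    }
    // e.g. roundUpToStackAlignment(10, 8) == 16 and
    //      roundUpToStackAlignment(16, 8) == 16.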
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index 27faac63683a..9de1ba1d7009 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -41,7 +41,7 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 530e1d33839a..159731d8fc72 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -38,9 +38,9 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
}
void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// Need to check the arch.
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();
@@ -118,11 +118,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-void
-Thumb1InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi, RM);
+void Thumb1InstrInfo::expandLoadStackGuard(
+ MachineBasicBlock::iterator MI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ if (TM.isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi);
else
- expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi, RM);
+ expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi);
}
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index f3f493d89237..931914ad2799 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -38,9 +38,8 @@ public:
///
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -55,8 +54,7 @@ public:
const TargetRegisterInfo *TRI) const override;
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
}
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index bf0498dfda69..0c7055551632 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -36,6 +36,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Thumb IT blocks insertion pass";
}
@@ -165,7 +170,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
++I;
if (I != E) {
unsigned NPredReg = 0;
- ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg);
+ ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
if (NCC == CC || NCC == OCC)
return true;
}
@@ -182,7 +187,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
MachineInstr *MI = &*MBBI;
DebugLoc dl = MI->getDebugLoc();
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC == ARMCC::AL) {
++MBBI;
continue;
@@ -222,7 +227,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
MI = NMI;
unsigned NPredReg = 0;
- ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+ ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
if (NCC == CC || NCC == OCC) {
Mask |= (NCC & 1) << Pos;
// Add implicit use of ITSTATE.
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 4da769f23280..e2e6dafd218a 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -50,7 +50,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock *NewDest) const {
MachineBasicBlock *MBB = Tail->getParent();
ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
- if (!AFI->hasITBlocks()) {
+ if (!AFI->hasITBlocks() || Tail->isBranch()) {
TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
return;
}
@@ -58,7 +58,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
// If the first instruction of Tail is predicated, we may have to update
// the IT instruction.
unsigned PredReg = 0;
- ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg);
+ ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
MachineBasicBlock::iterator MBBI = Tail;
if (CC != ARMCC::AL)
// Expecting at least the t2IT instruction before it.
@@ -106,13 +106,13 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
}
unsigned PredReg = 0;
- return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
+ return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
}
void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// Handle SPR, DPR, and QPR copies.
if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
@@ -148,8 +148,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ }
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
@@ -187,8 +189,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ MRI->constrainRegClass(DestReg,
+ &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ }
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
@@ -204,20 +209,22 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
}
-void
-Thumb2InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const {
- if (RM == Reloc::PIC_)
- expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12, RM);
+void Thumb2InstrInfo::expandLoadStackGuard(
+ MachineBasicBlock::iterator MI) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ if (MF.getTarget().isPositionIndependent())
+ expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
else
- expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12, RM);
+ expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
}
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII,
+ unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
.addReg(BaseReg, RegState::Kill)
@@ -459,7 +466,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset += MI.getOperand(FrameRegIdx+1).getImm();
unsigned PredReg;
- if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+ if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
// Turn it into a move.
MI.setDesc(TII.get(ARM::tMOVr));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
@@ -627,9 +634,9 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return Offset == 0;
}
-ARMCC::CondCodes
-llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
- unsigned Opc = MI->getOpcode();
+ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ unsigned Opc = MI.getOpcode();
if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
return ARMCC::AL;
return getInstrPredicate(MI, PredReg);
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 916ab06ec305..15d63300b6a2 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -39,9 +39,8 @@ public:
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -63,16 +62,13 @@ public:
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
private:
- void expandLoadStackGuard(MachineBasicBlock::iterator MI,
- Reloc::Model RM) const override;
+ void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
-ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
-
-
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
}
#endif
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index bcd0e5751258..c4fdb9b3147d 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -18,11 +18,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/IR/Function.h" // To access Function attributes
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "t2-reduce-size"
@@ -115,12 +116,14 @@ namespace {
{ ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 },
+ { ARM::t2STR_POST,ARM::tSTMIA_UPD,0, 0, 0, 1, 0, 0,0, 0,1,0 },
{ ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
@@ -143,6 +146,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Thumb2 instruction size reduction pass";
}
@@ -208,7 +216,7 @@ namespace {
}
Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
- : MachineFunctionPass(ID), PredicateFtor(Ftor) {
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
OptimizeSize = MinimizeSize = false;
for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
unsigned FromOpc = ReduceTable[i].WideOpc;
@@ -417,6 +425,46 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
HasShift = true;
OpNum = 4;
break;
+ case ARM::t2LDR_POST:
+ case ARM::t2STR_POST: {
+ if (!MBB.getParent()->getFunction()->optForMinSize())
+ return false;
+
+ // We're creating a completely different type of load/store - LDM from LDR.
+ // For this reason we can't reuse the logic at the end of this function; we
+ // have to implement the MI building here.
+ bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
+ unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+ unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+ unsigned Offset = MI->getOperand(3).getImm();
+ unsigned PredImm = MI->getOperand(4).getImm();
+ unsigned PredReg = MI->getOperand(5).getReg();
+ assert(isARMLowRegister(Rt));
+ assert(isARMLowRegister(Rn));
+
+ if (Offset != 4)
+ return false;
+
+ // Add the 16-bit load / store instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
+ .addReg(Rn, RegState::Define)
+ .addReg(Rn)
+ .addImm(PredImm)
+ .addReg(PredReg)
+ .addReg(Rt, IsStore ? 0 : RegState::Define);
+
+ // Transfer memoperands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags());
+
+ // Kill the old instruction.
+ MI->eraseFromBundle();
+ ++NumLdSts;
+ return true;
+ }
case ARM::t2LDMIA: {
unsigned BaseReg = MI->getOperand(0).getReg();
assert(isARMLowRegister(BaseReg));
@@ -597,7 +645,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
case ARM::t2ADDSri:
case ARM::t2ADDSrr: {
unsigned PredReg = 0;
- if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
case ARM::t2ADDSri: {
@@ -670,7 +718,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
if (Reg1 != Reg0)
return false;
// Try to commute the operands to make it a 2-address instruction.
- MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+ MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
if (!CommutedMI)
return false;
}
@@ -678,11 +726,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Try to commute the operands to make it a 2-address instruction.
unsigned CommOpIdx1 = 1;
unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
- if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+ if (!TII->findCommutedOpIndices(*MI, CommOpIdx1, CommOpIdx2) ||
MI->getOperand(CommOpIdx2).getReg() != Reg0)
return false;
MachineInstr *CommutedMI =
- TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2);
+ TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
if (!CommutedMI)
return false;
}
@@ -702,7 +750,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
if (!NewMCID.isPredicable())
@@ -798,7 +846,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
if (!NewMCID.isPredicable())
@@ -983,7 +1031,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
NextMII->bundleWithPred();
}
- if (!NextInSameBundle && MI->isInsideBundle()) {
+ if (BundleMI && !NextInSameBundle && MI->isInsideBundle()) {
// FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
// marker is only on the BUNDLE instruction. Process the BUNDLE
// instruction as we finish with the bundled instruction to work around
@@ -1050,5 +1098,5 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
/// reduction pass.
FunctionPass *llvm::createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor) {
- return new Thumb2SizeReduce(Ftor);
+ return new Thumb2SizeReduce(std::move(Ftor));
}
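One note on the t2LDR_POST/t2STR_POST entries above: the reduction only fires for a post-increment of exactly 4, because a one-register tLDMIA_UPD/tSTMIA_UPD always advances the base register by 4 bytes. A standalone sketch of the guard, with illustrative names (canShrinkPostIncToLdm is not part of the pass):

    // Sketch only: mirrors the conditions the new ReduceLoadStore case checks
    // before rewriting `ldr rT, [rN], #4` as `ldm rN!, {rT}` (and the analogous
    // store form). The register-number check stands in for the r0-r7 test done
    // with isARMLowRegister in the real code.
    static bool isLowRegNum(unsigned RegNum) { return RegNum <= 7; }

    static bool canShrinkPostIncToLdm(bool OptForMinSize, unsigned RtNum,
                                      unsigned RnNum, int PostIncBytes) {
      return OptForMinSize &&            // only profitable when minimizing size
             isLowRegNum(RtNum) && isLowRegNum(RnNum) &&
             PostIncBytes == 4;          // LDM/STM with one register steps by 4
    }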
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index b5f9d7e38f27..6c26c8843865 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -61,7 +61,7 @@ ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF,
static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg,
+ const DebugLoc &dl, unsigned DestReg,
unsigned SubIdx, int Val,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) {
@@ -81,7 +81,7 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg,
+ const DebugLoc &dl, unsigned DestReg,
unsigned SubIdx, int Val,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) {
@@ -101,9 +101,9 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
void ThumbRegisterInfo::emitLoadConstPool(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred,
- unsigned PredReg, unsigned MIFlags) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
@@ -120,57 +120,55 @@ void ThumbRegisterInfo::emitLoadConstPool(
/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
/// in a register using mov / mvn sequences or load the immediate from a
/// constpool entry.
-static
-void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, bool CanChangeCC,
- const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
- unsigned MIFlags = MachineInstr::NoFlags) {
- MachineFunction &MF = *MBB.getParent();
- bool isHigh = !isARMLowRegister(DestReg) ||
- (BaseReg != 0 && !isARMLowRegister(BaseReg));
- bool isSub = false;
- // Subtract doesn't have high register version. Load the negative value
- // if either base or dest register is a high register. Also, if do not
- // issue sub as part of the sequence if condition register is to be
- // preserved.
- if (NumBytes < 0 && !isHigh && CanChangeCC) {
- isSub = true;
- NumBytes = -NumBytes;
- }
- unsigned LdReg = DestReg;
- if (DestReg == ARM::SP)
- assert(BaseReg == ARM::SP && "Unexpected!");
- if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
- LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
-
- if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
- .addImm(NumBytes).setMIFlags(MIFlags);
- } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
- .addImm(NumBytes).setMIFlags(MIFlags);
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
- .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags);
- } else
- MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes,
- ARMCC::AL, 0, MIFlags);
-
- // Emit add / sub.
- int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr
- : ARM::tADDrr);
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
- if (Opc != ARM::tADDhirr)
- MIB = AddDefaultT1CC(MIB);
- if (DestReg == ARM::SP || isSub)
- MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
- else
- MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- AddDefaultPred(MIB);
+static void emitThumbRegPlusImmInReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+ bool CanChangeCC, const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
+ MachineFunction &MF = *MBB.getParent();
+ bool isHigh = !isARMLowRegister(DestReg) ||
+ (BaseReg != 0 && !isARMLowRegister(BaseReg));
+ bool isSub = false;
+ // Subtract doesn't have a high register version. Load the negative value
+ // if either the base or dest register is a high register. Also, do not
+ // issue sub as part of the sequence if the condition register is to be
+ // preserved.
+ if (NumBytes < 0 && !isHigh && CanChangeCC) {
+ isSub = true;
+ NumBytes = -NumBytes;
+ }
+ unsigned LdReg = DestReg;
+ if (DestReg == ARM::SP)
+ assert(BaseReg == ARM::SP && "Unexpected!");
+ if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+
+ if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes)
+ .setMIFlags(MIFlags);
+ } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes)
+ .setMIFlags(MIFlags);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ .addReg(LdReg, RegState::Kill)
+ .setMIFlags(MIFlags);
+ } else
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, ARMCC::AL, 0,
+ MIFlags);
+
+ // Emit add / sub.
+ int Opc = (isSub) ? ARM::tSUBrr
+ : ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr)
+ MIB = AddDefaultT1CC(MIB);
+ if (DestReg == ARM::SP || isSub)
+ MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+ else
+ MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+ AddDefaultPred(MIB);
}
/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
@@ -179,10 +177,10 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
/// be too long. This is allowed to modify the condition flags.
void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const ARMBaseRegisterInfo& MRI,
+ const DebugLoc &dl, unsigned DestReg,
+ unsigned BaseReg, int NumBytes,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo &MRI,
unsigned MIFlags) {
bool isSub = NumBytes < 0;
unsigned Bytes = (unsigned)NumBytes;
@@ -281,7 +279,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned RequiredExtraInstrs;
if (ExtraRange)
- RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange;
+ RequiredExtraInstrs = alignTo(RangeAfterCopy, ExtraRange) / ExtraRange;
else if (RangeAfterCopy > 0)
// We need an extra instruction but none is available
RequiredExtraInstrs = 1000000;
diff --git a/lib/Target/ARM/ThumbRegisterInfo.h b/lib/Target/ARM/ThumbRegisterInfo.h
index 23aaff37f409..e6b06959e428 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/lib/Target/ARM/ThumbRegisterInfo.h
@@ -39,8 +39,9 @@ public:
/// specified immediate.
void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+ const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0,
unsigned MIFlags = MachineInstr::NoFlags) const override;
// rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
diff --git a/lib/Target/AVR/AVR.h b/lib/Target/AVR/AVR.h
index 4c1667ed341c..041c77cfcb9d 100644
--- a/lib/Target/AVR/AVR.h
+++ b/lib/Target/AVR/AVR.h
@@ -30,9 +30,7 @@ FunctionPass *createAVRFrameAnalyzerPass();
FunctionPass *createAVRDynAllocaSRPass();
FunctionPass *createAVRBranchSelectionPass();
-/**
- * Contains the AVR backend.
- */
+/// Contains the AVR backend.
namespace AVR {
enum AddressSpace { DataMemory, ProgramMemory };
diff --git a/lib/Target/AVR/AVR.td b/lib/Target/AVR/AVR.td
index 9e80717cd680..27cf212704f0 100644
--- a/lib/Target/AVR/AVR.td
+++ b/lib/Target/AVR/AVR.td
@@ -514,9 +514,9 @@ include "AVRRegisterInfo.td"
// Instruction Descriptions
//===---------------------------------------------------------------------===//
-//include "AVRInstrInfo.td"
+include "AVRInstrInfo.td"
-//def AVRInstrInfo : InstrInfo;
+def AVRInstrInfo : InstrInfo;
//===---------------------------------------------------------------------===//
// Calling Conventions
@@ -554,7 +554,7 @@ include "AVRCallingConv.td"
//===---------------------------------------------------------------------===//
def AVR : Target {
-// let InstructionSet = AVRInstrInfo;
+ let InstructionSet = AVRInstrInfo;
// let AssemblyWriters = [AVRAsmWriter];
//
// let AssemblyParsers = [AVRAsmParser];
diff --git a/lib/Target/AVR/AVRConfig.h b/lib/Target/AVR/AVRConfig.h
deleted file mode 100644
index 65588bc50840..000000000000
--- a/lib/Target/AVR/AVRConfig.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//===-- AVRConfig.h - AVR Backend Configuration Header ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_AVR_CONFIG_H
-#define LLVM_AVR_CONFIG_H
-
-#define LLVM_AVR_GCC_COMPAT
-
-#endif // LLVM_AVR_CONFIG_H
diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h
new file mode 100644
index 000000000000..850a43abebfa
--- /dev/null
+++ b/lib/Target/AVR/AVRFrameLowering.h
@@ -0,0 +1,46 @@
+//===-- AVRFrameLowering.h - Define frame lowering for AVR ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_FRAME_LOWERING_H
+#define LLVM_AVR_FRAME_LOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// Utilities for creating function call frames.
+class AVRFrameLowering : public TargetFrameLowering {
+public:
+ explicit AVRFrameLowering();
+
+public:
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_FRAME_LOWERING_H
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
new file mode 100644
index 000000000000..2c8c9c88b6dd
--- /dev/null
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -0,0 +1,152 @@
+//===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AVR uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ISEL_LOWERING_H
+#define LLVM_AVR_ISEL_LOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace AVRISD {
+
+/// AVR Specific DAG Nodes
+enum NodeType {
+ /// Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ /// Return from subroutine.
+ RET_FLAG,
+ /// Return from ISR.
+ RETI_FLAG,
+ /// Represents an abstract call instruction,
+ /// which includes a bunch of information.
+ CALL,
+ /// A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ WRAPPER,
+ LSL, ///< Logical shift left.
+ LSR, ///< Logical shift right.
+ ASR, ///< Arithmetic shift right.
+ ROR, ///< Bit rotate right.
+ ROL, ///< Bit rotate left.
+ LSLLOOP, ///< A loop of single logical shift left instructions.
+ LSRLOOP, ///< A loop of single logical shift right instructions.
+ ASRLOOP, ///< A loop of single arithmetic shift right instructions.
+ /// AVR conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+ /// Compare instruction.
+ CMP,
+ /// Compare with carry instruction.
+ CMPC,
+ /// Test for zero or minus instruction.
+ TST,
+ /// Operand 0 and operand 1 are the selection variables, operand 2
+ /// is the condition code and operand 3 is the flag operand.
+ SELECT_CC
+};
+
+} // end of namespace AVRISD
+
+class AVRTargetMachine;
+
+/// Performs target lowering for the AVR.
+class AVRTargetLowering : public TargetLowering {
+public:
+ explicit AVRTargetLowering(AVRTargetMachine &TM);
+
+public:
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
+ return MVT::i8;
+ }
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
+
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+private:
+ SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
+ SelectionDAG &DAG, SDLoc dl) const;
+ SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+private:
+ MachineBasicBlock *insertShift(MachineInstr *MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *insertMul(MachineInstr *MI, MachineBasicBlock *BB) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ISEL_LOWERING_H
diff --git a/lib/Target/AVR/AVRInstrFormats.td b/lib/Target/AVR/AVRInstrFormats.td
new file mode 100644
index 000000000000..c10023dd2c0a
--- /dev/null
+++ b/lib/Target/AVR/AVRInstrFormats.td
@@ -0,0 +1,577 @@
+//===-- AVRInstrInfo.td - AVR Instruction Formats ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AVR Instruction Format Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+// A generic AVR instruction.
+class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction
+{
+ let Namespace = "AVR";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+/// A 16-bit AVR instruction.
+class AVRInst16<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst<outs, ins, asmstr, pattern>
+{
+ field bits<16> Inst;
+
+ let Size = 2;
+}
+
+/// A 32-bit AVR instruction.
+class AVRInst32<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst<outs, ins, asmstr, pattern>
+{
+ field bits<32> Inst;
+
+ let Size = 4;
+}
+
+// A class for pseudo instructions.
+// Pseudo instructions are not real AVR instructions. The DAG stores
+// pseudo instructions which are replaced by real AVR instructions by
+// AVRExpandPseudoInsts.cpp.
+//
+// For example, the ADDW (add wide, as in add 16 bit values) instruction
+// is defined as a pseudo instruction. In AVRExpandPseudoInsts.cpp,
+// the instruction is then replaced by two add instructions - one for each byte.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ let Pattern = pattern;
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
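+
+// As a concrete illustration of the expansion described above (the register
+// choice here is only an example): `addw r24, r22` on the pairs R25:R24 and
+// R23:R22 would be expanded into `add r24, r22` followed by `adc r25, r23`.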
+
+//===----------------------------------------------------------------------===//
+// Register / register instruction: <|opcode|ffrd|dddd|rrrr|>
+// opcode = 4 bits.
+// f = secondary opcode = 2 bits
+// d = destination = 5 bits
+// r = source = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+ bits<5> rr;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = f;
+ let Inst{9} = rr{4};
+ let Inst{8-4} = rd;
+ let Inst{3-0} = rr{3-0};
+}
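+
+// Encoding example (hand-computed from the field layout above, for
+// illustration): with opcode = 0b0000 and f = 0b11 (ADD), `add r24, r25`
+// encodes rd = 0b11000 and rr = 0b11001, giving 0b0000_1111_1000_1001 = 0x0F89.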
+
+class FTST<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = f;
+ let Inst{9} = rd{4};
+ let Inst{8-4} = rd;
+ let Inst{3-0} = rd{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction of the format `<mnemonic> Z, Rd`
+// <|1001|001r|rrrr|0ttt>
+//===----------------------------------------------------------------------===//
+class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-9} = 0b001;
+ let Inst{8} = rd{4};
+
+ let Inst{7-4} = rd{3-0};
+
+ let Inst{3} = 0;
+ let Inst{2-0} = t;
+}
+
+//===----------------------------------------------------------------------===//
+// Register / immediate8 instruction: <|opcode|KKKK|dddd|KKKK|>
+// opcode = 4 bits.
+// K = constant data = 8 bits
+// d = destination = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+class FRdK<bits<4> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<4> rd;
+ bits<8> k;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-8} = k{7-4};
+ let Inst{7-4} = rd{3-0};
+ let Inst{3-0} = k{3-0};
+
+ let isAsCheapAsAMove = 1;
+}
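+
+// Encoding example (hand-computed): with opcode = 0b1110 (LDI),
+// `ldi r16, 0xAB` encodes rd = 0b0000 (the low four bits of r16) and
+// K = 0xAB, giving 0b1110_1010_0000_1011 = 0xEA0B.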
+
+//===----------------------------------------------------------------------===//
+// Register instruction: <|opcode|fffd|dddd|ffff|>
+// opcode = 4 bits.
+// f = secondary opcode = 7 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> d;
+
+ let Inst{15-12} = opcode;
+ let Inst{11-9} = f{6-4};
+ let Inst{8-4} = d;
+ let Inst{3-0} = f{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// [STD/LDD] P+q, Rr special encoding: <|10q0|qqtr|rrrr|pqqq>
+// t = type (1 for STD, 0 for LDD)
+// q = displacement (6 bits)
+// r = register (5 bits)
+// p = pointer register (1 bit) [1 for Y, 0 for Z]
+//===----------------------------------------------------------------------===//
+class FSTDLDD<bit type, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<7> memri;
+ bits<5> reg; // the GP register
+
+ let Inst{15-14} = 0b10;
+ let Inst{13} = memri{5};
+ let Inst{12} = 0;
+
+ let Inst{11-10} = memri{4-3};
+ let Inst{9} = type;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ let Inst{3} = memri{6};
+ let Inst{2-0} = memri{2-0};
+}
+
+//===---------------------------------------------------------------------===//
+// An ST/LD instruction.
+// <|100i|00tr|rrrr|ppaa|>
+// t = type (1 for store, 0 for load)
+// a = regular/postinc/predec (reg = 0b00, postinc = 0b01, predec = 0b10)
+// p = pointer register
+// r = src/dst register
+//
+// Note that the bit labelled 'i' above does not follow a simple pattern,
+// so there exists a post encoder method to set it manually.
+//===---------------------------------------------------------------------===//
+class FSTLD<bit type, bits<2> mode, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<2> ptrreg;
+ bits<5> reg;
+
+ let Inst{15-13} = 0b100;
+ // This bit varies depending on the arguments and the mode.
+ // We have a post encoder method to set this bit manually.
+ let Inst{12} = 0;
+
+ let Inst{11-10} = 0b00;
+ let Inst{9} = type;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ let Inst{3-2} = ptrreg{1-0};
+ let Inst{1-0} = mode{1-0};
+
+ let PostEncoderMethod = "loadStorePostEncoder";
+}
+
+//===---------------------------------------------------------------------===//
+// Special format for the LPM/ELPM instructions
+// [E]LPM Rd, Z[+]
+// <|1001|000d|dddd|01ep>
+// d = destination register
+// e = is elpm
+// p = is postincrement
+//===---------------------------------------------------------------------===//
+class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> reg;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-9} = 0b000;
+ let Inst{8} = reg{4};
+
+ let Inst{7-4} = reg{3-0};
+
+ let Inst{3-2} = 0b01;
+ let Inst{1} = e;
+ let Inst{0} = p;
+}
+
+//===----------------------------------------------------------------------===//
+// MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|>
+// d = destination = 4 bits
+// r = source = 4 bits
+// (Only accepts even registers)
+//===----------------------------------------------------------------------===//
+class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> d;
+ bits<5> r;
+
+ let Inst{15-8} = 0b00000001;
+ let Inst{7-4} = d{4-1};
+ let Inst{3-0} = r{4-1};
+}
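+
+// Encoding example (hand-computed): `movw r24, r22` encodes d{4-1} = 0b1100
+// and r{4-1} = 0b1011, giving 0b0000_0001_1100_1011 = 0x01CB.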
+
+//===----------------------------------------------------------------------===//
+// MULSrr special encoding: <|0000|0010|dddd|rrrr|>
+// d = multiplicand = 4 bits
+// r = multiplier = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd; // accept 5 bits but only encode the lower 4
+ bits<5> rr; // accept 5 bits but only encode the lower 4
+
+ let Inst{15-9} = 0b0000001;
+ let Inst{8} = f;
+ let Inst{7-4} = rd{3-0};
+ let Inst{3-0} = rr{3-0};
+}
+
+// Special encoding for the FMUL family of instructions.
+//
+// <0000|0011|fddd|frrr|>
+//
+// ff = 0b01 for FMUL
+// 0b10 for FMULS
+// 0b11 for FMULSU
+//
+// ddd = destination register
+// rrr = source register
+class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<3> rd;
+ bits<3> rr;
+
+ let Inst{15-8} = 0b00000011;
+ let Inst{7} = f{1};
+ let Inst{6-4} = rd;
+ let Inst{3} = f{0};
+ let Inst{2-0} = rr;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic word instructions (ADIW / SBIW): <|1001|011f|kkdd|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant data = 6 bits
+// d = destination = 4 bits
+// (Only accepts r25:24 r27:26 r29:28 r31:30)
+//===----------------------------------------------------------------------===//
+class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> dst; // accept 5 bits but only encode bits 1 and 2
+ bits<6> k;
+
+ let Inst{15-9} = 0b1001011;
+ let Inst{8} = f;
+ let Inst{7-6} = k{5-4};
+ let Inst{5-4} = dst{2-1};
+ let Inst{3-0} = k{3-0};
+}
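+
+// Encoding example (hand-computed): with f = 0 (ADIW), `adiw r24, 1` selects
+// the r25:r24 pair (d bits = 0b00) and k = 1, giving
+// 0b1001_0110_0000_0001 = 0x9601.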
+
+//===----------------------------------------------------------------------===//
+// IN I/O instruction: <|1011|0AAd|dddd|AAAA|>
+// A = I/O location address = 6 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> d;
+ bits<6> A;
+
+ let Inst{15-11} = 0b10110;
+ let Inst{10-9} = A{5-4};
+ let Inst{8-4} = d;
+ let Inst{3-0} = A{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// OUT I/O instruction: <|1011|1AAr|rrrr|AAAA|>
+// A = I/O location address = 6 bits
+// r = source register = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<6> A;
+ bits<5> r;
+
+ let Inst{15-11} = 0b10111;
+ let Inst{10-9} = A{5-4};
+ let Inst{8-4} = r;
+ let Inst{3-0} = A{3-0};
+}
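+
+// Encoding example (hand-computed): `out 0x3f, r16` (0x3f is SREG's I/O
+// address on classic devices) encodes A = 0b111111 and r = 0b10000, giving
+// 0b1011_1111_0000_1111 = 0xBF0F.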
+
+//===----------------------------------------------------------------------===//
+// I/O bit instruction.
+// <|1001|10tt|AAAA|Abbb>
+// t = type (0b00 for CBI, 0b01 for SBIC, 0b10 for SBI, 0b11 for SBIS)
+// A = I/O location address (5 bits)
+// b = bit number
+//===----------------------------------------------------------------------===//
+class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> A;
+ bits<3> b;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-10} = 0b10;
+ let Inst{9-8} = t;
+
+ let Inst{7-4} = A{4-1};
+
+ let Inst{3} = A{0};
+ let Inst{2-0} = b{2-0};
+}
+
+//===----------------------------------------------------------------------===//
+// BLD/BST/SBRC/SBRS instructions.
+// <|1111|1ttd|dddd|0bbb>
+// t = type (0b00 for BLD, 0b01 for BST, 0b10 for SBRC, 0b11 for SBRS)
+// d = destination register
+// b = bit
+//===----------------------------------------------------------------------===//
+class FRdB<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+ bits<3> b;
+
+ let Inst{15-12} = 0b1111;
+
+ let Inst{11} = 0b1;
+ let Inst{10-9} = t;
+ let Inst{8} = rd{4};
+
+ let Inst{7-4} = rd{3-0};
+
+ let Inst{3} = 0;
+ let Inst{2-0} = b;
+}
+
+// Special encoding for the `DES K` instruction.
+//
+// <|1001|0100|KKKK|1011>
+//
+// KKKK = 4 bit immediate
+class FDES<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<4> k;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-8} = 0b0100;
+
+ let Inst{7-4} = k;
+
+ let Inst{3-0} = 0b1011;
+}
+
+//===----------------------------------------------------------------------===//
+// Conditional Branching instructions: <|1111|0fkk|kkkk|ksss|>
+// f = secondary opcode = 1 bit
+// k = constant address = 7 bits
+// s = bit in status register = 3 bits
+//===----------------------------------------------------------------------===//
+class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<7> k;
+
+ let Inst{15-11} = 0b11110;
+ let Inst{10} = f;
+ let Inst{9-3} = k;
+ let Inst{2-0} = s;
+}
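+
+// Encoding example (hand-computed): `breq .+2` (f = 0, s = 0b001, k = +1 words)
+// encodes as 0b1111_0000_0000_1001 = 0xF009.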
+
+//===----------------------------------------------------------------------===//
+// Special, opcode only instructions: <|opcode|>
+//===----------------------------------------------------------------------===//
+
+class F16<bits<16> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ let Inst = opcode;
+}
+
+class F32<bits<32> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+{
+ let Inst = opcode;
+}
+
+//===----------------------------------------------------------------------===//
+// Branching instructions with immediate12: <|110f|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant address = 12 bits
+//===----------------------------------------------------------------------===//
+class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<12> k;
+
+ let Inst{15-13} = 0b110;
+ let Inst{12} = f;
+ let Inst{11-0} = k;
+}
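+
+// Encoding example (hand-computed): `rjmp .+0` (f = 0, k = 0) encodes as
+// 0xC000, while `rcall .+0` (f = 1) encodes as 0xD000.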
+
+//===----------------------------------------------------------------------===//
+// 32 bits branching instructions: <|1001|010k|kkkk|fffk|kkkk|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 3 bits
+// k = constant address = 22 bits
+//===----------------------------------------------------------------------===//
+class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+{
+ bits<22> k;
+
+ let Inst{31-25} = 0b1001010;
+ let Inst{24-20} = k{21-17};
+ let Inst{19-17} = f;
+ let Inst{16-0} = k{16-0};
+}
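+
+// Encoding example (hand-computed): `jmp 0` (f = 0b110, k = 0) encodes as the
+// two words 0x940C 0x0000, while `call 0` (f = 0b111) encodes as 0x940E 0x0000.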
+
+//===----------------------------------------------------------------------===//
+// 32 bits direct mem instructions: <|1001|00fd|dddd|0000|kkkk|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 1 bit
+// d = destination = 5 bits
+// k = constant address = 16 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class F32DM<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst32<outs, ins, asmstr, pattern>
+{
+ bits<5> rd;
+ bits<16> k;
+
+ let Inst{31-28} = 0b1001;
+
+ let Inst{27-26} = 0b00;
+ let Inst{25} = f;
+ let Inst{24} = rd{4};
+
+ let Inst{23-20} = rd{3-0};
+
+ let Inst{19-16} = 0b0000;
+
+ let Inst{15-0} = k;
+}
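+
+// Encoding example (hand-computed): with f = 0 (LDS), `lds r24, 0x0100`
+// encodes rd = 0b11000 followed by k = 0x0100, giving the two words
+// 0x9180 0x0100.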
+
+// <|1001|0100|bfff|1000>
+class FS<bit b, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<3> s;
+
+ let Inst{15-12} = 0b1001;
+
+ let Inst{11-8} = 0b0100;
+
+ let Inst{7} = b;
+ let Inst{6-4} = s;
+
+ let Inst{3-0} = 0b1000;
+}
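+
+// Encoding example (hand-computed): b = 0 with s = 0b000 is SEC (set carry),
+// 0x9408; b = 1 with s = 0b000 is CLC (clear carry), 0x9488.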
+
+// Branch if a bit in the status register is set/cleared.
+// <BRBS|BRBC> s, k
+// ---------------------
+// <|1111|0fkk|kkkk|ksss>
+class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern>
+{
+ bits<7> k;
+ bits<3> s;
+
+ let Inst{15-12} = 0b1111;
+
+ let Inst{11} = 0;
+ let Inst{10} = f;
+ let Inst{9-8} = k{6-5};
+
+ let Inst{7-4} = k{4-1};
+
+ let Inst{3} = k{0};
+ let Inst{2-0} = s;
+}
+
+class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+{
+ let Defs = [SREG];
+}
+
+class StorePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+{
+ let Defs = [SP];
+}
+
+class SelectPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+{
+ let usesCustomInserter = 1;
+
+ let Uses = [SREG];
+}
+
+class ShiftPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Pseudo<outs, ins, asmstr, pattern>
+{
+ let usesCustomInserter = 1;
+
+ let Defs = [SREG];
+}
+
diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp
new file mode 100644
index 000000000000..0327c015cbbb
--- /dev/null
+++ b/lib/Target/AVR/AVRInstrInfo.cpp
@@ -0,0 +1,466 @@
+//===-- AVRInstrInfo.cpp - AVR Instruction Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstrInfo.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AVRGenInstrInfo.inc"
+
+namespace llvm {
+
+AVRInstrInfo::AVRInstrInfo()
+ : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI() {}
+
+void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ unsigned Opc;
+
+ if (AVR::GPR8RegClass.contains(DestReg, SrcReg)) {
+ Opc = AVR::MOVRdRr;
+ } else if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) {
+ Opc = AVR::MOVWRdRr;
+ } else if (SrcReg == AVR::SP && AVR::DREGSRegClass.contains(DestReg)) {
+ Opc = AVR::SPREAD;
+ } else if (DestReg == AVR::SP && AVR::DREGSRegClass.contains(SrcReg)) {
+ Opc = AVR::SPWRITE;
+ } else {
+ llvm_unreachable("Impossible reg-to-reg copy");
+ }
+
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned AVRInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ case AVR::LDDRdPtrQ:
+ case AVR::LDDWRdYQ: { //:FIXME: remove this once PR13375 gets fixed
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+unsigned AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ switch (MI.getOpcode()) {
+ case AVR::STDPtrQRr:
+ case AVR::STDWPtrQRr: {
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+
+ DebugLoc DL;
+ if (MI != MBB.end()) {
+ DL = MI->getDebugLoc();
+ }
+
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+
+ unsigned Opcode = 0;
+ if (RC->hasType(MVT::i8)) {
+ Opcode = AVR::STDPtrQRr;
+ } else if (RC->hasType(MVT::i16)) {
+ Opcode = AVR::STDWPtrQRr;
+ } else {
+ llvm_unreachable("Cannot store this register into a stack slot!");
+ }
+
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+}
+
+void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (MI != MBB.end()) {
+ DL = MI->getDebugLoc();
+ }
+
+ MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+
+ unsigned Opcode = 0;
+ if (RC->hasType(MVT::i8)) {
+ Opcode = AVR::LDDRdPtrQ;
+ } else if (RC->hasType(MVT::i16)) {
+ // Opcode = AVR::LDDWRdPtrQ;
+ //:FIXME: remove this once PR13375 gets fixed
+ Opcode = AVR::LDDWRdYQ;
+ } else {
+ llvm_unreachable("Cannot load this register from a stack slot!");
+ }
+
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addMemOperand(MMO);
+}
+
+const MCInstrDesc &AVRInstrInfo::getBrCond(AVRCC::CondCodes CC) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case AVRCC::COND_EQ:
+ return get(AVR::BREQk);
+ case AVRCC::COND_NE:
+ return get(AVR::BRNEk);
+ case AVRCC::COND_GE:
+ return get(AVR::BRGEk);
+ case AVRCC::COND_LT:
+ return get(AVR::BRLTk);
+ case AVRCC::COND_SH:
+ return get(AVR::BRSHk);
+ case AVRCC::COND_LO:
+ return get(AVR::BRLOk);
+ case AVRCC::COND_MI:
+ return get(AVR::BRMIk);
+ case AVRCC::COND_PL:
+ return get(AVR::BRPLk);
+ }
+}
+
+AVRCC::CondCodes AVRInstrInfo::getCondFromBranchOpc(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ return AVRCC::COND_INVALID;
+ case AVR::BREQk:
+ return AVRCC::COND_EQ;
+ case AVR::BRNEk:
+ return AVRCC::COND_NE;
+ case AVR::BRSHk:
+ return AVRCC::COND_SH;
+ case AVR::BRLOk:
+ return AVRCC::COND_LO;
+ case AVR::BRMIk:
+ return AVRCC::COND_MI;
+ case AVR::BRPLk:
+ return AVRCC::COND_PL;
+ case AVR::BRGEk:
+ return AVRCC::COND_GE;
+ case AVR::BRLTk:
+ return AVRCC::COND_LT;
+ }
+}
+
+AVRCC::CondCodes AVRInstrInfo::getOppositeCondition(AVRCC::CondCodes CC) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Invalid condition!");
+ case AVRCC::COND_EQ:
+ return AVRCC::COND_NE;
+ case AVRCC::COND_NE:
+ return AVRCC::COND_EQ;
+ case AVRCC::COND_SH:
+ return AVRCC::COND_LO;
+ case AVRCC::COND_LO:
+ return AVRCC::COND_SH;
+ case AVRCC::COND_GE:
+ return AVRCC::COND_LT;
+ case AVRCC::COND_LT:
+ return AVRCC::COND_GE;
+ case AVRCC::COND_MI:
+ return AVRCC::COND_PL;
+ case AVRCC::COND_PL:
+ return AVRCC::COND_MI;
+ }
+}
+
+bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue()) {
+ continue;
+ }
+
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isUnpredicatedTerminator(*I)) {
+ break;
+ }
+
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!I->getDesc().isBranch()) {
+ return true;
+ }
+
+ // Handle unconditional branches.
+ //:TODO: add here jmp
+ if (I->getOpcode() == AVR::RJMPk) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (std::next(I) != MBB.end()) {
+ std::next(I)->eraseFromParent();
+ }
+
+ Cond.clear();
+      FBB = nullptr;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+      // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ AVRCC::CondCodes BranchCode = getCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == AVRCC::COND_INVALID) {
+ return true; // Can't handle indirect branch.
+ }
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Which is a bit more efficient.
+ // We conditionally jump to the fall-through block.
+ BranchCode = getOppositeCondition(BranchCode);
+ unsigned JNCC = getBrCond(BranchCode).getOpcode();
+ MachineBasicBlock::iterator OldInst = I;
+
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(AVR::RJMPk))
+ .addMBB(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to
+ // the same destination.
+ if (TBB != I->getOperand(0).getMBB()) {
+ return true;
+ }
+
+ AVRCC::CondCodes OldBranchCode = (AVRCC::CondCodes)Cond[0].getImm();
+ // If the conditions are the same, we can leave them alone.
+ if (OldBranchCode == BranchCode) {
+ continue;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+unsigned AVRInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "AVR branch conditions have one component!");
+
+ if (Cond.empty()) {
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm();
+ BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
+ ++Count;
+
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned AVRInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue()) {
+ continue;
+ }
+ //:TODO: add here the missing jmp instructions once they are implemented
+ // like jmp, {e}ijmp, and other cond branches, ...
+ if (I->getOpcode() != AVR::RJMPk &&
+ getCondFromBranchOpc(I->getOpcode()) == AVRCC::COND_INVALID) {
+ break;
+ }
+
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+bool AVRInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid AVR branch condition!");
+
+ AVRCC::CondCodes CC = static_cast<AVRCC::CondCodes>(Cond[0].getImm());
+ Cond[0].setImm(getOppositeCondition(CC));
+
+ return false;
+}
+
+unsigned AVRInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ unsigned Opcode = MI->getOpcode();
+
+ switch (Opcode) {
+ // A regular instruction
+ default: {
+ const MCInstrDesc &Desc = get(Opcode);
+ return Desc.getSize();
+ }
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::DBG_VALUE:
+ return 0;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF->getTarget());
+ const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+ return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
+ *TM.getMCAsmInfo());
+ }
+ }
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h
new file mode 100644
index 000000000000..fc8945d82432
--- /dev/null
+++ b/lib/Target/AVR/AVRInstrInfo.h
@@ -0,0 +1,110 @@
+//===-- AVRInstrInfo.h - AVR Instruction Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_INSTR_INFO_H
+#define LLVM_AVR_INSTR_INFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include "AVRRegisterInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AVRGenInstrInfo.inc"
+#undef GET_INSTRINFO_HEADER
+
+namespace llvm {
+
+namespace AVRCC {
+
+/// AVR specific condition codes.
+/// These correspond to `AVR_*_COND` in `AVRInstrInfo.td`.
+/// They must be kept in sync.
+enum CondCodes {
+ COND_EQ, //!< Equal
+ COND_NE, //!< Not equal
+ COND_GE, //!< Greater than or equal
+ COND_LT, //!< Less than
+ COND_SH, //!< Unsigned same or higher
+ COND_LO, //!< Unsigned lower
+ COND_MI, //!< Minus
+ COND_PL, //!< Plus
+ COND_INVALID
+};
+
+} // end of namespace AVRCC
+
+namespace AVRII {
+
+/// Specifies a target operand flag.
+enum TOF {
+ MO_NO_FLAG,
+
+ /// On a symbol operand, this represents the lo part.
+ MO_LO = (1 << 1),
+
+ /// On a symbol operand, this represents the hi part.
+ MO_HI = (1 << 2),
+
+ /// On a symbol operand, this represents it has to be negated.
+ MO_NEG = (1 << 3)
+};
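+
+// Illustrative note (assuming the usual avr-as `lo8()`/`hi8()` modifiers): a
+// 16-bit symbol address is typically materialized as a register pair, e.g.
+// `ldi r24, lo8(sym)` / `ldi r25, hi8(sym)`, with MO_LO and MO_HI selecting
+// which half of the symbol an operand refers to.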
+
+} // end of namespace AVRII
+
+/// Utilities related to the AVR instruction set.
+class AVRInstrInfo : public AVRGenInstrInfo {
+public:
+ explicit AVRInstrInfo();
+
+ const AVRRegisterInfo &getRegisterInfo() const { return RI; }
+ const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const;
+ AVRCC::CondCodes getCondFromBranchOpc(unsigned Opc) const;
+ AVRCC::CondCodes getOppositeCondition(AVRCC::CondCodes CC) const;
+ unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ // Branch analysis.
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+private:
+ const AVRRegisterInfo RI;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_INSTR_INFO_H
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
new file mode 100644
index 000000000000..e75683680150
--- /dev/null
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -0,0 +1,1981 @@
+//===-- AVRInstrInfo.td - AVR Instruction defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AVR instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "AVRInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// AVR Type Profiles
+//===----------------------------------------------------------------------===//
+
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def SDT_AVRBrcond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+def SDT_AVRCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_AVRTst : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_AVRSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Specific Node Definitions
+//===----------------------------------------------------------------------===//
+
+def AVRretflag : SDNode<"AVRISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AVRretiflag : SDNode<"AVRISD::RETI_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AVRCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def AVRcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AVRCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AVRcall : SDNode<"AVRISD::CALL", SDT_AVRCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRWrapper : SDNode<"AVRISD::WRAPPER", SDT_AVRWrapper>;
+
+def AVRbrcond : SDNode<"AVRISD::BRCOND", SDT_AVRBrcond,
+ [SDNPHasChain, SDNPInGlue]>;
+def AVRcmp : SDNode<"AVRISD::CMP", SDT_AVRCmp, [SDNPOutGlue]>;
+def AVRcmpc : SDNode<"AVRISD::CMPC", SDT_AVRCmp, [SDNPInGlue, SDNPOutGlue]>;
+def AVRtst : SDNode<"AVRISD::TST", SDT_AVRTst, [SDNPOutGlue]>;
+def AVRselectcc: SDNode<"AVRISD::SELECT_CC", SDT_AVRSelectCC, [SDNPInGlue]>;
+
+// Shift nodes.
+def AVRlsl : SDNode<"AVRISD::LSL", SDTIntUnaryOp>;
+def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>;
+def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
+def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
+def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
+
+// Pseudo shift nodes for non-constant shift amounts.
+def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
+def AVRlsrLoop : SDNode<"AVRISD::LSRLOOP", SDTIntShiftOp>;
+def AVRasrLoop : SDNode<"AVRISD::ASRLOOP", SDTIntShiftOp>;
+
+//===----------------------------------------------------------------------===//
+// AVR Operands, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def imm8_neg_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i8);
+}]>;
+
+def imm16_neg_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i16);
+}]>;
+
+def imm0_63_neg : PatLeaf<(imm),
+[{
+ int64_t val = -N->getSExtValue();
+ return val >= 0 && val < 64;
+}], imm16_neg_XFORM>;
+
+def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
+
+def ioaddr_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N), MVT::i8);
+}]>;
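+
+// On classic AVR devices the 64 I/O registers (I/O addresses 0x00-0x3f) are
+// also mapped into data space at 0x20-0x5f, hence the subtraction of 0x20
+// above; e.g. SREG at data address 0x5f corresponds to I/O address 0x3f.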
+
+def iobitpos8_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(Log2_32(uint8_t(N->getZExtValue())),
+ SDLoc(N), MVT::i8);
+}]>;
+
+def iobitposn8_XFORM : SDNodeXForm<imm,
+[{
+ return CurDAG->getTargetConstant(Log2_32(uint8_t(~N->getZExtValue())),
+ SDLoc(N), MVT::i8);
+}]>;
+
+def ioaddr8 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x60;
+}], ioaddr_XFORM>;
+
+def lowioaddr8 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x40;
+}], ioaddr_XFORM>;
+
+def ioaddr16 : PatLeaf<(imm),
+[{
+ uint64_t val = N->getZExtValue();
+ return val >= 0x20 && val < 0x5f;
+}], ioaddr_XFORM>;
+
+def iobitpos8 : PatLeaf<(imm),
+[{
+ return isPowerOf2_32(uint8_t(N->getZExtValue()));
+}], iobitpos8_XFORM>;
+
+def iobitposn8 : PatLeaf<(imm),
+[{
+ return isPowerOf2_32(uint8_t(~N->getZExtValue()));
+}], iobitposn8_XFORM>;
+
+def MemriAsmOperand : AsmOperandClass {
+ let Name = "Memri";
+ let ParserMethod = "parseMemriOperand";
+}
+
+/// Address operand for `reg+imm` used by STD and LDD.
+def memri : Operand<iPTR>
+{
+ let MIOperandInfo = (ops PTRDISPREGS, i16imm);
+
+ let PrintMethod = "printMemri";
+ let EncoderMethod = "encodeMemri";
+
+ let ParserMatchClass = MemriAsmOperand;
+}
+
+// Address operand for `SP+imm` used by STD{W}SPQRr
+def memspi : Operand<iPTR>
+{
+ let MIOperandInfo = (ops GPRSP, i16imm);
+}
+
+def i8imm_com : Operand<i8>
+{
+ let EncoderMethod = "encodeComplement";
+
+ let MIOperandInfo = (ops i8imm);
+}
+
+def relbrtarget_7 : Operand<OtherVT>
+{
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+}
+
+def brtarget_13 : Operand<OtherVT>
+{
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+}
+
+// The target of a 22 or 16-bit call/jmp instruction.
+def call_target : Operand<iPTR>
+{
+ let EncoderMethod = "encodeCallTarget";
+}
+
+// Addressing mode pattern reg+imm6
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], [SDNPWantRoot]>;
+
+// AsmOperand class for a pointer register.
+// Used with the LD/ST family of instructions.
+// See FSTLD in AVRInstrFormats.td
+def PtrRegAsmOperand : AsmOperandClass
+{
+ let Name = "Reg";
+}
+
+// A special operand type for the LD/ST instructions.
+// It converts the pointer register number into a two-bit field used in the
+// instruction.
+def LDSTPtrReg : Operand<i16>
+{
+ let MIOperandInfo = (ops PTRREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
+
+ let ParserMatchClass = PtrRegAsmOperand;
+}
+
+// A special operand type for the LDD/STD instructions.
+// It behaves identically to the LD/ST version, except restricts
+// the pointer registers to Y and Z.
+def LDDSTDPtrReg : Operand<i16>
+{
+ let MIOperandInfo = (ops PTRDISPREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
+
+ let ParserMatchClass = PtrRegAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// AVR predicates for subtarget features
+//===----------------------------------------------------------------------===//
+
+def HasSRAM : Predicate<"Subtarget->hasSRAM()">,
+ AssemblerPredicate<"FeatureSRAM">;
+
+def HasJMPCALL : Predicate<"Subtarget->hasJMPCALL()">,
+ AssemblerPredicate<"FeatureJMPCALL">;
+
+def HasIJMPCALL : Predicate<"Subtarget->hasIJMPCALL()">,
+ AssemblerPredicate<"FeatureIJMPCALL">;
+
+def HasEIJMPCALL : Predicate<"Subtarget->hasEIJMPCALL()">,
+ AssemblerPredicate<"FeatureEIJMPCALL">;
+
+def HasADDSUBIW : Predicate<"Subtarget->hasADDSUBIW()">,
+ AssemblerPredicate<"FeatureADDSUBIW">;
+
+def HasSmallStack     : Predicate<"Subtarget->hasSmallStack()">,
+ AssemblerPredicate<"FeatureSmallStack">;
+
+def HasMOVW : Predicate<"Subtarget->hasMOVW()">,
+ AssemblerPredicate<"FeatureMOVW">;
+
+def HasLPM : Predicate<"Subtarget->hasLPM()">,
+ AssemblerPredicate<"FeatureLPM">;
+
+def HasLPMX : Predicate<"Subtarget->hasLPMX()">,
+ AssemblerPredicate<"FeatureLPMX">;
+
+def HasELPM : Predicate<"Subtarget->hasELPM()">,
+ AssemblerPredicate<"FeatureELPM">;
+
+def HasELPMX : Predicate<"Subtarget->hasELPMX()">,
+ AssemblerPredicate<"FeatureELPMX">;
+
+def HasSPM : Predicate<"Subtarget->hasSPM()">,
+ AssemblerPredicate<"FeatureSPM">;
+
+def HasSPMX : Predicate<"Subtarget->hasSPMX()">,
+ AssemblerPredicate<"FeatureSPMX">;
+
+def HasDES : Predicate<"Subtarget->hasDES()">,
+ AssemblerPredicate<"FeatureDES">;
+
+def SupportsRMW : Predicate<"Subtarget->supportsRMW()">,
+ AssemblerPredicate<"FeatureRMW">;
+
+def SupportsMultiplication : Predicate<"Subtarget->supportsMultiplication()">,
+ AssemblerPredicate<"FeatureMultiplication">;
+
+def HasBREAK : Predicate<"Subtarget->hasBREAK()">,
+ AssemblerPredicate<"FeatureBREAK">;
+
+def HasTinyEncoding : Predicate<"Subtarget->hasTinyEncoding()">,
+ AssemblerPredicate<"FeatureTinyEncoding">;
+
+
+// AVR specific condition codes. These correspond to the `AVRCC::CondCodes`
+// enum in AVRInstrInfo.h. They must be kept in sync.
+def AVR_COND_EQ : PatLeaf<(i8 0)>;
+def AVR_COND_NE : PatLeaf<(i8 1)>;
+def AVR_COND_GE : PatLeaf<(i8 2)>;
+def AVR_COND_LT : PatLeaf<(i8 3)>;
+def AVR_COND_SH : PatLeaf<(i8 4)>;
+def AVR_COND_LO : PatLeaf<(i8 5)>;
+def AVR_COND_MI : PatLeaf<(i8 6)>;
+def AVR_COND_PL : PatLeaf<(i8 7)>;
+
+
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// AVR Instruction list
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SREG.
+let Defs = [SP, SREG],
+Uses = [SP] in
+{
+ def ADJCALLSTACKDOWN : Pseudo<(outs),
+ (ins i16imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(AVRcallseq_start timm:$amt)]>;
+
+  // R31R30 is used to update SP. Since it is a scratch register and this
+  // instruction is placed after the function call, R31R30 should always be free.
+ //let Defs = [R31R30],
+ //Uses = [R31R30] in
+ //:TODO: if we enable this, the pseudo is killed because it looks dead
+ def ADJCALLSTACKUP : Pseudo<(outs),
+ (ins i16imm:$amt1, i16imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(AVRcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+let isCommutable = 1,
+Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // ADD Rd, Rr
+ // Adds two 8-bit registers.
+ def ADDRdRr : FRdRr<0b0000,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "add\t$rd, $rr",
+ [(set i8:$rd, (add i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ADDW Rd+1:Rd, Rr+1:Rr
+ // Pseudo instruction to add four 8-bit registers as two 16-bit values.
+ //
+ // Expands to:
+ // add Rd, Rr
+ // adc Rd+1, Rr+1
+ def ADDWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "addw\t$rd, $rr",
+ [(set i16:$rd, (add i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ // ADC Rd, Rr
+ // Adds two 8-bit registers with carry.
+ let Uses = [SREG] in
+ def ADCRdRr : FRdRr<0b0001,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "adc\t$rd, $rr",
+ [(set i8:$rd, (adde i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ADCW Rd+1:Rd, Rr+1:Rr
+ // Pseudo instruction to add four 8-bit registers as two 16-bit values with
+ // carry.
+ //
+ // Expands to:
+ // adc Rd, Rr
+ // adc Rd+1, Rr+1
+ let Uses = [SREG] in
+ def ADCWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "adcw\t$rd, $rr",
+ [(set i16:$rd, (adde i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+  // ADIW Rd, K
+ // Adds an immediate 6-bit value K to Rd, placing the result in Rd.
+ def ADIWRdK : FWRdK<0b0,
+ (outs IWREGS:$rd),
+ (ins IWREGS:$src, i16imm:$k),
+ "adiw\t$rd, $k",
+ [(set i16:$rd, (add i16:$src, uimm6:$k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // SUB Rd, Rr
+ // Subtracts the 8-bit value of Rr from Rd and places the value in Rd.
+ def SUBRdRr : FRdRr<0b0001,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "sub\t$rd, $rr",
+ [(set i8:$rd, (sub i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // SUBW Rd+1:Rd, Rr+1:Rr
+ // Subtracts two 16-bit values and places the result into Rd.
+ //
+ // Expands to:
+ // sub Rd, Rr
+ // sbc Rd+1, Rr+1
+ def SUBWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "subw\t$rd, $rr",
+ [(set i16:$rd, (sub i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def SUBIRdK : FRdK<0b0101,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm:$k),
+ "subi\t$rd, $k",
+ [(set i8:$rd, (sub i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // SUBIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // subi Rd, K
+ // sbci Rd+1, K+1
+ def SUBIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "subiw\t$rd, $rr",
+ [(set i16:$rd, (sub i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+
+ def SBIWRdK : FWRdK<0b1,
+ (outs IWREGS:$rd),
+ (ins IWREGS:$src, i16imm:$k),
+ "sbiw\t$rd, $k",
+ [(set i16:$rd, (sub i16:$src, uimm6:$k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
+
+ // Subtract with carry operations which must read the carry flag in SREG.
+ let Uses = [SREG] in
+ {
+ def SBCRdRr : FRdRr<0b0000,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "sbc\t$rd, $rr",
+ [(set i8:$rd, (sube i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // SBCW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // sbc Rd, Rr
+ // sbc Rd+1, Rr+1
+ def SBCWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "sbcw\t$rd, $rr",
+ [(set i16:$rd, (sube i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def SBCIRdK : FRdK<0b0100,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm:$k),
+ "sbci\t$rd, $k",
+ [(set i8:$rd, (sube i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // SBCIW Rd+1:Rd, K+1:K
+ // sbci Rd, K
+ // sbci Rd+1, K+1
+ def SBCIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "sbciw\t$rd, $rr",
+ [(set i16:$rd, (sube i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Increment and Decrement
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def INCRd : FRd<0b1001,
+ 0b0100011,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "inc\t$rd",
+ [(set i8:$rd, (add i8:$src, 1)), (implicit SREG)]>;
+
+ def DECRd : FRd<0b1001,
+ 0b0101010,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "dec\t$rd",
+ [(set i8:$rd, (add i8:$src, -1)), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1,
+Defs = [R1, R0, SREG] in
+{
+ // MUL Rd, Rr
+ // Multiplies Rd by Rr and places the result into R1:R0.
+ let usesCustomInserter = 1 in {
+ def MULRdRr : FRdRr<0b1001, 0b11,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "mul\t$lhs, $rhs",
+ [/*(set R1, R0, (smullohi i8:$lhs, i8:$rhs))*/]>,
+ Requires<[SupportsMultiplication]>;
+
+ def MULSRdRr : FMUL2RdRr<0,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "muls\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+ }
+
+ def MULSURdRr : FMUL2RdRr<1,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "mulsu\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMUL : FFMULRdRr<0b01,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmul\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMULS : FFMULRdRr<0b10,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmuls\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+
+ def FMULSU : FFMULRdRr<0b11,
+ (outs),
+ (ins GPR8:$lhs, GPR8:$rhs),
+ "fmulsu\t$lhs, $rhs",
+ []>,
+ Requires<[SupportsMultiplication]>;
+}
+
+let Defs = [R15, R14, R13, R12, R11, R10, R9,
+ R8, R7, R6, R5, R4, R3, R2, R1, R0] in
+def DESK : FDES<(outs),
+ (ins i8imm:$k),
+ "des\t$k",
+ []>,
+ Requires<[HasDES]>;
+
+//===----------------------------------------------------------------------===//
+// Logic
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // Register-Register logic instructions (which have the
+ // property of commutativity).
+ let isCommutable = 1 in
+ {
+ def ANDRdRr : FRdRr<0b0010,
+ 0b00,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "and\t$rd, $rr",
+ [(set i8:$rd, (and i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ANDW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // and Rd, Rr
+ // and Rd+1, Rr+1
+ def ANDWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "andw\t$rd, $rr",
+ [(set i16:$rd, (and i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def ORRdRr : FRdRr<0b0010,
+ 0b10,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "or\t$rd, $rr",
+ [(set i8:$rd, (or i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // ORW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // or Rd, Rr
+ // or Rd+1, Rr+1
+ def ORWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "orw\t$rd, $rr",
+ [(set i16:$rd, (or i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+
+ def EORRdRr : FRdRr<0b0010,
+ 0b01,
+ (outs GPR8:$rd),
+ (ins GPR8:$src, GPR8:$rr),
+ "eor\t$rd, $rr",
+ [(set i8:$rd, (xor i8:$src, i8:$rr)),
+ (implicit SREG)]>;
+
+ // EORW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // eor Rd, Rr
+ // eor Rd+1, Rr+1
+ def EORWRdRr : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src, DREGS:$rr),
+ "eorw\t$rd, $rr",
+ [(set i16:$rd, (xor i16:$src, i16:$rr)),
+ (implicit SREG)]>;
+ }
+
+ def ANDIRdK : FRdK<0b0111,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm:$k),
+ "andi\t$rd, $k",
+ [(set i8:$rd, (and i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+  // ANDIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // andi Rd, K
+ // andi Rd+1, K+1
+ def ANDIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$k),
+ "andiw\t$rd, $k",
+ [(set i16:$rd, (and i16:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ def ORIRdK : FRdK<0b0110,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm:$k),
+ "ori\t$rd, $k",
+ [(set i8:$rd, (or i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+  // ORIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // ori Rd, K
+ // ori Rd+1, K+1
+ def ORIWRdK : Pseudo<(outs DLDREGS:$rd),
+ (ins DLDREGS:$src, i16imm:$rr),
+ "oriw\t$rd, $rr",
+ [(set i16:$rd, (or i16:$src, imm:$rr)),
+ (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// One's/Two's Complement
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def COMRd : FRd<0b1001,
+ 0b0100000,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "com\t$rd",
+ [(set i8:$rd, (not i8:$src)), (implicit SREG)]>;
+
+ // COMW Rd+1:Rd
+ //
+ // Expands to:
+ // com Rd
+ // com Rd+1
+ def COMWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "comw\t$rd",
+ [(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
+
+ //:TODO: optimize NEG for wider types
+ def NEGRd : FRd<0b1001,
+ 0b0100001,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "neg\t$rd",
+ [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
+}
+
+// TST Rd
+// Test for zero or minus.
+// This operation is identical to `AND Rd, Rd`.
+//def : InstAlias<"tst\t$rd", (ANDRdRr GPR8:$rd, GPR8:$rd), 1>;
+
+let Defs = [SREG] in
+def TSTRd : FTST<0b0010,
+ 0b00,
+ (outs),
+ (ins GPR8:$rd),
+ "tst\t$rd",
+ [(AVRtst i8:$rd)]>;
+
+//===----------------------------------------------------------------------===//
+// Jump instructions
+//===----------------------------------------------------------------------===//
+let isBarrier = 1,
+isBranch = 1,
+isTerminator = 1 in
+{
+ def RJMPk : FBRk<0,
+ (outs),
+ (ins brtarget_13:$target),
+ "rjmp\t$target",
+ [(br bb:$target)]>;
+
+ let isIndirectBranch = 1,
+ Uses = [R31R30] in
+ def IJMP : F16<0b1001010000001001,
+ (outs),
+ (ins),
+ "ijmp",
+ []>,
+ Requires<[HasIJMPCALL]>;
+
+ let isIndirectBranch = 1,
+ Uses = [R31R30] in
+ def EIJMP : F16<0b1001010000011001,
+ (outs),
+ (ins),
+ "eijmp",
+ []>,
+ Requires<[HasEIJMPCALL]>;
+
+ def JMPk : F32BRk<0b110,
+ (outs),
+ (ins call_target:$k),
+ "jmp\t$k",
+ []>,
+ Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call instructions
+//===----------------------------------------------------------------------===//
+let isCall = 1 in
+{
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP] in
+ def RCALLk : FBRk<1,
+ (outs),
+ (ins brtarget_13:$target),
+ "rcall\t$target",
+ []>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP, R31R30] in
+ def ICALL : F16<0b1001010100001001,
+ (outs),
+ (ins variable_ops),
+ "icall",
+ []>,
+ Requires<[HasIJMPCALL]>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ let Uses = [SP, R31R30] in
+ def EICALL : F16<0b1001010100011001,
+ (outs),
+ (ins variable_ops),
+ "eicall",
+ []>,
+ Requires<[HasEIJMPCALL]>;
+
+ // SP is marked as a use to prevent stack-pointer assignments that appear
+ // immediately before calls from potentially appearing dead.
+ //
+ //:TODO: the imm field can be either 16 or 22 bits in devices with more
+ // than 64k of ROM, fix it once we support the largest devices.
+ let Uses = [SP] in
+ def CALLk : F32BRk<0b111,
+ (outs),
+ (ins call_target:$k),
+ "call\t$k",
+ [(AVRcall imm:$k)]>,
+ Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return instructions.
+//===----------------------------------------------------------------------===//
+let isTerminator = 1,
+isReturn = 1,
+isBarrier = 1 in
+{
+ def RET : F16<0b1001010100001000,
+ (outs),
+ (ins),
+ "ret",
+ [(AVRretflag)]>;
+
+ def RETI : F16<0b1001010100011000,
+ (outs),
+ (ins),
+ "reti",
+ [(AVRretiflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Compare operations.
+//===----------------------------------------------------------------------===//
+let Defs = [SREG] in
+{
+ // CPSE Rd, Rr
+ // Compare Rd and Rr, skipping the next instruction if they are equal.
+ let isBarrier = 1,
+ isBranch = 1,
+ isTerminator = 1 in
+ def CPSE : FRdRr<0b0001,
+ 0b00,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cpse\t$rd, $rr",
+ []>;
+
+ def CPRdRr : FRdRr<0b0001,
+ 0b01,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cp\t$rd, $rr",
+ [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>;
+
+ // CPW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // cp Rd, Rr
+ // cpc Rd+1, Rr+1
+ def CPWRdRr : Pseudo<(outs),
+ (ins DREGS:$src, DREGS:$src2),
+ "cpw\t$src, $src2",
+ [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>;
+
+ let Uses = [SREG] in
+ def CPCRdRr : FRdRr<0b0000,
+ 0b01,
+ (outs),
+ (ins GPR8:$rd, GPR8:$rr),
+ "cpc\t$rd, $rr",
+ [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>;
+
+  // CPCW Rd+1:Rd, Rr+1:Rr
+ //
+ // Expands to:
+ // cpc Rd, Rr
+ // cpc Rd+1, Rr+1
+ let Uses = [SREG] in
+ def CPCWRdRr : Pseudo<(outs),
+ (ins DREGS:$src, DREGS:$src2),
+ "cpcw\t$src, $src2",
+ [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>;
+
+ // CPI Rd, K
+ // Compares a register with an 8 bit immediate.
+ let Uses = [SREG] in
+ def CPIRdK : FRdK<0b0011,
+ (outs),
+ (ins GPR8:$rd, i8imm:$k),
+ "cpi\t$rd, $k",
+ [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register conditional skipping/branching operations.
+//===----------------------------------------------------------------------===//
+let isBranch = 1,
+isTerminator = 1 in
+{
+ // Conditional skipping on GPR register bits, and
+ // conditional skipping on IO register bits.
+ let isBarrier = 1 in
+ {
+ def SBRCRrB : FRdB<0b10,
+ (outs),
+ (ins GPR8:$rr, i8imm:$b),
+ "sbrc\t$rr, $b",
+ []>;
+
+ def SBRSRrB : FRdB<0b11,
+ (outs),
+ (ins GPR8:$rr, i8imm:$b),
+ "sbrs\t$rr, $b",
+ []>;
+
+ def SBICAb : FIOBIT<0b01,
+ (outs),
+ (ins i16imm:$a, i8imm:$b),
+ "sbic\t$a, $b",
+ []>;
+
+ def SBISAb : FIOBIT<0b11,
+ (outs),
+ (ins i16imm:$a, i8imm:$b),
+ "sbis\t$a, $b",
+ []>;
+ }
+
+ // Relative branches on status flag bits.
+ let Uses = [SREG] in
+ {
+ // BRBS s, k
+ // Branch if `s` flag in status register is set.
+ def BRBSsk : FSK<0,
+ (outs),
+ (ins i8imm:$s, relbrtarget_7:$k),
+ "brbs\t$s, $k",
+ []>;
+
+ // BRBC s, k
+ // Branch if `s` flag in status register is clear.
+ def BRBCsk : FSK<1,
+ (outs),
+ (ins i8imm:$s, relbrtarget_7:$k),
+ "brbc\t$s, $k",
+ []>;
+ }
+}
+
+
+// BRCS k
+// Branch if carry flag is set
+def : InstAlias<"brcs\t$k", (BRBSsk 0, relbrtarget_7:$k)>;
+
+// BRCC k
+// Branch if carry flag is clear
+def : InstAlias<"brcc\t$k", (BRBCsk 0, relbrtarget_7:$k)>;
+
+// BRHS k
+// Branch if half carry flag is set
+def : InstAlias<"brhs\t$k", (BRBSsk 5, relbrtarget_7:$k)>;
+
+// BRHC k
+// Branch if half carry flag is clear
+def : InstAlias<"brhc\t$k", (BRBCsk 5, relbrtarget_7:$k)>;
+
+// BRTS k
+// Branch if the T flag is set
+def : InstAlias<"brts\t$k", (BRBSsk 6, relbrtarget_7:$k)>;
+
+// BRTC k
+// Branch if the T flag is clear
+def : InstAlias<"brtc\t$k", (BRBCsk 6, relbrtarget_7:$k)>;
+
+// BRVS k
+// Branch if the overflow flag is set
+def : InstAlias<"brvs\t$k", (BRBSsk 3, relbrtarget_7:$k)>;
+
+// BRVC k
+// Branch if the overflow flag is clear
+def : InstAlias<"brvc\t$k", (BRBCsk 3, relbrtarget_7:$k)>;
+
+// BRIE k
+// Branch if the global interrupt flag is enabled
+def : InstAlias<"brie\t$k", (BRBSsk 7, relbrtarget_7:$k)>;
+
+// BRID k
+// Branch if the global interrupt flag is disabled
+def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7:$k)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative conditional branches
+//===----------------------------------------------------------------------===//
+// Based on the status register. We cannot simplify these into instruction
+// aliases because we also need to be able to specify a pattern to match for
+// ISel.
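+//
+// For example, a source-level `if (a == b)` is typically selected into a
+// compare followed by one of these branches (registers and label are
+// illustrative):
+//   cp   r24, r22
+//   breq .LBB0_2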
+let isBranch = 1,
+isTerminator = 1,
+Uses = [SREG] in
+{
+ def BREQk : FBRsk<0,
+ 0b001,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "breq\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_EQ)]>;
+
+ def BRNEk : FBRsk<1,
+ 0b001,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brne\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_NE)]>;
+
+
+ def BRSHk : FBRsk<1,
+ 0b000,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brsh\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_SH)]>;
+
+ def BRLOk : FBRsk<0,
+ 0b000,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brlo\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_LO)]>;
+
+ def BRMIk : FBRsk<0,
+ 0b010,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brmi\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_MI)]>;
+
+ def BRPLk : FBRsk<1,
+ 0b010,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brpl\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_PL)]>;
+
+ def BRGEk : FBRsk<1,
+ 0b100,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brge\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_GE)]>;
+
+ def BRLTk : FBRsk<0,
+ 0b100,
+ (outs),
+ (ins relbrtarget_7:$target),
+ "brlt\t$target",
+ [(AVRbrcond bb:$target, AVR_COND_LT)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Data transfer instructions
+//===----------------------------------------------------------------------===//
+// 8-bit and 16-bit register move instructions.
+let hasSideEffects = 0 in
+{
+ def MOVRdRr : FRdRr<0b0010,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$rr),
+ "mov\t$rd, $rr",
+ []>;
+
+ def MOVWRdRr : FMOVWRdRr<(outs DREGS:$dst),
+ (ins DREGS:$src),
+ "movw\t$dst, $src",
+ []>,
+ Requires<[HasMOVW]>;
+}
+
+// Load immediate values into registers.
+let isReMaterializable = 1 in
+{
+ def LDIRdK : FRdK<0b1110,
+ (outs LD8:$rd),
+ (ins i8imm:$k),
+ "ldi\t$rd, $k",
+ [(set i8:$rd, imm:$k)]>;
+
+ // LDIW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // ldi Rd, K
+ // ldi Rd+1, K+1
+ def LDIWRdK : Pseudo<(outs DLDREGS:$dst),
+ (ins i16imm:$src),
+ "ldiw\t$dst, $src",
+ [(set i16:$dst, imm:$src)]>;
+}
+
+// Load from data space into register.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def LDSRdK : F32DM<0b0,
+ (outs GPR8:$rd),
+ (ins i16imm:$k),
+ "lds\t$rd, $k",
+ [(set i8:$rd, (load imm:$k))]>,
+ Requires<[HasSRAM]>;
+
+ // LDSW Rd+1:Rd, K+1:K
+ //
+ // Expands to:
+ // lds Rd, (K+1:K)
+// lds Rd+1, (K+1:K) + 1
+ def LDSWRdK : Pseudo<(outs DREGS:$dst),
+ (ins i16imm:$src),
+ "ldsw\t$dst, $src",
+ [(set i16:$dst, (load imm:$src))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Indirect loads.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def LDRdPtr : FSTLD<0,
+ 0b00,
+ (outs GPR8:$reg),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, $ptrreg",
+ [(set GPR8:$reg, (load i16:$ptrreg))]>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, P
+ //
+ // Expands to:
+ // ld Rd, P+
+ // ld Rd+1, P+
+ let Constraints = "@earlyclobber $reg" in
+ def LDWRdPtr : Pseudo<(outs DREGS:$reg),
+ (ins PTRDISPREGS:$ptrreg),
+ "ldw\t$reg, $ptrreg",
+ [(set i16:$reg, (load i16:$ptrreg))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Indirect loads (with postincrement or predecrement).
+let mayLoad = 1,
+hasSideEffects = 0,
+Constraints = "$ptrreg = $base_wb,@earlyclobber $reg,@earlyclobber $base_wb" in
+{
+ def LDRdPtrPi : FSTLD<0,
+ 0b01,
+ (outs GPR8:$reg, PTRREGS:$base_wb),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, $ptrreg+",
+ []>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, P+
+ // Expands to:
+ // ld Rd, P+
+ // ld Rd+1, P+
+ def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg),
+ "ldw\t$reg, $ptrreg+",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def LDRdPtrPd : FSTLD<0,
+ 0b10,
+ (outs GPR8:$reg, PTRREGS:$base_wb),
+ (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, -$ptrreg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ // LDW Rd+1:Rd, -P
+ //
+ // Expands to:
+ // ld Rd+1, -P
+ // ld Rd, -P
+ def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg),
+ "ldw\t$reg, -$ptrreg",
+ []>,
+ Requires<[HasSRAM]>;
+}
+
+// Load indirect with displacement operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def LDDRdPtrQ : FSTDLDD<0,
+ (outs GPR8:$reg),
+ (ins memri:$memri),
+ "ldd\t$reg, $memri",
+ [(set i8:$reg, (load addr:$memri))]>,
+ Requires<[HasSRAM]>;
+
+ // LDDW Rd+1:Rd, P+q
+ //
+ // Expands to:
+ // ldd Rd, P+q
+ // ldd Rd+1, P+q+1
+ let Constraints = "@earlyclobber $dst" in
+ def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst),
+ (ins memri:$memri),
+ "lddw\t$dst, $memri",
+ [(set i16:$dst, (load addr:$memri))]>,
+ Requires<[HasSRAM]>;
+
+ //:FIXME: remove this once PR13375 gets fixed
+ // Bug report: https://llvm.org/bugs/show_bug.cgi?id=13375
+ let mayLoad = 1,
+ hasSideEffects = 0 in
+ def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
+ (ins memri:$memri),
+ "lddw\t$dst, $memri",
+ []>,
+ Requires<[HasSRAM]>;
+}
+
+// Store from register to data space.
+def STSKRr : F32DM<0b1,
+ (outs),
+ (ins i16imm:$k, GPR8:$rd),
+ "sts\t$k, $rd",
+ [(store i8:$rd, imm:$k)]>,
+ Requires<[HasSRAM]>;
+
+// STSW K+1:K, Rr+1:Rr
+//
+// Expands to:
+// sts (K+1:K) + 1, Rr+1
+// sts (K+1:K), Rr
+def STSWKRr : Pseudo<(outs),
+ (ins i16imm:$dst, DREGS:$src),
+ "stsw\t$dst, $src",
+ [(store i16:$src, imm:$dst)]>,
+ Requires<[HasSRAM]>;
+
+// Indirect stores.
+// ST P, Rr
+// Stores the value of Rr into the location addressed by pointer P.
+def STPtrRr : FSTLD<1,
+ 0b00,
+ (outs),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg),
+ "st\t$ptrreg, $reg",
+ [(store GPR8:$reg, i16:$ptrreg)]>,
+ Requires<[HasSRAM]>;
+
+// STW P, Rr+1:Rr
+// Stores the value of the register pair Rr+1:Rr into the location addressed by
+// pointer P.
+//
+// Expands to:
+// st P, Rr
+// std P+1, Rr+1
+def STWPtrRr : Pseudo<(outs),
+ (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
+ "stw\t$ptrreg, $reg",
+ [(store i16:$reg, i16:$ptrreg)]>,
+ Requires<[HasSRAM]>;
+
+// Indirect stores (with postincrement or predecrement).
+let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
+{
+
+ // ST P+, Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Post increments P.
+ def STPtrPiRr : FSTLD<1,
+ 0b01,
+ (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t$ptrreg+, $reg",
+ [(set i16:$base_wb,
+ (post_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // STW P+, Rr+1:Rr
+ // Stores the value of the register pair Rr+1:Rr into the location addressed
+ // by pointer P.
+ // Post increments P.
+ //
+ // Expands to:
+ // st P+, Rr
+ // st P+, Rr+1
+ def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs),
+ "stw\t$ptrreg+, $trh",
+ [(set PTRREGS:$base_wb,
+ (post_store DREGS:$trh, PTRREGS:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // ST -P, Rr
+ // Stores the value of Rr into the location addressed by pointer P.
+ // Pre decrements P.
+ def STPtrPdRr : FSTLD<1,
+ 0b10,
+ (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t-$ptrreg, $reg",
+ [(set i16:$base_wb,
+ (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+
+ // STW -P, Rr+1:Rr
+ // Stores the value of the register pair Rr+1:Rr into the location addressed
+ // by pointer P.
+ // Pre decrements P.
+ //
+ // Expands to:
+ // st -P, Rr+1
+ // st -P, Rr
+ def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs),
+ "stw\t-$ptrreg, $reg",
+ [(set PTRREGS:$base_wb,
+ (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>,
+ Requires<[HasSRAM]>;
+}
+
+// Store indirect with displacement operations.
+// STD P+q, Rr
+// Stores the value of Rr into the location addressed by pointer P with a
+// displacement of q. Does not modify P.
+def STDPtrQRr : FSTDLDD<1,
+ (outs),
+ (ins memri:$memri, GPR8:$reg),
+ "std\t$memri, $reg",
+ [(store i8:$reg, addr:$memri)]>,
+ Requires<[HasSRAM]>;
+
+// STDW P+q, Rr+1:Rr
+// Stores the value of the register pair Rr+1:Rr into the location addressed
+// by pointer P with a displacement of q. Does not modify P.
+//
+// Expands to:
+// std P+q, Rr
+// std P+q+1, Rr+1
+def STDWPtrQRr : Pseudo<(outs),
+ (ins memri:$memri, DREGS:$src),
+ "stdw\t$memri, $src",
+ [(store i16:$src, addr:$memri)]>,
+ Requires<[HasSRAM]>;
+
+
+// Load program memory operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1,
+hasSideEffects = 0 in
+{
+ let Defs = [R0],
+ Uses = [R31R30] in
+ def LPM : F16<0b1001010111001000,
+ (outs),
+ (ins),
+ "lpm",
+ []>,
+ Requires<[HasLPM]>;
+
+ def LPMRdZ : FLPMX<0,
+ 0,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "lpm\t$dst, $z",
+ []>,
+ Requires<[HasLPMX]>;
+
+ def LPMWRdZ : Pseudo<(outs DREGS:$dst),
+ (ins ZREGS:$z),
+ "lpmw\t$dst, $z",
+ []>,
+ Requires<[HasLPMX]>;
+
+ // Load program memory, while postincrementing the Z register.
+ let mayLoad = 1,
+ Defs = [R31R30] in
+ {
+ def LPMRdZPi : FLPMX<0,
+ 1,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "lpm\t$dst, $z+",
+ []>,
+ Requires<[HasLPMX]>;
+
+ def LPMWRdZPi : Pseudo<(outs DREGS:$dst),
+ (ins ZREGS:$z),
+ "lpmw\t$dst, $z+",
+ []>,
+ Requires<[HasLPMX]>;
+ }
+}
+
+// Extended load program memory operations.
+let mayLoad = 1,
+hasSideEffects = 0 in
+{
+ let Defs = [R0],
+ Uses = [R31R30] in
+ def ELPM : F16<0b1001010111011000,
+ (outs),
+ (ins),
+ "elpm",
+ []>,
+ Requires<[HasELPM]>;
+
+ def ELPMRdZ : FLPMX<1,
+ 0,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "elpm\t$dst, $z",
+ []>,
+ Requires<[HasELPMX]>;
+
+ let Defs = [R31R30] in
+ def ELPMRdZPi : FLPMX<1,
+ 1,
+ (outs GPR8:$dst),
+ (ins ZREGS:$z),
+ "elpm\t$dst, $z+",
+ []>,
+ Requires<[HasELPMX]>;
+}
+
+// Store program memory operations.
+let Uses = [R1, R0] in
+{
+ let Uses = [R31R30, R1, R0] in
+ def SPM : F16<0b1001010111101000,
+ (outs),
+ (ins),
+ "spm",
+ []>,
+ Requires<[HasSPM]>;
+
+ let Defs = [R31R30] in
+ def SPMZPi : F16<0b1001010111111000,
+ (outs),
+ (ins ZREGS:$z),
+ "spm $z+",
+ []>,
+ Requires<[HasSPMX]>;
+}
+
+// Operations that read data from an I/O location.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+ def INRdA : FIORdA<(outs GPR8:$dst),
+ (ins i16imm:$src),
+ "in\t$dst, $src",
+ [(set i8:$dst, (load ioaddr8:$src))]>;
+
+ def INWRdA : Pseudo<(outs DREGS:$dst),
+ (ins i16imm:$src),
+ "inw\t$dst, $src",
+ [(set i16:$dst, (load ioaddr16:$src))]>;
+}
+
+// Operations that write data to an I/O location.
+def OUTARr : FIOARr<(outs),
+ (ins i16imm:$dst, GPR8:$src),
+ "out\t$dst, $src",
+ [(store i8:$src, ioaddr8:$dst)]>;
+
+def OUTWARr : Pseudo<(outs),
+ (ins i16imm:$dst, DREGS:$src),
+ "outw\t$dst, $src",
+ [(store i16:$src, ioaddr16:$dst)]>;
+
+// Stack push/pop operations.
+let Defs = [SP],
+Uses = [SP],
+hasSideEffects = 0 in
+{
+ // Stack push operations.
+ let mayStore = 1 in
+ {
+ def PUSHRr : FRd<0b1001,
+ 0b0011111,
+ (outs),
+ (ins GPR8:$reg),
+ "push\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def PUSHWRr : Pseudo<(outs),
+ (ins DREGS:$reg),
+ "pushw\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+ }
+
+ // Stack pop operations.
+ let mayLoad = 1 in
+ {
+ def POPRd : FRd<0b1001,
+ 0b0001111,
+ (outs GPR8:$reg),
+ (ins),
+ "pop\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+
+ def POPWRd : Pseudo<(outs DREGS:$reg),
+ (ins),
+ "popw\t$reg",
+ []>,
+ Requires<[HasSRAM]>;
+ }
+}
+
+// Read-Modify-Write (RMW) instructions.
+def XCHZRd : FZRd<0b100,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "xch\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LASZRd : FZRd<0b101,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "las\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LACZRd : FZRd<0b110,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "lac\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+def LATZRd : FZRd<0b111,
+ (outs GPR8:$rd),
+ (ins ZREGS:$z),
+ "lat\t$z, $rd",
+ []>,
+ Requires<[SupportsRMW]>;
+
+//===----------------------------------------------------------------------===//
+// Bit and bit-test instructions
+//===----------------------------------------------------------------------===//
+
+// Bit shift/rotate operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ def LSLRd : FRdRr<0b0000,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lsl\t$rd",
+ [(set i8:$rd, (AVRlsl i8:$src)), (implicit SREG)]>;
+
+ def LSLWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "lslw\t$rd",
+ [(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
+
+ def LSRRd : FRd<0b1001,
+ 0b0100110,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lsr\t$rd",
+ [(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
+
+ def LSRWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "lsrw\t$rd",
+ [(set i16:$rd, (AVRlsr i16:$src)), (implicit SREG)]>;
+
+ def ASRRd : FRd<0b1001,
+ 0b0100101,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "asr\t$rd",
+ [(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
+
+ def ASRWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "asrw\t$rd",
+ [(set i16:$rd, (AVRasr i16:$src)), (implicit SREG)]>;
+
+ // Bit rotate operations.
+ let Uses = [SREG] in
+ {
+ def ROLRd : FRdRr<0b0001,
+ 0b11,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "rol\t$rd",
+ [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
+
+ def ROLWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "rolw\t$rd",
+ [(set i16:$rd, (AVRrol i16:$src)), (implicit SREG)]>;
+
+ def RORRd : FRd<0b1001,
+ 0b0100111,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "ror\t$rd",
+ [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
+
+ def RORWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "rorw\t$rd",
+ [(set i16:$rd, (AVRror i16:$src)), (implicit SREG)]>;
+ }
+}
+
+// SWAP Rd
+// Swaps the high and low nibbles in a register.
+let Constraints = "$src = $rd" in
+def SWAPRd : FRd<0b1001,
+ 0b0100010,
+ (outs GPR8:$rd),
+ (ins GPR8:$src),
+ "swap\t$rd",
+ [(set i8:$rd, (bswap i8:$src))]>;
+
+// IO register bit set/clear operations.
+//:TODO: add patterns for when popcount(imm)==2 so they are expanded into two
+// sbi/cbi instructions instead of an in+ori+out sequence, which needs one
+// more instruction.
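+//
+// For example (I/O address and scratch register are illustrative), setting
+// bits 1 and 3 of one I/O register could then be emitted as:
+//   sbi 0x05, 1
+//   sbi 0x05, 3
+// instead of:
+//   in  r16, 0x05
+//   ori r16, 0b00001010
+//   out 0x05, r16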
+def SBIAb : FIOBIT<0b10,
+ (outs),
+ (ins i16imm:$addr, i8imm:$bit),
+ "sbi\t$addr, $bit",
+ [(store (or (i8 (load lowioaddr8:$addr)), iobitpos8:$bit),
+ lowioaddr8:$addr)]>;
+
+def CBIAb : FIOBIT<0b00,
+ (outs),
+ (ins i16imm:$addr, i8imm:$bit),
+ "cbi\t$addr, $bit",
+ [(store (and (i8 (load lowioaddr8:$addr)), iobitposn8:$bit),
+ lowioaddr8:$addr)]>;
+
+// Status register bit load/store operations.
+let Defs = [SREG] in
+def BST : FRdB<0b01,
+ (outs),
+ (ins GPR8:$rd, i8imm:$b),
+ "bst\t$rd, $b",
+ []>;
+
+let Uses = [SREG] in
+def BLD : FRdB<0b00,
+ (outs),
+ (ins GPR8:$rd, i8imm:$b),
+ "bld\t$rd, $b",
+ []>;
+
+// Set/clear bit in register operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+ // SBR Rd, K
+ // Alias for ORI Rd, K
+ def SBRRdK : FRdK<0b0110,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm:$k),
+ "sbr\t$rd, $k",
+ [(set i8:$rd, (or i8:$src, imm:$k)),
+ (implicit SREG)]>;
+
+ // CBR Rd, K
+ // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
+ def CBRRdK : FRdK<0b0111,
+ (outs LD8:$rd),
+ (ins LD8:$src, i8imm_com:$k),
+ "cbr\t$rd, $k",
+ []>;
+}
+
+// CLR Rd
+// Alias for EOR Rd, Rd
+// -------------
+// Clears all bits in a register.
+def CLR : InstAlias<"clr\t$rd", (EORRdRr GPR8:$rd, GPR8:$rd)>;
+
+// SER Rd
+// Alias for LDI Rd, 0xff
+// ---------
+// Sets all bits in a register.
+def : InstAlias<"ser\t$rd", (LDIRdK LD8:$rd, 0xff), 0>;
+
+let Defs = [SREG] in
+def BSETs : FS<0,
+ (outs),
+ (ins i8imm:$s),
+ "bset\t$s",
+ []>;
+
+let Defs = [SREG] in
+def BCLRs : FS<1,
+ (outs),
+ (ins i8imm:$s),
+ "bclr\t$s",
+ []>;
+
+// Set/clear aliases for the carry (C) status flag (bit 0).
+def : InstAlias<"sec", (BSETs 0)>;
+def : InstAlias<"clc", (BCLRs 0)>;
+
+// Set/clear aliases for the zero (Z) status flag (bit 1).
+def : InstAlias<"sez", (BSETs 1)>;
+def : InstAlias<"clz", (BCLRs 1)>;
+
+// Set/clear aliases for the negative (N) status flag (bit 2).
+def : InstAlias<"sen", (BSETs 2)>;
+def : InstAlias<"cln", (BCLRs 2)>;
+
+// Set/clear aliases for the overflow (V) status flag (bit 3).
+def : InstAlias<"sev", (BSETs 3)>;
+def : InstAlias<"clv", (BCLRs 3)>;
+
+// Set/clear aliases for the signed (S) status flag (bit 4).
+def : InstAlias<"ses", (BSETs 4)>;
+def : InstAlias<"cls", (BCLRs 4)>;
+
+// Set/clear aliases for the half-carry (H) status flag (bit 5).
+def : InstAlias<"seh", (BSETs 5)>;
+def : InstAlias<"clh", (BCLRs 5)>;
+
+// Set/clear aliases for the T status flag (bit 6).
+def : InstAlias<"set", (BSETs 6)>;
+def : InstAlias<"clt", (BCLRs 6)>;
+
+// Set/clear aliases for the interrupt (I) status flag (bit 7).
+def : InstAlias<"sei", (BSETs 7)>;
+def : InstAlias<"cli", (BCLRs 7)>;
+
+//===----------------------------------------------------------------------===//
+// Special/Control instructions
+//===----------------------------------------------------------------------===//
+
+// BREAK
+// Breakpoint instruction
+// ---------
+// <|1001|0101|1001|1000>
+def BREAK : F16<0b1001010110011000,
+ (outs),
+ (ins),
+ "break",
+ []>,
+ Requires<[HasBREAK]>;
+
+// NOP
+// No-operation instruction
+// ---------
+// <|0000|0000|0000|0000>
+def NOP : F16<0b0000000000000000,
+ (outs),
+ (ins),
+ "nop",
+ []>;
+
+// SLEEP
+// Sleep instruction
+// ---------
+// <|1001|0101|1000|1000>
+def SLEEP : F16<0b1001010110001000,
+ (outs),
+ (ins),
+ "sleep",
+ []>;
+
+// WDR
+// Watchdog reset
+// ---------
+// <|1001|0101|1010|1000>
+def WDR : F16<0b1001010110101000,
+ (outs),
+ (ins),
+ "wdr",
+ []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions for later expansion
+//===----------------------------------------------------------------------===//
+
+//:TODO: Optimize this for wider types AND optimize the code generated when
+// compiling `int foo(char a, char b, char c, char d) { return d + b; }`;
+// it looks like a missed sext_inreg opportunity.
+def SEXT : ExtensionPseudo<
+ (outs DREGS:$dst),
+ (ins GPR8:$src),
+ "sext\t$dst, $src",
+ [(set i16:$dst, (sext i8:$src)), (implicit SREG)]
+>;
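+
+// A sketch of a typical expansion of the SEXT pseudo above (the real lowering
+// is performed when the pseudo is expanded), sign-extending r24 into r25:r24:
+//   mov r25, r24
+//   lsl r25        ; shift the sign bit into the carry flag
+//   sbc r25, r25   ; r25 becomes 0x00 or 0xff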
+
+def ZEXT : ExtensionPseudo<
+ (outs DREGS:$dst),
+ (ins GPR8:$src),
+ "zext\t$dst, $src",
+ [(set i16:$dst, (zext i8:$src)), (implicit SREG)]
+>;
+
+// This pseudo gets expanded into a movw+adiw pair and thus clobbers SREG.
+let Defs = [SREG],
+ hasSideEffects = 0 in
+def FRMIDX : Pseudo<(outs DLDREGS:$dst),
+ (ins DLDREGS:$src, i16imm:$src2),
+ "frmidx\t$dst, $src, $src2",
+ []>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDSPQRr : StorePseudo<
+ (outs),
+ (ins memspi:$dst, GPR8:$src),
+ "stdstk\t$dst, $src",
+ [(store i8:$src, addr:$dst)]
+>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDWSPQRr : StorePseudo<
+ (outs),
+ (ins memspi:$dst, DREGS:$src),
+ "stdwstk\t$dst, $src",
+ [(store i16:$src, addr:$dst)]
+>;
+
+// SP read/write pseudos.
+let hasSideEffects = 0 in
+{
+ let Uses = [SP] in
+ def SPREAD : Pseudo<
+ (outs DREGS:$dst),
+ (ins GPRSP:$src),
+ "spread\t$dst, $src",
+ []
+ >;
+
+ let Defs = [SP] in
+ def SPWRITE : Pseudo<
+ (outs GPRSP:$dst),
+ (ins DREGS:$src),
+ "spwrite\t$dst, $src",
+ []>;
+}
+
+def Select8 : SelectPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$src2, i8imm:$cc),
+ "# Select8 PSEUDO",
+ [(set i8:$dst, (AVRselectcc i8:$src, i8:$src2, imm:$cc))]
+>;
+
+def Select16 : SelectPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, DREGS:$src2, i8imm:$cc),
+ "# Select16 PSEUDO",
+ [(set i16:$dst, (AVRselectcc i16:$src, i16:$src2, imm:$cc))]
+>;
+
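+// AVR only provides shift-by-one instructions, so shifts by a run-time (or
+// otherwise unexpanded) amount are selected into the following loop pseudos,
+// which are intended to be expanded later into a small loop around the
+// corresponding single-bit shift.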
+def Lsl8 : ShiftPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt),
+ "# Lsl8 PSEUDO",
+ [(set i8:$dst, (AVRlslLoop i8:$src, i8:$cnt))]
+>;
+
+def Lsl16 : ShiftPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt),
+ "# Lsl16 PSEUDO",
+ [(set i16:$dst, (AVRlslLoop i16:$src, i8:$cnt))]
+>;
+
+def Lsr8 : ShiftPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt),
+ "# Lsr8 PSEUDO",
+ [(set i8:$dst, (AVRlsrLoop i8:$src, i8:$cnt))]
+>;
+
+
+def Lsr16 : ShiftPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt),
+ "# Lsr16 PSEUDO",
+ [(set i16:$dst, (AVRlsrLoop i16:$src, i8:$cnt))]
+>;
+
+def Asr8 : ShiftPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt),
+ "# Asr8 PSEUDO",
+ [(set i8:$dst, (AVRasrLoop i8:$src, i8:$cnt))]
+>;
+
+def Asr16 : ShiftPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt),
+ "# Asr16 PSEUDO",
+ [(set i16:$dst, (AVRasrLoop i16:$src, i8:$cnt))]
+>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+//:TODO: look in X86InstrCompiler.td for the odd encoding trick related to
+// add x, 128 -> sub x, -128. Clang currently emits an eor for this (ldi+eor).
+
+// The add instruction always writes the carry flag.
+def : Pat<(addc i8:$src, i8:$src2),
+ (ADDRdRr i8:$src, i8:$src2)>;
+def : Pat<(addc DREGS:$src, DREGS:$src2),
+ (ADDWRdRr DREGS:$src, DREGS:$src2)>;
+
+// All sub instruction variants always write the carry flag.
+def : Pat<(subc i8:$src, i8:$src2),
+ (SUBRdRr i8:$src, i8:$src2)>;
+def : Pat<(subc i16:$src, i16:$src2),
+ (SUBWRdRr i16:$src, i16:$src2)>;
+def : Pat<(subc i8:$src, imm:$src2),
+ (SUBIRdK i8:$src, imm:$src2)>;
+def : Pat<(subc i16:$src, imm:$src2),
+ (SUBIWRdK i16:$src, imm:$src2)>;
+
+// These patterns convert add (x, -imm) to sub (x, imm) since we don't have
+// any add-with-immediate instructions. They also take care of the adiw/sbiw
+// instructions.
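+//
+// For example, adding the immediate 5 to an 8-bit register has no direct
+// encoding, so it is emitted as the equivalent `subi Rd, -5` (i.e. `subi Rd,
+// 251`).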
+def : Pat<(add i16:$src1, imm0_63_neg:$src2),
+ (SBIWRdK i16:$src1, (imm0_63_neg:$src2))>;
+def : Pat<(add i16:$src1, imm:$src2),
+ (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(addc i16:$src1, imm:$src2),
+ (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(adde i16:$src1, imm:$src2),
+ (SBCIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+
+def : Pat<(add i8:$src1, imm:$src2),
+ (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(addc i8:$src1, imm:$src2),
+ (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(adde i8:$src1, imm:$src2),
+ (SBCIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+
+// Calls.
+def : Pat<(AVRcall (i16 tglobaladdr:$dst)),
+ (CALLk tglobaladdr:$dst)>;
+def : Pat<(AVRcall (i16 texternalsym:$dst)),
+ (CALLk texternalsym:$dst)>;
+
+// `anyext`
+def : Pat<(i16 (anyext i8:$src)),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), i8:$src, sub_lo)>;
+
+// `trunc`
+def : Pat<(i8 (trunc i16:$src)),
+ (EXTRACT_SUBREG i16:$src, sub_lo)>;
+
+// sext_inreg
+def : Pat<(sext_inreg i16:$src, i8),
+ (SEXT (i8 (EXTRACT_SUBREG i16:$src, sub_lo)))>;
+
+// GlobalAddress
+def : Pat<(i16 (AVRWrapper tglobaladdr:$dst)),
+ (LDIWRdK tglobaladdr:$dst)>;
+def : Pat<(add i16:$src, (AVRWrapper tglobaladdr:$src2)),
+ (SUBIWRdK i16:$src, tglobaladdr:$src2)>;
+def : Pat<(i8 (load (AVRWrapper tglobaladdr:$dst))),
+ (LDSRdK tglobaladdr:$dst)>;
+def : Pat<(i16 (load (AVRWrapper tglobaladdr:$dst))),
+ (LDSWRdK tglobaladdr:$dst)>;
+def : Pat<(store i8:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+ (STSKRr tglobaladdr:$dst, i8:$src)>;
+def : Pat<(store i16:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+ (STSWKRr tglobaladdr:$dst, i16:$src)>;
+
+// BlockAddress
+def : Pat<(i16 (AVRWrapper tblockaddress:$dst)),
+ (LDIWRdK tblockaddress:$dst)>;
+
+// High-register truncation: trunc(i16 >> 8)
+//:FIXME: It would be better to emit an EXTRACT_SUBREG node in the DAG than
+// all this mess once we get optimal shift code. (Agreed. [@agnat])
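+//
+// This matches the kind of DAG typically produced for C code such as
+//   uint8_t get_high_byte(uint16_t x) { return x >> 8; }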
+def : Pat<(i8 (trunc (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr
+ (AVRlsr DREGS:$src)))))))))),
+ (EXTRACT_SUBREG DREGS:$src, sub_hi)>;
+
+// :FIXME: DAGCombiner produces a shl node after legalization from this sequence:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+def : Pat<(shl i16:$src1, (i8 1)),
+ (LSLWRd i16:$src1)>;
+
diff --git a/lib/Target/AVR/AVRMachineFunctionInfo.h b/lib/Target/AVR/AVRMachineFunctionInfo.h
index 6571d5d3e603..cf0c73576301 100644
--- a/lib/Target/AVR/AVRMachineFunctionInfo.h
+++ b/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -14,15 +14,11 @@
#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
-#include "AVRConfig.h"
-
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
-/**
- * Contains AVR-specific information for each MachineFunction.
- */
+/// Contains AVR-specific information for each MachineFunction.
class AVRMachineFunctionInfo : public MachineFunctionInfo {
/// Indicates if a register has been spilled by the register
/// allocator.
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
new file mode 100644
index 000000000000..5786f74d1c4f
--- /dev/null
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -0,0 +1,256 @@
+//===-- AVRRegisterInfo.cpp - AVR Register Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRRegisterInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+AVRRegisterInfo::AVRRegisterInfo() : AVRGenRegisterInfo(0) {}
+
+const uint16_t *
+AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction()->getCallingConv();
+
+ return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+ ? CSR_Interrupts_SaveList
+ : CSR_Normal_SaveList);
+}
+
+const uint32_t *
+AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+ ? CSR_Interrupts_RegMask
+ : CSR_Normal_RegMask);
+}
+
+BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
+ const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+
+ // Reserve the intermediate result registers r0 and r1.
+ // The result of instructions like 'mul' is always stored here.
+ Reserved.set(AVR::R0);
+ Reserved.set(AVR::R1);
+ Reserved.set(AVR::R1R0);
+
+ // Reserve the stack pointer.
+ Reserved.set(AVR::SPL);
+ Reserved.set(AVR::SPH);
+ Reserved.set(AVR::SP);
+
+ // Reserve the frame pointer registers r28 and r29 if the function requires one.
+ if (TFI->hasFP(MF)) {
+ Reserved.set(AVR::R28);
+ Reserved.set(AVR::R29);
+ Reserved.set(AVR::R29R28);
+ }
+
+ return Reserved;
+}
+
+const TargetRegisterClass *
+AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const {
+ if (RC->hasType(MVT::i16)) {
+ return &AVR::DREGSRegClass;
+ }
+
+ if (RC->hasType(MVT::i8)) {
+ return &AVR::GPR8RegClass;
+ }
+
+ llvm_unreachable("Invalid register size");
+}
+
+/// Fold a frame offset shared between two add instructions into a single one.
+static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) {
+ int Opcode = MI.getOpcode();
+
+ // Don't bother trying if the next instruction is not an add or a sub.
+ if ((Opcode != AVR::SUBIWRdK) && (Opcode != AVR::ADIWRdK)) {
+ return;
+ }
+
+ // Check that DstReg matches the destination of the next instruction;
+ // otherwise the instruction is not related to stack address manipulation.
+ if (DstReg != MI.getOperand(0).getReg()) {
+ return;
+ }
+
+ // Add the offset in the next instruction to our offset.
+ switch (Opcode) {
+ case AVR::SUBIWRdK:
+ Offset += -MI.getOperand(2).getImm();
+ break;
+ case AVR::ADIWRdK:
+ Offset += MI.getOperand(2).getImm();
+ break;
+ }
+
+ // Finally remove the instruction.
+ MI.eraseFromParent();
+}
+
+void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected SPAdj value");
+
+ MachineInstr &MI = *II;
+ DebugLoc dl = MI.getDebugLoc();
+ MachineBasicBlock &MBB = *MI.getParent();
+ const MachineFunction &MF = *MBB.getParent();
+ const AVRTargetMachine &TM = (const AVRTargetMachine &)MF.getTarget();
+ const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ int Offset = MFI->getObjectOffset(FrameIndex);
+
+ // Add one to the offset because SP points to an empty slot.
+ Offset += MFI->getStackSize() - TFI->getOffsetOfLocalArea() + 1;
+ // Fold incoming offset.
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+ // This is actually a "load effective address" of the stack slot. We only
+ // have two-address instructions, so we need to expand it into a move + add.
+ if (MI.getOpcode() == AVR::FRMIDX) {
+ MI.setDesc(TII.get(AVR::MOVWRdRr));
+ MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+
+ assert(Offset > 0 && "Invalid offset");
+
+ // We need to materialize the offset via an add instruction.
+ unsigned Opcode;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer");
+
+ // Generally, to load a frame address, two add instructions are emitted, and
+ // they can often be folded into a single one:
+ // movw r31:r30, r29:r28
+ // adiw r31:r30, 29
+ // adiw r31:r30, 16
+ // to:
+ // movw r31:r30, r29:r28
+ // adiw r31:r30, 45
+ foldFrameOffset(*std::next(II), Offset, DstReg);
+
+ // Select the best opcode based on DstReg and the offset size.
+ switch (DstReg) {
+ case AVR::R25R24:
+ case AVR::R27R26:
+ case AVR::R31R30: {
+ if (isUInt<6>(Offset)) {
+ Opcode = AVR::ADIWRdK;
+ break;
+ }
+ // Fallthrough
+ }
+ default: {
+ // This opcode will get expanded into a pair of subi/sbci.
+ Opcode = AVR::SUBIWRdK;
+ Offset = -Offset;
+ break;
+ }
+ }
+
+ MachineInstr *New = BuildMI(MBB, std::next(II), dl, TII.get(Opcode), DstReg)
+ .addReg(DstReg, RegState::Kill)
+ .addImm(Offset);
+ New->getOperand(3).setIsDead();
+
+ return;
+ }
+
+ // If the offset is too big we have to adjust and restore the frame pointer
+ // to materialize a valid load/store with displacement.
+ //:TODO: consider using only one adiw/sbiw chain for more than one frame index
+ if (Offset >= 63) {
+ unsigned AddOpc = AVR::ADIWRdK, SubOpc = AVR::SBIWRdK;
+ int AddOffset = Offset - 63 + 1;
+
+ // For huge offsets where adiw/sbiw cannot be used, use a pair of subi/sbci
+ // instead.
+ if ((Offset - 63 + 1) > 63) {
+ AddOpc = AVR::SUBIWRdK;
+ SubOpc = AVR::SUBIWRdK;
+ AddOffset = -AddOffset;
+ }
+
+ // It is possible that the spiller places this frame instruction in between
+ // a compare and a branch; the add/sub pairs would then clobber the SREG
+ // contents set by the compare. Conservatively save and restore SREG around
+ // each add/sub pair.
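+ // (0x3f is the I/O-space address of SREG; the reserved R0 register holds
+ // the saved value.)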
+ BuildMI(MBB, II, dl, TII.get(AVR::INRdA), AVR::R0).addImm(0x3f);
+
+ MachineInstr *New = BuildMI(MBB, II, dl, TII.get(AddOpc), AVR::R29R28)
+ .addReg(AVR::R29R28, RegState::Kill)
+ .addImm(AddOffset);
+ New->getOperand(3).setIsDead();
+
+ // Restore SREG.
+ BuildMI(MBB, std::next(II), dl, TII.get(AVR::OUTARr))
+ .addImm(0x3f)
+ .addReg(AVR::R0, RegState::Kill);
+
+ // Don't mark SREG as dead here; otherwise, if the next instruction is a
+ // conditional branch, it would be using a dead register.
+ New = BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28)
+ .addReg(AVR::R29R28, RegState::Kill)
+ .addImm(Offset - 63 + 1);
+
+ Offset = 62;
+ }
+
+ MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+ assert(isUInt<6>(Offset) && "Offset is out of range");
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (TFI->hasFP(MF)) {
+ // The Y pointer register
+ return AVR::R28;
+ }
+
+ return AVR::SP;
+}
+
+const TargetRegisterClass *
+AVRRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ // FIXME: Currently we're using avr-gcc as a reference, so we restrict
+ // pointers to the Y and Z registers. avr-gcc has a buggy implementation of
+ // the memory constraint, so we could fix it and beat avr-gcc here ;-)
+ return &AVR::PTRDISPREGSRegClass;
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/AVR/AVRRegisterInfo.h b/lib/Target/AVR/AVRRegisterInfo.h
new file mode 100644
index 000000000000..59c0849d209b
--- /dev/null
+++ b/lib/Target/AVR/AVRRegisterInfo.h
@@ -0,0 +1,56 @@
+//===-- AVRRegisterInfo.h - AVR Register Information Impl -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_REGISTER_INFO_H
+#define LLVM_AVR_REGISTER_INFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+/// Utilities relating to AVR registers.
+class AVRRegisterInfo : public AVRGenRegisterInfo {
+public:
+ AVRRegisterInfo();
+
+public:
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ /// Stack Frame Processing Methods
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = NULL) const override;
+
+ /// Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ /// Returns a TargetRegisterClass used for pointer values.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_REGISTER_INFO_H
diff --git a/lib/Target/AVR/AVRSelectionDAGInfo.h b/lib/Target/AVR/AVRSelectionDAGInfo.h
index ee832ad1bc75..6474c8779330 100644
--- a/lib/Target/AVR/AVRSelectionDAGInfo.h
+++ b/lib/Target/AVR/AVRSelectionDAGInfo.h
@@ -7,20 +7,19 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the AVR subclass for TargetSelectionDAGInfo.
+// This file defines the AVR subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_AVR_SELECTION_DAG_INFO_H
#define LLVM_AVR_SELECTION_DAG_INFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-/**
- * Holds information about the AVR instruction selection DAG.
- */
-class AVRSelectionDAGInfo : public TargetSelectionDAGInfo {
+
+/// Holds information about the AVR instruction selection DAG.
+class AVRSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
};
diff --git a/lib/Target/AVR/AVRSubtarget.cpp b/lib/Target/AVR/AVRSubtarget.cpp
new file mode 100644
index 000000000000..c228d051d771
--- /dev/null
+++ b/lib/Target/AVR/AVRSubtarget.cpp
@@ -0,0 +1,47 @@
+//===-- AVRSubtarget.cpp - AVR Subtarget Information ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRSubtarget.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define DEBUG_TYPE "avr-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
+ const std::string &FS, AVRTargetMachine &TM)
+ : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
+ TLInfo(TM), TSInfo(),
+
+ // Subtarget features
+ m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
+ m_hasEIJMPCALL(false), m_hasADDSUBIW(false), m_hasSmallStack(false),
+ m_hasMOVW(false), m_hasLPM(false), m_hasLPMX(false), m_hasELPM(false),
+ m_hasELPMX(false), m_hasSPM(false), m_hasSPMX(false), m_hasDES(false),
+ m_supportsRMW(false), m_supportsMultiplication(false), m_hasBREAK(false),
+ m_hasTinyEncoding(false), ELFArch(0), m_FeatureSetDummy(false) {
+ // Parse features string.
+ ParseSubtargetFeatures(CPU, FS);
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/AVR/AVRSubtarget.h b/lib/Target/AVR/AVRSubtarget.h
new file mode 100644
index 000000000000..a37849c3f3f7
--- /dev/null
+++ b/lib/Target/AVR/AVRSubtarget.h
@@ -0,0 +1,119 @@
+//===-- AVRSubtarget.h - Define Subtarget for the AVR -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_SUBTARGET_H
+#define LLVM_AVR_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// A specific AVR target MCU.
+class AVRSubtarget : public AVRGenSubtargetInfo {
+public:
+ /// Creates an AVR subtarget.
+ /// \param TT The target triple.
+ /// \param CPU The CPU to target.
+ /// \param FS The feature string.
+ /// \param TM The target machine.
+ AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ AVRTargetMachine &TM);
+
+ const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
+ const AVRTargetLowering *getTargetLowering() const override { return &TLInfo; }
+ const AVRSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; }
+ const AVRRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); }
+
+ /// Parses a subtarget feature string, setting appropriate options.
+ /// \note Definition of function is auto generated by `tblgen`.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ // Subtarget feature getters.
+ // See AVR.td for details.
+ bool hasSRAM() const { return m_hasSRAM; }
+ bool hasJMPCALL() const { return m_hasJMPCALL; }
+ bool hasIJMPCALL() const { return m_hasIJMPCALL; }
+ bool hasEIJMPCALL() const { return m_hasEIJMPCALL; }
+ bool hasADDSUBIW() const { return m_hasADDSUBIW; }
+ bool hasSmallStack() const { return m_hasSmallStack; }
+ bool hasMOVW() const { return m_hasMOVW; }
+ bool hasLPM() const { return m_hasLPM; }
+ bool hasLPMX() const { return m_hasLPMX; }
+ bool hasELPM() const { return m_hasELPM; }
+ bool hasELPMX() const { return m_hasELPMX; }
+ bool hasSPM() const { return m_hasSPM; }
+ bool hasSPMX() const { return m_hasSPMX; }
+ bool hasDES() const { return m_hasDES; }
+ bool supportsRMW() const { return m_supportsRMW; }
+ bool supportsMultiplication() const { return m_supportsMultiplication; }
+ bool hasBREAK() const { return m_hasBREAK; }
+ bool hasTinyEncoding() const { return m_hasTinyEncoding; }
+
+ /// Gets the ELF architecture for the e_flags field
+ /// of an ELF object file.
+ unsigned getELFArch() const {
+ assert(ELFArch != 0 &&
+ "every device must have an associated ELF architecture");
+ return ELFArch;
+ }
+
+private:
+ AVRInstrInfo InstrInfo;
+ AVRFrameLowering FrameLowering;
+ AVRTargetLowering TLInfo;
+ AVRSelectionDAGInfo TSInfo;
+
+ // Subtarget feature settings
+ // See AVR.td for details.
+ bool m_hasSRAM;
+ bool m_hasJMPCALL;
+ bool m_hasIJMPCALL;
+ bool m_hasEIJMPCALL;
+ bool m_hasADDSUBIW;
+ bool m_hasSmallStack;
+ bool m_hasMOVW;
+ bool m_hasLPM;
+ bool m_hasLPMX;
+ bool m_hasELPM;
+ bool m_hasELPMX;
+ bool m_hasSPM;
+ bool m_hasSPMX;
+ bool m_hasDES;
+ bool m_supportsRMW;
+ bool m_supportsMultiplication;
+ bool m_hasBREAK;
+ bool m_hasTinyEncoding;
+
+ /// The ELF e_flags architecture.
+ unsigned ELFArch;
+
+ // Dummy member, used by FeatureSets. We cannot have a SubtargetFeature with
+ // no variable, so we instead bind pseudo features to this variable.
+ bool m_FeatureSetDummy;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_SUBTARGET_H
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index a91dce8a63f4..508723e91c60 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -1,4 +1,105 @@
+//===-- AVRTargetMachine.cpp - Define TargetMachine for AVR ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetMachine.h"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVRTargetObjectFile.h"
+#include "AVR.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+/// Processes a CPU name.
+static StringRef getCPU(StringRef CPU) {
+ if (CPU.empty() || CPU == "generic") {
+ return "avr2";
+ }
+
+ return CPU;
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(
+ T, "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8-i64:8:8-f32:8:8-f64:8:8-n8", TT,
+ getCPU(CPU), FS, Options, getEffectiveRelocModel(RM), CM, OL),
+ SubTarget(TT, getCPU(CPU), FS, *this) {
+ this->TLOF = make_unique<AVRTargetObjectFile>();
+ initAsmInfo();
+}
+
+namespace {
+/// AVR Code Generator Pass Configuration Options.
+class AVRPassConfig : public TargetPassConfig {
+public:
+ AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ AVRTargetMachine &getAVRTargetMachine() const {
+ return getTM<AVRTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreRegAlloc() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new AVRPassConfig(this, PM);
+}
extern "C" void LLVMInitializeAVRTarget() {
+ // Register the target.
+ RegisterTargetMachine<AVRTargetMachine> X(TheAVRTarget);
+}
+
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
+ return &SubTarget;
+}
+
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const {
+ return &SubTarget;
+}
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool AVRPassConfig::addInstSelector() {
+ return false;
+}
+
+void AVRPassConfig::addPreRegAlloc() {
}
+
+void AVRPassConfig::addPreSched2() { }
+
+void AVRPassConfig::addPreEmitPass() {
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h
new file mode 100644
index 000000000000..10345193d14a
--- /dev/null
+++ b/lib/Target/AVR/AVRTargetMachine.h
@@ -0,0 +1,51 @@
+//===-- AVRTargetMachine.h - Define TargetMachine for AVR -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_MACHINE_H
+#define LLVM_AVR_TARGET_MACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+#include "AVRSubtarget.h"
+
+namespace llvm {
+
+/// A generic AVR implementation.
+class AVRTargetMachine : public LLVMTargetMachine {
+public:
+ AVRTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+
+ const AVRSubtarget *getSubtargetImpl() const;
+ const AVRSubtarget *getSubtargetImpl(const Function &) const override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return this->TLOF.get();
+ }
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+private:
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ AVRSubtarget SubTarget;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_MACHINE_H
diff --git a/lib/Target/AVR/AVRTargetObjectFile.h b/lib/Target/AVR/AVRTargetObjectFile.h
index bdda35b34993..587612584314 100644
--- a/lib/Target/AVR/AVRTargetObjectFile.h
+++ b/lib/Target/AVR/AVRTargetObjectFile.h
@@ -13,9 +13,8 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
namespace llvm {
-/**
- * Lowering for an AVR ELF32 object file.
- */
+
+/// Lowering for an AVR ELF32 object file.
class AVRTargetObjectFile : public TargetLoweringObjectFileELF {
typedef TargetLoweringObjectFileELF Base;
diff --git a/lib/Target/AVR/CMakeLists.txt b/lib/Target/AVR/CMakeLists.txt
index b4fb0d9ceac2..4b8354d64879 100644
--- a/lib/Target/AVR/CMakeLists.txt
+++ b/lib/Target/AVR/CMakeLists.txt
@@ -1,15 +1,21 @@
set(LLVM_TARGET_DEFINITIONS AVR.td)
tablegen(LLVM AVRGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AVRGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM AVRGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM AVRGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(AVRCommonTableGen)
add_llvm_target(AVRCodeGen
- AVRTargetMachine.cpp
- AVRTargetObjectFile.cpp
- )
+ AVRInstrInfo.cpp
+ AVRRegisterInfo.cpp
+ AVRSubtarget.cpp
+ AVRTargetMachine.cpp
+ AVRTargetObjectFile.cpp
+)
add_dependencies(LLVMAVRCodeGen intrinsics_gen)
+add_subdirectory(MCTargetDesc)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
new file mode 100644
index 000000000000..481de320b22f
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -0,0 +1,66 @@
+#include "AVRELFStreamer.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include "AVRMCTargetDesc.h"
+
+namespace llvm {
+
+static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
+ unsigned EFlags = 0;
+
+ // Set architecture
+ if (Features[AVR::ELFArchAVR1])
+ EFlags |= ELF::EF_AVR_ARCH_AVR1;
+ else if (Features[AVR::ELFArchAVR2])
+ EFlags |= ELF::EF_AVR_ARCH_AVR2;
+ else if (Features[AVR::ELFArchAVR25])
+ EFlags |= ELF::EF_AVR_ARCH_AVR25;
+ else if (Features[AVR::ELFArchAVR3])
+ EFlags |= ELF::EF_AVR_ARCH_AVR3;
+ else if (Features[AVR::ELFArchAVR31])
+ EFlags |= ELF::EF_AVR_ARCH_AVR31;
+ else if (Features[AVR::ELFArchAVR35])
+ EFlags |= ELF::EF_AVR_ARCH_AVR35;
+ else if (Features[AVR::ELFArchAVR4])
+ EFlags |= ELF::EF_AVR_ARCH_AVR4;
+ else if (Features[AVR::ELFArchAVR5])
+ EFlags |= ELF::EF_AVR_ARCH_AVR5;
+ else if (Features[AVR::ELFArchAVR51])
+ EFlags |= ELF::EF_AVR_ARCH_AVR51;
+ else if (Features[AVR::ELFArchAVR6])
+ EFlags |= ELF::EF_AVR_ARCH_AVR6;
+ else if (Features[AVR::ELFArchAVRTiny])
+ EFlags |= ELF::EF_AVR_ARCH_AVRTINY;
+ else if (Features[AVR::ELFArchXMEGA1])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA1;
+ else if (Features[AVR::ELFArchXMEGA2])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA2;
+ else if (Features[AVR::ELFArchXMEGA3])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA3;
+ else if (Features[AVR::ELFArchXMEGA4])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA4;
+ else if (Features[AVR::ELFArchXMEGA5])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA5;
+ else if (Features[AVR::ELFArchXMEGA6])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA6;
+ else if (Features[AVR::ELFArchXMEGA7])
+ EFlags |= ELF::EF_AVR_ARCH_XMEGA7;
+
+ return EFlags;
+}
+
+AVRELFStreamer::AVRELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : AVRTargetStreamer(S) {
+
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned EFlags = MCA.getELFHeaderEFlags();
+
+ EFlags |= getEFlagsForFeatureSet(STI.getFeatureBits());
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
new file mode 100644
index 000000000000..e5df6cc34e40
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
@@ -0,0 +1,29 @@
+//===----- AVRELFStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ELF_STREAMER_H
+#define LLVM_AVR_ELF_STREAMER_H
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+/// A target streamer for an AVR ELF object file.
+class AVRELFStreamer : public AVRTargetStreamer {
+public:
+ AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+ MCELFStreamer &getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
new file mode 100644
index 000000000000..cca3bcc4968a
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -0,0 +1,28 @@
+//===-- AVRMCAsmInfo.cpp - AVR asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the AVRMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+namespace llvm {
+
+AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
+ PointerSize = 2;
+ CalleeSaveStackSlotSize = 2;
+ CommentString = ";";
+ PrivateGlobalPrefix = ".L";
+ UsesELFSectionDirectiveForBSS = true;
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
new file mode 100644
index 000000000000..cc2207a3cfae
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- AVRMCAsmInfo.h - AVR asm properties ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AVRMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ASM_INFO_H
+#define LLVM_AVR_ASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+
+class Triple;
+
+/// Specifies the format of AVR assembly files.
+class AVRMCAsmInfo : public MCAsmInfo {
+public:
+ explicit AVRMCAsmInfo(const Triple &TT);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ASM_INFO_H
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
new file mode 100644
index 000000000000..b72793d0fab4
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -0,0 +1,57 @@
+//===-- AVRMCTargetDesc.h - AVR Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCTARGET_DESC_H
+#define LLVM_AVR_MCTARGET_DESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+extern Target TheAVRTarget;
+
+/// Creates a machine code emitter for AVR.
+MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+/// Creates an assembly backend for AVR.
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TT, StringRef CPU);
+
+/// Creates an ELF object writer for AVR.
+MCObjectWriter *createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+
+} // end namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "AVRGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AVRGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AVRGenSubtargetInfo.inc"
+
+#endif // LLVM_AVR_MCTARGET_DESC_H
diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
new file mode 100644
index 000000000000..a2d8c16eeb8c
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -0,0 +1,24 @@
+//===-- AVRTargetStreamer.cpp - AVR Target Streamer Methods ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+AVRTargetStreamer::AVRTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+AVRTargetAsmStreamer::AVRTargetAsmStreamer(MCStreamer &S)
+ : AVRTargetStreamer(S) {}
+
+} // end namespace llvm
+
diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
new file mode 100644
index 000000000000..99a536699ae9
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
@@ -0,0 +1,32 @@
+//===-- AVRTargetStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_STREAMER_H
+#define LLVM_AVR_TARGET_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCStreamer;
+
+/// A generic AVR target output stream.
+class AVRTargetStreamer : public MCTargetStreamer {
+public:
+ explicit AVRTargetStreamer(MCStreamer &S);
+};
+
+/// A target streamer for textual AVR assembly code.
+class AVRTargetAsmStreamer : public AVRTargetStreamer {
+public:
+ explicit AVRTargetAsmStreamer(MCStreamer &S);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_STREAMER_H
diff --git a/lib/Target/AVR/MCTargetDesc/CMakeLists.txt b/lib/Target/AVR/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 000000000000..3cceb49acb30
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_library(LLVMAVRDesc
+ AVRELFStreamer.cpp
+ AVRMCAsmInfo.cpp
+ AVRTargetStreamer.cpp
+)
+
+add_dependencies(LLVMAVRDesc AVRCommonTableGen)
+
diff --git a/lib/Target/AVR/MCTargetDesc/LLVMBuild.txt b/lib/Target/AVR/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 000000000000..8786f563cdc8
--- /dev/null
+++ b/lib/Target/AVR/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AVR/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AVRDesc
+parent = AVR
+required_libraries = MC AVRInfo Support
+add_to_library_groups = AVR
diff --git a/lib/Target/AVR/Makefile b/lib/Target/AVR/Makefile
deleted file mode 100644
index c91b6f5c0ae9..000000000000
--- a/lib/Target/AVR/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-##===- lib/Target/AVR/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMAVRCodeGen
-TARGET = AVR
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AVRGenRegisterInfo.inc
-
-DIRS = TargetInfo
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AVR/TODO.md b/lib/Target/AVR/TODO.md
new file mode 100644
index 000000000000..3a333355646d
--- /dev/null
+++ b/lib/Target/AVR/TODO.md
@@ -0,0 +1,7 @@
+# Write an XFAIL test for this `FIXME` in `AVRInstrInfo.td`
+
+```
+// :FIXME: DAGCombiner produces an shl node after legalization from these seq:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+```
+
diff --git a/lib/Target/AVR/TargetInfo/CMakeLists.txt b/lib/Target/AVR/TargetInfo/CMakeLists.txt
index f27090037702..557c55ae2f72 100644
--- a/lib/Target/AVR/TargetInfo/CMakeLists.txt
+++ b/lib/Target/AVR/TargetInfo/CMakeLists.txt
@@ -5,3 +5,5 @@ add_llvm_library(LLVMAVRInfo
AVRTargetInfo.cpp
)
+add_dependencies(LLVMAVRInfo AVRCommonTableGen)
+
diff --git a/lib/Target/AVR/TargetInfo/Makefile b/lib/Target/AVR/TargetInfo/Makefile
deleted file mode 100644
index 92b483dd028b..000000000000
--- a/lib/Target/AVR/TargetInfo/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AVR/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAVRInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
index 10ec6587550b..1078b0652189 100644
--- a/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -40,38 +40,10 @@ public:
const char *getPassName() const override { return "BPF Assembly Printer"; }
- void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
- const char *Modifier = nullptr);
void EmitInstruction(const MachineInstr *MI) override;
};
}
-void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
- raw_ostream &O, const char *Modifier) {
- const MachineOperand &MO = MI->getOperand(OpNum);
-
- switch (MO.getType()) {
- case MachineOperand::MO_Register:
- O << BPFInstPrinter::getRegisterName(MO.getReg());
- break;
-
- case MachineOperand::MO_Immediate:
- O << MO.getImm();
- break;
-
- case MachineOperand::MO_MachineBasicBlock:
- O << *MO.getMBB()->getSymbol();
- break;
-
- case MachineOperand::MO_GlobalAddress:
- O << *getSymbol(MO.getGlobal());
- break;
-
- default:
- llvm_unreachable("<unknown operand type>");
- }
-}
-
void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
BPFMCInstLower MCInstLowering(OutContext, *this);
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index 251cda965ff5..5db963f518b1 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -31,10 +31,10 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override {
- MBB.erase(MI);
+ return MBB.erase(MI);
}
};
}
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 9d5f1d406d0e..ac2af036b6f8 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -46,7 +46,7 @@ private:
// Include the pieces autogenerated from the target description.
#include "BPFGenDAGISel.inc"
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
// Complex Pattern for address selection.
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -115,7 +115,7 @@ bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset)
return false;
}
-SDNode *BPFDAGToDAGISel::Select(SDNode *Node) {
+void BPFDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
// Dump information about the Node being selected
@@ -124,12 +124,24 @@ SDNode *BPFDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
- return NULL;
+ return;
}
// tablegen selection should be handled here.
switch (Opcode) {
default: break;
+ case ISD::SDIV: {
+ DebugLoc Empty;
+ const DebugLoc &DL = Node->getDebugLoc();
+ if (DL != Empty)
+ errs() << "Error at line " << DL.getLine() << ": ";
+ else
+ errs() << "Error: ";
+ errs() << "Unsupport signed division for DAG: ";
+ Node->dump(CurDAG);
+ errs() << "Please convert to unsigned div/mod.\n";
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -156,22 +168,17 @@ SDNode *BPFDAGToDAGISel::Select(SDNode *Node) {
EVT VT = Node->getValueType(0);
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
unsigned Opc = BPF::MOV_rr;
- if (Node->hasOneUse())
- return CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
- return CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI);
+ if (Node->hasOneUse()) {
+ CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
+ return;
+ }
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI));
+ return;
}
}
// Select the default instruction
- SDNode *ResNode = SelectCode(Node);
-
- DEBUG(dbgs() << "=> ";
- if (ResNode == nullptr || ResNode == Node)
- Node->dump(CurDAG);
- else
- ResNode->dump(CurDAG);
- dbgs() << '\n');
- return ResNode;
+ SelectCode(Node);
}
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 6a5b37e153d8..cca3492a1992 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -14,8 +14,8 @@
#include "BPFISelLowering.h"
#include "BPF.h"
-#include "BPFTargetMachine.h"
#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -24,68 +24,31 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
using namespace llvm;
#define DEBUG_TYPE "bpf-lower"
-namespace {
-
-// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
- // Debug location where this diagnostic is triggered.
- DebugLoc DLoc;
- const Twine &Description;
- const Function &Fn;
- SDValue Value;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc,
- SDValue Value)
- : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()),
- Description(Desc), Fn(Fn), Value(Value) {}
-
- void print(DiagnosticPrinter &DP) const override {
- std::string Str;
- raw_string_ostream OS(Str);
-
- if (DLoc) {
- auto DIL = DLoc.get();
- StringRef Filename = DIL->getFilename();
- unsigned Line = DIL->getLine();
- unsigned Column = DIL->getColumn();
- OS << Filename << ':' << Line << ':' << Column << ' ';
- }
-
- OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n'
- << Description;
- if (Value)
- Value->print(OS);
- OS << '\n';
- OS.flush();
- DP << Str;
- }
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, DL.getDebugLoc()));
+}
-int DiagnosticInfoUnsupported::KindID = 0;
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg,
+ SDValue Val) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << Msg;
+ Val->print(OS);
+ OS.flush();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Str, DL.getDebugLoc()));
}
BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
@@ -187,8 +150,8 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SDValue BPFTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
default:
llvm_unreachable("Unsupported calling convention");
@@ -212,7 +175,7 @@ SDValue BPFTargetLowering::LowerFormalArguments(
switch (RegVT.getSimpleVT().SimpleTy) {
default: {
errs() << "LowerFormalArguments Unhandled argument type: "
- << RegVT.getSimpleVT().SimpleTy << '\n';
+ << RegVT.getEVTString() << '\n';
llvm_unreachable(0);
}
case MVT::i64:
@@ -236,22 +199,20 @@ SDValue BPFTargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
}
} else {
- DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
- "defined with too many args", SDValue());
- DAG.getContext()->diagnose(Err);
+ fail(DL, DAG, "defined with too many args");
+ InVals.push_back(DAG.getConstant(0, DL, VA.getLocVT()));
}
}
if (IsVarArg || MF.getFunction()->hasStructRetAttr()) {
- DiagnosticInfoUnsupported Err(
- DL, *MF.getFunction(),
- "functions with VarArgs or StructRet are not supported", SDValue());
- DAG.getContext()->diagnose(Err);
+ fail(DL, DAG, "functions with VarArgs or StructRet are not supported");
}
return Chain;
}
+const unsigned BPFTargetLowering::MaxArgs = 5;
+
SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
@@ -284,30 +245,27 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned NumBytes = CCInfo.getNextStackOffset();
- if (Outs.size() >= 6) {
- DiagnosticInfoUnsupported Err(CLI.DL, *MF.getFunction(),
- "too many args to ", Callee);
- DAG.getContext()->diagnose(Err);
- }
+ if (Outs.size() > MaxArgs)
+ fail(CLI.DL, DAG, "too many args to ", Callee);
for (auto &Arg : Outs) {
ISD::ArgFlagsTy Flags = Arg.Flags;
if (!Flags.isByVal())
continue;
- DiagnosticInfoUnsupported Err(CLI.DL, *MF.getFunction(),
- "pass by value not supported ", Callee);
- DAG.getContext()->diagnose(Err);
+ fail(CLI.DL, DAG, "pass by value not supported ", Callee);
}
auto PtrVT = getPointerTy(MF.getDataLayout());
Chain = DAG.getCALLSEQ_START(
Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
- SmallVector<std::pair<unsigned, SDValue>, 5> RegsToPass;
+ SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
// Walk arg assignments
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ for (unsigned i = 0,
+ e = std::min(static_cast<unsigned>(ArgLocs.size()), MaxArgs);
+ i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -388,7 +346,8 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ unsigned Opc = BPFISD::RET_FLAG;
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -398,9 +357,8 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
if (MF.getFunction()->getReturnType()->isAggregateType()) {
- DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
- "only integer returns supported", SDValue());
- DAG.getContext()->diagnose(Err);
+ fail(DL, DAG, "only integer returns supported");
+ return DAG.getNode(Opc, DL, MVT::Other, Chain);
}
// Analyze return values.
@@ -422,7 +380,6 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- unsigned Opc = BPFISD::RET_FLAG;
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
@@ -434,8 +391,8 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue BPFTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
// Assign locations to each value returned by this call.
@@ -443,9 +400,10 @@ SDValue BPFTargetLowering::LowerCallResult(
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
if (Ins.size() >= 2) {
- DiagnosticInfoUnsupported Err(DL, *MF.getFunction(),
- "only small returns supported", SDValue());
- DAG.getContext()->diagnose(Err);
+ fail(DL, DAG, "only small returns supported");
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+ InVals.push_back(DAG.getConstant(0, DL, Ins[i].VT));
+ return DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag).getValue(1);
}
CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
@@ -535,12 +493,12 @@ SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
}
MachineBasicBlock *
-BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- assert(MI->getOpcode() == BPF::Select && "Unexpected instr type to insert");
+ assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert");
// To "insert" a SELECT instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -571,9 +529,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(Copy1MBB);
// Insert Branch if Flag
- unsigned LHS = MI->getOperand(1).getReg();
- unsigned RHS = MI->getOperand(2).getReg();
- int CC = MI->getOperand(3).getImm();
+ unsigned LHS = MI.getOperand(1).getReg();
+ unsigned RHS = MI.getOperand(2).getReg();
+ int CC = MI.getOperand(3).getImm();
switch (CC) {
case ISD::SETGT:
BuildMI(BB, DL, TII.get(BPF::JSGT_rr))
@@ -627,12 +585,12 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, Copy0MBB ], [ %TrueValue, ThisMBB ]
// ...
BB = Copy1MBB;
- BuildMI(*BB, BB->begin(), DL, TII.get(BPF::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(5).getReg())
+ BuildMI(*BB, BB->begin(), DL, TII.get(BPF::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(5).getReg())
.addMBB(Copy0MBB)
- .addReg(MI->getOperand(4).getReg())
+ .addReg(MI.getOperand(4).getReg())
.addMBB(ThisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index ec71dca2faeb..3d1726be286e 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -43,7 +43,7 @@ public:
const char *getTargetNodeName(unsigned Opcode) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
private:
@@ -54,10 +54,13 @@ private:
// Lower the result values of a call, copying them out of physregs into vregs
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
- SelectionDAG &DAG,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+ // Maximum number of arguments to a call
+ static const unsigned MaxArgs;
+
// Lower a call into CALLSEQ_START - BPFISD:CALL - CALLSEQ_END chain
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -66,12 +69,12 @@ private:
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index 83d14efc1a6c..7aea0512ac78 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -32,9 +32,9 @@ BPFInstrInfo::BPFInstrInfo()
: BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
if (BPF::GPRRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -75,7 +75,7 @@ void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
llvm_unreachable("Can't load this register from stack slot");
}
-bool BPFInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -90,7 +90,7 @@ bool BPFInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
// Working from the bottom, when we see a non-terminator
// instruction, we're done.
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
break;
// A terminator that isn't a branch can't easily be handled
@@ -134,7 +134,7 @@ unsigned BPFInstrInfo::InsertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index ac60188804d2..cc2e41e4c603 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -31,7 +31,7 @@ public:
const BPFRegisterInfo &getRegisterInfo() const { return RI; }
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -44,7 +44,7 @@ public:
MachineBasicBlock::iterator MBBI, unsigned DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
@@ -52,7 +52,7 @@ public:
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
};
}
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
index 00bd8d9c090c..f64defecf3cc 100644
--- a/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -20,9 +20,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
MCSymbol *
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
index 5ad58db75395..27cc9a262fc3 100644
--- a/lib/Target/BPF/BPFSubtarget.h
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -17,7 +17,7 @@
#include "BPFFrameLowering.h"
#include "BPFISelLowering.h"
#include "BPFInstrInfo.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -33,7 +33,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
BPFInstrInfo InstrInfo;
BPFFrameLowering FrameLowering;
BPFTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
public:
// This constructor initializes the data members to match that
@@ -52,7 +52,7 @@ public:
const BPFTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 06cba2252a25..5fc6f2f0ce55 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -36,13 +37,19 @@ static std::string computeDataLayout(const Triple &TT) {
return "e-m:e-p:64:64-i64:64-n32:64-S128";
}
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::PIC_;
+ return *RM;
+}
+
BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
- OL),
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
diff --git a/lib/Target/BPF/BPFTargetMachine.h b/lib/Target/BPF/BPFTargetMachine.h
index a0086df2d32c..644481446883 100644
--- a/lib/Target/BPF/BPFTargetMachine.h
+++ b/lib/Target/BPF/BPFTargetMachine.h
@@ -24,8 +24,9 @@ class BPFTargetMachine : public LLVMTargetMachine {
public:
BPFTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
const BPFSubtarget *getSubtargetImpl() const { return &Subtarget; }
const BPFSubtarget *getSubtargetImpl(const Function &) const override {
diff --git a/lib/Target/BPF/InstPrinter/Makefile b/lib/Target/BPF/InstPrinter/Makefile
deleted file mode 100644
index f46af83346d8..000000000000
--- a/lib/Target/BPF/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/BPF/InstPrinter/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMBPFAsmPrinter
-
-# Hack: we need to include 'main' BPF target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 8c358cab62e8..2de40aab3a74 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -47,7 +47,8 @@ public:
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 87cdd5eb9dad..4b92e3eb019b 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -22,18 +22,18 @@ public:
~BPFELFObjectWriter() override;
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
}
BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
- : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_NONE,
+ : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF,
/*HasRelocationAddend*/ false) {}
BPFELFObjectWriter::~BPFELFObjectWriter() {}
-unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
// determine the type of the relocation
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 1f440fe87871..9a2e223bcbd6 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -14,13 +14,11 @@
#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
-#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
class Target;
-class Triple;
class BPFMCAsmInfo : public MCAsmInfo {
public:
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index dc4ede30f191..c6561ddda26e 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 840570ebc392..03d6b193fe27 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "BPFMCTargetDesc.h"
#include "BPFMCAsmInfo.h"
#include "InstPrinter/BPFInstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -51,14 +50,6 @@ static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
return createBPFMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createBPFMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCStreamer *createBPFMCStreamer(const Triple &T,
MCContext &Ctx, MCAsmBackend &MAB,
raw_pwrite_stream &OS, MCCodeEmitter *Emitter,
@@ -81,9 +72,6 @@ extern "C" void LLVMInitializeBPFTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfo<BPFMCAsmInfo> X(*T);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createBPFMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createBPFMCInstrInfo);
diff --git a/lib/Target/BPF/MCTargetDesc/Makefile b/lib/Target/BPF/MCTargetDesc/Makefile
deleted file mode 100644
index af70cd059e5d..000000000000
--- a/lib/Target/BPF/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/BPF/MCTargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMBPFDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/Makefile b/lib/Target/BPF/Makefile
deleted file mode 100644
index 7492f5edd517..000000000000
--- a/lib/Target/BPF/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-##===- lib/Target/BPF/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMBPFCodeGen
-TARGET = BPF
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = BPFGenRegisterInfo.inc BPFGenInstrInfo.inc \
- BPFGenAsmWriter.inc BPFGenAsmMatcher.inc BPFGenDAGISel.inc \
- BPFGenMCCodeEmitter.inc BPFGenSubtargetInfo.inc BPFGenCallingConv.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/TargetInfo/Makefile b/lib/Target/BPF/TargetInfo/Makefile
deleted file mode 100644
index 02af58ea1876..000000000000
--- a/lib/Target/BPF/TargetInfo/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/BPF/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMBPFInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CppBackend/CMakeLists.txt b/lib/Target/CppBackend/CMakeLists.txt
deleted file mode 100644
index 515e1dd7e39f..000000000000
--- a/lib/Target/CppBackend/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_llvm_target(CppBackendCodeGen
- CPPBackend.cpp
- )
-
-add_subdirectory(TargetInfo)
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
deleted file mode 100644
index 5ea6551ebc9c..000000000000
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ /dev/null
@@ -1,2143 +0,0 @@
-//===-- CPPBackend.cpp - Library for converting LLVM code to C++ code -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the writing of the LLVM IR as a set of C++ calls to the
-// LLVM IR interface. The input module is assumed to be verified.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CPPTargetMachine.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Config/config.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
-#include <algorithm>
-#include <cctype>
-#include <cstdio>
-#include <map>
-#include <set>
-using namespace llvm;
-
-static cl::opt<std::string>
-FuncName("cppfname", cl::desc("Specify the name of the generated function"),
- cl::value_desc("function name"));
-
-enum WhatToGenerate {
- GenProgram,
- GenModule,
- GenContents,
- GenFunction,
- GenFunctions,
- GenInline,
- GenVariable,
- GenType
-};
-
-static cl::opt<WhatToGenerate> GenerationType("cppgen", cl::Optional,
- cl::desc("Choose what kind of output to generate"),
- cl::init(GenProgram),
- cl::values(
- clEnumValN(GenProgram, "program", "Generate a complete program"),
- clEnumValN(GenModule, "module", "Generate a module definition"),
- clEnumValN(GenContents, "contents", "Generate contents of a module"),
- clEnumValN(GenFunction, "function", "Generate a function definition"),
- clEnumValN(GenFunctions,"functions", "Generate all function definitions"),
- clEnumValN(GenInline, "inline", "Generate an inline function"),
- clEnumValN(GenVariable, "variable", "Generate a variable definition"),
- clEnumValN(GenType, "type", "Generate a type definition"),
- clEnumValEnd
- )
-);
-
-static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
- cl::desc("Specify the name of the thing to generate"),
- cl::init("!bad!"));
-
-extern "C" void LLVMInitializeCppBackendTarget() {
- // Register the target.
- RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
-}
-
-namespace {
- typedef std::vector<Type*> TypeList;
- typedef std::map<Type*,std::string> TypeMap;
- typedef std::map<const Value*,std::string> ValueMap;
- typedef std::set<std::string> NameSet;
- typedef std::set<Type*> TypeSet;
- typedef std::set<const Value*> ValueSet;
- typedef std::map<const Value*,std::string> ForwardRefMap;
-
- /// CppWriter - This class is the main chunk of code that converts an LLVM
- /// module to a C++ translation unit.
- class CppWriter : public ModulePass {
- std::unique_ptr<formatted_raw_ostream> OutOwner;
- formatted_raw_ostream &Out;
- const Module *TheModule;
- uint64_t uniqueNum;
- TypeMap TypeNames;
- ValueMap ValueNames;
- NameSet UsedNames;
- TypeSet DefinedTypes;
- ValueSet DefinedValues;
- ForwardRefMap ForwardRefs;
- bool is_inline;
- unsigned indent_level;
-
- public:
- static char ID;
- explicit CppWriter(std::unique_ptr<formatted_raw_ostream> o)
- : ModulePass(ID), OutOwner(std::move(o)), Out(*OutOwner), uniqueNum(0),
- is_inline(false), indent_level(0) {}
-
- const char *getPassName() const override { return "C++ backend"; }
-
- bool runOnModule(Module &M) override;
-
- void printProgram(const std::string& fname, const std::string& modName );
- void printModule(const std::string& fname, const std::string& modName );
- void printContents(const std::string& fname, const std::string& modName );
- void printFunction(const std::string& fname, const std::string& funcName );
- void printFunctions();
- void printInline(const std::string& fname, const std::string& funcName );
- void printVariable(const std::string& fname, const std::string& varName );
- void printType(const std::string& fname, const std::string& typeName );
-
- void error(const std::string& msg);
-
-
- formatted_raw_ostream& nl(formatted_raw_ostream &Out, int delta = 0);
- inline void in() { indent_level++; }
- inline void out() { if (indent_level >0) indent_level--; }
-
- private:
- void printLinkageType(GlobalValue::LinkageTypes LT);
- void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
- void printDLLStorageClassType(GlobalValue::DLLStorageClassTypes DSCType);
- void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM);
- void printCallingConv(CallingConv::ID cc);
- void printEscapedString(const std::string& str);
- void printCFP(const ConstantFP* CFP);
-
- std::string getCppName(Type* val);
- inline void printCppName(Type* val);
-
- std::string getCppName(const Value* val);
- inline void printCppName(const Value* val);
-
- void printAttributes(const AttributeSet &PAL, const std::string &name);
- void printType(Type* Ty);
- void printTypes(const Module* M);
-
- void printConstant(const Constant *CPV);
- void printConstants(const Module* M);
-
- void printVariableUses(const GlobalVariable *GV);
- void printVariableHead(const GlobalVariable *GV);
- void printVariableBody(const GlobalVariable *GV);
-
- void printFunctionUses(const Function *F);
- void printFunctionHead(const Function *F);
- void printFunctionBody(const Function *F);
- void printInstruction(const Instruction *I, const std::string& bbname);
- std::string getOpName(const Value*);
-
- void printModuleBody();
- };
-} // end anonymous namespace.
-
-formatted_raw_ostream &CppWriter::nl(formatted_raw_ostream &Out, int delta) {
- Out << '\n';
- if (delta >= 0 || indent_level >= unsigned(-delta))
- indent_level += delta;
- Out.indent(indent_level);
- return Out;
-}
-
-static inline void sanitize(std::string &str) {
- for (size_t i = 0; i < str.length(); ++i)
- if (!isalnum(str[i]) && str[i] != '_')
- str[i] = '_';
-}
-
-static std::string getTypePrefix(Type *Ty) {
- switch (Ty->getTypeID()) {
- case Type::VoidTyID: return "void_";
- case Type::IntegerTyID:
- return "int" + utostr(cast<IntegerType>(Ty)->getBitWidth()) + "_";
- case Type::FloatTyID: return "float_";
- case Type::DoubleTyID: return "double_";
- case Type::LabelTyID: return "label_";
- case Type::FunctionTyID: return "func_";
- case Type::StructTyID: return "struct_";
- case Type::ArrayTyID: return "array_";
- case Type::PointerTyID: return "ptr_";
- case Type::VectorTyID: return "packed_";
- default: return "other_";
- }
-}
-
-void CppWriter::error(const std::string& msg) {
- report_fatal_error(msg);
-}
-
-static inline std::string ftostr(const APFloat& V) {
- std::string Buf;
- if (&V.getSemantics() == &APFloat::IEEEdouble) {
- raw_string_ostream(Buf) << V.convertToDouble();
- return Buf;
- } else if (&V.getSemantics() == &APFloat::IEEEsingle) {
- raw_string_ostream(Buf) << (double)V.convertToFloat();
- return Buf;
- }
- return "<unknown format in ftostr>"; // error
-}
-
-// printCFP - Print a floating point constant .. very carefully :)
-// This makes sure that conversion to/from floating yields the same binary
-// result so that we don't lose precision.
-void CppWriter::printCFP(const ConstantFP *CFP) {
- bool ignored;
- APFloat APF = APFloat(CFP->getValueAPF()); // copy
- if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
- APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
- Out << "ConstantFP::get(mod->getContext(), ";
- Out << "APFloat(";
-#if HAVE_PRINTF_A
- char Buffer[100];
- sprintf(Buffer, "%A", APF.convertToDouble());
- if ((!strncmp(Buffer, "0x", 2) ||
- !strncmp(Buffer, "-0x", 3) ||
- !strncmp(Buffer, "+0x", 3)) &&
- APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
- if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
- Out << "BitsToDouble(" << Buffer << ")";
- else
- Out << "BitsToFloat((float)" << Buffer << ")";
- Out << ")";
- } else {
-#endif
- std::string StrVal = ftostr(CFP->getValueAPF());
-
- while (StrVal[0] == ' ')
- StrVal.erase(StrVal.begin());
-
- // Check to make sure that the stringized number is not some string like
- // "Inf" or NaN. Check that the string matches the "[-+]?[0-9]" regex.
- if (((StrVal[0] >= '0' && StrVal[0] <= '9') ||
- ((StrVal[0] == '-' || StrVal[0] == '+') &&
- (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
- (CFP->isExactlyValue(atof(StrVal.c_str())))) {
- if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
- Out << StrVal;
- else
- Out << StrVal << "f";
- } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
- Out << "BitsToDouble(0x"
- << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
- << "ULL) /* " << StrVal << " */";
- else
- Out << "BitsToFloat(0x"
- << utohexstr((uint32_t)CFP->getValueAPF().
- bitcastToAPInt().getZExtValue())
- << "U) /* " << StrVal << " */";
- Out << ")";
-#if HAVE_PRINTF_A
- }
-#endif
- Out << ")";
-}
-
-void CppWriter::printCallingConv(CallingConv::ID cc){
- // Print the calling convention.
- switch (cc) {
- case CallingConv::C: Out << "CallingConv::C"; break;
- case CallingConv::Fast: Out << "CallingConv::Fast"; break;
- case CallingConv::Cold: Out << "CallingConv::Cold"; break;
- case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break;
- default: Out << cc; break;
- }
-}
-
-void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
- switch (LT) {
- case GlobalValue::InternalLinkage:
- Out << "GlobalValue::InternalLinkage"; break;
- case GlobalValue::PrivateLinkage:
- Out << "GlobalValue::PrivateLinkage"; break;
- case GlobalValue::AvailableExternallyLinkage:
- Out << "GlobalValue::AvailableExternallyLinkage "; break;
- case GlobalValue::LinkOnceAnyLinkage:
- Out << "GlobalValue::LinkOnceAnyLinkage "; break;
- case GlobalValue::LinkOnceODRLinkage:
- Out << "GlobalValue::LinkOnceODRLinkage "; break;
- case GlobalValue::WeakAnyLinkage:
- Out << "GlobalValue::WeakAnyLinkage"; break;
- case GlobalValue::WeakODRLinkage:
- Out << "GlobalValue::WeakODRLinkage"; break;
- case GlobalValue::AppendingLinkage:
- Out << "GlobalValue::AppendingLinkage"; break;
- case GlobalValue::ExternalLinkage:
- Out << "GlobalValue::ExternalLinkage"; break;
- case GlobalValue::ExternalWeakLinkage:
- Out << "GlobalValue::ExternalWeakLinkage"; break;
- case GlobalValue::CommonLinkage:
- Out << "GlobalValue::CommonLinkage"; break;
- }
-}
-
-void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
- switch (VisType) {
- case GlobalValue::DefaultVisibility:
- Out << "GlobalValue::DefaultVisibility";
- break;
- case GlobalValue::HiddenVisibility:
- Out << "GlobalValue::HiddenVisibility";
- break;
- case GlobalValue::ProtectedVisibility:
- Out << "GlobalValue::ProtectedVisibility";
- break;
- }
-}
-
-void CppWriter::printDLLStorageClassType(
- GlobalValue::DLLStorageClassTypes DSCType) {
- switch (DSCType) {
- case GlobalValue::DefaultStorageClass:
- Out << "GlobalValue::DefaultStorageClass";
- break;
- case GlobalValue::DLLImportStorageClass:
- Out << "GlobalValue::DLLImportStorageClass";
- break;
- case GlobalValue::DLLExportStorageClass:
- Out << "GlobalValue::DLLExportStorageClass";
- break;
- }
-}
-
-void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) {
- switch (TLM) {
- case GlobalVariable::NotThreadLocal:
- Out << "GlobalVariable::NotThreadLocal";
- break;
- case GlobalVariable::GeneralDynamicTLSModel:
- Out << "GlobalVariable::GeneralDynamicTLSModel";
- break;
- case GlobalVariable::LocalDynamicTLSModel:
- Out << "GlobalVariable::LocalDynamicTLSModel";
- break;
- case GlobalVariable::InitialExecTLSModel:
- Out << "GlobalVariable::InitialExecTLSModel";
- break;
- case GlobalVariable::LocalExecTLSModel:
- Out << "GlobalVariable::LocalExecTLSModel";
- break;
- }
-}
-
-// printEscapedString - Print each character of the specified string, escaping
-// it if it is not printable or if it is an escape char.
-void CppWriter::printEscapedString(const std::string &Str) {
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- unsigned char C = Str[i];
- if (isprint(C) && C != '"' && C != '\\') {
- Out << C;
- } else {
- Out << "\\x"
- << (char) ((C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A'))
- << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
- }
- }
-}
-
-std::string CppWriter::getCppName(Type* Ty) {
- switch (Ty->getTypeID()) {
- default:
- break;
- case Type::VoidTyID:
- return "Type::getVoidTy(mod->getContext())";
- case Type::IntegerTyID: {
- unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
- return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")";
- }
- case Type::X86_FP80TyID:
- return "Type::getX86_FP80Ty(mod->getContext())";
- case Type::FloatTyID:
- return "Type::getFloatTy(mod->getContext())";
- case Type::DoubleTyID:
- return "Type::getDoubleTy(mod->getContext())";
- case Type::LabelTyID:
- return "Type::getLabelTy(mod->getContext())";
- case Type::X86_MMXTyID:
- return "Type::getX86_MMXTy(mod->getContext())";
- }
-
- // Now, see if we've seen the type before and return that
- TypeMap::iterator I = TypeNames.find(Ty);
- if (I != TypeNames.end())
- return I->second;
-
- // Okay, let's build a new name for this type. Start with a prefix
- const char* prefix = nullptr;
- switch (Ty->getTypeID()) {
- case Type::FunctionTyID: prefix = "FuncTy_"; break;
- case Type::StructTyID: prefix = "StructTy_"; break;
- case Type::ArrayTyID: prefix = "ArrayTy_"; break;
- case Type::PointerTyID: prefix = "PointerTy_"; break;
- case Type::VectorTyID: prefix = "VectorTy_"; break;
- default: prefix = "OtherTy_"; break; // prevent breakage
- }
-
- // See if the type has a name in the symboltable and build accordingly
- std::string name;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- if (STy->hasName())
- name = STy->getName();
-
- if (name.empty())
- name = utostr(uniqueNum++);
-
- name = std::string(prefix) + name;
- sanitize(name);
-
- // Save the name
- return TypeNames[Ty] = name;
-}
-
-void CppWriter::printCppName(Type* Ty) {
- printEscapedString(getCppName(Ty));
-}
-
-std::string CppWriter::getCppName(const Value* val) {
- std::string name;
- ValueMap::iterator I = ValueNames.find(val);
- if (I != ValueNames.end() && I->first == val)
- return I->second;
-
- if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) {
- name = std::string("gvar_") +
- getTypePrefix(GV->getType()->getElementType());
- } else if (isa<Function>(val)) {
- name = std::string("func_");
- } else if (const Constant* C = dyn_cast<Constant>(val)) {
- name = std::string("const_") + getTypePrefix(C->getType());
- } else if (const Argument* Arg = dyn_cast<Argument>(val)) {
- if (is_inline) {
- unsigned argNum = std::distance(Arg->getParent()->arg_begin(),
- Function::const_arg_iterator(Arg)) + 1;
- name = std::string("arg_") + utostr(argNum);
- NameSet::iterator NI = UsedNames.find(name);
- if (NI != UsedNames.end())
- name += std::string("_") + utostr(uniqueNum++);
- UsedNames.insert(name);
- return ValueNames[val] = name;
- } else {
- name = getTypePrefix(val->getType());
- }
- } else {
- name = getTypePrefix(val->getType());
- }
- if (val->hasName())
- name += val->getName();
- else
- name += utostr(uniqueNum++);
- sanitize(name);
- NameSet::iterator NI = UsedNames.find(name);
- if (NI != UsedNames.end())
- name += std::string("_") + utostr(uniqueNum++);
- UsedNames.insert(name);
- return ValueNames[val] = name;
-}
-
-void CppWriter::printCppName(const Value* val) {
- printEscapedString(getCppName(val));
-}
-
-void CppWriter::printAttributes(const AttributeSet &PAL,
- const std::string &name) {
- Out << "AttributeSet " << name << "_PAL;";
- nl(Out);
- if (!PAL.isEmpty()) {
- Out << '{'; in(); nl(Out);
- Out << "SmallVector<AttributeSet, 4> Attrs;"; nl(Out);
- Out << "AttributeSet PAS;"; in(); nl(Out);
- for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
- unsigned index = PAL.getSlotIndex(i);
- AttrBuilder attrs(PAL.getSlotAttributes(i), index);
- Out << "{"; in(); nl(Out);
- Out << "AttrBuilder B;"; nl(Out);
-
-#define HANDLE_ATTR(X) \
- if (attrs.contains(Attribute::X)) { \
- Out << "B.addAttribute(Attribute::" #X ");"; nl(Out); \
- attrs.removeAttribute(Attribute::X); \
- }
-
- HANDLE_ATTR(SExt);
- HANDLE_ATTR(ZExt);
- HANDLE_ATTR(NoReturn);
- HANDLE_ATTR(InReg);
- HANDLE_ATTR(StructRet);
- HANDLE_ATTR(NoUnwind);
- HANDLE_ATTR(NoAlias);
- HANDLE_ATTR(ByVal);
- HANDLE_ATTR(InAlloca);
- HANDLE_ATTR(Nest);
- HANDLE_ATTR(ReadNone);
- HANDLE_ATTR(ReadOnly);
- HANDLE_ATTR(NoInline);
- HANDLE_ATTR(AlwaysInline);
- HANDLE_ATTR(OptimizeNone);
- HANDLE_ATTR(OptimizeForSize);
- HANDLE_ATTR(StackProtect);
- HANDLE_ATTR(StackProtectReq);
- HANDLE_ATTR(StackProtectStrong);
- HANDLE_ATTR(SafeStack);
- HANDLE_ATTR(NoCapture);
- HANDLE_ATTR(NoRedZone);
- HANDLE_ATTR(NoImplicitFloat);
- HANDLE_ATTR(Naked);
- HANDLE_ATTR(InlineHint);
- HANDLE_ATTR(ReturnsTwice);
- HANDLE_ATTR(UWTable);
- HANDLE_ATTR(NonLazyBind);
- HANDLE_ATTR(MinSize);
-#undef HANDLE_ATTR
-
- if (attrs.contains(Attribute::StackAlignment)) {
- Out << "B.addStackAlignmentAttr(" << attrs.getStackAlignment()<<')';
- nl(Out);
- attrs.removeAttribute(Attribute::StackAlignment);
- }
-
- Out << "PAS = AttributeSet::get(mod->getContext(), ";
- if (index == ~0U)
- Out << "~0U,";
- else
- Out << index << "U,";
- Out << " B);"; out(); nl(Out);
- Out << "}"; out(); nl(Out);
- nl(Out);
- Out << "Attrs.push_back(PAS);"; nl(Out);
- }
- Out << name << "_PAL = AttributeSet::get(mod->getContext(), Attrs);";
- nl(Out);
- out(); nl(Out);
- Out << '}'; nl(Out);
- }
-}
-
-void CppWriter::printType(Type* Ty) {
- // We don't print definitions for primitive types
- if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() ||
- Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy() ||
- Ty->isTokenTy())
- return;
-
- // If we already defined this type, we don't need to define it again.
- if (DefinedTypes.find(Ty) != DefinedTypes.end())
- return;
-
- // Everything below needs the name for the type so get it now.
- std::string typeName(getCppName(Ty));
-
- // Print the type definition
- switch (Ty->getTypeID()) {
- case Type::FunctionTyID: {
- FunctionType* FT = cast<FunctionType>(Ty);
- Out << "std::vector<Type*>" << typeName << "_args;";
- nl(Out);
- FunctionType::param_iterator PI = FT->param_begin();
- FunctionType::param_iterator PE = FT->param_end();
- for (; PI != PE; ++PI) {
- Type* argTy = static_cast<Type*>(*PI);
- printType(argTy);
- std::string argName(getCppName(argTy));
- Out << typeName << "_args.push_back(" << argName;
- Out << ");";
- nl(Out);
- }
- printType(FT->getReturnType());
- std::string retTypeName(getCppName(FT->getReturnType()));
- Out << "FunctionType* " << typeName << " = FunctionType::get(";
- in(); nl(Out) << "/*Result=*/" << retTypeName;
- Out << ",";
- nl(Out) << "/*Params=*/" << typeName << "_args,";
- nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");";
- out();
- nl(Out);
- break;
- }
- case Type::StructTyID: {
- StructType* ST = cast<StructType>(Ty);
- if (!ST->isLiteral()) {
- Out << "StructType *" << typeName << " = mod->getTypeByName(\"";
- printEscapedString(ST->getName());
- Out << "\");";
- nl(Out);
- Out << "if (!" << typeName << ") {";
- nl(Out);
- Out << typeName << " = ";
- Out << "StructType::create(mod->getContext(), \"";
- printEscapedString(ST->getName());
- Out << "\");";
- nl(Out);
- Out << "}";
- nl(Out);
- // Indicate that this type is now defined.
- DefinedTypes.insert(Ty);
- }
-
- Out << "std::vector<Type*>" << typeName << "_fields;";
- nl(Out);
- StructType::element_iterator EI = ST->element_begin();
- StructType::element_iterator EE = ST->element_end();
- for (; EI != EE; ++EI) {
- Type* fieldTy = static_cast<Type*>(*EI);
- printType(fieldTy);
- std::string fieldName(getCppName(fieldTy));
- Out << typeName << "_fields.push_back(" << fieldName;
- Out << ");";
- nl(Out);
- }
-
- if (ST->isLiteral()) {
- Out << "StructType *" << typeName << " = ";
- Out << "StructType::get(" << "mod->getContext(), ";
- } else {
- Out << "if (" << typeName << "->isOpaque()) {";
- nl(Out);
- Out << typeName << "->setBody(";
- }
-
- Out << typeName << "_fields, /*isPacked=*/"
- << (ST->isPacked() ? "true" : "false") << ");";
- nl(Out);
- if (!ST->isLiteral()) {
- Out << "}";
- nl(Out);
- }
- break;
- }
- case Type::ArrayTyID: {
- ArrayType* AT = cast<ArrayType>(Ty);
- Type* ET = AT->getElementType();
- printType(ET);
- if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
- std::string elemName(getCppName(ET));
- Out << "ArrayType* " << typeName << " = ArrayType::get("
- << elemName << ", " << AT->getNumElements() << ");";
- nl(Out);
- }
- break;
- }
- case Type::PointerTyID: {
- PointerType* PT = cast<PointerType>(Ty);
- Type* ET = PT->getElementType();
- printType(ET);
- if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
- std::string elemName(getCppName(ET));
- Out << "PointerType* " << typeName << " = PointerType::get("
- << elemName << ", " << PT->getAddressSpace() << ");";
- nl(Out);
- }
- break;
- }
- case Type::VectorTyID: {
- VectorType* PT = cast<VectorType>(Ty);
- Type* ET = PT->getElementType();
- printType(ET);
- if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
- std::string elemName(getCppName(ET));
- Out << "VectorType* " << typeName << " = VectorType::get("
- << elemName << ", " << PT->getNumElements() << ");";
- nl(Out);
- }
- break;
- }
- default:
- error("Invalid TypeID");
- }
-
- // Indicate that this type is now defined.
- DefinedTypes.insert(Ty);
-
- // Finally, separate the type definition from other with a newline.
- nl(Out);
-}
-
-void CppWriter::printTypes(const Module* M) {
- // Add all of the global variables to the value table.
- for (Module::const_global_iterator I = TheModule->global_begin(),
- E = TheModule->global_end(); I != E; ++I) {
- if (I->hasInitializer())
- printType(I->getInitializer()->getType());
- printType(I->getType());
- }
-
- // Add all the functions to the table
- for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
- FI != FE; ++FI) {
- printType(FI->getReturnType());
- printType(FI->getFunctionType());
- // Add all the function arguments
- for (Function::const_arg_iterator AI = FI->arg_begin(),
- AE = FI->arg_end(); AI != AE; ++AI) {
- printType(AI->getType());
- }
-
- // Add all of the basic blocks and instructions
- for (Function::const_iterator BB = FI->begin(),
- E = FI->end(); BB != E; ++BB) {
- printType(BB->getType());
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
- ++I) {
- printType(I->getType());
- for (unsigned i = 0; i < I->getNumOperands(); ++i)
- printType(I->getOperand(i)->getType());
- }
- }
- }
-}
-
-
-// printConstant - Print out a constant pool entry...
-void CppWriter::printConstant(const Constant *CV) {
-  // First, if the constant is actually a GlobalValue (variable or function),
-  // or it's already in the constant list, then we've printed it already and
-  // we can just return.
- if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end())
- return;
-
- std::string constName(getCppName(CV));
- std::string typeName(getCppName(CV->getType()));
-
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
- std::string constValue = CI->getValue().toString(10, true);
- Out << "ConstantInt* " << constName
- << " = ConstantInt::get(mod->getContext(), APInt("
- << cast<IntegerType>(CI->getType())->getBitWidth()
- << ", StringRef(\"" << constValue << "\"), 10));";
- } else if (isa<ConstantAggregateZero>(CV)) {
- Out << "ConstantAggregateZero* " << constName
- << " = ConstantAggregateZero::get(" << typeName << ");";
- } else if (isa<ConstantPointerNull>(CV)) {
- Out << "ConstantPointerNull* " << constName
- << " = ConstantPointerNull::get(" << typeName << ");";
- } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- Out << "ConstantFP* " << constName << " = ";
- printCFP(CFP);
- Out << ";";
- } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
- Out << "std::vector<Constant*> " << constName << "_elems;";
- nl(Out);
- unsigned N = CA->getNumOperands();
- for (unsigned i = 0; i < N; ++i) {
- printConstant(CA->getOperand(i)); // recurse to print operands
- Out << constName << "_elems.push_back("
- << getCppName(CA->getOperand(i)) << ");";
- nl(Out);
- }
- Out << "Constant* " << constName << " = ConstantArray::get("
- << typeName << ", " << constName << "_elems);";
- } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
- Out << "std::vector<Constant*> " << constName << "_fields;";
- nl(Out);
- unsigned N = CS->getNumOperands();
- for (unsigned i = 0; i < N; i++) {
- printConstant(CS->getOperand(i));
- Out << constName << "_fields.push_back("
- << getCppName(CS->getOperand(i)) << ");";
- nl(Out);
- }
- Out << "Constant* " << constName << " = ConstantStruct::get("
- << typeName << ", " << constName << "_fields);";
- } else if (const ConstantVector *CVec = dyn_cast<ConstantVector>(CV)) {
- Out << "std::vector<Constant*> " << constName << "_elems;";
- nl(Out);
- unsigned N = CVec->getNumOperands();
- for (unsigned i = 0; i < N; ++i) {
- printConstant(CVec->getOperand(i));
- Out << constName << "_elems.push_back("
- << getCppName(CVec->getOperand(i)) << ");";
- nl(Out);
- }
- Out << "Constant* " << constName << " = ConstantVector::get("
- << typeName << ", " << constName << "_elems);";
- } else if (isa<UndefValue>(CV)) {
- Out << "UndefValue* " << constName << " = UndefValue::get("
- << typeName << ");";
- } else if (const ConstantDataSequential *CDS =
- dyn_cast<ConstantDataSequential>(CV)) {
- if (CDS->isString()) {
- Out << "Constant *" << constName <<
- " = ConstantDataArray::getString(mod->getContext(), \"";
- StringRef Str = CDS->getAsString();
- bool nullTerminate = false;
- if (Str.back() == 0) {
- Str = Str.drop_back();
- nullTerminate = true;
- }
- printEscapedString(Str);
- // Determine if we want null termination or not.
- if (nullTerminate)
- Out << "\", true);";
- else
- Out << "\", false);";// No null terminator
- } else {
-      // TODO: Could emit more efficient code using CDS calls instead.
- Out << "std::vector<Constant*> " << constName << "_elems;";
- nl(Out);
- for (unsigned i = 0; i != CDS->getNumElements(); ++i) {
- Constant *Elt = CDS->getElementAsConstant(i);
- printConstant(Elt);
- Out << constName << "_elems.push_back(" << getCppName(Elt) << ");";
- nl(Out);
- }
- Out << "Constant* " << constName;
-
- if (isa<ArrayType>(CDS->getType()))
- Out << " = ConstantArray::get(";
- else
- Out << " = ConstantVector::get(";
- Out << typeName << ", " << constName << "_elems);";
- }
- } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- Out << "std::vector<Constant*> " << constName << "_indices;";
- nl(Out);
- printConstant(CE->getOperand(0));
- for (unsigned i = 1; i < CE->getNumOperands(); ++i ) {
- printConstant(CE->getOperand(i));
- Out << constName << "_indices.push_back("
- << getCppName(CE->getOperand(i)) << ");";
- nl(Out);
- }
- Out << "Constant* " << constName
- << " = ConstantExpr::getGetElementPtr("
- << getCppName(CE->getOperand(0)) << ", "
- << constName << "_indices);";
- } else if (CE->isCast()) {
- printConstant(CE->getOperand(0));
- Out << "Constant* " << constName << " = ConstantExpr::getCast(";
- switch (CE->getOpcode()) {
- default: llvm_unreachable("Invalid cast opcode");
- case Instruction::Trunc: Out << "Instruction::Trunc"; break;
- case Instruction::ZExt: Out << "Instruction::ZExt"; break;
- case Instruction::SExt: Out << "Instruction::SExt"; break;
- case Instruction::FPTrunc: Out << "Instruction::FPTrunc"; break;
- case Instruction::FPExt: Out << "Instruction::FPExt"; break;
- case Instruction::FPToUI: Out << "Instruction::FPToUI"; break;
- case Instruction::FPToSI: Out << "Instruction::FPToSI"; break;
- case Instruction::UIToFP: Out << "Instruction::UIToFP"; break;
- case Instruction::SIToFP: Out << "Instruction::SIToFP"; break;
- case Instruction::PtrToInt: Out << "Instruction::PtrToInt"; break;
- case Instruction::IntToPtr: Out << "Instruction::IntToPtr"; break;
- case Instruction::BitCast: Out << "Instruction::BitCast"; break;
- }
- Out << ", " << getCppName(CE->getOperand(0)) << ", "
- << getCppName(CE->getType()) << ");";
- } else {
- unsigned N = CE->getNumOperands();
- for (unsigned i = 0; i < N; ++i ) {
- printConstant(CE->getOperand(i));
- }
- Out << "Constant* " << constName << " = ConstantExpr::";
- switch (CE->getOpcode()) {
- case Instruction::Add: Out << "getAdd("; break;
- case Instruction::FAdd: Out << "getFAdd("; break;
- case Instruction::Sub: Out << "getSub("; break;
- case Instruction::FSub: Out << "getFSub("; break;
- case Instruction::Mul: Out << "getMul("; break;
- case Instruction::FMul: Out << "getFMul("; break;
- case Instruction::UDiv: Out << "getUDiv("; break;
- case Instruction::SDiv: Out << "getSDiv("; break;
- case Instruction::FDiv: Out << "getFDiv("; break;
- case Instruction::URem: Out << "getURem("; break;
- case Instruction::SRem: Out << "getSRem("; break;
- case Instruction::FRem: Out << "getFRem("; break;
- case Instruction::And: Out << "getAnd("; break;
- case Instruction::Or: Out << "getOr("; break;
- case Instruction::Xor: Out << "getXor("; break;
- case Instruction::ICmp:
- Out << "getICmp(ICmpInst::ICMP_";
- switch (CE->getPredicate()) {
- case ICmpInst::ICMP_EQ: Out << "EQ"; break;
- case ICmpInst::ICMP_NE: Out << "NE"; break;
- case ICmpInst::ICMP_SLT: Out << "SLT"; break;
- case ICmpInst::ICMP_ULT: Out << "ULT"; break;
- case ICmpInst::ICMP_SGT: Out << "SGT"; break;
- case ICmpInst::ICMP_UGT: Out << "UGT"; break;
- case ICmpInst::ICMP_SLE: Out << "SLE"; break;
- case ICmpInst::ICMP_ULE: Out << "ULE"; break;
- case ICmpInst::ICMP_SGE: Out << "SGE"; break;
- case ICmpInst::ICMP_UGE: Out << "UGE"; break;
- default: error("Invalid ICmp Predicate");
- }
- break;
- case Instruction::FCmp:
- Out << "getFCmp(FCmpInst::FCMP_";
- switch (CE->getPredicate()) {
- case FCmpInst::FCMP_FALSE: Out << "FALSE"; break;
- case FCmpInst::FCMP_ORD: Out << "ORD"; break;
- case FCmpInst::FCMP_UNO: Out << "UNO"; break;
- case FCmpInst::FCMP_OEQ: Out << "OEQ"; break;
- case FCmpInst::FCMP_UEQ: Out << "UEQ"; break;
- case FCmpInst::FCMP_ONE: Out << "ONE"; break;
- case FCmpInst::FCMP_UNE: Out << "UNE"; break;
- case FCmpInst::FCMP_OLT: Out << "OLT"; break;
- case FCmpInst::FCMP_ULT: Out << "ULT"; break;
- case FCmpInst::FCMP_OGT: Out << "OGT"; break;
- case FCmpInst::FCMP_UGT: Out << "UGT"; break;
- case FCmpInst::FCMP_OLE: Out << "OLE"; break;
- case FCmpInst::FCMP_ULE: Out << "ULE"; break;
- case FCmpInst::FCMP_OGE: Out << "OGE"; break;
- case FCmpInst::FCMP_UGE: Out << "UGE"; break;
- case FCmpInst::FCMP_TRUE: Out << "TRUE"; break;
- default: error("Invalid FCmp Predicate");
- }
- break;
- case Instruction::Shl: Out << "getShl("; break;
- case Instruction::LShr: Out << "getLShr("; break;
- case Instruction::AShr: Out << "getAShr("; break;
- case Instruction::Select: Out << "getSelect("; break;
- case Instruction::ExtractElement: Out << "getExtractElement("; break;
- case Instruction::InsertElement: Out << "getInsertElement("; break;
- case Instruction::ShuffleVector: Out << "getShuffleVector("; break;
- default:
- error("Invalid constant expression");
- break;
- }
- Out << getCppName(CE->getOperand(0));
- for (unsigned i = 1; i < CE->getNumOperands(); ++i)
- Out << ", " << getCppName(CE->getOperand(i));
- Out << ");";
- }
- } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
- Out << "Constant* " << constName << " = ";
- Out << "BlockAddress::get(" << getOpName(BA->getBasicBlock()) << ");";
- } else {
- error("Bad Constant");
- Out << "Constant* " << constName << " = 0; ";
- }
- nl(Out);
-}
-
-void CppWriter::printConstants(const Module* M) {
- // Traverse all the global variables looking for constant initializers
- for (Module::const_global_iterator I = TheModule->global_begin(),
- E = TheModule->global_end(); I != E; ++I)
- if (I->hasInitializer())
- printConstant(I->getInitializer());
-
- // Traverse the LLVM functions looking for constants
- for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
- FI != FE; ++FI) {
- // Add all of the basic blocks and instructions
- for (Function::const_iterator BB = FI->begin(),
- E = FI->end(); BB != E; ++BB) {
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
- ++I) {
- for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) {
- printConstant(C);
- }
- }
- }
- }
- }
-}
-
-void CppWriter::printVariableUses(const GlobalVariable *GV) {
- nl(Out) << "// Type Definitions";
- nl(Out);
- printType(GV->getType());
- if (GV->hasInitializer()) {
- const Constant *Init = GV->getInitializer();
- printType(Init->getType());
- if (const Function *F = dyn_cast<Function>(Init)) {
- nl(Out)<< "/ Function Declarations"; nl(Out);
- printFunctionHead(F);
- } else if (const GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
- nl(Out) << "// Global Variable Declarations"; nl(Out);
- printVariableHead(gv);
-
- nl(Out) << "// Global Variable Definitions"; nl(Out);
- printVariableBody(gv);
- } else {
- nl(Out) << "// Constant Definitions"; nl(Out);
- printConstant(Init);
- }
- }
-}
-
-void CppWriter::printVariableHead(const GlobalVariable *GV) {
- nl(Out) << "GlobalVariable* " << getCppName(GV);
- if (is_inline) {
- Out << " = mod->getGlobalVariable(mod->getContext(), ";
- printEscapedString(GV->getName());
- Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)";
- nl(Out) << "if (!" << getCppName(GV) << ") {";
- in(); nl(Out) << getCppName(GV);
- }
- Out << " = new GlobalVariable(/*Module=*/*mod, ";
- nl(Out) << "/*Type=*/";
- printCppName(GV->getType()->getElementType());
- Out << ",";
- nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false");
- Out << ",";
- nl(Out) << "/*Linkage=*/";
- printLinkageType(GV->getLinkage());
- Out << ",";
- nl(Out) << "/*Initializer=*/0, ";
- if (GV->hasInitializer()) {
- Out << "// has initializer, specified below";
- }
- nl(Out) << "/*Name=*/\"";
- printEscapedString(GV->getName());
- Out << "\");";
- nl(Out);
-
- if (GV->hasSection()) {
- printCppName(GV);
- Out << "->setSection(\"";
- printEscapedString(GV->getSection());
- Out << "\");";
- nl(Out);
- }
- if (GV->getAlignment()) {
- printCppName(GV);
- Out << "->setAlignment(" << GV->getAlignment() << ");";
- nl(Out);
- }
- if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
- printCppName(GV);
- Out << "->setVisibility(";
- printVisibilityType(GV->getVisibility());
- Out << ");";
- nl(Out);
- }
- if (GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
- printCppName(GV);
- Out << "->setDLLStorageClass(";
- printDLLStorageClassType(GV->getDLLStorageClass());
- Out << ");";
- nl(Out);
- }
- if (GV->isThreadLocal()) {
- printCppName(GV);
- Out << "->setThreadLocalMode(";
- printThreadLocalMode(GV->getThreadLocalMode());
- Out << ");";
- nl(Out);
- }
- if (is_inline) {
- out(); Out << "}"; nl(Out);
- }
-}
-
-void CppWriter::printVariableBody(const GlobalVariable *GV) {
- if (GV->hasInitializer()) {
- printCppName(GV);
- Out << "->setInitializer(";
- Out << getCppName(GV->getInitializer()) << ");";
- nl(Out);
- }
-}
-
-std::string CppWriter::getOpName(const Value* V) {
- if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
- return getCppName(V);
-
-  // See if it's already in the map of forward references; if so, just return
-  // the name we already set up for it.
- ForwardRefMap::const_iterator I = ForwardRefs.find(V);
- if (I != ForwardRefs.end())
- return I->second;
-
- // This is a new forward reference. Generate a unique name for it
- std::string result(std::string("fwdref_") + utostr(uniqueNum++));
-
- // Yes, this is a hack. An Argument is the smallest instantiable value that
- // we can make as a placeholder for the real value. We'll replace these
- // Argument instances later.
- Out << "Argument* " << result << " = new Argument("
- << getCppName(V->getType()) << ");";
- nl(Out);
- ForwardRefs[V] = result;
- return result;
-}
-
-static StringRef ConvertAtomicOrdering(AtomicOrdering Ordering) {
- switch (Ordering) {
- case NotAtomic: return "NotAtomic";
- case Unordered: return "Unordered";
- case Monotonic: return "Monotonic";
- case Acquire: return "Acquire";
- case Release: return "Release";
- case AcquireRelease: return "AcquireRelease";
- case SequentiallyConsistent: return "SequentiallyConsistent";
- }
- llvm_unreachable("Unknown ordering");
-}
-
-static StringRef ConvertAtomicSynchScope(SynchronizationScope SynchScope) {
- switch (SynchScope) {
- case SingleThread: return "SingleThread";
- case CrossThread: return "CrossThread";
- }
- llvm_unreachable("Unknown synch scope");
-}
-
-// printInstruction - This member is called for each Instruction in a function.
-void CppWriter::printInstruction(const Instruction *I,
- const std::string& bbname) {
- std::string iName(getCppName(I));
-
- // Before we emit this instruction, we need to take care of generating any
- // forward references. So, we get the names of all the operands in advance
- const unsigned Ops(I->getNumOperands());
- std::string* opNames = new std::string[Ops];
- for (unsigned i = 0; i < Ops; i++)
- opNames[i] = getOpName(I->getOperand(i));
-
- switch (I->getOpcode()) {
- default:
- error("Invalid instruction");
- break;
-
- case Instruction::Ret: {
- const ReturnInst* ret = cast<ReturnInst>(I);
- Out << "ReturnInst::Create(mod->getContext(), "
- << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
- break;
- }
- case Instruction::Br: {
- const BranchInst* br = cast<BranchInst>(I);
- Out << "BranchInst::Create(" ;
- if (br->getNumOperands() == 3) {
- Out << opNames[2] << ", "
- << opNames[1] << ", "
- << opNames[0] << ", ";
-
- } else if (br->getNumOperands() == 1) {
- Out << opNames[0] << ", ";
- } else {
- error("Branch with 2 operands?");
- }
- Out << bbname << ");";
- break;
- }
- case Instruction::Switch: {
- const SwitchInst *SI = cast<SwitchInst>(I);
- Out << "SwitchInst* " << iName << " = SwitchInst::Create("
- << getOpName(SI->getCondition()) << ", "
- << getOpName(SI->getDefaultDest()) << ", "
- << SI->getNumCases() << ", " << bbname << ");";
- nl(Out);
- for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- const ConstantInt* CaseVal = i.getCaseValue();
- const BasicBlock *BB = i.getCaseSuccessor();
- Out << iName << "->addCase("
- << getOpName(CaseVal) << ", "
- << getOpName(BB) << ");";
- nl(Out);
- }
- break;
- }
- case Instruction::IndirectBr: {
- const IndirectBrInst *IBI = cast<IndirectBrInst>(I);
- Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create("
- << opNames[0] << ", " << IBI->getNumDestinations() << ");";
- nl(Out);
- for (unsigned i = 1; i != IBI->getNumOperands(); ++i) {
- Out << iName << "->addDestination(" << opNames[i] << ");";
- nl(Out);
- }
- break;
- }
- case Instruction::Resume: {
- Out << "ResumeInst::Create(" << opNames[0] << ", " << bbname << ");";
- break;
- }
- case Instruction::Invoke: {
- const InvokeInst* inv = cast<InvokeInst>(I);
- Out << "std::vector<Value*> " << iName << "_params;";
- nl(Out);
- for (unsigned i = 0; i < inv->getNumArgOperands(); ++i) {
- Out << iName << "_params.push_back("
- << getOpName(inv->getArgOperand(i)) << ");";
- nl(Out);
- }
- // FIXME: This shouldn't use magic numbers -3, -2, and -1.
- Out << "InvokeInst *" << iName << " = InvokeInst::Create("
- << getOpName(inv->getCalledValue()) << ", "
- << getOpName(inv->getNormalDest()) << ", "
- << getOpName(inv->getUnwindDest()) << ", "
- << iName << "_params, \"";
- printEscapedString(inv->getName());
- Out << "\", " << bbname << ");";
- nl(Out) << iName << "->setCallingConv(";
- printCallingConv(inv->getCallingConv());
- Out << ");";
- printAttributes(inv->getAttributes(), iName);
- Out << iName << "->setAttributes(" << iName << "_PAL);";
- nl(Out);
- break;
- }
- case Instruction::Unreachable: {
- Out << "new UnreachableInst("
- << "mod->getContext(), "
- << bbname << ");";
- break;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:{
- Out << "BinaryOperator* " << iName << " = BinaryOperator::Create(";
- switch (I->getOpcode()) {
- case Instruction::Add: Out << "Instruction::Add"; break;
- case Instruction::FAdd: Out << "Instruction::FAdd"; break;
- case Instruction::Sub: Out << "Instruction::Sub"; break;
- case Instruction::FSub: Out << "Instruction::FSub"; break;
- case Instruction::Mul: Out << "Instruction::Mul"; break;
- case Instruction::FMul: Out << "Instruction::FMul"; break;
- case Instruction::UDiv:Out << "Instruction::UDiv"; break;
- case Instruction::SDiv:Out << "Instruction::SDiv"; break;
- case Instruction::FDiv:Out << "Instruction::FDiv"; break;
- case Instruction::URem:Out << "Instruction::URem"; break;
- case Instruction::SRem:Out << "Instruction::SRem"; break;
- case Instruction::FRem:Out << "Instruction::FRem"; break;
- case Instruction::And: Out << "Instruction::And"; break;
- case Instruction::Or: Out << "Instruction::Or"; break;
- case Instruction::Xor: Out << "Instruction::Xor"; break;
- case Instruction::Shl: Out << "Instruction::Shl"; break;
- case Instruction::LShr:Out << "Instruction::LShr"; break;
- case Instruction::AShr:Out << "Instruction::AShr"; break;
- default: Out << "Instruction::BadOpCode"; break;
- }
- Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
- printEscapedString(I->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::FCmp: {
- Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", ";
- switch (cast<FCmpInst>(I)->getPredicate()) {
- case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
- case FCmpInst::FCMP_OEQ : Out << "FCmpInst::FCMP_OEQ"; break;
- case FCmpInst::FCMP_OGT : Out << "FCmpInst::FCMP_OGT"; break;
- case FCmpInst::FCMP_OGE : Out << "FCmpInst::FCMP_OGE"; break;
- case FCmpInst::FCMP_OLT : Out << "FCmpInst::FCMP_OLT"; break;
- case FCmpInst::FCMP_OLE : Out << "FCmpInst::FCMP_OLE"; break;
- case FCmpInst::FCMP_ONE : Out << "FCmpInst::FCMP_ONE"; break;
- case FCmpInst::FCMP_ORD : Out << "FCmpInst::FCMP_ORD"; break;
- case FCmpInst::FCMP_UNO : Out << "FCmpInst::FCMP_UNO"; break;
- case FCmpInst::FCMP_UEQ : Out << "FCmpInst::FCMP_UEQ"; break;
- case FCmpInst::FCMP_UGT : Out << "FCmpInst::FCMP_UGT"; break;
- case FCmpInst::FCMP_UGE : Out << "FCmpInst::FCMP_UGE"; break;
- case FCmpInst::FCMP_ULT : Out << "FCmpInst::FCMP_ULT"; break;
- case FCmpInst::FCMP_ULE : Out << "FCmpInst::FCMP_ULE"; break;
- case FCmpInst::FCMP_UNE : Out << "FCmpInst::FCMP_UNE"; break;
- case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break;
- default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break;
- }
- Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
- printEscapedString(I->getName());
- Out << "\");";
- break;
- }
- case Instruction::ICmp: {
- Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", ";
- switch (cast<ICmpInst>(I)->getPredicate()) {
- case ICmpInst::ICMP_EQ: Out << "ICmpInst::ICMP_EQ"; break;
- case ICmpInst::ICMP_NE: Out << "ICmpInst::ICMP_NE"; break;
- case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break;
- case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break;
- case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break;
- case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break;
- case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break;
- case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break;
- case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break;
- case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break;
- default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break;
- }
- Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
- printEscapedString(I->getName());
- Out << "\");";
- break;
- }
- case Instruction::Alloca: {
- const AllocaInst* allocaI = cast<AllocaInst>(I);
- Out << "AllocaInst* " << iName << " = new AllocaInst("
- << getCppName(allocaI->getAllocatedType()) << ", ";
- if (allocaI->isArrayAllocation())
- Out << opNames[0] << ", ";
- Out << "\"";
- printEscapedString(allocaI->getName());
- Out << "\", " << bbname << ");";
- if (allocaI->getAlignment())
- nl(Out) << iName << "->setAlignment("
- << allocaI->getAlignment() << ");";
- break;
- }
- case Instruction::Load: {
- const LoadInst* load = cast<LoadInst>(I);
- Out << "LoadInst* " << iName << " = new LoadInst("
- << opNames[0] << ", \"";
- printEscapedString(load->getName());
- Out << "\", " << (load->isVolatile() ? "true" : "false" )
- << ", " << bbname << ");";
- if (load->getAlignment())
- nl(Out) << iName << "->setAlignment("
- << load->getAlignment() << ");";
- if (load->isAtomic()) {
- StringRef Ordering = ConvertAtomicOrdering(load->getOrdering());
- StringRef CrossThread = ConvertAtomicSynchScope(load->getSynchScope());
- nl(Out) << iName << "->setAtomic("
- << Ordering << ", " << CrossThread << ");";
- }
- break;
- }
- case Instruction::Store: {
- const StoreInst* store = cast<StoreInst>(I);
- Out << "StoreInst* " << iName << " = new StoreInst("
- << opNames[0] << ", "
- << opNames[1] << ", "
- << (store->isVolatile() ? "true" : "false")
- << ", " << bbname << ");";
- if (store->getAlignment())
- nl(Out) << iName << "->setAlignment("
- << store->getAlignment() << ");";
- if (store->isAtomic()) {
- StringRef Ordering = ConvertAtomicOrdering(store->getOrdering());
- StringRef CrossThread = ConvertAtomicSynchScope(store->getSynchScope());
- nl(Out) << iName << "->setAtomic("
- << Ordering << ", " << CrossThread << ");";
- }
- break;
- }
- case Instruction::GetElementPtr: {
- const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
- Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
- << getCppName(gep->getSourceElementType()) << ", " << opNames[0] << ", {";
- in();
- for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
- if (i != 1) {
- Out << ", ";
- }
- nl(Out);
- Out << opNames[i];
- }
- out();
- nl(Out) << "}, \"";
- printEscapedString(gep->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::PHI: {
- const PHINode* phi = cast<PHINode>(I);
-
- Out << "PHINode* " << iName << " = PHINode::Create("
- << getCppName(phi->getType()) << ", "
- << phi->getNumIncomingValues() << ", \"";
- printEscapedString(phi->getName());
- Out << "\", " << bbname << ");";
- nl(Out);
- for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
- Out << iName << "->addIncoming("
- << opNames[PHINode::getOperandNumForIncomingValue(i)] << ", "
- << getOpName(phi->getIncomingBlock(i)) << ");";
- nl(Out);
- }
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::BitCast: {
- const CastInst* cst = cast<CastInst>(I);
- Out << "CastInst* " << iName << " = new ";
- switch (I->getOpcode()) {
- case Instruction::Trunc: Out << "TruncInst"; break;
- case Instruction::ZExt: Out << "ZExtInst"; break;
- case Instruction::SExt: Out << "SExtInst"; break;
- case Instruction::FPTrunc: Out << "FPTruncInst"; break;
- case Instruction::FPExt: Out << "FPExtInst"; break;
- case Instruction::FPToUI: Out << "FPToUIInst"; break;
- case Instruction::FPToSI: Out << "FPToSIInst"; break;
- case Instruction::UIToFP: Out << "UIToFPInst"; break;
- case Instruction::SIToFP: Out << "SIToFPInst"; break;
- case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
- case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
- case Instruction::BitCast: Out << "BitCastInst"; break;
- default: llvm_unreachable("Unreachable");
- }
- Out << "(" << opNames[0] << ", "
- << getCppName(cst->getType()) << ", \"";
- printEscapedString(cst->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::Call: {
- const CallInst* call = cast<CallInst>(I);
- if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) {
- Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get("
- << getCppName(ila->getFunctionType()) << ", \""
- << ila->getAsmString() << "\", \""
- << ila->getConstraintString() << "\","
- << (ila->hasSideEffects() ? "true" : "false") << ");";
- nl(Out);
- }
- if (call->getNumArgOperands() > 1) {
- Out << "std::vector<Value*> " << iName << "_params;";
- nl(Out);
- for (unsigned i = 0; i < call->getNumArgOperands(); ++i) {
- Out << iName << "_params.push_back(" << opNames[i] << ");";
- nl(Out);
- }
- Out << "CallInst* " << iName << " = CallInst::Create("
- << opNames[call->getNumArgOperands()] << ", "
- << iName << "_params, \"";
- } else if (call->getNumArgOperands() == 1) {
- Out << "CallInst* " << iName << " = CallInst::Create("
- << opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \"";
- } else {
- Out << "CallInst* " << iName << " = CallInst::Create("
- << opNames[call->getNumArgOperands()] << ", \"";
- }
- printEscapedString(call->getName());
- Out << "\", " << bbname << ");";
- nl(Out) << iName << "->setCallingConv(";
- printCallingConv(call->getCallingConv());
- Out << ");";
- nl(Out) << iName << "->setTailCall("
- << (call->isTailCall() ? "true" : "false");
- Out << ");";
- nl(Out);
- printAttributes(call->getAttributes(), iName);
- Out << iName << "->setAttributes(" << iName << "_PAL);";
- nl(Out);
- break;
- }
- case Instruction::Select: {
- const SelectInst* sel = cast<SelectInst>(I);
- Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create(";
- Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \"";
- printEscapedString(sel->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::UserOp1:
- /// FALL THROUGH
- case Instruction::UserOp2: {
- /// FIXME: What should be done here?
- break;
- }
- case Instruction::VAArg: {
- const VAArgInst* va = cast<VAArgInst>(I);
- Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst("
- << opNames[0] << ", " << getCppName(va->getType()) << ", \"";
- printEscapedString(va->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::ExtractElement: {
- const ExtractElementInst* eei = cast<ExtractElementInst>(I);
- Out << "ExtractElementInst* " << getCppName(eei)
- << " = new ExtractElementInst(" << opNames[0]
- << ", " << opNames[1] << ", \"";
- printEscapedString(eei->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::InsertElement: {
- const InsertElementInst* iei = cast<InsertElementInst>(I);
- Out << "InsertElementInst* " << getCppName(iei)
- << " = InsertElementInst::Create(" << opNames[0]
- << ", " << opNames[1] << ", " << opNames[2] << ", \"";
- printEscapedString(iei->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::ShuffleVector: {
- const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I);
- Out << "ShuffleVectorInst* " << getCppName(svi)
- << " = new ShuffleVectorInst(" << opNames[0]
- << ", " << opNames[1] << ", " << opNames[2] << ", \"";
- printEscapedString(svi->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::ExtractValue: {
- const ExtractValueInst *evi = cast<ExtractValueInst>(I);
- Out << "std::vector<unsigned> " << iName << "_indices;";
- nl(Out);
- for (unsigned i = 0; i < evi->getNumIndices(); ++i) {
- Out << iName << "_indices.push_back("
- << evi->idx_begin()[i] << ");";
- nl(Out);
- }
- Out << "ExtractValueInst* " << getCppName(evi)
- << " = ExtractValueInst::Create(" << opNames[0]
- << ", "
- << iName << "_indices, \"";
- printEscapedString(evi->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::InsertValue: {
- const InsertValueInst *ivi = cast<InsertValueInst>(I);
- Out << "std::vector<unsigned> " << iName << "_indices;";
- nl(Out);
- for (unsigned i = 0; i < ivi->getNumIndices(); ++i) {
- Out << iName << "_indices.push_back("
- << ivi->idx_begin()[i] << ");";
- nl(Out);
- }
- Out << "InsertValueInst* " << getCppName(ivi)
- << " = InsertValueInst::Create(" << opNames[0]
- << ", " << opNames[1] << ", "
- << iName << "_indices, \"";
- printEscapedString(ivi->getName());
- Out << "\", " << bbname << ");";
- break;
- }
- case Instruction::Fence: {
- const FenceInst *fi = cast<FenceInst>(I);
- StringRef Ordering = ConvertAtomicOrdering(fi->getOrdering());
- StringRef CrossThread = ConvertAtomicSynchScope(fi->getSynchScope());
- Out << "FenceInst* " << iName
- << " = new FenceInst(mod->getContext(), "
- << Ordering << ", " << CrossThread << ", " << bbname
- << ");";
- break;
- }
- case Instruction::AtomicCmpXchg: {
- const AtomicCmpXchgInst *cxi = cast<AtomicCmpXchgInst>(I);
- StringRef SuccessOrdering =
- ConvertAtomicOrdering(cxi->getSuccessOrdering());
- StringRef FailureOrdering =
- ConvertAtomicOrdering(cxi->getFailureOrdering());
- StringRef CrossThread = ConvertAtomicSynchScope(cxi->getSynchScope());
- Out << "AtomicCmpXchgInst* " << iName
- << " = new AtomicCmpXchgInst("
- << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", "
- << SuccessOrdering << ", " << FailureOrdering << ", "
- << CrossThread << ", " << bbname
- << ");";
- nl(Out) << iName << "->setName(\"";
- printEscapedString(cxi->getName());
- Out << "\");";
- nl(Out) << iName << "->setVolatile("
- << (cxi->isVolatile() ? "true" : "false") << ");";
- nl(Out) << iName << "->setWeak("
- << (cxi->isWeak() ? "true" : "false") << ");";
- break;
- }
- case Instruction::AtomicRMW: {
- const AtomicRMWInst *rmwi = cast<AtomicRMWInst>(I);
- StringRef Ordering = ConvertAtomicOrdering(rmwi->getOrdering());
- StringRef CrossThread = ConvertAtomicSynchScope(rmwi->getSynchScope());
- StringRef Operation;
- switch (rmwi->getOperation()) {
- case AtomicRMWInst::Xchg: Operation = "AtomicRMWInst::Xchg"; break;
- case AtomicRMWInst::Add: Operation = "AtomicRMWInst::Add"; break;
- case AtomicRMWInst::Sub: Operation = "AtomicRMWInst::Sub"; break;
- case AtomicRMWInst::And: Operation = "AtomicRMWInst::And"; break;
- case AtomicRMWInst::Nand: Operation = "AtomicRMWInst::Nand"; break;
- case AtomicRMWInst::Or: Operation = "AtomicRMWInst::Or"; break;
- case AtomicRMWInst::Xor: Operation = "AtomicRMWInst::Xor"; break;
- case AtomicRMWInst::Max: Operation = "AtomicRMWInst::Max"; break;
- case AtomicRMWInst::Min: Operation = "AtomicRMWInst::Min"; break;
- case AtomicRMWInst::UMax: Operation = "AtomicRMWInst::UMax"; break;
- case AtomicRMWInst::UMin: Operation = "AtomicRMWInst::UMin"; break;
- case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Bad atomic operation");
- }
- Out << "AtomicRMWInst* " << iName
- << " = new AtomicRMWInst("
- << Operation << ", "
- << opNames[0] << ", " << opNames[1] << ", "
- << Ordering << ", " << CrossThread << ", " << bbname
- << ");";
- nl(Out) << iName << "->setName(\"";
- printEscapedString(rmwi->getName());
- Out << "\");";
- nl(Out) << iName << "->setVolatile("
- << (rmwi->isVolatile() ? "true" : "false") << ");";
- break;
- }
- case Instruction::LandingPad: {
- const LandingPadInst *lpi = cast<LandingPadInst>(I);
- Out << "LandingPadInst* " << iName << " = LandingPadInst::Create(";
- printCppName(lpi->getType());
- Out << ", " << opNames[0] << ", " << lpi->getNumClauses() << ", \"";
- printEscapedString(lpi->getName());
- Out << "\", " << bbname << ");";
- nl(Out) << iName << "->setCleanup("
- << (lpi->isCleanup() ? "true" : "false")
- << ");";
- for (unsigned i = 0, e = lpi->getNumClauses(); i != e; ++i)
- nl(Out) << iName << "->addClause(" << opNames[i+1] << ");";
- break;
- }
- }
- DefinedValues.insert(I);
- nl(Out);
- delete [] opNames;
-}
-
-// Print out the types, constants and declarations needed by one function
-void CppWriter::printFunctionUses(const Function* F) {
- nl(Out) << "// Type Definitions"; nl(Out);
- if (!is_inline) {
- // Print the function's return type
- printType(F->getReturnType());
-
- // Print the function's function type
- printType(F->getFunctionType());
-
- // Print the types of each of the function's arguments
- for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
- AI != AE; ++AI) {
- printType(AI->getType());
- }
- }
-
- // Print type definitions for every type referenced by an instruction and
- // make a note of any global values or constants that are referenced
- SmallPtrSet<GlobalValue*,64> gvs;
- SmallPtrSet<Constant*,64> consts;
- for (Function::const_iterator BB = F->begin(), BE = F->end();
- BB != BE; ++BB){
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
- I != E; ++I) {
- // Print the type of the instruction itself
- printType(I->getType());
-
- // Print the type of each of the instruction's operands
- for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- Value* operand = I->getOperand(i);
- printType(operand->getType());
-
- // If the operand references a GVal or Constant, make a note of it
- if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
- gvs.insert(GV);
- if (GenerationType != GenFunction)
- if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->hasInitializer())
- consts.insert(GVar->getInitializer());
- } else if (Constant* C = dyn_cast<Constant>(operand)) {
- consts.insert(C);
- for (Value* operand : C->operands()) {
- // If the operand references a GVal or Constant, make a note of it
- printType(operand->getType());
- if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
- gvs.insert(GV);
- if (GenerationType != GenFunction)
- if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->hasInitializer())
- consts.insert(GVar->getInitializer());
- }
- }
- }
- }
- }
- }
-
- // Print the function declarations for any functions encountered
- nl(Out) << "// Function Declarations"; nl(Out);
- for (auto *GV : gvs) {
- if (Function *Fun = dyn_cast<Function>(GV)) {
- if (!is_inline || Fun != F)
- printFunctionHead(Fun);
- }
- }
-
- // Print the global variable declarations for any variables encountered
- nl(Out) << "// Global Variable Declarations"; nl(Out);
- for (auto *GV : gvs) {
- if (GlobalVariable *F = dyn_cast<GlobalVariable>(GV))
- printVariableHead(F);
- }
-
- // Print the constants found
- nl(Out) << "// Constant Definitions"; nl(Out);
- for (const auto *C : consts) {
- printConstant(C);
- }
-
- // Process the global variables definitions now that all the constants have
- // been emitted. These definitions just couple the gvars with their constant
- // initializers.
- if (GenerationType != GenFunction) {
- nl(Out) << "// Global Variable Definitions"; nl(Out);
- for (auto *GV : gvs) {
- if (GlobalVariable *Var = dyn_cast<GlobalVariable>(GV))
- printVariableBody(Var);
- }
- }
-}
-
-void CppWriter::printFunctionHead(const Function* F) {
- nl(Out) << "Function* " << getCppName(F);
- Out << " = mod->getFunction(\"";
- printEscapedString(F->getName());
- Out << "\");";
- nl(Out) << "if (!" << getCppName(F) << ") {";
- nl(Out) << getCppName(F);
-
- Out<< " = Function::Create(";
- nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
- nl(Out) << "/*Linkage=*/";
- printLinkageType(F->getLinkage());
- Out << ",";
- nl(Out) << "/*Name=*/\"";
- printEscapedString(F->getName());
- Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
- nl(Out,-1);
- printCppName(F);
- Out << "->setCallingConv(";
- printCallingConv(F->getCallingConv());
- Out << ");";
- nl(Out);
- if (F->hasSection()) {
- printCppName(F);
- Out << "->setSection(\"" << F->getSection() << "\");";
- nl(Out);
- }
- if (F->getAlignment()) {
- printCppName(F);
- Out << "->setAlignment(" << F->getAlignment() << ");";
- nl(Out);
- }
- if (F->getVisibility() != GlobalValue::DefaultVisibility) {
- printCppName(F);
- Out << "->setVisibility(";
- printVisibilityType(F->getVisibility());
- Out << ");";
- nl(Out);
- }
- if (F->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
- printCppName(F);
- Out << "->setDLLStorageClass(";
- printDLLStorageClassType(F->getDLLStorageClass());
- Out << ");";
- nl(Out);
- }
- if (F->hasGC()) {
- printCppName(F);
- Out << "->setGC(\"" << F->getGC() << "\");";
- nl(Out);
- }
- Out << "}";
- nl(Out);
- printAttributes(F->getAttributes(), getCppName(F));
- printCppName(F);
- Out << "->setAttributes(" << getCppName(F) << "_PAL);";
- nl(Out);
-}
-
-void CppWriter::printFunctionBody(const Function *F) {
- if (F->isDeclaration())
- return; // external functions have no bodies.
-
- // Clear the DefinedValues and ForwardRefs maps because we can't have
- // cross-function forward refs
- ForwardRefs.clear();
- DefinedValues.clear();
-
- // Create all the argument values
- if (!is_inline) {
- if (!F->arg_empty()) {
- Out << "Function::arg_iterator args = " << getCppName(F)
- << "->arg_begin();";
- nl(Out);
- }
- for (const Argument &AI : F->args()) {
- Out << "Value* " << getCppName(&AI) << " = args++;";
- nl(Out);
- if (AI.hasName()) {
- Out << getCppName(&AI) << "->setName(\"";
- printEscapedString(AI.getName());
- Out << "\");";
- nl(Out);
- }
- }
- }
-
- // Create all the basic blocks
- nl(Out);
- for (const BasicBlock &BI : *F) {
- std::string bbname(getCppName(&BI));
- Out << "BasicBlock* " << bbname <<
- " = BasicBlock::Create(mod->getContext(), \"";
- if (BI.hasName())
- printEscapedString(BI.getName());
- Out << "\"," << getCppName(BI.getParent()) << ",0);";
- nl(Out);
- }
-
-  // Output all of the function's basic blocks.
- for (const BasicBlock &BI : *F) {
- std::string bbname(getCppName(&BI));
- nl(Out) << "// Block " << BI.getName() << " (" << bbname << ")";
- nl(Out);
-
- // Output all of the instructions in the basic block...
- for (const Instruction &I : BI)
- printInstruction(&I, bbname);
- }
-
- // Loop over the ForwardRefs and resolve them now that all instructions
- // are generated.
- if (!ForwardRefs.empty()) {
- nl(Out) << "// Resolve Forward References";
- nl(Out);
- }
-
- while (!ForwardRefs.empty()) {
- ForwardRefMap::iterator I = ForwardRefs.begin();
- Out << I->second << "->replaceAllUsesWith("
- << getCppName(I->first) << "); delete " << I->second << ";";
- nl(Out);
- ForwardRefs.erase(I);
- }
-}
-
-void CppWriter::printInline(const std::string& fname,
- const std::string& func) {
- const Function* F = TheModule->getFunction(func);
- if (!F) {
- error(std::string("Function '") + func + "' not found in input module");
- return;
- }
- if (F->isDeclaration()) {
- error(std::string("Function '") + func + "' is external!");
- return;
- }
- nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *"
- << getCppName(F);
- unsigned arg_count = 1;
- for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
- AI != AE; ++AI) {
- Out << ", Value* arg_" << arg_count++;
- }
- Out << ") {";
- nl(Out);
- is_inline = true;
- printFunctionUses(F);
- printFunctionBody(F);
- is_inline = false;
- Out << "return " << getCppName(&F->front()) << ";";
- nl(Out) << "}";
- nl(Out);
-}
-
-void CppWriter::printModuleBody() {
- // Print out all the type definitions
- nl(Out) << "// Type Definitions"; nl(Out);
- printTypes(TheModule);
-
- // Functions can call each other and global variables can reference them so
- // define all the functions first before emitting their function bodies.
- nl(Out) << "// Function Declarations"; nl(Out);
- for (const Function &I : *TheModule)
- printFunctionHead(&I);
-
-  // Process the global variable declarations. We can't initialize them until
-  // after the constants are printed, so just print a header for each global.
-  nl(Out) << "// Global Variable Declarations"; nl(Out);
- for (const GlobalVariable &I : TheModule->globals())
- printVariableHead(&I);
-
-  // Print out all the constant definitions. Constants don't recurse except
- // through GlobalValues. All GlobalValues have been declared at this point
- // so we can proceed to generate the constants.
- nl(Out) << "// Constant Definitions"; nl(Out);
- printConstants(TheModule);
-
- // Process the global variables definitions now that all the constants have
- // been emitted. These definitions just couple the gvars with their constant
- // initializers.
- nl(Out) << "// Global Variable Definitions"; nl(Out);
- for (const GlobalVariable &I : TheModule->globals())
- printVariableBody(&I);
-
- // Finally, we can safely put out all of the function bodies.
- nl(Out) << "// Function Definitions"; nl(Out);
- for (const Function &I : *TheModule) {
- if (!I.isDeclaration()) {
- nl(Out) << "// Function: " << I.getName() << " (" << getCppName(&I)
- << ")";
- nl(Out) << "{";
- nl(Out,1);
- printFunctionBody(&I);
- nl(Out,-1) << "}";
- nl(Out);
- }
- }
-}
-
-void CppWriter::printProgram(const std::string& fname,
- const std::string& mName) {
- Out << "#include <llvm/Pass.h>\n";
-
- Out << "#include <llvm/ADT/SmallVector.h>\n";
- Out << "#include <llvm/Analysis/Verifier.h>\n";
- Out << "#include <llvm/IR/BasicBlock.h>\n";
- Out << "#include <llvm/IR/CallingConv.h>\n";
- Out << "#include <llvm/IR/Constants.h>\n";
- Out << "#include <llvm/IR/DerivedTypes.h>\n";
- Out << "#include <llvm/IR/Function.h>\n";
- Out << "#include <llvm/IR/GlobalVariable.h>\n";
- Out << "#include <llvm/IR/IRPrintingPasses.h>\n";
- Out << "#include <llvm/IR/InlineAsm.h>\n";
- Out << "#include <llvm/IR/Instructions.h>\n";
- Out << "#include <llvm/IR/LLVMContext.h>\n";
- Out << "#include <llvm/IR/LegacyPassManager.h>\n";
- Out << "#include <llvm/IR/Module.h>\n";
- Out << "#include <llvm/Support/FormattedStream.h>\n";
- Out << "#include <llvm/Support/MathExtras.h>\n";
- Out << "#include <algorithm>\n";
- Out << "using namespace llvm;\n\n";
- Out << "Module* " << fname << "();\n\n";
- Out << "int main(int argc, char**argv) {\n";
- Out << " Module* Mod = " << fname << "();\n";
- Out << " verifyModule(*Mod, PrintMessageAction);\n";
- Out << " PassManager PM;\n";
- Out << " PM.add(createPrintModulePass(&outs()));\n";
- Out << " PM.run(*Mod);\n";
- Out << " return 0;\n";
- Out << "}\n\n";
- printModule(fname,mName);
-}
-
-void CppWriter::printModule(const std::string& fname,
- const std::string& mName) {
- nl(Out) << "Module* " << fname << "() {";
- nl(Out,1) << "// Module Construction";
- nl(Out) << "Module* mod = new Module(\"";
- printEscapedString(mName);
- Out << "\", getGlobalContext());";
- if (!TheModule->getTargetTriple().empty()) {
- nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayoutStr()
- << "\");";
- }
- if (!TheModule->getTargetTriple().empty()) {
- nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple()
- << "\");";
- }
-
- if (!TheModule->getModuleInlineAsm().empty()) {
- nl(Out) << "mod->setModuleInlineAsm(\"";
- printEscapedString(TheModule->getModuleInlineAsm());
- Out << "\");";
- }
- nl(Out);
-
- printModuleBody();
- nl(Out) << "return mod;";
- nl(Out,-1) << "}";
- nl(Out);
-}
-
-void CppWriter::printContents(const std::string& fname,
- const std::string& mName) {
- Out << "\nModule* " << fname << "(Module *mod) {\n";
- Out << "\nmod->setModuleIdentifier(\"";
- printEscapedString(mName);
- Out << "\");\n";
- printModuleBody();
- Out << "\nreturn mod;\n";
- Out << "\n}\n";
-}
-
-void CppWriter::printFunction(const std::string& fname,
- const std::string& funcName) {
- const Function* F = TheModule->getFunction(funcName);
- if (!F) {
- error(std::string("Function '") + funcName + "' not found in input module");
- return;
- }
- Out << "\nFunction* " << fname << "(Module *mod) {\n";
- printFunctionUses(F);
- printFunctionHead(F);
- printFunctionBody(F);
- Out << "return " << getCppName(F) << ";\n";
- Out << "}\n";
-}
-
-void CppWriter::printFunctions() {
- const Module::FunctionListType &funcs = TheModule->getFunctionList();
- Module::const_iterator I = funcs.begin();
- Module::const_iterator IE = funcs.end();
-
- for (; I != IE; ++I) {
- const Function &func = *I;
- if (!func.isDeclaration()) {
- std::string name("define_");
- name += func.getName();
- printFunction(name, func.getName());
- }
- }
-}
-
-void CppWriter::printVariable(const std::string& fname,
- const std::string& varName) {
- const GlobalVariable* GV = TheModule->getNamedGlobal(varName);
-
- if (!GV) {
- error(std::string("Variable '") + varName + "' not found in input module");
- return;
- }
- Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n";
- printVariableUses(GV);
- printVariableHead(GV);
- printVariableBody(GV);
- Out << "return " << getCppName(GV) << ";\n";
- Out << "}\n";
-}
-
-void CppWriter::printType(const std::string &fname,
- const std::string &typeName) {
- Type* Ty = TheModule->getTypeByName(typeName);
- if (!Ty) {
- error(std::string("Type '") + typeName + "' not found in input module");
- return;
- }
- Out << "\nType* " << fname << "(Module *mod) {\n";
- printType(Ty);
- Out << "return " << getCppName(Ty) << ";\n";
- Out << "}\n";
-}
-
-bool CppWriter::runOnModule(Module &M) {
- TheModule = &M;
-
- // Emit a header
- Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n";
-
- // Get the name of the function we're supposed to generate
- std::string fname = FuncName.getValue();
-
- // Get the name of the thing we are to generate
- std::string tgtname = NameToGenerate.getValue();
- if (GenerationType == GenModule ||
- GenerationType == GenContents ||
- GenerationType == GenProgram ||
- GenerationType == GenFunctions) {
- if (tgtname == "!bad!") {
- if (M.getModuleIdentifier() == "-")
- tgtname = "<stdin>";
- else
- tgtname = M.getModuleIdentifier();
- }
- } else if (tgtname == "!bad!")
- error("You must use the -for option with -gen-{function,variable,type}");
-
- switch (WhatToGenerate(GenerationType)) {
- case GenProgram:
- if (fname.empty())
- fname = "makeLLVMModule";
- printProgram(fname,tgtname);
- break;
- case GenModule:
- if (fname.empty())
- fname = "makeLLVMModule";
- printModule(fname,tgtname);
- break;
- case GenContents:
- if (fname.empty())
- fname = "makeLLVMModuleContents";
- printContents(fname,tgtname);
- break;
- case GenFunction:
- if (fname.empty())
- fname = "makeLLVMFunction";
- printFunction(fname,tgtname);
- break;
- case GenFunctions:
- printFunctions();
- break;
- case GenInline:
- if (fname.empty())
- fname = "makeLLVMInline";
- printInline(fname,tgtname);
- break;
- case GenVariable:
- if (fname.empty())
- fname = "makeLLVMVariable";
- printVariable(fname,tgtname);
- break;
- case GenType:
- if (fname.empty())
- fname = "makeLLVMType";
- printType(fname,tgtname);
- break;
- }
-
- return false;
-}
-
-char CppWriter::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// External Interface declaration
-//===----------------------------------------------------------------------===//
-
-bool CPPTargetMachine::addPassesToEmitFile(
- PassManagerBase &PM, raw_pwrite_stream &o, CodeGenFileType FileType,
- bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
- AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) {
- if (FileType != TargetMachine::CGFT_AssemblyFile)
- return true;
- auto FOut = llvm::make_unique<formatted_raw_ostream>(o);
- PM.add(new CppWriter(std::move(FOut)));
- return false;
-}
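
For context on what the file deleted above produced: CppWriter emitted C++ source that rebuilds a module through direct IR constructor calls (Function::Create, BasicBlock::Create, BinaryOperator::Create, ReturnInst::Create) rather than through IRBuilder. A minimal hand-written sketch in that style, building a hypothetical int add(int a, int b) function, is shown below; the names are illustrative and the exact headers depend on the LLVM version of this era.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <vector>
using namespace llvm;

// Build `int add(int a, int b) { return a + b; }` in the same style as the
// code the deleted backend generated: explicit types, argument iterators,
// and instruction constructors that append to a basic block.
Module *makeLLVMModule(LLVMContext &Ctx) {
  Module *mod = new Module("example", Ctx);

  std::vector<Type *> add_args(2, Type::getInt32Ty(Ctx));
  FunctionType *FT = FunctionType::get(/*Result=*/Type::getInt32Ty(Ctx),
                                       /*Params=*/add_args,
                                       /*isVarArg=*/false);
  Function *F =
      Function::Create(FT, GlobalValue::ExternalLinkage, "add", mod);

  Function::arg_iterator args = F->arg_begin();
  Value *a = &*args++;
  a->setName("a");
  Value *b = &*args;
  b->setName("b");

  BasicBlock *entry = BasicBlock::Create(Ctx, "entry", F);
  BinaryOperator *sum =
      BinaryOperator::Create(Instruction::Add, a, b, "sum", entry);
  ReturnInst::Create(Ctx, sum, entry);
  return mod;
}
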
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
deleted file mode 100644
index 00e402feffbc..000000000000
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- CPPTargetMachine.h - TargetMachine for the C++ backend --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the TargetMachine that is used by the C++ backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_CPPBACKEND_CPPTARGETMACHINE_H
-#define LLVM_LIB_TARGET_CPPBACKEND_CPPTARGETMACHINE_H
-
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-namespace llvm {
-
-class formatted_raw_ostream;
-
-struct CPPTargetMachine : public TargetMachine {
- CPPTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : TargetMachine(T, "", TT, CPU, FS, Options) {}
-
-public:
- bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
- CodeGenFileType FileType, bool DisableVerify,
- AnalysisID StartBefore, AnalysisID StartAfter,
- AnalysisID StopAfter,
- MachineFunctionInitializer *MFInitializer) override;
-};
-
-extern Target TheCppBackendTarget;
-
-} // End llvm namespace
-
-
-#endif
diff --git a/lib/Target/CppBackend/Makefile b/lib/Target/CppBackend/Makefile
deleted file mode 100644
index efc7463fda3d..000000000000
--- a/lib/Target/CppBackend/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/CppBackend/Makefile --- ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMCppBackendCodeGen
-DIRS = TargetInfo
-
-include $(LEVEL)/Makefile.common
-
-CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
deleted file mode 100644
index d86446f6bc02..000000000000
--- a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_llvm_library(LLVMCppBackendInfo
- CppBackendTargetInfo.cpp
- )
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
deleted file mode 100644
index f88d82228ca4..000000000000
--- a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- CppBackendTargetInfo.cpp - CppBackend Target Implementation -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CPPTargetMachine.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-Target llvm::TheCppBackendTarget;
-
-static bool CppBackend_TripleMatchQuality(Triple::ArchType Arch) {
- // This backend doesn't correspond to any architecture. It must be explicitly
- // selected with -march.
- return false;
-}
-
-extern "C" void LLVMInitializeCppBackendTargetInfo() {
- TargetRegistry::RegisterTarget(TheCppBackendTarget, "cpp",
- "C++ backend",
- &CppBackend_TripleMatchQuality);
-}
-
-extern "C" void LLVMInitializeCppBackendTargetMC() {}
diff --git a/lib/Target/CppBackend/TargetInfo/Makefile b/lib/Target/CppBackend/TargetInfo/Makefile
deleted file mode 100644
index 6e682838daec..000000000000
--- a/lib/Target/CppBackend/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/CppBackend/TargetInfo/Makefile -----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMCppBackendInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index a8622a96527c..496efbf7374b 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -13,14 +13,13 @@
#include "HexagonRegisterInfo.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCExpr.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonShuffler.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -31,19 +30,19 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include <sstream>
using namespace llvm;
@@ -108,7 +107,7 @@ class HexagonAsmParser : public MCTargetAsmParser {
void canonicalizeImmediates(MCInst &MCI);
bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc,
OperandVector &InstOperands, uint64_t &ErrorInfo,
- bool MatchingInlineAsm, bool &MustExtend);
+ bool MatchingInlineAsm);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -117,7 +116,7 @@ class HexagonAsmParser : public MCTargetAsmParser {
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
void OutOfRange(SMLoc IDLoc, long long Val, long long Max);
int processInstruction(MCInst &Inst, OperandVector const &Operands,
- SMLoc IDLoc, bool &MustExtend);
+ SMLoc IDLoc);
// Check if we have an assembler and, if so, set the ELF e_header flags.
void chksetELFHeaderEFlags(unsigned flags) {
@@ -125,6 +124,8 @@ class HexagonAsmParser : public MCTargetAsmParser {
getAssembler()->setELFHeaderEFlags(flags);
}
+ unsigned matchRegister(StringRef Name);
+
/// @name Auto-generated Match Functions
/// {
@@ -150,7 +151,6 @@ public:
}
}
- bool mustExtend(OperandVector &Operands);
bool splitIdentifier(OperandVector &Operands);
bool parseOperand(OperandVector &Operands);
bool parseInstruction(OperandVector &Operands);
@@ -186,7 +186,6 @@ struct HexagonOperand : public MCParsedAsmOperand {
struct ImmTy {
const MCExpr *Val;
- bool MustExtend;
};
struct InstTy {
@@ -243,8 +242,8 @@ public:
bool CheckImmRange(int immBits, int zeroBits, bool isSigned,
bool isRelocatable, bool Extendable) const {
if (Kind == Immediate) {
- const MCExpr *myMCExpr = getImm();
- if (Imm.MustExtend && !Extendable)
+ const MCExpr *myMCExpr = &HexagonMCInstrInfo::getExpr(*getImm());
+ if (HexagonMCInstrInfo::mustExtend(*Imm.Val) && !Extendable)
return false;
int64_t Res;
if (myMCExpr->evaluateAsAbsolute(Res)) {
@@ -278,6 +277,7 @@ public:
bool isf32Ext() const { return false; }
bool iss32Imm() const { return CheckImmRange(32, 0, true, true, false); }
+ bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
bool iss8Imm() const { return CheckImmRange(8, 0, true, false, false); }
bool iss8Imm64() const { return CheckImmRange(8, 0, true, true, false); }
bool iss7Imm() const { return CheckImmRange(7, 0, true, false, false); }
@@ -347,7 +347,7 @@ public:
bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
- bool isu32MustExt() const { return isImm() && Imm.MustExtend; }
+ bool isu32MustExt() const { return isImm(); }
void addRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
@@ -361,20 +361,17 @@ public:
void addSignedImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- MCExpr const *Expr = getImm();
+ HexagonMCExpr *Expr =
+ const_cast<HexagonMCExpr *>(cast<HexagonMCExpr>(getImm()));
int64_t Value;
if (!Expr->evaluateAsAbsolute(Value)) {
Inst.addOperand(MCOperand::createExpr(Expr));
return;
}
- int64_t Extended = SignExtend64 (Value, 32);
- if ((Extended < 0) == (Value < 0)) {
- Inst.addOperand(MCOperand::createExpr(Expr));
- return;
- }
- // Flip bit 33 to signal signed unsigned mismatch
- Extended ^= 0x100000000;
- Inst.addOperand(MCOperand::createImm(Extended));
+ int64_t Extended = SignExtend64(Value, 32);
+ if ((Extended < 0) != (Value < 0))
+ Expr->setSignMismatch();
+ Inst.addOperand(MCOperand::createExpr(Expr));
}
void addf32ExtOperands(MCInst &Inst, unsigned N) const {
@@ -384,6 +381,9 @@ public:
void adds32ImmOperands(MCInst &Inst, unsigned N) const {
addSignedImmOperands(Inst, N);
}
+ void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
void adds8ImmOperands(MCInst &Inst, unsigned N) const {
addSignedImmOperands(Inst, N);
}
@@ -553,13 +553,15 @@ public:
void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE =
+ dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
}
void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE =
+ dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
}
@@ -592,7 +594,6 @@ public:
SMLoc E) {
HexagonOperand *Op = new HexagonOperand(Immediate);
Op->Imm.Val = Val;
- Op->Imm.MustExtend = false;
Op->StartLoc = S;
Op->EndLoc = E;
return std::unique_ptr<HexagonOperand>(Op);
@@ -616,9 +617,6 @@ void HexagonOperand::print(raw_ostream &OS) const {
}
}
-/// @name Auto-generated Match Functions
-static unsigned MatchRegisterName(StringRef Name);
-
bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
DEBUG(dbgs() << "Bundle:");
DEBUG(MCB.dump_pretty(dbgs()));
@@ -730,11 +728,10 @@ bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
bool HexagonAsmParser::matchBundleOptions() {
MCAsmParser &Parser = getParser();
- MCAsmLexer &Lexer = getLexer();
while (true) {
if (!Parser.getTok().is(AsmToken::Colon))
return false;
- Lexer.Lex();
+ Lex();
StringRef Option = Parser.getTok().getString();
if (Option.compare_lower("endloop0") == 0)
HexagonMCInstrInfo::setInnerLoop(MCB);
@@ -746,7 +743,7 @@ bool HexagonAsmParser::matchBundleOptions() {
HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
else
return true;
- Lexer.Lex();
+ Lex();
}
}
@@ -759,33 +756,29 @@ void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
for (MCOperand &I : MCI)
if (I.isImm()) {
int64_t Value (I.getImm());
- if ((Value & 0x100000000) != (Value & 0x80000000)) {
- // Detect flipped bit 33 wrt bit 32 and signal warning
- Value ^= 0x100000000;
- if (WarnSignedMismatch)
- Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
- }
- NewInst.addOperand(MCOperand::createExpr(
- MCConstantExpr::create(Value, getContext())));
+ NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(Value, getContext()), getContext())));
}
- else
+ else {
+ if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
+ WarnSignedMismatch)
+ Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
NewInst.addOperand(I);
+ }
MCI = NewInst;
}
bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
OperandVector &InstOperands,
uint64_t &ErrorInfo,
- bool MatchingInlineAsm,
- bool &MustExtend) {
+ bool MatchingInlineAsm) {
// Perform matching with tablegen asmmatcher generated function
int result =
MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm);
if (result == Match_Success) {
MCI.setLoc(IDLoc);
- MustExtend = mustExtend(InstOperands);
canonicalizeImmediates(MCI);
- result = processInstruction(MCI, InstOperands, IDLoc, MustExtend);
+ result = processInstruction(MCI, InstOperands, IDLoc);
DEBUG(dbgs() << "Insn:");
DEBUG(MCI.dump_pretty(dbgs()));
@@ -823,17 +816,6 @@ bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
llvm_unreachable("Implement any new match types added!");
}
-bool HexagonAsmParser::mustExtend(OperandVector &Operands) {
- unsigned Count = 0;
- for (std::unique_ptr<MCParsedAsmOperand> &i : Operands)
- if (i->isImm())
- if (static_cast<HexagonOperand *>(i.get())->Imm.MustExtend)
- ++Count;
- // Multiple extenders should have been filtered by iss9Ext et. al.
- assert(Count < 2 && "Multiple extenders");
- return Count == 1;
-}
-
bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -865,13 +847,11 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return finishBundle(IDLoc, Out);
}
MCInst *SubInst = new (getParser().getContext()) MCInst;
- bool MustExtend = false;
if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo,
- MatchingInlineAsm, MustExtend))
+ MatchingInlineAsm))
return true;
HexagonMCInstrInfo::extendIfNeeded(
- getParser().getContext(), MCII, MCB, *SubInst,
- HexagonMCInstrInfo::isExtended(MCII, *SubInst) || MustExtend);
+ getParser().getContext(), MCII, MCB, *SubInst);
MCB.addOperand(MCOperand::createInst(SubInst));
if (!InBrackets)
return finishBundle(IDLoc, Out);
@@ -916,7 +896,8 @@ bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) {
// end of the section. Only legacy hexagon-gcc created assembly code
// used negative subsections.
if ((Res < 0) && (Res > -8193))
- Subsection = MCConstantExpr::create(8192 + Res, this->getContext());
+ Subsection = HexagonMCExpr::create(
+ MCConstantExpr::create(8192 + Res, getContext()), getContext());
getStreamer().SubSection(Subsection);
return false;
@@ -1110,7 +1091,7 @@ bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) {
AsmToken const &Token = getParser().getTok();
StringRef String = Token.getString();
SMLoc Loc = Token.getLoc();
- getLexer().Lex();
+ Lex();
do {
std::pair<StringRef, StringRef> HeadTail = String.split('.');
if (!HeadTail.first.empty())
@@ -1144,7 +1125,7 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
static char const *RParen = ")";
Operands.push_back(HexagonOperand::CreateToken(LParen, Begin));
Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
- AsmToken MaybeDotNew = Lexer.getTok();
+ const AsmToken &MaybeDotNew = Lexer.getTok();
if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
MaybeDotNew.getString().equals_lower(".new"))
splitIdentifier(Operands);
@@ -1160,7 +1141,7 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
Operands.insert(Operands.end () - 1,
HexagonOperand::CreateToken(LParen, Begin));
Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
- AsmToken MaybeDotNew = Lexer.getTok();
+ const AsmToken &MaybeDotNew = Lexer.getTok();
if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
MaybeDotNew.getString().equals_lower(".new"))
splitIdentifier(Operands);
@@ -1186,7 +1167,7 @@ bool HexagonAsmParser::isLabel(AsmToken &Token) {
return false;
if (!Token.is(AsmToken::TokenKind::Identifier))
return true;
- if (!MatchRegisterName(String.lower()))
+ if (!matchRegister(String.lower()))
return true;
(void)Second;
assert(Second.is(AsmToken::Colon));
@@ -1197,7 +1178,7 @@ bool HexagonAsmParser::isLabel(AsmToken &Token) {
Collapsed.end());
StringRef Whole = Collapsed;
std::pair<StringRef, StringRef> DotSplit = Whole.split('.');
- if (!MatchRegisterName(DotSplit.first.lower()))
+ if (!matchRegister(DotSplit.first.lower()))
return true;
return false;
}
@@ -1242,7 +1223,7 @@ bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &En
Collapsed.end());
StringRef FullString = Collapsed;
std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
- unsigned DotReg = MatchRegisterName(DotSplit.first.lower());
+ unsigned DotReg = matchRegister(DotSplit.first.lower());
if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
if (DotSplit.second.empty()) {
RegNo = DotReg;
@@ -1262,7 +1243,7 @@ bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &En
}
}
std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':');
- unsigned ColonReg = MatchRegisterName(ColonSplit.first.lower());
+ unsigned ColonReg = matchRegister(ColonSplit.first.lower());
if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
Lexer.UnLex(Lookahead.back());
Lookahead.pop_back();
@@ -1302,7 +1283,7 @@ bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) {
static char const * Comma = ",";
do {
Tokens.emplace_back (Lexer.getTok());
- Lexer.Lex();
+ Lex();
switch (Tokens.back().getKind())
{
case AsmToken::TokenKind::Hash:
@@ -1333,11 +1314,12 @@ bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) {
if (implicitExpressionLocation(Operands)) {
MCAsmParser &Parser = getParser();
SMLoc Loc = Parser.getLexer().getLoc();
- std::unique_ptr<HexagonOperand> Expr =
- HexagonOperand::CreateImm(nullptr, Loc, Loc);
- MCExpr const *& Val = Expr->Imm.Val;
- Operands.push_back(std::move(Expr));
- return parseExpression(Val);
+ MCExpr const *Expr = nullptr;
+ bool Error = parseExpression(Expr);
+ Expr = HexagonMCExpr::create(Expr, getContext());
+ if (!Error)
+ Operands.push_back(HexagonOperand::CreateImm(Expr, Loc, Loc));
+ return Error;
}
return parseOperand(Operands);
}
@@ -1350,7 +1332,7 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
AsmToken const &Token = Parser.getTok();
switch (Token.getKind()) {
case AsmToken::EndOfStatement: {
- Lexer.Lex();
+ Lex();
return false;
}
case AsmToken::LCurly: {
@@ -1358,19 +1340,19 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
return true;
Operands.push_back(
HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
- Lexer.Lex();
+ Lex();
return false;
}
case AsmToken::RCurly: {
if (Operands.empty()) {
Operands.push_back(
HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
- Lexer.Lex();
+ Lex();
}
return false;
}
case AsmToken::Comma: {
- Lexer.Lex();
+ Lex();
continue;
}
case AsmToken::EqualEqual:
@@ -1383,30 +1365,28 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
Token.getString().substr(0, 1), Token.getLoc()));
Operands.push_back(HexagonOperand::CreateToken(
Token.getString().substr(1, 1), Token.getLoc()));
- Lexer.Lex();
+ Lex();
continue;
}
case AsmToken::Hash: {
bool MustNotExtend = false;
bool ImplicitExpression = implicitExpressionLocation(Operands);
- std::unique_ptr<HexagonOperand> Expr = HexagonOperand::CreateImm(
- nullptr, Lexer.getLoc(), Lexer.getLoc());
+ SMLoc ExprLoc = Lexer.getLoc();
if (!ImplicitExpression)
Operands.push_back(
HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
- Lexer.Lex();
+ Lex();
bool MustExtend = false;
bool HiOnly = false;
bool LoOnly = false;
if (Lexer.is(AsmToken::Hash)) {
- Lexer.Lex();
+ Lex();
MustExtend = true;
} else if (ImplicitExpression)
MustNotExtend = true;
AsmToken const &Token = Parser.getTok();
if (Token.is(AsmToken::Identifier)) {
StringRef String = Token.getString();
- AsmToken IDToken = Token;
if (String.lower() == "hi") {
HiOnly = true;
} else if (String.lower() == "lo") {
@@ -1418,27 +1398,46 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
HiOnly = false;
LoOnly = false;
} else {
- Lexer.Lex();
+ Lex();
}
}
}
- if (parseExpression(Expr->Imm.Val))
+ MCExpr const *Expr = nullptr;
+ if (parseExpression(Expr))
return true;
int64_t Value;
MCContext &Context = Parser.getContext();
- assert(Expr->Imm.Val != nullptr);
- if (Expr->Imm.Val->evaluateAsAbsolute(Value)) {
+ assert(Expr != nullptr);
+ if (Expr->evaluateAsAbsolute(Value)) {
if (HiOnly)
- Expr->Imm.Val = MCBinaryExpr::createLShr(
- Expr->Imm.Val, MCConstantExpr::create(16, Context), Context);
+ Expr = MCBinaryExpr::createLShr(
+ Expr, MCConstantExpr::create(16, Context), Context);
if (HiOnly || LoOnly)
- Expr->Imm.Val = MCBinaryExpr::createAnd(
- Expr->Imm.Val, MCConstantExpr::create(0xffff, Context), Context);
+ Expr = MCBinaryExpr::createAnd(Expr,
+ MCConstantExpr::create(0xffff, Context),
+ Context);
+ } else {
+ MCValue Value;
+ if (Expr->evaluateAsRelocatable(Value, nullptr, nullptr)) {
+ if (!Value.isAbsolute()) {
+ switch(Value.getAccessVariant()) {
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ // Don't lazy extend these expression variants
+ MustNotExtend = !MustExtend;
+ break;
+ default:
+ break;
+ }
+ }
+ }
}
- if (MustNotExtend)
- Expr->Imm.Val = HexagonNoExtendOperand::Create(Expr->Imm.Val, Context);
- Expr->Imm.MustExtend = MustExtend;
- Operands.push_back(std::move(Expr));
+ Expr = HexagonMCExpr::create(Expr, Context);
+ HexagonMCInstrInfo::setMustNotExtend(*Expr, MustNotExtend);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ std::unique_ptr<HexagonOperand> Operand =
+ HexagonOperand::CreateImm(Expr, ExprLoc, ExprLoc);
+ Operands.push_back(std::move(Operand));
continue;
}
default:
@@ -1524,7 +1523,7 @@ void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
int HexagonAsmParser::processInstruction(MCInst &Inst,
OperandVector const &Operands,
- SMLoc IDLoc, bool &MustExtend) {
+ SMLoc IDLoc) {
MCContext &Context = getParser().getContext();
const MCRegisterInfo *RI = getContext().getRegisterInfo();
std::string r = "r";
@@ -1536,6 +1535,18 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
default:
break;
+ case Hexagon::A2_iconst: {
+ Inst.setOpcode(Hexagon::A2_addi);
+ MCOperand Reg = Inst.getOperand(0);
+ MCOperand S16 = Inst.getOperand(1);
+ HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+ HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ Inst.clear();
+ Inst.addOperand(Reg);
+ Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+ Inst.addOperand(S16);
+ break;
+ }
case Hexagon::M4_mpyrr_addr:
case Hexagon::S4_addi_asl_ri:
case Hexagon::S4_addi_lsr_ri:
@@ -1555,8 +1566,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::C2_cmpgei: {
MCOperand &MO = Inst.getOperand(2);
- MO.setExpr(MCBinaryExpr::createSub(
- MO.getExpr(), MCConstantExpr::create(1, Context), Context));
+ MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
Inst.setOpcode(Hexagon::C2_cmpgti);
break;
}
@@ -1577,49 +1588,24 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
TmpInst.addOperand(Rt);
Inst = TmpInst;
} else {
- MO.setExpr(MCBinaryExpr::createSub(
- MO.getExpr(), MCConstantExpr::create(1, Context), Context));
+ MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
Inst.setOpcode(Hexagon::C2_cmpgtui);
}
break;
}
- case Hexagon::J2_loop1r:
- case Hexagon::J2_loop1i:
- case Hexagon::J2_loop0r:
- case Hexagon::J2_loop0i: {
- MCOperand &MO = Inst.getOperand(0);
- // Loop has different opcodes for extended vs not extended, but we should
- // not use the other opcode as it is a legacy artifact of TD files.
- int64_t Value;
- if (MO.getExpr()->evaluateAsAbsolute(Value)) {
- // if the operand can fit within a 7:2 field
- if (Value < (1 << 8) && Value >= -(1 << 8)) {
- SMLoc myLoc = Operands[2]->getStartLoc();
- // # is left in startLoc in the case of ##
- // If '##' found then force extension.
- if (*myLoc.getPointer() == '#') {
- MustExtend = true;
- break;
- }
- } else {
- // If immediate and out of 7:2 range.
- MustExtend = true;
- }
- }
- break;
- }
// Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
case Hexagon::A2_tfrp: {
MCOperand &MO = Inst.getOperand(1);
unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ std::string R1 = r + llvm::utostr(RegPairNum + 1);
StringRef Reg1(R1);
- MO.setReg(MatchRegisterName(Reg1));
+ MO.setReg(matchRegister(Reg1));
// Add a new operand for the second register in the pair.
- std::string R2 = r + llvm::utostr_32(RegPairNum);
+ std::string R2 = r + llvm::utostr(RegPairNum);
StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
Inst.setOpcode(Hexagon::A2_combinew);
break;
}
@@ -1628,13 +1614,13 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::A2_tfrpf: {
MCOperand &MO = Inst.getOperand(2);
unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ std::string R1 = r + llvm::utostr(RegPairNum + 1);
StringRef Reg1(R1);
- MO.setReg(MatchRegisterName(Reg1));
+ MO.setReg(matchRegister(Reg1));
// Add a new operand for the second register in the pair.
- std::string R2 = r + llvm::utostr_32(RegPairNum);
+ std::string R2 = r + llvm::utostr(RegPairNum);
StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
? Hexagon::C2_ccombinewt
: Hexagon::C2_ccombinewf);
@@ -1644,19 +1630,32 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::A2_tfrpfnew: {
MCOperand &MO = Inst.getOperand(2);
unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ std::string R1 = r + llvm::utostr(RegPairNum + 1);
StringRef Reg1(R1);
- MO.setReg(MatchRegisterName(Reg1));
+ MO.setReg(matchRegister(Reg1));
// Add a new operand for the second register in the pair.
- std::string R2 = r + llvm::utostr_32(RegPairNum);
+ std::string R2 = r + llvm::utostr(RegPairNum);
StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
? Hexagon::C2_ccombinewnewt
: Hexagon::C2_ccombinewnewf);
break;
}
+ // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)"
+ case Hexagon::HEXAGON_V6_vassignpair: {
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = v + llvm::utostr(RegPairNum + 1);
+ MO.setReg(MatchRegisterName(R1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = v + llvm::utostr(RegPairNum);
+ Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2)));
+ Inst.setOpcode(Hexagon::V6_vcombine);
+ break;
+ }
+
// Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) "
case Hexagon::CONST32:
case Hexagon::CONST32_Float_Real:
@@ -1773,7 +1772,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &MO = Inst.getOperand(1);
int64_t Value;
int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? -1 : 0;
- MCOperand imm(MCOperand::createExpr(MCConstantExpr::create(sVal, Context)));
+ MCOperand imm(MCOperand::createExpr(
+ HexagonMCExpr::create(MCConstantExpr::create(sVal, Context), Context)));
Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO);
break;
}
@@ -1784,18 +1784,19 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &MO = Inst.getOperand(1);
int64_t Value;
if (MO.getExpr()->evaluateAsAbsolute(Value)) {
- unsigned long long u64 = Value;
- signed int s8 = (u64 >> 32) & 0xFFFFFFFF;
- if (s8 < -128 || s8 > 127)
+ int s8 = Hi_32(Value);
+ if (!isInt<8>(s8))
OutOfRange(IDLoc, s8, -128);
- MCOperand imm(MCOperand::createExpr(
- MCConstantExpr::create(s8, Context))); // upper 32
- MCOperand imm2(MCOperand::createExpr(
- MCConstantExpr::create(u64 & 0xFFFFFFFF, Context))); // lower 32
+ MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(s8, Context), Context))); // upper 32
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(Lo_32(Value), Context), Context);
+ HexagonMCInstrInfo::setMustExtend(*Expr, HexagonMCInstrInfo::mustExtend(*MO.getExpr()));
+ MCOperand imm2(MCOperand::createExpr(Expr)); // lower 32
Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2);
} else {
- MCOperand imm(MCOperand::createExpr(
- MCConstantExpr::create(0, Context))); // upper 32
+ MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(0, Context), Context))); // upper 32
Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO);
}
break;
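
The A4_combineii change just above splits the 64-bit immediate with Hi_32/Lo_32 and checks the upper half with isInt<8> instead of manual shifting and masking. A minimal standalone sketch of that split, with local hi32/lo32/isInt8 helpers standing in for the llvm::MathExtras functions (the example value is arbitrary):

#include <cstdint>
#include <cstdio>

// Local stand-ins for llvm::Hi_32, llvm::Lo_32 and llvm::isInt<8>.
static int32_t hi32(uint64_t v) { return (int32_t)(v >> 32); }
static uint32_t lo32(uint64_t v) { return (uint32_t)v; }
static bool isInt8(int64_t v) { return v >= -128 && v <= 127; }

int main() {
  int64_t value = (int64_t)0xFFFFFFFF80000001ULL; // example 64-bit immediate
  int32_t upper = hi32((uint64_t)value);          // must fit a signed 8-bit field
  uint32_t lower = lo32((uint64_t)value);
  if (!isInt8(upper))
    std::printf("upper half %d out of s8 range\n", upper);
  else
    std::printf("combine(#%d, #%u)\n", upper, lower); // combine(#-1, #2147483649)
  return 0;
}
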
@@ -1843,8 +1844,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Rs = Inst.getOperand(2);
MCOperand &Imm4 = Inst.getOperand(3);
MCOperand &Imm6 = Inst.getOperand(4);
- Imm6.setExpr(MCBinaryExpr::createSub(
- Imm6.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
TmpInst.setOpcode(Hexagon::S2_tableidxh);
TmpInst.addOperand(Rx);
TmpInst.addOperand(_dst_);
@@ -1862,8 +1863,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Rs = Inst.getOperand(2);
MCOperand &Imm4 = Inst.getOperand(3);
MCOperand &Imm6 = Inst.getOperand(4);
- Imm6.setExpr(MCBinaryExpr::createSub(
- Imm6.getExpr(), MCConstantExpr::create(2, Context), Context));
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(2, Context), Context), Context));
TmpInst.setOpcode(Hexagon::S2_tableidxw);
TmpInst.addOperand(Rx);
TmpInst.addOperand(_dst_);
@@ -1881,8 +1882,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Rs = Inst.getOperand(2);
MCOperand &Imm4 = Inst.getOperand(3);
MCOperand &Imm6 = Inst.getOperand(4);
- Imm6.setExpr(MCBinaryExpr::createSub(
- Imm6.getExpr(), MCConstantExpr::create(3, Context), Context));
+ Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(3, Context), Context), Context));
TmpInst.setOpcode(Hexagon::S2_tableidxd);
TmpInst.addOperand(Rx);
TmpInst.addOperand(_dst_);
@@ -1903,12 +1904,14 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Rs = Inst.getOperand(1);
MCOperand &Imm = Inst.getOperand(2);
int64_t Value;
- bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ MCExpr const &Expr = *Imm.getExpr();
+ bool Absolute = Expr.evaluateAsAbsolute(Value);
assert(Absolute);
(void)Absolute;
- if (!MustExtend) {
+ if (!HexagonMCInstrInfo::mustExtend(Expr)) {
if (Value < 0 && Value > -256) {
- Imm.setExpr(MCConstantExpr::create(Value * -1, Context));
+ Imm.setExpr(HexagonMCExpr::create(
+ MCConstantExpr::create(Value * -1, Context), Context));
TmpInst.setOpcode(Hexagon::M2_mpysin);
} else if (Value < 256 && Value >= 0)
TmpInst.setOpcode(Hexagon::M2_mpysip);
@@ -1941,8 +1944,10 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
TmpInst.addOperand(Rd);
TmpInst.addOperand(Rs);
} else {
- Imm.setExpr(MCBinaryExpr::createSub(
- Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
MCOperand &Rd = Inst.getOperand(0);
MCOperand &Rs = Inst.getOperand(1);
@@ -1965,20 +1970,22 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1])
MCInst TmpInst;
unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
- std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ std::string R1 = r + llvm::utostr(RegPairNum + 1);
StringRef Reg1(R1);
- Rss.setReg(MatchRegisterName(Reg1));
+ Rss.setReg(matchRegister(Reg1));
// Add a new operand for the second register in the pair.
- std::string R2 = r + llvm::utostr_32(RegPairNum);
+ std::string R2 = r + llvm::utostr(RegPairNum);
StringRef Reg2(R2);
TmpInst.setOpcode(Hexagon::A2_combinew);
TmpInst.addOperand(Rdd);
TmpInst.addOperand(Rss);
- TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
Inst = TmpInst;
} else {
- Imm.setExpr(MCBinaryExpr::createSub(
- Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
Inst.setOpcode(Hexagon::S2_asr_i_p_rnd);
}
break;
@@ -1990,15 +1997,15 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
Inst.setOpcode(Hexagon::A4_boundscheck_hi);
std::string Name =
- r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ r + llvm::utostr(RegNum) + Colon + llvm::utostr(RegNum - 1);
StringRef RegPair = Name;
- Rs.setReg(MatchRegisterName(RegPair));
+ Rs.setReg(matchRegister(RegPair));
} else { // raw:lo
Inst.setOpcode(Hexagon::A4_boundscheck_lo);
std::string Name =
- r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ r + llvm::utostr(RegNum + 1) + Colon + llvm::utostr(RegNum);
StringRef RegPair = Name;
- Rs.setReg(MatchRegisterName(RegPair));
+ Rs.setReg(matchRegister(RegPair));
}
break;
}
@@ -2009,15 +2016,15 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (RegNum & 1) { // Odd mapped to raw:hi
Inst.setOpcode(Hexagon::A2_addsph);
std::string Name =
- r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ r + llvm::utostr(RegNum) + Colon + llvm::utostr(RegNum - 1);
StringRef RegPair = Name;
- Rs.setReg(MatchRegisterName(RegPair));
+ Rs.setReg(matchRegister(RegPair));
} else { // Even mapped raw:lo
Inst.setOpcode(Hexagon::A2_addspl);
std::string Name =
- r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ r + llvm::utostr(RegNum + 1) + Colon + llvm::utostr(RegNum);
StringRef RegPair = Name;
- Rs.setReg(MatchRegisterName(RegPair));
+ Rs.setReg(matchRegister(RegPair));
}
break;
}
@@ -2028,15 +2035,15 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (RegNum & 1) { // Odd mapped to sat:raw:hi
Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
std::string Name =
- r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ r + llvm::utostr(RegNum) + Colon + llvm::utostr(RegNum - 1);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
} else { // Even mapped sat:raw:lo
Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
std::string Name =
- r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ r + llvm::utostr(RegNum + 1) + Colon + llvm::utostr(RegNum);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
}
break;
}
@@ -2050,15 +2057,15 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (RegNum & 1) { // Odd mapped to sat:raw:hi
TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
std::string Name =
- r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ r + llvm::utostr(RegNum) + Colon + llvm::utostr(RegNum - 1);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
} else { // Even mapped sat:raw:lo
TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
std::string Name =
- r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ r + llvm::utostr(RegNum + 1) + Colon + llvm::utostr(RegNum);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
}
// Registers are in different positions
TmpInst.addOperand(Rxx);
@@ -2075,15 +2082,15 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi
Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
std::string Name =
- r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ r + llvm::utostr(RegNum) + Colon + llvm::utostr(RegNum - 1);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
} else { // Even mapped rnd:sat:raw:lo
Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
std::string Name =
- r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ r + llvm::utostr(RegNum + 1) + Colon + llvm::utostr(RegNum);
StringRef RegPair = Name;
- Rt.setReg(MatchRegisterName(RegPair));
+ Rt.setReg(matchRegister(RegPair));
}
break;
}
@@ -2097,8 +2104,10 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (Value == 0)
Inst.setOpcode(Hexagon::S2_vsathub);
else {
- Imm.setExpr(MCBinaryExpr::createSub(
- Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
}
break;
@@ -2115,20 +2124,22 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (Value == 0) {
MCInst TmpInst;
unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
- std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ std::string R1 = r + llvm::utostr(RegPairNum + 1);
StringRef Reg1(R1);
- Rss.setReg(MatchRegisterName(Reg1));
+ Rss.setReg(matchRegister(Reg1));
// Add a new operand for the second register in the pair.
- std::string R2 = r + llvm::utostr_32(RegPairNum);
+ std::string R2 = r + llvm::utostr(RegPairNum);
StringRef Reg2(R2);
TmpInst.setOpcode(Hexagon::A2_combinew);
TmpInst.addOperand(Rdd);
TmpInst.addOperand(Rss);
- TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
Inst = TmpInst;
} else {
- Imm.setExpr(MCBinaryExpr::createSub(
- Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Imm.setExpr(HexagonMCExpr::create(
+ MCBinaryExpr::createSub(Imm.getExpr(),
+ MCConstantExpr::create(1, Context), Context),
+ Context));
Inst.setOpcode(Hexagon::S5_vasrhrnd);
}
break;
@@ -2140,8 +2151,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Rs = Inst.getOperand(1);
TmpInst.setOpcode(Hexagon::A2_subri);
TmpInst.addOperand(Rd);
- TmpInst.addOperand(
- MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ TmpInst.addOperand(MCOperand::createExpr(
+ HexagonMCExpr::create(MCConstantExpr::create(-1, Context), Context)));
TmpInst.addOperand(Rs);
Inst = TmpInst;
break;
@@ -2150,3 +2161,10 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
return Match_Success;
}
+
+
+unsigned HexagonAsmParser::matchRegister(StringRef Name) {
+ if (unsigned Reg = MatchRegisterName(Name))
+ return Reg;
+ return MatchRegisterAltName(Name);
+}
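
The HexagonAsmParser.cpp changes above move extension and sign-mismatch state off the operand (the removed Imm.MustExtend field and mustExtend() helper) and onto the HexagonMCExpr wrapper via setMustExtend/setMustNotExtend/setSignMismatch, so canonicalizeImmediates no longer has to smuggle the mismatch through bit 33 of the immediate. A minimal standalone sketch of the underlying check, with sign_extend32 standing in for llvm::SignExtend64(V, 32):

#include <cstdint>
#include <cstdio>

// Keep the low 32 bits and replicate bit 31 into the upper half,
// mirroring what SignExtend64(Value, 32) does in the patch above.
static int64_t sign_extend32(uint64_t v) {
  return (int64_t)(int32_t)(v & 0xffffffffu);
}

// True when the value changes sign after 32-bit truncation, i.e. it is a
// valid unsigned 32-bit immediate but not a signed one (or vice versa).
static bool signMismatch(int64_t value) {
  int64_t extended = sign_extend32((uint64_t)value);
  return (extended < 0) != (value < 0);
}

int main() {
  std::printf("%d\n", signMismatch(0x80000000LL)); // 1: u32-only immediate
  std::printf("%d\n", signMismatch(42));           // 0
  std::printf("%d\n", signMismatch(-7));           // 0
  return 0;
}
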
diff --git a/lib/Target/Hexagon/AsmParser/Makefile b/lib/Target/Hexagon/AsmParser/Makefile
deleted file mode 100644
index 0aa0b4140c3e..000000000000
--- a/lib/Target/Hexagon/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Hexagon/AsmParser/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMHexagonAsmParser
-
-# Hack: we need to include 'main' Hexagon target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index ea96eb0ee10a..d052a835fbd8 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -84,87 +84,89 @@ namespace {
}
}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::BitValue &BV) {
- switch (BV.Type) {
- case BT::BitValue::Top:
- OS << 'T';
- break;
- case BT::BitValue::Zero:
- OS << '0';
- break;
- case BT::BitValue::One:
- OS << '1';
- break;
- case BT::BitValue::Ref:
- OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']';
- break;
+namespace llvm {
+ raw_ostream &operator<<(raw_ostream &OS, const BT::BitValue &BV) {
+ switch (BV.Type) {
+ case BT::BitValue::Top:
+ OS << 'T';
+ break;
+ case BT::BitValue::Zero:
+ OS << '0';
+ break;
+ case BT::BitValue::One:
+ OS << '1';
+ break;
+ case BT::BitValue::Ref:
+ OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']';
+ break;
+ }
+ return OS;
}
- return OS;
-}
-raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::RegisterCell &RC) {
- unsigned n = RC.Bits.size();
- OS << "{ w:" << n;
- // Instead of printing each bit value individually, try to group them
- // into logical segments, such as sequences of 0 or 1 bits or references
- // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz").
- // "Start" will be the index of the beginning of the most recent segment.
- unsigned Start = 0;
- bool SeqRef = false; // A sequence of refs to consecutive bits.
- bool ConstRef = false; // A sequence of refs to the same bit.
-
- for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) {
- const BT::BitValue &V = RC[i];
- const BT::BitValue &SV = RC[Start];
- bool IsRef = (V.Type == BT::BitValue::Ref);
- // If the current value is the same as Start, skip to the next one.
- if (!IsRef && V == SV)
- continue;
- if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) {
- if (Start+1 == i) {
- SeqRef = (V.RefI.Pos == SV.RefI.Pos+1);
- ConstRef = (V.RefI.Pos == SV.RefI.Pos);
- }
- if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start))
- continue;
- if (ConstRef && V.RefI.Pos == SV.RefI.Pos)
+ raw_ostream &operator<<(raw_ostream &OS, const BT::RegisterCell &RC) {
+ unsigned n = RC.Bits.size();
+ OS << "{ w:" << n;
+ // Instead of printing each bit value individually, try to group them
+ // into logical segments, such as sequences of 0 or 1 bits or references
+ // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz").
+ // "Start" will be the index of the beginning of the most recent segment.
+ unsigned Start = 0;
+ bool SeqRef = false; // A sequence of refs to consecutive bits.
+ bool ConstRef = false; // A sequence of refs to the same bit.
+
+ for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) {
+ const BT::BitValue &V = RC[i];
+ const BT::BitValue &SV = RC[Start];
+ bool IsRef = (V.Type == BT::BitValue::Ref);
+ // If the current value is the same as Start, skip to the next one.
+ if (!IsRef && V == SV)
continue;
+ if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) {
+ if (Start+1 == i) {
+ SeqRef = (V.RefI.Pos == SV.RefI.Pos+1);
+ ConstRef = (V.RefI.Pos == SV.RefI.Pos);
+ }
+ if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start))
+ continue;
+ if (ConstRef && V.RefI.Pos == SV.RefI.Pos)
+ continue;
+ }
+
+ // The current value is different. Print the previous one and reset
+ // the Start.
+ OS << " [" << Start;
+ unsigned Count = i - Start;
+ if (Count == 1) {
+ OS << "]:" << SV;
+ } else {
+ OS << '-' << i-1 << "]:";
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ Start = i;
+ SeqRef = ConstRef = false;
}
- // The current value is different. Print the previous one and reset
- // the Start.
OS << " [" << Start;
- unsigned Count = i - Start;
- if (Count == 1) {
- OS << "]:" << SV;
+ unsigned Count = n - Start;
+ if (n-Start == 1) {
+ OS << "]:" << RC[Start];
} else {
- OS << '-' << i-1 << "]:";
+ OS << '-' << n-1 << "]:";
+ const BT::BitValue &SV = RC[Start];
if (SV.Type == BT::BitValue::Ref && SeqRef)
OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
<< SV.RefI.Pos+(Count-1) << ']';
else
OS << SV;
}
- Start = i;
- SeqRef = ConstRef = false;
- }
+ OS << " }";
- OS << " [" << Start;
- unsigned Count = n - Start;
- if (n-Start == 1) {
- OS << "]:" << RC[Start];
- } else {
- OS << '-' << n-1 << "]:";
- const BT::BitValue &SV = RC[Start];
- if (SV.Type == BT::BitValue::Ref && SeqRef)
- OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
- << SV.RefI.Pos+(Count-1) << ']';
- else
- OS << SV;
+ return OS;
}
- OS << " }";
-
- return OS;
}
BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
@@ -420,7 +422,7 @@ BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
- APInt A = CI->getValue();
+ const APInt &A = CI->getValue();
uint16_t BW = A.getBitWidth();
assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
RegisterCell Res(BW);
@@ -731,18 +733,18 @@ BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
return BitMask(0, W-1);
}
-
-bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
- const CellMapType &Inputs, CellMapType &Outputs) const {
- unsigned Opc = MI->getOpcode();
+bool BT::MachineEvaluator::evaluate(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
case TargetOpcode::REG_SEQUENCE: {
- RegisterRef RD = MI->getOperand(0);
+ RegisterRef RD = MI.getOperand(0);
assert(RD.Sub == 0);
- RegisterRef RS = MI->getOperand(1);
- unsigned SS = MI->getOperand(2).getImm();
- RegisterRef RT = MI->getOperand(3);
- unsigned ST = MI->getOperand(4).getImm();
+ RegisterRef RS = MI.getOperand(1);
+ unsigned SS = MI.getOperand(2).getImm();
+ RegisterRef RT = MI.getOperand(3);
+ unsigned ST = MI.getOperand(4).getImm();
assert(SS != ST);
uint16_t W = getRegBitWidth(RD);
@@ -756,8 +758,8 @@ bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
case TargetOpcode::COPY: {
// COPY can transfer a smaller register into a wider one.
// If that is the case, fill the remaining high bits with 0.
- RegisterRef RD = MI->getOperand(0);
- RegisterRef RS = MI->getOperand(1);
+ RegisterRef RD = MI.getOperand(0);
+ RegisterRef RS = MI.getOperand(1);
assert(RD.Sub == 0);
uint16_t WD = getRegBitWidth(RD);
uint16_t WS = getRegBitWidth(RS);
@@ -780,12 +782,12 @@ bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
// Main W-Z implementation.
-void BT::visitPHI(const MachineInstr *PI) {
- int ThisN = PI->getParent()->getNumber();
+void BT::visitPHI(const MachineInstr &PI) {
+ int ThisN = PI.getParent()->getNumber();
if (Trace)
- dbgs() << "Visit FI(BB#" << ThisN << "): " << *PI;
+ dbgs() << "Visit FI(BB#" << ThisN << "): " << PI;
- const MachineOperand &MD = PI->getOperand(0);
+ const MachineOperand &MD = PI.getOperand(0);
assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
RegisterRef DefRR(MD);
uint16_t DefBW = ME.getRegBitWidth(DefRR);
@@ -796,8 +798,8 @@ void BT::visitPHI(const MachineInstr *PI) {
bool Changed = false;
- for (unsigned i = 1, n = PI->getNumOperands(); i < n; i += 2) {
- const MachineBasicBlock *PB = PI->getOperand(i+1).getMBB();
+ for (unsigned i = 1, n = PI.getNumOperands(); i < n; i += 2) {
+ const MachineBasicBlock *PB = PI.getOperand(i + 1).getMBB();
int PredN = PB->getNumber();
if (Trace)
dbgs() << " edge BB#" << PredN << "->BB#" << ThisN;
@@ -807,7 +809,7 @@ void BT::visitPHI(const MachineInstr *PI) {
continue;
}
- RegisterRef RU = PI->getOperand(i);
+ RegisterRef RU = PI.getOperand(i);
RegisterCell ResC = ME.getCell(RU, Map);
if (Trace)
dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
@@ -824,22 +826,21 @@ void BT::visitPHI(const MachineInstr *PI) {
}
}
-
-void BT::visitNonBranch(const MachineInstr *MI) {
+void BT::visitNonBranch(const MachineInstr &MI) {
if (Trace) {
- int ThisN = MI->getParent()->getNumber();
- dbgs() << "Visit MI(BB#" << ThisN << "): " << *MI;
+ int ThisN = MI.getParent()->getNumber();
+ dbgs() << "Visit MI(BB#" << ThisN << "): " << MI;
}
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return;
- assert(!MI->isBranch() && "Unexpected branch instruction");
+ assert(!MI.isBranch() && "Unexpected branch instruction");
CellMapType ResMap;
bool Eval = ME.evaluate(MI, Map, ResMap);
if (Trace && Eval) {
- for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
RegisterRef RU(MO);
@@ -857,8 +858,8 @@ void BT::visitNonBranch(const MachineInstr *MI) {
// Iterate over all definitions of the instruction, and update the
// cells accordingly.
- for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
// Visit register defs only.
if (!MO.isReg() || !MO.isDef())
continue;
@@ -905,9 +906,8 @@ void BT::visitNonBranch(const MachineInstr *MI) {
}
}
-
-void BT::visitBranchesFrom(const MachineInstr *BI) {
- const MachineBasicBlock &B = *BI->getParent();
+void BT::visitBranchesFrom(const MachineInstr &BI) {
+ const MachineBasicBlock &B = *BI.getParent();
MachineBasicBlock::const_iterator It = BI, End = B.end();
BranchTargetList Targets, BTs;
bool FallsThrough = true, DefaultToAll = false;
@@ -915,11 +915,11 @@ void BT::visitBranchesFrom(const MachineInstr *BI) {
do {
BTs.clear();
- const MachineInstr *MI = &*It;
+ const MachineInstr &MI = *It;
if (Trace)
- dbgs() << "Visit BR(BB#" << ThisN << "): " << *MI;
- assert(MI->isBranch() && "Expecting branch instruction");
- InstrExec.insert(MI);
+ dbgs() << "Visit BR(BB#" << ThisN << "): " << MI;
+ assert(MI.isBranch() && "Expecting branch instruction");
+ InstrExec.insert(&MI);
bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough);
if (!Eval) {
// If the evaluation failed, we will add all targets. Keep going in
@@ -983,11 +983,11 @@ void BT::visitUsesOf(unsigned Reg) {
if (!InstrExec.count(UseI))
continue;
if (UseI->isPHI())
- visitPHI(UseI);
+ visitPHI(*UseI);
else if (!UseI->isBranch())
- visitNonBranch(UseI);
+ visitNonBranch(*UseI);
else
- visitBranchesFrom(UseI);
+ visitBranchesFrom(*UseI);
}
}
@@ -1084,8 +1084,8 @@ void BT::run() {
MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
// Visit PHI nodes first.
while (It != End && It->isPHI()) {
- const MachineInstr *PI = &*It++;
- InstrExec.insert(PI);
+ const MachineInstr &PI = *It++;
+ InstrExec.insert(&PI);
visitPHI(PI);
}
@@ -1098,8 +1098,8 @@ void BT::run() {
// Visit non-branch instructions.
while (It != End && !It->isBranch()) {
- const MachineInstr *MI = &*It++;
- InstrExec.insert(MI);
+ const MachineInstr &MI = *It++;
+ InstrExec.insert(&MI);
visitNonBranch(MI);
}
// If block end has been reached, add the fall-through edge to the queue.
@@ -1114,7 +1114,7 @@ void BT::run() {
} else {
// Handle the remaining sequence of branches. This function will update
// the work queue.
- visitBranchesFrom(It);
+ visitBranchesFrom(*It);
}
} // while (!FlowQ->empty())
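
Most of the BitTracker.cpp hunk above is a re-indentation from wrapping the two operator<< definitions in namespace llvm { }; the printing logic is unchanged: runs of equal bit values (or references to consecutive bits) are grouped into [start-end]:value segments rather than printed bit by bit. A simplified standalone model of that grouping, ignoring the Ref case and using plain chars for bit values (the helper name is illustrative, not LLVM API):

#include <cstdio>
#include <string>
#include <vector>

// Print a cell like "{ w:8 [0-3]:0 [4]:1 [5-7]:T }", grouping equal runs,
// in the spirit of BT::RegisterCell's operator<< (Ref segments omitted).
static std::string formatCell(const std::vector<char> &bits) {
  std::string out = "{ w:" + std::to_string(bits.size());
  size_t start = 0;
  for (size_t i = 1; i <= bits.size(); ++i) {
    if (i < bits.size() && bits[i] == bits[start])
      continue;                         // still in the same run
    out += " [" + std::to_string(start);
    if (i - start > 1)
      out += "-" + std::to_string(i - 1);
    out += "]:";
    out += bits[start];
    start = i;                          // begin a new run
  }
  return out + " }";
}

int main() {
  std::vector<char> cell = {'0','0','0','0','1','T','T','T'};
  std::puts(formatCell(cell).c_str());  // { w:8 [0-3]:0 [4]:1 [5-7]:T }
  return 0;
}
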
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 959c8318fd60..5b925fe696f8 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -51,9 +51,9 @@ struct BitTracker {
bool reached(const MachineBasicBlock *B) const;
private:
- void visitPHI(const MachineInstr *PI);
- void visitNonBranch(const MachineInstr *MI);
- void visitBranchesFrom(const MachineInstr *BI);
+ void visitPHI(const MachineInstr &PI);
+ void visitNonBranch(const MachineInstr &MI);
+ void visitBranchesFrom(const MachineInstr &BI);
void visitUsesOf(unsigned Reg);
void reset();
@@ -417,13 +417,13 @@ struct BitTracker::MachineEvaluator {
// Evaluate a non-branching machine instruction, given the cell map with
// the input values. Place the results in the Outputs map. Return "true"
// if evaluation succeeded, "false" otherwise.
- virtual bool evaluate(const MachineInstr *MI, const CellMapType &Inputs,
+ virtual bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
CellMapType &Outputs) const;
// Evaluate a branch, given the cell map with the input values. Fill out
// a list of all possible branch targets and indicate (through a flag)
// whether the branch could fall-through. Return "true" if this information
// has been successfully computed, "false" otherwise.
- virtual bool evaluate(const MachineInstr *BI, const CellMapType &Inputs,
+ virtual bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
BranchTargetList &Targets, bool &FallsThru) const = 0;
const TargetRegisterInfo &TRI;
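
The interface change in BitTracker.h matches the .cpp above: visitPHI/visitNonBranch/visitBranchesFrom and MachineEvaluator::evaluate now take const MachineInstr& rather than a pointer, since a null instruction was never a valid input; call sites dereference at the call (visitPHI(*UseI)) and only take an address where a pointer is genuinely needed as a set key (InstrExec.insert(&MI)). A small sketch of that call-site shape, with a hypothetical Instr type standing in for MachineInstr:

#include <cstdio>
#include <set>

struct Instr { int id; bool isPHI; };   // stand-in for MachineInstr

// Reference parameters document that a valid instruction is required.
static void visitPHI(const Instr &pi) { std::printf("phi %d\n", pi.id); }
static void visitNonBranch(const Instr &mi) { std::printf("insn %d\n", mi.id); }

int main() {
  Instr a{0, true}, b{1, false};
  std::set<const Instr *> executed;     // pointers remain fine as set keys
  for (const Instr *i : {&a, &b}) {
    executed.insert(i);                 // take the address only for identity
    if (i->isPHI)
      visitPHI(*i);                     // dereference at the call site
    else
      visitNonBranch(*i);
  }
  return 0;
}
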
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 333ca6a757aa..0e32f25f52b7 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -17,12 +17,13 @@ add_llvm_target(HexagonCodeGen
HexagonAsmPrinter.cpp
HexagonBitSimplify.cpp
HexagonBitTracker.cpp
+ HexagonBlockRanges.cpp
+ HexagonBranchRelaxation.cpp
HexagonCFGOptimizer.cpp
HexagonCommonGEP.cpp
HexagonCopyToCombine.cpp
HexagonEarlyIfConv.cpp
HexagonExpandCondsets.cpp
- HexagonExpandPredSpillCode.cpp
HexagonFixupHwLoops.cpp
HexagonFrameLowering.cpp
HexagonGenExtract.cpp
@@ -37,6 +38,7 @@ add_llvm_target(HexagonCodeGen
HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
HexagonNewValueJump.cpp
+ HexagonOptAddrMode.cpp
HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
HexagonRDF.cpp
@@ -55,7 +57,7 @@ add_llvm_target(HexagonCodeGen
RDFDeadCode.cpp
RDFGraph.cpp
RDFLiveness.cpp
-)
+ )
add_subdirectory(AsmParser)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 4a9c3413cb29..7bc08ecfcab6 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -16,7 +16,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -30,7 +30,6 @@
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
-#include <vector>
using namespace llvm;
using namespace Hexagon;
@@ -382,7 +381,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0;
else if (SubregBit)
- // Subreg bit should not be set for non-doublevector newvalue producers
+ // Hexagon PRM 10.11 New-value operands
+ // Nt[0] is reserved and should always be encoded as zero.
return MCDisassembler::Fail;
assert(Producer != Hexagon::NoRegister);
MCO.setReg(Producer);
@@ -1459,6 +1459,7 @@ void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
+ break;
case Hexagon::V4_SA1_and1:
case Hexagon::V4_SA1_dec:
case Hexagon::V4_SA1_inc:
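
Two small fixes land in the disassembler above: a missing break at the end of a sub-instruction operand case, and the documented rule (Hexagon PRM 10.11) that the new-value sub-register bit is reserved-zero unless the producer is a double-vector pair W0..W15, in which case the pair index and the bit together select one of the two covered V registers. A standalone model of that decode, with symbolic stand-ins for the register enum values (these numbers are assumptions, not the real enum encoding):

#include <cstdio>

// Symbolic stand-ins for the register enums used by the disassembler.
enum { V0 = 100, W0 = 200, W15 = 215, NoRegister = -1 };

// Decode a new-value producer: a W pair plus the subreg bit picks one of the
// two V registers it covers; for anything else the subreg bit must be zero.
static int decodeProducer(int producer, int subregBit) {
  if (producer >= W0 && producer <= W15)
    return ((producer - W0) << 1) + subregBit + V0;
  if (subregBit)
    return NoRegister;   // reserved encoding -> decode failure
  return producer;
}

int main() {
  std::printf("%d\n", decodeProducer(W0 + 3, 0)); // low half of W3  -> 106
  std::printf("%d\n", decodeProducer(W0 + 3, 1)); // high half of W3 -> 107
  std::printf("%d\n", decodeProducer(42, 1));     // reserved -> -1
  return 0;
}
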
diff --git a/lib/Target/Hexagon/Disassembler/Makefile b/lib/Target/Hexagon/Disassembler/Makefile
deleted file mode 100644
index 16c305fe4074..000000000000
--- a/lib/Target/Hexagon/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===-- lib/Target/Hexagon/Disassembler/Makefile -----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMHexagonDisassembler
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 5a7eb215de42..aaa0f3e9b3d3 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -47,7 +47,6 @@ def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
def UseHVXDbl : Predicate<"HST->useHVXDblOps()">,
AssemblerPredicate<"ExtensionHVXDbl">;
def UseHVXSgl : Predicate<"HST->useHVXSglOps()">;
-
def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">,
AssemblerPredicate<"ExtensionHVX">;
@@ -171,6 +170,15 @@ def getBaseWithImmOffset : InstrMapping {
let ValueCols = [["BaseImmOffset"]];
}
+def getAbsoluteForm : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+ "isFloat"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseImmOffset"];
+ let ValueCols = [["Absolute"]];
+}
+
def getBaseWithRegOffset : InstrMapping {
let FilterClass = "AddrModeRel";
let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
@@ -179,6 +187,22 @@ def getBaseWithRegOffset : InstrMapping {
let ValueCols = [["BaseRegOffset"]];
}
+def xformRegToImmOffset : InstrMapping {
+ let FilterClass = "AddrModeRel";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseRegOffset"];
+ let ValueCols = [["BaseImmOffset"]];
+}
+
+def getBaseWithLongOffset : InstrMapping {
+ let FilterClass = "ImmRegShl";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["addrMode"];
+ let KeyCol = ["BaseRegOffset"];
+ let ValueCols = [["BaseLongOffset"]];
+}
+
def getRegForm : InstrMapping {
let FilterClass = "ImmRegRel";
let RowFields = ["CextOpcode", "PredSense", "PNewValue"];
@@ -252,6 +276,7 @@ def : Proc<"hexagonv60", HexagonModelV60,
//===----------------------------------------------------------------------===//
def HexagonAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterAltName = 1;
bit HasMnemonicFirst = 0;
}
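
The new InstrMapping records in Hexagon.td above (getAbsoluteForm, xformRegToImmOffset, getBaseWithLongOffset) each define a relation over instructions that agree on the listed RowFields; the addrMode column's KeyCol names the source form and ValueCols the related form the generated tables should return. A rough conceptual model of such a relation as a keyed lookup, using made-up opcode strings and a hypothetical map rather than the TableGen-generated API:

#include <cstdio>
#include <map>
#include <string>
#include <tuple>
#include <utility>

// Row key: the fields that identify "the same instruction" across forms.
struct RowKey {
  std::string cextOpcode, predSense, pnewValue;
  bool isNVStore;
  bool operator<(const RowKey &o) const {
    return std::tie(cextOpcode, predSense, pnewValue, isNVStore) <
           std::tie(o.cextOpcode, o.predSense, o.pnewValue, o.isNVStore);
  }
};

int main() {
  // Hypothetical relation: (row key, addrMode column) -> opcode name.
  std::map<std::pair<RowKey, std::string>, std::string> rel;
  RowKey ldri{"LDri", "", "", false};
  rel[{ldri, "BaseImmOffset"}] = "LoadImmOffsetForm";   // illustrative names
  rel[{ldri, "Absolute"}]      = "LoadAbsoluteForm";

  // "getAbsoluteForm": same row, KeyCol BaseImmOffset -> ValueCol Absolute.
  auto it = rel.find({ldri, "Absolute"});
  std::printf("%s\n", it != rel.end() ? it->second.c_str() : "<none>");
  return 0;
}
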
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 4c7c0392a132..cd954a146104 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -21,8 +21,6 @@
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -44,7 +42,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/Format.h"
@@ -264,6 +261,19 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
switch (Inst.getOpcode()) {
default: return;
+ case Hexagon::A2_iconst: {
+ Inst.setOpcode(Hexagon::A2_addi);
+ MCOperand Reg = Inst.getOperand(0);
+ MCOperand S16 = Inst.getOperand(1);
+ HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+ HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ Inst.clear();
+ Inst.addOperand(Reg);
+ Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+ Inst.addOperand(S16);
+ break;
+ }
+
// "$dst = CONST64(#$src1)",
case Hexagon::CONST64_Float_Real:
case Hexagon::CONST64_Int_Real:
@@ -297,8 +307,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MCOperand &Reg = MappedInst.getOperand(0);
TmpInst.setOpcode(Hexagon::L2_loadrigp);
TmpInst.addOperand(Reg);
- TmpInst.addOperand(MCOperand::createExpr(
- MCSymbolRefExpr::create(Sym, OutContext)));
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCSymbolRefExpr::create(Sym, OutContext), OutContext)));
MappedInst = TmpInst;
}
break;
@@ -367,7 +377,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
int64_t Imm;
MCExpr const *Expr = MO.getExpr();
bool Success = Expr->evaluateAsAbsolute(Imm);
- assert (Success && "Expected immediate and none was found");(void)Success;
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
MCInst TmpInst;
if (Imm == 0) {
TmpInst.setOpcode(Hexagon::S2_vsathub);
@@ -381,7 +392,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
TmpInst.addOperand(MappedInst.getOperand(1));
const MCExpr *One = MCConstantExpr::create(1, OutContext);
const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(Sub));
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
MappedInst = TmpInst;
return;
}
@@ -391,7 +403,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MCExpr const *Expr = MO2.getExpr();
int64_t Imm;
bool Success = Expr->evaluateAsAbsolute(Imm);
- assert (Success && "Expected immediate and none was found");(void)Success;
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
MCInst TmpInst;
if (Imm == 0) {
TmpInst.setOpcode(Hexagon::A2_combinew);
@@ -414,7 +427,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
TmpInst.addOperand(MappedInst.getOperand(1));
const MCExpr *One = MCConstantExpr::create(1, OutContext);
const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(Sub));
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
MappedInst = TmpInst;
return;
}
@@ -424,7 +438,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MCExpr const *Expr = MO.getExpr();
int64_t Imm;
bool Success = Expr->evaluateAsAbsolute(Imm);
- assert (Success && "Expected immediate and none was found");(void)Success;
+ assert (Success && "Expected immediate and none was found");
+ (void)Success;
MCInst TmpInst;
if (Imm == 0) {
TmpInst.setOpcode(Hexagon::A2_tfr);
@@ -438,7 +453,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
TmpInst.addOperand(MappedInst.getOperand(1));
const MCExpr *One = MCConstantExpr::create(1, OutContext);
const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(Sub));
+ TmpInst.addOperand(
+ MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
MappedInst = TmpInst;
return;
}
@@ -470,10 +486,10 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
if (Success && Imm < 0) {
const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(MOne));
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(MOne, OutContext)));
} else {
const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
- TmpInst.addOperand(MCOperand::createExpr(Zero));
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(Zero, OutContext)));
}
TmpInst.addOperand(MO);
MappedInst = TmpInst;
@@ -523,12 +539,13 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MCExpr const *Expr = Imm.getExpr();
int64_t Value;
bool Success = Expr->evaluateAsAbsolute(Value);
- assert(Success);(void)Success;
+ assert(Success);
+ (void)Success;
if (Value < 0 && Value > -256) {
MappedInst.setOpcode(Hexagon::M2_mpysin);
- Imm.setExpr(MCUnaryExpr::createMinus(Expr, OutContext));
- }
- else
+ Imm.setExpr(HexagonMCExpr::create(
+ MCUnaryExpr::createMinus(Expr, OutContext), OutContext));
+ } else
MappedInst.setOpcode(Hexagon::M2_mpysip);
return;
}
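The assert/(void)Success split applied throughout this hunk follows a common C++ idiom: in release builds NDEBUG removes the assert, and the cast to void keeps the now-unused flag from triggering -Wunused-variable. A minimal standalone sketch of the idiom (plain C++, not LLVM code; parseDigit is a made-up helper):

#include <cassert>
#include <cstdio>

// Parse a decimal digit; returns true on success and writes the value.
static bool parseDigit(char C, int &Out) {
  if (C < '0' || C > '9')
    return false;
  Out = C - '0';
  return true;
}

int main() {
  int Value = 0;
  bool Success = parseDigit('7', Value);
  assert(Success && "Expected a digit and none was found");
  (void)Success; // Avoid -Wunused-variable when NDEBUG strips the assert.
  std::printf("%d\n", Value);
  return 0;
}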
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 4d2b54521e83..c8b4a4cf9382 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -9,18 +9,17 @@
#define DEBUG_TYPE "hexbit"
-#include "llvm/CodeGen/Passes.h"
+#include "HexagonBitTracker.h"
+#include "HexagonTargetMachine.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonBitTracker.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -159,8 +158,6 @@ namespace {
static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses);
static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1,
const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W);
- static bool isConst(const BitTracker::RegisterCell &RC, uint16_t B,
- uint16_t W);
static bool isZero(const BitTracker::RegisterCell &RC, uint16_t B,
uint16_t W);
static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B,
@@ -284,17 +281,6 @@ bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1,
return true;
}
-
-bool HexagonBitSimplify::isConst(const BitTracker::RegisterCell &RC,
- uint16_t B, uint16_t W) {
- assert(B < RC.width() && B+W <= RC.width());
- for (uint16_t i = B; i < B+W; ++i)
- if (!RC[i].num())
- return false;
- return true;
-}
-
-
bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC,
uint16_t B, uint16_t W) {
assert(B < RC.width() && B+W <= RC.width());
@@ -876,6 +862,12 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
case Hexagon::DoubleRegsRegClassID:
VerifySR(RR.Sub);
return &Hexagon::IntRegsRegClass;
+ case Hexagon::VecDblRegsRegClassID:
+ VerifySR(RR.Sub);
+ return &Hexagon::VectorRegsRegClass;
+ case Hexagon::VecDblRegs128BRegClassID:
+ VerifySR(RR.Sub);
+ return &Hexagon::VectorRegs128BRegClass;
}
return nullptr;
}
@@ -1297,7 +1289,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
continue;
// If found, replace the instruction with a COPY.
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
unsigned NewR = MRI.createVirtualRegister(FRC);
BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
@@ -1326,7 +1318,7 @@ namespace {
: Transformation(true), HII(hii), MRI(mri), BT(bt) {}
bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
private:
- bool isTfrConst(const MachineInstr *MI) const;
+ bool isTfrConst(const MachineInstr &MI) const;
bool isConst(unsigned R, int64_t &V) const;
unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C,
MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL);
@@ -1354,9 +1346,8 @@ bool ConstGeneration::isConst(unsigned R, int64_t &C) const {
return true;
}
-
-bool ConstGeneration::isTfrConst(const MachineInstr *MI) const {
- unsigned Opc = MI->getOpcode();
+bool ConstGeneration::isTfrConst(const MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
case Hexagon::A2_combineii:
case Hexagon::A4_combineii:
@@ -1426,7 +1417,7 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
RegisterSet Defs;
for (auto I = B.begin(), E = B.end(); I != E; ++I) {
- if (isTfrConst(I))
+ if (isTfrConst(*I))
continue;
Defs.clear();
HBS::getInstrDefs(*I, Defs);
@@ -1960,11 +1951,10 @@ bool BitSimplification::genExtractHalf(MachineInstr *MI,
NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR)
.addReg(L.Reg, 0, L.Sub);
- } else if (!L.Low && Opc != Hexagon::S2_extractu) {
+ } else if (!L.Low && Opc != Hexagon::S2_lsr_i_r) {
NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- BuildMI(B, MI, DL, HII.get(Hexagon::S2_extractu), NewR)
+ BuildMI(B, MI, DL, HII.get(Hexagon::S2_lsr_i_r), NewR)
.addReg(L.Reg, 0, L.Sub)
- .addImm(16)
.addImm(16);
}
if (NewR == 0)
@@ -2187,6 +2177,9 @@ bool BitSimplification::processBlock(MachineBasicBlock &B,
bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HRI = *HST.getRegisterInfo();
auto &HII = *HST.getInstrInfo();
@@ -2729,6 +2722,9 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
auto &HST = MF.getSubtarget<HexagonSubtarget>();
HII = HST.getInstrInfo();
HRI = HST.getRegisterInfo();
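The genExtractHalf change above swaps an unsigned bit-field extract (16 bits at offset 16) for a plain logical shift right by 16, which is equivalent when the field is the upper half of a 32-bit register. A standalone check of that equivalence (the helper and its operand order are illustrative, not the Hexagon instruction semantics):

#include <cassert>
#include <cstdint>

// Extract W bits starting at bit B, zero-extended.
static uint32_t extractu(uint32_t X, unsigned B, unsigned W) {
  return (X >> B) & ((W == 32) ? ~0u : ((1u << W) - 1));
}

int main() {
  for (uint32_t X : {0u, 0x12345678u, 0xFFFF0000u, 0xDEADBEEFu})
    // For the upper halfword of a 32-bit value, a 16-bit extract at
    // offset 16 is the same as a logical shift right by 16.
    assert(extractu(X, 16, 16) == (X >> 16));
  return 0;
}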
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index d5848dc45a3b..78b57d27ad50 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -102,9 +102,9 @@ class RegisterRefs {
std::vector<BT::RegisterRef> Vector;
public:
- RegisterRefs(const MachineInstr *MI) : Vector(MI->getNumOperands()) {
+ RegisterRefs(const MachineInstr &MI) : Vector(MI.getNumOperands()) {
for (unsigned i = 0, n = Vector.size(); i < n; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg())
Vector[i] = BT::RegisterRef(MO);
// For indices that don't correspond to registers, the entry will
@@ -121,13 +121,14 @@ public:
};
}
-bool HexagonEvaluator::evaluate(const MachineInstr *MI,
- const CellMapType &Inputs, CellMapType &Outputs) const {
+bool HexagonEvaluator::evaluate(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
unsigned NumDefs = 0;
// Sanity verification: there should not be any defs with subregisters.
- for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
NumDefs++;
@@ -137,7 +138,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
if (NumDefs == 0)
return false;
- if (MI->mayLoad())
+ if (MI.mayLoad())
return evaluateLoad(MI, Inputs, Outputs);
// Check COPY instructions that copy formal parameters into virtual
@@ -154,7 +155,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
// was not a COPY, it would not be clear how to mirror that extension
// on the callee's side. For that reason, only check COPY instructions
// for potential extensions.
- if (MI->isCopy()) {
+ if (MI.isCopy()) {
if (evaluateFormalCopy(MI, Inputs, Outputs))
return true;
}
@@ -165,19 +166,19 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
// checking what kind of operand a given instruction has individually
// for each instruction, do it here. Global symbols as operands gene-
// rally do not provide any useful information.
- for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() ||
MO.isCPI())
return false;
}
RegisterRefs Reg(MI);
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
using namespace Hexagon;
- #define op(i) MI->getOperand(i)
- #define rc(i) RegisterCell::ref(getCell(Reg[i],Inputs))
- #define im(i) MI->getOperand(i).getImm()
+#define op(i) MI.getOperand(i)
+#define rc(i) RegisterCell::ref(getCell(Reg[i], Inputs))
+#define im(i) MI.getOperand(i).getImm()
// If the instruction has no register operands, skip it.
if (Reg.size() == 0)
@@ -190,9 +191,9 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
return true;
};
// Get the cell corresponding to the N-th operand.
- auto cop = [this,&Reg,&MI,&Inputs] (unsigned N, uint16_t W)
- -> BT::RegisterCell {
- const MachineOperand &Op = MI->getOperand(N);
+ auto cop = [this, &Reg, &MI, &Inputs](unsigned N,
+ uint16_t W) -> BT::RegisterCell {
+ const MachineOperand &Op = MI.getOperand(N);
if (Op.isImm())
return eIMM(Op.getImm(), W);
if (!Op.isReg())
@@ -879,13 +880,13 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
return false;
}
-
-bool HexagonEvaluator::evaluate(const MachineInstr *BI,
- const CellMapType &Inputs, BranchTargetList &Targets,
- bool &FallsThru) const {
+bool HexagonEvaluator::evaluate(const MachineInstr &BI,
+ const CellMapType &Inputs,
+ BranchTargetList &Targets,
+ bool &FallsThru) const {
// We need to evaluate one branch at a time. TII::AnalyzeBranch checks
// all the branches in a basic block at once, so we cannot use it.
- unsigned Opc = BI->getOpcode();
+ unsigned Opc = BI.getOpcode();
bool SimpleBranch = false;
bool Negated = false;
switch (Opc) {
@@ -901,7 +902,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *BI,
SimpleBranch = true;
break;
case Hexagon::J2_jump:
- Targets.insert(BI->getOperand(0).getMBB());
+ Targets.insert(BI.getOperand(0).getMBB());
FallsThru = false;
return true;
default:
@@ -914,7 +915,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *BI,
return false;
// BI is a conditional branch if we got here.
- RegisterRef PR = BI->getOperand(0);
+ RegisterRef PR = BI.getOperand(0);
RegisterCell PC = getCell(PR, Inputs);
const BT::BitValue &Test = PC[0];
@@ -929,18 +930,18 @@ bool HexagonEvaluator::evaluate(const MachineInstr *BI,
return true;
}
- Targets.insert(BI->getOperand(1).getMBB());
+ Targets.insert(BI.getOperand(1).getMBB());
FallsThru = false;
return true;
}
-
-bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI,
- const CellMapType &Inputs, CellMapType &Outputs) const {
+bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
if (TII.isPredicated(MI))
return false;
- assert(MI->mayLoad() && "A load that mayn't?");
- unsigned Opc = MI->getOpcode();
+ assert(MI.mayLoad() && "A load that mayn't?");
+ unsigned Opc = MI.getOpcode();
uint16_t BitNum;
bool SignEx;
@@ -1067,7 +1068,7 @@ bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI,
break;
}
- const MachineOperand &MD = MI->getOperand(0);
+ const MachineOperand &MD = MI.getOperand(0);
assert(MD.isReg() && MD.isDef());
RegisterRef RD = MD;
@@ -1091,15 +1092,15 @@ bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI,
return true;
}
-
-bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr *MI,
- const CellMapType &Inputs, CellMapType &Outputs) const {
+bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
+ const CellMapType &Inputs,
+ CellMapType &Outputs) const {
// If MI defines a formal parameter, but is not a copy (loads are handled
// in evaluateLoad), then it's not clear what to do.
- assert(MI->isCopy());
+ assert(MI.isCopy());
- RegisterRef RD = MI->getOperand(0);
- RegisterRef RS = MI->getOperand(1);
+ RegisterRef RD = MI.getOperand(0);
+ RegisterRef RS = MI.getOperand(1);
assert(RD.Sub == 0);
if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg))
return false;
diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h
index 897af2d71870..9e7b1dbe298f 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.h
+++ b/lib/Target/Hexagon/HexagonBitTracker.h
@@ -26,9 +26,9 @@ struct HexagonEvaluator : public BitTracker::MachineEvaluator {
HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri,
const HexagonInstrInfo &tii, MachineFunction &mf);
- bool evaluate(const MachineInstr *MI, const CellMapType &Inputs,
+ bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
CellMapType &Outputs) const override;
- bool evaluate(const MachineInstr *BI, const CellMapType &Inputs,
+ bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
BranchTargetList &Targets, bool &FallsThru) const override;
BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override;
@@ -38,9 +38,9 @@ struct HexagonEvaluator : public BitTracker::MachineEvaluator {
const HexagonInstrInfo &TII;
private:
- bool evaluateLoad(const MachineInstr *MI, const CellMapType &Inputs,
+ bool evaluateLoad(const MachineInstr &MI, const CellMapType &Inputs,
CellMapType &Outputs) const;
- bool evaluateFormalCopy(const MachineInstr *MI, const CellMapType &Inputs,
+ bool evaluateFormalCopy(const MachineInstr &MI, const CellMapType &Inputs,
CellMapType &Outputs) const;
unsigned getNextPhysReg(unsigned PReg, unsigned Width) const;
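A pattern that recurs across this patch is migrating interfaces from const MachineInstr * to const MachineInstr &, which encodes the non-null guarantee in the type and pushes the dereference to the call site. A tiny standalone sketch of the same migration, using a hypothetical Instr type rather than the LLVM classes:

#include <cstdio>
#include <vector>

struct Instr { int Opcode; };

// Before: bool evaluate(const Instr *MI);  // caller could pass nullptr
// After:  the reference form makes non-null a compile-time property.
static bool evaluate(const Instr &MI) {
  return MI.Opcode != 0; // no null check needed
}

int main() {
  std::vector<Instr> Block = {{1}, {0}, {7}};
  for (const Instr &I : Block)         // call sites change from f(It) or f(&I)
    std::printf("%d\n", evaluate(I));  //   to f(*It) or f(I)
  return 0;
}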
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
new file mode 100644
index 000000000000..5c44029dc6e7
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -0,0 +1,483 @@
+//===--- HexagonBlockRanges.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hbr"
+
+#include "HexagonBlockRanges.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <map>
+
+using namespace llvm;
+
+bool HexagonBlockRanges::IndexRange::overlaps(const IndexRange &A) const {
+ // If A contains start(), or "this" contains A.start(), then overlap.
+ IndexType S = start(), E = end(), AS = A.start(), AE = A.end();
+ if (AS == S)
+ return true;
+ bool SbAE = (S < AE) || (S == AE && A.TiedEnd); // S-before-AE.
+ bool ASbE = (AS < E) || (AS == E && TiedEnd); // AS-before-E.
+ if ((AS < S && SbAE) || (S < AS && ASbE))
+ return true;
+ // Otherwise no overlap.
+ return false;
+}
+
+
+bool HexagonBlockRanges::IndexRange::contains(const IndexRange &A) const {
+ if (start() <= A.start()) {
+ // Treat "None" in the range end as equal to the range start.
+ IndexType E = (end() != IndexType::None) ? end() : start();
+ IndexType AE = (A.end() != IndexType::None) ? A.end() : A.start();
+ if (AE <= E)
+ return true;
+ }
+ return false;
+}
+
+
+void HexagonBlockRanges::IndexRange::merge(const IndexRange &A) {
+ // Allow merging adjacent ranges.
+ assert(end() == A.start() || overlaps(A));
+ IndexType AS = A.start(), AE = A.end();
+ if (AS < start() || start() == IndexType::None)
+ setStart(AS);
+ if (end() < AE || end() == IndexType::None) {
+ setEnd(AE);
+ TiedEnd = A.TiedEnd;
+ } else {
+ if (end() == AE)
+ TiedEnd |= A.TiedEnd;
+ }
+ if (A.Fixed)
+ Fixed = true;
+}
+
+
+void HexagonBlockRanges::RangeList::include(const RangeList &RL) {
+ for (auto &R : RL)
+ if (std::find(begin(), end(), R) == end())
+ push_back(R);
+}
+
+
+// Merge all overlapping ranges in the list, so that all that remains
+// is a list of disjoint ranges.
+void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
+ if (empty())
+ return;
+
+ std::sort(begin(), end());
+ iterator Iter = begin();
+
+ while (Iter != end()-1) {
+ iterator Next = std::next(Iter);
+ // If MergeAdjacent is true, merge ranges A and B, where A.end == B.start.
+ // This allows merging dead ranges, but is not valid for live ranges.
+ bool Merge = MergeAdjacent && (Iter->end() == Next->start());
+ if (Merge || Iter->overlaps(*Next)) {
+ Iter->merge(*Next);
+ erase(Next);
+ continue;
+ }
+ ++Iter;
+ }
+}
+
+
+// Compute a range A-B and add it to the list.
+void HexagonBlockRanges::RangeList::addsub(const IndexRange &A,
+ const IndexRange &B) {
+ // Exclusion of non-overlapping ranges makes some checks simpler
+ // later in this function.
+ if (!A.overlaps(B)) {
+ // A - B = A.
+ add(A);
+ return;
+ }
+
+ IndexType AS = A.start(), AE = A.end();
+ IndexType BS = B.start(), BE = B.end();
+
+ // If AE is None, then A is included in B, since A and B overlap.
+  // The result of the subtraction is empty, so just return.
+ if (AE == IndexType::None)
+ return;
+
+ if (AS < BS) {
+ // A starts before B.
+ // AE cannot be None since A and B overlap.
+ assert(AE != IndexType::None);
+ // Add the part of A that extends on the "less" side of B.
+ add(AS, BS, A.Fixed, false);
+ }
+
+ if (BE < AE) {
+ // BE cannot be Exit here.
+ if (BE == IndexType::None)
+ add(BS, AE, A.Fixed, false);
+ else
+ add(BE, AE, A.Fixed, false);
+ }
+}
+
+
+// Subtract a given range from each element in the list.
+void HexagonBlockRanges::RangeList::subtract(const IndexRange &Range) {
+  // Cannot assume that the list is unionized (i.e. that it contains only
+  // non-overlapping ranges).
+ RangeList T;
+ for (iterator Next, I = begin(); I != end(); I = Next) {
+ IndexRange &Rg = *I;
+ if (Rg.overlaps(Range)) {
+ T.addsub(Rg, Range);
+ Next = this->erase(I);
+ } else {
+ Next = std::next(I);
+ }
+ }
+ include(T);
+}
+
+
+HexagonBlockRanges::InstrIndexMap::InstrIndexMap(MachineBasicBlock &B)
+ : Block(B) {
+ IndexType Idx = IndexType::First;
+ First = Idx;
+ for (auto &In : B) {
+ if (In.isDebugValue())
+ continue;
+ assert(getIndex(&In) == IndexType::None && "Instruction already in map");
+ Map.insert(std::make_pair(Idx, &In));
+ ++Idx;
+ }
+ Last = B.empty() ? IndexType::None : unsigned(Idx)-1;
+}
+
+
+MachineInstr *HexagonBlockRanges::InstrIndexMap::getInstr(IndexType Idx) const {
+ auto F = Map.find(Idx);
+ return (F != Map.end()) ? F->second : 0;
+}
+
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getIndex(
+ MachineInstr *MI) const {
+ for (auto &I : Map)
+ if (I.second == MI)
+ return I.first;
+ return IndexType::None;
+}
+
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getPrevIndex(
+ IndexType Idx) const {
+ assert (Idx != IndexType::None);
+ if (Idx == IndexType::Entry)
+ return IndexType::None;
+ if (Idx == IndexType::Exit)
+ return Last;
+ if (Idx == First)
+ return IndexType::Entry;
+ return unsigned(Idx)-1;
+}
+
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getNextIndex(
+ IndexType Idx) const {
+ assert (Idx != IndexType::None);
+ if (Idx == IndexType::Entry)
+ return IndexType::First;
+ if (Idx == IndexType::Exit || Idx == Last)
+ return IndexType::None;
+ return unsigned(Idx)+1;
+}
+
+
+void HexagonBlockRanges::InstrIndexMap::replaceInstr(MachineInstr *OldMI,
+ MachineInstr *NewMI) {
+ for (auto &I : Map) {
+ if (I.second != OldMI)
+ continue;
+ if (NewMI != nullptr)
+ I.second = NewMI;
+ else
+ Map.erase(I.first);
+ break;
+ }
+}
+
+
+HexagonBlockRanges::HexagonBlockRanges(MachineFunction &mf)
+ : MF(mf), HST(mf.getSubtarget<HexagonSubtarget>()),
+ TII(*HST.getInstrInfo()), TRI(*HST.getRegisterInfo()),
+ Reserved(TRI.getReservedRegs(mf)) {
+ // Consider all non-allocatable registers as reserved.
+ for (auto I = TRI.regclass_begin(), E = TRI.regclass_end(); I != E; ++I) {
+ auto *RC = *I;
+ if (RC->isAllocatable())
+ continue;
+ for (unsigned R : *RC)
+ Reserved[R] = true;
+ }
+}
+
+
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::getLiveIns(
+ const MachineBasicBlock &B) {
+ RegisterSet LiveIns;
+ for (auto I : B.liveins())
+ if (!Reserved[I.PhysReg])
+ LiveIns.insert({I.PhysReg, 0});
+ return LiveIns;
+}
+
+
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs(
+ RegisterRef R, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ RegisterSet SRs;
+
+ if (R.Sub != 0) {
+ SRs.insert(R);
+ return SRs;
+ }
+
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) {
+ MCSubRegIterator I(R.Reg, &TRI);
+ if (!I.isValid())
+ SRs.insert({R.Reg, 0});
+ for (; I.isValid(); ++I)
+ SRs.insert({*I, 0});
+ } else {
+ assert(TargetRegisterInfo::isVirtualRegister(R.Reg));
+ auto &RC = *MRI.getRegClass(R.Reg);
+ unsigned PReg = *RC.begin();
+ MCSubRegIndexIterator I(PReg, &TRI);
+ if (!I.isValid())
+ SRs.insert({R.Reg, 0});
+ for (; I.isValid(); ++I)
+ SRs.insert({R.Reg, I.getSubRegIndex()});
+ }
+ return SRs;
+}
+
+
+void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
+ RegToRangeMap &LiveMap) {
+ std::map<RegisterRef,IndexType> LastDef, LastUse;
+ RegisterSet LiveOnEntry;
+ MachineBasicBlock &B = IndexMap.getBlock();
+ MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+
+ for (auto R : getLiveIns(B))
+ for (auto S : expandToSubRegs(R, MRI, TRI))
+ LiveOnEntry.insert(S);
+
+ for (auto R : LiveOnEntry)
+ LastDef[R] = IndexType::Entry;
+
+ auto closeRange = [&LastUse,&LastDef,&LiveMap] (RegisterRef R) -> void {
+ auto LD = LastDef[R], LU = LastUse[R];
+ if (LD == IndexType::None)
+ LD = IndexType::Entry;
+ if (LU == IndexType::None)
+ LU = IndexType::Exit;
+ LiveMap[R].add(LD, LU, false, false);
+ LastUse[R] = LastDef[R] = IndexType::None;
+ };
+
+ for (auto &In : B) {
+ if (In.isDebugValue())
+ continue;
+ IndexType Index = IndexMap.getIndex(&In);
+ // Process uses first.
+ for (auto &Op : In.operands()) {
+ if (!Op.isReg() || !Op.isUse() || Op.isUndef())
+ continue;
+ RegisterRef R = { Op.getReg(), Op.getSubReg() };
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+ continue;
+ bool IsKill = Op.isKill();
+ for (auto S : expandToSubRegs(R, MRI, TRI)) {
+ LastUse[S] = Index;
+ if (IsKill)
+ closeRange(S);
+ }
+ }
+ // Process defs.
+ for (auto &Op : In.operands()) {
+ if (!Op.isReg() || !Op.isDef() || Op.isUndef())
+ continue;
+ RegisterRef R = { Op.getReg(), Op.getSubReg() };
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+ continue;
+ for (auto S : expandToSubRegs(R, MRI, TRI)) {
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+ closeRange(S);
+ LastDef[S] = Index;
+ }
+ }
+ }
+
+ // Collect live-on-exit.
+ RegisterSet LiveOnExit;
+ for (auto *SB : B.successors())
+ for (auto R : getLiveIns(*SB))
+ for (auto S : expandToSubRegs(R, MRI, TRI))
+ LiveOnExit.insert(S);
+
+ for (auto R : LiveOnExit)
+ LastUse[R] = IndexType::Exit;
+
+ // Process remaining registers.
+ RegisterSet Left;
+ for (auto &I : LastUse)
+ if (I.second != IndexType::None)
+ Left.insert(I.first);
+ for (auto &I : LastDef)
+ if (I.second != IndexType::None)
+ Left.insert(I.first);
+ for (auto R : Left)
+ closeRange(R);
+
+ // Finalize the live ranges.
+ for (auto &P : LiveMap)
+ P.second.unionize();
+}
+
+
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeLiveMap(
+ InstrIndexMap &IndexMap) {
+ RegToRangeMap LiveMap;
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": index map\n" << IndexMap << '\n');
+ computeInitialLiveRanges(IndexMap, LiveMap);
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": live map\n"
+ << PrintRangeMap(LiveMap, TRI) << '\n');
+ return LiveMap;
+}
+
+
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
+ InstrIndexMap &IndexMap, RegToRangeMap &LiveMap) {
+ RegToRangeMap DeadMap;
+
+ auto addDeadRanges = [&IndexMap,&LiveMap,&DeadMap] (RegisterRef R) -> void {
+ auto F = LiveMap.find(R);
+ if (F == LiveMap.end() || F->second.empty()) {
+ DeadMap[R].add(IndexType::Entry, IndexType::Exit, false, false);
+ return;
+ }
+
+ RangeList &RL = F->second;
+ RangeList::iterator A = RL.begin(), Z = RL.end()-1;
+
+ // Try to create the initial range.
+ if (A->start() != IndexType::Entry) {
+ IndexType DE = IndexMap.getPrevIndex(A->start());
+ if (DE != IndexType::Entry)
+ DeadMap[R].add(IndexType::Entry, DE, false, false);
+ }
+
+ while (A != Z) {
+ // Creating a dead range that follows A. Pay attention to empty
+ // ranges (i.e. those ending with "None").
+ IndexType AE = (A->end() == IndexType::None) ? A->start() : A->end();
+ IndexType DS = IndexMap.getNextIndex(AE);
+ ++A;
+ IndexType DE = IndexMap.getPrevIndex(A->start());
+ if (DS < DE)
+ DeadMap[R].add(DS, DE, false, false);
+ }
+
+ // Try to create the final range.
+ if (Z->end() != IndexType::Exit) {
+ IndexType ZE = (Z->end() == IndexType::None) ? Z->start() : Z->end();
+ IndexType DS = IndexMap.getNextIndex(ZE);
+ if (DS < IndexType::Exit)
+ DeadMap[R].add(DS, IndexType::Exit, false, false);
+ }
+ };
+
+ MachineFunction &MF = *IndexMap.getBlock().getParent();
+ auto &MRI = MF.getRegInfo();
+ unsigned NumRegs = TRI.getNumRegs();
+ BitVector Visited(NumRegs);
+ for (unsigned R = 1; R < NumRegs; ++R) {
+ for (auto S : expandToSubRegs({R,0}, MRI, TRI)) {
+ if (Reserved[S.Reg] || Visited[S.Reg])
+ continue;
+ addDeadRanges(S);
+ Visited[S.Reg] = true;
+ }
+ }
+ for (auto &P : LiveMap)
+ if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
+ addDeadRanges(P.first);
+
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": dead map\n"
+ << PrintRangeMap(DeadMap, TRI) << '\n');
+ return DeadMap;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ HexagonBlockRanges::IndexType Idx) {
+ if (Idx == HexagonBlockRanges::IndexType::None)
+ return OS << '-';
+ if (Idx == HexagonBlockRanges::IndexType::Entry)
+ return OS << 'n';
+ if (Idx == HexagonBlockRanges::IndexType::Exit)
+ return OS << 'x';
+ return OS << unsigned(Idx)-HexagonBlockRanges::IndexType::First+1;
+}
+
+// A mapping to translate between instructions and their indices.
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::IndexRange &IR) {
+ OS << '[' << IR.start() << ':' << IR.end() << (IR.TiedEnd ? '}' : ']');
+ if (IR.Fixed)
+ OS << '!';
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::RangeList &RL) {
+ for (auto &R : RL)
+ OS << R << " ";
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::InstrIndexMap &M) {
+ for (auto &In : M.Block) {
+ HexagonBlockRanges::IndexType Idx = M.getIndex(&In);
+ OS << Idx << (Idx == M.Last ? ". " : " ") << In;
+ }
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const HexagonBlockRanges::PrintRangeMap &P) {
+ for (auto &I : P.Map) {
+ const HexagonBlockRanges::RangeList &RL = I.second;
+ OS << PrintReg(I.first.Reg, &P.TRI, I.first.Sub) << " -> " << RL << "\n";
+ }
+ return OS;
+}
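computeDeadMap above derives a register's dead ranges as the gaps before, between, and after its live ranges. A simplified standalone sketch of that gap computation over plain integer positions (the Entry/Exit sentinels and empty-range handling of the real code are omitted):

#include <cstdio>
#include <utility>
#include <vector>

using Range = std::pair<int, int>;

// Given sorted, disjoint live ranges inside [Entry, Exit], emit the
// complementary "dead" gaps.
static std::vector<Range> deadGaps(const std::vector<Range> &Live,
                                   int Entry, int Exit) {
  std::vector<Range> Dead;
  int Prev = Entry;
  for (const Range &R : Live) {
    if (Prev + 1 <= R.first - 1)          // gap before this live range
      Dead.push_back({Prev + 1, R.first - 1});
    Prev = R.second;
  }
  if (Prev + 1 <= Exit)                   // gap after the last live range
    Dead.push_back({Prev + 1, Exit});
  return Dead;
}

int main() {
  // Live at [2,4] and [7,8] inside a block spanning [0,10].
  for (const Range &D : deadGaps({{2, 4}, {7, 8}}, 0, 10))
    std::printf("dead [%d,%d]\n", D.first, D.second); // [1,1] [5,6] [9,10]
  return 0;
}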
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.h b/lib/Target/Hexagon/HexagonBlockRanges.h
new file mode 100644
index 000000000000..9c3f938f99eb
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -0,0 +1,239 @@
+//===--- HexagonBlockRanges.h ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef HEXAGON_BLOCK_RANGES_H
+#define HEXAGON_BLOCK_RANGES_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCRegisterInfo.h" // For MCPhysReg.
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm {
+ class Function;
+ class HexagonSubtarget;
+ class MachineBasicBlock;
+ class MachineFunction;
+ class MachineInstr;
+ class MCInstrDesc;
+ class raw_ostream;
+ class TargetInstrInfo;
+ class TargetRegisterClass;
+ class TargetRegisterInfo;
+ class Type;
+
+struct HexagonBlockRanges {
+ HexagonBlockRanges(MachineFunction &MF);
+
+ struct RegisterRef {
+ unsigned Reg, Sub;
+ bool operator<(RegisterRef R) const {
+ return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
+ }
+ };
+ typedef std::set<RegisterRef> RegisterSet;
+
+ // This is to represent an "index", which is an abstraction of a position
+ // of an instruction within a basic block.
+ class IndexType {
+ public:
+ enum : unsigned {
+ None = 0,
+ Entry = 1,
+ Exit = 2,
+ First = 11 // 10th + 1st
+ };
+ static bool isInstr(IndexType X) { return X.Index >= First; }
+
+ IndexType() : Index(None) {}
+ IndexType(unsigned Idx) : Index(Idx) {}
+ operator unsigned() const;
+ bool operator== (unsigned x) const;
+ bool operator== (IndexType Idx) const;
+ bool operator!= (unsigned x) const;
+ bool operator!= (IndexType Idx) const;
+ IndexType operator++ ();
+ bool operator< (unsigned Idx) const;
+ bool operator< (IndexType Idx) const;
+ bool operator<= (IndexType Idx) const;
+
+ private:
+ bool operator> (IndexType Idx) const;
+ bool operator>= (IndexType Idx) const;
+
+ unsigned Index;
+ };
+
+ // A range of indices, essentially a representation of a live range.
+ // This is also used to represent "dead ranges", i.e. ranges where a
+ // register is dead.
+ class IndexRange : public std::pair<IndexType,IndexType> {
+ public:
+ IndexRange() : Fixed(false), TiedEnd(false) {}
+ IndexRange(IndexType Start, IndexType End, bool F = false, bool T = false)
+ : std::pair<IndexType,IndexType>(Start, End), Fixed(F), TiedEnd(T) {}
+ IndexType start() const { return first; }
+ IndexType end() const { return second; }
+
+ bool operator< (const IndexRange &A) const {
+ return start() < A.start();
+ }
+ bool overlaps(const IndexRange &A) const;
+ bool contains(const IndexRange &A) const;
+ void merge(const IndexRange &A);
+
+ bool Fixed; // Can be renamed? "Fixed" means "no".
+ bool TiedEnd; // The end is not a use, but a dead def tied to a use.
+
+ private:
+ void setStart(const IndexType &S) { first = S; }
+ void setEnd(const IndexType &E) { second = E; }
+ };
+
+ // A list of index ranges. This represents liveness of a register
+ // in a basic block.
+ class RangeList : public std::vector<IndexRange> {
+ public:
+ void add(IndexType Start, IndexType End, bool Fixed, bool TiedEnd) {
+ push_back(IndexRange(Start, End, Fixed, TiedEnd));
+ }
+ void add(const IndexRange &Range) {
+ push_back(Range);
+ }
+ void include(const RangeList &RL);
+ void unionize(bool MergeAdjacent = false);
+ void subtract(const IndexRange &Range);
+
+ private:
+ void addsub(const IndexRange &A, const IndexRange &B);
+ };
+
+ class InstrIndexMap {
+ public:
+ InstrIndexMap(MachineBasicBlock &B);
+ MachineInstr *getInstr(IndexType Idx) const;
+ IndexType getIndex(MachineInstr *MI) const;
+ MachineBasicBlock &getBlock() const { return Block; }
+ IndexType getPrevIndex(IndexType Idx) const;
+ IndexType getNextIndex(IndexType Idx) const;
+ void replaceInstr(MachineInstr *OldMI, MachineInstr *NewMI);
+
+ friend raw_ostream &operator<< (raw_ostream &OS, const InstrIndexMap &Map);
+ IndexType First, Last;
+
+ private:
+ MachineBasicBlock &Block;
+ std::map<IndexType,MachineInstr*> Map;
+ };
+
+ typedef std::map<RegisterRef,RangeList> RegToRangeMap;
+ RegToRangeMap computeLiveMap(InstrIndexMap &IndexMap);
+ RegToRangeMap computeDeadMap(InstrIndexMap &IndexMap, RegToRangeMap &LiveMap);
+ static RegisterSet expandToSubRegs(RegisterRef R,
+ const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI);
+
+ struct PrintRangeMap {
+ PrintRangeMap(const RegToRangeMap &M, const TargetRegisterInfo &I)
+ : Map(M), TRI(I) {}
+
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintRangeMap &P);
+ private:
+ const RegToRangeMap &Map;
+ const TargetRegisterInfo &TRI;
+ };
+
+private:
+ RegisterSet getLiveIns(const MachineBasicBlock &B);
+
+ void computeInitialLiveRanges(InstrIndexMap &IndexMap,
+ RegToRangeMap &LiveMap);
+
+ MachineFunction &MF;
+ const HexagonSubtarget &HST;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ BitVector Reserved;
+};
+
+
+inline HexagonBlockRanges::IndexType::operator unsigned() const {
+ assert(Index >= First);
+ return Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (unsigned x) const {
+ return Index == x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (IndexType Idx) const {
+ return Index == Idx.Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (unsigned x) const {
+ return Index != x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (IndexType Idx) const {
+ return Index != Idx.Index;
+}
+
+inline
+HexagonBlockRanges::IndexType HexagonBlockRanges::IndexType::operator++ () {
+ assert(Index != None);
+ assert(Index != Exit);
+ if (Index == Entry)
+ Index = First;
+ else
+ ++Index;
+ return *this;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (unsigned Idx) const {
+ return operator< (IndexType(Idx));
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (IndexType Idx) const {
+ // !(x < x).
+ if (Index == Idx.Index)
+ return false;
+ // !(None < x) for all x.
+ // !(x < None) for all x.
+ if (Index == None || Idx.Index == None)
+ return false;
+ // !(Exit < x) for all x.
+ // !(x < Entry) for all x.
+ if (Index == Exit || Idx.Index == Entry)
+ return false;
+ // Entry < x for all x != Entry.
+ // x < Exit for all x != Exit.
+ if (Index == Entry || Idx.Index == Exit)
+ return true;
+
+ return Index < Idx.Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator<= (IndexType Idx) const {
+ return operator==(Idx) || operator<(Idx);
+}
+
+
+raw_ostream &operator<< (raw_ostream &OS, HexagonBlockRanges::IndexType Idx);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::IndexRange &IR);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::RangeList &RL);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::InstrIndexMap &M);
+raw_ostream &operator<< (raw_ostream &OS,
+ const HexagonBlockRanges::PrintRangeMap &P);
+
+} // namespace llvm
+
+#endif
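IndexType's operator< above orders Entry before every instruction index, every index before Exit, and treats None as incomparable. A condensed standalone model of that ordering, using the same sentinel values, for reference:

#include <cassert>

// Entry sorts before any instruction index, Exit after, and any
// comparison involving None is false.
enum Sentinel : unsigned { None = 0, Entry = 1, Exit = 2, First = 11 };

static bool less(unsigned A, unsigned B) {
  if (A == B || A == None || B == None) return false; // irreflexive; None never compares
  if (A == Exit || B == Entry) return false;          // nothing after Exit / before Entry
  if (A == Entry || B == Exit) return true;           // Entry < x, x < Exit
  return A < B;                                       // plain instruction indices
}

int main() {
  assert(less(Entry, First));                        // Entry precedes the first instruction
  assert(less(First, First + 3));                    // ordinary indices compare numerically
  assert(less(First + 3, Exit));                     // everything precedes Exit
  assert(!less(None, First) && !less(First, None));  // None is incomparable
  return 0;
}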
diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
new file mode 100644
index 000000000000..f042baf1ef05
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -0,0 +1,211 @@
+//===--- HexagonBranchRelaxation.cpp - Identify and relax long jumps ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-brelax"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Since we have no exact knowledge of code layout, allow some safety buffer
+// for jump target. This is measured in bytes.
+static cl::opt<uint32_t> BranchRelaxSafetyBuffer("branch-relax-safety-buffer",
+ cl::init(200), cl::Hidden, cl::ZeroOrMore, cl::desc("safety buffer size"));
+
+namespace llvm {
+ FunctionPass *createHexagonBranchRelaxation();
+ void initializeHexagonBranchRelaxationPass(PassRegistry&);
+}
+
+namespace {
+ struct HexagonBranchRelaxation : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonBranchRelaxation() : MachineFunctionPass(ID) {
+ initializeHexagonBranchRelaxationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "Hexagon Branch Relaxation";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+ bool relaxBranches(MachineFunction &MF);
+ void computeOffset(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ bool reGenerateBranch(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ bool isJumpOutOfRange(MachineInstr &MI,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+ };
+
+ char HexagonBranchRelaxation::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonBranchRelaxation, "hexagon-brelax",
+ "Hexagon Branch Relaxation", false, false)
+
+FunctionPass *llvm::createHexagonBranchRelaxation() {
+ return new HexagonBranchRelaxation();
+}
+
+
+bool HexagonBranchRelaxation::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
+
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
+
+ bool Changed = false;
+ Changed = relaxBranches(MF);
+ return Changed;
+}
+
+
+void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &OffsetMap) {
+ // offset of the current instruction from the start.
+ unsigned InstOffset = 0;
+ for (auto &B : MF) {
+ if (B.getAlignment()) {
+ // Although we don't know the exact layout of the final code, we need
+ // to account for alignment padding somehow. This heuristic pads each
+ // aligned basic block according to the alignment value.
+ int ByteAlign = (1u << B.getAlignment()) - 1;
+ InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
+ }
+ OffsetMap[&B] = InstOffset;
+ for (auto &MI : B.instrs())
+ InstOffset += HII->getSize(&MI);
+ }
+}
+
+
+/// relaxBranches - For Hexagon, if the jump target/loop label is too far from
+/// the jump/loop instruction, then we need to make sure that we have constant
+/// extenders set for jumps and loops.
+
+/// There are six iterations in this phase; they are self-explanatory below.
+bool HexagonBranchRelaxation::relaxBranches(MachineFunction &MF) {
+  // Compute the offset of each basic block, i.e. the offset of its first
+  // instruction from the beginning of the function, and record it in a map.
+ DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset;
+ computeOffset(MF, BlockToInstOffset);
+
+ return reGenerateBranch(MF, BlockToInstOffset);
+}
+
+
+/// Check if a given instruction is:
+/// - a jump to a distant target
+/// - that exceeds its immediate range
+/// If both conditions are true, it requires constant extension.
+bool HexagonBranchRelaxation::isJumpOutOfRange(MachineInstr &MI,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+ MachineBasicBlock &B = *MI.getParent();
+ auto FirstTerm = B.getFirstInstrTerminator();
+ if (FirstTerm == B.instr_end())
+ return false;
+
+ unsigned InstOffset = BlockToInstOffset[&B];
+ unsigned Distance = 0;
+
+ // To save time, estimate exact position of a branch instruction
+ // as one at the end of the MBB.
+ // Number of instructions times typical instruction size.
+ InstOffset += HII->nonDbgBBSize(&B) * HEXAGON_INSTR_SIZE;
+
+ MachineBasicBlock *TBB = NULL, *FBB = NULL;
+ SmallVector<MachineOperand, 4> Cond;
+
+ // Try to analyze this branch.
+ if (HII->analyzeBranch(B, TBB, FBB, Cond, false)) {
+ // Could not analyze it. See if this is something we can recognize.
+ // If it is a NVJ, it should always have its target in
+ // a fixed location.
+ if (HII->isNewValueJump(&*FirstTerm))
+ TBB = FirstTerm->getOperand(HII->getCExtOpNum(&*FirstTerm)).getMBB();
+ }
+ if (TBB && &MI == &*FirstTerm) {
+ Distance = std::abs((long long)InstOffset - BlockToInstOffset[TBB])
+ + BranchRelaxSafetyBuffer;
+ return !HII->isJumpWithinBranchRange(&*FirstTerm, Distance);
+ }
+ if (FBB) {
+ // Look for second terminator.
+ auto SecondTerm = std::next(FirstTerm);
+ assert(SecondTerm != B.instr_end() &&
+ (SecondTerm->isBranch() || SecondTerm->isCall()) &&
+ "Bad second terminator");
+ if (&MI != &*SecondTerm)
+ return false;
+ // Analyze the second branch in the BB.
+ Distance = std::abs((long long)InstOffset - BlockToInstOffset[FBB])
+ + BranchRelaxSafetyBuffer;
+ return !HII->isJumpWithinBranchRange(&*SecondTerm, Distance);
+ }
+ return false;
+}
+
+
+bool HexagonBranchRelaxation::reGenerateBranch(MachineFunction &MF,
+ DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+ bool Changed = false;
+
+ for (auto &B : MF) {
+ for (auto &MI : B) {
+ if (!MI.isBranch() || !isJumpOutOfRange(MI, BlockToInstOffset))
+ continue;
+ DEBUG(dbgs() << "Long distance jump. isExtendable("
+ << HII->isExtendable(&MI) << ") isConstExtended("
+ << HII->isConstExtended(&MI) << ") " << MI);
+
+ // Since we have not merged HW loops relaxation into
+ // this code (yet), soften our approach for the moment.
+ if (!HII->isExtendable(&MI) && !HII->isExtended(&MI)) {
+ DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
+ } else {
+ // Find which operand is expandable.
+ int ExtOpNum = HII->getCExtOpNum(&MI);
+ MachineOperand &MO = MI.getOperand(ExtOpNum);
+          // This needs to be something we understand. So far we assume all
+ // branches have only MBB address as expandable field.
+ // If it changes, this will need to be expanded.
+ assert(MO.isMBB() && "Branch with unknown expandable field type");
+ // Mark given operand as extended.
+ MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
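computeOffset above pads each aligned basic block with the usual round-up-to-a-power-of-two mask trick, (Offset + (1 << Align) - 1) & ~((1 << Align) - 1). A standalone sketch of that rounding:

#include <cassert>
#include <cstdint>

// Round Offset up to a 2^AlignLog2-byte boundary, the same mask trick used
// to account for basic-block alignment padding.
static uint32_t alignUp(uint32_t Offset, unsigned AlignLog2) {
  uint32_t ByteAlign = (1u << AlignLog2) - 1;
  return (Offset + ByteAlign) & ~ByteAlign;
}

int main() {
  assert(alignUp(0, 4) == 0);    // already on a 16-byte boundary
  assert(alignUp(4, 4) == 16);   // padded up to the next 16-byte boundary
  assert(alignUp(17, 3) == 24);  // 8-byte alignment
  return 0;
}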
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index efafdd007289..559bdfb16a6f 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -16,7 +16,6 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -38,9 +37,9 @@ namespace {
class HexagonCFGOptimizer : public MachineFunctionPass {
private:
- void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*);
+ void InvertAndChangeJumpTarget(MachineInstr &, MachineBasicBlock *);
- public:
+public:
static char ID;
HexagonCFGOptimizer() : MachineFunctionPass(ID) {
initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
@@ -50,6 +49,10 @@ private:
return "Hexagon CFG Optimizer";
}
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
@@ -65,14 +68,12 @@ static bool IsUnconditionalJump(int Opc) {
return (Opc == Hexagon::J2_jump);
}
-
-void
-HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
- MachineBasicBlock* NewTarget) {
+void HexagonCFGOptimizer::InvertAndChangeJumpTarget(
+ MachineInstr &MI, MachineBasicBlock *NewTarget) {
const TargetInstrInfo *TII =
- MI->getParent()->getParent()->getSubtarget().getInstrInfo();
+ MI.getParent()->getParent()->getSubtarget().getInstrInfo();
int NewOpcode = 0;
- switch(MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case Hexagon::J2_jumpt:
NewOpcode = Hexagon::J2_jumpf;
break;
@@ -93,12 +94,15 @@ HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
llvm_unreachable("Cannot handle this case");
}
- MI->setDesc(TII->get(NewOpcode));
- MI->getOperand(1).setMBB(NewTarget);
+ MI.setDesc(TII->get(NewOpcode));
+ MI.getOperand(1).setMBB(NewTarget);
}
bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
// Loop over all of the basic blocks.
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
MBBb != MBBe; ++MBBb) {
@@ -107,8 +111,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Traverse the basic block.
MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
if (MII != MBB->end()) {
- MachineInstr *MI = MII;
- int Opc = MI->getOpcode();
+ MachineInstr &MI = *MII;
+ int Opc = MI.getOpcode();
if (IsConditionalBranch(Opc)) {
//
@@ -160,9 +164,9 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// The target of the unconditional branch must be JumpAroundTarget.
// TODO: If not, we should not invert the unconditional branch.
MachineBasicBlock* CondBranchTarget = nullptr;
- if ((MI->getOpcode() == Hexagon::J2_jumpt) ||
- (MI->getOpcode() == Hexagon::J2_jumpf)) {
- CondBranchTarget = MI->getOperand(1).getMBB();
+ if (MI.getOpcode() == Hexagon::J2_jumpt ||
+ MI.getOpcode() == Hexagon::J2_jumpf) {
+ CondBranchTarget = MI.getOperand(1).getMBB();
}
if (!LayoutSucc || (CondBranchTarget != JumpAroundTarget)) {
@@ -174,6 +178,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Ensure that BB2 has one instruction -- an unconditional jump.
if ((LayoutSucc->size() == 1) &&
IsUnconditionalJump(LayoutSucc->front().getOpcode())) {
+ assert(JumpAroundTarget && "jump target is needed to process second basic block");
MachineBasicBlock* UncondTarget =
LayoutSucc->front().getOperand(0).getMBB();
// Check if the layout successor of BB2 is BB3.
@@ -232,15 +237,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-static void initializePassOnce(PassRegistry &Registry) {
- PassInfo *PI = new PassInfo("Hexagon CFG Optimizer", "hexagon-cfg",
- &HexagonCFGOptimizer::ID, nullptr, false, false);
- Registry.registerPass(*PI, true);
-}
-
-void llvm::initializeHexagonCFGOptimizerPass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializePassOnce)
-}
+INITIALIZE_PASS(HexagonCFGOptimizer, "hexagon-cfg", "Hexagon CFG Optimizer",
+ false, false)
FunctionPass *llvm::createHexagonCFGOptimizer() {
return new HexagonCFGOptimizer();
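InvertAndChangeJumpTarget above flips a conditional jump's sense (J2_jumpt and J2_jumpf, plus their .new variants) and points it at a new block. A toy standalone model of the same transformation, with illustrative opcode names rather than the Hexagon ones:

#include <cassert>

enum Opcode { JumpT, JumpF, JumpTNewPt, JumpFNewPt };

struct Branch { Opcode Opc; int TargetBlock; };

// Flip the branch's sense and retarget it at NewTarget.
static void invertAndRetarget(Branch &B, int NewTarget) {
  switch (B.Opc) {
  case JumpT:      B.Opc = JumpF;      break;
  case JumpF:      B.Opc = JumpT;      break;
  case JumpTNewPt: B.Opc = JumpFNewPt; break;
  case JumpFNewPt: B.Opc = JumpTNewPt; break;
  }
  B.TargetBlock = NewTarget;
}

int main() {
  Branch B{JumpT, 3};
  invertAndRetarget(B, 7);
  assert(B.Opc == JumpF && B.TargetBlock == 7);
  return 0;
}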
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 931db6687bf8..b612b11aed50 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -90,8 +90,8 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTree>();
- AU.addPreserved<PostDominatorTree>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
@@ -147,7 +147,7 @@ char HexagonCommonGEP::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
false, false)
@@ -212,7 +212,6 @@ namespace {
if (Comma)
OS << ',';
OS << "used";
- Comma = true;
}
OS << "} ";
if (GN.Flags & GepNode::Root)
@@ -1268,6 +1267,9 @@ void HexagonCommonGEP::removeDeadCode() {
bool HexagonCommonGEP::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
// For now bail out on C++ exception handling.
for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A)
for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I)
@@ -1276,7 +1278,7 @@ bool HexagonCommonGEP::runOnFunction(Function &F) {
Fn = &F;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- PDT = &getAnalysis<PostDominatorTree>();
+ PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
Ctx = &F.getContext();
@@ -1295,7 +1297,7 @@ bool HexagonCommonGEP::runOnFunction(Function &F) {
materialize(Loc);
removeDeadCode();
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
// Run this only when expensive checks are enabled.
verifyFunction(F);
#endif
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 9fd863f6e153..face0f3f64b4 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -42,6 +42,11 @@ cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
cl::init(false),
cl::desc("Disable merging into combines"));
static
+cl::opt<bool> IsConst64Disabled("disable-const64",
+ cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Disable generation of const64"));
+static
cl::opt<unsigned>
MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
cl::Hidden, cl::init(4),
@@ -62,6 +67,8 @@ class HexagonCopyToCombine : public MachineFunctionPass {
bool ShouldCombineAggressively;
DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+ SmallVector<MachineInstr *, 8> DbgMItoMove;
+
public:
static char ID;
@@ -79,15 +86,22 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
private:
- MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1);
+ MachineInstr *findPairable(MachineInstr &I1, bool &DoInsertAtI1,
+ bool AllowC64);
void findPotentialNewifiableTFRs(MachineBasicBlock &);
- void combine(MachineInstr *I1, MachineInstr *I2,
- MachineBasicBlock::iterator &MI, bool DoInsertAtI1);
+ void combine(MachineInstr &I1, MachineInstr &I2,
+ MachineBasicBlock::iterator &MI, bool DoInsertAtI1,
+ bool OptForSize);
- bool isSafeToMoveTogether(MachineInstr *I1, MachineInstr *I2,
+ bool isSafeToMoveTogether(MachineInstr &I1, MachineInstr &I2,
unsigned I1DestReg, unsigned I2DestReg,
bool &DoInsertAtI1);
@@ -102,6 +116,9 @@ private:
void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitConst64(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
};
} // End anonymous namespace.
@@ -111,14 +128,13 @@ char HexagonCopyToCombine::ID = 0;
INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
"Hexagon Copy-To-Combine Pass", false, false)
-static bool isCombinableInstType(MachineInstr *MI,
- const HexagonInstrInfo *TII,
+static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII,
bool ShouldCombineAggressively) {
- switch(MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case Hexagon::A2_tfr: {
// A COPY instruction can be combined if its arguments are IntRegs (32bit).
- const MachineOperand &Op0 = MI->getOperand(0);
- const MachineOperand &Op1 = MI->getOperand(1);
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
assert(Op0.isReg() && Op1.isReg());
unsigned DestReg = Op0.getReg();
@@ -130,8 +146,8 @@ static bool isCombinableInstType(MachineInstr *MI,
case Hexagon::A2_tfrsi: {
// A transfer-immediate can be combined if its argument is a signed 8bit
// value.
- const MachineOperand &Op0 = MI->getOperand(0);
- const MachineOperand &Op1 = MI->getOperand(1);
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
assert(Op0.isReg());
unsigned DestReg = Op0.getReg();
@@ -154,11 +170,10 @@ static bool isCombinableInstType(MachineInstr *MI,
return false;
}
-template <unsigned N>
-static bool isGreaterThanNBitTFRI(const MachineInstr *I) {
- if (I->getOpcode() == Hexagon::TFRI64_V4 ||
- I->getOpcode() == Hexagon::A2_tfrsi) {
- const MachineOperand &Op = I->getOperand(1);
+template <unsigned N> static bool isGreaterThanNBitTFRI(const MachineInstr &I) {
+ if (I.getOpcode() == Hexagon::TFRI64_V4 ||
+ I.getOpcode() == Hexagon::A2_tfrsi) {
+ const MachineOperand &Op = I.getOperand(1);
return !Op.isImm() || !isInt<N>(Op.getImm());
}
return false;
@@ -167,19 +182,34 @@ static bool isGreaterThanNBitTFRI(const MachineInstr *I) {
/// areCombinableOperations - Returns true if the two instructions can be merged
/// into a combine (ignoring register constraints).
static bool areCombinableOperations(const TargetRegisterInfo *TRI,
- MachineInstr *HighRegInst,
- MachineInstr *LowRegInst) {
- unsigned HiOpc = HighRegInst->getOpcode();
- unsigned LoOpc = LowRegInst->getOpcode();
+ MachineInstr &HighRegInst,
+ MachineInstr &LowRegInst, bool AllowC64) {
+ unsigned HiOpc = HighRegInst.getOpcode();
+ unsigned LoOpc = LowRegInst.getOpcode();
(void)HiOpc; // Fix compiler warning
(void)LoOpc; // Fix compiler warning
assert((HiOpc == Hexagon::A2_tfr || HiOpc == Hexagon::A2_tfrsi) &&
(LoOpc == Hexagon::A2_tfr || LoOpc == Hexagon::A2_tfrsi) &&
"Assume individual instructions are of a combinable type");
- // There is no combine of two constant extended values.
+ if (!AllowC64) {
+ // There is no combine of two constant extended values.
+ if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
+ isGreaterThanNBitTFRI<6>(LowRegInst))
+ return false;
+ }
+
+ // There is a combine of two constant extended values into CONST64,
+ // provided both constants are true immediates.
+ if (isGreaterThanNBitTFRI<16>(HighRegInst) &&
+ isGreaterThanNBitTFRI<16>(LowRegInst))
+ return (HighRegInst.getOperand(1).isImm() &&
+ LowRegInst.getOperand(1).isImm());
+
+  // There is no combine of two constant extended values unless handled above.
+  // Use 8-bit checks on both operands to allow combine(#,##) and combine(##,#).
if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
- isGreaterThanNBitTFRI<6>(LowRegInst))
+ isGreaterThanNBitTFRI<8>(LowRegInst))
return false;
return true;
@@ -191,25 +221,23 @@ static bool isEvenReg(unsigned Reg) {
return (Reg - Hexagon::R0) % 2 == 0;
}
-static void removeKillInfo(MachineInstr *MI, unsigned RegNotKilled) {
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- MachineOperand &Op = MI->getOperand(I);
+static void removeKillInfo(MachineInstr &MI, unsigned RegNotKilled) {
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill())
continue;
Op.setIsKill(false);
}
}
-/// isUnsafeToMoveAcross - Returns true if it is unsafe to move a copy
-/// instruction from \p UseReg to \p DestReg over the instruction \p I.
-static bool isUnsafeToMoveAcross(MachineInstr *I, unsigned UseReg,
- unsigned DestReg,
- const TargetRegisterInfo *TRI) {
- return (UseReg && (I->modifiesRegister(UseReg, TRI))) ||
- I->modifiesRegister(DestReg, TRI) ||
- I->readsRegister(DestReg, TRI) ||
- I->hasUnmodeledSideEffects() ||
- I->isInlineAsm() || I->isDebugValue();
+/// Returns true if it is unsafe to move a copy instruction from \p UseReg to
+/// \p DestReg over the instruction \p MI.
+static bool isUnsafeToMoveAcross(MachineInstr &MI, unsigned UseReg,
+ unsigned DestReg,
+ const TargetRegisterInfo *TRI) {
+ return (UseReg && (MI.modifiesRegister(UseReg, TRI))) ||
+ MI.modifiesRegister(DestReg, TRI) || MI.readsRegister(DestReg, TRI) ||
+ MI.hasUnmodeledSideEffects() || MI.isInlineAsm() || MI.isDebugValue();
}
static unsigned UseReg(const MachineOperand& MO) {
@@ -218,16 +246,16 @@ static unsigned UseReg(const MachineOperand& MO) {
/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
/// that the two instructions can be paired in a combine.
-bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
- MachineInstr *I2,
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
+ MachineInstr &I2,
unsigned I1DestReg,
unsigned I2DestReg,
bool &DoInsertAtI1) {
- unsigned I2UseReg = UseReg(I2->getOperand(1));
+ unsigned I2UseReg = UseReg(I2.getOperand(1));
// It is not safe to move I1 and I2 into one combine if I2 has a true
// dependence on I1.
- if (I2UseReg && I1->modifiesRegister(I2UseReg, TRI))
+ if (I2UseReg && I1.modifiesRegister(I2UseReg, TRI))
return false;
bool isSafe = true;
@@ -246,7 +274,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// uses I2's use reg we need to modify that (first) instruction to now kill
// this reg.
unsigned KilledOperand = 0;
- if (I2->killsRegister(I2UseReg))
+ if (I2.killsRegister(I2UseReg))
KilledOperand = I2UseReg;
MachineInstr *KillingInstr = nullptr;
@@ -257,7 +285,10 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// * reads I2's def reg
// * or has unmodelled side effects
// we can't move I2 across it.
- if (isUnsafeToMoveAcross(&*I, I2UseReg, I2DestReg, TRI)) {
+ if (I->isDebugValue())
+ continue;
+
+ if (isUnsafeToMoveAcross(*I, I2UseReg, I2DestReg, TRI)) {
isSafe = false;
break;
}
@@ -287,7 +318,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// At O3 we got better results (dhrystone) by being more conservative here.
if (!ShouldCombineAggressively)
End = std::next(MachineBasicBlock::iterator(I2));
- unsigned I1UseReg = UseReg(I1->getOperand(1));
+ unsigned I1UseReg = UseReg(I1.getOperand(1));
// Track killed operands. If we move across an instruction that kills our
// operand, we need to update the kill information on the moved I1. It kills
// the operand now.
@@ -295,7 +326,8 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
unsigned KilledOperand = 0;
while(++I != End) {
- // If the intervening instruction I:
+ MachineInstr &MI = *I;
+ // If the intervening instruction MI:
// * modifies I1's use reg
// * modifies I1's def reg
// * reads I1's def reg
@@ -304,30 +336,36 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// kill flag for a register (a removeRegisterKilled() analogous to
// addRegisterKilled) that handles aliased register correctly.
// * or has a killed aliased register use of I1's use reg
- // %D4<def> = TFRI64 16
- // %R6<def> = TFR %R9
+ // %D4<def> = A2_tfrpi 16
+ // %R6<def> = A2_tfr %R9
// %R8<def> = KILL %R8, %D4<imp-use,kill>
// If we want to move R6 = across the KILL instruction we would have
// to remove the %D4<imp-use,kill> operand. For now, we are
// conservative and disallow the move.
// we can't move I1 across it.
- if (isUnsafeToMoveAcross(I, I1UseReg, I1DestReg, TRI) ||
+ if (MI.isDebugValue()) {
+ if (MI.readsRegister(I1DestReg, TRI)) // Move this instruction after I2.
+ DbgMItoMove.push_back(&MI);
+ continue;
+ }
+
+ if (isUnsafeToMoveAcross(MI, I1UseReg, I1DestReg, TRI) ||
// Check for an aliased register kill. Bail out if we see one.
- (!I->killsRegister(I1UseReg) && I->killsRegister(I1UseReg, TRI)))
+ (!MI.killsRegister(I1UseReg) && MI.killsRegister(I1UseReg, TRI)))
return false;
// Check for an exact kill (registers match).
- if (I1UseReg && I->killsRegister(I1UseReg)) {
+ if (I1UseReg && MI.killsRegister(I1UseReg)) {
assert(!KillingInstr && "Should only see one killing instruction");
KilledOperand = I1UseReg;
- KillingInstr = &*I;
+ KillingInstr = &MI;
}
}
if (KillingInstr) {
- removeKillInfo(KillingInstr, KilledOperand);
+ removeKillInfo(*KillingInstr, KilledOperand);
// Update I1 to set the kill flag. This flag will later be picked up by
// the new COMBINE instruction.
- bool Added = I1->addRegisterKilled(KilledOperand, TRI);
+ bool Added = I1.addRegisterKilled(KilledOperand, TRI);
(void)Added; // suppress compiler warning
assert(Added && "Must successfully update kill flag");
}
@@ -342,14 +380,16 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
void
HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
DenseMap<unsigned, MachineInstr *> LastDef;
- for (MachineBasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
- MachineInstr *MI = I;
+ for (MachineInstr &MI : BB) {
+ if (MI.isDebugValue())
+ continue;
+
// Mark TFRs that feed a potential new value store as such.
- if(TII->mayBeNewStore(MI)) {
+ if (TII->mayBeNewStore(&MI)) {
// Look for uses of TFR instructions.
- for (unsigned OpdIdx = 0, OpdE = MI->getNumOperands(); OpdIdx != OpdE;
+ for (unsigned OpdIdx = 0, OpdE = MI.getNumOperands(); OpdIdx != OpdE;
++OpdIdx) {
- MachineOperand &Op = MI->getOperand(OpdIdx);
+ MachineOperand &Op = MI.getOperand(OpdIdx);
// Skip over anything except register uses.
if (!Op.isReg() || !Op.isUse() || !Op.getReg())
@@ -360,14 +400,18 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
MachineInstr *DefInst = LastDef[Reg];
if (!DefInst)
continue;
- if (!isCombinableInstType(DefInst, TII, ShouldCombineAggressively))
+ if (!isCombinableInstType(*DefInst, TII, ShouldCombineAggressively))
continue;
// Only close newifiable stores should influence the decision.
+ // Ignore the debug instructions in between.
MachineBasicBlock::iterator It(DefInst);
unsigned NumInstsToDef = 0;
- while (&*It++ != MI)
- ++NumInstsToDef;
+ while (&*It != &MI) {
+ if (!It->isDebugValue())
+ ++NumInstsToDef;
+ ++It;
+ }
if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
continue;
@@ -380,17 +424,17 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
// Put instructions that last defined integer or double registers into the
// map.
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- MachineOperand &Op = MI->getOperand(I);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !Op.isDef() || !Op.getReg())
continue;
unsigned Reg = Op.getReg();
if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
- LastDef[*SubRegs] = MI;
+ LastDef[*SubRegs] = &MI;
}
} else if (Hexagon::IntRegsRegClass.contains(Reg))
- LastDef[Reg] = MI;
+ LastDef[Reg] = &MI;
}
}
}
@@ -405,6 +449,9 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ const Function *F = MF.getFunction();
+ bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
+
// Combine aggressively (for code size)
ShouldCombineAggressively =
MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
@@ -418,11 +465,15 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
// Traverse instructions in basic block.
for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
MI != End;) {
- MachineInstr *I1 = MI++;
+ MachineInstr &I1 = *MI++;
+
+ if (I1.isDebugValue())
+ continue;
+
// Don't combine a TFR whose user could be newified (instructions that
// define double registers can not be newified - Programmer's Ref Manual
// 5.4.2 New-value stores).
- if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I1))
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&I1))
continue;
// Ignore instructions that are not combinable.
@@ -430,12 +481,14 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
continue;
// Find a second instruction that can be merged into a combine
- // instruction.
+ // instruction. In addition, also find all the debug instructions that
+ // need to be moved along with it.
bool DoInsertAtI1 = false;
- MachineInstr *I2 = findPairable(I1, DoInsertAtI1);
+ DbgMItoMove.clear();
+ MachineInstr *I2 = findPairable(I1, DoInsertAtI1, OptForSize);
if (I2) {
HasChanged = true;
- combine(I1, I2, MI, DoInsertAtI1);
+ combine(I1, *I2, MI, DoInsertAtI1, OptForSize);
}
}
}
@@ -447,23 +500,28 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
/// COMBINE instruction or 0 if no such instruction can be found. Returns true
/// in \p DoInsertAtI1 if the combine must be inserted at instruction \p I1
/// false if the combine must be inserted at the returned instruction.
-MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
- bool &DoInsertAtI1) {
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
+ bool &DoInsertAtI1,
+ bool AllowC64) {
MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
- unsigned I1DestReg = I1->getOperand(0).getReg();
- for (MachineBasicBlock::iterator End = I1->getParent()->end(); I2 != End;
+ while (I2->isDebugValue())
+ ++I2;
+
+ unsigned I1DestReg = I1.getOperand(0).getReg();
+
+ for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End;
++I2) {
// Bail out early if we see a second definition of I1DestReg.
if (I2->modifiesRegister(I1DestReg, TRI))
break;
// Ignore non-combinable instructions.
- if (!isCombinableInstType(I2, TII, ShouldCombineAggressively))
+ if (!isCombinableInstType(*I2, TII, ShouldCombineAggressively))
continue;
// Don't combine a TFR whose user could be newified.
- if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I2))
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2))
continue;
unsigned I2DestReg = I2->getOperand(0).getReg();
@@ -478,15 +536,14 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
// Check that the two instructions are combinable. V4 allows more
// instructions to be merged into a combine.
- // The order matters because in a TFRI we might can encode a int8 as the
- // hi reg operand but only a uint6 as the low reg operand.
- if ((IsI2LowReg && !areCombinableOperations(TRI, I1, I2)) ||
- (IsI1LowReg && !areCombinableOperations(TRI, I2, I1)))
+ // The order matters because in an A2_tfrsi we may be able to encode an
+ // int8 as the hi reg operand but only a uint6 as the low reg operand.
+ if ((IsI2LowReg && !areCombinableOperations(TRI, I1, *I2, AllowC64)) ||
+ (IsI1LowReg && !areCombinableOperations(TRI, *I2, I1, AllowC64)))
break;
- if (isSafeToMoveTogether(I1, I2, I1DestReg, I2DestReg,
- DoInsertAtI1))
- return I2;
+ if (isSafeToMoveTogether(I1, *I2, I1DestReg, I2DestReg, DoInsertAtI1))
+ return &*I2;
// Not safe. Stop searching.
break;
@@ -494,16 +551,17 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
return nullptr;
}
-void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
+void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2,
MachineBasicBlock::iterator &MI,
- bool DoInsertAtI1) {
+ bool DoInsertAtI1, bool OptForSize) {
// We are going to delete I2. If MI points to I2 advance it to the next
// instruction.
- if ((MachineInstr *)MI == I2) ++MI;
+ if (MI == I2.getIterator())
+ ++MI;
// Figure out whether I1 or I2 goes into the lowreg part.
- unsigned I1DestReg = I1->getOperand(0).getReg();
- unsigned I2DestReg = I2->getOperand(0).getReg();
+ unsigned I1DestReg = I1.getOperand(0).getReg();
+ unsigned I2DestReg = I2.getOperand(0).getReg();
bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
@@ -515,15 +573,17 @@ void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
// Setup source operands.
- MachineOperand &LoOperand = IsI1Loreg ? I1->getOperand(1) :
- I2->getOperand(1);
- MachineOperand &HiOperand = IsI1Loreg ? I2->getOperand(1) :
- I1->getOperand(1);
+ MachineOperand &LoOperand = IsI1Loreg ? I1.getOperand(1) : I2.getOperand(1);
+ MachineOperand &HiOperand = IsI1Loreg ? I2.getOperand(1) : I1.getOperand(1);
// Figure out which source is a register and which a constant.
bool IsHiReg = HiOperand.isReg();
bool IsLoReg = LoOperand.isReg();
+ // There is a combine of two constant extended values into CONST64.
+ bool IsC64 = OptForSize && LoOperand.isImm() && HiOperand.isImm() &&
+ isGreaterThanNBitTFRI<16>(I1) && isGreaterThanNBitTFRI<16>(I2);
+
MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
// Emit combine.
if (IsHiReg && IsLoReg)
@@ -532,11 +592,45 @@ void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
else if (IsLoReg)
emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsC64 && !IsConst64Disabled)
+ emitConst64(InsertPt, DoubleRegDest, HiOperand, LoOperand);
else
emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
- I1->eraseFromParent();
- I2->eraseFromParent();
+ // Move debug instructions along with I1 if it's being
+ // moved towards I2.
+ if (!DoInsertAtI1 && DbgMItoMove.size() != 0) {
+ // Insert debug instructions at the new location before I2.
+ MachineBasicBlock *BB = InsertPt->getParent();
+ for (auto NewMI : DbgMItoMove) {
+ // If iterator MI is pointing to DEBUG_VAL, make sure
+ // MI now points to next relevant instruction.
+ if (NewMI == (MachineInstr*)MI)
+ ++MI;
+ BB->splice(InsertPt, BB, NewMI);
+ }
+ }
+
+ I1.eraseFromParent();
+ I2.eraseFromParent();
+}
+
+void HexagonCopyToCombine::emitConst64(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ DEBUG(dbgs() << "Found a CONST64\n");
+
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+ assert(LoOperand.isImm() && HiOperand.isImm() &&
+ "Both operands must be immediate");
+
+ int64_t V = HiOperand.getImm();
+ V = (V << 32) | (0x0ffffffffLL & LoOperand.getImm());
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::CONST64_Int_Real),
+ DoubleDestReg)
+ .addImm(V);
}
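
(A standalone check of the immediate packing performed by emitConst64 above; the sample immediates are arbitrary and the snippet is not part of this patch. The high immediate lands in bits 63:32 and the low immediate, masked to 32 bits, lands in bits 31:0, so a negative low value does not disturb the high half.)

#include <cassert>
#include <cstdint>

int main() {
  int64_t Hi = 0x12345678;  // immediate that would feed the hi register
  int64_t Lo = -1;          // immediate that would feed the lo register
  int64_t V = (Hi << 32) | (0x0ffffffffLL & Lo);
  assert(static_cast<uint64_t>(V) == 0x12345678ffffffffULL);
  return 0;
}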
void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index ee0c318ffb5d..2665acd19fb1 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -78,8 +78,6 @@
#include "HexagonTargetMachine.h"
#include <functional>
-#include <set>
-#include <vector>
using namespace llvm;
@@ -359,7 +357,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
// update the use of it after predication). PHI uses will be updated
// to use a result of a MUX, and a MUX cannot be created for predicate
// registers.
- for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
if (!MO->isReg() || !MO->isDef())
continue;
unsigned R = MO->getReg();
@@ -377,7 +375,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
- for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+ for (ConstMIOperands MO(*MI); MO.isValid(); ++MO) {
if (!MO->isReg() || !MO->isUse())
continue;
unsigned R = MO->getReg();
@@ -445,7 +443,7 @@ unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
}
MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
- if (!TII->isPredicable(Def1) || !TII->isPredicable(Def3))
+ if (!TII->isPredicable(*Def1) || !TII->isPredicable(*Def3))
Cost++;
}
return Cost;
@@ -456,7 +454,7 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
const MachineBasicBlock *B) const {
unsigned PredDefs = 0;
for (auto &MI : *B) {
- for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
if (!MO->isReg() || !MO->isDef())
continue;
unsigned R = MO->getReg();
@@ -721,7 +719,7 @@ void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
assert(COpc);
MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, TII->get(COpc))
.addReg(PredR);
- for (MIOperands MO(MI); MO.isValid(); ++MO)
+ for (MIOperands MO(*MI); MO.isValid(); ++MO)
MIB.addOperand(*MO);
// Set memory references.
@@ -962,7 +960,7 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
// MRI.replaceVregUsesWith does not allow to update the subregister,
// so instead of doing the use-iteration here, create a copy into a
// "non-subregistered" register.
- DebugLoc DL = PN->getDebugLoc();
+ const DebugLoc &DL = PN->getDebugLoc();
const TargetRegisterClass *RC = MRI->getRegClass(DefR);
NewR = MRI->createVirtualRegister(RC);
NonPHI = BuildMI(*B, NonPHI, DL, TII->get(TargetOpcode::COPY), NewR)
@@ -980,7 +978,7 @@ void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
MachineBasicBlock *SB = *I;
MachineBasicBlock::iterator P, N = SB->getFirstNonPHI();
for (P = SB->begin(); P != N; ++P) {
- MachineInstr *PN = &*P;
+ MachineInstr &PN = *P;
for (MIOperands MO(PN); MO.isValid(); ++MO)
if (MO->isMBB() && MO->getMBB() == OldB)
MO->setMBB(NewB);
@@ -1034,6 +1032,9 @@ void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
auto &ST = MF.getSubtarget();
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index ce10aeadef94..bd5bb9cbc235 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -17,10 +17,10 @@
//
// Liveness tracking aside, the main functionality of this pass is divided
// into two steps. The first step is to replace an instruction
-// vreg0 = C2_mux vreg0, vreg1, vreg2
+// vreg0 = C2_mux vreg1, vreg2, vreg3
// with a pair of conditional transfers
-// vreg0 = A2_tfrt vreg0, vreg1
-// vreg0 = A2_tfrf vreg0, vreg2
+// vreg0 = A2_tfrt vreg1, vreg2
+// vreg0 = A2_tfrf vreg1, vreg3
// It is the intention that the execution of this pass could be terminated
// after this step, and the code generated would be functionally correct.
//
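
(An illustrative, scalar C++ analogue of the splitting step described above; it is not part of this patch, and the helper names select_mux and select_split are invented for the sketch. The mux acts as a select, and the split replaces it with two writes to the destination, each executed under one sense of the predicate.)

#include <cassert>

// vreg0 = C2_mux vreg1, vreg2, vreg3
static int select_mux(bool p, int s1, int s2) {
  return p ? s1 : s2;
}

// vreg0 = A2_tfrt vreg1, vreg2
// vreg0 = A2_tfrf vreg1, vreg3
static int select_split(bool p, int s1, int s2) {
  int d = 0;
  if (p)
    d = s1;  // transfer executed when the predicate is true
  if (!p)
    d = s2;  // transfer executed when the predicate is false
  return d;
}

int main() {
  assert(select_mux(true, 10, 20) == select_split(true, 10, 20));
  assert(select_mux(false, 10, 20) == select_split(false, 10, 20));
  return 0;
}

Exactly one of the two transfers writes the destination for any predicate value, which is why the pass could stop after this step and still emit functionally correct code.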
@@ -60,12 +60,92 @@
// vreg3 = A2_tfrf vreg0, vreg2
//
+// Splitting a definition of a register into two predicated transfers
+// creates a complication in liveness tracking. Live interval computation
+// will see both instructions as actual definitions, and will mark the
+// first one as dead. The definition is not actually dead, and this
+// situation will need to be fixed. For example:
+// vreg1<def,dead> = A2_tfrt ... ; marked as dead
+// vreg1<def> = A2_tfrf ...
+//
+// Since any of the individual predicated transfers may end up getting
+// removed (in case it is an identity copy), some pre-existing def may
+// be marked as dead after live interval recomputation:
+// vreg1<def,dead> = ... ; marked as dead
+// ...
+// vreg1<def> = A2_tfrf ... ; if A2_tfrt is removed
+// This case happens if vreg1 was used as a source in A2_tfrt, which means
+// that it is actually live at the A2_tfrf, and so the now dead definition
+// of vreg1 will need to be updated to non-dead at some point.
+//
+// This issue could be remedied by adding implicit uses to the predicated
+// transfers, but this will create a problem with subsequent predication,
+// since the transfers will no longer be possible to reorder. To avoid
+// that, the initial splitting will not add any implicit uses. These
+// implicit uses will be added later, after predication. The extra price,
+// however, is that finding the locations where the implicit uses need
+// to be added, and updating the live ranges will be more involved.
+//
+// An additional problem appears when subregister liveness tracking is
+// enabled. In such a scenario, the live interval for the super-register
+// will have live ranges for each subregister (i.e. subranges). This sub-
+// range contains all liveness information about the subregister, except
+// for one case: a "read-undef" flag from another subregister will not
+// be reflected: given
+// vreg1:subreg_hireg<def,read-undef> = ... ; "undefines" subreg_loreg
+// the subrange for subreg_loreg will not have any indication that it is
+// undefined at this point. Calculating subregister liveness based only
+// on the information from the subrange may create a segment which spans
+// over such a "read-undef" flag. This would create inconsistencies in
+// the liveness data, resulting in assertions or incorrect code.
+// Example:
+// vreg1:subreg_loreg<def> = ...
+// vreg1:subreg_hireg<def, read-undef> = ... ; "undefines" subreg_loreg
+// ...
+// vreg1:subreg_loreg<def> = A2_tfrt ... ; may end up with imp-use
+// ; of subreg_loreg
+// The remedy takes advantage of the fact that at this point we have
+// an unconditional definition of the subregister. What this means is
+// that any preceding value in this subregister will be overwritten,
+// or in other words, the last use before this def is a kill. This also
+// implies that the first of the predicated transfers at this location
+// should not have any implicit uses.
+// Assume for a moment that no part of the corresponding super-register
+// is used as a source. In such a case, the entire super-register can be
+// considered undefined immediately before this instruction. Because of
+// that, we can insert an IMPLICIT_DEF of the super-register at this
+// location, which will cause it to be reflected in all the associated
+// subranges. What is important here is that if an IMPLICIT_DEF of
+// subreg_loreg was used, we would lose the indication that subreg_hireg
+// is also considered undefined. This could lead to having implicit uses
+// incorrectly added.
+//
+// What is left is the two cases when the super-register is used as a
+// source.
+// * Case 1: the used part is the same as the one that is defined:
+// vreg1<def> = ...
+// ...
+// vreg1:subreg_loreg<def,read-undef> = C2_mux ..., vreg1:subreg_loreg
+// In the end, the subreg_loreg should be marked as live at the point of
+// the splitting:
+// vreg1:subreg_loreg<def,read-undef> = A2_tfrt ; should have imp-use
+// vreg1:subreg_loreg<def,read-undef> = A2_tfrf ; should have imp-use
+// Hence, an IMPLICIT_DEF of only vreg1:subreg_hireg would be sufficient.
+// * Case 2: the used part does not overlap the part being defined:
+// vreg1<def> = ...
+// ...
+// vreg1:subreg_loreg<def,read-undef> = C2_mux ..., vreg1:subreg_hireg
+// For this case, we insert an IMPLICIT_DEF of vreg1:subreg_hireg after
+// the C2_mux.
+
#define DEBUG_TYPE "expand-condsets"
-#include "HexagonTargetMachine.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -76,6 +156,11 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iterator>
+#include <set>
+#include <utility>
+
using namespace llvm;
static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit",
@@ -103,22 +188,26 @@ namespace {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "Hexagon Expand Condsets";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
private:
const HexagonInstrInfo *HII;
const TargetRegisterInfo *TRI;
+ MachineDominatorTree *MDT;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ std::set<MachineInstr*> LocalImpDefs;
bool CoaLimitActive, TfrLimitActive;
unsigned CoaLimit, TfrLimit, CoaCounter, TfrCounter;
@@ -131,6 +220,9 @@ namespace {
return Reg == RR.Reg && Sub == RR.Sub;
}
bool operator!= (RegisterRef RR) const { return !operator==(RR); }
+ bool operator< (RegisterRef RR) const {
+ return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub);
+ }
unsigned Reg, Sub;
};
@@ -138,44 +230,44 @@ namespace {
enum { Sub_Low = 0x1, Sub_High = 0x2, Sub_None = (Sub_Low | Sub_High) };
enum { Exec_Then = 0x10, Exec_Else = 0x20 };
unsigned getMaskForSub(unsigned Sub);
- bool isCondset(const MachineInstr *MI);
+ bool isCondset(const MachineInstr &MI);
+ LaneBitmask getLaneMask(unsigned Reg, unsigned Sub);
void addRefToMap(RegisterRef RR, ReferenceMap &Map, unsigned Exec);
bool isRefInMap(RegisterRef, ReferenceMap &Map, unsigned Exec);
- LiveInterval::iterator nextSegment(LiveInterval &LI, SlotIndex S);
- LiveInterval::iterator prevSegment(LiveInterval &LI, SlotIndex S);
- void makeDefined(unsigned Reg, SlotIndex S, bool SetDef);
- void makeUndead(unsigned Reg, SlotIndex S);
- void shrinkToUses(unsigned Reg, LiveInterval &LI);
- void updateKillFlags(unsigned Reg, LiveInterval &LI);
- void terminateSegment(LiveInterval::iterator LT, SlotIndex S,
- LiveInterval &LI);
- void addInstrToLiveness(MachineInstr *MI);
- void removeInstrFromLiveness(MachineInstr *MI);
+ void removeImpDefSegments(LiveRange &Range);
+ void updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range);
+ void updateKillFlags(unsigned Reg);
+ void updateDeadFlags(unsigned Reg);
+ void recalculateLiveInterval(unsigned Reg);
+ void removeInstr(MachineInstr &MI);
+ void updateLiveness(std::set<unsigned> &RegSet, bool Recalc,
+ bool UpdateKills, bool UpdateDeads);
unsigned getCondTfrOpcode(const MachineOperand &SO, bool Cond);
- MachineInstr *genTfrFor(MachineOperand &SrcOp, unsigned DstR,
- unsigned DstSR, const MachineOperand &PredOp, bool Cond);
- bool split(MachineInstr *MI);
- bool splitInBlock(MachineBasicBlock &B);
+ MachineInstr *genCondTfrFor(MachineOperand &SrcOp,
+ MachineBasicBlock::iterator At, unsigned DstR,
+ unsigned DstSR, const MachineOperand &PredOp, bool PredSense,
+ bool ReadUndef, bool ImpUse);
+ bool split(MachineInstr &MI, std::set<unsigned> &UpdRegs);
+ bool splitInBlock(MachineBasicBlock &B, std::set<unsigned> &UpdRegs);
bool isPredicable(MachineInstr *MI);
MachineInstr *getReachingDefForPred(RegisterRef RD,
MachineBasicBlock::iterator UseIt, unsigned PredR, bool Cond);
- bool canMoveOver(MachineInstr *MI, ReferenceMap &Defs, ReferenceMap &Uses);
- bool canMoveMemTo(MachineInstr *MI, MachineInstr *ToI, bool IsDown);
- void predicateAt(RegisterRef RD, MachineInstr *MI,
- MachineBasicBlock::iterator Where, unsigned PredR, bool Cond);
+ bool canMoveOver(MachineInstr &MI, ReferenceMap &Defs, ReferenceMap &Uses);
+ bool canMoveMemTo(MachineInstr &MI, MachineInstr &ToI, bool IsDown);
+ void predicateAt(const MachineOperand &DefOp, MachineInstr &MI,
+ MachineBasicBlock::iterator Where,
+ const MachineOperand &PredOp, bool Cond,
+ std::set<unsigned> &UpdRegs);
void renameInRange(RegisterRef RO, RegisterRef RN, unsigned PredR,
bool Cond, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last);
- bool predicate(MachineInstr *TfrI, bool Cond);
- bool predicateInBlock(MachineBasicBlock &B);
-
- void postprocessUndefImplicitUses(MachineBasicBlock &B);
- void removeImplicitUses(MachineInstr *MI);
- void removeImplicitUses(MachineBasicBlock &B);
+ bool predicate(MachineInstr &TfrI, bool Cond, std::set<unsigned> &UpdRegs);
+ bool predicateInBlock(MachineBasicBlock &B,
+ std::set<unsigned> &UpdRegs);
bool isIntReg(RegisterRef RR, unsigned &BW);
bool isIntraBlocks(LiveInterval &LI);
@@ -186,6 +278,13 @@ namespace {
char HexagonExpandCondsets::ID = 0;
+INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets",
+ "Hexagon Expand Condsets", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets",
+ "Hexagon Expand Condsets", false, false)
unsigned HexagonExpandCondsets::getMaskForSub(unsigned Sub) {
switch (Sub) {
@@ -199,9 +298,8 @@ unsigned HexagonExpandCondsets::getMaskForSub(unsigned Sub) {
llvm_unreachable("Invalid subregister");
}
-
-bool HexagonExpandCondsets::isCondset(const MachineInstr *MI) {
- unsigned Opc = MI->getOpcode();
+bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
case Hexagon::C2_mux:
case Hexagon::C2_muxii:
@@ -215,6 +313,13 @@ bool HexagonExpandCondsets::isCondset(const MachineInstr *MI) {
}
+LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ return Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+
void HexagonExpandCondsets::addRefToMap(RegisterRef RR, ReferenceMap &Map,
unsigned Exec) {
unsigned Mask = getMaskForSub(RR.Sub) | Exec;
@@ -238,408 +343,231 @@ bool HexagonExpandCondsets::isRefInMap(RegisterRef RR, ReferenceMap &Map,
}
-LiveInterval::iterator HexagonExpandCondsets::nextSegment(LiveInterval &LI,
- SlotIndex S) {
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- if (I->start >= S)
- return I;
- }
- return LI.end();
-}
-
-
-LiveInterval::iterator HexagonExpandCondsets::prevSegment(LiveInterval &LI,
- SlotIndex S) {
- LiveInterval::iterator P = LI.end();
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- if (I->end > S)
- return P;
- P = I;
- }
- return P;
-}
-
-
-/// Find the implicit use of register Reg in slot index S, and make sure
-/// that the "defined" flag is set to SetDef. While the mux expansion is
-/// going on, predicated instructions will have implicit uses of the
-/// registers that are being defined. This is to keep any preceding
-/// definitions live. If there is no preceding definition, the implicit
-/// use will be marked as "undef", otherwise it will be "defined". This
-/// function is used to update the flag.
-void HexagonExpandCondsets::makeDefined(unsigned Reg, SlotIndex S,
- bool SetDef) {
- if (!S.isRegister())
- return;
- MachineInstr *MI = LIS->getInstructionFromIndex(S);
- assert(MI && "Expecting instruction");
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
- continue;
- bool IsDef = !Op.isUndef();
- if (Op.isImplicit() && IsDef != SetDef)
- Op.setIsUndef(!SetDef);
- }
-}
-
-
-void HexagonExpandCondsets::makeUndead(unsigned Reg, SlotIndex S) {
- // If S is a block boundary, then there can still be a dead def reaching
- // this point. Instead of traversing the CFG, queue start points of all
- // live segments that begin with a register, and end at a block boundary.
- // This may "resurrect" some truly dead definitions, but doing so is
- // harmless.
- SmallVector<MachineInstr*,8> Defs;
- if (S.isBlock()) {
- LiveInterval &LI = LIS->getInterval(Reg);
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- if (!I->start.isRegister() || !I->end.isBlock())
- continue;
- MachineInstr *MI = LIS->getInstructionFromIndex(I->start);
- Defs.push_back(MI);
- }
- } else if (S.isRegister()) {
- MachineInstr *MI = LIS->getInstructionFromIndex(S);
- Defs.push_back(MI);
- }
-
- for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
- MachineInstr *MI = Defs[i];
+void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
+ auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
+ // Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
+ MachineInstr *MI = LIS->getInstructionFromIndex(K);
for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
- continue;
- Op.setIsDead(false);
- }
- }
-}
-
-
-/// Shrink the segments in the live interval for a given register to the last
-/// use before each subsequent def. Unlike LiveIntervals::shrinkToUses, this
-/// function will not mark any definitions of Reg as dead. The reason for this
-/// is that this function is used while a MUX instruction is being expanded,
-/// or while a conditional copy is undergoing predication. During these
-/// processes, there may be defs present in the instruction sequence that have
-/// not yet been removed, or there may be missing uses that have not yet been
-/// added. We want to utilize LiveIntervals::shrinkToUses as much as possible,
-/// but since it does not extend any intervals that are too short, we need to
-/// pre-emptively extend them here in anticipation of further changes.
-void HexagonExpandCondsets::shrinkToUses(unsigned Reg, LiveInterval &LI) {
- SmallVector<MachineInstr*,4> Deads;
- LIS->shrinkToUses(&LI, &Deads);
- // Need to undo the deadification made by "shrinkToUses". It's easier to
- // do it here, since we have a list of all instructions that were just
- // marked as dead.
- for (unsigned i = 0, n = Deads.size(); i < n; ++i) {
- MachineInstr *MI = Deads[i];
- // Clear the "dead" flag.
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
+ if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
continue;
- Op.setIsDead(false);
+ LaneBitmask SLM = getLaneMask(Reg, Op.getSubReg());
+ if ((SLM & LM) == SLM) {
+ // Only set the kill flag on the first encountered use of Reg in this
+ // instruction.
+ Op.setIsKill(true);
+ break;
+ }
}
- // Extend the live segment to the beginning of the next one.
- LiveInterval::iterator End = LI.end();
- SlotIndex S = LIS->getInstructionIndex(MI).getRegSlot();
- LiveInterval::iterator T = LI.FindSegmentContaining(S);
- assert(T != End);
- LiveInterval::iterator N = std::next(T);
- if (N != End)
- T->end = N->start;
- else
- T->end = LIS->getMBBEndIdx(MI->getParent());
- }
- updateKillFlags(Reg, LI);
-}
-
+ };
-/// Given an updated live interval LI for register Reg, update the kill flags
-/// in instructions using Reg to reflect the liveness changes.
-void HexagonExpandCondsets::updateKillFlags(unsigned Reg, LiveInterval &LI) {
- MRI->clearKillFlags(Reg);
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- SlotIndex EX = I->end;
- if (!EX.isRegister())
+ LiveInterval &LI = LIS->getInterval(Reg);
+ for (auto I = LI.begin(), E = LI.end(); I != E; ++I) {
+ if (!I->end.isRegister())
continue;
- MachineInstr *MI = LIS->getInstructionFromIndex(EX);
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
+ // Do not mark the end of the segment as <kill>, if the next segment
+ // starts with a predicated instruction.
+ auto NextI = std::next(I);
+ if (NextI != E && NextI->start.isRegister()) {
+ MachineInstr *DefI = LIS->getInstructionFromIndex(NextI->start);
+ if (HII->isPredicated(*DefI))
continue;
- // Only set the kill flag on the first encountered use of Reg in this
- // instruction.
- Op.setIsKill(true);
- break;
}
+ bool WholeReg = true;
+ if (LI.hasSubRanges()) {
+ auto EndsAtI = [I] (LiveInterval::SubRange &S) -> bool {
+ LiveRange::iterator F = S.find(I->end);
+ return F != S.end() && I->end == F->end;
+ };
+ // Check if all subranges end at I->end. If so, make sure to kill
+ // the whole register.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (EndsAtI(S))
+ KillAt(I->end, S.LaneMask);
+ else
+ WholeReg = false;
+ }
+ }
+ if (WholeReg)
+ KillAt(I->end, MRI->getMaxLaneMaskForVReg(Reg));
}
}
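
(A minimal sketch of the lane-mask containment test used by the KillAt lambda above, (SLM & LM) == SLM; it is not part of this patch and the mask values are invented, since the real lane masks come from TableGen. A subregister use gets the kill flag only when its lanes are a subset of the lanes being killed.)

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t LoMask   = 0x1;             // hypothetical mask for subreg_loreg
  const uint32_t HiMask   = 0x2;             // hypothetical mask for subreg_hireg
  const uint32_t FullMask = LoMask | HiMask; // whole double register

  auto contained = [](uint32_t SLM, uint32_t LM) { return (SLM & LM) == SLM; };

  assert(contained(LoMask, FullMask));   // a loreg use is covered by a whole-register kill
  assert(!contained(FullMask, LoMask));  // a whole-register use is not covered by a loreg-only kill
  return 0;
}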
-/// When adding a new instruction to liveness, the newly added definition
-/// will start a new live segment. This may happen at a position that falls
-/// within an existing live segment. In such case that live segment needs to
-/// be truncated to make room for the new segment. Ultimately, the truncation
-/// will occur at the last use, but for now the segment can be terminated
-/// right at the place where the new segment will start. The segments will be
-/// shrunk-to-uses later.
-void HexagonExpandCondsets::terminateSegment(LiveInterval::iterator LT,
- SlotIndex S, LiveInterval &LI) {
- // Terminate the live segment pointed to by LT within a live interval LI.
- if (LT == LI.end())
- return;
+void HexagonExpandCondsets::removeImpDefSegments(LiveRange &Range) {
+ auto StartImpDef = [this] (LiveRange::Segment &S) -> bool {
+ return S.start.isRegister() &&
+ LocalImpDefs.count(LIS->getInstructionFromIndex(S.start));
+ };
+ Range.segments.erase(std::remove_if(Range.begin(), Range.end(), StartImpDef),
+ Range.end());
+}
- VNInfo *OldVN = LT->valno;
- SlotIndex EX = LT->end;
- LT->end = S;
- // If LT does not end at a block boundary, the termination is done.
- if (!EX.isBlock())
+void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
+ LiveRange &Range) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ if (Range.empty())
return;
- // If LT ended at a block boundary, it's possible that its value number
- // is picked up at the beginning other blocks. Create a new value number
- // and change such blocks to use it instead.
- VNInfo *NewVN = 0;
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- if (!I->start.isBlock() || I->valno != OldVN)
- continue;
- // Generate on-demand a new value number that is defined by the
- // block beginning (i.e. -phi).
- if (!NewVN)
- NewVN = LI.getNextValue(I->start, LIS->getVNInfoAllocator());
- I->valno = NewVN;
- }
-}
+ auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+ if (!Op.isReg() || !Op.isDef())
+ return false;
+ unsigned DR = Op.getReg(), DSR = Op.getSubReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
+ return false;
+ LaneBitmask SLM = getLaneMask(DR, DSR);
+ return (SLM & LM) != 0;
+ };
+ // The splitting step will create pairs of predicated definitions without
+ // any implicit uses (since implicit uses would interfere with predication).
+ // This can cause the reaching defs to become dead after live range
+ // recomputation, even though they are not really dead.
+ // We need to identify predicated defs that need implicit uses, and
+ // dead defs that are not really dead, and correct both problems.
+
+ SetVector<MachineBasicBlock*> Defs;
+ auto Dominate = [this] (SetVector<MachineBasicBlock*> &Defs,
+ MachineBasicBlock *Dest) -> bool {
+ for (MachineBasicBlock *D : Defs)
+ if (D != Dest && MDT->dominates(D, Dest))
+ return true;
-/// Add the specified instruction to live intervals. This function is used
-/// to update the live intervals while the program code is being changed.
-/// Neither the expansion of a MUX, nor the predication are atomic, and this
-/// function is used to update the live intervals while these transformations
-/// are being done.
-void HexagonExpandCondsets::addInstrToLiveness(MachineInstr *MI) {
- SlotIndex MX = LIS->isNotInMIMap(MI) ? LIS->InsertMachineInstrInMaps(MI)
- : LIS->getInstructionIndex(MI);
- DEBUG(dbgs() << "adding liveness info for instr\n " << MX << " " << *MI);
-
- MX = MX.getRegSlot();
- bool Predicated = HII->isPredicated(MI);
- MachineBasicBlock *MB = MI->getParent();
-
- // Strip all implicit uses from predicated instructions. They will be
- // added again, according to the updated information.
- if (Predicated)
- removeImplicitUses(MI);
-
- // For each def in MI we need to insert a new live segment starting at MX
- // into the interval. If there already exists a live segment in the interval
- // that contains MX, we need to terminate it at MX.
- SmallVector<RegisterRef,2> Defs;
- for (auto &Op : MI->operands())
- if (Op.isReg() && Op.isDef())
- Defs.push_back(RegisterRef(Op));
-
- for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
- unsigned DefR = Defs[i].Reg;
- LiveInterval &LID = LIS->getInterval(DefR);
- DEBUG(dbgs() << "adding def " << PrintReg(DefR, TRI)
- << " with interval\n " << LID << "\n");
- // If MX falls inside of an existing live segment, terminate it.
- LiveInterval::iterator LT = LID.FindSegmentContaining(MX);
- if (LT != LID.end())
- terminateSegment(LT, MX, LID);
- DEBUG(dbgs() << "after terminating segment\n " << LID << "\n");
-
- // Create a new segment starting from MX.
- LiveInterval::iterator P = prevSegment(LID, MX), N = nextSegment(LID, MX);
- SlotIndex EX;
- VNInfo *VN = LID.getNextValue(MX, LIS->getVNInfoAllocator());
- if (N == LID.end()) {
- // There is no live segment after MX. End this segment at the end of
- // the block.
- EX = LIS->getMBBEndIdx(MB);
- } else {
- // If the next segment starts at the block boundary, end the new segment
- // at the boundary of the preceding block (i.e. the previous index).
- // Otherwise, end the segment at the beginning of the next segment. In
- // either case it will be "shrunk-to-uses" later.
- EX = N->start.isBlock() ? N->start.getPrevIndex() : N->start;
+ MachineBasicBlock *Entry = &Dest->getParent()->front();
+ SetVector<MachineBasicBlock*> Work(Dest->pred_begin(), Dest->pred_end());
+ for (unsigned i = 0; i < Work.size(); ++i) {
+ MachineBasicBlock *B = Work[i];
+ if (Defs.count(B))
+ continue;
+ if (B == Entry)
+ return false;
+ for (auto *P : B->predecessors())
+ Work.insert(P);
}
- if (Predicated) {
- // Predicated instruction will have an implicit use of the defined
- // register. This is necessary so that this definition will not make
- // any previous definitions dead. If there are no previous live
- // segments, still add the implicit use, but make it "undef".
- // Because of the implicit use, the preceding definition is not
- // dead. Mark is as such (if necessary).
- MachineOperand ImpUse = MachineOperand::CreateReg(DefR, false, true);
- ImpUse.setSubReg(Defs[i].Sub);
- bool Undef = false;
- if (P == LID.end())
- Undef = true;
- else {
- // If the previous segment extends to the end of the previous block,
- // the end index may actually be the beginning of this block. If
- // the previous segment ends at a block boundary, move it back by one,
- // to get the proper block for it.
- SlotIndex PE = P->end.isBlock() ? P->end.getPrevIndex() : P->end;
- MachineBasicBlock *PB = LIS->getMBBFromIndex(PE);
- if (PB != MB && !LIS->isLiveInToMBB(LID, MB))
- Undef = true;
- }
- if (!Undef) {
- makeUndead(DefR, P->valno->def);
- // We are adding a live use, so extend the previous segment to
- // include it.
- P->end = MX;
- } else {
- ImpUse.setIsUndef(true);
- }
+ return true;
+ };
- if (!MI->readsRegister(DefR))
- MI->addOperand(ImpUse);
- if (N != LID.end())
- makeDefined(DefR, N->start, true);
- }
- LiveRange::Segment NR = LiveRange::Segment(MX, EX, VN);
- LID.addSegment(NR);
- DEBUG(dbgs() << "added a new segment " << NR << "\n " << LID << "\n");
- shrinkToUses(DefR, LID);
- DEBUG(dbgs() << "updated imp-uses: " << *MI);
- LID.verify();
+ // First, try to extend live range within individual basic blocks. This
+ // will leave us only with dead defs that do not reach any predicated
+ // defs in the same block.
+ SmallVector<SlotIndex,4> PredDefs;
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister())
+ continue;
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ if (LocalImpDefs.count(DefI))
+ continue;
+ Defs.insert(DefI->getParent());
+ if (HII->isPredicated(*DefI))
+ PredDefs.push_back(Seg.start);
+ }
+ for (auto &SI : PredDefs) {
+ MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+ if (Range.extendInBlock(LIS->getMBBStartIdx(BB), SI))
+ SI = SlotIndex();
}
- // For each use in MI:
- // - If there is no live segment that contains MX for the used register,
- // extend the previous one. Ignore implicit uses.
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse() || Op.isImplicit() || Op.isUndef())
+ // Calculate reachability for those predicated defs that were not handled
+ // by the in-block extension.
+ SmallVector<SlotIndex,4> ExtTo;
+ for (auto &SI : PredDefs) {
+ if (!SI.isValid())
+ continue;
+ MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+ if (BB->pred_empty())
+ continue;
+ // If the defs from this range reach SI via all predecessors, it is live.
+ if (Dominate(Defs, BB))
+ ExtTo.push_back(SI);
+ }
+ LIS->extendToIndices(Range, ExtTo);
+
+ // Remove <dead> flags from all defs that are not dead after live range
+ // extension, and collect all def operands. They will be used to generate
+ // the necessary implicit uses.
+ std::set<RegisterRef> DefRegs;
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister())
continue;
- unsigned UseR = Op.getReg();
- LiveInterval &LIU = LIS->getInterval(UseR);
- // Find the last segment P that starts before MX.
- LiveInterval::iterator P = LIU.FindSegmentContaining(MX);
- if (P == LIU.end())
- P = prevSegment(LIU, MX);
-
- assert(P != LIU.end() && "MI uses undefined register?");
- SlotIndex EX = P->end;
- // If P contains MX, there is not much to do.
- if (EX > MX) {
- Op.setIsKill(false);
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ if (LocalImpDefs.count(DefI))
continue;
+ for (auto &Op : DefI->operands()) {
+ if (Seg.start.isDead() || !IsRegDef(Op))
+ continue;
+ DefRegs.insert(Op);
+ Op.setIsDead(false);
}
- // Otherwise, extend P to "next(MX)".
- P->end = MX.getNextIndex();
- Op.setIsKill(true);
- // Get the old "kill" instruction, and remove the kill flag.
- if (MachineInstr *KI = LIS->getInstructionFromIndex(MX))
- KI->clearRegisterKills(UseR, nullptr);
- shrinkToUses(UseR, LIU);
- LIU.verify();
}
-}
-/// Update the live interval information to reflect the removal of the given
-/// instruction from the program. As with "addInstrToLiveness", this function
-/// is called while the program code is being changed.
-void HexagonExpandCondsets::removeInstrFromLiveness(MachineInstr *MI) {
- SlotIndex MX = LIS->getInstructionIndex(MI).getRegSlot();
- DEBUG(dbgs() << "removing instr\n " << MX << " " << *MI);
+ // Finally, add implicit uses to each predicated def that is reached
+ // by other defs. Remove segments started by implicit-defs first, since
+ // they do not define registers.
+ removeImpDefSegments(Range);
- // For each def in MI:
- // If MI starts a live segment, merge this segment with the previous segment.
- //
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isDef())
+ for (auto &Seg : Range) {
+ if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
continue;
- unsigned DefR = Op.getReg();
- LiveInterval &LID = LIS->getInterval(DefR);
- LiveInterval::iterator LT = LID.FindSegmentContaining(MX);
- assert(LT != LID.end() && "Expecting live segments");
- DEBUG(dbgs() << "removing def at " << MX << " of " << PrintReg(DefR, TRI)
- << " with interval\n " << LID << "\n");
- if (LT->start != MX)
+ MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+ if (!HII->isPredicated(*DefI))
continue;
+ MachineFunction &MF = *DefI->getParent()->getParent();
+ // Construct the set of all necessary implicit uses, based on the def
+ // operands in the instruction.
+ std::set<RegisterRef> ImpUses;
+ for (auto &Op : DefI->operands())
+ if (Op.isReg() && Op.isDef() && DefRegs.count(Op))
+ ImpUses.insert(Op);
+ for (RegisterRef R : ImpUses)
+ MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
+ }
+}
- VNInfo *MVN = LT->valno;
- if (LT != LID.begin()) {
- // If the current live segment is not the first, the task is easy. If
- // the previous segment continues into the current block, extend it to
- // the end of the current one, and merge the value numbers.
- // Otherwise, remove the current segment, and make the end of it "undef".
- LiveInterval::iterator P = std::prev(LT);
- SlotIndex PE = P->end.isBlock() ? P->end.getPrevIndex() : P->end;
- MachineBasicBlock *MB = MI->getParent();
- MachineBasicBlock *PB = LIS->getMBBFromIndex(PE);
- if (PB != MB && !LIS->isLiveInToMBB(LID, MB)) {
- makeDefined(DefR, LT->end, false);
- LID.removeSegment(*LT);
- } else {
- // Make the segments adjacent, so that merge-vn can also merge the
- // segments.
- P->end = LT->start;
- makeUndead(DefR, P->valno->def);
- LID.MergeValueNumberInto(MVN, P->valno);
- }
- } else {
- LiveInterval::iterator N = std::next(LT);
- LiveInterval::iterator RmB = LT, RmE = N;
- while (N != LID.end()) {
- // Iterate until the first register-based definition is found
- // (i.e. skip all block-boundary entries).
- LiveInterval::iterator Next = std::next(N);
- if (N->start.isRegister()) {
- makeDefined(DefR, N->start, false);
- break;
- }
- if (N->end.isRegister()) {
- makeDefined(DefR, N->end, false);
- RmE = Next;
- break;
- }
- RmE = Next;
- N = Next;
- }
- // Erase the segments in one shot to avoid invalidating iterators.
- LID.segments.erase(RmB, RmE);
- }
-
- bool VNUsed = false;
- for (LiveInterval::iterator I = LID.begin(), E = LID.end(); I != E; ++I) {
- if (I->valno != MVN)
- continue;
- VNUsed = true;
- break;
- }
- if (!VNUsed)
- MVN->markUnused();
- DEBUG(dbgs() << "new interval: ");
- if (!LID.empty()) {
- DEBUG(dbgs() << LID << "\n");
- LID.verify();
- } else {
- DEBUG(dbgs() << "<empty>\n");
- LIS->removeInterval(DefR);
+void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ updateDeadsInRange(Reg, S.LaneMask, S);
+ LIS->shrinkToUses(S, Reg);
+ // LI::shrinkToUses will add segments started by implicit-defs.
+ // Remove them again.
+ removeImpDefSegments(S);
}
+ LI.clear();
+ LIS->constructMainRangeFromSubranges(LI);
+ } else {
+ updateDeadsInRange(Reg, MRI->getMaxLaneMaskForVReg(Reg), LI);
}
+}
- // For uses there is nothing to do. The intervals will be updated via
- // shrinkToUses.
- SmallVector<unsigned,4> Uses;
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse())
- continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
- continue;
- Uses.push_back(R);
- }
+
+void HexagonExpandCondsets::recalculateLiveInterval(unsigned Reg) {
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+}
+
+void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
LIS->RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
- for (unsigned i = 0, n = Uses.size(); i < n; ++i) {
- LiveInterval &LI = LIS->getInterval(Uses[i]);
- shrinkToUses(Uses[i], LI);
+ MI.eraseFromParent();
+}
+
+
+void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
+ bool Recalc, bool UpdateKills, bool UpdateDeads) {
+ UpdateKills |= UpdateDeads;
+ for (auto R : RegSet) {
+ if (Recalc)
+ recalculateLiveInterval(R);
+ if (UpdateKills)
+ MRI->clearKillFlags(R);
+ if (UpdateDeads)
+ updateDeadFlags(R);
+ // Fixing <dead> flags may extend live ranges, so reset <kill> flags
+ // after that.
+ if (UpdateKills)
+ updateKillFlags(R);
+ LIS->getInterval(R).verify();
}
}
@@ -647,7 +575,7 @@ void HexagonExpandCondsets::removeInstrFromLiveness(MachineInstr *MI) {
/// Get the opcode for a conditional transfer of the value in SO (source
/// operand). The condition (true/false) is given in Cond.
unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
- bool Cond) {
+ bool IfTrue) {
using namespace Hexagon;
if (SO.isReg()) {
unsigned PhysR;
@@ -664,14 +592,14 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
switch (RC->getSize()) {
case 4:
- return Cond ? A2_tfrt : A2_tfrf;
+ return IfTrue ? A2_tfrt : A2_tfrf;
case 8:
- return Cond ? A2_tfrpt : A2_tfrpf;
+ return IfTrue ? A2_tfrpt : A2_tfrpf;
}
llvm_unreachable("Invalid register operand");
}
if (SO.isImm() || SO.isFPImm())
- return Cond ? C2_cmoveit : C2_cmoveif;
+ return IfTrue ? C2_cmoveit : C2_cmoveif;
llvm_unreachable("Unexpected source operand");
}
@@ -680,12 +608,13 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
/// destination register DstR:DstSR, and using the predicate register from
/// PredOp. The Cond argument specifies whether the predicate is to be
/// if(PredOp), or if(!PredOp).
-MachineInstr *HexagonExpandCondsets::genTfrFor(MachineOperand &SrcOp,
- unsigned DstR, unsigned DstSR, const MachineOperand &PredOp, bool Cond) {
+MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
+ MachineBasicBlock::iterator At,
+ unsigned DstR, unsigned DstSR, const MachineOperand &PredOp,
+ bool PredSense, bool ReadUndef, bool ImpUse) {
MachineInstr *MI = SrcOp.getParent();
- MachineBasicBlock &B = *MI->getParent();
- MachineBasicBlock::iterator At = MI;
- DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock &B = *At->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
// Don't avoid identity copies here (i.e. if the source and the destination
// are the same registers). It is actually better to generate them here,
@@ -693,62 +622,101 @@ MachineInstr *HexagonExpandCondsets::genTfrFor(MachineOperand &SrcOp,
// step. The predication will remove such a copy if it is unable to
// predicate.
- unsigned Opc = getCondTfrOpcode(SrcOp, Cond);
- MachineInstr *TfrI = BuildMI(B, At, DL, HII->get(Opc))
- .addReg(DstR, RegState::Define, DstSR)
+ unsigned Opc = getCondTfrOpcode(SrcOp, PredSense);
+ unsigned State = RegState::Define | (ReadUndef ? RegState::Undef : 0);
+ MachineInstrBuilder MIB = BuildMI(B, At, DL, HII->get(Opc))
+ .addReg(DstR, State, DstSR)
.addOperand(PredOp)
.addOperand(SrcOp);
+
// We don't want any kills yet.
- TfrI->clearKillInfo();
- DEBUG(dbgs() << "created an initial copy: " << *TfrI);
- return TfrI;
+ MIB->clearKillInfo();
+ DEBUG(dbgs() << "created an initial copy: " << *MIB);
+ return &*MIB;
}
/// Replace a MUX instruction MI with a pair A2_tfrt/A2_tfrf. This function
/// performs all necessary changes to complete the replacement.
-bool HexagonExpandCondsets::split(MachineInstr *MI) {
+bool HexagonExpandCondsets::split(MachineInstr &MI,
+ std::set<unsigned> &UpdRegs) {
if (TfrLimitActive) {
if (TfrCounter >= TfrLimit)
return false;
TfrCounter++;
}
- DEBUG(dbgs() << "\nsplitting BB#" << MI->getParent()->getNumber()
- << ": " << *MI);
- MachineOperand &MD = MI->getOperand(0); // Definition
- MachineOperand &MP = MI->getOperand(1); // Predicate register
+ DEBUG(dbgs() << "\nsplitting BB#" << MI.getParent()->getNumber() << ": "
+ << MI);
+ MachineOperand &MD = MI.getOperand(0); // Definition
+ MachineOperand &MP = MI.getOperand(1); // Predicate register
+ MachineOperand &MS1 = MI.getOperand(2); // Source value #1
+ MachineOperand &MS2 = MI.getOperand(3); // Source value #2
assert(MD.isDef());
unsigned DR = MD.getReg(), DSR = MD.getSubReg();
+ bool ReadUndef = MD.isUndef();
+ MachineBasicBlock::iterator At = MI;
+
+ if (ReadUndef && DSR != 0 && MRI->shouldTrackSubRegLiveness(DR)) {
+ unsigned NewSR = 0;
+ MachineBasicBlock::iterator DefAt = At;
+ bool SameReg = (MS1.isReg() && DR == MS1.getReg()) ||
+ (MS2.isReg() && DR == MS2.getReg());
+ if (SameReg) {
+ NewSR = (DSR == Hexagon::subreg_loreg) ? Hexagon::subreg_hireg
+ : Hexagon::subreg_loreg;
+ // Advance the insertion point if the subregisters differ between
+ // the source and the target (with the same super-register).
+ // Note: this case has never occurred during tests.
+ if ((MS1.isReg() && NewSR == MS1.getSubReg()) ||
+ (MS2.isReg() && NewSR == MS2.getSubReg()))
+ ++DefAt;
+ }
+ // Use "At", since "DefAt" may be end().
+ MachineBasicBlock &B = *At->getParent();
+ DebugLoc DL = At->getDebugLoc();
+ auto ImpD = BuildMI(B, DefAt, DL, HII->get(TargetOpcode::IMPLICIT_DEF))
+ .addReg(DR, RegState::Define, NewSR);
+ LIS->InsertMachineInstrInMaps(*ImpD);
+ LocalImpDefs.insert(&*ImpD);
+ }
// First, create the two individual conditional transfers, and add each
// of them to the live intervals information. Do that first and then remove
// the old instruction from live intervals.
- if (MachineInstr *TfrT = genTfrFor(MI->getOperand(2), DR, DSR, MP, true))
- addInstrToLiveness(TfrT);
- if (MachineInstr *TfrF = genTfrFor(MI->getOperand(3), DR, DSR, MP, false))
- addInstrToLiveness(TfrF);
- removeInstrFromLiveness(MI);
-
+ MachineInstr *TfrT =
+ genCondTfrFor(MI.getOperand(2), At, DR, DSR, MP, true, ReadUndef, false);
+ MachineInstr *TfrF =
+ genCondTfrFor(MI.getOperand(3), At, DR, DSR, MP, false, ReadUndef, true);
+ LIS->InsertMachineInstrInMaps(*TfrT);
+ LIS->InsertMachineInstrInMaps(*TfrF);
+
+ // Will need to recalculate live intervals for all registers in MI.
+ for (auto &Op : MI.operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+
+ removeInstr(MI);
return true;
}
-/// Split all MUX instructions in the given block into pairs of contitional
+/// Split all MUX instructions in the given block into pairs of conditional
/// transfers.
-bool HexagonExpandCondsets::splitInBlock(MachineBasicBlock &B) {
+bool HexagonExpandCondsets::splitInBlock(MachineBasicBlock &B,
+ std::set<unsigned> &UpdRegs) {
bool Changed = false;
MachineBasicBlock::iterator I, E, NextI;
for (I = B.begin(), E = B.end(); I != E; I = NextI) {
NextI = std::next(I);
- if (isCondset(I))
- Changed |= split(I);
+ if (isCondset(*I))
+ Changed |= split(*I, UpdRegs);
}
return Changed;
}
bool HexagonExpandCondsets::isPredicable(MachineInstr *MI) {
- if (HII->isPredicated(MI) || !HII->isPredicable(MI))
+ if (HII->isPredicated(*MI) || !HII->isPredicable(*MI))
return false;
if (MI->hasUnmodeledSideEffects() || MI->mayStore())
return false;
@@ -784,8 +752,8 @@ MachineInstr *HexagonExpandCondsets::getReachingDefForPred(RegisterRef RD,
MachineInstr *MI = &*I;
// Check if this instruction can be ignored, i.e. if it is predicated
// on the complementary condition.
- if (PredValid && HII->isPredicated(MI)) {
- if (MI->readsRegister(PredR) && (Cond != HII->isPredicatedTrue(MI)))
+ if (PredValid && HII->isPredicated(*MI)) {
+ if (MI->readsRegister(PredR) && (Cond != HII->isPredicatedTrue(*MI)))
continue;
}
@@ -821,12 +789,12 @@ MachineInstr *HexagonExpandCondsets::getReachingDefForPred(RegisterRef RD,
/// the maps Defs and Uses. These maps reflect the conditional defs and uses
/// that depend on the same predicate register to allow moving instructions
/// over instructions predicated on the opposite condition.
-bool HexagonExpandCondsets::canMoveOver(MachineInstr *MI, ReferenceMap &Defs,
- ReferenceMap &Uses) {
+bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs,
+ ReferenceMap &Uses) {
// In order to be able to safely move MI over instructions that define
// "Defs" and use "Uses", no def operand from MI can be defined or used
// and no use operand can be defined.
- for (auto &Op : MI->operands()) {
+ for (auto &Op : MI.operands()) {
if (!Op.isReg())
continue;
RegisterRef RR = Op;
@@ -848,19 +816,19 @@ bool HexagonExpandCondsets::canMoveOver(MachineInstr *MI, ReferenceMap &Defs,
/// Check if the instruction accessing memory (TheI) can be moved to the
/// location ToI.
-bool HexagonExpandCondsets::canMoveMemTo(MachineInstr *TheI, MachineInstr *ToI,
- bool IsDown) {
- bool IsLoad = TheI->mayLoad(), IsStore = TheI->mayStore();
+bool HexagonExpandCondsets::canMoveMemTo(MachineInstr &TheI, MachineInstr &ToI,
+ bool IsDown) {
+ bool IsLoad = TheI.mayLoad(), IsStore = TheI.mayStore();
if (!IsLoad && !IsStore)
return true;
if (HII->areMemAccessesTriviallyDisjoint(TheI, ToI))
return true;
- if (TheI->hasUnmodeledSideEffects())
+ if (TheI.hasUnmodeledSideEffects())
return false;
MachineBasicBlock::iterator StartI = IsDown ? TheI : ToI;
MachineBasicBlock::iterator EndI = IsDown ? ToI : TheI;
- bool Ordered = TheI->hasOrderedMemoryRef();
+ bool Ordered = TheI.hasOrderedMemoryRef();
// Search for aliased memory reference in (StartI, EndI).
for (MachineBasicBlock::iterator I = std::next(StartI); I != EndI; ++I) {
@@ -883,8 +851,11 @@ bool HexagonExpandCondsets::canMoveMemTo(MachineInstr *TheI, MachineInstr *ToI,
/// Generate a predicated version of MI (where the condition is given via
/// PredR and Cond) at the point indicated by Where.
-void HexagonExpandCondsets::predicateAt(RegisterRef RD, MachineInstr *MI,
- MachineBasicBlock::iterator Where, unsigned PredR, bool Cond) {
+void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
+ MachineInstr &MI,
+ MachineBasicBlock::iterator Where,
+ const MachineOperand &PredOp, bool Cond,
+ std::set<unsigned> &UpdRegs) {
// The problem with updating live intervals is that we can move one def
// past another def. In particular, this can happen when moving an A2_tfrt
// over an A2_tfrf defining the same register. From the point of view of
@@ -896,33 +867,34 @@ void HexagonExpandCondsets::predicateAt(RegisterRef RD, MachineInstr *MI,
// target location, (2) update liveness, (3) delete the old instruction,
// and (4) update liveness again.
- MachineBasicBlock &B = *MI->getParent();
+ MachineBasicBlock &B = *MI.getParent();
DebugLoc DL = Where->getDebugLoc(); // "Where" points to an instruction.
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
unsigned PredOpc = HII->getCondOpcode(Opc, !Cond);
MachineInstrBuilder MB = BuildMI(B, Where, DL, HII->get(PredOpc));
- unsigned Ox = 0, NP = MI->getNumOperands();
+ unsigned Ox = 0, NP = MI.getNumOperands();
// Skip all defs from MI first.
while (Ox < NP) {
- MachineOperand &MO = MI->getOperand(Ox);
+ MachineOperand &MO = MI.getOperand(Ox);
if (!MO.isReg() || !MO.isDef())
break;
Ox++;
}
// Add the new def, then the predicate register, then the rest of the
// operands.
- MB.addReg(RD.Reg, RegState::Define, RD.Sub);
- MB.addReg(PredR);
+ MB.addReg(DefOp.getReg(), getRegState(DefOp), DefOp.getSubReg());
+ MB.addReg(PredOp.getReg(), PredOp.isUndef() ? RegState::Undef : 0,
+ PredOp.getSubReg());
while (Ox < NP) {
- MachineOperand &MO = MI->getOperand(Ox);
+ MachineOperand &MO = MI.getOperand(Ox);
if (!MO.isReg() || !MO.isImplicit())
MB.addOperand(MO);
Ox++;
}
MachineFunction &MF = *B.getParent();
- MachineInstr::mmo_iterator I = MI->memoperands_begin();
- unsigned NR = std::distance(I, MI->memoperands_end());
+ MachineInstr::mmo_iterator I = MI.memoperands_begin();
+ unsigned NR = std::distance(I, MI.memoperands_end());
MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(NR);
for (unsigned i = 0; i < NR; ++i)
MemRefs[i] = *I++;
@@ -930,7 +902,11 @@ void HexagonExpandCondsets::predicateAt(RegisterRef RD, MachineInstr *MI,
MachineInstr *NewI = MB;
NewI->clearKillInfo();
- addInstrToLiveness(NewI);
+ LIS->InsertMachineInstrInMaps(*NewI);
+
+ for (auto &Op : NewI->operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
}
@@ -945,9 +921,9 @@ void HexagonExpandCondsets::renameInRange(RegisterRef RO, RegisterRef RN,
MachineInstr *MI = &*I;
// Do not touch instructions that are not predicated, or are predicated
// on the opposite condition.
- if (!HII->isPredicated(MI))
+ if (!HII->isPredicated(*MI))
continue;
- if (!MI->readsRegister(PredR) || (Cond != HII->isPredicatedTrue(MI)))
+ if (!MI->readsRegister(PredR) || (Cond != HII->isPredicatedTrue(*MI)))
continue;
for (auto &Op : MI->operands()) {
@@ -965,22 +941,27 @@ void HexagonExpandCondsets::renameInRange(RegisterRef RO, RegisterRef RN,
/// For a given conditional copy, predicate the definition of the source of
/// the copy under the given condition (using the same predicate register as
/// the copy).
-bool HexagonExpandCondsets::predicate(MachineInstr *TfrI, bool Cond) {
+bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
+ std::set<unsigned> &UpdRegs) {
// TfrI - A2_tfr[tf] Instruction (not A2_tfrsi).
- unsigned Opc = TfrI->getOpcode();
+ unsigned Opc = TfrI.getOpcode();
(void)Opc;
assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
- << ": " << *TfrI);
+ << ": " << TfrI);
- MachineOperand &MD = TfrI->getOperand(0);
- MachineOperand &MP = TfrI->getOperand(1);
- MachineOperand &MS = TfrI->getOperand(2);
+ MachineOperand &MD = TfrI.getOperand(0);
+ MachineOperand &MP = TfrI.getOperand(1);
+ MachineOperand &MS = TfrI.getOperand(2);
// The source operand should be a <kill>. This is not strictly necessary,
// but it makes things a lot simpler. Otherwise, we would need to rename
// some registers, which would complicate the transformation considerably.
if (!MS.isKill())
return false;
+ // Avoid predicating instructions that define a subregister if subregister
+ // liveness tracking is not enabled.
+ if (MD.getSubReg() && !MRI->shouldTrackSubRegLiveness(MD.getReg()))
+ return false;
RegisterRef RT(MS);
unsigned PredR = MP.getReg();
@@ -1014,8 +995,8 @@ bool HexagonExpandCondsets::predicate(MachineInstr *TfrI, bool Cond) {
// By default assume that the instruction executes on the same condition
// as TfrI (Exec_Then), and also on the opposite one (Exec_Else).
unsigned Exec = Exec_Then | Exec_Else;
- if (PredValid && HII->isPredicated(MI) && MI->readsRegister(PredR))
- Exec = (Cond == HII->isPredicatedTrue(MI)) ? Exec_Then : Exec_Else;
+ if (PredValid && HII->isPredicated(*MI) && MI->readsRegister(PredR))
+ Exec = (Cond == HII->isPredicatedTrue(*MI)) ? Exec_Then : Exec_Else;
for (auto &Op : MI->operands()) {
if (!Op.isReg())
@@ -1059,48 +1040,53 @@ bool HexagonExpandCondsets::predicate(MachineInstr *TfrI, bool Cond) {
// If the target register of the TfrI (RD) is not used or defined between
// DefI and TfrI, consider moving TfrI up to DefI.
bool CanUp = canMoveOver(TfrI, Defs, Uses);
- bool CanDown = canMoveOver(DefI, Defs, Uses);
+ bool CanDown = canMoveOver(*DefI, Defs, Uses);
// The TfrI does not access memory, but DefI could. Check if it's safe
// to move DefI down to TfrI.
if (DefI->mayLoad() || DefI->mayStore())
- if (!canMoveMemTo(DefI, TfrI, true))
+ if (!canMoveMemTo(*DefI, TfrI, true))
CanDown = false;
DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
<< ", can move down: " << (CanDown ? "yes\n" : "no\n"));
MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
if (CanUp)
- predicateAt(RD, DefI, PastDefIt, PredR, Cond);
+ predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
else if (CanDown)
- predicateAt(RD, DefI, TfrIt, PredR, Cond);
+ predicateAt(MD, *DefI, TfrIt, MP, Cond, UpdRegs);
else
return false;
- if (RT != RD)
+ if (RT != RD) {
renameInRange(RT, RD, PredR, Cond, PastDefIt, TfrIt);
+ UpdRegs.insert(RT.Reg);
+ }
- // Delete the user of RT first (it should work either way, but this order
- // of deleting is more natural).
- removeInstrFromLiveness(TfrI);
- removeInstrFromLiveness(DefI);
+ removeInstr(TfrI);
+ removeInstr(*DefI);
return true;
}
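Putting the pieces of predicate() together, the intended rewrite looks roughly like this (virtual register names invented for illustration):

    ;   DefI:  %v1 = <some predicable op> ...    ; sole reaching def of %v1
    ;          ...                               ; no def/use of %v0, no clobber of %p0
    ;   TfrI:  %v0 = A2_tfrt %p0, killed %v1
    ; becomes, when either CanUp or CanDown holds:
    ;   %v0 = <predicated form of DefI> %p0, ... ; built by predicateAt()
    ; after which renameInRange() rewrites remaining uses of %v1 in the range
    ; to %v0, and both DefI and TfrI are removed.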
/// Predicate all cases of conditional copies in the specified block.
-bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B) {
+bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
+ std::set<unsigned> &UpdRegs) {
bool Changed = false;
MachineBasicBlock::iterator I, E, NextI;
for (I = B.begin(), E = B.end(); I != E; I = NextI) {
NextI = std::next(I);
unsigned Opc = I->getOpcode();
if (Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf) {
- bool Done = predicate(I, (Opc == Hexagon::A2_tfrt));
+ bool Done = predicate(*I, (Opc == Hexagon::A2_tfrt), UpdRegs);
if (!Done) {
// If we didn't predicate I, we may need to remove it in case it is
// an "identity" copy, e.g. vreg1 = A2_tfrt vreg2, vreg1.
- if (RegisterRef(I->getOperand(0)) == RegisterRef(I->getOperand(2)))
- removeInstrFromLiveness(I);
+ if (RegisterRef(I->getOperand(0)) == RegisterRef(I->getOperand(2))) {
+ for (auto &Op : I->operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+ removeInstr(*I);
+ }
}
Changed |= Done;
}
@@ -1109,51 +1095,6 @@ bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B) {
}
-void HexagonExpandCondsets::removeImplicitUses(MachineInstr *MI) {
- for (unsigned i = MI->getNumOperands(); i > 0; --i) {
- MachineOperand &MO = MI->getOperand(i-1);
- if (MO.isReg() && MO.isUse() && MO.isImplicit())
- MI->RemoveOperand(i-1);
- }
-}
-
-
-void HexagonExpandCondsets::removeImplicitUses(MachineBasicBlock &B) {
- for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
- MachineInstr *MI = &*I;
- if (HII->isPredicated(MI))
- removeImplicitUses(MI);
- }
-}
-
-
-void HexagonExpandCondsets::postprocessUndefImplicitUses(MachineBasicBlock &B) {
- // Implicit uses that are "undef" are only meaningful (outside of the
- // internals of this pass) when the instruction defines a subregister,
- // and the implicit-undef use applies to the defined register. In such
- // cases, the proper way to record the information in the IR is to mark
- // the definition as "undef", which will be interpreted as "read-undef".
- typedef SmallSet<unsigned,2> RegisterSet;
- for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
- MachineInstr *MI = &*I;
- RegisterSet Undefs;
- for (unsigned i = MI->getNumOperands(); i > 0; --i) {
- MachineOperand &MO = MI->getOperand(i-1);
- if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.isUndef()) {
- MI->RemoveOperand(i-1);
- Undefs.insert(MO.getReg());
- }
- }
- for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isDef() || !Op.getSubReg())
- continue;
- if (Undefs.count(Op.getReg()))
- Op.setIsUndef(true);
- }
- }
-}
-
-
bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
return false;
@@ -1236,7 +1177,7 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
while (L2.begin() != L2.end())
L2.removeSegment(*L2.begin());
- updateKillFlags(R1.Reg, L1);
+ updateKillFlags(R1.Reg);
DEBUG(dbgs() << "coalesced: " << L1 << "\n");
L1.verify();
@@ -1253,7 +1194,7 @@ bool HexagonExpandCondsets::coalesceSegments(MachineFunction &MF) {
MachineBasicBlock &B = *I;
for (MachineBasicBlock::iterator J = B.begin(), F = B.end(); J != F; ++J) {
MachineInstr *MI = &*J;
- if (!isCondset(MI))
+ if (!isCondset(*MI))
continue;
MachineOperand &S1 = MI->getOperand(2), &S2 = MI->getOperand(3);
if (!S1.isReg() && !S2.isReg())
@@ -1290,13 +1231,13 @@ bool HexagonExpandCondsets::coalesceSegments(MachineFunction &MF) {
if (S1.isReg()) {
RegisterRef RS = S1;
MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, true);
- if (!RDef || !HII->isPredicable(RDef))
+ if (!RDef || !HII->isPredicable(*RDef))
Done = coalesceRegisters(RD, RegisterRef(S1));
}
if (!Done && S2.isReg()) {
RegisterRef RS = S2;
MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, false);
- if (!RDef || !HII->isPredicable(RDef))
+ if (!RDef || !HII->isPredicable(*RDef))
Done = coalesceRegisters(RD, RegisterRef(S2));
}
Changed |= Done;
@@ -1306,32 +1247,59 @@ bool HexagonExpandCondsets::coalesceSegments(MachineFunction &MF) {
bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo());
TRI = MF.getSubtarget().getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
LIS = &getAnalysis<LiveIntervals>();
MRI = &MF.getRegInfo();
+ LocalImpDefs.clear();
+
+ DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
+ MF.getFunction()->getParent()));
bool Changed = false;
+ std::set<unsigned> SplitUpd, PredUpd;
// Try to coalesce the target of a mux with one of its sources.
// This could eliminate a register copy in some circumstances.
Changed |= coalesceSegments(MF);
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
- // First, simply split all muxes into a pair of conditional transfers
- // and update the live intervals to reflect the new arrangement.
- // This is done mainly to make the live interval update simpler, than it
- // would be while trying to predicate instructions at the same time.
- Changed |= splitInBlock(*I);
- // Traverse all blocks and collapse predicable instructions feeding
- // conditional transfers into predicated instructions.
- // Walk over all the instructions again, so we may catch pre-existing
- // cases that were not created in the previous step.
- Changed |= predicateInBlock(*I);
- }
+ // First, simply split all muxes into a pair of conditional transfers
+ // and update the live intervals to reflect the new arrangement. The
+ // goal is to update the kill flags, since predication will rely on
+ // them.
+ for (auto &B : MF)
+ Changed |= splitInBlock(B, SplitUpd);
+ updateLiveness(SplitUpd, true, true, false);
+
+ // Traverse all blocks and collapse predicable instructions feeding
+ // conditional transfers into predicated instructions.
+ // Walk over all the instructions again, so we may catch pre-existing
+ // cases that were not created in the previous step.
+ for (auto &B : MF)
+ Changed |= predicateInBlock(B, PredUpd);
+
+ updateLiveness(PredUpd, true, true, true);
+ // Remove from SplitUpd all registers contained in PredUpd to avoid
+ // unnecessary liveness recalculation.
+ std::set<unsigned> Diff;
+ std::set_difference(SplitUpd.begin(), SplitUpd.end(),
+ PredUpd.begin(), PredUpd.end(),
+ std::inserter(Diff, Diff.begin()));
+ updateLiveness(Diff, false, false, true);
+
+ for (auto *ImpD : LocalImpDefs)
+ removeInstr(*ImpD);
+
+ DEBUG({
+ if (Changed)
+ LIS->print(dbgs() << "After expand-condsets\n",
+ MF.getFunction()->getParent());
+ });
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- postprocessUndefImplicitUses(*I);
return Changed;
}
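The SplitUpd/PredUpd bookkeeping above leans on std::set_difference writing into a std::set through std::inserter (a std::set has no push_back, so std::back_inserter would not compile here). A minimal standalone illustration of the idiom, with made-up register numbers:

    #include <algorithm>
    #include <iterator>
    #include <set>

    int main() {
      std::set<unsigned> SplitUpd = {10, 11, 12, 13};
      std::set<unsigned> PredUpd  = {11, 13};
      std::set<unsigned> Diff;
      // Both inputs are already sorted because they are std::sets, which is
      // exactly what std::set_difference requires.
      std::set_difference(SplitUpd.begin(), SplitUpd.end(),
                          PredUpd.begin(), PredUpd.end(),
                          std::inserter(Diff, Diff.begin()));
      // Diff now holds {10, 12}: the registers touched by splitting only.
      return 0;
    }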
@@ -1340,18 +1308,6 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-static void initializePassOnce(PassRegistry &Registry) {
- const char *Name = "Hexagon Expand Condsets";
- PassInfo *PI = new PassInfo(Name, "expand-condsets",
- &HexagonExpandCondsets::ID, 0, false, false);
- Registry.registerPass(*PI, true);
-}
-
-void llvm::initializeHexagonExpandCondsetsPass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializePassOnce)
-}
-
-
FunctionPass *llvm::createHexagonExpandCondsets() {
return new HexagonExpandCondsets();
}
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
deleted file mode 100644
index 6e2dbc06b124..000000000000
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//===-- HexagonExpandPredSpillCode.cpp - Expand Predicate Spill Code ------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// The Hexagon processor has no instructions that load or store predicate
-// registers directly. So, when these registers must be spilled a general
-// purpose register must be found and the value copied to/from it from/to
-// the predicate register. This code currently does not use the register
-// scavenger mechanism available in the allocator. There are two registers
-// reserved to allow spilling/restoring predicate registers. One is used to
-// hold the predicate value. The other is used when stack frame offsets are
-// too large.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Hexagon.h"
-#include "HexagonMachineFunctionInfo.h"
-#include "HexagonSubtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LatencyPriorityQueue.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-
-namespace llvm {
- FunctionPass *createHexagonExpandPredSpillCode();
- void initializeHexagonExpandPredSpillCodePass(PassRegistry&);
-}
-
-
-namespace {
-
-class HexagonExpandPredSpillCode : public MachineFunctionPass {
- public:
- static char ID;
- HexagonExpandPredSpillCode() : MachineFunctionPass(ID) {
- PassRegistry &Registry = *PassRegistry::getPassRegistry();
- initializeHexagonExpandPredSpillCodePass(Registry);
- }
-
- const char *getPassName() const override {
- return "Hexagon Expand Predicate Spill Code";
- }
- bool runOnMachineFunction(MachineFunction &Fn) override;
-};
-
-
-char HexagonExpandPredSpillCode::ID = 0;
-
-
-bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
-
- const HexagonSubtarget &QST = Fn.getSubtarget<HexagonSubtarget>();
- const HexagonInstrInfo *TII = QST.getInstrInfo();
-
- // Loop over all of the basic blocks.
- for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
- MBBb != MBBe; ++MBBb) {
- MachineBasicBlock *MBB = &*MBBb;
- // Traverse the basic block.
- for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
- ++MII) {
- MachineInstr *MI = MII;
- int Opc = MI->getOpcode();
- if (Opc == Hexagon::S2_storerb_pci_pseudo ||
- Opc == Hexagon::S2_storerh_pci_pseudo ||
- Opc == Hexagon::S2_storeri_pci_pseudo ||
- Opc == Hexagon::S2_storerd_pci_pseudo ||
- Opc == Hexagon::S2_storerf_pci_pseudo) {
- unsigned Opcode;
- if (Opc == Hexagon::S2_storerd_pci_pseudo)
- Opcode = Hexagon::S2_storerd_pci;
- else if (Opc == Hexagon::S2_storeri_pci_pseudo)
- Opcode = Hexagon::S2_storeri_pci;
- else if (Opc == Hexagon::S2_storerh_pci_pseudo)
- Opcode = Hexagon::S2_storerh_pci;
- else if (Opc == Hexagon::S2_storerf_pci_pseudo)
- Opcode = Hexagon::S2_storerf_pci;
- else if (Opc == Hexagon::S2_storerb_pci_pseudo)
- Opcode = Hexagon::S2_storerb_pci;
- else
- llvm_unreachable("wrong Opc");
- MachineOperand &Op0 = MI->getOperand(0);
- MachineOperand &Op1 = MI->getOperand(1);
- MachineOperand &Op2 = MI->getOperand(2);
- MachineOperand &Op3 = MI->getOperand(3); // Modifier value.
- MachineOperand &Op4 = MI->getOperand(4);
- // Emit a "C6 = Rn, C6 is the control register for M0".
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr),
- Hexagon::C6)->addOperand(Op3);
- // Replace the pseude circ_ldd by the real circ_ldd.
- MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Opcode));
- NewMI->addOperand(Op0);
- NewMI->addOperand(Op1);
- NewMI->addOperand(Op4);
- NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0,
- false, /*isDef*/
- false, /*isImpl*/
- true /*isKill*/));
- NewMI->addOperand(Op2);
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::L2_loadrd_pci_pseudo ||
- Opc == Hexagon::L2_loadri_pci_pseudo ||
- Opc == Hexagon::L2_loadrh_pci_pseudo ||
- Opc == Hexagon::L2_loadruh_pci_pseudo||
- Opc == Hexagon::L2_loadrb_pci_pseudo ||
- Opc == Hexagon::L2_loadrub_pci_pseudo) {
- unsigned Opcode;
- if (Opc == Hexagon::L2_loadrd_pci_pseudo)
- Opcode = Hexagon::L2_loadrd_pci;
- else if (Opc == Hexagon::L2_loadri_pci_pseudo)
- Opcode = Hexagon::L2_loadri_pci;
- else if (Opc == Hexagon::L2_loadrh_pci_pseudo)
- Opcode = Hexagon::L2_loadrh_pci;
- else if (Opc == Hexagon::L2_loadruh_pci_pseudo)
- Opcode = Hexagon::L2_loadruh_pci;
- else if (Opc == Hexagon::L2_loadrb_pci_pseudo)
- Opcode = Hexagon::L2_loadrb_pci;
- else if (Opc == Hexagon::L2_loadrub_pci_pseudo)
- Opcode = Hexagon::L2_loadrub_pci;
- else
- llvm_unreachable("wrong Opc");
-
- MachineOperand &Op0 = MI->getOperand(0);
- MachineOperand &Op1 = MI->getOperand(1);
- MachineOperand &Op2 = MI->getOperand(2);
- MachineOperand &Op4 = MI->getOperand(4); // Modifier value.
- MachineOperand &Op5 = MI->getOperand(5);
- // Emit a "C6 = Rn, C6 is the control register for M0".
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr),
- Hexagon::C6)->addOperand(Op4);
- // Replace the pseude circ_ldd by the real circ_ldd.
- MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Opcode));
- NewMI->addOperand(Op1);
- NewMI->addOperand(Op0);
- NewMI->addOperand(Op2);
- NewMI->addOperand(Op5);
- NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0,
- false, /*isDef*/
- false, /*isImpl*/
- true /*isKill*/));
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::L2_loadrd_pbr_pseudo ||
- Opc == Hexagon::L2_loadri_pbr_pseudo ||
- Opc == Hexagon::L2_loadrh_pbr_pseudo ||
- Opc == Hexagon::L2_loadruh_pbr_pseudo||
- Opc == Hexagon::L2_loadrb_pbr_pseudo ||
- Opc == Hexagon::L2_loadrub_pbr_pseudo) {
- unsigned Opcode;
- if (Opc == Hexagon::L2_loadrd_pbr_pseudo)
- Opcode = Hexagon::L2_loadrd_pbr;
- else if (Opc == Hexagon::L2_loadri_pbr_pseudo)
- Opcode = Hexagon::L2_loadri_pbr;
- else if (Opc == Hexagon::L2_loadrh_pbr_pseudo)
- Opcode = Hexagon::L2_loadrh_pbr;
- else if (Opc == Hexagon::L2_loadruh_pbr_pseudo)
- Opcode = Hexagon::L2_loadruh_pbr;
- else if (Opc == Hexagon::L2_loadrb_pbr_pseudo)
- Opcode = Hexagon::L2_loadrb_pbr;
- else if (Opc == Hexagon::L2_loadrub_pbr_pseudo)
- Opcode = Hexagon::L2_loadrub_pbr;
- else
- llvm_unreachable("wrong Opc");
- MachineOperand &Op0 = MI->getOperand(0);
- MachineOperand &Op1 = MI->getOperand(1);
- MachineOperand &Op2 = MI->getOperand(2);
- MachineOperand &Op4 = MI->getOperand(4); // Modifier value.
- // Emit a "C6 = Rn, C6 is the control register for M0".
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr),
- Hexagon::C6)->addOperand(Op4);
- // Replace the pseudo brev_ldd by the real brev_ldd.
- MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Opcode));
- NewMI->addOperand(Op1);
- NewMI->addOperand(Op0);
- NewMI->addOperand(Op2);
- NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0,
- false, /*isDef*/
- false, /*isImpl*/
- true /*isKill*/));
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::S2_storerd_pbr_pseudo ||
- Opc == Hexagon::S2_storeri_pbr_pseudo ||
- Opc == Hexagon::S2_storerh_pbr_pseudo ||
- Opc == Hexagon::S2_storerb_pbr_pseudo ||
- Opc == Hexagon::S2_storerf_pbr_pseudo) {
- unsigned Opcode;
- if (Opc == Hexagon::S2_storerd_pbr_pseudo)
- Opcode = Hexagon::S2_storerd_pbr;
- else if (Opc == Hexagon::S2_storeri_pbr_pseudo)
- Opcode = Hexagon::S2_storeri_pbr;
- else if (Opc == Hexagon::S2_storerh_pbr_pseudo)
- Opcode = Hexagon::S2_storerh_pbr;
- else if (Opc == Hexagon::S2_storerf_pbr_pseudo)
- Opcode = Hexagon::S2_storerf_pbr;
- else if (Opc == Hexagon::S2_storerb_pbr_pseudo)
- Opcode = Hexagon::S2_storerb_pbr;
- else
- llvm_unreachable("wrong Opc");
- MachineOperand &Op0 = MI->getOperand(0);
- MachineOperand &Op1 = MI->getOperand(1);
- MachineOperand &Op2 = MI->getOperand(2);
- MachineOperand &Op3 = MI->getOperand(3); // Modifier value.
- // Emit a "C6 = Rn, C6 is the control register for M0".
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_tfrrcr),
- Hexagon::C6)->addOperand(Op3);
- // Replace the pseudo brev_ldd by the real brev_ldd.
- MachineInstr *NewMI = BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Opcode));
- NewMI->addOperand(Op0);
- NewMI->addOperand(Op1);
- NewMI->addOperand(MachineOperand::CreateReg(Hexagon::M0,
- false, /*isDef*/
- false, /*isImpl*/
- true /*isKill*/));
- NewMI->addOperand(Op2);
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::STriw_pred) {
- // STriw_pred [R30], ofst, SrcReg;
- unsigned FP = MI->getOperand(0).getReg();
- assert(FP == QST.getRegisterInfo()->getFrameRegister() &&
- "Not a Frame Pointer, Nor a Spill Slot");
- assert(MI->getOperand(1).isImm() && "Not an offset");
- int Offset = MI->getOperand(1).getImm();
- int SrcReg = MI->getOperand(2).getReg();
- assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
- "Not a predicate register");
- if (!TII->isValidOffset(Hexagon::S2_storeri_io, Offset)) {
- if (!TII->isValidOffset(Hexagon::A2_addi, Offset)) {
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::CONST32_Int_Real),
- HEXAGON_RESERVED_REG_1).addImm(Offset);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_add),
- HEXAGON_RESERVED_REG_1)
- .addReg(FP).addReg(HEXAGON_RESERVED_REG_1);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
- HEXAGON_RESERVED_REG_2).addReg(SrcReg);
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::S2_storeri_io))
- .addReg(HEXAGON_RESERVED_REG_1)
- .addImm(0).addReg(HEXAGON_RESERVED_REG_2);
- } else {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_addi),
- HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
- HEXAGON_RESERVED_REG_2).addReg(SrcReg);
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::S2_storeri_io))
- .addReg(HEXAGON_RESERVED_REG_1)
- .addImm(0)
- .addReg(HEXAGON_RESERVED_REG_2);
- }
- } else {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrpr),
- HEXAGON_RESERVED_REG_2).addReg(SrcReg);
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::S2_storeri_io)).
- addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2);
- }
- MII = MBB->erase(MI);
- --MII;
- } else if (Opc == Hexagon::LDriw_pred) {
- // DstReg = LDriw_pred [R30], ofst.
- int DstReg = MI->getOperand(0).getReg();
- assert(Hexagon::PredRegsRegClass.contains(DstReg) &&
- "Not a predicate register");
- unsigned FP = MI->getOperand(1).getReg();
- assert(FP == QST.getRegisterInfo()->getFrameRegister() &&
- "Not a Frame Pointer, Nor a Spill Slot");
- assert(MI->getOperand(2).isImm() && "Not an offset");
- int Offset = MI->getOperand(2).getImm();
- if (!TII->isValidOffset(Hexagon::L2_loadri_io, Offset)) {
- if (!TII->isValidOffset(Hexagon::A2_addi, Offset)) {
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::CONST32_Int_Real),
- HEXAGON_RESERVED_REG_1).addImm(Offset);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_add),
- HEXAGON_RESERVED_REG_1)
- .addReg(FP)
- .addReg(HEXAGON_RESERVED_REG_1);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
- HEXAGON_RESERVED_REG_2)
- .addReg(HEXAGON_RESERVED_REG_1)
- .addImm(0);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
- DstReg).addReg(HEXAGON_RESERVED_REG_2);
- } else {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_addi),
- HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
- HEXAGON_RESERVED_REG_2)
- .addReg(HEXAGON_RESERVED_REG_1)
- .addImm(0);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
- DstReg).addReg(HEXAGON_RESERVED_REG_2);
- }
- } else {
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::L2_loadri_io),
- HEXAGON_RESERVED_REG_2).addReg(FP).addImm(Offset);
- BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::C2_tfrrp),
- DstReg).addReg(HEXAGON_RESERVED_REG_2);
- }
- MII = MBB->erase(MI);
- --MII;
- }
- }
- }
-
- return true;
-}
-
-}
-
-//===----------------------------------------------------------------------===//
-// Public Constructor Functions
-//===----------------------------------------------------------------------===//
-
-static void initializePassOnce(PassRegistry &Registry) {
- const char *Name = "Hexagon Expand Predicate Spill Code";
- PassInfo *PI = new PassInfo(Name, "hexagon-spill-pred",
- &HexagonExpandPredSpillCode::ID,
- nullptr, false, false);
- Registry.registerPass(*PI, true);
-}
-
-void llvm::initializeHexagonExpandPredSpillCodePass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializePassOnce)
-}
-
-FunctionPass*
-llvm::createHexagonExpandPredSpillCode() {
- return new HexagonExpandPredSpillCode();
-}
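For context, the expansion this deleted pass used to perform survives elsewhere in this patch: HexagonFrameLowering::expandStoreInt/expandLoadInt (further down in this diff) emit the same transfer-through-a-scratch-register sequence, but with a virtual register instead of the reserved HEXAGON_RESERVED_REG_2. The shape of the expansion for predicate registers, sketched as pseudo machine code with an invented scratch register %r:

    ; spill of a predicate register:
    ;   STriw_pred <fi>, 0, %p        =>   %r = C2_tfrpr %p
    ;                                      S2_storeri_io <fi>, 0, %r
    ; reload of a predicate register:
    ;   %p = LDriw_pred <fi>, 0       =>   %r = L2_loadri_io <fi>, 0
    ;                                      %p = C2_tfrrp %r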
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index d0c7f9c8960f..3de817cc8fb6 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -45,6 +45,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "Hexagon Hardware Loop Fixup";
}
@@ -77,14 +82,16 @@ FunctionPass *llvm::createHexagonFixupHwLoops() {
}
/// \brief Returns true if the instruction is a hardware loop instruction.
-static bool isHardwareLoop(const MachineInstr *MI) {
- return MI->getOpcode() == Hexagon::J2_loop0r ||
- MI->getOpcode() == Hexagon::J2_loop0i ||
- MI->getOpcode() == Hexagon::J2_loop1r ||
- MI->getOpcode() == Hexagon::J2_loop1i;
+static bool isHardwareLoop(const MachineInstr &MI) {
+ return MI.getOpcode() == Hexagon::J2_loop0r ||
+ MI.getOpcode() == Hexagon::J2_loop0i ||
+ MI.getOpcode() == Hexagon::J2_loop1r ||
+ MI.getOpcode() == Hexagon::J2_loop1i;
}
bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
return fixupLoopInstrs(MF);
}
@@ -123,7 +130,6 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
// Second pass - check each loop instruction to see if it needs to be
// converted.
- InstOffset = 0;
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
InstOffset = BlockToInstOffset[&MBB];
@@ -137,7 +143,7 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
++MII;
continue;
}
- if (isHardwareLoop(MII)) {
+ if (isHardwareLoop(*MII)) {
assert(MII->getOperand(0).isMBB() &&
"Expect a basic block as loop operand");
int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 7a52a1c9eaec..25402147bf53 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -10,8 +10,8 @@
#define DEBUG_TYPE "hexagon-pei"
+#include "HexagonBlockRanges.h"
#include "HexagonFrameLowering.h"
-#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonRegisterInfo.h"
@@ -19,12 +19,11 @@
#include "HexagonTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -126,8 +125,7 @@ using namespace llvm;
static cl::opt<bool> DisableDeallocRet("disable-hexagon-dealloc-ret",
cl::Hidden, cl::desc("Disable Dealloc Return for Hexagon target"));
-
-static cl::opt<int> NumberScavengerSlots("number-scavenger-slots",
+static cl::opt<unsigned> NumberScavengerSlots("number-scavenger-slots",
cl::Hidden, cl::desc("Set the number of scavenger slots"), cl::init(2),
cl::ZeroOrMore);
@@ -139,6 +137,10 @@ static cl::opt<int> SpillFuncThresholdOs("spill-func-threshold-Os",
cl::Hidden, cl::desc("Specify Os spill func threshold"),
cl::init(1), cl::ZeroOrMore);
+static cl::opt<bool> EnableStackOVFSanitizer("enable-stackovf-sanitizer",
+ cl::Hidden, cl::desc("Enable runtime checks for stack overflow."),
+ cl::init(false), cl::ZeroOrMore);
+
static cl::opt<bool> EnableShrinkWrapping("hexagon-shrink-frame",
cl::init(true), cl::Hidden, cl::ZeroOrMore,
cl::desc("Enable stack frame shrink wrapping"));
@@ -150,6 +152,9 @@ static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit", cl::init(UINT_MAX),
static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
cl::Hidden, cl::desc("Use allocframe more conservatively"));
+static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden,
+ cl::init(true), cl::desc("Optimize spill slots"));
+
namespace llvm {
void initializeHexagonCallFrameInformationPass(PassRegistry&);
@@ -165,6 +170,10 @@ namespace {
initializeHexagonCallFrameInformationPass(PR);
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
char HexagonCallFrameInformation::ID = 0;
@@ -213,8 +222,8 @@ namespace {
/// Returns the callee saved register with the largest id in the vector.
unsigned getMaxCalleeSavedReg(const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo &TRI) {
- assert(Hexagon::R1 > 0 &&
- "Assume physical registers are encoded as positive integers");
+ static_assert(Hexagon::R1 > 0,
+ "Assume physical registers are encoded as positive integers");
if (CSI.empty())
return 0;
@@ -229,7 +238,8 @@ namespace {
/// Checks if the basic block contains any instruction that needs a stack
/// frame to be already in place.
- bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR) {
+ bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
+ const HexagonRegisterInfo &HRI) {
for (auto &I : MBB) {
const MachineInstr *MI = &I;
if (MI->isCall())
@@ -258,8 +268,9 @@ namespace {
// a stack slot.
if (TargetRegisterInfo::isVirtualRegister(R))
return true;
- if (CSR[R])
- return true;
+ for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
+ if (CSR[*S])
+ return true;
}
}
return false;
@@ -280,6 +291,40 @@ namespace {
return true;
return false;
}
+
+ /// Returns the "return" instruction from this block, or nullptr if there
+ /// isn't any.
+ MachineInstr *getReturn(MachineBasicBlock &MBB) {
+ for (auto &I : MBB)
+ if (I.isReturn())
+ return &I;
+ return nullptr;
+ }
+
+ bool isRestoreCall(unsigned Opc) {
+ switch (Opc) {
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+ case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4:
+ case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC:
+ return true;
+ }
+ return false;
+ }
+
+ inline bool isOptNone(const MachineFunction &MF) {
+ return MF.getFunction()->hasFnAttribute(Attribute::OptimizeNone) ||
+ MF.getTarget().getOptLevel() == CodeGenOpt::None;
+ }
+
+ inline bool isOptSize(const MachineFunction &MF) {
+ const Function &F = *MF.getFunction();
+ return F.optForSize() && !F.optForMinSize();
+ }
+
+ inline bool isMinSize(const MachineFunction &MF) {
+ return MF.getFunction()->optForMinSize();
+ }
}
@@ -330,10 +375,11 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
SmallVector<MachineBasicBlock*,16> SFBlocks;
BitVector CSR(Hexagon::NUM_TARGET_REGS);
for (const MCPhysReg *P = HRI.getCalleeSavedRegs(&MF); *P; ++P)
- CSR[*P] = true;
+ for (MCSubRegIterator S(*P, &HRI, true); S.isValid(); ++S)
+ CSR[*S] = true;
for (auto &I : MF)
- if (needsStackFrame(I, CSR))
+ if (needsStackFrame(I, CSR, HRI))
SFBlocks.push_back(&I);
DEBUG({
@@ -386,6 +432,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
EpilogB = PDomB;
}
+
/// Perform most of the PEI work here:
/// - saving/restoring of the callee-saved registers,
/// - stack frame creation and destruction.
@@ -396,7 +443,6 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
auto &HRI = *HST.getRegisterInfo();
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo *MFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -404,8 +450,9 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
if (EnableShrinkWrapping)
findShrunkPrologEpilog(MF, PrologB, EpilogB);
- insertCSRSpillsInBlock(*PrologB, CSI, HRI);
- insertPrologueInBlock(*PrologB);
+ bool PrologueStubs = false;
+ insertCSRSpillsInBlock(*PrologB, CSI, HRI, PrologueStubs);
+ insertPrologueInBlock(*PrologB, PrologueStubs);
if (EpilogB) {
insertCSRRestoresInBlock(*EpilogB, CSI, HRI);
@@ -418,11 +465,34 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
for (auto &B : MF)
if (B.isReturnBlock())
insertEpilogueInBlock(B);
+
+ for (auto &B : MF) {
+ if (B.empty())
+ continue;
+ MachineInstr *RetI = getReturn(B);
+ if (!RetI || isRestoreCall(RetI->getOpcode()))
+ continue;
+ for (auto &R : CSI)
+ RetI->addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+ }
+ }
+
+ if (EpilogB) {
+ // If there is an epilog block, it may not have a return instruction.
+ // In such case, we need to add the callee-saved registers as live-ins
+ // in all blocks on all paths from the epilog to any return block.
+ unsigned MaxBN = 0;
+ for (auto &B : MF)
+ if (B.getNumber() >= 0)
+ MaxBN = std::max(MaxBN, unsigned(B.getNumber()));
+ BitVector DoneT(MaxBN+1), DoneF(MaxBN+1), Path(MaxBN+1);
+ updateExitPaths(*EpilogB, EpilogB, DoneT, DoneF, Path);
}
}
-void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
+void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
+ bool PrologueStubs) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
auto &HST = MF.getSubtarget<HexagonSubtarget>();
@@ -436,10 +506,10 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
// Get the number of bytes to allocate from the FrameInfo.
unsigned FrameSize = MFI->getStackSize();
// Round up the max call frame size to the max alignment on the stack.
- unsigned MaxCFA = RoundUpToAlignment(MFI->getMaxCallFrameSize(), MaxAlign);
+ unsigned MaxCFA = alignTo(MFI->getMaxCallFrameSize(), MaxAlign);
MFI->setMaxCallFrameSize(MaxCFA);
- FrameSize = MaxCFA + RoundUpToAlignment(FrameSize, MaxAlign);
+ FrameSize = MaxCFA + alignTo(FrameSize, MaxAlign);
MFI->setStackSize(FrameSize);
bool AlignStack = (MaxAlign > getStackAlignment());
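As a quick sanity check of the rounding above, with invented numbers MaxCallFrameSize = 24, FrameSize = 52 and MaxAlign = 16: MaxCFA = alignTo(24, 16) = 32, and the final FrameSize = 32 + alignTo(52, 16) = 32 + 64 = 96, i.e. both the outgoing-argument area and the locals are padded to a multiple of the maximum stack alignment before being stacked on top of each other.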
@@ -497,6 +567,13 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
.addReg(SP)
.addImm(-int64_t(MaxAlign));
}
+
+ // If the stack-checking is enabled, and we spilled the callee-saved
+ // registers inline (i.e. did not use a spill function), then call
+ // the stack checker directly.
+ if (EnableStackOVFSanitizer && !PrologueStubs)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CALLstk))
+ .addExternalSymbol("__runtime_stack_check");
}
void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
@@ -509,13 +586,7 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
auto &HRI = *HST.getRegisterInfo();
unsigned SP = HRI.getStackRegister();
- MachineInstr *RetI = nullptr;
- for (auto &I : MBB) {
- if (!I.isReturn())
- continue;
- RetI = &I;
- break;
- }
+ MachineInstr *RetI = getReturn(MBB);
unsigned RetOpc = RetI ? RetI->getOpcode() : 0;
MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
@@ -536,7 +607,8 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
// Check for RESTORE_DEALLOC_RET* tail call. Don't emit an extra dealloc-
// frame instruction if we encounter it.
- if (RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4) {
+ if (RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4 ||
+ RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC) {
MachineBasicBlock::iterator It = RetI;
++It;
// Delete all instructions after the RESTORE (except labels).
@@ -556,7 +628,8 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
if (!MBB.empty() && InsertPt != MBB.begin()) {
MachineBasicBlock::iterator PrevIt = std::prev(InsertPt);
unsigned COpc = PrevIt->getOpcode();
- if (COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
+ if (COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4 ||
+ COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC)
NeedsDeallocframe = false;
}
@@ -572,11 +645,56 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
unsigned NewOpc = Hexagon::L4_return;
MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc));
// Transfer the function live-out registers.
- NewI->copyImplicitOps(MF, RetI);
+ NewI->copyImplicitOps(MF, *RetI);
MBB.erase(RetI);
}
+bool HexagonFrameLowering::updateExitPaths(MachineBasicBlock &MBB,
+ MachineBasicBlock *RestoreB, BitVector &DoneT, BitVector &DoneF,
+ BitVector &Path) const {
+ assert(MBB.getNumber() >= 0);
+ unsigned BN = MBB.getNumber();
+ if (Path[BN] || DoneF[BN])
+ return false;
+ if (DoneT[BN])
+ return true;
+
+ auto &CSI = MBB.getParent()->getFrameInfo()->getCalleeSavedInfo();
+
+ Path[BN] = true;
+ bool ReachedExit = false;
+ for (auto &SB : MBB.successors())
+ ReachedExit |= updateExitPaths(*SB, RestoreB, DoneT, DoneF, Path);
+
+ if (!MBB.empty() && MBB.back().isReturn()) {
+ // Add implicit uses of all callee-saved registers to the reached
+ // return instructions. This is to prevent the anti-dependency breaker
+ // from renaming these registers.
+ MachineInstr &RetI = MBB.back();
+ if (!isRestoreCall(RetI.getOpcode()))
+ for (auto &R : CSI)
+ RetI.addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+ ReachedExit = true;
+ }
+
+ // We don't want to add unnecessary live-ins to the restore block: since
+ // the callee-saved registers are being defined in it, the entry of the
+ // restore block cannot be on the path from the definitions to any exit.
+ if (ReachedExit && &MBB != RestoreB) {
+ for (auto &R : CSI)
+ if (!MBB.isLiveIn(R.getReg()))
+ MBB.addLiveIn(R.getReg());
+ DoneT[BN] = true;
+ }
+ if (!ReachedExit)
+ DoneF[BN] = true;
+
+ Path[BN] = false;
+ return ReachedExit;
+}
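The three BitVectors give updateExitPaths a standard memoized DFS: Path marks blocks on the current recursion stack so cycles terminate, DoneT caches blocks already known to reach a return, and DoneF caches blocks known not to. A self-contained sketch of the same pattern on a plain adjacency-list graph (all names and the toy graph are invented):

    #include <vector>

    // Returns true if node N can reach a node marked as an exit.
    static bool reachesExit(unsigned N,
                            const std::vector<std::vector<unsigned>> &Succ,
                            const std::vector<bool> &IsExit,
                            std::vector<bool> &Path, std::vector<bool> &DoneT,
                            std::vector<bool> &DoneF) {
      if (Path[N] || DoneF[N])   // on the current DFS stack, or known to fail
        return false;
      if (DoneT[N])              // already known to succeed
        return true;
      Path[N] = true;
      bool Reached = IsExit[N];
      for (unsigned S : Succ[N])
        Reached |= reachesExit(S, Succ, IsExit, Path, DoneT, DoneF);
      Path[N] = false;
      if (Reached)
        DoneT[N] = true;
      else
        DoneF[N] = true;
      return Reached;
    }

    int main() {
      // 0 -> 1 -> 2 with a back edge 2 -> 0; node 2 is the only exit.
      std::vector<std::vector<unsigned>> Succ = {{1}, {2}, {0}};
      std::vector<bool> IsExit = {false, false, true};
      std::vector<bool> Path(3), DoneT(3), DoneF(3);
      return reachesExit(0, Succ, IsExit, Path, DoneT, DoneF) ? 0 : 1;
    }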
+
+
namespace {
bool IsAllocFrame(MachineBasicBlock::const_iterator It) {
if (!It->isBundle())
@@ -611,7 +729,7 @@ void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const {
void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator At) const {
MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
@@ -624,8 +742,9 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION);
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+ bool HasFP = hasFP(MF);
- if (hasFP(MF)) {
+ if (HasFP) {
unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true);
unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true);
@@ -663,7 +782,7 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
Hexagon::NoRegister
};
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) {
unsigned Reg = RegsToMove[i];
@@ -674,9 +793,22 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
if (F == CSI.end())
continue;
+ int64_t Offset;
+ if (HasFP) {
+ // If the function has a frame pointer (i.e. has an allocframe),
+ // then the CFA has been defined in terms of FP. Any offsets in
+ // the following CFI instructions have to be defined relative
+ // to FP, which points to the bottom of the stack frame.
+ // The function getFrameIndexReference can still choose to use SP
+ // for the offset calculation, so we cannot simply call it here.
+ // Instead, get the offset (relative to the FP) directly.
+ Offset = MFI.getObjectOffset(F->getFrameIdx());
+ } else {
+ unsigned FrameReg;
+ Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg);
+ }
// Subtract 8 to make room for R30 and R31, which are added above.
- unsigned FrameReg;
- int64_t Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg) - 8;
+ Offset -= 8;
if (Reg < Hexagon::D0 || Reg > Hexagon::D15) {
unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true);
@@ -734,7 +866,7 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
return true;
if (MFI.getStackSize() > 0) {
- if (UseAllocframe)
+ if (EnableStackOVFSanitizer || UseAllocframe)
return true;
}
@@ -752,8 +884,8 @@ enum SpillKind {
SK_FromMemTailcall
};
-static const char *
-getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType) {
+static const char *getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType,
+ bool Stkchk = false) {
const char * V4SpillToMemoryFunctions[] = {
"__save_r16_through_r17",
"__save_r16_through_r19",
@@ -762,6 +894,14 @@ getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType) {
"__save_r16_through_r25",
"__save_r16_through_r27" };
+ const char * V4SpillToMemoryStkchkFunctions[] = {
+ "__save_r16_through_r17_stkchk",
+ "__save_r16_through_r19_stkchk",
+ "__save_r16_through_r21_stkchk",
+ "__save_r16_through_r23_stkchk",
+ "__save_r16_through_r25_stkchk",
+ "__save_r16_through_r27_stkchk" };
+
const char * V4SpillFromMemoryFunctions[] = {
"__restore_r16_through_r17_and_deallocframe",
"__restore_r16_through_r19_and_deallocframe",
@@ -783,7 +923,8 @@ getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType) {
switch(SpillType) {
case SK_ToMem:
- SpillFunc = V4SpillToMemoryFunctions;
+ SpillFunc = Stkchk ? V4SpillToMemoryStkchkFunctions
+ : V4SpillToMemoryFunctions;
break;
case SK_FromMem:
SpillFunc = V4SpillFromMemoryFunctions;
@@ -814,32 +955,20 @@ getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType) {
return 0;
}
-/// Adds all callee-saved registers up to MaxReg to the instruction.
-static void addCalleeSaveRegistersAsImpOperand(MachineInstr *Inst,
- unsigned MaxReg, bool IsDef) {
- // Add the callee-saved registers as implicit uses.
- for (unsigned R = Hexagon::R16; R <= MaxReg; ++R) {
- MachineOperand ImpUse = MachineOperand::CreateReg(R, IsDef, true);
- Inst->addOperand(ImpUse);
- }
-}
-
int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI, unsigned &FrameReg) const {
auto &MFI = *MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
- // Large parts of this code are shared with HRI::eliminateFrameIndex.
int Offset = MFI.getObjectOffset(FI);
bool HasAlloca = MFI.hasVarSizedObjects();
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
- unsigned AP = 0;
- if (const MachineInstr *AI = getAlignaInstr(MF))
- AP = AI->getOperand(0).getReg();
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ unsigned AP = HMFI.getStackAlignBasePhysReg();
unsigned FrameSize = MFI.getStackSize();
bool UseFP = false, UseAP = false; // Default: use SP (except at -O0).
@@ -912,24 +1041,40 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
- const CSIVect &CSI, const HexagonRegisterInfo &HRI) const {
+ const CSIVect &CSI, const HexagonRegisterInfo &HRI,
+ bool &PrologueStubs) const {
if (CSI.empty())
return true;
MachineBasicBlock::iterator MI = MBB.begin();
+ PrologueStubs = false;
MachineFunction &MF = *MBB.getParent();
auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
if (useSpillFunction(MF, CSI)) {
+ PrologueStubs = true;
unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI);
- const char *SpillFun = getSpillFunctionFor(MaxReg, SK_ToMem);
+ bool StkOvrFlowEnabled = EnableStackOVFSanitizer;
+ const char *SpillFun = getSpillFunctionFor(MaxReg, SK_ToMem,
+ StkOvrFlowEnabled);
+ auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+ bool IsPIC = HTM.isPositionIndependent();
+
// Call spill function.
DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+ unsigned SpillOpc;
+ if (StkOvrFlowEnabled)
+ SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4STK_PIC
+ : Hexagon::SAVE_REGISTERS_CALL_V4STK;
+ else
+ SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4_PIC
+ : Hexagon::SAVE_REGISTERS_CALL_V4;
+
MachineInstr *SaveRegsCall =
- BuildMI(MBB, MI, DL, HII.get(Hexagon::SAVE_REGISTERS_CALL_V4))
+ BuildMI(MBB, MI, DL, HII.get(SpillOpc))
.addExternalSymbol(SpillFun);
// Add callee-saved registers as use.
- addCalleeSaveRegistersAsImpOperand(SaveRegsCall, MaxReg, false);
+ addCalleeSaveRegistersAsImpOperand(SaveRegsCall, CSI, false, true);
// Add live in registers.
for (unsigned I = 0; I < CSI.size(); ++I)
MBB.addLiveIn(CSI[I].getReg());
@@ -966,6 +1111,8 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
unsigned MaxR = getMaxCalleeSavedReg(CSI, HRI);
SpillKind Kind = HasTC ? SK_FromMemTailcall : SK_FromMem;
const char *RestoreFn = getSpillFunctionFor(MaxR, Kind);
+ auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+ bool IsPIC = HTM.isPositionIndependent();
// Call spill function.
DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc()
@@ -973,20 +1120,22 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
MachineInstr *DeallocCall = nullptr;
if (HasTC) {
- unsigned ROpc = Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4;
+ unsigned ROpc = IsPIC ? Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC
+ : Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4;
DeallocCall = BuildMI(MBB, MI, DL, HII.get(ROpc))
.addExternalSymbol(RestoreFn);
} else {
// The block has a return.
MachineBasicBlock::iterator It = MBB.getFirstTerminator();
assert(It->isReturn() && std::next(It) == MBB.end());
- unsigned ROpc = Hexagon::RESTORE_DEALLOC_RET_JMP_V4;
+ unsigned ROpc = IsPIC ? Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC
+ : Hexagon::RESTORE_DEALLOC_RET_JMP_V4;
DeallocCall = BuildMI(MBB, It, DL, HII.get(ROpc))
.addExternalSymbol(RestoreFn);
// Transfer the function live-out registers.
- DeallocCall->copyImplicitOps(MF, It);
+ DeallocCall->copyImplicitOps(MF, *It);
}
- addCalleeSaveRegistersAsImpOperand(DeallocCall, MaxR, true);
+ addCalleeSaveRegistersAsImpOperand(DeallocCall, CSI, true, false);
return true;
}
@@ -996,18 +1145,19 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
int FI = CSI[i].getFrameIdx();
HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
}
+
return true;
}
-
-void HexagonFrameLowering::eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator HexagonFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
MachineInstr &MI = *I;
unsigned Opc = MI.getOpcode();
(void)Opc; // Silence compiler warning.
assert((Opc == Hexagon::ADJCALLSTACKDOWN || Opc == Hexagon::ADJCALLSTACKUP) &&
"Cannot handle this call frame pseudo instruction");
- MBB.erase(I);
+ return MBB.erase(I);
}
@@ -1025,14 +1175,16 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
return;
unsigned LFS = MFI->getLocalFrameSize();
- int Offset = -LFS;
for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
if (!MFI->isSpillSlotObjectIndex(i) || MFI->isDeadObjectIndex(i))
continue;
- int S = MFI->getObjectSize(i);
- LFS += S;
- Offset -= S;
- MFI->mapLocalFrameObject(i, Offset);
+ unsigned S = MFI->getObjectSize(i);
+ // Reduce the alignment to at most 8. This will require unaligned vector
+ // stores if they happen here.
+ unsigned A = std::max(MFI->getObjectAlignment(i), 8U);
+ MFI->setObjectAlignment(i, 8);
+ LFS = alignTo(LFS+S, A);
+ MFI->mapLocalFrameObject(i, -LFS);
}
MFI->setLocalFrameSize(LFS);
@@ -1041,142 +1193,35 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
if (A == 0)
MFI->setLocalFrameMaxAlign(8);
MFI->setUseLocalStackAllocationBlock(true);
+
+ // Set the physical aligned-stack base address register.
+ unsigned AP = 0;
+ if (const MachineInstr *AI = getAlignaInstr(MF))
+ AP = AI->getOperand(0).getReg();
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ HMFI.setStackAlignBasePhysReg(AP);
}
-/// Returns true if there is no caller saved registers available.
+/// Returns true if there are no caller-saved registers available in class RC.
static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
- const HexagonRegisterInfo &HRI) {
+ const HexagonRegisterInfo &HRI, const TargetRegisterClass *RC) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const MCPhysReg *CallerSavedRegs = HRI.getCallerSavedRegs(&MF);
- // Check for an unused caller-saved register.
- for ( ; *CallerSavedRegs; ++CallerSavedRegs) {
- MCPhysReg FreeReg = *CallerSavedRegs;
- if (!MRI.reg_nodbg_empty(FreeReg))
- continue;
-
- // Check aliased register usage.
- bool IsCurrentRegUsed = false;
- for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI)
- if (!MRI.reg_nodbg_empty(*AI)) {
- IsCurrentRegUsed = true;
- break;
- }
- if (IsCurrentRegUsed)
- continue;
- // Neither directly used nor used through an aliased register.
+ auto IsUsed = [&HRI,&MRI] (unsigned Reg) -> bool {
+ for (MCRegAliasIterator AI(Reg, &HRI, true); AI.isValid(); ++AI)
+ if (MRI.isPhysRegUsed(*AI))
+ return true;
return false;
- }
- // All caller-saved registers are used.
- return true;
-}
-
-
-/// Replaces the predicate spill code pseudo instructions by valid instructions.
-bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF)
- const {
- auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
- auto &HII = *HST.getInstrInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- bool HasReplacedPseudoInst = false;
- // Replace predicate spill pseudo instructions by real code.
- // Loop over all of the basic blocks.
- for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
- MBBb != MBBe; ++MBBb) {
- MachineBasicBlock *MBB = &*MBBb;
- // Traverse the basic block.
- MachineBasicBlock::iterator NextII;
- for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
- MII = NextII) {
- MachineInstr *MI = MII;
- NextII = std::next(MII);
- int Opc = MI->getOpcode();
- if (Opc == Hexagon::STriw_pred) {
- HasReplacedPseudoInst = true;
- // STriw_pred FI, 0, SrcReg;
- unsigned VirtReg = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- unsigned SrcReg = MI->getOperand(2).getReg();
- bool IsOrigSrcRegKilled = MI->getOperand(2).isKill();
-
- assert(MI->getOperand(0).isFI() && "Expect a frame index");
- assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
- "Not a predicate register");
-
- // Insert transfer to general purpose register.
- // VirtReg = C2_tfrpr SrcPredReg
- BuildMI(*MBB, MII, MI->getDebugLoc(), HII.get(Hexagon::C2_tfrpr),
- VirtReg).addReg(SrcReg, getKillRegState(IsOrigSrcRegKilled));
-
- // Change instruction to S2_storeri_io.
- // S2_storeri_io FI, 0, VirtReg
- MI->setDesc(HII.get(Hexagon::S2_storeri_io));
- MI->getOperand(2).setReg(VirtReg);
- MI->getOperand(2).setIsKill();
-
- } else if (Opc == Hexagon::LDriw_pred) {
- // DstReg = LDriw_pred FI, 0
- MachineOperand &M0 = MI->getOperand(0);
- if (M0.isDead()) {
- MBB->erase(MII);
- continue;
- }
-
- unsigned VirtReg = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- unsigned DestReg = MI->getOperand(0).getReg();
-
- assert(MI->getOperand(1).isFI() && "Expect a frame index");
- assert(Hexagon::PredRegsRegClass.contains(DestReg) &&
- "Not a predicate register");
-
- // Change instruction to L2_loadri_io.
- // VirtReg = L2_loadri_io FI, 0
- MI->setDesc(HII.get(Hexagon::L2_loadri_io));
- MI->getOperand(0).setReg(VirtReg);
-
- // Insert transfer to general purpose register.
- // DestReg = C2_tfrrp VirtReg
- const MCInstrDesc &D = HII.get(Hexagon::C2_tfrrp);
- BuildMI(*MBB, std::next(MII), MI->getDebugLoc(), D, DestReg)
- .addReg(VirtReg, getKillRegState(true));
- HasReplacedPseudoInst = true;
- }
- }
- }
- return HasReplacedPseudoInst;
-}
-
-
-void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
- BitVector &SavedRegs,
- RegScavenger *RS) const {
- TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
-
- auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
- auto &HRI = *HST.getRegisterInfo();
-
- bool HasEHReturn = MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
-
- // If we have a function containing __builtin_eh_return we want to spill and
- // restore all callee saved registers. Pretend that they are used.
- if (HasEHReturn) {
- for (const MCPhysReg *CSRegs = HRI.getCalleeSavedRegs(&MF); *CSRegs;
- ++CSRegs)
- SavedRegs.set(*CSRegs);
- }
+ };
- const TargetRegisterClass &RC = Hexagon::IntRegsRegClass;
+ // Check for an unused caller-saved register. Callee-saved registers
+ // have become pristine by now.
+ for (const MCPhysReg *P = HRI.getCallerSavedRegs(&MF, RC); *P; ++P)
+ if (!IsUsed(*P))
+ return false;
- // Replace predicate register pseudo spill code.
- bool HasReplacedPseudoInst = replacePredRegPseudoSpillCode(MF);
-
- // We need to reserve a a spill slot if scavenging could potentially require
- // spilling a scavenged register.
- if (HasReplacedPseudoInst && needToReserveScavengingSpillSlots(MF, HRI)) {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- for (int i=0; i < NumberScavengerSlots; i++)
- RS->addScavengingFrameIndex(
- MFI->CreateSpillStackObject(RC.getSize(), RC.getAlignment()));
- }
+ // All caller-saved registers are used.
+ return true;
}
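For orientation, a minimal standalone sketch of the check the IsUsed lambda above performs: a caller-saved register counts as used if it is used directly or through any aliasing register. This is not part of the patch and does not use the LLVM APIs shown above; the register names and the alias table are invented for illustration.

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using Reg = std::string;

    // Used directly, or used through an aliased register.
    bool isUsed(const Reg &R, const std::map<Reg, std::vector<Reg>> &Aliases,
                const std::set<Reg> &UsedPhysRegs) {
      if (UsedPhysRegs.count(R))
        return true;
      auto It = Aliases.find(R);
      if (It == Aliases.end())
        return false;
      for (const Reg &A : It->second)
        if (UsedPhysRegs.count(A))
          return true;
      return false;
    }

    int main() {
      // Hypothetical layout: D0 is the 64-bit register overlapping R0 and R1.
      std::map<Reg, std::vector<Reg>> Aliases = {{"R0", {"D0"}}, {"R1", {"D0"}}};
      std::set<Reg> Used = {"D0"};
      std::cout << std::boolalpha
                << isUsed("R0", Aliases, Used) << '\n'   // true: used via alias D0
                << isUsed("R2", Aliases, Used) << '\n';  // false: neither use
    }

The surrounding function returns false as soon as one caller-saved register is found unused, and true only when all of them are in use.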
@@ -1327,6 +1372,811 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
}
+bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned DstR = MI->getOperand(0).getReg();
+ unsigned SrcR = MI->getOperand(1).getReg();
+ if (!Hexagon::ModRegsRegClass.contains(DstR) ||
+ !Hexagon::ModRegsRegClass.contains(SrcR))
+ return false;
+
+ unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR)
+ .addOperand(MI->getOperand(1));
+ BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
+ .addReg(TmpR, RegState::Kill);
+
+ NewRegs.push_back(TmpR);
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Opc = MI->getOpcode();
+ unsigned SrcR = MI->getOperand(2).getReg();
+ bool IsKill = MI->getOperand(2).isKill();
+
+ assert(MI->getOperand(0).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(0).getIndex();
+
+ // TmpR = C2_tfrpr SrcR if SrcR is a predicate register
+ // TmpR = A2_tfrcrr SrcR if SrcR is a modifier register
+ unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ unsigned TfrOpc = (Opc == Hexagon::STriw_pred) ? Hexagon::C2_tfrpr
+ : Hexagon::A2_tfrcrr;
+ BuildMI(B, It, DL, HII.get(TfrOpc), TmpR)
+ .addReg(SrcR, getKillRegState(IsKill));
+
+ // S2_storeri_io FI, 0, TmpR
+ BuildMI(B, It, DL, HII.get(Hexagon::S2_storeri_io))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(TmpR, RegState::Kill)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ NewRegs.push_back(TmpR);
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Opc = MI->getOpcode();
+ unsigned DstR = MI->getOperand(0).getReg();
+
+ assert(MI->getOperand(1).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(1).getIndex();
+
+ // TmpR = L2_loadri_io FI, 0
+ unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // DstR = C2_tfrrp TmpR if DstR is a predicate register
+ // DstR = A2_tfrrcr TmpR if DstR is a modifier register
+ unsigned TfrOpc = (Opc == Hexagon::LDriw_pred) ? Hexagon::C2_tfrrp
+ : Hexagon::A2_tfrrcr;
+ BuildMI(B, It, DL, HII.get(TfrOpc), DstR)
+ .addReg(TmpR, RegState::Kill);
+
+ NewRegs.push_back(TmpR);
+ B.erase(It);
+ return true;
+}
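The two expansions above share one shape: move the predicate or modifier value through a scratch general-purpose register, then use a plain 32-bit stack access (S2_storeri_io / L2_loadri_io). The standalone sketch below only illustrates how the pseudo opcode selects the transfer instruction; the opcode strings are taken from the code above, while the enum and the printing are invented for illustration.

    #include <cassert>
    #include <iostream>
    #include <string>

    enum Pseudo { STriw_pred, STriw_mod, LDriw_pred, LDriw_mod };

    // Which register-transfer instruction wraps the 32-bit stack access.
    std::string transferOpcode(Pseudo P) {
      switch (P) {
      case STriw_pred: return "C2_tfrpr";   // predicate -> general register
      case STriw_mod:  return "A2_tfrcrr";  // modifier  -> general register
      case LDriw_pred: return "C2_tfrrp";   // general register -> predicate
      case LDriw_mod:  return "A2_tfrrcr";  // general register -> modifier
      }
      assert(false && "unknown pseudo");
      return "";
    }

    int main() {
      // A predicate spill becomes:  TmpR = C2_tfrpr SrcR;  S2_storeri_io FI, 0, TmpR
      std::cout << transferOpcode(STriw_pred) << '\n';
      // A modifier reload becomes:  TmpR = L2_loadri_io FI, 0;  DstR = A2_tfrrcr TmpR
      std::cout << transferOpcode(LDriw_mod) << '\n';
    }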
+
+
+bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned SrcR = MI->getOperand(2).getReg();
+ bool IsKill = MI->getOperand(2).isKill();
+
+ assert(MI->getOperand(0).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(0).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+
+ // Insert transfer to general vector register.
+ // TmpR0 = A2_tfrsi 0x01010101
+ // TmpR1 = V6_vandqrt Qx, TmpR0
+ // store FI, 0, TmpR1
+ unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+ BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+ .addImm(0x01010101);
+
+ unsigned VandOpc = !Is128B ? Hexagon::V6_vandqrt : Hexagon::V6_vandqrt_128B;
+ BuildMI(B, It, DL, HII.get(VandOpc), TmpR1)
+ .addReg(SrcR, getKillRegState(IsKill))
+ .addReg(TmpR0, RegState::Kill);
+
+ auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI);
+ expandStoreVec(B, std::prev(It), MRI, HII, NewRegs);
+
+ NewRegs.push_back(TmpR0);
+ NewRegs.push_back(TmpR1);
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned DstR = MI->getOperand(0).getReg();
+
+ assert(MI->getOperand(1).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(1).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+
+ // TmpR0 = A2_tfrsi 0x01010101
+ // TmpR1 = load FI, 0
+ // DstR = V6_vandvrt TmpR1, TmpR0
+ unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+ BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+ .addImm(0x01010101);
+ auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI);
+ expandLoadVec(B, std::prev(It), MRI, HII, NewRegs);
+
+ unsigned VandOpc = !Is128B ? Hexagon::V6_vandvrt : Hexagon::V6_vandvrt_128B;
+ BuildMI(B, It, DL, HII.get(VandOpc), DstR)
+ .addReg(TmpR1, RegState::Kill)
+ .addReg(TmpR0, RegState::Kill);
+
+ NewRegs.push_back(TmpR0);
+ NewRegs.push_back(TmpR1);
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineFunction &MF = *B.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MFI = *MF.getFrameInfo();
+ auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned SrcR = MI->getOperand(2).getReg();
+ unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::subreg_loreg);
+ unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::subreg_hireg);
+ bool IsKill = MI->getOperand(2).isKill();
+
+ assert(MI->getOperand(0).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(0).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+ unsigned Size = RC->getSize();
+ unsigned NeedAlign = RC->getAlignment();
+ unsigned HasAlign = MFI.getObjectAlignment(FI);
+ unsigned StoreOpc;
+
+ // Store low part.
+ if (NeedAlign <= HasAlign)
+ StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32b_ai_128B;
+ else
+ StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(StoreOpc))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(SrcLo, getKillRegState(IsKill))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Store high part.
+ if (NeedAlign <= MinAlign(HasAlign, Size))
+ StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32b_ai_128B;
+ else
+ StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(StoreOpc))
+ .addFrameIndex(FI)
+ .addImm(Size)
+ .addReg(SrcHi, getKillRegState(IsKill))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineFunction &MF = *B.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MFI = *MF.getFrameInfo();
+ auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned DstR = MI->getOperand(0).getReg();
+ unsigned DstHi = HRI.getSubReg(DstR, Hexagon::subreg_hireg);
+ unsigned DstLo = HRI.getSubReg(DstR, Hexagon::subreg_loreg);
+
+ assert(MI->getOperand(1).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(1).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+ unsigned Size = RC->getSize();
+ unsigned NeedAlign = RC->getAlignment();
+ unsigned HasAlign = MFI.getObjectAlignment(FI);
+ unsigned LoadOpc;
+
+ // Load low part.
+ if (NeedAlign <= HasAlign)
+ LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32b_ai_128B;
+ else
+ LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(LoadOpc), DstLo)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ // Load high part.
+ if (NeedAlign <= MinAlign(HasAlign, Size))
+ LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32b_ai_128B;
+ else
+ LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(LoadOpc), DstHi)
+ .addFrameIndex(FI)
+ .addImm(Size)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ B.erase(It);
+ return true;
+}
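A note on the alignment logic in the two vector-pair expansions above: the low half is accessed at offset 0 with the stack object's own alignment, while the high half sits at offset Size, so its effective alignment is MinAlign(HasAlign, Size). The standalone sketch below uses a local stand-in for MinAlign (the largest power of two dividing both values) and shows how the effective alignment picks the aligned or unaligned vector opcode; it is an illustration, not the LLVM helper.

    #include <iostream>

    // Local stand-in for MinAlign: the largest power of two dividing both
    // values, i.e. the lowest set bit of A | B.
    unsigned minAlign(unsigned A, unsigned B) {
      unsigned C = A | B;
      return C & ~(C - 1);
    }

    // Mirrors the selection above: aligned vector store if the effective
    // alignment is sufficient, unaligned store otherwise.
    const char *storeOpcode(unsigned NeedAlign, unsigned EffAlign, bool Is128B) {
      if (NeedAlign <= EffAlign)
        return Is128B ? "V6_vS32b_ai_128B" : "V6_vS32b_ai";
      return Is128B ? "V6_vS32Ub_ai_128B" : "V6_vS32Ub_ai";
    }

    int main() {
      unsigned Size = 64, NeedAlign = 64;           // 64-byte HVX mode
      for (unsigned HasAlign : {64u, 8u}) {
        // Low half at offset 0 uses the slot's own alignment; the high half
        // at offset Size uses minAlign(HasAlign, Size).
        std::cout << "slot align " << HasAlign << ": low  "
                  << storeOpcode(NeedAlign, HasAlign, false) << '\n'
                  << "slot align " << HasAlign << ": high "
                  << storeOpcode(NeedAlign, minAlign(HasAlign, Size), false)
                  << '\n';
      }
    }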
+
+bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineFunction &MF = *B.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MFI = *MF.getFrameInfo();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned SrcR = MI->getOperand(2).getReg();
+ bool IsKill = MI->getOperand(2).isKill();
+
+ assert(MI->getOperand(0).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(0).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+
+ unsigned NeedAlign = RC->getAlignment();
+ unsigned HasAlign = MFI.getObjectAlignment(FI);
+ unsigned StoreOpc;
+
+ if (NeedAlign <= HasAlign)
+ StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32b_ai_128B;
+ else
+ StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(StoreOpc))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addReg(SrcR, getKillRegState(IsKill))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ B.erase(It);
+ return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
+ MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+ const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+ MachineFunction &MF = *B.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MFI = *MF.getFrameInfo();
+ MachineInstr *MI = &*It;
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned DstR = MI->getOperand(0).getReg();
+
+ assert(MI->getOperand(1).isFI() && "Expect a frame index");
+ int FI = MI->getOperand(1).getIndex();
+
+ bool Is128B = HST.useHVXDblOps();
+ auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+ : &Hexagon::VectorRegs128BRegClass;
+
+ unsigned NeedAlign = RC->getAlignment();
+ unsigned HasAlign = MFI.getObjectAlignment(FI);
+ unsigned LoadOpc;
+
+ if (NeedAlign <= HasAlign)
+ LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32b_ai_128B;
+ else
+ LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+ BuildMI(B, It, DL, HII.get(LoadOpc), DstR)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ B.erase(It);
+ return true;
+}
+
+
+bool HexagonFrameLowering::expandSpillMacros(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &NewRegs) const {
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool Changed = false;
+
+ for (auto &B : MF) {
+ // Traverse the basic block.
+ MachineBasicBlock::iterator NextI;
+ for (auto I = B.begin(), E = B.end(); I != E; I = NextI) {
+ MachineInstr *MI = &*I;
+ NextI = std::next(I);
+ unsigned Opc = MI->getOpcode();
+
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ Changed |= expandCopy(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::STriw_pred:
+ case Hexagon::STriw_mod:
+ Changed |= expandStoreInt(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::LDriw_pred:
+ case Hexagon::LDriw_mod:
+ Changed |= expandLoadInt(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::STriq_pred_V6:
+ case Hexagon::STriq_pred_V6_128B:
+ Changed |= expandStoreVecPred(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::LDriq_pred_V6:
+ case Hexagon::LDriq_pred_V6_128B:
+ Changed |= expandLoadVecPred(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::LDrivv_pseudo_V6:
+ case Hexagon::LDrivv_pseudo_V6_128B:
+ Changed |= expandLoadVec2(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::STrivv_pseudo_V6:
+ case Hexagon::STrivv_pseudo_V6_128B:
+ Changed |= expandStoreVec2(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::STriv_pseudo_V6:
+ case Hexagon::STriv_pseudo_V6_128B:
+ Changed |= expandStoreVec(B, I, MRI, HII, NewRegs);
+ break;
+ case Hexagon::LDriv_pseudo_V6:
+ case Hexagon::LDriv_pseudo_V6_128B:
+ Changed |= expandLoadVec(B, I, MRI, HII, NewRegs);
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+
+void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HRI = *HST.getRegisterInfo();
+
+ SavedRegs.resize(HRI.getNumRegs());
+
+ // If we have a function containing __builtin_eh_return we want to spill and
+ // restore all callee saved registers. Pretend that they are used.
+ if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
+ for (const MCPhysReg *R = HRI.getCalleeSavedRegs(&MF); *R; ++R)
+ SavedRegs.set(*R);
+
+ // Replace predicate register pseudo spill code.
+ SmallVector<unsigned,8> NewRegs;
+ expandSpillMacros(MF, NewRegs);
+ if (OptimizeSpillSlots && !isOptNone(MF))
+ optimizeSpillSlots(MF, NewRegs);
+
+ // We need to reserve a spill slot if scavenging could potentially require
+ // spilling a scavenged register.
+ if (!NewRegs.empty()) {
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SetVector<const TargetRegisterClass*> SpillRCs;
+ // Always reserve a slot for an integer register: it may be needed to hold
+ // the stack offset when the offset does not fit into a spill instruction.
+ SpillRCs.insert(&Hexagon::IntRegsRegClass);
+
+ for (unsigned VR : NewRegs)
+ SpillRCs.insert(MRI.getRegClass(VR));
+
+ for (auto *RC : SpillRCs) {
+ if (!needToReserveScavengingSpillSlots(MF, HRI, RC))
+ continue;
+ unsigned Num = RC == &Hexagon::IntRegsRegClass ? NumberScavengerSlots : 1;
+ unsigned S = RC->getSize(), A = RC->getAlignment();
+ for (unsigned i = 0; i < Num; i++) {
+ int NewFI = MFI.CreateSpillStackObject(S, A);
+ RS->addScavengingFrameIndex(NewFI);
+ }
+ }
+ }
+
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+}
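A rough sketch of the slot-reservation rule above, not part of the patch: the integer register class always reserves NumberScavengerSlots scavenging slots, and every other class that appears among the newly created virtual registers reserves one. The numeric value below is a placeholder assumption; the real value comes from the NumberScavengerSlots option defined earlier in this file.

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical value, for illustration only; the real number comes from
    // the NumberScavengerSlots command-line option in HexagonFrameLowering.cpp.
    static const unsigned NumberScavengerSlots = 2;

    // One scavenging slot per spilled register class, except the integer
    // class, which reserves NumberScavengerSlots of them.
    unsigned scavengingSlotsFor(const std::string &RC) {
      return RC == "IntRegs" ? NumberScavengerSlots : 1;
    }

    int main() {
      std::vector<std::string> SpilledClasses = {"IntRegs", "VectorRegs",
                                                 "VectorRegs128B"};
      for (const std::string &RC : SpilledClasses)
        std::cout << RC << ": " << scavengingSlotsFor(RC) << " slot(s)\n";
    }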
+
+
+unsigned HexagonFrameLowering::findPhysReg(MachineFunction &MF,
+ HexagonBlockRanges::IndexRange &FIR,
+ HexagonBlockRanges::InstrIndexMap &IndexMap,
+ HexagonBlockRanges::RegToRangeMap &DeadMap,
+ const TargetRegisterClass *RC) const {
+ auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ auto &MRI = MF.getRegInfo();
+
+ auto isDead = [&FIR,&DeadMap] (unsigned Reg) -> bool {
+ auto F = DeadMap.find({Reg,0});
+ if (F == DeadMap.end())
+ return false;
+ for (auto &DR : F->second)
+ if (DR.contains(FIR))
+ return true;
+ return false;
+ };
+
+ for (unsigned Reg : RC->getRawAllocationOrder(MF)) {
+ bool Dead = true;
+ for (auto R : HexagonBlockRanges::expandToSubRegs({Reg,0}, MRI, HRI)) {
+ if (isDead(R.Reg))
+ continue;
+ Dead = false;
+ break;
+ }
+ if (Dead)
+ return Reg;
+ }
+ return 0;
+}
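A minimal standalone model of the search above, with invented register names and plain integer intervals in place of HexagonBlockRanges: a candidate from the allocation order is returned only if every one of its sub-registers is dead across the entire range of interest.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Closed interval of instruction indices.
    struct Range {
      int Start, End;
      bool contains(const Range &R) const {
        return Start <= R.Start && R.End <= End;
      }
    };

    using DeadMap = std::map<std::string, std::vector<Range>>;

    // A register unit is acceptable only if one of its dead ranges covers the
    // whole range of interest.
    bool isDeadOver(const std::string &Unit, const Range &FIR, const DeadMap &DM) {
      auto F = DM.find(Unit);
      if (F == DM.end())
        return false;
      for (const Range &DR : F->second)
        if (DR.contains(FIR))
          return true;
      return false;
    }

    struct Candidate {
      std::string Reg;
      std::vector<std::string> Units;  // sub-registers, or the register itself
    };

    std::string findPhysReg(const std::vector<Candidate> &Order, const Range &FIR,
                            const DeadMap &DM) {
      for (const Candidate &C : Order) {
        bool Dead = true;
        for (const std::string &U : C.Units)
          if (!isDeadOver(U, FIR, DM)) {
            Dead = false;
            break;
          }
        if (Dead)
          return C.Reg;
      }
      return "";  // corresponds to returning 0 above: no suitable register
    }

    int main() {
      DeadMap DM = {{"R6", {{2, 9}}}, {"R7", {{0, 4}}}};
      Range FIR = {3, 8};
      std::vector<Candidate> Order = {{"R7", {"R7"}}, {"R6", {"R6"}},
                                      {"D3", {"R6", "R7"}}};
      std::cout << findPhysReg(Order, FIR, DM) << '\n';  // prints "R6"
    }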
+
+void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &VRegs) const {
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ auto &HRI = *HST.getRegisterInfo();
+ auto &MRI = MF.getRegInfo();
+ HexagonBlockRanges HBR(MF);
+
+ typedef std::map<MachineBasicBlock*,HexagonBlockRanges::InstrIndexMap>
+ BlockIndexMap;
+ typedef std::map<MachineBasicBlock*,HexagonBlockRanges::RangeList>
+ BlockRangeMap;
+ typedef HexagonBlockRanges::IndexType IndexType;
+
+ struct SlotInfo {
+ BlockRangeMap Map;
+ unsigned Size;
+ const TargetRegisterClass *RC;
+
+ SlotInfo() : Map(), Size(0), RC(nullptr) {}
+ };
+
+ BlockIndexMap BlockIndexes;
+ SmallSet<int,4> BadFIs;
+ std::map<int,SlotInfo> FIRangeMap;
+
+ auto getRegClass = [&MRI,&HRI] (HexagonBlockRanges::RegisterRef R)
+ -> const TargetRegisterClass* {
+ if (TargetRegisterInfo::isPhysicalRegister(R.Reg))
+ assert(R.Sub == 0);
+ if (TargetRegisterInfo::isVirtualRegister(R.Reg)) {
+ auto *RCR = MRI.getRegClass(R.Reg);
+ if (R.Sub == 0)
+ return RCR;
+ unsigned PR = *RCR->begin();
+ R.Reg = HRI.getSubReg(PR, R.Sub);
+ }
+ return HRI.getMinimalPhysRegClass(R.Reg);
+ };
+ // Accumulate register classes: get a common class for a pre-existing
+ // class HaveRC and a new class NewRC. Return nullptr if a common class
+ // cannot be found, otherwise return the resulting class. If HaveRC is
+ // nullptr, assume that it is still unset.
+ auto getCommonRC = [&HRI] (const TargetRegisterClass *HaveRC,
+ const TargetRegisterClass *NewRC)
+ -> const TargetRegisterClass* {
+ if (HaveRC == nullptr || HaveRC == NewRC)
+ return NewRC;
+ // Different classes, both non-null. Pick the more general one.
+ if (HaveRC->hasSubClassEq(NewRC))
+ return HaveRC;
+ if (NewRC->hasSubClassEq(HaveRC))
+ return NewRC;
+ return nullptr;
+ };
+
+ // Scan all blocks in the function. Check all occurrences of frame indexes,
+ // and collect relevant information.
+ for (auto &B : MF) {
+ std::map<int,IndexType> LastStore, LastLoad;
+ // Emplace appears not to be supported in gcc 4.7.2-4.
+ //auto P = BlockIndexes.emplace(&B, HexagonBlockRanges::InstrIndexMap(B));
+ auto P = BlockIndexes.insert(
+ std::make_pair(&B, HexagonBlockRanges::InstrIndexMap(B)));
+ auto &IndexMap = P.first->second;
+ DEBUG(dbgs() << "Index map for BB#" << B.getNumber() << "\n"
+ << IndexMap << '\n');
+
+ for (auto &In : B) {
+ int LFI, SFI;
+ bool Load = HII.isLoadFromStackSlot(In, LFI) && !HII.isPredicated(In);
+ bool Store = HII.isStoreToStackSlot(In, SFI) && !HII.isPredicated(In);
+ if (Load && Store) {
+ // If it's both a load and a store, then we won't handle it.
+ BadFIs.insert(LFI);
+ BadFIs.insert(SFI);
+ continue;
+ }
+ // Check the register class of the register used as the source of the
+ // store, or as the destination of the load.
+ // Also, only accept base+imm_offset addressing modes. Other addressing
+ // modes can have side-effects (post-increments, etc.). For stack
+ // slots they are very unlikely, so there is not much loss due to
+ // this restriction.
+ if (Load || Store) {
+ int TFI = Load ? LFI : SFI;
+ unsigned AM = HII.getAddrMode(&In);
+ SlotInfo &SI = FIRangeMap[TFI];
+ bool Bad = (AM != HexagonII::BaseImmOffset);
+ if (!Bad) {
+ // If the addressing mode is ok, check the register class.
+ const TargetRegisterClass *RC = nullptr;
+ if (Load) {
+ MachineOperand &DataOp = In.getOperand(0);
+ RC = getRegClass({DataOp.getReg(), DataOp.getSubReg()});
+ } else {
+ MachineOperand &DataOp = In.getOperand(2);
+ RC = getRegClass({DataOp.getReg(), DataOp.getSubReg()});
+ }
+ RC = getCommonRC(SI.RC, RC);
+ if (RC == nullptr)
+ Bad = true;
+ else
+ SI.RC = RC;
+ }
+ if (!Bad) {
+ // Check sizes.
+ unsigned S = (1U << (HII.getMemAccessSize(&In) - 1));
+ if (SI.Size != 0 && SI.Size != S)
+ Bad = true;
+ else
+ SI.Size = S;
+ }
+ if (Bad)
+ BadFIs.insert(TFI);
+ }
+
+ // Locate uses of frame indices.
+ for (unsigned i = 0, n = In.getNumOperands(); i < n; ++i) {
+ const MachineOperand &Op = In.getOperand(i);
+ if (!Op.isFI())
+ continue;
+ int FI = Op.getIndex();
+ // Make sure that the following operand is an immediate and that
+ // it is 0. This is the offset in the stack object.
+ if (i+1 >= n || !In.getOperand(i+1).isImm() ||
+ In.getOperand(i+1).getImm() != 0)
+ BadFIs.insert(FI);
+ if (BadFIs.count(FI))
+ continue;
+
+ IndexType Index = IndexMap.getIndex(&In);
+ if (Load) {
+ if (LastStore[FI] == IndexType::None)
+ LastStore[FI] = IndexType::Entry;
+ LastLoad[FI] = Index;
+ } else if (Store) {
+ HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+ if (LastStore[FI] != IndexType::None)
+ RL.add(LastStore[FI], LastLoad[FI], false, false);
+ else if (LastLoad[FI] != IndexType::None)
+ RL.add(IndexType::Entry, LastLoad[FI], false, false);
+ LastLoad[FI] = IndexType::None;
+ LastStore[FI] = Index;
+ } else {
+ BadFIs.insert(FI);
+ }
+ }
+ }
+
+ for (auto &I : LastLoad) {
+ IndexType LL = I.second;
+ if (LL == IndexType::None)
+ continue;
+ auto &RL = FIRangeMap[I.first].Map[&B];
+ IndexType &LS = LastStore[I.first];
+ if (LS != IndexType::None)
+ RL.add(LS, LL, false, false);
+ else
+ RL.add(IndexType::Entry, LL, false, false);
+ LS = IndexType::None;
+ }
+ for (auto &I : LastStore) {
+ IndexType LS = I.second;
+ if (LS == IndexType::None)
+ continue;
+ auto &RL = FIRangeMap[I.first].Map[&B];
+ RL.add(LS, IndexType::None, false, false);
+ }
+ }
+
+ DEBUG({
+ for (auto &P : FIRangeMap) {
+ dbgs() << "fi#" << P.first;
+ if (BadFIs.count(P.first))
+ dbgs() << " (bad)";
+ dbgs() << " RC: ";
+ if (P.second.RC != nullptr)
+ dbgs() << HRI.getRegClassName(P.second.RC) << '\n';
+ else
+ dbgs() << "<null>\n";
+ for (auto &R : P.second.Map)
+ dbgs() << " BB#" << R.first->getNumber() << " { " << R.second << "}\n";
+ }
+ });
+
+ // When a slot is loaded from in a block without being stored to in the
+ // same block, it is live-on-entry to this block. To avoid CFG analysis,
+ // consider this slot to be live-on-exit from all blocks.
+ SmallSet<int,4> LoxFIs;
+
+ std::map<MachineBasicBlock*,std::vector<int>> BlockFIMap;
+
+ for (auto &P : FIRangeMap) {
+ // P = pair(FI, map: BB->RangeList)
+ if (BadFIs.count(P.first))
+ continue;
+ for (auto &B : MF) {
+ auto F = P.second.Map.find(&B);
+ // F = pair(BB, RangeList)
+ if (F == P.second.Map.end() || F->second.empty())
+ continue;
+ HexagonBlockRanges::IndexRange &IR = F->second.front();
+ if (IR.start() == IndexType::Entry)
+ LoxFIs.insert(P.first);
+ BlockFIMap[&B].push_back(P.first);
+ }
+ }
+
+ DEBUG({
+ dbgs() << "Block-to-FI map (* -- live-on-exit):\n";
+ for (auto &P : BlockFIMap) {
+ auto &FIs = P.second;
+ if (FIs.empty())
+ continue;
+ dbgs() << " BB#" << P.first->getNumber() << ": {";
+ for (auto I : FIs) {
+ dbgs() << " fi#" << I;
+ if (LoxFIs.count(I))
+ dbgs() << '*';
+ }
+ dbgs() << " }\n";
+ }
+ });
+
+ // Eliminate the loads; once all loads from a slot have been eliminated,
+ // the stores to it can be eliminated as well.
+ for (auto &B : MF) {
+ auto F = BlockIndexes.find(&B);
+ assert(F != BlockIndexes.end());
+ HexagonBlockRanges::InstrIndexMap &IM = F->second;
+ HexagonBlockRanges::RegToRangeMap LM = HBR.computeLiveMap(IM);
+ HexagonBlockRanges::RegToRangeMap DM = HBR.computeDeadMap(IM, LM);
+ DEBUG(dbgs() << "BB#" << B.getNumber() << " dead map\n"
+ << HexagonBlockRanges::PrintRangeMap(DM, HRI));
+
+ for (auto FI : BlockFIMap[&B]) {
+ if (BadFIs.count(FI))
+ continue;
+ DEBUG(dbgs() << "Working on fi#" << FI << '\n');
+ HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+ for (auto &Range : RL) {
+ DEBUG(dbgs() << "--Examining range:" << RL << '\n');
+ if (!IndexType::isInstr(Range.start()) ||
+ !IndexType::isInstr(Range.end()))
+ continue;
+ MachineInstr *SI = IM.getInstr(Range.start());
+ MachineInstr *EI = IM.getInstr(Range.end());
+ assert(SI->mayStore() && "Unexpected start instruction");
+ assert(EI->mayLoad() && "Unexpected end instruction");
+ MachineOperand &SrcOp = SI->getOperand(2);
+
+ HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(),
+ SrcOp.getSubReg() };
+ auto *RC = getRegClass({SrcOp.getReg(), SrcOp.getSubReg()});
+ // The this-> is needed to unconfuse MSVC.
+ unsigned FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
+ DEBUG(dbgs() << "Replacement reg:" << PrintReg(FoundR, &HRI) << '\n');
+ if (FoundR == 0)
+ continue;
+
+ // Generate the copy-in: "FoundR = COPY SrcR" at the store location.
+ MachineBasicBlock::iterator StartIt = SI, NextIt;
+ MachineInstr *CopyIn = nullptr;
+ if (SrcRR.Reg != FoundR || SrcRR.Sub != 0) {
+ const DebugLoc &DL = SI->getDebugLoc();
+ CopyIn = BuildMI(B, StartIt, DL, HII.get(TargetOpcode::COPY), FoundR)
+ .addOperand(SrcOp);
+ }
+
+ ++StartIt;
+ // Check if this is the last store to the slot and the FI is live-on-exit.
+ if (LoxFIs.count(FI) && (&Range == &RL.back())) {
+ // Update store's source register.
+ if (unsigned SR = SrcOp.getSubReg())
+ SrcOp.setReg(HRI.getSubReg(FoundR, SR));
+ else
+ SrcOp.setReg(FoundR);
+ SrcOp.setSubReg(0);
+ // We are keeping this register live.
+ SrcOp.setIsKill(false);
+ } else {
+ B.erase(SI);
+ IM.replaceInstr(SI, CopyIn);
+ }
+
+ auto EndIt = std::next(MachineBasicBlock::iterator(EI));
+ for (auto It = StartIt; It != EndIt; It = NextIt) {
+ MachineInstr *MI = &*It;
+ NextIt = std::next(It);
+ int TFI;
+ if (!HII.isLoadFromStackSlot(*MI, TFI) || TFI != FI)
+ continue;
+ unsigned DstR = MI->getOperand(0).getReg();
+ assert(MI->getOperand(0).getSubReg() == 0);
+ MachineInstr *CopyOut = nullptr;
+ if (DstR != FoundR) {
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned MemSize = (1U << (HII.getMemAccessSize(MI) - 1));
+ assert(HII.getAddrMode(MI) == HexagonII::BaseImmOffset);
+ unsigned CopyOpc = TargetOpcode::COPY;
+ if (HII.isSignExtendingLoad(*MI))
+ CopyOpc = (MemSize == 1) ? Hexagon::A2_sxtb : Hexagon::A2_sxth;
+ else if (HII.isZeroExtendingLoad(*MI))
+ CopyOpc = (MemSize == 1) ? Hexagon::A2_zxtb : Hexagon::A2_zxth;
+ CopyOut = BuildMI(B, It, DL, HII.get(CopyOpc), DstR)
+ .addReg(FoundR, getKillRegState(MI == EI));
+ }
+ IM.replaceInstr(MI, CopyOut);
+ B.erase(It);
+ }
+
+ // Update the dead map.
+ HexagonBlockRanges::RegisterRef FoundRR = { FoundR, 0 };
+ for (auto RR : HexagonBlockRanges::expandToSubRegs(FoundRR, MRI, HRI))
+ DM[RR].subtract(Range);
+ } // for Range in range list
+ }
+ }
+}
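To make the per-block bookkeeping in optimizeSpillSlots concrete, here is a heavily simplified standalone model of the range building: a store to a stack slot opens a candidate range, subsequent loads extend it, and the range is closed at the next store or at the end of the block. It deliberately ignores the Entry/None index handling, predication, and the live-on-exit case that the real code deals with.

    #include <iostream>
    #include <map>
    #include <utility>
    #include <vector>

    struct Access { int FI; bool IsStore; int Index; };

    int main() {
      // store fi#0 @1, load @3, load @5, store @7, load @8
      std::vector<Access> Block = {
          {0, true, 1}, {0, false, 3}, {0, false, 5}, {0, true, 7}, {0, false, 8}};

      std::map<int, int> LastStore, LastLoad;  // FI -> instruction index, 0 = none
      std::map<int, std::vector<std::pair<int, int>>> Ranges;

      for (const Access &A : Block) {
        if (!A.IsStore) {                       // a load extends the open range
          LastLoad[A.FI] = A.Index;
          continue;
        }
        if (LastStore[A.FI] && LastLoad[A.FI])  // a store closes the previous one
          Ranges[A.FI].push_back({LastStore[A.FI], LastLoad[A.FI]});
        LastLoad[A.FI] = 0;
        LastStore[A.FI] = A.Index;
      }
      if (LastStore[0] && LastLoad[0])          // close the range at block end
        Ranges[0].push_back({LastStore[0], LastLoad[0]});

      for (const auto &R : Ranges[0])
        std::cout << "fi#0: [" << R.first << ", " << R.second << "]\n";
      // Prints [1, 5] and [7, 8]: each range runs from a store to its last
      // load, and is a candidate for living in a free physical register.
    }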
+
+
void HexagonFrameLowering::expandAlloca(MachineInstr *AI,
const HexagonInstrInfo &HII, unsigned SP, unsigned CF) const {
MachineBasicBlock &MB = *AI->getParent();
@@ -1407,15 +2257,13 @@ const MachineInstr *HexagonFrameLowering::getAlignaInstr(
}
-// FIXME: Use Function::optForSize().
-inline static bool isOptSize(const MachineFunction &MF) {
- AttributeSet AF = MF.getFunction()->getAttributes();
- return AF.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize);
-}
-
-inline static bool isMinSize(const MachineFunction &MF) {
- return MF.getFunction()->optForMinSize();
+/// Adds all callee-saved registers as implicit uses or defs to the
+/// instruction.
+void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
+ const CSIVect &CSI, bool IsDef, bool IsKill) const {
+ // Add the callee-saved registers as implicit operands (uses or defs).
+ for (auto &R : CSI)
+ MI->addOperand(MachineOperand::CreateReg(R.getReg(), IsDef, true, IsKill));
}
@@ -1472,7 +2320,18 @@ bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
const CSIVect &CSI) const {
if (shouldInlineCSR(MF, CSI))
return false;
+ // The restore functions do a bit more than just restoring registers.
+ // The non-returning versions will go back directly to the caller's
+ // caller, others will clean up the stack frame in preparation for
+ // a tail call. Using them can still save code size even if only one
+ // register is getting restored. Make the decision based on -Oz:
+ // with -Os, a single register is restored inline.
+ if (isMinSize(MF))
+ return true;
unsigned NumCSI = CSI.size();
+ if (NumCSI <= 1)
+ return false;
+
unsigned Threshold = isOptSize(MF) ? SpillFuncThresholdOs-1
: SpillFuncThreshold;
return Threshold < NumCSI;
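A compact sketch of the decision above, not the actual implementation: it omits the shouldInlineCSR check and uses hypothetical threshold values in place of the SpillFuncThreshold/SpillFuncThresholdOs options defined elsewhere in this file.

    #include <iostream>

    bool useRestoreFunction(bool MinSize, bool OptSize, unsigned NumCSI,
                            unsigned ThresholdOs = 2, unsigned Threshold = 3) {
      if (MinSize)                 // -Oz: always tail-call the restore routine
        return true;
      if (NumCSI <= 1)             // a single register is restored inline
        return false;
      unsigned T = OptSize ? ThresholdOs - 1 : Threshold;
      return T < NumCSI;
    }

    int main() {
      std::cout << useRestoreFunction(true,  false, 1) << '\n';  // 1: -Oz
      std::cout << useRestoreFunction(false, true,  1) << '\n';  // 0: single CSR
      std::cout << useRestoreFunction(false, false, 4) << '\n';  // 1: above threshold
    }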
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 683b303d43ea..3e76214559b7 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
#include "Hexagon.h"
+#include "HexagonBlockRanges.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -41,22 +42,23 @@ public:
return true;
}
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
+ RegScavenger *RS = nullptr) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
- RegScavenger *RS) const override;
+ RegScavenger *RS) const override;
bool targetHandlesStackFrameRounding() const override {
return true;
}
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ unsigned &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override;
const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
- const override {
+ const override {
static const SpillSlot Offsets[] = {
{ Hexagon::R17, -4 }, { Hexagon::R16, -8 }, { Hexagon::D8, -8 },
{ Hexagon::R19, -12 }, { Hexagon::R18, -16 }, { Hexagon::D9, -16 },
@@ -83,22 +85,61 @@ private:
void expandAlloca(MachineInstr *AI, const HexagonInstrInfo &TII,
unsigned SP, unsigned CF) const;
- void insertPrologueInBlock(MachineBasicBlock &MBB) const;
+ void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const;
void insertEpilogueInBlock(MachineBasicBlock &MBB) const;
bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
- const HexagonRegisterInfo &HRI) const;
+ const HexagonRegisterInfo &HRI, bool &PrologueStubs) const;
bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
const HexagonRegisterInfo &HRI) const;
+ bool updateExitPaths(MachineBasicBlock &MBB, MachineBasicBlock *RestoreB,
+ BitVector &DoneT, BitVector &DoneF, BitVector &Path) const;
void insertCFIInstructionsAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator At) const;
void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const;
- bool replacePredRegPseudoSpillCode(MachineFunction &MF) const;
- bool replaceVecPredRegPseudoSpillCode(MachineFunction &MF) const;
+
+ bool expandCopy(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandStoreVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandLoadVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+ MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+ bool expandSpillMacros(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &NewRegs) const;
+
+ unsigned findPhysReg(MachineFunction &MF, HexagonBlockRanges::IndexRange &FIR,
+ HexagonBlockRanges::InstrIndexMap &IndexMap,
+ HexagonBlockRanges::RegToRangeMap &DeadMap,
+ const TargetRegisterClass *RC) const;
+ void optimizeSpillSlots(MachineFunction &MF,
+ SmallVectorImpl<unsigned> &VRegs) const;
void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB,
MachineBasicBlock *&EpilogB) const;
+ void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI,
+ bool IsDef, bool IsKill) const;
bool shouldInlineCSR(llvm::MachineFunction &MF, const CSIVect &CSI) const;
bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index f26e2ff764d7..f46b6d2a82e3 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -242,6 +242,9 @@ bool HexagonGenExtract::visitBlock(BasicBlock *B) {
bool HexagonGenExtract::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
bool Changed;
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index 64a2b6cec18a..71d079193d79 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -9,11 +9,8 @@
#define DEBUG_TYPE "hexinsert"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -21,10 +18,12 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -33,7 +32,6 @@
#include "HexagonTargetMachine.h"
#include "HexagonBitTracker.h"
-#include <map>
#include <vector>
using namespace llvm;
@@ -1446,7 +1444,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
bool AllDead = true;
SmallVector<unsigned,2> Regs;
- for (ConstMIOperands Op(MI); Op.isValid(); ++Op) {
+ for (ConstMIOperands Op(*MI); Op.isValid(); ++Op) {
if (!Op->isReg() || !Op->isDef())
continue;
unsigned R = Op->getReg();
@@ -1471,6 +1469,9 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail;
bool Changed = false;
TimerGroup __G("hexinsert");
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index c059d566709e..bb9256db4b48 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -49,6 +49,10 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
const HexagonInstrInfo *HII;
@@ -70,10 +74,10 @@ namespace {
MachineOperand *SrcT, *SrcF;
MachineInstr *Def1, *Def2;
MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR,
- MachineOperand *TOp, MachineOperand *FOp,
- MachineInstr *D1, MachineInstr *D2)
- : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(D1),
- Def2(D2) {}
+ MachineOperand *TOp, MachineOperand *FOp, MachineInstr &D1,
+ MachineInstr &D2)
+ : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(&D1),
+ Def2(&D2) {}
};
typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap;
typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap;
@@ -128,7 +132,7 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs,
expandReg(*R++, Uses);
// Look over all operands, and collect explicit defs and uses.
- for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) {
+ for (ConstMIOperands Mo(*MI); Mo.isValid(); ++Mo) {
if (!Mo->isReg() || Mo->isImplicit())
continue;
unsigned R = Mo->getReg();
@@ -258,8 +262,8 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin();
std::advance(It1, MinX);
std::advance(It2, MaxX);
- MachineInstr *Def1 = It1, *Def2 = It2;
- MachineOperand *Src1 = &Def1->getOperand(2), *Src2 = &Def2->getOperand(2);
+ MachineInstr &Def1 = *It1, &Def2 = *It2;
+ MachineOperand *Src1 = &Def1.getOperand(2), *Src2 = &Def2.getOperand(2);
unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
bool Failure = false, CanUp = true, CanDown = true;
@@ -305,6 +309,8 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
}
bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
bool Changed = false;
@@ -316,4 +322,3 @@ bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) {
FunctionPass *llvm::createHexagonGenMux() {
return new HexagonGenMux();
}
-
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index d9675b5173d2..dcfd3e8317a9 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -9,24 +9,22 @@
#define DEBUG_TYPE "gen-pred"
+#include "HexagonTargetMachine.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "HexagonTargetMachine.h"
+#include "llvm/Target/TargetMachine.h"
#include <functional>
#include <queue>
#include <set>
-#include <vector>
using namespace llvm;
@@ -157,7 +155,7 @@ unsigned HexagonGenPredicate::getPredForm(unsigned Opc) {
// The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here
// to denote "none", but we need to make sure that none of the valid opcodes
// that we return will ever be 0.
- assert(PHI == 0 && "Use different value for <none>");
+ static_assert(PHI == 0, "Use different value for <none>");
return 0;
}
@@ -332,7 +330,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) {
case Hexagon::C4_or_orn:
case Hexagon::C2_xor:
// Add operands to the queue.
- for (ConstMIOperands Mo(DefI); Mo.isValid(); ++Mo)
+ for (ConstMIOperands Mo(*DefI); Mo.isValid(); ++Mo)
if (Mo->isReg() && Mo->isUse())
WorkQ.push(Register(Mo->getReg()));
break;
@@ -449,13 +447,12 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
// the convertible instruction is converted, its predicate result will be
// copied back into the original gpr.
- for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) {
- MachineBasicBlock &B = *A;
- for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
- if (I->getOpcode() != TargetOpcode::COPY)
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != TargetOpcode::COPY)
continue;
- Register DR = I->getOperand(0);
- Register SR = I->getOperand(1);
+ Register DR = MI.getOperand(0);
+ Register SR = MI.getOperand(1);
if (!TargetRegisterInfo::isVirtualRegister(DR.R))
continue;
if (!TargetRegisterInfo::isVirtualRegister(SR.R))
@@ -466,7 +463,7 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
continue;
assert(!DR.S && !SR.S && "Unexpected subregister");
MRI->replaceRegWith(DR.R, SR.R);
- Erase.insert(I);
+ Erase.insert(&MI);
Changed = true;
}
}
@@ -479,6 +476,9 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
MRI = &MF.getRegInfo();
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index d20a809d6c09..cc154c4be012 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -346,6 +346,8 @@ FunctionPass *llvm::createHexagonHardwareLoops() {
bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+ if (skipFunction(*MF.getFunction()))
+ return false;
bool Changed = false;
@@ -434,7 +436,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
SmallVector<MachineOperand,2> Cond;
MachineBasicBlock *TB = nullptr, *FB = nullptr;
- bool NotAnalyzed = TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+ bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
if (NotAnalyzed)
return false;
@@ -448,8 +450,8 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
unsigned CmpReg1 = 0, CmpReg2 = 0;
int CmpImm = 0, CmpMask = 0;
- bool CmpAnalyzed = TII->analyzeCompare(PredI, CmpReg1, CmpReg2,
- CmpMask, CmpImm);
+ bool CmpAnalyzed =
+ TII->analyzeCompare(*PredI, CmpReg1, CmpReg2, CmpMask, CmpImm);
// Fail if the compare was not analyzed, or it's not comparing a register
// with an immediate value. Not checking the mask here, since we handle
// the individual compare opcodes (including A4_cmpb*) later on.
@@ -581,7 +583,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
SmallVector<MachineOperand,2> Cond;
MachineBasicBlock *TB = nullptr, *FB = nullptr;
- bool NotAnalyzed = TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+ bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
if (NotAnalyzed)
return nullptr;
@@ -593,7 +595,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) {
MachineBasicBlock *LTB = 0, *LFB = 0;
SmallVector<MachineOperand,2> LCond;
- bool NotAnalyzed = TII->AnalyzeBranch(*Latch, LTB, LFB, LCond, false);
+ bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
if (NotAnalyzed)
return nullptr;
if (TB == Latch)
@@ -618,8 +620,8 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
unsigned CmpReg1 = 0, CmpReg2 = 0;
int Mask = 0, ImmValue = 0;
- bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2,
- Mask, ImmValue);
+ bool AnalyzedCmp =
+ TII->analyzeCompare(*CondI, CmpReg1, CmpReg2, Mask, ImmValue);
if (!AnalyzedCmp)
return nullptr;
@@ -1184,7 +1186,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
MachineBasicBlock *TB = 0, *FB = 0;
SmallVector<MachineOperand, 2> Cond;
- if (TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false))
+ if (TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false))
return false;
if (L->contains(TB))
@@ -1418,12 +1420,12 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
unsigned CmpReg1 = 0, CmpReg2 = 0;
int CmpMask = 0, CmpValue = 0;
- if (!TII->analyzeCompare(MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
+ if (!TII->analyzeCompare(*MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
continue;
MachineBasicBlock *TBB = 0, *FBB = 0;
SmallVector<MachineOperand, 2> Cond;
- if (TII->AnalyzeBranch(*MI->getParent(), TBB, FBB, Cond, false))
+ if (TII->analyzeBranch(*MI->getParent(), TBB, FBB, Cond, false))
continue;
Comparison::Kind Cmp = getComparisonKind(MI->getOpcode(), 0, 0, 0);
@@ -1619,14 +1621,14 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
MachineBasicBlock *TB = nullptr, *FB = nullptr;
SmallVector<MachineOperand,2> Cond;
// AnalyzeBranch returns true if it fails to analyze branch.
- bool NotAnalyzed = TII->AnalyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+ bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
if (NotAnalyzed || Cond.empty())
return false;
if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) {
MachineBasicBlock *LTB = 0, *LFB = 0;
SmallVector<MachineOperand,2> LCond;
- bool NotAnalyzed = TII->AnalyzeBranch(*Latch, LTB, LFB, LCond, false);
+ bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
if (NotAnalyzed)
return false;
@@ -1837,12 +1839,12 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
SmallVector<MachineOperand,2> Tmp1;
MachineBasicBlock *TB = nullptr, *FB = nullptr;
- if (TII->AnalyzeBranch(*ExitingBlock, TB, FB, Tmp1, false))
+ if (TII->analyzeBranch(*ExitingBlock, TB, FB, Tmp1, false))
return nullptr;
for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
MachineBasicBlock *PB = *I;
- bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false);
+ bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp1, false);
if (NotAnalyzed)
return nullptr;
}
@@ -1928,7 +1930,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
MachineBasicBlock *PB = *I;
if (PB != Latch) {
Tmp2.clear();
- bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp2, false);
+ bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp2, false);
(void)NotAnalyzed; // suppress compiler warning
assert (!NotAnalyzed && "Should be analyzable!");
if (TB != Header && (Tmp2.empty() || FB != Header))
@@ -1940,7 +1942,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
// It can happen that the latch block will fall through into the header.
// Insert an unconditional branch to the header.
TB = FB = nullptr;
- bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false);
+ bool LatchNotAnalyzed = TII->analyzeBranch(*Latch, TB, FB, Tmp2, false);
(void)LatchNotAnalyzed; // suppress compiler warning
assert (!LatchNotAnalyzed && "Should be analyzable!");
if (!TB && !FB)
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index a0da945e7572..22247aa39b61 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -15,13 +15,11 @@
#include "HexagonISelLowering.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -38,17 +36,13 @@ MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders",
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
-namespace llvm {
- void initializeHexagonDAGToDAGISelPass(PassRegistry&);
-}
-
//===--------------------------------------------------------------------===//
/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine
/// instructions for SelectionDAG operations.
///
namespace {
class HexagonDAGToDAGISel : public SelectionDAGISel {
- const HexagonTargetMachine& HTM;
+ const HexagonTargetMachine &HTM;
const HexagonSubtarget *HST;
const HexagonInstrInfo *HII;
const HexagonRegisterInfo *HRI;
@@ -56,9 +50,7 @@ public:
explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), HTM(tm), HST(nullptr), HII(nullptr),
- HRI(nullptr) {
- initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
- }
+ HRI(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
@@ -72,7 +64,7 @@ public:
virtual void PreprocessISelDAG() override;
virtual void EmitFunctionEntryCode() override;
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
// Complex Pattern Selectors.
inline bool SelectAddrGA(SDValue &N, SDValue &R);
@@ -84,36 +76,41 @@ public:
return "Hexagon DAG->DAG Pattern Instruction Selection";
}
- SDNode *SelectFrameIndex(SDNode *N);
+ // Generate a machine instruction node corresponding to the circ/brev
+ // load intrinsic.
+ MachineSDNode *LoadInstrForLoadIntrinsic(SDNode *IntN);
+ // Given the circ/brev load intrinsic and the already generated machine
+ // instruction, generate the appropriate store (that is a part of the
+ // intrinsic's functionality).
+ SDNode *StoreInstrForLoadIntrinsic(MachineSDNode *LoadN, SDNode *IntN);
+
+ void SelectFrameIndex(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
- SDNode *SelectLoad(SDNode *N);
- SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl);
- SDNode *SelectIndexedLoad(LoadSDNode *LD, SDLoc dl);
- SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode,
- SDLoc dl);
- SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode,
- SDLoc dl);
- SDNode *SelectBaseOffsetStore(StoreSDNode *ST, SDLoc dl);
- SDNode *SelectIndexedStore(StoreSDNode *ST, SDLoc dl);
- SDNode *SelectStore(SDNode *N);
- SDNode *SelectSHL(SDNode *N);
- SDNode *SelectMul(SDNode *N);
- SDNode *SelectZeroExtend(SDNode *N);
- SDNode *SelectIntrinsicWChain(SDNode *N);
- SDNode *SelectIntrinsicWOChain(SDNode *N);
- SDNode *SelectConstant(SDNode *N);
- SDNode *SelectConstantFP(SDNode *N);
- SDNode *SelectAdd(SDNode *N);
- SDNode *SelectBitOp(SDNode *N);
+ bool tryLoadOfLoadIntrinsic(LoadSDNode *N);
+ void SelectLoad(SDNode *N);
+ void SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl);
+ void SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl);
+ void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
+ void SelectStore(SDNode *N);
+ void SelectSHL(SDNode *N);
+ void SelectMul(SDNode *N);
+ void SelectZeroExtend(SDNode *N);
+ void SelectIntrinsicWChain(SDNode *N);
+ void SelectIntrinsicWOChain(SDNode *N);
+ void SelectConstant(SDNode *N);
+ void SelectConstantFP(SDNode *N);
+ void SelectAdd(SDNode *N);
+ void SelectBitcast(SDNode *N);
+ void SelectBitOp(SDNode *N);
// XformMskToBitPosU5Imm - Returns the bit position which
// the single bit 32 bit mask represents.
// Used in Clr and Set bit immediate memops.
- SDValue XformMskToBitPosU5Imm(uint32_t Imm, SDLoc DL) {
+ SDValue XformMskToBitPosU5Imm(uint32_t Imm, const SDLoc &DL) {
int32_t bitPos;
bitPos = Log2_32(Imm);
assert(bitPos >= 0 && bitPos < 32 &&
@@ -123,13 +120,13 @@ public:
// XformMskToBitPosU4Imm - Returns the bit position which the single-bit
// 16 bit mask represents. Used in Clr and Set bit immediate memops.
- SDValue XformMskToBitPosU4Imm(uint16_t Imm, SDLoc DL) {
+ SDValue XformMskToBitPosU4Imm(uint16_t Imm, const SDLoc &DL) {
return XformMskToBitPosU5Imm(Imm, DL);
}
// XformMskToBitPosU3Imm - Returns the bit position which the single-bit
// 8 bit mask represents. Used in Clr and Set bit immediate memops.
- SDValue XformMskToBitPosU3Imm(uint8_t Imm, SDLoc DL) {
+ SDValue XformMskToBitPosU3Imm(uint8_t Imm, const SDLoc &DL) {
return XformMskToBitPosU5Imm(Imm, DL);
}
@@ -142,36 +139,36 @@ public:
// XformM5ToU5Imm - Return a target constant with the specified value, of
// type i32 where the negative literal is transformed into a positive literal
// for use in -= memops.
- inline SDValue XformM5ToU5Imm(signed Imm, SDLoc DL) {
- assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops");
- return CurDAG->getTargetConstant(-Imm, DL, MVT::i32);
+ inline SDValue XformM5ToU5Imm(signed Imm, const SDLoc &DL) {
+ assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops");
+ return CurDAG->getTargetConstant(-Imm, DL, MVT::i32);
}
// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range
// [1..128], used in cmpb.gtu instructions.
- inline SDValue XformU7ToU7M1Imm(signed Imm, SDLoc DL) {
+ inline SDValue XformU7ToU7M1Imm(signed Imm, const SDLoc &DL) {
assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op");
return CurDAG->getTargetConstant(Imm - 1, DL, MVT::i8);
}
// XformS8ToS8M1Imm - Return a target constant decremented by 1.
- inline SDValue XformSToSM1Imm(signed Imm, SDLoc DL) {
+ inline SDValue XformSToSM1Imm(signed Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm - 1, DL, MVT::i32);
}
// XformU8ToU8M1Imm - Return a target constant decremented by 1.
- inline SDValue XformUToUM1Imm(unsigned Imm, SDLoc DL) {
+ inline SDValue XformUToUM1Imm(unsigned Imm, const SDLoc &DL) {
assert((Imm >= 1) && "Cannot decrement unsigned int less than 1");
return CurDAG->getTargetConstant(Imm - 1, DL, MVT::i32);
}
// XformSToSM2Imm - Return a target constant decremented by 2.
- inline SDValue XformSToSM2Imm(unsigned Imm, SDLoc DL) {
+ inline SDValue XformSToSM2Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm - 2, DL, MVT::i32);
}
// XformSToSM3Imm - Return a target constant decremented by 3.
- inline SDValue XformSToSM3Imm(unsigned Imm, SDLoc DL) {
+ inline SDValue XformSToSM3Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm - 3, DL, MVT::i32);
}
@@ -180,6 +177,8 @@ public:
private:
bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src);
+ bool orIsAdd(const SDNode *N) const;
+ bool isAlignedMemNode(const MemSDNode *N) const;
}; // end HexagonDAGToDAGISel
} // end anonymous namespace
@@ -194,18 +193,6 @@ FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
}
}
-static void initializePassOnce(PassRegistry &Registry) {
- const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection";
- PassInfo *PI = new PassInfo(Name, "hexagon-isel",
- &SelectionDAGISel::ID, nullptr, false, false);
- Registry.registerPass(*PI, true);
-}
-
-void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializePassOnce)
-}
-
-
// Intrinsics that return a predicate.
static bool doesIntrinsicReturnPredicate(unsigned ID) {
switch (ID) {
@@ -251,127 +238,11 @@ static bool doesIntrinsicReturnPredicate(unsigned ID) {
}
}
-SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
- unsigned Opcode,
- SDLoc dl) {
- SDValue Chain = LD->getChain();
- EVT LoadedVT = LD->getMemoryVT();
- SDValue Base = LD->getBasePtr();
- SDValue Offset = LD->getOffset();
- SDNode *OffsetNode = Offset.getNode();
- int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
-
- if (HII->isValidAutoIncImm(LoadedVT, Val)) {
- SDValue TargetConst = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
- MVT::Other, Base, TargetConst,
- Chain);
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
- SDValue(Result_1, 0));
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2) };
- const SDValue Tos[] = { SDValue(Result_2, 0),
- SDValue(Result_1, 1),
- SDValue(Result_1, 2) };
- ReplaceUses(Froms, Tos, 3);
- return Result_2;
- }
-
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other,
- Base, TargetConst0, Chain);
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
- SDValue(Result_1, 0));
- SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
- Base, TargetConstVal,
- SDValue(Result_1, 1));
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2) };
- const SDValue Tos[] = { SDValue(Result_2, 0),
- SDValue(Result_3, 0),
- SDValue(Result_1, 1) };
- ReplaceUses(Froms, Tos, 3);
- return Result_2;
-}
-
-
-SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
- unsigned Opcode,
- SDLoc dl) {
- SDValue Chain = LD->getChain();
- EVT LoadedVT = LD->getMemoryVT();
- SDValue Base = LD->getBasePtr();
- SDValue Offset = LD->getOffset();
- SDNode *OffsetNode = Offset.getNode();
- int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
-
- if (HII->isValidAutoIncImm(LoadedVT, Val)) {
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
- MVT::i32, MVT::Other, Base,
- TargetConstVal, Chain);
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A4_combineir, dl,
- MVT::i64, MVT::Other,
- TargetConst0,
- SDValue(Result_1,0));
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2) };
- const SDValue Tos[] = { SDValue(Result_2, 0),
- SDValue(Result_1, 1),
- SDValue(Result_1, 2) };
- ReplaceUses(Froms, Tos, 3);
- return Result_2;
- }
-
- // Generate an indirect load.
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
- MVT::Other, Base, TargetConst0,
- Chain);
- SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A4_combineir, dl,
- MVT::i64, MVT::Other,
- TargetConst0,
- SDValue(Result_1,0));
- // Add offset to base.
- SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
- Base, TargetConstVal,
- SDValue(Result_1, 1));
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2) };
- const SDValue Tos[] = { SDValue(Result_2, 0), // Load value.
- SDValue(Result_3, 0), // New address.
- SDValue(Result_1, 1) };
- ReplaceUses(Froms, Tos, 3);
- return Result_2;
-}
-
-
-SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
+void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Offset = LD->getOffset();
- SDNode *OffsetNode = Offset.getNode();
- // Get the constant value.
- int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+ int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
EVT LoadedVT = LD->getMemoryVT();
unsigned Opcode = 0;
@@ -379,232 +250,394 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
// loads.
ISD::LoadExtType ExtType = LD->getExtensionType();
bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD);
- bool HasVecOffset = false;
+ bool IsValidInc = HII->isValidAutoIncImm(LoadedVT, Inc);
- // Figure out the opcode.
- if (LoadedVT == MVT::i64) {
- if (HII->isValidAutoIncImm(LoadedVT, Val))
- Opcode = Hexagon::L2_loadrd_pi;
+ assert(LoadedVT.isSimple());
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ if (IsZeroExt)
+ Opcode = IsValidInc ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrub_io;
else
- Opcode = Hexagon::L2_loadrd_io;
- } else if (LoadedVT == MVT::i32) {
- if (HII->isValidAutoIncImm(LoadedVT, Val))
- Opcode = Hexagon::L2_loadri_pi;
- else
- Opcode = Hexagon::L2_loadri_io;
- } else if (LoadedVT == MVT::i16) {
- if (HII->isValidAutoIncImm(LoadedVT, Val))
- Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi;
- else
- Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io;
- } else if (LoadedVT == MVT::i8) {
- if (HII->isValidAutoIncImm(LoadedVT, Val))
- Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi;
+ Opcode = IsValidInc ? Hexagon::L2_loadrb_pi : Hexagon::L2_loadrb_io;
+ break;
+ case MVT::i16:
+ if (IsZeroExt)
+ Opcode = IsValidInc ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadruh_io;
else
- Opcode = IsZeroExt ? Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io;
- } else if (LoadedVT == MVT::v16i32 || LoadedVT == MVT::v8i64 ||
- LoadedVT == MVT::v32i16 || LoadedVT == MVT::v64i8) {
- HasVecOffset = true;
- if (HII->isValidAutoIncImm(LoadedVT, Val)) {
- Opcode = Hexagon::V6_vL32b_pi;
- }
+ Opcode = IsValidInc ? Hexagon::L2_loadrh_pi : Hexagon::L2_loadrh_io;
+ break;
+ case MVT::i32:
+ Opcode = IsValidInc ? Hexagon::L2_loadri_pi : Hexagon::L2_loadri_io;
+ break;
+ case MVT::i64:
+ Opcode = IsValidInc ? Hexagon::L2_loadrd_pi : Hexagon::L2_loadrd_io;
+ break;
+ // 64B
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (isAlignedMemNode(LD))
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai;
else
- Opcode = Hexagon::V6_vL32b_ai;
+ Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai;
+ break;
// 128B
- } else if (LoadedVT == MVT::v32i32 || LoadedVT == MVT::v16i64 ||
- LoadedVT == MVT::v64i16 || LoadedVT == MVT::v128i8) {
- HasVecOffset = true;
- if (HII->isValidAutoIncImm(LoadedVT, Val)) {
- Opcode = Hexagon::V6_vL32b_pi_128B;
- }
+ case MVT::v128i8:
+ case MVT::v64i16:
+ case MVT::v32i32:
+ case MVT::v16i64:
+ if (isAlignedMemNode(LD))
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B
+ : Hexagon::V6_vL32b_ai_128B;
else
- Opcode = Hexagon::V6_vL32b_ai_128B;
- } else
- llvm_unreachable("unknown memory type");
-
- // For zero extended i64 loads, we need to add combine instructions.
- if (LD->getValueType(0) == MVT::i64 && IsZeroExt)
- return SelectIndexedLoadZeroExtend64(LD, Opcode, dl);
- // Handle sign extended i64 loads.
- if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD)
- return SelectIndexedLoadSignExtend64(LD, Opcode, dl);
-
- if (HII->isValidAutoIncImm(LoadedVT, Val)) {
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
- LD->getValueType(0),
- MVT::i32, MVT::Other, Base,
- TargetConstVal, Chain);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- if (HasVecOffset) {
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 2)
- };
- const SDValue Tos[] = { SDValue(Result, 0),
- SDValue(Result, 2)
- };
- ReplaceUses(Froms, Tos, 2);
- } else {
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2)
- };
- const SDValue Tos[] = { SDValue(Result, 0),
- SDValue(Result, 1),
- SDValue(Result, 2)
- };
- ReplaceUses(Froms, Tos, 3);
+ Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B
+ : Hexagon::V6_vL32Ub_ai_128B;
+ break;
+ default:
+ llvm_unreachable("Unexpected memory type in indexed load");
+ }
+
+ SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LD->getMemOperand();
+
+ auto getExt64 = [this,ExtType] (MachineSDNode *N, const SDLoc &dl)
+ -> MachineSDNode* {
+ if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ return CurDAG->getMachineNode(Hexagon::A4_combineir, dl, MVT::i64,
+ Zero, SDValue(N, 0));
}
- return Result;
+ if (ExtType == ISD::SEXTLOAD)
+ return CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
+ SDValue(N, 0));
+ return N;
+ };
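+ // In other words (descriptive note): a zero- or any-extending i64 load pairs
+ // the i32 result with #0 via A4_combineir, while a sign-extending one is
+ // widened with A2_sxtw; non-extending loads are returned unchanged.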
+
+ // Loaded value Next address Chain
+ SDValue From[3] = { SDValue(LD,0), SDValue(LD,1), SDValue(LD,2) };
+ SDValue To[3];
+
+ EVT ValueVT = LD->getValueType(0);
+ if (ValueVT == MVT::i64 && ExtType != ISD::NON_EXTLOAD) {
+ // A load extending to i64 will actually produce i32, which will then
+ // need to be extended to i64.
+ assert(LoadedVT.getSizeInBits() <= 32);
+ ValueVT = MVT::i32;
+ }
+
+ if (IsValidInc) {
+ MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT,
+ MVT::i32, MVT::Other, Base,
+ IncV, Chain);
+ L->setMemRefs(MemOp, MemOp+1);
+ To[1] = SDValue(L, 1); // Next address.
+ To[2] = SDValue(L, 2); // Chain.
+ // Handle special case for extension to i64.
+ if (LD->getValueType(0) == MVT::i64)
+ L = getExt64(L, dl);
+ To[0] = SDValue(L, 0); // Loaded (extended) value.
} else {
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl,
- LD->getValueType(0),
- MVT::Other, Base, TargetConst0,
- Chain);
- SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
- Base, TargetConstVal,
- SDValue(Result_1, 1));
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LD->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2)
- };
- const SDValue Tos[] = { SDValue(Result_1, 0),
- SDValue(Result_2, 0),
- SDValue(Result_1, 1)
- };
- ReplaceUses(Froms, Tos, 3);
- return Result_1;
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT, MVT::Other,
+ Base, Zero, Chain);
+ L->setMemRefs(MemOp, MemOp+1);
+ To[2] = SDValue(L, 1); // Chain.
+ MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+ Base, IncV);
+ To[1] = SDValue(A, 0); // Next address.
+ // Handle special case for extension to i64.
+ if (LD->getValueType(0) == MVT::i64)
+ L = getExt64(L, dl);
+ To[0] = SDValue(L, 0); // Loaded (extended) value.
}
+ ReplaceUses(From, To, 3);
+ CurDAG->RemoveDeadNode(LD);
}
-SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
- SDNode *result;
+MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
+ if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return nullptr;
+
+ SDLoc dl(IntN);
+ unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+
+ static std::map<unsigned,unsigned> LoadPciMap = {
+ { Intrinsic::hexagon_circ_ldb, Hexagon::L2_loadrb_pci },
+ { Intrinsic::hexagon_circ_ldub, Hexagon::L2_loadrub_pci },
+ { Intrinsic::hexagon_circ_ldh, Hexagon::L2_loadrh_pci },
+ { Intrinsic::hexagon_circ_lduh, Hexagon::L2_loadruh_pci },
+ { Intrinsic::hexagon_circ_ldw, Hexagon::L2_loadri_pci },
+ { Intrinsic::hexagon_circ_ldd, Hexagon::L2_loadrd_pci },
+ };
+ auto FLC = LoadPciMap.find(IntNo);
+ if (FLC != LoadPciMap.end()) {
+ SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+ IntN->getOperand(4));
+ EVT ValTy = (IntNo == Intrinsic::hexagon_circ_ldd) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands: { Base, Increment, Modifier, Chain }
+ auto Inc = cast<ConstantSDNode>(IntN->getOperand(5));
+ SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), dl, MVT::i32);
+ MachineSDNode *Res = CurDAG->getMachineNode(FLC->second, dl, RTys,
+ { IntN->getOperand(2), I, SDValue(Mod,0), IntN->getOperand(0) });
+ return Res;
+ }
+
+ static std::map<unsigned,unsigned> LoadPbrMap = {
+ { Intrinsic::hexagon_brev_ldb, Hexagon::L2_loadrb_pbr },
+ { Intrinsic::hexagon_brev_ldub, Hexagon::L2_loadrub_pbr },
+ { Intrinsic::hexagon_brev_ldh, Hexagon::L2_loadrh_pbr },
+ { Intrinsic::hexagon_brev_lduh, Hexagon::L2_loadruh_pbr },
+ { Intrinsic::hexagon_brev_ldw, Hexagon::L2_loadri_pbr },
+ { Intrinsic::hexagon_brev_ldd, Hexagon::L2_loadrd_pbr },
+ };
+ auto FLB = LoadPbrMap.find(IntNo);
+ if (FLB != LoadPbrMap.end()) {
+ SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+ IntN->getOperand(4));
+ EVT ValTy = (IntNo == Intrinsic::hexagon_brev_ldd) ? MVT::i64 : MVT::i32;
+ EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+ // Operands: { Base, Modifier, Chain }
+ MachineSDNode *Res = CurDAG->getMachineNode(FLB->second, dl, RTys,
+ { IntN->getOperand(2), SDValue(Mod,0), IntN->getOperand(0) });
+ return Res;
+ }
+
+ return nullptr;
+}
+
+SDNode *HexagonDAGToDAGISel::StoreInstrForLoadIntrinsic(MachineSDNode *LoadN,
+ SDNode *IntN) {
+ // The "LoadN" is just a machine load instruction. The intrinsic also
+ // involves storing it. Generate an appropriate store to the location
+ // given in the intrinsic's operand(3).
+ uint64_t F = HII->get(LoadN->getMachineOpcode()).TSFlags;
+ unsigned SizeBits = (F >> HexagonII::MemAccessSizePos) &
+ HexagonII::MemAccesSizeMask;
+ unsigned Size = 1U << (SizeBits-1);
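+ // SizeBits is the encoded access width, so Size works out to 1, 2, 4 or 8
+ // bytes (assuming the usual byte/half/word/doubleword encoding of 1..4).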
+
+ SDLoc dl(IntN);
+ MachinePointerInfo PI;
+ SDValue TS;
+ SDValue Loc = IntN->getOperand(3);
+
+ if (Size >= 4)
+ TS = CurDAG->getStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, PI,
+ Size);
+ else
+ TS = CurDAG->getTruncStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc,
+ PI, MVT::getIntegerVT(Size * 8), Size);
+
+ SDNode *StoreN;
+ {
+ HandleSDNode Handle(TS);
+ SelectStore(TS.getNode());
+ StoreN = Handle.getValue().getNode();
+ }
+
+ // Load's results are { Loaded value, Updated pointer, Chain }
+ ReplaceUses(SDValue(IntN, 0), SDValue(LoadN, 1));
+ ReplaceUses(SDValue(IntN, 1), SDValue(StoreN, 0));
+ return StoreN;
+}
+
+bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
+ // The intrinsics for load circ/brev perform two operations:
+ // 1. Load a value V from the specified location, using the addressing
+ // mode corresponding to the intrinsic.
+ // 2. Store V into a specified location. This location is typically a
+ // local, temporary object.
+ // In many cases, the program using these intrinsics will immediately
+ // load V again from the local object. In those cases, when certain
+ // conditions are met, the last load can be removed.
+ // This function identifies and optimizes this pattern. If the pattern
+ // cannot be optimized, it returns false, which will cause the load
+ // to be selected separately from the intrinsic (which will be handled
+ // in SelectIntrinsicWChain).
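+ // Rough source-level shape of the pattern (a sketch; the builtin name is
+ // hypothetical):
+ //   int Tmp;                      // local temporary passed to the intrinsic
+ //   __circ_load(..., &Tmp, ...);  // intrinsic loads V and stores it to Tmp
+ //   use(Tmp);                     // this reload of Tmp is the load folded here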
+
+ SDValue Ch = N->getOperand(0);
+ SDValue Loc = N->getOperand(1);
+
+ // Assume that the load and the intrinsic are connected directly with a
+ // chain:
+ // t1: i32,ch = int.load ..., ..., ..., Loc, ... // <-- C
+ // t2: i32,ch = load t1:1, Loc, ...
+ SDNode *C = Ch.getNode();
+
+ if (C->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ // The second load can only be eliminated if its extension type matches
+ // that of the load instruction corresponding to the intrinsic. The user
+ // can provide an address of an unsigned variable to store the result of
+ // a sign-extending intrinsic into (or the other way around).
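+ // For example, if the zero-extending circ_ldub result is stored to a signed
+ // char and then reloaded with a sign-extending load, the extension types
+ // differ and the fold below is skipped.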
+ ISD::LoadExtType IntExt;
+ switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
+ case Intrinsic::hexagon_brev_ldub:
+ case Intrinsic::hexagon_brev_lduh:
+ case Intrinsic::hexagon_circ_ldub:
+ case Intrinsic::hexagon_circ_lduh:
+ IntExt = ISD::ZEXTLOAD;
+ break;
+ case Intrinsic::hexagon_brev_ldw:
+ case Intrinsic::hexagon_brev_ldd:
+ case Intrinsic::hexagon_circ_ldw:
+ case Intrinsic::hexagon_circ_ldd:
+ IntExt = ISD::NON_EXTLOAD;
+ break;
+ default:
+ IntExt = ISD::SEXTLOAD;
+ break;
+ }
+ if (N->getExtensionType() != IntExt)
+ return false;
+
+ // Make sure the target location for the loaded value in the load intrinsic
+ // is the location from which LD (or N) is loading.
+ if (C->getNumOperands() < 4 || Loc.getNode() != C->getOperand(3).getNode())
+ return false;
+
+ if (MachineSDNode *L = LoadInstrForLoadIntrinsic(C)) {
+ SDNode *S = StoreInstrForLoadIntrinsic(L, C);
+ SDValue F[] = { SDValue(N,0), SDValue(N,1), SDValue(C,0), SDValue(C,1) };
+ SDValue T[] = { SDValue(L,0), SDValue(S,0), SDValue(L,1), SDValue(S,0) };
+ ReplaceUses(F, T, array_lengthof(T));
+ // This transformation will leave the intrinsic dead. If it remains in
+ // the DAG, the selection code will see it again, but without the load,
+ // and it will again generate the store that the intrinsic normally requires.
+ CurDAG->RemoveDeadNode(C);
+ return true;
+ }
+
+ return false;
+}
+
+void HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
// Handle indexed loads.
if (AM != ISD::UNINDEXED) {
- result = SelectIndexedLoad(LD, dl);
- } else {
- result = SelectCode(LD);
+ SelectIndexedLoad(LD, dl);
+ return;
}
- return result;
-}
+ // Handle patterns using circ/brev load intrinsics.
+ if (tryLoadOfLoadIntrinsic(LD))
+ return;
+ SelectCode(LD);
+}
-SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
+void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
SDValue Chain = ST->getChain();
SDValue Base = ST->getBasePtr();
SDValue Offset = ST->getOffset();
SDValue Value = ST->getValue();
- SDNode *OffsetNode = Offset.getNode();
// Get the constant value.
- int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+ int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
EVT StoredVT = ST->getMemoryVT();
EVT ValueVT = Value.getValueType();
- // Offset value must be within representable range
- // and must have correct alignment properties.
- if (HII->isValidAutoIncImm(StoredVT, Val)) {
- unsigned Opcode = 0;
-
- // Figure out the post inc version of opcode.
- if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_pi;
- else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi;
- else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi;
- else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi;
- else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 ||
- StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) {
- Opcode = Hexagon::V6_vS32b_pi;
- }
- // 128B
- else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 ||
- StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) {
- Opcode = Hexagon::V6_vS32b_pi_128B;
- } else llvm_unreachable("unknown memory type");
-
- if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) {
- assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store");
- Value = CurDAG->getTargetExtractSubreg(Hexagon::subreg_loreg,
- dl, MVT::i32, Value);
- }
- SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, dl, MVT::i32), Value,
- Chain};
- // Build post increment store.
- SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
- MVT::Other, Ops);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = ST->getMemOperand();
- cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
-
- ReplaceUses(ST, Result);
- ReplaceUses(SDValue(ST,1), SDValue(Result,1));
- return Result;
- }
-
- // Note: Order of operands matches the def of instruction:
- // def S2_storerd_io
- // : STInst<(outs), (ins IntRegs:$base, imm:$offset, DoubleRegs:$src1), ...
- // and it differs for POST_ST* for instance.
- SDValue Ops[] = { Base, CurDAG->getTargetConstant(0, dl, MVT::i32), Value,
- Chain};
+ bool IsValidInc = HII->isValidAutoIncImm(StoredVT, Inc);
unsigned Opcode = 0;
- // Figure out the opcode.
- if (StoredVT == MVT::i64) Opcode = Hexagon::S2_storerd_io;
- else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io;
- else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io;
- else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io;
- else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 ||
- StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8)
- Opcode = Hexagon::V6_vS32b_ai;
+ assert(StoredVT.isSimple());
+ switch (StoredVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ Opcode = IsValidInc ? Hexagon::S2_storerb_pi : Hexagon::S2_storerb_io;
+ break;
+ case MVT::i16:
+ Opcode = IsValidInc ? Hexagon::S2_storerh_pi : Hexagon::S2_storerh_io;
+ break;
+ case MVT::i32:
+ Opcode = IsValidInc ? Hexagon::S2_storeri_pi : Hexagon::S2_storeri_io;
+ break;
+ case MVT::i64:
+ Opcode = IsValidInc ? Hexagon::S2_storerd_pi : Hexagon::S2_storerd_io;
+ break;
+ // 64B
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (isAlignedMemNode(ST))
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai;
+ break;
// 128B
- else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 ||
- StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8)
- Opcode = Hexagon::V6_vS32b_ai_128B;
- else llvm_unreachable("unknown memory type");
-
- // Build regular store.
- SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
- SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
- // Build splitted incriment instruction.
- SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
- Base,
- TargetConstVal,
- SDValue(Result_1, 0));
+ case MVT::v128i8:
+ case MVT::v64i16:
+ case MVT::v32i32:
+ case MVT::v16i64:
+ if (isAlignedMemNode(ST))
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B
+ : Hexagon::V6_vS32b_ai_128B;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B
+ : Hexagon::V6_vS32Ub_ai_128B;
+ break;
+ default:
+ llvm_unreachable("Unexpected memory type in indexed store");
+ }
+
+ if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) {
+ assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store");
+ Value = CurDAG->getTargetExtractSubreg(Hexagon::subreg_loreg,
+ dl, MVT::i32, Value);
+ }
+
+ SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = ST->getMemOperand();
- cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
- ReplaceUses(SDValue(ST,0), SDValue(Result_2,0));
- ReplaceUses(SDValue(ST,1), SDValue(Result_1,0));
- return Result_2;
+ // Next address Chain
+ SDValue From[2] = { SDValue(ST,0), SDValue(ST,1) };
+ SDValue To[2];
+
+ if (IsValidInc) {
+ // Build post increment store.
+ SDValue Ops[] = { Base, IncV, Value, Chain };
+ MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other,
+ Ops);
+ S->setMemRefs(MemOp, MemOp + 1);
+ To[0] = SDValue(S, 0);
+ To[1] = SDValue(S, 1);
+ } else {
+ SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Ops[] = { Base, Zero, Value, Chain };
+ MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ S->setMemRefs(MemOp, MemOp + 1);
+ To[1] = SDValue(S, 0);
+ MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+ Base, IncV);
+ To[0] = SDValue(A, 0);
+ }
+
+ ReplaceUses(From, To, 2);
+ CurDAG->RemoveDeadNode(ST);
}
-SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
+void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
ISD::MemIndexedMode AM = ST->getAddressingMode();
// Handle indexed stores.
if (AM != ISD::UNINDEXED) {
- return SelectIndexedStore(ST, dl);
+ SelectIndexedStore(ST, dl);
+ return;
}
- return SelectCode(ST);
+ SelectCode(ST);
}
-SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
+void HexagonDAGToDAGISel::SelectMul(SDNode *N) {
SDLoc dl(N);
//
@@ -629,7 +662,8 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Sext0 = MulOp0.getOperand(0);
if (Sext0.getNode()->getValueType(0) != MVT::i32) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
OP0 = Sext0;
@@ -638,7 +672,8 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
if (LD->getMemoryVT() != MVT::i32 ||
LD->getExtensionType() != ISD::SEXTLOAD ||
LD->getAddressingMode() != ISD::UNINDEXED) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
SDValue Chain = LD->getChain();
@@ -648,14 +683,16 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
LD->getBasePtr(), TargetConst0,
Chain), 0);
} else {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
// Same goes for the second operand.
if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Sext1 = MulOp1.getOperand(0);
if (Sext1.getNode()->getValueType(0) != MVT::i32) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
OP1 = Sext1;
@@ -664,7 +701,8 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
if (LD->getMemoryVT() != MVT::i32 ||
LD->getExtensionType() != ISD::SEXTLOAD ||
LD->getAddressingMode() != ISD::UNINDEXED) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
SDValue Chain = LD->getChain();
@@ -674,20 +712,21 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
LD->getBasePtr(), TargetConst0,
Chain), 0);
} else {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
// Generate a mpy instruction.
SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl, MVT::i64,
OP0, OP1);
- ReplaceUses(N, Result);
- return Result;
+ ReplaceNode(N, Result);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
-SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
+void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
SDLoc dl(N);
if (N->getValueType(0) == MVT::i32) {
SDValue Shl_0 = N->getOperand(0);
@@ -711,8 +750,8 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
SDNode* Result =
CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
MVT::i32, Mul_0, Val);
- ReplaceUses(N, Result);
- return Result;
+ ReplaceNode(N, Result);
+ return;
}
}
@@ -740,8 +779,8 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
SDNode* Result =
CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl, MVT::i32,
Shl2_0, Val);
- ReplaceUses(N, Result);
- return Result;
+ ReplaceNode(N, Result);
+ return;
}
}
}
@@ -750,7 +789,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
}
}
}
- return SelectCode(N);
+ SelectCode(N);
}
@@ -764,7 +803,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
// compiler. Architecture defines them as 8-bit registers.
// We want to preserve all of the lower 8 bits, not just the LSB.
//
-SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
+void HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
@@ -790,11 +829,14 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
SDValue(Mask,0), SDValue(OnesReg,0));
SDValue SubR = CurDAG->getTargetConstant(Hexagon::subreg_loreg, dl,
MVT::i32);
- return CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
- SDValue(And,0), SubR);
+ ReplaceNode(N, CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
+ SDValue(And, 0), SubR));
+ return;
}
- return CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
- SDValue(Mask,0), SDValue(OnesReg,0));
+ ReplaceNode(N,
+ CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
+ SDValue(Mask, 0), SDValue(OnesReg, 0)));
+ return;
}
SDNode *IsIntrinsic = N->getOperand(0).getNode();
@@ -816,225 +858,37 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
MVT::i64, MVT::Other,
SDValue(Result_2, 0),
SDValue(Result_1, 0));
- ReplaceUses(N, Result_3);
- return Result_3;
+ ReplaceNode(N, Result_3);
+ return;
}
if (N->getValueType(0) == MVT::i32) {
// Convert the zero_extend to Rs = Pd
SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
MVT::i32,
SDValue(IsIntrinsic, 0));
- ReplaceUses(N, RsPd);
- return RsPd;
+ ReplaceNode(N, RsPd);
+ return;
}
llvm_unreachable("Unexpected value type");
}
}
- return SelectCode(N);
+ SelectCode(N);
}
+
//
-// Checking for intrinsics circular load/store, and bitreverse load/store
-// instrisics in order to select the correct lowered operation.
+// Handling intrinsics for circular load and bitreverse load.
//
-SDNode *HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- if (IntNo == Intrinsic::hexagon_circ_ldd ||
- IntNo == Intrinsic::hexagon_circ_ldw ||
- IntNo == Intrinsic::hexagon_circ_lduh ||
- IntNo == Intrinsic::hexagon_circ_ldh ||
- IntNo == Intrinsic::hexagon_circ_ldub ||
- IntNo == Intrinsic::hexagon_circ_ldb) {
- SDLoc dl(N);
- SDValue Chain = N->getOperand(0);
- SDValue Base = N->getOperand(2);
- SDValue Load = N->getOperand(3);
- SDValue ModifierExpr = N->getOperand(4);
- SDValue Offset = N->getOperand(5);
-
- // We need to add the rerurn type for the load. This intrinsic has
- // two return types, one for the load and one for the post-increment.
- // Only the *_ld instructions push the extra return type, and bump the
- // result node operand number correspondingly.
- std::vector<EVT> ResTys;
- unsigned opc;
- unsigned memsize, align;
- MVT MvtSize = MVT::i32;
-
- if (IntNo == Intrinsic::hexagon_circ_ldd) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i64);
- opc = Hexagon::L2_loadrd_pci_pseudo;
- memsize = 8;
- align = 8;
- } else if (IntNo == Intrinsic::hexagon_circ_ldw) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadri_pci_pseudo;
- memsize = 4;
- align = 4;
- } else if (IntNo == Intrinsic::hexagon_circ_ldh) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrh_pci_pseudo;
- memsize = 2;
- align = 2;
- MvtSize = MVT::i16;
- } else if (IntNo == Intrinsic::hexagon_circ_lduh) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadruh_pci_pseudo;
- memsize = 2;
- align = 2;
- MvtSize = MVT::i16;
- } else if (IntNo == Intrinsic::hexagon_circ_ldb) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrb_pci_pseudo;
- memsize = 1;
- align = 1;
- MvtSize = MVT::i8;
- } else if (IntNo == Intrinsic::hexagon_circ_ldub) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrub_pci_pseudo;
- memsize = 1;
- align = 1;
- MvtSize = MVT::i8;
- } else
- llvm_unreachable("no opc");
-
- ResTys.push_back(MVT::Other);
-
- // Copy over the arguments, which are the same mostly.
- SmallVector<SDValue, 5> Ops;
- Ops.push_back(Base);
- Ops.push_back(Load);
- Ops.push_back(ModifierExpr);
- int32_t Val = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
- Ops.push_back(CurDAG->getTargetConstant(Val, dl, MVT::i32));
- Ops.push_back(Chain);
- SDNode* Result = CurDAG->getMachineNode(opc, dl, ResTys, Ops);
-
- SDValue ST;
- MachineMemOperand *Mem =
- MF->getMachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, memsize, align);
- if (MvtSize != MVT::i32)
- ST = CurDAG->getTruncStore(Chain, dl, SDValue(Result, 1), Load,
- MvtSize, Mem);
- else
- ST = CurDAG->getStore(Chain, dl, SDValue(Result, 1), Load, Mem);
-
- SDNode* Store = SelectStore(ST.getNode());
-
- const SDValue Froms[] = { SDValue(N, 0),
- SDValue(N, 1) };
- const SDValue Tos[] = { SDValue(Result, 0),
- SDValue(Store, 0) };
- ReplaceUses(Froms, Tos, 2);
- return Result;
- }
-
- if (IntNo == Intrinsic::hexagon_brev_ldd ||
- IntNo == Intrinsic::hexagon_brev_ldw ||
- IntNo == Intrinsic::hexagon_brev_ldh ||
- IntNo == Intrinsic::hexagon_brev_lduh ||
- IntNo == Intrinsic::hexagon_brev_ldb ||
- IntNo == Intrinsic::hexagon_brev_ldub) {
- SDLoc dl(N);
- SDValue Chain = N->getOperand(0);
- SDValue Base = N->getOperand(2);
- SDValue Load = N->getOperand(3);
- SDValue ModifierExpr = N->getOperand(4);
-
- // We need to add the rerurn type for the load. This intrinsic has
- // two return types, one for the load and one for the post-increment.
- std::vector<EVT> ResTys;
- unsigned opc;
- unsigned memsize, align;
- MVT MvtSize = MVT::i32;
-
- if (IntNo == Intrinsic::hexagon_brev_ldd) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i64);
- opc = Hexagon::L2_loadrd_pbr_pseudo;
- memsize = 8;
- align = 8;
- } else if (IntNo == Intrinsic::hexagon_brev_ldw) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadri_pbr_pseudo;
- memsize = 4;
- align = 4;
- } else if (IntNo == Intrinsic::hexagon_brev_ldh) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrh_pbr_pseudo;
- memsize = 2;
- align = 2;
- MvtSize = MVT::i16;
- } else if (IntNo == Intrinsic::hexagon_brev_lduh) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadruh_pbr_pseudo;
- memsize = 2;
- align = 2;
- MvtSize = MVT::i16;
- } else if (IntNo == Intrinsic::hexagon_brev_ldb) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrb_pbr_pseudo;
- memsize = 1;
- align = 1;
- MvtSize = MVT::i8;
- } else if (IntNo == Intrinsic::hexagon_brev_ldub) {
- ResTys.push_back(MVT::i32);
- ResTys.push_back(MVT::i32);
- opc = Hexagon::L2_loadrub_pbr_pseudo;
- memsize = 1;
- align = 1;
- MvtSize = MVT::i8;
- } else
- llvm_unreachable("no opc");
-
- ResTys.push_back(MVT::Other);
-
- // Copy over the arguments, which are the same mostly.
- SmallVector<SDValue, 4> Ops;
- Ops.push_back(Base);
- Ops.push_back(Load);
- Ops.push_back(ModifierExpr);
- Ops.push_back(Chain);
- SDNode* Result = CurDAG->getMachineNode(opc, dl, ResTys, Ops);
- SDValue ST;
- MachineMemOperand *Mem =
- MF->getMachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, memsize, align);
- if (MvtSize != MVT::i32)
- ST = CurDAG->getTruncStore(Chain, dl, SDValue(Result, 1), Load,
- MvtSize, Mem);
- else
- ST = CurDAG->getStore(Chain, dl, SDValue(Result, 1), Load, Mem);
-
- SDNode* Store = SelectStore(ST.getNode());
-
- const SDValue Froms[] = { SDValue(N, 0),
- SDValue(N, 1) };
- const SDValue Tos[] = { SDValue(Result, 0),
- SDValue(Store, 0) };
- ReplaceUses(Froms, Tos, 2);
- return Result;
+void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
+ if (MachineSDNode *L = LoadInstrForLoadIntrinsic(N)) {
+ StoreInstrForLoadIntrinsic(L, N);
+ CurDAG->RemoveDeadNode(N);
+ return;
}
-
- return SelectCode(N);
+ SelectCode(N);
}
-//
-// Checking for intrinsics which have predicate registers as operand(s)
-// and lowering to the actual intrinsic.
-//
-SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
+void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
unsigned Bits;
switch (IID) {
@@ -1045,42 +899,51 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
Bits = 16;
break;
default:
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
- SDValue const &V = N->getOperand(1);
+ SDValue V = N->getOperand(1);
SDValue U;
if (isValueExtension(V, Bits, U)) {
SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
- N->getOperand(0), U);
- return SelectCode(R.getNode());
+ N->getOperand(0), U);
+ ReplaceNode(N, R.getNode());
+ SelectCode(R.getNode());
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
//
// Map floating point constant values.
//
-SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
SDLoc dl(N);
ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
- APFloat APF = CN->getValueAPF();
+ const APFloat &APF = CN->getValueAPF();
if (N->getValueType(0) == MVT::f32) {
- return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32,
- CurDAG->getTargetConstantFP(APF.convertToFloat(), dl, MVT::f32));
+ ReplaceNode(
+ N, CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32,
+ CurDAG->getTargetConstantFP(
+ APF.convertToFloat(), dl, MVT::f32)));
+ return;
}
else if (N->getValueType(0) == MVT::f64) {
- return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64,
- CurDAG->getTargetConstantFP(APF.convertToDouble(), dl, MVT::f64));
+ ReplaceNode(
+ N, CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64,
+ CurDAG->getTargetConstantFP(
+ APF.convertToDouble(), dl, MVT::f64)));
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
//
// Map predicate true (encoded as -1 in LLVM) to an XOR.
//
-SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
+void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
SDLoc dl(N);
if (N->getValueType(0) == MVT::i1) {
SDNode* Result = 0;
@@ -1091,28 +954,30 @@ SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
Result = CurDAG->getMachineNode(Hexagon::TFR_PdFalse, dl, MVT::i1);
}
if (Result) {
- ReplaceUses(N, Result);
- return Result;
+ ReplaceNode(N, Result);
+ return;
}
}
- return SelectCode(N);
+ SelectCode(N);
}
//
// Map add followed by an asr -> asr +=.
//
-SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
+void HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
SDLoc dl(N);
if (N->getValueType(0) != MVT::i32) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
// Identify nodes of the form: add(asr(...)).
SDNode* Src1 = N->getOperand(0).getNode();
if (Src1->getOpcode() != ISD::SRA || !Src1->hasOneUse()
|| Src1->getValueType(0) != MVT::i32) {
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
// Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that
@@ -1121,9 +986,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
N->getOperand(1),
Src1->getOperand(0),
Src1->getOperand(1));
- ReplaceUses(N, Result);
-
- return Result;
+ ReplaceNode(N, Result);
}
//
@@ -1132,26 +995,32 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
// OR -> setbit
// XOR/FNEG -> toggle_bit.
//
-SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
+void HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
SDLoc dl(N);
EVT ValueVT = N->getValueType(0);
// We handle only 32 and 64-bit bit ops.
if (!(ValueVT == MVT::i32 || ValueVT == MVT::i64 ||
- ValueVT == MVT::f32 || ValueVT == MVT::f64))
- return SelectCode(N);
+ ValueVT == MVT::f32 || ValueVT == MVT::f64)) {
+ SelectCode(N);
+ return;
+ }
// We handle only fabs and fneg for V5.
unsigned Opc = N->getOpcode();
- if ((Opc == ISD::FABS || Opc == ISD::FNEG) && !HST->hasV5TOps())
- return SelectCode(N);
+ if ((Opc == ISD::FABS || Opc == ISD::FNEG) && !HST->hasV5TOps()) {
+ SelectCode(N);
+ return;
+ }
int64_t Val = 0;
if (Opc != ISD::FABS && Opc != ISD::FNEG) {
if (N->getOperand(1).getOpcode() == ISD::Constant)
Val = cast<ConstantSDNode>((N)->getOperand(1))->getSExtValue();
- else
- return SelectCode(N);
+ else {
+ SelectCode(N);
+ return;
+ }
}
if (Opc == ISD::AND) {
@@ -1159,8 +1028,10 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
if ((ValueVT == MVT::i32 && isPowerOf2_32(~Val)) ||
(ValueVT == MVT::i64 && isPowerOf2_64(~Val)))
Val = ~Val;
- else
- return SelectCode(N);
+ else {
+ SelectCode(N);
+ return;
+ }
}
// If OR or AND is being fed by shl, srl, or sra, don't do this change,
@@ -1173,7 +1044,8 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
- return SelectCode(N);
+ SelectCode(N);
+ return;
}
}
@@ -1181,8 +1053,10 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
unsigned BitPos = 0;
if (Opc != ISD::FABS && Opc != ISD::FNEG) {
if ((ValueVT == MVT::i32 && !isPowerOf2_32(Val)) ||
- (ValueVT == MVT::i64 && !isPowerOf2_64(Val)))
- return SelectCode(N);
+ (ValueVT == MVT::i64 && !isPowerOf2_64(Val))) {
+ SelectCode(N);
+ return;
+ }
// Get the bit position.
BitPos = countTrailingZeros(uint64_t(Val));
@@ -1259,12 +1133,11 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
}
}
- ReplaceUses(N, Result);
- return Result;
+ ReplaceNode(N, Result);
}
-SDNode *HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
+void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
MachineFrameInfo *MFI = MF->getFrameInfo();
const HexagonFrameLowering *HFI = HST->getFrameLowering();
int FX = cast<FrameIndexSDNode>(N)->getIndex();
@@ -1290,61 +1163,91 @@ SDNode *HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
R = CurDAG->getMachineNode(Hexagon::TFR_FIA, DL, MVT::i32, Ops);
}
- if (N->getHasDebugValue())
- CurDAG->TransferDbgValues(SDValue(N, 0), SDValue(R, 0));
- return R;
+ ReplaceNode(N, R);
}
-SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
+void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
+ EVT SVT = N->getOperand(0).getValueType();
+ EVT DVT = N->getValueType(0);
+ if (!SVT.isVector() || !DVT.isVector() ||
+ SVT.getVectorElementType() == MVT::i1 ||
+ DVT.getVectorElementType() == MVT::i1 ||
+ SVT.getSizeInBits() != DVT.getSizeInBits()) {
+ SelectCode(N);
+ return;
+ }
+
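+ // Same-size vector bitcasts (e.g. v16i32 <-> v64i8, both 512 bits) need no
+ // instruction: simply forward the source operand to all users.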
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
+ CurDAG->RemoveDeadNode(N);
+}
+
+
+void HexagonDAGToDAGISel::Select(SDNode *N) {
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
switch (N->getOpcode()) {
case ISD::Constant:
- return SelectConstant(N);
+ SelectConstant(N);
+ return;
case ISD::ConstantFP:
- return SelectConstantFP(N);
+ SelectConstantFP(N);
+ return;
case ISD::FrameIndex:
- return SelectFrameIndex(N);
+ SelectFrameIndex(N);
+ return;
case ISD::ADD:
- return SelectAdd(N);
+ SelectAdd(N);
+ return;
+
+ case ISD::BITCAST:
+ SelectBitcast(N);
+ return;
case ISD::SHL:
- return SelectSHL(N);
+ SelectSHL(N);
+ return;
case ISD::LOAD:
- return SelectLoad(N);
+ SelectLoad(N);
+ return;
case ISD::STORE:
- return SelectStore(N);
+ SelectStore(N);
+ return;
case ISD::MUL:
- return SelectMul(N);
+ SelectMul(N);
+ return;
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::FABS:
case ISD::FNEG:
- return SelectBitOp(N);
+ SelectBitOp(N);
+ return;
case ISD::ZERO_EXTEND:
- return SelectZeroExtend(N);
+ SelectZeroExtend(N);
+ return;
case ISD::INTRINSIC_W_CHAIN:
- return SelectIntrinsicWChain(N);
+ SelectIntrinsicWChain(N);
+ return;
case ISD::INTRINSIC_WO_CHAIN:
- return SelectIntrinsicWOChain(N);
+ SelectIntrinsicWOChain(N);
+ return;
}
- return SelectCode(N);
+ SelectCode(N);
}
bool HexagonDAGToDAGISel::
@@ -1380,7 +1283,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
// Simplify: (or (select c x 0) z) -> (select c (or x z) z)
// (or (select c 0 y) z) -> (select c z (or y z))
// This may not be the right thing for all targets, so do it here.
- for (auto I: Nodes) {
+ for (auto I : Nodes) {
if (I->getOpcode() != ISD::OR)
continue;
@@ -1392,7 +1295,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
auto IsSelect0 = [IsZero] (const SDValue &Op) -> bool {
if (Op.getOpcode() != ISD::SELECT)
return false;
- return IsZero(Op.getOperand(1)) || IsZero(Op.getOperand(2));
+ return IsZero(Op.getOperand(1)) || IsZero(Op.getOperand(2));
};
SDValue N0 = I->getOperand(0), N1 = I->getOperand(1);
@@ -1417,6 +1320,59 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
}
}
}
+
+ // Transform: (store ch addr (add x (add (shl y c) e)))
+ // to: (store ch addr (add x (shl (add y d) c))),
+ // where e = (shl d c) for some integer d.
+ // The purpose of this is to enable generation of loads/stores with
+ // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+ // value c must be 0, 1 or 2.
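+ // Worked example with c = 1 and e = 6 (so d = 3):
+ //   (add x (add (shl y 1) 6))  -->  (add x (shl (add y 3) 1)),
+ // which selection can then fold into mem(x + Rt<<#1) with Rt = y+3.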
+ for (auto I : Nodes) {
+ if (I->getOpcode() != ISD::STORE)
+ continue;
+
+ // I matched: (store ch addr Off)
+ SDValue Off = I->getOperand(2);
+ // Off needs to match: (add x (add (shl y c) (shl d c))))
+ if (Off.getOpcode() != ISD::ADD)
+ continue;
+ // Off matched: (add x T0)
+ SDValue T0 = Off.getOperand(1);
+ // T0 needs to match: (add T1 T2):
+ if (T0.getOpcode() != ISD::ADD)
+ continue;
+ // T0 matched: (add T1 T2)
+ SDValue T1 = T0.getOperand(0);
+ SDValue T2 = T0.getOperand(1);
+ // T1 needs to match: (shl y c)
+ if (T1.getOpcode() != ISD::SHL)
+ continue;
+ SDValue C = T1.getOperand(1);
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(C.getNode());
+ if (CN == nullptr)
+ continue;
+ unsigned CV = CN->getZExtValue();
+ if (CV > 2)
+ continue;
+ // T2 needs to match e, where e = (shl d c) for some d.
+ ConstantSDNode *EN = dyn_cast<ConstantSDNode>(T2.getNode());
+ if (EN == nullptr)
+ continue;
+ unsigned EV = EN->getZExtValue();
+ if (EV % (1 << CV) != 0)
+ continue;
+ unsigned DV = EV / (1 << CV);
+
+ // Replace T0 with: (shl (add y d) c)
+ SDLoc DL = SDLoc(I);
+ EVT VT = T0.getValueType();
+ SDValue D = DAG.getConstant(DV, DL, VT);
+ // NewAdd = (add y d)
+ SDValue NewAdd = DAG.getNode(ISD::ADD, DL, VT, T1.getOperand(0), D);
+ // NewShl = (shl NewAdd c)
+ SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
+ ReplaceNode(T0.getNode(), NewShl.getNode());
+ }
}
void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
@@ -1561,3 +1517,26 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
}
return false;
}
+
+
+bool HexagonDAGToDAGISel::orIsAdd(const SDNode *N) const {
+ assert(N->getOpcode() == ISD::OR);
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ assert(C);
+
+ // Detect when "or" is used to add an offset to a stack object.
+ if (auto *FN = dyn_cast<FrameIndexSDNode>(N->getOperand(0))) {
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ unsigned A = MFI->getObjectAlignment(FN->getIndex());
+ assert(isPowerOf2_32(A));
+ int32_t Off = C->getSExtValue();
+ // If the alleged offset fits in the zero bits guaranteed by
+ // the alignment, then this or is really an add.
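+ // For example, with an 8-byte aligned frame object (A = 8) and Off = 3,
+ // the low three bits of the address are known to be zero, so the "or"
+ // computes exactly base + 3.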
+ return (Off >= 0) && (((A-1) & Off) == unsigned(Off));
+ }
+ return false;
+}
+
+bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const {
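+ // E.g. a 64-byte HVX vector access is treated as aligned only when its
+ // alignment is at least 64; otherwise the unaligned V6_vL32Ub/V6_vS32Ub
+ // forms are chosen above.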
+ return N->getAlignment() >= N->getMemoryVT().getStoreSize();
+}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 01670902e2b0..cdd4c2f8617d 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -389,9 +389,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
bool UseHVX = HST.useHVXOps();
bool UseHVXDbl = HST.useHVXDblOps();
- if (LocVT == MVT::i1 ||
- LocVT == MVT::i8 ||
- LocVT == MVT::i16) {
+ if (LocVT == MVT::i1) {
+ // Return values of type MVT::i1 still need to be assigned to R0, but
+ // the value type needs to remain i1. LowerCallResult will deal with it,
+ // but it needs to recognize i1 as the value type.
+ LocVT = MVT::i32;
+ } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
LocVT = MVT::i32;
ValVT = MVT::i32;
if (ArgFlags.isSExt())
@@ -443,9 +446,14 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- if (unsigned Reg = State.AllocateReg(Hexagon::R0)) {
+ // Note that use of registers beyond R1 is not ABI compliant. However there
+ // are (experimental) IR passes which generate internal functions that
+ // return structs using these additional registers.
+ static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1,
+ Hexagon::R2, Hexagon::R3,
+ Hexagon::R4, Hexagon::R5};
+ if (unsigned Reg = State.AllocateReg(RegList)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
@@ -505,15 +513,13 @@ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
return false;
}
-void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) {
+void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType(ISD::LOAD, VT, PromotedLdStVT);
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType(ISD::STORE, VT, PromotedLdStVT);
}
}
@@ -528,10 +534,9 @@ const {
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter. Sometimes what we are copying is the end of a
/// larger object, the part that does not fit in registers.
-static SDValue
-CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
- ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -554,11 +559,11 @@ static bool IsHvxVectorType(MVT ty) {
// passed by value, the function prototype is modified to return void and
// the value is stored in memory pointed by a pointer passed by caller.
SDValue
-HexagonTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
@@ -608,16 +613,11 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
/// being lowered. Returns a SDNode with the same number of values as the
/// ISD::CALL.
-SDValue
-HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const
- SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- const SmallVectorImpl<SDValue> &OutVals,
- SDValue Callee) const {
-
+SDValue HexagonTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -628,11 +628,30 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
- Chain = DAG.getCopyFromReg(Chain, dl,
- RVLocs[i].getLocReg(),
- RVLocs[i].getValVT(), InFlag).getValue(1);
- InFlag = Chain.getValue(2);
- InVals.push_back(Chain.getValue(0));
+ SDValue RetVal;
+ if (RVLocs[i].getValVT() == MVT::i1) {
+ // Return values of type MVT::i1 require special handling. The reason
+ // is that MVT::i1 is associated with the PredRegs register class, but
+ // values of that type are still returned in R0. Generate an explicit
+ // copy into a predicate register from R0, and treat the value of the
+ // predicate register as the call result.
+ auto &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+ MVT::i32, InFlag);
+ // FR0 = (Value, Chain, Glue)
+ unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
+ FR0.getValue(0), FR0.getValue(2));
+ // TPR = (Chain, Glue)
+ RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1,
+ TPR.getValue(1));
+ } else {
+ RetVal = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+ RVLocs[i].getValVT(), InFlag);
+ }
+ InVals.push_back(RetVal.getValue(0));
+ Chain = RetVal.getValue(1);
+ InFlag = RetVal.getValue(2);
}
return Chain;
@@ -759,8 +778,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else {
MachinePointerInfo LocPI = MachinePointerInfo::getStack(
DAG.getMachineFunction(), LocMemOffset);
- SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI, false,
- false, 0);
+ SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI);
MemOpChains.push_back(S);
}
continue;
@@ -990,6 +1008,34 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// Need to transform ISD::PREFETCH into something that doesn't inherit
+// all of the properties of ISD::PREFETCH, specifically SDNPMayLoad and
+// SDNPMayStore.
+SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ // Lower it to DCFETCH($reg, #0). A "pat" will try to merge the offset in,
+ // if the "reg" is fed by an "add".
+ SDLoc DL(Op);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
+}
+
+SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ // Lower the hexagon_prefetch builtin to DCFETCH, as above.
+ if (IntNo == Intrinsic::hexagon_prefetch) {
+ SDValue Addr = Op.getOperand(2);
+ SDLoc DL(Op);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
+ }
+ return SDValue();
+}
+
SDValue
HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
@@ -1016,20 +1062,15 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue AC = DAG.getConstant(A, dl, MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC);
- if (Op.getNode()->getHasDebugValue())
- DAG.TransferDbgValues(Op, AA);
+
+ DAG.ReplaceAllUsesOfValueWith(Op, AA);
return AA;
}
-SDValue
-HexagonTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const
- SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
-const {
+SDValue HexagonTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1144,9 +1185,8 @@ const {
// location.
InVals.push_back(FIN);
} else {
- InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo(), false, false,
- false, 0));
+ InVals.push_back(
+ DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
}
@@ -1174,13 +1214,13 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr,
- Op.getOperand(1), MachinePointerInfo(SV), false,
- false, 0);
+ return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1),
+ MachinePointerInfo(SV));
}
// Creates a SPLAT instruction for a constant value VAL.
-static SDValue createSplat(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue Val) {
+static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+ SDValue Val) {
if (VT.getSimpleVT() == MVT::v4i8)
return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val);
@@ -1301,20 +1341,14 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Loads[4];
// Base load.
Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base,
- LoadNode->getPointerInfo(), MVT::i16,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- Alignment);
+ LoadNode->getPointerInfo(), MVT::i16, Alignment,
+ LoadNode->getMemOperand()->getFlags());
// Base+2 load.
SDValue Increment = DAG.getConstant(2, DL, MVT::i32);
Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- Alignment);
+ LoadNode->getPointerInfo(), MVT::i16, Alignment,
+ LoadNode->getMemOperand()->getFlags());
// SHL 16, then OR base and base+2.
SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32);
SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
@@ -1323,20 +1357,14 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Increment = DAG.getConstant(4, DL, MVT::i32);
Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- Alignment);
+ LoadNode->getPointerInfo(), MVT::i16, Alignment,
+ LoadNode->getMemOperand()->getFlags());
// Base + 6.
Increment = DAG.getConstant(6, DL, MVT::i32);
Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16,
- LoadNode->isVolatile(),
- LoadNode->isNonTemporal(),
- LoadNode->isInvariant(),
- Alignment);
+ LoadNode->getPointerInfo(), MVT::i16, Alignment,
+ LoadNode->getMemOperand()->getFlags());
// SHL 16, then OR base+4 and base+6.
Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]);
@@ -1349,8 +1377,8 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
} else {
// Perform default type expansion.
Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
- LoadNode->isVolatile(), LoadNode->isNonTemporal(),
- LoadNode->isInvariant(), LoadNode->getAlignment());
+ LoadNode->getAlignment(),
+ LoadNode->getMemOperand()->getFlags());
LoadChain = Result.getValue(1);
}
} else
@@ -1370,15 +1398,15 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op);
unsigned Align = CPN->getAlignment();
- Reloc::Model RM = HTM.getRelocationModel();
- unsigned char TF = (RM == Reloc::PIC_) ? HexagonII::MO_PCREL : 0;
+ bool IsPositionIndependent = isPositionIndependent();
+ unsigned char TF = IsPositionIndependent ? HexagonII::MO_PCREL : 0;
SDValue T;
if (CPN->isMachineConstantPoolEntry())
T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, TF);
else
T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, TF);
- if (RM == Reloc::PIC_)
+ if (IsPositionIndependent)
return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T);
return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T);
}
@@ -1387,8 +1415,7 @@ SDValue
HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
int Idx = cast<JumpTableSDNode>(Op)->getIndex();
- Reloc::Model RM = HTM.getRelocationModel();
- if (RM == Reloc::PIC_) {
+ if (isPositionIndependent()) {
SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL);
return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T);
}
@@ -1415,7 +1442,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
return DAG.getLoad(VT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
@@ -1436,8 +1463,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
HRI.getFrameRegister(), VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -1461,13 +1487,12 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
if (RM == Reloc::Static) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
- if (HLOF.IsGlobalInSmallSection(GV, HTM))
+ if (HLOF.isGlobalInSmallSection(GV, HTM))
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
}
- bool UsePCRel = GV->hasInternalLinkage() || GV->hasHiddenVisibility() ||
- (GV->hasLocalLinkage() && !isa<Function>(GV));
+ bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
if (UsePCRel) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
HexagonII::MO_PCREL);
@@ -1490,7 +1515,7 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
Reloc::Model RM = HTM.getRelocationModel();
if (RM == Reloc::Static) {
- SDValue A = DAG.getTargetBlockAddress(BA, PtrVT);
+ SDValue A = DAG.getTargetBlockAddress(BA, PtrVT);
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A);
}
@@ -1507,6 +1532,157 @@ HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym);
}
+SDValue
+HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+ GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDLoc dl(GA);
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(),
+ OperandFlags);
+ // Create operands for the call. The operands should have the following:
+ // 1. Chain SDValue
+ // 2. Callee, which in this case is the global address value.
+ // 3. Registers live into the call. In this case it is R0, as we
+ //    have just one argument to be passed.
+ // 4. InFlag, if there is any.
+ // Note: The order is important.
+
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA,
+ DAG.getRegister(Hexagon::R0, PtrVT), *InFlag };
+ Chain = DAG.getNode(HexagonISD::CALLv3, dl, NodeTys, Ops);
+ } else {
+ SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT)};
+ Chain = DAG.getNode(HexagonISD::CALLv3, dl, NodeTys, Ops);
+ }
+
+ // Inform MFI that the function has calls.
+ MFI->setAdjustsStack(true);
+
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
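The comment block above spells out the operand order GetDynamicTLSAddr must use when it builds the call node by hand: chain first, then the callee, then the live-in argument registers, then the optional glue value. A standalone sketch of that ordering rule with ordinary containers (the names below are illustrative, not SelectionDAG's):

// Standalone illustration (not LLVM code): assemble call-node operands in the
// required order -- chain, callee, live-in registers, optional glue last.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

std::vector<std::string>
buildCallOperands(std::string Chain, std::string Callee,
                  const std::vector<std::string> &LiveInRegs,
                  std::optional<std::string> Glue) {
  std::vector<std::string> Ops{std::move(Chain), std::move(Callee)};
  Ops.insert(Ops.end(), LiveInRegs.begin(), LiveInRegs.end()); // e.g. R0 for the TLS argument
  if (Glue)
    Ops.push_back(*Glue);                                      // glue, when present, goes last
  return Ops;
}

int main() {
  for (const auto &Op : buildCallOperands("chain", "tga", {"r0"}, std::nullopt))
    std::cout << Op << "\n";                                   // chain, tga, r0
}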
+//
+// Lower using the initial-exec model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ SDLoc dl(GA);
+ int64_t Offset = GA->getOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Get the thread pointer.
+ SDValue TP = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+
+ bool IsPositionIndependent = isPositionIndependent();
+ unsigned char TF =
+ IsPositionIndependent ? HexagonII::MO_IEGOT : HexagonII::MO_IE;
+
+ // First generate the TLS symbol address
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT,
+ Offset, TF);
+
+ SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+ if (IsPositionIndependent) {
+ // Generate the GOT pointer in case of position independent code
+ SDValue GOT = LowerGLOBAL_OFFSET_TABLE(Sym, DAG);
+
+ // Add the TLS symbol address to the GOT pointer. This gives a
+ // GOT-relative relocation for the symbol.
+ Sym = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, Sym);
+ }
+
+ // Load the offset value for the TLS symbol. This offset is relative to
+ // the thread pointer.
+ SDValue LoadOffset =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Sym, MachinePointerInfo());
+
+ // The address of the thread-local variable is the sum of the thread
+ // pointer and the variable's offset.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, TP, LoadOffset);
+}
+
+//
+// Lower using the local-exec model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ SDLoc dl(GA);
+ int64_t Offset = GA->getOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Get the thread pointer.
+ SDValue TP = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+ // Generate the TLS symbol address
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+ HexagonII::MO_TPREL);
+ SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+ // The address of the thread-local variable is the sum of the thread
+ // pointer and the variable's offset.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, TP, Sym);
+}
+
+//
+// Lower using the general-dynamic model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ SDLoc dl(GA);
+ int64_t Offset = GA->getOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // First generate the TLS symbol address
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+ HexagonII::MO_GDGOT);
+
+ // Then, generate the GOT pointer
+ SDValue GOT = LowerGLOBAL_OFFSET_TABLE(TGA, DAG);
+
+ // Add the TLS symbol and the GOT pointer
+ SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+ SDValue Chain = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, Sym);
+
+ // Copy over the argument to R0
+ SDValue InFlag;
+ Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
+ Hexagon::R0, HexagonII::MO_GDPLT);
+}
+
+//
+// Lower TLS addresses.
+//
+// For now, for dynamic models we only support the general-dynamic model.
+//
+SDValue
+HexagonTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ switch (HTM.getTLSModel(GA->getGlobal())) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic:
+ return LowerToTLSGeneralDynamicModel(GA, DAG);
+ case TLSModel::InitialExec:
+ return LowerToTLSInitialExecModel(GA, DAG);
+ case TLSModel::LocalExec:
+ return LowerToTLSLocalExecModel(GA, DAG);
+ }
+ llvm_unreachable("Bogus TLS model");
+}
+
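LowerGlobalTLSAddress above is a pure dispatch on the TLS model the target machine reports for the global, with LocalDynamic folded into the GeneralDynamic path as the comment notes. A minimal standalone sketch of that selection logic, using plain enums rather than the real LLVM types (the names below are illustrative only):

// Standalone illustration (not LLVM code): map each TLS model onto one of the
// three lowering strategies, treating local-dynamic as general-dynamic.
#include <cstdio>
#include <cstdlib>

enum class TLSModel { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };
enum class Lowering { GeneralDynamicCall, InitialExecLoad, LocalExecAdd };

Lowering selectTLSLowering(TLSModel M) {
  switch (M) {
  case TLSModel::GeneralDynamic:
  case TLSModel::LocalDynamic:        // only the general-dynamic path is supported
    return Lowering::GeneralDynamicCall;
  case TLSModel::InitialExec:
    return Lowering::InitialExecLoad; // thread pointer + offset loaded via IE/IEGOT
  case TLSModel::LocalExec:
    return Lowering::LocalExecAdd;    // thread pointer + TPREL offset, no load
  }
  std::abort();                       // mirrors llvm_unreachable("Bogus TLS model")
}

int main() {
  std::printf("%d\n", static_cast<int>(selectTLSLowering(TLSModel::LocalDynamic))); // 0
}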
//===----------------------------------------------------------------------===//
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
@@ -1524,9 +1700,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(4);
setPrefFunctionAlignment(4);
setMinFunctionAlignment(2);
- setInsertFencesForAtomic(false);
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+ setMaxAtomicSizeInBitsSupported(64);
+ setMinCmpXchgSizeInBits(32);
+
if (EnableHexSDNodeSched)
setSchedulingPreference(Sched::VLIW);
else
@@ -1606,8 +1784,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Custom legalize GlobalAddress nodes into CONST32.
@@ -1629,9 +1810,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
if (EmitJumpTables)
- setMinimumJumpTableEntries(2);
- else
setMinimumJumpTableEntries(MinimumJumpTables);
+ else
+ setMinimumJumpTableEntries(INT_MAX);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// Hexagon has instructions for add/sub with carry. The problem with
@@ -1668,10 +1849,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTTZ, MVT::i8, Promote);
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
// In V5, popcount can count # of 1s in i64 but returns i32.
// On V4 it will be expanded (set later).
@@ -1751,8 +1928,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
ISD::SMUL_LOHI, ISD::UMUL_LOHI,
// Logical/bit:
ISD::AND, ISD::OR, ISD::XOR, ISD::ROTL, ISD::ROTR,
- ISD::CTPOP, ISD::CTLZ, ISD::CTTZ, ISD::CTLZ_ZERO_UNDEF,
- ISD::CTTZ_ZERO_UNDEF,
+ ISD::CTPOP, ISD::CTLZ, ISD::CTTZ,
// Floating point arithmetic/math functions:
ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV,
ISD::FREM, ISD::FNEG, ISD::FABS, ISD::FSQRT, ISD::FSIN,
@@ -2095,7 +2271,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
EVT VT = Op.getValueType();
- if (V2.getOpcode() == ISD::UNDEF)
+ if (V2.isUndef())
V2 = V1;
if (SVN->isSplat()) {
@@ -2113,7 +2289,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
!isa<ConstantSDNode>(V1.getOperand(0))) {
bool IsScalarToVector = true;
for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
- if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+ if (!V1.getOperand(i).isUndef()) {
IsScalarToVector = false;
break;
}
@@ -2235,9 +2411,9 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue V0 = BVN->getOperand(0);
SDValue V1 = BVN->getOperand(1);
- if (V0.getOpcode() == ISD::UNDEF)
+ if (V0.isUndef())
V0 = DAG.getConstant(0, dl, MVT::i32);
- if (V1.getOpcode() == ISD::UNDEF)
+ if (V1.isUndef())
V1 = DAG.getConstant(0, dl, MVT::i32);
ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(V0);
@@ -2257,7 +2433,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Try to generate a S2_packhl to build v2i16 vectors.
if (VT.getSimpleVT() == MVT::v2i16) {
for (unsigned i = 0, e = NElts; i != e; ++i) {
- if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
+ if (BVN->getOperand(i).isUndef())
continue;
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(BVN->getOperand(i));
// If the element isn't a constant, it is in a register:
@@ -2285,7 +2461,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// combine, const64, etc. are Big Endian.
unsigned OpIdx = NElts - i - 1;
SDValue Operand = BVN->getOperand(OpIdx);
- if (Operand.getOpcode() == ISD::UNDEF)
+ if (Operand.isUndef())
continue;
int64_t Val = 0;
@@ -2559,8 +2735,7 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue StoreAddr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT),
DAG.getIntPtrConstant(4, dl));
- Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
- false, false, 0);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset);
// Not needed; we already use it as explicit input to EH_RETURN.
@@ -2596,6 +2771,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// Frame & Return address. Currently unimplemented.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
@@ -2608,7 +2784,9 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
+ case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
}
}
@@ -2622,18 +2800,17 @@ HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T);
}
-MachineBasicBlock *
-HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB)
- const {
- switch (MI->getOpcode()) {
- case Hexagon::ALLOCA: {
- MachineFunction *MF = BB->getParent();
- auto *FuncInfo = MF->getInfo<HexagonMachineFunctionInfo>();
- FuncInfo->addAllocaAdjustInst(MI);
- return BB;
- }
- default: llvm_unreachable("Unexpected instr type to insert");
+MachineBasicBlock *HexagonTargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ case Hexagon::ALLOCA: {
+ MachineFunction *MF = BB->getParent();
+ auto *FuncInfo = MF->getInfo<HexagonMachineFunctionInfo>();
+ FuncInfo->addAllocaAdjustInst(&MI);
+ return BB;
+ }
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
} // switch
}
@@ -2641,6 +2818,20 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Inline Assembly Support
//===----------------------------------------------------------------------===//
+TargetLowering::ConstraintType
+HexagonTargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'q':
+ case 'v':
+ if (Subtarget.useHVXOps())
+ return C_Register;
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
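getConstraintType claims the single-letter 'q' and 'v' constraints as register constraints only when HVX is available, and defers everything else to the generic TargetLowering handling. A small standalone sketch of that classify-or-defer shape (simplified types; not the LLVM API):

// Standalone illustration (not LLVM code): classify single-letter inline-asm
// constraints, claiming 'q' and 'v' as register constraints only when HVX is
// available, and otherwise deferring to a default handler.
#include <iostream>
#include <string>

enum class ConstraintType { Register, Other };

ConstraintType defaultConstraintType(const std::string &) {
  return ConstraintType::Other;       // stand-in for the target-independent handling
}

ConstraintType getConstraintType(const std::string &C, bool HasHVX) {
  if (C.size() == 1 && (C[0] == 'q' || C[0] == 'v') && HasHVX)
    return ConstraintType::Register;
  return defaultConstraintType(C);
}

int main() {
  std::cout << (getConstraintType("v", true) == ConstraintType::Register)
            << (getConstraintType("v", false) == ConstraintType::Register) << "\n"; // 10
}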
std::pair<unsigned, const TargetRegisterClass *>
HexagonTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
@@ -2814,6 +3005,32 @@ bool llvm::isPositiveHalfWord(SDNode *N) {
}
}
+bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AS, unsigned Align, bool *Fast) const {
+ if (Fast)
+ *Fast = false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::v64i8:
+ case MVT::v128i8:
+ case MVT::v256i8:
+ case MVT::v32i16:
+ case MVT::v64i16:
+ case MVT::v128i16:
+ case MVT::v16i32:
+ case MVT::v32i32:
+ case MVT::v64i32:
+ case MVT::v8i64:
+ case MVT::v16i64:
+ case MVT::v32i64:
+ return true;
+ }
+ return false;
+}
+
+
std::pair<const TargetRegisterClass*, uint8_t>
HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
@@ -2892,3 +3109,10 @@ bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// Do not expand loads and stores that don't exceed 64 bits.
return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64;
}
+
+bool HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR(
+ AtomicCmpXchgInst *AI) const {
+ const DataLayout &DL = AI->getModule()->getDataLayout();
+ unsigned Size = DL.getTypeStoreSize(AI->getCompareOperand()->getType());
+ return Size >= 4 && Size <= 8;
+}
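The new shouldExpandAtomicCmpXchgInIR hook asks the AtomicExpand pass to rewrite cmpxchg only when the compared value is 4 to 8 bytes wide, which lines up with the setMinCmpXchgSizeInBits(32) and setMaxAtomicSizeInBitsSupported(64) calls added to the constructor. A standalone sketch of the same size test:

// Standalone illustration (not LLVM code): expand a cmpxchg through the IR
// pass only when the compared value is between 4 and 8 bytes, matching the
// 32-bit minimum and 64-bit maximum atomic widths set in the constructor.
#include <iostream>

bool shouldExpandCmpXchg(unsigned StoreSizeInBytes) {
  return StoreSizeInBytes >= 4 && StoreSizeInBytes <= 8;
}

int main() {
  std::cout << shouldExpandCmpXchg(2) << shouldExpandCmpXchg(4)
            << shouldExpandCmpXchg(8) << shouldExpandCmpXchg(16) << "\n"; // 0110
}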
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index bf378b922220..71f67349befe 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -94,7 +94,7 @@ bool isPositiveHalfWord(SDNode *N);
bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize)
const;
- void promoteLdStType(EVT VT, EVT PromotedLdStVT);
+ void promoteLdStType(MVT VT, MVT PromotedLdStVT);
const HexagonTargetMachine &HTM;
const HexagonSubtarget &Subtarget;
@@ -128,22 +128,37 @@ bool isPositiveHalfWord(SDNode *N);
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+ GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+ unsigned ReturnReg, unsigned char OperandFlags) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const;
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDValue Callee) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -153,14 +168,15 @@ bool isPositiveHalfWord(SDNode *N);
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
- SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
- MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const override;
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -192,6 +208,8 @@ bool isPositiveHalfWord(SDNode *N);
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
@@ -200,13 +218,12 @@ bool isPositiveHalfWord(SDNode *N);
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
- else if (ConstraintCode == "v")
- return InlineAsm::Constraint_v;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
// Intrinsics
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// The type may be VoidTy, in which case only return true if the addressing
@@ -226,6 +243,9 @@ bool isPositiveHalfWord(SDNode *N);
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ unsigned Align, bool *Fast) const override;
+
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
const override;
@@ -237,6 +257,8 @@ bool isPositiveHalfWord(SDNode *N);
Value *Addr, AtomicOrdering Ord) const override;
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
return AtomicExpansionKind::LLSC;
diff --git a/lib/Target/Hexagon/HexagonInstrAlias.td b/lib/Target/Hexagon/HexagonInstrAlias.td
index 5a1a69b40d4d..9cbeae7c67c8 100644
--- a/lib/Target/Hexagon/HexagonInstrAlias.td
+++ b/lib/Target/Hexagon/HexagonInstrAlias.td
@@ -460,3 +460,195 @@ def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
(C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
+def : InstAlias<"if (!$Pu) jumpr $Rs",
+ (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
+def : InstAlias<"if ($Pu) jumpr $Rs",
+ (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
+def : InstAlias<"if (!$Pu) jump $r15_2",
+ (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
+ Requires<[HasV60T]>;
+
+// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
+def : InstAlias<"if ($Pu) jump $r15_2",
+ (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($src) jump $r15_2",
+ (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if (!$src) jump $r15_2",
+ (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if ($src1) jumpr $src2",
+ (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
+
+def : InstAlias<"if (!$src1) jumpr $src2",
+ (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
+
+// V6_vassignp: Vector assign mapping.
+let hasNewValue = 1, opNewValue = 0, isAsmParserOnly = 1 in
+def HEXAGON_V6_vassignpair: CVI_VA_DV_Resource <
+ (outs VecDblRegs:$Vdd),
+ (ins VecDblRegs:$Vss),
+ "$Vdd = $Vss">;
+
+// maps Vd = #0 to Vd = vxor(Vd, Vd)
+def : InstAlias<"$Vd = #0",
+ (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
+ Requires<[HasV60T]>;
+
+// maps Vdd = #0 to Vdd = vsub(Vdd, Vdd)
+def : InstAlias<"$Vdd = #0",
+ (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
+ (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
+ (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
+ (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+ Requires<[HasV60T]>;
+
+// maps "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
+def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
+ (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
+ Requires<[HasV60T]>;
+
+// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
+def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
+ (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs)",
+ (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs):nt",
+ (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt",
+ (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt.new",
+ (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt.new",
+ (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
+ (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
+ (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
+ (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
+ (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
+ (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmemu($Rs)",
+ (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"vmemu($Rs)=$Vt",
+ (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
+ (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
+ (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+ Requires<[HasV60T]>;
+
+
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 3c5ec1701dc2..0bfb04447f2f 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -342,6 +342,10 @@ class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
+class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
+
// JR Instruction Class in V2/V3/V4.
// Definition of the instruction class NOT CHANGED.
class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -412,21 +416,11 @@ class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
: STInst<outs, ins, asmstr, pattern, cstr>;
-let mayStore = 1 in
-class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : STInst<outs, ins, asmstr, pattern, cstr>;
-
// Post increment LD Instruction.
class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
: LDInst<outs, ins, asmstr, pattern, cstr>;
-let mayLoad = 1 in
-class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
-
//===----------------------------------------------------------------------===//
// V4 Instruction Format Definitions +
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 2d1dea526eed..e17f71fe4e6a 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -139,7 +139,6 @@ class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
: MEMInst<outs, ins, asmstr, pattern, cstr, itin>;
-let isCodeGenOnly = 1 in
class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
: InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
TypePREFIX>, OpcodeHexagon;
@@ -151,5 +150,11 @@ class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "">
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
+ OpcodeHexagon;
+
+class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "">
: InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
OpcodeHexagon;
+
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index eb3590cb1076..fe9f97d1d5e7 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -69,10 +70,10 @@ static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large",
///
/// Constants for Hexagon instructions.
///
-const int Hexagon_MEMV_OFFSET_MAX_128B = 2047; // #s7
-const int Hexagon_MEMV_OFFSET_MIN_128B = -2048; // #s7
-const int Hexagon_MEMV_OFFSET_MAX = 1023; // #s6
-const int Hexagon_MEMV_OFFSET_MIN = -1024; // #s6
+const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128
+const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4
+const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64
+const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4
const int Hexagon_MEMW_OFFSET_MAX = 4095;
const int Hexagon_MEMW_OFFSET_MIN = -4096;
const int Hexagon_MEMD_OFFSET_MAX = 8191;
@@ -91,10 +92,10 @@ const int Hexagon_MEMH_AUTOINC_MAX = 14;
const int Hexagon_MEMH_AUTOINC_MIN = -16;
const int Hexagon_MEMB_AUTOINC_MAX = 7;
const int Hexagon_MEMB_AUTOINC_MIN = -8;
-const int Hexagon_MEMV_AUTOINC_MAX = 192;
-const int Hexagon_MEMV_AUTOINC_MIN = -256;
-const int Hexagon_MEMV_AUTOINC_MAX_128B = 384;
-const int Hexagon_MEMV_AUTOINC_MIN_128B = -512;
+const int Hexagon_MEMV_AUTOINC_MAX = 192; // #s3
+const int Hexagon_MEMV_AUTOINC_MIN = -256; // #s3
+const int Hexagon_MEMV_AUTOINC_MAX_128B = 384; // #s3
+const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // #s3
// Pin the vtable to this file.
void HexagonInstrInfo::anchor() {}
@@ -230,22 +231,64 @@ static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
-unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case Hexagon::L2_loadri_io:
- case Hexagon::L2_loadrd_io:
- case Hexagon::L2_loadrh_io:
+ switch (MI.getOpcode()) {
+ default:
+ break;
case Hexagon::L2_loadrb_io:
case Hexagon::L2_loadrub_io:
- if (MI->getOperand(2).isFI() &&
- MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
- FrameIndex = MI->getOperand(2).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadruh_io:
+ case Hexagon::L2_loadri_io:
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::V6_vL32Ub_ai_128B:
+ case Hexagon::LDriw_pred:
+ case Hexagon::LDriw_mod:
+ case Hexagon::LDriq_pred_V6:
+ case Hexagon::LDriq_pred_vec_V6:
+ case Hexagon::LDriv_pseudo_V6:
+ case Hexagon::LDrivv_pseudo_V6:
+ case Hexagon::LDriq_pred_V6_128B:
+ case Hexagon::LDriq_pred_vec_V6_128B:
+ case Hexagon::LDriv_pseudo_V6_128B:
+ case Hexagon::LDrivv_pseudo_V6_128B: {
+ const MachineOperand &OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
+
+ case Hexagon::L2_ploadrbt_io:
+ case Hexagon::L2_ploadrbf_io:
+ case Hexagon::L2_ploadrubt_io:
+ case Hexagon::L2_ploadrubf_io:
+ case Hexagon::L2_ploadrht_io:
+ case Hexagon::L2_ploadrhf_io:
+ case Hexagon::L2_ploadruht_io:
+ case Hexagon::L2_ploadruhf_io:
+ case Hexagon::L2_ploadrit_io:
+ case Hexagon::L2_ploadrif_io:
+ case Hexagon::L2_ploadrdt_io:
+ case Hexagon::L2_ploadrdf_io: {
+ const MachineOperand &OpFI = MI.getOperand(2);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(3);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(0).getReg();
+ }
}
+
return 0;
}
@@ -255,21 +298,58 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
-unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case Hexagon::S2_storerb_io:
+ case Hexagon::S2_storerh_io:
case Hexagon::S2_storeri_io:
case Hexagon::S2_storerd_io:
- case Hexagon::S2_storerh_io:
- case Hexagon::S2_storerb_io:
- if (MI->getOperand(2).isFI() &&
- MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
- FrameIndex = MI->getOperand(0).getIndex();
- return MI->getOperand(2).getReg();
- }
- break;
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::V6_vS32Ub_ai_128B:
+ case Hexagon::STriw_pred:
+ case Hexagon::STriw_mod:
+ case Hexagon::STriq_pred_V6:
+ case Hexagon::STriq_pred_vec_V6:
+ case Hexagon::STriv_pseudo_V6:
+ case Hexagon::STrivv_pseudo_V6:
+ case Hexagon::STriq_pred_V6_128B:
+ case Hexagon::STriq_pred_vec_V6_128B:
+ case Hexagon::STriv_pseudo_V6_128B:
+ case Hexagon::STrivv_pseudo_V6_128B: {
+ const MachineOperand &OpFI = MI.getOperand(0);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(1);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(2).getReg();
}
+
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io: {
+ const MachineOperand &OpFI = MI.getOperand(1);
+ if (!OpFI.isFI())
+ return 0;
+ const MachineOperand &OpOff = MI.getOperand(2);
+ if (!OpOff.isImm() || OpOff.getImm() != 0)
+ return 0;
+ FrameIndex = OpFI.getIndex();
+ return MI.getOperand(3).getReg();
+ }
+ }
+
return 0;
}
@@ -290,7 +370,7 @@ unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
/// Cond[1] = R
/// Cond[2] = Imm
///
-bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -344,7 +424,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
--I;
}
- if (!isUnpredicatedTerminator(&*I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
@@ -352,7 +432,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
MachineInstr *SecondLastInst = nullptr;
// Find one more terminator if present.
for (;;) {
- if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
if (!SecondLastInst)
SecondLastInst = &*I;
else
@@ -377,6 +457,9 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
bool LastOpcodeHasJMP_c = PredOpcodeHasJMP_c(LastOpcode);
bool LastOpcodeHasNVJump = isNewValueJump(LastInst);
+ if (LastOpcodeHasJMP_c && !LastInst->getOperand(1).isMBB())
+ return true;
+
// If there is only one terminator instruction, process it.
if (LastInst && !SecondLastInst) {
if (LastOpcode == Hexagon::J2_jump) {
@@ -412,6 +495,8 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode);
bool SecLastOpcodeHasNVJump = isNewValueJump(SecondLastInst);
if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::J2_jump)) {
+ if (!SecondLastInst->getOperand(1).isMBB())
+ return true;
TBB = SecondLastInst->getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
Cond.push_back(SecondLastInst->getOperand(0));
@@ -476,10 +561,11 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return Count;
}
-
unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
unsigned BOpc = Hexagon::J2_jump;
unsigned BccOpc = Hexagon::J2_jumpt;
assert(validateBranchCond(Cond) && "Invalid branching condition");
@@ -499,9 +585,9 @@ unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,
// into an infinite loop.
MachineBasicBlock *NewTBB, *NewFBB;
SmallVector<MachineOperand, 4> Cond;
- MachineInstr *Term = MBB.getFirstTerminator();
- if (Term != MBB.end() && isPredicated(Term) &&
- !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) {
+ auto Term = MBB.getFirstTerminator();
+ if (Term != MBB.end() && isPredicated(*Term) &&
+ !analyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) {
MachineBasicBlock *NextBB = &*++MBB.getIterator();
if (NewTBB == NextBB) {
ReverseBranchCondition(Cond);
@@ -592,85 +678,84 @@ bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
return NumInstrs <= 4;
}
-
void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
auto &HRI = getRegisterInfo();
+ unsigned KillFlag = getKillRegState(KillSrc);
+
if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfrp), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(Hexagon::A2_tfrp), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
// Map Pd = Ps to Pd = or(Ps, Ps).
- BuildMI(MBB, I, DL, get(Hexagon::C2_or),
- DestReg).addReg(SrcReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(Hexagon::C2_or), DestReg)
+ .addReg(SrcReg).addReg(SrcReg, KillFlag);
return;
}
- if (Hexagon::DoubleRegsRegClass.contains(DestReg) &&
+ if (Hexagon::CtrRegsRegClass.contains(DestReg) &&
Hexagon::IntRegsRegClass.contains(SrcReg)) {
- // We can have an overlap between single and double reg: r1:0 = r0.
- if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) {
- // r1:0 = r0
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfrsi), (RI.getSubReg(DestReg,
- Hexagon::subreg_hireg))).addImm(0);
- } else {
- // r1:0 = r1 or no overlap.
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), (RI.getSubReg(DestReg,
- Hexagon::subreg_loreg))).addReg(SrcReg);
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfrsi), (RI.getSubReg(DestReg,
- Hexagon::subreg_hireg))).addImm(0);
- }
+ BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
- if (Hexagon::CtrRegsRegClass.contains(DestReg) &&
+ if (Hexagon::IntRegsRegClass.contains(DestReg) &&
+ Hexagon::CtrRegsRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::A2_tfrcrr), DestReg)
+ .addReg(SrcReg, KillFlag);
+ return;
+ }
+ if (Hexagon::ModRegsRegClass.contains(DestReg) &&
Hexagon::IntRegsRegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
Hexagon::IntRegsRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg).
- addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
Hexagon::PredRegsRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::C2_tfrrp), DestReg).
- addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::C2_tfrrp), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
Hexagon::IntRegsRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg).
- addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) {
BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg).
- addReg(SrcReg, getKillRegState(KillSrc));
+ addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg).
- addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg),
- getKillRegState(KillSrc)).
- addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg),
- getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg)
+ .addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), KillFlag)
+ .addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), KillFlag);
return;
}
if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg).
- addReg(SrcReg).
- addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, KillFlag);
return;
}
if (Hexagon::VecPredRegsRegClass.contains(SrcReg) &&
- Hexagon::VectorRegsRegClass.contains(DestReg)) {
+ Hexagon::VectorRegsRegClass.contains(DestReg)) {
llvm_unreachable("Unimplemented pred to vec");
return;
}
@@ -680,14 +765,12 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) {
- BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and),
- HRI.getSubReg(DestReg, Hexagon::subreg_hireg)).
- addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg),
- getKillRegState(KillSrc));
- BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and),
- HRI.getSubReg(DestReg, Hexagon::subreg_loreg)).
- addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg),
- getKillRegState(KillSrc));
+ unsigned DstHi = HRI.getSubReg(DestReg, Hexagon::subreg_hireg);
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DstHi)
+ .addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), KillFlag);
+ unsigned DstLo = HRI.getSubReg(DestReg, Hexagon::subreg_loreg);
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DstLo)
+ .addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), KillFlag);
return;
}
@@ -708,6 +791,7 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
+ unsigned KillFlag = getKillRegState(isKill);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
@@ -715,25 +799,57 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io))
- .addFrameIndex(FI).addImm(0)
- .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::S2_storerd_io))
- .addFrameIndex(FI).addImm(0)
- .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::STriw_pred))
- .addFrameIndex(FI).addImm(0)
- .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::STriw_mod))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::STriq_pred_V6_128B))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::STriq_pred_V6))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating 128B vector spill");
+ BuildMI(MBB, I, DL, get(Hexagon::STriv_pseudo_V6_128B))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating vector spill");
+ BuildMI(MBB, I, DL, get(Hexagon::STriv_pseudo_V6))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating double vector spill");
+ BuildMI(MBB, I, DL, get(Hexagon::STrivv_pseudo_V6))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+ } else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating 128B double vector spill");
+ BuildMI(MBB, I, DL, get(Hexagon::STrivv_pseudo_V6_128B))
+ .addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else {
llvm_unreachable("Unimplemented");
}
}
-
-void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, unsigned DestReg, int FI,
- const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
+void HexagonInstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg,
+ int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
@@ -742,15 +858,41 @@ void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), Align);
- if (RC == &Hexagon::IntRegsRegClass) {
+
+ if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- } else if (RC == &Hexagon::DoubleRegsRegClass) {
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::L2_loadrd_io), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- } else if (RC == &Hexagon::PredRegsRegClass) {
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::LDriw_mod), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::LDriq_pred_V6_128B), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Hexagon::LDriq_pred_V6), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating 128B double vector restore");
+ BuildMI(MBB, I, DL, get(Hexagon::LDrivv_pseudo_V6_128B), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating 128B vector restore");
+ BuildMI(MBB, I, DL, get(Hexagon::LDriv_pseudo_V6_128B), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating vector restore");
+ BuildMI(MBB, I, DL, get(Hexagon::LDriv_pseudo_V6), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ } else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+ DEBUG(dbgs() << "++Generating double vector restore");
+ BuildMI(MBB, I, DL, get(Hexagon::LDrivv_pseudo_V6), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else {
llvm_unreachable("Can't store this register to stack slot");
}
@@ -763,48 +905,58 @@ void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
/// into real instructions. The target can edit MI in place, or it can insert
/// new instructions and erase MI. The function should return true if
/// anything was changed.
-bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
- const {
+bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const HexagonRegisterInfo &HRI = getRegisterInfo();
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Opc = MI->getOpcode();
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Opc = MI.getOpcode();
const unsigned VecOffset = 1;
bool Is128B = false;
switch (Opc) {
+ case TargetOpcode::COPY: {
+ MachineOperand &MD = MI.getOperand(0);
+ MachineOperand &MS = MI.getOperand(1);
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ if (MD.getReg() != MS.getReg() && !MS.isUndef()) {
+ copyPhysReg(MBB, MI, DL, MD.getReg(), MS.getReg(), MS.isKill());
+ std::prev(MBBI)->copyImplicitOps(*MBB.getParent(), MI);
+ }
+ MBB.erase(MBBI);
+ return true;
+ }
case Hexagon::ALIGNA:
- BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI->getOperand(0).getReg())
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI.getOperand(0).getReg())
.addReg(HRI.getFrameRegister())
- .addImm(-MI->getOperand(1).getImm());
+ .addImm(-MI.getOperand(1).getImm());
MBB.erase(MI);
return true;
case Hexagon::HEXAGON_V6_vassignp_128B:
case Hexagon::HEXAGON_V6_vassignp: {
- unsigned SrcReg = MI->getOperand(1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
if (SrcReg != DstReg)
- copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI->getOperand(1).isKill());
+ copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI.getOperand(1).isKill());
MBB.erase(MI);
return true;
}
case Hexagon::HEXAGON_V6_lo_128B:
case Hexagon::HEXAGON_V6_lo: {
- unsigned SrcReg = MI->getOperand(1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg);
- copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI->getOperand(1).isKill());
+ copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill());
MBB.erase(MI);
MRI.clearKillFlags(SrcSubLo);
return true;
}
case Hexagon::HEXAGON_V6_hi_128B:
case Hexagon::HEXAGON_V6_hi: {
- unsigned SrcReg = MI->getOperand(1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg);
- copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI->getOperand(1).isKill());
+ copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill());
MBB.erase(MI);
MRI.clearKillFlags(SrcSubHi);
return true;
@@ -812,24 +964,25 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
case Hexagon::STrivv_indexed_128B:
Is128B = true;
case Hexagon::STrivv_indexed: {
- unsigned SrcReg = MI->getOperand(2).getReg();
+ unsigned SrcReg = MI.getOperand(2).getReg();
unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg);
unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg);
unsigned NewOpcd = Is128B ? Hexagon::V6_vS32b_ai_128B
: Hexagon::V6_vS32b_ai;
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
- MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpcd))
- .addOperand(MI->getOperand(0))
- .addImm(MI->getOperand(1).getImm())
- .addReg(SrcSubLo)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MachineInstr *MI1New =
+ BuildMI(MBB, MI, DL, get(NewOpcd))
+ .addOperand(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(SrcSubLo)
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI1New->getOperand(0).setIsKill(false);
BuildMI(MBB, MI, DL, get(NewOpcd))
- .addOperand(MI->getOperand(0))
- // The Vectors are indexed in multiples of vector size.
- .addImm(MI->getOperand(1).getImm()+Offset)
- .addReg(SrcSubHi)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addOperand(MI.getOperand(0))
+ // The Vectors are indexed in multiples of vector size.
+ .addImm(MI.getOperand(1).getImm() + Offset)
+ .addReg(SrcSubHi)
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MI);
return true;
}
@@ -840,35 +993,34 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
case Hexagon::LDrivv_indexed: {
unsigned NewOpcd = Is128B ? Hexagon::V6_vL32b_ai_128B
: Hexagon::V6_vL32b_ai;
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
MachineInstr *MI1New =
BuildMI(MBB, MI, DL, get(NewOpcd),
HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
- .addOperand(MI->getOperand(1))
- .addImm(MI->getOperand(2).getImm());
+ .addOperand(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm());
MI1New->getOperand(1).setIsKill(false);
BuildMI(MBB, MI, DL, get(NewOpcd),
HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
- .addOperand(MI->getOperand(1))
+ .addOperand(MI.getOperand(1))
// The Vectors are indexed in multiples of vector size.
- .addImm(MI->getOperand(2).getImm() + Offset)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addImm(MI.getOperand(2).getImm() + Offset)
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MI);
return true;
}
case Hexagon::LDriv_pseudo_V6_128B:
Is128B = true;
case Hexagon::LDriv_pseudo_V6: {
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
unsigned NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B
: Hexagon::V6_vL32b_ai;
- int32_t Off = MI->getOperand(2).getImm();
- int32_t Idx = Off;
+ int32_t Off = MI.getOperand(2).getImm();
BuildMI(MBB, MI, DL, get(NewOpc), DstReg)
- .addOperand(MI->getOperand(1))
- .addImm(Idx)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addOperand(MI.getOperand(1))
+ .addImm(Off)
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MI);
return true;
}
@@ -877,18 +1029,17 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
case Hexagon::STriv_pseudo_V6: {
unsigned NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B
: Hexagon::V6_vS32b_ai;
- int32_t Off = MI->getOperand(1).getImm();
- int32_t Idx = Is128B ? (Off >> 7) : (Off >> 6);
+ int32_t Off = MI.getOperand(1).getImm();
BuildMI(MBB, MI, DL, get(NewOpc))
- .addOperand(MI->getOperand(0))
- .addImm(Idx)
- .addOperand(MI->getOperand(2))
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addOperand(MI.getOperand(0))
+ .addImm(Off)
+ .addOperand(MI.getOperand(2))
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MI);
return true;
}
case Hexagon::TFR_PdTrue: {
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
@@ -896,7 +1047,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
return true;
}
case Hexagon::TFR_PdFalse: {
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
@@ -905,18 +1056,20 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
}
case Hexagon::VMULW: {
// Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned Src1Reg = MI->getOperand(1).getReg();
- unsigned Src2Reg = MI->getOperand(2).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned Src1Reg = MI.getOperand(1).getReg();
+ unsigned Src2Reg = MI.getOperand(2).getReg();
unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
- HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
+ .addReg(Src1SubHi)
.addReg(Src2SubHi);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
- HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
+ .addReg(Src1SubLo)
.addReg(Src2SubLo);
MBB.erase(MI);
MRI.clearKillFlags(Src1SubHi);
@@ -927,22 +1080,26 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
}
case Hexagon::VMULW_ACC: {
// Expand 64-bit vector multiply with addition into 2 scalar multiplies.
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned Src1Reg = MI->getOperand(1).getReg();
- unsigned Src2Reg = MI->getOperand(2).getReg();
- unsigned Src3Reg = MI->getOperand(3).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned Src1Reg = MI.getOperand(1).getReg();
+ unsigned Src2Reg = MI.getOperand(2).getReg();
+ unsigned Src3Reg = MI.getOperand(3).getReg();
unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::subreg_hireg);
unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::subreg_loreg);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
- HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
- .addReg(Src2SubHi).addReg(Src3SubHi);
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
- HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
- .addReg(Src2SubLo).addReg(Src3SubLo);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
+ .addReg(Src1SubHi)
+ .addReg(Src2SubHi)
+ .addReg(Src3SubHi);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
+ .addReg(Src1SubLo)
+ .addReg(Src2SubLo)
+ .addReg(Src3SubLo);
MBB.erase(MI);
MRI.clearKillFlags(Src1SubHi);
MRI.clearKillFlags(Src1SubLo);
@@ -952,16 +1109,58 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
MRI.clearKillFlags(Src3SubLo);
return true;
}
+ case Hexagon::Insert4: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned Src1Reg = MI.getOperand(1).getReg();
+ unsigned Src2Reg = MI.getOperand(2).getReg();
+ unsigned Src3Reg = MI.getOperand(3).getReg();
+ unsigned Src4Reg = MI.getOperand(4).getReg();
+ unsigned Src1RegIsKill = getKillRegState(MI.getOperand(1).isKill());
+ unsigned Src2RegIsKill = getKillRegState(MI.getOperand(2).isKill());
+ unsigned Src3RegIsKill = getKillRegState(MI.getOperand(3).isKill());
+ unsigned Src4RegIsKill = getKillRegState(MI.getOperand(4).isKill());
+ unsigned DstSubHi = HRI.getSubReg(DstReg, Hexagon::subreg_hireg);
+ unsigned DstSubLo = HRI.getSubReg(DstReg, Hexagon::subreg_loreg);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::S2_insert),
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
+ .addReg(DstSubLo)
+ .addReg(Src1Reg, Src1RegIsKill)
+ .addImm(16)
+ .addImm(0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::S2_insert),
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
+ .addReg(DstSubLo)
+ .addReg(Src2Reg, Src2RegIsKill)
+ .addImm(16)
+ .addImm(16);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::S2_insert),
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
+ .addReg(DstSubHi)
+ .addReg(Src3Reg, Src3RegIsKill)
+ .addImm(16)
+ .addImm(0);
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::S2_insert),
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
+ .addReg(DstSubHi)
+ .addReg(Src4Reg, Src4RegIsKill)
+ .addImm(16)
+ .addImm(16);
+ MBB.erase(MI);
+ MRI.clearKillFlags(DstReg);
+ MRI.clearKillFlags(DstSubHi);
+ MRI.clearKillFlags(DstSubLo);
+ return true;
+ }
case Hexagon::MUX64_rr: {
- const MachineOperand &Op0 = MI->getOperand(0);
- const MachineOperand &Op1 = MI->getOperand(1);
- const MachineOperand &Op2 = MI->getOperand(2);
- const MachineOperand &Op3 = MI->getOperand(3);
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
+ const MachineOperand &Op2 = MI.getOperand(2);
+ const MachineOperand &Op3 = MI.getOperand(3);
unsigned Rd = Op0.getReg();
unsigned Pu = Op1.getReg();
unsigned Rs = Op2.getReg();
unsigned Rt = Op3.getReg();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
unsigned K1 = getKillRegState(Op1.isKill());
unsigned K2 = getKillRegState(Op2.isKill());
unsigned K3 = getKillRegState(Op3.isKill());
@@ -976,24 +1175,62 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
MBB.erase(MI);
return true;
}
+ case Hexagon::VSelectPseudo_V6: {
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
+ const MachineOperand &Op2 = MI.getOperand(2);
+ const MachineOperand &Op3 = MI.getOperand(3);
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
+ .addOperand(Op0)
+ .addOperand(Op1)
+ .addOperand(Op2);
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
+ .addOperand(Op0)
+ .addOperand(Op1)
+ .addOperand(Op3);
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::VSelectDblPseudo_V6: {
+ MachineOperand &Op0 = MI.getOperand(0);
+ MachineOperand &Op1 = MI.getOperand(1);
+ MachineOperand &Op2 = MI.getOperand(2);
+ MachineOperand &Op3 = MI.getOperand(3);
+ unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::subreg_loreg);
+ unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::subreg_hireg);
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
+ .addOperand(Op0)
+ .addOperand(Op1)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
+ SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::subreg_loreg);
+ SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::subreg_hireg);
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
+ .addOperand(Op0)
+ .addOperand(Op1)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
+ MBB.erase(MI);
+ return true;
+ }
case Hexagon::TCRETURNi:
- MI->setDesc(get(Hexagon::J2_jump));
+ MI.setDesc(get(Hexagon::J2_jump));
return true;
case Hexagon::TCRETURNr:
- MI->setDesc(get(Hexagon::J2_jumpr));
+ MI.setDesc(get(Hexagon::J2_jumpr));
return true;
case Hexagon::TFRI_f:
case Hexagon::TFRI_cPt_f:
case Hexagon::TFRI_cNotPt_f: {
unsigned Opx = (Opc == Hexagon::TFRI_f) ? 1 : 2;
- APFloat FVal = MI->getOperand(Opx).getFPImm()->getValueAPF();
+ APFloat FVal = MI.getOperand(Opx).getFPImm()->getValueAPF();
APInt IVal = FVal.bitcastToAPInt();
- MI->RemoveOperand(Opx);
+ MI.RemoveOperand(Opx);
unsigned NewOpc = (Opc == Hexagon::TFRI_f) ? Hexagon::A2_tfrsi :
(Opc == Hexagon::TFRI_cPt_f) ? Hexagon::C2_cmoveit :
Hexagon::C2_cmoveif;
- MI->setDesc(get(NewOpc));
- MI->addOperand(MachineOperand::CreateImm(IVal.getZExtValue()));
+ MI.setDesc(get(NewOpc));
+ MI.addOperand(MachineOperand::CreateImm(IVal.getZExtValue()));
return true;
}
}
@@ -1035,20 +1272,20 @@ void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB,
// if (!p0.new) R1 = add(R2, R3)
// Note: New-value stores are not included here as in the current
// implementation, we don't need to check their predicate sense.
-bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
+bool HexagonInstrInfo::isPredicated(const MachineInstr &MI) const {
+ const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
}
-bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Cond) const {
+bool HexagonInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
isEndLoopN(Cond[0].getImm())) {
- DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+ DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
return false;
}
- int Opc = MI->getOpcode();
+ int Opc = MI.getOpcode();
assert (isPredicable(MI) && "Expected predicable instruction");
bool invertJump = predOpcodeHasNot(Cond);
@@ -1057,13 +1294,13 @@ bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI,
// plicated manipulations with the operands (handling tied operands,
// etc.), build a new temporary instruction, then overwrite MI with it.
- MachineBasicBlock &B = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock &B = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
unsigned PredOpc = getCondOpcode(Opc, invertJump);
MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
- unsigned NOp = 0, NumOps = MI->getNumOperands();
+ unsigned NOp = 0, NumOps = MI.getNumOperands();
while (NOp < NumOps) {
- MachineOperand &Op = MI->getOperand(NOp);
+ MachineOperand &Op = MI.getOperand(NOp);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
break;
T.addOperand(Op);
@@ -1076,13 +1313,13 @@ bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI,
assert(GotPredReg);
T.addReg(PredReg, PredRegFlags);
while (NOp < NumOps)
- T.addOperand(MI->getOperand(NOp++));
+ T.addOperand(MI.getOperand(NOp++));
- MI->setDesc(get(PredOpc));
- while (unsigned n = MI->getNumOperands())
- MI->RemoveOperand(n-1);
+ MI.setDesc(get(PredOpc));
+ while (unsigned n = MI.getNumOperands())
+ MI.RemoveOperand(n-1);
for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
- MI->addOperand(T->getOperand(i));
+ MI.addOperand(T->getOperand(i));
MachineBasicBlock::instr_iterator TI = T->getIterator();
B.erase(TI);
@@ -1100,11 +1337,11 @@ bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
}
-bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
+bool HexagonInstrInfo::DefinesPredicate(
+ MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
auto &HRI = getRegisterInfo();
- for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
- MachineOperand MO = MI->getOperand(oper);
+ for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
+ MachineOperand MO = MI.getOperand(oper);
if (MO.isReg() && MO.isDef()) {
const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
if (RC == &Hexagon::PredRegsRegClass) {
@@ -1116,107 +1353,25 @@ bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
return false;
}
-bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
- bool isPred = MI->getDesc().isPredicable();
-
- if (!isPred)
- return false;
-
- const int Opc = MI->getOpcode();
- int NumOperands = MI->getNumOperands();
-
- // Keep a flag for upto 4 operands in the instructions, to indicate if
- // that operand has been constant extended.
- bool OpCExtended[4];
- if (NumOperands > 4)
- NumOperands = 4;
-
- for (int i = 0; i < NumOperands; i++)
- OpCExtended[i] = (isOperandExtended(MI, i) && isConstExtended(MI));
-
- switch(Opc) {
- case Hexagon::A2_tfrsi:
- return (isOperandExtended(MI, 1) && isConstExtended(MI)) ||
- isInt<12>(MI->getOperand(1).getImm());
-
- case Hexagon::S2_storerd_io:
- return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
-
- case Hexagon::S2_storeri_io:
- case Hexagon::S2_storerinew_io:
- return isShiftedUInt<6,2>(MI->getOperand(1).getImm());
-
- case Hexagon::S2_storerh_io:
- case Hexagon::S2_storerhnew_io:
- return isShiftedUInt<6,1>(MI->getOperand(1).getImm());
-
- case Hexagon::S2_storerb_io:
- case Hexagon::S2_storerbnew_io:
- return isUInt<6>(MI->getOperand(1).getImm());
-
- case Hexagon::L2_loadrd_io:
- return isShiftedUInt<6,3>(MI->getOperand(2).getImm());
-
- case Hexagon::L2_loadri_io:
- return isShiftedUInt<6,2>(MI->getOperand(2).getImm());
-
- case Hexagon::L2_loadrh_io:
- case Hexagon::L2_loadruh_io:
- return isShiftedUInt<6,1>(MI->getOperand(2).getImm());
-
- case Hexagon::L2_loadrb_io:
- case Hexagon::L2_loadrub_io:
- return isUInt<6>(MI->getOperand(2).getImm());
-
- case Hexagon::L2_loadrd_pi:
- return isShiftedInt<4,3>(MI->getOperand(3).getImm());
-
- case Hexagon::L2_loadri_pi:
- return isShiftedInt<4,2>(MI->getOperand(3).getImm());
-
- case Hexagon::L2_loadrh_pi:
- case Hexagon::L2_loadruh_pi:
- return isShiftedInt<4,1>(MI->getOperand(3).getImm());
-
- case Hexagon::L2_loadrb_pi:
- case Hexagon::L2_loadrub_pi:
- return isInt<4>(MI->getOperand(3).getImm());
-
- case Hexagon::S4_storeirb_io:
- case Hexagon::S4_storeirh_io:
- case Hexagon::S4_storeiri_io:
- return (OpCExtended[1] || isUInt<6>(MI->getOperand(1).getImm())) &&
- (OpCExtended[2] || isInt<6>(MI->getOperand(2).getImm()));
-
- case Hexagon::A2_addi:
- return isInt<8>(MI->getOperand(2).getImm());
- case Hexagon::A2_aslh:
- case Hexagon::A2_asrh:
- case Hexagon::A2_sxtb:
- case Hexagon::A2_sxth:
- case Hexagon::A2_zxtb:
- case Hexagon::A2_zxth:
- return true;
- }
-
- return true;
+bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
+ return MI.getDesc().isPredicable();
}
-
-bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
- const MachineBasicBlock *MBB, const MachineFunction &MF) const {
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
// Debug info is never a scheduling boundary. It's necessary to be explicit
// due to the special treatment of IT instructions below, otherwise a
// dbg_value followed by an IT will result in the IT instruction being
// considered a scheduling hazard, which is wrong. It should be the actual
// instruction preceding the dbg_value instruction(s), just like it is
// when debug info is not present.
- if (MI->isDebugValue())
+ if (MI.isDebugValue())
return false;
// Throwing call is a boundary.
- if (MI->isCall()) {
+ if (MI.isCall()) {
// If any of the block's successors is a landing pad, this could be a
// throwing call.
for (auto I : MBB->successors())
@@ -1225,15 +1380,15 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
}
// Don't mess around with no return calls.
- if (MI->getOpcode() == Hexagon::CALLv3nr)
+ if (MI.getOpcode() == Hexagon::CALLv3nr)
return true;
// Terminators and labels can't be scheduled around.
- if (MI->getDesc().isTerminator() || MI->isPosition())
+ if (MI.getDesc().isTerminator() || MI.isPosition())
return true;
- if (MI->isInlineAsm() && !ScheduleInlineAsm)
- return true;
+ if (MI.isInlineAsm() && !ScheduleInlineAsm)
+ return true;
return false;
}
@@ -1286,9 +1441,10 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const {
- unsigned Opc = MI->getOpcode();
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask,
+ int &Value) const {
+ unsigned Opc = MI.getOpcode();
// Set mask and the first source register.
switch (Opc) {
@@ -1307,7 +1463,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
case Hexagon::C4_cmpneqi:
case Hexagon::C4_cmplteui:
case Hexagon::C4_cmpltei:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
Mask = ~0;
break;
case Hexagon::A4_cmpbeq:
@@ -1316,7 +1472,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
case Hexagon::A4_cmpbeqi:
case Hexagon::A4_cmpbgti:
case Hexagon::A4_cmpbgtui:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
Mask = 0xFF;
break;
case Hexagon::A4_cmpheq:
@@ -1325,7 +1481,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
case Hexagon::A4_cmpheqi:
case Hexagon::A4_cmphgti:
case Hexagon::A4_cmphgtui:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
Mask = 0xFFFF;
break;
}
@@ -1347,7 +1503,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
case Hexagon::C4_cmpneq:
case Hexagon::C4_cmplte:
case Hexagon::C4_cmplteu:
- SrcReg2 = MI->getOperand(2).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
return true;
case Hexagon::C2_cmpeqi:
@@ -1363,17 +1519,17 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
case Hexagon::A4_cmphgti:
case Hexagon::A4_cmphgtui:
SrcReg2 = 0;
- Value = MI->getOperand(2).getImm();
+ Value = MI.getOperand(2).getImm();
return true;
}
return false;
}
-
unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI, unsigned *PredCost) const {
- return getInstrTimingClassLatency(ItinData, MI);
+ const MachineInstr &MI,
+ unsigned *PredCost) const {
+ return getInstrTimingClassLatency(ItinData, &MI);
}
@@ -1388,27 +1544,27 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
// %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
// S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
// Currently AA considers the addresses in these instructions to be aliasing.
-bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
- MachineInstr *MIb, AliasAnalysis *AA) const {
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
+ MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
int OffsetA = 0, OffsetB = 0;
unsigned SizeA = 0, SizeB = 0;
- if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
- MIa->hasOrderedMemoryRef() || MIa->hasOrderedMemoryRef())
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+ MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Instructions that are pure loads, not loads and stores like memops are not
// dependent.
- if (MIa->mayLoad() && !isMemOp(MIa) && MIb->mayLoad() && !isMemOp(MIb))
+ if (MIa.mayLoad() && !isMemOp(&MIa) && MIb.mayLoad() && !isMemOp(&MIb))
return true;
// Get base, offset, and access size in MIa.
- unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+ unsigned BaseRegA = getBaseAndOffset(&MIa, OffsetA, SizeA);
if (!BaseRegA || !SizeA)
return false;
// Get base, offset, and access size in MIb.
- unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+ unsigned BaseRegB = getBaseAndOffset(&MIb, OffsetB, SizeB);
if (!BaseRegB || !SizeB)
return false;
@@ -1486,13 +1642,13 @@ bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const {
bool HexagonInstrInfo::isCondInst(const MachineInstr *MI) const {
- return (MI->isBranch() && isPredicated(MI)) ||
+ return (MI->isBranch() && isPredicated(*MI)) ||
isConditionalTransfer(MI) ||
isConditionalALU32(MI) ||
isConditionalLoad(MI) ||
// Predicated stores which don't have a .new on any operands.
- (MI->mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
- !isPredicatedNew(MI));
+ (MI->mayStore() && isPredicated(*MI) && !isNewValueStore(MI) &&
+ !isPredicatedNew(*MI));
}
@@ -1557,7 +1713,7 @@ bool HexagonInstrInfo::isConditionalALU32(const MachineInstr* MI) const {
 // FIXME - Function name and its functionality don't match.
// It should be renamed to hasPredNewOpcode()
bool HexagonInstrInfo::isConditionalLoad(const MachineInstr* MI) const {
- if (!MI->getDesc().mayLoad() || !isPredicated(MI))
+ if (!MI->getDesc().mayLoad() || !isPredicated(*MI))
return false;
int PNewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
@@ -1763,8 +1919,7 @@ bool HexagonInstrInfo::isDotCurInst(const MachineInstr* MI) const {
// Returns true, if any one of the operands is a dot new
// insn, whether it is predicated dot new or register dot new.
bool HexagonInstrInfo::isDotNewInst(const MachineInstr* MI) const {
- if (isNewValueInst(MI) ||
- (isPredicated(MI) && isPredicatedNew(MI)))
+ if (isNewValueInst(MI) || (isPredicated(*MI) && isPredicatedNew(*MI)))
return true;
return false;
@@ -2129,8 +2284,8 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const {
}
-bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
+bool HexagonInstrInfo::isPredicatedNew(const MachineInstr &MI) const {
+ const uint64_t F = MI.getDesc().TSFlags;
assert(isPredicated(MI));
return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
}
@@ -2143,8 +2298,8 @@ bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
}
-bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr &MI) const {
+ const uint64_t F = MI.getDesc().TSFlags;
return !((F >> HexagonII::PredicatedFalsePos) &
HexagonII::PredicatedFalseMask);
}
@@ -2181,7 +2336,87 @@ bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 ||
- MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT;
+ MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT ||
+ MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_PIC ||
+ MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT_PIC;
+}
+
+bool HexagonInstrInfo::isSignExtendingLoad(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ // Byte
+ case Hexagon::L2_loadrb_io:
+ case Hexagon::L4_loadrb_ur:
+ case Hexagon::L4_loadrb_ap:
+ case Hexagon::L2_loadrb_pr:
+ case Hexagon::L2_loadrb_pbr:
+ case Hexagon::L2_loadrb_pi:
+ case Hexagon::L2_loadrb_pci:
+ case Hexagon::L2_loadrb_pcr:
+ case Hexagon::L2_loadbsw2_io:
+ case Hexagon::L4_loadbsw2_ur:
+ case Hexagon::L4_loadbsw2_ap:
+ case Hexagon::L2_loadbsw2_pr:
+ case Hexagon::L2_loadbsw2_pbr:
+ case Hexagon::L2_loadbsw2_pi:
+ case Hexagon::L2_loadbsw2_pci:
+ case Hexagon::L2_loadbsw2_pcr:
+ case Hexagon::L2_loadbsw4_io:
+ case Hexagon::L4_loadbsw4_ur:
+ case Hexagon::L4_loadbsw4_ap:
+ case Hexagon::L2_loadbsw4_pr:
+ case Hexagon::L2_loadbsw4_pbr:
+ case Hexagon::L2_loadbsw4_pi:
+ case Hexagon::L2_loadbsw4_pci:
+ case Hexagon::L2_loadbsw4_pcr:
+ case Hexagon::L4_loadrb_rr:
+ case Hexagon::L2_ploadrbt_io:
+ case Hexagon::L2_ploadrbt_pi:
+ case Hexagon::L2_ploadrbf_io:
+ case Hexagon::L2_ploadrbf_pi:
+ case Hexagon::L2_ploadrbtnew_io:
+ case Hexagon::L2_ploadrbfnew_io:
+ case Hexagon::L4_ploadrbt_rr:
+ case Hexagon::L4_ploadrbf_rr:
+ case Hexagon::L4_ploadrbtnew_rr:
+ case Hexagon::L4_ploadrbfnew_rr:
+ case Hexagon::L2_ploadrbtnew_pi:
+ case Hexagon::L2_ploadrbfnew_pi:
+ case Hexagon::L4_ploadrbt_abs:
+ case Hexagon::L4_ploadrbf_abs:
+ case Hexagon::L4_ploadrbtnew_abs:
+ case Hexagon::L4_ploadrbfnew_abs:
+ case Hexagon::L2_loadrbgp:
+ // Half
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L4_loadrh_ur:
+ case Hexagon::L4_loadrh_ap:
+ case Hexagon::L2_loadrh_pr:
+ case Hexagon::L2_loadrh_pbr:
+ case Hexagon::L2_loadrh_pi:
+ case Hexagon::L2_loadrh_pci:
+ case Hexagon::L2_loadrh_pcr:
+ case Hexagon::L4_loadrh_rr:
+ case Hexagon::L2_ploadrht_io:
+ case Hexagon::L2_ploadrht_pi:
+ case Hexagon::L2_ploadrhf_io:
+ case Hexagon::L2_ploadrhf_pi:
+ case Hexagon::L2_ploadrhtnew_io:
+ case Hexagon::L2_ploadrhfnew_io:
+ case Hexagon::L4_ploadrht_rr:
+ case Hexagon::L4_ploadrhf_rr:
+ case Hexagon::L4_ploadrhtnew_rr:
+ case Hexagon::L4_ploadrhfnew_rr:
+ case Hexagon::L2_ploadrhtnew_pi:
+ case Hexagon::L2_ploadrhfnew_pi:
+ case Hexagon::L4_ploadrht_abs:
+ case Hexagon::L4_ploadrhf_abs:
+ case Hexagon::L4_ploadrhtnew_abs:
+ case Hexagon::L4_ploadrhfnew_abs:
+ case Hexagon::L2_loadrhgp:
+ return true;
+ default:
+ return false;
+ }
}
@@ -2202,6 +2437,17 @@ bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr *MI) const {
}
+bool HexagonInstrInfo::isTailCall(const MachineInstr *MI) const {
+ if (!MI->isBranch())
+ return false;
+
+ for (auto &Op : MI->operands())
+ if (Op.isGlobal() || Op.isSymbol())
+ return true;
+ return false;
+}
+
+
// Returns true when SU has a timing class TC1.
bool HexagonInstrInfo::isTC1(const MachineInstr *MI) const {
unsigned SchedClass = MI->getDesc().getSchedClass();
@@ -2269,6 +2515,28 @@ bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const {
}
+// Returns true if MI1 should be scheduled as close as possible to MI2, e.g.
+// when MI2 uses the result of a .cur load MI1, or MI1 feeds a new-value store.
+bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr *MI1,
+ const MachineInstr *MI2) const {
+ if (!MI1 || !MI2)
+ return false;
+ if (mayBeCurLoad(MI1)) {
+    // If the result of MI1 is used by MI2, keep them together.
+ unsigned DstReg = MI1->getOperand(0).getReg();
+ int N = MI2->getNumOperands();
+ for (int I = 0; I < N; I++)
+ if (MI2->getOperand(I).isReg() && DstReg == MI2->getOperand(I).getReg())
+ return true;
+ }
+ if (mayBeNewStore(MI2))
+ if (MI2->getOpcode() == Hexagon::V6_vS32b_pi)
+ if (MI1->getOperand(0).isReg() && MI2->getOperand(3).isReg() &&
+ MI1->getOperand(0).getReg() == MI2->getOperand(3).getReg())
+ return true;
+ return false;
+}
+
+
bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const {
if (!MI)
return false;
@@ -2366,6 +2634,21 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::J2_loop0i:
case Hexagon::J2_loop1i:
return isUInt<10>(Offset);
+
+ case Hexagon::S4_storeirb_io:
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ return isUInt<6>(Offset);
+
+ case Hexagon::S4_storeirh_io:
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ return isShiftedUInt<6,1>(Offset);
+
+ case Hexagon::S4_storeiri_io:
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ return isShiftedUInt<6,2>(Offset);
}
if (Extend)
@@ -2422,10 +2705,12 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L4_or_memopb_io :
return (0 <= Offset && Offset <= 63);
- // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of
+  // LDriw_xxx and STriw_xxx are pseudo operations, so they have to accept an offset of
// any size. Later pass knows how to handle it.
case Hexagon::STriw_pred:
case Hexagon::LDriw_pred:
+ case Hexagon::STriw_mod:
+ case Hexagon::LDriw_mod:
return true;
case Hexagon::TFR_FI:
@@ -2439,9 +2724,6 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L2_ploadrubf_io:
case Hexagon::S2_pstorerbt_io:
case Hexagon::S2_pstorerbf_io:
- case Hexagon::S4_storeirb_io:
- case Hexagon::S4_storeirbt_io:
- case Hexagon::S4_storeirbf_io:
return isUInt<6>(Offset);
case Hexagon::L2_ploadrht_io:
@@ -2450,18 +2732,12 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L2_ploadruhf_io:
case Hexagon::S2_pstorerht_io:
case Hexagon::S2_pstorerhf_io:
- case Hexagon::S4_storeirh_io:
- case Hexagon::S4_storeirht_io:
- case Hexagon::S4_storeirhf_io:
return isShiftedUInt<6,1>(Offset);
case Hexagon::L2_ploadrit_io:
case Hexagon::L2_ploadrif_io:
case Hexagon::S2_pstorerit_io:
case Hexagon::S2_pstorerif_io:
- case Hexagon::S4_storeiri_io:
- case Hexagon::S4_storeirit_io:
- case Hexagon::S4_storeirif_io:
return isShiftedUInt<6,2>(Offset);
case Hexagon::L2_ploadrdt_io:
@@ -2506,6 +2782,94 @@ bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr *ProdMI,
return false;
}
+bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ // Byte
+ case Hexagon::L2_loadrub_io:
+ case Hexagon::L4_loadrub_ur:
+ case Hexagon::L4_loadrub_ap:
+ case Hexagon::L2_loadrub_pr:
+ case Hexagon::L2_loadrub_pbr:
+ case Hexagon::L2_loadrub_pi:
+ case Hexagon::L2_loadrub_pci:
+ case Hexagon::L2_loadrub_pcr:
+ case Hexagon::L2_loadbzw2_io:
+ case Hexagon::L4_loadbzw2_ur:
+ case Hexagon::L4_loadbzw2_ap:
+ case Hexagon::L2_loadbzw2_pr:
+ case Hexagon::L2_loadbzw2_pbr:
+ case Hexagon::L2_loadbzw2_pi:
+ case Hexagon::L2_loadbzw2_pci:
+ case Hexagon::L2_loadbzw2_pcr:
+ case Hexagon::L2_loadbzw4_io:
+ case Hexagon::L4_loadbzw4_ur:
+ case Hexagon::L4_loadbzw4_ap:
+ case Hexagon::L2_loadbzw4_pr:
+ case Hexagon::L2_loadbzw4_pbr:
+ case Hexagon::L2_loadbzw4_pi:
+ case Hexagon::L2_loadbzw4_pci:
+ case Hexagon::L2_loadbzw4_pcr:
+ case Hexagon::L4_loadrub_rr:
+ case Hexagon::L2_ploadrubt_io:
+ case Hexagon::L2_ploadrubt_pi:
+ case Hexagon::L2_ploadrubf_io:
+ case Hexagon::L2_ploadrubf_pi:
+ case Hexagon::L2_ploadrubtnew_io:
+ case Hexagon::L2_ploadrubfnew_io:
+ case Hexagon::L4_ploadrubt_rr:
+ case Hexagon::L4_ploadrubf_rr:
+ case Hexagon::L4_ploadrubtnew_rr:
+ case Hexagon::L4_ploadrubfnew_rr:
+ case Hexagon::L2_ploadrubtnew_pi:
+ case Hexagon::L2_ploadrubfnew_pi:
+ case Hexagon::L4_ploadrubt_abs:
+ case Hexagon::L4_ploadrubf_abs:
+ case Hexagon::L4_ploadrubtnew_abs:
+ case Hexagon::L4_ploadrubfnew_abs:
+ case Hexagon::L2_loadrubgp:
+ // Half
+ case Hexagon::L2_loadruh_io:
+ case Hexagon::L4_loadruh_ur:
+ case Hexagon::L4_loadruh_ap:
+ case Hexagon::L2_loadruh_pr:
+ case Hexagon::L2_loadruh_pbr:
+ case Hexagon::L2_loadruh_pi:
+ case Hexagon::L2_loadruh_pci:
+ case Hexagon::L2_loadruh_pcr:
+ case Hexagon::L4_loadruh_rr:
+ case Hexagon::L2_ploadruht_io:
+ case Hexagon::L2_ploadruht_pi:
+ case Hexagon::L2_ploadruhf_io:
+ case Hexagon::L2_ploadruhf_pi:
+ case Hexagon::L2_ploadruhtnew_io:
+ case Hexagon::L2_ploadruhfnew_io:
+ case Hexagon::L4_ploadruht_rr:
+ case Hexagon::L4_ploadruhf_rr:
+ case Hexagon::L4_ploadruhtnew_rr:
+ case Hexagon::L4_ploadruhfnew_rr:
+ case Hexagon::L2_ploadruhtnew_pi:
+ case Hexagon::L2_ploadruhfnew_pi:
+ case Hexagon::L4_ploadruht_abs:
+ case Hexagon::L4_ploadruhf_abs:
+ case Hexagon::L4_ploadruhtnew_abs:
+ case Hexagon::L4_ploadruhfnew_abs:
+ case Hexagon::L2_loadruhgp:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+// Returns true if extra scheduling latency is needed between MI1 and MI2, e.g.
+// a V60 vector producer MI1 whose result MI2 cannot consume in the next packet.
+bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr *MI1,
+ const MachineInstr *MI2) const {
+ if (isV60VectorInstruction(MI1) && isV60VectorInstruction(MI2))
+ if (!isVecUsableNextPacket(MI1, MI2))
+ return true;
+ return false;
+}
+
/// \brief Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First,
@@ -2687,6 +3051,11 @@ bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
}
+short HexagonInstrInfo::getAbsoluteForm(const MachineInstr *MI) const {
+ return Hexagon::getAbsoluteForm(MI->getOpcode());
+}
+
+
unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
const uint64_t F = MI->getDesc().TSFlags;
return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
@@ -2735,8 +3104,6 @@ bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI,
unsigned &BasePos, unsigned &OffsetPos) const {
// Deal with memops first.
if (isMemOp(MI)) {
- assert (MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
- "Bad Memop.");
BasePos = 0;
OffsetPos = 1;
} else if (MI->mayStore()) {
@@ -2748,7 +3115,7 @@ bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI,
} else
return false;
- if (isPredicated(MI)) {
+ if (isPredicated(*MI)) {
BasePos++;
OffsetPos++;
}
@@ -2802,7 +3169,7 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
return Jumpers;
--I;
}
- if (!isUnpredicatedTerminator(&*I))
+ if (!isUnpredicatedTerminator(*I))
return Jumpers;
// Get the last instruction in the block.
@@ -2811,7 +3178,7 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
MachineInstr *SecondLastInst = nullptr;
// Find one more terminator if present.
do {
- if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
if (!SecondLastInst) {
SecondLastInst = &*I;
Jumpers.push_back(SecondLastInst);
@@ -2826,6 +3193,23 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
}
+short HexagonInstrInfo::getBaseWithLongOffset(short Opcode) const {
+ if (Opcode < 0)
+ return -1;
+ return Hexagon::getBaseWithLongOffset(Opcode);
+}
+
+
+short HexagonInstrInfo::getBaseWithLongOffset(const MachineInstr *MI) const {
+ return Hexagon::getBaseWithLongOffset(MI->getOpcode());
+}
+
+
+short HexagonInstrInfo::getBaseWithRegOffset(const MachineInstr *MI) const {
+ return Hexagon::getBaseWithRegOffset(MI->getOpcode());
+}
+
+
// Returns Operand Index for the constant extended instruction.
unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
@@ -3102,6 +3486,7 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr* MI) const {
return 0;
}
+
// Returns the opcode to use when converting MI, which is a conditional jump,
// into a conditional instruction which uses the .new value of the predicate.
// We also use branch probabilities to add a hint to the jump.
@@ -3353,8 +3738,8 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
case Hexagon::S4_storeirb_io:
// memb(Rs+#u4) = #U1
Src1Reg = MI->getOperand(0).getReg();
- if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() &&
- isUInt<4>(MI->getOperand(1).getImm()) && MI->getOperand(2).isImm() &&
+ if (isIntRegForSubInst(Src1Reg) &&
+ MI->getOperand(1).isImm() && isUInt<4>(MI->getOperand(1).getImm()) &&
MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm()))
return HexagonII::HSIG_S2;
break;
@@ -3532,7 +3917,7 @@ unsigned HexagonInstrInfo::getInstrTimingClassLatency(
// Default to one cycle for no itinerary. However, an "empty" itinerary may
// still have a MinLatency property, which getStageLatency checks.
if (!ItinData)
- return getInstrLatency(ItinData, MI);
+ return getInstrLatency(ItinData, *MI);
// Get the latency embedded in the itinerary. If we're not using timing class
  // latencies or if we're using BSB scheduling, then restrict the maximum latency
@@ -3737,7 +4122,7 @@ unsigned HexagonInstrInfo::nonDbgBundleSize(
assert(BundleHead->isBundle() && "Not a bundle header");
auto MII = BundleHead.getInstrIterator();
// Skip the bundle header.
- return nonDbgMICount(++MII, getBundleEnd(BundleHead));
+ return nonDbgMICount(++MII, getBundleEnd(*BundleHead));
}
@@ -3770,7 +4155,7 @@ bool HexagonInstrInfo::invertAndChangeJumpTarget(
--TargetPos;
assert((TargetPos >= 0) && MI->getOperand(TargetPos).isMBB());
MI->getOperand(TargetPos).setMBB(NewTarget);
- if (EnableBranchPrediction && isPredicatedNew(MI)) {
+ if (EnableBranchPrediction && isPredicatedNew(*MI)) {
NewOpcode = reversePrediction(NewOpcode);
}
MI->setDesc(get(NewOpcode));
@@ -3826,3 +4211,7 @@ bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond)
return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1));
}
+
+short HexagonInstrInfo::xformRegToImmOffset(const MachineInstr *MI) const {
+ return Hexagon::xformRegToImmOffset(MI->getOpcode());
+}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 9530d9f2aa0d..66b6883c955b 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -43,7 +43,7 @@ public:
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// If the specified machine instruction is a direct
@@ -51,7 +51,7 @@ public:
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// Analyze the branching code at the end of MBB, returning
@@ -79,10 +79,10 @@ public:
/// If AllowModify is true, then this routine is allowed to modify the basic
/// block (e.g. delete instructions after the unconditional branch).
///
- bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
/// Remove the branching code at the end of the specific MBB.
/// This is only invoked in cases where AnalyzeBranch returns success. It
@@ -101,7 +101,7 @@ public:
/// merging needs to be disabled.
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
/// Return true if it's profitable to predicate
/// instructions with accumulated instruction latency of "NumCycles"
@@ -141,9 +141,8 @@ public:
/// The source and destination registers may overlap, which may require a
/// careful implementation when multiple copy instructions are required for
/// large registers. See for example the ARM target.
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
/// Store the specified register of the given register class to the specified
@@ -171,7 +170,7 @@ public:
/// into real instructions. The target can edit MI in place, or it can insert
/// new instructions and erase MI. The function should return true if
/// anything was changed.
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Reverses the branch condition of the specified condition list,
/// returning false on success and true if it cannot be reversed.
@@ -183,11 +182,11 @@ public:
MachineBasicBlock::iterator MI) const override;
/// Returns true if the instruction is already predicated.
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
/// Convert the instruction into a predicated instruction.
/// It returns true if the operation was successful.
- bool PredicateInstruction(MachineInstr *MI,
+ bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Cond) const override;
/// Returns true if the first specified predicate
@@ -198,17 +197,17 @@ public:
/// If the specified instruction defines any predicate
/// or condition code register(s) used for predication, returns true as well
/// as the definition predicate(s) by reference.
- bool DefinesPredicate(MachineInstr *MI,
+ bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
/// Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
/// Test if the given instruction should be considered a scheduling boundary.
/// This primarily includes labels and terminators.
- bool isSchedulingBoundary(const MachineInstr *MI,
+ bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const override;
@@ -227,15 +226,14 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const override;
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
/// Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
/// PredCost.
unsigned getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = 0) const override;
/// Create machine specific model for scheduling.
@@ -246,10 +244,9 @@ public:
// to tell, even without aliasing information, that two MIs access different
// memory addresses. This function returns true if two MIs access different
// memory addresses and false otherwise.
- bool areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
- AliasAnalysis *AA = nullptr)
- const override;
-
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
/// HexagonInstrInfo specifics.
///
@@ -301,20 +298,24 @@ public:
bool isNewValueStore(unsigned Opcode) const;
bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const;
bool isPostIncrement(const MachineInstr* MI) const;
- bool isPredicatedNew(const MachineInstr *MI) const;
+ bool isPredicatedNew(const MachineInstr &MI) const;
bool isPredicatedNew(unsigned Opcode) const;
- bool isPredicatedTrue(const MachineInstr *MI) const;
+ bool isPredicatedTrue(const MachineInstr &MI) const;
bool isPredicatedTrue(unsigned Opcode) const;
bool isPredicated(unsigned Opcode) const;
bool isPredicateLate(unsigned Opcode) const;
bool isPredictedTaken(unsigned Opcode) const;
bool isSaveCalleeSavedRegsCall(const MachineInstr *MI) const;
+ bool isSignExtendingLoad(const MachineInstr &MI) const;
bool isSolo(const MachineInstr* MI) const;
bool isSpillPredRegOp(const MachineInstr *MI) const;
+ bool isTailCall(const MachineInstr *MI) const;
bool isTC1(const MachineInstr *MI) const;
bool isTC2(const MachineInstr *MI) const;
bool isTC2Early(const MachineInstr *MI) const;
bool isTC4x(const MachineInstr *MI) const;
+ bool isToBeScheduledASAP(const MachineInstr *MI1,
+ const MachineInstr *MI2) const;
bool isV60VectorInstruction(const MachineInstr *MI) const;
bool isValidAutoIncImm(const EVT VT, const int Offset) const;
bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
@@ -322,8 +323,10 @@ public:
bool isVecALU(const MachineInstr *MI) const;
bool isVecUsableNextPacket(const MachineInstr *ProdMI,
const MachineInstr *ConsMI) const;
+ bool isZeroExtendingLoad(const MachineInstr &MI) const;
-
+ bool addLatencyToSchedule(const MachineInstr *MI1,
+ const MachineInstr *MI2) const;
bool canExecuteInBundle(const MachineInstr *First,
const MachineInstr *Second) const;
bool hasEHLabel(const MachineBasicBlock *B) const;
@@ -341,11 +344,15 @@ public:
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+ short getAbsoluteForm(const MachineInstr *MI) const;
unsigned getAddrMode(const MachineInstr* MI) const;
unsigned getBaseAndOffset(const MachineInstr *MI, int &Offset,
unsigned &AccessSize) const;
bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos) const;
+ short getBaseWithLongOffset(short Opcode) const;
+ short getBaseWithLongOffset(const MachineInstr *MI) const;
+ short getBaseWithRegOffset(const MachineInstr *MI) const;
SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const;
unsigned getCExtOpNum(const MachineInstr *MI) const;
HexagonII::CompoundGroup
@@ -395,6 +402,7 @@ public:
bool reversePredSense(MachineInstr* MI) const;
unsigned reversePrediction(unsigned Opcode) const;
bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const;
+ short xformRegToImmOffset(const MachineInstr *MI) const;
};
}
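
The header also exposes the new opcode-classification queries implemented above (isSignExtendingLoad, isZeroExtendingLoad, isTailCall, isToBeScheduledASAP, addLatencyToSchedule). A small, hypothetical usage sketch, not part of this patch, showing how later code could consult the load classifiers; it assumes the Hexagon target headers from this tree:

#include "HexagonInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Count loads whose result is already sign- or zero-extended by the load
// itself, i.e. places where a following sext/zext would be redundant.
static unsigned countExtendingLoads(const MachineBasicBlock &MBB,
                                    const HexagonInstrInfo &HII) {
  unsigned N = 0;
  for (const MachineInstr &MI : MBB)
    if (HII.isSignExtendingLoad(MI) || HII.isZeroExtendingLoad(MI))
      ++N;
  return N;
}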
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 421403f49724..74dc5ac9a3ad 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -32,6 +32,9 @@ def LoReg: OutPatFrag<(ops node:$Rs),
def HiReg: OutPatFrag<(ops node:$Rs),
(EXTRACT_SUBREG (i64 $Rs), subreg_hireg)>;
+def orisadd: PatFrag<(ops node:$Addr, node:$off),
+ (or node:$Addr, node:$off), [{ return orIsAdd(N); }]>;
+
// SDNode for converting immediate C to C-1.
def DEC_CONST_SIGNED : SDNodeXForm<imm, [{
// Return the byte immediate const-1 as an SDNode.
@@ -418,6 +421,12 @@ defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
def: Pat<(i32 (add I32:$Rs, s32ImmPred:$s16)),
(i32 (A2_addi I32:$Rs, imm:$s16))>;
+let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in
+def A2_iconst
+ : ALU32_ri <(outs IntRegs:$Rd),
+ (ins s23_2Imm:$s23_2),
+ "$Rd = iconst(#$s23_2)"> {}
+
//===----------------------------------------------------------------------===//
// Template class used for the following ALU32 instructions.
// Rd=and(Rs,#s10)
@@ -1430,7 +1439,7 @@ class CondStr<string CReg, bit True, bit New> {
string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
}
class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
- string S = Mnemonic # !if(Taken, ":t", !if(New, ":nt", ""));
+ string S = Mnemonic # !if(Taken, ":t", ":nt");
}
let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
@@ -1438,9 +1447,9 @@ let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
class T_JMP<string ExtStr>
- : JInst<(outs), (ins brtarget:$dst),
+ : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
"jump " # ExtStr # "$dst",
- [], "", J_tc_2early_SLOT23> {
+ [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
bits<24> dst;
let IClass = 0b0101;
@@ -1453,11 +1462,11 @@ let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
- : JInst<(outs), (ins PredRegs:$src, brtarget:$dst),
+ : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
ExtStr # "$dst",
- [], "", J_tc_2early_SLOT23>, ImmRegRel {
+ [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
let isTaken = isTak;
let isPredicatedFalse = PredNot;
let isPredicatedNew = isPredNew;
@@ -1576,19 +1585,31 @@ let Defs = VolatileV3.Regs in {
let isTerminator = 1, hasSideEffects = 0 in {
defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
- // Deal with explicit assembly
- // - never extened a jump #, always extend a jump ##
- let isAsmParserOnly = 1 in {
- defm J2_jump_ext : JMP_base<"JMP", "##">;
- defm J2_jump_noext : JMP_base<"JMP", "#">;
- }
-
defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
let isReturn = 1, isCodeGenOnly = 1 in
defm JMPret : JMPR_base<"JMPret">, PredNewRel;
}
+let validSubTargets = HasV60SubT in
+multiclass JMPpt_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
+ def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
+ }
+}
+
+let validSubTargets = HasV60SubT in
+multiclass JMPRpt_base<string BaseOp> {
+ let BaseOpcode = BaseOp in {
+ def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
+ def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
+ }
+}
+
+defm J2_jumpr : JMPRpt_base<"JMPr">;
+defm J2_jump : JMPpt_base<"JMP">;
+
def: Pat<(br bb:$dst),
(J2_jump brtarget:$dst)>;
def: Pat<(retflag),
@@ -1769,6 +1790,8 @@ multiclass Loadx_pat<PatFrag Load, ValueType VT, PatLeaf ImmPred,
def: Pat<(VT (Load AddrFI:$fi)), (VT (MI AddrFI:$fi, 0))>;
def: Pat<(VT (Load (add (i32 AddrFI:$fi), ImmPred:$Off))),
(VT (MI AddrFI:$fi, imm:$Off))>;
+ def: Pat<(VT (Load (orisadd (i32 AddrFI:$fi), ImmPred:$Off))),
+ (VT (MI AddrFI:$fi, imm:$Off))>;
def: Pat<(VT (Load (add (i32 IntRegs:$Rs), ImmPred:$Off))),
(VT (MI IntRegs:$Rs, imm:$Off))>;
def: Pat<(VT (Load (i32 IntRegs:$Rs))), (VT (MI IntRegs:$Rs, 0))>;
@@ -2010,6 +2033,12 @@ let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
def LDriw_pred : LDInst<(outs PredRegs:$dst),
(ins IntRegs:$addr, s11_2Ext:$off),
".error \"should not emit\"", []>;
+// Load modifier.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+ (ins IntRegs:$addr, s11_2Ext:$off),
+ ".error \"should not emit\"", []>;
let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
def L2_deallocframe : LDInst<(outs), (ins),
@@ -2023,7 +2052,7 @@ let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
}
// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0 in
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
: LDInst <(outs RC:$dst, IntRegs:$_dst_),
(ins IntRegs:$Rz, ModRegs:$Mu),
@@ -2070,7 +2099,7 @@ let accessSize = DoubleWordAccess in
def L2_loadrd_pcr : T_load_pcr <"memd", DoubleRegs, 0b1110>;
// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0 in
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
: LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
(ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
@@ -2099,7 +2128,7 @@ def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
//===----------------------------------------------------------------------===//
// Circular loads with immediate offset.
//===----------------------------------------------------------------------===//
-let Uses = [CS], mayLoad = 1, hasSideEffects = 0 in
+let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
class T_load_pci <string mnemonic, RegisterClass RC,
Operand ImmOp, bits<4> MajOp>
: LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
@@ -2155,28 +2184,6 @@ let accessSize = WordAccess, hasNewValue = 0 in {
let accessSize = DoubleWordAccess, hasNewValue = 0 in
def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
-//===----------------------------------------------------------------------===//
-// Circular loads - Pseudo
-//
-// Please note that the input operand order in the pseudo instructions
-// doesn't match with the real instructions. Pseudo instructions operand
-// order should mimics the ordering in the intrinsics. Also, 'src2' doesn't
-// appear in the AsmString because it's same as 'dst'.
-//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0, isPseudo = 1 in
-class T_load_pci_pseudo <string opc, RegisterClass RC>
- : LDInstPI<(outs IntRegs:$_dst_, RC:$dst),
- (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4Imm:$src4),
- ".error \"$dst = "#opc#"($src1++#$src4:circ($src3))\"",
- [], "$src1 = $_dst_">;
-
-def L2_loadrb_pci_pseudo : T_load_pci_pseudo <"memb", IntRegs>;
-def L2_loadrub_pci_pseudo : T_load_pci_pseudo <"memub", IntRegs>;
-def L2_loadrh_pci_pseudo : T_load_pci_pseudo <"memh", IntRegs>;
-def L2_loadruh_pci_pseudo : T_load_pci_pseudo <"memuh", IntRegs>;
-def L2_loadri_pci_pseudo : T_load_pci_pseudo <"memw", IntRegs>;
-def L2_loadrd_pci_pseudo : T_load_pci_pseudo <"memd", DoubleRegs>;
-
// TODO: memb_fifo and memh_fifo must take destination register as input.
// One-off circ loads - not enough in common to break into a class.
@@ -2233,7 +2240,7 @@ def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
//===----------------------------------------------------------------------===//
// Bit-reversed loads with auto-increment register
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
+let hasSideEffects = 0, addrMode = PostInc in
class T_load_pbr<string mnemonic, RegisterClass RC,
MemAccessSize addrSize, bits<4> majOp>
: LDInst
@@ -2278,26 +2285,6 @@ def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
HalfWordAccess, 0b0010>;
//===----------------------------------------------------------------------===//
-// Bit-reversed loads - Pseudo
-//
-// Please note that 'src2' doesn't appear in the AsmString because
-// it's same as 'dst'.
-//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0, isPseudo = 1 in
-class T_load_pbr_pseudo <string opc, RegisterClass RC>
- : LDInstPI<(outs IntRegs:$_dst_, RC:$dst),
- (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- ".error \"$dst = "#opc#"($src1++$src3:brev)\"",
- [], "$src1 = $_dst_">;
-
-def L2_loadrb_pbr_pseudo : T_load_pbr_pseudo <"memb", IntRegs>;
-def L2_loadrub_pbr_pseudo : T_load_pbr_pseudo <"memub", IntRegs>;
-def L2_loadrh_pbr_pseudo : T_load_pbr_pseudo <"memh", IntRegs>;
-def L2_loadruh_pbr_pseudo : T_load_pbr_pseudo <"memuh", IntRegs>;
-def L2_loadri_pbr_pseudo : T_load_pbr_pseudo <"memw", IntRegs>;
-def L2_loadrd_pbr_pseudo : T_load_pbr_pseudo <"memd", DoubleRegs>;
-
-//===----------------------------------------------------------------------===//
// LD -
//===----------------------------------------------------------------------===//
@@ -3558,14 +3545,20 @@ let addrMode = BaseImmOffset, InputType = "imm" in {
// AddedComplexity) to the individual patterns.
class Storex_fi_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
: Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, Value:$Rs)>;
-class Storex_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
- InstHexagon MI>
- : Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
- (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
-class Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
- InstHexagon MI>
- : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
- (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+multiclass Storex_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+ InstHexagon MI> {
+ def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+ def: Pat<(Store Value:$Rs, (orisadd (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+}
+multiclass Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+ InstHexagon MI> {
+ def: Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+ def: Pat<(Store Value:$Rt, (orisadd (i32 IntRegs:$Rs), ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+}
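+// For example, "defm: Storex_pat<truncstorei8, I32, s32_0ImmPred, S2_storerb_io>"
+// now yields, via these multiclasses, one pattern for a plain (add $fi/$Rs, #Off)
+// address and a second one for (orisadd ...), i.e. an "or" that is known to act
+// as an add (as happens when or'ing an offset into an aligned frame-index base).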
class Storex_simple_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
: Pat<(Store Value:$Rt, (i32 IntRegs:$Rs)),
(MI IntRegs:$Rs, 0, Value:$Rt)>;
@@ -3577,14 +3570,20 @@ class Storexm_fi_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
InstHexagon MI>
: Pat<(Store Value:$Rs, AddrFI:$fi),
(MI AddrFI:$fi, 0, (ValueMod Value:$Rs))>;
-class Storexm_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
- PatFrag ValueMod, InstHexagon MI>
- : Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
- (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
-class Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
- PatFrag ValueMod, InstHexagon MI>
- : Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
- (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+multiclass Storexm_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+ PatFrag ValueMod, InstHexagon MI> {
+ def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+ def: Pat<(Store Value:$Rs, (orisadd (i32 AddrFI:$fi), ImmPred:$Off)),
+ (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+}
+multiclass Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+ PatFrag ValueMod, InstHexagon MI> {
+ def: Pat<(Store Value:$Rt, (add (i32 IntRegs:$Rs), ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+ def: Pat<(Store Value:$Rt, (orisadd (i32 IntRegs:$Rs), ImmPred:$Off)),
+ (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+}
class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
InstHexagon MI>
: Pat<(Store Value:$Rt, (i32 IntRegs:$Rs)),
@@ -3592,16 +3591,16 @@ class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
multiclass Storex_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
InstHexagon MI> {
- def: Storex_fi_pat <Store, Value, MI>;
- def: Storex_fi_add_pat <Store, Value, ImmPred, MI>;
- def: Storex_add_pat <Store, Value, ImmPred, MI>;
+ def: Storex_fi_pat <Store, Value, MI>;
+ defm: Storex_fi_add_pat <Store, Value, ImmPred, MI>;
+ defm: Storex_add_pat <Store, Value, ImmPred, MI>;
}
multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
PatFrag ValueMod, InstHexagon MI> {
- def: Storexm_fi_pat <Store, Value, ValueMod, MI>;
- def: Storexm_fi_add_pat <Store, Value, ImmPred, ValueMod, MI>;
- def: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>;
+ def: Storexm_fi_pat <Store, Value, ValueMod, MI>;
+ defm: Storexm_fi_add_pat <Store, Value, ImmPred, ValueMod, MI>;
+ defm: Storexm_add_pat <Store, Value, ImmPred, ValueMod, MI>;
}
// Regular stores in the DAG have two operands: value and address.
@@ -3610,7 +3609,8 @@ multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
// swapped. This relies on the knowledge that the F.Fragment uses names
// "ptr" and "val".
class SwapSt<PatFrag F>
- : PatFrag<(ops node:$val, node:$ptr), F.Fragment>;
+ : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode,
+ F.OperandTransform>;
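+// E.g. Storea_pat<SwapSt<atomic_store_8>, I32, addrgp, S2_storerbgp> presents the
+// atomic store, whose fragment lists (ptr, val), as a regular (val, ptr) store;
+// forwarding PredicateCode and OperandTransform keeps the wrapped fragment's
+// predicate intact.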
let AddedComplexity = 20 in {
defm: Storex_pat<truncstorei8, I32, s32_0ImmPred, S2_storerb_io>;
@@ -3651,6 +3651,12 @@ let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
def STriw_pred : STInst<(outs),
(ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
".error \"should not emit\"", []>;
+// Store modifier register.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+ (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
+ ".error \"should not emit\"", []>;
// S2_allocframe: Allocate stack frame.
let Defs = [R29, R30], Uses = [R29, R31, R30],
@@ -3668,7 +3674,7 @@ def S2_allocframe: ST0Inst <
// S2_storer[bhwdf]_pci: Store byte/half/word/double.
// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
-let Uses = [CS] in
+let Uses = [CS], addrMode = PostInc in
class T_store_pci <string mnemonic, RegisterClass RC,
Operand Imm, bits<4>MajOp,
MemAccessSize AlignSize, string RegSrc = "Rt">
@@ -3711,7 +3717,8 @@ def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
DoubleWordAccess>;
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
+ addrMode = PostInc in
class T_storenew_pci <string mnemonic, Operand Imm,
bits<2>MajOp, MemAccessSize AlignSize>
: NVInst < (outs IntRegs:$_dst_),
@@ -3745,29 +3752,9 @@ def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
//===----------------------------------------------------------------------===//
-// Circular stores - Pseudo
-//
-// Please note that the input operand order in the pseudo instructions
-// doesn't match with the real instructions. Pseudo instructions operand
-// order should mimics the ordering in the intrinsics.
-//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0, isPseudo = 1 in
-class T_store_pci_pseudo <string opc, RegisterClass RC>
- : STInstPI<(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, RC:$src2, IntRegs:$src3, s4Imm:$src4),
- ".error \""#opc#"($src1++#$src4:circ($src3)) = $src2\"",
- [], "$_dst_ = $src1">;
-
-def S2_storerb_pci_pseudo : T_store_pci_pseudo <"memb", IntRegs>;
-def S2_storerh_pci_pseudo : T_store_pci_pseudo <"memh", IntRegs>;
-def S2_storerf_pci_pseudo : T_store_pci_pseudo <"memh", IntRegs>;
-def S2_storeri_pci_pseudo : T_store_pci_pseudo <"memw", IntRegs>;
-def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>;
-
-//===----------------------------------------------------------------------===//
// Circular stores with auto-increment register
//===----------------------------------------------------------------------===//
-let Uses = [CS] in
+let Uses = [CS], addrMode = PostInc in
class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
MemAccessSize AlignSize, string RegSrc = "Rt">
: STInst <(outs IntRegs:$_dst_),
@@ -3803,7 +3790,8 @@ def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
//===----------------------------------------------------------------------===//
// Circular .new stores with auto-increment register
//===----------------------------------------------------------------------===//
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+ addrMode = PostInc in
class T_storenew_pcr <string mnemonic, bits<2>MajOp,
MemAccessSize AlignSize>
: NVInst <(outs IntRegs:$_dst_),
@@ -3834,7 +3822,7 @@ def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
//===----------------------------------------------------------------------===//
// Bit-reversed stores with auto-increment register
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
+let hasSideEffects = 0, addrMode = PostInc in
class T_store_pbr<string mnemonic, RegisterClass RC,
MemAccessSize addrSize, bits<3> majOp,
bit isHalf = 0>
@@ -3879,7 +3867,7 @@ def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
// Bit-reversed .new stores with auto-increment register
//===----------------------------------------------------------------------===//
let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
- hasSideEffects = 0 in
+ hasSideEffects = 0, addrMode = PostInc in
class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
: NVInst <(outs IntRegs:$_dst_),
(ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
@@ -3910,26 +3898,6 @@ let BaseOpcode = "S2_storeri_pbr" in
def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
//===----------------------------------------------------------------------===//
-// Bit-reversed stores - Pseudo
-//
-// Please note that the input operand order in the pseudo instructions
-// doesn't match with the real instructions. Pseudo instructions operand
-// order should mimics the ordering in the intrinsics.
-//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0, isPseudo = 1 in
-class T_store_pbr_pseudo <string opc, RegisterClass RC>
- : STInstPI<(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, RC:$src2, IntRegs:$src3),
- ".error \""#opc#"($src1++$src3:brev) = $src2\"",
- [], "$_dst_ = $src1">;
-
-def S2_storerb_pbr_pseudo : T_store_pbr_pseudo <"memb", IntRegs>;
-def S2_storerh_pbr_pseudo : T_store_pbr_pseudo <"memh", IntRegs>;
-def S2_storeri_pbr_pseudo : T_store_pbr_pseudo <"memw", IntRegs>;
-def S2_storerf_pbr_pseudo : T_store_pbr_pseudo <"memh", IntRegs>;
-def S2_storerd_pbr_pseudo : T_store_pbr_pseudo <"memd", DoubleRegs>;
-
-//===----------------------------------------------------------------------===//
// ST -
//===----------------------------------------------------------------------===//
@@ -4201,22 +4169,16 @@ def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
// Count leading zeros.
def: Pat<(i32 (ctlz I32:$Rs)), (S2_cl0 I32:$Rs)>;
def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>;
-def: Pat<(i32 (ctlz_zero_undef I32:$Rs)), (S2_cl0 I32:$Rs)>;
-def: Pat<(i32 (trunc (ctlz_zero_undef I64:$Rss))), (S2_cl0p I64:$Rss)>;
// Count trailing zeros: 32-bit.
def: Pat<(i32 (cttz I32:$Rs)), (S2_ct0 I32:$Rs)>;
-def: Pat<(i32 (cttz_zero_undef I32:$Rs)), (S2_ct0 I32:$Rs)>;
// Count leading ones.
def: Pat<(i32 (ctlz (not I32:$Rs))), (S2_cl1 I32:$Rs)>;
def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
-def: Pat<(i32 (ctlz_zero_undef (not I32:$Rs))), (S2_cl1 I32:$Rs)>;
-def: Pat<(i32 (trunc (ctlz_zero_undef (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
// Count trailing ones: 32-bit.
def: Pat<(i32 (cttz (not I32:$Rs))), (S2_ct1 I32:$Rs)>;
-def: Pat<(i32 (cttz_zero_undef (not I32:$Rs))), (S2_ct1 I32:$Rs)>;
// The 64-bit counts leading/trailing are defined in HexagonInstrInfoV4.td.
@@ -4561,6 +4523,9 @@ let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
(ins IntRegs:$Rs, IntRegs:$fi, s32Imm:$off), "">;
}
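+// Select an "or"-based frame-index address (orisadd) into the TFR_FI pseudo, the
+// same way the orisadd forms are handled in the load/store patterns above.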
+def: Pat<(i32 (orisadd (i32 AddrFI:$Rs), s32ImmPred:$off)),
+ (i32 (TFR_FI (i32 AddrFI:$Rs), s32ImmPred:$off))>;
+
//===----------------------------------------------------------------------===//
// CRUSER - Type.
//===----------------------------------------------------------------------===//
@@ -4779,10 +4744,10 @@ def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
// HI/LO Instructions
let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
hasNewValue = 1, opNewValue = 0 in
-class REG_IMMED<string RegHalf, string Op, bit Rs, bits<3> MajOp, bit MinOp>
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
: ALU32_ri<(outs IntRegs:$dst),
- (ins i32imm:$imm_value),
- "$dst"#RegHalf#" = #"#Op#"($imm_value)", []> {
+ (ins u16Imm:$imm_value),
+ "$dst"#RegHalf#" = $imm_value", []> {
bits<5> dst;
bits<32> imm_value;
let IClass = 0b0111;
@@ -4791,15 +4756,13 @@ class REG_IMMED<string RegHalf, string Op, bit Rs, bits<3> MajOp, bit MinOp>
let Inst{26-24} = MajOp;
let Inst{21} = MinOp;
let Inst{20-16} = dst;
- let Inst{23-22} = !if (!eq(Op, "LO"), imm_value{15-14}, imm_value{31-30});
- let Inst{13-0} = !if (!eq(Op, "LO"), imm_value{13-0}, imm_value{29-16});
+ let Inst{23-22} = imm_value{15-14};
+ let Inst{13-0} = imm_value{13-0};
}
let isAsmParserOnly = 1 in {
- def LO : REG_IMMED<".l", "LO", 0b0, 0b001, 0b1>;
- def LO_H : REG_IMMED<".l", "HI", 0b0, 0b001, 0b1>;
- def HI : REG_IMMED<".h", "HI", 0b0, 0b010, 0b1>;
- def HI_L : REG_IMMED<".h", "LO", 0b0, 0b010, 0b1>;
+ def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
+ def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
}
let isMoveImm = 1, isCodeGenOnly = 1 in
@@ -4866,7 +4829,7 @@ def TFR_PdTrue : SInst<(outs PredRegs:$dst), (ins), "",
let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
isCodeGenOnly = 1 in
-def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), "$dst = xor($dst, $dst)",
+def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), "",
[(set (i1 PredRegs:$dst), 0)]>;
// Pseudo instructions.
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td
index 84d035da451b..9024a43aa7eb 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -21,25 +21,26 @@ def callv3nr : SDNode<"HexagonISD::CALLv3nr", SDT_SPCall,
// J +
//===----------------------------------------------------------------------===//
// Call subroutine.
-let isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs, isPredicable = 1,
+let isCall = 1, hasSideEffects = 1, isPredicable = 1,
isExtended = 0, isExtendable = 1, opExtendable = 0,
isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
-class T_Call<string ExtStr>
+class T_Call<bit CSR, string ExtStr>
: JInst<(outs), (ins calltarget:$dst),
"call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
let BaseOpcode = "call";
bits<24> dst;
+ let Defs = !if (CSR, VolatileV3.Regs, []);
let IClass = 0b0101;
let Inst{27-25} = 0b101;
let Inst{24-16,13-1} = dst{23-2};
let Inst{0} = 0b0;
}
-let isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs, isPredicated = 1,
+let isCall = 1, hasSideEffects = 1, isPredicated = 1,
isExtended = 0, isExtendable = 1, opExtendable = 1,
isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
-class T_CallPred<bit IfTrue, string ExtStr>
+class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
: JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
[], "", J_tc_2early_SLOT23> {
@@ -48,6 +49,7 @@ class T_CallPred<bit IfTrue, string ExtStr>
bits<2> Pu;
bits<17> dst;
+ let Defs = !if (CSR, VolatileV3.Regs, []);
let IClass = 0b0101;
let Inst{27-24} = 0b1101;
let Inst{23-22,20-16,13,7-1} = dst{16-2};
@@ -56,16 +58,19 @@ class T_CallPred<bit IfTrue, string ExtStr>
let Inst{9-8} = Pu;
}
-multiclass T_Calls<string ExtStr> {
- def NAME : T_Call<ExtStr>;
- def t : T_CallPred<1, ExtStr>;
- def f : T_CallPred<0, ExtStr>;
+multiclass T_Calls<bit CSR, string ExtStr> {
+ def NAME : T_Call<CSR, ExtStr>;
+ def t : T_CallPred<CSR, 1, ExtStr>;
+ def f : T_CallPred<CSR, 0, ExtStr>;
}
-defm J2_call: T_Calls<"">, PredRel;
+defm J2_call: T_Calls<1, "">, PredRel;
let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = VolatileV3.Regs in
-def CALLv3nr : T_Call<"">, PredRel;
+def CALLv3nr : T_Call<1, "">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = [PC, R31, R6, R7, P0] in
+def CALLstk : T_Call<0, "">, PredRel;
//===----------------------------------------------------------------------===//
// J -
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 37c2042a2ccd..398d2d3bc716 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -1047,6 +1047,18 @@ let AddedComplexity = 40 in {
def: Storexs_pat<store, I64, S4_storerd_rr>;
}
+class Store_rr_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+ : Pat<(Store Value:$Ru, (add I32:$Rs, I32:$Rt)),
+ (MI IntRegs:$Rs, IntRegs:$Rt, 0, Value:$Ru)>;
+
+let AddedComplexity = 20 in {
+ def: Store_rr_pat<truncstorei8, I32, S4_storerb_rr>;
+ def: Store_rr_pat<truncstorei16, I32, S4_storerh_rr>;
+ def: Store_rr_pat<store, I32, S4_storeri_rr>;
+ def: Store_rr_pat<store, I64, S4_storerd_rr>;
+}
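+// These select stores to a register+register address: e.g. a word store to
+// (add $Rs, $Rt) becomes S4_storeri_rr with a zero shift amount, roughly
+// "memw(Rs+Rt<<#0) = Ru".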
+
+
// memd(Rx++#s4:3)=Rtt
// memd(Rx++#s4:3:circ(Mu))=Rtt
// memd(Rx++I:circ(Mu))=Rtt
@@ -1188,17 +1200,52 @@ def ToImmByte : OutPatFrag<(ops node:$R), (IMM_BYTE $R)>;
def ToImmHalf : OutPatFrag<(ops node:$R), (IMM_HALF $R)>;
def ToImmWord : OutPatFrag<(ops node:$R), (IMM_WORD $R)>;
+// Emit store-immediate, but only when the stored value will not be constant-
+// extended. The reason is that there is no pass that can optimize constant
+// extenders in store-immediate instructions. In some cases we can end up with
+// a number of such stores, all of which store the same extended value (e.g.
+// after unrolling a loop that initializes a floating point array).
+
+// Predicate to determine whether the 16-bit immediate is expressible as a sign-
+// extended 8-bit immediate. Store-immediate-halfword will ignore any bits
+// beyond 0..15, so we don't care what is in there.
+
+def i16in8ImmPred: PatLeaf<(i32 imm), [{
+ int64_t v = (int16_t)N->getSExtValue();
+ return v == (int64_t)(int8_t)v;
+}]>;
+
+// Predicate to determine whether the 32-bit immediate is expressible as a sign-
+// extended 8-bit immediate.
+def i32in8ImmPred: PatLeaf<(i32 imm), [{
+ int64_t v = (int32_t)N->getSExtValue();
+ return v == (int64_t)(int8_t)v;
+}]>;
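+// For example, #80 and #-96 satisfy i16in8ImmPred, while #336 does not; an
+// immediate such as 0x2FFA0 is still accepted, since only bits 0..15 (here
+// 0xFFA0, i.e. -96) matter for a halfword store.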
+
+
let AddedComplexity = 40 in {
- // Not using frameindex patterns for these stores, because the offset
- // is not extendable. This could cause problems during removing the frame
- // indices, since the offset with respect to R29/R30 may not fit in the
- // u6 field.
- def: Storexm_add_pat<truncstorei8, s32ImmPred, u6_0ImmPred, ToImmByte,
- S4_storeirb_io>;
- def: Storexm_add_pat<truncstorei16, s32ImmPred, u6_1ImmPred, ToImmHalf,
- S4_storeirh_io>;
- def: Storexm_add_pat<store, s32ImmPred, u6_2ImmPred, ToImmWord,
- S4_storeiri_io>;
+ // Even though the offset is not extendable in the store-immediate, we
+ // can still generate the fi# in the base address. If the final offset
+ // is not valid for the instruction, we will replace it with a scratch
+ // register.
+// def: Storexm_fi_pat <truncstorei8, s32ImmPred, ToImmByte, S4_storeirb_io>;
+// def: Storexm_fi_pat <truncstorei16, i16in8ImmPred, ToImmHalf,
+// S4_storeirh_io>;
+// def: Storexm_fi_pat <store, i32in8ImmPred, ToImmWord, S4_storeiri_io>;
+
+// defm: Storexm_fi_add_pat <truncstorei8, s32ImmPred, u6_0ImmPred, ToImmByte,
+// S4_storeirb_io>;
+// defm: Storexm_fi_add_pat <truncstorei16, i16in8ImmPred, u6_1ImmPred,
+// ToImmHalf, S4_storeirh_io>;
+// defm: Storexm_fi_add_pat <store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+// S4_storeiri_io>;
+
+ defm: Storexm_add_pat<truncstorei8, s32ImmPred, u6_0ImmPred, ToImmByte,
+ S4_storeirb_io>;
+ defm: Storexm_add_pat<truncstorei16, i16in8ImmPred, u6_1ImmPred, ToImmHalf,
+ S4_storeirh_io>;
+ defm: Storexm_add_pat<store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+ S4_storeiri_io>;
}
def: Storexm_simple_pat<truncstorei8, s32ImmPred, ToImmByte, S4_storeirb_io>;
@@ -1698,7 +1745,7 @@ class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
: NVInst_V4<(outs),
(ins IntRegs:$src1, brtarget:$offset),
"if ("#!if(isNegCond, "!","")#mnemonic
- #"($src1.new, #"#ImmVal#")) jump:"
+ #"($src1.new, #" # ImmVal # ")) jump:"
#!if(isTak, "t","nt")#" $offset", []> {
let isTaken = isTak;
@@ -2318,21 +2365,15 @@ def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
// Count trailing zeros: 64-bit.
def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>;
-def: Pat<(i32 (trunc (cttz_zero_undef I64:$Rss))), (S2_ct0p I64:$Rss)>;
// Count trailing ones: 64-bit.
def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>;
-def: Pat<(i32 (trunc (cttz_zero_undef (not I64:$Rss)))), (S2_ct1p I64:$Rss)>;
// Define leading/trailing patterns that require zero-extensions to 64 bits.
def: Pat<(i64 (ctlz I64:$Rss)), (Zext64 (S2_cl0p I64:$Rss))>;
-def: Pat<(i64 (ctlz_zero_undef I64:$Rss)), (Zext64 (S2_cl0p I64:$Rss))>;
def: Pat<(i64 (cttz I64:$Rss)), (Zext64 (S2_ct0p I64:$Rss))>;
-def: Pat<(i64 (cttz_zero_undef I64:$Rss)), (Zext64 (S2_ct0p I64:$Rss))>;
def: Pat<(i64 (ctlz (not I64:$Rss))), (Zext64 (S2_cl1p I64:$Rss))>;
-def: Pat<(i64 (ctlz_zero_undef (not I64:$Rss))), (Zext64 (S2_cl1p I64:$Rss))>;
def: Pat<(i64 (cttz (not I64:$Rss))), (Zext64 (S2_ct1p I64:$Rss))>;
-def: Pat<(i64 (cttz_zero_undef (not I64:$Rss))), (Zext64 (S2_ct1p I64:$Rss))>;
let hasSideEffects = 0, hasNewValue = 1 in
@@ -2789,79 +2830,75 @@ def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6Imm:$s6, IntRegs:$Rt),
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// MEMOP: Word, Half, Byte
+// MEMOP
//===----------------------------------------------------------------------===//
-def MEMOPIMM : SDNodeXForm<imm, [{
- // Call the transformation function XformM5ToU5Imm to get the negative
- // immediate's positive counterpart.
- int32_t imm = N->getSExtValue();
- return XformM5ToU5Imm(imm, SDLoc(N));
+def m5Imm8Pred : PatLeaf<(i32 imm), [{
+ int8_t v = (int8_t)N->getSExtValue();
+ return v > -32 && v <= -1;
}]>;
-def MEMOPIMM_HALF : SDNodeXForm<imm, [{
- // -1 .. -31 represented as 65535..65515
- // assigning to a short restores our desired signed value.
- // Call the transformation function XformM5ToU5Imm to get the negative
- // immediate's positive counterpart.
- int16_t imm = N->getSExtValue();
- return XformM5ToU5Imm(imm, SDLoc(N));
+def m5Imm16Pred : PatLeaf<(i32 imm), [{
+ int16_t v = (int16_t)N->getSExtValue();
+ return v > -32 && v <= -1;
}]>;
-def MEMOPIMM_BYTE : SDNodeXForm<imm, [{
- // -1 .. -31 represented as 255..235
- // assigning to a char restores our desired signed value.
- // Call the transformation function XformM5ToU5Imm to get the negative
- // immediate's positive counterpart.
- int8_t imm = N->getSExtValue();
- return XformM5ToU5Imm(imm, SDLoc(N));
+def Clr5Imm8Pred : PatLeaf<(i32 imm), [{
+ uint32_t v = (uint8_t)~N->getZExtValue();
+ return ImmIsSingleBit(v);
}]>;
-def SETMEMIMM : SDNodeXForm<imm, [{
- // Return the bit position we will set [0-31].
- // As an SDNode.
- int32_t imm = N->getSExtValue();
+def Clr5Imm16Pred : PatLeaf<(i32 imm), [{
+ uint32_t v = (uint16_t)~N->getZExtValue();
+ return ImmIsSingleBit(v);
+}]>;
+
+def Set5Imm8 : SDNodeXForm<imm, [{
+ uint32_t imm = (uint8_t)N->getZExtValue();
return XformMskToBitPosU5Imm(imm, SDLoc(N));
}]>;
-def CLRMEMIMM : SDNodeXForm<imm, [{
- // Return the bit position we will clear [0-31].
- // As an SDNode.
- // we bit negate the value first
- int32_t imm = ~(N->getSExtValue());
+def Set5Imm16 : SDNodeXForm<imm, [{
+ uint32_t imm = (uint16_t)N->getZExtValue();
return XformMskToBitPosU5Imm(imm, SDLoc(N));
}]>;
-def SETMEMIMM_SHORT : SDNodeXForm<imm, [{
- // Return the bit position we will set [0-15].
- // As an SDNode.
- int16_t imm = N->getSExtValue();
- return XformMskToBitPosU4Imm(imm, SDLoc(N));
+def Set5Imm32 : SDNodeXForm<imm, [{
+ uint32_t imm = (uint32_t)N->getZExtValue();
+ return XformMskToBitPosU5Imm(imm, SDLoc(N));
}]>;
-def CLRMEMIMM_SHORT : SDNodeXForm<imm, [{
- // Return the bit position we will clear [0-15].
- // As an SDNode.
- // we bit negate the value first
- int16_t imm = ~(N->getSExtValue());
- return XformMskToBitPosU4Imm(imm, SDLoc(N));
+def Clr5Imm8 : SDNodeXForm<imm, [{
+ uint32_t imm = (uint8_t)~N->getZExtValue();
+ return XformMskToBitPosU5Imm(imm, SDLoc(N));
}]>;
-def SETMEMIMM_BYTE : SDNodeXForm<imm, [{
- // Return the bit position we will set [0-7].
- // As an SDNode.
- int8_t imm = N->getSExtValue();
- return XformMskToBitPosU3Imm(imm, SDLoc(N));
+def Clr5Imm16 : SDNodeXForm<imm, [{
+ uint32_t imm = (uint16_t)~N->getZExtValue();
+ return XformMskToBitPosU5Imm(imm, SDLoc(N));
}]>;
-def CLRMEMIMM_BYTE : SDNodeXForm<imm, [{
- // Return the bit position we will clear [0-7].
- // As an SDNode.
- // we bit negate the value first
- int8_t imm = ~(N->getSExtValue());
- return XformMskToBitPosU3Imm(imm, SDLoc(N));
+def Clr5Imm32 : SDNodeXForm<imm, [{
+ int32_t imm = (int32_t)~N->getZExtValue();
+ return XformMskToBitPosU5Imm(imm, SDLoc(N));
+}]>;
+
+def NegImm8 : SDNodeXForm<imm, [{
+ int8_t V = N->getSExtValue();
+ return CurDAG->getTargetConstant(-V, SDLoc(N), MVT::i32);
+}]>;
+
+def NegImm16 : SDNodeXForm<imm, [{
+ int16_t V = N->getSExtValue();
+ return CurDAG->getTargetConstant(-V, SDLoc(N), MVT::i32);
}]>;
+def NegImm32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def IdImm : SDNodeXForm<imm, [{ return SDValue(N, 0); }]>;
+
//===----------------------------------------------------------------------===//
// Template class for MemOp instructions with the register value.
//===----------------------------------------------------------------------===//
@@ -2958,197 +2995,234 @@ let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
}
-//===----------------------------------------------------------------------===//
-// Multiclass to define 'Def Pats' for ALU operations on the memory
-// Here value used for the ALU operation is an immediate value.
-// mem[bh](Rs+#0) += #U5
-// mem[bh](Rs+#u6) += #U5
-//===----------------------------------------------------------------------===//
-
-multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred,
- InstHexagon MI, SDNode OpNode> {
- let AddedComplexity = 180 in
- def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend),
- IntRegs:$addr),
- (MI IntRegs:$addr, 0, u5ImmPred:$addend)>;
-
- let AddedComplexity = 190 in
- def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, ImmPred:$offset)),
- u5ImmPred:$addend),
- (add IntRegs:$base, ImmPred:$offset)),
- (MI IntRegs:$base, ImmPred:$offset, u5ImmPred:$addend)>;
-}
-
-multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred,
- InstHexagon addMI, InstHexagon subMI> {
- defm: MemOpi_u5Pats<ldOp, stOp, ImmPred, addMI, add>;
- defm: MemOpi_u5Pats<ldOp, stOp, ImmPred, subMI, sub>;
-}
-multiclass MemOpi_u5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
- // Half Word
- defm: MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u31_1ImmPred,
- L4_iadd_memoph_io, L4_isub_memoph_io>;
- // Byte
- defm: MemOpi_u5ALUOp <ldOpByte, truncstorei8, u32ImmPred,
- L4_iadd_memopb_io, L4_isub_memopb_io>;
+multiclass Memopxr_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load I32:$Rs), I32:$A), I32:$Rs),
+ (MI I32:$Rs, 0, I32:$A)>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load AddrFI:$Rs), I32:$A), AddrFI:$Rs),
+ (MI AddrFI:$Rs, 0, I32:$A)>;
+}
+
+multiclass Memopxr_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), I32:$A),
+ (add I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, I32:$A)>;
+ def: Pat<(Store (Oper (Load (orisadd I32:$Rs, ImmPred:$Off)), I32:$A),
+ (orisadd I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, I32:$A)>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+ (add AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+ def: Pat<(Store (Oper (Load (orisadd AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+ (orisadd AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+}
+
+multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, InstHexagon MI> {
+ defm: Memopxr_simple_pat <Load, Store, Oper, MI>;
+ defm: Memopxr_add_pat <Load, Store, ImmPred, Oper, MI>;
+}
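+// These match read-modify-write sequences where the load and the store use the
+// same address: e.g. "*p += r" on a word selects L4_add_memopw_io, roughly
+// "memw(Rs+#u6:2) += Rt".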
+
+let AddedComplexity = 180 in {
+ // add reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, add,
+ /*anyext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, add,
+ /*sext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, add,
+ /*zext*/ L4_add_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, add,
+ /*anyext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, add,
+ /*sext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, add,
+ /*zext*/ L4_add_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, add, L4_add_memopw_io>;
+
+ // sub reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*anyext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*sext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub,
+ /*zext*/ L4_sub_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*anyext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*sext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub,
+ /*zext*/ L4_sub_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, sub, L4_sub_memopw_io>;
+
+ // and reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, and,
+ /*anyext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, and,
+ /*sext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, and,
+ /*zext*/ L4_and_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, and,
+ /*anyext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, and,
+ /*sext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, and,
+ /*zext*/ L4_and_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, and, L4_and_memopw_io>;
+
+ // or reg
+ defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, or,
+ /*anyext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, or,
+ /*sext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, or,
+ /*zext*/ L4_or_memopb_io>;
+ defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, or,
+ /*anyext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, or,
+ /*sext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, or,
+ /*zext*/ L4_or_memoph_io>;
+ defm: Memopxr_pat<load, store, u6_2ImmPred, or, L4_or_memopw_io>;
+}
+
+
+multiclass Memopxi_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+ PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load I32:$Rs), Arg:$A), I32:$Rs),
+ (MI I32:$Rs, 0, (ArgMod Arg:$A))>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load AddrFI:$Rs), Arg:$A), AddrFI:$Rs),
+ (MI AddrFI:$Rs, 0, (ArgMod Arg:$A))>;
+}
+
+multiclass Memopxi_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ // Addr: i32
+ def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), Arg:$A),
+ (add I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ def: Pat<(Store (Oper (Load (orisadd I32:$Rs, ImmPred:$Off)), Arg:$A),
+ (orisadd I32:$Rs, ImmPred:$Off)),
+ (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ // Addr: fi
+ def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+ (add AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+ def: Pat<(Store (Oper (Load (orisadd AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+ (orisadd AddrFI:$Rs, ImmPred:$Off)),
+ (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+}
+
+multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+ SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+ InstHexagon MI> {
+ defm: Memopxi_simple_pat <Load, Store, Oper, Arg, ArgMod, MI>;
+ defm: Memopxi_add_pat <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
}
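+// The ArgMod transform maps the matched immediate onto the instruction encoding:
+// e.g. and'ing a byte with 0xEF matches through Clr5Imm8Pred, and Clr5Imm8 turns
+// it into L4_iand_memopb_io with bit position #4, roughly "memb(Rs+#0) = clrbit(#4)";
+// likewise a subtraction of #-5 goes through NegImm8 to an L4_iadd "+= #5".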
-let Predicates = [UseMEMOP] in {
- defm: MemOpi_u5ExtType<zextloadi8, zextloadi16>; // zero extend
- defm: MemOpi_u5ExtType<sextloadi8, sextloadi16>; // sign extend
- defm: MemOpi_u5ExtType<extloadi8, extloadi16>; // any extend
- // Word
- defm: MemOpi_u5ALUOp <load, store, u30_2ImmPred, L4_iadd_memopw_io,
- L4_isub_memopw_io>;
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass to define 'Def Pats' for ALU operations on the memory.
-// Here value used for the ALU operation is a negative value.
-// mem[bh](Rs+#0) += #m5
-// mem[bh](Rs+#u6) += #m5
-//===----------------------------------------------------------------------===//
-
-multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ImmPred,
- PatLeaf immPred, SDNodeXForm xformFunc,
- InstHexagon MI> {
- let AddedComplexity = 190 in
- def: Pat<(stOp (add (ldOp IntRegs:$addr), immPred:$subend), IntRegs:$addr),
- (MI IntRegs:$addr, 0, (xformFunc immPred:$subend))>;
-
- let AddedComplexity = 195 in
- def: Pat<(stOp (add (ldOp (add IntRegs:$base, ImmPred:$offset)),
- immPred:$subend),
- (add IntRegs:$base, ImmPred:$offset)),
- (MI IntRegs:$base, ImmPred:$offset, (xformFunc immPred:$subend))>;
-}
-
-multiclass MemOpi_m5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
- // Half Word
- defm: MemOpi_m5Pats <ldOpHalf, truncstorei16, u31_1ImmPred, m5HImmPred,
- MEMOPIMM_HALF, L4_isub_memoph_io>;
- // Byte
- defm: MemOpi_m5Pats <ldOpByte, truncstorei8, u32ImmPred, m5BImmPred,
- MEMOPIMM_BYTE, L4_isub_memopb_io>;
-}
-
-let Predicates = [UseMEMOP] in {
- defm: MemOpi_m5ExtType<zextloadi8, zextloadi16>; // zero extend
- defm: MemOpi_m5ExtType<sextloadi8, sextloadi16>; // sign extend
- defm: MemOpi_m5ExtType<extloadi8, extloadi16>; // any extend
-
- // Word
- defm: MemOpi_m5Pats <load, store, u30_2ImmPred, m5ImmPred,
- MEMOPIMM, L4_isub_memopw_io>;
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass to define 'def Pats' for bit operations on the memory.
-// mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
-// mem[bhw](Rs+#u6) = [clrbit|setbit](#U5)
-//===----------------------------------------------------------------------===//
-
-multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred,
- PatLeaf extPred, SDNodeXForm xformFunc, InstHexagon MI,
- SDNode OpNode> {
-
- // mem[bhw](Rs+#u6:[012]) = [clrbit|setbit](#U5)
- let AddedComplexity = 250 in
- def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
- immPred:$bitend),
- (add IntRegs:$base, extPred:$offset)),
- (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$bitend))>;
-
- // mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
- let AddedComplexity = 225 in
- def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), immPred:$bitend), IntRegs:$addr),
- (MI IntRegs:$addr, 0, (xformFunc immPred:$bitend))>;
-}
-
-multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf> {
- // Byte - clrbit
- defm: MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u32ImmPred,
- CLRMEMIMM_BYTE, L4_iand_memopb_io, and>;
- // Byte - setbit
- defm: MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred, u32ImmPred,
- SETMEMIMM_BYTE, L4_ior_memopb_io, or>;
- // Half Word - clrbit
- defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u31_1ImmPred,
- CLRMEMIMM_SHORT, L4_iand_memoph_io, and>;
- // Half Word - setbit
- defm: MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u31_1ImmPred,
- SETMEMIMM_SHORT, L4_ior_memoph_io, or>;
-}
-
-let Predicates = [UseMEMOP] in {
- // mem[bh](Rs+#0) = [clrbit|setbit](#U5)
- // mem[bh](Rs+#u6:[01]) = [clrbit|setbit](#U5)
- defm: MemOpi_bitExtType<zextloadi8, zextloadi16>; // zero extend
- defm: MemOpi_bitExtType<sextloadi8, sextloadi16>; // sign extend
- defm: MemOpi_bitExtType<extloadi8, extloadi16>; // any extend
-
- // memw(Rs+#0) = [clrbit|setbit](#U5)
- // memw(Rs+#u6:2) = [clrbit|setbit](#U5)
- defm: MemOpi_bitPats<load, store, Clr5ImmPred, u30_2ImmPred, CLRMEMIMM,
- L4_iand_memopw_io, and>;
- defm: MemOpi_bitPats<load, store, Set5ImmPred, u30_2ImmPred, SETMEMIMM,
- L4_ior_memopw_io, or>;
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass to define 'def Pats' for ALU operations on the memory
-// where addend is a register.
-// mem[bhw](Rs+#0) [+-&|]= Rt
-// mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
-//===----------------------------------------------------------------------===//
-
-multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
- InstHexagon MI, SDNode OpNode> {
- let AddedComplexity = 141 in
- // mem[bhw](Rs+#0) [+-&|]= Rt
- def: Pat<(stOp (OpNode (ldOp IntRegs:$addr), (i32 IntRegs:$addend)),
- IntRegs:$addr),
- (MI IntRegs:$addr, 0, (i32 IntRegs:$addend))>;
-
- // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
- let AddedComplexity = 150 in
- def: Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
- (i32 IntRegs:$orend)),
- (add IntRegs:$base, extPred:$offset)),
- (MI IntRegs:$base, extPred:$offset, (i32 IntRegs:$orend))>;
-}
-
-multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
- InstHexagon addMI, InstHexagon subMI,
- InstHexagon andMI, InstHexagon orMI> {
- defm: MemOpr_Pats <ldOp, stOp, extPred, addMI, add>;
- defm: MemOpr_Pats <ldOp, stOp, extPred, subMI, sub>;
- defm: MemOpr_Pats <ldOp, stOp, extPred, andMI, and>;
- defm: MemOpr_Pats <ldOp, stOp, extPred, orMI, or>;
-}
-
-multiclass MemOPr_ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
- // Half Word
- defm: MemOPr_ALUOp <ldOpHalf, truncstorei16, u31_1ImmPred,
- L4_add_memoph_io, L4_sub_memoph_io,
- L4_and_memoph_io, L4_or_memoph_io>;
- // Byte
- defm: MemOPr_ALUOp <ldOpByte, truncstorei8, u32ImmPred,
- L4_add_memopb_io, L4_sub_memopb_io,
- L4_and_memopb_io, L4_or_memopb_io>;
-}
-
-// Define 'def Pats' for MemOps with register addend.
-let Predicates = [UseMEMOP] in {
- // Byte, Half Word
- defm: MemOPr_ExtType<zextloadi8, zextloadi16>; // zero extend
- defm: MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend
- defm: MemOPr_ExtType<extloadi8, extloadi16>; // any extend
- // Word
- defm: MemOPr_ALUOp <load, store, u30_2ImmPred, L4_add_memopw_io,
- L4_sub_memopw_io, L4_and_memopw_io, L4_or_memopw_io>;
+let AddedComplexity = 200 in {
+ // add imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, u5ImmPred,
+ /*anyext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, u5ImmPred,
+ /*sext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, u5ImmPred,
+ /*zext*/ IdImm, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, u5ImmPred,
+ /*anyext*/ IdImm, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, add, u5ImmPred,
+ /*sext*/ IdImm, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, add, u5ImmPred,
+ /*zext*/ IdImm, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, add, u5ImmPred, IdImm,
+ L4_iadd_memopw_io>;
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, m5Imm8Pred,
+ /*anyext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, m5Imm8Pred,
+ /*sext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, m5Imm8Pred,
+ /*zext*/ NegImm8, L4_iadd_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, m5Imm16Pred,
+ /*anyext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, m5Imm16Pred,
+ /*sext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, m5Imm16Pred,
+ /*zext*/ NegImm16, L4_iadd_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, sub, m5ImmPred, NegImm32,
+ L4_iadd_memopw_io>;
+
+ // sub imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, u5ImmPred,
+ /*anyext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, u5ImmPred,
+ /*sext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, u5ImmPred,
+ /*zext*/ IdImm, L4_isub_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, u5ImmPred,
+ /*anyext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, u5ImmPred,
+ /*sext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, u5ImmPred,
+ /*zext*/ IdImm, L4_isub_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, sub, u5ImmPred, IdImm,
+ L4_isub_memopw_io>;
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, m5Imm8Pred,
+ /*anyext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, m5Imm8Pred,
+ /*sext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, m5Imm8Pred,
+ /*zext*/ NegImm8, L4_isub_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, m5Imm16Pred,
+ /*anyext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, add, m5Imm16Pred,
+ /*sext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, add, m5Imm16Pred,
+ /*zext*/ NegImm16, L4_isub_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, add, m5ImmPred, NegImm32,
+ L4_isub_memopw_io>;
+
+ // clrbit imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, and, Clr5Imm8Pred,
+ /*anyext*/ Clr5Imm8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, and, Clr5Imm8Pred,
+ /*sext*/ Clr5Imm8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, and, Clr5Imm8Pred,
+ /*zext*/ Clr5Imm8, L4_iand_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, and, Clr5Imm16Pred,
+ /*anyext*/ Clr5Imm16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, and, Clr5Imm16Pred,
+ /*sext*/ Clr5Imm16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, and, Clr5Imm16Pred,
+ /*zext*/ Clr5Imm16, L4_iand_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, and, Clr5ImmPred, Clr5Imm32,
+ L4_iand_memopw_io>;
+
+ // setbit imm
+ defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, or, Set5ImmPred,
+ /*anyext*/ Set5Imm8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, or, Set5ImmPred,
+ /*sext*/ Set5Imm8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, or, Set5ImmPred,
+ /*zext*/ Set5Imm8, L4_ior_memopb_io>;
+ defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, or, Set5ImmPred,
+ /*anyext*/ Set5Imm16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, or, Set5ImmPred,
+ /*sext*/ Set5Imm16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, or, Set5ImmPred,
+ /*zext*/ Set5Imm16, L4_ior_memoph_io>;
+ defm: Memopxi_pat<load, store, u6_2ImmPred, or, Set5ImmPred, Set5Imm32,
+ L4_ior_memopw_io>;
}
//===----------------------------------------------------------------------===//
@@ -3281,22 +3355,57 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+
let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
+ }
}
// Restore registers and dealloc frame before a tail call.
let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel;
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
+
let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel;
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+ }
}
// Save registers function call.
let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
- def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel;
+ def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
+
let isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel;
+ def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [P0] in
+ def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
+
+ let Defs = [P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28] in
+ def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, P0] in
+ def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
+
+ let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
}
//===----------------------------------------------------------------------===//
@@ -3413,9 +3522,9 @@ multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
isNewValue = 1, opNewValue = 1 in
-class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs>
- : NVInst_V4<(outs), (ins u32Imm:$addr, IntRegs:$src),
- mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src.new",
+class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
+ : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
+ mnemonic #"(#$addr) = $src.new",
[], "", V2LDST_tc_st_SLOT0> {
bits<19> addr;
bits<3> src;
@@ -3426,7 +3535,6 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs>
!if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
!if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
/* u16_0Imm */ addr{15-0})));
- let Uses = !if (isAbs, [], [GP]);
let IClass = 0b0100;
let Inst{27} = 1;
@@ -3446,7 +3554,7 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs>
let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
- : NVInst_V4<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, IntRegs:$src2),
+ : NVInst_V4<(outs), (ins PredRegs:$src1, u32MustExt:$absaddr, IntRegs:$src2),
!if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
") ")#mnemonic#"(#$absaddr) = $src2.new",
[], "", ST_tc_st_SLOT0>, AddrModeRel {
@@ -3476,7 +3584,7 @@ class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
// absolute addressing.
//===----------------------------------------------------------------------===//
class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
- : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 1>, AddrModeRel {
+ : T_StoreAbsGP_NV <mnemonic, u32MustExt, MajOp>, AddrModeRel {
string ImmOpStr = !cast<string>(ImmOp);
let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
@@ -3538,7 +3646,7 @@ defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
//===----------------------------------------------------------------------===//
-let isAsmParserOnly = 1 in
+let Uses = [GP], isAsmParserOnly = 1 in
class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
: T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
@@ -3548,7 +3656,7 @@ class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
let BaseOpcode = BaseOp#_abs;
}
-let isAsmParserOnly = 1 in
+let Uses = [GP], isAsmParserOnly = 1 in
multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
bits<2> MajOp, bit isHalf = 0> {
// Set BaseOpcode same as absolute addressing instructions so that
@@ -3558,7 +3666,7 @@ multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
0, isHalf>;
// New-value store
- def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ;
+ def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp> ;
}
}
@@ -3594,6 +3702,17 @@ class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
: Pat<(Store Value:$val, Addr:$addr),
(MI Addr:$addr, (ValueMod Value:$val))>;
+let AddedComplexity = 30 in {
+ def: Storea_pat<truncstorei8, I32, addrga, S2_storerbabs>;
+ def: Storea_pat<truncstorei16, I32, addrga, S2_storerhabs>;
+ def: Storea_pat<store, I32, addrga, S2_storeriabs>;
+ def: Storea_pat<store, I64, addrga, S2_storerdabs>;
+
+ def: Stoream_pat<truncstorei8, I64, addrga, LoReg, S2_storerbabs>;
+ def: Stoream_pat<truncstorei16, I64, addrga, LoReg, S2_storerhabs>;
+ def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>;
+}
+
def: Storea_pat<SwapSt<atomic_store_8>, I32, addrgp, S2_storerbgp>;
def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhgp>;
def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storerigp>;
@@ -3731,6 +3850,26 @@ defm loadri : LD_Abs<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
let accessSize = DoubleWordAccess in
defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+class LoadAbs_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+ : Pat <(VT (ldOp (HexagonCONST32 tglobaladdr:$absaddr))),
+ (VT (MI tglobaladdr:$absaddr))>;
+
+let AddedComplexity = 30 in {
+ def: LoadAbs_pats <load, L4_loadri_abs>;
+ def: LoadAbs_pats <zextloadi1, L4_loadrub_abs>;
+ def: LoadAbs_pats <sextloadi8, L4_loadrb_abs>;
+ def: LoadAbs_pats <extloadi8, L4_loadrub_abs>;
+ def: LoadAbs_pats <zextloadi8, L4_loadrub_abs>;
+ def: LoadAbs_pats <sextloadi16, L4_loadrh_abs>;
+ def: LoadAbs_pats <extloadi16, L4_loadruh_abs>;
+ def: LoadAbs_pats <zextloadi16, L4_loadruh_abs>;
+ def: LoadAbs_pats <load, L4_loadrd_abs, i64>;
+}
+
+let AddedComplexity = 30 in
+def: Pat<(i64 (zextloadi1 (HexagonCONST32 tglobaladdr:$absaddr))),
+ (Zext64 (L4_loadrub_abs tglobaladdr:$absaddr))>;
+
//===----------------------------------------------------------------------===//
// multiclass for load instructions with GP-relative addressing mode.
// Rx=mem[bhwd](##global)
@@ -3779,14 +3918,14 @@ class LoadGP_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
(VT (MI tglobaladdr:$global))>;
let AddedComplexity = 100 in {
- def: LoadGP_pats <extloadi8, L2_loadrbgp>;
- def: LoadGP_pats <sextloadi8, L2_loadrbgp>;
- def: LoadGP_pats <zextloadi8, L2_loadrubgp>;
- def: LoadGP_pats <extloadi16, L2_loadrhgp>;
+ def: LoadGP_pats <extloadi8, L2_loadrubgp>;
+ def: LoadGP_pats <sextloadi8, L2_loadrbgp>;
+ def: LoadGP_pats <zextloadi8, L2_loadrubgp>;
+ def: LoadGP_pats <extloadi16, L2_loadruhgp>;
def: LoadGP_pats <sextloadi16, L2_loadrhgp>;
def: LoadGP_pats <zextloadi16, L2_loadruhgp>;
- def: LoadGP_pats <load, L2_loadrigp>;
- def: LoadGP_pats <load, L2_loadrdgp, i64>;
+ def: LoadGP_pats <load, L2_loadrigp>;
+ def: LoadGP_pats <load, L2_loadrdgp, i64>;
}
// When the Interprocedural Global Variable optimizer realizes that a certain
@@ -3819,7 +3958,7 @@ let AddedComplexity = 30 in {
// Indexed store word - global address.
// memw(Rs+#u6:2)=#S8
let AddedComplexity = 100 in
-def: Storex_add_pat<store, addrga, u6_2ImmPred, S4_storeiri_io>;
+defm: Storex_add_pat<store, addrga, u6_2ImmPred, S4_storeiri_io>;
// Load from a global address that has only one use in the current basic block.
let AddedComplexity = 100 in {
@@ -3996,6 +4135,10 @@ def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
let Inst{10-0} = u11_3{13-3};
}
+
+def: Pat<(HexagonDCFETCH (i32 (add IntRegs:$Rs, u11_3ImmPred:$u11_3)), (i32 0)),
+ (Y2_dcfetchbo IntRegs:$Rs, u11_3ImmPred:$u11_3)>;
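+// This selects a dcfetch of (Rs + #imm) directly into the base+offset form,
+// roughly "dcfetch(Rs + #u11:3)".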
+
//===----------------------------------------------------------------------===//
// Compound instructions
//===----------------------------------------------------------------------===//
@@ -4008,7 +4151,7 @@ class CJInst_tstbit_R0<string px, bit np, string tnt>
: InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
""#px#" = tstbit($Rs, #0); if ("
#!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon {
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
bits<4> Rs;
bits<11> r9_2;
@@ -4054,7 +4197,7 @@ class CJInst_RR<string px, string op, bit np, string tnt>
: InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
""#px#" = cmp."#op#"($Rs, $Rt); if ("
#!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon {
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
bits<4> Rs;
bits<4> Rt;
bits<11> r9_2;
@@ -4108,7 +4251,7 @@ class CJInst_RU5<string px, string op, bit np, string tnt>
: InstHexagon<(outs), (ins IntRegs:$Rs, u5Imm:$U5, brtarget:$r9_2),
""#px#" = cmp."#op#"($Rs, #$U5); if ("
#!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon {
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
bits<4> Rs;
bits<5> U5;
bits<11> r9_2;
@@ -4163,7 +4306,7 @@ class CJInst_Rn1<string px, string op, bit np, string tnt>
: InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
""#px#" = cmp."#op#"($Rs,#-1); if ("
#!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND, TypeCOMPOUND>, OpcodeHexagon {
+ [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
bits<4> Rs;
bits<11> r9_2;
@@ -4212,7 +4355,7 @@ defm gt : T_pnp_CJInst_Rn1<"gt">;
let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpseti: CJInst <
+def J4_jumpseti: CJInst_JMPSET <
(outs IntRegs:$Rd),
(ins u6Imm:$U6, brtarget:$r9_2),
"$Rd = #$U6 ; jump $r9_2"> {
@@ -4232,7 +4375,7 @@ def J4_jumpseti: CJInst <
let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpsetr: CJInst <
+def J4_jumpsetr: CJInst_JMPSET <
(outs IntRegs:$Rd),
(ins IntRegs:$Rs, brtarget:$r9_2),
"$Rd = $Rs ; jump $r9_2"> {
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td
index 897ada081534..c3f09b69ce85 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV60.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV60.td
@@ -10,6 +10,21 @@
// This file describes the Hexagon V60 instructions in TableGen format.
//
//===----------------------------------------------------------------------===//
+def alignedload : PatFrag<(ops node:$addr), (load $addr), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedload : PatFrag<(ops node:$addr), (load $addr), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def alignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+ return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+ return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
// Vector store
@@ -102,7 +117,7 @@ let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
//===----------------------------------------------------------------------===//
// Vector stores with base + immediate offset - unconditional
//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access in
+let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
RegisterClass RC, bit isNT>
: V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
@@ -133,16 +148,16 @@ let isNVStorable = 1, isNonTemporal = 1 in {
}
let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vs32Ub_ai">,
+ def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
V6_vS32Ub_ai_enc;
- def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vs32Ub_ai">,
+ def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
V6_vS32Ub_ai_128B_enc;
}
//===----------------------------------------------------------------------===//
// Vector stores with base + immediate offset - unconditional new
//===----------------------------------------------------------------------===//
let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
- Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
+ isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
: V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
"vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
@@ -384,13 +399,15 @@ let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
//===----------------------------------------------------------------------===//
// Post increment vector stores with immediate offset.
//===----------------------------------------------------------------------===//
-let addrMode = PostInc in
+let addrMode = PostInc, isPredicable = 1 in
class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
RegisterClass RC, bit isNT>
: V6_STInst <(outs IntRegs:$_dst_),
(ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
- "$src1 = $_dst_">, NewValueRel;
+ "$src1 = $_dst_">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
let accessSize = Vector64Access in
class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
@@ -398,7 +415,7 @@ class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
let isCodeGenOnly = 1, accessSize = Vector128Access in
class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_pi <mnemonic, baseOp, s3_7Imm, VectorRegs128B, isNT>;
+ : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
let isNVStorable = 1 in {
def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
@@ -426,7 +443,7 @@ let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
//===----------------------------------------------------------------------===//
let addrMode = PostInc, isNVStore = 1 in
let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
- opNewValue = 3, isNVStore = 1 in
+ isPredicable = 1, opNewValue = 3, isNVStore = 1 in
class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
: V6_STInst <(outs IntRegs:$_dst_),
(ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
@@ -644,6 +661,7 @@ let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
//===----------------------------------------------------------------------===//
// Post increment vector stores with register offset
//===----------------------------------------------------------------------===//
+let isPredicable = 1 in
class T_vstore_ppu <string mnemonic, bit isNT = 0>
: V6_STInst <(outs IntRegs:$_dst_),
(ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
@@ -665,7 +683,7 @@ def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
// Post increment .new vector stores with register offset
//===----------------------------------------------------------------------===//
let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
- opNewValue = 3, isNVStore = 1 in
+ isPredicable = 1, opNewValue = 3, isNVStore = 1 in
class T_vstore_new_ppu <bit isNT = 0>
: V6_STInst <(outs IntRegs:$_dst_),
(ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
@@ -785,30 +803,46 @@ defm : STrivv_pats <v16i64, v32i64>;
multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Aligned stores
- def : Pat<(store (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
(V6_vS32b_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
+ def : Pat<(unalignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ (V6_vS32Ub_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
// 128B Aligned stores
- def : Pat<(store (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
(V6_vS32b_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
+ def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ (V6_vS32Ub_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
// Fold Add R+IFF into vector store.
- let AddedComplexity = 10 in
- def : Pat<(store (VTSgl VectorRegs:$src1),
- (add IntRegs:$src2, s4_6ImmPred:$offset)),
- (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
- (VTSgl VectorRegs:$src1))>,
- Requires<[UseHVXSgl]>;
+ let AddedComplexity = 10 in {
+ def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, s4_6ImmPred:$offset)),
+ (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, s4_6ImmPred:$offset)),
+ (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
- // Fold Add R+IFF into vector store 128B.
- let AddedComplexity = 10 in
- def : Pat<(store (VTDbl VectorRegs128B:$src1),
- (add IntRegs:$src2, s4_7ImmPred:$offset)),
- (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
- (VTDbl VectorRegs128B:$src1))>,
- Requires<[UseHVXDbl]>;
+ // Fold Add R+IFF into vector store 128B.
+ def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, s4_7ImmPred:$offset)),
+ (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, s4_7ImmPred:$offset)),
+ (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+ }
}
defm : vS32b_ai_pats <v64i8, v128i8>;
@@ -843,25 +877,37 @@ defm : LDrivv_pats <v16i64, v32i64>;
multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Aligned loads
- def : Pat < (VTSgl (load IntRegs:$addr)),
+ def : Pat < (VTSgl (alignedload IntRegs:$addr)),
(V6_vL32b_ai IntRegs:$addr, #0) >,
Requires<[UseHVXSgl]>;
+ def : Pat < (VTSgl (unalignedload IntRegs:$addr)),
+ (V6_vL32Ub_ai IntRegs:$addr, #0) >,
+ Requires<[UseHVXSgl]>;
// 128B Load
- def : Pat < (VTDbl (load IntRegs:$addr)),
+ def : Pat < (VTDbl (alignedload IntRegs:$addr)),
(V6_vL32b_ai_128B IntRegs:$addr, #0) >,
Requires<[UseHVXDbl]>;
+ def : Pat < (VTDbl (unalignedload IntRegs:$addr)),
+ (V6_vL32Ub_ai_128B IntRegs:$addr, #0) >,
+ Requires<[UseHVXDbl]>;
// Fold Add R+IFF into vector load.
- let AddedComplexity = 10 in
- def : Pat<(VTDbl (load (add IntRegs:$src2, s4_7ImmPred:$offset))),
- (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
- Requires<[UseHVXDbl]>;
-
- let AddedComplexity = 10 in
- def : Pat<(VTSgl (load (add IntRegs:$src2, s4_6ImmPred:$offset))),
- (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
- Requires<[UseHVXSgl]>;
+ let AddedComplexity = 10 in {
+ def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+ (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+ (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ Requires<[UseHVXDbl]>;
+
+ def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+ (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+ (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ Requires<[UseHVXSgl]>;
+ }
}
defm : vL32b_ai_pats <v64i8, v128i8>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td
index 96dd5315b87f..0277d5e3c28c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -35,61 +35,12 @@ multiclass bitconvert_64<ValueType a, ValueType b> {
(a DoubleRegs:$src)>;
}
-multiclass bitconvert_vec<ValueType a, ValueType b> {
- def : Pat <(b (bitconvert (a VectorRegs:$src))),
- (b VectorRegs:$src)>;
- def : Pat <(a (bitconvert (b VectorRegs:$src))),
- (a VectorRegs:$src)>;
-}
-
-multiclass bitconvert_dblvec<ValueType a, ValueType b> {
- def : Pat <(b (bitconvert (a VecDblRegs:$src))),
- (b VecDblRegs:$src)>;
- def : Pat <(a (bitconvert (b VecDblRegs:$src))),
- (a VecDblRegs:$src)>;
-}
-
-multiclass bitconvert_predvec<ValueType a, ValueType b> {
- def : Pat <(b (bitconvert (a VecPredRegs:$src))),
- (b VectorRegs:$src)>;
- def : Pat <(a (bitconvert (b VectorRegs:$src))),
- (a VecPredRegs:$src)>;
-}
-
-multiclass bitconvert_dblvec128B<ValueType a, ValueType b> {
- def : Pat <(b (bitconvert (a VecDblRegs128B:$src))),
- (b VecDblRegs128B:$src)>;
- def : Pat <(a (bitconvert (b VecDblRegs128B:$src))),
- (a VecDblRegs128B:$src)>;
-}
-
-// Bit convert vector types.
-defm : bitconvert_32<v4i8, i32>;
+// Bit convert vector types to integers.
+defm : bitconvert_32<v4i8, i32>;
defm : bitconvert_32<v2i16, i32>;
-defm : bitconvert_32<v2i16, v4i8>;
-
-defm : bitconvert_64<v8i8, i64>;
+defm : bitconvert_64<v8i8, i64>;
defm : bitconvert_64<v4i16, i64>;
defm : bitconvert_64<v2i32, i64>;
-defm : bitconvert_64<v8i8, v4i16>;
-defm : bitconvert_64<v8i8, v2i32>;
-defm : bitconvert_64<v4i16, v2i32>;
-
-defm : bitconvert_vec<v64i8, v16i32>;
-defm : bitconvert_vec<v8i64 , v16i32>;
-defm : bitconvert_vec<v32i16, v16i32>;
-
-defm : bitconvert_dblvec<v16i64, v128i8>;
-defm : bitconvert_dblvec<v32i32, v128i8>;
-defm : bitconvert_dblvec<v64i16, v128i8>;
-
-defm : bitconvert_dblvec128B<v64i32, v128i16>;
-defm : bitconvert_dblvec128B<v256i8, v128i16>;
-defm : bitconvert_dblvec128B<v32i64, v128i16>;
-
-defm : bitconvert_dblvec128B<v64i32, v256i8>;
-defm : bitconvert_dblvec128B<v32i64, v256i8>;
-defm : bitconvert_dblvec128B<v128i16, v256i8>;
// Vector shift support. Vector shifting in Hexagon is rather different
// from internal representation of LLVM.
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index b207aaf392f4..a319dd4f9789 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -23,27 +23,29 @@ class T_R_pat <InstHexagon MI, Intrinsic IntID>
class T_P_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs),
- (MI DoubleRegs:$Rs)>;
+ (MI I64:$Rs)>;
class T_II_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
: Pat<(IntID Imm1:$Is, Imm2:$It),
(MI Imm1:$Is, Imm2:$It)>;
-class T_RI_pat <InstHexagon MI, Intrinsic IntID, PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+class T_RI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
: Pat<(IntID I32:$Rs, ImmPred:$It),
(MI I32:$Rs, ImmPred:$It)>;
-class T_IR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred = PatLeaf<(i32 imm)>>
+class T_IR_pat <InstHexagon MI, Intrinsic IntID,
+ PatFrag ImmPred = PatLeaf<(i32 imm)>>
: Pat<(IntID ImmPred:$Is, I32:$Rt),
(MI ImmPred:$Is, I32:$Rt)>;
class T_PI_pat <InstHexagon MI, Intrinsic IntID>
: Pat<(IntID I64:$Rs, imm:$It),
- (MI DoubleRegs:$Rs, imm:$It)>;
+ (MI I64:$Rs, imm:$It)>;
class T_RP_pat <InstHexagon MI, Intrinsic IntID>
: Pat<(IntID I32:$Rs, I64:$Rt),
- (MI I32:$Rs, DoubleRegs:$Rt)>;
+ (MI I32:$Rs, I64:$Rt)>;
class T_RR_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I32:$Rs, I32:$Rt),
@@ -51,19 +53,31 @@ class T_RR_pat <InstHexagon MI, Intrinsic IntID>
class T_PP_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I64:$Rt),
- (MI DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+ (MI I64:$Rs, I64:$Rt)>;
+
+class T_QQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt),
+ (MI (C2_tfrrp I32:$Rs), (C2_tfrrp I32:$Rt))>;
class T_QII_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
- : Pat <(IntID (i32 PredRegs:$Ps), Imm1:$Is, Imm2:$It),
- (MI PredRegs:$Ps, Imm1:$Is, Imm2:$It)>;
+ : Pat <(IntID I32:$Rp, Imm1:$Is, Imm2:$It),
+ (MI (C2_tfrrp I32:$Rp), Imm1:$Is, Imm2:$It)>;
+
+class T_QRR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rs, I32:$Rt),
+ (MI (C2_tfrrp I32:$Rp), I32:$Rs, I32:$Rt)>;
class T_QRI_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
- : Pat <(IntID (i32 PredRegs:$Ps), I32:$Rs, ImmPred:$Is),
- (MI PredRegs:$Ps, I32:$Rs, ImmPred:$Is)>;
+ : Pat <(IntID I32:$Rp, I32:$Rs, ImmPred:$Is),
+ (MI (C2_tfrrp I32:$Rp), I32:$Rs, ImmPred:$Is)>;
class T_QIR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
- : Pat <(IntID (i32 PredRegs:$Ps), ImmPred:$Is, I32:$Rs),
- (MI PredRegs:$Ps, ImmPred:$Is, I32:$Rs)>;
+ : Pat <(IntID I32:$Rp, ImmPred:$Is, I32:$Rs),
+ (MI (C2_tfrrp I32:$Rp), ImmPred:$Is, I32:$Rs)>;
+
+class T_QPP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I64:$Rs, I64:$Rt),
+ (MI (C2_tfrrp I32:$Rp), I64:$Rs, I64:$Rt)>;
class T_RRI_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu),
@@ -91,31 +105,31 @@ class T_RRR_pat <InstHexagon MI, Intrinsic IntID>
class T_PPI_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu),
- (MI DoubleRegs:$Rs, DoubleRegs:$Rt, imm:$Iu)>;
+ (MI I64:$Rs, I64:$Rt, imm:$Iu)>;
class T_PII_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, imm:$It, imm:$Iu),
- (MI DoubleRegs:$Rs, imm:$It, imm:$Iu)>;
+ (MI I64:$Rs, imm:$It, imm:$Iu)>;
class T_PPP_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru),
- (MI DoubleRegs:$Rs, DoubleRegs:$Rt, DoubleRegs:$Ru)>;
+ (MI I64:$Rs, I64:$Rt, I64:$Ru)>;
class T_PPR_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru),
- (MI DoubleRegs:$Rs, DoubleRegs:$Rt, I32:$Ru)>;
+ (MI I64:$Rs, I64:$Rt, I32:$Ru)>;
class T_PRR_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru),
- (MI DoubleRegs:$Rs, I32:$Rt, I32:$Ru)>;
+ (MI I64:$Rs, I32:$Rt, I32:$Ru)>;
class T_PPQ_pat <InstHexagon MI, Intrinsic IntID>
- : Pat <(IntID I64:$Rs, I64:$Rt, (i32 PredRegs:$Ru)),
- (MI DoubleRegs:$Rs, DoubleRegs:$Rt, PredRegs:$Ru)>;
+ : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Rp),
+ (MI I64:$Rs, I64:$Rt, (C2_tfrrp I32:$Rp))>;
class T_PR_pat <InstHexagon MI, Intrinsic IntID>
: Pat <(IntID I64:$Rs, I32:$Rt),
- (MI DoubleRegs:$Rs, I32:$Rt)>;
+ (MI I64:$Rs, I32:$Rt)>;
class T_D_pat <InstHexagon MI, Intrinsic IntID>
: Pat<(IntID (F64:$Rs)),
@@ -131,7 +145,7 @@ class T_F_pat <InstHexagon MI, Intrinsic IntID>
(MI F32:$Rs)>;
class T_FI_pat <InstHexagon MI, Intrinsic IntID,
- PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
: Pat<(IntID F32:$Rs, ImmPred:$It),
(MI F32:$Rs, ImmPred:$It)>;
@@ -148,8 +162,62 @@ class T_FFF_pat <InstHexagon MI, Intrinsic IntID>
(MI F32:$Rs, F32:$Rt, F32:$Ru)>;
class T_FFFQ_pat <InstHexagon MI, Intrinsic IntID>
- : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, (i32 PredRegs:$Rx)),
- (MI F32:$Rs, F32:$Rt, F32:$Ru, PredRegs:$Rx)>;
+ : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, I32:$Rp),
+ (MI F32:$Rs, F32:$Rt, F32:$Ru, (C2_tfrrp I32:$Rp))>;
+
+class T_Q_RI_pat <InstHexagon MI, Intrinsic IntID,
+ PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+ : Pat<(IntID I32:$Rs, ImmPred:$It),
+ (C2_tfrpr (MI I32:$Rs, ImmPred:$It))>;
+
+class T_Q_RR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I32:$Rt),
+ (C2_tfrpr (MI I32:$Rs, I32:$Rt))>;
+
+class T_Q_RP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rs, I64:$Rt),
+ (C2_tfrpr (MI I32:$Rs, I64:$Rt))>;
+
+class T_Q_PR_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I32:$Rt),
+ (C2_tfrpr (MI I64:$Rs, I32:$Rt))>;
+
+class T_Q_PI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID I64:$Rs, imm:$It),
+ (C2_tfrpr (MI I64:$Rs, imm:$It))>;
+
+class T_Q_PP_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I64:$Rs, I64:$Rt),
+ (C2_tfrpr (MI I64:$Rs, I64:$Rt))>;
+
+class T_Q_Q_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp)))>;
+
+class T_Q_QQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rq),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq)))>;
+
+class T_Q_FF_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, F32:$Rt),
+ (C2_tfrpr (MI F32:$Rs, F32:$Rt))>;
+
+class T_Q_DD_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F64:$Rs, F64:$Rt),
+ (C2_tfrpr (MI F64:$Rs, F64:$Rt))>;
+
+class T_Q_FI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F32:$Rs, imm:$It),
+ (C2_tfrpr (MI F32:$Rs, imm:$It))>;
+
+class T_Q_DI_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat<(IntID F64:$Rs, imm:$It),
+ (C2_tfrpr (MI F64:$Rs, imm:$It))>;
+
+class T_Q_QQQ_pat <InstHexagon MI, Intrinsic IntID>
+ : Pat <(IntID I32:$Rp, I32:$Rq, I32:$Rs),
+ (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq),
+ (C2_tfrrp I32:$Rs)))>;
//===----------------------------------------------------------------------===//
// MPYS / Multiply signed/unsigned halfwords
@@ -645,9 +713,9 @@ def : T_PPR_pat <S2_lsr_r_p_or, int_hexagon_S2_lsr_r_p_or>;
def : T_PPR_pat <S2_asl_r_p_or, int_hexagon_S2_asl_r_p_or>;
def : T_PPR_pat <S2_lsl_r_p_or, int_hexagon_S2_lsl_r_p_or>;
-/********************************************************************
-* ALU32/ALU *
-*********************************************************************/
+//*******************************************************************
+// ALU32/ALU
+//*******************************************************************
def : T_RR_pat<A2_add, int_hexagon_A2_add>;
def : T_RI_pat<A2_addi, int_hexagon_A2_addi>;
def : T_RR_pat<A2_sub, int_hexagon_A2_sub>;
@@ -660,31 +728,46 @@ def : T_RR_pat<A2_xor, int_hexagon_A2_xor>;
def : T_RR_pat<A2_combinew, int_hexagon_A2_combinew>;
// Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32)
-def : Pat <(int_hexagon_A2_not (I32:$Rs)),
- (A2_subri -1, IntRegs:$Rs)>;
+def : Pat <(int_hexagon_A2_not I32:$Rs),
+ (A2_subri -1, I32:$Rs)>;
// Assembler mapped from Rd32=neg(Rs32) to Rd32=sub(#0,Rs32)
-def : Pat <(int_hexagon_A2_neg IntRegs:$Rs),
- (A2_subri 0, IntRegs:$Rs)>;
+def : Pat <(int_hexagon_A2_neg I32:$Rs),
+ (A2_subri 0, I32:$Rs)>;
// Transfer immediate
-def : Pat <(int_hexagon_A2_tfril (I32:$Rs), u16_0ImmPred:$Is),
- (A2_tfril IntRegs:$Rs, u16_0ImmPred:$Is)>;
-def : Pat <(int_hexagon_A2_tfrih (I32:$Rs), u16_0ImmPred:$Is),
- (A2_tfrih IntRegs:$Rs, u16_0ImmPred:$Is)>;
+def : Pat <(int_hexagon_A2_tfril I32:$Rs, u16_0ImmPred:$Is),
+ (A2_tfril I32:$Rs, u16_0ImmPred:$Is)>;
+def : Pat <(int_hexagon_A2_tfrih I32:$Rs, u16_0ImmPred:$Is),
+ (A2_tfrih I32:$Rs, u16_0ImmPred:$Is)>;
// Transfer Register/immediate.
def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>;
def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>;
-def : T_I_pat <A2_tfrpi, int_hexagon_A2_tfrpi>;
+
+def ImmExt64: SDNodeXForm<imm, [{
+ int64_t V = N->getSExtValue();
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i64);
+}]>;
+
+// A2_tfrpi has an operand of type i64. This is necessary because it is
+// generated from "(set I64:$Rd, imm)". That pattern would not appear
+// in the DAG if the immediate were not a 64-bit value.
+// The builtin for A2_tfrpi, on the other hand, takes a 32-bit value,
+// which makes it impossible to simply replace it with the instruction.
+// To connect the builtin with the instruction, the builtin's operand
+// needs to be extended to the right type.
+
+def : Pat<(int_hexagon_A2_tfrpi imm:$Is),
+ (A2_tfrpi (ImmExt64 $Is))>;
// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32)
-def : Pat<(int_hexagon_A2_tfrp DoubleRegs:$src),
- (A2_combinew (HiReg DoubleRegs:$src), (LoReg DoubleRegs:$src))>;
+def : Pat<(int_hexagon_A2_tfrp I64:$src),
+ (A2_combinew (HiReg I64:$src), (LoReg I64:$src))>;
-/********************************************************************
-* ALU32/PERM *
-*********************************************************************/
+//*******************************************************************
+// ALU32/PERM
+//*******************************************************************
// Combine
def: T_RR_pat<A2_combine_hh, int_hexagon_A2_combine_hh>;
def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
@@ -693,10 +776,8 @@ def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>;
-def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))),
- (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>;
-
// Mux
+def : T_QRR_pat<C2_mux, int_hexagon_C2_mux>;
def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>;
def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>;
def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>;
@@ -712,41 +793,36 @@ def : T_R_pat<A2_sxtb, int_hexagon_A2_sxtb>;
def : T_R_pat<A2_zxth, int_hexagon_A2_zxth>;
def : T_R_pat<A2_zxtb, int_hexagon_A2_zxtb>;
-/********************************************************************
-* ALU32/PRED *
-*********************************************************************/
+//*******************************************************************
+// ALU32/PRED
+//*******************************************************************
// Compare
-def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>;
-def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>;
-def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
-
-def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>;
-def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>;
-def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>;
-
-def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)),
- (i32 (C2_cmpgti (I32:$src1),
- (DEC_CONST_SIGNED s32ImmPred:$src2)))>;
-
-def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)),
- (i32 (C2_cmpgtui (I32:$src1),
- (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>;
-
-// The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0.
-def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)),
- (i32 (C2_cmpeq (I32:$src1), (I32:$src1)))>;
-
-def : Pat <(i32 (int_hexagon_C2_cmplt (I32:$src1),
- (I32:$src2))),
- (i32 (C2_cmpgt (I32:$src2), (I32:$src1)))>;
-
-def : Pat <(i32 (int_hexagon_C2_cmpltu (I32:$src1),
- (I32:$src2))),
- (i32 (C2_cmpgtu (I32:$src2), (I32:$src1)))>;
-
-/********************************************************************
-* ALU32/VH *
-*********************************************************************/
+def : T_Q_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>;
+def : T_Q_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>;
+def : T_Q_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
+
+def : T_Q_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>;
+def : T_Q_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>;
+def : T_Q_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>;
+
+def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32ImmPred:$src2),
+ (C2_tfrpr (C2_cmpgti I32:$src1,
+ (DEC_CONST_SIGNED s32ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32ImmPred:$src2),
+ (C2_tfrpr (C2_cmpgtui I32:$src1,
+ (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0),
+ (C2_tfrpr (C2_cmpeq I32:$src, I32:$src))>;
+def : Pat <(int_hexagon_C2_cmplt I32:$src1, I32:$src2),
+ (C2_tfrpr (C2_cmpgt I32:$src2, I32:$src1))>;
+def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2),
+ (C2_tfrpr (C2_cmpgtu I32:$src2, I32:$src1))>;
+
+//*******************************************************************
+// ALU32/VH
+//*******************************************************************
// Vector add, subtract, average halfwords
def: T_RR_pat<A2_svaddh, int_hexagon_A2_svaddh>;
def: T_RR_pat<A2_svaddhs, int_hexagon_A2_svaddhs>;
@@ -760,28 +836,28 @@ def: T_RR_pat<A2_svavgh, int_hexagon_A2_svavgh>;
def: T_RR_pat<A2_svavghs, int_hexagon_A2_svavghs>;
def: T_RR_pat<A2_svnavgh, int_hexagon_A2_svnavgh>;
-/********************************************************************
-* ALU64/ALU *
-*********************************************************************/
-def: T_RR_pat<A2_addsat, int_hexagon_A2_addsat>;
-def: T_RR_pat<A2_subsat, int_hexagon_A2_subsat>;
-def: T_PP_pat<A2_addp, int_hexagon_A2_addp>;
-def: T_PP_pat<A2_subp, int_hexagon_A2_subp>;
+//*******************************************************************
+// ALU64/ALU
+//*******************************************************************
+def: T_RR_pat<A2_addsat, int_hexagon_A2_addsat>;
+def: T_RR_pat<A2_subsat, int_hexagon_A2_subsat>;
+def: T_PP_pat<A2_addp, int_hexagon_A2_addp>;
+def: T_PP_pat<A2_subp, int_hexagon_A2_subp>;
-def: T_PP_pat<A2_andp, int_hexagon_A2_andp>;
-def: T_PP_pat<A2_orp, int_hexagon_A2_orp>;
-def: T_PP_pat<A2_xorp, int_hexagon_A2_xorp>;
+def: T_PP_pat<A2_andp, int_hexagon_A2_andp>;
+def: T_PP_pat<A2_orp, int_hexagon_A2_orp>;
+def: T_PP_pat<A2_xorp, int_hexagon_A2_xorp>;
-def: T_PP_pat<C2_cmpeqp, int_hexagon_C2_cmpeqp>;
-def: T_PP_pat<C2_cmpgtp, int_hexagon_C2_cmpgtp>;
-def: T_PP_pat<C2_cmpgtup, int_hexagon_C2_cmpgtup>;
+def: T_Q_PP_pat<C2_cmpeqp, int_hexagon_C2_cmpeqp>;
+def: T_Q_PP_pat<C2_cmpgtp, int_hexagon_C2_cmpgtp>;
+def: T_Q_PP_pat<C2_cmpgtup, int_hexagon_C2_cmpgtup>;
-def: T_PP_pat<S2_parityp, int_hexagon_S2_parityp>;
-def: T_RR_pat<S2_packhl, int_hexagon_S2_packhl>;
+def: T_PP_pat<S2_parityp, int_hexagon_S2_parityp>;
+def: T_RR_pat<S2_packhl, int_hexagon_S2_packhl>;
-/********************************************************************
-* ALU64/VB *
-*********************************************************************/
+//*******************************************************************
+// ALU64/VB
+//*******************************************************************
// ALU64 - Vector add
def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddub>;
def : T_PP_pat <A2_vaddubs, int_hexagon_A2_vaddubs>;
@@ -838,23 +914,22 @@ def : T_PP_pat <A2_vsubw, int_hexagon_A2_vsubw>;
def : T_PP_pat <A2_vsubws, int_hexagon_A2_vsubws>;
// ALU64 - Vector compare bytes
-def : T_PP_pat <A2_vcmpbeq, int_hexagon_A2_vcmpbeq>;
-def : T_PP_pat <A4_vcmpbgt, int_hexagon_A4_vcmpbgt>;
-def : T_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
+def : T_Q_PP_pat <A2_vcmpbeq, int_hexagon_A2_vcmpbeq>;
+def : T_Q_PP_pat <A4_vcmpbgt, int_hexagon_A4_vcmpbgt>;
+def : T_Q_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
// ALU64 - Vector compare halfwords
-def : T_PP_pat <A2_vcmpheq, int_hexagon_A2_vcmpheq>;
-def : T_PP_pat <A2_vcmphgt, int_hexagon_A2_vcmphgt>;
-def : T_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
+def : T_Q_PP_pat <A2_vcmpheq, int_hexagon_A2_vcmpheq>;
+def : T_Q_PP_pat <A2_vcmphgt, int_hexagon_A2_vcmphgt>;
+def : T_Q_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
// ALU64 - Vector compare words
-def : T_PP_pat <A2_vcmpweq, int_hexagon_A2_vcmpweq>;
-def : T_PP_pat <A2_vcmpwgt, int_hexagon_A2_vcmpwgt>;
-def : T_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
+def : T_Q_PP_pat <A2_vcmpweq, int_hexagon_A2_vcmpweq>;
+def : T_Q_PP_pat <A2_vcmpwgt, int_hexagon_A2_vcmpwgt>;
+def : T_Q_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
// ALU64 / VB / Vector mux.
-def : Pat<(int_hexagon_C2_vmux PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
- (C2_vmux PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def : T_QPP_pat <C2_vmux, int_hexagon_C2_vmux>;
// MPY - Multiply and use full result
// Rdd = mpy[u](Rs, Rt)
@@ -903,35 +978,24 @@ def : T_PRR_pat <M2_vmac2, int_hexagon_M2_vmac2>;
def : T_PRR_pat <M2_vmac2s_s0, int_hexagon_M2_vmac2s_s0>;
def : T_PRR_pat <M2_vmac2s_s1, int_hexagon_M2_vmac2s_s1>;
-/********************************************************************
-* CR *
-*********************************************************************/
-class qi_CRInst_qi_pat<InstHexagon Inst, Intrinsic IntID> :
- Pat<(i32 (IntID IntRegs:$Rs)),
- (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs))))>;
-
-class qi_CRInst_qiqi_pat<InstHexagon Inst, Intrinsic IntID> :
- Pat<(i32 (IntID IntRegs:$Rs, IntRegs:$Rt)),
- (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs), (C2_tfrrp IntRegs:$Rt))))>;
-
-def: qi_CRInst_qi_pat<C2_not, int_hexagon_C2_not>;
-def: qi_CRInst_qi_pat<C2_all8, int_hexagon_C2_all8>;
-def: qi_CRInst_qi_pat<C2_any8, int_hexagon_C2_any8>;
+//*******************************************************************
+// CR
+//*******************************************************************
+def: T_Q_Q_pat<C2_not, int_hexagon_C2_not>;
+def: T_Q_Q_pat<C2_all8, int_hexagon_C2_all8>;
+def: T_Q_Q_pat<C2_any8, int_hexagon_C2_any8>;
+def: T_Q_Q_pat<C2_pxfer_map, int_hexagon_C2_pxfer_map>;
-def: qi_CRInst_qiqi_pat<C2_and, int_hexagon_C2_and>;
-def: qi_CRInst_qiqi_pat<C2_andn, int_hexagon_C2_andn>;
-def: qi_CRInst_qiqi_pat<C2_or, int_hexagon_C2_or>;
-def: qi_CRInst_qiqi_pat<C2_orn, int_hexagon_C2_orn>;
-def: qi_CRInst_qiqi_pat<C2_xor, int_hexagon_C2_xor>;
-
-// Assembler mapped from Pd4=Ps4 to Pd4=or(Ps4,Ps4)
-def : Pat<(int_hexagon_C2_pxfer_map PredRegs:$src),
- (C2_pxfer_map PredRegs:$src)>;
+def: T_Q_QQ_pat<C2_and, int_hexagon_C2_and>;
+def: T_Q_QQ_pat<C2_andn, int_hexagon_C2_andn>;
+def: T_Q_QQ_pat<C2_or, int_hexagon_C2_or>;
+def: T_Q_QQ_pat<C2_orn, int_hexagon_C2_orn>;
+def: T_Q_QQ_pat<C2_xor, int_hexagon_C2_xor>;
// Multiply 32x32 and use lower result
def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>;
def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>;
-def : T_RRR_pat <M2_maci, int_hexagon_M2_maci>;
+def : T_RRR_pat <M2_maci, int_hexagon_M2_maci>;
// Subtract and accumulate
def : T_RRR_pat <M2_subacc, int_hexagon_M2_subacc>;
@@ -945,54 +1009,45 @@ def : T_RRI_pat <M2_naccii, int_hexagon_M2_naccii>;
// XOR and XOR with destination
def : T_RRR_pat <M2_xor_xacc, int_hexagon_M2_xor_xacc>;
-class MType_R32_pat <Intrinsic IntID, InstHexagon OutputInst> :
- Pat <(IntID IntRegs:$src1, IntRegs:$src2),
- (OutputInst IntRegs:$src1, IntRegs:$src2)>;
-
// Vector dual multiply with round and pack
-
-def : Pat <(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>;
-
-def : Pat <(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>;
+def : T_PP_pat <M2_vdmpyrs_s0, int_hexagon_M2_vdmpyrs_s0>;
+def : T_PP_pat <M2_vdmpyrs_s1, int_hexagon_M2_vdmpyrs_s1>;
// Vector multiply halfwords with round and pack
-
-def : MType_R32_pat <int_hexagon_M2_vmpy2s_s0pack, M2_vmpy2s_s0pack>;
-def : MType_R32_pat <int_hexagon_M2_vmpy2s_s1pack, M2_vmpy2s_s1pack>;
+def : T_RR_pat <M2_vmpy2s_s0pack, int_hexagon_M2_vmpy2s_s0pack>;
+def : T_RR_pat <M2_vmpy2s_s1pack, int_hexagon_M2_vmpy2s_s1pack>;
// Multiply and use lower result
-def : MType_R32_pat <int_hexagon_M2_mpyi, M2_mpyi>;
-def : T_RI_pat<M2_mpysmi, int_hexagon_M2_mpysmi>;
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyi>;
+def : T_RI_pat <M2_mpysmi, int_hexagon_M2_mpysmi>;
// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32)
-def : MType_R32_pat <int_hexagon_M2_mpyui, M2_mpyi>;
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyui>;
// Multiply and use upper result
-def : MType_R32_pat <int_hexagon_M2_mpy_up, M2_mpy_up>;
-def : MType_R32_pat <int_hexagon_M2_mpyu_up, M2_mpyu_up>;
-def : MType_R32_pat <int_hexagon_M2_hmmpyh_rs1, M2_hmmpyh_rs1>;
-def : MType_R32_pat <int_hexagon_M2_hmmpyl_rs1, M2_hmmpyl_rs1>;
-def : MType_R32_pat <int_hexagon_M2_dpmpyss_rnd_s0, M2_dpmpyss_rnd_s0>;
+def : T_RR_pat <M2_mpy_up, int_hexagon_M2_mpy_up>;
+def : T_RR_pat <M2_mpyu_up, int_hexagon_M2_mpyu_up>;
+def : T_RR_pat <M2_hmmpyh_rs1, int_hexagon_M2_hmmpyh_rs1>;
+def : T_RR_pat <M2_hmmpyl_rs1, int_hexagon_M2_hmmpyl_rs1>;
+def : T_RR_pat <M2_dpmpyss_rnd_s0, int_hexagon_M2_dpmpyss_rnd_s0>;
// Complex multiply with round and pack
// Rxx32+=cmpy(Rs32,[*]Rt32)[:<<1]:rnd:sat
-def : MType_R32_pat <int_hexagon_M2_cmpyrs_s0, M2_cmpyrs_s0>;
-def : MType_R32_pat <int_hexagon_M2_cmpyrs_s1, M2_cmpyrs_s1>;
-def : MType_R32_pat <int_hexagon_M2_cmpyrsc_s0, M2_cmpyrsc_s0>;
-def : MType_R32_pat <int_hexagon_M2_cmpyrsc_s1, M2_cmpyrsc_s1>;
-
-/********************************************************************
-* STYPE/ALU *
-*********************************************************************/
+def : T_RR_pat <M2_cmpyrs_s0, int_hexagon_M2_cmpyrs_s0>;
+def : T_RR_pat <M2_cmpyrs_s1, int_hexagon_M2_cmpyrs_s1>;
+def : T_RR_pat <M2_cmpyrsc_s0, int_hexagon_M2_cmpyrsc_s0>;
+def : T_RR_pat <M2_cmpyrsc_s1, int_hexagon_M2_cmpyrsc_s1>;
+
+//*******************************************************************
+// STYPE/ALU
+//*******************************************************************
def : T_P_pat <A2_absp, int_hexagon_A2_absp>;
def : T_P_pat <A2_negp, int_hexagon_A2_negp>;
def : T_P_pat <A2_notp, int_hexagon_A2_notp>;
-/********************************************************************
-* STYPE/BIT *
-*********************************************************************/
+//*******************************************************************
+// STYPE/BIT
+//*******************************************************************
// Count leading/trailing
def: T_R_pat<S2_cl0, int_hexagon_S2_cl0>;
@@ -1023,6 +1078,11 @@ def : T_PP_pat <S2_vtrunowh, int_hexagon_S2_vtrunowh>;
// Linear feedback-shift Iteration.
def : T_PP_pat <S2_lfsp, int_hexagon_S2_lfsp>;
+// Vector align
+// Need custom lowering
+def : T_PPQ_pat <S2_valignrb, int_hexagon_S2_valignrb>;
+def : T_PPI_pat <S2_valignib, int_hexagon_S2_valignib>;
+
// Vector splice
def : T_PPQ_pat <S2_vsplicerb, int_hexagon_S2_vsplicerb>;
def : T_PPI_pat <S2_vspliceib, int_hexagon_S2_vspliceib>;
@@ -1037,26 +1097,22 @@ def : T_RP_pat <S2_extractu_rp, int_hexagon_S2_extractu_rp>;
def : T_PP_pat <S2_extractup_rp, int_hexagon_S2_extractup_rp>;
// Insert bitfield
-def : Pat <(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2,
- DoubleRegs:$src3),
- (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>;
+def : Pat <(int_hexagon_S2_insert_rp I32:$src1, I32:$src2, I64:$src3),
+ (S2_insert_rp I32:$src1, I32:$src2, I64:$src3)>;
-def : Pat<(i64 (int_hexagon_S2_insertp_rp (I64:$src1),
- (I64:$src2), (I64:$src3))),
- (i64 (S2_insertp_rp (I64:$src1), (I64:$src2),
- (I64:$src3)))>;
+def : Pat<(i64 (int_hexagon_S2_insertp_rp I64:$src1, I64:$src2, I64:$src3)),
+ (i64 (S2_insertp_rp I64:$src1, I64:$src2, I64:$src3))>;
-def : Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2,
+def : Pat<(int_hexagon_S2_insert I32:$src1, I32:$src2,
u5ImmPred:$src3, u5ImmPred:$src4),
- (S2_insert IntRegs:$src1, IntRegs:$src2,
+ (S2_insert I32:$src1, I32:$src2,
u5ImmPred:$src3, u5ImmPred:$src4)>;
-def : Pat<(i64 (int_hexagon_S2_insertp (I64:$src1),
- (I64:$src2), u6ImmPred:$src3, u6ImmPred:$src4)),
- (i64 (S2_insertp (I64:$src1), (I64:$src2),
+def : Pat<(i64 (int_hexagon_S2_insertp I64:$src1, I64:$src2,
+ u6ImmPred:$src3, u6ImmPred:$src4)),
+ (i64 (S2_insertp I64:$src1, I64:$src2,
u6ImmPred:$src3, u6ImmPred:$src4))>;
-
// Interleave/deinterleave
def : T_P_pat <S2_interleave, int_hexagon_S2_interleave>;
def : T_P_pat <S2_deinterleave, int_hexagon_S2_deinterleave>;
@@ -1071,21 +1127,21 @@ def: T_RR_pat<S2_clrbit_r, int_hexagon_S2_clrbit_r>;
def: T_RR_pat<S2_togglebit_r, int_hexagon_S2_togglebit_r>;
// Test Bit
-def: T_RI_pat<S2_tstbit_i, int_hexagon_S2_tstbit_i>;
-def: T_RR_pat<S2_tstbit_r, int_hexagon_S2_tstbit_r>;
+def: T_Q_RI_pat<S2_tstbit_i, int_hexagon_S2_tstbit_i>;
+def: T_Q_RR_pat<S2_tstbit_r, int_hexagon_S2_tstbit_r>;
-/********************************************************************
-* STYPE/COMPLEX *
-*********************************************************************/
+//*******************************************************************
+// STYPE/COMPLEX
+//*******************************************************************
// Vector Complex conjugate
def : T_P_pat <A2_vconj, int_hexagon_A2_vconj>;
// Vector Complex rotate
def : T_PR_pat <S2_vcrotate, int_hexagon_S2_vcrotate>;
-/********************************************************************
-* STYPE/PERM *
-*********************************************************************/
+//*******************************************************************
+// STYPE/PERM
+//*******************************************************************
// Vector saturate without pack
def : T_P_pat <S2_vsathb_nopack, int_hexagon_S2_vsathb_nopack>;
@@ -1093,28 +1149,26 @@ def : T_P_pat <S2_vsathub_nopack, int_hexagon_S2_vsathub_nopack>;
def : T_P_pat <S2_vsatwh_nopack, int_hexagon_S2_vsatwh_nopack>;
def : T_P_pat <S2_vsatwuh_nopack, int_hexagon_S2_vsatwuh_nopack>;
-/********************************************************************
-* STYPE/PRED *
-*********************************************************************/
+//*******************************************************************
+// STYPE/PRED
+//*******************************************************************
// Predicate transfer
-def: Pat<(i32 (int_hexagon_C2_tfrpr (I32:$Rs))),
- (i32 (C2_tfrpr (C2_tfrrp (I32:$Rs))))>;
-def: Pat<(i32 (int_hexagon_C2_tfrrp (I32:$Rs))),
- (i32 (C2_tfrpr (C2_tfrrp (I32:$Rs))))>;
+def: Pat<(i32 (int_hexagon_C2_tfrpr I32:$Rs)),
+ (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
+def: Pat<(i32 (int_hexagon_C2_tfrrp I32:$Rs)),
+ (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
// Mask generate from predicate
-def: Pat<(i64 (int_hexagon_C2_mask (I32:$Rs))),
- (i64 (C2_mask (C2_tfrrp (I32:$Rs))))>;
+def: Pat<(i64 (int_hexagon_C2_mask I32:$Rs)),
+ (i64 (C2_mask (C2_tfrrp I32:$Rs)))>;
// Viterbi pack even and odd predicate bits
-def: Pat<(i32 (int_hexagon_C2_vitpack (I32:$Rs), (I32:$Rt))),
- (i32 (C2_vitpack (C2_tfrrp (I32:$Rs)),
- (C2_tfrrp (I32:$Rt))))>;
+def: T_QQ_pat<C2_vitpack, int_hexagon_C2_vitpack>;
-/********************************************************************
-* STYPE/SHIFT *
-*********************************************************************/
+//*******************************************************************
+// STYPE/SHIFT
+//*******************************************************************
def : T_PI_pat <S2_asr_i_p, int_hexagon_S2_asr_i_p>;
def : T_PI_pat <S2_lsr_i_p, int_hexagon_S2_lsr_i_p>;
@@ -1185,8 +1239,8 @@ def : T_RI_pat <S2_asl_i_r_sat, int_hexagon_S2_asl_i_r_sat>;
//===----------------------------------------------------------------------===//
class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
SDNodeXForm XformImm>
- : Pat <(IntID IntRegs:$src1, IntRegs:$src2, u4ImmPred:$src3, u5ImmPred:$src4),
- (OutputInst IntRegs:$src1, IntRegs:$src2, u4ImmPred:$src3,
+ : Pat <(IntID I32:$src1, I32:$src2, u4ImmPred:$src3, u5ImmPred:$src4),
+ (OutputInst I32:$src1, I32:$src2, u4ImmPred:$src3,
(XformImm u5ImmPred:$src4))>;
@@ -1195,9 +1249,9 @@ class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
// values from the 4th input operand. Please note that subtraction is not
// needed for int_hexagon_S2_tableidxb_goodsyntax.
-def : Pat <(int_hexagon_S2_tableidxb_goodsyntax IntRegs:$src1, IntRegs:$src2,
+def : Pat <(int_hexagon_S2_tableidxb_goodsyntax I32:$src1, I32:$src2,
u4ImmPred:$src3, u5ImmPred:$src4),
- (S2_tableidxb IntRegs:$src1, IntRegs:$src2,
+ (S2_tableidxb I32:$src1, I32:$src2,
u4ImmPred:$src3, u5ImmPred:$src4)>;
def : S2op_tableidx_pat <int_hexagon_S2_tableidxh_goodsyntax, S2_tableidxh,
@@ -1207,9 +1261,9 @@ def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
def : S2op_tableidx_pat <int_hexagon_S2_tableidxd_goodsyntax, S2_tableidxd,
DEC3_CONST_SIGNED>;
-/********************************************************************
-* STYPE/VH *
-*********************************************************************/
+//*******************************************************************
+// STYPE/VH
+//*******************************************************************
// Vector absolute value halfwords with and without saturation
// Rdd64=vabsh(Rss64)[:sat]
@@ -1229,9 +1283,9 @@ def : T_PR_pat <S2_lsr_r_vh, int_hexagon_S2_lsr_r_vh>;
def : T_PR_pat <S2_asl_r_vh, int_hexagon_S2_asl_r_vh>;
def : T_PR_pat <S2_lsl_r_vh, int_hexagon_S2_lsl_r_vh>;
-/********************************************************************
-* STYPE/VW *
-*********************************************************************/
+//*******************************************************************
+// STYPE/VW
+//*******************************************************************
// Vector absolute value words with and without saturation
def : T_P_pat <A2_vabsw, int_hexagon_A2_vabsw>;
@@ -1251,43 +1305,42 @@ def : T_PR_pat <S2_asl_r_vw, int_hexagon_S2_asl_r_vw>;
def : T_PR_pat <S2_lsl_r_vw, int_hexagon_S2_lsl_r_vw>;
// Vector shift words with truncate and pack
-
def : T_PR_pat <S2_asr_r_svw_trun, int_hexagon_S2_asr_r_svw_trun>;
+// Load/store locked.
def : T_R_pat<L2_loadw_locked, int_hexagon_L2_loadw_locked>;
def : T_R_pat<L4_loadd_locked, int_hexagon_L4_loadd_locked>;
-def: Pat<(i32 (int_hexagon_S2_storew_locked (I32:$Rs), (I32:$Rt))),
- (i32 (C2_tfrpr (S2_storew_locked (I32:$Rs), (I32:$Rt))))>;
-def: Pat<(i32 (int_hexagon_S4_stored_locked (I32:$Rs), (I64:$Rt))),
- (i32 (C2_tfrpr (S4_stored_locked (I32:$Rs), (I64:$Rt))))>;
+def : Pat<(int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt),
+ (C2_tfrpr (S2_storew_locked I32:$Rs, I32:$Rt))>;
+def : Pat<(int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt),
+ (C2_tfrpr (S4_stored_locked I32:$Rs, I64:$Rt))>;
-/********************************************************************
-* ST
-*********************************************************************/
+//*******************************************************************
+// ST
+//*******************************************************************
class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru),
- (MI I32:$Rs, Val:$Rt, I32:$Ru)>;
+ (MI I32:$Rs, I32:$Ru, Val:$Rt)>;
-def : T_stb_pat <S2_storerh_pbr_pseudo, int_hexagon_brev_sth, I32>;
-def : T_stb_pat <S2_storerb_pbr_pseudo, int_hexagon_brev_stb, I32>;
-def : T_stb_pat <S2_storeri_pbr_pseudo, int_hexagon_brev_stw, I32>;
-def : T_stb_pat <S2_storerf_pbr_pseudo, int_hexagon_brev_sthhi, I32>;
-def : T_stb_pat <S2_storerd_pbr_pseudo, int_hexagon_brev_std, I64>;
+def : T_stb_pat <S2_storerh_pbr, int_hexagon_brev_sth, I32>;
+def : T_stb_pat <S2_storerb_pbr, int_hexagon_brev_stb, I32>;
+def : T_stb_pat <S2_storeri_pbr, int_hexagon_brev_stw, I32>;
+def : T_stb_pat <S2_storerf_pbr, int_hexagon_brev_sthhi, I32>;
+def : T_stb_pat <S2_storerd_pbr, int_hexagon_brev_std, I64>;
class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s),
- (MI I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s)>;
+ (MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>;
-def: T_stc_pat<S2_storerb_pci_pseudo, int_hexagon_circ_stb, s4_0ImmPred, I32>;
-def: T_stc_pat<S2_storerh_pci_pseudo, int_hexagon_circ_sth, s4_1ImmPred, I32>;
-def: T_stc_pat<S2_storeri_pci_pseudo, int_hexagon_circ_stw, s4_2ImmPred, I32>;
-def: T_stc_pat<S2_storerd_pci_pseudo, int_hexagon_circ_std, s4_3ImmPred, I64>;
-def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
+def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred, I32>;
+def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred, I32>;
+def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>;
+def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>;
+def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
include "HexagonIntrinsicsV3.td"
include "HexagonIntrinsicsV4.td"
include "HexagonIntrinsicsV5.td"
include "HexagonIntrinsicsV60.td"
-
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
index c80a188d82e7..578973db1933 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -60,71 +60,60 @@ def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
// Multiply and use upper result
-def : MType_R32_pat <int_hexagon_M2_mpysu_up, M2_mpysu_up>;
-def : MType_R32_pat <int_hexagon_M2_mpy_up_s1, M2_mpy_up_s1>;
-def : MType_R32_pat <int_hexagon_M2_hmmpyh_s1, M2_hmmpyh_s1>;
-def : MType_R32_pat <int_hexagon_M2_hmmpyl_s1, M2_hmmpyl_s1>;
-def : MType_R32_pat <int_hexagon_M2_mpy_up_s1_sat, M2_mpy_up_s1_sat>;
+def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
+def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
+def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
+def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
+def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
-// Vector reduce add unsigned halfwords
-def : Pat <(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>;
-
-def : T_P_pat <S2_brevp, int_hexagon_S2_brevp>;
-
-def: T_P_pat <S2_ct0p, int_hexagon_S2_ct0p>;
-def: T_P_pat <S2_ct1p, int_hexagon_S2_ct1p>;
-def: T_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>;
-def: T_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>;
-def: T_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
-
-
-class vcmpImm_pat <InstHexagon MI, Intrinsic IntID, PatLeaf immPred> :
- Pat <(IntID (i64 DoubleRegs:$src1), immPred:$src2),
- (MI (i64 DoubleRegs:$src1), immPred:$src2)>;
-
-def : vcmpImm_pat <A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi, u8ImmPred>;
-def : vcmpImm_pat <A4_vcmpbgti, int_hexagon_A4_vcmpbgti, s8ImmPred>;
-def : vcmpImm_pat <A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui, u7ImmPred>;
-
-def : vcmpImm_pat <A4_vcmpheqi, int_hexagon_A4_vcmpheqi, s8ImmPred>;
-def : vcmpImm_pat <A4_vcmphgti, int_hexagon_A4_vcmphgti, s8ImmPred>;
-def : vcmpImm_pat <A4_vcmphgtui, int_hexagon_A4_vcmphgtui, u7ImmPred>;
-
-def : vcmpImm_pat <A4_vcmpweqi, int_hexagon_A4_vcmpweqi, s8ImmPred>;
-def : vcmpImm_pat <A4_vcmpwgti, int_hexagon_A4_vcmpwgti, s8ImmPred>;
-def : vcmpImm_pat <A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui, u7ImmPred>;
-
-def : T_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
-def : T_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>;
-def : T_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>;
-def : T_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>;
-def : T_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>;
-def : T_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>;
-def : T_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>;
-
-def : T_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>;
-def : T_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>;
-def : T_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
-
-def : T_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>;
-def : T_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>;
-def : T_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
-
-def : T_RP_pat <A4_boundscheck, int_hexagon_A4_boundscheck>;
-
-def : T_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
-
-def : Pat <(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2,
- IntRegs:$src3),
- (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
-
-def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
-def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
+// Vector reduce add unsigned halfwords
+def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
+
+def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
+def: T_P_pat<S2_ct0p, int_hexagon_S2_ct0p>;
+def: T_P_pat<S2_ct1p, int_hexagon_S2_ct1p>;
+
+def: T_Q_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>;
+def: T_Q_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>;
+def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+def : T_Q_PI_pat<A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi>;
+def : T_Q_PI_pat<A4_vcmpbgti, int_hexagon_A4_vcmpbgti>;
+def : T_Q_PI_pat<A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui>;
+def : T_Q_PI_pat<A4_vcmpheqi, int_hexagon_A4_vcmpheqi>;
+def : T_Q_PI_pat<A4_vcmphgti, int_hexagon_A4_vcmphgti>;
+def : T_Q_PI_pat<A4_vcmphgtui, int_hexagon_A4_vcmphgtui>;
+def : T_Q_PI_pat<A4_vcmpweqi, int_hexagon_A4_vcmpweqi>;
+def : T_Q_PI_pat<A4_vcmpwgti, int_hexagon_A4_vcmpwgti>;
+def : T_Q_PI_pat<A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui>;
+def : T_Q_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
+
+def : T_Q_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>;
+def : T_Q_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>;
+def : T_Q_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>;
+def : T_Q_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>;
+def : T_Q_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>;
+def : T_Q_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>;
+
+def : T_Q_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>;
+def : T_Q_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>;
+def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_Q_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>;
+def : T_Q_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>;
+def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
+def : T_Q_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
+
+def : T_RRR_pat <M4_mpyrr_addr, int_hexagon_M4_mpyrr_addr>;
+def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
-def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
-// Multiply 32x32 and use upper result
+def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
@@ -210,41 +199,46 @@ def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
// Split bitfield
def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
-def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
+def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
-def: T_RR_pat<S4_parity, int_hexagon_S4_parity>;
+def: T_RR_pat<S4_parity, int_hexagon_S4_parity>;
-def: T_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
-def: T_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
+def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
+def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
-def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>;
-def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
-def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
+def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
-/********************************************************************
-* ALU32/ALU *
-*********************************************************************/
+//*******************************************************************
+// ALU32/ALU
+//*******************************************************************
// ALU32 / ALU / Logical Operations.
def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
def: T_RR_pat<A4_orn, int_hexagon_A4_orn>;
-/********************************************************************
-* ALU32/PERM *
-*********************************************************************/
+//*******************************************************************
+// ALU32/PERM
+//*******************************************************************
// Combine Words Into Doublewords.
def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32ImmPred>;
def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32ImmPred>;
-/********************************************************************
-* ALU32/PRED *
-*********************************************************************/
+//*******************************************************************
+// ALU32/PRED
+//*******************************************************************
// Compare
-def : T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32ImmPred>;
-def : T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32ImmPred>;
-def : T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32ImmPred>;
+def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32ImmPred>;
+def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32ImmPred>;
+def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32ImmPred>;
+
+// Compare To General Register.
+def: T_Q_RR_pat<C4_cmpneq, int_hexagon_C4_cmpneq>;
+def: T_Q_RR_pat<C4_cmplte, int_hexagon_C4_cmplte>;
+def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>;
def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
@@ -252,30 +246,23 @@ def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
def: T_RI_pat<A4_rcmpeqi, int_hexagon_A4_rcmpeqi>;
def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
-/********************************************************************
-* CR *
-*********************************************************************/
+//*******************************************************************
+// CR
+//*******************************************************************
// CR / Logical Operations On Predicates.
-
-class qi_CRInst_qiqiqi_pat<Intrinsic IntID, InstHexagon Inst> :
- Pat<(i32 (IntID IntRegs:$Rs, IntRegs:$Rt, IntRegs:$Ru)),
- (i32 (C2_tfrpr (Inst (C2_tfrrp IntRegs:$Rs),
- (C2_tfrrp IntRegs:$Rt),
- (C2_tfrrp IntRegs:$Ru))))>;
-
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_and, C4_and_and>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_andn, C4_and_andn>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_or, C4_and_or>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_and_orn, C4_and_orn>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_and, C4_or_and>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_andn, C4_or_andn>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_or, C4_or_or>;
-def: qi_CRInst_qiqiqi_pat<int_hexagon_C4_or_orn, C4_or_orn>;
-
-/********************************************************************
-* XTYPE/ALU *
-*********************************************************************/
+def: T_Q_QQQ_pat<C4_and_and, int_hexagon_C4_and_and>;
+def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
+def: T_Q_QQQ_pat<C4_and_or, int_hexagon_C4_and_or>;
+def: T_Q_QQQ_pat<C4_and_orn, int_hexagon_C4_and_orn>;
+def: T_Q_QQQ_pat<C4_or_and, int_hexagon_C4_or_and>;
+def: T_Q_QQQ_pat<C4_or_andn, int_hexagon_C4_or_andn>;
+def: T_Q_QQQ_pat<C4_or_or, int_hexagon_C4_or_or>;
+def: T_Q_QQQ_pat<C4_or_orn, int_hexagon_C4_or_orn>;
+
+//*******************************************************************
+// XTYPE/ALU
+//*******************************************************************
// Add And Accumulate.
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index 60e6b1eb4479..f27a63e20e61 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -43,8 +43,8 @@ def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
-def: qi_CRInst_qiqi_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>;
-def: qi_CRInst_qiqi_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
+def : T_Q_QQ_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>;
+def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
@@ -65,15 +65,15 @@ def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
// Compare floating-point value
-def : T_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
-def : T_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
-def : T_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
-def : T_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
+def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
+def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
+def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
+def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
-def : T_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
-def : T_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
-def : T_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
-def : T_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
+def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
+def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
+def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
+def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
// Create floating-point value
def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
@@ -81,8 +81,8 @@ def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
-def : T_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
-def : T_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
+def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
+def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index 24a3e4d36de9..82bc91bb3021 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+
let isCodeGenOnly = 1 in {
def HEXAGON_V6_vd0_pseudo : CVI_VA_Resource<(outs VectorRegs:$dst),
(ins ),
@@ -22,6 +23,7 @@ def HEXAGON_V6_vd0_pseudo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst),
"$dst=#0",
[(set VectorRegs128B:$dst, (int_hexagon_V6_vd0_128B ))]>;
}
+
let isPseudo = 1 in
def HEXAGON_V6_vassignp : CVI_VA_Resource<(outs VecDblRegs:$dst),
(ins VecDblRegs:$src1),
@@ -800,7 +802,7 @@ defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>;
defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
-defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
+defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>;
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 624c0f6cf49d..a5dc002642c8 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -32,7 +32,7 @@ namespace llvm {
}
static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
- HexagonAsmPrinter &Printer) {
+ HexagonAsmPrinter &Printer, bool MustExtend) {
MCContext &MC = Printer.OutContext;
const MCExpr *ME;
@@ -58,6 +58,21 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
case HexagonII::MO_GPREL:
RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL;
break;
+ case HexagonII::MO_GDGOT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_GOT;
+ break;
+ case HexagonII::MO_GDPLT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_PLT;
+ break;
+ case HexagonII::MO_IE:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_IE;
+ break;
+ case HexagonII::MO_IEGOT:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_IE_GOT;
+ break;
+ case HexagonII::MO_TPREL:
+ RelocationType = MCSymbolRefExpr::VK_TPREL;
+ break;
}
ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC);
@@ -66,6 +81,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
MC);
+ ME = HexagonMCExpr::create(ME, MC);
+ HexagonMCInstrInfo::setMustExtend(*ME, MustExtend);
return MCOperand::createExpr(ME);
}
@@ -84,13 +101,11 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
MCI->setOpcode(MI->getOpcode());
assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
"MCI opcode should have been set on construction");
- bool MustExtend = false;
for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
const MachineOperand &MO = MI->getOperand(i);
MCOperand MCO;
- if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
- MustExtend = true;
+ bool MustExtend = MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended;
switch (MO.getType()) {
default:
@@ -105,42 +120,51 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
APFloat Val = MO.getFPImm()->getValueAPF();
// FP immediates are used only when setting GPRs, so they may be dealt
// with like regular immediates from this point on.
- MCO = MCOperand::createExpr(
- MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(),
- AP.OutContext));
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(),
+ AP.OutContext),
+ AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
break;
}
- case MachineOperand::MO_Immediate:
- MCO = MCOperand::createExpr(
- MCConstantExpr::create(MO.getImm(), AP.OutContext));
+ case MachineOperand::MO_Immediate: {
+ auto Expr = HexagonMCExpr::create(
+ MCConstantExpr::create(MO.getImm(), AP.OutContext), AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
break;
- case MachineOperand::MO_MachineBasicBlock:
- MCO = MCOperand::createExpr
- (MCSymbolRefExpr::create(MO.getMBB()->getSymbol(),
- AP.OutContext));
+ }
+ case MachineOperand::MO_MachineBasicBlock: {
+ MCExpr const *Expr = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(),
+ AP.OutContext);
+ Expr = HexagonMCExpr::create(Expr, AP.OutContext);
+ HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+ MCO = MCOperand::createExpr(Expr);
break;
+ }
case MachineOperand::MO_GlobalAddress:
- MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP);
+ MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP, MustExtend);
break;
case MachineOperand::MO_ExternalSymbol:
MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
- AP);
+ AP, MustExtend);
break;
case MachineOperand::MO_JumpTableIndex:
- MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+ MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, MustExtend);
break;
case MachineOperand::MO_ConstantPoolIndex:
- MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+ MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, MustExtend);
break;
case MachineOperand::MO_BlockAddress:
- MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
+ MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
+ MustExtend);
break;
}
MCI->addOperand(MCO);
}
AP.HexagonProcessInstruction(*MCI, *MI);
- HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI,
- MustExtend);
+ HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI);
MCB.addOperand(MCOperand::createInst(MCI));
}
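
The key change in the lowering hunks above is that the constant-extender requirement is now carried per operand: each operand expression is wrapped in a HexagonMCExpr and tagged with setMustExtend, so extendIfNeeded no longer needs a separate per-instruction flag. A minimal sketch of the pattern, using only the interfaces that appear in this hunk (the helper name and include paths are illustrative, not part of the patch):

  #include "MCTargetDesc/HexagonMCExpr.h"
  #include "MCTargetDesc/HexagonMCInstrInfo.h"
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCInst.h"

  // Hypothetical helper mirroring HexagonLowerToMC above: build an immediate
  // operand whose expression carries its own must-extend marker.
  static llvm::MCOperand lowerImm(int64_t Imm, bool MustExtend,
                                  llvm::MCContext &Ctx) {
    const llvm::MCExpr *E = llvm::MCConstantExpr::create(Imm, Ctx);
    E = llvm::HexagonMCExpr::create(E, Ctx);                  // wrap in target expr
    llvm::HexagonMCInstrInfo::setMustExtend(*E, MustExtend);  // per-operand flag
    return llvm::MCOperand::createExpr(E);
  }
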
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index 76723586c66e..26c5b63fec6c 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -27,7 +27,8 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
// returning the value of the returned struct in a register. This field
// holds the virtual register into which the sret argument is passed.
unsigned SRetReturnReg;
- unsigned StackAlignBaseReg;
+ unsigned StackAlignBaseVReg; // Aligned-stack base register (virtual)
+ unsigned StackAlignBasePhysReg; // (physical)
std::vector<MachineInstr*> AllocaAdjustInsts;
int VarArgsFrameIndex;
bool HasClobberLR;
@@ -36,13 +37,12 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
public:
- HexagonMachineFunctionInfo() : SRetReturnReg(0), StackAlignBaseReg(0),
- HasClobberLR(0), HasEHReturn(false) {}
+ HexagonMachineFunctionInfo() : SRetReturnReg(0), StackAlignBaseVReg(0),
+ StackAlignBasePhysReg(0), HasClobberLR(0), HasEHReturn(false) {}
HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
- StackAlignBaseReg(0),
- HasClobberLR(0),
- HasEHReturn(false) {}
+ StackAlignBaseVReg(0), StackAlignBasePhysReg(0), HasClobberLR(0),
+ HasEHReturn(false) {}
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
@@ -77,8 +77,11 @@ public:
bool hasEHReturn() const { return HasEHReturn; };
void setHasEHReturn(bool H = true) { HasEHReturn = H; };
- void setStackAlignBaseVReg(unsigned R) { StackAlignBaseReg = R; }
- unsigned getStackAlignBaseVReg() const { return StackAlignBaseReg; }
+ void setStackAlignBaseVReg(unsigned R) { StackAlignBaseVReg = R; }
+ unsigned getStackAlignBaseVReg() const { return StackAlignBaseVReg; }
+
+ void setStackAlignBasePhysReg(unsigned R) { StackAlignBasePhysReg = R; }
+ unsigned getStackAlignBasePhysReg() const { return StackAlignBasePhysReg; }
};
} // End llvm namespace
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 7a52d6874c33..6dcac0dc7ee2 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -13,28 +13,126 @@
//===----------------------------------------------------------------------===//
#include "HexagonMachineScheduler.h"
+#include "HexagonSubtarget.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/IR/Function.h"
+#include <iomanip>
+#include <sstream>
+
+static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> SchedPredsCloser("sched-preds-closer",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
+ cl::Hidden, cl::ZeroOrMore, cl::init(1));
+
+static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> DisableTCTie("disable-tc-tie",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+// Check if the scheduler should penalize instructions that are available too
+// early due to a zero-latency dependence.
+static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true));
+
using namespace llvm;
#define DEBUG_TYPE "misched"
-/// Platform-specific modifications to DAG.
-void VLIWMachineScheduler::postprocessDAG() {
+class HexagonCallMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override;
+private:
+ bool shouldTFRICallBind(const HexagonInstrInfo &HII,
+ const SUnit &Inst1, const SUnit &Inst2) const;
+};
+
+// Check if a call and subsequent A2_tfrpi instructions should maintain
+// scheduling affinity. We are looking for the TFRI to be consumed in
+// the next instruction. This should help reduce the instances of
+// double register pairs being allocated and scheduled before a call
+// when not used until after the call. This situation is exacerbated
+// by the fact that we allocate the pair from the callee saves list,
+// leading to excess spills and restores.
+bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
+ const SUnit &Inst1, const SUnit &Inst2) const {
+ if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi)
+ return false;
+
+  // TypeXTYPE instructions are 64-bit operations.
+ if (HII.getType(Inst2.getInstr()) == HexagonII::TypeXTYPE)
+ return true;
+ return false;
+}
+
+void HexagonCallMutation::apply(ScheduleDAGInstrs *DAG) {
SUnit* LastSequentialCall = nullptr;
+ unsigned VRegHoldingRet = 0;
+ unsigned RetRegister;
+ SUnit* LastUseOfRet = nullptr;
+ auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
+ auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
  // Currently we only catch the situation when a compare gets scheduled
  // before the preceding call.
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+ for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) {
// Remember the call.
- if (SUnits[su].getInstr()->isCall())
- LastSequentialCall = &(SUnits[su]);
+ if (DAG->SUnits[su].getInstr()->isCall())
+ LastSequentialCall = &DAG->SUnits[su];
// Look for a compare that defines a predicate.
- else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
- SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+ else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+ DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+ // Look for call and tfri* instructions.
+ else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
+ shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
+ DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier));
+ // Prevent redundant register copies between two calls, which are caused by
+ // both the return value and the argument for the next call being in %R0.
+ // Example:
+ // 1: <call1>
+ // 2: %VregX = COPY %R0
+ // 3: <use of %VregX>
+ // 4: %R0 = ...
+ // 5: <call2>
+ // The scheduler would often swap 3 and 4, so an additional register is
+ // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
+ // this. The same applies for %D0 and %V0/%W0, which are also handled.
+ else if (SchedRetvalOptimization) {
+ const MachineInstr *MI = DAG->SUnits[su].getInstr();
+ if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
+ MI->readsRegister(Hexagon::V0, &TRI))) {
+ // %vregX = COPY %R0
+ VRegHoldingRet = MI->getOperand(0).getReg();
+ RetRegister = MI->getOperand(1).getReg();
+ LastUseOfRet = nullptr;
+ } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
+ // <use of %vregX>
+ LastUseOfRet = &DAG->SUnits[su];
+ else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
+ // %R0 = ...
+ DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier));
+ }
}
}
+
+/// Save the last formed packet
+void VLIWResourceModel::savePacket() {
+ OldPacket = Packet;
+}
+
/// Check if scheduling of this SU is possible
/// in the current packet.
/// It is _not_ precise (stateful), it is more like
@@ -48,7 +146,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
// in the current cycle.
switch (SU->getInstr()->getOpcode()) {
default:
- if (!ResourcesModel->canReserveResources(SU->getInstr()))
+ if (!ResourcesModel->canReserveResources(*SU->getInstr()))
return false;
case TargetOpcode::EXTRACT_SUBREG:
case TargetOpcode::INSERT_SUBREG:
@@ -60,11 +158,19 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
break;
}
+ MachineFunction &MF = *SU->getInstr()->getParent()->getParent();
+ auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
// Now see if there are no other dependencies to instructions already
// in the packet.
for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
if (Packet[i]->Succs.size() == 0)
continue;
+
+ // Enable .cur formation.
+ if (QII.mayBeCurLoad(Packet[i]->getInstr()))
+ continue;
+
for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
E = Packet[i]->Succs.end(); I != E; ++I) {
// Since we do not add pseudos to packets, might as well
@@ -85,6 +191,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
// Artificially reset state.
if (!SU) {
ResourcesModel->clearResources();
+ savePacket();
Packet.clear();
TotalPackets++;
return false;
@@ -93,6 +200,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
// start a new one.
if (!isResourceAvailable(SU)) {
ResourcesModel->clearResources();
+ savePacket();
Packet.clear();
TotalPackets++;
startNewCycle = true;
@@ -100,7 +208,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
switch (SU->getInstr()->getOpcode()) {
default:
- ResourcesModel->reserveResources(SU->getInstr());
+ ResourcesModel->reserveResources(*SU->getInstr());
break;
case TargetOpcode::EXTRACT_SUBREG:
case TargetOpcode::INSERT_SUBREG:
@@ -129,6 +237,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
// we start fresh.
if (Packet.size() >= SchedModel->getIssueWidth()) {
ResourcesModel->clearResources();
+ savePacket();
Packet.clear();
TotalPackets++;
startNewCycle = true;
@@ -150,19 +259,12 @@ void VLIWMachineScheduler::schedule() {
buildDAGWithRegPressure();
- // Postprocess the DAG to add platform-specific artificial dependencies.
- postprocessDAG();
-
SmallVector<SUnit*, 8> TopRoots, BotRoots;
findRootsAndBiasEdges(TopRoots, BotRoots);
// Initialize the strategy before modifying the DAG.
SchedImpl->initialize(this);
- // To view Height/Depth correctly, they should be accessed at least once.
- //
- // FIXME: SUnit::dumpAll always recompute depth and height now. The max
- // depth/height could be computed directly from the roots and leaves.
DEBUG(unsigned maxH = 0;
for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
if (SUnits[su].getHeight() > maxH)
@@ -197,6 +299,13 @@ void VLIWMachineScheduler::schedule() {
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
}
void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
@@ -223,16 +332,18 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
"-misched-topdown incompatible with -misched-bottomup");
+
+ DAG->addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ DAG->addMutation(make_unique<HexagonCallMutation>());
}
void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
if (SU->isScheduled)
return;
- for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
- unsigned MinLatency = I->getLatency();
+ for (const SDep &PI : SU->Preds) {
+ unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle;
+ unsigned MinLatency = PI.getLatency();
#ifndef NDEBUG
Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
#endif
@@ -321,8 +432,8 @@ void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
}
CheckPending = true;
- DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
- << CurrCycle << '\n');
+ DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
}
/// Move the boundary of scheduled code by one SUnit.
@@ -414,16 +525,38 @@ SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
#ifndef NDEBUG
void ConvergingVLIWScheduler::traceCandidate(const char *Label,
- const ReadyQueue &Q,
- SUnit *SU, PressureChange P) {
+ const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) {
dbgs() << Label << " " << Q.getName() << " ";
if (P.isValid())
dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
<< P.getUnitInc() << " ";
else
dbgs() << " ";
+ dbgs() << "cost(" << Cost << ")\t";
SU->dump(DAG);
}
+
+// Very detailed queue dump, to be used with higher verbosity levels.
+void ConvergingVLIWScheduler::readyQueueVerboseDump(
+ const RegPressureTracker &RPTracker, SchedCandidate &Candidate,
+ ReadyQueue &Q) {
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+
+ dbgs() << ">>> " << Q.getName() << "\n";
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ std::stringstream dbgstr;
+ dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")";
+ dbgs() << dbgstr.str();
+ SchedulingCost(Q, *I, Candidate, RPDelta, true);
+ dbgs() << "\t";
+ (*I)->getInstr()->dump();
+ }
+ dbgs() << "\n";
+}
#endif
/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
@@ -466,6 +599,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
// heuristic components for cost computation.
static const unsigned PriorityOne = 200;
static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 75;
static const unsigned ScaleTwo = 10;
static const unsigned FactorOne = 2;
@@ -482,25 +616,50 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
if (!SU || SU->isScheduled)
return ResCount;
+ MachineInstr *Instr = SU->getInstr();
+
+ DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
// Forced priority is high.
- if (SU->isScheduleHigh)
+ if (SU->isScheduleHigh) {
ResCount += PriorityOne;
+ DEBUG(dbgs() << "H|");
+ }
// Critical path first.
if (Q.getID() == TopQID) {
ResCount += (SU->getHeight() * ScaleTwo);
+ DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
+ dbgs() << dbgstr.str();
+ });
+
// If resources are available for it, multiply the
// chance of scheduling.
- if (Top.ResourceModel->isResourceAvailable(SU))
+ if (Top.ResourceModel->isResourceAvailable(SU)) {
ResCount <<= FactorOne;
+ ResCount += PriorityThree;
+ DEBUG(if (verbose) dbgs() << "A|");
+ } else
+ DEBUG(if (verbose) dbgs() << " |");
} else {
ResCount += (SU->getDepth() * ScaleTwo);
+ DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
+ dbgs() << dbgstr.str();
+ });
+
// If resources are available for it, multiply the
// chance of scheduling.
- if (Bot.ResourceModel->isResourceAvailable(SU))
+ if (Bot.ResourceModel->isResourceAvailable(SU)) {
ResCount <<= FactorOne;
+ ResCount += PriorityThree;
+ DEBUG(if (verbose) dbgs() << "A|");
+ } else
+ DEBUG(if (verbose) dbgs() << " |");
}
unsigned NumNodesBlocking = 0;
@@ -509,24 +668,121 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Look at all of the successors of this node.
// Count the number of nodes that
// this node is the sole unscheduled node for.
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I)
- if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ for (const SDep &SI : SU->Succs)
+ if (getSingleUnscheduledPred(SI.getSUnit()) == SU)
++NumNodesBlocking;
} else {
// How many unscheduled predecessors block this node?
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I)
- if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+ for (const SDep &PI : SU->Preds)
+ if (getSingleUnscheduledSucc(PI.getSUnit()) == SU)
++NumNodesBlocking;
}
ResCount += (NumNodesBlocking * ScaleTwo);
+ DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
+ dbgs() << dbgstr.str();
+ });
+
// Factor in reg pressure as a heuristic.
- ResCount -= (Delta.Excess.getUnitInc()*PriorityTwo);
- ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityTwo);
+ if (!IgnoreBBRegPressure) {
+ // Decrease priority by the amount that register pressure exceeds the limit.
+ ResCount -= (Delta.Excess.getUnitInc()*PriorityOne);
+ // Decrease priority if register pressure exceeds the limit.
+ ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne);
+ // Decrease priority slightly if register pressure would increase over the
+ // current maximum.
+ ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo);
+ DEBUG(if (verbose) {
+ dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+ << Delta.CriticalMax.getUnitInc() <<"/"
+ << Delta.CurrentMax.getUnitInc() << ")|";
+ });
+ }
+
+ // Give a little extra priority to a .cur instruction if there is a resource
+ // available for it.
+ auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+ auto &QII = *QST.getInstrInfo();
+ if (SU->isInstr() && QII.mayBeCurLoad(SU->getInstr())) {
+ if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) {
+ ResCount += PriorityTwo;
+ DEBUG(if (verbose) dbgs() << "C|");
+ } else if (Q.getID() == BotQID &&
+ Bot.ResourceModel->isResourceAvailable(SU)) {
+ ResCount += PriorityTwo;
+ DEBUG(if (verbose) dbgs() << "C|");
+ }
+ }
+
+ // Give preference to a zero latency instruction if the dependent
+ // instruction is in the current packet.
+ if (Q.getID() == TopQID) {
+ for (const SDep &PI : SU->Preds) {
+ if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
+ PI.getLatency() == 0 &&
+ Top.ResourceModel->isInPacket(PI.getSUnit())) {
+ ResCount += PriorityThree;
+ DEBUG(if (verbose) dbgs() << "Z|");
+ }
+ }
+ } else {
+ for (const SDep &SI : SU->Succs) {
+ if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
+ SI.getLatency() == 0 &&
+ Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+ ResCount += PriorityThree;
+ DEBUG(if (verbose) dbgs() << "Z|");
+ }
+ }
+ }
+
+ // Give less preference to an instruction that will cause a stall with
+ // an instruction in the previous packet.
+ if (QII.isV60VectorInstruction(Instr)) {
+ // Check for stalls in the previous packet.
+ if (Q.getID() == TopQID) {
+ for (auto J : Top.ResourceModel->OldPacket)
+ if (QII.producesStall(J->getInstr(), Instr))
+ ResCount -= PriorityOne;
+ } else {
+ for (auto J : Bot.ResourceModel->OldPacket)
+ if (QII.producesStall(Instr, J->getInstr()))
+ ResCount -= PriorityOne;
+ }
+ }
- DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+ // If the instruction has a non-zero latency dependence with an instruction in
+ // the current packet, then it should not be scheduled yet. The case occurs
+ // when the dependent instruction is scheduled in a new packet, so the
+ // scheduler updates the current cycle and pending instructions become
+ // available.
+ if (CheckEarlyAvail) {
+ if (Q.getID() == TopQID) {
+ for (const auto &PI : SU->Preds) {
+ if (PI.getLatency() > 0 &&
+ Top.ResourceModel->isInPacket(PI.getSUnit())) {
+ ResCount -= PriorityOne;
+ DEBUG(if (verbose) dbgs() << "D|");
+ }
+ }
+ } else {
+ for (const auto &SI : SU->Succs) {
+ if (SI.getLatency() > 0 &&
+ Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+ ResCount -= PriorityOne;
+ DEBUG(if (verbose) dbgs() << "D|");
+ }
+ }
+ }
+ }
+
+ DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "Total " << std::setw(4) << ResCount << ")";
+ dbgs() << dbgstr.str();
+ });
return ResCount;
}
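
For intuition, a rough trace of the top-queue arm of SchedulingCost above (ignoring the small base priority set at the top of the function, and assuming none of the pressure, .cur, zero-latency, stall, or early-availability adjustments fire): an SU of height 8 whose resources are available and which is the sole unscheduled predecessor of 3 successors accumulates 8 * ScaleTwo = 80, is boosted by the availability bonus to 80 << FactorOne = 320 and then to 320 + PriorityThree = 395, and gains another 3 * ScaleTwo = 30 for the blocked successors, ending near 425; a single unit of excess register pressure would then subtract PriorityOne = 200 from that score.
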
@@ -539,7 +795,9 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
SchedCandidate &Candidate) {
- DEBUG(Q.dump());
+ DEBUG(if (SchedDebugVerboseLevel > 1)
+ readyQueueVerboseDump(RPTracker, Candidate, Q);
+ else Q.dump(););
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
@@ -556,6 +814,7 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
// Initialize the candidate if needed.
if (!Candidate.SU) {
+ DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -565,7 +824,7 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
// Best cost.
if (CurrentCost > Candidate.SCost) {
- DEBUG(traceCandidate("CCAND", Q, *I));
+ DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
Candidate.SU = *I;
Candidate.RPDelta = RPDelta;
Candidate.SCost = CurrentCost;
@@ -573,6 +832,69 @@ pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
continue;
}
+ // Tie breaker using Timing Class.
+ if (!DisableTCTie) {
+ auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+ auto &QII = *QST.getInstrInfo();
+
+ const MachineInstr *MI = (*I)->getInstr();
+ const MachineInstr *CandI = Candidate.SU->getInstr();
+ const InstrItineraryData *InstrItins = QST.getInstrItineraryData();
+
+ unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, MI);
+ unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, CandI);
+ DEBUG(dbgs() << "TC Tie Breaker Cand: "
+ << CandLatency << " Instr:" << InstrLatency << "\n"
+ << *MI << *CandI << "\n");
+ if (Q.getID() == TopQID && CurrentCost == Candidate.SCost) {
+ if (InstrLatency < CandLatency && TopUseShorterTie) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ DEBUG(dbgs() << "Used top shorter tie breaker\n");
+ continue;
+ } else if (InstrLatency > CandLatency && !TopUseShorterTie) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ DEBUG(dbgs() << "Used top longer tie breaker\n");
+ continue;
+ }
+ } else if (Q.getID() == BotQID && CurrentCost == Candidate.SCost) {
+ if (InstrLatency < CandLatency && BotUseShorterTie) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ DEBUG(dbgs() << "Used Bot shorter tie breaker\n");
+ continue;
+ } else if (InstrLatency > CandLatency && !BotUseShorterTie) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ DEBUG(dbgs() << "Used Bot longer tie breaker\n");
+ continue;
+ }
+ }
+ }
+
+ if (CurrentCost == Candidate.SCost) {
+ if ((Q.getID() == TopQID &&
+ (*I)->Succs.size() > Candidate.SU->Succs.size()) ||
+ (Q.getID() == BotQID &&
+ (*I)->Preds.size() < Candidate.SU->Preds.size())) {
+ DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ continue;
+ }
+ }
+
// Fall through to original instruction order.
// Only consider node order if Candidate was chosen from this Q.
if (FoundCandidate == NoCand)
@@ -586,10 +908,12 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
+ DEBUG(dbgs() << "Picked only Bottom\n");
IsTopNode = false;
return SU;
}
if (SUnit *SU = Top.pickOnlyChoice()) {
+ DEBUG(dbgs() << "Picked only Top\n");
IsTopNode = true;
return SU;
}
@@ -607,6 +931,7 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
// increase pressure for one of the excess PSets, then schedule in that
// direction first to provide more freedom in the other direction.
if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    DEBUG(dbgs() << "Preferred Bottom Node\n");
IsTopNode = false;
return BotCand.SU;
}
@@ -617,24 +942,29 @@ SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
assert(TopResult != NoCand && "failed to find the first candidate");
if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    DEBUG(dbgs() << "Preferred Top Node\n");
IsTopNode = true;
return TopCand.SU;
}
// If either Q has a single candidate that minimizes pressure above the
// original region's pressure pick it.
if (BotResult == SingleMax) {
+    DEBUG(dbgs() << "Preferred Bottom Node SingleMax\n");
IsTopNode = false;
return BotCand.SU;
}
if (TopResult == SingleMax) {
+    DEBUG(dbgs() << "Preferred Top Node SingleMax\n");
IsTopNode = true;
return TopCand.SU;
}
if (TopCand.SCost > BotCand.SCost) {
+    DEBUG(dbgs() << "Preferred Top Node Cost\n");
IsTopNode = true;
return TopCand.SU;
}
// Otherwise prefer the bottom candidate in node order.
+  DEBUG(dbgs() << "Preferred Bottom in Node order\n");
IsTopNode = false;
return BotCand.SU;
}
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 60343442e327..51c84a4cee31 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/ResourcePriorityQueue.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -54,6 +53,10 @@ class VLIWResourceModel {
unsigned TotalPackets;
public:
+ /// Save the last formed packet.
+ std::vector<SUnit*> OldPacket;
+
+public:
VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
: SchedModel(SM), TotalPackets(0) {
ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
@@ -64,6 +67,8 @@ public:
Packet.resize(SchedModel->getIssueWidth());
Packet.clear();
+ OldPacket.resize(SchedModel->getIssueWidth());
+ OldPacket.clear();
ResourcesModel->clearResources();
}
@@ -86,7 +91,12 @@ public:
bool isResourceAvailable(SUnit *SU);
bool reserveResources(SUnit *SU);
+ void savePacket();
unsigned getTotalPackets() const { return TotalPackets; }
+
+ bool isInPacket(SUnit *SU) const {
+ return std::find(Packet.begin(), Packet.end(), SU) != Packet.end();
+ }
};
/// Extend the standard ScheduleDAGMI to provide more context and override the
@@ -100,8 +110,6 @@ public:
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
void schedule() override;
- /// Perform platform-specific DAG postprocessing.
- void postprocessDAG();
};
/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
@@ -167,6 +175,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
DAG = dag;
SchedModel = smodel;
+ IssueCount = 0;
}
bool isTop() const {
@@ -234,7 +243,10 @@ protected:
SchedCandidate &Candidate);
#ifndef NDEBUG
void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
- PressureChange P = PressureChange());
+ int Cost, PressureChange P = PressureChange());
+
+ void readyQueueVerboseDump(const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate, ReadyQueue &Q);
#endif
};
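
The OldPacket member and isInPacket helper added here exist so the cost heuristic can look back at the previously formed packet. A minimal sketch of that use, assuming the Hexagon-internal headers and a hypothetical free function (the real logic is inlined in ConvergingVLIWScheduler::SchedulingCost above):

  #include "HexagonInstrInfo.h"
  #include "HexagonMachineScheduler.h"

  // Hypothetical helper: penalize SU if any instruction in the last formed
  // packet would stall it, mirroring the V60 stall check in SchedulingCost.
  static int stallPenalty(const llvm::VLIWResourceModel &RM,
                          const llvm::SUnit *SU,
                          const llvm::HexagonInstrInfo &HII) {
    int Penalty = 0;
    for (llvm::SUnit *Prev : RM.OldPacket)
      if (HII.producesStall(Prev->getInstr(), SU->getInstr()))
        Penalty += 200;  // cf. PriorityOne in HexagonMachineScheduler.cpp
    return Penalty;
  }
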
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 20c4ab112b5f..3ffb9cffc6a6 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -21,14 +21,12 @@
//
//
//===----------------------------------------------------------------------===//
-#include "llvm/PassSupport.h"
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -37,14 +35,13 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/PassSupport.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include <map>
using namespace llvm;
#define DEBUG_TYPE "hexagon-nvj"
@@ -87,12 +84,16 @@ namespace {
}
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
/// \brief A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
- bool isNewValueJumpCandidate(const MachineInstr *MI) const;
+ bool isNewValueJumpCandidate(const MachineInstr &MI) const;
};
} // end of anonymous namespace
@@ -116,7 +117,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
MachineFunction &MF) {
// Predicated instruction can not be feeder to NVJ.
- if (QII->isPredicated(II))
+ if (QII->isPredicated(*II))
return false;
// Bail out if feederReg is a paired register (double regs in
@@ -219,25 +220,24 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
MachineBasicBlock::iterator end,
MachineFunction &MF) {
- MachineInstr *MI = II;
+ MachineInstr &MI = *II;
// If the second operand of the compare is an imm, make sure it's in the
// range specified by the arch.
if (!secondReg) {
- int64_t v = MI->getOperand(2).getImm();
+ int64_t v = MI.getOperand(2).getImm();
- if (!(isUInt<5>(v) ||
- ((MI->getOpcode() == Hexagon::C2_cmpeqi ||
- MI->getOpcode() == Hexagon::C2_cmpgti) &&
- (v == -1))))
+ if (!(isUInt<5>(v) || ((MI.getOpcode() == Hexagon::C2_cmpeqi ||
+ MI.getOpcode() == Hexagon::C2_cmpgti) &&
+ (v == -1))))
return false;
}
unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
- cmpReg1 = MI->getOperand(1).getReg();
+ cmpReg1 = MI.getOperand(1).getReg();
if (secondReg) {
- cmpOp2 = MI->getOperand(2).getReg();
+ cmpOp2 = MI.getOperand(2).getReg();
    // Make sure that the second register is not from COPY
// At machine code level, we don't need this, but if we decide
@@ -367,22 +367,22 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
return 0;
}
-bool HexagonNewValueJump::isNewValueJumpCandidate(const MachineInstr *MI)
- const {
- switch (MI->getOpcode()) {
- case Hexagon::C2_cmpeq:
- case Hexagon::C2_cmpeqi:
- case Hexagon::C2_cmpgt:
- case Hexagon::C2_cmpgti:
- case Hexagon::C2_cmpgtu:
- case Hexagon::C2_cmpgtui:
- case Hexagon::C4_cmpneq:
- case Hexagon::C4_cmplte:
- case Hexagon::C4_cmplteu:
- return true;
-
- default:
- return false;
+bool HexagonNewValueJump::isNewValueJumpCandidate(
+ const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmplteu:
+ return true;
+
+ default:
+ return false;
}
}
@@ -393,6 +393,9 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
<< "********** Function: "
<< MF.getName() << "\n");
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// If we move NewValueJump before register allocation we'll need live variable
// analysis here too.
@@ -435,28 +438,27 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// Traverse the basic block - bottom up
for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
MII != E;) {
- MachineInstr *MI = --MII;
- if (MI->isDebugValue()) {
+ MachineInstr &MI = *--MII;
+ if (MI.isDebugValue()) {
continue;
}
if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
break;
- DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+ DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
- if (!foundJump &&
- (MI->getOpcode() == Hexagon::J2_jumpt ||
- MI->getOpcode() == Hexagon::J2_jumpf ||
- MI->getOpcode() == Hexagon::J2_jumptnewpt ||
- MI->getOpcode() == Hexagon::J2_jumptnew ||
- MI->getOpcode() == Hexagon::J2_jumpfnewpt ||
- MI->getOpcode() == Hexagon::J2_jumpfnew)) {
+ if (!foundJump && (MI.getOpcode() == Hexagon::J2_jumpt ||
+ MI.getOpcode() == Hexagon::J2_jumpf ||
+ MI.getOpcode() == Hexagon::J2_jumptnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumptnew ||
+ MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumpfnew)) {
// This is where you would insert your compare and
// instr that feeds compare
jmpPos = MII;
- jmpInstr = MI;
- predReg = MI->getOperand(0).getReg();
+ jmpInstr = &MI;
+ predReg = MI.getOperand(0).getReg();
afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
// If ifconverter had not messed up with the kill flags of the
@@ -485,11 +487,13 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
if (predLive)
break;
- jmpTarget = MI->getOperand(1).getMBB();
+ if (!MI.getOperand(1).isMBB())
+ continue;
+ jmpTarget = MI.getOperand(1).getMBB();
foundJump = true;
- if (MI->getOpcode() == Hexagon::J2_jumpf ||
- MI->getOpcode() == Hexagon::J2_jumpfnewpt ||
- MI->getOpcode() == Hexagon::J2_jumpfnew) {
+ if (MI.getOpcode() == Hexagon::J2_jumpf ||
+ MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+ MI.getOpcode() == Hexagon::J2_jumpfnew) {
invertPredicate = true;
}
continue;
@@ -498,41 +502,40 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// No new value jump if there is a barrier. A barrier has to be in its
// own packet. A barrier has zero operands. We conservatively bail out
// here if we see any instruction with zero operands.
- if (foundJump && MI->getNumOperands() == 0)
+ if (foundJump && MI.getNumOperands() == 0)
break;
- if (foundJump &&
- !foundCompare &&
- MI->getOperand(0).isReg() &&
- MI->getOperand(0).getReg() == predReg) {
+ if (foundJump && !foundCompare && MI.getOperand(0).isReg() &&
+ MI.getOperand(0).getReg() == predReg) {
// Not all compares can be new value compare. Arch Spec: 7.6.1.1
if (isNewValueJumpCandidate(MI)) {
- assert((MI->getDesc().isCompare()) &&
+ assert(
+ (MI.getDesc().isCompare()) &&
"Only compare instruction can be collapsed into New Value Jump");
- isSecondOpReg = MI->getOperand(2).isReg();
+ isSecondOpReg = MI.getOperand(2).isReg();
if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
afterRA, jmpPos, MF))
break;
- cmpInstr = MI;
+ cmpInstr = &MI;
cmpPos = MII;
foundCompare = true;
// We need cmpReg1 and cmpOp2(imm or reg) while building
// new value jump instruction.
- cmpReg1 = MI->getOperand(1).getReg();
- if (MI->getOperand(1).isKill())
+ cmpReg1 = MI.getOperand(1).getReg();
+ if (MI.getOperand(1).isKill())
MO1IsKill = true;
if (isSecondOpReg) {
- cmpOp2 = MI->getOperand(2).getReg();
- if (MI->getOperand(2).isKill())
+ cmpOp2 = MI.getOperand(2).getReg();
+ if (MI.getOperand(2).isKill())
MO2IsKill = true;
} else
- cmpOp2 = MI->getOperand(2).getImm();
+ cmpOp2 = MI.getOperand(2).getImm();
continue;
}
}
@@ -545,13 +548,12 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
bool foundFeeder = false;
MachineBasicBlock::iterator feederPos = MII;
- if (MI->getOperand(0).isReg() &&
- MI->getOperand(0).isDef() &&
- (MI->getOperand(0).getReg() == cmpReg1 ||
- (isSecondOpReg &&
- MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).isDef() &&
+ (MI.getOperand(0).getReg() == cmpReg1 ||
+ (isSecondOpReg &&
+ MI.getOperand(0).getReg() == (unsigned)cmpOp2))) {
- unsigned feederReg = MI->getOperand(0).getReg();
+ unsigned feederReg = MI.getOperand(0).getReg();
// First try to see if we can get the feeder from the first operand
// of the compare. If we can not, and if secondOpReg is true
@@ -600,15 +602,15 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// the operands of the feeder.
bool updatedIsKill = false;
- for (unsigned i = 0; i < MI->getNumOperands(); i++) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUse()) {
unsigned feederReg = MO.getReg();
for (MachineBasicBlock::iterator localII = feederPos,
end = jmpPos; localII != end; localII++) {
- MachineInstr *localMI = localII;
- for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
- MachineOperand &localMO = localMI->getOperand(j);
+ MachineInstr &localMI = *localII;
+ for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
+ MachineOperand &localMO = localMI.getOperand(j);
if (localMO.isReg() && localMO.isUse() &&
localMO.isKill() && feederReg == localMO.getReg()) {
// We found that there is kill of a use register
@@ -625,12 +627,12 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
if (updatedIsKill) break;
}
- MBB->splice(jmpPos, MI->getParent(), MI);
- MBB->splice(jmpPos, MI->getParent(), cmpInstr);
- DebugLoc dl = MI->getDebugLoc();
+ MBB->splice(jmpPos, MI.getParent(), MI);
+ MBB->splice(jmpPos, MI.getParent(), cmpInstr);
+ DebugLoc dl = MI.getDebugLoc();
MachineInstr *NewMI;
- assert((isNewValueJumpCandidate(cmpInstr)) &&
+ assert((isNewValueJumpCandidate(*cmpInstr)) &&
"This compare is not a New Value Jump candidate.");
unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
isSecondOpNewified,
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index fbd29cd4d6d1..11092d2b92fe 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
def s32ImmOperand : AsmOperandClass { let Name = "s32Imm"; }
+def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
def s8ImmOperand : AsmOperandClass { let Name = "s8Imm"; }
def s8Imm64Operand : AsmOperandClass { let Name = "s8Imm64"; }
def s6ImmOperand : AsmOperandClass { let Name = "s6Imm"; }
@@ -48,6 +49,7 @@ let OperandType = "OPERAND_IMMEDIATE",
DecoderMethod = "unsignedImmDecoder" in {
def s32Imm : Operand<i32> { let ParserMatchClass = s32ImmOperand;
let DecoderMethod = "s32ImmDecoder"; }
+ def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
def s8Imm : Operand<i32> { let ParserMatchClass = s8ImmOperand;
let DecoderMethod = "s8ImmDecoder"; }
def s8Imm64 : Operand<i64> { let ParserMatchClass = s8Imm64Operand;
@@ -345,22 +347,6 @@ def u1ImmPred32 : PatLeaf<(i32 imm), [{
return isUInt<1>(v);
}]>;
-def m5BImmPred : PatLeaf<(i32 imm), [{
- // m5BImmPred predicate - True if the (char) number is in range -1 .. -31
- // and will fit in a 5 bit field when made positive, for use in memops.
- // this is specific to the zero extending of a negative by CombineInstr
- int8_t v = (int8_t)N->getSExtValue();
- return (-31 <= v && v <= -1);
-}]>;
-
-def m5HImmPred : PatLeaf<(i32 imm), [{
- // m5HImmPred predicate - True if the (short) number is in range -1 .. -31
- // and will fit in a 5 bit field when made positive, for use in memops.
- // this is specific to the zero extending of a negative by CombineInstr
- int16_t v = (int16_t)N->getSExtValue();
- return (-31 <= v && v <= -1);
-}]>;
-
def m5ImmPred : PatLeaf<(i32 imm), [{
// m5ImmPred predicate - True if the number is in range -1 .. -31
// and will fit in a 5 bit field when made positive, for use in memops.
@@ -402,60 +388,6 @@ def Clr5ImmPred : PatLeaf<(i32 imm), [{
return ImmIsSingleBit(v);
}]>;
-def SetClr5ImmPred : PatLeaf<(i32 imm), [{
- // True if the immediate is in range 0..31.
- int32_t v = (int32_t)N->getSExtValue();
- return (v >= 0 && v <= 31);
-}]>;
-
-def Set4ImmPred : PatLeaf<(i32 imm), [{
- // Set4ImmPred predicate - True if the number is in the series of values:
- // [ 2^0, 2^1, ... 2^15 ].
- // For use in setbit immediate.
- uint16_t v = (int16_t)N->getSExtValue();
- // Constrain to 16 bits, and then check for single bit.
- return ImmIsSingleBit(v);
-}]>;
-
-def Clr4ImmPred : PatLeaf<(i32 imm), [{
- // Clr4ImmPred predicate - True if the number is in the series of
- // bit negated values:
- // [ 2^0, 2^1, ... 2^15 ].
- // For use in setbit and clrbit immediate.
- uint16_t v = ~ (int16_t)N->getSExtValue();
- // Constrain to 16 bits, and then check for single bit.
- return ImmIsSingleBit(v);
-}]>;
-
-def SetClr4ImmPred : PatLeaf<(i32 imm), [{
- // True if the immediate is in the range 0..15.
- int16_t v = (int16_t)N->getSExtValue();
- return (v >= 0 && v <= 15);
-}]>;
-
-def Set3ImmPred : PatLeaf<(i32 imm), [{
- // True if the number is in the series of values: [ 2^0, 2^1, ... 2^7 ].
- // For use in setbit immediate.
- uint8_t v = (int8_t)N->getSExtValue();
- // Constrain to 8 bits, and then check for single bit.
- return ImmIsSingleBit(v);
-}]>;
-
-def Clr3ImmPred : PatLeaf<(i32 imm), [{
- // True if the number is in the series of bit negated values: [ 2^0, 2^1, ... 2^7 ].
- // For use in setbit and clrbit immediate.
- uint8_t v = ~ (int8_t)N->getSExtValue();
- // Constrain to 8 bits, and then check for single bit.
- return ImmIsSingleBit(v);
-}]>;
-
-def SetClr3ImmPred : PatLeaf<(i32 imm), [{
- // True if the immediate is in the range 0..7.
- int8_t v = (int8_t)N->getSExtValue();
- return (v >= 0 && v <= 7);
-}]>;
-
-
// Extendable immediate operands.
def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
def s16ExtOperand : AsmOperandClass { let Name = "s16Ext"; }
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
new file mode 100644
index 000000000000..4dff0dbc2b71
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -0,0 +1,663 @@
+//===--- HexagonOptAddrMode.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This implements a Hexagon-specific pass to optimize addressing mode for
+// load/store instructions.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "opt-addr-mode"
+
+#include "HexagonTargetMachine.h"
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
+ cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
+ "optimization"));
+
+using namespace llvm;
+using namespace rdf;
+
+namespace llvm {
+ FunctionPass *createHexagonOptAddrMode();
+ void initializeHexagonOptAddrModePass(PassRegistry &);
+}
+
+namespace {
+class HexagonOptAddrMode : public MachineFunctionPass {
+public:
+ static char ID;
+ HexagonOptAddrMode()
+ : MachineFunctionPass(ID), HII(0), MDT(0), DFG(0), LV(0) {
+ PassRegistry &R = *PassRegistry::getPassRegistry();
+ initializeHexagonOptAddrModePass(R);
+ }
+ const char *getPassName() const override {
+ return "Optimize addressing mode of load/store";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesAll();
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ typedef DenseSet<MachineInstr *> MISetType;
+ typedef DenseMap<MachineInstr *, bool> InstrEvalMap;
+ const HexagonInstrInfo *HII;
+ MachineDominatorTree *MDT;
+ DataFlowGraph *DFG;
+ DataFlowGraph::DefStackMap DefM;
+ std::map<RegisterRef, std::map<NodeId, NodeId>> RDefMap;
+ Liveness *LV;
+ MISetType Deleted;
+
+ bool processBlock(NodeAddr<BlockNode *> BA);
+ bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+ NodeAddr<UseNode *> UseN, unsigned UseMOnum);
+ bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
+ InstrEvalMap &InstrEvalResult, short &SizeInc);
+ bool hasRepForm(MachineInstr *MI, unsigned TfrDefR);
+ bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr *MI,
+ const NodeList &UNodeList);
+ void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
+ bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
+ short getBaseWithLongOffset(const MachineInstr *MI) const;
+ void updateMap(NodeAddr<InstrNode *> IA);
+ bool constructDefMap(MachineBasicBlock *B);
+ bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+ unsigned ImmOpNum);
+ bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
+ bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
+ const MachineOperand &ImmOp, unsigned ImmOpNum);
+};
+}
+
+char HexagonOptAddrMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "opt-amode",
+ "Optimize addressing mode", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(HexagonOptAddrMode, "opt-amode", "Optimize addressing mode",
+ false, false)
+
+bool HexagonOptAddrMode::hasRepForm(MachineInstr *MI, unsigned TfrDefR) {
+ const MCInstrDesc &MID = MI->getDesc();
+
+ if ((!MID.mayStore() && !MID.mayLoad()) || HII->isPredicated(*MI))
+ return false;
+
+ if (MID.mayStore()) {
+ MachineOperand StOp = MI->getOperand(MI->getNumOperands() - 1);
+ if (StOp.isReg() && StOp.getReg() == TfrDefR)
+ return false;
+ }
+
+ if (HII->getAddrMode(MI) == HexagonII::BaseRegOffset)
+    // Transform to absolute plus register offset.
+ return (HII->getBaseWithLongOffset(MI) >= 0);
+ else if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset)
+    // Transform to absolute addressing mode.
+ return (HII->getAbsoluteForm(MI) >= 0);
+
+ return false;
+}
+
+// Check if addasl instruction can be removed. This is possible only
+// if it feeds only load/store instructions with base + register
+// offset, as these instructions can be transformed to use 'absolute plus
+// shifted register offset'.
+// ex:
+// Rs = ##foo
+// Rx = addasl(Rs, Rt, #2)
+// Rd = memw(Rx + #28)
+// Above three instructions can be replaced with Rd = memw(Rt<<#2 + ##foo+28)
+
+bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
+ MachineInstr *MI,
+ const NodeList &UNodeList) {
+  // Check the offset size in addasl; if the offset is greater than 3, return false.
+ const MachineOperand &OffsetOp = MI->getOperand(3);
+ if (!OffsetOp.isImm() || OffsetOp.getImm() > 3)
+ return false;
+
+ unsigned OffsetReg = MI->getOperand(2).getReg();
+ RegisterRef OffsetRR;
+ NodeId OffsetRegRD = 0;
+ for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) {
+ RegisterRef RR = UA.Addr->getRegRef();
+ if (OffsetReg == RR.Reg) {
+ OffsetRR = RR;
+ OffsetRegRD = UA.Addr->getReachingDef();
+ }
+ }
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UA = *I;
+ NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
+ if ((UA.Addr->getFlags() & NodeAttrs::PhiRef) ||
+ RDefMap[OffsetRR][IA.Id] != OffsetRegRD)
+ return false;
+
+ MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode();
+ NodeAddr<DefNode *> OffsetRegDN = DFG->addr<DefNode *>(OffsetRegRD);
+ // Reaching Def to an offset register can't be a phi.
+ if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ MI->getParent() != UseMI->getParent())
+ return false;
+
+ const MCInstrDesc &UseMID = UseMI->getDesc();
+ if ((!UseMID.mayLoad() && !UseMID.mayStore()) ||
+ HII->getAddrMode(UseMI) != HexagonII::BaseImmOffset ||
+ getBaseWithLongOffset(UseMI) < 0)
+ return false;
+
+ // Addasl output can't be a store value.
+ if (UseMID.mayStore() && UseMI->getOperand(2).isReg() &&
+ UseMI->getOperand(2).getReg() == MI->getOperand(0).getReg())
+ return false;
+
+ for (auto &Mo : UseMI->operands())
+ if (Mo.isFI())
+ return false;
+ }
+ return true;
+}
+
+bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
+ NodeList &UNodeList) {
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UN = *I;
+ RegisterRef UR = UN.Addr->getRegRef();
+ NodeSet Visited, Defs;
+ const auto &ReachingDefs = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+ if (ReachingDefs.size() > 1) {
+ DEBUG({
+ dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
+ for (auto DI : ReachingDefs) {
+ NodeAddr<UseNode *> DA = DFG->addr<UseNode *>(DI);
+ NodeAddr<StmtNode *> TempIA = DA.Addr->getOwner(*DFG);
+ dbgs() << "\t\t[Reaching Def]: "
+ << Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
+ }
+ });
+ return false;
+ }
+ }
+ return true;
+}
+
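+// Collect all uses reached by the definitions in SA. Uses reached through
+// a phi node are expanded into the phi's real uses.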
+void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
+ NodeList &UNodeList) {
+ for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
+ DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
+ << "\n");
+ RegisterRef DR = DA.Addr->getRegRef();
+ auto UseSet = LV->getAllReachedUses(DR, DA);
+
+ for (auto UI : UseSet) {
+ NodeAddr<UseNode *> UA = DFG->addr<UseNode *>(UI);
+ DEBUG({
+ NodeAddr<StmtNode *> TempIA = UA.Addr->getOwner(*DFG);
+ dbgs() << "\t\t\t[Reached Use]: "
+ << Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
+ });
+
+ if (UA.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeAddr<PhiNode *> PA = UA.Addr->getOwner(*DFG);
+ NodeId id = PA.Id;
+ const Liveness::RefMap &phiUse = LV->getRealUses(id);
+ DEBUG(dbgs() << "\t\t\t\tphi real Uses"
+ << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
+ if (phiUse.size() > 0) {
+ for (auto I : phiUse) {
+ if (DR != I.first)
+ continue;
+ auto phiUseSet = I.second;
+ for (auto phiUI : phiUseSet) {
+ NodeAddr<UseNode *> phiUA = DFG->addr<UseNode *>(phiUI);
+ UNodeList.push_back(phiUA);
+ }
+ }
+ }
+ } else
+ UNodeList.push_back(UA);
+ }
+ }
+}
+
+bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
+ const NodeList &UNodeList,
+ InstrEvalMap &InstrEvalResult,
+ short &SizeInc) {
+ bool KeepTfr = false;
+ bool HasRepInstr = false;
+ InstrEvalResult.clear();
+
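+ // SizeInc estimates the net code-size change: each rewritten memory
+ // instruction counts as growth, while instructions that become removable
+ // (addasl, the original transfer) count as savings.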
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ bool CanBeReplaced = false;
+ NodeAddr<UseNode *> UN = *I;
+ NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
+ MachineInstr *MI = SN.Addr->getCode();
+ const MCInstrDesc &MID = MI->getDesc();
+ if ((MID.mayLoad() || MID.mayStore())) {
+ if (!hasRepForm(MI, tfrDefR)) {
+ KeepTfr = true;
+ continue;
+ }
+ SizeInc++;
+ CanBeReplaced = true;
+ } else if (MI->getOpcode() == Hexagon::S2_addasl_rrri) {
+ NodeList AddaslUseList;
+
+ DEBUG(dbgs() << "\nGetting ReachedUses for === " << *MI << "\n");
+ getAllRealUses(SN, AddaslUseList);
+ // Process phi nodes.
+ if (allValidCandidates(SN, AddaslUseList) &&
+ canRemoveAddasl(SN, MI, AddaslUseList)) {
+ SizeInc += AddaslUseList.size();
+ SizeInc -= 1; // Reduce size by 1 as addasl itself can be removed.
+ CanBeReplaced = true;
+ } else
+ SizeInc++;
+ } else
+ // Currently, only load/store and addasl are handled.
+ // Some other instructions to consider -
+ // A2_add -> A2_addi
+ // M4_mpyrr_addr -> M4_mpyrr_addi
+ KeepTfr = true;
+
+ InstrEvalResult[MI] = CanBeReplaced;
+ HasRepInstr |= CanBeReplaced;
+ }
+
+ // Reduce total size by 2 if original tfr can be deleted.
+ if (!KeepTfr)
+ SizeInc -= 2;
+
+ return HasRepInstr;
+}
+
+bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
+ unsigned ImmOpNum) {
+ bool Changed = false;
+ MachineBasicBlock *BB = OldMI->getParent();
+ auto UsePos = MachineBasicBlock::iterator(OldMI);
+ MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+ ++InsertPt;
+ unsigned OpStart;
+ unsigned OpEnd = OldMI->getNumOperands();
+ MachineInstrBuilder MIB;
+
+ if (ImmOpNum == 1) {
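+ // The transferred register is used as the base address; the global
+ // address replaces it.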
+ if (HII->getAddrMode(OldMI) == HexagonII::BaseRegOffset) {
+ short NewOpCode = HII->getBaseWithLongOffset(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(0));
+ MIB.addOperand(OldMI->getOperand(2));
+ MIB.addOperand(OldMI->getOperand(3));
+ MIB.addOperand(ImmOp);
+ OpStart = 4;
+ Changed = true;
+ } else if (HII->getAddrMode(OldMI) == HexagonII::BaseImmOffset) {
+ short NewOpCode = HII->getAbsoluteForm(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
+ .addOperand(OldMI->getOperand(0));
+ const GlobalValue *GV = ImmOp.getGlobal();
+ int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
+
+ MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+ OpStart = 3;
+ Changed = true;
+ } else
+ Changed = false;
+
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ } else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
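+ // The transferred register is used as a register offset with a zero
+ // shift; rewrite the load to use the global as an immediate offset.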
+ short NewOpCode = HII->xformRegToImmOffset(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(0));
+ MIB.addOperand(OldMI->getOperand(1));
+ MIB.addOperand(ImmOp);
+ OpStart = 4;
+ Changed = true;
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ }
+
+ if (Changed)
+ for (unsigned i = OpStart; i < OpEnd; ++i)
+ MIB.addOperand(OldMI->getOperand(i));
+
+ return Changed;
+}
+
+bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+ unsigned ImmOpNum) {
+ bool Changed = false;
+ unsigned OpStart;
+ unsigned OpEnd = OldMI->getNumOperands();
+ MachineBasicBlock *BB = OldMI->getParent();
+ auto UsePos = MachineBasicBlock::iterator(OldMI);
+ MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+ ++InsertPt;
+ MachineInstrBuilder MIB;
+ if (ImmOpNum == 0) {
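+ // For stores the base address is the first operand; the global address
+ // replaces it.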
+ if (HII->getAddrMode(OldMI) == HexagonII::BaseRegOffset) {
+ short NewOpCode = HII->getBaseWithLongOffset(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(1));
+ MIB.addOperand(OldMI->getOperand(2));
+ MIB.addOperand(ImmOp);
+ MIB.addOperand(OldMI->getOperand(3));
+ OpStart = 4;
+ } else if (HII->getAddrMode(OldMI) == HexagonII::BaseImmOffset) {
+ short NewOpCode = HII->getAbsoluteForm(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ const GlobalValue *GV = ImmOp.getGlobal();
+ int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
+ MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+ MIB.addOperand(OldMI->getOperand(2));
+ OpStart = 3;
+ }
+ Changed = true;
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ } else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
+ short NewOpCode = HII->xformRegToImmOffset(OldMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+ MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+ MIB.addOperand(OldMI->getOperand(0));
+ MIB.addOperand(ImmOp);
+ MIB.addOperand(OldMI->getOperand(1));
+ OpStart = 2;
+ Changed = true;
+ DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+ DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+ }
+ if (Changed)
+ for (unsigned i = OpStart; i < OpEnd; ++i)
+ MIB.addOperand(OldMI->getOperand(i));
+
+ return Changed;
+}
+
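+// Return the base+long-offset opcode for MI. A base+immediate instruction
+// is first mapped to its base+register form and then to the long-offset
+// variant.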
+short HexagonOptAddrMode::getBaseWithLongOffset(const MachineInstr *MI) const {
+ if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset) {
+ short TempOpCode = HII->getBaseWithRegOffset(MI);
+ return HII->getBaseWithLongOffset(TempOpCode);
+ } else
+ return HII->getBaseWithLongOffset(MI);
+}
+
+bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
+ MachineInstr *AddAslMI,
+ const MachineOperand &ImmOp,
+ unsigned ImmOpNum) {
+ NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
+
+ DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
+
+ NodeList UNodeList;
+ getAllRealUses(SA, UNodeList);
+
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UseUN = *I;
+ assert(!(UseUN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ "Can't transform this 'AddAsl' instruction!");
+
+ NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
+ DEBUG(dbgs() << "[InstrNode]: " << Print<NodeAddr<InstrNode *>>(UseIA, *DFG)
+ << "\n");
+ MachineInstr *UseMI = UseIA.Addr->getCode();
+ DEBUG(dbgs() << "[MI <BB#" << UseMI->getParent()->getNumber()
+ << ">]: " << *UseMI << "\n");
+ const MCInstrDesc &UseMID = UseMI->getDesc();
+ assert(HII->getAddrMode(UseMI) == HexagonII::BaseImmOffset);
+
+ auto UsePos = MachineBasicBlock::iterator(UseMI);
+ MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+ short NewOpCode = getBaseWithLongOffset(UseMI);
+ assert(NewOpCode >= 0 && "Invalid New opcode\n");
+
+ unsigned OpStart;
+ unsigned OpEnd = UseMI->getNumOperands();
+
+ MachineBasicBlock *BB = UseMI->getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
+ // Change mem(Rs + #) -> mem(Rt << # + ##).
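+ // For example, with the names from the addasl example above:
+ //   Rx = addasl(Rs, Rt, #2)   ; Rs = ##foo
+ //   Rd = memw(Rx + #28)
+ // becomes
+ //   Rd = memw(Rt<<#2 + ##foo+28)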
+ if (UseMID.mayLoad()) {
+ MIB.addOperand(UseMI->getOperand(0));
+ MIB.addOperand(AddAslMI->getOperand(2));
+ MIB.addOperand(AddAslMI->getOperand(3));
+ const GlobalValue *GV = ImmOp.getGlobal();
+ MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm(),
+ ImmOp.getTargetFlags());
+ OpStart = 3;
+ } else if (UseMID.mayStore()) {
+ MIB.addOperand(AddAslMI->getOperand(2));
+ MIB.addOperand(AddAslMI->getOperand(3));
+ const GlobalValue *GV = ImmOp.getGlobal();
+ MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm(),
+ ImmOp.getTargetFlags());
+ MIB.addOperand(UseMI->getOperand(2));
+ OpStart = 3;
+ } else
+ llvm_unreachable("Unhandled instruction");
+
+ for (unsigned i = OpStart; i < OpEnd; ++i)
+ MIB.addOperand(UseMI->getOperand(i));
+
+ Deleted.insert(UseMI);
+ }
+
+ return true;
+}
+
+bool HexagonOptAddrMode::xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+ NodeAddr<UseNode *> UseN,
+ unsigned UseMOnum) {
+ const MachineOperand ImmOp = TfrMI->getOperand(1);
+ const MCInstrDesc &MID = UseMI->getDesc();
+ bool Changed = false;
+ if (MID.mayLoad())
+ Changed = changeLoad(UseMI, ImmOp, UseMOnum);
+ else if (MID.mayStore())
+ Changed = changeStore(UseMI, ImmOp, UseMOnum);
+ else if (UseMI->getOpcode() == Hexagon::S2_addasl_rrri)
+ Changed = changeAddAsl(UseN, UseMI, ImmOp, UseMOnum);
+
+ if (Changed)
+ Deleted.insert(UseMI);
+
+ return Changed;
+}
+
+bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
+ bool Changed = false;
+
+ for (auto IA : BA.Addr->members(*DFG)) {
+ if (!DFG->IsCode<NodeAttrs::Stmt>(IA))
+ continue;
+
+ NodeAddr<StmtNode *> SA = IA;
+ MachineInstr *MI = SA.Addr->getCode();
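+ // Only A2_tfrsi instructions that transfer a global address are
+ // candidates.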
+ if (MI->getOpcode() != Hexagon::A2_tfrsi ||
+ !MI->getOperand(1).isGlobal())
+ continue;
+
+ DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n");
+ DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG)
+ << "\n");
+
+ NodeList UNodeList;
+ getAllRealUses(SA, UNodeList);
+
+ if (!allValidCandidates(SA, UNodeList))
+ continue;
+
+ short SizeInc = 0;
+ unsigned DefR = MI->getOperand(0).getReg();
+ InstrEvalMap InstrEvalResult;
+
+ // Analyze all uses and calculate the increase in size. Perform the
+ // optimization only if the size increase is within the allowed
+ // code-growth limit.
+ if (!analyzeUses(DefR, UNodeList, InstrEvalResult, SizeInc))
+ continue;
+ if (SizeInc > CodeGrowthLimit)
+ continue;
+
+ bool KeepTfr = false;
+
+ DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size() << "\n");
+ DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
+ for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+ NodeAddr<UseNode *> UseN = *I;
+ assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+ "Found a PhiRef node as a real reached use!!");
+
+ NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
+ MachineInstr *UseMI = OwnerN.Addr->getCode();
+ DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
+ << ">]: " << *UseMI << "\n");
+
+ int UseMOnum = -1;
+ unsigned NumOperands = UseMI->getNumOperands();
+ for (unsigned j = 0; j < NumOperands - 1; ++j) {
+ const MachineOperand &op = UseMI->getOperand(j);
+ if (op.isReg() && op.isUse() && DefR == op.getReg())
+ UseMOnum = j;
+ }
+ assert(UseMOnum >= 0 && "Invalid reached use!");
+
+ if (InstrEvalResult[UseMI])
+ // Change UseMI if replacement is possible.
+ Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum);
+ else
+ KeepTfr = true;
+ }
+ if (!KeepTfr)
+ Deleted.insert(MI);
+ }
+ return Changed;
+}
+
+void HexagonOptAddrMode::updateMap(NodeAddr<InstrNode *> IA) {
+ RegisterSet RRs;
+ for (NodeAddr<RefNode *> RA : IA.Addr->members(*DFG))
+ RRs.insert(RA.Addr->getRegRef());
+ bool Common = false;
+ for (auto &R : RDefMap) {
+ if (!RRs.count(R.first))
+ continue;
+ Common = true;
+ break;
+ }
+ if (!Common)
+ return;
+
+ for (auto &R : RDefMap) {
+ auto F = DefM.find(R.first);
+ if (F == DefM.end() || F->second.empty())
+ continue;
+ R.second[IA.Id] = F->second.top()->Id;
+ }
+}
+
+bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
+ bool Changed = false;
+ auto BA = DFG->getFunc().Addr->findBlock(B, *DFG);
+ DFG->markBlock(BA.Id, DefM);
+
+ for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
+ updateMap(IA);
+ DFG->pushDefs(IA, DefM);
+ }
+
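+ // Recurse over the dominator tree so that DefM holds the dominating
+ // definitions when each child block is visited.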
+ MachineDomTreeNode *N = MDT->getNode(B);
+ for (auto I : *N)
+ Changed |= constructDefMap(I->getBlock());
+
+ DFG->releaseBlock(BA.Id, DefM);
+ return Changed;
+}
+
+bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &MRI = MF.getRegInfo();
+ HII = HST.getInstrInfo();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ const auto &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetOperandInfo TOI(*HII);
+
+ RegisterAliasInfo RAI(TRI);
+ DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, RAI, TOI);
+ G.build();
+ DFG = &G;
+
+ Liveness L(MRI, *DFG);
+ L.computePhiInfo();
+ LV = &L;
+
+ constructDefMap(&DFG->getMF().front());
+
+ Deleted.clear();
+ NodeAddr<FuncNode *> FA = DFG->getFunc();
+ DEBUG(dbgs() << "==== [RefMap#]=====:\n "
+ << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
+
+ for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
+ Changed |= processBlock(BA);
+
+ for (auto MI : Deleted)
+ MI->eraseFromParent();
+
+ if (Changed) {
+ G.build();
+ L.computeLiveIns();
+ L.resetLiveIns();
+ L.resetKills();
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonOptAddrMode() {
+ return new HexagonOptAddrMode();
+}
diff --git a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
index 1723771550c9..7937a7908b06 100644
--- a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
+++ b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -69,6 +69,9 @@ bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) {
}
bool HexagonOptimizeSZextends::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
unsigned Idx = 1;
// Try to optimize sign extends in formal parameters. It's relying on
// callee already sign extending the values. I'm not sure if our ABI
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index e68ff85b1da6..b064decc5c76 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -67,11 +67,11 @@ static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
cl::desc("Disable Optimization of PNotP"));
static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Disable Optimization of Sign/Zero Extends"));
static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Disable Optimization of extensions to i64."));
namespace llvm {
@@ -112,6 +112,9 @@ INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
false, false)
bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
QRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -129,15 +132,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
PeepholeDoubleRegsMap.clear();
// Traverse the basic block.
- for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
- ++MII) {
- MachineInstr *MI = MII;
+ for (MachineInstr &MI : *MBB) {
// Look for sign extends:
// %vreg170<def> = SXTW %vreg166
- if (!DisableOptSZExt && MI->getOpcode() == Hexagon::A2_sxtw) {
- assert (MI->getNumOperands() == 2);
- MachineOperand &Dst = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
+ if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
+ assert(MI.getNumOperands() == 2);
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
unsigned DstReg = Dst.getReg();
unsigned SrcReg = Src.getReg();
// Just handle virtual registers.
@@ -152,12 +153,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Look for %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
// %vreg170:DoublRegs, %vreg169:IntRegs
- if (!DisableOptExtTo64 &&
- MI->getOpcode () == Hexagon::A4_combineir) {
- assert (MI->getNumOperands() == 3);
- MachineOperand &Dst = MI->getOperand(0);
- MachineOperand &Src1 = MI->getOperand(1);
- MachineOperand &Src2 = MI->getOperand(2);
+ if (!DisableOptExtTo64 && MI.getOpcode() == Hexagon::A4_combineir) {
+ assert(MI.getNumOperands() == 3);
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src1 = MI.getOperand(1);
+ MachineOperand &Src2 = MI.getOperand(2);
if (Src1.getImm() != 0)
continue;
unsigned DstReg = Dst.getReg();
@@ -170,11 +170,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
// and convert into
// %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
- if (MI->getOpcode() == Hexagon::S2_lsr_i_p) {
- assert(MI->getNumOperands() == 3);
- MachineOperand &Dst = MI->getOperand(0);
- MachineOperand &Src1 = MI->getOperand(1);
- MachineOperand &Src2 = MI->getOperand(2);
+ if (MI.getOpcode() == Hexagon::S2_lsr_i_p) {
+ assert(MI.getNumOperands() == 3);
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src1 = MI.getOperand(1);
+ MachineOperand &Src2 = MI.getOperand(2);
if (Src2.getImm() != 32)
continue;
unsigned DstReg = Dst.getReg();
@@ -184,11 +184,10 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
}
// Look for P=NOT(P).
- if (!DisablePNotP &&
- (MI->getOpcode() == Hexagon::C2_not)) {
- assert (MI->getNumOperands() == 2);
- MachineOperand &Dst = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
+ if (!DisablePNotP && MI.getOpcode() == Hexagon::C2_not) {
+ assert(MI.getNumOperands() == 2);
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
unsigned DstReg = Dst.getReg();
unsigned SrcReg = Src.getReg();
// Just handle virtual registers.
@@ -203,10 +202,10 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Look for copy:
// %vreg176<def> = COPY %vreg170:subreg_loreg
- if (!DisableOptSZExt && MI->isCopy()) {
- assert (MI->getNumOperands() == 2);
- MachineOperand &Dst = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
+ if (!DisableOptSZExt && MI.isCopy()) {
+ assert(MI.getNumOperands() == 2);
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
// Make sure we are copying the lower 32 bits.
if (Src.getSubReg() != Hexagon::subreg_loreg)
@@ -219,22 +218,18 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
// Change the 1st operand.
- MI->RemoveOperand(1);
- MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+ MI.RemoveOperand(1);
+ MI.addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
} else {
DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
PeepholeDoubleRegsMap.find(SrcReg);
if (DI != PeepholeDoubleRegsMap.end()) {
std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
- MI->RemoveOperand(1);
- MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
- false /*isDef*/,
- false /*isImp*/,
- false /*isKill*/,
- false /*isDead*/,
- false /*isUndef*/,
- false /*isEarlyClobber*/,
- PeepholeSrc.second));
+ MI.RemoveOperand(1);
+ MI.addOperand(MachineOperand::CreateReg(
+ PeepholeSrc.first, false /*isDef*/, false /*isImp*/,
+ false /*isKill*/, false /*isDead*/, false /*isUndef*/,
+ false /*isEarlyClobber*/, PeepholeSrc.second));
}
}
}
@@ -244,7 +239,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (!DisablePNotP) {
bool Done = false;
if (QII->isPredicated(MI)) {
- MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op0 = MI.getOperand(0);
unsigned Reg0 = Op0.getReg();
const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
if (RC0->getID() == Hexagon::PredRegsRegClassID) {
@@ -254,9 +249,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
// Change the 1st operand and, flip the opcode.
- MI->getOperand(0).setReg(PeepholeSrc);
- int NewOp = QII->getInvertedPredicatedOpcode(MI->getOpcode());
- MI->setDesc(QII->get(NewOp));
+ MI.getOperand(0).setReg(PeepholeSrc);
+ int NewOp = QII->getInvertedPredicatedOpcode(MI.getOpcode());
+ MI.setDesc(QII->get(NewOp));
Done = true;
}
}
@@ -265,7 +260,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (!Done) {
// Handle special instructions.
- unsigned Op = MI->getOpcode();
+ unsigned Op = MI.getOpcode();
unsigned NewOp = 0;
unsigned PR = 1, S1 = 2, S2 = 3; // Operand indices.
@@ -282,15 +277,15 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
break;
}
if (NewOp) {
- unsigned PSrc = MI->getOperand(PR).getReg();
+ unsigned PSrc = MI.getOperand(PR).getReg();
if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
- MI->getOperand(PR).setReg(POrig);
- MI->setDesc(QII->get(NewOp));
+ MI.getOperand(PR).setReg(POrig);
+ MI.setDesc(QII->get(NewOp));
// Swap operands S1 and S2.
- MachineOperand Op1 = MI->getOperand(S1);
- MachineOperand Op2 = MI->getOperand(S2);
- ChangeOpInto(MI->getOperand(S1), Op2);
- ChangeOpInto(MI->getOperand(S2), Op1);
+ MachineOperand Op1 = MI.getOperand(S1);
+ MachineOperand Op2 = MI.getOperand(S2);
+ ChangeOpInto(MI.getOperand(S1), Op2);
+ ChangeOpInto(MI.getOperand(S2), Op1);
}
} // if (NewOp)
} // if (!Done)
@@ -308,6 +303,7 @@ void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
case MachineOperand::MO_Register:
if (Src.isReg()) {
Dst.setReg(Src.getReg());
+ Dst.setSubReg(Src.getSubReg());
} else if (Src.isImm()) {
Dst.ChangeToImmediate(Src.getImm());
} else {
@@ -322,6 +318,7 @@ void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
Src.isKill(), Src.isDead(), Src.isUndef(),
Src.isDebug());
+ Dst.setSubReg(Src.getSubReg());
} else {
llvm_unreachable("Unexpected src operand type");
}
diff --git a/lib/Target/Hexagon/HexagonRDF.h b/lib/Target/Hexagon/HexagonRDF.h
index 00c1889e8eb5..9a63150c377d 100644
--- a/lib/Target/Hexagon/HexagonRDF.h
+++ b/lib/Target/Hexagon/HexagonRDF.h
@@ -13,7 +13,6 @@
namespace llvm {
class TargetRegisterInfo;
-}
namespace rdf {
struct HexagonRegisterAliasInfo : public RegisterAliasInfo {
@@ -22,7 +21,8 @@ namespace rdf {
bool covers(RegisterRef RA, RegisterRef RR) const override;
bool covers(const RegisterSet &RRs, RegisterRef RR) const override;
};
-}
+} // namespace rdf
+} // namespace llvm
#endif
diff --git a/lib/Target/Hexagon/HexagonRDFOpt.cpp b/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 3fcda984d265..642a8785def9 100644
--- a/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -35,8 +35,8 @@ namespace llvm {
}
namespace {
- cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX));
unsigned RDFCount = 0;
+ cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX));
cl::opt<bool> RDFDump("rdf-dump", cl::init(false));
class HexagonRDFOpt : public MachineFunctionPass {
@@ -55,6 +55,11 @@ namespace {
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
static char ID;
private:
@@ -71,6 +76,13 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false)
+namespace {
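+// Hexagon-specific copy propagation: in addition to plain copies (A2_tfr,
+// A2_addi with a zero immediate), A2_combinew is interpreted as a pair of
+// sub-register copies.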
+struct HexagonCP : public CopyPropagation {
+ HexagonCP(DataFlowGraph &G) : CopyPropagation(G) {}
+ bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) override;
+};
+
+
struct HexagonDCE : public DeadCodeElimination {
HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI)
: DeadCodeElimination(G, MRI) {}
@@ -79,6 +91,44 @@ struct HexagonDCE : public DeadCodeElimination {
bool run();
};
+} // end anonymous namespace
+
+
+bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
+ auto mapRegs = [MI,&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
+ EM.insert(std::make_pair(DstR, SrcR));
+ };
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::A2_combinew: {
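+ // Rdd = combine(Rs, Rt): each half of the destination pair is a copy of
+ // the corresponding source register.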
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &HiOp = MI->getOperand(1);
+ const MachineOperand &LoOp = MI->getOperand(2);
+ assert(DstOp.getSubReg() == 0 && "Unexpected subregister");
+ mapRegs({ DstOp.getReg(), Hexagon::subreg_hireg },
+ { HiOp.getReg(), HiOp.getSubReg() });
+ mapRegs({ DstOp.getReg(), Hexagon::subreg_loreg },
+ { LoOp.getReg(), LoOp.getSubReg() });
+ return true;
+ }
+ case Hexagon::A2_addi: {
+ const MachineOperand &A = MI->getOperand(2);
+ if (!A.isImm() || A.getImm() != 0)
+ return false;
+ }
+ // Rd = add(Rs, #0) is a plain copy; fall through to the A2_tfr case.
+ case Hexagon::A2_tfr: {
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ mapRegs({ DstOp.getReg(), DstOp.getSubReg() },
+ { SrcOp.getReg(), SrcOp.getSubReg() });
+ return true;
+ }
+ }
+
+ return CopyPropagation::interpretAsCopy(MI, EM);
+}
bool HexagonDCE::run() {
@@ -106,6 +156,7 @@ bool HexagonDCE::run() {
}
}
+
// Nodes to remove.
SetVector<NodeId> Remove = DeadInstrs;
@@ -216,6 +267,9 @@ bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) {
bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
if (RDFLimit.getPosition()) {
if (RDFCount >= RDFLimit)
return false;
@@ -227,31 +281,36 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
MRI = &MF.getRegInfo();
-
- HexagonRegisterAliasInfo HAI(HRI);
- TargetOperandInfo TOI(HII);
+ bool Changed;
if (RDFDump)
MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr);
+
+ HexagonRegisterAliasInfo HAI(HRI);
+ TargetOperandInfo TOI(HII);
DataFlowGraph G(MF, HII, HRI, *MDT, MDF, HAI, TOI);
- G.build();
- if (RDFDump) {
- dbgs() << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
- dbgs() << MF.getName() << '\n';
- }
+ // Dead phi nodes are necessary for copy propagation: we can add a use
+ // of a register in a block where it would need a phi node, but which
+ // was dead (and removed) during the graph build time.
+ G.build(BuildOptions::KeepDeadPhis);
- bool Changed;
- CopyPropagation CP(G);
+ if (RDFDump)
+ dbgs() << "Starting copy propagation on: " << MF.getName() << '\n'
+ << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
+ HexagonCP CP(G);
CP.trace(RDFDump);
Changed = CP.run();
- if (Changed)
- G.build();
+ if (RDFDump)
+ dbgs() << "Starting dead code elimination on: " << MF.getName() << '\n'
+ << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
HexagonDCE DCE(G, *MRI);
DCE.trace(RDFDump);
Changed |= DCE.run();
if (Changed) {
+ if (RDFDump)
+ dbgs() << "Starting liveness recomputation on: " << MF.getName() << '\n';
Liveness LV(*MRI, G);
LV.trace(RDFDump);
LV.computeLiveIns();
@@ -261,6 +320,7 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
if (RDFDump)
MF.print(dbgs() << "After " << getPassName() << "\n", nullptr);
+
return false;
}
@@ -268,5 +328,3 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
FunctionPass *llvm::createHexagonRDFOpt() {
return new HexagonRDFOpt();
}
-
-
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 6e5f7324aca8..23ebfd484be9 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -29,7 +29,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -54,24 +53,51 @@ bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
const MCPhysReg *
-HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF) const {
- static const MCPhysReg CallerSavedRegsV4[] = {
- Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
- Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
- Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
- Hexagon::R15, 0
+HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
+ const TargetRegisterClass *RC) const {
+ using namespace Hexagon;
+
+ static const MCPhysReg Int32[] = {
+ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, 0
+ };
+ static const MCPhysReg Int64[] = {
+ D0, D1, D2, D3, D4, D5, D6, D7, 0
+ };
+ static const MCPhysReg Pred[] = {
+ P0, P1, P2, P3, 0
+ };
+ static const MCPhysReg VecSgl[] = {
+ V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13,
+ V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27,
+ V28, V29, V30, V31, 0
+ };
+ static const MCPhysReg VecDbl[] = {
+ W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, 0
};
- auto &HST = static_cast<const HexagonSubtarget&>(MF->getSubtarget());
- switch (HST.getHexagonArchVersion()) {
- case HexagonSubtarget::V4:
- case HexagonSubtarget::V5:
- case HexagonSubtarget::V55:
- case HexagonSubtarget::V60:
- return CallerSavedRegsV4;
+ switch (RC->getID()) {
+ case IntRegsRegClassID:
+ return Int32;
+ case DoubleRegsRegClassID:
+ return Int64;
+ case PredRegsRegClassID:
+ return Pred;
+ case VectorRegsRegClassID:
+ case VectorRegs128BRegClassID:
+ return VecSgl;
+ case VecDblRegsRegClassID:
+ case VecDblRegs128BRegClassID:
+ return VecDbl;
+ default:
+ break;
}
- llvm_unreachable(
- "Callee saved registers requested for unknown archtecture version");
+
+ static const MCPhysReg Empty[] = { 0 };
+#ifndef NDEBUG
+ dbgs() << "Register class: " << getRegClassName(RC) << "\n";
+#endif
+ llvm_unreachable("Unexpected register class");
+ return Empty;
}
@@ -83,33 +109,48 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27, 0
};
+ // Functions that contain a call to __builtin_eh_return also save the first 4
+ // parameter registers.
+ static const MCPhysReg CalleeSavedRegsV3EHReturn[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3,
+ Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+ Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23,
+ Hexagon::R24, Hexagon::R25, Hexagon::R26, Hexagon::R27, 0
+ };
+
+ bool HasEHReturn = MF->getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
+
switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
case HexagonSubtarget::V4:
case HexagonSubtarget::V5:
case HexagonSubtarget::V55:
case HexagonSubtarget::V60:
- return CalleeSavedRegsV3;
+ return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
}
+
llvm_unreachable("Callee saved registers requested for unknown architecture "
"version");
}
+
BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
const {
BitVector Reserved(getNumRegs());
- Reserved.set(HEXAGON_RESERVED_REG_1);
- Reserved.set(HEXAGON_RESERVED_REG_2);
Reserved.set(Hexagon::R29);
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
Reserved.set(Hexagon::PC);
- Reserved.set(Hexagon::GP);
Reserved.set(Hexagon::D14);
Reserved.set(Hexagon::D15);
Reserved.set(Hexagon::LC0);
Reserved.set(Hexagon::LC1);
Reserved.set(Hexagon::SA0);
Reserved.set(Hexagon::SA1);
+ Reserved.set(Hexagon::UGP);
+ Reserved.set(Hexagon::GP);
+ Reserved.set(Hexagon::CS0);
+ Reserved.set(Hexagon::CS1);
+ Reserved.set(Hexagon::CS);
return Reserved;
}
@@ -135,6 +176,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int Offset = HFI.getFrameIndexReference(MF, FI, BP);
// Add the offset from the instruction.
int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
+ bool IsKill = false;
unsigned Opc = MI.getOpcode();
switch (Opc) {
@@ -149,20 +191,22 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
break;
}
- if (HII.isValidOffset(Opc, RealOffset)) {
- MI.getOperand(FIOp).ChangeToRegister(BP, false);
- MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
- return;
+ if (!HII.isValidOffset(Opc, RealOffset)) {
+ // If the offset is not valid, calculate the address in a temporary
+ // register and use it with offset 0.
+ auto &MRI = MF.getRegInfo();
+ unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
+ .addReg(BP)
+ .addImm(RealOffset);
+ BP = TmpR;
+ RealOffset = 0;
+ IsKill = true;
}
-#ifndef NDEBUG
- const Function *F = MF.getFunction();
- dbgs() << "In function ";
- if (F) dbgs() << F->getName();
- else dbgs() << "<?>";
- dbgs() << ", BB#" << MB.getNumber() << "\n" << MI;
-#endif
- llvm_unreachable("Unhandled instruction");
+ MI.getOperand(FIOp).ChangeToRegister(BP, false, false, IsKill);
+ MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index db7e0f27815d..fc70679bc930 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -21,21 +21,6 @@
#define GET_REGINFO_HEADER
#include "HexagonGenRegisterInfo.inc"
-//
-// We try not to hard code the reserved registers in our code,
-// so the following two macros were defined. However, there
-// are still a few places that R11 and R10 are hard wired.
-// See below. If, in the future, we decided to change the reserved
-// register. Don't forget changing the following places.
-//
-// 1. the "Defs" set of STriw_pred in HexagonInstrInfo.td
-// 2. the "Defs" set of LDri_pred in HexagonInstrInfo.td
-// 3. the definition of "IntRegs" in HexagonRegisterInfo.td
-// 4. the definition of "DoubleRegs" in HexagonRegisterInfo.td
-//
-#define HEXAGON_RESERVED_REG_1 Hexagon::R10
-#define HEXAGON_RESERVED_REG_2 Hexagon::R11
-
namespace llvm {
class HexagonRegisterInfo : public HexagonGenRegisterInfo {
public:
@@ -76,7 +61,8 @@ public:
unsigned getFrameRegister() const;
unsigned getStackRegister() const;
- const MCPhysReg *getCallerSavedRegs(const MachineFunction *MF) const;
+ const MCPhysReg *getCallerSavedRegs(const MachineFunction *MF,
+ const TargetRegisterClass *RC) const;
unsigned getFirstCallerSavedNonParamReg() const;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 81629dc6d47f..4d0d411d73da 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -13,8 +13,8 @@
let Namespace = "Hexagon" in {
- class HexagonReg<bits<5> num, string n, list<string> alt = [],
- list<Register> alias = []> : Register<n> {
+ class HexagonReg<bits<5> num, string n, list<string> alt = [],
+ list<Register> alias = []> : Register<n, alt> {
field bits<5> Num;
let Aliases = alias;
let HWEncoding{4-0} = num;
@@ -31,7 +31,8 @@ let Namespace = "Hexagon" in {
// Registers are identified with 5-bit ID numbers.
// Ri - 32-bit integer registers.
- class Ri<bits<5> num, string n, list<string> alt = []> : HexagonReg<num, n, alt> {
+ class Ri<bits<5> num, string n, list<string> alt = []> :
+ HexagonReg<num, n, alt> {
let Num = num;
}
@@ -42,8 +43,9 @@ let Namespace = "Hexagon" in {
// Rd - 64-bit registers.
- class Rd<bits<5> num, string n, list<Register> subregs> :
- HexagonDoubleReg<num, n, subregs> {
+ class Rd<bits<5> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ HexagonDoubleReg<num, n, subregs, alt> {
let Num = num;
let SubRegs = subregs;
}
@@ -94,11 +96,11 @@ let Namespace = "Hexagon" in {
// Aliases of the R* registers used to hold 64-bit int values (doubles).
let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
- def D0 : Rd< 0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>;
- def D1 : Rd< 2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>;
- def D2 : Rd< 4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>;
- def D3 : Rd< 6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>;
- def D4 : Rd< 8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>;
+ def D0 : Rd< 0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>;
+ def D1 : Rd< 2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>;
+ def D2 : Rd< 4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>;
+ def D3 : Rd< 6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>;
+ def D4 : Rd< 8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>;
def D5 : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>;
def D6 : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>;
def D7 : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>;
@@ -109,7 +111,7 @@ let Namespace = "Hexagon" in {
def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>;
def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>;
def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>;
- def D15 : Rd<30, "r31:30", [R30, R31]>, DwarfRegNum<[62]>;
+ def D15 : Rd<30, "r31:30", [R30, R31], ["lr:fp"]>, DwarfRegNum<[62]>;
}
// Predicate registers.
@@ -130,6 +132,11 @@ let Namespace = "Hexagon" in {
// on the entire USR.
def USR_OVF : Rc<?, "usr.ovf">;
+ def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> {
+ let SubRegIndices = [subreg_overflow];
+ let SubRegs = [USR_OVF];
+ }
+
// Control registers.
def SA0 : Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
def LC0 : Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
@@ -140,11 +147,12 @@ let Namespace = "Hexagon" in {
def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use
def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
-
- def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> {
- let SubRegIndices = [subreg_overflow];
- let SubRegs = [USR_OVF];
- }
+ // Define C8 separately and make it aliased with USR.
+ // The problem is that USR has subregisters (e.g. overflow). If USR was
+ // specified as a subregister of C9_8, it would imply that subreg_overflow
+ // and subreg_loreg can be composed, which leads to all kinds of issues
+ // with lane masks.
+ def C8 : Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
def PC : Rc<9, "pc">, DwarfRegNum<[76]>;
def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
def GP : Rc<11, "gp">, DwarfRegNum<[78]>;
@@ -159,7 +167,8 @@ let Namespace = "Hexagon" in {
def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
def C7_6 : Rcc<6, "c7:6", [C6, C7], ["m1:0"]>, DwarfRegNum<[72]>;
- def C9_8 : Rcc<8, "c9:8", [USR, PC]>, DwarfRegNum<[74]>;
+ // Use C8 instead of USR as a subregister of C9_8.
+ def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>;
@@ -261,7 +270,13 @@ def VolatileV3 {
R28, R31,
P0, P1, P2, P3,
M0, M1,
- LC0, LC1, SA0, SA1, USR, USR_OVF];
+ LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
+ V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
+ V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
+ V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
+ W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
+ W12, W13, W14, W15,
+ Q0, Q1, Q2, Q3];
}
def PositiveHalfWord : PatLeaf<(i32 IntRegs:$a),
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 67af147b25b3..7416baab392c 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -49,7 +49,6 @@ def ALU32_3op_tc_1_SLOT0123 : InstrItinClass;
def ALU32_3op_tc_2_SLOT0123 : InstrItinClass;
def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
def ALU64_tc_1_SLOT23 : InstrItinClass;
-def ALU64_tc_1or2_SLOT23 : InstrItinClass;
def ALU64_tc_2_SLOT23 : InstrItinClass;
def ALU64_tc_2early_SLOT23 : InstrItinClass;
def ALU64_tc_3x_SLOT23 : InstrItinClass;
@@ -64,10 +63,9 @@ def J_tc_2early_SLOT2 : InstrItinClass;
def LD_tc_ld_SLOT01 : InstrItinClass;
def LD_tc_ld_SLOT0 : InstrItinClass;
def LD_tc_3or4stall_SLOT0 : InstrItinClass;
-def M_tc_1_SLOT23 : InstrItinClass;
-def M_tc_1or2_SLOT23 : InstrItinClass;
def M_tc_2_SLOT23 : InstrItinClass;
def M_tc_3_SLOT23 : InstrItinClass;
+def M_tc_1_SLOT23 : InstrItinClass;
def M_tc_3x_SLOT23 : InstrItinClass;
def M_tc_3or4x_SLOT23 : InstrItinClass;
def ST_tc_st_SLOT01 : InstrItinClass;
@@ -79,7 +77,6 @@ def S_2op_tc_2_SLOT23 : InstrItinClass;
def S_2op_tc_2early_SLOT23 : InstrItinClass;
def S_2op_tc_3or4x_SLOT23 : InstrItinClass;
def S_3op_tc_1_SLOT23 : InstrItinClass;
-def S_3op_tc_1or2_SLOT23 : InstrItinClass;
def S_3op_tc_2_SLOT23 : InstrItinClass;
def S_3op_tc_2early_SLOT23 : InstrItinClass;
def S_3op_tc_3_SLOT23 : InstrItinClass;
@@ -95,7 +92,6 @@ def J_tc_2early_SLOT0123 : InstrItinClass;
def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
def S_3op_tc_3stall_SLOT23 : InstrItinClass;
-
def HexagonItinerariesV4 :
ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
// ALU32
@@ -114,7 +110,6 @@ def HexagonItinerariesV4 :
// ALU64
InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_1or2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -130,6 +125,7 @@ def HexagonItinerariesV4 :
InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
// J
InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
// JR
InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>]>,
@@ -140,7 +136,6 @@ def HexagonItinerariesV4 :
// M
InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_1or2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -159,11 +154,11 @@ def HexagonItinerariesV4 :
InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_1or2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
// SYS
InstrItinData<ST_tc_3stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
@@ -188,6 +183,7 @@ def HexagonItinerariesV4 :
InstrItinData<EXTENDER_tc_1_SLOT0123,
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
@@ -199,6 +195,7 @@ def HexagonModelV4 : SchedMachineModel {
let IssueWidth = 4;
let Itineraries = HexagonItinerariesV4;
let LoadLatency = 1;
+ let CompleteModel = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td
index d9ad25d4cd5a..b2a75f7200d7 100644
--- a/lib/Target/Hexagon/HexagonScheduleV55.td
+++ b/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -31,131 +31,154 @@ def COPROC_VX_vtc_SLOT23 : InstrItinClass;
def J_tc_3stall_SLOT2 : InstrItinClass;
def MAPPING_tc_1_SLOT0123 : InstrItinClass;
def M_tc_3stall_SLOT23 : InstrItinClass;
-def SUBINSN_tc_1_SLOT01 : InstrItinClass;
-def SUBINSN_tc_2early_SLOT0 : InstrItinClass;
-def SUBINSN_tc_2early_SLOT01 : InstrItinClass;
-def SUBINSN_tc_3stall_SLOT0 : InstrItinClass;
-def SUBINSN_tc_ld_SLOT0 : InstrItinClass;
-def SUBINSN_tc_ld_SLOT01 : InstrItinClass;
-def SUBINSN_tc_st_SLOT01 : InstrItinClass;
def HexagonItinerariesV55 :
ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
// ALU32
InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
InstrItinData<ALU32_2op_tc_2early_SLOT0123,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
InstrItinData<ALU32_3op_tc_2early_SLOT0123,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
// ALU64
- InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
// CR -> System
- InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>,
- InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>,
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
// Jump (conditional/unconditional/return etc)
- InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 1, 1, 1]>,
// JR
- InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>,
- InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>,
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
// Extender
InstrItinData<EXTENDER_tc_1_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
// Load
- InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
- InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1]>,
// M
- InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
// Store
- InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
- InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
- InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // Subinsn
- InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_2early_SLOT01,
- [InstrStage<2, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
// S
- InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+ [3, 1, 1]>,
// New Value Compare Jump
- InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
+ [3, 1, 1, 1]>,
// Mem ops
- InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
- InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1, 1]>,
// Endloop
- InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
+ [2]>,
// Vector
InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
- [InstrStage<3, [SLOT0, SLOT1]>]>,
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 1]>,
InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
- [InstrStage<3, [SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
InstrItinData<COPROC_VX_vtc_SLOT23 ,
- [InstrStage<3, [SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
InstrItinData<MAPPING_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1, 1]>,
// Misc
- InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [SLOT2, SLOT3]>]>
-
+ InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>
]>;
def HexagonModelV55 : SchedMachineModel {
@@ -163,6 +186,7 @@ def HexagonModelV55 : SchedMachineModel {
let IssueWidth = 4;
let Itineraries = HexagonItinerariesV55;
let LoadLatency = 1;
+ let CompleteModel = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
index 2ccff8242a47..dc2ce43b0579 100644
--- a/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -167,16 +167,6 @@ def HexagonItinerariesV60 :
InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- // Subinsn
- InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
- InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_2early_SLOT01,
- [InstrStage<2, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
-
// S
InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
@@ -303,6 +293,7 @@ def HexagonModelV60 : SchedMachineModel {
let IssueWidth = 4;
let Itineraries = HexagonItinerariesV60;
let LoadLatency = 1;
+ let CompleteModel = 0;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 239dbda8f27b..00dfed754995 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -17,13 +17,10 @@ using namespace llvm;
#define DEBUG_TYPE "hexagon-selectiondag-info"
-SDValue
-HexagonSelectionDAGInfo::
-EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
- SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize)
return SDValue();
@@ -55,7 +52,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
Type::getVoidTy(*DAG.getContext()),
DAG.getTargetExternalSymbol(
SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 80ac5d7bd9e2..6f2a42ce97f6 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -7,25 +7,23 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the Hexagon subclass for TargetSelectionDAGInfo.
+// This file defines the Hexagon subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
+class HexagonSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 10fe606985dd..5a94cce4ce57 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -21,7 +21,6 @@
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
#include "HexagonTargetObjectFile.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,14 +31,11 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include <map>
using namespace llvm;
@@ -61,6 +57,10 @@ class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
return "Hexagon Split Const32s and Const64s";
}
bool runOnMachineFunction(MachineFunction &Fn) override;
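+  // This pass runs after register allocation, so all virtual registers
+  // must already have been allocated.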
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
};
@@ -72,7 +72,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
const HexagonTargetObjectFile &TLOF =
*static_cast<const HexagonTargetObjectFile *>(
Fn.getTarget().getObjFileLowering());
- if (TLOF.IsSmallDataEnabled())
+ if (TLOF.isSmallDataEnabled())
return true;
const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
@@ -86,55 +86,56 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator MII = MBB->begin();
MachineBasicBlock::iterator MIE = MBB->end ();
while (MII != MIE) {
- MachineInstr *MI = MII;
- int Opc = MI->getOpcode();
+ MachineInstr &MI = *MII;
+ int Opc = MI.getOpcode();
if (Opc == Hexagon::CONST32_Int_Real &&
- MI->getOperand(1).isBlockAddress()) {
- int DestReg = MI->getOperand(0).getReg();
- MachineOperand &Symbol = MI->getOperand (1);
-
- BuildMI (*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::LO), DestReg).addOperand(Symbol);
- BuildMI (*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::HI), DestReg).addOperand(Symbol);
+ MI.getOperand(1).isBlockAddress()) {
+ int DestReg = MI.getOperand(0).getReg();
+ MachineOperand &Symbol = MI.getOperand(1);
+
+ BuildMI(*MBB, MII, MI.getDebugLoc(), TII->get(Hexagon::LO), DestReg)
+ .addOperand(Symbol);
+ BuildMI(*MBB, MII, MI.getDebugLoc(), TII->get(Hexagon::HI), DestReg)
+ .addOperand(Symbol);
// MBB->erase returns the iterator to the next instruction, which is the
// one we want to process next
- MII = MBB->erase (MI);
+ MII = MBB->erase(&MI);
continue;
}
else if (Opc == Hexagon::CONST32_Int_Real ||
Opc == Hexagon::CONST32_Float_Real) {
- int DestReg = MI->getOperand(0).getReg();
+ int DestReg = MI.getOperand(0).getReg();
// We have to convert an FP immediate into its corresponding integer
// representation
int64_t ImmValue;
if (Opc == Hexagon::CONST32_Float_Real) {
- APFloat Val = MI->getOperand(1).getFPImm()->getValueAPF();
+ APFloat Val = MI.getOperand(1).getFPImm()->getValueAPF();
ImmValue = *Val.bitcastToAPInt().getRawData();
}
else
- ImmValue = MI->getOperand(1).getImm();
+ ImmValue = MI.getOperand(1).getImm();
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::A2_tfrsi), DestReg).addImm(ImmValue);
- MII = MBB->erase (MI);
+ BuildMI(*MBB, MII, MI.getDebugLoc(), TII->get(Hexagon::A2_tfrsi),
+ DestReg)
+ .addImm(ImmValue);
+ MII = MBB->erase(&MI);
continue;
}
else if (Opc == Hexagon::CONST64_Int_Real ||
Opc == Hexagon::CONST64_Float_Real) {
- int DestReg = MI->getOperand(0).getReg();
+ int DestReg = MI.getOperand(0).getReg();
// We have to convert an FP immediate into its corresponding integer
// representation
int64_t ImmValue;
if (Opc == Hexagon::CONST64_Float_Real) {
- APFloat Val = MI->getOperand(1).getFPImm()->getValueAPF();
+ APFloat Val = MI.getOperand(1).getFPImm()->getValueAPF();
ImmValue = *Val.bitcastToAPInt().getRawData();
}
else
- ImmValue = MI->getOperand(1).getImm();
+ ImmValue = MI.getOperand(1).getImm();
unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::subreg_loreg);
unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::subreg_hireg);
@@ -142,11 +143,13 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
int32_t LowWord = (ImmValue & 0xFFFFFFFF);
int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
- BuildMI(*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::A2_tfrsi), DestLo).addImm(LowWord);
- BuildMI (*MBB, MII, MI->getDebugLoc(),
- TII->get(Hexagon::A2_tfrsi), DestHi).addImm(HighWord);
- MII = MBB->erase (MI);
+ BuildMI(*MBB, MII, MI.getDebugLoc(), TII->get(Hexagon::A2_tfrsi),
+ DestLo)
+ .addImm(LowWord);
+ BuildMI(*MBB, MII, MI.getDebugLoc(), TII->get(Hexagon::A2_tfrsi),
+ DestHi)
+ .addImm(HighWord);
+ MII = MBB->erase(&MI);
continue;
}
++MII;
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index d4e95b0d0210..25b2affa2f0b 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -12,13 +12,12 @@
#include "HexagonRegisterInfo.h"
#include "HexagonTargetMachine.h"
-#include "llvm/Pass.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -109,18 +108,6 @@ INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
"Hexagon Split Double Registers", false, false)
-static inline uint32_t getRegState(const MachineOperand &R) {
- assert(R.isReg());
- return getDefRegState(R.isDef()) |
- getImplRegState(R.isImplicit()) |
- getKillRegState(R.isKill()) |
- getDeadRegState(R.isDead()) |
- getUndefRegState(R.isUndef()) |
- getInternalReadRegState(R.isInternalRead()) |
- (R.isDebug() ? RegState::Debug : 0);
-}
-
-
void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
const USet &Part, const TargetRegisterInfo &TRI) {
dbgs() << '{';
@@ -452,7 +439,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
MachineBasicBlock *TB = 0, *FB = 0;
MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB);
SmallVector<MachineOperand,2> Cond;
- bool BadLB = TII->AnalyzeBranch(*TmpLB, TB, FB, Cond, false);
+ bool BadLB = TII->analyzeBranch(*TmpLB, TB, FB, Cond, false);
// Only analyzable conditional branches. HII::AnalyzeBranch will put
// the branch opcode as the first element of Cond, and the predicate
// operand as the second.
@@ -477,7 +464,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
int Mask = 0, Val = 0;
- bool OkCI = TII->analyzeCompare(CmpI, CmpR1, CmpR2, Mask, Val);
+ bool OkCI = TII->analyzeCompare(*CmpI, CmpR1, CmpR2, Mask, Val);
if (!OkCI)
return;
// Eliminate non-double input registers.
@@ -655,7 +642,7 @@ void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
MachineFunction &MF = *B.getParent();
for (auto &MO : MI->memoperands()) {
const MachinePointerInfo &Ptr = MO->getPointerInfo();
- unsigned F = MO->getFlags();
+ MachineMemOperand::Flags F = MO->getFlags();
int A = MO->getAlignment();
auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A);
@@ -1164,6 +1151,9 @@ bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Splitting double registers in function: "
<< MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
auto &ST = MF.getSubtarget<HexagonSubtarget>();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp
index b5339ff4c0dc..54bc3cf6f6ff 100644
--- a/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -594,6 +594,9 @@ bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) {
+ if (skipFunction(*MFn.getFunction()))
+ return false;
+
MF = &MFn;
auto &ST = MFn.getSubtarget<HexagonSubtarget>();
TII = ST.getInstrInfo();
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index aa0efd4f65e0..fb315a730f39 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -14,6 +14,8 @@
#include "HexagonSubtarget.h"
#include "Hexagon.h"
#include "HexagonRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include <map>
@@ -49,10 +51,24 @@ static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Enable Hexagon Vector eXtensions"));
+static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true),
+ cl::desc("Enable the scheduler to generate .cur"));
+
+static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon MI Scheduling"));
+static cl::opt<bool> EnableSubregLiveness("hexagon-subreg-liveness",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable subregister liveness tracking for Hexagon"));
+
void HexagonSubtarget::initializeEnvironment() {
UseMemOps = false;
ModeIEEERndNear = false;
@@ -115,6 +131,57 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
UseBSBScheduling = hasV60TOps() && EnableBSBSched;
}
+
+void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
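+  // Remove output dependences on the USR_OVF (overflow) register.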
+ for (auto &SU : DAG->SUnits) {
+ if (!SU.isInstr())
+ continue;
+ SmallVector<SDep, 4> Erase;
+ for (auto &D : SU.Preds)
+ if (D.getKind() == SDep::Output && D.getReg() == Hexagon::USR_OVF)
+ Erase.push_back(D);
+ for (auto &E : Erase)
+ SU.removePred(E);
+ }
+
+ for (auto &SU : DAG->SUnits) {
+ // Update the latency of chain edges between v60 vector load or store
+ // instructions to be 1. These instructions cannot be scheduled in the
+ // same packet.
+ MachineInstr *MI1 = SU.getInstr();
+ auto *QII = static_cast<const HexagonInstrInfo*>(DAG->TII);
+ bool IsStoreMI1 = MI1->mayStore();
+ bool IsLoadMI1 = MI1->mayLoad();
+ if (!QII->isV60VectorInstruction(MI1) || !(IsStoreMI1 || IsLoadMI1))
+ continue;
+ for (auto &SI : SU.Succs) {
+ if (SI.getKind() != SDep::Order || SI.getLatency() != 0)
+ continue;
+ MachineInstr *MI2 = SI.getSUnit()->getInstr();
+ if (!QII->isV60VectorInstruction(MI2))
+ continue;
+ if ((IsStoreMI1 && MI2->mayStore()) || (IsLoadMI1 && MI2->mayLoad())) {
+ SI.setLatency(1);
+ SU.setHeightDirty();
+ // Change the dependence in the opposite direction too.
+ for (auto &PI : SI.getSUnit()->Preds) {
+ if (PI.getSUnit() != &SU || PI.getKind() != SDep::Order)
+ continue;
+ PI.setLatency(1);
+ SI.getSUnit()->setDepthDirty();
+ }
+ }
+ }
+ }
+}
+
+
+void HexagonSubtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+
// Pin the vtable to this file.
void HexagonSubtarget::anchor() {}
@@ -123,3 +190,180 @@ bool HexagonSubtarget::enableMachineScheduler() const {
return !DisableHexagonMISched;
return true;
}
+
+bool HexagonSubtarget::enableSubRegLiveness() const {
+ return EnableSubregLiveness;
+}
+
+// This helper function can only increase the latency; it never lowers it.
+void HexagonSubtarget::updateLatency(MachineInstr *SrcInst,
+ MachineInstr *DstInst, SDep &Dep) const {
+ if (!hasV60TOps())
+ return;
+
+ auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
+
+ if (EnableVecFrwdSched && QII.addLatencyToSchedule(SrcInst, DstInst)) {
+ // Vec frwd scheduling.
+ Dep.setLatency(Dep.getLatency() + 1);
+ } else if (useBSBScheduling() &&
+ QII.isLateInstrFeedsEarlyInstr(SrcInst, DstInst)) {
+ // BSB scheduling.
+ Dep.setLatency(Dep.getLatency() + 1);
+ } else if (EnableTCLatencySched) {
+ // TClass latency scheduling.
+ // Check if SrcInst produces in 2C an operand of DstInst taken in stage 2B.
+ if (QII.isTC1(SrcInst) || QII.isTC2(SrcInst))
+ if (!QII.isTC1(DstInst) && !QII.isTC2(DstInst))
+ Dep.setLatency(Dep.getLatency() + 1);
+ }
+}
+
+/// If the SUnit has a zero latency edge, return the other SUnit.
+static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
+ for (auto &I : Deps)
+ if (I.isAssignedRegDep() && I.getLatency() == 0 &&
+ !I.getSUnit()->getInstr()->isPseudo())
+ return I.getSUnit();
+ return nullptr;
+}
+
+/// Change the latency between the two SUnits.
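+/// Both the successor edge on Src (in Deps) and the matching predecessor
+/// edge on Dst are updated.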
+void HexagonSubtarget::changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps,
+ SUnit *Dst, unsigned Lat) const {
+ MachineInstr *SrcI = Src->getInstr();
+ for (auto &I : Deps) {
+ if (I.getSUnit() != Dst)
+ continue;
+ I.setLatency(Lat);
+ SUnit *UpdateDst = I.getSUnit();
+ updateLatency(SrcI, UpdateDst->getInstr(), I);
+ // Update the latency of opposite edge too.
+ for (auto &PI : UpdateDst->Preds) {
+ if (PI.getSUnit() != Src || !PI.isAssignedRegDep())
+ continue;
+ PI.setLatency(Lat);
+ updateLatency(SrcI, UpdateDst->getInstr(), PI);
+ }
+ }
+}
+
+// Return true if these are the best two instructions to schedule
+// together with a zero latency. Only one dependence should have a zero
+// latency. If there are multiple choices, choose the best, and change
+// the others, if needed.
+bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
+ const HexagonInstrInfo *TII) const {
+ MachineInstr *SrcInst = Src->getInstr();
+ MachineInstr *DstInst = Dst->getInstr();
+
+ if (SrcInst->isPHI() || DstInst->isPHI())
+ return false;
+
+ // Check if the Dst instruction is the best candidate first.
+ SUnit *Best = nullptr;
+ SUnit *DstBest = nullptr;
+ SUnit *SrcBest = getZeroLatency(Dst, Dst->Preds);
+ if (SrcBest == nullptr || Src->NodeNum >= SrcBest->NodeNum) {
+ // Check that Src doesn't have a better candidate.
+ DstBest = getZeroLatency(Src, Src->Succs);
+ if (DstBest == nullptr || Dst->NodeNum <= DstBest->NodeNum)
+ Best = Dst;
+ }
+ if (Best != Dst)
+ return false;
+
+  // The caller frequently adds the same dependence twice. If so, then
+ // return true for this case too.
+ if (Src == SrcBest && Dst == DstBest)
+ return true;
+
+ // Reassign the latency for the previous bests, which requires setting
+ // the dependence edge in both directions.
+ if (SrcBest != nullptr)
+ changeLatency(SrcBest, SrcBest->Succs, Dst, 1);
+ if (DstBest != nullptr)
+ changeLatency(Src, Src->Succs, DstBest, 1);
+  // If there is an edge from SrcBest to DstBest, then try to change that
+ // to 0 now.
+ if (SrcBest && DstBest)
+ changeLatency(SrcBest, SrcBest->Succs, DstBest, 0);
+
+ return true;
+}
+
+// Update the latency of a Phi when the Phi bridges two instructions that
+// require a multi-cycle latency.
+void HexagonSubtarget::changePhiLatency(MachineInstr *SrcInst, SUnit *Dst,
+ SDep &Dep) const {
+ if (!SrcInst->isPHI() || Dst->NumPreds == 0 || Dep.getLatency() != 0)
+ return;
+
+ for (const SDep &PI : Dst->Preds) {
+ if (PI.getLatency() != 0)
+ continue;
+ Dep.setLatency(2);
+ break;
+ }
+}
+
+/// \brief Perform target specific adjustments to the latency of a schedule
+/// dependency.
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+ SDep &Dep) const {
+ MachineInstr *SrcInst = Src->getInstr();
+ MachineInstr *DstInst = Dst->getInstr();
+ if (!Src->isInstr() || !Dst->isInstr())
+ return;
+
+  const HexagonInstrInfo *QII =
+      static_cast<const HexagonInstrInfo *>(getInstrInfo());
+
+ // Instructions with .new operands have zero latency.
+ if (QII->canExecuteInBundle(SrcInst, DstInst) &&
+ isBestZeroLatency(Src, Dst, QII)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ if (!hasV60TOps())
+ return;
+
+ // Don't adjust the latency of post-increment part of the instruction.
+ if (QII->isPostIncrement(SrcInst) && Dep.isAssignedRegDep()) {
+ if (SrcInst->mayStore())
+ return;
+ if (Dep.getReg() != SrcInst->getOperand(0).getReg())
+ return;
+ } else if (QII->isPostIncrement(DstInst) && Dep.getKind() == SDep::Anti) {
+ if (DstInst->mayStore())
+ return;
+ if (Dep.getReg() != DstInst->getOperand(0).getReg())
+ return;
+ } else if (QII->isPostIncrement(DstInst) && DstInst->mayStore() &&
+ Dep.isAssignedRegDep()) {
+ MachineOperand &Op = DstInst->getOperand(DstInst->getNumOperands() - 1);
+ if (Op.isReg() && Dep.getReg() != Op.getReg())
+ return;
+ }
+
+  // Check if we need to change any of the latency values when Phis are added.
+ if (useBSBScheduling() && SrcInst->isPHI()) {
+ changePhiLatency(SrcInst, Dst, Dep);
+ return;
+ }
+
+ // If it's a REG_SEQUENCE, use its destination instruction to determine
+ // the correct latency.
+ if (DstInst->isRegSequence() && Dst->NumSuccs == 1)
+ DstInst = Dst->Succs[0].getSUnit()->getInstr();
+
+ // Try to schedule uses near definitions to generate .cur.
+ if (EnableDotCurSched && QII->isToBeScheduledASAP(SrcInst, DstInst) &&
+ isBestZeroLatency(Src, Dst, QII)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ updateLatency(SrcInst, DstInst, Dep);
+}
+
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index c7ae139c4346..9b40c130e622 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -18,7 +18,6 @@
#include "HexagonISelLowering.h"
#include "HexagonInstrInfo.h"
#include "HexagonSelectionDAGInfo.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -47,6 +46,11 @@ public:
/// default for V60.
bool UseBSBScheduling;
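+  /// Custom scheduling DAG mutation; registered via getPostRAMutations() and
+  /// also added by the Hexagon packetizer (see HexagonSubtarget.cpp).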
+ class HexagonDAGMutation : public ScheduleDAGMutation {
+ public:
+ void apply(ScheduleDAGInstrs *DAG) override;
+ };
+
private:
std::string CPUString;
HexagonInstrInfo InstrInfo;
@@ -105,6 +109,11 @@ public:
// compiler time and will be removed eventually anyway.
bool enableMachineSchedDefaultSched() const override { return false; }
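+  // Enable the post-RA scheduler with full anti-dependence breaking.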
+ AntiDepBreakMode getAntiDepBreakMode() const override { return ANTIDEP_ALL; }
+ bool enablePostRAScheduler() const override { return true; }
+
+ bool enableSubRegLiveness() const override;
+
const std::string &getCPUString () const { return CPUString; }
// Threshold for small data section
@@ -114,6 +123,24 @@ public:
const HexagonArchEnum &getHexagonArchVersion() const {
return HexagonArchVersion;
}
+
+ void getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+ const override;
+
+ /// \brief Perform target specific adjustments to the latency of a schedule
+ /// dependency.
+ void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
+
+private:
+  // Helper function that can only increase the latency of a dependence.
+ void updateLatency(MachineInstr *SrcInst, MachineInstr *DstInst, SDep &Dep)
+ const;
+ void changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps, SUnit *Dst,
+ unsigned Lat) const;
+ bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII)
+ const;
+ void changePhiLatency(MachineInstr *SrcInst, SUnit *Dst, SDep &Dep) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonSystemInst.td b/lib/Target/Hexagon/HexagonSystemInst.td
index 784686a437ad..771498a40b99 100644
--- a/lib/Target/Hexagon/HexagonSystemInst.td
+++ b/lib/Target/Hexagon/HexagonSystemInst.td
@@ -111,3 +111,24 @@ def Y2_isync: JRInst <(outs), (ins),
let Inst{9-0} = 0b0000000010;
}
+//===----------------------------------------------------------------------===//
+// System/User instructions.
+//===----------------------------------------------------------------------===//
+// traps and pause
+let hasSideEffects = 0, isSolo = 1 in
+class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
+ : JRInst
+ <(outs), (ins u8Imm:$u8),
+ #mnemonic#"(#$u8)"> {
+ bits<8> u8;
+
+ let IClass = 0b0101;
+ let Inst{27-24} = 0b0100;
+ let Inst{23-22} = MajOp;
+ let Inst{12-8} = u8{7-3};
+ let Inst{4-2} = u8{2-0};
+ }
+def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
+def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
+def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
+
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 34b03fb74cef..f964a6612f43 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -18,6 +18,7 @@
#include "HexagonTargetObjectFile.h"
#include "HexagonTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
@@ -33,6 +34,10 @@ static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore,
static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl::opt<bool> DisableAModeOpt("disable-hexagon-amodeopt",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon Addressing Mode Optimization"));
+
static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon CFG Optimization"));
@@ -72,6 +77,9 @@ static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
cl::Hidden, cl::desc("Loop rescheduling"));
+static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false),
+ cl::Hidden, cl::desc("Disable backend optimizations"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -95,13 +103,13 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
FunctionPass *createHexagonBitSimplify();
+ FunctionPass *createHexagonBranchRelaxation();
FunctionPass *createHexagonCallFrameInformation();
FunctionPass *createHexagonCFGOptimizer();
FunctionPass *createHexagonCommonGEP();
FunctionPass *createHexagonCopyToCombine();
FunctionPass *createHexagonEarlyIfConversion();
FunctionPass *createHexagonExpandCondsets();
- FunctionPass *createHexagonExpandPredSpillCode();
FunctionPass *createHexagonFixupHwLoops();
FunctionPass *createHexagonGenExtract();
FunctionPass *createHexagonGenInsert();
@@ -113,6 +121,7 @@ namespace llvm {
FunctionPass *createHexagonLoopRescheduling();
FunctionPass *createHexagonNewValueJump();
FunctionPass *createHexagonOptimizeSZextends();
+ FunctionPass *createHexagonOptAddrMode();
FunctionPass *createHexagonPacketizer();
FunctionPass *createHexagonPeephole();
FunctionPass *createHexagonRDFOpt();
@@ -121,19 +130,27 @@ namespace llvm {
FunctionPass *createHexagonStoreWidening();
} // end namespace llvm;
-/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
-///
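+// With no explicit relocation model, Hexagon defaults to static relocation.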
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::Static;
+ return *RM;
+}
-/// Hexagon_TODO: Do I need an aggregate alignment?
-///
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-"
- "i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-"
- "n16:32", TT, CPU, FS, Options, RM, CM, OL),
+ // Specify the vector alignment explicitly. For v512x1, the calculated
+ // alignment would be 512*alignment(i1), which is 512 bytes, instead of
+ // the required minimum of 64 bytes.
+ : LLVMTargetMachine(
+ T, "e-m:e-p:32:32:32-a:0-n16:32-"
+ "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
+ "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048",
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM,
+ (HexagonNoOpt ? CodeGenOpt::None : OL)),
TLOF(make_unique<HexagonTargetObjectFile>()) {
initAsmInfo();
}
@@ -178,15 +195,7 @@ namespace {
class HexagonPassConfig : public TargetPassConfig {
public:
HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {
- bool NoOpt = (TM->getOptLevel() == CodeGenOpt::None);
- if (!NoOpt) {
- if (EnableExpandCondsets) {
- Pass *Exp = createHexagonExpandCondsets();
- insertPass(&RegisterCoalescerID, IdentifyingPassPtr(Exp));
- }
- }
- }
+ : TargetPassConfig(TM, PM) {}
HexagonTargetMachine &getHexagonTargetMachine() const {
return getTM<HexagonTargetMachine>();
@@ -259,6 +268,10 @@ bool HexagonPassConfig::addInstSelector() {
void HexagonPassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
+ if (EnableExpandCondsets) {
+ Pass *Exp = createHexagonExpandCondsets();
+ insertPass(&RegisterCoalescerID, IdentifyingPassPtr(Exp));
+ }
if (!DisableStoreWidening)
addPass(createHexagonStoreWidening(), false);
if (!DisableHardwareLoops)
@@ -272,6 +285,8 @@ void HexagonPassConfig::addPostRegAlloc() {
addPass(createHexagonRDFOpt());
if (!DisableHexagonCFGOpt)
addPass(createHexagonCFGOptimizer(), false);
+ if (!DisableAModeOpt)
+ addPass(createHexagonOptAddrMode(), false);
}
}
@@ -288,8 +303,7 @@ void HexagonPassConfig::addPreEmitPass() {
if (!NoOpt)
addPass(createHexagonNewValueJump(), false);
- // Expand Spill code for predicate registers.
- addPass(createHexagonExpandPredSpillCode(), false);
+ addPass(createHexagonBranchRelaxation(), false);
// Create Packets.
if (!NoOpt) {
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 968814b3ea32..70835c0d4ac5 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -30,7 +30,7 @@ class HexagonTargetMachine : public LLVMTargetMachine {
public:
HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~HexagonTargetMachine() override;
const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index ccca62021f5b..82b437eb6a0c 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- HexagonTargetObjectFile.cpp - Hexagon asm properties --------------===//
+//===-- HexagonTargetObjectFile.cpp ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,10 +10,10 @@
// This file contains the declarations of the HexagonTargetAsmInfo properties.
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-sdata"
-#include "HexagonTargetObjectFile.h"
-#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -24,75 +24,368 @@
using namespace llvm;
-static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold",
- cl::init(8), cl::Hidden,
- cl::desc("The maximum size of an object in the sdata section"));
+static cl::opt<unsigned> SmallDataThreshold("hexagon-small-data-threshold",
+ cl::init(8), cl::Hidden,
+ cl::desc("The maximum size of an object in the sdata section"));
+
+static cl::opt<bool> NoSmallDataSorting("mno-sort-sda", cl::init(false),
+ cl::Hidden, cl::desc("Disable small data sections sorting"));
+
+static cl::opt<bool> StaticsInSData("hexagon-statics-in-small-data",
+ cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Allow static variables in .sdata"));
+
+static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
+ cl::Hidden, cl::init(false),
+ cl::desc("Trace global value placement"));
+
+// TraceGVPlacement controls messages for all builds. For builds with assertions
+// (debug or release), messages are also controlled by the usual debug flags
+// (e.g. -debug and -debug-only=globallayout)
+#define TRACE_TO(s, X) s << X
+#ifdef NDEBUG
+#define TRACE(X) do { if (TraceGVPlacement) { TRACE_TO(errs(), X); } } while (0)
+#else
+#define TRACE(X) \
+ do { \
+ if (TraceGVPlacement) { TRACE_TO(errs(), X); } \
+ else { DEBUG( TRACE_TO(dbgs(), X) ); } \
+ } while (0)
+#endif
+
+// Returns true if the section name is such that the symbol will be put
+// in a small data section.
+// For instance, global variables with section attributes such as ".sdata"
+// ".sdata.*", ".sbss", and ".sbss.*" will go into small data.
+static bool isSmallDataSection(StringRef Sec) {
+  // Sec is either ".sdata", ".sbss" or ".scommon". Looking for an exact
+  // match obviates the need for checks for section names such as ".sdatafoo".
+ if (Sec.equals(".sdata") || Sec.equals(".sbss") || Sec.equals(".scommon"))
+ return true;
+ // If either ".sdata." or ".sbss." is a substring of the section name
+ // then put the symbol in small data.
+ return Sec.find(".sdata.") != StringRef::npos ||
+ Sec.find(".sbss.") != StringRef::npos ||
+ Sec.find(".scommon.") != StringRef::npos;
+}
+
+
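+// Return the section-name suffix (".1", ".2", ".4", or ".8") for the given
+// smallest addressable access size; other sizes get no suffix.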
+static const char *getSectionSuffixForSize(unsigned Size) {
+ switch (Size) {
+ default:
+ return "";
+ case 1:
+ return ".1";
+ case 2:
+ return ".2";
+ case 4:
+ return ".4";
+ case 8:
+ return ".8";
+ }
+}
void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
+ const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
- SmallDataSection = getContext().getELFSection(
- ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
- SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
- ELF::SHF_WRITE | ELF::SHF_ALLOC);
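+  // The small-data sections are tagged as GP-relative (SHF_HEX_GPREL).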
+ SmallDataSection =
+ getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
+ SmallBSSSection =
+ getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
}
-// sdata/sbss support taken largely from the MIPS Backend.
-static bool IsInSmallSection(uint64_t Size) {
- return Size > 0 && Size <= (uint64_t)SmallDataThreshold;
-}
-bool HexagonTargetObjectFile::IsSmallDataEnabled () const {
- return SmallDataThreshold > 0;
+MCSection *HexagonTargetObjectFile::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ TRACE("[SelectSectionForGlobal] GV(" << GV->getName() << ") ");
+ TRACE("input section(" << GV->getSection() << ") ");
+
+ TRACE((GV->hasPrivateLinkage() ? "private_linkage " : "")
+ << (GV->hasLocalLinkage() ? "local_linkage " : "")
+ << (GV->hasInternalLinkage() ? "internal " : "")
+ << (GV->hasExternalLinkage() ? "external " : "")
+ << (GV->hasCommonLinkage() ? "common_linkage " : "")
+ << (GV->hasCommonLinkage() ? "common " : "" )
+ << (Kind.isCommon() ? "kind_common " : "" )
+ << (Kind.isBSS() ? "kind_bss " : "" )
+ << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+ if (isGlobalInSmallSection(GV, TM))
+ return selectSmallSectionForGlobal(GV, Kind, Mang, TM);
+
+ if (Kind.isCommon()) {
+ // This is purely for LTO+Linker Script because commons don't really have a
+ // section. However, the BitcodeSectionWriter pass will query for the
+ // sections of commons (and the linker expects us to know their section) so
+ // we'll return one here.
+ return BSSSection;
+ }
+
+ TRACE("default_ELF_section\n");
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind,
+ Mang, TM);
}
-/// IsGlobalInSmallSection - Return true if this global value should be
-/// placed into small data/bss section.
-bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
- const TargetMachine &TM) const {
- // If the primary definition of this global value is outside the current
- // translation unit or the global value is available for inspection but not
- // emission, then do nothing.
- if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
- return false;
- // Otherwise, Check if GV should be in sdata/sbss, when normally it would end
- // up in getKindForGlobal(GV, TM).
- return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ TRACE("[getExplicitSectionGlobal] GV(" << GV->getName() << ") from("
+ << GV->getSection() << ") ");
+ TRACE((GV->hasPrivateLinkage() ? "private_linkage " : "")
+ << (GV->hasLocalLinkage() ? "local_linkage " : "")
+ << (GV->hasInternalLinkage() ? "internal " : "")
+ << (GV->hasExternalLinkage() ? "external " : "")
+ << (GV->hasCommonLinkage() ? "common_linkage " : "")
+ << (GV->hasCommonLinkage() ? "common " : "" )
+ << (Kind.isCommon() ? "kind_common " : "" )
+ << (Kind.isBSS() ? "kind_bss " : "" )
+ << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+ if (GV->hasSection()) {
+ StringRef Section = GV->getSection();
+ if (Section.find(".access.text.group") != StringRef::npos)
+ return getContext().getELFSection(GV->getSection(), ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+ if (Section.find(".access.data.group") != StringRef::npos)
+ return getContext().getELFSection(GV->getSection(), ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ }
+
+ if (isGlobalInSmallSection(GV, TM))
+ return selectSmallSectionForGlobal(GV, Kind, Mang, TM);
+
+ // Otherwise, we work the same as ELF.
+ TRACE("default_ELF_section\n");
+ return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GV, Kind,
+ Mang, TM);
}
-/// IsGlobalInSmallSection - Return true if this global value should be
-/// placed into small data/bss section.
-bool HexagonTargetObjectFile::
-IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
- SectionKind Kind) const {
+
+/// Return true if this global value should be placed into small data/bss
+/// section.
+bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM) const {
// Only global variables, not functions.
- const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
- if (!GVA)
+ DEBUG(dbgs() << "Checking if value is in small-data, -G"
+ << SmallDataThreshold << ": \"" << GV->getName() << "\": ");
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ DEBUG(dbgs() << "no, not a global variable\n");
return false;
+ }
- if (Kind.isBSS() || Kind.isData() || Kind.isCommon()) {
- Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(
- GV->getParent()->getDataLayout().getTypeAllocSize(Ty));
+ // Globals with external linkage that have an original section set must be
+ // emitted to that section, regardless of whether we would put them into
+ // small data or not. This is how we can support mixing -G0/-G8 in LTO.
+ if (GVar->hasSection()) {
+ bool IsSmall = isSmallDataSection(GVar->getSection());
+ DEBUG(dbgs() << (IsSmall ? "yes" : "no") << ", has section: "
+ << GVar->getSection() << '\n');
+ return IsSmall;
}
- return false;
+ if (GVar->isConstant()) {
+ DEBUG(dbgs() << "no, is a constant\n");
+ return false;
+ }
+
+ bool IsLocal = GVar->hasLocalLinkage();
+ if (!StaticsInSData && IsLocal) {
+ DEBUG(dbgs() << "no, is static\n");
+ return false;
+ }
+
+ Type *GType = GVar->getType();
+ if (PointerType *PT = dyn_cast<PointerType>(GType))
+ GType = PT->getElementType();
+
+ if (isa<ArrayType>(GType)) {
+ DEBUG(dbgs() << "no, is an array\n");
+ return false;
+ }
+
+  // If the type is a struct with no body provided, treat it conservatively.
+  // There cannot be actual definitions of objects of such a type in this CU
+ // (only references), so assuming that they are not in sdata is safe. If
+ // these objects end up in the sdata, the references will still be valid.
+ if (StructType *ST = dyn_cast<StructType>(GType)) {
+ if (ST->isOpaque()) {
+ DEBUG(dbgs() << "no, has opaque type\n");
+ return false;
+ }
+ }
+
+ unsigned Size = GVar->getParent()->getDataLayout().getTypeAllocSize(GType);
+ if (Size == 0) {
+ DEBUG(dbgs() << "no, has size 0\n");
+ return false;
+ }
+ if (Size > SmallDataThreshold) {
+ DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
+ return false;
+ }
+
+ DEBUG(dbgs() << "yes\n");
+ return true;
+}
+
+
+bool HexagonTargetObjectFile::isSmallDataEnabled() const {
+ return SmallDataThreshold > 0;
+}
+
+
+unsigned HexagonTargetObjectFile::getSmallDataSize() const {
+ return SmallDataThreshold;
+}
+
+
+/// Descends any type down to "elementary" components,
+/// discovering the smallest addressable one.
+/// If zero is returned, declaration will not be modified.
+unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
+ const GlobalValue *GV, const TargetMachine &TM) const {
+  // Initialize the smallest element access size to the largest value that
+  // the assembler can handle (8 bytes).
+ unsigned SmallestElement = 8;
+
+ if (!Ty)
+ return 0;
+ switch (Ty->getTypeID()) {
+ case Type::StructTyID: {
+ const StructType *STy = cast<const StructType>(Ty);
+ for (auto &E : STy->elements()) {
+ unsigned AtomicSize = getSmallestAddressableSize(E, GV, TM);
+ if (AtomicSize < SmallestElement)
+ SmallestElement = AtomicSize;
+ }
+ return (STy->getNumElements() == 0) ? 0 : SmallestElement;
+ }
+ case Type::ArrayTyID: {
+ const ArrayType *ATy = cast<const ArrayType>(Ty);
+ return getSmallestAddressableSize(ATy->getElementType(), GV, TM);
+ }
+ case Type::VectorTyID: {
+ const VectorType *PTy = cast<const VectorType>(Ty);
+ return getSmallestAddressableSize(PTy->getElementType(), GV, TM);
+ }
+ case Type::PointerTyID:
+ case Type::HalfTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::IntegerTyID: {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+    // It is unfortunate that DL's functions take non-const Type*.
+ return DL.getTypeAllocSize(const_cast<Type*>(Ty));
+ }
+ case Type::FunctionTyID:
+ case Type::VoidTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::X86_MMXTyID:
+ case Type::TokenTyID:
+ return 0;
+ }
+
+ return 0;
}
-MCSection *
-HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
- SectionKind Kind, Mangler &Mang,
- const TargetMachine &TM) const {
+MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ const Type *GTy = GV->getType()->getElementType();
+ unsigned Size = getSmallestAddressableSize(GTy, GV, TM);
+
+  // If we have -ffunction-sections or -fdata-sections then we should emit
+  // the global value to a unique section specifically for it... even for
+  // sdata.
+ bool EmitUniquedSection = TM.getDataSections();
+
+ TRACE("Small data. Size(" << Size << ")");
// Handle Small Section classification here.
- if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
- return SmallBSSSection;
- if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind))
- return SmallDataSection;
+ if (Kind.isBSS() || Kind.isBSSLocal()) {
+ // If -mno-sort-sda is not set, find out smallest accessible entity in
+ // declaration and add it to the section name string.
+    // Note. It does not track the actual usage of the value, only its
+    // declaration. Also, the compiler adds explicit pad fields to some struct
+    // declarations - they are currently counted towards the smallest
+    // addressable entity.
+ if (NoSmallDataSorting) {
+ TRACE(" default sbss\n");
+ return SmallBSSSection;
+ }
+
+ StringRef Prefix(".sbss");
+ SmallString<128> Name(Prefix);
+ Name.append(getSectionSuffixForSize(Size));
+
+ if (EmitUniquedSection) {
+ Name.append(".");
+ Name.append(GV->getName());
+ }
+ TRACE(" unique sbss(" << Name << ")\n");
+ return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+ }
+
+ if (Kind.isCommon()) {
+ // This is purely for LTO+Linker Script because commons don't really have a
+ // section. However, the BitcodeSectionWriter pass will query for the
+ // sections of commons (and the linker expects us to know their section) so
+ // we'll return one here.
+ if (NoSmallDataSorting)
+ return BSSSection;
+
+ Twine Name = Twine(".scommon") + getSectionSuffixForSize(Size);
+ TRACE(" small COMMON (" << Name << ")\n");
+
+ return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_HEX_GPREL);
+ }
+
+ // We could have changed sdata object to a constant... in this
+ // case the Kind could be wrong for it.
+ if (Kind.isMergeableConst()) {
+ TRACE(" const_object_as_data ");
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (GVar->hasSection() && isSmallDataSection(GVar->getSection()))
+ Kind = SectionKind::getData();
+ }
+
+ if (Kind.isData()) {
+ if (NoSmallDataSorting) {
+ TRACE(" default sdata\n");
+ return SmallDataSection;
+ }
+
+ StringRef Prefix(".sdata");
+ SmallString<128> Name(Prefix);
+ Name.append(getSectionSuffixForSize(Size));
+
+ if (EmitUniquedSection) {
+ Name.append(".");
+ Name.append(GV->getName());
+ }
+ TRACE(" unique sdata(" << Name << ")\n");
+ return getContext().getELFSection(Name.str(), ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+ }
+ TRACE("default ELF section\n");
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind,
+ Mang, TM);
}
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index da0eeeb3fd28..cbc00da88c58 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -1,4 +1,4 @@
-//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//===-- HexagonTargetObjectFile.h -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,24 +16,31 @@
namespace llvm {
class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
- MCSectionELF *SmallDataSection;
- MCSectionELF *SmallBSSSection;
-
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- /// IsGlobalInSmallSection - Return true if this global address should be
- /// placed into small data/bss section.
- bool IsGlobalInSmallSection(const GlobalValue *GV,
- const TargetMachine &TM,
- SectionKind Kind) const;
- bool IsGlobalInSmallSection(const GlobalValue *GV,
- const TargetMachine &TM) const;
-
- bool IsSmallDataEnabled () const;
MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const override;
+ Mangler &Mang, const TargetMachine &TM) const override;
+
+ MCSection *getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang, const TargetMachine &TM) const override;
+
+ bool isGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM)
+ const;
+
+ bool isSmallDataEnabled() const;
+
+ unsigned getSmallDataSize() const;
+
+ private:
+ MCSectionELF *SmallDataSection;
+ MCSectionELF *SmallBSSSection;
+
+ unsigned getSmallestAddressableSize(const Type *Ty, const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
+ MCSection *selectSmallSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const;
};
} // namespace llvm
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 81850548bb6e..d326b9471315 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -29,8 +29,6 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include <map>
-#include <vector>
using namespace llvm;
@@ -81,6 +79,10 @@ namespace {
return "Hexagon Packetizer";
}
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
const HexagonInstrInfo *HII;
@@ -106,16 +108,19 @@ HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
: VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) {
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
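+  // Also apply the subtarget's scheduling DAG mutation when forming packets.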
+ addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
}
// Check if FirstI modifies a register that SecondI reads.
-static bool hasWriteToReadDep(const MachineInstr *FirstI,
- const MachineInstr *SecondI, const TargetRegisterInfo *TRI) {
- for (auto &MO : FirstI->operands()) {
+static bool hasWriteToReadDep(const MachineInstr &FirstI,
+ const MachineInstr &SecondI,
+ const TargetRegisterInfo *TRI) {
+ for (auto &MO : FirstI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
unsigned R = MO.getReg();
- if (SecondI->readsRegister(R, TRI))
+ if (SecondI.readsRegister(R, TRI))
return true;
}
return false;
@@ -146,7 +151,7 @@ static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI,
B.splice(InsertPt, &B, MI);
// Get the size of the bundle without asserting.
- MachineBasicBlock::const_instr_iterator I(BundleIt);
+ MachineBasicBlock::const_instr_iterator I = BundleIt.getInstrIterator();
MachineBasicBlock::const_instr_iterator E = B.instr_end();
unsigned Size = 0;
for (++I; I != E && I->isBundledWithPred(); ++I)
@@ -168,7 +173,7 @@ static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI,
bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
- if (DisablePacketizer)
+ if (DisablePacketizer || skipFunction(*MF.getFunction()))
return false;
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
@@ -216,12 +221,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
    // Find the first non-boundary starting from the end of the last
// scheduling region.
MachineBasicBlock::iterator RB = Begin;
- while (RB != End && HII->isSchedulingBoundary(RB, &MB, MF))
+ while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
++RB;
    // Find the first boundary starting from the beginning of the new
// region.
MachineBasicBlock::iterator RE = RB;
- while (RE != End && !HII->isSchedulingBoundary(RE, &MB, MF))
+ while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF))
++RE;
// Add the scheduling boundary if it's not block end.
if (RE != End)
@@ -254,9 +259,9 @@ bool HexagonPacketizerList::canReserveResourcesForConstExt() {
// return true, otherwise, return false.
bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) {
auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc());
- bool Avail = ResourceTracker->canReserveResources(ExtMI);
+ bool Avail = ResourceTracker->canReserveResources(*ExtMI);
if (Reserve && Avail)
- ResourceTracker->reserveResources(ExtMI);
+ ResourceTracker->reserveResources(*ExtMI);
MF.DeleteMachineInstr(ExtMI);
return Avail;
}
@@ -365,7 +370,7 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI,
const TargetRegisterClass *RC) {
if (!HII->isV60VectorInstruction(MI))
return false;
- if (!HII->isV60VectorInstruction(MII))
+ if (!HII->isV60VectorInstruction(&*MII))
return false;
// Already a dot new instruction.
@@ -383,11 +388,14 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI,
DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
MI->dump();
dbgs() << "in packet\n";);
- MachineInstr *MJ = MII;
- DEBUG(dbgs() << "Checking CUR against "; MJ->dump(););
+ MachineInstr &MJ = *MII;
+ DEBUG({
+ dbgs() << "Checking CUR against ";
+ MJ.dump();
+ });
unsigned DestReg = MI->getOperand(0).getReg();
bool FoundMatch = false;
- for (auto &MO : MJ->operands())
+ for (auto &MO : MJ.operands())
if (MO.isReg() && MO.getReg() == DestReg)
FoundMatch = true;
if (!FoundMatch)
@@ -436,7 +444,7 @@ enum PredicateKind {
/// Returns true if an instruction is predicated on p0 and false if it's
/// predicated on !p0.
-static PredicateKind getPredicateSense(const MachineInstr *MI,
+static PredicateKind getPredicateSense(const MachineInstr &MI,
const HexagonInstrInfo *HII) {
if (!HII->isPredicated(MI))
return PK_Unknown;
@@ -570,8 +578,8 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI,
// If the source that feeds the store is predicated, new value store must
// also be predicated.
- if (HII->isPredicated(PacketMI)) {
- if (!HII->isPredicated(MI))
+ if (HII->isPredicated(*PacketMI)) {
+ if (!HII->isPredicated(*MI))
return false;
// Check to make sure that they both will have their predicates
@@ -613,8 +621,8 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI,
// 3) Both new-value register producer and user should have same predicate
// sense, i.e, either both should be negated or both should be non-negated.
if (predRegNumDst != predRegNumSrc ||
- HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) ||
- getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII))
+ HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) ||
+ getPredicateSense(*MI, HII) != getPredicateSense(*PacketMI, HII))
return false;
}
@@ -762,7 +770,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr *MI,
int NewOpcode = HII->getDotNewOp(MI);
const MCInstrDesc &D = HII->get(NewOpcode);
MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc());
- bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+ bool ResourcesAvailable = ResourceTracker->canReserveResources(*NewMI);
MF.DeleteMachineInstr(NewMI);
if (!ResourcesAvailable)
return false;
@@ -793,7 +801,7 @@ bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI,
for (auto I : CurrentPacketMIs) {
// We only care for dependencies to predicated instructions
- if (!HII->isPredicated(I))
+ if (!HII->isPredicated(*I))
continue;
// Scheduling Unit for current insn in the packet
@@ -817,13 +825,13 @@ bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI,
/// Gets the predicate register of a predicated instruction.
-static unsigned getPredicatedRegister(MachineInstr *MI,
+static unsigned getPredicatedRegister(MachineInstr &MI,
const HexagonInstrInfo *QII) {
/// We use the following rule: The first predicate register that is a use is
/// the predicate register of a predicated instruction.
assert(QII->isPredicated(MI) && "Must be predicated instruction");
- for (auto &Op : MI->operands()) {
+ for (auto &Op : MI.operands()) {
if (Op.isReg() && Op.getReg() && Op.isUse() &&
Hexagon::PredRegsRegClass.contains(Op.getReg()))
return Op.getReg();
@@ -835,8 +843,8 @@ static unsigned getPredicatedRegister(MachineInstr *MI,
// Given two predicated instructions, this function detects whether
// the predicates are complements.
-bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1,
- MachineInstr *MI2) {
+bool HexagonPacketizerList::arePredicatesComplements(MachineInstr &MI1,
+ MachineInstr &MI2) {
// If we don't know the predicate sense of the instructions bail out early, we
// need it later.
if (getPredicateSense(MI1, HII) == PK_Unknown ||
@@ -844,7 +852,7 @@ bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1,
return false;
// Scheduling unit for candidate.
- SUnit *SU = MIToSUnit[MI1];
+ SUnit *SU = MIToSUnit[&MI1];
// One corner case deals with the following scenario:
// Trying to add
@@ -898,7 +906,7 @@ bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1,
Hexagon::PredRegsRegClass.contains(PReg1) &&
Hexagon::PredRegsRegClass.contains(PReg2) &&
getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) &&
- HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2);
+ HII->isDotNewInst(&MI1) == HII->isDotNewInst(&MI2);
}
// Initialize packetizer flags.
@@ -911,31 +919,31 @@ void HexagonPacketizerList::initPacketizerState() {
}
// Ignore bundling of pseudo instructions.
-bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr *MI,
- const MachineBasicBlock*) {
- if (MI->isDebugValue())
+bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
+ const MachineBasicBlock *) {
+ if (MI.isDebugValue())
return true;
- if (MI->isCFIInstruction())
+ if (MI.isCFIInstruction())
return false;
// We must print out inline assembly.
- if (MI->isInlineAsm())
+ if (MI.isInlineAsm())
return false;
- if (MI->isImplicitDef())
+ if (MI.isImplicitDef())
return false;
// We check if MI has any functional units mapped to it. If it doesn't,
// we ignore the instruction.
- const MCInstrDesc& TID = MI->getDesc();
+ const MCInstrDesc& TID = MI.getDesc();
auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass());
unsigned FuncUnits = IS->getUnits();
return !FuncUnits;
}
-bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) {
- if (MI->isEHLabel() || MI->isCFIInstruction())
+bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
+ if (MI.isEHLabel() || MI.isCFIInstruction())
return true;
// Consider inline asm to not be a solo instruction by default.
@@ -943,19 +951,19 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) {
// removed, and placed outside of the packet (before or after, depending
// on dependencies). This is to reduce the impact of inline asm as a
// "packet splitting" instruction.
- if (MI->isInlineAsm() && !ScheduleInlineAsm)
+ if (MI.isInlineAsm() && !ScheduleInlineAsm)
return true;
// From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
// trap, pause, barrier, icinva, isync, and syncht are solo instructions.
// They must not be grouped with other instructions in a packet.
- if (isSchedBarrier(MI))
+ if (isSchedBarrier(&MI))
return true;
- if (HII->isSolo(MI))
+ if (HII->isSolo(&MI))
return true;
- if (MI->getOpcode() == Hexagon::A2_nop)
+ if (MI.getOpcode() == Hexagon::A2_nop)
return true;
return false;
@@ -1016,7 +1024,7 @@ void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
// after the bundle (to preserve the bundle semantics).
bool InsertBeforeBundle;
if (MI->isInlineAsm())
- InsertBeforeBundle = !hasWriteToReadDep(MI, BundleIt, HRI);
+ InsertBeforeBundle = !hasWriteToReadDep(*MI, *BundleIt, HRI);
else if (MI->isDebugValue())
InsertBeforeBundle = true;
else
@@ -1045,7 +1053,7 @@ bool HexagonPacketizerList::hasDeadDependence(const MachineInstr *I,
// defining the same (dead) register.
if (I->isCall() || J->isCall())
return false;
- if (HII->isPredicated(I) || HII->isPredicated(J))
+ if (HII->isPredicated(*I) || HII->isPredicated(*J))
return false;
BitVector DeadDefs(Hexagon::NUM_TARGET_REGS);
@@ -1085,7 +1093,7 @@ bool HexagonPacketizerList::hasControlDependence(const MachineInstr *I,
auto isBadForLoopN = [this] (const MachineInstr *MI) -> bool {
if (MI->isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI))
return true;
- if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI))
+ if (HII->isPredicated(*MI) && HII->isPredicatedNew(*MI) && HII->isJumpR(MI))
return true;
return false;
};
@@ -1139,7 +1147,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
// Solo instructions cannot go in the packet.
- assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+ assert(!isSoloInstruction(*I) && "Unexpected solo instr!");
if (cannotCoexist(I, J))
return false;
@@ -1158,12 +1166,12 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// If an instruction feeds new value jump, glue it.
MachineBasicBlock::iterator NextMII = I;
++NextMII;
- if (NextMII != I->getParent()->end() && HII->isNewValueJump(NextMII)) {
- MachineInstr *NextMI = NextMII;
+ if (NextMII != I->getParent()->end() && HII->isNewValueJump(&*NextMII)) {
+ MachineInstr &NextMI = *NextMII;
bool secondRegMatch = false;
- const MachineOperand &NOp0 = NextMI->getOperand(0);
- const MachineOperand &NOp1 = NextMI->getOperand(1);
+ const MachineOperand &NOp0 = NextMI.getOperand(0);
+ const MachineOperand &NOp1 = NextMI.getOperand(1);
if (NOp1.isReg() && I->getOperand(0).getReg() == NOp1.getReg())
secondRegMatch = true;
@@ -1242,7 +1250,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
RC = HRI->getMinimalPhysRegClass(DepReg);
}
- if (I->isCall() || I->isReturn()) {
+ if (I->isCall() || I->isReturn() || HII->isTailCall(I)) {
if (!isRegDependence(DepType))
continue;
if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
@@ -1275,8 +1283,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// For predicated instructions, if the predicates are complements then
// there can be no dependence.
- if (HII->isPredicated(I) && HII->isPredicated(J) &&
- arePredicatesComplements(I, J)) {
+ if (HII->isPredicated(*I) && HII->isPredicated(*J) &&
+ arePredicatesComplements(*I, *J)) {
// Not always safe to do this translation.
// DAG Builder attempts to reduce dependence edges using the transitive
// nature of dependencies. Here is an example:
@@ -1400,8 +1408,30 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
}
}
- // Skip over anti-dependences. Two instructions that are anti-dependent
- // can share a packet.
+ // There are certain anti-dependencies that cannot be ignored.
+ // Specifically:
+ // J2_call ... %R0<imp-def> ; SUJ
+ // R0 = ... ; SUI
+ // Those cannot be packetized together, since the call will observe
+ // the effect of the assignment to R0.
+ if (DepType == SDep::Anti && J->isCall()) {
+ // Check if I defines any volatile register. We should also check
+ // registers that the call may read, but these happen to be a
+ // subset of the volatile register set.
+ for (const MCPhysReg *P = J->getDesc().ImplicitDefs; P && *P; ++P) {
+ if (!I->modifiesRegister(*P, HRI))
+ continue;
+ FoundSequentialDependence = true;
+ break;
+ }
+ }
+
+ // Skip over remaining anti-dependences. Two instructions that are
+ // anti-dependent can share a packet, since in most such cases all
+ // operands are read before any modifications take place.
+ // The exceptions are branch and call instructions, since they are
+ // executed after all other instructions have completed (at least
+ // conceptually).
if (DepType != SDep::Anti) {
FoundSequentialDependence = true;
break;
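For illustration, the call/anti-dependence check added above boils down to an intersection test: does the other instruction write any register in the call's implicit-def (clobber) list? A minimal standalone sketch in plain C++, with hypothetical register numbers and a toy clobber list standing in for MCInstrDesc::ImplicitDefs and MachineInstr::modifiesRegister:

// Sketch only: register numbers and the clobber list are made up.
#include <algorithm>
#include <cstdio>
#include <vector>

using Reg = unsigned;

// Hypothetical call clobber list, zero-terminated like ImplicitDefs.
static const Reg CallImplicitDefs[] = {1, 2, 3, 4, 5, 6, 0};

static bool writesReg(const std::vector<Reg> &Defs, Reg R) {
  return std::find(Defs.begin(), Defs.end(), R) != Defs.end();
}

// True if the anti-dependence with the call cannot be ignored, i.e. the
// other instruction defines a register the call implicitly defines too.
static bool antiDepWithCallMatters(const std::vector<Reg> &OtherDefs) {
  for (const Reg *P = CallImplicitDefs; *P; ++P)
    if (writesReg(OtherDefs, *P))
      return true;
  return false;
}

int main() {
  std::printf("%d\n", antiDepWithCallMatters({1}));  // writes a clobbered reg: 1
  std::printf("%d\n", antiDepWithCallMatters({21})); // untouched register: 0
}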
@@ -1444,26 +1474,25 @@ bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
return false;
}
-
MachineBasicBlock::iterator
-HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+HexagonPacketizerList::addToPacket(MachineInstr &MI) {
MachineBasicBlock::iterator MII = MI;
- MachineBasicBlock *MBB = MI->getParent();
- if (MI->isImplicitDef()) {
- unsigned R = MI->getOperand(0).getReg();
+ MachineBasicBlock *MBB = MI.getParent();
+ if (MI.isImplicitDef()) {
+ unsigned R = MI.getOperand(0).getReg();
if (Hexagon::IntRegsRegClass.contains(R)) {
MCSuperRegIterator S(R, HRI, false);
- MI->addOperand(MachineOperand::CreateReg(*S, true, true));
+ MI.addOperand(MachineOperand::CreateReg(*S, true, true));
}
return MII;
}
assert(ResourceTracker->canReserveResources(MI));
- bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
+ bool ExtMI = HII->isExtended(&MI) || HII->isConstExtended(&MI);
bool Good = true;
if (GlueToNewValueJump) {
- MachineInstr *NvjMI = ++MII;
+ MachineInstr &NvjMI = *++MII;
// We need to put both instructions in the same packet: MI and NvjMI.
// Either of them can require a constant extender. Try to add both to
// the current packet, and if that fails, end the packet and start a
@@ -1472,7 +1501,7 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
if (ExtMI)
Good = tryAllocateResourcesForConstExt(true);
- bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI);
+ bool ExtNvjMI = HII->isExtended(&NvjMI) || HII->isConstExtended(&NvjMI);
if (Good) {
if (ResourceTracker->canReserveResources(NvjMI))
ResourceTracker->reserveResources(NvjMI);
@@ -1497,8 +1526,8 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
reserveResourcesForConstExt();
}
}
- CurrentPacketMIs.push_back(MI);
- CurrentPacketMIs.push_back(NvjMI);
+ CurrentPacketMIs.push_back(&MI);
+ CurrentPacketMIs.push_back(&NvjMI);
return MII;
}
@@ -1506,23 +1535,23 @@ HexagonPacketizerList::addToPacket(MachineInstr *MI) {
if (ExtMI && !tryAllocateResourcesForConstExt(true)) {
endPacket(MBB, MI);
if (PromotedToDotNew)
- demoteToDotOld(MI);
+ demoteToDotOld(&MI);
ResourceTracker->reserveResources(MI);
reserveResourcesForConstExt();
}
- CurrentPacketMIs.push_back(MI);
+ CurrentPacketMIs.push_back(&MI);
return MII;
}
void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
- MachineInstr *MI) {
+ MachineBasicBlock::iterator MI) {
OldPacketMIs = CurrentPacketMIs;
VLIWPacketizerList::endPacket(MBB, MI);
}
-bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr *MI) {
- return !producesStall(MI);
+bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
+ return !producesStall(&MI);
}
@@ -1598,4 +1627,3 @@ bool HexagonPacketizerList::producesStall(const MachineInstr *I) {
FunctionPass *llvm::createHexagonPacketizer() {
return new HexagonPacketizer();
}
-
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 960cf6ca5bbc..3f8ed5af3540 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -50,12 +50,12 @@ public:
void initPacketizerState() override;
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(const MachineInstr *MI,
+ bool ignorePseudoInstruction(const MachineInstr &MI,
const MachineBasicBlock *MBB) override;
// isSoloInstruction - return true if instruction MI can not be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(const MachineInstr *MI) override;
+ bool isSoloInstruction(const MachineInstr &MI) override;
// isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
// together.
@@ -65,9 +65,10 @@ public:
// and SUJ.
bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
- void endPacket(MachineBasicBlock *MBB, MachineInstr *MI) override;
- bool shouldAddToPacket(const MachineInstr *MI) override;
+ MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override;
+ void endPacket(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI) override;
+ bool shouldAddToPacket(const MachineInstr &MI) override;
void unpacketizeSoloInstrs(MachineFunction &MF);
@@ -93,7 +94,7 @@ protected:
bool canPromoteToNewValueStore(const MachineInstr* MI,
const MachineInstr* PacketMI, unsigned DepReg);
bool demoteToDotOld(MachineInstr* MI);
- bool arePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2);
+ bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
bool restrictingDepExistInPacket(MachineInstr*, unsigned);
bool isNewifiable(const MachineInstr *MI);
bool isCurifiable(MachineInstr* MI);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index b73af8249cb5..2898b056a03d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -11,7 +11,10 @@
#include "HexagonFixupKinds.h"
#include "HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
@@ -19,14 +22,20 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
+#include <sstream>
+
using namespace llvm;
using namespace Hexagon;
#define DEBUG_TYPE "hexagon-asm-backend"
+static cl::opt<bool> DisableFixup
+ ("mno-fixup", cl::desc("Disable fixing up resolved relocations for Hexagon"));
+
namespace {
class HexagonAsmBackend : public MCAsmBackend {
@@ -36,8 +45,21 @@ class HexagonAsmBackend : public MCAsmBackend {
std::unique_ptr <MCInstrInfo> MCII;
std::unique_ptr <MCInst *> RelaxTarget;
MCInst * Extender;
+
+ void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF,
+ MCInst &HMB) const {
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<256> Code;
+ raw_svector_ostream VecOS(Code);
+ E.encodeInstruction(HMB, VecOS, Fixups, RF.getSubtargetInfo());
+
+ // Update the fragment.
+ RF.setInst(HMB);
+ RF.getContents() = Code;
+ RF.getFixups() = Fixups;
+ }
public:
- HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) :
+ HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
Extender(nullptr) {}
@@ -63,118 +85,438 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[Hexagon::NumTargetFixupKinds] = {
- // This table *must* be in same the order of fixup_* kinds in
- // HexagonFixupKinds.h.
- //
- // namei offset bits flags
- {"fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_LO16", 0, 32, 0},
- {"fixup_Hexagon_HI16", 0, 32, 0},
- {"fixup_Hexagon_32", 0, 32, 0},
- {"fixup_Hexagon_16", 0, 32, 0},
- {"fixup_Hexagon_8", 0, 32, 0},
- {"fixup_Hexagon_GPREL16_0", 0, 32, 0},
- {"fixup_Hexagon_GPREL16_1", 0, 32, 0},
- {"fixup_Hexagon_GPREL16_2", 0, 32, 0},
- {"fixup_Hexagon_GPREL16_3", 0, 32, 0},
- {"fixup_Hexagon_HL16", 0, 32, 0},
- {"fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_16_X", 0, 32, 0},
- {"fixup_Hexagon_12_X", 0, 32, 0},
- {"fixup_Hexagon_11_X", 0, 32, 0},
- {"fixup_Hexagon_10_X", 0, 32, 0},
- {"fixup_Hexagon_9_X", 0, 32, 0},
- {"fixup_Hexagon_8_X", 0, 32, 0},
- {"fixup_Hexagon_7_X", 0, 32, 0},
- {"fixup_Hexagon_6_X", 0, 32, 0},
- {"fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_COPY", 0, 32, 0},
- {"fixup_Hexagon_GLOB_DAT", 0, 32, 0},
- {"fixup_Hexagon_JMP_SLOT", 0, 32, 0},
- {"fixup_Hexagon_RELATIVE", 0, 32, 0},
- {"fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_GOTREL_LO16", 0, 32, 0},
- {"fixup_Hexagon_GOTREL_HI16", 0, 32, 0},
- {"fixup_Hexagon_GOTREL_32", 0, 32, 0},
- {"fixup_Hexagon_GOT_LO16", 0, 32, 0},
- {"fixup_Hexagon_GOT_HI16", 0, 32, 0},
- {"fixup_Hexagon_GOT_32", 0, 32, 0},
- {"fixup_Hexagon_GOT_16", 0, 32, 0},
- {"fixup_Hexagon_DTPMOD_32", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_LO16", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_HI16", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_32", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_16", 0, 32, 0},
- {"fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_GD_GOT_LO16", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_HI16", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_32", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_16", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_LO16", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_HI16", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_32", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_16", 0, 32, 0},
- {"fixup_Hexagon_IE_LO16", 0, 32, 0},
- {"fixup_Hexagon_IE_HI16", 0, 32, 0},
- {"fixup_Hexagon_IE_32", 0, 32, 0},
- {"fixup_Hexagon_IE_16", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_LO16", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_HI16", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_32", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_16", 0, 32, 0},
- {"fixup_Hexagon_TPREL_LO16", 0, 32, 0},
- {"fixup_Hexagon_TPREL_HI16", 0, 32, 0},
- {"fixup_Hexagon_TPREL_32", 0, 32, 0},
- {"fixup_Hexagon_TPREL_16", 0, 32, 0},
- {"fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_GOTREL_16_X", 0, 32, 0},
- {"fixup_Hexagon_GOTREL_11_X", 0, 32, 0},
- {"fixup_Hexagon_GOT_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_GOT_16_X", 0, 32, 0},
- {"fixup_Hexagon_GOT_11_X", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_16_X", 0, 32, 0},
- {"fixup_Hexagon_DTPREL_11_X", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_16_X", 0, 32, 0},
- {"fixup_Hexagon_GD_GOT_11_X", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_16_X", 0, 32, 0},
- {"fixup_Hexagon_LD_GOT_11_X", 0, 32, 0},
- {"fixup_Hexagon_IE_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_IE_16_X", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_16_X", 0, 32, 0},
- {"fixup_Hexagon_IE_GOT_11_X", 0, 32, 0},
- {"fixup_Hexagon_TPREL_32_6_X", 0, 32, 0},
- {"fixup_Hexagon_TPREL_16_X", 0, 32, 0},
- {"fixup_Hexagon_TPREL_11_X", 0, 32, 0}};
-
- if (Kind < FirstTargetFixupKind) {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // HexagonFixupKinds.h.
+ //
+ // name offset bits flags
+ { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_32", 0, 32, 0 },
+ { "fixup_Hexagon_16", 0, 32, 0 },
+ { "fixup_Hexagon_8", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
+ { "fixup_Hexagon_HL16", 0, 32, 0 },
+ { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_12_X", 0, 32, 0 },
+ { "fixup_Hexagon_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_10_X", 0, 32, 0 },
+ { "fixup_Hexagon_9_X", 0, 32, 0 },
+ { "fixup_Hexagon_8_X", 0, 32, 0 },
+ { "fixup_Hexagon_7_X", 0, 32, 0 },
+ { "fixup_Hexagon_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_COPY", 0, 32, 0 },
+ { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
+ { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
+ { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
+ { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
- }
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(MCFixup const & /*Fixup*/, char * /*Data*/,
- unsigned /*DataSize*/, uint64_t /*Value*/,
- bool /*IsPCRel*/) const override {
- return;
+ /// processFixupValue - Target hook to adjust the literal value of a fixup
+ /// if necessary. IsResolved signals whether the caller believes a relocation
+ /// is needed; the target can modify the value. The default does nothing.
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override {
+ MCFixupKind Kind = Fixup.getKind();
+
+ switch((unsigned)Kind) {
+ default:
+ llvm_unreachable("Unknown Fixup Kind!");
+
+ case fixup_Hexagon_LO16:
+ case fixup_Hexagon_HI16:
+ case fixup_Hexagon_16:
+ case fixup_Hexagon_8:
+ case fixup_Hexagon_GPREL16_0:
+ case fixup_Hexagon_GPREL16_1:
+ case fixup_Hexagon_GPREL16_2:
+ case fixup_Hexagon_GPREL16_3:
+ case fixup_Hexagon_HL16:
+ case fixup_Hexagon_32_6_X:
+ case fixup_Hexagon_16_X:
+ case fixup_Hexagon_12_X:
+ case fixup_Hexagon_11_X:
+ case fixup_Hexagon_10_X:
+ case fixup_Hexagon_9_X:
+ case fixup_Hexagon_8_X:
+ case fixup_Hexagon_7_X:
+ case fixup_Hexagon_6_X:
+ case fixup_Hexagon_COPY:
+ case fixup_Hexagon_GLOB_DAT:
+ case fixup_Hexagon_JMP_SLOT:
+ case fixup_Hexagon_RELATIVE:
+ case fixup_Hexagon_PLT_B22_PCREL:
+ case fixup_Hexagon_GOTREL_LO16:
+ case fixup_Hexagon_GOTREL_HI16:
+ case fixup_Hexagon_GOTREL_32:
+ case fixup_Hexagon_GOT_LO16:
+ case fixup_Hexagon_GOT_HI16:
+ case fixup_Hexagon_GOT_32:
+ case fixup_Hexagon_GOT_16:
+ case fixup_Hexagon_DTPMOD_32:
+ case fixup_Hexagon_DTPREL_LO16:
+ case fixup_Hexagon_DTPREL_HI16:
+ case fixup_Hexagon_DTPREL_32:
+ case fixup_Hexagon_DTPREL_16:
+ case fixup_Hexagon_GD_PLT_B22_PCREL:
+ case fixup_Hexagon_LD_PLT_B22_PCREL:
+ case fixup_Hexagon_GD_GOT_LO16:
+ case fixup_Hexagon_GD_GOT_HI16:
+ case fixup_Hexagon_GD_GOT_32:
+ case fixup_Hexagon_GD_GOT_16:
+ case fixup_Hexagon_LD_GOT_LO16:
+ case fixup_Hexagon_LD_GOT_HI16:
+ case fixup_Hexagon_LD_GOT_32:
+ case fixup_Hexagon_LD_GOT_16:
+ case fixup_Hexagon_IE_LO16:
+ case fixup_Hexagon_IE_HI16:
+ case fixup_Hexagon_IE_32:
+ case fixup_Hexagon_IE_16:
+ case fixup_Hexagon_IE_GOT_LO16:
+ case fixup_Hexagon_IE_GOT_HI16:
+ case fixup_Hexagon_IE_GOT_32:
+ case fixup_Hexagon_IE_GOT_16:
+ case fixup_Hexagon_TPREL_LO16:
+ case fixup_Hexagon_TPREL_HI16:
+ case fixup_Hexagon_TPREL_32:
+ case fixup_Hexagon_TPREL_16:
+ case fixup_Hexagon_GOTREL_32_6_X:
+ case fixup_Hexagon_GOTREL_16_X:
+ case fixup_Hexagon_GOTREL_11_X:
+ case fixup_Hexagon_GOT_32_6_X:
+ case fixup_Hexagon_GOT_16_X:
+ case fixup_Hexagon_GOT_11_X:
+ case fixup_Hexagon_DTPREL_32_6_X:
+ case fixup_Hexagon_DTPREL_16_X:
+ case fixup_Hexagon_DTPREL_11_X:
+ case fixup_Hexagon_GD_GOT_32_6_X:
+ case fixup_Hexagon_GD_GOT_16_X:
+ case fixup_Hexagon_GD_GOT_11_X:
+ case fixup_Hexagon_LD_GOT_32_6_X:
+ case fixup_Hexagon_LD_GOT_16_X:
+ case fixup_Hexagon_LD_GOT_11_X:
+ case fixup_Hexagon_IE_32_6_X:
+ case fixup_Hexagon_IE_16_X:
+ case fixup_Hexagon_IE_GOT_32_6_X:
+ case fixup_Hexagon_IE_GOT_16_X:
+ case fixup_Hexagon_IE_GOT_11_X:
+ case fixup_Hexagon_TPREL_32_6_X:
+ case fixup_Hexagon_TPREL_16_X:
+ case fixup_Hexagon_TPREL_11_X:
+ case fixup_Hexagon_32_PCREL:
+ case fixup_Hexagon_6_PCREL_X:
+ case fixup_Hexagon_23_REG:
+ // These fixups should always have a relocation recorded.
+ IsResolved = false;
+ return;
+
+ case fixup_Hexagon_B22_PCREL:
+ //IsResolved = false;
+ break;
+
+ case fixup_Hexagon_B13_PCREL:
+ case fixup_Hexagon_B13_PCREL_X:
+ case fixup_Hexagon_B32_PCREL_X:
+ case fixup_Hexagon_B22_PCREL_X:
+ case fixup_Hexagon_B15_PCREL:
+ case fixup_Hexagon_B15_PCREL_X:
+ case fixup_Hexagon_B9_PCREL:
+ case fixup_Hexagon_B9_PCREL_X:
+ case fixup_Hexagon_B7_PCREL:
+ case fixup_Hexagon_B7_PCREL_X:
+ if (DisableFixup)
+ IsResolved = false;
+ break;
+
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case fixup_Hexagon_32:
+ // Leave these relocations alone as they are used for EH.
+ return;
+ }
+ }
+
+ /// getFixupKindNumBytes - The number of bytes the fixup may change.
+ static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ return 0;
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4: // this later gets mapped to R_HEX_32
+ case FK_PCRel_4: // this later gets mapped to R_HEX_32_PCREL
+ case fixup_Hexagon_32:
+ case fixup_Hexagon_B32_PCREL_X:
+ case fixup_Hexagon_B22_PCREL:
+ case fixup_Hexagon_B22_PCREL_X:
+ case fixup_Hexagon_B15_PCREL:
+ case fixup_Hexagon_B15_PCREL_X:
+ case fixup_Hexagon_B13_PCREL:
+ case fixup_Hexagon_B13_PCREL_X:
+ case fixup_Hexagon_B9_PCREL:
+ case fixup_Hexagon_B9_PCREL_X:
+ case fixup_Hexagon_B7_PCREL:
+ case fixup_Hexagon_B7_PCREL_X:
+ return 4;
+ }
+ }
+
+ // Make up for left shift when encoding the operand.
+ static uint64_t adjustFixupValue(MCFixupKind Kind, uint64_t Value) {
+ switch((unsigned)Kind) {
+ default:
+ break;
+
+ case fixup_Hexagon_B7_PCREL:
+ case fixup_Hexagon_B9_PCREL:
+ case fixup_Hexagon_B13_PCREL:
+ case fixup_Hexagon_B15_PCREL:
+ case fixup_Hexagon_B22_PCREL:
+ Value >>= 2;
+ break;
+
+ case fixup_Hexagon_B7_PCREL_X:
+ case fixup_Hexagon_B9_PCREL_X:
+ case fixup_Hexagon_B13_PCREL_X:
+ case fixup_Hexagon_B15_PCREL_X:
+ case fixup_Hexagon_B22_PCREL_X:
+ Value &= 0x3f;
+ break;
+
+ case fixup_Hexagon_B32_PCREL_X:
+ Value >>= 6;
+ break;
+ }
+ return (Value);
+ }
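To make the adjustment concrete, here is a small standalone sketch with an arbitrary sample offset, assuming the usual Hexagon split where a constant extender carries the upper bits and the base instruction keeps only the low 6:

// Sketch only: the sample offset 0x1C8 (456 bytes) is made up.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t V = 0x1C8;
  // Non-extended branch fixups (B7/B9/B13/B15/B22_PCREL): the field holds a
  // word offset, so the byte offset loses its two low bits.
  std::printf("B22_PCREL   -> 0x%llx\n", (unsigned long long)(V >> 2));   // 0x72
  // Extended forms (*_PCREL_X): only the low 6 bits stay in the instruction.
  std::printf("B22_PCREL_X -> 0x%llx\n", (unsigned long long)(V & 0x3f)); // 0x8
  // The extender word itself (B32_PCREL_X) takes everything above bit 5.
  std::printf("B32_PCREL_X -> 0x%llx\n", (unsigned long long)(V >> 6));   // 0x7
}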
+
+ void HandleFixupError(const int bits, const int align_bits,
+ const int64_t FixupValue, const char *fixupStr) const {
+ // Example of the diagnostic emitted below: "value 1124 out of range:
+ // -1024-1023", hit when resolving a symbol in file xprtsock.S.
+ const APInt IntMin = APInt::getSignedMinValue(bits+align_bits);
+ const APInt IntMax = APInt::getSignedMaxValue(bits+align_bits);
+ std::stringstream errStr;
+ errStr << "\nError: value " <<
+ FixupValue <<
+ " out of range: " <<
+ IntMin.getSExtValue() <<
+ "-" <<
+ IntMax.getSExtValue() <<
+ " when resolving " <<
+ fixupStr <<
+ " fixup\n";
+ llvm_unreachable(errStr.str().c_str());
+ }
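The reported bounds are those of a signed (bits + align_bits)-bit field, i.e. the raw byte offset a branch can reach before the shift applied in adjustFixupValue; a quick standalone check with plain integer math standing in for APInt:

// Sketch only: reproduces the range arithmetic, not the APInt plumbing.
#include <cstdint>
#include <cstdio>

static void printRange(int bits, int align_bits) {
  int total = bits + align_bits;
  int64_t Min = -(INT64_C(1) << (total - 1));
  int64_t Max = (INT64_C(1) << (total - 1)) - 1;
  std::printf("B%d_PCREL: %lld..%lld bytes\n", bits, (long long)Min,
              (long long)Max);
}

int main() {
  printRange(7, 2);  // -256..255
  printRange(9, 2);  // -1024..1023, the example quoted in the comment above
  printRange(13, 2); // -16384..16383
  printRange(15, 2); // -65536..65535
  printRange(22, 2); // -8388608..8388607
}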
+
+ /// ApplyFixup - Apply the \arg Value for the given \arg Fixup into the provided
+ /// data fragment, at the offset specified by the fixup and following the
+ /// fixup kind as appropriate.
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t FixupValue, bool IsPCRel) const override {
+
+ // When FixupValue is 0 the relocation is external and there
+ // is nothing for us to do.
+ if (!FixupValue) return;
+
+ MCFixupKind Kind = Fixup.getKind();
+ uint64_t Value;
+ uint32_t InstMask;
+ uint32_t Reloc;
+
+ // LLVM gives us an encoded value, we have to convert it back
+ // to a real offset before we can use it.
+ uint32_t Offset = Fixup.getOffset();
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ char *InstAddr = Data + Offset;
+
+ Value = adjustFixupValue(Kind, FixupValue);
+ if(!Value)
+ return;
+ int sValue = (int)Value;
+
+ switch((unsigned)Kind) {
+ default:
+ return;
+
+ case fixup_Hexagon_B7_PCREL:
+ if (!(isIntN(7, sValue)))
+ HandleFixupError(7, 2, (int64_t)FixupValue, "B7_PCREL");
+ case fixup_Hexagon_B7_PCREL_X:
+ InstMask = 0x00001f18; // Word32_B7
+ Reloc = (((Value >> 2) & 0x1f) << 8) | // Value 6-2 = Target 12-8
+ ((Value & 0x3) << 3); // Value 1-0 = Target 4-3
+ break;
+
+ case fixup_Hexagon_B9_PCREL:
+ if (!(isIntN(9, sValue)))
+ HandleFixupError(9, 2, (int64_t)FixupValue, "B9_PCREL");
+ case fixup_Hexagon_B9_PCREL_X:
+ InstMask = 0x003000fe; // Word32_B9
+ Reloc = (((Value >> 7) & 0x3) << 20) | // Value 8-7 = Target 21-20
+ ((Value & 0x7f) << 1); // Value 6-0 = Target 7-1
+ break;
+
+ // Since the existing branches that use this relocation cannot be
+ // extended, they should only be fixed up if the target is within range.
+ case fixup_Hexagon_B13_PCREL:
+ if (!(isIntN(13, sValue)))
+ HandleFixupError(13, 2, (int64_t)FixupValue, "B13_PCREL");
+ case fixup_Hexagon_B13_PCREL_X:
+ InstMask = 0x00202ffe; // Word32_B13
+ Reloc = (((Value >> 12) & 0x1) << 21) | // Value 12 = Target 21
+ (((Value >> 11) & 0x1) << 13) | // Value 11 = Target 13
+ ((Value & 0x7ff) << 1); // Value 10-0 = Target 11-1
+ break;
+
+ case fixup_Hexagon_B15_PCREL:
+ if (!(isIntN(15, sValue)))
+ HandleFixupError(15, 2, (int64_t)FixupValue, "B15_PCREL");
+ case fixup_Hexagon_B15_PCREL_X:
+ InstMask = 0x00df20fe; // Word32_B15
+ Reloc = (((Value >> 13) & 0x3) << 22) | // Value 14-13 = Target 23-22
+ (((Value >> 8) & 0x1f) << 16) | // Value 12-8 = Target 20-16
+ (((Value >> 7) & 0x1) << 13) | // Value 7 = Target 13
+ ((Value & 0x7f) << 1); // Value 6-0 = Target 7-1
+ break;
+
+ case fixup_Hexagon_B22_PCREL:
+ if (!(isIntN(22, sValue)))
+ HandleFixupError(22, 2, (int64_t)FixupValue, "B22_PCREL");
+ case fixup_Hexagon_B22_PCREL_X:
+ InstMask = 0x01ff3ffe; // Word32_B22
+ Reloc = (((Value >> 13) & 0x1ff) << 16) | // Value 21-13 = Target 24-16
+ ((Value & 0x1fff) << 1); // Value 12-0 = Target 13-1
+ break;
+
+ case fixup_Hexagon_B32_PCREL_X:
+ InstMask = 0x0fff3fff; // Word32_X26
+ Reloc = (((Value >> 14) & 0xfff) << 16) | // Value 25-14 = Target 27-16
+ (Value & 0x3fff); // Value 13-0 = Target 13-0
+ break;
+
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case fixup_Hexagon_32:
+ InstMask = 0xffffffff; // Word32
+ Reloc = Value;
+ break;
+ }
+
+ DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "(" <<
+ (unsigned)Kind << ")\n");
+ DEBUG(uint32_t OldData = 0;
+ for (unsigned i = 0; i < NumBytes; i++)
+ OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
+ ": AValue=0x"; dbgs().write_hex(FixupValue) <<
+ ": Offset=" << Offset <<
+ ": Size=" << DataSize <<
+ ": OInst=0x"; dbgs().write_hex(OldData) <<
+ ": Reloc=0x"; dbgs().write_hex(Reloc););
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value. The Value has been "split up" into the
+ // appropriate bitfields above.
+ for (unsigned i = 0; i < NumBytes; i++){
+ InstAddr[i] &= uint8_t(~InstMask >> (i * 8)) & 0xff; // Clear reloc bits
+ InstAddr[i] |= uint8_t(Reloc >> (i * 8)) & 0xff; // Apply new reloc
+ }
+
+ DEBUG(uint32_t NewData = 0;
+ for (unsigned i = 0; i < NumBytes; i++)
+ NewData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+ dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
}
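As a concrete trace of the patch loop above, this standalone sketch scatters a sample B22 word offset into the Word32_B22 bit positions and applies it byte by byte to a made-up little-endian instruction word (only its reloc field is assumed to be zero):

// Sketch only: Inst is a hypothetical encoding, not a real Hexagon opcode.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Inst = 0x5C00C000;      // hypothetical insn with a zeroed B22 field
  uint32_t InstMask = 0x01ff3ffe;  // Word32_B22
  uint32_t Value = 0x72;           // word offset, as adjusted earlier
  // Value 21-13 -> insn 24-16, Value 12-0 -> insn 13-1.
  uint32_t Reloc = (((Value >> 13) & 0x1ff) << 16) | ((Value & 0x1fff) << 1);

  uint8_t Bytes[4];
  for (unsigned i = 0; i < 4; ++i)
    Bytes[i] = uint8_t(Inst >> (i * 8));       // little-endian image

  for (unsigned i = 0; i < 4; ++i) {
    Bytes[i] &= uint8_t(~InstMask >> (i * 8)); // clear the reloc bits
    Bytes[i] |= uint8_t(Reloc >> (i * 8));     // drop in the new bits
  }

  uint32_t Patched = 0;
  for (unsigned i = 0; i < 4; ++i)
    Patched |= uint32_t(Bytes[i]) << (i * 8);
  std::printf("Reloc=0x%08x Patched=0x%08x\n", Reloc, Patched);
  // Prints Reloc=0x000000e4 Patched=0x5c00c0e4.
}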
bool isInstRelaxable(MCInst const &HMI) const {
@@ -182,12 +524,20 @@ public:
bool Relaxable = false;
// Branches and loop-setup insns are handled as necessary by relaxation.
if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
+ (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) ==
+ HexagonII::TypeCOMPOUND &&
+ MCID.isBranch()) ||
(llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
MCID.isBranch()) ||
(llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
HMI.getOpcode() != Hexagon::C4_addipc))
- if (HexagonMCInstrInfo::isExtendable(*MCII, HMI))
+ if (HexagonMCInstrInfo::isExtendable(*MCII, HMI)) {
Relaxable = true;
+ MCOperand const &Operand =
+ HMI.getOperand(HexagonMCInstrInfo::getExtendableOp(*MCII, HMI));
+ if (HexagonMCInstrInfo::mustNotExtend(*Operand.getExpr()))
+ Relaxable = false;
+ }
return Relaxable;
}
@@ -197,17 +547,7 @@ public:
///
/// \param Inst - The instruction to test.
bool mayNeedRelaxation(MCInst const &Inst) const override {
- assert(HexagonMCInstrInfo::isBundle(Inst));
- bool PreviousIsExtender = false;
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
- auto const &Inst = *I.getInst();
- if (!PreviousIsExtender) {
- if (isInstRelaxable(Inst))
- return true;
- }
- PreviousIsExtender = HexagonMCInstrInfo::isImmext(Inst);
- }
- return false;
+ return true;
}
/// fixupNeedsRelaxation - Target specific predicate for whether a given
@@ -222,6 +562,9 @@ public:
*RelaxTarget = nullptr;
MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction(
MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE));
+ bool Relaxable = isInstRelaxable(MCI);
+ if (Relaxable == false)
+ return false;
// If we cannot resolve the fixup value, it requires relaxation.
if (!Resolved) {
switch ((unsigned)Fixup.getKind()) {
@@ -247,9 +590,6 @@ public:
}
}
}
- bool Relaxable = isInstRelaxable(MCI);
- if (Relaxable == false)
- return false;
MCFixupKind Kind = Fixup.getKind();
int64_t sValue = Value;
@@ -294,8 +634,8 @@ public:
llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
}
- void relaxInstruction(MCInst const & Inst,
- MCInst & Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
assert(HexagonMCInstrInfo::isBundle(Inst) &&
"Hexagon relaxInstruction only works on bundles");
@@ -347,6 +687,58 @@ public:
}
return true;
}
+
+ void finishLayout(MCAssembler const &Asm,
+ MCAsmLayout &Layout) const override {
+ for (auto I : Layout.getSectionOrder()) {
+ auto &Fragments = I->getFragmentList();
+ for (auto &J : Fragments) {
+ switch (J.getKind()) {
+ default:
+ break;
+ case MCFragment::FT_Align: {
+ auto Size = Asm.computeFragmentSize(Layout, J);
+ for (auto K = J.getIterator();
+ K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) {
+ --K;
+ switch (K->getKind()) {
+ default:
+ break;
+ case MCFragment::FT_Align: {
+ // Don't pad before other alignments
+ Size = 0;
+ break;
+ }
+ case MCFragment::FT_Relaxable: {
+ auto &RF = cast<MCRelaxableFragment>(*K);
+ auto &Inst = const_cast<MCInst &>(RF.getInst());
+ while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < 4) {
+ MCInst *Nop = new (Asm.getContext()) MCInst;
+ Nop->setOpcode(Hexagon::A2_nop);
+ Inst.addOperand(MCOperand::createInst(Nop));
+ Size -= 4;
+ if (!HexagonMCChecker(
+ *MCII, RF.getSubtargetInfo(), Inst, Inst,
+ *Asm.getContext().getRegisterInfo()).check()) {
+ Inst.erase(Inst.end() - 1);
+ Size = 0;
+ }
+ }
+ bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst);
+ //assert(!Error);
+ (void)Error;
+ ReplaceInstruction(Asm.getEmitter(), RF, Inst);
+ Layout.invalidateFragmentsFrom(&RF);
+ Size = 0; // Only look back one instruction
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
};
} // end anonymous namespace
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 47a6f8636276..c63f044b7128 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -227,7 +227,27 @@ namespace HexagonII {
MO_LO16, MO_HI16,
// Offset from the base of the SDA.
- MO_GPREL
+ MO_GPREL,
+
+ // MO_GDGOT - indicates GOT relative relocation for TLS
+ // GeneralDynamic method
+ MO_GDGOT,
+
+ // MO_GDPLT - indicates PLT relative relocation for TLS
+ // GeneralDynamic method
+ MO_GDPLT,
+
+ // MO_IE - indicates non PIC relocation for TLS
+ // Initial Executable method
+ MO_IE,
+
+ // MO_IEGOT - indicates PIC relocation for TLS
+ // Initial Executable method
+ MO_IEGOT,
+
+ // MO_TPREL - indicates relocation for TLS
+ // local Executable method
+ MO_TPREL
};
// Hexagon Sub-instruction classes.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index da5d4d1da69b..944e235e72f2 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -11,6 +11,7 @@
#include "MCTargetDesc/HexagonFixupKinds.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -28,8 +29,8 @@ private:
public:
HexagonELFObjectWriter(uint8_t OSABI, StringRef C);
- unsigned GetRelocType(MCValue const &Target, MCFixup const &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, MCValue const &Target,
+ MCFixup const &Fixup, bool IsPCRel) const override;
};
}
@@ -38,20 +39,61 @@ HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C)
/*HasRelocationAddend*/ true),
CPU(C) {}
-unsigned HexagonELFObjectWriter::GetRelocType(MCValue const & /*Target*/,
+unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
+ MCValue const &Target,
MCFixup const &Fixup,
bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant();
switch ((unsigned)Fixup.getKind()) {
default:
- DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n");
- llvm_unreachable("Unimplemented Fixup kind!");
- return ELF::R_HEX_NONE;
+ report_fatal_error("Unrecognized relocation type");
+ break;
case FK_Data_4:
- return (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
+ switch(Variant) {
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ return ELF::R_HEX_DTPREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_GOT:
+ return ELF::R_HEX_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_GOTREL:
+ return ELF::R_HEX_GOTREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+ return ELF::R_HEX_GD_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE:
+ return ELF::R_HEX_IE_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+ return ELF::R_HEX_IE_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+ return ELF::R_HEX_LD_GOT_32;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_PCREL:
+ return ELF::R_HEX_32_PCREL;
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ return ELF::R_HEX_TPREL_32;
+ case MCSymbolRefExpr::VariantKind::VK_None:
+ return IsPCRel ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
+ default:
+ report_fatal_error("Unrecognized variant type");
+ };
case FK_PCRel_4:
return ELF::R_HEX_32_PCREL;
case FK_Data_2:
- return ELF::R_HEX_16;
+ switch(Variant) {
+ case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+ return ELF::R_HEX_DTPREL_16;
+ case MCSymbolRefExpr::VariantKind::VK_GOT:
+ return ELF::R_HEX_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+ return ELF::R_HEX_GD_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+ return ELF::R_HEX_IE_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+ return ELF::R_HEX_LD_GOT_16;
+ case MCSymbolRefExpr::VariantKind::VK_TPREL:
+ return ELF::R_HEX_TPREL_16;
+ case MCSymbolRefExpr::VariantKind::VK_None:
+ return ELF::R_HEX_16;
+ default:
+ report_fatal_error("Unrecognized variant type");
+ };
case FK_Data_1:
return ELF::R_HEX_8;
case fixup_Hexagon_B22_PCREL:
@@ -240,6 +282,8 @@ unsigned HexagonELFObjectWriter::GetRelocType(MCValue const & /*Target*/,
return ELF::R_HEX_TPREL_16_X;
case fixup_Hexagon_TPREL_11_X:
return ELF::R_HEX_TPREL_11_X;
+ case fixup_Hexagon_23_REG:
+ return ELF::R_HEX_23_REG;
}
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
index 4bbfbec883c4..4c97ebbdd346 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
@@ -110,6 +110,7 @@ enum Fixups {
fixup_Hexagon_TPREL_32_6_X,
fixup_Hexagon_TPREL_16_X,
fixup_Hexagon_TPREL_11_X,
+ fixup_Hexagon_23_REG,
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 06ccec532211..42fcc5a6aa89 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -79,7 +79,6 @@ void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
}
if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
OS << Separator;
- Separator = " ";
MCInst ME;
ME.setOpcode(Hexagon::ENDLOOP1);
printInstruction(&ME, OS);
@@ -203,16 +202,11 @@ void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo,
void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo,
raw_ostream &O, bool hi) const {
- MCOperand const &MO = MI->getOperand(OpNo);
+ assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
O << '#' << (hi ? "HI" : "LO") << '(';
- if (MO.isImm()) {
- O << '#';
- printOperand(MI, OpNo, O);
- } else {
- printOperand(MI, OpNo, O);
- assert("Unknown symbol operand");
- }
+ O << '#';
+ printOperand(MI, OpNo, O);
O << ')';
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 51d2f1c878dc..9e2c28076432 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -32,6 +32,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
AscizDirective = "\t.string\t";
SupportsDebugInformation = true;
+ MinInstAlignment = 4;
UsesELFSectionDirectiveForBSS = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index a8456b4ead9c..efeff2436234 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -14,7 +14,6 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 46b7b41fec3b..07c9ad96a0d7 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -16,7 +16,6 @@
#include "HexagonBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/CommandLine.h"
@@ -117,6 +116,11 @@ void HexagonMCChecker::init(MCInst const& MCI) {
for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
unsigned R = MCI.getOperand(i).getReg(),
S = Hexagon::NoRegister;
+ // USR has subregisters (while C8 does not for technical reasons), so
+ // reset R to USR, since we know how to handle multiple defs of USR,
+ // taking into account its subregisters.
+ if (R == Hexagon::C8)
+ R = Hexagon::USR;
// Note register definitions, direct ones as well as indirect side-effects.
// Super-registers are not tracked directly, but their components.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index 5fc0bdeaccbb..33e22798c954 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -15,10 +15,9 @@
#ifndef HEXAGONMCCHECKER_H
#define HEXAGONMCCHECKER_H
-#include <map>
-#include <set>
-#include <queue>
#include "MCTargetDesc/HexagonMCShuffler.h"
+#include <queue>
+#include <set>
using namespace llvm;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 4b07ca7490a8..39b828d8a03a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -88,6 +88,19 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
return;
}
+static bool RegisterMatches(unsigned Consumer, unsigned Producer,
+ unsigned Producer2) {
+ if (Consumer == Producer)
+ return true;
+ if (Consumer == Producer2)
+ return true;
+ // Calculate if we're a single vector consumer referencing a double producer
+ if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+ if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
+ return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0);
+ return false;
+}
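The double-producer case relies on the V and W register enums being contiguous, so that ((Consumer - V0) >> 1) == (Producer - W0); in index terms, Wn covers V2n and V2n+1. A tiny standalone check of that pairing:

// Sketch only: works on register indices, not the generated enum values.
#include <cstdio>

static bool singleMatchesDouble(unsigned VIdx, unsigned WIdx) {
  return (VIdx >> 1) == WIdx;
}

int main() {
  std::printf("V6 vs W3: %d\n", singleMatchesDouble(6, 3)); // 1
  std::printf("V7 vs W3: %d\n", singleMatchesDouble(7, 3)); // 1
  std::printf("V8 vs W3: %d\n", singleMatchesDouble(8, 3)); // 0 (V8 is in W4)
}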
+
/// EncodeSingleInstruction - Emit a single
void HexagonMCCodeEmitter::EncodeSingleInstruction(
const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
@@ -125,8 +138,10 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
MCOperand &MCO =
HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
unsigned SOffset = 0;
+ unsigned VOffset = 0;
unsigned Register = MCO.getReg();
unsigned Register1;
+ unsigned Register2;
auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
auto i = Instructions.begin() + Index - 1;
for (;; --i) {
@@ -135,11 +150,18 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
if (HexagonMCInstrInfo::isImmext(Inst))
continue;
++SOffset;
+ if (HexagonMCInstrInfo::isVector(MCII, Inst))
+ // Vector instructions don't count scalars
+ ++VOffset;
Register1 =
HexagonMCInstrInfo::hasNewValue(MCII, Inst)
? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
: static_cast<unsigned>(Hexagon::NoRegister);
- if (Register != Register1)
+ Register2 =
+ HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ if (!RegisterMatches(Register, Register1, Register2))
// This isn't the register we're looking for
continue;
if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
@@ -153,8 +175,11 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
break;
}
// Hexagon PRM 10.11 Construct Nt from distance
- unsigned Offset = SOffset;
+ unsigned Offset =
+ HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
Offset <<= 1;
+ Offset |=
+ HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
MCO.setReg(Offset + Hexagon::R0);
}
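Read as plain arithmetic, the rewrite above turns the new-value operand into R(2 * distance + subregister bit), where the distance is the SOffset or VOffset counted by the loop; a standalone sketch with made-up inputs:

// Sketch only: distances and the subregister bit are illustrative values.
#include <cstdio>

static unsigned newValueOperandReg(unsigned Distance, unsigned SubRegBit) {
  unsigned Offset = Distance; // SOffset or VOffset in the real code
  Offset <<= 1;
  Offset |= SubRegBit;        // selects one half of a double vector producer
  return Offset;              // the real code then adds Hexagon::R0
}

int main() {
  std::printf("distance 1, bit 0 -> R%u\n", newValueOperandReg(1, 0)); // R2
  std::printf("distance 2, bit 1 -> R%u\n", newValueOperandReg(2, 1)); // R5
}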
@@ -165,7 +190,6 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
(HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
(HMB.getOpcode() != A4_ext_g))) {
- // Use a A2_nop for unimplemented instructions.
DEBUG(dbgs() << "Unimplemented inst: "
" `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
"\n");
@@ -251,7 +275,23 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
++MCNumEmitted;
}
-static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+namespace {
+void raise_relocation_error(unsigned bits, unsigned kind) {
+ std::string Text;
+ {
+ llvm::raw_string_ostream Stream(Text);
+ Stream << "Unrecognized relocation combination bits: " << bits
+ << " kind: " << kind;
+ }
+ report_fatal_error(Text);
+}
+}
+
+/// getFixupNoBits - Some insns are not extended and thus have no
+/// bits. These cases require a more brute force method for determining
+/// the correct relocation.
+namespace {
+Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
const MCOperand &MO,
const MCSymbolRefExpr::VariantKind kind) {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
@@ -259,83 +299,90 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
if (insnType == HexagonII::TypePREFIX) {
switch (kind) {
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
return Hexagon::fixup_Hexagon_GOT_32_6_X;
- case llvm::MCSymbolRefExpr::VK_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
return Hexagon::fixup_Hexagon_TPREL_32_6_X;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
return Hexagon::fixup_Hexagon_DTPREL_32_6_X;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
return Hexagon::fixup_Hexagon_GD_GOT_32_6_X;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
return Hexagon::fixup_Hexagon_LD_GOT_32_6_X;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
return Hexagon::fixup_Hexagon_IE_32_6_X;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
- default:
+ case MCSymbolRefExpr::VK_Hexagon_PCREL:
+ case MCSymbolRefExpr::VK_None:
if (MCID.isBranch())
return Hexagon::fixup_Hexagon_B32_PCREL_X;
else
return Hexagon::fixup_Hexagon_32_6_X;
+ default:
+ raise_relocation_error(0, kind);
}
} else if (MCID.isBranch())
- return (Hexagon::fixup_Hexagon_B13_PCREL);
+ return Hexagon::fixup_Hexagon_B13_PCREL;
switch (MCID.getOpcode()) {
case Hexagon::HI:
case Hexagon::A2_tfrih:
switch (kind) {
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
return Hexagon::fixup_Hexagon_GOT_HI16;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
return Hexagon::fixup_Hexagon_GOTREL_HI16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
return Hexagon::fixup_Hexagon_GD_GOT_HI16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
return Hexagon::fixup_Hexagon_LD_GOT_HI16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
return Hexagon::fixup_Hexagon_IE_HI16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
return Hexagon::fixup_Hexagon_IE_GOT_HI16;
- case llvm::MCSymbolRefExpr::VK_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
return Hexagon::fixup_Hexagon_TPREL_HI16;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
return Hexagon::fixup_Hexagon_DTPREL_HI16;
- default:
+ case MCSymbolRefExpr::VK_None:
return Hexagon::fixup_Hexagon_HI16;
+ default:
+ raise_relocation_error(0, kind);
}
case Hexagon::LO:
case Hexagon::A2_tfril:
switch (kind) {
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
return Hexagon::fixup_Hexagon_GOT_LO16;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
return Hexagon::fixup_Hexagon_GOTREL_LO16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
return Hexagon::fixup_Hexagon_GD_GOT_LO16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
return Hexagon::fixup_Hexagon_LD_GOT_LO16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
return Hexagon::fixup_Hexagon_IE_LO16;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
return Hexagon::fixup_Hexagon_IE_GOT_LO16;
- case llvm::MCSymbolRefExpr::VK_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
return Hexagon::fixup_Hexagon_TPREL_LO16;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
return Hexagon::fixup_Hexagon_DTPREL_LO16;
- default:
+ case MCSymbolRefExpr::VK_None:
return Hexagon::fixup_Hexagon_LO16;
+ default:
+ raise_relocation_error(0, kind);
}
// The only relocs left should be GP relative:
default:
if (MCID.mayStore() || MCID.mayLoad()) {
- for (const MCPhysReg *ImpUses = MCID.getImplicitUses();
- ImpUses && *ImpUses; ++ImpUses) {
+ for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
+ ++ImpUses) {
if (*ImpUses != Hexagon::GP)
continue;
switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
@@ -348,14 +395,14 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
case HexagonII::MemAccessSize::DoubleWordAccess:
return fixup_Hexagon_GPREL16_3;
default:
- llvm_unreachable("unhandled fixup");
+ raise_relocation_error(0, kind);
}
}
- } else
- llvm_unreachable("unhandled fixup");
+ }
+ raise_relocation_error(0, kind);
}
-
- return LastTargetFixupKind;
+ llvm_unreachable("Relocation exit not taken");
+}
}
namespace llvm {
@@ -395,23 +442,18 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
const MCSubtargetInfo &STI) const
{
- int64_t Res;
-
- if (ME->evaluateAsAbsolute(Res))
- return Res;
-
- MCExpr::ExprKind MK = ME->getKind();
- if (MK == MCExpr::Constant) {
- return cast<MCConstantExpr>(ME)->getValue();
- }
- if (MK == MCExpr::Binary) {
- getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI);
- getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI);
+ if (isa<HexagonMCExpr>(ME))
+ ME = &HexagonMCInstrInfo::getExpr(*ME);
+ int64_t Value;
+ if (ME->evaluateAsAbsolute(Value))
+ return Value;
+ assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+ if (ME->getKind() == MCExpr::Binary) {
+ MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
+ getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
+ getExprOpValue(MI, MO, Binary->getRHS(), Fixups, STI);
return 0;
}
-
- assert(MK == MCExpr::SymbolRef);
-
Hexagon::Fixups FixupKind =
Hexagon::Fixups(Hexagon::fixup_Hexagon_TPREL_LO16);
const MCSymbolRefExpr *MCSRE = static_cast<const MCSymbolRefExpr *>(ME);
@@ -430,275 +472,302 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
switch (bits) {
default:
- DEBUG(dbgs() << "unrecognized bit count of " << bits << '\n');
- break;
-
+ raise_relocation_error(bits, kind);
case 32:
switch (kind) {
- case llvm::MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
+ : Hexagon::fixup_Hexagon_DTPREL_32;
break;
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOT_32_6_X
: Hexagon::fixup_Hexagon_GOT_32;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOTREL_32_6_X
: Hexagon::fixup_Hexagon_GOTREL_32;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_GOT_32_6_X
: Hexagon::fixup_Hexagon_GD_GOT_32;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
- : Hexagon::fixup_Hexagon_LD_GOT_32;
- break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_32_6_X
: Hexagon::fixup_Hexagon_IE_32;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_GOT_32_6_X
: Hexagon::fixup_Hexagon_IE_GOT_32;
break;
- case llvm::MCSymbolRefExpr::VK_TPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
- : Hexagon::fixup_Hexagon_TPREL_32;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
+ : Hexagon::fixup_Hexagon_LD_GOT_32;
break;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
- : Hexagon::fixup_Hexagon_DTPREL_32;
+ case MCSymbolRefExpr::VK_Hexagon_PCREL:
+ FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
break;
- default:
+ case MCSymbolRefExpr::VK_None:
FixupKind =
*Extended ? Hexagon::fixup_Hexagon_32_6_X : Hexagon::fixup_Hexagon_32;
break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
+ : Hexagon::fixup_Hexagon_TPREL_32;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
}
break;
case 22:
switch (kind) {
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
break;
- default:
- if (MCID.isBranch() || MCID.isCall()) {
- FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
- : Hexagon::fixup_Hexagon_B22_PCREL;
- } else {
- errs() << "unrecognized relocation, bits: " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
+ : Hexagon::fixup_Hexagon_B22_PCREL;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_PLT_B22_PCREL;
break;
+ default:
+ raise_relocation_error(bits, kind);
}
break;
case 16:
if (*Extended) {
switch (kind) {
- default:
- FixupKind = Hexagon::fixup_Hexagon_16_X;
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
- break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
FixupKind = Hexagon::fixup_Hexagon_IE_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_TPREL:
- FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_16_X;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
break;
+ default:
+ raise_relocation_error(bits, kind);
}
} else
switch (kind) {
- default:
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
+ case MCSymbolRefExpr::VK_None: {
+ if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
+ FixupKind = Hexagon::fixup_Hexagon_23_REG;
+ else
+ raise_relocation_error(bits, kind);
+ break;
+ }
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
- if ((MCID.getOpcode() == Hexagon::HI) ||
- (MCID.getOpcode() == Hexagon::LO_H))
+ case MCSymbolRefExpr::VK_GOTREL:
+ if (MCID.getOpcode() == Hexagon::HI)
FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
else
FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GPREL:
- FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LO16:
- FixupKind = Hexagon::fixup_Hexagon_LO16;
+ case MCSymbolRefExpr::VK_Hexagon_GPREL:
+ FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_HI16:
+ case MCSymbolRefExpr::VK_Hexagon_HI16:
FixupKind = Hexagon::fixup_Hexagon_HI16;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
- FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
+ case MCSymbolRefExpr::VK_Hexagon_LO16:
+ FixupKind = Hexagon::fixup_Hexagon_LO16;
break;
- case llvm::MCSymbolRefExpr::VK_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
FixupKind = Hexagon::fixup_Hexagon_TPREL_16;
break;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
- break;
+ default:
+ raise_relocation_error(bits, kind);
}
break;
case 15:
- if (MCID.isBranch() || MCID.isCall())
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_B15_PCREL_X
: Hexagon::fixup_Hexagon_B15_PCREL;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
break;
case 13:
- if (MCID.isBranch())
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
FixupKind = Hexagon::fixup_Hexagon_B13_PCREL;
- else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
+ break;
+ default:
+ raise_relocation_error(bits, kind);
}
break;
case 12:
if (*Extended)
switch (kind) {
- default:
- FixupKind = Hexagon::fixup_Hexagon_12_X;
- break;
// There isn't a GOT_12_X, both 11_X and 16_X resolve to 6/26
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_12_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
}
- else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ else
+ raise_relocation_error(bits, kind);
break;
case 11:
if (*Extended)
switch (kind) {
- default:
- FixupKind = Hexagon::fixup_Hexagon_11_X;
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
FixupKind = Hexagon::fixup_Hexagon_GD_GOT_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_IE_GOT:
- FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_DTPREL:
- FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
- break;
+ default:
+ raise_relocation_error(bits, kind);
}
else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
+ switch (kind) {
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
}
break;
case 10:
- if (*Extended)
- FixupKind = Hexagon::fixup_Hexagon_10_X;
+ if (*Extended) {
+ switch (kind) {
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_10_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ } else
+ raise_relocation_error(bits, kind);
break;
case 9:
if (MCID.isBranch() ||
- (llvm::HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+ (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
FixupKind = *Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
: Hexagon::fixup_Hexagon_B9_PCREL;
else if (*Extended)
FixupKind = Hexagon::fixup_Hexagon_9_X;
- else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ else
+ raise_relocation_error(bits, kind);
break;
case 8:
if (*Extended)
FixupKind = Hexagon::fixup_Hexagon_8_X;
- else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ else
+ raise_relocation_error(bits, kind);
break;
case 7:
if (MCID.isBranch() ||
- (llvm::HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+ (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
FixupKind = *Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
: Hexagon::fixup_Hexagon_B7_PCREL;
else if (*Extended)
FixupKind = Hexagon::fixup_Hexagon_7_X;
- else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ else
+ raise_relocation_error(bits, kind);
break;
case 6:
if (*Extended) {
switch (kind) {
- default:
- FixupKind = Hexagon::fixup_Hexagon_6_X;
- break;
- case llvm::MCSymbolRefExpr::VK_Hexagon_PCREL:
- FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
+ case MCSymbolRefExpr::VK_DTPREL:
+ FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
break;
// This is part of an extender, GOT_11 is a
// Word32_U6 unsigned/truncated reloc.
- case llvm::MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_GOT:
FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
break;
- case llvm::MCSymbolRefExpr::VK_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTREL:
FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
break;
+ case MCSymbolRefExpr::VK_Hexagon_PCREL:
+ FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Hexagon::fixup_Hexagon_6_X;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
}
- } else {
- errs() << "unrecognized relocation, bits " << bits << "\n";
- errs() << "name = " << HexagonMCInstrInfo::getName(MCII, MI) << "\n";
- }
+ } else
+ raise_relocation_error(bits, kind);
break;
case 0:
@@ -706,29 +775,39 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
break;
}
- MCExpr const *FixupExpression = (*Addend > 0 && isPCRel(FixupKind)) ?
- MCBinaryExpr::createAdd(MO.getExpr(),
- MCConstantExpr::create(*Addend, MCT), MCT) :
- MO.getExpr();
+ MCExpr const *FixupExpression =
+ (*Addend > 0 && isPCRel(FixupKind))
+ ? MCBinaryExpr::createAdd(MO.getExpr(),
+ MCConstantExpr::create(*Addend, MCT), MCT)
+ : MO.getExpr();
- MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
+ MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
MCFixupKind(FixupKind), MI.getLoc());
Fixups.push_back(fixup);
// All of the information is in the fixup.
- return (0);
+ return 0;
}
unsigned
HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const {
- if (MO.isReg())
- return MCT.getRegisterInfo()->getEncodingValue(MO.getReg());
- if (MO.isImm())
- return static_cast<unsigned>(MO.getImm());
+ assert(!MO.isImm());
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (HexagonMCInstrInfo::isSubInstruction(MI))
+ return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
+ switch(MI.getOpcode()){
+ case Hexagon::A2_tfrrcr:
+ case Hexagon::A2_tfrcrr:
+ if(Reg == Hexagon::M0)
+ Reg = Hexagon::C6;
+ if(Reg == Hexagon::M1)
+ Reg = Hexagon::C7;
+ }
+ return MCT.getRegisterInfo()->getEncodingValue(Reg);
+ }
- // MO must be an ME.
- assert(MO.isExpr());
return getExprOpValue(MI, MO, MO.getExpr(), Fixups, STI);
}
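
A minimal standalone sketch of the pattern the hunks above introduce (illustrative names and fixup ids, not the LLVM sources): every (bit width, variant kind) pair must map to an explicit fixup, and any combination that is not listed aborts through the relocation-error path instead of silently falling back to a default relocation.

    #include <cstdio>
    #include <cstdlib>

    enum class Variant { None, Got, GotRel, TpRel };

    [[noreturn]] static void raiseRelocationError(unsigned Bits, Variant K) {
      std::fprintf(stderr, "Unrecognized relocation combination: width=%u kind=%d\n",
                   Bits, static_cast<int>(K));
      std::exit(1);
    }

    static unsigned select32(Variant K) {
      switch (K) {
      case Variant::None:   return 1; // stands in for fixup_Hexagon_32
      case Variant::Got:    return 2; // stands in for fixup_Hexagon_GOT_32
      case Variant::GotRel: return 3; // stands in for fixup_Hexagon_GOTREL_32
      default:              raiseRelocationError(32, K);
      }
    }

    static unsigned selectFixup(unsigned Bits, Variant K) {
      switch (Bits) {
      case 32: return select32(K);
      default: raiseRelocationError(Bits, K); // e.g. a kind with an unsupported width
      }
    }

    int main() {
      std::printf("fixup id: %u\n", selectFixup(32, Variant::Got));
    }
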
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index e6194f61a6ba..88336217cc8d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -80,9 +80,6 @@ static const std::pair<unsigned, unsigned> opcodeData[] = {
std::make_pair((unsigned)V4_SS2_storewi0, 4096),
std::make_pair((unsigned)V4_SS2_storewi1, 4352)};
-static std::map<unsigned, unsigned>
- subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData));
-
bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
switch (Ga) {
case HexagonII::HSIG_None:
@@ -587,6 +584,9 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
unsigned MIaG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIa),
MIbG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIb);
+ static std::map<unsigned, unsigned> subinstOpcodeMap(std::begin(opcodeData),
+ std::end(opcodeData));
+
// If a duplex contains 2 insns in the same group, the insns must be
// ordered such that the numerically smaller opcode is in slot 1.
if ((MIaG != HexagonII::HSIG_None) && (MIaG == MIbG) && bisReversable) {
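
The HexagonMCDuplexInfo.cpp hunk above only moves the opcode map from namespace scope into the function that uses it; a minimal sketch of the same idiom (made-up table data, not the real opcode list) shows the effect: the map is built lazily on first use rather than at program start-up.

    #include <cstdio>
    #include <iterator>
    #include <map>
    #include <utility>

    static const std::pair<unsigned, unsigned> opcodeData[] = {
        {10, 4096}, {11, 4352}, {12, 4608}};

    static unsigned lookupEncoding(unsigned Opcode) {
      // Constructed exactly once, on the first call that actually needs it.
      static const std::map<unsigned, unsigned> subinstOpcodeMap(
          std::begin(opcodeData), std::end(opcodeData));
      auto It = subinstOpcodeMap.find(Opcode);
      return It == subinstOpcodeMap.end() ? 0 : It->second;
    }

    int main() { std::printf("%u\n", lookupEncoding(11)); }
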
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index eaa3550d07f6..67dcb8fea739 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -107,15 +107,20 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
((AccessSize == 0) || (Size == 0) || (Size > GPSize))
? ".bss"
: sbss[(Log2_64(AccessSize))];
-
- MCSection *CrntSection = getCurrentSection().first;
- MCSection *Section = getAssembler().getContext().getELFSection(
+ MCSection &Section = *getAssembler().getContext().getELFSection(
SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
- SwitchSection(Section);
- AssignFragment(Symbol, getCurrentFragment());
+ MCSectionSubPair P = getCurrentSection();
+ SwitchSection(&Section);
+
+ EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+ EmitLabel(Symbol);
+ EmitZeros(Size);
+
+ // Update the maximum alignment of the section if necessary.
+ if (ByteAlignment > Section.getAlignment())
+ Section.setAlignment(ByteAlignment);
- MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment);
- SwitchSection(CrntSection);
+ SwitchSection(P.first, P.second);
} else {
if (ELFSymbol->declareCommon(Size, ByteAlignment))
report_fatal_error("Symbol: " + Symbol->getName() +
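
A standalone sketch of the small-data placement decision visible in the context above (the .sbss section names and the GP size value are assumptions for illustration): symbols with no known access size, zero size, or a size larger than the GP-relative area go to .bss; otherwise the .sbss variant is chosen by the log2 of the access size.

    #include <cstdint>
    #include <cstdio>
    #include <string>

    static unsigned log2_64(uint64_t V) {
      unsigned L = 0;
      while (V >>= 1)
        ++L;
      return L;
    }

    static std::string pickSection(uint64_t Size, uint64_t AccessSize,
                                   uint64_t GPSize) {
      static const char *SBSS[] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"};
      if (AccessSize == 0 || Size == 0 || Size > GPSize)
        return ".bss";
      return SBSS[log2_64(AccessSize)];
    }

    int main() {
      std::printf("%s\n", pickSection(4, 4, 8).c_str());  // .sbss.4
      std::printf("%s\n", pickSection(64, 8, 8).c_str()); // .bss
    }
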
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index fc6262657514..e93906a0a396 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -10,6 +10,7 @@
#include "HexagonMCExpr.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/raw_ostream.h"
@@ -17,33 +18,61 @@ using namespace llvm;
#define DEBUG_TYPE "hexagon-mcexpr"
-HexagonNoExtendOperand *HexagonNoExtendOperand::Create(MCExpr const *Expr,
- MCContext &Ctx) {
- return new (Ctx) HexagonNoExtendOperand(Expr);
+HexagonMCExpr *HexagonMCExpr::create(MCExpr const *Expr, MCContext &Ctx) {
+ return new (Ctx) HexagonMCExpr(Expr);
}
-bool HexagonNoExtendOperand::evaluateAsRelocatableImpl(
- MCValue &Res, MCAsmLayout const *Layout, MCFixup const *Fixup) const {
+bool HexagonMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ MCAsmLayout const *Layout,
+ MCFixup const *Fixup) const {
return Expr->evaluateAsRelocatable(Res, Layout, Fixup);
}
-void HexagonNoExtendOperand::visitUsedExpr(MCStreamer &Streamer) const {}
+void HexagonMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*Expr);
+}
-MCFragment *llvm::HexagonNoExtendOperand::findAssociatedFragment() const {
+MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
return Expr->findAssociatedFragment();
}
-void HexagonNoExtendOperand::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+MCExpr const *HexagonMCExpr::getExpr() const { return Expr; }
+
+void HexagonMCExpr::setMustExtend(bool Val) {
+ assert((!Val || !MustNotExtend) && "Extension contradiction");
+ MustExtend = Val;
+}
+
+bool HexagonMCExpr::mustExtend() const { return MustExtend; }
+void HexagonMCExpr::setMustNotExtend(bool Val) {
+ assert((!Val || !MustExtend) && "Extension contradiction");
+ MustNotExtend = Val;
+}
+bool HexagonMCExpr::mustNotExtend() const { return MustNotExtend; }
-MCExpr const *HexagonNoExtendOperand::getExpr() const { return Expr; }
+bool HexagonMCExpr::s23_2_reloc() const { return S23_2_reloc; }
+void HexagonMCExpr::setS23_2_reloc(bool Val) {
+ S23_2_reloc = Val;
+}
-bool HexagonNoExtendOperand::classof(MCExpr const *E) {
+bool HexagonMCExpr::classof(MCExpr const *E) {
return E->getKind() == MCExpr::Target;
}
-HexagonNoExtendOperand::HexagonNoExtendOperand(MCExpr const *Expr)
- : Expr(Expr) {}
+HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
+ : Expr(Expr), MustNotExtend(false), MustExtend(false), S23_2_reloc(false),
+ SignMismatch(false) {}
-void HexagonNoExtendOperand::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
Expr->print(OS, MAI);
}
+
+void HexagonMCExpr::setSignMismatch(bool Val) {
+ SignMismatch = Val;
+}
+
+bool HexagonMCExpr::signMismatch() const {
+ return SignMismatch;
+}
\ No newline at end of file
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
index 60f180fb2bc4..bca40cfaf6f4 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -14,9 +14,9 @@
namespace llvm {
class MCInst;
-class HexagonNoExtendOperand : public MCTargetExpr {
+class HexagonMCExpr : public MCTargetExpr {
public:
- static HexagonNoExtendOperand *Create(MCExpr const *Expr, MCContext &Ctx);
+ static HexagonMCExpr *create(MCExpr const *Expr, MCContext &Ctx);
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
@@ -25,10 +25,22 @@ public:
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
static bool classof(MCExpr const *E);
MCExpr const *getExpr() const;
+ void setMustExtend(bool Val = true);
+ bool mustExtend() const;
+ void setMustNotExtend(bool Val = true);
+ bool mustNotExtend() const;
+ void setS23_2_reloc(bool Val = true);
+ bool s23_2_reloc() const;
+ void setSignMismatch(bool Val = true);
+ bool signMismatch() const;
private:
- HexagonNoExtendOperand(MCExpr const *Expr);
+ HexagonMCExpr(MCExpr const *Expr);
MCExpr const *Expr;
+ bool MustNotExtend;
+ bool MustExtend;
+ bool S23_2_reloc;
+ bool SignMismatch;
};
} // end namespace llvm
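
A minimal sketch of the invariant the renamed HexagonMCExpr enforces (simplified standalone class, not the LLVM one): the wrapper carries per-operand annotations, and the two extension flags are kept mutually exclusive by assertion.

    #include <cassert>
    #include <cstdio>

    class AnnotatedExpr {
    public:
      AnnotatedExpr() : MustNotExtend(false), MustExtend(false) {}

      void setMustExtend(bool Val = true) {
        assert((!Val || !MustNotExtend) && "Extension contradiction");
        MustExtend = Val;
      }
      void setMustNotExtend(bool Val = true) {
        assert((!Val || !MustExtend) && "Extension contradiction");
        MustNotExtend = Val;
      }
      bool mustExtend() const { return MustExtend; }
      bool mustNotExtend() const { return MustNotExtend; }

    private:
      bool MustNotExtend;
      bool MustExtend;
    };

    int main() {
      AnnotatedExpr E;
      E.setMustExtend();
      // E.setMustNotExtend(); // would trip the "Extension contradiction" assert
      std::printf("must extend: %d, must not extend: %d\n",
                  E.mustExtend(), E.mustNotExtend());
    }
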
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index e6842076db2a..941cbd6dc35d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -99,7 +99,8 @@ void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
int64_t Value;
if (exOp.getExpr()->evaluateAsAbsolute(Value)) {
unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
- exOp.setExpr(MCConstantExpr::create((Value & 0x3f) << Shift, Context));
+ exOp.setExpr(HexagonMCExpr::create(
+ MCConstantExpr::create((Value & 0x3f) << Shift, Context), Context));
}
}
@@ -159,8 +160,8 @@ MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context,
MCInstrInfo const &MCII, MCInst &MCB,
- MCInst const &MCI, bool MustExtend) {
- if (isConstExtended(MCII, MCI) || MustExtend)
+ MCInst const &MCI) {
+ if (isConstExtended(MCII, MCI))
addConstExtender(Context, MCII, MCB, MCI);
}
@@ -190,6 +191,61 @@ MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
return (MCII.get(MCI.getOpcode()));
}
+unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
+ using namespace Hexagon;
+ switch (Reg) {
+ default:
+ llvm_unreachable("unknown duplex register");
+ // Rs Rss
+ case R0:
+ case D0:
+ return 0;
+ case R1:
+ case D1:
+ return 1;
+ case R2:
+ case D2:
+ return 2;
+ case R3:
+ case D3:
+ return 3;
+ case R4:
+ case D8:
+ return 4;
+ case R5:
+ case D9:
+ return 5;
+ case R6:
+ case D10:
+ return 6;
+ case R7:
+ case D11:
+ return 7;
+ case R16:
+ return 8;
+ case R17:
+ return 9;
+ case R18:
+ return 10;
+ case R19:
+ return 11;
+ case R20:
+ return 12;
+ case R21:
+ return 13;
+ case R22:
+ return 14;
+ case R23:
+ return 15;
+ }
+}
+
+MCExpr const &HexagonMCInstrInfo::getExpr(MCExpr const &Expr) {
+ const auto &HExpr = cast<HexagonMCExpr>(Expr);
+ assert(HExpr.getExpr());
+ return *HExpr.getExpr();
+}
+
unsigned short HexagonMCInstrInfo::getExtendableOp(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -401,6 +457,12 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
MCInst const &MCI) {
if (HexagonMCInstrInfo::isExtended(MCII, MCI))
return true;
+ if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
+ return false;
+ MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI);
+ if (isa<HexagonMCExpr>(MO.getExpr()) &&
+ HexagonMCInstrInfo::mustExtend(*MO.getExpr()))
+ return true;
// Branch insns are handled as necessary by relaxation.
if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
(HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
@@ -412,18 +474,11 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) &&
(MCI.getOpcode() != Hexagon::C4_addipc))
return false;
- else if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
- return false;
- MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI);
-
- // We could be using an instruction with an extendable immediate and shoehorn
- // a global address into it. If it is a global address it will be constant
- // extended. We do this for COMBINE.
- // We currently only handle isGlobal() because it is the only kind of
- // object we are going to end up with here for now.
- // In the future we probably should add isSymbol(), etc.
assert(!MO.isImm());
+ if (isa<HexagonMCExpr>(MO.getExpr()) &&
+ HexagonMCInstrInfo::mustNotExtend(*MO.getExpr()))
+ return false;
int64_t Value;
if (!MO.getExpr()->evaluateAsAbsolute(Value))
return true;
@@ -543,6 +598,66 @@ bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) {
return (Flags & memStoreReorderEnabledMask) != 0;
}
+bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
+ switch (MCI.getOpcode()) {
+ default:
+ return false;
+ case Hexagon::V4_SA1_addi:
+ case Hexagon::V4_SA1_addrx:
+ case Hexagon::V4_SA1_addsp:
+ case Hexagon::V4_SA1_and1:
+ case Hexagon::V4_SA1_clrf:
+ case Hexagon::V4_SA1_clrfnew:
+ case Hexagon::V4_SA1_clrt:
+ case Hexagon::V4_SA1_clrtnew:
+ case Hexagon::V4_SA1_cmpeqi:
+ case Hexagon::V4_SA1_combine0i:
+ case Hexagon::V4_SA1_combine1i:
+ case Hexagon::V4_SA1_combine2i:
+ case Hexagon::V4_SA1_combine3i:
+ case Hexagon::V4_SA1_combinerz:
+ case Hexagon::V4_SA1_combinezr:
+ case Hexagon::V4_SA1_dec:
+ case Hexagon::V4_SA1_inc:
+ case Hexagon::V4_SA1_seti:
+ case Hexagon::V4_SA1_setin1:
+ case Hexagon::V4_SA1_sxtb:
+ case Hexagon::V4_SA1_sxth:
+ case Hexagon::V4_SA1_tfr:
+ case Hexagon::V4_SA1_zxtb:
+ case Hexagon::V4_SA1_zxth:
+ case Hexagon::V4_SL1_loadri_io:
+ case Hexagon::V4_SL1_loadrub_io:
+ case Hexagon::V4_SL2_deallocframe:
+ case Hexagon::V4_SL2_jumpr31:
+ case Hexagon::V4_SL2_jumpr31_f:
+ case Hexagon::V4_SL2_jumpr31_fnew:
+ case Hexagon::V4_SL2_jumpr31_t:
+ case Hexagon::V4_SL2_jumpr31_tnew:
+ case Hexagon::V4_SL2_loadrb_io:
+ case Hexagon::V4_SL2_loadrd_sp:
+ case Hexagon::V4_SL2_loadrh_io:
+ case Hexagon::V4_SL2_loadri_sp:
+ case Hexagon::V4_SL2_loadruh_io:
+ case Hexagon::V4_SL2_return:
+ case Hexagon::V4_SL2_return_f:
+ case Hexagon::V4_SL2_return_fnew:
+ case Hexagon::V4_SL2_return_t:
+ case Hexagon::V4_SL2_return_tnew:
+ case Hexagon::V4_SS1_storeb_io:
+ case Hexagon::V4_SS1_storew_io:
+ case Hexagon::V4_SS2_allocframe:
+ case Hexagon::V4_SS2_storebi0:
+ case Hexagon::V4_SS2_storebi1:
+ case Hexagon::V4_SS2_stored_sp:
+ case Hexagon::V4_SS2_storeh_io:
+ case Hexagon::V4_SS2_storew_sp:
+ case Hexagon::V4_SS2_storewi0:
+ case Hexagon::V4_SS2_storewi1:
+ return true;
+ }
+}
+
bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
@@ -575,6 +690,25 @@ int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
return Value;
}
+void HexagonMCInstrInfo::setMustExtend(MCExpr const &Expr, bool Val) {
+ HexagonMCExpr &HExpr = const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+ HExpr.setMustExtend(Val);
+}
+
+bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
+ HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
+ return HExpr.mustExtend();
+}
+void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
+ HexagonMCExpr &HExpr =
+ const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+ HExpr.setMustNotExtend(Val);
+}
+bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
+ HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
+ return HExpr.mustNotExtend();
+}
+
void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
@@ -639,10 +773,32 @@ void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
assert(isMemStoreReorderEnabled(MCI));
}
+void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
+ HexagonMCExpr &HExpr =
+ const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
+ HExpr.setS23_2_reloc(Val);
+}
+bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
+ HexagonMCExpr const &HExpr = *llvm::cast<HexagonMCExpr>(&Expr);
+ return HExpr.s23_2_reloc();
+}
void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
assert(isBundle(MCI));
MCOperand &Operand = MCI.getOperand(0);
Operand.setImm(Operand.getImm() | outerLoopMask);
}
+
+unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
+ unsigned Producer,
+ unsigned Producer2) {
+ // If we're a single vector consumer of a double producer, set subreg bit
+ // based on if we're accessing the lower or upper register component
+ if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+ if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
+ return (Consumer - Hexagon::V0) & 0x1;
+ if (Consumer == Producer2)
+ return 0x1;
+ return 0;
+}
}
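
The table in getDuplexRegisterNumbering above collapses to simple arithmetic for the scalar registers; a standalone sketch (plain integers instead of LLVM register enums): R0..R7 encode as 0..7 and R16..R23 as 8..15, matching the 4-bit register field available to a sub-instruction.

    #include <cassert>
    #include <cstdio>

    static unsigned duplexRegisterNumbering(unsigned RegNo) {
      if (RegNo <= 7)
        return RegNo;   // R0..R7 -> 0..7
      assert(RegNo >= 16 && RegNo <= 23 && "not encodable in a sub-instruction");
      return RegNo - 8; // R16..R23 -> 8..15
    }

    int main() {
      for (unsigned R : {0u, 7u, 16u, 23u})
        std::printf("R%u -> %u\n", R, duplexRegisterNumbering(R));
    }
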
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 0237b2884a3b..58a8f68b9847 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -75,7 +75,7 @@ MCInst createBundle();
// Return the extender for instruction at Index or nullptr if none
MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
- MCInst const &MCI, bool MustExtend);
+ MCInst const &MCI);
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
@@ -107,6 +107,9 @@ unsigned getDuplexCandidateGroup(MCInst const &MI);
// Return a list of all possible instruction duplex combinations
SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
MCInst const &MCB);
+unsigned getDuplexRegisterNumbering(unsigned Reg);
+
+MCExpr const &getExpr(MCExpr const &Expr);
// Return the index of the extendable operand
unsigned short getExtendableOp(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -260,7 +263,10 @@ bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI);
/// Return whether the insn can be packaged only with an A-type insn in slot #1.
bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isSubInstruction(MCInst const &MCI);
bool isVector(MCInstrInfo const &MCII, MCInst const &MCI);
+bool mustExtend(MCExpr const &Expr);
+bool mustNotExtend(MCExpr const &Expr);
// Pad the bundle with nops to satisfy endloop requirements
void padEndloop(MCContext &Context, MCInst &MCI);
@@ -270,16 +276,22 @@ bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
// Replace the instructions inside MCB, represented by Candidate
void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+bool s23_2_reloc(MCExpr const &Expr);
// Marks a bundle as endloop0
void setInnerLoop(MCInst &MCI);
void setMemReorderDisabled(MCInst &MCI);
void setMemStoreReorderEnabled(MCInst &MCI);
+void setMustExtend(MCExpr const &Expr, bool Val = true);
+void setMustNotExtend(MCExpr const &Expr, bool Val = true);
+void setS23_2_reloc(MCExpr const &Expr, bool Val = true);
// Marks a bundle as endloop1
void setOuterLoop(MCInst &MCI);
// Would duplexing this instruction create a requirement to extend
bool subInstWouldBeExtended(MCInst const &potentialDuplex);
+unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
+ unsigned Producer2);
// Attempt to find and replace compound pairs
void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 8e70280c1a0d..7f8e7a4edb0c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -180,7 +180,6 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
if (MCS.size() == 1) { // case of one duplex
// copy the created duplex in the shuffler to the bundle
MCS.copyTo(MCB);
- doneShuffling = true;
return HexagonShuffler::SHUFFLE_SUCCESS;
}
// try shuffle with this duplex
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 9a292577a8f3..35a1a23a8892 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "HexagonMCAsmInfo.h"
#include "HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -48,10 +47,46 @@ cl::opt<bool> llvm::HexagonDisableDuplex
("mno-pairing",
cl::desc("Disable looking for duplex instructions for Hexagon"));
+static cl::opt<bool> HexagonV4ArchVariant("mv4", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V4"));
+
+static cl::opt<bool> HexagonV5ArchVariant("mv5", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V5"));
+
+static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V55"));
+
+static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V60"));
+
+
+static StringRef DefaultArch = "hexagonv60";
+
+static StringRef HexagonGetArchVariant() {
+ if (HexagonV4ArchVariant)
+ return "hexagonv4";
+ if (HexagonV5ArchVariant)
+ return "hexagonv5";
+ if (HexagonV55ArchVariant)
+ return "hexagonv55";
+ if (HexagonV60ArchVariant)
+ return "hexagonv60";
+ return "";
+}
+
StringRef HEXAGON_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
- if (CPU.empty())
- CPU = "hexagonv60";
- return CPU;
+ StringRef ArchV = HexagonGetArchVariant();
+ if (!ArchV.empty() && !CPU.empty()) {
+ if (ArchV != CPU)
+ report_fatal_error("conflicting architectures specified.");
+ return CPU;
+ }
+ if (ArchV.empty()) {
+ if (CPU.empty())
+ CPU = DefaultArch;
+ return CPU;
+ }
+ return ArchV;
}
MCInstrInfo *llvm::createHexagonMCInstrInfo() {
@@ -62,7 +97,7 @@ MCInstrInfo *llvm::createHexagonMCInstrInfo() {
static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitHexagonMCRegisterInfo(X, Hexagon::R0);
+ InitHexagonMCRegisterInfo(X, Hexagon::R31);
return X;
}
@@ -121,10 +156,14 @@ public:
HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
: HexagonTargetStreamer(S) {
auto Bits = STI.getFeatureBits();
- unsigned Flags;
- if (Bits.to_ullong() & llvm::Hexagon::ArchV5)
+ unsigned Flags = 0;
+ if (Bits[Hexagon::ArchV60])
+ Flags = ELF::EF_HEXAGON_MACH_V60;
+ else if (Bits[Hexagon::ArchV55])
+ Flags = ELF::EF_HEXAGON_MACH_V55;
+ else if (Bits[Hexagon::ArchV5])
Flags = ELF::EF_HEXAGON_MACH_V5;
- else
+ else if (Bits[Hexagon::ArchV4])
Flags = ELF::EF_HEXAGON_MACH_V4;
getStreamer().getAssembler().setELFHeaderEFlags(Flags);
}
@@ -159,17 +198,6 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (RM == Reloc::Default)
- RM = Reloc::Static;
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -204,10 +232,6 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(TheHexagonTarget, createHexagonMCAsmInfo);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheHexagonTarget,
- createHexagonMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget,
createHexagonMCInstrInfo);
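
A standalone sketch of the CPU-selection policy added in selectHexagonCPU above (illustrative function, not the LLVM entry point): an explicit -mvN flag and an explicit CPU string must agree, whichever one is present alone wins, and the default architecture is used only when neither is given.

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static const std::string DefaultArch = "hexagonv60";

    static std::string selectCPU(const std::string &ArchFlag, // from -mvN, may be ""
                                 std::string CPU) {           // from -mcpu, may be ""
      if (!ArchFlag.empty() && !CPU.empty()) {
        if (ArchFlag != CPU) {
          std::fprintf(stderr, "conflicting architectures specified.\n");
          std::exit(1);
        }
        return CPU;
      }
      if (ArchFlag.empty())
        return CPU.empty() ? DefaultArch : CPU;
      return ArchFlag;
    }

    int main() {
      std::printf("%s\n", selectCPU("", "").c_str());           // hexagonv60
      std::printf("%s\n", selectCPU("hexagonv55", "").c_str()); // hexagonv55
      std::printf("%s\n", selectCPU("", "hexagonv5").c_str());  // hexagonv5
    }
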
diff --git a/lib/Target/Hexagon/MCTargetDesc/Makefile b/lib/Target/Hexagon/MCTargetDesc/Makefile
deleted file mode 100644
index 885be2ddbd88..000000000000
--- a/lib/Target/Hexagon/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Hexagon/TargetDesc/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMHexagonDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/Makefile b/lib/Target/Hexagon/Makefile
deleted file mode 100644
index c53b8e56aafc..000000000000
--- a/lib/Target/Hexagon/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- lib/Target/Hexagon/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../..
-LIBRARYNAME = LLVMHexagonCodeGen
-TARGET = Hexagon
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = HexagonGenRegisterInfo.inc \
- HexagonGenInstrInfo.inc \
- HexagonGenAsmMatcher.inc \
- HexagonGenAsmWriter.inc \
- HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \
- HexagonGenCallingConv.inc \
- HexagonGenDFAPacketizer.inc \
- HexagonGenMCCodeEmitter.inc \
- HexagonGenDisassemblerTables.inc
-
-DIRS = TargetInfo MCTargetDesc Disassembler AsmParser
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index c547c7195075..61a83dada218 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -7,37 +7,85 @@
//
//===----------------------------------------------------------------------===//
//
-// Simplistic RDF-based copy propagation.
+// RDF-based copy propagation.
#include "RDFCopy.h"
#include "RDFGraph.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
-
-#include <atomic>
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+using namespace rdf;
#ifndef NDEBUG
static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden);
static unsigned CpCount = 0;
#endif
-using namespace llvm;
-using namespace rdf;
+bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::COPY: {
+ const MachineOperand &Dst = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+ RegisterRef DstR = { Dst.getReg(), Dst.getSubReg() };
+ RegisterRef SrcR = { Src.getReg(), Src.getSubReg() };
+ if (TargetRegisterInfo::isVirtualRegister(DstR.Reg)) {
+ if (!TargetRegisterInfo::isVirtualRegister(SrcR.Reg))
+ return false;
+ MachineRegisterInfo &MRI = DFG.getMF().getRegInfo();
+ if (MRI.getRegClass(DstR.Reg) != MRI.getRegClass(SrcR.Reg))
+ return false;
+ } else if (TargetRegisterInfo::isPhysicalRegister(DstR.Reg)) {
+ if (!TargetRegisterInfo::isPhysicalRegister(SrcR.Reg))
+ return false;
+ const TargetRegisterInfo &TRI = DFG.getTRI();
+ if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
+ TRI.getMinimalPhysRegClass(SrcR.Reg))
+ return false;
+ } else {
+ // Copy between some unknown objects.
+ return false;
+ }
+ EM.insert(std::make_pair(DstR, SrcR));
+ return true;
+ }
+ case TargetOpcode::REG_SEQUENCE: {
+ const MachineOperand &Dst = MI->getOperand(0);
+ RegisterRef DefR = { Dst.getReg(), Dst.getSubReg() };
+ SmallVector<TargetInstrInfo::RegSubRegPairAndIdx,2> Inputs;
+ const TargetInstrInfo &TII = DFG.getTII();
+ if (!TII.getRegSequenceInputs(*MI, 0, Inputs))
+ return false;
+ for (auto I : Inputs) {
+ unsigned S = DFG.getTRI().composeSubRegIndices(DefR.Sub, I.SubIdx);
+ RegisterRef DR = { DefR.Reg, S };
+ RegisterRef SR = { I.Reg, I.SubReg };
+ EM.insert(std::make_pair(DR, SR));
+ }
+ return true;
+ }
+ }
+ return false;
+}
-void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI) {
- assert(MI->getOpcode() == TargetOpcode::COPY);
- const MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1);
- RegisterRef DstR = { Op0.getReg(), Op0.getSubReg() };
- RegisterRef SrcR = { Op1.getReg(), Op1.getSubReg() };
- auto FS = DefM.find(SrcR);
- if (FS == DefM.end() || FS->second.empty())
- return;
+
+void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
+ CopyMap.insert(std::make_pair(SA.Id, EM));
Copies.push_back(SA.Id);
- RDefMap[SrcR][SA.Id] = FS->second.top()->Id;
- // Insert DstR into the map.
- RDefMap[DstR];
+
+ for (auto I : EM) {
+ auto FS = DefM.find(I.second);
+ if (FS == DefM.end() || FS->second.empty())
+ continue; // Undefined source
+ RDefMap[I.second][SA.Id] = FS->second.top()->Id;
+ // Insert DstR into the map.
+ RDefMap[I.first];
+ }
}
@@ -74,9 +122,9 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
NodeAddr<StmtNode*> SA = IA;
- MachineInstr *MI = SA.Addr->getCode();
- if (MI->isCopy())
- recordCopy(SA, MI);
+ EqualityMap EM;
+ if (interpretAsCopy(SA.Addr->getCode(), EM))
+ recordCopy(SA, EM);
}
updateMap(IA);
@@ -97,8 +145,14 @@ bool CopyPropagation::run() {
if (trace()) {
dbgs() << "Copies:\n";
- for (auto I : Copies)
- dbgs() << *DFG.addr<StmtNode*>(I).Addr->getCode();
+ for (auto I : Copies) {
+ dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
+ dbgs() << " eq: {";
+ for (auto J : CopyMap[I])
+ dbgs() << ' ' << Print<RegisterRef>(J.first, DFG) << '='
+ << Print<RegisterRef>(J.second, DFG);
+ dbgs() << " }\n";
+ }
dbgs() << "\nRDef map:\n";
for (auto R : RDefMap) {
dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
@@ -110,70 +164,87 @@ bool CopyPropagation::run() {
}
bool Changed = false;
- NodeSet Deleted;
#ifndef NDEBUG
bool HasLimit = CpLimit.getNumOccurrences() > 0;
#endif
- for (auto I : Copies) {
+ for (auto C : Copies) {
#ifndef NDEBUG
if (HasLimit && CpCount >= CpLimit)
break;
#endif
- if (Deleted.count(I))
- continue;
- auto SA = DFG.addr<InstrNode*>(I);
- NodeList Ds = SA.Addr->members_if(DFG.IsDef, DFG);
- if (Ds.size() != 1)
- continue;
- NodeAddr<DefNode*> DA = Ds[0];
- RegisterRef DR0 = DA.Addr->getRegRef();
- NodeList Us = SA.Addr->members_if(DFG.IsUse, DFG);
- if (Us.size() != 1)
+ auto SA = DFG.addr<InstrNode*>(C);
+ auto FS = CopyMap.find(SA.Id);
+ if (FS == CopyMap.end())
continue;
- NodeAddr<UseNode*> UA0 = Us[0];
- RegisterRef UR0 = UA0.Addr->getRegRef();
- NodeId RD0 = UA0.Addr->getReachingDef();
-
- for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
- auto UA = DFG.addr<UseNode*>(N);
- NextN = UA.Addr->getSibling();
- uint16_t F = UA.Addr->getFlags();
- if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed))
- continue;
- if (UA.Addr->getRegRef() != DR0)
- continue;
- NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
- assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
- MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
- if (RDefMap[UR0][IA.Id] != RD0)
+
+ EqualityMap &EM = FS->second;
+ for (NodeAddr<DefNode*> DA : SA.Addr->members_if(DFG.IsDef, DFG)) {
+ RegisterRef DR = DA.Addr->getRegRef();
+ auto FR = EM.find(DR);
+ if (FR == EM.end())
continue;
- MachineOperand &Op = UA.Addr->getOp();
- if (Op.isTied())
+ RegisterRef SR = FR->second;
+ if (DR == SR)
continue;
- if (trace()) {
- dbgs() << "can replace " << Print<RegisterRef>(DR0, DFG)
- << " with " << Print<RegisterRef>(UR0, DFG) << " in "
- << *NodeAddr<StmtNode*>(IA).Addr->getCode();
- }
-
- Op.setReg(UR0.Reg);
- Op.setSubReg(UR0.Sub);
- Changed = true;
-#ifndef NDEBUG
- if (HasLimit && CpCount >= CpLimit)
- break;
- CpCount++;
-#endif
- if (MI->isCopy()) {
- MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1);
- if (Op0.getReg() == Op1.getReg() && Op0.getSubReg() == Op1.getSubReg())
- MI->eraseFromParent();
- Deleted.insert(IA.Id);
- }
- }
- }
+ auto &RDefSR = RDefMap[SR];
+ NodeId RDefSR_SA = RDefSR[SA.Id];
+
+ for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
+ auto UA = DFG.addr<UseNode*>(N);
+ NextN = UA.Addr->getSibling();
+ uint16_t F = UA.Addr->getFlags();
+ if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed))
+ continue;
+ if (UA.Addr->getRegRef() != DR)
+ continue;
+
+ NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
+ assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
+ if (RDefSR[IA.Id] != RDefSR_SA)
+ continue;
+
+ MachineOperand &Op = UA.Addr->getOp();
+ if (Op.isTied())
+ continue;
+ if (trace()) {
+ dbgs() << "Can replace " << Print<RegisterRef>(DR, DFG)
+ << " with " << Print<RegisterRef>(SR, DFG) << " in "
+ << *NodeAddr<StmtNode*>(IA).Addr->getCode();
+ }
+
+ Op.setReg(SR.Reg);
+ Op.setSubReg(SR.Sub);
+ DFG.unlinkUse(UA, false);
+ if (RDefSR_SA != 0) {
+ UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+ } else {
+ UA.Addr->setReachingDef(0);
+ UA.Addr->setSibling(0);
+ }
+
+ Changed = true;
+ #ifndef NDEBUG
+ if (HasLimit && CpCount >= CpLimit)
+ break;
+ CpCount++;
+ #endif
+
+ auto FC = CopyMap.find(IA.Id);
+ if (FC != CopyMap.end()) {
+ // Update the EM map in the copy's entry.
+ auto &M = FC->second;
+ for (auto &J : M) {
+ if (J.second != DR)
+ continue;
+ J.second = SR;
+ break;
+ }
+ }
+ } // for (N in reached-uses)
+ } // for (DA in defs)
+ } // for (C in Copies)
return Changed;
}
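
A greatly simplified sketch of the generalization in RDFCopy above (straight-line code only, no RDF graph or reaching-def checks): each copy-like statement is interpreted as an equality map from destination to source, later uses of the destination are rewritten to the source, and any redefinition invalidates the equalities that mention it.

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct Stmt {
      std::string Def;                       // register defined ("" if none)
      std::vector<std::string> Uses;         // registers read
      std::map<std::string, std::string> EM; // equalities this stmt establishes
    };

    int main() {
      std::vector<Stmt> Code = {
          {"b", {"a"}, {{"b", "a"}}}, // b = a   (a copy)
          {"c", {"b"}, {}},           // c = f(b)
      };
      std::map<std::string, std::string> Known; // dst -> src, still valid
      for (auto &S : Code) {
        for (auto &U : S.Uses) {
          auto It = Known.find(U);
          if (It != Known.end()) {
            std::printf("replace %s with %s\n", U.c_str(), It->second.c_str());
            U = It->second;
          }
        }
        // A new definition invalidates every equality that mentions the register.
        for (auto It = Known.begin(); It != Known.end();)
          It = (It->first == S.Def || It->second == S.Def) ? Known.erase(It) : ++It;
        for (auto &E : S.EM)
          Known[E.first] = E.second;
      }
    }
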
diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h
index 02531b94c9b0..e8a576cf57a3 100644
--- a/lib/Target/Hexagon/RDFCopy.h
+++ b/lib/Target/Hexagon/RDFCopy.h
@@ -18,17 +18,20 @@ namespace llvm {
class MachineBasicBlock;
class MachineDominatorTree;
class MachineInstr;
-}
namespace rdf {
struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
Trace(false) {}
+ virtual ~CopyPropagation() {}
bool run();
void trace(bool On) { Trace = On; }
bool trace() const { return Trace; }
+ typedef std::map<RegisterRef, RegisterRef> EqualityMap;
+ virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM);
+
private:
const MachineDominatorTree &MDT;
DataFlowGraph &DFG;
@@ -37,12 +40,15 @@ namespace rdf {
// map: register -> (map: stmt -> reaching def)
std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
+ // map: statement -> (map: dst reg -> src reg)
+ std::map<NodeId, EqualityMap> CopyMap;
std::vector<NodeId> Copies;
- void recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI);
+ void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
void updateMap(NodeAddr<InstrNode*> IA);
bool scanBlock(MachineBasicBlock *B);
};
-}
+} // namespace rdf
+} // namespace llvm
#endif
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 95668577bd50..63177d51cada 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -18,9 +18,38 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include <queue>
+
using namespace llvm;
using namespace rdf;
+// This drastically improves execution time in "collect" over using
+// SetVector as a work queue, and popping the first element from it.
+template<typename T> struct DeadCodeElimination::SetQueue {
+ SetQueue() : Set(), Queue() {}
+
+ bool empty() const {
+ return Queue.empty();
+ }
+ T pop_front() {
+ T V = Queue.front();
+ Queue.pop();
+ Set.erase(V);
+ return V;
+ }
+ void push_back(T V) {
+ if (Set.count(V))
+ return;
+ Queue.push(V);
+ Set.insert(V);
+ }
+
+private:
+ DenseSet<T> Set;
+ std::queue<T> Queue;
+};
+
+
// Check if the given instruction has observable side-effects, i.e. if
// it should be considered "live". It is safe for this function to be
// overly conservative (i.e. return "true" for all instructions), but it
@@ -40,33 +69,33 @@ bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
}
void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA,
- SetVector<NodeId> &WorkQ) {
+ SetQueue<NodeId> &WorkQ) {
if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
return;
if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
return;
for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) {
if (!LiveNodes.count(RA.Id))
- WorkQ.insert(RA.Id);
+ WorkQ.push_back(RA.Id);
}
}
void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA,
- SetVector<NodeId> &WorkQ) {
+ SetQueue<NodeId> &WorkQ) {
NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
if (!LiveNodes.count(UA.Id))
- WorkQ.insert(UA.Id);
+ WorkQ.push_back(UA.Id);
}
for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA))
LiveNodes.insert(TA.Id);
}
void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA,
- SetVector<NodeId> &WorkQ) {
+ SetQueue<NodeId> &WorkQ) {
for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) {
if (!LiveNodes.count(DA.Id))
- WorkQ.insert(DA.Id);
+ WorkQ.push_back(DA.Id);
}
}
@@ -84,14 +113,13 @@ bool DeadCodeElimination::collect() {
// instruction are considered live. For each live use, all its reaching
// defs are considered live.
LiveNodes.clear();
- SetVector<NodeId> WorkQ;
+ SetQueue<NodeId> WorkQ;
for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG))
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG))
scanInstr(IA, WorkQ);
while (!WorkQ.empty()) {
- NodeId N = *WorkQ.begin();
- WorkQ.remove(N);
+ NodeId N = WorkQ.pop_front();
LiveNodes.insert(N);
auto RA = DFG.addr<RefNode*>(N);
if (DFG.IsDef(RA))
@@ -183,9 +211,9 @@ bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
if (trace())
dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n';
if (DFG.IsUse(RA))
- DFG.unlinkUse(RA);
+ DFG.unlinkUse(RA, true);
else if (DFG.IsDef(RA))
- DFG.unlinkDef(RA);
+ DFG.unlinkDef(RA, true);
}
// Now, remove all dead instruction nodes.
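
A standalone sketch of the SetQueue added above: a FIFO work list that ignores re-insertion of elements already queued, which is what makes the collect() walk cheap compared to popping the front of a SetVector.

    #include <cstdio>
    #include <queue>
    #include <set>

    template <typename T> class SetQueue {
    public:
      bool empty() const { return Q.empty(); }
      T pop_front() {
        T V = Q.front();
        Q.pop();
        S.erase(V);
        return V;
      }
      void push_back(T V) {
        if (S.insert(V).second) // queue each pending element only once
          Q.push(V);
      }

    private:
      std::set<T> S;
      std::queue<T> Q;
    };

    int main() {
      SetQueue<int> WorkQ;
      for (int N : {1, 2, 1, 3, 2})
        WorkQ.push_back(N);
      while (!WorkQ.empty())
        std::printf("%d\n", WorkQ.pop_front()); // prints 1 2 3
    }
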
diff --git a/lib/Target/Hexagon/RDFDeadCode.h b/lib/Target/Hexagon/RDFDeadCode.h
index f4373fb5007d..8977e730b855 100644
--- a/lib/Target/Hexagon/RDFDeadCode.h
+++ b/lib/Target/Hexagon/RDFDeadCode.h
@@ -30,7 +30,6 @@
namespace llvm {
class MachineRegisterInfo;
-}
namespace rdf {
struct DeadCodeElimination {
@@ -55,11 +54,14 @@ namespace rdf {
MachineRegisterInfo &MRI;
Liveness LV;
+ template<typename T> struct SetQueue;
+
bool isLiveInstr(const MachineInstr *MI) const;
- void scanInstr(NodeAddr<InstrNode*> IA, SetVector<NodeId> &WorkQ);
- void processDef(NodeAddr<DefNode*> DA, SetVector<NodeId> &WorkQ);
- void processUse(NodeAddr<UseNode*> UA, SetVector<NodeId> &WorkQ);
+ void scanInstr(NodeAddr<InstrNode*> IA, SetQueue<NodeId> &WorkQ);
+ void processDef(NodeAddr<DefNode*> DA, SetQueue<NodeId> &WorkQ);
+ void processUse(NodeAddr<UseNode*> UA, SetQueue<NodeId> &WorkQ);
};
-}
+} // namespace rdf
+} // namespace llvm
#endif
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index 9b47422153bb..273d6b7cb0c8 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -25,6 +25,7 @@ using namespace rdf;
// Printing functions. Have them here first, so that the rest of the code
// can use them.
+namespace llvm {
namespace rdf {
template<>
@@ -298,6 +299,7 @@ raw_ostream &operator<< (raw_ostream &OS,
}
} // namespace rdf
+} // namespace llvm
// Node allocation functions.
//
@@ -315,7 +317,7 @@ void NodeAllocator::startNewBlock() {
// Check if the block index is still within the allowed range, i.e. less
// than 2^N, where N is the number of bits in NodeId for the block index.
// BitsPerIndex is the number of bits per node index.
- assert((Blocks.size() < (1U << (8*sizeof(NodeId)-BitsPerIndex))) &&
+ assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) &&
"Out of bits for block index");
ActiveEnd = P;
}
@@ -674,7 +676,7 @@ bool RegisterAliasInfo::alias(RegisterRef RA, RegisterRef RB) const {
// unchanged across this def.
bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
const {
- return TII.isPredicated(&In);
+ return TII.isPredicated(In);
}
// Check if the definition of RR produces an unspecified value.
@@ -686,11 +688,17 @@ bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
return false;
}
-// Check if the given instruction specifically requires
+// Check if the given instruction specifically requires
bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
const {
- if (In.isCall() || In.isReturn())
+ if (In.isCall() || In.isReturn() || In.isInlineAsm())
return true;
+ // Check for a tail call.
+ if (In.isBranch())
+ for (auto &O : In.operands())
+ if (O.isGlobal() || O.isSymbol())
+ return true;
+
const MCInstrDesc &D = In.getDesc();
if (!D.getImplicitDefs() && !D.getImplicitUses())
return false;
@@ -919,7 +927,7 @@ NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) {
}
// Build the data flow graph.
-void DataFlowGraph::build() {
+void DataFlowGraph::build(unsigned Options) {
reset();
Func = newFunc(&MF);
@@ -964,7 +972,8 @@ void DataFlowGraph::build() {
linkBlockRefs(DM, EA);
// Finally, remove all unused phi nodes.
- removeUnusedPhis();
+ if (!(Options & BuildOptions::KeepDeadPhis))
+ removeUnusedPhis();
}
// For each stack in the map DefM, push the delimiter for block B on it.
@@ -1167,6 +1176,17 @@ NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
auto SA = newStmt(BA, &In);
+ auto isCall = [] (const MachineInstr &In) -> bool {
+ if (In.isCall())
+ return true;
+ // Is tail call?
+ if (In.isBranch())
+ for (auto &Op : In.operands())
+ if (Op.isGlobal() || Op.isSymbol())
+ return true;
+ return false;
+ };
+
// Collect a set of registers that this instruction implicitly uses
// or defines. Implicit operands from an instruction will be ignored
// unless they are listed here.
@@ -1178,8 +1198,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
while (uint16_t R = *ImpU++)
ImpUses.insert({R, 0});
- bool IsCall = In.isCall(), IsReturn = In.isReturn();
- bool IsPredicated = TII.isPredicated(&In);
+ bool NeedsImplicit = isCall(In) || In.isInlineAsm() || In.isReturn();
+ bool IsPredicated = TII.isPredicated(In);
unsigned NumOps = In.getNumOperands();
// Avoid duplicate implicit defs. This will not detect cases of implicit
@@ -1212,7 +1232,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
continue;
RegisterRef RR = { Op.getReg(), Op.getSubReg() };
- if (!IsCall && !ImpDefs.count(RR))
+ if (!NeedsImplicit && !ImpDefs.count(RR))
continue;
if (DoneDefs.count(RR))
continue;
@@ -1237,7 +1257,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
// instructions regardless of whether or not they appear in the instruction
// descriptor's list.
bool Implicit = Op.isImplicit();
- bool TakeImplicit = IsReturn || IsCall || IsPredicated;
+ bool TakeImplicit = NeedsImplicit || IsPredicated;
if (Implicit && !TakeImplicit && !ImpUses.count(RR))
continue;
uint16_t Flags = NodeAttrs::None;
@@ -1456,9 +1476,9 @@ void DataFlowGraph::removeUnusedPhis() {
PhiQ.insert(OA.Id);
}
if (RA.Addr->isDef())
- unlinkDef(RA);
+ unlinkDef(RA, true);
else
- unlinkUse(RA);
+ unlinkUse(RA, true);
}
NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this);
BA.Addr->removeMember(PA, *this);
@@ -1546,6 +1566,7 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
// Push block delimiters.
markBlock(BA.Id, DefM);
+ assert(BA.Addr && "block node address is needed to create a data-flow link");
// For each non-phi instruction in the block, link all the defs and uses
// to their reaching defs. For any member of the block (including phis),
// push the defs on the corresponding stacks.
@@ -1593,13 +1614,10 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
}
// Remove the use node UA from any data-flow and structural links.
-void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) {
+void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) {
NodeId RD = UA.Addr->getReachingDef();
NodeId Sib = UA.Addr->getSibling();
- NodeAddr<InstrNode*> IA = UA.Addr->getOwner(*this);
- IA.Addr->removeMember(UA, *this);
-
if (RD == 0) {
assert(Sib == 0);
return;
@@ -1623,7 +1641,7 @@ void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) {
}
// Remove the def node DA from any data-flow and structural links.
-void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) {
+void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) {
//
// RD
// | reached
@@ -1710,7 +1728,4 @@ void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) {
Last.Addr->setSibling(RDA.Addr->getReachedUse());
RDA.Addr->setReachedUse(ReachedUses.front().Id);
}
-
- NodeAddr<InstrNode*> IA = DA.Addr->getOwner(*this);
- IA.Addr->removeMember(DA, *this);
}
diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h
index 7da7bb5973cf..49b053741263 100644
--- a/lib/Target/Hexagon/RDFGraph.h
+++ b/lib/Target/Hexagon/RDFGraph.h
@@ -202,7 +202,6 @@
#ifndef RDF_GRAPH_H
#define RDF_GRAPH_H
-#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -213,8 +212,6 @@
#include <set>
#include <vector>
-using namespace llvm;
-
namespace llvm {
class MachineBasicBlock;
class MachineFunction;
@@ -224,7 +221,6 @@ namespace llvm {
class MachineDominatorTree;
class TargetInstrInfo;
class TargetRegisterInfo;
-}
namespace rdf {
typedef uint32_t NodeId;
@@ -288,6 +284,13 @@ namespace rdf {
}
};
+ struct BuildOptions {
+ enum : unsigned {
+ None = 0x00,
+ KeepDeadPhis = 0x01, // Do not remove dead phis during build.
+ };
+ };
+
template <typename T> struct NodeAddr {
NodeAddr() : Addr(nullptr), Id(0) {}
NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
@@ -678,7 +681,7 @@ namespace rdf {
typedef std::map<RegisterRef,DefStack> DefStackMap;
- void build();
+ void build(unsigned Options = BuildOptions::None);
void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
void markBlock(NodeId B, DefStackMap &DefM);
void releaseBlock(NodeId B, DefStackMap &DefM);
@@ -697,8 +700,16 @@ namespace rdf {
NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
NodeAddr<RefNode*> RA) const;
- void unlinkUse(NodeAddr<UseNode*> UA);
- void unlinkDef(NodeAddr<DefNode*> DA);
+ void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
+ unlinkUseDF(UA);
+ if (RemoveFromOwner)
+ removeFromOwner(UA);
+ }
+ void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
+ unlinkDefDF(DA);
+ if (RemoveFromOwner)
+ removeFromOwner(DA);
+ }
// Some useful filters.
template <uint16_t Kind>
@@ -765,6 +776,13 @@ namespace rdf {
void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
+ void unlinkUseDF(NodeAddr<UseNode*> UA);
+ void unlinkDefDF(NodeAddr<DefNode*> DA);
+ void removeFromOwner(NodeAddr<RefNode*> RA) {
+ NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
+ IA.Addr->removeMember(RA, *this);
+ }
+
TimerGroup TimeG;
NodeAddr<FuncNode*> Func;
NodeAllocator Memory;
@@ -837,5 +855,6 @@ namespace rdf {
: Print<NodeAddr<T>>(x, g) {}
};
} // namespace rdf
+} // namespace llvm
#endif // RDF_GRAPH_H
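An illustrative sketch of the reworked RDF interface above (not part of the diff): build() now takes a BuildOptions flag, and unlinkUse()/unlinkDef() split the data-flow unlink (unlinkUseDF/unlinkDefDF) from the structural removal requested via RemoveFromOwner. The helper names and the include path are assumptions for illustration only.

  // Hypothetical callers; only the DataFlowGraph members they invoke come
  // from the declarations shown in this patch.
  #include "RDFGraph.h"   // assumed include path inside lib/Target/Hexagon

  using namespace llvm;
  using namespace llvm::rdf;

  static void buildKeepingDeadPhis(DataFlowGraph &G) {
    // Request that dead phis be preserved during the build.
    G.build(BuildOptions::KeepDeadPhis);
  }

  static void eraseDeadUse(DataFlowGraph &G, NodeAddr<UseNode*> UA) {
    // Detach UA from its reaching-def chain (unlinkUseDF) and also remove
    // it from the owning instruction via removeFromOwner().
    G.unlinkUse(UA, /*RemoveFromOwner=*/true);
  }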
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index 1d9bd372ff4e..641f01423176 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -36,6 +36,7 @@
using namespace llvm;
using namespace rdf;
+namespace llvm {
namespace rdf {
template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
@@ -52,7 +53,8 @@ namespace rdf {
OS << " }";
return OS;
}
-}
+} // namespace rdf
+} // namespace llvm
// The order in the returned sequence is the order of reaching defs in the
// upward traversal: the first def is the closest to the given reference RefA,
@@ -235,7 +237,93 @@ NodeList Liveness::getAllReachingDefs(NodeAddr<RefNode*> RefA) {
}
+NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+ // Collect all defined registers. Do not consider phis to be defining
+ // anything, only collect "real" definitions.
+ RegisterSet DefRRs;
+ for (const auto D : Defs) {
+ const auto DA = DFG.addr<const DefNode*>(D);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ DefRRs.insert(DA.Addr->getRegRef());
+ }
+
+ auto RDs = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+ if (RDs.empty())
+ return Defs;
+
+ // Make a copy of the preexisting definitions and add the newly found ones.
+ NodeSet TmpDefs = Defs;
+ for (auto R : RDs)
+ TmpDefs.insert(R.Id);
+
+ NodeSet Result = Defs;
+
+ for (NodeAddr<DefNode*> DA : RDs) {
+ Result.insert(DA.Id);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ continue;
+ NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG);
+ if (Visited.count(PA.Id))
+ continue;
+ Visited.insert(PA.Id);
+ // Go over all phi uses and get the reaching defs for each use.
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ const auto &T = getAllReachingDefsRec(RefRR, U, Visited, TmpDefs);
+ Result.insert(T.begin(), T.end());
+ }
+ }
+
+ return Result;
+}
+
+
+NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
+ NodeAddr<DefNode*> DefA, const RegisterSet &DefRRs) {
+ NodeSet Uses;
+
+ // If the original register is already covered by all the intervening
+ // defs, no more uses can be reached.
+ if (RAI.covers(DefRRs, RefRR))
+ return Uses;
+
+ // Add all directly reached uses.
+ NodeId U = DefA.Addr->getReachedUse();
+ while (U != 0) {
+ auto UA = DFG.addr<UseNode*>(U);
+ auto UR = UA.Addr->getRegRef();
+ if (RAI.alias(RefRR, UR) && !RAI.covers(DefRRs, UR))
+ Uses.insert(U);
+ U = UA.Addr->getSibling();
+ }
+
+ // Traverse all reached defs.
+ for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) {
+ auto DA = DFG.addr<DefNode*>(D);
+ NextD = DA.Addr->getSibling();
+ auto DR = DA.Addr->getRegRef();
+ // If this def is already covered, it cannot reach anything new.
+ // Similarly, skip it if it is not aliased to the interesting register.
+ if (RAI.covers(DefRRs, DR) || !RAI.alias(RefRR, DR))
+ continue;
+ NodeSet T;
+ if (DA.Addr->getFlags() & NodeAttrs::Preserving) {
+ // If it is a preserving def, do not update the set of intervening defs.
+ T = getAllReachedUses(RefRR, DA, DefRRs);
+ } else {
+ RegisterSet NewDefRRs = DefRRs;
+ NewDefRRs.insert(DR);
+ T = getAllReachedUses(RefRR, DA, NewDefRRs);
+ }
+ Uses.insert(T.begin(), T.end());
+ }
+ return Uses;
+}
+
+
void Liveness::computePhiInfo() {
+ RealUseMap.clear();
+
NodeList Phis;
NodeAddr<FuncNode*> FA = DFG.getFunc();
auto Blocks = FA.Addr->members(DFG);
@@ -601,7 +689,11 @@ void Liveness::resetKills(MachineBasicBlock *B) {
MI->clearKillInfo();
for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isDef())
+ // An implicit def of a super-register may not necessarily start a
+ // live range of it, since an implicit use could be used to keep parts
+ // of it live. Instead of analyzing the implicit operands, ignore
+ // implicit defs.
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
continue;
unsigned R = Op.getReg();
if (!TargetRegisterInfo::isPhysicalRegister(R))
@@ -616,8 +708,8 @@ void Liveness::resetKills(MachineBasicBlock *B) {
if (!TargetRegisterInfo::isPhysicalRegister(R))
continue;
bool IsLive = false;
- for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) {
- if (!Live[*SR])
+ for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
+ if (!Live[*AR])
continue;
IsLive = true;
break;
diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h
index 4c1e8f3ee838..2b49c7488ce3 100644
--- a/lib/Target/Hexagon/RDFLiveness.h
+++ b/lib/Target/Hexagon/RDFLiveness.h
@@ -26,7 +26,6 @@ namespace llvm {
class TargetRegisterInfo;
class MachineDominatorTree;
class MachineDominanceFrontier;
-}
namespace rdf {
struct Liveness {
@@ -41,6 +40,10 @@ namespace rdf {
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
bool FullChain = false, const RegisterSet &DefRRs = RegisterSet());
NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA);
+ NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs);
+ NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
+ const RegisterSet &DefRRs = RegisterSet());
LiveMapType &getLiveMap() { return LiveMap; }
const LiveMapType &getLiveMap() const { return LiveMap; }
@@ -101,6 +104,7 @@ namespace rdf {
void traverse(MachineBasicBlock *B, RefMap &LiveIn);
void emptify(RefMap &M);
};
-}
+} // namespace rdf
+} // namespace llvm
#endif // RDF_LIVENESS_H
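A standalone sketch of the cycle-safe recursion used by getAllReachingDefsRec above (not part of the diff): a Visited set stops the walk when a phi is revisited, while the union of everything reached is accumulated. The plain-graph types below are assumptions for illustration only.

  #include <set>
  #include <vector>

  using NodeId = unsigned;

  // Collect N and everything reachable through Preds, guarding against
  // cycles with a Visited set, just as already-visited phis are skipped above.
  static void collectReachable(NodeId N,
                               const std::vector<std::vector<NodeId>> &Preds,
                               std::set<NodeId> &Visited,
                               std::set<NodeId> &Result) {
    if (!Visited.insert(N).second)
      return;
    Result.insert(N);
    for (NodeId P : Preds[N])
      collectReachable(P, Preds, Visited, Result);
  }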
diff --git a/lib/Target/Hexagon/TargetInfo/Makefile b/lib/Target/Hexagon/TargetInfo/Makefile
deleted file mode 100644
index 494cca112249..000000000000
--- a/lib/Target/Hexagon/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Hexagon/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMHexagonInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index eb794ebc7216..43621629dd25 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -24,7 +24,7 @@ subdirectories =
AArch64
AVR
BPF
- CppBackend
+ Lanai
Hexagon
MSP430
NVPTX
diff --git a/lib/Target/Lanai/AsmParser/CMakeLists.txt b/lib/Target/Lanai/AsmParser/CMakeLists.txt
new file mode 100644
index 000000000000..3c88192ea6f0
--- /dev/null
+++ b/lib/Target/Lanai/AsmParser/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMLanaiAsmParser
+ LanaiAsmParser.cpp
+ )
+
+add_dependencies( LLVMLanaiAsmParser LanaiCommonTableGen )
diff --git a/lib/Target/Lanai/AsmParser/LLVMBuild.txt b/lib/Target/Lanai/AsmParser/LLVMBuild.txt
new file mode 100644
index 000000000000..08cf4033087d
--- /dev/null
+++ b/lib/Target/Lanai/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Lanai/AsmParser/LLVMBuild.txt ----------------*- Conf -*-===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LanaiAsmParser
+parent = Lanai
+required_libraries = MC MCParser Support LanaiMCTargetDesc LanaiInfo
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
new file mode 100644
index 000000000000..cbb96d8b05b2
--- /dev/null
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -0,0 +1,1213 @@
+//===-- LanaiAsmParser.cpp - Parse Lanai assembly to MCInst instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+namespace {
+struct LanaiOperand;
+
+class LanaiAsmParser : public MCTargetAsmParser {
+ // Parse operands
+ std::unique_ptr<LanaiOperand> parseRegister();
+
+ std::unique_ptr<LanaiOperand> parseImmediate();
+
+ std::unique_ptr<LanaiOperand> parseIdentifier();
+
+ unsigned parseAluOperator(bool PreOp, bool PostOp);
+
+ // Split the mnemonic stripping conditional code and quantifiers
+ StringRef splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands);
+
+ bool parsePrePost(StringRef Type, int *OffsetValue);
+
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool ParseRegister(unsigned &RegNum, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+ bool MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+
+// Auto-generated instruction matching functions
+#define GET_ASSEMBLER_HEADER
+#include "LanaiGenAsmMatcher.inc"
+
+ OperandMatchResultTy parseOperand(OperandVector *Operands,
+ StringRef Mnemonic);
+
+ OperandMatchResultTy parseMemoryOperand(OperandVector &Operands);
+
+public:
+ LanaiAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI), Parser(Parser),
+ Lexer(Parser.getLexer()), SubtargetInfo(STI) {
+ setAvailableFeatures(
+ ComputeAvailableFeatures(SubtargetInfo.getFeatureBits()));
+ }
+
+private:
+ MCAsmParser &Parser;
+ MCAsmLexer &Lexer;
+
+ const MCSubtargetInfo &SubtargetInfo;
+};
+
+// Auto-generated by TableGen
+static unsigned MatchRegisterName(llvm::StringRef Name);
+
+// LanaiOperand - Instances of this class represent a parsed machine
+// instruction.
+struct LanaiOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ TOKEN,
+ REGISTER,
+ IMMEDIATE,
+ MEMORY_IMM,
+ MEMORY_REG_IMM,
+ MEMORY_REG_REG,
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ };
+
+ struct ImmOp {
+ const MCExpr *Value;
+ };
+
+ struct MemOp {
+ unsigned BaseReg;
+ unsigned OffsetReg;
+ unsigned AluOp;
+ const MCExpr *Offset;
+ };
+
+ union {
+ struct Token Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+
+ explicit LanaiOperand(KindTy Kind) : MCParsedAsmOperand(), Kind(Kind) {}
+
+public:
+ // The functions below are used by the autogenerated ASM matcher and hence
+ // must be of the expected form.
+
+ // getStartLoc - Gets location of the first token of this operand
+ SMLoc getStartLoc() const override { return StartLoc; }
+
+ // getEndLoc - Gets location of the last token of this operand
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ unsigned getReg() const override {
+ assert(isReg() && "Invalid type access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(isImm() && "Invalid type access!");
+ return Imm.Value;
+ }
+
+ StringRef getToken() const {
+ assert(isToken() && "Invalid type access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getMemBaseReg() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.BaseReg;
+ }
+
+ unsigned getMemOffsetReg() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.OffsetReg;
+ }
+
+ const MCExpr *getMemOffset() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.Offset;
+ }
+
+ unsigned getMemOp() const {
+ assert(isMem() && "Invalid type access!");
+ return Mem.AluOp;
+ }
+
+ // Functions for testing operand type
+ bool isReg() const override { return Kind == REGISTER; }
+
+ bool isImm() const override { return Kind == IMMEDIATE; }
+
+ bool isMem() const override {
+ return isMemImm() || isMemRegImm() || isMemRegReg();
+ }
+
+ bool isMemImm() const { return Kind == MEMORY_IMM; }
+
+ bool isMemRegImm() const { return Kind == MEMORY_REG_IMM; }
+
+ bool isMemRegReg() const { return Kind == MEMORY_REG_REG; }
+
+ bool isMemSpls() const { return isMemRegImm() || isMemRegReg(); }
+
+ bool isToken() const override { return Kind == TOKEN; }
+
+ bool isBrImm() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!MCE)
+ return true;
+ int64_t Value = MCE->getValue();
+ // Check if value fits in 25 bits with 2 least significant bits 0.
+ return isShiftedUInt<23, 2>(static_cast<int32_t>(Value));
+ }
+
+ bool isBrTarget() { return isBrImm() || isToken(); }
+
+ bool isCallTarget() { return isImm() || isToken(); }
+
+ bool isHiImm16() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ return Value != 0 && isShiftedUInt<16, 16>(Value);
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+ return false;
+ }
+
+ bool isHiImm16And() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (ConstExpr) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if in the form 0xXYZWffff
+ return (Value != 0) && ((Value & ~0xffff0000) == 0xffff);
+ }
+ return false;
+ }
+
+ bool isLoImm16() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if value fits in 16 bits
+ return isUInt<16>(static_cast<int32_t>(Value));
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ return false;
+ }
+
+ bool isLoImm16Signed() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if the value fits in 16 bits or is of the form 0xffffxyzw
+ return isInt<16>(static_cast<int32_t>(Value));
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ return false;
+ }
+
+ bool isLoImm16And() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (ConstExpr) {
+ int64_t Value = ConstExpr->getValue();
+ // Check if in the form 0xffffXYZW
+ return ((Value & ~0xffff) == 0xffff0000);
+ }
+ return false;
+ }
+
+ bool isImmShift() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ int64_t Value = ConstExpr->getValue();
+ return (Value >= -31) && (Value <= 31);
+ }
+
+ bool isLoImm21() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<21>(Value);
+ }
+
+ // Symbolic reference expression
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+ if (const MCSymbolRefExpr *SymbolRefExpr =
+ dyn_cast<MCSymbolRefExpr>(Imm.Value)) {
+ return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+ }
+
+ // Binary expression
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value)) {
+ if (const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+ if (const MCSymbolRefExpr *SymbolRefExpr =
+ dyn_cast<MCSymbolRefExpr>(BinaryExpr->getLHS()))
+ return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+ }
+
+ return false;
+ }
+
+ bool isImm10() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ int64_t Value = ConstExpr->getValue();
+ return isInt<10>(Value);
+ }
+
+ bool isCondCode() {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+ if (!ConstExpr)
+ return false;
+ uint64_t Value = ConstExpr->getValue();
+ // The condition codes are between 0 (ICC_T) and 15 (ICC_LE). If the
+ // unsigned value of the immediate is less than LPCC::UNKNOWN (16), then
+ // the value corresponds to a valid condition code.
+ return Value < LPCC::UNKNOWN;
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates where possible. Null MCExpr = 0
+ if (Expr == nullptr)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(
+ MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addBrTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addCallTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMemImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getMemOffset();
+ addExpr(Inst, Expr);
+ }
+
+ void addMemRegImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ const MCExpr *Expr = getMemOffset();
+ addExpr(Inst, Expr);
+ Inst.addOperand(MCOperand::createImm(getMemOp()));
+ }
+
+ void addMemRegRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ assert(getMemOffsetReg() != 0 && "Invalid offset");
+ Inst.addOperand(MCOperand::createReg(getMemOffsetReg()));
+ Inst.addOperand(MCOperand::createImm(getMemOp()));
+ }
+
+ void addMemSplsOperands(MCInst &Inst, unsigned N) const {
+ if (isMemRegImm())
+ addMemRegImmOperands(Inst, N);
+ if (isMemRegReg())
+ addMemRegRegOperands(Inst, N);
+ }
+
+ void addImmShiftOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImm10Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addLoImm16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(
+ MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+ else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+ const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+ assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+ const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+ assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else
+ assert(false && "Operand type not supported.");
+ }
+
+ void addLoImm16AndOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0xffff));
+ else
+ assert(false && "Operand type not supported.");
+ }
+
+ void addHiImm16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
+ else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+ const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+ assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+ const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+ assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else
+ assert(false && "Operand type not supported.");
+ }
+
+ void addHiImm16AndOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
+ else
+ assert(false && "Operand type not supported.");
+ }
+
+ void addLoImm21Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+ Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0x1fffff));
+ else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+ const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+ assert(SymbolRefExpr &&
+ SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else if (isa<MCSymbolRefExpr>(getImm())) {
+#ifndef NDEBUG
+ const MCSymbolRefExpr *SymbolRefExpr =
+ dyn_cast<MCSymbolRefExpr>(getImm());
+ assert(SymbolRefExpr &&
+ SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+ const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+ const LanaiMCExpr *SymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+ assert(SymbolRefExpr &&
+ SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ } else
+ assert(false && "Operand type not supported.");
+ }
+
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case IMMEDIATE:
+ OS << "Imm: " << getImm() << "\n";
+ break;
+ case TOKEN:
+ OS << "Token: " << getToken() << "\n";
+ break;
+ case REGISTER:
+ OS << "Reg: %r" << getReg() << "\n";
+ break;
+ case MEMORY_IMM:
+ OS << "MemImm: " << *getMemOffset() << "\n";
+ break;
+ case MEMORY_REG_IMM:
+ OS << "MemRegImm: " << getMemBaseReg() << "+" << *getMemOffset() << "\n";
+ break;
+ case MEMORY_REG_REG:
+ assert(getMemOffset() == nullptr);
+ OS << "MemRegReg: " << getMemBaseReg() << "+"
+ << "%r" << getMemOffsetReg() << "\n";
+ break;
+ }
+ }
+
+ static std::unique_ptr<LanaiOperand> CreateToken(StringRef Str, SMLoc Start) {
+ auto Op = make_unique<LanaiOperand>(TOKEN);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = Start;
+ Op->EndLoc = Start;
+ return Op;
+ }
+
+ static std::unique_ptr<LanaiOperand> createReg(unsigned RegNum, SMLoc Start,
+ SMLoc End) {
+ auto Op = make_unique<LanaiOperand>(REGISTER);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = Start;
+ Op->EndLoc = End;
+ return Op;
+ }
+
+ static std::unique_ptr<LanaiOperand> createImm(const MCExpr *Value,
+ SMLoc Start, SMLoc End) {
+ auto Op = make_unique<LanaiOperand>(IMMEDIATE);
+ Op->Imm.Value = Value;
+ Op->StartLoc = Start;
+ Op->EndLoc = End;
+ return Op;
+ }
+
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemImm(std::unique_ptr<LanaiOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = MEMORY_IMM;
+ Op->Mem.BaseReg = 0;
+ Op->Mem.AluOp = LPAC::ADD;
+ Op->Mem.OffsetReg = 0;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemRegReg(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+ unsigned AluOp) {
+ unsigned OffsetReg = Op->getReg();
+ Op->Kind = MEMORY_REG_REG;
+ Op->Mem.BaseReg = BaseReg;
+ Op->Mem.AluOp = AluOp;
+ Op->Mem.OffsetReg = OffsetReg;
+ Op->Mem.Offset = nullptr;
+ return Op;
+ }
+
+ static std::unique_ptr<LanaiOperand>
+ MorphToMemRegImm(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+ unsigned AluOp) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = MEMORY_REG_IMM;
+ Op->Mem.BaseReg = BaseReg;
+ Op->Mem.AluOp = AluOp;
+ Op->Mem.OffsetReg = 0;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+};
+
+bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) { return true; }
+
+bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ SMLoc ErrorLoc;
+
+ switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+ case Match_Success:
+ Out.EmitInstruction(Inst, SubtargetInfo);
+ Opcode = Inst.getOpcode();
+ return false;
+ case Match_MissingFeature:
+ return Error(IdLoc, "Instruction use requires option to be enabled");
+ case Match_MnemonicFail:
+ return Error(IdLoc, "Unrecognized instruction mnemonic");
+ case Match_InvalidOperand: {
+ ErrorLoc = IdLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IdLoc, "Too few operands for instruction");
+
+ ErrorLoc = ((LanaiOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IdLoc;
+ }
+ return Error(ErrorLoc, "Invalid operand for instruction");
+ }
+ default:
+ break;
+ }
+
+ llvm_unreachable("Unknown match type detected!");
+}
+
+// Both '%rN' and 'rN' are parsed as valid registers. This was done to remain
+// backwards compatible with GCC and the different ways inline assembly is
+// handled.
+// TODO: see if there isn't a better way to do this.
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseRegister() {
+ SMLoc Start = Parser.getTok().getLoc();
+ SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ unsigned RegNum;
+ // Eat the '%'.
+ if (Lexer.getKind() == AsmToken::Percent)
+ Parser.Lex();
+ if (Lexer.getKind() == AsmToken::Identifier) {
+ RegNum = MatchRegisterName(Lexer.getTok().getIdentifier());
+ if (RegNum == 0)
+ return 0;
+ Parser.Lex(); // Eat identifier token
+ return LanaiOperand::createReg(RegNum, Start, End);
+ }
+ return 0;
+}
+
+bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ const AsmToken &Tok = getParser().getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ std::unique_ptr<LanaiOperand> Op = parseRegister();
+ if (Op != nullptr)
+ RegNum = Op->getReg();
+ return (Op == nullptr);
+}
+
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
+ SMLoc Start = Parser.getTok().getLoc();
+ SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *Res, *RHS = 0;
+ LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None;
+
+ if (Lexer.getKind() != AsmToken::Identifier)
+ return 0;
+
+ StringRef Identifier;
+ if (Parser.parseIdentifier(Identifier))
+ return 0;
+
+ // Check if identifier has a modifier
+ if (Identifier.equals_lower("hi"))
+ Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+ else if (Identifier.equals_lower("lo"))
+ Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+
+ // If the identifier corresponds to a variant then extract the real
+ // identifier.
+ if (Kind != LanaiMCExpr::VK_Lanai_None) {
+ if (Lexer.getKind() != AsmToken::LParen) {
+ Error(Lexer.getLoc(), "Expected '('");
+ return 0;
+ }
+ Lexer.Lex(); // lex '('
+
+ // Parse identifier
+ if (Parser.parseIdentifier(Identifier))
+ return 0;
+ }
+
+ // If this is an addition, parse the RHS.
+ if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS))
+ return 0;
+
+ // For variants parse the final ')'
+ if (Kind != LanaiMCExpr::VK_Lanai_None) {
+ if (Lexer.getKind() != AsmToken::RParen) {
+ Error(Lexer.getLoc(), "Expected ')'");
+ return 0;
+ }
+ Lexer.Lex(); // lex ')'
+ }
+
+ End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
+ Res = LanaiMCExpr::create(Kind, Expr, getContext());
+
+ // Nest if this was an addition
+ if (RHS)
+ Res = MCBinaryExpr::createAdd(Res, RHS, getContext());
+
+ return LanaiOperand::createImm(Res, Start, End);
+}
+
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() {
+ SMLoc Start = Parser.getTok().getLoc();
+ SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ const MCExpr *ExprVal;
+ switch (Lexer.getKind()) {
+ case AsmToken::Identifier:
+ return parseIdentifier();
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ if (!Parser.parseExpression(ExprVal))
+ return LanaiOperand::createImm(ExprVal, Start, End);
+ default:
+ return 0;
+ }
+}
+
+static unsigned AluWithPrePost(unsigned AluCode, bool PreOp, bool PostOp) {
+ if (PreOp)
+ return LPAC::makePreOp(AluCode);
+ if (PostOp)
+ return LPAC::makePostOp(AluCode);
+ return AluCode;
+}
+
+unsigned LanaiAsmParser::parseAluOperator(bool PreOp, bool PostOp) {
+ StringRef IdString;
+ Parser.parseIdentifier(IdString);
+ unsigned AluCode = LPAC::stringToLanaiAluCode(IdString);
+ if (AluCode == LPAC::UNKNOWN) {
+ Error(Parser.getTok().getLoc(), "Can't parse ALU operator");
+ return 0;
+ }
+ return AluCode;
+}
+
+static int SizeForSuffix(StringRef T) {
+ return StringSwitch<int>(T).EndsWith(".h", 2).EndsWith(".b", 1).Default(4);
+}
+
+bool LanaiAsmParser::parsePrePost(StringRef Type, int *OffsetValue) {
+ bool PreOrPost = false;
+ if (Lexer.getKind() == Lexer.peekTok(true).getKind()) {
+ PreOrPost = true;
+ if (Lexer.is(AsmToken::Minus))
+ *OffsetValue = -SizeForSuffix(Type);
+ else if (Lexer.is(AsmToken::Plus))
+ *OffsetValue = SizeForSuffix(Type);
+ else
+ return false;
+
+ // Eat the '-' '-' or '+' '+'
+ Parser.Lex();
+ Parser.Lex();
+ } else if (Lexer.is(AsmToken::Star)) {
+ Parser.Lex(); // Eat the '*'
+ PreOrPost = true;
+ }
+
+ return PreOrPost;
+}
+
+bool shouldBeSls(const LanaiOperand &Op) {
+ // The instruction should be encoded as an SLS if the constant is word
+ // aligned and will fit in 21 bits
+ if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Op.getImm())) {
+ int64_t Value = ConstExpr->getValue();
+ return (Value % 4 == 0) && (Value >= 0) && (Value <= 0x1fffff);
+ }
+ // The instruction should be encoded as an SLS if the operand is a symbolic
+ // reference with no variant.
+ if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Op.getImm()))
+ return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+ // The instruction should be encoded as an SLS if the operand is a binary
+ // expression with the left-hand side being a symbolic reference with no
+ // variant.
+ if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Op.getImm())) {
+ const LanaiMCExpr *LHSSymbolRefExpr =
+ dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+ return (LHSSymbolRefExpr &&
+ LHSSymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+ }
+ return false;
+}
+
+// Matches memory operand. Returns true if error encountered.
+LanaiAsmParser::OperandMatchResultTy
+LanaiAsmParser::parseMemoryOperand(OperandVector &Operands) {
+ // Try to match a memory operand.
+ // The memory operands are of the form:
+ // (1) Register|Immediate|'' '[' '*'? Register '*'? ']' or
+ // ^
+ // (2) '[' '*'? Register '*'? AluOperator Register ']'
+ // ^
+ // (3) '[' '--'|'++' Register '--'|'++' ']'
+ //
+ // (4) '[' Immediate ']' (for SLS)
+
+ // Store the type for use in parsing pre/post increment/decrement operators
+ StringRef Type;
+ if (Operands[0]->isToken())
+ Type = static_cast<LanaiOperand *>(Operands[0].get())->getToken();
+
+ // Use 0 if no offset given
+ int OffsetValue = 0;
+ unsigned BaseReg = 0;
+ unsigned AluOp = LPAC::ADD;
+ bool PostOp = false, PreOp = false;
+
+ // Try to parse the offset
+ std::unique_ptr<LanaiOperand> Op = parseRegister();
+ if (!Op)
+ Op = parseImmediate();
+
+ // Only continue if next token is '['
+ if (Lexer.isNot(AsmToken::LBrac)) {
+ if (!Op)
+ return MatchOperand_NoMatch;
+
+ // The start of this custom parsing overlaps with register/immediate, so
+ // treat this as a successful match of an operand of that type, since the
+ // token stream can't be rewound to let them match separately.
+ Operands.push_back(std::move(Op));
+ return MatchOperand_Success;
+ }
+
+ Parser.Lex(); // Eat the '['.
+ std::unique_ptr<LanaiOperand> Offset = nullptr;
+ if (Op)
+ Offset.swap(Op);
+
+ // Determine if a pre operation
+ PreOp = parsePrePost(Type, &OffsetValue);
+
+ Op = parseRegister();
+ if (!Op) {
+ if (!Offset) {
+ if ((Op = parseImmediate()) && Lexer.is(AsmToken::RBrac)) {
+ Parser.Lex(); // Eat the ']'
+
+ // Memory address operations aligned to word boundary are encoded as
+ // SLS, the rest as RM.
+ if (shouldBeSls(*Op)) {
+ Operands.push_back(LanaiOperand::MorphToMemImm(std::move(Op)));
+ } else {
+ if (!Op->isLoImm16Signed()) {
+ Error(Parser.getTok().getLoc(),
+ "Memory address is not word "
+ "aligned and larger than class RM can handle");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(LanaiOperand::MorphToMemRegImm(
+ Lanai::R0, std::move(Op), LPAC::ADD));
+ }
+ return MatchOperand_Success;
+ }
+ }
+
+ Error(Parser.getTok().getLoc(),
+ "Unknown operand, expected register or immediate");
+ return MatchOperand_ParseFail;
+ }
+ BaseReg = Op->getReg();
+
+ // Determine if a post operation
+ if (!PreOp)
+ PostOp = parsePrePost(Type, &OffsetValue);
+
+ // If ] match form (1) else match form (2)
+ if (Lexer.is(AsmToken::RBrac)) {
+ Parser.Lex(); // Eat the ']'.
+ if (!Offset) {
+ SMLoc Start = Parser.getTok().getLoc();
+ SMLoc End =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCConstantExpr *OffsetConstExpr =
+ MCConstantExpr::create(OffsetValue, getContext());
+ Offset = LanaiOperand::createImm(OffsetConstExpr, Start, End);
+ }
+ } else {
+ if (Offset || OffsetValue != 0) {
+ Error(Parser.getTok().getLoc(), "Expected ']'");
+ return MatchOperand_ParseFail;
+ }
+
+ // Parse operator
+ AluOp = parseAluOperator(PreOp, PostOp);
+
+ // Second form requires offset register
+ Offset = parseRegister();
+ if (!BaseReg || Lexer.isNot(AsmToken::RBrac)) {
+ Error(Parser.getTok().getLoc(), "Expected ']'");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat the ']'.
+ }
+
+ // First form has addition as operator. Add pre- or post-op indicator as
+ // needed.
+ AluOp = AluWithPrePost(AluOp, PreOp, PostOp);
+
+ // Ensure immediate offset is not too large
+ if (Offset->isImm() && !Offset->isLoImm16Signed()) {
+ Error(Parser.getTok().getLoc(),
+ "Memory address is not word "
+ "aligned and larger than class RM can handle");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ Offset->isImm()
+ ? LanaiOperand::MorphToMemRegImm(BaseReg, std::move(Offset), AluOp)
+ : LanaiOperand::MorphToMemRegReg(BaseReg, std::move(Offset), AluOp));
+
+ return MatchOperand_Success;
+}
+
+// Looks at the current token and creates the relevant operand from it,
+// appending it to Operands.
+// Returns MatchOperand_Success if an operand was parsed,
+// MatchOperand_ParseFail otherwise.
+LanaiAsmParser::OperandMatchResultTy
+LanaiAsmParser::parseOperand(OperandVector *Operands, StringRef Mnemonic) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy Result = MatchOperandParserImpl(*Operands, Mnemonic);
+
+ if (Result == MatchOperand_Success)
+ return Result;
+ if (Result == MatchOperand_ParseFail) {
+ Parser.eatToEndOfStatement();
+ return Result;
+ }
+
+ // Attempt to parse token as register
+ std::unique_ptr<LanaiOperand> Op = parseRegister();
+
+ // Attempt to parse token as immediate
+ if (!Op)
+ Op = parseImmediate();
+
+ // If the token could not be parsed then fail
+ if (!Op) {
+ Error(Parser.getTok().getLoc(), "Unknown operand");
+ Parser.eatToEndOfStatement();
+ return MatchOperand_ParseFail;
+ }
+
+ // Push back parsed operand into list of operands
+ Operands->push_back(std::move(Op));
+
+ return MatchOperand_Success;
+}
+
+// Split the mnemonic into ASM operand, conditional code and instruction
+// qualifier (half-word, byte).
+StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands) {
+ size_t Next = Name.find('.');
+
+ StringRef Mnemonic = Name;
+
+ bool IsBRR = false;
+ if (Name.endswith(".r")) {
+ Mnemonic = Name.substr(0, Name.size() - 2);
+ IsBRR = true;
+ }
+
+ // Match b?? and s?? (BR, BRR, and SCC instruction classes).
+ if (Mnemonic[0] == 'b' ||
+ (Mnemonic[0] == 's' && !Mnemonic.startswith("sel") &&
+ !Mnemonic.startswith("st"))) {
+ // Parse instructions with a conditional code. For example, 'bne' is
+ // converted into two operands 'b' and 'ne'.
+ LPCC::CondCode CondCode =
+ LPCC::suffixToLanaiCondCode(Mnemonic.substr(1, Next));
+ if (CondCode != LPCC::UNKNOWN) {
+ Mnemonic = Mnemonic.slice(0, 1);
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ Operands->push_back(LanaiOperand::createImm(
+ MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+ if (IsBRR) {
+ Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+ }
+ return Mnemonic;
+ }
+ }
+
+ // Parse other instructions with condition codes (RR instructions).
+ // We ignore .f here and assume they are flag-setting operations, not
+ // conditional codes (except for select instructions where flag-setting
+ // variants are not yet implemented).
+ if (Mnemonic.startswith("sel") ||
+ (!Mnemonic.endswith(".f") && !Mnemonic.startswith("st"))) {
+ LPCC::CondCode CondCode = LPCC::suffixToLanaiCondCode(Mnemonic);
+ if (CondCode != LPCC::UNKNOWN) {
+ size_t Next = Mnemonic.rfind('.', Name.size());
+ // 'sel' doesn't use a predicate operand whose printer adds the period,
+ // but instead has the period as part of the identifier (i.e., 'sel.' is
+ // expected by the generated matcher). If the mnemonic starts with 'sel',
+ // keep the period as part of the mnemonic; otherwise drop it.
+ if (Mnemonic.startswith("sel")) {
+ Mnemonic = Mnemonic.substr(0, Next + 1);
+ } else {
+ Mnemonic = Mnemonic.substr(0, Next);
+ }
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ Operands->push_back(LanaiOperand::createImm(
+ MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+ return Mnemonic;
+ }
+ }
+
+ Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+ if (IsBRR) {
+ Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+ }
+
+ return Mnemonic;
+}
+
+bool IsMemoryAssignmentError(const OperandVector &Operands) {
+ // Detects if a memory operation has an erroneous base register modification.
+ // Memory operations are detected by matching the types of operands.
+ //
+ // TODO: This test is focussed on one specific instance (ld/st).
+ // Extend it to handle more cases or be more robust.
+ bool Modifies = false;
+
+ int Offset = 0;
+
+ if (Operands.size() < 5)
+ return false;
+ else if (Operands[0]->isToken() && Operands[1]->isReg() &&
+ Operands[2]->isImm() && Operands[3]->isImm() && Operands[4]->isReg())
+ Offset = 0;
+ else if (Operands[0]->isToken() && Operands[1]->isToken() &&
+ Operands[2]->isReg() && Operands[3]->isImm() &&
+ Operands[4]->isImm() && Operands[5]->isReg())
+ Offset = 1;
+ else
+ return false;
+
+ int PossibleAluOpIdx = Offset + 3;
+ int PossibleBaseIdx = Offset + 1;
+ int PossibleDestIdx = Offset + 4;
+ if (LanaiOperand *PossibleAluOp =
+ static_cast<LanaiOperand *>(Operands[PossibleAluOpIdx].get()))
+ if (PossibleAluOp->isImm())
+ if (const MCConstantExpr *ConstExpr =
+ dyn_cast<MCConstantExpr>(PossibleAluOp->getImm()))
+ Modifies = LPAC::modifiesOp(ConstExpr->getValue());
+ return Modifies && Operands[PossibleBaseIdx]->isReg() &&
+ Operands[PossibleDestIdx]->isReg() &&
+ Operands[PossibleBaseIdx]->getReg() ==
+ Operands[PossibleDestIdx]->getReg();
+}
+
+static bool IsRegister(const MCParsedAsmOperand &op) {
+ return static_cast<const LanaiOperand &>(op).isReg();
+}
+
+static bool MaybePredicatedInst(const OperandVector &Operands) {
+ if (Operands.size() < 4 || !IsRegister(*Operands[1]) ||
+ !IsRegister(*Operands[2]))
+ return false;
+ return StringSwitch<bool>(
+ static_cast<const LanaiOperand &>(*Operands[0]).getToken())
+ .StartsWith("addc", true)
+ .StartsWith("add", true)
+ .StartsWith("and", true)
+ .StartsWith("sh", true)
+ .StartsWith("subb", true)
+ .StartsWith("sub", true)
+ .StartsWith("or", true)
+ .StartsWith("xor", true)
+ .Default(false);
+}
+
+bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ // First operand is token for instruction
+ StringRef Mnemonic = splitMnemonic(Name, NameLoc, &Operands);
+
+ // If there are no more operands, then finish
+ if (Lexer.is(AsmToken::EndOfStatement))
+ return false;
+
+ // Parse first operand
+ if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+ return true;
+
+ // If it is a st instruction with one operand then it is a "store true".
+ // Transform <"st"> to <"s">, <LPCC::ICC_T>
+ if (Lexer.is(AsmToken::EndOfStatement) && Name == "st" &&
+ Operands.size() == 2) {
+ Operands.erase(Operands.begin(), Operands.begin() + 1);
+ Operands.insert(Operands.begin(), LanaiOperand::CreateToken("s", NameLoc));
+ Operands.insert(Operands.begin() + 1,
+ LanaiOperand::createImm(
+ MCConstantExpr::create(LPCC::ICC_T, getContext()),
+ NameLoc, NameLoc));
+ }
+
+ // If the instruction is a bt instruction with 1 operand (in assembly) then it
+ // is an unconditional branch instruction and the first two elements of
+ // operands need to be merged.
+ if (Lexer.is(AsmToken::EndOfStatement) && Name.startswith("bt") &&
+ Operands.size() == 3) {
+ Operands.erase(Operands.begin(), Operands.begin() + 2);
+ Operands.insert(Operands.begin(), LanaiOperand::CreateToken("bt", NameLoc));
+ }
+
+ // Parse until end of statement, consuming commas between operands
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.is(AsmToken::Comma)) {
+ // Consume comma token
+ Lex();
+
+ // Parse next operand
+ if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+ return true;
+ }
+
+ if (IsMemoryAssignmentError(Operands)) {
+ Error(Parser.getTok().getLoc(),
+ "the destination register can't equal the base register in an "
+ "instruction that modifies the base register.");
+ return true;
+ }
+
+ // Insert an always-true operand for instructions that may be predicated but
+ // are not. Currently the autogenerated parser always expects a predicate.
+ if (MaybePredicatedInst(Operands)) {
+ Operands.insert(Operands.begin() + 1,
+ LanaiOperand::createImm(
+ MCConstantExpr::create(LPCC::ICC_T, getContext()),
+ NameLoc, NameLoc));
+ }
+
+ return false;
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "LanaiGenAsmMatcher.inc"
+} // namespace
+
+extern "C" void LLVMInitializeLanaiAsmParser() {
+ RegisterMCAsmParser<LanaiAsmParser> x(TheLanaiTarget);
+}
+
+} // namespace llvm
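A standalone sketch of the arithmetic behind the immediate helpers above (not from the parser): addHiImm16Operands emits Value >> 16 and addLoImm16AndOperands emits Value & 0xffff, so the two halves recombine to the original 32-bit constant.

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Value = 0x12345678;
    uint32_t Hi = Value >> 16;     // the hi(...) half, 0x1234
    uint32_t Lo = Value & 0xffff;  // the lo(...) half, 0x5678
    assert(((Hi << 16) | Lo) == Value);
    return 0;
  }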
diff --git a/lib/Target/Lanai/CMakeLists.txt b/lib/Target/Lanai/CMakeLists.txt
new file mode 100644
index 000000000000..867f6165c253
--- /dev/null
+++ b/lib/Target/Lanai/CMakeLists.txt
@@ -0,0 +1,35 @@
+set(LLVM_TARGET_DEFINITIONS Lanai.td)
+
+tablegen(LLVM LanaiGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM LanaiGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM LanaiGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM LanaiGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM LanaiGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM LanaiGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM LanaiGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM LanaiGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM LanaiGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(LanaiCommonTableGen)
+
+add_llvm_target(LanaiCodeGen
+ LanaiAsmPrinter.cpp
+ LanaiDelaySlotFiller.cpp
+ LanaiFrameLowering.cpp
+ LanaiInstrInfo.cpp
+ LanaiISelDAGToDAG.cpp
+ LanaiISelLowering.cpp
+ LanaiMachineFunctionInfo.cpp
+ LanaiMCInstLower.cpp
+ LanaiMemAluCombiner.cpp
+ LanaiRegisterInfo.cpp
+ LanaiSelectionDAGInfo.cpp
+ LanaiSubtarget.cpp
+ LanaiTargetMachine.cpp
+ LanaiTargetObjectFile.cpp
+)
+
+add_subdirectory(AsmParser)
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(InstPrinter)
+add_subdirectory(Disassembler)
diff --git a/lib/Target/Lanai/Disassembler/CMakeLists.txt b/lib/Target/Lanai/Disassembler/CMakeLists.txt
new file mode 100644
index 000000000000..785c98d8dff1
--- /dev/null
+++ b/lib/Target/Lanai/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMLanaiDisassembler
+ LanaiDisassembler.cpp
+ )
diff --git a/lib/Target/Lanai/Disassembler/LLVMBuild.txt b/lib/Target/Lanai/Disassembler/LLVMBuild.txt
new file mode 100644
index 000000000000..fe789f90f2e3
--- /dev/null
+++ b/lib/Target/Lanai/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/Lanai/Disassembler/LLVMBuild.txt -----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LanaiDisassembler
+parent = Lanai
+required_libraries = LanaiMCTargetDesc LanaiInfo MC MCDisassembler Support
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
new file mode 100644
index 000000000000..744441bc90dd
--- /dev/null
+++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -0,0 +1,240 @@
+//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiDisassembler.h"
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace llvm {
+extern Target TheLanaiTarget;
+}
+
+static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new LanaiDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeLanaiDisassembler() {
+ // Register the disassembler
+ TargetRegistry::RegisterMCDisassembler(TheLanaiTarget,
+ createLanaiDisassembler);
+}
+
+LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+ : MCDisassembler(STI, Ctx) {}
+
+// Forward declare because the autogenerated code will reference this.
+// Definition is further down.
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+#include "LanaiGenDisassemblerTables.inc"
+
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t &Size,
+ uint32_t &Insn) {
+ // We want to read exactly 4 bytes of data.
+ if (Bytes.size() < 4) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Encoded as big-endian 32-bit word in the stream.
+ Insn =
+ (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
+
+ return MCDisassembler::Success;
+}
+
+static void PostOperandDecodeAdjust(MCInst &Instr, uint32_t Insn) {
+ unsigned AluOp = LPAC::ADD;
+ // Fix up for pre and post operations.
+ int PqShift = -1;
+ if (isRMOpcode(Instr.getOpcode()))
+ PqShift = 16;
+ else if (isSPLSOpcode(Instr.getOpcode()))
+ PqShift = 10;
+ else if (isRRMOpcode(Instr.getOpcode())) {
+ PqShift = 16;
+ // Determine RRM ALU op.
+ AluOp = (Insn >> 8) & 0x7;
+ if (AluOp == 7)
+ // Handle JJJJJ
+ // 0b10000 or 0b11000
+ AluOp |= 0x20 | (((Insn >> 3) & 0xf) << 1);
+ }
+
+ if (PqShift != -1) {
+ unsigned PQ = (Insn >> PqShift) & 0x3;
+ switch (PQ) {
+ case 0x0:
+ if (Instr.getOperand(2).isReg()) {
+ Instr.getOperand(2).setReg(Lanai::R0);
+ }
+ if (Instr.getOperand(2).isImm())
+ Instr.getOperand(2).setImm(0);
+ break;
+ case 0x1:
+ AluOp = LPAC::makePostOp(AluOp);
+ break;
+ case 0x2:
+ break;
+ case 0x3:
+ AluOp = LPAC::makePreOp(AluOp);
+ break;
+ }
+ Instr.addOperand(MCOperand::createImm(AluOp));
+ }
+}
+
+DecodeStatus LanaiDisassembler::getInstruction(
+ MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream & /*VStream*/, raw_ostream & /*CStream*/) const {
+ uint32_t Insn;
+
+ DecodeStatus Result = readInstruction32(Bytes, Size, Insn);
+
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+
+ // Call auto-generated decoder function
+ Result =
+ decodeInstruction(DecoderTableLanai32, Instr, Insn, Address, this, STI);
+
+ if (Result != MCDisassembler::Fail) {
+ PostOperandDecodeAdjust(Instr, Insn);
+ Size = 4;
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
+}
+
+static const unsigned GPRDecoderTable[] = {
+ Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP,
+ Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2,
+ Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
+ Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
+ Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
+ Lanai::R30, Lanai::R31};
+
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void * /*Decoder*/) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ // RI memory values encoded using 23 bits:
+ // 5 bit register, 16 bit constant
+ unsigned Register = (Insn >> 18) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ unsigned Offset = (Insn & 0xffff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ // RR memory values encoded using 20 bits:
+ // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
+ unsigned Register = (Insn >> 15) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ Register = (Insn >> 10) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ // RI memory values encoded using 17 bits:
+ // 5 bit register, 10 bit constant
+ unsigned Register = (Insn >> 12) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ unsigned Offset = (Insn & 0x3ff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
+
+ return MCDisassembler::Success;
+}
+
+static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder);
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
+ Width);
+}
+
+static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
+ Decoder))
+ MI.addOperand(MCOperand::createImm(Insn));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned Offset = (Insn & 0xffff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val >= LPCC::UNKNOWN)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
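A standalone sketch of the big-endian word assembly performed by readInstruction32 above (not part of the patch), shown on a fixed four-byte buffer.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint8_t Bytes[4] = {0x01, 0x23, 0x45, 0x67};
    uint32_t Insn = (uint32_t(Bytes[0]) << 24) | (uint32_t(Bytes[1]) << 16) |
                    (uint32_t(Bytes[2]) << 8) | uint32_t(Bytes[3]);
    assert(Insn == 0x01234567u);
    return 0;
  }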
diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
new file mode 100644
index 000000000000..a317cd88ad63
--- /dev/null
+++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
@@ -0,0 +1,41 @@
+//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+#define LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+
+#define DEBUG_TYPE "lanai-disassembler"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class raw_ostream;
+
+class LanaiDisassembler : public MCDisassembler {
+public:
+ LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx);
+
+ ~LanaiDisassembler() override {}
+
+ // getInstruction - See MCDisassembler.
+ MCDisassembler::DecodeStatus
+ getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &VStream,
+ raw_ostream &CStream) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
diff --git a/lib/Target/Lanai/InstPrinter/CMakeLists.txt b/lib/Target/Lanai/InstPrinter/CMakeLists.txt
new file mode 100644
index 000000000000..6badb1c98a6d
--- /dev/null
+++ b/lib/Target/Lanai/InstPrinter/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMLanaiInstPrinter
+ LanaiInstPrinter.cpp
+ )
diff --git a/lib/Target/Lanai/InstPrinter/LLVMBuild.txt b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
new file mode 100644
index 000000000000..6366d7eded8f
--- /dev/null
+++ b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/Lanai/InstPrinter/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LanaiInstPrinter
+parent = Lanai
+required_libraries = LanaiInfo MC Support
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
new file mode 100644
index 000000000000..2fa411fcfd87
--- /dev/null
+++ b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
@@ -0,0 +1,305 @@
+//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "LanaiGenAsmWriter.inc"
+
+void LanaiInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << StringRef(getRegisterName(RegNo)).lower();
+}
+
+bool LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Alias, unsigned OpNo0,
+ unsigned OpNo1) {
+ OS << "\t" << Alias << " ";
+ printOperand(MI, OpNo0, OS);
+ OS << ", ";
+ printOperand(MI, OpNo1, OS);
+ return true;
+}
+
+static bool usesGivenOffset(const MCInst *MI, int AddOffset) {
+ unsigned AluCode = MI->getOperand(3).getImm();
+ return LPAC::encodeLanaiAluCode(AluCode) == LPAC::ADD &&
+ (MI->getOperand(2).getImm() == AddOffset ||
+ MI->getOperand(2).getImm() == -AddOffset);
+}
+
+static bool isPreIncrementForm(const MCInst *MI, int AddOffset) {
+ unsigned AluCode = MI->getOperand(3).getImm();
+ return LPAC::isPreOp(AluCode) && usesGivenOffset(MI, AddOffset);
+}
+
+static bool isPostIncrementForm(const MCInst *MI, int AddOffset) {
+ unsigned AluCode = MI->getOperand(3).getImm();
+ return LPAC::isPostOp(AluCode) && usesGivenOffset(MI, AddOffset);
+}
+
+static StringRef decIncOperator(const MCInst *MI) {
+ if (MI->getOperand(2).getImm() < 0)
+ return "--";
+ return "++";
+}
+
+bool LanaiInstPrinter::printMemoryLoadIncrement(const MCInst *MI,
+ raw_ostream &OS,
+ StringRef Opcode,
+ int AddOffset) {
+ if (isPreIncrementForm(MI, AddOffset)) {
+ OS << "\t" << Opcode << "\t[" << decIncOperator(MI) << "%"
+ << getRegisterName(MI->getOperand(1).getReg()) << "], %"
+ << getRegisterName(MI->getOperand(0).getReg());
+ return true;
+ }
+ if (isPostIncrementForm(MI, AddOffset)) {
+ OS << "\t" << Opcode << "\t[%"
+ << getRegisterName(MI->getOperand(1).getReg()) << decIncOperator(MI)
+ << "], %" << getRegisterName(MI->getOperand(0).getReg());
+ return true;
+ }
+ return false;
+}
+
+bool LanaiInstPrinter::printMemoryStoreIncrement(const MCInst *MI,
+ raw_ostream &OS,
+ StringRef Opcode,
+ int AddOffset) {
+ if (isPreIncrementForm(MI, AddOffset)) {
+ OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg())
+ << ", [" << decIncOperator(MI) << "%"
+ << getRegisterName(MI->getOperand(1).getReg()) << "]";
+ return true;
+ }
+ if (isPostIncrementForm(MI, AddOffset)) {
+ OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg())
+ << ", [%" << getRegisterName(MI->getOperand(1).getReg())
+ << decIncOperator(MI) << "]";
+ return true;
+ }
+ return false;
+}
+
+bool LanaiInstPrinter::printAlias(const MCInst *MI, raw_ostream &OS) {
+ switch (MI->getOpcode()) {
+ case Lanai::LDW_RI:
+ // ld 4[*%rN], %rX => ld [++%rN], %rX
+ // ld -4[*%rN], %rX => ld [--%rN], %rX
+ // ld 4[%rN*], %rX => ld [%rN++], %rX
+ // ld -4[%rN*], %rX => ld [%rN--], %rX
+ return printMemoryLoadIncrement(MI, OS, "ld", 4);
+ case Lanai::LDHs_RI:
+ return printMemoryLoadIncrement(MI, OS, "ld.h", 2);
+ case Lanai::LDHz_RI:
+ return printMemoryLoadIncrement(MI, OS, "uld.h", 2);
+ case Lanai::LDBs_RI:
+ return printMemoryLoadIncrement(MI, OS, "ld.b", 1);
+ case Lanai::LDBz_RI:
+ return printMemoryLoadIncrement(MI, OS, "uld.b", 1);
+ case Lanai::SW_RI:
+ // st %rX, 4[*%rN] => st %rX, [++%rN]
+ // st %rX, -4[*%rN] => st %rX, [--%rN]
+ // st %rX, 4[%rN*] => st %rX, [%rN++]
+ // st %rX, -4[%rN*] => st %rX, [%rN--]
+ return printMemoryStoreIncrement(MI, OS, "st", 4);
+ case Lanai::STH_RI:
+ return printMemoryStoreIncrement(MI, OS, "st.h", 2);
+ case Lanai::STB_RI:
+ return printMemoryStoreIncrement(MI, OS, "st.b", 1);
+ default:
+ return false;
+ }
+}
+
+void LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annotation,
+ const MCSubtargetInfo & /*STI*/) {
+ if (!printAlias(MI, OS) && !printAliasInstr(MI, OS))
+ printInstruction(MI, OS);
+ printAnnotation(OS, Annotation);
+}
+
+void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS, const char *Modifier) {
+ assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg())
+ OS << "%" << getRegisterName(Op.getReg());
+ else if (Op.isImm())
+ OS << formatHex(Op.getImm());
+ else {
+ assert(Op.isExpr() && "Expected an expression");
+ Op.getExpr()->print(OS, &MAI);
+ }
+}
+
+void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ OS << '[' << formatHex(Op.getImm()) << ']';
+ } else {
+ // Symbolic operand will be lowered to immediate value by linker
+ assert(Op.isExpr() && "Expected an expression");
+ OS << '[';
+ Op.getExpr()->print(OS, &MAI);
+ OS << ']';
+ }
+}
+
+void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ OS << formatHex(Op.getImm() << 16);
+ } else {
+ // Symbolic operand will be lowered to immediate value by linker
+ assert(Op.isExpr() && "Expected an expression");
+ Op.getExpr()->print(OS, &MAI);
+ }
+}
+
+void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ OS << formatHex((Op.getImm() << 16) | 0xffff);
+ } else {
+ // Symbolic operand will be lowered to immediate value by linker
+ assert(Op.isExpr() && "Expected an expression");
+ Op.getExpr()->print(OS, &MAI);
+ }
+}
+
+void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ OS << formatHex(0xffff0000 | Op.getImm());
+ } else {
+ // Symbolic operand will be lowered to immediate value by linker
+ assert(Op.isExpr() && "Expected an expression");
+ Op.getExpr()->print(OS, &MAI);
+ }
+}
+
+static void printMemoryBaseRegister(raw_ostream &OS, const unsigned AluCode,
+ const MCOperand &RegOp) {
+ assert(RegOp.isReg() && "Register operand expected");
+ OS << "[";
+ if (LPAC::isPreOp(AluCode))
+ OS << "*";
+ OS << "%" << LanaiInstPrinter::getRegisterName(RegOp.getReg());
+ if (LPAC::isPostOp(AluCode))
+ OS << "*";
+ OS << "]";
+}
+
+template <unsigned SizeInBits>
+static void printMemoryImmediateOffset(const MCAsmInfo &MAI,
+ const MCOperand &OffsetOp,
+ raw_ostream &OS) {
+ assert((OffsetOp.isImm() || OffsetOp.isExpr()) && "Immediate expected");
+ if (OffsetOp.isImm()) {
+ assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated");
+ OS << OffsetOp.getImm();
+ } else
+ OffsetOp.getExpr()->print(OS, &MAI);
+}
+
+void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo,
+ raw_ostream &OS,
+ const char * /*Modifier*/) {
+ const MCOperand &RegOp = MI->getOperand(OpNo);
+ const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+ const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+ const unsigned AluCode = AluOp.getImm();
+
+ // Offset
+ printMemoryImmediateOffset<16>(MAI, OffsetOp, OS);
+
+ // Register
+ printMemoryBaseRegister(OS, AluCode, RegOp);
+}
+
+void LanaiInstPrinter::printMemRrOperand(const MCInst *MI, int OpNo,
+ raw_ostream &OS,
+ const char * /*Modifier*/) {
+ const MCOperand &RegOp = MI->getOperand(OpNo);
+ const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+ const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+ const unsigned AluCode = AluOp.getImm();
+ assert(OffsetOp.isReg() && RegOp.isReg() && "Registers expected.");
+
+ // [ Base OP Offset ]
+ OS << "[";
+ if (LPAC::isPreOp(AluCode))
+ OS << "*";
+ OS << "%" << getRegisterName(RegOp.getReg());
+ if (LPAC::isPostOp(AluCode))
+ OS << "*";
+ OS << " " << LPAC::lanaiAluCodeToString(AluCode) << " ";
+ OS << "%" << getRegisterName(OffsetOp.getReg());
+ OS << "]";
+}
+
+void LanaiInstPrinter::printMemSplsOperand(const MCInst *MI, int OpNo,
+ raw_ostream &OS,
+ const char * /*Modifier*/) {
+ const MCOperand &RegOp = MI->getOperand(OpNo);
+ const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+ const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+ const unsigned AluCode = AluOp.getImm();
+
+ // Offset
+ printMemoryImmediateOffset<10>(MAI, OffsetOp, OS);
+
+ // Register
+ printMemoryBaseRegister(OS, AluCode, RegOp);
+}
+
+void LanaiInstPrinter::printCCOperand(const MCInst *MI, int OpNo,
+ raw_ostream &OS) {
+ LPCC::CondCode CC =
+ static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+ // Handle the undefined value here for printing so we don't abort().
+ if (CC >= LPCC::UNKNOWN)
+ OS << "<und>";
+ else
+ OS << lanaiCondCodeToString(CC);
+}
+
+void LanaiInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ LPCC::CondCode CC =
+ static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+ // Handle the undefined value here for printing so we don't abort().
+ if (CC >= LPCC::UNKNOWN)
+ OS << "<und>";
+ else if (CC != LPCC::ICC_T)
+ OS << "." << lanaiCondCodeToString(CC);
+}
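The alias printing above only fires when the memory operand's ALU code is an ADD carrying a pre- or post-modify bit and the immediate equals plus or minus the access size. A self-contained sketch of that predicate (not part of the patch; constants copied from LanaiAluCode.h, helper names illustrative):

#include <cassert>

// Constants as defined in LanaiAluCode.h (LPAC namespace).
static const unsigned ADD = 0x00, PRE_OP = 0x40, OP_MASK = 0x07;

// Same test as isPreIncrementForm()/usesGivenOffset() above, on plain ints.
static bool isPreIncrementLoad(unsigned AluCode, int Offset, int AddOffset) {
  return (AluCode & PRE_OP) && (AluCode & OP_MASK) == ADD &&
         (Offset == AddOffset || Offset == -AddOffset);
}

int main() {
  unsigned PreAdd = ADD | PRE_OP;           // what LPAC::makePreOp(LPAC::ADD) yields
  assert(isPreIncrementLoad(PreAdd, 4, 4)); // printed as "ld [++%rN], %rX"
  assert(isPreIncrementLoad(PreAdd, -4, 4)); // printed as "ld [--%rN], %rX"
  assert(!isPreIncrementLoad(ADD, 4, 4));   // plain RI load, no alias
  return 0;
}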
diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
new file mode 100644
index 000000000000..1c9d186ad819
--- /dev/null
+++ b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
@@ -0,0 +1,65 @@
+//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+#define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+class LanaiInstPrinter : public MCInstPrinter {
+public:
+ LanaiInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+ void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+ void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+private:
+ bool printAlias(const MCInst *MI, raw_ostream &Ostream);
+ bool printInst(const MCInst *MI, raw_ostream &Ostream, StringRef Alias,
+ unsigned OpNo0, unsigned OpNo1);
+ bool printMemoryLoadIncrement(const MCInst *MI, raw_ostream &Ostream,
+ StringRef Opcode, int AddOffset);
+ bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream,
+ StringRef Opcode, int AddOffset);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
diff --git a/lib/Target/Lanai/LLVMBuild.txt b/lib/Target/Lanai/LLVMBuild.txt
new file mode 100644
index 000000000000..91accbbfb762
--- /dev/null
+++ b/lib/Target/Lanai/LLVMBuild.txt
@@ -0,0 +1,45 @@
+;===- ./lib/Target/Lanai/LLVMBuild.txt -------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
+
+[component_0]
+type = TargetGroup
+name = Lanai
+parent = Target
+has_asmprinter = 1
+
+[component_1]
+type = Library
+name = LanaiCodeGen
+parent = Lanai
+required_libraries =
+ Analysis
+ AsmPrinter
+ CodeGen
+ Core
+ LanaiAsmParser
+ LanaiMCTargetDesc
+ LanaiInfo
+ LanaiInstPrinter
+ MC
+ SelectionDAG
+ Support
+ Target
+ TransformUtils
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/Lanai.h b/lib/Target/Lanai/Lanai.h
new file mode 100644
index 000000000000..47bd498c579c
--- /dev/null
+++ b/lib/Target/Lanai/Lanai.h
@@ -0,0 +1,51 @@
+//===-- Lanai.h - Top-level interface for Lanai representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Lanai back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAI_H
+#define LLVM_LIB_TARGET_LANAI_LANAI_H
+
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class FunctionPass;
+class LanaiTargetMachine;
+class MachineFunctionPass;
+class TargetMachine;
+class formatted_raw_ostream;
+
+// createLanaiISelDag - This pass converts a legalized DAG into a
+// Lanai-specific DAG, ready for instruction scheduling.
+FunctionPass *createLanaiISelDag(LanaiTargetMachine &TM);
+
+// createLanaiDelaySlotFillerPass - This pass fills delay slots
+// with useful instructions or NOPs.
+FunctionPass *createLanaiDelaySlotFillerPass(const LanaiTargetMachine &TM);
+
+// createLanaiMemAluCombinerPass - This pass combines loads/stores and
+// arithmetic operations.
+FunctionPass *createLanaiMemAluCombinerPass();
+
+// createLanaiSetflagAluCombinerPass - This pass combines SET_FLAG and ALU
+// operations.
+FunctionPass *createLanaiSetflagAluCombinerPass();
+
+extern Target TheLanaiTarget;
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAI_H
diff --git a/lib/Target/Lanai/Lanai.td b/lib/Target/Lanai/Lanai.td
new file mode 100644
index 000000000000..73d080457034
--- /dev/null
+++ b/lib/Target/Lanai/Lanai.td
@@ -0,0 +1,47 @@
+//===- Lanai.td - Describe the Lanai Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "LanaiSchedule.td"
+include "LanaiRegisterInfo.td"
+include "LanaiCallingConv.td"
+include "LanaiInstrInfo.td"
+
+def LanaiInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Lanai processors supported.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"generic", LanaiSchedModel, []>;
+def : ProcessorModel<"v11", LanaiSchedModel, []>;
+
+def LanaiInstPrinter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Lanai : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = LanaiInstrInfo;
+ let AssemblyWriters = [LanaiInstPrinter];
+}
diff --git a/lib/Target/Lanai/LanaiAluCode.h b/lib/Target/Lanai/LanaiAluCode.h
new file mode 100644
index 000000000000..b6ceedef6651
--- /dev/null
+++ b/lib/Target/Lanai/LanaiAluCode.h
@@ -0,0 +1,148 @@
+//===-- LanaiAluCode.h - ALU operator encoding ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The encoding for ALU operators used in RM and RRM operands
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+namespace LPAC {
+enum AluCode {
+ ADD = 0x00,
+ ADDC = 0x01,
+ SUB = 0x02,
+ SUBB = 0x03,
+ AND = 0x04,
+ OR = 0x05,
+ XOR = 0x06,
+ SPECIAL = 0x07,
+
+ // Shift instructions are treated as SPECIAL when encoding the machine
+ // instruction, but kept distinct until lowering. The constant values are
+ // chosen to ease lowering.
+ SHL = 0x17,
+ SRL = 0x27,
+ SRA = 0x37,
+
+ // Indicates an unknown/unsupported operator
+ UNKNOWN = 0xFF,
+};
+
+// Bits indicating post- and pre-operators should be tested and set using Is*
+// and Make* utility functions
+constexpr int Lanai_PRE_OP = 0x40;
+constexpr int Lanai_POST_OP = 0x80;
+
+inline static unsigned encodeLanaiAluCode(unsigned AluOp) {
+ unsigned const OP_ENCODING_MASK = 0x07;
+ return AluOp & OP_ENCODING_MASK;
+}
+
+inline static unsigned getAluOp(unsigned AluOp) {
+ unsigned const ALU_MASK = 0x3F;
+ return AluOp & ALU_MASK;
+}
+
+inline static bool isPreOp(unsigned AluOp) { return AluOp & Lanai_PRE_OP; }
+
+inline static bool isPostOp(unsigned AluOp) { return AluOp & Lanai_POST_OP; }
+
+inline static unsigned makePreOp(unsigned AluOp) {
+ assert(!isPostOp(AluOp) && "Operator can't be a post- and pre-op");
+ return AluOp | Lanai_PRE_OP;
+}
+
+inline static unsigned makePostOp(unsigned AluOp) {
+ assert(!isPreOp(AluOp) && "Operator can't be a post- and pre-op");
+ return AluOp | Lanai_POST_OP;
+}
+
+inline static bool modifiesOp(unsigned AluOp) {
+ return isPreOp(AluOp) || isPostOp(AluOp);
+}
+
+inline static const char *lanaiAluCodeToString(unsigned AluOp) {
+ switch (getAluOp(AluOp)) {
+ case ADD:
+ return "add";
+ case ADDC:
+ return "addc";
+ case SUB:
+ return "sub";
+ case SUBB:
+ return "subb";
+ case AND:
+ return "and";
+ case OR:
+ return "or";
+ case XOR:
+ return "xor";
+ case SHL:
+ return "sh";
+ case SRL:
+ return "sh";
+ case SRA:
+ return "sha";
+ default:
+ llvm_unreachable("Invalid ALU code.");
+ }
+}
+
+inline static AluCode stringToLanaiAluCode(StringRef S) {
+ return StringSwitch<AluCode>(S)
+ .Case("add", ADD)
+ .Case("addc", ADDC)
+ .Case("sub", SUB)
+ .Case("subb", SUBB)
+ .Case("and", AND)
+ .Case("or", OR)
+ .Case("xor", XOR)
+ .Case("sh", SHL)
+ .Case("srl", SRL)
+ .Case("sha", SRA)
+ .Default(UNKNOWN);
+}
+
+inline static AluCode isdToLanaiAluCode(ISD::NodeType NodeType) {
+ switch (NodeType) {
+ case ISD::ADD:
+ return AluCode::ADD;
+ case ISD::ADDE:
+ return AluCode::ADDC;
+ case ISD::SUB:
+ return AluCode::SUB;
+ case ISD::SUBE:
+ return AluCode::SUBB;
+ case ISD::AND:
+ return AluCode::AND;
+ case ISD::OR:
+ return AluCode::OR;
+ case ISD::XOR:
+ return AluCode::XOR;
+ case ISD::SHL:
+ return AluCode::SHL;
+ case ISD::SRL:
+ return AluCode::SRL;
+ case ISD::SRA:
+ return AluCode::SRA;
+ default:
+ return AluCode::UNKNOWN;
+ }
+}
+} // namespace LPAC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
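The enum comment above notes that the shift codes stay distinct only until encoding. A minimal sketch (not part of the patch; constants copied from the header) of how encodeLanaiAluCode()'s 3-bit mask collapses them into the SPECIAL group:

#include <cassert>

static const unsigned SPECIAL = 0x07, SHL = 0x17, SRL = 0x27, SRA = 0x37;
static const unsigned Lanai_PRE_OP = 0x40;

// Same masking as LPAC::encodeLanaiAluCode().
static unsigned encode(unsigned AluOp) { return AluOp & 0x07; }

int main() {
  // All three shifts reach the machine instruction as the SPECIAL group.
  assert(encode(SHL) == SPECIAL && encode(SRL) == SPECIAL && encode(SRA) == SPECIAL);
  // The pre/post-modify bits live above the 6-bit operator field, so they too
  // disappear under the 3-bit encoding mask.
  assert(encode(SHL | Lanai_PRE_OP) == SPECIAL);
  return 0;
}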
diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp
new file mode 100644
index 000000000000..9d39cef9f8ed
--- /dev/null
+++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -0,0 +1,243 @@
+//===-- LanaiAsmPrinter.cpp - Lanai LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the Lanai assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMCInstLower.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "asm-printer"
+
+using namespace llvm;
+
+namespace {
+class LanaiAsmPrinter : public AsmPrinter {
+public:
+ explicit LanaiAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ const char *getPassName() const override { return "Lanai Assembly Printer"; }
+
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ bool isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const override;
+
+private:
+ void customEmitInstruction(const MachineInstr *MI);
+ void emitCallInstruction(const MachineInstr *MI);
+};
+} // end of anonymous namespace
+
+void LanaiAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << LanaiInstPrinter::getRegisterName(MO.getReg());
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ O << *getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+ O << BA->getName();
+ break;
+ }
+
+ case MachineOperand::MO_ExternalSymbol:
+ O << *GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ break;
+
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+}
+
+// PrintAsmOperand - Print out an operand for an inline asm expression.
+bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned /*AsmVariant*/,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1])
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ // The highest-numbered register of a pair.
+ case 'H': {
+ if (OpNo == 0)
+ return true;
+ const MachineOperand &FlagsOP = MI->getOperand(OpNo - 1);
+ if (!FlagsOP.isImm())
+ return true;
+ unsigned Flags = FlagsOP.getImm();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumVals != 2)
+ return true;
+ unsigned RegOp = OpNo + 1;
+ if (RegOp >= MI->getNumOperands())
+ return true;
+ const MachineOperand &MO = MI->getOperand(RegOp);
+ if (!MO.isReg())
+ return true;
+ unsigned Reg = MO.getReg();
+ O << LanaiInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ default:
+ return true; // Unknown modifier.
+ }
+ }
+ printOperand(MI, OpNo, O);
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
+ assert((MI->getOpcode() == Lanai::CALL || MI->getOpcode() == Lanai::CALLR) &&
+ "Unsupported call function");
+
+ LanaiMCInstLower MCInstLowering(OutContext, *Mang, *this);
+ const MCSubtargetInfo &STI = getSubtargetInfo();
+ // Insert save rca instruction immediately before the call.
+ // TODO: We should generate a pc-relative mov instruction here instead
+ // of pc + 16 (should be mov .+16 %rca).
+ OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_I_LO)
+ .addReg(Lanai::RCA)
+ .addReg(Lanai::PC)
+ .addImm(16),
+ STI);
+
+ // Push rca onto the stack.
+ // st %rca, [--%sp]
+ OutStreamer->EmitInstruction(MCInstBuilder(Lanai::SW_RI)
+ .addReg(Lanai::RCA)
+ .addReg(Lanai::SP)
+ .addImm(-4)
+ .addImm(LPAC::makePreOp(LPAC::ADD)),
+ STI);
+
+ // Lower the call instruction.
+ if (MI->getOpcode() == Lanai::CALL) {
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ TmpInst.setOpcode(Lanai::BT);
+ OutStreamer->EmitInstruction(TmpInst, STI);
+ } else {
+ OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_R)
+ .addReg(Lanai::PC)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(Lanai::R0)
+ .addImm(LPCC::ICC_T),
+ STI);
+ }
+}
+
+void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) {
+ LanaiMCInstLower MCInstLowering(OutContext, *Mang, *this);
+ const MCSubtargetInfo &STI = getSubtargetInfo();
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ OutStreamer->EmitInstruction(TmpInst, STI);
+}
+
+void LanaiAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+ do {
+ if (I->isCall()) {
+ emitCallInstruction(&*I);
+ continue;
+ }
+
+ customEmitInstruction(&*I);
+ } while ((++I != E) && I->isInsideBundle());
+}
+
+// isBlockOnlyReachableByFallthrough - Return true if the basic block has
+// exactly one predecessor and the control transfer mechanism between
+// the predecessor and this block is a fall-through.
+// FIXME: could the overridden cases be handled in AnalyzeBranch?
+bool LanaiAsmPrinter::isBlockOnlyReachableByFallthrough(
+ const MachineBasicBlock *MBB) const {
+ // The predecessor has to be immediately before this block.
+ const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+ // If the predecessor block ends in a switch statement, assume a jump table
+ // implementation, so it is not a fall-through.
+ if (const BasicBlock *B = Pred->getBasicBlock())
+ if (isa<SwitchInst>(B->getTerminator()))
+ return false;
+
+ // Check default implementation
+ if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+ return false;
+
+ // Otherwise, check the last instruction.
+ // Check if the last terminator is an unconditional branch.
+ MachineBasicBlock::const_iterator I = Pred->end();
+ while (I != Pred->begin() && !(--I)->isTerminator()) {
+ }
+
+ return !I->isBarrier();
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeLanaiAsmPrinter() {
+ RegisterAsmPrinter<LanaiAsmPrinter> X(TheLanaiTarget);
+}
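The immediate 16 in the saved-return-address instruction emitted by emitCallInstruction() is consistent with the TODO's ".+16", assuming 4-byte instructions, that reading %pc yields the address of the current instruction (an assumption suggested by that TODO, not stated elsewhere in the patch), and that the branch carries a single delay slot as the delay slot filler expects. A small arithmetic sketch (addresses hypothetical):

#include <cassert>

int main() {
  unsigned Dot = 0x2000;                 // address of "add %pc,16,%rca"
  unsigned StoreRCA = Dot + 4;           // "st %rca,[--%sp]"
  unsigned Branch = StoreRCA + 4;        // "bt callee" (or the register branch)
  unsigned DelaySlot = Branch + 4;       // the branch's delay slot
  unsigned ReturnTarget = DelaySlot + 4; // first instruction after the call sequence
  assert(ReturnTarget == Dot + 16);      // matches the immediate added to %pc
  return 0;
}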
diff --git a/lib/Target/Lanai/LanaiCallingConv.td b/lib/Target/Lanai/LanaiCallingConv.td
new file mode 100644
index 000000000000..056b329c33c5
--- /dev/null
+++ b/lib/Target/Lanai/LanaiCallingConv.td
@@ -0,0 +1,50 @@
+//===- LanaiCallingConv.td - Calling Conventions Lanai -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Lanai architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Lanai 32-bit C Calling convention.
+def CC_Lanai32 : CallingConv<[
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Put argument in registers if marked 'inreg' and not a vararg call.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32],
+ CCAssignToReg<[R6, R7, R18, R19]>>>>,
+
+ // Otherwise they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit Fast Calling convention.
+def CC_Lanai32_Fast : CallingConv<[
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Put arguments in registers.
+ CCIfNotVarArg<CCIfType<[i32], CCAssignToReg<[R6, R7, R18, R19]>>>,
+
+ // Otherwise they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit C return-value convention.
+def RetCC_Lanai32 : CallingConv<[
+ // Specify two registers to allow returning 64-bit results that have already
+ // been lowered to 2 32-bit values.
+ CCIfType<[i32], CCAssignToReg<[RV, R9]>>
+]>;
+
+def CSR: CalleeSavedRegs<(add)>;
diff --git a/lib/Target/Lanai/LanaiCondCode.h b/lib/Target/Lanai/LanaiCondCode.h
new file mode 100644
index 000000000000..6c5bdefc83dc
--- /dev/null
+++ b/lib/Target/Lanai/LanaiCondCode.h
@@ -0,0 +1,100 @@
+//===-- LanaiCondCode.h - Lanai condition code encoding ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The encoding used for condition codes used in BR instructions
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+
+namespace llvm {
+namespace LPCC {
+enum CondCode {
+ ICC_T = 0, // true
+ ICC_F = 1, // false
+ ICC_HI = 2, // high
+ ICC_UGT = 2, // unsigned greater than
+ ICC_LS = 3, // low or same
+ ICC_ULE = 3, // unsigned less than or equal
+ ICC_CC = 4, // carry cleared
+ ICC_ULT = 4, // unsigned less than
+ ICC_CS = 5, // carry set
+ ICC_UGE = 5, // unsigned greater than or equal
+ ICC_NE = 6, // not equal
+ ICC_EQ = 7, // equal
+ ICC_VC = 8, // oVerflow cleared
+ ICC_VS = 9, // oVerflow set
+ ICC_PL = 10, // plus
+ ICC_MI = 11, // minus
+ ICC_GE = 12, // greater than or equal
+ ICC_LT = 13, // less than
+ ICC_GT = 14, // greater than
+ ICC_LE = 15, // less than or equal
+ UNKNOWN
+};
+
+inline static StringRef lanaiCondCodeToString(LPCC::CondCode CC) {
+ switch (CC) {
+ case LPCC::ICC_T:
+ return "t"; // true
+ case LPCC::ICC_F:
+ return "f"; // false
+ case LPCC::ICC_NE:
+ return "ne"; // not equal
+ case LPCC::ICC_EQ:
+ return "eq"; // equal
+ case LPCC::ICC_VC:
+ return "vc"; // oVerflow cleared
+ case LPCC::ICC_VS:
+ return "vs"; // oVerflow set
+ case LPCC::ICC_PL:
+ return "pl"; // plus
+ case LPCC::ICC_MI:
+ return "mi"; // minus
+ case LPCC::ICC_GE:
+ return "ge"; // greater than or equal
+ case LPCC::ICC_LT:
+ return "lt"; // less than
+ case LPCC::ICC_GT:
+ return "gt"; // greater than
+ case LPCC::ICC_LE:
+ return "le"; // less than or equal
+ case LPCC::ICC_UGT:
+ return "ugt"; // high | unsigned greater than
+ case LPCC::ICC_ULE:
+ return "ule"; // low or same | unsigned less or equal
+ case LPCC::ICC_ULT:
+ return "ult"; // carry cleared | unsigned less than
+ case LPCC::ICC_UGE:
+ return "uge"; // carry set | unsigned than or equal
+ default:
+ llvm_unreachable("Invalid cond code");
+ }
+}
+
+inline static CondCode suffixToLanaiCondCode(StringRef S) {
+ return StringSwitch<CondCode>(S)
+ .EndsWith("f", LPCC::ICC_F)
+ .EndsWith("hi", LPCC::ICC_HI)
+ .EndsWith("ugt", LPCC::ICC_UGT)
+ .EndsWith("ls", LPCC::ICC_LS)
+ .EndsWith("ule", LPCC::ICC_ULE)
+ .EndsWith("cc", LPCC::ICC_CC)
+ .EndsWith("ult", LPCC::ICC_ULT)
+ .EndsWith("cs", LPCC::ICC_CS)
+ .EndsWith("uge", LPCC::ICC_UGE)
+ .EndsWith("ne", LPCC::ICC_NE)
+ .EndsWith("eq", LPCC::ICC_EQ)
+ .EndsWith("vc", LPCC::ICC_VC)
+ .EndsWith("vs", LPCC::ICC_VS)
+ .EndsWith("pl", LPCC::ICC_PL)
+ .EndsWith("mi", LPCC::ICC_MI)
+ .EndsWith("ge", LPCC::ICC_GE)
+ .EndsWith("lt", LPCC::ICC_LT)
+ .EndsWith("gt", LPCC::ICC_GT)
+ .EndsWith("le", LPCC::ICC_LE)
+ .EndsWith("t", LPCC::ICC_T) // Has to be after others with suffix t
+ .Default(LPCC::UNKNOWN);
+}
+} // namespace LPCC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
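The ordering comment on the final .EndsWith("t", ...) case matters because StringSwitch returns the first case that matches, and a suffix such as "gt" also ends in "t". A standalone sketch of that behaviour without the LLVM headers (not part of the patch; helper names illustrative):

#include <cassert>
#include <string>

enum CondCode { ICC_T = 0, ICC_GT = 14, UNKNOWN = 16 };

static bool endsWith(const std::string &S, const std::string &Suffix) {
  return S.size() >= Suffix.size() &&
         S.compare(S.size() - Suffix.size(), Suffix.size(), Suffix) == 0;
}

// Mimics suffixToLanaiCondCode(): the first matching suffix wins, so the bare
// "t" (always true) case must be tested after every other suffix ending in 't'.
static CondCode suffixToCondCode(const std::string &S) {
  if (endsWith(S, "gt"))
    return ICC_GT;
  if (endsWith(S, "t"))
    return ICC_T;
  return UNKNOWN;
}

int main() {
  assert(suffixToCondCode("bgt") == ICC_GT); // would wrongly be ICC_T if "t" came first
  assert(suffixToCondCode("bt") == ICC_T);
  assert(suffixToCondCode("beq") == UNKNOWN); // only two suffixes modeled here
  return 0;
}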
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
new file mode 100644
index 000000000000..7b106547d60b
--- /dev/null
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -0,0 +1,263 @@
+//===-- LanaiDelaySlotFiller.cpp - Lanai delay slot filler ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool>
+ NopDelaySlotFiller("lanai-nop-delay-filler", cl::init(false),
+ cl::desc("Fill Lanai delay slots with NOPs."),
+ cl::Hidden);
+
+namespace {
+struct Filler : public MachineFunctionPass {
+ // Target machine description which we query for reg. names, data
+ // layout, etc.
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineBasicBlock::instr_iterator LastFiller;
+
+ static char ID;
+ explicit Filler() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "Lanai Delay Slot Filler"; }
+
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const LanaiSubtarget &Subtarget = MF.getSubtarget<LanaiSubtarget>();
+ TII = Subtarget.getInstrInfo();
+ TRI = Subtarget.getRegisterInfo();
+
+ bool Changed = false;
+ for (MachineFunction::iterator FI = MF.begin(), FE = MF.end(); FI != FE;
+ ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+ return Changed;
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ void insertDefsUses(MachineBasicBlock::instr_iterator MI,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses);
+
+ bool isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg);
+
+ bool delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+ bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses);
+
+ bool findDelayInstr(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Slot,
+ MachineBasicBlock::instr_iterator &Filler);
+};
+char Filler::ID = 0;
+} // end of anonymous namespace
+
+// createLanaiDelaySlotFillerPass - Returns a pass that fills in delay
+// slots in Lanai MachineFunctions
+FunctionPass *
+llvm::createLanaiDelaySlotFillerPass(const LanaiTargetMachine & /*tm*/) {
+ return new Filler();
+}
+
+// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+// There are one or two delay slots per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ LastFiller = MBB.instr_end();
+
+ for (MachineBasicBlock::instr_iterator I = MBB.instr_begin();
+ I != MBB.instr_end(); ++I) {
+ if (I->getDesc().hasDelaySlot()) {
+ MachineBasicBlock::instr_iterator InstrWithSlot = I;
+ MachineBasicBlock::instr_iterator J = I;
+
+ // Treat RET specially as it is the only instruction generated with 2 delay
+ // slots; all others are generated with 1 delay slot.
+ if (I->getOpcode() == Lanai::RET) {
+ // RET is generated as part of epilogue generation and hence we know
+ // what the two instructions preceding it are and that it is safe to
+ // insert RET above them.
+ MachineBasicBlock::reverse_instr_iterator RI(I);
+ assert(RI->getOpcode() == Lanai::LDW_RI && RI->getOperand(0).isReg() &&
+ RI->getOperand(0).getReg() == Lanai::FP &&
+ RI->getOperand(1).isReg() &&
+ RI->getOperand(1).getReg() == Lanai::FP &&
+ RI->getOperand(2).isImm() && RI->getOperand(2).getImm() == -8);
+ ++RI;
+ assert(RI->getOpcode() == Lanai::ADD_I_LO &&
+ RI->getOperand(0).isReg() &&
+ RI->getOperand(0).getReg() == Lanai::SP &&
+ RI->getOperand(1).isReg() &&
+ RI->getOperand(1).getReg() == Lanai::FP);
+ ++RI;
+ MachineBasicBlock::instr_iterator FI(RI.base());
+ MBB.splice(std::next(I), &MBB, FI, I);
+ FilledSlots += 2;
+ } else {
+ if (!NopDelaySlotFiller && findDelayInstr(MBB, I, J)) {
+ MBB.splice(std::next(I), &MBB, J);
+ } else {
+ BuildMI(MBB, std::next(I), DebugLoc(), TII->get(Lanai::NOP));
+ }
+ ++FilledSlots;
+ }
+
+ Changed = true;
+ // Record the filler instruction that filled the delay slot.
+ // The instruction after it will be visited in the next iteration.
+ LastFiller = ++I;
+
+ // Bundle the delay slot filler to InstrWithSlot so that the machine
+ // verifier doesn't expect this instruction to be a terminator.
+ MIBundleBuilder(MBB, InstrWithSlot, std::next(LastFiller));
+ }
+ }
+ return Changed;
+}
+
+bool Filler::findDelayInstr(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Slot,
+ MachineBasicBlock::instr_iterator &Filler) {
+ SmallSet<unsigned, 32> RegDefs;
+ SmallSet<unsigned, 32> RegUses;
+
+ insertDefsUses(Slot, RegDefs, RegUses);
+
+ bool SawLoad = false;
+ bool SawStore = false;
+
+ for (MachineBasicBlock::reverse_instr_iterator I(Slot); I != MBB.instr_rend();
+ ++I) {
+ // skip debug value
+ if (I->isDebugValue())
+ continue;
+
+ // Convert to forward iterator.
+ MachineBasicBlock::instr_iterator FI(std::next(I).base());
+
+ if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() ||
+ FI == LastFiller || I->isPseudo())
+ break;
+
+ if (delayHasHazard(FI, SawLoad, SawStore, RegDefs, RegUses)) {
+ insertDefsUses(FI, RegDefs, RegUses);
+ continue;
+ }
+ Filler = FI;
+ return true;
+ }
+ return false;
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+ bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses) {
+ if (MI->isImplicitDef() || MI->isKill())
+ return true;
+
+ // Loads or stores cannot be moved past a store to the delay slot
+ // and stores cannot be moved past a load.
+ if (MI->mayLoad()) {
+ if (SawStore)
+ return true;
+ SawLoad = true;
+ }
+
+ if (MI->mayStore()) {
+ if (SawStore)
+ return true;
+ SawStore = true;
+ if (SawLoad)
+ return true;
+ }
+
+ assert((!MI->isCall() && !MI->isReturn()) &&
+ "Cannot put calls or returns in delay slot.");
+
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ unsigned Reg;
+
+ if (!MO.isReg() || !(Reg = MO.getReg()))
+ continue; // skip
+
+ if (MO.isDef()) {
+ // check whether Reg is defined or used before delay slot.
+ if (isRegInSet(RegDefs, Reg) || isRegInSet(RegUses, Reg))
+ return true;
+ }
+ if (MO.isUse()) {
+ // check whether Reg is defined before delay slot.
+ if (isRegInSet(RegDefs, Reg))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::instr_iterator MI,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses) {
+ // If MI is a call or return, just examine the explicit non-variadic operands.
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned E = MI->isCall() || MI->isReturn() ? MCID.getNumOperands()
+ : MI->getNumOperands();
+ for (unsigned I = 0; I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ unsigned Reg;
+
+ if (!MO.isReg() || !(Reg = MO.getReg()))
+ continue;
+
+ if (MO.isDef())
+ RegDefs.insert(Reg);
+ else if (MO.isUse())
+ RegUses.insert(Reg);
+ }
+
+ // Call & return instructions defines SP implicitly. Implicit defines are not
+ // included in the RegDefs set of calls but instructions modifying SP cannot
+ // be inserted in the delay slot of a call/return as these instructions are
+ // expanded to multiple instructions with SP modified before the branch that
+ // has the delay slot.
+ if (MI->isCall() || MI->isReturn())
+ RegDefs.insert(Lanai::SP);
+}
+
+// Returns true if the Reg or its alias is in the RegSet.
+bool Filler::isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) {
+ // Check Reg and all aliased Registers.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ if (RegSet.count(*AI))
+ return true;
+ return false;
+}
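The memory-ordering part of delayHasHazard() reduces to: while scanning backwards for a filler, a load may not be hoisted over a store, and a store may not be hoisted over another store or a load. A reduced sketch of just that rule (not part of the patch; flags and helper name illustrative):

#include <cassert>

// Mirrors the mayLoad()/mayStore() handling in delayHasHazard(), with the
// candidate instruction's properties reduced to two booleans.
static bool hasMemHazard(bool IsLoad, bool IsStore, bool &SawLoad, bool &SawStore) {
  if (IsLoad) {
    if (SawStore)
      return true;
    SawLoad = true;
  }
  if (IsStore) {
    if (SawStore)
      return true;
    SawStore = true;
    if (SawLoad)
      return true;
  }
  return false;
}

int main() {
  bool SawLoad = false, SawStore = false;
  assert(!hasMemHazard(/*IsLoad=*/true, /*IsStore=*/false, SawLoad, SawStore)); // load is fine
  assert(!hasMemHazard(false, false, SawLoad, SawStore));                       // ALU op is fine
  assert(hasMemHazard(false, true, SawLoad, SawStore)); // store cannot pass the earlier load
  return 0;
}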
diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp
new file mode 100644
index 000000000000..cb048d568df7
--- /dev/null
+++ b/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -0,0 +1,220 @@
+//===-- LanaiFrameLowering.cpp - Lanai Frame Information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFrameLowering.h"
+
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+// Determines the size of the frame and maximum call frame size.
+void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const LanaiRegisterInfo *LRI = STI.getRegisterInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ unsigned FrameSize = MFI->getStackSize();
+
+ // Get the alignment.
+ unsigned StackAlign = LRI->needsStackRealignment(MF) ? MFI->getMaxAlignment()
+ : getStackAlignment();
+
+ // Get the maximum call frame size of all the calls.
+ unsigned MaxCallFrameSize = MFI->getMaxCallFrameSize();
+
+ // If we have dynamic alloca then MaxCallFrameSize needs to be aligned so
+ // that allocations will be aligned.
+ if (MFI->hasVarSizedObjects())
+ MaxCallFrameSize = alignTo(MaxCallFrameSize, StackAlign);
+
+ // Update maximum call frame size.
+ MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+ // Include call frame size in total.
+ if (!(hasReservedCallFrame(MF) && MFI->adjustsStack()))
+ FrameSize += MaxCallFrameSize;
+
+ // Make sure the frame is aligned.
+ FrameSize = alignTo(FrameSize, StackAlign);
+
+ // Update frame info.
+ MFI->setStackSize(FrameSize);
+}
+
+// Iterates through each basic block in a machine function and replaces
+// ADJDYNALLOC pseudo instructions with a Lanai::ADD_I_LO that uses the
+// maximum call frame size as the immediate.
+void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const {
+ const LanaiInstrInfo &LII =
+ *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ unsigned MaxCallFrameSize = MF.getFrameInfo()->getMaxCallFrameSize();
+
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E;
+ ++MBB) {
+ MachineBasicBlock::iterator MBBI = MBB->begin();
+ while (MBBI != MBB->end()) {
+ MachineInstr &MI = *MBBI++;
+ if (MI.getOpcode() == Lanai::ADJDYNALLOC) {
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+
+ BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst)
+ .addReg(Src)
+ .addImm(MaxCallFrameSize);
+ MI.eraseFromParent();
+ }
+ }
+ }
+}
+
+// Generates the following sequence for function entry:
+// st %fp,-4[*%sp] !push old FP
+// add %sp,8,%fp !generate new FP
+// sub %sp,0x4,%sp !allocate stack space (as needed)
+void LanaiFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const LanaiInstrInfo &LII =
+ *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Determine the correct frame layout
+ determineFrameLayout(MF);
+
+ // FIXME: This appears to be overallocating. Needs investigation.
+ // Get the number of bytes to allocate from the FrameInfo.
+ unsigned StackSize = MFI->getStackSize();
+
+ // Push old FP
+ // st %fp,-4[*%sp]
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::SW_RI))
+ .addReg(Lanai::FP)
+ .addReg(Lanai::SP)
+ .addImm(-4)
+ .addImm(LPAC::makePreOp(LPAC::ADD))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Generate new FP
+ // add %sp,8,%fp
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::FP)
+ .addReg(Lanai::SP)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Allocate space on the stack if needed
+ // sub %sp,StackSize,%sp
+ if (StackSize != 0) {
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::SUB_I_LO), Lanai::SP)
+ .addReg(Lanai::SP)
+ .addImm(StackSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Replace ADJDYNALLOC
+ if (MFI->hasVarSizedObjects())
+ replaceAdjDynAllocPseudo(MF);
+}
+
+MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction & /*MF*/, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ return MBB.erase(I);
+}
+
+// The function epilogue should not depend on the current stack pointer!
+// It should use the frame pointer only. This is mandatory because
+// of alloca; we also take advantage of it to omit stack adjustments
+// before returning.
+//
+// Note that when we go to restore the preserved register values we must
+// not try to address their slots by using offsets from the stack pointer.
+// That's because the stack pointer may have been moved during the function
+// execution due to a call to alloca(). Rather, we must restore all
+// preserved registers via offsets from the frame pointer value.
+//
+// Note also that when the current frame is being "popped" (by adjusting
+// the value of the stack pointer) on function exit, we must (for the
+// sake of alloca) set the new value of the stack pointer based upon
+// the current value of the frame pointer. We can't just add what we
+// believe to be the (static) frame size to the stack pointer because
+// if we did that, and alloca() had been called during this function,
+// we would end up returning *without* having fully deallocated all of
+// the space grabbed by alloca. If that happened, and a function
+// containing one or more alloca() calls was called over and over again,
+// then the stack would grow without limit!
+//
+// RET is lowered to
+// ld -4[%fp],%pc # modify %pc (two delay slots)
+// as the return address is in the stack frame and mov to pc is allowed.
+// emitEpilogue emits
+// mov %fp,%sp # restore the stack pointer
+// ld -8[%fp],%fp # restore the caller's frame pointer
+// before RET and the delay slot filler will move RET such that these
+// instructions execute in the delay slots of the load to PC.
+void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ const LanaiInstrInfo &LII =
+ *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ // Restore the stack pointer using the callee's frame pointer value.
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::SP)
+ .addReg(Lanai::FP)
+ .addImm(0);
+
+ // Restore the frame pointer from the stack.
+ BuildMI(MBB, MBBI, DL, LII.get(Lanai::LDW_RI), Lanai::FP)
+ .addReg(Lanai::FP)
+ .addImm(-8)
+ .addImm(LPAC::ADD);
+}
+
+void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const LanaiRegisterInfo *LRI =
+ static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo());
+ int Offset = -4;
+
+ // Reserve 4 bytes for the saved RCA
+ MFI->CreateFixedObject(4, Offset, true);
+ Offset -= 4;
+
+ // Reserve 4 bytes for the saved FP
+ MFI->CreateFixedObject(4, Offset, true);
+ Offset -= 4;
+
+ if (LRI->hasBasePointer(MF)) {
+ MFI->CreateFixedObject(4, Offset, true);
+ SavedRegs.reset(LRI->getBaseRegister());
+ }
+}
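Putting the prologue, the epilogue comment, and determineCalleeSaves() together gives the layout the fixed objects encode: the caller's "st %rca,[--%sp]" leaves the return address at the entry %sp, the prologue pushes the old %fp one word below it, and the new %fp points one word above the return address. A small sketch of that arithmetic (not part of the patch; the entry address is hypothetical):

#include <cassert>

int main() {
  unsigned S = 0x1000;     // %sp at function entry; the caller's %rca was pushed here
  unsigned SP = S - 4;     // "st %fp,-4[*%sp]": pre-decrement, old %fp saved at S - 4
  unsigned FP = SP + 8;    // "add %sp,8,%fp": the new frame pointer
  assert(FP - 4 == S);     // fixed object at offset -4: the saved %rca (reloaded by RET)
  assert(FP - 8 == S - 4); // fixed object at offset -8: the saved old %fp
  // The epilogue restores %sp from %fp and reloads %fp from -8[%fp]; RET then
  // reloads %pc from -4[%fp], as described in the comment block above.
  return 0;
}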
diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h
new file mode 100644
index 000000000000..2f9b6c3c158f
--- /dev/null
+++ b/lib/Target/Lanai/LanaiFrameLowering.h
@@ -0,0 +1,57 @@
+//===-- LanaiFrameLowering.h - Define frame lowering for Lanai --*- C++-*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements Lanai-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+
+#include "Lanai.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class BitVector;
+class LanaiSubtarget;
+
+class LanaiFrameLowering : public TargetFrameLowering {
+private:
+ void determineFrameLayout(MachineFunction &MF) const;
+ void replaceAdjDynAllocPseudo(MachineFunction &MF) const;
+
+protected:
+ const LanaiSubtarget &STI;
+
+public:
+ explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget)
+ : TargetFrameLowering(StackGrowsDown,
+ /*StackAlignment=*/8,
+ /*LocalAreaOffset=*/0),
+ STI(Subtarget) {}
+
+ // emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ // the function.
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+
+ bool hasFP(const MachineFunction & /*MF*/) const override { return true; }
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
diff --git a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
new file mode 100644
index 000000000000..29bc6e8a6c56
--- /dev/null
+++ b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -0,0 +1,317 @@
+//===-- LanaiISelDAGToDAG.cpp - A dag to dag inst selector for Lanai ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Lanai target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiRegisterInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lanai-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LanaiDAGToDAGISel - Lanai specific code to select Lanai machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class LanaiDAGToDAGISel : public SelectionDAGISel {
+public:
+ explicit LanaiDAGToDAGISel(LanaiTargetMachine &TargetMachine)
+ : SelectionDAGISel(TargetMachine) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ // Pass Name
+ const char *getPassName() const override {
+ return "Lanai DAG->DAG Pattern Instruction Selection";
+ }
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "LanaiGenDAGISel.inc"
+
+ // Instruction Selection not handled by the auto-generated tablegen
+ void Select(SDNode *N) override;
+
+ // Support functions for the opcodes of Instruction Selection
+ // not handled by the auto-generated tablegen
+ void selectFrameIndex(SDNode *N);
+
+ // Complex Pattern for address selection.
+ bool selectAddrRi(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp);
+ bool selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2, SDValue &AluOp);
+ bool selectAddrSls(SDValue Addr, SDValue &Offset);
+ bool selectAddrSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp);
+
+ // getI32Imm - Return a target constant with the specified value, of type i32.
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+ }
+
+private:
+ bool selectAddrRiSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &AluOp, bool RiMode);
+};
+
+bool canBeRepresentedAsSls(const ConstantSDNode &CN) {
+ // Fits in 21-bit signed immediate and two low-order bits are zero.
+ return isInt<21>(CN.getSExtValue()) && ((CN.getSExtValue() & 0x3) == 0);
+}
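+
+// As a quick illustration of the predicate above: 0xFFFFC (1048572) qualifies
+// (it fits in a signed 21-bit immediate and its two low bits are zero), while
+// 0xFFFFE fails the alignment check and 0x100000 no longer fits in 21 bits.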
+
+} // namespace
+
+// Helper functions for ComplexPattern used on LanaiInstrInfo
+// Used on Lanai Load/Store instructions.
+bool LanaiDAGToDAGISel::selectAddrSls(SDValue Addr, SDValue &Offset) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+ SDLoc DL(Addr);
+ // Loading from a constant address.
+ if (canBeRepresentedAsSls(*CN)) {
+ int32_t Imm = CN->getSExtValue();
+ Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+ return true;
+ }
+ }
+ if (Addr.getOpcode() == ISD::OR &&
+ Addr.getOperand(1).getOpcode() == LanaiISD::SMALL) {
+ Offset = Addr.getOperand(1).getOperand(0);
+ return true;
+ }
+ return false;
+}
+
+bool LanaiDAGToDAGISel::selectAddrRiSpls(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp,
+ bool RiMode) {
+ SDLoc DL(Addr);
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+ if (RiMode) {
+ // Fits in 16-bit signed immediate.
+ if (isInt<16>(CN->getSExtValue())) {
+ int16_t Imm = CN->getSExtValue();
+ Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+ Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+ // Allow SLS to match if the constant doesn't fit in 16 bits but can be
+ // represented as an SLS.
+ if (canBeRepresentedAsSls(*CN))
+ return false;
+ } else {
+ // Fits in 10-bit signed immediate.
+ if (isInt<10>(CN->getSExtValue())) {
+ int16_t Imm = CN->getSExtValue();
+ Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+ Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ // If Address is a FrameIndex, get the TargetFrameIndex.
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+ }
+
+ // Skip direct calls
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+
+ // Address of the form imm + reg
+ ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+ if (AluOperator == ISD::ADD) {
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ // Addresses of the form FI+const
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ if ((RiMode && isInt<16>(CN->getSExtValue())) ||
+ (!RiMode && isInt<10>(CN->getSExtValue()))) {
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+ } else {
+ Base = Addr.getOperand(0);
+ }
+
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i32);
+ return true;
+ }
+ }
+
+ // Let SLS match SMALL instead of RI.
+ if (AluOperator == ISD::OR && RiMode &&
+ Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+ return false;
+
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+ return true;
+}
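+
+// Worked example for the routine above: an address of the form
+//   (add FrameIndex:i32<1>, Constant:i32<12>)
+// in RI mode takes the ISD::ADD path (12 fits in a signed 16-bit immediate),
+// yielding Base = TargetFrameIndex<1>, Offset = 12 and AluOp = LPAC::ADD.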
+
+bool LanaiDAGToDAGISel::selectAddrRi(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp) {
+ return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/true);
+}
+
+bool LanaiDAGToDAGISel::selectAddrSpls(SDValue Addr, SDValue &Base,
+ SDValue &Offset, SDValue &AluOp) {
+ return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/false);
+}
+
+bool LanaiDAGToDAGISel::selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2,
+ SDValue &AluOp) {
+ // If Address is a FrameIndex, get the TargetFrameIndex.
+ if (Addr.getOpcode() == ISD::FrameIndex)
+ return false;
+
+ // Skip direct calls
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+
+ // Address of the form OP + OP
+ ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+ LPAC::AluCode AluCode = LPAC::isdToLanaiAluCode(AluOperator);
+ if (AluCode != LPAC::UNKNOWN) {
+ // Skip addresses of the form FI OP const
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ if (isInt<16>(CN->getSExtValue()))
+ return false;
+
+ // Skip addresses with hi/lo operands
+ if (Addr.getOperand(0).getOpcode() == LanaiISD::HI ||
+ Addr.getOperand(0).getOpcode() == LanaiISD::LO ||
+ Addr.getOperand(0).getOpcode() == LanaiISD::SMALL ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::HI ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::LO ||
+ Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+ return false;
+
+ // Addresses of the form register OP register
+ R1 = Addr.getOperand(0);
+ R2 = Addr.getOperand(1);
+ AluOp = CurDAG->getTargetConstant(AluCode, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+
+ // Skip addresses with zero offset
+ return false;
+}
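+
+// For the register-register form above: an address like (add x, y), with both
+// operands already in registers, is matched as R1 = x, R2 = y and
+// AluOp = LPAC::ADD, provided neither operand is a hi/lo/small fragment and
+// the right operand is not a 16-bit immediate (the RI form handles those).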
+
+bool LanaiDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, AluOp;
+ switch (ConstraintCode) {
+ default:
+ return true;
+ case InlineAsm::Constraint_m: // memory
+ if (!selectAddrRr(Op, Op0, Op1, AluOp) &&
+ !selectAddrRi(Op, Op0, Op1, AluOp))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(AluOp);
+ return false;
+}
+
+// Select instructions that have not been custom-lowered. Used for expanded,
+// promoted and normal instructions.
+void LanaiDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ return;
+ }
+
+ // Instruction Selection not handled by the auto-generated
+ // tablegen selection should be handled here.
+ switch (Opcode) {
+ case ISD::FrameIndex:
+ selectFrameIndex(Node);
+ return;
+ default:
+ break;
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+void LanaiDAGToDAGISel::selectFrameIndex(SDNode *Node) {
+ SDLoc DL(Node);
+ SDValue Imm = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ EVT VT = Node->getValueType(0);
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+ unsigned Opc = Lanai::ADD_I_LO;
+ if (Node->hasOneUse()) {
+ CurDAG->SelectNodeTo(Node, Opc, VT, TFI, Imm);
+ return;
+ }
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VT, TFI, Imm));
+}
+
+// createLanaiISelDag - This pass converts a legalized DAG into a
+// Lanai-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createLanaiISelDag(LanaiTargetMachine &TM) {
+ return new LanaiDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
new file mode 100644
index 000000000000..66416b38e812
--- /dev/null
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -0,0 +1,1437 @@
+//===-- LanaiISelLowering.cpp - Lanai DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiISelLowering.h"
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "LanaiTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "lanai-lower"
+
+using namespace llvm;
+
+// Limit on number of instructions the lowered multiplication may have before a
+// call to the library function should be generated instead. The threshold is
+// currently set to 14 as this was the smallest threshold that resulted in all
+// constant multiplications being lowered. A threshold of 5 covered all cases
+// except for one multiplication which required 14. mulsi3 requires 16
+// instructions (including the prologue and epilogue but excluding instructions
+// at call site). Until we can inline mulsi3, generating at most 14 instructions
+// will be faster than invoking mulsi3.
+static cl::opt<int> LanaiLowerConstantMulThreshold(
+ "lanai-constant-mul-threshold", cl::Hidden,
+ cl::desc("Maximum number of instruction to generate when lowering constant "
+ "multiplication instead of calling library function [default=14]"),
+ cl::init(14));
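+
+// The threshold can be tuned on the llc command line, e.g. (one possible
+// invocation, the file name is only a placeholder):
+//   llc -march=lanai -lanai-constant-mul-threshold=5 constant-mul.ll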
+
+LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
+ const LanaiSubtarget &STI)
+ : TargetLowering(TM) {
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &Lanai::GPRRegClass);
+
+ // Compute derived properties from the register classes
+ TRI = STI.getRegisterInfo();
+ computeRegisterProperties(TRI);
+
+ setStackPointerRegisterToSaveRestore(Lanai::SP);
+
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+ setOperationAction(ISD::MUL, MVT::i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+ setOperationAction(ISD::CTTZ, MVT::i32, Legal);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+
+ // Extended load operations for i1 types must be promoted
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ }
+
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
+
+ // Function alignments (log2)
+ setMinFunctionAlignment(2);
+ setPrefFunctionAlignment(2);
+
+ setJumpIsExpensive(true);
+
+ // TODO: Setting the minimum jump table entries needed before a
+ // switch is transformed to a jump table to 100 to avoid creating jump tables
+ // as this was causing bad performance compared to a large group of if
+ // statements. Re-evaluate this on new benchmarks.
+ setMinimumJumpTableEntries(100);
+
+ // Use fast calling convention for library functions.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+ setLibcallCallingConv(static_cast<RTLIB::Libcall>(I), CallingConv::Fast);
+ }
+
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 8;
+ MaxStoresPerMemmove = 16; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 8;
+
+ // Booleans always contain 0 or 1.
+ setBooleanContents(ZeroOrOneBooleanContent);
+}
+
+SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::MUL:
+ return LowerMUL(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::SETCCE:
+ return LowerSETCCE(Op, DAG);
+ case ISD::SRL_PARTS:
+ return LowerSRL_PARTS(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ default:
+ llvm_unreachable("unimplemented operand");
+ }
+}
+//===----------------------------------------------------------------------===//
+// Lanai Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/,
+ SelectionDAG & /*DAG*/) const {
+ // Only unallocatable registers should be matched here.
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("pc", Lanai::PC)
+ .Case("sp", Lanai::SP)
+ .Case("fp", Lanai::FP)
+ .Case("rr1", Lanai::RR1)
+ .Case("r10", Lanai::R10)
+ .Case("rr2", Lanai::RR2)
+ .Case("r11", Lanai::R11)
+ .Case("rca", Lanai::RCA)
+ .Default(0);
+
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
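+
+// This hook backs the @llvm.read_register / @llvm.write_register intrinsics.
+// For example (illustrative IR, not part of this patch):
+//   %sp.val = call i32 @llvm.read_register.i32(metadata !0)  ; !0 = !{!"sp"}
+// resolves "sp" to Lanai::SP via the name lookup above.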
+
+std::pair<unsigned, const TargetRegisterClass *>
+LanaiTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1)
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ case 'r': // GENERAL_REGS
+ return std::make_pair(0U, &Lanai::GPRRegClass);
+ default:
+ break;
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+// Examine constraint type and operand type and determine a weight value.
+// This object must already have been set up with the operand type
+// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+LanaiTargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &Info, const char *Constraint) const {
+ ConstraintWeight Weight = CW_Invalid;
+ Value *CallOperandVal = Info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == NULL)
+ if (CallOperandVal == nullptr)
+ // Look at the constraint type.
+ switch (*Constraint) {
+ case 'I': // signed 16 bit immediate
+ case 'J': // integer zero
+ case 'K': // unsigned 16 bit immediate
+ case 'L': // immediate in the range 0 to 31
+ case 'M': // signed 32 bit immediate where lower 16 bits are 0
+ case 'N': // signed 26 bit immediate
+ case 'O': // integer zero
+ if (isa<ConstantInt>(CallOperandVal))
+ Weight = CW_Constant;
+ break;
+ default:
+ Weight = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
+ break;
+ }
+ return Weight;
+}
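+
+// For instance, in an (illustrative) C snippet such as
+//   int Out;
+//   asm volatile("" : "=r"(Out) : "I"(1000), "K"(0xFFFFu));
+// both immediate operands are ConstantInts, so the 'I' (signed 16 bit) and
+// 'K' (unsigned 16 bit) constraints are weighted as CW_Constant above.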
+
+// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+// vector. If it is invalid, don't add anything to Ops.
+void LanaiTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(nullptr, 0);
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1)
+ return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ case 'I': // Signed 16 bit constant
+ // If this fails, the parent routine will give an error
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<16>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J': // integer zero
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0) {
+ Result = DAG.getTargetConstant(0, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K': // unsigned 16 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isUInt<16>(C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'L': // immediate in the range 0 to 31
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(C),
+ Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M': // signed 32 bit immediate where lower 16 bits are 0
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ int64_t Val = C->getSExtValue();
+ if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)) {
+ Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N': // signed 26 bit immediate
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ int64_t Val = C->getSExtValue();
+ if ((Val >= -33554432) && (Val <= 33554431)) {
+ Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ default:
+ break; // This will fall through to the generic implementation
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "LanaiGenCallingConv.inc"
+
+static unsigned NumFixedArgs;
+static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ // Handle fixed arguments with default CC.
+ // Note: Both the default and fast CC handle VarArg the same and hence the
+ // calling convention of the function is not considered here.
+ if (ValNo < NumFixedArgs) {
+ return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+ }
+
+ // Promote i8/i16 args to i32
+ if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+ LocVT = MVT::i32;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ // VarArgs get passed on stack
+ unsigned Offset = State.AllocateStack(4, 4);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+}
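+
+// For example, for a variadic call like printf(fmt, a, b) the callee type
+// gives NumFixedArgs == 1, so fmt goes through the normal CC_Lanai32 rules
+// while a and b each receive a 4-byte, 4-aligned stack slot via AllocateStack.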
+
+SDValue LanaiTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ switch (CallConv) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return LowerCCCArguments(Chain, CallConv, IsVarArg, Ins, DL, DAG, InVals);
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ }
+}
+
+SDValue LanaiTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+
+ // Lanai target does not yet support tail call optimization.
+ IsTailCall = false;
+
+ switch (CallConv) {
+ case CallingConv::Fast:
+ case CallingConv::C:
+ return LowerCCCCallTo(Chain, Callee, CallConv, IsVarArg, IsTailCall, Outs,
+ OutVals, Ins, DL, DAG, InVals);
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ }
+}
+
+// LowerCCCArguments - transform physical registers into virtual registers and
+// generate load operations for arguments placed on the stack.
+SDValue LanaiTargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ if (CallConv == CallingConv::Fast) {
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32_Fast);
+ } else {
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32);
+ }
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
+ case MVT::i32: {
+ unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+ // If this is an 8/16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+ InVals.push_back(ArgValue);
+ break;
+ }
+ default:
+ DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getEVTString() << "\n");
+ llvm_unreachable("unhandled argument type");
+ }
+ } else {
+ // Sanity check
+ assert(VA.isMemLoc());
+ // Load the argument to a virtual register
+ unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+ // Check that the argument fits in stack slot
+ if (ObjSize > 4) {
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << EVT(VA.getLocVT()).getEVTString() << "\n";
+ }
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ InVals.push_back(DAG.getLoad(
+ VA.getLocVT(), DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+ }
+ }
+
+ // The Lanai ABI for returning structs by value requires that we copy
+ // the sret argument into rv for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (MF.getFunction()->hasStructRetAttr()) {
+ unsigned Reg = LanaiMFI->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+ LanaiMFI->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+ }
+
+ if (IsVarArg) {
+ // Record the frame index of the first variable argument
+ // which is a value necessary to VASTART.
+ int FI = MFI->CreateFixedObject(4, CCInfo.getNextStackOffset(), true);
+ LanaiMFI->setVarArgsFrameIndex(FI);
+ }
+
+ return Chain;
+}
+
+SDValue
+LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // CCValAssign - represent the assignment of the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Lanai32);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // The Lanai ABI for returning structs by value requires that we copy
+ // the sret argument into rv for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into rv.
+ if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+ unsigned Reg = LanaiMFI->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments().");
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
+
+ Chain = DAG.getCopyToReg(Chain, DL, Lanai::RV, Val, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(
+ DAG.getRegister(Lanai::RV, getPointerTy(DAG.getDataLayout())));
+ }
+
+ RetOps[0] = Chain; // Update chain
+
+ unsigned Opc = LanaiISD::RET_FLAG;
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ // Emit the return node.
+ return DAG.getNode(Opc, DL, MVT::Other,
+ ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
+}
+
+// LowerCCCCallTo - function arguments are copied from virtual regs to
+// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue LanaiTargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg,
+ bool /*IsTailCall*/, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+ NumFixedArgs = 0;
+ if (IsVarArg && G) {
+ const Function *CalleeFn = dyn_cast<Function>(G->getGlobal());
+ if (CalleeFn)
+ NumFixedArgs = CalleeFn->getFunctionType()->getNumParams();
+ }
+ if (NumFixedArgs)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_VarArg);
+ else {
+ if (CallConv == CallingConv::Fast)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_Fast);
+ else
+ CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32);
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Create local copies for byval args.
+ SmallVector<SDValue, 8> ByValArgs;
+ for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
+ ISD::ArgFlagsTy Flags = Outs[I].Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ SDValue Arg = OutVals[I];
+ unsigned Size = Flags.getByValSize();
+ unsigned Align = Flags.getByValAlign();
+
+ int FI = MFI->CreateStackObject(Size, Align, false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue SizeNode = DAG.getConstant(Size, DL, MVT::i32);
+
+ Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
+ /*IsVolatile=*/false,
+ /*AlwaysInline=*/false,
+ /*isTailCall=*/false, MachinePointerInfo(),
+ MachinePointerInfo());
+ ByValArgs.push_back(FIPtr);
+ }
+
+ Chain = DAG.getCALLSEQ_START(
+ Chain,
+ DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+ DL);
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned I = 0, J = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ SDValue Arg = OutVals[I];
+ ISD::ArgFlagsTy Flags = Outs[I].Flags;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ // Use local copy if it is a byval arg.
+ if (Flags.isByVal())
+ Arg = ByValArgs[J++];
+
+ // Arguments that can be passed in a register must be kept in the
+ // RegsToPass vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP,
+ getPointerTy(DAG.getDataLayout()));
+
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+ }
+ }
+
+ // Transform all store nodes into one single node because all store nodes are
+ // independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ ArrayRef<SDValue>(&MemOpChains[0], MemOpChains.size()));
+
+ SDValue InFlag;
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ uint8_t OpFlag = LanaiII::MO_NO_FLAG;
+ if (G) {
+ Callee = DAG.getTargetGlobalAddress(
+ G->getGlobal(), DL, getPointerTy(DAG.getDataLayout()), 0, OpFlag);
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(
+ E->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlag);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add a register mask operand representing the call-preserved registers.
+ // TODO: Should return-twice functions be handled?
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+ Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+ RegsToPass[I].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(LanaiISD::CALL, DL, NodeTys,
+ ArrayRef<SDValue>(&Ops[0], Ops.size()));
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(
+ Chain,
+ DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+ DAG.getConstant(0, DL, getPointerTy(DAG.getDataLayout()), true), InFlag,
+ DL);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals);
+}
+
+// LowerCallResult - Lower the result values of a call into the
+// appropriate copies out of appropriate physical registers.
+SDValue LanaiTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Lanai32);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned I = 0; I != RVLocs.size(); ++I) {
+ Chain = DAG.getCopyFromReg(Chain, DL, RVLocs[I].getLocReg(),
+ RVLocs[I].getValVT(), InFlag)
+ .getValue(1);
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Chain.getValue(0));
+ }
+
+ return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Lowerings
+//===----------------------------------------------------------------------===//
+
+static LPCC::CondCode IntCondCCodeToICC(SDValue CC, const SDLoc &DL,
+ SDValue &RHS, SelectionDAG &DAG) {
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+
+ // For integer, only the SETEQ, SETNE, SETLT, SETLE, SETGT, SETGE, SETULT,
+ // SETULE, SETUGT, and SETUGE opcodes are used (see CodeGen/ISDOpcodes.h)
+ // and Lanai only supports integer comparisons, so only provide definitions
+ // for them.
+ switch (SetCCOpcode) {
+ case ISD::SETEQ:
+ return LPCC::ICC_EQ;
+ case ISD::SETGT:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+ // X > -1 -> X >= 0 -> is_plus(X)
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return LPCC::ICC_PL;
+ }
+ return LPCC::ICC_GT;
+ case ISD::SETUGT:
+ return LPCC::ICC_UGT;
+ case ISD::SETLT:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0)
+ // X < 0 -> is_minus(X)
+ return LPCC::ICC_MI;
+ return LPCC::ICC_LT;
+ case ISD::SETULT:
+ return LPCC::ICC_ULT;
+ case ISD::SETLE:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+ // X <= -1 -> X < 0 -> is_minus(X)
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ return LPCC::ICC_MI;
+ }
+ return LPCC::ICC_LE;
+ case ISD::SETULE:
+ return LPCC::ICC_ULE;
+ case ISD::SETGE:
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+ if (RHSC->getZExtValue() == 0)
+ // X >= 0 -> is_plus(X)
+ return LPCC::ICC_PL;
+ return LPCC::ICC_GE;
+ case ISD::SETUGE:
+ return LPCC::ICC_UGE;
+ case ISD::SETNE:
+ return LPCC::ICC_NE;
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETOGE:
+ case ISD::SETOLE:
+ case ISD::SETOLT:
+ case ISD::SETOGT:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETO:
+ case ISD::SETUO:
+ llvm_unreachable("Unsupported comparison.");
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ }
+}
+
+SDValue LanaiTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc DL(Op);
+
+ LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+ SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue Flag =
+ DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+ return DAG.getNode(LanaiISD::BR_CC, DL, Op.getValueType(), Chain, Dest,
+ TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ int64_t MulAmt = C->getSExtValue();
+ int32_t HighestOne = -1;
+ uint32_t NonzeroEntries = 0;
+ int SignedDigit[32] = {0};
+
+ // Convert to non-adjacent form (NAF) signed-digit representation.
+ // NAF is a signed-digit form where no adjacent digits are non-zero. It is the
+ // minimal Hamming weight representation of a number (on average 1/3 of the
+ // digits will be non-zero vs 1/2 for regular binary representation). And as
+ // the non-zero digits will be the only digits contributing to the instruction
+ // count, this is desirable. The next loop converts it to NAF (following the
+ // approach in 'Guide to Elliptic Curve Cryptography' [ISBN: 038795273X]) by
+ // choosing the non-zero coefficients such that the resulting quotient is
+ // divisible by 2 which will cause the next coefficient to be zero.
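+ // For example, MulAmt = 30 (0b11110) converts to the NAF digits +1 at bit 5
+ // and -1 at bit 1, i.e. 30 * V = (V << 5) - (V << 1), which the code below
+ // assembles with two shifts and one subtract (3 instructions in total).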
+ int64_t E = std::abs(MulAmt);
+ int S = (MulAmt < 0 ? -1 : 1);
+ int I = 0;
+ while (E > 0) {
+ int ZI = 0;
+ if (E % 2 == 1) {
+ ZI = 2 - (E % 4);
+ if (ZI != 0)
+ ++NonzeroEntries;
+ }
+ SignedDigit[I] = S * ZI;
+ if (SignedDigit[I] == 1)
+ HighestOne = I;
+ E = (E - ZI) / 2;
+ ++I;
+ }
+
+ // Compute number of instructions required. Due to differences in lowering
+ // between the different processors this count is not exact.
+ // Start by assuming a shift and an add/sub for every non-zero entry (hence
+ // every non-zero entry requires 1 shift and 1 add/sub except for the first
+ // entry).
+ int32_t InstrRequired = 2 * NonzeroEntries - 1;
+ // Correct possible over-adding due to shift by 0 (which is not emitted).
+ if (std::abs(MulAmt) % 2 == 1)
+ --InstrRequired;
+ // Return if the form generated would exceed the instruction threshold.
+ if (InstrRequired > LanaiLowerConstantMulThreshold)
+ return SDValue();
+
+ SDValue Res;
+ SDLoc DL(Op);
+ SDValue V = Op->getOperand(0);
+
+ // Initialize the running sum. Set the running sum to the maximal shifted
+ // positive value (i.e., largest i such that zi == 1 and MulAmt has V<<i as a
+ // term in its NAF).
+ if (HighestOne == -1)
+ Res = DAG.getConstant(0, DL, MVT::i32);
+ else {
+ Res = DAG.getNode(ISD::SHL, DL, VT, V,
+ DAG.getConstant(HighestOne, DL, MVT::i32));
+ SignedDigit[HighestOne] = 0;
+ }
+
+ // Assemble multiplication from shift, add, sub using NAF form and running
+ // sum.
+ for (unsigned int I = 0; I < sizeof(SignedDigit) / sizeof(SignedDigit[0]);
+ ++I) {
+ if (SignedDigit[I] == 0)
+ continue;
+
+ // Shifted multiplicand (v<<i).
+ SDValue Op =
+ DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(I, DL, MVT::i32));
+ if (SignedDigit[I] == 1)
+ Res = DAG.getNode(ISD::ADD, DL, VT, Res, Op);
+ else if (SignedDigit[I] == -1)
+ Res = DAG.getNode(ISD::SUB, DL, VT, Res, Op);
+ }
+ return Res;
+}
+
+SDValue LanaiTargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+ SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue Flag = DAG.getNode(LanaiISD::SUBBF, DL, MVT::Glue, LHS, RHS, Carry);
+ return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Cond = Op.getOperand(2);
+ SDLoc DL(Op);
+
+ LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+ SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue Flag =
+ DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+ return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TrueV = Op.getOperand(2);
+ SDValue FalseV = Op.getOperand(3);
+ SDValue Cond = Op.getOperand(4);
+ SDLoc DL(Op);
+
+ LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+ SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+ SDValue Flag =
+ DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ return DAG.getNode(LanaiISD::SELECT_CC, DL, VTs, TrueV, FalseV, TargetCC,
+ Flag);
+}
+
+SDValue LanaiTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ LanaiMachineFunctionInfo *FuncInfo = MF.getInfo<LanaiMachineFunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+SDValue LanaiTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+ // Get a reference to the stack pointer.
+ SDValue StackPointer = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i32);
+
+ // Subtract the dynamic size from the actual stack size to
+ // obtain the new stack size.
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i32, StackPointer, Size);
+
+ // For Lanai, the outgoing memory arguments area should be on top of the
+ // alloca area on the stack i.e., the outgoing memory arguments should be
+ // at a lower address than the alloca area. Move the alloca area down the
+ // stack by adding back the space reserved for outgoing arguments to SP
+ // here.
+ //
+ // We do not know what the size of the outgoing args is at this point.
+ // So, we add a pseudo instruction ADJDYNALLOC that will adjust the
+ // stack pointer. We replace this instruction with one that has the correct,
+ // known offset in emitPrologue().
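+ //
+ // Rough sketch of the intended layout (stack grows down, SP at the bottom):
+ //   | locals / spill slots        |
+ //   | dynamic alloca area         |
+ //   | outgoing call arguments     | <- SP
+ // ADJDYNALLOC stands in for the outgoing-args size until emitPrologue().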
+ SDValue ArgAdjust = DAG.getNode(LanaiISD::ADJDYNALLOC, DL, MVT::i32, Sub);
+
+ // The Sub result contains the new stack start address, so it
+ // must be placed in the stack pointer register.
+ SDValue CopyChain = DAG.getCopyToReg(Chain, DL, SPReg, Sub);
+
+ SDValue Ops[2] = {ArgAdjust, CopyChain};
+ return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ const unsigned Offset = -4;
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset, DL));
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+ }
+
+ // Return the link register, which contains the return address.
+ // Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+SDValue LanaiTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Lanai::FP, VT);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ const unsigned Offset = -8;
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset, DL));
+ FrameAddr =
+ DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+ }
+ return FrameAddr;
+}
+
+const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ case LanaiISD::ADJDYNALLOC:
+ return "LanaiISD::ADJDYNALLOC";
+ case LanaiISD::RET_FLAG:
+ return "LanaiISD::RET_FLAG";
+ case LanaiISD::CALL:
+ return "LanaiISD::CALL";
+ case LanaiISD::SELECT_CC:
+ return "LanaiISD::SELECT_CC";
+ case LanaiISD::SETCC:
+ return "LanaiISD::SETCC";
+ case LanaiISD::SUBBF:
+ return "LanaiISD::SUBBF";
+ case LanaiISD::SET_FLAG:
+ return "LanaiISD::SET_FLAG";
+ case LanaiISD::BR_CC:
+ return "LanaiISD::BR_CC";
+ case LanaiISD::Wrapper:
+ return "LanaiISD::Wrapper";
+ case LanaiISD::HI:
+ return "LanaiISD::HI";
+ case LanaiISD::LO:
+ return "LanaiISD::LO";
+ case LanaiISD::SMALL:
+ return "LanaiISD::SMALL";
+ default:
+ return nullptr;
+ }
+}
+
+SDValue LanaiTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ const Constant *C = N->getConstVal();
+ const LanaiTargetObjectFile *TLOF =
+ static_cast<const LanaiTargetObjectFile *>(
+ getTargetMachine().getObjFileLowering());
+
+ // If the code model is small, or the constant will be placed in the small
+ // section, then assume the address will fit in 21 bits.
+ if (getTargetMachine().getCodeModel() == CodeModel::Small ||
+ TLOF->isConstantInSmallSection(DAG.getDataLayout(), C)) {
+ SDValue Small = DAG.getTargetConstantPool(
+ C, MVT::i32, N->getAlignment(), N->getOffset(), LanaiII::MO_NO_FLAG);
+ return DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getRegister(Lanai::R0, MVT::i32),
+ DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+ } else {
+ uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+ uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+ SDValue Hi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+ N->getOffset(), OpFlagHi);
+ SDValue Lo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+ N->getOffset(), OpFlagLo);
+ Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+ Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+ return Result;
+ }
+}
+
+SDValue LanaiTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+ const LanaiTargetObjectFile *TLOF =
+ static_cast<const LanaiTargetObjectFile *>(
+ getTargetMachine().getObjFileLowering());
+
+ // If the code model is small, or the global variable will be placed in the
+ // small section, then assume the address will fit in 21 bits.
+ if (getTargetMachine().getCodeModel() == CodeModel::Small ||
+ TLOF->isGlobalInSmallSection(GV, getTargetMachine())) {
+ SDValue Small = DAG.getTargetGlobalAddress(
+ GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_NO_FLAG);
+ return DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getRegister(Lanai::R0, MVT::i32),
+ DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+ } else {
+ uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+ uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+ // Create the TargetGlobalAddress node, folding in the constant offset.
+ SDValue Hi = DAG.getTargetGlobalAddress(
+ GV, DL, getPointerTy(DAG.getDataLayout()), Offset, OpFlagHi);
+ SDValue Lo = DAG.getTargetGlobalAddress(
+ GV, DL, getPointerTy(DAG.getDataLayout()), Offset, OpFlagLo);
+ Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+ Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+ return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+ }
+}
+
+SDValue LanaiTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+ uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+ uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+ SDValue Hi = DAG.getBlockAddress(BA, MVT::i32, true, OpFlagHi);
+ SDValue Lo = DAG.getBlockAddress(BA, MVT::i32, true, OpFlagLo);
+ Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+ Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+ return Result;
+}
+
+SDValue LanaiTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // If the code model is small, assume the address will fit in 21 bits.
+ if (getTargetMachine().getCodeModel() == CodeModel::Small) {
+ SDValue Small = DAG.getTargetJumpTable(
+ JT->getIndex(), getPointerTy(DAG.getDataLayout()), LanaiII::MO_NO_FLAG);
+ return DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getRegister(Lanai::R0, MVT::i32),
+ DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+ } else {
+ uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+ uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+ SDValue Hi = DAG.getTargetJumpTable(
+ JT->getIndex(), getPointerTy(DAG.getDataLayout()), OpFlagHi);
+ SDValue Lo = DAG.getTargetJumpTable(
+ JT->getIndex(), getPointerTy(DAG.getDataLayout()), OpFlagLo);
+ Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+ Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+ return Result;
+ }
+}
+
+SDValue LanaiTargetLowering::LowerSRL_PARTS(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ // Performs the following for a >> b:
+ // unsigned r_high = a_high >> b;
+ // r_high = (32 - b <= 0) ? 0 : r_high;
+ //
+ // unsigned r_low = a_low >> b;
+ // r_low = (32 - b <= 0) ? r_high : r_low;
+ // r_low = (b == 0) ? r_low : r_low | (a_high << (32 - b));
+ // return (unsigned long long)r_high << 32 | r_low;
+ // Note: This takes advantage of Lanai's shift behavior to avoid needing to
+ // mask the shift amount.
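+ // Worked example (b = 40): 32 - b = -8 <= 0, so r_high is forced to 0 and
+ // r_low starts out as r_high; the final OR then adds in a_high << (32 - b),
+ // which under Lanai's signed shift-amount behavior (assumed here) is
+ // a_high >> 8, giving {hi = 0, lo = a_high >> 8} as expected for a >> 40.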
+
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+ SDValue NegatedPlus32 = DAG.getNode(
+ ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+ SDValue SetCC = DAG.getSetCC(dl, MVT::i32, NegatedPlus32, Zero, ISD::SETLE);
+
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i32, ShOpHi, ShAmt);
+ Hi = DAG.getSelect(dl, MVT::i32, SetCC, Zero, Hi);
+
+ SDValue Lo = DAG.getNode(ISD::SRL, dl, MVT::i32, ShOpLo, ShAmt);
+ Lo = DAG.getSelect(dl, MVT::i32, SetCC, Hi, Lo);
+ SDValue CarryBits =
+ DAG.getNode(ISD::SHL, dl, MVT::i32, ShOpHi, NegatedPlus32);
+ SDValue ShiftIsZero = DAG.getSetCC(dl, MVT::i32, ShAmt, Zero, ISD::SETEQ);
+ Lo = DAG.getSelect(dl, MVT::i32, ShiftIsZero, Lo,
+ DAG.getNode(ISD::OR, dl, MVT::i32, Lo, CarryBits));
+
+ SDValue Ops[2] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+// Helper function that checks if N is a null or all ones constant.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+ return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
+}
+
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+// (select cc 0, y) [AllOnes=0]
+// (select cc y, 0) [AllOnes=0]
+// (zext cc) [AllOnes=0]
+// (sext cc) [AllOnes=0/1]
+// (select cc -1, y) [AllOnes=1]
+// (select cc y, -1) [AllOnes=1]
+//
+// * AllOnes determines whether to check for an all zero (AllOnes false) or an
+// all ones operand (AllOnes true).
+// * Invert is set when N is the all zero/ones constant when CC is false.
+// * OtherOp is set to the alternative value of N.
+//
+// For example, for (select cc X, Y) and AllOnes = 0 if:
+// * X = 0, Invert = False and OtherOp = Y
+// * Y = 0, Invert = True and OtherOp = X
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC,
+ bool &Invert, SDValue &OtherOp,
+ SelectionDAG &DAG) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::SELECT: {
+ CC = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ if (isZeroOrAllOnes(N1, AllOnes)) {
+ Invert = false;
+ OtherOp = N2;
+ return true;
+ }
+ if (isZeroOrAllOnes(N2, AllOnes)) {
+ Invert = true;
+ OtherOp = N1;
+ return true;
+ }
+ return false;
+ }
+ case ISD::ZERO_EXTEND: {
+ // (zext cc) can never be the all ones value.
+ if (AllOnes)
+ return false;
+ CC = N->getOperand(0);
+ if (CC.getValueType() != MVT::i1)
+ return false;
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ OtherOp = DAG.getConstant(1, dl, VT);
+ Invert = true;
+ return true;
+ }
+ case ISD::SIGN_EXTEND: {
+ CC = N->getOperand(0);
+ if (CC.getValueType() != MVT::i1)
+ return false;
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ Invert = !AllOnes;
+ if (AllOnes)
+ // When looking for an AllOnes constant, N is an sext, and the 'other'
+ // value is 0.
+ OtherOp = DAG.getConstant(0, dl, VT);
+ else
+ OtherOp =
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
+ return true;
+ }
+ }
+}
+
+// Combine a constant select operand into its use:
+//
+// (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
+// (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
+// (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
+// (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
+// (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+// (add (zext cc), x) -> (select cc, (add x, 1), x)
+// (add (sext cc), x) -> (select cc, (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
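+// As an illustrative walk-through: for (add (zext cc), x),
+// isConditionalZeroOrAllOnes reports OtherOp = 1 and Invert = true, so the
+// false value becomes (add x, 1); after the swap below the result is
+// (select cc, (add x, 1), x), matching the table above.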
+static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+ TargetLowering::DAGCombinerInfo &DCI,
+ bool AllOnes) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue NonConstantVal;
+ SDValue CCOp;
+ bool SwapSelectOps;
+ if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+ NonConstantVal, DAG))
+ return SDValue();
+
+ // Slct is now known to be the desired identity constant when CC is true.
+ SDValue TrueVal = OtherOp;
+ SDValue FalseVal =
+ DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal);
+ // Unless SwapSelectOps says CC should be false.
+ if (SwapSelectOps)
+ std::swap(TrueVal, FalseVal);
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static SDValue
+combineSelectAndUseCommutative(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ bool AllOnes) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
+ return Result;
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
+ return Result;
+ return SDValue();
+}
+
+// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+static SDValue PerformSUBCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+ if (N1.getNode()->hasOneUse())
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, /*AllOnes=*/false))
+ return Result;
+
+ return SDValue();
+}
+
+SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::OR:
+ case ISD::XOR:
+ return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/false);
+ case ISD::AND:
+ return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/true);
+ case ISD::SUB:
+ return PerformSUBCombine(N, DCI);
+ }
+
+ return SDValue();
+}
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
new file mode 100644
index 000000000000..16ce8edd27c5
--- /dev/null
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -0,0 +1,148 @@
+//===-- LanaiISelLowering.h - Lanai DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Lanai uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+
+#include "Lanai.h"
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace LanaiISD {
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ ADJDYNALLOC,
+
+ // Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ // CALL - These operations represent an abstract call instruction, which
+ // includes a bunch of information.
+ CALL,
+
+ // SELECT_CC - Operand 0 and operand 1 are the selection values, operand 3
+ // is the condition code, and operand 4 is the flag operand.
+ SELECT_CC,
+
+ // SETCC - Store the conditional code to a register.
+ SETCC,
+
+ // SET_FLAG - Set flag compare.
+ SET_FLAG,
+
+ // SUBBF - Subtract with borrow that sets flags.
+ SUBBF,
+
+ // BR_CC - Used to glue together a conditional branch and comparison
+ BR_CC,
+
+ // Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+ // and TargetGlobalAddress.
+ Wrapper,
+
+ // Get the Higher/Lower 16 bits from a 32-bit immediate.
+ HI,
+ LO,
+
+ // Small 21-bit immediate in global memory.
+ SMALL
+};
+} // namespace LanaiISD
+
+class LanaiSubtarget;
+
+class LanaiTargetLowering : public TargetLowering {
+public:
+ LanaiTargetLowering(const TargetMachine &TM, const LanaiSubtarget &STI);
+
+ // LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ // getTargetNodeName - This method returns the name of a target specific
+ // DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getRegisterByName(const char *RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+ const char *Constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+private:
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool IsVarArg,
+ bool IsTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ const LanaiRegisterInfo *TRI;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
diff --git a/lib/Target/Lanai/LanaiInstrFormats.td b/lib/Target/Lanai/LanaiInstrFormats.td
new file mode 100644
index 000000000000..30289ea4ac0b
--- /dev/null
+++ b/lib/Target/Lanai/LanaiInstrFormats.td
@@ -0,0 +1,561 @@
+//===- LanaiInstrFormats.td - Lanai Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstLanai<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+ let Size = 4;
+
+ let Namespace = "Lanai";
+ let DecoderNamespace = "Lanai";
+
+ bits<4> Opcode;
+ let Inst{31 - 28} = Opcode;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+//------------------------------------------------------------------------------
+// Register Immediate (RI)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |0.A.A.A| . . . . | . . . . |F.H| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (16)
+//
+// Action:
+// Rd <- Rs1 op constant
+//
+// Except for shift instructions, `H' determines whether the constant
+// is in the high (1) or low (0) word. The other halfword is 0x0000,
+// except for the `AND' instruction (`AAA' = 100), for which the other
+// halfword is 0xFFFF, and shifts (`AAA' = 111), for which the constant is
+// sign extended.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `AAA' specifies the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or `shift'
+// (111). For the shift, `H' specifies a logical (0) or arithmetic (1)
+// shift. The amount and direction of the shift are determined by the
+// sign extended constant interpreted as a two's complement number. The
+// shift operation is defined only for the range of:
+// 31 ... 0 -1 ... -31
+// \ / \ /
+// left right
+// shift shift
+//
+// If and only if the `F' bit is 1, RI instructions modify the
+// condition bits, `Z' (Zero), `N' (Negative), `V' (oVerflow), and `C'
+// (Carry), according to the result. If the flags are updated, they are
+// updated as follows:
+// `Z'
+// is set if the result is zero and cleared otherwise.
+//
+// `N'
+// is set to the most significant bit of the result.
+//
+// `V'
+// For arithmetic instructions (`add', `addc', `sub', `subb') `V' is
+// set if the sign (most significant) bits of the input operands are
+// the same but different from the sign bit of the result and cleared
+// otherwise. For other RI instructions, `V' is cleared.
+//
+// `C'
+// For arithmetic instructions, `C' is set/cleared if there is/is not
+// a carry generated out of the most significant bit when performing the
+// twos-complement addition (`sub(a,b) == a + ~b + 1', `subb(a,b) ==
+// a + ~b + `C''). For left shifts, `C' is set to the least
+// significant bit discarded by the shift operation. For all other
+// operations, `C' is cleared.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+//
+// The all-0s word is the instruction `R0 <- R0 + 0', which is a no-op.
+class InstRI<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rd;
+ bits<5> Rs1;
+ bit F;
+ bit H;
+ bits<16> imm16;
+
+ let Opcode{3} = 0;
+ let Opcode{2 - 0} = op;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16} = H;
+ let Inst{15 - 0} = imm16;
+}
+
+//------------------------------------------------------------------------------
+// Register Register (RR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.0.0| . . . . | . . . . |F.I| . . . . |B.B.B|J.J.J.J.J|D.D.D|
+// -----------------------------------------------------------------
+// opcode Rd Rs1 Rs2 \ operation /
+//
+// Action:
+// `Rd <- Rs1 op Rs2' iff condition DDDI is true.
+//
+// `DDDI' is as described for the BR instruction.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `BBB' determines the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or "special"
+// (111). The `JJJJJ' field is irrelevant except for special.
+//
+// `JJJJJ' determines which special operation is performed. `10---'
+// is a logical shift, `11---' is an arithmetic shift, and `00000' is
+// the SELECT operation. The amount and direction of the shift are
+// determined by the contents of `Rs2' interpreted as a two's complement
+// number (in the same way as shifts in the Register-Immediate
+// instructions in *Note RI::). For the SELECT operation, Rd gets Rs1 if
+// condition DDDI is true, Rs2 otherwise. All other `JJJJJ' combinations
+// are reserved for instructions that may be defined in the future.
+//
+// If the `F' bit is 1, RR instructions modify the condition bits, `Z'
+// (Zero), `N' (Negative), `V' (oVerflow), and `C' (Carry), according to
+// the result. All RR instructions modify the `Z', `N', and `V' flags.
+// Except for arithmetic instructions (`add', `addc', `sub', `subb'), `V'
+// is cleared. Only arithmetic instructions and shifts modify `C'. Right
+// shifts clear C.
+//
+// DDDI is as described in the table for the BR instruction and only used for
+// the select instruction.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+class InstRR<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> Rs2;
+ bit F;
+ bits<4> DDDI;
+ bits<5> JJJJJ;
+
+ let Opcode = 0b1100;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16} = DDDI{0};
+ let Inst{15 - 11} = Rs2;
+ let Inst{10 - 8} = op;
+ let Inst{7 - 3} = JJJJJ;
+ let Inst{2 - 0} = DDDI{3 - 1};
+}
+
+//------------------------------------------------------------------------------
+// Register Memory (RM)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.0.0.S| . . . . | . . . . |P.Q| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (16)
+//
+// Action:
+// Rd <- Memory(ea) (Load) see below for the
+// Memory(ea) <- Rd (Store) definition of ea.
+//
+// `S' determines whether the instruction is a Load (0) or a Store (1).
+// Loads appear in Rd one cycle after this instruction executes. If the
+// following instruction reads Rd, that instruction will be delayed by 1
+// clock cycle.
+//
+// PQ operation
+// -- ------------------------------------------
+// 00 ea = Rs1
+// 01 ea = Rs1, Rs1 <- Rs1 + constant
+// 10 ea = Rs1 + constant
+// 11 ea = Rs1 + constant, Rs1 <- Rs1 + constant
+//
+// The constant is sign-extended for this instruction.
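+// As an illustrative reading of the table above: PQ = 01 is a post-increment
+// access (ea = Rs1, then Rs1 is updated to Rs1 + constant), while PQ = 11 is
+// the pre-increment form, where the updated Rs1 is also used as ea.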
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRM<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bit P;
+ bit Q;
+ bits<16> imm16;
+ // Dummy variables to allow multiclass definition of RM and RRM
+ bits<2> YL;
+ bit E;
+
+ let Opcode{3 - 1} = 0b100;
+ let Opcode{0} = S;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = P;
+ let Inst{16} = Q;
+ let Inst{15 - 0} = imm16;
+
+ let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Register Register Memory (RRM)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.0.1.S| . . . . | . . . . |P.Q| . . . . |B.B.B|J.J.J.J.J|Y.L.E|
+// -----------------------------------------------------------------
+// opcode Rd Rs1 Rs2 \ operation /
+//
+// Action:
+// Rd <- Memory(ea) (Load) see below for the
+// Memory(ea) <- Rd (Store) definition of ea.
+//
+// The RRM instruction is identical to the RM (*note RM::.) instruction
+// except that:
+//
+// 1. `Rs1 + constant' is replaced with `Rs1 op Rs2', where `op' is
+// determined in the same way as in the RR instruction (*note RR::.)
+// and
+//
+// 2. part-word memory accesses are allowed as specified below.
+//
+// If `BBB' != 111 (i.e.: For all but shift operations):
+// If `YLE' = 01- => fuLl-word memory access
+// If `YLE' = 00- => half-word memory access
+// If `YLE' = 10- => bYte memory access
+// If `YLE' = --1 => loads are zEro extended
+// If `YLE' = --0 => loads are sign extended
+//
+// If `BBB' = 111 (For shift operations):
+// fullword memory accesses are performed.
+//
+// All part-word loads write the least significant part of the
+// destination register with the higher-order bits zero- or sign-extended.
+// All part-word stores store the least significant part-word of the
+// source register in the destination memory location.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRRM<bit S, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> Rs2;
+ bit P;
+ bit Q;
+ bits<3> BBB;
+ bits<5> JJJJJ;
+ bits<2> YL;
+ bit E;
+
+ let Opcode{3 - 1} = 0b101;
+ let Opcode{0} = S;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = P;
+ let Inst{16} = Q;
+ let Inst{15 - 11} = Rs2;
+ let Inst{10 - 8} = BBB;
+ let Inst{7 - 3} = JJJJJ;
+ let Inst{2 - 1} = YL;
+ let Inst{0} = E;
+
+ let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch (BR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D| . . . . . . . . . . . . . . . . . . . . . . |0.I|
+// -----------------------------------------------------------------
+// opcode condition constant (23)
+//
+// Action:
+// if (condition) { `pc' <- 4*(zero-extended constant) }
+//
+// The BR instruction is an absolute branch.
+// The constant is scaled as shown by its position in the instruction word such
+// that it specifies word-aligned addresses in the range [0, 2^25-4].
+//
+// The `DDDI' field selects the condition that causes the branch to be taken.
+// (the `I' (Invert sense) bit inverts the sense of the condition):
+//
+// DDDI logical function [code, used for...]
+// ---- -------------------------------------- ------------------------
+// 0000 1 [T, true]
+// 0001 0 [F, false]
+// 0010 C AND Z' [HI, high]
+// 0011 C' OR Z [LS, low or same]
+// 0100 C' [CC, carry cleared]
+// 0101 C [CS, carry set]
+// 0110 Z' [NE, not equal]
+// 0111 Z [EQ, equal]
+// 1000 V' [VC, oVerflow cleared]
+// 1001 V [VS, oVerflow set]
+// 1010 N' [PL, plus]
+// 1011 N [MI, minus]
+// 1100 (N AND V) OR (N' AND V') [GE, greater than or equal]
+// 1101 (N AND V') OR (N' AND V) [LT, less than]
+// 1110 (N AND V AND Z') OR (N' AND V' AND Z') [GT, greater than]
+// 1111 (Z) OR (N AND V') OR (N' AND V) [LE, less than or equal]
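+// As an illustrative example: DDDI = 0111 branches when `Z' is set (EQ);
+// flipping the `I' bit to give DDDI = 0110 inverts the sense and branches
+// when `Z' is clear (NE).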
+//
+// If the branch is not taken, the BR instruction is a no-op. If the branch is
+// taken, the processor starts executing instructions at the branch target
+// address *after* the processor has executed one more instruction. That is,
+// the branch has one `branch delay slot'. Be very careful if you find yourself
+// wanting to put a branch in a branch delay slot!
+class InstBR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Itinerary = IIC_ALU;
+ bits<25> addr;
+ bits<4> DDDI;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24 - 0} = addr;
+ // These instructions overwrite the last two address bits (which are assumed
+ // and ensured to be 0).
+ let Inst{1} = 0;
+ let Inst{0} = DDDI{0};
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch Relative (BRR)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D|1|-| . . . . |-.-| . . . . . . . . . . . . . |1.I|
+// -----------------------------------------------------------------
+// opcode condition Rs1 constant (14)
+// Action:
+// if (condition) { `pc' <- Rs1 + 4*(sign-extended constant) }
+//
+// BRR behaves like BR, except the branch target address is a 16-bit PC relative
+// offset.
+class InstBRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<4> DDDI;
+ bits<5> Rs1;
+ bits<16> imm16;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24} = 1;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17 - 16} = 0;
+ let Inst{15 - 0} = imm16;
+ // Overwrite last two bits which have to be zero
+ let Inst{1} = 1;
+ let Inst{0} = DDDI{0};
+
+ // Set don't cares to zero
+ let Inst{23} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Conditional Set (SCC)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.0|D.D.D|0.-| . . . . |-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-|1.I|
+// -----------------------------------------------------------------
+// opcode condition Rs1
+//
+// Action:
+// Rs1 <- logical function result
+//
+// SCC sets dst_reg to the boolean result of computing the logical function
+// specified by DDDI, as described in the table for the BR instruction.
+class InstSCC<dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Itinerary = IIC_ALU;
+ bits<5> Rs1; // dst_reg in documentation
+ bits<4> DDDI;
+
+ let Opcode = 0b1110;
+ let Inst{27 - 25} = DDDI{3 - 1};
+ let Inst{24} = 0;
+ let Inst{22 - 18} = Rs1;
+ let Inst{1} = 1;
+ let Inst{0} = DDDI{0};
+
+ // Set don't cares to zero
+ let Inst{23} = 0;
+ let Inst{17 - 2} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Special Load/Store (SLS)
+//------------------------------------------------------------------------------
+//
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |0.S| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd addr 5msb's address 16 lsb's
+//
+// Action:
+// If S = 0 (LOAD): Rd <- Memory(address);
+// If S = 1 (STORE): Memory(address) <- Rd
+//
+// The timing is the same as for RM (*note RM::.) and RRM (*note
+// RRM::.) instructions. The two low-order bits of the 21-bit address are
+// ignored. The address is zero extended. Fullword memory accesses are
+// performed.
+class InstSLS<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> msb;
+ bits<16> lsb;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = msb;
+ let Inst{17} = 0;
+ let Inst{16} = S;
+ let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Load Immediate (SLI)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |1.0| . . . . . . . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd const 5msb's constant 16 lsb's
+//
+// Action:
+// Rd <- constant
+//
+// The 21-bit constant is zero-extended. The timing is the same as the
+// RM instruction (*note RM::.).
+class InstSLI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> msb;
+ bits<16> lsb;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = msb;
+ let Inst{17} = 1;
+ let Inst{16} = 0;
+ let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Part-Word Load/Store (SPLS)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.1.1| . . . . | . . . . |1.1.0.Y.S.E.P.Q| . . . . . . . . . |
+// -----------------------------------------------------------------
+// opcode Rd Rs1 constant (10)
+//
+// Action:
+// If `YS' = 11 (bYte Store):
+// Memory(ea) <- (least significant byte of Rd)
+// If `YS' = 01 (halfword Store):
+// Memory(ea) <- (least significant half-word of Rd)
+// If `YS' = 10 (bYte load): Rd <- Memory(ea)
+// If `YS' = 00 (halfword load): Rd <- Memory(ea)
+// [Note: here ea is determined as in the RM instruction.]
+// If `SE' = 01 then the value is zEro extended
+// before being loaded into Rd.
+// If `SE' = 00 then the value is sign extended
+// before being loaded into Rd.
+//
+// `P' and `Q' are used to determine `ea' as in the RM instruction. The
+// constant is sign extended. The timing is the same as the RM and RRM
+// instructions. *Note RM:: and *Note RRM::.
+//
+// All part-word loads write the part-word into the least significant
+// part of the destination register, with the higher-order bits zero- or
+// sign-extended. All part-word stores store the least significant
+// part-word of the source register into the destination memory location.
+class InstSPLS<dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ bits<5> Rd;
+ bits<5> Rs1;
+ bits<5> msb;
+ bit Y;
+ bit S;
+ bit E;
+ bit P;
+ bit Q;
+ bits<10> imm10;
+
+ let Opcode = 0b1111;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17 - 15} = 0b110;
+ let Inst{14} = Y;
+ let Inst{13} = S;
+ let Inst{12} = E;
+ let Inst{11} = P;
+ let Inst{10} = Q;
+ let Inst{9 - 0} = imm10;
+
+ let PostEncoderMethod = "adjustPqBitsSpls";
+}
+
+//------------------------------------------------------------------------------
+// Special instructions (popc, leadz, trailz)
+//------------------------------------------------------------------------------
+// Encoding:
+// -----------------------------------------------------------------
+// |1.1.0.1| Rd | Rs1 |F.-| . . . . | . . | . . . . | OP |
+// -----------------------------------------------------------------
+// opcode Rd Rs1
+// Action:
+// Rd <- Perform action encoded in OP on Rs1
+// OP is one of:
+// 0b001 POPC Population count;
+// 0b010 LEADZ Count number of leading zeros;
+// 0b011 TRAILZ Count number of trailing zeros;
+class InstSpecial<bits<3> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : InstLanai<outs, ins, asmstr,
+ pattern>, Sched<[WriteALU]> {
+ let Itinerary = IIC_ALU;
+ bit F;
+ bits<5> Rd;
+ bits<5> Rs1;
+
+ let Opcode = 0b1101;
+ let Inst{27 - 23} = Rd;
+ let Inst{22 - 18} = Rs1;
+ let Inst{17} = F;
+ let Inst{16 - 3} = 0;
+ let Inst{2 - 0} = op;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstLanai<outs, ins, asmstr, pattern> {
+ let Inst{15 - 0} = 0;
+ let isPseudo = 1;
+}
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
new file mode 100644
index 000000000000..673d23daf886
--- /dev/null
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -0,0 +1,803 @@
+//===-- LanaiInstrInfo.cpp - Lanai Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "LanaiGenInstrInfo.inc"
+
+LanaiInstrInfo::LanaiInstrInfo()
+ : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+ RegisterInfo() {}
+
+void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ const DebugLoc &DL,
+ unsigned DestinationRegister,
+ unsigned SourceRegister,
+ bool KillSource) const {
+ if (!Lanai::GPRRegClass.contains(DestinationRegister, SourceRegister)) {
+ llvm_unreachable("Impossible reg-to-reg copy");
+ }
+
+ BuildMI(MBB, Position, DL, get(Lanai::OR_I_LO), DestinationRegister)
+ .addReg(SourceRegister, getKillRegState(KillSource))
+ .addImm(0);
+}
+
+void LanaiInstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ unsigned SourceRegister, bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo * /*RegisterInfo*/) const {
+ DebugLoc DL;
+ if (Position != MBB.end()) {
+ DL = Position->getDebugLoc();
+ }
+
+ if (!Lanai::GPRRegClass.hasSubClassEq(RegisterClass)) {
+ llvm_unreachable("Can't store this register to stack slot");
+ }
+ BuildMI(MBB, Position, DL, get(Lanai::SW_RI))
+ .addReg(SourceRegister, getKillRegState(IsKill))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(LPAC::ADD);
+}
+
+void LanaiInstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ unsigned DestinationRegister, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo * /*RegisterInfo*/) const {
+ DebugLoc DL;
+ if (Position != MBB.end()) {
+ DL = Position->getDebugLoc();
+ }
+
+ if (!Lanai::GPRRegClass.hasSubClassEq(RegisterClass)) {
+ llvm_unreachable("Can't load this register from stack slot");
+ }
+ BuildMI(MBB, Position, DL, get(Lanai::LDW_RI), DestinationRegister)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(LPAC::ADD);
+}
+
+bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
+ MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis * /*AA*/) const {
+ assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+ assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+ MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+ return false;
+
+ // Retrieve the base register, the offset from the base register, and the
+ // width, i.e. the size of the memory access (e.g. 1, 2, or 4 bytes). If the
+ // base registers are identical and the offset of the lower access plus its
+ // width does not extend beyond the offset of the higher access, then the two
+ // accesses do not overlap.
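+ // As an illustrative example: with the same base register, two 4-byte
+ // accesses at offsets 0 and 4 satisfy LowOffset + LowWidth <= HighOffset and
+ // are reported disjoint, whereas a 4-byte access at offset 0 and a 2-byte
+ // access at offset 2 are not.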
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned BaseRegA = 0, BaseRegB = 0;
+ int64_t OffsetA = 0, OffsetB = 0;
+ unsigned int WidthA = 0, WidthB = 0;
+ if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+ getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+ if (BaseRegA == BaseRegB) {
+ int LowOffset = std::min(OffsetA, OffsetB);
+ int HighOffset = std::max(OffsetA, OffsetB);
+ int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+ if (LowOffset + LowWidth <= HighOffset)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool LanaiInstrInfo::expandPostRAPseudo(MachineInstr & /*MI*/) const {
+ return false;
+}
+
+static LPCC::CondCode getOppositeCondition(LPCC::CondCode CC) {
+ switch (CC) {
+ case LPCC::ICC_T: // true
+ return LPCC::ICC_F;
+ case LPCC::ICC_F: // false
+ return LPCC::ICC_T;
+ case LPCC::ICC_HI: // high
+ return LPCC::ICC_LS;
+ case LPCC::ICC_LS: // low or same
+ return LPCC::ICC_HI;
+ case LPCC::ICC_CC: // carry cleared
+ return LPCC::ICC_CS;
+ case LPCC::ICC_CS: // carry set
+ return LPCC::ICC_CC;
+ case LPCC::ICC_NE: // not equal
+ return LPCC::ICC_EQ;
+ case LPCC::ICC_EQ: // equal
+ return LPCC::ICC_NE;
+ case LPCC::ICC_VC: // oVerflow cleared
+ return LPCC::ICC_VS;
+ case LPCC::ICC_VS: // oVerflow set
+ return LPCC::ICC_VC;
+ case LPCC::ICC_PL: // plus (note: 0 also counts as plus, since N is clear)
+ return LPCC::ICC_MI;
+ case LPCC::ICC_MI: // minus
+ return LPCC::ICC_PL;
+ case LPCC::ICC_GE: // greater than or equal
+ return LPCC::ICC_LT;
+ case LPCC::ICC_LT: // less than
+ return LPCC::ICC_GE;
+ case LPCC::ICC_GT: // greater than
+ return LPCC::ICC_LE;
+ case LPCC::ICC_LE: // less than or equal
+ return LPCC::ICC_GT;
+ default:
+ llvm_unreachable("Invalid conditional code");
+ }
+}
+
+std::pair<unsigned, unsigned>
+LanaiInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+LanaiInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace LanaiII;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_ABS_HI, "lanai-hi"},
+ {MO_ABS_LO, "lanai-lo"},
+ {MO_NO_FLAG, "lanai-nf"}};
+ return makeArrayRef(TargetFlags);
+}
+
+bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case Lanai::SFSUB_F_RI_LO:
+ case Lanai::SFSUB_F_RI_HI:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ return true;
+ case Lanai::SFSUB_F_RR:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+
+ return false;
+}
+
+// isRedundantFlagInstr - check whether the first instruction, whose only
+// purpose is to update flags, can be made redundant.
+// * SFSUB_F_RR can be made redundant by SUB_R if the operands are the same.
+// * SFSUB_F_RI_LO/SFSUB_F_RI_HI can be made redundant by SUB_I_LO/SUB_I_HI if
+//   the operands are the same.
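+// As an illustrative example (register names chosen arbitrarily): if the block
+// already contains r3 = SUB_R r1, r2, a later SFSUB_F_RR r1, r2 (or r2, r1)
+// recomputes the same subtraction just to set the flags, so
+// optimizeCompareInstr below can switch the SUB_R to its flag-setting variant
+// and erase the compare.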
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (CmpI->getOpcode() == Lanai::SFSUB_F_RR &&
+ OI->getOpcode() == Lanai::SUB_R &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((CmpI->getOpcode() == Lanai::SFSUB_F_RI_LO &&
+ OI->getOpcode() == Lanai::SUB_I_LO) ||
+ (CmpI->getOpcode() == Lanai::SFSUB_F_RI_HI &&
+ OI->getOpcode() == Lanai::SUB_I_HI)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
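+// flagSettingOpcodeVariant - map an opcode to its flag-setting variant (for
+// example ADD_R to ADD_F_R), returning NOP when no such variant exists. NOP
+// serves as the sentinel checked by optimizeCompareInstr below.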
+inline static unsigned flagSettingOpcodeVariant(unsigned OldOpcode) {
+ switch (OldOpcode) {
+ case Lanai::ADD_I_HI:
+ return Lanai::ADD_F_I_HI;
+ case Lanai::ADD_I_LO:
+ return Lanai::ADD_F_I_LO;
+ case Lanai::ADD_R:
+ return Lanai::ADD_F_R;
+ case Lanai::ADDC_I_HI:
+ return Lanai::ADDC_F_I_HI;
+ case Lanai::ADDC_I_LO:
+ return Lanai::ADDC_F_I_LO;
+ case Lanai::ADDC_R:
+ return Lanai::ADDC_F_R;
+ case Lanai::AND_I_HI:
+ return Lanai::AND_F_I_HI;
+ case Lanai::AND_I_LO:
+ return Lanai::AND_F_I_LO;
+ case Lanai::AND_R:
+ return Lanai::AND_F_R;
+ case Lanai::OR_I_HI:
+ return Lanai::OR_F_I_HI;
+ case Lanai::OR_I_LO:
+ return Lanai::OR_F_I_LO;
+ case Lanai::OR_R:
+ return Lanai::OR_F_R;
+ case Lanai::SL_I:
+ return Lanai::SL_F_I;
+ case Lanai::SRL_R:
+ return Lanai::SRL_F_R;
+ case Lanai::SA_I:
+ return Lanai::SA_F_I;
+ case Lanai::SRA_R:
+ return Lanai::SRA_F_R;
+ case Lanai::SUB_I_HI:
+ return Lanai::SUB_F_I_HI;
+ case Lanai::SUB_I_LO:
+ return Lanai::SUB_F_I_LO;
+ case Lanai::SUB_R:
+ return Lanai::SUB_F_R;
+ case Lanai::SUBB_I_HI:
+ return Lanai::SUBB_F_I_HI;
+ case Lanai::SUBB_I_LO:
+ return Lanai::SUBB_F_I_LO;
+ case Lanai::SUBB_R:
+ return Lanai::SUBB_F_R;
+ case Lanai::XOR_I_HI:
+ return Lanai::XOR_F_I_HI;
+ case Lanai::XOR_I_LO:
+ return Lanai::XOR_F_I_LO;
+ case Lanai::XOR_R:
+ return Lanai::XOR_F_R;
+ default:
+ return Lanai::NOP;
+ }
+}
+
+bool LanaiInstrInfo::optimizeCompareInstr(
+ MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int /*CmpMask*/,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ // Get ready to iterate backward from CmpInstr.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr.getParent()->begin();
+
+ // Early exit if CmpInstr is at the beginning of the BB.
+ if (I == B)
+ return false;
+
+ // There are two possible candidates which can be changed to set SR:
+ // One is MI, the other is a SUB instruction.
+ // * For SFSUB_F_RR(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+ // * For SFSUB_F_RI(r1, CmpValue), we are looking for SUB(r1, CmpValue).
+ MachineInstr *Sub = nullptr;
+ if (SrcReg2 != 0)
+ // MI is not a candidate to transform into a flag setting instruction.
+ MI = nullptr;
+ else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
+ // Conservatively refuse to convert an instruction which isn't in the same
+ // BB as the comparison. Don't return if SFSUB_F_RI and CmpValue != 0 as Sub
+ // may still be a candidate.
+ if (CmpInstr.getOpcode() == Lanai::SFSUB_F_RI_LO)
+ MI = nullptr;
+ else
+ return false;
+ }
+
+ // Check that SR isn't set between the comparison instruction and the
+ // instruction we want to change while searching for Sub.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(Lanai::SR, TRI) ||
+ Instr.readsRegister(Lanai::SR, TRI))
+ // This instruction modifies or uses SR after the one we want to change.
+ // We can't do this transformation.
+ return false;
+
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+ Sub = &*I;
+ break;
+ }
+
+ // Don't search outside the containing basic block.
+ if (I == B)
+ return false;
+ }
+
+ // Return false if no candidates exist.
+ if (!MI && !Sub)
+ return false;
+
+ // The single candidate is called MI.
+ if (!MI)
+ MI = Sub;
+
+ if (flagSettingOpcodeVariant(MI->getOpcode()) != Lanai::NOP) {
+ bool isSafe = false;
+
+ SmallVector<std::pair<MachineOperand *, LPCC::CondCode>, 4>
+ OperandsToUpdate;
+ I = CmpInstr;
+ E = CmpInstr.getParent()->end();
+ while (!isSafe && ++I != E) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands(); !isSafe && IO != EO;
+ ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(Lanai::SR)) {
+ isSafe = true;
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != Lanai::SR)
+ continue;
+ if (MO.isDef()) {
+ isSafe = true;
+ break;
+ }
+ // The condition code is carried by the operand immediately before SR.
+ LPCC::CondCode CC;
+ CC = (LPCC::CondCode)Instr.getOperand(IO - 1).getImm();
+
+ if (Sub) {
+ LPCC::CondCode NewCC = getOppositeCondition(CC);
+ if (NewCC == LPCC::ICC_T)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on
+ // CMP needs to be updated to be based on SUB. Push the condition
+ // code operands to OperandsToUpdate. If it is safe to remove
+ // CmpInstr, the condition code of these operands will be modified.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg) {
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+ }
+ } else {
+ // No Sub, so this is x = <op> y, z; cmp x, 0.
+ switch (CC) {
+ case LPCC::ICC_EQ: // Z
+ case LPCC::ICC_NE: // Z
+ case LPCC::ICC_MI: // N
+ case LPCC::ICC_PL: // N
+ case LPCC::ICC_F: // none
+ case LPCC::ICC_T: // none
+ // SR can be used multiple times, so we should continue.
+ break;
+ case LPCC::ICC_CS: // C
+ case LPCC::ICC_CC: // C
+ case LPCC::ICC_VS: // V
+ case LPCC::ICC_VC: // V
+ case LPCC::ICC_HI: // C Z
+ case LPCC::ICC_LS: // C Z
+ case LPCC::ICC_GE: // N V
+ case LPCC::ICC_LT: // N V
+ case LPCC::ICC_GT: // Z N V
+ case LPCC::ICC_LE: // Z N V
+ // The instruction uses the V or C bit, which is not safe.
+ return false;
+ case LPCC::UNKNOWN:
+ return false;
+ }
+ }
+ }
+ }
+
+ // If SR is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end();
+ SI != SE; ++SI)
+ if ((*SI)->isLiveIn(Lanai::SR))
+ return false;
+ }
+
+ // Toggle the optional operand to SR.
+ MI->setDesc(get(flagSettingOpcodeVariant(MI->getOpcode())));
+ MI->addRegisterDefined(Lanai::SR);
+ CmpInstr.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ unsigned &TrueOp, unsigned &FalseOp,
+ bool &Optimizable) const {
+ assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+ // Select operands:
+ // 0: Def.
+ // 1: True use.
+ // 2: False use.
+ // 3: Condition code.
+ TrueOp = 1;
+ FalseOp = 2;
+ Cond.push_back(MI.getOperand(3));
+ Optimizable = true;
+ return false;
+}
+
+// Identify instructions that can be folded into a SELECT instruction, and
+// return the defining instruction.
+static MachineInstr *canFoldIntoSelect(unsigned Reg,
+ const MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+ if (!MRI.hasOneNonDBGUse(Reg))
+ return nullptr;
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ if (!MI)
+ return nullptr;
+ // MI is folded into the SELECT by predicating it.
+ if (!MI->isPredicable())
+ return nullptr;
+ // Check if MI has any non-dead defs or physreg uses. This also detects
+ // predicated instructions which will be reading SR.
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Reject frame index operands.
+ if (MO.isFI() || MO.isCPI() || MO.isJTI())
+ return nullptr;
+ if (!MO.isReg())
+ continue;
+ // MI can't have any tied operands, that would conflict with predication.
+ if (MO.isTied())
+ return nullptr;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ return nullptr;
+ if (MO.isDef() && !MO.isDead())
+ return nullptr;
+ }
+ bool DontMoveAcrossStores = true;
+ if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+ return nullptr;
+ return MI;
+}
+
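+// optimizeSelect - fold the instruction defining one of the SELECT's operands
+// into the SELECT itself by predicating that instruction with the select's
+// condition (inverted when the false operand is the one folded); the caller is
+// responsible for erasing the original SELECT.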
+MachineInstr *
+LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool /*PreferFalse*/) const {
+ assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = canFoldIntoSelect(MI.getOperand(1).getReg(), MRI);
+ bool Invert = !DefMI;
+ if (!DefMI)
+ DefMI = canFoldIntoSelect(MI.getOperand(2).getReg(), MRI);
+ if (!DefMI)
+ return nullptr;
+
+ // Find new register class to use.
+ MachineOperand FalseReg = MI.getOperand(Invert ? 1 : 2);
+ unsigned DestReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+ if (!MRI.constrainRegClass(DestReg, PreviousClass))
+ return nullptr;
+
+ // Create a new predicated version of DefMI.
+ MachineInstrBuilder NewMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
+
+ // Copy all the DefMI operands, excluding its (null) predicate.
+ const MCInstrDesc &DefDesc = DefMI->getDesc();
+ for (unsigned i = 1, e = DefDesc.getNumOperands();
+ i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+ NewMI.addOperand(DefMI->getOperand(i));
+
+ unsigned CondCode = MI.getOperand(3).getImm();
+ if (Invert)
+ NewMI.addImm(getOppositeCondition(LPCC::CondCode(CondCode)));
+ else
+ NewMI.addImm(CondCode);
+ NewMI.copyImplicitOps(MI);
+
+ // The output register value when the predicate is false is an implicit
+ // register operand tied to the first def. The tie makes the register
+ // allocator ensure the FalseReg is allocated the same register as operand 0.
+ FalseReg.setImplicit();
+ NewMI.addOperand(FalseReg);
+ NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+ // Update SeenMIs set: register newly created MI and erase removed DefMI.
+ SeenMIs.insert(NewMI);
+ SeenMIs.erase(DefMI);
+
+ // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+ // DefMI would be invalid when transferred inside the loop. Checking for a
+ // loop is expensive, but at least remove kill flags if they are in different
+ // BBs.
+ if (DefMI->getParent() != MI.getParent())
+ NewMI->clearKillInfo();
+
+ // The caller will erase MI, but not DefMI.
+ DefMI->eraseFromParent();
+ return NewMI;
+}
+
+// The analyzeBranch function is used to examine conditional instructions and
+// remove unnecessary instructions. This method is used by BranchFolder and
+// IfConverter machine function passes to improve the CFG.
+// - TrueBlock is set to the destination if the condition evaluates to true (it
+//   is nullptr if the destination is the fall-through branch);
+// - FalseBlock is set to the destination if the condition evaluates to false
+//   (it is nullptr if the branch is unconditional);
+// - Condition is populated with the machine operands needed to generate the
+//   branch to insert in InsertBranch.
+// Returns false if the branch was successfully analyzed, true otherwise.
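+// As an illustrative example (block and condition names are placeholders):
+// a block ending in
+//   BRCC <true_dest>, <cond>
+//   BT <false_dest>
+// is analyzed as TrueBlock = <true_dest>, FalseBlock = <false_dest>, and
+// Condition = [<cond>].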
+bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const {
+ // Iterator to current instruction being considered.
+ MachineBasicBlock::iterator Instruction = MBB.end();
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ while (Instruction != MBB.begin()) {
+ --Instruction;
+
+ // Skip over debug values.
+ if (Instruction->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isUnpredicatedTerminator(*Instruction))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!Instruction->isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (Instruction->getOpcode() == Lanai::BT) {
+ if (!AllowModify) {
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a branch, delete them.
+ while (std::next(Instruction) != MBB.end()) {
+ std::next(Instruction)->eraseFromParent();
+ }
+
+ Condition.clear();
+ FalseBlock = nullptr;
+
+ // Delete the jump if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(Instruction->getOperand(0).getMBB())) {
+ TrueBlock = nullptr;
+ Instruction->eraseFromParent();
+ Instruction = MBB.end();
+ continue;
+ }
+
+ // TrueBlock is used to indicate the unconditional destination.
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches
+ unsigned Opcode = Instruction->getOpcode();
+ if (Opcode != Lanai::BRCC)
+ return true; // Unknown opcode.
+
+ // Multiple conditional branches are not handled here so only proceed if
+ // there are no conditions enqueued.
+ if (Condition.empty()) {
+ LPCC::CondCode BranchCond =
+ static_cast<LPCC::CondCode>(Instruction->getOperand(1).getImm());
+
+ // TrueBlock is the target of the previously seen unconditional branch.
+ FalseBlock = TrueBlock;
+ TrueBlock = Instruction->getOperand(0).getMBB();
+ Condition.push_back(MachineOperand::CreateImm(BranchCond));
+ continue;
+ }
+
+ // Multiple conditional branches are not handled.
+ return true;
+ }
+
+ // Return false indicating branch successfully analyzed.
+ return false;
+}
+
+// ReverseBranchCondition - Reverses the branch condition of the specified
+// condition list, returning false on success and true if it cannot be
+// reversed.
+bool LanaiInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<llvm::MachineOperand> &Condition) const {
+ assert((Condition.size() == 1) &&
+ "Lanai branch conditions should have one component.");
+
+ LPCC::CondCode BranchCond =
+ static_cast<LPCC::CondCode>(Condition[0].getImm());
+ Condition[0].setImm(getOppositeCondition(BranchCond));
+ return false;
+}
+
+// Insert the branch with condition specified in condition and given targets
+// (TrueBlock and FalseBlock). This function returns the number of machine
+// instructions inserted.
+unsigned LanaiInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TrueBlock,
+ MachineBasicBlock *FalseBlock,
+ ArrayRef<MachineOperand> Condition,
+ const DebugLoc &DL) const {
+ // Shouldn't be a fall through.
+ assert(TrueBlock && "InsertBranch must not be told to insert a fallthrough");
+
+ // If condition is empty then an unconditional branch is being inserted.
+ if (Condition.empty()) {
+ assert(!FalseBlock && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(TrueBlock);
+ return 1;
+ }
+
+ // Else a conditional branch is inserted.
+ assert((Condition.size() == 1) &&
+ "Lanai branch conditions should have one component.");
+ unsigned ConditionalCode = Condition[0].getImm();
+ BuildMI(&MBB, DL, get(Lanai::BRCC)).addMBB(TrueBlock).addImm(ConditionalCode);
+
+ // If no false block, then false behavior is fall through and no branch needs
+ // to be inserted.
+ if (!FalseBlock)
+ return 1;
+
+ BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(FalseBlock);
+ return 2;
+}
+
+unsigned LanaiInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator Instruction = MBB.end();
+ unsigned Count = 0;
+
+ while (Instruction != MBB.begin()) {
+ --Instruction;
+ if (Instruction->isDebugValue())
+ continue;
+ if (Instruction->getOpcode() != Lanai::BT &&
+ Instruction->getOpcode() != Lanai::BRCC) {
+ break;
+ }
+
+ // Remove the branch.
+ Instruction->eraseFromParent();
+ Instruction = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned LanaiInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (MI.getOpcode() == Lanai::LDW_RI)
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ return 0;
+}
+
+unsigned LanaiInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (MI.getOpcode() == Lanai::LDW_RI) {
+ unsigned Reg;
+ if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
+unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (MI.getOpcode() == Lanai::SW_RI)
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
+ }
+ return 0;
+}
+
+bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth(
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo * /*TRI*/) const {
+ // Handle only loads/stores with base register followed by immediate offset
+ // and with add as ALU op.
+ if (LdSt.getNumOperands() != 4)
+ return false;
+ if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm() ||
+ !(LdSt.getOperand(3).isImm() && LdSt.getOperand(3).getImm() == LPAC::ADD))
+ return false;
+
+ switch (LdSt.getOpcode()) {
+ default:
+ return false;
+ case Lanai::LDW_RI:
+ case Lanai::LDW_RR:
+ case Lanai::SW_RR:
+ case Lanai::SW_RI:
+ Width = 4;
+ break;
+ case Lanai::LDHs_RI:
+ case Lanai::LDHz_RI:
+ case Lanai::STH_RI:
+ Width = 2;
+ break;
+ case Lanai::LDBs_RI:
+ case Lanai::LDBz_RI:
+ case Lanai::STB_RI:
+ Width = 1;
+ break;
+ }
+
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm();
+ return true;
+}
+
+bool LanaiInstrInfo::getMemOpBaseRegImmOfs(
+ MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+ const TargetRegisterInfo *TRI) const {
+ switch (LdSt.getOpcode()) {
+ default:
+ return false;
+ case Lanai::LDW_RI:
+ case Lanai::LDW_RR:
+ case Lanai::SW_RR:
+ case Lanai::SW_RI:
+ case Lanai::LDHs_RI:
+ case Lanai::LDHz_RI:
+ case Lanai::STH_RI:
+ case Lanai::LDBs_RI:
+ case Lanai::LDBz_RI:
+ unsigned Width;
+ return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+ }
+}
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
new file mode 100644
index 000000000000..51f6c6ea436d
--- /dev/null
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -0,0 +1,184 @@
+//===- LanaiInstrInfo.h - Lanai Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "LanaiGenInstrInfo.inc"
+
+namespace llvm {
+
+class LanaiInstrInfo : public LanaiGenInstrInfo {
+ const LanaiRegisterInfo RegisterInfo;
+
+public:
+ LanaiInstrInfo();
+
+ // getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ // such, whenever a client has an instance of instruction info, it should
+ // always be able to get register info as well (through this method).
+ virtual const LanaiRegisterInfo &getRegisterInfo() const {
+ return RegisterInfo;
+ }
+
+ bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ AliasAnalysis *AA) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ const DebugLoc &DL, unsigned DestinationRegister,
+ unsigned SourceRegister, bool KillSource) const override;
+
+ void
+ storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ unsigned SourceRegister, bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo) const override;
+
+ void
+ loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ unsigned DestinationRegister, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const override;
+
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+ // For a comparison instruction, return the source registers in SrcReg and
+ // SrcReg2 if it has two register operands, and the value it compares against
+ // in CmpValue. Returns true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+
+ // See if the comparison instruction can be converted into something more
+ // efficient. E.g., on Lanai, register-register instructions can set the flag
+ // register, obviating the need for a separate compare.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ // Analyze the given select instruction, returning true if it cannot be
+ // understood. It is assumed that MI->isSelect() is true.
+ //
+ // When successful, return the controlling condition and the operands that
+ // determine the true and false result values.
+ //
+ // Result = SELECT Cond, TrueOp, FalseOp
+ //
+ // Lanai can optimize certain select instructions, for example by predicating
+ // the instruction defining one of the operands; in that case Optimizable is
+ // set to true.
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
+
+ // Given a select instruction that was understood by analyzeSelect and
+ // returned Optimizable = true, attempt to optimize MI by merging it with one
+ // of its operands. Returns NULL on failure.
+ //
+ // When successful, returns the new select instruction. The client is
+ // responsible for deleting MI.
+ //
+ // If both sides of the select can be optimized, the TrueOp is modified.
+ // PreferFalse is not used.
+ MachineInstr *optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool PreferFalse) const override;
+
+ bool ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Condition) const override;
+
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TrueBlock,
+ MachineBasicBlock *FalseBlock,
+ ArrayRef<MachineOperand> Condition,
+ const DebugLoc &DL) const override;
+};
+
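+// Returns true if Opcode is one of the part-word (byte or half-word) loads or
+// stores that use the SPLS instruction format.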
+static inline bool isSPLSOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::LDBs_RI:
+ case Lanai::LDBz_RI:
+ case Lanai::LDHs_RI:
+ case Lanai::LDHz_RI:
+ case Lanai::STB_RI:
+ case Lanai::STH_RI:
+ return true;
+ default:
+ return false;
+ }
+}
+
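+// Returns true if Opcode is a full-word load or store that uses the RM
+// (register plus 16-bit immediate) instruction format.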
+static inline bool isRMOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::LDW_RI:
+ case Lanai::SW_RI:
+ return true;
+ default:
+ return false;
+ }
+}
+
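+// Returns true if Opcode is a load or store that uses the RRM
+// (register plus register) instruction format.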
+static inline bool isRRMOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::LDBs_RR:
+ case Lanai::LDBz_RR:
+ case Lanai::LDHs_RR:
+ case Lanai::LDHz_RR:
+ case Lanai::LDWz_RR:
+ case Lanai::LDW_RR:
+ case Lanai::STB_RR:
+ case Lanai::STH_RR:
+ case Lanai::SW_RR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
new file mode 100644
index 000000000000..cd1abc1f3359
--- /dev/null
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -0,0 +1,892 @@
+//===-- LanaiInstrInfo.td - Target Description for Lanai Target -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Lanai instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "LanaiInstrFormats.td"
+
+// -------------------------------------------------- //
+// Instruction Operands and Patterns
+// -------------------------------------------------- //
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def SDT_LanaiSetFlag : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_LanaiSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>]>;
+def SDT_LanaiSetCC : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_LanaiBrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i32>]>;
+def SDT_LanaiAdjDynAlloc : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def Call : SDNode<"LanaiISD::CALL", SDT_LanaiCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def RetFlag : SDNode<"LanaiISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_LanaiCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_LanaiCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def LanaiSetFlag : SDNode<"LanaiISD::SET_FLAG", SDT_LanaiSetFlag,
+ [SDNPOutGlue]>;
+def LanaiSubbF : SDNode<"LanaiISD::SUBBF", SDT_LanaiSetFlag,
+ [SDNPOutGlue, SDNPInGlue]>;
+def LanaiBrCC : SDNode<"LanaiISD::BR_CC", SDT_LanaiBrCC,
+ [SDNPHasChain, SDNPInGlue]>;
+def LanaiSelectCC : SDNode<"LanaiISD::SELECT_CC", SDT_LanaiSelectCC,
+ [SDNPInGlue]>;
+def LanaiSetCC : SDNode<"LanaiISD::SETCC", SDT_LanaiSetCC,
+ [SDNPInGlue]>;
+def LanaiHi : SDNode<"LanaiISD::HI", SDTIntUnaryOp>;
+def LanaiLo : SDNode<"LanaiISD::LO", SDTIntUnaryOp>;
+def LanaiSmall : SDNode<"LanaiISD::SMALL", SDTIntUnaryOp>;
+def LanaiAdjDynAlloc : SDNode<"LanaiISD::ADJDYNALLOC", SDT_LanaiAdjDynAlloc>;
+
+// Extract bits 0-15 (low-end) of an immediate value.
+def LO16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0xffff,
+ SDLoc(N), MVT::i32);
+}]>;
+
+// Extract bits 16-31 (high-end) of an immediate value.
+// Transformation function: shift the immediate value down into the low bits.
+def HI16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() >> 16, SDLoc(N),
+ MVT::i32);
+}]>;
+
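+// Transformation function: negate the immediate value. Used below to rewrite
+// add/sub with a negative immediate as sub/add, and right shifts by an
+// immediate as left shifts by a negative amount.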
+def NEG : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
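+// Extract bits 0-20 (low-end) of an immediate value.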
+def LO21 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0x1fffff,
+ SDLoc(N), MVT::i32);
+}]>;
+
+// Branch targets
+def BrTargetAsmOperand : AsmOperandClass {
+ let Name = "BrTarget";
+}
+def BrTarget : Operand<OtherVT> {
+ let ParserMatchClass = BrTargetAsmOperand;
+ let EncoderMethod = "getBranchTargetOpValue";
+ let DecoderMethod = "decodeBranch";
+}
+
+def CallTargetAsmOperand : AsmOperandClass {
+ let Name = "CallTarget";
+}
+def CallTarget : Operand<i32> {
+ let ParserMatchClass = CallTargetAsmOperand;
+ let EncoderMethod = "getBranchTargetOpValue";
+ let DecoderMethod = "decodeBranch";
+}
+
+def ImmShiftAsmOperand : AsmOperandClass { let Name = "ImmShift"; }
+def immShift : Operand<i32>, PatLeaf<(imm), [{
+ int Imm = N->getSExtValue();
+ return Imm >= -31 && Imm <= 31;}]> {
+ let ParserMatchClass = ImmShiftAsmOperand;
+ let DecoderMethod = "decodeShiftImm";
+}
+
+def Imm10AsmOperand : AsmOperandClass { let Name = "Imm10"; }
+def imm10 : Operand<i32>, PatLeaf<(imm), [{
+ return isInt<10>(N->getSExtValue()); }]> {
+ let ParserMatchClass = Imm10AsmOperand;
+}
+
+def immZExt21 : PatLeaf<(imm),
+ [{return isUInt<21>(N->getZExtValue()); }], LO21>;
+
+def LoImm16AsmOperand : AsmOperandClass { let Name = "LoImm16"; }
+def i32lo16z : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16z predicate - true if the 32-bit immediate has only its rightmost
+ // 16 bits set.
+ return ((N->getZExtValue() & 0xFFFFUL) == N->getZExtValue());}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32neg16 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32neg16 predicate - true if the 32-bit immediate is negative and can
+ // be represented by a 16-bit integer.
+ int Imm = N->getSExtValue();
+ return (Imm < 0) && (isInt<16>(Imm));}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32lo16s : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16s predicate - true if the 32-bit immediate has only its rightmost
+ // 16 bits set.
+ return ((int64_t)(N->getSExtValue() & 0xFFFFUL) == N->getSExtValue());}], LO16> {
+ let ParserMatchClass = LoImm16AsmOperand;
+}
+
+def LoImm16AndAsmOperand : AsmOperandClass { let Name = "LoImm16And"; }
+def i32lo16and : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo16and predicate - true if the leftmost 16 bits of the 32-bit
+ // immediate are all ones, so only the rightmost 16 bits need to be encoded.
+ return (N->getZExtValue() >= 0xFFFF0000UL);}], LO16> {
+ let ParserMatchClass = LoImm16AndAsmOperand;
+ let PrintMethod = "printLo16AndImmOperand";
+}
+
+def HiImm16AsmOperand : AsmOperandClass { let Name = "HiImm16"; }
+def i32hi16 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32hi16 predicate - true if the 32-bit immediate has only leftmost 16
+ // bits set.
+ return ((N->getZExtValue() & 0xFFFF0000UL) == N->getZExtValue());}], HI16> {
+ let ParserMatchClass = HiImm16AsmOperand;
+ let PrintMethod = "printHi16ImmOperand";
+}
+
+def HiImm16AndAsmOperand : AsmOperandClass { let Name = "HiImm16And"; }
+def i32hi16and : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32hi16and predicate - true if the rightmost 16 bits of the 32-bit
+ // immediate are all ones, so only the leftmost 16 bits need to be encoded.
+ return ((N->getZExtValue() & 0xFFFFUL) == 0xFFFFUL);}], HI16> {
+ let ParserMatchClass = HiImm16AndAsmOperand;
+ let PrintMethod = "printHi16AndImmOperand";
+}
+
+def LoImm21AsmOperand : AsmOperandClass { let Name = "LoImm21"; }
+def i32lo21 : Operand<i32>, PatLeaf<(i32 imm), [{
+ // i32lo21 predicate - true if the 32-bit immediate has only rightmost 21
+ // bits set.
+ return ((N->getZExtValue() & 0x1FFFFFUL) == N->getZExtValue());}], LO21> {
+ let ParserMatchClass = LoImm21AsmOperand;
+}
+
+def AluOp : Operand<i32> {
+ let PrintMethod = "printAluOperand";
+}
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<i32, 3, "selectAddrRr", [], []>;
+def ADDRri : ComplexPattern<i32, 3, "selectAddrRi", [frameindex], []>;
+def ADDRsls : ComplexPattern<i32, 1, "selectAddrSls", [frameindex], []>;
+def ADDRspls : ComplexPattern<i32, 3, "selectAddrSpls", [frameindex], []>;
+
+// Address operands
+def MemRegImmAsmOperand : AsmOperandClass {
+ let Name = "MemRegImm";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMri : Operand<i32> {
+ let DecoderMethod = "decodeRiMemoryValue";
+ let EncoderMethod = "getRiMemoryOpValue";
+ let MIOperandInfo = (ops GPR:$base, i32lo16s:$offset, AluOp:$Opcode);
+ let ParserMatchClass = MemRegImmAsmOperand;
+ let PrintMethod = "printMemRiOperand";
+}
+
+def MemRegRegAsmOperand : AsmOperandClass {
+ let Name = "MemRegReg";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMrr : Operand<i32> {
+ let DecoderMethod = "decodeRrMemoryValue";
+ let EncoderMethod = "getRrMemoryOpValue";
+ let MIOperandInfo = (ops GPR:$Op1, GPR:$Op2, AluOp:$Opcode);
+ let ParserMatchClass = MemRegRegAsmOperand;
+ let PrintMethod = "printMemRrOperand";
+}
+
+def MemImmAsmOperand : AsmOperandClass {
+ let Name = "MemImm";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMi : Operand<i32> {
+ let MIOperandInfo = (ops i32lo21:$offset);
+ let ParserMatchClass = MemImmAsmOperand;
+ let PrintMethod = "printMemImmOperand";
+}
+
+def MemSplsAsmOperand : AsmOperandClass {
+ let Name = "MemSpls";
+ let ParserMethod = "parseMemoryOperand";
+}
+def MEMspls : Operand<i32> {
+ let DecoderMethod = "decodeSplsValue";
+ let EncoderMethod = "getSplsOpValue";
+ let MIOperandInfo = (ops GPR:$base, imm10:$offset, AluOp:$Opcode);
+ let ParserMatchClass = MemSplsAsmOperand;
+ let PrintMethod = "printMemSplsOperand";
+}
+
+def CCOp : Operand<i32> {
+ let PrintMethod = "printCCOperand";
+}
+
+// Predicate operand. Default to 0 = true.
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+
+def pred : PredicateOperand<i32, (ops i32imm), (ops (i32 0))> {
+ let PrintMethod = "printPredicateOperand";
+ let ParserMatchClass = CondCodeOperand;
+ let DecoderMethod = "decodePredicateOperand";
+}
+
+let hasSideEffects = 0, Inst = 0x00000001 in
+ def NOP : InstLanai<(outs), (ins), "nop", []>;
+
+// Special NOPs to change logging level in vlanai.
+let hasSideEffects = 0, Inst = 0x00000002 in
+ def LOG0 : InstLanai<(outs), (ins), "log_0", []>;
+let hasSideEffects = 0, Inst = 0x00000003 in
+ def LOG1 : InstLanai<(outs), (ins), "log_1", []>;
+let hasSideEffects = 0, Inst = 0x00000004 in
+ def LOG2 : InstLanai<(outs), (ins), "log_2", []>;
+let hasSideEffects = 0, Inst = 0x00000005 in
+ def LOG3 : InstLanai<(outs), (ins), "log_3", []>;
+let hasSideEffects = 0, Inst = 0x00000006 in
+ def LOG4 : InstLanai<(outs), (ins), "log_4", []>;
+
+// Map an SPLS instruction onto itself. All other instructions will be mapped
+// onto -1. Used to identify SPLS instructions.
+def splsIdempotent : InstrMapping {
+ let FilterClass = "InstSPLS";
+ let RowFields = ["AsmString"];
+ let ColFields = ["PostEncoderMethod"];
+ let KeyCol = ["adjustPqBitsSpls"];
+ let ValueCols = [["adjustPqBitsSpls"]];
+}
+
+// -------------------------------------------------- //
+// ALU instructions
+// -------------------------------------------------- //
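+// Each ALU operation is defined in three forms: a register/low-immediate form
+// (suffix I_LO, H = 0), a register/high-immediate form (suffix I_HI, H = 1)
+// and a register/register form (suffix R). ALUarith leaves the immediate
+// selection patterns to the explicit Pat definitions below, while ALUlogic
+// attaches them directly to the instruction definitions.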
+multiclass ALUbase<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt,
+ list<dag> loPattern, list<dag> hiPattern> {
+ // Register Immediate
+ let H = 0 in
+ def LO : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, LoExt:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+ loPattern>;
+ let H = 1 in
+ def HI : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, HiExt:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+ hiPattern>;
+
+}
+
+multiclass ALUarith<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt> {
+ defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt, [], []>;
+
+ // Register Register
+ let JJJJJ = 0 in
+ def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+ !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+multiclass ALUlogic<bits<3> subOp, string AsmStr, SDNode OpNode,
+ PatLeaf LoExt, PatLeaf HiExt> {
+ defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt,
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, LoExt:$imm16))],
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, HiExt:$imm16))]>;
+
+ // Register Register
+ let JJJJJ = 0 in
+ def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+ !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+ [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+// Non flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 0 in {
+ let isCommutable = 1 in {
+ defm ADD_ : ALUarith<0b000, "add", add, i32lo16z, i32hi16>;
+ }
+ defm SUB_ : ALUarith<0b010, "sub", sub, i32lo16z, i32hi16>;
+ let isCommutable = 1 in {
+ defm AND_ : ALUlogic<0b100, "and", and, i32lo16and, i32hi16and>;
+ defm OR_ : ALUlogic<0b101, "or", or, i32lo16z, i32hi16>;
+ defm XOR_ : ALUlogic<0b110, "xor", xor, i32lo16z, i32hi16>;
+ }
+}
+
+def : Pat<(add GPR:$Rs1, i32lo16z:$imm),
+ (ADD_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32lo16z:$imm),
+ (SUB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(add GPR:$Rs1, i32hi16:$imm),
+ (ADD_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32hi16:$imm),
+ (SUB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(i32 i32lo16and:$imm), (AND_I_LO (i32 R1), i32lo16and:$imm)>;
+def : Pat<(i32 i32hi16and:$imm), (AND_I_HI (i32 R1), i32hi16and:$imm)>;
+
+// Change add/sub with negative number to sub/add
+def : Pat<(add GPR:$Rs1, i32neg16:$imm),
+ (SUB_I_LO GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sub GPR:$Rs1, i32neg16:$imm),
+ (ADD_I_LO GPR:$Rs1, (NEG $imm))>;
+
+// Flag (incl. carry) setting addition and subtraction
+let F = 1, Defs = [SR] in {
+ defm ADD_F_ : ALUarith<0b000, "add.f", addc, i32lo16z, i32hi16>;
+ defm SUB_F_ : ALUarith<0b010, "sub.f", subc, i32lo16z, i32hi16>;
+}
+
+def : Pat<(addc GPR:$Rs1, i32lo16z:$imm),
+ (ADD_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32lo16z:$imm),
+ (SUB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(addc GPR:$Rs1, i32hi16:$imm),
+ (ADD_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32hi16:$imm),
+ (SUB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Carry using addition and subtraction
+let F = 0, Uses = [SR] in {
+ defm ADDC_ : ALUarith<0b001, "addc", adde, i32lo16z, i32hi16>;
+ defm SUBB_ : ALUarith<0b011, "subb", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(adde GPR:$Rs1, i32lo16z:$imm),
+ (ADDC_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32lo16z:$imm),
+ (SUBB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(adde GPR:$Rs1, i32hi16:$imm),
+ (ADDC_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32hi16:$imm),
+ (SUBB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR] in {
+ let isCommutable = 1 in {
+ defm AND_F_ : ALUlogic<0b100, "and.f", and, i32lo16and, i32hi16and>;
+ defm OR_F_ : ALUlogic<0b101, "or.f", or, i32lo16z, i32hi16>;
+ defm XOR_F_ : ALUlogic<0b110, "xor.f", xor, i32lo16z, i32hi16>;
+ }
+}
+
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR], Uses = [SR] in {
+ defm ADDC_F_ : ALUarith<0b001, "addc.f", adde, i32lo16z, i32hi16>;
+ defm SUBB_F_ : ALUarith<0b011, "subb.f", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(LanaiSubbF GPR:$Rs1, GPR:$Rs2),
+ (SUBB_F_R GPR:$Rs1, GPR:$Rs2)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm),
+ (SUBB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm),
+ (SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>;
+
+let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0,
+ isReMaterializable = 1 in
+ def MOVHI : InstRI<0b000, (outs GPR:$Rd), (ins i32hi16:$imm16),
+ "mov\t$imm16, $Rd",
+ [(set GPR:$Rd, i32hi16:$imm16)]>;
+
+def : InstAlias<"mov $imm16, $dst", (ADD_I_LO GPR:$dst, R0, i32lo16z:$imm16)>;
+def : InstAlias<"mov $imm16, $dst", (ADD_I_HI GPR:$dst, R0, i32hi16:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+ (AND_I_LO GPR:$dst, R1, i32lo16and:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+ (AND_I_HI GPR:$dst, R1, i32hi16and:$imm16)>;
+
+// Shift instructions
+class ShiftRI<string AsmStr, list<dag> Pattern>
+ : InstRI<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, immShift:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"), Pattern> {
+ let isReMaterializable = 1;
+}
+
+let F = 0 in {
+ let H = 0 in
+ def SL_I : ShiftRI<"sh", [(set GPR:$Rd, (shl GPR:$Rs1, immShift:$imm16))]>;
+ let H = 1 in
+ def SA_I : ShiftRI<"sha", []>;
+}
+def : Pat<(srl GPR:$Rs1, immShift:$imm), (SL_I GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sra GPR:$Rs1, immShift:$imm), (SA_I GPR:$Rs1, (NEG $imm))>;
+
+let F = 1, Defs = [SR] in {
+ let H = 0 in
+ def SL_F_I : ShiftRI<"sh.f", []>;
+ let H = 1 in
+ def SA_F_I : ShiftRI<"sha.f", []>;
+}
+
+class ShiftRR<string AsmStr, list<dag> Pattern>
+ : InstRR<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI), AsmStr,
+ Pattern>;
+
+let F = 0 in {
+ let JJJJJ = 0b10000 in
+ def SHL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd",
+ [(set GPR:$Rd, (shl GPR:$Rs1, GPR:$Rs2))]>;
+ let isCodeGenOnly = 1 in {
+ let JJJJJ = 0b10000 in
+ def SRL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ }
+ let JJJJJ = 0b11000 in
+ def SRA_R : ShiftRR<"sha$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+let F = 1, Defs = [SR] in {
+ let JJJJJ = 0b10000 in
+ def SHL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ let isCodeGenOnly = 1 in {
+ let JJJJJ = 0b10000 in
+ def SRL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+ }
+ let JJJJJ = 0b11000 in
+ def SRA_F_R : ShiftRR<"sha.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+// Expand shift-right operations
+def : Pat<(srl GPR:$Rs1, GPR:$Rs2),
+ (SRL_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+def : Pat<(sra GPR:$Rs1, GPR:$Rs2),
+ (SRA_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+
+// -------------------------------------------------- //
+// LOAD instructions
+// -------------------------------------------------- //
+
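+// Loads are defined for each addressing form: register + register (LoadRR,
+// RRM format), register + 16-bit immediate (LoadRI, RM format), absolute
+// 21-bit address (LDADDR, SLS format) and register + 10-bit immediate for
+// part-word loads (LoadSPLS, SPLS format). The bit-field assignments below
+// pack the memory operand into the instruction encoding.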
+class LoadRR<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRRM<0b0, (outs GPR:$Rd), (ins MEMrr:$src),
+ !strconcat(OpcString, "\t$src, $Rd"),
+ [(set (Ty GPR:$Rd), (OpNode ADDRrr:$src))]>,
+ Sched<[WriteLD]> {
+ bits<20> src;
+
+ let Rs1 = src{19-15};
+ let Rs2 = src{14-10};
+ let P = src{9};
+ let Q = src{8};
+ let BBB = src{7-5};
+ let JJJJJ = src{4-0};
+ let mayLoad = 1;
+}
+
+class LoadRI<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRM<0b0, (outs GPR:$Rd), (ins MEMri:$src),
+ !strconcat(OpcString, "\t$src, $Rd"),
+ [(set (Ty GPR:$Rd), (OpNode ADDRri:$src))]>,
+ Sched<[WriteLD]> {
+ bits<23> src;
+
+ let Itinerary = IIC_LD;
+ let Rs1 = src{22-18};
+ let P = src{17};
+ let Q = src{16};
+ let imm16 = src{15-0};
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+}
+
+let E = 0 in {
+ let YL = 0b01 in {
+ // uld is used here and ld in the alias because the alias is printed first
+ // if one exists.
+ def LDW_RI : LoadRI<"uld", load, i32>;
+ def LDW_RR : LoadRR<"ld", load, i32>;
+ }
+}
+
+def : InstAlias<"ld $src, $dst", (LDW_RI GPR:$dst, MEMri:$src)>;
+
+let E = 1 in {
+ let YL = 0b01 in {
+ def LDWz_RR : LoadRR<"uld", zextloadi32, i32>;
+ }
+}
+
+let E = 1 in {
+ let YL = 0b00 in
+ def LDHz_RR : LoadRR<"uld.h", zextloadi16, i32>;
+ let YL = 0b10 in
+ def LDBz_RR : LoadRR<"uld.b", zextloadi8, i32>;
+}
+
+let E = 0 in {
+ let YL = 0b00 in
+ def LDHs_RR : LoadRR<"ld.h", sextloadi16, i32>;
+ let YL = 0b10 in
+ def LDBs_RR : LoadRR<"ld.b", sextloadi8, i32>;
+}
+
+def LDADDR : InstSLS<0x0, (outs GPR:$Rd), (ins MEMi:$src),
+ "ld\t$src, $Rd",
+ [(set (i32 GPR:$Rd), (load ADDRsls:$src))]>,
+ Sched<[WriteLD]> {
+ bits<21> src;
+
+ let Itinerary = IIC_LD;
+ let msb = src{20-16};
+ let lsb = src{15-0};
+ let isReMaterializable = 1;
+ let mayLoad = 1;
+}
+
+class LoadSPLS<string asmstring, PatFrag opNode>
+ : InstSPLS<(outs GPR:$Rd), (ins MEMspls:$src),
+ !strconcat(asmstring, "\t$src, $Rd"),
+ [(set (i32 GPR:$Rd), (opNode ADDRspls:$src))]>,
+ Sched<[WriteLDSW]> {
+ bits<17> src;
+ let Itinerary = IIC_LDSW;
+ let Rs1 = src{16-12};
+ let P = src{11};
+ let Q = src{10};
+ let imm10 = src{9-0};
+ let mayLoad = 1;
+ let isReMaterializable = 1;
+}
+
+let Y = 0, S = 0, E = 1 in
+ def LDHz_RI : LoadSPLS<"uld.h", zextloadi16>;
+
+let Y = 0, S = 0, E = 0 in
+ def LDHs_RI : LoadSPLS<"ld.h", sextloadi16>;
+
+let Y = 1, S = 0, E = 1 in
+ def LDBz_RI : LoadSPLS<"uld.b", zextloadi8>;
+
+let Y = 1, S = 0, E = 0 in
+ def LDBs_RI : LoadSPLS<"ld.b", sextloadi8>;
+
+def SLI : InstSLI<(outs GPR:$Rd), (ins i32lo21:$imm),
+ "mov\t$imm, $Rd",
+ [(set GPR:$Rd, i32lo21:$imm)]> {
+ bits<21> imm;
+
+ let msb = imm{20-16};
+ let lsb = imm{15-0};
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+}
+
+// -------------------------------------------------- //
+// STORE instructions
+// -------------------------------------------------- //
+
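+// Stores mirror the load addressing forms: register + register (StoreRR, RRM
+// format), register + 16-bit immediate (StoreRI, RM format), absolute address
+// (STADDR, SLS format) and register + 10-bit immediate part-word stores
+// (StoreSPLS, SPLS format).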
+class StoreRR<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRRM<0b1, (outs), (ins GPR:$Rd, MEMrr:$dst),
+ !strconcat(OpcString, "\t$Rd, $dst"),
+ [(OpNode (Ty GPR:$Rd), ADDRrr:$dst)]>,
+ Sched<[WriteST]> {
+ bits<20> dst;
+
+ let Itinerary = IIC_ST;
+ let Rs1 = dst{19-15};
+ let Rs2 = dst{14-10};
+ let P = dst{9};
+ let Q = dst{8};
+ let BBB = dst{7-5};
+ let JJJJJ = dst{4-0};
+ let mayStore = 1;
+}
+
+class StoreRI<string OpcString, PatFrag OpNode, ValueType Ty>
+ : InstRM<0b1, (outs), (ins GPR:$Rd, MEMri:$dst),
+ !strconcat(OpcString, "\t$Rd, $dst"),
+ [(OpNode (Ty GPR:$Rd), ADDRri:$dst)]>,
+ Sched<[WriteST]> {
+ bits<23> dst;
+
+ let Itinerary = IIC_ST;
+ let Rs1 = dst{22-18};
+ let P = dst{17};
+ let Q = dst{16};
+ let imm16 = dst{15-0};
+ let mayStore = 1;
+}
+
+let YL = 0b01, E = 0 in {
+ def SW_RR : StoreRR<"st", store, i32>;
+ def SW_RI : StoreRI<"st", store, i32>;
+}
+
+let E = 0 in {
+ let YL = 0b00 in
+ def STH_RR : StoreRR<"st.h", truncstorei16, i32>;
+ let YL = 0b10 in
+ def STB_RR : StoreRR<"st.b", truncstorei8, i32>;
+}
+
+def STADDR : InstSLS<0x1, (outs), (ins GPR:$Rd, MEMi:$dst),
+ "st\t$Rd, $dst",
+ [(store (i32 GPR:$Rd), ADDRsls:$dst)]>,
+ Sched<[WriteST]> {
+ bits<21> dst;
+
+ let Itinerary = IIC_ST;
+ let msb = dst{20-16};
+ let lsb = dst{15-0};
+ let mayStore = 1;
+}
+
+class StoreSPLS<string asmstring, PatFrag opNode>
+ : InstSPLS<(outs), (ins GPR:$Rd, MEMspls:$dst),
+ !strconcat(asmstring, "\t$Rd, $dst"),
+ [(opNode (i32 GPR:$Rd), ADDRspls:$dst)]>,
+ Sched<[WriteSTSW]> {
+ bits<17> dst;
+
+ let Itinerary = IIC_STSW;
+ let Rs1 = dst{16-12};
+ let P = dst{11};
+ let Q = dst{10};
+ let imm10 = dst{9-0};
+ let mayStore = 1;
+}
+
+let Y = 0, S = 1, E = 0 in
+ def STH_RI : StoreSPLS<"st.h", truncstorei16>;
+
+let Y = 1, S = 1, E = 0 in
+ def STB_RI : StoreSPLS<"st.b", truncstorei8>;
+
+// -------------------------------------------------- //
+// BRANCH instructions
+// -------------------------------------------------- //
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1 in {
+ def BT : InstBR<(outs), (ins BrTarget:$addr),
+ "bt\t$addr",
+ [(br bb:$addr)]> {
+ let DDDI = 0b0000;
+ }
+ let Uses = [SR] in
+ def BRCC : InstBR<(outs), (ins BrTarget:$addr, CCOp:$DDDI),
+ "b$DDDI\t$addr",
+ [(LanaiBrCC bb:$addr, imm:$DDDI)]>;
+
+ let isIndirectBranch = 1 in {
+ def JR : InstRR<0b101, (outs), (ins GPR:$Rs2), "bt\t$Rs2",
+ [(brind GPR:$Rs2)]> {
+ let Rs1 = R0.Num;
+ let Rd = R2.Num;
+ let F = 0;
+ let JJJJJ = 0;
+ let DDDI = 0;
+ }
+ }
+}
+
+// -------------------------------------------------- //
+// Condition/SF instructions
+// -------------------------------------------------- //
+
+// Instructions to set flags used in lowering comparisons.
+multiclass SF<bits<3> op2Val, string AsmStr> {
+ let F = 1, Rd = R0.Num, JJJJJ = 0, Defs = [SR], DDDI = 0 in
+ def _RR : InstRR<op2Val, (outs), (ins GPR:$Rs1, GPR:$Rs2),
+ !strconcat(AsmStr, "\t$Rs1, $Rs2, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), (i32 GPR:$Rs2))]>;
+ let F = 1, Rd = R0.Num, H = 0, Defs = [SR] in
+ def _RI_LO : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32lo16z:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), i32lo16z:$imm16)]>;
+ let F = 1, Rd = R0.Num, H = 1, Defs = [SR] in
+ def _RI_HI : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32hi16:$imm16),
+ !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+ [(LanaiSetFlag (i32 GPR:$Rs1), i32hi16:$imm16)]>;
+}
+let isCodeGenOnly = 1, isCompare = 1 in {
+ defm SFSUB_F : SF<0b010, "sub.f">;
+}
+
+// Jump and link
+let isCall = 1, hasDelaySlot = 1, isCodeGenOnly = 1, Uses = [SP],
+ Defs = [RCA] in {
+ def CALL : Pseudo<(outs), (ins CallTarget:$addr), "", []>;
+ def CALLR : Pseudo<(outs), (ins GPR:$Rs1), "", [(Call GPR:$Rs1)]>;
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ Uses = [RCA] in {
+ def RET : InstRM<0b0, (outs), (ins),
+ "ld\t-4[%fp], %pc ! return",
+ [(RetFlag)]> {
+ let Rd = PC.Num;
+ let Rs1 = FP.Num;
+ let P = 1;
+ let Q = 0;
+ let imm16 = -4;
+
+ // Post encoding is not needed for RET.
+ let PostEncoderMethod = "";
+ }
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SP.
+let Defs = [SP], Uses = [SP] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN $amt",
+ [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
+ [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+}
+
+let Defs = [SP], Uses = [SP] in {
+ def ADJDYNALLOC : Pseudo<(outs GPR:$dst), (ins GPR:$src),
+ "#ADJDYNALLOC $dst $src",
+ [(set GPR:$dst, (LanaiAdjDynAlloc GPR:$src))]>;
+}
+
+let Uses = [SR] in {
+ def SCC : InstSCC<(outs GPR:$Rs1), (ins CCOp:$DDDI),
+ "s$DDDI\t$Rs1",
+ [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
+}
+
+// SCC's output is already 1-bit so and'ing with 1 is redundant.
+def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
+
+// Select with hardware support
+let Uses = [SR], isSelect = 1 in {
+ def SELECT : InstRR<0b111, (outs GPR:$Rd),
+ (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+ "sel.$DDDI $Rs1, $Rs2, $Rd",
+ [(set (i32 GPR:$Rd),
+ (LanaiSelectCC (i32 GPR:$Rs1), (i32 GPR:$Rs2),
+ (imm:$DDDI)))]> {
+ let JJJJJ = 0;
+ let F = 0;
+ }
+}
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1,
+ isIndirectBranch = 1, Uses = [SR] in {
+ def BRIND_CC : InstRR<0b101, (outs), (ins GPR:$Rs1, CCOp:$DDDI),
+ "b$DDDI\t$Rs1", []> {
+ let F = 0;
+ let JJJJJ = 0;
+ let Rd = PC.Num;
+ let Rs2 = R0.Num;
+ }
+
+ def BRIND_CCA : InstRR<0b101, (outs), (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+ "b${DDDI}\t$Rs1 add $Rs2", []> {
+ let F = 0;
+ let Rd = PC.Num;
+ let JJJJJ = 0;
+ }
+}
+
+// TODO: This only considers the case where BROFF is an immediate and not where
+// it is a register. Add support for register relative branching.
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1, Rs1 = 0,
+ Uses = [SR] in
+ def BRR : InstBRR<(outs), (ins i16imm:$imm16, CCOp:$DDDI),
+ "b${DDDI}.r\t$imm16", []>;
+
+let F = 0 in {
+// Population Count (POPC)
+def POPC: InstSpecial<0b001, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "popc\t$Rs1, $Rd",
+ [(set GPR:$Rd, (ctpop GPR:$Rs1))]>;
+
+// Count Leading Zeros (LEADZ)
+def LEADZ: InstSpecial<0b010, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "leadz\t$Rs1, $Rd", [(set GPR:$Rd, (ctlz GPR:$Rs1))]>;
+
+// Count Trailing Zeros (TRAILZ)
+def TRAILZ : InstSpecial<0b011, (outs GPR:$Rd), (ins GPR:$Rs1),
+ "trailz\t$Rs1, $Rd",
+ [(set GPR:$Rd, (cttz GPR:$Rs1))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// i32 0 and R0 can be used interchangeably.
+def : Pat<(i32 0), (i32 R0)>;
+// i32 -1 and R1 can be used interchangeably.
+def : Pat<(i32 -1), (i32 R1)>;
+
+// unsigned 16-bit immediate
+def : Pat<(i32 i32lo16z:$imm), (OR_I_LO (i32 R0), imm:$imm)>;
+
+// arbitrary immediate
+def : Pat<(i32 imm:$imm), (OR_I_LO (MOVHI (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Calls
+def : Pat<(Call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(Call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Loads
+def : Pat<(extloadi8 ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
+def : Pat<(extloadi16 ADDRspls:$src), (i32 (LDHz_RI ADDRspls:$src))>;
+
+// GlobalAddress, ExternalSymbol, Jumptable, ConstantPool
+def : Pat<(LanaiHi tglobaladdr:$dst), (MOVHI tglobaladdr:$dst)>;
+def : Pat<(LanaiLo tglobaladdr:$dst), (OR_I_LO (i32 R0), tglobaladdr:$dst)>;
+def : Pat<(LanaiSmall tglobaladdr:$dst), (SLI tglobaladdr:$dst)>;
+def : Pat<(LanaiHi texternalsym:$dst), (MOVHI texternalsym:$dst)>;
+def : Pat<(LanaiLo texternalsym:$dst), (OR_I_LO (i32 R0), texternalsym:$dst)>;
+def : Pat<(LanaiSmall texternalsym:$dst), (SLI texternalsym:$dst)>;
+def : Pat<(LanaiHi tblockaddress:$dst), (MOVHI tblockaddress:$dst)>;
+def : Pat<(LanaiLo tblockaddress:$dst), (OR_I_LO (i32 R0), tblockaddress:$dst)>;
+def : Pat<(LanaiSmall tblockaddress:$dst), (SLI tblockaddress:$dst)>;
+def : Pat<(LanaiHi tjumptable:$dst), (MOVHI tjumptable:$dst)>;
+def : Pat<(LanaiLo tjumptable:$dst), (OR_I_LO (i32 R0), tjumptable:$dst)>;
+def : Pat<(LanaiSmall tjumptable:$dst), (SLI tjumptable:$dst)>;
+def : Pat<(LanaiHi tconstpool:$dst), (MOVHI tconstpool:$dst)>;
+def : Pat<(LanaiLo tconstpool:$dst), (OR_I_LO (i32 R0), tconstpool:$dst)>;
+def : Pat<(LanaiSmall tconstpool:$dst), (SLI tconstpool:$dst)>;
+
+def : Pat<(or GPR:$hi, (LanaiLo tglobaladdr:$lo)),
+ (OR_I_LO GPR:$hi, tglobaladdr:$lo)>;
+def : Pat<(or R0, (LanaiSmall tglobaladdr:$small)),
+ (SLI tglobaladdr:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo texternalsym:$lo)),
+ (OR_I_LO GPR:$hi, texternalsym:$lo)>;
+def : Pat<(or R0, (LanaiSmall texternalsym:$small)),
+ (SLI texternalsym:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tblockaddress:$lo)),
+ (OR_I_LO GPR:$hi, tblockaddress:$lo)>;
+def : Pat<(or R0, (LanaiSmall tblockaddress:$small)),
+ (SLI tblockaddress:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tjumptable:$lo)),
+ (OR_I_LO GPR:$hi, tjumptable:$lo)>;
+def : Pat<(or R0, (LanaiSmall tjumptable:$small)),
+ (SLI tjumptable:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tconstpool:$lo)),
+ (OR_I_LO GPR:$hi, tconstpool:$lo)>;
+def : Pat<(or R0, (LanaiSmall tconstpool:$small)),
+ (SLI tconstpool:$small)>;
diff --git a/lib/Target/Lanai/LanaiMCInstLower.cpp b/lib/Target/Lanai/LanaiMCInstLower.cpp
new file mode 100644
index 000000000000..6c809b43f7ed
--- /dev/null
+++ b/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -0,0 +1,140 @@
+//=-- LanaiMCInstLower.cpp - Convert Lanai MachineInstr to an MCInst --------=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Lanai MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCInstLower.h"
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSymbol *
+LanaiMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetBlockAddressSymbol(const MachineOperand &MO) const {
+ return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *LanaiMCInstLower::GetJumpTableSymbol(const MachineOperand &MO) const {
+ SmallString<256> Name;
+ raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+ << Printer.getFunctionNumber() << '_'
+ << MO.getIndex();
+ // Create a symbol for the name.
+ return Ctx.getOrCreateSymbol(Name.str());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+ SmallString<256> Name;
+ raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+ << Printer.getFunctionNumber() << '_'
+ << MO.getIndex();
+ // Create a symbol for the name.
+ return Ctx.getOrCreateSymbol(Name.str());
+}
+
+MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ LanaiMCExpr::VariantKind Kind;
+
+ switch (MO.getTargetFlags()) {
+ case LanaiII::MO_NO_FLAG:
+ Kind = LanaiMCExpr::VK_Lanai_None;
+ break;
+ case LanaiII::MO_ABS_HI:
+ Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+ break;
+ case LanaiII::MO_ABS_LO:
+ Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+ break;
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ }
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ Expr = LanaiMCExpr::create(Kind, Expr, Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
+void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ continue;
+ MCOp = MCOperand::createReg(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_RegisterMask:
+ continue;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+ break;
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+}
diff --git a/lib/Target/Lanai/LanaiMCInstLower.h b/lib/Target/Lanai/LanaiMCInstLower.h
new file mode 100644
index 000000000000..41c0766e86da
--- /dev/null
+++ b/lib/Target/Lanai/LanaiMCInstLower.h
@@ -0,0 +1,48 @@
+//===-- LanaiMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// LanaiMCInstLower - This class is used to lower a MachineInstr into an
+// MCInst.
+class LLVM_LIBRARY_VISIBILITY LanaiMCInstLower {
+ MCContext &Ctx;
+
+ AsmPrinter &Printer;
+
+public:
+ LanaiMCInstLower(MCContext &CTX, Mangler & /*Mang*/, AsmPrinter &AP)
+ : Ctx(CTX), Printer(AP) {}
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..c72271b67790
--- /dev/null
+++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
@@ -0,0 +1,23 @@
+//===-- LanaiMachineFunctionInfo.cpp - Lanai machine function info --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void LanaiMachineFunctionInfo::anchor() {}
+
+unsigned LanaiMachineFunctionInfo::getGlobalBaseReg() {
+ // Return if it has already been initialized.
+ if (GlobalBaseReg)
+ return GlobalBaseReg;
+
+ return GlobalBaseReg =
+ MF.getRegInfo().createVirtualRegister(&Lanai::GPRRegClass);
+}
diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/lib/Target/Lanai/LanaiMachineFunctionInfo.h
new file mode 100644
index 000000000000..3bd9112a9e13
--- /dev/null
+++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.h
@@ -0,0 +1,58 @@
+//===- LanaiMachineFunctionInfo.h - Lanai machine func info ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Lanai-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+// LanaiMachineFunctionInfo - This class is derived from MachineFunctionInfo
+// and contains private Lanai target-specific information for each
+// MachineFunction.
+class LanaiMachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ MachineFunction &MF;
+
+ // SRetReturnReg - The Lanai ABI requires that sret lowering include
+ // returning the address of the returned struct in a register. This field
+ // holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+ // GlobalBaseReg - Keeps track of the virtual register initialized for use
+ // as the global base register. This is used in some PIC relocation models.
+ unsigned GlobalBaseReg;
+
+ // VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+
+public:
+ explicit LanaiMachineFunctionInfo(MachineFunction &MF)
+ : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0) {}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg();
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
new file mode 100644
index 000000000000..c5a46143ee56
--- /dev/null
+++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -0,0 +1,422 @@
+//===-- LanaiMemAluCombiner.cpp - Pass to combine memory & ALU operations -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Simple pass to combine memory and ALU operations
+//
+// The Lanai ISA supports instructions where a load/store modifies the base
+// register used in the load/store operation. This pass finds suitable
+// load/store and ALU instructions and combines them into one instruction.
+//
+// For example,
+// ld [ %r6 -- ], %r12
+// is a supported instruction that is not currently generated by the instruction
+// selection pass of this backend. This pass generates these instructions by
+// merging
+// add %r6, -4, %r6
+// followed by
+// ld [ %r6 ], %r12
+// in the same machine basic block into one machine instruction.
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define GET_INSTRMAP_INFO
+#include "LanaiGenInstrInfo.inc"
+
+#define DEBUG_TYPE "lanai-mem-alu-combiner"
+
+STATISTIC(NumLdStAluCombined, "Number of memory and ALU instructions combined");
+
+static llvm::cl::opt<bool> DisableMemAluCombiner(
+ "disable-lanai-mem-alu-combiner", llvm::cl::init(false),
+ llvm::cl::desc("Do not combine ALU and memory operators"),
+ llvm::cl::Hidden);
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+typedef MachineBasicBlock::iterator MbbIterator;
+typedef MachineFunction::iterator MfIterator;
+
+class LanaiMemAluCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+ explicit LanaiMemAluCombiner() : MachineFunctionPass(ID) {
+ initializeLanaiMemAluCombinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ const char *getPassName() const override {
+ return "Lanai load / store optimization pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+private:
+ MbbIterator findClosestSuitableAluInstr(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ bool Decrement);
+ void insertMergedInstruction(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ const MbbIterator &AluInstr, bool Before);
+ bool combineMemAluInBasicBlock(MachineBasicBlock *BB);
+
+ // Target instruction information, used to build the merged load/store + ALU
+ // instructions.
+ const TargetInstrInfo *TII;
+};
+} // namespace
+
+char LanaiMemAluCombiner::ID = 0;
+
+INITIALIZE_PASS(LanaiMemAluCombiner, DEBUG_TYPE,
+ "Lanai memory ALU combiner pass", false, false)
+
+namespace {
+bool isSpls(uint16_t Opcode) { return Lanai::splsIdempotent(Opcode) == Opcode; }
+
+// Determine the opcode for the merged instruction created by considering the
+// old memory operation's opcode and whether the merged opcode will have an
+// immediate offset.
+unsigned mergedOpcode(unsigned OldOpcode, bool ImmediateOffset) {
+ switch (OldOpcode) {
+ case Lanai::LDW_RI:
+ case Lanai::LDW_RR:
+ if (ImmediateOffset)
+ return Lanai::LDW_RI;
+ return Lanai::LDW_RR;
+ case Lanai::LDHs_RI:
+ case Lanai::LDHs_RR:
+ if (ImmediateOffset)
+ return Lanai::LDHs_RI;
+ return Lanai::LDHs_RR;
+ case Lanai::LDHz_RI:
+ case Lanai::LDHz_RR:
+ if (ImmediateOffset)
+ return Lanai::LDHz_RI;
+ return Lanai::LDHz_RR;
+ case Lanai::LDBs_RI:
+ case Lanai::LDBs_RR:
+ if (ImmediateOffset)
+ return Lanai::LDBs_RI;
+ return Lanai::LDBs_RR;
+ case Lanai::LDBz_RI:
+ case Lanai::LDBz_RR:
+ if (ImmediateOffset)
+ return Lanai::LDBz_RI;
+ return Lanai::LDBz_RR;
+ case Lanai::SW_RI:
+ case Lanai::SW_RR:
+ if (ImmediateOffset)
+ return Lanai::SW_RI;
+ return Lanai::SW_RR;
+ case Lanai::STB_RI:
+ case Lanai::STB_RR:
+ if (ImmediateOffset)
+ return Lanai::STB_RI;
+ return Lanai::STB_RR;
+ case Lanai::STH_RI:
+ case Lanai::STH_RR:
+ if (ImmediateOffset)
+ return Lanai::STH_RI;
+ return Lanai::STH_RR;
+ default:
+ return 0;
+ }
+}
+
+// Check if the machine instruction has a single, non-volatile memory operand
+// of a type supported for combining with ALU instructions.
+bool isNonVolatileMemoryOp(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ // Determine if the machine instruction is a supported memory operation by
+ // testing if the computed merge opcode is a valid memory operation opcode.
+ if (mergedOpcode(MI.getOpcode(), false) == 0)
+ return false;
+
+ const MachineMemOperand *MemOperand = *MI.memoperands_begin();
+
+ // Don't move volatile memory accesses
+ if (MemOperand->isVolatile())
+ return false;
+
+ return true;
+}
+
+// Test whether two machine operands refer to the same register or the same
+// immediate value. This test is less strict than MachineOperand::isIdenticalTo.
+bool isSameOperand(const MachineOperand &Op1, const MachineOperand &Op2) {
+ if (Op1.getType() != Op2.getType())
+ return false;
+
+ switch (Op1.getType()) {
+ case MachineOperand::MO_Register:
+ return Op1.getReg() == Op2.getReg();
+ case MachineOperand::MO_Immediate:
+ return Op1.getImm() == Op2.getImm();
+ default:
+ return false;
+ }
+}
+
+bool isZeroOperand(const MachineOperand &Op) {
+ return ((Op.isReg() && Op.getReg() == Lanai::R0) ||
+ (Op.isImm() && Op.getImm() == 0));
+}
+
+// Determines whether a register is used by an instruction.
+bool InstrUsesReg(const MbbIterator &Instr, const MachineOperand *Reg) {
+ for (MachineInstr::const_mop_iterator Mop = Instr->operands_begin();
+ Mop != Instr->operands_end(); ++Mop) {
+ if (isSameOperand(*Mop, *Reg))
+ return true;
+ }
+ return false;
+}
+
+// Converts a machine opcode to the corresponding AluCode.
+// ALU operations that use or modify the flags should not be considered for
+// merging and are omitted from this list.
+LPAC::AluCode mergedAluCode(unsigned AluOpcode) {
+ switch (AluOpcode) {
+ case Lanai::ADD_I_LO:
+ case Lanai::ADD_R:
+ return LPAC::ADD;
+ case Lanai::SUB_I_LO:
+ case Lanai::SUB_R:
+ return LPAC::SUB;
+ case Lanai::AND_I_LO:
+ case Lanai::AND_R:
+ return LPAC::AND;
+ case Lanai::OR_I_LO:
+ case Lanai::OR_R:
+ return LPAC::OR;
+ case Lanai::XOR_I_LO:
+ case Lanai::XOR_R:
+ return LPAC::XOR;
+ case Lanai::SHL_R:
+ return LPAC::SHL;
+ case Lanai::SRL_R:
+ return LPAC::SRL;
+ case Lanai::SRA_R:
+ return LPAC::SRA;
+ case Lanai::SA_I:
+ case Lanai::SL_I:
+ default:
+ return LPAC::UNKNOWN;
+ }
+}
+
+// Insert a new combined memory and ALU operation instruction.
+//
+// This function builds a new machine instruction using the MachineInstrBuilder
+// class and inserts it before the memory instruction.
+void LanaiMemAluCombiner::insertMergedInstruction(MachineBasicBlock *BB,
+ const MbbIterator &MemInstr,
+ const MbbIterator &AluInstr,
+ bool Before) {
+ // Insert new combined load/store + alu operation
+ MachineOperand Dest = MemInstr->getOperand(0);
+ MachineOperand Base = MemInstr->getOperand(1);
+ MachineOperand MemOffset = MemInstr->getOperand(2);
+ MachineOperand AluOffset = AluInstr->getOperand(2);
+
+ // Abort if ALU offset is not a register or immediate
+ assert((AluOffset.isReg() || AluOffset.isImm()) &&
+ "Unsupported operand type in merge");
+
+ // Determine the merged instruction's opcode and ALU code
+ LPAC::AluCode AluOpcode = mergedAluCode(AluInstr->getOpcode());
+ unsigned NewOpc = mergedOpcode(MemInstr->getOpcode(), AluOffset.isImm());
+
+ assert(AluOpcode != LPAC::UNKNOWN && "Unknown ALU code in merging");
+ assert(NewOpc != 0 && "Unknown merged node opcode");
+
+ // Build and insert new machine instruction
+ MachineInstrBuilder InstrBuilder =
+ BuildMI(*BB, MemInstr, MemInstr->getDebugLoc(), TII->get(NewOpc));
+ InstrBuilder.addReg(Dest.getReg(), getDefRegState(true));
+ InstrBuilder.addReg(Base.getReg(), getKillRegState(true));
+
+ // Add offset to machine instruction
+ if (AluOffset.isReg())
+ InstrBuilder.addReg(AluOffset.getReg());
+ else if (AluOffset.isImm())
+ InstrBuilder.addImm(AluOffset.getImm());
+ else
+ llvm_unreachable("Unsupported ld/st ALU merge.");
+
+ // Create a pre-op if the ALU operation preceded the memory operation or the
+ // MemOffset is non-zero (i.e. the base register should be adjusted before
+ // the memory access), else create a post-op.
+ if (Before || !isZeroOperand(MemOffset))
+ InstrBuilder.addImm(LPAC::makePreOp(AluOpcode));
+ else
+ InstrBuilder.addImm(LPAC::makePostOp(AluOpcode));
+
+ // Transfer memory operands.
+ InstrBuilder->setMemRefs(MemInstr->memoperands_begin(),
+ MemInstr->memoperands_end());
+}
+
+// Determines whether the ALU operation referenced by AluIter can be combined
+// with a load/store that uses the given base and offset operands.
+bool isSuitableAluInstr(bool IsSpls, const MbbIterator &AluIter,
+ const MachineOperand &Base,
+ const MachineOperand &Offset) {
+ // ALU operations have 3 operands
+ if (AluIter->getNumOperands() != 3)
+ return false;
+
+ MachineOperand &Dest = AluIter->getOperand(0);
+ MachineOperand &Op1 = AluIter->getOperand(1);
+ MachineOperand &Op2 = AluIter->getOperand(2);
+
+ // Only match instructions whose destination is the base register and whose
+ // first operand equals the destination.
+ if (!isSameOperand(Dest, Base) || !isSameOperand(Dest, Op1))
+ return false;
+
+ if (Op2.isImm()) {
+ // It is not a match if the 2nd operand in the ALU operation is an
+ // immediate but the ALU operation is not an addition.
+ if (AluIter->getOpcode() != Lanai::ADD_I_LO)
+ return false;
+
+ if (Offset.isReg() && Offset.getReg() == Lanai::R0)
+ return true;
+
+ if (Offset.isImm() &&
+ ((Offset.getImm() == 0 &&
+ // Check that the Op2 would fit in the immediate field of the
+ // memory operation.
+ ((IsSpls && isInt<10>(Op2.getImm())) ||
+ (!IsSpls && isInt<16>(Op2.getImm())))) ||
+ Offset.getImm() == Op2.getImm()))
+ return true;
+ } else if (Op2.isReg()) {
+ // The Offset and 2nd operand are both registers and equal
+ if (Offset.isReg() && Op2.getReg() == Offset.getReg())
+ return true;
+ } else
+ // Only consider operations with register or immediate values
+ return false;
+
+ return false;
+}
+
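+// Scan backwards (Decrement == true) or forwards from MemInstr within its
+// basic block for the closest ALU instruction that can be merged with it.
+// The search stops once the base or offset register is used by another
+// instruction. Returns MemInstr itself if no suitable ALU instruction is
+// found.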
+MbbIterator LanaiMemAluCombiner::findClosestSuitableAluInstr(
+ MachineBasicBlock *BB, const MbbIterator &MemInstr, const bool Decrement) {
+ MachineOperand *Base = &MemInstr->getOperand(1);
+ MachineOperand *Offset = &MemInstr->getOperand(2);
+ bool IsSpls = isSpls(MemInstr->getOpcode());
+
+ MbbIterator First = MemInstr;
+ MbbIterator Last = Decrement ? BB->begin() : BB->end();
+
+ while (First != Last) {
+ Decrement ? --First : ++First;
+
+ // Skip over debug instructions
+ if (First->isDebugValue())
+ continue;
+
+ if (isSuitableAluInstr(IsSpls, First, *Base, *Offset)) {
+ return First;
+ }
+
+ // Any other use of the base or offset register makes merging unsafe, so
+ // stop searching.
+ if (First != Last) {
+ if (InstrUsesReg(First, Base))
+ break;
+ if (Offset->isReg() && InstrUsesReg(First, Offset))
+ break;
+ }
+ }
+
+ return MemInstr;
+}
+
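+// For every mergeable, non-volatile load/store in the block, search first
+// before and then after it for a matching ALU instruction; if one is found,
+// emit the combined instruction and erase both originals.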
+bool LanaiMemAluCombiner::combineMemAluInBasicBlock(MachineBasicBlock *BB) {
+ bool Modified = false;
+
+ MbbIterator MBBIter = BB->begin(), End = BB->end();
+ while (MBBIter != End) {
+ bool IsMemOp = isNonVolatileMemoryOp(*MBBIter);
+
+ if (IsMemOp) {
+ MachineOperand AluOperand = MBBIter->getOperand(3);
+ unsigned int DestReg = MBBIter->getOperand(0).getReg(),
+ BaseReg = MBBIter->getOperand(1).getReg();
+ assert(AluOperand.isImm() && "Unexpected memory operator type");
+ LPAC::AluCode AluOpcode = static_cast<LPAC::AluCode>(AluOperand.getImm());
+
+ // Skip memory operations that already modify the base register or if
+ // the destination and base register are the same
+ if (!LPAC::modifiesOp(AluOpcode) && DestReg != BaseReg) {
+ for (int Inc = 0; Inc <= 1; ++Inc) {
+ MbbIterator AluIter =
+ findClosestSuitableAluInstr(BB, MBBIter, Inc == 0);
+ if (AluIter != MBBIter) {
+ insertMergedInstruction(BB, MBBIter, AluIter, Inc == 0);
+
+ ++NumLdStAluCombined;
+ Modified = true;
+
+ // Erase the matching ALU instruction
+ BB->erase(AluIter);
+ // Erase old load/store instruction
+ BB->erase(MBBIter++);
+ break;
+ }
+ }
+ }
+ }
+ if (MBBIter == End)
+ break;
+ ++MBBIter;
+ }
+
+ return Modified;
+}
+
+// Driver function that iterates over the machine basic blocks of a machine
+// function.
+bool LanaiMemAluCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (DisableMemAluCombiner)
+ return false;
+
+ TII = MF.getSubtarget<LanaiSubtarget>().getInstrInfo();
+ bool Modified = false;
+ for (MfIterator MFI = MF.begin(); MFI != MF.end(); ++MFI) {
+ Modified |= combineMemAluInBasicBlock(&*MFI);
+ }
+ return Modified;
+}
+} // namespace
+
+FunctionPass *llvm::createLanaiMemAluCombinerPass() {
+ return new LanaiMemAluCombiner();
+}
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp
new file mode 100644
index 000000000000..a4c612258e7b
--- /dev/null
+++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -0,0 +1,287 @@
+//===-- LanaiRegisterInfo.cpp - Lanai Register Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiRegisterInfo.h"
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+LanaiRegisterInfo::LanaiRegisterInfo() : LanaiGenRegisterInfo(Lanai::RCA) {}
+
+const uint16_t *
+LanaiRegisterInfo::getCalleeSavedRegs(const MachineFunction * /*MF*/) const {
+ return CSR_SaveList;
+}
+
+BitVector LanaiRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+
+ Reserved.set(Lanai::R0);
+ Reserved.set(Lanai::R1);
+ Reserved.set(Lanai::PC);
+ Reserved.set(Lanai::R2);
+ Reserved.set(Lanai::SP);
+ Reserved.set(Lanai::R4);
+ Reserved.set(Lanai::FP);
+ Reserved.set(Lanai::R5);
+ Reserved.set(Lanai::RR1);
+ Reserved.set(Lanai::R10);
+ Reserved.set(Lanai::RR2);
+ Reserved.set(Lanai::R11);
+ Reserved.set(Lanai::RCA);
+ Reserved.set(Lanai::R15);
+ if (hasBasePointer(MF))
+ Reserved.set(getBaseRegister());
+ return Reserved;
+}
+
+bool LanaiRegisterInfo::requiresRegisterScavenging(
+ const MachineFunction & /*MF*/) const {
+ return true;
+}
+
+bool LanaiRegisterInfo::trackLivenessAfterRegAlloc(
+ const MachineFunction & /*MF*/) const {
+ return true;
+}
+
+static bool isALUArithLoOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::ADD_I_LO:
+ case Lanai::SUB_I_LO:
+ case Lanai::ADD_F_I_LO:
+ case Lanai::SUB_F_I_LO:
+ case Lanai::ADDC_I_LO:
+ case Lanai::SUBB_I_LO:
+ case Lanai::ADDC_F_I_LO:
+ case Lanai::SUBB_F_I_LO:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static unsigned getOppositeALULoOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::ADD_I_LO:
+ return Lanai::SUB_I_LO;
+ case Lanai::SUB_I_LO:
+ return Lanai::ADD_I_LO;
+ case Lanai::ADD_F_I_LO:
+ return Lanai::SUB_F_I_LO;
+ case Lanai::SUB_F_I_LO:
+ return Lanai::ADD_F_I_LO;
+ case Lanai::ADDC_I_LO:
+ return Lanai::SUBB_I_LO;
+ case Lanai::SUBB_I_LO:
+ return Lanai::ADDC_I_LO;
+ case Lanai::ADDC_F_I_LO:
+ return Lanai::SUBB_F_I_LO;
+ case Lanai::SUBB_F_I_LO:
+ return Lanai::ADDC_F_I_LO;
+ default:
+ llvm_unreachable("Invalid ALU lo opcode");
+ }
+}
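
A minimal sketch, built only on the two helpers above, of the fold that eliminateFrameIndex applies further down: the *_I_LO forms take an unsigned immediate, so a negative frame offset is handled by negating the immediate and switching to the opposite opcode. foldSignedOffset is a hypothetical name, not part of the patch.

static void foldSignedOffset(unsigned Opcode, int Offset, unsigned &NewOpcode,
                             unsigned &NewImm) {
  if (Offset < 0 && isALUArithLoOpcode(Opcode)) {
    NewOpcode = getOppositeALULoOpcode(Opcode); // e.g. ADD_I_LO -> SUB_I_LO
    NewImm = static_cast<unsigned>(-Offset);    // immediate becomes positive
  } else {
    NewOpcode = Opcode;
    NewImm = static_cast<unsigned>(Offset);
  }
}
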
+
+static unsigned getRRMOpcodeVariant(unsigned Opcode) {
+ switch (Opcode) {
+ case Lanai::LDBs_RI:
+ return Lanai::LDBs_RR;
+ case Lanai::LDBz_RI:
+ return Lanai::LDBz_RR;
+ case Lanai::LDHs_RI:
+ return Lanai::LDHs_RR;
+ case Lanai::LDHz_RI:
+ return Lanai::LDHz_RR;
+ case Lanai::LDW_RI:
+ return Lanai::LDW_RR;
+ case Lanai::STB_RI:
+ return Lanai::STB_RR;
+ case Lanai::STH_RI:
+ return Lanai::STH_RR;
+ case Lanai::SW_RI:
+ return Lanai::SW_RR;
+ default:
+ llvm_unreachable("Opcode has no RRM variant");
+ }
+}
+
+void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ bool HasFP = TFI->hasFP(MF);
+ DebugLoc DL = MI.getDebugLoc();
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MI.getOperand(FIOperandNum + 1).getImm();
+
+ // Addressable stack objects are addressed with negative offsets from the
+ // frame pointer or positive offsets from the stack pointer/base pointer.
+ if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0))
+ Offset += MF.getFrameInfo()->getStackSize();
+
+ unsigned FrameReg = getFrameRegister(MF);
+ if (FrameIndex >= 0) {
+ if (hasBasePointer(MF))
+ FrameReg = getBaseRegister();
+ else if (needsStackRealignment(MF))
+ FrameReg = Lanai::SP;
+ }
+
+ // Replace frame index with a frame pointer reference.
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it.
+ // Otherwise scavenge a register and encode it into a MOVHI, OR_I_LO sequence.
+ if ((isSPLSOpcode(MI.getOpcode()) && !isInt<10>(Offset)) ||
+ !isInt<16>(Offset)) {
+ assert(RS && "Register scavenging must be on");
+ unsigned Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
+ if (!Reg)
+ Reg = RS->scavengeRegister(&Lanai::GPRRegClass, II, SPAdj);
+ assert(Reg && "Register scavenger failed");
+
+ bool HasNegOffset = false;
+ // ALU ops have unsigned immediate values. If the Offset is negative, we
+ // negate it here and reverse the opcode later.
+ if (Offset < 0) {
+ HasNegOffset = true;
+ Offset = -Offset;
+ }
+
+ if (!isInt<16>(Offset)) {
+ // Reg = hi(offset) | lo(offset)
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::MOVHI), Reg)
+ .addImm(static_cast<uint32_t>(Offset) >> 16);
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::OR_I_LO), Reg)
+ .addReg(Reg)
+ .addImm(Offset & 0xffffU);
+ } else {
+ // Reg = mov(offset)
+ BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::ADD_I_LO), Reg)
+ .addImm(0)
+ .addImm(Offset);
+ }
+ // Reg = FrameReg OP Reg
+ if (MI.getOpcode() == Lanai::ADD_I_LO) {
+ BuildMI(*MI.getParent(), II, DL,
+ HasNegOffset ? TII->get(Lanai::SUB_R) : TII->get(Lanai::ADD_R),
+ MI.getOperand(0).getReg())
+ .addReg(FrameReg)
+ .addReg(Reg)
+ .addImm(LPCC::ICC_T);
+ MI.eraseFromParent();
+ return;
+ }
+ if (isSPLSOpcode(MI.getOpcode()) || isRMOpcode(MI.getOpcode())) {
+ MI.setDesc(TII->get(getRRMOpcodeVariant(MI.getOpcode())));
+ if (HasNegOffset) {
+ // Change the ALU op (operand 3) from LPAC::ADD (the default) to
+ // LPAC::SUB with the already negated offset.
+ assert((MI.getOperand(3).getImm() == LPAC::ADD) &&
+ "Unexpected ALU op in RRM instruction");
+ MI.getOperand(3).setImm(LPAC::SUB);
+ }
+ } else
+ llvm_unreachable("Unexpected opcode in frame index operation");
+
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+ MI.getOperand(FIOperandNum + 1)
+ .ChangeToRegister(Reg, /*isDef=*/false, /*isImp=*/false,
+ /*isKill=*/true);
+ return;
+ }
+
+ // ALU arithmetic ops take unsigned immediates. If the offset is negative,
+ // we replace the instruction with one that inverts the opcode and negates
+ // the immediate.
+ if ((Offset < 0) && isALUArithLoOpcode(MI.getOpcode())) {
+ unsigned NewOpcode = getOppositeALULoOpcode(MI.getOpcode());
+ // We know this is an ALU op, so we know the operands are as follows:
+ // 0: destination register
+ // 1: source register (frame register)
+ // 2: immediate
+ BuildMI(*MI.getParent(), II, DL, TII->get(NewOpcode),
+ MI.getOperand(0).getReg())
+ .addReg(FrameReg)
+ .addImm(-Offset);
+ MI.eraseFromParent();
+ } else {
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ }
+}
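
For reference, a small illustration (not part of the patch) of the constant split used above when the offset does not fit a 16-bit immediate: the scavenged register is built as Reg = (Hi << 16) | Lo by MOVHI followed by OR_I_LO. splitFrameOffset is a hypothetical name.

#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitFrameOffset(uint32_t Offset) {
  uint32_t Hi = Offset >> 16;     // immediate for MOVHI
  uint32_t Lo = Offset & 0xffffU; // immediate for OR_I_LO
  return {Hi, Lo};                // Reg ends up holding (Hi << 16) | Lo
}
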
+
+bool LanaiRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // When we need stack realignment and there are dynamic allocas, we can't
+ // reference off of the stack pointer, so we reserve a base pointer.
+ if (needsStackRealignment(MF) && MFI->hasVarSizedObjects())
+ return true;
+
+ return false;
+}
+
+unsigned LanaiRegisterInfo::getRARegister() const { return Lanai::RCA; }
+
+unsigned
+LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
+ return Lanai::FP;
+}
+
+unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; }
+
+bool LanaiRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+ return true;
+}
+
+unsigned LanaiRegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("no exception support");
+ return 0;
+}
+
+unsigned LanaiRegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("no exception support");
+ return 0;
+}
+
+const uint32_t *
+LanaiRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+ CallingConv::ID /*CC*/) const {
+ return CSR_RegMask;
+}
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h
new file mode 100644
index 000000000000..8b84bbc460e8
--- /dev/null
+++ b/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -0,0 +1,63 @@
+//===- LanaiRegisterInfo.h - Lanai Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "LanaiGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct LanaiRegisterInfo : public LanaiGenRegisterInfo {
+ LanaiRegisterInfo();
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ // Code Generation virtual methods.
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+ unsigned getBaseRegister() const;
+ bool hasBasePointer(const MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+ int getDwarfRegNum(unsigned RegNum, bool IsEH) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.td b/lib/Target/Lanai/LanaiRegisterInfo.td
new file mode 100644
index 000000000000..cf8cfe30cce9
--- /dev/null
+++ b/lib/Target/Lanai/LanaiRegisterInfo.td
@@ -0,0 +1,64 @@
+//===- LanaiRegisterInfo.td - Lanai Register defs ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Lanai register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 5-bit ID numbers.
+class LanaiReg<bits<5> num, string n, list<Register> subregs = [],
+ list<string> altNames = []> : Register<n, altNames> {
+ field bits<5> Num;
+ let Num = num;
+ let Namespace = "Lanai";
+ let SubRegs = subregs;
+}
+
+let Namespace = "Lanai" in {
+ def sub_32 : SubRegIndex<32>;
+}
+
+// Integer registers
+foreach i = 0-31 in {
+ def R#i : LanaiReg<i, "r"#i>, DwarfRegNum<[i]>;
+}
+
+// Register aliases
+let SubRegIndices = [sub_32] in {
+ def PC : LanaiReg< 2, "pc", [R2]>, DwarfRegAlias<R2>;
+ def SP : LanaiReg< 4, "sp", [R4]>, DwarfRegAlias<R4>;
+ def FP : LanaiReg< 5, "fp", [R5]>, DwarfRegAlias<R5>;
+ def RV : LanaiReg< 8, "rv", [R8]>, DwarfRegAlias<R8>;
+ def RR1 : LanaiReg<10, "rr1", [R10]>, DwarfRegAlias<R10>;
+ def RR2 : LanaiReg<11, "rr2", [R11]>, DwarfRegAlias<R11>;
+ def RCA : LanaiReg<15, "rca", [R15]>, DwarfRegAlias<R15>;
+}
+
+// Define a status register to capture the dependencies between the set flag
+// and setcc instructions
+def SR : LanaiReg< 0, "sw">;
+
+// Register classes.
+def GPR : RegisterClass<"Lanai", [i32], 32,
+ (add R3, R9, R12, R13, R14, R16, R17,
+ (sequence "R%i", 20, 31),
+ R6, R7, R18, R19, // registers for passing arguments
+ R15, RCA, // register for constant addresses
+ R10, RR1, R11, RR2, // programmer controlled registers
+ R8, RV, // return value
+ R5, FP, // frame pointer
+ R4, SP, // stack pointer
+ R2, PC, // program counter
+ R1, // all 1s (0xffffffff)
+ R0 // constant 0
+ )>;
+
+// Condition code register class
+def CCR : RegisterClass<"Lanai", [i32], 32, (add SR)> {
+ let CopyCost = -1; // Don't allow copying of status registers
+ let isAllocatable = 0;
+}
diff --git a/lib/Target/Lanai/LanaiSchedule.td b/lib/Target/Lanai/LanaiSchedule.td
new file mode 100644
index 000000000000..7f931c4be8bb
--- /dev/null
+++ b/lib/Target/Lanai/LanaiSchedule.td
@@ -0,0 +1,70 @@
+//=-LanaiSchedule.td - Lanai Scheduling Definitions --*- tablegen -*-=========//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def ALU_FU : FuncUnit;
+def LDST_FU : FuncUnit;
+
+def IIC_ALU : InstrItinClass;
+def IIC_LD : InstrItinClass;
+def IIC_ST : InstrItinClass;
+def IIC_LDSW : InstrItinClass;
+def IIC_STSW : InstrItinClass;
+
+def LanaiItinerary : ProcessorItineraries<[ALU_FU, LDST_FU],[],[
+ InstrItinData<IIC_LD, [InstrStage<1, [LDST_FU]>]>,
+ InstrItinData<IIC_ST, [InstrStage<1, [LDST_FU]>]>,
+ InstrItinData<IIC_LDSW, [InstrStage<2, [LDST_FU]>]>,
+ InstrItinData<IIC_STSW, [InstrStage<2, [LDST_FU]>]>,
+ InstrItinData<IIC_ALU, [InstrStage<1, [ALU_FU]>]>
+]>;
+
+def LanaiSchedModel : SchedMachineModel {
+ // Cycles for loads to access the cache [default = -1]
+ let LoadLatency = 2;
+
+ // Max micro-ops that can be buffered for optimized loop dispatch/execution.
+ // [default = -1]
+ let LoopMicroOpBufferSize = 0;
+
+ // Allow scheduler to assign default model to any unrecognized opcodes.
+ // [default = 1]
+ let CompleteModel = 0;
+
+ // Max micro-ops that may be scheduled per cycle. [default = 1]
+ let IssueWidth = 1;
+
+ // Extra cycles for a mispredicted branch. [default = -1]
+ let MispredictPenalty = 10;
+
+ // Enable Post RegAlloc Scheduler pass. [default = 0]
+ let PostRAScheduler = 0;
+
+ // Max micro-ops that can be buffered. [default = -1]
+ let MicroOpBufferSize = 0;
+
+ // Per-cycle resources tables. [default = NoItineraries]
+ let Itineraries = LanaiItinerary;
+}
+
+def ALU : ProcResource<1> { let BufferSize = 0; }
+def LdSt : ProcResource<1> { let BufferSize = 0; }
+
+def WriteLD : SchedWrite;
+def WriteST : SchedWrite;
+def WriteLDSW : SchedWrite;
+def WriteSTSW : SchedWrite;
+def WriteALU : SchedWrite;
+
+let SchedModel = LanaiSchedModel in {
+ def : WriteRes<WriteLD, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteST, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteLDSW, [LdSt]> { let Latency = 2; }
+ def : WriteRes<WriteSTSW, [LdSt]> { let Latency = 4; }
+ def : WriteRes<WriteALU, [ALU]> { let Latency = 1; }
+}
diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..b71c30fe3e05
--- /dev/null
+++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
@@ -0,0 +1,35 @@
+//===-- LanaiSelectionDAGInfo.cpp - Lanai SelectionDAG Info -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSelectionDAGInfo.h"
+
+#include "LanaiTargetMachine.h"
+
+#define DEBUG_TYPE "lanai-selectiondag-info"
+
+namespace llvm {
+
+SDValue LanaiSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG & /*DAG*/, const SDLoc & /*dl*/, SDValue /*Chain*/,
+ SDValue /*Dst*/, SDValue /*Src*/, SDValue Size, unsigned /*Align*/,
+ bool /*isVolatile*/, bool /*AlwaysInline*/,
+ MachinePointerInfo /*DstPtrInfo*/,
+ MachinePointerInfo /*SrcPtrInfo*/) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+
+ // Custom lowering of constant-size memcpy is not implemented yet; returning
+ // an empty SDValue falls back to the generic expansion.
+ return SDValue();
+}
+
+} // namespace llvm
diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/lib/Target/Lanai/LanaiSelectionDAGInfo.h
new file mode 100644
index 000000000000..bfd2be2ede09
--- /dev/null
+++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- LanaiSelectionDAGInfo.h - Lanai SelectionDAG Info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Lanai subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class LanaiSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+ LanaiSelectionDAGInfo() = default;
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
diff --git a/lib/Target/Lanai/LanaiSubtarget.cpp b/lib/Target/Lanai/LanaiSubtarget.cpp
new file mode 100644
index 000000000000..0fa5e82a7a66
--- /dev/null
+++ b/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -0,0 +1,47 @@
+//===- LanaiSubtarget.cpp - Lanai Subtarget Information -----------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSubtarget.h"
+
+#include "Lanai.h"
+
+#define DEBUG_TYPE "lanai-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "LanaiGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void LanaiSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ ParseSubtargetFeatures(CPUName, FS);
+}
+
+LanaiSubtarget &LanaiSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
+LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions & /*Options*/,
+ CodeModel::Model /*CodeModel*/,
+ CodeGenOpt::Level /*OptLevel*/)
+ : LanaiGenSubtargetInfo(TargetTriple, Cpu, FeatureString),
+ FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
+ InstrInfo(), TLInfo(TM, *this), TSInfo() {}
diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h
new file mode 100644
index 000000000000..2732ef3097ec
--- /dev/null
+++ b/lib/Target/Lanai/LanaiSubtarget.h
@@ -0,0 +1,76 @@
+//=====-- LanaiSubtarget.h - Define Subtarget for the Lanai -----*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+#define LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "LanaiGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class LanaiSubtarget : public LanaiGenSubtargetInfo {
+public:
+ // This constructor initializes the data members to match that
+ // of the specified triple.
+ LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions &Options, CodeModel::Model CodeModel,
+ CodeGenOpt::Level OptLevel);
+
+ // ParseSubtargetFeatures - Parses the feature string, setting the specified
+ // subtarget options. The definition of this function is auto-generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ LanaiSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ bool enableMachineScheduler() const override { return true; }
+
+ const LanaiInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const LanaiRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const LanaiTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const LanaiSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+private:
+ LanaiFrameLowering FrameLowering;
+ LanaiInstrInfo InstrInfo;
+ LanaiTargetLowering TLInfo;
+ LanaiSelectionDAGInfo TSInfo;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
new file mode 100644
index 000000000000..b1f4b496eb9e
--- /dev/null
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -0,0 +1,112 @@
+//===-- LanaiTargetMachine.cpp - Define TargetMachine for Lanai ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Lanai target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetMachine.h"
+
+#include "Lanai.h"
+#include "LanaiTargetObjectFile.h"
+#include "LanaiTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+extern "C" void LLVMInitializeLanaiTarget() {
+ // Register the target.
+ RegisterTargetMachine<LanaiTargetMachine> registered_target(TheLanaiTarget);
+}
+
+static std::string computeDataLayout() {
+ // Data layout (keep in sync with clang/lib/Basic/Targets.cpp)
+ return "E" // Big endian
+ "-m:e" // ELF name mangling
+ "-p:32:32" // 32-bit pointers, 32 bit aligned
+ "-i64:64" // 64 bit integers, 64 bit aligned
+ "-a:0:32" // 32 bit alignment of objects of aggregate type
+ "-n32" // 32 bit native integer width
+ "-S64"; // 64 bit natural stack alignment
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::PIC_;
+ return *RM;
+}
+
+LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
+ StringRef Cpu, StringRef FeatureString,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CodeModel,
+ CodeGenOpt::Level OptLevel)
+ : LLVMTargetMachine(T, computeDataLayout(), TT, Cpu, FeatureString, Options,
+ getEffectiveRelocModel(RM), CodeModel, OptLevel),
+ Subtarget(TT, Cpu, FeatureString, *this, Options, CodeModel, OptLevel),
+ TLOF(new LanaiTargetObjectFile()) {
+ initAsmInfo();
+}
+
+TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(LanaiTTIImpl(this, F));
+ });
+}
+
+namespace {
+// Lanai Code Generator Pass Configuration Options.
+class LanaiPassConfig : public TargetPassConfig {
+public:
+ LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager)
+ : TargetPassConfig(TM, *PassManager) {}
+
+ LanaiTargetMachine &getLanaiTargetMachine() const {
+ return getTM<LanaiTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *
+LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
+ return new LanaiPassConfig(this, &PassManager);
+}
+
+// Install an instruction selector pass.
+bool LanaiPassConfig::addInstSelector() {
+ addPass(createLanaiISelDag(getLanaiTargetMachine()));
+ return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted.
+void LanaiPassConfig::addPreEmitPass() {
+ addPass(createLanaiDelaySlotFillerPass(getLanaiTargetMachine()));
+}
+
+// Run passes after prolog-epilog insertion and before the second instruction
+// scheduling pass.
+void LanaiPassConfig::addPreSched2() {
+ addPass(createLanaiMemAluCombinerPass());
+}
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
new file mode 100644
index 000000000000..5278c70d909d
--- /dev/null
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -0,0 +1,55 @@
+//===-- LanaiTargetMachine.h - Define TargetMachine for Lanai --- C++ ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+
+class LanaiTargetMachine : public LLVMTargetMachine {
+ LanaiSubtarget Subtarget;
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+ LanaiTargetMachine(const Target &TheTarget, const Triple &TargetTriple,
+ StringRef Cpu, StringRef FeatureString,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RelocationModel,
+ CodeModel::Model CodeModel, CodeGenOpt::Level OptLevel);
+
+ const LanaiSubtarget *
+ getSubtargetImpl(const llvm::Function & /*Fn*/) const override {
+ return &Subtarget;
+ }
+
+ TargetIRAnalysis getTargetIRAnalysis() override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
new file mode 100644
index 000000000000..4048c8535215
--- /dev/null
+++ b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -0,0 +1,123 @@
+//===-- LanaiTargetObjectFile.cpp - Lanai Object Files -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetObjectFile.h"
+
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> SSThreshold(
+ "lanai-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=0)"),
+ cl::init(0));
+
+void LanaiTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+
+ SmallDataSection = getContext().getELFSection(
+ ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+// An object is placed in a small section if its size is less than the small
+// section size threshold. Data in these sections must be addressed using the
+// gp_rel operator.
+static bool isInSmallSection(uint64_t Size) {
+ // gcc has traditionally not treated zero-sized objects as small data, so this
+ // is effectively part of the ABI.
+ return Size > 0 && Size <= SSThreshold;
+}
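
A self-contained model (not part of the patch) of this predicate, with the threshold passed explicitly instead of through the -lanai-ssection-threshold option defined above; inSmallSection is a hypothetical stand-in.

#include <cassert>
#include <cstdint>

static bool inSmallSection(uint64_t Size, uint64_t Threshold) {
  return Size > 0 && Size <= Threshold;
}

int main() {
  assert(!inSmallSection(0, 8)); // zero-sized objects are never small data
  assert(inSmallSection(8, 8));  // a size equal to the threshold qualifies
  assert(!inSmallSection(4, 0)); // the default threshold of 0 disables .sdata/.sbss
  return 0;
}
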
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(
+ const GlobalValue *GV, const TargetMachine &TM) const {
+ // We first check the case where the global is a declaration, because finding
+ // section kind using getKindForGlobal() is only allowed for global
+ // definitions.
+ if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+ return isGlobalInSmallSectionImpl(GV, TM);
+
+ return isGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM,
+ SectionKind Kind) const {
+ return (isGlobalInSmallSectionImpl(GV, TM) &&
+ (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
+}
+
+// Return true if this global address should be placed into small data/bss
+// section. This method does all the work, except for checking the section
+// kind.
+bool LanaiTargetObjectFile::isGlobalInSmallSectionImpl(
+ const GlobalValue *GV, const TargetMachine & /*TM*/) const {
+ // Only global variables, not functions.
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+ if (!GVA)
+ return false;
+
+ if (GV->hasLocalLinkage())
+ return false;
+
+ if (((GV->hasExternalLinkage() && GV->isDeclaration()) ||
+ GV->hasCommonLinkage()))
+ return false;
+
+ Type *Ty = GV->getType()->getElementType();
+ return isInSmallSection(
+ GV->getParent()->getDataLayout().getTypeAllocSize(Ty));
+}
+
+MCSection *
+LanaiTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ // Handle Small Section classification here.
+ if (Kind.isBSS() && isGlobalInSmallSection(GV, TM, Kind))
+ return SmallBSSSection;
+ if (Kind.isData() && isGlobalInSmallSection(GV, TM, Kind))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,
+ TM);
+}
+
+/// Return true if this constant should be placed into small data section.
+bool LanaiTargetObjectFile::isConstantInSmallSection(const DataLayout &DL,
+ const Constant *CN) const {
+ return isInSmallSection(DL.getTypeAllocSize(CN->getType()));
+}
+
+MCSection *LanaiTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+ SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const {
+ if (isConstantInSmallSection(DL, C))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+}
diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.h b/lib/Target/Lanai/LanaiTargetObjectFile.h
new file mode 100644
index 000000000000..eb5195469f55
--- /dev/null
+++ b/lib/Target/Lanai/LanaiTargetObjectFile.h
@@ -0,0 +1,46 @@
+//===-- LanaiTargetObjectFile.h - Lanai Object Info -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class LanaiTargetMachine;
+class LanaiTargetObjectFile : public TargetLoweringObjectFileELF {
+ MCSection *SmallDataSection;
+ MCSection *SmallBSSSection;
+
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ /// Return true if this global address should be placed into small data/bss
+ /// section.
+ bool isGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+ SectionKind Kind) const;
+ bool isGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+ bool isGlobalInSmallSectionImpl(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
+ MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+
+ /// Return true if this constant should be placed into small data section.
+ bool isConstantInSmallSection(const DataLayout &DL, const Constant *CN) const;
+
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
diff --git a/lib/Target/Lanai/LanaiTargetTransformInfo.h b/lib/Target/Lanai/LanaiTargetTransformInfo.h
new file mode 100644
index 000000000000..6300d2502d67
--- /dev/null
+++ b/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -0,0 +1,86 @@
+//===-- LanaiTargetTransformInfo.h - Lanai specific TTI ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a TargetTransformInfo::Concept conforming object specific
+// to the Lanai target machine. It uses the target's detailed information to
+// provide more precise answers to certain TTI queries, while letting the
+// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class LanaiTTIImpl : public BasicTTIImplBase<LanaiTTIImpl> {
+ typedef BasicTTIImplBase<LanaiTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const LanaiSubtarget *ST;
+ const LanaiTargetLowering *TLI;
+
+ const LanaiSubtarget *getST() const { return ST; }
+ const LanaiTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit LanaiTTIImpl(const LanaiTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ LanaiTTIImpl(const LanaiTTIImpl &Arg)
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+ LanaiTTIImpl(LanaiTTIImpl &&Arg)
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(Arg.ST), TLI(Arg.TLI) {}
+
+ bool shouldBuildLookupTables() const { return false; }
+
+ TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+ if (TyWidth == 32)
+ return TTI::PSK_FastHardware;
+ return TTI::PSK_Software;
+ }
+
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ switch (ISD) {
+ default:
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::UREM:
+ // This increases the cost associated with multiplication and division
+ // to 64 times what the baseline arithmetic cost is. The arithmetic
+ // instruction cost was arbitrarily chosen to reduce the desirability
+ // of emitting arithmetic instructions that are emulated in software.
+ // TODO: Investigate the performance impact given specialized lowerings.
+ return 64 * BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
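
The 64x factor in getArithmeticInstrCost above only scales whatever baseline BasicTTIImplBase reports; a minimal sketch of that policy, with scaledLanaiCost as a hypothetical stand-in and not part of the patch:

static unsigned scaledLanaiCost(bool IsMulDivRem, unsigned BaseCost) {
  // mul/div/rem are emulated in software on Lanai, so quote them at 64x the
  // baseline cost to steer optimizations away from generating them.
  return IsMulDivRem ? 64 * BaseCost : BaseCost;
}
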
diff --git a/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt b/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 000000000000..d65a1fd58901
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_library(LLVMLanaiMCTargetDesc
+ LanaiAsmBackend.cpp
+ LanaiELFObjectWriter.cpp
+ LanaiMCAsmInfo.cpp
+ LanaiMCCodeEmitter.cpp
+ LanaiMCExpr.cpp
+ LanaiMCTargetDesc.cpp
+)
diff --git a/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt b/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 000000000000..7dc2a7694ab1
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/Lanai/MCTargetDesc/LLVMBuild.txt -----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LanaiMCTargetDesc
+parent = Lanai
+required_libraries = LanaiInfo LanaiInstPrinter MC Support
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
new file mode 100644
index 000000000000..a3d8699f1317
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -0,0 +1,172 @@
+//===-- LanaiAsmBackend.cpp - Lanai Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+ switch (Kind) {
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value;
+ case Lanai::FIXUP_LANAI_21:
+ case Lanai::FIXUP_LANAI_21_F:
+ case Lanai::FIXUP_LANAI_25:
+ case Lanai::FIXUP_LANAI_32:
+ case Lanai::FIXUP_LANAI_HI16:
+ case Lanai::FIXUP_LANAI_LO16:
+ return Value;
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ }
+}
+
+namespace {
+class LanaiAsmBackend : public MCAsmBackend {
+ Triple::OSType OSType;
+
+public:
+ LanaiAsmBackend(const Target &T, Triple::OSType OST)
+ : MCAsmBackend(), OSType(OST) {}
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup & /*Fixup*/, uint64_t /*Value*/,
+ const MCRelaxableFragment * /*DF*/,
+ const MCAsmLayout & /*Layout*/) const override {
+ return false;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ unsigned getNumFixupKinds() const override {
+ return Lanai::NumTargetFixupKinds;
+ }
+
+ bool mayNeedRelaxation(const MCInst & /*Inst*/) const override {
+ return false;
+ }
+
+ void relaxInstruction(const MCInst & /*Inst*/,
+ const MCSubtargetInfo & /*STI*/,
+ MCInst & /*Res*/) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ if ((Count % 4) != 0)
+ return false;
+
+ for (uint64_t i = 0; i < Count; i += 4)
+ OW->write32(0x15000000);
+
+ return true;
+}
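
A standalone sketch (not part of the patch) of the padding rule in writeNopData above: alignment gaps are filled with the 4-byte Lanai NOP word 0x15000000, so only byte counts that are a multiple of 4 can be written. writeNops is a hypothetical stand-in.

#include <cstdint>
#include <vector>

static bool writeNops(std::vector<uint32_t> &Out, uint64_t Count) {
  if (Count % 4 != 0)
    return false;                                // cannot pad partial words
  Out.insert(Out.end(), Count / 4, 0x15000000u); // one NOP per 4 bytes
  return true;
}
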
+
+void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned /*DataSize*/, uint64_t Value,
+ bool /*IsPCRel*/) const {
+ MCFixupKind Kind = Fixup.getKind();
+ Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
+
+ if (!Value)
+ return; // This value doesn't change the encoding
+
+ // Where in the object the fixup is applied and how many bytes need
+ // fixing up.
+ unsigned Offset = Fixup.getOffset();
+ unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+ unsigned FullSize = 4;
+
+ // Grab current value, if any, from bits.
+ uint64_t CurVal = 0;
+
+ // Load instruction and apply value
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = (FullSize - 1 - i);
+ CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx]))
+ << (i * 8);
+ }
+
+ uint64_t Mask =
+ (static_cast<uint64_t>(-1) >> (64 - getFixupKindInfo(Kind).TargetSize));
+ CurVal |= Value & Mask;
+
+ // Write out the fixed up bytes back to the code/data bits.
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = (FullSize - 1 - i);
+ Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
+ }
+}
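
A self-contained model (not part of the patch) of the read-modify-write in applyFixup for a full 32-bit fixup: Lanai instruction words are big-endian, so byte index (FullSize - 1 - i) carries bits [i*8, i*8+8). Fixups narrower than 32 bits touch fewer bytes in the real code; orIntoBigEndianWord is a hypothetical stand-in.

#include <cstdint>

static void orIntoBigEndianWord(uint8_t Word[4], uint64_t Value, unsigned Bits) {
  uint64_t Mask = ~uint64_t(0) >> (64 - Bits);  // Bits is at most 32 here
  uint64_t Cur = 0;
  for (unsigned I = 0; I != 4; ++I)             // load the big-endian word
    Cur |= uint64_t(Word[3 - I]) << (I * 8);
  Cur |= Value & Mask;                          // OR in the adjusted fixup value
  for (unsigned I = 0; I != 4; ++I)             // store it back, big-endian
    Word[3 - I] = static_cast<uint8_t>((Cur >> (I * 8)) & 0xff);
}
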
+
+MCObjectWriter *
+LanaiAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createLanaiELFObjectWriter(OS,
+ MCELFObjectTargetWriter::getOSABI(OSType));
+}
+
+const MCFixupKindInfo &
+LanaiAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ static const MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds] = {
+ // This table *must* be in the same order as the fixup_* kinds in
+ // LanaiFixupKinds.h.
+ // Note: The number of bits indicated here are assumed to be contiguous.
+ // This does not hold true for LANAI_21 and LANAI_21_F which are applied
+ // to bits 0x7cffff and 0x7cfffc, respectively. Since the 'bits' counts
+ // here are used only for cosmetic purposes, we set the size to 16 bits
+ // for these 21-bit relocations, as llvm/lib/MC/MCAsmStreamer.cpp checks that
+ // no bits are set in the fixup range.
+ //
+ // name offset bits flags
+ {"FIXUP_LANAI_NONE", 0, 32, 0},
+ {"FIXUP_LANAI_21", 16, 16 /*21*/, 0},
+ {"FIXUP_LANAI_21_F", 16, 16 /*21*/, 0},
+ {"FIXUP_LANAI_25", 7, 25, 0},
+ {"FIXUP_LANAI_32", 0, 32, 0},
+ {"FIXUP_LANAI_HI16", 16, 16, 0},
+ {"FIXUP_LANAI_LO16", 16, 16, 0}};
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+} // namespace
+
+MCAsmBackend *llvm::createLanaiAsmBackend(const Target &T,
+ const MCRegisterInfo & /*MRI*/,
+ const Triple &TheTriple,
+ StringRef /*CPU*/) {
+ if (!TheTriple.isOSBinFormatELF())
+ llvm_unreachable("OS not supported");
+
+ return new LanaiAsmBackend(T, TheTriple.getOS());
+}
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
new file mode 100644
index 000000000000..ce7f83509c9b
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
@@ -0,0 +1,119 @@
+//===-- LanaiBaseInfo.h - Top level definitions for Lanai MC ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Lanai target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+
+#include "LanaiMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// LanaiII - This namespace holds all of the target specific flags that
+// instruction info tracks.
+namespace LanaiII {
+// Target Operand Flag enum.
+enum TOF {
+ //===------------------------------------------------------------------===//
+ // Lanai Specific MachineOperand flags.
+ MO_NO_FLAG,
+
+ // MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+ // address.
+ MO_ABS_HI,
+ MO_ABS_LO,
+};
+} // namespace LanaiII
+
+static inline unsigned getLanaiRegisterNumbering(unsigned Reg) {
+ switch (Reg) {
+ case Lanai::R0:
+ return 0;
+ case Lanai::R1:
+ return 1;
+ case Lanai::R2:
+ case Lanai::PC:
+ return 2;
+ case Lanai::R3:
+ return 3;
+ case Lanai::R4:
+ case Lanai::SP:
+ return 4;
+ case Lanai::R5:
+ case Lanai::FP:
+ return 5;
+ case Lanai::R6:
+ return 6;
+ case Lanai::R7:
+ return 7;
+ case Lanai::R8:
+ case Lanai::RV:
+ return 8;
+ case Lanai::R9:
+ return 9;
+ case Lanai::R10:
+ case Lanai::RR1:
+ return 10;
+ case Lanai::R11:
+ case Lanai::RR2:
+ return 11;
+ case Lanai::R12:
+ return 12;
+ case Lanai::R13:
+ return 13;
+ case Lanai::R14:
+ return 14;
+ case Lanai::R15:
+ case Lanai::RCA:
+ return 15;
+ case Lanai::R16:
+ return 16;
+ case Lanai::R17:
+ return 17;
+ case Lanai::R18:
+ return 18;
+ case Lanai::R19:
+ return 19;
+ case Lanai::R20:
+ return 20;
+ case Lanai::R21:
+ return 21;
+ case Lanai::R22:
+ return 22;
+ case Lanai::R23:
+ return 23;
+ case Lanai::R24:
+ return 24;
+ case Lanai::R25:
+ return 25;
+ case Lanai::R26:
+ return 26;
+ case Lanai::R27:
+ return 27;
+ case Lanai::R28:
+ return 28;
+ case Lanai::R29:
+ return 29;
+ case Lanai::R30:
+ return 30;
+ case Lanai::R31:
+ return 31;
+ default:
+ llvm_unreachable("Unknown register number!");
+ }
+}
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
new file mode 100644
index 000000000000..e30d5e9a18eb
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -0,0 +1,95 @@
+//===-- LanaiELFObjectWriter.cpp - Lanai ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class LanaiELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ explicit LanaiELFObjectWriter(uint8_t OSABI);
+
+ ~LanaiELFObjectWriter() override;
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+ bool needsRelocateWithSymbol(const MCSymbol &SD,
+ unsigned Type) const override;
+};
+} // namespace
+
+LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI,
+ /*HasRelocationAddend=*/true) {}
+
+LanaiELFObjectWriter::~LanaiELFObjectWriter() {}
+
+unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/,
+ const MCValue & /*Target*/,
+ const MCFixup &Fixup,
+ bool /*IsPCRel*/) const {
+ unsigned Type;
+ unsigned Kind = static_cast<unsigned>(Fixup.getKind());
+ switch (Kind) {
+ case Lanai::FIXUP_LANAI_21:
+ Type = ELF::R_LANAI_21;
+ break;
+ case Lanai::FIXUP_LANAI_21_F:
+ Type = ELF::R_LANAI_21_F;
+ break;
+ case Lanai::FIXUP_LANAI_25:
+ Type = ELF::R_LANAI_25;
+ break;
+ case Lanai::FIXUP_LANAI_32:
+ case FK_Data_4:
+ Type = ELF::R_LANAI_32;
+ break;
+ case Lanai::FIXUP_LANAI_HI16:
+ Type = ELF::R_LANAI_HI16;
+ break;
+ case Lanai::FIXUP_LANAI_LO16:
+ Type = ELF::R_LANAI_LO16;
+ break;
+ case Lanai::FIXUP_LANAI_NONE:
+ Type = ELF::R_LANAI_NONE;
+ break;
+
+ default:
+ llvm_unreachable("Invalid fixup kind!");
+ }
+ return Type;
+}
+
+bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
+ unsigned Type) const {
+ switch (Type) {
+ case ELF::R_LANAI_21:
+ case ELF::R_LANAI_21_F:
+ case ELF::R_LANAI_25:
+ case ELF::R_LANAI_32:
+ case ELF::R_LANAI_HI16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+MCObjectWriter *llvm::createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new LanaiELFObjectWriter(OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
new file mode 100644
index 000000000000..9ff8340d2922
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
@@ -0,0 +1,43 @@
+//===-- LanaiFixupKinds.h - Lanai Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Lanai {
+// Although most of the current fixup types reflect a unique relocation,
+// one can have multiple fixup types for a given relocation, and those thus
+// need to be uniquely named.
+//
+// This table *must* be in the same order as
+// MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds]
+// in LanaiAsmBackend.cpp.
+//
+enum Fixups {
+ // Results in R_LANAI_NONE
+ FIXUP_LANAI_NONE = FirstTargetFixupKind,
+
+ FIXUP_LANAI_21, // 21-bit symbol relocation
+ FIXUP_LANAI_21_F, // 21-bit symbol relocation, last two bits masked to 0
+ FIXUP_LANAI_25, // 25-bit branch targets
+ FIXUP_LANAI_32, // general 32-bit relocation
+ FIXUP_LANAI_HI16, // upper 16-bits of a symbolic relocation
+ FIXUP_LANAI_LO16, // lower 16-bits of a symbolic relocation
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // namespace Lanai
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
new file mode 100644
index 000000000000..7e2705e67b6d
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -0,0 +1,43 @@
+//===-- LanaiMCAsmInfo.cpp - Lanai asm properties -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the LanaiMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void LanaiMCAsmInfo::anchor() {}
+
+LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/) {
+ IsLittleEndian = false;
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // Lanai assembly requires ".section" before ".bss"
+ UsesELFSectionDirectiveForBSS = true;
+
+ // Use the integrated assembler instead of system one.
+ UseIntegratedAssembler = true;
+
+ // Use '!' as comment string to correspond with old toolchain.
+ CommentString = "!";
+
+ // Target supports emission of debugging information.
+ SupportsDebugInformation = true;
+
+ // Set the instruction alignment. Currently used only for address adjustment
+ // in dwarf generation.
+ MinInstAlignment = 4;
+}
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
new file mode 100644
index 000000000000..3eef0592d2fa
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- LanaiMCAsmInfo.h - Lanai asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the LanaiMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class LanaiMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit LanaiMCAsmInfo(const Triple &TheTriple);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
new file mode 100644
index 000000000000..f14adc27dd45
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -0,0 +1,326 @@
+//===-- LanaiMCCodeEmitter.cpp - Convert Lanai code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace llvm {
+namespace {
+class LanaiMCCodeEmitter : public MCCodeEmitter {
+ LanaiMCCodeEmitter(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT
+ const MCInstrInfo &InstrInfo;
+ MCContext &Context;
+
+public:
+ LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C)
+ : InstrInfo(MCII), Context(C) {}
+
+ ~LanaiMCCodeEmitter() override {}
+
+ // The functions below are called by TableGen generated functions for getting
+ // the binary encoding of instructions/operands.
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &Inst,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ // getMachineOpValue - Return binary encoding of operand. If the machine
+ // operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &Inst, const MCOperand &MCOp,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ unsigned getRiMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ unsigned getRrMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ unsigned getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ unsigned getBranchTargetOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ unsigned getCallTargetOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const;
+
+ void encodeInstruction(const MCInst &Inst, raw_ostream &Ostream,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const override;
+
+ unsigned adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+ const MCSubtargetInfo &STI) const;
+};
+
+Lanai::Fixups FixupKind(const MCExpr *Expr) {
+ if (isa<MCSymbolRefExpr>(Expr))
+ return Lanai::FIXUP_LANAI_21;
+ if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
+ LanaiMCExpr::VariantKind ExprKind = McExpr->getKind();
+ switch (ExprKind) {
+ case LanaiMCExpr::VK_Lanai_None:
+ return Lanai::FIXUP_LANAI_21;
+ case LanaiMCExpr::VK_Lanai_ABS_HI:
+ return Lanai::FIXUP_LANAI_HI16;
+ case LanaiMCExpr::VK_Lanai_ABS_LO:
+ return Lanai::FIXUP_LANAI_LO16;
+ }
+ }
+ return Lanai::Fixups(0);
+}
+
+// getMachineOpValue - Return binary encoding of operand. If the machine
+// operand requires relocation, record the relocation and return zero.
+unsigned LanaiMCCodeEmitter::getMachineOpValue(
+ const MCInst &Inst, const MCOperand &MCOp, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ if (MCOp.isReg())
+ return getLanaiRegisterNumbering(MCOp.getReg());
+ if (MCOp.isImm())
+ return static_cast<unsigned>(MCOp.getImm());
+
+ // MCOp must be an expression
+ assert(MCOp.isExpr());
+ const MCExpr *Expr = MCOp.getExpr();
+
+ // Extract the symbolic reference side of a binary expression.
+ if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *BinaryExpr = static_cast<const MCBinaryExpr *>(Expr);
+ Expr = BinaryExpr->getLHS();
+ }
+
+ assert(isa<LanaiMCExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
+ // Push fixup (all info is contained within)
+ Fixups.push_back(
+ MCFixup::create(0, MCOp.getExpr(), MCFixupKind(FixupKind(Expr))));
+ return 0;
+}
+
+// Helper function to adjust P and Q bits on load and store instructions.
+unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
+ unsigned QBitShift) {
+ const MCOperand AluOp = Inst.getOperand(3);
+ unsigned AluCode = AluOp.getImm();
+
+ // Set the P bit to one iff the immediate is nonzero and not a post-op
+ // instruction.
+ const MCOperand Op2 = Inst.getOperand(2);
+ Value &= ~(1 << PBitShift);
+ if (!LPAC::isPostOp(AluCode) &&
+ ((Op2.isImm() && Op2.getImm() != 0) ||
+ (Op2.isReg() && Op2.getReg() != Lanai::R0) || (Op2.isExpr())))
+ Value |= (1 << PBitShift);
+
+ // Set the Q bit to one iff it is a post- or pre-op instruction.
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+ "Expected register operand.");
+ Value &= ~(1 << QBitShift);
+ if (LPAC::modifiesOp(AluCode) && ((Op2.isImm() && Op2.getImm() != 0) ||
+ (Op2.isReg() && Op2.getReg() != Lanai::R0)))
+ Value |= (1 << QBitShift);
+
+ return Value;
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+ const MCSubtargetInfo &STI) const {
+ return adjustPqBits(Inst, Value, 17, 16);
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+ const MCSubtargetInfo &STI) const {
+ return adjustPqBits(Inst, Value, 11, 10);
+}
+
+void LanaiMCCodeEmitter::encodeInstruction(
+ const MCInst &Inst, raw_ostream &Ostream, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ // Get instruction encoding and emit it
+ unsigned Value = getBinaryCodeForInstr(Inst, Fixups, SubtargetInfo);
+ ++MCNumEmitted; // Keep track of the number of emitted insns.
+
+ // Emit bytes in big-endian
+ for (int i = (4 - 1) * 8; i >= 0; i -= 8)
+ Ostream << static_cast<char>((Value >> i) & 0xff);
+}
+
+// Encode Lanai Memory Operand
+unsigned LanaiMCCodeEmitter::getRiMemoryOpValue(
+ const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ unsigned Encoding;
+ const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+ const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+ const MCOperand AluOp = Inst.getOperand(OpNo + 2);
+
+ assert(Op1.isReg() && "First operand is not register.");
+ assert((Op2.isImm() || Op2.isExpr()) &&
+ "Second operand is neither an immediate nor an expression.");
+ assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+ "Register immediate only supports addition operator");
+
+ Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 18);
+ if (Op2.isImm()) {
+ assert(isInt<16>(Op2.getImm()) &&
+ "Constant value truncated (limited to 16-bit)");
+
+ Encoding |= (Op2.getImm() & 0xffff);
+ if (Op2.getImm() != 0) {
+ if (LPAC::isPreOp(AluOp.getImm()))
+ Encoding |= (0x3 << 16);
+ if (LPAC::isPostOp(AluOp.getImm()))
+ Encoding |= (0x1 << 16);
+ }
+ } else
+ getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo);
+
+ return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getRrMemoryOpValue(
+ const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ unsigned Encoding;
+ const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+ const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+ const MCOperand AluMCOp = Inst.getOperand(OpNo + 2);
+
+ assert(Op1.isReg() && "First operand is not register.");
+ Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 15);
+ assert(Op2.isReg() && "Second operand is not register.");
+ Encoding |= (getLanaiRegisterNumbering(Op2.getReg()) << 10);
+
+  assert(AluMCOp.isImm() && "Third operand is not immediate.");
+ // Set BBB
+ unsigned AluOp = AluMCOp.getImm();
+ Encoding |= LPAC::encodeLanaiAluCode(AluOp) << 5;
+ // Set P and Q
+ if (LPAC::isPreOp(AluOp))
+ Encoding |= (0x3 << 8);
+ if (LPAC::isPostOp(AluOp))
+ Encoding |= (0x1 << 8);
+ // Set JJJJ
+ switch (LPAC::getAluOp(AluOp)) {
+ case LPAC::SHL:
+ case LPAC::SRL:
+ Encoding |= 0x10;
+ break;
+ case LPAC::SRA:
+ Encoding |= 0x18;
+ break;
+ default:
+ break;
+ }
+
+ return Encoding;
+}
+
+unsigned
+LanaiMCCodeEmitter::getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ unsigned Encoding;
+ const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+ const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+ const MCOperand AluOp = Inst.getOperand(OpNo + 2);
+
+ assert(Op1.isReg() && "First operand is not register.");
+ assert((Op2.isImm() || Op2.isExpr()) &&
+ "Second operand is neither an immediate nor an expression.");
+ assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+ "Register immediate only supports addition operator");
+
+ Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 12);
+ if (Op2.isImm()) {
+ assert(isInt<10>(Op2.getImm()) &&
+ "Constant value truncated (limited to 10-bit)");
+
+ Encoding |= (Op2.getImm() & 0x3ff);
+ if (Op2.getImm() != 0) {
+ if (LPAC::isPreOp(AluOp.getImm()))
+ Encoding |= (0x3 << 10);
+ if (LPAC::isPostOp(AluOp.getImm()))
+ Encoding |= (0x1 << 10);
+ }
+ } else
+ getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo);
+
+ return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getCallTargetOpValue(
+ const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ const MCOperand &MCOp = Inst.getOperand(OpNo);
+ if (MCOp.isReg() || MCOp.isImm())
+ return getMachineOpValue(Inst, MCOp, Fixups, SubtargetInfo);
+
+ Fixups.push_back(MCFixup::create(
+ 0, MCOp.getExpr(), static_cast<MCFixupKind>(Lanai::FIXUP_LANAI_25)));
+
+ return 0;
+}
+
+unsigned LanaiMCCodeEmitter::getBranchTargetOpValue(
+ const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &SubtargetInfo) const {
+ const MCOperand &MCOp = Inst.getOperand(OpNo);
+ if (MCOp.isReg() || MCOp.isImm())
+ return getMachineOpValue(Inst, MCOp, Fixups, SubtargetInfo);
+
+ Fixups.push_back(MCFixup::create(
+ 0, MCOp.getExpr(), static_cast<MCFixupKind>(Lanai::FIXUP_LANAI_25)));
+
+ return 0;
+}
+
+#include "LanaiGenMCCodeEmitter.inc"
+} // namespace
+} // namespace llvm
+
+llvm::MCCodeEmitter *
+llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo,
+ const MCRegisterInfo & /*MRI*/,
+ MCContext &context) {
+ return new LanaiMCCodeEmitter(InstrInfo, context);
+}
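
The emission loop in encodeInstruction above writes each 32-bit instruction word most-significant byte first. The following is a minimal standalone sketch of that byte ordering, with illustrative names that are not part of the Lanai sources:

#include <cstdint>
#include <cstdio>

// Write a 32-bit value most-significant byte first, mirroring the loop in
// LanaiMCCodeEmitter::encodeInstruction.
static void emitBigEndian32(uint32_t Value, unsigned char Out[4]) {
  for (int I = 3; I >= 0; --I)
    Out[3 - I] = static_cast<unsigned char>((Value >> (I * 8)) & 0xff);
}

int main() {
  unsigned char Bytes[4];
  emitBigEndian32(0x12345678u, Bytes);
  std::printf("%02x %02x %02x %02x\n", static_cast<unsigned>(Bytes[0]),
              static_cast<unsigned>(Bytes[1]), static_cast<unsigned>(Bytes[2]),
              static_cast<unsigned>(Bytes[3]));
  // Prints: 12 34 56 78
  return 0;
}
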
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
new file mode 100644
index 000000000000..201c95de07f4
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -0,0 +1,60 @@
+//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lanaimcexpr"
+
+const LanaiMCExpr *LanaiMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) LanaiMCExpr(Kind, Expr);
+}
+
+void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ if (Kind == VK_Lanai_None) {
+ Expr->print(OS, MAI);
+ return;
+ }
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("Invalid kind!");
+ case VK_Lanai_ABS_HI:
+ OS << "hi";
+ break;
+ case VK_Lanai_ABS_LO:
+ OS << "lo";
+ break;
+ }
+
+ OS << '(';
+ const MCExpr *Expr = getSubExpr();
+ Expr->print(OS, MAI);
+ OS << ')';
+}
+
+void LanaiMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
+}
+
+bool LanaiMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ Res =
+ MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
+}
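
The VK_Lanai_ABS_HI and VK_Lanai_ABS_LO variants above print as hi(...) and lo(...) and pair with the FIXUP_LANAI_HI16/FIXUP_LANAI_LO16 fixups used by the code emitter. A minimal sketch of the assumed split of a 32-bit absolute address into its upper and lower 16-bit halves (illustrative names only, not part of the Lanai sources):

#include <cstdint>
#include <cstdio>

// Assumed semantics of the hi()/lo() operators: the upper and lower 16 bits
// of a 32-bit absolute address.
static uint16_t hi16(uint32_t Addr) { return static_cast<uint16_t>(Addr >> 16); }
static uint16_t lo16(uint32_t Addr) { return static_cast<uint16_t>(Addr & 0xffff); }

int main() {
  const uint32_t Addr = 0x12345678u;
  std::printf("hi(Addr) = 0x%04x, lo(Addr) = 0x%04x\n", hi16(Addr), lo16(Addr));
  // Prints: hi(Addr) = 0x1234, lo(Addr) = 0x5678
  return 0;
}
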
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
new file mode 100644
index 000000000000..5004d541ff70
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
@@ -0,0 +1,56 @@
+//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class LanaiMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind { VK_Lanai_None, VK_Lanai_ABS_HI, VK_Lanai_ABS_LO };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit LanaiMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ static const LanaiMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+
+ // Returns the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ // Returns the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ // There are no TLS LanaiMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler & /*Asm*/) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
new file mode 100644
index 000000000000..04bedfb7fba7
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -0,0 +1,149 @@
+//===-- LanaiMCTargetDesc.cpp - Lanai Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCTargetDesc.h"
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "LanaiMCAsmInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "LanaiGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createLanaiMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitLanaiMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createLanaiMCRegisterInfo(const Triple & /*TT*/) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitLanaiMCRegisterInfo(X, Lanai::RCA, 0, 0, Lanai::PC);
+ return X;
+}
+
+static MCSubtargetInfo *
+createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ return createLanaiMCSubtargetInfoImpl(TT, CPUName, FS);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll) {
+ if (!T.isOSBinFormatELF())
+ llvm_unreachable("OS not supported");
+
+ return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI) {
+ if (SyntaxVariant == 0)
+ return new LanaiInstPrinter(MAI, MII, MRI);
+ return 0;
+}
+
+MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple,
+ MCContext &Ctx) {
+ return createMCRelocationInfo(TheTriple, Ctx);
+}
+
+class LanaiMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info)
+ : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ if (Inst.getNumOperands() == 0)
+ return false;
+
+ if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType ==
+ MCOI::OPERAND_PCREL) {
+ int64_t Imm = Inst.getOperand(0).getImm();
+ Target = Addr + Size + Imm;
+ return true;
+ } else {
+ int64_t Imm = Inst.getOperand(0).getImm();
+
+      // Skip the case where the immediate is 0, as that occurs in files that
+      // aren't linked and the inferred branch target would be wrong.
+ if (Imm == 0)
+ return false;
+
+ Target = Imm;
+ return true;
+ }
+ }
+};
+
+static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) {
+ return new LanaiMCInstrAnalysis(Info);
+}
+
+extern "C" void LLVMInitializeLanaiTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfo<LanaiMCAsmInfo> X(TheLanaiTarget);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheLanaiTarget, createLanaiMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheLanaiTarget, createLanaiMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheLanaiTarget,
+ createLanaiMCSubtargetInfo);
+
+ // Register the MC code emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheLanaiTarget,
+ llvm::createLanaiMCCodeEmitter);
+
+ // Register the ASM Backend
+ TargetRegistry::RegisterMCAsmBackend(TheLanaiTarget, createLanaiAsmBackend);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheLanaiTarget,
+ createLanaiMCInstPrinter);
+
+ // Register the ELF streamer.
+ TargetRegistry::RegisterELFStreamer(TheLanaiTarget, createMCStreamer);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(TheLanaiTarget,
+ createLanaiElfRelocation);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(TheLanaiTarget,
+ createLanaiInstrAnalysis);
+}
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
new file mode 100644
index 000000000000..e117ed7a500f
--- /dev/null
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -0,0 +1,59 @@
+//===-- LanaiMCTargetDesc.h - Lanai Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstrAnalysis;
+class MCObjectWriter;
+class MCRelocationInfo;
+class MCSubtargetInfo;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+
+extern Target TheLanaiTarget;
+
+MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
+MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const Triple &TheTriple, StringRef CPU);
+
+MCObjectWriter *createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+ uint8_t OSABI);
+} // namespace llvm
+
+// Defines symbolic names for Lanai registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "LanaiGenRegisterInfo.inc"
+
+// Defines symbolic names for the Lanai instructions.
+#define GET_INSTRINFO_ENUM
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "LanaiGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
diff --git a/lib/Target/Lanai/TargetInfo/CMakeLists.txt b/lib/Target/Lanai/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..01611b54b2d8
--- /dev/null
+++ b/lib/Target/Lanai/TargetInfo/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMLanaiInfo
+ LanaiTargetInfo.cpp
+ )
diff --git a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt b/lib/Target/Lanai/TargetInfo/LLVMBuild.txt
index 9c186a52f4fa..9922ec36daae 100644
--- a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/Lanai/TargetInfo/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/CppBackend/TargetInfo/LLVMBuild.txt ---------*- Conf -*--===;
+;===- ./lib/Target/Lanai/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -17,7 +17,7 @@
[component_0]
type = Library
-name = CppBackendInfo
-parent = CppBackend
+name = LanaiInfo
+parent = Lanai
required_libraries = Support
-add_to_library_groups = CppBackend
+add_to_library_groups = Lanai
diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
new file mode 100644
index 000000000000..bd615d6ad3ac
--- /dev/null
+++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- LanaiTargetInfo.cpp - Lanai Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::TheLanaiTarget;
+
+extern "C" void LLVMInitializeLanaiTargetInfo() {
+ RegisterTarget<Triple::lanai> X(TheLanaiTarget, "lanai", "Lanai");
+}
diff --git a/lib/Target/MSP430/InstPrinter/Makefile b/lib/Target/MSP430/InstPrinter/Makefile
deleted file mode 100644
index a5293ab8a234..000000000000
--- a/lib/Target/MSP430/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/MSP430/AsmPrinter/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMSP430AsmPrinter
-
-# Hack: we need to include 'main' MSP430 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 807d1129b5fc..b3631caca952 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -14,7 +14,6 @@
#include "MSP430MCTargetDesc.h"
#include "InstPrinter/MSP430InstPrinter.h"
#include "MSP430MCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -48,15 +47,6 @@ createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createMSP430MCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createMSP430MCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -71,10 +61,6 @@ extern "C" void LLVMInitializeMSP430TargetMC() {
// Register the MC asm info.
RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheMSP430Target);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheMSP430Target,
- createMSP430MCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
diff --git a/lib/Target/MSP430/MCTargetDesc/Makefile b/lib/Target/MSP430/MCTargetDesc/Makefile
deleted file mode 100644
index bb857998eef9..000000000000
--- a/lib/Target/MSP430/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/MSP430/TargetDesc/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMSP430Desc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index 606abc250d98..511e5bcdec0d 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -39,6 +39,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "MSP430 Branch Selector";
}
@@ -62,16 +67,12 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
// Measure each MBB and compute a size for the entire function.
unsigned FuncSize = 0;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
- ++MFI) {
- MachineBasicBlock *MBB = &*MFI;
-
+ for (MachineBasicBlock &MBB : Fn) {
unsigned BlockSize = 0;
- for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
- MBBI != EE; ++MBBI)
- BlockSize += TII->GetInstSizeInBytes(MBBI);
+ for (MachineInstr &MI : MBB)
+ BlockSize += TII->GetInstSizeInBytes(MI);
- BlockSizes[MBB->getNumber()] = BlockSize;
+ BlockSizes[MBB.getNumber()] = BlockSize;
FuncSize += BlockSize;
}
@@ -106,7 +107,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
I != E; ++I) {
if ((I->getOpcode() != MSP430::JCC || I->getOperand(0).isImm()) &&
I->getOpcode() != MSP430::JMP) {
- MBBStartOffset += TII->GetInstSizeInBytes(I);
+ MBBStartOffset += TII->GetInstSizeInBytes(*I);
continue;
}
@@ -140,8 +141,8 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
// Otherwise, we have to expand it to a long branch.
unsigned NewSize;
- MachineInstr *OldBranch = I;
- DebugLoc dl = OldBranch->getDebugLoc();
+ MachineInstr &OldBranch = *I;
+ DebugLoc dl = OldBranch.getDebugLoc();
if (I->getOpcode() == MSP430::JMP) {
NewSize = 4;
@@ -163,7 +164,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
I = BuildMI(MBB, I, dl, TII->get(MSP430::Bi)).addMBB(Dest);
// Remove the old branch from the function.
- OldBranch->eraseFromParent();
+ OldBranch.eraseFromParent();
// Remember that this instruction is NewSize bytes, increase the size of the
// block by NewSize-2, remember to iterate.
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index eb720809e47c..29555f99e23d 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -224,9 +223,9 @@ MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-void MSP430FrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
const MSP430InstrInfo &TII =
*static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
unsigned StackAlign = getStackAlignment();
@@ -236,8 +235,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// adjcallstackup instruction into a 'sub SP, <amt>' and the
// adjcallstackdown instruction into 'add SP, <amt>'
// TODO: consider using push / pop instead of sub + store / add
- MachineInstr *Old = I;
- uint64_t Amount = Old->getOperand(0).getImm();
+ MachineInstr &Old = *I;
+ uint64_t Amount = Old.getOperand(0).getImm();
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -245,19 +244,21 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
MachineInstr *New = nullptr;
- if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) {
- New = BuildMI(MF, Old->getDebugLoc(),
- TII.get(MSP430::SUB16ri), MSP430::SP)
- .addReg(MSP430::SP).addImm(Amount);
+ if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) {
+ New =
+ BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(Amount);
} else {
- assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode());
+ assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
// factor out the amount the callee already popped.
- uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ uint64_t CalleeAmt = Old.getOperand(1).getImm();
Amount -= CalleeAmt;
if (Amount)
- New = BuildMI(MF, Old->getDebugLoc(),
- TII.get(MSP430::ADD16ri), MSP430::SP)
- .addReg(MSP430::SP).addImm(Amount);
+ New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
+ MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(Amount);
}
if (New) {
@@ -272,10 +273,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back.
if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
- MachineInstr *Old = I;
+ MachineInstr &Old = *I;
MachineInstr *New =
- BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
- MSP430::SP).addReg(MSP430::SP).addImm(CalleeAmt);
+ BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+ .addReg(MSP430::SP)
+ .addImm(CalleeAmt);
// The SRW implicit def is dead.
New->getOperand(3).setIsDead();
@@ -283,7 +285,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index 48c4dc866a63..f77de18b4d16 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -30,9 +30,9 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 8a01334ee2dd..69c609d04b5e 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -111,10 +110,10 @@ namespace {
#include "MSP430GenDAGISel.inc"
private:
- SDNode *Select(SDNode *N) override;
- SDNode *SelectIndexedLoad(SDNode *Op);
- SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
- unsigned Opc8, unsigned Opc16);
+ void Select(SDNode *N) override;
+ bool tryIndexedLoad(SDNode *Op);
+ bool tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8,
+ unsigned Opc16);
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Disp);
};
@@ -324,10 +323,10 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) {
return true;
}
-SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
+bool MSP430DAGToDAGISel::tryIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (!isValidIndexedLoad(LD))
- return nullptr;
+ return false;
MVT VT = LD->getMemoryVT().getSimpleVT();
@@ -340,23 +339,23 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
Opcode = MSP430::MOV16rm_POST;
break;
default:
- return nullptr;
+ return false;
}
- return CurDAG->getMachineNode(Opcode, SDLoc(N),
- VT, MVT::i16, MVT::Other,
- LD->getBasePtr(), LD->getChain());
+ ReplaceNode(N,
+ CurDAG->getMachineNode(Opcode, SDLoc(N), VT, MVT::i16, MVT::Other,
+ LD->getBasePtr(), LD->getChain()));
+ return true;
}
-SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
- SDValue N1, SDValue N2,
- unsigned Opc8, unsigned Opc16) {
+bool MSP430DAGToDAGISel::tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+ unsigned Opc8, unsigned Opc16) {
if (N1.getOpcode() == ISD::LOAD &&
N1.hasOneUse() &&
IsLegalToFold(N1, Op, Op, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(N1);
if (!isValidIndexedLoad(LD))
- return nullptr;
+ return false;
MVT VT = LD->getMemoryVT().getSimpleVT();
unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
@@ -370,14 +369,14 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
// Transfer writeback.
ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
- return ResNode;
+ return true;
}
- return nullptr;
+ return false;
}
-SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
+void MSP430DAGToDAGISel::Select(SDNode *Node) {
SDLoc dl(Node);
// Dump information about the Node being selected
@@ -391,7 +390,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
Node->dump(CurDAG);
errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
// Few custom selection stuff.
@@ -401,89 +400,70 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
assert(Node->getValueType(0) == MVT::i16);
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
- if (Node->hasOneUse())
- return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
- CurDAG->getTargetConstant(0, dl, MVT::i16));
- return CurDAG->getMachineNode(MSP430::ADD16ri, dl, MVT::i16, TFI,
- CurDAG->getTargetConstant(0, dl, MVT::i16));
+ if (Node->hasOneUse()) {
+ CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i16));
+ return;
+ }
+ ReplaceNode(Node, CurDAG->getMachineNode(
+ MSP430::ADD16ri, dl, MVT::i16, TFI,
+ CurDAG->getTargetConstant(0, dl, MVT::i16)));
+ return;
}
case ISD::LOAD:
- if (SDNode *ResNode = SelectIndexedLoad(Node))
- return ResNode;
+ if (tryIndexedLoad(Node))
+ return;
// Other cases are autogenerated.
break;
case ISD::ADD:
- if (SDNode *ResNode =
- SelectIndexedBinOp(Node,
- Node->getOperand(0), Node->getOperand(1),
- MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
- return ResNode;
- else if (SDNode *ResNode =
- SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
- MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
- return ResNode;
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+ return;
// Other cases are autogenerated.
break;
case ISD::SUB:
- if (SDNode *ResNode =
- SelectIndexedBinOp(Node,
- Node->getOperand(0), Node->getOperand(1),
- MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
- return ResNode;
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+ return;
// Other cases are autogenerated.
break;
case ISD::AND:
- if (SDNode *ResNode =
- SelectIndexedBinOp(Node,
- Node->getOperand(0), Node->getOperand(1),
- MSP430::AND8rm_POST, MSP430::AND16rm_POST))
- return ResNode;
- else if (SDNode *ResNode =
- SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
- MSP430::AND8rm_POST, MSP430::AND16rm_POST))
- return ResNode;
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+ return;
// Other cases are autogenerated.
break;
case ISD::OR:
- if (SDNode *ResNode =
- SelectIndexedBinOp(Node,
- Node->getOperand(0), Node->getOperand(1),
- MSP430::OR8rm_POST, MSP430::OR16rm_POST))
- return ResNode;
- else if (SDNode *ResNode =
- SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
- MSP430::OR8rm_POST, MSP430::OR16rm_POST))
- return ResNode;
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+ return;
// Other cases are autogenerated.
break;
case ISD::XOR:
- if (SDNode *ResNode =
- SelectIndexedBinOp(Node,
- Node->getOperand(0), Node->getOperand(1),
- MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
- return ResNode;
- else if (SDNode *ResNode =
- SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
- MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
- return ResNode;
+ if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+ MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+ return;
+ else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+ MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+ return;
// Other cases are autogenerated.
break;
}
// Select the default instruction
- SDNode *ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
-
- return ResNode;
+ SelectCode(Node);
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 18f38b7e90da..cb2c62029454 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -117,12 +117,8 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ, MVT::i8, Expand);
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
setOperationAction(ISD::CTLZ, MVT::i8, Expand);
setOperationAction(ISD::CTLZ, MVT::i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
setOperationAction(ISD::CTPOP, MVT::i8, Expand);
setOperationAction(ISD::CTPOP, MVT::i16, Expand);
@@ -362,16 +358,10 @@ static void AnalyzeReturnValues(CCState &State,
std::reverse(RVLocs.begin(), RVLocs.end());
}
-SDValue
-MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue MSP430TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
default:
@@ -418,16 +408,10 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// LowerCCCArguments - transform physical registers into virtual registers and
/// generate load operations for arguments places on the stack.
// FIXME: struct return stuff
-SDValue
-MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue MSP430TargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
@@ -455,7 +439,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
{
#ifndef NDEBUG
errs() << "LowerFormalArguments Unhandled argument type: "
- << RegVT.getSimpleVT().SimpleTy << "\n";
+ << RegVT.getEVTString() << "\n";
#endif
llvm_unreachable(nullptr);
}
@@ -506,8 +490,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
InVal = DAG.getLoad(
VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
}
InVals.push_back(InVal);
@@ -518,11 +501,11 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
}
SDValue
-MSP430TargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -570,16 +553,12 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
/// LowerCCCCallTo - functions arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
// TODO: sret.
-SDValue
-MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg>
- &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue MSP430TargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
@@ -645,8 +624,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
MachinePointerInfo(),
MachinePointerInfo());
} else {
- MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(),
- false, false, 0);
+ MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
}
MemOpChains.push_back(MemOp);
@@ -708,12 +686,10 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
-SDValue
-MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue MSP430TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -808,8 +784,7 @@ SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
}
static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
- ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) {
+ ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) {
// FIXME: Handle bittests someday
assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
@@ -1048,13 +1023,13 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -1069,8 +1044,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
MSP430::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -1086,9 +1060,8 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Create a store of the frame index to the location operand
- return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex,
- Op.getOperand(1), MachinePointerInfo(SV),
- false, false, 0);
+ return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex, Op.getOperand(1),
+ MachinePointerInfo(SV));
}
SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
@@ -1189,17 +1162,17 @@ bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Other Lowering Code
//===----------------------------------------------------------------------===//
-MachineBasicBlock*
-MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
+MachineBasicBlock *
+MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *F = BB->getParent();
MachineRegisterInfo &RI = F->getRegInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
unsigned Opc;
const TargetRegisterClass * RC;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Invalid shift opcode!");
case MSP430::Shl8:
Opc = MSP430::SHL8r1;
@@ -1253,9 +1226,9 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
unsigned ShiftReg = RI.createVirtualRegister(RC);
unsigned ShiftReg2 = RI.createVirtualRegister(RC);
- unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
// BB:
// cmp 0, N
@@ -1291,14 +1264,14 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
.addReg(SrcReg).addMBB(BB)
.addReg(ShiftReg2).addMBB(LoopBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return RemBB;
}
-MachineBasicBlock*
-MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+MachineBasicBlock *
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
@@ -1306,7 +1279,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return EmitShiftInstr(MI, BB);
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
"Unexpected instr type to insert");
@@ -1340,8 +1313,8 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(copy1MBB);
BuildMI(BB, dl, TII.get(MSP430::JCC))
- .addMBB(copy1MBB)
- .addImm(MI->getOperand(3).getImm());
+ .addMBB(copy1MBB)
+ .addImm(MI.getOperand(3).getImm());
// copy0MBB:
// %FalseValue = ...
@@ -1355,11 +1328,12 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = copy1MBB;
- BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+ BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 2d63852c185b..8864807e999e 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -121,9 +121,10 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
- MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const override;
- MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ MachineBasicBlock *EmitShiftInstr(MachineInstr &MI,
MachineBasicBlock *BB) const;
private:
@@ -133,38 +134,34 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue LowerCCCArguments(SDValue Chain,
- CallingConv::ID CallConv,
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Base,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index d4f82bda1ec9..c834da3a11cd 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -78,18 +78,20 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RC == &MSP430::GR16RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
- .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+ .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+ .addImm(0).addMemOperand(MMO);
else if (RC == &MSP430::GR8RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
- .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+ .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+ .addImm(0).addMemOperand(MMO);
else
llvm_unreachable("Cannot store this register to stack slot!");
}
void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned Opc;
if (MSP430::GR16RegClass.contains(DestReg, SrcReg))
Opc = MSP430::MOV16rr;
@@ -156,18 +158,19 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
- if (!MI->isTerminator()) return false;
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator())
+ return false;
// Conditional branch is a special case.
- if (MI->isBranch() && !MI->isBarrier())
+ if (MI.isBranch() && !MI.isBarrier())
return true;
- if (!MI->isPredicable())
+ if (!MI.isPredicable())
return true;
return !isPredicated(MI);
}
-bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -182,7 +185,7 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
// Working from the bottom, when we see a non-terminator
// instruction, we're done.
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
break;
// A terminator that isn't a branch can't easily be handled
@@ -257,11 +260,11 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
}
-unsigned
-MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
@@ -290,8 +293,8 @@ MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
///
-unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MCInstrDesc &Desc = MI->getDesc();
+unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ const MCInstrDesc &Desc = MI.getDesc();
switch (Desc.TSFlags & MSP430II::SizeMask) {
default:
@@ -304,14 +307,14 @@ unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case TargetOpcode::DBG_VALUE:
return 0;
case TargetOpcode::INLINEASM: {
- const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineFunction *MF = MI.getParent()->getParent();
const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
- return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
+ return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
*MF->getTarget().getMCAsmInfo());
}
}
case MSP430II::SizeSpecial:
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unknown instruction size!");
case MSP430::SAR8r1c:
case MSP430::SAR16r1c:
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 3cf3b1bb8ab2..46d4738d89af 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -52,9 +52,8 @@ public:
///
const TargetRegisterInfo &getRegisterInfo() const { return RI; }
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -69,22 +68,21 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
// Branch folding goodness
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
-
+ const DebugLoc &DL) const override;
};
}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index ff2656d26dd2..1a00d85e01cb 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -18,8 +18,8 @@
#include "MSP430ISelLowering.h"
#include "MSP430InstrInfo.h"
#include "MSP430RegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -35,7 +35,7 @@ class MSP430Subtarget : public MSP430GenSubtargetInfo {
MSP430FrameLowering FrameLowering;
MSP430InstrInfo InstrInfo;
MSP430TargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
public:
/// This constructor initializes the data members to match that
@@ -60,7 +60,7 @@ public:
const MSP430TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
};
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 97a4047d1d63..b2e698ca5548 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -15,6 +15,7 @@
#include "MSP430.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
@@ -25,13 +26,20 @@ extern "C" void LLVMInitializeMSP430Target() {
RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
}
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::Static;
+ return *RM;
+}
+
MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS,
- Options, RM, CM, OL),
+ Options, getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<TargetLoweringObjectFileELF>()),
// FIXME: Check DataLayout string.
Subtarget(TT, CPU, FS, *this) {
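A minimal sketch of the mapping performed by the static getEffectiveRelocModel() helper added above; the expected values are read off this hunk only:

// No relocation model requested: MSP430 now defaults to static.
assert(getEffectiveRelocModel(None) == Reloc::Static);
// An explicit request is passed through unchanged.
assert(getEffectiveRelocModel(Reloc::PIC_) == Reloc::PIC_);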
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 4f955a8049c7..de8f06e71dee 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -30,7 +30,7 @@ class MSP430TargetMachine : public LLVMTargetMachine {
public:
MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~MSP430TargetMachine() override;
diff --git a/lib/Target/MSP430/Makefile b/lib/Target/MSP430/Makefile
deleted file mode 100644
index 82216edd81e4..000000000000
--- a/lib/Target/MSP430/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/MSP430/Makefile --------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMSP430CodeGen
-TARGET = MSP430
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = MSP430GenRegisterInfo.inc MSP430GenInstrInfo.inc \
- MSP430GenAsmWriter.inc \
- MSP430GenDAGISel.inc MSP430GenCallingConv.inc \
- MSP430GenSubtargetInfo.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/MSP430/TargetInfo/Makefile b/lib/Target/MSP430/TargetInfo/Makefile
deleted file mode 100644
index abb08f2548ee..000000000000
--- a/lib/Target/MSP430/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/MSP430/TargetInfo/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMSP430Info
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Makefile b/lib/Target/Makefile
deleted file mode 100644
index 50a360f1f868..000000000000
--- a/lib/Target/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-#===- lib/Target/Makefile ----------------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-LIBRARYNAME = LLVMTarget
-BUILD_ARCHIVE = 1
-
-# We include this early so we can access the value of TARGETS_TO_BUILD as the
-# value for PARALLEL_DIRS which must be set before Makefile.rules is included
-include $(LEVEL)/Makefile.config
-
-PARALLEL_DIRS := $(TARGETS_TO_BUILD)
-
-include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/Target/Mips/AsmParser/Makefile b/lib/Target/Mips/AsmParser/Makefile
deleted file mode 100644
index 679acee9fe72..000000000000
--- a/lib/Target/Mips/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Mips/AsmParser/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMipsAsmParser
-
-# Hack: we need to include 'main' mips target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index d4e061f00d3a..b51d0200b0b1 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -13,7 +13,6 @@
#include "MipsRegisterInfo.h"
#include "MipsTargetObjectFile.h"
#include "MipsTargetStreamer.h"
-#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
@@ -22,11 +21,13 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
@@ -157,7 +158,6 @@ class MipsAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseImm(OperandVector &Operands);
OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
OperandMatchResultTy parseInvNum(OperandVector &Operands);
- OperandMatchResultTy parseLSAImm(OperandVector &Operands);
OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
OperandMatchResultTy parseRegisterList(OperandVector &Operands);
@@ -173,74 +173,77 @@ class MipsAsmParser : public MCTargetAsmParser {
};
// Expands assembly pseudo instructions.
- MacroExpanderResultTy
- tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ MacroExpanderResultTy tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ MCStreamer &Out, const MCSubtargetInfo *STI);
bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset, bool Is32BitAddress,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions);
+ SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- void expandMemInst(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions, bool isLoad,
- bool isImmOpnd);
+ void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsLoad, bool IsImmOpnd);
- bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ void expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsImmOpnd);
- bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ void expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsImmOpnd);
- bool expandBranchImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandCondBranches(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandDiv(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions, const bool IsMips64,
- const bool Signed);
+ bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- bool expandUlw(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed);
- bool expandRotation(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
- bool expandRotationImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
- bool expandDRotation(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
- bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
- void createNop(bool hasShortDelaySlot, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- void createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg,
- bool Is64Bit, SmallVectorImpl<MCInst> &Instructions);
+ bool expandUlw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
- void createCpRestoreMemOp(bool IsLoad, int StackOffset, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandRotation(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
+ bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
bool reportParseError(Twine ErrorMsg);
bool reportParseError(SMLoc Loc, Twine ErrorMsg);
@@ -263,6 +266,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseDirectiveSet();
bool parseDirectiveOption();
bool parseInsnDirective();
+ bool parseSSectionDirective(StringRef Section, unsigned Type);
bool parseSetAtDirective();
bool parseSetNoAtDirective();
@@ -295,16 +299,12 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseInternalDirectiveReallowModule();
- MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
-
bool eatComma(StringRef ErrorStr);
int matchCPURegisterName(StringRef Symbol);
int matchHWRegsRegisterName(StringRef Symbol);
- int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
-
int matchFPURegisterName(StringRef Name);
int matchFCCRegisterName(StringRef Name);
@@ -317,15 +317,13 @@ class MipsAsmParser : public MCTargetAsmParser {
unsigned getReg(int RC, int RegNo);
- unsigned getGPR(int RegNo);
-
/// Returns the internal register number for the current AT. Also checks if
/// the current AT is unavailable (set to $0) and gives an error if it is.
/// This should be used in pseudo-instruction expansions which need AT.
unsigned getATReg(SMLoc Loc);
- bool processInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
// Helper function that checks if the value of a vector index is within the
// boundaries of accepted values for each RegisterKind
@@ -395,6 +393,8 @@ class MipsAsmParser : public MCTargetAsmParser {
public:
enum MipsMatchResultTy {
Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_RequiresDifferentOperands,
+ Match_RequiresNoZeroRegister,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "MipsGenAsmMatcher.inc"
#undef GET_OPERAND_DIAGNOSTIC_TYPES
@@ -427,13 +427,12 @@ public:
CurrentFn = nullptr;
- IsPicEnabled =
- (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_);
+ IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent();
IsCpRestoreSet = false;
CpRestoreOffset = -1;
- Triple TheTriple(sti.getTargetTriple());
+ const Triple &TheTriple = sti.getTargetTriple();
if ((TheTriple.getArch() == Triple::mips) ||
(TheTriple.getArch() == Triple::mips64))
IsLittleEndian = false;
@@ -584,7 +583,6 @@ private:
enum KindTy {
k_Immediate, /// An immediate (possibly involving symbol references)
k_Memory, /// Base + Offset Memory Address
- k_PhysRegister, /// A physical register from the Mips namespace
k_RegisterIndex, /// A register index in one or more RegKind.
k_Token, /// A simple token
k_RegList, /// A physical register list
@@ -604,10 +602,6 @@ private:
unsigned Length;
};
- struct PhysRegOp {
- unsigned Num; /// Register Number
- };
-
struct RegIdxOp {
unsigned Index; /// Index into the register class
RegKind Kind; /// Bitfield of the kinds it could possibly be
@@ -629,7 +623,6 @@ private:
union {
struct Token Tok;
- struct PhysRegOp PhysReg;
struct RegIdxOp RegIdx;
struct ImmOp Imm;
struct MemOp Mem;
@@ -938,6 +931,34 @@ public:
Inst.addOperand(MCOperand::createImm(Imm));
}
+ template <unsigned Bits>
+ void addSImmOperands(MCInst &Inst, unsigned N) const {
+ if (isImm() && !isConstantImm()) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ addConstantSImmOperands<Bits, 0, 0>(Inst, N);
+ }
+
+ template <unsigned Bits>
+ void addUImmOperands(MCInst &Inst, unsigned N) const {
+ if (isImm() && !isConstantImm()) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ addConstantUImmOperands<Bits, 0, 0>(Inst, N);
+ }
+
+ template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+ void addConstantSImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ int64_t Imm = getConstantImm() - Offset;
+ Imm = SignExtend64<Bits>(Imm);
+ Imm += Offset;
+ Imm += AdjustOffset;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ }
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCExpr *Expr = getImm();
@@ -973,9 +994,19 @@ public:
void addRegPairOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
+ assert((RegIdx.Kind & RegKind_GPR) && "Invalid access!");
unsigned RegNo = getRegPair();
- Inst.addOperand(MCOperand::createReg(RegNo++));
- Inst.addOperand(MCOperand::createReg(RegNo));
+ AsmParser.warnIfRegIndexIsAT(RegNo, StartLoc);
+ Inst.addOperand(MCOperand::createReg(
+ RegIdx.RegInfo->getRegClass(
+ AsmParser.getABI().AreGprs64bit()
+ ? Mips::GPR64RegClassID
+ : Mips::GPR32RegClassID).getRegister(RegNo++)));
+ Inst.addOperand(MCOperand::createReg(
+ RegIdx.RegInfo->getRegClass(
+ AsmParser.getABI().AreGprs64bit()
+ ? Mips::GPR64RegClassID
+ : Mips::GPR32RegClassID).getRegister(RegNo)));
}
void addMovePRegPairOperands(MCInst &Inst, unsigned N) const {
@@ -985,12 +1016,9 @@ public:
}
bool isReg() const override {
- // As a special case until we sort out the definition of div/divu, pretend
- // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
- if (isGPRAsmReg() && RegIdx.Index == 0)
- return true;
-
- return Kind == k_PhysRegister;
+ // As a special case until we sort out the definition of div/divu, accept
+ // $0/$zero here so that MCK_ZERO works correctly.
+ return isGPRAsmReg() && RegIdx.Index == 0;
}
bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const override { return Kind == k_Immediate; }
@@ -1003,8 +1031,23 @@ public:
template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
}
- template <unsigned Bits> bool isConstantSImm() const {
- return isConstantImm() && isInt<Bits>(getConstantImm());
+ template <unsigned Bits> bool isSImm() const {
+ return isConstantImm() ? isInt<Bits>(getConstantImm()) : isImm();
+ }
+ template <unsigned Bits> bool isUImm() const {
+ return isConstantImm() ? isUInt<Bits>(getConstantImm()) : isImm();
+ }
+ template <unsigned Bits> bool isAnyImm() const {
+ return isConstantImm() ? (isInt<Bits>(getConstantImm()) ||
+ isUInt<Bits>(getConstantImm()))
+ : isImm();
+ }
+ template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
+ return isConstantImm() && isInt<Bits>(getConstantImm() - Offset);
+ }
+ template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
+ return isConstantImm() && getConstantImm() >= Bottom &&
+ getConstantImm() <= Top;
}
bool isToken() const override {
// Note: It's not possible to pretend that other operand kinds are tokens.
@@ -1015,13 +1058,22 @@ public:
bool isConstantMemOff() const {
return isMem() && isa<MCConstantExpr>(getMemOff());
}
- template <unsigned Bits> bool isMemWithSimmOffset() const {
- return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
- && getMemBase()->isGPRAsmReg();
- }
- template <unsigned Bits> bool isMemWithSimmOffsetGPR() const {
- return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) &&
- getMemBase()->isGPRAsmReg();
+ // Allow relocation operators.
+ // FIXME: This predicate and others need to look through binary expressions
+ // and determine whether a Value is a constant or not.
+ template <unsigned Bits, unsigned ShiftAmount = 0>
+ bool isMemWithSimmOffset() const {
+ if (!isMem())
+ return false;
+ if (!getMemBase()->isGPRAsmReg())
+ return false;
+ if (isa<MCTargetExpr>(getMemOff()) ||
+ (isConstantMemOff() &&
+ isShiftedInt<Bits, ShiftAmount>(getConstantMemOff())))
+ return true;
+ MCValue Res;
+ bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
+ return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
@@ -1035,11 +1087,21 @@ public:
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::SP);
}
+ template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
+ return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
+ && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
+ && (getMemBase()->getGPR32Reg() == Mips::GP);
+ }
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledUImm() const {
return isConstantImm() &&
isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
}
+ template <unsigned Bits, unsigned ShiftLeftAmount>
+ bool isScaledSImm() const {
+ return isConstantImm() &&
+ isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm());
+ }
bool isRegList16() const {
if (!isRegList())
return false;
@@ -1086,7 +1148,15 @@ public:
(R0 == Mips::A0 && R1 == Mips::S6) ||
(R0 == Mips::A0 && R1 == Mips::A1) ||
(R0 == Mips::A0 && R1 == Mips::A2) ||
- (R0 == Mips::A0 && R1 == Mips::A3))
+ (R0 == Mips::A0 && R1 == Mips::A3) ||
+ (R0 == Mips::A1_64 && R1 == Mips::A2_64) ||
+ (R0 == Mips::A1_64 && R1 == Mips::A3_64) ||
+ (R0 == Mips::A2_64 && R1 == Mips::A3_64) ||
+ (R0 == Mips::A0_64 && R1 == Mips::S5_64) ||
+ (R0 == Mips::A0_64 && R1 == Mips::S6_64) ||
+ (R0 == Mips::A0_64 && R1 == Mips::A1_64) ||
+ (R0 == Mips::A0_64 && R1 == Mips::A2_64) ||
+ (R0 == Mips::A0_64 && R1 == Mips::A3_64))
return true;
return false;
@@ -1096,17 +1166,19 @@ public:
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
- bool isRegPair() const { return Kind == k_RegPair; }
+ bool isRegPair() const {
+ return Kind == k_RegPair && RegIdx.Index <= 30;
+ }
unsigned getReg() const override {
- // As a special case until we sort out the definition of div/divu, pretend
- // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
+ // As a special case until we sort out the definition of div/divu, accept
+ // $0/$zero here so that MCK_ZERO works correctly.
if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
RegIdx.Kind & RegKind_GPR)
return getGPR32Reg(); // FIXME: GPR64 too
- assert(Kind == k_PhysRegister && "Invalid access!");
- return PhysReg.Num;
+ llvm_unreachable("Invalid access!");
+ return 0;
}
const MCExpr *getImm() const {
@@ -1250,10 +1322,13 @@ public:
return Op;
}
- static std::unique_ptr<MipsOperand>
- CreateRegPair(unsigned RegNo, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand> CreateRegPair(const MipsOperand &MOP,
+ SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
auto Op = make_unique<MipsOperand>(k_RegPair, Parser);
- Op->RegIdx.Index = RegNo;
+ Op->RegIdx.Index = MOP.RegIdx.Index;
+ Op->RegIdx.RegInfo = MOP.RegIdx.RegInfo;
+ Op->RegIdx.Kind = MOP.RegIdx.Kind;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -1331,7 +1406,6 @@ public:
break;
case k_RegList:
delete RegList.List;
- case k_PhysRegister:
case k_RegisterIndex:
case k_Token:
case k_RegPair:
@@ -1353,9 +1427,6 @@ public:
OS << *Mem.Off;
OS << ">";
break;
- case k_PhysRegister:
- OS << "PhysReg<" << PhysReg.Num << ">";
- break;
case k_RegisterIndex:
OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ">";
break;
@@ -1434,83 +1505,10 @@ static unsigned countMCSymbolRefExpr(const MCExpr *Expr) {
return 0;
}
-namespace {
-void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createReg(Reg0));
- tmpInst.addOperand(Op1);
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, Instructions);
-}
-
-void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, Instructions);
-}
-
-void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createImm(Imm1));
- tmpInst.addOperand(MCOperand::createImm(Imm2));
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createReg(Reg0));
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createReg(Reg0));
- tmpInst.addOperand(MCOperand::createReg(Reg1));
- tmpInst.addOperand(Op2);
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc,
- Instructions);
-}
-
-void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc,
- Instructions);
-}
-
-void emitAppropriateDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- if (ShiftAmount >= 32) {
- emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc,
- Instructions);
- return;
- }
-
- emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, Instructions);
-}
-} // end anonymous namespace.
-
bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
bool ExpandedJalSym = false;
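The pattern in this and the following hunks replaces the SmallVectorImpl<MCInst> accumulator with direct emission through MipsTargetStreamer. A hedged sketch of what a simple expansion helper looks like under the new interface (expandExampleNop is an invented name used only for illustration):

bool MipsAsmParser::expandExampleNop(MCInst &Inst, SMLoc IDLoc,
                                     MCStreamer &Out,
                                     const MCSubtargetInfo *STI) {
  MipsTargetStreamer &TOut = getTargetStreamer();
  // Each emit* call builds one MCInst and hands it straight to the streamer,
  // so no local instruction buffer is needed any more.
  TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
  return false; // false means the expansion succeeded, as in the other helpers
}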
@@ -1560,6 +1558,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
case Mips::BLTZAL_MM:
case Mips::BC1F_MM:
case Mips::BC1T_MM:
+ case Mips::BC1EQZC_MMR6:
+ case Mips::BC1NEZC_MMR6:
+ case Mips::BC2EQZC_MMR6:
+ case Mips::BC2NEZC_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
@@ -1650,9 +1652,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode");
// FIXME: This is checking the expression can be handled by the later stages
- // of the assembler. We ought to leave it to those later stages but
- // we can't do that until we stop evaluateRelocExpr() rewriting the
- // expressions into non-equivalent forms.
+ // of the assembler. We ought to leave it to those later stages.
const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
// FIXME: Add support for label+offset operands (currently causes an error).
@@ -1666,33 +1666,38 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
// addiu $25, $25, 0
// R_(MICRO)MIPS_LO16 label
// jalr $25
- const MCExpr *Got16RelocExpr = evaluateRelocExpr(JalExpr, "got");
- const MCExpr *Lo16RelocExpr = evaluateRelocExpr(JalExpr, "lo");
-
- emitRRX(Mips::LW, Mips::T9, Mips::GP,
- MCOperand::createExpr(Got16RelocExpr), IDLoc, Instructions);
- emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
- MCOperand::createExpr(Lo16RelocExpr), IDLoc, Instructions);
+ const MCExpr *Got16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT, JalExpr, getContext());
+ const MCExpr *Lo16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext());
+
+ TOut.emitRRX(Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Got16RelocExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
+ MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI);
} else if (isABI_N32() || isABI_N64()) {
// If it's a local symbol and the N32/N64 ABIs are being used,
// we expand to:
// lw/ld $25, 0($gp)
// R_(MICRO)MIPS_GOT_DISP label
// jalr $25
- const MCExpr *GotDispRelocExpr = evaluateRelocExpr(JalExpr, "got_disp");
+ const MCExpr *GotDispRelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext());
- emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
- MCOperand::createExpr(GotDispRelocExpr), IDLoc, Instructions);
+ TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9,
+ Mips::GP, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
+ STI);
}
} else {
// If it's an external/weak symbol, we expand to:
// lw/ld $25, 0($gp)
// R_(MICRO)MIPS_CALL16 label
// jalr $25
- const MCExpr *Call16RelocExpr = evaluateRelocExpr(JalExpr, "call16");
+ const MCExpr *Call16RelocExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext());
- emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
- MCOperand::createExpr(Call16RelocExpr), IDLoc, Instructions);
+ TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Call16RelocExpr), IDLoc, STI);
}
MCInst JalrInst;
@@ -1723,7 +1728,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
int MemOffset = Op.getImm();
if (MemOffset < -32768 || MemOffset > 32767) {
// Offset can't exceed 16bit value.
- expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), true);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), true);
return false;
}
} else if (Op.isExpr()) {
@@ -1733,11 +1738,11 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
static_cast<const MCSymbolRefExpr *>(Expr);
if (SR->getKind() == MCSymbolRefExpr::VK_None) {
// Expand symbol.
- expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
return false;
}
} else if (!isEvaluated(Expr)) {
- expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
+ expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
return false;
}
}
@@ -1763,8 +1768,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
(BaseReg.getReg() == Mips::GP ||
BaseReg.getReg() == Mips::GP_64)) {
- emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
- IDLoc, Instructions);
+ TOut.emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
+ IDLoc, STI);
return false;
}
}
@@ -1780,14 +1785,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
switch (Inst.getOpcode()) {
default:
break;
- case Mips::ADDIUS5_MM:
- Opnd = Inst.getOperand(2);
- if (!Opnd.isImm())
- return Error(IDLoc, "expected immediate operand kind");
- Imm = Opnd.getImm();
- if (Imm < -8 || Imm > 7)
- return Error(IDLoc, "immediate operand value out of range");
- break;
case Mips::ADDIUSP_MM:
Opnd = Inst.getOperand(0);
if (!Opnd.isImm())
@@ -1823,16 +1820,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
((Imm % 4 == 0) && Imm < 28 && Imm > 0)))
return Error(IDLoc, "immediate operand value out of range");
break;
- case Mips::ADDIUR1SP_MM:
- Opnd = Inst.getOperand(1);
- if (!Opnd.isImm())
- return Error(IDLoc, "expected immediate operand kind");
- Imm = Opnd.getImm();
- if (OffsetToAlignment(Imm, 4LL))
- return Error(IDLoc, "misaligned immediate operand value");
- if (Imm < 0 || Imm > 255)
- return Error(IDLoc, "immediate operand value out of range");
- break;
case Mips::ANDI16_MM:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
@@ -1851,12 +1838,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
if (Imm < -1 || Imm > 14)
return Error(IDLoc, "immediate operand value out of range");
break;
- case Mips::TEQ_MM:
- case Mips::TGE_MM:
- case Mips::TGEU_MM:
- case Mips::TLT_MM:
- case Mips::TLTU_MM:
- case Mips::TNE_MM:
case Mips::SB16_MM:
case Mips::SB16_MMR6:
Opnd = Inst.getOperand(2);
@@ -1897,11 +1878,16 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
+ bool FillDelaySlot =
+ MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder();
+ if (FillDelaySlot)
+ TOut.emitDirectiveSetNoReorder();
+
MacroExpanderResultTy ExpandResult =
- tryExpandInstruction(Inst, IDLoc, Instructions);
+ tryExpandInstruction(Inst, IDLoc, Out, STI);
switch (ExpandResult) {
case MER_NotAMacro:
- Instructions.push_back(Inst);
+ Out.EmitInstruction(Inst, *STI);
break;
case MER_Success:
break;
@@ -1909,10 +1895,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return true;
}
+ // We know we emitted an instruction on the MER_NotAMacro or MER_Success path.
+ // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS.
+ if (inMicroMipsMode())
+ TOut.setUsesMicroMips();
+
// If this instruction has a delay slot and .set reorder is active,
// emit a NOP after it.
- if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
- createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions);
+ if (FillDelaySlot) {
+ TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc, STI);
+ TOut.emitDirectiveSetReorder();
+ }
if ((Inst.getOpcode() == Mips::JalOneReg ||
Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) &&
@@ -1922,16 +1915,11 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
// If .set reorder has been used, we've already emitted a NOP.
// If .set noreorder has been used, we need to emit a NOP at this point.
if (!AssemblerOptions.back()->isReorder())
- createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions);
+ TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc,
+ STI);
// Load the $gp from the stack.
- SmallVector<MCInst, 3> LoadInsts;
- createCpRestoreMemOp(true /*IsLoad*/, CpRestoreOffset /*StackOffset*/,
- IDLoc, LoadInsts);
-
- for (const MCInst &Inst : LoadInsts)
- Instructions.push_back(Inst);
-
+ TOut.emitGPRestore(CpRestoreOffset, IDLoc, STI);
} else
Warning(IDLoc, "no .cprestore used in PIC mode");
}
@@ -1940,17 +1928,15 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
MipsAsmParser::MacroExpanderResultTy
-MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
switch (Inst.getOpcode()) {
default:
return MER_NotAMacro;
case Mips::LoadImm32:
- return expandLoadImm(Inst, true, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandLoadImm(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LoadImm64:
- return expandLoadImm(Inst, false, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandLoadImm(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LoadAddrImm32:
case Mips::LoadAddrImm64:
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
@@ -1960,7 +1946,7 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister,
Inst.getOperand(1),
Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc,
- Instructions)
+ Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadAddrReg32:
@@ -1973,24 +1959,23 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
return expandLoadAddress(Inst.getOperand(0).getReg(),
Inst.getOperand(1).getReg(), Inst.getOperand(2),
Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc,
- Instructions)
+ Out, STI)
? MER_Fail
: MER_Success;
case Mips::B_MM_Pseudo:
case Mips::B_MMR6_Pseudo:
- return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandUncondBranchMMPseudo(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::SWM_MM:
case Mips::LWM_MM:
- return expandLoadStoreMultiple(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandLoadStoreMultiple(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::JalOneReg:
case Mips::JalTwoReg:
- return expandJalWithRegs(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BneImm:
case Mips::BeqImm:
- return expandBranchImm(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success;
+ return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BLT:
case Mips::BLE:
case Mips::BGE:
@@ -2023,29 +2008,36 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
case Mips::BLEULImmMacro:
case Mips::BGEULImmMacro:
case Mips::BGTULImmMacro:
- return expandCondBranches(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
- return expandDiv(Inst, IDLoc, Instructions, false, true) ? MER_Fail
- : MER_Success;
+ return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
+ : MER_Success;
case Mips::DSDivMacro:
- return expandDiv(Inst, IDLoc, Instructions, true, true) ? MER_Fail
- : MER_Success;
+ return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
+ : MER_Success;
case Mips::UDivMacro:
- return expandDiv(Inst, IDLoc, Instructions, false, false) ? MER_Fail
- : MER_Success;
+ return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
+ : MER_Success;
case Mips::DUDivMacro:
- return expandDiv(Inst, IDLoc, Instructions, true, false) ? MER_Fail
- : MER_Success;
+ return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
+ : MER_Success;
+ case Mips::PseudoTRUNC_W_S:
+ return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
+ case Mips::PseudoTRUNC_W_D32:
+ return expandTrunc(Inst, true, false, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
+ case Mips::PseudoTRUNC_W_D:
+ return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::Ulh:
- return expandUlh(Inst, true, IDLoc, Instructions) ? MER_Fail : MER_Success;
+ return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulhu:
- return expandUlh(Inst, false, IDLoc, Instructions) ? MER_Fail : MER_Success;
+ return expandUlh(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulw:
- return expandUlw(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success;
+ return expandUlw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::NORImm:
- return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ADDi:
case Mips::ADDiu:
case Mips::SLTi:
@@ -2055,8 +2047,8 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
int64_t ImmValue = Inst.getOperand(2).getImm();
if (isInt<16>(ImmValue))
return MER_NotAMacro;
- return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
}
return MER_NotAMacro;
case Mips::ANDi:
@@ -2067,31 +2059,32 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
int64_t ImmValue = Inst.getOperand(2).getImm();
if (isUInt<16>(ImmValue))
return MER_NotAMacro;
- return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
}
return MER_NotAMacro;
case Mips::ROL:
case Mips::ROR:
- return expandRotation(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ROLImm:
case Mips::RORImm:
- return expandRotationImm(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::DROL:
case Mips::DROR:
- return expandDRotation(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandDRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::DROLImm:
case Mips::DRORImm:
- return expandDRotationImm(Inst, IDLoc, Instructions) ? MER_Fail
- : MER_Success;
+ return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::ABSMacro:
+ return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
}
}
bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
// Create a JALR instruction which is going to replace the pseudo-JAL.
MCInst JalrInst;
JalrInst.setLoc(IDLoc);
@@ -2121,14 +2114,14 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
const MCOperand SecondRegOp = Inst.getOperand(1);
JalrInst.addOperand(SecondRegOp);
}
- Instructions.push_back(JalrInst);
+ Out.EmitInstruction(JalrInst, *STI);
// If .set reorder is active and branch instruction has a delay slot,
// emit a NOP after it.
const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode());
- if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) {
- createNop(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc, Instructions);
- }
+ if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
+ TOut.emitEmptyDelaySlot(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc,
+ STI);
return false;
}
@@ -2150,11 +2143,12 @@ template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
/// @param IsAddress True if the immediate represents an address. False if it
/// is an integer.
/// @param IDLoc Location of the immediate in the source file.
-/// @param Instructions The instructions emitted by this expansion.
bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
unsigned SrcReg, bool Is32BitImm,
- bool IsAddress, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ bool IsAddress, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
if (!Is32BitImm && !isGP64bit()) {
Error(IDLoc, "instruction requires a 64-bit architecture");
return true;
@@ -2180,7 +2174,8 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
UseSrcReg = true;
unsigned TmpReg = DstReg;
- if (UseSrcReg && (DstReg == SrcReg)) {
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
// At this point we need AT to perform the expansions and we exit if it is
// not available.
unsigned ATReg = getATReg(IDLoc);
@@ -2197,11 +2192,11 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
// traditional assembler behaviour. N32 would normally use addiu for both
// integers and addresses.
if (IsAddress && !Is32BitImm) {
- emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions);
+ TOut.emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
return false;
}
- emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
return false;
}
@@ -2213,9 +2208,9 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
return true;
}
- emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, STI);
if (UseSrcReg)
- emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
@@ -2229,29 +2224,29 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
// Traditional behaviour seems to special case this particular value. It's
// not clear why other masks are handled differently.
if (ImmValue == 0xffffffff) {
- emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, Instructions);
- emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, Instructions);
+ TOut.emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, STI);
+ TOut.emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, STI);
if (UseSrcReg)
- emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
// Expand to an ORi instead of a LUi to avoid sign-extending into the
// upper 32 bits.
- emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, Instructions);
- emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI);
if (Bits15To0)
- emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
if (UseSrcReg)
- emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
- emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions);
+ TOut.emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, STI);
if (Bits15To0)
- emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
if (UseSrcReg)
- emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
@@ -2267,11 +2262,11 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
unsigned LastSet = findLastSet((uint64_t)ImmValue);
unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet));
uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff;
- emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, Instructions);
- emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, STI);
if (UseSrcReg)
- emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
@@ -2284,7 +2279,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
// Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
- IDLoc, Instructions))
+ IDLoc, Out, STI))
return false;
// Shift and accumulate into the register. If a 16-bit chunk is zero, then
@@ -2294,9 +2289,8 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff;
if (ImmChunk != 0) {
- emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc,
- Instructions);
- emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, Instructions);
+ TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
+ TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, STI);
ShiftCarriedForwards = 0;
}
@@ -2306,24 +2300,23 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
// Finish any remaining shifts left by trailing zeros.
if (ShiftCarriedForwards)
- emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc,
- Instructions);
+ TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
if (UseSrcReg)
- emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out, const MCSubtargetInfo *STI) {
const MCOperand &ImmOp = Inst.getOperand(1);
assert(ImmOp.isImm() && "expected immediate operand kind");
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
- Is32BitImm, false, IDLoc, Instructions))
+ Is32BitImm, false, IDLoc, Out, STI))
return true;
return false;
@@ -2332,7 +2325,8 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset,
bool Is32BitAddress, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
// la can't produce a usable address when addresses are 64-bit.
if (Is32BitAddress && ABI.ArePtrs64bit()) {
// FIXME: Demote this to a warning and continue as if we had 'dla' instead.
@@ -2344,31 +2338,109 @@ bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
}
// dla requires 64-bit addresses.
- if (!Is32BitAddress && !ABI.ArePtrs64bit()) {
+ if (!Is32BitAddress && !hasMips3()) {
Error(IDLoc, "instruction requires a 64-bit architecture");
return true;
}
if (!Offset.isImm())
return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg,
- Is32BitAddress, IDLoc, Instructions);
+ Is32BitAddress, IDLoc, Out, STI);
+
+ if (!ABI.ArePtrs64bit()) {
+ // Continue as if we had 'la' whether we had 'la' or 'dla'.
+ Is32BitAddress = true;
+ }
return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true,
- IDLoc, Instructions);
+ IDLoc, Out, STI);
}
-bool MipsAsmParser::loadAndAddSymbolAddress(
- const MCExpr *SymExpr, unsigned DstReg, unsigned SrcReg, bool Is32BitSym,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
+ unsigned DstReg, unsigned SrcReg,
+ bool Is32BitSym, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ bool UseSrcReg = SrcReg != Mips::NoRegister;
warnIfNoMacro(IDLoc);
- const MCExpr *Symbol = cast<MCExpr>(SymExpr);
- const MipsMCExpr *HiExpr = MipsMCExpr::create(
- MCSymbolRefExpr::VK_Mips_ABS_HI, Symbol, getContext());
- const MipsMCExpr *LoExpr = MipsMCExpr::create(
- MCSymbolRefExpr::VK_Mips_ABS_LO, Symbol, getContext());
+ if (inPicMode() && ABI.IsO32()) {
+ MCValue Res;
+ if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ Error(IDLoc, "expected relocatable expression");
+ return true;
+ }
+ if (Res.getSymB() != nullptr) {
+ Error(IDLoc, "expected relocatable expression with only one symbol");
+ return true;
+ }
+
+ // The case where the result register is $25 is somewhat special. If the
+ // symbol in the final relocation is external and not modified with a
+ // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
+ if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
+ Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() &&
+ !Res.getSymA()->getSymbol().isTemporary()) {
+ const MCExpr *CallExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+ TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(CallExpr), IDLoc, STI);
+ return false;
+ }
- bool UseSrcReg = SrcReg != Mips::NoRegister;
+ // The remaining cases are:
+ // External GOT: lw $tmp, %got(symbol+offset)($gp)
+ // >addiu $tmp, $tmp, %lo(offset)
+ // >addiu $rd, $tmp, $rs
+ // Local GOT: lw $tmp, %got(symbol+offset)($gp)
+ // addiu $tmp, $tmp, %lo(symbol+offset)($gp)
+ // >addiu $rd, $tmp, $rs
+ // The addiu's marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ const MipsMCExpr *GotExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
+ const MCExpr *LoExpr = nullptr;
+ if (Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary())
+ LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+ else if (Res.getConstant() != 0) {
+ // External symbols fully resolve the symbol with just the %got(symbol)
+ // but we must still account for any offset to the symbol for expressions
+ // like symbol+8.
+ LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
+ }
+
+ unsigned TmpReg = DstReg;
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+ SrcReg)) {
+ // If $rs is the same as $rd, we need to use AT.
+ // If it is not available we exit.
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ TmpReg = ATReg;
+ }
+
+ TOut.emitRRX(Mips::LW, TmpReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(GotExpr), IDLoc, STI);
+
+ if (LoExpr)
+ TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+ return false;
+ }
+
+ const MipsMCExpr *HiExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
// This is the 64-bit symbol address expansion.
if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -2378,12 +2450,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
if (!ATReg)
return true;
- const MipsMCExpr *HighestExpr = MipsMCExpr::create(
- MCSymbolRefExpr::VK_Mips_HIGHEST, Symbol, getContext());
- const MipsMCExpr *HigherExpr = MipsMCExpr::create(
- MCSymbolRefExpr::VK_Mips_HIGHER, Symbol, getContext());
+ const MipsMCExpr *HighestExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
+ const MipsMCExpr *HigherExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
- if (UseSrcReg && (DstReg == SrcReg)) {
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+ SrcReg)) {
// If $rs is the same as $rd:
// (d)la $rd, sym($rd) => lui $at, %highest(sym)
// daddiu $at, $at, %higher(sym)
@@ -2392,17 +2466,17 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
// dsll $at, $at, 16
// daddiu $at, $at, %lo(sym)
// daddu $rd, $at, $rd
- emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
- Instructions);
- emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr),
- IDLoc, Instructions);
- emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions);
- emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc,
- Instructions);
- emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions);
- emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc,
- Instructions);
- emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
+ IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+ TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
return false;
}
@@ -2415,18 +2489,17 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
// dsll32 $rd, $rd, 0
// daddu $rd, $rd, $at
// (daddu $rd, $rd, $rs)
- emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
- Instructions);
- emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc,
- Instructions);
- emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr),
- IDLoc, Instructions);
- emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc,
- Instructions);
- emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, Instructions);
- emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, Instructions);
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
if (UseSrcReg)
- emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
return false;
}
@@ -2441,7 +2514,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
// ori $rd, $rd, %lo(sym)
// (addu $rd, $rd, $rs)
unsigned TmpReg = DstReg;
- if (UseSrcReg && (DstReg == SrcReg)) {
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
// If $rs is the same as $rd, we need to use AT.
// If it is not available we exit.
unsigned ATReg = getATReg(IDLoc);
@@ -2450,20 +2524,24 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
TmpReg = ATReg;
}
- emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, Instructions);
- emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc,
- Instructions);
+ TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
if (UseSrcReg)
- emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
else
- assert(DstReg == TmpReg);
+ assert(
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, TmpReg));
return false;
}
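
The %highest/%higher/%hi/%lo pieces driven through LUi/DADDiu/DSLL above are the usual carry-adjusted 16-bit slices of a 64-bit address (the same formulas the old evaluateRelocExpr further down used for constants). A minimal standalone sketch of that arithmetic, using a hypothetical symbol address and plain host C++ rather than the MC emitter calls:

#include <cassert>
#include <cstdint>

// Carry-adjusted 16-bit pieces of a 64-bit address.
static int16_t lo16(uint64_t V)      { return int16_t(V & 0xffff); }
static int16_t hi16(uint64_t V)      { return int16_t(((V + 0x8000) >> 16) & 0xffff); }
static int16_t higher16(uint64_t V)  { return int16_t(((V + 0x80008000ULL) >> 32) & 0xffff); }
static int16_t highest16(uint64_t V) { return int16_t(((V + 0x800080008000ULL) >> 48) & 0xffff); }

int main() {
  uint64_t Sym = 0x123456789abcdef0ULL; // hypothetical symbol address
  // Mirror lui / daddiu / dsll 16 / daddiu / dsll 16 / daddiu: each daddiu
  // adds a sign-extended 16-bit piece, which the +0x8000 rounding above
  // compensates for.
  uint64_t At = uint64_t(uint16_t(highest16(Sym))) << 16; // lui
  At += int64_t(higher16(Sym));                           // daddiu %higher
  At <<= 16;                                              // dsll 16
  At += int64_t(hi16(Sym));                               // daddiu %hi
  At <<= 16;                                              // dsll 16
  At += int64_t(lo16(Sym));                               // daddiu %lo
  assert(At == Sym);
  return 0;
}
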
-bool MipsAsmParser::expandUncondBranchMMPseudo(
- MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 &&
"unexpected number of operands");
@@ -2493,19 +2571,20 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(
Inst.addOperand(MCOperand::createImm(Offset.getImm()));
}
}
- Instructions.push_back(Inst);
+ Out.EmitInstruction(Inst, *STI);
// If .set reorder is active and branch instruction has a delay slot,
// emit a NOP after it.
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
- createNop(true, IDLoc, Instructions);
+ TOut.emitEmptyDelaySlot(true, IDLoc, STI);
return false;
}
-bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
@@ -2513,7 +2592,8 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
assert(ImmOp.isImm() && "expected immediate operand kind");
const MCOperand &MemOffsetOp = Inst.getOperand(2);
- assert(MemOffsetOp.isImm() && "expected immediate operand kind");
+ assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
+ "expected immediate or expression operand");
unsigned OpCode = 0;
switch(Inst.getOpcode()) {
@@ -2530,8 +2610,8 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
int64_t ImmValue = ImmOp.getImm();
if (ImmValue == 0)
- emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
- Instructions);
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+ STI);
else {
warnIfNoMacro(IDLoc);
@@ -2540,94 +2620,112 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
return true;
if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true,
- IDLoc, Instructions))
+ IDLoc, Out, STI))
return true;
- emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, Instructions);
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
}
return false;
}
-void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions,
- bool isLoad, bool isImmOpnd) {
- unsigned ImmOffset, HiOffset, LoOffset;
- const MCExpr *ExprOffset;
- unsigned TmpRegNum;
- // 1st operand is either the source or destination register.
- assert(Inst.getOperand(0).isReg() && "expected register operand kind");
- unsigned RegOpNum = Inst.getOperand(0).getReg();
- // 2nd operand is the base register.
- assert(Inst.getOperand(1).isReg() && "expected register operand kind");
- unsigned BaseRegNum = Inst.getOperand(1).getReg();
- // 3rd operand is either an immediate or expression.
- if (isImmOpnd) {
- assert(Inst.getOperand(2).isImm() && "expected immediate operand kind");
- ImmOffset = Inst.getOperand(2).getImm();
- LoOffset = ImmOffset & 0x0000ffff;
- HiOffset = (ImmOffset & 0xffff0000) >> 16;
- // If msb of LoOffset is 1(negative number) we must increment HiOffset.
- if (LoOffset & 0x8000)
- HiOffset++;
- } else
- ExprOffset = Inst.getOperand(2).getExpr();
- // These are some of the types of expansions we perform here:
- // 1) lw $8, sym => lui $8, %hi(sym)
- // lw $8, %lo(sym)($8)
- // 2) lw $8, offset($9) => lui $8, %hi(offset)
- // add $8, $8, $9
- // lw $8, %lo(offset)($9)
- // 3) lw $8, offset($8) => lui $at, %hi(offset)
- // add $at, $at, $8
- // lw $8, %lo(offset)($at)
- // 4) sw $8, sym => lui $at, %hi(sym)
- // sw $8, %lo(sym)($at)
- // 5) sw $8, offset($8) => lui $at, %hi(offset)
- // add $at, $at, $8
- // sw $8, %lo(offset)($at)
- // 6) ldc1 $f0, sym => lui $at, %hi(sym)
- // ldc1 $f0, %lo(sym)($at)
- //
- // For load instructions we can use the destination register as a temporary
- // if base and dst are different (examples 1 and 2) and if the base register
- // is general purpose otherwise we must use $at (example 6) and error if it's
- // not available. For stores we must use $at (examples 4 and 5) because we
- // must not clobber the source register setting up the offset.
+void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsLoad,
+ bool IsImmOpnd) {
+ if (IsLoad) {
+ expandLoadInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+ return;
+ }
+ expandStoreInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+}
+
+void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, bool IsImmOpnd) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned BaseReg = Inst.getOperand(1).getReg();
+
const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
- int16_t RegClassOp0 = Desc.OpInfo[0].RegClass;
- unsigned RegClassIDOp0 =
- getContext().getRegisterInfo()->getRegClass(RegClassOp0).getID();
- bool IsGPR = (RegClassIDOp0 == Mips::GPR32RegClassID) ||
- (RegClassIDOp0 == Mips::GPR64RegClassID);
- if (isLoad && IsGPR && (BaseRegNum != RegOpNum))
- TmpRegNum = RegOpNum;
- else {
+ int16_t DstRegClass = Desc.OpInfo[0].RegClass;
+ unsigned DstRegClassID =
+ getContext().getRegisterInfo()->getRegClass(DstRegClass).getID();
+ bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) ||
+ (DstRegClassID == Mips::GPR64RegClassID);
+
+ if (IsImmOpnd) {
+ // Try to use DstReg as the temporary.
+ if (IsGPR && (BaseReg != DstReg)) {
+ TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
+ Inst.getOperand(2).getImm(), DstReg, IDLoc,
+ STI);
+ return;
+ }
+
// At this point we need AT to perform the expansions and we exit if it is
// not available.
- TmpRegNum = getATReg(IDLoc);
- if (!TmpRegNum)
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
return;
+
+ TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
+ Inst.getOperand(2).getImm(), ATReg, IDLoc, STI);
+ return;
}
- emitRX(Mips::LUi, TmpRegNum,
- isImmOpnd ? MCOperand::createImm(HiOffset)
- : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "hi")),
- IDLoc, Instructions);
- // Add temp register to base.
- if (BaseRegNum != Mips::ZERO)
- emitRRR(Mips::ADDu, TmpRegNum, TmpRegNum, BaseRegNum, IDLoc, Instructions);
- // And finally, create original instruction with low part
- // of offset and new base.
- emitRRX(Inst.getOpcode(), RegOpNum, TmpRegNum,
- isImmOpnd
- ? MCOperand::createImm(LoOffset)
- : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "lo")),
- IDLoc, Instructions);
+ const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+ // Try to use DstReg as the temporary.
+ if (IsGPR && (BaseReg != DstReg)) {
+ TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, DstReg, IDLoc, STI);
+ return;
+ }
+
+ // At this point we need AT to perform the expansions and we exit if it is
+ // not available.
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return;
+
+ TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, ATReg, IDLoc, STI);
}
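
For the immediate form, emitLoadWithImmOffset takes over the split the deleted code computed by hand: the memory instruction sign-extends its 16-bit offset, so the lui half needs a +1 carry whenever bit 15 of the offset is set. A small sketch of that split, with a hypothetical offset value:

#include <cassert>
#include <cstdint>

// Split a 32-bit offset into lui/low-offset halves; the load sign-extends
// the low half, so the high half carries when bit 15 is set.
static void splitOffset(uint32_t Off, uint16_t &Hi, int16_t &Lo) {
  Lo = int16_t(Off & 0xffff);
  Hi = uint16_t(Off >> 16);
  if (Off & 0x8000) // low half will read back as a negative value
    ++Hi;
}

int main() {
  uint16_t Hi;
  int16_t Lo;
  splitOffset(0x12348000u, Hi, Lo);
  // lui $tmp, Hi ; lw $rt, Lo($tmp [+ base]) sees the original offset again.
  assert((uint32_t(Hi) << 16) + uint32_t(int32_t(Lo)) == 0x12348000u);
  return 0;
}
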
-bool
-MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+void MipsAsmParser::expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI,
+ bool IsImmOpnd) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ unsigned SrcReg = Inst.getOperand(0).getReg();
+ unsigned BaseReg = Inst.getOperand(1).getReg();
+
+ if (IsImmOpnd) {
+ TOut.emitStoreWithImmOffset(Inst.getOpcode(), SrcReg, BaseReg,
+ Inst.getOperand(2).getImm(),
+ [&]() { return getATReg(IDLoc); }, IDLoc, STI);
+ return;
+ }
+
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return;
+
+ const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+ TOut.emitStoreWithSymOffset(Inst.getOpcode(), SrcReg, BaseReg, HiOperand,
+ LoOperand, ATReg, IDLoc, STI);
+}
+
+bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
unsigned OpNum = Inst.getNumOperands();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
@@ -2650,12 +2748,14 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
}
Inst.setOpcode(NewOpcode);
- Instructions.push_back(Inst);
+ Out.EmitInstruction(Inst, *STI);
return false;
}
bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
bool EmittedNoMacroWarning = false;
unsigned PseudoOpcode = Inst.getOpcode();
unsigned SrcReg = Inst.getOperand(0).getReg();
@@ -2730,7 +2830,7 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
}
if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
- false, IDLoc, Instructions))
+ false, IDLoc, Out, STI))
return true;
}
@@ -2790,37 +2890,37 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// with GAS' behaviour. However, they may not generate the most efficient
// code in some circumstances.
if (PseudoOpcode == Mips::BLT) {
- emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
- Instructions);
+ TOut.emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
return false;
}
if (PseudoOpcode == Mips::BLE) {
- emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
- Instructions);
+ TOut.emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGE) {
- emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
- Instructions);
+ TOut.emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGT) {
- emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
- Instructions);
+ TOut.emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+ IDLoc, STI);
return false;
}
if (PseudoOpcode == Mips::BGTU) {
- emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
- MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
+ TOut.emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
if (AcceptsEquality) {
// If both registers are $0 and the pseudo-branch accepts equality, it
// will always be taken, so we emit an unconditional branch.
- emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
- MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
+ TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
@@ -2844,8 +2944,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// the pseudo-branch will always be taken, so we emit an unconditional
// branch.
// This only applies to unsigned pseudo-branches.
- emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
- MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
+ TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
@@ -2862,17 +2962,17 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
//
// Because only BLEU and BGEU branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
- emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
- IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
- MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
+ TOut.emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
+ IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
// If we have a signed pseudo-branch and one of the registers is $0,
// we can use an appropriate compare-to-zero branch. We select which one
// to use in the switch statement above.
- emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
- IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr),
- IDLoc, Instructions);
+ TOut.emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
+ IsSrcRegZero ? TrgReg : SrcReg,
+ MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
@@ -2900,32 +3000,33 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
//
// The same applies to the unsigned variants, except that SLTu is used
// instead of SLT.
- emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
- ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg,
- IDLoc, Instructions);
-
- emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
- : (AcceptsEquality ? Mips::BEQ : Mips::BNE),
- ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
- Instructions);
+ TOut.emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
+ ReverseOrderSLT ? TrgReg : SrcReg,
+ ReverseOrderSLT ? SrcReg : TrgReg, IDLoc, STI);
+
+ TOut.emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
+ : (AcceptsEquality ? Mips::BEQ : Mips::BNE),
+ ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ STI);
return false;
}
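
The SLT/SLTu-plus-branch pattern above relies on the usual comparison identities for the two-register pseudo-branches; a standalone model of the signed case (the unsigned variants substitute SLTu), with hypothetical wrapper names:

#include <cassert>
#include <cstdint>

static int slt(int32_t A, int32_t B) { return A < B ? 1 : 0; }

// blt rs, rt  ->  slt $at, rs, rt ; bne $at, $zero, offset
static bool blt(int32_t Rs, int32_t Rt) { return slt(Rs, Rt) != 0; }
// bge rs, rt  ->  slt $at, rs, rt ; beq $at, $zero, offset
static bool bge(int32_t Rs, int32_t Rt) { return slt(Rs, Rt) == 0; }
// bgt rs, rt  ->  slt $at, rt, rs ; bne $at, $zero, offset  (operands swapped)
static bool bgt(int32_t Rs, int32_t Rt) { return slt(Rt, Rs) != 0; }
// ble rs, rt  ->  slt $at, rt, rs ; beq $at, $zero, offset  (operands swapped)
static bool ble(int32_t Rs, int32_t Rt) { return slt(Rt, Rs) == 0; }

int main() {
  const int32_t Vals[] = {-5, 0, 7};
  for (int32_t A : Vals)
    for (int32_t B : Vals) {
      assert(blt(A, B) == (A < B));
      assert(bge(A, B) == (A >= B));
      assert(bgt(A, B) == (A > B));
      assert(ble(A, B) == (A <= B));
    }
  return 0;
}
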
-bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions,
- const bool IsMips64, const bool Signed) {
- if (hasMips32r6()) {
- Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
- return false;
- }
+bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI, const bool IsMips64,
+ const bool Signed) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
warnIfNoMacro(IDLoc);
- const MCOperand &RsRegOp = Inst.getOperand(0);
+ const MCOperand &RdRegOp = Inst.getOperand(0);
+ assert(RdRegOp.isReg() && "expected register operand kind");
+ unsigned RdReg = RdRegOp.getReg();
+
+ const MCOperand &RsRegOp = Inst.getOperand(1);
assert(RsRegOp.isReg() && "expected register operand kind");
unsigned RsReg = RsRegOp.getReg();
- const MCOperand &RtRegOp = Inst.getOperand(1);
+ const MCOperand &RtRegOp = Inst.getOperand(2);
assert(RtRegOp.isReg() && "expected register operand kind");
unsigned RtReg = RtRegOp.getReg();
unsigned DivOp;
@@ -2947,15 +3048,15 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
if (IsMips64) {
if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
if (UseTraps) {
- emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
return false;
}
- emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
return false;
}
} else {
- emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions);
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
return false;
}
}
@@ -2964,11 +3065,11 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
Warning(IDLoc, "division by zero");
if (Signed) {
if (UseTraps) {
- emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
return false;
}
- emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
return false;
}
}
@@ -2980,22 +3081,21 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
if (UseTraps) {
BranchTarget = IsMips64 ? 12 : 8;
- emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
} else {
BranchTarget = IsMips64 ? 20 : 16;
BranchTargetNoTraps = 8;
// Branch to the li instruction.
- emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc,
- Instructions);
+ TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
}
- emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions);
+ TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
if (!UseTraps)
- emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
if (!Signed) {
- emitR(Mips::MFLO, RsReg, IDLoc, Instructions);
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
@@ -3003,32 +3103,73 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
if (!ATReg)
return true;
- emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, Instructions);
+ TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
if (IsMips64) {
// Branch to the mflo instruction.
- emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions);
- emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, Instructions);
- emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, Instructions);
+ TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+ TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
} else {
// Branch to the mflo instruction.
- emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions);
- emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, Instructions);
+ TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+ TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
if (UseTraps)
- emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, Instructions);
+ TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
else {
// Branch to the mflo instruction.
- emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, Instructions);
- emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, Instructions);
- emitII(Mips::BREAK, 0x6, 0, IDLoc, Instructions);
+ TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
+ }
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+ return false;
+}
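
What the guard code around the hardware div amounts to, modeled as a checked signed division; the real expansion raises break/teq with code 7 for a zero divisor and code 6 for the INT_MIN / -1 overflow, and the BranchTarget constants simply skip over those guards:

#include <assert.h>
#include <cstdint>

// Status values stand in for the break/teq traps the expansion emits.
enum DivStatus { Ok, DivByZero, Overflow };

static DivStatus checkedSDiv(int32_t Rs, int32_t Rt, int32_t &Quot) {
  if (Rt == 0)
    return DivByZero;                 // bne/teq guard, then break/teq code 7
  if (Rt == -1 && Rs == INT32_MIN)
    return Overflow;                  // addiu $at, $zero, -1 ; guard ; code 6
  Quot = Rs / Rt;                     // div $rs, $rt ; mflo $rd
  return Ok;
}

int main() {
  int32_t Q = 0;
  assert(checkedSDiv(10, 3, Q) == Ok && Q == 3);
  assert(checkedSDiv(10, 0, Q) == DivByZero);
  assert(checkedSDiv(INT32_MIN, -1, Q) == Overflow);
  return 0;
}
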
+
+bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU,
+ SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ unsigned SecondReg = Inst.getOperand(1).getReg();
+ unsigned ThirdReg = Inst.getOperand(2).getReg();
+
+ if (hasMips1() && !hasMips2()) {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
+ TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitRRI(Mips::ORi, ATReg, ThirdReg, 0x3, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, ATReg, ATReg, 0x2, IDLoc, STI);
+ TOut.emitRR(Mips::CTC1, Mips::RA, ATReg, IDLoc, STI);
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitRR(IsDouble ? (Is64FPU ? Mips::CVT_W_D64 : Mips::CVT_W_D32)
+ : Mips::CVT_W_S,
+ FirstReg, SecondReg, IDLoc, STI);
+ TOut.emitRR(Mips::CTC1, Mips::RA, ThirdReg, IDLoc, STI);
+ TOut.emitNop(IDLoc, STI);
+ return false;
}
- emitR(Mips::MFLO, RsReg, IDLoc, Instructions);
+
+ TOut.emitRR(IsDouble ? (Is64FPU ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32)
+ : Mips::TRUNC_W_S,
+ FirstReg, SecondReg, IDLoc, STI);
+
return false;
}
bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out, const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
if (hasMips32r6() || hasMips64r6()) {
Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
return false;
@@ -3063,7 +3204,7 @@ bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
LoadedOffsetInAT = true;
if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(),
- true, IDLoc, Instructions))
+ true, IDLoc, Out, STI))
return true;
// NOTE: We do this (D)ADDu here instead of doing it in loadImmediate()
@@ -3073,7 +3214,7 @@ bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
// NOTE: If there is no source register specified in the ULHU, the parser
// will interpret it as $0.
if (SrcReg != Mips::ZERO && SrcReg != Mips::ZERO_64)
- createAddu(ATReg, ATReg, SrcReg, ABI.ArePtrs64bit(), Instructions);
+ TOut.emitAddu(ATReg, ATReg, SrcReg, ABI.ArePtrs64bit(), STI);
}
unsigned FirstLbuDstReg = LoadedOffsetInAT ? DstReg : ATReg;
@@ -3091,21 +3232,23 @@ bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
unsigned SllReg = LoadedOffsetInAT ? DstReg : ATReg;
- emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
- FirstLbuOffset, IDLoc, Instructions);
+ TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
+ FirstLbuOffset, IDLoc, STI);
- emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc,
- Instructions);
+ TOut.emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc,
+ STI);
- emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, Instructions);
+ TOut.emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, STI);
- emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, Instructions);
+ TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
return false;
}
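
The ulh/ulhu expansion above is two byte loads, a shift and an OR; the same reassembly on a host buffer, assuming little-endian byte order and an unaligned pointer (ulh would use lb for the sign-carrying byte instead of lbu):

#include <assert.h>
#include <cstdint>

static uint16_t ulhu(const uint8_t *P) {
  uint32_t LoByte = P[0];                  // lbu $at, 0($base)
  uint32_t HiByte = P[1];                  // lbu $rd, 1($base)
  return uint16_t((HiByte << 8) | LoByte); // sll $rd, $rd, 8 ; or $rd, $rd, $at
}

int main() {
  const uint8_t Buf[] = {0x00, 0xf0, 0xde, 0x00}; // 0xdef0 stored at odd offset 1
  assert(ulhu(Buf + 1) == 0xdef0);
  return 0;
}
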
-bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
if (hasMips32r6() || hasMips64r6()) {
Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
return false;
@@ -3137,7 +3280,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
warnIfNoMacro(IDLoc);
if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(),
- true, IDLoc, Instructions))
+ true, IDLoc, Out, STI))
return true;
// NOTE: We do this (D)ADDu here instead of doing it in loadImmediate()
@@ -3147,7 +3290,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
// NOTE: If there is no source register specified in the ULW, the parser
// will interpret it as $0.
if (SrcReg != Mips::ZERO && SrcReg != Mips::ZERO_64)
- createAddu(ATReg, ATReg, SrcReg, ABI.ArePtrs64bit(), Instructions);
+ TOut.emitAddu(ATReg, ATReg, SrcReg, ABI.ArePtrs64bit(), STI);
}
unsigned FinalSrcReg = LoadedOffsetInAT ? ATReg : SrcReg;
@@ -3160,17 +3303,19 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
RightLoadOffset = LoadedOffsetInAT ? 3 : (OffsetValue + 3);
}
- emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc,
- Instructions);
+ TOut.emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc,
+ STI);
- emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset, IDLoc,
- Instructions);
+ TOut.emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset,
+ IDLoc, STI);
return false;
}
bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
assert (Inst.getNumOperands() == 3 && "Invalid operand count");
assert (Inst.getOperand(0).isReg() &&
@@ -3195,7 +3340,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
DstReg = ATReg;
}
- if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Instructions)) {
+ if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Out, STI)) {
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
@@ -3226,17 +3371,17 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
}
if (FinalDstReg == Mips::NoRegister)
- emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, Instructions);
+ TOut.emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, STI);
else
- emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc,
- Instructions);
+ TOut.emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, STI);
return false;
}
return true;
}
-bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
@@ -3255,13 +3400,13 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc,
}
if (Inst.getOpcode() == Mips::ROL) {
- emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
- emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::ROR) {
- emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
return false;
}
@@ -3287,10 +3432,10 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc,
if (!ATReg)
return true;
- emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
- emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions);
- emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions);
- emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+ TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
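
Both rotate paths compute the same value: with ROTRV available, a rotate-left by n becomes a rotate-right by 32-n (via the SUBu negation), and without it the negate/shift/shift/OR sequence above builds (x << n) | (x >> (32-n)). A quick equivalence check in host C++:

#include <assert.h>
#include <cstdint>

// Without ROTRV:  subu $at, $zero, $rt ; srlv $at, $rs, $at
//                 sllv $rd, $rs, $rt   ; or   $rd, $rd, $at
static uint32_t rolShiftOr(uint32_t X, uint32_t N) {
  N &= 31; // variable shifts read only the low five bits of $rt / $at
  return N == 0 ? X : (X << N) | (X >> (32 - N));
}

// With ROTRV: rotate-left by n is a rotate-right by (32 - n).
static uint32_t rolViaRotr(uint32_t X, uint32_t N) {
  uint32_t R = (32 - (N & 31)) & 31;
  return R == 0 ? X : (X >> R) | (X << (32 - R));
}

int main() {
  for (uint32_t N = 0; N < 64; ++N)
    assert(rolShiftOr(0x80000001u, N) == rolViaRotr(0x80000001u, N));
  return 0;
}
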
@@ -3299,8 +3444,9 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc,
}
bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
-
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
@@ -3316,12 +3462,12 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
uint64_t ShiftValue = ImmValue;
if (ImmValue != 0)
ShiftValue = MaxShift - ImmValue;
- emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions);
+ TOut.emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::RORImm) {
- emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), Instructions);
+ TOut.emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), STI);
return false;
}
@@ -3331,7 +3477,7 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
if (hasMips32()) {
if (ImmValue == 0) {
- emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), Instructions);
+ TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
}
@@ -3352,9 +3498,9 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
if (!ATReg)
return true;
- emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), Instructions);
- emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), Instructions);
- emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+ TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), STI);
+ TOut.emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
@@ -3362,9 +3508,9 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
return true;
}
-bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
-
+bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
@@ -3383,13 +3529,13 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc,
}
if (Inst.getOpcode() == Mips::DROL) {
- emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
- emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::DROR) {
- emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
return false;
}
@@ -3415,10 +3561,10 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc,
if (!ATReg)
return true;
- emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
- emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions);
- emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions);
- emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+ TOut.emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+ TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
@@ -3427,8 +3573,9 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc,
}
bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
-
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
@@ -3462,7 +3609,7 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
if (Inst.getOpcode() == Mips::DROLImm)
ShiftValue = (32 - ImmValue % 32) % 32;
- emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions);
+ TOut.emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
return false;
}
@@ -3470,7 +3617,7 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
if (hasMips64()) {
if (ImmValue == 0) {
- emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), Instructions);
+ TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
}
@@ -3511,9 +3658,10 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
if (!ATReg)
return true;
- emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), Instructions);
- emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), Instructions);
- emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+ TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), STI);
+ TOut.emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32,
+ Inst.getLoc(), STI);
+ TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
@@ -3521,49 +3669,76 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
return true;
}
-void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- if (hasShortDelaySlot)
- emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, Instructions);
- else
- emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, Instructions);
-}
+bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned FirstRegOp = Inst.getOperand(0).getReg();
+ unsigned SecondRegOp = Inst.getOperand(1).getReg();
-void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg,
- unsigned TrgReg, bool Is64Bit,
- SmallVectorImpl<MCInst> &Instructions) {
- emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(),
- Instructions);
-}
-
-void MipsAsmParser::createCpRestoreMemOp(
- bool IsLoad, int StackOffset, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- // If the offset can not fit into 16 bits, we need to expand.
- if (!isInt<16>(StackOffset)) {
- MCInst MemInst;
- MemInst.setOpcode(IsLoad ? Mips::LW : Mips::SW);
- MemInst.addOperand(MCOperand::createReg(Mips::GP));
- MemInst.addOperand(MCOperand::createReg(Mips::SP));
- MemInst.addOperand(MCOperand::createImm(StackOffset));
- expandMemInst(MemInst, IDLoc, Instructions, IsLoad, true /*HasImmOpnd*/);
- return;
- }
+ TOut.emitRI(Mips::BGEZ, SecondRegOp, 8, IDLoc, STI);
+ if (FirstRegOp != SecondRegOp)
+ TOut.emitRRR(Mips::ADDu, FirstRegOp, SecondRegOp, Mips::ZERO, IDLoc, STI);
+ else
+ TOut.emitEmptyDelaySlot(false, IDLoc, STI);
+ TOut.emitRRR(Mips::SUB, FirstRegOp, Mips::ZERO, SecondRegOp, IDLoc, STI);
- emitRRI(IsLoad ? Mips::LW : Mips::SW, Mips::GP, Mips::SP, StackOffset, IDLoc,
- Instructions);
+ return false;
}
unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ switch (Inst.getOpcode()) {
// As described by the Mips32r2 spec, the registers Rd and Rs for
// jalr.hb must be different.
- unsigned Opcode = Inst.getOpcode();
-
- if (Opcode == Mips::JALR_HB &&
- (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()))
- return Match_RequiresDifferentSrcAndDst;
-
- return Match_Success;
+ // It also applies to registers Rt and Rs of the microMIPSR6 jalrc.hb
+ // instruction and to registers Rd and Base of the microMIPS lwp instruction.
+ case Mips::JALR_HB:
+ case Mips::JALRC_HB_MMR6:
+ case Mips::JALRC_MMR6:
+ if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+ return Match_RequiresDifferentSrcAndDst;
+ return Match_Success;
+ case Mips::LWP_MM:
+ case Mips::LWP_MMR6:
+ if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
+ return Match_RequiresDifferentSrcAndDst;
+ return Match_Success;
+ // As described by the MIPSR6 spec, the compact branches that compare registers
+ // must:
+ // a) Not use the zero register.
+ // b) Not use the same register twice.
+ // c) rs < rt for bnec, beqc.
+ // NB: For this case, the encoding will swap the operands as their
+ // ordering doesn't matter. GAS performs this transformation too.
+ // Hence, that constraint does not have to be enforced.
+ //
+ // The compact branches that branch iff the signed addition of two registers
+ // would overflow must have rs >= rt. That can be handled like beqc/bnec with
+ // operand swapping. They do not have restriction of using the zero register.
+ case Mips::BLEZC:
+ case Mips::BGEZC:
+ case Mips::BGTZC:
+ case Mips::BLTZC:
+ case Mips::BEQZC:
+ case Mips::BNEZC:
+ if (Inst.getOperand(0).getReg() == Mips::ZERO)
+ return Match_RequiresNoZeroRegister;
+ return Match_Success;
+ case Mips::BGEC:
+ case Mips::BLTC:
+ case Mips::BGEUC:
+ case Mips::BLTUC:
+ case Mips::BEQC:
+ case Mips::BNEC:
+ if (Inst.getOperand(0).getReg() == Mips::ZERO)
+ return Match_RequiresNoZeroRegister;
+ if (Inst.getOperand(1).getReg() == Mips::ZERO)
+ return Match_RequiresNoZeroRegister;
+ if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+ return Match_RequiresDifferentOperands;
+ return Match_Success;
+ default:
+ return Match_Success;
+ }
}
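
The compact-branch cases in the switch above reduce to a small predicate over the two register operands; a sketch with hypothetical register numbers, 0 standing in for $zero:

#include <assert.h>

enum CompactBranchCheck { Success, RequiresNoZeroRegister, RequiresDifferentOperands };

static CompactBranchCheck checkCompactBranch(unsigned Rs, unsigned Rt) {
  if (Rs == 0 || Rt == 0)
    return RequiresNoZeroRegister;    // (a) must not use $zero
  if (Rs == Rt)
    return RequiresDifferentOperands; // (b) must not repeat a register
  return Success;                     // (c) rs < rt is handled by operand swap
}

int main() {
  assert(checkCompactBranch(0, 5) == RequiresNoZeroRegister);
  assert(checkCompactBranch(4, 4) == RequiresDifferentOperands);
  assert(checkCompactBranch(5, 4) == Success); // ordering fixed up when encoding
  return 0;
}
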
static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands,
@@ -3584,16 +3759,13 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
bool MatchingInlineAsm) {
MCInst Inst;
- SmallVector<MCInst, 8> Instructions;
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
case Match_Success: {
- if (processInstruction(Inst, IDLoc, Instructions))
+ if (processInstruction(Inst, IDLoc, Out, STI))
return true;
- for (unsigned i = 0; i < Instructions.size(); i++)
- Out.EmitInstruction(Instructions[i], getSTI());
return false;
}
case Match_MissingFeature:
@@ -3616,6 +3788,10 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "invalid instruction");
case Match_RequiresDifferentSrcAndDst:
return Error(IDLoc, "source and destination must be different");
+ case Match_RequiresDifferentOperands:
+ return Error(IDLoc, "registers must be different");
+ case Match_RequiresNoZeroRegister:
+ return Error(IDLoc, "invalid operand ($zero) for instruction");
case Match_Immz:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'");
case Match_UImm1_0:
@@ -3633,9 +3809,15 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_UImm4_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 4-bit unsigned immediate");
+ case Match_SImm4_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 4-bit signed immediate");
case Match_UImm5_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 5-bit unsigned immediate");
+ case Match_SImm5_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 5-bit signed immediate");
case Match_UImm5_1:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 1 .. 32");
@@ -3653,21 +3835,81 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_UImm5_Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected both 7-bit unsigned immediate and multiple of 4");
+ case Match_UImmRange2_64:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 2 .. 64");
case Match_UImm6_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 6-bit unsigned immediate");
- case Match_SImm6:
+ case Match_UImm6_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 8-bit unsigned immediate and multiple of 4");
+ case Match_SImm6_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 6-bit signed immediate");
case Match_UImm7_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 7-bit unsigned immediate");
+ case Match_UImm7_N1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range -1 .. 126");
+ case Match_SImm7_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 9-bit signed immediate and multiple of 4");
case Match_UImm8_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 8-bit unsigned immediate");
case Match_UImm10_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 10-bit unsigned immediate");
+ case Match_SImm10_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 10-bit signed immediate");
+ case Match_SImm11_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 11-bit signed immediate");
+ case Match_UImm16:
+ case Match_UImm16_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 16-bit unsigned immediate");
+ case Match_SImm16:
+ case Match_SImm16_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 16-bit signed immediate");
+ case Match_UImm20_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 20-bit unsigned immediate");
+ case Match_UImm26_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 26-bit unsigned immediate");
+ case Match_SImm32:
+ case Match_SImm32_Relaxed:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 32-bit signed immediate");
+ case Match_MemSImm9:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 9-bit signed offset");
+ case Match_MemSImm10:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 10-bit signed offset");
+ case Match_MemSImm10Lsl1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 11-bit signed offset and multiple of 2");
+ case Match_MemSImm10Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 12-bit signed offset and multiple of 4");
+ case Match_MemSImm10Lsl3:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 13-bit signed offset and multiple of 8");
+ case Match_MemSImm11:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 11-bit signed offset");
+ case Match_MemSImm12:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 12-bit signed offset");
+ case Match_MemSImm16:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected memory with 16-bit signed offset");
}
llvm_unreachable("Implement any new match types added!");
@@ -3871,19 +4113,6 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) {
return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
}
-unsigned MipsAsmParser::getGPR(int RegNo) {
- return getReg(isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID,
- RegNo);
-}
-
-int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
- if (RegNum >
- getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs() - 1)
- return -1;
-
- return getReg(RegClass, RegNum);
-}
-
bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
MCAsmParser &Parser = getParser();
DEBUG(dbgs() << "parseOperand\n");
@@ -3960,63 +4189,41 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
StringRef RelocStr) {
- const MCExpr *Res;
- // Check the type of the expression.
- if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Expr)) {
- // It's a constant, evaluate reloc value.
- int16_t Val;
- switch (getVariantKind(RelocStr)) {
- case MCSymbolRefExpr::VK_Mips_ABS_LO:
- // Get the 1st 16-bits.
- Val = MCE->getValue() & 0xffff;
- break;
- case MCSymbolRefExpr::VK_Mips_ABS_HI:
- // Get the 2nd 16-bits. Also add 1 if bit 15 is 1, to compensate for low
- // 16 bits being negative.
- Val = ((MCE->getValue() + 0x8000) >> 16) & 0xffff;
- break;
- case MCSymbolRefExpr::VK_Mips_HIGHER:
- // Get the 3rd 16-bits.
- Val = ((MCE->getValue() + 0x80008000LL) >> 32) & 0xffff;
- break;
- case MCSymbolRefExpr::VK_Mips_HIGHEST:
- // Get the 4th 16-bits.
- Val = ((MCE->getValue() + 0x800080008000LL) >> 48) & 0xffff;
- break;
- default:
- report_fatal_error("unsupported reloc value");
- }
- return MCConstantExpr::create(Val, getContext());
- }
-
- if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(Expr)) {
- // It's a symbol, create a symbolic expression from the symbol.
- const MCSymbol *Symbol = &MSRE->getSymbol();
- MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
- Res = MCSymbolRefExpr::create(Symbol, VK, getContext());
- return Res;
- }
-
- if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
- MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
-
- // Try to create target expression.
- if (MipsMCExpr::isSupportedBinaryExpr(VK, BE))
- return MipsMCExpr::create(VK, Expr, getContext());
-
- const MCExpr *LExp = evaluateRelocExpr(BE->getLHS(), RelocStr);
- const MCExpr *RExp = evaluateRelocExpr(BE->getRHS(), RelocStr);
- Res = MCBinaryExpr::create(BE->getOpcode(), LExp, RExp, getContext());
- return Res;
- }
-
- if (const MCUnaryExpr *UN = dyn_cast<MCUnaryExpr>(Expr)) {
- const MCExpr *UnExp = evaluateRelocExpr(UN->getSubExpr(), RelocStr);
- Res = MCUnaryExpr::create(UN->getOpcode(), UnExp, getContext());
- return Res;
- }
- // Just return the original expression.
- return Expr;
+ if (RelocStr == "hi(%neg(%gp_rel")
+ return MipsMCExpr::createGpOff(MipsMCExpr::MEK_HI, Expr, getContext());
+ else if (RelocStr == "lo(%neg(%gp_rel")
+ return MipsMCExpr::createGpOff(MipsMCExpr::MEK_LO, Expr, getContext());
+
+ MipsMCExpr::MipsExprKind Kind =
+ StringSwitch<MipsMCExpr::MipsExprKind>(RelocStr)
+ .Case("call16", MipsMCExpr::MEK_GOT_CALL)
+ .Case("call_hi", MipsMCExpr::MEK_CALL_HI16)
+ .Case("call_lo", MipsMCExpr::MEK_CALL_LO16)
+ .Case("dtprel_hi", MipsMCExpr::MEK_DTPREL_HI)
+ .Case("dtprel_lo", MipsMCExpr::MEK_DTPREL_LO)
+ .Case("got", MipsMCExpr::MEK_GOT)
+ .Case("got_disp", MipsMCExpr::MEK_GOT_DISP)
+ .Case("got_hi", MipsMCExpr::MEK_GOT_HI16)
+ .Case("got_lo", MipsMCExpr::MEK_GOT_LO16)
+ .Case("got_ofst", MipsMCExpr::MEK_GOT_OFST)
+ .Case("got_page", MipsMCExpr::MEK_GOT_PAGE)
+ .Case("gottprel", MipsMCExpr::MEK_GOTTPREL)
+ .Case("gp_rel", MipsMCExpr::MEK_GPREL)
+ .Case("hi", MipsMCExpr::MEK_HI)
+ .Case("higher", MipsMCExpr::MEK_HIGHER)
+ .Case("highest", MipsMCExpr::MEK_HIGHEST)
+ .Case("lo", MipsMCExpr::MEK_LO)
+ .Case("neg", MipsMCExpr::MEK_NEG)
+ .Case("pcrel_hi", MipsMCExpr::MEK_PCREL_HI16)
+ .Case("pcrel_lo", MipsMCExpr::MEK_PCREL_LO16)
+ .Case("tlsgd", MipsMCExpr::MEK_TLSGD)
+ .Case("tlsldm", MipsMCExpr::MEK_TLSLDM)
+ .Case("tprel_hi", MipsMCExpr::MEK_TPREL_HI)
+ .Case("tprel_lo", MipsMCExpr::MEK_TPREL_LO)
+ .Default(MipsMCExpr::MEK_None);
+
+ assert(Kind != MipsMCExpr::MEK_None);
+ return MipsMCExpr::create(Kind, Expr, getContext());
}
bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
@@ -4248,12 +4455,6 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
llvm_unreachable("Should never ParseFail");
return false;
}
- } else if (Expr->getKind() == MCExpr::Constant) {
- Parser.Lex();
- const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
- Operands.push_back(
- MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this));
- return true;
}
}
return false;
@@ -4439,46 +4640,6 @@ MipsAsmParser::parseInvNum(OperandVector &Operands) {
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseLSAImm(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- switch (getLexer().getKind()) {
- default:
- return MatchOperand_NoMatch;
- case AsmToken::LParen:
- case AsmToken::Plus:
- case AsmToken::Minus:
- case AsmToken::Integer:
- break;
- }
-
- const MCExpr *Expr;
- SMLoc S = Parser.getTok().getLoc();
-
- if (getParser().parseExpression(Expr))
- return MatchOperand_ParseFail;
-
- int64_t Val;
- if (!Expr->evaluateAsAbsolute(Val)) {
- Error(S, "expected immediate value");
- return MatchOperand_ParseFail;
- }
-
- // The LSA instruction allows a 2-bit unsigned immediate. For this reason
- // and because the CPU always adds one to the immediate field, the allowed
- // range becomes 1..4. We'll only check the range here and will deal
- // with the addition/subtraction when actually decoding/encoding
- // the instruction.
- if (Val < 1 || Val > 4) {
- Error(S, "immediate not in range (1..4)");
- return MatchOperand_ParseFail;
- }
-
- Operands.push_back(
- MipsOperand::CreateImm(Expr, S, Parser.getTok().getLoc(), *this));
- return MatchOperand_Success;
-}
-
-MipsAsmParser::OperandMatchResultTy
MipsAsmParser::parseRegisterList(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SmallVector<unsigned, 10> Regs;
@@ -4573,10 +4734,10 @@ MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
SMLoc E = Parser.getTok().getLoc();
- MipsOperand &Op = static_cast<MipsOperand &>(*Operands.back());
- unsigned Reg = Op.getGPR32Reg();
+ MipsOperand Op = static_cast<MipsOperand &>(*Operands.back());
+
Operands.pop_back();
- Operands.push_back(MipsOperand::CreateRegPair(Reg, S, E, *this));
+ Operands.push_back(MipsOperand::CreateRegPair(Op, S, E, *this));
return MatchOperand_Success;
}
@@ -4619,42 +4780,6 @@ MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
return MatchOperand_Success;
}
-MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
-
- MCSymbolRefExpr::VariantKind VK =
- StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
- .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
- .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
- .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
- .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
- .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
- .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
- .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
- .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
- .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
- .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
- .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
- .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
- .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
- .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
- .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
- .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
- .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
- .Case("got_hi", MCSymbolRefExpr::VK_Mips_GOT_HI16)
- .Case("got_lo", MCSymbolRefExpr::VK_Mips_GOT_LO16)
- .Case("call_hi", MCSymbolRefExpr::VK_Mips_CALL_HI16)
- .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16)
- .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER)
- .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST)
- .Case("pcrel_hi", MCSymbolRefExpr::VK_Mips_PCREL_HI16)
- .Case("pcrel_lo", MCSymbolRefExpr::VK_Mips_PCREL_LO16)
- .Default(MCSymbolRefExpr::VK_None);
-
- assert(VK != MCSymbolRefExpr::VK_None);
-
- return VK;
-}
-
/// Sometimes (i.e. load/stores) the operand may be followed immediately by
/// either this.
/// ::= '(', register, ')'
@@ -4767,6 +4892,8 @@ bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return false;
}
+// FIXME: Given that these have the same name, these should both be
+// consistent in how they affect the Parser.
bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
MCAsmParser &Parser = getParser();
SMLoc Loc = getLexer().getLoc();
@@ -5174,7 +5301,7 @@ bool MipsAsmParser::parseSetArchDirective() {
.Case("mips64r3", "mips64r3")
.Case("mips64r5", "mips64r5")
.Case("mips64r6", "mips64r6")
- .Case("cnmips", "cnmips")
+ .Case("octeon", "cnmips")
.Case("r4000", "mips3") // This is an implementation of Mips3.
.Default("");
@@ -5200,6 +5327,7 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
getTargetStreamer().emitDirectiveSetDsp();
break;
case Mips::FeatureMicroMips:
+ setFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetMicroMips();
break;
case Mips::FeatureMips1:
@@ -5356,12 +5484,9 @@ bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
return false;
}
- // Store the $gp on the stack.
- SmallVector<MCInst, 3> StoreInsts;
- createCpRestoreMemOp(false /*IsLoad*/, CpRestoreOffset /*StackOffset*/, Loc,
- StoreInsts);
-
- getTargetStreamer().emitDirectiveCpRestore(StoreInsts, CpRestoreOffset);
+ if (!getTargetStreamer().emitDirectiveCpRestore(
+ CpRestoreOffset, [&]() { return getATReg(Loc); }, Loc, STI))
+ return true;
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
@@ -5376,7 +5501,6 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch) {
reportParseError("expected register containing function address");
- Parser.eatToEndOfStatement();
return false;
}
@@ -5502,6 +5626,7 @@ bool MipsAsmParser::parseDirectiveSet() {
} else if (Tok.getString() == "nomips16") {
return parseSetNoMips16Directive();
} else if (Tok.getString() == "nomicromips") {
+ clearFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetNoMicroMips();
Parser.eatToEndOfStatement();
return false;
@@ -5686,6 +5811,24 @@ bool MipsAsmParser::parseInsnDirective() {
return false;
}
+/// parseSSectionDirective
+/// ::= .sbss
+/// ::= .sdata
+bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ MCSection *ELFSection = getContext().getELFSection(
+ Section, Type, ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
+ getParser().getStreamer().SwitchSection(ELFSection);
+
+ getParser().Lex(); // Eat EndOfStatement token.
+ return false;
+}
+
/// parseDirectiveModule
/// ::= .module oddspreg
/// ::= .module nooddspreg
@@ -5905,13 +6048,22 @@ bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
}
bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+ // This returns false if this function recognizes the directive
+ // regardless of whether it successfully handles it or reports an
+ // error. Otherwise it returns true to give the generic parser a
+ // chance at recognizing it.
+
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getString();
- if (IDVal == ".cpload")
- return parseDirectiveCpLoad(DirectiveID.getLoc());
- if (IDVal == ".cprestore")
- return parseDirectiveCpRestore(DirectiveID.getLoc());
+ if (IDVal == ".cpload") {
+ parseDirectiveCpLoad(DirectiveID.getLoc());
+ return false;
+ }
+ if (IDVal == ".cprestore") {
+ parseDirectiveCpRestore(DirectiveID.getLoc());
+ return false;
+ }
if (IDVal == ".dword") {
parseDataDirective(8, DirectiveID.getLoc());
return false;
@@ -6068,7 +6220,8 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
}
if (IDVal == ".set") {
- return parseDirectiveSet();
+ parseDirectiveSet();
+ return false;
}
if (IDVal == ".mask" || IDVal == ".fmask") {
@@ -6147,8 +6300,15 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
}
- if (IDVal == ".option")
- return parseDirectiveOption();
+ if (IDVal == ".hword") {
+ parseDataDirective(2, DirectiveID.getLoc());
+ return false;
+ }
+
+ if (IDVal == ".option") {
+ parseDirectiveOption();
+ return false;
+ }
if (IDVal == ".abicalls") {
getTargetStreamer().emitDirectiveAbiCalls();
@@ -6161,20 +6321,34 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
}
- if (IDVal == ".cpsetup")
- return parseDirectiveCPSetup();
-
- if (IDVal == ".cpreturn")
- return parseDirectiveCPReturn();
-
- if (IDVal == ".module")
- return parseDirectiveModule();
-
- if (IDVal == ".llvm_internal_mips_reallow_module_directive")
- return parseInternalDirectiveReallowModule();
-
- if (IDVal == ".insn")
- return parseInsnDirective();
+ if (IDVal == ".cpsetup") {
+ parseDirectiveCPSetup();
+ return false;
+ }
+ if (IDVal == ".cpreturn") {
+ parseDirectiveCPReturn();
+ return false;
+ }
+ if (IDVal == ".module") {
+ parseDirectiveModule();
+ return false;
+ }
+ if (IDVal == ".llvm_internal_mips_reallow_module_directive") {
+ parseInternalDirectiveReallowModule();
+ return false;
+ }
+ if (IDVal == ".insn") {
+ parseInsnDirective();
+ return false;
+ }
+ if (IDVal == ".sbss") {
+ parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
+ return false;
+ }
+ if (IDVal == ".sdata") {
+ parseSSectionDirective(IDVal, ELF::SHT_PROGBITS);
+ return false;
+ }
return true;
}
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index bde843afd3d2..3650cc9fe072 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_target(MipsCodeGen
MipsConstantIslandPass.cpp
MipsDelaySlotFiller.cpp
MipsFastISel.cpp
+ MipsHazardSchedule.cpp
MipsInstrInfo.cpp
MipsISelDAGToDAG.cpp
MipsISelLowering.cpp
diff --git a/lib/Target/Mips/Disassembler/Makefile b/lib/Target/Mips/Disassembler/Makefile
deleted file mode 100644
index 7900373dd2b2..000000000000
--- a/lib/Target/Mips/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Mips/Disassembler/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMipsDisassembler
-
-# Hack: we need to include 'main' Mips target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 3c1a771f97e9..aebb4ef419d1 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -15,7 +15,7 @@
#include "MipsRegisterInfo.h"
#include "MipsSubtarget.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -39,14 +39,18 @@ public:
IsMicroMips(STI.getFeatureBits()[Mips::FeatureMicroMips]),
IsBigEndian(IsBigEndian) {}
+ bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
bool hasMips32() const { return STI.getFeatureBits()[Mips::FeatureMips32]; }
bool hasMips32r6() const {
return STI.getFeatureBits()[Mips::FeatureMips32r6];
}
+ bool isFP64() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
+ bool isPTR64() const { return STI.getFeatureBits()[Mips::FeaturePTR64Bit]; }
+
bool hasCnMips() const { return STI.getFeatureBits()[Mips::FeatureCnMips]; }
bool hasCOP3() const {
@@ -193,6 +197,11 @@ static DecodeStatus DecodeBranchTarget(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeJumpTarget(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -203,6 +212,11 @@ static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
unsigned Offset,
uint64_t Address,
@@ -340,6 +354,10 @@ static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -352,6 +370,10 @@ static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -362,12 +384,7 @@ static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeUImm6Lsl2(MCInst &Inst,
- unsigned Value,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeLiSimm7(MCInst &Inst,
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
unsigned Value,
uint64_t Address,
const void *Decoder);
@@ -377,19 +394,23 @@ static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeSimm4(MCInst &Inst,
- unsigned Value,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeSimm16(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
template <unsigned Bits, int Offset>
static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
+ Decoder);
+}
+
+template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeInsSize(MCInst &Inst,
unsigned Insn,
@@ -408,9 +429,6 @@ static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeUImm5lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
-
static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
@@ -427,11 +445,21 @@ DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
template <typename InsnType>
static DecodeStatus
+DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
template <typename InsnType>
static DecodeStatus
+DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
@@ -450,6 +478,16 @@ static DecodeStatus
DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -563,7 +601,7 @@ static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
bool HasRs = false;
if (Rs >= Rt) {
@@ -587,6 +625,37 @@ static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
}
template <typename InsnType>
+static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn,
+ uint64_t Address,
+ const void *Decoder) {
+ InsnType Rt = fieldFromInstruction(insn, 21, 5);
+ InsnType Rs = fieldFromInstruction(insn, 16, 5);
+ InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2;
+
+ if (Rs >= Rt) {
+ MI.setOpcode(Mips::BOVC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ } else if (Rs != 0 && Rs < Rt) {
+ MI.setOpcode(Mips::BEQC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ } else {
+ MI.setOpcode(Mips::BEQZALC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ }
+
+ MI.addOperand(MCOperand::createImm(Imm));
+
+ return MCDisassembler::Success;
+}
+
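
A standalone sketch of the same field-based disambiguation used by DecodePOP35GroupBranchMMR6 above, with the register fields already extracted (Rt = bits 21-25, Rs = bits 16-20); the returned string only names the opcode the decoder would pick, and the immediate is scaled by 2 because microMIPS branch offsets are in halfwords:

#include <cstdint>
#include <string>

static std::string selectPOP35(uint32_t Rs, uint32_t Rt) {
  if (Rs >= Rt)
    return "BOVC_MMR6";    // includes the Rs == Rt == 0 case
  if (Rs != 0)             // here Rs < Rt
    return "BEQC_MMR6";
  return "BEQZALC_MMR6";   // Rs == 0, Rt != 0
}

// Sign-extend the 16-bit field and scale by 2, e.g. 0xFFFE becomes -4.
static int64_t pop35Offset(uint32_t RawImm16) {
  return int64_t(int16_t(RawImm16)) * 2;
}
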
+template <typename InsnType>
static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
uint64_t Address,
const void *Decoder) {
@@ -602,7 +671,7 @@ static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
bool HasRs = false;
if (Rs >= Rt) {
@@ -626,6 +695,37 @@ static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
}
template <typename InsnType>
+static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn,
+ uint64_t Address,
+ const void *Decoder) {
+ InsnType Rt = fieldFromInstruction(insn, 21, 5);
+ InsnType Rs = fieldFromInstruction(insn, 16, 5);
+ InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2;
+
+ if (Rs >= Rt) {
+ MI.setOpcode(Mips::BNVC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ } else if (Rs != 0 && Rs < Rt) {
+ MI.setOpcode(Mips::BNEC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ } else {
+ MI.setOpcode(Mips::BNEZALC_MMR6);
+ MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+ }
+
+ MI.addOperand(MCOperand::createImm(Imm));
+
+ return MCDisassembler::Success;
+}
+
+template <typename InsnType>
static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
uint64_t Address,
const void *Decoder) {
@@ -642,7 +742,7 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
bool HasRs = false;
if (Rt == 0)
@@ -687,7 +787,7 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
if (Rt == 0)
return MCDisassembler::Fail;
@@ -729,7 +829,7 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
bool HasRs = false;
bool HasRt = false;
@@ -778,7 +878,7 @@ static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
- InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
+ int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
bool HasRs = false;
if (Rt == 0)
@@ -917,6 +1017,17 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Size = 4;
return Result;
}
+
+ if (hasMips32r6() && isFP64()) {
+ DEBUG(dbgs() << "Trying MicroMips32r6FP64 table (32-bit opcodes):\n");
+ Result = decodeInstruction(DecoderTableMicroMips32r6FP6432, Instr, Insn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
// This is an invalid instruction. Let the disassembler move forward by the
// minimum instruction size.
Size = 2;
@@ -949,6 +1060,16 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
+ if (hasMips32r6() && isPTR64()) {
+ DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+ Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
if (hasMips32r6()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
@@ -959,6 +1080,16 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
+ if (hasMips2() && isPTR64()) {
+ DEBUG(dbgs() << "Trying Mips32_64 (PTR64) table (32-bit opcodes):\n");
+ Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
if (hasCnMips()) {
DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
@@ -1534,7 +1665,8 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
// fallthrough
default:
Inst.addOperand(MCOperand::createReg(Reg));
- if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM)
+ if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM ||
+ Inst.getOpcode() == Mips::LWP_MMR6 || Inst.getOpcode() == Mips::SWP_MMR6)
Inst.addOperand(MCOperand::createReg(Reg+1));
Inst.addOperand(MCOperand::createReg(Base));
@@ -1580,6 +1712,24 @@ static DecodeStatus DecodeFMem(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ // This function is the same as DecodeFMem but with the Reg and Base fields
+ // swapped according to the microMIPS spec.
+ int Offset = SignExtend32<16>(Insn & 0xffff);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+ Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeFMem2(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1633,6 +1783,23 @@ static DecodeStatus DecodeFMemCop2R6(MCInst &Inst,
return MCDisassembler::Success;
}
+
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ int Offset = SignExtend32<11>(Insn & 0x07ff);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1808,6 +1975,15 @@ static DecodeStatus DecodeBranchTarget(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder) {
+ int32_t BranchOffset = (SignExtend32<16>(Offset) * 2);
+ Inst.addOperand(MCOperand::createImm(BranchOffset));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeJumpTarget(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1822,7 +1998,17 @@ static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
unsigned Offset,
uint64_t Address,
const void *Decoder) {
- int32_t BranchOffset = SignExtend32<21>(Offset) * 4;
+ int32_t BranchOffset = SignExtend32<21>(Offset) * 4 + 4;
+
+ Inst.addOperand(MCOperand::createImm(BranchOffset));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder) {
+ int32_t BranchOffset = SignExtend32<21>(Offset) << 1;
Inst.addOperand(MCOperand::createImm(BranchOffset));
return MCDisassembler::Success;
@@ -1832,7 +2018,7 @@ static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
unsigned Offset,
uint64_t Address,
const void *Decoder) {
- int32_t BranchOffset = SignExtend32<26>(Offset) * 4;
+ int32_t BranchOffset = SignExtend32<26>(Offset) * 4 + 4;
Inst.addOperand(MCOperand::createImm(BranchOffset));
return MCDisassembler::Success;
@@ -1897,15 +2083,7 @@ static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeUImm6Lsl2(MCInst &Inst,
- unsigned Value,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::createImm(Value << 2));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeLiSimm7(MCInst &Inst,
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
unsigned Value,
uint64_t Address,
const void *Decoder) {
@@ -1924,28 +2102,22 @@ static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeSimm4(MCInst &Inst,
- unsigned Value,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::createImm(SignExtend32<4>(Value)));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeSimm16(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Insn)));
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder) {
+ Value &= ((1 << Bits) - 1);
+ Value *= Scale;
+ Inst.addOperand(MCOperand::createImm(Value + Offset));
return MCDisassembler::Success;
}
-template <unsigned Bits, int Offset>
-static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const void *Decoder) {
- Value &= ((1 << Bits) - 1);
- Inst.addOperand(MCOperand::createImm(Value + Offset));
+template <unsigned Bits, int Offset, int ScaleBy>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder) {
+ int32_t Imm = SignExtend32<Bits>(Value) * ScaleBy;
+ Inst.addOperand(MCOperand::createImm(Imm + Offset));
return MCDisassembler::Success;
}
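
The two templates above replace the assortment of fixed-width decoders (DecodeSimm4, DecodeSimm16, DecodeUImm6Lsl2, DecodeUImm5lsl2, ...) with a single mask-or-sign-extend, scale, offset pipeline. A standalone sketch of the same arithmetic, with plain C++ shifts standing in for LLVM's SignExtend32 (assumes the usual two's-complement conversion):

#include <cstdint>

// Unsigned form: mask to Bits bits, scale, then add the offset.
// decodeUImm<6, 0, 4>(3) == 12, matching the removed "value << 2" decoders.
template <unsigned Bits, int Offset, int Scale>
int64_t decodeUImm(uint32_t Value) {
  Value &= (uint32_t(1) << Bits) - 1;
  return int64_t(Value) * Scale + Offset;
}

// Signed form: sign-extend from Bits bits first, then scale and add.
// decodeSImm<16, 0, 2>(0xFFFF) == -2.
template <unsigned Bits, int Offset, int ScaleBy>
int64_t decodeSImm(uint32_t Value) {
  int32_t Extended = int32_t(Value << (32 - Bits)) >> (32 - Bits);
  return int64_t(Extended) * ScaleBy + Offset;
}
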
@@ -1996,12 +2168,6 @@ static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeUImm5lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
- Inst.addOperand(MCOperand::createImm(Insn << 2));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeRegListOperand(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -2105,3 +2271,87 @@ static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
Inst.addOperand(MCOperand::createImm(SignExtend32<25>(Insn << 2)));
return MCDisassembler::Success;
}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn,
+ uint64_t Address,
+ const void *Decoder) {
+ // We have:
+ // 0b000111 ttttt sssss iiiiiiiiiiiiiiii
+ // Invalid if rt == 0
+ // BGTZALC_MMR6 if rs == 0 && rt != 0
+ // BLTZALC_MMR6 if rs != 0 && rs == rt
+ // BLTUC_MMR6 if rs != 0 && rs != rt
+
+ InsnType Rt = fieldFromInstruction(insn, 21, 5);
+ InsnType Rs = fieldFromInstruction(insn, 16, 5);
+ InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2;
+ bool HasRs = false;
+ bool HasRt = false;
+
+ if (Rt == 0)
+ return MCDisassembler::Fail;
+ else if (Rs == 0) {
+ MI.setOpcode(Mips::BGTZALC_MMR6);
+ HasRt = true;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC_MMR6);
+    HasRs = true;
+  } else {
+    MI.setOpcode(Mips::BLTUC_MMR6);
+    HasRs = true;
+    HasRt = true;
+  }
+
+ if (HasRs)
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+
+ if (HasRt)
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+
+ MI.addOperand(MCOperand::createImm(Imm));
+
+ return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn,
+ uint64_t Address,
+ const void *Decoder) {
+ // We have:
+ // 0b000110 ttttt sssss iiiiiiiiiiiiiiii
+ // Invalid if rt == 0
+ // BLEZALC_MMR6 if rs == 0 && rt != 0
+ // BGEZALC_MMR6 if rs == rt && rt != 0
+ // BGEUC_MMR6 if rs != rt && rs != 0 && rt != 0
+
+ InsnType Rt = fieldFromInstruction(insn, 21, 5);
+ InsnType Rs = fieldFromInstruction(insn, 16, 5);
+ InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2;
+ bool HasRs = false;
+
+ if (Rt == 0)
+ return MCDisassembler::Fail;
+ else if (Rs == 0)
+ MI.setOpcode(Mips::BLEZALC_MMR6);
+ else if (Rs == Rt)
+ MI.setOpcode(Mips::BGEZALC_MMR6);
+ else {
+ HasRs = true;
+ MI.setOpcode(Mips::BGEUC_MMR6);
+ }
+
+ if (HasRs)
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+
+ MI.addOperand(MCOperand::createImm(Imm));
+
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Mips/InstPrinter/Makefile b/lib/Target/Mips/InstPrinter/Makefile
deleted file mode 100644
index f07f3ed381ee..000000000000
--- a/lib/Target/Mips/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Mips/AsmPrinter/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMipsAsmPrinter
-
-# Hack: we need to include 'main' mips target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index a7b7d2e080ee..0fd593fcfbe1 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -122,70 +122,6 @@ void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
}
-static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI,
- raw_ostream &OS) {
- int Offset = 0;
- const MCSymbolRefExpr *SRE;
-
- if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
- SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
- assert(SRE && CE && "Binary expression must be sym+const.");
- Offset = CE->getValue();
- } else if (const MipsMCExpr *ME = dyn_cast<MipsMCExpr>(Expr)) {
- ME->print(OS, MAI);
- return;
- } else
- SRE = cast<MCSymbolRefExpr>(Expr);
-
- MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
-
- switch (Kind) {
- default: llvm_unreachable("Invalid kind!");
- case MCSymbolRefExpr::VK_None: break;
- case MCSymbolRefExpr::VK_Mips_GPREL: OS << "%gp_rel("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_CALL: OS << "%call16("; break;
- case MCSymbolRefExpr::VK_Mips_GOT16: OS << "%got("; break;
- case MCSymbolRefExpr::VK_Mips_GOT: OS << "%got("; break;
- case MCSymbolRefExpr::VK_Mips_ABS_HI: OS << "%hi("; break;
- case MCSymbolRefExpr::VK_Mips_ABS_LO: OS << "%lo("; break;
- case MCSymbolRefExpr::VK_Mips_TLSGD: OS << "%tlsgd("; break;
- case MCSymbolRefExpr::VK_Mips_TLSLDM: OS << "%tlsldm("; break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_HI: OS << "%dtprel_hi("; break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_LO: OS << "%dtprel_lo("; break;
- case MCSymbolRefExpr::VK_Mips_GOTTPREL: OS << "%gottprel("; break;
- case MCSymbolRefExpr::VK_Mips_TPREL_HI: OS << "%tprel_hi("; break;
- case MCSymbolRefExpr::VK_Mips_TPREL_LO: OS << "%tprel_lo("; break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_HI: OS << "%hi(%neg(%gp_rel("; break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_LO: OS << "%lo(%neg(%gp_rel("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_DISP: OS << "%got_disp("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_PAGE: OS << "%got_page("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_OFST: OS << "%got_ofst("; break;
- case MCSymbolRefExpr::VK_Mips_HIGHER: OS << "%higher("; break;
- case MCSymbolRefExpr::VK_Mips_HIGHEST: OS << "%highest("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_HI16: OS << "%got_hi("; break;
- case MCSymbolRefExpr::VK_Mips_GOT_LO16: OS << "%got_lo("; break;
- case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break;
- case MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break;
- case MCSymbolRefExpr::VK_Mips_PCREL_HI16: OS << "%pcrel_hi("; break;
- case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break;
- }
-
- SRE->getSymbol().print(OS, MAI);
-
- if (Offset) {
- if (Offset > 0)
- OS << '+';
- OS << Offset;
- }
-
- if ((Kind == MCSymbolRefExpr::VK_Mips_GPOFF_HI) ||
- (Kind == MCSymbolRefExpr::VK_Mips_GPOFF_LO))
- OS << ")))";
- else if (Kind != MCSymbolRefExpr::VK_None)
- OS << ')';
-}
-
void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -195,30 +131,27 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
if (Op.isImm()) {
- O << Op.getImm();
+ O << formatImm(Op.getImm());
return;
}
assert(Op.isExpr() && "unknown operand kind in printOperand");
- printExpr(Op.getExpr(), &MAI, O);
+ Op.getExpr()->print(O, &MAI, true);
}
-void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
- raw_ostream &O) {
+template <unsigned Bits, unsigned Offset>
+void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) {
const MCOperand &MO = MI->getOperand(opNum);
- if (MO.isImm())
- O << (unsigned short int)MO.getImm();
- else
- printOperand(MI, opNum, O);
-}
+ if (MO.isImm()) {
+ uint64_t Imm = MO.getImm();
+ Imm -= Offset;
+ Imm &= (1 << Bits) - 1;
+ Imm += Offset;
+ O << formatImm(Imm);
+ return;
+ }
-void MipsInstPrinter::printUnsignedImm8(const MCInst *MI, int opNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(opNum);
- if (MO.isImm())
- O << (unsigned short int)(unsigned char)MO.getImm();
- else
- printOperand(MI, opNum, O);
+ printOperand(MI, opNum, O);
}
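
printUImm<Bits, Offset> above generalizes the removed printUnsignedImm/printUnsignedImm8: the subtract, mask, add sequence wraps the operand into [Offset, Offset + 2^Bits - 1], and with Offset = 0 it reproduces the old unsigned-short truncation. A standalone sketch of just that arithmetic:

#include <cstdint>

// Same arithmetic as printUImm<Bits, Offset>: map the raw operand into
// [Offset, Offset + 2^Bits - 1] before printing.
template <unsigned Bits, unsigned Offset = 0>
uint64_t wrapUImm(uint64_t Imm) {
  Imm -= Offset;
  Imm &= (uint64_t(1) << Bits) - 1;
  return Imm + Offset;
}

// wrapUImm<16>(uint64_t(-32768)) == 32768, which is what the removed
// printUnsignedImm produced by casting to unsigned short.
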
void MipsInstPrinter::
@@ -325,6 +258,7 @@ bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
case Mips::NOR:
case Mips::NOR_MM:
+ case Mips::NOR_MMR6:
// nor $r0, $r1, $zero => not $r0, $r1
return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
case Mips::NOR64:
@@ -343,7 +277,7 @@ void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) {
if (MI->getOperand(i).isReg())
printRegName(O, MI->getOperand(i).getReg());
else
- printUnsignedImm(MI, i, O);
+ printUImm<16>(MI, i, O);
}
}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 0e61ea61899a..4a76b5acac79 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -93,8 +93,8 @@ public:
private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
- void printUnsignedImm8(const MCInst *MI, int opNum, raw_ostream &O);
+ template <unsigned Bits, unsigned Offset = 0>
+ void printUImm(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
diff --git a/lib/Target/Mips/MCTargetDesc/Makefile b/lib/Target/Mips/MCTargetDesc/Makefile
deleted file mode 100644
index 22a27218f28d..000000000000
--- a/lib/Target/Mips/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- lib/Target/Mips/TargetDesc/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMipsDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 70b9cca8cf6e..932d38a0b9fe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -53,17 +53,17 @@ uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
namespace llvm {
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
// Write out a Elf_Internal_ABIFlags_v0 struct
- OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
- OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level
- OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev
- OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size
- OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size
- OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size
- OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi
- OS.EmitIntValue(ABIFlagsSection.getISAExtensionSetValue(), 4); // isa_ext
- OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases
- OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1
- OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
+ OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
+ OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level
+ OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev
+ OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size
+ OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size
+ OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size
+ OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi
+ OS.EmitIntValue(ABIFlagsSection.getISAExtensionValue(), 4); // isa_ext
+ OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases
+ OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1
+ OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
return OS;
}
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index b078cd30a87b..3966cae9fe33 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -35,7 +35,7 @@ struct MipsABIFlagsSection {
// The size of co-processor 2 registers.
Mips::AFL_REG CPR2Size;
// Processor-specific extension.
- uint32_t ISAExtensionSet;
+ Mips::AFL_EXT ISAExtension;
// Mask of ASEs used.
uint32_t ASESet;
@@ -51,8 +51,8 @@ public:
MipsABIFlagsSection()
: Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
- ISAExtensionSet(0), ASESet(0), OddSPReg(false), Is32BitABI(false),
- FpABI(FpABIKind::ANY) {}
+ ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
+ Is32BitABI(false), FpABI(FpABIKind::ANY) {}
uint16_t getVersionValue() { return (uint16_t)Version; }
uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
@@ -61,7 +61,7 @@ public:
uint8_t getCPR1SizeValue();
uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
uint8_t getFpABIValue();
- uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; }
+ uint32_t getISAExtensionValue() { return (uint32_t)ISAExtension; }
uint32_t getASESetValue() { return (uint32_t)ASESet; }
uint32_t getFlags1Value() {
@@ -141,6 +141,14 @@ public:
}
template <class PredicateLibrary>
+ void setISAExtensionFromPredicates(const PredicateLibrary &P) {
+ if (P.hasCnMips())
+ ISAExtension = Mips::AFL_EXT_OCTEON;
+ else
+ ISAExtension = Mips::AFL_EXT_NONE;
+ }
+
+ template <class PredicateLibrary>
void setASESetFromPredicates(const PredicateLibrary &P) {
ASESet = 0;
if (P.hasDSP())
@@ -179,6 +187,7 @@ public:
setISALevelAndRevisionFromPredicates(P);
setGPRSizeFromPredicates(P);
setCPR1SizeFromPredicates(P);
+ setISAExtensionFromPredicates(P);
setASESetFromPredicates(P);
setFpAbiFromPredicates(P);
OddSPReg = P.useOddSPReg();
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index cdcc3923b81e..3cf632e789de 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -42,7 +42,7 @@ ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
if (IsO32())
return CC != CallingConv::Fast ? 16 : 0;
- if (IsN32() || IsN64() || IsEABI())
+ if (IsN32() || IsN64())
return 0;
llvm_unreachable("Unhandled ABI");
}
@@ -55,39 +55,12 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
return MipsABIInfo::N32();
else if (Options.getABIName().startswith("n64"))
return MipsABIInfo::N64();
- else if (Options.getABIName().startswith("eabi"))
- return MipsABIInfo::EABI();
else if (!Options.getABIName().empty())
llvm_unreachable("Unknown ABI option for MIPS");
- // FIXME: This shares code with the selectMipsCPU routine that's
- // used and not shared in a couple of other places. This needs unifying
- // at some level.
- if (CPU.empty() || CPU == "generic") {
- if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
- CPU = "mips32";
- else
- CPU = "mips64";
- }
-
- return StringSwitch<MipsABIInfo>(CPU)
- .Case("mips1", MipsABIInfo::O32())
- .Case("mips2", MipsABIInfo::O32())
- .Case("mips32", MipsABIInfo::O32())
- .Case("mips32r2", MipsABIInfo::O32())
- .Case("mips32r3", MipsABIInfo::O32())
- .Case("mips32r5", MipsABIInfo::O32())
- .Case("mips32r6", MipsABIInfo::O32())
- .Case("mips3", MipsABIInfo::N64())
- .Case("mips4", MipsABIInfo::N64())
- .Case("mips5", MipsABIInfo::N64())
- .Case("mips64", MipsABIInfo::N64())
- .Case("mips64r2", MipsABIInfo::N64())
- .Case("mips64r3", MipsABIInfo::N64())
- .Case("mips64r5", MipsABIInfo::N64())
- .Case("mips64r6", MipsABIInfo::N64())
- .Case("octeon", MipsABIInfo::N64())
- .Default(MipsABIInfo::Unknown());
+ if (TT.getArch() == Triple::mips64 || TT.getArch() == Triple::mips64el)
+ return MipsABIInfo::N64();
+ return MipsABIInfo::O32();
}
unsigned MipsABIInfo::GetStackPtr() const {
@@ -102,6 +75,10 @@ unsigned MipsABIInfo::GetBasePtr() const {
return ArePtrs64bit() ? Mips::S7_64 : Mips::S7;
}
+unsigned MipsABIInfo::GetGlobalPtr() const {
+ return ArePtrs64bit() ? Mips::GP_64 : Mips::GP;
+}
+
unsigned MipsABIInfo::GetNullPtr() const {
return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO;
}
@@ -118,6 +95,14 @@ unsigned MipsABIInfo::GetPtrAddiuOp() const {
return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu;
}
+unsigned MipsABIInfo::GetPtrSubuOp() const {
+ return ArePtrs64bit() ? Mips::DSUBu : Mips::SUBu;
+}
+
+unsigned MipsABIInfo::GetPtrAndOp() const {
+ return ArePtrs64bit() ? Mips::AND64 : Mips::AND;
+}
+
unsigned MipsABIInfo::GetGPRMoveOp() const {
return ArePtrs64bit() ? Mips::OR64 : Mips::OR;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index ffa2c765e79b..9372a3c2bb1f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -10,20 +10,20 @@
#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
+template <typename T> class ArrayRef;
class MCTargetOptions;
class StringRef;
class TargetRegisterClass;
class MipsABIInfo {
public:
- enum class ABI { Unknown, O32, N32, N64, EABI };
+ enum class ABI { Unknown, O32, N32, N64 };
protected:
ABI ThisABI;
@@ -35,7 +35,6 @@ public:
static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
- static MipsABIInfo EABI() { return MipsABIInfo(ABI::EABI); }
static MipsABIInfo computeTargetABI(const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
@@ -43,7 +42,6 @@ public:
bool IsO32() const { return ThisABI == ABI::O32; }
bool IsN32() const { return ThisABI == ABI::N32; }
bool IsN64() const { return ThisABI == ABI::N64; }
- bool IsEABI() const { return ThisABI == ABI::EABI; }
ABI GetEnumValue() const { return ThisABI; }
/// The registers to use for byval arguments.
@@ -66,10 +64,13 @@ public:
unsigned GetStackPtr() const;
unsigned GetFramePtr() const;
unsigned GetBasePtr() const;
+ unsigned GetGlobalPtr() const;
unsigned GetNullPtr() const;
unsigned GetZeroReg() const;
unsigned GetPtrAdduOp() const;
unsigned GetPtrAddiuOp() const;
+ unsigned GetPtrSubuOp() const;
+ unsigned GetPtrAndOp() const;
unsigned GetGPRMoveOp() const;
inline bool ArePtrs64bit() const { return IsN64(); }
inline bool AreGprs64bit() const { return IsN32() || IsN64(); }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index e4865e2455ee..8292d6b4c55a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsAsmBackend.h"
+#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
@@ -23,7 +24,9 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -40,9 +43,6 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
default:
return 0;
case FK_Data_2:
- case FK_GPRel_4:
- case FK_Data_4:
- case FK_Data_8:
case Mips::fixup_Mips_LO16:
case Mips::fixup_Mips_GPREL16:
case Mips::fixup_Mips_GPOFF_HI:
@@ -57,6 +57,11 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case Mips::fixup_MICROMIPS_GOT_OFST:
case Mips::fixup_MICROMIPS_GOT_DISP:
case Mips::fixup_MIPS_PCLO16:
+ Value &= 0xffff;
+ break;
+ case FK_GPRel_4:
+ case FK_Data_4:
+ case FK_Data_8:
break;
case Mips::fixup_Mips_PC16:
// The displacement is then divided by 4 to give us an 18 bit
@@ -69,6 +74,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
break;
case Mips::fixup_MIPS_PC19_S2:
+ case Mips::fixup_MICROMIPS_PC19_S2:
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 19-bit signed immediate.
@@ -84,7 +90,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value >>= 2;
break;
case Mips::fixup_Mips_HI16:
- case Mips::fixup_Mips_GOT_Local:
+ case Mips::fixup_Mips_GOT:
+ case Mips::fixup_MICROMIPS_GOT16:
case Mips::fixup_Mips_GOT_HI16:
case Mips::fixup_Mips_CALL_HI16:
case Mips::fixup_MICROMIPS_HI16:
@@ -142,6 +149,19 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return 0;
}
break;
+ case Mips::fixup_MICROMIPS_PC18_S3:
+ // Check alignment.
+ if ((Value & 7) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ }
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 8;
+ // We now check if Value can be encoded as a 18-bit signed immediate.
+ if (!isInt<18>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ return 0;
+ }
+ break;
case Mips::fixup_MIPS_PC21_S2:
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
@@ -160,6 +180,24 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return 0;
}
break;
+ case Mips::fixup_MICROMIPS_PC26_S1:
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 2;
+ // We now check if Value can be encoded as a 26-bit signed immediate.
+ if (!isInt<26>(Value) && Ctx) {
+ Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+ return 0;
+ }
+ break;
+ case Mips::fixup_MICROMIPS_PC21_S1:
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 2;
+ // We now check if Value can be encoded as a 21-bit signed immediate.
+ if (!isInt<21>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ return 0;
+ }
+ break;
}
return Value;
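
The microMIPSR6 PC-relative fixups added above all follow the same shape: divide the already PC-relative value by the instruction alignment implied by the _S1/_S2/_S3 suffix, then reject anything that does not fit the signed field (PC18_S3 additionally checks 8-byte alignment before scaling). A standalone sketch of that range check, where isIntN mirrors LLVM's isInt<N>:

#include <cstdint>

// Returns true when V fits in an N-bit signed immediate (1 <= N <= 63).
static bool isIntN(unsigned N, int64_t V) {
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}

// E.g. fixup_MICROMIPS_PC21_S1: scale by 2, then require a signed 21-bit field.
static bool checkPCRelFixup(int64_t Value, unsigned Bits, unsigned Scale) {
  Value /= Scale;  // forced signed division, as in adjustFixupValue
  return isIntN(Bits, Value);
}
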
@@ -248,16 +286,11 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
}
-bool MipsAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const {
- if (Name == "R_MIPS_NONE") {
- MappedKind = (MCFixupKind)Mips::fixup_Mips_NONE;
- return true;
- }
- if (Name == "R_MIPS_32") {
- MappedKind = FK_Data_4;
- return true;
- }
- return MCAsmBackend::getFixupKind(Name, MappedKind);
+Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
+ return StringSwitch<Optional<MCFixupKind>>(Name)
+ .Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
+ .Case("R_MIPS_32", FK_Data_4)
+ .Default(MCAsmBackend::getFixupKind(Name));
}
const MCFixupKindInfo &MipsAsmBackend::
@@ -276,8 +309,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_LO16", 0, 16, 0 },
{ "fixup_Mips_GPREL16", 0, 16, 0 },
{ "fixup_Mips_LITERAL", 0, 16, 0 },
- { "fixup_Mips_GOT_Global", 0, 16, 0 },
- { "fixup_Mips_GOT_Local", 0, 16, 0 },
+ { "fixup_Mips_GOT", 0, 16, 0 },
{ "fixup_Mips_PC16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_CALL16", 0, 16, 0 },
{ "fixup_Mips_GPREL32", 0, 32, 0 },
@@ -316,6 +348,10 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_MICROMIPS_PC7_S1", 0, 7, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_PC10_S1", 0, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_PC16_S1", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC26_S1", 0, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC19_S2", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC18_S3", 0, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC21_S1", 0, 21, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_CALL16", 0, 16, 0 },
{ "fixup_MICROMIPS_GOT_DISP", 0, 16, 0 },
{ "fixup_MICROMIPS_GOT_PAGE", 0, 16, 0 },
@@ -342,8 +378,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_LO16", 16, 16, 0 },
{ "fixup_Mips_GPREL16", 16, 16, 0 },
{ "fixup_Mips_LITERAL", 16, 16, 0 },
- { "fixup_Mips_GOT_Global", 16, 16, 0 },
- { "fixup_Mips_GOT_Local", 16, 16, 0 },
+ { "fixup_Mips_GOT", 16, 16, 0 },
{ "fixup_Mips_PC16", 16, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_Mips_CALL16", 16, 16, 0 },
{ "fixup_Mips_GPREL32", 0, 32, 0 },
@@ -382,6 +417,10 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_MICROMIPS_PC7_S1", 9, 7, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_PC10_S1", 6, 10, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_PC16_S1",16, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC26_S1", 6, 26, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC19_S2",13, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC18_S3",14, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_PC21_S1",11, 21, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MICROMIPS_CALL16", 16, 16, 0 },
{ "fixup_MICROMIPS_GOT_DISP", 16, 16, 0 },
{ "fixup_MICROMIPS_GOT_PAGE", 16, 16, 0 },
@@ -435,6 +474,8 @@ void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
// we are only checking if the fixup can be applied correctly. We have
// access to MCContext from here which allows us to report a fatal error
// with *possibly* a source code location.
+ // The caller will also ignore any changes we make to Value
+ // (recordRelocation() overwrites it with its own calculation).
(void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 1c9af9227ffe..f260cfa566c9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -41,7 +41,7 @@ public:
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
- bool getFixupKind(StringRef Name, MCFixupKind &MappedKind) const override;
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
unsigned getNumFixupKinds() const override {
@@ -75,7 +75,8 @@ public:
/// \param Inst - The instruction to relax, which may be the same
/// as the output.
/// \param [out] Res On return, the relaxed instruction.
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
/// @}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index ff7779ec1e78..2bcff881788c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -33,9 +33,8 @@ namespace MipsII {
MO_NO_FLAG,
- /// MO_GOT16 - Represents the offset into the global offset table at which
+ /// MO_GOT - Represents the offset into the global offset table at which
/// the address the relocation entry symbol resides during execution.
- MO_GOT16,
MO_GOT,
/// MO_GOT_CALL - Represents the offset into the global offset table at
@@ -117,7 +116,12 @@ namespace MipsII {
/// FrmOther - This form is for instructions that have no specific format.
FrmOther = 6,
- FormMask = 15
+ FormMask = 15,
+ /// IsCTI - Instruction is a Control Transfer Instruction.
+ IsCTI = 1 << 4,
+ /// HasForbiddenSlot - Instruction has a forbidden slot.
+ HasForbiddenSlot = 1 << 5
+
};
}
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 5b9f02b89be5..cdad7ce1b73a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -7,8 +7,11 @@
//
//===----------------------------------------------------------------------===//
+#include <algorithm>
+#include <list>
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAssembler.h"
@@ -17,42 +20,190 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include <list>
+
+#define DEBUG_TYPE "mips-elf-object-writer"
using namespace llvm;
namespace {
-// A helper structure based on ELFRelocationEntry, used for sorting entries in
-// the relocation table.
+/// Holds additional information needed by the relocation ordering algorithm.
struct MipsRelocationEntry {
- MipsRelocationEntry(const ELFRelocationEntry &R)
- : R(R), SortOffset(R.Offset), HasMatchingHi(false) {}
- const ELFRelocationEntry R;
- // SortOffset equals R.Offset except for the *HI16 relocations, for which it
- // will be set based on the R.Offset of the matching *LO16 relocation.
- int64_t SortOffset;
- // True when this is a *LO16 relocation chosen as a match for a *HI16
- // relocation.
- bool HasMatchingHi;
+ const ELFRelocationEntry R; ///< The relocation.
+ bool Matched; ///< Is this relocation part of a match.
+
+ MipsRelocationEntry(const ELFRelocationEntry &R) : R(R), Matched(false) {}
+
+ void print(raw_ostream &Out) const {
+ R.print(Out);
+ Out << ", Matched=" << Matched;
+ }
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const MipsRelocationEntry &RHS) {
+ RHS.print(OS);
+ return OS;
+}
+#endif
+
+class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64,
+ bool IsLittleEndian);
+
+ ~MipsELFObjectWriter() override;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override;
+ virtual void sortRelocs(const MCAssembler &Asm,
+ std::vector<ELFRelocationEntry> &Relocs) override;
+};
+
+/// Copy elements in the range [First, Last) to d1 when the predicate is true or
+/// d2 when the predicate is false. This is essentially both std::copy_if and
+/// std::remove_copy_if combined into a single pass.
+template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
+std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
+ OutputIt1 d1, OutputIt2 d2,
+ UnaryPredicate Predicate) {
+ for (InputIt I = First; I != Last; ++I) {
+ if (Predicate(*I)) {
+ *d1 = *I;
+ d1++;
+ } else {
+ *d2 = *I;
+ d2++;
+ }
+ }
+
+ return std::make_pair(d1, d2);
+}
+
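
A standalone usage sketch of copy_if_else with a trivial predicate; note that std::partition_copy offers the same single-pass split in the standard library. Plain ints stand in for relocation entries here:

#include <iterator>
#include <utility>
#include <vector>

template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
                                             OutputIt1 d1, OutputIt2 d2,
                                             UnaryPredicate Predicate) {
  for (InputIt I = First; I != Last; ++I) {
    if (Predicate(*I))
      *d1++ = *I;   // predicate true: first output
    else
      *d2++ = *I;   // predicate false: second output
  }
  return std::make_pair(d1, d2);
}

void splitExample() {
  std::vector<int> Values{1, 2, 3, 4, 5};
  std::vector<int> Evens, Odds;
  copy_if_else(Values.begin(), Values.end(), std::back_inserter(Evens),
               std::back_inserter(Odds), [](int V) { return V % 2 == 0; });
  // Evens == {2, 4}, Odds == {1, 3, 5}
}
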
+/// The possible results of the Predicate function used by find_best.
+enum FindBestPredicateResult {
+ FindBest_NoMatch = 0, ///< The current element is not a match.
+ FindBest_Match, ///< The current element is a match but better ones are
+ /// possible.
+ FindBest_PerfectMatch, ///< The current element is an unbeatable match.
};
- class MipsELFObjectWriter : public MCELFObjectTargetWriter {
- public:
- MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
- bool _isN64, bool IsLittleEndian);
+/// Find the best match in the range [First, Last).
+///
+/// An element matches when Predicate(X) returns FindBest_Match or
+/// FindBest_PerfectMatch. A value of FindBest_PerfectMatch also terminates
+/// the search. BetterThan(A, B) is a comparator that returns true when A is a
+/// better match than B. The return value is the position of the best match.
+///
+/// This is similar to std::find_if but finds the best of multiple possible
+/// matches.
+template <class InputIt, class UnaryPredicate, class Comparator>
+InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
+ Comparator BetterThan) {
+ InputIt Best = Last;
+
+ for (InputIt I = First; I != Last; ++I) {
+ unsigned Matched = Predicate(*I);
+ if (Matched != FindBest_NoMatch) {
+ DEBUG(dbgs() << std::distance(First, I) << " is a match (";
+ I->print(dbgs()); dbgs() << ")\n");
+ if (Best == Last || BetterThan(*I, *Best)) {
+ DEBUG(dbgs() << ".. and it beats the last one\n");
+ Best = I;
+ }
+ }
+ if (Matched == FindBest_PerfectMatch) {
+ DEBUG(dbgs() << ".. and it is unbeatable\n");
+ break;
+ }
+ }
+
+ return Best;
+}
+
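
find_best above is a find_if that keeps scanning for a better candidate until an unbeatable one appears; in this file it is presumably driven by isMatchingReloc and compareMatchingRelocs, defined below, to pair each HI16-style relocation with a LO16. A standalone sketch with plain ints (logging omitted), picking the smallest value that is still greater than or equal to a target:

#include <vector>

enum FindBestPredicateResult { FindBest_NoMatch = 0, FindBest_Match,
                               FindBest_PerfectMatch };

template <class InputIt, class UnaryPredicate, class Comparator>
InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
                  Comparator BetterThan) {
  InputIt Best = Last;
  for (InputIt I = First; I != Last; ++I) {
    unsigned Matched = Predicate(*I);
    if (Matched != FindBest_NoMatch && (Best == Last || BetterThan(*I, *Best)))
      Best = I;                       // remember the best match so far
    if (Matched == FindBest_PerfectMatch)
      break;                          // an exact match ends the search
  }
  return Best;
}

void findBestExample() {
  std::vector<int> Offsets{8, 16, 4, 12};
  int Target = 10;
  auto It = find_best(Offsets.begin(), Offsets.end(),
                      [&](int V) {
                        if (V == Target) return FindBest_PerfectMatch;
                        return V >= Target ? FindBest_Match : FindBest_NoMatch;
                      },
                      [](int A, int B) { return A < B; });
  // It points at 12: the smallest offset that is still >= Target.
}
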
+/// Determine the low relocation that matches the given relocation.
+/// If the relocation does not need a low relocation then the return value
+/// is ELF::R_MIPS_NONE.
+///
+/// The relocations that need a matching low part are
+/// R_(MIPS|MICROMIPS|MIPS16)_HI16 for all symbols and
+/// R_(MIPS|MICROMIPS|MIPS16)_GOT16 for local symbols only.
+static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) {
+ unsigned Type = Reloc.Type;
+ if (Type == ELF::R_MIPS_HI16)
+ return ELF::R_MIPS_LO16;
+ if (Type == ELF::R_MICROMIPS_HI16)
+ return ELF::R_MICROMIPS_LO16;
+ if (Type == ELF::R_MIPS16_HI16)
+ return ELF::R_MIPS16_LO16;
+
+ if (Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
+ return ELF::R_MIPS_NONE;
+
+ if (Type == ELF::R_MIPS_GOT16)
+ return ELF::R_MIPS_LO16;
+ if (Type == ELF::R_MICROMIPS_GOT16)
+ return ELF::R_MICROMIPS_LO16;
+ if (Type == ELF::R_MIPS16_GOT16)
+ return ELF::R_MIPS16_LO16;
+
+ return ELF::R_MIPS_NONE;
+}
- ~MipsELFObjectWriter() override;
+/// Determine whether a relocation (X) matches the one given in R.
+///
+/// A relocation matches if:
+/// - Its type matches that of a corresponding low part. This is provided in
+///   MatchingType for efficiency.
+/// - It's based on the same symbol.
+/// - Its offset is greater than or equal to that of the one given in R.
+/// It should be noted that this rule assumes the programmer does not use
+/// offsets that exceed the alignment of the symbol. The carry-bit will be
+/// incorrect if this is not true.
+///
+/// A matching relocation is unbeatable if:
+/// - It is not already involved in a match.
+/// - Its offset is exactly that of the one given in R.
+static FindBestPredicateResult isMatchingReloc(const MipsRelocationEntry &X,
+ const ELFRelocationEntry &R,
+ unsigned MatchingType) {
+ if (X.R.Type == MatchingType && X.R.OriginalSymbol == R.OriginalSymbol) {
+ if (!X.Matched &&
+ X.R.OriginalAddend == R.OriginalAddend)
+ return FindBest_PerfectMatch;
+ else if (X.R.OriginalAddend >= R.OriginalAddend)
+ return FindBest_Match;
+ }
+ return FindBest_NoMatch;
+}
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
- bool needsRelocateWithSymbol(const MCSymbol &Sym,
- unsigned Type) const override;
- virtual void sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs) override;
- };
+/// Determine whether Candidate or PreviousBest is the better match.
+/// The return value is true if Candidate is the better match.
+///
+/// A matching relocation is a better match if:
+/// - It has a smaller addend.
+/// - It is not already involved in a match.
+static bool compareMatchingRelocs(const MipsRelocationEntry &Candidate,
+ const MipsRelocationEntry &PreviousBest) {
+ if (Candidate.R.OriginalAddend != PreviousBest.R.OriginalAddend)
+ return Candidate.R.OriginalAddend < PreviousBest.R.OriginalAddend;
+ return PreviousBest.Matched && !Candidate.Matched;
}
+#ifndef NDEBUG
+/// Print all the relocations.
+template <class Container>
+static void dumpRelocs(const char *Prefix, const Container &Relocs) {
+ for (const auto &R : Relocs)
+ dbgs() << Prefix << R << "\n";
+}
+#endif
+
+} // end anonymous namespace
+
MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
bool _isN64, bool IsLittleEndian)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
@@ -61,7 +212,8 @@ MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
MipsELFObjectWriter::~MipsELFObjectWriter() {}
-unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
// Determine the type of the relocation.
@@ -89,6 +241,14 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_MICROMIPS_PC10_S1;
case Mips::fixup_MICROMIPS_PC16_S1:
return ELF::R_MICROMIPS_PC16_S1;
+ case Mips::fixup_MICROMIPS_PC26_S1:
+ return ELF::R_MICROMIPS_PC26_S1;
+ case Mips::fixup_MICROMIPS_PC19_S2:
+ return ELF::R_MICROMIPS_PC19_S2;
+ case Mips::fixup_MICROMIPS_PC18_S3:
+ return ELF::R_MICROMIPS_PC18_S3;
+ case Mips::fixup_MICROMIPS_PC21_S1:
+ return ELF::R_MICROMIPS_PC21_S1;
case Mips::fixup_MIPS_PC19_S2:
return ELF::R_MIPS_PC19_S2;
case Mips::fixup_MIPS_PC18_S3:
@@ -125,8 +285,7 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_MIPS_26;
case Mips::fixup_Mips_CALL16:
return ELF::R_MIPS_CALL16;
- case Mips::fixup_Mips_GOT_Global:
- case Mips::fixup_Mips_GOT_Local:
+ case Mips::fixup_Mips_GOT:
return ELF::R_MIPS_GOT16;
case Mips::fixup_Mips_HI16:
return ELF::R_MIPS_HI16;
@@ -211,213 +370,266 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
llvm_unreachable("invalid fixup kind!");
}
-// Sort entries by SortOffset in descending order.
-// When there are more *HI16 relocs paired with one *LO16 reloc, the 2nd rule
-// sorts them in ascending order of R.Offset.
-static int cmpRelMips(const MipsRelocationEntry *AP,
- const MipsRelocationEntry *BP) {
- const MipsRelocationEntry &A = *AP;
- const MipsRelocationEntry &B = *BP;
- if (A.SortOffset != B.SortOffset)
- return B.SortOffset - A.SortOffset;
- if (A.R.Offset != B.R.Offset)
- return A.R.Offset - B.R.Offset;
- if (B.R.Type != A.R.Type)
- return B.R.Type - A.R.Type;
- //llvm_unreachable("ELFRelocs might be unstable!");
- return 0;
-}
-
-// For the given Reloc.Type, return the matching relocation type, as in the
-// table below.
-static unsigned getMatchingLoType(const MCAssembler &Asm,
- const ELFRelocationEntry &Reloc) {
- unsigned Type = Reloc.Type;
- if (Type == ELF::R_MIPS_HI16)
- return ELF::R_MIPS_LO16;
- if (Type == ELF::R_MICROMIPS_HI16)
- return ELF::R_MICROMIPS_LO16;
- if (Type == ELF::R_MIPS16_HI16)
- return ELF::R_MIPS16_LO16;
-
- if (Reloc.Symbol->getBinding() != ELF::STB_LOCAL)
- return ELF::R_MIPS_NONE;
-
- if (Type == ELF::R_MIPS_GOT16)
- return ELF::R_MIPS_LO16;
- if (Type == ELF::R_MICROMIPS_GOT16)
- return ELF::R_MICROMIPS_LO16;
- if (Type == ELF::R_MIPS16_GOT16)
- return ELF::R_MIPS16_LO16;
-
- return ELF::R_MIPS_NONE;
-}
-
-// Return true if First needs a matching *LO16, its matching *LO16 type equals
-// Second's type and both relocations are against the same symbol.
-static bool areMatchingHiAndLo(const MCAssembler &Asm,
- const ELFRelocationEntry &First,
- const ELFRelocationEntry &Second) {
- return getMatchingLoType(Asm, First) != ELF::R_MIPS_NONE &&
- getMatchingLoType(Asm, First) == Second.Type &&
- First.Symbol && First.Symbol == Second.Symbol;
-}
-
-// Return true if MipsRelocs[Index] is a *LO16 preceded by a matching *HI16.
-static bool
-isPrecededByMatchingHi(const MCAssembler &Asm, uint32_t Index,
- std::vector<MipsRelocationEntry> &MipsRelocs) {
- return Index < MipsRelocs.size() - 1 &&
- areMatchingHiAndLo(Asm, MipsRelocs[Index + 1].R, MipsRelocs[Index].R);
-}
-
-// Return true if MipsRelocs[Index] is a *LO16 not preceded by a matching *HI16
-// and not chosen by a *HI16 as a match.
-static bool isFreeLo(const MCAssembler &Asm, uint32_t Index,
- std::vector<MipsRelocationEntry> &MipsRelocs) {
- return Index < MipsRelocs.size() && !MipsRelocs[Index].HasMatchingHi &&
- !isPrecededByMatchingHi(Asm, Index, MipsRelocs);
-}
-
-// Lo is chosen as a match for Hi, set their fields accordingly.
-// Mips instructions have fixed length of at least two bytes (two for
-// micromips/mips16, four for mips32/64), so we can set HI's SortOffset to
-// matching LO's Offset minus one to simplify the sorting function.
-static void setMatch(MipsRelocationEntry &Hi, MipsRelocationEntry &Lo) {
- Lo.HasMatchingHi = true;
- Hi.SortOffset = Lo.R.Offset - 1;
-}
-
-// We sort relocation table entries by offset, except for one additional rule
-// required by MIPS ABI: every *HI16 relocation must be immediately followed by
-// the corresponding *LO16 relocation. We also support a GNU extension that
-// allows more *HI16s paired with one *LO16.
-//
-// *HI16 relocations and their matching *LO16 are:
-//
-// +---------------------------------------------+-------------------+
-// | *HI16 | matching *LO16 |
-// |---------------------------------------------+-------------------|
-// | R_MIPS_HI16, local R_MIPS_GOT16 | R_MIPS_LO16 |
-// | R_MICROMIPS_HI16, local R_MICROMIPS_GOT16 | R_MICROMIPS_LO16 |
-// | R_MIPS16_HI16, local R_MIPS16_GOT16 | R_MIPS16_LO16 |
-// +---------------------------------------------+-------------------+
-//
-// (local R_*_GOT16 meaning R_*_GOT16 against the local symbol.)
-//
-// To handle *HI16 and *LO16 relocations, the linker needs a combined addend
-// ("AHL") calculated from both *HI16 ("AHI") and *LO16 ("ALO") relocations:
-// AHL = (AHI << 16) + (short)ALO;
-//
-// We are reusing gnu as sorting algorithm so we are emitting the relocation
-// table sorted the same way as gnu as would sort it, for easier comparison of
-// the generated .o files.
-//
-// The logic is:
-// search the table (starting from the highest offset and going back to zero)
-// for all *HI16 relocations that don't have a matching *LO16.
-// For every such HI, find a matching LO with highest offset that isn't already
-// matched with another HI. If there are no free LOs, match it with the first
-// found (starting from lowest offset).
-// When there are more HIs matched with one LO, sort them in descending order by
-// offset.
-//
-// In other words, when searching for a matching LO:
-// - don't look for a 'better' match for the HIs that are already followed by a
-// matching LO;
-// - prefer LOs without a pair;
-// - prefer LOs with higher offset;
-
-static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) {
- const ELFRelocationEntry &A = *AP;
- const ELFRelocationEntry &B = *BP;
- if (A.Offset != B.Offset)
- return B.Offset - A.Offset;
- if (B.Type != A.Type)
- return A.Type - B.Type;
- return 0;
-}
-
+/// Sort relocation table entries by offset except where another order is
+/// required by the MIPS ABI.
+///
+/// MIPS has a few relocations that have an AHL component in the expression used
+/// to evaluate them. This AHL component is an addend with the same number of
+/// bits as a symbol value but not all of our ABIs are able to supply a
+/// sufficiently sized addend in a single relocation.
+///
+/// The O32 ABI, for example, uses REL relocations which store the addend in the
+/// section data. All the relocations with AHL components affect 16-bit fields,
+/// so the addend for a single relocation is limited to 16 bits. This ABI
+/// resolves the limitation by linking relocations (e.g. R_MIPS_HI16 and
+/// R_MIPS_LO16) and distributing the addend between the linked relocations. The
+/// ABI mandates that such relocations must be next to each other in a
+/// particular order (e.g. R_MIPS_HI16 must be immediately followed by a
+/// matching R_MIPS_LO16) but the rule is less strict in practice.
+///
+/// The de facto standard is lenient in the following ways:
+/// - 'Immediately following' does not refer to the next relocation entry but
+/// the next matching relocation.
+/// - There may be multiple high part relocations for one low part relocation.
+/// - There may be multiple low part relocations for one high part relocation.
+/// - The AHL addends of the linked parts do not have to be exactly equal as long
+/// as the difference does not affect the carry from bit 15 into bit 16. This is
+/// to allow, for example, the use of %lo(foo) and %lo(foo+4) when loading
+/// both halves of a long long.
+///
+/// See getMatchingLoType() for a description of which high part relocations
+/// match which low part relocations. One particular thing to note is that
+/// R_MIPS_GOT16 and similar only have AHL addends if they refer to local
+/// symbols.
+///
+/// It should also be noted that this function is not affected by whether
+/// the symbol was kept or rewritten into a section-relative equivalent. We
+/// always match using the expressions from the source.
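
An illustration of the pairing described above, using the usual O32 idiom for materialising a 32-bit symbol address (register choice and symbol name are illustrative):

  //   lui   $2, %hi(sym)       -> R_MIPS_HI16 against sym, addend AHI
  //   addiu $2, $2, %lo(sym)   -> R_MIPS_LO16 against sym, addend ALO
  // The linker recombines the two 16-bit addends as
  //   AHL = (AHI << 16) + (short)ALO;
  // which is why every *HI16 entry must be able to locate a matching *LO16.
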
void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
std::vector<ELFRelocationEntry> &Relocs) {
if (Relocs.size() < 2)
return;
- // Sorts entries by Offset in descending order.
- array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel);
-
- // Init MipsRelocs from Relocs.
- std::vector<MipsRelocationEntry> MipsRelocs;
- for (unsigned I = 0, E = Relocs.size(); I != E; ++I)
- MipsRelocs.push_back(MipsRelocationEntry(Relocs[I]));
-
- // Find a matching LO for all HIs that need it.
- for (int32_t I = 0, E = MipsRelocs.size(); I != E; ++I) {
- if (getMatchingLoType(Asm, MipsRelocs[I].R) == ELF::R_MIPS_NONE ||
- (I > 0 && isPrecededByMatchingHi(Asm, I - 1, MipsRelocs)))
- continue;
-
- int32_t MatchedLoIndex = -1;
-
- // Search the list in the ascending order of Offset.
- for (int32_t J = MipsRelocs.size() - 1, N = -1; J != N; --J) {
- // check for a match
- if (areMatchingHiAndLo(Asm, MipsRelocs[I].R, MipsRelocs[J].R) &&
- (MatchedLoIndex == -1 || // first match
- // or we already have a match,
- // but this one is with higher offset and it's free
- (MatchedLoIndex > J && isFreeLo(Asm, J, MipsRelocs))))
- MatchedLoIndex = J;
- }
-
- if (MatchedLoIndex != -1)
- // We have a match.
- setMatch(MipsRelocs[I], MipsRelocs[MatchedLoIndex]);
+ // Sort relocations by the address they are applied to.
+ std::sort(Relocs.begin(), Relocs.end(),
+ [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
+
+ std::list<MipsRelocationEntry> Sorted;
+ std::list<ELFRelocationEntry> Remainder;
+
+ DEBUG(dumpRelocs("R: ", Relocs));
+
+ // Separate the movable relocations (AHL relocations using the high bits) from
+ // the immobile relocations (everything else). This does not preserve high/low
+ // matches that already existed in the input.
+ copy_if_else(Relocs.begin(), Relocs.end(), std::back_inserter(Remainder),
+ std::back_inserter(Sorted), [](const ELFRelocationEntry &Reloc) {
+ return getMatchingLoType(Reloc) != ELF::R_MIPS_NONE;
+ });
+
+ for (auto &R : Remainder) {
+ DEBUG(dbgs() << "Matching: " << R << "\n");
+
+ unsigned MatchingType = getMatchingLoType(R);
+ assert(MatchingType != ELF::R_MIPS_NONE &&
+ "Wrong list for reloc that doesn't need a match");
+
+ // Find the best matching relocation for the current high part.
+ // See isMatchingReloc for a description of a matching relocation and
+ // compareMatchingRelocs for a description of what 'best' means.
+ auto InsertionPoint =
+ find_best(Sorted.begin(), Sorted.end(),
+ [&R, &MatchingType](const MipsRelocationEntry &X) {
+ return isMatchingReloc(X, R, MatchingType);
+ },
+ compareMatchingRelocs);
+
+ // If we matched then insert the high part in front of the match and mark
+ // both relocations as being involved in a match. We only mark the high
+ // part for cosmetic reasons in the debug output.
+ //
+ // If we failed to find a match then the high part is orphaned. This is not
+ // permitted since the relocation cannot be evaluated without knowing the
+ // carry-in. We can sometimes handle this using a matching low part that is
+ // already used in a match but we already cover that case in
+ // isMatchingReloc and compareMatchingRelocs. For the remaining cases we
+ // should insert the high part at the end of the list. This will cause the
+ // linker to fail but the alternative is to cause the linker to bind the
+ // high part to a semi-matching low part and silently calculate the wrong
+ // value. Unfortunately we have no means to warn the user that we did this
+ // so leave it up to the linker to complain about it.
+ if (InsertionPoint != Sorted.end())
+ InsertionPoint->Matched = true;
+ Sorted.insert(InsertionPoint, R)->Matched = true;
}
- // SortOffsets are calculated, call the sorting function.
- array_pod_sort(MipsRelocs.begin(), MipsRelocs.end(), cmpRelMips);
+ DEBUG(dumpRelocs("S: ", Sorted));
+
+ assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed");
- // Copy sorted MipsRelocs back to Relocs.
- for (unsigned I = 0, E = MipsRelocs.size(); I != E; ++I)
- Relocs[I] = MipsRelocs[I].R;
+ // Overwrite the original vector with the sorted elements. The caller expects
+ // them in reverse order.
+ unsigned CopyTo = 0;
+ for (const auto &R : reverse(Sorted))
+ Relocs[CopyTo++] = R.R;
}
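
A worked example of the invariant this function establishes, assuming a %hi at offset 0, its %lo at offset 8, and an unrelated R_MIPS_32 at offset 4 (offsets and symbols are illustrative):

  //  input, sorted by offset:  HI16(foo)@0, R_MIPS_32(bar)@4, LO16(foo)@8
  //  after matching:           R_MIPS_32(bar)@4, HI16(foo)@0, LO16(foo)@8
  // Each high part is inserted directly in front of its matching low part while
  // everything else keeps its offset order; the vector is then handed back
  // reversed, since the caller consumes it in reverse.
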
bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const {
- // FIXME: This is extremely conservative. This really needs to use a
- // whitelist with a clear explanation for why each realocation needs to
- // point to the symbol, not to the section.
+ // If it's a compound relocation for N64 then we need the relocation if any
+ // sub-relocation needs it.
+ if (!isUInt<8>(Type))
+ return needsRelocateWithSymbol(Sym, Type & 0xff) ||
+ needsRelocateWithSymbol(Sym, (Type >> 8) & 0xff) ||
+ needsRelocateWithSymbol(Sym, (Type >> 16) & 0xff);
+
switch (Type) {
default:
+ errs() << Type << "\n";
+ llvm_unreachable("Unexpected relocation");
return true;
+ // This relocation doesn't affect the section data.
+ case ELF::R_MIPS_NONE:
+ return false;
+
+ // On REL ABIs (e.g. O32), these relocations form pairs. The pairing is done
+ // by the static linker by matching the symbol and offset.
+ // We only see one relocation at a time but it's still safe to relocate with
+ // the section so long as both relocations make the same decision.
+ //
+ // Some older linkers may require the symbol for particular cases. Such cases
+ // are not supported yet but can be added as required.
case ELF::R_MIPS_GOT16:
case ELF::R_MIPS16_GOT16:
case ELF::R_MICROMIPS_GOT16:
- llvm_unreachable("Should have been handled already");
-
- // These relocations might be paired with another relocation. The pairing is
- // done by the static linker by matching the symbol. Since we only see one
- // relocation at a time, we have to force them to relocate with a symbol to
- // avoid ending up with a pair where one points to a section and another
- // points to a symbol.
case ELF::R_MIPS_HI16:
case ELF::R_MIPS16_HI16:
case ELF::R_MICROMIPS_HI16:
case ELF::R_MIPS_LO16:
case ELF::R_MIPS16_LO16:
case ELF::R_MICROMIPS_LO16:
- return true;
+ // FIXME: It should be safe to return false for STO_MIPS_MICROMIPS symbols, but
+ // we neglect to handle the adjustment to the LSB of the addend that
+ // it causes in applyFixup() and similar.
+ if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
+ return true;
+ return false;
+ case ELF::R_MIPS_16:
case ELF::R_MIPS_32:
+ case ELF::R_MIPS_GPREL32:
if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
return true;
- // falltrough
+ // fallthrough
case ELF::R_MIPS_26:
case ELF::R_MIPS_64:
case ELF::R_MIPS_GPREL16:
+ case ELF::R_MIPS_PC16:
+ case ELF::R_MIPS_SUB:
return false;
+
+ // FIXME: Many of these relocations should probably return false but this
+ // hasn't been confirmed to be safe yet.
+ case ELF::R_MIPS_REL32:
+ case ELF::R_MIPS_LITERAL:
+ case ELF::R_MIPS_CALL16:
+ case ELF::R_MIPS_SHIFT5:
+ case ELF::R_MIPS_SHIFT6:
+ case ELF::R_MIPS_GOT_DISP:
+ case ELF::R_MIPS_GOT_PAGE:
+ case ELF::R_MIPS_GOT_OFST:
+ case ELF::R_MIPS_GOT_HI16:
+ case ELF::R_MIPS_GOT_LO16:
+ case ELF::R_MIPS_INSERT_A:
+ case ELF::R_MIPS_INSERT_B:
+ case ELF::R_MIPS_DELETE:
+ case ELF::R_MIPS_HIGHER:
+ case ELF::R_MIPS_HIGHEST:
+ case ELF::R_MIPS_CALL_HI16:
+ case ELF::R_MIPS_CALL_LO16:
+ case ELF::R_MIPS_SCN_DISP:
+ case ELF::R_MIPS_REL16:
+ case ELF::R_MIPS_ADD_IMMEDIATE:
+ case ELF::R_MIPS_PJUMP:
+ case ELF::R_MIPS_RELGOT:
+ case ELF::R_MIPS_JALR:
+ case ELF::R_MIPS_TLS_DTPMOD32:
+ case ELF::R_MIPS_TLS_DTPREL32:
+ case ELF::R_MIPS_TLS_DTPMOD64:
+ case ELF::R_MIPS_TLS_DTPREL64:
+ case ELF::R_MIPS_TLS_GD:
+ case ELF::R_MIPS_TLS_LDM:
+ case ELF::R_MIPS_TLS_DTPREL_HI16:
+ case ELF::R_MIPS_TLS_DTPREL_LO16:
+ case ELF::R_MIPS_TLS_GOTTPREL:
+ case ELF::R_MIPS_TLS_TPREL32:
+ case ELF::R_MIPS_TLS_TPREL64:
+ case ELF::R_MIPS_TLS_TPREL_HI16:
+ case ELF::R_MIPS_TLS_TPREL_LO16:
+ case ELF::R_MIPS_GLOB_DAT:
+ case ELF::R_MIPS_PC21_S2:
+ case ELF::R_MIPS_PC26_S2:
+ case ELF::R_MIPS_PC18_S3:
+ case ELF::R_MIPS_PC19_S2:
+ case ELF::R_MIPS_PCHI16:
+ case ELF::R_MIPS_PCLO16:
+ case ELF::R_MIPS_COPY:
+ case ELF::R_MIPS_JUMP_SLOT:
+ case ELF::R_MIPS_NUM:
+ case ELF::R_MIPS_PC32:
+ case ELF::R_MIPS_EH:
+ case ELF::R_MICROMIPS_26_S1:
+ case ELF::R_MICROMIPS_GPREL16:
+ case ELF::R_MICROMIPS_LITERAL:
+ case ELF::R_MICROMIPS_PC7_S1:
+ case ELF::R_MICROMIPS_PC10_S1:
+ case ELF::R_MICROMIPS_PC16_S1:
+ case ELF::R_MICROMIPS_CALL16:
+ case ELF::R_MICROMIPS_GOT_DISP:
+ case ELF::R_MICROMIPS_GOT_PAGE:
+ case ELF::R_MICROMIPS_GOT_OFST:
+ case ELF::R_MICROMIPS_GOT_HI16:
+ case ELF::R_MICROMIPS_GOT_LO16:
+ case ELF::R_MICROMIPS_SUB:
+ case ELF::R_MICROMIPS_HIGHER:
+ case ELF::R_MICROMIPS_HIGHEST:
+ case ELF::R_MICROMIPS_CALL_HI16:
+ case ELF::R_MICROMIPS_CALL_LO16:
+ case ELF::R_MICROMIPS_SCN_DISP:
+ case ELF::R_MICROMIPS_JALR:
+ case ELF::R_MICROMIPS_HI0_LO16:
+ case ELF::R_MICROMIPS_TLS_GD:
+ case ELF::R_MICROMIPS_TLS_LDM:
+ case ELF::R_MICROMIPS_TLS_DTPREL_HI16:
+ case ELF::R_MICROMIPS_TLS_DTPREL_LO16:
+ case ELF::R_MICROMIPS_TLS_GOTTPREL:
+ case ELF::R_MICROMIPS_TLS_TPREL_HI16:
+ case ELF::R_MICROMIPS_TLS_TPREL_LO16:
+ case ELF::R_MICROMIPS_GPREL7_S2:
+ case ELF::R_MICROMIPS_PC23_S2:
+ case ELF::R_MICROMIPS_PC21_S1:
+ case ELF::R_MICROMIPS_PC26_S1:
+ case ELF::R_MICROMIPS_PC18_S3:
+ case ELF::R_MICROMIPS_PC19_S2:
+ return true;
+
+ // FIXME: Many of these should probably return false but MIPS16 isn't
+ // supported by the integrated assembler.
+ case ELF::R_MIPS16_26:
+ case ELF::R_MIPS16_GPREL:
+ case ELF::R_MIPS16_CALL16:
+ case ELF::R_MIPS16_TLS_GD:
+ case ELF::R_MIPS16_TLS_LDM:
+ case ELF::R_MIPS16_TLS_DTPREL_HI16:
+ case ELF::R_MIPS16_TLS_DTPREL_LO16:
+ case ELF::R_MIPS16_TLS_GOTTPREL:
+ case ELF::R_MIPS16_TLS_TPREL_HI16:
+ case ELF::R_MIPS16_TLS_TPREL_LO16:
+ llvm_unreachable("Unsupported MIPS16 relocation");
+ return true;
}
}
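
A sketch of the N64 compound-type convention that the recursion at the top of needsRelocateWithSymbol() unpacks; the helper name is hypothetical, the writer composes this value when it selects the relocation types:

  // N64 packs up to three relocation types into one value, one type per byte.
  static unsigned packN64RelocTypes(unsigned Type, unsigned Type2, unsigned Type3) {
    return Type | (Type2 << 8) | (Type3 << 16);
  }
  // e.g. a %hi(%neg(%gp_rel(x))) reference becomes the triple
  // R_MIPS_GPREL16 / R_MIPS_SUB / R_MIPS_HI16, and each byte is then tested
  // independently by the three recursive calls.
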
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 3652f4bab0d4..b4d8e9494650 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -50,11 +50,8 @@ namespace Mips {
// 16 bit literal fixup resulting in - R_MIPS_LITERAL.
fixup_Mips_LITERAL,
- // Global symbol fixup resulting in - R_MIPS_GOT16.
- fixup_Mips_GOT_Global,
-
- // Local symbol fixup resulting in - R_MIPS_GOT16.
- fixup_Mips_GOT_Local,
+ // Symbol fixup resulting in - R_MIPS_GOT16.
+ fixup_Mips_GOT,
// PC relative branch fixup resulting in - R_MIPS_PC16.
fixup_Mips_PC16,
@@ -170,6 +167,18 @@ namespace Mips {
// resulting in - R_MICROMIPS_PC16_S1
fixup_MICROMIPS_PC16_S1,
+ // resulting in - R_MICROMIPS_PC26_S1
+ fixup_MICROMIPS_PC26_S1,
+
+ // resulting in - R_MICROMIPS_PC19_S2
+ fixup_MICROMIPS_PC19_S2,
+
+ // resulting in - R_MICROMIPS_PC18_S3
+ fixup_MICROMIPS_PC18_S3,
+
+ // resulting in - R_MICROMIPS_PC21_S1
+ fixup_MICROMIPS_PC21_S1,
+
// resulting in - R_MICROMIPS_CALL16
fixup_MICROMIPS_CALL16,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 4d554583dc78..1622b2212665 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -42,4 +42,9 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
DwarfRegNumForCFI = true;
+
+ // Enable IAS by default for O32.
+ if (TheTriple.getArch() == Triple::mips ||
+ TheTriple.getArch() == Triple::mipsel)
+ UseIntegratedAssembler = true;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4b030ebfce8c..401c7d42c4ce 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -78,19 +78,25 @@ static void LowerLargeShift(MCInst& Inst) {
case Mips::DROTR:
Inst.setOpcode(Mips::DROTR32);
return;
+ case Mips::DSLL_MM64R6:
+ Inst.setOpcode(Mips::DSLL32_MM64R6);
+ return;
+ case Mips::DSRL_MM64R6:
+ Inst.setOpcode(Mips::DSRL32_MM64R6);
+ return;
+ case Mips::DSRA_MM64R6:
+ Inst.setOpcode(Mips::DSRA32_MM64R6);
+ return;
+ case Mips::DROTR_MM64R6:
+ Inst.setOpcode(Mips::DROTR32_MM64R6);
+ return;
}
}
-// Pick a DEXT or DINS instruction variant based on the pos and size operands
-static void LowerDextDins(MCInst& InstIn) {
- int Opcode = InstIn.getOpcode();
-
- if (Opcode == Mips::DEXT)
- assert(InstIn.getNumOperands() == 4 &&
- "Invalid no. of machine operands for DEXT!");
- else // Only DEXT and DINS are possible
- assert(InstIn.getNumOperands() == 5 &&
- "Invalid no. of machine operands for DINS!");
+// Pick a DINS instruction variant based on the pos and size operands
+static void LowerDins(MCInst& InstIn) {
+ assert(InstIn.getNumOperands() == 5 &&
+ "Invalid no. of machine operands for DINS!");
assert(InstIn.getOperand(2).isImm());
int64_t pos = InstIn.getOperand(2).getImm();
@@ -98,20 +104,50 @@ static void LowerDextDins(MCInst& InstIn) {
int64_t size = InstIn.getOperand(3).getImm();
if (size <= 32) {
- if (pos < 32) // DEXT/DINS, do nothing
+ if (pos < 32) // DINS, do nothing
return;
- // DEXTU/DINSU
+ // DINSU
InstIn.getOperand(2).setImm(pos - 32);
- InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+ InstIn.setOpcode(Mips::DINSU);
return;
}
- // DEXTM/DINSM
- assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+ // DINSM
+ assert(pos < 32 && "DINS cannot have both size and pos > 32");
InstIn.getOperand(3).setImm(size - 32);
- InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+ InstIn.setOpcode(Mips::DINSM);
return;
}
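
For reference, the concrete rewrites LowerDins() performs on the MCInst operands (values are illustrative):

  //  DINS  pos=10, size=16  ->  unchanged             (pos < 32, size <= 32)
  //  DINS  pos=40, size=8   ->  DINSU, pos becomes 8  (pos >= 32, size <= 32)
  //  DINS  pos=10, size=40  ->  DINSM, size becomes 8 (size > 32, pos < 32)
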
+// Fix a bad compact branch encoding for beqc/bnec.
+void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
+
+ // The encoding may be illegal, i.e. !(rs < rt), but this situation is
+ // easily fixed by swapping the two register operands.
+ unsigned RegOp0 = Inst.getOperand(0).getReg();
+ unsigned RegOp1 = Inst.getOperand(1).getReg();
+
+ unsigned Reg0 = Ctx.getRegisterInfo()->getEncodingValue(RegOp0);
+ unsigned Reg1 = Ctx.getRegisterInfo()->getEncodingValue(RegOp1);
+
+ if (Inst.getOpcode() == Mips::BNEC || Inst.getOpcode() == Mips::BEQC) {
+ assert(Reg0 != Reg1 && "Instruction has bad operands ($rs == $rt)!");
+ if (Reg0 < Reg1)
+ return;
+ } else if (Inst.getOpcode() == Mips::BNVC || Inst.getOpcode() == Mips::BOVC) {
+ if (Reg0 >= Reg1)
+ return;
+ } else if (Inst.getOpcode() == Mips::BNVC_MMR6 ||
+ Inst.getOpcode() == Mips::BOVC_MMR6) {
+ if (Reg1 >= Reg0)
+ return;
+ } else
+ llvm_unreachable("Cannot rewrite unknown branch!");
+
+ Inst.getOperand(0).setReg(RegOp1);
+ Inst.getOperand(1).setReg(RegOp0);
+
+}
+
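
A concrete instance of the operand swap performed by LowerCompactBranch(), using register encoding values (the branch target operand is untouched):

  //  beqc $7, $4, L  ->  beqc $4, $7, L
  // For BEQC/BNEC the swap is safe because equality is symmetric; BOVC/BNVC and
  // their MMR6 forms are only swapped when their own ordering rule is violated.
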
bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
return STI.getFeatureBits()[Mips::FeatureMicroMips];
}
@@ -161,12 +197,24 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
case Mips::DSRL:
case Mips::DSRA:
case Mips::DROTR:
+ case Mips::DSLL_MM64R6:
+ case Mips::DSRL_MM64R6:
+ case Mips::DSRA_MM64R6:
+ case Mips::DROTR_MM64R6:
LowerLargeShift(TmpInst);
break;
// Double extract instruction is chosen by pos and size operands
- case Mips::DEXT:
case Mips::DINS:
- LowerDextDins(TmpInst);
+ LowerDins(TmpInst);
+ break;
+ // Compact branches, enforce encoding restrictions.
+ case Mips::BEQC:
+ case Mips::BNEC:
+ case Mips::BOVC:
+ case Mips::BOVC_MMR6:
+ case Mips::BNVC:
+ case Mips::BNVC_MMR6:
+ LowerCompactBranch(TmpInst);
}
unsigned long N = Fixups.size();
@@ -237,6 +285,53 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
+/// getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValue expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
+/// getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm())
+ return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueMMR6 expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-2, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_Mips_PC16)));
+ return 0;
+}
+
/// getBranchTarget7OpValueMM - Return binary encoding of the microMIPS branch
/// target operand. If the machine operand requires relocation,
/// record the relocation and return zero.
@@ -327,6 +422,29 @@ getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
+/// getBranchTarget21OpValueMM - Return binary encoding of the branch
+/// target operand for microMIPS. If the machine operand requires
+/// relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTarget21OpValueMM expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC21_S1)));
+ return 0;
+}
+
/// getBranchTarget26OpValue - Return binary encoding of the branch
/// target operand. If the machine operand requires relocation,
/// record the relocation and return zero.
@@ -363,7 +481,13 @@ unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
if (MO.isImm())
return MO.getImm() >> 1;
- // TODO: Push 26 PC fixup.
+ assert(MO.isExpr() &&
+ "getBranchTarget26OpValueMM expects only expressions or immediates");
+
+ const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+ MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+ Fixups.push_back(MCFixup::create(0, FixupExpression,
+ MCFixupKind(Mips::fixup_MICROMIPS_PC26_S1)));
return 0;
}
@@ -510,126 +634,117 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
Mips::Fixups FixupKind = Mips::Fixups(0);
switch (MipsExpr->getKind()) {
- default: llvm_unreachable("Unsupported fixup kind for target expression!");
- case MipsMCExpr::VK_Mips_HIGHEST:
- FixupKind = Mips::fixup_Mips_HIGHEST;
+ case MipsMCExpr::MEK_NEG:
+ case MipsMCExpr::MEK_None:
+ case MipsMCExpr::MEK_Special:
+ llvm_unreachable("Unhandled fixup kind!");
break;
- case MipsMCExpr::VK_Mips_HIGHER:
- FixupKind = Mips::fixup_Mips_HIGHER;
+ case MipsMCExpr::MEK_CALL_HI16:
+ FixupKind = Mips::fixup_Mips_CALL_HI16;
break;
- case MipsMCExpr::VK_Mips_HI:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
- : Mips::fixup_Mips_HI16;
+ case MipsMCExpr::MEK_CALL_LO16:
+ FixupKind = Mips::fixup_Mips_CALL_LO16;
break;
- case MipsMCExpr::VK_Mips_LO:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
- : Mips::fixup_Mips_LO16;
+ case MipsMCExpr::MEK_DTPREL_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+ : Mips::fixup_Mips_DTPREL_HI;
break;
- }
- Fixups.push_back(MCFixup::create(0, MipsExpr, MCFixupKind(FixupKind)));
- return 0;
- }
-
- if (Kind == MCExpr::SymbolRef) {
- Mips::Fixups FixupKind = Mips::Fixups(0);
-
- switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
- default: llvm_unreachable("Unknown fixup kind!");
+ case MipsMCExpr::MEK_DTPREL_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+ : Mips::fixup_Mips_DTPREL_LO;
break;
- case MCSymbolRefExpr::VK_None:
- FixupKind = Mips::fixup_Mips_32; // FIXME: This is ok for O32/N32 but not N64.
+ case MipsMCExpr::MEK_GOTTPREL:
+ FixupKind = Mips::fixup_Mips_GOTTPREL;
+ break;
+ case MipsMCExpr::MEK_GOT:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT;
+ break;
+ case MipsMCExpr::MEK_GOT_CALL:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
+ : Mips::fixup_Mips_CALL16;
+ break;
+ case MipsMCExpr::MEK_GOT_DISP:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
+ : Mips::fixup_Mips_GOT_DISP;
break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
- FixupKind = Mips::fixup_Mips_GPOFF_HI;
+ case MipsMCExpr::MEK_GOT_HI16:
+ FixupKind = Mips::fixup_Mips_GOT_HI16;
break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
- FixupKind = Mips::fixup_Mips_GPOFF_LO;
+ case MipsMCExpr::MEK_GOT_LO16:
+ FixupKind = Mips::fixup_Mips_GOT_LO16;
break;
- case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
+ case MipsMCExpr::MEK_GOT_PAGE:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
- : Mips::fixup_Mips_GOT_PAGE;
+ : Mips::fixup_Mips_GOT_PAGE;
break;
- case MCSymbolRefExpr::VK_Mips_GOT_OFST :
+ case MipsMCExpr::MEK_GOT_OFST:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
- : Mips::fixup_Mips_GOT_OFST;
+ : Mips::fixup_Mips_GOT_OFST;
break;
- case MCSymbolRefExpr::VK_Mips_GOT_DISP :
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
- : Mips::fixup_Mips_GOT_DISP;
- break;
- case MCSymbolRefExpr::VK_Mips_GPREL:
+ case MipsMCExpr::MEK_GPREL:
FixupKind = Mips::fixup_Mips_GPREL16;
break;
- case MCSymbolRefExpr::VK_Mips_GOT_CALL:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
- : Mips::fixup_Mips_CALL16;
+ case MipsMCExpr::MEK_LO: {
+ // Check for %lo(%neg(%gp_rel(X)))
+ if (MipsExpr->isGpOff()) {
+ FixupKind = Mips::fixup_Mips_GPOFF_LO;
+ break;
+ }
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
break;
- case MCSymbolRefExpr::VK_Mips_GOT16:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
- : Mips::fixup_Mips_GOT_Global;
+ }
+ case MipsMCExpr::MEK_HIGHEST:
+ FixupKind = Mips::fixup_Mips_HIGHEST;
break;
- case MCSymbolRefExpr::VK_Mips_GOT:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
- : Mips::fixup_Mips_GOT_Local;
+ case MipsMCExpr::MEK_HIGHER:
+ FixupKind = Mips::fixup_Mips_HIGHER;
break;
- case MCSymbolRefExpr::VK_Mips_ABS_HI:
+ case MipsMCExpr::MEK_HI:
+ // Check for %hi(%neg(%gp_rel(X)))
+ if (MipsExpr->isGpOff()) {
+ FixupKind = Mips::fixup_Mips_GPOFF_HI;
+ break;
+ }
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
- : Mips::fixup_Mips_HI16;
+ : Mips::fixup_Mips_HI16;
break;
- case MCSymbolRefExpr::VK_Mips_ABS_LO:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
- : Mips::fixup_Mips_LO16;
+ case MipsMCExpr::MEK_PCREL_HI16:
+ FixupKind = Mips::fixup_MIPS_PCHI16;
+ break;
+ case MipsMCExpr::MEK_PCREL_LO16:
+ FixupKind = Mips::fixup_MIPS_PCLO16;
break;
- case MCSymbolRefExpr::VK_Mips_TLSGD:
+ case MipsMCExpr::MEK_TLSGD:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
- : Mips::fixup_Mips_TLSGD;
+ : Mips::fixup_Mips_TLSGD;
break;
- case MCSymbolRefExpr::VK_Mips_TLSLDM:
+ case MipsMCExpr::MEK_TLSLDM:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
- : Mips::fixup_Mips_TLSLDM;
- break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
- : Mips::fixup_Mips_DTPREL_HI;
- break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
- FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
- : Mips::fixup_Mips_DTPREL_LO;
- break;
- case MCSymbolRefExpr::VK_Mips_GOTTPREL:
- FixupKind = Mips::fixup_Mips_GOTTPREL;
+ : Mips::fixup_Mips_TLSLDM;
break;
- case MCSymbolRefExpr::VK_Mips_TPREL_HI:
+ case MipsMCExpr::MEK_TPREL_HI:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
- : Mips::fixup_Mips_TPREL_HI;
+ : Mips::fixup_Mips_TPREL_HI;
break;
- case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+ case MipsMCExpr::MEK_TPREL_LO:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
- : Mips::fixup_Mips_TPREL_LO;
+ : Mips::fixup_Mips_TPREL_LO;
break;
- case MCSymbolRefExpr::VK_Mips_HIGHER:
- FixupKind = Mips::fixup_Mips_HIGHER;
- break;
- case MCSymbolRefExpr::VK_Mips_HIGHEST:
- FixupKind = Mips::fixup_Mips_HIGHEST;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_HI16:
- FixupKind = Mips::fixup_Mips_GOT_HI16;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_LO16:
- FixupKind = Mips::fixup_Mips_GOT_LO16;
- break;
- case MCSymbolRefExpr::VK_Mips_CALL_HI16:
- FixupKind = Mips::fixup_Mips_CALL_HI16;
- break;
- case MCSymbolRefExpr::VK_Mips_CALL_LO16:
- FixupKind = Mips::fixup_Mips_CALL_LO16;
- break;
- case MCSymbolRefExpr::VK_Mips_PCREL_HI16:
- FixupKind = Mips::fixup_MIPS_PCHI16;
+ }
+ Fixups.push_back(MCFixup::create(0, MipsExpr, MCFixupKind(FixupKind)));
+ return 0;
+ }
+
+ if (Kind == MCExpr::SymbolRef) {
+ Mips::Fixups FixupKind = Mips::Fixups(0);
+
+ switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+ default: llvm_unreachable("Unknown fixup kind!");
break;
- case MCSymbolRefExpr::VK_Mips_PCREL_LO16:
- FixupKind = Mips::fixup_MIPS_PCLO16;
+ case MCSymbolRefExpr::VK_None:
+ FixupKind = Mips::fixup_Mips_32; // FIXME: This is ok for O32/N32 but not N64.
break;
} // switch
@@ -660,61 +775,20 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return getExprOpValue(MO.getExpr(),Fixups, STI);
}
-/// getMSAMemEncoding - Return binary encoding of memory operand for LD/ST
-/// instructions.
-unsigned
-MipsMCCodeEmitter::getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
- assert(MI.getOperand(OpNo).isReg());
- unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
- unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
-
- // The immediate field of an LD/ST instruction is scaled which means it must
- // be divided (when encoding) by the size (in bytes) of the instructions'
- // data format.
- // .b - 1 byte
- // .h - 2 bytes
- // .w - 4 bytes
- // .d - 8 bytes
- switch(MI.getOpcode())
- {
- default:
- assert (0 && "Unexpected instruction");
- break;
- case Mips::LD_B:
- case Mips::ST_B:
- // We don't need to scale the offset in this case
- break;
- case Mips::LD_H:
- case Mips::ST_H:
- OffBits >>= 1;
- break;
- case Mips::LD_W:
- case Mips::ST_W:
- OffBits >>= 2;
- break;
- case Mips::LD_D:
- case Mips::ST_D:
- OffBits >>= 3;
- break;
- }
-
- return (OffBits & 0xFFFF) | RegBits;
-}
-
-/// getMemEncoding - Return binary encoding of memory related operand.
+/// Return binary encoding of memory related operand.
/// If the offset operand requires relocation, record the relocation.
-unsigned
-MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+template <unsigned ShiftAmount>
+unsigned MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
assert(MI.getOperand(OpNo).isReg());
unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+ // Apply the scale factor if there is one.
+ OffBits >>= ShiftAmount;
+
return (OffBits & 0xFFFF) | RegBits;
}
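
The ShiftAmount template parameter folds the old per-opcode scaling into the encoder itself; for example, with a byte offset of 32:

  //  getMemEncoding<0>: offset 32 -> 0x0020 in bits 15-0 (unscaled, the default)
  //  getMemEncoding<2>: offset 32 -> 0x0008 in bits 15-0 (word-scaled, as the
  //                     removed getMSAMemEncoding() did for LD_W/ST_W)
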
@@ -804,6 +878,19 @@ getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
}
unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Base register is encoded in bits 20-16, offset is encoded in bits 10-0.
+ assert(MI.getOperand(OpNo).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+ STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+ return (OffBits & 0x07FF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -906,8 +993,9 @@ MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
"getSimm19Lsl2Encoding expects only expressions or an immediate");
const MCExpr *Expr = MO.getExpr();
- Fixups.push_back(MCFixup::create(0, Expr,
- MCFixupKind(Mips::fixup_MIPS_PC19_S2)));
+ Mips::Fixups FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_PC19_S2
+ : Mips::fixup_MIPS_PC19_S2;
+ Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
return 0;
}
@@ -927,8 +1015,9 @@ MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
"getSimm18Lsl2Encoding expects only expressions or an immediate");
const MCExpr *Expr = MO.getExpr();
- Fixups.push_back(MCFixup::create(0, Expr,
- MCFixupKind(Mips::fixup_MIPS_PC18_S3)));
+ Mips::Fixups FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_PC18_S3
+ : Mips::fixup_MIPS_PC18_S3;
+ Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
return 0;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index fdacd172e3a2..0f4dfe1200c0 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -102,6 +102,20 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ // getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
// getBranchTarget7OpValue - Return binary encoding of the microMIPS branch
// target operand. If the machine operand requires relocation,
// record the relocation and return zero.
@@ -130,6 +144,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ // getBranchTarget21OpValueMM - Return binary encoding of the branch
+ // offset operand for microMIPS. If the machine operand requires
+ // relocation, record the relocation and return zero.
+ unsigned getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
// getBranchTarget26OpValue - Return binary encoding of the branch
// offset operand. If the machine operand requires relocation,
// record the relocation and return zero.
@@ -161,6 +182,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ template <unsigned ShiftAmount = 0>
unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -182,6 +204,9 @@ public:
unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -238,6 +263,8 @@ public:
unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ private:
+ void LowerCompactBranch(MCInst& Inst) const;
}; // class MipsMCCodeEmitter
} // namespace llvm.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index c85fc4816b08..082bb87fcb8a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -12,69 +12,110 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
using namespace llvm;
#define DEBUG_TYPE "mipsmcexpr"
-bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
- const MCBinaryExpr *BE) {
- switch (VK) {
- case MCSymbolRefExpr::VK_Mips_ABS_LO:
- case MCSymbolRefExpr::VK_Mips_ABS_HI:
- case MCSymbolRefExpr::VK_Mips_HIGHER:
- case MCSymbolRefExpr::VK_Mips_HIGHEST:
- break;
- default:
- return false;
- }
+const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::MipsExprKind Kind,
+ const MCExpr *Expr, MCContext &Ctx) {
+ return new (Ctx) MipsMCExpr(Kind, Expr);
+}
- // We support expressions of the form "(sym1 binop1 sym2) binop2 const",
- // where "binop2 const" is optional.
- if (isa<MCBinaryExpr>(BE->getLHS())) {
- if (!isa<MCConstantExpr>(BE->getRHS()))
- return false;
- BE = cast<MCBinaryExpr>(BE->getLHS());
- }
- return (isa<MCSymbolRefExpr>(BE->getLHS())
- && isa<MCSymbolRefExpr>(BE->getRHS()));
+const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::MipsExprKind Kind,
+ const MCExpr *Expr, MCContext &Ctx) {
+ return create(Kind, create(MEK_NEG, create(MEK_GPREL, Expr, Ctx), Ctx), Ctx);
}
-const MipsMCExpr*
-MipsMCExpr::create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
- MCContext &Ctx) {
- VariantKind Kind;
- switch (VK) {
- case MCSymbolRefExpr::VK_Mips_ABS_LO:
- Kind = VK_Mips_LO;
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ int64_t AbsVal;
+
+ switch (Kind) {
+ case MEK_None:
+ case MEK_Special:
+ llvm_unreachable("MEK_None and MEK_Special are invalid");
break;
- case MCSymbolRefExpr::VK_Mips_ABS_HI:
- Kind = VK_Mips_HI;
+ case MEK_CALL_HI16:
+ OS << "%call_hi";
break;
- case MCSymbolRefExpr::VK_Mips_HIGHER:
- Kind = VK_Mips_HIGHER;
+ case MEK_CALL_LO16:
+ OS << "%call_lo";
break;
- case MCSymbolRefExpr::VK_Mips_HIGHEST:
- Kind = VK_Mips_HIGHEST;
+ case MEK_DTPREL_HI:
+ OS << "%dtprel_hi";
+ break;
+ case MEK_DTPREL_LO:
+ OS << "%dtprel_lo";
+ break;
+ case MEK_GOT:
+ OS << "%got";
+ break;
+ case MEK_GOTTPREL:
+ OS << "%gottprel";
+ break;
+ case MEK_GOT_CALL:
+ OS << "%call16";
+ break;
+ case MEK_GOT_DISP:
+ OS << "%got_disp";
+ break;
+ case MEK_GOT_HI16:
+ OS << "%got_hi";
+ break;
+ case MEK_GOT_LO16:
+ OS << "%got_lo";
+ break;
+ case MEK_GOT_PAGE:
+ OS << "%got_page";
+ break;
+ case MEK_GOT_OFST:
+ OS << "%got_ofst";
+ break;
+ case MEK_GPREL:
+ OS << "%gp_rel";
+ break;
+ case MEK_HI:
+ OS << "%hi";
+ break;
+ case MEK_HIGHER:
+ OS << "%higher";
+ break;
+ case MEK_HIGHEST:
+ OS << "%highest";
+ break;
+ case MEK_LO:
+ OS << "%lo";
+ break;
+ case MEK_NEG:
+ OS << "%neg";
+ break;
+ case MEK_PCREL_HI16:
+ OS << "%pcrel_hi";
+ break;
+ case MEK_PCREL_LO16:
+ OS << "%pcrel_lo";
+ break;
+ case MEK_TLSGD:
+ OS << "%tlsgd";
+ break;
+ case MEK_TLSLDM:
+ OS << "%tlsldm";
+ break;
+ case MEK_TPREL_HI:
+ OS << "%tprel_hi";
+ break;
+ case MEK_TPREL_LO:
+ OS << "%tprel_lo";
break;
- default:
- llvm_unreachable("Invalid kind!");
- }
-
- return new (Ctx) MipsMCExpr(Kind, Expr);
-}
-
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- switch (Kind) {
- default: llvm_unreachable("Invalid kind!");
- case VK_Mips_LO: OS << "%lo"; break;
- case VK_Mips_HI: OS << "%hi"; break;
- case VK_Mips_HIGHER: OS << "%higher"; break;
- case VK_Mips_HIGHEST: OS << "%highest"; break;
}
OS << '(';
- Expr->print(OS, MAI);
+ if (Expr->evaluateAsAbsolute(AbsVal))
+ OS << AbsVal;
+ else
+ Expr->print(OS, MAI, true);
OS << ')';
}
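
An example of what the recursive printImpl() above produces, assuming a symbol foo and the nesting built by createGpOff():

  //  create(MEK_HI, foo, Ctx)            prints  %hi(foo)
  //  createGpOff(MEK_LO, foo, Ctx)       prints  %lo(%neg(%gp_rel(foo)))
  //  create(MEK_HI, <constant 64>, Ctx)  prints  %hi(64)
  // (absolute operands are folded to their value before printing)
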
@@ -82,9 +123,165 @@ bool
MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const {
- return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+ // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X))) special cases.
+ if (isGpOff()) {
+ const MCExpr *SubExpr =
+ cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+ ->getSubExpr();
+ if (!SubExpr->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(),
+ MEK_Special);
+ return true;
+ }
+
+ if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+ return false;
+
+ if (Res.getRefKind() != MCSymbolRefExpr::VK_None)
+ return false;
+
+ // evaluateAsAbsolute() and evaluateAsValue() require that we evaluate the
+ // %hi/%lo/etc. here. Fixup is a null pointer when either of these is the
+ // caller.
+ if (Res.isAbsolute() && Fixup == nullptr) {
+ int64_t AbsVal = Res.getConstant();
+ switch (Kind) {
+ case MEK_None:
+ case MEK_Special:
+ llvm_unreachable("MEK_None and MEK_Special are invalid");
+ case MEK_DTPREL_HI:
+ case MEK_DTPREL_LO:
+ case MEK_GOT:
+ case MEK_GOTTPREL:
+ case MEK_GOT_CALL:
+ case MEK_GOT_DISP:
+ case MEK_GOT_HI16:
+ case MEK_GOT_LO16:
+ case MEK_GOT_OFST:
+ case MEK_GOT_PAGE:
+ case MEK_GPREL:
+ case MEK_PCREL_HI16:
+ case MEK_PCREL_LO16:
+ case MEK_TLSGD:
+ case MEK_TLSLDM:
+ case MEK_TPREL_HI:
+ case MEK_TPREL_LO:
+ return false;
+ case MEK_LO:
+ case MEK_CALL_LO16:
+ AbsVal = SignExtend64<16>(AbsVal);
+ break;
+ case MEK_CALL_HI16:
+ case MEK_HI:
+ AbsVal = SignExtend64<16>((AbsVal + 0x8000) >> 16);
+ break;
+ case MEK_HIGHER:
+ AbsVal = SignExtend64<16>((AbsVal + 0x80008000LL) >> 32);
+ break;
+ case MEK_HIGHEST:
+ AbsVal = SignExtend64<16>((AbsVal + 0x800080008000LL) >> 48);
+ break;
+ case MEK_NEG:
+ AbsVal = -AbsVal;
+ break;
+ }
+ Res = MCValue::get(AbsVal);
+ return true;
+ }
+
+ // We want to defer it for relocatable expressions since the constant is
+ // applied to the whole symbol value.
+ //
+ // The value of getKind() that is given to MCValue is only intended to aid
+ // debugging when inspecting MCValue objects. It shouldn't be relied upon
+ // for decision making.
+ Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
}
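
A worked check of the absolute folding in evaluateAsRelocatableImpl() above (64-bit arithmetic, value is illustrative):

  //  X       = 0x12348678
  //  MEK_LO:  SignExtend64<16>(0x8678)             = -0x7988
  //  MEK_HI:  SignExtend64<16>((X + 0x8000) >> 16) =  0x1235
  //  (0x1235 << 16) + (-0x7988)                    =  0x12348678
  // The carry caused by the sign-extended low part is absorbed by rounding the
  // high part up, matching the AHL rule described in sortRelocs().
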
void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
Streamer.visitUsedExpr(*getSubExpr());
}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ fixELFSymbolsInTLSFixupsImpl(cast<MipsMCExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ case MCExpr::Constant:
+ break;
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getKind()) {
+ case MEK_None:
+ case MEK_Special:
+ llvm_unreachable("MEK_None and MEK_Special are invalid");
+ break;
+ case MEK_CALL_HI16:
+ case MEK_CALL_LO16:
+ case MEK_DTPREL_HI:
+ case MEK_DTPREL_LO:
+ case MEK_GOT:
+ case MEK_GOT_CALL:
+ case MEK_GOT_DISP:
+ case MEK_GOT_HI16:
+ case MEK_GOT_LO16:
+ case MEK_GOT_OFST:
+ case MEK_GOT_PAGE:
+ case MEK_GPREL:
+ case MEK_HI:
+ case MEK_HIGHER:
+ case MEK_HIGHEST:
+ case MEK_LO:
+ case MEK_NEG:
+ case MEK_PCREL_HI16:
+ case MEK_PCREL_LO16:
+ case MEK_TLSLDM:
+ // If we do have nested target-specific expressions, they will be in
+ // a consecutive chain.
+ if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
+ E->fixELFSymbolsInTLSFixups(Asm);
+ break;
+ case MEK_GOTTPREL:
+ case MEK_TLSGD:
+ case MEK_TPREL_HI:
+ case MEK_TPREL_LO:
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+ break;
+ }
+}
+
+bool MipsMCExpr::isGpOff(MipsExprKind &Kind) const {
+ if (getKind() == MEK_HI || getKind() == MEK_LO) {
+ if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+ if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+ if (S1->getKind() == MEK_NEG && S2->getKind() == MEK_GPREL) {
+ Kind = getKind();
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
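
The exact expression shape isGpOff() accepts, in terms of the tree built by createGpOff() (symbol name illustrative):

  //  MipsMCExpr(MEK_HI,
  //    MipsMCExpr(MEK_NEG,
  //      MipsMCExpr(MEK_GPREL, foo)))  ->  isGpOff() == true, Kind == MEK_HI
  // Any other nesting, or a bare MEK_GPREL, yields false.
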
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index e889972c5c0e..d1a4334ec640 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -18,49 +18,73 @@ namespace llvm {
class MipsMCExpr : public MCTargetExpr {
public:
- enum VariantKind {
- VK_Mips_None,
- VK_Mips_LO,
- VK_Mips_HI,
- VK_Mips_HIGHER,
- VK_Mips_HIGHEST
+ enum MipsExprKind {
+ MEK_None,
+ MEK_CALL_HI16,
+ MEK_CALL_LO16,
+ MEK_DTPREL_HI,
+ MEK_DTPREL_LO,
+ MEK_GOT,
+ MEK_GOTTPREL,
+ MEK_GOT_CALL,
+ MEK_GOT_DISP,
+ MEK_GOT_HI16,
+ MEK_GOT_LO16,
+ MEK_GOT_OFST,
+ MEK_GOT_PAGE,
+ MEK_GPREL,
+ MEK_HI,
+ MEK_HIGHER,
+ MEK_HIGHEST,
+ MEK_LO,
+ MEK_NEG,
+ MEK_PCREL_HI16,
+ MEK_PCREL_LO16,
+ MEK_TLSGD,
+ MEK_TLSLDM,
+ MEK_TPREL_HI,
+ MEK_TPREL_LO,
+ MEK_Special,
};
private:
- const VariantKind Kind;
+ const MipsExprKind Kind;
const MCExpr *Expr;
- explicit MipsMCExpr(VariantKind Kind, const MCExpr *Expr)
- : Kind(Kind), Expr(Expr) {}
+ explicit MipsMCExpr(MipsExprKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
public:
- static bool isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
- const MCBinaryExpr *BE);
+ static const MipsMCExpr *create(MipsExprKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+ static const MipsMCExpr *createGpOff(MipsExprKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
- static const MipsMCExpr *create(MCSymbolRefExpr::VariantKind VK,
- const MCExpr *Expr, MCContext &Ctx);
+ /// Get the kind of this expression.
+ MipsExprKind getKind() const { return Kind; }
- /// getOpcode - Get the kind of this expression.
- VariantKind getKind() const { return Kind; }
-
- /// getSubExpr - Get the child of this expression.
+ /// Get the child of this expression.
const MCExpr *getSubExpr() const { return Expr; }
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
- bool evaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout,
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
MCFragment *findAssociatedFragment() const override {
return getSubExpr()->findAssociatedFragment();
}
- // There are no TLS MipsMCExprs at the moment.
- void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
}
+
+ bool isGpOff(MipsExprKind &Kind) const;
+ bool isGpOff() const {
+ MipsExprKind Kind;
+ return isGpOff(Kind);
+ }
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 949ee1474f96..a05573950974 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -11,21 +11,20 @@
//
//===----------------------------------------------------------------------===//
+#include "MipsMCTargetDesc.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MipsELFStreamer.h"
#include "MipsMCAsmInfo.h"
#include "MipsMCNaCl.h"
-#include "MipsMCTargetDesc.h"
#include "MipsTargetStreamer.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
@@ -82,18 +81,6 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createMipsMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (CM == CodeModel::JITDefault)
- RM = Reloc::Static;
- else if (RM == Reloc::Default)
- RM = Reloc::PIC_;
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
-}
-
static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -129,15 +116,44 @@ createMipsObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
return new MipsTargetELFStreamer(S, STI);
}
+namespace {
+
+class MipsMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ MipsMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ unsigned NumOps = Inst.getNumOperands();
+ if (NumOps == 0)
+ return false;
+ switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) {
+ case MCOI::OPERAND_UNKNOWN:
+ case MCOI::OPERAND_IMMEDIATE:
+ // jal, bal ...
+ Target = Inst.getOperand(NumOps - 1).getImm();
+ return true;
+ case MCOI::OPERAND_PCREL:
+ // b, j, beq ...
+ Target = Addr + Inst.getOperand(NumOps - 1).getImm();
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+}
+
+static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new MipsMCInstrAnalysis(Info);
+}
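
A rough sketch of what the analyzer reports, assuming the last operand already holds the decoded byte offset or absolute target (addresses are illustrative):

  //  PC-relative branch at 0x1000, imm 0x40 (OPERAND_PCREL)     ->  Target = 0x1040
  //  jal with imm 0x400000 (OPERAND_IMMEDIATE/OPERAND_UNKNOWN)  ->  Target = 0x400000
  //  an instruction with no operands                            ->  returns false
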
+
extern "C" void LLVMInitializeMipsTargetMC() {
for (Target *T : {&TheMipsTarget, &TheMipselTarget, &TheMips64Target,
&TheMips64elTarget}) {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createMipsMCAsmInfo);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createMipsMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createMipsMCInstrInfo);
@@ -156,6 +172,9 @@ extern "C" void LLVMInitializeMipsTargetMC() {
// Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(*T, createMipsMCSubtargetInfo);
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createMipsMCInstrAnalysis);
+
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(*T, createMipsMCInstPrinter);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index e5fa7556053f..7f79eb400f59 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -11,11 +11,12 @@
//
//===----------------------------------------------------------------------===//
+#include "MipsTargetStreamer.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MipsELFStreamer.h"
+#include "MipsMCExpr.h"
#include "MipsMCTargetDesc.h"
#include "MipsTargetObjectFile.h"
-#include "MipsTargetStreamer.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -27,12 +28,19 @@
using namespace llvm;
+namespace {
+static cl::opt<bool> RoundSectionSizes(
+ "mips-round-section-sizes", cl::init(false),
+ cl::desc("Round section sizes up to the section alignment"), cl::Hidden);
+} // end anonymous namespace
+
MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S), ModuleDirectiveAllowed(true) {
GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
}
void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
+void MipsTargetStreamer::setUsesMicroMips() {}
void MipsTargetStreamer::emitDirectiveSetMips16() {}
void MipsTargetStreamer::emitDirectiveSetNoMips16() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetReorder() { forbidModuleDirective(); }
@@ -89,9 +97,11 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() {
void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
-void MipsTargetStreamer::emitDirectiveCpRestore(
- SmallVector<MCInst, 3> &StoreInsts, int Offset) {
+bool MipsTargetStreamer::emitDirectiveCpRestore(
+ int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
forbidModuleDirective();
+ return true;
}
void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) {
@@ -116,6 +126,217 @@ void MipsTargetStreamer::emitDirectiveSetNoOddSPReg() {
forbidModuleDirective();
}
+void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(MCOperand::createReg(Reg0));
+ TmpInst.setLoc(IDLoc);
+ getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(MCOperand::createReg(Reg0));
+ TmpInst.addOperand(Op1);
+ TmpInst.setLoc(IDLoc);
+ getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) {
+ emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) {
+ emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(MCOperand::createImm(Imm1));
+ TmpInst.addOperand(MCOperand::createImm(Imm2));
+ TmpInst.setLoc(IDLoc);
+ getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+ MCOperand Op2, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(MCOperand::createReg(Reg0));
+ TmpInst.addOperand(MCOperand::createReg(Reg1));
+ TmpInst.addOperand(Op2);
+ TmpInst.setLoc(IDLoc);
+ getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+ unsigned Reg2, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+ int16_t Imm, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg,
+ unsigned TrgReg, bool Is64Bit,
+ const MCSubtargetInfo *STI) {
+ emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(),
+ STI);
+}
+
+void MipsTargetStreamer::emitDSLL(unsigned DstReg, unsigned SrcReg,
+ int16_t ShiftAmount, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ if (ShiftAmount >= 32) {
+ emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc, STI);
+ return;
+ }
+
+ emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ if (hasShortDelaySlot)
+ emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI);
+ else
+ emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) {
+ emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+}
+
+/// Emit the $gp restore operation for .cprestore.
+void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ emitLoadWithImmOffset(Mips::LW, Mips::GP, Mips::SP, Offset, Mips::GP, IDLoc,
+ STI);
+}
+
+/// Emit a store instruction with an immediate offset.
+void MipsTargetStreamer::emitStoreWithImmOffset(
+ unsigned Opcode, unsigned SrcReg, unsigned BaseReg, int64_t Offset,
+ function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ if (isInt<16>(Offset)) {
+ emitRRI(Opcode, SrcReg, BaseReg, Offset, IDLoc, STI);
+ return;
+ }
+
+ // sw $8, offset($8) => lui $at, %hi(offset)
+ // add $at, $at, $8
+ // sw $8, %lo(offset)($at)
+
+ unsigned ATReg = GetATReg();
+ if (!ATReg)
+ return;
+
+ unsigned LoOffset = Offset & 0x0000ffff;
+ unsigned HiOffset = (Offset & 0xffff0000) >> 16;
+
+  // If the msb of LoOffset is 1 (a negative number once sign-extended), we
+  // must increment HiOffset to account for the sign-extension of the low part.
+ if (LoOffset & 0x8000)
+ HiOffset++;
+
+ // Generate the base address in ATReg.
+ emitRI(Mips::LUi, ATReg, HiOffset, IDLoc, STI);
+ if (BaseReg != Mips::ZERO)
+ emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+ // Emit the store with the adjusted base and offset.
+ emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI);
+}
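For reference, a minimal standalone sketch of the %hi/%lo split performed above (splitHiLo and the main() check are illustrative only, not part of this patch): the low 16 bits are sign-extended by the memory instruction, so a set bit 15 is compensated by bumping the high half.

    #include <cassert>
    #include <cstdint>

    // Split an offset into the %hi/%lo halves used by the lui/addu/sw
    // expansion. The low half is sign-extended by the memory instruction, so
    // when its msb is set the high half must be incremented to compensate.
    static void splitHiLo(int64_t Offset, uint32_t &Hi, uint32_t &Lo) {
      Lo = Offset & 0x0000ffff;
      Hi = (Offset & 0xffff0000) >> 16;
      if (Lo & 0x8000)
        ++Hi;
    }

    int main() {
      uint32_t Hi, Lo;
      splitHiLo(0x12348000, Hi, Lo);
      // lui materializes Hi << 16; the access then adds the sign-extended Lo.
      assert(Hi == 0x1235 && Lo == 0x8000);
      assert(((int64_t)Hi << 16) + (int16_t)Lo == 0x12348000);
      return 0;
    }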
+
+/// Emit a store instruction with a symbol offset. Symbols are assumed to be
+/// out of range of a simm16 and are expanded to the appropriate instructions.
+void MipsTargetStreamer::emitStoreWithSymOffset(
+ unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand,
+ MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ // sw $8, sym => lui $at, %hi(sym)
+ // sw $8, %lo(sym)($at)
+
+ // Generate the base address in ATReg.
+ emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI);
+ if (BaseReg != Mips::ZERO)
+ emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+ // Emit the store with the adjusted base and offset.
+ emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI);
+}
+
+/// Emit a load instruction with an immediate offset. DstReg and TmpReg are
+/// permitted to be the same register iff DstReg is distinct from BaseReg and
+/// DstReg is a GPR. It is the caller's responsibility to identify such cases
+/// and pass the appropriate register in TmpReg.
+void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
+ unsigned BaseReg, int64_t Offset,
+ unsigned TmpReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ if (isInt<16>(Offset)) {
+ emitRRI(Opcode, DstReg, BaseReg, Offset, IDLoc, STI);
+ return;
+ }
+
+ // 1) lw $8, offset($9) => lui $8, %hi(offset)
+ // add $8, $8, $9
+  //                         lw $8, %lo(offset)($8)
+ // 2) lw $8, offset($8) => lui $at, %hi(offset)
+ // add $at, $at, $8
+ // lw $8, %lo(offset)($at)
+
+ unsigned LoOffset = Offset & 0x0000ffff;
+ unsigned HiOffset = (Offset & 0xffff0000) >> 16;
+
+  // If the msb of LoOffset is 1 (a negative number once sign-extended), we
+  // must increment HiOffset to account for the sign-extension of the low part.
+ if (LoOffset & 0x8000)
+ HiOffset++;
+
+ // Generate the base address in TmpReg.
+ emitRI(Mips::LUi, TmpReg, HiOffset, IDLoc, STI);
+ if (BaseReg != Mips::ZERO)
+ emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+ // Emit the load with the adjusted base and offset.
+ emitRRI(Opcode, DstReg, TmpReg, LoOffset, IDLoc, STI);
+}
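A hypothetical caller-side helper illustrating the TmpReg rule stated in the doc comment above (chooseTmpReg, the DstIsGPR flag and the ATReg parameter are stand-ins, not part of the streamer API):

    // Pick the scratch register for emitLoadWithImmOffset: the destination may
    // double as the temporary only when it will not clobber the base address
    // and it is a GPR (case 1 above); otherwise $at is required (case 2).
    static unsigned chooseTmpReg(unsigned DstReg, unsigned BaseReg,
                                 bool DstIsGPR, unsigned ATReg) {
      if (DstReg != BaseReg && DstIsGPR)
        return DstReg;
      return ATReg;
    }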
+
+/// Emit a load instruction with a symbol offset. Symbols are assumed to be
+/// out of range of a simm16 and are expanded to the appropriate instructions.
+/// DstReg and TmpReg are permitted to be the same register iff DstReg is a
+/// GPR. It is the caller's responsibility to identify such cases and pass the
+/// appropriate register in TmpReg.
+void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg,
+ unsigned BaseReg,
+ MCOperand &HiOperand,
+ MCOperand &LoOperand,
+ unsigned TmpReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ // 1) lw $8, sym => lui $8, %hi(sym)
+ // lw $8, %lo(sym)($8)
+ // 2) ldc1 $f0, sym => lui $at, %hi(sym)
+ // ldc1 $f0, %lo(sym)($at)
+
+ // Generate the base address in TmpReg.
+ emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
+ if (BaseReg != Mips::ZERO)
+ emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+ // Emit the load with the adjusted base and offset.
+ emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI);
+}
+
MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: MipsTargetStreamer(S), OS(OS) {}
@@ -364,10 +585,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
forbidModuleDirective();
}
-void MipsTargetAsmStreamer::emitDirectiveCpRestore(
- SmallVector<MCInst, 3> &StoreInsts, int Offset) {
- MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset);
+bool MipsTargetAsmStreamer::emitDirectiveCpRestore(
+ int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
OS << "\t.cprestore\t" << Offset << "\n";
+ return true;
}
void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -437,7 +660,15 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
: MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
MCAssembler &MCA = getStreamer().getAssembler();
- Pic = MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_;
+
+  // It's possible that MCObjectFileInfo isn't fully initialized at this point
+  // due to an initialization order problem where LLVMTargetMachine creates the
+  // target streamer before TargetLoweringObjectFile calls
+  // InitializeMCObjectFileInfo. There doesn't seem to be a single place that
+  // covers all cases, so this statement covers most of them, and direct object
+  // emission must call setPic() once MCObjectFileInfo has been initialized.
+  // The cases we don't handle here are covered by MipsAsmPrinter.
+ Pic = MCA.getContext().getObjectFileInfo()->isPositionIndependent();
const FeatureBitset &Features = STI.getFeatureBits();
@@ -482,6 +713,10 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
else
EFlags |= ELF::EF_MIPS_ARCH_1;
+ // Machine
+ if (Features[Mips::FeatureCnMips])
+ EFlags |= ELF::EF_MIPS_MACH_OCTEON;
+
// Other options.
if (Features[Mips::FeatureNaN2008])
EFlags |= ELF::EF_MIPS_NAN2008;
@@ -521,6 +756,26 @@ void MipsTargetELFStreamer::finish() {
DataSection.setAlignment(std::max(16u, DataSection.getAlignment()));
BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment()));
+ if (RoundSectionSizes) {
+    // Make section sizes a multiple of the alignment. This is useful for
+    // verifying the output of IAS against the output of other assemblers, but
+    // it isn't necessary to produce a correct object and it increases the
+    // section sizes.
+ MCStreamer &OS = getStreamer();
+ for (MCSection &S : MCA) {
+ MCSectionELF &Section = static_cast<MCSectionELF &>(S);
+
+ unsigned Alignment = Section.getAlignment();
+ if (Alignment) {
+ OS.SwitchSection(&Section);
+ if (Section.UseCodeAlign())
+ OS.EmitCodeAlignment(Alignment, Alignment);
+ else
+ OS.EmitValueToAlignment(Alignment, 0, 1, Alignment);
+ }
+ }
+ }
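The padding emitted in the loop above amounts to rounding each section size up to its alignment. As plain arithmetic (a sketch, not part of the patch, assuming the alignment is a power of two as ELF section alignments are):

    #include <cstdint>

    // Round Size up to the next multiple of Align (Align is a power of two).
    static uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
      return (Size + Align - 1) & ~(Align - 1);
    }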
+
const FeatureBitset &Features = STI.getFeatureBits();
// Update e_header flags. See the FIXME and comment above in
@@ -576,11 +831,6 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
MicroMipsEnabled = true;
-
- MCAssembler &MCA = getStreamer().getAssembler();
- unsigned Flags = MCA.getELFHeaderEFlags();
- Flags |= ELF::EF_MIPS_MICROMIPS;
- MCA.setELFHeaderEFlags(Flags);
forbidModuleDirective();
}
@@ -589,6 +839,13 @@ void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
forbidModuleDirective();
}
+void MipsTargetELFStreamer::setUsesMicroMips() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags |= ELF::EF_MIPS_MICROMIPS;
+ MCA.setELFHeaderEFlags(Flags);
+}
+
void MipsTargetELFStreamer::emitDirectiveSetMips16() {
MCAssembler &MCA = getStreamer().getAssembler();
unsigned Flags = MCA.getELFHeaderEFlags();
@@ -610,8 +867,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
MCContext &Context = MCA.getContext();
MCStreamer &OS = getStreamer();
- MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHT_REL);
+ MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, 0);
MCSymbol *Sym = Context.getOrCreateSymbol(Name);
const MCSymbolRefExpr *ExprRef =
@@ -760,8 +1016,11 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
MCInst TmpInst;
TmpInst.setOpcode(Mips::LUi);
TmpInst.addOperand(MCOperand::createReg(Mips::GP));
- const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::create(
- "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_HI, MCA.getContext());
+ const MCExpr *HiSym = MipsMCExpr::create(
+ MipsMCExpr::MEK_HI,
+ MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
+ MCA.getContext()),
+ MCA.getContext());
TmpInst.addOperand(MCOperand::createExpr(HiSym));
getStreamer().EmitInstruction(TmpInst, STI);
@@ -770,8 +1029,11 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
TmpInst.setOpcode(Mips::ADDiu);
TmpInst.addOperand(MCOperand::createReg(Mips::GP));
TmpInst.addOperand(MCOperand::createReg(Mips::GP));
- const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::create(
- "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_LO, MCA.getContext());
+ const MCExpr *LoSym = MipsMCExpr::create(
+ MipsMCExpr::MEK_LO,
+ MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
+ MCA.getContext()),
+ MCA.getContext());
TmpInst.addOperand(MCOperand::createExpr(LoSym));
getStreamer().EmitInstruction(TmpInst, STI);
@@ -786,9 +1048,10 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
forbidModuleDirective();
}
-void MipsTargetELFStreamer::emitDirectiveCpRestore(
- SmallVector<MCInst, 3> &StoreInsts, int Offset) {
- MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset);
+bool MipsTargetELFStreamer::emitDirectiveCpRestore(
+ int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
// .cprestore offset
// When PIC mode is enabled and the O32 ABI is used, this directive expands
// to:
@@ -798,10 +1061,12 @@ void MipsTargetELFStreamer::emitDirectiveCpRestore(
// Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
// is used in non-PIC mode.
if (!Pic || (getABI().IsN32() || getABI().IsN64()))
- return;
+ return true;
- for (const MCInst &Inst : StoreInsts)
- getStreamer().EmitInstruction(Inst, STI);
+ // Store the $gp on the stack.
+ emitStoreWithImmOffset(Mips::SW, Mips::GP, Mips::SP, Offset, GetATReg, IDLoc,
+ STI);
+ return true;
}
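A hedged sketch of how a caller might drive the new emitDirectiveCpRestore signature (everything here is illustrative: the lambda body, and Offset/IDLoc/STI stand in for whatever state the caller, e.g. the asm parser, already has):

    // GetATReg is only called when the offset does not fit in a simm16 and the
    // sw $gp, offset($sp) expansion needs a scratch register; returning 0
    // makes the expansion bail out.
    auto GetATReg = [&]() -> unsigned { return Mips::AT; }; // stand-in
    getTargetStreamer().emitDirectiveCpRestore(Offset, GetATReg, IDLoc, &STI);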
void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -812,54 +1077,55 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
return;
+ forbidModuleDirective();
+
MCAssembler &MCA = getStreamer().getAssembler();
MCInst Inst;
// Either store the old $gp in a register or on the stack
if (IsReg) {
// move $save, $gpreg
- Inst.setOpcode(Mips::OR64);
- Inst.addOperand(MCOperand::createReg(RegOrOffset));
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+ emitRRR(Mips::OR64, RegOrOffset, Mips::GP, Mips::ZERO, SMLoc(), &STI);
} else {
// sd $gpreg, offset($sp)
- Inst.setOpcode(Mips::SD);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createReg(Mips::SP));
- Inst.addOperand(MCOperand::createImm(RegOrOffset));
+ emitRRI(Mips::SD, Mips::GP, Mips::SP, RegOrOffset, SMLoc(), &STI);
}
- getStreamer().EmitInstruction(Inst, STI);
- Inst.clear();
- const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create(
- &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
- const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create(
- &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
+ if (getABI().IsN32()) {
+ MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
+ const MipsMCExpr *HiExpr = MipsMCExpr::create(
+ MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
+ MCA.getContext());
+ const MipsMCExpr *LoExpr = MipsMCExpr::create(
+ MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
+ MCA.getContext());
+
+ // lui $gp, %hi(__gnu_local_gp)
+ emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+
+ // addiu $gp, $gp, %lo(__gnu_local_gp)
+ emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+ SMLoc(), &STI);
+
+ return;
+ }
+
+ const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
+ MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+ MCA.getContext());
+ const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
+ MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+ MCA.getContext());
// lui $gp, %hi(%neg(%gp_rel(funcSym)))
- Inst.setOpcode(Mips::LUi);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createExpr(HiExpr));
- getStreamer().EmitInstruction(Inst, STI);
- Inst.clear();
+ emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
// addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
- Inst.setOpcode(Mips::ADDiu);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createExpr(LoExpr));
- getStreamer().EmitInstruction(Inst, STI);
- Inst.clear();
+ emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+ SMLoc(), &STI);
// daddu $gp, $gp, $funcreg
- Inst.setOpcode(Mips::DADDu);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createReg(Mips::GP));
- Inst.addOperand(MCOperand::createReg(RegNo));
- getStreamer().EmitInstruction(Inst, STI);
-
- forbidModuleDirective();
+ emitRRR(Mips::DADDu, Mips::GP, Mips::GP, RegNo, SMLoc(), &STI);
}
void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
deleted file mode 100644
index 56db450f6961..000000000000
--- a/lib/Target/Mips/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/Mips/Makefile ----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMipsCodeGen
-TARGET = Mips
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \
- MipsGenAsmWriter.inc MipsGenFastISel.inc \
- MipsGenDAGISel.inc MipsGenCallingConv.inc \
- MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \
- MipsGenDisassemblerTables.inc \
- MipsGenMCPseudoLowering.inc MipsGenAsmMatcher.inc
-
-DIRS = InstPrinter Disassembler AsmParser TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 400f6eef3fb0..2f0933277e81 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -14,6 +14,7 @@
class MMR6Arch<string opstr> {
string Arch = "micromipsr6";
string BaseOpcode = opstr;
+ string DecoderNamespace = "MicroMipsR6";
}
// Class used for microMIPS32r6 and microMIPS64r6 instructions.
@@ -22,6 +23,24 @@ class MicroMipsR6Inst16 : PredicateControl {
let InsnPredicates = [HasMicroMips32r6];
}
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings can only be disambiguated by comparing their field values.
+
+class MMDecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+ string DecoderNamespace = "MicroMipsR6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
class BC16_FM_MM16R6 {
bits<10> offset;
@@ -52,6 +71,32 @@ class POOL16C_JALRC_FM_MM16R6<bits<5> op> {
let Inst{4-0} = op;
}
+class POP35_BOVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+class POP37_BNVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
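A minimal C++ rendering of the word layout these two format classes describe (encodePOP35or37 is illustrative, not TableGen output; the field widths and the 0b011101/0b011111 majors follow the let Inst{...} assignments above):

    #include <cstdint>

    // Pack a POP35 (bovc) or POP37 (bnvc) word: major opcode in bits 31-26,
    // rt in bits 25-21, rs in bits 20-16, 16-bit offset in bits 15-0.
    static uint32_t encodePOP35or37(unsigned Major6, unsigned Rt, unsigned Rs,
                                    uint16_t Offset) {
      return ((Major6 & 0x3f) << 26) | ((Rt & 0x1f) << 21) |
             ((Rs & 0x1f) << 16) | Offset;
    }

    // e.g. bovc: encodePOP35or37(0b011101, rt, rs, off)
    //      bnvc: encodePOP35or37(0b011111, rt, rs, off)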
+
class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> {
bits<5> imm;
@@ -449,7 +494,8 @@ class LOAD_UPPER_IMM_FM_MMR6 {
let Inst{15-0} = imm16;
}
-class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
+class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> rt;
bits<16> offset;
@@ -461,7 +507,8 @@ class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
let Inst{15-0} = offset;
}
-class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
+class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
bits<5> rt;
bits<16> offset;
@@ -473,6 +520,37 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
let Inst{15-0} = offset;
}
+class POOL32A_JALRC_FM_MMR6<string instr_asm, bits<10> funct>
+ : MipsR6Inst, MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_EXT_INS_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> size;
+ bits<5> pos;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct>
: MMR6Arch<instr_asm> {
bits<32> Inst;
@@ -751,7 +829,7 @@ class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0000;
}
-class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> {
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
bits<3> rt;
bits<3> rs;
@@ -860,3 +938,157 @@ class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
let Inst{10-9} = fmt;
let Inst{8-0} = funct;
}
+
+class POOL32A_TLBINV_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = 0x0;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_MFTC0_FM_MMR6<string instr_asm, bits<5> funct, bits<6> opcode>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = opcode;
+}
+
+class POOL32F_MFTC1_FM_MMR6<string instr_asm, bits<8> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-14} = 0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32A_MFTC2_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> impl;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = impl;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class CMP_BRANCH_2R_OFF16_FM_MMR6<string opstr, bits<6> funct>
+ : MipsR6Inst, MMR6Arch<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32B_LWP_SWP_FM_MMR6<bits<4> funct> : MipsR6Inst {
+ bits<5> rd;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<12> offset = addr{11-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x8;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-0} = offset;
+}
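A small sketch of how the 21-bit addr operand above decomposes (splitLwpSwpAddr is illustrative; the widths mirror the bits<...> slices in the class, and the mem_simm12 operand used by the matching LWP/SWP descriptors later in the patch is why the displacement is treated as signed):

    #include <cstdint>

    // Decompose the 21-bit LWP/SWP memory operand: bits 20-16 give the base
    // register and bits 11-0 the signed 12-bit displacement (bits 15-12 of the
    // operand are unused; the funct field occupies those instruction bits).
    static void splitLwpSwpAddr(uint32_t Addr21, unsigned &Base, int &Offset) {
      Base = (Addr21 >> 16) & 0x1f;
      Offset = Addr21 & 0xfff;
      if (Offset & 0x800) // sign-extend the simm12
        Offset -= 0x1000;
    }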
+
+class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
+ bits<5> rs;
+ bits<21> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = rs;
+ let Inst{20-0} = offset;
+}
+
+class POOL32I_BRANCH_COP_1_2_FM_MMR6<string instr_asm, bits<5> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = offset;
+}
+
+class LDWC1_SDWC1_FM_MMR6<string instr_asm, bits<6> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> ft;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = funct;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class POOL32B_LDWC2_SDWC2_FM_MMR6<string instr_asm, bits<4> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<11> offset = addr{10-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b001000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11} = 0;
+ let Inst{10-0} = offset;
+}
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 31b5db036daa..2b636cfe3bfe 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -11,6 +11,13 @@
//
//===----------------------------------------------------------------------===//
+def brtarget21_mm : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTarget21OpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget21MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
def brtarget26_mm : Operand<OtherVT> {
let EncoderMethod = "getBranchTarget26OpValueMM";
let OperandType = "OPERAND_PCREL";
@@ -18,6 +25,13 @@ def brtarget26_mm : Operand<OtherVT> {
let ParserMatchClass = MipsJumpTargetAsmOperand;
}
+def brtargetr6 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValueMMR6";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTargetMM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Encodings
@@ -40,12 +54,26 @@ class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>;
class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>;
class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>;
class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">;
-class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011101>;
-class BNEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011111>;
-class BGTZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b111000>;
-class BLTZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<0b111000>;
-class BGEZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<0b110000>;
-class BLEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b110000>;
+class BEQZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"beqzc", 0b100000>;
+class BNEZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"bnezc", 0b101000>;
+class BGEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgec", 0b111001>;
+class BGEUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgeuc", 0b110000>,
+ DecodeDisambiguates<"BlezGroupBranchMMR6">;
+class BLTC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltc", 0b110001>;
+class BLTUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltuc", 0b111000>,
+ DecodeDisambiguates<"BgtzGroupBranchMMR6">;
+class BEQC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"beqc", 0b011101>;
+class BNEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bnec", 0b011111>;
+class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"beqzalc", 0b011101>;
+class BNEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bnezalc", 0b011111>;
+class BGTZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bgtzalc", 0b111000>,
+ MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BLTZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bltzalc", 0b111000>,
+ MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BGEZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bgezalc", 0b110000>,
+ MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
+class BLEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"blezalc", 0b110000>,
+ MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
class CACHE_MMR6_ENC : CACHE_PREF_FM_MMR6<0b001000, 0b0110>;
class CLO_MMR6_ENC : POOL32A_2R_FM_MMR6<0b0100101100>;
class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>;
@@ -63,14 +91,27 @@ class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>;
class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
+class LWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x1>;
class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>;
class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
+class MFC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfc0", 0b00011, 0b111100>;
+class MFC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfc1", 0b10000000>;
+class MFC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfc2", 0b0100110100>;
+class MFHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfhc0", 0b00011, 0b110100>;
+class MFHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfhc1", 0b11000000>;
+class MFHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfhc2", 0b1000110100>;
class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>;
class MUH_MMR6_ENC : ARITH_FM_MMR6<"muh", 0x58>;
class MULU_MMR6_ENC : ARITH_FM_MMR6<"mulu", 0x98>;
class MUHU_MMR6_ENC : ARITH_FM_MMR6<"muhu", 0xd8>;
+class MTC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mtc0", 0b01011, 0b111100>;
+class MTC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mtc1", 0b10100000>;
+class MTC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mtc2", 0b0101110100>;
+class MTHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mthc0", 0b01011, 0b110100>;
+class MTHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mthc1", 0b11100000>;
+class MTHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mthc2", 0b1001110100>;
class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
@@ -89,6 +130,7 @@ class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
+class SWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x9>;
class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
@@ -134,6 +176,7 @@ class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
+class JALRC_HB_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc.hb", 0b0001111100>;
class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>;
class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>;
class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>;
@@ -150,11 +193,15 @@ class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>;
class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>;
class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>;
class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>;
-class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>;
-class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>;
+class SELNEZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.s", 0, 0b001111000>;
+class SELNEZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.d", 1, 0b001111000>;
class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>;
class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>;
-
+class EXT_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ext", 0b101100>;
+class INS_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ins", 0b001100>;
+class JALRC_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc", 0b0000111100>;
+class BOVC_MMR6_ENC : POP35_BOVC_FM_MMR6<"bovc">;
+class BNVC_MMR6_ENC : POP37_BNVC_FM_MMR6<"bnvc">;
class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
@@ -168,10 +215,24 @@ class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
+class TLBINV_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinv", 0x10d>;
+class TLBINVF_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinvf", 0x14d>;
+class DVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"dvp", 0b0001100101>;
+class EVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"evp", 0b0011100101>;
+class BC1EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1eqzc", 0b01000>;
+class BC1NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1nezc", 0b01001>;
+class BC2EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2eqzc", 0b01010>;
+class BC2NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2nezc", 0b01011>;
+class LDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"ldc1", 0b101111>;
+class SDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"sdc1", 0b101110>;
+class LDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"ldc2", 0b0010>;
+class SDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"sdc2", 0b1010>;
+class LWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"lwc2", 0b0000>;
+class SWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"swc2", 0b1000>;
class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd,
RegisterOperand GPROpnd>
- : BRANCH_DESC_BASE, MMR6Arch<instr_asm> {
+ : BRANCH_DESC_BASE {
dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
dag OutOperandList = (outs);
string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
@@ -208,6 +269,27 @@ class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm,
list<Register> Defs = [RA];
}
+class CMP_CBR_2R_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd,
+ RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+ list<Register> Defs = [AT];
+}
+
+class BGEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgec", brtarget_mm,
+ GPR32Opnd>;
+class BGEUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgeuc", brtarget_mm,
+ GPR32Opnd>;
+class BLTC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltc", brtarget_mm,
+ GPR32Opnd>;
+class BLTUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltuc", brtarget_mm,
+ GPR32Opnd>;
+class BEQC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"beqc", brtarget_mm,
+ GPR32Opnd>;
+class BNEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bnec", brtarget_mm,
+ GPR32Opnd>;
+
/// Floating Point Instructions
class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
@@ -252,12 +334,12 @@ class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
//===----------------------------------------------------------------------===//
class ADD_MMR6_DESC : ArithLogicR<"add", GPR32Opnd>;
-class ADDIU_MMR6_DESC : ArithLogicI<"addiu", simm16, GPR32Opnd>;
+class ADDIU_MMR6_DESC : ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16, add>;
class ADDU_MMR6_DESC : ArithLogicR<"addu", GPR32Opnd>;
-class MUL_MMR6_DESC : ArithLogicR<"mul", GPR32Opnd>;
-class MUH_MMR6_DESC : ArithLogicR<"muh", GPR32Opnd>;
-class MULU_MMR6_DESC : ArithLogicR<"mulu", GPR32Opnd>;
-class MUHU_MMR6_DESC : ArithLogicR<"muhu", GPR32Opnd>;
+class MUL_MMR6_DESC : ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>;
+class MUH_MMR6_DESC : ArithLogicR<"muh", GPR32Opnd, 1, II_MUH, mulhs>;
+class MULU_MMR6_DESC : ArithLogicR<"mulu", GPR32Opnd, 1, II_MULU>;
+class MUHU_MMR6_DESC : ArithLogicR<"muhu", GPR32Opnd, 1, II_MUHU, mulhu>;
class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd>
: BRANCH_DESC_BASE, MMR6Arch<instr_asm> {
@@ -476,6 +558,32 @@ class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
class ADDIUPC_MMR6_DESC : PCREL_MMR6_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>;
class LWPC_MMR6_DESC: PCREL_MMR6_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>;
+class LWP_MMR6_DESC : MMR6Arch<"lwp"> {
+ dag OutOperandList = (outs regpair:$rd);
+ dag InOperandList = (ins mem_simm12:$addr);
+ string AsmString = !strconcat("lwp", "\t$rd, $addr");
+ list<dag> Pattern = [];
+ InstrItinClass Itin = NoItinerary;
+ ComplexPattern Addr = addr;
+ Format f = FrmI;
+ string BaseOpcode = "lwp";
+ string DecoderMethod = "DecodeMemMMImm12";
+ bit mayLoad = 1;
+}
+
+class SWP_MMR6_DESC : MMR6Arch<"swp"> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins regpair:$rd, mem_simm12:$addr);
+ string AsmString = !strconcat("swp", "\t$rd, $addr");
+ list<dag> Pattern = [];
+ InstrItinClass Itin = NoItinerary;
+ ComplexPattern Addr = addr;
+ Format f = FrmI;
+ string BaseOpcode = "swp";
+ string DecoderMethod = "DecodeMemMMImm12";
+ bit mayStore = 1;
+}
+
class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
: MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
@@ -499,17 +607,38 @@ class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst {
class WAIT_MMR6_DESC : WaitMM<"wait">;
class SSNOP_MMR6_DESC : Barrier<"ssnop">;
class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>;
-class DIV_MMR6_DESC : ArithLogicR<"div", GPR32Opnd>;
-class DIVU_MMR6_DESC : ArithLogicR<"divu", GPR32Opnd>;
-class MOD_MMR6_DESC : ArithLogicR<"mod", GPR32Opnd>;
-class MODU_MMR6_DESC : ArithLogicR<"modu", GPR32Opnd>;
-class AND_MMR6_DESC : ArithLogicR<"and", GPR32Opnd>;
-class ANDI_MMR6_DESC : ArithLogicI<"andi", simm16, GPR32Opnd>;
-class NOR_MMR6_DESC : ArithLogicR<"nor", GPR32Opnd>;
-class OR_MMR6_DESC : ArithLogicR<"or", GPR32Opnd>;
-class ORI_MMR6_DESC : ArithLogicI<"ori", simm16, GPR32Opnd>;
-class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd>;
-class XORI_MMR6_DESC : ArithLogicI<"xori", simm16, GPR32Opnd>;
+
+class DIVMOD_MMR6_DESC_BASE<string opstr, RegisterOperand GPROpnd,
+ SDPatternOperator OpNode=null_frag>
+ : MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set GPROpnd:$rd, (OpNode GPROpnd:$rs, GPROpnd:$rt))];
+ string BaseOpcode = opstr;
+ Format f = FrmR;
+ let isCommutable = 0;
+ let isReMaterializable = 1;
+
+ // This instruction doesn't trap division by zero itself. We must insert
+ // teq instructions as well.
+ bit usesCustomInserter = 1;
+}
+class DIV_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"div", GPR32Opnd, sdiv>;
+class DIVU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"divu", GPR32Opnd, udiv>;
+class MOD_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"mod", GPR32Opnd, srem>;
+class MODU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"modu", GPR32Opnd, urem>;
+class AND_MMR6_DESC : ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>;
+class ANDI_MMR6_DESC : ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>;
+class NOR_MMR6_DESC : LogicNOR<"nor", GPR32Opnd>;
+class OR_MMR6_DESC : ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>;
+class ORI_MMR6_DESC : ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+ or> {
+ int AddedComplexity = 1;
+}
+class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>;
+class XORI_MMR6_DESC : ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+ immZExt16, xor>;
class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO,
SDPatternOperator OpNode = null_frag,
@@ -536,6 +665,155 @@ class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO>
class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd>;
class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd>;
+class MTC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC> {
+ dag InOperandList = (ins SrcRC:$rt, uimm3:$sel);
+ dag OutOperandList = (outs DstRC:$rs);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+}
+class MTC1_MMR6_DESC_BASE<
+ string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary, SDPatternOperator OpNode = null_frag>
+ : MipsR6Inst {
+ dag InOperandList = (ins SrcRC:$rt);
+ dag OutOperandList = (outs DstRC:$fs);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ list<dag> Pattern = [(set DstRC:$fs, (OpNode SrcRC:$rt))];
+ Format f = FrmFR;
+ InstrItinClass Itinerary = Itin;
+ string BaseOpcode = opstr;
+}
+class MTC1_64_MMR6_DESC_BASE<
+ string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary> : MipsR6Inst {
+ dag InOperandList = (ins DstRC:$fs_in, SrcRC:$rt);
+ dag OutOperandList = (outs DstRC:$fs);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ InstrItinClass Itinerary = Itin;
+ string BaseOpcode = opstr;
+ // $fs_in is part of a white lie to work around a widespread bug in the FPU
+ // implementation. See expandBuildPairF64 for details.
+ let Constraints = "$fs = $fs_in";
+}
+class MTC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC> {
+ dag InOperandList = (ins SrcRC:$rt);
+ dag OutOperandList = (outs DstRC:$impl);
+ string AsmString = !strconcat(opstr, "\t$rt, $impl");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+}
+
+class MTC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mtc0", COP0Opnd, GPR32Opnd>;
+class MTC1_MMR6_DESC : MTC1_MMR6_DESC_BASE<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, HARDFLOAT;
+class MTC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mtc2", COP2Opnd, GPR32Opnd>;
+class MTHC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mthc0", COP0Opnd, GPR32Opnd>;
+class MTHC1_D32_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", AFGR64Opnd, GPR32Opnd>,
+ HARDFLOAT, FGR_32;
+class MTHC1_D64_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", FGR64Opnd, GPR32Opnd>,
+ HARDFLOAT, FGR_64;
+class MTHC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mthc2", COP2Opnd, GPR32Opnd>;
+
+class MFC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC> {
+ dag InOperandList = (ins SrcRC:$rs, uimm3:$sel);
+ dag OutOperandList = (outs DstRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+}
+class MFC1_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag> : MipsR6Inst {
+ dag InOperandList = (ins SrcRC:$fs);
+ dag OutOperandList = (outs DstRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $fs");
+ list<dag> Pattern = [(set DstRC:$rt, (OpNode SrcRC:$fs))];
+ Format f = FrmFR;
+ InstrItinClass Itinerary = Itin;
+ string BaseOpcode = opstr;
+}
+class MFC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+ RegisterOperand SrcRC> {
+ dag InOperandList = (ins SrcRC:$impl);
+ dag OutOperandList = (outs DstRC:$rt);
+ string AsmString = !strconcat(opstr, "\t$rt, $impl");
+ list<dag> Pattern = [];
+ Format f = FrmFR;
+ string BaseOpcode = opstr;
+}
+class MFC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfc0", GPR32Opnd, COP0Opnd>;
+class MFC1_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, HARDFLOAT;
+class MFC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfc2", GPR32Opnd, COP2Opnd>;
+class MFHC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfhc0", GPR32Opnd, COP0Opnd>;
+class MFHC1_D32_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, AFGR64Opnd,
+ II_MFHC1>, HARDFLOAT, FGR_32;
+class MFHC1_D64_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, FGR64Opnd,
+ II_MFHC1>, HARDFLOAT, FGR_64;
+class MFHC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfhc2", GPR32Opnd, COP2Opnd>;
+
+class LDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+ dag InOperandList = (ins mem_mm_16:$addr);
+ dag OutOperandList = (outs FGR64Opnd:$ft);
+ string AsmString = !strconcat("ldc1", "\t$ft, $addr");
+ list<dag> Pattern = [(set FGR64Opnd:$ft, (load addrimm16:$addr))];
+ Format f = FrmFI;
+ InstrItinClass Itinerary = II_LDC1;
+ string BaseOpcode = "ldc1";
+ bit mayLoad = 1;
+ let DecoderMethod = "DecodeFMemMMR2";
+}
+
+class SDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+ dag InOperandList = (ins FGR64Opnd:$ft, mem_mm_16:$addr);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat("sdc1", "\t$ft, $addr");
+ list<dag> Pattern = [(store FGR64Opnd:$ft, addrimm16:$addr)];
+ Format f = FrmFI;
+ InstrItinClass Itinerary = II_SDC1;
+ string BaseOpcode = "sdc1";
+ bit mayStore = 1;
+ let DecoderMethod = "DecodeFMemMMR2";
+}
+
+class LDC2_LWC2_MMR6_DESC_BASE<string opstr> {
+ dag OutOperandList = (outs COP2Opnd:$rt);
+ dag InOperandList = (ins mem_mm_11:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ list<dag> Pattern = [(set COP2Opnd:$rt, (load addrimm11:$addr))];
+ Format f = FrmFI;
+ InstrItinClass Itinerary = NoItinerary;
+ string BaseOpcode = opstr;
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeFMemCop2MMR6";
+}
+class LDC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"ldc2">;
+class LWC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"lwc2">;
+
+class SDC2_SWC2_MMR6_DESC_BASE<string opstr> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins COP2Opnd:$rt, mem_mm_11:$addr);
+ string AsmString = !strconcat(opstr, "\t$rt, $addr");
+ list<dag> Pattern = [(store COP2Opnd:$rt, addrimm11:$addr)];
+ Format f = FrmFI;
+ InstrItinClass Itinerary = NoItinerary;
+ string BaseOpcode = opstr;
+ bit mayStore = 1;
+ string DecoderMethod = "DecodeFMemCop2MMR6";
+}
+class SDC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"sdc2">;
+class SWC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"swc2">;
+
/// Floating Point Instructions
class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
InstrItinClass Itin, bit isComm,
@@ -633,69 +911,69 @@ class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd,
multiclass CMP_CC_MMR6<bits<6> format, string Typestr,
RegisterOperand FGROpnd> {
- def CMP_AF_#NAME : POOL32F_CMP_FM<
+ def CMP_AF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>,
- CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_UN_#NAME : POOL32F_CMP_FM<
+ def CMP_UN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>,
- CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_EQ_#NAME : POOL32F_CMP_FM<
+ def CMP_EQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>,
- CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_UEQ_#NAME : POOL32F_CMP_FM<
+ def CMP_UEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>,
- CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_LT_#NAME : POOL32F_CMP_FM<
+ def CMP_LT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>,
- CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_ULT_#NAME : POOL32F_CMP_FM<
+ def CMP_ULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>,
- CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_LE_#NAME : POOL32F_CMP_FM<
+ def CMP_LE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>,
- CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_ULE_#NAME : POOL32F_CMP_FM<
+ def CMP_ULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>,
- CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SAF_#NAME : POOL32F_CMP_FM<
+ def CMP_SAF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>,
- CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SUN_#NAME : POOL32F_CMP_FM<
+ def CMP_SUN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>,
- CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SEQ_#NAME : POOL32F_CMP_FM<
+ def CMP_SEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>,
- CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SUEQ_#NAME : POOL32F_CMP_FM<
+ def CMP_SUEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>,
- CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SLT_#NAME : POOL32F_CMP_FM<
+ def CMP_SLT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>,
- CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SULT_#NAME : POOL32F_CMP_FM<
+ def CMP_SULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>,
- CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SLE_#NAME : POOL32F_CMP_FM<
+ def CMP_SLE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>,
- CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
- def CMP_SULE_#NAME : POOL32F_CMP_FM<
+ def CMP_SULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
!strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>,
- CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT,
ISA_MICROMIPS32R6;
}
@@ -769,8 +1047,8 @@ class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
-class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
-class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+class SELNEZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
+class SELNEZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>;
class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
@@ -785,7 +1063,7 @@ class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd>;
class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO>
: MMR6Arch<instr_asm>, MipsR6Inst {
dag OutOperandList = (outs);
- dag InOperandList = (ins RO:$rt, mem_mm_9:$addr);
+ dag InOperandList = (ins RO:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
string DecoderMethod = "DecodeStoreEvaOpMM";
bit mayStore = 1;
@@ -797,7 +1075,7 @@ class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd>;
class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> :
MMR6Arch<instr_asm>, MipsR6Inst {
dag OutOperandList = (outs RO:$rt);
- dag InOperandList = (ins mem_mm_12:$addr);
+ dag InOperandList = (ins mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
string DecoderMethod = "DecodeMemMMImm9";
bit mayLoad = 1;
@@ -805,30 +1083,42 @@ class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> :
class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd>;
class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd>;
class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
- MMR6Arch<"addu16">;
+ MMR6Arch<"addu16"> {
+ int AddedComplexity = 1;
+}
class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- MMR6Arch<"and16">;
+ MMR6Arch<"and16"> {
+ int AddedComplexity = 1;
+}
class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
MMR6Arch<"andi16">;
-class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16">;
+class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16"> {
+ int AddedComplexity = 1;
+}
class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
- MMR6Arch<"or16">;
+ MMR6Arch<"or16"> {
+ int AddedComplexity = 1;
+}
class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
MMR6Arch<"sll16">;
class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
MMR6Arch<"srl16">;
-class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"srl16">,
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"break16">,
MicroMipsR6Inst16;
-class LI16_MMR6_DESC : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>,
- MMR6Arch<"srl16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
-class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"srl16">,
+class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
+ MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
MicroMipsR6Inst16;
class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16">, MMR6Arch<"sdbbp16">,
MicroMipsR6Inst16;
class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- MMR6Arch<"sdbbp16">, MicroMipsR6Inst16;
+ MMR6Arch<"subu16">, MicroMipsR6Inst16 {
+ int AddedComplexity = 1;
+}
class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- MMR6Arch<"sdbbp16">, MicroMipsR6Inst16;
+ MMR6Arch<"xor16"> {
+ int AddedComplexity = 1;
+}
class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
dag OutOperandList = (outs GPR32Opnd:$rt);
@@ -854,9 +1144,9 @@ class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{
class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
dag OutOperandList = (outs);
- dag InOperandList = (ins i32imm:$stype);
+ dag InOperandList = (ins uimm5:$stype);
string AsmString = !strconcat("sync", "\t$stype");
- list<dag> Pattern = [(MipsSync imm:$stype)];
+ list<dag> Pattern = [(MipsSync immZExt5:$stype)];
InstrItinClass Itinerary = NoItinerary;
bit HasSideEffects = 1;
}
@@ -924,6 +1214,111 @@ class SWSP_MMR6_DESC
let mayStore = 1;
}
+class JALRC_HB_MMR6_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = !strconcat("jalrc.hb", "\t$rt, $rs");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = NoItinerary;
+ Format Form = FrmJ;
+ bit isIndirectBranch = 1;
+ bit hasDelaySlot = 0;
+}
+
+class TLBINV_MMR6_DESC_BASE<string opstr> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins);
+ string AsmString = opstr;
+ list<dag> Pattern = [];
+}
+
+class TLBINV_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinv">;
+class TLBINVF_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinvf">;
+
+class DVPEVP_MMR6_DESC_BASE<string opstr> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = !strconcat(opstr, "\t$rs");
+ list<dag> Pattern = [];
+}
+
+class DVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"dvp">;
+class EVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"evp">;
+
+class BEQZC_MMR6_DESC
+ : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21_mm, GPR32Opnd>,
+ MMR6Arch<"beqzc">;
+class BNEZC_MMR6_DESC
+ : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21_mm, GPR32Opnd>,
+ MMR6Arch<"bnezc">;
+
+class BRANCH_COP1_MMR6_DESC_BASE<string opstr> :
+ InstSE<(outs), (ins FGR64Opnd:$rt, brtarget_mm:$offset),
+ !strconcat(opstr, "\t$rt, $offset"), [], II_BC1CCZ, FrmI>,
+ HARDFLOAT, BRANCH_DESC_BASE {
+ list<Register> Defs = [AT];
+}
+
+class BC1EQZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1eqzc">;
+class BC1NEZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1nezc">;
+
+class BRANCH_COP2_MMR6_DESC_BASE<string opstr> : BRANCH_DESC_BASE {
+ dag InOperandList = (ins COP2Opnd:$rt, brtarget_mm:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(opstr, "\t$rt, $offset");
+ list<Register> Defs = [AT];
+}
+
+class BC2EQZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2eqzc">;
+class BC2NEZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2nezc">;
+
+class EXT_MMR6_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_plus1:$size);
+ string AsmString = !strconcat("ext", "\t$rt, $rs, $pos, $size");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsExt GPR32Opnd:$rs, imm:$pos,
+ imm:$size))];
+ InstrItinClass Itinerary = II_EXT;
+ Format Form = FrmR;
+ string BaseOpcode = "ext";
+}
+
+class INS_MMR6_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_inssize_plus1:$size,
+ GPR32Opnd:$src);
+ string AsmString = !strconcat("ins", "\t$rt, $rs, $pos, $size");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsIns GPR32Opnd:$rs, imm:$pos,
+ imm:$size, GPR32Opnd:$src))];
+ InstrItinClass Itinerary = II_INS;
+ Format Form = FrmR;
+ string BaseOpcode = "ins";
+ string Constraints = "$src = $rt";
+}
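For readers unfamiliar with the two nodes being matched, a scalar sketch of what ext and ins compute (plain C++, illustrating the architectural semantics rather than the MipsExt/MipsIns SelectionDAG nodes themselves):

    #include <cstdint>

    // ext rt, rs, pos, size: extract a size-bit field of rs starting at pos.
    static uint32_t extractField(uint32_t Rs, unsigned Pos, unsigned Size) {
      uint32_t Mask = (Size == 32) ? 0xffffffffu : ((1u << Size) - 1);
      return (Rs >> Pos) & Mask;
    }

    // ins rt, rs, pos, size: insert the low size bits of rs into rt at pos,
    // preserving the remaining bits of rt (here $src, tied to $rt above).
    static uint32_t insertField(uint32_t Rt, uint32_t Rs, unsigned Pos,
                                unsigned Size) {
      uint32_t Mask = ((Size == 32) ? 0xffffffffu : ((1u << Size) - 1)) << Pos;
      return (Rt & ~Mask) | ((Rs << Pos) & Mask);
    }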
+
+class JALRC_MMR6_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = !strconcat("jalrc", "\t$rt, $rs");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_JALRC;
+ bit isCall = 1;
+ bit hasDelaySlot = 0;
+ list<Register> Defs = [RA];
+}
+
+class BOVC_BNVC_MMR6_DESC_BASE<string instr_asm, Operand opnd,
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE {
+ dag InOperandList = (ins GPROpnd:$rt, GPROpnd:$rs, opnd:$offset);
+ dag OutOperandList = (outs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $offset");
+ list<Register> Defs = [AT];
+}
+
+class BOVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bovc", brtargetr6, GPR32Opnd>;
+class BNVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bnvc", brtargetr6, GPR32Opnd>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -946,22 +1341,18 @@ def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6;
def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6;
def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6;
def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def BEQZC_MMR6 : R6MMR6Rel, BEQZC_MMR6_ENC, BEQZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC,
ISA_MICROMIPS32R6;
+def BNEZC_MMR6 : R6MMR6Rel, BNEZC_MMR6_ENC, BNEZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC,
ISA_MICROMIPS32R6;
def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC,
ISA_MICROMIPS32R6;
def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC,
ISA_MICROMIPS32R6;
-def BGEZALC_MMR6 : R6MMR6Rel, BGEZALC_MMR6_ENC, BGEZALC_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def BGTZALC_MMR6 : R6MMR6Rel, BGTZALC_MMR6_ENC, BGTZALC_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def BLEZALC_MMR6 : R6MMR6Rel, BLEZALC_MMR6_ENC, BLEZALC_MMR6_DESC,
- ISA_MICROMIPS32R6;
-def BLTZALC_MMR6 : R6MMR6Rel, BLTZALC_MMR6_ENC, BLTZALC_MMR6_DESC,
- ISA_MICROMIPS32R6;
def BNEZALC_MMR6 : R6MMR6Rel, BNEZALC_MMR6_ENC, BNEZALC_MMR6_DESC,
ISA_MICROMIPS32R6;
def BREAK_MMR6 : StdMMR6Rel, BRK_MMR6_DESC, BRK_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -985,8 +1376,30 @@ def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
ISA_MICROMIPS32R6;
def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWP_MMR6 : StdMMR6Rel, LWP_MMR6_ENC, LWP_MMR6_DESC, ISA_MICROMIPS32R6;
def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC0_MMR6 : StdMMR6Rel, MTC0_MMR6_ENC, MTC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTC1_MMR6 : StdMMR6Rel, MTC1_MMR6_DESC, MTC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC2_MMR6 : StdMMR6Rel, MTC2_MMR6_ENC, MTC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC0_MMR6 : R6MMR6Rel, MTHC0_MMR6_ENC, MTHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC1_D32_MMR6 : StdMMR6Rel, MTHC1_D32_MMR6_DESC, MTHC1_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+ def MTHC1_D64_MMR6 : R6MMR6Rel, MTHC1_D64_MMR6_DESC, MTHC1_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+}
+def MTHC2_MMR6 : StdMMR6Rel, MTHC2_MMR6_ENC, MTHC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFC0_MMR6 : StdMMR6Rel, MFC0_MMR6_ENC, MFC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFC1_MMR6 : StdMMR6Rel, MFC1_MMR6_DESC, MFC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MFC2_MMR6 : StdMMR6Rel, MFC2_MMR6_ENC, MFC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC0_MMR6 : R6MMR6Rel, MFHC0_MMR6_ENC, MFHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC1_D32_MMR6 : StdMMR6Rel, MFHC1_D32_MMR6_DESC, MFHC1_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+ def MFHC1_D64_MMR6 : StdMMR6Rel, MFHC1_D64_MMR6_DESC, MFHC1_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+}
+def MFHC2_MMR6 : StdMMR6Rel, MFHC2_MMR6_ENC, MFHC2_MMR6_DESC, ISA_MICROMIPS32R6;
def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -1011,6 +1424,7 @@ def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWP_MMR6 : StdMMR6Rel, SWP_MMR6_ENC, SWP_MMR6_DESC, ISA_MICROMIPS32R6;
def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
ISA_MICROMIPS32R6;
@@ -1174,6 +1588,11 @@ def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
ISA_MICROMIPS32R6;
def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC,
ISA_MICROMIPS32R6;
+def JALRC_HB_MMR6 : R6MMR6Rel, JALRC_HB_MMR6_ENC, JALRC_HB_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def EXT_MMR6 : StdMMR6Rel, EXT_MMR6_ENC, EXT_MMR6_DESC, ISA_MICROMIPS32R6;
+def INS_MMR6 : StdMMR6Rel, INS_MMR6_ENC, INS_MMR6_DESC, ISA_MICROMIPS32R6;
+def JALRC_MMR6 : R6MMR6Rel, JALRC_MMR6_ENC, JALRC_MMR6_DESC, ISA_MICROMIPS32R6;
def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1188,22 +1607,69 @@ def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
-def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
-def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
+def SEL_S_MMR6 : R6MMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def SEL_D_MMR6 : R6MMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def SELEQZ_S_MMR6 : R6MMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
+def SELEQZ_D_MMR6 : R6MMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC,
+def SELNEZ_S_MMR6 : R6MMR6Rel, SELNEZ_S_MMR6_ENC, SELNEZ_S_MMR6_DESC,
ISA_MICROMIPS32R6;
-def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC,
+def SELNEZ_D_MMR6 : R6MMR6Rel, SELNEZ_D_MMR6_ENC, SELNEZ_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC,
ISA_MICROMIPS32R6;
+def TLBINV_MMR6 : StdMMR6Rel, TLBINV_MMR6_ENC, TLBINV_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TLBINVF_MMR6 : StdMMR6Rel, TLBINVF_MMR6_ENC, TLBINVF_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def DVP_MMR6 : R6MMR6Rel, DVP_MMR6_ENC, DVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def EVP_MMR6 : R6MMR6Rel, EVP_MMR6_ENC, EVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC1EQZC_MMR6 : R6MMR6Rel, BC1EQZC_MMR6_DESC, BC1EQZC_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BC1NEZC_MMR6 : R6MMR6Rel, BC1NEZC_MMR6_DESC, BC1NEZC_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BC2EQZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2EQZC_MMR6_ENC, BC2EQZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BC2NEZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2NEZC_MMR6_ENC, BC2NEZC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+ def LDC1_D64_MMR6 : StdMMR6Rel, LDC1_D64_MMR6_DESC, LDC1_MMR6_ENC,
+ ISA_MICROMIPS32R6 {
+ let BaseOpcode = "LDC164";
+ }
+ def SDC1_D64_MMR6 : StdMMR6Rel, SDC1_D64_MMR6_DESC, SDC1_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+}
+def LDC2_MMR6 : StdMMR6Rel, LDC2_MMR6_ENC, LDC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def SDC2_MMR6 : StdMMR6Rel, SDC2_MMR6_ENC, SDC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWC2_MMR6 : StdMMR6Rel, LWC2_MMR6_ENC, LWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def SWC2_MMR6 : StdMMR6Rel, SWC2_MMR6_ENC, SWC2_MMR6_DESC, ISA_MICROMIPS32R6;
}
+def BOVC_MMR6 : R6MMR6Rel, BOVC_MMR6_ENC, BOVC_MMR6_DESC, ISA_MICROMIPS32R6,
+ MMDecodeDisambiguatedBy<"POP35GroupBranchMMR6">;
+def BNVC_MMR6 : R6MMR6Rel, BNVC_MMR6_ENC, BNVC_MMR6_DESC, ISA_MICROMIPS32R6,
+ MMDecodeDisambiguatedBy<"POP37GroupBranchMMR6">;
+def BGEC_MMR6 : R6MMR6Rel, BGEC_MMR6_ENC, BGEC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEUC_MMR6 : R6MMR6Rel, BGEUC_MMR6_ENC, BGEUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTC_MMR6 : R6MMR6Rel, BLTC_MMR6_ENC, BLTC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTUC_MMR6 : R6MMR6Rel, BLTUC_MMR6_ENC, BLTUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BEQC_MMR6 : R6MMR6Rel, BEQC_MMR6_ENC, BEQC_MMR6_DESC, ISA_MICROMIPS32R6,
+ DecodeDisambiguates<"POP35GroupBranchMMR6">;
+def BNEC_MMR6 : R6MMR6Rel, BNEC_MMR6_ENC, BNEC_MMR6_DESC, ISA_MICROMIPS32R6,
+ DecodeDisambiguates<"POP37GroupBranchMMR6">;
+def BGEZALC_MMR6 : R6MMR6Rel, BGEZALC_MMR6_ENC, BGEZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BGTZALC_MMR6 : R6MMR6Rel, BGTZALC_MMR6_ENC, BGTZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BLEZALC_MMR6 : R6MMR6Rel, BLEZALC_MMR6_ENC, BLEZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def BLTZALC_MMR6 : R6MMR6Rel, BLTZALC_MMR6_ENC, BLTZALC_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+
//===----------------------------------------------------------------------===//
//
// MicroMips instruction aliases
@@ -1222,6 +1688,45 @@ def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"rdhwr $rt, $rs",
(RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mtc0 $rt, $rs",
+ (MTC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mthc0 $rt, $rs",
+ (MTHC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfc0 $rt, $rs",
+ (MFC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfhc0 $rt, $rs",
+ (MFHC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc.hb $rs", (JALRC_HB_MMR6 RA, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"dvp", (DVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"evp", (EVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc $rs", (JALRC_MMR6 RA, GPR32Opnd:$rs), 1>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $rt, $imm",
+ (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $imm",
+ (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $rt, $imm",
+ (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $imm",
+ (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $rt, $imm",
+ (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $imm",
+ (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+ ISA_MICROMIPS32R6;
+def : MipsInstAlias<"not $rt, $rs",
+ (NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
+ ISA_MICROMIPS32R6;
//===----------------------------------------------------------------------===//
//
@@ -1231,3 +1736,39 @@ def : MipsInstAlias<"rdhwr $rt, $rs",
def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
(SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBU_MMR6 GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+ (OR_MM (SELNEZ_MMR6 i32:$t, i32:$cond),
+ (SELEQZ_MMR6 i32:$f, i32:$cond))>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+ (SELNEZ_MMR6 i32:$t, i32:$cond)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+ (SELEQZ_MMR6 i32:$f, i32:$cond)>,
+ ISA_MICROMIPS32R6;
+
+defm : SelectInt_Pats<i32, OR_MM, XORI_MMR6, SLTi_MM, SLTiu_MM, SELEQZ_MMR6,
+ SELNEZ_MMR6, immZExt16, i32>, ISA_MICROMIPS32R6;
+
+defm S_MMR6 : Cmp_Pats<f32, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+defm D_MMR6 : Cmp_Pats<f64, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+ (ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
+ ISA_MICROMIPS32R6;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+ (ANDI_MMR6 GPR32:$src, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(i32 immZExt16:$imm),
+ (XORI_MMR6 ZERO, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPRMM16:$in),
+ (NOT16_MMR6 GPRMM16:$in)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPR32:$in),
+ (NOR_MMR6 GPR32Opnd:$in, ZERO)>, ISA_MICROMIPS32R6;
+// Patterns for load with a reg+imm operand.
+let AddedComplexity = 41 in {
+ def : LoadRegImmPat<LDC1_D64_MMR6, f64, load>, FGR_64, ISA_MICROMIPS32R6;
+ def : StoreRegImmPat<SDC1_D64_MMR6, f64>, FGR_64, ISA_MICROMIPS32R6;
+}
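A reading aid for the select patterns above, assuming SELNEZ/SELEQZ keep their
documented MIPSR6 semantics (the destination receives the first source when the
condition register is non-zero / zero, and 0 otherwise); all register numbers
below are placeholders:

// i32 %r = select %cond, %t, %f, with %t in $4, %f in $5, %cond in $6:
//   selnez $2, $4, $6      ; $2 = (cond != 0) ? t : 0
//   seleqz $3, $5, $6      ; $3 = (cond == 0) ? f : 0
//   or     $1, $2, $3      ; $1 = %r
// When either value is the constant zero (the immz patterns), a single selnez
// or seleqz suffices and the final or disappears.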
diff --git a/lib/Target/Mips/MicroMips64r6InstrFormats.td b/lib/Target/Mips/MicroMips64r6InstrFormats.td
index da305a2d508a..4add305522f7 100644
--- a/lib/Target/Mips/MicroMips64r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips64r6InstrFormats.td
@@ -71,16 +71,151 @@ class POOL32S_DALIGN_FM_MMR6 {
class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct>
: MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<5> rs;
bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b00;
+ let Inst{8-0} = funct;
+}
+
+class POOL32S_DMFTC0_FM_MMR6<string instr_asm, bits<5> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
bits<5> rs;
+ bits<3> sel;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32S_ARITH_FM_MMR6<string opstr, bits<9> funct>
+ : MMR6Arch<opstr> {
bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
bits<32> Inst;
let Inst{31-26} = 0b010110;
- let Inst{25-21} = rd;
+ let Inst{25-21} = rt;
let Inst{20-16} = rs;
- let Inst{15-11} = rt;
+ let Inst{15-11} = rd;
let Inst{10-9} = 0b00;
- let Inst{8-0} = funct;
+ let Inst{8-0} = funct;
+}
+
+class DADDIU_FM_MMR6<string opstr> : MMR6Arch<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
+
+class PCREL18_FM_MMR6<bits<3> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<18> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011110;
+ let Inst{25-21} = rt;
+ let Inst{20-18} = funct;
+ let Inst{17-0} = imm;
+}
+
+class POOL32S_2R_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32S_2RSA5B0_FM_MMR6<string instr_asm, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10-9} = 0b00;
+ let Inst{8-0} = funct;
+}
+
+class LD_SD_32_2R_OFFSET16_FM_MMR6<string instr_asm, bits<6> op>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class POOL32C_2R_OFFSET12_FM_MMR6<string instr_asm, bits<4> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<12> offset = addr{11-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-0} = offset;
+}
+
+class POOL32S_3R_FM_MMR6<string instr_asm, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-9} = 0b00;
+ let Inst{8-0} = funct;
}
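The POOL32S register-register formats defined above (POOL32S_ARITH_FM_MMR6,
POOL32S_2RSA5B0_FM_MMR6 and POOL32S_3R_FM_MMR6) differ only in whether bits
15-11 carry rd or the shift amount sa; restated as a diagram, with no
information beyond the let-assignments themselves:

//   31      26 25     21 20     16 15     11 10  9 8           0
//  +----------+---------+---------+---------+-----+-------------+
//  |  010110  |   rt    |   rs    |  rd/sa  | 0 0 |    funct    |
//  +----------+---------+---------+---------+-----+-------------+

The two-register forms (POOL32S_2R_FM_MMR6, POOL32S_DMFTC0_FM_MMR6) instead put
their minor opcode in bits 15-6 or 10-6 and terminate in the fixed 0b111100
field in bits 5-0.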
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index ec1aef86a942..87c41deb85af 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -28,6 +28,45 @@ class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>;
class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>;
class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>;
class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>;
+class DINSU_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b110100>;
+class DINSM_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b000100>;
+class DINS_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b001100>;
+class DMTC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmtc0", 0b01011>;
+class DMTC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmtc1", 0b10110000>;
+class DMTC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmtc2", 0b0111110100>;
+class DMFC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmfc0", 0b00011>;
+class DMFC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmfc1", 0b10010000>;
+class DMFC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmfc2", 0b0110110100>;
+class DADD_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dadd", 0b100010000>;
+class DADDIU_MM64R6_ENC : DADDIU_FM_MMR6<"daddiu">;
+class DADDU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"daddu", 0b101010000>;
+class LDPC_MMR646_ENC : PCREL18_FM_MMR6<0b110>;
+class DSUB_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsub", 0b110010000>;
+class DSUBU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsubu", 0b111010000>;
+class DMUL_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmul", 0b000011000>;
+class DMUH_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuh", 0b001011000>;
+class DMULU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmulu", 0b010011000>;
+class DMUHU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuhu", 0b011011000>;
+class DSBH_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dsbh", 0b0111101100>;
+class DSHD_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dshd", 0b1111101100>;
+class DSLL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll", 0b000000000>;
+class DSLL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll32", 0b000001000>;
+class DSLLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsllv", 0b000010000>;
+class DSRAV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrav", 0b010010000>;
+class DSRA_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra", 0b010000000>;
+class DSRA32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra32", 0b010000100>;
+class DCLO_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclo", 0b0100101100>;
+class DCLZ_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclz", 0b0101101100>;
+class DROTR_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr", 0b011000000>;
+class DROTR32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr32", 0b011001000>;
+class DROTRV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"drotrv", 0b011010000>;
+class LD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"ld", 0b110111>;
+class LLD_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lld", 0b0111>;
+class LWU_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lwu", 0b1110>;
+class SD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"sd", 0b110110>;
+class DSRL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl", 0b001000000>;
+class DSRL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl32", 0b001001000>;
+class DSRLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrlv", 0b001010000>;
//===----------------------------------------------------------------------===//
//
@@ -68,7 +107,7 @@ class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd,
// TODO: Add 'pos + size' constraint check to dext* instructions
// DEXT: 0 < pos + size <= 63
// DEXTM, DEXTU: 32 < pos + size <= 64
-class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5,
+class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5_report_uimm6,
uimm5_plus1, MipsExt>;
class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5,
uimm5_plus33, MipsExt>;
@@ -85,10 +124,189 @@ class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>;
-class DDIV_MM64R6_DESC : ArithLogicR<"ddiv", GPR32Opnd>;
-class DMOD_MM64R6_DESC : ArithLogicR<"dmod", GPR32Opnd>;
-class DDIVU_MM64R6_DESC : ArithLogicR<"ddivu", GPR32Opnd>;
-class DMODU_MM64R6_DESC : ArithLogicR<"dmodu", GPR32Opnd>;
+class DDIV_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddiv", GPR64Opnd, sdiv>;
+class DMOD_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmod", GPR64Opnd, srem>;
+class DDIVU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddivu", GPR64Opnd, udiv>;
+class DMODU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmodu", GPR64Opnd, urem>;
+
+class DCLO_MM64R6_DESC {
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ dag InOperandList = (ins GPR64Opnd:$rs);
+ string AsmString = !strconcat("dclo", "\t$rt, $rs");
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz (not GPR64Opnd:$rs)))];
+ InstrItinClass Itinerary = II_CLO;
+ Format Form = FrmR;
+ string BaseOpcode = "dclo";
+}
+
+class DCLZ_MM64R6_DESC {
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ dag InOperandList = (ins GPR64Opnd:$rs);
+ string AsmString = !strconcat("dclz", "\t$rt, $rs");
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz GPR64Opnd:$rs))];
+ InstrItinClass Itinerary = II_CLZ;
+ Format Form = FrmR;
+ string BaseOpcode = "dclz";
+}
+
+class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
+ uimm5_inssize_plus1, MipsIns>;
+class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64>;
+class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5, uimm5_inssize_plus1,
+ MipsIns>;
+class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd>;
+class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
+ II_DMTC1, bitconvert>;
+class DMTC2_MM64R6_DESC : MTC2_MMR6_DESC_BASE<"dmtc2", COP2Opnd, GPR64Opnd>;
+
+class DMFC0_MM64R6_DESC : MFC0_MMR6_DESC_BASE<"dmfc0", GPR64Opnd, COP0Opnd>;
+class DMFC1_MM64R6_DESC : MFC1_MMR6_DESC_BASE<"dmfc1", GPR64Opnd, FGR64Opnd,
+ II_DMFC1, bitconvert>;
+class DMFC2_MM64R6_DESC : MFC2_MMR6_DESC_BASE<"dmfc2", GPR64Opnd, COP2Opnd>;
+
+class DADD_MM64R6_DESC : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>;
+class DADDIU_MM64R6_DESC : ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+ II_DADDIU, immSExt16, add>,
+ IsAsCheapAsAMove;
+class DADDU_MM64R6_DESC : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>;
+
+class DSUB_DESC_BASE<string instr_asm, RegisterOperand RO,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator OpNode = null_frag>
+ : MipsR6Inst {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rs, RO:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rs, RO:$rt))];
+ InstrItinClass Itinerary = Itin;
+ Format Form = FrmR;
+ string BaseOpcode = instr_asm;
+ let isCommutable = 0;
+ let isReMaterializable = 1;
+ let TwoOperandAliasConstraint = "$rd = $rs";
+}
+class DSUB_MM64R6_DESC : DSUB_DESC_BASE<"dsub", GPR64Opnd, II_DSUB>;
+class DSUBU_MM64R6_DESC : DSUB_DESC_BASE<"dsubu", GPR64Opnd, II_DSUBU, sub>;
+
+class LDPC_MM64R6_DESC : PCREL_MMR6_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>;
+
+class MUL_MM64R6_DESC_BASE<string opstr, RegisterOperand GPROpnd,
+ InstrItinClass Itin = NoItinerary,
+ SDPatternOperator Op = null_frag> : MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+ string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+ InstrItinClass Itinerary = Itin;
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+}
+
+class DMUL_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
+class DMUH_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH,
+ mulhs>;
+class DMULU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMULU>;
+class DMUHU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU,
+ mulhu>;
+
+class DSBH_DSHD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ bit hasSideEffects = 0;
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = NoItinerary;
+ Format Form = FrmR;
+ string BaseOpcode = instr_asm;
+}
+
+class DSBH_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dsbh", GPR64Opnd>;
+class DSHD_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dshd", GPR64Opnd>;
+
+class SHIFT_ROTATE_IMM_MM64R6<string instr_asm, Operand ImmOpnd,
+ InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag,
+ SDPatternOperator PO = null_frag> {
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ dag InOperandList = (ins GPR64Opnd:$rs, ImmOpnd:$sa);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode GPR64Opnd:$rs, PO:$sa))];
+ InstrItinClass Itinerary = itin;
+ Format Form = FrmR;
+ string TwoOperandAliasConstraint = "$rs = $rt";
+ string BaseOpcode = instr_asm;
+}
+
+class SHIFT_ROTATE_REG_MM64R6<string instr_asm, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag> {
+ dag OutOperandList = (outs GPR64Opnd:$rd);
+ dag InOperandList = (ins GPR64Opnd:$rt, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+ list<dag> Pattern = [(set GPR64Opnd:$rd,
+ (OpNode GPR64Opnd:$rt, GPR32Opnd:$rs))];
+ InstrItinClass Itinerary = itin;
+ Format Form = FrmR;
+ string BaseOpcode = instr_asm;
+}
+
+class DSLL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll", uimm6, II_DSLL, shl,
+ immZExt6>;
+class DSLL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll32", uimm5, II_DSLL32>;
+class DSLLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsllv", II_DSLLV, shl>;
+class DSRAV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrav", II_DSRAV, sra>;
+class DSRA_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra", uimm6, II_DSRA, sra,
+ immZExt6>;
+class DSRA32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra32", uimm5, II_DSRA32>;
+class DROTR_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr", uimm6, II_DROTR,
+ rotr, immZExt6>;
+class DROTR32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr32", uimm5,
+ II_DROTR32>;
+class DROTRV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"drotrv", II_DROTRV, rotr>;
+class DSRL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl", uimm6, II_DSRL, srl,
+ immZExt6>;
+class DSRL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl32", uimm5, II_DSRL32>;
+class DSRLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrlv", II_DSRLV, srl>;
+
+class Load_MM64R6<string instr_asm, Operand MemOpnd, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag> {
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode addr:$addr))];
+ InstrItinClass Itinerary = itin;
+ Format Form = FrmI;
+ bit mayLoad = 1;
+ bit canFoldAsLoad = 1;
+ string BaseOpcode = instr_asm;
+}
+
+class LD_MM64R6_DESC : Load_MM64R6<"ld", mem_simm16, II_LD, load> {
+ string DecoderMethod = "DecodeMemMMImm16";
+}
+class LWU_MM64R6_DESC : Load_MM64R6<"lwu", mem_simm12, II_LWU, zextloadi32> {
+ string DecoderMethod = "DecodeMemMMImm12";
+}
+
+class LLD_MM64R6_DESC {
+ dag OutOperandList = (outs GPR64Opnd:$rt);
+ dag InOperandList = (ins mem_simm12:$addr);
+ string AsmString = "lld\t$rt, $addr";
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+ InstrItinClass Itinerary = II_LLD;
+ string BaseOpcode = "lld";
+ string DecoderMethod = "DecodeMemMMImm12";
+}
+
+class SD_MM64R6_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPR64Opnd:$rt, mem_simm16:$addr);
+ string AsmString = "sd\t$rt, $addr";
+ list<dag> Pattern = [(store GPR64Opnd:$rt, addr:$addr)];
+ InstrItinClass Itinerary = II_SD;
+ Format Form = FrmI;
+ bit mayStore = 1;
+ string BaseOpcode = "sd";
+ string DecoderMethod = "DecodeMemMMImm16";
+}
//===----------------------------------------------------------------------===//
//
@@ -116,4 +334,180 @@ let DecoderNamespace = "MicroMipsR6" in {
ISA_MICROMIPS64R6;
def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC,
ISA_MICROMIPS64R6;
+ def DINSU_MM64R6: R6MMR6Rel, DINSU_MM64R6_DESC, DINSU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DINSM_MM64R6: R6MMR6Rel, DINSM_MM64R6_DESC, DINSM_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DINS_MM64R6: R6MMR6Rel, DINS_MM64R6_DESC, DINS_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMTC0_MM64R6 : StdMMR6Rel, DMTC0_MM64R6_ENC, DMTC0_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DMTC1_MM64R6 : StdMMR6Rel, DMTC1_MM64R6_DESC, DMTC1_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMTC2_MM64R6 : StdMMR6Rel, DMTC2_MM64R6_ENC, DMTC2_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DMFC0_MM64R6 : StdMMR6Rel, DMFC0_MM64R6_ENC, DMFC0_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DMFC1_MM64R6 : StdMMR6Rel, DMFC1_MM64R6_DESC, DMFC1_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMFC2_MM64R6 : StdMMR6Rel, DMFC2_MM64R6_ENC, DMFC2_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DADD_MM64R6: StdMMR6Rel, DADD_MM64R6_DESC, DADD_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DADDIU_MM64R6: StdMMR6Rel, DADDIU_MM64R6_DESC, DADDIU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DADDU_MM64R6: StdMMR6Rel, DADDU_MM64R6_DESC, DADDU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def LDPC_MM64R6 : R6MMR6Rel, LDPC_MMR646_ENC, LDPC_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSUB_MM64R6 : StdMMR6Rel, DSUB_MM64R6_DESC, DSUB_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DSUBU_MM64R6 : StdMMR6Rel, DSUBU_MM64R6_DESC, DSUBU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUL_MM64R6 : R6MMR6Rel, DMUL_MM64R6_DESC, DMUL_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUH_MM64R6 : R6MMR6Rel, DMUH_MM64R6_DESC, DMUH_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMULU_MM64R6 : R6MMR6Rel, DMULU_MM64R6_DESC, DMULU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMUHU_MM64R6 : R6MMR6Rel, DMUHU_MM64R6_DESC, DMUHU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DSBH_MM64R6 : R6MMR6Rel, DSBH_MM64R6_ENC, DSBH_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSHD_MM64R6 : R6MMR6Rel, DSHD_MM64R6_ENC, DSHD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLL_MM64R6 : StdMMR6Rel, DSLL_MM64R6_ENC, DSLL_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLL32_MM64R6 : StdMMR6Rel, DSLL32_MM64R6_ENC, DSLL32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSLLV_MM64R6 : StdMMR6Rel, DSLLV_MM64R6_ENC, DSLLV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRAV_MM64R6 : StdMMR6Rel, DSRAV_MM64R6_ENC, DSRAV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRA_MM64R6 : StdMMR6Rel, DSRA_MM64R6_ENC, DSRA_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRA32_MM64R6 : StdMMR6Rel, DSRA32_MM64R6_ENC, DSRA32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DCLO_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLO_MM64R6_ENC, DCLO_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DCLZ_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLZ_MM64R6_ENC, DCLZ_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTR_MM64R6 : StdMMR6Rel, DROTR_MM64R6_ENC, DROTR_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTR32_MM64R6 : StdMMR6Rel, DROTR32_MM64R6_ENC, DROTR32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DROTRV_MM64R6 : StdMMR6Rel, DROTRV_MM64R6_ENC, DROTRV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LD_MM64R6 : StdMMR6Rel, LD_MM64R6_ENC, LD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LLD_MM64R6 : StdMMR6Rel, R6MMR6Rel, LLD_MM64R6_ENC, LLD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def LWU_MM64R6 : StdMMR6Rel, LWU_MM64R6_ENC, LWU_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def SD_MM64R6 : StdMMR6Rel, SD_MM64R6_ENC, SD_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRL_MM64R6 : StdMMR6Rel, DSRL_MM64R6_ENC, DSRL_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRL32_MM64R6 : StdMMR6Rel, DSRL32_MM64R6_ENC, DSRL32_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
+ def DSRLV_MM64R6 : StdMMR6Rel, DSRLV_MM64R6_ENC, DSRLV_MM64R6_DESC,
+ ISA_MICROMIPS64R6;
}
+
+//===----------------------------------------------------------------------===//
+//
+// Arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(MipsLo tglobaladdr:$in),
+ (DADDIU_MM64R6 ZERO_64, tglobaladdr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tblockaddress:$in),
+ (DADDIU_MM64R6 ZERO_64, tblockaddress:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tjumptable:$in),
+ (DADDIU_MM64R6 ZERO_64, tjumptable:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tconstpool:$in),
+ (DADDIU_MM64R6 ZERO_64, tconstpool:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (DADDIU_MM64R6 ZERO_64, tglobaltlsaddr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo texternalsym:$in),
+ (DADDIU_MM64R6 ZERO_64, texternalsym:$in)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tglobaladdr:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tblockaddress:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tjumptable:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tconstpool:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (DADDIU_MM64R6 GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
+ (DADDU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+ (DADDIU_MM64R6 GPR64:$lhs, imm:$imm)>, ISA_MICROMIPS64R6;
+
+
+def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DROTRV_MM64R6 GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+ ISA_MICROMIPS64R6;
+
+
+def : WrapperPat<tglobaladdr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tconstpool, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<texternalsym, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tblockaddress, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tjumptable, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tglobaltlsaddr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+
+// Carry pattern
+def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+ (DSUBU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(atomic_load_64 addr:$a), (LD_MM64R6 addr:$a)>, ISA_MICROMIPS64R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"dmtc0 $rt, $rd",
+ (DMTC0_MM64R6 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+def : MipsInstAlias<"dmfc0 $rt, $rd",
+ (DMFC0_MM64R6 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $rt, $imm",
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rt,
+ simm16_64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $imm",
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ simm16_64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+ (DADDIU_MM64R6 GPR64Opnd:$rt,
+ GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rs, $imm",
+ (DADDIU_MM64R6 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt, $rs",
+ (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt",
+ (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 0>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt, $rs",
+ (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt",
+ (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 0>,
+ ISA_MICROMIPS64R6;
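One non-obvious detail in the aliases above, stated as an assumption rather
than fact: InvertedImOperand64 is expected to negate the immediate when the
dsubu aliases are matched (mirroring the 32-bit subu-with-immediate alias), so
at the assembly level the expansion is roughly:

// dsubu $4, $5, 16    is accepted and encoded as    daddiu $4, $5, -16
// dsubu $4, 16        is accepted and encoded as    daddiu $4, $4, -16
// (register numbers are arbitrary; only the immediate negation matters)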
diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index f11c09abfc36..af6473c468d9 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -242,3 +242,61 @@ class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
let Inst{13-6} = op;
let Inst{5-0} = 0b111100;
}
+
+class POOL32I_IMMB0_FMT<string opstr, bits<5> op> : MMDSPInst<opstr> {
+ bits<16> offset;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = op;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = offset;
+}
+
+class POOL32A_2RBP_FMT<string opstr> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<2> bp;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = bp;
+ let Inst{13-6} = 0b00100010;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-10} = 0;
+ let Inst{9-0} = op;
+}
+
+class POOL32S_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> rd;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2R2B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = 0;
+ let Inst{10} = 0;
+ let Inst{9-0} = op;
+}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index b342e2371df4..f82f82fc7e45 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -155,6 +155,26 @@ class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+class APPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"append", 0b1000010101>;
+class MODSUB_MM_ENC : POOL32A_3RB0_FMT<"modsub", 0b1010010101>;
+class MULSA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"mulsa.w.ph", 0b10110010>;
+class MULSAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"mulsaq_s.w.ph", 0b11110010>;
+class BPOSGE32C_MMR3_ENC : POOL32I_IMMB0_FMT<"bposge32c", 0b11001>;
+class BITREV_MM_ENC : POOL32A_2R_FMT<"bitrev", 0b0011000100>;
+class BALIGN_MMR2_ENC : POOL32A_2RBP_FMT<"balign">;
+class BPOSGE32_MM_ENC : POOL32I_IMMB0_FMT<"bposge32", 0b11011>;
+class CMP_EQ_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.eq.ph", 0b0000000101>;
+class CMP_LE_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.le.ph", 0b0010000101>;
+class CMP_LT_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.lt.ph", 0b0001000101>;
+class CMPGDU_EQ_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.eq.qb", 0b0110000101>;
+class CMPGDU_LT_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.lt.qb", 0b0111000101>;
+class CMPGDU_LE_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.le.qb", 0b1000000101>;
+class CMPGU_EQ_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.eq.qb", 0b0011000101>;
+class CMPGU_LT_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.lt.qb", 0b0100000101>;
+class CMPGU_LE_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.le.qb", 0b0101000101>;
+class CMPU_EQ_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.eq.qb", 0b1001000101>;
+class CMPU_LT_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.lt.qb", 0b1010000101>;
+class CMPU_LE_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.le.qb", 0b1011000101>;
// Instruction desc.
class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
@@ -339,15 +359,15 @@ class RADDU_W_QB_MM_DESC {
class RDDSP_MM_DESC {
dag OutOperandList = (outs GPR32Opnd:$rt);
- dag InOperandList = (ins uimm16:$mask);
+ dag InOperandList = (ins uimm7:$mask);
string AsmString = !strconcat("rddsp", "\t$rt, $mask");
- list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))];
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))];
InstrItinClass Itinerary = NoItinerary;
}
class REPL_QB_MM_DESC {
dag OutOperandList = (outs DSPROpnd:$rt);
- dag InOperandList = (ins uimm16:$imm);
+ dag InOperandList = (ins uimm8:$imm);
string AsmString = !strconcat("repl.qb", "\t$rt, $imm");
list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))];
InstrItinClass Itinerary = NoItinerary;
@@ -368,6 +388,33 @@ class WRDSP_MM_DESC {
InstrItinClass Itinerary = NoItinerary;
}
+class BPOSGE32C_MMR3_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins brtarget1SImm16:$offset);
+ string AsmString = !strconcat("bposge32c", "\t$offset");
+ InstrItinClass Itinerary = NoItinerary;
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 0;
+}
+
+class BALIGN_MMR2_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm2:$bp, GPR32Opnd:$src);
+ string AsmString = !strconcat("balign", "\t$rt, $rs, $bp");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_balign GPR32Opnd:$src,
+ GPR32Opnd:$rs,
+ immZExt2:$bp))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$src = $rt";
+}
+
+class BITREV_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+ NoItinerary, GPR32Opnd>;
+
+class BPOSGE32_MM_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget_mm,
+ NoItinerary>;
+
// Instruction defs.
// microMIPS DSP Rev 1
def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
@@ -472,6 +519,20 @@ def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC;
def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC;
def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC;
def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC;
+def MODSUB_MM : DspMMRel, MODSUB_MM_ENC, MODSUB_DESC;
+def MULSAQ_S_W_PH_MM : DspMMRel, MULSAQ_S_W_PH_MM_ENC, MULSAQ_S_W_PH_DESC;
+def BITREV_MM : DspMMRel, BITREV_MM_ENC, BITREV_MM_DESC;
+def BPOSGE32_MM : DspMMRel, BPOSGE32_MM_ENC, BPOSGE32_MM_DESC,
+ ISA_MIPS1_NOT_32R6_64R6;
+def CMP_EQ_PH_MM : DspMMRel, CMP_EQ_PH_MM_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH_MM : DspMMRel, CMP_LT_PH_MM_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH_MM : DspMMRel, CMP_LE_PH_MM_ENC, CMP_LE_PH_DESC;
+def CMPGU_EQ_QB_MM : DspMMRel, CMPGU_EQ_QB_MM_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB_MM : DspMMRel, CMPGU_LT_QB_MM_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB_MM : DspMMRel, CMPGU_LE_QB_MM_ENC, CMPGU_LE_QB_DESC;
+def CMPU_EQ_QB_MM : DspMMRel, CMPU_EQ_QB_MM_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB_MM : DspMMRel, CMPU_LT_QB_MM_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB_MM : DspMMRel, CMPU_LE_QB_MM_ENC, CMPU_LE_QB_DESC;
// microMIPS DSP Rev 2
def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
ISA_DSPR2;
@@ -495,6 +556,13 @@ def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC,
def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2;
def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC,
ISA_DSPR2;
+def BALIGN_MMR2 : DspMMRel, BALIGN_MMR2_ENC, BALIGN_MMR2_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB_MMR2 : DspMMRel, CMPGDU_EQ_QB_MMR2_ENC, CMPGDU_EQ_QB_DESC,
+ ISA_DSPR2;
+def CMPGDU_LT_QB_MMR2 : DspMMRel, CMPGDU_LT_QB_MMR2_ENC, CMPGDU_LT_QB_DESC,
+ ISA_DSPR2;
+def CMPGDU_LE_QB_MMR2 : DspMMRel, CMPGDU_LE_QB_MMR2_ENC, CMPGDU_LE_QB_DESC,
+ ISA_DSPR2;
def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2;
def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2;
def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2;
@@ -526,3 +594,8 @@ def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2;
// Instruction alias.
def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>;
+def APPEND_MMR2 : DspMMRel, APPEND_MMR2_ENC, APPEND_DESC, ISA_DSPR2;
+def MULSA_W_PH_MMR2 : DspMMRel, MULSA_W_PH_MMR2_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+// microMIPS DSP Rev 3
+def BPOSGE32C_MMR3 : DspMMRel, BPOSGE32C_MMR3_ENC, BPOSGE32C_MMR3_DESC,
+ ISA_DSPR3;
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 756e6c92c1d1..7b0e00bd1c3c 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -17,12 +17,6 @@ def FMUL_MM : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
def FSUB_MM : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
ADDS_FM_MM<1, 0x70>;
-def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM_MM<0x27>;
-def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>,
- LW_FM_MM<0x26>;
-def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>;
-def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>,
- LW_FM_MM<0x2e>;
def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
@@ -114,10 +108,6 @@ def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
-def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
- MFC1_FM_MM<0xc0>, ISA_MIPS32R2, FGR_32;
-def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM_MM<0xe0>, ISA_MIPS32R2, FGR_32;
def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
MADDS_FM_MM<0x1>;
@@ -147,4 +137,33 @@ let AdditionalPredicates = [InMicroMips] in {
ROUND_W_FM_MM<0, 0x6c>;
def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+ def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<0xe0>, ISA_MIPS32R2, FGR_32;
+ def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM_MM<0xc0>, ISA_MIPS32R2, FGR_32;
+ let DecoderNamespace = "MicroMips", DecoderMethod = "DecodeFMemMMR2" in {
+ def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, mem_mm_16, II_LDC1, load>,
+ LW_FM_MM<0x2f>, FGR_32 {
+ let BaseOpcode = "LDC132";
+ }
+ def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_mm_16, II_SDC1, store>,
+ LW_FM_MM<0x2e>, FGR_32;
+ def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_mm_16, II_LWC1, load>,
+ LW_FM_MM<0x27>;
+ def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, mem_mm_16, II_SWC1, store>,
+ LW_FM_MM<0x26>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+let AdditionalPredicates = [InMicroMips] in {
+ // Patterns for loads/stores with a reg+imm operand.
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LDC1_MM, f64, load>, FGR_32;
+ def : StoreRegImmPat<SDC1_MM, f64>, FGR_32;
+ def : LoadRegImmPat<LWC1_MM, f32, load>;
+ def : StoreRegImmPat<SWC1_MM, f32>;
+ }
}
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index b736367ee5fa..79ef6482180d 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -278,7 +278,6 @@ class MOVEP_FM_MM16 {
class MMArch {
string Arch = "micromips";
- list<dag> Pattern = [];
}
class ADD_FM_MM<bits<6> op, bits<10> funct> : MMArch {
@@ -380,13 +379,15 @@ class SRLV_FM_MM<bits<10> funct, bit rotate> : MMArch {
class LW_FM_MM<bits<6> op> : MMArch {
bits<5> rt;
bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
bits<32> Inst;
let Inst{31-26} = op;
let Inst{25-21} = rt;
- let Inst{20-16} = addr{20-16};
- let Inst{15-0} = addr{15-0};
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
}
class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
@@ -674,7 +675,7 @@ class TEQI_FM_MM<bits<5> funct> : MMArch {
let Inst{15-0} = imm16;
}
-class LL_FM_MM<bits<4> funct> {
+class LL_FM_MM<bits<4> funct> : MMArch {
bits<5> rt;
bits<21> addr;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 99f0f446deab..f27370f57fa0 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,23 +1,8 @@
-def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+def addrimm11 : ComplexPattern<iPTR, 2, "selectIntAddr11MM", [frameindex]>;
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddr12MM", [frameindex]>;
+def addrimm16 : ComplexPattern<iPTR, 2, "selectIntAddr16MM", [frameindex]>;
def addrimm4lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrLSL2MM", [frameindex]>;
-def simm4 : Operand<i32> {
- let DecoderMethod = "DecodeSimm4";
-}
-def simm7 : Operand<i32>;
-def li_simm7 : Operand<i32> {
- let DecoderMethod = "DecodeLiSimm7";
-}
-
-def simm12 : Operand<i32> {
- let DecoderMethod = "DecodeSimm12";
-}
-
-def uimm6_lsl2 : Operand<i32> {
- let EncoderMethod = "getUImm6Lsl2Encoding";
- let DecoderMethod = "DecodeUImm6Lsl2";
-}
-
def simm9_addiusp : Operand<i32> {
let EncoderMethod = "getSImm9AddiuspValue";
let DecoderMethod = "DecodeSimm9SP";
@@ -60,9 +45,15 @@ def MicroMipsMemGPRMM16AsmOperand : AsmOperandClass {
let PredicateMethod = "isMemWithGRPMM16Base";
}
+// Define the classes of pointers used by microMIPS.
+// The numbers must match those in MipsRegisterInfo::MipsPtrClass.
+def ptr_gpr16mm_rc : PointerLikeRegClass<1>;
+def ptr_sp_rc : PointerLikeRegClass<2>;
+def ptr_gp_rc : PointerLikeRegClass<3>;
+
class mem_mm_4_generic : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPRMM16, simm4);
+ let MIOperandInfo = (ops ptr_gpr16mm_rc, simm4);
let OperandType = "OPERAND_MEMORY";
let ParserMatchClass = MicroMipsMemGPRMM16AsmOperand;
}
@@ -86,32 +77,48 @@ def MicroMipsMemSPAsmOperand : AsmOperandClass {
let PredicateMethod = "isMemWithUimmWordAlignedOffsetSP<7>";
}
+def MicroMipsMemGPAsmOperand : AsmOperandClass {
+ let Name = "MicroMipsMemGP";
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmWordAlignedOffsetGP<9>";
+}
+
def mem_mm_sp_imm5_lsl2 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32:$base, simm5:$offset);
+ let MIOperandInfo = (ops ptr_sp_rc:$base, simm5:$offset);
let OperandType = "OPERAND_MEMORY";
let ParserMatchClass = MicroMipsMemSPAsmOperand;
let EncoderMethod = "getMemEncodingMMSPImm5Lsl2";
}
-def mem_mm_gp_imm7_lsl2 : Operand<i32> {
+def mem_mm_gp_simm7_lsl2 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPRMM16:$base, simm7:$offset);
+ let MIOperandInfo = (ops ptr_gp_rc:$base, simm7_lsl2:$offset);
let OperandType = "OPERAND_MEMORY";
+ let ParserMatchClass = MicroMipsMemGPAsmOperand;
let EncoderMethod = "getMemEncodingMMGPImm7Lsl2";
}
def mem_mm_9 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32, simm9);
+ let MIOperandInfo = (ops ptr_rc, simm9);
let EncoderMethod = "getMemEncodingMMImm9";
- let ParserMatchClass = MipsMemAsmOperand;
+ let ParserMatchClass = MipsMemSimm9AsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_11 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops GPR32, simm11);
+ let EncoderMethod = "getMemEncodingMMImm11";
+ let ParserMatchClass = MipsMemSimm11AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
def mem_mm_12 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32, simm12);
+ let MIOperandInfo = (ops ptr_rc, simm12);
let EncoderMethod = "getMemEncodingMMImm12";
let ParserMatchClass = MipsMemAsmOperand;
let OperandType = "OPERAND_MEMORY";
@@ -119,9 +126,9 @@ def mem_mm_12 : Operand<i32> {
def mem_mm_16 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32, simm16);
+ let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncodingMMImm16";
- let ParserMatchClass = MipsMemAsmOperand;
+ let ParserMatchClass = MipsMemSimm16AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -135,7 +142,7 @@ def MipsMemUimm4AsmOperand : AsmOperandClass {
def mem_mm_4sp : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32, uimm8);
+ let MIOperandInfo = (ops ptr_sp_rc, uimm8);
let EncoderMethod = "getMemEncodingMMImm4sp";
let ParserMatchClass = MipsMemUimm4AsmOperand;
let OperandType = "OPERAND_MEMORY";
@@ -216,7 +223,7 @@ def movep_regpair : Operand<i32> {
let ParserMatchClass = MovePRegPairAsmOperand;
let PrintMethod = "printRegisterList";
let DecoderMethod = "DecodeMovePRegPair";
- let MIOperandInfo = (ops GPR32Opnd, GPR32Opnd);
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
}
class MovePMM16<string opstr, RegisterOperand RO> :
@@ -230,6 +237,7 @@ MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
def RegPairAsmOperand : AsmOperandClass {
let Name = "RegPair";
let ParserMethod = "parseRegisterPair";
+ let PredicateMethod = "isRegPair";
}
def regpair : Operand<i32> {
@@ -237,12 +245,12 @@ def regpair : Operand<i32> {
let ParserMatchClass = RegPairAsmOperand;
let PrintMethod = "printRegisterPair";
let DecoderMethod = "DecodeRegPairOperand";
- let MIOperandInfo = (ops GPR32Opnd, GPR32Opnd);
+ let MIOperandInfo = (ops ptr_rc, ptr_rc);
}
class StorePairMM<string opstr, InstrItinClass Itin = NoItinerary,
ComplexPattern Addr = addr> :
- InstSE<(outs), (ins regpair:$rt, mem_mm_12:$addr),
+ InstSE<(outs), (ins regpair:$rt, mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayStore = 1;
@@ -250,7 +258,7 @@ class StorePairMM<string opstr, InstrItinClass Itin = NoItinerary,
class LoadPairMM<string opstr, InstrItinClass Itin = NoItinerary,
ComplexPattern Addr = addr> :
- InstSE<(outs regpair:$rt), (ins mem_mm_12:$addr),
+ InstSE<(outs regpair:$rt), (ins mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayLoad = 1;
@@ -264,7 +272,7 @@ class LLBaseMM<string opstr, RegisterOperand RO> :
}
class LLEBaseMM<string opstr, RegisterOperand RO> :
- InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ InstSE<(outs RO:$rt), (ins mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
let mayLoad = 1;
@@ -279,7 +287,7 @@ class SCBaseMM<string opstr, RegisterOperand RO> :
}
class SCEBaseMM<string opstr, RegisterOperand RO> :
- InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem_simm9:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
let DecoderMethod = "DecodeMemMMImm9";
let mayStore = 1;
@@ -287,10 +295,10 @@ class SCEBaseMM<string opstr, RegisterOperand RO> :
}
class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
- InstrItinClass Itin = NoItinerary> :
- InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ InstrItinClass Itin = NoItinerary, DAGOperand MO = mem_mm_12> :
+ InstSE<(outs RO:$rt), (ins MO:$addr),
!strconcat(opstr, "\t$rt, $addr"),
- [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI> {
+ [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let canFoldAsLoad = 1;
let mayLoad = 1;
@@ -615,7 +623,7 @@ def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
LOAD_STORE_FM_MM16<0x2a>;
def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
-def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_imm7_lsl2>,
+def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
LOAD_GP_FM_MM16<0x19>;
def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
LOAD_STORE_SP_FM_MM16<0x12>;
@@ -629,7 +637,7 @@ def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
-def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16,
+def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
IsAsCheapAsAMove;
def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
ISA_MICROMIPS32_NOT_MIPS32R6;
@@ -677,11 +685,11 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
SLTI_FM_MM<0x2c>;
def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd>,
ADDI_FM_MM<0x34>;
- def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd>,
- ADDI_FM_MM<0x14>;
- def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd>,
- ADDI_FM_MM<0x1c>;
- def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM_MM;
+ def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+ or>, ADDI_FM_MM<0x14>;
+ def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+ immZExt16, xor>, ADDI_FM_MM<0x1c>;
+ def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM;
def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
LW_FM_MM<0xc>;
@@ -709,9 +717,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x26c>;
def SDIV_MM : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ac>;
+ MULT_FM_MM<0x2ac>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV_MM : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM_MM<0x2ec>;
+ MULT_FM_MM<0x2ec>, ISA_MIPS1_NOT_32R6_64R6;
/// Arithmetic Instructions with PC and Immediate
def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM;
@@ -730,16 +738,24 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
SRLV_FM_MM<0x90, 0>;
def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
- SRA_FM_MM<0xc0, 0>;
+ SRA_FM_MM<0xc0, 0> {
+ list<dag> Pattern = [(set GPR32Opnd:$rd,
+ (rotr GPR32Opnd:$rt, immZExt5:$shamt))];
+ }
def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
- SRLV_FM_MM<0xd0, 0>;
+ SRLV_FM_MM<0xd0, 0> {
+ list<dag> Pattern = [(set GPR32Opnd:$rd,
+ (rotr GPR32Opnd:$rt, GPR32Opnd:$rs))];
+ }
/// Load and Store Instructions - aligned
let DecoderMethod = "DecodeMemMMImm16" in {
- def LB_MM : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>;
- def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>;
- def LH_MM : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>;
- def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>;
+ def LB_MM : LoadMemory<"lb", GPR32Opnd, mem_mm_16>, MMRel, LW_FM_MM<0x7>;
+ def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16>, MMRel, LW_FM_MM<0x5>;
+ def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM_MM<0xf>;
+ def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+ MMRel, LW_FM_MM<0xd>;
def LW_MM : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
def SB_MM : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>;
def SH_MM : Store<"sh", GPR32Opnd>, MMRel, LW_FM_MM<0xe>;
@@ -749,19 +765,22 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
let DecoderMethod = "DecodeMemMMImm9" in {
def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
- def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
- def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
- def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
- def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
- def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
- def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9gpr>,
+ def LHE_MM : LoadMemory<"lhe", GPR32Opnd, mem_simm9>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
+ def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
+ def LWE_MM : LoadMemory<"lwe", GPR32Opnd, mem_simm9>,
+ POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
+ def SBE_MM : StoreMemory<"sbe", GPR32Opnd, mem_simm9>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
+ def SHE_MM : StoreMemory<"she", GPR32Opnd, mem_simm9>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
+ def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9>,
POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
}
def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
- def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>;
-
/// Load and Store Instructions - unaligned
def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
LWL_FM_MM<0x0>;
@@ -772,13 +791,13 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
LWL_FM_MM<0x9>;
let DecoderMethod = "DecodeMemMMImm9" in {
- def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_12>,
+ def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9>,
POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
- def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_12>,
+ def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9>,
POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
- def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_12>,
+ def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9>,
POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
- def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_12>,
+ def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9>,
POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6;
}
@@ -845,10 +864,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
- def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
- MipsExt>, EXT_FM_MM<0x2c>;
- def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
- EXT_FM_MM<0x0c>;
+ def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
+ immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
+ def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
+ MipsIns>, EXT_FM_MM<0x0c>;
/// Jump Instructions
let DecoderMethod = "DecodeJumpTargetMM" in {
@@ -857,7 +876,8 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
}
- def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
+ def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
/// Jump Instructions - Short Delay Slot
@@ -891,7 +911,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
/// Control Instructions
def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM;
- def SYSCALL_MM : MMRel, SYS_FT<"syscall">, SYS_FM_MM;
+ def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10>, SYS_FM_MM;
def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM;
def ERET_MM : MMRel, ER_FT<"eret">, ER_FM_MM<0x3cd>;
def DERET_MM : MMRel, ER_FT<"deret">, ER_FM_MM<0x38d>;
@@ -901,12 +921,12 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
ISA_MIPS32R2;
/// Trap Instructions
- def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
- def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
- def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM_MM<0x10>;
- def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM_MM<0x20>;
- def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM_MM<0x28>;
- def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM_MM<0x30>;
+ def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4>, TEQ_FM_MM<0x0>;
+ def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4>, TEQ_FM_MM<0x08>;
+ def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm4>, TEQ_FM_MM<0x10>;
+ def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4>, TEQ_FM_MM<0x20>;
+ def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm4>, TEQ_FM_MM<0x28>;
+ def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4>, TEQ_FM_MM<0x30>;
def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM_MM<0x0e>;
def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM_MM<0x09>;
@@ -931,9 +951,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
let DecoderMethod = "DecodePrefeOpMM" in {
def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9>,
- CACHE_PREFE_FM_MM<0x18, 0x2>;
+ CACHE_PREFE_FM_MM<0x18, 0x2>;
def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9>,
- CACHE_PREFE_FM_MM<0x18, 0x3>;
+ CACHE_PREFE_FM_MM<0x18, 0x3>;
}
def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>;
def EHB_MM : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>;
@@ -944,7 +964,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def TLBWI_MM : MMRel, TLB<"tlbwi">, COP0_TLB_FM_MM<0x8d>;
def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>;
- def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM;
+ def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10>, SDBBP_FM_MM;
def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
}
@@ -952,54 +972,80 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
let DecoderNamespace = "MicroMips" in {
def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>,
RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWU_MM : MMRel, LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU,
+ mem_simm12>, LL_FM_MM<0xe>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
}
-let Predicates = [InMicroMips] in {
-
//===----------------------------------------------------------------------===//
// MicroMips arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
-def : MipsPat<(i32 immLi16:$imm),
- (LI16_MM immLi16:$imm)>;
-def : MipsPat<(i32 immSExt16:$imm),
- (ADDiu_MM ZERO, immSExt16:$imm)>;
-def : MipsPat<(i32 immZExt16:$imm),
- (ORi_MM ZERO, immZExt16:$imm)>;
-def : MipsPat<(not GPR32:$in),
- (NOR_MM GPR32Opnd:$in, ZERO)>;
-
-def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
- (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
-def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
- (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
-def : MipsPat<(add GPR32:$src, immSExt16:$imm),
- (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
-
-def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
- (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
-def : MipsPat<(and GPR32:$src, immZExt16:$imm),
- (ANDi_MM GPR32:$src, immZExt16:$imm)>;
-
-def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
- (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
-def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
- (SLL_MM GPR32:$src, immZExt5:$imm)>;
-
-def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
- (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
-def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
- (SRL_MM GPR32:$src, immZExt5:$imm)>;
-
-def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
- (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
-def : MipsPat<(store GPR32:$src, addr:$addr),
- (SW_MM GPR32:$src, addr:$addr)>;
-
-def : MipsPat<(load addrimm4lsl2:$addr),
- (LW16_MM addrimm4lsl2:$addr)>;
-def : MipsPat<(load addr:$addr),
- (LW_MM addr:$addr)>;
+let Predicates = [InMicroMips] in {
+ def : MipsPat<(i32 immLi16:$imm),
+ (LI16_MM immLi16:$imm)>;
+ def : MipsPat<(i32 immSExt16:$imm),
+ (ADDiu_MM ZERO, immSExt16:$imm)>;
+ def : MipsPat<(i32 immZExt16:$imm),
+ (ORi_MM ZERO, immZExt16:$imm)>;
+
+ def : MipsPat<(not GPRMM16:$in),
+ (NOT16_MM GPRMM16:$in)>;
+ def : MipsPat<(not GPR32:$in),
+ (NOR_MM GPR32Opnd:$in, ZERO)>;
+
+ def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
+ (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
+ def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
+ (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
+ def : MipsPat<(add GPR32:$src, immSExt16:$imm),
+ (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
+
+ def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+ (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
+ def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+ (ANDi_MM GPR32:$src, immZExt16:$imm)>;
+
+ def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
+ (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+ def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
+ (SLL_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
+ (SLLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
+ (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+ def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
+ (SRL_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
+ (SRLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
+ (SRA_MM GPR32:$src, immZExt5:$imm)>;
+ def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
+ (SRAV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+ def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+ (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
+ def : MipsPat<(store GPR32:$src, addr:$addr),
+ (SW_MM GPR32:$src, addr:$addr)>;
+
+ def : MipsPat<(load addrimm4lsl2:$addr),
+ (LW16_MM addrimm4lsl2:$addr)>;
+ def : MipsPat<(load addr:$addr),
+ (LW_MM addr:$addr)>;
+ def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
+}
+
+let AddedComplexity = 40 in {
+ def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
+ (LH_MM addrRegImm:$a)>;
+}
+def : MipsPat<(atomic_load_16 addr:$a),
+ (LH_MM addr:$a)>;
+def : MipsPat<(i32 (extloadi16 addr:$src)),
+ (LHu_MM addr:$src)>;
//===----------------------------------------------------------------------===//
// MicroMips instruction aliases
@@ -1011,24 +1057,62 @@ class UncondBranchMMPseudo<string opstr> :
def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS;
+let Predicates = [InMicroMips] in {
+ def SDIV_MM_Pseudo : MultDivPseudo<SDIV_MM, ACC64, GPR32Opnd, MipsDivRem,
+ II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def UDIV_MM_Pseudo : MultDivPseudo<UDIV_MM, ACC64, GPR32Opnd, MipsDivRemU,
+ II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+
def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
-}
-
-let Predicates = [InMicroMips] in {
-def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
-def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
-def : MipsInstAlias<"teq $rs, $rt",
- (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : MipsInstAlias<"tge $rs, $rt",
- (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : MipsInstAlias<"tgeu $rs, $rt",
- (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : MipsInstAlias<"tlt $rs, $rt",
- (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : MipsInstAlias<"tltu $rs, $rt",
- (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : MipsInstAlias<"tne $rs, $rt",
- (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"teq $rs, $rt",
+ (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tge $rs, $rt",
+ (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tgeu $rs, $rt",
+ (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tlt $rs, $rt",
+ (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tltu $rs, $rt",
+ (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"tne $rs, $rt",
+ (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+ def : MipsInstAlias<"sll $rd, $rt, $rs",
+ (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sra $rd, $rt, $rs",
+ (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"srl $rd, $rt, $rs",
+ (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sll $rd, $rt",
+ (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"sra $rd, $rt",
+ (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"srl $rd, $rt",
+ (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+ def : MipsInstAlias<"sll $rd, $shamt",
+ (SLL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"sra $rd, $shamt",
+ (SRA_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"srl $rd, $shamt",
+ (SRL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+ def : MipsInstAlias<"rotr $rt, $imm",
+ (ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
+ def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
+ def : MipsInstAlias<"and $rs, $rt, $imm",
+ (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+ def : MipsInstAlias<"and $rs, $imm",
+ (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
+ def : MipsInstAlias<"or $rs, $rt, $imm",
+ (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+ def : MipsInstAlias<"or $rs, $imm",
+ (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+ def : MipsInstAlias<"xor $rs, $rt, $imm",
+ (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+ def : MipsInstAlias<"xor $rs, $imm",
+ (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+ def : MipsInstAlias<"not $rt, $rs",
+ (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
}
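
The reworked ORi_MM/XORi_MM definitions and the and/or/xor aliases above lean on the immSExt16/immZExt16 operand predicates to pick between sign- and zero-extended 16-bit immediates. As a reading aid only, here is roughly what those predicates reduce to on the C++ side, using the isInt/isUInt helpers from llvm/Support/MathExtras.h; the real predicates are defined in the MIPS TableGen files, not here.

    #include "llvm/Support/MathExtras.h"

    // Approximate C++ equivalents of the immediate predicates used by the
    // patterns above: ori/xori/andi take a zero-extended 16-bit immediate,
    // while addiu takes a sign-extended one.
    static bool fitsImmSExt16(int64_t Imm) { return llvm::isInt<16>(Imm); }
    static bool fitsImmZExt16(int64_t Imm) { return llvm::isUInt<16>(Imm); }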
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 671d7a87cc3d..d9faf3325cac 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -29,8 +29,9 @@ namespace llvm {
FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM);
FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsHazardSchedule();
FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
- FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
+ FunctionPass *createMipsConstantIslandPass();
} // end namespace llvm;
#endif
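
Mips.h now exposes a parameterless createMipsHazardSchedule() next to the simplified createMipsConstantIslandPass(). Purely as a sketch of the shape such a declaration usually pairs with, and not the actual pass from this patch, a no-op MachineFunctionPass behind a factory could look like this; the class name and body are placeholders.

    #include "llvm/CodeGen/MachineFunctionPass.h"

    namespace {
    // Placeholder skeleton only: a trivial machine-function pass returned by a
    // parameterless factory, mirroring the createMipsHazardSchedule() declaration.
    class HazardScheduleSketch : public llvm::MachineFunctionPass {
    public:
      static char ID;
      HazardScheduleSketch() : MachineFunctionPass(ID) {}
      bool runOnMachineFunction(llvm::MachineFunction &MF) override {
        return false; // a real pass would insert hazard barriers/NOPs here
      }
    };
    char HazardScheduleSketch::ID = 0;
    } // end anonymous namespace

    llvm::FunctionPass *createHazardScheduleSketch() {
      return new HazardScheduleSketch();
    }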
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 35352b6115c5..ea3fa0a9578e 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -24,6 +24,8 @@ class PredicateControl {
list<Predicate> EncodingPredicates = [];
// Predicates for the GPR size such as IsGP64bit
list<Predicate> GPRPredicates = [];
+ // Predicates for the PTR size such as IsPTR64bit
+ list<Predicate> PTRPredicates = [];
// Predicates for the FGR size and layout such as IsFP64bit
list<Predicate> FGRPredicates = [];
// Predicates for the instruction group membership such as ISA's and ASE's
@@ -34,6 +36,7 @@ class PredicateControl {
list<Predicate> AdditionalPredicates = [];
list<Predicate> Predicates = !listconcat(EncodingPredicates,
GPRPredicates,
+ PTRPredicates,
FGRPredicates,
InsnPredicates,
HardFloatPredicate,
@@ -62,6 +65,8 @@ def MipsInstrInfo : InstrInfo;
def FeatureNoABICalls : SubtargetFeature<"noabicalls", "NoABICalls", "true",
"Disable SVR4-style position-independent code">;
+def FeaturePTR64Bit : SubtargetFeature<"ptr64", "IsPTR64bit", "true",
+ "Pointers are 64-bit wide">;
def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
"General Purpose Registers are 64-bit wide">;
def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 26426c087164..e937ffa7a7ab 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index b2bc7e74c706..d2d1c65e40d9 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -261,7 +261,7 @@ static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE,
static void assureFPCallStub(Function &F, Module *M,
const MipsTargetMachine &TM) {
// for now we only need them for static relocation
- if (TM.getRelocationModel() == Reloc::PIC_)
+ if (TM.isPositionIndependent())
return;
LLVMContext &Context = M->getContext();
bool LE = TM.isLittleEndian();
@@ -387,11 +387,9 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
bool Modified = false;
LLVMContext &C = M->getContext();
Type *MyVoid = Type::getVoidTy(C);
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- for (BasicBlock::iterator I = BB->begin(), E = BB->end();
- I != E; ++I) {
- Instruction &Inst = *I;
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(I)) {
+ for (auto &BB: F)
+ for (auto &I: BB) {
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(&I)) {
Value *RVal = RI->getReturnValue();
if (!RVal) continue;
//
@@ -425,17 +423,11 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
A = A.addAttribute(C, AttributeSet::FunctionIndex,
Attribute::NoInline);
Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
- CallInst::Create(F, Params, "", &Inst );
- } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- const Value* V = CI->getCalledValue();
- Type* T = nullptr;
- if (V) T = V->getType();
- PointerType *PFT = nullptr;
- if (T) PFT = dyn_cast<PointerType>(T);
- FunctionType *FT = nullptr;
- if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType());
+ CallInst::Create(F, Params, "", &I);
+ } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+ FunctionType *FT = CI->getFunctionType();
Function *F_ = CI->getCalledFunction();
- if (FT && needsFPReturnHelper(*FT) &&
+ if (needsFPReturnHelper(*FT) &&
!(F_ && isIntrinsicInline(F_))) {
Modified=true;
F.addFnAttr("saveS2");
@@ -447,7 +439,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
Modified=true;
F.addFnAttr("saveS2");
}
- if (TM.getRelocationModel() != Reloc::PIC_ ) {
+ if (!TM.isPositionIndependent()) {
if (needsFPHelperFromSig(*F_)) {
assureFPCallStub(*F_, M, TM);
Modified=true;
@@ -461,7 +453,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
const MipsTargetMachine &TM) {
- bool PicMode = TM.getRelocationModel() == Reloc::PIC_;
+ bool PicMode = TM.isPositionIndependent();
bool LE = TM.isLittleEndian();
LLVMContext &Context = M->getContext();
std::string Name = F->getName();
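
The fixupFPReturnAndCall hunks above switch to range-based iteration over blocks and instructions and replace the old getCalledValue()/PointerType/getElementType() dance with CallInst::getFunctionType(). A condensed sketch of that control flow, with the helper predicate standing in for needsFPReturnHelper() rather than being the real check:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Stand-in for the needsFPReturnHelper() check used in the real code.
    static bool needsHelperSketch(const FunctionType &FT) {
      return FT.getReturnType()->isFloatingPointTy();
    }

    static bool scanCallsSketch(Function &F) {
      bool Modified = false;
      for (BasicBlock &BB : F)
        for (Instruction &I : BB)
          if (auto *CI = dyn_cast<CallInst>(&I))
            // getFunctionType() replaces the removed
            // getCalledValue()->getType() -> PointerType -> getElementType() chain.
            if (needsHelperSketch(*CI->getFunctionType()))
              Modified = true;
      return Modified;
    }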
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 5a1c2c67cc70..0405291431cd 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -14,7 +14,6 @@
#include "Mips16ISelDAGToDAG.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -43,8 +42,8 @@ bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return MipsDAGToDAGISel::runOnMachineFunction(MF);
}
/// Select multiply instructions.
-std::pair<SDNode*, SDNode*>
-Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty,
+std::pair<SDNode *, SDNode *>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, const SDLoc &DL, EVT Ty,
bool HasLo, bool HasHi) {
SDNode *Lo = nullptr, *Hi = nullptr;
SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
@@ -81,125 +80,60 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
V1 = RegInfo.createVirtualRegister(RC);
V2 = RegInfo.createVirtualRegister(RC);
- BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0).
- addReg(V1, RegState::Define).
- addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI).
- addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0)
+ .addReg(V1, RegState::Define)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI)
+ .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
- .addReg(V1).addReg(V2);
-}
-
-// Insert instructions to initialize the Mips16 SP Alias register in the
-// first MBB of the function.
-//
-void Mips16DAGToDAGISel::initMips16SPAliasReg(MachineFunction &MF) {
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-
- if (!MipsFI->mips16SPAliasRegSet())
- return;
-
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator I = MBB.begin();
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
- unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
-
- BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
- .addReg(Mips::SP);
+ .addReg(V1)
+ .addReg(V2);
}
void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
initGlobalBaseReg(MF);
- initMips16SPAliasReg(MF);
-}
-
-/// getMips16SPAliasReg - Output the instructions required to put the
-/// SP into a Mips16 accessible aliased register.
-SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
- unsigned Mips16SPAliasReg =
- MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
- auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
- return CurDAG->getRegister(Mips16SPAliasReg, PtrVT);
}
-void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
- auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
- SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, PtrVT);
- if (Parent) {
- switch (Parent->getOpcode()) {
- case ISD::LOAD: {
- LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
- switch (SD->getMemoryVT().getSizeInBits()) {
- case 8:
- case 16:
- AliasReg = Subtarget->getFrameLowering()->hasFP(*MF)
- ? AliasFPReg
- : getMips16SPAliasReg();
- return;
- }
- break;
- }
- case ISD::STORE: {
- StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
- switch (SD->getMemoryVT().getSizeInBits()) {
- case 8:
- case 16:
- AliasReg = Subtarget->getFrameLowering()->hasFP(*MF)
- ? AliasFPReg
- : getMips16SPAliasReg();
- return;
- }
- break;
- }
- }
- }
- AliasReg = CurDAG->getRegister(Mips::SP, PtrVT);
- return;
-
-}
-
-bool Mips16DAGToDAGISel::selectAddr16(
- SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
- SDValue &Alias) {
+bool Mips16DAGToDAGISel::selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
SDLoc DL(Addr);
EVT ValTy = Addr.getValueType();
- Alias = CurDAG->getTargetConstant(0, DL, ValTy);
-
// if Address is FI, get the TargetFrameIndex.
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
- Offset = CurDAG->getTargetConstant(0, DL, ValTy);
- getMips16SPRefReg(Parent, Alias);
- return true;
+ if (SPAllowed) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(0, DL, ValTy);
+ return true;
+ }
}
// on PIC code Load GA
if (Addr.getOpcode() == MipsISD::Wrapper) {
- Base = Addr.getOperand(0);
+ Base = Addr.getOperand(0);
Offset = Addr.getOperand(1);
return true;
}
- if (TM.getRelocationModel() != Reloc::PIC_) {
+ if (!TM.isPositionIndependent()) {
if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress))
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
return false;
}
// Addresses of the form FI+const or FI|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
-
// If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
- (Addr.getOperand(0))) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
- getMips16SPRefReg(Parent, Alias);
+ if (SPAllowed) {
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
+ return true;
+ }
}
- else
- Base = Addr.getOperand(0);
+ Base = Addr.getOperand(0);
Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
return true;
}
@@ -224,25 +158,25 @@ bool Mips16DAGToDAGISel::selectAddr16(
return true;
}
}
-
- // If an indexed floating point load/store can be emitted, return false.
- const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
-
- if (LS) {
- if (LS->getMemoryVT() == MVT::f32 && Subtarget->hasMips4_32r2())
- return false;
- if (LS->getMemoryVT() == MVT::f64 && Subtarget->hasMips4_32r2())
- return false;
- }
}
- Base = Addr;
+ Base = Addr;
Offset = CurDAG->getTargetConstant(0, DL, ValTy);
return true;
}
+bool Mips16DAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ return selectAddr(false, Addr, Base, Offset);
+}
+
+bool Mips16DAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ return selectAddr(true, Addr, Base, Offset);
+}
+
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
-std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
+bool Mips16DAGToDAGISel::trySelect(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
@@ -253,13 +187,15 @@ std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
EVT NodeTy = Node->getValueType(0);
unsigned MultOpc;
- switch(Opcode) {
- default: break;
+ switch (Opcode) {
+ default:
+ break;
case ISD::SUBE:
case ISD::ADDE: {
SDValue InFlag = Node->getOperand(2), CmpLHS;
- unsigned Opc = InFlag.getOpcode(); (void)Opc;
+ unsigned Opc = InFlag.getOpcode();
+ (void)Opc;
assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
(Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
"(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
@@ -273,7 +209,7 @@ std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
MOp = Mips::SubuRxRyRz16;
}
- SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+ SDValue Ops[] = {CmpLHS, InFlag.getOperand(1)};
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
@@ -283,40 +219,42 @@ std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
unsigned Sltu_op = Mips::SltuRxRyRz16;
SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops);
unsigned Addu_op = Mips::AdduRxRyRz16;
- SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, DL, VT,
- SDValue(Carry,0), RHS);
+ SDNode *AddCarry =
+ CurDAG->getMachineNode(Addu_op, DL, VT, SDValue(Carry, 0), RHS);
- SDNode *Result = CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
- SDValue(AddCarry,0));
- return std::make_pair(true, Result);
+ CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+ return true;
}
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
- std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
- true, true);
+ std::pair<SDNode *, SDNode *> LoHi =
+ selectMULT(Node, MultOpc, DL, NodeTy, true, true);
if (!SDValue(Node, 0).use_empty())
ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
if (!SDValue(Node, 1).use_empty())
ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
- return std::make_pair(true, nullptr);
+ CurDAG->RemoveDeadNode(Node);
+ return true;
}
case ISD::MULHS:
case ISD::MULHU: {
MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
- SDNode *Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
- return std::make_pair(true, Result);
+ auto LoHi = selectMULT(Node, MultOpc, DL, NodeTy, false, true);
+ ReplaceNode(Node, LoHi.second);
+ return true;
}
}
- return std::make_pair(false, nullptr);
+ return false;
}
-FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) {
- return new Mips16DAGToDAGISel(TM);
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new Mips16DAGToDAGISel(TM, OptLevel);
}
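
The address-selection rewrite above collapses the Parent/Alias plumbing of the old selectAddr16 into one selectAddr(bool SPAllowed, ...) helper, with thin selectAddr16/selectAddr16SP wrappers. As a reading aid, here is a trimmed restatement of its two main cases (bare frame index when SP-relative forms are allowed, and reg plus 16-bit constant), built only from the SelectionDAG calls visible in the hunk; the MipsISD::Wrapper and non-PIC symbol cases are omitted, so this is not a drop-in replacement.

    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    static bool selectAddrSketch(SelectionDAG &DAG, bool SPAllowed, SDValue Addr,
                                 SDValue &Base, SDValue &Offset) {
      SDLoc DL(Addr);
      EVT ValTy = Addr.getValueType();
      // Case 1: the address is a bare frame index; only the SP-relative forms
      // are allowed to fold it directly.
      if (SPAllowed)
        if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
          Base = DAG.getTargetFrameIndex(FIN->getIndex(), ValTy);
          Offset = DAG.getTargetConstant(0, DL, ValTy);
          return true;
        }
      // Case 2: base plus a constant offset that fits a signed 16-bit field.
      if (DAG.isBaseWithConstantOffset(Addr)) {
        auto *CN = cast<ConstantSDNode>(Addr.getOperand(1));
        if (isInt<16>(CN->getSExtValue())) {
          Base = Addr.getOperand(0);
          Offset = DAG.getTargetConstant(CN->getZExtValue(), DL, ValTy);
          return true;
        }
      }
      // Fallback: treat the whole value as the base with a zero offset.
      Base = Addr;
      Offset = DAG.getTargetConstant(0, DL, ValTy);
      return true;
    }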
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index ae0e61e19d9d..bbf8cc36f241 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -20,22 +20,24 @@ namespace llvm {
class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
public:
- explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+ explicit Mips16DAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : MipsDAGToDAGISel(TM, OL) {}
private:
- std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc DL,
- EVT Ty, bool HasLo, bool HasHi);
-
- SDValue getMips16SPAliasReg();
+ std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+ const SDLoc &DL, EVT Ty, bool HasLo,
+ bool HasHi);
bool runOnMachineFunction(MachineFunction &MF) override;
- void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg);
-
- bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Offset, SDValue &Alias) override;
+ bool selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
+ bool selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
- std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
+ bool trySelect(SDNode *Node) override;
void processFunctionAfterISel(MachineFunction &MF) override;
@@ -46,8 +48,8 @@ private:
void initMips16SPAliasReg(MachineFunction &MF);
};
-FunctionPass *createMips16ISelDag(MipsTargetMachine &TM);
-
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
}
#endif
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index e7483253e61d..bdb9eec4cc5a 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -16,11 +16,9 @@
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include <string>
using namespace llvm;
@@ -167,9 +165,9 @@ Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
}
MachineBasicBlock *
-Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case Mips::SelBeqZ:
@@ -519,12 +517,13 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
Chain);
}
-MachineBasicBlock *Mips16TargetLowering::
-emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
+MachineBasicBlock *
+Mips16TargetLowering::emitSel16(unsigned Opc, MachineInstr &MI,
+ MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
@@ -554,8 +553,9 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, DL, TII->get(Opc)).addReg(MI->getOperand(3).getReg())
- .addMBB(sinkMBB);
+ BuildMI(BB, DL, TII->get(Opc))
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(sinkMBB);
// copy0MBB:
// %FalseValue = ...
@@ -570,22 +570,23 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), DL,
- TII->get(Mips::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
-Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI,
+Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr &MI,
MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
@@ -615,8 +616,9 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI,
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
- .addReg(MI->getOperand(4).getReg());
+ BuildMI(BB, DL, TII->get(Opc2))
+ .addReg(MI.getOperand(3).getReg())
+ .addReg(MI.getOperand(4).getReg());
BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
// copy0MBB:
@@ -632,24 +634,25 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI,
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), DL,
- TII->get(Mips::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
@@ -679,8 +682,9 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
- .addImm(MI->getOperand(4).getImm());
+ BuildMI(BB, DL, TII->get(Opc2))
+ .addReg(MI.getOperand(3).getReg())
+ .addImm(MI.getOperand(4).getImm());
BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
// copy0MBB:
@@ -696,42 +700,44 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), DL,
- TII->get(Mips::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned regX = MI->getOperand(0).getReg();
- unsigned regY = MI->getOperand(1).getReg();
- MachineBasicBlock *target = MI->getOperand(2).getMBB();
- BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX)
- .addReg(regY);
- BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ unsigned regX = MI.getOperand(0).getReg();
+ unsigned regY = MI.getOperand(1).getReg();
+ MachineBasicBlock *target = MI.getOperand(2).getMBB();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc))
+ .addReg(regX)
+ .addReg(regY);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
- MachineInstr *MI, MachineBasicBlock *BB) const {
+ MachineInstr &MI, MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned regX = MI->getOperand(0).getReg();
- int64_t imm = MI->getOperand(1).getImm();
- MachineBasicBlock *target = MI->getOperand(2).getMBB();
+ unsigned regX = MI.getOperand(0).getReg();
+ int64_t imm = MI.getOperand(1).getImm();
+ MachineBasicBlock *target = MI.getOperand(2).getMBB();
unsigned CmpOpc;
if (isUInt<8>(imm))
CmpOpc = CmpiOpc;
@@ -740,10 +746,9 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
CmpOpc = CmpiXOpc;
else
llvm_unreachable("immediate field not usable");
- BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX)
- .addImm(imm);
- BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -758,38 +763,38 @@ static unsigned Mips16WhichOp8uOr16simm
}
MachineBasicBlock *
-Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr *MI,
+Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CC = MI->getOperand(0).getReg();
- unsigned regX = MI->getOperand(1).getReg();
- unsigned regY = MI->getOperand(2).getReg();
- BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(SltOpc)).addReg(regX).addReg(
- regY);
- BuildMI(*BB, MI, MI->getDebugLoc(),
- TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ unsigned CC = MI.getOperand(0).getReg();
+ unsigned regX = MI.getOperand(1).getReg();
+ unsigned regY = MI.getOperand(2).getReg();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc))
+ .addReg(regX)
+ .addReg(regY);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC)
+ .addReg(Mips::T8);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CC = MI->getOperand(0).getReg();
- unsigned regX = MI->getOperand(1).getReg();
- int64_t Imm = MI->getOperand(2).getImm();
+ unsigned CC = MI.getOperand(0).getReg();
+ unsigned regX = MI.getOperand(1).getReg();
+ int64_t Imm = MI.getOperand(2).getImm();
unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
- BuildMI(*BB, MI, MI->getDebugLoc(),
- TII->get(SltOpc)).addReg(regX).addImm(Imm);
- BuildMI(*BB, MI, MI->getDebugLoc(),
- TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC)
+ .addReg(Mips::T8);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
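
The Mips16 custom-inserter hunks above migrate every helper from MachineInstr* to MachineInstr& and reflow the BuildMI chains one operand per line. A minimal sketch of that pattern, modelled on emitFEXT_T8I816_ins; the CmpOpc/BrOpc parameters stand in for the real Mips16 opcodes, and this is an illustration rather than a replacement for the helpers in the patch.

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/Target/TargetInstrInfo.h"
    using namespace llvm;

    // Read the pseudo's operands through the MachineInstr reference, emit the
    // compare-and-branch pair with chained BuildMI calls, then erase the pseudo.
    static void expandCmpBranchSketch(const TargetInstrInfo &TII, unsigned CmpOpc,
                                      unsigned BrOpc, MachineInstr &MI,
                                      MachineBasicBlock &MBB) {
      unsigned RegX = MI.getOperand(0).getReg();
      unsigned RegY = MI.getOperand(1).getReg();
      MachineBasicBlock *Target = MI.getOperand(2).getMBB();
      BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CmpOpc))
          .addReg(RegX)
          .addReg(RegY);
      BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(BrOpc)).addMBB(Target);
      MI.eraseFromParent(); // the pseudo instruction is gone now
    }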
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index d3b9f750f347..0ee0b816ef70 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -27,7 +27,7 @@ namespace llvm {
bool *Fast) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
private:
@@ -50,32 +50,32 @@ namespace llvm {
bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
SDValue Chain) const override;
- MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
+ MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
- MachineInstr *MI,
+ MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitFEXT_T8I8I16_ins(
- unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
- MachineInstr *MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitFEXT_T8I8I16_ins(unsigned BtOpc, unsigned CmpiOpc,
+ unsigned CmpiXOpc, bool ImmSigned,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *emitFEXT_CCRX16_ins(
- unsigned SltOpc,
- MachineInstr *MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *emitFEXT_CCRXI16_ins(
- unsigned SltiOpc, unsigned SltiXOpc,
- MachineInstr *MI, MachineBasicBlock *BB )const;
+ MachineBasicBlock *emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
};
}
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index da8ada4e5391..daa1355ffefd 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -15,12 +15,10 @@
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -43,7 +41,7 @@ const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
-unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
@@ -53,15 +51,15 @@ unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
-unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned Opc = 0;
if (Mips::CPU16RegsRegClass.contains(DestReg) &&
@@ -126,9 +124,9 @@ void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
.addMemOperand(MMO);
}
-bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineBasicBlock &MBB = *MI->getParent();
- switch(MI->getDesc().getOpcode()) {
+bool Mips16InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ switch (MI.getDesc().getOpcode()) {
default:
return false;
case Mips::RetRA16:
@@ -136,7 +134,7 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
break;
}
- MBB.erase(MI);
+ MBB.erase(MI.getIterator());
return true;
}
@@ -307,7 +305,8 @@ void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator II,
- DebugLoc DL, unsigned &NewImm) const {
+ const DebugLoc &DL,
+ unsigned &NewImm) const {
//
// given original instruction is:
// Instr rx, T[offset] where offset is too big.
@@ -326,7 +325,7 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
int Reg =0;
int SpReg = 0;
- rs.enterBasicBlock(&MBB);
+ rs.enterBasicBlock(MBB);
rs.forward(II);
//
// We need to know which registers can be used, in the case where there
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 6540b40bc9ab..ab559799f00b 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -32,7 +32,7 @@ public:
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlot - If the specified machine instruction is a direct
@@ -40,12 +40,11 @@ public:
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStack(MachineBasicBlock &MBB,
@@ -62,7 +61,7 @@ public:
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
unsigned getOppositeBranchOpc(unsigned Opc) const override;
@@ -83,9 +82,8 @@ public:
// This is to adjust some FrameReg. We return the new register to be used
// in place of FrameReg and the adjusted immediate field (&NewImm)
//
- unsigned loadImmediate(unsigned FrameReg,
- int64_t Imm, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
+ unsigned loadImmediate(unsigned FrameReg, int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, const DebugLoc &DL,
unsigned &NewImm) const;
static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
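
The loadImmediate() change earlier in this patch (Mips16InstrInfo.cpp) also switches the register scavenger call to the reference-taking rs.enterBasicBlock(MBB). A small usage sketch of that sequence follows: the enterBasicBlock/forward calls mirror the hunk, while the FindUnusedReg query is the standard RegScavenger lookup added here only to round out the example, and the register class is passed in rather than hard-coding the Mips16 one.

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/RegisterScavenging.h"
    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    // Prime the scavenger on the block, advance its liveness tracking to the
    // insertion point, then ask for a register unused at that point.
    static unsigned findScratchRegSketch(RegScavenger &RS, MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator II,
                                         const TargetRegisterClass *RC) {
      RS.enterBasicBlock(MBB); // reference overload used by the updated code
      RS.forward(II);
      return RS.FindUnusedReg(RC);
    }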
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index dad6ea4c9e98..021fb8678686 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -14,14 +14,26 @@
//
// Mips Address
//
-def addr16 :
- ComplexPattern<iPTR, 3, "selectAddr16", [frameindex], [SDNPWantParent]>;
+def addr16 : ComplexPattern<iPTR, 2, "selectAddr16", [frameindex]>;
+def addr16sp : ComplexPattern<iPTR, 2, "selectAddr16SP", [frameindex]>;
//
// Address operand
def mem16 : Operand<i32> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops CPU16Regs, simm16, CPU16RegsPlusSP);
+ let MIOperandInfo = (ops CPU16Regs, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def mem16sp : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ // This should be CPUSPReg but the MIPS16 subtarget isn't good enough at
+ // keeping the sp-relative load and the other varieties separate at the
+ // moment. This lie fixes the problem sufficiently well to fix the errors
+ // emitted by -verify-machineinstrs and the output ends up correct as long
+ // as we use an external assembler (which is already a requirement for MIPS16
+ // for several other reasons).
+ let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
let EncoderMethod = "getMemEncoding";
}
@@ -31,6 +43,8 @@ def mem16_ea : Operand<i32> {
let EncoderMethod = "getMemEncoding";
}
+def pcrel16 : Operand<i32>;
+
//
// I-type instruction format
//
@@ -115,7 +129,7 @@ class FEXT_CCRXI16_ins<string asmstr>:
//
class FJAL16_ins<bits<1> _X, string asmstr,
InstrItinClass itin>:
- FJAL16<_X, (outs), (ins simm20:$imm),
+ FJAL16<_X, (outs), (ins uimm26:$imm),
!strconcat(asmstr, "\t$imm\n\tnop"),[],
itin> {
let isCodeGenOnly=1;
@@ -124,7 +138,7 @@ class FJAL16_ins<bits<1> _X, string asmstr,
class FJALB16_ins<bits<1> _X, string asmstr,
InstrItinClass itin>:
- FJAL16<_X, (outs), (ins simm20:$imm),
+ FJAL16<_X, (outs), (ins uimm26:$imm),
!strconcat(asmstr, "\t$imm\t# branch\n\tnop"),[],
itin> {
let isCodeGenOnly=1;
@@ -213,19 +227,6 @@ class FEXT_2RI16_ins<bits<5> _op, string asmstr,
let Constraints = "$rx_ = $rx";
}
-
-// this has an explicit sp argument that we ignore to work around a problem
-// in the compiler
-class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr,
- InstrItinClass itin>:
- FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPUSPReg:$ry, simm16:$imm),
- !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
-
-class FEXT_RI16_SP_Store_explicit_ins<bits<5> _op, string asmstr,
- InstrItinClass itin>:
- FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, CPUSPReg:$ry, simm16:$imm),
- !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
-
//
// EXT-RRI instruction format
//
@@ -483,13 +484,11 @@ class SelT<string op1, string op2>:
//
// 32 bit constant
//
-def imm32: Operand<i32>;
-
def Constant32:
- MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
+ MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>;
def LwConstant32:
- MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm, imm32:$constid),
+ MipsPseudo16<(outs CPU16Regs:$rx), (ins simm32:$imm, simm32:$constid),
"lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
@@ -635,7 +634,7 @@ def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
// Purpose: Breakpoint
// To cause a Breakpoint exception.
-def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>;
+def Break16: FRRBreakNull16_ins<"break 0", IIM16Alu>;
//
// Format: BTEQZ offset MIPS16e
// Purpose: Branch on T Equal to Zero (Extended)
@@ -851,9 +850,7 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, II_LW>, MayLoad{
// Purpose: Load Word (SP-Relative, Extended)
// To load an SP-relative word from memory as a signed value.
//
-def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", II_LW>, MayLoad{
- let Uses = [SP];
-}
+def LwRxSpImmX16: FEXT_RRI16_mem_ins<0b10010, "lw", mem16sp, II_LW>, MayLoad;
def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
@@ -1277,16 +1274,14 @@ def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>;
// Purpose: Store Word (Extended)
// To store a word to memory.
//
-def SwRxRyOffMemX16:
- FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
+def SwRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
//
// Format: SW rx, offset(sp) MIPS16e
// Purpose: Store Word rx (SP-Relative)
// To store an SP-relative word to memory.
//
-def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins
- <0b11010, "sw", II_SW>, MayStore;
+def SwRxSpImmX16: FEXT_RRI16_mem2_ins<0b11010, "sw", mem16sp, II_SW>, MayStore;
//
//
@@ -1340,22 +1335,21 @@ def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
def: shift_rotate_reg16_pat<sra, SravRxRy16>;
def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
-class LoadM16_pat<PatFrag OpNode, Instruction I> :
- Mips16Pat<(OpNode addr16:$addr), (I addr16:$addr)>;
+class LoadM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+ Mips16Pat<(OpNode Addr:$addr), (I Addr:$addr)>;
-def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
-def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
-def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16>;
-def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16>;
-def: LoadM16_pat<load, LwRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<load, LwRxSpImmX16, addr16sp>;
-class StoreM16_pat<PatFrag OpNode, Instruction I> :
- Mips16Pat<(OpNode CPU16Regs:$r, addr16:$addr),
- (I CPU16Regs:$r, addr16:$addr)>;
+class StoreM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+ Mips16Pat<(OpNode CPU16Regs:$r, Addr:$addr), (I CPU16Regs:$r, Addr:$addr)>;
-def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
-def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>;
-def: StoreM16_pat<store, SwRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16, addr16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16, addr16>;
+def: StoreM16_pat<store, SwRxSpImmX16, addr16sp>;
// Unconditional branch
class UncondBranch16_pat<SDNode OpNode, Instruction I>:
@@ -1401,8 +1395,7 @@ class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
(I CPU16Regs:$rx, imm_type:$imm16)>;
-def: Mips16Pat<(i32 addr16:$addr),
- (AddiuRxRyOffMemX16 addr16:$addr)>;
+def: Mips16Pat<(i32 addr16sp:$addr), (AddiuRxRyOffMemX16 addr16sp:$addr)>;
// Large (>16 bit) immediate loads
@@ -1551,7 +1544,7 @@ def: UncondBranch16_pat<br, Bimm16>;
// Small immediates
def: Mips16Pat<(i32 immSExt16:$in),
- (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>;
+ (AddiuRxRxImmX16 (MoveR3216 ZERO), immSExt16:$in)>;
def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index ebd51d7b5072..b034c26640e3 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -14,11 +14,9 @@
#include "Mips16RegisterInfo.h"
#include "Mips.h"
#include "Mips16InstrInfo.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -28,7 +26,6 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index 13216bebe5b6..a20c683c1e5e 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -360,7 +360,7 @@ class SPECIAL_SDBBP_FM : MipsR6Inst {
}
// This class is ambiguous with other branches:
-// BEQC/BNEC require that rs > rt
+// BEQC/BNEC require that rs < rt && rs != 0
class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
bits<5> rs;
bits<5> rt;
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 82d2c8ee9905..f552f8d37afb 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -182,58 +182,86 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))];
+ bit isCTI = 1;
}
multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
RegisterOperand FGROpnd>{
let AdditionalPredicates = [NotInMicroMips] in {
- def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+ def CMP_F_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.af.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+ def CMP_UN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>,
+ MipsR6Arch<!strconcat("cmp.un.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+ def CMP_EQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>,
+ MipsR6Arch<!strconcat("cmp.eq.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
+ def CMP_UEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_UEQ>,
CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>,
+ MipsR6Arch<!strconcat("cmp.ueq.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+ def CMP_LT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>,
+ MipsR6Arch<!strconcat("cmp.lt.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
+ def CMP_ULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_ULT>,
CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>,
+ MipsR6Arch<!strconcat("cmp.ult.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+ def CMP_LE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>,
+ MipsR6Arch<!strconcat("cmp.le.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
+ def CMP_ULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_ULE>,
CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>,
+ MipsR6Arch<!strconcat("cmp.ule.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>,
+ def CMP_SAF_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SAF>,
CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.saf.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>,
+ def CMP_SUN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SUN>,
CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.sun.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
+ def CMP_SEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SEQ>,
CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.seq.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>,
+ def CMP_SUEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SUEQ>,
CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.sueq.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>,
+ def CMP_SLT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SLT>,
CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.slt.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>,
+ def CMP_SULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SULT>,
CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.sult.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>,
+ def CMP_SLE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SLE>,
CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.sle.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>,
+ def CMP_SULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+ FIELD_CMP_COND_SULE>,
CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>,
+ MipsR6Arch<!strconcat("cmp.sule.", Typestr)>,
ISA_MIPS32R6, HARDFLOAT;
}
}
@@ -245,52 +273,62 @@ multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
//===----------------------------------------------------------------------===//
class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
- Operand ImmOpnd> : MipsR6Arch<instr_asm> {
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rs);
dag InOperandList = (ins ImmOpnd:$imm);
string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>;
-class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>;
-class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2>;
+class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2,
+ II_ADDIUPC>;
+class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2, II_LWPC>;
+class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>;
class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
- Operand ImmOpnd> : MipsR6Arch<instr_asm> {
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2>;
+class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2, II_ALIGN>;
-class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
- : MipsR6Arch<instr_asm> {
+class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rs);
dag InOperandList = (ins simm16:$imm);
string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd>;
-class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd>;
+class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd, II_ALUIPC>;
+class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
-class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
- : MipsR6Arch<instr_asm> {
+class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rs);
dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd>;
+class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
class BRANCH_DESC_BASE {
bit isBranch = 1;
bit isTerminator = 1;
bit hasDelaySlot = 0;
+ bit isCTI = 1;
}
class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE,
@@ -299,22 +337,32 @@ class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE,
dag OutOperandList = (outs);
string AsmString = !strconcat(instr_asm, "\t$offset");
bit isBarrier = 1;
+ InstrItinClass Itinerary = II_BC;
+ bit isCTI = 1;
}
class CMP_BC_DESC_BASE<string instr_asm, DAGOperand opnd,
- RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+ RegisterOperand GPROpnd> : BRANCH_DESC_BASE,
+ MipsR6Arch<instr_asm> {
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
dag OutOperandList = (outs);
string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
}
class CMP_CBR_EQNE_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
- RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+ RegisterOperand GPROpnd>
+ : BRANCH_DESC_BASE, MipsR6Arch<instr_asm> {
dag InOperandList = (ins GPROpnd:$rs, opnd:$offset);
dag OutOperandList = (outs);
string AsmString = !strconcat(instr_asm, "\t$rs, $offset");
list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCZC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
}
class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
@@ -324,17 +372,23 @@ class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
dag OutOperandList = (outs);
string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
list<Register> Defs = [AT];
+ InstrItinClass Itinerary = II_BCCZC;
+ bit hasForbiddenSlot = 1;
+ bit isCTI = 1;
}
class BAL_DESC : BC_DESC_BASE<"bal", brtarget> {
bit isCall = 1;
bit hasDelaySlot = 1;
list<Register> Defs = [RA];
+ bit isCTI = 1;
}
class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
bit isCall = 1;
list<Register> Defs = [RA];
+ InstrItinClass Itinerary = II_BALC;
+ bit isCTI = 1;
}
class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
@@ -360,6 +414,7 @@ class COP1_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
dag OutOperandList = (outs);
string AsmString = instr_asm;
bit hasDelaySlot = 1;
+ InstrItinClass Itinerary = II_BC1CCZ;
}
class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">;
@@ -370,6 +425,7 @@ class COP2_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
dag OutOperandList = (outs);
string AsmString = instr_asm;
bit hasDelaySlot = 1;
+ bit isCTI = 1;
}
class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">;
@@ -379,23 +435,29 @@ class BOVC_DESC : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>;
class BNVC_DESC : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>;
class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
- RegisterOperand GPROpnd>
+ RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary>
: MipsR6Arch<opstr> {
dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
string AsmString = !strconcat(opstr, "\t$rt, $offset");
list<dag> Pattern = [];
- bit isTerminator = 1;
bit hasDelaySlot = 0;
+ InstrItinClass Itinerary = itin;
+ bit isCTI = 1;
+ bit isBranch = 1;
+ bit isIndirectBranch = 1;
}
class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
- GPR32Opnd> {
+ GPR32Opnd, II_JIALC> {
bit isCall = 1;
list<Register> Defs = [RA];
}
-class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> {
+class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
+ GPR32Opnd, II_JIALC> {
bit isBarrier = 1;
+ bit isTerminator = 1;
list<Register> Defs = [AT];
}
@@ -405,35 +467,39 @@ class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
bit hasDelaySlot = 1;
bit isTerminator=1;
bit isBarrier=1;
+ bit isCTI = 1;
}
-class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin>
: MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>;
+class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd, II_BITSWAP>;
class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin,
SDPatternOperator Op=null_frag>
: MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
-
+ InstrItinClass Itinerary = itin;
// This instruction doesn't trap division by zero itself. We must insert
// teq instructions as well.
bit usesCustomInserter = 1;
}
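// A minimal sketch of what the custom inserter is expected to emit for these
// descriptors, assuming the usual teq-based divide-by-zero trap (trap code 7):
//   div   $rd, $rs, $rt
//   teq   $rt, $zero, 7    # trap when the divisor is zero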
-class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, sdiv>;
-class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, udiv>;
-class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, srem>;
-class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, urem>;
+class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, II_DIV, sdiv>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, II_DIVU, udiv>;
+class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, II_MOD, srem>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, II_MODU, urem>;
class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
list<Register> Defs = [RA];
@@ -460,17 +526,19 @@ class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
}
class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin,
SDPatternOperator Op=null_frag> : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+ InstrItinClass Itinerary = itin;
}
-class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, mulhs>;
-class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, mulhu>;
-class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, mul>;
-class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>;
+class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, II_MUH, mulhs>;
+class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, II_MUHU, mulhu>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, II_MUL, mul>;
+class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd, II_MULU>;
class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
dag OutOperandList = (outs FGROpnd:$fd);
@@ -482,11 +550,11 @@ class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
string Constraints = "$fd_in = $fd";
}
-class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
+class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd>, MipsR6Arch<"sel.d"> {
// We must insert a SUBREG_TO_REG around $fd_in
bit usesCustomInserter = 1;
}
-class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>;
+class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>, MipsR6Arch<"sel.s">;
class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
: MipsR6Arch<instr_asm> {
@@ -494,23 +562,26 @@ class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_SELCCZ;
}
class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
-class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs FGROpnd:$fd);
dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
list<dag> Pattern = [];
string Constraints = "$fd_in = $fd";
+ InstrItinClass Itinerary = itin;
}
-class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>;
-class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>;
-class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>;
-class MSUBF_D_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>;
+class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd, II_MADDF_S>;
+class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd, II_MADDF_D>;
+class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd, II_MSUBF_S>;
+class MSUBF_D_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd, II_MSUBF_D>;
class MAX_MIN_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
dag OutOperandList = (outs FGROpnd:$fd);
@@ -536,10 +607,14 @@ class SELEQNEZ_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
list<dag> Pattern = [];
}
-class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
-class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
-class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
-class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>,
+ MipsR6Arch<"seleqz.s">;
+class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>,
+ MipsR6Arch<"seleqz.d">;
+class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>,
+ MipsR6Arch<"selnez.s">;
+class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>,
+ MipsR6Arch<"selnez.d">;
class CLASS_RINT_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
dag OutOperandList = (outs FGROpnd:$fd);
@@ -590,61 +665,73 @@ class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>;
class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>;
class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
- Operand ImmOpnd> : MipsR6Arch<instr_asm> {
+ Operand ImmOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2");
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>;
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1, II_LSA>;
-class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand MemOpnd, InstrItinClass itin>
+ : MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
- dag InOperandList = (ins mem_simm9:$addr);
+ dag InOperandList = (ins MemOpnd:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
bit mayLoad = 1;
+ InstrItinClass Itinerary = itin;
}
-class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd>;
+class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
-class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> {
dag OutOperandList = (outs GPROpnd:$dst);
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
bit mayStore = 1;
string Constraints = "$rt = $dst";
+ InstrItinClass Itinerary = itin;
}
-class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd>;
+class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
-class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin>
: MipsR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ InstrItinClass Itinerary = itin;
}
-class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
- CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))];
}
-class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
- CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))];
}
-class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>;
-class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>;
+class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd, II_CLO>;
+class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd, II_CLZ>;
class SDBBP_R6_DESC {
dag OutOperandList = (outs);
dag InOperandList = (ins uimm20:$code_);
string AsmString = "sdbbp\t$code_";
list<dag> Pattern = [];
+ bit isCTI = 1;
}
//===----------------------------------------------------------------------===//
@@ -660,16 +747,18 @@ def AUI : R6MMR6Rel, AUI_ENC, AUI_DESC, ISA_MIPS32R6;
def AUIPC : R6MMR6Rel, AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6;
def BALC : R6MMR6Rel, BALC_ENC, BALC_DESC, ISA_MIPS32R6;
-def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6, HARDFLOAT;
-def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6, HARDFLOAT;
-def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
-def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
+ def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+}
def BC : R6MMR6Rel, BC_ENC, BC_DESC, ISA_MIPS32R6;
-def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
+def BEQC : R6MMR6Rel, BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
def BEQZALC : R6MMR6Rel, BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
-def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
-def BGEC : BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
-def BGEUC : BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
+def BEQZC : R6MMR6Rel, BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
+def BGEC : R6MMR6Rel, BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
+def BGEUC : R6MMR6Rel, BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
def BGEZALC : R6MMR6Rel, BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
def BGTZALC : R6MMR6Rel, BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
@@ -677,15 +766,17 @@ def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
def BITSWAP : R6MMR6Rel, BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
def BLEZALC : R6MMR6Rel, BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
-def BLTC : BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
-def BLTUC : BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
+def BLTC : R6MMR6Rel, BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
+def BLTUC : R6MMR6Rel, BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
def BLTZALC : R6MMR6Rel, BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
-def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
+def BNEC : R6MMR6Rel, BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
def BNEZALC : R6MMR6Rel, BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
-def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
-def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
-def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+def BNEZC : R6MMR6Rel, BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BNVC : R6MMR6Rel, BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
+ def BOVC : R6MMR6Rel, BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+}
def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
@@ -695,15 +786,21 @@ def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd>;
-def DIV : R6MMR6Rel, DIV_ENC, DIV_DESC, ISA_MIPS32R6;
-def DIVU : R6MMR6Rel, DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DIV : R6MMR6Rel, DIV_ENC, DIV_DESC, ISA_MIPS32R6;
+ def DIVU : R6MMR6Rel, DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+}
def JIALC : R6MMR6Rel, JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
def JIC : R6MMR6Rel, JIC_ENC, JIC_DESC, ISA_MIPS32R6;
def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6;
-def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
-def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
+ def LL_R6 : LL_R6_ENC, LL_R6_DESC, PTR_32, ISA_MIPS32R6;
+}
def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
-def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
+}
def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
@@ -717,41 +814,40 @@ let AdditionalPredicates = [NotInMicroMips] in {
def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-}
-def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
-def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
-let AdditionalPredicates = [NotInMicroMips] in {
+
+ def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
+ def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
+
def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+
+ def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6;
+ def MUHU : R6MMR6Rel, MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
+ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
+ def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
}
-def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6;
-def MUHU : R6MMR6Rel, MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
-def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
-def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
def NAL; // BAL with rd=0
def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SC_R6 : SC_R6_ENC, SC_R6_DESC, PTR_32, ISA_MIPS32R6;
+ def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+ def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
+ def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
+ def SELEQZ_D : R6MMR6Rel, SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELEQZ_S : R6MMR6Rel, SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELNEZ_D : R6MMR6Rel, SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SELNEZ_S : R6MMR6Rel, SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6,
+ HARDFLOAT;
+ def SEL_D : R6MMR6Rel, SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SEL_S : R6MMR6Rel, SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
+ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
}
-def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6;
-let AdditionalPredicates = [NotInMicroMips] in {
-def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
-}
-def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
-def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
-let AdditionalPredicates = [NotInMicroMips] in {
- def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
- def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-}
-def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
-let AdditionalPredicates = [NotInMicroMips] in {
- def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
- def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
- def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
- def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-}
-def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
//===----------------------------------------------------------------------===//
//
@@ -761,9 +857,14 @@ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6, GPR_32;
}
-def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
+def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+}
//===----------------------------------------------------------------------===//
//
// Patterns and Pseudo Instructions
@@ -792,8 +893,10 @@ def : MipsPat<(setne VT:$lhs, VT:$rhs),
(NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
}
-defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6;
-defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6;
+ defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6;
+}
// i32 selects
multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp,
@@ -832,6 +935,7 @@ def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f),
(SELEQZOp RC:$f, RC:$cond)>;
}
+let AdditionalPredicates = [NotInMicroMips] in {
defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ,
immZExt16, i32>, ISA_MIPS32R6;
@@ -845,3 +949,4 @@ def : MipsPat<(select i32:$cond, i32:$t, immz),
def : MipsPat<(select i32:$cond, immz, i32:$f),
(SELEQZ i32:$f, i32:$cond)>,
ISA_MIPS32R6;
+}
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index cbdcdd788bec..88cfec5bc13c 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -15,14 +15,6 @@
// Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
-// Unsigned Operand
-def uimm16_64 : Operand<i64> {
- let PrintMethod = "printUnsignedImm";
-}
-
-// Signed Operand
-def simm10_64 : Operand<i64>;
-
// Transformation Function - get Imm - 32.
def Subtract32 : SDNodeXForm<imm, [{
return getImm(N, (unsigned)N->getZExtValue() - 32);
@@ -37,7 +29,7 @@ def immSExt10_64 : PatLeaf<(i64 imm),
[{ return isInt<10>(N->getSExtValue()); }]>;
def immZExt16_64 : PatLeaf<(i64 imm),
- [{ return isInt<16>(N->getZExtValue()); }]>;
+ [{ return isUInt<16>(N->getZExtValue()); }]>;
def immZExt5_64 : ImmLeaf<i64, [{ return Imm == (Imm & 0x1f); }]>;
@@ -71,6 +63,10 @@ def PowerOf2HI : PatLeaf<(imm), [{
return false;
}]>;
+def assertzext_lt_i32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLT(MVT::i32);
+}]>;
+
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
@@ -86,7 +82,7 @@ let usesCustomInserter = 1 in {
}
/// Pseudo instructions for loading and storing accumulator registers.
-let isPseudo = 1, isCodeGenOnly = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC128 : Load<"", ACC128>;
def STORE_ACC128 : Store<"", ACC128>;
}
@@ -96,11 +92,13 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
//===----------------------------------------------------------------------===//
let DecoderNamespace = "Mips64" in {
/// Arithmetic Instructions (ALU Immediate)
-def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>,
- ISA_MIPS3_NOT_32R6_64R6;
-def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU,
- immSExt16, add>,
- ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
+def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd, II_DADDI>,
+ ADDI_FM<0x18>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DADDiu : StdMMR6Rel, ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+ II_DADDIU, immSExt16, add>,
+ ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
+}
let isCodeGenOnly = 1 in {
def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
@@ -113,18 +111,20 @@ def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
ADDI_FM<0xd>;
def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
ADDI_FM<0xe>;
-def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM;
+def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
}
/// Arithmetic Instructions (3-Operand, R-Type)
-def DADD : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>,
- ISA_MIPS3;
-def DADDu : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>, ADD_FM<0, 0x2d>,
- ISA_MIPS3;
-def DSUBu : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>,
- ISA_MIPS3;
-def DSUB : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>,
- ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DADD : StdMMR6Rel, ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>,
+ ADD_FM<0, 0x2c>, ISA_MIPS3;
+ def DADDu : StdMMR6Rel, ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>,
+ ADD_FM<0, 0x2d>, ISA_MIPS3;
+ def DSUBu : StdMMR6Rel, ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>,
+ ADD_FM<0, 0x2f>, ISA_MIPS3;
+ def DSUB : StdMMR6Rel, ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>,
+ ADD_FM<0, 0x2e>, ISA_MIPS3;
+}
let isCodeGenOnly = 1 in {
def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
@@ -136,33 +136,43 @@ def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
}
/// Shift Instructions
-def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl, immZExt6>,
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSLL : StdMMR6Rel, shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL,
+ shl, immZExt6>,
SRA_FM<0x38, 0>, ISA_MIPS3;
-def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl, immZExt6>,
+ def DSRL : StdMMR6Rel, shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL,
+ srl, immZExt6>,
SRA_FM<0x3a, 0>, ISA_MIPS3;
-def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra, immZExt6>,
+ def DSRA : StdMMR6Rel, shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA,
+ sra, immZExt6>,
SRA_FM<0x3b, 0>, ISA_MIPS3;
-def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
- SRLV_FM<0x14, 0>, ISA_MIPS3;
-def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
- SRLV_FM<0x16, 0>, ISA_MIPS3;
-def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
- SRLV_FM<0x17, 0>, ISA_MIPS3;
-def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>,
- SRA_FM<0x3c, 0>, ISA_MIPS3;
-def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd, II_DSRL32>,
- SRA_FM<0x3e, 0>, ISA_MIPS3;
-def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd, II_DSRA32>,
- SRA_FM<0x3f, 0>, ISA_MIPS3;
+ def DSLLV : StdMMR6Rel, shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+ SRLV_FM<0x14, 0>, ISA_MIPS3;
+ def DSRAV : StdMMR6Rel, shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+ SRLV_FM<0x17, 0>, ISA_MIPS3;
+ def DSRLV : StdMMR6Rel, shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+ SRLV_FM<0x16, 0>, ISA_MIPS3;
+ def DSLL32 : StdMMR6Rel, shift_rotate_imm<"dsll32", uimm5, GPR64Opnd,
+ II_DSLL32>,
+ SRA_FM<0x3c, 0>, ISA_MIPS3;
+ def DSRL32 : StdMMR6Rel, shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd,
+ II_DSRL32>,
+ SRA_FM<0x3e, 0>, ISA_MIPS3;
+ def DSRA32 : StdMMR6Rel, shift_rotate_imm<"dsra32", uimm5, GPR64Opnd,
+ II_DSRA32>,
+ SRA_FM<0x3f, 0>, ISA_MIPS3;
// Rotate Instructions
-def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr,
- immZExt6>,
- SRA_FM<0x3a, 1>, ISA_MIPS64R2;
-def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>,
- SRLV_FM<0x16, 1>, ISA_MIPS64R2;
-def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>,
- SRA_FM<0x3e, 1>, ISA_MIPS64R2;
+ def DROTR : StdMMR6Rel, shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR,
+ rotr, immZExt6>,
+ SRA_FM<0x3a, 1>, ISA_MIPS64R2;
+ def DROTRV : StdMMR6Rel, shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV,
+ rotr>,
+ SRLV_FM<0x16, 1>, ISA_MIPS64R2;
+ def DROTR32 : StdMMR6Rel, shift_rotate_imm<"drotr32", uimm5, GPR64Opnd,
+ II_DROTR32>,
+ SRA_FM<0x3e, 1>, ISA_MIPS64R2;
+}
/// Load and Store Instructions
/// aligned
@@ -177,9 +187,16 @@ def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
}
-def LWu : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>, ISA_MIPS3;
-def LD : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>, ISA_MIPS3;
-def SD : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>, ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWu : StdMMR6Rel, MMRel, Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>,
+ LW_FM<0x27>, ISA_MIPS3;
+ def LD : StdMMR6Rel, LoadMemory<"ld", GPR64Opnd, mem_simm16, load, II_LD>,
+ LW_FM<0x37>, ISA_MIPS3;
+ def SD : StdMMR6Rel, StoreMemory<"sd", GPR64Opnd, mem_simm16, store, II_SD>,
+ LW_FM<0x3f>, ISA_MIPS3;
+}
+
+
/// load/store left/right
let isCodeGenOnly = 1 in {
@@ -199,9 +216,20 @@ def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
ISA_MIPS3_NOT_32R6_64R6;
/// Load-linked, Store-conditional
-def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LLD : StdMMR6Rel, LLBase<"lld", GPR64Opnd, mem_simm16>, LW_FM<0x34>,
+ ISA_MIPS3_NOT_32R6_64R6;
+}
def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips],
+ DecoderNamespace = "Mips32_64_PTR64" in {
+def LL64 : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_64,
+ ISA_MIPS2_NOT_32R6_64R6;
+def SC64 : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_64,
+ ISA_MIPS2_NOT_32R6_64R6;
+}
+
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
@@ -220,18 +248,22 @@ def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
/// Multiply and Divide Instructions.
-def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
-def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
+ def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
+}
def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
II_DMULT>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6;
-def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
-def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
+ def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
+}
def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
@@ -260,12 +292,16 @@ def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
}
/// Count Leading
-def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64_NOT_64R6;
-def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64_NOT_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DCLZ : StdMMR6Rel, CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>,
+ ISA_MIPS64_NOT_64R6;
+ def DCLO : StdMMR6Rel, CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>,
+ ISA_MIPS64_NOT_64R6;
/// Double Word Swap Bytes/HalfWords
-def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2;
-def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>, ISA_MIPS64R2;
+ def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2;
+ def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>, ISA_MIPS64R2;
+}
def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
@@ -273,21 +309,24 @@ let isCodeGenOnly = 1 in
def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
let AdditionalPredicates = [NotInMicroMips] in {
- // TODO: Add 'pos + size' constraint check to dext* instructions
- // DEXT: 0 < pos + size <= 63
- // DEXTM, DEXTU: 32 < pos + size <= 64
- def DEXT : ExtBase<"dext", GPR64Opnd, uimm5, uimm5_plus1, MipsExt>,
- EXT_FM<3>;
- def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, MipsExt>,
- EXT_FM<1>;
+ // The 'pos + size' constraints are enforced by the code that lowers into
+ // MipsISD::Ext.
+ def DEXT : ExtBase<"dext", GPR64Opnd, uimm5_report_uimm6, uimm5_plus1,
+ immZExt5, immZExt5Plus1, MipsExt>, EXT_FM<3>,
+ ISA_MIPS64R2;
+ def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, immZExt5,
+ immZExt5Plus33, MipsExt>, EXT_FM<1>, ISA_MIPS64R2;
def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1,
- MipsExt>, EXT_FM<2>;
+ immZExt5Plus32, immZExt5Plus1, MipsExt>, EXT_FM<2>,
+ ISA_MIPS64R2;
+ def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1, MipsIns>,
+ EXT_FM<7>, ISA_MIPS64R2;
+ def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1>,
+ EXT_FM<6>, ISA_MIPS64R2;
+ def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm5_inssize_plus1>,
+ EXT_FM<5>, ISA_MIPS64R2;
}
-def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
-def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>;
-def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
-
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
"dsll\t$rd, $rt, 32", [], II_DSLL>;
@@ -309,8 +348,8 @@ def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
// Cavium Octeon cnMIPS instructions
let DecoderNamespace = "CnMips",
- EncodingPredicates = []<Predicate>, // FIXME: The lack of HasStdEnc is probably a bug
- AdditionalPredicates = [HasCnMips] in {
+ // FIXME: The lack of HasStdEnc is probably a bug
+ EncodingPredicates = []<Predicate> in {
class Count1s<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
@@ -361,83 +400,94 @@ class MFC2OP<string asmstr, RegisterOperand RO> :
!strconcat(asmstr, "\t$rt, $imm16"), [], NoItinerary, FrmFR>;
// Unsigned Byte Add
-let Pattern = [(set GPR64Opnd:$rd,
- (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
- ADD_FM<0x1c, 0x28>;
+ ADD_FM<0x1c, 0x28>, ASE_CNMIPS {
+ let Pattern = [(set GPR64Opnd:$rd,
+ (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))];
+}
// Branch on Bit Clear /+32
def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd,
- uimm5_64_report_uimm6>, BBIT_FM<0x32>;
+ uimm5_64_report_uimm6>, BBIT_FM<0x32>, ASE_CNMIPS;
def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64,
- 0x100000000>,
- BBIT_FM<0x36>;
+ 0x100000000>, BBIT_FM<0x36>, ASE_CNMIPS;
// Branch on Bit Set /+32
def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd,
- uimm5_64_report_uimm6>, BBIT_FM<0x3a>;
+ uimm5_64_report_uimm6>, BBIT_FM<0x3a>, ASE_CNMIPS;
def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64,
- 0x100000000>, BBIT_FM<0x3e>;
+ 0x100000000>, BBIT_FM<0x3e>, ASE_CNMIPS;
// Multiply Doubleword to GPR
-let Defs = [HI0, LO0, P0, P1, P2] in
def DMUL : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
- ADD_FM<0x1c, 0x03>;
+ ADD_FM<0x1c, 0x03>, ASE_CNMIPS {
+ let Defs = [HI0, LO0, P0, P1, P2];
+}
// Extract a signed bit field /+32
-def EXTS : ExtsCins<"exts">, EXTS_FM<0x3a>;
-def EXTS32: ExtsCins<"exts32">, EXTS_FM<0x3b>;
+def EXTS : ExtsCins<"exts">, EXTS_FM<0x3a>, ASE_CNMIPS;
+def EXTS32: ExtsCins<"exts32">, EXTS_FM<0x3b>, ASE_CNMIPS;
// Clear and insert a bit field /+32
-def CINS : ExtsCins<"cins">, EXTS_FM<0x32>;
-def CINS32: ExtsCins<"cins32">, EXTS_FM<0x33>;
+def CINS : ExtsCins<"cins">, EXTS_FM<0x32>, ASE_CNMIPS;
+def CINS32: ExtsCins<"cins32">, EXTS_FM<0x33>, ASE_CNMIPS;
// Move to multiplier/product register
-def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>;
-def MTM1 : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>;
-def MTM2 : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>;
-def MTP0 : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>;
-def MTP1 : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>;
-def MTP2 : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>;
+def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
+ ASE_CNMIPS;
+def MTM1 : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>,
+ ASE_CNMIPS;
+def MTM2 : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>,
+ ASE_CNMIPS;
+def MTP0 : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>, ASE_CNMIPS;
+def MTP1 : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>, ASE_CNMIPS;
+def MTP2 : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>, ASE_CNMIPS;
// Count Ones in a Word/Doubleword
-def POP : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>;
-def DPOP : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>;
+def POP : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>, ASE_CNMIPS;
+def DPOP : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>, ASE_CNMIPS;
// Set on equal/not equal
-def SEQ : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>;
-def SEQi : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>;
-def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>;
-def SNEi : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>;
+def SEQ : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>, ASE_CNMIPS;
+def SEQi : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>, ASE_CNMIPS;
+def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>, ASE_CNMIPS;
+def SNEi : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>, ASE_CNMIPS;
// 192-bit x 64-bit Unsigned Multiply and Add
-let Defs = [P0, P1, P2] in
-def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>,
- ADD_FM<0x1c, 0x11>;
+def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x11>,
+ ASE_CNMIPS {
+ let Defs = [P0, P1, P2];
+}
// 64-bit Unsigned Multiply and Add Move
-let Defs = [MPL0, P0, P1, P2] in
-def VMM0 : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>,
- ADD_FM<0x1c, 0x10>;
+def VMM0 : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x10>,
+ ASE_CNMIPS {
+ let Defs = [MPL0, P0, P1, P2];
+}
// 64-bit Unsigned Multiply and Add
-let Defs = [MPL1, MPL2, P0, P1, P2] in
-def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>,
- ADD_FM<0x1c, 0x0f>;
+def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x0f>,
+ ASE_CNMIPS {
+ let Defs = [MPL1, MPL2, P0, P1, P2];
+}
// Move between CPU and coprocessor registers
-def DMFC2_OCTEON : MFC2OP<"dmfc2", GPR64Opnd>, MFC2OP_FM<0x12, 1>;
-def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd>, MFC2OP_FM<0x12, 5>;
+def DMFC2_OCTEON : MFC2OP<"dmfc2", GPR64Opnd>, MFC2OP_FM<0x12, 1>, ASE_CNMIPS;
+def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd>, MFC2OP_FM<0x12, 5>, ASE_CNMIPS;
}
}
/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
-def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd>, MFC3OP_FM<0x10, 1>, ISA_MIPS3;
-def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3;
-def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3;
-def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3;
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, MFC3OP_FM<0x10, 1>,
+ ISA_MIPS3;
+def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, MFC3OP_FM<0x10, 5>,
+ ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, MFC3OP_FM<0x12, 1>,
+ ISA_MIPS3;
+def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>,
+ ISA_MIPS3;
}
//===----------------------------------------------------------------------===//
@@ -458,31 +508,34 @@ def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
-def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
-def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>;
-def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
-def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
-def : MipsPat<(MipsLo tglobaltlsaddr:$in),
- (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
-
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
-def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>;
-def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
- (DADDiu GPR64:$hi, tjumptable:$lo)>;
-def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
- (DADDiu GPR64:$hi, tconstpool:$lo)>;
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
-
-def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
-def : WrapperPat<tconstpool, DADDiu, GPR64>;
-def : WrapperPat<texternalsym, DADDiu, GPR64>;
-def : WrapperPat<tblockaddress, DADDiu, GPR64>;
-def : WrapperPat<tjumptable, DADDiu, GPR64>;
-def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+ def : MipsPat<(MipsLo tblockaddress:$in),
+ (DADDiu ZERO_64, tblockaddress:$in)>;
+ def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+ def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+ def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+ (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+ (DADDiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+ (DADDiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+ (DADDiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+ def : WrapperPat<tconstpool, DADDiu, GPR64>;
+ def : WrapperPat<texternalsym, DADDiu, GPR64>;
+ def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+ def : WrapperPat<tjumptable, DADDiu, GPR64>;
+ def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+}
defm : BrcondPats<GPR64, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
ZERO_64>;
@@ -502,7 +555,17 @@ defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
// truncate
def : MipsPat<(trunc (assertsext GPR64:$src)),
(EXTRACT_SUBREG GPR64:$src, sub_32)>;
-def : MipsPat<(trunc (assertzext GPR64:$src)),
+// The forward compatibility strategy employed by MIPS requires us to treat
+// values as being sign extended to an infinite number of bits. This allows
+// existing software to run without modification on any future MIPS
+// implementation (e.g. 128-bit, or 1024-bit). Being compatible with this
+// strategy requires that truncation acts as a sign-extension for values being
+// fed into instructions operating on 32-bit values. Such instructions have
+// undefined results if this is not true.
+// For our case, this means that we can't issue an extract_subreg for nodes
+// such as (trunc:i32 (assertzext:i64 X, i32)), because the sign-bit of the
+// lower subreg would not be replicated into the upper half.
+def : MipsPat<(trunc (assertzext_lt_i32 GPR64:$src)),
(EXTRACT_SUBREG GPR64:$src, sub_32)>;
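// (A trunc of an assertzext asserting i32 or wider falls through to the
// generic i32 truncation pattern below, which goes through "sll ..., 0" and so
// re-sign-extends the low word.)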
def : MipsPat<(i32 (trunc GPR64:$src)),
(SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
@@ -514,11 +577,14 @@ def : MipsPat<(srl GPR64:$rt, (i32 (trunc GPR64:$rs))),
(DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
def : MipsPat<(sra GPR64:$rt, (i32 (trunc GPR64:$rs))),
(DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
-def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
- (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+ (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+}
// 32-to-64-bit extension
-def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+def : MipsPat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
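// (Presumably preferred over the old SLL64_32 form because anyext leaves the
// upper 32 bits undefined, so no "sll ..., 0" needs to be issued at all.)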
def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
@@ -530,26 +596,24 @@ def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
// Carry pattern
-def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
- (DSUBu GPR64:$lhs, GPR64:$rhs)>;
-let AdditionalPredicates = [NotDSP] in {
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+ (DSUBu GPR64:$lhs, GPR64:$rhs)>;
def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
- (DADDu GPR64:$lhs, GPR64:$rhs)>;
+ (DADDu GPR64:$lhs, GPR64:$rhs)>, ASE_NOT_DSP;
def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
- (DADDiu GPR64:$lhs, imm:$imm)>;
+ (DADDiu GPR64:$lhs, imm:$imm)>, ASE_NOT_DSP;
}
// Octeon bbit0/bbit1 MipsPattern
-let Predicates = [HasMips64, HasCnMips] in {
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>;
+ (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>;
+ (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
- (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>;
+ (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
- (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>;
-}
+ (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
// Atomic load patterns.
def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
@@ -566,39 +630,40 @@ def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
-def : MipsInstAlias<"move $dst, $src",
- (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
- GPR_64;
-def : MipsInstAlias<"move $dst, $src",
- (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
- GPR_64;
-def : MipsInstAlias<"daddu $rs, $rt, $imm",
- (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
- 0>, ISA_MIPS3;
-def : MipsInstAlias<"dadd $rs, $rt, $imm",
- (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
- 0>, ISA_MIPS3_NOT_32R6_64R6;
-def : MipsInstAlias<"daddu $rs, $imm",
- (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
- 0>, ISA_MIPS3;
-def : MipsInstAlias<"dadd $rs, $imm",
- (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
- 0>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"move $dst, $src",
+ (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
+ GPR_64;
+ def : MipsInstAlias<"move $dst, $src",
+ (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
+ GPR_64;
+ def : MipsInstAlias<"dadd $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"dadd $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"daddu $rs, $rt, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+ 0>, ISA_MIPS3;
+ def : MipsInstAlias<"daddu $rs, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>, ISA_MIPS3;
+}
def : MipsInstAlias<"dsll $rd, $rt, $rs",
(DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
-def : MipsInstAlias<"dneg $rt, $rs",
- (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
- ISA_MIPS3;
-def : MipsInstAlias<"dneg $rt",
- (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 0>,
- ISA_MIPS3;
-def : MipsInstAlias<"dnegu $rt, $rs",
- (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
- ISA_MIPS3;
-def : MipsInstAlias<"dsubu $rt, $rs, $imm",
- (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
- InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dneg $rt, $rs",
+ (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dneg $rt",
+ (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 0>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dnegu $rt, $rs",
+ (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+ ISA_MIPS3;
+}
def : MipsInstAlias<"dsubi $rs, $rt, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
InvertedImOperand64:$imm),
@@ -615,29 +680,35 @@ def : MipsInstAlias<"dsub $rs, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
InvertedImOperand64:$imm),
0>, ISA_MIPS3_NOT_32R6_64R6;
-def : MipsInstAlias<"dsubu $rs, $imm",
- (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
- InvertedImOperand64:$imm),
- 0>, ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+ (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+ def : MipsInstAlias<"dsubu $rs, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+}
def : MipsInstAlias<"dsra $rd, $rt, $rs",
(DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
-def : MipsInstAlias<"dsrl $rd, $rt, $rs",
- (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
- ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+ (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
// Two operand (implicit 0 selector) versions:
-def : MipsInstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
-def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+ def : MipsInstAlias<"dmtc0 $rt, $rd",
+ (DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+ def : MipsInstAlias<"dmfc0 $rt, $rd",
+ (DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+}
def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, COP2Opnd:$rd, 0), 0>;
def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 COP2Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
-let Predicates = [HasMips64, HasCnMips] in {
-def : MipsInstAlias<"synciobdma", (SYNC 0x2), 0>;
-def : MipsInstAlias<"syncs", (SYNC 0x6), 0>;
-def : MipsInstAlias<"syncw", (SYNC 0x4), 0>;
-def : MipsInstAlias<"syncws", (SYNC 0x5), 0>;
-}
+def : MipsInstAlias<"synciobdma", (SYNC 0x2), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncs", (SYNC 0x6), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncw", (SYNC 0x4), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncws", (SYNC 0x5), 0>, ASE_MIPS64_CNMIPS;
// cnMIPS Aliases.
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index 6f34dbe28d30..64effbef8a6a 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -46,35 +46,50 @@ class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
//
//===----------------------------------------------------------------------===//
-class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                        InstrItinClass itin> {
dag OutOperandList = (outs GPROpnd:$rs);
- dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+ dag InOperandList = (ins GPROpnd:$rt, simm16_relaxed:$imm);
string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
string Constraints = "$rs = $rt";
+ InstrItinClass Itinerary = itin;
}
-class DALIGN_DESC : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>;
-class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>;
-class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>;
-class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd>;
-class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>;
-class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>;
-class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>;
-class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>;
-class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>;
-class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1>;
-class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>;
-class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>;
-class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>;
-class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, mulhu>;
-class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, mul>;
-class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>;
-class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>;
-class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd>;
-class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd>;
+class DALIGN_DESC : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3, II_DALIGN>;
+class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd, II_DAHI>;
+class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd, II_DATI>;
+class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd, II_DAUI>;
+class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd, II_DBITSWAP>;
+class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd, II_DCLO>;
+class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd, II_DCLZ>;
+class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, II_DDIV, sdiv>;
+class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, II_DDIVU, udiv>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1, II_DLSA>;
+class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, II_DMOD, srem>;
+class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, II_DMODU, urem>;
+class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH, mulhs>;
+class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
+class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
+class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
+class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm16, II_LLD>;
+class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
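+
+// JIALC64 and JIC64 are the GPR64 forms of the R6 compact jumps: JIALC64 is
+// marked as a call and defines RA, while JIC64 is an unconditional
+// terminator/barrier and defines AT.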
+class JIALC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+ GPR64Opnd> {
+ bit isCall = 1;
+ list<Register> Defs = [RA];
+}
+
+class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd> {
+ bit isBarrier = 1;
+ bit isTerminator = 1;
+ list<Register> Defs = [AT];
+}
+
+class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
+class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -88,25 +103,37 @@ let AdditionalPredicates = [NotInMicroMips] in {
def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
}
def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
-def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
-def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
-def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
-def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DCLO_R6 : R6MMR6Rel, DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
+ def DCLZ_R6 : R6MMR6Rel, DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
+ def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
+ def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+ def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
+ def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
+}
def DLSA_R6 : DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6;
-def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
-def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
-def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
-def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
-def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
-def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
-def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
-def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
+ def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
+ def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
+ def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
+ def LLD_R6 : R6MMR6Rel, LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6;
+}
+def LDPC: R6MMR6Rel, LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6;
let DecoderNamespace = "Mips32r6_64r6_GP64" in {
def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
}
-
+let AdditionalPredicates = [NotInMicroMips],
+ DecoderNamespace = "Mips32r6_64r6_PTR64" in {
+ def LL64_R6 : LL_R6_ENC, LL64_R6_DESC, PTR_64, ISA_MIPS64R6;
+ def SC64_R6 : SC_R6_ENC, SC64_R6_DESC, PTR_64, ISA_MIPS64R6;
+}
+let isCodeGenOnly = 1 in {
+def JIALC64 : JIALC_ENC, JIALC64_DESC, ISA_MIPS64R6;
+def JIC64 : JIC_ENC, JIC64_DESC, ISA_MIPS64R6;
+}
//===----------------------------------------------------------------------===//
//
// Instruction Aliases
@@ -115,6 +142,9 @@ let DecoderNamespace = "Mips32r6_64r6_GP64" in {
def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6;
+def : MipsInstAlias<"jrc $rs", (JIC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
+
+def : MipsInstAlias<"jalrc $rs", (JIALC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
//===----------------------------------------------------------------------===//
//
// Patterns and Pseudo Instructions
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 957529376b37..3686c2fe3ec4 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -313,7 +313,6 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
case MipsABIInfo::ABI::O32: return "abi32";
case MipsABIInfo::ABI::N32: return "abiN32";
case MipsABIInfo::ABI::N64: return "abi64";
- case MipsABIInfo::ABI::EABI: return "eabi32"; // TODO: handle eabi64
default: llvm_unreachable("Unknown Mips ABI");
}
}
@@ -326,9 +325,10 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
if (Subtarget->isTargetNaCl())
EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
- if (Subtarget->inMicroMipsMode())
+ if (Subtarget->inMicroMipsMode()) {
TS.emitDirectiveSetMicroMips();
- else
+ TS.setUsesMicroMips();
+ } else
TS.emitDirectiveSetNoMicroMips();
if (Subtarget->inMips16Mode())
@@ -620,24 +620,6 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
if (closeP) O << ")";
}
-void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
- raw_ostream &O) {
- const MachineOperand &MO = MI->getOperand(opNum);
- if (MO.isImm())
- O << (unsigned short int)MO.getImm();
- else
- printOperand(MI, opNum, O);
-}
-
-void MipsAsmPrinter::printUnsignedImm8(const MachineInstr *MI, int opNum,
- raw_ostream &O) {
- const MachineOperand &MO = MI->getOperand(opNum);
- if (MO.isImm())
- O << (unsigned short int)(unsigned char)MO.getImm();
- else
- printOperand(MI, opNum, O);
-}
-
void MipsAsmPrinter::
printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
// Load/Store memory operands -- imm($reg)
@@ -687,6 +669,12 @@ printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) {
}
void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ MipsTargetStreamer &TS = getTargetStreamer();
+
+ // MipsTargetStreamer has an initialization order problem when emitting an
+ // object file directly (see MipsTargetELFStreamer for full details). Work
+ // around it by re-initializing the PIC state here.
+ TS.setPic(OutContext.getObjectFileInfo()->isPositionIndependent());
// Compute MIPS architecture attributes based on the default subtarget
// that we'd have constructed. Module level directives aren't LTO
@@ -702,14 +690,13 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
bool IsABICalls = STI.isABICalls();
const MipsABIInfo &ABI = MTM.getABI();
if (IsABICalls) {
- getTargetStreamer().emitDirectiveAbiCalls();
- Reloc::Model RM = TM.getRelocationModel();
+ TS.emitDirectiveAbiCalls();
     // FIXME: This condition should be a lot more complicated than it is here.
// Ideally it should test for properties of the ABI and not the ABI
// itself.
// For the moment, I'm only correcting enough to make MIPS-IV work.
- if (RM == Reloc::Static && !ABI.IsN64())
- getTargetStreamer().emitDirectiveOptionPic0();
+ if (!isPositionIndependent() && !ABI.IsN64())
+ TS.emitDirectiveOptionPic0();
}
// Tell the assembler which ABI we are using
@@ -720,33 +707,24 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
// NaN: At the moment we only support:
// 1. .nan legacy (default)
// 2. .nan 2008
- STI.isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008()
- : getTargetStreamer().emitDirectiveNaNLegacy();
+ STI.isNaN2008() ? TS.emitDirectiveNaN2008()
+ : TS.emitDirectiveNaNLegacy();
// TODO: handle O64 ABI
- if (ABI.IsEABI()) {
- if (STI.isGP32bit())
- OutStreamer->SwitchSection(OutContext.getELFSection(".gcc_compiled_long32",
- ELF::SHT_PROGBITS, 0));
- else
- OutStreamer->SwitchSection(OutContext.getELFSection(".gcc_compiled_long64",
- ELF::SHT_PROGBITS, 0));
- }
-
- getTargetStreamer().updateABIInfo(STI);
+ TS.updateABIInfo(STI);
// We should always emit a '.module fp=...' but binutils 2.24 does not accept
// it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
// -mfp64) and omit it otherwise.
if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit()))
- getTargetStreamer().emitDirectiveModuleFP();
+ TS.emitDirectiveModuleFP();
// We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
// accept it. We therefore emit it when it contradicts the default or an
// option has changed the default (i.e. FPXX) and omit it otherwise.
if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX()))
- getTargetStreamer().emitDirectiveModuleOddSPReg();
+ TS.emitDirectiveModuleOddSPReg();
}
void MipsAsmPrinter::emitInlineAsmStart() const {
@@ -990,7 +968,7 @@ void MipsAsmPrinter::EmitFPCallStub(
OutStreamer->EmitLabel(Stub);
// Only handle non-pic for now.
- assert(TM.getRelocationModel() != Reloc::PIC_ &&
+ assert(!isPositionIndependent() &&
"should not be here if we are compiling pic");
TS.emitDirectiveSetReorder();
//
@@ -1071,10 +1049,9 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
}
// If basic block address is taken, block can be target of indirect branch.
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
- MBB != E; ++MBB) {
- if (MBB->hasAddressTaken())
- MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ for (auto &MBB : MF) {
+ if (MBB.hasAddressTaken())
+ MBB.setAlignment(MIPS_NACL_BUNDLE_ALIGN);
}
}
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index a7f3304a3da8..f30141fc918f 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -134,8 +134,6 @@ public:
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) override;
void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
- void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
- void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index d82063e3d2a9..7af988c1f64d 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -26,8 +26,8 @@ static bool isF128SoftLibCall(const char *CallSym) {
"ceill", "copysignl", "cosl", "exp2l",
"expl", "floorl", "fmal", "fmodl",
"log10l", "log2l", "logl", "nearbyintl",
- "powl", "rintl", "sinl", "sqrtl",
- "truncl"};
+ "powl", "rintl", "roundl", "sinl",
+ "sqrtl", "truncl"};
// Check that LibCalls is sorted alphabetically.
auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 0b4b7785af67..a57cb7badc17 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -212,48 +212,6 @@ def RetCC_MipsN : CallingConv<[
]>;
//===----------------------------------------------------------------------===//
-// Mips EABI Calling Convention
-//===----------------------------------------------------------------------===//
-
-def CC_MipsEABI : CallingConv<[
- // Promote i8/i16 arguments to i32.
- CCIfType<[i8, i16], CCPromoteToType<i32>>,
-
- // Integer arguments are passed in integer registers.
- CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
-
- // Single fp arguments are passed in pairs within 32-bit mode
- CCIfType<[f32], CCIfSubtarget<"isSingleFloat()",
- CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
-
- CCIfType<[f32], CCIfSubtargetNot<"isSingleFloat()",
- CCAssignToReg<[F12, F14, F16, F18]>>>,
-
- // The first 4 double fp arguments are passed in single fp registers.
- CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()",
- CCAssignToReg<[D6, D7, D8, D9]>>>,
-
- // Integer values get stored in stack slots that are 4 bytes in
- // size and 4-byte aligned.
- CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-
- // Integer values get stored in stack slots that are 8 bytes in
- // size and 8-byte aligned.
- CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()", CCAssignToStack<8, 8>>>
-]>;
-
-def RetCC_MipsEABI : CallingConv<[
- // i32 are returned in registers V0, V1
- CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
-
- // f32 are returned in registers F0, F1
- CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
-
- // f64 are returned in register D0
- CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()", CCAssignToReg<[D0]>>>
-]>;
-
-//===----------------------------------------------------------------------===//
// Mips FastCC Calling Convention
//===----------------------------------------------------------------------===//
def CC_MipsO32_FastCC : CallingConv<[
@@ -325,7 +283,6 @@ def CC_Mips_FastCC : CallingConv<[
// Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
- CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
CCDelegateTo<CC_MipsN_FastCC>
]>;
@@ -335,7 +292,6 @@ def CC_Mips_FastCC : CallingConv<[
//===----------------------------------------------------------------------===//
def RetCC_Mips : CallingConv<[
- CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
CCDelegateTo<RetCC_MipsO32>
@@ -377,8 +333,6 @@ def CC_Mips_FixedArg : CallingConv<[
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_Mips_FastCC>>,
- // FIXME: There wasn't an EABI case in the original code and it seems unlikely
- // that it's the same as CC_MipsN
CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
CCDelegateTo<CC_MipsN>
]>;
@@ -386,8 +340,6 @@ def CC_Mips_FixedArg : CallingConv<[
def CC_Mips_VarArg : CallingConv<[
CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
- // FIXME: There wasn't an EABI case in the original code and it seems unlikely
- // that it's the same as CC_MipsN_VarArg
CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
CCDelegateTo<CC_MipsN_VarArg>
]>;
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 2d96d9b48c0b..fd4517f25335 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -133,16 +133,14 @@ def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
let isCodeGenOnly = 1 in
def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
- AdditionalRequires<[HasMips64]>;
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
- AdditionalRequires<[IsGP64bit]>;
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
II_MOVZ_D>, CMov_I_F_FM<18, 17>,
@@ -169,16 +167,14 @@ def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
let isCodeGenOnly = 1 in
def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6,
- AdditionalRequires<[IsGP64bit]>;
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6,
- AdditionalRequires<[IsGP64bit]>;
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index ea8c5871fa0e..1ea48e0439d4 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -97,16 +97,6 @@ static unsigned int branchTargetOperand(MachineInstr *MI) {
llvm_unreachable("Unknown branch type");
}
-static bool isUnconditionalBranch(unsigned int Opcode) {
- switch (Opcode) {
- default: return false;
- case Mips::Bimm16:
- case Mips::BimmX16:
- case Mips::JalB16:
- return true;
- }
-}
-
static unsigned int longformBranchOpcode(unsigned int Opcode) {
switch (Opcode) {
case Mips::Bimm16:
@@ -342,8 +332,6 @@ namespace {
/// the branch fix up pass.
bool HasFarJump;
- const TargetMachine &TM;
- bool IsPIC;
const MipsSubtarget *STI;
const Mips16InstrInfo *TII;
MipsFunctionInfo *MFI;
@@ -364,10 +352,9 @@ namespace {
public:
static char ID;
- MipsConstantIslands(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_), STI(nullptr),
- MF(nullptr), MCP(nullptr), PrescannedForConstants(false) {}
+ MipsConstantIslands()
+ : MachineFunctionPass(ID), STI(nullptr), MF(nullptr), MCP(nullptr),
+ PrescannedForConstants(false) {}
const char *getPassName() const override {
return "Mips Constant Islands";
@@ -375,9 +362,14 @@ namespace {
bool runOnMachineFunction(MachineFunction &F) override;
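+  // This pass runs after register allocation, so declare that every virtual
+  // register must already have been allocated.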
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- unsigned getCPELogAlign(const MachineInstr *CPEMI);
+ unsigned getCPELogAlign(const MachineInstr &CPEMI);
void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
unsigned getOffsetOf(MachineInstr *MI) const;
unsigned getUserOffset(CPUser&) const;
@@ -389,7 +381,7 @@ namespace {
const CPUser &U);
void computeBlockSize(MachineBasicBlock *MBB);
- MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
void adjustBBOffsetsAfter(MachineBasicBlock *BB);
bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
@@ -437,10 +429,9 @@ void MipsConstantIslands::dumpBBs() {
}
});
}
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
- return new MipsConstantIslands(tm);
+/// Returns a pass that places constant pool entries (constant islands) and
+/// fixes up branches that cannot reach their targets.
+FunctionPass *llvm::createMipsConstantIslandPass() {
+ return new MipsConstantIslands();
}
bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
@@ -629,14 +620,14 @@ MipsConstantIslands::CPEntry
/// getCPELogAlign - Returns the required alignment of the constant pool entry
/// represented by CPEMI. Alignment is measured in log2(bytes) units.
-unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
- assert(CPEMI && CPEMI->getOpcode() == Mips::CONSTPOOL_ENTRY);
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) {
+ assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY);
// Everything is 4-byte aligned unless AlignConstantIslands is set.
if (!AlignConstantIslands)
return 2;
- unsigned CPI = CPEMI->getOperand(1).getIndex();
+ unsigned CPI = CPEMI.getOperand(1).getIndex();
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
unsigned Align = MCP->getConstants()[CPI].getAlignment();
assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
@@ -663,21 +654,17 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
+ for (MachineBasicBlock &MBB : *MF) {
     // If this block doesn't fall through into the next MBB, then this is
     // 'water' where a constant pool island could be placed.
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- if (I->isDebugValue())
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
continue;
- int Opc = I->getOpcode();
- if (I->isBranch()) {
+ int Opc = MI.getOpcode();
+ if (MI.isBranch()) {
bool isCond = false;
unsigned Bits = 0;
unsigned Scale = 1;
@@ -746,7 +733,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
}
// Record this immediate branch.
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
- ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+ ImmBranches.push_back(ImmBranch(&MI, MaxOffs, isCond, UOpc));
}
if (Opc == Mips::CONSTPOOL_ENTRY)
@@ -754,8 +741,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Scan the instructions for constant pool operands.
- for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
- if (I->getOperand(op).isCPI()) {
+ for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op)
+ if (MI.getOperand(op).isCPI()) {
// We found one. The addressing mode tells us the max displacement
// from the PC that this instruction permits.
@@ -784,12 +771,12 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
break;
}
// Remember that this is a user of a CP entry.
- unsigned CPI = I->getOperand(op).getIndex();
+ unsigned CPI = MI.getOperand(op).getIndex();
MachineInstr *CPEMI = CPEMIs[CPI];
unsigned MaxOffs = ((1 << Bits)-1) * Scale;
unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
- CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk,
- LongFormMaxOffs, LongFormOpcode));
+ CPUsers.push_back(CPUser(&MI, CPEMI, MaxOffs, NegOk, LongFormMaxOffs,
+ LongFormOpcode));
// Increment corresponding CPEntry reference count.
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
@@ -813,10 +800,8 @@ void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
BBI.Size = 0;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
- ++I)
- BBI.Size += TII->GetInstSizeInBytes(I);
-
+ for (const MachineInstr &MI : *MBB)
+ BBI.Size += TII->GetInstSizeInBytes(MI);
}
/// getOffsetOf - Return the current offset of the specified machine instruction
@@ -833,7 +818,7 @@ unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->GetInstSizeInBytes(I);
+ Offset += TII->GetInstSizeInBytes(*I);
}
return Offset;
}
@@ -872,9 +857,9 @@ unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
/// Split the basic block containing MI into two blocks, which are joined by
/// an unconditional branch. Update data structures and renumber blocks to
/// account for this change and returns the newly created block.
-MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
- (MachineInstr *MI) {
- MachineBasicBlock *OrigBB = MI->getParent();
+MachineBasicBlock *
+MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
+ MachineBasicBlock *OrigBB = MI.getParent();
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
@@ -964,7 +949,7 @@ bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
MachineBasicBlock* Water, CPUser &U,
unsigned &Growth) {
- unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+ unsigned CPELogAlign = getCPELogAlign(*U.CPEMI);
unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
unsigned NextBlockOffset, NextBlockAlignment;
MachineFunction::const_iterator NextBlock = ++Water->getIterator();
@@ -985,7 +970,7 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+ Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
@@ -1246,7 +1231,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
- unsigned CPELogAlign = getCPELogAlign(CPEMI);
+ unsigned CPELogAlign = getCPELogAlign(*CPEMI);
MachineBasicBlock *UserMBB = UserMI->getParent();
const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
@@ -1312,11 +1297,12 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned CPUIndex = CPUserIndex+1;
unsigned NumCPUsers = CPUsers.size();
//MachineInstr *LastIT = 0;
- for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+ for (unsigned Offset = UserOffset + TII->GetInstSizeInBytes(*UserMI);
Offset < BaseInsertOffset;
- Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+ Offset += TII->GetInstSizeInBytes(*MI), MI = std::next(MI)) {
assert(MI != UserMBB->end() && "Fell off end of block");
- if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ if (CPUIndex < NumCPUsers &&
+ CPUsers[CPUIndex].MI == static_cast<MachineInstr *>(MI)) {
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
         // Shift insertion point by one unit of alignment so it is within reach.
@@ -1332,8 +1318,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
}
}
- --MI;
- NewMBB = splitBlockBeforeInstr(MI);
+ NewMBB = splitBlockBeforeInstr(*--MI);
}
/// handleConstantPoolUser - Analyze the specified user, checking to see if it
@@ -1426,7 +1411,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
++NumCPEs;
// Mark the basic block as aligned as required by the const-pool entry.
- NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+ NewIsland->setAlignment(getCPELogAlign(*U.CPEMI));
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
@@ -1460,7 +1445,7 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
CPEBB->setAlignment(0);
} else
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+ CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin()));
adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1610,7 +1595,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
++NumCBrFixed;
if (BMI != MI) {
if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
- isUnconditionalBranch(BMI->getOpcode())) {
+ BMI->isUnconditionalBranch()) {
// Last MI in the BB is an unconditional branch. Can we simply invert the
// condition and swap destinations:
// beqz L1
@@ -1634,10 +1619,10 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
if (NeedSplit) {
- splitBlockBeforeInstr(MI);
+ splitBlockBeforeInstr(*MI);
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
- int delta = TII->GetInstSizeInBytes(&MBB->back());
+ int delta = TII->GetInstSizeInBytes(MBB->back());
BBInfo[MBB->getNumber()].Size -= delta;
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
@@ -1659,14 +1644,14 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
.addMBB(NextBB);
}
Br.MI = &MBB->back();
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
- BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back());
unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
// Remove the old conditional branch. It may or may not still be in MBB.
- BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI);
MI->eraseFromParent();
adjustBBOffsetsAfter(MBB);
return true;
@@ -1710,4 +1695,3 @@ void MipsConstantIslands::prescanForConstants() {
}
}
}
-
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
index f959bd4d8db3..0ceb1858fb09 100644
--- a/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -32,6 +32,10 @@ class ISA_DSPR2 {
list<Predicate> InsnPredicates = [HasDSPR2];
}
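+
+// Adjunct predicate class for instructions that require the DSP R3 ASE.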
+class ISA_DSPR3 {
+ list<Predicate> InsnPredicates = [HasDSPR3];
+}
+
// Fields.
class Field6<bits<6> val> {
bits<6> V = val;
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index da6f174e2a19..ac9a81b1bb2f 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -16,7 +16,6 @@ def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>;
def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
-def immZExt7 : ImmLeaf<i32, [{return isUInt<7>(Imm);}]>;
def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
@@ -287,6 +286,7 @@ class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -324,9 +324,10 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
}
class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- ImmLeaf immPat, InstrItinClass itin, RegisterOperand RO> {
+ Operand ImmOp, ImmLeaf immPat, InstrItinClass itin,
+ RegisterOperand RO> {
dag OutOperandList = (outs RO:$rd);
- dag InOperandList = (ins uimm16:$imm);
+ dag InOperandList = (ins ImmOp:$imm);
string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
InstrItinClass Itinerary = itin;
@@ -401,7 +402,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
dag OutOperandList = (outs GPR32Opnd:$rt);
- dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
@@ -440,7 +441,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
dag OutOperandList = (outs GPR32Opnd:$rd);
- dag InOperandList = (ins uimm16:$mask);
+ dag InOperandList = (ins uimm10:$mask);
string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
@@ -513,14 +514,16 @@ class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
bit usesCustomInserter = 1;
}
-class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
+class BPOSGE32_DESC_BASE<string instr_asm, DAGOperand opnd,
+ InstrItinClass itin> {
dag OutOperandList = (outs);
- dag InOperandList = (ins brtarget:$offset);
+ dag InOperandList = (ins opnd:$offset);
string AsmString = !strconcat(instr_asm, "\t$offset");
InstrItinClass Itinerary = itin;
bit isBranch = 1;
bit isTerminator = 1;
bit hasDelaySlot = 1;
+ string BaseOpcode = instr_asm;
}
class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -845,11 +848,11 @@ class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
NoItinerary, DSPROpnd, DSPROpnd>;
-class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
- NoItinerary, DSPROpnd>;
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, uimm8,
+ immZExt8, NoItinerary, DSPROpnd>;
-class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
- NoItinerary, DSPROpnd>;
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, uimm10,
+ immZExt10, NoItinerary, DSPROpnd>;
class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
NoItinerary, DSPROpnd, GPR32Opnd>;
@@ -871,7 +874,7 @@ class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>;
class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
-class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", NoItinerary>;
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget, NoItinerary>;
// Extr
class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
@@ -1115,7 +1118,7 @@ def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC;
def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC;
def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC;
def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC;
-def MODSUB : MODSUB_ENC, MODSUB_DESC;
+def MODSUB : DspMMRel, MODSUB_ENC, MODSUB_DESC;
def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC;
def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
@@ -1154,7 +1157,7 @@ def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
-def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MULSAQ_S_W_PH : DspMMRel, MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
@@ -1177,16 +1180,16 @@ def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC;
def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC;
def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC;
def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC;
-def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
-def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
-def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
-def CMPGU_EQ_QB : CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
-def CMPGU_LT_QB : CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
-def CMPGU_LE_QB : CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
-def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
-def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
-def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
-def BITREV : BITREV_ENC, BITREV_DESC;
+def CMPU_EQ_QB : DspMMRel, CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : DspMMRel, CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : DspMMRel, CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : DspMMRel, CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : DspMMRel, CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : DspMMRel, CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : DspMMRel, CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : DspMMRel, CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : DspMMRel, CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : DspMMRel, BITREV_ENC, BITREV_DESC;
def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC;
def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC;
def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC;
@@ -1197,7 +1200,9 @@ def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC;
def LWX : DspMMRel, LWX_ENC, LWX_DESC;
def LHX : DspMMRel, LHX_ENC, LHX_DESC;
def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC;
-def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def BPOSGE32 : DspMMRel, BPOSGE32_ENC, BPOSGE32_DESC;
+}
def INSV : DspMMRel, INSV_ENC, INSV_DESC;
def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC;
def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC;
@@ -1224,9 +1229,9 @@ def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2;
def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2;
def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
-def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2;
-def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2;
-def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB : DspMMRel, CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2;
+def CMPGDU_LT_QB : DspMMRel, CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2;
+def CMPGDU_LE_QB : DspMMRel, CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2;
def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2;
def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2;
def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
@@ -1253,7 +1258,7 @@ def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2;
def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2;
-def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+def MULSA_W_PH : DspMMRel, MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2;
def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2;
def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
@@ -1263,12 +1268,12 @@ def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2;
def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2;
def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2;
def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2;
-def APPEND : APPEND_ENC, APPEND_DESC, ISA_DSPR2;
-def BALIGN : BALIGN_ENC, BALIGN_DESC, ISA_DSPR2;
+def APPEND : DspMMRel, APPEND_ENC, APPEND_DESC, ISA_DSPR2;
+def BALIGN : DspMMRel, BALIGN_ENC, BALIGN_DESC, ISA_DSPR2;
def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2;
// Pseudos.
-let isPseudo = 1, isCodeGenOnly = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
// Pseudo instructions for loading and storing accumulator registers.
def LOAD_ACC64DSP : Load<"", ACC64DSPOpnd>;
def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 8313d909df2a..b5ba770df7bd 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -62,6 +62,28 @@ static cl::opt<bool> DisableBackwardSearch(
cl::desc("Disallow MIPS delay filler to search backward."),
cl::Hidden);
+enum CompactBranchPolicy {
+ CB_Never, ///< The policy 'never' may in some circumstances or for some
+ ///< ISAs not be absolutely adhered to.
+ CB_Optimal, ///< Optimal is the default and will produce compact branches
+ ///< when delay slots cannot be filled.
+  CB_Always   ///< 'always' may in some circumstances not be absolutely
+              ///< adhered to, as there may not be a corresponding compact
+              ///< form of a branch.
+};
+
+static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
+ "mips-compact-branches",cl::Optional,
+ cl::init(CB_Optimal),
+ cl::desc("MIPS Specific: Compact branch policy."),
+ cl::values(
+ clEnumValN(CB_Never, "never", "Do not use compact branches if possible."),
+    clEnumValN(CB_Optimal, "optimal", "Use compact branches where appropriate (default)."),
+ clEnumValN(CB_Always, "always", "Always use compact branches if possible."),
+ clEnumValEnd
+ )
+);
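+
+// The policy is chosen on the command line via
+// -mips-compact-branches=<never|optimal|always>, e.g. (illustrative
+// invocation): llc -mtriple=mipsel -mips-compact-branches=always test.ll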
+
namespace {
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
@@ -189,6 +211,11 @@ namespace {
return Changed;
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -197,11 +224,8 @@ namespace {
private:
bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
- Iter replaceWithCompactBranch(MachineBasicBlock &MBB,
- Iter Branch, DebugLoc DL);
-
- Iter replaceWithCompactJump(MachineBasicBlock &MBB,
- Iter Jump, DebugLoc DL);
+ Iter replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+ const DebugLoc &DL);
/// This function checks if it is valid to move Candidate to the delay slot
/// and returns true if it isn't. It also updates memory and register
@@ -505,46 +529,18 @@ getUnderlyingObjects(const MachineInstr &MI,
}
// Replace Branch with the compact branch instruction.
-Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB,
- Iter Branch, DebugLoc DL) {
- const MipsInstrInfo *TII =
- MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
-
- unsigned NewOpcode =
- (((unsigned) Branch->getOpcode()) == Mips::BEQ) ? Mips::BEQZC_MM
- : Mips::BNEZC_MM;
-
- const MCInstrDesc &NewDesc = TII->get(NewOpcode);
- MachineInstrBuilder MIB = BuildMI(MBB, Branch, DL, NewDesc);
-
- MIB.addReg(Branch->getOperand(0).getReg());
- MIB.addMBB(Branch->getOperand(2).getMBB());
+Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+ const DebugLoc &DL) {
+ const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+ const MipsInstrInfo *TII = STI.getInstrInfo();
- Iter tmpIter = Branch;
- Branch = std::prev(Branch);
- MBB.erase(tmpIter);
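+  // Build the equivalent compact (no delay slot) branch immediately before
+  // the original instruction, then erase the original delay-slot form that
+  // now follows it.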
+ unsigned NewOpcode = TII->getEquivalentCompactForm(Branch);
+ Branch = TII->genInstrWithNewOpc(NewOpcode, Branch);
+ std::next(Branch)->eraseFromParent();
return Branch;
}
-// Replace Jumps with the compact jump instruction.
-Iter Filler::replaceWithCompactJump(MachineBasicBlock &MBB,
- Iter Jump, DebugLoc DL) {
- const MipsInstrInfo *TII =
- MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
-
- const MCInstrDesc &NewDesc = TII->get(Mips::JRC16_MM);
- MachineInstrBuilder MIB = BuildMI(MBB, Jump, DL, NewDesc);
-
- MIB.addReg(Jump->getOperand(0).getReg());
-
- Iter tmpIter = Jump;
- Jump = std::prev(Jump);
- MBB.erase(tmpIter);
-
- return Jump;
-}
-
// For given opcode returns opcode of corresponding instruction with short
// delay slot.
static int getEquivalentCallShort(int Opcode) {
@@ -572,6 +568,12 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool InMicroMipsMode = STI.inMicroMipsMode();
const MipsInstrInfo *TII = STI.getInstrInfo();
+ if (InMicroMipsMode && STI.hasMips32r6()) {
+    // This is a microMIPS32r6 or microMIPS64r6 processor. Delay slots for
+    // branching instructions are not needed.
+ return Changed;
+ }
+
for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
if (!hasUnoccupiedSlot(&*I))
continue;
@@ -583,55 +585,49 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
bool Filled = false;
- if (searchBackward(MBB, I)) {
- Filled = true;
- } else if (I->isTerminator()) {
- if (searchSuccBBs(MBB, I)) {
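+    // Try to fill the delay slot unless the policy is 'always' and a compact
+    // form of this branch exists; in that case prefer the compact-branch
+    // replacement performed below.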
+ if (MipsCompactBranchPolicy.getValue() != CB_Always ||
+ !TII->getEquivalentCompactForm(I)) {
+ if (searchBackward(MBB, I)) {
+ Filled = true;
+ } else if (I->isTerminator()) {
+ if (searchSuccBBs(MBB, I)) {
+ Filled = true;
+ }
+ } else if (searchForward(MBB, I)) {
Filled = true;
}
- } else if (searchForward(MBB, I)) {
- Filled = true;
}
if (Filled) {
// Get instruction with delay slot.
- MachineBasicBlock::instr_iterator DSI(I);
+ MachineBasicBlock::instr_iterator DSI = I.getInstrIterator();
- if (InMicroMipsMode && TII->GetInstSizeInBytes(&*std::next(DSI)) == 2 &&
+ if (InMicroMipsMode && TII->GetInstSizeInBytes(*std::next(DSI)) == 2 &&
DSI->isCall()) {
// If instruction in delay slot is 16b change opcode to
// corresponding instruction with short delay slot.
DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
}
-
continue;
}
}
- // If instruction is BEQ or BNE with one ZERO register, then instead of
- // adding NOP replace this instruction with the corresponding compact
- // branch instruction, i.e. BEQZC or BNEZC.
- unsigned Opcode = I->getOpcode();
- if (InMicroMipsMode) {
- switch (Opcode) {
- case Mips::BEQ:
- case Mips::BNE:
- if (((unsigned) I->getOperand(1).getReg()) == Mips::ZERO) {
- I = replaceWithCompactBranch(MBB, I, I->getDebugLoc());
- continue;
- }
- break;
- case Mips::JR:
- case Mips::PseudoReturn:
- case Mips::PseudoIndirectBranch:
- // For microMIPS the PseudoReturn and PseudoIndirectBranch are allways
- // expanded to JR_MM, so they can be replaced with JRC16_MM.
- I = replaceWithCompactJump(MBB, I, I->getDebugLoc());
- continue;
- default:
- break;
- }
+    // For microMIPS, if the instruction is BEQ or BNE with one ZERO register,
+    // then instead of adding a NOP, replace the instruction with the
+    // corresponding compact branch, i.e. BEQZC or BNEZC. Additionally,
+    // PseudoReturn and PseudoIndirectBranch are expanded to JR_MM, so they
+    // can be replaced with JRC16_MM.
+
+    // For MIPSR6, attempt to produce the corresponding compact (no delay
+    // slot) form of the CTI. For indirect jumps this does not require
+    // inserting a NOP, and for branches it will hopefully avoid the NOP.
+ if ((InMicroMipsMode ||
+ (STI.hasMips32r6() && MipsCompactBranchPolicy != CB_Never)) &&
+ TII->getEquivalentCompactForm(I)) {
+ I = replaceWithCompactBranch(MBB, I, I->getDebugLoc());
+ continue;
}
+
// Bundle the NOP to the instruction with the delay slot.
BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
MIBundleBuilder(MBB, I, std::next(I, 2));
@@ -696,7 +692,7 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
bool InMicroMipsMode = STI.inMicroMipsMode();
const MipsInstrInfo *TII = STI.getInstrInfo();
unsigned Opcode = (*Slot).getOpcode();
- if (InMicroMipsMode && TII->GetInstSizeInBytes(&(*CurrI)) == 2 &&
+ if (InMicroMipsMode && TII->GetInstSizeInBytes(*CurrI) == 2 &&
(Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
Opcode == Mips::PseudoReturn))
continue;
@@ -819,7 +815,7 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
SmallVector<MachineOperand, 2> Cond;
MipsInstrInfo::BranchType R =
- TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+ TII->analyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
return std::make_pair(R, nullptr);
diff --git a/lib/Target/Mips/MipsEVAInstrInfo.td b/lib/Target/Mips/MipsEVAInstrInfo.td
index 36c9694cbadd..26df263d228b 100644
--- a/lib/Target/Mips/MipsEVAInstrInfo.td
+++ b/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -51,7 +51,8 @@ class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>;
//===----------------------------------------------------------------------===//
// Memory Load/Store EVA descriptions
-class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs GPROpnd:$rt);
dag InOperandList = (ins mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
@@ -59,30 +60,34 @@ class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
string DecoderMethod = "DecodeMemEVA";
bit canFoldAsLoad = 1;
bit mayLoad = 1;
+ InstrItinClass Itinerary = itin;
}
-class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd>;
-class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd>;
-class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd>;
-class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd>;
-class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd>;
+class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd, II_LBE>;
+class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd, II_LBUE>;
+class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd, II_LHE>;
+class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd, II_LHUE>;
+class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
- SDPatternOperator OpNode = null_frag> {
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
bit mayStore = 1;
+ InstrItinClass Itinerary = itin;
}
-class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd>;
-class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd>;
-class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd>;
+class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd, null_frag, II_SBE>;
+class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd, null_frag, II_SHE>;
+class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd, null_frag, II_SWE>;
// Load/Store Left/Right EVA descriptions
-class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs GPROpnd:$rt);
dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
@@ -90,35 +95,41 @@ class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
string DecoderMethod = "DecodeMemEVA";
string Constraints = "$src = $rt";
bit canFoldAsLoad = 1;
+ InstrItinClass Itinerary = itin;
}
-class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd>;
-class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd>;
+class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd, II_LWLE>;
+class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd, II_LWRE>;
-class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
}
-class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd>;
-class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd>;
+class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd, II_SWLE>;
+class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd, II_SWRE>;
// Load-linked EVA, Store-conditional EVA descriptions
-class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs GPROpnd:$rt);
dag InOperandList = (ins mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
bit mayLoad = 1;
string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
}
-class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd>;
+class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
-class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs GPROpnd:$dst);
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
@@ -126,30 +137,34 @@ class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
bit mayStore = 1;
string Constraints = "$rt = $dst";
string DecoderMethod = "DecodeMemEVA";
+ InstrItinClass Itinerary = itin;
}
-class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd>;
+class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
-class TLB_DESC_BASE<string instr_asm> {
+class TLB_DESC_BASE<string instr_asm, InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins);
string AsmString = instr_asm;
list<dag> Pattern = [];
+ InstrItinClass Itinerary = itin;
}
-class TLBINV_DESC : TLB_DESC_BASE<"tlbinv">;
-class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf">;
+class TLBINV_DESC : TLB_DESC_BASE<"tlbinv", II_TLBINV>;
+class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf", II_TLBINVF>;
-class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd> {
+class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+ InstrItinClass Itinerary = itin;
}
-class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem>;
-class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem>;
+class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem_simm9, II_CACHEE>;
+class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem_simm9, II_PREFE>;
//===----------------------------------------------------------------------===//
//
@@ -185,8 +200,10 @@ def LLE : LLE_ENC, LLE_DESC, INSN_EVA;
def SCE : SCE_ENC, SCE_DESC, INSN_EVA;
}
-def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
-def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
+ def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
+}
def CACHEE : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index e9eaf810637a..19c201d26b24 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1,5 +1,18 @@
-//===-- MipsastISel.cpp - Mips FastISel implementation
-//---------------------===//
+//===-- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the MIPS-specific support for the FastISel class.
+/// Some of the target-specific code is generated by tablegen in the file
+/// MipsGenFastISel.inc, which is #included here.
+///
+//===----------------------------------------------------------------------===//
#include "MipsCCState.h"
#include "MipsInstrInfo.h"
@@ -192,9 +205,10 @@ public:
TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) {
MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
Context = &funcInfo.Fn->getContext();
- bool ISASupported = !Subtarget->hasMips32r6() && Subtarget->hasMips32();
+ bool ISASupported = !Subtarget->hasMips32r6() &&
+ !Subtarget->inMicroMipsMode() && Subtarget->hasMips32();
TargetSupported =
- ISASupported && (TM.getRelocationModel() == Reloc::PIC_) &&
+ ISASupported && TM.isPositionIndependent() &&
(static_cast<const MipsTargetMachine &>(TM).getABI().IsO32());
UnsupportedFPMode = Subtarget->isFP64bit();
}
@@ -691,11 +705,10 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1);
emitInst(Opc).addReg(LeftReg).addReg(RightReg).addReg(
Mips::FCC0, RegState::ImplicitDefine);
- MachineInstrBuilder MI = emitInst(CondMovOpc, ResultReg)
- .addReg(RegWithOne)
- .addReg(Mips::FCC0)
- .addReg(RegWithZero, RegState::Implicit);
- MI->tieOperands(0, 3);
+ emitInst(CondMovOpc, ResultReg)
+ .addReg(RegWithOne)
+ .addReg(Mips::FCC0)
+ .addReg(RegWithZero);
break;
}
}
@@ -802,7 +815,7 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
unsigned Offset = Addr.getOffset();
MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
MFI.getObjectSize(FI), Align);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg)
@@ -943,7 +956,7 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
return false;
unsigned SrcReg =
- getRegForValue(Src); // his must be a 32 bit floating point register class
+ getRegForValue(Src); // this must be a 32bit floating point register class
// maybe we should handle this differently
if (!SrcReg)
return false;
@@ -1180,7 +1193,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
// for now (will return false). We need to determine the right alignment
// based on the normal alignment for the underlying machine type.
//
- unsigned ArgSize = RoundUpToAlignment(ArgVT.getSizeInBits(), 4);
+ unsigned ArgSize = alignTo(ArgVT.getSizeInBits(), 4);
unsigned BEAlign = 0;
if (ArgSize < 8 && !Subtarget->isLittle())
@@ -1207,7 +1220,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
unsigned NumBytes) {
CallingConv::ID CC = CLI.CallConv;
- emitInst(Mips::ADJCALLSTACKUP).addImm(16);
+ emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0);
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index a74c8abd2e2d..fe6f332f2bdf 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -13,7 +13,6 @@
#include "MipsFrameLowering.h"
#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
@@ -24,7 +23,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -122,7 +120,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
// Conservatively assume all callee-saved registers will be saved.
for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
- Offset = RoundUpToAlignment(Offset + Size, Size);
+ Offset = alignTo(Offset + Size, Size);
}
unsigned MaxAlign = MFI->getMaxAlignment();
@@ -133,18 +131,18 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
// Iterate over other objects.
for (unsigned I = 0, E = MFI->getObjectIndexEnd(); I != E; ++I)
- Offset = RoundUpToAlignment(Offset + MFI->getObjectSize(I), MaxAlign);
+ Offset = alignTo(Offset + MFI->getObjectSize(I), MaxAlign);
// Call frame.
if (MFI->adjustsStack() && hasReservedCallFrame(MF))
- Offset = RoundUpToAlignment(Offset + MFI->getMaxCallFrameSize(),
- std::max(MaxAlign, getStackAlignment()));
+ Offset = alignTo(Offset + MFI->getMaxCallFrameSize(),
+ std::max(MaxAlign, getStackAlignment()));
- return RoundUpToAlignment(Offset, getStackAlignment());
+ return alignTo(Offset, getStackAlignment());
}
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
-void MipsFrameLowering::
+MachineBasicBlock::iterator MipsFrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
unsigned SP = STI.getABI().IsN64() ? Mips::SP_64 : Mips::SP;
@@ -157,5 +155,5 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
STI.getInstrInfo()->adjustStackPtr(SP, Amount, MBB, I);
}
- MBB.erase(I);
+ return MBB.erase(I);
}
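
The RoundUpToAlignment -> alignTo replacements that recur throughout this patch swap in the renamed round-up helper; the arithmetic itself is unchanged. A minimal stand-alone sketch of that arithmetic (hypothetical helper name, not the llvm/Support implementation):

#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align (Align need not be a power
// of two for this sketch).
static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && "alignment must be non-zero");
  return (Value + Align - 1) / Align * Align;
}

int main() {
  assert(alignToSketch(13, 8) == 16); // stack offset rounded up to 8 bytes
  assert(alignToSketch(16, 8) == 16); // already-aligned values are unchanged
  return 0;
}
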
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 5eabd58e8686..8c4214c4c21d 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -36,7 +36,7 @@ public:
bool isFPCloseToIncomingSP() const override { return false; }
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
diff --git a/lib/Target/Mips/MipsHazardSchedule.cpp b/lib/Target/Mips/MipsHazardSchedule.cpp
new file mode 100644
index 000000000000..10022ba60680
--- /dev/null
+++ b/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -0,0 +1,147 @@
+//===-- MipsHazardSchedule.cpp - Workaround pipeline hazards --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass is used to work around certain pipeline hazards. For now, this covers
+/// compact branch hazards. In the future this pass can be extended to other pipeline
+/// hazards, such as various MIPS1 hazards, processor errata that require
+/// instruction reorganization, etc.
+///
+/// This pass has to run after the delay slot filler as that pass can introduce
+/// pipeline hazards, hence the existing hazard recognizer is not suitable.
+///
+/// Hazards handled: forbidden slots for MIPSR6.
+///
+/// A forbidden slot hazard occurs when a compact branch instruction is executed
+/// and the adjacent instruction in memory is a control transfer instruction such
+/// as a branch or jump, ERET, ERETNC, DERET, WAIT and PAUSE.
+///
+/// For example:
+///
+/// 0x8004 bnec a1,v0,<P+0x18>
+/// 0x8008 beqc a1,a2,<P+0x54>
+///
+/// In such cases, the processor is required to signal a Reserved Instruction
+/// exception.
+///
+/// Here, if the instruction at 0x8004 is executed, the processor will raise an
+/// exception as there is a control transfer instruction at 0x8008.
+///
+/// There are two sources of forbidden slot hazards:
+///
+/// A) A previous pass has created a compact branch directly.
+/// B) Transforming a delay slot branch into a compact branch. This case can be
+/// difficult to process as lookahead for hazards is insufficient, as
+/// backwards delay slot filling can also produce hazards in previously
+/// processed instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-hazard-schedule"
+
+STATISTIC(NumInsertedNops, "Number of nops inserted");
+
+namespace {
+
+typedef MachineBasicBlock::iterator Iter;
+typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+class MipsHazardSchedule : public MachineFunctionPass {
+
+public:
+ MipsHazardSchedule() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "Mips Hazard Schedule"; }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+private:
+ static char ID;
+};
+
+char MipsHazardSchedule::ID = 0;
+} // end of anonymous namespace
+
+/// Returns a pass that clears pipeline hazards.
+FunctionPass *llvm::createMipsHazardSchedule() {
+ return new MipsHazardSchedule();
+}
+
+// Find the next real instruction from the current position.
+static Iter getNextMachineInstr(Iter Position) {
+ Iter I = Position, E = Position->getParent()->end();
+ I = std::find_if_not(I, E, [](const Iter &Insn) { return Insn->isTransient(); });
+ assert(I != E);
+ return I;
+}
+
+bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
+
+ const MipsSubtarget *STI =
+ &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+ // Forbidden slot hazards are only defined for MIPSR6.
+ if (!STI->hasMips32r6() || STI->inMicroMipsMode())
+ return false;
+
+ bool Changed = false;
+ const MipsInstrInfo *TII = STI->getInstrInfo();
+
+ for (MachineFunction::iterator FI = MF.begin(); FI != MF.end(); ++FI) {
+ for (Iter I = FI->begin(); I != FI->end(); ++I) {
+
+ // Forbidden slot hazard handling. Use lookahead over state.
+ if (!TII->HasForbiddenSlot(*I))
+ continue;
+
+ bool InsertNop = false;
+ // Next instruction in the basic block.
+ if (std::next(I) != FI->end() &&
+ !TII->SafeInForbiddenSlot(*getNextMachineInstr(std::next(I)))) {
+ InsertNop = true;
+ } else {
+ // Next instruction in the physical successor basic block.
+ for (auto *Succ : FI->successors()) {
+ if (FI->isLayoutSuccessor(Succ) &&
+ getNextMachineInstr(Succ->begin()) != Succ->end() &&
+ !TII->SafeInForbiddenSlot(*getNextMachineInstr(Succ->begin()))) {
+ InsertNop = true;
+ break;
+ }
+ }
+ }
+
+ if (InsertNop) {
+ Changed = true;
+ MIBundleBuilder(&*I).append(
+ BuildMI(MF, I->getDebugLoc(), TII->get(Mips::NOP)));
+ NumInsertedNops++;
+ }
+ }
+ }
+ return Changed;
+}
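
The pass above works on MachineInstr lists, but its core is a simple lookahead: after each compact branch, find the next real (non-transient) instruction and, if it is itself a control-transfer instruction, pad the forbidden slot with a NOP. Below is a self-contained sketch of that lookahead on a toy instruction model; the Kind/Insn types and function names are invented for illustration and are not the backend's API, and the real pass additionally peeks into the layout-successor block and bundles the NOP with the branch.

#include <cstdio>
#include <vector>

enum class Kind { Ordinary, CompactBranch, ControlTransfer, Nop, Transient };

struct Insn {
  Kind K;
  const char *Text;
};

// Skip transient (e.g. debug) entries to find the next real instruction,
// mirroring getNextMachineInstr() above.
static int nextReal(const std::vector<Insn> &Block, int From) {
  for (int I = From; I < (int)Block.size(); ++I)
    if (Block[I].K != Kind::Transient)
      return I;
  return -1;
}

// Insert a NOP after every compact branch whose next real instruction is a
// control-transfer instruction (the forbidden-slot hazard).
static unsigned fixForbiddenSlots(std::vector<Insn> &Block) {
  unsigned Inserted = 0;
  for (size_t I = 0; I < Block.size(); ++I) {
    if (Block[I].K != Kind::CompactBranch)
      continue;
    int Next = nextReal(Block, (int)I + 1);
    if (Next != -1 && Block[(size_t)Next].K == Kind::ControlTransfer) {
      Block.insert(Block.begin() + I + 1, Insn{Kind::Nop, "nop"});
      ++Inserted;
    }
  }
  return Inserted;
}

int main() {
  // Mirrors the example in the header comment: bnec immediately followed
  // by another compact branch.
  std::vector<Insn> Block = {
      {Kind::CompactBranch, "bnec $a1, $v0, L1"},
      {Kind::ControlTransfer, "beqc $a1, $a2, L2"},
      {Kind::Ordinary, "addu $t0, $t1, $t2"},
  };
  std::printf("inserted %u nop(s)\n", fixForbiddenSlots(Block));
  for (const Insn &I : Block)
    std::printf("  %s\n", I.Text);
  return 0;
}
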
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 06502397b6b8..83763a64ab6a 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -72,12 +72,6 @@ bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
return false;
}
-bool MipsDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
- SDValue &Offset) const {
- llvm_unreachable("Unimplemented function.");
- return false;
-}
-
bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
llvm_unreachable("Unimplemented function.");
@@ -90,7 +84,19 @@ bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
return false;
}
-bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+bool MipsDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
llvm_unreachable("Unimplemented function.");
return false;
@@ -108,8 +114,14 @@ bool MipsDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
return false;
}
-bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Offset, SDValue &Alias) {
+bool MipsDAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
llvm_unreachable("Unimplemented function.");
return false;
}
@@ -182,7 +194,7 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
-SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
+void MipsDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
// Dump information about the Node being selected
@@ -192,21 +204,20 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
// See if subclasses can handle this node.
- std::pair<bool, SDNode*> Ret = selectNode(Node);
-
- if (Ret.first)
- return Ret.second;
+ if (trySelect(Node))
+ return;
switch(Opcode) {
default: break;
// Get target GOT address.
case ISD::GLOBAL_OFFSET_TABLE:
- return getGlobalBaseReg();
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
#ifndef NDEBUG
case ISD::LOAD:
@@ -220,15 +231,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
}
// Select the default instruction
- SDNode *ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
- return ResNode;
+ SelectCode(Node);
}
bool MipsDAGToDAGISel::
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 1426d0fbf516..289832a8064e 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -31,8 +31,8 @@ namespace llvm {
class MipsDAGToDAGISel : public SelectionDAGISel {
public:
- explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
- : SelectionDAGISel(TM), Subtarget(nullptr) {}
+ explicit MipsDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : SelectionDAGISel(TM, OL), Subtarget(nullptr) {}
// Pass Name
const char *getPassName() const override {
@@ -57,11 +57,6 @@ private:
virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
- // Complex Pattern.
- /// (reg + reg).
- virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
- SDValue &Offset) const;
-
/// Fall back on this function if all else fails.
virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
@@ -70,9 +65,15 @@ private:
virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
- virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+ virtual bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ virtual bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
virtual bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
@@ -80,8 +81,8 @@ private:
virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
- virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Offset, SDValue &Alias);
+ virtual bool selectAddr16(SDValue Addr, SDValue &Base, SDValue &Offset);
+ virtual bool selectAddr16SP(SDValue Addr, SDValue &Base, SDValue &Offset);
/// \brief Select constant vector splats.
virtual bool selectVSplat(SDNode *N, APInt &Imm,
@@ -114,9 +115,9 @@ private:
/// starting at bit zero.
virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
- virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
+ virtual bool trySelect(SDNode *Node) = 0;
// getImm - Return a target constant with the specified value.
inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 5680130b91b2..1d62a251cc66 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -341,10 +341,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
}
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -396,7 +392,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
- setInsertFencesForAtomic(true);
if (!Subtarget.hasMips32r2()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
@@ -429,6 +424,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AssertZext);
setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);
@@ -568,7 +564,7 @@ static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
// Creates and returns a CMovFPT/F node.
static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
- SDValue False, SDLoc DL) {
+ SDValue False, const SDLoc &DL) {
ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
@@ -808,6 +804,37 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo);
}
+static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT NarrowerVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ if (N0.getOperand(0).getOpcode() != ISD::AssertZext)
+ return SDValue();
+
+ // fold (AssertZext (trunc (AssertZext x))) -> (trunc (AssertZext x))
+ // if the type asserted by the outermost AssertZext node is narrower than
+ // the type asserted by the innermost one, e.g.:
+ // (AssertZext:i32 (trunc:i32 (AssertZext:i64 X, i32)), i8)
+ // -> (trunc:i32 (AssertZext X, i8))
+ SDValue WiderAssertZext = N0.getOperand(0);
+ EVT WiderVT = cast<VTSDNode>(WiderAssertZext->getOperand(1))->getVT();
+
+ if (NarrowerVT.bitsLT(WiderVT)) {
+ SDValue NewAssertZext = DAG.getNode(
+ ISD::AssertZext, SDLoc(N), WiderAssertZext.getValueType(),
+ WiderAssertZext.getOperand(0), DAG.getValueType(NarrowerVT));
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0),
+ NewAssertZext);
+ }
+
+ return SDValue();
+}
+
SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
SelectionDAG &DAG = DCI.DAG;
@@ -829,6 +856,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performORCombine(N, DAG, DCI, Subtarget);
case ISD::ADD:
return performADDCombine(N, DAG, DCI, Subtarget);
+ case ISD::AssertZext:
+ return performAssertZextCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
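
A toy model of the reasoning behind performAssertZextCombine above: an AssertZext records that all bits above a given width are zero, so when the outer assertion is narrower than the inner one it subsumes it and the AssertZext/trunc/AssertZext chain can collapse to a single narrow assertion plus a truncate. The sketch below tracks only that width bookkeeping with an invented AssertedVal struct; it is not SelectionDAG code.

#include <cassert>
#include <cstdint>

struct AssertedVal {
  uint64_t Bits;       // the underlying value
  unsigned KnownWidth; // bits at and above KnownWidth are asserted to be zero
};

// Applying AssertZext with a given width keeps the stronger (narrower) of
// the two assertions -- exactly the information the combine preserves while
// dropping the intermediate node.
static AssertedVal assertZext(AssertedVal V, unsigned Width) {
  return {V.Bits, Width < V.KnownWidth ? Width : V.KnownWidth};
}

int main() {
  AssertedVal X = {0xFFull, 64};
  AssertedVal Inner = assertZext(X, 32);    // (AssertZext:i64 X, i32)
  AssertedVal Outer = assertZext(Inner, 8); // outer (AssertZext ..., i8)
  // Same guarantee as asserting i8 on X directly, which is what the
  // rewritten DAG expresses.
  assert(Outer.KnownWidth == assertZext(X, 8).KnownWidth);
  return 0;
}
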
@@ -906,20 +935,22 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
return VReg;
}
-static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI,
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI,
MachineBasicBlock &MBB,
const TargetInstrInfo &TII,
- bool Is64Bit) {
+ bool Is64Bit, bool IsMicroMips) {
if (NoZeroDivCheck)
return &MBB;
// Insert instruction "teq $divisor_reg, $zero, 7".
MachineBasicBlock::iterator I(MI);
MachineInstrBuilder MIB;
- MachineOperand &Divisor = MI->getOperand(2);
- MIB = BuildMI(MBB, std::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
- .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
- .addReg(Mips::ZERO).addImm(7);
+ MachineOperand &Divisor = MI.getOperand(2);
+ MIB = BuildMI(MBB, std::next(I), MI.getDebugLoc(),
+ TII.get(IsMicroMips ? Mips::TEQ_MM : Mips::TEQ))
+ .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+ .addReg(Mips::ZERO)
+ .addImm(7);
// Use the 32-bit sub-register if this is a 64-bit division.
if (Is64Bit)
@@ -935,9 +966,9 @@ static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI,
}
MachineBasicBlock *
-MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
case Mips::ATOMIC_LOAD_ADD_I8:
@@ -1017,15 +1048,31 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case Mips::DIVU:
case Mips::MOD:
case Mips::MODU:
- return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false);
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false,
+ false);
+ case Mips::SDIV_MM_Pseudo:
+ case Mips::UDIV_MM_Pseudo:
+ case Mips::SDIV_MM:
+ case Mips::UDIV_MM:
+ case Mips::DIV_MMR6:
+ case Mips::DIVU_MMR6:
+ case Mips::MOD_MMR6:
+ case Mips::MODU_MMR6:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false, true);
case Mips::PseudoDSDIV:
case Mips::PseudoDUDIV:
case Mips::DDIV:
case Mips::DDIVU:
case Mips::DMOD:
case Mips::DMODU:
- return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true);
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, false);
+ case Mips::DDIV_MM64R6:
+ case Mips::DDIVU_MM64R6:
+ case Mips::DMOD_MM64R6:
+ case Mips::DMODU_MM64R6:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, true);
case Mips::SEL_D:
+ case Mips::SEL_D_MMR6:
return emitSEL_D(MI, BB);
case Mips::PseudoSELECT_I:
@@ -1051,17 +1098,19 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode,
- bool Nand) const {
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size,
+ unsigned BinOpcode,
+ bool Nand) const {
assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ DebugLoc DL = MI.getDebugLoc();
unsigned LL, SC, AND, NOR, ZERO, BEQ;
if (Size == 4) {
@@ -1069,9 +1118,14 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
LL = Mips::LL_MM;
SC = Mips::SC_MM;
} else {
- LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL;
- SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC;
+ LL = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
+
AND = Mips::AND;
NOR = Mips::NOR;
ZERO = Mips::ZERO;
@@ -1085,9 +1139,9 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
BEQ = Mips::BEQ64;
}
- unsigned OldVal = MI->getOperand(0).getReg();
- unsigned Ptr = MI->getOperand(1).getReg();
- unsigned Incr = MI->getOperand(2).getReg();
+ unsigned OldVal = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned Incr = MI.getOperand(2).getReg();
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned AndRes = RegInfo.createVirtualRegister(RC);
@@ -1134,16 +1188,16 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
- MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
unsigned SrcReg) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
if (Subtarget.hasMips32r2() && Size == 1) {
BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
@@ -1170,7 +1224,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
}
MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
- MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
bool Nand) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicBinaryPartial.");
@@ -1178,21 +1232,24 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ const TargetRegisterClass *RCp =
+ getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Ptr = MI->getOperand(1).getReg();
- unsigned Incr = MI->getOperand(2).getReg();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned Incr = MI.getOperand(2).getReg();
- unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
unsigned NewVal = RegInfo.createVirtualRegister(RC);
unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned Incr2 = RegInfo.createVirtualRegister(RC);
- unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned AndRes = RegInfo.createVirtualRegister(RC);
@@ -1203,6 +1260,17 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned LL, SC;
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -1234,11 +1302,12 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
// sll incr2,incr,shiftamt
int64_t MaskImm = (Size == 1) ? 255 : 65535;
- BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
- .addReg(Mips::ZERO).addImm(-4);
- BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+ BuildMI(BB, DL, TII->get(ABI.GetPtrAddiuOp()), MaskLSB2)
+ .addReg(ABI.GetNullPtr()).addImm(-4);
+ BuildMI(BB, DL, TII->get(ABI.GetPtrAndOp()), AlignedAddr)
.addReg(Ptr).addReg(MaskLSB2);
- BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+ BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+ .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
if (Subtarget.isLittle()) {
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
} else {
@@ -1274,7 +1343,6 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
// beq success,$0,loopMBB
BB = loopMBB;
- unsigned LL = isMicroMips ? Mips::LL_MM : Mips::LL;
BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
if (Nand) {
// and andres, oldval, incr2
@@ -1298,7 +1366,6 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal0).addReg(NewVal);
- unsigned SC = isMicroMips ? Mips::SC_MM : Mips::SC;
BuildMI(BB, DL, TII->get(SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
@@ -1316,31 +1383,37 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
.addReg(MaskedOldVal1).addReg(ShiftAmt);
BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const {
assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ DebugLoc DL = MI.getDebugLoc();
unsigned LL, SC, ZERO, BNE, BEQ;
- if (Size == 4) {
- if (isMicroMips) {
- LL = Mips::LL_MM;
- SC = Mips::SC_MM;
- } else {
- LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL;
- SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC;
- }
+ if (Size == 4) {
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6()
+ ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
+
ZERO = Mips::ZERO;
BNE = Mips::BNE;
BEQ = Mips::BEQ;
@@ -1352,10 +1425,10 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
BEQ = Mips::BEQ64;
}
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Ptr = MI->getOperand(1).getReg();
- unsigned OldVal = MI->getOperand(2).getReg();
- unsigned NewVal = MI->getOperand(3).getReg();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned OldVal = MI.getOperand(2).getReg();
+ unsigned NewVal = MI.getOperand(3).getReg();
unsigned Success = RegInfo.createVirtualRegister(RC);
@@ -1400,30 +1473,31 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
BuildMI(BB, DL, TII->get(BEQ))
.addReg(Success).addReg(ZERO).addMBB(loop1MBB);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicCmpSwapPartial.");
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+ const bool ArePtrs64bit = ABI.ArePtrs64bit();
+ const TargetRegisterClass *RCp =
+ getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Ptr = MI->getOperand(1).getReg();
- unsigned CmpVal = MI->getOperand(2).getReg();
- unsigned NewVal = MI->getOperand(3).getReg();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Ptr = MI.getOperand(1).getReg();
+ unsigned CmpVal = MI.getOperand(2).getReg();
+ unsigned NewVal = MI.getOperand(3).getReg();
- unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
@@ -1431,7 +1505,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
@@ -1440,6 +1514,17 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
+ unsigned LL, SC;
+
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+ : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+ SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+ : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+ }
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1470,6 +1555,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// addiu masklsb2,$0,-4 # 0xfffffffc
// and alignedaddr,ptr,masklsb2
// andi ptrlsb2,ptr,3
+ // xori ptrlsb2,ptrlsb2,3 # Only for BE
// sll shiftamt,ptrlsb2,3
// ori maskupper,$0,255 # 0xff
// sll mask,maskupper,shiftamt
@@ -1479,11 +1565,12 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// andi maskednewval,newval,255
// sll shiftednewval,maskednewval,shiftamt
int64_t MaskImm = (Size == 1) ? 255 : 65535;
- BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
- .addReg(Mips::ZERO).addImm(-4);
- BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+ BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::DADDiu : Mips::ADDiu), MaskLSB2)
+ .addReg(ABI.GetNullPtr()).addImm(-4);
+ BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::AND64 : Mips::AND), AlignedAddr)
.addReg(Ptr).addReg(MaskLSB2);
- BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+ BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+ .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
if (Subtarget.isLittle()) {
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
} else {
@@ -1511,7 +1598,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// and maskedoldval0,oldval,mask
// bne maskedoldval0,shiftedcmpval,sinkMBB
BB = loop1MBB;
- unsigned LL = isMicroMips ? Mips::LL_MM : Mips::LL;
BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
.addReg(OldVal).addReg(Mask);
@@ -1528,7 +1614,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal1).addReg(ShiftedNewVal);
- unsigned SC = isMicroMips ? Mips::SC_MM : Mips::SC;
BuildMI(BB, DL, TII->get(SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
@@ -1543,21 +1628,21 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
.addReg(MaskedOldVal0).addReg(ShiftAmt);
BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock::iterator II(MI);
- unsigned Fc = MI->getOperand(1).getReg();
+ unsigned Fc = MI.getOperand(1).getReg();
const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
@@ -1569,7 +1654,7 @@ MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
// We don't erase the original instruction, we just replace the condition
// register with the 64-bit super-register.
- MI->getOperand(1).setReg(Fc2);
+ MI.getOperand(1).setReg(Fc2);
return BB;
}
@@ -1592,13 +1677,12 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table);
EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
- Addr =
- DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
- MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
- MemVT, false, false, false, 0);
+ Addr = DAG.getExtLoad(
+ ISD::SEXTLOAD, DL, PTy, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
Chain = Addr.getValue(1);
- if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) {
+ if (isPositionIndependent() || ABI.IsN64()) {
// For PIC, the sequence is:
// BRIND(load(Jumptable + index) + RelocBase)
// RelocBase can be JumpTable, GOT or some sort of global base.
@@ -1667,7 +1751,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64()) {
+ if (!isPositionIndependent() && !ABI.IsN64()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1679,7 +1763,18 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
}
- if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
+ // Every other architecture would use shouldAssumeDSOLocal in here, but
+ // mips is special.
+ // * In PIC code mips requires got loads even for local statics!
+ // * To save on got entries, for local statics the got entry contains the
+ // page and an additional add instruction takes care of the low bits.
+ // * It is legal to access a hidden symbol with a non-hidden undefined,
+ // so one cannot guarantee that all accesses to a hidden symbol will know
+ // it is hidden.
+ // * Mips linkers don't support creating a page and a full got entry for
+ // the same symbol.
+ // * Given all that, we have to use a full got entry for hidden symbols :-(
+ if (GV->hasLocalLinkage())
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
if (LargeGOT)
@@ -1690,7 +1785,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
return getAddrGlobal(
N, SDLoc(N), Ty, DAG,
- (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+ (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT,
DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
@@ -1699,7 +1794,7 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64())
+ if (!isPositionIndependent() && !ABI.IsN64())
return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
@@ -1743,7 +1838,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args), 0);
+ .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ret = CallResult.first;
@@ -1768,9 +1863,8 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
MipsII::MO_GOTTPREL);
TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
TGA);
- Offset = DAG.getLoad(PtrVT, DL,
- DAG.getEntryNode(), TGA, MachinePointerInfo(),
- false, false, false, 0);
+ Offset =
+ DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TGA, MachinePointerInfo());
} else {
// Local Exec TLS Model
assert(model == TLSModel::LocalExec);
@@ -1793,7 +1887,7 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64())
+ if (!isPositionIndependent() && !ABI.IsN64())
return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
@@ -1805,7 +1899,7 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !ABI.IsN64()) {
+ if (!isPositionIndependent() && !ABI.IsN64()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1833,7 +1927,7 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
// memory location argument.
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -1846,9 +1940,8 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Node);
unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
- SDValue VAListLoad =
- DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain, VAListPtr,
- MachinePointerInfo(SV), false, false, false, 0);
+ SDValue VAListLoad = DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain,
+ VAListPtr, MachinePointerInfo(SV));
SDValue VAList = VAListLoad;
// Re-align the pointer if necessary.
@@ -1873,13 +1966,13 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
auto &TD = DAG.getDataLayout();
unsigned ArgSizeInBytes =
TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
- SDValue Tmp3 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
- DAG.getConstant(RoundUpToAlignment(ArgSizeInBytes,
- ArgSlotSizeInBytes),
- DL, VAList.getValueType()));
+ SDValue Tmp3 =
+ DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(alignTo(ArgSizeInBytes, ArgSlotSizeInBytes),
+ DL, VAList.getValueType()));
// Store the incremented VAList to the legalized pointer
Chain = DAG.getStore(VAListLoad.getValue(1), DL, Tmp3, VAListPtr,
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
// In big-endian mode we must adjust the pointer when the load size is smaller
// than the argument slot size. We must also reduce the known alignment to
@@ -1892,8 +1985,7 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(Adjustment, DL));
}
// Load the actual argument out of the pointer VAList
- return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo(), false, false,
- false, 0);
+ return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo());
}
static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
@@ -2283,10 +2375,9 @@ static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits());
SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
Val.getOperand(0));
-
return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
- SD->getPointerInfo(), SD->isVolatile(),
- SD->isNonTemporal(), SD->getAlignment());
+ SD->getPointerInfo(), SD->getAlignment(),
+ SD->getMemOperand()->getFlags());
}
SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -2472,23 +2563,22 @@ static unsigned getNextIntArgReg(unsigned Reg) {
return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
}
-SDValue
-MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
- SDValue Chain, SDValue Arg, SDLoc DL,
- bool IsTailCall, SelectionDAG &DAG) const {
+SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+ SDValue Chain, SDValue Arg,
+ const SDLoc &DL, bool IsTailCall,
+ SelectionDAG &DAG) const {
if (!IsTailCall) {
SDValue PtrOff =
DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
DAG.getIntPtrConstant(Offset, DL));
- return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false,
- false, 0);
+ return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo());
}
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
- /*isVolatile=*/ true, false, 0);
+ /* Alignment = */ 0, MachineMemOperand::MOVolatile);
}
void MipsTargetLowering::
@@ -2571,7 +2661,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
- bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ bool IsPIC = isPositionIndependent();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2604,7 +2694,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// ByValChain is the output chain of the last Memcpy node created for copying
// byval arguments to the stack.
unsigned StackAlignment = TFL->getStackAlignment();
- NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
+ NextStackOffset = alignTo(NextStackOffset, StackAlignment);
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
if (!IsTailCall)
@@ -2614,7 +2704,6 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
getPointerTy(DAG.getDataLayout()));
- // With EABI is it possible to have 16 args on registers.
std::deque< std::pair<unsigned, SDValue> > RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -2802,8 +2891,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// appropriate copies out of appropriate physical registers.
SDValue MipsTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
TargetLowering::CallLoweringInfo &CLI) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -2864,7 +2953,8 @@ SDValue MipsTargetLowering::LowerCallResult(
}
static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
- EVT ArgVT, SDLoc DL, SelectionDAG &DAG) {
+ EVT ArgVT, const SDLoc &DL,
+ SelectionDAG &DAG) {
MVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
@@ -2922,14 +3012,10 @@ static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
//===----------------------------------------------------------------------===//
/// LowerFormalArguments - transform physical registers into virtual registers
/// and generate load operations for arguments places on the stack.
-SDValue
-MipsTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue MipsTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -3037,8 +3123,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue = DAG.getLoad(
LocVT, DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
OutChains.push_back(ArgValue.getValue(1));
ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
@@ -3102,7 +3187,8 @@ MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const
SDValue
MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -3117,7 +3203,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -3625,10 +3711,11 @@ bool MipsTargetLowering::useSoftFloat() const {
}
void MipsTargetLowering::copyByValRegs(
- SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains, SelectionDAG &DAG,
- const ISD::ArgFlagsTy &Flags, SmallVectorImpl<SDValue> &InVals,
- const Argument *FuncArg, unsigned FirstReg, unsigned LastReg,
- const CCValAssign &VA, MipsCCState &State) const {
+ SDValue Chain, const SDLoc &DL, std::vector<SDValue> &OutChains,
+ SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+ SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+ unsigned FirstReg, unsigned LastReg, const CCValAssign &VA,
+ MipsCCState &State) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned GPRSizeInBytes = Subtarget.getGPRSizeInBytes();
@@ -3665,15 +3752,14 @@ void MipsTargetLowering::copyByValRegs(
SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
DAG.getConstant(Offset, DL, PtrTy));
SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
- StorePtr, MachinePointerInfo(FuncArg, Offset),
- false, false, 0);
+ StorePtr, MachinePointerInfo(FuncArg, Offset));
OutChains.push_back(Store);
}
}
// Copy byVal arg to registers and stack.
void MipsTargetLowering::passByValArg(
- SDValue Chain, SDLoc DL,
+ SDValue Chain, const SDLoc &DL,
std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
@@ -3697,8 +3783,7 @@ void MipsTargetLowering::passByValArg(
SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
DAG.getConstant(OffsetInBytes, DL, PtrTy));
SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
- MachinePointerInfo(), false, false, false,
- Alignment);
+ MachinePointerInfo(), Alignment);
MemOpChains.push_back(LoadVal.getValue(1));
unsigned ArgReg = ArgRegs[FirstReg + I];
RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
@@ -3725,8 +3810,7 @@ void MipsTargetLowering::passByValArg(
PtrTy));
SDValue LoadVal = DAG.getExtLoad(
ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
- MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, false,
- Alignment);
+ MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
MemOpChains.push_back(LoadVal.getValue(1));
// Shift the loaded value.
@@ -3771,7 +3855,7 @@ void MipsTargetLowering::passByValArg(
}
void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
- SDValue Chain, SDLoc DL,
+ SDValue Chain, const SDLoc &DL,
SelectionDAG &DAG,
CCState &State) const {
ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
@@ -3787,8 +3871,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
int VaArgOffset;
if (ArgRegs.size() == Idx)
- VaArgOffset =
- RoundUpToAlignment(State.getNextStackOffset(), RegSizeInBytes);
+ VaArgOffset = alignTo(State.getNextStackOffset(), RegSizeInBytes);
else {
VaArgOffset =
(int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
@@ -3810,8 +3893,8 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo());
cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
(Value *)nullptr);
OutChains.push_back(Store);
@@ -3854,7 +3937,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
}
// Mark the registers allocated.
- Size = RoundUpToAlignment(Size, RegSizeInBytes);
+ Size = alignTo(Size, RegSizeInBytes);
for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
Size -= RegSizeInBytes, ++I, ++NumRegs)
State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
@@ -3863,16 +3946,17 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
}
-MachineBasicBlock *
-MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
- bool isFPCmp, unsigned Opc) const {
+MachineBasicBlock *MipsTargetLowering::emitPseudoSELECT(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ bool isFPCmp,
+ unsigned Opc) const {
assert(!(Subtarget.hasMips4() || Subtarget.hasMips32()) &&
"Subtarget already supports SELECT nodes with the use of"
"conditional-move instructions.");
const TargetInstrInfo *TII =
Subtarget.getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -3906,14 +3990,14 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
if (isFPCmp) {
// bc1[tf] cc, sinkMBB
BuildMI(BB, DL, TII->get(Opc))
- .addReg(MI->getOperand(1).getReg())
- .addMBB(sinkMBB);
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
} else {
// bne rs, $0, sinkMBB
BuildMI(BB, DL, TII->get(Opc))
- .addReg(MI->getOperand(1).getReg())
- .addReg(Mips::ZERO)
- .addMBB(sinkMBB);
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(Mips::ZERO)
+ .addMBB(sinkMBB);
}
// copy0MBB:
@@ -3929,12 +4013,13 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), DL,
- TII->get(Mips::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB)
- .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB);
+ BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(copy0MBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
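The diamond control-flow pattern referred to in the comments above is the standard way to lower a select when no conditional-move instruction is available. A minimal standalone C++ sketch of the same shape (thisMBB / copy0MBB / sinkMBB), purely illustrative and not the LLVM lowering itself:

#include <cstdio>

// Standalone model of the diamond emitPseudoSELECT builds:
//   thisMBB:  branch to sinkMBB when the condition holds
//   copy0MBB: fall-through block providing the "false" value
//   sinkMBB:  a PHI merges the value arriving from each predecessor
static int selectViaDiamond(bool Cond, int TrueVal, int FalseVal) {
  int Result;
  if (Cond)            // thisMBB: bc1t/bne ... sinkMBB
    Result = TrueVal;  // value reaching sinkMBB from thisMBB
  else                 // copy0MBB: fall-through path
    Result = FalseVal; // value reaching sinkMBB from copy0MBB
  return Result;       // sinkMBB: PHI(TrueVal, thisMBB), (FalseVal, copy0MBB)
}

int main() {
  std::printf("%d %d\n", selectViaDiamond(true, 1, 2),
              selectViaDiamond(false, 1, 2)); // prints "1 2"
  return 0;
}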
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 0dc683e3df27..182dc9070fc8 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -238,6 +238,10 @@ namespace llvm {
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::SIGN_EXTEND;
+ }
+
void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
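Returning ISD::SIGN_EXTEND from getExtendForAtomicOps above reflects the MIPS64 convention that 32-bit values, including the results of i32 LL/SC sequences, are kept sign-extended in 64-bit registers. A trivial standalone sketch of that convention, illustrative only and not target code:

#include <cstdint>
#include <cstdio>

int main() {
  // A 32-bit atomic result with the sign bit set...
  uint32_t Raw = 0x80000000u;
  // ...is held sign-extended in a 64-bit MIPS register.
  int64_t InReg = static_cast<int32_t>(Raw);
  std::printf("0x%016llx\n", (unsigned long long)InReg); // 0xffffffff80000000
  return 0;
}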
@@ -262,7 +266,7 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
void HandleByVal(CCState *, unsigned &, unsigned) const override;
@@ -300,15 +304,14 @@ namespace llvm {
//
// (add (load (wrapper $gp, %got(sym))), %lo(sym))
template <class NodeTy>
- SDValue getAddrLocal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
+ SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
bool IsN32OrN64) const {
unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, GOTFlag));
SDValue Load =
DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
getTargetNode(N, Ty, DAG, LoFlag));
@@ -320,12 +323,12 @@ namespace llvm {
//
// (load (wrapper $gp, %got(sym)))
template <class NodeTy>
- SDValue getAddrGlobal(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
+ SDValue getAddrGlobal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
unsigned Flag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, Flag));
- return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
+ return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo);
}
// This method creates the following nodes, which are necessary for
@@ -333,7 +336,7 @@ namespace llvm {
//
// (load (wrapper (add %hi(sym), $gp), %lo(sym)))
template <class NodeTy>
- SDValue getAddrGlobalLargeGOT(NodeTy *N, SDLoc DL, EVT Ty,
+ SDValue getAddrGlobalLargeGOT(NodeTy *N, const SDLoc &DL, EVT Ty,
SelectionDAG &DAG, unsigned HiFlag,
unsigned LoFlag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
@@ -342,8 +345,7 @@ namespace llvm {
Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
getTargetNode(N, Ty, DAG, LoFlag));
- return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo, false, false, false,
- 0);
+ return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo);
}
// This method creates the following nodes, which are necessary for
@@ -351,7 +353,7 @@ namespace llvm {
//
// (add %hi(sym), %lo(sym))
template <class NodeTy>
- SDValue getAddrNonPIC(NodeTy *N, SDLoc DL, EVT Ty,
+ SDValue getAddrNonPIC(NodeTy *N, const SDLoc &DL, EVT Ty,
SelectionDAG &DAG) const {
SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
@@ -365,7 +367,8 @@ namespace llvm {
//
// (add $gp, %gp_rel(sym))
template <class NodeTy>
- SDValue getAddrGPRel(NodeTy *N, SDLoc DL, EVT Ty, SelectionDAG &DAG) const {
+ SDValue getAddrGPRel(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG) const {
assert(Ty == MVT::i32);
SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
return DAG.getNode(ISD::ADD, DL, Ty,
@@ -417,8 +420,9 @@ namespace llvm {
// Lower Operand helpers
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
TargetLowering::CallLoweringInfo &CLI) const;
// Lower Operand specifics
@@ -455,15 +459,16 @@ namespace llvm {
/// copyByValArg - Copy argument registers which were used to pass a byval
/// argument to the stack. Create a stack frame object for the byval
/// argument.
- void copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
- SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+ void copyByValRegs(SDValue Chain, const SDLoc &DL,
+ std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+ const ISD::ArgFlagsTy &Flags,
SmallVectorImpl<SDValue> &InVals,
const Argument *FuncArg, unsigned FirstReg,
unsigned LastReg, const CCValAssign &VA,
MipsCCState &State) const;
/// passByValArg - Pass a byval argument in registers or on stack.
- void passByValArg(SDValue Chain, SDLoc DL,
+ void passByValArg(SDValue Chain, const SDLoc &DL,
std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
@@ -475,17 +480,17 @@ namespace llvm {
/// to the stack. Also create a stack frame object for the first variable
/// argument.
void writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain,
- SDLoc DL, SelectionDAG &DAG, CCState &State) const;
+ const SDLoc &DL, SelectionDAG &DAG,
+ CCState &State) const;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
- SDValue Arg, SDLoc DL, bool IsTailCall,
+ SDValue Arg, const SDLoc &DL, bool IsTailCall,
SelectionDAG &DAG) const;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -496,14 +501,13 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ const SDLoc &dl, SelectionDAG &DAG) const override;
- SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, SDLoc DL,
- SelectionDAG &DAG) const;
+ SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+ const SDLoc &DL, SelectionDAG &DAG) const;
bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
@@ -561,25 +565,33 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
+
/// Emit a sign-extension using sll/sra, seb, or seh appropriately.
- MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr *MI,
+ MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size, unsigned DstReg,
unsigned SrcReg) const;
- MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode, bool Nand = false) const;
- MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI,
- MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
- bool Nand = false) const;
- MachineBasicBlock *emitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB, unsigned Size) const;
- MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
- MachineBasicBlock *BB, unsigned Size) const;
- MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const;
- MachineBasicBlock *emitPseudoSELECT(MachineInstr *MI,
- MachineBasicBlock *BB, bool isFPCmp,
- unsigned Opc) const;
+ MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Size, unsigned BinOpcode,
+ bool Nand = false) const;
+ MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size,
+ unsigned BinOpcode,
+ bool Nand = false) const;
+ MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const;
+ MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const;
+ MachineBasicBlock *emitSEL_D(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitPseudoSELECT(MachineInstr &MI, MachineBasicBlock *BB,
+ bool isFPCmp, unsigned Opc) const;
};
/// Create MipsTargetLowering objects.
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 377260f89d10..87b02bdfdfdb 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -160,18 +160,18 @@ class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
let Constraints = "$fs = $fs_in";
}
-class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
- SDPatternOperator OpNode= null_frag> :
- InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs RC:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr>,
HARDFLOAT {
let DecoderMethod = "DecodeFMem";
let mayLoad = 1;
}
-class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
- SDPatternOperator OpNode= null_frag> :
- InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class SW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+ InstSE<(outs), (ins RC:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr>, HARDFLOAT {
let DecoderMethod = "DecodeFMem";
let mayStore = 1;
@@ -367,22 +367,28 @@ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
bitconvert>, MFC1_FM<0>;
def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
bitconvert>, MFC1_FM<4>;
-def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
- MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
-def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
- MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
- let DecoderNamespace = "Mips64";
-}
-def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
-def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
- let DecoderNamespace = "Mips64";
-}
-def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
- bitconvert>, MFC1_FM<1>, ISA_MIPS3;
-def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
- bitconvert>, MFC1_FM<5>, ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+ MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
+ def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+ MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
+ let DecoderNamespace = "Mips64";
+ }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MTHC1_D32 : MMRel, StdMMR6Rel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
+ def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
+ let DecoderNamespace = "Mips64";
+ }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
+ bitconvert>, MFC1_FM<5>, ISA_MIPS3;
+ def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
+ bitconvert>, MFC1_FM<1>, ISA_MIPS3;
+}
def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
ABSS_FM<0x6, 16>;
@@ -394,20 +400,30 @@ def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
}
/// Floating Point Memory Instructions
-def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>;
-def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_simm16, II_LWC1, load>,
+ LW_FM<0x31>;
+ def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, mem_simm16, II_SWC1, store>,
+ LW_FM<0x39>;
+}
-let DecoderNamespace = "Mips64" in {
- def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>, ISA_MIPS2,
- FGR_64;
- def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, ISA_MIPS2,
- FGR_64;
+let DecoderNamespace = "Mips64", AdditionalPredicates = [NotInMicroMips] in {
+ def LDC164 : StdMMR6Rel, LW_FT<"ldc1", FGR64Opnd, mem_simm16, II_LDC1, load>,
+ LW_FM<0x35>, ISA_MIPS2, FGR_64 {
+ let BaseOpcode = "LDC164";
+ }
+ def SDC164 : StdMMR6Rel, SW_FT<"sdc1", FGR64Opnd, mem_simm16, II_SDC1, store>,
+ LW_FM<0x3d>, ISA_MIPS2, FGR_64;
}
-def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>,
- ISA_MIPS2, FGR_32;
-def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
- ISA_MIPS2, FGR_32;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def LDC1 : MMRel, StdMMR6Rel, LW_FT<"ldc1", AFGR64Opnd, mem_simm16, II_LDC1,
+ load>, LW_FM<0x35>, ISA_MIPS2, FGR_32 {
+ let BaseOpcode = "LDC132";
+ }
+ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_simm16, II_SDC1, store>,
+ LW_FM<0x3d>, ISA_MIPS2, FGR_32;
+}
// Indexed loads and stores.
// Base register + offset register addressing mode (indicated by "x" in the
@@ -519,10 +535,12 @@ def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>,
BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6;
/// Floating Point Compare
-def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
- ISA_MIPS1_NOT_32R6_64R6;
-def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
- ISA_MIPS1_NOT_32R6_64R6, FGR_32;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
+ ISA_MIPS1_NOT_32R6_64R6;
+ def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+ ISA_MIPS1_NOT_32R6_64R6, FGR_32;
+}
let DecoderNamespace = "Mips64" in
def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
ISA_MIPS1_NOT_32R6_64R6, FGR_64;
@@ -535,7 +553,8 @@ def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
// allocation.
class BuildPairF64Base<RegisterOperand RO> :
PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
- [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+ [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))],
+ II_MTC1>;
def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
@@ -544,13 +563,30 @@ def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
// allocation.
// if n is 0, lower part of src is extracted.
// if n is 1, higher part of src is extracted.
+// This node has associated scheduling information, as the pre-RA scheduler
+// asserts otherwise.
class ExtractElementF64Base<RegisterOperand RO> :
PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
- [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
+ [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))],
+ II_MFC1>;
def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
+def PseudoTRUNC_W_S : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins FGR32Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.s\t$fd, $fs, $rs">;
+
+def PseudoTRUNC_W_D32 : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins AFGR64Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.d\t$fd, $fs, $rs">,
+ FGR_32, HARDFLOAT;
+
+def PseudoTRUNC_W_D : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+ (ins FGR64Opnd:$fs, GPR32Opnd:$rs),
+ "trunc.w.d\t$fd, $fs, $rs">,
+ FGR_64, HARDFLOAT;
+
//===----------------------------------------------------------------------===//
// InstAliases.
//===----------------------------------------------------------------------===//
@@ -606,13 +642,15 @@ def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
(CVT_D64_S FGR32Opnd:$src)>, FGR_64;
// Patterns for loads/stores with a reg+imm operand.
-let AddedComplexity = 40 in {
- def : LoadRegImmPat<LWC1, f32, load>;
- def : StoreRegImmPat<SWC1, f32>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ let AddedComplexity = 40 in {
+ def : LoadRegImmPat<LWC1, f32, load>;
+ def : StoreRegImmPat<SWC1, f32>;
- def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
- def : StoreRegImmPat<SDC164, f64>, FGR_64;
+ def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
+ def : StoreRegImmPat<SDC164, f64>, FGR_64;
- def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
- def : StoreRegImmPat<SDC1, f64>, FGR_32;
+ def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
+ def : StoreRegImmPat<SDC1, f64>, FGR_32;
+ }
}
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 45baf27be518..0bbb49b6b08e 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -94,10 +94,15 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
//
// Attributes specific to Mips instructions...
//
- bits<4> FormBits = Form.Value;
+ bits<4> FormBits = Form.Value;
+ bit isCTI = 0; // Any form of Control Transfer Instruction.
+ // Required for MIPSR6
+ bit hasForbiddenSlot = 0; // Instruction has a forbidden slot.
// TSFlags layout should be kept in sync with MipsInstrInfo.h.
let TSFlags{3-0} = FormBits;
+ let TSFlags{4} = isCTI;
+ let TSFlags{5} = hasForbiddenSlot;
let DecoderNamespace = "Mips";
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index b1d69506c16f..800d834e0ab8 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -13,7 +13,6 @@
#include "MipsInstrInfo.h"
#include "InstPrinter/MipsInstPrinter.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
#include "llvm/ADT/STLExtras.h"
@@ -47,6 +46,7 @@ bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
/// insertNoop - If data hazard condition is found insert the target nop
/// instruction.
+// FIXME: This appears to be dead code.
void MipsInstrInfo::
insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
{
@@ -54,14 +54,15 @@ insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
BuildMI(MBB, MI, DL, get(Mips::NOP));
}
-MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
- unsigned Flag) const {
+MachineMemOperand *
+MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+ MachineMemOperand::Flags Flags) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- Flag, MFI.getObjectSize(FI), Align);
+ Flags, MFI.getObjectSize(FI), Align);
}
//===----------------------------------------------------------------------===//
@@ -83,20 +84,20 @@ void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
Cond.push_back(Inst->getOperand(i));
}
-bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool MipsInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
SmallVector<MachineInstr*, 2> BranchInstrs;
- BranchType BT = AnalyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
+ BranchType BT = analyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
return (BT == BT_None) || (BT == BT_Indirect);
}
-void
-MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- DebugLoc DL, ArrayRef<MachineOperand> Cond) const {
+void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ const DebugLoc &DL,
+ ArrayRef<MachineOperand> Cond) const {
unsigned Opc = Cond[0].getImm();
const MCInstrDesc &MCID = get(Opc);
MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
@@ -107,14 +108,16 @@ MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
else if (Cond[i].isImm())
MIB.addImm(Cond[i].getImm());
else
- assert(true && "Cannot copy operand");
+ assert(false && "Cannot copy operand");
}
MIB.addMBB(TBB);
}
-unsigned MipsInstrInfo::InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+unsigned MipsInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -174,7 +177,7 @@ bool MipsInstrInfo::ReverseBranchCondition(
return false;
}
-MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
+MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond, bool AllowModify,
SmallVectorImpl<MachineInstr *> &BranchInstrs) const {
@@ -185,7 +188,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
while (I != REnd && I->isDebugValue())
++I;
- if (I == REnd || !isUnpredicatedTerminator(&*I)) {
+ if (I == REnd || !isUnpredicatedTerminator(*I)) {
// This block ends with no branches (it just falls through to its succ).
// Leave TBB/FBB null.
TBB = FBB = nullptr;
@@ -209,14 +212,14 @@ MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
// Not an analyzable branch (must be an indirect jump).
- if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
+ if (isUnpredicatedTerminator(*SecondLastInst) && !SecondLastOpc)
return BT_None;
}
// If there is only one terminator instruction, process it.
if (!SecondLastOpc) {
// Unconditional branch.
- if (LastOpc == UncondBrOpc) {
+ if (LastInst->isUnconditionalBranch()) {
TBB = LastInst->getOperand(0).getMBB();
return BT_Uncond;
}
@@ -228,14 +231,14 @@ MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
// If we reached here, there are two branches.
// If there are three terminators, we don't know what sort of block this is.
- if (++I != REnd && isUnpredicatedTerminator(&*I))
+ if (++I != REnd && isUnpredicatedTerminator(*I))
return BT_None;
BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst);
// If second to last instruction is an unconditional branch,
// analyze it and remove the last instruction.
- if (SecondLastOpc == UncondBrOpc) {
+ if (SecondLastInst->isUnconditionalBranch()) {
// Return if the last instruction cannot be removed.
if (!AllowModify)
return BT_None;
@@ -248,7 +251,7 @@ MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
// Conditional branch followed by an unconditional branch.
// The last one must be unconditional.
- if (LastOpc != UncondBrOpc)
+ if (!LastInst->isUnconditionalBranch())
return BT_None;
AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
@@ -257,20 +260,137 @@ MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
return BT_CondUncond;
}
+/// Return the corresponding compact (no delay slot) form of a branch.
+unsigned MipsInstrInfo::getEquivalentCompactForm(
+ const MachineBasicBlock::iterator I) const {
+ unsigned Opcode = I->getOpcode();
+ bool canUseShortMicroMipsCTI = false;
+
+ if (Subtarget.inMicroMipsMode()) {
+ switch (Opcode) {
+ case Mips::BNE:
+ case Mips::BEQ:
+ // microMIPS has NE, EQ branches that do not have delay slots, provided
+ // one of the operands is zero.
+ if (I->getOperand(1).getReg() == Subtarget.getABI().GetZeroReg())
+ canUseShortMicroMipsCTI = true;
+ break;
+ // For microMIPS the PseudoReturn and PseudoIndirectBranch are always
+ // expanded to JR_MM, so they can be replaced with JRC16_MM.
+ case Mips::JR:
+ case Mips::PseudoReturn:
+ case Mips::PseudoIndirectBranch:
+ canUseShortMicroMipsCTI = true;
+ break;
+ }
+ }
+
+ // MIPSR6 forbids both operands being the zero register.
+ if (Subtarget.hasMips32r6() && (I->getNumOperands() > 1) &&
+ (I->getOperand(0).isReg() &&
+ (I->getOperand(0).getReg() == Mips::ZERO ||
+ I->getOperand(0).getReg() == Mips::ZERO_64)) &&
+ (I->getOperand(1).isReg() &&
+ (I->getOperand(1).getReg() == Mips::ZERO ||
+ I->getOperand(1).getReg() == Mips::ZERO_64)))
+ return 0;
+
+ if (Subtarget.hasMips32r6() || canUseShortMicroMipsCTI) {
+ switch (Opcode) {
+ case Mips::B:
+ return Mips::BC;
+ case Mips::BAL:
+ return Mips::BALC;
+ case Mips::BEQ:
+ if (canUseShortMicroMipsCTI)
+ return Mips::BEQZC_MM;
+ else if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BEQC;
+ case Mips::BNE:
+ if (canUseShortMicroMipsCTI)
+ return Mips::BNEZC_MM;
+ else if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BNEC;
+ case Mips::BGE:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BGEC;
+ case Mips::BGEU:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BGEUC;
+ case Mips::BGEZ:
+ return Mips::BGEZC;
+ case Mips::BGTZ:
+ return Mips::BGTZC;
+ case Mips::BLEZ:
+ return Mips::BLEZC;
+ case Mips::BLT:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BLTC;
+ case Mips::BLTU:
+ if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+ return 0;
+ return Mips::BLTUC;
+ case Mips::BLTZ:
+ return Mips::BLTZC;
+ // For MIPSR6, the instruction 'jic' can be used for these cases. Some
+ // tools will accept 'jrc reg' as an alias for 'jic 0, $reg'.
+ case Mips::JR:
+ case Mips::PseudoReturn:
+ case Mips::PseudoIndirectBranch:
+ if (canUseShortMicroMipsCTI)
+ return Mips::JRC16_MM;
+ return Mips::JIC;
+ case Mips::JALRPseudo:
+ return Mips::JIALC;
+ case Mips::JR64:
+ case Mips::PseudoReturn64:
+ case Mips::PseudoIndirectBranch64:
+ return Mips::JIC64;
+ case Mips::JALR64Pseudo:
+ return Mips::JIALC64;
+ default:
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/// Predicate for distinguishing between control transfer instructions and all
+/// other instructions for handling forbidden slots. Inline assembly is
+/// considered unsafe as well.
+bool MipsInstrInfo::SafeInForbiddenSlot(const MachineInstr &MI) const {
+ if (MI.isInlineAsm())
+ return false;
+
+ return (MI.getDesc().TSFlags & MipsII::IsCTI) == 0;
+}
+
+/// Predicate for distinguishing instructions that have forbidden slots.
+bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const {
+ return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0;
+}
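These two predicates consume the isCTI and hasForbiddenSlot bits added to the TSFlags layout in MipsInstrFormats.td earlier in this patch (bits 4 and 5), via MipsII::IsCTI and MipsII::HasForbiddenSlot. A self-contained sketch of the same bit tests; the numeric flag values below are assumptions derived from that TSFlags layout rather than copied from the Mips headers:

#include <cassert>
#include <cstdint>

// Standalone model of the two predicates above. The enum mirrors the
// TSFlags layout in MipsInstrFormats.td (bit 4 = isCTI, bit 5 =
// hasForbiddenSlot); the exact definitions in the Mips headers may differ.
namespace MipsII {
enum { IsCTI = 1u << 4, HasForbiddenSlot = 1u << 5 };
}

struct FakeInstr {
  uint64_t TSFlags;
  bool IsInlineAsm;
};

// Safe to place in a forbidden slot: not inline asm and not a CTI.
static bool safeInForbiddenSlot(const FakeInstr &MI) {
  if (MI.IsInlineAsm)
    return false;
  return (MI.TSFlags & MipsII::IsCTI) == 0;
}

static bool hasForbiddenSlot(const FakeInstr &MI) {
  return (MI.TSFlags & MipsII::HasForbiddenSlot) != 0;
}

int main() {
  FakeInstr Addu{0, false};                                        // plain ALU op
  FakeInstr Beqc{MipsII::IsCTI | MipsII::HasForbiddenSlot, false}; // compact branch
  assert(safeInForbiddenSlot(Addu) && !safeInForbiddenSlot(Beqc));
  assert(hasForbiddenSlot(Beqc) && !hasForbiddenSlot(Addu));
  return 0;
}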
+
/// Return the number of bytes of code the specified instruction may be.
-unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
default:
- return MI->getDesc().getSize();
+ return MI.getDesc().getSize();
case TargetOpcode::INLINEASM: { // Inline Asm: Variable size.
- const MachineFunction *MF = MI->getParent()->getParent();
- const char *AsmStr = MI->getOperand(0).getSymbolName();
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
case Mips::CONSTPOOL_ENTRY:
// If this machine instr is a constant pool entry, its size is recorded as
// operand #2.
- return MI->getOperand(2).getImm();
+ return MI.getOperand(2).getImm();
}
}
@@ -278,10 +398,70 @@ MachineInstrBuilder
MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MachineBasicBlock::iterator I) const {
MachineInstrBuilder MIB;
+
+ // Certain branches have two forms, e.g. beq $1, $zero, dst vs. beqz $1, dst.
+ // Pick the zero form of the branch for readable assembly and for greater
+ // branch distance in non-microMIPS mode.
+ // FIXME: Certain atomic sequences on mips64 generate 32bit references to
+ // Mips::ZERO, which is incorrect. This test should be updated to use
+ // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
+ // are fixed.
+ bool BranchWithZeroOperand =
+ (I->isBranch() && !I->isPseudo() && I->getOperand(1).isReg() &&
+ (I->getOperand(1).getReg() == Mips::ZERO ||
+ I->getOperand(1).getReg() == Mips::ZERO_64));
+
+ if (BranchWithZeroOperand) {
+ switch (NewOpc) {
+ case Mips::BEQC:
+ NewOpc = Mips::BEQZC;
+ break;
+ case Mips::BNEC:
+ NewOpc = Mips::BNEZC;
+ break;
+ case Mips::BGEC:
+ NewOpc = Mips::BGEZC;
+ break;
+ case Mips::BLTC:
+ NewOpc = Mips::BLTZC;
+ break;
+ }
+ }
+
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));
- for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J)
- MIB.addOperand(I->getOperand(J));
+ // For MIPSR6, JI*C requires an immediate 0 as an operand. JIALC(64) also
+ // requires an immediate 0 operand and the removal of its %RA<imp-def>
+ // implicit operand, as copying the implicit operands of the instruction
+ // we're looking at will give us the correct flags.
+ if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
+ NewOpc == Mips::JIALC64) {
+
+ if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
+ MIB->RemoveOperand(0);
+
+ for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+ MIB.addOperand(I->getOperand(J));
+ }
+
+ MIB.addImm(0);
+
+ } else if (BranchWithZeroOperand) {
+ // For MIPSR6 and microMIPS branches with an explicit zero operand, copy
+ // the first operand and everything after the zero.
+ MIB.addOperand(I->getOperand(0));
+
+ for (unsigned J = 2, E = I->getDesc().getNumOperands(); J < E; ++J) {
+ MIB.addOperand(I->getOperand(J));
+ }
+ } else {
+ // All other cases copy all other operands.
+ for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+ MIB.addOperand(I->getOperand(J));
+ }
+ }
+
+ MIB.copyImplicitOps(*I);
MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
return MIB;
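The BranchWithZeroOperand handling above swaps a two-register compact branch for its one-register zero form and drops the explicit zero operand. A standalone sketch of that substitution; the opcode names are placeholders rather than the real Mips:: enumerators:

#include <cassert>

// Standalone sketch of the opcode substitution genInstrWithNewOpc performs
// when the second operand of the original branch is $zero: the two-register
// compact form is replaced with its one-register "Z" form and the zero
// operand is later skipped when copying operands.
enum Opcode { BEQC, BNEC, BGEC, BLTC, BEQZC, BNEZC, BGEZC, BLTZC };

static Opcode pickZeroForm(Opcode NewOpc, bool SecondOperandIsZero) {
  if (!SecondOperandIsZero)
    return NewOpc;
  switch (NewOpc) {
  case BEQC: return BEQZC;
  case BNEC: return BNEZC;
  case BGEC: return BGEZC;
  case BLTC: return BLTZC;
  default:   return NewOpc;
  }
}

int main() {
  // "beq $4, $zero, bb" relaxed to a compact branch becomes "beqzc $4, bb".
  assert(pickZeroForm(BEQC, true) == BEQZC);
  // With two non-zero registers the two-register form is kept.
  assert(pickZeroForm(BNEC, false) == BNEC);
  return 0;
}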
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 08efc3509046..2e55012eec40 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -19,7 +19,6 @@
#define LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
#include "Mips.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/ErrorHandling.h"
@@ -51,7 +50,7 @@ public:
static const MipsInstrInfo *create(MipsSubtarget &STI);
/// Branch Analysis
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
@@ -60,16 +59,25 @@ public:
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ BranchType analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify,
- SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
+ SmallVectorImpl<MachineInstr *> &BranchInstrs) const;
+
+ /// Determine the opcode of a non-delay slot form for a branch if one exists.
+ unsigned getEquivalentCompactForm(const MachineBasicBlock::iterator I) const;
+
+ /// Predicate to determine if an instruction can go in a forbidden slot.
+ bool SafeInForbiddenSlot(const MachineInstr &MI) const;
+
+ /// Predicate to determine if an instruction has a forbidden slot.
+ bool HasForbiddenSlot(const MachineInstr &MI) const;
/// Insert nop instruction when hazard condition is found
void insertNoop(MachineBasicBlock &MBB,
@@ -84,7 +92,7 @@ public:
virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
/// Return the number of bytes of code the specified instruction may be.
- unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -129,7 +137,7 @@ protected:
bool isZeroImm(const MachineOperand &op) const;
MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
- unsigned Flag) const;
+ MachineMemOperand::Flags Flags) const;
private:
virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0;
@@ -138,8 +146,8 @@ private:
MachineBasicBlock *&BB,
SmallVectorImpl<MachineOperand> &Cond) const;
- void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL,
- ArrayRef<MachineOperand> Cond) const;
+ void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ const DebugLoc &DL, ArrayRef<MachineOperand> Cond) const;
};
/// Create MipsInstrInfo objects.
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index ffda491f0c86..296f6e9b08bd 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -179,6 +179,10 @@ def IsGP64bit : Predicate<"Subtarget->isGP64bit()">,
AssemblerPredicate<"FeatureGP64Bit">;
def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">,
AssemblerPredicate<"!FeatureGP64Bit">;
+def IsPTR64bit : Predicate<"Subtarget->isABI_N64()">,
+ AssemblerPredicate<"FeaturePTR64Bit">;
+def IsPTR32bit : Predicate<"!Subtarget->isABI_N64()">,
+ AssemblerPredicate<"!FeaturePTR64Bit">;
def HasMips64 : Predicate<"Subtarget->hasMips64()">,
AssemblerPredicate<"FeatureMips64">;
def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
@@ -197,8 +201,8 @@ def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
-def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
-def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">;
+def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
+def RelocPIC : Predicate<"TM.isPositionIndependent()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
AssemblerPredicate<"!FeatureMips16">;
@@ -225,6 +229,9 @@ def HasMSA : Predicate<"Subtarget->hasMSA()">,
class GPR_32 { list<Predicate> GPRPredicates = [IsGP32bit]; }
class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; }
+class PTR_32 { list<Predicate> PTRPredicates = [IsPTR32bit]; }
+class PTR_64 { list<Predicate> PTRPredicates = [IsPTR64bit]; }
+
//===----------------------------------------------------------------------===//
// Mips ISA/ASE membership and instruction group membership adjectives.
// They are mutually exclusive.
@@ -315,6 +322,10 @@ class ASE_CNMIPS {
list<Predicate> InsnPredicates = [HasCnMips];
}
+class ASE_MIPS64_CNMIPS {
+ list<Predicate> InsnPredicates = [HasMips64, HasCnMips];
+}
+
class ASE_MSA {
list<Predicate> InsnPredicates = [HasMSA];
}
@@ -333,6 +344,10 @@ class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl {
let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6];
}
+class ASE_NOT_DSP {
+ list<Predicate> InsnPredicates = [NotDSP];
+}
+
//===----------------------------------------------------------------------===//
class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
@@ -348,14 +363,17 @@ class IsCommutable {
class IsBranch {
bit isBranch = 1;
+ bit isCTI = 1;
}
class IsReturn {
bit isReturn = 1;
+ bit isCTI = 1;
}
class IsCall {
bit isCall = 1;
+ bit isCTI = 1;
}
class IsTailCall {
@@ -365,6 +383,7 @@ class IsTailCall {
bit isBarrier = 1;
bit hasExtraSrcRegAllocReq = 1;
bit isCodeGenOnly = 1;
+ bit isCTI = 1;
}
class IsAsCheapAsAMove {
@@ -385,13 +404,13 @@ include "MipsInstrFormats.td"
// Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
-class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
- : AsmOperandClass {
- let Name = "ConstantSImm" # Bits;
- let RenderMethod = "addImmOperands";
- let PredicateMethod = "isConstantSImm<" # Bits # ">";
+class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+ int Offset = 0> : AsmOperandClass {
+ let Name = "ConstantSImm" # Bits # "_" # Offset;
+ let RenderMethod = "addConstantSImmOperands<" # Bits # ", " # Offset # ">";
+ let PredicateMethod = "isConstantSImm<" # Bits # ", " # Offset # ">";
let SuperClasses = Supers;
- let DiagnosticType = "SImm" # Bits;
+ let DiagnosticType = "SImm" # Bits # "_" # Offset;
}
class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
@@ -403,49 +422,181 @@ class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
let DiagnosticType = "UImm" # Bits # "_" # Offset;
}
+class ConstantUImmRangeAsmOperandClass<int Bottom, int Top,
+ list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "ConstantUImmRange" # Bottom # "_" # Top;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isConstantUImmRange<" # Bottom # ", " # Top # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImmRange" # Bottom # "_" # Top;
+}
+
+class SImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "SImm" # Bits;
+ let RenderMethod = "addSImmOperands<" # Bits # ">";
+ let PredicateMethod = "isSImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "SImm" # Bits;
+}
+
+class UImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "UImm" # Bits;
+ let RenderMethod = "addUImmOperands<" # Bits # ">";
+ let PredicateMethod = "isUImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImm" # Bits;
+}
+
+// AsmOperandClasses require a strict ordering which is difficult to manage
+// as a hierarchy. Instead, we use a linear ordering and impose an order that
+// is in some places arbitrary.
+//
+// Here are the rules that are in use (a standalone sketch follows this hunk):
+// * Wider immediates are a superset of narrower immediates:
+// uimm4 < uimm5 < uimm6
+// * For the same bit-width, unsigned immediates are a superset of signed
+// immediates:
+// simm4 < uimm4 < simm5 < uimm5
+// * For the same upper-bound, signed immediates are a superset of unsigned
+// immediates:
+// uimm3 < simm4 < uimm4 < simm5
+// * Modified immediates are a superset of ordinary immediates:
+// uimm5 < uimm5_plus1 (1..32) < uimm5_plus32 (32..63) < uimm6
+// The term 'superset' starts to break down here since the uimm5_plus* classes
+// are not true supersets of uimm5 (but they are still subsets of uimm6).
+// * 'Relaxed' immediates are supersets of the corresponding unsigned immediate.
+// uimm16 < uimm16_relaxed
+// * The codeGen pattern type is arbitrarily ordered.
+// uimm5 < uimm5_64, and uimm5 < vsplat_uimm5
+// This is entirely arbitrary. We need an ordering and what we pick is
+// unimportant since only one is possible for a given mnemonic.
+def SImm32RelaxedAsmOperandClass
+ : SImmAsmOperandClass<32, []> {
+ let Name = "SImm32_Relaxed";
+ let PredicateMethod = "isAnyImm<32>";
+ let DiagnosticType = "SImm32_Relaxed";
+}
+def SImm32AsmOperandClass
+ : SImmAsmOperandClass<32, [SImm32RelaxedAsmOperandClass]>;
+def ConstantUImm26AsmOperandClass
+ : ConstantUImmAsmOperandClass<26, [SImm32AsmOperandClass]>;
+def ConstantUImm20AsmOperandClass
+ : ConstantUImmAsmOperandClass<20, [ConstantUImm26AsmOperandClass]>;
+def UImm16RelaxedAsmOperandClass
+ : UImmAsmOperandClass<16, [ConstantUImm20AsmOperandClass]> {
+ let Name = "UImm16_Relaxed";
+ let PredicateMethod = "isAnyImm<16>";
+ let DiagnosticType = "UImm16_Relaxed";
+}
+def UImm16AsmOperandClass
+ : UImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]>;
+def SImm16RelaxedAsmOperandClass
+ : SImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]> {
+ let Name = "SImm16_Relaxed";
+ let PredicateMethod = "isAnyImm<16>";
+ let DiagnosticType = "SImm16_Relaxed";
+}
+def SImm16AsmOperandClass
+ : SImmAsmOperandClass<16, [SImm16RelaxedAsmOperandClass]>;
+def ConstantSImm10Lsl3AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl3";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 3>";
+ let SuperClasses = [SImm16AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl3";
+}
+def ConstantSImm10Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 2>";
+ let SuperClasses = [ConstantSImm10Lsl3AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl2";
+}
+def ConstantSImm11AsmOperandClass
+ : ConstantSImmAsmOperandClass<11, [ConstantSImm10Lsl2AsmOperandClass]>;
+def ConstantSImm10Lsl1AsmOperandClass : AsmOperandClass {
+ let Name = "SImm10Lsl1";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<10, 1>";
+ let SuperClasses = [ConstantSImm11AsmOperandClass];
+ let DiagnosticType = "SImm10_Lsl1";
+}
def ConstantUImm10AsmOperandClass
- : ConstantUImmAsmOperandClass<10, []>;
+ : ConstantUImmAsmOperandClass<10, [ConstantSImm10Lsl1AsmOperandClass]>;
+def ConstantSImm10AsmOperandClass
+ : ConstantSImmAsmOperandClass<10, [ConstantUImm10AsmOperandClass]>;
+def ConstantSImm9AsmOperandClass
+ : ConstantSImmAsmOperandClass<9, [ConstantSImm10AsmOperandClass]>;
+def ConstantSImm7Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "SImm7Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledSImm<7, 2>";
+ let SuperClasses = [ConstantSImm9AsmOperandClass];
+ let DiagnosticType = "SImm7_Lsl2";
+}
def ConstantUImm8AsmOperandClass
- : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<8, [ConstantSImm7Lsl2AsmOperandClass]>;
+def ConstantUImm7Sub1AsmOperandClass
+ : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass], -1> {
+ // Specify the names since the -1 offset causes invalid identifiers otherwise.
+ let Name = "UImm7_N1";
+ let DiagnosticType = "UImm7_N1";
+}
def ConstantUImm7AsmOperandClass
- : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<7, [ConstantUImm7Sub1AsmOperandClass]>;
+def ConstantUImm6Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "UImm6Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledUImm<6, 2>";
+ let SuperClasses = [ConstantUImm7AsmOperandClass];
+ let DiagnosticType = "UImm6_Lsl2";
+}
def ConstantUImm6AsmOperandClass
- : ConstantUImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<6, [ConstantUImm6Lsl2AsmOperandClass]>;
def ConstantSImm6AsmOperandClass
- : ConstantSImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>;
-def ConstantUImm5Plus1AsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 1>;
-def ConstantUImm5Plus32AsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>;
-def ConstantUImm5Plus33AsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 33>;
-def ConstantUImm5Plus32NormalizeAsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> {
- let Name = "ConstantUImm5_32_Norm";
- // We must also subtract 32 when we render the operand.
- let RenderMethod = "addConstantUImmOperands<5, 32, -32>";
-}
+ : ConstantSImmAsmOperandClass<6, [ConstantUImm6AsmOperandClass]>;
def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass {
let Name = "UImm5Lsl2";
let RenderMethod = "addImmOperands";
let PredicateMethod = "isScaledUImm<5, 2>";
- let SuperClasses = [ConstantUImm6AsmOperandClass];
+ let SuperClasses = [ConstantSImm6AsmOperandClass];
let DiagnosticType = "UImm5_Lsl2";
}
+def ConstantUImm5_Range2_64AsmOperandClass
+ : ConstantUImmRangeAsmOperandClass<2, 64, [ConstantUImm5Lsl2AsmOperandClass]>;
+def ConstantUImm5Plus33AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5_Range2_64AsmOperandClass],
+ 33>;
def ConstantUImm5ReportUImm6AsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> {
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus33AsmOperandClass]> {
let Name = "ConstantUImm5_0_Report_UImm6";
let DiagnosticType = "UImm5_0_Report_UImm6";
}
+def ConstantUImm5Plus32AsmOperandClass
+ : ConstantUImmAsmOperandClass<
+ 5, [ConstantUImm5ReportUImm6AsmOperandClass], 32>;
+def ConstantUImm5Plus32NormalizeAsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus32AsmOperandClass], 32> {
+ let Name = "ConstantUImm5_32_Norm";
+ // We must also subtract 32 when we render the operand.
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>";
+}
+def ConstantUImm5Plus1AsmOperandClass
+ : ConstantUImmAsmOperandClass<
+ 5, [ConstantUImm5Plus32NormalizeAsmOperandClass], 1>;
def ConstantUImm5AsmOperandClass
- : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus1AsmOperandClass]>;
+def ConstantSImm5AsmOperandClass
+ : ConstantSImmAsmOperandClass<5, [ConstantUImm5AsmOperandClass]>;
def ConstantUImm4AsmOperandClass
- : ConstantUImmAsmOperandClass<
- 4, [ConstantUImm5AsmOperandClass,
- ConstantUImm5Plus32AsmOperandClass,
- ConstantUImm5Plus32NormalizeAsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<4, [ConstantSImm5AsmOperandClass]>;
+def ConstantSImm4AsmOperandClass
+ : ConstantSImmAsmOperandClass<4, [ConstantUImm4AsmOperandClass]>;
def ConstantUImm3AsmOperandClass
- : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>;
+ : ConstantUImmAsmOperandClass<3, [ConstantSImm4AsmOperandClass]>;
def ConstantUImm2Plus1AsmOperandClass
: ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>;
def ConstantUImm2AsmOperandClass
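A loose standalone model of the narrowest-first ordering described at the top of this hunk. It only illustrates the intent of the superclass chain (an immediate is reported as the first class whose range accepts it); it is not how AsmMatcherEmitter actually resolves operand classes:

#include <cstdio>
#include <string>
#include <vector>

// Classes are listed narrowest-first, matching the linear ordering above,
// e.g. uimm4 < simm5 < uimm5 < uimm6.
struct ImmClass {
  std::string Name;
  long Min, Max;
};

static std::string classify(long Imm, const std::vector<ImmClass> &Ordered) {
  for (const ImmClass &C : Ordered)
    if (Imm >= C.Min && Imm <= C.Max)
      return C.Name;
  return "no match";
}

int main() {
  std::vector<ImmClass> Ordered = {
      {"uimm4", 0, 15}, {"simm5", -16, 15}, {"uimm5", 0, 31}, {"uimm6", 0, 63}};
  std::printf("%s\n", classify(7, Ordered).c_str());  // uimm4
  std::printf("%s\n", classify(-3, Ordered).c_str()); // simm5
  std::printf("%s\n", classify(20, Ordered).c_str()); // uimm5
  std::printf("%s\n", classify(40, Ordered).c_str()); // uimm6
  return 0;
}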
@@ -478,6 +629,12 @@ def brtarget : Operand<OtherVT> {
let DecoderMethod = "DecodeBranchTarget";
let ParserMatchClass = MipsJumpTargetAsmOperand;
}
+def brtarget1SImm16 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue1SImm16";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget1SImm16";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
def calltarget : Operand<iPTR> {
let EncoderMethod = "getJumpTargetOpValue";
let ParserMatchClass = MipsJumpTargetAsmOperand;
@@ -485,18 +642,6 @@ def calltarget : Operand<iPTR> {
def imm64: Operand<i64>;
-def simm6 : Operand<i32> {
- let ParserMatchClass = ConstantSImm6AsmOperandClass;
- let OperandType = "OPERAND_IMMEDIATE";
-}
-def simm9 : Operand<i32>;
-def simm10 : Operand<i32>;
-def simm11 : Operand<i32>;
-
-def simm16 : Operand<i32> {
- let DecoderMethod= "DecodeSimm16";
-}
-
def simm19_lsl2 : Operand<i32> {
let EncoderMethod = "getSimm19Lsl2Encoding";
let DecoderMethod = "DecodeSimm19Lsl2";
@@ -509,91 +654,207 @@ def simm18_lsl3 : Operand<i32> {
let ParserMatchClass = MipsJumpTargetAsmOperand;
}
-def simm20 : Operand<i32>;
-def simm32 : Operand<i32>;
-
-def uimm20 : Operand<i32> {
-}
-
-def simm16_64 : Operand<i64> {
- let DecoderMethod = "DecodeSimm16";
-}
-
// Zero
def uimmz : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<0>";
let ParserMatchClass = ConstantImmzAsmOperandClass;
}
+// size operand of ins instruction
+def uimm_range_2_64 : Operand<i32> {
+ let PrintMethod = "printUImm<6, 2>";
+ let EncoderMethod = "getSizeInsEncoding";
+ let DecoderMethod = "DecodeInsSize";
+ let ParserMatchClass = ConstantUImm5_Range2_64AsmOperandClass;
+}
+
// Unsigned Operands
-foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10} in
+foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 26} in
def uimm # I : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<" # I # ">";
let ParserMatchClass =
!cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
}
def uimm2_plus1 : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<2, 1>";
let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>";
let DecoderMethod = "DecodeUImmWithOffset<2, 1>";
let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass;
}
def uimm5_plus1 : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5, 1>";
let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
}
def uimm5_plus32 : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5, 32>";
let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass;
}
def uimm5_plus33 : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5, 33>";
let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass;
}
+def uimm5_inssize_plus1 : Operand<i32> {
+ let PrintMethod = "printUImm<6>";
+ let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
+ let EncoderMethod = "getSizeInsEncoding";
+ let DecoderMethod = "DecodeInsSize";
+}
+
def uimm5_plus32_normalize : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5>";
let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
}
def uimm5_lsl2 : Operand<OtherVT> {
let EncoderMethod = "getUImm5Lsl2Encoding";
- let DecoderMethod = "DecodeUImm5lsl2";
+ let DecoderMethod = "DecodeUImmWithOffsetAndScale<5, 0, 4>";
let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass;
}
def uimm5_plus32_normalize_64 : Operand<i64> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5>";
let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
}
+def uimm6_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getUImm6Lsl2Encoding";
+ let DecoderMethod = "DecodeUImmWithOffsetAndScale<6, 0, 4>";
+ let ParserMatchClass = ConstantUImm6Lsl2AsmOperandClass;
+}
+
+foreach I = {16} in
+ def uimm # I : Operand<i32> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+ }
+
+// Like uimm16 but coerces simm16 to uimm16.
+def uimm16_relaxed : Operand<i32> {
+ let PrintMethod = "printUImm<16>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
foreach I = {5} in
def uimm # I # _64 : Operand<i64> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<" # I # ">";
let ParserMatchClass =
!cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
}
+foreach I = {16} in
+ def uimm # I # _64 : Operand<i64> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+ }
+
+// Like uimm16_64 but coerces simm16 to uimm16.
+def uimm16_64_relaxed : Operand<i64> {
+ let PrintMethod = "printUImm<16>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
+// Like uimm5 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_report_uimm6 : Operand<i32> {
+ let PrintMethod = "printUImm<5>";
+ let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
+}
+
// Like uimm5_64 but reports a less confusing error for 32-63 when
// an instruction alias permits that.
def uimm5_64_report_uimm6 : Operand<i64> {
- let PrintMethod = "printUnsignedImm";
+ let PrintMethod = "printUImm<5>";
let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
}
-def uimm16 : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
+foreach I = {1, 2, 3, 4} in
+ def uimm # I # _ptr : Operand<iPTR> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {1, 2, 3, 4, 5, 6, 8} in
+ def vsplat_uimm # I : Operand<vAny> {
+ let PrintMethod = "printUImm<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+// Signed operands
+foreach I = {4, 5, 6, 9, 10, 11} in
+ def simm # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {1, 2, 3} in
+ def simm10_lsl # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<10, " # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm10Lsl" # I # "AsmOperandClass");
+ }
+
+foreach I = {10} in
+ def simm # I # _64 : Operand<i64> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+foreach I = {5, 10} in
+ def vsplat_simm # I : Operand<vAny> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+ }
+
+def simm7_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getSImm7Lsl2Encoding";
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ", 0, 4>";
+ let ParserMatchClass = ConstantSImm7Lsl2AsmOperandClass;
+}
+
+foreach I = {16, 32} in
+ def simm # I : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm" # I # "AsmOperandClass");
+ }
+
+// Like simm16 but coerces uimm16 to simm16.
+def simm16_relaxed : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm16RelaxedAsmOperandClass");
+}
+
+def simm16_64 : Operand<i64> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm16AsmOperandClass");
+}
+
+// Like simm32 but coerces uimm32 to simm32.
+def simm32_relaxed : Operand<i32> {
+ let DecoderMethod = "DecodeSImmWithOffsetAndScale<32>";
+ let ParserMatchClass = !cast<AsmOperandClass>("SImm32RelaxedAsmOperandClass");
}
-def pcrel16 : Operand<i32> {
+// This is almost the same as a uimm7 but 0x7f is interpreted as -1.
+def li16_imm : Operand<i32> {
+ let DecoderMethod = "DecodeLi16Imm";
+ let ParserMatchClass = ConstantUImm7Sub1AsmOperandClass;
}
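A small sketch of the decode rule the comment above describes, assuming the DecodeLi16Imm hook named in the def does nothing beyond remapping the all-ones encoding:

#include <cstdint>

// Sketch: the 7-bit li16 field encodes 0..126 directly and 0x7f stands for -1.
static int64_t decodeLi16Imm(uint32_t Field) {
  return Field == 0x7f ? -1 : static_cast<int64_t>(Field);
}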
def MipsMemAsmOperand : AsmOperandClass {
@@ -607,22 +868,44 @@ def MipsMemSimm9AsmOperand : AsmOperandClass {
let RenderMethod = "addMemOperands";
let ParserMethod = "parseMemOperand";
let PredicateMethod = "isMemWithSimmOffset<9>";
+ let DiagnosticType = "MemSImm9";
}
-def MipsMemSimm9GPRAsmOperand : AsmOperandClass {
- let Name = "MemOffsetSimm9GPR";
+def MipsMemSimm10AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm10";
let SuperClasses = [MipsMemAsmOperand];
let RenderMethod = "addMemOperands";
let ParserMethod = "parseMemOperand";
- let PredicateMethod = "isMemWithSimmOffsetGPR<9>";
+ let PredicateMethod = "isMemWithSimmOffset<10>";
+ let DiagnosticType = "MemSImm10";
}
+def MipsMemSimm12AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm12";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<12>";
+ let DiagnosticType = "MemSImm12";
+}
+
+foreach I = {1, 2, 3} in
+ def MipsMemSimm10Lsl # I # AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm10_" # I;
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<10, " # I # ">";
+ let DiagnosticType = "MemSImm10Lsl" # I;
+ }
+
def MipsMemSimm11AsmOperand : AsmOperandClass {
let Name = "MemOffsetSimm11";
let SuperClasses = [MipsMemAsmOperand];
let RenderMethod = "addMemOperands";
let ParserMethod = "parseMemOperand";
let PredicateMethod = "isMemWithSimmOffset<11>";
+ let DiagnosticType = "MemSImm11";
}
def MipsMemSimm16AsmOperand : AsmOperandClass {
@@ -631,6 +914,7 @@ def MipsMemSimm16AsmOperand : AsmOperandClass {
let RenderMethod = "addMemOperands";
let ParserMethod = "parseMemOperand";
let PredicateMethod = "isMemWithSimmOffset<16>";
+ let DiagnosticType = "MemSImm16";
}
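The DiagnosticType strings added above feed the generated asm matcher: for each one, tablegen emits a Match_<Name> failure kind that the Mips asm parser can translate into a targeted message. A rough sketch of that consuming side, with stand-in enum values and illustrative wording (the real enum comes from MipsGenAsmMatcher.inc and the real strings live in MipsAsmParser):

// Sketch only: stand-ins for the tablegen-generated failure kinds.
enum SketchMatchResult {
  Match_MemSImm9, Match_MemSImm10, Match_MemSImm12, Match_MemSImm16
};

static const char *diagnosticFor(SketchMatchResult R) {
  switch (R) {
  case Match_MemSImm9:  return "expected memory with 9-bit signed offset";
  case Match_MemSImm10: return "expected memory with 10-bit signed offset";
  case Match_MemSImm12: return "expected memory with 12-bit signed offset";
  case Match_MemSImm16: return "expected memory with 16-bit signed offset";
  }
  return "unknown match failure";
}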
def MipsInvertedImmoperand : AsmOperandClass {
@@ -664,24 +948,42 @@ def mem_msa : mem_generic {
let EncoderMethod = "getMSAMemEncoding";
}
+def simm12 : Operand<i32> {
+ let DecoderMethod = "DecodeSimm12";
+}
+
def mem_simm9 : mem_generic {
let MIOperandInfo = (ops ptr_rc, simm9);
let EncoderMethod = "getMemEncoding";
let ParserMatchClass = MipsMemSimm9AsmOperand;
}
-def mem_simm9gpr : mem_generic {
- let MIOperandInfo = (ops ptr_rc, simm9);
+def mem_simm10 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm10);
let EncoderMethod = "getMemEncoding";
- let ParserMatchClass = MipsMemSimm9GPRAsmOperand;
+ let ParserMatchClass = MipsMemSimm10AsmOperand;
}
+foreach I = {1, 2, 3} in
+ def mem_simm10_lsl # I : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, !cast<Operand>("simm10_lsl" # I));
+ let EncoderMethod = "getMemEncoding<" # I # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MipsMemSimm10Lsl" # I # "AsmOperand");
+ }
+
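The scaled mem operands generated above only accept offsets that are already shifted left by I bits; a sketch of the constraint the isMemWithSimmOffset<10, I> predicate presumably applies to a constant offset:

#include <cstdint>
#include "llvm/Support/MathExtras.h"

// Sketch: the offset must be a multiple of 2^Shift whose scaled-down value
// still fits in a signed 10-bit field.
template <unsigned Shift> static bool isSimm10ShiftedOffset(int64_t Offset) {
  return llvm::isShiftedInt<10, Shift>(Offset);
}
// e.g. with Shift = 2, the offsets -2048, 0 and 2044 are accepted, 2 is not.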
def mem_simm11 : mem_generic {
let MIOperandInfo = (ops ptr_rc, simm11);
let EncoderMethod = "getMemEncoding";
let ParserMatchClass = MipsMemSimm11AsmOperand;
}
+def mem_simm12 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm12);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm12AsmOperand;
+}
+
def mem_simm16 : mem_generic {
let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncoding";
@@ -735,6 +1037,9 @@ def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
// e.g. addi, andi
def immSExt15 : PatLeaf<(imm), [{ return isInt<15>(N->getSExtValue()); }]>;
+// Node immediate fits as 7-bit zero extended on target immediate.
+def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>;
+
// Node immediate fits as 16-bit zero extended on target immediate.
// The LO16 param means that only the lower 16 bits of the node
// immediate are caught.
@@ -755,6 +1060,16 @@ def immLow16Zero : PatLeaf<(imm), [{
// shamt field must fit in 5 bits.
def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+def immZExt5Plus1 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 1);
+}]>;
+def immZExt5Plus32 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 32);
+}]>;
+def immZExt5Plus33 : PatLeaf<(imm), [{
+ return isUInt<5>(N->getZExtValue() - 33);
+}]>;
+
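The PlusN predicates above just slide the window of an unsigned 5-bit field; a worked restatement of the ranges the isUInt<5>(x - N) tests accept:

#include <cstdint>
#include "llvm/Support/MathExtras.h"

// immZExt5Plus1  : isUInt<5>(x - 1)  accepts  1..32
// immZExt5Plus32 : isUInt<5>(x - 32) accepts 32..63
// immZExt5Plus33 : isUInt<5>(x - 33) accepts 33..64
// The subtraction happens on the zero-extended value, so x = 0 wraps around
// and is correctly rejected by the Plus1 form.
static bool zext5Plus(uint64_t X, uint64_t N) {
  return llvm::isUInt<5>(X - N);
}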
// True if (N + 1) fits in 16-bit field.
def immSExt16Plus1 : PatLeaf<(imm), [{
return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
@@ -768,9 +1083,6 @@ def addr :
def addrRegImm :
ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
-def addrRegReg :
- ComplexPattern<iPTR, 2, "selectAddrRegReg", [frameindex]>;
-
def addrDefault :
ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
@@ -849,15 +1161,21 @@ class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
}
// Memory Load/Store
-class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
- InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
- InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LoadMemory<string opstr, DAGOperand RO, DAGOperand MO,
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary,
+ ComplexPattern Addr = addr> :
+ InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let canFoldAsLoad = 1;
let mayLoad = 1;
}
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ LoadMemory<opstr, RO, mem, OpNode, Itin, Addr>;
+
class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
SDPatternOperator OpNode = null_frag,
InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
@@ -868,8 +1186,9 @@ class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
}
class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
- InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
- StoreMemory<opstr, RO, mem, OpNode, Itin, Addr>;
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr,
+ DAGOperand MO = mem> :
+ StoreMemory<opstr, RO, MO, OpNode, Itin, Addr>;
// Load/Store Left/Right
let canFoldAsLoad = 1 in
@@ -892,7 +1211,8 @@ class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
// COP2 Load/Store
class LW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> :
- InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ InstSE<(outs RC:$rt), (ins mem_simm16:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
[(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
let DecoderMethod = "DecodeFMem2";
let mayLoad = 1;
@@ -900,7 +1220,8 @@ class LW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
class SW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> :
- InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ InstSE<(outs), (ins RC:$rt, mem_simm16:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
let DecoderMethod = "DecodeFMem2";
let mayStore = 1;
@@ -934,6 +1255,7 @@ class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
let isTerminator = 1;
let hasDelaySlot = DelaySlot;
let Defs = [AT];
+ bit isCTI = 1;
}
class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
@@ -946,6 +1268,7 @@ class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
let isTerminator = 1;
let hasDelaySlot = DelaySlot;
let Defs = [AT];
+ bit isCTI = 1;
}
// SetCC
@@ -972,6 +1295,7 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
let hasDelaySlot = 1;
let DecoderMethod = "DecodeJumpTarget";
let Defs = [AT];
+ bit isCTI = 1;
}
// Unconditional branch
@@ -984,10 +1308,11 @@ class UncondBranch<Instruction BEQInst> :
let hasDelaySlot = 1;
let AdditionalPredicates = [RelocPIC];
let Defs = [AT];
+ bit isCTI = 1;
}
// Base class for indirect branch and return instruction classes.
-let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1, isCTI = 1 in
class JumpFR<string opstr, RegisterOperand RO,
SDPatternOperator operator = null_frag>:
InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR,
@@ -1000,7 +1325,7 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
}
// Jump and Link (Call)
-let isCall=1, hasDelaySlot=1, Defs = [RA] in {
+let isCall=1, hasDelaySlot=1, isCTI=1, Defs = [RA] in {
class JumpLink<string opstr, DAGOperand opnd> :
InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
[(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> {
@@ -1026,7 +1351,7 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
- hasExtraSrcRegAllocReq = 1, Defs = [AT] in {
+ hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in {
class TailCall<Instruction JumpInst> :
PseudoSE<(outs), (ins calltarget:$target), [], II_J>,
PseudoInstExpansion<(JumpInst jmptarget:$target)>;
@@ -1045,54 +1370,61 @@ class BAL_BR_Pseudo<Instruction RealInst> :
let isBarrier = 1;
let hasDelaySlot = 1;
let Defs = [RA];
+ bit isCTI = 1;
}
+let isCTI = 1 in {
// Syscall
-class SYS_FT<string opstr> :
- InstSE<(outs), (ins uimm20:$code_),
- !strconcat(opstr, "\t$code_"), [], NoItinerary, FrmI, opstr>;
+class SYS_FT<string opstr, Operand ImmOp, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins ImmOp:$code_),
+ !strconcat(opstr, "\t$code_"), [], itin, FrmI, opstr>;
// Break
class BRK_FT<string opstr> :
InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
- !strconcat(opstr, "\t$code_1, $code_2"), [], NoItinerary,
+ !strconcat(opstr, "\t$code_1, $code_2"), [], II_BREAK,
FrmOther, opstr>;
// (D)Eret
-class ER_FT<string opstr> :
+class ER_FT<string opstr, InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins),
- opstr, [], NoItinerary, FrmOther, opstr>;
-
-// Interrupts
-class DEI_FT<string opstr, RegisterOperand RO> :
- InstSE<(outs RO:$rt), (ins),
- !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther, opstr>;
+ opstr, [], itin, FrmOther, opstr>;
// Wait
class WAIT_FT<string opstr> :
- InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther, opstr>;
+ InstSE<(outs), (ins), opstr, [], II_WAIT, FrmOther, opstr>;
+}
+
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs RO:$rt), (ins),
+ !strconcat(opstr, "\t$rt"), [], itin, FrmOther, opstr>;
// Sync
let hasSideEffects = 1 in
class SYNC_FT<string opstr> :
- InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
- NoItinerary, FrmOther, opstr>;
+ InstSE<(outs), (ins uimm5:$stype), "sync $stype",
+ [(MipsSync immZExt5:$stype)], II_SYNC, FrmOther, opstr>;
class SYNCI_FT<string opstr> :
InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
- NoItinerary, FrmOther, opstr> {
+ II_SYNCI, FrmOther, opstr> {
let hasSideEffects = 1;
let DecoderMethod = "DecodeSyncI";
}
-let hasSideEffects = 1 in
-class TEQ_FT<string opstr, RegisterOperand RO> :
- InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
- !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary,
- FrmI, opstr>;
-
-class TEQI_FT<string opstr, RegisterOperand RO> :
- InstSE<(outs), (ins RO:$rs, uimm16:$imm16),
- !strconcat(opstr, "\t$rs, $imm16"), [], NoItinerary, FrmOther, opstr>;
+let hasSideEffects = 1, isCTI = 1 in {
+class TEQ_FT<string opstr, RegisterOperand RO, Operand ImmOp,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt, ImmOp:$code_),
+ !strconcat(opstr, "\t$rs, $rt, $code_"), [], itin, FrmI, opstr>;
+
+class TEQI_FT<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins RO:$rs, simm16:$imm16),
+ !strconcat(opstr, "\t$rs, $imm16"), [], itin, FrmOther, opstr>;
+}
+
// Mul, Div
class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
list<Register> DefRegs> :
@@ -1163,20 +1495,23 @@ class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
class EffectiveAddress<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(set RO:$rt, addr:$addr)], NoItinerary, FrmI,
+ [(set RO:$rt, addr:$addr)], II_ADDIU, FrmI,
!strconcat(opstr, "_lea")> {
let isCodeGenOnly = 1;
+ let hasNoSchedulingInfo = 1;
let DecoderMethod = "DecodeMem";
}
// Count Leading Ones/Zeros in Word
-class CountLeading0<string opstr, RegisterOperand RO>:
+class CountLeading0<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>;
+ [(set RO:$rd, (ctlz RO:$rs))], itin, FrmR, opstr>;
-class CountLeading1<string opstr, RegisterOperand RO>:
+class CountLeading1<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>;
+ [(set RO:$rd, (ctlz (not RO:$rs)))], itin, FrmR, opstr>;
// Sign Extend in Register.
class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
@@ -1199,15 +1534,16 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
// Ext and Ins
class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
- Operand SizeOpnd, SDPatternOperator Op = null_frag> :
+ Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
+ SDPatternOperator Op = null_frag> :
InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
- [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT,
+ [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
FrmR, opstr>, ISA_MIPS32R2;
class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
- SDPatternOperator Op = null_frag>:
- InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
+ Operand SizeOpnd, SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
[(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
II_INS, FrmR, opstr>, ISA_MIPS32R2 {
@@ -1224,35 +1560,38 @@ class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
-class LLBase<string opstr, RegisterOperand RO> :
- InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [], NoItinerary, FrmI> {
+class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
+ InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [], II_LL, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let mayLoad = 1;
}
class SCBase<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
- !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+ !strconcat(opstr, "\t$rt, $addr"), [], II_SC, FrmI> {
let DecoderMethod = "DecodeMem";
let mayStore = 1;
let Constraints = "$rt = $dst";
}
-class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD> :
- InstSE<(outs RO:$rt), (ins RD:$rd, uimm16:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
+class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+ InstrItinClass itin> :
+ InstSE<(outs RO:$rt), (ins RD:$rd, uimm3:$sel),
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
-class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD> :
- InstSE<(outs RO:$rd), (ins RD:$rt, uimm16:$sel),
- !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
+class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+ InstrItinClass itin> :
+ InstSE<(outs RO:$rd), (ins RD:$rt, uimm3:$sel),
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
class TrapBase<Instruction RealInst>
- : PseudoSE<(outs), (ins), [(trap)], NoItinerary>,
+ : PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
PseudoInstExpansion<(RealInst 0, 0)> {
let isBarrier = 1;
let isTerminator = 1;
let isCodeGenOnly = 1;
+ let isCTI = 1;
}
//===----------------------------------------------------------------------===//
@@ -1260,11 +1599,13 @@ class TrapBase<Instruction RealInst>
//===----------------------------------------------------------------------===//
// Return RA.
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in
-def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
+ let hasDelaySlot=1 in
+ def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
-let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, hasSideEffects=1 in
-def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
+ let hasSideEffects=1 in
+ def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
+}
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
@@ -1303,7 +1644,7 @@ let usesCustomInserter = 1 in {
}
/// Pseudo instructions for loading and storing accumulator registers.
-let isPseudo = 1, isCodeGenOnly = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC64 : Load<"", ACC64>;
def STORE_ACC64 : Store<"", ACC64>;
}
@@ -1329,40 +1670,39 @@ def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
/// Arithmetic Instructions (ALU Immediate)
let AdditionalPredicates = [NotInMicroMips] in {
-def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16, GPR32Opnd,
- II_ADDIU, immSExt16, add>,
- ADDI_FM<0x9>, IsAsCheapAsAMove;
-}
-def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>,
+ def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16_relaxed, GPR32Opnd,
+ II_ADDIU, immSExt16, add>,
+ ADDI_FM<0x9>, IsAsCheapAsAMove;
+
+ def ANDi : MMRel, StdMMR6Rel,
+ ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
+ ADDI_FM<0xc>;
+ def ORi : MMRel, StdMMR6Rel,
+ ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
+ ADDI_FM<0xd>;
+ def XORi : MMRel, StdMMR6Rel,
+ ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
+ ADDI_FM<0xe>;
+}
+def ADDi : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>, ADDI_FM<0x8>,
ISA_MIPS1_NOT_32R6_64R6;
def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
SLTI_FM<0xa>;
def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
SLTI_FM<0xb>;
-let AdditionalPredicates = [NotInMicroMips] in {
-def ANDi : MMRel, StdMMR6Rel,
- ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
- ADDI_FM<0xc>;
-}
-def ORi : MMRel, StdMMR6Rel,
- ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
- ADDI_FM<0xd>;
-def XORi : MMRel, StdMMR6Rel,
- ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
- ADDI_FM<0xe>;
-def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM;
+def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
let AdditionalPredicates = [NotInMicroMips] in {
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu : MMRel, StdMMR6Rel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
ADD_FM<0, 0x21>;
-def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+def SUBu : MMRel, StdMMR6Rel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
ADD_FM<0, 0x23>;
}
let Defs = [HI0, LO0] in
def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
-def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
-def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
+def ADD : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>, ADD_FM<0, 0x20>;
+def SUB : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>, ADD_FM<0, 0x22>;
def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
let AdditionalPredicates = [NotInMicroMips] in {
@@ -1372,8 +1712,8 @@ def OR : MMRel, StdMMR6Rel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
ADD_FM<0, 0x25>;
def XOR : MMRel, StdMMR6Rel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
ADD_FM<0, 0x26>;
-}
def NOR : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+}
/// Shift Instructions
let AdditionalPredicates = [NotInMicroMips] in {
@@ -1381,7 +1721,6 @@ def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
immZExt5>, SRA_FM<0, 0>;
def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
immZExt5>, SRA_FM<2, 0>;
-}
def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
immZExt5>, SRA_FM<3, 0>;
def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
@@ -1390,25 +1729,30 @@ def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
SRLV_FM<6, 0>;
def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
SRLV_FM<7, 0>;
+}
// Rotate Instructions
-def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
- immZExt5>,
- SRA_FM<2, 1>, ISA_MIPS32R2;
-def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
- SRLV_FM<6, 1>, ISA_MIPS32R2;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+ immZExt5>,
+ SRA_FM<2, 1>, ISA_MIPS32R2;
+ def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
+ SRLV_FM<6, 1>, ISA_MIPS32R2;
+}
/// Load and Store Instructions
/// aligned
-def LB : Load<"lb", GPR32Opnd, sextloadi8, II_LB>, MMRel, LW_FM<0x20>;
-def LBu : Load<"lbu", GPR32Opnd, zextloadi8, II_LBU, addrDefault>, MMRel,
- LW_FM<0x24>;
-def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel,
- LW_FM<0x21>;
-def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>;
+def LB : LoadMemory<"lb", GPR32Opnd, mem_simm16, sextloadi8, II_LB>, MMRel,
+ LW_FM<0x20>;
+def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simm16, zextloadi8, II_LBU,
+ addrDefault>, MMRel, LW_FM<0x24>;
let AdditionalPredicates = [NotInMicroMips] in {
-def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
- LW_FM<0x23>;
+ def LH : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
+ addrDefault>, MMRel, LW_FM<0x21>;
+ def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+ MMRel, LW_FM<0x25>;
+ def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
+ LW_FM<0x23>;
}
def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
LW_FM<0x28>;
@@ -1432,82 +1776,87 @@ def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
let AdditionalPredicates = [NotInMicroMips] in {
// COP2 Memory Instructions
-def LWC2 : LW_FT2<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
+def LWC2 : StdMMR6Rel, LW_FT2<"lwc2", COP2Opnd, II_LWC2, load>, LW_FM<0x32>,
ISA_MIPS1_NOT_32R6_64R6;
-def SWC2 : SW_FT2<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>,
- ISA_MIPS1_NOT_32R6_64R6;
-def LDC2 : LW_FT2<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>,
- ISA_MIPS2_NOT_32R6_64R6;
-def SDC2 : SW_FT2<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>,
+def SWC2 : StdMMR6Rel, SW_FT2<"swc2", COP2Opnd, II_SWC2, store>,
+ LW_FM<0x3a>, ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : StdMMR6Rel, LW_FT2<"ldc2", COP2Opnd, II_LDC2, load>, LW_FM<0x36>,
ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : StdMMR6Rel, SW_FT2<"sdc2", COP2Opnd, II_SDC2, store>,
+ LW_FM<0x3e>, ISA_MIPS2_NOT_32R6_64R6;
// COP3 Memory Instructions
let DecoderNamespace = "COP3_" in {
- def LWC3 : LW_FT3<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
- def SWC3 : SW_FT3<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
- def LDC3 : LW_FT3<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>,
+ def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>;
+ def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>;
+ def LDC3 : LW_FT3<"ldc3", COP3Opnd, II_LDC3, load>, LW_FM<0x37>,
ISA_MIPS2;
- def SDC3 : SW_FT3<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
+ def SDC3 : SW_FT3<"sdc3", COP3Opnd, II_SDC3, store>, LW_FM<0x3f>,
ISA_MIPS2;
}
}
-def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
+def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM,
+ ISA_MIPS32;
def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
let AdditionalPredicates = [NotInMicroMips] in {
- def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2;
- def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2;
- def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2;
- def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2;
- def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2;
- def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2;
+ def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>, ISA_MIPS2;
+ def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>, ISA_MIPS2;
+ def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>, ISA_MIPS2;
+ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>, ISA_MIPS2;
+ def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>, ISA_MIPS2;
+ def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>, ISA_MIPS2;
}
-def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>,
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
ISA_MIPS2_NOT_32R6_64R6;
-def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>,
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
ISA_MIPS2_NOT_32R6_64R6;
-def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>,
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
ISA_MIPS2_NOT_32R6_64R6;
-def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>,
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
ISA_MIPS2_NOT_32R6_64R6;
-def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>,
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
ISA_MIPS2_NOT_32R6_64R6;
-def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>,
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
ISA_MIPS2_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>;
}
-def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
def TRAP : TrapBase<BREAK>;
-def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+}
let AdditionalPredicates = [NotInMicroMips] in {
- def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
- def ERETNC : MMRel, ER_FT<"eretnc">, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
- def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f, 0x0>, ISA_MIPS32;
+ def ERET : MMRel, ER_FT<"eret", II_ERET>, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
+ def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+ def DERET : MMRel, ER_FT<"deret", II_DERET>, ER_FM<0x1f, 0x0>, ISA_MIPS32;
}
let AdditionalPredicates = [NotInMicroMips] in {
- def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2;
- def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2;
+ def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>, ISA_MIPS32R2;
+ def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>, ISA_MIPS32R2;
}
let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
AdditionalPredicates = [NotInMicroMips] in {
def WAIT : WAIT_FT<"wait">, WAIT_FM;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
/// Load-linked, Store-conditional
-def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2_NOT_32R6_64R6;
-def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2_NOT_32R6_64R6;
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
}
/// Jump and Branch Instructions
def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
- AdditionalRequires<[RelocStatic]>, IsBranch;
-def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
+ AdditionalRequires<[RelocNotPIC]>, IsBranch;
+def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6;
def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
def BEQL : MMRel, CBranch<"beql", brtarget, seteq, GPR32Opnd, 0>,
BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
@@ -1562,6 +1911,7 @@ class PseudoIndirectBranchBase<RegisterOperand RO> :
let hasDelaySlot = 1;
let isBranch = 1;
let isIndirectBranch = 1;
+ bit isCTI = 1;
}
def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
@@ -1579,6 +1929,7 @@ class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
let isCodeGenOnly = 1;
let hasCtrlDep = 1;
let hasExtraSrcRegAllocReq = 1;
+ bit isCTI = 1;
}
def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
@@ -1596,7 +1947,7 @@ def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in {
def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
[(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
@@ -1609,11 +1960,12 @@ def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
-def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
-def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
-
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
+ def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
+}
def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
ISA_MIPS1_NOT_32R6_64R6;
def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
@@ -1633,9 +1985,9 @@ def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
/// Count Leading
-def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>,
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
ISA_MIPS32_NOT_32R6_64R6;
-def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>,
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
ISA_MIPS32_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
@@ -1681,29 +2033,39 @@ def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>,
ISA_MIPS32_NOT_32R6_64R6;
}
-def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
- 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
-def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
- 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
-def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+ def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+ // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+ def EXT : MMRel, StdMMR6Rel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
+ immZExt5, immZExt5Plus1, MipsExt>,
+ EXT_FM<0>;
+ def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
+ uimm5_inssize_plus1, MipsIns>,
+ EXT_FM<4>;
}
-// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
-def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, MipsExt>,
- EXT_FM<0>;
-def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
-
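For reference next to the TODO above, the architectural bitfield semantics these ext/ins patterns model, written out as plain C++ (a sketch of the instruction behaviour, not of any function in this backend); the pos + size <= 32 constraint mentioned in the TODO is exactly what keeps the masks below well defined:

#include <cstdint>

// ext rt, rs, pos, size: extract `size` bits of rs starting at bit `pos`,
// valid for 0 < size <= 32 and pos + size <= 32.
static uint32_t mipsExt(uint32_t Rs, unsigned Pos, unsigned Size) {
  uint64_t Mask = (uint64_t(1) << Size) - 1;
  return uint32_t((Rs >> Pos) & Mask);
}

// ins rt, rs, pos, size: insert the low `size` bits of rs into rt at `pos`.
static uint32_t mipsIns(uint32_t Rt, uint32_t Rs, unsigned Pos, unsigned Size) {
  uint64_t Mask = ((uint64_t(1) << Size) - 1) << Pos;
  return uint32_t((Rt & ~Mask) | ((uint64_t(Rs) << Pos) & Mask));
}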
/// Move Control Registers From/To CPU Registers
-def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd>, MFC3OP_FM<0x10, 0>, ISA_MIPS32;
-def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd>, MFC3OP_FM<0x10, 4>, ISA_MIPS32;
-def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd>, MFC3OP_FM<0x12, 0>;
-def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd>, MFC3OP_FM<0x12, 4>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>, MFC3OP_FM<0x10, 4>,
+ ISA_MIPS32;
+ def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>, MFC3OP_FM<0x10, 0>,
+ ISA_MIPS32;
+}
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>, MFC3OP_FM<0x12, 4>;
+
+class Barrier<string asmstr, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
-class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
- FrmOther, asmstr>;
-def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop">, BARRIER_FM<1>;
-def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>;
-def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>;
+def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>;
+
+let isCTI = 1 in
+def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
+ ISA_MIPS32R2;
// JR_HB and JALR_HB are defined here using the new style naming
// scheme because some of this code is shared with Mips32r6InstrInfo.td
@@ -1724,19 +2086,21 @@ class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
list<dag> Pattern = [];
}
-class JR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>,
JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
let isBranch=1;
let isIndirectBranch=1;
let hasDelaySlot=1;
let isTerminator=1;
let isBarrier=1;
+ bit isCTI = 1;
}
-class JALR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>,
JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
let isIndirectBranch=1;
let hasDelaySlot=1;
+ bit isCTI = 1;
}
class JR_HB_ENC : JR_HB_FM<8>;
@@ -1745,23 +2109,25 @@ class JALR_HB_ENC : JALR_HB_FM<9>;
def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
-class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
- FrmOther, asmstr>;
-def TLBP : MMRel, TLB<"tlbp">, COP0_TLB_FM<0x08>;
-def TLBR : MMRel, TLB<"tlbr">, COP0_TLB_FM<0x01>;
-def TLBWI : MMRel, TLB<"tlbwi">, COP0_TLB_FM<0x02>;
-def TLBWR : MMRel, TLB<"tlbwr">, COP0_TLB_FM<0x06>;
-
-class CacheOp<string instr_asm, Operand MemOpnd> :
+class TLB<string asmstr, InstrItinClass itin = NoItinerary> :
+ InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>;
+def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>;
+def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>;
+def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>;
+}
+class CacheOp<string instr_asm, Operand MemOpnd,
+ InstrItinClass itin = NoItinerary> :
InstSE<(outs), (ins MemOpnd:$addr, uimm5:$hint),
- !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther,
+ !strconcat(instr_asm, "\t$hint, $addr"), [], itin, FrmOther,
instr_asm> {
let DecoderMethod = "DecodeCacheOp";
}
-def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>,
+def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
INSN_MIPS3_32_NOT_32R6_64R6;
-def PREF : MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>,
+def PREF : MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
INSN_MIPS3_32_NOT_32R6_64R6;
def ROL : MipsAsmPseudoInst<(outs),
@@ -1808,6 +2174,9 @@ def : MipsInstAlias<"dror $rd, $rs",
def : MipsInstAlias<"dror $rd, $imm",
(DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+def ABSMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+ "abs\t$rd, $rs">;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
@@ -1823,47 +2192,66 @@ def : MipsInstAlias<"move $dst, $src",
}
def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"addu $rs, $rt, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : MipsInstAlias<"addu $rs, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
-def : MipsInstAlias<"add $rs, $rt, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"add $rs, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"and $rs, $rt, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : MipsInstAlias<"and $rs, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
+def : MipsInstAlias<
+ "addu $rs, $rt, $imm",
+ (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "addu $rs, $imm",
+ (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "add $rs, $rt, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+ "add $rs, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+ "and $rs, $rt, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "and $rs, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
let Predicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
}
def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
-def : MipsInstAlias<"not $rt, $rs",
- (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<"neg $rt, $rs",
(SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
def : MipsInstAlias<"negu $rt",
(SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 0>;
def : MipsInstAlias<"negu $rt, $rs",
(SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : MipsInstAlias<"slt $rs, $rt, $imm",
- (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : MipsInstAlias<"sltu $rt, $rs, $imm",
- (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm16:$imm), 0>;
-def : MipsInstAlias<"xor $rs, $rt, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-def : MipsInstAlias<"xor $rs, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
-def : MipsInstAlias<"or $rs, $rt, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-def : MipsInstAlias<"or $rs, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+def : MipsInstAlias<
+ "slt $rs, $rt, $imm",
+ (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+ "sltu $rt, $rs, $imm",
+ (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
let AdditionalPredicates = [NotInMicroMips] in {
-def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+ def : MipsInstAlias<
+ "and $rs, $rt, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "and $rs, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "xor $rs, $rt, $imm",
+ (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "xor $rs, $imm",
+ (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "or $rs, $rt, $imm",
+ (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "or $rs, $imm",
+ (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+ def : MipsInstAlias<
+ "not $rt, $rs",
+ (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+ def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
}
def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
@@ -1880,7 +2268,9 @@ def : MipsInstAlias<"beqz $rs,$offset",
(BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
def : MipsInstAlias<"beqzl $rs,$offset",
(BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+}
def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
@@ -1902,8 +2292,6 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"tne $rs, $rt",
(TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
}
-def : MipsInstAlias<"sll $rd, $rt, $rs",
- (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"sub, $rd, $rs, $imm",
(ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
InvertedImOperand:$imm), 0>, ISA_MIPS1_NOT_32R6_64R6;
@@ -1915,10 +2303,14 @@ def : MipsInstAlias<"subu, $rd, $rs, $imm",
InvertedImOperand:$imm), 0>;
def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs,
InvertedImOperand:$imm), 0>;
-def : MipsInstAlias<"sra $rd, $rt, $rs",
- (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
-def : MipsInstAlias<"srl $rd, $rt, $rs",
- (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"sll $rd, $rt, $rs",
+ (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"sra $rd, $rt, $rs",
+ (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+ def : MipsInstAlias<"srl $rd, $rt, $rs",
+ (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+}
def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
def : MipsInstAlias<"sync",
(SYNC 0), 1>, ISA_MIPS2;
@@ -1926,10 +2318,11 @@ def : MipsInstAlias<"sync",
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
+// We use i32imm on li/la to defer range checking to the assembler.
class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadImm32 : LoadImmediate32<"li", simm32, GPR32Opnd>;
+def LoadImm32 : LoadImmediate32<"li", i32imm, GPR32Opnd>;
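Deferring the range check to the assembler means the li macro itself decides how to materialise the value; a rough sketch of that choice, assuming the expansion follows the classic addiu / ori / lui+ori pattern (the actual helper in MipsAsmParser handles more cases than shown):

#include <cstdint>
#include "llvm/Support/MathExtras.h"

// Sketch of how `li $rt, imm32` is commonly expanded once the 32-bit
// immediate has been range-checked.
enum class LiExpansion { Addiu, Ori, LuiOri };

static LiExpansion classifyLoadImm32(int64_t Imm) {
  if (llvm::isInt<16>(Imm))
    return LiExpansion::Addiu;  // addiu $rt, $zero, imm
  if (llvm::isUInt<16>(Imm))
    return LiExpansion::Ori;    // ori $rt, $zero, imm
  return LiExpansion::LuiOri;   // lui $rt, hi16; ori $rt, $rt, lo16
}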
class LoadAddressFromReg32<string instr_asm, Operand MemOpnd,
RegisterOperand RO> :
@@ -1940,17 +2333,18 @@ def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>;
class LoadAddressFromImm32<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadAddrImm32 : LoadAddressFromImm32<"la", simm32, GPR32Opnd>;
+def LoadAddrImm32 : LoadAddressFromImm32<"la", i32imm, GPR32Opnd>;
def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
"jal\t$rd, $rs"> ;
def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
"jal\t$rs"> ;
-def NORImm : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
- "nor\t$rs, $rt, $imm"> ;
+def NORImm : MipsAsmPseudoInst<
+ (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
+ "nor\t$rs, $rt, $imm"> ;
-let hasDelaySlot = 1 in {
+let hasDelaySlot = 1, isCTI = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
(ins imm64:$imm64, brtarget:$offset),
"bne\t$rt, $imm64, $offset">;
@@ -1981,6 +2375,7 @@ def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+let isCTI = 1 in
class CondBranchImmPseudo<string instr_asm> :
MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
!strconcat(instr_asm, "\t$rs, $imm, $offset")>;
@@ -2008,17 +2403,34 @@ def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
// Once the tablegen-erated errors are made better, this needs to be fixed and
// predicates need to be restored.
-def SDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "div\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6;
-
-def UDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "divu\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6;
-
-def DSDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddiv\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6;
-
-def DUDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddivu\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6;
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "div\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "divu\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddiv\t$rd, $rs, $rt">,
+ ISA_MIPS64_NOT_64R6;
+def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddivu\t$rd, $rs, $rt">,
+ ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS64_NOT_64R6;
def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
"ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
@@ -2055,14 +2467,14 @@ def : MipsPat<(i32 imm:$imm),
(ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
// Carry MipsPatterns
-def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
- (SUBu GPR32:$lhs, GPR32:$rhs)>;
-let AdditionalPredicates = [NotDSP] in {
- def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
- (ADDu GPR32:$lhs, GPR32:$rhs)>;
- def : MipsPat<(addc GPR32:$src, immSExt16:$imm),
- (ADDiu GPR32:$src, imm:$imm)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+ (SUBu GPR32:$lhs, GPR32:$rhs)>;
}
+def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
+ (ADDu GPR32:$lhs, GPR32:$rhs)>, ASE_NOT_DSP;
+def : MipsPat<(addc GPR32:$src, immSExt16:$imm),
+ (ADDiu GPR32:$src, imm:$imm)>, ASE_NOT_DSP;
// Support multiplication for pre-Mips32 targets that don't have
// the MUL instruction.
@@ -2138,7 +2550,9 @@ def : MipsPat<(not GPR32:$in),
// extended loads
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
-def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+}
// peepholes
def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
@@ -2235,15 +2649,17 @@ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
// Load halfword/word patterns.
let AddedComplexity = 40 in {
def : LoadRegImmPat<LBu, i32, zextloadi8>;
- def : LoadRegImmPat<LH, i32, sextloadi16>;
let AdditionalPredicates = [NotInMicroMips] in {
- def : LoadRegImmPat<LW, i32, load>;
+ def : LoadRegImmPat<LH, i32, sextloadi16>;
+ def : LoadRegImmPat<LW, i32, load>;
}
}
// Atomic load patterns.
def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
-def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
+}
def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
// Atomic store patterns.
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 49fb99a8ec43..e721312390d2 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -63,8 +63,7 @@ namespace {
public:
static char ID;
MipsLongBranch(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+ : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
const char *getPassName() const override {
@@ -73,11 +72,16 @@ namespace {
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
private:
void splitMBB(MachineBasicBlock *MBB);
void initMBBInfo();
int64_t computeOffset(const MachineInstr *Br);
- void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL,
+ void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
MachineBasicBlock *MBBOpnd);
void expandToLongBranch(MBBInfo &Info);
@@ -113,7 +117,7 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
// Traverse the list of instructions backwards until a non-debug instruction is
// found or it reaches E.
-static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) {
+static ReverseIter getNonDebugInstr(ReverseIter B, const ReverseIter &E) {
for (; B != E; ++B)
if (!B->isDebugValue())
return B;
@@ -160,8 +164,8 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
void MipsLongBranch::initMBBInfo() {
// Split the MBBs if they have two branches. Each basic block should have at
// most one branch after this loop is executed.
- for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;)
- splitMBB(&*I++);
+ for (auto &MBB : *MF)
+ splitMBB(&MBB);
MF->RenumberBlocks();
MBBInfos.clear();
@@ -175,17 +179,15 @@ void MipsLongBranch::initMBBInfo() {
// Compute size of MBB.
for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
MI != MBB->instr_end(); ++MI)
- MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI);
+ MBBInfos[I].Size += TII->GetInstSizeInBytes(*MI);
// Search for MBB's branch instruction.
ReverseIter End = MBB->rend();
ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
if ((Br != End) && !Br->isIndirectBranch() &&
- (Br->isConditionalBranch() ||
- (Br->isUnconditionalBranch() &&
- TM.getRelocationModel() == Reloc::PIC_)))
- MBBInfos[I].Br = (++Br).base();
+ (Br->isConditionalBranch() || (Br->isUnconditionalBranch() && IsPIC)))
+ MBBInfos[I].Br = &*(++Br).base();
}
}
@@ -213,7 +215,8 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
// Replace Br with a branch which has the opposite condition code and a
// MachineBasicBlock operand MBBOpnd.
void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
- DebugLoc DL, MachineBasicBlock *MBBOpnd) {
+ const DebugLoc &DL,
+ MachineBasicBlock *MBBOpnd) {
const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
MBB.getParent()->getSubtarget().getInstrInfo());
unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
@@ -238,7 +241,7 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
// Bundle the instruction in the delay slot to the newly created branch
// and erase the original branch.
assert(Br->isBundledWithSucc());
- MachineBasicBlock::instr_iterator II(Br);
+ MachineBasicBlock::instr_iterator II = Br.getInstrIterator();
MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
}
Br->eraseFromParent();
@@ -329,23 +332,26 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
.addReg(Mips::SP).addImm(0);
- if (!Subtarget.isTargetNaCl()) {
- MIBundleBuilder(*BalTgtMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
- .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(8));
- } else {
- // In NaCl, modifying the sp is not allowed in branch delay slot.
+ // In NaCl, modifying the sp is not allowed in branch delay slot.
+ if (Subtarget.isTargetNaCl())
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
.addReg(Mips::SP).addImm(8);
- MIBundleBuilder(*BalTgtMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
- .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+ if (Subtarget.hasMips32r6())
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR))
+ .addReg(Mips::ZERO).addReg(Mips::AT);
+ else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+ if (Subtarget.isTargetNaCl()) {
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP));
// Bundle-align the target of indirect branch JR.
TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
- }
+ } else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8);
+
+ BalTgtMBB->rbegin()->bundleWithPred();
} else {
// $longbr:
// daddiu $sp, $sp, -16
@@ -404,10 +410,15 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
.addReg(Mips::SP_64).addImm(0);
- MIBundleBuilder(*BalTgtMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64))
- .append(BuildMI(*MF, DL, TII->get(Mips::DADDiu), Mips::SP_64)
- .addReg(Mips::SP_64).addImm(16));
+ if (Subtarget.hasMips64r6())
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64))
+ .addReg(Mips::ZERO_64).addReg(Mips::AT_64);
+ else
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(16);
+ BalTgtMBB->rbegin()->bundleWithPred();
}
assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize);
@@ -457,8 +468,7 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
if (STI.inMips16Mode() || !STI.enableLongBranchPass())
return false;
- if ((TM.getRelocationModel() == Reloc::PIC_) &&
- static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
+ if (IsPIC && static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
emitGPDisp(F, TII);
@@ -506,7 +516,7 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
return true;
// Compute basic block addresses.
- if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (IsPIC) {
uint64_t Address = 0;
for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 80d9b75b85b7..d5bc4e537c37 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -36,36 +36,87 @@ void MipsMCInstLower::Initialize(MCContext *C) {
MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MachineOperandType MOTy,
unsigned Offset) const {
- MCSymbolRefExpr::VariantKind Kind;
+ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ MipsMCExpr::MipsExprKind TargetKind = MipsMCExpr::MEK_None;
+ bool IsGpOff = false;
const MCSymbol *Symbol;
switch(MO.getTargetFlags()) {
- default: llvm_unreachable("Invalid target flag!");
- case MipsII::MO_NO_FLAG: Kind = MCSymbolRefExpr::VK_None; break;
- case MipsII::MO_GPREL: Kind = MCSymbolRefExpr::VK_Mips_GPREL; break;
- case MipsII::MO_GOT_CALL: Kind = MCSymbolRefExpr::VK_Mips_GOT_CALL; break;
- case MipsII::MO_GOT16: Kind = MCSymbolRefExpr::VK_Mips_GOT16; break;
- case MipsII::MO_GOT: Kind = MCSymbolRefExpr::VK_Mips_GOT; break;
- case MipsII::MO_ABS_HI: Kind = MCSymbolRefExpr::VK_Mips_ABS_HI; break;
- case MipsII::MO_ABS_LO: Kind = MCSymbolRefExpr::VK_Mips_ABS_LO; break;
- case MipsII::MO_TLSGD: Kind = MCSymbolRefExpr::VK_Mips_TLSGD; break;
- case MipsII::MO_TLSLDM: Kind = MCSymbolRefExpr::VK_Mips_TLSLDM; break;
- case MipsII::MO_DTPREL_HI: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_HI; break;
- case MipsII::MO_DTPREL_LO: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_LO; break;
- case MipsII::MO_GOTTPREL: Kind = MCSymbolRefExpr::VK_Mips_GOTTPREL; break;
- case MipsII::MO_TPREL_HI: Kind = MCSymbolRefExpr::VK_Mips_TPREL_HI; break;
- case MipsII::MO_TPREL_LO: Kind = MCSymbolRefExpr::VK_Mips_TPREL_LO; break;
- case MipsII::MO_GPOFF_HI: Kind = MCSymbolRefExpr::VK_Mips_GPOFF_HI; break;
- case MipsII::MO_GPOFF_LO: Kind = MCSymbolRefExpr::VK_Mips_GPOFF_LO; break;
- case MipsII::MO_GOT_DISP: Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break;
- case MipsII::MO_GOT_PAGE: Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
- case MipsII::MO_GOT_OFST: Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break;
- case MipsII::MO_HIGHER: Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break;
- case MipsII::MO_HIGHEST: Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break;
- case MipsII::MO_GOT_HI16: Kind = MCSymbolRefExpr::VK_Mips_GOT_HI16; break;
- case MipsII::MO_GOT_LO16: Kind = MCSymbolRefExpr::VK_Mips_GOT_LO16; break;
- case MipsII::MO_CALL_HI16: Kind = MCSymbolRefExpr::VK_Mips_CALL_HI16; break;
- case MipsII::MO_CALL_LO16: Kind = MCSymbolRefExpr::VK_Mips_CALL_LO16; break;
+ default:
+ llvm_unreachable("Invalid target flag!");
+ case MipsII::MO_NO_FLAG:
+ break;
+ case MipsII::MO_GPREL:
+ TargetKind = MipsMCExpr::MEK_GPREL;
+ break;
+ case MipsII::MO_GOT_CALL:
+ TargetKind = MipsMCExpr::MEK_GOT_CALL;
+ break;
+ case MipsII::MO_GOT:
+ TargetKind = MipsMCExpr::MEK_GOT;
+ break;
+ case MipsII::MO_ABS_HI:
+ TargetKind = MipsMCExpr::MEK_HI;
+ break;
+ case MipsII::MO_ABS_LO:
+ TargetKind = MipsMCExpr::MEK_LO;
+ break;
+ case MipsII::MO_TLSGD:
+ TargetKind = MipsMCExpr::MEK_TLSGD;
+ break;
+ case MipsII::MO_TLSLDM:
+ TargetKind = MipsMCExpr::MEK_TLSLDM;
+ break;
+ case MipsII::MO_DTPREL_HI:
+ TargetKind = MipsMCExpr::MEK_DTPREL_HI;
+ break;
+ case MipsII::MO_DTPREL_LO:
+ TargetKind = MipsMCExpr::MEK_DTPREL_LO;
+ break;
+ case MipsII::MO_GOTTPREL:
+ TargetKind = MipsMCExpr::MEK_GOTTPREL;
+ break;
+ case MipsII::MO_TPREL_HI:
+ TargetKind = MipsMCExpr::MEK_TPREL_HI;
+ break;
+ case MipsII::MO_TPREL_LO:
+ TargetKind = MipsMCExpr::MEK_TPREL_LO;
+ break;
+ case MipsII::MO_GPOFF_HI:
+ TargetKind = MipsMCExpr::MEK_HI;
+ IsGpOff = true;
+ break;
+ case MipsII::MO_GPOFF_LO:
+ TargetKind = MipsMCExpr::MEK_LO;
+ IsGpOff = true;
+ break;
+ case MipsII::MO_GOT_DISP:
+ TargetKind = MipsMCExpr::MEK_GOT_DISP;
+ break;
+ case MipsII::MO_GOT_HI16:
+ TargetKind = MipsMCExpr::MEK_GOT_HI16;
+ break;
+ case MipsII::MO_GOT_LO16:
+ TargetKind = MipsMCExpr::MEK_GOT_LO16;
+ break;
+ case MipsII::MO_GOT_PAGE:
+ TargetKind = MipsMCExpr::MEK_GOT_PAGE;
+ break;
+ case MipsII::MO_GOT_OFST:
+ TargetKind = MipsMCExpr::MEK_GOT_OFST;
+ break;
+ case MipsII::MO_HIGHER:
+ TargetKind = MipsMCExpr::MEK_HIGHER;
+ break;
+ case MipsII::MO_HIGHEST:
+ TargetKind = MipsMCExpr::MEK_HIGHEST;
+ break;
+ case MipsII::MO_CALL_HI16:
+ TargetKind = MipsMCExpr::MEK_CALL_HI16;
+ break;
+ case MipsII::MO_CALL_LO16:
+ TargetKind = MipsMCExpr::MEK_CALL_LO16;
+ break;
}
switch (MOTy) {
@@ -106,30 +157,23 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
llvm_unreachable("<unknown operand type>");
}
- const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
+ const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
- if (!Offset)
- return MCOperand::createExpr(MCSym);
+ if (Offset) {
+ // Assume offset is never negative.
+ assert(Offset > 0);
- // Assume offset is never negative.
- assert(Offset > 0);
+ Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, *Ctx),
+ *Ctx);
+ }
- const MCConstantExpr *OffsetExpr = MCConstantExpr::create(Offset, *Ctx);
- const MCBinaryExpr *Add = MCBinaryExpr::createAdd(MCSym, OffsetExpr, *Ctx);
- return MCOperand::createExpr(Add);
-}
+ if (IsGpOff)
+ Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
+ else if (TargetKind != MipsMCExpr::MEK_None)
+ Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
-/*
-static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0,
- const MCOperand &Opnd1,
- const MCOperand &Opnd2 = MCOperand()) {
- Inst.setOpcode(Opc);
- Inst.addOperand(Opnd0);
- Inst.addOperand(Opnd1);
- if (Opnd2.isValid())
- Inst.addOperand(Opnd2);
+ return MCOperand::createExpr(Expr);
}
-*/
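For readers following the new flow: LowerSymbolOperand now builds a plain MCSymbolRefExpr, folds any (asserted non-negative) offset in with MCBinaryExpr::createAdd, and only then wraps the result in a MipsMCExpr, using createGpOff for the MO_GPOFF_* flags. A condensed restatement, assuming the MipsMCExpr.h declarations; the helper name wrapMipsExpr is made up:

    // Condensed restatement of the new expression-building tail.
    static const MCExpr *wrapMipsExpr(const MCSymbol *Symbol, unsigned Offset,
                                      MipsMCExpr::MipsExprKind TargetKind,
                                      bool IsGpOff, MCContext &Ctx) {
      const MCExpr *Expr =
          MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, Ctx);
      if (Offset)                 // offsets are asserted to be positive
        Expr = MCBinaryExpr::createAdd(Expr,
                                       MCConstantExpr::create(Offset, Ctx), Ctx);
      if (IsGpOff)                // MO_GPOFF_HI / MO_GPOFF_LO
        return MipsMCExpr::createGpOff(TargetKind, Expr, Ctx);
      if (TargetKind != MipsMCExpr::MEK_None)
        return MipsMCExpr::create(TargetKind, Expr, Ctx);
      return Expr;                // MO_NO_FLAG: bare symbol (plus offset)
    }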
MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
unsigned offset) const {
@@ -160,7 +204,7 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
MachineBasicBlock *BB2,
- MCSymbolRefExpr::VariantKind Kind) const {
+ MipsMCExpr::MipsExprKind Kind) const {
const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
@@ -178,12 +222,12 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
// Create %hi($tgt-$baltgt).
OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
MI->getOperand(2).getMBB(),
- MCSymbolRefExpr::VK_Mips_ABS_HI));
+ MipsMCExpr::MEK_HI));
}
-void MipsMCInstLower::
-lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
- MCSymbolRefExpr::VariantKind Kind) const {
+void MipsMCInstLower::lowerLongBranchADDiu(
+ const MachineInstr *MI, MCInst &OutMI, int Opcode,
+ MipsMCExpr::MipsExprKind Kind) const {
OutMI.setOpcode(Opcode);
// Lower two register operands.
@@ -206,17 +250,14 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
lowerLongBranchLUi(MI, OutMI);
return true;
case Mips::LONG_BRANCH_ADDiu:
- lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu,
- MCSymbolRefExpr::VK_Mips_ABS_LO);
+ lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, MipsMCExpr::MEK_LO);
return true;
case Mips::LONG_BRANCH_DADDiu:
unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
if (TargetFlags == MipsII::MO_ABS_HI)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
- MCSymbolRefExpr::VK_Mips_ABS_HI);
+ lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_HI);
else if (TargetFlags == MipsII::MO_ABS_LO)
- lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
- MCSymbolRefExpr::VK_Mips_ABS_LO);
+ lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_LO);
else
report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
return true;
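The MEK_HI/MEK_LO pair applied to $tgt-$baltgt follows the usual MIPS convention: %lo is the sign-extended low 16 bits and %hi compensates for that sign extension, so a lui/addiu pair reassembles the full 32-bit difference. A small standalone check of that arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // %lo(x) is the sign-extended low 16 bits; %hi(x) absorbs that sign
      // extension so that (%hi(x) << 16) + %lo(x) == x.
      int32_t V = 0x12348765;              // stands in for $tgt - $baltgt
      int16_t Lo = static_cast<int16_t>(V & 0xFFFF);
      int32_t Hi = (V - Lo) >> 16;         // equivalently ((V + 0x8000) >> 16)
      std::printf("hi=0x%x lo=%d ok=%d\n", static_cast<unsigned>(Hi), Lo,
                  (Hi << 16) + Lo == V);
      return 0;
    }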
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 1ce27e401850..c25f90005480 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -10,7 +10,6 @@
#ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
#define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
#include "MCTargetDesc/MipsMCExpr.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Support/Compiler.h"
@@ -38,11 +37,10 @@ private:
MCOperand LowerSymbolOperand(const MachineOperand &MO,
MachineOperandType MOTy, unsigned Offset) const;
MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
- MCSymbolRefExpr::VariantKind Kind) const;
+ MipsMCExpr::MipsExprKind Kind) const;
void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
- void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
- int Opcode,
- MCSymbolRefExpr::VariantKind Kind) const;
+ void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
+ MipsMCExpr::MipsExprKind Kind) const;
bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
};
}
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index eacfcec78bc7..deb4345e2662 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -65,53 +65,11 @@ def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>;
def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
+def immZExt3Ptr : ImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>;
def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
-def immZExt6Ptr : ImmLeaf<iPTR, [{return isUInt<6>(Imm);}]>;
// Operands
-def uimm4_ptr : Operand<iPTR> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def uimm6_ptr : Operand<iPTR> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def simm5 : Operand<i32>;
-
-def vsplat_uimm1 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm2 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm3 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm4 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm5 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm6 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_uimm8 : Operand<vAny> {
- let PrintMethod = "printUnsignedImm8";
-}
-
-def vsplat_simm5 : Operand<vAny>;
-
-def vsplat_simm10 : Operand<vAny>;
-
def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
// Pattern fragments
@@ -336,15 +294,33 @@ def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
// Any build_vector that is a constant splat with only a consecutive sequence
// of left-most bits set.
-def vsplat_maskl_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
- "selectVSplatMaskL",
- [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm3
+ : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm4
+ : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm5
+ : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm6
+ : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
// Any build_vector that is a constant splat with only a consecutive sequence
// of right-most bits set.
-def vsplat_maskr_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
- "selectVSplatMaskR",
- [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm3
+ : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm4
+ : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm5
+ : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm6
+ : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
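As the comments above say, selectVSplatMaskL and selectVSplatMaskR match splats whose element is a contiguous run of set bits anchored at the most significant bit or at bit 0 respectively; the new per-width operands simply pair those predicates with an immediate field sized for each element width (uimm3 for .b through uimm6 for .d). A standalone illustration of the two mask shapes (not the TableGen predicates themselves):

    #include <cstdint>
    #include <cstdio>

    // "Left mask": a contiguous run of 1s anchored at the element's MSB,
    // e.g. 0b11100000 for an 8-bit lane.  "Right mask": anchored at bit 0,
    // e.g. 0b00000111.  EltBits is the lane width (8, 16, 32 or 64).
    static bool isRightMask(uint64_t V, unsigned EltBits) {
      uint64_t Lane = EltBits == 64 ? ~0ULL : (1ULL << EltBits) - 1;
      return V != 0 && (V & ~Lane) == 0 && ((V + 1) & V) == 0;
    }

    static bool isLeftMask(uint64_t V, unsigned EltBits) {
      uint64_t Lane = EltBits == 64 ? ~0ULL : (1ULL << EltBits) - 1;
      if (V == 0 || (V & ~Lane) != 0)
        return false;
      uint64_t LowZeros = Lane ^ V;            // lane bits not covered by V
      return ((LowZeros + 1) & LowZeros) == 0; // they must form a right mask
    }

    int main() {
      std::printf("%d %d\n", isLeftMask(0xE0, 8), isRightMask(0x07, 8)); // 1 1
      std::printf("%d %d\n", isLeftMask(0x70, 8), isRightMask(0x0C, 8)); // 0 0
      return 0;
    }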
// Any build_vector that is a constant splat with a value that equals 1
// FIXME: These should be a ComplexPattern but we can't use them because the
@@ -1185,11 +1161,11 @@ class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
}
class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
- ComplexPattern Mask, RegisterOperand ROWD,
+ SplatComplexPattern Mask, RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, Mask.OpClass:$m);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
// Note that binsxi and vselect treat the condition operand the opposite
// way to each other.
@@ -1202,16 +1178,16 @@ class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
}
class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
- RegisterOperand ROWD,
+ SplatComplexPattern ImmOp, RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> :
- MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskl_bits, ROWD, ROWS, itin>;
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
- RegisterOperand ROWD,
+ SplatComplexPattern ImmOp, RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> :
- MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskr_bits, ROWD, ROWS, itin>;
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
SplatComplexPattern SplatImm,
@@ -1225,13 +1201,13 @@ class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
}
class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- ValueType VecTy, RegisterOperand ROD,
- RegisterOperand ROWS,
+ ValueType VecTy, Operand ImmOp, ImmLeaf Imm,
+ RegisterOperand ROD, RegisterOperand ROWS,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROD:$rd);
- dag InOperandList = (ins ROWS:$ws, uimm4_ptr:$n);
+ dag InOperandList = (ins ROWS:$ws, ImmOp:$n);
string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
- list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), immZExt4Ptr:$n))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), Imm:$n))];
InstrItinClass Itinerary = itin;
}
@@ -1249,9 +1225,10 @@ class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
}
class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
- RegisterClass RCD, RegisterClass RCWS> :
- MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4_ptr:$n),
- [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4Ptr:$n))]> {
+ Operand ImmOp, ImmLeaf Imm, RegisterClass RCD,
+ RegisterClass RCWS> :
+ MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, ImmOp:$n),
+ [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), Imm:$n))]> {
bit usesCustomInserter = 1;
}
@@ -1433,23 +1410,22 @@ class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
}
class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROS,
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROS,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, uimm6_ptr:$n);
+ dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, ImmOp:$n);
string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
- ROS:$rs,
- immZExt6Ptr:$n))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROS:$rs, Imm:$n))];
InstrItinClass Itinerary = itin;
string Constraints = "$wd = $wd_in";
}
class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
- RegisterOperand ROWD, RegisterOperand ROFS> :
- MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6_ptr:$n, ROFS:$fs),
- [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
- immZExt6Ptr:$n))]> {
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROFS> :
+ MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, ImmOp:$n, ROFS:$fs),
+ [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs, Imm:$n))]> {
bit usesCustomInserter = 1;
string Constraints = "$wd = $wd_in";
}
@@ -1643,10 +1619,10 @@ class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
MSA128DOpnd>;
-class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, MSA128BOpnd>;
-class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, MSA128HOpnd>;
-class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, MSA128WOpnd>;
-class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, MSA128DOpnd>;
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, vsplat_maskl_bits_uimm3, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, vsplat_maskl_bits_uimm4, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, vsplat_maskl_bits_uimm5, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, vsplat_maskl_bits_uimm6, MSA128DOpnd>;
class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
MSA128BOpnd>;
@@ -1657,10 +1633,18 @@ class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
MSA128DOpnd>;
-class BINSRI_B_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, MSA128BOpnd>;
-class BINSRI_H_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, MSA128HOpnd>;
-class BINSRI_W_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, MSA128WOpnd>;
-class BINSRI_D_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, MSA128DOpnd>;
+class BINSRI_B_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, vsplat_maskr_bits_uimm3,
+ MSA128BOpnd>;
+class BINSRI_H_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, vsplat_maskr_bits_uimm4,
+ MSA128HOpnd>;
+class BINSRI_W_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, vsplat_maskr_bits_uimm5,
+ MSA128WOpnd>;
+class BINSRI_D_DESC
+ : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, vsplat_maskr_bits_uimm6,
+ MSA128DOpnd>;
class BMNZ_V_DESC {
dag OutOperandList = (outs MSA128BOpnd:$wd);
@@ -1867,24 +1851,33 @@ class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
vsplati64_uimm5, MSA128DOpnd>;
class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8, v16i8,
- GPR32Opnd, MSA128BOpnd>;
+ uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+ MSA128BOpnd>;
class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
- GPR32Opnd, MSA128HOpnd>;
+ uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+ MSA128HOpnd>;
class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
- GPR32Opnd, MSA128WOpnd>;
+ uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+ MSA128WOpnd>;
class COPY_S_D_DESC : MSA_COPY_DESC_BASE<"copy_s.d", vextract_sext_i64, v2i64,
- GPR64Opnd, MSA128DOpnd>;
+ uimm1_ptr, immZExt1Ptr, GPR64Opnd,
+ MSA128DOpnd>;
class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8, v16i8,
- GPR32Opnd, MSA128BOpnd>;
+ uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+ MSA128BOpnd>;
class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
- GPR32Opnd, MSA128HOpnd>;
+ uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+ MSA128HOpnd>;
class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
- GPR32Opnd, MSA128WOpnd>;
+ uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+ MSA128WOpnd>;
-class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32,
+ uimm2_ptr, immZExt2Ptr, FGR32,
MSA128W>;
-class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64,
+ uimm1_ptr, immZExt1Ptr, FGR64,
MSA128D>;
class CTCMSA_DESC {
@@ -2249,14 +2242,14 @@ class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
-class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8,
- MSA128BOpnd, GPR32Opnd>;
-class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
- MSA128HOpnd, GPR32Opnd>;
-class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
- MSA128WOpnd, GPR32Opnd>;
-class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64,
- MSA128DOpnd, GPR64Opnd>;
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8, uimm4,
+ immZExt4Ptr, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16, uimm3,
+ immZExt3Ptr, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32, uimm2,
+ immZExt2Ptr, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64, uimm1,
+ immZExt1Ptr, MSA128DOpnd, GPR64Opnd>;
class INSERT_B_VIDX_PSEUDO_DESC :
MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd, GPR32Opnd>;
@@ -2268,8 +2261,10 @@ class INSERT_D_VIDX_PSEUDO_DESC :
MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd, GPR32Opnd>;
class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+ uimm2, immZExt2Ptr,
MSA128WOpnd, FGR32Opnd>;
class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+ uimm1, immZExt1Ptr,
MSA128DOpnd, FGR64Opnd>;
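The per-format index operands introduced for copy_s/copy_u and insert above just encode how many lane-index bits each MSA element size needs: a 128-bit register has 16 byte lanes (4-bit index) down to 2 doubleword lanes (1-bit index). A throwaway check of that arithmetic:

    #include <cstdio>

    int main() {
      // A 128-bit MSA register split into lanes of the given width; the
      // element index of copy_s/copy_u/insert must address any lane, so it
      // needs log2(number of lanes) bits.
      const unsigned EltBits[] = {8, 16, 32, 64};
      const char Fmt[] = {'b', 'h', 'w', 'd'};
      for (unsigned i = 0; i < 4; ++i) {
        unsigned Lanes = 128 / EltBits[i];
        unsigned IndexBits = 0;
        while ((1u << IndexBits) < Lanes)
          ++IndexBits;
        std::printf(".%c: %2u lanes -> uimm%u index\n", Fmt[i], Lanes, IndexBits);
      }
      return 0;
    }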
class INSERT_FW_VIDX_PSEUDO_DESC :
@@ -2302,7 +2297,7 @@ class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1
class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
- Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
+ Operand MemOpnd, ComplexPattern Addr = addrimm10,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
dag InOperandList = (ins MemOpnd:$addr);
@@ -2312,10 +2307,10 @@ class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string DecoderMethod = "DecodeMSA128Mem";
}
-class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd>;
-class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd>;
-class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd>;
-class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd>;
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd, mem_simm10>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd, mem_simm10_lsl1>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd, mem_simm10_lsl2>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd, mem_simm10_lsl3>;
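The move from the generic mem_msa operand to mem_simm10_lsl1/2/3 reflects how the MSA ld/st offset is understood here: a signed 10-bit field scaled by the element size, so the byte reach grows with the format. A quick standalone check of the resulting ranges (the scaling is read off the operand names, so treat the exact ranges as an inference):

    #include <cstdio>

    int main() {
      // MSA ld/st: a signed 10-bit offset field, scaled by the element size.
      const unsigned Shift[] = {0, 1, 2, 3};   // .b, .h, .w, .d
      const char Fmt[] = {'b', 'h', 'w', 'd'};
      for (unsigned i = 0; i < 4; ++i) {
        int Min = -(512 << Shift[i]);          // field range is [-512, 511]
        int Max = 511 << Shift[i];
        std::printf("ld.%c/st.%c: byte offsets %d..%d, step %d\n",
                    Fmt[i], Fmt[i], Min, Max, 1 << Shift[i]);
      }
      return 0;
    }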
class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
@@ -2323,19 +2318,18 @@ class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
- RegisterOperand RORS = RORD, RegisterOperand RORT = RORD,
- InstrItinClass itin = NoItinerary > {
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs RORD:$rd);
- dag InOperandList = (ins RORS:$rs, RORT:$rt, uimm2_plus1:$sa);
+ dag InOperandList = (ins RORD:$rs, RORD:$rt, uimm2_plus1:$sa);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
- list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt,
- (shl RORS:$rs,
+ list<dag> Pattern = [(set RORD:$rd, (add RORD:$rt,
+ (shl RORD:$rs,
immZExt2Lsa:$sa)))];
InstrItinClass Itinerary = itin;
}
-class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd>;
-class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd>;
+class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd, II_LSA>;
+class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd, II_DLSA>;
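A reminder of what LSA_DESC_BASE's pattern encodes: lsa/dlsa compute rd = rt + (rs << sa), where sa is written as 1..4 and stored as sa - 1 in the 2-bit shift field, hence uimm2_plus1 and the isUInt<2>(Imm - 1) predicate. A throwaway check of that encoding, as modelled by these definitions:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // lsa rd, rs, rt, sa  ==>  rd = rt + (rs << sa), with sa in 1..4 and
      // sa - 1 stored in the 2-bit shift field.
      uint32_t rs = 5, rt = 100;
      for (unsigned sa = 1; sa <= 4; ++sa)
        std::printf("sa=%u field=%u rd=%u\n", sa, sa - 1, rt + (rs << sa));
      return 0;
    }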
class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
MSA128HOpnd>;
@@ -2636,7 +2630,7 @@ class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6,
class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
- Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
+ Operand MemOpnd, ComplexPattern Addr = addrimm10,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
@@ -2646,10 +2640,10 @@ class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string DecoderMethod = "DecodeMSA128Mem";
}
-class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd>;
-class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd>;
-class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd>;
-class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd>;
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd, mem_simm10>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd, mem_simm10_lsl1>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd, mem_simm10_lsl2>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd, mem_simm10_lsl3>;
class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
MSA128BOpnd>;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index c7d2738af1d4..f81e64e06f43 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -42,7 +42,9 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
STI.inMips16Mode()
? &Mips::CPU16RegsRegClass
: STI.inMicroMipsMode()
- ? &Mips::GPRMM16RegClass
+ ? STI.hasMips64()
+ ? &Mips::GPRMM16_64RegClass
+ : &Mips::GPRMM16RegClass
: static_cast<const MipsTargetMachine &>(MF.getTarget())
.getABI()
.IsN64()
@@ -51,18 +53,6 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
}
-bool MipsFunctionInfo::mips16SPAliasRegSet() const {
- return Mips16SPAliasReg;
-}
-unsigned MipsFunctionInfo::getMips16SPAliasReg() {
- // Return if it has already been initialized.
- if (Mips16SPAliasReg)
- return Mips16SPAliasReg;
-
- const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
- return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC);
-}
-
void MipsFunctionInfo::createEhDataRegsFI() {
for (int I = 0; I < 4; ++I) {
const TargetRegisterClass *RC =
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index a2f6ee03604f..c9e5fddc1932 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -19,12 +19,9 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/IR/ValueMap.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include <map>
-#include <string>
-#include <utility>
namespace llvm {
@@ -33,8 +30,8 @@ namespace llvm {
class MipsFunctionInfo : public MachineFunctionInfo {
public:
MipsFunctionInfo(MachineFunction &MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
- VarArgsFrameIndex(0), CallsEhReturn(false), IsISR(false), SaveS2(false),
+ : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0),
+ CallsEhReturn(false), IsISR(false), SaveS2(false),
MoveF64ViaSpillFI(-1) {}
~MipsFunctionInfo();
@@ -45,9 +42,6 @@ public:
bool globalBaseRegSet() const;
unsigned getGlobalBaseReg();
- bool mips16SPAliasRegSet() const;
- unsigned getMips16SPAliasReg();
-
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
@@ -104,11 +98,6 @@ private:
/// relocation models.
unsigned GlobalBaseReg;
- /// Mips16SPAliasReg - keeps track of the virtual register initialized for
- /// use as an alias for SP for use in load/store of halfword/byte from/to
- /// the stack
- unsigned Mips16SPAliasReg;
-
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index b6cd79193cfc..8136907de4d7 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -111,22 +111,27 @@ static bool needsFP(Function &F) {
bool MipsOs16::runOnModule(Module &M) {
bool usingMask = Mips32FunctionMask.length() > 0;
bool doneUsingMask = false; // this will make it stop repeating
+
DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
if (usingMask)
DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+
unsigned int functionIndex = 0;
bool modified = false;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Working on " << F->getName() << "\n");
+
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ DEBUG(dbgs() << "Working on " << F.getName() << "\n");
if (usingMask) {
if (!doneUsingMask) {
if (functionIndex == Mips32FunctionMask.length())
functionIndex = 0;
switch (Mips32FunctionMask[functionIndex]) {
case '1':
- DEBUG(dbgs() << "mask forced mips32: " << F->getName() << "\n");
- F->addFnAttr("nomips16");
+ DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
+ F.addFnAttr("nomips16");
break;
case '.':
doneUsingMask = true;
@@ -138,16 +143,17 @@ bool MipsOs16::runOnModule(Module &M) {
}
}
else {
- if (needsFP(*F)) {
- DEBUG(dbgs() << "os16 forced mips32: " << F->getName() << "\n");
- F->addFnAttr("nomips16");
+ if (needsFP(F)) {
+ DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
+ F.addFnAttr("nomips16");
}
else {
- DEBUG(dbgs() << "os16 forced mips16: " << F->getName() << "\n");
- F->addFnAttr("mips16");
+ DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
+ F.addFnAttr("mips16");
}
}
}
+
return modified;
}
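For context on the loop above: when Mips32FunctionMask is non-empty it is consumed one character per defined function, '1' pins the function to MIPS32 via the nomips16 attribute, '.' stops consulting the mask, and the index wraps to the start when it runs off the end. The index advance and the remaining mask cases sit outside this hunk, so the standalone model below is an approximation only:

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy model of the mask walk; only the '1' and '.' cases visible in the
    // hunk are modelled, and the per-function index advance is assumed.
    int main() {
      std::string Mask = "11.";                  // example mask value
      std::vector<std::string> Funcs = {"f0", "f1", "f2", "f3"};
      bool Done = false;
      std::size_t Index = 0;
      for (const std::string &F : Funcs) {
        if (Done)
          continue;
        if (Index == Mask.length())
          Index = 0;                             // wrap around
        if (Mask[Index] == '1')
          std::printf("%s -> addFnAttr(\"nomips16\")\n", F.c_str());
        else if (Mask[Index] == '.')
          Done = true;                           // stop consulting the mask
        ++Index;
      }
      return 0;
    }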
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 28e5a425849f..860cf9cfd138 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -13,7 +13,6 @@
#include "MipsRegisterInfo.h"
#include "Mips.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
@@ -21,14 +20,13 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -52,7 +50,21 @@ const TargetRegisterClass *
MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
MipsABIInfo ABI = MF.getSubtarget<MipsSubtarget>().getABI();
- return ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ MipsPtrClass PtrClassKind = static_cast<MipsPtrClass>(Kind);
+
+ switch (PtrClassKind) {
+ case MipsPtrClass::Default:
+ return ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ case MipsPtrClass::GPR16MM:
+ return ABI.ArePtrs64bit() ? &Mips::GPRMM16_64RegClass
+ : &Mips::GPRMM16RegClass;
+ case MipsPtrClass::StackPointer:
+ return ABI.ArePtrs64bit() ? &Mips::SP64RegClass : &Mips::SP32RegClass;
+ case MipsPtrClass::GlobalPointer:
+ return ABI.ArePtrs64bit() ? &Mips::GP64RegClass : &Mips::GP32RegClass;
+ }
+
+ llvm_unreachable("Unknown pointer kind");
}
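The Kind argument, previously ignored, is now interpreted through the MipsPtrClass enum added to MipsRegisterInfo.h later in this patch, letting pointer operands request a constrained class (the microMIPS 3-bit subset, or just $sp or $gp). A hedged usage fragment, with TRI and MF assumed to be in scope:

    // Sketch: request the stack-pointer-only pointer class for this function.
    const TargetRegisterClass *SPClass = TRI->getPointerRegClass(
        MF, static_cast<unsigned>(MipsRegisterInfo::MipsPtrClass::StackPointer));
    (void)SPClass; // e.g. &Mips::SP64RegClass when pointers are 64 bit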
unsigned
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 5de68a21b73e..32f835e83108 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -23,18 +23,23 @@
namespace llvm {
class MipsRegisterInfo : public MipsGenRegisterInfo {
public:
- MipsRegisterInfo();
+ enum class MipsPtrClass {
+ /// The default register class for integer values.
+ Default = 0,
+ /// The subset of registers permitted in certain microMIPS instructions
+ /// such as lw16.
+ GPR16MM = 1,
+ /// The stack pointer only.
+ StackPointer = 2,
+ /// The global pointer only.
+ GlobalPointer = 3,
+ };
- /// getRegisterNumbering - Given the enum value for some register, e.g.
- /// Mips::RA, return the number that it corresponds to (e.g. 31).
- static unsigned getRegisterNumbering(unsigned RegEnum);
+ MipsRegisterInfo();
/// Get PIC indirect call register
static unsigned getPICCallReg();
- /// Adjust the Mips stack frame.
- void adjustMipsStackFrame(MachineFunction &MF) const;
-
/// Code Generation virtual methods...
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const override;
@@ -57,9 +62,6 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- void processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS = nullptr) const;
-
// Stack realignment queries.
bool canRealignStack(const MachineFunction &MF) const override;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 02bcac5a3ddb..cfce7c8e6533 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -330,6 +330,12 @@ def GPR64 : RegisterClass<"Mips", [i64], 64, (add
// Reserved
K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+def GPRMM16_64 : RegisterClass<"Mips", [i64], 64, (add
+ // Callee save
+ S0_64, S1_64,
+ // Return Values and Arguments
+ V0_64, V1_64, A0_64, A1_64, A2_64, A3_64)>;
+
def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
// Return Values and Arguments
V0, V1, A0, A1, A2, A3,
@@ -447,6 +453,13 @@ def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>,
Unallocatable;
+// Stack pointer and global pointer classes for instructions that are limited
+// to a single register such as lwgp/lwsp in microMIPS.
+def SP32 : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+def SP64 : RegisterClass<"Mips", [i64], 64, (add SP_64)>, Unallocatable;
+def GP32 : RegisterClass<"Mips", [i32], 32, (add GP)>, Unallocatable;
+def GP64 : RegisterClass<"Mips", [i64], 64, (add GP_64)>, Unallocatable;
+
// Octeon multiplier and product registers
def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
Unallocatable;
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index a4abd62ee607..a7ddd7752736 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -13,7 +13,6 @@
#include "MipsSEFrameLowering.h"
#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsSEInstrInfo.h"
#include "MipsSubtarget.h"
@@ -26,7 +25,6 @@
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -87,10 +85,10 @@ ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
bool ExpandPseudo::expand() {
bool Expanded = false;
- for (MachineFunction::iterator BB = MF.begin(), BBEnd = MF.end();
- BB != BBEnd; ++BB)
- for (Iter I = BB->begin(), End = BB->end(); I != End;)
- Expanded |= expandInstr(*BB, I++);
+ for (auto &MBB : MF) {
+ for (Iter I = MBB.begin(), End = MBB.end(); I != End;)
+ Expanded |= expandInstr(MBB, I++);
+ }
return Expanded;
}
@@ -518,7 +516,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
unsigned VR = MF.getRegInfo().createVirtualRegister(RC);
assert(isInt<16>(MFI->getMaxAlignment()) &&
"Function's alignment size requirement is not supported.");
- int MaxAlign = - (signed) MFI->getMaxAlignment();
+ int MaxAlign = -(int)MFI->getMaxAlignment();
BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO) .addImm(MaxAlign);
BuildMI(MBB, MBBI, dl, TII.get(AND), SP).addReg(SP).addReg(VR);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 6f001ea74b30..d9528da5a96d 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -47,7 +47,8 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
MachineFunction &MF) {
MachineInstrBuilder MIB(MF, &MI);
unsigned Mask = MI.getOperand(1).getImm();
- unsigned Flag = IsDef ? RegState::ImplicitDefine : RegState::Implicit;
+ unsigned Flag =
+ IsDef ? RegState::ImplicitDefine : RegState::Implicit | RegState::Undef;
if (Mask & 1)
MIB.addReg(Mips::DSPPos, Flag);
@@ -162,7 +163,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
return;
}
- if (MF.getTarget().getRelocationModel() == Reloc::Static) {
+ if (!MF.getTarget().isPositionIndependent()) {
// Set global register to __gnu_local_gp.
//
// lui $v0, %hi(__gnu_local_gp)
@@ -220,21 +221,25 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
- ++MFI)
- for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
- if (I->getOpcode() == Mips::RDDSP)
- addDSPCtrlRegOperands(false, *I, MF);
- else if (I->getOpcode() == Mips::WRDSP)
- addDSPCtrlRegOperands(true, *I, MF);
- else
- replaceUsesWithZeroReg(MRI, *I);
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case Mips::RDDSP:
+ addDSPCtrlRegOperands(false, MI, MF);
+ break;
+ case Mips::WRDSP:
+ addDSPCtrlRegOperands(true, MI, MF);
+ break;
+ default:
+ replaceUsesWithZeroReg(MRI, MI);
+ }
}
+ }
}
-SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
- SDValue CmpLHS, SDLoc DL,
- SDNode *Node) const {
+void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+ SDValue CmpLHS, const SDLoc &DL,
+ SDNode *Node) const {
unsigned Opc = InFlag.getOpcode(); (void)Opc;
assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
@@ -271,8 +276,7 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
if (!C || C->getZExtValue())
AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
- return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
- SDValue(AddCarry, 0));
+ CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
}
/// Match frameindex
@@ -327,7 +331,7 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
return true;
}
- if (TM.getRelocationModel() != Reloc::PIC_) {
+ if (!TM.isPositionIndependent()) {
if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
Addr.getOpcode() == ISD::TargetGlobalAddress))
return false;
@@ -364,18 +368,6 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
/// ComplexPattern used on MipsInstrInfo
/// Used on Mips Load/Store instructions
-bool MipsSEDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
- SDValue &Offset) const {
- // Operand is a result from an ADD.
- if (Addr.getOpcode() == ISD::ADD) {
- Base = Addr.getOperand(0);
- Offset = Addr.getOperand(1);
- return true;
- }
-
- return false;
-}
-
bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
Base = Addr;
@@ -411,6 +403,18 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm10(SDValue Addr, SDValue &Base,
return false;
}
+/// Used on microMIPS LWC2, LDC2, SWC2 and SDC2 instructions (11-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm11(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 11))
+ return true;
+
+ return false;
+}
+
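selectAddrRegImm11 follows the template of its 10-, 12- and 16-bit siblings: accept a bare frame index, otherwise a frame index plus an offset that fits the instruction's signed 11-bit immediate. The fit test behind selectAddrFrameIndexOffset is the usual signed-range check; a standalone equivalent:

    #include <cstdint>
    #include <cstdio>

    // Does X fit in an N-bit two's-complement immediate field?  This mirrors
    // the isIntN-style check used by these address-selection helpers.
    static bool fitsSigned(int64_t X, unsigned N) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }

    int main() {
      std::printf("%d %d %d\n",
                  fitsSigned(1023, 11),    // 1: largest positive 11-bit value
                  fitsSigned(-1024, 11),   // 1: smallest negative 11-bit value
                  fitsSigned(1024, 11));   // 0: needs 12 bits
      return 0;
    }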
/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
@@ -434,12 +438,24 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm16(SDValue Addr, SDValue &Base,
return false;
}
-bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+bool MipsSEDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm11(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
return selectAddrRegImm12(Addr, Base, Offset) ||
selectAddrDefault(Addr, Base, Offset);
}
+bool MipsSEDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm16(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+
bool MipsSEDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
if (selectAddrFrameIndexOffset(Addr, Base, Offset, 7)) {
@@ -702,7 +718,7 @@ bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
return false;
}
-std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
+bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
@@ -710,16 +726,14 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
// Instruction Selection not handled by the auto-generated
// tablegen selection should be handled here.
///
- SDNode *Result;
-
switch(Opcode) {
default: break;
case ISD::SUBE: {
SDValue InFlag = Node->getOperand(2);
unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
- Result = selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
- return std::make_pair(true, Result);
+ selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
+ return true;
}
case ISD::ADDE: {
@@ -727,8 +741,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
break;
SDValue InFlag = Node->getOperand(2);
unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
- Result = selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
- return std::make_pair(true, Result);
+ selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
+ return true;
}
case ISD::ConstantFP: {
@@ -737,20 +751,20 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
if (Subtarget->isGP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO_64, MVT::i64);
- Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+ ReplaceNode(Node,
+ CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero));
} else if (Subtarget->isFP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO, MVT::i32);
- Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
- Zero, Zero);
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64_64, DL,
+ MVT::f64, Zero, Zero));
} else {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO, MVT::i32);
- Result = CurDAG->getMachineNode(Mips::BuildPairF64, DL, MVT::f64, Zero,
- Zero);
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64, DL,
+ MVT::f64, Zero, Zero));
}
-
- return std::make_pair(true, Result);
+ return true;
}
break;
}
@@ -793,7 +807,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
SDValue(RegOpnd, 0), ImmOpnd);
}
- return std::make_pair(true, RegOpnd);
+ ReplaceNode(Node, RegOpnd);
+ return true;
}
case ISD::INTRINSIC_W_CHAIN: {
@@ -806,7 +821,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
SDValue RegIdx = Node->getOperand(2);
SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
getMSACtrlReg(RegIdx), MVT::i32);
- return std::make_pair(true, Reg.getNode());
+ ReplaceNode(Node, Reg.getNode());
+ return true;
}
}
break;
@@ -820,10 +836,10 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
case Intrinsic::mips_move_v:
// Like an assignment but will always produce a move.v even if
// unnecessary.
- return std::make_pair(true,
- CurDAG->getMachineNode(Mips::MOVE_V, DL,
- Node->getValueType(0),
- Node->getOperand(1)));
+ ReplaceNode(Node, CurDAG->getMachineNode(Mips::MOVE_V, DL,
+ Node->getValueType(0),
+ Node->getOperand(1)));
+ return true;
}
break;
}
@@ -839,7 +855,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
SDValue Value = Node->getOperand(3);
SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
getMSACtrlReg(RegIdx), Value);
- return std::make_pair(true, ChainOut.getNode());
+ ReplaceNode(Node, ChainOut.getNode());
+ return true;
}
}
break;
@@ -864,8 +881,8 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
SDValue(Rdhwr, 0));
SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
- ReplaceUses(SDValue(Node, 0), ResNode);
- return std::make_pair(true, ResNode.getNode());
+ ReplaceNode(Node, ResNode.getNode());
+ return true;
}
case ISD::BUILD_VECTOR: {
@@ -890,16 +907,16 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
EVT ViaVecTy;
if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
- return std::make_pair(false, nullptr);
+ return false;
if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, 8,
!Subtarget->isLittle()))
- return std::make_pair(false, nullptr);
+ return false;
switch (SplatBitSize) {
default:
- return std::make_pair(false, nullptr);
+ return false;
case 8:
LdiOp = Mips::LDI_B;
ViaVecTy = MVT::v16i8;
@@ -919,7 +936,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
}
if (!SplatValue.isSignedIntN(10))
- return std::make_pair(false, nullptr);
+ return false;
SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
ViaVecTy.getVectorElementType());
@@ -940,12 +957,13 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
MVT::i32));
}
- return std::make_pair(true, Res);
+ ReplaceNode(Node, Res);
+ return true;
}
}
- return std::make_pair(false, nullptr);
+ return false;
}
bool MipsSEDAGToDAGISel::
@@ -1015,6 +1033,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
return true;
}
-FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) {
- return new MipsSEDAGToDAGISel(TM);
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new MipsSEDAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index a894034020e9..0f08b72a334e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -21,7 +21,8 @@ namespace llvm {
class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
public:
- explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+ explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+ : MipsDAGToDAGISel(TM, OL) {}
private:
@@ -34,11 +35,12 @@ private:
bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
- std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
- EVT Ty, bool HasLo, bool HasHi);
+ std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+ const SDLoc &dl, EVT Ty, bool HasLo,
+ bool HasHi);
- SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
- SDLoc DL, SDNode *Node) const;
+ void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+ const SDLoc &DL, SDNode *Node) const;
bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
@@ -47,9 +49,6 @@ private:
bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const override;
- bool selectAddrRegReg(SDValue Addr, SDValue &Base,
- SDValue &Offset) const override;
-
bool selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const override;
@@ -62,14 +61,23 @@ private:
bool selectAddrRegImm10(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ bool selectAddrRegImm11(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
bool selectAddrRegImm16(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
- bool selectIntAddrMM(SDValue Addr, SDValue &Base,
- SDValue &Offset) const override;
+ bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
+
+ bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const override;
bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
SDValue &Offset) const override;
@@ -111,7 +119,7 @@ private:
/// starting at bit zero.
bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
- std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
+ bool trySelect(SDNode *Node) override;
void processFunctionAfterISel(MachineFunction &MF) override;
@@ -124,8 +132,8 @@ private:
std::vector<SDValue> &OutOps) override;
};
-FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM);
-
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
}
#endif
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index efe22fba98ce..80c000d5746d 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -793,7 +793,7 @@ static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue genConstMult(SDValue X, uint64_t C, SDLoc DL, EVT VT,
+static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
EVT ShiftTy, SelectionDAG &DAG) {
// Clear the upper (64 - VT.sizeInBits) bits.
C &= ((uint64_t)-1) >> (64 - VT.getSizeInBits());
@@ -1111,9 +1111,9 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
}
MachineBasicBlock *
-MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case Mips::BPOSGE32_PSEUDO:
@@ -1218,17 +1218,14 @@ SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Ptr.getValueType();
// i32 load from lower address.
- SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
- MachinePointerInfo(), Nd.isVolatile(),
- Nd.isNonTemporal(), Nd.isInvariant(),
- Nd.getAlignment());
+ SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo(),
+ Nd.getAlignment(), Nd.getMemOperand()->getFlags());
// i32 load from higher address.
Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT));
- SDValue Hi = DAG.getLoad(MVT::i32, DL, Lo.getValue(1), Ptr,
- MachinePointerInfo(), Nd.isVolatile(),
- Nd.isNonTemporal(), Nd.isInvariant(),
- std::min(Nd.getAlignment(), 4U));
+ SDValue Hi = DAG.getLoad(
+ MVT::i32, DL, Lo.getValue(1), Ptr, MachinePointerInfo(),
+ std::min(Nd.getAlignment(), 4U), Nd.getMemOperand()->getFlags());
if (!Subtarget.isLittle())
std::swap(Lo, Hi);
@@ -1257,15 +1254,15 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
std::swap(Lo, Hi);
// i32 store to lower address.
- Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
- Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
- Nd.getAAInfo());
+ Chain =
+ DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(), Nd.getAlignment(),
+ Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
// i32 store to higher address.
Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT));
return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
- Nd.isVolatile(), Nd.isNonTemporal(),
- std::min(Nd.getAlignment(), 4U), Nd.getAAInfo());
+ std::min(Nd.getAlignment(), 4U),
+ Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
}
SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
@@ -1292,8 +1289,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
return DAG.getMergeValues(Vals, DL);
}
-
-static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
+static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
DAG.getConstant(0, DL, MVT::i32));
SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
@@ -1301,7 +1297,7 @@ static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
}
-static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
+static SDValue extractLOHI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) {
SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
@@ -1401,8 +1397,8 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
- SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
- makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+ SDValue Result = DAG.getBuildVector(
+ ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
if (ViaVecTy != ResVecTy)
Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
@@ -1442,8 +1438,8 @@ static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
SplatValueA, SplatValueB, SplatValueA, SplatValueB,
SplatValueA, SplatValueB, SplatValueA, SplatValueB };
- SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
- makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+ SDValue Result = DAG.getBuildVector(
+ ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
if (VecTy != ViaVecTy)
Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
@@ -1471,10 +1467,10 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
if (BigEndian)
std::swap(BitImmLoOp, BitImmHiOp);
- Exp2Imm =
- DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
- BitImmHiOp, BitImmLoOp, BitImmHiOp));
+ Exp2Imm = DAG.getNode(
+ ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getBuildVector(MVT::v4i32, DL,
+ {BitImmLoOp, BitImmHiOp, BitImmLoOp, BitImmHiOp}));
}
}
@@ -1860,7 +1856,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
// If ResTy is v2i64 then the type legalizer will break this node down into
// an equivalent v4i32.
- return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, Ops);
+ return DAG.getBuildVector(ResTy, DL, Ops);
}
case Intrinsic::mips_fexp2_w:
case Intrinsic::mips_fexp2_d: {
@@ -2166,6 +2162,10 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_xori_b:
return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+ }
}
}
@@ -2178,9 +2178,8 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
EVT PtrTy = Address->getValueType(0);
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
-
- return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), false,
- false, false, 16);
+ return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(),
+ /* Alignment = */ 16);
}
SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
@@ -2247,8 +2246,8 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
- return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), false,
- false, 16);
+ return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(),
+ /* Alignment = */ 16);
}
SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
@@ -2311,7 +2310,7 @@ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
}
static bool isConstantOrUndef(const SDValue Op) {
- if (Op->getOpcode() == ISD::UNDEF)
+ if (Op->isUndef())
return true;
if (isa<ConstantSDNode>(Op))
return true;
@@ -2841,7 +2840,7 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
++I)
Ops.push_back(DAG.getTargetConstant(*I, DL, MaskEltTy));
- SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, Ops);
+ SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
if (Using1stVec && Using2ndVec) {
Op0 = Op->getOperand(0);
@@ -2883,32 +2882,27 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
// MipsISD::VSHF.
if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
- SDValue Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ SDValue Result;
+ if ((Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG)))
return Result;
- Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
- if (Result.getNode())
+ if ((Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG)))
return Result;
return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
}
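
The rewritten lowerVECTOR_SHUFFLE above is a "first specialised lowering that succeeds wins" chain, with the generic VSHF form as the fallback. A minimal standalone sketch of that control-flow idiom, using std::optional and made-up matcher names (tryILVEV, tryPCKEV) instead of the real SDValue-returning helpers:

    #include <optional>
    #include <string>

    // Hypothetical matchers: each succeeds only for the mask shape it handles.
    // The real helpers return an SDValue; a string stands in for it here.
    static std::optional<std::string> tryILVEV(int MaskKind) {
      return MaskKind == 0 ? std::optional<std::string>("ilvev") : std::nullopt;
    }
    static std::optional<std::string> tryPCKEV(int MaskKind) {
      return MaskKind == 1 ? std::optional<std::string>("pckev") : std::nullopt;
    }

    // Mirrors the control flow above: assign inside the condition, return the
    // first lowering that succeeds, and fall back to the generic VSHF form.
    std::string lowerShuffleSketch(int MaskKind) {
      std::optional<std::string> Result;
      if ((Result = tryILVEV(MaskKind)))
        return *Result;
      if ((Result = tryPCKEV(MaskKind)))
        return *Result;
      return "vshf";
    }

The assignment-inside-the-condition form keeps each attempt to a single line, which is exactly what the hunk above switches to.
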
-MachineBasicBlock * MipsSETargetLowering::
-emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
+MachineBasicBlock *
+MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
// $bb:
// bposge32_pseudo $vr0
// =>
@@ -2925,7 +2919,7 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
MachineFunction *F = BB->getParent();
@@ -2949,6 +2943,8 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
// Insert the real bposge32 instruction to $BB.
BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+ // Insert the real bposge32c instruction to $BB.
+ BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB);
// Fill $FBB.
unsigned VR2 = RegInfo.createVirtualRegister(RC);
@@ -2963,16 +2959,18 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
// Insert phi function to $Sink.
BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
- MI->getOperand(0).getReg())
- .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
+ MI.getOperand(0).getReg())
+ .addReg(VR2)
+ .addMBB(FBB)
+ .addReg(VR1)
+ .addMBB(TBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return Sink;
}
-MachineBasicBlock * MipsSETargetLowering::
-emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned BranchOp) const{
+MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned BranchOp) const {
// $bb:
// vany_nonzero $rd, $ws
// =>
@@ -2990,7 +2988,7 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
MachineFunction *F = BB->getParent();
@@ -3014,8 +3012,8 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
// Insert the real bnz.b instruction to $BB.
BuildMI(BB, DL, TII->get(BranchOp))
- .addReg(MI->getOperand(1).getReg())
- .addMBB(TBB);
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(TBB);
// Fill $FBB.
unsigned RD1 = RegInfo.createVirtualRegister(RC);
@@ -3030,10 +3028,13 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
// Insert phi function to $Sink.
BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
- MI->getOperand(0).getReg())
- .addReg(RD1).addMBB(FBB).addReg(RD2).addMBB(TBB);
+ MI.getOperand(0).getReg())
+ .addReg(RD1)
+ .addMBB(FBB)
+ .addReg(RD2)
+ .addMBB(TBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return Sink;
}
@@ -3047,14 +3048,15 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
// When n is zero, the equivalent operation can be performed with (potentially)
// zero instructions due to register overlaps. This optimization is never valid
// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
-MachineBasicBlock * MipsSETargetLowering::
-emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Fd = MI->getOperand(0).getReg();
- unsigned Ws = MI->getOperand(1).getReg();
- unsigned Lane = MI->getOperand(2).getImm();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Fd = MI.getOperand(0).getReg();
+ unsigned Ws = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
if (Lane == 0) {
unsigned Wt = Ws;
@@ -3076,7 +3078,7 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3090,16 +3092,17 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
// When n is zero, the equivalent operation can be performed with (potentially)
// zero instructions due to register overlaps. This optimization is always
// valid because FR=1 mode is the only supported mode in MSA.
-MachineBasicBlock * MipsSETargetLowering::
-emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
assert(Subtarget.isFP64bit());
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- unsigned Fd = MI->getOperand(0).getReg();
- unsigned Ws = MI->getOperand(1).getReg();
- unsigned Lane = MI->getOperand(2).getImm() * 2;
- DebugLoc DL = MI->getDebugLoc();
+ unsigned Fd = MI.getOperand(0).getReg();
+ unsigned Ws = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm() * 2;
+ DebugLoc DL = MI.getDebugLoc();
if (Lane == 0)
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
@@ -3110,7 +3113,7 @@ emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3121,15 +3124,15 @@ emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
// subreg_to_reg $wt:sub_lo, $fs
// insve_w $wd[$n], $wd_in, $wt[0]
MachineBasicBlock *
-MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
+MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Wd = MI->getOperand(0).getReg();
- unsigned Wd_in = MI->getOperand(1).getReg();
- unsigned Lane = MI->getOperand(2).getImm();
- unsigned Fs = MI->getOperand(3).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Wd_in = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
+ unsigned Fs = MI.getOperand(3).getReg();
unsigned Wt = RegInfo.createVirtualRegister(
Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
&Mips::MSA128WEvensRegClass);
@@ -3144,7 +3147,7 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
.addReg(Wt)
.addImm(0);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3155,17 +3158,17 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
// subreg_to_reg $wt:sub_64, $fs
// insve_d $wd[$n], $wd_in, $wt[0]
MachineBasicBlock *
-MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
+MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI,
MachineBasicBlock *BB) const {
assert(Subtarget.isFP64bit());
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Wd = MI->getOperand(0).getReg();
- unsigned Wd_in = MI->getOperand(1).getReg();
- unsigned Lane = MI->getOperand(2).getImm();
- unsigned Fs = MI->getOperand(3).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Wd_in = MI.getOperand(1).getReg();
+ unsigned Lane = MI.getOperand(2).getImm();
+ unsigned Fs = MI.getOperand(3).getReg();
unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
@@ -3178,7 +3181,7 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
.addReg(Wt)
.addImm(0);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3202,22 +3205,23 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
// (INSVE_[WD], $wdtmp2, 0, $wdtmp1, 0)
// (NEG $lanetmp2, $lanetmp1)
// (SLD_B $wd, $wdtmp2, $wdtmp2, $lanetmp2)
-MachineBasicBlock *
-MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned EltSizeInBytes,
- bool IsFP) const {
+MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned EltSizeInBytes,
+ bool IsFP) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Wd = MI->getOperand(0).getReg();
- unsigned SrcVecReg = MI->getOperand(1).getReg();
- unsigned LaneReg = MI->getOperand(2).getReg();
- unsigned SrcValReg = MI->getOperand(3).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned SrcVecReg = MI.getOperand(1).getReg();
+ unsigned LaneReg = MI.getOperand(2).getReg();
+ unsigned SrcValReg = MI.getOperand(3).getReg();
const TargetRegisterClass *VecRC = nullptr;
+ // FIXME: This should be true for N32 too.
const TargetRegisterClass *GPRRC =
Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ unsigned SubRegIdx = Subtarget.isABI_N64() ? Mips::sub_32 : 0;
+ unsigned ShiftOp = Subtarget.isABI_N64() ? Mips::DSLL : Mips::SLL;
unsigned EltLog2Size;
unsigned InsertOp = 0;
unsigned InsveOp = 0;
@@ -3262,7 +3266,7 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
// Convert the lane index into a byte index
if (EltSizeInBytes != 1) {
unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
- BuildMI(*BB, MI, DL, TII->get(Mips::SLL), LaneTmp1)
+ BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1)
.addReg(LaneReg)
.addImm(EltLog2Size);
LaneReg = LaneTmp1;
@@ -3273,7 +3277,7 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
.addReg(SrcVecReg)
.addReg(SrcVecReg)
- .addReg(LaneReg);
+ .addReg(LaneReg, 0, SubRegIdx);
unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
if (IsFP) {
@@ -3302,9 +3306,9 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
.addReg(WdTmp2)
.addReg(WdTmp2)
- .addReg(LaneTmp2);
+ .addReg(LaneTmp2, 0, SubRegIdx);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
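
The INSERT_*_VIDX expansion above rotates the whole vector so the target lane lands in element 0, inserts there, and rotates back by the negated amount. A scalar sketch of the same idea on a plain 4-lane array (illustrative only; Lane is assumed to be in range):

    #include <algorithm>
    #include <array>
    #include <cstddef>

    // Scalar model of the rotate/insert/rotate-back sequence: a 4-lane array
    // of ints stands in for an MSA register, and std::rotate stands in for
    // sld.b. Lane must be less than 4.
    std::array<int, 4> insertVariableLane(std::array<int, 4> Vec,
                                          std::size_t Lane, int Value) {
      // First SLD_B: rotate so the target lane becomes element 0.
      std::rotate(Vec.begin(), Vec.begin() + Lane, Vec.end());
      // INSERT / INSVE into lane 0.
      Vec[0] = Value;
      // NEG + second SLD_B: rotate back by the remaining amount.
      std::rotate(Vec.begin(), Vec.begin() + (Vec.size() - Lane) % Vec.size(),
                  Vec.end());
      return Vec;
    }
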
@@ -3316,13 +3320,13 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
// insert_subreg $wt2:subreg_lo, $wt1, $fs
// splati.w $wd, $wt2[0]
MachineBasicBlock *
-MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
+MipsSETargetLowering::emitFILL_FW(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Wd = MI->getOperand(0).getReg();
- unsigned Fs = MI->getOperand(1).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Fs = MI.getOperand(1).getReg();
unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
@@ -3333,7 +3337,7 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
.addImm(Mips::sub_lo);
BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3345,15 +3349,15 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
// insert_subreg $wt2:subreg_64, $wt1, $fs
// splati.d $wd, $wt2[0]
MachineBasicBlock *
-MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
+MipsSETargetLowering::emitFILL_FD(MachineInstr &MI,
MachineBasicBlock *BB) const {
assert(Subtarget.isFP64bit());
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
- unsigned Wd = MI->getOperand(0).getReg();
- unsigned Fs = MI->getOperand(1).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Wd = MI.getOperand(0).getReg();
+ unsigned Fs = MI.getOperand(1).getReg();
unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
@@ -3364,7 +3368,7 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
.addImm(Mips::sub_64);
BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3375,25 +3379,25 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
// ldi.w $ws, 1
// fexp2.w $wd, $ws, $wt
MachineBasicBlock *
-MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
unsigned Ws1 = RegInfo.createVirtualRegister(RC);
unsigned Ws2 = RegInfo.createVirtualRegister(RC);
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// Splat 1.0 into a vector
BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
// Emit 1.0 * fexp2(Wt)
- BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI->getOperand(0).getReg())
+ BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI.getOperand(0).getReg())
.addReg(Ws2)
- .addReg(MI->getOperand(1).getReg());
+ .addReg(MI.getOperand(1).getReg());
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -3404,24 +3408,24 @@ MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
// ldi.d $ws, 1
// fexp2.d $wd, $ws, $wt
MachineBasicBlock *
-MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
unsigned Ws1 = RegInfo.createVirtualRegister(RC);
unsigned Ws2 = RegInfo.createVirtualRegister(RC);
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// Splat 1.0 into a vector
BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
// Emit 1.0 * fexp2(Wt)
- BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI->getOperand(0).getReg())
+ BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI.getOperand(0).getReg())
.addReg(Ws2)
- .addReg(MI->getOperand(1).getReg());
+ .addReg(MI.getOperand(1).getReg());
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
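
As the comments above note, FEXP2_W_1 and FEXP2_D_1 splat 1.0 and feed it to the three-operand fexp2, so each lane evaluates 1.0 * 2^x. A loose scalar model of that arithmetic (a sketch only, not the exact MSA lane semantics):

    #include <cmath>

    // The splatted constant is 1.0 and the three-operand fexp2 then yields
    // 1.0 * 2^x per lane.
    float fexp2ViaSplatOne(float X) {
      const float One = 1.0f;     // LDI_W / FFINT_U_W pair
      return One * std::exp2(X);  // FEXP2_W $wd, $ws(=1.0), $wt
    }
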
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index d44f8d82ec3e..54154662f261 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -40,7 +40,7 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
@@ -77,39 +77,39 @@ namespace llvm {
/// depending on the indices in the shuffle.
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
- MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
+ MachineBasicBlock *emitBPOSGE32(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
+ MachineBasicBlock *emitMSACBranchPseudo(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned BranchOp) const;
/// \brief Emit the COPY_FW pseudo instruction
- MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+ MachineBasicBlock *emitCOPY_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the COPY_FD pseudo instruction
- MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+ MachineBasicBlock *emitCOPY_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the INSERT_FW pseudo instruction
- MachineBasicBlock *emitINSERT_FW(MachineInstr *MI,
+ MachineBasicBlock *emitINSERT_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the INSERT_FD pseudo instruction
- MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
+ MachineBasicBlock *emitINSERT_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
- MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr *MI,
+ MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned EltSizeInBytes,
bool IsFP) const;
/// \brief Emit the FILL_FW pseudo instruction
- MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
+ MachineBasicBlock *emitFILL_FW(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the FILL_FD pseudo instruction
- MachineBasicBlock *emitFILL_FD(MachineInstr *MI,
+ MachineBasicBlock *emitFILL_FD(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the FEXP2_W_1 pseudo instructions.
- MachineBasicBlock *emitFEXP2_W_1(MachineInstr *MI,
+ MachineBasicBlock *emitFEXP2_W_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
/// \brief Emit the FEXP2_D_1 pseudo instructions.
- MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI,
+ MachineBasicBlock *emitFEXP2_D_1(MachineInstr &MI,
MachineBasicBlock *BB) const;
};
}
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index d4aeaf928655..29107b2c1aa5 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -13,20 +13,20 @@
#include "MipsSEInstrInfo.h"
#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
- : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? Mips::B
- : Mips::J),
+ : MipsInstrInfo(STI, STI.isPositionIndependent() ? Mips::B : Mips::J),
RI() {}
const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
@@ -38,17 +38,17 @@ const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
-unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
if ((Opc == Mips::LW) || (Opc == Mips::LD) ||
(Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2)))) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if ((MI.getOperand(1).isFI()) && // is a stack slot
+ (MI.getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI.getOperand(2)))) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
}
@@ -60,26 +60,26 @@ unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
-unsigned MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
if ((Opc == Mips::SW) || (Opc == Mips::SD) ||
(Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2)))) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if ((MI.getOperand(1).isFI()) && // is a stack slot
+ (MI.getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI.getOperand(2)))) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
}
return 0;
}
void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned Opc = 0, ZeroReg = 0;
bool isMicroMips = Subtarget.inMicroMipsMode();
@@ -129,9 +129,12 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc)).addImm(1 << 4)
.addReg(DestReg, RegState::ImplicitDefine);
return;
+ } else if (Mips::MSACtrlRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(Mips::CTCMSA))
+ .addReg(DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
}
- else if (Mips::MSACtrlRegClass.contains(DestReg))
- Opc = Mips::CTCMSA;
}
else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
Opc = Mips::FMOV_S;
@@ -325,12 +328,12 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
}
-bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- MachineBasicBlock &MBB = *MI->getParent();
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
bool isMicroMips = Subtarget.inMicroMipsMode();
unsigned Opc;
- switch(MI->getDesc().getOpcode()) {
+ switch (MI.getDesc().getOpcode()) {
default:
return false;
case Mips::RetRA:
@@ -420,6 +423,14 @@ unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
case Mips::BC1F: return Mips::BC1T;
case Mips::BEQZC_MM: return Mips::BNEZC_MM;
case Mips::BNEZC_MM: return Mips::BEQZC_MM;
+ case Mips::BEQZC: return Mips::BNEZC;
+ case Mips::BNEZC: return Mips::BEQZC;
+ case Mips::BEQC: return Mips::BNEC;
+ case Mips::BNEC: return Mips::BEQC;
+ case Mips::BGTZC: return Mips::BLEZC;
+ case Mips::BGEZC: return Mips::BLTZC;
+ case Mips::BLTZC: return Mips::BGEZC;
+ case Mips::BLEZC: return Mips::BGTZC;
}
}
@@ -429,26 +440,33 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
MachineBasicBlock::iterator I) const {
MipsABIInfo ABI = Subtarget.getABI();
DebugLoc DL;
- unsigned ADDu = ABI.GetPtrAdduOp();
unsigned ADDiu = ABI.GetPtrAddiuOp();
if (Amount == 0)
return;
- if (isInt<16>(Amount))// addi sp, sp, amount
+ if (isInt<16>(Amount)) {
+ // addi sp, sp, amount
BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
- else { // Expand immediate that doesn't fit in 16-bit.
+ } else {
+ // For numbers which are not 16-bit integers we synthesize Amount inline
+ // then add or subtract it from sp.
+ unsigned Opc = ABI.GetPtrAdduOp();
+ if (Amount < 0) {
+ Opc = ABI.GetPtrSubuOp();
+ Amount = -Amount;
+ }
unsigned Reg = loadImmediate(Amount, MBB, I, DL, nullptr);
- BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill);
+ BuildMI(MBB, I, DL, get(Opc), SP).addReg(SP).addReg(Reg, RegState::Kill);
}
}
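
The new adjustStackPtr logic above picks between a single addiu and a synthesized immediate followed by addu or subu. A standalone sketch of that decision, with the emitted sequences reduced to descriptive strings and "scratch" standing in for the register returned by loadImmediate:

    #include <cstdint>

    const char *pickStackAdjustSequence(std::int64_t Amount) {
      if (Amount == 0)
        return "nothing to emit";
      if (Amount >= INT16_MIN && Amount <= INT16_MAX)  // isInt<16>(Amount)
        return "addiu $sp, $sp, Amount";
      // Amount does not fit in 16 bits: materialise |Amount|, then add/sub.
      return Amount < 0 ? "loadImmediate(-Amount); subu $sp, $sp, $scratch"
                        : "loadImmediate(Amount); addu $sp, $sp, $scratch";
    }
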
/// This function generates the sequence of instructions needed to get the
/// result of adding register REG and immediate IMM.
-unsigned
-MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
- unsigned *NewImm) const {
+unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II,
+ const DebugLoc &DL,
+ unsigned *NewImm) const {
MipsAnalyzeImmediate AnalyzeImm;
const MipsSubtarget &STI = Subtarget;
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
@@ -493,8 +511,12 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 ||
Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::B ||
- Opc == Mips::J || Opc == Mips::BEQZC_MM || Opc == Mips::BNEZC_MM) ?
- Opc : 0;
+ Opc == Mips::J || Opc == Mips::BEQZC_MM || Opc == Mips::BNEZC_MM ||
+ Opc == Mips::BEQC || Opc == Mips::BNEC || Opc == Mips::BLTC ||
+ Opc == Mips::BGEC || Opc == Mips::BLTUC || Opc == Mips::BGEUC ||
+ Opc == Mips::BGTZC || Opc == Mips::BLEZC || Opc == Mips::BGEZC ||
+ Opc == Mips::BLTZC || Opc == Mips::BEQZC || Opc == Mips::BNEZC ||
+ Opc == Mips::BC) ? Opc : 0;
}
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
@@ -697,7 +719,7 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
// addu $sp, $sp, $v1
// jr $ra (via RetRA)
const TargetMachine &TM = MBB.getParent()->getTarget();
- if (TM.getRelocationModel() == Reloc::PIC_)
+ if (TM.isPositionIndependent())
BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), T9)
.addReg(TargetReg)
.addReg(ZERO);
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 5d73545ef6b9..b356909bf1cf 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -32,7 +32,7 @@ public:
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlot - If the specified machine instruction is a direct
@@ -40,12 +40,11 @@ public:
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStack(MachineBasicBlock &MBB,
@@ -62,7 +61,7 @@ public:
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
unsigned getOppositeBranchOpc(unsigned Opc) const override;
@@ -74,7 +73,7 @@ public:
/// non-NULL parameter, the last instruction is not emitted, but instead
/// its immediate operand is returned in NewImm.
unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
+ MachineBasicBlock::iterator II, const DebugLoc &DL,
unsigned *NewImm) const;
private:
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index b1e2885f5ba3..e3431cd118ab 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -14,12 +14,10 @@
#include "MipsSERegisterInfo.h"
#include "Mips.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsSEInstrInfo.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -29,7 +27,6 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -63,10 +60,11 @@ MipsSERegisterInfo::intRegClass(unsigned Size) const {
return &Mips::GPR64RegClass;
}
-/// Get the size of the offset supported by the given load/store.
+/// Get the size of the offset supported by the given load/store/inline asm.
/// The result includes the effects of any scale factors applied to the
/// instruction immediate.
-static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode) {
+static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
+ MachineOperand MO) {
switch (Opcode) {
case Mips::LD_B:
case Mips::ST_B:
@@ -80,6 +78,49 @@ static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode) {
case Mips::LD_D:
case Mips::ST_D:
return 10 + 3 /* scale factor */;
+ case Mips::LL:
+ case Mips::LL64:
+ case Mips::LLD:
+ case Mips::LLE:
+ case Mips::SC:
+ case Mips::SC64:
+ case Mips::SCD:
+ case Mips::SCE:
+ return 16;
+ case Mips::LLE_MM:
+ case Mips::LLE_MMR6:
+ case Mips::LL_MM:
+ case Mips::SCE_MM:
+ case Mips::SCE_MMR6:
+ case Mips::SC_MM:
+ return 12;
+ case Mips::LL64_R6:
+ case Mips::LL_R6:
+ case Mips::LLD_R6:
+ case Mips::SC64_R6:
+ case Mips::SCD_R6:
+ case Mips::SC_R6:
+ return 9;
+ case Mips::INLINEASM: {
+ unsigned ConstraintID = InlineAsm::getMemoryConstraintID(MO.getImm());
+ switch (ConstraintID) {
+ case InlineAsm::Constraint_ZC: {
+ const MipsSubtarget &Subtarget = MO.getParent()
+ ->getParent()
+ ->getParent()
+ ->getSubtarget<MipsSubtarget>();
+ if (Subtarget.inMicroMipsMode())
+ return 12;
+
+ if (Subtarget.hasMips32r6())
+ return 9;
+
+ return 16;
+ }
+ default:
+ return 16;
+ }
+ }
default:
return 16;
}
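
For the ZC inline-asm memory constraint handled above, the usable offset width depends on the subtarget: 12 bits in microMIPS mode, 9 bits on R6, and the classic 16-bit signed offset otherwise. A standalone sketch of that selection, with plain booleans standing in for the inMicroMipsMode() and hasMips32r6()/hasMips64r6() queries:

    // Offset width in bits for a ZC-constrained memory operand.
    unsigned zcConstraintOffsetBits(bool InMicroMipsMode, bool HasR6) {
      if (InMicroMipsMode)
        return 12;  // microMIPS LL/SC encodings
      if (HasR6)
        return 9;   // R6 LL/SC encodings
      return 16;    // classic 16-bit signed offset
    }
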
@@ -169,7 +210,8 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
// Make sure Offset fits within the field available.
// For MSA instructions, this is a 10-bit signed immediate (scaled by
// element size), otherwise it is a 16-bit signed immediate.
- unsigned OffsetBitSize = getLoadStoreOffsetSizeInBits(MI.getOpcode());
+ unsigned OffsetBitSize =
+ getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1));
unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
if (OffsetBitSize < 16 && isInt<16>(Offset) &&
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 37f9e491d546..738b6c46407a 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -23,24 +23,34 @@ def IIPseudo : InstrItinClass;
def II_ABS : InstrItinClass;
def II_ADDI : InstrItinClass;
def II_ADDIU : InstrItinClass;
+def II_ADDIUPC : InstrItinClass;
+def II_ADD : InstrItinClass;
def II_ADDU : InstrItinClass;
def II_ADD_D : InstrItinClass;
def II_ADD_S : InstrItinClass;
+def II_ALIGN : InstrItinClass;
def II_AND : InstrItinClass;
def II_ANDI : InstrItinClass;
+def II_ALUIPC : InstrItinClass;
+def II_AUI : InstrItinClass;
+def II_AUIPC : InstrItinClass;
def II_B : InstrItinClass;
def II_BADDU : InstrItinClass;
def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32
+def II_BALC : InstrItinClass;
def II_BC : InstrItinClass;
def II_BC1F : InstrItinClass;
def II_BC1FL : InstrItinClass;
def II_BC1T : InstrItinClass;
def II_BC1TL : InstrItinClass;
+def II_BC1CCZ : InstrItinClass;
def II_BCC : InstrItinClass; // beq and bne
def II_BCCZ : InstrItinClass; // b[gl][et]z
+def II_BCCC : InstrItinClass; // b<cc>c
def II_BCCZAL : InstrItinClass; // bgezal and bltzal
def II_BCCZALS : InstrItinClass; // bgezals and bltzals
def II_BCCZC : InstrItinClass; // beqzc, bnezc
+def II_BITSWAP : InstrItinClass;
def II_CEIL : InstrItinClass;
def II_CFC1 : InstrItinClass;
def II_CLO : InstrItinClass;
@@ -51,16 +61,33 @@ def II_C_CC_D : InstrItinClass; // Any c.<cc>.d instruction
def II_C_CC_S : InstrItinClass; // Any c.<cc>.s instruction
def II_DADDIU : InstrItinClass;
def II_DADDU : InstrItinClass;
+def II_DADDI : InstrItinClass;
def II_DADD : InstrItinClass;
+def II_DAHI : InstrItinClass;
+def II_DATI : InstrItinClass;
+def II_DAUI : InstrItinClass;
+def II_DALIGN : InstrItinClass;
+def II_DBITSWAP : InstrItinClass;
+def II_DCLO : InstrItinClass;
+def II_DCLZ : InstrItinClass;
def II_DDIV : InstrItinClass;
def II_DDIVU : InstrItinClass;
def II_DIV : InstrItinClass;
def II_DIVU : InstrItinClass;
def II_DIV_D : InstrItinClass;
def II_DIV_S : InstrItinClass;
+def II_DMFC0 : InstrItinClass;
+def II_DMTC0 : InstrItinClass;
def II_DMFC1 : InstrItinClass;
def II_DMTC1 : InstrItinClass;
+def II_DMOD : InstrItinClass;
+def II_DMODU : InstrItinClass;
+def II_DMUH : InstrItinClass;
+def II_DMUHU : InstrItinClass;
+def II_DMFC2 : InstrItinClass;
+def II_DMTC2 : InstrItinClass;
def II_DMUL : InstrItinClass;
+def II_DMULU : InstrItinClass;
def II_DMULT : InstrItinClass;
def II_DMULTU : InstrItinClass;
def II_DROTR : InstrItinClass;
@@ -75,6 +102,8 @@ def II_DSRAV : InstrItinClass;
def II_DSRL : InstrItinClass;
def II_DSRL32 : InstrItinClass;
def II_DSRLV : InstrItinClass;
+def II_DSBH : InstrItinClass;
+def II_DSHD : InstrItinClass;
def II_DSUBU : InstrItinClass;
def II_DSUB : InstrItinClass;
def II_EXT : InstrItinClass; // Any EXT instruction
@@ -84,44 +113,96 @@ def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
def II_J : InstrItinClass;
def II_JAL : InstrItinClass;
def II_JALR : InstrItinClass;
+def II_JALR_HB : InstrItinClass;
def II_JALRC : InstrItinClass;
def II_JALRS : InstrItinClass;
def II_JALS : InstrItinClass;
+def II_JIC : InstrItinClass;
+def II_JIALC : InstrItinClass;
def II_JR : InstrItinClass;
+def II_JR_HB : InstrItinClass;
def II_JRADDIUSP : InstrItinClass;
def II_JRC : InstrItinClass;
def II_ReturnPseudo : InstrItinClass; // Return pseudo.
+def II_ERET : InstrItinClass;
+def II_DERET : InstrItinClass;
+def II_ERETNC : InstrItinClass;
+def II_EHB : InstrItinClass;
+def II_SDBBP : InstrItinClass;
+def II_SSNOP : InstrItinClass;
+def II_SYSCALL : InstrItinClass;
+def II_PAUSE : InstrItinClass;
+def II_WAIT : InstrItinClass;
+def II_EI : InstrItinClass;
+def II_DI : InstrItinClass;
+def II_TEQ : InstrItinClass;
+def II_TEQI : InstrItinClass;
+def II_TGE : InstrItinClass;
+def II_TGEI : InstrItinClass;
+def II_TGEIU : InstrItinClass;
+def II_TGEU : InstrItinClass;
+def II_TNE : InstrItinClass;
+def II_TNEI : InstrItinClass;
+def II_TLT : InstrItinClass;
+def II_TLTI : InstrItinClass;
+def II_TLTU : InstrItinClass;
+def II_TTLTIU : InstrItinClass;
+def II_TLBP : InstrItinClass;
+def II_TLBR : InstrItinClass;
+def II_TLBWI : InstrItinClass;
+def II_TLBWR : InstrItinClass;
+def II_TRAP : InstrItinClass;
+def II_BREAK : InstrItinClass;
+def II_SYNC : InstrItinClass;
+def II_SYNCI : InstrItinClass;
def II_LB : InstrItinClass;
def II_LBE : InstrItinClass;
def II_LBU : InstrItinClass;
def II_LBUE : InstrItinClass;
def II_LD : InstrItinClass;
def II_LDC1 : InstrItinClass;
+def II_LDC2 : InstrItinClass;
+def II_LDC3 : InstrItinClass;
def II_LDL : InstrItinClass;
def II_LDR : InstrItinClass;
+def II_LDPC : InstrItinClass;
def II_LDXC1 : InstrItinClass;
def II_LH : InstrItinClass;
def II_LHE : InstrItinClass;
def II_LHU : InstrItinClass;
def II_LHUE : InstrItinClass;
+def II_LL : InstrItinClass;
+def II_LLD : InstrItinClass;
def II_LUI : InstrItinClass;
def II_LUXC1 : InstrItinClass;
def II_LW : InstrItinClass;
def II_LWE : InstrItinClass;
def II_LWC1 : InstrItinClass;
+def II_LWC2 : InstrItinClass;
+def II_LWC3 : InstrItinClass;
def II_LWL : InstrItinClass;
def II_LWLE : InstrItinClass;
+def II_LWPC : InstrItinClass;
def II_LWR : InstrItinClass;
def II_LWRE : InstrItinClass;
def II_LWU : InstrItinClass;
+def II_LWUPC : InstrItinClass;
def II_LWXC1 : InstrItinClass;
+def II_LSA : InstrItinClass;
+def II_DLSA : InstrItinClass;
def II_MADD : InstrItinClass;
def II_MADDU : InstrItinClass;
def II_MADD_D : InstrItinClass;
def II_MADD_S : InstrItinClass;
+def II_MADDF_D : InstrItinClass;
+def II_MADDF_S : InstrItinClass;
+def II_MFC0 : InstrItinClass;
def II_MFC1 : InstrItinClass;
def II_MFHC1 : InstrItinClass;
+def II_MFC2 : InstrItinClass;
def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo
+def II_MOD : InstrItinClass;
+def II_MODU : InstrItinClass;
def II_MOVF : InstrItinClass;
def II_MOVF_D : InstrItinClass;
def II_MOVF_S : InstrItinClass;
@@ -140,10 +221,17 @@ def II_MSUB : InstrItinClass;
def II_MSUBU : InstrItinClass;
def II_MSUB_D : InstrItinClass;
def II_MSUB_S : InstrItinClass;
+def II_MSUBF_D : InstrItinClass;
+def II_MSUBF_S : InstrItinClass;
+def II_MTC0 : InstrItinClass;
def II_MTC1 : InstrItinClass;
def II_MTHC1 : InstrItinClass;
+def II_MTC2 : InstrItinClass;
def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo
def II_MUL : InstrItinClass;
+def II_MUH : InstrItinClass;
+def II_MUHU : InstrItinClass;
+def II_MULU : InstrItinClass;
def II_MULT : InstrItinClass;
def II_MULTU : InstrItinClass;
def II_MUL_D : InstrItinClass;
@@ -163,15 +251,20 @@ def II_ROTR : InstrItinClass;
def II_ROTRV : InstrItinClass;
def II_ROUND : InstrItinClass;
def II_SAVE : InstrItinClass;
+def II_SC : InstrItinClass;
+def II_SCD : InstrItinClass;
def II_SB : InstrItinClass;
def II_SBE : InstrItinClass;
def II_SD : InstrItinClass;
def II_SDC1 : InstrItinClass;
+def II_SDC2 : InstrItinClass;
+def II_SDC3 : InstrItinClass;
def II_SDL : InstrItinClass;
def II_SDR : InstrItinClass;
def II_SDXC1 : InstrItinClass;
def II_SEB : InstrItinClass;
def II_SEH : InstrItinClass;
+def II_SELCCZ : InstrItinClass;
def II_SEQ_SNE : InstrItinClass; // seq and sne
def II_SEQI_SNEI : InstrItinClass; // seqi and snei
def II_SH : InstrItinClass;
@@ -186,6 +279,7 @@ def II_SRA : InstrItinClass;
def II_SRAV : InstrItinClass;
def II_SRL : InstrItinClass;
def II_SRLV : InstrItinClass;
+def II_SUB : InstrItinClass;
def II_SUBU : InstrItinClass;
def II_SUB_D : InstrItinClass;
def II_SUB_S : InstrItinClass;
@@ -193,6 +287,8 @@ def II_SUXC1 : InstrItinClass;
def II_SW : InstrItinClass;
def II_SWE : InstrItinClass;
def II_SWC1 : InstrItinClass;
+def II_SWC2 : InstrItinClass;
+def II_SWC3 : InstrItinClass;
def II_SWL : InstrItinClass;
def II_SWLE : InstrItinClass;
def II_SWR : InstrItinClass;
@@ -202,6 +298,14 @@ def II_TRUNC : InstrItinClass;
def II_WSBH : InstrItinClass;
def II_XOR : InstrItinClass;
def II_XORI : InstrItinClass;
+def II_CACHE : InstrItinClass;
+def II_PREF : InstrItinClass;
+def II_CACHEE : InstrItinClass;
+def II_PREFE : InstrItinClass;
+def II_LLE : InstrItinClass;
+def II_SCE : InstrItinClass;
+def II_TLBINV : InstrItinClass;
+def II_TLBINVF : InstrItinClass;
//===----------------------------------------------------------------------===//
// Mips Generic instruction itineraries.
@@ -210,9 +314,16 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDIUPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADD , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AUI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_AND , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ALUIPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AUIPC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ALIGN , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BITSWAP , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SLL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SRA , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SRL , [InstrStage<1, [ALU]>]>,
@@ -225,17 +336,35 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_CLZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DADDIU , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DADD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DALIGN , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DAHI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DATI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DAUI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DBITSWAP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMOD , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMODU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DSLL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSLL32 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSRL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRL32 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSRA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRA32 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSLLV , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSRLV , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSRAV , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSUBU , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSUB , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DROTR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTR32 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DROTRV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSBH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSHD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_EXT , [InstrStage<1, [ALU]>]>,
InstrItinData<II_INS , [InstrStage<1, [ALU]>]>,
InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>,
@@ -249,41 +378,61 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_OR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_POP , [InstrStage<1, [ALU]>]>,
InstrItinData<II_RDHWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUB , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SUBU , [InstrStage<1, [ALU]>]>,
InstrItinData<II_XOR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ANDI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ORI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_XORI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_LB , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBE , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LBU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBUE , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LH , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LHU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LHUE , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LW , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWPC , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LWL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWLE , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LWR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWRE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWUPC , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LD , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LDL , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LDR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDPC , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LLD , [InstrStage<3, [ALU]>]>,
InstrItinData<II_RESTORE , [InstrStage<3, [ALU]>]>,
InstrItinData<II_SB , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SHE , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SW , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SWL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SWR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SDL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SDR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SCD , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SELCCZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLTI_SLTIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLT_SLTU , [InstrStage<1, [ALU]>]>,
InstrItinData<II_B , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BALC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1CCZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>,
@@ -292,25 +441,69 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_J , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALR_HB , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JIC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JIALC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JR_HB , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>,
InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIPseudo , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMUH , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMUHU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_ERET , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DERET , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ERETNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_EHB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDBBP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SSNOP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYSCALL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PAUSE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_WAIT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_EI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TEQ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TEQI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TGEU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TNE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TNEI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLTI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLTU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TTLTIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBWI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TRAP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BREAK , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SYNCI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MADD , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MADDU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MFHI_MFLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MOD , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_MODU , [InstrStage<38, [IMULDIV]>]>,
InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MTHI_MTLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MUH , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MUHU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MUL , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MULT , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DIV , [InstrStage<38, [IMULDIV]>]>,
@@ -342,34 +535,65 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_SUB_S , [InstrStage<4, [ALU]>]>,
InstrItinData<II_MUL_S , [InstrStage<7, [ALU]>]>,
InstrItinData<II_MADD_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MADDF_S , [InstrStage<7, [ALU]>]>,
InstrItinData<II_MSUB_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MSUBF_S , [InstrStage<7, [ALU]>]>,
InstrItinData<II_NMADD_S , [InstrStage<7, [ALU]>]>,
InstrItinData<II_NMSUB_S , [InstrStage<7, [ALU]>]>,
InstrItinData<II_MUL_D , [InstrStage<8, [ALU]>]>,
InstrItinData<II_MADD_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MADDF_D , [InstrStage<8, [ALU]>]>,
InstrItinData<II_MSUB_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MSUBF_D , [InstrStage<8, [ALU]>]>,
InstrItinData<II_NMADD_D , [InstrStage<8, [ALU]>]>,
InstrItinData<II_NMSUB_D , [InstrStage<8, [ALU]>]>,
InstrItinData<II_DIV_S , [InstrStage<23, [ALU]>]>,
InstrItinData<II_DIV_D , [InstrStage<36, [ALU]>]>,
InstrItinData<II_SQRT_S , [InstrStage<54, [ALU]>]>,
InstrItinData<II_SQRT_D , [InstrStage<12, [ALU]>]>,
+ InstrItinData<II_WSBH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LSA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DLSA , [InstrStage<1, [ALU]>]>,
InstrItinData<II_LDC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDC2 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDC3 , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LWC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC2 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC3 , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LDXC1 , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LWXC1 , [InstrStage<3, [ALU]>]>,
InstrItinData<II_LUXC1 , [InstrStage<3, [ALU]>]>,
InstrItinData<II_SDC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDC2 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDC3 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SWC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC2 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC3 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SDXC1 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SWXC1 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SUXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMFC0 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_DMFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMFC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC0 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_DMTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC0 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC2 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC0 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC2 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>,
- InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>
+ InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CACHE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PREF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CACHEE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_PREFE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBINV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TLBINVF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LLE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SCE , [InstrStage<1, [ALU]>]>
]>;
include "MipsScheduleP5600.td"
diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td
index d32ae4f55eaf..cee42873c6e8 100644
--- a/lib/Target/Mips/MipsScheduleP5600.td
+++ b/lib/Target/Mips/MipsScheduleP5600.td
@@ -13,7 +13,7 @@ def MipsP5600Model : SchedMachineModel {
int LoadLatency = 4;
int MispredictPenalty = 8; // TODO: Estimated
- let CompleteModel = 1;
+ let CompleteModel = 0;
}
let SchedModel = MipsP5600Model in {
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 8a18b517d16b..3e7570ff46ed 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -90,7 +90,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
report_fatal_error("Code generation for MIPS-V is not implemented", false);
// Check if Architecture and ABI are compatible.
- assert(((!isGP64bit() && (isABI_O32() || isABI_EABI())) ||
+ assert(((!isGP64bit() && isABI_O32()) ||
(isGP64bit() && (isABI_N32() || isABI_N64()))) &&
"Invalid Arch & ABI pair.");
@@ -114,7 +114,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
}
- if (NoABICalls && TM.getRelocationModel() == Reloc::PIC_)
+ if (NoABICalls && TM.isPositionIndependent())
report_fatal_error("position-independent code requires '-mabicalls'");
// Set UseSmallSection.
@@ -126,6 +126,10 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
}
}
+bool MipsSubtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
/// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool MipsSubtarget::enablePostRAScheduler() const { return true; }
@@ -164,7 +168,6 @@ Reloc::Model MipsSubtarget::getRelocationModel() const {
return TM.getRelocationModel();
}
-bool MipsSubtarget::isABI_EABI() const { return getABI().IsEABI(); }
bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index fbb01fe77029..38d3cee70477 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -18,10 +18,10 @@
#include "MipsFrameLowering.h"
#include "MipsISelLowering.h"
#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -81,6 +81,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// IsGP64bit - General-purpose registers are 64 bits wide
bool IsGP64bit;
+ // IsPTR64bit - Pointers are 64 bits wide
+ bool IsPTR64bit;
+
// HasVFPU - Processor has a vector floating point unit.
bool HasVFPU;
@@ -152,19 +155,18 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
Triple TargetTriple;
- const TargetSelectionDAGInfo TSInfo;
+ const SelectionDAGTargetInfo TSInfo;
std::unique_ptr<const MipsInstrInfo> InstrInfo;
std::unique_ptr<const MipsFrameLowering> FrameLowering;
std::unique_ptr<const MipsTargetLowering> TLInfo;
public:
+ bool isPositionIndependent() const;
/// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool enablePostRAScheduler() const override;
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
- /// Only O32 and EABI supported right now.
- bool isABI_EABI() const;
bool isABI_N64() const;
bool isABI_N32() const;
bool isABI_O32() const;
@@ -225,6 +227,8 @@ public:
bool isGP64bit() const { return IsGP64bit; }
bool isGP32bit() const { return !IsGP64bit; }
unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
+ bool isPTR64bit() const { return IsPTR64bit; }
+ bool isPTR32bit() const { return !IsPTR64bit; }
bool isSingleFloat() const { return IsSingleFloat; }
bool hasVFPU() const { return HasVFPU; }
bool inMips16Mode() const { return InMips16Mode; }
@@ -290,7 +294,7 @@ public:
void setHelperClassesMips16();
void setHelperClassesMipsSE();
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 3e638720e839..c248c3a50ac8 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -26,6 +26,7 @@
#include "MipsTargetObjectFile.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
@@ -77,6 +78,13 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
return Ret;
}
+static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue() || CM == CodeModel::JITDefault)
+ return Reloc::Static;
+ return *RM;
+}
+
// On function prologue, the stack is created by decrementing
// its pointer. Once decremented, all references are done with positive
// offset from the stack/frame pointer, using StackGrowsUp enables
@@ -85,10 +93,12 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle)
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL,
+ bool isLittle)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
- CPU, FS, Options, RM, CM, OL),
+ CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
+ OL),
isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
@@ -107,7 +117,8 @@ void MipsebTargetMachine::anchor() { }
MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
@@ -116,7 +127,8 @@ void MipselTargetMachine::anchor() { }
MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -217,8 +229,8 @@ void MipsPassConfig::addIRPasses() {
// the ISelDag to gen Mips code.
bool MipsPassConfig::addInstSelector() {
addPass(createMipsModuleISelDagPass(getMipsTargetMachine()));
- addPass(createMips16ISelDag(getMipsTargetMachine()));
- addPass(createMipsSEISelDag(getMipsTargetMachine()));
+ addPass(createMips16ISelDag(getMipsTargetMachine(), getOptLevel()));
+ addPass(createMipsSEISelDag(getMipsTargetMachine(), getOptLevel()));
return false;
}
@@ -250,7 +262,13 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
MipsTargetMachine &TM = getMipsTargetMachine();
+
+ // The delay slot filler pass can potentially create forbidden slot (FS)
+ // hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any
+ // (new) pass that creates compact branches after the HSP must handle FS
+ // hazards itself or be pipelined before the HSP.
addPass(createMipsDelaySlotFillerPass(TM));
+ addPass(createMipsHazardSchedule());
addPass(createMipsLongBranchPass(TM));
- addPass(createMipsConstantIslandPass(TM));
+ addPass(createMipsConstantIslandPass());
}
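The getEffectiveRelocModel() helper introduced above lets the Mips target accept an unset relocation model from the front end and fall back to static, non-PIC code (likewise for the JIT default code model); once the TargetMachine never stores a placeholder model, MipsSubtarget can simply ask TM.isPositionIndependent() instead of comparing against Reloc::PIC_. A minimal sketch of the same fallback, assuming only the LLVM headers of this era; the function name below is illustrative:
// Sketch of the Optional<Reloc::Model> fallback used by the Mips patch
// above. Only the hasValue()/JITDefault handling mirrors the hunk; the
// helper name is illustrative.
#include "llvm/ADT/Optional.h"
#include "llvm/Support/CodeGen.h"

static llvm::Reloc::Model
effectiveRelocModel(llvm::CodeModel::Model CM,
                    llvm::Optional<llvm::Reloc::Model> RM) {
  // No explicit relocation model (RM is None) or the JIT default code
  // model: Mips treats both as plain static code.
  if (!RM.hasValue() || CM == llvm::CodeModel::JITDefault)
    return llvm::Reloc::Static;
  return *RM;
}
A caller that passes llvm::None therefore gets Reloc::Static rather than an unspecified model.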
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 38b2ecff7d7f..e4cf17e2abd8 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -40,8 +40,9 @@ class MipsTargetMachine : public LLVMTargetMachine {
public:
MipsTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle);
~MipsTargetMachine() override;
TargetIRAnalysis getTargetIRAnalysis() override;
@@ -68,25 +69,25 @@ public:
const MipsABIInfo &getABI() const { return ABI; }
};
-/// MipsebTargetMachine - Mips32/64 big endian target machine.
+/// Mips32/64 big endian target machine.
///
class MipsebTargetMachine : public MipsTargetMachine {
virtual void anchor();
public:
MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// MipselTargetMachine - Mips32/64 little endian target machine.
+/// Mips32/64 little endian target machine.
///
class MipselTargetMachine : public MipsTargetMachine {
virtual void anchor();
public:
MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 146f33bda249..3bd4567e3792 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -41,10 +41,12 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
InitializeELF(TM.Options.UseInitArray);
SmallDataSection = getContext().getELFSection(
- ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ ".sdata", ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
- ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ ELF::SHF_WRITE | ELF::SHF_ALLOC |
+ ELF::SHF_MIPS_GPREL);
this->TM = &static_cast<const MipsTargetMachine &>(TM);
}
@@ -106,7 +108,7 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV,
GV->hasCommonLinkage()))
return false;
- Type *Ty = GV->getType()->getElementType();
+ Type *Ty = GV->getValueType();
return IsInSmallSection(
GV->getParent()->getDataLayout().getTypeAllocSize(Ty));
}
@@ -138,11 +140,13 @@ bool MipsTargetObjectFile::IsConstantInSmallSection(
}
/// Return true if this constant should be placed into small data section.
-MCSection *MipsTargetObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+MCSection *MipsTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+ SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const {
if (IsConstantInSmallSection(DL, C, *TM))
return SmallDataSection;
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C);
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
}
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index ba04343bad87..9840769aff69 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -40,7 +40,8 @@ class MipsTargetMachine;
const TargetMachine &TM) const;
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C) const override;
+ const Constant *C,
+ unsigned &Align) const override;
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index b3222f5d89ef..41ebe411b98d 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -13,6 +13,7 @@
#include "MCTargetDesc/MipsABIFlagsSection.h"
#include "MCTargetDesc/MipsABIInfo.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -24,8 +25,12 @@ struct MipsABIFlagsSection;
class MipsTargetStreamer : public MCTargetStreamer {
public:
MipsTargetStreamer(MCStreamer &S);
+
+ virtual void setPic(bool Value) {}
+
virtual void emitDirectiveSetMicroMips();
virtual void emitDirectiveSetNoMicroMips();
+ virtual void setUsesMicroMips();
virtual void emitDirectiveSetMips16();
virtual void emitDirectiveSetNoMips16();
@@ -78,8 +83,9 @@ public:
// PIC support
virtual void emitDirectiveCpLoad(unsigned RegNo);
- virtual void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
- int Offset);
+ virtual bool emitDirectiveCpRestore(int Offset,
+ function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg);
virtual void emitDirectiveCpreturn(unsigned SaveLocation,
@@ -94,6 +100,54 @@ public:
virtual void emitDirectiveSetOddSPReg();
virtual void emitDirectiveSetNoOddSPReg();
+ void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit,
+ const MCSubtargetInfo *STI);
+ void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
+ SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI);
+
+ /// Emit a store instruction with an offset. If the offset is out of range
+ /// then it will be synthesized using the assembler temporary.
+ ///
+ /// GetATReg() is a callback that can be used to obtain the current assembler
+ /// temporary and is only called when the assembler temporary is required. It
+ /// must handle the case where no assembler temporary is available (typically
+ /// by reporting an error).
+ void emitStoreWithImmOffset(unsigned Opcode, unsigned SrcReg,
+ unsigned BaseReg, int64_t Offset,
+ function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg,
+ unsigned BaseReg, MCOperand &HiOperand,
+ MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+ int64_t Offset, unsigned TmpReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+ MCOperand &HiOperand, MCOperand &LoOperand,
+ unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
+ void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI);
+
void forbidModuleDirective() { ModuleDirectiveAllowed = false; }
void reallowModuleDirective() { ModuleDirectiveAllowed = true; }
bool isModuleDirectiveAllowed() { return ModuleDirectiveAllowed; }
@@ -193,8 +247,16 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
- void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
- int Offset) override;
+
+ /// Emit a .cprestore directive. If the offset is out of range then it will
+ /// be synthesized using the assembler temporary.
+ ///
+ /// GetATReg() is a callback that can be used to obtain the current assembler
+ /// temporary and is only called when the assembler temporary is required. It
+ /// must handle the case where no assembler temporary is available (typically
+ /// by reporting an error).
+ bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) override;
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
void emitDirectiveCpreturn(unsigned SaveLocation,
@@ -221,12 +283,15 @@ public:
MCELFStreamer &getStreamer();
MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+ void setPic(bool Value) override { Pic = Value; }
+
void emitLabel(MCSymbol *Symbol) override;
void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
void finish() override;
void emitDirectiveSetMicroMips() override;
void emitDirectiveSetNoMicroMips() override;
+ void setUsesMicroMips() override;
void emitDirectiveSetMips16() override;
void emitDirectiveSetNoReorder() override;
@@ -246,8 +311,8 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
- void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
- int Offset) override;
+ bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+ SMLoc IDLoc, const MCSubtargetInfo *STI) override;
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
void emitDirectiveCpreturn(unsigned SaveLocation,
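The reworked PIC and load/store helpers above (emitDirectiveCpRestore, emitStoreWithImmOffset) take a function_ref<unsigned()> callback so the assembler temporary is requested only when an offset actually needs to be synthesized. A hedged sketch of a caller, assuming the in-tree MipsTargetStreamer.h header; the lambda body and the returned register value are placeholders, only the streamer signature comes from the declarations above:
// Illustrative caller only: emitDirectiveCpRestore() has the signature
// declared above; the lambda stands in for whatever the asm parser uses to
// hand out the assembler temporary (or to diagnose '.set noat').
#include "MipsTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"      // llvm::function_ref
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/SMLoc.h"

static bool emitCpRestoreAt(llvm::MipsTargetStreamer &TS, int Offset,
                            llvm::SMLoc Loc,
                            const llvm::MCSubtargetInfo *STI) {
  auto RequestAT = []() -> unsigned {
    // Placeholder: a real caller returns the current $at register number,
    // or reports an error and returns 0 when no temporary is available.
    return 0;
  };
  // The callback is invoked only if Offset cannot be encoded directly.
  return TS.emitDirectiveCpRestore(Offset, RequestAT, Loc, STI);
}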
diff --git a/lib/Target/Mips/TargetInfo/Makefile b/lib/Target/Mips/TargetInfo/Makefile
deleted file mode 100644
index 32f4e1695b1d..000000000000
--- a/lib/Target/Mips/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Mips/TargetInfo/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMipsInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 05fe06dbc07c..b67c40500861 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
+ NVPTXInferAddressSpaces.cpp
NVPTXInstrInfo.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerKernelArgs.cpp
@@ -31,6 +32,7 @@ set(NVPTXCodeGen_sources
NVPTXTargetMachine.cpp
NVPTXTargetTransformInfo.cpp
NVPTXUtilities.cpp
+ NVVMIntrRange.cpp
NVVMReflect.cpp
)
diff --git a/lib/Target/NVPTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile
deleted file mode 100644
index 7b7865436bf3..000000000000
--- a/lib/Target/NVPTX/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXAsmPrinter
-
-# Hack: we need to include 'main' ptx target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile
deleted file mode 100644
index 31d06cb5948d..000000000000
--- a/lib/Target/NVPTX/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index ef36c13b49f1..78bdf4e698d8 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -34,13 +34,16 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
HasSingleParameterDotFile = false;
- InlineAsmStart = " inline asm";
- InlineAsmEnd = " inline asm";
+ InlineAsmStart = " begin inline asm";
+ InlineAsmEnd = " end inline asm";
SupportsDebugInformation = CompileForDebugging;
// PTX does not allow .align on functions.
HasFunctionAlignment = false;
HasDotTypeDotSizeDirective = false;
+ // PTX does not allow .hidden or .protected
+ HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
+ ProtectedVisibilityAttr = MCSA_Invalid;
Data8bitsDirective = " .b8 ";
Data16bitsDirective = " .b16 ";
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index ad7302037cad..e356a965a04b 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -14,7 +14,6 @@
#include "NVPTXMCTargetDesc.h"
#include "InstPrinter/NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -49,18 +48,6 @@ createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
- // The default relocation model is used regardless of what the client has
- // specified, as it is the only relocation model currently supported.
- X->initMCCodeGenInfo(Reloc::Default, CM, OL);
- return X;
-}
-
static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -77,9 +64,6 @@ extern "C" void LLVMInitializeNVPTXTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T);
- // Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createNVPTXMCCodeGenInfo);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createNVPTXMCInstrInfo);
diff --git a/lib/Target/NVPTX/Makefile b/lib/Target/NVPTX/Makefile
deleted file mode 100644
index 8db20ebed2c2..000000000000
--- a/lib/Target/NVPTX/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMNVPTXCodeGen
-TARGET = NVPTX
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = NVPTXGenAsmWriter.inc \
- NVPTXGenDAGISel.inc \
- NVPTXGenInstrInfo.inc \
- NVPTXGenRegisterInfo.inc \
- NVPTXGenSubtargetInfo.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index e5fae85bacf2..e91385ac13f2 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -46,8 +46,10 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMPass();
FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
-ModulePass *createNVVMReflectPass();
-ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+FunctionPass *createNVPTXInferAddressSpacesPass();
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMReflectPass();
+FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
@@ -55,8 +57,6 @@ FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
BasicBlockPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
-bool isImageOrSamplerVal(const Value *, const Module *);
-
extern Target TheNVPTXTarget32;
extern Target TheNVPTXTarget64;
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index 96abfa859119..032991a20cc9 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -44,6 +44,12 @@ def SM52 : SubtargetFeature<"sm_52", "SmVersion", "52",
"Target SM 5.2">;
def SM53 : SubtargetFeature<"sm_53", "SmVersion", "53",
"Target SM 5.3">;
+def SM60 : SubtargetFeature<"sm_60", "SmVersion", "60",
+ "Target SM 6.0">;
+def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61",
+ "Target SM 6.1">;
+def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
+ "Target SM 6.2">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -54,6 +60,10 @@ def PTX41 : SubtargetFeature<"ptx41", "PTXVersion", "41",
"Use PTX version 4.1">;
def PTX42 : SubtargetFeature<"ptx42", "PTXVersion", "42",
"Use PTX version 4.2">;
+def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43",
+ "Use PTX version 4.3">;
+def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
+ "Use PTX version 5.0">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -71,7 +81,9 @@ def : Proc<"sm_37", [SM37, PTX41]>;
def : Proc<"sm_50", [SM50, PTX40]>;
def : Proc<"sm_52", [SM52, PTX41]>;
def : Proc<"sm_53", [SM53, PTX42]>;
-
+def : Proc<"sm_60", [SM60, PTX50]>;
+def : Proc<"sm_61", [SM61, PTX50]>;
+def : Proc<"sm_62", [SM62, PTX50]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index e8c36089a779..660016bfcd05 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -117,7 +117,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
if (ignoreLoc(MI))
return;
- DebugLoc curLoc = MI.getDebugLoc();
+ const DebugLoc &curLoc = MI.getDebugLoc();
if (!prevDebugLoc && !curLoc)
return;
@@ -277,7 +277,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
break;
case MachineOperand::MO_FPImmediate: {
const ConstantFP *Cnt = MO.getFPImm();
- APFloat Val = Cnt->getValueAPF();
+ const APFloat &Val = Cnt->getValueAPF();
switch (Cnt->getType()->getTypeID()) {
default: report_fatal_error("Unsupported FP type"); break;
@@ -432,7 +432,8 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
continue;
}
if (const BasicBlock *PBB = PMBB->getBasicBlock()) {
- if (MDNode *LoopID = PBB->getTerminator()->getMetadata("llvm.loop")) {
+ if (MDNode *LoopID =
+ PBB->getTerminator()->getMetadata(LLVMContext::MD_loop)) {
if (GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable"))
return true;
}
@@ -798,10 +799,18 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
if (filenameMap.find(Filename) != filenameMap.end())
continue;
filenameMap[Filename] = i;
+ OutStreamer->EmitDwarfFileDirective(i, "", Filename);
++i;
}
}
+static bool isEmptyXXStructor(GlobalVariable *GV) {
+ if (!GV) return true;
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!InitList) return true; // Not an array; we don't know how to parse.
+ return InitList->getNumOperands() == 0;
+}
+
bool NVPTXAsmPrinter::doInitialization(Module &M) {
// Construct a default subtarget off of the TargetMachine defaults. The
// rest of NVPTX isn't friendly to change subtargets per function and
@@ -812,6 +821,21 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
const NVPTXSubtarget STI(TT, CPU, FS, NTM);
+ if (M.alias_size()) {
+ report_fatal_error("Module has aliases, which NVPTX does not support.");
+ return true; // error
+ }
+ if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
+ report_fatal_error(
+ "Module has a nontrivial global ctor, which NVPTX does not support.");
+ return true; // error
+ }
+ if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
+ report_fatal_error(
+ "Module has a nontrivial global dtor, which NVPTX does not support.");
+ return true; // error
+ }
+
SmallString<128> Str1;
raw_svector_ostream OS1(Str1);
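The new guards above make the NVPTX printer fail fast on module constructs PTX cannot represent: aliases and non-empty llvm.global_ctors/llvm.global_dtors arrays (isEmptyXXStructor treats a missing or non-array initializer as empty). A small sketch, assuming the stock ModuleUtils helper, of building a module these checks would now reject; moduleHasNontrivialCtor() restates the same test for clarity:
// Sketch only: appendToGlobalCtors() is the standard LLVM utility; the rest
// is illustrative scaffolding around the llvm.global_ctors test above.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

static bool moduleHasNontrivialCtor(llvm::Module &M) {
  auto *GV = M.getNamedGlobal("llvm.global_ctors");
  if (!GV || !GV->hasInitializer())
    return false;                     // Absent initializer: treated as empty.
  auto *Init = llvm::dyn_cast<llvm::ConstantArray>(GV->getInitializer());
  return Init && Init->getNumOperands() != 0;
}

static void buildRejectedModule(llvm::LLVMContext &Ctx) {
  llvm::Module M("m", Ctx);
  auto *FTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);
  auto *Ctor = llvm::Function::Create(
      FTy, llvm::GlobalValue::InternalLinkage, "ctor", &M);
  llvm::appendToGlobalCtors(M, Ctor, /*Priority=*/65535);
  // NVPTXAsmPrinter::doInitialization() would now report_fatal_error() on M.
  (void)moduleHasNontrivialCtor(M);   // true
}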
@@ -1017,7 +1041,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// Skip meta data
if (GVar->hasSection()) {
- if (GVar->getSection() == StringRef("llvm.metadata"))
+ if (GVar->getSection() == "llvm.metadata")
return;
}
@@ -1030,7 +1054,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// GlobalVariables are always constant pointers themselves.
PointerType *PTy = GVar->getType();
- Type *ETy = PTy->getElementType();
+ Type *ETy = GVar->getValueType();
if (GVar->hasExternalLinkage()) {
if (GVar->hasInitializer())
@@ -1341,11 +1365,10 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
const DataLayout &DL = getDataLayout();
// GlobalVariables are always constant pointers themselves.
- PointerType *PTy = GVar->getType();
- Type *ETy = PTy->getElementType();
+ Type *ETy = GVar->getValueType();
O << ".";
- emitPTXAddressSpace(PTy->getAddressSpace(), O);
+ emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O);
if (GVar->getAlignment() == 0)
O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
else
@@ -1429,6 +1452,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
MVT thePointerTy = TLI->getPointerTy(DL);
+ if (F->arg_empty()) {
+ O << "()\n";
+ return;
+ }
+
O << "(\n";
for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
@@ -1715,9 +1743,8 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
return;
}
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
- PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
bool IsNonGenericPointer = false;
- if (PTy && PTy->getAddressSpace() != 0) {
+ if (GVar->getType()->getAddressSpace() != 0) {
IsNonGenericPointer = true;
}
if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
@@ -1883,8 +1910,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::ArrayTyID:
case Type::VectorTyID:
case Type::StructTyID: {
- if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
- isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
+ if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
int ElementSize = DL.getTypeAllocSize(CPV->getType());
bufferAggregateConstant(CPV, aggBuffer);
if (Bytes > ElementSize)
@@ -2315,7 +2341,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
this->OutStreamer->EmitRawText(temp.str());
}
-LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
if (!reader) {
reader = new LineReader(filename);
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 76bf179896a8..85660fbdb26e 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -18,14 +18,14 @@
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetMachine.h"
#include <fstream>
@@ -293,7 +293,7 @@ private:
bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
LineReader *reader;
- LineReader *getReader(std::string);
+ LineReader *getReader(const std::string &);
// Used to control the need to emit .generic() in the initializer of
// module scope variables.
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
index 95813c8430d1..7c5a54162d77 100644
--- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -7,6 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
+// FIXME: This pass is deprecated in favor of NVPTXInferAddressSpaces, which
+// uses a new algorithm that handles pointer induction variables.
+//
// When a load/store accesses the generic address space, checks whether the
// address is casted from a non-generic address space. If so, remove this
// addrspacecast because accessing non-generic address spaces is typically
@@ -164,8 +167,8 @@ Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
GEP->getSourceElementType(), Cast->getOperand(0), Indices,
"", GEPI);
NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI);
- NewASC->takeName(GEP);
// Without RAUWing GEP, the compiler would visit GEP again and emit
// redundant instructions. This is exercised in test @rauw in
// access-non-generic.ll.
@@ -263,7 +266,7 @@ bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
}
bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
- if (DisableFavorNonGeneric)
+ if (DisableFavorNonGeneric || skipFunction(F))
return false;
bool Changed = false;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 9b34aef3fdec..bbcb497ead9d 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -16,7 +16,6 @@
#include "NVPTXRegisterInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -35,7 +34,7 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
if (MF.getFrameInfo()->hasStackObjects()) {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- MachineInstr *MI = MBB.begin();
+ MachineInstr *MI = &MBB.front();
MachineRegisterInfo &MR = MF.getRegInfo();
// This instruction really occurs before first instruction
@@ -70,10 +69,10 @@ void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
-void NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
// Simply discard ADJCALLSTACKDOWN,
// ADJCALLSTACKUP instructions.
- MBB.erase(I);
+ return MBB.erase(I);
}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 14f8bb7b98fe..320ca9a2f095 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -26,7 +26,7 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
};
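eliminateCallFramePseudoInstr() now returns the iterator produced by MBB.erase(I) instead of void, so a caller that is walking the block can resume at the instruction that followed the erased pseudo. The same erase-and-continue idiom with a plain standard container, as a minimal self-contained sketch (nothing here is LLVM-specific):
// Minimal sketch of the iterator-returning erase idiom adopted above:
// erase() hands back the next valid iterator, so the loop never touches an
// invalidated one.
#include <list>

static void eraseEvens(std::list<int> &Values) {
  for (auto I = Values.begin(); I != Values.end();) {
    if (*I % 2 == 0)
      I = Values.erase(I);   // continue from the element after the erased one
    else
      ++I;
  }
}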
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 62ca5e9f9f62..66a964082c5f 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -86,7 +86,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
!llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
!llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getType()->getElementType(), GV->isConstant(),
+ M, GV->getValueType(), GV->isConstant(),
GV->getLinkage(),
GV->hasInitializer() ? GV->getInitializer() : nullptr,
"", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
@@ -172,7 +172,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
// See if the address space conversion requires the operand to be bitcast
// to i8 addrspace(n)* first.
- EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+ EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true);
if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
// A bitcast to i8 addrspace(n)* on the operand is needed.
LLVMContext &Context = M->getContext();
@@ -182,21 +182,18 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
// Insert the address space conversion.
Type *ResultType =
PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
- SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(ResultType);
- ParamTypes.push_back(DestTy);
Function *CVTAFunction = Intrinsic::getDeclaration(
- M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy});
CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
// Another bitcast from i8 * to <the element type of GVType> * is
// required.
DestTy =
- PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+ PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
} else {
// A simple CVTA is enough.
SmallVector<Type *, 2> ParamTypes;
- ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+ ParamTypes.push_back(PointerType::get(GV->getValueType(),
llvm::ADDRESS_SPACE_GENERIC));
ParamTypes.push_back(GVType);
Function *CVTAFunction = Intrinsic::getDeclaration(
@@ -230,8 +227,7 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
if (I != GVMap.end()) {
NewValue = getOrInsertCVTA(M, F, I->second, Builder);
}
- } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
- isa<ConstantStruct>(C)) {
+ } else if (isa<ConstantAggregate>(C)) {
// If any element in the constant vector or aggregate C is or uses a global
// variable in GVMap, the constant C needs to be reconstructed, using a set
// of instructions.
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2d0098b392f4..61c6758ef118 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -105,57 +105,66 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
-SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
+void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
- SDNode *ResNode = nullptr;
switch (N->getOpcode()) {
case ISD::LOAD:
- ResNode = SelectLoad(N);
+ if (tryLoad(N))
+ return;
break;
case ISD::STORE:
- ResNode = SelectStore(N);
+ if (tryStore(N))
+ return;
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
- ResNode = SelectLoadVector(N);
+ if (tryLoadVector(N))
+ return;
break;
case NVPTXISD::LDGV2:
case NVPTXISD::LDGV4:
case NVPTXISD::LDUV2:
case NVPTXISD::LDUV4:
- ResNode = SelectLDGLDU(N);
+ if (tryLDGLDU(N))
+ return;
break;
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
- ResNode = SelectStoreVector(N);
+ if (tryStoreVector(N))
+ return;
break;
case NVPTXISD::LoadParam:
case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadParamV4:
- ResNode = SelectLoadParam(N);
+ if (tryLoadParam(N))
+ return;
break;
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
- ResNode = SelectStoreRetval(N);
+ if (tryStoreRetval(N))
+ return;
break;
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreParamS32:
case NVPTXISD::StoreParamU32:
- ResNode = SelectStoreParam(N);
+ if (tryStoreParam(N))
+ return;
break;
case ISD::INTRINSIC_WO_CHAIN:
- ResNode = SelectIntrinsicNoChain(N);
+ if (tryIntrinsicNoChain(N))
+ return;
break;
case ISD::INTRINSIC_W_CHAIN:
- ResNode = SelectIntrinsicChain(N);
+ if (tryIntrinsicChain(N))
+ return;
break;
case NVPTXISD::Tex1DFloatS32:
case NVPTXISD::Tex1DFloatFloat:
@@ -325,7 +334,8 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Tld4UnifiedG2DU64Float:
case NVPTXISD::Tld4UnifiedB2DU64Float:
case NVPTXISD::Tld4UnifiedA2DU64Float:
- ResNode = SelectTextureIntrinsic(N);
+ if (tryTextureIntrinsic(N))
+ return;
break;
case NVPTXISD::Suld1DI8Clamp:
case NVPTXISD::Suld1DI16Clamp:
@@ -492,37 +502,37 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Suld3DV4I8Zero:
case NVPTXISD::Suld3DV4I16Zero:
case NVPTXISD::Suld3DV4I32Zero:
- ResNode = SelectSurfaceIntrinsic(N);
+ if (trySurfaceIntrinsic(N))
+ return;
break;
case ISD::AND:
case ISD::SRA:
case ISD::SRL:
// Try to select BFE
- ResNode = SelectBFE(N);
+ if (tryBFE(N))
+ return;
break;
case ISD::ADDRSPACECAST:
- ResNode = SelectAddrSpaceCast(N);
- break;
+ SelectAddrSpaceCast(N);
+ return;
default:
break;
}
- if (ResNode)
- return ResNode;
- return SelectCode(N);
+ SelectCode(N);
}
-SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IID) {
default:
- return NULL;
+ return false;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_p:
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
}
@@ -579,25 +589,26 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
default:
- return nullptr;
+ return false;
case Intrinsic::nvvm_texsurf_handle_internal:
- return SelectTexSurfHandle(N);
+ SelectTexSurfHandle(N);
+ return true;
}
}
-SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
// Op 0 is the intrinsic ID
SDValue Wrapper = N->getOperand(1);
SDValue GlobalVal = Wrapper.getOperand(0);
- return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64,
- GlobalVal);
+ ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
+ MVT::i64, GlobalVal));
}
-SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
SDValue Src = N->getOperand(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
@@ -624,7 +635,9 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
break;
}
- return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+ Src));
+ return;
} else {
// Generic to specific
if (SrcAddrSpace != 0)
@@ -653,11 +666,13 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
: NVPTX::nvvm_ptr_gen_to_param;
break;
}
- return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+ Src));
+ return;
}
}
-SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
@@ -665,16 +680,16 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
// do not support pre/post inc/dec
if (LD->isIndexed())
- return nullptr;
+ return false;
if (!LoadedVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
// Volatile Setting
@@ -695,7 +710,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
else if (num == 4)
vecType = NVPTX::PTXLdStInstCode::V4;
else
- return nullptr;
+ return false;
}
// Type Setting: fromType + fromTypeWidth
@@ -744,7 +759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_avar;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
@@ -772,7 +787,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_asi;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
@@ -801,7 +816,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_ari_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (TargetVT) {
@@ -824,7 +839,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_ari;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
@@ -853,7 +868,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_areg_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (TargetVT) {
@@ -876,7 +891,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
Opcode = NVPTX::LD_f64_areg;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
@@ -885,16 +900,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
}
- if (NVPTXLD) {
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- }
+ if (!NVPTXLD)
+ return false;
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return NVPTXLD;
+ ReplaceNode(N, NVPTXLD);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -906,13 +923,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
EVT LoadedVT = MemSD->getMemoryVT();
if (!LoadedVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
- return SelectLDGLDU(N);
+ return tryLDGLDU(N);
}
// Volatile Setting
@@ -956,7 +973,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
VecType = NVPTX::PTXLdStInstCode::V4;
break;
default:
- return nullptr;
+ return false;
}
EVT EltVT = N->getValueType(0);
@@ -964,11 +981,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_avar;
break;
@@ -992,7 +1009,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_avar;
break;
@@ -1017,11 +1034,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_asi;
break;
@@ -1045,7 +1062,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_asi;
break;
@@ -1071,11 +1088,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_ari_64;
break;
@@ -1099,7 +1116,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_ari_64;
break;
@@ -1118,11 +1135,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_ari;
break;
@@ -1146,7 +1163,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_ari;
break;
@@ -1173,11 +1190,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_areg_64;
break;
@@ -1201,7 +1218,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_areg_64;
break;
@@ -1220,11 +1237,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v2_areg;
break;
@@ -1248,7 +1265,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
case NVPTXISD::LoadV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::LDV_i8_v4_areg;
break;
@@ -1276,17 +1293,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return LD;
+ ReplaceNode(N, LD);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1;
MemSDNode *Mem;
bool IsLDG = true;
- // If this is an LDG intrinsic, the address is the third operand. Its its an
+ // If this is an LDG intrinsic, the address is the third operand. If its an
// LDG/LDU SD node (from custom vector handling), then its the second operand
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
Op1 = N->getOperand(2);
@@ -1294,7 +1312,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IID) {
default:
- return NULL;
+ return false;
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_p:
@@ -1317,19 +1335,32 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Base, Offset, Addr;
EVT EltVT = Mem->getMemoryVT();
+ unsigned NumElts = 1;
if (EltVT.isVector()) {
+ NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
}
+ // Build the "promoted" result VTList for the load. If we are really loading
+ // i8s, then the return type will be promoted to i16 since we do not expose
+ // 8-bit registers in NVPTX.
+ EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
+ SmallVector<EVT, 5> InstVTs;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ InstVTs.push_back(NodeVT);
+ }
+ InstVTs.push_back(MVT::Other);
+ SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
break;
@@ -1352,7 +1383,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
break;
@@ -1377,7 +1408,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
break;
@@ -1401,7 +1432,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
break;
@@ -1425,7 +1456,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
break;
@@ -1443,7 +1474,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
break;
@@ -1461,19 +1492,19 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
break;
@@ -1496,7 +1527,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
break;
@@ -1522,7 +1553,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
break;
@@ -1546,7 +1577,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
break;
@@ -1571,7 +1602,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
break;
@@ -1589,7 +1620,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
break;
@@ -1608,13 +1639,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
break;
@@ -1637,7 +1668,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
break;
@@ -1663,7 +1694,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
break;
@@ -1687,7 +1718,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
break;
@@ -1712,7 +1743,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
break;
@@ -1730,7 +1761,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
break;
@@ -1750,18 +1781,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Ops[] = { Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
break;
@@ -1784,7 +1815,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
break;
@@ -1810,7 +1841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
break;
@@ -1834,7 +1865,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
break;
@@ -1859,7 +1890,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
break;
@@ -1877,7 +1908,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
break;
@@ -1896,13 +1927,13 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
break;
@@ -1925,7 +1956,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} else {
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
break;
@@ -1951,7 +1982,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
break;
@@ -1975,7 +2006,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
break;
@@ -2000,7 +2031,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
break;
@@ -2018,7 +2049,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
case NVPTXISD::LDUV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
break;
@@ -2037,17 +2068,54 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
SDValue Ops[] = { Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = Mem->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return LD;
+ // For automatic generation of LDG (through SelectLoad[Vector], not the
+ // intrinsics), we may have an extending load like:
+ //
+ // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
+ //
+ // Here the matching logic above selects a load for the original memory type
+ // (i8), so the selected node's result type does not match the i32 that the
+ // DAG expects. Our LDG/LDU nodes do not support the concept of
+ // sign-/zero-extension, so emulate it here by adding an explicit CVT
+ // instruction. ptxas should clean up any redundancies here.
+
+ EVT OrigType = N->getValueType(0);
+ LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+ if (OrigType != EltVT && LdNode) {
+ // We have an extending-load. The instruction we selected operates on the
+ // smaller type, but the SDNode we are replacing has the larger type. We
+ // need to emit a CVT to make the types match.
+ bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+ unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+ EltVT.getSimpleVT(), IsSigned);
+
+ // For each output value, apply the manual sign/zero-extension and make sure
+ // all users of the load go through that CVT.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Res(LD, i);
+ SDValue OrigVal(N, i);
+
+ SDNode *CvtNode =
+ CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
+ CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ DL, MVT::i32));
+ ReplaceUses(OrigVal, SDValue(CvtNode, 0));
+ }
+ }
+
+ ReplaceNode(N, LD);
+ return true;
}
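
The comment inside the hunk above describes emulating sign-/zero-extension with an explicit CVT inserted after the selected load. As a standalone illustration of that mapping only (this is not the in-tree helper; the patch itself maps types to NVPTX machine opcodes in GetConvertOpcode further down, and the names and widths here are purely for demonstration):

#include <cassert>
#include <string>

// Build the PTX "cvt" mnemonic that widens a narrow loaded value, mirroring
// the fixup above: the load is selected for the narrow memory type and an
// explicit conversion produces the wider value the DAG expects.
static std::string cvtMnemonic(unsigned DstBits, unsigned SrcBits, bool Signed) {
  const char *Prefix = Signed ? "s" : "u";
  return std::string("cvt.") + Prefix + std::to_string(DstBits) + "." +
         Prefix + std::to_string(SrcBits);
}

int main() {
  // The "zext from i8" example above: an i8 load whose result is used as i32.
  assert(cvtMnemonic(32, 8, /*Signed=*/false) == "cvt.u32.u8");
  // A sign-extending i16 load used as i64 would instead need cvt.s64.s16.
  assert(cvtMnemonic(64, 16, /*Signed=*/true) == "cvt.s64.s16");
  return 0;
}
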
-SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
EVT StoreVT = ST->getMemoryVT();
@@ -2055,10 +2123,10 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
// do not support pre/post inc/dec
if (ST->isIndexed())
- return nullptr;
+ return false;
if (!StoreVT.isSimple())
- return nullptr;
+ return false;
// Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(ST);
@@ -2081,7 +2149,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
else if (num == 4)
vecType = NVPTX::PTXLdStInstCode::V4;
else
- return nullptr;
+ return false;
}
// Type Setting: toType + toTypeWidth
@@ -2125,7 +2193,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_avar;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
@@ -2154,7 +2222,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_asi;
break;
default:
- return nullptr;
+ return false;
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
@@ -2184,7 +2252,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_ari_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (SourceVT) {
@@ -2207,7 +2275,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_ari;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
@@ -2237,7 +2305,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_areg_64;
break;
default:
- return nullptr;
+ return false;
}
} else {
switch (SourceVT) {
@@ -2260,7 +2328,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
Opcode = NVPTX::ST_f64_areg;
break;
default:
- return nullptr;
+ return false;
}
}
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
@@ -2270,16 +2338,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
}
- if (NVPTXST) {
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
- }
+ if (!NVPTXST)
+ return false;
- return NVPTXST;
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ ReplaceNode(N, NVPTXST);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
@@ -2337,7 +2406,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
N2 = N->getOperand(5);
break;
default:
- return nullptr;
+ return false;
}
StOps.push_back(getI32Imm(IsVolatile, DL));
@@ -2349,11 +2418,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (SelectDirectAddr(N2, Addr)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_avar;
break;
@@ -2377,7 +2446,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_avar;
break;
@@ -2398,11 +2467,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_asi;
break;
@@ -2426,7 +2495,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_asi;
break;
@@ -2449,11 +2518,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_ari_64;
break;
@@ -2477,7 +2546,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_ari_64;
break;
@@ -2496,11 +2565,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_ari;
break;
@@ -2524,7 +2593,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_ari;
break;
@@ -2547,11 +2616,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_areg_64;
break;
@@ -2575,7 +2644,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_areg_64;
break;
@@ -2594,11 +2663,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
} else {
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v2_areg;
break;
@@ -2622,7 +2691,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
case NVPTXISD::StoreV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i8:
Opcode = NVPTX::STV_i8_v4_areg;
break;
@@ -2650,10 +2719,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return ST;
+ ReplaceNode(N, ST);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
+bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
SDValue Chain = Node->getOperand(0);
SDValue Offset = Node->getOperand(2);
SDValue Flag = Node->getOperand(3);
@@ -2663,7 +2733,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
unsigned VecSize;
switch (Node->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::LoadParam:
VecSize = 1;
break;
@@ -2682,11 +2752,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
switch (VecSize) {
default:
- return nullptr;
+ return false;
case 1:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemI8;
break;
@@ -2713,7 +2783,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
case 2:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemV2I8;
break;
@@ -2740,7 +2810,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
case 4:
switch (MemVT.getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opc = NVPTX::LoadParamMemV4I8;
break;
@@ -2777,10 +2847,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
Ops.push_back(Chain);
Ops.push_back(Flag);
- return CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Offset = N->getOperand(1);
@@ -2791,7 +2862,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
unsigned NumElts = 1;
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreRetval:
NumElts = 1;
break;
@@ -2816,11 +2887,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
unsigned Opcode = 0;
switch (NumElts) {
default:
- return nullptr;
+ return false;
case 1:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalI8;
break;
@@ -2847,7 +2918,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
case 2:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalV2I8;
break;
@@ -2874,7 +2945,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
case 4:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreRetvalV4I8;
break;
@@ -2900,10 +2971,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return Ret;
+ ReplaceNode(N, Ret);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Param = N->getOperand(1);
@@ -2917,7 +2989,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
unsigned NumElts = 1;
switch (N->getOpcode()) {
default:
- return nullptr;
+ return false;
case NVPTXISD::StoreParamU32:
case NVPTXISD::StoreParamS32:
case NVPTXISD::StoreParam:
@@ -2948,11 +3020,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
default:
switch (NumElts) {
default:
- return nullptr;
+ return false;
case 1:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamI8;
break;
@@ -2979,7 +3051,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
case 2:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamV2I8;
break;
@@ -3006,7 +3078,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
case 4:
switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
default:
- return nullptr;
+ return false;
case MVT::i1:
Opcode = NVPTX::StoreParamV4I8;
break;
@@ -3056,17 +3128,17 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return Ret;
+ ReplaceNode(N, Ret);
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
SDValue Chain = N->getOperand(0);
- SDNode *Ret = nullptr;
unsigned Opc = 0;
SmallVector<SDValue, 8> Ops;
switch (N->getOpcode()) {
- default: return nullptr;
+ default: return false;
case NVPTXISD::Tex1DFloatS32:
Opc = NVPTX::TEX_1D_F32_S32;
break;
@@ -3579,18 +3651,17 @@ SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
}
Ops.push_back(Chain);
- Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
- return Ret;
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+ return true;
}
-SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
+bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue TexHandle = N->getOperand(1);
- SDNode *Ret = nullptr;
unsigned Opc = 0;
SmallVector<SDValue, 8> Ops;
switch (N->getOpcode()) {
- default: return nullptr;
+ default: return false;
case NVPTXISD::Suld1DI8Clamp:
Opc = NVPTX::SULD_1D_I8_CLAMP;
Ops.push_back(TexHandle);
@@ -4780,14 +4851,14 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
Ops.push_back(Chain);
break;
}
- Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
- return Ret;
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+ return true;
}
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
-SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -4806,7 +4877,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
if (!Mask) {
// We need a constant mask on the RHS of the AND
- return NULL;
+ return false;
}
// Extract the mask bits
@@ -4815,7 +4886,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// We *could* handle shifted masks here, but doing so would require an
// 'and' operation to fix up the low-order bits so we would trade
// shr+and for bfe+and, which has the same throughput
- return NULL;
+ return false;
}
// How many bits are in our mask?
@@ -4836,7 +4907,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// Do not handle the case where bits have been shifted in. In theory
// we could handle this, but the cost is likely higher than just
// emitting the srl/and pair.
- return NULL;
+ return false;
}
Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
} else {
@@ -4844,20 +4915,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
// was found) is not constant. We could handle this case, but it would
// require run-time logic that would be more expensive than just
// emitting the srl/and pair.
- return NULL;
+ return false;
}
} else {
// Do not handle the case where the LHS of the and is not a shift. While
// it would be trivial to handle this case, it would just transform
// 'and' -> 'bfe', but 'and' has higher-throughput.
- return NULL;
+ return false;
}
} else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
if (LHS->getOpcode() == ISD::AND) {
ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
if (!ShiftCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t ShiftAmt = ShiftCnst->getZExtValue();
@@ -4873,7 +4944,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
if (!MaskCnst) {
// Mask must be constant
- return NULL;
+ return false;
}
uint64_t MaskVal = MaskCnst->getZExtValue();
@@ -4893,13 +4964,13 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
NumBits = NumZeros + NumOnes - ShiftAmt;
} else {
// This is not a mask we can handle
- return NULL;
+ return false;
}
if (ShiftAmt < NumZeros) {
// Handling this case would require extra logic that would make this
// transformation non-profitable
- return NULL;
+ return false;
}
Val = AndLHS;
@@ -4919,7 +4990,7 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
if (!ShlCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
@@ -4927,20 +4998,20 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
if (!ShrCnst) {
// Shift amount must be constant
- return NULL;
+ return false;
}
uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
// To avoid extra codegen and be profitable, we need Outer >= Inner
if (OuterShiftAmt < InnerShiftAmt) {
- return NULL;
+ return false;
}
// If the outer shift is more than the type size, we have no bitfield to
// extract (since we also check that the inner shift is <= the outer shift
// then this also implies that the inner shift is < the type size)
if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
- return NULL;
+ return false;
}
Start =
@@ -4956,11 +5027,11 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
}
} else {
// No can do...
- return NULL;
+ return false;
}
} else {
// No can do...
- return NULL;
+ return false;
}
@@ -4981,14 +5052,15 @@ SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
}
} else {
// We cannot handle this type
- return NULL;
+ return false;
}
SDValue Ops[] = {
Val, Start, Len
};
- return CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
+ return true;
}
// SelectDirectAddr - Match a direct address for DAG.
@@ -5122,3 +5194,57 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
}
return true;
}
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+ bool IsSigned) {
+ switch (SrcTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled source type");
+ case MVT::i8:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+ }
+ case MVT::i16:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+ }
+ case MVT::i32:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+ case MVT::i64:
+ return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+ }
+ case MVT::i64:
+ switch (DestTy.SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled dest type");
+ case MVT::i8:
+ return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+ case MVT::i16:
+ return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+ case MVT::i32:
+ return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+ }
+ }
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index fe20580c83a2..d53c92f1eff3 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -21,9 +21,8 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
-using namespace llvm;
-namespace {
+namespace llvm {
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
@@ -54,24 +53,24 @@ private:
// Include the pieces autogenerated from the target description.
#include "NVPTXGenDAGISel.inc"
- SDNode *Select(SDNode *N) override;
- SDNode *SelectIntrinsicNoChain(SDNode *N);
- SDNode *SelectIntrinsicChain(SDNode *N);
- SDNode *SelectTexSurfHandle(SDNode *N);
- SDNode *SelectLoad(SDNode *N);
- SDNode *SelectLoadVector(SDNode *N);
- SDNode *SelectLDGLDU(SDNode *N);
- SDNode *SelectStore(SDNode *N);
- SDNode *SelectStoreVector(SDNode *N);
- SDNode *SelectLoadParam(SDNode *N);
- SDNode *SelectStoreRetval(SDNode *N);
- SDNode *SelectStoreParam(SDNode *N);
- SDNode *SelectAddrSpaceCast(SDNode *N);
- SDNode *SelectTextureIntrinsic(SDNode *N);
- SDNode *SelectSurfaceIntrinsic(SDNode *N);
- SDNode *SelectBFE(SDNode *N);
-
- inline SDValue getI32Imm(unsigned Imm, SDLoc DL) {
+ void Select(SDNode *N) override;
+ bool tryIntrinsicNoChain(SDNode *N);
+ bool tryIntrinsicChain(SDNode *N);
+ void SelectTexSurfHandle(SDNode *N);
+ bool tryLoad(SDNode *N);
+ bool tryLoadVector(SDNode *N);
+ bool tryLDGLDU(SDNode *N);
+ bool tryStore(SDNode *N);
+ bool tryStoreVector(SDNode *N);
+ bool tryLoadParam(SDNode *N);
+ bool tryStoreRetval(SDNode *N);
+ bool tryStoreParam(SDNode *N);
+ void SelectAddrSpaceCast(SDNode *N);
+ bool tryTextureIntrinsic(SDNode *N);
+ bool trySurfaceIntrinsic(SDNode *N);
+ bool tryBFE(SDNode *N);
+
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
@@ -94,7 +93,8 @@ private:
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+ static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
};
-}
+} // end namespace llvm
#endif
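
For readers following the interface change in this header: Select() now returns void, and the per-pattern hooks return bool after replacing the node themselves via ReplaceNode. A minimal, self-contained sketch of that dispatch shape follows; the types, matcher names, and the matcher list are invented for illustration (the real Select() switches on the node's opcode and is part of SelectionDAGISel, not this toy API):

#include <cstdio>
#include <functional>
#include <vector>

// Each try* matcher reports whether it handled (and already replaced) the
// node; the driver stops at the first success and otherwise falls back,
// instead of every hook returning a replacement SDNode*.
struct Node { int Kind; };

static bool tryLoad(Node &N)  { return N.Kind == 0; }
static bool tryStore(Node &N) { return N.Kind == 1; }

static void select(Node &N) {
  const std::vector<std::function<bool(Node &)>> Matchers = {tryLoad, tryStore};
  for (const auto &M : Matchers)
    if (M(N))
      return;                                   // handled in place
  std::puts("fall back to the table-generated matcher");
}

int main() {
  Node Store{1}, Unknown{7};
  select(Store);    // handled by tryStore
  select(Unknown);  // nothing matches -> falls back
  return 0;
}
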
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index be735f6c1bce..f28c89cd976a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -257,15 +257,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::i16, Legal);
setOperationAction(ISD::CTLZ, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i16, Legal);
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
@@ -273,6 +267,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// PTX does not directly support SELP of i1, so promote to i32 first
setOperationAction(ISD::SELECT, MVT::i1, Custom);
+ // PTX has no instruction that produces both halves of an i64 multiply at
+ // once, so expand [SU]MUL_LOHI.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::AND);
@@ -310,8 +308,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::DeclareRetParam";
case NVPTXISD::PrintCall:
return "NVPTXISD::PrintCall";
+ case NVPTXISD::PrintConvergentCall:
+ return "NVPTXISD::PrintConvergentCall";
case NVPTXISD::PrintCallUni:
return "NVPTXISD::PrintCallUni";
+ case NVPTXISD::PrintConvergentCallUni:
+ return "NVPTXISD::PrintConvergentCallUni";
case NVPTXISD::LoadParam:
return "NVPTXISD::LoadParam";
case NVPTXISD::LoadParamV2:
@@ -1309,9 +1311,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag };
unsigned opcode = NVPTXISD::StoreParam;
- if (Outs[OIdx].Flags.isZExt())
+ if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
opcode = NVPTXISD::StoreParamU32;
- else if (Outs[OIdx].Flags.isSExt())
+ else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
opcode = NVPTXISD::StoreParamS32;
Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
VT, MachinePointerInfo());
@@ -1351,8 +1353,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
DAG.getConstant(curOffset, dl, PtrVT));
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), false, false, false,
- PartAlign);
+ MachinePointerInfo(), PartAlign);
if (elemtype.getSizeInBits() < 16) {
theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
}
@@ -1435,8 +1436,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue PrintCallOps[] = {
Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
};
- Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
- dl, PrintCallVTs, PrintCallOps);
+ // We model convergent calls as separate opcodes.
+ unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+ if (CLI.IsConvergent)
+ Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+ : NVPTXISD::PrintConvergentCall;
+ Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
InFlag = Chain.getValue(1);
// Ops to print out the function name
@@ -1608,9 +1613,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
unsigned sz = VTs[i].getSizeInBits();
unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
- bool needTruncate = sz < 8;
- if (VTs[i].isInteger() && (sz < 8))
+ bool needTruncate = false;
+ if (VTs[i].isInteger() && sz < 8) {
sz = 8;
+ needTruncate = true;
+ }
SmallVector<EVT, 4> LoadRetVTs;
EVT TheLoadType = VTs[i];
@@ -1619,10 +1626,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// aggregates.
LoadRetVTs.push_back(MVT::i32);
TheLoadType = MVT::i32;
+ needTruncate = true;
} else if (sz < 16) {
// If loading i1/i8 result, generate
// load i8 (-> i16)
// trunc i16 to i1/i8
+
+ // FIXME: Do we need to set needTruncate to true here, too? We could
+ // not figure out what this branch is for in D17872, so we left it
+ // alone. The comment above about loading i1/i8 may be wrong, as the
+ // branch above seems to cover integers of size < 32.
LoadRetVTs.push_back(MVT::i16);
} else
LoadRetVTs.push_back(Ins[i].VT);
@@ -1678,7 +1691,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(j, dl)));
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
+ return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
@@ -1872,10 +1885,9 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
assert(Node->getValueType(0) == MVT::i1 &&
"Custom lowering for i1 load only");
- SDValue newLD =
- DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LD->getAlignment(),
+ LD->getMemOperand()->getFlags());
SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
@@ -2002,13 +2014,10 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
SDValue Tmp2 = ST->getBasePtr();
SDValue Tmp3 = ST->getValue();
assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
- unsigned Alignment = ST->getAlignment();
- bool isVolatile = ST->isVolatile();
- bool isNonTemporal = ST->isNonTemporal();
Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
- SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
- ST->getPointerInfo(), MVT::i8, isNonTemporal,
- isVolatile, Alignment);
+ SDValue Result =
+ DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
+ ST->getAlignment(), ST->getMemOperand()->getFlags());
return Result;
}
@@ -2027,7 +2036,7 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
// Check to see if the kernel argument is image*_t or sampler_t
-bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
static const char *const specialTypes[] = { "struct._image2d_t",
"struct._image3d_t",
"struct._sampler_t" };
@@ -2042,16 +2051,17 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
return false;
auto *STy = dyn_cast<StructType>(PTy->getElementType());
- const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
+ if (!STy || STy->isLiteral())
+ return false;
return std::find(std::begin(specialTypes), std::end(specialTypes),
- TypeName) != std::end(specialTypes);
+ STy->getName()) != std::end(specialTypes);
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
const DataLayout &DL = DAG.getDataLayout();
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2171,12 +2181,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
ISD::SEXTLOAD : ISD::ZEXTLOAD;
p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
- MachinePointerInfo(srcValue), partVT, false,
- false, false, partAlign);
+ MachinePointerInfo(srcValue), partVT, partAlign);
} else {
p = DAG.getLoad(partVT, dl, Root, srcAddr,
- MachinePointerInfo(srcValue), false, false, false,
- partAlign);
+ MachinePointerInfo(srcValue), partAlign);
}
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
@@ -2202,9 +2210,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
- true,
- DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+ EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2219,9 +2227,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
- true,
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2241,10 +2249,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
} else {
// V4 loads
// We have at least 4 elements (<3 x Ty> expands to 4 elements) and
- // the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // loads.
+ // the vector will be expanded to a power of 2 elements, so we know we
+ // can always round up to the next multiple of 4 when creating the
+ // vector loads.
// e.g. 4 elem => 1 ld.v4
// 6 elem => 2 ld.v4
// 8 elem => 2 ld.v4
@@ -2262,9 +2269,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
DAG.getConstant(Ofst, dl, PtrVT));
SDValue P = DAG.getLoad(
- VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
- false, true,
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+ MachineMemOperand::MOInvariant);
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2298,12 +2305,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
ISD::SEXTLOAD : ISD::ZEXTLOAD;
p = DAG.getExtLoad(
ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
- ObjectVT, false, false, false,
+ ObjectVT,
DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
} else {
p = DAG.getLoad(
- Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
- false, false,
+ Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
}
if (p.getNode())
@@ -2350,13 +2356,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
return Chain;
}
-
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
Type *RetTy = F->getReturnType();
@@ -3940,9 +3945,8 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// First try with the default operand order.
- SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
- OptLevel);
- if (Result.getNode())
+ if (SDValue Result =
+ PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
return Result;
// If that didn't work, try again with the operands commuted.
@@ -4139,7 +4143,7 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
// The RHS can be a demotable op or a constant
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
- APInt Val = CI->getAPIntValue();
+ const APInt &Val = CI->getAPIntValue();
if (LHSSign == Unsigned) {
return Val.isIntN(OptSize);
} else {
@@ -4230,8 +4234,7 @@ static SDValue PerformMULCombine(SDNode *N,
CodeGenOpt::Level OptLevel) {
if (OptLevel > 0) {
// Try mul.wide combining at OptLevel > 0
- SDValue Ret = TryMULWIDECombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
return Ret;
}
@@ -4244,8 +4247,7 @@ static SDValue PerformSHLCombine(SDNode *N,
CodeGenOpt::Level OptLevel) {
if (OptLevel > 0) {
// Try mul.wide combining at OptLevel > 0
- SDValue Ret = TryMULWIDECombine(N, DCI);
- if (Ret.getNode())
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
return Ret;
}
@@ -4368,7 +4370,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SDValue LoadChain = NewLD.getValue(NumElts);
- SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
Results.push_back(BuildVec);
Results.push_back(LoadChain);
@@ -4481,7 +4483,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
SDValue LoadChain = NewLD.getValue(NumElts);
SDValue BuildVec =
- DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ DAG.getBuildVector(ResVT, DL, ScalarRes);
Results.push_back(BuildVec);
Results.push_back(LoadChain);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 60914c1d09b4..1c32232024d1 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -34,7 +34,9 @@ enum NodeType : unsigned {
DeclareRet,
DeclareScalarRet,
PrintCall,
+ PrintConvergentCall,
PrintCallUni,
+ PrintConvergentCallUni,
CallArgBegin,
CallArg,
LastCallArg,
@@ -475,10 +477,11 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
- SDValue LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -488,11 +491,10 @@ public:
unsigned retAlignment,
const ImmutableCallSite *CS) const;
- SDValue
- LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
- SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index aa36b6be7250..8d00bbb5e9c2 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -50,6 +50,9 @@ NVPTXImageOptimizer::NVPTXImageOptimizer()
: FunctionPass(ID) {}
bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
bool Changed = false;
InstrToDelete.clear();
diff --git a/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
new file mode 100644
index 000000000000..e451d273cf44
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
@@ -0,0 +1,586 @@
+//===-- NVPTXInferAddressSpaces.cpp - Infer address spaces ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero, so-called specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it runs a data-flow
+// analysis that tries to prove, for as many generic pointers as possible, that
+// each one points into a single specific address space. In the above example,
+// it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+// TODO: This pass is experimental and not enabled by default. Users can turn it
+// on by setting the -nvptx-use-infer-addrspace flag of llc. We plan to replace
+// NVPTXNonFavorGenericAddrSpaces with this pass shortly.
+//===----------------------------------------------------------------------===//
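
As a standalone companion to the data-flow description in the header comment above, the following toy program runs the same lattice to a fixed point over a made-up def-use cycle. The graph, the node ids, and the address-space numbers are illustrative only; the real pass walks LLVM IR values and uses NVPTX's AddressSpace enum.

#include <cassert>
#include <map>
#include <vector>

// Lattice: uninitialized > specific spaces (e.g. 1 = global, 3 = shared) >
// 0 = generic, with the same join as joinAddressSpaces further down.
enum : unsigned { Generic = 0, Uninit = ~0u };

static unsigned join(unsigned A, unsigned B) {
  if (A == Generic || B == Generic)
    return Generic;
  if (A == Uninit)
    return B;
  if (B == Uninit)
    return A;
  return A == B ? A : Generic;
}

int main() {
  // Toy cycle from the header comment: %y = phi(%input, %y2), %y2 = gep %y.
  // Node ids: 0 = %input (known shared, addrspace 3), 1 = %y, 2 = %y2.
  std::map<int, std::vector<int>> Operands = {{1, {0, 2}}, {2, {1}}};
  std::map<int, unsigned> Space = {{0, 3u}, {1, Uninit}, {2, Uninit}};

  // Iterate to a fixed point; the monotone join guarantees termination.
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (auto &Entry : Operands) {
      unsigned New = Uninit;
      for (int Op : Entry.second)
        New = join(New, Space[Op]);
      if (New != Space[Entry.first]) {
        Space[Entry.first] = New;
        Changed = true;
      }
    }
  }
  assert(Space[1] == 3u && Space[2] == 3u); // both %y and %y2 proven shared
  return 0;
}
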
+
+#define DEBUG_TYPE "nvptx-infer-addrspace"
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace {
+const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief A function pass that infers specific address spaces for generic
+/// address expressions.
+class NVPTXInferAddressSpaces: public FunctionPass {
+public:
+ static char ID;
+
+ NVPTXInferAddressSpaces() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace);
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace);
+
+ // Changes the generic address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all generic address expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F);
+};
+} // end anonymous namespace
+
+char NVPTXInferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
+ "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ assert(isAddressExpression(V));
+ const Operator& Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// If V is an unvisited generic address expression, appends V to PostorderStack
+// and marks it as visited.
+static void appendsGenericAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) {
+ assert(V->getType()->isPointerTy());
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC) {
+ if (Visited->insert(V).second)
+ PostorderStack->push_back(std::make_pair(V, false));
+ }
+}
+
+// Returns all generic address expressions in function F. The elements are
+// ordered in postorder.
+static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value*, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value*> Visited;
+ // We only explore address expressions that are reachable from loads and
+ // stores for now because we aim at generating faster loads and stores.
+ for (Instruction &I : instructions(F)) {
+ if (isa<LoadInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(0), &PostorderStack, &Visited);
+ } else if (isa<StoreInst>(I)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ I.getOperand(1), &PostorderStack, &Visited);
+ }
+ }
+
+ std::vector<Value *> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ // If the operands of the expression on top of the stack are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ Postorder.push_back(PostorderStack.back().first);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
+ appendsGenericAddressExpressionToPostorderStack(
+ PtrOperand, &PostorderStack, &Visited);
+ }
+ }
+ return Postorder;
+}
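
collectGenericAddressExpressions above drives an explicit-stack postorder over the use-def graph: each stack entry carries a flag saying whether its operands have already been pushed. A standalone sketch of the same traversal on a tiny made-up chain (the node names and the graph are illustrative only):

#include <cstdio>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Non-recursive postorder: push a node unexplored, flip its flag once its
// operands have been pushed, and emit it when it is seen with the flag set.
int main() {
  std::map<char, std::vector<char>> Ops = {
      {'v', {'g'}}, {'g', {'p'}}, {'p', {}}};   // v uses g, g uses p
  std::vector<std::pair<char, bool>> Stack = {{'v', false}};
  std::set<char> Visited = {'v'};
  std::vector<char> Postorder;

  while (!Stack.empty()) {
    if (Stack.back().second) {                  // operands already explored
      Postorder.push_back(Stack.back().first);
      Stack.pop_back();
      continue;
    }
    Stack.back().second = true;
    for (char Op : Ops[Stack.back().first])
      if (Visited.insert(Op).second)
        Stack.push_back({Op, false});
  }
  for (char C : Postorder)
    std::printf("%c ", C);                      // prints: p g v
  std::printf("\n");
  return 0;
}
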
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that the caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
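
cloneInstructionWithNewAddressSpace above may hand out a placeholder for an operand whose clone does not exist yet and records the use in UndefUsesToFix. The following self-contained sketch shows the same two-pass idea on a made-up two-node cycle; the Node type, the Fix record, and the names are illustrative, not LLVM's:

#include <cassert>
#include <map>
#include <vector>

// A value node with pointer operands; stands in for the IR being cloned.
struct Node { std::vector<Node *> Ops; };

int main() {
  // Original two-node cycle: A's operand is B, B's operand is A.
  Node A, B;
  A.Ops = {&B};
  B.Ops = {&A};

  Node CA, CB;                                   // storage for the clones
  std::map<Node *, Node *> Storage = {{&A, &CA}, {&B, &CB}};
  std::map<Node *, Node *> Clone;                // old value -> cloned value

  struct Fix { Node *User; unsigned Idx; Node *OldOp; };
  std::vector<Fix> Fixups;                       // plays the UndefUsesToFix role

  for (Node *Old : {&A, &B}) {
    Node *New = Storage[Old];
    Clone[Old] = New;
    for (unsigned I = 0; I < Old->Ops.size(); ++I) {
      auto It = Clone.find(Old->Ops[I]);
      if (It != Clone.end())
        New->Ops.push_back(It->second);          // operand already cloned
      else {
        New->Ops.push_back(nullptr);             // placeholder, like the undef
        Fixups.push_back({New, I, Old->Ops[I]});
      }
    }
  }

  for (const Fix &F : Fixups)                    // second pass closes the cycle
    F.User->Ops[F.Idx] = Clone[F.OldOp];

  assert(CA.Ops[0] == &CB && CB.Ops[0] == &CA);
  return 0;
}
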
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is generic, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ // Computes the operands of the new constant expression.
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every generic address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+static Value *
+cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ // All values in Postorder are generic address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() ==
+ AddressSpace::ADDRESS_SPACE_GENERIC);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
+ if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
+ AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
+ return AddressSpace::ADDRESS_SPACE_GENERIC;
+
+ if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS2;
+ if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
+ return AS1;
+
+ // The join of two different specific address spaces is generic.
+ return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
+}
+
+bool NVPTXInferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Collects all generic address expressions in postorder.
+ std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+ // Changes the address spaces of the generic address expressions who are
+ // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+void NVPTXInferAddressSpaces::inferAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n"
+ << " " << *V << "\n");
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+ // Since the address space of V was updated, adds the users of V to the
+ // worklist because their address spaces may also need to be updated.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << "\n");
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of generic address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as
+ // generic (the bottom element in the lattice).
+ if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ unsigned OperandAS;
+ if (InferredAddrSpace.count(PtrOperand))
+ OperandAS = InferredAddrSpace.lookup(PtrOperand);
+ else
+ OperandAS = PtrOperand->getType()->getPointerAddressSpace();
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ // join(generic, *) = generic. So we can break if NewAS is already generic.
+ if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
+ break;
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value *V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (Value *V : Postorder) {
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ SmallVector<Use *, 4> Uses;
+ for (Use &U : V->uses())
+ Uses.push_back(&U);
+ DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV
+ << "\n");
+ for (Use *U : Uses) {
+ if (isa<LoadInst>(U->getUser()) ||
+ (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
+ // If V is used as the pointer operand of a load/store, sets the pointer
+ // operand to NewV. This replacement does not change the element type,
+ // so the resultant load/store is still valid.
+ U->set(NewV);
+ } else if (isa<Instruction>(U->getUser())) {
+ // Otherwise, replaces the use with generic(NewV).
+ // TODO: Some optimization opportunities are missed. For example, in
+ // %0 = icmp eq float* %p, %q
+ // if both p and q are inferred to be shared, we can rewrite %0 as
+ // %0 = icmp eq float addrspace(3)* %new_p, %new_q
+ // instead of the current
+ // %generic_p = addrspacecast float addrspace(3)* %new_p to float*
+ // %generic_q = addrspacecast float addrspace(3)* %new_q to float*
+ // %0 = icmp eq float* %generic_p, %generic_q
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+ if (V->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createNVPTXInferAddressSpacesPass() {
+ return new NVPTXInferAddressSpaces();
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 9f3cf4551955..0c7c6cbc4512 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -30,9 +30,10 @@ void NVPTXInstrInfo::anchor() {}
NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {}
-void NVPTXInstrInfo::copyPhysReg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
+void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
@@ -111,7 +112,7 @@ bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
unsigned addrspace = 0;
- if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+ if (MI->getOpcode() == NVPTX::INT_BARRIER0)
return false;
if (isLoadInstr(*MI, addrspace))
if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
@@ -145,26 +146,28 @@ bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
/// Note that RemoveBranch and InsertBranch must be implemented to support
/// cases where this method returns success.
///
-bool NVPTXInstrInfo::AnalyzeBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const {
+bool NVPTXInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr &LastInst = *I;
// If there is only one terminator instruction, process it.
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
- if (LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = LastInst->getOperand(0).getMBB();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = LastInst.getOperand(0).getMBB();
return false;
- } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+ } else if (LastInst.getOpcode() == NVPTX::CBranch) {
// Block ends with fall-through condbranch.
- TBB = LastInst->getOperand(1).getMBB();
- Cond.push_back(LastInst->getOperand(0));
+ TBB = LastInst.getOperand(1).getMBB();
+ Cond.push_back(LastInst.getOperand(0));
return false;
}
// Otherwise, don't know what this is.
@@ -172,26 +175,26 @@ bool NVPTXInstrInfo::AnalyzeBranch(
}
// Get the instruction before it if it's a terminator.
- MachineInstr *SecondLastInst = I;
+ MachineInstr &SecondLastInst = *I;
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it.
- if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
- LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = SecondLastInst->getOperand(1).getMBB();
- Cond.push_back(SecondLastInst->getOperand(0));
- FBB = LastInst->getOperand(0).getMBB();
+ if (SecondLastInst.getOpcode() == NVPTX::CBranch &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(1).getMBB();
+ Cond.push_back(SecondLastInst.getOperand(0));
+ FBB = LastInst.getOperand(0).getMBB();
return false;
}
// If the block ends with two NVPTX:GOTOs, handle it. The second one is not
// executed, so remove it.
- if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
- LastInst->getOpcode() == NVPTX::GOTO) {
- TBB = SecondLastInst->getOperand(0).getMBB();
+ if (SecondLastInst.getOpcode() == NVPTX::GOTO &&
+ LastInst.getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst.getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
I->eraseFromParent();
@@ -226,9 +229,11 @@ unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-unsigned NVPTXInstrInfo::InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+unsigned NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 3e407223f010..050bf12fe859 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -49,9 +49,9 @@ public:
* const TargetRegisterClass *RC) const;
*/
- void copyPhysReg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg, bool KillSrc) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DestReg) const;
bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
@@ -59,13 +59,14 @@ public:
virtual bool CanTailMerge(const MachineInstr *MI) const;
// Branch analysis.
- bool AnalyzeBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- unsigned InsertBranch(
- MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const override;
unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
return MI.getOperand(2).getImm();
}
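
For context on the analyzeBranch change above: the function only recognizes block shapes built from at most two GOTO/CBranch terminators and reports everything else as unanalyzable. A compressed standalone sketch of that case split follows; the Op enum and the classify helper are illustrative stand-ins, not LLVM types.

  #include <cstdio>
  #include <vector>

  enum class Op { Goto, CBranch, Other };

  // Mirrors the order of checks in NVPTXInstrInfo::analyzeBranch, looking only
  // at a block's trailing terminators and returning a description instead of
  // filling in TBB/FBB/Cond.
  static const char *classify(const std::vector<Op> &Terminators) {
    if (Terminators.empty())
      return "falls through to the next block";
    if (Terminators.size() > 2)
      return "cannot analyze";  // three or more terminators
    Op Last = Terminators.back();
    if (Terminators.size() == 1) {
      if (Last == Op::Goto)
        return "unconditional branch (TBB only)";
      if (Last == Op::CBranch)
        return "conditional branch that falls through otherwise";
      return "cannot analyze";
    }
    Op SecondLast = Terminators.front();
    if (SecondLast == Op::CBranch && Last == Op::Goto)
      return "conditional branch to TBB, otherwise FBB";
    if (SecondLast == Op::Goto && Last == Op::Goto)
      return "unconditional branch; the trailing GOTO is dead";
    return "cannot analyze";
  }

  int main() {
    std::puts(classify({Op::CBranch, Op::Goto}));
    std::puts(classify({Op::Goto}));
    std::puts(classify({}));
    return 0;
  }
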
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6fdd60f3ed2d..c158cc6cdab2 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -14,7 +14,9 @@
include "NVPTXInstrFormats.td"
// A NOP instruction
-def NOP : NVPTXInst<(outs), (ins), "", []>;
+let hasSideEffects = 0 in {
+ def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
// List of vector specific properties
def isVecLD : VecInstTypeEnum<1>;
@@ -162,130 +164,146 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//
+// Template for instructions which take three int64, int32, or int16 args.
+// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int64Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int16Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}
+// Template for instructions which take 3 int32 args. The instructions are
+// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
}
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
multiclass F3<string OpcStr, SDNode OpNode> {
- def f64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[allowFMA]>;
- def f64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
- def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA]>;
- def f32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
}
+// Same as F3, but defines ".rn" variants (round to nearest even).
multiclass F3_rn<string OpcStr, SDNode OpNode> {
- def f64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[noFMA]>;
- def f64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
- def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def f32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA]>;
- def f32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
}
+// Template for operations which take two f32 or f64 operands. Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
- def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
- !strconcat(OpcStr, ".f64 \t$dst, $a;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
- Requires<[doF32FTZ]>;
- def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
//===----------------------------------------------------------------------===//
@@ -293,160 +311,251 @@ multiclass F2<string OpcStr, SDNode OpNode> {
//===----------------------------------------------------------------------===//
//-----------------------------------
-// General Type Conversion
+// Type Conversion
//-----------------------------------
let hasSideEffects = 0 in {
-// Generate a cvt to the given type from all possible types.
-// Each instance takes a CvtMode immediate that defines the conversion mode to
-// use. It can be CvtNONE to omit a conversion mode.
-multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
- def _s16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s16\t$dst, $src;"),
- []>;
- def _u16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u16\t$dst, $src;"),
- []>;
- def _f16 : NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f16\t$dst, $src;"),
- []>;
- def _s32 : NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s32\t$dst, $src;"),
- []>;
- def _u32 : NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u32\t$dst, $src;"),
- []>;
- def _s64 : NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s64\t$dst, $src;"),
- []>;
- def _u64 : NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u64\t$dst, $src;"),
- []>;
- def _f32 : NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f32\t$dst, $src;"),
- []>;
- def _f64 : NVPTXInst<(outs RC:$dst),
- (ins Float64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f64\t$dst, $src;"),
- []>;
-}
-
-// Generate a cvt to all possible types.
-defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
-defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
-defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
-defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
-defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
-defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
-defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
-defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
-defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
-
-// This set of cvt is different from the above. The type of the source
-// and target are the same.
-//
-def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.s16.s8 \t$dst, $src;", []>;
-def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s8 \t$dst, $src;", []>;
-def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s16 \t$dst, $src;", []>;
-def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s8 \t$dst, $src;", []>;
-def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s16 \t$dst, $src;", []>;
-def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s32 \t$dst, $src;", []>;
+ // Generate a cvt to the given type from all possible types. Each instance
+ // takes a CvtMode immediate that defines the conversion mode to use. It can
+ // be CvtNONE to omit a conversion mode.
+ multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+ def _s8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s8\t$dst, $src;"), []>;
+ def _u8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u8\t$dst, $src;"), []>;
+ def _s16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s16\t$dst, $src;"), []>;
+ def _u16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u16\t$dst, $src;"), []>;
+ def _f16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16\t$dst, $src;"), []>;
+ def _s32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s32\t$dst, $src;"), []>;
+ def _u32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u32\t$dst, $src;"), []>;
+ def _s64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s64\t$dst, $src;"), []>;
+ def _u64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u64\t$dst, $src;"), []>;
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f32\t$dst, $src;"), []>;
+ def _f64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f64\t$dst, $src;"), []>;
+ }
+
+ // Generate cvts from all types to all types.
+ defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
+ defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
+ defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+ defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+ defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+ defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+ defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+ defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+ defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+ defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+ defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+ // These cvts are different from those above: The source and dest registers
+ // are of the same type.
+ def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32 \t$dst, $src;", []>;
}
//-----------------------------------
// Integer Arithmetic
//-----------------------------------
+// Template for xor masquerading as int1 arithmetic.
multiclass ADD_SUB_i1<SDNode OpNode> {
def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}
+// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
-
+// int16, int32, and int64 signed addition. Since NVPTX uses 2's complement
+// arithmetic, we also use these instructions for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+// int32 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
-//mul.wide PTX instruction
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+// and DAGCombiner::visitUREM will lower them.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+
+//
+// Wide multiplication
+//
+def MULWIDES64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
def SInt32Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isSignedIntN(32))
- return true;
- return false;
+ return v.isSignedIntN(32);
}]>;
def UInt32Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isIntN(32))
- return true;
- return false;
+ return v.isIntN(32);
}]>;
def SInt16Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isSignedIntN(16))
- return true;
- return false;
+ return v.isSignedIntN(16);
}]>;
def UInt16Const : PatLeaf<(imm), [{
const APInt &v = N->getAPIntValue();
- if (v.isIntN(16))
- return true;
- return false;
+ return v.isIntN(16);
}]>;
def Int5Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
const APInt &v = N->getAPIntValue();
- // Check if 0 <= v < 32
- // Only then the result from (x << v) will be i32
- if (v.sge(0) && v.slt(32))
- return true;
- return false;
+ return v.sge(0) && v.slt(32);
}]>;
def Int4Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
const APInt &v = N->getAPIntValue();
- // Check if 0 <= v < 16
- // Only then the result from (x << v) will be i16
- if (v.sge(0) && v.slt(16))
- return true;
- return false;
+ return v.sge(0) && v.slt(16);
}]>;
def SHL2MUL32 : SDNodeXForm<imm, [{
@@ -461,215 +570,133 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
}]>;
-def MULWIDES64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-
-def MULWIDEU64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm64
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-
-def MULWIDES32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-
-def MULWIDEU32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm32
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
(MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
(MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
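
These four patterns rest on the identity that left-shifting a widened value by an immediate v, with 0 <= v < 32 as Int5Const requires, equals a widening multiply by 1 << v, the constant SHL2MUL32 materializes. A small self-contained check of the unsigned (zext) case, with illustrative helper names:

  #include <cassert>
  #include <cstdint>

  // Mirrors (shl (zext i32 %a), v): widen first, then shift.
  static uint64_t shl_of_zext(uint32_t A, unsigned V) { return (uint64_t)A << V; }

  // Mirrors mul.wide.u32: the full 64-bit product of two 32-bit operands.
  static uint64_t mul_wide_u32(uint32_t A, uint32_t B) {
    return (uint64_t)A * (uint64_t)B;
  }

  int main() {
    uint32_t A = 0xDEADBEEF;
    for (unsigned V = 0; V < 32; ++V)   // Int5Const: 0 <= v < 32
      assert(shl_of_zext(A, V) == mul_wide_u32(A, 1u << V));
    return 0;
  }
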
+// Convert "sign/zero-extend then multiply" to mul.wide.
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
(MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
(MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
(MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
(MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
(MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
(MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
- Requires<[doMulWide]>;
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
(MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
(MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
- Requires<[doMulWide]>;
-
-
-def SDTMulWide
- : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed
- : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned
- : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
-
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
- (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
- (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
- (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
- (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-defm MULT : I3<"mul.lo.s", mul>;
-
-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
-
-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
-
-defm SREM : I3<"rem.s", srem>;
-// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
-defm UREM : I3<"rem.u", urem>;
-// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
-
-def SDTIMAD
- : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
- SDTCisInt<2>, SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>;
-def imad
- : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
-
-def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
-def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
-def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
-def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst,
- (imad Int16Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
-def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
-def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
-def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst,
- (imad Int32Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
-def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
-def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
-def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst,
- (imad Int64Regs:$a, imm:$b, imm:$c))]>;
-
-def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "neg.s16 \t$dst, $src;",
- [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
-def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "neg.s32 \t$dst, $src;",
- [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
-def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "neg.s64 \t$dst, $src;",
- [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
//-----------------------------------
// Floating Point Arithmetic
@@ -677,17 +704,13 @@ def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
- if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
- return false;
- float f = (float)N->getValueAPF().convertToFloat();
- return (f==1.0f);
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle &&
+ N->getValueAPF().convertToFloat() == 1.0f;
}]>;
-// Constand (double)1.0
+// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
- if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
- return false;
- double d = (double)N->getValueAPF().convertToDouble();
- return (d==1.0);
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble &&
+ N->getValueAPF().convertToDouble() == 1.0;
}]>;
defm FADD : F3<"add", fadd>;
@@ -698,157 +721,157 @@ defm FADD_rn : F3_rn<"add", fadd>;
defm FSUB_rn : F3_rn<"sub", fsub>;
defm FMUL_rn : F3_rn<"mul", fmul>;
-defm FABS : F2<"abs", fabs>;
-defm FNEG : F2<"neg", fneg>;
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
//
// F64 division
//
-def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
- (ins f64imm:$a, Float64Regs:$b),
- "rcp.rn.f64 \t$dst, $b;",
- [(set Float64Regs:$dst,
- (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
-def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst,
- (fdiv Float64Regs:$a, Float64Regs:$b))]>;
-def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst,
- (fdiv Float64Regs:$a, fpimm:$b))]>;
+def FDIV641r :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
//
// F32 Approximate reciprocal
//
-def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+def FDIV321r_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
-def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
-def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+def FDIV321r_approx_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
-def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+def FDIV32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
-def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
-def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst,
- (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
+def FDIV321r_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
//
// F32 Accurate division
//
-def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst,
- (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
+def FDIV32rr_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
//
// F32 rsqrt
@@ -857,68 +880,39 @@ def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
"rsqrt.approx.f32 \t$dst, $b;", []>;
+// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
+// it's emulated in software.)
def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
(RSQRTF32approx1r Float32Regs:$b)>,
Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
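As a rough CUDA-level sketch of the idiom this pattern is meant to catch (the kernel name is hypothetical, and it assumes the approximate-division and approximate-sqrt options behind do_DIVF32_FULL and do_SQRTF32_APPROX are enabled and that sqrtf reaches the backend as the nvvm sqrt intrinsic matched above):

    // Reciprocal square root written as 1.0f / sqrtf(x).  Under the
    // predicates above this can be selected as a single rsqrt.approx.f32
    // instead of a sqrt followed by a reciprocal.
    __global__ void rsqrt_kernel(const float *in, float *out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = 1.0f / sqrtf(in[i]);
    }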
-multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
- def rrr : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst,
- (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
+multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
}
-multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
- def rrr : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst,
- (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
-}
-
-defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>;
+defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
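The FMA multiclass above pattern-matches the generic fma node. A minimal CUDA sketch of source that produces it (hypothetical helper names, and assuming the usual fmaf/fma math functions lower to the fma node):

    // Fused multiply-add candidates for fma.rn.f32 / fma.rn.f64
    // (fma.rn.ftz.f32 when the doF32FTZ predicate is in effect).
    __device__ float  fma32(float a, float b, float c)    { return fmaf(a, b, c); }
    __device__ double fma64(double a, double b, double c) { return fma(a, b, c); }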
+// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
[(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
@@ -926,8 +920,8 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"cos.approx.f32 \t$dst, $src;",
[(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
-// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y))
-// e.g. "poor man's fmod()"
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// i.e. "poor man's fmod()"
// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
@@ -962,183 +956,152 @@ def : Pat<(frem Float64Regs:$x, fpimm:$y),
fpimm:$y))>;
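A minimal sketch of the expansion described in the frem comment above (hypothetical helper name): frem is rewritten as x - floor(x/y) * y, which is why it only approximates fmod.

    // "Poor man's fmod": frem(x, y) expanded to x - floor(x / y) * y.
    // Using floor rather than trunc means the sign of the result follows
    // y rather than x, so this is not bit-for-bit C fmodf.
    __device__ float frem_expanded(float x, float y) {
      return x - floorf(x / y) * y;
    }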
//-----------------------------------
-// Logical Arithmetic
+// Bitwise operations
//-----------------------------------
-multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
- def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
- def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int16Regs:$b))]>;
- def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
- def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int64Regs:$b))]>;
- def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+// Template for three-operand bitwise operations. Creates .b16, .b32, .b64,
+// and .pred (i.e. i1 predicate register) versions of OpcStr.
+multiclass BITWISE<string OpcStr, SDNode OpNode> {
+ def b1rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def b16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def b32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def b64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
-defm OR : LOG_FORMAT<"or", or>;
-defm AND : LOG_FORMAT<"and", and>;
-defm XOR : LOG_FORMAT<"xor", xor>;
+defm OR : BITWISE<"or", or>;
+defm AND : BITWISE<"and", and>;
+defm XOR : BITWISE<"xor", xor>;
-def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
"not.pred \t$dst, $src;",
[(set Int1Regs:$dst, (not Int1Regs:$src))]>;
-def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"not.b16 \t$dst, $src;",
[(set Int16Regs:$dst, (not Int16Regs:$src))]>;
-def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
"not.b32 \t$dst, $src;",
[(set Int32Regs:$dst, (not Int32Regs:$src))]>;
-def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "not.b64 \t$dst, $src;",
- [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-
-// For shifts, the second src operand must be 32-bit value
-multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int32Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- (i32 imm:$b)))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- (i32 imm:$b)))]>;
- def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
- (i32 imm:$b)))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int32Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- (i32 imm:$b)))]>;
-}
+def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
-
-// For shifts, the second src operand must be 32-bit value
-// Need to add cvt for the 8-bits.
-multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
- def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- Int32Regs:$b))]>;
- def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
- (i32 imm:$b)))]>;
- def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- Int32Regs:$b))]>;
- def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
- (i32 imm:$b)))]>;
- def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
- (i32 imm:$b)))]>;
- def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
- Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- Int32Regs:$b))]>;
- def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
- (i32 imm:$b)))]>;
+// Template for left/right shifts. Takes three operands,
+// [dest (reg), src (reg), shift (reg or imm)].
+// dest and src may be int64, int32, or int16, but shift is always int32.
+//
+// This template also defines a 32-bit shift (imm, imm) instruction.
+multiclass SHIFT<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+ def i32ii :
+ NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
-defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
-defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+defm SHL : SHIFT<"shl.b", shl>;
+defm SRA : SHIFT<"shr.s", sra>;
+defm SRL : SHIFT<"shr.u", srl>;
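A small CUDA sketch of why every variant above takes its shift amount as Int32Regs/i32imm regardless of the data width (hypothetical helper name):

    // PTX shl/shr take a 32-bit shift amount even for .b64 data, so a
    // 64-bit shift like this is a candidate for shl.b64 with a 32-bit
    // shift-amount operand.
    __device__ unsigned long long shl64(unsigned long long x, int amt) {
      return x << amt;
    }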
//
-// Rotate: use ptx shf instruction if available.
+// Rotate: Use ptx shf instruction if available.
//
// 32 bit r2 = rotl r1, n
// =>
// r2 = shf.l r1, r1, n
-def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]> ;
-
-def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
+def ROTL32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
// 32 bit r2 = rotr r1, n
// =>
// r2 = shf.r r1, r1, n
-def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
-//
-// Rotate: if ptx shf instruction is not available, then use shift+add
-//
-// 32bit
-def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
- !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))),
- []>;
+def ROTR32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ "shl.b32 \t%lhs, $src, $amt1;\n\t"
+ "shr.b32 \t%rhs, $src, $amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
def SUB_FRM_32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(32-N->getZExtValue(), SDLoc(N), MVT::i32);
+ return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
@@ -1148,45 +1111,48 @@ def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
(ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
Requires<[noHWROT32]>;
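To make the two lowerings concrete, here is a CUDA sketch (hypothetical helper names) of a 32-bit rotate-left in both forms: the shf.l.wrap.b32 form used when hasHWROT32 holds, and the shift-plus-add form used by ROT32imm_sw/ROTL32reg_sw otherwise.

    // Hardware form (sm_32+): a wrapping funnel shift of x:x by n,
    // i.e. shf.l.wrap.b32 dst, x, x, n.
    __device__ unsigned rotl_hw(unsigned x, unsigned n) {
      return __funnelshift_l(x, x, n);
    }

    // Software form: (x << n) + (x >> (32 - n)).  The two halves occupy
    // disjoint bits, so add.u32 works as well as or.b32.  The n == 0
    // guard avoids the undefined 32-bit shift that PTX itself clamps.
    __device__ unsigned rotl_sw(unsigned x, unsigned n) {
      n &= 31;
      return n ? (x << n) + (x >> (32 - n)) : x;
    }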
-def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat(".reg .b32 %amt2;\n\t",
- !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
- !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %lhs;\n\t",
- !strconcat(".reg .b32 %rhs;\n\t",
- !strconcat(".reg .b32 %amt2;\n\t",
- !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
- !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 64bit
-def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- i32imm:$amt1, i32imm:$amt2),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
- !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))),
- []>;
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shl.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shr.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shr.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shl.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ "shl.b64 \t%lhs, $src, $amt1;\n\t"
+ "shr.b64 \t%rhs, $src, $amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
def SUB_FRM_64 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
@@ -1197,37 +1163,70 @@ def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
(ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat(".reg .u32 %amt2;\n\t",
- !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
- !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
-
-def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
- Int32Regs:$amt),
- !strconcat("{{\n\t",
- !strconcat(".reg .b64 %lhs;\n\t",
- !strconcat(".reg .b64 %rhs;\n\t",
- !strconcat(".reg .u32 %amt2;\n\t",
- !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
- !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
- !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
- !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
- !strconcat("}}", ""))))))))),
- [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shl.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shr.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shr.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shl.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def SDTIntShiftDOp :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+def FUNSHFRCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
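A rough sketch of the clamp-mode funnel shift these instructions expose (hypothetical helper name); this is the building block LowerShiftLeftParts/LowerShiftRightParts use to assemble 64-bit shifts from 32-bit halves. Unlike the .wrap form used for rotates above, .clamp limits the shift amount to 32 instead of taking it modulo 32.

    // shf.l.clamp.b32 dst, lo, hi, amt:
    // high 32 bits of (hi:lo) << min(amt, 32).
    __device__ unsigned funnel_shl_clamp(unsigned lo, unsigned hi, unsigned amt) {
      unsigned n = amt < 32 ? amt : 32;
      unsigned long long pair = ((unsigned long long)hi << 32) | lo;
      return (unsigned)((pair << n) >> 32);
    }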
+
+//
// BFE - bit-field extract
+//
+// Template for BFE instructions. Takes four args,
+// [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
+// Start may be an imm only if length is also an imm. FIXME: Is this a
+// restriction in PTX?
+//
+// dest and src may be int32 or int64, but start and length are always int32.
multiclass BFE<string TyStr, RegisterClass RC> {
- // BFE supports both 32-bit and 64-bit values, but the start and length
- // operands are always 32-bit
def rrr
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
@@ -1242,29 +1241,35 @@ multiclass BFE<string TyStr, RegisterClass RC> {
!strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
}
-defm BFE_S32 : BFE<"s32", Int32Regs>;
-defm BFE_U32 : BFE<"u32", Int32Regs>;
-defm BFE_S64 : BFE<"s64", Int64Regs>;
-defm BFE_U64 : BFE<"u64", Int64Regs>;
+let hasSideEffects = 0 in {
+ defm BFE_S32 : BFE<"s32", Int32Regs>;
+ defm BFE_U32 : BFE<"u32", Int32Regs>;
+ defm BFE_S64 : BFE<"s64", Int64Regs>;
+ defm BFE_U64 : BFE<"u64", Int64Regs>;
+}
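A simplified sketch of the unsigned form of the extract these defs emit (hypothetical helper name; PTX also specifies clamping of out-of-range positions and lengths, and sign extension for the .s variants, which this ignores):

    // bfe.u32 d, a, b, c: take c bits of a starting at bit b, zero-extended.
    __device__ unsigned bfe_u32(unsigned a, unsigned pos, unsigned len) {
      if (len == 0)
        return 0;
      unsigned mask = len >= 32 ? ~0u : (1u << len) - 1;
      return (a >> pos) & mask;
    }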
//-----------------------------------
-// General Comparison
+// Comparison instructions (setp, set)
//-----------------------------------
-// General setp instructions
-multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int1Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
- def ri : NVPTXInst<(outs Int1Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
- def ir : NVPTXInst<(outs Int1Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
- []>;
+// FIXME: This doesn't cover versions of set and setp that combine with a
+// boolean predicate, e.g. setp.eq.and.b16.
+
+let hasSideEffects = 0 in {
+ multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ def ir :
+ NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ "\t$dst, $a, $b;"), []>;
+ }
}
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
@@ -1279,17 +1284,22 @@ defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
-// General set instructions
-multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ri : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ir : NVPTXInst<(outs Int32Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
+// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
+
+let hasSideEffects = 0 in {
+ multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ }
}
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
@@ -1305,45 +1315,56 @@ defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
//-----------------------------------
-// General Selection
+// Selection instructions (selp)
//-----------------------------------
-// General selp instructions
-multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
-}
+// FIXME: Missing slct
-multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
- SDNode ImmNode> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+// selp instructions that don't have any pattern matches; we explicitly use
+// them within this file.
+let hasSideEffects = 0 in {
+ multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ }
+
+ multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+ SDNode ImmNode> {
+ def rr :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+ def ri :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+ def ir :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+ def ii :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+ }
}
+// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+// good.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
@@ -1356,40 +1377,14 @@ defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
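For reference, selp is a branchless scalar select, d = p ? a : b. A minimal CUDA sketch of the kind of source the SELP_PATTERN defs can cover (hypothetical helper name):

    // A simple predicated select; with both arms already evaluated this is
    // a natural candidate for selp.b32 rather than a branch.
    __device__ int select32(bool p, int a, int b) {
      return p ? a : b;
    }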
-//
-// Funnnel shift in clamp mode
-//
-// - SDNodes are created so they can be used in the DAG code,
-// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-//
-def SDTIntShiftDOp: SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisInt<3>]>;
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFL_CLAMP Int32Regs:$lo,
- Int32Regs:$hi, Int32Regs:$amt))]>;
-
-def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFR_CLAMP Int32Regs:$lo,
- Int32Regs:$hi, Int32Regs:$amt))]>;
-
//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
- [SDNPWantRoot]>;
+ [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
- [SDNPWantRoot]>;
+ [SDNPWantRoot]>;
def MEMri : Operand<i32> {
let PrintMethod = "printMemOperand";
@@ -1401,82 +1396,83 @@ def MEMri64 : Operand<i64> {
}
def imem : Operand<iPTR> {
- let PrintMethod = "printOperand";
+ let PrintMethod = "printOperand";
}
def imemAny : Operand<iPTRAny> {
- let PrintMethod = "printOperand";
+ let PrintMethod = "printOperand";
}
def LdStCode : Operand<i32> {
- let PrintMethod = "printLdStCode";
+ let PrintMethod = "printLdStCode";
}
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+// Load a memory address into a u32 or u64 register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
- "mov.u32 \t$dst, $a;",
- [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-
+ "mov.u32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
- "mov.u64 \t$dst, $a;",
- [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ "mov.u64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-// Get pointer to local stack
-def MOV_DEPOT_ADDR
- : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
- "mov.u32 \t$d, __local_depot$num;", []>;
-def MOV_DEPOT_ADDR_64
- : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
- "mov.u64 \t$d, __local_depot$num;", []>;
+// Get pointer to local stack.
+let hasSideEffects = 0 in {
+ def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+ "mov.u32 \t$d, __local_depot$num;", []>;
+ def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+ "mov.u64 \t$d, __local_depot$num;", []>;
+}
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
-let IsSimpleMove=1 in {
-def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
- "mov.pred \t$dst, $sss;", []>;
-def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
- "mov.u16 \t$dst, $sss;", []>;
-def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
- "mov.u32 \t$dst, $sss;", []>;
-def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
- "mov.u64 \t$dst, $sss;", []>;
-
-def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "mov.f32 \t$dst, $src;", []>;
-def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
- "mov.f64 \t$dst, $src;", []>;
+let IsSimpleMove=1, hasSideEffects=0 in {
+ def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+ "mov.pred \t$dst, $sss;", []>;
+ def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+ def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+ "mov.u32 \t$dst, $sss;", []>;
+ def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+ "mov.u64 \t$dst, $sss;", []>;
+
+ def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "mov.f32 \t$dst, $src;", []>;
+ def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+ "mov.f64 \t$dst, $src;", []>;
}
-def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
- "mov.pred \t$dst, $src;",
- [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.u16 \t$dst, $src;",
- [(set Int16Regs:$dst, imm:$src)]>;
-def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.u32 \t$dst, $src;",
- [(set Int32Regs:$dst, imm:$src)]>;
-def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.u64 \t$dst, $src;",
- [(set Int64Regs:$dst, imm:$src)]>;
-
-def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
- "mov.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, fpimm:$src)]>;
-def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
- "mov.f64 \t$dst, $src;",
- [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.u32 \t$dst, $src;",
+ [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.u64 \t$dst, $src;",
+ [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set Float64Regs:$dst, fpimm:$src)]>;
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
//---- Copy Frame Index ----
-def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
- "add.u32 \t$dst, ${addr:add};",
- [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+ "add.u32 \t$dst, ${addr:add};",
+ [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
- "add.u64 \t$dst, ${addr:add};",
- [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+ "add.u64 \t$dst, ${addr:add};",
+ [(set Int64Regs:$dst, ADDRri64:$addr)]>;
//-----------------------------------
// Comparison and Selection
@@ -1554,7 +1550,7 @@ multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
SET_s16rr, SET_s16ri, SET_s16ir,
SET_s32rr, SET_s32ri, SET_s32ir,
SET_s64rr, SET_s64ri, SET_s64ir> {
- // TableGen doesn't like empty multiclasses
+ // TableGen doesn't like empty multiclasses.
def : PatLeaf<(i32 0)>;
}
@@ -1566,21 +1562,21 @@ multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
SET_u16rr, SET_u16ri, SET_u16ir,
SET_u32rr, SET_u32ri, SET_u32ir,
SET_u64rr, SET_u64ri, SET_u64ir> {
- // TableGen doesn't like empty multiclasses
+ // TableGen doesn't like empty multiclasses.
def : PatLeaf<(i32 0)>;
}
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
-defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
-defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
-defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
-defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
-defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
// i1 compares
@@ -1678,13 +1674,14 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
-//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
-// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+// FIXME: What is this doing here? Can it be deleted?
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisInt<2>]>;
-def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
- SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
@@ -1704,185 +1701,200 @@ def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
-def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
- SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
- SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
- []>;
-def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV2 : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV4 : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
- SDTPseudoUseParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-
-class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param", opstr),
- "\t$dst, [retval0+$b];"),
- []>;
+def DeclareParam :
+ SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam :
+ SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam :
+ SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet :
+ SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam :
+ SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 :
+ SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 :
+ SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall :
+ SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCall :
+ SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni :
+ SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCallUni :
+ SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam :
+ SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 :
+ SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 :
+ SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 :
+ SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 :
+ SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin :
+ SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg :
+ SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg :
+ SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd :
+ SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid :
+ SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype :
+ SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal :
+ SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam :
+ SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+def StoreRetval :
+ SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 :
+ SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 :
+ SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam :
+ SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode :
+ SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let mayLoad = 1 in {
+ class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t$dst, [retval0+$b];"),
+ []>;
+
+ class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+ !strconcat("ld.param.v2", opstr,
+ "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+ class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins i32imm:$b),
+ !strconcat("ld.param.v4", opstr,
+ "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+ []>;
+}
class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("mov", opstr),
- "\t$dst, retval$b;"),
+ !strconcat("mov", opstr, "\t$dst, retval$b;"),
[(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
-class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param.v2", opstr),
- "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
-
-class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins i32imm:$b),
- !strconcat(!strconcat("ld.param.v4", opstr),
- "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>;
-
-class StoreParamInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param", opstr),
- "\t[param$a+$b], $val;"),
- []>;
-
-class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
- i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param.v2", opstr),
- "\t[param$a+$b], {{$val, $val2}};"),
- []>;
-
-class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2,
- regclass:$val3, i32imm:$a, i32imm:$b),
- !strconcat(!strconcat("st.param.v4", opstr),
- "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
- []>;
-
-class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat(!strconcat("st.param", opstr),
- "\t[func_retval0+$a], $val;"),
- []>;
-
-class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
- !strconcat(!strconcat("st.param.v2", opstr),
- "\t[func_retval0+$a], {{$val, $val2}};"),
- []>;
-
-class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs),
- (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a),
- !strconcat(!strconcat("st.param.v4", opstr),
- "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
- []>;
-
-def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
-"call (retval0), ",
- [(PrintCall (i32 1))]>;
-def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1), ",
- [(PrintCall (i32 2))]>;
-def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2), ",
- [(PrintCall (i32 3))]>;
-def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3), ",
- [(PrintCall (i32 4))]>;
-def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4), ",
- [(PrintCall (i32 5))]>;
-def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
- [(PrintCall (i32 6))]>;
-def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
-"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
- [(PrintCall (i32 7))]>;
-def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
-!strconcat("call (retval0, retval1, retval2, retval3, retval4",
- ", retval5, retval6, retval7), "),
- [(PrintCall (i32 8))]>;
-
-def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
- [(PrintCall (i32 0))]>;
-
-def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
-"call.uni (retval0), ",
- [(PrintCallUni (i32 1))]>;
-def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1), ",
- [(PrintCallUni (i32 2))]>;
-def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2), ",
- [(PrintCallUni (i32 3))]>;
-def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3), ",
- [(PrintCallUni (i32 4))]>;
-def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4), ",
- [(PrintCallUni (i32 5))]>;
-def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
- [(PrintCallUni (i32 6))]>;
-def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
-"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
- [(PrintCallUni (i32 7))]>;
-def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
-!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
- ", retval5, retval6, retval7), "),
- [(PrintCallUni (i32 8))]>;
-
-def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
- [(PrintCallUni (i32 0))]>;
+let mayStore = 1 in {
+ class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
+ []>;
+
+ class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+ i32imm:$a, i32imm:$b),
+ !strconcat("st.param.v2", opstr,
+ "\t[param$a+$b], {{$val, $val2}};"),
+ []>;
+
+ class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a,
+ i32imm:$b),
+ !strconcat("st.param.v4", opstr,
+ "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+ []>;
+
+ class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+ !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
+ []>;
+
+ class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+ !strconcat("st.param.v2", opstr,
+ "\t[func_retval0+$a], {{$val, $val2}};"),
+ []>;
+
+ class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a),
+ !strconcat("st.param.v4", opstr,
+ "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ []>;
+}
+
+let isCall=1 in {
+ multiclass CALL<string OpcStr, SDNode OpNode> {
+ def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
+ def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
+ def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
+ def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
+ def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+ [(OpNode (i32 4))]>;
+ def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
+ [(OpNode (i32 5))]>;
+ def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5), "),
+ [(OpNode (i32 6))]>;
+ def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6), "),
+ [(OpNode (i32 7))]>;
+ def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6, retval7), "),
+ [(OpNode (i32 8))]>;
+ }
+}
+
+defm Call : CALL<"call", PrintCall>;
+defm CallUni : CALL<"call.uni", PrintCallUni>;
+
+// Convergent call instructions. These are identical to regular calls, except
+// they have the isConvergent bit set.
+let isConvergent=1 in {
+ defm ConvergentCall : CALL<"call", PrintConvergentCall>;
+ defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
+}
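For illustration only (CUDA source, not part of this patch; the kernel and helper names are invented): a call is convergent when the callee performs a warp- or block-synchronous operation, so later passes must not duplicate it or move it into divergent control flow.

__device__ void tile_barrier() {
  __syncthreads();                      // lowers to bar.sync 0, a convergent op
}

__global__ void smooth(float *tile) {
  tile[threadIdx.x] *= 0.5f;
  // The generated call here keeps its isConvergent marking: sinking it under a
  // condition such as "if (threadIdx.x == 0)" would change which threads reach
  // the barrier and could deadlock the block.
  tile_barrier();
  tile[threadIdx.x] += tile[(threadIdx.x + 1) % blockDim.x];
}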
def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
@@ -1911,39 +1923,15 @@ def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
-// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
-//def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
- Int32Regs:$val3, Int32Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
- Int16Regs:$val3, Int16Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamV4I8 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
- Int16Regs:$val3, Int16Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
-def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
+def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
+
+def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
-// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
-//def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
-def StoreParamV4F32 : NVPTXInst<(outs),
- (ins Float32Regs:$val, Float32Regs:$val2,
- Float32Regs:$val3, Float32Regs:$val4,
- i32imm:$a, i32imm:$b),
- "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
- []>;
-
+def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
@@ -1969,89 +1957,88 @@ def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
class CallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a, ",
- [(CallArg (i32 0), regclass:$a)]>;
+ NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+ [(CallArg (i32 0), regclass:$a)]>;
class LastCallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a",
- [(LastCallArg (i32 0), regclass:$a)]>;
+ NVPTXInst<(outs), (ins regclass:$a), "$a",
+ [(LastCallArg (i32 0), regclass:$a)]>;
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
-
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
-
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
[(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
- [(LastCallArg (i32 0), (i32 imm:$a))]>;
+ [(LastCallArg (i32 0), (i32 imm:$a))]>;
def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
[(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
- [(LastCallArg (i32 1), (i32 imm:$a))]>;
-
-def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
- "$addr, ",
- [(CallVoid (Wrapper tglobaladdr:$addr))]>;
-def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
- "$addr, ",
- [(CallVoid Int32Regs:$addr)]>;
-def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
- "$addr, ",
- [(CallVoid Int64Regs:$addr)]>;
-def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
- ", prototype_$val;",
- [(Prototype (i32 imm:$val))]>;
-
-def DeclareRetMemInst : NVPTXInst<(outs),
- (ins i32imm:$align, i32imm:$size, i32imm:$num),
- ".param .align $align .b8 retval$num[$size];",
- [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".param .b$size retval$num;",
- [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".reg .b$size retval$num;",
- [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
-
-def DeclareParamInst : NVPTXInst<(outs),
- (ins i32imm:$align, i32imm:$a, i32imm:$size),
- ".param .align $align .b8 param$a[$size];",
- [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
-def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".param .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
-def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".reg .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+ [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
+ [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
+ [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
+ [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
+ [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
+ ".param .align $align .b8 retval$num[$size];",
+ [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".param .b$size retval$num;",
+ [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".reg .b$size retval$num;",
+ [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
+ ".param .align $align .b8 param$a[$size];",
+ [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".param .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".reg .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
- NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
- [(set regclass:$dst, (MoveParam regclass:$src))]>;
+ NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ !strconcat("mov", asmstr, "\t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam regclass:$src))]>;
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
-def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.u16.u32\t$dst, $src;",
- [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamI16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.u16.u32\t$dst, $src;",
+ [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
class PseudoUseParamInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$src),
- "// Pseudo use of $src",
- [(PseudoUseParam regclass:$src)]>;
+ NVPTXInst<(outs), (ins regclass:$src),
+ "// Pseudo use of $src",
+ [(PseudoUseParam regclass:$src)]>;
def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
@@ -2064,254 +2051,278 @@ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
// Load / Store Handling
//
multiclass LD<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<(outs regclass:$dst),
+ def _avar : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr];"), []>;
- def _areg : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr];"), []>;
- def _areg_64 : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg_64 : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
- " \t$dst, [$addr];"), []>;
- def _ari : NVPTXInst<(outs regclass:$dst),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _ari : NVPTXInst<
+ (outs regclass:$dst),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr+$offset];"), []>;
- def _ari_64 : NVPTXInst<(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
- " \t$dst, [$addr+$offset];"), []>;
- def _asi : NVPTXInst<(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t$dst, [$addr+$offset];"), []>;
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _ari_64 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _asi : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
-defm LD_i8 : LD<Int16Regs>;
-defm LD_i16 : LD<Int16Regs>;
-defm LD_i32 : LD<Int32Regs>;
-defm LD_i64 : LD<Int64Regs>;
-defm LD_f32 : LD<Float32Regs>;
-defm LD_f64 : LD<Float64Regs>;
+ defm LD_i8 : LD<Int16Regs>;
+ defm LD_i16 : LD<Int16Regs>;
+ defm LD_i32 : LD<Int32Regs>;
+ defm LD_i64 : LD<Int64Regs>;
+ defm LD_f32 : LD<Float32Regs>;
+ defm LD_f64 : LD<Float64Regs>;
}
multiclass ST<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr], $src;"), []>;
- def _areg : NVPTXInst<(outs),
+ def _avar : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr], $src;"), []>;
- def _areg_64 : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
- "\t[$addr], $src;"), []>;
- def _ari : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _ari : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr+$offset], $src;"), []>;
- def _ari_64 : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
- "\t[$addr+$offset], $src;"), []>;
- def _asi : NVPTXInst<(outs),
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _asi : NVPTXInst<
+ (outs),
(ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
-!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
- " \t[$addr+$offset], $src;"), []>;
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
}
let mayStore=1, hasSideEffects=0 in {
-defm ST_i8 : ST<Int16Regs>;
-defm ST_i16 : ST<Int16Regs>;
-defm ST_i32 : ST<Int32Regs>;
-defm ST_i64 : ST<Int64Regs>;
-defm ST_f32 : ST<Float32Regs>;
-defm ST_f64 : ST<Float64Regs>;
+ defm ST_i8 : ST<Int16Regs>;
+ defm ST_i16 : ST<Int16Regs>;
+ defm ST_i32 : ST<Int32Regs>;
+ defm ST_i64 : ST<Int64Regs>;
+ defm ST_f32 : ST<Float32Regs>;
+ defm ST_f64 : ST<Float64Regs>;
}
-// The following is used only in and after vector elementizations.
-// Vector elementization happens at the machine instruction level, so the
-// following instruction
-// never appears in the DAG.
+// The following is used only in and after vector elementizations. Vector
+// elementization happens at the machine instruction level, so the following
+// instructions never appear in the DAG.
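As a minimal CUDA sketch (illustrative only, not part of the patch), this is the kind of source whose memory accesses end up as the v2/v4 forms defined below once they are vectorized at the machine-instruction level:

__global__ void copy4(const float4 *in, float4 *out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  out[i] = in[i];   // selected as ld.global.v4.f32 ... / st.global.v4.f32 ...
}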
multiclass LD_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ def _v2_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
- def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
- def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v4_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
- def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
- def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
- regclass:$dst3, regclass:$dst4),
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
- def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
- []>;
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
-defm LDV_i8 : LD_VEC<Int16Regs>;
-defm LDV_i16 : LD_VEC<Int16Regs>;
-defm LDV_i32 : LD_VEC<Int32Regs>;
-defm LDV_i64 : LD_VEC<Int64Regs>;
-defm LDV_f32 : LD_VEC<Float32Regs>;
-defm LDV_f64 : LD_VEC<Float64Regs>;
+ defm LDV_i8 : LD_VEC<Int16Regs>;
+ defm LDV_i16 : LD_VEC<Int16Regs>;
+ defm LDV_i32 : LD_VEC<Int32Regs>;
+ defm LDV_i64 : LD_VEC<Int64Regs>;
+ defm LDV_f32 : LD_VEC<Float32Regs>;
+ defm LDV_f64 : LD_VEC<Float64Regs>;
}
multiclass ST_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<(outs),
+ def _v2_avar : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_areg : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_areg_64 : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
- def _v2_ari : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_ari : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v2_ari_64 : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v2_asi : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_asi : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
- def _v4_avar : NVPTXInst<(outs),
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v4_avar : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_areg : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_areg_64 : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
- def _v4_ari : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
- def _v4_ari_64 : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
- def _v4_asi : NVPTXInst<(outs),
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_asi : NVPTXInst<
+ (outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
- []>;
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
+
let mayStore=1, hasSideEffects=0 in {
-defm STV_i8 : ST_VEC<Int16Regs>;
-defm STV_i16 : ST_VEC<Int16Regs>;
-defm STV_i32 : ST_VEC<Int32Regs>;
-defm STV_i64 : ST_VEC<Int64Regs>;
-defm STV_f32 : ST_VEC<Float32Regs>;
-defm STV_f64 : ST_VEC<Float64Regs>;
+ defm STV_i8 : ST_VEC<Int16Regs>;
+ defm STV_i16 : ST_VEC<Int16Regs>;
+ defm STV_i32 : ST_VEC<Int32Regs>;
+ defm STV_i64 : ST_VEC<Int64Regs>;
+ defm STV_f32 : ST_VEC<Float32Regs>;
+ defm STV_f64 : ST_VEC<Float64Regs>;
}
@@ -2525,64 +2536,52 @@ def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-// pack a set of smaller int registers to a larger int register
-def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
- []>;
-def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2),
- "mov.b32\t$d, {{$s1, $s2}};",
- []>;
-def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int32Regs:$s1, Int32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};",
- []>;
-def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
- (ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};",
- []>;
-
-// unpack a larger int register to a set of smaller int registers
-def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
- []>;
-def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
- (ins Int32Regs:$s),
- "mov.b32\t{{$d1, $d2}}, $s;",
- []>;
-def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;",
- []>;
-def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
- (ins Float64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;",
- []>;
+let hasSideEffects = 0 in {
+ // pack a set of smaller int registers to a larger int register
+ def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32\t$d, {{$s1, $s2}};", []>;
+ def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};", []>;
+
+ // unpack a larger int register to a set of smaller int registers
+ def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32\t{{$d1, $d2}}, $s;", []>;
+ def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;", []>;
+}
// Count leading zeros
-def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32\t$d, $a;",
- []>;
-def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64\t$d, $a;",
- []>;
+let hasSideEffects = 0 in {
+ def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "clz.b32\t$d, $a;", []>;
+ def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "clz.b64\t$d, $a;", []>;
+}
// 32-bit has a direct PTX instruction
-def : Pat<(ctlz Int32Regs:$a),
- (CLZr32 Int32Regs:$a)>;
-def : Pat<(ctlz_zero_undef Int32Regs:$a),
- (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
// to 64-bit to match the LLVM semantics
-def : Pat<(ctlz Int64Regs:$a),
- (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(ctlz_zero_undef Int64Regs:$a),
- (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back
// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
@@ -2592,34 +2591,27 @@ def : Pat<(ctlz Int16Regs:$a),
(SUBi16ri (CVT_u16_u32 (CLZr32
(CVT_u32_u16 Int16Regs:$a, CvtNONE)),
CvtNONE), 16)>;
-def : Pat<(ctlz_zero_undef Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32 (CLZr32
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE), 16)>;
// Population count
-def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32\t$d, $a;",
- []>;
-def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64\t$d, $a;",
- []>;
+let hasSideEffects = 0 in {
+ def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "popc.b32\t$d, $a;", []>;
+ def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "popc.b64\t$d, $a;", []>;
+}
// 32-bit has a direct PTX instruction
-def : Pat<(ctpop Int32Regs:$a),
- (POPCr32 Int32Regs:$a)>;
+def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
// to 64-bit to match the LLVM semantics
-def : Pat<(ctpop Int64Regs:$a),
- (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back
// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
// than 16 bits to store)
def : Pat<(ctpop Int16Regs:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE)>;
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
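To make the width mismatch in the comments above concrete, a CUDA sketch (not part of the patch): the device intrinsics behind clz/popc return a 32-bit int even for 64-bit operands, which is exactly why the 64-bit patterns wrap the result in a zero-extending CVT_u64_u32.

__global__ void bit_counts(long long x, int *out) {
  out[0] = __clz((int)x);                     // clz.b32
  out[1] = __clzll(x);                        // clz.b64, result is still 32-bit
  out[2] = __popc((unsigned int)x);           // popc.b32
  out[3] = __popcll((unsigned long long)x);   // popc.b64, result is still 32-bit
}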
// fround f64 -> f32
def : Pat<(f32 (fround Float64Regs:$a)),
@@ -2633,8 +2625,8 @@ def : Pat<(f64 (fextend Float32Regs:$a)),
def : Pat<(f64 (fextend Float32Regs:$a)),
(CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
-def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
//-----------------------------------
// Control-flow
@@ -2646,88 +2638,77 @@ let isTerminator=1 in {
let isBranch=1 in
def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@$a bra \t$target;",
- [(brcond Int1Regs:$a, bb:$target)]>;
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
let isBranch=1 in
def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@!$a bra \t$target;",
- []>;
+ "@!$a bra \t$target;", []>;
let isBranch=1, isBarrier=1 in
def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
- "bra.uni \t$target;",
- [(br bb:$target)]>;
+ "bra.uni \t$target;", [(br bb:$target)]>;
}
def : Pat<(brcond Int32Regs:$a, bb:$target),
(CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
-// conditional branch if
-// the target block is the next block so that the code can fall through to the
-// target block.
-// The invertion is done by 'xor condition, 1', which will be translated to
-// (setne condition, -1).
-// Since ptx supports '@!pred bra target', we should use it.
+// conditional branch if the target block is the next block so that the code
+// can fall through to the target block. The inversion is done by 'xor
+// condition, 1', which will be translated to (setne condition, -1). Since PTX
+// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
- (CBranchOther Int1Regs:$a, bb:$target)>;
+ (CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
-def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPSideEffect]>;
+ SDNPSideEffect]>;
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;
let isCall=1 in {
- def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
- "call \t$dst, (1);", []>;
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
}
-def : Pat<(call tglobaladdr:$dst),
- (CALL tglobaladdr:$dst)>;
-def : Pat<(call texternalsym:$dst),
- (CALL texternalsym:$dst)>;
+def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: NVPTXInst<outs, ins, asmstr, pattern>;
-// @TODO: We use some tricks here to emit curly braces. Can we clean this up
-// a bit without TableGen modifications?
-def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
- "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
- [(callseq_start timm:$amt)]>;
-def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\n\t//{{\n\t}}// Callseq End $amt1",
- [(callseq_end timm:$amt1, timm:$amt2)]>;
+def Callseq_Start :
+ NVPTXInst<(outs), (ins i32imm:$amt),
+ "\\{ // callseq $amt\n"
+ "\t.reg .b32 temp_param_reg;",
+ [(callseq_start timm:$amt)]>;
+def Callseq_End :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\} // callseq $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
// trap instruction
-
-def trapinst : NVPTXInst<(outs), (ins),
- "trap;",
- [(trap)]>;
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def CallPrototype
- : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallPrototype :
+ SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def ProtoIdent : Operand<i32> {
let PrintMethod = "printProtoIdent";
}
-def CALL_PROTOTYPE
- : NVPTXInst<(outs), (ins ProtoIdent:$ident),
- "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
-
+def CALL_PROTOTYPE :
+ NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
include "NVPTXIntrinsics.td"
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 14e51aa309ea..ed16afa24752 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -30,11 +30,9 @@ def immDouble1 : PatLeaf<(fpimm), [{
//-----------------------------------
-// Synchronization Functions
+// Synchronization and shuffle functions
//-----------------------------------
-def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
- "bar.sync \t0;",
- [(int_cuda_syncthreads)]>;
+let isConvergent = 1 in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
@@ -64,6 +62,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("}}", ""))))))),
[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+ [(int_nvvm_bar_sync imm:$i)]>;
+
+// shfl.{up,down,bfly,idx}.b32
+multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+ // The last two parameters to shfl can be regs or imms. ptxas is smart
+ // enough to inline constant registers, so strictly speaking we don't need to
+ // handle immediates here. But it's easy enough, and it makes our ptx more
+ // readable.
+ def reg : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
+
+ def imm1 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
+
+ def imm2 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
+
+ def imm3 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins regclass:$src, i32imm:$offset, i32imm:$mask),
+ !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+ [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+}
+
+defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
+defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
+defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
+defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
+defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
+defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
+defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
+defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
+
+} // isConvergent = 1
+
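A hedged CUDA sketch (pre-CUDA-9 __shfl_down form, not part of the patch) of code that exercises these patterns; once the loop is unrolled the offsets are literals, so instruction selection can pick the imm variants and the constants appear directly in the shfl.down.b32 operands.

__device__ int warp_reduce_sum(int v) {
  #pragma unroll
  for (int offset = 16; offset > 0; offset /= 2)
    v += __shfl_down(v, offset);   // shfl.down.b32 %r, %v, <offset>, 0x1f
  return v;
}

__global__ void block_sums(const int *in, int *out) {
  int v = warp_reduce_sum(in[blockIdx.x * blockDim.x + threadIdx.x]);
  if ((threadIdx.x & 31) == 0)     // lane 0 of each warp accumulates
    atomicAdd(out, v);
}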
//-----------------------------------
// Explicit Memory Fence Functions
@@ -1335,51 +1378,17 @@ defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
-//-----------------------------------
-// Read Special Registers
-//-----------------------------------
-class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
- NVPTXInst<(outs regclassOut:$dst), (ins),
- OpStr,
- [(set regclassOut:$dst, (IntOp))]>;
-
-def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_x>;
-def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_y>;
-def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_tid_z>;
-
-def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_x>;
-def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_y>;
-def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_ntid_z>;
-
-def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_x>;
-def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_y>;
-def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_ctaid_z>;
-
-def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_x>;
-def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_y>;
-def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
- int_nvvm_read_ptx_sreg_nctaid_z>;
-
-def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
- int_nvvm_read_ptx_sreg_warpsize>;
//-----------------------------------
// Support for ldu on sm_20 or later
//-----------------------------------
+// Don't annotate ldu instructions as mayLoad, as they load from memory that is
+// read-only in a kernel.
+
// Scalar
+
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr),
@@ -1475,6 +1484,10 @@ defm INT_PTX_LDU_G_v4f32_ELE
// Support for ldg on sm_35 or later
//-----------------------------------
+// Don't annotate ld.global.nc as mayLoad, because these loads go through the
+// non-coherent texture cache, and therefore the values read must be read-only
+// during the lifetime of the kernel.
+
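For illustration (CUDA, not part of the patch): __ldg() reads through the non-coherent cache and is the usual way to reach ld.global.nc from source; the read-only guarantee it relies on is the same reason these defs, like the ldu ones above, can omit mayLoad.

__global__ void scale(const float *__restrict__ in, float *out, float k, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = k * __ldg(&in[i]);   // ld.global.nc.f32 on sm_35+
}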
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr),
@@ -1836,54 +1849,61 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
(ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
Requires<[noHWROT32]> ;
-def GET_LO_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
- !strconcat("}}", "")))),
- []> ;
-
-def GET_HI_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
- !strconcat("}}", "")))),
- []> ;
-
-def PACK_TWO_INT32
- : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
- "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+let hasSideEffects = 0 in {
+ def GET_LO_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+ def GET_HI_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+}
+
+let hasSideEffects = 0 in {
+ def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+}
def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
(PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
(GET_LO_INT64 Int64Regs:$src))> ;
-// funnel shift, requires >= sm_32
-def SHF_L_WRAP_B32_IMM
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = 0 in {
+ def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_L_WRAP_B32_REG
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_R_WRAP_B32_IMM
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
-def SHF_R_WRAP_B32_REG
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
- Requires<[hasHWROT32]>;
+ def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+}
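A small CUDA illustration (not part of the patch): __funnelshift_l() maps onto shf.l.wrap.b32, and because the wrap form takes the shift amount mod 32, an out-of-range amount is harmless, consistent with hasSideEffects = 0 above.

__device__ unsigned int rotl32(unsigned int x, unsigned int amt) {
  // Funnel-shifting x:x left and keeping the high word is a rotate-left.
  return __funnelshift_l(x, x, amt);   // shf.l.wrap.b32 %r, %x, %x, %amt
}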
// HW version of rotate 64
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
@@ -6950,98 +6970,95 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
-
-//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
-
-// These intrinsics are handled to retain compatibility with the old backend.
-
-// PTX Special Purpose Register Accessor Intrinsics
-
-class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+class PTX_READ_SREG_R64<string regname, Intrinsic intop>
: NVPTXInst<(outs Int64Regs:$d), (ins),
!strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
[(set Int64Regs:$d, (intop))]>;
-class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+class PTX_READ_SREG_R32<string regname, Intrinsic intop>
: NVPTXInst<(outs Int32Regs:$d), (ins),
!strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
[(set Int32Regs:$d, (intop))]>;
// TODO Add read vector-version of special registers
-def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
- int_ptx_read_tid_x>;
-def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
- int_ptx_read_tid_y>;
-def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
- int_ptx_read_tid_z>;
-def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
- int_ptx_read_tid_w>;
-
-def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
- int_ptx_read_ntid_x>;
-def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
- int_ptx_read_ntid_y>;
-def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
- int_ptx_read_ntid_z>;
-def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
- int_ptx_read_ntid_w>;
-
-def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
- int_ptx_read_laneid>;
-def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
- int_ptx_read_warpid>;
-def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
- int_ptx_read_nwarpid>;
-
-def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
- int_ptx_read_ctaid_x>;
-def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
- int_ptx_read_ctaid_y>;
-def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
- int_ptx_read_ctaid_z>;
-def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
- int_ptx_read_ctaid_w>;
-
-def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
- int_ptx_read_nctaid_x>;
-def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
- int_ptx_read_nctaid_y>;
-def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
- int_ptx_read_nctaid_z>;
-def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
- int_ptx_read_nctaid_w>;
-
-def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
- int_ptx_read_smid>;
-def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
- int_ptx_read_nsmid>;
-def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
- int_ptx_read_gridid>;
-
-def PTX_READ_LANEMASK_EQ
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
-def PTX_READ_LANEMASK_LE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
-def PTX_READ_LANEMASK_LT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
-def PTX_READ_LANEMASK_GE
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
-def PTX_READ_LANEMASK_GT
- : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
-
-def PTX_READ_CLOCK
- : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
-def PTX_READ_CLOCK64
- : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
-
-def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
-def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
-def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
-def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
-
-// PTX Parallel Synchronization and Communication Intrinsics
-
-def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
- [(int_ptx_bar_sync imm:$i)]>;
+def INT_PTX_SREG_TID_X :
+ PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y :
+ PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z :
+ PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>;
+def INT_PTX_SREG_TID_W :
+ PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>;
+
+def INT_PTX_SREG_NTID_X :
+ PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y :
+ PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z :
+ PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>;
+def INT_PTX_SREG_NTID_W :
+ PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>;
+
+def INT_PTX_SREG_LANEID :
+ PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
+def INT_PTX_SREG_WARPID :
+ PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
+def INT_PTX_SREG_NWARPID :
+ PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>;
+
+def INT_PTX_SREG_CTAID_X :
+ PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y :
+ PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z :
+ PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>;
+def INT_PTX_SREG_CTAID_W :
+ PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>;
+
+def INT_PTX_SREG_NCTAID_X :
+ PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y :
+ PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z :
+ PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>;
+def INT_PTX_SREG_NCTAID_W :
+ PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>;
+
+def INT_PTX_SREG_SMID :
+ PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>;
+def INT_PTX_SREG_NSMID :
+ PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>;
+def INT_PTX_SREG_GRIDID :
+ PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>;
+
+def INT_PTX_SREG_LANEMASK_EQ :
+ PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>;
+def INT_PTX_SREG_LANEMASK_LE :
+ PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>;
+def INT_PTX_SREG_LANEMASK_LT :
+ PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>;
+def INT_PTX_SREG_LANEMASK_GE :
+ PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>;
+def INT_PTX_SREG_LANEMASK_GT :
+ PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
+
+def INT_PTX_SREG_CLOCK :
+ PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
+def INT_PTX_SREG_CLOCK64 :
+ PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
+
+def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
+def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
+def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
+def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
+
+// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
+// handle the constant.
+def INT_PTX_SREG_WARPSIZE :
+ NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
+ [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
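With this rename, clients emit the llvm.nvvm.read.ptx.sreg.* intrinsics instead of the old llvm.ptx.read.* ones; the defs above simply select them into the corresponding special-register moves. A minimal frontend-side sketch, assuming an existing Module *M and an IRBuilder<> Builder positioned inside a kernel (not part of the patch itself):

    // Sketch only: reads %tid.x through the renamed intrinsic.
    Function *TidX =
        Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x);
    Value *Tid = Builder.CreateCall(TidX);  // selected by INT_PTX_SREG_TID_X above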
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 624052e9b981..fa1a3ef3fe24 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -62,6 +62,9 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
// Main function for this pass.
// =============================================================================
bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
+ if (skipBasicBlock(BB))
+ return false;
+
bool Changed = false;
for (auto &I : BB) {
if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
index 6656077348a1..d162a283f745 100644
--- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -128,7 +128,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
"Lower kernel arguments (NVPTX)", false, false)
// =============================================================================
-// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
+// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 3c98b9febf85..84d5239ec096 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -15,8 +15,8 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-mcexpr"
-const NVPTXFloatMCExpr*
-NVPTXFloatMCExpr::create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+const NVPTXFloatMCExpr *
+NVPTXFloatMCExpr::create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx) {
return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
}
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 81a606d7535c..7f833c42fa8f 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -14,6 +14,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/MC/MCExpr.h"
+#include <utility>
namespace llvm {
@@ -30,21 +31,21 @@ private:
const APFloat Flt;
explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt)
- : Kind(Kind), Flt(Flt) {}
+ : Kind(Kind), Flt(std::move(Flt)) {}
public:
/// @name Construction
/// @{
- static const NVPTXFloatMCExpr *create(VariantKind Kind, APFloat Flt,
+ static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
MCContext &Ctx);
- static const NVPTXFloatMCExpr *createConstantFPSingle(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
}
- static const NVPTXFloatMCExpr *createConstantFPDouble(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPDouble(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
}
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index a61c291d233f..7d0cd553e03f 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -125,6 +125,9 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
}
bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
bool Changed = false;
// Loop over all of the basic blocks.
for (auto &MBB : MF) {
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 17019d7b364d..029e0097c5dc 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -55,11 +55,10 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
calculateFrameObjectOffsets(MF);
- for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
- MachineInstr *MI = I;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (!MI->getOperand(i).isFI())
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
continue;
TRI.eliminateFrameIndex(MI, 0, i, nullptr);
Modified = true;
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index 45a7309479ee..cad4f5668fdf 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -16,7 +16,6 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCSection.h"
-#include <vector>
namespace llvm {
/// Represents a section in PTX. PTX does not have sections. We create this class

diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index c7287719be5f..41670390c41b 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -19,8 +19,8 @@
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -42,7 +42,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
const NVPTXTargetMachine &TM;
NVPTXInstrInfo InstrInfo;
NVPTXTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
// NVPTX does not have any call stack frame, but needs an NVPTX-specific
// FrameLowering class because TargetFrameLowering is abstract.
@@ -65,7 +65,7 @@ public:
const NVPTXTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index aa931b134da9..b9f5919964c7 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -44,15 +45,23 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
using namespace llvm;
+static cl::opt<bool> UseInferAddressSpaces(
+ "nvptx-use-infer-addrspace", cl::init(false), cl::Hidden,
+ cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of "
+ "NVPTXFavorNonGenericAddrSpaces"));
+
namespace llvm {
+void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
@@ -67,10 +76,12 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// but it's very NVPTX-specific.
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeNVVMReflectPass(PR);
+ initializeNVVMIntrRangePass(PR);
initializeGenericToNVVMPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
initializeNVPTXAssignValidGlobalNamesPass(PR);
initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
+ initializeNVPTXInferAddressSpacesPass(PR);
initializeNVPTXLowerKernelArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
@@ -90,11 +101,15 @@ static std::string computeDataLayout(bool is64Bit) {
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64bit)
- : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM,
- CM, OL),
- is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()),
+ // The pic relocation model is used regardless of what the client has
+ // specified, as it is the only relocation model currently supported.
+ : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
+ Reloc::PIC_, CM, OL),
+ is64bit(is64bit),
+ TLOF(make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
@@ -110,7 +125,8 @@ void NVPTXTargetMachine32::anchor() {}
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
@@ -119,7 +135,8 @@ void NVPTXTargetMachine64::anchor() {}
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -143,14 +160,25 @@ public:
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
private:
- // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE.
+ // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
+ // function is only called in opt mode.
void addEarlyCSEOrGVNPass();
+
+ // Add passes that propagate special memory spaces.
+ void addAddressSpaceInferencePasses();
+
+ // Add passes that perform straight-line scalar optimizations.
+ void addStraightLineScalarOptimizationPasses();
};
} // end anonymous namespace
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
- NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
- return PassConfig;
+ return new NVPTXPassConfig(this, PM);
+}
+
+void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+ PM.add(createNVVMReflectPass());
+ PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
}
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
@@ -166,34 +194,23 @@ void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
addPass(createEarlyCSEPass());
}
-void NVPTXPassConfig::addIRPasses() {
- // The following passes are known to not play well with virtual regs hanging
- // around after register allocation (which in our case, is *all* registers).
- // We explicitly disable them here. We do, however, need some functionality
- // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
- // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
- disablePass(&PrologEpilogCodeInserterID);
- disablePass(&MachineCopyPropagationID);
- disablePass(&TailDuplicateID);
-
- addPass(createNVVMReflectPass());
- addPass(createNVPTXImageOptimizerPass());
- addPass(createNVPTXAssignValidGlobalNamesPass());
- addPass(createGenericToNVVMPass());
-
- // === Propagate special address spaces ===
- addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
+void NVPTXPassConfig::addAddressSpaceInferencePasses() {
// NVPTXLowerKernelArgs emits alloca for byval parameters which can often
// be eliminated by SROA.
addPass(createSROAPass());
addPass(createNVPTXLowerAllocaPass());
- addPass(createNVPTXFavorNonGenericAddrSpacesPass());
- // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
- // them unused. We could remove dead code in an ad-hoc manner, but that
- // requires manual work and might be error-prone.
- addPass(createDeadCodeEliminationPass());
+ if (UseInferAddressSpaces) {
+ addPass(createNVPTXInferAddressSpacesPass());
+ } else {
+ addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+ // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts and leaves
+ // them unused. We could remove dead code in an ad-hoc manner, but that
+ // requires manual work and might be error-prone.
+ addPass(createDeadCodeEliminationPass());
+ }
+}
- // === Straight-line scalar optimizations ===
+void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
@@ -208,6 +225,41 @@ void NVPTXPassConfig::addIRPasses() {
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addIRPasses() {
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&TailDuplicateID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
+ // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
+ // it here does nothing. But since we need it for correctness when lowering
+ // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
+ // call addEarlyAsPossiblePasses.
+ addPass(createNVVMReflectPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createNVPTXImageOptimizerPass());
+ addPass(createNVPTXAssignValidGlobalNamesPass());
+ addPass(createGenericToNVVMPass());
+
+ // NVPTXLowerKernelArgs is required for correctness and should be run right
+ // before the address space inference passes.
+ addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
+ if (getOptLevel() != CodeGenOpt::None) {
+ addAddressSpaceInferencePasses();
+ addStraightLineScalarOptimizationPasses();
+ }
// === LSR and other generic IR passes ===
TargetPassConfig::addIRPasses();
@@ -223,7 +275,8 @@ void NVPTXPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- addEarlyCSEOrGVNPass();
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
}
bool NVPTXPassConfig::addInstSelector() {
@@ -241,10 +294,12 @@ bool NVPTXPassConfig::addInstSelector() {
void NVPTXPassConfig::addPostRegAlloc() {
addPass(createNVPTXPrologEpilogPass(), false);
- // NVPTXPrologEpilogPass calculates frame object offset and replace frame
- // index with VRFrame register. NVPTXPeephole need to be run after that and
- // will replace VRFrame with VRFrameLocal when possible.
- addPass(createNVPTXPeephole());
+ if (getOptLevel() != CodeGenOpt::None) {
+ // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
+ // indices with the VRFrame register. NVPTXPeephole needs to run after that
+ // and will replace VRFrame with VRFrameLocal when possible.
+ addPass(createNVPTXPeephole());
+ }
}
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
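The new addEarlyAsPossiblePasses hook gives front ends a single place to schedule NVVMReflect and NVVMIntrRange before their own IR optimizations; addIRPasses above only re-adds NVVMReflect as a correctness fallback. A hedged driver-side sketch, assuming an already-constructed NVPTXTargetMachine *TM and a Module M:

    // Sketch only; any PassManagerBase works here.
    legacy::PassManager EarlyPM;
    TM->addEarlyAsPossiblePasses(EarlyPM);  // adds NVVMReflect + NVVMIntrRange
    EarlyPM.run(M);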
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index da7f62bf9d9b..78a053831772 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -16,9 +16,9 @@
#include "ManagedStringPool.h"
#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
namespace llvm {
@@ -36,8 +36,8 @@ class NVPTXTargetMachine : public LLVMTargetMachine {
public:
NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP,
- bool is64bit);
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OP, bool is64bit);
~NVPTXTargetMachine() override;
const NVPTXSubtarget *getSubtargetImpl(const Function &) const override {
@@ -61,6 +61,7 @@ public:
return TLOF.get();
}
+ void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
}; // NVPTXTargetMachine.
@@ -70,7 +71,7 @@ class NVPTXTargetMachine32 : public NVPTXTargetMachine {
public:
NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
@@ -79,7 +80,7 @@ class NVPTXTargetMachine64 : public NVPTXTargetMachine {
public:
NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 683b9a3f49f7..045fbb75a2a0 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -12,7 +12,6 @@
#include "NVPTXSection.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include <string>
namespace llvm {
class GlobalVariable;
@@ -87,7 +86,8 @@ public:
}
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C) const override {
+ const Constant *C,
+ unsigned &Align) const override {
return ReadOnlySection;
}
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 6e679dd0257c..580d345cc663 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -32,7 +32,7 @@ static bool readsThreadIndex(const IntrinsicInst *II) {
}
static bool readsLaneId(const IntrinsicInst *II) {
- return II->getIntrinsicID() == Intrinsic::ptx_read_laneid;
+ return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}
// Whether the given intrinsic is an atomic instruction in PTX.
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 0946a3293eec..08ffdf191151 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -52,6 +52,10 @@ public:
bool isSourceOfDivergence(const Value *V);
+ // Increase the inlining cost threshold by a factor of 5, reflecting that
+ // calls are particularly expensive in NVPTX.
+ unsigned getInliningThresholdMultiplier() { return 5; }
+
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
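getInliningThresholdMultiplier is a TTI hook, so the generic inliner picks it up without NVPTX-specific code; conceptually it scales the inline threshold before the cost comparison. An illustrative sketch only (BaseThreshold is a hypothetical name, not an LLVM API):

    // Roughly how a TTI client can apply the multiplier; 5x on NVPTX.
    int scaledThreshold(int BaseThreshold, const TargetTransformInfo &TTI) {
      return BaseThreshold * TTI.getInliningThresholdMultiplier();
    }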
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 578b466568ae..835e4b442039 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -99,7 +99,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
}
}
-bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
unsigned &retval) {
MutexGuard Guard(Lock);
const Module *m = gv->getParent();
@@ -113,7 +113,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
return true;
}
-bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
MutexGuard Guard(Lock);
const Module *m = gv->getParent();
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index a5262cb7412f..ec5bfc17afc7 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -30,8 +30,9 @@ namespace llvm {
void clearAnnotationCache(const llvm::Module *);
-bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
-bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, const std::string &,
+ unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, const std::string &,
std::vector<unsigned> &);
bool isTexture(const llvm::Value &);
diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp
new file mode 100644
index 000000000000..b9c02c431141
--- /dev/null
+++ b/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,148 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
+
+// Add !range metadata based on limits of given SM variant.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+ cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+class NVVMIntrRange : public FunctionPass {
+ private:
+ struct {
+ unsigned x, y, z;
+ } MaxBlockSize, MaxGridSize;
+
+ public:
+ static char ID;
+ NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+ NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
+ MaxBlockSize.x = 1024;
+ MaxBlockSize.y = 1024;
+ MaxBlockSize.z = 64;
+
+ MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+ MaxGridSize.y = 0xffff;
+ MaxGridSize.z = 0xffff;
+
+ initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &) override;
+};
+}
+
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+ return new NVVMIntrRange(SmVersion);
+}
+
+char NVVMIntrRange::ID = 0;
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
+ "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Adds the passed-in [Low,High) range information as metadata to the
+// passed-in call instruction.
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+ LLVMContext &Context = C->getParent()->getContext();
+ IntegerType *Int32Ty = Type::getInt32Ty(Context);
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
+ C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+ return true;
+}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+ // Go through the calls in this function.
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ CallInst *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+
+ if (Function *Callee = Call->getCalledFunction()) {
+ switch (Callee->getIntrinsicID()) {
+ // Index within block
+ case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+ Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+ Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+ Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
+ break;
+
+ // Block size
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+ Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+ Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+ Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
+ break;
+
+ // Index within grid
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+ Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+ Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+ Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
+ break;
+
+ // Grid size
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+ Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+ Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+ Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
+ break;
+
+ // Warp size is the constant 32.
+ case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+ Changed |= addRangeMetadata(32, 32+1, Call);
+ break;
+
+ // Lane ID is [0..warpsize)
+ case Intrinsic::nvvm_read_ptx_sreg_laneid:
+ Changed |= addRangeMetadata(0, 32, Call);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
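With the default sm_20 limits, a call to llvm.nvvm.read.ptx.sreg.tid.x is annotated with !range !{i32 0, i32 1024} and the warp-size intrinsic with !range !{i32 32, i32 33}. A minimal sketch of adding the pass by hand, assuming an existing legacy::PassManager PM (addEarlyAsPossiblePasses already does this for NVPTX):

    // Sketch only: for sm_35, MaxGridSize.x is 0x7fffffff, so ctaid.x
    // calls receive !range !{i32 0, i32 0x7fffffff}.
    PM.add(createNVVMIntrRangePass(/*SmVersion=*/35));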
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 20ab5db584d2..e0c35e7039e5 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -7,20 +7,26 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass replaces occurrences of __nvvm_reflect("string") with an
-// integer based on -nvvm-reflect-list string=<int> option given to this pass.
-// If an undefined string value is seen in a call to __nvvm_reflect("string"),
-// a default value of 0 will be used.
+// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
+// with an integer.
+//
+// We choose the value we use by looking, in this order, at:
+//
+// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
+// * the StringMap passed to the pass's constructor, and
+// * metadata in the module itself.
+//
+// If we see an unknown string, we replace its call with 0.
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
@@ -31,11 +37,8 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include <map>
#include <sstream>
#include <string>
-#include <vector>
-
#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
using namespace llvm;
@@ -45,31 +48,21 @@ using namespace llvm;
namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
-class NVVMReflect : public ModulePass {
+class NVVMReflect : public FunctionPass {
private:
StringMap<int> VarMap;
- typedef DenseMap<std::string, int>::iterator VarMapIter;
public:
static char ID;
- NVVMReflect() : ModulePass(ID) {
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- VarMap.clear();
- }
+ NVVMReflect() : NVVMReflect(StringMap<int>()) {}
NVVMReflect(const StringMap<int> &Mapping)
- : ModulePass(ID) {
+ : FunctionPass(ID), VarMap(Mapping) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
- I != E; ++I) {
- VarMap[(*I).getKey()] = (*I).getValue();
- }
+ setVarMap();
}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- bool runOnModule(Module &) override;
+ bool runOnFunction(Function &) override;
private:
bool handleFunction(Function *ReflectFunction);
@@ -77,11 +70,8 @@ private:
};
}
-ModulePass *llvm::createNVVMReflectPass() {
- return new NVVMReflect();
-}
-
-ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
+FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
return new NVVMReflect(Mapping);
}
@@ -123,30 +113,35 @@ void NVVMReflect::setVarMap() {
}
}
-bool NVVMReflect::handleFunction(Function *ReflectFunction) {
- // Validate _reflect function
- assert(ReflectFunction->isDeclaration() &&
- "_reflect function should not have a body");
- assert(ReflectFunction->getReturnType()->isIntegerTy() &&
- "_reflect's return type should be integer");
+bool NVVMReflect::runOnFunction(Function &F) {
+ if (!NVVMReflectEnabled)
+ return false;
+
+ if (F.getName() == NVVM_REFLECT_FUNCTION) {
+ assert(F.isDeclaration() && "_reflect function should not have a body");
+ assert(F.getReturnType()->isIntegerTy() &&
+ "_reflect's return type should be integer");
+ return false;
+ }
- std::vector<Instruction *> ToRemove;
+ SmallVector<Instruction *, 4> ToRemove;
- // Go through the uses of ReflectFunction in this Function.
- // Each of them should a CallInst with a ConstantArray argument.
- // First validate that. If the c-string corresponding to the
- // ConstantArray can be found successfully, see if it can be
- // found in VarMap. If so, replace the uses of CallInst with the
- // value found in VarMap. If not, replace the use with value 0.
+ // Go through the calls in this function. Each call to __nvvm_reflect or
+ // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
+ // First validate that. If the c-string corresponding to the ConstantArray can
+ // be found successfully, see if it can be found in VarMap. If so, replace the
+ // uses of the CallInst with the value found in VarMap. If not, replace the
+ // use with the value 0.
- // IR for __nvvm_reflect calls differs between CUDA versions:
+ // The IR for __nvvm_reflect calls differs between CUDA versions.
+ //
// CUDA 6.5 and earlier uses this sequence:
// %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
// (i8 addrspace(4)* getelementptr inbounds
// ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
// %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
//
- // Value returned by Sym->getOperand(0) is a Constant with a
+ // The value returned by Sym->getOperand(0) is a Constant with a
// ConstantDataSequential operand which can be converted to string and used
// for lookup.
//
@@ -157,31 +152,37 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
//
// In this case, we get a Constant with a GlobalVariable operand and we need
// to dig deeper to find its initializer with the string we'll use for lookup.
-
- for (User *U : ReflectFunction->users()) {
- assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
- CallInst *Reflect = cast<CallInst>(U);
-
- assert((Reflect->getNumOperands() == 2) &&
- "Only one operand expect for _reflect function");
- // In cuda, we will have an extra constant-to-generic conversion of
- // the string.
- const Value *Str = Reflect->getArgOperand(0);
- if (isa<CallInst>(Str)) {
- // CUDA path
- const CallInst *ConvCall = cast<CallInst>(Str);
+ for (Instruction &I : instructions(F)) {
+ CallInst *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ Function *Callee = Call->getCalledFunction();
+ if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION &&
+ Callee->getIntrinsicID() != Intrinsic::nvvm_reflect))
+ continue;
+
+ // FIXME: Improve error handling here and elsewhere in this pass.
+ assert(Call->getNumOperands() == 2 &&
+ "Wrong number of operands to __nvvm_reflect function");
+
+ // In cuda 6.5 and earlier, we will have an extra constant-to-generic
+ // conversion of the string.
+ const Value *Str = Call->getArgOperand(0);
+ if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
+ // FIXME: Add assertions about ConvCall.
Str = ConvCall->getArgOperand(0);
}
assert(isa<ConstantExpr>(Str) &&
- "Format of _reflect function not recognized");
+ "Format of __nvvm__reflect function not recognized");
const ConstantExpr *GEP = cast<ConstantExpr>(Str);
const Value *Sym = GEP->getOperand(0);
- assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+ assert(isa<Constant>(Sym) &&
+ "Format of __nvvm_reflect function not recognized");
const Value *Operand = cast<Constant>(Sym)->getOperand(0);
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
- // For CUDA-7.0 style __nvvm_reflect calls we need to find operand's
+ // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
// initializer.
assert(GV->hasInitializer() &&
"Format of _reflect function not recognized");
@@ -194,57 +195,26 @@ bool NVVMReflect::handleFunction(Function *ReflectFunction) {
assert(cast<ConstantDataSequential>(Operand)->isCString() &&
"Format of _reflect function not recognized");
- std::string ReflectArg =
- cast<ConstantDataSequential>(Operand)->getAsString();
-
+ StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
- if (VarMap.find(ReflectArg) != VarMap.end()) {
- ReflectVal = VarMap[ReflectArg];
- }
- Reflect->replaceAllUsesWith(
- ConstantInt::get(Reflect->getType(), ReflectVal));
- ToRemove.push_back(Reflect);
- }
- if (ToRemove.size() == 0)
- return false;
-
- for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
- ToRemove[i]->eraseFromParent();
- return true;
-}
-
-bool NVVMReflect::runOnModule(Module &M) {
- if (!NVVMReflectEnabled)
- return false;
-
- setVarMap();
-
-
- bool Res = false;
- std::string Name;
- Type *Tys[1];
- Type *I8Ty = Type::getInt8Ty(M.getContext());
- Function *ReflectFunction;
-
- // Check for standard overloaded versions of llvm.nvvm.reflect
-
- for (unsigned i = 0; i != 5; ++i) {
- Tys[0] = PointerType::get(I8Ty, i);
- Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
- ReflectFunction = M.getFunction(Name);
- if(ReflectFunction != 0) {
- Res |= handleFunction(ReflectFunction);
+ auto Iter = VarMap.find(ReflectArg);
+ if (Iter != VarMap.end())
+ ReflectVal = Iter->second;
+ else if (ReflectArg == "__CUDA_FTZ") {
+ // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+ if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+ F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
+ ReflectVal = Flag->getSExtValue();
}
+ Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
+ ToRemove.push_back(Call);
}
- ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
- // If reflect function is not used, then there will be
- // no entry in the module.
- if (ReflectFunction != 0)
- Res |= handleFunction(ReflectFunction);
+ for (Instruction *I : ToRemove)
+ I->eraseFromParent();
- return Res;
+ return ToRemove.size() > 0;
}
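Pipelines built outside addEarlyAsPossiblePasses can still seed the rewritten pass with an explicit mapping, and __CUDA_FTZ can instead come from the module flag consulted above. A hedged sketch, assuming an existing legacy::PassManager PM and Module M; the "__CUDA_ARCH" key is only an illustration of a string libdevice-style code might query, not something defined by this patch:

    // Sketch only.
    StringMap<int> ReflectVals;
    ReflectVals["__CUDA_ARCH"] = 350;           // hypothetical example key
    PM.add(createNVVMReflectPass(ReflectVals));
    // Alternatively, set the module flag the pass reads for __CUDA_FTZ:
    M.addModuleFlag(Module::Override, "nvvm-reflect-ftz", 1);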
diff --git a/lib/Target/NVPTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile
deleted file mode 100644
index 8622315b47b9..000000000000
--- a/lib/Target/NVPTX/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMNVPTXInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/AsmParser/Makefile b/lib/Target/PowerPC/AsmParser/Makefile
deleted file mode 100644
index c8a8915685ea..000000000000
--- a/lib/Target/PowerPC/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/PowerPC/AsmParser/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCAsmParser
-
-# Hack: we need to include 'main' PowerPC target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 220c70a48542..4181775fc6da 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions ---------===//
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPCTargetStreamer.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
@@ -22,11 +20,11 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -294,7 +292,7 @@ public:
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI), MII(MII) {
// Check for 64-bit vs. 32-bit pointer mode.
- Triple TheTriple(STI.getTargetTriple());
+ const Triple &TheTriple = STI.getTargetTriple();
IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
TheTriple.getArch() == Triple::ppc64le);
IsDarwin = TheTriple.isMacOSX();
@@ -378,6 +376,10 @@ public:
}
}
+ // Disable use of sized deallocation due to overallocation of PPCOperand
+ // objects in CreateTokenWithStringCopy.
+ void operator delete(void *p) { ::operator delete(p); }
+
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
@@ -392,13 +394,15 @@ public:
return Imm.Val;
}
int64_t getImmS16Context() const {
- assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+ assert((Kind == Immediate || Kind == ContextImmediate) &&
+ "Invalid access!");
if (Kind == Immediate)
return Imm.Val;
return static_cast<int16_t>(Imm.Val);
}
int64_t getImmU16Context() const {
- assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+ assert((Kind == Immediate || Kind == ContextImmediate) &&
+ "Invalid access!");
return Imm.Val;
}
@@ -443,7 +447,9 @@ public:
}
bool isToken() const override { return Kind == Token; }
- bool isImm() const override { return Kind == Immediate || Kind == Expression; }
+ bool isImm() const override {
+ return Kind == Immediate || Kind == Expression;
+ }
bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); }
bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); }
@@ -454,13 +460,15 @@ public:
bool isU6ImmX2() const { return Kind == Immediate &&
isUInt<6>(getImm()) &&
(getImm() & 1) == 0; }
+ bool isU7Imm() const { return Kind == Immediate && isUInt<7>(getImm()); }
bool isU7ImmX4() const { return Kind == Immediate &&
isUInt<7>(getImm()) &&
(getImm() & 3) == 0; }
+ bool isU8Imm() const { return Kind == Immediate && isUInt<8>(getImm()); }
bool isU8ImmX8() const { return Kind == Immediate &&
isUInt<8>(getImm()) &&
(getImm() & 7) == 0; }
-
+
bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); }
bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); }
bool isU16Imm() const {
@@ -488,6 +496,9 @@ public:
bool isS16ImmX4() const { return Kind == Expression ||
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 3) == 0); }
+ bool isS16ImmX16() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm()) &&
+ (getImm() & 15) == 0); }
bool isS17Imm() const {
switch (Kind) {
case Expression:
@@ -521,7 +532,9 @@ public:
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 3) == 0); }
bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
- bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); }
+ bool isVSRegNumber() const {
+ return Kind == Immediate && isUInt<6>(getImm());
+ }
bool isCCRegNumber() const { return (Kind == Expression
&& isUInt<3>(getExprCRVal())) ||
(Kind == Immediate
@@ -1190,6 +1203,29 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
}
break;
}
+ case PPC::CP_COPYx:
+ case PPC::CP_COPY_FIRST: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(PPC::CP_COPY);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_COPYx ? 0 : 1));
+
+ Inst = TmpInst;
+ break;
+ }
+ case PPC::CP_PASTEx :
+ case PPC::CP_PASTE_LAST: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode == PPC::CP_PASTEx ?
+ PPC::CP_PASTE : PPC::CP_PASTEo);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_PASTEx ? 0 : 1));
+
+ Inst = TmpInst;
+ break;
+ }
}
}
@@ -1454,8 +1490,8 @@ ParseExpression(const MCExpr *&EVal) {
/// This differs from the default "parseExpression" in that it handles detection
/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present,
/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
-/// syntax form so it is done here. TODO: Determine if there is merit in arranging
-/// for this to be done at a higher level.
+/// syntax form so it is done here. TODO: Determine if there is merit in
+/// arranging for this to be done at a higher level.
bool PPCAsmParser::
ParseDarwinExpression(const MCExpr *&EVal) {
MCAsmParser &Parser = getParser();
@@ -1674,7 +1710,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
while (getLexer().isNot(AsmToken::EndOfStatement) &&
getLexer().is(AsmToken::Comma)) {
// Consume the comma token
- getLexer().Lex();
+ Lex();
// Parse the next operand
if (ParseOperand(Operands))
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index c31ababafbe7..4842c3b7a656 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_target(PowerPCCodeGen
PPCBoolRetToInt.cpp
PPCAsmPrinter.cpp
PPCBranchSelector.cpp
+ PPCCCState.cpp
PPCCTRLoops.cpp
PPCHazardRecognizers.cpp
PPCInstrInfo.cpp
@@ -24,12 +25,12 @@ add_llvm_target(PowerPCCodeGen
PPCEarlyReturn.cpp
PPCFastISel.cpp
PPCFrameLowering.cpp
- PPCLoopDataPrefetch.cpp
PPCLoopPreIncPrep.cpp
PPCMCInstLower.cpp
PPCMachineFunctionInfo.cpp
PPCMIPeephole.cpp
PPCRegisterInfo.cpp
+ PPCQPXLoadSplat.cpp
PPCSubtarget.cpp
PPCTargetMachine.cpp
PPCTargetObjectFile.cpp
diff --git a/lib/Target/PowerPC/Disassembler/Makefile b/lib/Target/PowerPC/Disassembler/Makefile
deleted file mode 100644
index 86e3b4752207..000000000000
--- a/lib/Target/PowerPC/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===-- lib/Target/PowerPC/Disassembler/Makefile -----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCDisassembler
-
-# Hack: we need to include 'main' PPC target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 1fc84fb76551..6ea4fb1bfbc3 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "PPC.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -368,6 +368,21 @@ static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the memrix16 field (imm, reg), which has the low 12 bits as the
+ // 16-byte-aligned displacement and the next 5 bits as the register #.
+
+ uint64_t Base = Imm >> 12;
+ uint64_t Disp = Imm & 0xFFF;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp << 4)));
+ Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// The cr bit encoding is 0x80 >> cr_reg_num.
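A worked example of the memrix16 layout may help: the field packs the base register in the top 5 bits and a 16-byte-scaled displacement in the low 12 bits, so decoding shifts the displacement left by 4 and sign-extends from 16 bits. Sketch with arbitrarily chosen values:

    // Illustrative values for decodeMemRIX16Operands.
    uint64_t Imm  = 0x1F801;                      // raw memrix16 field
    uint64_t Base = Imm >> 12;                    // 0x1F -> GP0Regs[31]
    uint64_t Disp = Imm & 0xFFF;                  // 0x801
    int64_t  Off  = SignExtend64<16>(Disp << 4);  // 0x8010 -> -32752
    // getMemRIX16Encoding in PPCMCCodeEmitter.cpp performs the inverse:
    // ((Off >> 4) & 0xFFF) | (Base << 12).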
diff --git a/lib/Target/PowerPC/InstPrinter/Makefile b/lib/Target/PowerPC/InstPrinter/Makefile
deleted file mode 100644
index f097e84248ff..000000000000
--- a/lib/Target/PowerPC/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/PowerPC/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCAsmPrinter
-
-# Hack: we need to include 'main' powerpc target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 5e1d22789056..d9d9b4f180f7 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -136,17 +136,6 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- // For fast-isel, a COPY_TO_REGCLASS may survive this long. This is
- // used when converting a 32-bit float to a 64-bit float as part of
- // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()),
- // as otherwise we have problems with incorrect register classes
- // in machine instruction verification. For now, just avoid trying
- // to print it as such an instruction has no effect (a 32-bit float
- // in a register is already in 64-bit form, just with lower
- // precision). FIXME: Is there a better solution?
- if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS)
- return;
-
if (!printAliasInstr(MI, O))
printInstruction(MI, O);
printAnnotation(O, Annot);
@@ -299,6 +288,20 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
O << (unsigned int)Value;
}
+void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned int Value = MI->getOperand(OpNo).getImm();
+ assert(Value <= 127 && "Invalid u7imm argument!");
+ O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned int Value = MI->getOperand(OpNo).getImm();
+ assert(Value <= 255 && "Invalid u8imm argument!");
+ O << (unsigned int)Value;
+}
+
void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned short Value = MI->getOperand(OpNo).getImm();
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 53eb727d0b07..d0ffeff0247c 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -53,6 +53,8 @@ public:
void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/MCTargetDesc/Makefile b/lib/Target/PowerPC/MCTargetDesc/Makefile
deleted file mode 100644
index 9db66622cced..000000000000
--- a/lib/Target/PowerPC/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/PowerPC/TargetDesc/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index b6dd595ffb0e..9100ecb4aa37 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -168,8 +168,8 @@ public:
llvm_unreachable("relaxInstruction() unimplemented");
}
-
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
// FIXME.
llvm_unreachable("relaxInstruction() unimplemented");
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index dd994956870f..fd279c60f3f5 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -25,8 +25,8 @@ namespace {
PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
@@ -66,7 +66,7 @@ static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
llvm_unreachable("unknown PPCMCExpr kind");
}
-unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
@@ -186,7 +186,7 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_TOC_HA:
Type = ELF::R_PPC64_TOC16_HA;
break;
- case MCSymbolRefExpr::VK_PPC_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
Type = ELF::R_PPC_TPREL16;
break;
case MCSymbolRefExpr::VK_PPC_TPREL_LO:
@@ -210,7 +210,7 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
Type = ELF::R_PPC64_TPREL16_HIGHESTA;
break;
- case MCSymbolRefExpr::VK_PPC_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
Type = ELF::R_PPC64_DTPREL16;
break;
case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
@@ -319,13 +319,13 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_TOC_LO:
Type = ELF::R_PPC64_TOC16_LO_DS;
break;
- case MCSymbolRefExpr::VK_PPC_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
Type = ELF::R_PPC64_TPREL16_DS;
break;
case MCSymbolRefExpr::VK_PPC_TPREL_LO:
Type = ELF::R_PPC64_TPREL16_LO_DS;
break;
- case MCSymbolRefExpr::VK_PPC_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
Type = ELF::R_PPC64_DTPREL16_DS;
break;
case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
@@ -380,10 +380,10 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_DTPMOD:
Type = ELF::R_PPC64_DTPMOD64;
break;
- case MCSymbolRefExpr::VK_PPC_TPREL:
+ case MCSymbolRefExpr::VK_TPREL:
Type = ELF::R_PPC64_TPREL64;
break;
- case MCSymbolRefExpr::VK_PPC_DTPREL:
+ case MCSymbolRefExpr::VK_DTPREL:
Type = ELF::R_PPC64_DTPREL64;
break;
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index b7291561c75d..e7b2d8369f2f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -69,6 +69,9 @@ public:
unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -102,19 +105,16 @@ public:
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override {
- // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
- // It's just a nop to keep the register classes happy, so don't
- // generate anything.
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
- if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
- return;
uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
// Output the constant in big/little endian byte order.
unsigned Size = Desc.getSize();
switch (Size) {
+ case 0:
+ break;
case 4:
if (IsLittleEndian) {
support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
@@ -249,6 +249,19 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
return RegBits;
}
+unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode (imm, reg) as a memrix16, which has the low 12 bits as the
+ // displacement and the next 5 bits as the register #.
+ assert(MI.getOperand(OpNo+1).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ assert(MO.isImm());
+
+ return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits;
+}
unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 30f232a9a91e..c9074448fe45 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "InstPrinter/PPCInstPrinter.h"
#include "PPCMCAsmInfo.h"
#include "PPCTargetStreamer.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
@@ -87,24 +86,13 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createPPCMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
- if (RM == Reloc::Default) {
- if (TT.isOSDarwin())
- RM = Reloc::DynamicNoPIC;
- else
- RM = Reloc::Static;
- }
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
if (CM == CodeModel::Default) {
if (!TT.isOSDarwin() &&
(TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
CM = CodeModel::Medium;
}
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
namespace {
@@ -245,7 +233,7 @@ extern "C" void LLVMInitializePowerPCTargetMC() {
RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(*T, createPPCMCCodeGenInfo);
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index b54a0e1b86b1..1f38a8c947e7 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -79,7 +79,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
}
/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
-/// Outline based on PPCELFObjectWriter::GetRelocType().
+/// Outline based on PPCELFObjectWriter::getRelocType().
static unsigned getRelocType(const MCValue &Target,
const MCFixupKind FixupKind, // from
// Fixup.getKind()
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
deleted file mode 100644
index cf516f4e5ec9..000000000000
--- a/lib/Target/PowerPC/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMPowerPCCodeGen
-TARGET = PPC
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = PPCGenRegisterInfo.inc PPCGenAsmMatcher.inc \
- PPCGenAsmWriter.inc \
- PPCGenInstrInfo.inc PPCGenDAGISel.inc \
- PPCGenSubtargetInfo.inc PPCGenCallingConv.inc \
- PPCGenMCCodeEmitter.inc PPCGenFastISel.inc \
- PPCGenDisassemblerTables.inc
-
-DIRS = AsmParser Disassembler InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index a259ed3fd327..e01f49dce81e 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -16,7 +16,6 @@
#define LLVM_LIB_TARGET_POWERPC_PPC_H
#include "MCTargetDesc/PPCMCTargetDesc.h"
-#include <string>
// GCC #defines PPC on Linux but we use it as our namespace name
#undef PPC
@@ -34,7 +33,6 @@ namespace llvm {
#ifndef NDEBUG
FunctionPass *createPPCCTRLoopsVerify();
#endif
- FunctionPass *createPPCLoopDataPrefetchPass();
FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);
FunctionPass *createPPCTOCRegDepsPass();
FunctionPass *createPPCEarlyReturnPass();
@@ -43,6 +41,7 @@ namespace llvm {
FunctionPass *createPPCVSXSwapRemovalPass();
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
+ FunctionPass *createPPCQPXLoadSplatPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
@@ -60,13 +59,12 @@ namespace llvm {
//===------------------------------------------------------------------===//
// PPC Specific MachineOperand flags.
MO_NO_FLAG,
-
- /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the "FOO$stub" or "FOO@plt" symbol. This is
- /// used for calls and jumps to external functions on Tiger and earlier, and
+
+ /// On a symbol operand "FOO", this indicates that the reference is actually
+ /// to "FOO@plt". This is used for calls and jumps to external functions
/// for PIC calls on Linux and ELF systems.
- MO_PLT_OR_STUB = 1,
-
+ MO_PLT = 1,
+
/// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
/// the function's picbase, e.g. lo16(symbol-picbase).
MO_PIC_FLAG = 2,
@@ -74,7 +72,7 @@ namespace llvm {
/// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
/// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
MO_NLP_FLAG = 4,
-
+
/// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
/// symbol with hidden visibility. This causes a different kind of
/// non-lazy-pointer to be generated.
@@ -93,11 +91,11 @@ namespace llvm {
/// These values identify relocations on immediates folded
/// into memory operations.
MO_DTPREL_LO = 5 << 4,
- MO_TLSLD_LO = 6 << 4,
- MO_TOC_LO = 7 << 4,
+ MO_TLSLD_LO = 6 << 4,
+ MO_TOC_LO = 7 << 4,
// Symbol for VK_PPC_TLS fixup attached to an ADD instruction
- MO_TLS = 8 << 4
+ MO_TLS = 8 << 4
};
} // end namespace PPCII
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index b03be12cfd97..b40b530f4c5d 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -37,16 +37,19 @@ def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
"PPC::DIR_E500mc", "">;
-def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
+def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
"PPC::DIR_E5500", "">;
def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
-def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
+def DirectivePwr5x
+ : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
-def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
+def DirectivePwr6x
+ : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
+def DirectivePwr9: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR9", "">;
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
@@ -86,8 +89,6 @@ def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
"Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions">;
def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true",
"Enable the isel instruction">;
-def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "true",
- "Enable the popcnt[dw] instructions">;
def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true",
"Enable the bpermd instruction">;
def FeatureExtDiv : SubtargetFeature<"extdiv", "HasExtDiv", "true",
@@ -145,24 +146,43 @@ def FeatureFloat128 :
SubtargetFeature<"float128", "HasFloat128", "true",
"Enable the __float128 data type for IEEE-754R Binary128.",
[FeatureVSX]>;
+def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD",
+ "POPCNTD_Fast",
+ "Enable the popcnt[dw] instructions">;
+// Note that for the a2/a2q processor models we should not use popcnt[dw] by
+// default. These processors do support the instructions, but they're
+// microcoded, and the software emulation is about twice as fast.
+def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
+ "POPCNTD_Slow",
+ "Has slow popcnt[dw] instructions">;
def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
"Treat vector data stream cache control instructions as deprecated">;
-/* Since new processors generally contain a superset of features of those that
- came before them, the idea is to make implementations of new processors
- less error prone and easier to read.
- Namely:
- list<SubtargetFeature> Power8FeatureList = ...
- list<SubtargetFeature> FutureProcessorSpecificFeatureList =
- [ features that Power8 does not support ]
- list<SubtargetFeature> FutureProcessorFeatureList =
- !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
+ "true",
+ "Enable instructions added in ISA 3.0.">;
+def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
+ "Enable POWER9 Altivec instructions",
+ [FeatureISA3_0, FeatureP8Altivec]>;
+def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
+ "Enable POWER9 vector instructions",
+ [FeatureISA3_0, FeatureP8Vector,
+ FeatureP9Altivec]>;
- Makes it explicit and obvious what is new in FutureProcesor vs. Power8 as
- well as providing a single point of definition if the feature set will be
- used elsewhere.
-*/
+// Since new processors generally contain a superset of features of those that
+// came before them, the idea is to make implementations of new processors
+// less error prone and easier to read.
+// Namely:
+// list<SubtargetFeature> Power8FeatureList = ...
+// list<SubtargetFeature> FutureProcessorSpecificFeatureList =
+// [ features that Power8 does not support ]
+// list<SubtargetFeature> FutureProcessorFeatureList =
+// !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+
+// Makes it explicit and obvious what is new in FutureProcessor vs. Power8 as
+// well as providing a single point of definition if the feature set will be
+// used elsewhere.
def ProcessorFeatures {
list<SubtargetFeature> Power7FeatureList =
[DirectivePwr7, FeatureAltivec, FeatureVSX,
@@ -180,6 +200,10 @@ def ProcessorFeatures {
FeatureFusion];
list<SubtargetFeature> Power8FeatureList =
!listconcat(Power7FeatureList, Power8SpecificFeatures);
+ list<SubtargetFeature> Power9SpecificFeatures =
+ [FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
+ list<SubtargetFeature> Power9FeatureList =
+ !listconcat(Power8FeatureList, Power9SpecificFeatures);
}
// Note: Future features to add when support is extended to more
@@ -331,16 +355,17 @@ def : ProcessorModel<"a2", PPCA2Model,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
- FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit
- /*, Feature64BitRegs */, FeatureMFTB]>;
+ FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>;
def : ProcessorModel<"a2q", PPCA2Model,
[DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
- FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit
- /*, Feature64BitRegs */, FeatureQPX, FeatureMFTB]>;
+ FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */, FeatureQPX,
+ FeatureMFTB]>;
def : ProcessorModel<"pwr3", G5Model,
[DirectivePwr3, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
@@ -377,6 +402,8 @@ def : ProcessorModel<"pwr6x", G5Model,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>;
def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>;
+// FIXME: Same as P8 until the POWER9 scheduling info is available
+def : ProcessorModel<"pwr9", P8Model, ProcessorFeatures.Power9FeatureList>;
def : Processor<"ppc", G3Itineraries, [Directive32, FeatureMFTB]>;
def : ProcessorModel<"ppc64", G5Model,
[Directive64, FeatureAltivec,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ec354c209ca0..76c52ab6cf1e 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -25,7 +25,6 @@
#include "PPCTargetMachine.h"
#include "PPCTargetStreamer.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -50,7 +49,6 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
@@ -82,6 +80,12 @@ public:
MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+ bool doInitialization(Module &M) override {
+ if (!TOC.empty())
+ TOC.clear();
+ return AsmPrinter::doInitialization(M);
+ }
+
void EmitInstruction(const MachineInstr *MI) override;
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
@@ -138,8 +142,6 @@ public:
bool doFinalization(Module &M) override;
void EmitStartOfAsmFile(Module &M) override;
-
- void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
};
} // end of anonymous namespace
@@ -195,29 +197,14 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
MCSymbol *SymToPrint;
// External or weakly linked global variables need non-lazily-resolved stubs
- if (TM.getRelocationModel() != Reloc::Static &&
- !GV->isStrongDefinitionForLinker()) {
- if (!GV->hasHiddenVisibility()) {
- SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
- SymToPrint);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
- } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
- GV->hasAvailableExternallyLinkage()) {
- SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
-
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
- SymToPrint);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
- } else {
- SymToPrint = getSymbol(GV);
- }
+ if (Subtarget->hasLazyResolverStub(GV)) {
+ SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
+ SymToPrint);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
} else {
SymToPrint = getSymbol(GV);
}
@@ -470,7 +457,7 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
"GETtls[ld]ADDR[32] must read GPR3");
if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
- TM.getRelocationModel() == Reloc::PIC_)
+ isPositionIndependent())
Kind = MCSymbolRefExpr::VK_PLT;
const MCSymbolRefExpr *TlsRef =
MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
@@ -597,7 +584,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isBlockAddress())
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
- if (PL == PICLevel::Small) {
+ if (PL == PICLevel::SmallPIC) {
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
OutContext);
@@ -1038,10 +1025,10 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
}
if (static_cast<const PPCTargetMachine &>(TM).isPPC64() ||
- TM.getRelocationModel() != Reloc::PIC_)
+ !isPositionIndependent())
return AsmPrinter::EmitStartOfAsmFile(M);
- if (M.getPICLevel() == PICLevel::Small)
+ if (M.getPICLevel() == PICLevel::SmallPIC)
return AsmPrinter::EmitStartOfAsmFile(M);
OutStreamer->SwitchSection(OutContext.getELFSection(
@@ -1067,8 +1054,8 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
// linux/ppc32 - Normal entry label.
if (!Subtarget->isPPC64() &&
- (TM.getRelocationModel() != Reloc::PIC_ ||
- MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small))
+ (!isPositionIndependent() ||
+ MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC))
return AsmPrinter::EmitFunctionEntryLabel();
if (!Subtarget->isPPC64()) {
@@ -1302,8 +1289,10 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
"power6",
"power6x",
"power7",
+ // FIXME: why is power8 missing here?
"ppc64",
- "ppc64le"
+ "ppc64le",
+ "power9"
};
// Get the numerically largest directive.
@@ -1350,161 +1339,6 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
}
-static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) {
- // Remove $stub suffix, add $lazy_ptr.
- StringRef NoStub = Sym->getName().substr(0, Sym->getName().size()-5);
- return Ctx.getOrCreateSymbol(NoStub + "$lazy_ptr");
-}
-
-static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
- // Add $tmp suffix to $stub, yielding $stub$tmp.
- return Ctx.getOrCreateSymbol(Sym->getName() + "$tmp");
-}
-
-void PPCDarwinAsmPrinter::
-EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
- bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
-
- // Construct a local MCSubtargetInfo and shadow EmitToStreamer here.
- // This is because the MachineFunction won't exist (but have not yet been
- // freed) and since we're at the global level we can use the default
- // constructed subtarget.
- std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
- TM.getTargetTriple().str(), TM.getTargetCPU(),
- TM.getTargetFeatureString()));
- auto EmitToStreamer = [&STI] (MCStreamer &S, const MCInst &Inst) {
- S.EmitInstruction(Inst, *STI);
- };
-
- const TargetLoweringObjectFileMachO &TLOFMacho =
- static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
-
- // .lazy_symbol_pointer
- MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
-
- // Output stubs for dynamically-linked functions
- if (TM.getRelocationModel() == Reloc::PIC_) {
- MCSection *StubSection = OutContext.getMachOSection(
- "__TEXT", "__picsymbolstub1",
- MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, 32,
- SectionKind::getText());
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- OutStreamer->SwitchSection(StubSection);
- EmitAlignment(4);
-
- MCSymbol *Stub = Stubs[i].first;
- MCSymbol *RawSym = Stubs[i].second.getPointer();
- MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
- MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
-
- OutStreamer->EmitLabel(Stub);
- OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
-
- const MCExpr *Anon = MCSymbolRefExpr::create(AnonSymbol, OutContext);
- const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext);
- const MCExpr *Sub =
- MCBinaryExpr::createSub(LazyPtrExpr, Anon, OutContext);
-
- // mflr r0
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
- // bcl 20, 31, AnonSymbol
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon));
- OutStreamer->EmitLabel(AnonSymbol);
- // mflr r11
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
- // addis r11, r11, ha16(LazyPtr - AnonSymbol)
- const MCExpr *SubHa16 = PPCMCExpr::createHa(Sub, true, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
- .addReg(PPC::R11)
- .addReg(PPC::R11)
- .addExpr(SubHa16));
- // mtlr r0
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
-
- // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
- // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
- const MCExpr *SubLo16 = PPCMCExpr::createLo(Sub, true, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
- .addReg(PPC::R12)
- .addExpr(SubLo16).addExpr(SubLo16)
- .addReg(PPC::R11));
- // mtctr r12
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
- // bctr
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTR));
-
- OutStreamer->SwitchSection(LSPSection);
- OutStreamer->EmitLabel(LazyPtr);
- OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
-
- MCSymbol *DyldStubBindingHelper =
- OutContext.getOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
- if (isPPC64) {
- // .quad dyld_stub_binding_helper
- OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 8);
- } else {
- // .long dyld_stub_binding_helper
- OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4);
- }
- }
- OutStreamer->AddBlankLine();
- return;
- }
-
- MCSection *StubSection = OutContext.getMachOSection(
- "__TEXT", "__symbol_stub1",
- MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, 16,
- SectionKind::getText());
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- MCSymbol *Stub = Stubs[i].first;
- MCSymbol *RawSym = Stubs[i].second.getPointer();
- MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
- const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext);
-
- OutStreamer->SwitchSection(StubSection);
- EmitAlignment(4);
- OutStreamer->EmitLabel(Stub);
- OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
-
- // lis r11, ha16(LazyPtr)
- const MCExpr *LazyPtrHa16 =
- PPCMCExpr::createHa(LazyPtrExpr, true, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LIS)
- .addReg(PPC::R11)
- .addExpr(LazyPtrHa16));
-
- // ldu r12, lo16(LazyPtr)(r11)
- // lwzu r12, lo16(LazyPtr)(r11)
- const MCExpr *LazyPtrLo16 =
- PPCMCExpr::createLo(LazyPtrExpr, true, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
- .addReg(PPC::R12)
- .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
- .addReg(PPC::R11));
-
- // mtctr r12
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
- // bctr
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTR));
-
- OutStreamer->SwitchSection(LSPSection);
- OutStreamer->EmitLabel(LazyPtr);
- OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
-
- MCSymbol *DyldStubBindingHelper =
- OutContext.getOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
- if (isPPC64) {
- // .quad dyld_stub_binding_helper
- OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 8);
- } else {
- // .long dyld_stub_binding_helper
- OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4);
- }
- }
-
- OutStreamer->AddBlankLine();
-}
-
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
@@ -1514,10 +1348,6 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
MachineModuleInfoMachO &MMIMacho =
MMI->getObjFileInfo<MachineModuleInfoMachO>();
- MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
- if (!Stubs.empty())
- EmitFunctionStubs(Stubs);
-
if (MAI->doesSupportExceptionHandling() && MMI) {
// Add the (possibly multiple) personalities to the set of global values.
// Only referenced functions get into the Personalities list.
@@ -1534,7 +1364,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
}
// Output stubs for dynamically-linked functions.
- Stubs = MMIMacho.GetGVStubList();
+ MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
// Output macho stubs for external and common global variables.
if (!Stubs.empty()) {
@@ -1568,25 +1398,6 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
OutStreamer->AddBlankLine();
}
- Stubs = MMIMacho.GetHiddenGVStubList();
- if (!Stubs.empty()) {
- OutStreamer->SwitchSection(getObjFileLowering().getDataSection());
- EmitAlignment(isPPC64 ? 3 : 2);
-
- for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
- // L_foo$stub:
- OutStreamer->EmitLabel(Stubs[i].first);
- // .long _foo
- OutStreamer->EmitValue(MCSymbolRefExpr::
- create(Stubs[i].second.getPointer(),
- OutContext),
- isPPC64 ? 8 : 4/*size*/);
- }
-
- Stubs.clear();
- OutStreamer->AddBlankLine();
- }
-
// Funny Darwin hack: This flag tells the linker that no global symbols
// contain code that falls through to other global symbols (e.g. the obvious
// implementation of multiple entry points). If this doesn't occur, the
diff --git a/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 7920240bc2b9..bfb4d8756901 100644
--- a/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -119,7 +119,7 @@ class PPCBoolRetToInt : public FunctionPass {
Promotable.insert(P);
SmallVector<const PHINode *, 8> ToRemove;
- for (const auto &P : Promotable) {
+ for (const PHINode *P : Promotable) {
// Condition 2 and 3
auto IsValidUser = [] (const Value *V) -> bool {
return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
@@ -146,7 +146,7 @@ class PPCBoolRetToInt : public FunctionPass {
Promotable.erase(User);
ToRemove.clear();
- for (const auto &P : Promotable) {
+ for (const PHINode *P : Promotable) {
// Condition 4 and 5
const auto &Users = P->users();
const auto &Operands = P->operands();
@@ -168,6 +168,9 @@ class PPCBoolRetToInt : public FunctionPass {
}
bool runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
B2IMap Bool2IntMap;
bool Changed = false;
@@ -199,11 +202,11 @@ class PPCBoolRetToInt : public FunctionPass {
// Presently, we only know how to handle PHINode, Constant, and Arguments.
// Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension
// could also be handled in the future.
- for (const auto &V : Defs)
+ for (Value *V : Defs)
if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V))
return false;
- for (const auto &V : Defs)
+ for (Value *V : Defs)
if (const PHINode *P = dyn_cast<PHINode>(V))
if (!PromotablePHINodes.count(P))
return false;
@@ -214,7 +217,7 @@ class PPCBoolRetToInt : public FunctionPass {
++NumBoolCallPromotion;
++NumBoolToIntPromotion;
- for (const auto &V : Defs)
+ for (Value *V : Defs)
if (!BoolToIntMap.count(V))
BoolToIntMap[V] = translate(V);
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 73a5305197ad..4d63c5b5703c 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -46,6 +46,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "PowerPC Branch Selector";
}
@@ -102,10 +107,9 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
}
unsigned BlockSize = 0;
- for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
- MBBI != EE; ++MBBI)
- BlockSize += TII->GetInstSizeInBytes(MBBI);
-
+ for (MachineInstr &MI : *MBB)
+ BlockSize += TII->GetInstSizeInBytes(MI);
+
BlockSizes[MBB->getNumber()] = BlockSize;
FuncSize += BlockSize;
}
@@ -151,7 +155,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
Dest = I->getOperand(0).getMBB();
if (!Dest) {
- MBBStartOffset += TII->GetInstSizeInBytes(I);
+ MBBStartOffset += TII->GetInstSizeInBytes(*I);
continue;
}
@@ -234,4 +238,3 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
BlockSizes.clear();
return true;
}
-
diff --git a/lib/Target/PowerPC/PPCCCState.cpp b/lib/Target/PowerPC/PPCCCState.cpp
new file mode 100644
index 000000000000..5510a95430f5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCCState.cpp
@@ -0,0 +1,36 @@
+//===---- PPCCCState.cpp - CCState with PowerPC specific extensions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCCCState.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/Module.h"
+using namespace llvm;
+
+// Identify lowered values that originated from ppcf128 arguments and record
+// this.
+void PPCCCState::PreAnalyzeCallOperands(
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ for (const auto &I : Outs) {
+ if (I.ArgVT == llvm::MVT::ppcf128)
+ OriginalArgWasPPCF128.push_back(true);
+ else
+ OriginalArgWasPPCF128.push_back(false);
+ }
+}
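+
+// Minimal usage sketch (variable names assumed): the state is primed before
+// the regular calling-convention analysis so that the custom handler
+// registered in PPCCallingConv.td can query WasOriginalArgPPCF128(ValNo):
+//
+//   PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+//   CCInfo.PreAnalyzeCallOperands(Outs);
+//   CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);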
+
+void PPCCCState::PreAnalyzeFormalArguments(
+ const SmallVectorImpl<ISD::InputArg> &Ins) {
+ for (const auto &I : Ins) {
+ if (I.ArgVT == llvm::MVT::ppcf128) {
+ OriginalArgWasPPCF128.push_back(true);
+ } else {
+ OriginalArgWasPPCF128.push_back(false);
+ }
+ }
+} \ No newline at end of file
diff --git a/lib/Target/PowerPC/PPCCCState.h b/lib/Target/PowerPC/PPCCCState.h
new file mode 100644
index 000000000000..9be9f11dbea3
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCCState.h
@@ -0,0 +1,42 @@
+//===---- PPCCCState.h - CCState with PowerPC specific extensions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCCCSTATE_H
+#define PPCCCSTATE_H
+
+#include "PPCISelLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+
+class PPCCCState : public CCState {
+public:
+
+ void
+ PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs);
+ void
+ PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+private:
+
+ // Records whether the value has been lowered from a ppcf128.
+ SmallVector<bool, 4> OriginalArgWasPPCF128;
+
+public:
+ PPCCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+ : CCState(CC, isVarArg, MF, locs, C) {}
+
+ bool WasOriginalArgPPCF128(unsigned ValNo) { return OriginalArgWasPPCF128[ValNo]; }
+ void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
+};
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index b6ac4d54d4c7..875226635917 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -54,9 +54,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#endif
-#include <algorithm>
-#include <vector>
-
using namespace llvm;
#define DEBUG_TYPE "ctrloops"
@@ -169,6 +166,9 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
#endif // NDEBUG
bool PPCCTRLoops::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -245,7 +245,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
if (Function *F = CI->getCalledFunction()) {
// Most intrinsics don't become function calls, but some might.
// sin, cos, exp and log are always calls.
- unsigned Opcode;
+ unsigned Opcode = 0;
if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
switch (F->getIntrinsicID()) {
default: continue;
@@ -305,6 +305,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
}
}
@@ -364,8 +366,18 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
case LibFunc::truncf:
case LibFunc::truncl:
Opcode = ISD::FTRUNC; break;
+ case LibFunc::fmin:
+ case LibFunc::fminf:
+ case LibFunc::fminl:
+ Opcode = ISD::FMINNUM; break;
+ case LibFunc::fmax:
+ case LibFunc::fmaxf:
+ case LibFunc::fmaxl:
+ Opcode = ISD::FMAXNUM; break;
}
+ }
+ if (Opcode) {
auto &DL = CI->getModule()->getDataLayout();
MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
true);
@@ -422,6 +434,25 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
return true;
}
+
+ if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) {
+ switch(J->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FCmp:
+ return true;
+ }
+ }
+
for (Value *Operand : J->operands())
if (memAddrUsesCTR(TM, Operand))
return true;
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 5bc9124f8085..53d2f77ff918 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -23,6 +23,9 @@ class CCIfNotSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).",
F),
A>;
+class CCIfOrigArgWasNotPPCF128<CCAction A>
+ : CCIf<"!static_cast<PPCCCState *>(&State)->WasOriginalArgPPCF128(ValNo)",
+ A>;
//===----------------------------------------------------------------------===//
// Return Value Calling Convention
@@ -109,7 +112,7 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfType<[i8], CCPromoteToType<i64>>,
CCIfType<[i16], CCPromoteToType<i64>>,
CCIfType<[i32], CCPromoteToType<i64>>,
- CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
@@ -131,7 +134,14 @@ def CC_PPC32_SVR4_Common : CallingConv<[
// The ABI requires i64 to be passed in two adjacent registers with the first
// register having an odd register number.
- CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
+ CCIfType<[i32],
+ CCIfSplit<CCIfSubtarget<"useSoftFloat()",
+ CCIfOrigArgWasNotPPCF128<
+ CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>>,
+
+ CCIfType<[i32],
+ CCIfSplit<CCIfNotSubtarget<"useSoftFloat()",
+ CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>,
// The 'nest' parameter, if any, is passed in R11.
CCIfNest<CCAssignToReg<[R11]>>,
@@ -243,12 +253,23 @@ def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
F27, F28, F29, F30, F31, CR2, CR3, CR4
)>;
+// CSRs that are handled by prologue, epilogue.
+def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>;
+
+def CSR_SVR464_ViaCopy : CalleeSavedRegs<(add CSR_SVR464)>;
+
def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
+def CSR_SVR464_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_Altivec)>;
+
def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
+def CSR_SVR464_R2_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2)>;
+
def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>;
+
def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 7cb1bb54c725..fcd2f50e1e3d 100644
--- a/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -12,10 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
-#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
@@ -84,7 +83,7 @@ protected:
// This is an unconditional branch to the return. Replace the
// branch with a blr.
BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
- .copyImplicitOps(I);
+ .copyImplicitOps(*I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -98,7 +97,7 @@ protected:
BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
.addImm(J->getOperand(0).getImm())
.addReg(J->getOperand(1).getReg())
- .copyImplicitOps(I);
+ .copyImplicitOps(*I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -113,7 +112,7 @@ protected:
**PI, J, J->getDebugLoc(),
TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
.addReg(J->getOperand(0).getReg())
- .copyImplicitOps(I);
+ .copyImplicitOps(*I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -174,6 +173,9 @@ protected:
public:
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
TII = MF.getSubtarget().getInstrInfo();
bool Changed = false;
@@ -192,6 +194,11 @@ public:
return Changed;
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -204,4 +211,3 @@ INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
char PPCEarlyReturn::ID = 0;
FunctionPass*
llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
-
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 16dcd468c91d..7e92042d2f96 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -16,6 +16,7 @@
#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCCallingConv.h"
+#include "PPCCCState.h"
#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCSubtarget.h"
@@ -158,7 +159,7 @@ class PPCFastISel final : public FastISel {
unsigned FP64LoadOpc = PPC::LFD);
bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
bool PPCComputeAddress(const Value *Obj, Address &Addr);
- void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+ void PPCSimplifyAddress(Address &Addr, bool &UseOffset,
unsigned &IndexReg);
bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg, bool IsZExt);
@@ -185,7 +186,7 @@ class PPCFastISel final : public FastISel {
unsigned &NumBytes,
bool IsVarArg);
bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
- CCAssignFn *usePPC32CCs(unsigned Flag);
+ LLVM_ATTRIBUTE_UNUSED CCAssignFn *usePPC32CCs(unsigned Flag);
private:
#include "PPCGenFastISel.inc"
@@ -196,7 +197,7 @@ class PPCFastISel final : public FastISel {
#include "PPCGenCallingConv.inc"
-// Function whose sole purpose is to kill compiler warnings
+// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
if (Flag == 1)
@@ -213,13 +214,29 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
switch (Pred) {
// These are not representable with any single compare.
case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_TRUE:
+ // The major concern about the following 6 cases is the NaN result. The
+ // comparison result consists of 4 bits, indicating lt, eq, gt and un
+ // (unordered), only one of which will be set. The result is generated by the
+ // fcmpu instruction. However, the bc instruction only inspects one of the
+ // first 3 bits, so when un is set, bc may jump to an undesired place.
+ //
+ // More specifically, if we expect an unordered comparison and un is set, we
+ // expect to always take the true branch; in that case UEQ, UGT and ULT still
+ // give false, which is undesired; but UNE, UGE and ULE happen to give true,
+ // since they are tested by inspecting !eq, !lt and !gt, respectively.
+ //
+ // Similarly, for an ordered comparison, when un is set, we always expect the
+ // result to be false. In that case OGT, OLT and OEQ are fine, since they
+ // actually test gt, lt and eq respectively, which are all false. OGE, OLE
+ // and ONE, however, are tested through !lt, !gt and !eq, and these come out
+ // true, which is undesired.
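+ //
+ // For illustration: with a NaN operand, FCMP_ULE (unordered or <=) must be
+ // true; it maps to PRED_LE below and is tested as !gt, and since only the un
+ // bit is set, !gt holds and the expected true result is produced.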
case CmpInst::FCMP_UEQ:
case CmpInst::FCMP_UGT:
- case CmpInst::FCMP_UGE:
case CmpInst::FCMP_ULT:
- case CmpInst::FCMP_ULE:
- case CmpInst::FCMP_UNE:
- case CmpInst::FCMP_TRUE:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ONE:
default:
return Optional<PPC::Predicate>();
@@ -232,7 +249,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
case CmpInst::ICMP_SGT:
return PPC::PRED_GT;
- case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
case CmpInst::ICMP_UGE:
case CmpInst::ICMP_SGE:
return PPC::PRED_GE;
@@ -242,12 +259,12 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
case CmpInst::ICMP_SLT:
return PPC::PRED_LT;
- case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
case CmpInst::ICMP_ULE:
case CmpInst::ICMP_SLE:
return PPC::PRED_LE;
- case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
case CmpInst::ICMP_NE:
return PPC::PRED_NE;
@@ -412,7 +429,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
// Fix up some addresses that can't be used directly. For example, if
// an offset won't fit in an instruction field, we may need to move it
// into an index register.
-void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset,
unsigned &IndexReg) {
// Check whether the offset fits in the instruction field.
@@ -431,8 +448,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
}
if (!UseOffset) {
- IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
- : Type::getInt64Ty(*Context));
+ IntegerType *OffsetTy = Type::getInt64Ty(*Context);
const ConstantInt *Offset =
ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
IndexReg = PPCMaterializeInt(Offset, MVT::i64);
@@ -501,7 +517,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
// If necessary, materialize the offset into a register and use
// the indexed form. Also handle stack pointers with special needs.
unsigned IndexReg = 0;
- PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+ PPCSimplifyAddress(Addr, UseOffset, IndexReg);
// If this is a potential VSX load with an offset of 0, a VSX indexed load can
// be used.
@@ -637,7 +653,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
// If necessary, materialize the offset into a register and use
// the indexed form. Also handle stack pointers with special needs.
unsigned IndexReg = 0;
- PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+ PPCSimplifyAddress(Addr, UseOffset, IndexReg);
// If this is a potential VSX store with an offset of 0, a VSX indexed store
// can be used.
@@ -1068,10 +1084,10 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
return 0;
- // Reload it into a GPR. If we want an i32, modify the address
- // to have a 4-byte offset so we load from the right place.
+ // Reload it into a GPR. If we want an i32 on big endian, modify the
+ // address to have a 4-byte offset so we load from the right place.
if (VT == MVT::i32)
- Addr.Offset = 4;
+ Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
// Look at the currently assigned register for this instruction
// to determine the required register class.
@@ -1115,14 +1131,13 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
return false;
// Convert f32 to f64 if necessary. This is just a meaningless copy
- // to get the register class right. COPY_TO_REGCLASS is needed since
- // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+ // to get the register class right.
const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
if (InRC == &PPC::F4RCRegClass) {
unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
- .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+ TII.get(TargetOpcode::COPY), TmpReg)
+ .addReg(SrcReg);
SrcReg = TmpReg;
}
@@ -1583,6 +1598,9 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
@@ -2071,7 +2089,6 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
return TmpReg3;
}
-
// Materialize an integer constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
@@ -2085,12 +2102,12 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
return ImmReg;
}
- if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
- VT != MVT::i8 && VT != MVT::i1)
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 &&
+ VT != MVT::i1)
return 0;
- const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
- &PPC::GPRCRegClass);
+ const TargetRegisterClass *RC =
+ ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass);
int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue();
// If the constant is in range, use a load-immediate.
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 3fd509ae27f4..c480430dd29a 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -76,9 +76,7 @@ static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
// SVR4 ABI: First slot in the general register save area.
return STI.isPPC64()
? -16U
- : (STI.getTargetMachine().getRelocationModel() == Reloc::PIC_)
- ? -12U
- : -8U;
+ : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U;
}
PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
@@ -596,7 +594,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
(!UseAtEnd && (&MBB->getParent()->front() == MBB)))
return true;
- RS.enterBasicBlock(MBB);
+ RS.enterBasicBlock(*MBB);
if (UseAtEnd && !MBB->empty()) {
// The scratch register will be used at the end of the block, so must
@@ -653,7 +651,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
// Now that we've done our best to provide both registers, double check
// whether we were unable to provide enough.
- if (BV.count() < (TwoUniqueRegsRequired ? 2 : 1))
+ if (BV.count() < (TwoUniqueRegsRequired ? 2U : 1U))
return false;
return true;
@@ -838,13 +836,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// If we need to spill the CR and the LR but we don't have two separate
// registers available, we must spill them one at a time
if (MustSaveCR && SingleScratchReg && MustSaveLR) {
- // FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
- // If only one or two CR fields are clobbered, it could be more
- // efficient to use mfocrf to selectively save just those fields.
+ // In the ELFv2 ABI, we are not required to save all CR fields.
+ // If only one or two CR fields are clobbered, it is more efficient to use
+ // mfocrf to selectively save just those fields, because mfocrf has shorter
+ // latency compared to mfcr.
+ unsigned MfcrOpcode = PPC::MFCR8;
+ unsigned CrState = RegState::ImplicitKill;
+ if (isELFv2ABI && MustSaveCRs.size() == 1) {
+ MfcrOpcode = PPC::MFOCRF8;
+ CrState = RegState::Kill;
+ }
MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+ BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+ MIB.addReg(MustSaveCRs[i], CrState);
BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
.addReg(TempReg, getKillRegState(true))
.addImm(8)
@@ -856,13 +861,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
if (MustSaveCR &&
!(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
- // FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
- // If only one or two CR fields are clobbered, it could be more
- // efficient to use mfocrf to selectively save just those fields.
+ // In the ELFv2 ABI, we are not required to save all CR fields.
+ // If only one or two CR fields are clobbered, it is more efficient to use
+ // mfocrf to selectively save just those fields, because mfocrf has shorter
+ // latency compared to mfcr.
+ unsigned MfcrOpcode = PPC::MFCR8;
+ unsigned CrState = RegState::ImplicitKill;
+ if (isELFv2ABI && MustSaveCRs.size() == 1) {
+ MfcrOpcode = PPC::MFOCRF8;
+ CrState = RegState::Kill;
+ }
MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+ BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+ MIB.addReg(MustSaveCRs[i], CrState);
}
if (HasFP)
@@ -889,7 +901,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
if (MustSaveLR)
// FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
BuildMI(MBB, MBBI, dl, StoreInst)
- .addReg(ScratchReg)
+ .addReg(ScratchReg, getKillRegState(true))
.addImm(LROffset)
.addReg(SPReg);
@@ -1315,36 +1327,53 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FPReg)
.addReg(ScratchReg);
}
- } else if (RetOpcode == PPC::TCRETURNdi) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
- } else if (RetOpcode == PPC::TCRETURNri) {
- MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
- } else if (RetOpcode == PPC::TCRETURNai) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
- } else if (RetOpcode == PPC::TCRETURNdi8) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
- } else if (RetOpcode == PPC::TCRETURNri8) {
- MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
- } else if (RetOpcode == PPC::TCRETURNai8) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+ } else {
+ createTailCallBranchInstr(MBB);
}
}
}
+void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl;
+
+ if (MBBI != MBB.end())
+ dl = MBBI->getDebugLoc();
+
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+
+ // Create branch instruction for pseudo tail call return instruction
+ unsigned RetOpcode = MBBI->getOpcode();
+ if (RetOpcode == PPC::TCRETURNdi) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri) {
+ MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+ } else if (RetOpcode == PPC::TCRETURNai) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+ } else if (RetOpcode == PPC::TCRETURNdi8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+ } else if (RetOpcode == PPC::TCRETURNai8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+ }
+}
+
void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -1421,6 +1450,18 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
MachineFrameInfo *FFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();
+ // If the function is shrink-wrapped and has a tail call, the tail call might
+ // not be in the new RestoreBlock that shrink-wrapping has chosen, so the real
+ // branch instruction will not be generated by emitEpilogue(). We handle that
+ // case here.
+ if (FFI->getSavePoint() && FFI->hasTailCall()) {
+ MachineBasicBlock *RestoreBlock = FFI->getRestorePoint();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isReturnBlock() && (&MBB) != RestoreBlock)
+ createTailCallBranchInstr(MBB);
+ }
+ }
+
// Early exit if no callee saved registers are modified!
if (CSI.empty() && !needsFP(MF)) {
addScavengingSpillSlot(MF, RS);
@@ -1770,7 +1811,7 @@ restoreCRs(bool isPPC64, bool is31,
.addReg(MoveReg, getKillRegState(true)));
}
-void PPCFrameLowering::
+MachineBasicBlock::iterator PPCFrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
@@ -1787,7 +1828,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
MachineInstr *MI = I;
- DebugLoc dl = MI->getDebugLoc();
+ const DebugLoc &dl = MI->getDebugLoc();
if (isInt<16>(CalleeAmt)) {
BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg)
@@ -1807,7 +1848,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
}
}
// Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
- MBB.erase(I);
+ return MBB.erase(I);
}
bool
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index f1f3f0b831a7..28b0c57f0ffb 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -66,6 +66,13 @@ class PPCFrameLowering: public TargetFrameLowering {
unsigned *SR2 = nullptr) const;
bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
+ /**
+ * \brief Create branch instruction for PPC::TCRETURN* (tail call return)
+ *
+ * \param[in] MBB that is terminated by PPC::TCRETURN*
+ */
+ void createTailCallBranchInstr(MachineBasicBlock &MBB) const;
+
public:
PPCFrameLowering(const PPCSubtarget &STI);
@@ -93,9 +100,9 @@ public:
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 7234e30fa73e..caab67d68b17 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -162,8 +162,9 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
unsigned Directive =
DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
// If we're using a special group-terminating nop, then we need only one.
+ // FIXME: Treat P9 like previous generations until POWER9 scheduling is ready
if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
- Directive == PPC::DIR_PWR8 )
+ Directive == PPC::DIR_PWR8 || Directive == PPC::DIR_PWR9)
return 1;
return 5 - CurSlots;
@@ -223,8 +224,10 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
// If the group has now filled all of its slots, or if we're using a special
// group-terminating nop, the group is complete.
+ // FIXME: Treat P9 the same as previous generations until POWER9 scheduling is ready.
if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
- Directive == PPC::DIR_PWR8 || CurSlots == 6) {
+ Directive == PPC::DIR_PWR8 || Directive == PPC::DIR_PWR9 ||
+ CurSlots == 6) {
CurGroup.clear();
CurSlots = CurBranches = 0;
} else {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1eaa8118ba0a..0e9b2daa0cb5 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -59,10 +59,6 @@ static cl::opt<bool> EnableBranchHint(
cl::desc("Enable static hinting of branches on ppc"),
cl::Hidden);
-namespace llvm {
- void initializePPCDAGToDAGISelPass(PassRegistry&);
-}
-
namespace {
//===--------------------------------------------------------------------===//
/// PPCDAGToDAGISel - PPC specific code to select PPC machine
@@ -75,9 +71,7 @@ namespace {
unsigned GlobalBaseReg;
public:
explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
- : SelectionDAGISel(tm), TM(tm) {
- initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
- }
+ : SelectionDAGISel(tm), TM(tm) {}
bool runOnMachineFunction(MachineFunction &MF) override {
// Make sure we re-emit a set of the global base reg if necessary
@@ -97,18 +91,18 @@ namespace {
/// getI32Imm - Return a target constant with the specified value, of type
/// i32.
- inline SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
}
/// getI64Imm - Return a target constant with the specified value, of type
/// i64.
- inline SDValue getI64Imm(uint64_t Imm, SDLoc dl) {
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i64);
}
/// getSmallIPtrImm - Return a target constant of pointer type.
- inline SDValue getSmallIPtrImm(unsigned Imm, SDLoc dl) {
+ inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(
Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout()));
}
@@ -122,18 +116,19 @@ namespace {
/// base register. Return the virtual register that holds this value.
SDNode *getGlobalBaseReg();
- SDNode *getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
+ void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
- SDNode *SelectBitfieldInsert(SDNode *N);
- SDNode *SelectBitPermutation(SDNode *N);
+ bool tryBitfieldInsert(SDNode *N);
+ bool tryBitPermutation(SDNode *N);
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
- SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
+ SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ const SDLoc &dl);
/// SelectAddrImm - Returns true if the address N can be represented by
/// a base register plus a signed 16-bit displacement [r+imm].
@@ -228,7 +223,7 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
- SDNode *SelectSETCC(SDNode *N);
+ bool trySETCC(SDNode *N);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -240,7 +235,7 @@ private:
bool AllUsersSelectZero(SDNode *N);
void SwapAllSelectUsers(SDNode *N);
- SDNode *transferMemOperands(SDNode *N, SDNode *Result);
+ void transferMemOperands(SDNode *N, SDNode *Result);
};
}
@@ -324,7 +319,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
if (PPCSubTarget->isTargetELF()) {
GlobalBaseReg = PPC::R30;
- if (M->getPICLevel() == PICLevel::Small) {
+ if (M->getPICLevel() == PICLevel::SmallPIC) {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
@@ -458,16 +453,17 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
&& isInt32Immediate(N->getOperand(1).getNode(), Imm);
}
-SDNode *PPCDAGToDAGISel::getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
+void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
SDLoc dl(SN);
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
if (SN->hasOneUse())
- return CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
- getSmallIPtrImm(Offset, dl));
- return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
- getSmallIPtrImm(Offset, dl));
+ CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
+ getSmallIPtrImm(Offset, dl));
+ else
+ ReplaceNode(SN, CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+ getSmallIPtrImm(Offset, dl)));
}
bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
@@ -512,9 +508,9 @@ bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
return false;
}
-/// SelectBitfieldInsert - turn an or of two masked values into
-/// the rotate left word immediate then mask insert (rlwimi) instruction.
-SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+/// Turn an or of two masked values into the rotate left word immediate then
+/// mask insert (rlwimi) instruction.
+bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc dl(N);
@@ -584,15 +580,16 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
SH &= 31;
SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+ return true;
}
}
- return nullptr;
+ return false;
}
// Predict the number of instructions that would be generated by calling
-// SelectInt64(N).
-static unsigned SelectInt64CountDirect(int64_t Imm) {
+// getInt64(N).
+static unsigned getInt64CountDirect(int64_t Imm) {
// Assume no remaining bits.
unsigned Remainder = 0;
// Assume no shift required.
@@ -653,17 +650,17 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) {
return (Imm << R) | (Imm >> (64 - R));
}
-static unsigned SelectInt64Count(int64_t Imm) {
- unsigned Count = SelectInt64CountDirect(Imm);
+static unsigned getInt64Count(int64_t Imm) {
+ unsigned Count = getInt64CountDirect(Imm);
if (Count == 1)
return Count;
for (unsigned r = 1; r < 63; ++r) {
uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = SelectInt64CountDirect(RImm) + 1;
+ unsigned RCount = getInt64CountDirect(RImm) + 1;
Count = std::min(Count, RCount);
- // See comments in SelectInt64 for an explanation of the logic below.
+ // See comments in getInt64 for an explanation of the logic below.
unsigned LS = findLastSet(RImm);
if (LS != r-1)
continue;
@@ -671,16 +668,17 @@ static unsigned SelectInt64Count(int64_t Imm) {
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
uint64_t RImmWithOnes = RImm | OnesMask;
- RCount = SelectInt64CountDirect(RImmWithOnes) + 1;
+ RCount = getInt64CountDirect(RImmWithOnes) + 1;
Count = std::min(Count, RCount);
}
return Count;
}
-// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count
+// Select a 64-bit constant. For cost-modeling purposes, getInt64Count
// (above) needs to be kept in sync with this function.
-static SDNode *SelectInt64Direct(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
+static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
+ int64_t Imm) {
// Assume no remaining bits.
unsigned Remainder = 0;
// Assume no shift required.
@@ -754,10 +752,10 @@ static SDNode *SelectInt64Direct(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
return Result;
}
-static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
- unsigned Count = SelectInt64CountDirect(Imm);
+static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
+ unsigned Count = getInt64CountDirect(Imm);
if (Count == 1)
- return SelectInt64Direct(CurDAG, dl, Imm);
+ return getInt64Direct(CurDAG, dl, Imm);
unsigned RMin = 0;
@@ -766,7 +764,7 @@ static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
for (unsigned r = 1; r < 63; ++r) {
uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = SelectInt64CountDirect(RImm) + 1;
+ unsigned RCount = getInt64CountDirect(RImm) + 1;
if (RCount < Count) {
Count = RCount;
RMin = r;
@@ -789,7 +787,7 @@ static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
uint64_t RImmWithOnes = RImm | OnesMask;
- RCount = SelectInt64CountDirect(RImmWithOnes) + 1;
+ RCount = getInt64CountDirect(RImmWithOnes) + 1;
if (RCount < Count) {
Count = RCount;
RMin = r;
@@ -799,24 +797,24 @@ static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
}
if (!RMin)
- return SelectInt64Direct(CurDAG, dl, Imm);
+ return getInt64Direct(CurDAG, dl, Imm);
auto getI32Imm = [CurDAG, dl](unsigned Imm) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
};
- SDValue Val = SDValue(SelectInt64Direct(CurDAG, dl, MatImm), 0);
+ SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0);
return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
getI32Imm(64 - RMin), getI32Imm(MaskEnd));
}
// Select a 64-bit constant.
-static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) {
+static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
SDLoc dl(N);
// Get 64 bit value.
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
- return SelectInt64(CurDAG, dl, Imm);
+ return getInt64(CurDAG, dl, Imm);
}
namespace {
@@ -1209,7 +1207,7 @@ class BitPermutationSelector {
"bit group ends at index 63 but there is another?");
auto IN = BitGroups.begin();
- if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
+ if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
(I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
IsAllLow32(*I)) {
@@ -1252,7 +1250,7 @@ class BitPermutationSelector {
}
}
- SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+ SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
}
@@ -1270,7 +1268,7 @@ class BitPermutationSelector {
// Depending on the number of groups for a particular value, it might be
// better to rotate, mask explicitly (using andi/andis), and then or the
// result. Select this part of the result first.
- void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+ void SelectAndParts32(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
if (BPermRewriterNoMasking)
return;
@@ -1466,8 +1464,8 @@ class BitPermutationSelector {
// For 64-bit values, not all combinations of rotates and masks are
// available. Produce one if it is available.
- SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32,
- unsigned MaskStart, unsigned MaskEnd,
+ SDValue SelectRotMask64(SDValue V, const SDLoc &dl, unsigned RLAmt,
+ bool Repl32, unsigned MaskStart, unsigned MaskEnd,
unsigned *InstCnt = nullptr) {
// In the notation used by the instructions, 'start' and 'end' are reversed
// because bits are counted from high to low order.
@@ -1527,8 +1525,8 @@ class BitPermutationSelector {
// For 64-bit values, not all combinations of rotates and masks are
// available. Produce a rotate-mask-and-insert if one is available.
- SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt,
- bool Repl32, unsigned MaskStart,
+ SDValue SelectRotMaskIns64(SDValue Base, SDValue V, const SDLoc &dl,
+ unsigned RLAmt, bool Repl32, unsigned MaskStart,
unsigned MaskEnd, unsigned *InstCnt = nullptr) {
// In the notation used by the instructions, 'start' and 'end' are reversed
// because bits are counted from high to low order.
@@ -1574,7 +1572,7 @@ class BitPermutationSelector {
return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd);
}
- void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+ void SelectAndParts64(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
if (BPermRewriterNoMasking)
return;
@@ -1646,7 +1644,7 @@ class BitPermutationSelector {
NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0);
else
- NumAndInsts += SelectInt64Count(Mask) + /* and */ 1;
+ NumAndInsts += getInt64Count(Mask) + /* and */ 1;
unsigned NumRLInsts = 0;
bool FirstBG = true;
@@ -1709,7 +1707,7 @@ class BitPermutationSelector {
TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
ANDIVal, ANDISVal), 0);
} else {
- TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+ TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
TotalVal =
SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
VRot, TotalVal), 0);
@@ -1852,9 +1850,9 @@ class BitPermutationSelector {
Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
ANDIVal, ANDISVal), 0);
} else {
- if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1;
+ if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1;
- SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+ SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
Res =
SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
Res, MaskVal), 0);
@@ -1955,13 +1953,13 @@ public:
};
} // anonymous namespace
-SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) {
+bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
- return nullptr;
+ return false;
if (!UseBitPermRewriter)
- return nullptr;
+ return false;
switch (N->getOpcode()) {
default: break;
@@ -1971,17 +1969,21 @@ SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) {
case ISD::AND:
case ISD::OR: {
BitPermutationSelector BPS(CurDAG);
- return BPS.Select(N);
+ if (SDNode *New = BPS.Select(N)) {
+ ReplaceNode(N, New);
+ return true;
+ }
+ return false;
}
}
- return nullptr;
+ return false;
}
/// SelectCC - Select a comparison of the specified values with the specified
/// condition code, returning the CR# of the expression.
-SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
- ISD::CondCode CC, SDLoc dl) {
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ const SDLoc &dl) {
// Always select the LHS.
unsigned Opc;
@@ -2255,7 +2257,7 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
}
}
-SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
+bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
SDLoc dl(N);
unsigned Imm;
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -2276,20 +2278,22 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl),
getI32Imm(31, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
}
case ISD::SETNE: {
if (isPPC64) break;
SDValue AD =
SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
Op, getI32Imm(~0U, dl)), 0);
- return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
- AD.getValue(1));
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1));
+ return true;
}
case ISD::SETLT: {
SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
getI32Imm(31, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
}
case ISD::SETGT: {
SDValue T =
@@ -2297,7 +2301,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl),
getI32Imm(31, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
}
}
} else if (Imm == ~0U) { // setcc op, -1
@@ -2308,18 +2313,20 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
if (isPPC64) break;
Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
Op, getI32Imm(1, dl)), 0);
- return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
- SDValue(CurDAG->getMachineNode(PPC::LI, dl,
- MVT::i32,
- getI32Imm(0, dl)),
- 0), Op.getValue(1));
+ CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+ SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+ MVT::i32,
+ getI32Imm(0, dl)),
+ 0), Op.getValue(1));
+ return true;
case ISD::SETNE: {
if (isPPC64) break;
Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
Op, getI32Imm(~0U, dl));
- return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
- Op, SDValue(AD, 1));
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op,
+ SDValue(AD, 1));
+ return true;
}
case ISD::SETLT: {
SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
@@ -2328,14 +2335,15 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
Op), 0);
SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl),
getI32Imm(31, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
}
case ISD::SETGT: {
SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
getI32Imm(31, dl) };
Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
- return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
- getI32Imm(1, dl));
+ CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl));
+ return true;
}
}
}
@@ -2348,7 +2356,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
// vector compare operations return the same type as the operands.
if (LHS.getValueType().isVector()) {
if (PPCSubTarget->hasQPX())
- return nullptr;
+ return false;
EVT VecVT = LHS.getValueType();
bool Swap, Negate;
@@ -2360,16 +2368,17 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
EVT ResVT = VecVT.changeVectorElementTypeToInteger();
if (Negate) {
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
- return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
- PPC::VNOR,
- ResVT, VCmp, VCmp);
+ CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
+ ResVT, VCmp, VCmp);
+ return true;
}
- return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
+ CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
+ return true;
}
if (PPCSubTarget->useCRBits())
- return nullptr;
+ return false;
bool Inv;
unsigned Idx = getCRIdxForSetCC(CC, Inv);
@@ -2388,31 +2397,33 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl),
getI32Imm(31, dl), getI32Imm(31, dl) };
- if (!Inv)
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ if (!Inv) {
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
// Get the specified bit.
SDValue Tmp =
SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
- return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl));
+ CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl));
+ return true;
}
-SDNode *PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- return Result;
}
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
-SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
+void PPCDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
// In case any misguided DAG-level optimizations form an ADD with a
@@ -2423,39 +2434,44 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
llvm_unreachable("Invalid ADD with TargetConstant operand");
// Try matching complex bit permutations before doing anything else.
- if (SDNode *NN = SelectBitPermutation(N))
- return NN;
+ if (tryBitPermutation(N))
+ return;
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
- if (N->getValueType(0) == MVT::i64)
- return SelectInt64(CurDAG, N);
+ if (N->getValueType(0) == MVT::i64) {
+ ReplaceNode(N, getInt64(CurDAG, N));
+ return;
+ }
break;
}
case ISD::SETCC: {
- SDNode *SN = SelectSETCC(N);
- if (SN)
- return SN;
+ if (trySETCC(N))
+ return;
break;
}
case PPCISD::GlobalBaseReg:
- return getGlobalBaseReg();
+ ReplaceNode(N, getGlobalBaseReg());
+ return;
case ISD::FrameIndex:
- return getFrameIndex(N, N);
+ selectFrameIndex(N, N);
+ return;
case PPCISD::MFOCRF: {
SDValue InFlag = N->getOperand(1);
- return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
- N->getOperand(0), InFlag);
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+ N->getOperand(0), InFlag));
+ return;
}
case PPCISD::READ_TIME_BASE: {
- return CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
- MVT::Other, N->getOperand(0));
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
+ MVT::Other, N->getOperand(0)));
+ return;
}
case PPCISD::SRA_ADDZE: {
@@ -2468,16 +2484,18 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDNode *Op =
CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue,
N0, ShiftAmt);
- return CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64,
- SDValue(Op, 0), SDValue(Op, 1));
+ CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, SDValue(Op, 0),
+ SDValue(Op, 1));
+ return;
} else {
assert(N->getValueType(0) == MVT::i32 &&
"Expecting i64 or i32 in PPCISD::SRA_ADDZE");
SDNode *Op =
CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
N0, ShiftAmt);
- return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
- SDValue(Op, 0), SDValue(Op, 1));
+ CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, SDValue(Op, 0),
+ SDValue(Op, 1));
+ return;
}
}
@@ -2524,11 +2542,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Offset, Base, Chain };
- return transferMemOperands(
- N, CurDAG->getMachineNode(
- Opcode, dl, LD->getValueType(0),
- PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other,
- Ops));
+ SDNode *MN = CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
} else {
unsigned Opcode;
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -2563,11 +2582,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Base, Offset, Chain };
- return transferMemOperands(
- N, CurDAG->getMachineNode(
- Opcode, dl, LD->getValueType(0),
- PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other,
- Ops));
+ SDNode *MN = CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
}
}
@@ -2582,7 +2602,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Val = N->getOperand(0).getOperand(0);
SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
}
// If this is just a masked value where the input is not handled above, and
// is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
@@ -2592,7 +2613,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Val = N->getOperand(0);
SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
}
// If this is a 64-bit zero-extension mask, emit rldicl.
if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
@@ -2614,12 +2636,13 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
}
SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+ return;
}
// AND X, 0 -> 0, not "rlwinm 32".
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
ReplaceUses(SDValue(N, 0), N->getOperand(1));
- return nullptr;
+ return;
}
// ISD::OR doesn't get all the bitfield insertion fun.
// (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
@@ -2645,7 +2668,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
N->getOperand(0).getOperand(1),
getI32Imm(0, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+ return;
}
}
@@ -2654,8 +2678,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::OR: {
if (N->getValueType(0) == MVT::i32)
- if (SDNode *I = SelectBitfieldInsert(N))
- return I;
+ if (tryBitfieldInsert(N))
+ return;
short Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
@@ -2665,8 +2689,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
// If this is equivalent to an add, then we can fold it with the
// FrameIndex calculation.
- if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL)
- return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
+ selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ return;
+ }
}
// Other cases are autogenerated.
@@ -2675,8 +2701,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
case ISD::ADD: {
short Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
- isIntS16Immediate(N->getOperand(1), Imm))
- return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ isIntS16Immediate(N->getOperand(1), Imm)) {
+ selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+ return;
+ }
break;
}
@@ -2687,7 +2715,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { N->getOperand(0).getOperand(0),
getI32Imm(SH, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
}
// Other cases are autogenerated.
@@ -2700,7 +2729,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { N->getOperand(0).getOperand(0),
getI32Imm(SH, dl), getI32Imm(MB, dl),
getI32Imm(ME, dl) };
- return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return;
}
// Other cases are autogenerated.
@@ -2726,9 +2756,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
PPC::sub_eq : PPC::sub_gt, dl, MVT::i32);
- return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1,
- CR0Reg, SRIdxVal,
- SDValue(AndI.getNode(), 1) /* glue */);
+ CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg,
+ SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */);
+ return;
}
case ISD::SELECT_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
@@ -2753,9 +2783,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDNode *Tmp =
CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
N->getOperand(0), getI32Imm(~0U, dl));
- return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
- SDValue(Tmp, 0), N->getOperand(0),
- SDValue(Tmp, 1));
+ CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(Tmp, 0),
+ N->getOperand(0), SDValue(Tmp, 1));
+ return;
}
SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
@@ -2786,7 +2816,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
NotC, N->getOperand(3)), 0);
- return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+ CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+ return;
}
unsigned BROpc = getPredicateForSetCC(CC);
@@ -2820,12 +2851,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
getI32Imm(BROpc, dl) };
- return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
+ CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
+ return;
}
case ISD::VSELECT:
if (PPCSubTarget->hasVSX()) {
SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
- return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
+ CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
+ return;
}
break;
@@ -2856,8 +2889,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
SDValue Chain = LD->getChain();
SDValue Ops[] = { Base, Offset, Chain };
- return CurDAG->SelectNodeTo(N, PPC::LXVDSX,
- N->getValueType(0), Ops);
+ CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
+ return;
}
}
@@ -2873,7 +2906,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl,
MVT::i32);
SDValue Ops[] = { Op1, Op2, DMV };
- return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
+ CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
+ return;
}
break;
@@ -2881,10 +2915,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
case PPCISD::BDZ: {
bool IsPPC64 = PPCSubTarget->isPPC64();
SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
- return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
- (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
- (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
- MVT::Other, Ops);
+ CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ
+ ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+ : (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+ MVT::Other, Ops);
+ return;
}
case PPCISD::COND_BRANCH: {
// Op #0 is the Chain.
@@ -2900,7 +2935,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Pred = getI32Imm(PCC, dl);
SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
N->getOperand(0), N->getOperand(4) };
- return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ return;
}
case ISD::BR_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
@@ -2922,8 +2958,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
N->getOperand(Swap ? 3 : 2),
N->getOperand(Swap ? 2 : 3)), 0);
- return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other,
- BitComp, N->getOperand(4), N->getOperand(0));
+ CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, BitComp, N->getOperand(4),
+ N->getOperand(0));
+ return;
}
if (EnableBranchHint)
@@ -2932,7 +2969,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
SDValue Ops[] = { getI32Imm(PCC, dl), CondCode,
N->getOperand(4), N->getOperand(0) };
- return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ return;
}
case ISD::BRIND: {
// FIXME: Should custom lower this.
@@ -2942,15 +2980,19 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
Chain), 0);
- return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+ CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+ return;
}
case PPCISD::TOC_ENTRY: {
assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
"Only supported for 64-bit ABI and 32-bit SVR4");
if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
SDValue GA = N->getOperand(0);
- return transferMemOperands(N, CurDAG->getMachineNode(PPC::LWZtoc, dl,
- MVT::i32, GA, N->getOperand(1)));
+ SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+ N->getOperand(1));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
}
// For medium and large code model, we generate two instructions as
@@ -2971,29 +3013,38 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
TOCbase, GA);
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
- CModel == CodeModel::Large)
- return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
- MVT::i64, GA, SDValue(Tmp, 0)));
+ CModel == CodeModel::Large) {
+ SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+ SDValue(Tmp, 0));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
+ }
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
const GlobalValue *GV = G->getGlobal();
unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
if (GVFlags & PPCII::MO_NLP_FLAG) {
- return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
- MVT::i64, GA, SDValue(Tmp, 0)));
+ SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+ SDValue(Tmp, 0));
+ transferMemOperands(N, MN);
+ ReplaceNode(N, MN);
+ return;
}
}
- return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
- SDValue(Tmp, 0), GA);
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+ SDValue(Tmp, 0), GA));
+ return;
}
case PPCISD::PPC32_PICGOT: {
// Generate a PIC-safe GOT reference.
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
- return CurDAG->SelectNodeTo(
- N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()),
- MVT::i32);
+ CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()),
+ MVT::i32);
+ return;
}
case PPCISD::VADD_SPLAT: {
// This expands into one of three sequences, depending on whether
@@ -3035,7 +3086,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue EltVal = getI32Imm(Elt >> 1, dl);
SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
SDValue TmpVal = SDValue(Tmp, 0);
- return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+ ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
+ return;
} else if (Elt > 0) {
// Elt is odd and positive, in the range [17,31].
@@ -3048,8 +3100,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
EltVal = getI32Imm(-16, dl);
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
- return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
- SDValue(Tmp2, 0));
+ ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+ SDValue(Tmp2, 0)));
+ return;
} else {
// Elt is odd and negative, in the range [-31,-17].
@@ -3062,13 +3115,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
EltVal = getI32Imm(-16, dl);
SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
- return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
- SDValue(Tmp2, 0));
+ ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+ SDValue(Tmp2, 0)));
+ return;
}
}
}
- return SelectCode(N);
+ SelectCode(N);
}
// If the target supports the cmpb instruction, do the idiom recognition here.
@@ -3565,11 +3619,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CRNAND:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3603,11 +3658,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(1).
getOperand(0),
MachineNode->getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CROR:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3635,11 +3691,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CRXOR:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3674,11 +3731,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CRNOR:
if (Op1Set || Op2Set)
@@ -3707,11 +3765,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(1).
getOperand(0),
MachineNode->getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CREQV:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3746,11 +3805,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(0),
- MachineNode->getOperand(1)),
+ MachineNode->getOperand(1));
SelectSwap = true;
+ }
break;
case PPC::CRANDC:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3781,11 +3841,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(1),
- MachineNode->getOperand(0)),
+ MachineNode->getOperand(0));
SelectSwap = true;
+ }
break;
case PPC::CRORC:
if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
@@ -3816,11 +3877,12 @@ void PPCDAGToDAGISel::PeepholeCROps() {
MVT::i1, MachineNode->getOperand(0),
MachineNode->getOperand(1).
getOperand(0));
- else if (AllUsersSelectZero(MachineNode))
+ else if (AllUsersSelectZero(MachineNode)) {
ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
MVT::i1, MachineNode->getOperand(1),
- MachineNode->getOperand(0)),
+ MachineNode->getOperand(0));
SelectSwap = true;
+ }
break;
case PPC::SELECT_I4:
case PPC::SELECT_I8:
@@ -4365,15 +4427,3 @@ void PPCDAGToDAGISel::PeepholePPC64() {
FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
return new PPCDAGToDAGISel(TM);
}
-
-static void initializePassOnce(PassRegistry &Registry) {
- const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection";
- PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID,
- nullptr, false, false);
- Registry.registerPass(*PI, true);
-}
-
-void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializePassOnce);
-}
-
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index af9ad077a7ce..6e3c830a8243 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14,11 +14,13 @@
#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCCallingConv.h"
+#include "PPCCCState.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCTargetMachine.h"
#include "PPCTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -36,12 +38,16 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
+#include <list>
using namespace llvm;
+#define DEBUG_TYPE "ppc-lowering"
+
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -51,6 +57,12 @@ cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hi
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+static cl::opt<bool> DisableSCO("disable-ppc-sco",
+cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumSiblingCalls, "Number of sibling calls");
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -68,7 +80,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
- if (!Subtarget.useSoftFloat()) {
+ if (!useSoftFloat()) {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
@@ -207,14 +219,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PowerPC does not have BSWAP, CTPOP or CTTZ
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
- if (Subtarget.hasPOPCNTD()) {
+ if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {
@@ -255,7 +263,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
- if (Subtarget.hasDirectMove()) {
+ if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
@@ -479,9 +487,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
@@ -557,7 +563,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
}
- if (Subtarget.hasDirectMove()) {
+ if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
@@ -647,6 +653,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
+ setOperationAction(ISD::FABS, MVT::v4f32, Legal);
+ setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
}
@@ -654,6 +665,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
}
+ if (Subtarget.hasP9Vector()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
+ }
}
if (Subtarget.hasQPX()) {
@@ -840,6 +855,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::LOAD);
@@ -906,13 +922,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
+ case PPC::DIR_PWR9:
setPrefFunctionAlignment(4);
setPrefLoopAlignment(4);
break;
}
- setInsertFencesForAtomic(true);
-
if (Subtarget.enableMachineScheduler())
setSchedulingPreference(Sched::Source);
else
@@ -1006,6 +1021,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
+ case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
+ case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
@@ -1030,6 +1048,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::MFVSR: return "PPCISD::MFVSR";
case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
+ case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
+ case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
case PPCISD::VCMP: return "PPCISD::VCMP";
@@ -1069,6 +1089,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
case PPCISD::RFEBB: return "PPCISD::RFEBB";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
+ case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
@@ -1480,6 +1501,91 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
return true;
}
+bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+
+ // Check that the mask is shuffling words
+ for (unsigned i = 0; i < 4; ++i) {
+ unsigned B0 = N->getMaskElt(i*4);
+ unsigned B1 = N->getMaskElt(i*4+1);
+ unsigned B2 = N->getMaskElt(i*4+2);
+ unsigned B3 = N->getMaskElt(i*4+3);
+ if (B0 % 4)
+ return false;
+ if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1)
+ return false;
+ }
+
+ // Now we look at mask elements 0,4,8,12
+ unsigned M0 = N->getMaskElt(0) / 4;
+ unsigned M1 = N->getMaskElt(4) / 4;
+ unsigned M2 = N->getMaskElt(8) / 4;
+ unsigned M3 = N->getMaskElt(12) / 4;
+ unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
+ unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
+
+ // Below, let H and L be arbitrary elements of the shuffle mask
+ // where H is in the range [4,7] and L is in the range [0,3].
+ // H, 1, 2, 3 or L, 5, 6, 7
+ if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
+ (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
+ ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
+ InsertAtByte = IsLE ? 12 : 0;
+ Swap = M0 < 4;
+ return true;
+ }
+ // 0, H, 2, 3 or 4, L, 6, 7
+ if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
+ (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
+ ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
+ InsertAtByte = IsLE ? 8 : 4;
+ Swap = M1 < 4;
+ return true;
+ }
+ // 0, 1, H, 3 or 4, 5, L, 7
+ if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
+ (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
+ ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
+ InsertAtByte = IsLE ? 4 : 8;
+ Swap = M2 < 4;
+ return true;
+ }
+ // 0, 1, 2, H or 4, 5, 6, L
+ if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
+ (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
+ ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
+ InsertAtByte = IsLE ? 0 : 12;
+ Swap = M3 < 4;
+ return true;
+ }
+
+ // If both vector operands for the shuffle are the same vector, the mask will
+ // contain only elements from the first one and the second one will be undef.
+ if (N->getOperand(1).isUndef()) {
+ ShiftElts = 0;
+ Swap = true;
+ unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
+ if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
+ InsertAtByte = IsLE ? 12 : 0;
+ return true;
+ }
+ if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
+ InsertAtByte = IsLE ? 8 : 4;
+ return true;
+ }
+ if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
+ InsertAtByte = IsLE ? 4 : 8;
+ return true;
+ }
+ if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
+ InsertAtByte = IsLE ? 0 : 12;
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
@@ -1511,7 +1617,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
// See if all of the elements in the buildvector agree across.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (N->getOperand(i).isUndef()) continue;
// If the element isn't a constant, bail fully out.
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
@@ -1557,7 +1663,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
// Check to see if this buildvec has a single non-undef value in its elements.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (N->getOperand(i).isUndef()) continue;
if (!OpVal.getNode())
OpVal = N->getOperand(i);
else if (OpVal != N->getOperand(i))
@@ -1950,19 +2056,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// LowerOperation implementation
//===----------------------------------------------------------------------===//
-/// GetLabelAccessInfo - Return true if we should reference labels using a
-/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
-static bool GetLabelAccessInfo(const TargetMachine &TM,
- const PPCSubtarget &Subtarget,
+/// Return true if we should reference labels using a PICBase, set the HiOpFlags
+/// and LoOpFlags to the target MO flags.
+static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
unsigned &HiOpFlags, unsigned &LoOpFlags,
const GlobalValue *GV = nullptr) {
HiOpFlags = PPCII::MO_HA;
LoOpFlags = PPCII::MO_LO;
// Don't use the pic base if not in PIC relocation model.
- bool isPIC = TM.getRelocationModel() == Reloc::PIC_;
-
- if (isPIC) {
+ if (IsPIC) {
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
}
@@ -1978,8 +2081,6 @@ static bool GetLabelAccessInfo(const TargetMachine &TM,
LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
}
}
-
- return isPIC;
}
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
@@ -2010,7 +2111,7 @@ static void setUsesTOCBasePtr(SelectionDAG &DAG) {
setUsesTOCBasePtr(DAG.getMachineFunction());
}
-static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit,
+static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
SDValue GA) {
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
@@ -2038,10 +2139,10 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
}
unsigned MOHiFlag, MOLoFlag;
- bool isPIC =
- GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
- if (isPIC && Subtarget.isSVR4ABI()) {
+ if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), false, GA);
@@ -2051,7 +2152,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
SDValue CPILo =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
- return LowerLabelRef(CPIHi, CPILo, isPIC, DAG);
+ return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
@@ -2067,10 +2168,10 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
}
unsigned MOHiFlag, MOLoFlag;
- bool isPIC =
- GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
- if (isPIC && Subtarget.isSVR4ABI()) {
+ if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(GA), false, GA);
@@ -2078,7 +2179,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
- return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
+ return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
@@ -2096,11 +2197,11 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
}
unsigned MOHiFlag, MOLoFlag;
- bool isPIC =
- GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag);
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
- return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
+ return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
@@ -2160,7 +2261,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
- if (picLevel == PICLevel::Small)
+ if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
@@ -2178,7 +2279,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
GOTReg, TGA);
} else {
- if (picLevel == PICLevel::Small)
+ if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
else
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
@@ -2209,10 +2310,10 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
}
unsigned MOHiFlag, MOLoFlag;
- bool isPIC =
- GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag, GV);
+ bool IsPIC = isPositionIndependent();
+ getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
- if (isPIC && Subtarget.isSVR4ABI()) {
+ if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
GSDN->getOffset(),
PPCII::MO_PIC_FLAG);
@@ -2224,13 +2325,12 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
- SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG);
+ SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
// If the global reference is actually to a non-lazy-pointer, we have to do an
// extra load to get the address of the global.
if (MOHiFlag & PPCII::MO_NLP_FLAG)
- Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
- false, false, false, 0);
+ Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
return Ptr;
}
@@ -2260,7 +2360,7 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
// If we're comparing for equality to zero, expose the fact that this is
- // implented as a ctlz/srl pair on ppc, so that the dag combiner can
+ // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
// fold the new nodes.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (C->isNullValue() && CC == ISD::SETEQ) {
@@ -2298,11 +2398,10 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const {
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
@@ -2312,8 +2411,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
// gpr_index
SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
- VAListPtr, MachinePointerInfo(SV), MVT::i8,
- false, false, false, 0);
+ VAListPtr, MachinePointerInfo(SV), MVT::i8);
InChain = GprIndex.getValue(1);
if (VT == MVT::i64) {
@@ -2335,8 +2433,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
// fpr
SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
- FprPtr, MachinePointerInfo(SV), MVT::i8,
- false, false, false, 0);
+ FprPtr, MachinePointerInfo(SV), MVT::i8);
InChain = FprIndex.getValue(1);
SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
@@ -2346,14 +2443,12 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(4, dl, MVT::i32));
// areas
- SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
- MachinePointerInfo(), false, false,
- false, 0);
+ SDValue OverflowArea =
+ DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
InChain = OverflowArea.getValue(1);
- SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
- MachinePointerInfo(), false, false,
- false, 0);
+ SDValue RegSaveArea =
+ DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
InChain = RegSaveArea.getValue(1);
// select overflow_area if index > 8
@@ -2383,8 +2478,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
VT.isInteger() ? VAListPtr : FprPtr,
- MachinePointerInfo(SV),
- MVT::i8, false, false, 0);
+ MachinePointerInfo(SV), MVT::i8);
// determine if we should load from reg_save_area or overflow_area
SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
@@ -2397,17 +2491,13 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
OverflowAreaPlusN);
- InChain = DAG.getTruncStore(InChain, dl, OverflowArea,
- OverflowAreaPtr,
- MachinePointerInfo(),
- MVT::i32, false, false, 0);
+ InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
+ MachinePointerInfo(), MVT::i32);
- return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}
-SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const {
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
// We have to copy the entire va_list struct:
@@ -2431,7 +2521,7 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
@@ -2454,28 +2544,26 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__trampoline_setup", PtrVT),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
}
-SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const {
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
SDLoc dl(Op);
if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
- MachinePointerInfo(SV),
- false, false, 0);
+ MachinePointerInfo(SV));
}
// For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
@@ -2504,9 +2592,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
-
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
-
SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
PtrVT);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
@@ -2524,35 +2609,29 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Store first byte : number of int regs
- SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR,
- Op.getOperand(1),
- MachinePointerInfo(SV),
- MVT::i8, false, false, 0);
+ SDValue firstStore =
+ DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
+ MachinePointerInfo(SV), MVT::i8);
uint64_t nextOffset = FPROffset;
SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
ConstFPROffset);
// Store second byte : number of float regs
SDValue secondStore =
- DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
- MachinePointerInfo(SV, nextOffset), MVT::i8,
- false, false, 0);
+ DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
+ MachinePointerInfo(SV, nextOffset), MVT::i8);
nextOffset += StackOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
// Store second word : arguments given on stack
- SDValue thirdStore =
- DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
- MachinePointerInfo(SV, nextOffset),
- false, false, 0);
+ SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
+ MachinePointerInfo(SV, nextOffset));
nextOffset += FrameOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
// Store third word : arguments given in registers
return DAG.getStore(thirdStore, dl, FR, nextPtr,
- MachinePointerInfo(SV, nextOffset),
- false, false, 0);
-
+ MachinePointerInfo(SV, nextOffset));
}
#include "PPCGenCallingConv.inc"
@@ -2762,14 +2841,10 @@ static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
return NumBytes;
}
-SDValue
-PPCTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue PPCTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
if (Subtarget.isSVR4ABI()) {
if (Subtarget.isPPC64())
return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
@@ -2783,14 +2858,10 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
}
}
-SDValue
-PPCTargetLowering::LowerFormalArguments_32SVR4(
- SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// 32-bit SVR4 ABI Stack Frame Layout:
// +-----------------------------------+
@@ -2825,7 +2896,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
MachineFrameInfo *MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
@@ -2833,14 +2904,17 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ if (useSoftFloat())
+ CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
+ CCInfo.clearWasPPCF128();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -2908,9 +2982,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
- MachinePointerInfo(),
- false, false, false, 0));
+ InVals.push_back(
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
@@ -2955,7 +3028,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
- if (Subtarget.useSoftFloat())
+ if (useSoftFloat())
NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
@@ -2973,8 +3046,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
- // VarArgsFrameIndex on the stack so that they may be loaded by deferencing
- // the result of va_next.
+ // VarArgsFrameIndex on the stack so that they may be loaded by
+ // dereferencing the result of va_next.
for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
@@ -2982,8 +3055,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
@@ -3001,8 +3074,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
@@ -3019,10 +3092,10 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
-SDValue
-PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
- SelectionDAG &DAG, SDValue ArgVal,
- SDLoc dl) const {
+SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
+ EVT ObjectVT, SelectionDAG &DAG,
+ SDValue ArgVal,
+ const SDLoc &dl) const {
if (Flags.isSExt())
ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
@@ -3033,14 +3106,10 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
-SDValue
-PPCTargetLowering::LowerFormalArguments_64SVR4(
- SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
bool isELFv2ABI = Subtarget.isELFv2ABI();
@@ -3052,7 +3121,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
@@ -3199,15 +3268,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
- MachinePointerInfo(&*FuncArg), ObjType,
- false, false, 0);
+ MachinePointerInfo(&*FuncArg), ObjType);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
// slot.
- Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg), false, false, 0);
+ Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg));
}
MemOps.push_back(Store);
@@ -3234,9 +3301,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Off = DAG.getConstant(j, dl, PtrVT);
Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
}
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, Addr,
- MachinePointerInfo(&*FuncArg, j), false, false, 0);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+ MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
}
@@ -3402,8 +3468,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
CurArgOffset += ArgSize - ObjSize;
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
- false, false, false, 0);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
@@ -3434,14 +3499,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
- // to their spots on the stack so that they may be loaded by deferencing the
- // result of va_next.
+ // to their spots on the stack so that they may be loaded by dereferencing
+ // the result of va_next.
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
@@ -3455,21 +3520,17 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
return Chain;
}
-SDValue
-PPCTargetLowering::LowerFormalArguments_Darwin(
- SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
@@ -3613,9 +3674,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
- SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg),
- ObjType, false, false, 0);
+ SDValue Store =
+ DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg), ObjType);
MemOps.push_back(Store);
++GPR_idx;
}
@@ -3637,9 +3698,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg, j), false, false, 0);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg, j));
MemOps.push_back(Store);
++GPR_idx;
ArgOffset += PtrByteSize;
@@ -3760,8 +3820,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
CurArgOffset + (ArgSize - ObjSize),
isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
- false, false, false, 0);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
}
InVals.push_back(ArgVal);
@@ -3795,8 +3854,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
- // to their spots on the stack so that they may be loaded by deferencing the
- // result of va_next.
+ // to their spots on the stack so that they may be loaded by dereferencing
+ // the result of va_next.
for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
unsigned VReg;
@@ -3806,8 +3865,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
@@ -3838,6 +3897,176 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
return SPDiff;
}
+static bool isFunctionGlobalAddress(SDValue Callee);
+
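+/// Returns true when Callee is known to resolve to a definition within the
+/// module currently being compiled, as required for tail calls on PPC64 SVR4.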
+static bool
+resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+ // If !G, Callee can be an external symbol.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G) return false;
+
+ const GlobalValue *GV = G->getGlobal();
+
+ if (GV->isDeclaration()) return false;
+
+ switch(GV->getLinkage()) {
+ default: llvm_unreachable("unknown linkage type");
+ case GlobalValue::AvailableExternallyLinkage:
+ case GlobalValue::ExternalWeakLinkage:
+ return false;
+
+ // A callee with weak linkage is allowed if it has hidden or protected
+ // visibility.
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation
+ if (GV->hasDefaultVisibility())
+ return false;
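+ // Otherwise fall through: a weak definition with hidden or protected
+ // visibility is resolved within this module.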
+
+ case GlobalValue::ExternalLinkage:
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ break;
+ }
+
+ // With '-fPIC', calling a function with default visibility needs a 'nop'
+ // inserted after the call, regardless of whether that function resides in the
+ // same module or not, so we treat it as being in a different module.
+ if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
+ return false;
+
+ return true;
+}
+
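+/// Returns true if any of the outgoing arguments described by Outs would have
+/// to be passed in the parameter save area on the stack rather than entirely
+/// in registers.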
+static bool
+needStackSlotPassParameters(const PPCSubtarget &Subtarget,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+
+ const unsigned PtrByteSize = 8;
+ const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+ static const MCPhysReg GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned NumGPRs = array_lengthof(GPR);
+ const unsigned NumFPRs = 13;
+ const unsigned NumVRs = array_lengthof(VR);
+ const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+
+ unsigned NumBytes = LinkageSize;
+ unsigned AvailableFPRs = NumFPRs;
+ unsigned AvailableVRs = NumVRs;
+
+ for (const ISD::OutputArg& Param : Outs) {
+ if (Param.Flags.isNest()) continue;
+
+ if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytes, AvailableFPRs, AvailableVRs,
+ Subtarget.hasQPX()))
+ return true;
+ }
+ return false;
+}
+
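+/// Returns true when the call site simply forwards the caller's own formal
+/// arguments (or undefs of matching type) to the callee, in the same order.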
+static bool
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
+ if (CS->arg_size() != CallerFn->getArgumentList().size())
+ return false;
+
+ ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
+ ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+ Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
+
+ for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
+ const Value* CalleeArg = *CalleeArgIter;
+ const Value* CallerArg = &(*CallerArgIter);
+ if (CalleeArg == CallerArg)
+ continue;
+
+ // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
+ // tail call @callee([4 x i64] undef, [4 x i64] %b)
+ // }
+ // The callee's first argument is undef and has the same type as the caller's.
+ if (CalleeArg->getType() == CallerArg->getType() &&
+ isa<UndefValue>(CalleeArg))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
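+/// Check whether a call under the 64-bit SVR4 ABI can be lowered as a tail
+/// call or a sibling call, subject to the constraints checked below.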
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee,
+ CallingConv::ID CalleeCC,
+ ImmutableCallSite *CS,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const {
+ bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+
+ if (DisableSCO && !TailCallOpt) return false;
+
+ // Variadic argument functions are not supported.
+ if (isVarArg) return false;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+
+ // Tail or sibling call optimization (TCO/SCO) requires the callee and the
+ // caller to use the same calling convention.
+ if (CallerCC != CalleeCC) return false;
+
+ // Only the C and Fast calling conventions are supported.
+ if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+ return false;
+
+ // Functions containing byval parameters are not supported.
+ if (std::any_of(Ins.begin(), Ins.end(),
+ [](const ISD::InputArg& IA) { return IA.Flags.isByVal(); }))
+ return false;
+
+ // No TCO/SCO on indirect calls because the caller has to restore its TOC.
+ if (!isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee))
+ return false;
+
+ // Check if Callee resides in the same module, because for now the PPC64 SVR4
+ // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol that resides in
+ // another module.
+ // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+ if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+ return false;
+
+ // TCO allows altering the callee's ABI, so we don't have to check further.
+ if (CalleeCC == CallingConv::Fast && TailCallOpt)
+ return true;
+
+ if (DisableSCO) return false;
+
+ // If the callee uses the same argument list as the caller, we can apply SCO
+ // in this case. Otherwise we need to check whether the callee needs stack
+ // slots for passing arguments.
+ if (!hasSameArgumentList(MF.getFunction(), CS) &&
+ needStackSlotPassParameters(Subtarget, Outs)) {
+ return false;
+ }
+
+ return true;
+}
+
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
@@ -3888,9 +4117,11 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
SignExtend32<26>(Addr) != Addr)
return nullptr; // Top 6 bits have to be sext of immediate.
- return DAG.getConstant((int)C->getZExtValue() >> 2, SDLoc(Op),
- DAG.getTargetLoweringInfo().getPointerTy(
- DAG.getDataLayout())).getNode();
+ return DAG
+ .getConstant(
+ (int)C->getZExtValue() >> 2, SDLoc(Op),
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
+ .getNode();
}
namespace {
@@ -3905,12 +4136,10 @@ struct TailCallArgumentInfo {
}
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
-static void
-StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
- SDValue Chain,
- const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
- SmallVectorImpl<SDValue> &MemOpChains,
- SDLoc dl) {
+static void StoreTailCallArgumentsToStackSlot(
+ SelectionDAG &DAG, SDValue Chain,
+ const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+ SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
SDValue Arg = TailCallArgs[i].Arg;
SDValue FIN = TailCallArgs[i].FrameIdxOp;
@@ -3918,48 +4147,40 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
// Store relative to framepointer.
MemOpChains.push_back(DAG.getStore(
Chain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, 0));
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
-static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
- MachineFunction &MF,
- SDValue Chain,
- SDValue OldRetAddr,
- SDValue OldFP,
- int SPDiff,
- bool isPPC64,
- bool isDarwinABI,
- SDLoc dl) {
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
+ SDValue OldRetAddr, SDValue OldFP,
+ int SPDiff, const SDLoc &dl) {
if (SPDiff) {
// Calculate the new stack slot for the return address.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const PPCFrameLowering *FL = Subtarget.getFrameLowering();
+ bool isPPC64 = Subtarget.isPPC64();
int SlotSize = isPPC64 ? 8 : 4;
- const PPCFrameLowering *FL =
- MF.getSubtarget<PPCSubtarget>().getFrameLowering();
int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
NewRetAddrLoc, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
- Chain = DAG.getStore(
- Chain, dl, OldRetAddr, NewRetAddrFrIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr),
- false, false, 0);
+ Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+ MachinePointerInfo::getFixedStack(MF, NewRetAddr));
// When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
// slot as the FP is never overwritten.
- if (isDarwinABI) {
+ if (Subtarget.isDarwinABI()) {
int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
true);
SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
- Chain = DAG.getStore(
- Chain, dl, OldFP, NewFramePtrIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx),
- false, false, 0);
+ Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), NewFPIdx));
}
}
return Chain;
@@ -3986,27 +4207,21 @@ CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
/// stack slot. Returns the chain as result and the loaded frame pointers in
/// LROpOut/FPOpout. Used when tail calling.
-SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
- int SPDiff,
- SDValue Chain,
- SDValue &LROpOut,
- SDValue &FPOpOut,
- bool isDarwinABI,
- SDLoc dl) const {
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
+ SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
+ SDValue &FPOpOut, const SDLoc &dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
- LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
- false, false, false, 0);
+ LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
// When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
// slot as the FP is never overwritten.
- if (isDarwinABI) {
+ if (Subtarget.isDarwinABI()) {
FPOpOut = getFramePointerFrameIndex(DAG);
- FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(),
- false, false, false, 0);
+ FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
Chain = SDValue(FPOpOut.getNode(), 1);
}
}
@@ -4019,10 +4234,9 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
-static SDValue
-CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
- ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
false, false, false, MachinePointerInfo(),
@@ -4031,13 +4245,11 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
-static void
-LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
- SDValue Arg, SDValue PtrOff, int SPDiff,
- unsigned ArgOffset, bool isPPC64, bool isTailCall,
- bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
- SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments,
- SDLoc dl) {
+static void LowerMemOpCallTo(
+ SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
+ SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
+ bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
+ SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
if (!isTailCall) {
if (isVector) {
@@ -4049,20 +4261,18 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
}
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0));
- // Calculate and remember argument location.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+ // Calculate and remember argument location.
} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
TailCallArguments);
}
-static
-void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
- SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
- SDValue LROp, SDValue FPOp, bool isDarwinABI,
- SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
- MachineFunction &MF = DAG.getMachineFunction();
-
+static void
+PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+ const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
+ SDValue FPOp,
+ SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
// Emit a sequence of copyto/copyfrom virtual registers for arguments that
// might overwrite each other in case of tail call optimization.
SmallVector<SDValue, 8> MemOpChains2;
@@ -4074,8 +4284,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
- Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
- isPPC64, isDarwinABI, dl);
+ Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
// Emit callseq_end just before tailcall node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
@@ -4091,19 +4300,19 @@ static bool isFunctionGlobalAddress(SDValue Callee) {
Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
- return G->getGlobal()->getType()->getElementType()->isFunctionTy();
+ return G->getGlobal()->getValueType()->isFunctionTy();
}
return false;
}
-static
-unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
- SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff,
- bool isTailCall, bool IsPatchPoint, bool hasNest,
- SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
- SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
- ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
+static unsigned
+PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
+ SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
+ bool isPatchPoint, bool hasNest,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
+ ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
@@ -4123,23 +4332,24 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
needIndirectCall = false;
}
+ // PC-relative references to external symbols should go through $stub, unless
+ // we're building with the leopard linker or later, which automatically
+ // synthesizes these stubs.
+ const TargetMachine &TM = DAG.getTarget();
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const GlobalValue *GV = nullptr;
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ GV = G->getGlobal();
+ bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+ bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;
+
if (isFunctionGlobalAddress(Callee)) {
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
// A call to a TLS address is actually an indirect call to a
// thread-specific pointer.
unsigned OpFlags = 0;
- if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (Subtarget.getTargetTriple().isMacOSX() &&
- Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
- !G->getGlobal()->isStrongDefinitionForLinker()) ||
- (Subtarget.isTargetELF() && !isPPC64 &&
- !G->getGlobal()->hasLocalLinkage() &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = PPCII::MO_PLT_OR_STUB;
- }
+ if (UsePlt)
+ OpFlags = PPCII::MO_PLT;
// If the callee is a GlobalAddress/ExternalSymbol node (quite common,
// every direct call is) turn it into a TargetGlobalAddress /
@@ -4152,23 +4362,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
- if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (Subtarget.getTargetTriple().isMacOSX() &&
- Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
- (Subtarget.isTargetELF() && !isPPC64 &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = PPCII::MO_PLT_OR_STUB;
- }
+ if (UsePlt)
+ OpFlags = PPCII::MO_PLT;
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
OpFlags);
needIndirectCall = false;
}
- if (IsPatchPoint) {
+ if (isPatchPoint) {
// We'll form an invalid direct call when lowering a patchpoint; the full
// sequence for an indirect call is complicated, and many of the
// instructions introduced might have side effects (and, thus, can't be
@@ -4217,24 +4419,26 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
if (LDChain.getValueType() == MVT::Glue)
LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);
- bool LoadsInv = Subtarget.hasInvariantFunctionDescriptors();
+ auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
+ ? MachineMemOperand::MOInvariant
+ : MachineMemOperand::MONone;
MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
- false, false, LoadsInv, 8);
+ /* Alignment = */ 8, MMOFlags);
// Load environment pointer into r11.
SDValue PtrOff = DAG.getIntPtrConstant(16, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
- SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr,
- MPI.getWithOffset(16), false, false,
- LoadsInv, 8);
+ SDValue LoadEnvPtr =
+ DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16),
+ /* Alignment = */ 8, MMOFlags);
SDValue TOCOff = DAG.getIntPtrConstant(8, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
- SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC,
- MPI.getWithOffset(8), false, false,
- LoadsInv, 8);
+ SDValue TOCPtr =
+ DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8),
+ /* Alignment = */ 8, MMOFlags);
setUsesTOCBasePtr(DAG);
SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr,
@@ -4292,7 +4496,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
// All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
// into the call.
- if (isSVR4ABI && isPPC64 && !IsPatchPoint) {
+ if (isSVR4ABI && isPPC64 && !isPatchPoint) {
setUsesTOCBasePtr(DAG);
Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
}
@@ -4308,12 +4512,10 @@ bool isLocalCall(const SDValue &Callee)
return false;
}
-SDValue
-PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue PPCTargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
@@ -4354,23 +4556,18 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
return Chain;
}
-SDValue
-PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
- bool isTailCall, bool isVarArg, bool IsPatchPoint,
- bool hasNest, SelectionDAG &DAG,
- SmallVector<std::pair<unsigned, SDValue>, 8>
- &RegsToPass,
- SDValue InFlag, SDValue Chain,
- SDValue CallSeqStart, SDValue &Callee,
- int SPDiff, unsigned NumBytes,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+SDValue PPCTargetLowering::FinishCall(
+ CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
+ bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
+ SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
+ unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
- SPDiff, isTailCall, IsPatchPoint, hasNest,
+ SPDiff, isTailCall, isPatchPoint, hasNest,
RegsToPass, Ops, NodeTys, CS, Subtarget);
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
@@ -4417,7 +4614,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
// same TOC), the NOP will remain unchanged.
if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
- !IsPatchPoint) {
+ !isPatchPoint) {
if (CallOpc == PPCISD::BCTRL) {
// This is a call through a function pointer.
// Restore the caller TOC from the save area into R2.
@@ -4430,7 +4627,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
// allocated and an unnecessary move instruction being generated.
CallOpc = PPCISD::BCTRL_LOAD_TOC;
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
@@ -4472,12 +4669,35 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
- bool IsPatchPoint = CLI.IsPatchPoint;
+ bool isPatchPoint = CLI.IsPatchPoint;
ImmutableCallSite *CS = CLI.CS;
- if (isTailCall)
- isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
- Ins, DAG);
+ if (isTailCall) {
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+ isTailCall =
+ IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
+ isVarArg, Outs, Ins, DAG);
+ else
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+ Ins, DAG);
+ if (isTailCall) {
+ ++NumTailCalls;
+ if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+ ++NumSiblingCalls;
+
+ assert(isa<GlobalAddressSDNode>(Callee) &&
+ "Callee should be an llvm::Function object.");
+ DEBUG(
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ const unsigned Width = 80 - strlen("TCO caller: ")
+ - strlen(", callee linkage: 0, 0");
+ dbgs() << "TCO caller: "
+ << left_justify(DAG.getMachineFunction().getName(), Width)
+ << ", callee linkage: "
+ << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
+ );
+ }
+ }
if (!isTailCall && CS && CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
@@ -4486,29 +4706,27 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Subtarget.isSVR4ABI()) {
if (Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
else
return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
}
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
- isTailCall, IsPatchPoint, Outs, OutVals, Ins,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
dl, DAG, InVals, CS);
}
-SDValue
-PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+SDValue PPCTargetLowering::LowerCall_32SVR4(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
@@ -4534,12 +4752,13 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
PtrByteSize);
+ if (useSoftFloat())
+ CCInfo.PreAnalyzeCallOperands(Outs);
if (isVarArg) {
// Handle fixed and variable vector arguments differently.
@@ -4572,11 +4791,11 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// All arguments are treated the same.
CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
}
+ CCInfo.clearWasPPCF128();
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
- CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- ByValArgLocs, *DAG.getContext());
+ CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -4601,8 +4820,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// Load the return address and frame pointer so it can be moved somewhere else
// later.
SDValue LROp, FPOp;
- Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
- dl);
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
@@ -4676,9 +4894,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
} else {
// Calculate and remember argument location.
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
@@ -4712,10 +4929,10 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
}
if (isTailCall)
- PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
- false, TailCallArguments);
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
/* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
@@ -4723,12 +4940,9 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
-SDValue
-PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
- SDValue CallSeqStart,
- ISD::ArgFlagsTy Flags,
- SelectionDAG &DAG,
- SDLoc dl) const {
+SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
+ SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) const {
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
@@ -4741,27 +4955,29 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
return NewCallSeqStart;
}
-SDValue
-PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+SDValue PPCTargetLowering::LowerCall_64SVR4(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
bool hasNest = false;
+ bool IsSibCall = false;
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
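+ // Without GuaranteedTailCallOpt this tail call is a sibling call; it reuses
+ // the caller's stack frame, so no stack adjustment or callseq_start is
+ // emitted for it below.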
+ if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+ IsSibCall = true;
+
// Mark this function as potentially containing a function that contains a
// tail call. As a consequence the frame pointer will be used for dynamicalloc
// and restoring the callers stack pointer in this functions epilog. This is
@@ -4881,9 +5097,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+ int SPDiff = 0;
+
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ if (!IsSibCall)
+ SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
@@ -4892,15 +5111,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ if (!IsSibCall)
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(NumBytes, dl, true), dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
// later.
SDValue LROp, FPOp;
- Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
- dl);
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
@@ -4980,8 +5199,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
- MachinePointerInfo(), VT,
- false, false, false, 0);
+ MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
@@ -5041,9 +5259,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
Flags, DAG, dl);
// Load the slot into the register.
- SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
@@ -5058,9 +5275,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
@@ -5214,13 +5430,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
if (isVarArg) {
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
- SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
- SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
@@ -5236,8 +5451,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
@@ -5278,13 +5493,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
if (isVarArg) {
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
- SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (QFPR_idx != NumQFPRs) {
- SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl,
- Store, PtrOff, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
+ PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
}
@@ -5294,8 +5508,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
@@ -5332,7 +5546,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// Check if this is an indirect call (MTCTR/BCTRL).
// See PrepareCall() for more information about calls through function
// pointers in the 64-bit SVR4 ABI.
- if (!isTailCall && !IsPatchPoint &&
+ if (!isTailCall && !isPatchPoint &&
!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) {
// Load r2 into a virtual register and store it to the TOC save area.
@@ -5344,12 +5558,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(
Val.getValue(1), dl, Val, AddPtr,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
- if (isELFv2ABI && !IsPatchPoint)
+ if (isELFv2ABI && !isPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
@@ -5362,29 +5575,27 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
InFlag = Chain.getValue(1);
}
- if (isTailCall)
- PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
- FPOp, true, TailCallArguments);
+ if (isTailCall && !IsSibCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
SPDiff, NumBytes, Ins, InVals, CS);
}
-SDValue
-PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+SDValue PPCTargetLowering::LowerCall_Darwin(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const {
unsigned NumOps = Outs.size();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
@@ -5467,8 +5678,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// Load the return address and frame pointer so they can be moved somewhere
// else later.
SDValue LROp, FPOp;
- Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
- dl);
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
@@ -5538,8 +5748,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
- MachinePointerInfo(), VT,
- false, false, false, 0);
+ MachinePointerInfo(), VT);
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
@@ -5569,9 +5778,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
@@ -5606,24 +5814,22 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
if (isVarArg) {
- SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
// Float varargs are always shadowed in available integer registers
if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
- MachinePointerInfo(), false, false,
- false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
@@ -5665,13 +5871,12 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// entirely in R registers. Maybe later.
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
- SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
if (VR_idx != NumVRs) {
- SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
}
@@ -5681,8 +5886,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
break;
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
}
@@ -5754,10 +5959,10 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
}
if (isTailCall)
- PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
- FPOp, true, TailCallArguments);
+ PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+ TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
/* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
@@ -5774,11 +5979,11 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
}
SDValue
-PPCTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
@@ -5814,6 +6019,25 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+
+ if (PPC::G8RCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (PPC::F8RCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else if (PPC::CRRCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i1));
+ else if (PPC::VRRCRegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::Other));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+
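+  // Minimal sketch (helper name is illustrative, not from the patch) of the
+  // iteration contract the new RetOps code above relies on:
+  // getCalleeSavedRegsViaCopy may return null, and a non-null list is
+  // terminated by physical register 0.
+  // #include "llvm/MC/MCRegisterInfo.h"
+  // static unsigned countCSRsViaCopy(const llvm::MCPhysReg *I) {
+  //   unsigned N = 0;
+  //   if (I)
+  //     for (; *I; ++I) // list ends at register 0
+  //       ++N;
+  //   return N;
+  // }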
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
@@ -5823,8 +6047,9 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}
-SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
- SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const {
+SDValue
+PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc dl(Op);
// Get the correct type for integers.
@@ -5839,13 +6064,13 @@ SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}
-SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const {
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
+ SelectionDAG &DAG) const {
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
// Get the correct type for pointers.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
bool isPPC64 = Subtarget.isPPC64();
@@ -5857,22 +6082,20 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
SDValue SaveSP = Op.getOperand(1);
// Load the old link SP.
- SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue LoadLinkSP =
+ DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
// Restore the stack pointer.
Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
// Store the old link SP.
- return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
- false, false, 0);
+ return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -5895,7 +6118,7 @@ SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -5915,15 +6138,14 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
}
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const {
+ SelectionDAG &DAG) const {
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
// Get the correct type for pointers.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, dl, PtrVT), Size);
@@ -6113,7 +6335,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
- SDLoc dl) const {
+ const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
@@ -6156,15 +6378,14 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
- Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
- MPI, false, false, 0);
+ Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
- // add in a bias.
+ // add in a bias on big endian.
if (Op.getValueType() == MVT::i32 && !i32Stack) {
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
DAG.getConstant(4, dl, FIPtr.getValueType()));
- MPI = MPI.getWithOffset(4);
+ MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
}
RLI.Chain = Chain;
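// Sketch (not from the patch) of why the 4-byte bias above applies only on
// big-endian targets: the meaningful 32-bit result sits in the low word of
// the stored 64-bit slot, which is at byte offset 4 on big-endian hosts and
// offset 0 on little-endian hosts.
#include <cstdint>
#include <cstring>

static unsigned lowWordByteOffset() {
  uint64_t Slot = 0x00000000FFFFFFFFull; // low 32 bits set
  unsigned char Bytes[8];
  std::memcpy(Bytes, &Slot, sizeof(Slot));
  uint32_t First;
  std::memcpy(&First, Bytes, sizeof(First));
  return First == 0xFFFFFFFFu ? 0u : 4u; // 0 on little-endian, 4 on big-endian
}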
@@ -6177,7 +6398,7 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SelectionDAG &DAG,
- SDLoc dl) const {
+ const SDLoc &dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
@@ -6208,16 +6429,18 @@ SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
}
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
- SDLoc dl) const {
+ const SDLoc &dl) const {
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
ReuseLoadInfo RLI;
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
- return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
- false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
- RLI.Ranges);
+ return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
+ RLI.Alignment,
+ RLI.IsInvariant ? MachineMemOperand::MOInvariant
+ : MachineMemOperand::MONone,
+ RLI.AAInfo, RLI.Ranges);
}
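// Sketch of the flag translation used above: the old getLoad took the
// invariant property as a bool, while the new overload expects a
// MachineMemOperand::Flags value, hence the ternary. Helper name is
// illustrative only.
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachineMemOperand::Flags invariantFlag(bool IsInvariant) {
  return IsInvariant ? llvm::MachineMemOperand::MOInvariant
                     : llvm::MachineMemOperand::MONone;
}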
// We're trying to insert a regular store, S, and then a load, L. If the
@@ -6251,7 +6474,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
return false;
RLI.Ptr = LD->getBasePtr();
- if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) {
+ if (LD->isIndexed() && !LD->getOffset().isUndef()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
@@ -6289,12 +6512,36 @@ void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}
+/// \brief Analyze the profitability of a direct move: prefer a float load
+/// to an int load plus a direct move when the loaded integer value has no
+/// integer uses.
+static bool directMoveIsProfitable(const SDValue &Op) {
+ SDNode *Origin = Op.getOperand(0).getNode();
+ if (Origin->getOpcode() != ISD::LOAD)
+ return true;
+
+ for (SDNode::use_iterator UI = Origin->use_begin(),
+ UE = Origin->use_end();
+ UI != UE; ++UI) {
+
+ // Only look at the users of the loaded value.
+ if (UI.getUse().get().getResNo() != 0)
+ continue;
+
+ if (UI->getOpcode() != ISD::SINT_TO_FP &&
+ UI->getOpcode() != ISD::UINT_TO_FP)
+ return true;
+ }
+
+ return false;
+}
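+// Illustrative IR shape (names hypothetical, not from the patch) that the
+// heuristic above detects: a load whose only users are int-to-fp
+// conversions, where a float load beats an integer load plus a GPR-to-FPR
+// direct move.
+//
+//   %i = load i64, i64* %p
+//   %f = sitofp i64 %i to double   ; no other (integer) users of %i
+//
+// For this shape directMoveIsProfitable returns false, so the lowering
+// keeps the value in the FP pipeline instead of using a direct move.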
+
/// \brief Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
SelectionDAG &DAG,
- SDLoc dl) const {
+ const SDLoc &dl) const {
assert((Op.getValueType() == MVT::f32 ||
Op.getValueType() == MVT::f64) &&
"Invalid floating point type as target of conversion");
@@ -6335,9 +6582,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64);
- FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs,
- FPHalfs, FPHalfs);
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
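// Quick check (not part of the patch) of the identity used here,
// (V + 1.0) * 0.5 == 0.5 * V + 0.5, on the +/-1.0 boolean encoding the
// surrounding comments imply.
constexpr double halfFMA(double V) { return 0.5 * V + 0.5; }
static_assert(halfFMA(-1.0) == 0.0, "false lane maps to 0.0");
static_assert(halfFMA(1.0) == 1.0, "true lane maps to 1.0");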
@@ -6359,7 +6604,8 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// If we have direct moves, we can do the entire conversion and skip the store/load;
// however, without FPCVT we can't do most conversions.
- if (Subtarget.hasDirectMove() && Subtarget.isPPC64() && Subtarget.hasFPCVT())
+ if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
+ Subtarget.isPPC64() && Subtarget.hasFPCVT())
return LowerINT_TO_FPDirectMove(Op, DAG, dl);
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
@@ -6429,9 +6675,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
- Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false,
- false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo,
- RLI.Ranges);
+ Bits =
+ DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment,
+ RLI.IsInvariant ? MachineMemOperand::MOInvariant
+ : MachineMemOperand::MONone,
+ RLI.AAInfo, RLI.Ranges);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasLFIWAX() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
@@ -6459,15 +6707,15 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
SINT.getOperand(0).getValueType() == MVT::i32) {
MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store = DAG.getStore(
- DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
@@ -6505,7 +6753,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// then lfd it and fcfid it.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
@@ -6516,10 +6764,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store = DAG.getStore(
- DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
@@ -6554,14 +6802,12 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// STD the extended value into the stack slot.
SDValue Store = DAG.getStore(
DAG.getEntryNode(), dl, Ext64, FIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
// Load the value as a double.
Ld = DAG.getLoad(
MVT::f64, dl, Store, FIdx,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
}
// FCFID it and return it.
@@ -6596,7 +6842,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
EVT VT = Op.getValueType();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
EVT NodeTys[] = {
@@ -6608,14 +6854,13 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// Save FP register to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
- StackSlot, MachinePointerInfo(), false, false,0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
+ MachinePointerInfo());
// Load FP Control Word from low 32 bits of stack slot.
SDValue Four = DAG.getConstant(4, dl, PtrVT);
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
- SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
@@ -6730,7 +6975,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize. Cast the result to VT.
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
- SelectionDAG &DAG, SDLoc dl) {
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
static const MVT VTys[] = { // canonical VT to use for each size.
@@ -6746,18 +6991,13 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
EVT CanonicalVT = VTys[SplatSize-1];
// Build a canonical splat for this value.
- SDValue Elt = DAG.getConstant(Val, dl, MVT::i32);
- SmallVector<SDValue, 8> Ops;
- Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
- SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops);
- return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
+ return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
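// Sketch of the simplification above: getConstant with a vector type already
// yields a splat BUILD_VECTOR, so the manual Ops.assign loop is unnecessary.
// The helper name is illustrative.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildSplat(SelectionDAG &DAG, const SDLoc &dl, int Val,
                          EVT CanonicalVT, EVT ReqVT) {
  // Equivalent to building one i32 constant per lane of CanonicalVT,
  // wrapping them in ISD::BUILD_VECTOR, and bitcasting to ReqVT.
  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}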
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
/// specified intrinsic ID.
-static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
- SelectionDAG &DAG, SDLoc dl,
- EVT DestVT = MVT::Other) {
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl, EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op);
@@ -6766,7 +7006,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, SDLoc dl,
+ SelectionDAG &DAG, const SDLoc &dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
@@ -6776,8 +7016,8 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
- SDValue Op2, SelectionDAG &DAG,
- SDLoc dl, EVT DestVT = MVT::Other) {
+ SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
+ EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
@@ -6785,8 +7025,8 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
-static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
- EVT VT, SelectionDAG &DAG, SDLoc dl) {
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
+ SelectionDAG &DAG, const SDLoc &dl) {
// Force LHS/RHS to be the right type.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
@@ -6825,7 +7065,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
bool IsConst = true;
for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (BVN->getOperand(i).isUndef()) continue;
if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
IsConst = false;
break;
@@ -6838,12 +7078,12 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
Constant *NegOne =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
- SmallVector<Constant*, 4> CV(4, NegOne);
+ Constant *CV[4];
for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
+ if (BVN->getOperand(i).isUndef())
CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
else if (isNullConstant(BVN->getOperand(i)))
- continue;
+ CV[i] = NegOne;
else
CV[i] = One;
}
@@ -6852,15 +7092,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
16 /* alignment */);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(DAG.getEntryNode());
- Ops.push_back(CPIdx);
-
- SmallVector<EVT, 2> ValueVTs;
- ValueVTs.push_back(MVT::v4i1);
- ValueVTs.push_back(MVT::Other); // chain
- SDVTList VTs = DAG.getVTList(ValueVTs);
-
+ SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+ SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
return DAG.getMemIntrinsicNode(
PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
@@ -6868,7 +7101,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (BVN->getOperand(i).isUndef()) continue;
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
@@ -6876,19 +7109,16 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
if (StoreSize > 4) {
- Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
- BVN->getOperand(i), Idx,
- PtrInfo.getWithOffset(Offset),
- MVT::i32, false, false, 0));
+ Stores.push_back(
+ DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
+ PtrInfo.getWithOffset(Offset), MVT::i32));
} else {
SDValue StoreValue = BVN->getOperand(i);
if (StoreSize < 4)
StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
- Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
- StoreValue, Idx,
- PtrInfo.getWithOffset(Offset),
- false, false, 0));
+ Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
+ PtrInfo.getWithOffset(Offset)));
}
}
@@ -6903,15 +7133,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// is typed as v4f64 because the QPX register integer states are not
// explicitly represented.
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(StoreChain);
- Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32));
- Ops.push_back(FIdx);
-
- SmallVector<EVT, 2> ValueVTs;
- ValueVTs.push_back(MVT::v4f64);
- ValueVTs.push_back(MVT::Other); // chain
- SDVTList VTs = DAG.getVTList(ValueVTs);
+ SDValue Ops[] = {StoreChain,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
+ FIdx};
+ SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
@@ -6919,9 +7144,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
LoadedVect);
- SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::f64);
- FPZeros = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
- FPZeros, FPZeros, FPZeros, FPZeros);
+ SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
}
@@ -6949,8 +7172,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (SplatBits == 0) {
// Canonicalize all zero vectors to be v4i32.
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
- SDValue Z = DAG.getConstant(0, dl, MVT::i32);
- Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
+ SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
}
return Op;
@@ -7089,7 +7311,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- SDLoc dl) {
+ const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
@@ -7175,11 +7397,50 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
+ unsigned ShiftElts, InsertAtByte;
+ bool Swap;
+ if (Subtarget.hasP9Vector() &&
+ PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
+ isLittleEndian)) {
+ if (Swap)
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
+ if (ShiftElts) {
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+ SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+
+ if (Subtarget.hasVSX()) {
+ if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
+ int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
+ DAG.getConstant(SplatIdx, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
+ }
+
+ // Left shifts of 8 bytes are actually swaps. Convert accordingly.
+ if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+ SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
+ }
+
+ }
+
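+  // Small check (illustrative, not from the patch) that a vsldoi by 8 bytes
+  // on a 16-byte vector is the same permutation as swapping its two
+  // doublewords: byte i of the result comes from byte (i + 8) mod 16 of the
+  // source.
+  static_assert((0 + 8) % 16 == 8 && (7 + 8) % 16 == 15 &&
+                    (8 + 8) % 16 == 0 && (15 + 8) % 16 == 7,
+                "an 8-byte rotate exchanges bytes 0-7 with bytes 8-15");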
if (Subtarget.hasQPX()) {
if (VT.getVectorNumElements() != 4)
return SDValue();
- if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+ if (V2.isUndef()) V2 = V1;
int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
if (AlignIdx != -1) {
@@ -7192,9 +7453,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SplatIdx -= 4;
}
- // FIXME: If SplatIdx == 0 and the input came from a load, then there is
- // nothing to do.
-
return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
DAG.getConstant(SplatIdx, dl, MVT::i32));
}
@@ -7217,7 +7475,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
// selected by the instruction selector.
- if (V2.getOpcode() == ISD::UNDEF) {
+ if (V2.isUndef()) {
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
@@ -7315,7 +7573,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
// vector that will get spilled to the constant pool.
- if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+ if (V2.isUndef()) V2 = V1;
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
@@ -7340,8 +7598,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
MVT::i32));
}
- SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
- ResultMask);
+ SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
@@ -7468,6 +7725,16 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+ if (IntrinsicID == Intrinsic::thread_pointer) {
+ // Reads the thread pointer register, used for __builtin_thread_pointer.
+ bool is64bit = Subtarget.isPPC64();
+ return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+ is64bit ? MVT::i64 : MVT::i32);
+ }
+
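+  // Illustrative source-level use of the intrinsic handled above (assumes a
+  // compiler exposing the GCC-style builtin, per the in-code comment):
+  // __builtin_thread_pointer() maps to @llvm.thread.pointer, which this
+  // lowering turns into a read of X13 (64-bit) or R2 (32-bit).
+  //
+  //   static void *currentThreadPointer() {
+  //     return __builtin_thread_pointer();
+  //   }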
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
SDLoc dl(Op);
@@ -7566,12 +7833,10 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
- Op.getOperand(0), FIdx, MachinePointerInfo(),
- false, false, 0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo());
// Load it out.
- return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -7594,9 +7859,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64);
- FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
- FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
@@ -7613,15 +7876,10 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue StoreChain = DAG.getEntryNode();
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(StoreChain);
- Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32));
- Ops.push_back(Value);
- Ops.push_back(FIdx);
-
- SmallVector<EVT, 2> ValueVTs;
- ValueVTs.push_back(MVT::Other); // chain
- SDVTList VTs = DAG.getVTList(ValueVTs);
+ SDValue Ops[] = {StoreChain,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+ Value, FIdx};
+ SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
@@ -7631,9 +7889,8 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
- SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
- PtrInfo.getWithOffset(Offset),
- false, false, false, 0);
+ SDValue IntVal =
+ DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
if (!Subtarget.useCRBits())
return IntVal;
@@ -7662,24 +7919,20 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
- SmallVector<SDValue, 8> Vals, LoadChains;
+ SDValue Vals[4], LoadChains[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Load;
if (ScalarVT != ScalarMemVT)
- Load =
- DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
- BasePtr,
- LN->getPointerInfo().getWithOffset(Idx*Stride),
- ScalarMemVT, LN->isVolatile(), LN->isNonTemporal(),
- LN->isInvariant(), MinAlign(Alignment, Idx*Stride),
- LN->getAAInfo());
+ Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
+ BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * Stride),
+ ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
else
- Load =
- DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
- LN->getPointerInfo().getWithOffset(Idx*Stride),
- LN->isVolatile(), LN->isNonTemporal(),
- LN->isInvariant(), MinAlign(Alignment, Idx*Stride),
- LN->getAAInfo());
+ Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * Stride),
+ MinAlign(Alignment, Idx * Stride),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
if (Idx == 0 && LN->isIndexed()) {
assert(LN->getAddressingMode() == ISD::PRE_INC &&
@@ -7688,8 +7941,8 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
LN->getAddressingMode());
}
- Vals.push_back(Load);
- LoadChains.push_back(Load.getValue(1));
+ Vals[Idx] = Load;
+ LoadChains[Idx] = Load.getValue(1);
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
@@ -7697,8 +7950,7 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
- Op.getValueType(), Vals);
+ SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
if (LN->isIndexed()) {
SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
@@ -7715,23 +7967,20 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
// To lower v4i1 from a byte array, we load the byte elements of the
// vector and then reuse the BUILD_VECTOR logic.
- SmallVector<SDValue, 4> VectElmts, VectElmtChains;
+ SDValue VectElmts[4], VectElmtChains[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
- VectElmts.push_back(DAG.getExtLoad(ISD::EXTLOAD,
- dl, MVT::i32, LoadChain, Idx,
- LN->getPointerInfo().getWithOffset(i),
- MVT::i8 /* memory type */,
- LN->isVolatile(), LN->isNonTemporal(),
- LN->isInvariant(),
- 1 /* alignment */, LN->getAAInfo()));
- VectElmtChains.push_back(VectElmts[i].getValue(1));
+ VectElmts[i] = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
+ LN->getPointerInfo().getWithOffset(i), MVT::i8,
+ /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
+ VectElmtChains[i] = VectElmts[i].getValue(1);
}
LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
- SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i1, VectElmts);
+ SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
SDValue RVals[] = { Value, LoadChain };
return DAG.getMergeValues(RVals, dl);
@@ -7759,7 +8008,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
ScalarMemVT = MemVT.getScalarType();
unsigned Stride = ScalarMemVT.getStoreSize();
- SmallVector<SDValue, 8> Stores;
+ SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Ex = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
@@ -7767,16 +8016,15 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
- DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
- SN->getPointerInfo().getWithOffset(Idx*Stride),
- ScalarMemVT, SN->isVolatile(), SN->isNonTemporal(),
- MinAlign(Alignment, Idx*Stride), SN->getAAInfo());
+ DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * Stride),
+ ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
else
- Store =
- DAG.getStore(StoreChain, dl, Ex, BasePtr,
- SN->getPointerInfo().getWithOffset(Idx*Stride),
- SN->isVolatile(), SN->isNonTemporal(),
- MinAlign(Alignment, Idx*Stride), SN->getAAInfo());
+ Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * Stride),
+ MinAlign(Alignment, Idx * Stride),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
if (Idx == 0 && SN->isIndexed()) {
assert(SN->getAddressingMode() == ISD::PRE_INC &&
@@ -7788,7 +8036,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
BasePtr.getValueType()));
- Stores.push_back(Store);
+ Stores[Idx] = Store;
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
@@ -7811,9 +8059,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64);
- FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
- FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+ SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
@@ -7829,43 +8075,37 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(StoreChain);
- Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32));
- Ops.push_back(Value);
- Ops.push_back(FIdx);
-
- SmallVector<EVT, 2> ValueVTs;
- ValueVTs.push_back(MVT::Other); // chain
- SDVTList VTs = DAG.getVTList(ValueVTs);
+ SDValue Ops[] = {StoreChain,
+ DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+ Value, FIdx};
+ SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Move data into the byte array.
- SmallVector<SDValue, 4> Loads, LoadChains;
+ SDValue Loads[4], LoadChains[4];
for (unsigned i = 0; i < 4; ++i) {
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
- Loads.push_back(DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
- PtrInfo.getWithOffset(Offset),
- false, false, false, 0));
- LoadChains.push_back(Loads[i].getValue(1));
+ Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+ PtrInfo.getWithOffset(Offset));
+ LoadChains[i] = Loads[i].getValue(1);
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- SmallVector<SDValue, 4> Stores;
+ SDValue Stores[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
- Stores.push_back(DAG.getTruncStore(
+ Stores[i] = DAG.getTruncStore(
StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
- MVT::i8 /* memory type */, SN->isNonTemporal(), SN->isVolatile(),
- 1 /* alignment */, SN->getAAInfo()));
+ MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
+ SN->getAAInfo());
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
@@ -7958,18 +8198,22 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::VASTART:
- return LowerVASTART(Op, DAG, Subtarget);
+ return LowerVASTART(Op, DAG);
case ISD::VAARG:
- return LowerVAARG(Op, DAG, Subtarget);
+ return LowerVAARG(Op, DAG);
case ISD::VACOPY:
- return LowerVACOPY(Op, DAG, Subtarget);
+ return LowerVACOPY(Op, DAG);
+
+ case ISD::STACKRESTORE:
+ return LowerSTACKRESTORE(Op, DAG);
- case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget);
case ISD::DYNAMIC_STACKALLOC:
- return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
- case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget);
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
@@ -8048,7 +8292,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
EVT VT = N->getValueType(0);
if (VT == MVT::i64) {
- SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget);
+ SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
Results.push_back(NewNode);
Results.push_back(NewNode.getValue(1));
@@ -8099,9 +8343,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (Ord == SequentiallyConsistent)
+ if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
- if (isAtLeastRelease(Ord))
+ if (isReleaseOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
return nullptr;
}
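// Sketch of the fence-selection rule above, restated with the new scoped
// AtomicOrdering enum and the renamed predicates used by this patch
// (isAtLeastRelease is now isReleaseOrStronger, isAtLeastAcquire is now
// isAcquireOrStronger). The helper is illustrative only.
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

static const char *leadingFenceFor(AtomicOrdering Ord) {
  if (Ord == AtomicOrdering::SequentiallyConsistent)
    return "sync";   // ppc_sync intrinsic
  if (isReleaseOrStronger(Ord))
    return "lwsync"; // ppc_lwsync intrinsic
  return nullptr;    // no leading fence needed
}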
@@ -8109,7 +8353,7 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
AtomicOrdering Ord, bool IsStore,
bool IsLoad) const {
- if (IsLoad && isAtLeastAcquire(Ord))
+ if (IsLoad && isAcquireOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
// FIXME: this is too conservative, a dependent branch + isync is enough.
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
@@ -8119,7 +8363,7 @@ Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
}
MachineBasicBlock *
-PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
unsigned AtomicSize,
unsigned BinOpcode) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
@@ -8154,11 +8398,11 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptrA = MI->getOperand(1).getReg();
- unsigned ptrB = MI->getOperand(2).getReg();
- unsigned incr = MI->getOperand(3).getReg();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned incr = MI.getOperand(3).getReg();
+ DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -8203,9 +8447,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
}
MachineBasicBlock *
-PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI,
MachineBasicBlock *BB,
- bool is8bit, // operation
+ bool is8bit, // operation
unsigned BinOpcode) const {
// If we support part-word atomic mnemonics, just use them
if (Subtarget.hasPartwordAtomics())
@@ -8224,11 +8468,11 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptrA = MI->getOperand(1).getReg();
- unsigned ptrB = MI->getOperand(2).getReg();
- unsigned incr = MI->getOperand(3).getReg();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned incr = MI.getOperand(3).getReg();
+ DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -8334,10 +8578,10 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
return BB;
}
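// Sketch of the MachineInstr API change applied to these emitters: custom
// inserters now take a MachineInstr reference instead of a pointer, so
// operand access becomes MI.getOperand(N) rather than MI->getOperand(N).
// The helper name is illustrative.
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

static unsigned destRegOf(MachineInstr &MI) {
  // Old form: MI->getOperand(0).getReg() on a MachineInstr *MI.
  return MI.getOperand(0).getReg();
}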
-llvm::MachineBasicBlock*
-PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+llvm::MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineFunction *MF = MBB->getParent();
@@ -8347,10 +8591,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
- unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(RC->hasType(MVT::i32) && "Invalid destination!");
unsigned mainDstReg = MRI.createVirtualRegister(RC);
@@ -8407,7 +8651,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Prepare IP either in reg.
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
- unsigned BufReg = MI->getOperand(1).getReg();
+ unsigned BufReg = MI.getOperand(1).getReg();
if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
setUsesTOCBasePtr(*MBB->getParent());
@@ -8477,22 +8721,22 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(thisMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
-PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
@@ -8507,10 +8751,8 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
unsigned BP =
(PVT == MVT::i64)
? PPC::X30
- : (Subtarget.isSVR4ABI() &&
- MF->getTarget().getRelocationModel() == Reloc::PIC_
- ? PPC::R29
- : PPC::R30);
+ : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
+ : PPC::R30);
MachineInstrBuilder MIB;
@@ -8519,7 +8761,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
- unsigned BufReg = MI->getOperand(0).getReg();
+ unsigned BufReg = MI.getOperand(0).getReg();
// Reload FP (the jumped-to function may not have had a
// frame pointer, and if so, then its r31 will be restored
@@ -8586,34 +8828,34 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
MachineBasicBlock *
-PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- if (MI->getOpcode() == TargetOpcode::STACKMAP ||
- MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
- MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't, however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
// will confuse it with a regular operand. Instead, add the dependence
// here.
setUsesTOCBasePtr(*BB->getParent());
- MI->addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
+ MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
}
return emitPatchPoint(MI, BB);
}
- if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
- MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
+ if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
+ MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
return emitEHSjLjSetJmp(MI, BB);
- } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 ||
- MI->getOpcode() == PPC::EH_SjLj_LongJmp64) {
+ } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
+ MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
return emitEHSjLjLongJmp(MI, BB);
}
@@ -8626,44 +8868,43 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineFunction *F = BB->getParent();
- if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
- MI->getOpcode() == PPC::SELECT_CC_I8 ||
- MI->getOpcode() == PPC::SELECT_I4 ||
- MI->getOpcode() == PPC::SELECT_I8)) {
+ if (Subtarget.hasISEL() &&
+ (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8 ||
+ MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
SmallVector<MachineOperand, 2> Cond;
- if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
- MI->getOpcode() == PPC::SELECT_CC_I8)
- Cond.push_back(MI->getOperand(4));
+ if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8)
+ Cond.push_back(MI.getOperand(4));
else
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
- Cond.push_back(MI->getOperand(1));
-
- DebugLoc dl = MI->getDebugLoc();
- TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
- Cond, MI->getOperand(2).getReg(),
- MI->getOperand(3).getReg());
- } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
- MI->getOpcode() == PPC::SELECT_CC_I8 ||
- MI->getOpcode() == PPC::SELECT_CC_F4 ||
- MI->getOpcode() == PPC::SELECT_CC_F8 ||
- MI->getOpcode() == PPC::SELECT_CC_QFRC ||
- MI->getOpcode() == PPC::SELECT_CC_QSRC ||
- MI->getOpcode() == PPC::SELECT_CC_QBRC ||
- MI->getOpcode() == PPC::SELECT_CC_VRRC ||
- MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
- MI->getOpcode() == PPC::SELECT_CC_VSSRC ||
- MI->getOpcode() == PPC::SELECT_CC_VSRC ||
- MI->getOpcode() == PPC::SELECT_I4 ||
- MI->getOpcode() == PPC::SELECT_I8 ||
- MI->getOpcode() == PPC::SELECT_F4 ||
- MI->getOpcode() == PPC::SELECT_F8 ||
- MI->getOpcode() == PPC::SELECT_QFRC ||
- MI->getOpcode() == PPC::SELECT_QSRC ||
- MI->getOpcode() == PPC::SELECT_QBRC ||
- MI->getOpcode() == PPC::SELECT_VRRC ||
- MI->getOpcode() == PPC::SELECT_VSFRC ||
- MI->getOpcode() == PPC::SELECT_VSSRC ||
- MI->getOpcode() == PPC::SELECT_VSRC) {
+ Cond.push_back(MI.getOperand(1));
+
+ DebugLoc dl = MI.getDebugLoc();
+ TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
+ MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
+ } else if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ MI.getOpcode() == PPC::SELECT_CC_I8 ||
+ MI.getOpcode() == PPC::SELECT_CC_F4 ||
+ MI.getOpcode() == PPC::SELECT_CC_F8 ||
+ MI.getOpcode() == PPC::SELECT_CC_QFRC ||
+ MI.getOpcode() == PPC::SELECT_CC_QSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_QBRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VRRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_CC_VSRC ||
+ MI.getOpcode() == PPC::SELECT_I4 ||
+ MI.getOpcode() == PPC::SELECT_I8 ||
+ MI.getOpcode() == PPC::SELECT_F4 ||
+ MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_QFRC ||
+ MI.getOpcode() == PPC::SELECT_QSRC ||
+ MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_VRRC ||
+ MI.getOpcode() == PPC::SELECT_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_VSRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
@@ -8677,7 +8918,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
@@ -8690,23 +8931,24 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- if (MI->getOpcode() == PPC::SELECT_I4 ||
- MI->getOpcode() == PPC::SELECT_I8 ||
- MI->getOpcode() == PPC::SELECT_F4 ||
- MI->getOpcode() == PPC::SELECT_F8 ||
- MI->getOpcode() == PPC::SELECT_QFRC ||
- MI->getOpcode() == PPC::SELECT_QSRC ||
- MI->getOpcode() == PPC::SELECT_QBRC ||
- MI->getOpcode() == PPC::SELECT_VRRC ||
- MI->getOpcode() == PPC::SELECT_VSFRC ||
- MI->getOpcode() == PPC::SELECT_VSSRC ||
- MI->getOpcode() == PPC::SELECT_VSRC) {
+ if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
+ MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
+ MI.getOpcode() == PPC::SELECT_QFRC ||
+ MI.getOpcode() == PPC::SELECT_QSRC ||
+ MI.getOpcode() == PPC::SELECT_QBRC ||
+ MI.getOpcode() == PPC::SELECT_VRRC ||
+ MI.getOpcode() == PPC::SELECT_VSFRC ||
+ MI.getOpcode() == PPC::SELECT_VSSRC ||
+ MI.getOpcode() == PPC::SELECT_VSRC) {
BuildMI(BB, dl, TII->get(PPC::BC))
- .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
} else {
- unsigned SelectPred = MI->getOperand(4).getImm();
+ unsigned SelectPred = MI.getOperand(4).getImm();
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ .addImm(SelectPred)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
}
// copy0MBB:
@@ -8721,11 +8963,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), dl,
- TII->get(PPC::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
- } else if (MI->getOpcode() == PPC::ReadTB) {
+ BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
+ } else if (MI.getOpcode() == PPC::ReadTB) {
// To read the 64-bit time-base register on a 32-bit target, we read the
// two halves. Should the counter have wrapped while it was being read, we
// need to try again.
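For reference, the wrap-safe read described in the comment above is the classic mftbu/mftb/mftbu idiom. A minimal source-level sketch of the equivalent retry loop, assuming GCC/Clang inline asm on 32-bit PowerPC (illustrative only, not part of the patch):

    // Read TBU (SPR 269), then TBL (SPR 268), then TBU again; if the upper
    // half changed, the counter wrapped between the reads, so retry.
    static inline unsigned long long read_timebase(void) {
      unsigned int hi, lo, hi2;
      do {
        __asm__ volatile("mftbu %0" : "=r"(hi));
        __asm__ volatile("mftb  %0" : "=r"(lo));
        __asm__ volatile("mftbu %0" : "=r"(hi2));
      } while (hi != hi2);
      return ((unsigned long long)hi << 32) | lo;
    }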
@@ -8740,7 +8983,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
F->insert(It, readMBB);
F->insert(It, sinkMBB);
@@ -8754,8 +8997,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
- unsigned LoReg = MI->getOperand(0).getReg();
- unsigned HiReg = MI->getOperand(1).getReg();
+ unsigned LoReg = MI.getOperand(0).getReg();
+ unsigned HiReg = MI.getOperand(1).getReg();
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
@@ -8770,81 +9013,80 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(readMBB);
BB->addSuccessor(sinkMBB);
- }
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+ } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
- else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
- else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
- else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
- else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0);
- else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+ else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
- else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
- MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
+ else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
- MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
(Subtarget.hasPartwordAtomics() &&
- MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
- bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
+ bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
- switch(MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
llvm_unreachable("Compare and swap of unknown size");
case PPC::ATOMIC_CMP_SWAP_I8:
@@ -8866,12 +9108,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
StoreMnemonic = PPC::STDCX;
break;
}
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptrA = MI->getOperand(1).getReg();
- unsigned ptrB = MI->getOperand(2).getReg();
- unsigned oldval = MI->getOperand(3).getReg();
- unsigned newval = MI->getOperand(4).getReg();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned oldval = MI.getOperand(3).getReg();
+ unsigned newval = MI.getOperand(4).getReg();
+ DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -8928,20 +9170,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// exitMBB:
// ...
BB = exitMBB;
- } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
- MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+ } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+ MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
// We must use 64-bit registers for addresses when targeting 64-bit,
// since we're actually doing arithmetic on them. Other registers
// can be 32-bit.
bool is64bit = Subtarget.isPPC64();
- bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+ bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptrA = MI->getOperand(1).getReg();
- unsigned ptrB = MI->getOperand(2).getReg();
- unsigned oldval = MI->getOperand(3).getReg();
- unsigned newval = MI->getOperand(4).getReg();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned dest = MI.getOperand(0).getReg();
+ unsigned ptrA = MI.getOperand(1).getReg();
+ unsigned ptrB = MI.getOperand(2).getReg();
+ unsigned oldval = MI.getOperand(3).getReg();
+ unsigned newval = MI.getOperand(4).getReg();
+ DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -9076,14 +9318,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
.addReg(ShiftReg);
- } else if (MI->getOpcode() == PPC::FADDrtz) {
+ } else if (MI.getOpcode() == PPC::FADDrtz) {
// This pseudo performs an FADD with rounding mode temporarily forced
// to round-to-zero. We emit this via custom inserter since the FPSCR
// is not modeled at the SelectionDAG level.
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src1 = MI->getOperand(1).getReg();
- unsigned Src2 = MI->getOperand(2).getReg();
- DebugLoc dl = MI->getDebugLoc();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src1 = MI.getOperand(1).getReg();
+ unsigned Src2 = MI.getOperand(2).getReg();
+ DebugLoc dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
@@ -9100,29 +9342,31 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
- } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
- MI->getOpcode() == PPC::ANDIo_1_GT_BIT ||
- MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
- MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) {
- unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
- MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ?
- PPC::ANDIo8 : PPC::ANDIo;
- bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
- MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8);
+ } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) {
+ unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI.getOpcode() == PPC::ANDIo_1_GT_BIT8)
+ ? PPC::ANDIo8
+ : PPC::ANDIo;
+ bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8);
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
&PPC::GPRCRegClass :
&PPC::G8RCRegClass);
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
- .addReg(MI->getOperand(1).getReg()).addImm(1);
+ .addReg(MI.getOperand(1).getReg())
+ .addImm(1);
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
- MI->getOperand(0).getReg())
- .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
- } else if (MI->getOpcode() == PPC::TCHECK_RET) {
- DebugLoc Dl = MI->getDebugLoc();
+ MI.getOperand(0).getReg())
+ .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
+ } else if (MI.getOpcode() == PPC::TCHECK_RET) {
+ DebugLoc Dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
@@ -9131,7 +9375,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
llvm_unreachable("Unexpected instr type to insert");
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -9650,14 +9894,18 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
}
+ std::list<HandleSDNode> PromOpHandles;
+ for (auto &PromOp : PromOps)
+ PromOpHandles.emplace_back(PromOp);
+
// Replace all operations (these are all the same, but have a different
// (i1) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first. Any intermediate truncations or
// extensions disappear.
- while (!PromOps.empty()) {
- SDValue PromOp = PromOps.back();
- PromOps.pop_back();
+ while (!PromOpHandles.empty()) {
+ SDValue PromOp = PromOpHandles.back().getValue();
+ PromOpHandles.pop_back();
if (PromOp.getOpcode() == ISD::TRUNCATE ||
PromOp.getOpcode() == ISD::SIGN_EXTEND ||
@@ -9666,7 +9914,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
PromOp.getOperand(0).getValueType() != MVT::i1) {
// The operand is not yet ready (see comment below).
- PromOps.insert(PromOps.begin(), PromOp);
+ PromOpHandles.emplace_front(PromOp);
continue;
}
@@ -9693,7 +9941,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
- PromOps.insert(PromOps.begin(), PromOp);
+ PromOpHandles.emplace_front(PromOp);
continue;
}
@@ -9900,13 +10148,17 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
}
+ std::list<HandleSDNode> PromOpHandles;
+ for (auto &PromOp : PromOps)
+ PromOpHandles.emplace_back(PromOp);
+
// Replace all operations (these are all the same, but have a different
// (promoted) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first.
- while (!PromOps.empty()) {
- SDValue PromOp = PromOps.back();
- PromOps.pop_back();
+ while (!PromOpHandles.empty()) {
+ SDValue PromOp = PromOpHandles.back().getValue();
+ PromOpHandles.pop_back();
unsigned C;
switch (PromOp.getOpcode()) {
@@ -9923,7 +10175,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
- PromOps.insert(PromOps.begin(), PromOp);
+ PromOpHandles.emplace_front(PromOp);
continue;
}
@@ -9935,7 +10187,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
(SelectTruncOp[1].count(PromOp.getNode()) &&
PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
- PromOps.insert(PromOps.begin(), PromOp);
+ PromOpHandles.emplace_front(PromOp);
continue;
}
}
@@ -9997,6 +10249,59 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
ShiftCst);
}
+SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+ "Should be called with a BUILD_VECTOR node");
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+ if (N->getValueType(0) != MVT::v2f64 || !Subtarget.hasVSX())
+ return SDValue();
+
+ // Looking for:
+ // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
+ if (N->getOperand(0).getOpcode() != ISD::SINT_TO_FP &&
+ N->getOperand(0).getOpcode() != ISD::UINT_TO_FP)
+ return SDValue();
+ if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
+ N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
+ return SDValue();
+ if (N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
+ return SDValue();
+
+ SDValue Ext1 = N->getOperand(0).getOperand(0);
+ SDValue Ext2 = N->getOperand(1).getOperand(0);
+  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
+ ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
+ if (!Ext1Op || !Ext2Op)
+ return SDValue();
+ if (Ext1.getValueType() != MVT::i32 ||
+ Ext2.getValueType() != MVT::i32)
+ if (Ext1.getOperand(0) != Ext2.getOperand(0))
+ return SDValue();
+
+ int FirstElem = Ext1Op->getZExtValue();
+ int SecondElem = Ext2Op->getZExtValue();
+ int SubvecIdx;
+ if (FirstElem == 0 && SecondElem == 1)
+ SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
+ else if (FirstElem == 2 && SecondElem == 3)
+ SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
+ else
+ return SDValue();
+
+ SDValue SrcVec = Ext1.getOperand(0);
+ auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
+ PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
+ return DAG.getNode(NodeType, dl, MVT::v2f64,
+ SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
+}
+
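As a rough source-level illustration of what this combine targets (a hypothetical example assuming GCC/Clang vector extensions on a VSX target, not taken from the patch):

    // Converting the low two lanes of a v4i32 to v2f64 produces the
    // build_vector of sint_to_fp(extract_vector_elt) shape matched above,
    // which can now become a single subvector convert instead of two
    // scalar conversions.
    typedef int v4i32 __attribute__((vector_size(16)));
    typedef double v2f64 __attribute__((vector_size(16)));
    v2f64 convert_low(v4i32 v) {
      v2f64 r = {(double)v[0], (double)v[1]};
      return r;
    }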
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::SINT_TO_FP ||
@@ -10109,13 +10414,24 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
MVT VecTy = N->getValueType(0).getSimpleVT();
SDValue LoadOps[] = { Chain, Base };
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
- DAG.getVTList(VecTy, MVT::Other),
- LoadOps, VecTy, MMO);
+ DAG.getVTList(MVT::v2f64, MVT::Other),
+ LoadOps, MVT::v2f64, MMO);
+
DCI.AddToWorklist(Load.getNode());
Chain = Load.getValue(1);
- SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
- DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+ SDValue Swap = DAG.getNode(
+ PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
DCI.AddToWorklist(Swap.getNode());
+
+ // Add a bitcast if the resulting load type doesn't match v2f64.
+ if (VecTy != MVT::v2f64) {
+ SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
+ DCI.AddToWorklist(N.getNode());
+ // Package {bitcast value, swap's chain} to match Load's shape.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
+ N, Swap.getValue(1));
+ }
+
return Swap;
}
@@ -10159,8 +10475,15 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
SDValue Src = N->getOperand(SrcOpnd);
MVT VecTy = Src.getValueType().getSimpleVT();
+
+  // All stores are done as v2f64, with a bitcast added when needed.
+ if (VecTy != MVT::v2f64) {
+ Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
+ DCI.AddToWorklist(Src.getNode());
+ }
+
SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
- DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+ DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
DCI.AddToWorklist(Swap.getNode());
Chain = Swap.getValue(1);
SDValue StoreOps[] = { Chain, Swap, Base };
@@ -10277,6 +10600,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return expandVSXLoadForLE(N, DCI);
}
+ // We sometimes end up with a 64-bit integer load, from which we extract
+ // two single-precision floating-point numbers. This happens with
+ // std::complex<float>, and other similar structures, because of the way we
+ // canonicalize structure copies. However, if we lack direct moves,
+ // then the final bitcasts from the extracted integer values to the
+ // floating-point numbers turn into store/load pairs. Even with direct moves,
+ // just loading the two floating-point numbers is likely better.
+ auto ReplaceTwoFloatLoad = [&]() {
+ if (VT != MVT::i64)
+ return false;
+
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ LD->isVolatile())
+ return false;
+
+ // We're looking for a sequence like this:
+ // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+ // t16: i64 = srl t13, Constant:i32<32>
+ // t17: i32 = truncate t16
+ // t18: f32 = bitcast t17
+ // t19: i32 = truncate t13
+ // t20: f32 = bitcast t19
+
+ if (!LD->hasNUsesOfValue(2, 0))
+ return false;
+
+ auto UI = LD->use_begin();
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *Trunc = *UI++;
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *RightShift = *UI;
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ std::swap(Trunc, RightShift);
+
+ if (Trunc->getOpcode() != ISD::TRUNCATE ||
+ Trunc->getValueType(0) != MVT::i32 ||
+ !Trunc->hasOneUse())
+ return false;
+ if (RightShift->getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+ RightShift->getConstantOperandVal(1) != 32 ||
+ !RightShift->hasOneUse())
+ return false;
+
+ SDNode *Trunc2 = *RightShift->use_begin();
+ if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+ Trunc2->getValueType(0) != MVT::i32 ||
+ !Trunc2->hasOneUse())
+ return false;
+
+ SDNode *Bitcast = *Trunc->use_begin();
+ SDNode *Bitcast2 = *Trunc2->use_begin();
+
+ if (Bitcast->getOpcode() != ISD::BITCAST ||
+ Bitcast->getValueType(0) != MVT::f32)
+ return false;
+ if (Bitcast2->getOpcode() != ISD::BITCAST ||
+ Bitcast2->getValueType(0) != MVT::f32)
+ return false;
+
+ if (Subtarget.isLittleEndian())
+ std::swap(Bitcast, Bitcast2);
+
+ // Bitcast has the second float (in memory-layout order) and Bitcast2
+ // has the first one.
+
+ SDValue BasePtr = LD->getBasePtr();
+ if (LD->isIndexed()) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ BasePtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ LD->getOffset());
+ }
+
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
+ SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), LD->getAlignment(),
+ MMOFlags, LD->getAAInfo());
+ SDValue AddPtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(4, dl));
+ SDValue FloatLoad2 = DAG.getLoad(
+ MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+ LD->getPointerInfo().getWithOffset(4),
+ MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
+
+ if (LD->isIndexed()) {
+ // Note that DAGCombine should re-form any pre-increment load(s) from
+ // what is produced here if that makes sense.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+ }
+
+ DCI.CombineTo(Bitcast2, FloatLoad);
+ DCI.CombineTo(Bitcast, FloatLoad2);
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+ SDValue(FloatLoad2.getNode(), 1));
+ return true;
+ };
+
+ if (ReplaceTwoFloatLoad())
+ return SDValue(N, 0);
+
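The std::complex<float> scenario mentioned at the top of this block boils down to something like the following hypothetical reproducer (not part of the patch): the 8-byte aggregate is copied as one i64 load, and its two f32 halves are then peeled off with the srl/truncate/bitcast sequence shown in the comment.

    #include <complex>
    // Before this combine: one 64-bit integer load plus shifts and bitcasts.
    // After: two ordinary f32 loads.
    std::complex<float> copy_value(const std::complex<float> *p) {
      return *p;
    }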
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
@@ -10710,6 +11138,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case ISD::BUILD_VECTOR:
+ return DAGCombineBuildVector(N, DCI);
}
return SDValue();
@@ -10801,7 +11231,8 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
- case PPC::DIR_PWR8: {
+ case PPC::DIR_PWR8:
+ case PPC::DIR_PWR9: {
if (!ML)
break;
@@ -10812,7 +11243,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
uint64_t LoopSize = 0;
for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
- LoopSize += TII->GetInstSizeInBytes(J);
+ LoopSize += TII->GetInstSizeInBytes(*J);
if (LoopSize > 32)
break;
}
@@ -10837,6 +11268,7 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const {
case 'b':
case 'r':
case 'f':
+ case 'd':
case 'v':
case 'y':
return C_RegisterClass;
@@ -10928,6 +11360,10 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
+  // 'd' and 'f' constraints are both defined to be "the floating point
+  // registers", where one is for 32-bit and the other for 64-bit. We don't
+  // need to distinguish them here, so just give them the same reg classes.
+ case 'd':
case 'f':
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::F4RCRegClass);
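For context, this is the kind of GCC-style inline asm the new 'd' constraint letter accepts once this hunk maps it to the FP register classes (a hypothetical example, not from the patch):

    double sum(double a, double b) {
      double r;
      // 'd' now resolves to a floating-point register, just like 'f'.
      __asm__("fadd %0, %1, %2" : "=d"(r) : "d"(a), "d"(b));
      return r;
    }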
@@ -11126,13 +11562,13 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
isPPC64 ? MVT::i64 : MVT::i32);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
// Just load the return address off the stack.
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
- MachinePointerInfo(), false, false, false, 0);
+ MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -11144,7 +11580,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setFrameAddressIsTaken(true);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Naked functions never have a frame pointer, and so we use r1. For all
@@ -11159,8 +11595,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
PtrVT);
while (Depth--)
FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
- FrameAddr, MachinePointerInfo(), false, false,
- false, 0);
+ FrameAddr, MachinePointerInfo());
return FrameAddr;
}
@@ -11567,10 +12002,8 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
if (VT == MVT::v2i64)
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
- if (Subtarget.hasQPX()) {
- if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1)
- return true;
- }
+ if (Subtarget.hasVSX() || Subtarget.hasQPX())
+ return true;
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}
@@ -11588,3 +12021,70 @@ PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const {
return PPC::createFastISel(FuncInfo, LibInfo);
}
+
+void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ if (Subtarget.isDarwinABI()) return;
+ if (!Subtarget.isPPC64()) return;
+
+ // Update IsSplitCSR in PPCFunctionInfo
+ PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
+ PFI->setIsSplitCSR(true);
+}
+
+void PPCTargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (PPC::G8RCRegClass.contains(*I))
+ RC = &PPC::G8RCRegClass;
+ else if (PPC::F8RCRegClass.contains(*I))
+ RC = &PPC::F8RCRegClass;
+ else if (PPC::CRRCRegClass.contains(*I))
+ RC = &PPC::CRRCRegClass;
+ else if (PPC::VRRCRegClass.contains(*I))
+ RC = &PPC::VRRCRegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions; it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux.
+bool PPCTargetLowering::useLoadStackGuardNode() const {
+ if (!Subtarget.isTargetLinux())
+ return TargetLowering::useLoadStackGuardNode();
+ return true;
+}
+
+// Override to avoid using a global guard variable on Linux (the guard is
+// loaded via LOAD_STACK_GUARD instead).
+void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (!Subtarget.isTargetLinux())
+ return TargetLowering::insertSSPDeclarations(M);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 44bcb8942cfc..e3be8074e62e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -61,6 +61,18 @@ namespace llvm {
///
VPERM,
+ /// XXSPLT - The PPC VSX splat instructions
+ ///
+ XXSPLT,
+
+ /// XXINSERT - The PPC VSX insert instruction
+ ///
+ XXINSERT,
+
+ /// VECSHL - The PPC VSX shift left instruction
+ ///
+ VECSHL,
+
/// The CMPB instruction (takes two operands of i32 or i64).
CMPB,
@@ -133,6 +145,16 @@ namespace llvm {
/// Direct move from a GPR to a VSX register (zero)
MTVSRZ,
+    /// Extract a subvector from a signed integer vector and convert to FP.
+ /// It is primarily used to convert a (widened) illegal integer vector
+ /// type to a legal floating point vector type.
+ /// For example v2i32 -> widened to v4i32 -> v2f64
+ SINT_VEC_TO_FP,
+
+    /// Extract a subvector from an unsigned integer vector and convert to FP.
+ /// As with SINT_VEC_TO_FP, used for converting illegal types.
+ UINT_VEC_TO_FP,
+
// FIXME: Remove these once the ANDI glue bug is fixed:
/// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
/// eq or gt bit of CR0 after executing andi. x, 1. This is used to
@@ -297,6 +319,10 @@ namespace llvm {
/// of outputs.
XXSWAPD,
+ /// An SDNode for swaps that are not associated with any loads/stores
+ /// and thereby have no chain.
+ SWAP_NO_CHAIN,
+
/// QVFPERM = This corresponds to the QPX qvfperm instruction.
QVFPERM,
@@ -402,6 +428,16 @@ namespace llvm {
/// VSPLTB/VSPLTH/VSPLTW.
bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+ /// isXXINSERTWMask - Return true if this VECTOR_SHUFFLE can be handled by
+ /// the XXINSERTW instruction introduced in ISA 3.0. This is essentially any
+ /// shuffle of v4f32/v4i32 vectors that just inserts one element from one
+ /// vector into the other. This function will also set a couple of
+ /// output parameters for how much the source vector needs to be shifted and
+ /// what byte number needs to be specified for the instruction to put the
+ /// element in the desired location of the target vector.
+ bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ unsigned &InsertAtByte, bool &Swap, bool IsLE);
+
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
@@ -428,6 +464,20 @@ namespace llvm {
/// DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ /// getPreferredVectorAction - The code we generate when vector types are
+ /// legalized by promoting the integer element type is often much worse
+ /// than code we generate if we widen the type for applicable vector types.
+    /// The issue with promoting is that the vector is scalarized, individual
+ /// elements promoted and then the vector is rebuilt. So say we load a pair
+ /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
+ /// loads, moves back into VSR's (or memory ops if we don't have moves) and
+ /// then the VPERM for the shuffle. All in all a very slow sequence.
+ TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+ const override {
+ if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
+ return TypeWidenVector;
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+ }
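To make the comment's v4i8 example concrete, a hypothetical case (assuming Clang vector extensions) that benefits from widening rather than element promotion:

    // With TypeWidenVector the inputs stay in vector registers and the
    // shuffle lowers to a single permute; with element promotion it would
    // scalarize into extending loads and a vector rebuild.
    typedef unsigned char v4i8 __attribute__((vector_size(4)));
    v4i8 interleave_lo(v4i8 a, v4i8 b) {
      return __builtin_shufflevector(a, b, 0, 4, 1, 5);
    }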
bool useSoftFloat() const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
@@ -442,6 +492,18 @@ namespace llvm {
return true;
}
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return
+ MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ }
+
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
/// getSetCCResultType - Return the ISD::SETCC ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -508,26 +570,31 @@ namespace llvm {
unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
+
Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;
Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
- MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+ MachineBasicBlock *EmitAtomicBinary(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned AtomicSize,
unsigned BinOpcode) const;
- MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr &MI,
MachineBasicBlock *MBB,
- bool is8bit, unsigned Opcode) const;
+ bool is8bit,
+ unsigned Opcode) const;
- MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -672,6 +739,10 @@ namespace llvm {
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -693,11 +764,11 @@ namespace llvm {
SelectionDAG &DAG) const;
void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
- SelectionDAG &DAG, SDLoc dl) const;
+ SelectionDAG &DAG, const SDLoc &dl) const;
SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG,
- SDLoc dl) const;
+ const SDLoc &dl) const;
SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
- SDLoc dl) const;
+ const SDLoc &dl) const;
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
@@ -709,13 +780,20 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
- SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
- int SPDiff,
- SDValue Chain,
- SDValue &LROpOut,
+ bool
+ IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee,
+ CallingConv::ID CalleeCC,
+ ImmutableCallSite *CS,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+
+ SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
+ SDValue Chain, SDValue &LROpOut,
SDValue &FPOpOut,
- bool isDarwinABI,
- SDLoc dl) const;
+ const SDLoc &dl) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
@@ -727,23 +805,18 @@ namespace llvm {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
- SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
- SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
- SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
- SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
- SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
@@ -763,26 +836,23 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
- bool isVarArg, bool IsPatchPoint, bool hasNest,
- SelectionDAG &DAG,
- SmallVector<std::pair<unsigned, SDValue>, 8>
- &RegsToPass,
+ SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
+ bool isTailCall, bool isVarArg, bool isPatchPoint,
+ bool hasNest, SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
- SDValue &Callee,
- int SPDiff, unsigned NumBytes,
+ SDValue &Callee, int SPDiff, unsigned NumBytes,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite *CS) const;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -794,75 +864,66 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
- SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
-
- SDValue
- extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
- SDValue ArgVal, SDLoc dl) const;
-
- SDValue
- LowerFormalArguments_Darwin(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
- SDValue
- LowerFormalArguments_64SVR4(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
- SDValue
- LowerFormalArguments_32SVR4(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
-
- SDValue
- createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
- SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
- SelectionDAG &DAG, SDLoc dl) const;
-
- SDValue
- LowerCall_Darwin(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv,
- bool isVarArg, bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
- SDValue
- LowerCall_64SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv,
- bool isVarArg, bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
- SDValue
- LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool isTailCall, bool IsPatchPoint,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+ SelectionDAG &DAG, SDValue ArgVal,
+ const SDLoc &dl) const;
+
+ SDValue LowerFormalArguments_Darwin(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerFormalArguments_64SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerFormalArguments_32SVR4(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+ SDValue CallSeqStart,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ const SDLoc &dl) const;
+
+ SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+ SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
+ SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite *CS) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 79e4fe379c2d..e7eb8a16180a 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -244,12 +244,22 @@ def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
def LDARXL : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
"ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT;
+
+let hasExtraDefRegAllocReq = 1 in
+def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
+ "ldat $rD, $rA, $FC", IIC_LdStLoad>, isPPC64,
+ Requires<[IsISA3_0]>;
}
let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
"stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+let mayStore = 1, hasSideEffects = 0 in
+def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
+ "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
+ Requires<[IsISA3_0]>;
+
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 :Pseudo< (outs),
@@ -476,8 +486,10 @@ defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"add", "$rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, i64:$rB))]>;
// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
-// initial-exec thread-local storage model.
-def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+// initial-exec thread-local storage model. We need to forbid r0 here -
+// while it works for add just fine, the linker can relax this to local-exec
+// addi, which won't work for r0.
+def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc_nox0:$rA, tlsreg:$rB),
"add $rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
@@ -502,11 +514,11 @@ let Defs = [CARRY] in {
def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
"subfic $rD, $rA, $imm", IIC_IntGeneral,
[(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
-defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+}
+defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"subfc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
PPC970_DGroup_Cracked;
-}
defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"subf", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
@@ -564,6 +576,14 @@ let isCompare = 1, hasSideEffects = 0 in {
def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
"cmpldi $dst, $src1, $src2",
IIC_IntCompare>, isPPC64;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+ def CMPRB8 : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+ (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+ "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+ Requires<[IsISA3_0]>;
+ def CMPEQB : X_BF3_RS5_RS5<31, 224, (outs crbitrc:$BF),
+ (ins g8rc:$rA, g8rc:$rB), "cmpeqb $BF, $rA, $rB",
+ IIC_IntCompare, []>, Requires<[IsISA3_0]>;
}
let hasSideEffects = 0 in {
@@ -580,6 +600,9 @@ defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
defm CNTLZW8 : XForm_11r<31, 26, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzw", "$rA, $rS", IIC_IntGeneral, []>;
+defm CNTTZW8 : XForm_11r<31, 538, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cnttzw", "$rA, $rS", IIC_IntGeneral, []>,
+ Requires<[IsISA3_0]>;
defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
"extsb", "$rA, $rS", IIC_IntSimple,
@@ -613,9 +636,12 @@ defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
-defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
+defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
+defm CNTTZD : XForm_11r<31, 570, (outs g8rc:$rA), (ins g8rc:$rS),
+ "cnttzd", "$rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (cttz i64:$rS))]>, Requires<[IsISA3_0]>;
def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
"popcntd $rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctpop i64:$rS))]>;
@@ -905,6 +931,10 @@ def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
"ldux $rD, $addr", IIC_LdStLDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
+
+def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
+ "ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
+ Requires<[IsISA3_0]>;
}
}
@@ -1246,3 +1276,24 @@ def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>;
def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
+ InstrItinClass itin, list<dag> pattern>
+ : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$rA, ty:$rB, u1imm:$L),
+ !strconcat(opc, " $rA, $rB, $L"), itin, pattern>;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def CP_COPY8 : X_L1_RA5_RB5<31, 774, "copy" , g8rc, IIC_LdStCOPY, []>;
+def CP_PASTE8 : X_L1_RA5_RB5<31, 902, "paste" , g8rc, IIC_LdStPASTE, []>;
+def CP_PASTE8o : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isDOT;
+}
+
+// SLB Invalidate Entry Global
+def SLBIEG : XForm_26<31, 466, (outs), (ins gprc:$RS, gprc:$RB),
+ "slbieg $RS, $RB", IIC_SprSLBIEG, []>;
+// SLB Synchronize
+def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
+
+} // IsISA3_0
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 53674681b213..e1c4673c2d7f 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1213,3 +1213,187 @@ def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast",
int_ppc_altivec_crypto_vncipherlast, v2i64>;
def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;
} // HasP8Crypto
+
+// The following altivec instructions were introduced in Power ISA 3.0
+def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;
+let Predicates = [HasP9Altivec] in {
+
+// Vector Compare Not Equal (Zero)
+class P9VCMP<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare, []>;
+class P9VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare, []> {
+ let Defs = [CR6];
+ let RC = 1;
+}
+
+// i8 element comparisons.
+def VCMPNEB : P9VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>;
+def VCMPNEBo : P9VCMPo< 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
+def VCMPNEZB : P9VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
+def VCMPNEZBo : P9VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPNEH : P9VCMP < 71, "vcmpneh $vD, $vA, $vB" , v8i16>;
+def VCMPNEHo : P9VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
+def VCMPNEZH : P9VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
+def VCMPNEZHo : P9VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPNEW : P9VCMP <135, "vcmpnew $vD, $vA, $vB" , v4i32>;
+def VCMPNEWo : P9VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
+def VCMPNEZW : P9VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
+def VCMPNEZWo : P9VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
+
+// VX-Form: [PO VRT / UIM VRB XO].
+// We use VXForm_1 to implement it; that is, we use "VRA" (5 bits) to
+// represent "/ UIM" (1 + 4 bits).
+class VX1_VT5_UIM5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins u4imm:$UIMM, vrrc:$vB),
+ !strconcat(opc, " $vD, $vB, $UIMM"), IIC_VecGeneral, pattern>;
+
+class VX1_RT5_RA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs g8rc:$rD), (ins g8rc:$rA, vrrc:$vB),
+ !strconcat(opc, " $rD, $rA, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Extract Unsigned
+def VEXTRACTUB : VX1_VT5_UIM5_VB5<525, "vextractub", []>;
+def VEXTRACTUH : VX1_VT5_UIM5_VB5<589, "vextractuh", []>;
+def VEXTRACTUW : VX1_VT5_UIM5_VB5<653, "vextractuw", []>;
+def VEXTRACTD : VX1_VT5_UIM5_VB5<717, "vextractd" , []>;
+
+// Vector Extract Unsigned Byte/Halfword/Word Left/Right-Indexed
+def VEXTUBLX : VX1_RT5_RA5_VB5<1549, "vextublx", []>;
+def VEXTUBRX : VX1_RT5_RA5_VB5<1805, "vextubrx", []>;
+def VEXTUHLX : VX1_RT5_RA5_VB5<1613, "vextuhlx", []>;
+def VEXTUHRX : VX1_RT5_RA5_VB5<1869, "vextuhrx", []>;
+def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
+def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
+
+// Vector Insert Element Instructions
+def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
+def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
+
+class VX_VT5_EO5_VB5<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+ : VXForm_RD5_XO5_RS5<xo, eo, (outs vrrc:$vD), (ins vrrc:$vB),
+ !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]
+def VCLZLSBB : VXForm_RD5_XO5_RS5<1538, 0, (outs g8rc:$rD), (ins vrrc:$vB),
+ "vclzlsbb $rD, $vB", IIC_VecGeneral, []>;
+def VCTZLSBB : VXForm_RD5_XO5_RS5<1538, 1, (outs g8rc:$rD), (ins vrrc:$vB),
+ "vctzlsbb $rD, $vB", IIC_VecGeneral, []>;
+// Vector Count Trailing Zeros
+def VCTZB : VX_VT5_EO5_VB5<1538, 28, "vctzb", []>;
+def VCTZH : VX_VT5_EO5_VB5<1538, 29, "vctzh", []>;
+def VCTZW : VX_VT5_EO5_VB5<1538, 30, "vctzw", []>;
+def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd", []>;
+
+// Vector Extend Sign
+def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>;
+def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
+def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
+def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
+def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
+
+// Vector Integer Negate
+def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw", []>;
+def VNEGD : VX_VT5_EO5_VB5<1538, 7, "vnegd", []>;
+
+// Vector Parity Byte
+def VPRTYBW : VX_VT5_EO5_VB5<1538, 8, "vprtybw", []>;
+def VPRTYBD : VX_VT5_EO5_VB5<1538, 9, "vprtybd", []>;
+def VPRTYBQ : VX_VT5_EO5_VB5<1538, 10, "vprtybq", []>;
+
+// Vector (Bit) Permute (Right-indexed)
+def VBPERMD : VXForm_1<1484, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vbpermd $vD, $vA, $vB", IIC_VecFP, []>;
+def VPERMR : VAForm_1a<59, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ "vpermr $vD, $vA, $vB, $vC", IIC_VecFP, []>;
+
+class VX1_VT5_VA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern>;
+
+// Vector Rotate Left Mask/Mask-Insert
+def VRLWNM : VX1_VT5_VA5_VB5<389, "vrlwnm", []>;
+def VRLWMI : VX1_VT5_VA5_VB5<133, "vrlwmi", []>;
+def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm", []>;
+def VRLDMI : VX1_VT5_VA5_VB5<197, "vrldmi", []>;
+
+// Vector Shift Left/Right
+def VSLV : VX1_VT5_VA5_VB5<1860, "vslv", []>;
+def VSRV : VX1_VT5_VA5_VB5<1796, "vsrv", []>;
+
+// Vector Multiply-by-10 (& Write Carry) Unsigned Quadword
+def VMUL10UQ : VXForm_BX<513, (outs vrrc:$vD), (ins vrrc:$vA),
+ "vmul10uq $vD, $vA", IIC_VecFP, []>;
+def VMUL10CUQ : VXForm_BX< 1, (outs vrrc:$vD), (ins vrrc:$vA),
+ "vmul10cuq $vD, $vA", IIC_VecFP, []>;
+
+// Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword
+def VMUL10EUQ : VX1_VT5_VA5_VB5<577, "vmul10euq" , []>;
+def VMUL10ECUQ : VX1_VT5_VA5_VB5< 65, "vmul10ecuq", []>;
+
+// Decimal Integer Format Conversion Instructions
+
+// [PO VRT EO VRB 1 PS XO], "_o" means CR6 is set.
+class VX_VT5_EO5_VB5_PS1_XO9_o<bits<5> eo, bits<9> xo, string opc,
+ list<dag> pattern>
+ : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB, u1imm:$PS),
+ !strconcat(opc, " $vD, $vB, $PS"), IIC_VecFP, pattern> {
+ let Defs = [CR6];
+}
+
+// [PO VRT EO VRB 1 / XO]
+class VX_VT5_EO5_VB5_XO9_o<bits<5> eo, bits<9> xo, string opc,
+ list<dag> pattern>
+ : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB),
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP, pattern> {
+ let Defs = [CR6];
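+ // The PS field is reserved ('/' in the layout above) for these forms, so
+ // encode it as zero.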
+ let PS = 0;
+}
+
+// Decimal Convert From/to National/Zoned/Signed-QWord
+def BCDCFNo : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
+def BCDCFZo : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
+def BCDCTNo : VX_VT5_EO5_VB5_XO9_o <5, 385, "bcdctn." , []>;
+def BCDCTZo : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
+def BCDCFSQo : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
+def BCDCTSQo : VX_VT5_EO5_VB5_XO9_o <0, 385, "bcdctsq.", []>;
+
+// Decimal Copy-Sign/Set-Sign
+let Defs = [CR6] in
+def BCDCPSGNo : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
+
+def BCDSETSGNo : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
+
+// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
+class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+ : VX_RD5_RSp5_PS1_XO9<xo,
+ (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
+ !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
+ let Defs = [CR6];
+}
+
+// [PO VRT VRA VRB 1 / XO]
+class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+ : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
+ let Defs = [CR6];
+ let PS = 0;
+}
+
+// Decimal Shift/Unsigned-Shift/Shift-and-Round
+def BCDSo : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
+def BCDUSo : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>;
+def BCDSRo : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
+
+// Decimal (Unsigned) Truncate
+def BCDTRUNCo : VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
+def BCDUTRUNCo : VX_VT5_VA5_VB5_XO9_o <321, "bcdutrunc.", []>;
+} // end HasP9Altivec
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 4e03ed27653f..5acff7559544 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -360,6 +360,21 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
let Inst{30-31} = xo;
}
+// DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO]
+class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
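+ // DS_RA packs the 5-bit base register above the 12-bit displacement field.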
+ bits<17> DS_RA;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = DS_RA{16-12}; // Register #
+ let Inst{16-27} = DS_RA{11-0}; // Displacement.
+ let Inst{28} = XT{5};
+ let Inst{29-31} = xo;
+}
// 1.7.6 X-Form
class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -747,6 +762,107 @@ class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = RC;
}
+// [PO RT RA RB XO /]
+class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<1> L;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9} = 0;
+ let Inst{10} = L;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+// Same as XForm_17 but with GPRs and the new naming convention
+class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<5> RA;
+ bits<5> RB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+// e.g. [PO VRT XO VRB XO /] or [PO VRT XO VRB XO RO]
+class X_RD5_XO5_RS5<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = xo2;
+}
+
+class X_BF3_DCMX7_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<7> DCMX;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-15} = DCMX;
+ let Inst{16-20} = VB;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<8> IMM8;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-12} = 0;
+ let Inst{13-20} = IMM8;
+ let Inst{21-30} = xo;
+ let Inst{31} = XT{5};
+}
+
+// XForm_base_r3xo for instructions such as P9 atomics where we don't want
+// to specify an SDAG pattern for matching.
+class X_RD5_RS5_IM5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, []> {
+}
+
+class X_BF3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : XForm_17<opcode, xo, OOL, IOL, asmstr, itin> {
+ let FRA = 0;
+ let FRB = 0;
+}
+
+// [PO /// L RA RB XO /]
+class X_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+ let BF = 0;
+ let Pattern = pattern;
+
+ bit RC = 0;
+ let Inst{31} = RC;
+}
+
// XX*-Form (VSX)
class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -820,6 +936,95 @@ class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
+class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XB;
+ bits<5> UIM5;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = UIM5;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+// [PO T XO B XO BX /]
+class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = xo2;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+// [PO T XO B XO BX TX]
+class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = xo2;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<7> DCMX;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-15} = DCMX;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<7> DCMX;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = DCMX{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-24} = xo1;
+ let Inst{25} = DCMX{5};
+ let Inst{26-28} = xo2;
+ let Inst{29} = DCMX{6};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
@@ -1571,6 +1776,21 @@ class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Inst{21-31} = xo;
}
+// e.g. [PO VRT EO VRB XO]
+class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RD;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RD;
+ let Inst{11-15} = eo;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX"
class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -1622,6 +1842,44 @@ class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{22-31} = xo;
}
+// VX-Form: [PO VRT EO VRB 1 PS XO]
+class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VB;
+ bit PS;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = eo;
+ let Inst{16-20} = VB;
+ let Inst{21} = 1;
+ let Inst{22} = PS;
+ let Inst{23-31} = xo;
+}
+
+// VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO]
+class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VD;
+ bits<5> VA;
+ bits<5> VB;
+ bit PS;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VD;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21} = 1;
+ let Inst{22} = PS;
+ let Inst{23-31} = xo;
+}
+
// Z23-Form (used by QPX)
class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index dcff6ad2486f..b6ae70ec1a2d 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -93,6 +93,7 @@ PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
unsigned Directive =
DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ // FIXME: Leaving this as-is until we have POWER9 scheduling info
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
@@ -108,7 +109,7 @@ PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
}
unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost) const {
if (!ItinData || UseOldLatencyCalc)
return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
@@ -121,9 +122,9 @@ unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// is an output).
unsigned Latency = 1;
- unsigned DefClass = MI->getDesc().getSchedClass();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ unsigned DefClass = MI.getDesc().getSchedClass();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
continue;
@@ -138,22 +139,22 @@ unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const {
int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
UseMI, UseIdx);
- if (!DefMI->getParent())
+ if (!DefMI.getParent())
return Latency;
- const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+ const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
unsigned Reg = DefMO.getReg();
bool IsRegCR;
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
const MachineRegisterInfo *MRI =
- &DefMI->getParent()->getParent()->getRegInfo();
+ &DefMI.getParent()->getParent()->getRegInfo();
IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
} else {
@@ -161,7 +162,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
PPC::CRBITRCRegClass.contains(Reg);
}
- if (UseMI->isBranch() && IsRegCR) {
+ if (UseMI.isBranch() && IsRegCR) {
if (Latency < 0)
Latency = getInstrLatency(ItinData, DefMI);
@@ -181,6 +182,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
+ // FIXME: Is this needed for POWER9?
Latency += 2;
break;
}
@@ -258,10 +260,10 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
}
-unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
// Note: This list must be kept consistent with LoadRegFromStackSlot.
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: break;
case PPC::LD:
case PPC::LWZ:
@@ -277,20 +279,20 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case PPC::RESTORE_VRSAVE:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
- if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
- MI->getOperand(2).isFI()) {
- FrameIndex = MI->getOperand(2).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+ MI.getOperand(2).isFI()) {
+ FrameIndex = MI.getOperand(2).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
-unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
// Note: This list must be kept consistent with StoreRegToStackSlot.
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: break;
case PPC::STD:
case PPC::STW:
@@ -306,25 +308,23 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case PPC::SPILL_VRSAVE:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
- if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
- MI->getOperand(2).isFI()) {
- FrameIndex = MI->getOperand(2).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+ MI.getOperand(2).isFI()) {
+ FrameIndex = MI.getOperand(2).getIndex();
+ return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
-MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
- MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFunction &MF = *MI.getParent()->getParent();
// Normal instructions can be commuted the obvious way.
- if (MI->getOpcode() != PPC::RLWIMI &&
- MI->getOpcode() != PPC::RLWIMIo)
+ if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo)
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
@@ -332,7 +332,7 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI,
// to the high-bits of the mask (and, thus, the result).
// Cannot commute if it has a non-zero rotate count.
- if (MI->getOperand(3).getImm() != 0)
+ if (MI.getOperand(3).getImm() != 0)
return nullptr;
// If we have a zero rotate count, we have:
@@ -345,28 +345,28 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI,
// Swap op1/op2
assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
"Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
- unsigned Reg0 = MI->getOperand(0).getReg();
- unsigned Reg1 = MI->getOperand(1).getReg();
- unsigned Reg2 = MI->getOperand(2).getReg();
- unsigned SubReg1 = MI->getOperand(1).getSubReg();
- unsigned SubReg2 = MI->getOperand(2).getSubReg();
- bool Reg1IsKill = MI->getOperand(1).isKill();
- bool Reg2IsKill = MI->getOperand(2).isKill();
+ unsigned Reg0 = MI.getOperand(0).getReg();
+ unsigned Reg1 = MI.getOperand(1).getReg();
+ unsigned Reg2 = MI.getOperand(2).getReg();
+ unsigned SubReg1 = MI.getOperand(1).getSubReg();
+ unsigned SubReg2 = MI.getOperand(2).getSubReg();
+ bool Reg1IsKill = MI.getOperand(1).isKill();
+ bool Reg2IsKill = MI.getOperand(2).isKill();
bool ChangeReg0 = false;
// If machine instrs are no longer in two-address forms, update
// destination register as well.
if (Reg0 == Reg1) {
// Must be two address instruction!
- assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+ assert(MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
"Expecting a two-address instruction!");
- assert(MI->getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
+ assert(MI.getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
Reg2IsKill = false;
ChangeReg0 = true;
}
// Masks.
- unsigned MB = MI->getOperand(4).getImm();
- unsigned ME = MI->getOperand(5).getImm();
+ unsigned MB = MI.getOperand(4).getImm();
+ unsigned ME = MI.getOperand(5).getImm();
// We can't commute a trivial mask (there is no way to represent an all-zero
// mask).
@@ -375,40 +375,40 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI,
if (NewMI) {
// Create a new instruction.
- unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
- bool Reg0IsDead = MI->getOperand(0).isDead();
- return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
- .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
- .addReg(Reg2, getKillRegState(Reg2IsKill))
- .addReg(Reg1, getKillRegState(Reg1IsKill))
- .addImm((ME+1) & 31)
- .addImm((MB-1) & 31);
+ unsigned Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg();
+ bool Reg0IsDead = MI.getOperand(0).isDead();
+ return BuildMI(MF, MI.getDebugLoc(), MI.getDesc())
+ .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+ .addReg(Reg2, getKillRegState(Reg2IsKill))
+ .addReg(Reg1, getKillRegState(Reg1IsKill))
+ .addImm((ME + 1) & 31)
+ .addImm((MB - 1) & 31);
}
if (ChangeReg0) {
- MI->getOperand(0).setReg(Reg2);
- MI->getOperand(0).setSubReg(SubReg2);
+ MI.getOperand(0).setReg(Reg2);
+ MI.getOperand(0).setSubReg(SubReg2);
}
- MI->getOperand(2).setReg(Reg1);
- MI->getOperand(1).setReg(Reg2);
- MI->getOperand(2).setSubReg(SubReg1);
- MI->getOperand(1).setSubReg(SubReg2);
- MI->getOperand(2).setIsKill(Reg1IsKill);
- MI->getOperand(1).setIsKill(Reg2IsKill);
+ MI.getOperand(2).setReg(Reg1);
+ MI.getOperand(1).setReg(Reg2);
+ MI.getOperand(2).setSubReg(SubReg1);
+ MI.getOperand(1).setSubReg(SubReg2);
+ MI.getOperand(2).setIsKill(Reg1IsKill);
+ MI.getOperand(1).setIsKill(Reg2IsKill);
// Swap the mask around.
- MI->getOperand(4).setImm((ME+1) & 31);
- MI->getOperand(5).setImm((MB-1) & 31);
- return MI;
+ MI.getOperand(4).setImm((ME + 1) & 31);
+ MI.getOperand(5).setImm((MB - 1) & 31);
+ return &MI;
}
-bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
// For VSX A-Type FMA instructions, it is the first two operands that can be
// commuted, however, because the non-encoded tied input operand is listed
// first, the operands to swap are actually the second and third.
- int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+ int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
if (AltOpc == -1)
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
@@ -428,6 +428,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */
+ // FIXME: Update when POWER9 scheduling model is ready.
+ case PPC::DIR_PWR9: Opcode = PPC::NOP_GT_PWR7; break;
}
DebugLoc DL;
@@ -442,7 +444,8 @@ void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
// Branch analysis.
// Note: If the condition register is set to CTR or CTR8 then this is a
// BDNZ (imm == 1) or BDZ (imm == 0) branch.
-bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
@@ -453,14 +456,14 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
if (I == MBB.end())
return false;
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
MachineInstr *LastInst = I;
// If there is only one terminator instruction, process it.
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (LastInst->getOpcode() == PPC::B) {
if (!LastInst->getOperand(0).isMBB())
return true;
@@ -522,8 +525,7 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
MachineInstr *SecondLastInst = I;
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() &&
- isUnpredicatedTerminator(--I))
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with PPC::B and PPC:BCC, handle it.
@@ -633,11 +635,11 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-unsigned
-PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 2 || Cond.size() == 0) &&
@@ -721,9 +723,10 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
}
void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc dl,
- unsigned DestReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &dl, unsigned DestReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const {
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
@@ -746,8 +749,8 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm());
- unsigned SubIdx;
- bool SwapOps;
+ unsigned SubIdx = 0;
+ bool SwapOps = false;
switch (SelectPred) {
case PPC::PRED_EQ:
case PPC::PRED_EQ_MINUS:
@@ -835,9 +838,9 @@ static unsigned getCRBitValue(unsigned CRBit) {
}
void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// We can end up with self copies and similar things as a result of VSX copy
// legalization. Promote them here.
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -883,8 +886,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (PPC::CRBITRCRegClass.contains(SrcReg) &&
PPC::GPRCRegClass.contains(DestReg)) {
unsigned CRReg = getCRFromCRBit(SrcReg);
- BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
- .addReg(CRReg), getKillRegState(KillSrc);
+ BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(CRReg);
+ getKillRegState(KillSrc);
// Rotate the CR bit in the CR fields to be the least significant bit and
// then mask with 0x1 (MB = ME = 31).
BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
@@ -895,13 +898,13 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
} else if (PPC::CRRCRegClass.contains(SrcReg) &&
PPC::G8RCRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg)
- .addReg(SrcReg), getKillRegState(KillSrc);
+ BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
return;
} else if (PPC::CRRCRegClass.contains(SrcReg) &&
PPC::GPRCRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
- .addReg(SrcReg), getKillRegState(KillSrc);
+ BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg);
+ getKillRegState(KillSrc);
return;
}
@@ -1085,12 +1088,11 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
NewMIs.back()->addMemOperand(MF, MMO);
}
-bool
-PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
- bool &NonRI, bool &SpillsVRS) const{
+bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr *> &NewMIs,
+ bool &NonRI, bool &SpillsVRS) const {
// Note: If additional load instructions are added here,
// update isLoadFromStackSlot.
@@ -1208,35 +1210,35 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const {
+bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ unsigned Reg, MachineRegisterInfo *MRI) const {
// For some instructions, it is legal to fold ZERO into the RA register field.
// A zero immediate should always be loaded with a single li.
- unsigned DefOpc = DefMI->getOpcode();
+ unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
return false;
- if (!DefMI->getOperand(1).isImm())
+ if (!DefMI.getOperand(1).isImm())
return false;
- if (DefMI->getOperand(1).getImm() != 0)
+ if (DefMI.getOperand(1).getImm() != 0)
return false;
// Note that we cannot invert the arguments of an isel here in order to fold
// a ZERO into what is presented as the second argument. All we have here
// is the condition bit, and that might come from a CR-logical bit operation.
- const MCInstrDesc &UseMCID = UseMI->getDesc();
+ const MCInstrDesc &UseMCID = UseMI.getDesc();
// Only fold into real machine instructions.
if (UseMCID.isPseudo())
return false;
unsigned UseIdx;
- for (UseIdx = 0; UseIdx < UseMI->getNumOperands(); ++UseIdx)
- if (UseMI->getOperand(UseIdx).isReg() &&
- UseMI->getOperand(UseIdx).getReg() == Reg)
+ for (UseIdx = 0; UseIdx < UseMI.getNumOperands(); ++UseIdx)
+ if (UseMI.getOperand(UseIdx).isReg() &&
+ UseMI.getOperand(UseIdx).getReg() == Reg)
break;
- assert(UseIdx < UseMI->getNumOperands() && "Cannot find Reg in UseMI");
+ assert(UseIdx < UseMI.getNumOperands() && "Cannot find Reg in UseMI");
assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx];
@@ -1268,10 +1270,10 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
}
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
- UseMI->getOperand(UseIdx).setReg(ZeroReg);
+ UseMI.getOperand(UseIdx).setReg(ZeroReg);
if (DeleteDef)
- DefMI->eraseFromParent();
+ DefMI.eraseFromParent();
return true;
}
@@ -1299,7 +1301,7 @@ bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
}
-bool PPCInstrInfo::isPredicated(const MachineInstr *MI) const {
+bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
// The predicated branches are identified by their type, not really by the
// explicit presence of a predicate. Furthermore, some of them can be
// predicated more than once. Because if conversion won't try to predicate
@@ -1310,73 +1312,71 @@ bool PPCInstrInfo::isPredicated(const MachineInstr *MI) const {
return false;
}
-bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
- if (!MI->isTerminator())
+bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator())
return false;
// Conditional branch is a special case.
- if (MI->isBranch() && !MI->isBarrier())
+ if (MI.isBranch() && !MI.isBarrier())
return true;
return !isPredicated(MI);
}
-bool PPCInstrInfo::PredicateInstruction(MachineInstr *MI,
+bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
- unsigned OpC = MI->getOpcode();
+ unsigned OpC = MI.getOpcode();
if (OpC == PPC::BLR || OpC == PPC::BLR8) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
bool isPPC64 = Subtarget.isPPC64();
- MI->setDesc(get(Pred[0].getImm() ?
- (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) :
- (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
+ MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR)
+ : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
- MI->setDesc(get(PPC::BCLR));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(PPC::BCLR));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
- MI->setDesc(get(PPC::BCLRn));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(PPC::BCLRn));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
} else {
- MI->setDesc(get(PPC::BCCLR));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addImm(Pred[0].getImm())
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(PPC::BCCLR));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Pred[0].getImm())
+ .addReg(Pred[1].getReg());
}
return true;
} else if (OpC == PPC::B) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
bool isPPC64 = Subtarget.isPPC64();
- MI->setDesc(get(Pred[0].getImm() ?
- (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
- (isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
+ MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+ : (isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
- MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
- MI->RemoveOperand(0);
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ MI.RemoveOperand(0);
- MI->setDesc(get(PPC::BC));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg())
- .addMBB(MBB);
+ MI.setDesc(get(PPC::BC));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg())
+ .addMBB(MBB);
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
- MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
- MI->RemoveOperand(0);
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ MI.RemoveOperand(0);
- MI->setDesc(get(PPC::BCn));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg())
- .addMBB(MBB);
+ MI.setDesc(get(PPC::BCn));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg())
+ .addMBB(MBB);
} else {
- MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
- MI->RemoveOperand(0);
-
- MI->setDesc(get(PPC::BCC));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addImm(Pred[0].getImm())
- .addReg(Pred[1].getReg())
- .addMBB(MBB);
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ MI.RemoveOperand(0);
+
+ MI.setDesc(get(PPC::BCC));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Pred[0].getImm())
+ .addReg(Pred[1].getReg())
+ .addMBB(MBB);
}
return true;
@@ -1389,24 +1389,24 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr *MI,
bool isPPC64 = Subtarget.isPPC64();
if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
- MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
- (setLR ? PPC::BCCTRL : PPC::BCCTR)));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
+ : (setLR ? PPC::BCCTRL : PPC::BCCTR)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
return true;
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
- MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) :
- (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
+ : (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
return true;
}
- MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) :
- (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addImm(Pred[0].getImm())
- .addReg(Pred[1].getReg());
+ MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
+ : (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Pred[0].getImm())
+ .addReg(Pred[1].getReg());
return true;
}
@@ -1444,7 +1444,7 @@ bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
return false;
}
-bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
+bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const {
// Note: At the present time, the contents of Pred from this function is
// unused by IfConversion. This implementation follows ARM by pushing the
@@ -1457,8 +1457,8 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
&PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
bool Found = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
const TargetRegisterClass *RC = RCs[c];
if (MO.isReg()) {
@@ -1480,8 +1480,8 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
return Found;
}
-bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
- unsigned OpC = MI->getOpcode();
+bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+ unsigned OpC = MI.getOpcode();
switch (OpC) {
default:
return false;
@@ -1496,10 +1496,10 @@ bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
}
}
-bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const {
- unsigned Opc = MI->getOpcode();
+bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask,
+ int &Value) const {
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
default: return false;
@@ -1507,9 +1507,9 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
case PPC::CMPLWI:
case PPC::CMPDI:
case PPC::CMPLDI:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
- Value = MI->getOperand(2).getImm();
+ Value = MI.getOperand(2).getImm();
Mask = 0xFFFF;
return true;
case PPC::CMPW:
@@ -1518,21 +1518,20 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
case PPC::CMPLD:
case PPC::FCMPUS:
case PPC::FCMPUD:
- SrcReg = MI->getOperand(1).getReg();
- SrcReg2 = MI->getOperand(2).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
return true;
}
}
-bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
- unsigned SrcReg, unsigned SrcReg2,
- int Mask, int Value,
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const {
if (DisableCmpOpt)
return false;
- int OpC = CmpInstr->getOpcode();
- unsigned CRReg = CmpInstr->getOperand(0).getReg();
+ int OpC = CmpInstr.getOpcode();
+ unsigned CRReg = CmpInstr.getOperand(0).getReg();
// FP record forms set CR1 based on the exception status bits, not a
// comparison with zero.
@@ -1571,11 +1570,18 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
} else
return false;
} else if (is32BitUnsignedCompare) {
+ // 32-bit rotate and mask instructions are zero extending only if MB <= ME
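+ // (with MB <= ME the mask lies entirely within the low 32 bits, so the high
+ // 32 bits of the result are cleared; a wrapping mask also covers high bits).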
+ bool isZeroExtendingRotate =
+ (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo ||
+ MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo)
+ && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm();
+
// We can perform this optimization, equality only, if MI is
// zero-extending.
if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
MIOpC == PPC::SLW || MIOpC == PPC::SLWo ||
- MIOpC == PPC::SRW || MIOpC == PPC::SRWo) {
+ MIOpC == PPC::SRW || MIOpC == PPC::SRWo ||
+ isZeroExtendingRotate) {
noSub = true;
equalityOnly = true;
} else
@@ -1608,8 +1614,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
MachineBasicBlock::iterator I = CmpInstr;
// Scan forward to find the first use of the compare.
- for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end();
- I != EL; ++I) {
+ for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL;
+ ++I) {
bool FoundUse = false;
for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
JE = MRI->use_instr_end(); J != JE; ++J)
@@ -1633,7 +1639,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
// same BB as the comparison. This is to allow the check below to avoid calls
// (and other explicit clobbers); instead we should really check for these
// more explicitly (in at least a few predecessors).
- else if (MI->getParent() != CmpInstr->getParent() || Value != 0) {
+ else if (MI->getParent() != CmpInstr.getParent() || Value != 0) {
// PPC does not have a record-form SUBri.
return false;
}
@@ -1643,16 +1649,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
--I;
// Get ready to iterate backward from CmpInstr.
- MachineBasicBlock::iterator E = MI,
- B = CmpInstr->getParent()->begin();
+ MachineBasicBlock::iterator E = MI, B = CmpInstr.getParent()->begin();
for (; I != E && !noSub; --I) {
const MachineInstr &Instr = *I;
unsigned IOpC = Instr.getOpcode();
- if (&*I != CmpInstr && (
- Instr.modifiesRegister(PPC::CR0, TRI) ||
- Instr.readsRegister(PPC::CR0, TRI)))
+ if (&*I != &CmpInstr && (Instr.modifiesRegister(PPC::CR0, TRI) ||
+ Instr.readsRegister(PPC::CR0, TRI)))
// This instruction modifies or uses the record condition register after
// the one we want to change. While we could do this transformation, it
// would likely not be profitable. This transformation removes one
@@ -1752,13 +1756,17 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
// Create a new virtual register to hold the value of the CR set by the
// record-form instruction. If the instruction was not previously in
// record form, then set the kill flag on the CR.
- CmpInstr->eraseFromParent();
+ CmpInstr.eraseFromParent();
MachineBasicBlock::iterator MII = MI;
BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
get(TargetOpcode::COPY), CRReg)
.addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
+ // Even if the CR0 register was dead before, it is alive now since the
+ // instruction we just built uses it.
+ MI->clearRegisterDeads(PPC::CR0);
+
if (MIOpC != NewOpC) {
// We need to be careful here: we're replacing one instruction with
// another, and we need to make sure that we get all of the right
@@ -1783,6 +1791,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
MI->addOperand(*MI->getParent()->getParent(),
MachineOperand::CreateReg(*ImpUses, false, true));
}
+ assert(MI->definesRegister(PPC::CR0) &&
+ "Record-form instruction does not define cr0?");
// Modify the condition code of operands in OperandsToUpdate.
// Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
@@ -1799,17 +1809,17 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
///
-unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- unsigned Opcode = MI->getOpcode();
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
if (Opcode == PPC::INLINEASM) {
- const MachineFunction *MF = MI->getParent()->getParent();
- const char *AsmStr = MI->getOperand(0).getSymbolName();
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
} else if (Opcode == TargetOpcode::STACKMAP) {
- return MI->getOperand(1).getImm();
+ return MI.getOperand(1).getImm();
} else if (Opcode == TargetOpcode::PATCHPOINT) {
- PatchPointOpers Opers(MI);
+ PatchPointOpers Opers(&MI);
return Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
} else {
const MCInstrDesc &Desc = get(Opcode);
@@ -1842,10 +1852,26 @@ ArrayRef<std::pair<unsigned, const char *>>
PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
using namespace PPCII;
static const std::pair<unsigned, const char *> TargetFlags[] = {
- {MO_PLT_OR_STUB, "ppc-plt-or-stub"},
+ {MO_PLT, "ppc-plt"},
{MO_PIC_FLAG, "ppc-pic"},
{MO_NLP_FLAG, "ppc-nlp"},
{MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
return makeArrayRef(TargetFlags);
}
+bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::LOAD_STACK_GUARD: {
+ assert(Subtarget.isTargetLinux() &&
+ "Only Linux target is expected to contain LOAD_STACK_GUARD");
+ const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008;
+ const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2;
+ MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Offset)
+ .addReg(Reg);
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index c3c3a480a6aa..98baf125bdff 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -73,10 +73,10 @@ class PPCInstrInfo : public PPCGenInstrInfo {
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs,
bool &NonRI, bool &SpillsVRS) const;
- bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+ bool LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs,
+ SmallVectorImpl<MachineInstr *> &NewMIs,
bool &NonRI, bool &SpillsVRS) const;
virtual void anchor();
@@ -91,8 +91,7 @@ protected:
///
/// For example, we can commute rlwimi instructions, but only if the
/// rotate amt is zero. We also have to munge the immediates a bit.
- MachineInstr *commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const override;
@@ -113,12 +112,12 @@ public:
const ScheduleDAG *DAG) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
- const MachineInstr *MI,
+ const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
@@ -128,7 +127,7 @@ public:
}
bool hasLowDefLatency(const TargetSchedModel &SchedModel,
- const MachineInstr *DefMI,
+ const MachineInstr &DefMI,
unsigned DefIdx) const override {
// Machine LICM should hoist all instructions in low-register-pressure
// situations; none are sufficiently free to justify leaving in a loop
@@ -152,12 +151,12 @@ public:
bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
void insertNoop(MachineBasicBlock &MBB,
@@ -165,25 +164,25 @@ public:
// Branch analysis.
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
// Select analysis.
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int &, int &, int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -201,8 +200,8 @@ public:
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const override;
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ MachineRegisterInfo *MRI) const override;
// If conversion by predication (only supported by some branch instructions).
// All of the profitability checks always return true; it is always
@@ -230,37 +229,34 @@ public:
}
// Predication support.
- bool isPredicated(const MachineInstr *MI) const override;
+ bool isPredicated(const MachineInstr &MI) const override;
- bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
- bool PredicateInstruction(MachineInstr *MI,
+ bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr *MI,
+ bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
// Comparison optimization.
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
- bool analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const override;
-
- bool optimizeCompareInstr(MachineInstr *CmpInstr,
- unsigned SrcReg, unsigned SrcReg2,
- int Mask, int Value,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const override;
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
///
- unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ unsigned GetInstSizeInBytes(const MachineInstr &MI) const;
void getNoopForMachoTarget(MCInst &NopInst) const override;
@@ -272,6 +268,9 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+
+ // Lower pseudo instructions after register allocation.
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
};
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index ce0f9e6f52a7..4a42a947c6cb 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -31,6 +31,18 @@ def SDT_PPCvperm : SDTypeProfile<1, 3, [
SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
]>;
+def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
+def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
def SDT_PPCvcmp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
]>;
@@ -140,7 +152,10 @@ def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
-def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
+def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
@@ -257,7 +272,7 @@ def HI16 : SDNodeXForm<imm, [{
def HA16 : SDNodeXForm<imm, [{
// Transformation function: shift the immediate value down into the low bits.
- signed int Val = N->getZExtValue();
+ int Val = N->getZExtValue();
return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
}]>;
def MB : SDNodeXForm<imm, [{
@@ -507,6 +522,24 @@ def u6imm : Operand<i32> {
let ParserMatchClass = PPCU6ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<6>";
}
+def PPCU7ImmAsmOperand : AsmOperandClass {
+ let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
+ let RenderMethod = "addImmOperands";
+}
+def u7imm : Operand<i32> {
+ let PrintMethod = "printU7ImmOperand";
+ let ParserMatchClass = PPCU7ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<7>";
+}
+def PPCU8ImmAsmOperand : AsmOperandClass {
+ let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
+ let RenderMethod = "addImmOperands";
+}
+def u8imm : Operand<i32> {
+ let PrintMethod = "printU8ImmOperand";
+ let ParserMatchClass = PPCU8ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<8>";
+}
def PPCU10ImmAsmOperand : AsmOperandClass {
let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
let RenderMethod = "addImmOperands";
@@ -635,6 +668,13 @@ def PPCDispRIXOperand : AsmOperandClass {
def dispRIX : Operand<iPTR> {
let ParserMatchClass = PPCDispRIXOperand;
}
+def PPCDispRIX16Operand : AsmOperandClass {
+ let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIX16 : Operand<iPTR> {
+ let ParserMatchClass = PPCDispRIX16Operand;
+}
def PPCDispSPE8Operand : AsmOperandClass {
let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
let RenderMethod = "addImmOperands";
@@ -673,6 +713,12 @@ def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
let EncoderMethod = "getMemRIXEncoding";
let DecoderMethod = "decodeMemRIXOperands";
}
+def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
+ let PrintMethod = "printMemRegImm";
+ let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
+ let EncoderMethod = "getMemRIX16Encoding";
+ let DecoderMethod = "decodeMemRIX16Operands";
+}
def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
@@ -746,6 +792,7 @@ def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
+def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1365,7 +1412,10 @@ let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
Requires<[In32BitMode]>;
}
-let isBranch = 1, isTerminator = 1 in {
+// This pseudo is never removed from the function, as it serves as
+// a terminator. Size is set to 0 to prevent the builtin assembler
+// from emitting it.
+let isBranch = 1, isTerminator = 1, Size = 0 in {
def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
"#EH_SjLj_Setup\t$dst", []>;
}
@@ -1543,6 +1593,13 @@ def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src),
def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
"lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
+
+// The atomic instructions use the destination register as well as the next one
+// or two registers in order (modulo 31).
+let hasExtraSrcRegAllocReq = 1 in
+def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
+ "lwat $rD, $rA, $FC", IIC_LdStLoad>,
+ Requires<[IsISA3_0]>;
}
let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in {
@@ -1558,6 +1615,11 @@ def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
}
+let mayStore = 1, hasSideEffects = 0 in
+def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
+ "stwat $rS, $rA, $FC", IIC_LdStStore>,
+ Requires<[IsISA3_0]>;
+
let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
@@ -1947,6 +2009,10 @@ let isCompare = 1, hasSideEffects = 0 in {
"cmpwi $crD, $rA, $imm", IIC_IntCompare>;
def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
"cmplwi $dst, $src1, $src2", IIC_IntCompare>;
+ def CMPRB : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+ (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+ "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+ Requires<[IsISA3_0]>;
}
}
@@ -2000,6 +2066,9 @@ defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS),
"cntlzw", "$rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctlz i32:$rS))]>;
+defm CNTTZW : XForm_11r<31, 538, (outs gprc:$rA), (ins gprc:$rS),
+ "cnttzw", "$rA, $rS", IIC_IntGeneral,
+ [(set i32:$rA, (cttz i32:$rS))]>, Requires<[IsISA3_0]>;
defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
"extsb", "$rA, $rS", IIC_IntSimple,
[(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
@@ -2286,6 +2355,10 @@ let isCodeGenOnly = 1 in {
PPC970_DGroup_First, PPC970_Unit_FXU;
}
+// Aliases for mtvrsave/mfvrsave to mfspr/mtspr.
+def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
+def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
+
// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
// so we'll need to scavenge a register for it.
let mayStore = 1 in
@@ -2328,6 +2401,9 @@ def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
"mfcr $rT", IIC_SprMFCR>,
PPC970_MicroCode, PPC970_Unit_CRU;
} // hasExtraSrcRegAllocReq = 1
+
+def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
+ "mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
} // hasSideEffects = 0
// Pseudo instruction to perform FADD in round-to-zero mode.
@@ -4138,3 +4214,33 @@ def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>;
def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+// Copy-Paste Facility
+// We prefix 'CP' to COPY due to a name conflict in Target.td. We also prefix
+// PASTE for naming consistency.
+let mayLoad = 1 in
+def CP_COPY : X_L1_RA5_RB5<31, 774, "copy" , gprc, IIC_LdStCOPY, []>;
+
+let mayStore = 1 in
+def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;
+
+let mayStore = 1, Defs = [CR0] in
+def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT;
+
+def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>;
+def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
+def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
+ (ins gprc:$rA, gprc:$rB)>;
+def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
+ (ins gprc:$rA, gprc:$rB)>;
+def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;
+
+// Message Synchronize
+def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
+
+// Power-Saving Mode Instruction:
+def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
+
+} // IsISA3_0
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index df1142cb42f3..a02ace00a76f 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -57,6 +57,9 @@ def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
SDTCisSameAs<0, 1>
]>;
+def SDTVecConv : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
+]>;
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
[SDNPHasChain, SDNPMayLoad]>;
@@ -66,6 +69,9 @@ def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
+def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
+def PPCuvec2fp : SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
+def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -608,7 +614,8 @@ let Uses = [RM] in {
"xvcvsxwdp $XT, $XB", IIC_VecFP, []>;
def XVCVSXWSP : XX2Form<60, 184,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvcvsxwsp $XT, $XB", IIC_VecFP, []>;
+ "xvcvsxwsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
def XVCVUXDDP : XX2Form<60, 488,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxddp $XT, $XB", IIC_VecFP,
@@ -772,10 +779,14 @@ let Uses = [RM] in {
def XXSLDWI : XX3Form_2<60, 2,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
- "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>;
+ "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
+ [(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
+ imm32SExt16:$SHW))]>;
def XXSPLTW : XX2Form_2<60, 164,
(outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
- "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm,
+ [(set v4i32:$XT,
+ (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
} // hasSideEffects
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
@@ -926,6 +937,16 @@ def : Pat<(sext_inreg v2i64:$C, v2i32),
def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
(XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
// Loads.
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
@@ -945,6 +966,7 @@ def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
@@ -1007,6 +1029,28 @@ def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
(XVRSQRTEDP $A)>;
+let Predicates = [IsLittleEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+} // IsLittleEndian
+
+let Predicates = [IsBigEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+} // IsBigEndian
+
} // AddedComplexity
} // HasVSX
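
The little-endian and big-endian pattern groups above differ only in whether an XXPERMDI doubleword swap (immediate 2) is applied before the scalar convert: XSCVSXDDP/XSCVUXDDP read VSX doubleword 0, while a v2i64 element can live in either doubleword depending on endianness. A minimal standalone sketch of that index mapping, assuming the usual little-endian layout where element 0 sits in doubleword 1 (illustrative only, not taken from the patch):

    #include <cassert>
    #include <cstdio>

    // Which VSX doubleword holds element Idx of a v2i64, and therefore
    // whether the patterns above need the XXPERMDI swap before the scalar
    // convert, which only reads doubleword 0.
    static bool needsSwap(unsigned Idx, bool IsLittleEndian) {
      assert(Idx < 2 && "v2i64 has two elements");
      unsigned DWord = IsLittleEndian ? 1 - Idx : Idx; // storage doubleword
      return DWord != 0;                               // scalar ops use dword 0
    }

    int main() {
      std::printf("LE: elt0 swap=%d, elt1 swap=%d\n",
                  needsSwap(0, true), needsSwap(1, true));
      std::printf("BE: elt0 swap=%d, elt1 swap=%d\n",
                  needsSwap(0, false), needsSwap(1, false));
      // LE: elt0 swap=1, elt1 swap=0
      // BE: elt0 swap=0, elt1 swap=1
    }
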
@@ -1213,10 +1257,31 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
+ let Predicates = [IsLittleEndian] in {
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+ }
+
+ let Predicates = [IsBigEndian] in {
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+ def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+ }
} // AddedComplexity = 400
} // HasP8Vector
-let Predicates = [HasDirectMove, HasVSX] in {
+let Predicates = [HasDirectMove] in {
// VSX direct move instructions
def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
@@ -1235,7 +1300,22 @@ let Predicates = [HasDirectMove, HasVSX] in {
def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
-} // HasDirectMove, HasVSX
+} // HasDirectMove
+
+let Predicates = [IsISA3_0, HasDirectMove] in {
+ def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
+ "mtvsrws $XT, $rA", IIC_VecGeneral,
+ []>;
+
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
+
+ def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
+ "mfvsrld $rA, $XT", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
+
+} // IsISA3_0, HasDirectMove
/* Direct moves of various widths from GPR's into VSR's. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
@@ -1285,7 +1365,7 @@ def VectorExtractions {
(v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
// Word extraction
- dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64));
+ dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64));
dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64));
dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG
(v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
@@ -1555,7 +1635,7 @@ let Predicates = [IsBigEndian, HasP8Vector] in {
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 2)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>;
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 3)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
@@ -1660,7 +1740,7 @@ let Predicates = [IsLittleEndian, HasP8Vector] in {
def : Pat<(f32 (vector_extract v4f32:$S, 0)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 1)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>;
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 2)),
(f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
def : Pat<(f32 (vector_extract v4f32:$S, 3)),
@@ -1783,3 +1863,423 @@ def : Pat<(i64 (bitconvert f64:$S)),
def : Pat<(f64 (bitconvert i64:$S)),
(f64 (MTVSRD $S))>;
}
+
+def AlignValues {
+ dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3));
+ dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
+}
+
+// The following VSX instructions were introduced in Power ISA 3.0
+def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
+let AddedComplexity = 400, Predicates = [HasP9Vector] in {
+
+ // [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+ class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isDOT;
+
+ // [PO VRT XO VRB XO /], but only the left 64 bits (or less) of VRB are used,
+ // so we use a different operand class for VRB
+ class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ RegisterOperand vbtype, list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+ // [PO T XO B XO BX /]
+ class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ list<dag> pattern>
+ : XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
+ !strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
+
+ // [PO T XO B XO BX TX]
+ class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
+ !strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
+
+ // [PO T A B XO AX BX TX], src and dest register use different operand class
+ class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
+ RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
+ !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
+
+ // [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
+
+ // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+ class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isDOT;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Scalar Move Instructions:
+
+ // Copy Sign
+ def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", []>;
+
+ // Absolute/Negative-Absolute/Negate
+ def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp" , []>;
+ def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp", []>;
+ def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp" , []>;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Scalar Floating-Point Arithmetic Instructions:
+
+ // Add/Divide/Multiply/Subtract
+ def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp" , []>;
+ def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", []>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp" , []>;
+ def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", []>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp" , []>;
+ def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", []>;
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , []>;
+ def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", []>;
+
+ // Square-Root
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp" , []>;
+ def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", []>;
+
+ // (Negative) Multiply-{Add/Subtract}
+ def XSMADDQP : X_VT5_VA5_VB5 <63, 388, "xsmaddqp" , []>;
+ def XSMADDQPO : X_VT5_VA5_VB5_Ro<63, 388, "xsmaddqpo" , []>;
+ def XSMSUBQP : X_VT5_VA5_VB5 <63, 420, "xsmsubqp" , []>;
+ def XSMSUBQPO : X_VT5_VA5_VB5_Ro<63, 420, "xsmsubqpo" , []>;
+ def XSNMADDQP : X_VT5_VA5_VB5 <63, 452, "xsnmaddqp" , []>;
+ def XSNMADDQPO: X_VT5_VA5_VB5_Ro<63, 452, "xsnmaddqpo", []>;
+ def XSNMSUBQP : X_VT5_VA5_VB5 <63, 484, "xsnmsubqp" , []>;
+ def XSNMSUBQPO: X_VT5_VA5_VB5_Ro<63, 484, "xsnmsubqpo", []>;
+
+ //===--------------------------------------------------------------------===//
+ // Quad/Double-Precision Compare Instructions:
+
+ // [PO BF // VRA VRB XO /]
+ class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
+ !strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
+ let Pattern = pattern;
+ }
+
+ // QP Compare Ordered/Unordered
+ def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
+ def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
+
+ // DP/QP Compare Exponents
+ def XSCMPEXPDP : XX3Form_1<60, 59,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
+
+ // DP Compare ==, >=, >, !=
+ // Use vsrc for XT, because the entire register of XT is set.
+ // XT.dword[1] = 0x0000_0000_0000_0000
+ def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ // Vector Compare Not Equal
+ def XVCMPNEDP : XX3Form_Rc<60, 123,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnedp $XT, $XA, $XB", IIC_VecFPCompare, []>;
+ let Defs = [CR6] in
+ def XVCMPNEDPo : XX3Form_Rc<60, 123,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnedp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+ isDOT;
+ def XVCMPNESP : XX3Form_Rc<60, 91,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnesp $XT, $XA, $XB", IIC_VecFPCompare, []>;
+ let Defs = [CR6] in
+ def XVCMPNESPo : XX3Form_Rc<60, 91,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+ isDOT;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Floating-Point Conversion Instructions:
+
+ // Convert DP -> QP
+ def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vsfrc, []>;
+
+ // Round & Convert QP -> DP (dword[1] is set to zero)
+ def XSCVQPDP : X_VT5_XO5_VB5 <63, 20, 836, "xscvqpdp" , []>;
+ def XSCVQPDPO : X_VT5_XO5_VB5_Ro<63, 20, 836, "xscvqpdpo", []>;
+
+ // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
+ def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
+ def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
+ def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
+ def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
+
+ // Convert (Un)Signed DWord -> QP
+ def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vsfrc, []>;
+ def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vsfrc, []>;
+
+ //===--------------------------------------------------------------------===//
+ // Round to Floating-Point Integer Instructions
+
+ // (Round &) Convert DP <-> HP
+ // Note! xscvdphp's src and dest registers both use the left 64 bits, so we use
+ // vsfrc for src and dest registers. xscvhpdp's src only uses the left 16 bits,
+ // but we still use vsfrc for it.
+ def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
+ def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
+
+ // Vector HP -> SP
+ def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
+ def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc, []>;
+
+ class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
+ list<dag> pattern>
+ : Z23Form_1<opcode, xo,
+ (outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
+ !strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
+ let RC = ex;
+ }
+
+ // Round to Quad-Precision Integer [with Inexact]
+ def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
+ def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
+
+ // Round Quad-Precision to Double-Extended Precision (fp80)
+ def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
+
+ //===--------------------------------------------------------------------===//
+ // Insert/Extract Instructions
+
+ // Insert Exponent DP/QP
+ // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
+ def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
+ // vB NOTE: only vB.dword[0] is used; that's why we don't use
+ // X_VT5_VA5_VB5 form
+ def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
+ "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
+
+ // Extract Exponent/Significand DP/QP
+ def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
+ def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
+ def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
+ def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
+
+ // Vector Insert Word
+ // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
+ def XXINSERTW :
+ XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
+ (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
+ "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
+ [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+
+ // Vector Extract Unsigned Word
+ def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
+ (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
+ "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
+
+ // Vector Insert Exponent DP/SP
+ def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
+ IIC_VecFP, []>;
+ def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
+ IIC_VecFP, []>;
+
+ // Vector Extract Exponent/Significand DP/SP
+ def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc, []>;
+ def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc, []>;
+ def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc, []>;
+ def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc, []>;
+
+ //===--------------------------------------------------------------------===//
+
+ // Test Data Class SP/DP/QP
+ def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
+ "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
+
+ // Vector Test Data Class SP/DP
+ def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, []>;
+ def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, []>;
+
+ //===--------------------------------------------------------------------===//
+
+ // Maximum/Minimum Type-C/Type-J DP
+ // XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU, so we use vsrc for XT
+ def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+
+ //===--------------------------------------------------------------------===//
+
+ // Vector Byte-Reverse H/W/D/Q Word
+ def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>;
+ def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>;
+ def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>;
+ def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+
+ // Vector Permute
+ def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+ def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+
+ // Vector Splat Immediate Byte
+ def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
+ "xxspltib $XT, $IMM8", IIC_VecPerm, []>;
+
+ //===--------------------------------------------------------------------===//
+ // Vector/Scalar Load/Store Instructions
+
+ let mayLoad = 1 in {
+ // Load Vector
+ def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
+ "lxv $XT, $src", IIC_LdStLFD, []>;
+ // Load DWord
+ def LXSD : DSForm_1<57, 2, (outs vrrc:$vD), (ins memrix:$src),
+ "lxsd $vD, $src", IIC_LdStLFD, []>;
+ // Load SP from src, convert it to DP, and place in dword[0]
+ def LXSSP : DSForm_1<57, 3, (outs vrrc:$vD), (ins memrix:$src),
+ "lxssp $vD, $src", IIC_LdStLFD, []>;
+
+ // [PO T RA RB XO TX] is almost identical to [PO S RA RB XO SX], but has
+ // different "out" and "in" dags
+ class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+ !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
+
+ // Load as Integer Byte/Halfword & Zero Indexed
+ def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc, []>;
+ def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc, []>;
+
+ // Load Vector Halfword*8/Byte*16 Indexed
+ def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
+ def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
+
+ // Load Vector Indexed
+ def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, []>;
+
+ // Load Vector (Left-justified) with Length
+ def LXVL : X_XT6_RA5_RB5<31, 269, "lxvl" , vsrc, []>;
+ def LXVLL : X_XT6_RA5_RB5<31, 301, "lxvll" , vsrc, []>;
+
+ // Load Vector Word & Splat Indexed
+ def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
+ } // end mayLoad
+
+ let mayStore = 1 in {
+ // Store Vector
+ def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
+ "stxv $XT, $dst", IIC_LdStSTFD, []>;
+ // Store DWord
+ def STXSD : DSForm_1<61, 2, (outs), (ins vrrc:$vS, memrix:$dst),
+ "stxsd $vS, $dst", IIC_LdStSTFD, []>;
+ // Convert DP of dword[0] to SP, and Store to dst
+ def STXSSP : DSForm_1<61, 3, (outs), (ins vrrc:$vS, memrix:$dst),
+ "stxssp $vS, $dst", IIC_LdStSTFD, []>;
+
+ // [PO S RA RB XO SX]
+ class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+ !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
+
+ // Store as Integer Byte/Halfword Indexed
+ def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc, []>;
+ def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc, []>;
+
+ // Store Vector Halfword*8/Byte*16 Indexed
+ def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>;
+ def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
+
+ // Store Vector Indexed
+ def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, []>;
+
+ // Store Vector (Left-justified) with Length
+ def STXVL : X_XS6_RA5_RB5<31, 397, "stxvl" , vsrc, []>;
+ def STXVLL : X_XS6_RA5_RB5<31, 429, "stxvll" , vsrc, []>;
+ } // end mayStore
+
+ // Patterns for which instructions from ISA 3.0 are a better match
+ let Predicates = [IsLittleEndian, HasP9Vector] in {
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+ } // IsLittleEndian, HasP9Vector
+
+ let Predicates = [IsBigEndian, HasP9Vector] in {
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+ def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+ def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+ } // IsBigEndian, HasP9Vector
+} // end HasP9Vector, AddedComplexity
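
The insertelt and extractelt patterns in the two endian-guarded blocks above encode one rule with mirrored immediates: XXINSERTW and XXEXTRACTUW take a byte offset counted from the left (big-endian) end of the 16-byte register, so element i maps to offset i*4 on big-endian targets and (3-i)*4 on little-endian targets, which is also why the AlignValues dags first place the scalar in big-endian word 1 of the source. A short sketch of that mapping (an illustration, not part of the patch):

    #include <cassert>
    #include <cstdio>

    // Byte offset fed to XXINSERTW/XXEXTRACTUW for element Idx of a v4i32.
    // The immediate counts bytes from the left (big-endian) end of the
    // register, so little-endian element order is reversed relative to it.
    static unsigned wordOffset(unsigned Idx, bool IsLittleEndian) {
      assert(Idx < 4 && "v4i32 has four elements");
      return (IsLittleEndian ? 3 - Idx : Idx) * 4;
    }

    int main() {
      for (unsigned Idx = 0; Idx < 4; ++Idx)
        std::printf("elt %u: LE offset %2u, BE offset %2u\n", Idx,
                    wordOffset(Idx, true), wordOffset(Idx, false));
      // elt 0: LE offset 12, BE offset  0
      // elt 1: LE offset  8, BE offset  4
      // elt 2: LE offset  4, BE offset  8
      // elt 3: LE offset  0, BE offset 12
    }
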
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index 5e188268fee9..48a71cfc2a6e 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -144,6 +144,9 @@ static Value *GetPointerOperand(Value *MemI) {
}
bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 44a692d4bb42..18377a44a7f8 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -40,21 +40,15 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
Mangler *Mang = AP.Mang;
const DataLayout &DL = AP.getDataLayout();
MCContext &Ctx = AP.OutContext;
- bool isDarwin = TM.getTargetTriple().isOSDarwin();
SmallString<128> Name;
StringRef Suffix;
- if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) {
- if (isDarwin)
- Suffix = "$stub";
- } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+ if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
Suffix = "$non_lazy_ptr";
if (!Suffix.empty())
Name += DL.getPrivateGlobalPrefix();
- unsigned PrefixLen = Name.size();
-
if (!MO.isGlobal()) {
assert(MO.isSymbol() && "Isn't a symbol reference");
Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
@@ -63,42 +57,16 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
TM.getNameWithPrefix(Name, GV, *Mang);
}
- unsigned OrigLen = Name.size() - PrefixLen;
-
Name += Suffix;
MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
- StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
-
- // If the target flags on the operand changes the name of the symbol, do that
- // before we return the symbol.
- if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) {
- MachineModuleInfoImpl::StubValueTy &StubSym =
- getMachOMMI(AP).getFnStubEntry(Sym);
- if (StubSym.getPointer())
- return Sym;
-
- if (MO.isGlobal()) {
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(AP.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
- } else {
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(Ctx.getOrCreateSymbol(OrigName), false);
- }
- return Sym;
- }
// If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
// then add the suffix.
if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
MachineModuleInfoMachO &MachO = getMachOMMI(AP);
-
- MachineModuleInfoImpl::StubValueTy &StubSym =
- (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ?
- MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym);
-
+
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachO.getGVStubEntry(Sym);
+
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym = MachineModuleInfoImpl::
@@ -139,7 +107,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
break;
}
- if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin)
+ if (MO.getTargetFlags() == PPCII::MO_PLT)
RefKind = MCSymbolRefExpr::VK_PLT;
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index fe339d70d7de..a57a83d7aa93 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -63,6 +63,8 @@ private:
public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
initialize(MF);
return simplifyCode();
}
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 10a8ce068d40..4c29aa06f048 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -104,6 +104,10 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// Whether this uses the PIC Base register or not.
bool UsesPICBase;
+ /// True if this function has a subset of CSRs that is handled explicitly via
+ /// copies
+ bool IsSplitCSR;
+
public:
explicit PPCFunctionInfo(MachineFunction &MF)
: FramePointerSaveIndex(0),
@@ -125,7 +129,8 @@ public:
VarArgsNumFPR(0),
CRSpillFrameIndex(0),
MF(MF),
- UsesPICBase(0) {}
+ UsesPICBase(0),
+ IsSplitCSR(false) {}
int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
@@ -196,6 +201,9 @@ public:
void setUsesPICBase(bool uses) { UsesPICBase = uses; }
bool usesPICBase() const { return UsesPICBase; }
+ bool isSplitCSR() const { return IsSplitCSR; }
+ void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
MCSymbol *getPICOffsetSymbol() const;
MCSymbol *getGlobalEPSymbol() const;
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
new file mode 100644
index 000000000000..bfe20c12974b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -0,0 +1,166 @@
+//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The QPX vector registers overlay the scalar floating-point registers, and
+// any scalar floating-point loads splat their value across all vector lanes.
+// Thus, if we have a scalar load followed by a splat, we can remove the splat
+// (i.e. replace the load with a load-and-splat pseudo instruction).
+//
+// This pass must run after anything that might do store-to-load forwarding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-qpx-load-splat"
+
+STATISTIC(NumSimplified, "Number of QPX load splats simplified");
+
+namespace llvm {
+ void initializePPCQPXLoadSplatPass(PassRegistry&);
+}
+
+namespace {
+ struct PPCQPXLoadSplat : public MachineFunctionPass {
+ static char ID;
+ PPCQPXLoadSplat() : MachineFunctionPass(ID) {
+ initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ const char *getPassName() const override {
+ return "PowerPC QPX Load Splat Simplification";
+ }
+ };
+ char PPCQPXLoadSplat::ID = 0;
+}
+
+INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
+ "PowerPC QPX Load Splat Simplification",
+ false, false)
+
+FunctionPass *llvm::createPPCQPXLoadSplatPass() {
+ return new PPCQPXLoadSplat();
+}
+
+bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ bool MadeChange = false;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
+ MachineBasicBlock *MBB = &*MFI;
+ SmallVector<MachineInstr *, 4> Splats;
+
+ for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
+ MachineInstr *MI = &*MBBI;
+
+ if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
+ Splats.clear();
+ continue;
+ }
+
+ // We're looking for a sequence like this:
+ // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
+ // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+
+ for (auto SI = Splats.begin(); SI != Splats.end();) {
+ MachineInstr *SMI = *SI;
+ unsigned SplatReg = SMI->getOperand(0).getReg();
+ unsigned SrcReg = SMI->getOperand(1).getReg();
+
+ if (MI->modifiesRegister(SrcReg, TRI)) {
+ switch (MI->getOpcode()) {
+ default:
+ SI = Splats.erase(SI);
+ continue;
+ case PPC::LFS:
+ case PPC::LFD:
+ case PPC::LFSU:
+ case PPC::LFDU:
+ case PPC::LFSUX:
+ case PPC::LFDUX:
+ case PPC::LFSX:
+ case PPC::LFDX:
+ case PPC::LFIWAX:
+ case PPC::LFIWZX:
+ if (SplatReg != SrcReg) {
+ // We need to change the load to define the scalar subregister of
+ // the QPX splat source register.
+ unsigned SubRegIndex =
+ TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
+ unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+
+ // Substitute both the explicit defined register, and also the
+ // implicit def of the containing QPX register.
+ MI->getOperand(0).setReg(SplatSubReg);
+ MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
+ }
+
+ SI = Splats.erase(SI);
+
+ // If SMI is directly after MI, then MBBI's base iterator is
+ // pointing at SMI. Adjust MBBI around the call to erase SMI to
+ // avoid invalidating MBBI.
+ ++MBBI;
+ SMI->eraseFromParent();
+ --MBBI;
+
+ ++NumSimplified;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // If this instruction defines the splat register, then we cannot move
+ // the previous definition above it. If it reads from the splat
+ // register, then it must already be alive from some previous
+ // definition, and if the splat register is different from the source
+ // register, then this definition must not be the load for which we're
+ // searching.
+ if (MI->modifiesRegister(SplatReg, TRI) ||
+ (SrcReg != SplatReg &&
+ MI->readsRegister(SplatReg, TRI))) {
+ SI = Splats.erase(SI);
+ continue;
+ }
+
+ ++SI;
+ }
+
+ if (MI->getOpcode() != PPC::QVESPLATI &&
+ MI->getOpcode() != PPC::QVESPLATIs &&
+ MI->getOpcode() != PPC::QVESPLATIb)
+ continue;
+ if (MI->getOperand(2).getImm() != 0)
+ continue;
+
+ // If there are other uses of the scalar value after this, replacing
+ // those uses might be non-trivial.
+ if (!MI->getOperand(1).isKill())
+ continue;
+
+ Splats.push_back(MI);
+ }
+ }
+
+ return MadeChange;
+}
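
The ++MBBI / erase / --MBBI adjustment above guards against a subtle invalidation: a reverse iterator is built on a base iterator that points one instruction forward of the one it designates, so if the splat sits directly after MI, erasing it would invalidate MBBI itself. Below is a standalone sketch of the same idiom on a std::list, with arbitrary integers standing in for instructions (a hypothetical example, not QPX code):

    #include <iostream>
    #include <list>

    int main() {
      std::list<int> Insts = {10, 20, 30, 40}; // stand-ins for instructions

      // Walk bottom-up, as the pass does with MBB->rbegin()/rend().
      for (auto RI = Insts.rbegin(); RI != Insts.rend(); ++RI) {
        if (*RI != 30)
          continue;
        // We sit on 30 and want to erase 40, the element directly after it
        // in forward order. RI.base() is exactly the forward iterator to 40,
        // so erasing it without adjustment would invalidate RI.
        auto Victim = RI.base(); // forward iterator to 40
        ++RI;                    // step up: RI now designates 20, base() -> 30
        Insts.erase(Victim);     // safe: RI no longer refers to 40
        --RI;                    // back on 30; base() is now end()
      }

      for (int V : Insts)
        std::cout << V << ' ';   // prints: 10 20 30
      std::cout << '\n';
    }
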
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 934bdf622418..f0161a03d2d4 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -116,6 +116,9 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
: (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList
: CSR_Darwin32_SaveList);
+ if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+ return CSR_SRV464_TLS_PE_SaveList;
+
// On PPC64, we might need to save r2 (but only if it is not reserved).
bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
@@ -128,6 +131,31 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
: CSR_SVR432_SaveList);
}
+const MCPhysReg *
+PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+ if (Subtarget.isDarwinABI())
+ return nullptr;
+ if (!TM.isPPC64())
+ return nullptr;
+ if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+ return nullptr;
+ if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+ return nullptr;
+
+ // On PPC64, we might need to save r2 (but only if it is not reserved).
+ bool SaveR2 = !getReservedRegs(*MF).test(PPC::X2);
+ if (Subtarget.hasAltivec())
+ return SaveR2
+ ? CSR_SVR464_R2_Altivec_ViaCopy_SaveList
+ : CSR_SVR464_Altivec_ViaCopy_SaveList;
+ else
+ return SaveR2
+ ? CSR_SVR464_R2_ViaCopy_SaveList
+ : CSR_SVR464_ViaCopy_SaveList;
+}
+
const uint32_t *
PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -232,16 +260,15 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (TFI->needsFP(MF))
Reserved.set(PPC::R31);
+ bool IsPositionIndependent = TM.isPositionIndependent();
if (hasBasePointer(MF)) {
- if (Subtarget.isSVR4ABI() && !TM.isPPC64() &&
- TM.getRelocationModel() == Reloc::PIC_)
+ if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
Reserved.set(PPC::R29);
else
Reserved.set(PPC::R30);
}
- if (Subtarget.isSVR4ABI() && !TM.isPPC64() &&
- TM.getRelocationModel() == Reloc::PIC_)
+ if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
Reserved.set(PPC::R30);
// Reserve Altivec registers when Altivec is unavailable.
@@ -907,8 +934,7 @@ unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
if (TM.isPPC64())
return PPC::X30;
- if (Subtarget.isSVR4ABI() &&
- TM.getRelocationModel() == Reloc::PIC_)
+ if (Subtarget.isSVR4ABI() && TM.isPositionIndependent())
return PPC::R29;
return PPC::R30;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index b15fde83c9f3..459502eeb2e9 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -75,6 +75,7 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override;
const uint32_t *getNoPreservedMask() const override;
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index d0954a11cd6a..b4d72eff2b85 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -70,6 +70,8 @@ def IIC_LdStSTFDU : InstrItinClass;
def IIC_LdStSTVEBX : InstrItinClass;
def IIC_LdStSTWCX : InstrItinClass;
def IIC_LdStSync : InstrItinClass;
+def IIC_LdStCOPY : InstrItinClass;
+def IIC_LdStPASTE : InstrItinClass;
def IIC_SprISYNC : InstrItinClass;
def IIC_SprMFSR : InstrItinClass;
def IIC_SprMTMSR : InstrItinClass;
@@ -104,12 +106,17 @@ def IIC_VecVSR : InstrItinClass;
def IIC_SprMTMSRD : InstrItinClass;
def IIC_SprSLIE : InstrItinClass;
def IIC_SprSLBIE : InstrItinClass;
+def IIC_SprSLBIEG : InstrItinClass;
def IIC_SprSLBMTE : InstrItinClass;
def IIC_SprSLBMFEE : InstrItinClass;
def IIC_SprSLBIA : InstrItinClass;
+def IIC_SprSLBSYNC : InstrItinClass;
def IIC_SprTLBIA : InstrItinClass;
def IIC_SprTLBIEL : InstrItinClass;
def IIC_SprTLBIE : InstrItinClass;
+def IIC_SprABORT : InstrItinClass;
+def IIC_SprMSGSYNC : InstrItinClass;
+def IIC_SprSTOP : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 04a43bc03251..2455e5e52de5 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -597,11 +597,12 @@ def PPC440Itineraries : ProcessorItineraries<
def PPC440Model : SchedMachineModel {
let IssueWidth = 2; // 2 instructions are dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 5; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
+ let CompleteModel = 0;
+
let Itineraries = PPC440Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 21a357a2efcf..54cfae5d74b7 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -1,10 +1,10 @@
//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
// Primary reference:
@@ -160,12 +160,13 @@ def PPCA2Itineraries : ProcessorItineraries<
def PPCA2Model : SchedMachineModel {
let IssueWidth = 1; // 1 instruction is dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 6; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 13;
+ let CompleteModel = 0;
+
let Itineraries = PPCA2Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 36b8517dabf1..f687d326b52d 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
// Power processor.
-//
+//
// All information is derived from the "e500mc Core Reference Manual",
// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
//
@@ -25,12 +25,12 @@ def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
// Some instructions can only execute in SFX0 but not SFX1.
-// The CFX has a bypass path, allowing non-divide instructions to execute
+// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is executed.
def E500_SFX0 : FuncUnit; // Simple unit 0
def E500_SFX1 : FuncUnit; // Simple unit 1
def E500_BU : FuncUnit; // Branch unit
-def E500_CFX_DivBypass
+def E500_CFX_DivBypass
: FuncUnit; // CFX divide bypass path
def E500_CFX_0 : FuncUnit; // CFX pipeline
def E500_LSU_0 : FuncUnit; // LSU pipeline
@@ -271,12 +271,12 @@ def PPCE500mcItineraries : ProcessorItineraries<
[NoBypass, E500_GPR_Bypass]>,
InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
InstrStage<2, [E500_FPU_0]>],
- [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
[E500_FPR_Bypass,
E500_FPR_Bypass, E500_FPR_Bypass]>,
InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
InstrStage<4, [E500_FPU_0]>],
- [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
[E500_FPR_Bypass,
E500_FPR_Bypass, E500_FPR_Bypass]>,
InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
@@ -311,10 +311,11 @@ def PPCE500mcItineraries : ProcessorItineraries<
def PPCE500mcModel : SchedMachineModel {
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 5; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
+ let CompleteModel = 0;
+
let Itineraries = PPCE500mcItineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index 7c2693ef0d4f..5db886cf8f94 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Freescale e5500 64-bit
+// This file defines the itinerary class data for the Freescale e5500 64-bit
// Power processor.
-//
+//
// All information is derived from the "e5500 Core Reference Manual",
// Freescale Document Number e5500RM, Rev. 1, 03/2012.
//
@@ -25,16 +25,16 @@ def E5500_DIS1 : FuncUnit;
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
-// The CFX has a bypass path, allowing non-divide instructions to execute
+// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is being executed.
def E5500_SFX0 : FuncUnit; // Simple unit 0
def E5500_SFX1 : FuncUnit; // Simple unit 1
def E5500_BU : FuncUnit; // Branch unit
-def E5500_CFX_DivBypass
+def E5500_CFX_DivBypass
: FuncUnit; // CFX divide bypass path
def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0
-def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1
+def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1
def E5500_LSU_0 : FuncUnit; // LSU pipeline
def E5500_FPU_0 : FuncUnit; // FPU pipeline
@@ -331,12 +331,12 @@ def PPCE5500Itineraries : ProcessorItineraries<
[E5500_GPR_Bypass]>,
InstrItinData<IIC_FPGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
InstrStage<1, [E5500_FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
[E5500_FPR_Bypass,
E5500_FPR_Bypass, E5500_FPR_Bypass]>,
InstrItinData<IIC_FPAddSub, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
InstrStage<1, [E5500_FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
[E5500_FPR_Bypass,
E5500_FPR_Bypass, E5500_FPR_Bypass]>,
InstrItinData<IIC_FPCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
@@ -351,7 +351,7 @@ def PPCE5500Itineraries : ProcessorItineraries<
E5500_FPR_Bypass, E5500_FPR_Bypass]>,
InstrItinData<IIC_FPDivS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
InstrStage<16, [E5500_FPU_0]>],
- [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
[E5500_FPR_Bypass,
E5500_FPR_Bypass, E5500_FPR_Bypass]>,
InstrItinData<IIC_FPFused, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
@@ -371,10 +371,11 @@ def PPCE5500Itineraries : ProcessorItineraries<
def PPCE5500Model : SchedMachineModel {
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
- let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 6; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
+ let CompleteModel = 0;
+
let Itineraries = PPCE5500Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index a3b73ab4454f..b5a9f96d45ae 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -40,7 +40,7 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<IIC_IntMulLI , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
InstrItinData<IIC_IntRFID , [InstrStage<1, [G5_IU2]>]>,
InstrItinData<IIC_IntRotateD , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
- InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
InstrItinData<IIC_IntRotate , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
InstrItinData<IIC_IntShift , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
InstrItinData<IIC_IntTrapD , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
@@ -51,14 +51,14 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<IIC_BrMCRX , [InstrStage<3, [G5_BPU]>]>,
InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStLoad , [InstrStage<3, [G5_SLU]>]>,
- InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
- InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStStore , [InstrStage<3, [G5_SLU]>]>,
- InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStDSS , [InstrStage<10, [G5_SLU]>]>,
InstrItinData<IIC_LdStICBI , [InstrStage<40, [G5_SLU]>]>,
InstrItinData<IIC_LdStSTFD , [InstrStage<4, [G5_SLU]>]>,
- InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>,
InstrItinData<IIC_LdStLD , [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStLDU , [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStLDUX , [InstrStage<3, [G5_SLU]>]>,
@@ -67,8 +67,8 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<IIC_LdStLFDU , [InstrStage<5, [G5_SLU]>]>,
InstrItinData<IIC_LdStLFDUX , [InstrStage<5, [G5_SLU]>]>,
InstrItinData<IIC_LdStLHA , [InstrStage<5, [G5_SLU]>]>,
- InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>,
- InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>,
InstrItinData<IIC_LdStLMW , [InstrStage<64, [G5_SLU]>]>,
InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G5_SLU]>]>,
InstrItinData<IIC_LdStLWA , [InstrStage<5, [G5_SLU]>]>,
@@ -118,12 +118,13 @@ def G5Itineraries : ProcessorItineraries<
def G5Model : SchedMachineModel {
let IssueWidth = 4; // 4 (non-branch) instructions are dispatched per cycle.
- let MinLatency = 0; // Out-of-order dispatch.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 16;
+ let CompleteModel = 0;
+
let Itineraries = G5Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
index 267f56726180..a8678f56900e 100644
--- a/lib/Target/PowerPC/PPCScheduleP7.td
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -382,7 +382,6 @@ def P7Model : SchedMachineModel {
// branches), but the total internal issue bandwidth per
// cycle (from all queues) is 8.
- let MinLatency = 0; // Out-of-order dispatch.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
@@ -391,6 +390,8 @@ def P7Model : SchedMachineModel {
// Try to make sure we have at least 10 dispatch groups in a loop.
let LoopMicroOpBufferSize = 40;
+ let CompleteModel = 0;
+
let Itineraries = P7Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index 69e6d05c6604..8e52da583a0d 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -391,7 +391,6 @@ def P8Model : SchedMachineModel {
// up to six non-branch instructions.
// up to two branches in a dispatch group.
- let MinLatency = 0; // Out-of-order dispatch.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
@@ -400,6 +399,8 @@ def P8Model : SchedMachineModel {
// Try to make sure we have at least 10 dispatch groups in a loop.
let LoopMicroOpBufferSize = 60;
+ let CompleteModel = 0;
+
let Itineraries = P8Itineraries;
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index c0fcb6cbb9dc..603f0fccc7c6 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -70,6 +70,8 @@ void PPCSubtarget::initializeEnvironment() {
HasP8Vector = false;
HasP8Altivec = false;
HasP8Crypto = false;
+ HasP9Vector = false;
+ HasP9Altivec = false;
HasFCPSGN = false;
HasFSQRT = false;
HasFRE = false;
@@ -82,7 +84,6 @@ void PPCSubtarget::initializeEnvironment() {
HasFPRND = false;
HasFPCVT = false;
HasISEL = false;
- HasPOPCNTD = false;
HasBPERMD = false;
HasExtDiv = false;
HasCMPB = false;
@@ -103,12 +104,15 @@ void PPCSubtarget::initializeEnvironment() {
HasHTM = false;
HasFusion = false;
HasFloat128 = false;
+ IsISA3_0 = false;
+
+ HasPOPCNTD = POPCNTD_Unavailable;
}
void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Determine default and user specified characteristics
std::string CPUName = CPU;
- if (CPUName.empty()) {
+ if (CPUName.empty() || CPU == "generic") {
// If cross-compiling with -march=ppc64le without -mcpu
if (TargetTriple.getArch() == Triple::ppc64le)
CPUName = "ppc64le";
@@ -142,18 +146,20 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
}
-/// hasLazyResolverStub - Return true if accesses to the specified global have
-/// to go through a dyld lazy resolution stub. This means that an extra load
-/// is required to get the address of the global.
+/// Return true if accesses to the specified global have to go through a dyld
+/// lazy resolution stub. This means that an extra load is required to get the
+/// address of the global.
bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
- // We never have stubs if HasLazyResolverStubs=false or if in static mode.
- if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
- return false;
- bool isDecl = GV->isDeclaration();
- if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
+ if (!HasLazyResolverStubs)
return false;
- return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
- GV->hasCommonLinkage() || isDecl;
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return true;
+ // 32-bit Mach-O has no relocation for a-b if a is undefined, even if b is in
+ // the section that is being relocated. This means we have to use a load even
+ // for GVs that are known to be local to the DSO.
+ if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
+ return true;
+ return false;
}
// Embedded cores need aggressive scheduling (and some others also benefit).
@@ -166,6 +172,8 @@ static bool needsAggressiveScheduling(unsigned Directive) {
case PPC::DIR_E5500:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
+ // FIXME: Same as P8 until POWER9 scheduling info is available
+ case PPC::DIR_PWR9:
return true;
}
}
@@ -191,8 +199,6 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
}
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin,
- MachineInstr *end,
unsigned NumRegionInstrs) const {
if (needsAggressiveScheduling(DarwinDirective)) {
Policy.OnlyTopDown = false;
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 4f5c95c1483f..9fe286a3b7a9 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -18,9 +18,9 @@
#include "PPCISelLowering.h"
#include "PPCInstrInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -56,6 +56,7 @@ namespace PPC {
DIR_PWR6X,
DIR_PWR7,
DIR_PWR8,
+ DIR_PWR9,
DIR_64
};
}
@@ -64,6 +65,13 @@ class GlobalValue;
class TargetMachine;
class PPCSubtarget : public PPCGenSubtargetInfo {
+public:
+ enum POPCNTDKind {
+ POPCNTD_Unavailable,
+ POPCNTD_Slow,
+ POPCNTD_Fast
+ };
+
protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -92,6 +100,8 @@ protected:
bool HasP8Vector;
bool HasP8Altivec;
bool HasP8Crypto;
+ bool HasP9Vector;
+ bool HasP9Altivec;
bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -101,7 +111,6 @@ protected:
bool HasFPRND;
bool HasFPCVT;
bool HasISEL;
- bool HasPOPCNTD;
bool HasBPERMD;
bool HasExtDiv;
bool HasCMPB;
@@ -122,6 +131,9 @@ protected:
bool HasHTM;
bool HasFusion;
bool HasFloat128;
+ bool IsISA3_0;
+
+ POPCNTDKind HasPOPCNTD;
/// When targeting QPX running a stock PPC64 Linux kernel where the stack
/// alignment has not been changed, we need to keep the 16-byte alignment
@@ -132,7 +144,7 @@ protected:
PPCFrameLowering FrameLowering;
PPCInstrInfo InstrInfo;
PPCTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
public:
/// This constructor initializes the data members to match that
@@ -167,7 +179,7 @@ public:
const PPCTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const PPCRegisterInfo *getRegisterInfo() const override {
@@ -230,9 +242,10 @@ public:
bool hasP8Vector() const { return HasP8Vector; }
bool hasP8Altivec() const { return HasP8Altivec; }
bool hasP8Crypto() const { return HasP8Crypto; }
+ bool hasP9Vector() const { return HasP9Vector; }
+ bool hasP9Altivec() const { return HasP9Altivec; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
- bool hasPOPCNTD() const { return HasPOPCNTD; }
bool hasBPERMD() const { return HasBPERMD; }
bool hasExtDiv() const { return HasExtDiv; }
bool hasCMPB() const { return HasCMPB; }
@@ -261,6 +274,9 @@ public:
bool hasHTM() const { return HasHTM; }
bool hasFusion() const { return HasFusion; }
bool hasFloat128() const { return HasFloat128; }
+ bool isISA3_0() const { return IsISA3_0; }
+
+ POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -271,6 +287,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
bool isSVR4ABI() const { return !isDarwinABI(); }
@@ -286,8 +303,6 @@ public:
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- MachineInstr *begin,
- MachineInstr *end,
unsigned NumRegionInstrs) const override;
bool useAA() const override;
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index a9d2e888f4b7..61ce48ecd04f 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -73,10 +73,7 @@ protected:
DebugLoc DL = MI->getDebugLoc();
unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
unsigned Opc1, Opc2;
- SmallVector<unsigned, 4> OrigRegs;
- OrigRegs.push_back(OutReg);
- OrigRegs.push_back(InReg);
- OrigRegs.push_back(GPR3);
+ const unsigned OrigRegs[] = {OutReg, InReg, GPR3};
switch (MI->getOpcode()) {
default:
diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index bf165c9edc6e..7c53a5601790 100644
--- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -61,10 +61,10 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
-#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
@@ -74,7 +74,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index d24b590317f5..a4ff86cb1e21 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -15,7 +15,9 @@
#include "PPC.h"
#include "PPCTargetObjectFile.h"
#include "PPCTargetTransformInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCStreamer.h"
@@ -43,6 +45,10 @@ opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
cl::desc("Disable VSX Swap Removal for PPC"));
static cl::
+opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
+ cl::desc("Disable QPX load splat simplification"));
+
+static cl::
opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
cl::desc("Disable machine peepholes for PPC"));
@@ -172,6 +178,16 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue()) {
+ if (TT.isOSDarwin())
+ return Reloc::DynamicNoPIC;
+ return Reloc::Static;
+ }
+ return *RM;
+}
+
// The FeatureString here is a little subtle. We are modifying the feature
// string with what are (currently) non-function specific overrides as it goes
// into the LLVMTargetMachine constructor and then using the stored value in the
@@ -179,10 +195,11 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
- computeFSAdditions(FS, OL, TT), Options, RM, CM, OL),
+ computeFSAdditions(FS, OL, TT), Options,
+ getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
TargetABI(computeTargetABI(TT, Options)),
Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
@@ -214,7 +231,8 @@ void PPC32TargetMachine::anchor() { }
PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
@@ -223,7 +241,8 @@ void PPC64TargetMachine::anchor() { }
PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
@@ -245,8 +264,7 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
// it as a key for the subtarget since that can be the only difference
// between two functions.
bool SoftFloat =
- F.hasFnAttribute("use-soft-float") &&
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
if (SoftFloat)
@@ -313,9 +331,9 @@ void PPCPassConfig::addIRPasses() {
if (EnablePrefetch.getNumOccurrences() > 0)
UsePrefetching = EnablePrefetch;
if (UsePrefetching)
- addPass(createPPCLoopDataPrefetchPass());
+ addPass(createLoopDataPrefetchPass());
- if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+ if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.
@@ -379,18 +397,35 @@ void PPCPassConfig::addMachineSSAOptimization() {
}
void PPCPassConfig::addPreRegAlloc() {
- initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
- insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
- &PPCVSXFMAMutateID);
- if (getPPCTargetMachine().getRelocationModel() == Reloc::PIC_)
+ if (getOptLevel() != CodeGenOpt::None) {
+ initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+ insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+ &PPCVSXFMAMutateID);
+ }
+
+ // FIXME: We probably don't need to run these for -fPIE.
+ if (getPPCTargetMachine().isPositionIndependent()) {
+ // FIXME: LiveVariables should not be necessary here!
+ // The PPCTLSDynamicCall pass uses LiveIntervals, which previously depended on
+ // LiveVariables. This (unnecessary) dependency has been removed, but a
+ // stage-2 clang build still fails without LiveVariables computed here.
+ addPass(&LiveVariablesID, false);
addPass(createPPCTLSDynamicCallPass());
+ }
if (EnableExtraTOCRegDeps)
addPass(createPPCTOCRegDepsPass());
}
void PPCPassConfig::addPreSched2() {
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
addPass(&IfConverterID);
+
+ // This optimization must happen after anything that might do store-to-load
+ // forwarding. Here we're after RA (and, thus, when spills are inserted)
+ // but before post-RA scheduling.
+ if (!DisableQPXLoadSplat)
+ addPass(createPPCQPXLoadSplatPass());
+ }
}
void PPCPassConfig::addPreEmitPass() {
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 6496339519a1..59b4f1e30c0e 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -21,7 +21,7 @@
namespace llvm {
-/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+/// Common code between 32-bit and 64-bit PowerPC targets.
///
class PPCTargetMachine : public LLVMTargetMachine {
public:
@@ -35,8 +35,9 @@ private:
public:
PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
~PPCTargetMachine() override;
@@ -57,25 +58,25 @@ public:
};
};
-/// PPC32TargetMachine - PowerPC 32-bit target machine.
+/// PowerPC 32-bit target machine.
///
class PPC32TargetMachine : public PPCTargetMachine {
virtual void anchor();
public:
PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// PPC64TargetMachine - PowerPC 64-bit target machine.
+/// PowerPC 64-bit target machine.
///
class PPC64TargetMachine : public PPCTargetMachine {
virtual void anchor();
public:
PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index 798bb9d6b892..8f660355c0ac 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -53,7 +53,7 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
const MCExpr *PPC64LinuxTargetObjectFile::
getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
const MCExpr *Expr =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext());
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPREL, getContext());
return MCBinaryExpr::createAdd(Expr,
MCConstantExpr::create(0x8000, getContext()),
getContext());
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index cd86dabd5abe..9331e41fb9c1 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -21,6 +21,12 @@ using namespace llvm;
static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+// This is currently only used for the data prefetch pass which is only enabled
+// for BG/Q by default.
+static cl::opt<unsigned>
+CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
+ cl::desc("The loop prefetch cache line size"));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
@@ -30,8 +36,9 @@ cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- if (ST->hasPOPCNTD() && TyWidth <= 64)
- return TTI::PSK_FastHardware;
+ if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
+ return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
+ TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
return TTI::PSK_Software;
}
@@ -230,6 +237,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
}
+unsigned PPCTTIImpl::getCacheLineSize() {
+ // This is currently only used for the data prefetch pass which is only
+ // enabled for BG/Q by default.
+ return CacheLineSize;
+}
+
+unsigned PPCTTIImpl::getPrefetchDistance() {
+ // This seems like a reasonable default for the BG/Q (this pass is enabled, by
+ // default, only on the BG/Q).
+ return 300;
+}
+
unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
unsigned Directive = ST->getDarwinDirective();
// The 440 has no SIMD support, but floating-point instructions
@@ -248,8 +267,9 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// For P7 and P8, floating-point instructions have a 6-cycle latency and
// there are two execution units, so unroll by 12x for latency hiding.
- if (Directive == PPC::DIR_PWR7 ||
- Directive == PPC::DIR_PWR8)
+ // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+ if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
+ Directive == PPC::DIR_PWR9)
return 12;
// For most things, modern systems have two execution units (and
@@ -355,7 +375,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
// If we can use the permutation-based load sequence, then this is also
// relatively cheap (not counting loop-invariant instructions): one load plus
// one permute (the last load in a series has extra cost, but we're
- // neglecting that here). Note that on the P7, we should do unaligned loads
+ // neglecting that here). Note that on the P7, we could do unaligned loads
// for Altivec types using the VSX instructions, but that's more expensive
// than using the permutation-based load sequence. On the P8, that's no
// longer true.
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 04c1b02235f0..5ea9a543cdb1 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -70,6 +70,8 @@ public:
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ unsigned getCacheLineSize();
+ unsigned getPrefetchDistance();
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 782583ce3423..60f1ad5585ff 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -13,11 +13,11 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
-#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCHazardRecognizers.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 6b19a2f7118b..7c22cb22bfa5 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -38,8 +38,14 @@
using namespace llvm;
-static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
-cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
+// Temporarily disable FMA mutation by default, since it doesn't handle
+// cross-basic-block intervals well.
+// See: http://lists.llvm.org/pipermail/llvm-dev/2016-February/095669.html
+// http://reviews.llvm.org/D17087
+static cl::opt<bool> DisableVSXFMAMutate(
+ "disable-ppc-vsx-fma-mutation",
+ cl::desc("Disable VSX FMA instruction mutation"), cl::init(true),
+ cl::Hidden);
#define DEBUG_TYPE "ppc-vsx-fma-mutate"
@@ -99,7 +105,7 @@ protected:
// %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
// and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
- SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+ SlotIndex FMAIdx = LIS->getInstructionIndex(*MI);
VNInfo *AddendValNo =
LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
@@ -168,21 +174,32 @@ protected:
if (OtherUsers || KillsAddendSrc)
continue;
- // Find one of the product operands that is killed by this instruction.
+ // The transformation doesn't work well with things like:
+ // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+ // unless vreg11 is also a kill, so skip when it is not,
+ // and check operand 3 to see if it is also a kill, to handle the case:
+ // %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
+ // where vreg5 and vreg11 are both kills. This case would be skipped
+ // otherwise.
+ unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+ // Find one of the product operands that is killed by this instruction.
unsigned KilledProdOp = 0, OtherProdOp = 0;
- if (LIS->getInterval(MI->getOperand(2).getReg())
- .Query(FMAIdx).isKill()) {
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ unsigned Reg3 = MI->getOperand(3).getReg();
+ if (LIS->getInterval(Reg2).Query(FMAIdx).isKill()
+ && Reg2 != OldFMAReg) {
KilledProdOp = 2;
OtherProdOp = 3;
- } else if (LIS->getInterval(MI->getOperand(3).getReg())
- .Query(FMAIdx).isKill()) {
+ } else if (LIS->getInterval(Reg3).Query(FMAIdx).isKill()
+ && Reg3 != OldFMAReg) {
KilledProdOp = 3;
OtherProdOp = 2;
}
- // If there are no killed product operands, then this transformation is
- // likely not profitable.
+ // If there are no usable killed product operands, then this
+ // transformation is likely not profitable.
if (!KilledProdOp)
continue;
@@ -212,14 +229,6 @@ protected:
bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
bool OtherProdRegUndef = MI->getOperand(OtherProdOp).isUndef();
- unsigned OldFMAReg = MI->getOperand(0).getReg();
-
- // The transformation doesn't work well with things like:
- // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
- // so leave such things alone.
- if (OldFMAReg == KilledProdReg)
- continue;
-
// If there isn't a class that fits, we can't perform the transform.
// This is needed for correctness with a mixture of VSX and Altivec
// instructions to make sure that a low VSX register is not assigned to
@@ -236,23 +245,33 @@ protected:
MI->getOperand(0).setReg(KilledProdReg);
MI->getOperand(1).setReg(KilledProdReg);
MI->getOperand(3).setReg(AddendSrcReg);
- MI->getOperand(2).setReg(OtherProdReg);
MI->getOperand(0).setSubReg(KilledProdSubReg);
MI->getOperand(1).setSubReg(KilledProdSubReg);
MI->getOperand(3).setSubReg(AddSubReg);
- MI->getOperand(2).setSubReg(OtherProdSubReg);
MI->getOperand(1).setIsKill(KilledProdRegKill);
MI->getOperand(3).setIsKill(AddRegKill);
- MI->getOperand(2).setIsKill(OtherProdRegKill);
MI->getOperand(1).setIsUndef(KilledProdRegUndef);
MI->getOperand(3).setIsUndef(AddRegUndef);
- MI->getOperand(2).setIsUndef(OtherProdRegUndef);
MI->setDesc(TII->get(AltOpc));
+ // If the addend is also a multiplicand, replace it with the addend
+ // source in both places.
+ if (OtherProdReg == AddendMI->getOperand(0).getReg()) {
+ MI->getOperand(2).setReg(AddendSrcReg);
+ MI->getOperand(2).setSubReg(AddSubReg);
+ MI->getOperand(2).setIsKill(AddRegKill);
+ MI->getOperand(2).setIsUndef(AddRegUndef);
+ } else {
+ MI->getOperand(2).setReg(OtherProdReg);
+ MI->getOperand(2).setSubReg(OtherProdSubReg);
+ MI->getOperand(2).setIsKill(OtherProdRegKill);
+ MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+ }
+
DEBUG(dbgs() << " -> " << *MI);
// The killed product operand was killed here, so we can reuse it now
@@ -312,7 +331,7 @@ protected:
// Remove the (now unused) copy.
DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
- LIS->RemoveMachineInstrFromMaps(AddendMI);
+ LIS->RemoveMachineInstrFromMaps(*AddendMI);
AddendMI->eraseFromParent();
Changed = true;
@@ -323,6 +342,9 @@ protected:
public:
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// If we don't have VSX then go ahead and return without doing
// anything.
const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 27c540fcf211..d53c8e38254f 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -191,6 +191,9 @@ private:
public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
// If we don't have VSX on the subtarget, don't do anything.
const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
if (!STI.hasVSX())
@@ -404,9 +407,9 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
case PPC::VSPLTB:
case PPC::VSPLTH:
case PPC::VSPLTW:
+ case PPC::XXSPLTW:
// Splats are lane-sensitive, but we can use special handling
- // to adjust the source lane for the splat. This is not yet
- // implemented. When it is, we need to uncomment the following:
+ // to adjust the source lane for the splat.
SwapVector[VecIdx].IsSwappable = 1;
SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
break;
@@ -512,7 +515,6 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
// permute control vectors (for shift values 1, 2, 3). However,
// VPERM has a more restrictive register class.
case PPC::XXSLDWI:
- case PPC::XXSPLTW:
break;
}
}
@@ -690,6 +692,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
unsigned UseReg = MI->getOperand(0).getReg();
MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+ unsigned DefReg = DefMI->getOperand(0).getReg();
int DefIdx = SwapMap[DefMI];
if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
@@ -705,6 +708,25 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
DEBUG(MI->dump());
DEBUG(dbgs() << "\n");
}
+
+ // Ensure all uses of the register defined by DefMI feed store
+ // instructions
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+ int UseIdx = SwapMap[&UseMI];
+
+ if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
+ SwapVector[Repr].WebRejected = 1;
+
+ DEBUG(dbgs() <<
+ format("Web %d rejected for swap not feeding only stores\n",
+ Repr));
+ DEBUG(dbgs() << " def " << " : ");
+ DEBUG(DefMI->dump());
+ DEBUG(dbgs() << " use " << UseIdx << ": ");
+ DEBUG(SwapVector[UseIdx].VSEMI->dump());
+ DEBUG(dbgs() << "\n");
+ }
+ }
}
}
@@ -803,12 +825,21 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
llvm_unreachable("Unexpected splat opcode");
case PPC::VSPLTB: NElts = 16; break;
case PPC::VSPLTH: NElts = 8; break;
- case PPC::VSPLTW: NElts = 4; break;
+ case PPC::VSPLTW:
+ case PPC::XXSPLTW: NElts = 4; break;
}
- unsigned EltNo = MI->getOperand(1).getImm();
+ unsigned EltNo;
+ if (MI->getOpcode() == PPC::XXSPLTW)
+ EltNo = MI->getOperand(2).getImm();
+ else
+ EltNo = MI->getOperand(1).getImm();
+
EltNo = (EltNo + NElts / 2) % NElts;
- MI->getOperand(1).setImm(EltNo);
+ if (MI->getOpcode() == PPC::XXSPLTW)
+ MI->getOperand(2).setImm(EltNo);
+ else
+ MI->getOperand(1).setImm(EltNo);
DEBUG(dbgs() << " Into: ");
DEBUG(MI->dump());
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index 01233ae4f578..f1d4ca7b7fab 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -589,6 +589,17 @@ entry:
%tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
ret i32 %tmp34
}
+
+//===---------------------------------------------------------------------===//
+For the following code:
+
+void foo (float *__restrict__ a, int *__restrict__ b, int n) {
+ a[n] = b[n] * 2.321;
+}
+
+we load b[n] into a GPR, then move it to a VSX register and convert it to
+floating point. We should use VSX scalar integer load instructions to avoid the
+direct moves.
+
//===----------------------------------------------------------------------===//
; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
diff --git a/lib/Target/PowerPC/README_P9.txt b/lib/Target/PowerPC/README_P9.txt
new file mode 100644
index 000000000000..d56f7cca7b21
--- /dev/null
+++ b/lib/Target/PowerPC/README_P9.txt
@@ -0,0 +1,605 @@
+//===- README_P9.txt - Notes for improving Power9 code gen ----------------===//
+
+TODO: Instructions that need intrinsics or a mapping to LLVM IR
+
+Altivec:
+- Vector Compare Not Equal (Zero):
+ vcmpneb(.) vcmpneh(.) vcmpnew(.)
+ vcmpnezb(.) vcmpnezh(.) vcmpnezw(.)
+ . Same as other VCMP*, use VCMP/VCMPo form (support intrinsic)
+
+- Vector Extract Unsigned: vextractub vextractuh vextractuw vextractd
+ . Don't use llvm extractelement because they have different semantics
+ . Use intrinsics:
+ (set v2i64:$vD, (int_ppc_altivec_vextractub v16i8:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractuh v8i16:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractuw v4i32:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vextractd v2i64:$vA, imm:$UIMM))
+
+- Vector Extract Unsigned Byte Left/Right-Indexed:
+ vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
+ . Use intrinsics:
+ // Left-Indexed
+ (set i64:$rD, (int_ppc_altivec_vextublx i64:$rA, v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuhlx i64:$rA, v8i16:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuwlx i64:$rA, v4i32:$vB))
+
+ // Right-Indexed
+ (set i64:$rD, (int_ppc_altivec_vextubrx i64:$rA, v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuhrx i64:$rA, v8i16:$vB))
+ (set i64:$rD, (int_ppc_altivec_vextuwrx i64:$rA, v4i32:$vB))
+
+- Vector Insert Element Instructions: vinsertb vinsertd vinserth vinsertw
+ (set v16i8:$vD, (int_ppc_altivec_vinsertb v16i8:$vA, imm:$UIMM))
+ (set v2i64:$vD, (int_ppc_altivec_vinsertd v2i64:$vA, imm:$UIMM))
+ (set v8i16:$vD, (int_ppc_altivec_vinserth v8i16:$vA, imm:$UIMM))
+ (set v4i32:$vD, (int_ppc_altivec_vinsertw v4i32:$vA, imm:$UIMM))
+
+- Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]:
+ vclzlsbb vctzlsbb
+ . Use intrinsic:
+ (set i64:$rD, (int_ppc_altivec_vclzlsbb v16i8:$vB))
+ (set i64:$rD, (int_ppc_altivec_vctzlsbb v16i8:$vB))
+
+- Vector Count Trailing Zeros: vctzb vctzh vctzw vctzd
+ . Map to llvm cttz
+ (set v16i8:$vD, (cttz v16i8:$vB)) // vctzb
+ (set v8i16:$vD, (cttz v8i16:$vB)) // vctzh
+ (set v4i32:$vD, (cttz v4i32:$vB)) // vctzw
+ (set v2i64:$vD, (cttz v2i64:$vB)) // vctzd
+
+- Vector Extend Sign: vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+ . vextsb2w (see the scalar sketch after this item):
+ (set v4i32:$vD, (sext v4i8:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 3
+ VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].byte[3])
+ end
+
+ . vextsh2w:
+ (set v4i32:$vD, (sext v4i16:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 3
+ VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].hword[1])
+ end
+
+ . vextsb2d
+ (set v2i64:$vD, (sext v2i8:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].byte[7])
+ end
+
+ . vextsh2d
+ (set v2i64:$vD, (sext v2i16:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].hword[3])
+ end
+
+ . vextsw2d
+ (set v2i64:$vD, (sext v2i32:$vB))
+
+ // PowerISA_V3.0:
+ do i = 0 to 1
+ VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].word[1])
+ end
+
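+  A scalar C++ sketch of vextsb2w, meant only to restate the element selection
+  in the pseudo-code above (sign-extend byte 3, i.e. the low-order byte, of
+  each word); this is illustrative, not proposed implementation code:
+
+    // Scalar model of vextsb2w per the PowerISA pseudo-code quoted above.
+    #include <cstdint>
+    void vextsb2w_model(const uint32_t Src[4], uint32_t Dst[4]) {
+      for (int i = 0; i < 4; ++i) {
+        int8_t LowByte = static_cast<int8_t>(Src[i] & 0xFF); // word[i].byte[3]
+        Dst[i] = static_cast<uint32_t>(static_cast<int32_t>(LowByte));
+      }
+    }
+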
+- Vector Integer Negate: vnegw vnegd
+ . Map to llvm ineg
+ (set v4i32:$rT, (ineg v4i32:$rA)) // vnegw
+ (set v2i64:$rT, (ineg v2i64:$rA)) // vnegd
+
+- Vector Parity Byte: vprtybw vprtybd vprtybq
+ . Use intrinsic:
+ (set v4i32:$rD, (int_ppc_altivec_vprtybw v4i32:$vB))
+ (set v2i64:$rD, (int_ppc_altivec_vprtybd v2i64:$vB))
+ (set v1i128:$rD, (int_ppc_altivec_vprtybq v1i128:$vB))
+
+- Vector (Bit) Permute (Right-indexed):
+ . vbpermd: Same as "vbpermq", use VX1_Int_Ty2:
+ VX1_Int_Ty2<1484, "vbpermd", int_ppc_altivec_vbpermd, v2i64, v2i64>;
+
+ . vpermr: use VA1a_Int_Ty3
+ VA1a_Int_Ty3<59, "vpermr", int_ppc_altivec_vpermr, v16i8, v16i8, v16i8>;
+
+- Vector Rotate Left Mask/Mask-Insert: vrlwnm vrlwmi vrldnm vrldmi
+ . Use intrinsic:
+ VX1_Int_Ty<389, "vrlwnm", int_ppc_altivec_vrlwnm, v4i32>;
+ VX1_Int_Ty<133, "vrlwmi", int_ppc_altivec_vrlwmi, v4i32>;
+ VX1_Int_Ty<453, "vrldnm", int_ppc_altivec_vrldnm, v2i64>;
+ VX1_Int_Ty<197, "vrldmi", int_ppc_altivec_vrldmi, v2i64>;
+
+- Vector Shift Left/Right: vslv vsrv
+ . Use intrinsic; don't map to llvm shl and lshr, because they have different
+ semantics, e.g. vslv (see the scalar sketch after this item):
+
+ do i = 0 to 15
+ sh ← VR[VRB].byte[i].bit[5:7]
+ VR[VRT].byte[i] ← src.byte[i:i+1].bit[sh:sh+7]
+ end
+
+ VR[VRT].byte[i] is composed of 2 bytes from src.byte[i:i+1]
+
+ . VX1_Int_Ty<1860, "vslv", int_ppc_altivec_vslv, v16i8>;
+ VX1_Int_Ty<1796, "vsrv", int_ppc_altivec_vsrv, v16i8>;
+
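+  A scalar C++ sketch of vslv, written only to show why a plain shl does not
+  match; it assumes (per the pseudo-code above) big-endian bit numbering and
+  that the byte following the last source byte reads as zero:
+
+    // Scalar model of vslv: each result byte is an 8-bit window taken from
+    // two adjacent source bytes, selected by the low 3 bits of vB's byte.
+    #include <cstdint>
+    void vslv_model(const uint8_t VA[16], const uint8_t VB[16], uint8_t VT[16]) {
+      for (int i = 0; i < 16; ++i) {
+        unsigned Sh = VB[i] & 0x7;                    // vB.byte[i].bit[5:7]
+        unsigned Next = (i + 1 < 16) ? VA[i + 1] : 0; // assumed zero pad
+        unsigned Pair = (static_cast<unsigned>(VA[i]) << 8) | Next;
+        VT[i] = static_cast<uint8_t>((Pair >> (8 - Sh)) & 0xFF);
+      }
+    }
+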
+- Vector Multiply-by-10 (& Write Carry) Unsigned Quadword:
+ vmul10uq vmul10cuq
+ . Use intrinsic:
+ VX1_Int_Ty<513, "vmul10uq", int_ppc_altivec_vmul10uq, v1i128>;
+ VX1_Int_Ty< 1, "vmul10cuq", int_ppc_altivec_vmul10cuq, v1i128>;
+
+- Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword:
+ vmul10euq vmul10ecuq
+ . Use intrinsic:
+ VX1_Int_Ty<577, "vmul10euq", int_ppc_altivec_vmul10euq, v1i128>;
+ VX1_Int_Ty< 65, "vmul10ecuq", int_ppc_altivec_vmul10ecuq, v1i128>;
+
+- Decimal Convert From/to National/Zoned/Signed-QWord:
+ bcdcfn. bcdcfz. bcdctn. bcdctz. bcdcfsq. bcdctsq.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfno v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfzo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctno v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctzo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdcfsqo v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcdctsqo v1i128:$vB))
+
+- Decimal Copy-Sign/Set-Sign: bcdcpsgn. bcdsetsgn.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdcpsgno v1i128:$vA, v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdsetsgno v1i128:$vB, i1:$PS))
+
+- Decimal Shift/Unsigned-Shift/Shift-and-Round: bcds. bcdus. bcdsr.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdso v1i128:$vA, v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
+ (set v1i128:$vD, (int_ppc_altivec_bcdsro v1i128:$vA, v1i128:$vB, i1:$PS))
+
+ . Note! Only 1 byte of their VA is accessed, i.e. VA.byte[7]
+
+- Decimal (Unsigned) Truncate: bcdtrunc. bcdutrunc.
+ . Use intrinsics:
+ (set v1i128:$vD, (int_ppc_altivec_bcdso v1i128:$vA, v1i128:$vB, i1:$PS))
+ (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
+
+ . Note! Only 2 bytes of their VA are accessed, i.e. VA.hword[3] (VA.bit[48:63])
+
+VSX:
+- QP Copy Sign: xscpsgnqp
+ . Similar to xscpsgndp
+ . (set f128:$vT, (fcopysign f128:$vB, f128:$vA))
+
+- QP Absolute/Negative-Absolute/Negate: xsabsqp xsnabsqp xsnegqp
+ . Similar to xsabsdp/xsnabsdp/xsnegdp
+ . (set f128:$vT, (fabs f128:$vB)) // xsabsqp
+ (set f128:$vT, (fneg (fabs f128:$vB))) // xsnabsqp
+ (set f128:$vT, (fneg f128:$vB)) // xsnegqp
+
+- QP Add/Divide/Multiply/Subtract/Square-Root:
+ xsaddqp xsdivqp xsmulqp xssubqp xssqrtqp
+ . Similar to xsadddp
+ . isCommutable = 1
+ (set f128:$vT, (fadd f128:$vA, f128:$vB)) // xsaddqp
+ (set f128:$vT, (fmul f128:$vA, f128:$vB)) // xsmulqp
+
+ . isCommutable = 0
+ (set f128:$vT, (fdiv f128:$vA, f128:$vB)) // xsdivqp
+ (set f128:$vT, (fsub f128:$vA, f128:$vB)) // xssubqp
+ (set f128:$vT, (fsqrt f128:$vB)) // xssqrtqp
+
+- Round to Odd of QP Add/Divide/Multiply/Subtract/Square-Root:
+ xsaddqpo xsdivqpo xsmulqpo xssubqpo xssqrtqpo
+ . Similar to xsrsqrtedp??
+ def XSRSQRTEDP : XX2Form<60, 74,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+ . Define DAG Node in PPCInstrInfo.td:
+ def PPCfaddrto: SDNode<"PPCISD::FADDRTO", SDTFPBinOp, []>;
+ def PPCfdivrto: SDNode<"PPCISD::FDIVRTO", SDTFPBinOp, []>;
+ def PPCfmulrto: SDNode<"PPCISD::FMULRTO", SDTFPBinOp, []>;
+ def PPCfsubrto: SDNode<"PPCISD::FSUBRTO", SDTFPBinOp, []>;
+ def PPCfsqrtrto: SDNode<"PPCISD::FSQRTRTO", SDTFPUnaryOp, []>;
+
+ DAG patterns of each instruction (PPCInstrVSX.td):
+ . isCommutable = 1
+ (set f128:$vT, (PPCfaddrto f128:$vA, f128:$vB)) // xsaddqpo
+ (set f128:$vT, (PPCfmulrto f128:$vA, f128:$vB)) // xsmulqpo
+
+ . isCommutable = 0
+ (set f128:$vT, (PPCfdivrto f128:$vA, f128:$vB)) // xsdivqpo
+ (set f128:$vT, (PPCfsubrto f128:$vA, f128:$vB)) // xssubqpo
+ (set f128:$vT, (PPCfsqrtrto f128:$vB)) // xssqrtqpo
+
+- QP (Negative) Multiply-{Add/Subtract}: xsmaddqp xsmsubqp xsnmaddqp xsnmsubqp
+ . Ref: xsmaddadp/xsmsubadp/xsnmaddadp/xsnmsubadp
+
+ . isCommutable = 1
+ // xsmaddqp
+ [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsmsubqp
+ [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmaddqp
+ [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmsubqp
+ [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+- Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
+ xsmaddqpo xsmsubqpo xsnmaddqpo xsnmsubqpo
+ . Similar to xsrsqrtedp??
+
+ . Define DAG Node in PPCInstrInfo.td:
+ def PPCfmarto: SDNode<"PPCISD::FMARTO", SDTFPTernaryOp, []>;
+
+ It looks like we only need to define "PPCfmarto" for these instructions,
+ because according to PowerISA_V3.0, these instructions perform RTO on
+ fma's result:
+ xsmaddqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, src2)
+ rnd ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsmsubqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+ rnd ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsnmaddqp(o)
+ v ← bfp_MULTIPLY_ADD(src1,src3,src2)
+ rnd ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ xsnmsubqp(o)
+ v ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+ rnd ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+ result ← bfp_CONVERT_TO_BFP128(rnd)
+
+ DAG patterns of each instruction (PPCInstrVSX.td):
+ . isCommutable = 1
+ // xsmaddqpo
+ [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsmsubqpo
+ [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmaddqpo
+ [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+ // xsnmsubqpo
+ [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ AltVSXFMARel;
+
+- QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
+ . ref: XSCMPUDP
+ def XSCMPUDP : XX3Form_1<60, 35,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ . No SDAG, intrinsic, builtin are required??
+ Or llvm fcmp order/unorder compare??
+
+- DP/QP Compare Exponents: xscmpexpdp xscmpexpqp
+ . No SDAG, intrinsic, builtin are required?
+
+- DP Compare ==, >=, >, !=: xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+ . I checked the existing instruction "XSCMPUDP". They differ in the target
+ register: "XSCMPUDP" writes to a CR field, xscmp*dp write to a VSX register
+
+ . Use intrinsic:
+ (set i128:$XT, (int_ppc_vsx_xscmpeqdp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpgedp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpgtdp f64:$XA, f64:$XB))
+ (set i128:$XT, (int_ppc_vsx_xscmpnedp f64:$XA, f64:$XB))
+
+- Vector Compare Not Equal: xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+ . Similar to xvcmpeqdp:
+ defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+ "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+ int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
+
+ . So we should use "XX3Form_Rcr" to implement the intrinsic
+
+- Convert DP -> QP: xscvdpqp
+ . Similar to XSCVDPSP:
+ def XSCVDPSP : XX2Form<60, 265,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+ . So, No SDAG, intrinsic, builtin are required??
+
+- Round & Convert QP -> DP (dword[1] is set to zero): xscvqpdp xscvqpdpo
+ . Similar to XSCVDPSP
+ . No SDAG, intrinsic, builtin are required??
+
+- Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero):
+ xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+ . According to PowerISA_V3.0, these are similar to "XSCVDPSXDS", "XSCVDPSXWS",
+ "XSCVDPUXDS", "XSCVDPUXWS"
+
+ . DAG patterns:
+ (set f128:$XT, (PPCfctidz f128:$XB)) // xscvqpsdz
+ (set f128:$XT, (PPCfctiwz f128:$XB)) // xscvqpswz
+ (set f128:$XT, (PPCfctiduz f128:$XB)) // xscvqpudz
+ (set f128:$XT, (PPCfctiwuz f128:$XB)) // xscvqpuwz
+
+- Convert (Un)Signed DWord -> QP: xscvsdqp xscvudqp
+ . Similar to XSCVSXDSP
+ . (set f128:$XT, (PPCfcfids f64:$XB)) // xscvsdqp
+ (set f128:$XT, (PPCfcfidus f64:$XB)) // xscvudqp
+
+- (Round &) Convert DP <-> HP: xscvdphp xscvhpdp
+ . Similar to XSCVDPSP
+ . No SDAG, intrinsic, builtin are required??
+
+- Vector HP -> SP: xvcvhpsp xvcvsphp
+ . Similar to XVCVDPSP:
+ def XVCVDPSP : XX2Form<60, 393,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
+ . No SDAG, intrinsic, builtin are required??
+
+- Round to Quad-Precision Integer: xsrqpi xsrqpix
+ . These are a combination of "XSRDPI", "XSRDPIC", "XSRDPIM", .., because you
+ need to assign the rounding mode in the instruction
+ . Provide builtin?
+ (set f128:$vT, (int_ppc_vsx_xsrqpi f128:$vB))
+ (set f128:$vT, (int_ppc_vsx_xsrqpix f128:$vB))
+
+- Round Quad-Precision to Double-Extended Precision (fp80): xsrqpxp
+ . Provide builtin?
+ (set f128:$vT, (int_ppc_vsx_xsrqpxp f128:$vB))
+
+Fixed Point Facility:
+
+- Exploit cmprb and cmpeqb (perhaps for something like
+ isalpha/isdigit/isupper/islower and isspace respectively). This can
+ perhaps be done through a builtin; a sketch of the kind of source pattern
+ this targets follows this item.
+
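+  An ASCII-only sketch of the kind of source pattern such a builtin or
+  pattern match could target (illustrative only; no builtin name or lowering
+  is defined yet):
+
+    // isalpha-style check: one byte tested against two inclusive ranges,
+    // which is the shape of test that Compare Ranged Byte evaluates.
+    bool IsAsciiAlpha(unsigned char C) {
+      return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z');
+    }
+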
+- Provide testing for cnttz[dw]
+- Insert Exponent DP/QP: xsiexpdp xsiexpqp
+ . Use intrinsic?
+ . xsiexpdp:
+ // Note: rA and rB are unsigned integer values.
+ (set f128:$XT, (int_ppc_vsx_xsiexpdp i64:$rA, i64:$rB))
+
+ . xsiexpqp:
+ (set f128:$vT, (int_ppc_vsx_xsiexpqp f128:$vA, f64:$vB))
+
+- Extract Exponent/Significand DP/QP: xsxexpdp xsxsigdp xsxexpqp xsxsigqp
+ . Use intrinsic?
+ . (set i64:$rT, (int_ppc_vsx_xsxexpdp f64:$XB)) // xsxexpdp
+ (set i64:$rT, (int_ppc_vsx_xsxsigdp f64:$XB)) // xsxsigdp
+ (set f128:$vT, (int_ppc_vsx_xsxexpqp f128:$vB)) // xsxexpqp
+ (set f128:$vT, (int_ppc_vsx_xsxsigqp f128:$vB)) // xsxsigqp
+
+- Vector Insert Word: xxinsertw
+ - Useful for inserting f32/i32 elements into vectors (the element to be
+ inserted needs to be prepared)
+ . Note: llvm has insertelem in "Vector Operations"
+ ; yields <n x <ty>>
+ <result> = insertelement <n x <ty>> <val>, <ty> <elt>, <ty2> <idx>
+
+ But how to map to it??
+ [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+
+ . Or use intrinsic?
+ (set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))
+
+- Vector Extract Unsigned Word: xxextractuw
+ - Not useful for extraction of f32 from v4f32 (the current pattern is better -
+ shift->convert)
+ - It is useful for (uint_to_fp (vector_extract v4i32, N))
+ - Unfortunately, it can't be used for (sint_to_fp (vector_extract v4i32, N))
+ . Note: llvm has extractelement in "Vector Operations"
+ ; yields <ty>
+ <result> = extractelement <n x <ty>> <val>, <ty2> <idx>
+
+ How to map to it??
+ [(set f128:$XT, (extractelement v1f128:$XB, i4:$UIMM))]
+
+ . Or use intrinsic?
+ (set f128:$XT, (int_ppc_vsx_xxextractuw v1f128:$XB, i4:$UIMM))
+
+- Vector Insert Exponent DP/SP: xviexpdp xviexpsp
+ . Use intrinsic
+ (set v2f64:$XT, (int_ppc_vsx_xviexpdp v2f64:$XA, v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xviexpsp v4f32:$XA, v4f32:$XB))
+
+- Vector Extract Exponent/Significand DP/SP: xvxexpdp xvxexpsp xvxsigdp xvxsigsp
+ . Use intrinsic
+ (set v2f64:$XT, (int_ppc_vsx_xvxexpdp v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xvxexpsp v4f32:$XB))
+ (set v2f64:$XT, (int_ppc_vsx_xvxsigdp v2f64:$XB))
+ (set v4f32:$XT, (int_ppc_vsx_xvxsigsp v4f32:$XB))
+
+- Test Data Class SP/DP/QP: xststdcsp xststdcdp xststdcqp
+ . No SDAG, intrinsic, builtin are required?
+ Because it seems that we have no way to map BF field?
+
+ Instruction Form: [PO T XO B XO BX TX]
+ Asm: xststd* BF,XB,DCMX
+
+ BF is an index to CR register field.
+
+- Vector Test Data Class SP/DP: xvtstdcsp xvtstdcdp
+ . Use intrinsic
+ (set v4f32:$XT, (int_ppc_vsx_xvtstdcsp v4f32:$XB, i7:$DCMX))
+ (set v2f64:$XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, i7:$DCMX))
+
+- Maximum/Minimum Type-C/Type-J DP: xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+ . PowerISA_V3.0:
+ "xsmaxcdp can be used to implement the C/C++/Java conditional operation
+ (x>y)?x:y for single-precision and double-precision arguments."
+
+ Note! c type and j type have different behavior when:
+ 1. Either input is NaN
+ 2. Both inputs are +-Infinity, +-Zero
+
+ . c type maps to llvm fmaxnum/fminnum (see the scalar sketch after this
+ item); j type uses an intrinsic
+
+ . xsmaxcdp xsmincdp
+ (set f64:$XT, (fmaxnum f64:$XA, f64:$XB))
+ (set f64:$XT, (fminnum f64:$XA, f64:$XB))
+
+ . xsmaxjdp xsminjdp
+ (set f64:$XT, (int_ppc_vsx_xsmaxjdp f64:$XA, f64:$XB))
+ (set f64:$XT, (int_ppc_vsx_xsminjdp f64:$XA, f64:$XB))
+
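+  A scalar C++ sketch of the C-type shape quoted above (illustrative only; it
+  shows the (x>y)?x:y form, not the NaN and signed-zero corner cases the note
+  warns about, and the minimum is the assumed analogous form):
+
+    // C-type maximum/minimum as described by the quoted ISA text.
+    double MaxTypeC(double X, double Y) { return (X > Y) ? X : Y; }
+    double MinTypeC(double X, double Y) { return (X < Y) ? X : Y; }
+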
+- Vector Byte-Reverse H/W/D/Q Word: xxbrh xxbrw xxbrd xxbrq
+ . Use intrinsic
+ (set v8i16:$XT, (int_ppc_vsx_xxbrh v8i16:$XB))
+ (set v4i32:$XT, (int_ppc_vsx_xxbrw v4i32:$XB))
+ (set v2i64:$XT, (int_ppc_vsx_xxbrd v2i64:$XB))
+ (set v1i128:$XT, (int_ppc_vsx_xxbrq v1i128:$XB))
+
+- Vector Permute: xxperm xxpermr
+ . I have checked "PPCxxswapd" in PPCInstrVSX.td, but they are different
+ . Use intrinsic
+ (set v16i8:$XT, (int_ppc_vsx_xxperm v16i8:$XA, v16i8:$XB))
+ (set v16i8:$XT, (int_ppc_vsx_xxpermr v16i8:$XA, v16i8:$XB))
+
+- Vector Splat Immediate Byte: xxspltib
+ . Similar to XXSPLTW:
+ def XXSPLTW : XX2Form_2<60, 164,
+ (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+
+ . No SDAG, intrinsic, builtin are required?
+
+- Load/Store Vector: lxv stxv
+ . Has likely SDAG match:
+ (set v?:$XT, (load ix16addr:$src))
+ (store v?:$XT, ix16addr:$dst)
+
+ . Need to define ix16addr in PPCInstrInfo.td
+ ix16addr: 16-byte aligned, see "def memrix16" in PPCInstrInfo.td
+
+- Load/Store Vector Indexed: lxvx stxvx
+ . Has likely SDAG match:
+ (set v?:$XT, (load xoaddr:$src))
+ (store v?:$XT, xoaddr:$dst)
+
+- Load/Store DWord: lxsd stxsd
+ . Similar to lxsdx/stxsdx:
+ def LXSDX : XX1Form<31, 588,
+ (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsdx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (load xoaddr:$src))]>;
+
+ . (set f64:$XT, (load ixaddr:$src))
+ (store f64:$XT, ixaddr:$dst)
+
+- Load/Store SP, with conversion from/to DP: lxssp stxssp
+ . Similar to lxsspx/stxsspx:
+ def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+ "lxsspx $XT, $src", IIC_LdStLFD,
+ [(set f32:$XT, (load xoaddr:$src))]>;
+
+ . (set f32:$XT, (load ixaddr:$src))
+ (store f32:$XT, ixaddr:$dst)
+
+- Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx
+ . Similar to lxsiwzx:
+ def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsiwzx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+
+ . (set f64:$XT, (PPClfiwzx xoaddr:$src))
+
+- Store as Integer Byte/Halfword Indexed: stxsibx stxsihx
+ . Similar to stxsiwx:
+ def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+ "stxsiwx $XT, $dst", IIC_LdStSTFD,
+ [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+
+ . (PPCstfiwx f64:$XT, xoaddr:$dst)
+
+- Load Vector Halfword*8/Byte*16 Indexed: lxvh8x lxvb16x
+ . Similar to lxvd2x/lxvw4x:
+ def LXVD2X : XX1Form<31, 844,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvd2x $XT, $src", IIC_LdStLFD,
+ [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
+
+ . (set v8i16:$XT, (int_ppc_vsx_lxvh8x xoaddr:$src))
+ (set v16i8:$XT, (int_ppc_vsx_lxvb16x xoaddr:$src))
+
+- Store Vector Halfword*8/Byte*16 Indexed: stxvh8x stxvb16x
+ . Similar to stxvd2x/stxvw4x:
+ def STXVD2X : XX1Form<31, 972,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvd2x $XT, $dst", IIC_LdStSTFD,
+ [(store v2f64:$XT, xoaddr:$dst)]>;
+
+ . (store v8i16:$XT, xoaddr:$dst)
+ (store v16i8:$XT, xoaddr:$dst)
+
+- Load/Store Vector (Left-justified) with Length: lxvl lxvll stxvl stxvll
+ . Likely needs an intrinsic
+ . (set v?:$XT, (int_ppc_vsx_lxvl xoaddr:$src))
+ (set v?:$XT, (int_ppc_vsx_lxvll xoaddr:$src))
+
+ . (int_ppc_vsx_stxvl xoaddr:$dst)
+ (int_ppc_vsx_stxvll xoaddr:$dst)
+
+- Load Vector Word & Splat Indexed: lxvwsx
+ . Likely needs an intrinsic
+ . (set v?:$XT, (int_ppc_vsx_lxvwsx xoaddr:$src))
+
+Atomic operations (l[dw]at, st[dw]at):
+- Provide custom lowering for common atomic operations to use these
+ instructions with the correct Function Code
+- Ensure the operands are in the correct register (i.e. RT+1, RT+2)
+- Provide builtins since not all FC's necessarily have an existing LLVM
+ atomic operation
+
+Load Doubleword Monitored (ldmx):
+- Investigate whether there are any uses for this. It seems to be related to
+ Garbage Collection so it isn't likely to be all that useful for most
+ languages we deal with.
+
+Move to CR from XER Extended (mcrxrx):
+- Is there a use for this in LLVM?
+
+Fixed Point Facility:
+
+- Copy-Paste Facility: copy copy_first cp_abort paste paste. paste_last
+ . Use intrinsics:
+ (int_ppc_copy_first i32:$rA, i32:$rB)
+ (int_ppc_copy i32:$rA, i32:$rB)
+
+ (int_ppc_paste i32:$rA, i32:$rB)
+ (int_ppc_paste_last i32:$rA, i32:$rB)
+
+ (int_cp_abort)
+
+- Message Synchronize: msgsync
+- SLB*: slbieg slbsync
+- stop
+ . No intrinsics
diff --git a/lib/Target/PowerPC/TargetInfo/Makefile b/lib/Target/PowerPC/TargetInfo/Makefile
deleted file mode 100644
index 2d0560d275f9..000000000000
--- a/lib/Target/PowerPC/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/PowerPC/TargetInfo/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-override CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/p9-instrs.txt b/lib/Target/PowerPC/p9-instrs.txt
new file mode 100644
index 000000000000..a70582aca398
--- /dev/null
+++ b/lib/Target/PowerPC/p9-instrs.txt
@@ -0,0 +1,442 @@
+Content:
+========
+. Remaining Instructions (Total 56 Instructions, including 2 unknown instructions)
+. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
+
+//------------------------------------------------------------------------------
+//. Remaining Instructions
+//------------------------------------------------------------------------------
+GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+// Add PC Immediate Shifted DX-form p69
+[PO RT d1 d0 XO d2] addpcis RT,D
+ subpcis Rx,value = addpcis Rx,-value
+
+// 6.17.2 Decimal Integer Format Conversion Instructions
+
+// Decimal Convert From National VX-form p352
+[PO VRT EO VRB 1 PS XO] bcdcfn. VRT,VRB,PS
+
+// Decimal Convert From Zoned VX-form p353
+[PO VRT EO VRB 1 PS XO] bcdcfz. VRT,VRB,PS
+
+// Decimal Convert To National VX-form p354
+[PO VRT EO VRB 1 / XO] bcdctn. VRT,VRB
+
+// Decimal Convert To Zoned VX-form p355
+[PO VRT EO VRB 1 PS XO] bcdctz. VRT,VRB,PS
+
+// Decimal Convert From Signed Quadword VX-form p356
+[PO VRT EO VRB 1 PS XO] bcdcfsq. VRT,VRB,PS
+
+// Decimal Convert To Signed Quadword VX-form p356
+[PO VRT EO VRB 1 / XO] bcdctsq. VRT,VRB
+
+// 6.17.3 Decimal Integer Sign Manipulation Instructions
+
+// Decimal Copy Sign VX-form p358
+[PO VRT VRA VRB XO] bcdcpsgn. VRT,VRA,VRB
+
+// Decimal Set Sign VX-form p358
+[PO VRT EO VRB 1 PS XO] bcdsetsgn. VRT,VRB,PS
+
+// Decimal Shift VX-form p359
+[PO VRT VRA VRB 1 PS XO] bcds. VRT,VRA,VRB,PS
+
+// Decimal Unsigned Shift VX-form p360
+[PO VRT VRA VRB 1 / XO] bcdus. VRT,VRA,VRB
+
+// Decimal Shift and Round VX-form p361
+[PO VRT VRA VRB 1 PS XO] bcdsr. VRT,VRA,VRB,PS
+
+// 6.17.5 Decimal Integer Truncate Instructions
+
+// Decimal Truncate VX-form p362
+[PO VRT VRA VRB 1 PS XO] bcdtrunc. VRT,VRA,VRB,PS
+
+// Decimal Unsigned Truncate VX-form p363
+[PO VRT VRA VRB 1 / XO] bcdutrunc. VRT,VRA,VRB
+
+// 3.3.10.1 Character-Type Compare Instructions
+
+// Compare Ranged Byte X-form p87
+[PO BF / L RA RB XO /] cmprb BF,L,RA,RB
+
+// Compare Equal Byte X-form p88
+[PO BF // RA RB XO /] cmpeqb BF,RA,RB
+
+// 3.3.13 Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Word X-form p95
+[PO RS RA /// XO Rc] cnttzw(.) RA,RS
+
+// 3.3.13.1 64-bit Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Doubleword X-form p98
+[PO RS RA /// XO Rc] cnttzd(.) RA,RS
+
+// 4.4 Copy-Paste Facility
+
+// Copy X-form p858
+[PO /// L RA RB XO /] copy RA,RB,L
+ copy_first = copy RA, RB, 1
+// CP_Abort p860
+[PO /// /// /// XO /] cp_abort
+
+// Paste p859
+[PO /// L RA RB XO Rc] paste(.) RA,RB,L
+ paste_last = paste RA,RB,1
+
+// 3.3.9 Fixed-Point Arithmetic Instructions
+
+// Deliver A Random Number X-form p79
+[PO RT /// L /// XO /] darn RT,L
+
+// Multiply-Add High Doubleword VA-form p81
+[PO RT RA RB RC XO] maddhd RT,RA,RB,RC
+
+// Multiply-Add High Doubleword Unsigned VA-form p81
+[PO RT RA RB RC XO] maddhdu RT,RA,RB,RC
+
+// Multiply-Add Low Doubleword VA-form p81
+[PO RT RA RB RC XO] maddld RT,RA,RB,RC
+
+// Modulo Signed Word X-form p76
+[PO RT RA RB XO /] modsw RT,RA,RB
+
+// Modulo Unsigned Word X-form p76
+[PO RT RA RB XO /] moduw RT,RA,RB
+
+// Modulo Signed Doubleword X-form p84
+[PO RT RA RB XO /] modsd RT,RA,RB
+
+// Modulo Unsigned Doubleword X-form p84
+[PO RT RA RB XO /] modud RT,RA,RB
+
+
+// DFP Test Significance Immediate [Quad] X-form p204
+[PO BF / UIM FRB XO /] dtstsfi BF,UIM,FRB
+[PO BF / UIM FRBp XO /] dtstsfiq BF,UIM,FRBp
+
+// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
+
+// Extend-Sign Word and Shift Left Immediate XS-form p109
+[PO RS RA sh XO sh Rc] extswsli(.) RA,RS,SH
+
+// 4.5.1 Load Atomic
+
+// Load Word Atomic X-form p864
+[PO RT RA FC XO /] lwat RT,RA,FC
+
+// Load Doubleword Atomic X-form p864
+[PO RT RA FC XO /] ldat RT,RA,FC
+
+// 4.5.2 Store Atomic
+
+// Store Word Atomic X-form p866
+[PO RS RA FC XO /] stwat RS,RA,FC
+
+// Store Doubleword Atomic X-form p866
+[PO RS RA FC XO /] stdat RS,RA,FC
+
+// 3.3.2.1 64-bit Fixed-Point Load Instructions
+
+// Load Doubleword Monitored Indexed X-form p54
+[PO RT RA RB XO /] ldmx RT,RA,RB
+
+// 3.3.16 Move To/From Vector-Scalar Register Instructions
+
+// Move From VSR Lower Doubleword XX1-form p111
+[PO S RA /// XO SX] mfvsrld RA,XS
+
+// Move To VSR Double Doubleword XX1-form p114
+[PO T RA RB XO TX] mtvsrdd XT,RA,RB
+
+// Move To VSR Word & Splat XX1-form p115
+[PO T RA /// XO TX] mtvsrws XT,RA
+
+// Move to CR from XER Extended X-form p119
+[PO BF // /// /// XO /] mcrxrx BF
+
+// Set Boolean X-form p121
+[PO RT BFA // /// XO /] setb RT,BFA
+
+// Message Synchronize X-form p1126
+[PO /// /// /// XO /] msgsync
+
+// SLB Invalidate Entry Global X-form p1026
+[PO RS /// RB XO /] slbieg RS,RB
+
+// SLB Synchronize X-form p1031
+[PO /// /// /// XO /] slbsync
+
+// 3.3.2.1 Power-Saving Mode Instruction
+
+// stop XL-form p957
+[PO /// /// /// XO /] stop
+
+// 4.6.4 Wait Instruction
+// Wait X-form p880
+[PO /// WC /// /// XO /] wait
+
+// Unknown Instructions:
+urfid
+- gcc's implementation:
+ {"urfid", XL(19,306), 0xffffffff, POWER9, PPCNONE, {0}},
+ (4c 00 02 64|64 02 00 4c) urfid
+
+rmieg
+- gcc's implementation:
+ {"rmieg", X(31,882), XRTRA_MASK, POWER9, PPCNONE, {RB}},
+ (7c 00 f6 e4|e4 f6 00 7c) rmieg r30
+
+//------------------------------------------------------------------------------
+//. Done:
+//------------------------------------------------------------------------------
+
+//======================================
+"vsx instructions"
+
+//--------------------------------------
+"7.6.1.2.1 VSX Scalar Move Instructions"
+// VSX Scalar Quad-Precision Move Instructions
+
+// VSX Scalar Copy Sign Quad-Precision X-form p.553
+[PO VRT VRA VRB XO /] xscpsgnqp
+
+// VSX Scalar Absolute Quad-Precision X-form 531
+// VSX Scalar Negate Quad-Precision X-form 627
+// VSX Scalar Negative Absolute Quad-Precision X-form 626
+[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
+
+//--------------------------------------
+"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
+
+// VSX Scalar Quad-Precision Elementary Arithmetic
+
+// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
+// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
+// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
+[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
+
+// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
+// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
+ xssubqp xssubqpo
+
+[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
+
+// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
+
+// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
+// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
+// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
+// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
+// X-form 645
+[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo
+ xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
+
+22
+//--------------------------------------
+"7.6.1.4 VSX Floating-Point Compare Instructions"
+
+// VSX Scalar Quad-Precision Compare Instructions
+
+// VSX Scalar Compare Ordered Quad-Precision X-form 549
+// VSX Scalar Compare Unordered Quad-Precision X-form 552
+[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp
+
+"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
+// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
+[PO BF // A B XO AX BX /] xscmpexpdp
+[PO BF // VRA VRB XO /] xscmpexpqp
+
+// VSX Scalar Compare DP, XX3-form, p.543 544 545
+// VSX Scalar Compare Equal Double-Precision,
+[PO T A B XO AX BX TX] xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+
+// VSX Vector Compare Not Equal Double-Precision XX3-form 691
+[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+
+//--------------------------------------
+"7.6.1.5 VSX FP-FP Conversion Instructions"
+// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
+
+// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
+// [using round to Odd] X-form 567
+[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
+[PO VRT XO VRB XO /] xscvdpqp
+
+// VSX Scalar Quad-Precision Convert to Integer Instructions
+
+// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
+// 568 570 572 574
+[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+[PO VRT XO VRB XO /] xscvsdqp xscvudqp (p.576, 580)
+
+"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
+// VSX Scalar round & Convert Double-Precision format to Half-Precision format
+// XX2-form 554 566
+[PO T XO B XO BX TX] xscvdphp xscvhpdp
+
+// VSX Vector Convert Half-Precision format to Single-Precision format
+// XX2-form 703 705
+[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
+
+// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
+[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
+
+// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
+[PO VRT /// R VRB RMC XO /] xsrqpxp
+def XSRQPXP : Z23Form_1<63, 37,
+ (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
+ "xsrqpxp $vT, $R, $vB, $RMC", IIC_VecFP, []>;
+
+27~28
+//--------------------------------------
+// VSX Scalar Insert Exponent Double-Precision X-form 588
+// VSX Scalar Insert Exponent Quad-Precision X-form 589
+[PO VT rA rB XO /] xsiexpdp
+[PO VRT VRA VRB XO /] xsiexpqp
+
+// VSX Vector Insert Exponent Double-Precision XX3-form 722
+[PO T A B XO AX BX TX] xviexpdp xviexpsp
+
+// VSX Vector Extract Unsigned Word XX2-form 788
+// VSX Vector Insert Word XX2-form
+[PO T / UIM B XO BX TX] xxextractuw xxinsertw
+
+// VSX Scalar Extract Exponent Double-Precision XX2-form 676
+[PO BF DCMX B XO BX /]
+[PO T XO B XO BX /] xsxexpdp xsxsigdp
+// X-form
+[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
+
+// VSX Vector Extract Exponent Double-Precision XX2-form 784
+[PO T XO B XO BX TX] xvxexpdp xvxexpsp
+
+// VSX Vector Extract Significand Double-Precision XX2-form 785
+[PO T XO B XO BX TX] xvxsigdp xvxsigsp
+
+//--------------------------------------
+// VSX Scalar Test Data Class Double-Precision XX2-form p673
+// VSX Scalar Test Data Class Quad-Precision X-form 674
+// VSX Scalar Test Data Class Single-Precision XX2-form 675
+[PO BF DCMX B XO BX /] xststdcdp xststdcsp
+[PO BF DCMX VRB XO /] xststdcqp
+
+// VSX Vector Test Data Class Double-Precision XX2-form 782 783
+[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp
+
+//--------------------------------------
+// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
+[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+
+//--------------------------------------
+// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
+[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
+
+// VSX Vector Permute XX3-form 794
+[PO T A B XO AX BX TX] xxperm xxpermr
+
+// VSX Vector Splat Immediate Byte 796 x-form
+[PO T EO IMM8 XO TX] xxspltib (open question: is IMM8 signed or unsigned?)
+
+30
+//--------------------------------------
+// Load VSX Vector DQ-form 511
+[PO T RA DQ TX XO] lxv
+
+// Store VSX Vector DQ-form 526
+[PO S RA DQ SX XO] stxv
+
+// Load VSX Scalar Doubleword DS-form 499
+// Load VSX Scalar Single DS-form 504
+[PO VRT RA DS XO] lxsd lxssp
+
+// Store VSX Scalar Doubleword DS-form 517
+// Store VSX Scalar Single DS-form 520
+[PO VRT RA DS XO] stxsd stxssp
+
+
+// Load VSX Vector Indexed X-form 511
+// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
+// Load VSX Vector Byte*16 Indexed X-form 506
+// Load VSX Vector with Length X-form 508
+// Load VSX Vector Left-justified with Length X-form 510
+// Load VSX Vector Halfword*8 Indexed X-form 514
+// Load VSX Vector Word & Splat Indexed X-form 516
+[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
+
+// Store VSX Scalar as Integer Byte Indexed X-form 518
+// Store VSX Scalar as Integer Halfword Indexed X-form 518
+// Store VSX Vector Byte*16 Indexed X-form 522
+// Store VSX Vector Halfword*8 Indexed X-form 524
+// Store VSX Vector with Length X-form 526
+// Store VSX Vector Left-justified with Length X-form 528
+// Store VSX Vector Indexed X-form 529
+[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
+
+21
+
+//--------------------------------------
+". vector instructions"
+
+[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
+[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+//--------------------------------------
+New patch:
+// vector bit, p.367, 6.16 Vector Bit Permute Instruction
+[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
+
+// vector permute, p.280
+[PO VRT VRA VRB VRC XO] vpermr
+
+// vector rotate left, p.341
+[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
+
+// vector shift, p.285
+[PO VRT VRA VRB XO] vslv vsrv
+
+// vector multiply-by-10, p.375
+[PO VRT VRA /// XO] vmul10cuq vmul10uq
+[PO VRT VRA VRB XO] vmul10ecuq vmul10euq
+
+12
+//--------------------------------------
+http://reviews.llvm.org/D15887 + ext + neg + prty - vbpermd
+// vector count leading/trailing zero
+. new vx-form: p.31, 1.6.14 VX-FORM
+[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
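+
+For reference, a small C++ sketch of pulling the [PO RT EO VRB XO] fields out
+of a 32-bit VX-form word, assuming the usual field boundaries (PO 0-5,
+RT 6-10, EO 11-15, VRB 16-20, XO 21-31) in IBM bit numbering (bit 0 = MSB);
+the helper names are made up for this note:
+
+  #include <cstdint>
+
+  // Extract a field that starts at IBM bit 'bit' and is 'width' bits wide.
+  inline uint32_t vxField(uint32_t insn, unsigned bit, unsigned width) {
+    return (insn >> (32 - bit - width)) & ((1u << width) - 1);
+  }
+
+  inline uint32_t vxPO(uint32_t insn)  { return vxField(insn, 0, 6);   }
+  inline uint32_t vxRT(uint32_t insn)  { return vxField(insn, 6, 5);   }
+  inline uint32_t vxEO(uint32_t insn)  { return vxField(insn, 11, 5);  }
+  inline uint32_t vxVRB(uint32_t insn) { return vxField(insn, 16, 5);  }
+  inline uint32_t vxXO(uint32_t insn)  { return vxField(insn, 21, 11); }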
+
+// Vector Count Trailing Zeros Instructions, 362
+[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
+
+// vector extend sign (p.314)
+[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+
+// vector negate, p.313
+[PO VRT EO VRB XO] vnegd vnegw
+
+// vector parity, p.335
+[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
+
+16
+//--------------------------------------
+// vector compare, p.330
+[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
+ vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
+12
+//--------------------------------------
+http://reviews.llvm.org/D15917 + insert
+// vector extract (p.287) ref: vspltb (v2.07, p.227)
+// vector insert, p.288
+[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
+
+// Vector Extract Unsigned
+[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
+
+// p.364: Vector Extract Unsigned Left/Right-Indexed
+[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
+
+14
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 7e9888cc13e8..ab9a025930fe 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2081,7 +2081,7 @@ struct x testfunc() {
}
We currently compile this to:
-$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
+$ clang t.c -S -o - -O0 -emit-llvm | opt -sroa -S
%struct.x = type { i8, [4 x i32] }
diff --git a/lib/Target/Sparc/AsmParser/Makefile b/lib/Target/Sparc/AsmParser/Makefile
deleted file mode 100644
index 46b3e45f2bed..000000000000
--- a/lib/Target/Sparc/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Sparc/AsmParser/Makefile ------------------*- Makefile-*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSparcAsmParser
-
-# Hack: we need to include 'main' Sparc target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index a55274744fd1..b2003b8f101b 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -7,18 +7,18 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -150,6 +150,22 @@ public:
Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7,
Sparc::I0_I1, Sparc::I2_I3, Sparc::I4_I5, Sparc::I6_I7};
+ static const MCPhysReg CoprocRegs[32] = {
+ Sparc::C0, Sparc::C1, Sparc::C2, Sparc::C3,
+ Sparc::C4, Sparc::C5, Sparc::C6, Sparc::C7,
+ Sparc::C8, Sparc::C9, Sparc::C10, Sparc::C11,
+ Sparc::C12, Sparc::C13, Sparc::C14, Sparc::C15,
+ Sparc::C16, Sparc::C17, Sparc::C18, Sparc::C19,
+ Sparc::C20, Sparc::C21, Sparc::C22, Sparc::C23,
+ Sparc::C24, Sparc::C25, Sparc::C26, Sparc::C27,
+ Sparc::C28, Sparc::C29, Sparc::C30, Sparc::C31 };
+
+ static const MCPhysReg CoprocPairRegs[] = {
+ Sparc::C0_C1, Sparc::C2_C3, Sparc::C4_C5, Sparc::C6_C7,
+ Sparc::C8_C9, Sparc::C10_C11, Sparc::C12_C13, Sparc::C14_C15,
+ Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
+ Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
+
/// SparcOperand - Instances of this class represent a parsed Sparc machine
/// instruction.
class SparcOperand : public MCParsedAsmOperand {
@@ -161,6 +177,8 @@ public:
rk_FloatReg,
rk_DoubleReg,
rk_QuadReg,
+ rk_CoprocReg,
+ rk_CoprocPairReg,
rk_Special,
};
@@ -224,6 +242,9 @@ public:
|| Reg.Kind == rk_DoubleReg));
}
+ bool isCoprocReg() const {
+ return (Kind == k_Register && Reg.Kind == rk_CoprocReg);
+ }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
@@ -398,6 +419,19 @@ public:
return true;
}
+ static bool MorphToCoprocPairReg(SparcOperand &Op) {
+ unsigned Reg = Op.getReg();
+ assert(Op.Reg.Kind == rk_CoprocReg);
+ unsigned regIdx = 32;
+ if (Reg >= Sparc::C0 && Reg <= Sparc::C31)
+ regIdx = Reg - Sparc::C0;
+ if (regIdx % 2 || regIdx > 31)
+ return false;
+ Op.Reg.RegNum = CoprocPairRegs[regIdx / 2];
+ Op.Reg.Kind = rk_CoprocPairReg;
+ return true;
+ }
+
static std::unique_ptr<SparcOperand>
MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
unsigned offsetReg = Op->getReg();
@@ -602,8 +636,12 @@ bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
return Error(Loc, "unexpected token");
}
- while (getLexer().is(AsmToken::Comma)) {
- Parser.Lex(); // Eat the comma.
+ while (getLexer().is(AsmToken::Comma) || getLexer().is(AsmToken::Plus)) {
+ if (getLexer().is(AsmToken::Plus)) {
+      // Plus tokens are significant in software_traps (p83, sparcv8.pdf).
+      // We must capture them.
+      Operands.push_back(
+          SparcOperand::CreateToken("+", Parser.getTok().getLoc()));
+ }
+ Parser.Lex(); // Eat the comma or plus.
// Parse and remember the operand.
if (parseOperand(Operands, Name) != MatchOperand_Success) {
SMLoc Loc = getLexer().getLoc();
@@ -646,6 +684,12 @@ ParseDirective(AsmToken DirectiveID)
Parser.eatToEndOfStatement();
return false;
}
+ if (IDVal == ".proc") {
+ // For compatibility, ignore this directive.
+ // (It's supposed to be an "optimization" in the Sun assembler)
+ Parser.eatToEndOfStatement();
+ return false;
+ }
// Let the MC layer to handle other directives.
return true;
@@ -728,7 +772,7 @@ SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
Parser.getTok().getLoc()));
Parser.Lex(); // Eat the [
- if (Mnemonic == "cas" || Mnemonic == "casx") {
+ if (Mnemonic == "cas" || Mnemonic == "casx" || Mnemonic == "casa") {
SMLoc S = Parser.getTok().getLoc();
if (getLexer().getKind() != AsmToken::Percent)
return MatchOperand_NoMatch;
@@ -809,6 +853,15 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
case Sparc::FSR:
Op = SparcOperand::CreateToken("%fsr", S);
break;
+ case Sparc::FQ:
+ Op = SparcOperand::CreateToken("%fq", S);
+ break;
+ case Sparc::CPSR:
+ Op = SparcOperand::CreateToken("%csr", S);
+ break;
+ case Sparc::CPQ:
+ Op = SparcOperand::CreateToken("%cq", S);
+ break;
case Sparc::WIM:
Op = SparcOperand::CreateToken("%wim", S);
break;
@@ -846,8 +899,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
getContext());
- if (isCall &&
- getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+ if (isCall && getContext().getObjectFileInfo()->isPositionIndependent())
Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
getContext());
Op = SparcOperand::CreateImm(Res, S, E);
@@ -941,6 +993,24 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
return true;
}
+ if (name.equals("fq")) {
+ RegNo = Sparc::FQ;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+
+ if (name.equals("csr")) {
+ RegNo = Sparc::CPSR;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+
+ if (name.equals("cq")) {
+ RegNo = Sparc::CPQ;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+
if (name.equals("wim")) {
RegNo = Sparc::WIM;
RegKind = SparcOperand::rk_Special;
@@ -1025,6 +1095,15 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
return true;
}
+ // %c0 - %c31
+  if (name.substr(0, 1).equals_lower("c") &&
+      !name.substr(1).getAsInteger(10, intVal) && intVal < 32) {
+ RegNo = CoprocRegs[intVal];
+ RegKind = SparcOperand::rk_CoprocReg;
+ return true;
+ }
+
if (name.equals("tpc")) {
RegNo = Sparc::TPC;
RegKind = SparcOperand::rk_Special;
@@ -1141,7 +1220,7 @@ SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
// actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
// as %got10 or %got22 relocation.
- if (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) {
+ if (getContext().getObjectFileInfo()->isPositionIndependent()) {
switch(VK) {
default: break;
case SparcMCExpr::VK_Sparc_LO:
@@ -1215,5 +1294,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
if (SparcOperand::MorphToIntPairReg(Op))
return MCTargetAsmParser::Match_Success;
}
+ if (Op.isCoprocReg() && Kind == MCK_CoprocPair) {
+ if (SparcOperand::MorphToCoprocPairReg(Op))
+ return MCTargetAsmParser::Match_Success;
+ }
return Match_InvalidOperand;
}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index 5b7bfdd28020..312215cf6cde 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -13,6 +13,7 @@ add_public_tablegen_target(SparcCommonTableGen)
add_llvm_target(SparcCodeGen
DelaySlotFiller.cpp
+ LeonPasses.cpp
SparcAsmPrinter.cpp
SparcInstrInfo.cpp
SparcISelDAGToDAG.cpp
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index c689b7f7201e..944f3551279e 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -38,14 +38,10 @@ static cl::opt<bool> DisableDelaySlotFiller(
namespace {
struct Filler : public MachineFunctionPass {
- /// Target machine description which we query for reg. names, data
- /// layout, etc.
- ///
- TargetMachine &TM;
const SparcSubtarget *Subtarget;
static char ID;
- Filler(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) {}
+ Filler() : MachineFunctionPass(ID) {}
const char *getPassName() const override {
return "SPARC Delay Slot Filler";
@@ -66,6 +62,11 @@ namespace {
return Changed;
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
void insertCallDefsUses(MachineBasicBlock::iterator MI,
SmallSet<unsigned, 32>& RegDefs,
SmallSet<unsigned, 32>& RegUses);
@@ -98,7 +99,7 @@ namespace {
/// slots in Sparc MachineFunctions
///
FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
- return new Filler(tm);
+ return new Filler;
}
@@ -268,6 +269,22 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
return true;
}
}
+
+ unsigned Opcode = candidate->getOpcode();
+  // LD and LDD may have NOPs inserted afterwards in the case of some LEON
+  // processors, so we can't use the delay slot if this feature is switched on.
+  if (Subtarget->insertNOPLoad() &&
+      Opcode >= SP::LDDArr && Opcode <= SP::LDrr)
+    return true;
+
+  // Same as above for FDIV and FSQRT on some LEON processors.
+  if (Subtarget->fixAllFDIVSQRT() &&
+      Opcode >= SP::FDIVD && Opcode <= SP::FSQRTD)
+    return true;
+
return false;
}
@@ -290,12 +307,12 @@ void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
assert(Reg.isUse() && "CALL first operand is not a use.");
RegUses.insert(Reg.getReg());
- const MachineOperand &RegOrImm = MI->getOperand(1);
- if (RegOrImm.isImm())
+ const MachineOperand &Operand1 = MI->getOperand(1);
+ if (Operand1.isImm() || Operand1.isGlobal())
break;
- assert(RegOrImm.isReg() && "CALLrr second operand is not a register.");
- assert(RegOrImm.isUse() && "CALLrr second operand is not a use.");
- RegUses.insert(RegOrImm.getReg());
+ assert(Operand1.isReg() && "CALLrr second operand is not a register.");
+ assert(Operand1.isUse() && "CALLrr second operand is not a use.");
+ RegUses.insert(Operand1.getReg());
break;
}
}
diff --git a/lib/Target/Sparc/Disassembler/Makefile b/lib/Target/Sparc/Disassembler/Makefile
deleted file mode 100644
index bc17ddc48c7d..000000000000
--- a/lib/Target/Sparc/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Sparc/Disassembler/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSparcDisassembler
-
-# Hack: we need to include 'main' Sparc target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 51751ec511c9..1dea379e14ec 100644
--- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -14,7 +14,7 @@
#include "Sparc.h"
#include "SparcRegisterInfo.h"
#include "SparcSubtarget.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCContext.h"
@@ -130,6 +130,25 @@ static const uint16_t IntPairDecoderTable[] = {
SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7,
};
+static const unsigned CPRegDecoderTable[] = {
+ SP::C0, SP::C1, SP::C2, SP::C3,
+ SP::C4, SP::C5, SP::C6, SP::C7,
+ SP::C8, SP::C9, SP::C10, SP::C11,
+ SP::C12, SP::C13, SP::C14, SP::C15,
+ SP::C16, SP::C17, SP::C18, SP::C19,
+ SP::C20, SP::C21, SP::C22, SP::C23,
+ SP::C24, SP::C25, SP::C26, SP::C27,
+ SP::C28, SP::C29, SP::C30, SP::C31
+};
+
+
+static const uint16_t CPPairDecoderTable[] = {
+ SP::C0_C1, SP::C2_C3, SP::C4_C5, SP::C6_C7,
+ SP::C8_C9, SP::C10_C11, SP::C12_C13, SP::C14_C15,
+ SP::C16_C17, SP::C18_C19, SP::C20_C21, SP::C22_C23,
+ SP::C24_C25, SP::C26_C27, SP::C28_C29, SP::C30_C31
+};
+
static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -191,6 +210,17 @@ static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+ unsigned Reg = CPRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
@@ -233,6 +263,16 @@ static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus DecodeCPPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned RegisterPair = CPPairDecoderTable[RegNo/2];
+ Inst.addOperand(MCOperand::createReg(RegisterPair));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
@@ -243,6 +283,10 @@ static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
@@ -253,6 +297,10 @@ static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn,
@@ -263,6 +311,8 @@ static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeTRAP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
#include "SparcGenDisassemblerTables.inc"
@@ -298,6 +348,18 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
// Calling the auto-generated decoder function.
+
+  if (STI.getFeatureBits()[Sparc::FeatureV9])
+    Result = decodeInstruction(DecoderTableSparcV932, Instr, Insn, Address,
+                               this, STI);
+  else
+    Result = decodeInstruction(DecoderTableSparcV832, Instr, Insn, Address,
+                               this, STI);
+
+ if (Result != MCDisassembler::Fail)
+ return Result;
+
Result =
decodeInstruction(DecoderTableSparc32, Instr, Insn, Address, this, STI);
@@ -390,6 +452,18 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
DecodeQFPRegsRegisterClass);
}
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeCPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeCPPairRegisterClass);
+}
+
static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, false,
@@ -420,6 +494,18 @@ static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
DecodeQFPRegsRegisterClass);
}
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeCPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeCPPairRegisterClass);
+}
+
static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
uint64_t Address, uint64_t Offset,
uint64_t Width, MCInst &MI,
@@ -547,3 +633,36 @@ static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address,
return MCDisassembler::Success;
}
+
+static DecodeStatus DecodeTRAP(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+
+ unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+ unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned cc = fieldFromInstruction(insn, 25, 4);
+ unsigned rs2 = 0;
+ unsigned imm7 = 0;
+ if (isImm)
+ imm7 = fieldFromInstruction(insn, 0, 7);
+ else
+ rs2 = fieldFromInstruction(insn, 0, 5);
+
+ // Decode RS1.
+ DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+  // Decode RS2 | IMM7.
+ if (isImm)
+ MI.addOperand(MCOperand::createImm(imm7));
+ else {
+ status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+
+ // Decode CC
+ MI.addOperand(MCOperand::createImm(cc));
+
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Sparc/InstPrinter/Makefile b/lib/Target/Sparc/InstPrinter/Makefile
deleted file mode 100644
index 2dabd82965f4..000000000000
--- a/lib/Target/Sparc/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Sparc/InstPrinter/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSparcAsmPrinter
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
index 5d714fe4da92..4981deae6af6 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -115,8 +116,21 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
}
if (MO.isImm()) {
- O << (int)MO.getImm();
- return;
+ switch (MI->getOpcode()) {
+ default:
+ O << (int)MO.getImm();
+ return;
+
+ case SP::TICCri: // Fall through
+ case SP::TICCrr: // Fall through
+ case SP::TRAPri: // Fall through
+ case SP::TRAPrr: // Fall through
+ case SP::TXCCri: // Fall through
+ case SP::TXCCrr: // Fall through
+ // Only seven-bit values up to 127.
+ O << ((int) MO.getImm() & 0x7f);
+ return;
+ }
}
assert(MO.isExpr() && "Unknown operand kind in printOperand");
@@ -166,6 +180,11 @@ void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
// Make sure CC is a fp conditional flag.
CC = (CC < 16) ? (CC + 16) : CC;
break;
+ case SP::CBCOND:
+ case SP::CBCONDA:
+ // Make sure CC is a cp conditional flag.
+ CC = (CC < 32) ? (CC + 32) : CC;
+ break;
}
O << SPARCCondCodeToString((SPCC::CondCodes)CC);
}
diff --git a/lib/Target/Sparc/LeonFeatures.td b/lib/Target/Sparc/LeonFeatures.td
new file mode 100755
index 000000000000..63f8b33c80cf
--- /dev/null
+++ b/lib/Target/Sparc/LeonFeatures.td
@@ -0,0 +1,91 @@
+//===-- LeonFeatures.td - Describe the Leon Features -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CASA Support differs between LEON3-FT GR712RC and LEON3-FT UT699
+// We need to have the option to switch this on and off.
+//===----------------------------------------------------------------------===//
+
+// Support for the CASA instruction; LEON3 subtarget only.
+def LeonCASA : SubtargetFeature<
+ "hasleoncasa", "HasLeonCasa", "true",
+ "Enable CASA instruction for LEON3 and LEON4 processors">;
+
+//===----------------------------------------------------------------------===//
+// UMAC and SMAC support for LEON3 and LEON4 processors.
+//===----------------------------------------------------------------------===//
+
+// Support for the UMAC and SMAC instructions on LEON3 and LEON4 processors.
+def UMACSMACSupport
+ : SubtargetFeature<"hasumacsmac", "HasUmacSmac", "true",
+ "Enable UMAC and SMAC for LEON3 and LEON4 processors">;
+
+//===----------------------------------------------------------------------===//
+// LEON Erratum fixes
+//===----------------------------------------------------------------------===//
+
+def ReplaceSDIV
+ : SubtargetFeature<
+ "replacesdiv", "PerformSDIVReplace", "true",
+ "AT697E erratum fix: Do not emit SDIV, emit SDIVCC instead">;
+
+def FixCALL
+ : SubtargetFeature<"fixcall", "FixCallImmediates", "true",
+ "AT697E erratum fix: Restrict the size of the immediate "
+ "operand of the CALL instruction to 20 bits">;
+
+def IgnoreZeroFlag
+ : SubtargetFeature<"ignrzeroflag", "IgnoreZeroFlag", "true",
+ "AT697E erratum fix: Do not rely on the zero bit flag "
+ "on a divide overflow for SDIVCC and UDIVCC">;
+
+def InsertNOPDoublePrecision
+ : SubtargetFeature<"insrtnopdblprcsn", "InsertNOPDoublePrecision", "true",
+ "LEON2 erratum fix: Insert a NOP before the double "
+ "precision floating point instruction">;
+
+def FixFSMULD : SubtargetFeature<"fixfsmuld", "FixFSMULD", "true",
+ "LEON3 erratum fix: Do not select FSMULD">;
+
+def ReplaceFMULS
+ : SubtargetFeature<"replacefmuls", "ReplaceFMULS", "true",
+ "LEON3 erratum fix: Replace FMULS instruction with a "
+ "routine using conversions/double precision operations "
+ "to replace FMULS">;
+
+def PreventRoundChange
+ : SubtargetFeature<"prvntroundchange", "PreventRoundChange", "true",
+ "LEON3 erratum fix: Prevent any rounding mode change "
+ "request: use only the round-to-nearest rounding mode">;
+
+def FixAllFDIVSQRT
+ : SubtargetFeature<"fixallfdivsqrt", "FixAllFDIVSQRT", "true",
+ "LEON3 erratum fix: Fix FDIVS/FDIVD/FSQRTS/FSQRTD "
+ "instructions with NOPs and floating-point store">;
+
+def InsertNOPLoad
+ : SubtargetFeature<"insertnopload", "InsertNOPLoad", "true",
+ "LEON3 erratum fix: Insert a NOP instruction after "
+ "every single-cycle load instruction when the next "
+ "instruction is another load/store instruction">;
+
+def FlushCacheLineSWAP
+ : SubtargetFeature<"flshcachelineswap", "FlushCacheLineSWAP", "true",
+ "LEON3 erratum fix: Flush cache line containing the "
+ "lock before performing any of the atomic instructions "
+ "SWAP and LDSTUB">;
+
+def InsertNOPsLoadStore
+ : SubtargetFeature<"insertnopsloadstore", "InsertNOPsLoadStore", "true",
+ "LEON3 erratum fix: Insert NOPs between "
+ "single-precision loads and the store, so the number of "
+ "instructions between is 4">;
diff --git a/lib/Target/Sparc/LeonPasses.cpp b/lib/Target/Sparc/LeonPasses.cpp
new file mode 100755
index 000000000000..5d0920892ff0
--- /dev/null
+++ b/lib/Target/Sparc/LeonPasses.cpp
@@ -0,0 +1,933 @@
+//===------ LeonPasses.cpp - Define passes specific to LEON ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "LeonPasses.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID)
+ : MachineFunctionPass(ID) {}
+
+LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID)
+ : MachineFunctionPass(ID) {}
+
+int LEONMachineFunctionPass::GetRegIndexForOperand(MachineInstr &MI,
+ int OperandIndex) {
+ if (MI.getNumOperands() > 0) {
+ if (OperandIndex == LAST_OPERAND) {
+ OperandIndex = MI.getNumOperands() - 1;
+ }
+
+ if (MI.getNumOperands() > (unsigned)OperandIndex &&
+ MI.getOperand(OperandIndex).isReg()) {
+ return (int)MI.getOperand(OperandIndex).getReg();
+ }
+ }
+
+ static int NotFoundIndex = -10;
+ // Return a different number each time to avoid any comparisons between the
+ // values returned.
+ NotFoundIndex -= 10;
+ return NotFoundIndex;
+}
+
+// finds a new free FP register
+// checks also the AllocatedRegisters vector
+int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) {
+ for (int RegisterIndex = SP::F0; RegisterIndex <= SP::F31; ++RegisterIndex) {
+ if (!MRI.isPhysRegUsed(RegisterIndex) &&
+ !(std::find(UsedRegisters.begin(), UsedRegisters.end(),
+ RegisterIndex) != UsedRegisters.end())) {
+ return RegisterIndex;
+ }
+ }
+
+ return -1;
+}
+
+//*****************************************************************************
+//**** InsertNOPLoad pass
+//*****************************************************************************
+// This pass fixes the incorrectly working Load instructions that exist for
+// some earlier versions of the LEON processor line. NOP instructions must
+// be inserted after the load instruction to ensure that the Load instruction
+// behaves as expected for these processors.
+//
+// This pass inserts a NOP after any LD or LDF instruction.
+//
+char InsertNOPLoad::ID = 0;
+
+InsertNOPLoad::InsertNOPLoad(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode >= SP::LDDArr && Opcode <= SP::LDrr) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+ Modified = true;
+ } else if (MI.isInlineAsm()) {
+ // Look for an inline ld or ldf instruction.
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("ld")) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** FixFSMULD pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FSMULD instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// The pass should convert the FSMULD operands to double precision in scratch
+// registers, then calculate the result with the FMULD instruction. Therefore,
+// the pass should replace operations of the form:
+// fsmuld %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+char FixFSMULD::ID = 0;
+
+FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+
+bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ const int UNASSIGNED_INDEX = -1;
+ int Reg1Index = UNASSIGNED_INDEX;
+ int Reg2Index = UNASSIGNED_INDEX;
+ int Reg3Index = UNASSIGNED_INDEX;
+
+ if (Opcode == SP::FSMULD && MI.getNumOperands() == 3) {
+ // take the registers from fsmuld %f20,%f21,%f8
+ Reg1Index = MI.getOperand(0).getReg();
+ Reg2Index = MI.getOperand(1).getReg();
+ Reg3Index = MI.getOperand(2).getReg();
+ } else if (MI.isInlineAsm()) {
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("fsmuld")) {
+ // this is an inline FSMULD instruction
+
+ unsigned StartOp = InlineAsm::MIOp_FirstOperand;
+
+ // extracts the registers from the inline assembly instruction
+ for (unsigned i = StartOp, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (Reg1Index == UNASSIGNED_INDEX)
+ Reg1Index = MO.getReg();
+ else if (Reg2Index == UNASSIGNED_INDEX)
+ Reg2Index = MO.getReg();
+ else if (Reg3Index == UNASSIGNED_INDEX)
+ Reg3Index = MO.getReg();
+ }
+ if (Reg3Index != UNASSIGNED_INDEX)
+ break;
+ }
+ }
+ }
+
+ if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+ Reg3Index != UNASSIGNED_INDEX) {
+ clearUsedRegisterList();
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+ markRegisterUsed(Reg3Index);
+ const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg1Index);
+ const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg2Index);
+
+ if (ScratchReg1Index == UNASSIGNED_INDEX ||
+ ScratchReg2Index == UNASSIGNED_INDEX) {
+ errs() << "Cannot allocate free scratch registers for the FixFSMULD "
+ "pass."
+ << "\n";
+ } else {
+ // create fstod %f20,%f0
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg1Index)
+ .addReg(Reg1Index);
+
+ // create fstod %f21,%f2
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg2Index)
+ .addReg(Reg2Index);
+
+ // create fmuld %f0,%f2,%f8
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+ .addReg(Reg3Index)
+ .addReg(ScratchReg1Index)
+ .addReg(ScratchReg2Index);
+
+ MI.eraseFromParent();
+ MBBI = NMBBI;
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** ReplaceFMULS pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FMULS instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// This pass converts the FMULS operands to double precision in scratch
+// registers, then calculates the result with the FMULD instruction.
+// The pass should replace operations of the form:
+// fmuls %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+char ReplaceFMULS::ID = 0;
+
+ReplaceFMULS::ReplaceFMULS(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ const int UNASSIGNED_INDEX = -1;
+ int Reg1Index = UNASSIGNED_INDEX;
+ int Reg2Index = UNASSIGNED_INDEX;
+ int Reg3Index = UNASSIGNED_INDEX;
+
+ if (Opcode == SP::FMULS && MI.getNumOperands() == 3) {
+ // take the registers from fmuls %f20,%f21,%f8
+ Reg1Index = MI.getOperand(0).getReg();
+ Reg2Index = MI.getOperand(1).getReg();
+ Reg3Index = MI.getOperand(2).getReg();
+ } else if (MI.isInlineAsm()) {
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("fmuls")) {
+ // this is an inline FMULS instruction
+ unsigned StartOp = InlineAsm::MIOp_FirstOperand;
+
+ // extracts the registers from the inline assembly instruction
+ for (unsigned i = StartOp, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (Reg1Index == UNASSIGNED_INDEX)
+ Reg1Index = MO.getReg();
+ else if (Reg2Index == UNASSIGNED_INDEX)
+ Reg2Index = MO.getReg();
+ else if (Reg3Index == UNASSIGNED_INDEX)
+ Reg3Index = MO.getReg();
+ }
+ if (Reg3Index != UNASSIGNED_INDEX)
+ break;
+ }
+ }
+ }
+
+ if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+ Reg3Index != UNASSIGNED_INDEX) {
+ clearUsedRegisterList();
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+ markRegisterUsed(Reg3Index);
+ const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg1Index);
+ const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+ markRegisterUsed(ScratchReg2Index);
+
+ if (ScratchReg1Index == UNASSIGNED_INDEX ||
+ ScratchReg2Index == UNASSIGNED_INDEX) {
+ errs() << "Cannot allocate free scratch registers for the "
+ "ReplaceFMULS pass."
+ << "\n";
+ } else {
+ // create fstod %f20,%f0
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg1Index)
+ .addReg(Reg1Index);
+
+ // create fstod %f21,%f2
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+ .addReg(ScratchReg2Index)
+ .addReg(Reg2Index);
+
+ // create fmuld %f0,%f2,%f8
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+ .addReg(Reg3Index)
+ .addReg(ScratchReg1Index)
+ .addReg(ScratchReg2Index);
+
+ MI.eraseFromParent();
+ MBBI = NMBBI;
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** FixAllFDIVSQRT pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FDIVx and FSQRTx instructions that
+// exist for some earlier versions of the LEON processor line. NOP
+// instructions need to be inserted before and after these instructions to
+// ensure the correct result is placed in the destination registers before
+// they are used.
+//
+// This pass implements two fixes:
+// 1) fixing the FSQRTS and FSQRTD instructions.
+// 2) fixing the FDIVS and FDIVD instructions.
+//
+// FSQRTS and FDIVS are converted to FSQRTD and FDIVD respectively earlier in
+// the pipeline when this option is enabled, so this pass needs only to deal
+// with the changes that still need implementing for the "double" versions
+// of these instructions.
+//
+char FixAllFDIVSQRT::ID = 0;
+
+FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ if (MI.isInlineAsm()) {
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("fsqrtd")) {
+          // this is an inline fsqrtd instruction
+ Opcode = SP::FSQRTD;
+ } else if (AsmString.startswith_lower("fdivd")) {
+          // this is an inline fdivd instruction
+ Opcode = SP::FDIVD;
+ }
+ }
+
+ // Note: FDIVS and FSQRTS cannot be generated when this erratum fix is
+ // switched on so we don't need to check for them here. They will
+ // already have been converted to FSQRTD or FDIVD earlier in the
+ // pipeline.
+ if (Opcode == SP::FSQRTD || Opcode == SP::FDIVD) {
+ // Insert 5 NOPs before FSQRTD,FDIVD.
+ for (int InsertedCount = 0; InsertedCount < 5; InsertedCount++)
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ // ... and inserting 28 NOPs after FSQRTD,FDIVD.
+ for (int InsertedCount = 0; InsertedCount < 28; InsertedCount++)
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+
+ Modified = true;
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** ReplaceSDIV pass
+//*****************************************************************************
+// This pass fixes the incorrectly working SDIV instruction that exists in
+// some earlier versions of the LEON processor line. The instruction is
+// replaced with an SDIVcc instruction, which works correctly.
+//
+char ReplaceSDIV::ID = 0;
+
+ReplaceSDIV::ReplaceSDIV() : LEONMachineFunctionPass(ID) {}
+
+ReplaceSDIV::ReplaceSDIV(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+
+bool ReplaceSDIV::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::SDIVrr) {
+ MI.setDesc(TII.get(SP::SDIVCCrr));
+ Modified = true;
+ } else if (Opcode == SP::SDIVri) {
+ MI.setDesc(TII.get(SP::SDIVCCri));
+ Modified = true;
+ }
+ }
+ }
+
+ return Modified;
+}
+
+static RegisterPass<ReplaceSDIV> X("replace-sdiv", "Replace SDIV Pass", false,
+ false);
+
+//*****************************************************************************
+//**** FixCALL pass
+//*****************************************************************************
+// This pass restricts the size of the immediate operand of the CALL
+// instruction, which can cause problems on some earlier versions of the LEON
+// processor, which can interpret some of the call address bits incorrectly.
+//
+char FixCALL::ID = 0;
+
+FixCALL::FixCALL(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+
+bool FixCALL::runOnMachineFunction(MachineFunction &MF) {
+ bool Modified = false;
+
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::CALL || Opcode == SP::CALLrr) {
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned OperandIndex = 0; OperandIndex < NumOperands;
+ OperandIndex++) {
+ MachineOperand &MO = MI.getOperand(OperandIndex);
+ if (MO.isImm()) {
+ int64_t Value = MO.getImm();
+ MO.setImm(Value & 0x000fffffL);
+ Modified = true;
+ break;
+ }
+ }
+      } else if (MI.isInlineAsm()) {
+        // inline assembly immediate call
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("call")) {
+ // this is an inline call instruction
+ unsigned StartOp = InlineAsm::MIOp_FirstOperand;
+
+ // extracts the registers from the inline assembly instruction
+ for (unsigned i = StartOp, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isImm()) {
+ int64_t Value = MO.getImm();
+ MO.setImm(Value & 0x000fffffL);
+ Modified = true;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** IgnoreZeroFlag pass
+//*****************************************************************************
+// This erratum fix addresses the overflow behavior of the SDIVCC and UDIVCC
+// instructions that exist on some earlier LEON processors. Where these
+// instructions are detected, a sequence is inserted after them that
+// explicitly sets the zero flag if this is required.
+//
+char IgnoreZeroFlag::ID = 0;
+
+IgnoreZeroFlag::IgnoreZeroFlag(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool IgnoreZeroFlag::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::SDIVCCrr || Opcode == SP::SDIVCCri ||
+ Opcode == SP::UDIVCCrr || Opcode == SP::UDIVCCri) {
+
+        // Split the current machine basic block just after the sdivcc/udivcc
+        // instruction and create a label that helps us skip the zero flag
+        // update (of the PSR - Processor Status Register) if the conditions
+        // are not met.
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+ MachineFunction::iterator It =
+ std::next(MachineFunction::iterator(MBB));
+
+ MachineBasicBlock *dneBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MF.insert(It, dneBB);
+
+ // Transfer the remainder of MBB and its successor edges to dneBB.
+ dneBB->splice(dneBB->begin(), &MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB.end());
+ dneBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ MBB.addSuccessor(dneBB);
+
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+
+ // bvc - branch if overflow flag not set
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::BCOND))
+ .addMBB(dneBB)
+ .addImm(SPCC::ICC_VS);
+
+ // bnz - branch if not zero
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::BCOND))
+ .addMBB(dneBB)
+ .addImm(SPCC::ICC_NE);
+
+ // use the WRPSR (Write Processor State Register) instruction to set the
+        // zero flag to 1
+ // create wr %g0, 1, %psr
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::WRPSRri))
+ .addReg(SP::G0)
+ .addImm(1);
+
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::NOP));
+
+ Modified = true;
+ } else if (MI.isInlineAsm()) {
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("sdivcc") ||
+ AsmString.startswith_lower("udivcc")) {
+ // this is an inline SDIVCC or UDIVCC instruction
+
+          // Split the current machine basic block just after the
+          // sdivcc/udivcc instruction and create a label that helps us skip
+          // the zero flag update (of the PSR - Processor Status Register) if
+          // the conditions are not met.
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+ MachineFunction::iterator It =
+ std::next(MachineFunction::iterator(MBB));
+
+ MachineBasicBlock *dneBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MF.insert(It, dneBB);
+
+ // Transfer the remainder of MBB and its successor edges to dneBB.
+ dneBB->splice(dneBB->begin(), &MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB.end());
+ dneBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ MBB.addSuccessor(dneBB);
+
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+
+ // bvc - branch if overflow flag not set
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::BCOND))
+ .addMBB(dneBB)
+ .addImm(SPCC::ICC_VS);
+
+ // bnz - branch if not zero
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::BCOND))
+ .addMBB(dneBB)
+ .addImm(SPCC::ICC_NE);
+
+ // use the WRPSR (Write Processor State Register) instruction to set
+          // the zero flag to 1
+ // create wr %g0, 1, %psr
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::WRPSRri))
+ .addReg(SP::G0)
+ .addImm(1);
+
+ BuildMI(MBB, NextMBBI, DL, TII.get(SP::NOP));
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** InsertNOPDoublePrecision pass
+//*****************************************************************************
+// This erratum fix for some earlier LEON processors fixes a problem where a
+// double precision load will not yield the correct result if used in FMUL,
+// FDIV, FADD, FSUB or FSQRT instructions later. If this sequence is detected,
+// inserting a NOP between the two instructions will fix the erratum. The pass:
+// 1. scans the code after register allocation;
+// 2. checks for the problem conditions as described in the AT697E erratum
+//    “Odd-Numbered FPU Register Dependency not Properly Checked in some
+//    Double-Precision FPU Operations”;
+// 3. inserts NOPs if the problem exists.
+//
+char InsertNOPDoublePrecision::ID = 0;
+
+InsertNOPDoublePrecision::InsertNOPDoublePrecision(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool InsertNOPDoublePrecision::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::LDDFri || Opcode == SP::LDDFrr) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ MachineInstr &NMI = *NMBBI;
+
+ unsigned NextOpcode = NMI.getOpcode();
+ // NMI.print(errs());
+ if (NextOpcode == SP::FADDD || NextOpcode == SP::FSUBD ||
+ NextOpcode == SP::FMULD || NextOpcode == SP::FDIVD) {
+ int RegAIndex = GetRegIndexForOperand(MI, 0);
+ int RegBIndex = GetRegIndexForOperand(NMI, 0);
+ int RegCIndex =
+ GetRegIndexForOperand(NMI, 2); // Second source operand is index 2
+ int RegDIndex =
+ GetRegIndexForOperand(NMI, 1); // Destination operand is index 1
+
+ if ((RegAIndex == RegBIndex + 1 && RegBIndex == RegDIndex) ||
+ (RegAIndex == RegCIndex + 1 && RegCIndex == RegDIndex) ||
+ (RegAIndex == RegBIndex + 1 && RegCIndex == RegDIndex) ||
+ (RegAIndex == RegCIndex + 1 && RegBIndex == RegDIndex)) {
+ // Insert NOP between the two instructions.
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+ Modified = true;
+ }
+
+ // Check the errata patterns that only happen for FADDD and FMULD
+ if (Modified == false &&
+ (NextOpcode == SP::FADDD || NextOpcode == SP::FMULD)) {
+ RegAIndex = GetRegIndexForOperand(MI, 1);
+ if (RegAIndex == RegBIndex + 1 && RegBIndex == RegCIndex &&
+ RegBIndex == RegDIndex) {
+ // Insert NOP between the two instructions.
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+ Modified = true;
+ }
+ }
+ } else if (NextOpcode == SP::FSQRTD) {
+ int RegAIndex = GetRegIndexForOperand(MI, 1);
+ int RegBIndex = GetRegIndexForOperand(NMI, 0);
+ int RegCIndex = GetRegIndexForOperand(NMI, 1);
+
+ if (RegAIndex == RegBIndex + 1 && RegBIndex == RegCIndex) {
+ // Insert NOP between the two instructions.
+ BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+ Modified = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** PreventRoundChange pass
+//*****************************************************************************
+// To prevent any explicit change of the default rounding mode, this pass
+// detects any call of the fesetround function and removes this call from the
+// list of generated operations.
+//
+char PreventRoundChange::ID = 0;
+
+PreventRoundChange::PreventRoundChange(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool PreventRoundChange::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::CALL && MI.getNumOperands() > 0) {
+ MachineOperand &MO = MI.getOperand(0);
+
+ if (MO.isGlobal()) {
+ StringRef FuncName = MO.getGlobal()->getName();
+ if (FuncName.compare_lower("fesetround") == 0) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ MI.eraseFromParent();
+ MBBI = NMBBI;
+ Modified = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+//*****************************************************************************
+//**** FlushCacheLineSWAP pass
+//*****************************************************************************
+// This pass inserts a FLUSH (followed by five NOPs) just before any atomic
+// SWAP or LDSTUB instruction.
+//
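+// Illustrative effect (assembly sketch only): an atomic
+//   swap [%o0], %o1
+// ends up preceded by a FLUSH and five NOPs in the emitted code.
+//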
+char FlushCacheLineSWAP::ID = 0;
+
+FlushCacheLineSWAP::FlushCacheLineSWAP(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool FlushCacheLineSWAP::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SP::SWAPrr || Opcode == SP::SWAPri ||
+ Opcode == SP::LDSTUBrr || Opcode == SP::LDSTUBri) {
+ // insert flush and 5 NOPs before the swap/ldstub instruction
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FLUSH));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+
+ Modified = true;
+ } else if (MI.isInlineAsm()) {
+ StringRef AsmString =
+ MI.getOperand(InlineAsm::MIOp_AsmString).getSymbolName();
+ if (AsmString.startswith_lower("swap") ||
+ AsmString.startswith_lower("ldstub")) {
+ // this is an inline swap or ldstub instruction
+
+ // insert flush and 5 NOPs before the swap/ldstub instruction
+ BuildMI(MBB, MBBI, DL, TII.get(SP::FLUSH));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+
+ Modified = true;
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+//*****************************************************************************
+//**** InsertNOPsLoadStore pass
+//*****************************************************************************
+// This pass shall insert NOPs between floating point loads and stores when the
+// following circumstances are present [5]:
+// Pattern 1:
+// 1. single-precision load or single-precision FPOP to register %fX, where X is
+// the same register as the store being checked;
+// 2. single-precision load or single-precision FPOP to register %fY, where Y
+// is the opposite register in the same double-precision pair;
+// 3. 0-3 instructions of any kind, except stores from %fX or %fY or operations
+// with %fX as destination;
+// 4. the store (from register %fX) being considered.
+// Pattern 2:
+// 1. double-precision FPOP;
+// 2. any number of instructions of any kind, provided they include no
+// double-precision FPOP and at most one single-precision or single-to-double
+// FPOP;
+// 3. the store (from register %fX) being considered.
+//
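+// A purely illustrative Pattern 1 sequence (registers chosen arbitrarily):
+//   ld  [%o0], %f2       ! single-precision load to %fX
+//   ld  [%o1], %f3       ! load to %fY, the other half of the %f2/%f3 pair
+//   add %g1, %g2, %g3    ! up to three unrelated instructions
+//   st  %f2, [%o2]       ! the store being considered; this pass inserts
+//                        ! NOPs before it
+//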
+char InsertNOPsLoadStore::ID = 0;
+
+InsertNOPsLoadStore::InsertNOPsLoadStore(TargetMachine &tm)
+ : LEONMachineFunctionPass(tm, ID) {}
+
+bool InsertNOPsLoadStore::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<SparcSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = DebugLoc();
+
+ MachineInstr *Pattern1FirstInstruction = NULL;
+ MachineInstr *Pattern2FirstInstruction = NULL;
+ unsigned int StoreInstructionsToCheck = 0;
+ int FxRegIndex, FyRegIndex;
+
+ bool Modified = false;
+ for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+
+ if (StoreInstructionsToCheck > 0) {
+ if (((MI.getOpcode() == SP::STFrr || MI.getOpcode() == SP::STFri) &&
+ (GetRegIndexForOperand(MI, LAST_OPERAND) == FxRegIndex ||
+ GetRegIndexForOperand(MI, LAST_OPERAND) == FyRegIndex)) ||
+ GetRegIndexForOperand(MI, 0) == FxRegIndex) {
+ // Insert four NOPs
+ for (unsigned InsertedCount = 0; InsertedCount < 4; InsertedCount++) {
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ }
+ Modified = true;
+ }
+ StoreInstructionsToCheck--;
+ }
+
+ switch (MI.getOpcode()) {
+ // Watch for Pattern 1 FPop instructions
+ case SP::LDrr:
+ case SP::LDri:
+ case SP::LDFrr:
+ case SP::LDFri:
+ case SP::FADDS:
+ case SP::FSUBS:
+ case SP::FMULS:
+ case SP::FDIVS:
+ case SP::FSQRTS:
+ case SP::FCMPS:
+ case SP::FMOVS:
+ case SP::FNEGS:
+ case SP::FABSS:
+ case SP::FITOS:
+ case SP::FSTOI:
+ case SP::FITOD:
+ case SP::FDTOI:
+ case SP::FDTOS:
+ if (Pattern1FirstInstruction != NULL) {
+ FxRegIndex = GetRegIndexForOperand(*Pattern1FirstInstruction, 0);
+ FyRegIndex = GetRegIndexForOperand(MI, 0);
+
+        // Check to see if these registers are part of the same
+        // double-precision register pair.
+ int DoublePrecRegIndexForX = (FxRegIndex - SP::F0) / 2;
+ int DoublePrecRegIndexForY = (FyRegIndex - SP::F0) / 2;
+
+ if (DoublePrecRegIndexForX == DoublePrecRegIndexForY)
+ StoreInstructionsToCheck = 4;
+ }
+
+ Pattern1FirstInstruction = &MI;
+ break;
+ // End of Pattern 1
+
+ // Search for Pattern 2
+ case SP::FADDD:
+ case SP::FSUBD:
+ case SP::FMULD:
+ case SP::FDIVD:
+ case SP::FSQRTD:
+ case SP::FCMPD:
+ Pattern2FirstInstruction = &MI;
+ Pattern1FirstInstruction = NULL;
+ break;
+
+ case SP::STFrr:
+ case SP::STFri:
+ case SP::STDFrr:
+ case SP::STDFri:
+ if (Pattern2FirstInstruction != NULL) {
+ if (GetRegIndexForOperand(MI, LAST_OPERAND) ==
+ GetRegIndexForOperand(*Pattern2FirstInstruction, 0)) {
+ // Insert four NOPs
+ for (unsigned InsertedCount = 0; InsertedCount < 4;
+ InsertedCount++) {
+ BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+ }
+
+ Pattern2FirstInstruction = NULL;
+ }
+ }
+ Pattern1FirstInstruction = NULL;
+ break;
+ // End of Pattern 2
+
+ default:
+ // Ensure we don't count debug-only values while we're testing for the
+ // patterns.
+ if (!MI.isDebugValue())
+ Pattern1FirstInstruction = NULL;
+ break;
+ }
+ }
+ }
+
+ return Modified;
+}
diff --git a/lib/Target/Sparc/LeonPasses.h b/lib/Target/Sparc/LeonPasses.h
new file mode 100755
index 000000000000..5e21813ed029
--- /dev/null
+++ b/lib/Target/Sparc/LeonPasses.h
@@ -0,0 +1,199 @@
+//===------- LeonPasses.h - Define passes specific to LEON ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LEON-specific MachineFunction passes that implement
+// workarounds for errata in the LEON family of SPARC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+#define LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+
+namespace llvm {
+class LLVM_LIBRARY_VISIBILITY LEONMachineFunctionPass
+ : public MachineFunctionPass {
+protected:
+ const SparcSubtarget *Subtarget;
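+  // LAST_OPERAND asks GetRegIndexForOperand for an instruction's final
+  // operand (e.g. the stored register of an STF) rather than a fixed index.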
+ const int LAST_OPERAND = -1;
+
+  // This vector holds free registers that we allocate in groups for some of
+  // the LEON passes.
+ std::vector<int> UsedRegisters;
+
+protected:
+ LEONMachineFunctionPass(TargetMachine &tm, char &ID);
+ LEONMachineFunctionPass(char &ID);
+
+ int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
+ void clearUsedRegisterList() { UsedRegisters.clear(); }
+
+ void markRegisterUsed(int registerIndex) {
+ UsedRegisters.push_back(registerIndex);
+ }
+ int getUnusedFPRegister(MachineRegisterInfo &MRI);
+};
+
+class LLVM_LIBRARY_VISIBILITY ReplaceSDIV : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ ReplaceSDIV();
+ ReplaceSDIV(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "ReplaceSDIV: Erratum Fix LBR25: do not emit SDIV, but emit SDIVCC "
+ "instead";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY FixCALL : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ FixCALL(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "FixCALL: Erratum Fix LBR26: restrict the size of the immediate "
+ "operand of the CALL instruction to 20 bits";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY IgnoreZeroFlag : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ IgnoreZeroFlag(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "IgnoreZeroFlag: Erratum Fix LBR28: do not rely on the zero bit "
+ "flag on a divide overflow for SDIVCC and UDIVCC";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY InsertNOPDoublePrecision
+ : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ InsertNOPDoublePrecision(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "InsertNOPDoublePrecision: Erratum Fix LBR30: insert a NOP before "
+ "the double precision floating point instruction";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ FixFSMULD(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "FixFSMULD: Erratum Fix LBR31: do not select FSMULD";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ ReplaceFMULS(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "ReplaceFMULS: Erratum Fix LBR32: replace FMULS instruction with a "
+ "routine using conversions/double precision operations to replace "
+ "FMULS";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY PreventRoundChange
+ : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ PreventRoundChange(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "PreventRoundChange: Erratum Fix LBR33: prevent any rounding mode "
+ "change request: use only the round-to-nearest rounding mode";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ FixAllFDIVSQRT(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "FixAllFDIVSQRT: Erratum Fix LBR34: fix FDIVS/FDIVD/FSQRTS/FSQRTD "
+ "instructions with NOPs and floating-point store";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ InsertNOPLoad(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "InsertNOPLoad: insert a NOP instruction after "
+ "every single-cycle load instruction when the next instruction is "
+ "another load/store instruction";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY FlushCacheLineSWAP
+ : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ FlushCacheLineSWAP(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "FlushCacheLineSWAP: Erratum Fix LBR36: flush cache line containing "
+ "the lock before performing any of the atomic instructions SWAP and "
+ "LDSTUB";
+ }
+};
+
+class LLVM_LIBRARY_VISIBILITY InsertNOPsLoadStore
+ : public LEONMachineFunctionPass {
+public:
+ static char ID;
+
+ InsertNOPsLoadStore(TargetMachine &tm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "InsertNOPsLoadStore: Erratum Fix LBR37: insert NOPs between "
+ "single-precision loads and the store, so the number of "
+ "instructions between is 4";
+ }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
diff --git a/lib/Target/Sparc/MCTargetDesc/Makefile b/lib/Target/Sparc/MCTargetDesc/Makefile
deleted file mode 100644
index abcbe2da18ec..000000000000
--- a/lib/Target/Sparc/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/Sparc/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSparcDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index d1d7aaa07eab..14a70d862f11 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -248,7 +248,8 @@ namespace {
llvm_unreachable("fixupNeedsRelaxation() unimplemented");
return false;
}
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
// FIXME.
llvm_unreachable("relaxInstruction() unimplemented");
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 0be60fd7a051..d35e45e03466 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -29,8 +29,8 @@ namespace {
~SparcELFObjectWriter() override {}
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
@@ -38,7 +38,8 @@ namespace {
};
}
-unsigned SparcELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 9171d4dc9c00..45bc4a1de01b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 9113e4a46b96..dceaca791aab 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "InstPrinter/SparcInstPrinter.h"
#include "SparcMCAsmInfo.h"
#include "SparcTargetStreamer.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -81,12 +80,8 @@ createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
//
// All code models require that the text segment is smaller than 2GB.
-static MCCodeGenInfo *createSparcMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
// The default 32-bit code model is abs32/pic32 and the default 32-bit
// code model for JIT is abs32.
switch (CM) {
@@ -94,17 +89,10 @@ static MCCodeGenInfo *createSparcMCCodeGenInfo(const Triple &TT,
case CodeModel::Default:
case CodeModel::JITDefault: CM = CodeModel::Small; break;
}
-
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
-static MCCodeGenInfo *createSparcV9MCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
+static void adjustCodeGenOptsV9(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
// The default 64-bit code model is abs44/pic32 and the default 64-bit
// code model for JIT is abs64.
switch (CM) {
@@ -116,9 +104,6 @@ static MCCodeGenInfo *createSparcV9MCCodeGenInfo(const Triple &TT,
CM = CodeModel::Large;
break;
}
-
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCTargetStreamer *
@@ -175,10 +160,10 @@ extern "C" void LLVMInitializeSparcTargetMC() {
}
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
- createSparcMCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target,
- createSparcV9MCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheSparcelTarget,
- createSparcMCCodeGenInfo);
+ TargetRegistry::registerMCAdjustCodeGenOpts(TheSparcTarget,
+ adjustCodeGenOpts);
+ TargetRegistry::registerMCAdjustCodeGenOpts(TheSparcV9Target,
+ adjustCodeGenOptsV9);
+ TargetRegistry::registerMCAdjustCodeGenOpts(TheSparcelTarget,
+ adjustCodeGenOpts);
}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
deleted file mode 100644
index c2a95b47151a..000000000000
--- a/lib/Target/Sparc/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMSparcCodeGen
-TARGET = Sparc
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = SparcGenRegisterInfo.inc SparcGenInstrInfo.inc \
- SparcGenAsmWriter.inc SparcGenAsmMatcher.inc \
- SparcGenDAGISel.inc SparcGenDisassemblerTables.inc \
- SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
- SparcGenMCCodeEmitter.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
index 647c2763e521..d7686eba7af6 100644
--- a/lib/Target/Sparc/README.txt
+++ b/lib/Target/Sparc/README.txt
@@ -1,4 +1,3 @@
-
To-do
-----
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index 96378d522dc0..0a8272d89297 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -72,7 +72,24 @@ namespace llvm {
FCC_UGE = 12+16, // Unordered or Greater or Equal
FCC_LE = 13+16, // Less or Equal
FCC_ULE = 14+16, // Unordered or Less or Equal
- FCC_O = 15+16 // Ordered
+ FCC_O = 15+16, // Ordered
+
+ CPCC_A = 8+32, // Always
+ CPCC_N = 0+32, // Never
+ CPCC_3 = 7+32,
+ CPCC_2 = 6+32,
+ CPCC_23 = 5+32,
+ CPCC_1 = 4+32,
+ CPCC_13 = 3+32,
+ CPCC_12 = 2+32,
+ CPCC_123 = 1+32,
+ CPCC_0 = 9+32,
+ CPCC_03 = 10+32,
+ CPCC_02 = 11+32,
+ CPCC_023 = 12+32,
+ CPCC_01 = 13+32,
+ CPCC_013 = 14+32,
+ CPCC_012 = 15+32
};
}
@@ -110,6 +127,22 @@ namespace llvm {
case SPCC::FCC_LE: return "le";
case SPCC::FCC_ULE: return "ule";
case SPCC::FCC_O: return "o";
+ case SPCC::CPCC_A: return "a";
+ case SPCC::CPCC_N: return "n";
+ case SPCC::CPCC_3: return "3";
+ case SPCC::CPCC_2: return "2";
+ case SPCC::CPCC_23: return "23";
+ case SPCC::CPCC_1: return "1";
+ case SPCC::CPCC_13: return "13";
+ case SPCC::CPCC_12: return "12";
+ case SPCC::CPCC_123: return "123";
+ case SPCC::CPCC_0: return "0";
+ case SPCC::CPCC_03: return "03";
+ case SPCC::CPCC_02: return "02";
+ case SPCC::CPCC_023: return "023";
+ case SPCC::CPCC_01: return "01";
+ case SPCC::CPCC_013: return "013";
+ case SPCC::CPCC_012: return "012";
}
llvm_unreachable("Invalid cond code");
}
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index c34122eef92f..7a3d12448d52 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -21,79 +21,133 @@ include "llvm/Target/Target.td"
//
def FeatureV9
- : SubtargetFeature<"v9", "IsV9", "true",
- "Enable SPARC-V9 instructions">;
+ : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">;
def FeatureV8Deprecated
- : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
- "Enable deprecated V8 instructions in V9 mode">;
+ : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+ "Enable deprecated V8 instructions in V9 mode">;
def FeatureVIS
- : SubtargetFeature<"vis", "IsVIS", "true",
- "Enable UltraSPARC Visual Instruction Set extensions">;
+ : SubtargetFeature<"vis", "IsVIS", "true",
+ "Enable UltraSPARC Visual Instruction Set extensions">;
def FeatureVIS2
- : SubtargetFeature<"vis2", "IsVIS2", "true",
- "Enable Visual Instruction Set extensions II">;
+ : SubtargetFeature<"vis2", "IsVIS2", "true",
+ "Enable Visual Instruction Set extensions II">;
def FeatureVIS3
- : SubtargetFeature<"vis3", "IsVIS3", "true",
- "Enable Visual Instruction Set extensions III">;
+ : SubtargetFeature<"vis3", "IsVIS3", "true",
+ "Enable Visual Instruction Set extensions III">;
+def FeatureLeon
+ : SubtargetFeature<"leon", "IsLeon", "true", "Enable LEON extensions">;
def FeatureHardQuad
- : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
- "Enable quad-word floating point instructions">;
+ : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+ "Enable quad-word floating point instructions">;
def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
"Use the popc (population count) instruction">;
+def FeatureSoftFloat
+ : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software emulation for floating point">;
+
+//==== Features added predominantly for LEON subtarget support
+include "LeonFeatures.td"
+
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
include "SparcRegisterInfo.td"
include "SparcCallingConv.td"
+include "SparcSchedule.td"
include "SparcInstrInfo.td"
def SparcInstrInfo : InstrInfo;
-def SparcAsmParser : AsmParser {
- bit ShouldEmitMatchRegisterName = 0;
-}
+def SparcAsmParser : AsmParser { bit ShouldEmitMatchRegisterName = 0; }
//===----------------------------------------------------------------------===//
// SPARC processors supported.
//===----------------------------------------------------------------------===//
class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-
-def : Proc<"generic", []>;
-def : Proc<"v7", []>;
-def : Proc<"v8", []>;
-def : Proc<"supersparc", []>;
-def : Proc<"sparclite", []>;
-def : Proc<"f934", []>;
-def : Proc<"hypersparc", []>;
-def : Proc<"sparclite86x", []>;
-def : Proc<"sparclet", []>;
-def : Proc<"tsc701", []>;
-def : Proc<"v9", [FeatureV9]>;
-def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
-def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
- FeatureVIS2]>;
-def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
- FeatureVIS2]>;
-def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
- FeatureVIS, FeatureVIS2]>;
-def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
- FeatureVIS, FeatureVIS2]>;
-def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
- FeatureVIS, FeatureVIS2, FeatureVIS3]>;
-
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"v7", []>;
+def : Proc<"v8", []>;
+def : Proc<"supersparc", []>;
+def : Proc<"sparclite", []>;
+def : Proc<"f934", []>;
+def : Proc<"hypersparc", []>;
+def : Proc<"sparclite86x", []>;
+def : Proc<"sparclet", []>;
+def : Proc<"tsc701", []>;
+def : Proc<"myriad2", []>;
+def : Proc<"myriad2.1", []>;
+def : Proc<"myriad2.2", []>;
+def : Proc<"v9", [ FeatureV9 ]>;
+def : Proc<"ultrasparc", [ FeatureV9, FeatureV8Deprecated, FeatureVIS ]>;
+def : Proc<"ultrasparc3",
+ [ FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2 ]>;
+def : Proc<"niagara",
+ [ FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2 ]>;
+def : Proc<"niagara2", [
+ FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2
+]>;
+def : Proc<"niagara3", [
+ FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2
+]>;
+def : Proc<"niagara4", [
+ FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3
+]>;
+
+// LEON 2 FT generic
+def : Processor<"leon2", LEON2Itineraries, [ FeatureLeon ]>;
+
+// LEON 2 FT (AT697E)
+// AT697E: Provides full coverage of AT697E - covers all the erratum fixes for
+// LEON2 AT697E
+def : Processor<"at697e", LEON2Itineraries, [
+ FeatureLeon, ReplaceSDIV, FixCALL, IgnoreZeroFlag, InsertNOPDoublePrecision
+]>;
+
+// LEON 2 FT (AT697F)
+// AT697F: Provides full coverage of AT697F - covers all the erratum fixes for
+// LEON2 AT697F
+def : Processor<"at697f", LEON2Itineraries,
+ [ FeatureLeon, InsertNOPDoublePrecision ]>;
+
+// LEON 3 FT generic
+def : Processor<"leon3", LEON3Itineraries, [ FeatureLeon, UMACSMACSupport ]>;
+
+// LEON 3 FT (UT699). Provides features for the UT699 processor
+// - covers all the erratum fixes for LEON3, but does not support the CASA
+// instruction.
+def : Processor<"ut699", LEON3Itineraries, [
+ FeatureLeon, FixFSMULD, ReplaceFMULS, PreventRoundChange,
+ FixAllFDIVSQRT, InsertNOPLoad, FlushCacheLineSWAP, InsertNOPsLoadStore
+]>;
+
+// LEON3 FT (GR712RC). Provides features for the GR712RC processor.
+// - covers all the erratum fixes for LEON3 and adds support for the CASA
+// instruction.
+def : Processor<"gr712rc", LEON3Itineraries,
+ [ FeatureLeon, LeonCASA ]>;
+
+// LEON 4 FT generic
+def : Processor<"leon4", LEON4Itineraries,
+ [ FeatureLeon, LeonCASA ]>;
+
+// GR740: Provides full coverage of GR740 - covers all the erratum fixes for
+// LEON3 + support for CASA + LEON 4 instruction timings
+def : Processor<"gr740", LEON4Itineraries,
+ [ FeatureLeon, LeonCASA ]> {}
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
def SparcAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
+ string AsmWriterClassName = "InstPrinter";
int PassSubtarget = 1;
int Variant = 0;
}
@@ -101,6 +155,6 @@ def SparcAsmWriter : AsmWriter {
def Sparc : Target {
// Pull in Instruction Info:
let InstructionSet = SparcInstrInfo;
- let AssemblyParsers = [SparcAsmParser];
- let AssemblyWriters = [SparcAsmWriter];
+ let AssemblyParsers = [ SparcAsmParser ];
+ let AssemblyWriters = [ SparcAsmWriter ];
}
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index e3b0f5266747..c068440f7c05 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -18,7 +18,6 @@
#include "SparcInstrInfo.h"
#include "SparcTargetMachine.h"
#include "SparcTargetStreamer.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
@@ -54,7 +53,6 @@ namespace {
void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
const char *Modifier = nullptr);
- void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
void EmitFunctionBodyStart() override;
void EmitInstruction(const MachineInstr *MI) override;
@@ -185,7 +183,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
MCOperand MCRegOP = MCOperand::createReg(MO.getReg());
- if (TM.getRelocationModel() != Reloc::PIC_) {
+ if (!isPositionIndependent()) {
// Just load the address of GOT to MCRegOP.
switch(TM.getCodeModel()) {
default:
@@ -376,6 +374,9 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getIndex();
break;
+ case MachineOperand::MO_Metadata:
+ MO.getMetadata()->printAsOperand(O, MMI->getModule());
+ break;
default:
llvm_unreachable("<unknown operand type>");
}
@@ -417,6 +418,7 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
default:
// See if this is a generic print operand
return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'f':
case 'r':
break;
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 39b5e809c9be..87b01553b37e 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -146,7 +146,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
// Finally, ensure that the size is sufficiently aligned for the
// data on the stack.
if (MFI->getMaxAlignment() > 0) {
- NumBytes = RoundUpToAlignment(NumBytes, MFI->getMaxAlignment());
+ NumBytes = alignTo(NumBytes, MFI->getMaxAlignment());
}
// Update stack size with corrected value.
@@ -183,7 +183,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
}
}
-void SparcFrameLowering::
+MachineBasicBlock::iterator SparcFrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
if (!hasReservedCallFrame(MF)) {
@@ -195,7 +195,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
if (Size)
emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
}
- MBB.erase(I);
+ return MBB.erase(I);
}
@@ -350,7 +350,7 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
}
assert(verifyLeafProcRegUse(&MRI));
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
MF.verify(0, "After LeafProc Remapping");
#endif
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index cbb4dc04fc23..ac0e69ccde1e 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -29,7 +29,7 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void
+ MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index c4c641659df3..07948a33cde6 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -15,7 +15,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -42,7 +41,7 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
- SDNode *Select(SDNode *N) override;
+ void Select(SDNode *N) override;
// Complex Pattern Selectors.
bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
@@ -63,7 +62,7 @@ public:
private:
SDNode* getGlobalBaseReg();
- SDNode *SelectInlineAsm(SDNode *N);
+ bool tryInlineAsm(SDNode *N);
};
} // end anonymous namespace
@@ -155,7 +154,7 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
// TODO: fix inline asm support so I can simply tell it that 'i64'
// inputs to asm need to be allocated to the IntPair register type,
// and have that work. Then, delete this function.
-SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){
+bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
std::vector<SDValue> AsmNodeOperands;
unsigned Flag, Kind;
bool Changed = false;
@@ -310,31 +309,32 @@ SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (Glue.getNode())
AsmNodeOperands.push_back(Glue);
if (!Changed)
- return nullptr;
+ return false;
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
- return New.getNode();
+ ReplaceNode(N, New.getNode());
+ return true;
}
-SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
+void SparcDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
switch (N->getOpcode()) {
default: break;
- case ISD::INLINEASM: {
- SDNode *ResNode = SelectInlineAsm(N);
- if (ResNode)
- return ResNode;
+ case ISD::INLINEASM: {
+ if (tryInlineAsm(N))
+ return;
break;
}
case SPISD::GLOBAL_BASE_REG:
- return getGlobalBaseReg();
+ ReplaceNode(N, getGlobalBaseReg());
+ return;
case ISD::SDIV:
case ISD::UDIV: {
@@ -360,8 +360,8 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
// FIXME: Handle div by immediate.
unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
- return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
- TopPart);
+ CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, TopPart);
+ return;
}
case ISD::MULHU:
case ISD::MULHS: {
@@ -373,11 +373,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MulLHS, MulRHS);
SDValue ResultHigh = SDValue(Mul, 1);
ReplaceUses(SDValue(N, 0), ResultHigh);
- return nullptr;
+ CurDAG->RemoveDeadNode(N);
+ return;
}
}
- return SelectCode(N);
+ SelectCode(N);
}
@@ -391,6 +392,7 @@ SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
switch (ConstraintID) {
default: return true;
case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_o:
case InlineAsm::Constraint_m: // memory
if (!SelectADDRrr(Op, Op0, Op1))
SelectADDRri(Op, Op0, Op1);
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 5e70ffe2223c..8738bc82683d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -18,6 +18,7 @@
#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
#include "SparcTargetObjectFile.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -31,7 +32,6 @@
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -184,29 +184,30 @@ static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
// callee's register window. This function translates registers to the
// corresponding caller window %o register.
static unsigned toCallerWindow(unsigned Reg) {
- assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7 && "Unexpected enum");
+ static_assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7,
+ "Unexpected enum");
if (Reg >= SP::I0 && Reg <= SP::I7)
return Reg - SP::I0 + SP::O0;
return Reg;
}
SDValue
-SparcTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
if (Subtarget->is64Bit())
return LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
return LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
}
SDValue
-SparcTargetLowering::LowerReturn_32(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// CCValAssign - represent the assignment of the return value to locations.
@@ -287,11 +288,11 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
// Lower return values for the 64-bit ABI.
// Return values are passed the exactly the same way as function arguments.
SDValue
-SparcTargetLowering::LowerReturn_64(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+SparcTargetLowering::LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
@@ -363,14 +364,10 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
}
-SDValue SparcTargetLowering::
-LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue SparcTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
if (Subtarget->is64Bit())
return LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
DL, DAG, InVals);
@@ -381,14 +378,10 @@ LowerFormalArguments(SDValue Chain,
/// LowerFormalArguments32 - V8 uses a very simple ABI, where all values are
/// passed in either one or two GPRs, including FP values. TODO: we should
/// pass FP values in FP registers for fastcc functions.
-SDValue SparcTargetLowering::
-LowerFormalArguments_32(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue SparcTargetLowering::LowerFormalArguments_32(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
@@ -412,9 +405,8 @@ LowerFormalArguments_32(SDValue Chain,
// Get SRet from [%fp+64].
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- SDValue Arg = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Arg =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
InVals.push_back(Arg);
continue;
}
@@ -435,9 +427,7 @@ LowerFormalArguments_32(SDValue Chain,
int FrameIdx = MF.getFrameInfo()->
CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
} else {
unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
&SP::IntRegsRegClass);
@@ -473,16 +463,15 @@ LowerFormalArguments_32(SDValue Chain,
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (VA.needsCustom()) {
- assert(VA.getValVT() == MVT::f64 || MVT::v2i32);
+ assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
// If it is double-word aligned, just load.
if (Offset % 8 == 0) {
int FI = MF.getFrameInfo()->CreateFixedObject(8,
Offset,
true);
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
- SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
- MachinePointerInfo(),
- false,false, false, 0);
+ SDValue Load =
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
InVals.push_back(Load);
continue;
}
@@ -491,17 +480,15 @@ LowerFormalArguments_32(SDValue Chain,
Offset,
true);
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
- SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue HiVal =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
int FI2 = MF.getFrameInfo()->CreateFixedObject(4,
Offset+4,
true);
SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);
- SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue LoVal =
+ DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo());
if (IsLittleEndian)
std::swap(LoVal, HiVal);
@@ -519,9 +506,7 @@ LowerFormalArguments_32(SDValue Chain,
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue Load ;
if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
- Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
} else if (VA.getValVT() == MVT::f128) {
report_fatal_error("SPARCv8 does not handle f128 in calls; "
"pass indirectly");
@@ -573,9 +558,8 @@ LowerFormalArguments_32(SDValue Chain,
true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr,
- MachinePointerInfo(),
- false, false, 0));
+ OutChains.push_back(
+ DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, MachinePointerInfo()));
ArgOffset += 4;
}
@@ -589,14 +573,10 @@ LowerFormalArguments_32(SDValue Chain,
}
// Lower formal arguments for the 64 bit ABI.
-SDValue SparcTargetLowering::
-LowerFormalArguments_64(SDValue Chain,
- CallingConv::ID CallConv,
- bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue SparcTargetLowering::LowerFormalArguments_64(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
// Analyze arguments according to CC_Sparc64.
@@ -659,10 +639,10 @@ LowerFormalArguments_64(SDValue Chain,
if (VA.isExtInLoc())
Offset += 8 - ValSize;
int FI = MF.getFrameInfo()->CreateFixedObject(ValSize, Offset, true);
- InVals.push_back(DAG.getLoad(
- VA.getValVT(), DL, Chain,
- DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
- MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0));
+ InVals.push_back(
+ DAG.getLoad(VA.getValVT(), DL, Chain,
+ DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
+ MachinePointerInfo::getFixedStack(MF, FI)));
}
if (!IsVarArg)
@@ -690,9 +670,9 @@ LowerFormalArguments_64(SDValue Chain,
SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true);
auto PtrVT = getPointerTy(MF.getDataLayout());
- OutChains.push_back(DAG.getStore(
- Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
- MachinePointerInfo::getFixedStack(MF, FI), false, false, 0));
+ OutChains.push_back(
+ DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
+ MachinePointerInfo::getFixedStack(MF, FI)));
}
if (!OutChains.empty())
@@ -773,16 +753,22 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
unsigned Size = Flags.getByValSize();
unsigned Align = Flags.getByValAlign();
- int FI = MFI->CreateStackObject(Size, Align, false);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
-
- Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
- false, // isVolatile,
- (Size <= 32), // AlwaysInline if size <= 32,
- false, // isTailCall
- MachinePointerInfo(), MachinePointerInfo());
- ByValArgs.push_back(FIPtr);
+ if (Size > 0U) {
+ int FI = MFI->CreateStackObject(Size, Align, false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
+
+ Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+ false, // isVolatile,
+ (Size <= 32), // AlwaysInline if size <= 32,
+ false, // isTailCall
+ MachinePointerInfo(), MachinePointerInfo());
+ ByValArgs.push_back(FIPtr);
+ }
+ else {
+ SDValue nullVal;
+ ByValArgs.push_back(nullVal);
+ }
}
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
@@ -803,8 +789,12 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Use local copy if it is a byval arg.
- if (Flags.isByVal())
+ if (Flags.isByVal()) {
Arg = ByValArgs[byvalArgIdx++];
+ if (!Arg) {
+ continue;
+ }
+ }
// Promote the value if needed.
switch (VA.getLocInfo()) {
@@ -830,9 +820,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
hasStructRetAttr = true;
continue;
}
@@ -847,9 +836,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
continue;
}
}
@@ -884,9 +872,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
}
} else {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
@@ -894,15 +881,13 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo()));
// Store the second part.
PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
}
continue;
}
@@ -926,9 +911,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + StackOffset,
dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
}
@@ -953,8 +937,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
- unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
- ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -999,15 +982,55 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
- Chain = DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
- RVLocs[i].getValVT(), InFlag).getValue(1);
- InFlag = Chain.getValue(2);
- InVals.push_back(Chain.getValue(0));
+ if (RVLocs[i].getLocVT() == MVT::v2i32) {
+ SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2i32);
+ SDValue Lo = DAG.getCopyFromReg(
+ Chain, dl, toCallerWindow(RVLocs[i++].getLocReg()), MVT::i32, InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Lo,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getCopyFromReg(
+ Chain, dl, toCallerWindow(RVLocs[i].getLocReg()), MVT::i32, InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Hi,
+ DAG.getConstant(1, dl, MVT::i32));
+ InVals.push_back(Vec);
+ } else {
+ Chain =
+ DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
+ RVLocs[i].getValVT(), InFlag)
+ .getValue(1);
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Chain.getValue(0));
+ }
}
return Chain;
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3)
+ .Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7)
+ .Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3)
+ .Case("o4", SP::O4).Case("o5", SP::O5).Case("o6", SP::O6).Case("o7", SP::O7)
+ .Case("l0", SP::L0).Case("l1", SP::L1).Case("l2", SP::L2).Case("l3", SP::L3)
+ .Case("l4", SP::L4).Case("l5", SP::L5).Case("l6", SP::L6).Case("l7", SP::L7)
+ .Case("g0", SP::G0).Case("g1", SP::G1).Case("g2", SP::G2).Case("g3", SP::G3)
+ .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7)
+ .Default(0);
+
+ if (Reg)
+ return Reg;
+
+ report_fatal_error("Invalid register name global variable");
+}
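+// (This hook is used, for instance, by the llvm.read_register and
+// llvm.write_register intrinsics when they name one of the registers above.)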
+
// This functions returns true if CalleeName is a ABI function that returns
// a long double (fp128).
static bool isFP128ABICall(const char *CalleeName)
@@ -1131,7 +1154,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());
// Keep stack frames 16-byte aligned.
- ArgsSize = RoundUpToAlignment(ArgsSize, 16);
+ ArgsSize = alignTo(ArgsSize, 16);
// Varargs calls require special treatment.
if (CLI.IsVarArg)
@@ -1194,16 +1217,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);
// Store to %sp+BIAS+128+Offset
- SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff,
- MachinePointerInfo(),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, HiPtrOff, MachinePointerInfo());
// Load into Reg and Reg+1
- SDValue Hi64 = DAG.getLoad(MVT::i64, DL, Store, HiPtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
- SDValue Lo64 = DAG.getLoad(MVT::i64, DL, Store, LoPtrOff,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Hi64 =
+ DAG.getLoad(MVT::i64, DL, Store, HiPtrOff, MachinePointerInfo());
+ SDValue Lo64 =
+ DAG.getLoad(MVT::i64, DL, Store, LoPtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
Hi64));
RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
@@ -1242,9 +1262,8 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
Subtarget->getStackPointerBias() +
128, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
}
// Emit all stores, make sure they occur before the call.
@@ -1267,8 +1286,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
- unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
- ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1375,6 +1393,14 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
+TargetLowering::AtomicExpansionKind
+SparcTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ if (AI->getOperation() == AtomicRMWInst::Xchg &&
+ AI->getType()->getPrimitiveSizeInBits() == 32)
+ return AtomicExpansionKind::None; // Uses xchg instruction
+
+ return AtomicExpansionKind::CmpXChg;
+}
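+// (Consequently an i32 "atomicrmw xchg" can use the native SWAP instruction,
+// while all other atomicrmw operations are expanded by AtomicExpandPass into
+// compare-and-swap loops.)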
+
/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
/// condition.
static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
@@ -1421,7 +1447,7 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
}
}
-SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
+SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
@@ -1436,9 +1462,11 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
- addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
- addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
- addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
+ if (!Subtarget->useSoftFloat()) {
+ addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+ addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+ addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
+ }
if (Subtarget->is64Bit()) {
addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
} else {
@@ -1559,6 +1587,9 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
@@ -1574,9 +1605,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i64,
Subtarget->usePopc() ? Legal : Expand);
setOperationAction(ISD::CTTZ , MVT::i64, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTLZ , MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
setOperationAction(ISD::ROTL , MVT::i64, Expand);
setOperationAction(ISD::ROTR , MVT::i64, Expand);
@@ -1584,15 +1613,17 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
}
// ATOMICs.
- // FIXME: We insert fences for each atomics and generate sub-optimal code
- // for PSO/TSO. Also, implement other atomicrmw operations.
+ // Atomics are supported on SparcV9. 32-bit atomics are also
+ // supported by some Leon SparcV8 variants. Otherwise, atomics
+ // are unsupported.
+ if (Subtarget->isV9() || Subtarget->hasLeonCasa())
+ setMaxAtomicSizeInBitsSupported(64);
+ else
+ setMaxAtomicSizeInBitsSupported(0);
- setInsertFencesForAtomic(true);
+ setMinCmpXchgSizeInBits(32);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32,
- (Subtarget->isV9() ? Legal: Expand));
-
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
@@ -1629,9 +1660,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
setOperationAction(ISD::CTTZ , MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::CTLZ , MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -1730,7 +1759,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
// Setup Runtime library names.
- if (Subtarget->is64Bit()) {
+ if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
setLibcallName(RTLIB::ADD_F128, "_Qp_add");
setLibcallName(RTLIB::SUB_F128, "_Qp_sub");
setLibcallName(RTLIB::MUL_F128, "_Qp_mul");
@@ -1748,7 +1777,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
- } else {
+ } else if (!Subtarget->useSoftFloat()) {
setLibcallName(RTLIB::ADD_F128, "_Q_add");
setLibcallName(RTLIB::SUB_F128, "_Q_sub");
setLibcallName(RTLIB::MUL_F128, "_Q_mul");
@@ -1769,35 +1798,56 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
}
}
+ if (Subtarget->fixAllFDIVSQRT()) {
+ // Promote FDIVS and FSQRTS to FDIVD and FSQRTD instructions instead, as
+ // the former instructions are affected by errata on LEON processors.
+ setOperationAction(ISD::FDIV, MVT::f32, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f32, Promote);
+ }
+
+ if (Subtarget->replaceFMULS()) {
+ // Promote FMULS to FMULD instructions instead, as
+ // the former instruction is affected by errata on LEON processors.
+ setOperationAction(ISD::FMUL, MVT::f32, Promote);
+ }
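For these LEON errata workarounds, Promote means the single-precision operation is carried out in double precision and the result rounded back. Roughly, at the source level (a host-side sketch, not the backend's own code):

  #include <cmath>

  float fdivPromoted(float A, float B) {
    // FDIVS promoted to FDIVD: divide in double, round back to float.
    return static_cast<float>(static_cast<double>(A) / static_cast<double>(B));
  }

  float fsqrtPromoted(float A) {
    // FSQRTS promoted to FSQRTD.
    return static_cast<float>(std::sqrt(static_cast<double>(A)));
  }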
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setMinFunctionAlignment(2);
computeRegisterProperties(Subtarget->getRegisterInfo());
}
+bool SparcTargetLowering::useSoftFloat() const {
+ return Subtarget->useSoftFloat();
+}
+
const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((SPISD::NodeType)Opcode) {
- case SPISD::FIRST_NUMBER: break;
- case SPISD::CMPICC: return "SPISD::CMPICC";
- case SPISD::CMPFCC: return "SPISD::CMPFCC";
- case SPISD::BRICC: return "SPISD::BRICC";
- case SPISD::BRXCC: return "SPISD::BRXCC";
- case SPISD::BRFCC: return "SPISD::BRFCC";
- case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
- case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
- case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
- case SPISD::Hi: return "SPISD::Hi";
- case SPISD::Lo: return "SPISD::Lo";
- case SPISD::FTOI: return "SPISD::FTOI";
- case SPISD::ITOF: return "SPISD::ITOF";
- case SPISD::FTOX: return "SPISD::FTOX";
- case SPISD::XTOF: return "SPISD::XTOF";
- case SPISD::CALL: return "SPISD::CALL";
- case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
+ case SPISD::FIRST_NUMBER: break;
+ case SPISD::CMPICC: return "SPISD::CMPICC";
+ case SPISD::CMPFCC: return "SPISD::CMPFCC";
+ case SPISD::BRICC: return "SPISD::BRICC";
+ case SPISD::BRXCC: return "SPISD::BRXCC";
+ case SPISD::BRFCC: return "SPISD::BRFCC";
+ case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+ case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
+ case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+ case SPISD::EH_SJLJ_SETJMP: return "SPISD::EH_SJLJ_SETJMP";
+ case SPISD::EH_SJLJ_LONGJMP: return "SPISD::EH_SJLJ_LONGJMP";
+ case SPISD::Hi: return "SPISD::Hi";
+ case SPISD::Lo: return "SPISD::Lo";
+ case SPISD::FTOI: return "SPISD::FTOI";
+ case SPISD::ITOF: return "SPISD::ITOF";
+ case SPISD::FTOX: return "SPISD::FTOX";
+ case SPISD::XTOF: return "SPISD::XTOF";
+ case SPISD::CALL: return "SPISD::CALL";
+ case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
- case SPISD::FLUSHW: return "SPISD::FLUSHW";
- case SPISD::TLS_ADD: return "SPISD::TLS_ADD";
- case SPISD::TLS_LD: return "SPISD::TLS_LD";
- case SPISD::TLS_CALL: return "SPISD::TLS_CALL";
+ case SPISD::FLUSHW: return "SPISD::FLUSHW";
+ case SPISD::TLS_ADD: return "SPISD::TLS_ADD";
+ case SPISD::TLS_LD: return "SPISD::TLS_LD";
+ case SPISD::TLS_CALL: return "SPISD::TLS_CALL";
}
return nullptr;
}
@@ -1902,8 +1952,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = getPointerTy(DAG.getDataLayout());
- // Handle PIC mode first.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ // Handle PIC mode first. SPARC needs a GOT load for every variable!
+ if (isPositionIndependent()) {
// This is the pic32 code model, the GOT is known to be smaller than 4GB.
SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
SparcMCExpr::VK_Sparc_GOT10, DAG);
@@ -1914,8 +1964,7 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setHasCalls(true);
return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// This is one of the absolute code models.
@@ -2004,16 +2053,15 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Symbol = withTargetFlags(Op, callTF, DAG);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- SmallVector<SDValue, 4> Ops;
- Ops.push_back(Chain);
- Ops.push_back(Callee);
- Ops.push_back(Symbol);
- Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
DAG.getMachineFunction(), CallingConv::C);
assert(Mask && "Missing call preserved mask for calling convention");
- Ops.push_back(DAG.getRegisterMask(Mask));
- Ops.push_back(InFlag);
+ SDValue Ops[] = {Chain,
+ Callee,
+ Symbol,
+ DAG.getRegister(SP::O0, PtrVT),
+ DAG.getRegisterMask(Mask),
+ InFlag};
Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, DL, true),
@@ -2068,10 +2116,10 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
DAG.getRegister(SP::G7, PtrVT), Offset);
}
-SDValue
-SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
- SDValue Arg, SDLoc DL,
- SelectionDAG &DAG) const {
+SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
+ ArgListTy &Args, SDValue Arg,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -2084,14 +2132,8 @@ SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
// Create a stack object and pass the pointer to the library function.
int FI = MFI->CreateStackObject(16, 8, false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- Chain = DAG.getStore(Chain,
- DL,
- Entry.Node,
- FIPtr,
- MachinePointerInfo(),
- false,
- false,
- 8);
+ Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
+ /* Alignment = */ 8);
Entry.Node = FIPtr;
Entry.Ty = PointerType::getUnqual(ArgTy);
@@ -2136,7 +2178,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
}
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
- .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -2149,19 +2191,13 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
Chain = CallInfo.second;
// Load RetPtr to get the return value.
- return DAG.getLoad(Op.getValueType(),
- SDLoc(Op),
- Chain,
- RetPtr,
- MachinePointerInfo(),
- false, false, false, 8);
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op), Chain, RetPtr,
+ MachinePointerInfo(), /* Alignment = */ 8);
}
-SDValue
-SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
- unsigned &SPCC,
- SDLoc DL,
- SelectionDAG &DAG) const {
+SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+ unsigned &SPCC, const SDLoc &DL,
+ SelectionDAG &DAG) const {
const char *LibCall = nullptr;
bool is64Bit = Subtarget->is64Bit();
@@ -2193,7 +2229,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain)
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -2460,6 +2496,20 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(SPCC, dl, MVT::i32), CompareFlag);
}
+SDValue SparcTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const {
+ SDLoc DL(Op);
+ return DAG.getNode(SPISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue SparcTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const {
+ SDLoc DL(Op);
+ return DAG.getNode(SPISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1));
+}
+
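These two nodes back the GCC-style builtin setjmp/longjmp pair; assuming a frontend that maps the builtins to llvm.eh.sjlj.setjmp / llvm.eh.sjlj.longjmp (clang does), a minimal user-level example of what ends up routed through this lowering:

  #include <cstdio>

  // The intrinsic contract uses a five-word buffer; which slot holds FP, the
  // resume address, SP and %i7 is decided by the backend, not by user code.
  static void *JmpBuf[5];

  static void bail() { __builtin_longjmp(JmpBuf, 1); }

  int main() {
    if (__builtin_setjmp(JmpBuf) == 0) {
      std::puts("first return from setjmp");
      bail(); // control comes back to the setjmp site with a nonzero return
    } else {
      std::puts("second return, via longjmp");
    }
    return 0;
  }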
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
const SparcTargetLowering &TLI) {
MachineFunction &MF = DAG.getMachineFunction();
@@ -2477,7 +2527,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
@@ -2488,20 +2538,19 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
EVT PtrVT = VAListPtr.getValueType();
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc DL(Node);
- SDValue VAList = DAG.getLoad(PtrVT, DL, InChain, VAListPtr,
- MachinePointerInfo(SV), false, false, false, 0);
+ SDValue VAList =
+ DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
// Increment the pointer, VAList, to the next vaarg.
SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getIntPtrConstant(VT.getSizeInBits()/8,
DL));
// Store the incremented VAList to the legalized pointer.
- InChain = DAG.getStore(VAList.getValue(1), DL, NextPtr,
- VAListPtr, MachinePointerInfo(SV), false, false, 0);
+ InChain = DAG.getStore(VAList.getValue(1), DL, NextPtr, VAListPtr,
+ MachinePointerInfo(SV));
// Load the actual argument out of the pointer VAList.
// We can't count on greater alignment than the word size.
return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
- false, false, false,
- std::min(PtrVT.getSizeInBits(), VT.getSizeInBits())/8);
+ std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
}
static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
@@ -2564,8 +2613,7 @@ static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
while (depth--) {
SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
DAG.getIntPtrConstant(Offset, dl));
- FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo(),
- false, false, false, 0);
+ FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo());
}
if (Subtarget->is64Bit())
FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
@@ -2580,7 +2628,6 @@ static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
uint64_t depth = Op.getConstantOperandVal(0);
return getFRAMEADDR(depth, Op, DAG, Subtarget);
-
}
static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
@@ -2613,30 +2660,34 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
dl, VT,
FrameAddr,
DAG.getIntPtrConstant(Offset, dl));
- RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr,
- MachinePointerInfo(), false, false, false, 0);
+ RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr, MachinePointerInfo());
return RetAddr;
}
-static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG, unsigned opcode)
-{
- SDLoc dl(Op);
-
- assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
+ unsigned opcode) {
+ assert(SrcReg64.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
assert(opcode == ISD::FNEG || opcode == ISD::FABS);
// Lower fneg/fabs on f64 to fneg/fabs on f32.
// fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
// fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
- SDValue SrcReg64 = Op.getOperand(0);
+ // Note: in little-endian, the two halves of the floating-point value
+ // are stored in the registers in the opposite order, so the subreg
+ // holding the sign bit is the highest-numbered (odd) one rather than
+ // the lowest-numbered (even) one.
+
SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
SrcReg64);
SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
SrcReg64);
- Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
+ if (DAG.getDataLayout().isLittleEndian())
+ Lo32 = DAG.getNode(opcode, dl, MVT::f32, Lo32);
+ else
+ Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, MVT::f64), 0);
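The endianness note above comes down to which 32-bit half of the f64 value holds the sign bit; a host-side sketch of the same observation (standard C++, not backend code):

  #include <cstdint>
  #include <cstring>

  double fnegByBit(double X) {
    // fneg only flips the sign bit, which lives in the most significant
    // 32-bit half; the f32 subregister aliasing that half gets the
    // fnegs/fabss, the other half is simply copied with fmovs.
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));
    Bits ^= (1ULL << 63);
    double R;
    std::memcpy(&R, &Bits, sizeof(R));
    return R;
  }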
@@ -2652,29 +2703,22 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
{
SDLoc dl(Op);
LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
- assert(LdNode && LdNode->getOffset().getOpcode() == ISD::UNDEF
+ assert(LdNode && LdNode->getOffset().isUndef()
&& "Unexpected node type");
unsigned alignment = LdNode->getAlignment();
if (alignment > 8)
alignment = 8;
- SDValue Hi64 = DAG.getLoad(MVT::f64,
- dl,
- LdNode->getChain(),
- LdNode->getBasePtr(),
- LdNode->getPointerInfo(),
- false, false, false, alignment);
+ SDValue Hi64 =
+ DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(),
+ LdNode->getPointerInfo(), alignment);
EVT addrVT = LdNode->getBasePtr().getValueType();
SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
LdNode->getBasePtr(),
DAG.getConstant(8, dl, addrVT));
- SDValue Lo64 = DAG.getLoad(MVT::f64,
- dl,
- LdNode->getChain(),
- LoPtr,
- LdNode->getPointerInfo(),
- false, false, false, alignment);
+ SDValue Lo64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LoPtr,
+ LdNode->getPointerInfo(), alignment);
SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
@@ -2713,7 +2757,7 @@ static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
- assert(StNode && StNode->getOffset().getOpcode() == ISD::UNDEF
+ assert(StNode && StNode->getOffset().isUndef()
&& "Unexpected node type");
SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
@@ -2734,22 +2778,15 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
alignment = 8;
SDValue OutChains[2];
- OutChains[0] = DAG.getStore(StNode->getChain(),
- dl,
- SDValue(Hi64, 0),
- StNode->getBasePtr(),
- MachinePointerInfo(),
- false, false, alignment);
+ OutChains[0] =
+ DAG.getStore(StNode->getChain(), dl, SDValue(Hi64, 0),
+ StNode->getBasePtr(), MachinePointerInfo(), alignment);
EVT addrVT = StNode->getBasePtr().getValueType();
SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
StNode->getBasePtr(),
DAG.getConstant(8, dl, addrVT));
- OutChains[1] = DAG.getStore(StNode->getChain(),
- dl,
- SDValue(Lo64, 0),
- LoPtr,
- MachinePointerInfo(),
- false, false, alignment);
+ OutChains[1] = DAG.getStore(StNode->getChain(), dl, SDValue(Lo64, 0), LoPtr,
+ MachinePointerInfo(), alignment);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -2768,8 +2805,7 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
SDValue Chain = DAG.getStore(
St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(), St->getAlignment(),
- St->getAAInfo());
+ St->isVolatile(), St->getMemOperand()->getFlags(), St->getAAInfo());
return Chain;
}
@@ -2780,24 +2816,35 @@ static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
&& "invalid opcode");
+ SDLoc dl(Op);
+
if (Op.getValueType() == MVT::f64)
- return LowerF64Op(Op, DAG, Op.getOpcode());
+ return LowerF64Op(Op.getOperand(0), dl, DAG, Op.getOpcode());
if (Op.getValueType() != MVT::f128)
return Op;
// Lower fabs/fneg on f128 to fabs/fneg on f64
// fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
+ // (As with LowerF64Op, on little-endian, we need to negate the odd
+ // subreg)
- SDLoc dl(Op);
SDValue SrcReg128 = Op.getOperand(0);
SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
SrcReg128);
SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
SrcReg128);
- if (isV9)
- Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
- else
- Hi64 = LowerF64Op(Hi64, DAG, Op.getOpcode());
+
+ if (DAG.getDataLayout().isLittleEndian()) {
+ if (isV9)
+ Lo64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Lo64);
+ else
+ Lo64 = LowerF64Op(Lo64, dl, DAG, Op.getOpcode());
+ } else {
+ if (isV9)
+ Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+ else
+ Hi64 = LowerF64Op(Hi64, dl, DAG, Op.getOpcode());
+ }
SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, MVT::f128), 0);
@@ -2906,12 +2953,25 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
+ if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+ // Expand with a fence.
+ return SDValue();
+
// Monotonic load/stores are legal.
- if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
- return Op;
+ return Op;
+}
- // Otherwise, expand with a fence.
- return SDValue();
+SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getRegister(SP::G7, PtrVT);
+ }
+ }
}
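With a frontend that maps __builtin_thread_pointer() to llvm.thread.pointer (clang does on supported targets), the new hook should make the following compile on SPARC to a bare copy out of %g7 (illustration only):

  void *currentThreadPointer() {
    // Lowered via Intrinsic::thread_pointer to a read of %g7.
    return __builtin_thread_pointer();
  }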
SDValue SparcTargetLowering::
@@ -2943,6 +3003,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
hasHardQuad);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, *this,
hasHardQuad);
+ case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG, *this);
+ case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG, *this);
case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
@@ -2972,14 +3034,15 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
MachineBasicBlock *
-SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Unknown SELECT_CC!");
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unknown Custom Instruction!");
case SP::SELECT_CC_Int_ICC:
case SP::SELECT_CC_FP_ICC:
case SP::SELECT_CC_DFP_ICC:
@@ -2990,61 +3053,21 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case SP::SELECT_CC_DFP_FCC:
case SP::SELECT_CC_QFP_FCC:
return expandSelectCC(MI, BB, SP::FBCOND);
+ case SP::EH_SJLJ_SETJMP32ri:
+ case SP::EH_SJLJ_SETJMP32rr:
+ return emitEHSjLjSetJmp(MI, BB);
+ case SP::EH_SJLJ_LONGJMP32rr:
+ case SP::EH_SJLJ_LONGJMP32ri:
+ return emitEHSjLjLongJmp(MI, BB);
+ }
+}
- case SP::ATOMIC_LOAD_ADD_32:
- return expandAtomicRMW(MI, BB, SP::ADDrr);
- case SP::ATOMIC_LOAD_ADD_64:
- return expandAtomicRMW(MI, BB, SP::ADDXrr);
- case SP::ATOMIC_LOAD_SUB_32:
- return expandAtomicRMW(MI, BB, SP::SUBrr);
- case SP::ATOMIC_LOAD_SUB_64:
- return expandAtomicRMW(MI, BB, SP::SUBXrr);
- case SP::ATOMIC_LOAD_AND_32:
- return expandAtomicRMW(MI, BB, SP::ANDrr);
- case SP::ATOMIC_LOAD_AND_64:
- return expandAtomicRMW(MI, BB, SP::ANDXrr);
- case SP::ATOMIC_LOAD_OR_32:
- return expandAtomicRMW(MI, BB, SP::ORrr);
- case SP::ATOMIC_LOAD_OR_64:
- return expandAtomicRMW(MI, BB, SP::ORXrr);
- case SP::ATOMIC_LOAD_XOR_32:
- return expandAtomicRMW(MI, BB, SP::XORrr);
- case SP::ATOMIC_LOAD_XOR_64:
- return expandAtomicRMW(MI, BB, SP::XORXrr);
- case SP::ATOMIC_LOAD_NAND_32:
- return expandAtomicRMW(MI, BB, SP::ANDrr);
- case SP::ATOMIC_LOAD_NAND_64:
- return expandAtomicRMW(MI, BB, SP::ANDXrr);
-
- case SP::ATOMIC_SWAP_64:
- return expandAtomicRMW(MI, BB, 0);
-
- case SP::ATOMIC_LOAD_MAX_32:
- return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_G);
- case SP::ATOMIC_LOAD_MAX_64:
- return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_G);
- case SP::ATOMIC_LOAD_MIN_32:
- return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LE);
- case SP::ATOMIC_LOAD_MIN_64:
- return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LE);
- case SP::ATOMIC_LOAD_UMAX_32:
- return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_GU);
- case SP::ATOMIC_LOAD_UMAX_64:
- return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_GU);
- case SP::ATOMIC_LOAD_UMIN_32:
- return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LEU);
- case SP::ATOMIC_LOAD_UMIN_64:
- return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LEU);
- }
-}
-
-MachineBasicBlock*
-SparcTargetLowering::expandSelectCC(MachineInstr *MI,
- MachineBasicBlock *BB,
+MachineBasicBlock *
+SparcTargetLowering::expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
unsigned BROpcode) const {
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
- unsigned CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned CC = (SPCC::CondCodes)MI.getOperand(3).getImm();
// To "insert" a SELECT_CC instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -3089,107 +3112,211 @@ SparcTargetLowering::expandSelectCC(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+ BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(thisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
-MachineBasicBlock*
-SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned Opcode,
- unsigned CondCode) const {
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ unsigned RegSize = PVT.getStoreSize();
+ assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+ unsigned Buf = MI.getOperand(0).getReg();
+ unsigned JmpLoc = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+
+ // TODO: If we add 64-bit handling, this should probably be FLUSHW rather than TA 3.
+ MIB = BuildMI(*MBB, MI, DL, TII->get(SP::TRAPri), SP::G0)
+ .addImm(3)
+ .addImm(SPCC::ICC_A);
+
+ // Instruction to restore FP
+ const unsigned FP = SP::I6;
+ MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+ .addReg(FP)
+ .addReg(Buf)
+ .addImm(0);
- // MI is an atomic read-modify-write instruction of the form:
+ // Instruction to load jmp location
+ MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+ .addReg(JmpLoc, RegState::Define)
+ .addReg(Buf)
+ .addImm(RegSize);
+
+ // Instruction to restore SP
+ const unsigned SP = SP::O6;
+ MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+ .addReg(SP)
+ .addReg(Buf)
+ .addImm(2 * RegSize);
+
+ // Instruction to restore I7
+ MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+ .addReg(SP::I7)
+ .addReg(Buf, RegState::Kill)
+ .addImm(3 * RegSize);
+
+ // Jump to JmpLoc
+ BuildMI(*MBB, MI, DL, TII->get(SP::JMPLrr))
+ .addReg(SP::G0)
+ .addReg(JmpLoc, RegState::Kill)
+ .addReg(SP::G0);
+
+ MI.eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB;
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ unsigned RegSize = PVT.getStoreSize();
+ assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ // For v = setjmp(buf), we generate
//
- // rd = atomicrmw<op> addr, rs2
+ // thisMBB:
+ // buf[0] = FP
+ // buf[RegSize] = restoreMBB <-- takes address of restoreMBB
+ // buf[RegSize * 2] = O6
+ // buf[RegSize * 3] = I7
+ // Ensure restoreMBB remains in the relocations list (done using a bn instruction)
+ // b mainMBB
//
- // All three operands are registers.
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned AddrReg = MI->getOperand(1).getReg();
- unsigned Rs2Reg = MI->getOperand(2).getReg();
-
- // SelectionDAG has already inserted memory barriers before and after MI, so
- // we simply have to implement the operatiuon in terms of compare-and-swap.
+ // mainMBB:
+ // v_main = 0
+ // b sinkMBB
//
- // %val0 = load %addr
- // loop:
- // %val = phi %val0, %dest
- // %upd = op %val, %rs2
- // %dest = cas %addr, %val, %upd
- // cmp %val, %dest
- // bne loop
- // done:
+ // restoreMBB:
+ // v_restore = 1
+ // --fall through--
//
- bool is64Bit = SP::I64RegsRegClass.hasSubClassEq(MRI.getRegClass(DestReg));
- const TargetRegisterClass *ValueRC =
- is64Bit ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
- unsigned Val0Reg = MRI.createVirtualRegister(ValueRC);
+ // sinkMBB:
+ // v = phi(main, restore)
- BuildMI(*MBB, MI, DL, TII.get(is64Bit ? SP::LDXri : SP::LDri), Val0Reg)
- .addReg(AddrReg).addImm(0);
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator It = ++MBB->getIterator();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
- // Split the basic block MBB before MI and insert the loop block in the hole.
- MachineFunction::iterator MFI = MBB->getIterator();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction *MF = MBB->getParent();
- MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *DoneMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- ++MFI;
- MF->insert(MFI, LoopMBB);
- MF->insert(MFI, DoneMBB);
-
- // Move MI and following instructions to DoneMBB.
- DoneMBB->splice(DoneMBB->begin(), MBB, MI, MBB->end());
- DoneMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // Connect the CFG again.
- MBB->addSuccessor(LoopMBB);
- LoopMBB->addSuccessor(LoopMBB);
- LoopMBB->addSuccessor(DoneMBB);
-
- // Build the loop block.
- unsigned ValReg = MRI.createVirtualRegister(ValueRC);
- // Opcode == 0 means try to write Rs2Reg directly (ATOMIC_SWAP).
- unsigned UpdReg = (Opcode ? MRI.createVirtualRegister(ValueRC) : Rs2Reg);
-
- BuildMI(LoopMBB, DL, TII.get(SP::PHI), ValReg)
- .addReg(Val0Reg).addMBB(MBB)
- .addReg(DestReg).addMBB(LoopMBB);
-
- if (CondCode) {
- // This is one of the min/max operations. We need a CMPrr followed by a
- // MOVXCC/MOVICC.
- BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(Rs2Reg);
- BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
- .addReg(ValReg).addReg(Rs2Reg).addImm(CondCode);
- } else if (Opcode) {
- BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
- .addReg(ValReg).addReg(Rs2Reg);
- }
-
- if (MI->getOpcode() == SP::ATOMIC_LOAD_NAND_32 ||
- MI->getOpcode() == SP::ATOMIC_LOAD_NAND_64) {
- unsigned TmpReg = UpdReg;
- UpdReg = MRI.createVirtualRegister(ValueRC);
- BuildMI(LoopMBB, DL, TII.get(SP::XORri), UpdReg).addReg(TmpReg).addImm(-1);
- }
-
- BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::CASXrr : SP::CASrr), DestReg)
- .addReg(AddrReg).addReg(ValReg).addReg(UpdReg)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(DestReg);
- BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::BPXCC : SP::BCOND))
- .addMBB(LoopMBB).addImm(SPCC::ICC_NE);
-
- MI->eraseFromParent();
- return DoneMBB;
+ MF->insert(It, mainMBB);
+ MF->insert(It, restoreMBB);
+ MF->insert(It, sinkMBB);
+ restoreMBB->setHasAddressTaken();
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ unsigned LabelReg = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+ unsigned LabelReg2 = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+ unsigned BufReg = MI.getOperand(1).getReg();
+
+ // Instruction to store FP
+ const unsigned FP = SP::I6;
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(0)
+ .addReg(FP);
+
+ // Instructions to store jmp location
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::SETHIi))
+ .addReg(LabelReg, RegState::Define)
+ .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_HI);
+
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::ORri))
+ .addReg(LabelReg2, RegState::Define)
+ .addReg(LabelReg, RegState::Kill)
+ .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_LO);
+
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(RegSize)
+ .addReg(LabelReg2, RegState::Kill);
+
+ // Instruction to store SP
+ const unsigned SP = SP::O6;
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(2 * RegSize)
+ .addReg(SP);
+
+ // Instruction to store I7
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+ .addReg(BufReg)
+ .addImm(3 * RegSize)
+ .addReg(SP::I7);
+
+
+ // FIXME: The next instruction exists only to keep the address of restoreMBB
+ // valid through later optimization passes; the ICC_N condition guarantees
+ // the branch is never taken. The commented-out instruction below was an
+ // alternative attempt at the same thing that caused numerous problems.
+ //MIB = BuildMI(thisMBB, DL, TII->get(SP::EH_SjLj_Setup)).addMBB(restoreMBB, SparcMCExpr::VK_Sparc_None);
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+ .addMBB(restoreMBB)
+ .addImm(SPCC::ICC_N);
+
+ MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+ .addMBB(mainMBB)
+ .addImm(SPCC::ICC_A);
+
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+
+ // mainMBB:
+ MIB = BuildMI(mainMBB, DL, TII->get(SP::ORrr))
+ .addReg(mainDstReg, RegState::Define)
+ .addReg(SP::G0)
+ .addReg(SP::G0);
+ MIB = BuildMI(mainMBB, DL, TII->get(SP::BCOND))
+ .addMBB(sinkMBB)
+ .addImm(SPCC::ICC_A);
+
+ mainMBB->addSuccessor(sinkMBB);
+
+
+ // restoreMBB:
+ MIB = BuildMI(restoreMBB, DL, TII->get(SP::ORri))
+ .addReg(restoreDstReg, RegState::Define)
+ .addReg(SP::G0)
+ .addImm(1);
+ //MIB = BuildMI(restoreMBB, DL, TII->get(SP::BCOND)).addMBB(sinkMBB).addImm(SPCC::ICC_A);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ MIB = BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(SP::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ MI.eraseFromParent();
+ return sinkMBB;
}
//===----------------------------------------------------------------------===//
@@ -3202,8 +3329,11 @@ SparcTargetLowering::ConstraintType
SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
- default: break;
- case 'r': return C_RegisterClass;
+ default:
+ break;
+ case 'f':
+ case 'r':
+ return C_RegisterClass;
case 'I': // SIMM13
return C_Other;
}
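The new 'f' constraint (mapped to FPRegsRegClass in the companion change below) mirrors GCC's SPARC inline-asm constraint for single-precision FP registers. A small user-level sketch of what it enables (illustration only):

  float fnegViaAsm(float X) {
    float R;
    // 'f' asks the register allocator for %f registers on both operands.
    asm("fnegs %1, %0" : "=f"(R) : "f"(X));
    return R;
  }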
@@ -3277,6 +3407,9 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ case 'f':
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+
case 'r':
if (VT == MVT::v2i32)
return std::make_pair(0U, &SP::IntPairRegClass);
@@ -3368,10 +3501,9 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc dl(N);
SDValue LoadRes = DAG.getExtLoad(
- Ld->getExtensionType(), dl, MVT::v2i32,
- Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
- MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo());
+ Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(), Ld->getAAInfo());
SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
Results.push_back(Res);
@@ -3380,3 +3512,16 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux.
+bool SparcTargetLowering::useLoadStackGuardNode() const {
+ if (!Subtarget->isTargetLinux())
+ return TargetLowering::useLoadStackGuardNode();
+ return true;
+}
+
+// Override to disable global variable loading on Linux.
+void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (!Subtarget->isTargetLinux())
+ return TargetLowering::insertSSPDeclarations(M);
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 4e46709cfc09..e0a421b83712 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -33,6 +33,9 @@ namespace llvm {
SELECT_XCC, // Select between two values using the current XCC flags.
SELECT_FCC, // Select between two values using the current FCC flags.
+ EH_SJLJ_SETJMP, // builtin setjmp operation
+ EH_SJLJ_LONGJMP, // builtin longjmp operation
+
Hi, Lo, // Hi/Lo operations, typically on a global address.
FTOI, // FP to Int within a FP register.
@@ -54,9 +57,11 @@ namespace llvm {
class SparcTargetLowering : public TargetLowering {
const SparcSubtarget *Subtarget;
public:
- SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI);
+ SparcTargetLowering(const TargetMachine &TM, const SparcSubtarget &STI);
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
+
+ bool useSoftFloat() const override;
+
/// computeKnownBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
@@ -67,8 +72,8 @@ namespace llvm {
unsigned Depth = 0) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
@@ -80,6 +85,14 @@ namespace llvm {
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
@@ -89,6 +102,9 @@ namespace llvm {
return MVT::i32;
}
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
@@ -103,28 +119,28 @@ namespace llvm {
return SP::I1;
}
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+
/// getSetCCResultType - Return the ISD::SETCC ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerFormalArguments_32(SDValue Chain,
- CallingConv::ID CallConv,
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments_32(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue LowerFormalArguments_64(SDValue Chain,
- CallingConv::ID CallConv,
+ SDValue LowerFormalArguments_64(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
@@ -135,44 +151,46 @@ namespace llvm {
SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
- SDValue LowerReturn_32(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+ SDValue LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const;
- SDValue LowerReturn_64(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+ SDValue LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const;
+ const SDLoc &DL, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const;
+ SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) const;
+
unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
SelectionDAG &DAG) const;
SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
- SDValue Arg, SDLoc DL,
- SelectionDAG &DAG) const;
+ SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args, SDValue Arg,
+ const SDLoc &DL, SelectionDAG &DAG) const;
SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
const char *LibFuncName,
unsigned numArgs) const;
- SDValue LowerF128Compare(SDValue LHS, SDValue RHS,
- unsigned &SPCC,
- SDLoc DL,
- SelectionDAG &DAG) const;
+ SDValue LowerF128Compare(SDValue LHS, SDValue RHS, unsigned &SPCC,
+ const SDLoc &DL, SelectionDAG &DAG) const;
+
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool ShouldShrinkFPConstant(EVT VT) const override {
// Do not shrink FP constpool if VT == MVT::f128.
@@ -180,16 +198,25 @@ namespace llvm {
return VT != MVT::f128;
}
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ // FIXME: We insert fences around each atomic operation and generate
+ // suboptimal code for PSO/TSO. (Approximately nobody uses any memory
+ // model but TSO, which makes this even more silly.)
+ return true;
+ }
+
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
void ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>& Results,
SelectionDAG &DAG) const override;
- MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
+ MachineBasicBlock *expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
unsigned BROpcode) const;
- MachineBasicBlock *expandAtomicRMW(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Opcode,
- unsigned CondCode = 0) const;
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 419e8ccb1024..f6518c936ebc 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -492,7 +492,7 @@ let Predicates = [Is64Bit], Constraints = "$swap = $rd", asi = 0b10000000 in {
I64Regs:$swap),
"casx [$rs1], $rs2, $rd",
[(set i64:$rd,
- (atomic_cmp_swap i64:$rs1, i64:$rs2, i64:$swap))]>;
+ (atomic_cmp_swap_64 i64:$rs1, i64:$rs2, i64:$swap))]>;
} // Predicates = [Is64Bit], Constraints = ...
@@ -501,48 +501,15 @@ let Predicates = [Is64Bit] in {
def : Pat<(atomic_fence imm, imm), (MEMBARi 0xf)>;
// atomic_load_64 addr -> load addr
-def : Pat<(i64 (atomic_load ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
-def : Pat<(i64 (atomic_load ADDRri:$src)), (LDXri ADDRri:$src)>;
+def : Pat<(i64 (atomic_load_64 ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
+def : Pat<(i64 (atomic_load_64 ADDRri:$src)), (LDXri ADDRri:$src)>;
// atomic_store_64 val, addr -> store val, addr
-def : Pat<(atomic_store ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
-def : Pat<(atomic_store ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_64 ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_64 ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
} // Predicates = [Is64Bit]
-let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
- Defs = [ICC] in
-multiclass AtomicRMW<SDPatternOperator op32, SDPatternOperator op64> {
-
- def _32 : Pseudo<(outs IntRegs:$rd),
- (ins ptr_rc:$addr, IntRegs:$rs2), "",
- [(set i32:$rd, (op32 iPTR:$addr, i32:$rs2))]>;
-
- let Predicates = [Is64Bit] in
- def _64 : Pseudo<(outs I64Regs:$rd),
- (ins ptr_rc:$addr, I64Regs:$rs2), "",
- [(set i64:$rd, (op64 iPTR:$addr, i64:$rs2))]>;
-}
-
-defm ATOMIC_LOAD_ADD : AtomicRMW<atomic_load_add_32, atomic_load_add_64>;
-defm ATOMIC_LOAD_SUB : AtomicRMW<atomic_load_sub_32, atomic_load_sub_64>;
-defm ATOMIC_LOAD_AND : AtomicRMW<atomic_load_and_32, atomic_load_and_64>;
-defm ATOMIC_LOAD_OR : AtomicRMW<atomic_load_or_32, atomic_load_or_64>;
-defm ATOMIC_LOAD_XOR : AtomicRMW<atomic_load_xor_32, atomic_load_xor_64>;
-defm ATOMIC_LOAD_NAND : AtomicRMW<atomic_load_nand_32, atomic_load_nand_64>;
-defm ATOMIC_LOAD_MIN : AtomicRMW<atomic_load_min_32, atomic_load_min_64>;
-defm ATOMIC_LOAD_MAX : AtomicRMW<atomic_load_max_32, atomic_load_max_64>;
-defm ATOMIC_LOAD_UMIN : AtomicRMW<atomic_load_umin_32, atomic_load_umin_64>;
-defm ATOMIC_LOAD_UMAX : AtomicRMW<atomic_load_umax_32, atomic_load_umax_64>;
-
-// There is no 64-bit variant of SWAP, so use a pseudo.
-let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
- Defs = [ICC], Predicates = [Is64Bit] in
-def ATOMIC_SWAP_64 : Pseudo<(outs I64Regs:$rd),
- (ins ptr_rc:$addr, I64Regs:$rs2), "",
- [(set i64:$rd,
- (atomic_swap_64 iPTR:$addr, i64:$rs2))]>;
-
let Predicates = [Is64Bit], hasSideEffects = 1, Uses = [ICC], cc = 0b10 in
defm TXCC : TRAP<"%xcc">;
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index 361d21440a97..df570cea8da8 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -136,59 +136,68 @@ multiclass int_cond_alias<string cond, int condVal> {
(FMOVQ_XCC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
Requires<[Is64Bit, HasHardQuad]>;
- // t<cond> %icc, rs1 + rs2
- def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
- (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
- Requires<[HasV9]>;
-
// t<cond> %icc, rs => t<cond> %icc, G0 + rs
def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs2"),
(TICCrr G0, IntRegs:$rs2, condVal)>,
Requires<[HasV9]>;
-
- // t<cond> %xcc, rs1 + rs2
- def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
- (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ // t<cond> %icc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
+ (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
Requires<[HasV9]>;
+
// t<cond> %xcc, rs => t<cond> %xcc, G0 + rs
def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs2"),
(TXCCrr G0, IntRegs:$rs2, condVal)>,
Requires<[HasV9]>;
+ // t<cond> %xcc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
+ (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
- // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2
- def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
- (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
// t<cond> rs=> t<cond> %icc, G0 + rs2
- def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
- (TICCrr G0, IntRegs:$rs2, condVal)>;
+ //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
+ // (TICCrr G0, IntRegs:$rs2, condVal)>,
+ // Requires<[HasV9]>;
+
+ // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2
+ //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+ // (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ // Requires<[HasV9]>;
- // t<cond> %icc, rs1 + imm
- def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
- (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
- Requires<[HasV9]>;
// t<cond> %icc, imm => t<cond> %icc, G0 + imm
def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $imm"),
(TICCri G0, i32imm:$imm, condVal)>,
Requires<[HasV9]>;
- // t<cond> %xcc, rs1 + imm
- def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
- (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ // t<cond> %icc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
+ (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
Requires<[HasV9]>;
// t<cond> %xcc, imm => t<cond> %xcc, G0 + imm
def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $imm"),
(TXCCri G0, i32imm:$imm, condVal)>,
Requires<[HasV9]>;
+ // t<cond> %xcc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
+ (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> imm => t<cond> G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
+ (TRAPri G0, i32imm:$imm, condVal)>;
- // t<cond> rs1 + imm => t<cond> %icc, rs1 + imm
+ // t<cond> rs1 + imm => t<cond> rs1 + imm
def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $imm"),
- (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>;
+ (TRAPri IntRegs:$rs1, i32imm:$imm, condVal)>;
- // t<cond> imm => t<cond> %icc, G0 + imm
- def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
- (TICCri G0, i32imm:$imm, condVal)>;
+ // t<cond> rs1 => t<cond> G0 + rs1
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1"),
+ (TRAPrr G0, IntRegs:$rs1, condVal)>;
+ // t<cond> rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+ (TRAPrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
}
@@ -244,14 +253,23 @@ multiclass fp_cond_alias<string cond, int condVal> {
Requires<[HasV9, HasHardQuad]>;
}
+
+// Instruction aliases for co-processor conditional branches.
+multiclass cp_cond_alias<string cond, int condVal> {
+
+ // cb<cond> $imm
+ def : InstAlias<!strconcat(!strconcat("cb", cond), " $imm"),
+ (CBCOND brtarget:$imm, condVal), 0>;
+
+ // cb<cond>,a $imm
+ def : InstAlias<!strconcat(!strconcat("cb", cond), ",a $imm"),
+ (CBCONDA brtarget:$imm, condVal), 0>;
+}
+
defm : int_cond_alias<"a", 0b1000>;
-defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : int_cond_alias<"n", 0b0000>;
defm : int_cond_alias<"ne", 0b1001>;
-defm : int_cond_alias<"nz", 0b1001>; // same as ne
defm : int_cond_alias<"e", 0b0001>;
-defm : int_cond_alias<"eq", 0b0001>; // same as e
-defm : int_cond_alias<"z", 0b0001>; // same as e
defm : int_cond_alias<"g", 0b1010>;
defm : int_cond_alias<"le", 0b0010>;
defm : int_cond_alias<"ge", 0b1011>;
@@ -259,16 +277,21 @@ defm : int_cond_alias<"l", 0b0011>;
defm : int_cond_alias<"gu", 0b1100>;
defm : int_cond_alias<"leu", 0b0100>;
defm : int_cond_alias<"cc", 0b1101>;
-defm : int_cond_alias<"geu", 0b1101>; // same as cc
defm : int_cond_alias<"cs", 0b0101>;
-defm : int_cond_alias<"lu", 0b0101>; // same as cs
defm : int_cond_alias<"pos", 0b1110>;
defm : int_cond_alias<"neg", 0b0110>;
defm : int_cond_alias<"vc", 0b1111>;
defm : int_cond_alias<"vs", 0b0111>;
-
+let EmitPriority = 0 in
+{
+ defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
+ defm : int_cond_alias<"nz", 0b1001>; // same as ne
+ defm : int_cond_alias<"eq", 0b0001>; // same as e
+ defm : int_cond_alias<"z", 0b0001>; // same as e
+ defm : int_cond_alias<"geu", 0b1101>; // same as cc
+ defm : int_cond_alias<"lu", 0b0101>; // same as cs
+}
defm : fp_cond_alias<"a", 0b1000>;
-defm : fp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : fp_cond_alias<"n", 0b0000>;
defm : fp_cond_alias<"u", 0b0111>;
defm : fp_cond_alias<"g", 0b0110>;
@@ -277,15 +300,37 @@ defm : fp_cond_alias<"l", 0b0100>;
defm : fp_cond_alias<"ul", 0b0011>;
defm : fp_cond_alias<"lg", 0b0010>;
defm : fp_cond_alias<"ne", 0b0001>;
-defm : fp_cond_alias<"nz", 0b0001>; // same as ne
defm : fp_cond_alias<"e", 0b1001>;
-defm : fp_cond_alias<"z", 0b1001>; // same as e
defm : fp_cond_alias<"ue", 0b1010>;
defm : fp_cond_alias<"ge", 0b1011>;
defm : fp_cond_alias<"uge", 0b1100>;
defm : fp_cond_alias<"le", 0b1101>;
defm : fp_cond_alias<"ule", 0b1110>;
defm : fp_cond_alias<"o", 0b1111>;
+let EmitPriority = 0 in
+{
+ defm : fp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
+ defm : fp_cond_alias<"nz", 0b0001>; // same as ne
+ defm : fp_cond_alias<"z", 0b1001>; // same as e
+}
+
+defm : cp_cond_alias<"a", 0b1000>;
+defm : cp_cond_alias<"n", 0b0000>;
+defm : cp_cond_alias<"3", 0b0111>;
+defm : cp_cond_alias<"2", 0b0110>;
+defm : cp_cond_alias<"23", 0b0101>;
+defm : cp_cond_alias<"1", 0b0100>;
+defm : cp_cond_alias<"13", 0b0011>;
+defm : cp_cond_alias<"12", 0b0010>;
+defm : cp_cond_alias<"123", 0b0001>;
+defm : cp_cond_alias<"0", 0b1001>;
+defm : cp_cond_alias<"03", 0b1010>;
+defm : cp_cond_alias<"02", 0b1011>;
+defm : cp_cond_alias<"023", 0b1100>;
+defm : cp_cond_alias<"01", 0b1101>;
+defm : cp_cond_alias<"013", 0b1110>;
+defm : cp_cond_alias<"012", 0b1111>;
+let EmitPriority = 0 in defm : cp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
// Section A.3 Synthetic Instructions
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index 74ccf551e473..76366c6695f4 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -7,8 +7,9 @@
//
//===----------------------------------------------------------------------===//
-class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
- : Instruction {
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : Instruction {
field bits<32> Inst;
let Namespace = "SP";
@@ -24,6 +25,8 @@ class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
let DecoderNamespace = "Sparc";
field bits<32> SoftFail = 0;
+
+ let Itinerary = itin;
}
//===----------------------------------------------------------------------===//
@@ -31,8 +34,9 @@ class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
//===----------------------------------------------------------------------===//
// Format 2 instructions
-class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern> {
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
bits<3> op2;
bits<22> imm22;
let op = 0; // op = 0
@@ -42,8 +46,9 @@ class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
// Specific F2 classes: SparcV8 manual, page 44
//
-class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
- : F2<outs, ins, asmstr, pattern> {
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F2<outs, ins, asmstr, pattern, itin> {
bits<5> rd;
let op2 = op2Val;
@@ -52,7 +57,8 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
}
class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
- list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F2<outs, ins, asmstr, pattern, itin> {
bits<4> cond;
let op2 = op2Val;
@@ -61,8 +67,9 @@ class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
}
class F2_3<bits<3> op2Val, bit annul, bit pred,
- dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern> {
+ dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
bits<2> cc;
bits<4> cond;
bits<19> imm19;
@@ -77,9 +84,9 @@ class F2_3<bits<3> op2Val, bit annul, bit pred,
let Inst{18-0} = imm19;
}
-class F2_4<bits<3> cond, bit annul, bit pred,
- dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern> {
+class F2_4<bits<3> cond, bit annul, bit pred, dag outs, dag ins,
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
bits<16> imm16;
bits<5> rs1;
@@ -100,8 +107,9 @@ class F2_4<bits<3> cond, bit annul, bit pred,
// Format #3 instruction classes in the Sparc
//===----------------------------------------------------------------------===//
-class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern> {
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
bits<5> rd;
bits<6> op3;
bits<5> rs1;
@@ -114,7 +122,8 @@ class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
// Specific F3 classes: SparcV8 manual, page 44
//
class F3_1_asi<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<8> asi;
bits<5> rs2;
@@ -127,13 +136,14 @@ class F3_1_asi<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
}
class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins, string asmstr,
- list<dag> pattern> : F3_1_asi<opVal, op3val, outs, ins,
- asmstr, pattern> {
+ list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3_1_asi<opVal, op3val, outs, ins, asmstr, pattern, itin> {
let asi = 0;
}
class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<13> simm13;
let op = opVal;
@@ -145,7 +155,8 @@ class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
// floating-point
class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<5> rs2;
let op = opVal;
@@ -157,7 +168,8 @@ class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
// floating-point unary operations.
class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<5> rs2;
let op = opVal;
@@ -170,7 +182,8 @@ class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
// floating-point compares.
class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<5> rs2;
let op = opVal;
@@ -182,7 +195,8 @@ class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
// Shift by register rs2.
class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
bit x = xVal; // 1 for 64-bit shifts.
bits<5> rs2;
@@ -196,7 +210,8 @@ class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
// Shift by immediate.
class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
- string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+ : F3<outs, ins, asmstr, pattern, itin> {
bit x = xVal; // 1 for 64-bit shifts.
bits<6> shcnt; // shcnt32 / shcnt64.
@@ -210,17 +225,21 @@ class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
// Define rr and ri shift instructions with patterns.
multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
- ValueType VT, RegisterClass RC> {
+ ValueType VT, RegisterClass RC,
+ InstrItinClass itin = IIC_iu_instr> {
def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
!strconcat(OpcStr, " $rs1, $rs2, $rd"),
- [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))]>;
+ [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))],
+ itin>;
def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
!strconcat(OpcStr, " $rs1, $shcnt, $rd"),
- [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))]>;
+ [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))],
+ itin>;
}
-class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern> {
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : InstSP<outs, ins, asmstr, pattern, itin> {
bits<5> rd;
let op = 2;
@@ -230,9 +249,9 @@ class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
class F4_1<bits<6> op3, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : F4<op3, outs, ins, asmstr, pattern> {
-
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
bit intcc;
bits<2> cc;
bits<4> cond;
@@ -243,12 +262,12 @@ class F4_1<bits<6> op3, dag outs, dag ins,
let Inst{13} = 0;
let Inst{17-14} = cond;
let Inst{18} = intcc;
-
}
class F4_2<bits<6> op3, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : F4<op3, outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
bit intcc;
bits<2> cc;
bits<4> cond;
@@ -262,8 +281,9 @@ class F4_2<bits<6> op3, dag outs, dag ins,
}
class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : F4<op3, outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
bits<4> cond;
bit intcc;
bits<2> opf_cc;
@@ -278,8 +298,9 @@ class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
}
class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : F4<op3, outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
bits <5> rs1;
bits <5> rs2;
let Inst{18-14} = rs1;
@@ -291,8 +312,9 @@ class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : F4<op3, outs, ins, asmstr, pattern> {
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F4<op3, outs, ins, asmstr, pattern, itin> {
bits<5> rs1;
bits<10> simm10;
let Inst{18-14} = rs1;
@@ -302,9 +324,10 @@ class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
}
-class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins, string asmstr,
- list<dag> pattern>: F3<outs, ins, asmstr, pattern> {
-
+class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : F3<outs, ins, asmstr, pattern, itin> {
bits<4> cond;
bits<2> cc;
@@ -317,15 +340,20 @@ class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins, string asmstr,
}
-class TRAPSPrr<bits<6> op3Val, dag outs, dag ins, string asmstr,
- list<dag> pattern>: TRAPSP<op3Val, 0, outs, ins, asmstr, pattern> {
+class TRAPSPrr<bits<6> op3Val, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : TRAPSP<op3Val, 0, outs, ins, asmstr, pattern, itin> {
bits<5> rs2;
let Inst{10-5} = 0;
let Inst{4-0} = rs2;
}
-class TRAPSPri<bits<6> op3Val, dag outs, dag ins, string asmstr,
- list<dag> pattern>: TRAPSP<op3Val, 1, outs, ins, asmstr, pattern> {
+
+class TRAPSPri<bits<6> op3Val, dag outs, dag ins,
+ string asmstr, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : TRAPSP<op3Val, 1, outs, ins, asmstr, pattern, itin> {
bits<8> imm;
let Inst{10-8} = 0;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 05006ac5772b..cfd342410550 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -41,17 +41,15 @@ SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
-unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (MI->getOpcode() == SP::LDri ||
- MI->getOpcode() == SP::LDXri ||
- MI->getOpcode() == SP::LDFri ||
- MI->getOpcode() == SP::LDDFri ||
- MI->getOpcode() == SP::LDQFri) {
- if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if (MI.getOpcode() == SP::LDri || MI.getOpcode() == SP::LDXri ||
+ MI.getOpcode() == SP::LDFri || MI.getOpcode() == SP::LDDFri ||
+ MI.getOpcode() == SP::LDQFri) {
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
}
return 0;
@@ -62,17 +60,15 @@ unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
-unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (MI->getOpcode() == SP::STri ||
- MI->getOpcode() == SP::STXri ||
- MI->getOpcode() == SP::STFri ||
- MI->getOpcode() == SP::STDFri ||
- MI->getOpcode() == SP::STQFri) {
- if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
- MI->getOperand(1).getImm() == 0) {
- FrameIndex = MI->getOperand(0).getIndex();
- return MI->getOperand(2).getReg();
+ if (MI.getOpcode() == SP::STri || MI.getOpcode() == SP::STXri ||
+ MI.getOpcode() == SP::STFri || MI.getOpcode() == SP::STDFri ||
+ MI.getOpcode() == SP::STQFri) {
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(2).getReg();
}
}
return 0;
@@ -119,6 +115,28 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
case SPCC::FCC_UE: return SPCC::FCC_LG;
case SPCC::FCC_NE: return SPCC::FCC_E;
case SPCC::FCC_E: return SPCC::FCC_NE;
+
+ case SPCC::CPCC_A: return SPCC::CPCC_N;
+ case SPCC::CPCC_N: return SPCC::CPCC_A;
+ case SPCC::CPCC_3: // Fall through
+ case SPCC::CPCC_2: // Fall through
+ case SPCC::CPCC_23: // Fall through
+ case SPCC::CPCC_1: // Fall through
+ case SPCC::CPCC_13: // Fall through
+ case SPCC::CPCC_12: // Fall through
+ case SPCC::CPCC_123: // Fall through
+ case SPCC::CPCC_0: // Fall through
+ case SPCC::CPCC_03: // Fall through
+ case SPCC::CPCC_02: // Fall through
+ case SPCC::CPCC_023: // Fall through
+ case SPCC::CPCC_01: // Fall through
+ case SPCC::CPCC_013: // Fall through
+ case SPCC::CPCC_012:
+ // "Opposite" code is not meaningful, as we don't know
+ // what the CoProc condition means here. The cond-code will
+ // only be used in inline assembler, so this code should
+ // not be reached in a normal compilation pass.
+ llvm_unreachable("Meaningless inversion of co-processor cond code");
}
llvm_unreachable("Invalid cond code");
}
@@ -139,7 +157,7 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
Target = LastInst->getOperand(0).getMBB();
}
-bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool SparcInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -148,15 +166,15 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr *LastInst = &*I;
unsigned LastOpc = LastInst->getOpcode();
// If there is only one terminator instruction, process it.
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
@@ -170,7 +188,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
+ MachineInstr *SecondLastInst = &*I;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If AllowModify is true and the block ends with two or more unconditional
@@ -180,19 +198,19 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
// Return now the only terminator is an unconditional branch.
TBB = LastInst->getOperand(0).getMBB();
return false;
} else {
- SecondLastInst = I;
+ SecondLastInst = &*I;
SecondLastOpc = SecondLastInst->getOpcode();
}
}
}
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with a B and a Bcc, handle it.
@@ -222,11 +240,11 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return true;
}
-unsigned
-SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
"Sparc branch conditions should have one component!");
@@ -282,9 +300,9 @@ bool SparcInstrInfo::ReverseBranchCondition(
}
void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
unsigned numSubRegs = 0;
unsigned movOpc = 0;
const unsigned *subRegIdx = nullptr;
@@ -469,3 +487,20 @@ unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
SparcFI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
+
+bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::LOAD_STACK_GUARD: {
+ assert(Subtarget.isTargetLinux() &&
+ "Only Linux target is expected to contain LOAD_STACK_GUARD");
+ // offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc.
+ const int64_t Offset = Subtarget.is64Bit() ? 0x28 : 0x14;
+ MI.setDesc(get(Subtarget.is64Bit() ? SP::LDXri : SP::LDri));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(SP::G7)
+ .addImm(Offset);
+ return true;
+ }
+ }
+ return false;
+}
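The expandPostRAPseudo hook added above rewrites LOAD_STACK_GUARD into an ordinary LDri/LDXri from %g7 (the SPARC thread pointer) at the glibc tcbhead_t::stack_guard offset. As a rough sketch only — the helper name and the use of GCC extended asm are assumptions, not part of this patch — the 32-bit expansion reads the same word as:

    #include <cstdint>

    // Hypothetical helper mirroring the 32-bit expansion: load the stack-guard
    // word at offset 0x14 from the SPARC thread pointer (%g7).
    static inline uint32_t readStackGuard32() {
      uint32_t Guard;
      asm("ld [%%g7+0x14], %0" : "=r"(Guard));
      return Guard;
    }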
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index 9de624cc9582..8ed97c1479ca 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -54,7 +54,7 @@ public:
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlot - If the specified machine instruction is a direct
@@ -62,26 +62,25 @@ public:
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify = false) const override ;
+ bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -97,6 +96,9 @@ public:
const TargetRegisterInfo *TRI) const override;
unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ // Lower pseudo instructions after register allocation.
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
};
}
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index ec37c22a5b33..cc55c9c8e032 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -49,6 +49,18 @@ def HasVIS3 : Predicate<"Subtarget->isVIS3()">,
// point instructions.
def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">;
+// HasLeonCASA - This is true when the target processor supports the CASA
+// instruction.
+def HasLeonCASA : Predicate<"Subtarget->hasLeonCasa()">;
+
+// HasUMAC_SMAC - This is true when the target processor supports the
+// UMAC and SMAC instructions.
+def HasUMAC_SMAC : Predicate<"Subtarget->hasUmacSmac()">;
+
+def HasNoFdivSqrtFix : Predicate<"!Subtarget->fixAllFDIVSQRT()">;
+def HasNoFmulsFix : Predicate<"!Subtarget->replaceFMULS()">;
+def HasNoFsmuldFix : Predicate<"!Subtarget->fixFSMULD()">;
+
// UseDeprecatedInsts - This predicate is true when the target processor is a
// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
// to use when appropriate. In either of these cases, the instruction selector
@@ -154,6 +166,9 @@ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDTSPtlsld :
SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+def SDTSPeh_sjlj_setjmp : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+def SDTSPeh_sjlj_longjmp: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
@@ -172,6 +187,13 @@ def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInGlue]>;
+def SPsjlj_setjmp: SDNode<"SPISD::EH_SJLJ_SETJMP",
+ SDTSPeh_sjlj_setjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
+ SDTSPeh_sjlj_longjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
// These are target-independent nodes, but have target-specific formats.
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -235,12 +257,28 @@ def FCC_UL : FCC_VAL<19>; // Unordered or Less
def FCC_LG : FCC_VAL<18>; // Less or Greater
def FCC_NE : FCC_VAL<17>; // Not Equal
def FCC_E : FCC_VAL<25>; // Equal
-def FCC_UE : FCC_VAL<24>; // Unordered or Equal
-def FCC_GE : FCC_VAL<25>; // Greater or Equal
-def FCC_UGE : FCC_VAL<26>; // Unordered or Greater or Equal
-def FCC_LE : FCC_VAL<27>; // Less or Equal
-def FCC_ULE : FCC_VAL<28>; // Unordered or Less or Equal
-def FCC_O : FCC_VAL<29>; // Ordered
+def FCC_UE : FCC_VAL<26>; // Unordered or Equal
+def FCC_GE : FCC_VAL<27>; // Greater or Equal
+def FCC_UGE : FCC_VAL<28>; // Unordered or Greater or Equal
+def FCC_LE : FCC_VAL<29>; // Less or Equal
+def FCC_ULE : FCC_VAL<30>; // Unordered or Less or Equal
+def FCC_O : FCC_VAL<31>; // Ordered
+
+class CPCC_VAL<int N> : PatLeaf<(i32 N)>;
+def CPCC_3 : CPCC_VAL<39>; // 3
+def CPCC_2 : CPCC_VAL<38>; // 2
+def CPCC_23 : CPCC_VAL<37>; // 2 or 3
+def CPCC_1 : CPCC_VAL<36>; // 1
+def CPCC_13 : CPCC_VAL<35>; // 1 or 3
+def CPCC_12 : CPCC_VAL<34>; // 1 or 2
+def CPCC_123 : CPCC_VAL<33>; // 1 or 2 or 3
+def CPCC_0 : CPCC_VAL<41>; // 0
+def CPCC_03 : CPCC_VAL<42>; // 0 or 3
+def CPCC_02 : CPCC_VAL<43>; // 0 or 2
+def CPCC_023 : CPCC_VAL<44>; // 0 or 2 or 3
+def CPCC_01 : CPCC_VAL<45>; // 0 or 1
+def CPCC_013 : CPCC_VAL<46>; // 0 or 1 or 3
+def CPCC_012 : CPCC_VAL<47>; // 0 or 1 or 2
//===----------------------------------------------------------------------===//
// Instruction Class Templates
@@ -248,53 +286,61 @@ def FCC_O : FCC_VAL<29>; // Ordered
/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode,
- RegisterClass RC, ValueType Ty, Operand immOp> {
+ RegisterClass RC, ValueType Ty, Operand immOp,
+ InstrItinClass itin = IIC_iu_instr> {
def rr : F3_1<2, Op3Val,
(outs RC:$rd), (ins RC:$rs1, RC:$rs2),
!strconcat(OpcStr, " $rs1, $rs2, $rd"),
- [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))]>;
+ [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))],
+ itin>;
def ri : F3_2<2, Op3Val,
(outs RC:$rd), (ins RC:$rs1, immOp:$simm13),
!strconcat(OpcStr, " $rs1, $simm13, $rd"),
- [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))]>;
+ [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))],
+ itin>;
}
/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
/// pattern.
-multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+multiclass F3_12np<string OpcStr, bits<6> Op3Val, InstrItinClass itin = IIC_iu_instr> {
def rr : F3_1<2, Op3Val,
(outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
- !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), [],
+ itin>;
def ri : F3_2<2, Op3Val,
(outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
- !strconcat(OpcStr, " $rs1, $simm13, $rd"), []>;
+ !strconcat(OpcStr, " $rs1, $simm13, $rd"), [],
+ itin>;
}
// Load multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
- RegisterClass RC, ValueType Ty> {
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_iu_instr> {
def rr : F3_1<3, Op3Val,
(outs RC:$dst), (ins MEMrr:$addr),
!strconcat(OpcStr, " [$addr], $dst"),
- [(set Ty:$dst, (OpNode ADDRrr:$addr))]>;
+ [(set Ty:$dst, (OpNode ADDRrr:$addr))],
+ itin>;
def ri : F3_2<3, Op3Val,
(outs RC:$dst), (ins MEMri:$addr),
!strconcat(OpcStr, " [$addr], $dst"),
- [(set Ty:$dst, (OpNode ADDRri:$addr))]>;
+ [(set Ty:$dst, (OpNode ADDRri:$addr))],
+ itin>;
}
// TODO: Instructions of the LoadASI class are currently asm only; hooking up
// CodeGen's address spaces to use these is a future task.
class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
- RegisterClass RC, ValueType Ty> :
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = NoItinerary> :
F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
!strconcat(OpcStr, "a [$addr] $asi, $dst"),
[]>;
// LoadA multiclass - As above, but also define alternate address space variant
multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
- SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
- Load<OpcStr, Op3Val, OpNode, RC, Ty> {
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = NoItinerary> :
+ Load<OpcStr, Op3Val, OpNode, RC, Ty, itin> {
def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>;
}
@@ -302,38 +348,43 @@ multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
// It is unlikely that general-purpose code could make use of it.
// CAS is preferred for sparc v9.
def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr),
- "ldstub [$addr], $dst", []>;
+ "ldstub [$addr], $dst", []>;
def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr),
- "ldstub [$addr], $dst", []>;
+ "ldstub [$addr], $dst", []>;
def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst),
(ins MEMrr:$addr, i8imm:$asi),
"ldstuba [$addr] $asi, $dst", []>;
// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
- RegisterClass RC, ValueType Ty> {
+ RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_st> {
def rr : F3_1<3, Op3Val,
(outs), (ins MEMrr:$addr, RC:$rd),
!strconcat(OpcStr, " $rd, [$addr]"),
- [(OpNode Ty:$rd, ADDRrr:$addr)]>;
+ [(OpNode Ty:$rd, ADDRrr:$addr)],
+ itin>;
def ri : F3_2<3, Op3Val,
(outs), (ins MEMri:$addr, RC:$rd),
!strconcat(OpcStr, " $rd, [$addr]"),
- [(OpNode Ty:$rd, ADDRri:$addr)]>;
+ [(OpNode Ty:$rd, ADDRri:$addr)],
+ itin>;
}
// TODO: Instructions of the StoreASI class are currently asm only; hooking up
// CodeGen's address spaces to use these is a future task.
class StoreASI<string OpcStr, bits<6> Op3Val,
- SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = IIC_st> :
F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
- !strconcat(OpcStr, "a $rd, [$addr] $asi"),
- []>;
+ !strconcat(OpcStr, "a $rd, [$addr] $asi"),
+ [],
+ itin>;
multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
- SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+ InstrItinClass itin = IIC_st> :
Store<OpcStr, Op3Val, OpNode, RC, Ty> {
- def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty>;
+ def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty, itin>;
}
//===----------------------------------------------------------------------===//
@@ -418,6 +469,27 @@ let usesCustomInserter = 1, Uses = [FCC0] in {
[(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
}
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ let Defs = [WIM] in
+ def EH_SJLJ_SETJMP32ri : Pseudo<(outs IntRegs:$dst), (ins MEMri:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set i32:$dst, (SPsjlj_setjmp ADDRri:$buf))]>,
+ Requires<[Is32Bit]>;
+ def EH_SJLJ_SETJMP32rr : Pseudo<(outs IntRegs:$dst), (ins MEMrr:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set i32:$dst, (SPsjlj_setjmp ADDRrr:$buf))]>,
+ Requires<[Is32Bit]>;
+ let isTerminator = 1 in
+ def EH_SJLJ_LONGJMP32ri : Pseudo<(outs), (ins MEMri:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(SPsjlj_longjmp ADDRri:$buf)]>,
+ Requires<[Is32Bit]>;
+ def EH_SJLJ_LONGJMP32rr : Pseudo<(outs), (ins MEMrr:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(SPsjlj_longjmp ADDRrr:$buf)]>,
+ Requires<[Is32Bit]>;
+}
+
// Section B.1 - Load Integer Instructions, p. 90
let DecoderMethod = "DecodeLoadInt" in {
defm LDSB : LoadA<"ldsb", 0b001001, 0b011001, sextloadi8, IntRegs, i32>;
@@ -428,16 +500,16 @@ let DecoderMethod = "DecodeLoadInt" in {
}
let DecoderMethod = "DecodeLoadIntPair" in
- defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>;
+ defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32, IIC_ldd>;
// Section B.2 - Load Floating-point Instructions, p. 92
let DecoderMethod = "DecodeLoadFP" in {
- defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>;
- def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32>,
+ defm LDF : Load<"ld", 0b100000, load, FPRegs, f32, IIC_iu_or_fpu_instr>;
+ def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32, IIC_iu_or_fpu_instr>,
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeLoadDFP" in {
- defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>;
+ defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64, IIC_ldd>;
def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>,
Requires<[HasV9]>;
}
@@ -445,13 +517,27 @@ let DecoderMethod = "DecodeLoadQFP" in
defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
+let DecoderMethod = "DecodeLoadCP" in
+ defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>;
+let DecoderMethod = "DecodeLoadCPPair" in
+ defm LDDC : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>;
+
+let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in {
+ let rd = 0 in {
+ def LDCSRrr : F3_1<3, 0b110001, (outs), (ins MEMrr:$addr),
+ "ld [$addr], %csr", []>;
+ def LDCSRri : F3_2<3, 0b110001, (outs), (ins MEMri:$addr),
+ "ld [$addr], %csr", []>;
+ }
+}
+
let DecoderMethod = "DecodeLoadFP" in
let Defs = [FSR] in {
let rd = 0 in {
def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
- "ld [$addr], %fsr", []>;
+ "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
- "ld [$addr], %fsr", []>;
+ "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
}
let rd = 1 in {
def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
@@ -469,7 +555,7 @@ let DecoderMethod = "DecodeStoreInt" in {
}
let DecoderMethod = "DecodeStoreIntPair" in
- defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>;
+ defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32, IIC_std>;
// Section B.5 - Store Floating-point Instructions, p. 97
let DecoderMethod = "DecodeStoreFP" in {
@@ -478,7 +564,7 @@ let DecoderMethod = "DecodeStoreFP" in {
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeStoreDFP" in {
- defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>;
+ defm STDF : Store<"std", 0b100111, store, DFPRegs, f64, IIC_std>;
def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>,
Requires<[HasV9]>;
}
@@ -486,21 +572,49 @@ let DecoderMethod = "DecodeStoreQFP" in
defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
-let DecoderMethod = "DecodeStoreFP" in
- let Defs = [FSR] in {
- let rd = 0 in {
+let DecoderMethod = "DecodeStoreCP" in
+ defm STC : Store<"st", 0b110100, store, CoprocRegs, i32>;
+
+let DecoderMethod = "DecodeStoreCPPair" in
+ defm STDC : Store<"std", 0b110111, store, CoprocPair, v2i32, IIC_std>;
+
+let DecoderMethod = "DecodeStoreCP", rd = 0 in {
+ let Defs = [CPSR] in {
+ def STCSRrr : F3_1<3, 0b110101, (outs MEMrr:$addr), (ins),
+ "st %csr, [$addr]", [], IIC_st>;
+ def STCSRri : F3_2<3, 0b110101, (outs MEMri:$addr), (ins),
+ "st %csr, [$addr]", [], IIC_st>;
+ }
+ let Defs = [CPQ] in {
+ def STDCQrr : F3_1<3, 0b110110, (outs MEMrr:$addr), (ins),
+ "std %cq, [$addr]", [], IIC_std>;
+ def STDCQri : F3_2<3, 0b110110, (outs MEMri:$addr), (ins),
+ "std %cq, [$addr]", [], IIC_std>;
+ }
+}
+
+let DecoderMethod = "DecodeStoreFP" in {
+ let rd = 0 in {
+ let Defs = [FSR] in {
def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
- "st %fsr, [$addr]", []>;
+ "st %fsr, [$addr]", [], IIC_st>;
def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
- "st %fsr, [$addr]", []>;
+ "st %fsr, [$addr]", [], IIC_st>;
}
- let rd = 1 in {
- def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
- "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
- def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
- "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ let Defs = [FQ] in {
+ def STDFQrr : F3_1<3, 0b100110, (outs MEMrr:$addr), (ins),
+ "std %fq, [$addr]", [], IIC_std>;
+ def STDFQri : F3_2<3, 0b100110, (outs MEMri:$addr), (ins),
+ "std %fq, [$addr]", [], IIC_std>;
}
}
+ let rd = 1, Defs = [FSR] in {
+ def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ }
+}
// Section B.8 - SWAP Register with Memory Instruction
// (Atomic swap)
@@ -524,7 +638,8 @@ let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
def SETHIi: F2_1<0b100,
(outs IntRegs:$rd), (ins i32imm:$imm22),
"sethi $imm22, $rd",
- [(set i32:$rd, SETHIimm:$imm22)]>;
+ [(set i32:$rd, SETHIimm:$imm22)],
+ IIC_iu_instr>;
// Section B.10 - NOP Instruction, p. 105
// (It's a special case of SETHI)
@@ -619,13 +734,13 @@ let Defs = [ICC], rd = 0 in {
// Section B.18 - Multiply Instructions, p. 113
let Defs = [Y] in {
- defm UMUL : F3_12np<"umul", 0b001010>;
- defm SMUL : F3_12 <"smul", 0b001011, mul, IntRegs, i32, simm13Op>;
+ defm UMUL : F3_12np<"umul", 0b001010, IIC_iu_umul>;
+ defm SMUL : F3_12 <"smul", 0b001011, mul, IntRegs, i32, simm13Op, IIC_iu_smul>;
}
let Defs = [Y, ICC] in {
- defm UMULCC : F3_12np<"umulcc", 0b011010>;
- defm SMULCC : F3_12np<"smulcc", 0b011011>;
+ defm UMULCC : F3_12np<"umulcc", 0b011010, IIC_iu_umul>;
+ defm SMULCC : F3_12np<"smulcc", 0b011011, IIC_iu_smul>;
}
let Defs = [Y, ICC], Uses = [Y, ICC] in {
@@ -634,13 +749,13 @@ let Defs = [Y, ICC], Uses = [Y, ICC] in {
// Section B.19 - Divide Instructions, p. 115
let Uses = [Y], Defs = [Y] in {
- defm UDIV : F3_12np<"udiv", 0b001110>;
- defm SDIV : F3_12np<"sdiv", 0b001111>;
+ defm UDIV : F3_12np<"udiv", 0b001110, IIC_iu_div>;
+ defm SDIV : F3_12np<"sdiv", 0b001111, IIC_iu_div>;
}
let Uses = [Y], Defs = [Y, ICC] in {
- defm UDIVCC : F3_12np<"udivcc", 0b011110>;
- defm SDIVCC : F3_12np<"sdivcc", 0b011111>;
+ defm UDIVCC : F3_12np<"udivcc", 0b011110, IIC_iu_div>;
+ defm SDIVCC : F3_12np<"sdivcc", 0b011111, IIC_iu_div>;
}
// Section B.20 - SAVE and RESTORE, p. 117
@@ -666,26 +781,30 @@ let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
// conditional branch class:
class BranchSP<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b010, 0, (outs), ins, asmstr, pattern>;
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern, IIC_iu_instr>;
// conditional branch with annul class:
class BranchSPA<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b010, 1, (outs), ins, asmstr, pattern>;
+ : F2_2<0b010, 1, (outs), ins, asmstr, pattern, IIC_iu_instr>;
// Conditional branch class on %icc|%xcc with predication:
multiclass IPredBranch<string regstr, list<dag> CCPattern> {
def CC : F2_3<0b001, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
- !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
- CCPattern>;
+ !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
+ CCPattern,
+ IIC_iu_instr>;
def CCA : F2_3<0b001, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
- !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
- []>;
+ !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
+ [],
+ IIC_iu_instr>;
def CCNT : F2_3<0b001, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
!strconcat("b$cond,pn ", !strconcat(regstr, ", $imm19")),
- []>;
+ [],
+ IIC_iu_instr>;
def CCANT : F2_3<0b001, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
!strconcat("b$cond,a,pn ", !strconcat(regstr, ", $imm19")),
- []>;
+ [],
+ IIC_iu_instr>;
}
} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
@@ -721,26 +840,26 @@ let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
// floating-point conditional branch class:
class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b110, 0, (outs), ins, asmstr, pattern>;
+ : F2_2<0b110, 0, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
// floating-point conditional branch with annul class:
class FPBranchSPA<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b110, 1, (outs), ins, asmstr, pattern>;
+ : F2_2<0b110, 1, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
// Conditional branch class on %fcc0-%fcc3 with predication:
multiclass FPredBranch {
def CC : F2_3<0b101, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
FCCRegs:$cc),
- "fb$cond $cc, $imm19", []>;
+ "fb$cond $cc, $imm19", [], IIC_fpu_normal_instr>;
def CCA : F2_3<0b101, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
FCCRegs:$cc),
- "fb$cond,a $cc, $imm19", []>;
+ "fb$cond,a $cc, $imm19", [], IIC_fpu_normal_instr>;
def CCNT : F2_3<0b101, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
FCCRegs:$cc),
- "fb$cond,pn $cc, $imm19", []>;
+ "fb$cond,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
def CCANT : F2_3<0b101, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
FCCRegs:$cc),
- "fb$cond,a,pn $cc, $imm19", []>;
+ "fb$cond,a,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
}
} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
@@ -755,13 +874,33 @@ let Uses = [FCC0] in {
let Predicates = [HasV9] in
defm BPF : FPredBranch;
+// Section B.22 - Branch on Co-processor Condition Codes Instructions, p. 123
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// co-processor conditional branch class:
+class CPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 0, (outs), ins, asmstr, pattern>;
+// co-processor conditional branch with annul class:
+class CPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 1, (outs), ins, asmstr, pattern>;
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+def CBCOND : CPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "cb$cond $imm22",
+ [(SPbrfcc bb:$imm22, imm:$cond)]>;
+def CBCONDA : CPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "cb$cond,a $imm22", []>;
+
// Section B.24 - Call and Link Instruction, p. 125
// This is the only Format 1 instruction
let Uses = [O6],
hasDelaySlot = 1, isCall = 1 in {
def CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops),
- "call $disp", []> {
+ "call $disp",
+ [],
+ IIC_jmp_or_call> {
bits<30> disp;
let op = 1;
let Inst{29-0} = disp;
@@ -772,11 +911,13 @@ let Uses = [O6],
def CALLrr : F3_1<2, 0b111000,
(outs), (ins MEMrr:$ptr, variable_ops),
"call $ptr",
- [(call ADDRrr:$ptr)]>;
+ [(call ADDRrr:$ptr)],
+ IIC_jmp_or_call>;
def CALLri : F3_2<2, 0b111000,
(outs), (ins MEMri:$ptr, variable_ops),
"call $ptr",
- [(call ADDRri:$ptr)]>;
+ [(call ADDRri:$ptr)],
+ IIC_jmp_or_call>;
}
}
@@ -785,10 +926,16 @@ let Uses = [O6],
// JMPL Instruction.
let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
DecoderMethod = "DecodeJMPL" in {
- def JMPLrr: F3_1<2, 0b111000, (outs IntRegs:$dst), (ins MEMrr:$addr),
- "jmpl $addr, $dst", []>;
- def JMPLri: F3_2<2, 0b111000, (outs IntRegs:$dst), (ins MEMri:$addr),
- "jmpl $addr, $dst", []>;
+ def JMPLrr: F3_1<2, 0b111000,
+ (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "jmpl $addr, $dst",
+ [],
+ IIC_jmp_or_call>;
+ def JMPLri: F3_2<2, 0b111000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "jmpl $addr, $dst",
+ [],
+ IIC_jmp_or_call>;
}
// Section A.3 - Synthetic Instructions, p. 85
@@ -796,37 +943,65 @@ let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
isCodeGenOnly = 1 in {
let rd = 0, rs1 = 15 in
- def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
- "jmp %o7+$val", [(retflag simm13:$val)]>;
+ def RETL: F3_2<2, 0b111000,
+ (outs), (ins i32imm:$val),
+ "jmp %o7+$val",
+ [(retflag simm13:$val)],
+ IIC_jmp_or_call>;
let rd = 0, rs1 = 31 in
- def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
- "jmp %i7+$val", []>;
+ def RET: F3_2<2, 0b111000,
+ (outs), (ins i32imm:$val),
+ "jmp %i7+$val",
+ [],
+ IIC_jmp_or_call>;
}
// Section B.26 - Return from Trap Instruction
let isReturn = 1, isTerminator = 1, hasDelaySlot = 1,
isBarrier = 1, rd = 0, DecoderMethod = "DecodeReturn" in {
- def RETTrr : F3_1<2, 0b111001, (outs), (ins MEMrr:$addr),
- "rett $addr", []>;
- def RETTri : F3_2<2, 0b111001, (outs), (ins MEMri:$addr),
- "rett $addr", []>;
+ def RETTrr : F3_1<2, 0b111001,
+ (outs), (ins MEMrr:$addr),
+ "rett $addr",
+ [],
+ IIC_jmp_or_call>;
+ def RETTri : F3_2<2, 0b111001,
+ (outs), (ins MEMri:$addr),
+ "rett $addr",
+ [],
+ IIC_jmp_or_call>;
}
// Section B.27 - Trap on Integer Condition Codes Instruction
+// V8 trap instructions:
+let DecoderNamespace = "SparcV8", DecoderMethod = "DecodeTRAP", hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+{
+ def TRAPrr : TRAPSPrr<0b111010,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+ "t$cond $rs1 + $rs2",
+ []>;
+ def TRAPri : TRAPSPri<0b111010,
+ (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+ "t$cond $rs1 + $imm",
+ []>;
+}
+
multiclass TRAP<string regStr> {
- def rr : TRAPSPrr<0b111010, (outs), (ins IntRegs:$rs1, IntRegs:$rs2,
- CCOp:$cond),
- !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"), []>;
- def ri : TRAPSPri<0b111010, (outs), (ins IntRegs:$rs1, i32imm:$imm,
- CCOp:$cond),
- !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"), []>;
+ def rr : TRAPSPrr<0b111010,
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"),
+ []>;
+ def ri : TRAPSPri<0b111010,
+ (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"),
+ []>;
}
-let hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+let DecoderNamespace = "SparcV9", DecoderMethod = "DecodeTRAP", Predicates = [HasV9], hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
defm TICC : TRAP<"%icc">;
+
let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
@@ -922,11 +1097,13 @@ let rd = 0 in {
def FITOS : F3_3u<2, 0b110100, 0b011000100,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fitos $rs2, $rd",
- [(set FPRegs:$rd, (SPitof FPRegs:$rs2))]>;
+ [(set FPRegs:$rd, (SPitof FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
def FITOD : F3_3u<2, 0b110100, 0b011001000,
(outs DFPRegs:$rd), (ins FPRegs:$rs2),
"fitod $rs2, $rd",
- [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))]>;
+ [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
def FITOQ : F3_3u<2, 0b110100, 0b011001100,
(outs QFPRegs:$rd), (ins FPRegs:$rs2),
"fitoq $rs2, $rd",
@@ -937,11 +1114,13 @@ def FITOQ : F3_3u<2, 0b110100, 0b011001100,
def FSTOI : F3_3u<2, 0b110100, 0b011010001,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fstoi $rs2, $rd",
- [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))]>;
+ [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
def FDTOI : F3_3u<2, 0b110100, 0b011010010,
(outs FPRegs:$rd), (ins DFPRegs:$rs2),
"fdtoi $rs2, $rd",
- [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))]>;
+ [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))],
+ IIC_fpu_fast_instr>;
def FQTOI : F3_3u<2, 0b110100, 0b011010011,
(outs FPRegs:$rd), (ins QFPRegs:$rs2),
"fqtoi $rs2, $rd",
@@ -952,7 +1131,8 @@ def FQTOI : F3_3u<2, 0b110100, 0b011010011,
def FSTOD : F3_3u<2, 0b110100, 0b011001001,
(outs DFPRegs:$rd), (ins FPRegs:$rs2),
"fstod $rs2, $rd",
- [(set f64:$rd, (fextend f32:$rs2))]>;
+ [(set f64:$rd, (fextend f32:$rs2))],
+ IIC_fpu_stod>;
def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
(outs QFPRegs:$rd), (ins FPRegs:$rs2),
"fstoq $rs2, $rd",
@@ -961,7 +1141,8 @@ def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
def FDTOS : F3_3u<2, 0b110100, 0b011000110,
(outs FPRegs:$rd), (ins DFPRegs:$rs2),
"fdtos $rs2, $rd",
- [(set f32:$rd, (fround f64:$rs2))]>;
+ [(set f32:$rd, (fround f64:$rs2))],
+ IIC_fpu_fast_instr>;
def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
(outs QFPRegs:$rd), (ins DFPRegs:$rs2),
"fdtoq $rs2, $rd",
@@ -985,22 +1166,29 @@ def FMOVS : F3_3u<2, 0b110100, 0b000000001,
def FNEGS : F3_3u<2, 0b110100, 0b000000101,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fnegs $rs2, $rd",
- [(set f32:$rd, (fneg f32:$rs2))]>;
+ [(set f32:$rd, (fneg f32:$rs2))],
+ IIC_fpu_negs>;
def FABSS : F3_3u<2, 0b110100, 0b000001001,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fabss $rs2, $rd",
- [(set f32:$rd, (fabs f32:$rs2))]>;
+ [(set f32:$rd, (fabs f32:$rs2))],
+ IIC_fpu_abs>;
// Floating-point Square Root Instructions, p.145
+// FSQRTS triggers an erratum on some LEON processors; when it is disabled,
+// single-precision square roots are promoted to FSQRTD on doubles instead.
+let Predicates = [HasNoFdivSqrtFix] in
def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fsqrts $rs2, $rd",
- [(set f32:$rd, (fsqrt f32:$rs2))]>;
+ [(set f32:$rd, (fsqrt f32:$rs2))],
+ IIC_fpu_sqrts>;
def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
(outs DFPRegs:$rd), (ins DFPRegs:$rs2),
"fsqrtd $rs2, $rd",
- [(set f64:$rd, (fsqrt f64:$rs2))]>;
+ [(set f64:$rd, (fsqrt f64:$rs2))],
+ IIC_fpu_sqrtd>;
def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
(outs QFPRegs:$rd), (ins QFPRegs:$rs2),
"fsqrtq $rs2, $rd",
@@ -1013,11 +1201,13 @@ def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
def FADDS : F3_3<2, 0b110100, 0b001000001,
(outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
"fadds $rs1, $rs2, $rd",
- [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))]>;
+ [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))],
+ IIC_fpu_fast_instr>;
def FADDD : F3_3<2, 0b110100, 0b001000010,
(outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"faddd $rs1, $rs2, $rd",
- [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))]>;
+ [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))],
+ IIC_fpu_fast_instr>;
def FADDQ : F3_3<2, 0b110100, 0b001000011,
(outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
"faddq $rs1, $rs2, $rd",
@@ -1027,11 +1217,13 @@ def FADDQ : F3_3<2, 0b110100, 0b001000011,
def FSUBS : F3_3<2, 0b110100, 0b001000101,
(outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
"fsubs $rs1, $rs2, $rd",
- [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))]>;
+ [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))],
+ IIC_fpu_fast_instr>;
def FSUBD : F3_3<2, 0b110100, 0b001000110,
(outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"fsubd $rs1, $rs2, $rd",
- [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))]>;
+ [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))],
+ IIC_fpu_fast_instr>;
def FSUBQ : F3_3<2, 0b110100, 0b001000111,
(outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
"fsubq $rs1, $rs2, $rd",
@@ -1040,25 +1232,32 @@ def FSUBQ : F3_3<2, 0b110100, 0b001000111,
// Floating-point Multiply and Divide Instructions, p. 147
+// FMULS triggers an erratum on some LEON processors; when it is disabled,
+// single-precision multiplies are promoted to FMULD on doubles instead.
+let Predicates = [HasNoFmulsFix] in
def FMULS : F3_3<2, 0b110100, 0b001001001,
(outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
"fmuls $rs1, $rs2, $rd",
- [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))]>;
+ [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))],
+ IIC_fpu_muls>;
def FMULD : F3_3<2, 0b110100, 0b001001010,
(outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"fmuld $rs1, $rs2, $rd",
- [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))]>;
+ [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))],
+ IIC_fpu_muld>;
def FMULQ : F3_3<2, 0b110100, 0b001001011,
(outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
"fmulq $rs1, $rs2, $rd",
[(set f128:$rd, (fmul f128:$rs1, f128:$rs2))]>,
Requires<[HasHardQuad]>;
+let Predicates = [HasNoFsmuldFix] in
def FSMULD : F3_3<2, 0b110100, 0b001101001,
(outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
"fsmuld $rs1, $rs2, $rd",
[(set f64:$rd, (fmul (fextend f32:$rs1),
- (fextend f32:$rs2)))]>;
+ (fextend f32:$rs2)))],
+ IIC_fpu_muld>;
def FDMULQ : F3_3<2, 0b110100, 0b001101110,
(outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"fdmulq $rs1, $rs2, $rd",
@@ -1066,14 +1265,18 @@ def FDMULQ : F3_3<2, 0b110100, 0b001101110,
(fextend f64:$rs2)))]>,
Requires<[HasHardQuad]>;
+// FDIVS triggers an erratum on some LEON processors; when it is disabled,
+// single-precision divides are promoted to FDIVD on doubles instead.
def FDIVS : F3_3<2, 0b110100, 0b001001101,
(outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
"fdivs $rs1, $rs2, $rd",
- [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))]>;
+ [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))],
+ IIC_fpu_divs>;
def FDIVD : F3_3<2, 0b110100, 0b001001110,
(outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"fdivd $rs1, $rs2, $rd",
- [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))]>;
+ [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))],
+ IIC_fpu_divd>;
def FDIVQ : F3_3<2, 0b110100, 0b001001111,
(outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
"fdivq $rs1, $rs2, $rd",
@@ -1091,11 +1294,13 @@ let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in {
def FCMPS : F3_3c<2, 0b110101, 0b001010001,
(outs), (ins FPRegs:$rs1, FPRegs:$rs2),
"fcmps $rs1, $rs2",
- [(SPcmpfcc f32:$rs1, f32:$rs2)]>;
+ [(SPcmpfcc f32:$rs1, f32:$rs2)],
+ IIC_fpu_fast_instr>;
def FCMPD : F3_3c<2, 0b110101, 0b001010010,
(outs), (ins DFPRegs:$rs1, DFPRegs:$rs2),
"fcmpd $rs1, $rs2",
- [(SPcmpfcc f64:$rs1, f64:$rs2)]>;
+ [(SPcmpfcc f64:$rs1, f64:$rs2)],
+ IIC_fpu_fast_instr>;
def FCMPQ : F3_3c<2, 0b110101, 0b001010011,
(outs), (ins QFPRegs:$rs1, QFPRegs:$rs2),
"fcmpq $rs1, $rs2",
@@ -1125,7 +1330,8 @@ let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
def TLS_CALL : InstSP<(outs),
(ins calltarget:$disp, TLSSym:$sym, variable_ops),
"call $disp, $sym",
- [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)]> {
+ [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)],
+ IIC_jmp_or_call> {
bits<30> disp;
let op = 1;
let Inst{29-0} = disp;
@@ -1303,19 +1509,60 @@ let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
"membar $simm13", []>;
-// TODO: Should add a CASArr variant. In fact, the CAS instruction,
-// unlike other instructions, only comes in a form which requires an
-// ASI be provided. The ASI value hardcoded here is ASI_PRIMARY, the
-// default unprivileged ASI for SparcV9. (Also of note: some modern
-// SparcV8 implementations provide CASA as an extension, but require
-// the use of SparcV8's default ASI, 0xA ("User Data") instead.)
+// The CAS instruction, unlike other instructions, only comes in a
+// form which requires an ASI be provided. The ASI value hardcoded
+// here is ASI_PRIMARY, the default unprivileged ASI for SparcV9.
let Predicates = [HasV9], Constraints = "$swap = $rd", asi = 0b10000000 in
def CASrr: F3_1_asi<3, 0b111100,
(outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
IntRegs:$swap),
"cas [$rs1], $rs2, $rd",
[(set i32:$rd,
- (atomic_cmp_swap iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+ (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+
+// CASA is supported as an instruction on some LEON3 and all LEON4 processors.
+// This version is selected automatically from C code and hardcodes ASI 10.
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd", asi = 0b00001010 in
+ def CASAasi10: F3_1_asi<3, 0b111100,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap),
+ "casa [$rs1] 10, $rs2, $rd",
+ [(set i32:$rd,
+ (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+// CASA is supported on some LEON3 and all LEON4 processors. Same pattern as
+// CASrr above, but the ASI is an explicit operand. This version is intended
+// for inline assembly use only.
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd" in
+ def CASArr: F3_1_asi<3, 0b111100,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap, i8imm:$asi),
+ "casa [$rs1] $asi, $rs2, $rd", []>;
+
+// TODO: Add DAG patterns to select these instructions. They are currently
+// available through inline assembly only.
+let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
+ def SMACrr : F3_1<2, 0b111111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ "smac $rs1, $rs2, $rd",
+ [], IIC_smac_umac>;
+
+ def SMACri : F3_2<2, 0b111111,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ "smac $rs1, $simm13, $rd",
+ [], IIC_smac_umac>;
+
+ def UMACrr : F3_1<2, 0b111110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ "umac $rs1, $rs2, $rd",
+ [], IIC_smac_umac>;
+
+ def UMACri : F3_2<2, 0b111110,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ "umac $rs1, $simm13, $rd",
+ [], IIC_smac_umac>;
+}
let Defs = [ICC] in {
defm TADDCC : F3_12np<"taddcc", 0b100000>;
@@ -1411,13 +1658,21 @@ def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
let Predicates = [HasNoV9] in
def : Pat<(atomic_fence imm, imm), (STBAR)>;
-// atomic_load_32 addr -> load addr
-def : Pat<(i32 (atomic_load ADDRrr:$src)), (LDrr ADDRrr:$src)>;
-def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>;
-
-// atomic_store_32 val, addr -> store val, addr
-def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
-def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+// atomic_load addr -> load addr
+def : Pat<(i32 (atomic_load_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRrr:$src)), (LDrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRri:$src)), (LDri ADDRri:$src)>;
+
+// atomic_store val, addr -> store val, addr
+def : Pat<(atomic_store_8 ADDRrr:$dst, i32:$val), (STBrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_8 ADDRri:$dst, i32:$val), (STBri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRrr:$dst, i32:$val), (STHrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRri:$dst, i32:$val), (STHri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
// extract_vector
def : Pat<(extractelt (v2i32 IntPair:$Rn), 0),
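The patterns above widen atomic support: 8- and 16-bit atomic loads and stores now select LDUB/LDUH and STB/STH directly, and a 32-bit compare-and-swap selects CAS on V9 or CASAasi10 on a LEON with CASA, while CASArr and UMAC/SMAC stay inline-assembly only. A minimal sketch of source that should exercise these patterns, assuming a GCC/Clang toolchain targeting SPARC (the function names are illustrative, not part of the patch):

    #include <cstdint>

    // Sub-word atomics: expected to match the atomic_load_8/16 and
    // atomic_store_8/16 patterns above (ldub/lduh and stb/sth).
    uint8_t loadFlag(volatile uint8_t *P) {
      return __atomic_load_n(P, __ATOMIC_RELAXED);
    }
    void storeFlag(volatile uint8_t *P, uint8_t V) {
      __atomic_store_n(P, V, __ATOMIC_RELAXED);
    }

    // 32-bit compare-and-swap: expected to match atomic_cmp_swap_32, i.e.
    // CAS on SparcV9 or CASA with ASI 10 on a LEON that has HasLeonCASA.
    bool tryLock(volatile int32_t *Lock) {
      int32_t Expected = 0;
      return __atomic_compare_exchange_n(Lock, &Expected, 1, /*weak=*/false,
                                         __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
    }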
diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp
index b084d0021ba0..a3cedcbf9dd1 100644
--- a/lib/Target/Sparc/SparcMCInstLower.cpp
+++ b/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -14,7 +14,6 @@
#include "Sparc.h"
#include "MCTargetDesc/SparcMCExpr.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index da31783ba248..37a1fdf4d770 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -105,13 +105,9 @@ SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
}
-static void replaceFI(MachineFunction &MF,
- MachineBasicBlock::iterator II,
- MachineInstr &MI,
- DebugLoc dl,
- unsigned FIOperandNum, int Offset,
- unsigned FramePtr)
-{
+static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
+ MachineInstr &MI, const DebugLoc &dl,
+ unsigned FIOperandNum, int Offset, unsigned FramePtr) {
// Replace frame index with a frame pointer reference.
if (Offset >= -4096 && Offset <= 4095) {
// If the offset is small enough to fit in the immediate field, directly
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 32075b1df410..2ac51263957e 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -39,9 +39,6 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- void processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS = nullptr) const;
-
unsigned getFrameRegister(const MachineFunction &MF) const override;
bool canRealignStack(const MachineFunction &MF) const override;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index cca9463562a4..d1ef3b19dca7 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -62,6 +62,12 @@ foreach I = 0-3 in
def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register.
+def FQ : SparcCtrlReg<0, "FQ">; // Floating-point deferred-trap queue.
+
+def CPSR : SparcCtrlReg<0, "CPSR">; // Co-processor state register.
+
+def CPQ : SparcCtrlReg<0, "CPQ">; // Co-processor queue.
+
// Y register
def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
// Ancillary state registers (implementation defined)
@@ -204,6 +210,40 @@ def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+// Co-processor registers
+def C0 : Ri< 0, "C0">;
+def C1 : Ri< 1, "C1">;
+def C2 : Ri< 2, "C2">;
+def C3 : Ri< 3, "C3">;
+def C4 : Ri< 4, "C4">;
+def C5 : Ri< 5, "C5">;
+def C6 : Ri< 6, "C6">;
+def C7 : Ri< 7, "C7">;
+def C8 : Ri< 8, "C8">;
+def C9 : Ri< 9, "C9">;
+def C10 : Ri< 10, "C10">;
+def C11 : Ri< 11, "C11">;
+def C12 : Ri< 12, "C12">;
+def C13 : Ri< 13, "C13">;
+def C14 : Ri< 14, "C14">;
+def C15 : Ri< 15, "C15">;
+def C16 : Ri< 16, "C16">;
+def C17 : Ri< 17, "C17">;
+def C18 : Ri< 18, "C18">;
+def C19 : Ri< 19, "C19">;
+def C20 : Ri< 20, "C20">;
+def C21 : Ri< 21, "C21">;
+def C22 : Ri< 22, "C22">;
+def C23 : Ri< 23, "C23">;
+def C24 : Ri< 24, "C24">;
+def C25 : Ri< 25, "C25">;
+def C26 : Ri< 26, "C26">;
+def C27 : Ri< 27, "C27">;
+def C28 : Ri< 28, "C28">;
+def C29 : Ri< 29, "C29">;
+def C30 : Ri< 30, "C30">;
+def C31 : Ri< 31, "C31">;
+
// Unaliased double precision floating point registers.
// FIXME: Define DwarfRegNum for these registers.
def D16 : SparcReg< 1, "F32">;
@@ -259,6 +299,24 @@ def I2_I3 : Rdi<26, "I2", [I2, I3]>;
def I4_I5 : Rdi<28, "I4", [I4, I5]>;
def I6_I7 : Rdi<30, "I6", [I6, I7]>;
+// Aliases of the co-processor registers used for LDD/STD double-word operations
+def C0_C1 : Rdi<0, "C0", [C0, C1]>;
+def C2_C3 : Rdi<2, "C2", [C2, C3]>;
+def C4_C5 : Rdi<4, "C4", [C4, C5]>;
+def C6_C7 : Rdi<6, "C6", [C6, C7]>;
+def C8_C9 : Rdi<8, "C8", [C8, C9]>;
+def C10_C11 : Rdi<10, "C10", [C10, C11]>;
+def C12_C13 : Rdi<12, "C12", [C12, C13]>;
+def C14_C15 : Rdi<14, "C14", [C14, C15]>;
+def C16_C17 : Rdi<16, "C16", [C16, C17]>;
+def C18_C19 : Rdi<18, "C18", [C18, C19]>;
+def C20_C21 : Rdi<20, "C20", [C20, C21]>;
+def C22_C23 : Rdi<22, "C22", [C22, C23]>;
+def C24_C25 : Rdi<24, "C24", [C24, C25]>;
+def C26_C27 : Rdi<26, "C26", [C26, C27]>;
+def C28_C29 : Rdi<28, "C28", [C28, C29]>;
+def C30_C31 : Rdi<30, "C30", [C30, C31]>;
+
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
@@ -273,6 +331,7 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32,
(sequence "L%u", 0, 7),
(sequence "O%u", 0, 7))>;
+
// Should be in the same order as IntRegs.
def IntPair : RegisterClass<"SP", [v2i32], 64,
(add I0_I1, I2_I3, I4_I5, I6_I7,
@@ -296,10 +355,21 @@ def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
-// Ancillary state registers
-def ASRRegs : RegisterClass<"SP", [i32], 32,
- (add Y, (sequence "ASR%u", 1, 31))> {
- let isAllocatable = 0;
+let isAllocatable = 0 in {
+ // Ancillary state registers
+ def ASRRegs : RegisterClass<"SP", [i32], 32,
+ (add Y, (sequence "ASR%u", 1, 31))>;
+
+ // This register class should not be used to hold i64 values.
+ def CoprocRegs : RegisterClass<"SP", [i32], 32,
+ (add (sequence "C%u", 0, 31))>;
+
+ // Should be in the same order as CoprocRegs.
+ def CoprocPair : RegisterClass<"SP", [v2i32], 64,
+ (add C0_C1, C2_C3, C4_C5, C6_C7,
+ C8_C9, C10_C11, C12_C13, C14_C15,
+ C16_C17, C18_C19, C20_C21, C22_C23,
+ C24_C25, C26_C27, C28_C29, C30_C31)>;
}
// Privileged Registers
diff --git a/lib/Target/Sparc/SparcSchedule.td b/lib/Target/Sparc/SparcSchedule.td
new file mode 100755
index 000000000000..f243546b029b
--- /dev/null
+++ b/lib/Target/Sparc/SparcSchedule.td
@@ -0,0 +1,124 @@
+//===-- SparcSchedule.td - Describe the Sparc Itineraries --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+def IIC_iu_or_fpu_instr : InstrItinClass;
+def IIC_iu_instr : InstrItinClass;
+def IIC_fpu_normal_instr : InstrItinClass;
+def IIC_fpu_fast_instr : InstrItinClass;
+def IIC_jmp_or_call : InstrItinClass;
+def IIC_ldd : InstrItinClass;
+def IIC_st : InstrItinClass;
+def IIC_std : InstrItinClass;
+def IIC_iu_smul : InstrItinClass;
+def IIC_iu_umul : InstrItinClass;
+def IIC_iu_div : InstrItinClass;
+def IIC_ticc : InstrItinClass;
+def IIC_ldstub : InstrItinClass;
+def IIC_fpu_muls : InstrItinClass;
+def IIC_fpu_muld : InstrItinClass;
+def IIC_fpu_divs : InstrItinClass;
+def IIC_fpu_divd : InstrItinClass;
+def IIC_fpu_sqrts : InstrItinClass;
+def IIC_fpu_sqrtd : InstrItinClass;
+def IIC_fpu_abs : InstrItinClass;
+def IIC_fpu_movs : InstrItinClass;
+def IIC_fpu_negs : InstrItinClass;
+def IIC_smac_umac : InstrItinClass;
+def IIC_fpu_stod : InstrItinClass;
+
+def LEONIU : FuncUnit; // integer unit
+def LEONFPU : FuncUnit; // floating-point unit
+
+// Ref: http://www.atmel.com/Images/doc4226.pdf
+
+def LEON2Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [21, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [20, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [36, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [37, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [65, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [2, 1]>
+]>;
+
+def LEON3Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
+
+def LEON4Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+ InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+ InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+ InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+ InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+ InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+ InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+ InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+ InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+ InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+ InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+ InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+ InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+ InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+ InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+ InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
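A minimal sketch (not part of this patch) of how a consumer reads these latencies once the itineraries are attached to a LEON SparcSubtarget; MI and STI are hypothetical arguments, and getOperandCycle(..., 0) returns the def-operand latency, i.e. the first value in each list above (e.g. 16 cycles for IIC_fpu_muls on LEON2 versus 4 on LEON3/LEON4):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetSubtargetInfo.h"

static int getDefOperandLatency(const llvm::MachineInstr &MI,
                                const llvm::TargetSubtargetInfo &STI) {
  const llvm::InstrItineraryData *Itins = STI.getInstrItineraryData();
  if (!Itins || Itins->isEmpty())
    return -1; // This CPU has no itinerary model.
  // The MCInstrDesc scheduling class indexes the IIC_* itinerary classes.
  return Itins->getOperandCycle(MI.getDesc().getSchedClass(), 0);
}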
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index d701594d27af..a6a4dc54faed 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -29,10 +29,27 @@ void SparcSubtarget::anchor() { }
SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
IsV9 = false;
+ IsLeon = false;
V8DeprecatedInsts = false;
IsVIS = false;
HasHardQuad = false;
UsePopc = false;
+ UseSoftFloat = false;
+
+ // Leon features
+ HasLeonCasa = false;
+ HasUmacSmac = false;
+ PerformSDIVReplace = false;
+ FixCallImmediates = false;
+ IgnoreZeroFlag = false;
+ InsertNOPDoublePrecision = false;
+ FixFSMULD = false;
+ ReplaceFMULS = false;
+ PreventRoundChange = false;
+ FixAllFDIVSQRT = false;
+ InsertNOPLoad = false;
+ FlushCacheLineSWAP = false;
+ InsertNOPsLoadStore = false;
// Determine default and user specified characteristics
std::string CPUName = CPU;
@@ -50,9 +67,9 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
}
SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, TargetMachine &TM,
+ const std::string &FS, const TargetMachine &TM,
bool is64Bit)
- : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit),
+ : SparcGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), Is64Bit(is64Bit),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
FrameLowering(*this) {}
@@ -64,7 +81,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
frameSize += 128;
// Frames with calls must also reserve space for 6 outgoing arguments
// whether they are used or not. LowerCall_64 takes care of that.
- frameSize = RoundUpToAlignment(frameSize, 16);
+ frameSize = alignTo(frameSize, 16);
} else {
// Emit the correct save instruction based on the number of bytes in
// the frame. Minimum stack frame size according to V8 ABI is:
@@ -77,7 +94,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
// Round up to next doubleword boundary -- a double-word boundary
// is required by the ABI.
- frameSize = RoundUpToAlignment(frameSize, 8);
+ frameSize = alignTo(frameSize, 8);
}
return frameSize;
}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index e2fd2f04528a..42d693699994 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -15,11 +15,11 @@
#define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
#include "SparcFrameLowering.h"
-#include "SparcInstrInfo.h"
#include "SparcISelLowering.h"
+#include "SparcInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -30,21 +30,41 @@ namespace llvm {
class StringRef;
class SparcSubtarget : public SparcGenSubtargetInfo {
+ Triple TargetTriple;
virtual void anchor();
bool IsV9;
+ bool IsLeon;
bool V8DeprecatedInsts;
bool IsVIS, IsVIS2, IsVIS3;
bool Is64Bit;
bool HasHardQuad;
bool UsePopc;
+ bool UseSoftFloat;
+
+ // LEON features
+ bool HasUmacSmac;
+ bool HasLeonCasa;
+ bool InsertNOPLoad;
+ bool FixFSMULD;
+ bool ReplaceFMULS;
+ bool FixAllFDIVSQRT;
+ bool UseSoftFpu;
+ bool PerformSDIVReplace;
+ bool FixCallImmediates;
+ bool IgnoreZeroFlag;
+ bool InsertNOPDoublePrecision;
+ bool PreventRoundChange;
+ bool FlushCacheLineSWAP;
+ bool InsertNOPsLoadStore;
+
SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
+ SelectionDAGTargetInfo TSInfo;
SparcFrameLowering FrameLowering;
public:
SparcSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, TargetMachine &TM, bool is64bit);
+ const std::string &FS, const TargetMachine &TM, bool is64bit);
const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const TargetFrameLowering *getFrameLowering() const override {
@@ -56,19 +76,37 @@ public:
const SparcTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
bool enableMachineScheduler() const override;
bool isV9() const { return IsV9; }
+ bool isLeon() const { return IsLeon; }
bool isVIS() const { return IsVIS; }
bool isVIS2() const { return IsVIS2; }
bool isVIS3() const { return IsVIS3; }
bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
bool hasHardQuad() const { return HasHardQuad; }
bool usePopc() const { return UsePopc; }
+ bool useSoftFloat() const { return UseSoftFloat; }
+
+ // Leon options
+ bool useSoftFpu() const { return UseSoftFpu; }
+ bool hasLeonCasa() const { return HasLeonCasa; }
+ bool hasUmacSmac() const { return HasUmacSmac; }
+ bool performSDIVReplace() const { return PerformSDIVReplace; }
+ bool fixCallImmediates() const { return FixCallImmediates; }
+ bool ignoreZeroFlag() const { return IgnoreZeroFlag; }
+ bool insertNOPDoublePrecision() const { return InsertNOPDoublePrecision; }
+ bool fixFSMULD() const { return FixFSMULD; }
+ bool replaceFMULS() const { return ReplaceFMULS; }
+ bool preventRoundChange() const { return PreventRoundChange; }
+ bool fixAllFDIVSQRT() const { return FixAllFDIVSQRT; }
+ bool flushCacheLineSWAP() const { return FlushCacheLineSWAP; }
+ bool insertNOPsLoadStore() const { return InsertNOPsLoadStore; }
+ bool insertNOPLoad() const { return InsertNOPLoad; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -87,6 +125,8 @@ public:
/// returns adjusted framesize which includes space for register window
/// spills and arguments.
int getAdjustedFrameSize(int stackSize) const;
+
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 725d7f047c47..17fe86a70844 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -13,7 +13,9 @@
#include "SparcTargetMachine.h"
#include "SparcTargetObjectFile.h"
#include "Sparc.h"
+#include "LeonPasses.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -52,28 +54,68 @@ static std::string computeDataLayout(const Triple &T, bool is64Bit) {
return Ret;
}
-/// SparcTargetMachine ctor - Create an ILP32 architecture model
-///
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::Static;
+ return *RM;
+}
+
+/// Create an ILP32 architecture model
SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64bit)
: LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
- RM, CM, OL),
+ getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<SparcELFTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this, is64bit) {
+ Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
initAsmInfo();
}
SparcTargetMachine::~SparcTargetMachine() {}
+const SparcSubtarget *
+SparcTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // FIXME: This is related to the code below that resets the target options:
+ // we need to know whether or not the soft-float flag is set on the
+ // function so that we can enable it as a subtarget feature.
+ bool softFloat =
+ F.hasFnAttribute("use-soft-float") &&
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+
+ if (softFloat)
+ FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this,
+ this->is64Bit);
+ }
+ return I.get();
+}
+
namespace {
/// Sparc Code Generator Pass Configuration Options.
class SparcPassConfig : public TargetPassConfig {
public:
SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {}
SparcTargetMachine &getSparcTargetMachine() const {
return getTM<SparcTargetMachine>();
@@ -100,25 +142,62 @@ bool SparcPassConfig::addInstSelector() {
return false;
}
-void SparcPassConfig::addPreEmitPass(){
+void SparcPassConfig::addPreEmitPass() {
addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+ if (this->getSparcTargetMachine().getSubtargetImpl()->ignoreZeroFlag()) {
+ addPass(new IgnoreZeroFlag(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->performSDIVReplace()) {
+ addPass(new ReplaceSDIV(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->fixCallImmediates()) {
+ addPass(new FixCALL(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->fixFSMULD()) {
+ addPass(new FixFSMULD(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->replaceFMULS()) {
+ addPass(new ReplaceFMULS(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->preventRoundChange()) {
+ addPass(new PreventRoundChange(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->fixAllFDIVSQRT()) {
+ addPass(new FixAllFDIVSQRT(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPsLoadStore()) {
+ addPass(new InsertNOPsLoadStore(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPLoad()) {
+ addPass(new InsertNOPLoad(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine().getSubtargetImpl()->flushCacheLineSWAP()) {
+ addPass(new FlushCacheLineSWAP(getSparcTargetMachine()));
+ }
+ if (this->getSparcTargetMachine()
+ .getSubtargetImpl()
+ ->insertNOPDoublePrecision()) {
+ addPass(new InsertNOPDoublePrecision(getSparcTargetMachine()));
+ }
}
-void SparcV8TargetMachine::anchor() { }
+void SparcV8TargetMachine::anchor() {}
SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-void SparcV9TargetMachine::anchor() { }
+void SparcV9TargetMachine::anchor() {}
SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
@@ -127,6 +206,7 @@ void SparcelTargetMachine::anchor() {}
SparcelTargetMachine::SparcelTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 903c2d15629f..48193fe095be 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -23,16 +23,17 @@ namespace llvm {
class SparcTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
SparcSubtarget Subtarget;
+ bool is64Bit;
+ mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap;
public:
SparcTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL,
- bool is64bit);
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit);
~SparcTargetMachine() override;
- const SparcSubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const SparcSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ const SparcSubtarget *getSubtargetImpl(const Function &) const override;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
@@ -41,25 +42,25 @@ public:
}
};
-/// SparcV8TargetMachine - Sparc 32-bit target machine
+/// Sparc 32-bit target machine
///
class SparcV8TargetMachine : public SparcTargetMachine {
virtual void anchor();
public:
SparcV8TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
-/// SparcV9TargetMachine - Sparc 64-bit target machine
+/// Sparc 64-bit target machine
///
class SparcV9TargetMachine : public SparcTargetMachine {
virtual void anchor();
public:
SparcV9TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
@@ -69,7 +70,7 @@ class SparcelTargetMachine : public SparcTargetMachine {
public:
SparcelTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
};
diff --git a/lib/Target/Sparc/TargetInfo/Makefile b/lib/Target/Sparc/TargetInfo/Makefile
deleted file mode 100644
index 641ed87160c7..000000000000
--- a/lib/Target/Sparc/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/Sparc/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSparcInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/AsmParser/Makefile b/lib/Target/SystemZ/AsmParser/Makefile
deleted file mode 100644
index 623ae2c4e3ef..000000000000
--- a/lib/Target/SystemZ/AsmParser/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/SystemZ/AsmParser/Makefile ---------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSystemZAsmParser
-
-# Hack: we need to include 'main' SystemZ target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 9c995bf42b0b..3923614c89d3 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -13,9 +13,9 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -391,6 +391,9 @@ public:
: MCTargetAsmParser(Options, sti), Parser(parser) {
MCAsmParserExtension::Initialize(Parser);
+ // Alias the .word directive to .short.
+ parser.addAliasForDirective(".word", ".short");
+
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index 336f037bb733..4b849ad6491b 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_target(SystemZCodeGen
SystemZSubtarget.cpp
SystemZTargetMachine.cpp
SystemZTargetTransformInfo.cpp
+ SystemZTDC.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/SystemZ/Disassembler/Makefile b/lib/Target/SystemZ/Disassembler/Makefile
deleted file mode 100644
index efc4cc8e9cb0..000000000000
--- a/lib/Target/SystemZ/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===-- lib/Target/SystemZ/Disassembler/Makefile -----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSystemZDisassembler
-
-# Hack: we need to include 'main' x86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index bf67b75d5337..20e015b42d21 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "SystemZ.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -46,6 +46,34 @@ extern "C" void LLVMInitializeSystemZDisassembler() {
createSystemZDisassembler);
}
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - True if the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
+}
+
static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
const unsigned *Regs, unsigned Size) {
assert(RegNo < Size && "Invalid register");
@@ -206,22 +234,35 @@ static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
template<unsigned N>
static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
- uint64_t Address) {
+ uint64_t Address,
+ bool isBranch,
+ const void *Decoder) {
assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
- Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm) * 2 + Address));
+ uint64_t Value = SignExtend64<N>(Imm) * 2 + Address;
+
+ if (!tryAddingSymbolicOperand(Value, isBranch, Address, 2, N / 8,
+ Inst, Decoder))
+ Inst.addOperand(MCOperand::createImm(Value));
+
return MCDisassembler::Success;
}
-static DecodeStatus decodePC16DBLOperand(MCInst &Inst, uint64_t Imm,
- uint64_t Address,
- const void *Decoder) {
- return decodePCDBLOperand<16>(Inst, Imm, Address);
+static DecodeStatus decodePC16DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<16>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC32DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<32>(Inst, Imm, Address, true, Decoder);
}
static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
uint64_t Address,
const void *Decoder) {
- return decodePCDBLOperand<32>(Inst, Imm, Address);
+ return decodePCDBLOperand<32>(Inst, Imm, Address, false, Decoder);
}
static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
diff --git a/lib/Target/SystemZ/InstPrinter/Makefile b/lib/Target/SystemZ/InstPrinter/Makefile
deleted file mode 100644
index 3ba8126735a5..000000000000
--- a/lib/Target/SystemZ/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/SystemZ/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSystemZAsmPrinter
-
-# Hack: we need to include 'main' mips target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/MCTargetDesc/Makefile b/lib/Target/SystemZ/MCTargetDesc/Makefile
deleted file mode 100644
index 08f1a9d51fb5..000000000000
--- a/lib/Target/SystemZ/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/SystemZ/TargetDesc/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSystemZDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 57eebe19c044..c4d546cb7dff 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -58,7 +58,8 @@ public:
const MCAsmLayout &Layout) const override {
return false;
}
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {
llvm_unreachable("SystemZ does do not have assembler relaxation");
}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index ee1af023769e..368c95f7bac2 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -24,8 +24,8 @@ public:
protected:
// Override MCELFObjectTargetWriter.
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
} // end anonymous namespace
@@ -106,7 +106,8 @@ static unsigned getPLTReloc(unsigned Kind) {
llvm_unreachable("Unsupported absolute address");
}
-unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
+unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 2115d4480eef..e16ba9e15317 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -10,7 +10,6 @@
#include "SystemZMCTargetDesc.h"
#include "InstPrinter/SystemZInstPrinter.h"
#include "SystemZMCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -159,17 +158,8 @@ createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createSystemZMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
- // Static code is suitable for use in a dynamic executable; there is no
- // separate DynamicNoPIC model.
- if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
- RM = Reloc::Static;
-
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
// For SystemZ we define the models as follows:
//
// Small: BRASL can call any function and will use a stub if necessary.
@@ -203,8 +193,6 @@ static MCCodeGenInfo *createSystemZMCCodeGenInfo(const Triple &TT,
CM = CodeModel::Small;
else if (CM == CodeModel::JITDefault)
CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
@@ -220,9 +208,9 @@ extern "C" void LLVMInitializeSystemZTargetMC() {
TargetRegistry::RegisterMCAsmInfo(TheSystemZTarget,
createSystemZMCAsmInfo);
- // Register the MCCodeGenInfo.
- TargetRegistry::RegisterMCCodeGenInfo(TheSystemZTarget,
- createSystemZMCCodeGenInfo);
+ // Register the adjustCodeGenOpts.
+ TargetRegistry::registerMCAdjustCodeGenOpts(TheSystemZTarget,
+ adjustCodeGenOpts);
// Register the MCCodeEmitter.
TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget,
diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
deleted file mode 100644
index 732c31725538..000000000000
--- a/lib/Target/SystemZ/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-##===- lib/Target/SystemZ/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMSystemZCodeGen
-TARGET = SystemZ
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = SystemZGenRegisterInfo.inc \
- SystemZGenAsmWriter.inc \
- SystemZGenAsmMatcher.inc \
- SystemZGenDisassemblerTables.inc \
- SystemZGenInstrInfo.inc \
- SystemZGenDAGISel.inc \
- SystemZGenSubtargetInfo.inc \
- SystemZGenCallingConv.inc \
- SystemZGenMCCodeEmitter.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index cd367d60bab7..86a1322c9e23 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -7,13 +7,6 @@ for later architectures at some point.
--
-SystemZDAGToDAGISel::SelectInlineAsmMemoryOperand() is passed "m" for all
-inline asm memory constraints; it doesn't get to see the original constraint.
-This means that it must conservatively treat all inline asm constraints
-as the most restricted type, "R".
-
---
-
If an inline asm ties an i32 "r" result to an i64 input, the input
will be treated as an i32, leaving the upper bits uninitialised.
For example:
@@ -43,15 +36,6 @@ We don't use the BRANCH ON INDEX instructions.
--
-We might want to use BRANCH ON CONDITION for conditional indirect calls
-and conditional returns.
-
---
-
-We don't use the TEST DATA CLASS instructions.
-
---
-
We only use MVC, XC and CLC for constant-length block operations.
We could extend them to variable-length operations too,
using EXECUTE RELATIVE LONG.
@@ -79,11 +63,6 @@ via a register.)
--
-We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
-(LRVH and STRVH).
-
---
-
We don't use ICM or STCM.
--
@@ -123,7 +102,7 @@ ought to be implemented as:
ngr %r2, %r0
br %r14
-but two-address optimisations reverse the order of the AND and force:
+but two-address optimizations reverse the order of the AND and force:
lhi %r0, 1
ngr %r0, %r2
@@ -166,3 +145,10 @@ If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
We might want to model all access registers and use them to spill
32-bit values.
+
+--
+
+We might want to use the 'overflow' condition of e.g. AR to support
+llvm.sadd.with.overflow.i32 and related instructions - the generated code
+for signed overflow checks is currently quite bad. This would improve
+the results of using -ftrapv.
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index cafe2c5948c4..c8ea9641fb62 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -87,6 +87,11 @@ const unsigned CCMASK_VCMP_MIXED = CCMASK_1;
const unsigned CCMASK_VCMP_NONE = CCMASK_3;
const unsigned CCMASK_VCMP = CCMASK_0 | CCMASK_1 | CCMASK_3;
+// Condition-code mask assignments for Test Data Class.
+const unsigned CCMASK_TDC_NOMATCH = CCMASK_0;
+const unsigned CCMASK_TDC_MATCH = CCMASK_1;
+const unsigned CCMASK_TDC = CCMASK_TDC_NOMATCH | CCMASK_TDC_MATCH;
+
// The position of the low CC bit in an IPM result.
const unsigned IPM_CC = 28;
@@ -94,6 +99,41 @@ const unsigned IPM_CC = 28;
const unsigned PFD_READ = 1;
const unsigned PFD_WRITE = 2;
+// Mask assignments for TDC
+const unsigned TDCMASK_ZERO_PLUS = 0x800;
+const unsigned TDCMASK_ZERO_MINUS = 0x400;
+const unsigned TDCMASK_NORMAL_PLUS = 0x200;
+const unsigned TDCMASK_NORMAL_MINUS = 0x100;
+const unsigned TDCMASK_SUBNORMAL_PLUS = 0x080;
+const unsigned TDCMASK_SUBNORMAL_MINUS = 0x040;
+const unsigned TDCMASK_INFINITY_PLUS = 0x020;
+const unsigned TDCMASK_INFINITY_MINUS = 0x010;
+const unsigned TDCMASK_QNAN_PLUS = 0x008;
+const unsigned TDCMASK_QNAN_MINUS = 0x004;
+const unsigned TDCMASK_SNAN_PLUS = 0x002;
+const unsigned TDCMASK_SNAN_MINUS = 0x001;
+
+const unsigned TDCMASK_ZERO = TDCMASK_ZERO_PLUS | TDCMASK_ZERO_MINUS;
+const unsigned TDCMASK_POSITIVE = TDCMASK_NORMAL_PLUS |
+ TDCMASK_SUBNORMAL_PLUS |
+ TDCMASK_INFINITY_PLUS;
+const unsigned TDCMASK_NEGATIVE = TDCMASK_NORMAL_MINUS |
+ TDCMASK_SUBNORMAL_MINUS |
+ TDCMASK_INFINITY_MINUS;
+const unsigned TDCMASK_NAN = TDCMASK_QNAN_PLUS |
+ TDCMASK_QNAN_MINUS |
+ TDCMASK_SNAN_PLUS |
+ TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_PLUS = TDCMASK_POSITIVE |
+ TDCMASK_ZERO_PLUS |
+ TDCMASK_QNAN_PLUS |
+ TDCMASK_SNAN_PLUS;
+const unsigned TDCMASK_MINUS = TDCMASK_NEGATIVE |
+ TDCMASK_ZERO_MINUS |
+ TDCMASK_QNAN_MINUS |
+ TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_ALL = TDCMASK_PLUS | TDCMASK_MINUS;
+
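As a minimal sketch (not part of this patch) of how these bits compose, the mask below selects every NaN and both infinities; any such OR of the constants above forms a valid 12-bit class mask for a Test Data Class operation:

// "Not a finite number": quiet/signaling NaNs of either sign plus +/-infinity.
const unsigned TDCNotFinite = llvm::SystemZ::TDCMASK_NAN |
                              llvm::SystemZ::TDCMASK_INFINITY_PLUS |
                              llvm::SystemZ::TDCMASK_INFINITY_MINUS;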
// Number of bits in a vector register.
const unsigned VectorBits = 128;
@@ -138,6 +178,7 @@ FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZTDCPass();
} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 75273114d62f..9c0f327ff744 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -109,6 +109,85 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
break;
+ case SystemZ::CondReturn:
+ LoweredMI = MCInstBuilder(SystemZ::BCR)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addReg(SystemZ::R14D);
+ break;
+
+ case SystemZ::CRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGRBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGIBReturn:
+ LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R14D)
+ .addImm(0);
+ break;
+
case SystemZ::CallBRASL:
LoweredMI = MCInstBuilder(SystemZ::BRASL)
.addReg(SystemZ::R14D)
@@ -126,10 +205,96 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
break;
+ case SystemZ::CallBRCL:
+ LoweredMI = MCInstBuilder(SystemZ::BRCL)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addExpr(Lower.getExpr(MI->getOperand(2), MCSymbolRefExpr::VK_PLT));
+ break;
+
case SystemZ::CallBR:
LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
break;
+ case SystemZ::CallBCR:
+ LoweredMI = MCInstBuilder(SystemZ::BCR)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addReg(SystemZ::R1D);
+ break;
+
+ case SystemZ::CRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CGIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGRBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
+ case SystemZ::CLGIBCall:
+ LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+ .addReg(MI->getOperand(0).getReg())
+ .addImm(MI->getOperand(1).getImm())
+ .addImm(MI->getOperand(2).getImm())
+ .addReg(SystemZ::R1D)
+ .addImm(0);
+ break;
+
case SystemZ::TLS_GDCALL:
LoweredMI = MCInstBuilder(SystemZ::BRASL)
.addReg(SystemZ::R14D)
@@ -260,6 +425,41 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(15).addReg(SystemZ::R0D);
break;
+ // Emit nothing here but a comment if we can.
+ case SystemZ::MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+ // For traps we want to emit "j .+2", jumping into the relative-immediate
+ // field of the jump instruction itself, which decodes as an illegal
+ // instruction. We cannot emit a "." symbol, so create and emit a temp label
+ // before the instruction and use that instead.
+ case SystemZ::Trap: {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+ LoweredMI = MCInstBuilder(SystemZ::J)
+ .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+ }
+ break;
+
+ // Conditional traps create a branch-on-condition instruction that jumps into
+ // the relative-immediate field of the jump instruction (e.g. "jo .+2").
+ case SystemZ::CondTrap: {
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(DotSym);
+
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+ LoweredMI = MCInstBuilder(SystemZ::BRC)
+ .addImm(MI->getOperand(0).getImm())
+ .addImm(MI->getOperand(1).getImm())
+ .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+ }
+ break;
+
default:
Lower.lower(MI, LoweredMI);
break;
diff --git a/lib/Target/SystemZ/SystemZCallingConv.cpp b/lib/Target/SystemZ/SystemZCallingConv.cpp
index cc9c84b6a058..72da51f74b10 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -12,10 +12,10 @@
using namespace llvm;
-const unsigned SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+const MCPhysReg SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
};
-const unsigned SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+const MCPhysReg SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
};
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
index bff0706618aa..b5523e586f4c 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -12,14 +12,15 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
namespace SystemZ {
const unsigned NumArgGPRs = 5;
- extern const unsigned ArgGPRs[NumArgGPRs];
+ extern const MCPhysReg ArgGPRs[NumArgGPRs];
const unsigned NumArgFPRs = 4;
- extern const unsigned ArgFPRs[NumArgFPRs];
+ extern const MCPhysReg ArgFPRs[NumArgFPRs];
} // end namespace SystemZ
class SystemZCCState : public CCState {
@@ -79,6 +80,51 @@ public:
bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
};
+// Handle i128 argument types. These need to be passed by implicit
+// reference. This could be as simple as the following .td line:
+// CCIfType<[i128], CCPassIndirect<i64>>,
+// except that i128 is not a legal type, and therefore gets split by
+// common code into a pair of i64 arguments.
+inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // ArgFlags.isSplit() is true on the first part of a i128 argument;
+ // PendingMembers.empty() is false on all subsequent parts.
+ if (!ArgFlags.isSplit() && PendingMembers.empty())
+ return false;
+
+ // Push a pending Indirect value location for each part.
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::Indirect;
+ PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT,
+ LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+
+ // OK, we've collected all parts in the pending list. Allocate
+ // the location (register or stack slot) for the indirect pointer.
+ // (This duplicates the usual i64 calling convention rules.)
+ unsigned Reg = State.AllocateReg(SystemZ::ArgGPRs);
+ unsigned Offset = Reg ? 0 : State.AllocateStack(8, 8);
+
+ // Use that same location for all the pending parts.
+ for (auto &It : PendingMembers) {
+ if (Reg)
+ It.convertToReg(Reg);
+ else
+ It.convertToMem(Offset);
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
+
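For intuition (not part of this patch): common code splits an i128 argument into two i64 parts, and the handler above assigns all of those parts one Indirect location, so the callee sees a single pointer instead of two registers. A minimal source-level illustration, assuming a SystemZ target where __int128 lowers to i128:

// The caller spills X to memory and passes only its address (a single i64,
// normally in R2 for the first argument); both i64 halves share that slot.
__int128 addOne(__int128 X) { return X + 1; }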
} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index bdd1b1598adb..2bf5ac29865f 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -33,6 +33,9 @@ def RetCC_SystemZ : CallingConv<[
// Promote i32 to i64 if it has an explicit extension type.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+ // A SwiftError is returned in R9.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
// ABI-compliant code returns 64-bit integers in R2. Make the other
// call-clobbered argument registers available for code that doesn't
// care about the ABI. (R6 is an argument register too, but is
@@ -65,8 +68,17 @@ def CC_SystemZ : CallingConv<[
// are smaller than 64 bits shouldn't.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+ // A SwiftSelf is passed in callee-saved R10.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
+
+ // A SwiftError is passed in callee-saved R9.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
// Force long double values to the stack and pass i64 pointers to them.
CCIfType<[f128], CCPassIndirect<i64>>,
+ // Same for i128 values. These are already split into two i64 here,
+ // so we have to use a custom handler.
+ CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
// The first 5 integer arguments are passed in R2-R6. Note that R6
// is call-saved.
@@ -105,3 +117,6 @@ def CC_SystemZ : CallingConv<[
//===----------------------------------------------------------------------===//
def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
(sequence "F%dD", 8, 15))>;
+
+// R9 is used to return SwiftError; remove it from CSR.
+def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index 4818ed015522..27350b88554d 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -18,7 +18,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -65,18 +64,22 @@ public:
bool processBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
- Reference getRegReferences(MachineInstr *MI, unsigned Reg);
- bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+ Reference getRegReferences(MachineInstr &MI, unsigned Reg);
+ bool convertToBRCT(MachineInstr &MI, MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool convertToLoadAndTest(MachineInstr *MI);
- bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
+ bool convertToLoadAndTest(MachineInstr &MI);
+ bool adjustCCMasksForInstr(MachineInstr &MI, MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool optimizeCompareZero(MachineInstr *Compare,
+ bool optimizeCompareZero(MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool fuseCompareAndBranch(MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool fuseCompareOperations(MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
const SystemZInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -98,14 +101,12 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) {
}
// Return true if any CC result of MI would reflect the value of Reg.
-static bool resultTests(MachineInstr *MI, unsigned Reg) {
- if (MI->getNumOperands() > 0 &&
- MI->getOperand(0).isReg() &&
- MI->getOperand(0).isDef() &&
- MI->getOperand(0).getReg() == Reg)
+static bool resultTests(MachineInstr &MI, unsigned Reg) {
+ if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() &&
+ MI.getOperand(0).isDef() && MI.getOperand(0).getReg() == Reg)
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case SystemZ::LR:
case SystemZ::LGR:
case SystemZ::LGFR:
@@ -118,7 +119,7 @@ static bool resultTests(MachineInstr *MI, unsigned Reg) {
case SystemZ::LTEBR:
case SystemZ::LTDBR:
case SystemZ::LTXBR:
- if (MI->getOperand(1).getReg() == Reg)
+ if (MI.getOperand(1).getReg() == Reg)
return true;
}
@@ -126,10 +127,10 @@ static bool resultTests(MachineInstr *MI, unsigned Reg) {
}
// Describe the references to Reg or any of its aliases in MI.
-Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
+Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
Reference Ref;
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = MI->getOperand(I);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
if (MO.isReg()) {
if (unsigned MOReg = MO.getReg()) {
if (TRI->regsOverlap(MOReg, Reg)) {
@@ -146,23 +147,23 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
// Return true if this is a load and test which can be optimized the
// same way as compare instruction.
-static bool isLoadAndTestAsCmp(MachineInstr *MI) {
+static bool isLoadAndTestAsCmp(MachineInstr &MI) {
// If we during isel used a load-and-test as a compare with 0, the
// def operand is dead.
- return ((MI->getOpcode() == SystemZ::LTEBR ||
- MI->getOpcode() == SystemZ::LTDBR ||
- MI->getOpcode() == SystemZ::LTXBR) &&
- MI->getOperand(0).isDead());
+ return (MI.getOpcode() == SystemZ::LTEBR ||
+ MI.getOpcode() == SystemZ::LTDBR ||
+ MI.getOpcode() == SystemZ::LTXBR) &&
+ MI.getOperand(0).isDead();
}
// Return the source register of Compare, which is the unknown value
// being tested.
-static unsigned getCompareSourceReg(MachineInstr *Compare) {
+static unsigned getCompareSourceReg(MachineInstr &Compare) {
unsigned reg = 0;
- if (Compare->isCompare())
- reg = Compare->getOperand(0).getReg();
+ if (Compare.isCompare())
+ reg = Compare.getOperand(0).getReg();
else if (isLoadAndTestAsCmp(Compare))
- reg = Compare->getOperand(1).getReg();
+ reg = Compare.getOperand(1).getReg();
assert (reg);
return reg;
@@ -171,11 +172,11 @@ static unsigned getCompareSourceReg(MachineInstr *Compare) {
// Compare compares the result of MI against zero. If MI is an addition
// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
// and convert the branch to a BRCT(G). Return true on success.
-bool
-SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
+bool SystemZElimCompare::convertToBRCT(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
// Check whether we have an addition of -1.
- unsigned Opcode = MI->getOpcode();
+ unsigned Opcode = MI.getOpcode();
unsigned BRCT;
if (Opcode == SystemZ::AHI)
BRCT = SystemZ::BRCT;
@@ -183,7 +184,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
BRCT = SystemZ::BRCTG;
else
return false;
- if (MI->getOperand(2).getImm() != -1)
+ if (MI.getOperand(2).getImm() != -1)
return false;
// Check whether we have a single JLH.
@@ -201,7 +202,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
unsigned SrcReg = getCompareSourceReg(Compare);
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
- if (getRegReferences(MBBI, SrcReg))
+ if (getRegReferences(*MBBI, SrcReg))
return false;
// The transformation is OK. Rebuild Branch as a BRCT(G).
@@ -210,24 +211,24 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
Branch->RemoveOperand(0);
Branch->setDesc(TII->get(BRCT));
MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addOperand(Target)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
- MI->eraseFromParent();
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(Target)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ MI.eraseFromParent();
return true;
}
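
For orientation, a minimal source-level sketch (not taken from this patch) of the loop shape the BRCT(G) conversion above targets; work() is a hypothetical placeholder.

void work();             // hypothetical loop body, assumed elsewhere

// Sketch only: an add of -1, a compare against zero, and a branch on nonzero,
// which BRCT/BRCTG folds into a single decrement-and-branch.
void runLoop(long n) {
  do {
    work();
  } while (--n != 0);
}
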
// If MI is a load instruction, try to convert it into a LOAD AND TEST.
// Return true on success.
-bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
- unsigned Opcode = TII->getLoadAndTest(MI->getOpcode());
+bool SystemZElimCompare::convertToLoadAndTest(MachineInstr &MI) {
+ unsigned Opcode = TII->getLoadAndTest(MI.getOpcode());
if (!Opcode)
return false;
- MI->setDesc(TII->get(Opcode));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ MI.setDesc(TII->get(Opcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine);
return true;
}
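
As a hedged illustration, again not from the patch, the kind of code where convertToLoadAndTest applies: a plain load whose value is immediately tested against zero.

// Sketch only: the load of *p and the zero test can be covered by one
// load-and-test instruction, which also sets CC for the following branch.
int classify(const long *p) {
  long v = *p;            // load
  return v == 0 ? 0 : 1;  // compare against zero
}
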
@@ -236,10 +237,10 @@ bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
// would also reflect the value of X. Try to adjust CCUsers so that
// they test the result of MI directly, returning true on success.
// Leave everything unchanged on failure.
-bool SystemZElimCompare::
-adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
- int Opcode = MI->getOpcode();
+bool SystemZElimCompare::adjustCCMasksForInstr(
+ MachineInstr &MI, MachineInstr &Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+ int Opcode = MI.getOpcode();
const MCInstrDesc &Desc = TII->get(Opcode);
unsigned MIFlags = Desc.TSFlags;
@@ -247,7 +248,7 @@ adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
unsigned ReusableCCMask = SystemZII::getCompareZeroCCMask(MIFlags);
// For unsigned comparisons with zero, only equality makes sense.
- unsigned CompareFlags = Compare->getDesc().TSFlags;
+ unsigned CompareFlags = Compare.getDesc().TSFlags;
if (CompareFlags & SystemZII::IsLogical)
ReusableCCMask &= SystemZ::CCMASK_CMP_EQ;
@@ -296,9 +297,9 @@ adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
}
// CC is now live after MI.
- int CCDef = MI->findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+ int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
assert(CCDef >= 0 && "Couldn't find CC set");
- MI->getOperand(CCDef).setIsDead(false);
+ MI.getOperand(CCDef).setIsDead(false);
// Clear any intervening kills of CC.
MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
@@ -309,8 +310,8 @@ adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
}
// Return true if Compare is a comparison against zero.
-static bool isCompareZero(MachineInstr *Compare) {
- switch (Compare->getOpcode()) {
+static bool isCompareZero(MachineInstr &Compare) {
+ switch (Compare.getOpcode()) {
case SystemZ::LTEBRCompare:
case SystemZ::LTDBRCompare:
case SystemZ::LTXBRCompare:
@@ -321,9 +322,8 @@ static bool isCompareZero(MachineInstr *Compare) {
if (isLoadAndTestAsCmp(Compare))
return true;
- return (Compare->getNumExplicitOperands() == 2 &&
- Compare->getOperand(1).isImm() &&
- Compare->getOperand(1).getImm() == 0);
+ return Compare.getNumExplicitOperands() == 2 &&
+ Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
}
}
@@ -331,21 +331,20 @@ static bool isCompareZero(MachineInstr *Compare) {
// a value against zero. Return true on success and if Compare should be
// deleted as dead. CCUsers is the list of instructions that use the CC
// value produced by Compare.
-bool SystemZElimCompare::
-optimizeCompareZero(MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
+bool SystemZElimCompare::optimizeCompareZero(
+ MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
if (!isCompareZero(Compare))
return false;
// Search back for CC results that are based on the first operand.
unsigned SrcReg = getCompareSourceReg(Compare);
- MachineBasicBlock &MBB = *Compare->getParent();
+ MachineBasicBlock &MBB = *Compare.getParent();
MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
Reference CCRefs;
Reference SrcRefs;
while (MBBI != MBBE) {
--MBBI;
- MachineInstr *MI = MBBI;
+ MachineInstr &MI = *MBBI;
if (resultTests(MI, SrcReg)) {
// Try to remove both MI and Compare by converting a branch to BRCT(G).
// We don't care in this case whether CC is modified between MI and
@@ -373,54 +372,85 @@ optimizeCompareZero(MachineInstr *Compare,
// Try to fuse comparison instruction Compare into a later branch, conditional
// return, sibcall, or trap. Return true on success and if Compare is
// therefore redundant.
-bool SystemZElimCompare::
-fuseCompareAndBranch(MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers) {
- // See whether we have a comparison that can be fused.
- unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
- Compare);
- if (!FusedOpcode)
- return false;
-
+bool SystemZElimCompare::fuseCompareOperations(
+ MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
// See whether we have a single branch with which to fuse.
if (CCUsers.size() != 1)
return false;
MachineInstr *Branch = CCUsers[0];
- if (Branch->getOpcode() != SystemZ::BRC)
+ SystemZII::FusedCompareType Type;
+ switch (Branch->getOpcode()) {
+ case SystemZ::BRC:
+ Type = SystemZII::CompareAndBranch;
+ break;
+ case SystemZ::CondReturn:
+ Type = SystemZII::CompareAndReturn;
+ break;
+ case SystemZ::CallBCR:
+ Type = SystemZII::CompareAndSibcall;
+ break;
+ case SystemZ::CondTrap:
+ Type = SystemZII::CompareAndTrap;
+ break;
+ default:
+ return false;
+ }
+
+ // See whether we have a comparison that can be fused.
+ unsigned FusedOpcode =
+ TII->getFusedCompare(Compare.getOpcode(), Type, &Compare);
+ if (!FusedOpcode)
return false;
// Make sure that the operands are available at the branch.
- unsigned SrcReg = Compare->getOperand(0).getReg();
- unsigned SrcReg2 = (Compare->getOperand(1).isReg() ?
- Compare->getOperand(1).getReg() : 0);
+ unsigned SrcReg = Compare.getOperand(0).getReg();
+ unsigned SrcReg2 =
+ Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0;
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
if (MBBI->modifiesRegister(SrcReg, TRI) ||
(SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
return false;
- // Read the branch mask and target.
+ // Read the branch mask, target (if applicable), and regmask (if applicable).
MachineOperand CCMask(MBBI->getOperand(1));
- MachineOperand Target(MBBI->getOperand(2));
assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
"Invalid condition-code mask for integer comparison");
+ // This is only valid for CompareAndBranch.
+ MachineOperand Target(MBBI->getOperand(
+ Type == SystemZII::CompareAndBranch ? 2 : 0));
+ const uint32_t *RegMask;
+ if (Type == SystemZII::CompareAndSibcall)
+ RegMask = MBBI->getOperand(2).getRegMask();
// Clear out all current operands.
int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
- assert(CCUse >= 0 && "BRC must use CC");
+ assert(CCUse >= 0 && "BRC/BCR must use CC");
Branch->RemoveOperand(CCUse);
- Branch->RemoveOperand(2);
+ // Remove target (branch) or regmask (sibcall).
+ if (Type == SystemZII::CompareAndBranch ||
+ Type == SystemZII::CompareAndSibcall)
+ Branch->RemoveOperand(2);
Branch->RemoveOperand(1);
Branch->RemoveOperand(0);
// Rebuild Branch as a fused compare and branch.
Branch->setDesc(TII->get(FusedOpcode));
- MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
- .addOperand(Compare->getOperand(0))
- .addOperand(Compare->getOperand(1))
- .addOperand(CCMask)
- .addOperand(Target)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
+ MIB.addOperand(Compare.getOperand(0))
+ .addOperand(Compare.getOperand(1))
+ .addOperand(CCMask);
+
+ if (Type == SystemZII::CompareAndBranch) {
+ // Only conditional branches define CC, as they may be converted back
+ // to a non-fused branch because of a long displacement. Conditional
+ // returns don't have that problem.
+ MIB.addOperand(Target)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ if (Type == SystemZII::CompareAndSibcall)
+ MIB.addRegMask(RegMask);
// Clear any intervening kills of SrcReg and SrcReg2.
MBBI = Compare;
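
One of the newly handled cases, CompareAndTrap, corresponds at the source level to a guard like the hedged sketch below; __builtin_trap() is the standard Clang/GCC builtin and the rest is illustrative.

// Sketch only: when the conditional trap is the sole user of the compare's
// CC value, the pair can be fused into a compare-and-trap instruction.
void checkIndex(unsigned long i, unsigned long n) {
  if (i >= n)
    __builtin_trap();
}
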
@@ -445,29 +475,31 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> CCUsers;
MachineBasicBlock::iterator MBBI = MBB.end();
while (MBBI != MBB.begin()) {
- MachineInstr *MI = --MBBI;
- if (CompleteCCUsers &&
- (MI->isCompare() || isLoadAndTestAsCmp(MI)) &&
+ MachineInstr &MI = *--MBBI;
+ if (CompleteCCUsers && (MI.isCompare() || isLoadAndTestAsCmp(MI)) &&
(optimizeCompareZero(MI, CCUsers) ||
- fuseCompareAndBranch(MI, CCUsers))) {
+ fuseCompareOperations(MI, CCUsers))) {
++MBBI;
- MI->eraseFromParent();
+ MI.eraseFromParent();
Changed = true;
CCUsers.clear();
continue;
}
- if (MI->definesRegister(SystemZ::CC)) {
+ if (MI.definesRegister(SystemZ::CC)) {
CCUsers.clear();
CompleteCCUsers = true;
}
- if (MI->readsRegister(SystemZ::CC) && CompleteCCUsers)
- CCUsers.push_back(MI);
+ if (MI.readsRegister(SystemZ::CC) && CompleteCCUsers)
+ CCUsers.push_back(&MI);
}
return Changed;
}
bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
+ if (skipFunction(*F.getFunction()))
+ return false;
+
TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
TRI = &TII->getRegisterInfo();
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index e1b20d0536d1..ccaed49475ca 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -81,6 +81,12 @@ void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
SavedRegs.set(SystemZ::ArgGPRs[I]);
+ // If there are any landing pads, entering them will modify r6/r7.
+ if (!MF.getMMI().getLandingPads().empty()) {
+ SavedRegs.set(SystemZ::R6D);
+ SavedRegs.set(SystemZ::R7D);
+ }
+
// If the function requires a frame pointer, record that the hard
// frame pointer will be clobbered.
if (HasFP)
@@ -258,7 +264,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// Do a second scan adding regs as being defined by instruction
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
- if (Reg != LowGPR && Reg != HighGPR)
+ if (Reg != LowGPR && Reg != HighGPR &&
+ SystemZ::GR64BitRegClass.contains(Reg))
MIB.addReg(Reg, RegState::ImplicitDefine);
}
}
@@ -353,6 +360,15 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t StackSize = getAllocatedStackSize(MF);
if (StackSize) {
+ // Determine if we want to store a backchain.
+ bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+ // If we need a backchain, save the current stack pointer. R1 is free at
+ // this point.
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR))
+ .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
+
// Allocate StackSize bytes.
int64_t Delta = -int64_t(StackSize);
emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
@@ -363,6 +379,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
SPOffsetFromCFA += Delta;
+
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
+ .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D).addImm(0).addReg(0);
}
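
A conceptual model, offered as an assumption for illustration rather than code from the patch, of what the backchain store establishes: each frame's first slot points at the caller's frame.

// Sketch only: the STG above writes the incoming stack pointer to offset 0
// of the newly allocated frame, forming a singly linked list of frames that
// can be walked without unwind information.
struct Frame {
  struct Frame *BackChain;  // slot 0: address of the previous frame
  // ... remainder of the frame ...
};
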
if (HasFP) {
@@ -511,7 +531,7 @@ SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return true;
}
-void SystemZFrameLowering::
+MachineBasicBlock::iterator SystemZFrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
@@ -520,7 +540,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
case SystemZ::ADJCALLSTACKUP:
assert(hasReservedCallFrame(MF) &&
"ADJSTACKDOWN and ADJSTACKUP should be no-ops");
- MBB.erase(MI);
+ return MBB.erase(MI);
break;
default:
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 46bb6b7a7573..d43a176ad874 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -46,10 +46,9 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const
- override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
// Return the number of bytes in the callee-allocated part of the frame.
uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index a9093094d884..cd7fcc3070a4 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -113,7 +113,8 @@ static uint64_t allOnes(unsigned int Count) {
// (and (rotl Input, Rotate), Mask)
//
// otherwise. The output value has BitSize bits, although Input may be
-// narrower (in which case the upper bits are don't care).
+// narrower (in which case the upper bits are don't care), or wider (in which
+// case the result will be truncated as part of the operation).
struct RxSBGOperands {
RxSBGOperands(unsigned Op, SDValue N)
: Opcode(Op), BitSize(N.getValueType().getSizeInBits()),
@@ -279,18 +280,18 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
bool expandRxSBG(RxSBGOperands &RxSBG) const;
// Return an undefined value of type VT.
- SDValue getUNDEF(SDLoc DL, EVT VT) const;
+ SDValue getUNDEF(const SDLoc &DL, EVT VT) const;
// Convert N to VT, if it isn't already.
- SDValue convertTo(SDLoc DL, EVT VT, SDValue N) const;
+ SDValue convertTo(const SDLoc &DL, EVT VT, SDValue N) const;
// Try to implement AND or shift node N using RISBG with the zero flag set.
// Return true on success, otherwise return false.
- SDNode *tryRISBGZero(SDNode *N);
+ bool tryRISBGZero(SDNode *N);
// Try to use RISBG or Opcode to implement OR or XOR node N.
// Return true on success, otherwise return false.
- SDNode *tryRxSBG(SDNode *N, unsigned Opcode);
+ bool tryRxSBG(SDNode *N, unsigned Opcode);
// If Op0 is null, then Node is a constant that can be loaded using:
//
@@ -299,14 +300,14 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// If Op0 is nonnull, then Node can be implemented using:
//
// (Opcode (Opcode Op0 UpperVal) LowerVal)
- SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
- uint64_t UpperVal, uint64_t LowerVal);
+ void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
+ uint64_t UpperVal, uint64_t LowerVal);
// Try to use gather instruction Opcode to implement vector insertion N.
- SDNode *tryGather(SDNode *N, unsigned Opcode);
+ bool tryGather(SDNode *N, unsigned Opcode);
// Try to use scatter instruction Opcode to implement store Store.
- SDNode *tryScatter(StoreSDNode *Store, unsigned Opcode);
+ bool tryScatter(StoreSDNode *Store, unsigned Opcode);
// Return true if Load and Store are loads and stores of the same size
// and are guaranteed not to overlap. Such operations can be implemented
@@ -343,7 +344,7 @@ public:
}
// Override SelectionDAGISel.
- SDNode *Select(SDNode *Node) override;
+ void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
@@ -554,6 +555,10 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
expandDisp(AM, true, SDValue(),
cast<ConstantSDNode>(Addr)->getSExtValue()))
;
+ // Also see if it's a bare ADJDYNALLOC.
+ else if (Addr.getOpcode() == SystemZISD::ADJDYNALLOC &&
+ expandAdjDynAlloc(AM, true, SDValue()))
+ ;
else
// Otherwise try expanding each component.
while (expandAddress(AM, true) ||
@@ -741,6 +746,16 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
SDValue N = RxSBG.Input;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
+ case ISD::TRUNCATE: {
+ if (RxSBG.Opcode == SystemZ::RNSBG)
+ return false;
+ uint64_t BitSize = N.getValueType().getSizeInBits();
+ uint64_t Mask = allOnes(BitSize);
+ if (!refineRxSBGMask(RxSBG, Mask))
+ return false;
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
case ISD::AND: {
if (RxSBG.Opcode == SystemZ::RNSBG)
return false;
@@ -888,12 +903,13 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
}
}
-SDValue SystemZDAGToDAGISel::getUNDEF(SDLoc DL, EVT VT) const {
+SDValue SystemZDAGToDAGISel::getUNDEF(const SDLoc &DL, EVT VT) const {
SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return SDValue(N, 0);
}
-SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
+SDValue SystemZDAGToDAGISel::convertTo(const SDLoc &DL, EVT VT,
+ SDValue N) const {
if (N.getValueType() == MVT::i32 && VT == MVT::i64)
return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
DL, VT, getUNDEF(DL, MVT::i64), N);
@@ -903,23 +919,27 @@ SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
return N;
}
-SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
if (!VT.isInteger() || VT.getSizeInBits() > 64)
- return nullptr;
+ return false;
RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
unsigned Count = 0;
while (expandRxSBG(RISBG))
- if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
+ // The widening or narrowing is expected to be free.
+ // Counting widening or narrowing as a saved operation will result in
+ // preferring an R*SBG over a simple shift/logical instruction.
+ if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND &&
+ RISBG.Input.getOpcode() != ISD::TRUNCATE)
Count += 1;
if (Count == 0)
- return nullptr;
+ return false;
if (Count == 1) {
// Prefer to use normal shift instructions over RISBG, since they can handle
// all cases and are sometimes shorter.
if (N->getOpcode() != ISD::AND)
- return nullptr;
+ return false;
// Prefer register extensions like LLC over RISBG. Also prefer to start
// out with normal ANDs if one instruction would be enough. We can convert
@@ -934,9 +954,10 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
if (MaskN->getZExtValue() != RISBG.Mask) {
SDValue NewMask = CurDAG->getConstant(RISBG.Mask, DL, VT);
N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask);
- return SelectCode(N);
+ SelectCode(N);
+ return true;
}
- return nullptr;
+ return false;
}
}
@@ -952,8 +973,11 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
}
SDValue In = convertTo(DL, VT, RISBG.Input);
- N = CurDAG->getMachineNode(OpCode, DL, VT, In);
- return convertTo(DL, VT, SDValue(N, 0)).getNode();
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(OpCode, DL, VT, In), 0));
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
+ return true;
}
unsigned Opcode = SystemZ::RISBG;
@@ -974,15 +998,18 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
CurDAG->getTargetConstant(RISBG.End | 128, DL, MVT::i32),
CurDAG->getTargetConstant(RISBG.Rotate, DL, MVT::i32)
};
- N = CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops);
- return convertTo(DL, VT, SDValue(N, 0)).getNode();
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
+ return true;
}
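
For reference, the value an RISBG selected with the zero flag represents, written out in the (and (rotl Input, Rotate), Mask) form described for RxSBGOperands earlier; the helper below is purely illustrative and not part of the patch.

#include <cstdint>

// Sketch only: rotate left, then keep the bits selected by Mask.
static uint64_t risbgZeroValue(uint64_t Input, unsigned Rotate, uint64_t Mask) {
  uint64_t Rotated =
      Rotate ? ((Input << Rotate) | (Input >> (64 - Rotate))) : Input;
  return Rotated & Mask;
}
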
-SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
+bool SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
if (!VT.isInteger() || VT.getSizeInBits() > 64)
- return nullptr;
+ return false;
// Try treating each operand of N as the second operand of the RxSBG
// and see which goes deepest.
RxSBGOperands RxSBG[] = {
@@ -992,12 +1019,16 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
unsigned Count[] = { 0, 0 };
for (unsigned I = 0; I < 2; ++I)
while (expandRxSBG(RxSBG[I]))
- if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND)
+ // The widening or narrowing is expected to be free.
+ // Counting widening or narrowing as a saved operation will result in
+ // preferring an R*SBG over a simple shift/logical instruction.
+ if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND &&
+ RxSBG[I].Input.getOpcode() != ISD::TRUNCATE)
Count[I] += 1;
// Do nothing if neither operand is suitable.
if (Count[0] == 0 && Count[1] == 0)
- return nullptr;
+ return false;
// Pick the deepest second operand.
unsigned I = Count[0] > Count[1] ? 0 : 1;
@@ -1007,7 +1038,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
if (Load->getMemoryVT() == MVT::i8)
- return nullptr;
+ return false;
// See whether we can avoid an AND in the first operand by converting
// ROSBG to RISBG.
@@ -1025,47 +1056,70 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
CurDAG->getTargetConstant(RxSBG[I].End, DL, MVT::i32),
CurDAG->getTargetConstant(RxSBG[I].Rotate, DL, MVT::i32)
};
- N = CurDAG->getMachineNode(Opcode, DL, MVT::i64, Ops);
- return convertTo(DL, VT, SDValue(N, 0)).getNode();
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, MVT::i64, Ops), 0));
+ ReplaceNode(N, New.getNode());
+ return true;
}
-SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
- SDValue Op0, uint64_t UpperVal,
- uint64_t LowerVal) {
+void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+ SDValue Op0, uint64_t UpperVal,
+ uint64_t LowerVal) {
EVT VT = Node->getValueType(0);
SDLoc DL(Node);
SDValue Upper = CurDAG->getConstant(UpperVal, DL, VT);
if (Op0.getNode())
Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
- Upper = SDValue(Select(Upper.getNode()), 0);
+
+ {
+ // When we haven't passed in Op0, Upper will be a constant. In order to
+ // prevent folding back to the large immediate in `Or = getNode(...)` we run
+ // SelectCode first and end up with an opaque machine node. This means that
+ // we need to use a handle to keep track of Upper in case it gets CSE'd by
+ // SelectCode.
+ //
+ // Note that in the case where Op0 is passed in we could just call
+ // SelectCode(Upper) later, along with the SelectCode(Or), and avoid needing
+ // the handle at all, but it's fine to do it here.
+ //
+ // TODO: This is a pretty hacky way to do this. Can we do something that
+ // doesn't require a two paragraph explanation?
+ HandleSDNode Handle(Upper);
+ SelectCode(Upper.getNode());
+ Upper = Handle.getValue();
+ }
SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
- return Or.getNode();
+
+ ReplaceUses(Node, Or.getNode());
+ CurDAG->RemoveDeadNode(Node);
+
+ SelectCode(Or.getNode());
}
-SDNode *SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
+bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
SDValue ElemV = N->getOperand(2);
auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
if (!ElemN)
- return 0;
+ return false;
unsigned Elem = ElemN->getZExtValue();
EVT VT = N->getValueType(0);
if (Elem >= VT.getVectorNumElements())
- return 0;
+ return false;
auto *Load = dyn_cast<LoadSDNode>(N->getOperand(1));
if (!Load || !Load->hasOneUse())
- return 0;
+ return false;
if (Load->getMemoryVT().getSizeInBits() !=
Load->getValueType(0).getSizeInBits())
- return 0;
+ return false;
SDValue Base, Disp, Index;
if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) ||
Index.getValueType() != VT.changeVectorElementTypeToInteger())
- return 0;
+ return false;
SDLoc DL(Load);
SDValue Ops[] = {
@@ -1074,39 +1128,41 @@ SDNode *SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
};
SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops);
ReplaceUses(SDValue(Load, 1), SDValue(Res, 1));
- return Res;
+ ReplaceNode(N, Res);
+ return true;
}
-SDNode *SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
+bool SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
SDValue Value = Store->getValue();
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return 0;
+ return false;
if (Store->getMemoryVT().getSizeInBits() !=
Value.getValueType().getSizeInBits())
- return 0;
+ return false;
SDValue ElemV = Value.getOperand(1);
auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
if (!ElemN)
- return 0;
+ return false;
SDValue Vec = Value.getOperand(0);
EVT VT = Vec.getValueType();
unsigned Elem = ElemN->getZExtValue();
if (Elem >= VT.getVectorNumElements())
- return 0;
+ return false;
SDValue Base, Disp, Index;
if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) ||
Index.getValueType() != VT.changeVectorElementTypeToInteger())
- return 0;
+ return false;
SDLoc DL(Store);
SDValue Ops[] = {
Vec, Base, Disp, Index, CurDAG->getTargetConstant(Elem, DL, MVT::i32),
Store->getChain()
};
- return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ ReplaceNode(Store, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+ return true;
}
bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
@@ -1167,7 +1223,7 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
}
-SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
+void SystemZDAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
@@ -1175,43 +1231,47 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
unsigned Opcode = Node->getOpcode();
- SDNode *ResNode = nullptr;
switch (Opcode) {
case ISD::OR:
if (Node->getOperand(1).getOpcode() != ISD::Constant)
- ResNode = tryRxSBG(Node, SystemZ::ROSBG);
+ if (tryRxSBG(Node, SystemZ::ROSBG))
+ return;
goto or_xor;
case ISD::XOR:
if (Node->getOperand(1).getOpcode() != ISD::Constant)
- ResNode = tryRxSBG(Node, SystemZ::RXSBG);
+ if (tryRxSBG(Node, SystemZ::RXSBG))
+ return;
// Fall through.
or_xor:
// If this is a 64-bit operation in which both 32-bit halves are nonzero,
// split the operation into two.
- if (!ResNode && Node->getValueType(0) == MVT::i64)
+ if (Node->getValueType(0) == MVT::i64)
if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
uint64_t Val = Op1->getZExtValue();
- if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val))
- Node = splitLargeImmediate(Opcode, Node, Node->getOperand(0),
- Val - uint32_t(Val), uint32_t(Val));
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) {
+ splitLargeImmediate(Opcode, Node, Node->getOperand(0),
+ Val - uint32_t(Val), uint32_t(Val));
+ return;
+ }
}
break;
case ISD::AND:
if (Node->getOperand(1).getOpcode() != ISD::Constant)
- ResNode = tryRxSBG(Node, SystemZ::RNSBG);
+ if (tryRxSBG(Node, SystemZ::RNSBG))
+ return;
// Fall through.
case ISD::ROTL:
case ISD::SHL:
case ISD::SRL:
case ISD::ZERO_EXTEND:
- if (!ResNode)
- ResNode = tryRISBGZero(Node);
+ if (tryRISBGZero(Node))
+ return;
break;
case ISD::Constant:
@@ -1219,9 +1279,11 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
// LLIHF and LGFI, split it into two 32-bit pieces.
if (Node->getValueType(0) == MVT::i64) {
uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
- if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val))
- Node = splitLargeImmediate(ISD::OR, Node, SDValue(),
- Val - uint32_t(Val), uint32_t(Val));
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) {
+ splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val),
+ uint32_t(Val));
+ return;
+ }
}
break;
@@ -1249,63 +1311,75 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
case ISD::INSERT_VECTOR_ELT: {
EVT VT = Node->getValueType(0);
unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits();
- if (ElemBitSize == 32)
- ResNode = tryGather(Node, SystemZ::VGEF);
- else if (ElemBitSize == 64)
- ResNode = tryGather(Node, SystemZ::VGEG);
+ if (ElemBitSize == 32) {
+ if (tryGather(Node, SystemZ::VGEF))
+ return;
+ } else if (ElemBitSize == 64) {
+ if (tryGather(Node, SystemZ::VGEG))
+ return;
+ }
break;
}
case ISD::STORE: {
auto *Store = cast<StoreSDNode>(Node);
unsigned ElemBitSize = Store->getValue().getValueType().getSizeInBits();
- if (ElemBitSize == 32)
- ResNode = tryScatter(Store, SystemZ::VSCEF);
- else if (ElemBitSize == 64)
- ResNode = tryScatter(Store, SystemZ::VSCEG);
+ if (ElemBitSize == 32) {
+ if (tryScatter(Store, SystemZ::VSCEF))
+ return;
+ } else if (ElemBitSize == 64) {
+ if (tryScatter(Store, SystemZ::VSCEG))
+ return;
+ }
break;
}
}
- // Select the default instruction
- if (!ResNode)
- ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ";
- if (ResNode == nullptr || ResNode == Node)
- Node->dump(CurDAG);
- else
- ResNode->dump(CurDAG);
- errs() << "\n";
- );
- return ResNode;
+ SelectCode(Node);
}
bool SystemZDAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) {
+ SystemZAddressingMode::AddrForm Form;
+ SystemZAddressingMode::DispRange DispRange;
+ SDValue Base, Disp, Index;
+
switch(ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_i:
- case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
+ // Accept an address with a short displacement, but no index.
+ Form = SystemZAddressingMode::FormBD;
+ DispRange = SystemZAddressingMode::Disp12Only;
+ break;
case InlineAsm::Constraint_R:
+ // Accept an address with a short displacement and an index.
+ Form = SystemZAddressingMode::FormBDXNormal;
+ DispRange = SystemZAddressingMode::Disp12Only;
+ break;
case InlineAsm::Constraint_S:
+ // Accept an address with a long displacement, but no index.
+ Form = SystemZAddressingMode::FormBD;
+ DispRange = SystemZAddressingMode::Disp20Only;
+ break;
case InlineAsm::Constraint_T:
- // Accept addresses with short displacements, which are compatible
- // with Q, R, S and T. But keep the index operand for future expansion.
- SDValue Base, Disp, Index;
- if (selectBDXAddr(SystemZAddressingMode::FormBD,
- SystemZAddressingMode::Disp12Only,
- Op, Base, Disp, Index)) {
- OutOps.push_back(Base);
- OutOps.push_back(Disp);
- OutOps.push_back(Index);
- return false;
- }
+ case InlineAsm::Constraint_m:
+ // Accept an address with a long displacement and an index.
+ // m works the same as T, as this is the most general case.
+ Form = SystemZAddressingMode::FormBDXNormal;
+ DispRange = SystemZAddressingMode::Disp20Only;
break;
}
+
+ if (selectBDXAddr(Form, DispRange, Op, Base, Disp, Index)) {
+ OutOps.push_back(Base);
+ OutOps.push_back(Disp);
+ OutOps.push_back(Index);
+ return false;
+ }
+
return true;
}
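
From the user's side, the constraints distinguished above map onto GCC/Clang inline-asm memory constraints; the snippet below is a hedged example, with the instruction choice (st, which takes a base, index, and 12-bit displacement) matching the "R" case.

// Sketch only: "R" asks for base+index with a short (12-bit) displacement,
// "T" for a long (20-bit) displacement; "m" now behaves like "T".
void storeWord(int *p, int v) {
  asm volatile("st %1, %0" : "=R"(*p) : "r"(v));
}
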
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index b0a612764636..14991bbbd365 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -184,8 +184,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// No special instructions for these.
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
// Use *MUL_LOHI where possible instead of MULH*.
@@ -216,6 +214,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ // Traps are legal, as we will convert them to "j .+2".
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
// z10 has instructions for signed but not unsigned FP conversion.
// Handle unsigned 32-bit types as signed 64-bit types.
if (!Subtarget.hasFPExtension()) {
@@ -253,6 +256,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// We need to handle dynamic allocations specially because of the
// 160-byte area at the bottom of the stack.
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
// Use custom expanders so that we can force the function to use
// a frame pointer.
@@ -310,8 +314,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
// Convert a GPR scalar to a vector by inserting it into element 0.
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
@@ -437,6 +439,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine(ISD::BSWAP);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::ROTL);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -799,7 +806,7 @@ static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
// Value is a value that has been passed to us in the location described by VA
// (and so has type VA.getLocVT()). Convert Value to VA.getValVT(), chaining
// any loads onto Chain.
-static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
CCValAssign &VA, SDValue Chain,
SDValue Value) {
// If the argument has been promoted from a smaller type, insert an
@@ -813,16 +820,12 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
if (VA.isExtInLoc())
Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
- else if (VA.getLocInfo() == CCValAssign::Indirect)
- Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
- MachinePointerInfo(), false, false, false, 0);
else if (VA.getLocInfo() == CCValAssign::BCvt) {
// If this is a short vector argument loaded from the stack,
// extend from i64 to full vector size and then bitcast.
assert(VA.getLocVT() == MVT::i64);
assert(VA.getValVT().isVector());
- Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
- Value, DAG.getUNDEF(MVT::i64));
+ Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
} else
assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
@@ -832,7 +835,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
// Value is a value of type VA.getValVT() that we need to copy into
// the location described by VA. Return a copy of Value converted to
// VA.getValVT(). The caller is responsible for handling indirect values.
-static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
CCValAssign &VA, SDValue Value) {
switch (VA.getLocInfo()) {
case CCValAssign::SExt:
@@ -856,11 +859,10 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
}
}
-SDValue SystemZTargetLowering::
-LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue SystemZTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -868,6 +870,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
MF.getInfo<SystemZMachineFunctionInfo>();
auto *TFL =
static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Detect unsupported vector argument types.
if (Subtarget.hasVector())
@@ -930,19 +933,34 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
// Create the SelectionDAG nodes corresponding to a load
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getIntPtrConstant(4, DL));
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(MF, FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI));
}
// Convert the value of the argument register into the value that's
// being passed.
- InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
+ MachinePointerInfo()));
+ // If the original argument was split (e.g. i128), we need
+ // to load all parts of it here (using the same address).
+ unsigned ArgIndex = Ins[I].OrigArgIndex;
+ assert (Ins[I].PartOffset == 0);
+ while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
+ CCValAssign &PartVA = ArgLocs[I + 1];
+ unsigned PartOffset = Ins[I + 1].PartOffset;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
+ DAG.getIntPtrConstant(PartOffset, DL));
+ InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
+ MachinePointerInfo()));
+ ++I;
+ }
+ } else
+ InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
}
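
A hedged example of an argument type that takes the new indirect-with-parts path: i128 is not legal on SystemZ, so it is split into two i64 parts that are spilled to, and reloaded from, a single slot. The function below is illustrative only.

// Sketch only: both __int128 parameters are passed indirectly; the loop
// above reloads each i64 part from its PartOffset off the same address.
__int128 addParts(__int128 a, __int128 b) {
  return a + b;
}
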
if (IsVarArg) {
@@ -973,8 +991,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
- MachinePointerInfo::getFixedStack(MF, FI),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI));
}
// Join the stores, which are independent of one another.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
@@ -987,9 +1004,11 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
}
static bool canUseSiblingCall(const CCState &ArgCCInfo,
- SmallVectorImpl<CCValAssign> &ArgLocs) {
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ SmallVectorImpl<ISD::OutputArg> &Outs) {
// Punt if there are any indirect or stack arguments, or if the call
- // needs the call-saved argument register R6.
+ // needs the callee-saved argument register R6, or if the call uses
+ // the callee-saved register arguments SwiftSelf and SwiftError.
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
if (VA.getLocInfo() == CCValAssign::Indirect)
@@ -999,6 +1018,8 @@ static bool canUseSiblingCall(const CCState &ArgCCInfo,
unsigned Reg = VA.getLocReg();
if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
return false;
+ if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
+ return false;
}
return true;
}
@@ -1032,7 +1053,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// We don't support GuaranteedTailCallOpt, only automatically-detected
// sibling calls.
- if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs))
+ if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
IsTailCall = false;
// Get a count of how many bytes are to be pushed on the stack.
@@ -1054,11 +1075,25 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
- SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
- MemOpChains.push_back(DAG.getStore(
- Chain, DL, ArgValue, SpillSlot,
- MachinePointerInfo::getFixedStack(MF, FI), false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, ArgValue, SpillSlot,
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ // If the original argument was split (e.g. i128), we need
+ // to store all parts of it here (and pass just one address).
+ unsigned ArgIndex = Outs[I].OrigArgIndex;
+ assert (Outs[I].PartOffset == 0);
+ while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
+ SDValue PartValue = OutVals[I + 1];
+ unsigned PartOffset = Outs[I + 1].PartOffset;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
+ DAG.getIntPtrConstant(PartOffset, DL));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, PartValue, Address,
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ ++I;
+ }
ArgValue = SpillSlot;
} else
ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
@@ -1080,9 +1115,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getIntPtrConstant(Offset, DL));
// Emit the store.
- MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, Address,
- MachinePointerInfo(),
- false, false, 0));
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
}
}
@@ -1180,17 +1214,23 @@ CanLowerReturn(CallingConv::ID CallConv,
if (Subtarget.hasVector())
VerifyVectorTypes(Outs);
+ // Special case that we cannot easily detect in RetCC_SystemZ since
+ // i128 is not a legal type.
+ for (auto &Out : Outs)
+ if (Out.ArgVT == MVT::i128)
+ return false;
+
SmallVector<CCValAssign, 16> RetLocs;
CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
}
SDValue
-SystemZTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+ const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// Detect unsupported vector return types.
@@ -1235,8 +1275,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
}
-SDValue SystemZTargetLowering::
-prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL, SelectionDAG &DAG) const {
+SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad(
+ SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const {
return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
}
@@ -1399,6 +1439,11 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
CCValid = SystemZ::CCMASK_VCMP;
return true;
+ case Intrinsic::s390_tdc:
+ Opcode = SystemZISD::TDC;
+ CCValid = SystemZ::CCMASK_TDC;
+ return true;
+
default:
return false;
}
@@ -1538,7 +1583,7 @@ static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
// If C can be converted to a comparison against zero, adjust the operands
// as necessary.
-static void adjustZeroCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
if (C.ICmpType == SystemZICMP::UnsignedOnly)
return;
@@ -1558,7 +1603,8 @@ static void adjustZeroCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
// adjust the operands as necessary.
-static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
// For us to make any changes, it must be a comparison between a single-use
// load and a constant.
if (!C.Op0.hasOneUse() ||
@@ -1614,11 +1660,10 @@ static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
ISD::ZEXTLOAD);
if (C.Op0.getValueType() != MVT::i32 ||
Load->getExtensionType() != ExtType)
- C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
- Load->getChain(), Load->getBasePtr(),
- Load->getPointerInfo(), Load->getMemoryVT(),
- Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
+ C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
+ Load->getBasePtr(), Load->getPointerInfo(),
+ Load->getMemoryVT(), Load->getAlignment(),
+ Load->getMemOperand()->getFlags());
// Make sure that the second operand is an i32 with the right value.
if (C.Op1.getValueType() != MVT::i32 ||
@@ -1719,7 +1764,8 @@ static unsigned reverseCCMask(unsigned CCMask) {
// Check whether C tests for equality between X and Y and whether X - Y
// or Y - X is also computed. In that case it's better to compare the
// result of the subtraction against zero.
-static void adjustForSubtraction(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
C.CCMask == SystemZ::CCMASK_CMP_NE) {
for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
@@ -1784,7 +1830,8 @@ static void adjustForLTGFR(Comparison &C) {
// If C compares the truncation of an extending load, try to compare
// the untruncated value instead. This exposes more opportunities to
// reuse CC.
-static void adjustICmpTruncate(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
if (C.Op0.getOpcode() == ISD::TRUNCATE &&
C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
C.Op1.getOpcode() == ISD::Constant &&
@@ -1915,7 +1962,8 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
// See whether C can be implemented as a TEST UNDER MASK instruction.
// Update the arguments with the TM version if so.
-static void adjustForTestUnderMask(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
+ Comparison &C) {
// Check that we have a comparison with a constant.
auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
if (!ConstOp1)
@@ -2036,7 +2084,7 @@ static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
// Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
- ISD::CondCode Cond, SDLoc DL) {
+ ISD::CondCode Cond, const SDLoc &DL) {
if (CmpOp1.getOpcode() == ISD::Constant) {
uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
unsigned Opcode, CCValid;
@@ -2089,7 +2137,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
}
// Emit the comparison instruction described by C.
-static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
if (!C.Op1.getNode()) {
SDValue Op;
switch (C.Op0.getOpcode()) {
@@ -2119,9 +2167,9 @@ static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
// Implement a 32-bit *MUL_LOHI operation by extending both operands to
// 64 bits. Extend is the extension type to use. Store the high part
// in Hi and the low part in Lo.
-static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
- unsigned Extend, SDValue Op0, SDValue Op1,
- SDValue &Hi, SDValue &Lo) {
+static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
+ SDValue Op0, SDValue Op1, SDValue &Hi,
+ SDValue &Lo) {
Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
@@ -2136,10 +2184,9 @@ static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
// on the extended Op0 and (unextended) Op1. Store the even register result
// in Even and the odd register result in Odd.
-static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
- unsigned Extend, unsigned Opcode,
- SDValue Op0, SDValue Op1,
- SDValue &Even, SDValue &Odd) {
+static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ unsigned Extend, unsigned Opcode, SDValue Op0,
+ SDValue Op1, SDValue &Even, SDValue &Odd) {
SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0);
SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
SDValue(In128, 0), Op1);
@@ -2151,7 +2198,7 @@ static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
// Return an i32 value that is 1 if the CC value produced by Glue is
// in the mask CCMask and 0 otherwise. CC is known to have a value
// in CCValid, so other values can be ignored.
-static SDValue emitSETCC(SelectionDAG &DAG, SDLoc DL, SDValue Glue,
+static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue Glue,
unsigned CCValid, unsigned CCMask) {
IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
@@ -2220,7 +2267,7 @@ static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
// Return a v2f64 that contains the extended form of elements Start and Start+1
// of v4f32 value Op.
-static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL,
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
SDValue Op) {
int Mask[] = { Start, -1, Start + 1, -1 };
Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
@@ -2229,7 +2276,7 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL,
// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
// producing a result of type VT.
-static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL,
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL,
EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
// There is no hardware support for v4f32, so extend the vector into
// two v2f64s and compare those.
@@ -2247,7 +2294,7 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL,
// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
// an integer mask of type VT.
-static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
+static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
ISD::CondCode CC, SDValue CmpOp0,
SDValue CmpOp1) {
bool IsFP = CmpOp0.getValueType().isFloatingPoint();
@@ -2342,7 +2389,7 @@ static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
}
// Return the absolute or negative absolute of Op; IsNegative decides which.
-static SDValue getAbsolute(SelectionDAG &DAG, SDLoc DL, SDValue Op,
+static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
bool IsNegative) {
Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
if (IsNegative)
@@ -2414,11 +2461,10 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
const GlobalValue *GV = Node->getGlobal();
int64_t Offset = Node->getOffset();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- Reloc::Model RM = DAG.getTarget().getRelocationModel();
CodeModel::Model CM = DAG.getTarget().getCodeModel();
SDValue Result;
- if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
+ if (Subtarget.isPC32DBLSymbol(GV, CM)) {
// Assign anchors at 1<<12 byte boundaries.
uint64_t Anchor = Offset & ~uint64_t(0xfff);
Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
@@ -2435,8 +2481,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// If there was a non-zero offset that we didn't fold, create an explicit
@@ -2495,14 +2540,9 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
}
-SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
- SelectionDAG &DAG) const {
- if (DAG.getTarget().Options.EmulatedTLS)
- return LowerToTLSEmulatedModel(Node, DAG);
- SDLoc DL(Node);
- const GlobalValue *GV = Node->getGlobal();
+SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
+ SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
// The high part of the thread pointer is in access register 0.
SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
@@ -2517,7 +2557,19 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
// Merge them into a single 64-bit address.
SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
DAG.getConstant(32, DL, PtrVT));
- SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+ return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const {
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(Node, DAG);
+ SDLoc DL(Node);
+ const GlobalValue *GV = Node->getGlobal();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+
+ SDValue TP = lowerThreadPointer(DL, DAG);
// Get the offset of GA from the thread pointer, based on the TLS model.
SDValue Offset;
@@ -2530,8 +2582,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
// Call __tls_get_offset to retrieve the offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
@@ -2546,8 +2597,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
// Call __tls_get_offset to retrieve the module base offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
@@ -2565,8 +2615,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
DTPOffset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), DTPOffset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
break;
@@ -2577,9 +2626,9 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
SystemZII::MO_INDNTPOFF);
Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
- Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ Offset =
+ DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
break;
}
@@ -2591,8 +2640,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, 0);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
break;
}
}
@@ -2640,6 +2688,57 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
}
+SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // If the back chain frame index has not been allocated yet, do so.
+ SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
+ int BackChainIdx = FI->getFramePointerSaveIndex();
+ if (!BackChainIdx) {
+ // By definition, the frame address is the address of the back chain.
+ BackChainIdx = MFI->CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
+ FI->setFramePointerSaveIndex(BackChainIdx);
+ }
+ SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
+
+  // FIXME: The frontend should detect this case.
+ if (Depth > 0) {
+ report_fatal_error("Unsupported stack frame traversal count");
+ }
+
+ return BackChain;
+}
+
+SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // FIXME: The frontend should detect this case.
+ if (Depth > 0) {
+ report_fatal_error("Unsupported stack frame traversal count");
+ }
+
+ // Return R14D, which has the return address. Mark it an implicit live-in.
+ unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
+}
+
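The two lowerings just added only support a traversal depth of zero; any other depth falls into the report_fatal_error path above. A minimal source-level sketch of what they serve, assuming the usual Clang builtins that lower to ISD::FRAMEADDR and ISD::RETURNADDR:

  // Depth 0 only: the frame address resolves to the back-chain slot,
  // the return address to the copy of R14D marked live-in above.
  void *frame_addr()  { return __builtin_frame_address(0); }
  void *return_addr() { return __builtin_return_address(0); }
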
SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -2715,8 +2814,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
DAG.getIntPtrConstant(Offset, DL));
MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
- MachinePointerInfo(SV, Offset),
- false, false, 0);
+ MachinePointerInfo(SV, Offset));
Offset += 8;
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
@@ -2740,8 +2838,9 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
SDValue SystemZTargetLowering::
lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
- bool RealignOpt = !DAG.getMachineFunction().getFunction()->
- hasFnAttribute("no-realign-stack");
+ MachineFunction &MF = DAG.getMachineFunction();
+  bool RealignOpt = !MF.getFunction()->hasFnAttribute("no-realign-stack");
+ bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
@@ -2763,10 +2862,15 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
// Get a reference to the stack pointer.
SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+ // If we need a backchain, save it now.
+ SDValue Backchain;
+ if (StoreBackchain)
+ Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+
// Add extra space for alignment if needed.
if (ExtraAlignSpace)
NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
- DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+ DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
// Get the new stack pointer value.
SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
@@ -2790,10 +2894,20 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
}
+ if (StoreBackchain)
+ Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+
SDValue Ops[2] = { Result, Chain };
return DAG.getMergeValues(Ops, DL);
}
+SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+}
+
SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -3031,6 +3145,27 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
return Op;
}
+SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+ FenceScope == CrossThread) {
+ return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
+ Op.getOperand(0)),
+ 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
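A sketch of the distinction lowerATOMIC_FENCE draws, assuming the usual mapping of C++ fences onto ISD::ATOMIC_FENCE: only a sequentially-consistent cross-thread fence needs the serialization instruction; everything else becomes the MEMBARRIER compiler barrier and emits no code.

  #include <atomic>

  void fences() {
    std::atomic_thread_fence(std::memory_order_seq_cst); // SystemZ::Serialize (bcr)
    std::atomic_thread_fence(std::memory_order_acquire); // compiler barrier, no code
    std::atomic_signal_fence(std::memory_order_seq_cst); // single-thread, no code
  }
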
// Op is an atomic load. Lower it into a normal volatile load.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
SelectionDAG &DAG) const {
@@ -3220,8 +3355,24 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
- return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op),
- SystemZ::R15D, Op.getOperand(1));
+ bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue NewSP = Op.getOperand(1);
+ SDValue Backchain;
+ SDLoc DL(Op);
+
+ if (StoreBackchain) {
+ SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
+ Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
+
+ if (StoreBackchain)
+ Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+
+ return Chain;
}
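lowerSTACKRESTORE and lowerDYNAMIC_STACKALLOC now share the same "backchain" handling: read the old back-chain word from the current stack pointer before it changes, then store it at the new stack pointer afterwards. A condensed sketch of that pattern, using preserveBackchain as a hypothetical helper name (in the real code the store is chained after the stack-pointer update):

  // Hypothetical helper mirroring the load/store pair used above; the
  // SelectionDAG headers already included by this file are assumed in scope.
  static SDValue preserveBackchain(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue OldSP, SDValue NewSP) {
    SDValue Backchain =
        DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
    return DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
  }
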
SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
@@ -3286,6 +3437,9 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (Id) {
+ case Intrinsic::thread_pointer:
+ return lowerThreadPointer(SDLoc(Op), DAG);
+
case Intrinsic::s390_vpdi:
return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
@@ -3553,7 +3707,7 @@ static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
// Create a node that performs P on operands Op0 and Op1, casting the
// operands to the appropriate type. The type of the result is determined by P.
-static SDValue getPermuteNode(SelectionDAG &DAG, SDLoc DL,
+static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
const Permute &P, SDValue Op0, SDValue Op1) {
// VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input
// elements of a PACK are twice as wide as the outputs.
@@ -3582,7 +3736,8 @@ static SDValue getPermuteNode(SelectionDAG &DAG, SDLoc DL,
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
// VSLDI or VPERM.
-static SDValue getGeneralPermuteNode(SelectionDAG &DAG, SDLoc DL, SDValue *Ops,
+static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue *Ops,
const SmallVectorImpl<int> &Bytes) {
for (unsigned I = 0; I < 2; ++I)
Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
@@ -3600,7 +3755,7 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, SDLoc DL, SDValue *Ops,
IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
else
IndexNodes[I] = DAG.getUNDEF(MVT::i32);
- SDValue Op2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, IndexNodes);
+ SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
}
@@ -3610,7 +3765,7 @@ struct GeneralShuffle {
GeneralShuffle(EVT vt) : VT(vt) {}
void addUndef();
void add(SDValue, unsigned);
- SDValue getNode(SelectionDAG &, SDLoc);
+ SDValue getNode(SelectionDAG &, const SDLoc &);
// The operands of the shuffle.
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
@@ -3667,7 +3822,7 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
}
Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
Byte = unsigned(NewByte) % SystemZ::VectorBytes;
- } else if (Op.getOpcode() == ISD::UNDEF) {
+ } else if (Op.isUndef()) {
addUndef();
return;
} else
@@ -3689,7 +3844,7 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
}
// Return SDNodes for the completed shuffle.
-SDValue GeneralShuffle::getNode(SelectionDAG &DAG, SDLoc DL) {
+SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
if (Ops.size() == 0)
@@ -3770,37 +3925,37 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, SDLoc DL) {
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
static bool isScalarToVector(SDValue Op) {
for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
- if (Op.getOperand(I).getOpcode() != ISD::UNDEF)
+ if (!Op.getOperand(I).isUndef())
return false;
return true;
}
// Return a vector of type VT that contains Value in the first element.
// The other elements don't matter.
-static SDValue buildScalarToVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
+static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SDValue Value) {
// If we have a constant, replicate it to all elements and let the
// BUILD_VECTOR lowering take care of it.
if (Value.getOpcode() == ISD::Constant ||
Value.getOpcode() == ISD::ConstantFP) {
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+ return DAG.getBuildVector(VT, DL, Ops);
}
- if (Value.getOpcode() == ISD::UNDEF)
+ if (Value.isUndef())
return DAG.getUNDEF(VT);
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
}
// Return a vector of type VT in which Op0 is in element 0 and Op1 is in
// element 1. Used for cases in which replication is cheap.
-static SDValue buildMergeScalars(SelectionDAG &DAG, SDLoc DL, EVT VT,
+static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SDValue Op0, SDValue Op1) {
- if (Op0.getOpcode() == ISD::UNDEF) {
- if (Op1.getOpcode() == ISD::UNDEF)
+ if (Op0.isUndef()) {
+ if (Op1.isUndef())
return DAG.getUNDEF(VT);
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
}
- if (Op1.getOpcode() == ISD::UNDEF)
+ if (Op1.isUndef())
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);
return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
buildScalarToVector(DAG, DL, VT, Op0),
@@ -3809,15 +3964,15 @@ static SDValue buildMergeScalars(SelectionDAG &DAG, SDLoc DL, EVT VT,
// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
// vector for them.
-static SDValue joinDwords(SelectionDAG &DAG, SDLoc DL, SDValue Op0,
+static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
SDValue Op1) {
- if (Op0.getOpcode() == ISD::UNDEF && Op1.getOpcode() == ISD::UNDEF)
+ if (Op0.isUndef() && Op1.isUndef())
return DAG.getUNDEF(MVT::v2i64);
// If one of the two inputs is undefined then replicate the other one,
// in order to avoid using another register unnecessarily.
- if (Op0.getOpcode() == ISD::UNDEF)
+ if (Op0.isUndef())
Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
- else if (Op1.getOpcode() == ISD::UNDEF)
+ else if (Op1.isUndef())
Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
else {
Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
@@ -3834,7 +3989,7 @@ static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
unsigned BytesPerElement = ElemVT.getStoreSize();
for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
SDValue Op = BVN->getOperand(I);
- if (Op.getOpcode() != ISD::UNDEF) {
+ if (!Op.isUndef()) {
uint64_t Value;
if (Op.getOpcode() == ISD::Constant)
Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue();
@@ -3862,7 +4017,7 @@ static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
// an empty value.
static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
const SystemZInstrInfo *TII,
- SDLoc DL, EVT VT, uint64_t Value,
+ const SDLoc &DL, EVT VT, uint64_t Value,
unsigned BitsPerElement) {
// Signed 16-bit values can be replicated using VREPI.
int64_t SignedValue = SignExtend64(Value, BitsPerElement);
@@ -3919,7 +4074,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
GS.add(Op.getOperand(0), Elem);
FoundOne = true;
- } else if (Op.getOpcode() == ISD::UNDEF) {
+ } else if (Op.isUndef()) {
GS.addUndef();
} else {
GS.add(SDValue(), ResidueOps.size());
@@ -3937,7 +4092,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
for (auto &Op : GS.Ops) {
if (!Op.getNode()) {
- Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps);
+ Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
break;
}
}
@@ -3946,14 +4101,14 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
}
// Combine GPR scalar values Elems into a vector of type VT.
-static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
+static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SmallVectorImpl<SDValue> &Elems) {
// See whether there is a single replicated value.
SDValue Single;
unsigned int NumElements = Elems.size();
unsigned int Count = 0;
for (auto Elem : Elems) {
- if (Elem.getOpcode() != ISD::UNDEF) {
+ if (!Elem.isUndef()) {
if (!Single.getNode())
Single = Elem;
else if (Elem != Single) {
@@ -3998,9 +4153,9 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
// Avoid unnecessary undefs by reusing the other operand.
- if (Op01.getOpcode() == ISD::UNDEF)
+ if (Op01.isUndef())
Op01 = Op23;
- else if (Op23.getOpcode() == ISD::UNDEF)
+ else if (Op23.isUndef())
Op23 = Op01;
// Merging identical replications is a no-op.
if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
@@ -4034,7 +4189,7 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
for (unsigned I = 0; I < NumElements; ++I)
if (!Constants[I].getNode())
Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
- Result = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Constants);
+ Result = DAG.getBuildVector(VT, DL, Constants);
} else {
// Otherwise try to use VLVGP to start the sequence in order to
// avoid a false dependency on any previous contents of the vector
@@ -4042,8 +4197,8 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
// is defined.
unsigned I1 = NumElements / 2 - 1;
unsigned I2 = NumElements - 1;
- bool Def1 = (Elems[I1].getOpcode() != ISD::UNDEF);
- bool Def2 = (Elems[I2].getOpcode() != ISD::UNDEF);
+ bool Def1 = !Elems[I1].isUndef();
+ bool Def2 = !Elems[I2].isUndef();
if (Def1 || Def2) {
SDValue Elem1 = Elems[Def1 ? I1 : I2];
SDValue Elem2 = Elems[Def2 ? I2 : I1];
@@ -4057,7 +4212,7 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
// Use VLVGx to insert the other elements.
for (unsigned I = 0; I < NumElements; ++I)
- if (!Done[I] && Elems[I].getOpcode() != ISD::UNDEF)
+ if (!Done[I] && !Elems[I].isUndef())
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
DAG.getConstant(I, DL, MVT::i32));
return Result;
@@ -4120,8 +4275,7 @@ SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
}
// See if we should use shuffles to construct the vector from other vectors.
- SDValue Res = tryBuildVectorShuffle(DAG, BVN);
- if (Res.getNode())
+ if (SDValue Res = tryBuildVectorShuffle(DAG, BVN))
return Res;
// Detect SCALAR_TO_VECTOR conversions.
@@ -4312,6 +4466,10 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
+ case ISD::FRAMEADDR:
+ return lowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return lowerRETURNADDR(Op, DAG);
case ISD::BR_CC:
return lowerBR_CC(Op, DAG);
case ISD::SELECT_CC:
@@ -4336,6 +4494,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerVACOPY(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
case ISD::SMUL_LOHI:
return lowerSMUL_LOHI(Op, DAG);
case ISD::UMUL_LOHI:
@@ -4348,12 +4508,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerOR(Op, DAG);
case ISD::CTPOP:
return lowerCTPOP(Op, DAG);
- case ISD::CTLZ_ZERO_UNDEF:
- return DAG.getNode(ISD::CTLZ, SDLoc(Op),
- Op.getValueType(), Op.getOperand(0));
- case ISD::CTTZ_ZERO_UNDEF:
- return DAG.getNode(ISD::CTTZ, SDLoc(Op),
- Op.getValueType(), Op.getOperand(0));
+ case ISD::ATOMIC_FENCE:
+ return lowerATOMIC_FENCE(Op, DAG);
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
@@ -4457,6 +4613,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(SEARCH_STRING);
OPCODE(IPM);
OPCODE(SERIALIZE);
+ OPCODE(MEMBARRIER);
OPCODE(TBEGIN);
OPCODE(TBEGIN_NOFLOAT);
OPCODE(TEND);
@@ -4506,6 +4663,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(VISTR_CC);
OPCODE(VSTRC_CC);
OPCODE(VSTRCZ_CC);
+ OPCODE(TDC);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
@@ -4518,6 +4676,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(ATOMIC_LOADW_UMIN);
OPCODE(ATOMIC_LOADW_UMAX);
OPCODE(ATOMIC_CMP_SWAPW);
+ OPCODE(LRV);
+ OPCODE(STRV);
OPCODE(PREFETCH);
}
return nullptr;
@@ -4535,8 +4695,9 @@ static bool canTreatAsByteVector(EVT VT) {
// of the input vector and Index is the index (based on type VecVT) that
// should be extracted. Return the new extraction if a simplification
// was possible or if Force is true.
-SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT,
- SDValue Op, unsigned Index,
+SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
+ EVT VecVT, SDValue Op,
+ unsigned Index,
DAGCombinerInfo &DCI,
bool Force) const {
SelectionDAG &DAG = DCI.DAG;
@@ -4639,9 +4800,8 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT,
// Optimize vector operations in scalar value Op on the basis that Op
// is truncated to TruncVT.
-SDValue
-SystemZTargetLowering::combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
- DAGCombinerInfo &DCI) const {
+SDValue SystemZTargetLowering::combineTruncateExtract(
+ const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
// If we have (trunc (extract_vector_elt X, Y)), try to turn it into
// (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
// of type TruncVT.
@@ -4675,145 +4835,295 @@ SystemZTargetLowering::combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
return SDValue();
}
-SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue SystemZTargetLowering::combineSIGN_EXTEND(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Convert (sext (ashr (shl X, C1), C2)) to
+ // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
+ // cheap as narrower ones.
SelectionDAG &DAG = DCI.DAG;
- unsigned Opcode = N->getOpcode();
- if (Opcode == ISD::SIGN_EXTEND) {
- // Convert (sext (ashr (shl X, C1), C2)) to
- // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
- // cheap as narrower ones.
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
- if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
- auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- SDValue Inner = N0.getOperand(0);
- if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
- if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
- unsigned Extra = (VT.getSizeInBits() -
- N0.getValueType().getSizeInBits());
- unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
- unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
- EVT ShiftVT = N0.getOperand(1).getValueType();
- SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
- Inner.getOperand(0));
- SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
- DAG.getConstant(NewShlAmt, SDLoc(Inner),
- ShiftVT));
- return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
- DAG.getConstant(NewSraAmt, SDLoc(N0), ShiftVT));
- }
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
+ auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ SDValue Inner = N0.getOperand(0);
+ if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
+ if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
+ unsigned Extra = (VT.getSizeInBits() -
+ N0.getValueType().getSizeInBits());
+ unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
+ unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
+ EVT ShiftVT = N0.getOperand(1).getValueType();
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
+ Inner.getOperand(0));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
+ DAG.getConstant(NewShlAmt, SDLoc(Inner),
+ ShiftVT));
+ return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
+ DAG.getConstant(NewSraAmt, SDLoc(N0), ShiftVT));
}
}
}
- if (Opcode == SystemZISD::MERGE_HIGH ||
- Opcode == SystemZISD::MERGE_LOW) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() == ISD::BITCAST)
- Op0 = Op0.getOperand(0);
- if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
- cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
- // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
- // for v4f32.
- if (Op1 == N->getOperand(0))
- return Op1;
- // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
- EVT VT = Op1.getValueType();
- unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
- if (ElemBytes <= 4) {
- Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
- SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
- EVT InVT = VT.changeVectorElementTypeToInteger();
- EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
- SystemZ::VectorBytes / ElemBytes / 2);
- if (VT != InVT) {
- Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
- DCI.AddToWorklist(Op1.getNode());
- }
- SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
- DCI.AddToWorklist(Op.getNode());
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineMERGE(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Opcode = N->getOpcode();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() == ISD::BITCAST)
+ Op0 = Op0.getOperand(0);
+ if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+ cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
+ // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
+ // for v4f32.
+ if (Op1 == N->getOperand(0))
+ return Op1;
+ // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
+ EVT VT = Op1.getValueType();
+ unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
+ if (ElemBytes <= 4) {
+ Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
+ SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
+ EVT InVT = VT.changeVectorElementTypeToInteger();
+ EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
+ SystemZ::VectorBytes / ElemBytes / 2);
+ if (VT != InVT) {
+ Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
}
+ SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
+ DCI.AddToWorklist(Op.getNode());
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
}
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSTORE(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ auto *SN = cast<StoreSDNode>(N);
+ auto &Op1 = N->getOperand(1);
+ EVT MemVT = SN->getMemoryVT();
// If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
// for the extraction to be done on a vMiN value, so that we can use VSTE.
// If X has wider elements then convert it to:
// (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
- if (Opcode == ISD::STORE) {
- auto *SN = cast<StoreSDNode>(N);
- EVT MemVT = SN->getMemoryVT();
- if (MemVT.isInteger()) {
- SDValue Value = combineTruncateExtract(SDLoc(N), MemVT,
- SN->getValue(), DCI);
- if (Value.getNode()) {
- DCI.AddToWorklist(Value.getNode());
-
- // Rewrite the store with the new form of stored value.
- return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
- SN->getBasePtr(), SN->getMemoryVT(),
- SN->getMemOperand());
- }
+ if (MemVT.isInteger()) {
+ if (SDValue Value =
+ combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
+ DCI.AddToWorklist(Value.getNode());
+
+ // Rewrite the store with the new form of stored value.
+ return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
+ SN->getBasePtr(), SN->getMemoryVT(),
+ SN->getMemOperand());
}
}
- // Try to simplify a vector extraction.
- if (Opcode == ISD::EXTRACT_VECTOR_ELT) {
- if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- SDValue Op0 = N->getOperand(0);
- EVT VecVT = Op0.getValueType();
- return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
- IndexN->getZExtValue(), DCI, false);
+ // Combine STORE (BSWAP) into STRVH/STRV/STRVG
+ // See comment in combineBSWAP about volatile accesses.
+ if (!SN->isVolatile() &&
+ Op1.getOpcode() == ISD::BSWAP &&
+ Op1.getNode()->hasOneUse() &&
+ (Op1.getValueType() == MVT::i16 ||
+ Op1.getValueType() == MVT::i32 ||
+ Op1.getValueType() == MVT::i64)) {
+
+ SDValue BSwapOp = Op1.getOperand(0);
+
+ if (BSwapOp.getValueType() == MVT::i16)
+ BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
+
+ SDValue Ops[] = {
+ N->getOperand(0), BSwapOp, N->getOperand(2),
+ DAG.getValueType(Op1.getValueType())
+ };
+
+    return DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N),
+                                   DAG.getVTList(MVT::Other), Ops, MemVT,
+                                   SN->getMemOperand());
}
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ // Try to simplify a vector extraction.
+ if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Op0 = N->getOperand(0);
+ EVT VecVT = Op0.getValueType();
+ return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
+ IndexN->getZExtValue(), DCI, false);
}
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineJOIN_DWORDS(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
// (join_dwords X, X) == (replicate X)
- if (Opcode == SystemZISD::JOIN_DWORDS &&
- N->getOperand(0) == N->getOperand(1))
+ if (N->getOperand(0) == N->getOperand(1))
return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
N->getOperand(0));
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineFP_ROUND(
+ SDNode *N, DAGCombinerInfo &DCI) const {
// (fround (extract_vector_elt X 0))
// (fround (extract_vector_elt X 1)) ->
// (extract_vector_elt (VROUND X) 0)
// (extract_vector_elt (VROUND X) 1)
//
// This is a special case since the target doesn't really support v2f32s.
- if (Opcode == ISD::FP_ROUND) {
- SDValue Op0 = N->getOperand(0);
- if (N->getValueType(0) == MVT::f32 &&
- Op0.hasOneUse() &&
- Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- Op0.getOperand(0).getValueType() == MVT::v2f64 &&
- Op0.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
- SDValue Vec = Op0.getOperand(0);
- for (auto *U : Vec->uses()) {
- if (U != Op0.getNode() &&
- U->hasOneUse() &&
- U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- U->getOperand(0) == Vec &&
- U->getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
- SDValue OtherRound = SDValue(*U->use_begin(), 0);
- if (OtherRound.getOpcode() == ISD::FP_ROUND &&
- OtherRound.getOperand(0) == SDValue(U, 0) &&
- OtherRound.getValueType() == MVT::f32) {
- SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
- MVT::v4f32, Vec);
- DCI.AddToWorklist(VRound.getNode());
- SDValue Extract1 =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
- VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
- DCI.AddToWorklist(Extract1.getNode());
- DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
- SDValue Extract0 =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
- VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
- return Extract0;
- }
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op0 = N->getOperand(0);
+ if (N->getValueType(0) == MVT::f32 &&
+ Op0.hasOneUse() &&
+ Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+ Op0.getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+ SDValue Vec = Op0.getOperand(0);
+ for (auto *U : Vec->uses()) {
+ if (U != Op0.getNode() &&
+ U->hasOneUse() &&
+ U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ U->getOperand(0) == Vec &&
+ U->getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+ SDValue OtherRound = SDValue(*U->use_begin(), 0);
+ if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+ OtherRound.getOperand(0) == SDValue(U, 0) &&
+ OtherRound.getValueType() == MVT::f32) {
+ SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+ MVT::v4f32, Vec);
+ DCI.AddToWorklist(VRound.getNode());
+ SDValue Extract1 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+ VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+ DCI.AddToWorklist(Extract1.getNode());
+ DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+ SDValue Extract0 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+ VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+ return Extract0;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineBSWAP(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ // Combine BSWAP (LOAD) into LRVH/LRV/LRVG
+  // These loads are allowed to access memory multiple times, and so we must
+  // check that the loads are not volatile before performing the combine.
+ if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+ N->getOperand(0).hasOneUse() &&
+ (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
+ N->getValueType(0) == MVT::i64) &&
+ !cast<LoadSDNode>(N->getOperand(0))->isVolatile()) {
+ SDValue Load = N->getOperand(0);
+ LoadSDNode *LD = cast<LoadSDNode>(Load);
+
+ // Create the byte-swapping load.
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr(), // Ptr
+ DAG.getValueType(N->getValueType(0)) // VT
+ };
+ SDValue BSLoad =
+ DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
+ DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+ MVT::i64 : MVT::i32, MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+ // If this is an i16 load, insert the truncate.
+ SDValue ResVal = BSLoad;
+ if (N->getValueType(0) == MVT::i16)
+ ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
+
+ // First, combine the bswap away. This makes the value produced by the
+ // load dead.
+ DCI.CombineTo(N, ResVal);
+
+ // Next, combine the load away, we give it a bogus result value but a real
+ // chain result. The result value is dead because the bswap is dead.
+ DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+ // Return N so it doesn't get rechecked!
+ return SDValue(N, 0);
+ }
+ return SDValue();
+}
+
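This combine and the matching one in combineSTORE are what map byte-reversed accesses onto the byte-swapping load and store instructions. A source-level sketch, assuming __builtin_bswap32 lowers to ISD::BSWAP as usual:

  #include <cstdint>

  // bswap(load) folds into LRV; store(bswap) folds into STRV, provided the
  // access is non-volatile and the swap has a single use.
  uint32_t load_swapped(const uint32_t *p)    { return __builtin_bswap32(*p); }
  void store_swapped(uint32_t *p, uint32_t v) { *p = __builtin_bswap32(v); }
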
+SDValue SystemZTargetLowering::combineSHIFTROT(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+
+ SelectionDAG &DAG = DCI.DAG;
+
+  // Shift/rotate instructions only use the bottom 6 bits of the second operand
+  // register. If the second operand is the result of an AND with an immediate
+  // value whose bottom 6 bits are set, we can safely remove the AND operation.
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() == ISD::AND) {
+ auto *AndMask = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+
+ // The AND mask is constant
+ if (AndMask) {
+ auto AmtVal = AndMask->getZExtValue();
+
+ // Bottom 6 bits are set
+ if ((AmtVal & 0x3f) == 0x3f) {
+ SDValue AndOp = N1->getOperand(0);
+
+ // This is the only use, so remove the node
+ if (N1.hasOneUse()) {
+ // Combine the AND away
+ DCI.CombineTo(N1.getNode(), AndOp);
+
+ // Return N so it isn't rechecked
+ return SDValue(N, 0);
+
+        } else {
+          // The node will be reused, so create a new node for this one use
+ N->getValueType(0), N->getOperand(0),
+ AndOp);
+ DCI.AddToWorklist(Replace.getNode());
+
+ return Replace;
}
}
}
}
+
+ return SDValue();
+}
+
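At the source level, combineSHIFTROT removes a mask the instruction set already makes redundant: only the bottom 6 bits of the shift or rotate amount are used, so an explicit AND with 63 on the amount is dead. A small sketch:

  #include <cstdint>

  // The AND with 63 below is removed by the combine; SystemZ shifts ignore
  // everything but the bottom 6 bits of the amount register anyway.
  uint64_t shl_mod64(uint64_t x, unsigned n) { return x << (n & 63); }
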
+SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch(N->getOpcode()) {
+ default: break;
+ case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
+ case SystemZISD::MERGE_HIGH:
+ case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
+ case ISD::STORE: return combineSTORE(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
+ case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
+ case ISD::FP_ROUND: return combineFP_ROUND(N, DCI);
+ case ISD::BSWAP: return combineBSWAP(N, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL: return combineSHIFTROT(N, DCI);
+ }
+
return SDValue();
}
@@ -4831,7 +5141,7 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
// Split MBB after MI and return the new block (the one that contains
// instructions after MI).
-static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
+static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB) {
MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
NewMBB->splice(NewMBB->begin(), MBB,
@@ -4841,7 +5151,7 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
}
// Split MBB before MI and return the new block (the one that contains MI).
-static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB) {
MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
@@ -4850,34 +5160,36 @@ static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
}
// Force base value Base into a register before MI. Return the register.
-static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
const SystemZInstrInfo *TII) {
if (Base.isReg())
return Base.getReg();
- MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock *MBB = MI.getParent();
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
- .addOperand(Base).addImm(0).addReg(0);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
+ .addOperand(Base)
+ .addImm(0)
+ .addReg(0);
return Reg;
}
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
MachineBasicBlock *
-SystemZTargetLowering::emitSelect(MachineInstr *MI,
+SystemZTargetLowering::emitSelect(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned TrueReg = MI->getOperand(1).getReg();
- unsigned FalseReg = MI->getOperand(2).getReg();
- unsigned CCValid = MI->getOperand(3).getImm();
- unsigned CCMask = MI->getOperand(4).getImm();
- DebugLoc DL = MI->getDebugLoc();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned TrueReg = MI.getOperand(1).getReg();
+ unsigned FalseReg = MI.getOperand(2).getReg();
+ unsigned CCValid = MI.getOperand(3).getImm();
+ unsigned CCMask = MI.getOperand(4).getImm();
+ DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
@@ -4905,7 +5217,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
.addReg(TrueReg).addMBB(StartMBB)
.addReg(FalseReg).addMBB(FalseMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return JoinMBB;
}
@@ -4913,21 +5225,21 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
// StoreOpcode is the store to use and Invert says whether the store should
// happen when the condition is false rather than true. If a STORE ON
// CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
-MachineBasicBlock *
-SystemZTargetLowering::emitCondStore(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned StoreOpcode, unsigned STOCOpcode,
- bool Invert) const {
+MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned StoreOpcode,
+ unsigned STOCOpcode,
+ bool Invert) const {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- unsigned SrcReg = MI->getOperand(0).getReg();
- MachineOperand Base = MI->getOperand(1);
- int64_t Disp = MI->getOperand(2).getImm();
- unsigned IndexReg = MI->getOperand(3).getReg();
- unsigned CCValid = MI->getOperand(4).getImm();
- unsigned CCMask = MI->getOperand(5).getImm();
- DebugLoc DL = MI->getDebugLoc();
+ unsigned SrcReg = MI.getOperand(0).getReg();
+ MachineOperand Base = MI.getOperand(1);
+ int64_t Disp = MI.getOperand(2).getImm();
+ unsigned IndexReg = MI.getOperand(3).getReg();
+ unsigned CCValid = MI.getOperand(4).getImm();
+ unsigned CCMask = MI.getOperand(5).getImm();
+ DebugLoc DL = MI.getDebugLoc();
StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
@@ -4940,7 +5252,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
.addReg(SrcReg).addOperand(Base).addImm(Disp)
.addImm(CCValid).addImm(CCMask);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
@@ -4969,7 +5281,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
.addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
MBB->addSuccessor(JoinMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return JoinMBB;
}
@@ -4980,12 +5292,9 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
// is one of the operands. Invert says whether the field should be
// inverted after performing BinOpcode (e.g. for NAND).
-MachineBasicBlock *
-SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned BinOpcode,
- unsigned BitSize,
- bool Invert) const {
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
+ unsigned BitSize, bool Invert) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -4994,15 +5303,15 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
// Extract the operands. Base can be a register or a frame index.
// Src2 can be a register or immediate.
- unsigned Dest = MI->getOperand(0).getReg();
- MachineOperand Base = earlyUseOperand(MI->getOperand(1));
- int64_t Disp = MI->getOperand(2).getImm();
- MachineOperand Src2 = earlyUseOperand(MI->getOperand(3));
- unsigned BitShift = (IsSubWord ? MI->getOperand(4).getReg() : 0);
- unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
- DebugLoc DL = MI->getDebugLoc();
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
+ unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
- BitSize = MI->getOperand(6).getImm();
+ BitSize = MI.getOperand(6).getImm();
// Subword operations use 32-bit registers.
const TargetRegisterClass *RC = (BitSize <= 32 ?
@@ -5090,7 +5399,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return DoneMBB;
}
@@ -5100,12 +5409,9 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
// minimum or maximum value. KeepOldMask is the BRC condition-code mask
// for when the current field should be kept. BitSize is the width of
// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
-MachineBasicBlock *
-SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned CompareOpcode,
- unsigned KeepOldMask,
- unsigned BitSize) const {
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
+ unsigned KeepOldMask, unsigned BitSize) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -5113,15 +5419,15 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
bool IsSubWord = (BitSize < 32);
// Extract the operands. Base can be a register or a frame index.
- unsigned Dest = MI->getOperand(0).getReg();
- MachineOperand Base = earlyUseOperand(MI->getOperand(1));
- int64_t Disp = MI->getOperand(2).getImm();
- unsigned Src2 = MI->getOperand(3).getReg();
- unsigned BitShift = (IsSubWord ? MI->getOperand(4).getReg() : 0);
- unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
- DebugLoc DL = MI->getDebugLoc();
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ unsigned Src2 = MI.getOperand(3).getReg();
+ unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
- BitSize = MI->getOperand(6).getImm();
+ BitSize = MI.getOperand(6).getImm();
// Subword operations use 32-bit registers.
const TargetRegisterClass *RC = (BitSize <= 32 ?
@@ -5209,30 +5515,31 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return DoneMBB;
}
// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
// instruction MI.
MachineBasicBlock *
-SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
+SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *MBB) const {
+
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
// Extract the operands. Base can be a register or a frame index.
- unsigned Dest = MI->getOperand(0).getReg();
- MachineOperand Base = earlyUseOperand(MI->getOperand(1));
- int64_t Disp = MI->getOperand(2).getImm();
- unsigned OrigCmpVal = MI->getOperand(3).getReg();
- unsigned OrigSwapVal = MI->getOperand(4).getReg();
- unsigned BitShift = MI->getOperand(5).getReg();
- unsigned NegBitShift = MI->getOperand(6).getReg();
- int64_t BitSize = MI->getOperand(7).getImm();
- DebugLoc DL = MI->getDebugLoc();
+ unsigned Dest = MI.getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+ int64_t Disp = MI.getOperand(2).getImm();
+ unsigned OrigCmpVal = MI.getOperand(3).getReg();
+ unsigned OrigSwapVal = MI.getOperand(4).getReg();
+ unsigned BitShift = MI.getOperand(5).getReg();
+ unsigned NegBitShift = MI.getOperand(6).getReg();
+ int64_t BitSize = MI.getOperand(7).getImm();
+ DebugLoc DL = MI.getDebugLoc();
const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
@@ -5323,7 +5630,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return DoneMBB;
}
@@ -5331,18 +5638,18 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
// if the high register of the GR128 value must be cleared or false if
// it's "don't care". SubReg is subreg_l32 when extending a GR32
// and subreg_l64 when extending a GR64.
-MachineBasicBlock *
-SystemZTargetLowering::emitExt128(MachineInstr *MI,
- MachineBasicBlock *MBB,
- bool ClearEven, unsigned SubReg) const {
+MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ bool ClearEven,
+ unsigned SubReg) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
@@ -5359,25 +5666,23 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
.addReg(In128).addReg(Src).addImm(SubReg);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
-MachineBasicBlock *
-SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned Opcode) const {
+MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
- uint64_t DestDisp = MI->getOperand(1).getImm();
- MachineOperand SrcBase = earlyUseOperand(MI->getOperand(2));
- uint64_t SrcDisp = MI->getOperand(3).getImm();
- uint64_t Length = MI->getOperand(4).getImm();
+ MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
+ uint64_t DestDisp = MI.getOperand(1).getImm();
+ MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
+ uint64_t SrcDisp = MI.getOperand(3).getImm();
+ uint64_t Length = MI.getOperand(4).getImm();
// When generating more than one CLC, all but the last will need to
// branch to the end when a difference is found.
@@ -5385,10 +5690,10 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
splitBlockAfter(MI, MBB) : nullptr);
// Check for the loop form, in which operand 5 is the trip count.
- if (MI->getNumExplicitOperands() > 5) {
+ if (MI.getNumExplicitOperands() > 5) {
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
- uint64_t StartCountReg = MI->getOperand(5).getReg();
+ uint64_t StartCountReg = MI.getOperand(5).getReg();
uint64_t StartSrcReg = forceReg(MI, SrcBase, TII);
uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg :
forceReg(MI, DestBase, TII));
@@ -5491,15 +5796,19 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
// Apply them using LAY if so.
if (!isUInt<12>(DestDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(DestBase).addImm(DestDisp).addReg(0);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(DestBase)
+ .addImm(DestDisp)
+ .addReg(0);
DestBase = MachineOperand::CreateReg(Reg, false);
DestDisp = 0;
}
if (!isUInt<12>(SrcDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(SrcBase)
+ .addImm(SrcDisp)
+ .addReg(0);
SrcBase = MachineOperand::CreateReg(Reg, false);
SrcDisp = 0;
}
@@ -5527,26 +5836,24 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
MBB->addLiveIn(SystemZ::CC);
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
// Decompose string pseudo-instruction MI into a loop that continually performs
// Opcode until CC != 3.
-MachineBasicBlock *
-SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned Opcode) const {
+MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- uint64_t End1Reg = MI->getOperand(0).getReg();
- uint64_t Start1Reg = MI->getOperand(1).getReg();
- uint64_t Start2Reg = MI->getOperand(2).getReg();
- uint64_t CharReg = MI->getOperand(3).getReg();
+ uint64_t End1Reg = MI.getOperand(0).getReg();
+ uint64_t Start1Reg = MI.getOperand(1).getReg();
+ uint64_t Start2Reg = MI.getOperand(2).getReg();
+ uint64_t CharReg = MI.getOperand(3).getReg();
const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
uint64_t This1Reg = MRI.createVirtualRegister(RC);
@@ -5589,26 +5896,24 @@ SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
DoneMBB->addLiveIn(SystemZ::CC);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return DoneMBB;
}
// Update TBEGIN instruction with final opcode and register clobbers.
-MachineBasicBlock *
-SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned Opcode,
- bool NoFloat) const {
+MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
+ bool NoFloat) const {
MachineFunction &MF = *MBB->getParent();
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
// Update opcode.
- MI->setDesc(TII->get(Opcode));
+ MI.setDesc(TII->get(Opcode));
// We cannot handle a TBEGIN that clobbers the stack or frame pointer.
// Make sure to add the corresponding GRSM bits if they are missing.
- uint64_t Control = MI->getOperand(2).getImm();
+ uint64_t Control = MI.getOperand(2).getImm();
static const unsigned GPRControlBit[16] = {
0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
@@ -5616,13 +5921,13 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
Control |= GPRControlBit[15];
if (TFI->hasFP(MF))
Control |= GPRControlBit[11];
- MI->getOperand(2).setImm(Control);
+ MI.getOperand(2).setImm(Control);
// Add GPR clobbers.
for (int I = 0; I < 16; I++) {
if ((Control & GPRControlBit[I]) == 0) {
unsigned Reg = SystemZMC::GR64Regs[I];
- MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
}
}
@@ -5631,12 +5936,12 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
if (Subtarget.hasVector()) {
for (int I = 0; I < 32; I++) {
unsigned Reg = SystemZMC::VR128Regs[I];
- MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
}
} else {
for (int I = 0; I < 16; I++) {
unsigned Reg = SystemZMC::FP64Regs[I];
- MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
}
}
}
@@ -5644,17 +5949,15 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
return MBB;
}
-MachineBasicBlock *
-SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI,
- MachineBasicBlock *MBB,
- unsigned Opcode) const {
+MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo *MRI = &MF.getRegInfo();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned SrcReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(0).getReg();
// Create new virtual register of the same class as source.
const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
@@ -5664,14 +5967,14 @@ SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI,
// well.
BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
.addReg(SrcReg);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
-MachineBasicBlock *SystemZTargetLowering::
-EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
- switch (MI->getOpcode()) {
+MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
+ switch (MI.getOpcode()) {
case SystemZ::Select32Mux:
case SystemZ::Select32:
case SystemZ::SelectF32:
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 391636e5467f..b1de8936beed 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -146,6 +146,9 @@ enum NodeType : unsigned {
// Perform a serialization operation. (BCR 15,0 or BCR 14,0.)
SERIALIZE,
+ // Compiler barrier only; generate a no-op.
+ MEMBARRIER,
+
// Transaction begin. The first operand is the chain, the second
// the TDB pointer, and the third the immediate control field.
// Returns chain and glue.
@@ -275,6 +278,12 @@ enum NodeType : unsigned {
VSTRC_CC,
VSTRCZ_CC,
+ // Test Data Class.
+ //
+ // Operand 0: the value to test
+ // Operand 1: the bit mask
+ TDC,
+
// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
// ATOMIC_LOAD_<op>.
//
@@ -308,6 +317,19 @@ enum NodeType : unsigned {
// Operand 5: the width of the field in bits (8 or 16)
ATOMIC_CMP_SWAPW,
+ // Byte swapping load.
+ //
+ // Operand 0: the address to load from
+ // Operand 1: the type of load (i16, i32, i64)
+ LRV,
+
+ // Byte swapping store.
+ //
+ // Operand 0: the value to store
+ // Operand 1: the address to store to
+ // Operand 2: the type of store (i16, i32, i64)
+ STRV,
+
// Prefetch from the second operand using the 4-bit control code in
// the first operand. The code is 1 for a load prefetch and 2 for
// a store prefetch.
@@ -423,16 +445,23 @@ public:
return SystemZ::R7D;
}
- MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const
- override;
+ /// Override to support customized stack guard loading.
+ bool useLoadStackGuardNode() const override {
+ return true;
+ }
+ void insertSSPDeclarations(Module &M) const override {
+ }
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
bool allowTruncateForTailCall(Type *, Type *) const override;
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -443,12 +472,20 @@ public:
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const override;
- SDValue prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+ SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::ANY_EXTEND;
+ }
+
+ bool supportSwiftError() const override {
+ return true;
+ }
+
private:
const SystemZSubtarget &Subtarget;
@@ -461,15 +498,19 @@ private:
SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node,
SelectionDAG &DAG, unsigned Opcode,
SDValue GOTOffset) const;
+ SDValue lowerThreadPointer(const SDLoc &DL, SelectionDAG &DAG) const;
SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SelectionDAG &DAG) const;
SDValue lowerBlockAddress(BlockAddressSDNode *Node,
SelectionDAG &DAG) const;
SDValue lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const;
SDValue lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
+ SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
@@ -477,6 +518,7 @@ private:
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
@@ -498,11 +540,19 @@ private:
unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
- SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
+ SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
unsigned Index, DAGCombinerInfo &DCI,
bool Force) const;
- SDValue combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
+ SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
DAGCombinerInfo &DCI) const;
+ SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
@@ -514,40 +564,33 @@ private:
MachineBasicBlock *Target) const;
// Implement EmitInstrWithCustomInserter for individual operation types.
- MachineBasicBlock *emitSelect(MachineInstr *MI,
- MachineBasicBlock *BB) const;
- MachineBasicBlock *emitCondStore(MachineInstr *MI,
- MachineBasicBlock *BB,
+ MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
unsigned StoreOpcode, unsigned STOCOpcode,
bool Invert) const;
- MachineBasicBlock *emitExt128(MachineInstr *MI,
- MachineBasicBlock *MBB,
+ MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB,
bool ClearEven, unsigned SubReg) const;
- MachineBasicBlock *emitAtomicLoadBinary(MachineInstr *MI,
+ MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned BinOpcode, unsigned BitSize,
bool Invert = false) const;
- MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr *MI,
+ MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned CompareOpcode,
unsigned KeepOldMask,
unsigned BitSize) const;
- MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
+ MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
- MachineBasicBlock *BB,
+ MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
- MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
- MachineBasicBlock *BB,
+ MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
- MachineBasicBlock *emitTransactionBegin(MachineInstr *MI,
+ MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
MachineBasicBlock *MBB,
- unsigned Opcode,
- bool NoFloat) const;
- MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr *MI,
+ unsigned Opcode, bool NoFloat) const;
+ MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
-
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
index 5a1c874dfa36..2cb8aba1b322 100644
--- a/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -29,7 +29,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
MachineFunction &MF = *MI->getParent()->getParent();
MachineFrameInfo *MFFrame = MF.getFrameInfo();
const MCInstrDesc &MCID = MI->getDesc();
- unsigned Flags = 0;
+ auto Flags = MachineMemOperand::MONone;
if (MCID.mayLoad())
Flags |= MachineMemOperand::MOLoad;
if (MCID.mayStore())
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 0cb267290cc1..8b32047e08e3 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -37,6 +37,10 @@ let hasSideEffects = 0 in {
def LER : UnaryRR <"le", 0x38, null_frag, FP32, FP32>;
def LDR : UnaryRR <"ld", 0x28, null_frag, FP64, FP64>;
def LXR : UnaryRRE<"lx", 0xB365, null_frag, FP128, FP128>;
+
+ // For z13 we prefer LDR over LER to avoid partial register dependencies.
+ let isCodeGenOnly = 1 in
+ def LDR32 : UnaryRR<"ld", 0x28, null_frag, FP32, FP32>;
}
// Moves between two floating-point registers that also set the condition
@@ -443,6 +447,13 @@ let Defs = [CC], CCValues = 0xF in {
def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
}
+// Test Data Class.
+let Defs = [CC], CCValues = 0xC in {
+ def TCEB : TestRXE<"tceb", 0xED10, z_tdc, FP32>;
+ def TCDB : TestRXE<"tcdb", 0xED11, z_tdc, FP64>;
+ def TCXB : TestRXE<"tcxb", 0xED12, z_tdc, FP128>;
+}
+
//===----------------------------------------------------------------------===//
// Peepholes
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 01f4cdec05cb..973894d5c001 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -158,6 +158,17 @@ def getThreeOperandOpcode : InstrMapping {
//
//===----------------------------------------------------------------------===//
+class InstI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+
+ bits<8> I1;
+
+ let Inst{15-8} = op;
+ let Inst{7-0} = I1;
+}
+
class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<4, outs, ins, asmstr, pattern> {
field bits<32> Inst;
@@ -172,6 +183,24 @@ class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = I2;
}
+class InstRIEa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> I2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = 0;
+ let Inst{31-16} = I2;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -260,6 +289,24 @@ class InstRIL<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{31-0} = I2;
}
+class InstRIS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<8> I2;
+ bits<4> M3;
+ bits<16> BD4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-16} = BD4;
+ let Inst{15-8} = I2;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<2, outs, ins, asmstr, pattern> {
field bits<16> Inst;
@@ -320,6 +367,41 @@ class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{3-0} = R2;
}
+class InstRRFc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+ bits<16> BD4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-16} = BD4;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRX<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<4, outs, ins, asmstr, pattern> {
field bits<32> Inst;
@@ -919,6 +1001,10 @@ class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
// Compare:
// Two input operands and an implicit CC output operand.
//
+// Test:
+// Two input operands and an implicit CC output operand. The second
+// input operand is an "address" operand used as a test class mask.
+//
// Ternary:
// One register output operand and three input operands.
//
@@ -974,12 +1060,30 @@ class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
let DisableEncoding = "$R1src";
}
-class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
- : InstRSY<opcode, (outs cls:$R1, cls:$R3), (ins bdaddr20only:$BD2),
+class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr12only>
+ : InstRS<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayLoad = 1;
+}
+
+class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr20only>
+ : InstRSY<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let mayLoad = 1;
}
+multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : LoadMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : LoadMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+ }
+}
+
class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -1055,12 +1159,30 @@ class StoreLengthVRSb<string mnemonic, bits<16> opcode,
let AccessBytes = bytes;
}
-class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
- : InstRSY<opcode, (outs), (ins cls:$R1, cls:$R3, bdaddr20only:$BD2),
+class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr12only>
+ : InstRS<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayStore = 1;
+}
+
+class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ AddressingMode mode = bdaddr20only>
+ : InstRSY<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let mayStore = 1;
}
+multiclass StoreMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+ }
+}
+
class StoreMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -1186,6 +1308,15 @@ class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let R4 = 0;
}
+class CondUnaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ Immediate imm>
+ : InstRIEd<opcode, (outs cls:$R1),
+ (ins imm:$I2, cond4:$valid, cond4:$R3),
+ mnemonic#"$R3\t$R1, $I2", []>,
+ Requires<[FeatureLoadStoreOnCond2]> {
+ let CCMaskLast = 1;
+}
+
// Like CondUnaryRRF, but used for the raw assembly form. The condition-code
// mask is the third operand rather than being part of the mnemonic.
class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
@@ -1198,6 +1329,16 @@ class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let R4 = 0;
}
+class AsmCondUnaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ Immediate imm>
+ : InstRIEd<opcode, (outs cls:$R1),
+ (ins cls:$R1src, imm:$I2, imm32zx4:$R3),
+ mnemonic#"\t$R1, $I2, $R3", []>,
+ Requires<[FeatureLoadStoreOnCond2]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
// Like CondUnaryRRF, but with a fixed CC mask.
class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2, bits<4> ccmask>
@@ -1210,6 +1351,17 @@ class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let R4 = 0;
}
+class FixedCondUnaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ Immediate imm, bits<4> ccmask>
+ : InstRIEd<opcode, (outs cls:$R1),
+ (ins cls:$R1src, imm:$I2),
+ mnemonic#"\t$R1, $I2", []>,
+ Requires<[FeatureLoadStoreOnCond2]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let R3 = ccmask;
+}
+
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRI<opcode, (outs cls:$R1), (ins imm:$I2),
@@ -1391,9 +1543,9 @@ class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
- : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R3, cls2:$R2),
+ : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R2, cls2:$R3),
mnemonic#"r\t$R1, $R3, $R2",
- [(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]> {
+ [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]> {
let OpKey = mnemonic ## cls1;
let OpType = "reg";
let R4 = 0;
@@ -1874,6 +2026,14 @@ class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M5 = 0;
}
+class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, bdxaddr12only:$XBD2)]> {
+ let M3 = 0;
+}
+
class TernaryRRD<string mnemonic, bits<16> opcode,
SDPatternOperator operator, RegisterOperand cls>
: InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
@@ -1885,6 +2045,40 @@ class TernaryRRD<string mnemonic, bits<16> opcode,
let DisableEncoding = "$R1src";
}
+class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRS<opcode, (outs cls:$R1),
+ (ins cls:$R1src, imm32zx4:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSY<opcode, (outs cls:$R1),
+ (ins cls:$R1src, imm32zx4:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ RegisterOperand cls, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : TernaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : TernaryRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+ }
+}
+
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXF<opcode, (outs cls:$R1),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index e6b5fc8e6235..4084e93e5acb 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -15,6 +15,7 @@
#include "SystemZInstrBuilder.h"
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
@@ -54,7 +55,7 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
// Get two load or store instructions. Use the original instruction for one
// of them (arbitrarily the second here) and create a clone for the other.
- MachineInstr *EarlierMI = MF.CloneMachineInstr(MI);
+ MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
MBB->insert(MI, EarlierMI);
// Set up the two 64-bit registers.
@@ -69,8 +70,8 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
MachineOperand &LowOffsetOp = MI->getOperand(2);
LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
- // Clear the kill flags for the base and index registers in the first
- // instruction.
+ // Clear the kill flags for the base and index registers in the first
+ // instruction.
EarlierMI->getOperand(1).setIsKill(false);
EarlierMI->getOperand(3).setIsKill(false);
@@ -105,59 +106,89 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
// and HighOpcode takes an unsigned 32-bit operand. In those cases,
// MI has the same kind of operand as LowOpcode, so needs to be converted
// if HighOpcode is used.
-void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode,
bool ConvertHigh) const {
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
bool IsHigh = isHighReg(Reg);
- MI->setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+ MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode));
if (IsHigh && ConvertHigh)
- MI->getOperand(1).setImm(uint32_t(MI->getOperand(1).getImm()));
+ MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm()));
}
// MI is a three-operand RIE-style pseudo instruction. Replace it with
// LowOpcodeK if the registers are both low GR32s, otherwise use a move
// followed by HighOpcode or LowOpcode, depending on whether the target
// is a high or low GR32.
-void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned LowOpcodeK,
unsigned HighOpcode) const {
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
bool DestIsHigh = isHighReg(DestReg);
bool SrcIsHigh = isHighReg(SrcReg);
if (!DestIsHigh && !SrcIsHigh)
- MI->setDesc(get(LowOpcodeK));
+ MI.setDesc(get(LowOpcodeK));
else {
- emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
- DestReg, SrcReg, SystemZ::LR, 32,
- MI->getOperand(1).isKill());
- MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
- MI->getOperand(1).setReg(DestReg);
- MI->tieOperands(0, 1);
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
+ SystemZ::LR, 32, MI.getOperand(1).isKill());
+ MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+ MI.getOperand(1).setReg(DestReg);
+ MI.tieOperands(0, 1);
}
}
// MI is an RXY-style pseudo instruction. Replace it with LowOpcode
// if the first operand is a low GR32 and HighOpcode if the first operand
// is a high GR32.
-void SystemZInstrInfo::expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const {
- unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg = MI.getOperand(0).getReg();
unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
- MI->getOperand(2).getImm());
- MI->setDesc(get(Opcode));
+ MI.getOperand(2).getImm());
+ MI.setDesc(get(Opcode));
}
// MI is an RR-style pseudo instruction that zero-extends the low Size bits
// of one GRX32 into another. Replace it with LowOpcode if both operands
// are low registers, otherwise use RISB[LH]G.
-void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const {
- emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
- MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
- LowOpcode, Size, MI->getOperand(1).isKill());
- MI->eraseFromParent();
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+ Size, MI.getOperand(1).isKill());
+ MI.eraseFromParent();
+}
+
+void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const unsigned Reg = MI->getOperand(0).getReg();
+
+ // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD,
+ // so they already have operand 0 set to reg.
+
+ // ear <reg>, %a0
+ MachineInstr *Ear1MI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, Ear1MI);
+ Ear1MI->setDesc(get(SystemZ::EAR));
+ MachineInstrBuilder(MF, Ear1MI).addImm(0);
+
+ // sllg <reg>, <reg>, 32
+ MachineInstr *SllgMI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, SllgMI);
+ SllgMI->setDesc(get(SystemZ::SLLG));
+ MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32);
+
+ // ear <reg>, %a1
+ MachineInstr *Ear2MI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, Ear2MI);
+ Ear2MI->setDesc(get(SystemZ::EAR));
+ MachineInstrBuilder(MF, Ear2MI).addImm(1);
+
+ // lg <reg>, 40(<reg>)
+ MI->setDesc(get(SystemZ::LG));
+ MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0);
}
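
expandLoadStackGuard materializes the thread-local stack canary: the two EARs assemble the 64-bit thread pointer from access registers %a0 and %a1, SLLG moves the %a0 half into the high 32 bits, and the final LG loads the guard 40 bytes into the thread block. A rough userspace analogue, assuming the usual s390x Linux layout (the offset 40 comes from the expansion above; readStackGuard itself is only an illustration):

#include <cstdint>

// ear/sllg/ear rebuild the thread pointer, then lg <reg>, 40(<reg>) reads
// the canary, mirroring the four instructions emitted above.
static inline uint64_t readStackGuard() {
  uint64_t TP;
  asm("ear  %0,%%a0\n\t"
      "sllg %0,%0,32\n\t"
      "ear  %0,%%a1"
      : "=d"(TP));
  return *reinterpret_cast<const uint64_t *>(TP + 40);
}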
// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
@@ -167,7 +198,7 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
// KillSrc is true if this move is the last use of SrcReg.
void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg,
+ const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, unsigned LowLowOpcode,
unsigned Size, bool KillSrc) const {
unsigned Opcode;
@@ -196,45 +227,41 @@ void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
// Return 0 otherwise.
//
// Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
-static int isSimpleMove(const MachineInstr *MI, int &FrameIndex,
+static int isSimpleMove(const MachineInstr &MI, int &FrameIndex,
unsigned Flag) {
- const MCInstrDesc &MCID = MI->getDesc();
- if ((MCID.TSFlags & Flag) &&
- MI->getOperand(1).isFI() &&
- MI->getOperand(2).getImm() == 0 &&
- MI->getOperand(3).getReg() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ const MCInstrDesc &MCID = MI.getDesc();
+ if ((MCID.TSFlags & Flag) && MI.getOperand(1).isFI() &&
+ MI.getOperand(2).getImm() == 0 && MI.getOperand(3).getReg() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
return 0;
}
-unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXLoad);
}
-unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
}
-bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr *MI,
+bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr &MI,
int &DestFrameIndex,
int &SrcFrameIndex) const {
// Check for MVC 0(Length,FI1),0(FI2)
- const MachineFrameInfo *MFI = MI->getParent()->getParent()->getFrameInfo();
- if (MI->getOpcode() != SystemZ::MVC ||
- !MI->getOperand(0).isFI() ||
- MI->getOperand(1).getImm() != 0 ||
- !MI->getOperand(3).isFI() ||
- MI->getOperand(4).getImm() != 0)
+ const MachineFrameInfo *MFI = MI.getParent()->getParent()->getFrameInfo();
+ if (MI.getOpcode() != SystemZ::MVC || !MI.getOperand(0).isFI() ||
+ MI.getOperand(1).getImm() != 0 || !MI.getOperand(3).isFI() ||
+ MI.getOperand(4).getImm() != 0)
return false;
// Check that Length covers the full slots.
- int64_t Length = MI->getOperand(2).getImm();
- unsigned FI1 = MI->getOperand(0).getIndex();
- unsigned FI2 = MI->getOperand(3).getIndex();
+ int64_t Length = MI.getOperand(2).getImm();
+ unsigned FI1 = MI.getOperand(0).getIndex();
+ unsigned FI2 = MI.getOperand(3).getIndex();
if (MFI->getObjectSize(FI1) != Length ||
MFI->getObjectSize(FI2) != Length)
return false;
@@ -244,7 +271,7 @@ bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr *MI,
return true;
}
-bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -261,7 +288,7 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
// Working from the bottom, when we see a non-terminator instruction, we're
// done.
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
break;
// A terminator that isn't a branch can't easily be handled by this
@@ -270,7 +297,7 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return true;
// Can't handle indirect branches.
- SystemZII::Branch Branch(getBranchInfo(I));
+ SystemZII::Branch Branch(getBranchInfo(*I));
if (!Branch.Target->isMBB())
return true;
@@ -347,7 +374,7 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
continue;
if (!I->isBranch())
break;
- if (!getBranchInfo(I).Target->isMBB())
+ if (!getBranchInfo(*I).Target->isMBB())
break;
// Remove the branch.
I->eraseFromParent();
@@ -365,11 +392,11 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-unsigned
-SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// In this function we output 32-bit branches, which should always
// have enough range. They can be shortened and relaxed by later code
// in the pipeline, if desired.
@@ -402,17 +429,16 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return Count;
}
-bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const {
- assert(MI->isCompare() && "Caller should have checked for a comparison");
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask,
+ int &Value) const {
+ assert(MI.isCompare() && "Caller should have checked for a comparison");
- if (MI->getNumExplicitOperands() == 2 &&
- MI->getOperand(0).isReg() &&
- MI->getOperand(1).isImm()) {
- SrcReg = MI->getOperand(0).getReg();
+ if (MI.getNumExplicitOperands() == 2 && MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isImm()) {
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
- Value = MI->getOperand(1).getImm();
+ Value = MI.getOperand(1).getImm();
Mask = ~0;
return true;
}
@@ -445,7 +471,7 @@ static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
// the result of an IPM sequence whose input CC survives until Compare,
// and whether Compare is therefore redundant. Delete it and return
// true if so.
-static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
+static bool removeIPMBasedCompare(MachineInstr &Compare, unsigned SrcReg,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI) {
MachineInstr *LGFR = nullptr;
@@ -466,16 +492,16 @@ static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
return false;
  // Check that there are no assignments to CC between the IPM and Compare.
- if (IPM->getParent() != Compare->getParent())
+ if (IPM->getParent() != Compare.getParent())
return false;
- MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+ MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare.getIterator();
for (++MBBI; MBBI != MBBE; ++MBBI) {
- MachineInstr *MI = MBBI;
- if (MI->modifiesRegister(SystemZ::CC, TRI))
+ MachineInstr &MI = *MBBI;
+ if (MI.modifiesRegister(SystemZ::CC, TRI))
return false;
}
- Compare->eraseFromParent();
+ Compare.eraseFromParent();
if (LGFR)
eraseIfDead(LGFR, MRI);
eraseIfDead(RLL, MRI);
@@ -485,13 +511,11 @@ static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
return true;
}
-bool
-SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
- unsigned SrcReg, unsigned SrcReg2,
- int Mask, int Value,
- const MachineRegisterInfo *MRI) const {
+bool SystemZInstrInfo::optimizeCompareInstr(
+ MachineInstr &Compare, unsigned SrcReg, unsigned SrcReg2, int Mask,
+ int Value, const MachineRegisterInfo *MRI) const {
assert(!SrcReg2 && "Only optimizing constant comparisons so far");
- bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+ bool IsLogical = (Compare.getDesc().TSFlags & SystemZII::IsLogical) != 0;
return Value == 0 && !IsLogical &&
removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
}
@@ -506,15 +530,43 @@ static unsigned getConditionalMove(unsigned Opcode) {
}
}
-bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const {
- unsigned Opcode = MI->getOpcode();
- return STI.hasLoadStoreOnCond() && getConditionalMove(Opcode);
+static unsigned getConditionalLoadImmediate(unsigned Opcode) {
+ switch (Opcode) {
+ case SystemZ::LHI: return SystemZ::LOCHI;
+ case SystemZ::LGHI: return SystemZ::LOCGHI;
+ default: return 0;
+ }
+}
+
+bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ if (STI.hasLoadStoreOnCond() && getConditionalMove(Opcode))
+ return true;
+ if (STI.hasLoadStoreOnCond2() && getConditionalLoadImmediate(Opcode))
+ return true;
+ if (Opcode == SystemZ::Return ||
+ Opcode == SystemZ::Trap ||
+ Opcode == SystemZ::CallJG ||
+ Opcode == SystemZ::CallBR)
+ return true;
+ return false;
}
bool SystemZInstrInfo::
isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
BranchProbability Probability) const {
+ // Avoid using conditional returns at the end of a loop (since then
+ // we'd need to emit an unconditional branch to the beginning anyway,
+ // making the loop body longer). This doesn't apply for low-probability
+ // loops (e.g. compare-and-swap retry), so just decide based on branch
+ // probability instead of looping structure.
+ // However, since Compare and Trap instructions cost the same as a regular
+ // Compare instruction, we should allow the if conversion to convert this
+ // into a Conditional Compare regardless of the branch probability.
+ if (MBB.getLastNonDebugInstr()->getOpcode() != SystemZ::Trap &&
+ MBB.succ_empty() && Probability < BranchProbability(1, 8))
+ return false;
// For now only convert single instructions.
return NumCycles == 1;
}
@@ -530,27 +582,82 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
}
bool SystemZInstrInfo::
-PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const {
+ // For now only duplicate single instructions.
+ return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
assert(Pred.size() == 2 && "Invalid condition");
unsigned CCValid = Pred[0].getImm();
unsigned CCMask = Pred[1].getImm();
assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
- unsigned Opcode = MI->getOpcode();
+ unsigned Opcode = MI.getOpcode();
if (STI.hasLoadStoreOnCond()) {
if (unsigned CondOpcode = getConditionalMove(Opcode)) {
- MI->setDesc(get(CondOpcode));
- MachineInstrBuilder(*MI->getParent()->getParent(), MI)
- .addImm(CCValid).addImm(CCMask)
- .addReg(SystemZ::CC, RegState::Implicit);
+ MI.setDesc(get(CondOpcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid)
+ .addImm(CCMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
return true;
}
}
+ if (STI.hasLoadStoreOnCond2()) {
+ if (unsigned CondOpcode = getConditionalLoadImmediate(Opcode)) {
+ MI.setDesc(get(CondOpcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid)
+ .addImm(CCMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
+ return true;
+ }
+ }
+ if (Opcode == SystemZ::Trap) {
+ MI.setDesc(get(SystemZ::CondTrap));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid).addImm(CCMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
+ return true;
+ }
+ if (Opcode == SystemZ::Return) {
+ MI.setDesc(get(SystemZ::CondReturn));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid).addImm(CCMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
+ return true;
+ }
+ if (Opcode == SystemZ::CallJG) {
+ MachineOperand FirstOp = MI.getOperand(0);
+ const uint32_t *RegMask = MI.getOperand(1).getRegMask();
+ MI.RemoveOperand(1);
+ MI.RemoveOperand(0);
+ MI.setDesc(get(SystemZ::CallBRCL));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid).addImm(CCMask)
+ .addOperand(FirstOp)
+ .addRegMask(RegMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
+ return true;
+ }
+ if (Opcode == SystemZ::CallBR) {
+ const uint32_t *RegMask = MI.getOperand(0).getRegMask();
+ MI.RemoveOperand(0);
+ MI.setDesc(get(SystemZ::CallBCR));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(CCValid).addImm(CCMask)
+ .addRegMask(RegMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
+ return true;
+ }
return false;
}
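
PredicateInstruction expects the same two-immediate predicate that analyzeBranch produces, {CCValid, CCMask}, which is what the Pred.size() == 2 assertion above enforces. A minimal caller sketch, assuming access to the target-internal SystemZInstrInfo.h (predicateIfPossible is an illustrative helper, not part of the patch):

#include "SystemZInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// Predicate MI on (CCValid, CCMask) if the target reports it as predicable.
static bool predicateIfPossible(const SystemZInstrInfo &TII, MachineInstr &MI,
                                unsigned CCValid, unsigned CCMask) {
  if (!TII.isPredicable(MI))
    return false;
  SmallVector<MachineOperand, 2> Pred;
  Pred.push_back(MachineOperand::CreateImm(CCValid));
  Pred.push_back(MachineOperand::CreateImm(CCMask));
  return TII.PredicateInstruction(MI, Pred);
}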
void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg,
+ const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too.
if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
@@ -571,7 +678,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LGR;
else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
- Opcode = SystemZ::LER;
+ // For z13 we prefer LDR over LER to avoid partial register dependencies.
+ Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
else if (SystemZ::FP64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LDR;
else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
@@ -654,6 +762,14 @@ static LogicOp interpretAndImmediate(unsigned Opcode) {
}
}
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+ if (OldMI->registerDefIsDead(SystemZ::CC)) {
+ MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+ if (CCDef != nullptr)
+ CCDef->setIsDead(true);
+ }
+}
+
// Used to return from convertToThreeAddress after replacing two-address
// instruction OldMI with three-address instruction NewMI.
static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
@@ -664,31 +780,29 @@ static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
for (unsigned I = 1; I < NumOps; ++I) {
MachineOperand &Op = OldMI->getOperand(I);
if (Op.isReg() && Op.isKill())
- LV->replaceKillInstruction(Op.getReg(), OldMI, NewMI);
+ LV->replaceKillInstruction(Op.getReg(), *OldMI, *NewMI);
}
}
+ transferDeadCC(OldMI, NewMI);
return NewMI;
}
-MachineInstr *
-SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
- MachineInstr *MI = MBBI;
- MachineBasicBlock *MBB = MI->getParent();
+MachineInstr *SystemZInstrInfo::convertToThreeAddress(
+ MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+ MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Opcode = MI->getOpcode();
- unsigned NumOps = MI->getNumOperands();
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOps = MI.getNumOperands();
// Try to convert something like SLL into SLLK, if supported.
// We prefer to keep the two-operand form where possible both
// because it tends to be shorter and because some instructions
// have memory forms that can be used during spilling.
if (STI.hasDistinctOps()) {
- MachineOperand &Dest = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
unsigned DestReg = Dest.getReg();
unsigned SrcReg = Src.getReg();
// AHIMux is only really a three-operand instruction when both operands
@@ -707,23 +821,23 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Create three address instruction without adding the implicit
// operands. Those will instead be copied over from the original
// instruction by the loop below.
- MachineInstrBuilder MIB(*MF,
- MF->CreateMachineInstr(get(ThreeOperandOpcode),
- MI->getDebugLoc(), /*NoImplicit=*/true));
+ MachineInstrBuilder MIB(
+ *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
+ /*NoImplicit=*/true));
MIB.addOperand(Dest);
// Keep the kill state, but drop the tied flag.
MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
// Keep the remaining operands as-is.
for (unsigned I = 2; I < NumOps; ++I)
- MIB.addOperand(MI->getOperand(I));
+ MIB.addOperand(MI.getOperand(I));
MBB->insert(MI, MIB);
- return finishConvertToThreeAddress(MI, MIB, LV);
+ return finishConvertToThreeAddress(&MI, MIB, LV);
}
}
// Try to convert an AND into an RISBG-type instruction.
if (LogicOp And = interpretAndImmediate(Opcode)) {
- uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
+ uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB;
// AND IMMEDIATE leaves the other bits of the register unchanged.
Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
unsigned Start, End;
@@ -739,36 +853,55 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
Start &= 31;
End &= 31;
}
- MachineOperand &Dest = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
MachineInstrBuilder MIB =
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
- .addOperand(Dest).addReg(0)
- .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
- .addImm(Start).addImm(End + 128).addImm(0);
- return finishConvertToThreeAddress(MI, MIB, LV);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode))
+ .addOperand(Dest)
+ .addReg(0)
+ .addReg(Src.getReg(), getKillRegState(Src.isKill()),
+ Src.getSubReg())
+ .addImm(Start)
+ .addImm(End + 128)
+ .addImm(0);
+ return finishConvertToThreeAddress(&MI, MIB, LV);
}
}
return nullptr;
}
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned Size = MFI->getObjectSize(FrameIndex);
- unsigned Opcode = MI->getOpcode();
+ unsigned Opcode = MI.getOpcode();
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
- if ((Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
- isInt<8>(MI->getOperand(2).getImm()) &&
- !MI->getOperand(3).getReg()) {
- // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
- get(SystemZ::AGSI))
- .addFrameIndex(FrameIndex)
- .addImm(0)
- .addImm(MI->getOperand(2).getImm());
+ if (LIS != nullptr && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+ isInt<8>(MI.getOperand(2).getImm()) && !MI.getOperand(3).getReg()) {
+
+ // Check CC liveness, since new instruction introduces a dead
+ // def of CC.
+ MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
+ LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
+ ++CCUnit;
+ assert (!CCUnit.isValid() && "CC only has one reg unit.");
+ SlotIndex MISlot =
+ LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
+ if (!CCLiveRange.liveAt(MISlot)) {
+ // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+ MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), get(SystemZ::AGSI))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(2).getImm());
+ BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
+ CCLiveRange.createDeadDef(MISlot, LIS->getVNInfoAllocator());
+ return BuiltMI;
+ }
}
return nullptr;
}
@@ -778,20 +911,23 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return nullptr;
unsigned OpNum = Ops[0];
- assert(Size == MF.getRegInfo()
- .getRegClass(MI->getOperand(OpNum).getReg())->getSize() &&
+ assert(Size ==
+ MF.getRegInfo()
+ .getRegClass(MI.getOperand(OpNum).getReg())
+ ->getSize() &&
"Invalid size combination");
- if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) &&
- OpNum == 0 &&
- isInt<8>(MI->getOperand(2).getImm())) {
+ if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 &&
+ isInt<8>(MI.getOperand(2).getImm())) {
// A(G)HI %reg, CONST -> A(G)SI %mem, CONST
Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
- get(Opcode))
- .addFrameIndex(FrameIndex)
- .addImm(0)
- .addImm(MI->getOperand(2).getImm());
+ MachineInstr *BuiltMI =
+ BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(2).getImm());
+ transferDeadCC(&MI, BuiltMI);
+ return BuiltMI;
}
if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
@@ -801,9 +937,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// source register instead.
if (OpNum == 0) {
unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(StoreOpcode))
- .addOperand(MI->getOperand(1))
+ .addOperand(MI.getOperand(1))
.addFrameIndex(FrameIndex)
.addImm(0)
.addReg(0);
@@ -812,8 +948,8 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// destination register instead.
if (OpNum == 1) {
unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
- unsigned Dest = MI->getOperand(0).getReg();
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+ unsigned Dest = MI.getOperand(0).getReg();
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(LoadOpcode), Dest)
.addFrameIndex(FrameIndex)
.addImm(0)
@@ -834,26 +970,26 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// might be equal. We don't worry about that case here, because spill slot
// coloring happens later, and because we have special code to remove
// MVCs that turn out to be redundant.
- if (OpNum == 0 && MI->hasOneMemOperand()) {
- MachineMemOperand *MMO = *MI->memoperands_begin();
+ if (OpNum == 0 && MI.hasOneMemOperand()) {
+ MachineMemOperand *MMO = *MI.memoperands_begin();
if (MMO->getSize() == Size && !MMO->isVolatile()) {
// Handle conversion of loads.
- if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad)) {
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+ if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXLoad)) {
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(SystemZ::MVC))
.addFrameIndex(FrameIndex)
.addImm(0)
.addImm(Size)
- .addOperand(MI->getOperand(1))
- .addImm(MI->getOperand(2).getImm())
+ .addOperand(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
.addMemOperand(MMO);
}
// Handle conversion of stores.
- if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore)) {
- return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+ if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) {
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(SystemZ::MVC))
- .addOperand(MI->getOperand(1))
- .addImm(MI->getOperand(2).getImm())
+ .addOperand(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
.addImm(Size)
.addFrameIndex(FrameIndex)
.addImm(0)
@@ -866,7 +1002,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// into <INSN>.
int MemOpcode = SystemZ::getMemOpcode(Opcode);
if (MemOpcode >= 0) {
- unsigned NumOps = MI->getNumExplicitOperands();
+ unsigned NumOps = MI.getNumExplicitOperands();
if (OpNum == NumOps - 1) {
const MCInstrDesc &MemDesc = get(MemOpcode);
uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
@@ -874,12 +1010,13 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
assert(AccessBytes <= Size && "Access outside the frame index");
uint64_t Offset = Size - AccessBytes;
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
- MI->getDebugLoc(), get(MemOpcode));
+ MI.getDebugLoc(), get(MemOpcode));
for (unsigned I = 0; I < OpNum; ++I)
- MIB.addOperand(MI->getOperand(I));
+ MIB.addOperand(MI.getOperand(I));
MIB.addFrameIndex(FrameIndex).addImm(Offset);
if (MemDesc.TSFlags & SystemZII::HasIndex)
MIB.addReg(0);
+ transferDeadCC(&MI, MIB);
return MIB;
}
}
@@ -888,14 +1025,14 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
return nullptr;
}
-bool
-SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- switch (MI->getOpcode()) {
+bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case SystemZ::L128:
splitMove(MI, SystemZ::LG);
return true;
@@ -1033,13 +1170,13 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return true;
case SystemZ::RISBMux: {
- bool DestIsHigh = isHighReg(MI->getOperand(0).getReg());
- bool SrcIsHigh = isHighReg(MI->getOperand(2).getReg());
+ bool DestIsHigh = isHighReg(MI.getOperand(0).getReg());
+ bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg());
if (SrcIsHigh == DestIsHigh)
- MI->setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+ MI.setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
else {
- MI->setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
- MI->getOperand(5).setImm(MI->getOperand(5).getImm() ^ 32);
+ MI.setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+ MI.getOperand(5).setImm(MI.getOperand(5).getImm() ^ 32);
}
return true;
}
@@ -1048,62 +1185,65 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
splitAdjDynAlloc(MI);
return true;
+ case TargetOpcode::LOAD_STACK_GUARD:
+ expandLoadStackGuard(&MI);
+ return true;
+
default:
return false;
}
}
-uint64_t SystemZInstrInfo::getInstSizeInBytes(const MachineInstr *MI) const {
- if (MI->getOpcode() == TargetOpcode::INLINEASM) {
- const MachineFunction *MF = MI->getParent()->getParent();
- const char *AsmStr = MI->getOperand(0).getSymbolName();
+uint64_t SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
- return MI->getDesc().getSize();
+ return MI.getDesc().getSize();
}
SystemZII::Branch
-SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
+SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
case SystemZ::BR:
case SystemZ::J:
case SystemZ::JG:
return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
- SystemZ::CCMASK_ANY, &MI->getOperand(0));
+ SystemZ::CCMASK_ANY, &MI.getOperand(0));
case SystemZ::BRC:
case SystemZ::BRCL:
- return SystemZII::Branch(SystemZII::BranchNormal,
- MI->getOperand(0).getImm(),
- MI->getOperand(1).getImm(), &MI->getOperand(2));
+ return SystemZII::Branch(SystemZII::BranchNormal, MI.getOperand(0).getImm(),
+ MI.getOperand(1).getImm(), &MI.getOperand(2));
case SystemZ::BRCT:
return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
- SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+ SystemZ::CCMASK_CMP_NE, &MI.getOperand(2));
case SystemZ::BRCTG:
return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
- SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+ SystemZ::CCMASK_CMP_NE, &MI.getOperand(2));
case SystemZ::CIJ:
case SystemZ::CRJ:
return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
- MI->getOperand(2).getImm(), &MI->getOperand(3));
+ MI.getOperand(2).getImm(), &MI.getOperand(3));
case SystemZ::CLIJ:
case SystemZ::CLRJ:
return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
- MI->getOperand(2).getImm(), &MI->getOperand(3));
+ MI.getOperand(2).getImm(), &MI.getOperand(3));
case SystemZ::CGIJ:
case SystemZ::CGRJ:
return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
- MI->getOperand(2).getImm(), &MI->getOperand(3));
+ MI.getOperand(2).getImm(), &MI.getOperand(3));
case SystemZ::CLGIJ:
case SystemZ::CLGRJ:
return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
- MI->getOperand(2).getImm(), &MI->getOperand(3));
+ MI.getOperand(2).getImm(), &MI.getOperand(3));
default:
llvm_unreachable("Unrecognized branch opcode");
@@ -1250,28 +1390,107 @@ bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
return false;
}
-unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
- const MachineInstr *MI) const {
+unsigned SystemZInstrInfo::getFusedCompare(unsigned Opcode,
+ SystemZII::FusedCompareType Type,
+ const MachineInstr *MI) const {
switch (Opcode) {
- case SystemZ::CR:
- return SystemZ::CRJ;
- case SystemZ::CGR:
- return SystemZ::CGRJ;
case SystemZ::CHI:
- return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
case SystemZ::CGHI:
- return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
- case SystemZ::CLR:
- return SystemZ::CLRJ;
- case SystemZ::CLGR:
- return SystemZ::CLGRJ;
+ if (!(MI && isInt<8>(MI->getOperand(1).getImm())))
+ return 0;
+ break;
case SystemZ::CLFI:
- return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLIJ : 0;
case SystemZ::CLGFI:
- return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLGIJ : 0;
- default:
- return 0;
+ if (!(MI && isUInt<8>(MI->getOperand(1).getImm())))
+ return 0;
}
+ switch (Type) {
+ case SystemZII::CompareAndBranch:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRJ;
+ case SystemZ::CGR:
+ return SystemZ::CGRJ;
+ case SystemZ::CHI:
+ return SystemZ::CIJ;
+ case SystemZ::CGHI:
+ return SystemZ::CGIJ;
+ case SystemZ::CLR:
+ return SystemZ::CLRJ;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRJ;
+ case SystemZ::CLFI:
+ return SystemZ::CLIJ;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIJ;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndReturn:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRBReturn;
+ case SystemZ::CGR:
+ return SystemZ::CGRBReturn;
+ case SystemZ::CHI:
+ return SystemZ::CIBReturn;
+ case SystemZ::CGHI:
+ return SystemZ::CGIBReturn;
+ case SystemZ::CLR:
+ return SystemZ::CLRBReturn;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRBReturn;
+ case SystemZ::CLFI:
+ return SystemZ::CLIBReturn;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIBReturn;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndSibcall:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRBCall;
+ case SystemZ::CGR:
+ return SystemZ::CGRBCall;
+ case SystemZ::CHI:
+ return SystemZ::CIBCall;
+ case SystemZ::CGHI:
+ return SystemZ::CGIBCall;
+ case SystemZ::CLR:
+ return SystemZ::CLRBCall;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRBCall;
+ case SystemZ::CLFI:
+ return SystemZ::CLIBCall;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIBCall;
+ default:
+ return 0;
+ }
+ case SystemZII::CompareAndTrap:
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRT;
+ case SystemZ::CGR:
+ return SystemZ::CGRT;
+ case SystemZ::CHI:
+ return SystemZ::CIT;
+ case SystemZ::CGHI:
+ return SystemZ::CGIT;
+ case SystemZ::CLR:
+ return SystemZ::CLRT;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRT;
+ case SystemZ::CLFI:
+ return SystemZ::CLFIT;
+ case SystemZ::CLGFI:
+ return SystemZ::CLGIT;
+ default:
+ return 0;
+ }
+ }
+ return 0;
}
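
getFusedCompare generalizes the old getCompareAndBranch: given a plain compare opcode (plus, for the immediate forms, the instruction itself so the immediate range can be checked), it returns the fused compare-and-branch, compare-and-return, compare-and-sibcall, or compare-and-trap opcode, or 0 if no fused form applies. A small usage sketch, again assuming the target-internal header (fuseCompareWithTrap is an illustrative name):

#include "SystemZInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Ask for the compare-and-trap form of a COMPARE, e.g. SystemZ::CRT for
// SystemZ::CR; 0 means the compare has no fused equivalent (for instance
// when its immediate does not fit the fused encoding).
static unsigned fuseCompareWithTrap(const SystemZInstrInfo &TII,
                                    const MachineInstr &Compare) {
  return TII.getFusedCompare(Compare.getOpcode(), SystemZII::CompareAndTrap,
                             &Compare);
}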
void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index d9094ba93658..010010b89dc8 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -111,6 +111,22 @@ struct Branch {
const MachineOperand *target)
: Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
};
+// Kinds of fused compares in compare-and-* instructions. Together with the
+// type of the converted compare, this identifies the specific
+// compare-and-* instruction.
+enum FusedCompareType {
+ // Relative branch - CRJ etc.
+ CompareAndBranch,
+
+ // Indirect branch, used for return - CRBReturn etc.
+ CompareAndReturn,
+
+ // Indirect branch, used for sibcall - CRBCall etc.
+ CompareAndSibcall,
+
+ // Trap - CRT etc.
+ CompareAndTrap
+};
} // end namespace SystemZII
class SystemZSubtarget;
@@ -120,16 +136,17 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
- void expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
- unsigned HighOpcode, bool ConvertHigh) const;
- void expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+ void expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode,
+ bool ConvertHigh) const;
+ void expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned LowOpcodeK, unsigned HighOpcode) const;
- void expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+ void expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const;
- void expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+ void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const;
+ void expandLoadStackGuard(MachineInstr *MI) const;
void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
virtual void anchor();
@@ -137,26 +154,26 @@ public:
explicit SystemZInstrInfo(SystemZSubtarget &STI);
// Override TargetInstrInfo.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isStackSlotCopy(const MachineInstr *MI, int &DestFrameIndex,
+ bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
int &SrcFrameIndex) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ const DebugLoc &DL) const override;
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &Mask, int &Value) const override;
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const override;
- bool isPredicable(MachineInstr *MI) const override;
+ bool isPredicable(MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
@@ -165,10 +182,12 @@ public:
MachineBasicBlock &FMBB,
unsigned NumCyclesF, unsigned ExtraPredCyclesF,
BranchProbability Probability) const override;
- bool PredicateInstruction(MachineInstr *MI,
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override;
+ bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -181,17 +200,18 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- MachineInstr *LoadMI) const override;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
+ MachineInstr *foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS = nullptr) const override;
+ bool expandPostRAPseudo(MachineInstr &MBBI) const override;
bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
override;
@@ -199,14 +219,14 @@ public:
const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
// Return the size in bytes of MI.
- uint64_t getInstSizeInBytes(const MachineInstr *MI) const;
+ uint64_t getInstSizeInBytes(const MachineInstr &MI) const;
// Return true if MI is a conditional or unconditional branch.
// When returning true, set Cond to the mask of condition-code
// values on which the instruction will branch, and set Target
// to the operand that contains the branch target. This target
// can be a register or a basic block.
- SystemZII::Branch getBranchInfo(const MachineInstr *MI) const;
+ SystemZII::Branch getBranchInfo(const MachineInstr &MI) const;
// Get the load and store opcodes for a given register class.
void getLoadStoreOpcodes(const TargetRegisterClass *RC,
@@ -229,11 +249,12 @@ public:
bool isRxSBGMask(uint64_t Mask, unsigned BitSize,
unsigned &Start, unsigned &End) const;
- // If Opcode is a COMPARE opcode for which an associated COMPARE AND
- // BRANCH exists, return the opcode for the latter, otherwise return 0.
+ // If Opcode is a COMPARE opcode for which an associated fused COMPARE AND *
+ // operation exists, return the opcode for the latter, otherwise return 0.
// MI, if nonnull, is the compare instruction.
- unsigned getCompareAndBranch(unsigned Opcode,
- const MachineInstr *MI = nullptr) const;
+ unsigned getFusedCompare(unsigned Opcode,
+ SystemZII::FusedCompareType Type,
+ const MachineInstr *MI = nullptr) const;
// Emit code before MBBI in MI to move immediate value Value into
// physical register Reg.
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index d5dabc2cd6ab..c510ca774be3 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -36,6 +36,22 @@ let hasSideEffects = 0 in {
let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
+// A conditional return instruction (bcr <cond>, %r14).
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in
+ def CondReturn : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+
+// Fused compare and conditional returns.
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
+ def CRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+ def CGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+ def CLRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CLGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CLIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+ def CLGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
// Unconditional branches. R1 is the condition-code mask (all 1s).
let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
let isIndirectBranch = 1 in
@@ -51,6 +67,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2), "jg\t$I2", []>;
}
+// FIXME: This trap instruction should be marked as isTerminator, but there is
+// currently a general bug that allows non-terminators to be placed between
+// terminators. Temporarily leave this unmarked until the bug is fixed.
+let isBarrier = 1, hasCtrlDep = 1 in {
+ def Trap : Alias<4, (outs), (ins), [(trap)]>;
+}
+
+let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in {
+ def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
+}
+
// Conditional branches. It's easier for LLVM to handle these branches
// in their raw BRC/BRCL form, with the 4-bit condition-code mask being
// the first operand. It seems friendlier to use mnemonic forms like
@@ -62,15 +89,25 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
[(z_br_ccmask cond4:$valid, cond4:$R1, bb:$I2)]>;
def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1,
brtarget32:$I2), "jg$R1\t$I2", []>;
+ let isIndirectBranch = 1 in
+ def BCR : InstRR<0x07, (outs), (ins cond4:$valid, cond4:$R1, GR64:$R2),
+ "b${R1}r\t$R2", []>;
}
def AsmBRC : InstRI<0xA74, (outs), (ins imm32zx4:$R1, brtarget16:$I2),
"brc\t$R1, $I2", []>;
def AsmBRCL : InstRIL<0xC04, (outs), (ins imm32zx4:$R1, brtarget32:$I2),
"brcl\t$R1, $I2", []>;
- def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2),
- "bcr\t$R1, $R2", []>;
+ let isIndirectBranch = 1 in {
+ def AsmBC : InstRX<0x47, (outs), (ins imm32zx4:$R1, bdxaddr12only:$XBD2),
+ "bc\t$R1, $XBD2", []>;
+ def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2),
+ "bcr\t$R1, $R2", []>;
+ }
}
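+// "nop" and "nopr" are extended mnemonics for BC/BCR with a condition mask
+// of zero (never branch), i.e. architectural no-ops.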
+def AsmNop : InstAlias<"nop\t$XBD", (AsmBC 0, bdxaddr12only:$XBD), 0>;
+def AsmNopR : InstAlias<"nopr\t$R", (AsmBCR 0, GR64:$R), 0>;
+
// Fused compare-and-branch instructions. As for normal branches,
// we handle these instructions internally in their raw CRJ-like form,
// but use assembly macros like CRJE when writing them out.
@@ -83,38 +120,83 @@ multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
let isBranch = 1, isTerminator = 1, Defs = [CC] in {
def RJ : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
brtarget16:$RI4),
- "crj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ "crj"##pos1##"\t$R1, $R2"##pos2##", $RI4", []>;
def GRJ : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
brtarget16:$RI4),
- "cgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ "cgrj"##pos1##"\t$R1, $R2"##pos2##", $RI4", []>;
def IJ : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2, ccmask:$M3,
brtarget16:$RI4),
- "cij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ "cij"##pos1##"\t$R1, $I2"##pos2##", $RI4", []>;
def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
brtarget16:$RI4),
- "cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ "cgij"##pos1##"\t$R1, $I2"##pos2##", $RI4", []>;
def LRJ : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
brtarget16:$RI4),
- "clrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ "clrj"##pos1##"\t$R1, $R2"##pos2##", $RI4", []>;
def LGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
brtarget16:$RI4),
- "clgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ "clgrj"##pos1##"\t$R1, $R2"##pos2##", $RI4", []>;
def LIJ : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
brtarget16:$RI4),
- "clij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ "clij"##pos1##"\t$R1, $I2"##pos2##", $RI4", []>;
def LGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
brtarget16:$RI4),
- "clgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ "clgij"##pos1##"\t$R1, $I2"##pos2##", $RI4", []>;
+ let isIndirectBranch = 1 in {
+ def RB : InstRRS<0xECF6, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "crb"##pos1##"\t$R1, $R2"##pos2##", $BD4", []>;
+ def GRB : InstRRS<0xECE4, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "cgrb"##pos1##"\t$R1, $R2"##pos2##", $BD4", []>;
+ def IB : InstRIS<0xECFE, (outs), (ins GR32:$R1, imm32sx8:$I2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "cib"##pos1##"\t$R1, $I2"##pos2##", $BD4", []>;
+ def GIB : InstRIS<0xECFC, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "cgib"##pos1##"\t$R1, $I2"##pos2##", $BD4", []>;
+ def LRB : InstRRS<0xECF7, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "clrb"##pos1##"\t$R1, $R2"##pos2##", $BD4", []>;
+ def LGRB : InstRRS<0xECE5, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "clgrb"##pos1##"\t$R1, $R2"##pos2##", $BD4", []>;
+ def LIB : InstRIS<0xECFF, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "clib"##pos1##"\t$R1, $I2"##pos2##", $BD4", []>;
+ def LGIB : InstRIS<0xECFD, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
+ bdaddr12only:$BD4),
+ "clgib"##pos1##"\t$R1, $I2"##pos2##", $BD4", []>;
+ }
+ }
+
+ let isTerminator = 1, hasCtrlDep = 1 in {
+ def RT : InstRRFc<0xB972, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3),
+ "crt"##pos1##"\t$R1, $R2"##pos2, []>;
+ def GRT : InstRRFc<0xB960, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3),
+ "cgrt"##pos1##"\t$R1, $R2"##pos2, []>;
+ def LRT : InstRRFc<0xB973, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3),
+ "clrt"##pos1##"\t$R1, $R2"##pos2, []>;
+ def LGRT : InstRRFc<0xB961, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3),
+ "clgrt"##pos1##"\t$R1, $R2"##pos2, []>;
+ def IT : InstRIEa<0xEC72, (outs), (ins GR32:$R1, imm32sx16:$I2, ccmask:$M3),
+ "cit"##pos1##"\t$R1, $I2"##pos2, []>;
+ def GIT : InstRIEa<0xEC70, (outs), (ins GR64:$R1, imm32sx16:$I2, ccmask:$M3),
+ "cgit"##pos1##"\t$R1, $I2"##pos2, []>;
+ def LFIT : InstRIEa<0xEC73, (outs), (ins GR32:$R1, imm32zx16:$I2, ccmask:$M3),
+ "clfit"##pos1##"\t$R1, $I2"##pos2, []>;
+ def LGIT : InstRIEa<0xEC71, (outs), (ins GR64:$R1, imm32zx16:$I2, ccmask:$M3),
+ "clgit"##pos1##"\t$R1, $I2"##pos2, []>;
}
}
let isCodeGenOnly = 1 in
defm C : CompareBranches<cond4, "$M3", "">;
-defm AsmC : CompareBranches<imm32zx4, "", "$M3, ">;
+defm AsmC : CompareBranches<imm32zx4, "", ", $M3">;
// Define AsmParser mnemonics for each general condition-code mask
// (integer or floating-point)
-multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
- let R1 = ccmask in {
+multiclass CondExtendedMnemonicA<bits<4> ccmask, string name> {
+ let isBranch = 1, isTerminator = 1, R1 = ccmask in {
def J : InstRI<0xA74, (outs), (ins brtarget16:$I2),
"j"##name##"\t$I2", []>;
def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
@@ -123,25 +205,36 @@ multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
}
def LOCR : FixedCondUnaryRRF<"locr"##name, 0xB9F2, GR32, GR32, ccmask>;
def LOCGR : FixedCondUnaryRRF<"locgr"##name, 0xB9E2, GR64, GR64, ccmask>;
+  def LOCHI : FixedCondUnaryRIE<"lochi"##name, 0xEC42, GR32, imm32sx16,
+                                ccmask>;
+ def LOCGHI: FixedCondUnaryRIE<"locghi"##name, 0xEC46, GR64, imm64sx16,
+ ccmask>;
def LOC : FixedCondUnaryRSY<"loc"##name, 0xEBF2, GR32, ccmask, 4>;
def LOCG : FixedCondUnaryRSY<"locg"##name, 0xEBE2, GR64, ccmask, 8>;
def STOC : FixedCondStoreRSY<"stoc"##name, 0xEBF3, GR32, ccmask, 4>;
def STOCG : FixedCondStoreRSY<"stocg"##name, 0xEBE3, GR64, ccmask, 8>;
}
-defm AsmO : CondExtendedMnemonic<1, "o">;
-defm AsmH : CondExtendedMnemonic<2, "h">;
-defm AsmNLE : CondExtendedMnemonic<3, "nle">;
-defm AsmL : CondExtendedMnemonic<4, "l">;
-defm AsmNHE : CondExtendedMnemonic<5, "nhe">;
-defm AsmLH : CondExtendedMnemonic<6, "lh">;
-defm AsmNE : CondExtendedMnemonic<7, "ne">;
-defm AsmE : CondExtendedMnemonic<8, "e">;
-defm AsmNLH : CondExtendedMnemonic<9, "nlh">;
-defm AsmHE : CondExtendedMnemonic<10, "he">;
-defm AsmNL : CondExtendedMnemonic<11, "nl">;
-defm AsmLE : CondExtendedMnemonic<12, "le">;
-defm AsmNH : CondExtendedMnemonic<13, "nh">;
-defm AsmNO : CondExtendedMnemonic<14, "no">;
+
+multiclass CondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
+ : CondExtendedMnemonicA<ccmask, name1> {
+ let isAsmParserOnly = 1 in
+ defm Alt : CondExtendedMnemonicA<ccmask, name2>;
+}
+
+defm AsmO : CondExtendedMnemonicA<1, "o">;
+defm AsmH : CondExtendedMnemonic<2, "h", "p">;
+defm AsmNLE : CondExtendedMnemonicA<3, "nle">;
+defm AsmL : CondExtendedMnemonic<4, "l", "m">;
+defm AsmNHE : CondExtendedMnemonicA<5, "nhe">;
+defm AsmLH : CondExtendedMnemonicA<6, "lh">;
+defm AsmNE : CondExtendedMnemonic<7, "ne", "nz">;
+defm AsmE : CondExtendedMnemonic<8, "e", "z">;
+defm AsmNLH : CondExtendedMnemonicA<9, "nlh">;
+defm AsmHE : CondExtendedMnemonicA<10, "he">;
+defm AsmNL : CondExtendedMnemonic<11, "nl", "nm">;
+defm AsmLE : CondExtendedMnemonicA<12, "le">;
+defm AsmNH : CondExtendedMnemonic<13, "nh", "np">;
+defm AsmNO : CondExtendedMnemonicA<14, "no">;
// Define AsmParser mnemonics for each integer condition-code mask.
// This is like the list above, except that condition 3 is not possible
@@ -151,31 +244,76 @@ defm AsmNO : CondExtendedMnemonic<14, "no">;
// We don't make one of the two names an alias of the other because
// we need the custom parsing routines to select the correct register class.
multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
- let M3 = ccmask in {
- def CR : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2,
- brtarget16:$RI4),
- "crj"##name##"\t$R1, $R2, $RI4", []>;
- def CGR : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2,
- brtarget16:$RI4),
- "cgrj"##name##"\t$R1, $R2, $RI4", []>;
- def CI : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2,
- brtarget16:$RI4),
- "cij"##name##"\t$R1, $I2, $RI4", []>;
- def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
- brtarget16:$RI4),
- "cgij"##name##"\t$R1, $I2, $RI4", []>;
- def CLR : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
- brtarget16:$RI4),
- "clrj"##name##"\t$R1, $R2, $RI4", []>;
- def CLGR : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+ let isBranch = 1, isTerminator = 1, M3 = ccmask in {
+ def CRJ : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2,
+ brtarget16:$RI4),
+ "crj"##name##"\t$R1, $R2, $RI4", []>;
+ def CGRJ : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2,
+ brtarget16:$RI4),
+ "cgrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CIJ : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2,
brtarget16:$RI4),
- "clgrj"##name##"\t$R1, $R2, $RI4", []>;
- def CLI : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+ "cij"##name##"\t$R1, $I2, $RI4", []>;
+ def CGIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
brtarget16:$RI4),
- "clij"##name##"\t$R1, $I2, $RI4", []>;
- def CLGI : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+ "cgij"##name##"\t$R1, $I2, $RI4", []>;
+ def CLRJ : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
brtarget16:$RI4),
- "clgij"##name##"\t$R1, $I2, $RI4", []>;
+ "clrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CLGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+ brtarget16:$RI4),
+ "clgrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CLIJ : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+ brtarget16:$RI4),
+ "clij"##name##"\t$R1, $I2, $RI4", []>;
+ def CLGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+ brtarget16:$RI4),
+ "clgij"##name##"\t$R1, $I2, $RI4", []>;
+ let isIndirectBranch = 1 in {
+ def CRB : InstRRS<0xECF6, (outs), (ins GR32:$R1, GR32:$R2,
+ bdaddr12only:$BD4),
+ "crb"##name##"\t$R1, $R2, $BD4", []>;
+ def CGRB : InstRRS<0xECE4, (outs), (ins GR64:$R1, GR64:$R2,
+ bdaddr12only:$BD4),
+ "cgrb"##name##"\t$R1, $R2, $BD4", []>;
+ def CIB : InstRIS<0xECFE, (outs), (ins GR32:$R1, imm32sx8:$I2,
+ bdaddr12only:$BD4),
+ "cib"##name##"\t$R1, $I2, $BD4", []>;
+ def CGIB : InstRIS<0xECFC, (outs), (ins GR64:$R1, imm64sx8:$I2,
+ bdaddr12only:$BD4),
+ "cgib"##name##"\t$R1, $I2, $BD4", []>;
+ def CLRB : InstRRS<0xECF7, (outs), (ins GR32:$R1, GR32:$R2,
+ bdaddr12only:$BD4),
+ "clrb"##name##"\t$R1, $R2, $BD4", []>;
+ def CLGRB : InstRRS<0xECE5, (outs), (ins GR64:$R1, GR64:$R2,
+ bdaddr12only:$BD4),
+ "clgrb"##name##"\t$R1, $R2, $BD4", []>;
+ def CLIB : InstRIS<0xECFF, (outs), (ins GR32:$R1, imm32zx8:$I2,
+ bdaddr12only:$BD4),
+ "clib"##name##"\t$R1, $I2, $BD4", []>;
+ def CLGIB : InstRIS<0xECFD, (outs), (ins GR64:$R1, imm64zx8:$I2,
+ bdaddr12only:$BD4),
+ "clgib"##name##"\t$R1, $I2, $BD4", []>;
+ }
+ }
+
+ let hasCtrlDep = 1, isTerminator = 1, M3 = ccmask in {
+ def CRT : InstRRFc<0xB972, (outs), (ins GR32:$R1, GR32:$R2),
+ "crt"##name##"\t$R1, $R2", []>;
+ def CGRT : InstRRFc<0xB960, (outs), (ins GR64:$R1, GR64:$R2),
+ "cgrt"##name##"\t$R1, $R2", []>;
+ def CLRT : InstRRFc<0xB973, (outs), (ins GR32:$R1, GR32:$R2),
+ "clrt"##name##"\t$R1, $R2", []>;
+ def CLGRT : InstRRFc<0xB961, (outs), (ins GR64:$R1, GR64:$R2),
+ "clgrt"##name##"\t$R1, $R2", []>;
+ def CIT : InstRIEa<0xEC72, (outs), (ins GR32:$R1, imm32sx16:$I2),
+ "cit"##name##"\t$R1, $I2", []>;
+ def CGIT : InstRIEa<0xEC70, (outs), (ins GR64:$R1, imm32sx16:$I2),
+ "cgit"##name##"\t$R1, $I2", []>;
+ def CLFIT : InstRIEa<0xEC73, (outs), (ins GR32:$R1, imm32zx16:$I2),
+ "clfit"##name##"\t$R1, $I2", []>;
+ def CLGIT : InstRIEa<0xEC71, (outs), (ins GR64:$R1, imm32zx16:$I2),
+ "clgit"##name##"\t$R1, $I2", []>;
}
}
multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
@@ -249,6 +387,26 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
}
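+// Conditional sibling calls: brcl <cond>, target and bcr <cond>, %r1
+// (the indirect form takes its target address from %r1).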
+let CCMaskFirst = 1, isCall = 1, isTerminator = 1, isReturn = 1 in {
+ def CallBRCL : Alias<6, (outs), (ins cond4:$valid, cond4:$R1,
+ pcrel32:$I2), []>;
+
+ let Uses = [R1D] in
+ def CallBCR : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+}
+
+// Fused compare and conditional sibling calls.
+let isCall = 1, isTerminator = 1, isReturn = 1, Uses = [R1D] in {
+ def CRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CIBCall : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+ def CGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+ def CLRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+ def CLGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+ def CLIBCall : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+ def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
// TLS calls. These will be lowered into a call to __tls_get_offset,
// with an extra relocation specifying the TLS symbol.
let isCall = 1, Defs = [R14D, CC] in {
@@ -261,12 +419,14 @@ let isCall = 1, Defs = [R14D, CC] in {
// Define the general form of the call instructions for the asm parser.
// These instructions don't hard-code %r14 as the return address register.
// Allow an optional TLS marker symbol to generate TLS call relocations.
-def BRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16tls:$I2),
- "bras\t$R1, $I2", []>;
-def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32tls:$I2),
- "brasl\t$R1, $I2", []>;
-def BASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
- "basr\t$R1, $R2", []>;
+let isCall = 1, Defs = [CC] in {
+ def BRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16tls:$I2),
+ "bras\t$R1, $I2", []>;
+ def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32tls:$I2),
+ "brasl\t$R1, $I2", []>;
+ def BASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+ "basr\t$R1, $R2", []>;
+}
//===----------------------------------------------------------------------===//
// Move instructions
@@ -294,6 +454,14 @@ let Uses = [CC] in {
def AsmLOCR : AsmCondUnaryRRF<"loc", 0xB9F2, GR32, GR32>;
def AsmLOCGR : AsmCondUnaryRRF<"locg", 0xB9E2, GR64, GR64>;
}
+let isCodeGenOnly = 1, Uses = [CC] in {
+ def LOCHI : CondUnaryRIE<"lochi", 0xEC42, GR32, imm32sx16>;
+ def LOCGHI : CondUnaryRIE<"locghi", 0xEC46, GR64, imm64sx16>;
+}
+let Uses = [CC] in {
+ def AsmLOCHI : AsmCondUnaryRIE<"lochi", 0xEC42, GR32, imm32sx16>;
+ def AsmLOCGHI : AsmCondUnaryRIE<"locghi", 0xEC46, GR64, imm64sx16>;
+}
// Immediate moves.
let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
@@ -546,10 +714,14 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>;
//===----------------------------------------------------------------------===//
// Multi-register loads.
+defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
+def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
// Multi-register stores.
+defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
+def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
//===----------------------------------------------------------------------===//
// Byte swaps
@@ -563,13 +735,14 @@ let hasSideEffects = 0 in {
// Byte-swapping loads. Unlike normal loads, these instructions are
// allowed to access storage more than once.
-def LRV : UnaryRXY<"lrv", 0xE31E, loadu<bswap, nonvolatile_load>, GR32, 4>;
-def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap, nonvolatile_load>, GR64, 8>;
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
+def LRV : UnaryRXY<"lrv", 0xE31E, z_lrv, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
// Likewise byte-swapping stores.
-def STRV : StoreRXY<"strv", 0xE33E, storeu<bswap, nonvolatile_store>, GR32, 4>;
-def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>,
- GR64, 8>;
+def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
+def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -657,6 +830,11 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+let Defs = [CC] in {
+ defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
+ def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
+}
+
// Insertions of a 16-bit immediate, leaving other bits unaffected.
// We don't have or_as_insert equivalents of these operations because
// OI is available instead.
@@ -812,7 +990,7 @@ defm : ZXB<subc, GR64, SLGFR>;
let Defs = [CC], Uses = [CC] in {
// Subtraction of a register.
def SLBR : BinaryRRE<"slb", 0xB999, sube, GR32, GR32>;
- def SLGBR : BinaryRRE<"slbg", 0xB989, sube, GR64, GR64>;
+ def SLBGR : BinaryRRE<"slbg", 0xB989, sube, GR64, GR64>;
// Subtraction of memory.
def SLB : BinaryRXY<"slb", 0xE399, sube, GR32, load, 4>;
@@ -865,7 +1043,7 @@ let Defs = [CC] in {
// ANDs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
defm N : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
- def NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
+ def NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
}
// AND to memory
@@ -1030,6 +1208,7 @@ def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>;
// Shift left.
let hasSideEffects = 0 in {
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
}
@@ -1208,6 +1387,9 @@ let Defs = [CC] in {
defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
}
+def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
+def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
@@ -1224,6 +1406,10 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
let hasSideEffects = 1 in
def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+// A pseudo instruction that serves as a compiler barrier.
+let hasSideEffects = 1 in
+def MemBarrier : Pseudo<(outs), (ins), [(z_membarrier)]>;
+
let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>;
def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_64, GR64>;
@@ -1466,6 +1652,10 @@ let mayLoad = 1, Defs = [CC] in
defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
// Other instructions for inline assembly
+let hasSideEffects = 1, Defs = [CC], isCall = 1 in
+ def SVC : InstI<0x0A, (outs), (ins imm32zx8:$I1),
+ "svc\t$I1",
+ []>;
let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
def STCK : InstS<0xB205, (outs), (ins bdaddr12only:$BD2),
"stck\t$BD2",
@@ -1483,6 +1673,12 @@ let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
"stfle\t$BD2",
[]>;
+let hasSideEffects = 1 in {
+ def EX : InstRX<0x44, (outs), (ins GR64:$R1, bdxaddr12only:$XBD2),
+ "ex\t$R1, $XBD2", []>;
+ def EXRL : InstRIL<0xC60, (outs), (ins GR64:$R1, pcrel32:$I2),
+ "exrl\t$R1, $I2", []>;
+}
//===----------------------------------------------------------------------===//
@@ -1515,6 +1711,42 @@ def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
(i32 63)),
(Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+// Avoid generating 2 XOR instructions. (xor (and x, y), y) is
+// equivalent to (and (xor x, -1), y)
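+// (Bitwise check: where a bit of y is 0 both sides are 0; where it is 1 both
+// sides equal the complement of the corresponding bit of x.)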
+def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
+ (XGR GR64:$y, (NGR GR64:$y, GR64:$x))>;
+
+// Shift/rotate instructions only use the last 6 bits of the second operand
+// register, so we can safely use NILL (16 fewer bits than NILF) to only AND the
+// last 16 bits.
+// Complexity is added so that we match this before we match NILF on the AND
+// operation alone.
+let AddedComplexity = 4 in {
+ def : Pat<(shl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+ (SLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(sra GR32:$val, (and GR32:$shift, uimm32:$imm)),
+ (SRA GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(srl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+ (SRL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(shl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+ (SLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(sra GR64:$val, (and GR32:$shift, uimm32:$imm)),
+ (SRAG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(srl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+ (SRLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(rotl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+ (RLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+ def : Pat<(rotl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+ (RLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+}
+
// Peepholes for turning scalar operations into block operations.
defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
XCSequence, 1>;
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
index 24165be29ae7..2cdf2f9bf990 100644
--- a/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -64,6 +64,9 @@ void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
+ if (skipFunction(*F.getFunction()))
+ return false;
+
TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
MF = &F;
@@ -92,9 +95,9 @@ bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
switch (I->getOpcode()) {
case SystemZ::TLS_LDCALL:
if (TLSBaseAddrReg)
- I = ReplaceTLSCall(I, TLSBaseAddrReg);
+ I = ReplaceTLSCall(&*I, TLSBaseAddrReg);
else
- I = SetRegister(I, &TLSBaseAddrReg);
+ I = SetRegister(&*I, &TLSBaseAddrReg);
Changed = true;
break;
default:
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 8dab44e7f8af..a24d47d2d16b 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -58,7 +58,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -139,12 +138,16 @@ public:
}
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
bool AssumeRelaxed);
- TerminatorInfo describeTerminator(MachineInstr *MI);
+ TerminatorInfo describeTerminator(MachineInstr &MI);
uint64_t initMBBInfo();
bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
bool mustRelaxABranch();
@@ -207,11 +210,11 @@ void SystemZLongBranch::skipTerminator(BlockPosition &Position,
}
// Return a description of terminator instruction MI.
-TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) {
TerminatorInfo Terminator;
Terminator.Size = TII->getInstSizeInBytes(MI);
- if (MI->isConditionalBranch() || MI->isUnconditionalBranch()) {
- switch (MI->getOpcode()) {
+ if (MI.isConditionalBranch() || MI.isUnconditionalBranch()) {
+ switch (MI.getOpcode()) {
case SystemZ::J:
// Relaxes to JG, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
@@ -248,7 +251,7 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
default:
llvm_unreachable("Unrecognized branch instruction");
}
- Terminator.Branch = MI;
+ Terminator.Branch = &MI;
Terminator.TargetBlock =
TII->getBranchInfo(MI).Target->getMBB()->getNumber();
}
@@ -280,7 +283,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
MachineBasicBlock::iterator MI = MBB->begin();
MachineBasicBlock::iterator End = MBB->end();
while (MI != End && !MI->isTerminator()) {
- Block.Size += TII->getInstSizeInBytes(MI);
+ Block.Size += TII->getInstSizeInBytes(*MI);
++MI;
}
skipNonTerminators(Position, Block);
@@ -289,7 +292,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
while (MI != End) {
if (!MI->isDebugValue()) {
assert(MI->isTerminator() && "Terminator followed by non-terminator");
- Terminators.push_back(describeTerminator(MI));
+ Terminators.push_back(describeTerminator(*MI));
skipTerminator(Position, Terminators.back(), false);
++Block.NumTerminators;
}
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index f4a517bd54df..4f64f4c65f1d 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -22,14 +22,15 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo {
unsigned VarArgsFirstFPR;
unsigned VarArgsFrameIndex;
unsigned RegSaveFrameIndex;
+ int FramePointerSaveIndex;
bool ManipulatesSP;
unsigned NumLocalDynamics;
public:
explicit SystemZMachineFunctionInfo(MachineFunction &MF)
: LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
- VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false),
- NumLocalDynamics(0) {}
+ VarArgsFrameIndex(0), RegSaveFrameIndex(0), FramePointerSaveIndex(0),
+ ManipulatesSP(false), NumLocalDynamics(0) {}
// Get and set the first call-saved GPR that should be saved and restored
// by this function. This is 0 if no GPRs need to be saved or restored.
@@ -59,6 +60,10 @@ public:
unsigned getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
void setRegSaveFrameIndex(unsigned FI) { RegSaveFrameIndex = FI; }
+ // Get and set the frame index of where the old frame pointer is stored.
+ int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+ void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
// Get and set whether the function directly manipulates the stack pointer,
// e.g. through STACKSAVE or STACKRESTORE.
bool getManipulatesSP() const { return ManipulatesSP; }
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 9af90d492cf8..17b076d88a34 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -451,11 +451,11 @@ def PCRelTLS32 : PCRelTLSAsmOperand<"32">;
// and multiplied by 2.
def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
let EncoderMethod = "getPC16DBLEncoding";
- let DecoderMethod = "decodePC16DBLOperand";
+ let DecoderMethod = "decodePC16DBLBranchOperand";
}
def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
let EncoderMethod = "getPC32DBLEncoding";
- let DecoderMethod = "decodePC32DBLOperand";
+ let DecoderMethod = "decodePC32DBLBranchOperand";
}
// Variants of brtarget16/32 with an optional additional TLS symbol.
@@ -464,12 +464,12 @@ def tlssym : Operand<i64> { }
def brtarget16tls : PCRelTLSOperand<OtherVT, PCRelTLS16> {
let MIOperandInfo = (ops brtarget16:$func, tlssym:$sym);
let EncoderMethod = "getPC16DBLTLSEncoding";
- let DecoderMethod = "decodePC16DBLOperand";
+ let DecoderMethod = "decodePC16DBLBranchOperand";
}
def brtarget32tls : PCRelTLSOperand<OtherVT, PCRelTLS32> {
let MIOperandInfo = (ops brtarget32:$func, tlssym:$sym);
let EncoderMethod = "getPC32DBLTLSEncoding";
- let DecoderMethod = "decodePC32DBLOperand";
+ let DecoderMethod = "decodePC32DBLBranchOperand";
}
// A PC-relative offset of a global value. The offset is sign-extended
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 3c95a1e11b45..8d031f1ea05d 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -79,6 +79,14 @@ def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
def SDT_ZPrefetch : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>]>;
+def SDT_ZLoadBSwap : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDT_ZStoreBSwap : SDTypeProfile<0, 3,
+ [SDTCisInt<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
def SDT_ZTBegin : SDTypeProfile<0, 2,
[SDTCisPtrTy<0>,
SDTCisVT<1, i32>]>;
@@ -137,6 +145,7 @@ def SDT_ZVecQuaternaryInt : SDTypeProfile<1, 4,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisVT<4, i32>]>;
+def SDT_ZTest : SDTypeProfile<0, 2, [SDTCisVT<1, i64>]>;
//===----------------------------------------------------------------------===//
// Node definitions
@@ -188,6 +197,15 @@ def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone,
[SDNPHasChain, SDNPMayStore]>;
+def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def z_loadbswap : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_storebswap : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest, [SDNPOutGlue]>;
// Defined because the index is an i32 rather than a pointer.
def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
@@ -329,6 +347,17 @@ def z_vsrl : SDNode<"ISD::SRL", SDT_ZVecBinary>;
// Pattern fragments
//===----------------------------------------------------------------------===//
+def z_lrvh : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>;
+def z_lrv : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
+def z_lrvg : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+
+def z_strvh : PatFrag<(ops node:$src, node:$addr),
+ (z_storebswap node:$src, node:$addr, i16)>;
+def z_strv : PatFrag<(ops node:$src, node:$addr),
+ (z_storebswap node:$src, node:$addr, i32)>;
+def z_strvg : PatFrag<(ops node:$src, node:$addr),
+ (z_storebswap node:$src, node:$addr, i64)>;
+
// Signed and unsigned comparisons.
def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 32fbe5ae9ef9..9adc0189e650 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -29,6 +29,11 @@ def FeatureLoadStoreOnCond : SystemZFeature<
"Assume that the load/store-on-condition facility is installed"
>;
+def FeatureLoadStoreOnCond2 : SystemZFeature<
+ "load-store-on-cond-2", "LoadStoreOnCond2",
+ "Assume that the load/store-on-condition facility 2 is installed"
+>;
+
def FeatureHighWord : SystemZFeature<
"high-word", "HighWord",
"Assume that the high-word facility is installed"
@@ -92,5 +97,6 @@ def : Processor<"z13", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
FeatureFPExtension, FeaturePopulationCount,
FeatureFastSerialization, FeatureInterlockedAccess1,
+ FeatureMiscellaneousExtensions,
FeatureTransactionalExecution, FeatureProcessorAssist,
- FeatureVector]>;
+ FeatureVector, FeatureLoadStoreOnCond2]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 6fd24e3df625..b5e5fd4bfc4f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -24,12 +24,20 @@ SystemZRegisterInfo::SystemZRegisterInfo()
const MCPhysReg *
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_SystemZ_SwiftError_SaveList;
return CSR_SystemZ_SaveList;
}
const uint32_t *
SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
+ if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_SystemZ_SwiftError_RegMask;
return CSR_SystemZ_RegMask;
}
@@ -84,8 +92,14 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// accepts the offset exists.
unsigned Opcode = MI->getOpcode();
unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
- if (OpcodeForOffset)
+ if (OpcodeForOffset) {
+ if (OpcodeForOffset == SystemZ::LE &&
+ MF.getSubtarget<SystemZSubtarget>().hasVector()) {
+ // If LE is ok for offset, use LDE instead on z13.
+ OpcodeForOffset = SystemZ::LDE32;
+ }
MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ }
else {
// Create an anchor point that is in range. Start at 0xffff so that
// can use LLILH to load the immediate.
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index a0db5a9c188f..e41c06c98af2 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -33,6 +33,15 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
public:
SystemZRegisterInfo();
+ /// getPointerRegClass - Return the register class to use to hold pointers.
+ /// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0
+ /// register, hence ADDR64.
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind=0) const override {
+ return &SystemZ::ADDR64BitRegClass;
+ }
+
// Override TargetRegisterInfo.h.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 178aa3817311..657482504045 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
// address Dest. Sequence is the opcode to use for straight-line code
// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
// Return the chain for the completed operation.
-static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
+static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
unsigned Loop, SDValue Chain, SDValue Dst,
SDValue Src, uint64_t Size) {
EVT PtrVT = Src.getValueType();
@@ -46,12 +46,10 @@ static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
DAG.getConstant(Size, DL, PtrVT));
}
-SDValue SystemZSelectionDAGInfo::
-EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
- bool IsVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const {
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
if (IsVolatile)
return SDValue();
@@ -64,24 +62,21 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
// Handle a memset of 1, 2, 4 or 8 bytes with the operands given by
// Chain, Dst, ByteVal and Size. These cases are expected to use
// MVI, MVHHI, MVHI and MVGHI respectively.
-static SDValue memsetStore(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Dst, uint64_t ByteVal, uint64_t Size,
- unsigned Align,
- MachinePointerInfo DstPtrInfo) {
+ unsigned Align, MachinePointerInfo DstPtrInfo) {
uint64_t StoreVal = ByteVal;
for (unsigned I = 1; I < Size; ++I)
StoreVal |= ByteVal << (I * 8);
- return DAG.getStore(Chain, DL,
- DAG.getConstant(StoreVal, DL,
- MVT::getIntegerVT(Size * 8)),
- Dst, DstPtrInfo, false, false, Align);
+ return DAG.getStore(
+ Chain, DL, DAG.getConstant(StoreVal, DL, MVT::getIntegerVT(Size * 8)),
+ Dst, DstPtrInfo, Align);
}
-SDValue SystemZSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Dst, SDValue Byte, SDValue Size,
- unsigned Align, bool IsVolatile,
- MachinePointerInfo DstPtrInfo) const {
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
+ SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const {
EVT PtrVT = Dst.getValueType();
if (IsVolatile)
@@ -116,15 +111,14 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
} else {
// Handle one and two bytes using STC.
if (Bytes <= 2) {
- SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
- false, false, Align);
+ SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
if (Bytes == 1)
return Chain1;
SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
- SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
- DstPtrInfo.getWithOffset(1),
- false, false, 1);
+ SDValue Chain2 =
+ DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1),
+ /* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
}
}
@@ -138,8 +132,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
// Copy the byte to the first location and then use MVC to copy
// it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
- false, false, Align);
+ Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
@@ -150,7 +143,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
// deciding whether to use a loop or straight-line code.
-static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2, uint64_t Size) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
EVT PtrVT = Src1.getValueType();
@@ -174,7 +167,8 @@ static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
// less than zero if CC == 1 and greater than zero if CC >= 2.
// The sequence starts with IPM, which puts CC into bits 29 and 28
// of an integer and clears bits 30 and 31.
-static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
+static SDValue addIPMSequence(const SDLoc &DL, SDValue Glue,
+ SelectionDAG &DAG) {
SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
@@ -183,11 +177,10 @@ static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
return ROTL;
}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src1, SDValue Src2, SDValue Size,
- MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+ SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
@@ -198,10 +191,9 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
return std::make_pair(SDValue(), SDValue());
}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src, SDValue Char, SDValue Length,
- MachinePointerInfo SrcPtrInfo) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+ SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
// Use SRST to find the character. End is its address on success.
EVT PtrVT = Src.getValueType();
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
@@ -226,22 +218,20 @@ EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
return std::make_pair(End, Chain);
}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Dest, SDValue Src,
- MachinePointerInfo DestPtrInfo,
- MachinePointerInfo SrcPtrInfo, bool isStpcpy) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+ SDValue Src, MachinePointerInfo DestPtrInfo, MachinePointerInfo SrcPtrInfo,
+ bool isStpcpy) const {
SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
DAG.getConstant(0, DL, MVT::i32));
return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src1, SDValue Src2,
- MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+ SDValue Src2, MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
DAG.getConstant(0, DL, MVT::i32));
@@ -255,7 +245,8 @@ EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
// and the second being the out chain.
//
// This can be used for strlen by setting Limit to 0.
-static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG,
+ const SDLoc &DL,
SDValue Chain, SDValue Src,
SDValue Limit) {
EVT PtrVT = Src.getValueType();
@@ -265,19 +256,18 @@ static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
Chain = End.getValue(1);
SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
return std::make_pair(Len, Chain);
-}
+}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrlen(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+ MachinePointerInfo SrcPtrInfo) const {
EVT PtrVT = Src.getValueType();
return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, DL, PtrVT));
}
-std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
-EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src, SDValue MaxLength,
- MachinePointerInfo SrcPtrInfo) const {
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrnlen(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+ SDValue MaxLength, MachinePointerInfo SrcPtrInfo) const {
EVT PtrVT = Src.getValueType();
MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 246fa3e5e656..93cd970c30c6 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -7,66 +7,64 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the SystemZ subclass for TargetSelectionDAGInfo.
+// This file defines the SystemZ subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
class SystemZTargetMachine;
-class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo {
+class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
explicit SystemZSelectionDAGInfo() = default;
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool IsVolatile, bool AlwaysInline,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool IsVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Dst, SDValue Byte,
SDValue Size, unsigned Align, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const override;
std::pair<SDValue, SDValue>
- EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2, SDValue Size,
MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const override;
std::pair<SDValue, SDValue>
- EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src, SDValue Char, SDValue Length,
MachinePointerInfo SrcPtrInfo) const override;
- std::pair<SDValue, SDValue>
- EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Dest, SDValue Src,
- MachinePointerInfo DestPtrInfo,
- MachinePointerInfo SrcPtrInfo,
- bool isStpcpy) const override;
+ std::pair<SDValue, SDValue> EmitTargetCodeForStrcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+ SDValue Src, MachinePointerInfo DestPtrInfo,
+ MachinePointerInfo SrcPtrInfo, bool isStpcpy) const override;
std::pair<SDValue, SDValue>
- EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ EmitTargetCodeForStrcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src1, SDValue Src2,
MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const override;
std::pair<SDValue, SDValue>
- EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ EmitTargetCodeForStrlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src,
MachinePointerInfo SrcPtrInfo) const override;
std::pair<SDValue, SDValue>
- EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ EmitTargetCodeForStrnlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue Src, SDValue MaxLength,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 846edd51341a..7f26a3519e50 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -35,6 +35,10 @@ public:
bool processBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
private:
bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH);
@@ -68,18 +72,20 @@ static void tieOpsIfNeeded(MachineInstr &MI) {
// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
// are the halfword immediate loads for the same word. Try to use one of them
-// instead of IIxF.
-bool SystemZShortenInst::shortenIIF(MachineInstr &MI,
- unsigned LLIxL, unsigned LLIxH) {
+// instead of IIxF.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL,
+ unsigned LLIxH) {
unsigned Reg = MI.getOperand(0).getReg();
// The new opcode will clear the other half of the GR64 reg, so
// cancel if that is live.
- unsigned thisSubRegIdx = (SystemZ::GRH32BitRegClass.contains(Reg) ?
- SystemZ::subreg_h32 : SystemZ::subreg_l32);
- unsigned otherSubRegIdx = (thisSubRegIdx == SystemZ::subreg_l32 ?
- SystemZ::subreg_h32 : SystemZ::subreg_l32);
- unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx,
- &SystemZ::GR64BitRegClass);
+ unsigned thisSubRegIdx =
+ (SystemZ::GRH32BitRegClass.contains(Reg) ? SystemZ::subreg_h32
+ : SystemZ::subreg_l32);
+ unsigned otherSubRegIdx =
+ (thisSubRegIdx == SystemZ::subreg_l32 ? SystemZ::subreg_h32
+ : SystemZ::subreg_l32);
+ unsigned GR64BitReg =
+ TRI->getMatchingSuperReg(Reg, thisSubRegIdx, &SystemZ::GR64BitRegClass);
unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx);
if (LiveRegs.contains(OtherReg))
return false;
@@ -135,11 +141,10 @@ bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
// Calls shortenOn001 if CCLive is false. CC def operand is added in
// case of success.
-bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI,
- unsigned Opcode) {
+bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, unsigned Opcode) {
if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) {
MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
- .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
return true;
}
return false;
@@ -177,7 +182,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
// Set up the set of live registers at the end of MBB (live out)
LiveRegs.clear();
- LiveRegs.addLiveOuts(&MBB);
+ LiveRegs.addLiveOuts(MBB);
// Iterate backwards through the block looking for instructions to change.
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
@@ -264,6 +269,9 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
}
bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+ if (skipFunction(*F.getFunction()))
+ return false;
+
const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>();
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 0b49fcdd8f78..67d5e0179fe2 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -40,21 +40,11 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasPopulationCount(false), HasFastSerialization(false),
HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
- HasVector(false), TargetTriple(TT),
+ HasVector(false), HasLoadStoreOnCond2(false), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
TSInfo(), FrameLowering() {}
-// Return true if GV binds locally under reloc model RM.
-static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
- // For non-PIC, all symbols bind locally.
- if (RM == Reloc::Static)
- return true;
-
- return GV->hasLocalLinkage() || !GV->hasDefaultVisibility();
-}
-
bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
- Reloc::Model RM,
CodeModel::Model CM) const {
// PC32DBL accesses require the low bit to be clear. Note that a zero
// value selects the default alignment and is therefore OK.
@@ -63,7 +53,7 @@ bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
// For the small model, all locally-binding symbols are in range.
if (CM == CodeModel::Small)
- return bindsLocally(GV, RM);
+ return TLInfo.getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
// For Medium and above, assume that the symbol is not within the 4GB range.
// Taking the address of locally-defined text would be OK, but that
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index f7eaf01cb77e..6007f6fc9c4c 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -45,6 +45,7 @@ protected:
bool HasTransactionalExecution;
bool HasProcessorAssist;
bool HasVector;
+ bool HasLoadStoreOnCond2;
private:
Triple TargetTriple;
@@ -69,7 +70,7 @@ public:
const SystemZTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
@@ -85,6 +86,9 @@ public:
// Return true if the target has the load/store-on-condition facility.
bool hasLoadStoreOnCond() const { return HasLoadStoreOnCond; }
+ // Return true if the target has the load/store-on-condition facility 2.
+ bool hasLoadStoreOnCond2() const { return HasLoadStoreOnCond2; }
+
// Return true if the target has the high-word facility.
bool hasHighWord() const { return HasHighWord; }
@@ -116,8 +120,7 @@ public:
// Return true if GV can be accessed using LARL for reloc model RM
// and code model CM.
- bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
- CodeModel::Model CM) const;
+ bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
};
diff --git a/lib/Target/SystemZ/SystemZTDC.cpp b/lib/Target/SystemZ/SystemZTDC.cpp
new file mode 100644
index 000000000000..96a9ef82c125
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTDC.cpp
@@ -0,0 +1,382 @@
+//===-- SystemZTDC.cpp - Utilize Test Data Class instruction --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for instructions that can be replaced by a Test Data Class
+// instruction, and replaces them when profitable.
+//
+// Roughly, the following rules are recognized:
+//
+// 1: fcmp pred X, 0 -> tdc X, mask
+// 2: fcmp pred X, +-inf -> tdc X, mask
+// 3: fcmp pred X, +-minnorm -> tdc X, mask
+// 4: tdc (fabs X), mask -> tdc X, newmask
+// 5: icmp slt (bitcast float X to int), 0 -> tdc X, mask [i.e. signbit]
+// 6: icmp sgt (bitcast float X to int), -1 -> tdc X, mask
+// 7: icmp ne/eq (call @llvm.s390.tdc.*(X, mask)) -> tdc X, mask/~mask
+// 8: and i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 & M2)
+// 9: or i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 | M2)
+// 10: xor i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 ^ M2)
+//
+// The pass works in 4 steps:
+//
+// 1. All fcmp and icmp instructions in a function are checked for a match
+// with rules 1-3 and 5-7. Their TDC equivalents are stored in
+// the ConvertedInsts mapping. If the operand of a fcmp instruction is
+// a fabs, it's also folded according to rule 4.
+// 2. All and/or/xor i1 instructions both of whose operands have already
+//    been mapped are mapped according to rules 8-10. LogicOpsWorklist is
+//    used as a queue of instructions to check.
+// 3. All mapped instructions that are considered worthy of conversion (i.e.
+//    replacing them will actually simplify the final code) are replaced
+// with a call to the s390.tdc intrinsic.
+// 4. All intermediate results of replaced instructions are removed if unused.
+//
+// Instructions that match rules 1-3 are considered unworthy of conversion
+// on their own (since a comparison instruction is superior), but are mapped
+// in the hopes of folding the result using rules 4 and 8-10 (likely removing
+// the original comparison in the process).
+//
+//===----------------------------------------------------------------------===//
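
As a hedged sketch of rules 2 and 4 in action (not text from the patch; the
intrinsic mangling and the mask value assume the usual TDC class-bit layout,
inf+ = 0x20 and inf- = 0x10), a comparison of fabs(x) against +infinity is
intended to become a single class test:

    ; before
    %a = call double @llvm.fabs.f64(double %x)
    %c = fcmp oeq double %a, 0x7FF0000000000000
    ; after (conceptual): the pass emits the intrinsic plus a compare with 0
    %t = call i32 @llvm.s390.tdc.f64(double %x, i64 48)   ; 0x30 = inf+ | inf-
    %c = icmp ne i32 %t, 0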
+
+#include "SystemZ.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include <deque>
+#include <set>
+
+using namespace llvm;
+
+namespace llvm {
+ void initializeSystemZTDCPassPass(PassRegistry&);
+}
+
+namespace {
+
+class SystemZTDCPass : public FunctionPass {
+public:
+ static char ID;
+ SystemZTDCPass() : FunctionPass(ID) {
+ initializeSystemZTDCPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+private:
+ // Maps seen instructions that can be mapped to a TDC, values are
+ // (TDC operand, TDC mask, worthy flag) triples.
+ MapVector<Instruction *, std::tuple<Value *, int, bool>> ConvertedInsts;
+ // The queue of and/or/xor i1 instructions to be potentially folded.
+ std::vector<BinaryOperator *> LogicOpsWorklist;
+ // Instructions matched while folding, to be removed at the end if unused.
+ std::set<Instruction *> PossibleJunk;
+
+ // Tries to convert a fcmp instruction.
+ void convertFCmp(CmpInst &I);
+
+ // Tries to convert an icmp instruction.
+ void convertICmp(CmpInst &I);
+
+  // Tries to convert an i1 and/or/xor instruction both of whose operands
+  // have already been converted.
+ void convertLogicOp(BinaryOperator &I);
+
+ // Marks an instruction as converted - adds it to ConvertedInsts and adds
+ // any and/or/xor i1 users to the queue.
+ void converted(Instruction *I, Value *V, int Mask, bool Worthy) {
+ ConvertedInsts[I] = std::make_tuple(V, Mask, Worthy);
+ auto &M = *I->getFunction()->getParent();
+ auto &Ctx = M.getContext();
+ for (auto *U : I->users()) {
+ auto *LI = dyn_cast<BinaryOperator>(U);
+ if (LI && LI->getType() == Type::getInt1Ty(Ctx) &&
+ (LI->getOpcode() == Instruction::And ||
+ LI->getOpcode() == Instruction::Or ||
+ LI->getOpcode() == Instruction::Xor)) {
+ LogicOpsWorklist.push_back(LI);
+ }
+ }
+ }
+};
+
+} // end anonymous namespace
+
+char SystemZTDCPass::ID = 0;
+INITIALIZE_PASS(SystemZTDCPass, "systemz-tdc",
+ "SystemZ Test Data Class optimization", false, false)
+
+FunctionPass *llvm::createSystemZTDCPass() {
+ return new SystemZTDCPass();
+}
+
+void SystemZTDCPass::convertFCmp(CmpInst &I) {
+ Value *Op0 = I.getOperand(0);
+ auto *Const = dyn_cast<ConstantFP>(I.getOperand(1));
+ auto Pred = I.getPredicate();
+ // Only comparisons with consts are interesting.
+ if (!Const)
+ return;
+ // Compute the smallest normal number (and its negation).
+ auto &Sem = Op0->getType()->getFltSemantics();
+ APFloat Smallest = APFloat::getSmallestNormalized(Sem);
+ APFloat NegSmallest = Smallest;
+ NegSmallest.changeSign();
+ // Check if Const is one of our recognized consts.
+ int WhichConst;
+ if (Const->isZero()) {
+ // All comparisons with 0 can be converted.
+ WhichConst = 0;
+ } else if (Const->isInfinity()) {
+ // Likewise for infinities.
+ WhichConst = Const->isNegative() ? 2 : 1;
+ } else if (Const->isExactlyValue(Smallest)) {
+ // For Smallest, we cannot do EQ separately from GT.
+ if ((Pred & CmpInst::FCMP_OGE) != CmpInst::FCMP_OGE &&
+ (Pred & CmpInst::FCMP_OGE) != 0)
+ return;
+ WhichConst = 3;
+ } else if (Const->isExactlyValue(NegSmallest)) {
+ // Likewise for NegSmallest, we cannot do EQ separately from LT.
+ if ((Pred & CmpInst::FCMP_OLE) != CmpInst::FCMP_OLE &&
+ (Pred & CmpInst::FCMP_OLE) != 0)
+ return;
+ WhichConst = 4;
+ } else {
+ // Not one of our special constants.
+ return;
+ }
+ // Partial masks to use for EQ, GT, LT, UN comparisons, respectively.
+ static const int Masks[][4] = {
+ { // 0
+ SystemZ::TDCMASK_ZERO, // eq
+ SystemZ::TDCMASK_POSITIVE, // gt
+ SystemZ::TDCMASK_NEGATIVE, // lt
+ SystemZ::TDCMASK_NAN, // un
+ },
+ { // inf
+ SystemZ::TDCMASK_INFINITY_PLUS, // eq
+ 0, // gt
+ (SystemZ::TDCMASK_ZERO |
+ SystemZ::TDCMASK_NEGATIVE |
+ SystemZ::TDCMASK_NORMAL_PLUS |
+ SystemZ::TDCMASK_SUBNORMAL_PLUS), // lt
+ SystemZ::TDCMASK_NAN, // un
+ },
+ { // -inf
+ SystemZ::TDCMASK_INFINITY_MINUS, // eq
+ (SystemZ::TDCMASK_ZERO |
+ SystemZ::TDCMASK_POSITIVE |
+ SystemZ::TDCMASK_NORMAL_MINUS |
+ SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+ 0, // lt
+ SystemZ::TDCMASK_NAN, // un
+ },
+ { // minnorm
+ 0, // eq (unsupported)
+ (SystemZ::TDCMASK_NORMAL_PLUS |
+ SystemZ::TDCMASK_INFINITY_PLUS), // gt (actually ge)
+ (SystemZ::TDCMASK_ZERO |
+ SystemZ::TDCMASK_NEGATIVE |
+ SystemZ::TDCMASK_SUBNORMAL_PLUS), // lt
+ SystemZ::TDCMASK_NAN, // un
+ },
+ { // -minnorm
+ 0, // eq (unsupported)
+ (SystemZ::TDCMASK_ZERO |
+ SystemZ::TDCMASK_POSITIVE |
+ SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+ (SystemZ::TDCMASK_NORMAL_MINUS |
+ SystemZ::TDCMASK_INFINITY_MINUS), // lt (actually le)
+ SystemZ::TDCMASK_NAN, // un
+ }
+ };
+ // Construct the mask as a combination of the partial masks.
+ int Mask = 0;
+ if (Pred & CmpInst::FCMP_OEQ)
+ Mask |= Masks[WhichConst][0];
+ if (Pred & CmpInst::FCMP_OGT)
+ Mask |= Masks[WhichConst][1];
+ if (Pred & CmpInst::FCMP_OLT)
+ Mask |= Masks[WhichConst][2];
+ if (Pred & CmpInst::FCMP_UNO)
+ Mask |= Masks[WhichConst][3];
+ // A lone fcmp is unworthy of tdc conversion on its own, but may become
+ // worthy if combined with fabs.
+ bool Worthy = false;
+ if (CallInst *CI = dyn_cast<CallInst>(Op0)) {
+ Function *F = CI->getCalledFunction();
+ if (F && F->getIntrinsicID() == Intrinsic::fabs) {
+ // Fold with fabs - adjust the mask appropriately.
+ Mask &= SystemZ::TDCMASK_PLUS;
+ Mask |= Mask >> 1;
+ Op0 = CI->getArgOperand(0);
+ // A combination of fcmp with fabs is a win, unless the constant
+ // involved is 0 (which is handled by later passes).
+ Worthy = WhichConst != 0;
+ PossibleJunk.insert(CI);
+ }
+ }
+ converted(&I, Op0, Mask, Worthy);
+}
+
+void SystemZTDCPass::convertICmp(CmpInst &I) {
+ Value *Op0 = I.getOperand(0);
+ auto *Const = dyn_cast<ConstantInt>(I.getOperand(1));
+ auto Pred = I.getPredicate();
+ // All our icmp rules involve comparisons with consts.
+ if (!Const)
+ return;
+ if (auto *Cast = dyn_cast<BitCastInst>(Op0)) {
+ // Check for icmp+bitcast used for signbit.
+ if (!Cast->getSrcTy()->isFloatTy() &&
+ !Cast->getSrcTy()->isDoubleTy() &&
+ !Cast->getSrcTy()->isFP128Ty())
+ return;
+ Value *V = Cast->getOperand(0);
+ int Mask;
+ if (Pred == CmpInst::ICMP_SLT && Const->isZero()) {
+ // icmp slt (bitcast X), 0 - set if sign bit true
+ Mask = SystemZ::TDCMASK_MINUS;
+ } else if (Pred == CmpInst::ICMP_SGT && Const->isMinusOne()) {
+ // icmp sgt (bitcast X), -1 - set if sign bit false
+ Mask = SystemZ::TDCMASK_PLUS;
+ } else {
+ // Not a sign bit check.
+ return;
+ }
+ PossibleJunk.insert(Cast);
+ converted(&I, V, Mask, true);
+ } else if (auto *CI = dyn_cast<CallInst>(Op0)) {
+ // Check if this is a pre-existing call of our tdc intrinsic.
+ Function *F = CI->getCalledFunction();
+ if (!F || F->getIntrinsicID() != Intrinsic::s390_tdc)
+ return;
+ if (!Const->isZero())
+ return;
+ Value *V = CI->getArgOperand(0);
+ auto *MaskC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ // Bail if the mask is not a constant.
+ if (!MaskC)
+ return;
+ int Mask = MaskC->getZExtValue();
+ Mask &= SystemZ::TDCMASK_ALL;
+ if (Pred == CmpInst::ICMP_NE) {
+ // icmp ne (call llvm.s390.tdc(...)), 0 -> simple TDC
+ } else if (Pred == CmpInst::ICMP_EQ) {
+ // icmp eq (call llvm.s390.tdc(...)), 0 -> TDC with inverted mask
+ Mask ^= SystemZ::TDCMASK_ALL;
+ } else {
+ // An unknown comparison - ignore.
+ return;
+ }
+ PossibleJunk.insert(CI);
+ converted(&I, V, Mask, false);
+ }
+}
+
+void SystemZTDCPass::convertLogicOp(BinaryOperator &I) {
+ Value *Op0, *Op1;
+ int Mask0, Mask1;
+ bool Worthy0, Worthy1;
+  std::tie(Op0, Mask0, Worthy0) =
+      ConvertedInsts[cast<Instruction>(I.getOperand(0))];
+  std::tie(Op1, Mask1, Worthy1) =
+      ConvertedInsts[cast<Instruction>(I.getOperand(1))];
+ if (Op0 != Op1)
+ return;
+ int Mask;
+ switch (I.getOpcode()) {
+ case Instruction::And:
+ Mask = Mask0 & Mask1;
+ break;
+ case Instruction::Or:
+ Mask = Mask0 | Mask1;
+ break;
+ case Instruction::Xor:
+ Mask = Mask0 ^ Mask1;
+ break;
+ default:
+ llvm_unreachable("Unknown op in convertLogicOp");
+ }
+ converted(&I, Op0, Mask, true);
+}
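
For example (illustrative only; 1008 = 0x3f0 assumes the usual TDC layout,
i.e. TDCMASK_NEGATIVE | TDCMASK_POSITIVE), an ordered "x != 0.0" check split
into two compares folds by rule 9 into a single worthy TDC:

    ; before: each fcmp maps to a TDC by rule 1, the or combines them
    %lt = fcmp olt double %x, 0.000000e+00
    %gt = fcmp ogt double %x, 0.000000e+00
    %ne = or i1 %lt, %gt
    ; after
    %t  = call i32 @llvm.s390.tdc.f64(double %x, i64 1008)
    %ne = icmp ne i32 %t, 0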
+
+bool SystemZTDCPass::runOnFunction(Function &F) {
+ ConvertedInsts.clear();
+ LogicOpsWorklist.clear();
+ PossibleJunk.clear();
+
+ // Look for icmp+fcmp instructions.
+ for (auto &I : instructions(F)) {
+ if (I.getOpcode() == Instruction::FCmp)
+ convertFCmp(cast<CmpInst>(I));
+ else if (I.getOpcode() == Instruction::ICmp)
+ convertICmp(cast<CmpInst>(I));
+ }
+
+  // If none were found, bail out early.
+ if (ConvertedInsts.empty())
+ return false;
+
+ // Process the queue of logic instructions.
+ while (!LogicOpsWorklist.empty()) {
+ BinaryOperator *Op = LogicOpsWorklist.back();
+ LogicOpsWorklist.pop_back();
+    // If both operands are mapped and the instruction itself is not yet
+    // mapped, convert it.
+ if (ConvertedInsts.count(dyn_cast<Instruction>(Op->getOperand(0))) &&
+ ConvertedInsts.count(dyn_cast<Instruction>(Op->getOperand(1))) &&
+ !ConvertedInsts.count(Op))
+ convertLogicOp(*Op);
+ }
+
+ // Time to actually replace the instructions. Do it in the reverse order
+ // of finding them, since there's a good chance the earlier ones will be
+ // unused (due to being folded into later ones).
+ Module &M = *F.getParent();
+ auto &Ctx = M.getContext();
+ Value *Zero32 = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+ bool MadeChange = false;
+ for (auto &It : reverse(ConvertedInsts)) {
+ Instruction *I = It.first;
+ Value *V;
+ int Mask;
+ bool Worthy;
+ std::tie(V, Mask, Worthy) = It.second;
+ if (!I->user_empty()) {
+ // If used and unworthy of conversion, skip it.
+ if (!Worthy)
+ continue;
+ // Call the intrinsic, compare result with 0.
+ Value *TDCFunc = Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc,
+ V->getType());
+ IRBuilder<> IRB(I);
+ Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask);
+ Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal});
+ Value *ICmp = IRB.CreateICmp(CmpInst::ICMP_NE, TDC, Zero32);
+ I->replaceAllUsesWith(ICmp);
+ }
+ // If unused, or used and converted, remove it.
+ I->eraseFromParent();
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return false;
+
+ // We've actually done something - now clear misc accumulated junk (fabs,
+ // bitcast).
+ for (auto *I : PossibleJunk)
+ if (I->user_empty())
+ I->eraseFromParent();
+
+ return true;
+}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index f305e85f6cfe..85a3f6f4a8be 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,6 +10,7 @@
#include "SystemZTargetMachine.h"
#include "SystemZTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -79,13 +80,22 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
return Ret;
}
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ // Static code is suitable for use in a dynamic executable; there is no
+ // separate DynamicNoPIC model.
+ if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+ return Reloc::Static;
+ return *RM;
+}
+
SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
- RM, CM, OL),
+ getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
@@ -112,6 +122,9 @@ public:
} // end anonymous namespace
void SystemZPassConfig::addIRPasses() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createSystemZTDCPass());
+
TargetPassConfig::addIRPasses();
}
@@ -125,8 +138,7 @@ bool SystemZPassConfig::addInstSelector() {
}
void SystemZPassConfig::addPreSched2() {
- if (getOptLevel() != CodeGenOpt::None &&
- getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
+ if (getOptLevel() != CodeGenOpt::None)
addPass(&IfConverterID);
}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 1a8f1f7f3aaa..69cf9bc6e525 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -29,7 +29,7 @@ class SystemZTargetMachine : public LLVMTargetMachine {
public:
SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~SystemZTargetMachine() override;
diff --git a/lib/Target/SystemZ/TargetInfo/Makefile b/lib/Target/SystemZ/TargetInfo/Makefile
deleted file mode 100644
index 0be80eb4e6ad..000000000000
--- a/lib/Target/SystemZ/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/SystemZ/TargetInfo/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMSystemZInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 1b74e8cba4fe..5d1616d03779 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -24,6 +24,9 @@
using namespace llvm;
+// Avoid including "llvm-c/Core.h" for compile time, fwd-declare this instead.
+extern "C" LLVMContextRef LLVMGetGlobalContext(void);
+
inline TargetLibraryInfoImpl *unwrap(LLVMTargetLibraryInfoRef P) {
return reinterpret_cast<TargetLibraryInfoImpl*>(P);
}
@@ -42,11 +45,20 @@ void LLVMInitializeTarget(LLVMPassRegistryRef R) {
initializeTarget(*unwrap(R));
}
+LLVMTargetDataRef LLVMGetModuleDataLayout(LLVMModuleRef M) {
+ return wrap(&unwrap(M)->getDataLayout());
+}
+
+void LLVMSetModuleDataLayout(LLVMModuleRef M, LLVMTargetDataRef DL) {
+ unwrap(M)->setDataLayout(*unwrap(DL));
+}
+
LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
return wrap(new DataLayout(StringRep));
}
-void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+ delete unwrap(TD);
}
void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
@@ -72,11 +84,11 @@ unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
}
LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
- return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
+ return wrap(unwrap(TD)->getIntPtrType(*unwrap(LLVMGetGlobalContext())));
}
LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
- return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
+ return wrap(unwrap(TD)->getIntPtrType(*unwrap(LLVMGetGlobalContext()), AS));
}
LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
@@ -127,7 +139,3 @@ unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructT
StructType *STy = unwrap<StructType>(StructTy);
return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
}
-
-void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
- delete unwrap(TD);
-}
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index a0b0d8f24046..f863f429f43c 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
void TargetLoweringObjectFile::Initialize(MCContext &ctx,
const TargetMachine &TM) {
Ctx = &ctx;
- InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(),
+ InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(),
TM.getCodeModel(), *Ctx);
}
@@ -173,7 +173,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// If the global is required to have a unique address, it can't be put
// into a mergable section: just drop it into the general read-only
// section instead.
- if (!GVar->hasUnnamedAddr())
+ if (!GVar->hasGlobalUnnamedAddr())
return SectionKind::getReadOnly();
// If initializer is a null-terminated string, put it in a "cstring"
@@ -202,6 +202,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
case 4: return SectionKind::getMergeableConst4();
case 8: return SectionKind::getMergeableConst8();
case 16: return SectionKind::getMergeableConst16();
+ case 32: return SectionKind::getMergeableConst32();
default:
return SectionKind::getReadOnly();
}
@@ -221,16 +222,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
}
}
- // Okay, this isn't a constant. If the initializer for the global is going
- // to require a runtime relocation by the dynamic linker, put it into a more
- // specific section to improve startup time of the app. This coalesces these
- // globals together onto fewer pages, improving the locality of the dynamic
- // linker.
- if (ReloModel == Reloc::Static)
- return SectionKind::getData();
-
- if (C->needsRelocation())
- return SectionKind::getData();
+ // Okay, this isn't a constant.
return SectionKind::getData();
}
@@ -252,8 +244,10 @@ TargetLoweringObjectFile::SectionForGlobal(const GlobalValue *GV,
MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+ unsigned Align = 0;
return getSectionForConstant(F.getParent()->getDataLayout(),
- SectionKind::getReadOnly(), /*C=*/nullptr);
+ SectionKind::getReadOnly(), /*C=*/nullptr,
+ Align);
}
bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
@@ -277,7 +271,8 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
/// Given a mergable constant with the specified size and relocation
/// information, return a section that it should be placed in.
MCSection *TargetLoweringObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
if (Kind.isReadOnly() && ReadOnlySection != nullptr)
return ReadOnlySection;
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 850c93cb21b8..82c68505c4e1 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -18,21 +18,23 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SectionKind.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
+cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
+ cl::desc("Enable interprocedural register allocation "
+ "to reduce load/store at procedure calls."));
+
//---------------------------------------------------------------------------
// TargetMachine Class
//
@@ -41,18 +43,23 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options)
: TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
- TargetFS(FS), CodeGenInfo(nullptr), AsmInfo(nullptr), MRI(nullptr),
- MII(nullptr), STI(nullptr), RequireStructuredCFG(false),
- Options(Options) {}
+ TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
+ RequireStructuredCFG(false), Options(Options) {
+ if (EnableIPRA.getNumOccurrences())
+ this->Options.EnableIPRA = EnableIPRA;
+}
TargetMachine::~TargetMachine() {
- delete CodeGenInfo;
delete AsmInfo;
delete MRI;
delete MII;
delete STI;
}
+bool TargetMachine::isPositionIndependent() const {
+ return getRelocationModel() == Reloc::PIC_;
+}
+
/// \brief Reset the target options based on the function's attributes.
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
@@ -72,21 +79,13 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
}
-/// getRelocationModel - Returns the code generation relocation model. The
-/// choices are static, PIC, and dynamic-no-pic, and target default.
-Reloc::Model TargetMachine::getRelocationModel() const {
- if (!CodeGenInfo)
- return Reloc::Default;
- return CodeGenInfo->getRelocationModel();
-}
+/// Returns the code generation relocation model. The choices are static, PIC,
+/// and dynamic-no-pic.
+Reloc::Model TargetMachine::getRelocationModel() const { return RM; }
-/// getCodeModel - Returns the code model. The choices are small, kernel,
-/// medium, large, and target default.
-CodeModel::Model TargetMachine::getCodeModel() const {
- if (!CodeGenInfo)
- return CodeModel::Default;
- return CodeGenInfo->getCodeModel();
-}
+/// Returns the code model. The choices are small, kernel, medium, large, and
+/// target default.
+CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; }
/// Get the IR-specified TLS model for Var.
static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
@@ -106,23 +105,65 @@ static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
llvm_unreachable("invalid TLS model");
}
+// FIXME: make this a proper option
+static bool CanUseCopyRelocWithPIE = false;
+
+bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
+ const GlobalValue *GV) const {
+ Reloc::Model RM = getRelocationModel();
+ const Triple &TT = getTargetTriple();
+
+ // DLLImport explicitly marks the GV as external.
+ if (GV && GV->hasDLLImportStorageClass())
+ return false;
+
+  // Every other GV is local on COFF.
+ if (TT.isOSBinFormatCOFF())
+ return true;
+
+ if (GV && (GV->hasLocalLinkage() || !GV->hasDefaultVisibility()))
+ return true;
+
+ if (TT.isOSBinFormatMachO()) {
+ if (RM == Reloc::Static)
+ return true;
+ return GV && GV->isStrongDefinitionForLinker();
+ }
+
+ assert(TT.isOSBinFormatELF());
+ assert(RM != Reloc::DynamicNoPIC);
+
+ bool IsExecutable =
+ RM == Reloc::Static || M.getPIELevel() != PIELevel::Default;
+ if (IsExecutable) {
+ // If the symbol is defined, it cannot be preempted.
+ if (GV && !GV->isDeclarationForLinker())
+ return true;
+
+ bool IsTLS = GV && GV->isThreadLocal();
+ // Check if we can use copy relocations.
+ if (!IsTLS && (RM == Reloc::Static || CanUseCopyRelocWithPIE))
+ return true;
+ }
+
+ // ELF supports preemption of other symbols.
+ return false;
+}
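
A minimal usage sketch of the new hook (the helper, symbol name, and includes
are assumptions, not part of the change):

    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    // Ask whether a reference to Name may bypass GOT/PLT indirection.
    static bool canAssumeLocal(const TargetMachine &TM, const Module &M,
                               StringRef Name) {
      const GlobalValue *GV = M.getNamedValue(Name);
      // True on COFF (unless dllimport), for local linkage or non-default
      // visibility, and for defined symbols in static or PIE ELF links.
      return TM.shouldAssumeDSOLocal(M, GV);
    }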
+
TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
- bool isLocal = GV->hasLocalLinkage();
- bool isDeclaration = GV->isDeclaration();
- bool isPIC = getRelocationModel() == Reloc::PIC_;
- bool isPIE = Options.PositionIndependentExecutable;
- // FIXME: what should we do for protected and internal visibility?
- // For variables, is internal different from hidden?
- bool isHidden = GV->hasHiddenVisibility();
+ bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
+ Reloc::Model RM = getRelocationModel();
+ bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE;
+ bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV);
TLSModel::Model Model;
- if (isPIC && !isPIE) {
- if (isLocal || isHidden)
+ if (IsSharedLibrary) {
+ if (IsLocal)
Model = TLSModel::LocalDynamic;
else
Model = TLSModel::GeneralDynamic;
} else {
- if (!isDeclaration || isHidden)
+ if (IsLocal)
Model = TLSModel::LocalExec;
else
Model = TLSModel::InitialExec;
@@ -136,18 +177,10 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
return Model;
}
-/// getOptLevel - Returns the optimization level: None, Less,
-/// Default, or Aggressive.
-CodeGenOpt::Level TargetMachine::getOptLevel() const {
- if (!CodeGenInfo)
- return CodeGenOpt::Default;
- return CodeGenInfo->getOptLevel();
-}
+/// Returns the optimization level: None, Less, Default, or Aggressive.
+CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
-void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
- if (CodeGenInfo)
- CodeGenInfo->setOptLevel(Level);
-}
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index f82566c37baa..02836eaf08e5 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -18,7 +18,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CodeGenCWrappers.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
@@ -32,15 +32,6 @@
using namespace llvm;
-namespace llvm {
-// Friend to the TargetMachine, access legacy API that are made private in C++
-struct C_API_PRIVATE_ACCESS {
- static const DataLayout &getDataLayout(const TargetMachine &T) {
- return T.getDataLayout();
- }
-};
-}
-
static TargetMachine *unwrap(LLVMTargetMachineRef P) {
return reinterpret_cast<TargetMachine *>(P);
}
@@ -114,7 +105,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
const char* Triple, const char* CPU, const char* Features,
LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
LLVMCodeModel CodeModel) {
- Reloc::Model RM;
+ Optional<Reloc::Model> RM;
switch (Reloc){
case LLVMRelocStatic:
RM = Reloc::Static;
@@ -126,7 +117,6 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
RM = Reloc::DynamicNoPIC;
break;
default:
- RM = Reloc::Default;
break;
}
@@ -175,16 +165,15 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
return strdup(StringRep.c_str());
}
-/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */
-LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
- return wrap(&C_API_PRIVATE_ACCESS::getDataLayout(*unwrap(T)));
-}
-
void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
LLVMBool VerboseAsm) {
unwrap(T)->Options.MCOptions.AsmVerbose = VerboseAsm;
}
+LLVMTargetDataRef LLVMCreateTargetDataLayout(LLVMTargetMachineRef T) {
+ return wrap(new DataLayout(unwrap(T)->createDataLayout()));
+}
+
static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
raw_pwrite_stream &OS,
LLVMCodeGenFileType codegen,
diff --git a/lib/Target/TargetRecip.cpp b/lib/Target/TargetRecip.cpp
index d41b6436928b..183fa5062eab 100644
--- a/lib/Target/TargetRecip.cpp
+++ b/lib/Target/TargetRecip.cpp
@@ -14,11 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/StringRef.h"
+#include "llvm/Target/TargetRecip.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetRecip.h"
-#include <map>
using namespace llvm;
@@ -157,9 +156,10 @@ void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
// If the precision was not specified, the double entry is also initialized.
if (Val.back() != 'f' && Val.back() != 'd') {
- RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
+ RecipParams &Params = RecipMap[Val.str() + 'd'];
+ Params.Enabled = !IsDisabled;
if (!RefStepString.empty())
- RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
+ Params.RefinementSteps = RefSteps;
}
}
}
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index 6a61fcdf0f86..c3f94a99b4ca 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -11,8 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/CommandLine.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index e5c68e598479..b2865f1a0f9e 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -10,11 +10,11 @@ tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(WebAssemblyCommonTableGen)
add_llvm_target(WebAssemblyCodeGen
- Relooper.cpp
WebAssemblyArgumentMove.cpp
WebAssemblyAsmPrinter.cpp
WebAssemblyCFGStackify.cpp
WebAssemblyFastISel.cpp
+ WebAssemblyFixIrreducibleControlFlow.cpp
WebAssemblyFrameLowering.cpp
WebAssemblyISelDAGToDAG.cpp
WebAssemblyISelLowering.cpp
@@ -22,14 +22,17 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyLowerBrUnless.cpp
WebAssemblyMachineFunctionInfo.cpp
WebAssemblyMCInstLower.cpp
+ WebAssemblyOptimizeLiveIntervals.cpp
WebAssemblyOptimizeReturned.cpp
WebAssemblyPeephole.cpp
- WebAssemblyPEI.cpp
+ WebAssemblyPrepareForLiveIntervals.cpp
WebAssemblyRegisterInfo.cpp
WebAssemblyRegColoring.cpp
WebAssemblyRegNumbering.cpp
WebAssemblyRegStackify.cpp
+ WebAssemblyReplacePhysRegs.cpp
WebAssemblySelectionDAGInfo.cpp
+ WebAssemblySetP2AlignOperands.cpp
WebAssemblyStoreResults.cpp
WebAssemblySubtarget.cpp
WebAssemblyTargetMachine.cpp
diff --git a/lib/Target/WebAssembly/Disassembler/Makefile b/lib/Target/WebAssembly/Disassembler/Makefile
deleted file mode 100644
index bcd36ba6f01f..000000000000
--- a/lib/Target/WebAssembly/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyDisassembler
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 0143b10c0ab1..c0355aef0b35 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -18,7 +18,7 @@
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -93,6 +93,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
const MCOperandInfo &Info = Desc.OpInfo[i];
switch (Info.OperandType) {
case MCOI::OPERAND_IMMEDIATE:
+ case WebAssembly::OPERAND_P2ALIGN:
case WebAssembly::OPERAND_BASIC_BLOCK: {
if (Pos + sizeof(uint64_t) > Bytes.size())
return MCDisassembler::Fail;
@@ -109,7 +110,8 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
MI.addOperand(MCOperand::createReg(Reg));
break;
}
- case WebAssembly::OPERAND_FPIMM: {
+ case WebAssembly::OPERAND_FP32IMM:
+ case WebAssembly::OPERAND_FP64IMM: {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
if (Pos + sizeof(uint64_t) > Bytes.size())
diff --git a/lib/Target/WebAssembly/InstPrinter/Makefile b/lib/Target/WebAssembly/InstPrinter/Makefile
deleted file mode 100644
index 87534379f796..000000000000
--- a/lib/Target/WebAssembly/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/WebAssembly/AsmPrinter/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyAsmPrinter
-
-# Hack: we need to include 'main' wasm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 9a95150cb557..267d716dd1d0 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -110,14 +110,22 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
}
static std::string toString(const APFloat &FP) {
+ // Print NaNs with custom payloads specially.
+ if (FP.isNaN() &&
+ !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) &&
+ !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) {
+ APInt AI = FP.bitcastToAPInt();
+ return
+ std::string(AI.isNegative() ? "-" : "") + "nan:0x" +
+ utohexstr(AI.getZExtValue() &
+ (AI.getBitWidth() == 32 ? INT64_C(0x007fffff) :
+ INT64_C(0x000fffffffffffff)),
+ /*LowerCase=*/true);
+ }
+
+ // Use C99's hexadecimal floating-point representation.
static const size_t BufBytes = 128;
char buf[BufBytes];
- if (FP.isNaN())
- assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) ||
- FP.bitwiseIsEqual(
- APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) &&
- "convertToHexString handles neither SNaN nor NaN payloads");
- // Use C99's hexadecimal floating-point representation.
auto Written = FP.convertToHexString(
buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven);
(void)Written;
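
For the NaN special case added above, the printed payload is just the
significand bits in lower-case hex (bit patterns below assume the IEEE-754
binary64 layout); default quiet NaNs still take the hexadecimal-float path:

    0x7ff8000000000001  ->  "nan:0x8000000000001"
    0xfff8000000000002  ->  "-nan:0x8000000000002"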
@@ -137,11 +145,11 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (int(WAReg) >= 0)
printRegName(O, WAReg);
else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs())
- O << "$pop" << (WAReg & INT32_MAX);
+ O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else if (WAReg != WebAssemblyFunctionInfo::UnusedReg)
- O << "$push" << (WAReg & INT32_MAX);
+ O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else
- O << "$discard";
+ O << "$drop";
// Add a '=' suffix if this is a def.
if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
O << '=';
@@ -157,10 +165,20 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// control flow stack, and it may be nice to pretty-print.
O << Op.getImm();
} else if (Op.isFPImm()) {
- assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
- MII.get(MI->getOpcode()).TSFlags == 0) &&
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ assert(OpNo < Desc.getNumOperands() &&
+ "Unexpected floating-point immediate as a non-fixed operand");
+ assert(Desc.TSFlags == 0 &&
"WebAssembly variable_ops floating point ops don't use TSFlags");
- O << toString(APFloat(Op.getFPImm()));
+ const MCOperandInfo &Info = Desc.OpInfo[OpNo];
+ if (Info.OperandType == WebAssembly::OPERAND_FP32IMM) {
+ // TODO: MC converts all floating point immediate operands to double.
+ // This is fine for numeric values, but may cause NaNs to change bits.
+ O << toString(APFloat(float(Op.getFPImm())));
+ } else {
+ assert(Info.OperandType == WebAssembly::OPERAND_FP64IMM);
+ O << toString(APFloat(Op.getFPImm()));
+ }
} else {
assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
(MII.get(MI->getOpcode()).TSFlags &
@@ -172,6 +190,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
+void
+WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
+ return;
+ O << ":p2align=" << Imm;
+}
+
const char *llvm::WebAssembly::TypeToString(MVT Ty) {
switch (Ty.SimpleTy) {
case MVT::i32:
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index cd6c59a41c33..07b0f914e447 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -15,8 +15,9 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -36,6 +37,8 @@ public:
// Used by tblegen code.
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/Makefile b/lib/Target/WebAssembly/MCTargetDesc/Makefile
deleted file mode 100644
index 11dcb4ff6075..000000000000
--- a/lib/Target/WebAssembly/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/WebAssembly/TargetDesc/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index bba06f65e169..df6fb8968d56 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -55,7 +55,8 @@ public:
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
@@ -73,8 +74,10 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
- unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8);
- if (!Value)
+ assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+ unsigned NumBytes = (Info.TargetSize + 7) / 8;
+ if (Value == 0)
return; // Doesn't change encoding.
// Shift the value into position.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
index 2bb58b33934e..2146f67959b8 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
@@ -25,8 +25,8 @@ public:
WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
} // end anonymous namespace
@@ -35,7 +35,8 @@ WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
: MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
/*HasRelocationAddend=*/false) {}
-unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
// WebAssembly functions are not allocated in the address space. To resolve a
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 02c717a92101..d8c39216c53b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -15,7 +15,6 @@
#include "WebAssemblyMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
@@ -48,4 +47,7 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
ExceptionsType = ExceptionHandling::None;
// TODO: UseIntegratedAssembler?
+
+ // WebAssembly's stack is never executable.
+ UsesNonexecutableStackSection = false;
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index f409bd77442c..23f8b3d0e827 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -33,7 +34,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created.");
namespace {
class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCInstrInfo &MCII;
- const MCContext &Ctx;
// Implementation generated by tablegen.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -45,14 +45,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
};
} // end anonymous namespace
-MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx) {
- return new WebAssemblyMCCodeEmitter(MCII, Ctx);
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
+ return new WebAssemblyMCCodeEmitter(MCII);
}
void WebAssemblyMCCodeEmitter::encodeInstruction(
@@ -78,7 +76,8 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
support::endian::Writer<support::little>(OS).write<uint64_t>(0);
Fixups.push_back(MCFixup::create(
(1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t),
- MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
+ MO.getExpr(),
+ STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
MI.getLoc()));
++MCNumFixups;
} else {
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 37000f1cd571..ac11a64086f2 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCAsmInfo.h"
#include "WebAssemblyTargetStreamer.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -40,6 +39,15 @@ static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
return new WebAssemblyMCAsmInfo(TT);
}
+static void adjustCodeGenOpts(const Triple & /*TT*/, Reloc::Model /*RM*/,
+ CodeModel::Model &CM) {
+ CodeModel::Model M = (CM == CodeModel::Default || CM == CodeModel::JITDefault)
+ ? CodeModel::Large
+ : CM;
+ if (M != CodeModel::Large)
+ report_fatal_error("Non-large code models are not supported yet");
+}
+
static MCInstrInfo *createMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitWebAssemblyMCInstrInfo(X);
@@ -57,14 +65,14 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- assert(SyntaxVariant == 0);
+ assert(SyntaxVariant == 0 && "WebAssembly only has one syntax variant");
return new WebAssemblyInstPrinter(MAI, MII, MRI);
}
static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo & /*MRI*/,
- MCContext &Ctx) {
- return createWebAssemblyMCCodeEmitter(MCII, Ctx);
+ MCContext & /*Ctx*/) {
+ return createWebAssemblyMCCodeEmitter(MCII);
}
static MCAsmBackend *createAsmBackend(const Target & /*T*/,
@@ -99,6 +107,9 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo);
+ // Register the MC codegen info.
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
// Register the MC register info.
TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 9bac4f82822a..001bd7f1fc43 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -33,8 +33,7 @@ class raw_pwrite_stream;
extern Target TheWebAssemblyTarget32;
extern Target TheWebAssemblyTarget64;
-MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx);
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
@@ -45,8 +44,12 @@ namespace WebAssembly {
enum OperandType {
/// Basic block label in a branch construct.
OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
- /// Floating-point immediate.
- OPERAND_FPIMM
+ /// 32-bit floating-point immediates.
+ OPERAND_FP32IMM,
+ /// 64-bit floating-point immediates.
+ OPERAND_FP64IMM,
+ /// p2align immediate for load and store address alignment.
+ OPERAND_P2ALIGN
};
/// WebAssembly-specific directive identifiers.
@@ -87,4 +90,49 @@ enum {
#define GET_SUBTARGETINFO_ENUM
#include "WebAssemblyGenSubtargetInfo.inc"
+namespace llvm {
+namespace WebAssembly {
+
+/// Return the default p2align value for a load or store with the given opcode.
+inline unsigned GetDefaultP2Align(unsigned Opcode) {
+ switch (Opcode) {
+ case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE8_I64:
+ return 0;
+ case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE16_I64:
+ return 1;
+ case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_F32:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::STORE32_I64:
+ return 2;
+ case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_F64:
+ case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_F64:
+ return 3;
+ default: llvm_unreachable("Only loads and stores have p2align values");
+ }
+}
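
The switch above just encodes each access's natural alignment as a log2
value; a small illustrative check (assert-style, not part of the patch):

    // 8-bit accesses default to p2align 0, 32-bit ones to 2, 64-bit to 3.
    assert(WebAssembly::GetDefaultP2Align(WebAssembly::STORE8_I32) == 0);
    assert(WebAssembly::GetDefaultP2Align(WebAssembly::LOAD_F32) == 2);
    assert(WebAssembly::GetDefaultP2Align(WebAssembly::LOAD_I64) == 3);

This default is what lets printWebAssemblyP2AlignOperand earlier in the diff
omit ":p2align=" when the operand already matches it.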
+
+/// The operand number of the load or store address in load/store instructions.
+static const unsigned MemOpAddressOperandNo = 2;
+/// The operand number of the stored value in a store instruction.
+static const unsigned StoreValueOperandNo = 4;
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
#endif
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 1d2822869a15..3d61c15717b4 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -16,12 +16,10 @@
#include "WebAssemblyTargetStreamer.h"
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCTargetDesc.h"
-#include "WebAssemblyTargetObjectFile.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
@@ -66,6 +64,16 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
+void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
+ StringRef name, SmallVectorImpl<MVT> &SignatureVTs, size_t NumResults) {
+ OS << "\t.functype\t" << name;
+ if (NumResults == 0) OS << ", void";
+ for (auto Ty : SignatureVTs) {
+ OS << ", " << WebAssembly::TypeToString(Ty);
+ }
+ OS << "\n";
+}
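
Assuming SignatureVTs lists the signature's value types in order (an
assumption; only the void special case is visible here), the ASCII streamer
would emit something like the following for an indirectly called
void foo(i32, i32):

    .functype       foo, void, i32, i32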
+
// FIXME: What follows is not the real binary encoding.
static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) {
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index c66a51574efb..51354ef22d71 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -37,6 +37,12 @@ public:
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
+ /// .functype
+ virtual void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &SignatureVTs,
+ size_t NumResults) {
+ llvm_unreachable("emitIndirectFunctionType not implemented");
+ }
};
/// This part is for ascii assembly output
@@ -50,6 +56,9 @@ public:
void emitResult(ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
void emitEndFunc() override;
+ void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &SignatureVTs,
+ size_t NumResults) override;
};
/// This part is for ELF object output
diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile
deleted file mode 100644
index c501a2b1ab15..000000000000
--- a/lib/Target/WebAssembly/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- lib/Target/WebAssembly/Makefile ---------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMWebAssemblyCodeGen
-TARGET = WebAssembly
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = \
- WebAssemblyGenAsmWriter.inc \
- WebAssemblyGenDAGISel.inc \
- WebAssemblyGenFastISel.inc \
- WebAssemblyGenInstrInfo.inc \
- WebAssemblyGenMCCodeEmitter.inc \
- WebAssemblyGenRegisterInfo.inc \
- WebAssemblyGenSubtargetInfo.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc Disassembler
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index b97ea454165c..a6c2eefc0578 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -13,32 +13,18 @@ binary encoding of WebAssembly itself:
* https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
The backend is built, tested and archived on the following waterfall:
- https://build.chromium.org/p/client.wasm.llvm/console
+ https://wasm-stat.us
The backend's bringup is done using the GCC torture test suite first since it
doesn't require C library support. Current known failures are in
known_gcc_test_failures.txt, all other tests should pass. The waterfall will
turn red if not. Once most of these pass, further testing will use LLVM's own
test suite. The tests can be run locally using:
- github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py
-
-Interesting work that remains to be done:
-* Write a pass to restructurize irreducible control flow. This needs to be done
- before register allocation to be efficient, because it may duplicate basic
- blocks and WebAssembly performs register allocation at a whole-function
- level. Note that LLVM's GPU code has such a pass, but it linearizes control
- flow (e.g. both sides of branches execute and are masked) which is undesirable
- for WebAssembly.
+ https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
//===---------------------------------------------------------------------===//
-set_local instructions have a return value. We should (a) model this,
-and (b) write optimizations which take advantage of it. Keep in mind that
-many set_local instructions are implicit!
-
-//===---------------------------------------------------------------------===//
-
-Br, br_if, and tableswitch instructions can support having a value on the
+Br, br_if, and br_table instructions can support having a value on the
expression stack across the jump (sometimes). We should (a) model this, and
(b) extend the stackifier to utilize it.
@@ -58,10 +44,6 @@ us too?
//===---------------------------------------------------------------------===//
-When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly?
-
-//===---------------------------------------------------------------------===//
-
Register stackification uses the EXPR_STACK physical register to impose
ordering dependencies on instructions with stack operands. This is pessimistic;
we should consider alternate ways to model stack dependencies.
@@ -82,7 +64,74 @@ stores.
//===---------------------------------------------------------------------===//
-Memset/memcpy/memmove should be marked with the "returned" attribute somehow,
-even when they are translated through intrinsics.
+Consider implementing optimizeSelect, optimizeCompareInstr, optimizeCondBranch,
+optimizeLoadInstr, and/or getMachineCombinerPatterns.
+
+//===---------------------------------------------------------------------===//
+
+Find a clean way to fix the problem which leads to the Shrink Wrapping pass
+being run after the WebAssembly PEI pass.
+
+//===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+ i32.const $4=, 0
+ i32.const $3=, 0
+
+It could be done with a smaller encoding like this:
+
+ i32.const $push5=, 0
+ tee_local $push6=, $4=, $pop5
+ copy_local $3=, $pop6
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly registers are implicitly initialized to zero. Explicit zeroing is
+therefore often redundant and could be optimized away.
+
+//===---------------------------------------------------------------------===//
+
+Small indices may use smaller encodings than large indices.
+WebAssemblyRegColoring and/or WebAssemblyRegRenumbering should sort registers
+according to their usage frequency to make the most of the smaller encodings.
+
+//===---------------------------------------------------------------------===//
+
+When the last statement in a function body computes the return value, it can
+just let that value be the exit value of the outermost block, rather than
+needing an explicit return operation.
+
+//===---------------------------------------------------------------------===//
+
+Many cases of irreducible control flow could be handled more efficiently than
+they are by the transform in WebAssemblyFixIrreducibleControlFlow.cpp.
+
+It may also be worthwhile to do transforms before register coloring,
+particularly when duplicating code, to allow register coloring to be aware of
+the duplication.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more
+aggressively.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify is currently a greedy algorithm. This means that, for
+example, a binary operator will stackify with its user before its operands.
+However, if moving the binary operator to its user moves it to a place where
+its operands can't be moved to, it would be better to leave it in place, or
+perhaps move it up, so that it can stackify its operands. A binary operator
+has two operands and one result, so in such cases there could be a net win by
+preferring the operands.
+
+//===---------------------------------------------------------------------===//
+
+Instruction ordering has a significant influence on register stackification and
+coloring. Consider experimenting with the MachineScheduler (enable via
+enableMachineScheduler) and determine if it can be configured to schedule
+instructions advantageously for this purpose.
//===---------------------------------------------------------------------===//
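A tiny example of the note above about implicit zero-initialization (the
register number is made up): a function that currently begins with

    i32.const $2=, 0

before any other definition of $2 could simply drop that instruction, since
WebAssembly locals start out as zero.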
diff --git a/lib/Target/WebAssembly/Relooper.cpp b/lib/Target/WebAssembly/Relooper.cpp
deleted file mode 100644
index 9b718ef094aa..000000000000
--- a/lib/Target/WebAssembly/Relooper.cpp
+++ /dev/null
@@ -1,984 +0,0 @@
-//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This implements the Relooper algorithm. This implementation includes
-/// optimizations added since the original academic paper [1] was published.
-///
-/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
-/// Proceedings of the ACM international conference companion on Object
-/// oriented programming systems languages and applications companion
-/// (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
-/// http://doi.acm.org/10.1145/2048147.2048224
-///
-//===-------------------------------------------------------------------===//
-
-#include "Relooper.h"
-#include "WebAssembly.h"
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <cstring>
-#include <cstdlib>
-#include <functional>
-#include <list>
-#include <stack>
-#include <string>
-
-#define DEBUG_TYPE "relooper"
-
-using namespace llvm;
-using namespace Relooper;
-
-static cl::opt<int> RelooperSplittingFactor(
- "relooper-splitting-factor",
- cl::desc(
- "How much to discount code size when deciding whether to split a node"),
- cl::init(5));
-
-static cl::opt<unsigned> RelooperMultipleSwitchThreshold(
- "relooper-multiple-switch-threshold",
- cl::desc(
- "How many entries to allow in a multiple before we use a switch"),
- cl::init(10));
-
-static cl::opt<unsigned> RelooperNestingLimit(
- "relooper-nesting-limit",
- cl::desc(
- "How much nesting is acceptable"),
- cl::init(20));
-
-
-namespace {
-///
-/// Implements the relooper algorithm for a function's blocks.
-///
-/// Implementation details: The Relooper instance has
-/// ownership of the blocks and shapes, and frees them when done.
-///
-struct RelooperAlgorithm {
- std::deque<Block *> Blocks;
- std::deque<Shape *> Shapes;
- Shape *Root;
- bool MinSize;
- int BlockIdCounter;
- int ShapeIdCounter;
-
- RelooperAlgorithm();
- ~RelooperAlgorithm();
-
- void AddBlock(Block *New, int Id = -1);
-
- // Calculates the shapes
- void Calculate(Block *Entry);
-
- // Sets us to try to minimize size
- void SetMinSize(bool MinSize_) { MinSize = MinSize_; }
-};
-
-struct RelooperAnalysis final : public FunctionPass {
- static char ID;
- RelooperAnalysis() : FunctionPass(ID) {}
- const char *getPassName() const override { return "relooper"; }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- bool runOnFunction(Function &F) override;
-};
-}
-
-// RelooperAnalysis
-
-char RelooperAnalysis::ID = 0;
-FunctionPass *llvm::createWebAssemblyRelooper() {
- return new RelooperAnalysis();
-}
-
-bool RelooperAnalysis::runOnFunction(Function &F) {
- DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n");
- RelooperAlgorithm R;
- // FIXME: remove duplication between relooper's and LLVM's BBs.
- std::map<const BasicBlock *, Block *> BB2B;
- std::map<const Block *, const BasicBlock *> B2BB;
- for (const BasicBlock &BB : F) {
- // FIXME: getName is wrong here, Code is meant to represent amount of code.
- // FIXME: use BranchVarInit for switch.
- Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr);
- R.AddBlock(B);
- assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice");
- assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice");
- BB2B[&BB] = B;
- B2BB[B] = &BB;
- }
- for (Block *B : R.Blocks) {
- const BasicBlock *BB = B2BB[B];
- for (const BasicBlock *Successor : successors(BB))
- // FIXME: add branch's Condition and Code below.
- B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr);
- }
- R.Calculate(BB2B[&F.getEntryBlock()]);
- return false; // Analysis passes don't modify anything.
-}
-
-// Helpers
-
-typedef MapVector<Block *, BlockSet> BlockBlockSetMap;
-typedef std::list<Block *> BlockList;
-
-template <class T, class U>
-static bool contains(const T &container, const U &contained) {
- return container.count(contained);
-}
-
-
-// Branch
-
-Branch::Branch(const char *ConditionInit, const char *CodeInit)
- : Ancestor(nullptr), Labeled(true) {
- // FIXME: move from char* to LLVM data structures
- Condition = ConditionInit ? strdup(ConditionInit) : nullptr;
- Code = CodeInit ? strdup(CodeInit) : nullptr;
-}
-
-Branch::~Branch() {
- // FIXME: move from char* to LLVM data structures
- free(static_cast<void *>(const_cast<char *>(Condition)));
- free(static_cast<void *>(const_cast<char *>(Code)));
-}
-
-// Block
-
-Block::Block(const char *CodeInit, const char *BranchVarInit)
- : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) {
- // FIXME: move from char* to LLVM data structures
- Code = strdup(CodeInit);
- BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr;
-}
-
-Block::~Block() {
- // FIXME: move from char* to LLVM data structures
- free(static_cast<void *>(const_cast<char *>(Code)));
- free(static_cast<void *>(const_cast<char *>(BranchVar)));
-}
-
-void Block::AddBranchTo(Block *Target, const char *Condition,
- const char *Code) {
- assert(!contains(BranchesOut, Target) &&
- "cannot add more than one branch to the same target");
- BranchesOut[Target] = make_unique<Branch>(Condition, Code);
-}
-
-// Relooper
-
-RelooperAlgorithm::RelooperAlgorithm()
- : Root(nullptr), MinSize(false), BlockIdCounter(1),
- ShapeIdCounter(0) { // block ID 0 is reserved for clearings
-}
-
-RelooperAlgorithm::~RelooperAlgorithm() {
- for (auto Curr : Blocks)
- delete Curr;
- for (auto Curr : Shapes)
- delete Curr;
-}
-
-void RelooperAlgorithm::AddBlock(Block *New, int Id) {
- New->Id = Id == -1 ? BlockIdCounter++ : Id;
- Blocks.push_back(New);
-}
-
-struct RelooperRecursor {
- RelooperAlgorithm *Parent;
- RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
-};
-
-void RelooperAlgorithm::Calculate(Block *Entry) {
- // Scan and optimize the input
- struct PreOptimizer : public RelooperRecursor {
- PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
- BlockSet Live;
-
- void FindLive(Block *Root) {
- BlockList ToInvestigate;
- ToInvestigate.push_back(Root);
- while (!ToInvestigate.empty()) {
- Block *Curr = ToInvestigate.front();
- ToInvestigate.pop_front();
- if (contains(Live, Curr))
- continue;
- Live.insert(Curr);
- for (const auto &iter : Curr->BranchesOut)
- ToInvestigate.push_back(iter.first);
- }
- }
-
- // If a block has multiple entries but no exits, and it is small enough, it
- // is useful to split it. A common example is a C++ function where
- // everything ends up at a final exit block and does some RAII cleanup.
- // Without splitting, we will be forced to introduce labelled loops to
- // allow reaching the final block
- void SplitDeadEnds() {
- unsigned TotalCodeSize = 0;
- for (const auto &Curr : Live) {
- TotalCodeSize += strlen(Curr->Code);
- }
- BlockSet Splits;
- BlockSet Removed;
- for (const auto &Original : Live) {
- if (Original->BranchesIn.size() <= 1 ||
- !Original->BranchesOut.empty())
- continue; // only dead ends, for now
- if (contains(Original->BranchesOut, Original))
- continue; // cannot split a looping node
- if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) >
- TotalCodeSize / RelooperSplittingFactor)
- continue; // if splitting increases raw code size by a significant
- // amount, abort
- // Split the node (for simplicity, we replace all the blocks, even
- // though we could have reused the original)
- DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n");
- for (const auto &Prior : Original->BranchesIn) {
- Block *Split = new Block(Original->Code, Original->BranchVar);
- Parent->AddBlock(Split, Original->Id);
- Split->BranchesIn.insert(Prior);
- std::unique_ptr<Branch> Details;
- Details.swap(Prior->BranchesOut[Original]);
- Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition,
- Details->Code);
- for (const auto &iter : Original->BranchesOut) {
- Block *Post = iter.first;
- Branch *Details = iter.second.get();
- Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition,
- Details->Code);
- Post->BranchesIn.insert(Split);
- }
- Splits.insert(Split);
- Removed.insert(Original);
- }
- for (const auto &iter : Original->BranchesOut) {
- Block *Post = iter.first;
- Post->BranchesIn.remove(Original);
- }
- }
- for (const auto &iter : Splits)
- Live.insert(iter);
- for (const auto &iter : Removed)
- Live.remove(iter);
- }
- };
- PreOptimizer Pre(this);
- Pre.FindLive(Entry);
-
- // Add incoming branches from live blocks, ignoring dead code
- for (unsigned i = 0; i < Blocks.size(); i++) {
- Block *Curr = Blocks[i];
- if (!contains(Pre.Live, Curr))
- continue;
- for (const auto &iter : Curr->BranchesOut)
- iter.first->BranchesIn.insert(Curr);
- }
-
- if (!MinSize)
- Pre.SplitDeadEnds();
-
- // Recursively process the graph
-
- struct Analyzer : public RelooperRecursor {
- Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
-
- // Add a shape to the list of shapes in this Relooper calculation
- void Notice(Shape *New) {
- New->Id = Parent->ShapeIdCounter++;
- Parent->Shapes.push_back(New);
- }
-
- // Create a list of entries from a block. If LimitTo is provided, only
- // results in that set will appear
- void GetBlocksOut(Block *Source, BlockSet &Entries,
- BlockSet *LimitTo = nullptr) {
- for (const auto &iter : Source->BranchesOut)
- if (!LimitTo || contains(*LimitTo, iter.first))
- Entries.insert(iter.first);
- }
-
- // Converts/processes all branchings to a specific target
- void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor,
- BlockSet &From) {
- DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type
- << "\n");
- for (auto iter = Target->BranchesIn.begin();
- iter != Target->BranchesIn.end();) {
- Block *Prior = *iter;
- if (!contains(From, Prior)) {
- iter++;
- continue;
- }
- std::unique_ptr<Branch> PriorOut;
- PriorOut.swap(Prior->BranchesOut[Target]);
- PriorOut->Ancestor = Ancestor;
- PriorOut->Type = Type;
- if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor))
- Multiple->Breaks++; // We are breaking out of this Multiple, so need a
- // loop
- iter++; // carefully increment iter before erasing
- Target->BranchesIn.remove(Prior);
- Target->ProcessedBranchesIn.insert(Prior);
- Prior->ProcessedBranchesOut[Target].swap(PriorOut);
- }
- }
-
- Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) {
- DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n");
- SimpleShape *Simple = new SimpleShape;
- Notice(Simple);
- Simple->Inner = Inner;
- Inner->Parent = Simple;
- if (Blocks.size() > 1) {
- Blocks.remove(Inner);
- GetBlocksOut(Inner, NextEntries, &Blocks);
- BlockSet JustInner;
- JustInner.insert(Inner);
- for (const auto &iter : NextEntries)
- Solipsize(iter, Branch::Direct, Simple, JustInner);
- }
- return Simple;
- }
-
- Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries,
- BlockSet &NextEntries) {
- // Find the inner blocks in this loop. Proceed backwards from the entries
- // until
- // you reach a seen block, collecting as you go.
- BlockSet InnerBlocks;
- BlockSet Queue = Entries;
- while (!Queue.empty()) {
- Block *Curr = *(Queue.begin());
- Queue.remove(*Queue.begin());
- if (!contains(InnerBlocks, Curr)) {
- // This element is new, mark it as inner and remove from outer
- InnerBlocks.insert(Curr);
- Blocks.remove(Curr);
- // Add the elements prior to it
- for (const auto &iter : Curr->BranchesIn)
- Queue.insert(iter);
- }
- }
- assert(!InnerBlocks.empty());
-
- for (const auto &Curr : InnerBlocks) {
- for (const auto &iter : Curr->BranchesOut) {
- Block *Possible = iter.first;
- if (!contains(InnerBlocks, Possible))
- NextEntries.insert(Possible);
- }
- }
-
- LoopShape *Loop = new LoopShape();
- Notice(Loop);
-
- // Solipsize the loop, replacing with break/continue and marking branches
- // as Processed (will not affect later calculations)
- // A. Branches to the loop entries become a continue to this shape
- for (const auto &iter : Entries)
- Solipsize(iter, Branch::Continue, Loop, InnerBlocks);
- // B. Branches to outside the loop (a next entry) become breaks on this
- // shape
- for (const auto &iter : NextEntries)
- Solipsize(iter, Branch::Break, Loop, InnerBlocks);
- // Finish up
- Shape *Inner = Process(InnerBlocks, Entries, nullptr);
- Loop->Inner = Inner;
- return Loop;
- }
-
- // For each entry, find the independent group reachable by it. The
- // independent group is the entry itself, plus all the blocks it can
- // reach that cannot be directly reached by another entry. Note that we
- // ignore directly reaching the entry itself by another entry.
- // @param Ignore - previous blocks that are irrelevant
- void FindIndependentGroups(BlockSet &Entries,
- BlockBlockSetMap &IndependentGroups,
- BlockSet *Ignore = nullptr) {
- typedef std::map<Block *, Block *> BlockBlockMap;
-
- struct HelperClass {
- BlockBlockSetMap &IndependentGroups;
- BlockBlockMap Ownership; // For each block, which entry it belongs to.
- // We have reached it from there.
-
- HelperClass(BlockBlockSetMap &IndependentGroupsInit)
- : IndependentGroups(IndependentGroupsInit) {}
- void InvalidateWithChildren(Block *New) {
- // Being in the list means you need to be invalidated
- BlockList ToInvalidate;
- ToInvalidate.push_back(New);
- while (!ToInvalidate.empty()) {
- Block *Invalidatee = ToInvalidate.front();
- ToInvalidate.pop_front();
- Block *Owner = Ownership[Invalidatee];
- // Owner may have been invalidated, do not add to
- // IndependentGroups!
- if (contains(IndependentGroups, Owner))
- IndependentGroups[Owner].remove(Invalidatee);
- if (Ownership[Invalidatee]) { // may have been seen before and
- // invalidated already
- Ownership[Invalidatee] = nullptr;
- for (const auto &iter : Invalidatee->BranchesOut) {
- Block *Target = iter.first;
- BlockBlockMap::iterator Known = Ownership.find(Target);
- if (Known != Ownership.end()) {
- Block *TargetOwner = Known->second;
- if (TargetOwner)
- ToInvalidate.push_back(Target);
- }
- }
- }
- }
- }
- };
- HelperClass Helper(IndependentGroups);
-
- // We flow out from each of the entries, simultaneously.
- // When we reach a new block, we add it as belonging to the one we got to
- // it from.
- // If we reach a new block that is already marked as belonging to someone,
- // it is reachable by two entries and is not valid for any of them.
- // Remove it and all it can reach that have been visited.
-
- // Being in the queue means we just added this item, and
- // we need to add its children
- BlockList Queue;
- for (const auto &Entry : Entries) {
- Helper.Ownership[Entry] = Entry;
- IndependentGroups[Entry].insert(Entry);
- Queue.push_back(Entry);
- }
- while (!Queue.empty()) {
- Block *Curr = Queue.front();
- Queue.pop_front();
- Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership
- // map if we are in the queue
- if (!Owner)
- continue; // we have been invalidated meanwhile after being reached
- // from two entries
- // Add all children
- for (const auto &iter : Curr->BranchesOut) {
- Block *New = iter.first;
- BlockBlockMap::iterator Known = Helper.Ownership.find(New);
- if (Known == Helper.Ownership.end()) {
- // New node. Add it, and put it in the queue
- Helper.Ownership[New] = Owner;
- IndependentGroups[Owner].insert(New);
- Queue.push_back(New);
- continue;
- }
- Block *NewOwner = Known->second;
- if (!NewOwner)
- continue; // We reached an invalidated node
- if (NewOwner != Owner)
- // Invalidate this and all reachable that we have seen - we reached
- // this from two locations
- Helper.InvalidateWithChildren(New);
- // otherwise, we have the same owner, so do nothing
- }
- }
-
- // Having processed all the interesting blocks, we remain with just one
- // potential issue:
- // If a->b, and a was invalidated, but then b was later reached by
- // someone else, we must invalidate b. To check for this, we go over all
- // elements in the independent groups, if an element has a parent which
- // does *not* have the same owner, we/ must remove it and all its
- // children.
-
- for (const auto &iter : Entries) {
- BlockSet &CurrGroup = IndependentGroups[iter];
- BlockList ToInvalidate;
- for (const auto &iter : CurrGroup) {
- Block *Child = iter;
- for (const auto &iter : Child->BranchesIn) {
- Block *Parent = iter;
- if (Ignore && contains(*Ignore, Parent))
- continue;
- if (Helper.Ownership[Parent] != Helper.Ownership[Child])
- ToInvalidate.push_back(Child);
- }
- }
- while (!ToInvalidate.empty()) {
- Block *Invalidatee = ToInvalidate.front();
- ToInvalidate.pop_front();
- Helper.InvalidateWithChildren(Invalidatee);
- }
- }
-
- // Remove empty groups
- for (const auto &iter : Entries)
- if (IndependentGroups[iter].empty())
- IndependentGroups.erase(iter);
- }
-
- Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries,
- BlockBlockSetMap &IndependentGroups, Shape *Prev,
- BlockSet &NextEntries) {
- bool Fused = isa<SimpleShape>(Prev);
- MultipleShape *Multiple = new MultipleShape();
- Notice(Multiple);
- BlockSet CurrEntries;
- for (auto &iter : IndependentGroups) {
- Block *CurrEntry = iter.first;
- BlockSet &CurrBlocks = iter.second;
- // Create inner block
- CurrEntries.clear();
- CurrEntries.insert(CurrEntry);
- for (const auto &CurrInner : CurrBlocks) {
- // Remove the block from the remaining blocks
- Blocks.remove(CurrInner);
- // Find new next entries and fix branches to them
- for (auto iter = CurrInner->BranchesOut.begin();
- iter != CurrInner->BranchesOut.end();) {
- Block *CurrTarget = iter->first;
- auto Next = iter;
- Next++;
- if (!contains(CurrBlocks, CurrTarget)) {
- NextEntries.insert(CurrTarget);
- Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks);
- }
- iter = Next; // increment carefully because Solipsize can remove us
- }
- }
- Multiple->InnerMap[CurrEntry->Id] =
- Process(CurrBlocks, CurrEntries, nullptr);
- // If we are not fused, then our entries will actually be checked
- if (!Fused)
- CurrEntry->IsCheckedMultipleEntry = true;
- }
- // Add entries not handled as next entries, they are deferred
- for (const auto &Entry : Entries)
- if (!contains(IndependentGroups, Entry))
- NextEntries.insert(Entry);
- // The multiple has been created, we can decide how to implement it
- if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) {
- Multiple->UseSwitch = true;
- Multiple->Breaks++; // switch captures breaks
- }
- return Multiple;
- }
-
- // Main function.
- // Process a set of blocks with specified entries, returns a shape
- // The Make* functions receive a NextEntries. If they fill it with data,
- // those are the entries for the ->Next block on them, and the blocks
- // are what remains in Blocks (which Make* modify). In this way
- // we avoid recursing on Next (imagine a long chain of Simples, if we
- // recursed we could blow the stack).
- Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) {
- BlockSet *Entries = &InitialEntries;
- BlockSet TempEntries[2];
- int CurrTempIndex = 0;
- BlockSet *NextEntries;
- Shape *Ret = nullptr;
-
- auto Make = [&](Shape *Temp) {
- if (Prev)
- Prev->Next = Temp;
- if (!Ret)
- Ret = Temp;
- Prev = Temp;
- Entries = NextEntries;
- };
-
- while (1) {
- CurrTempIndex = 1 - CurrTempIndex;
- NextEntries = &TempEntries[CurrTempIndex];
- NextEntries->clear();
-
- if (Entries->empty())
- return Ret;
- if (Entries->size() == 1) {
- Block *Curr = *(Entries->begin());
- if (Curr->BranchesIn.empty()) {
- // One entry, no looping ==> Simple
- Make(MakeSimple(Blocks, Curr, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- // One entry, looping ==> Loop
- Make(MakeLoop(Blocks, *Entries, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
-
- // More than one entry, try to eliminate through a Multiple groups of
- // independent blocks from an entry/ies. It is important to remove
- // through multiples as opposed to looping since the former is more
- // performant.
- BlockBlockSetMap IndependentGroups;
- FindIndependentGroups(*Entries, IndependentGroups);
-
- if (!IndependentGroups.empty()) {
- // We can handle a group in a multiple if its entry cannot be reached
- // by another group.
- // Note that it might be reachable by itself - a loop. But that is
- // fine, we will create a loop inside the multiple block (which
- // is the performant order to do it).
- for (auto iter = IndependentGroups.begin();
- iter != IndependentGroups.end();) {
- Block *Entry = iter->first;
- BlockSet &Group = iter->second;
- auto curr = iter++; // iterate carefully, we may delete
- for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin();
- iterBranch != Entry->BranchesIn.end(); iterBranch++) {
- Block *Origin = *iterBranch;
- if (!contains(Group, Origin)) {
- // Reached from outside the group, so we cannot handle this
- IndependentGroups.erase(curr);
- break;
- }
- }
- }
-
- // As an optimization, if we have 2 independent groups, and one is a
- // small dead end, we can handle only that dead end.
- // The other then becomes a Next - without nesting in the code and
- // recursion in the analysis.
- // TODO: if the larger is the only dead end, handle that too
- // TODO: handle >2 groups
- // TODO: handle not just dead ends, but also that do not branch to the
- // NextEntries. However, must be careful there since we create a
- // Next, and that Next can prevent eliminating a break (since we no
- // longer naturally reach the same place), which may necessitate a
- // one-time loop, which makes the unnesting pointless.
- if (IndependentGroups.size() == 2) {
- // Find the smaller one
- auto iter = IndependentGroups.begin();
- Block *SmallEntry = iter->first;
- auto SmallSize = iter->second.size();
- iter++;
- Block *LargeEntry = iter->first;
- auto LargeSize = iter->second.size();
- if (SmallSize != LargeSize) { // ignore the case where they are
- // identical - keep things symmetrical
- // there
- if (SmallSize > LargeSize) {
- Block *Temp = SmallEntry;
- SmallEntry = LargeEntry;
- LargeEntry = Temp; // Note: we did not flip the Sizes too, they
- // are now invalid. TODO: use the smaller
- // size as a limit?
- }
- // Check if dead end
- bool DeadEnd = true;
- BlockSet &SmallGroup = IndependentGroups[SmallEntry];
- for (const auto &Curr : SmallGroup) {
- for (const auto &iter : Curr->BranchesOut) {
- Block *Target = iter.first;
- if (!contains(SmallGroup, Target)) {
- DeadEnd = false;
- break;
- }
- }
- if (!DeadEnd)
- break;
- }
- if (DeadEnd)
- IndependentGroups.erase(LargeEntry);
- }
- }
-
- if (!IndependentGroups.empty())
- // Some groups removable ==> Multiple
- Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev,
- *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- // No independent groups, must be loopable ==> Loop
- Make(MakeLoop(Blocks, *Entries, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- }
- };
-
- // Main
-
- BlockSet AllBlocks;
- for (const auto &Curr : Pre.Live) {
- AllBlocks.insert(Curr);
- }
-
- BlockSet Entries;
- Entries.insert(Entry);
- Root = Analyzer(this).Process(AllBlocks, Entries, nullptr);
- assert(Root);
-
- ///
- /// Relooper post-optimizer
- ///
- struct PostOptimizer {
- RelooperAlgorithm *Parent;
- std::stack<Shape *> LoopStack;
-
- PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
-
- void ShapeSwitch(Shape* var,
- std::function<void (SimpleShape*)> simple,
- std::function<void (MultipleShape*)> multiple,
- std::function<void (LoopShape*)> loop) {
- switch (var->getKind()) {
- case Shape::SK_Simple: {
- simple(cast<SimpleShape>(var));
- break;
- }
- case Shape::SK_Multiple: {
- multiple(cast<MultipleShape>(var));
- break;
- }
- case Shape::SK_Loop: {
- loop(cast<LoopShape>(var));
- break;
- }
- }
- }
-
- // Find the blocks that natural control flow can get us directly to, or
- // through a multiple that we ignore
- void FollowNaturalFlow(Shape *S, BlockSet &Out) {
- ShapeSwitch(S, [&](SimpleShape* Simple) {
- Out.insert(Simple->Inner);
- }, [&](MultipleShape* Multiple) {
- for (const auto &iter : Multiple->InnerMap) {
- FollowNaturalFlow(iter.second, Out);
- }
- FollowNaturalFlow(Multiple->Next, Out);
- }, [&](LoopShape* Loop) {
- FollowNaturalFlow(Loop->Inner, Out);
- });
- }
-
- void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) {
- if (Root->Next) {
- Root->Natural = Root->Next;
- FindNaturals(Root->Next, Otherwise);
- } else {
- Root->Natural = Otherwise;
- }
-
- ShapeSwitch(Root, [](SimpleShape* Simple) {
- }, [&](MultipleShape* Multiple) {
- for (const auto &iter : Multiple->InnerMap) {
- FindNaturals(iter.second, Root->Natural);
- }
- }, [&](LoopShape* Loop){
- FindNaturals(Loop->Inner, Loop->Inner);
- });
- }
-
- // Remove unneeded breaks and continues.
- // A flow operation is trivially unneeded if the shape we naturally get to
- // by normal code execution is the same as the flow forces us to.
- void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr,
- LoopShape *LastLoop = nullptr,
- unsigned Depth = 0) {
- BlockSet NaturalBlocks;
- FollowNaturalFlow(Natural, NaturalBlocks);
- Shape *Next = Root;
- while (Next) {
- Root = Next;
- Next = nullptr;
- ShapeSwitch(
- Root,
- [&](SimpleShape* Simple) {
- if (Simple->Inner->BranchVar)
- LastLoop =
- nullptr; // a switch clears out the loop (TODO: only for
- // breaks, not continue)
-
- if (Simple->Next) {
- if (!Simple->Inner->BranchVar &&
- Simple->Inner->ProcessedBranchesOut.size() == 2 &&
- Depth < RelooperNestingLimit) {
- // If there is a next block, we already know at Simple
- // creation time to make direct branches, and we can do
- // nothing more in general. But, we try to optimize the
- // case of a break and a direct: This would normally be
- // if (break?) { break; } ..
- // but if we make sure to nest the else, we can save the
- // break,
- // if (!break?) { .. }
- // This is also better because the more canonical nested
- // form is easier to further optimize later. The
- // downside is more nesting, which adds to size in builds with
- // whitespace.
- // Note that we avoid switches, as it complicates control flow
- // and is not relevant for the common case we optimize here.
- bool Found = false;
- bool Abort = false;
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Block *Target = iter.first;
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break) {
- Found = true;
- if (!contains(NaturalBlocks, Target))
- Abort = true;
- } else if (Details->Type != Branch::Direct)
- Abort = true;
- }
- if (Found && !Abort) {
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break) {
- Details->Type = Branch::Direct;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- } else {
- assert(Details->Type == Branch::Direct);
- Details->Type = Branch::Nested;
- }
- }
- }
- Depth++; // this optimization increases depth, for us and all
- // our next chain (i.e., until this call returns)
- }
- Next = Simple->Next;
- } else {
- // If there is no next then Natural is where we will
- // go to by doing nothing, so we can potentially optimize some
- // branches to direct.
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Block *Target = iter.first;
- Branch *Details = iter.second.get();
- if (Details->Type != Branch::Direct &&
- contains(NaturalBlocks,
- Target)) { // note: cannot handle split blocks
- Details->Type = Branch::Direct;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- } else if (Details->Type == Branch::Break && LastLoop &&
- LastLoop->Natural == Details->Ancestor->Natural) {
- // it is important to simplify breaks, as simpler breaks
- // enable other optimizations
- Details->Labeled = false;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- }
- }
- }
- }, [&](MultipleShape* Multiple)
- {
- for (const auto &iter : Multiple->InnerMap) {
- RemoveUnneededFlows(iter.second, Multiple->Next,
- Multiple->Breaks ? nullptr : LastLoop,
- Depth + 1);
- }
- Next = Multiple->Next;
- }, [&](LoopShape* Loop)
- {
- RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1);
- Next = Loop->Next;
- });
- }
- }
-
- // After we know which loops exist, we can calculate which need to be
- // labeled
- void FindLabeledLoops(Shape *Root) {
- Shape *Next = Root;
- while (Next) {
- Root = Next;
- Next = nullptr;
-
- ShapeSwitch(
- Root,
- [&](SimpleShape *Simple) {
- MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next);
- // If we are fusing a Multiple with a loop into this Simple, then
- // visit it now
- if (Fused && Fused->Breaks)
- LoopStack.push(Fused);
- if (Simple->Inner->BranchVar)
- LoopStack.push(nullptr); // a switch means breaks are now useless,
- // push a dummy
- if (Fused) {
- if (Fused->UseSwitch)
- LoopStack.push(nullptr); // a switch means breaks are now
- // useless, push a dummy
- for (const auto &iter : Fused->InnerMap) {
- FindLabeledLoops(iter.second);
- }
- }
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break ||
- Details->Type == Branch::Continue) {
- assert(!LoopStack.empty());
- if (Details->Ancestor != LoopStack.top() && Details->Labeled) {
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor)) {
- Multiple->Labeled = true;
- } else {
- LoopShape *Loop = cast<LoopShape>(Details->Ancestor);
- Loop->Labeled = true;
- }
- } else {
- Details->Labeled = false;
- }
- }
- if (Fused && Fused->UseSwitch)
- LoopStack.pop();
- if (Simple->Inner->BranchVar)
- LoopStack.pop();
- if (Fused && Fused->Breaks)
- LoopStack.pop();
- if (Fused)
- Next = Fused->Next;
- else
- Next = Root->Next;
- }
- }
- , [&](MultipleShape* Multiple) {
- if (Multiple->Breaks)
- LoopStack.push(Multiple);
- for (const auto &iter : Multiple->InnerMap)
- FindLabeledLoops(iter.second);
- if (Multiple->Breaks)
- LoopStack.pop();
- Next = Root->Next;
- }
- , [&](LoopShape* Loop) {
- LoopStack.push(Loop);
- FindLabeledLoops(Loop->Inner);
- LoopStack.pop();
- Next = Root->Next;
- });
- }
- }
-
- void Process(Shape * Root) {
- FindNaturals(Root);
- RemoveUnneededFlows(Root);
- FindLabeledLoops(Root);
- }
- };
-
- PostOptimizer(this).Process(Root);
-}
diff --git a/lib/Target/WebAssembly/Relooper.h b/lib/Target/WebAssembly/Relooper.h
deleted file mode 100644
index 7c564de82f34..000000000000
--- a/lib/Target/WebAssembly/Relooper.h
+++ /dev/null
@@ -1,186 +0,0 @@
-//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This defines an optimized C++ implemention of the Relooper
-/// algorithm, originally developed as part of Emscripten, which
-/// generates a structured AST from arbitrary control flow.
-///
-//===-------------------------------------------------------------------===//
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Support/Casting.h"
-
-#include <cassert>
-#include <cstdarg>
-#include <cstdio>
-#include <deque>
-#include <list>
-#include <map>
-#include <memory>
-#include <set>
-
-namespace llvm {
-
-namespace Relooper {
-
-struct Block;
-struct Shape;
-
-///
-/// Info about a branching from one block to another
-///
-struct Branch {
- enum FlowType {
- Direct = 0, // We will directly reach the right location through other
- // means, no need for continue or break
- Break = 1,
- Continue = 2,
- Nested = 3 // This code is directly reached, but we must be careful to
- // ensure it is nested in an if - it is not reached
- // unconditionally, other code paths exist alongside it that we need to make
- // sure do not intertwine
- };
- Shape
- *Ancestor; // If not nullptr, this shape is the relevant one for purposes
- // of getting to the target block. We break or continue on it
- Branch::FlowType
- Type; // If Ancestor is not nullptr, this says whether to break or
- // continue
- bool Labeled; // If a break or continue, whether we need to use a label
- const char *Condition; // The condition for which we branch. For example,
- // "my_var == 1". Conditions are checked one by one.
- // One of the conditions should have nullptr as the
- // condition, in which case it is the default
- // FIXME: move from char* to LLVM data structures
- const char *Code; // If provided, code that is run right before the branch is
- // taken. This is useful for phis
- // FIXME: move from char* to LLVM data structures
-
- Branch(const char *ConditionInit, const char *CodeInit = nullptr);
- ~Branch();
-};
-
-typedef SetVector<Block *> BlockSet;
-typedef MapVector<Block *, Branch *> BlockBranchMap;
-typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap;
-
-///
-/// Represents a basic block of code - some instructions that end with a
-/// control flow modifier (a branch, return or throw).
-///
-struct Block {
- // Branches become processed after we finish the shape relevant to them. For
- // example, when we recreate a loop, branches to the loop start become
- // continues and are now processed. When we calculate what shape to generate
- // from a set of blocks, we ignore processed branches. Blocks own the Branch
- // objects they use, and destroy them when done.
- OwningBlockBranchMap BranchesOut;
- BlockSet BranchesIn;
- OwningBlockBranchMap ProcessedBranchesOut;
- BlockSet ProcessedBranchesIn;
- Shape *Parent; // The shape we are directly inside
- int Id; // A unique identifier, defined when added to relooper. Note that this
- // uniquely identifies a *logical* block - if we split it, the two
- // instances have the same content *and* the same Id
- const char *Code; // The string representation of the code in this block.
- // Owning pointer (we copy the input)
- // FIXME: move from char* to LLVM data structures
- const char *BranchVar; // A variable whose value determines where we go; if
- // this is not nullptr, emit a switch on that variable
- // FIXME: move from char* to LLVM data structures
- bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching
- // us requires setting the label variable
-
- Block(const char *CodeInit, const char *BranchVarInit);
- ~Block();
-
- void AddBranchTo(Block *Target, const char *Condition,
- const char *Code = nullptr);
-};
-
-///
-/// Represents a structured control flow shape
-///
-struct Shape {
- int Id; // A unique identifier. Used to identify loops, labels are Lx where x
- // is the Id. Defined when added to relooper
- Shape *Next; // The shape that will appear in the code right after this one
- Shape *Natural; // The shape that control flow gets to naturally (if there is
- // Next, then this is Next)
-
- /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
- enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop };
-
-private:
- ShapeKind Kind;
-
-public:
- ShapeKind getKind() const { return Kind; }
-
- Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {}
-};
-
-///
-/// Simple: No control flow at all, just instructions.
-///
-struct SimpleShape : public Shape {
- Block *Inner;
-
- SimpleShape() : Shape(SK_Simple), Inner(nullptr) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Simple; }
-};
-
-///
-/// A shape that may be implemented with a labeled loop.
-///
-struct LabeledShape : public Shape {
- bool Labeled; // If we have a loop, whether it needs to be labeled
-
- LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {}
-};
-
-// Blocks with the same id were split and are identical, so we just care about
-// ids in Multiple entries
-typedef std::map<int, Shape *> IdShapeMap;
-
-///
-/// Multiple: A shape with more than one entry. If the next block to
-/// be entered is among them, we run it and continue to
-/// the next shape, otherwise we continue immediately to the
-/// next shape.
-///
-struct MultipleShape : public LabeledShape {
- IdShapeMap InnerMap; // entry block ID -> shape
- int Breaks; // If we have branches on us, we need a loop (or a switch). This
- // is a counter of requirements,
- // if we optimize it to 0, the loop is unneeded
- bool UseSwitch; // Whether to switch on label as opposed to an if-else chain
-
- MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; }
-};
-
-///
-/// Loop: An infinite loop.
-///
-struct LoopShape : public LabeledShape {
- Shape *Inner;
-
- LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Loop; }
-};
-
-} // namespace Relooper
-
-} // namespace llvm
diff --git a/lib/Target/WebAssembly/TargetInfo/Makefile b/lib/Target/WebAssembly/TargetInfo/Makefile
deleted file mode 100644
index b021eb6d9455..000000000000
--- a/lib/Target/WebAssembly/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/WebAssembly/TargetInfo/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index e972da5af74f..957f31cae222 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -23,23 +23,28 @@ namespace llvm {
class WebAssemblyTargetMachine;
class FunctionPass;
+// LLVM IR passes.
FunctionPass *createWebAssemblyOptimizeReturned();
+// ISel and immediate followup passes.
FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createWebAssemblyArgumentMove();
+FunctionPass *createWebAssemblySetP2AlignOperands();
+// Late passes.
+FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyPrepareForLiveIntervals();
+FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyStoreResults();
FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
-FunctionPass *createWebAssemblyPEI();
+FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyPeephole();
-FunctionPass *createWebAssemblyRelooper();
-
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 3893c408cf63..5887f45371fc 100644
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -65,8 +65,8 @@ FunctionPass *llvm::createWebAssemblyArgumentMove() {
}
/// Test whether the given instruction is an ARGUMENT.
-static bool IsArgument(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool IsArgument(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case WebAssembly::ARGUMENT_I32:
case WebAssembly::ARGUMENT_I64:
case WebAssembly::ARGUMENT_F32:
@@ -88,20 +88,18 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator InsertPt = EntryMBB.end();
// Look for the first NonArg instruction.
- for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) {
- MachineInstr *MI = MII;
+ for (MachineInstr &MI : EntryMBB) {
if (!IsArgument(MI)) {
- InsertPt = MII;
+ InsertPt = MI;
break;
}
}
// Now move any argument instructions later in the block
// to before our first NonArg instruction.
- for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) {
- MachineInstr *MI = I;
+ for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) {
if (IsArgument(MI)) {
- EntryMBB.insert(InsertPt, MI->removeFromParent());
+ EntryMBB.insert(InsertPt, MI.removeFromParent());
Changed = true;
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 45ac99d90ed9..54e9f7f52901 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -67,6 +67,7 @@ private:
// AsmPrinter Implementation.
//===------------------------------------------------------------------===//
+ void EmitEndOfAsmFile(Module &M) override;
void EmitJumpTableInfo() override;
void EmitConstantPool() override;
void EmitFunctionBodyStart() override;
@@ -93,10 +94,7 @@ private:
//===----------------------------------------------------------------------===//
MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
- const TargetRegisterClass *TRC =
- TargetRegisterInfo::isVirtualRegister(RegNo)
- ? MRI->getRegClass(RegNo)
- : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo);
+ const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
if (TRC->hasType(T))
return T;
@@ -119,8 +117,7 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
return '$' + utostr(WAReg);
}
-WebAssemblyTargetStreamer *
-WebAssemblyAsmPrinter::getTargetStreamer() {
+WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
MCTargetStreamer *TS = OutStreamer->getTargetStreamer();
return static_cast<WebAssemblyTargetStreamer *>(TS);
}
@@ -128,16 +125,6 @@ WebAssemblyAsmPrinter::getTargetStreamer() {
//===----------------------------------------------------------------------===//
// WebAssemblyAsmPrinter Implementation.
//===----------------------------------------------------------------------===//
-
-void WebAssemblyAsmPrinter::EmitConstantPool() {
- assert(MF->getConstantPool()->getConstants().empty() &&
- "WebAssembly disables constant pools");
-}
-
-void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
- // Nothing to do; jump tables are incorporated into the instruction stream.
-}
-
static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
const DataLayout &DL(F.getParent()->getDataLayout());
@@ -154,6 +141,42 @@ static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
}
}
+void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ for (const auto &F : M) {
+ // Emit function type info for all undefined functions
+ if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
+ SmallVector<MVT, 4> SignatureVTs;
+ ComputeLegalValueVTs(F, TM, F.getReturnType(), SignatureVTs);
+ size_t NumResults = SignatureVTs.size();
+ if (SignatureVTs.size() > 1) {
+ // WebAssembly currently can't lower returns of multiple values without
+ // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
+ // replace multiple return values with a pointer parameter.
+ SignatureVTs.clear();
+ SignatureVTs.push_back(
+ MVT::getIntegerVT(M.getDataLayout().getPointerSizeInBits()));
+ NumResults = 0;
+ }
+
+ for (auto &Arg : F.args()) {
+ ComputeLegalValueVTs(F, TM, Arg.getType(), SignatureVTs);
+ }
+
+ getTargetStreamer()->emitIndirectFunctionType(F.getName(), SignatureVTs,
+ NumResults);
+ }
+ }
+}
+
+void WebAssemblyAsmPrinter::EmitConstantPool() {
+ assert(MF->getConstantPool()->getConstants().empty() &&
+ "WebAssembly disables constant pools");
+}
+
+void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+ // Nothing to do; jump tables are incorporated into the instruction stream.
+}
+
void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
if (!MFI->getParams().empty())
getTargetStreamer()->emitParam(MFI->getParams());
@@ -184,13 +207,6 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
LocalTypes.push_back(getRegType(VReg));
AnyWARegs = true;
}
- auto &PhysRegs = MFI->getPhysRegs();
- for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) {
- if (PhysRegs[PReg] == -1U)
- continue;
- LocalTypes.push_back(getRegType(PReg));
- AnyWARegs = true;
- }
if (AnyWARegs)
getTargetStreamer()->emitLocal(LocalTypes);
@@ -212,6 +228,30 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// These represent values which are live into the function entry, so there's
// no instruction to emit.
break;
+ case WebAssembly::FALLTHROUGH_RETURN_I32:
+ case WebAssembly::FALLTHROUGH_RETURN_I64:
+ case WebAssembly::FALLTHROUGH_RETURN_F32:
+ case WebAssembly::FALLTHROUGH_RETURN_F64: {
+ // These instructions represent the implicit return at the end of a
+ // function body. The operand is always a pop.
+ assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
+
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return: $pop" +
+ utostr(MFI->getWARegStackId(
+ MFI->getWAReg(MI->getOperand(0).getReg()))));
+ OutStreamer->AddBlankLine();
+ }
+ break;
+ }
+ case WebAssembly::FALLTHROUGH_RETURN_VOID:
+ // This instruction represents the implicit return at the end of a
+ // function body with no return value.
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return");
+ OutStreamer->AddBlankLine();
+ }
+ break;
default: {
WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
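With the FALLTHROUGH_RETURN handling above, no explicit return instruction is
printed at the end of such a function; in verbose assembly the printer only
leaves a comment naming the popped operand, along the lines of (the stack slot
number and the '#' comment string are illustrative):

    # fallthrough-return: $pop0

or just "# fallthrough-return" for a function with no return value.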
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index a39349c562fd..33166f5b554f 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -10,10 +10,10 @@
/// \file
/// \brief This file implements a CFG stacking pass.
///
-/// This pass reorders the blocks in a function to put them into a reverse
-/// post-order [0], with special care to keep the order as similar as possible
-/// to the original order, and to keep loops contiguous even in the case of
-/// split backedges.
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
///
/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
@@ -21,14 +21,13 @@
/// This is sufficient to convert arbitrary CFGs into a form that works on
/// WebAssembly, provided that all loops are single-entry.
///
-/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings
-///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -70,90 +69,6 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
-static void EliminateMultipleEntryLoops(MachineFunction &MF,
- const MachineLoopInfo &MLI) {
- SmallPtrSet<MachineBasicBlock *, 8> InSet;
- for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF);
- I != E; ++I) {
- const std::vector<MachineBasicBlock *> &CurrentSCC = *I;
-
- // Skip trivial SCCs.
- if (CurrentSCC.size() == 1)
- continue;
-
- InSet.insert(CurrentSCC.begin(), CurrentSCC.end());
- MachineBasicBlock *Header = nullptr;
- for (MachineBasicBlock *MBB : CurrentSCC) {
- for (MachineBasicBlock *Pred : MBB->predecessors()) {
- if (InSet.count(Pred))
- continue;
- if (!Header) {
- Header = MBB;
- break;
- }
- // TODO: Implement multiple-entry loops.
- report_fatal_error("multiple-entry loops are not supported yet");
- }
- }
- assert(MLI.isLoopHeader(Header));
-
- InSet.clear();
- }
-}
-
-namespace {
-/// Post-order traversal stack entry.
-struct POStackEntry {
- MachineBasicBlock *MBB;
- SmallVector<MachineBasicBlock *, 0> Succs;
-
- POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
- const MachineLoopInfo &MLI);
-};
-} // end anonymous namespace
-
-static bool LoopContains(const MachineLoop *Loop,
- const MachineBasicBlock *MBB) {
- return Loop ? Loop->contains(MBB) : true;
-}
-
-POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
- const MachineLoopInfo &MLI)
- : MBB(MBB), Succs(MBB->successors()) {
- // RPO is not a unique form, since at every basic block with multiple
- // successors, the DFS has to pick which order to visit the successors in.
- // Sort them strategically (see below).
- MachineLoop *Loop = MLI.getLoopFor(MBB);
- MachineFunction::iterator Next = next(MachineFunction::iterator(MBB));
- MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next;
- std::stable_sort(
- Succs.begin(), Succs.end(),
- [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) {
- if (A == B)
- return false;
-
- // Keep loops contiguous by preferring the block that's in the same
- // loop.
- bool LoopContainsA = LoopContains(Loop, A);
- bool LoopContainsB = LoopContains(Loop, B);
- if (LoopContainsA && !LoopContainsB)
- return true;
- if (!LoopContainsA && LoopContainsB)
- return false;
-
- // Minimize perturbation by preferring the block which is the immediate
- // layout successor.
- if (A == LayoutSucc)
- return true;
- if (B == LayoutSucc)
- return false;
-
- // TODO: More sophisticated orderings may be profitable here.
-
- return false;
- });
-}
-
/// Return the "bottom" block of a loop. This differs from
/// MachineLoop::getBottomBlock in that it works even if the loop is
/// discontiguous.
@@ -165,53 +80,166 @@ static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
return Bottom;
}
-/// Sort the blocks in RPO, taking special care to make sure that loops are
-/// contiguous even in the case of split backedges.
-///
-/// TODO: Determine whether RPO is actually worthwhile, or whether we should
-/// move to just a stable-topological-sort-based approach that would preserve
-/// more of the original order.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
- // Note that we do our own RPO rather than using
- // "llvm/ADT/PostOrderIterator.h" because we want control over the order that
- // successors are visited in (see above). Also, we can sort the blocks in the
- // MachineFunction as we go.
- SmallPtrSet<MachineBasicBlock *, 16> Visited;
- SmallVector<POStackEntry, 16> Stack;
-
- MachineBasicBlock *EntryBlock = &*MF.begin();
- Visited.insert(EntryBlock);
- Stack.push_back(POStackEntry(EntryBlock, MF, MLI));
-
- for (;;) {
- POStackEntry &Entry = Stack.back();
- SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs;
- if (!Succs.empty()) {
- MachineBasicBlock *Succ = Succs.pop_back_val();
- if (Visited.insert(Succ).second)
- Stack.push_back(POStackEntry(Succ, MF, MLI));
- continue;
- }
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+ bool AnyBarrier = false;
+#endif
+ bool AllAnalyzable = true;
+ for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+ AnyBarrier |= Term.isBarrier();
+#endif
+ AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+ }
+ assert((AnyBarrier || AllAnalyzable) &&
+ "AnalyzeBranch needs to analyze any block with a fallthrough");
+ if (AllAnalyzable)
+ MBB->updateTerminator();
+}
- // Put the block in its position in the MachineFunction.
- MachineBasicBlock &MBB = *Entry.MBB;
- MBB.moveBefore(&*MF.begin());
-
- // Branch instructions may utilize a fallthrough, so update them if a
- // fallthrough has been added or removed.
- if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() &&
- !MBB.back().isBarrier())
- report_fatal_error(
- "Non-branch terminator with fallthrough cannot yet be rewritten");
- if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch())
- MBB.updateTerminator();
-
- Stack.pop_back();
- if (Stack.empty())
- break;
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() > B->getNumber();
+ }
+};
+/// Sort blocks by their number in the opposite order.
+struct CompareBlockNumbersBackwards {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() < B->getNumber();
}
+};
+/// Bookkeeping for a loop, to help ensure that blocks not dominated by the
+/// loop's header are not placed among the loop's blocks.
+struct Entry {
+ const MachineLoop *Loop;
+ unsigned NumBlocksLeft;
+
+ /// List of blocks not dominated by Loop's header that are deferred until
+ /// after all of Loop's blocks have been seen.
+ std::vector<MachineBasicBlock *> Deferred;
+
+ explicit Entry(const MachineLoop *L)
+ : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+}
- // Now that we've sorted the blocks in RPO, renumber them.
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT) {
+ // Prepare for a topological sort: Record the number of predecessors each
+ // block has, ignoring loop backedges.
+ MF.RenumberBlocks();
+ SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned N = MBB.pred_size();
+ if (MachineLoop *L = MLI.getLoopFor(&MBB))
+ if (L->getHeader() == &MBB)
+ for (const MachineBasicBlock *Pred : MBB.predecessors())
+ if (L->contains(Pred))
+ --N;
+ NumPredsLeft[MBB.getNumber()] = N;
+ }
+
+ // Topological sort the CFG, with additional constraints:
+ // - Between a loop header and the last block in the loop, there can be
+ // no blocks not dominated by the loop header.
+ // - It's desirable to preserve the original block order when possible.
+ // We use two ready lists: Preferred and Ready. Preferred has recently
+ // processed successors, to help preserve block sequences from the original
+ // order. Ready has the remaining ready blocks.
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbers>
+ Preferred;
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbersBackwards>
+ Ready;
+ SmallVector<Entry, 4> Loops;
+ for (MachineBasicBlock *MBB = &MF.front();;) {
+ const MachineLoop *L = MLI.getLoopFor(MBB);
+ if (L) {
+ // If MBB is a loop header, add it to the active loop list. We can't place
+ // any blocks that it doesn't dominate until we see the end of the loop.
+ if (L->getHeader() == MBB)
+ Loops.push_back(Entry(L));
+ // For each active loop the block is in, decrement the count. If MBB is
+ // the last block in an active loop, take it off the list and pick up any
+ // blocks deferred because the header didn't dominate them.
+ for (Entry &E : Loops)
+ if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+ for (auto DeferredBlock : E.Deferred)
+ Ready.push(DeferredBlock);
+ while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+ Loops.pop_back();
+ }
+ // The main topological sort logic.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ // Ignore backedges.
+ if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+ if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+ continue;
+ // Decrement the predecessor count. If it's now zero, it's ready.
+ if (--NumPredsLeft[Succ->getNumber()] == 0)
+ Preferred.push(Succ);
+ }
+ // Determine the block to follow MBB. First try to find a preferred block,
+ // to preserve the original block order when possible.
+ MachineBasicBlock *Next = nullptr;
+ while (!Preferred.empty()) {
+ Next = Preferred.top();
+ Preferred.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ Next = nullptr;
+ continue;
+ }
+ // If Next was originally ordered before MBB, and it isn't because it was
+ // loop-rotated above the header, it's not preferred.
+ if (Next->getNumber() < MBB->getNumber() &&
+ (!L || !L->contains(Next) ||
+ L->getHeader()->getNumber() < Next->getNumber())) {
+ Ready.push(Next);
+ Next = nullptr;
+ continue;
+ }
+ break;
+ }
+ // If we didn't find a suitable block in the Preferred list, check the
+ // general Ready list.
+ if (!Next) {
+ // If there are no more blocks to process, we're done.
+ if (Ready.empty()) {
+ MaybeUpdateTerminator(MBB);
+ break;
+ }
+ for (;;) {
+ Next = Ready.top();
+ Ready.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ continue;
+ }
+ break;
+ }
+ }
+ // Move the next block into place and iterate.
+ Next->moveAfter(MBB);
+ MaybeUpdateTerminator(MBB);
+ MBB = Next;
+ }
+ assert(Loops.empty() && "Active loop list not finished");
MF.RenumberBlocks();
#ifndef NDEBUG
@@ -266,12 +294,26 @@ static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
return false;
}
+/// Test whether MI is a child of some other node in an expression tree.
+static bool IsChild(const MachineInstr &MI,
+ const WebAssemblyFunctionInfo &MFI) {
+ if (MI.getNumOperands() == 0)
+ return false;
+ const MachineOperand &MO = MI.getOperand(0);
+ if (!MO.isReg() || MO.isImplicit() || !MO.isDef())
+ return false;
+ unsigned Reg = MO.getReg();
+ return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MFI.isVRegStackified(Reg);
+}
+
/// Insert a BLOCK marker for branches to MBB (if needed).
static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF,
SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
const WebAssemblyInstrInfo &TII,
const MachineLoopInfo &MLI,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
// First compute the nearest common dominator of all forward non-fallthrough
// predecessors so that we minimize the time that the BLOCK is on the stack,
// which reduces overall stack height.
@@ -319,14 +361,15 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF,
MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
// Header is the header of a loop that does not lexically contain MBB, so
- // the BLOCK needs to be above the LOOP.
+ // the BLOCK needs to be above the LOOP, after any END constructs.
InsertPos = Header->begin();
+ while (InsertPos->getOpcode() != WebAssembly::LOOP)
+ ++InsertPos;
} else {
// Otherwise, insert the BLOCK as late in Header as we can, but before the
// beginning of the local expression tree and any nested BLOCKs.
InsertPos = Header->getFirstTerminator();
- while (InsertPos != Header->begin() &&
- prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) &&
+ while (InsertPos != Header->begin() && IsChild(*prev(InsertPos), MFI) &&
prev(InsertPos)->getOpcode() != WebAssembly::LOOP &&
prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK &&
prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP)
@@ -388,7 +431,7 @@ static void PlaceLoopMarker(
assert((!ScopeTops[AfterLoop->getNumber()] ||
ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
- "With RPO we should visit the outer-most loop for a block first.");
+ "With block sorting the outermost loop for a block should be first.");
if (!ScopeTops[AfterLoop->getNumber()])
ScopeTops[AfterLoop->getNumber()] = &MBB;
}
@@ -409,7 +452,8 @@ GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
/// Insert LOOP and BLOCK markers at appropriate places.
static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyInstrInfo &TII,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
// For each block whose label represents the end of a scope, record the block
// which holds the beginning of the scope. This will allow us to quickly skip
// over scoped regions when walking blocks. We allocate one more than the
@@ -425,7 +469,7 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI);
// Place the BLOCK for MBB if MBB is branched to from above.
- PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT);
+ PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT, MFI);
}
// Now rewrite references to basic blocks to be depth immediates.
@@ -478,16 +522,14 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
auto &MDT = getAnalysis<MachineDominatorTree>();
// Liveness is not tracked for EXPR_STACK physreg.
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MF.getRegInfo().invalidateLiveness();
- // RPO sorting needs all loops to be single-entry.
- EliminateMultipleEntryLoops(MF, MLI);
-
- // Sort the blocks in RPO, with contiguous loops.
- SortBlocks(MF, MLI);
+ // Sort the blocks, with contiguous loops.
+ SortBlocks(MF, MLI, MDT);
// Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
- PlaceMarkers(MF, MLI, TII, MDT);
+ PlaceMarkers(MF, MLI, TII, MDT, MFI);
return true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 1b761b1a9d73..7bfa4074849b 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -12,10 +12,13 @@
/// class. Some of the target-specific code is generated by tablegen in the file
/// WebAssemblyGenFastISel.inc, which is #included here.
///
+/// TODO: kill flags
+///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
@@ -41,13 +44,122 @@ using namespace llvm;
namespace {
class WebAssemblyFastISel final : public FastISel {
+ // All possible address modes.
+ class Address {
+ public:
+ typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+ private:
+ BaseKind Kind;
+ union {
+ unsigned Reg;
+ int FI;
+ } Base;
+
+ int64_t Offset;
+
+ const GlobalValue *GV;
+
+ public:
+ // Innocuous defaults for our address.
+ Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+ void setKind(BaseKind K) { Kind = K; }
+ BaseKind getKind() const { return Kind; }
+ bool isRegBase() const { return Kind == RegBase; }
+ bool isFIBase() const { return Kind == FrameIndexBase; }
+ void setReg(unsigned Reg) {
+ assert(isRegBase() && "Invalid base register access!");
+ Base.Reg = Reg;
+ }
+ unsigned getReg() const {
+ assert(isRegBase() && "Invalid base register access!");
+ return Base.Reg;
+ }
+ void setFI(unsigned FI) {
+ assert(isFIBase() && "Invalid base frame index access!");
+ Base.FI = FI;
+ }
+ unsigned getFI() const {
+ assert(isFIBase() && "Invalid base frame index access!");
+ return Base.FI;
+ }
+
+ void setOffset(int64_t Offset_) { Offset = Offset_; }
+ int64_t getOffset() const { return Offset; }
+ void setGlobalValue(const GlobalValue *G) { GV = G; }
+ const GlobalValue *getGlobalValue() const { return GV; }
+ };
+
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
LLVMContext *Context;
- // Call handling routines.
private:
+ // Utility helper routines
+ MVT::SimpleValueType getSimpleType(Type *Ty) {
+ EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ return VT.isSimple() ? VT.getSimpleVT().SimpleTy :
+ MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+ MVT::SimpleValueType getLegalType(MVT::SimpleValueType VT) {
+ switch (VT) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ return MVT::i32;
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ return VT;
+ default:
+ break;
+ }
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+ bool computeAddress(const Value *Obj, Address &Addr);
+ void materializeLoadStoreOperands(Address &Addr);
+ void addLoadStoreOperands(const Address &Addr, const MachineInstrBuilder &MIB,
+ MachineMemOperand *MMO);
+ unsigned maskI1Value(unsigned Reg, const Value *V);
+ unsigned getRegForI1Value(const Value *V, bool &Not);
+ unsigned zeroExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From);
+ unsigned signExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From);
+ unsigned zeroExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To);
+ unsigned signExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To);
+ unsigned getRegForUnsignedValue(const Value *V);
+ unsigned getRegForSignedValue(const Value *V);
+ unsigned getRegForPromotedValue(const Value *V, bool IsSigned);
+ unsigned notValue(unsigned Reg);
+ unsigned copyValue(unsigned Reg);
+
+ // Backend specific FastISel code.
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ bool fastLowerArguments() override;
+
+ // Selection routines.
+ bool selectCall(const Instruction *I);
+ bool selectSelect(const Instruction *I);
+ bool selectTrunc(const Instruction *I);
+ bool selectZExt(const Instruction *I);
+ bool selectSExt(const Instruction *I);
+ bool selectICmp(const Instruction *I);
+ bool selectFCmp(const Instruction *I);
+ bool selectBitCast(const Instruction *I);
+ bool selectLoad(const Instruction *I);
+ bool selectStore(const Instruction *I);
+ bool selectBr(const Instruction *I);
+ bool selectRet(const Instruction *I);
+ bool selectUnreachable(const Instruction *I);
+
public:
// Backend specific FastISel code.
WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo,
@@ -64,11 +176,1001 @@ public:
} // end anonymous namespace
+bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
+ if (Addr.getGlobalValue())
+ return false;
+ Addr.setGlobalValue(GV);
+ return true;
+ }
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return computeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
+ // An unscaled add of a register. Set it as the new base.
+ Addr.setReg(getRegForValue(Op));
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (computeAddress(U->getOperand(0), Addr))
+ return true;
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+ return computeAddress(LHS, Addr);
+ }
+
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr) && computeAddress(RHS, Addr))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+ return computeAddress(LHS, Addr);
+ }
+ break;
+ }
+ }
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.getReg() != 0;
+}
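The GetElementPtr case above folds constant struct-field and array indices into a single byte offset, leaving at most one non-constant base register. A standalone sketch of that arithmetic; the struct layout and sizes below are assumptions picked for illustration rather than values queried from DataLayout.

#include <cstdint>
#include <cstdio>

// Fold the constant indices of a GEP-like access into one byte offset.
// Assumed layout for: struct S { int32_t a; int64_t b; int32_t c[4]; }
int main() {
  const int64_t SizeOfS = 24;               // assumed alloc size of S
  const int64_t FieldOffset[] = {0, 8, 16}; // assumed offsets of a, b, c
  const int64_t Int32Size = 4;

  // Equivalent of: getelementptr %S, %S* %p, i64 2, i32 2, i64 3
  int64_t Offset = 0;
  Offset += 2 * SizeOfS;    // outer index scales by the alloc size of S
  Offset += FieldOffset[2]; // field 'c' (a struct index uses StructLayout)
  Offset += 3 * Int32Size;  // element 3 of the i32 array
  std::printf("folded offset = %lld bytes\n", (long long)Offset); // 76
  return 0;
}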
+
+void WebAssemblyFastISel::materializeLoadStoreOperands(Address &Addr) {
+ if (Addr.isRegBase()) {
+ unsigned Reg = Addr.getReg();
+ if (Reg == 0) {
+ Reg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::CONST_I64 :
+ WebAssembly::CONST_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), Reg)
+ .addImm(0);
+ Addr.setReg(Reg);
+ }
+ }
+}
+
+void WebAssemblyFastISel::addLoadStoreOperands(const Address &Addr,
+ const MachineInstrBuilder &MIB,
+ MachineMemOperand *MMO) {
+ if (const GlobalValue *GV = Addr.getGlobalValue())
+ MIB.addGlobalAddress(GV, Addr.getOffset());
+ else
+ MIB.addImm(Addr.getOffset());
+
+ if (Addr.isRegBase())
+ MIB.addReg(Addr.getReg());
+ else
+ MIB.addFrameIndex(Addr.getFI());
+
+ // Set the alignment operand (this is rewritten in SetP2AlignOperands).
+ // TODO: Disable SetP2AlignOperands for FastISel and just do it here.
+ MIB.addImm(0);
+
+ MIB.addMemOperand(MMO);
+}
+
+unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
+ return zeroExtendToI32(Reg, V, MVT::i1);
+}
+
+unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
+ if (const ICmpInst *ICmp = dyn_cast<ICmpInst>(V))
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(ICmp->getOperand(1)))
+ if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) {
+ Not = ICmp->isTrueWhenEqual();
+ return getRegForValue(ICmp->getOperand(0));
+ }
+
+ if (BinaryOperator::isNot(V)) {
+ Not = true;
+ return getRegForValue(BinaryOperator::getNotArgument(V));
+ }
+
+ Not = false;
+ return maskI1Value(getRegForValue(V), V);
+}
+
+unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From) {
+ switch (From) {
+ case MVT::i1:
+ // If the value is naturally an i1, we don't need to mask it.
+ // TODO: Recursively examine selects, phis, and, or, xor, constants.
+ if (From == MVT::i1 && V != nullptr) {
+ if (isa<CmpInst>(V) ||
+ (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr()))
+ return copyValue(Reg);
+ }
+ case MVT::i8:
+ case MVT::i16:
+ break;
+ case MVT::i32:
+ return copyValue(Reg);
+ default:
+ return 0;
+ }
+
+ unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::CONST_I32), Imm)
+ .addImm(~(~uint64_t(0) << MVT(From).getSizeInBits()));
+
+ unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::AND_I32), Result)
+ .addReg(Reg)
+ .addReg(Imm);
+
+ return Result;
+}
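The constant built above, ~(~uint64_t(0) << Bits), is the all-ones mask of the narrow type, and the AND_I32 implements the zero extension. A standalone check of the same arithmetic (the helper name here is made up for the sketch):

#include <cassert>
#include <cstdint>

// Zero-extend the low 'Bits' bits of a value held in a 32-bit register by
// ANDing with an all-ones mask, mirroring the CONST_I32 + AND_I32 pair.
static uint32_t zextToI32(uint32_t Reg, unsigned Bits) {
  uint32_t Mask = ~(~uint64_t(0) << Bits);
  return Reg & Mask;
}

int main() {
  assert(zextToI32(0xFFFFFFFFu, 8) == 0xFFu);    // i8  -> i32
  assert(zextToI32(0xFFFF8001u, 16) == 0x8001u); // i16 -> i32
  assert(zextToI32(0x3u, 1) == 0x1u);            // i1  -> i32
  return 0;
}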
+
+unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From) {
+ switch (From) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ break;
+ case MVT::i32:
+ return copyValue(Reg);
+ default:
+ return 0;
+ }
+
+ unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::CONST_I32), Imm)
+ .addImm(32 - MVT(From).getSizeInBits());
+
+ unsigned Left = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::SHL_I32), Left)
+ .addReg(Reg)
+ .addReg(Imm);
+
+ unsigned Right = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::SHR_S_I32), Right)
+ .addReg(Left)
+ .addReg(Imm);
+
+ return Right;
+}
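For i8 and i16 values, the sign extension above is a shift-left / arithmetic-shift-right pair by (32 - Bits). The same arithmetic as a standalone sketch, assuming arithmetic right shift of signed values, which holds on all mainstream C++ implementations:

#include <cassert>
#include <cstdint>

// Sign-extend the low 'Bits' bits of a 32-bit register value,
// mirroring the CONST_I32 + SHL_I32 + SHR_S_I32 sequence.
static int32_t sextToI32(uint32_t Reg, unsigned Bits) {
  unsigned Shift = 32 - Bits;
  return (int32_t)(Reg << Shift) >> Shift;
}

int main() {
  assert(sextToI32(0x80u, 8) == -128);      // i8  0x80   -> -128
  assert(sextToI32(0x7Fu, 8) == 127);       // i8  0x7F   ->  127
  assert(sextToI32(0x8000u, 16) == -32768); // i16 0x8000 -> -32768
  return 0;
}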
+
+unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To) {
+ if (To == MVT::i64) {
+ if (From == MVT::i64)
+ return copyValue(Reg);
+
+ Reg = zeroExtendToI32(Reg, V, From);
+
+ unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I64_EXTEND_U_I32), Result)
+ .addReg(Reg);
+ return Result;
+ }
+
+ return zeroExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To) {
+ if (To == MVT::i64) {
+ if (From == MVT::i64)
+ return copyValue(Reg);
+
+ Reg = signExtendToI32(Reg, V, From);
+
+ unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I64_EXTEND_S_I32), Result)
+ .addReg(Reg);
+ return Result;
+ }
+
+ return signExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
+ MVT::SimpleValueType From = getSimpleType(V->getType());
+ MVT::SimpleValueType To = getLegalType(From);
+ return zeroExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
+ MVT::SimpleValueType From = getSimpleType(V->getType());
+ MVT::SimpleValueType To = getLegalType(From);
+ return signExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
+ bool IsSigned) {
+ return IsSigned ? getRegForSignedValue(V) :
+ getRegForUnsignedValue(V);
+}
+
+unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
+ assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass);
+
+ unsigned NotReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::EQZ_I32), NotReg)
+ .addReg(Reg);
+ return NotReg;
+}
+
+unsigned WebAssemblyFastISel::copyValue(unsigned Reg) {
+ unsigned ResultReg = createResultReg(MRI.getRegClass(Reg));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::COPY), ResultReg)
+ .addReg(Reg);
+ return ResultReg;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::COPY_LOCAL_I64 :
+ WebAssembly::COPY_LOCAL_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addFrameIndex(SI->second);
+ return ResultReg;
+ }
+
+ return 0;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
+ unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::CONST_I64 :
+ WebAssembly::CONST_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addGlobalAddress(GV);
+ return ResultReg;
+ }
+
+ // Let target-independent code handle it.
+ return 0;
+}
+
+bool WebAssemblyFastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ unsigned i = 0;
+ for (auto const &Arg : F->args()) {
+ const AttributeSet &Attrs = F->getAttributes();
+ if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+ Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+ Attrs.hasAttribute(i+1, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(ArgTy)) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = WebAssembly::ARGUMENT_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::ARGUMENT_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::ARGUMENT_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::ARGUMENT_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(i);
+ updateValueMap(&Arg, ResultReg);
+
+ ++i;
+ }
+
+ MRI.addLiveIn(WebAssembly::ARGUMENTS);
+
+ auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
+ for (auto const &Arg : F->args())
+ MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+
+ return true;
+}
+
+bool WebAssemblyFastISel::selectCall(const Instruction *I) {
+ const CallInst *Call = cast<CallInst>(I);
+
+ if (Call->isMustTailCall() || Call->isInlineAsm() ||
+ Call->getFunctionType()->isVarArg())
+ return false;
+
+ Function *Func = Call->getCalledFunction();
+ if (Func && Func->isIntrinsic())
+ return false;
+
+ FunctionType *FuncTy = Call->getFunctionType();
+ unsigned Opc;
+ bool IsDirect = Func != nullptr;
+ bool IsVoid = FuncTy->getReturnType()->isVoidTy();
+ unsigned ResultReg;
+ if (IsVoid) {
+ Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::CALL_INDIRECT_VOID;
+ } else {
+ MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
+ switch (RetTy) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::CALL_INDIRECT_I32;
+ ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ break;
+ case MVT::i64:
+ Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::CALL_INDIRECT_I64;
+ ResultReg = createResultReg(&WebAssembly::I64RegClass);
+ break;
+ case MVT::f32:
+ Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::CALL_INDIRECT_F32;
+ ResultReg = createResultReg(&WebAssembly::F32RegClass);
+ break;
+ case MVT::f64:
+ Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::CALL_INDIRECT_F64;
+ ResultReg = createResultReg(&WebAssembly::F64RegClass);
+ break;
+ default:
+ return false;
+ }
+ }
+
+ SmallVector<unsigned, 8> Args;
+ for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) {
+ Value *V = Call->getArgOperand(i);
+ MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
+ if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return false;
+
+ const AttributeSet &Attrs = Call->getAttributes();
+ if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+ Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+ Attrs.hasAttribute(i+1, Attribute::Nest))
+ return false;
+
+ unsigned Reg;
+
+ if (Attrs.hasAttribute(i+1, Attribute::SExt))
+ Reg = getRegForSignedValue(V);
+ else if (Attrs.hasAttribute(i+1, Attribute::ZExt))
+ Reg = getRegForUnsignedValue(V);
+ else
+ Reg = getRegForValue(V);
+
+ if (Reg == 0)
+ return false;
+
+ Args.push_back(Reg);
+ }
+
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+
+ if (!IsVoid)
+ MIB.addReg(ResultReg, RegState::Define);
+
+ if (IsDirect)
+ MIB.addGlobalAddress(Func);
+ else
+ MIB.addReg(getRegForValue(Call->getCalledValue()));
+
+ for (unsigned ArgReg : Args)
+ MIB.addReg(ArgReg);
+
+ if (!IsVoid)
+ updateValueMap(Call, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
+ const SelectInst *Select = cast<SelectInst>(I);
+
+ bool Not;
+ unsigned CondReg = getRegForI1Value(Select->getCondition(), Not);
+ if (CondReg == 0)
+ return false;
+
+ unsigned TrueReg = getRegForValue(Select->getTrueValue());
+ if (TrueReg == 0)
+ return false;
+
+ unsigned FalseReg = getRegForValue(Select->getFalseValue());
+ if (FalseReg == 0)
+ return false;
+
+ if (Not)
+ std::swap(TrueReg, FalseReg);
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(Select->getType())) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = WebAssembly::SELECT_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::SELECT_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::SELECT_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::SELECT_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .addReg(CondReg);
+
+ updateValueMap(Select, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
+ const TruncInst *Trunc = cast<TruncInst>(I);
+
+ unsigned Reg = getRegForValue(Trunc->getOperand(0));
+ if (Reg == 0)
+ return false;
+
+ if (Trunc->getOperand(0)->getType()->isIntegerTy(64)) {
+ unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I32_WRAP_I64), Result)
+ .addReg(Reg);
+ Reg = Result;
+ }
+
+ updateValueMap(Trunc, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
+ const ZExtInst *ZExt = cast<ZExtInst>(I);
+
+ const Value *Op = ZExt->getOperand(0);
+ MVT::SimpleValueType From = getSimpleType(Op->getType());
+ MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType()));
+ unsigned Reg = zeroExtend(getRegForValue(Op), Op, From, To);
+ if (Reg == 0)
+ return false;
+
+ updateValueMap(ZExt, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
+ const SExtInst *SExt = cast<SExtInst>(I);
+
+ const Value *Op = SExt->getOperand(0);
+ MVT::SimpleValueType From = getSimpleType(Op->getType());
+ MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType()));
+ unsigned Reg = signExtend(getRegForValue(Op), Op, From, To);
+ if (Reg == 0)
+ return false;
+
+ updateValueMap(SExt, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
+ const ICmpInst *ICmp = cast<ICmpInst>(I);
+
+ bool I32 = getSimpleType(ICmp->getOperand(0)->getType()) != MVT::i64;
+ unsigned Opc;
+ bool isSigned = false;
+ switch (ICmp->getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ Opc = I32 ? WebAssembly::EQ_I32 : WebAssembly::EQ_I64;
+ break;
+ case ICmpInst::ICMP_NE:
+ Opc = I32 ? WebAssembly::NE_I32 : WebAssembly::NE_I64;
+ break;
+ case ICmpInst::ICMP_UGT:
+ Opc = I32 ? WebAssembly::GT_U_I32 : WebAssembly::GT_U_I64;
+ break;
+ case ICmpInst::ICMP_UGE:
+ Opc = I32 ? WebAssembly::GE_U_I32 : WebAssembly::GE_U_I64;
+ break;
+ case ICmpInst::ICMP_ULT:
+ Opc = I32 ? WebAssembly::LT_U_I32 : WebAssembly::LT_U_I64;
+ break;
+ case ICmpInst::ICMP_ULE:
+ Opc = I32 ? WebAssembly::LE_U_I32 : WebAssembly::LE_U_I64;
+ break;
+ case ICmpInst::ICMP_SGT:
+ Opc = I32 ? WebAssembly::GT_S_I32 : WebAssembly::GT_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SGE:
+ Opc = I32 ? WebAssembly::GE_S_I32 : WebAssembly::GE_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SLT:
+ Opc = I32 ? WebAssembly::LT_S_I32 : WebAssembly::LT_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SLE:
+ Opc = I32 ? WebAssembly::LE_S_I32 : WebAssembly::LE_S_I64;
+ isSigned = true;
+ break;
+ default: return false;
+ }
+
+ unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), isSigned);
+ if (LHS == 0)
+ return false;
+
+ unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), isSigned);
+ if (RHS == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(LHS)
+ .addReg(RHS);
+ updateValueMap(ICmp, ResultReg);
+ return true;
+}
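The reason getRegForPromotedValue above takes the predicate's signedness: an i8 operand such as 0xFF must be zero-extended for unsigned predicates but sign-extended for signed ones, and the two give different i32 comparison results. A minimal standalone illustration:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t A = 0xFF; // i8 operands of an icmp
  uint8_t B = 0x01;

  // Zero-extend for unsigned predicates (e.g. ICMP_ULT).
  uint32_t UA = A, UB = B;
  std::printf("ult: %d\n", UA < UB); // 0: 255 is not < 1

  // Sign-extend for signed predicates (e.g. ICMP_SLT).
  int32_t SA = (int8_t)A, SB = (int8_t)B;
  std::printf("slt: %d\n", SA < SB); // 1: -1 < 1
  return 0;
}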
+
+bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
+ const FCmpInst *FCmp = cast<FCmpInst>(I);
+
+ unsigned LHS = getRegForValue(FCmp->getOperand(0));
+ if (LHS == 0)
+ return false;
+
+ unsigned RHS = getRegForValue(FCmp->getOperand(1));
+ if (RHS == 0)
+ return false;
+
+ bool F32 = getSimpleType(FCmp->getOperand(0)->getType()) != MVT::f64;
+ unsigned Opc;
+ bool Not = false;
+ switch (FCmp->getPredicate()) {
+ case FCmpInst::FCMP_OEQ:
+ Opc = F32 ? WebAssembly::EQ_F32 : WebAssembly::EQ_F64;
+ break;
+ case FCmpInst::FCMP_UNE:
+ Opc = F32 ? WebAssembly::NE_F32 : WebAssembly::NE_F64;
+ break;
+ case FCmpInst::FCMP_OGT:
+ Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+ break;
+ case FCmpInst::FCMP_OGE:
+ Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+ break;
+ case FCmpInst::FCMP_OLT:
+ Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+ break;
+ case FCmpInst::FCMP_OLE:
+ Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+ break;
+ case FCmpInst::FCMP_UGT:
+ Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_UGE:
+ Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_ULT:
+ Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_ULE:
+ Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+ Not = true;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(LHS)
+ .addReg(RHS);
+
+ if (Not)
+ ResultReg = notValue(ResultReg);
+
+ updateValueMap(FCmp, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
+ // Target-independent code can handle this, except it doesn't set the dead
+ // flag on the ARGUMENTS clobber, so we have to do that manually in order
+ // to satisfy code that expects this of isBitcast() instructions.
+ EVT VT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT RetVT = TLI.getValueType(DL, I->getType());
+ if (!VT.isSimple() || !RetVT.isSimple())
+ return false;
+
+ if (VT == RetVT) {
+ // No-op bitcast.
+ updateValueMap(I, getRegForValue(I->getOperand(0)));
+ return true;
+ }
+
+ unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
+ getRegForValue(I->getOperand(0)),
+ I->getOperand(0)->hasOneUse());
+ if (!Reg)
+ return false;
+ MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
+ --Iter;
+ assert(Iter->isBitcast());
+ Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI);
+ updateValueMap(I, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
+ const LoadInst *Load = cast<LoadInst>(I);
+ if (Load->isAtomic())
+ return false;
+
+ Address Addr;
+ if (!computeAddress(Load->getPointerOperand(), Addr))
+ return false;
+
+ // TODO: Fold a following sign-/zero-extend into the load instruction.
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(Load->getType())) {
+ case MVT::i1:
+ case MVT::i8:
+ Opc = WebAssembly::LOAD8_U_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i16:
+ Opc = WebAssembly::LOAD16_U_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i32:
+ Opc = WebAssembly::LOAD_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::LOAD_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::LOAD_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::LOAD_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+
+ materializeLoadStoreOperands(Addr);
+
+ unsigned ResultReg = createResultReg(RC);
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Load));
+
+ updateValueMap(Load, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectStore(const Instruction *I) {
+ const StoreInst *Store = cast<StoreInst>(I);
+ if (Store->isAtomic())
+ return false;
+
+ Address Addr;
+ if (!computeAddress(Store->getPointerOperand(), Addr))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool VTIsi1 = false;
+ switch (getSimpleType(Store->getValueOperand()->getType())) {
+ case MVT::i1:
+ VTIsi1 = true;
+ case MVT::i8:
+ Opc = WebAssembly::STORE8_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i16:
+ Opc = WebAssembly::STORE16_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i32:
+ Opc = WebAssembly::STORE_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::STORE_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::STORE_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::STORE_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default: return false;
+ }
+
+ materializeLoadStoreOperands(Addr);
+
+ unsigned ValueReg = getRegForValue(Store->getValueOperand());
+ if (VTIsi1)
+ ValueReg = maskI1Value(ValueReg, Store->getValueOperand());
+
+ unsigned ResultReg = createResultReg(RC);
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Store));
+
+ MIB.addReg(ValueReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectBr(const Instruction *I) {
+ const BranchInst *Br = cast<BranchInst>(I);
+ if (Br->isUnconditional()) {
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[Br->getSuccessor(0)];
+ fastEmitBranch(MSucc, Br->getDebugLoc());
+ return true;
+ }
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[Br->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
+
+ bool Not;
+ unsigned CondReg = getRegForI1Value(Br->getCondition(), Not);
+
+ unsigned Opc = WebAssembly::BR_IF;
+ if (Not)
+ Opc = WebAssembly::BR_UNLESS;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addMBB(TBB)
+ .addReg(CondReg);
+
+ finishCondBranch(Br->getParent(), TBB, FBB);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectRet(const Instruction *I) {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+
+ if (Ret->getNumOperands() == 0) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::RETURN_VOID));
+ return true;
+ }
+
+ Value *RV = Ret->getOperand(0);
+ unsigned Opc;
+ switch (getSimpleType(RV->getType())) {
+ case MVT::i1: case MVT::i8:
+ case MVT::i16: case MVT::i32:
+ Opc = WebAssembly::RETURN_I32;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::RETURN_I64;
+ break;
+ case MVT::f32: Opc = WebAssembly::RETURN_F32; break;
+ case MVT::f64: Opc = WebAssembly::RETURN_F64; break;
+ default: return false;
+ }
+
+ unsigned Reg;
+ if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::SExt))
+ Reg = getRegForSignedValue(RV);
+ else if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::ZExt))
+ Reg = getRegForUnsignedValue(RV);
+ else
+ Reg = getRegForValue(RV);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectUnreachable(const Instruction *I) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::UNREACHABLE));
+ return true;
+}
+
bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
- default:
+ case Instruction::Call:
+ if (selectCall(I))
+ return true;
break;
- // TODO: add fast-isel selection cases here...
+ case Instruction::Select: return selectSelect(I);
+ case Instruction::Trunc: return selectTrunc(I);
+ case Instruction::ZExt: return selectZExt(I);
+ case Instruction::SExt: return selectSExt(I);
+ case Instruction::ICmp: return selectICmp(I);
+ case Instruction::FCmp: return selectFCmp(I);
+ case Instruction::BitCast: return selectBitCast(I);
+ case Instruction::Load: return selectLoad(I);
+ case Instruction::Store: return selectStore(I);
+ case Instruction::Br: return selectBr(I);
+ case Instruction::Ret: return selectRet(I);
+ case Instruction::Unreachable: return selectUnreachable(I);
+ default: break;
}
// Fall back to target-independent instruction selection.
diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
new file mode 100644
index 000000000000..5dc90920e310
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -0,0 +1,296 @@
+//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that transforms irreducible control flow
+/// into reducible control flow. Irreducible control flow means multiple-entry
+/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
+/// due to being unnatural.
+///
+/// Note that LLVM has a generic pass that lowers irreducible control flow, but
+/// it linearizes control flow, turning diamonds into two triangles, which is
+/// both unnecessary and undesirable for WebAssembly.
+///
+/// TODO: The transformation implemented here handles all irreducible control
+/// flow, without exponential code-size expansion, though it does so by creating
+/// inefficient code in many cases. Ideally, we should add other
+/// transformations, including code-duplicating cases, which can be more
+/// efficient in common cases, and they can fall back to this conservative
+/// implementation as needed.
+///
+//===----------------------------------------------------------------------===//
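Conceptually, the rewrite described above gives every block in the problematic cycle an index and routes all edges into the cycle through one dispatch block containing a br_table. A standalone sketch of the resulting shape, written as ordinary C++ purely for illustration; the block names, trip count, and indices are assumptions.

#include <cstdio>

// A two-entry cycle {B1, B2} made reducible: both original entry edges set an
// index and enter a single dispatch loop, which models the added BR_TABLE_I32.
int main(int argc, char **argv) {
  (void)argv;
  int Trips = 2;
  int Index = (argc > 1) ? 1 : 0; // the two original entry edges pick a target
  for (;;) {                      // single-entry dispatch loop
    switch (Index) {              // the dispatch block's jump table
    case 0: // B1
      std::printf("B1\n");
      Index = 1;                  // original edge B1 -> B2
      break;
    case 1: // B2
      std::printf("B2\n");
      if (--Trips == 0)
        return 0;                 // original exit edge out of the cycle
      Index = 0;                  // original edge B2 -> B1
      break;
    }
  }
}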
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
+
+namespace {
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Fix Irreducible Control Flow";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool VisitLoop(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
+ return new WebAssemblyFixIrreducibleControlFlow();
+}
+
+namespace {
+
+/// A utility for walking the blocks of a loop, handling a nested inner
+/// loop as a monolithic conceptual block.
+class MetaBlock {
+ MachineBasicBlock *Block;
+ SmallVector<MachineBasicBlock *, 2> Preds;
+ SmallVector<MachineBasicBlock *, 2> Succs;
+
+public:
+ explicit MetaBlock(MachineBasicBlock *MBB)
+ : Block(MBB), Preds(MBB->pred_begin(), MBB->pred_end()),
+ Succs(MBB->succ_begin(), MBB->succ_end()) {}
+
+ explicit MetaBlock(MachineLoop *Loop) : Block(Loop->getHeader()) {
+ Loop->getExitBlocks(Succs);
+ for (MachineBasicBlock *Pred : Block->predecessors())
+ if (!Loop->contains(Pred))
+ Preds.push_back(Pred);
+ }
+
+ MachineBasicBlock *getBlock() const { return Block; }
+
+ const SmallVectorImpl<MachineBasicBlock *> &predecessors() const {
+ return Preds;
+ }
+ const SmallVectorImpl<MachineBasicBlock *> &successors() const {
+ return Succs;
+ }
+
+ bool operator==(const MetaBlock &MBB) { return Block == MBB.Block; }
+ bool operator!=(const MetaBlock &MBB) { return Block != MBB.Block; }
+};
+
+class SuccessorList final : public MetaBlock {
+ size_t Index;
+ size_t Num;
+
+public:
+ explicit SuccessorList(MachineBasicBlock *MBB)
+ : MetaBlock(MBB), Index(0), Num(successors().size()) {}
+
+ explicit SuccessorList(MachineLoop *Loop)
+ : MetaBlock(Loop), Index(0), Num(successors().size()) {}
+
+ bool HasNext() const { return Index != Num; }
+
+ MachineBasicBlock *Next() {
+ assert(HasNext());
+ return successors()[Index++];
+ }
+};
+
+} // end anonymous namespace
+
+bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
+ MachineLoopInfo &MLI,
+ MachineLoop *Loop) {
+ MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
+ SetVector<MachineBasicBlock *> RewriteSuccs;
+
+ // DFS through Loop's body, looking for irreducible control flow. Loop is
+ // natural, we stay within its body, and we treat any nested loops
+ // monolithically, so any cycle we encounter indicates irreducibility.
+ SmallPtrSet<MachineBasicBlock *, 8> OnStack;
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<SuccessorList, 4> LoopWorklist;
+ LoopWorklist.push_back(SuccessorList(Header));
+ OnStack.insert(Header);
+ Visited.insert(Header);
+ while (!LoopWorklist.empty()) {
+ SuccessorList &Top = LoopWorklist.back();
+ if (Top.HasNext()) {
+ MachineBasicBlock *Next = Top.Next();
+ if (Next == Header || (Loop && !Loop->contains(Next)))
+ continue;
+ if (LLVM_LIKELY(OnStack.insert(Next).second)) {
+ if (!Visited.insert(Next).second) {
+ OnStack.erase(Next);
+ continue;
+ }
+ MachineLoop *InnerLoop = MLI.getLoopFor(Next);
+ if (InnerLoop != Loop)
+ LoopWorklist.push_back(SuccessorList(InnerLoop));
+ else
+ LoopWorklist.push_back(SuccessorList(Next));
+ } else {
+ RewriteSuccs.insert(Top.getBlock());
+ }
+ continue;
+ }
+ OnStack.erase(Top.getBlock());
+ LoopWorklist.pop_back();
+ }
+
+ // Most likely, we didn't find any irreducible control flow.
+ if (LLVM_LIKELY(RewriteSuccs.empty()))
+ return false;
+
+ DEBUG(dbgs() << "Irreducible control flow detected!\n");
+
+ // Ok. We have irreducible control flow! Create a dispatch block which will
+ // contain a jump table to any block in the problematic set of blocks.
+ MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), Dispatch);
+ MLI.changeLoopFor(Dispatch, Loop);
+
+ // Add the jump table.
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(),
+ TII.get(WebAssembly::BR_TABLE_I32));
+
+ // Add the register which will be used to tell the jump table which block to
+ // jump to.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ MIB.addReg(Reg);
+
+ // Collect all the blocks which need to have their successors rewritten,
+ // add the successors to the jump table, and remember their index.
+ DenseMap<MachineBasicBlock *, unsigned> Indices;
+ SmallVector<MachineBasicBlock *, 4> SuccWorklist(RewriteSuccs.begin(),
+ RewriteSuccs.end());
+ while (!SuccWorklist.empty()) {
+ MachineBasicBlock *MBB = SuccWorklist.pop_back_val();
+ auto Pair = Indices.insert(std::make_pair(MBB, 0));
+ if (!Pair.second)
+ continue;
+
+ unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
+ DEBUG(dbgs() << "MBB#" << MBB->getNumber() << " has index " << Index
+ << "\n");
+
+ Pair.first->second = Index;
+ for (auto Pred : MBB->predecessors())
+ RewriteSuccs.insert(Pred);
+
+ MIB.addMBB(MBB);
+ Dispatch->addSuccessor(MBB);
+
+ MetaBlock Meta(MBB);
+ for (auto *Succ : Meta.successors())
+ if (Succ != Header && (!Loop || Loop->contains(Succ)))
+ SuccWorklist.push_back(Succ);
+ }
+
+ // Rewrite the problematic successors for every block in RewriteSuccs.
+ // For simplicity, we just introduce a new block for every edge we need to
+ // rewrite. Fancier things are possible.
+ for (MachineBasicBlock *MBB : RewriteSuccs) {
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
+ for (auto *Succ : MBB->successors()) {
+ if (!Indices.count(Succ))
+ continue;
+
+ MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
+ MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
+ : MF.end(),
+ Split);
+ MLI.changeLoopFor(Split, Loop);
+
+ // Set the jump table's index register to the index of the block we wish
+ // to jump to, then branch to the dispatch block holding the jump table.
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32),
+ Reg)
+ .addImm(Indices[Succ]);
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(Dispatch);
+ Split->addSuccessor(Dispatch);
+ Map[Succ] = Split;
+ }
+ // Remap the terminator operands and the successor list.
+ for (MachineInstr &Term : MBB->terminators())
+ for (auto &Op : Term.explicit_uses())
+ if (Op.isMBB() && Indices.count(Op.getMBB()))
+ Op.setMBB(Map[Op.getMBB()]);
+ for (auto Rewrite : Map)
+ MBB->replaceSuccessor(Rewrite.first, Rewrite.second);
+ }
+
+ // Create a fake default label, because br_table requires one.
+ MIB.addMBB(MIB.getInstr()
+ ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
+ .getMBB());
+
+ return true;
+}
+
+bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
+ MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ bool Changed = false;
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+
+ // Visit the function body, which is identified as a null loop.
+ Changed |= VisitLoop(MF, MLI, nullptr);
+
+ // Visit all the loops.
+ SmallVector<MachineLoop *, 8> Worklist(MLI.begin(), MLI.end());
+ while (!Worklist.empty()) {
+ MachineLoop *CurLoop = Worklist.pop_back_val();
+ Worklist.append(CurLoop->begin(), CurLoop->end());
+ Changed |= VisitLoop(MF, MLI, CurLoop);
+ }
+
+ // If we made any changes, completely recompute everything.
+ if (LLVM_UNLIKELY(Changed)) {
+ DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+ MF.getRegInfo().invalidateLiveness();
+ MF.RenumberBlocks();
+ getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+ MLI.runOnMachineFunction(MF);
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 0eefd57f1f2c..0a5782e5c287 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -34,10 +34,7 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-frame-info"
-// TODO: Implement a red zone?
// TODO: wasm64
-// TODO: Prolog/epilog should be stackified too. This pass runs after register
-// stackification, so we'll have to do it manually.
// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
/// Return true if the specified function should have a dedicated frame pointer
@@ -46,7 +43,7 @@ bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const auto *RegInfo =
MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+ return MFI->isFrameAddressTaken() || MFI->hasVarSizedObjects() ||
MFI->hasStackMap() || MFI->hasPatchPoint() ||
RegInfo->needsStackRealignment(MF);
}
@@ -62,63 +59,64 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame(
}
-/// Adjust the stack pointer by a constant amount.
-static void adjustStackPointer(unsigned StackSize,
- bool AdjustUp,
- MachineFunction& MF,
- MachineBasicBlock& MBB,
- const TargetInstrInfo* TII,
- MachineBasicBlock::iterator InsertPt,
- const DebugLoc& DL) {
- auto &MRI = MF.getRegInfo();
- unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg)
- .addExternalSymbol(SPSymbol);
- // This MachinePointerInfo should reference __stack_pointer as well but
- // doesn't because MachinePointerInfo() takes a GV which we don't have for
- // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry
- // is appropriate instead. (likewise for EmitEpologue below)
- auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOLoad, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(0)
- .addReg(SPReg)
- .addMemOperand(LoadMMO);
- // Add/Subtract the frame size
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addImm(StackSize);
- BuildMI(MBB, InsertPt, DL,
- TII->get(AdjustUp ? WebAssembly::ADD_I32 : WebAssembly::SUB_I32),
- WebAssembly::SP32)
- .addReg(SPReg)
- .addReg(OffsetReg);
- // The SP32 register now has the new stacktop. Also write it back to memory.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addExternalSymbol(SPSymbol);
- auto *MMO = new MachineMemOperand(MachinePointerInfo(),
+/// Returns true if this function needs a local user-space stack pointer.
+/// Unlike a machine stack pointer, the wasm user stack pointer is a global
+/// variable, so it is loaded into a register in the prolog.
+bool WebAssemblyFrameLowering::needsSP(const MachineFunction &MF,
+ const MachineFrameInfo &MFI) const {
+ return MFI.getStackSize() || MFI.adjustsStack() || hasFP(MF);
+}
+
+/// Returns true if the local user-space stack pointer needs to be written back
+/// to memory by this function (this is not meaningful if needsSP is false). If
+/// false, the stack red zone can be used and only a local SP is needed.
+bool WebAssemblyFrameLowering::needsSPWriteback(
+ const MachineFunction &MF, const MachineFrameInfo &MFI) const {
+ assert(needsSP(MF, MFI));
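+ // A writeback is needed when the frame is too large for the red zone, when
+ // the function makes calls (callees must see the updated stack pointer), or
+ // when the red zone is explicitly disabled via the noredzone attribute.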
+ return MFI.getStackSize() > RedZoneSize || MFI.hasCalls() ||
+ MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
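+/// Write SrcReg back to the __stack_pointer global. The address operand is
+/// materialized at InsertAddr and the store itself is emitted at InsertStore.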
+static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &InsertAddr,
+ MachineBasicBlock::iterator &InsertStore,
+ const DebugLoc &DL) {
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
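+ // The STORE_I32 below defines a result value; Drop just receives that value
+ // and is otherwise unused.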
+ unsigned Drop = MRI.createVirtualRegister(PtrRC);
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ auto *MMO = new MachineMemOperand(MachinePointerInfo(MF.getPSVManager()
+ .getExternalSymbolCallEntry(ES)),
MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
- .addImm(0)
- .addReg(OffsetReg)
- .addReg(WebAssembly::SP32)
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32), Drop)
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero)
+ .addImm(2) // p2align
+ .addReg(SrcReg)
.addMemOperand(MMO);
}
-void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator
+WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const auto *TII =
- static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo());
- DebugLoc DL = I->getDebugLoc();
- unsigned Opc = I->getOpcode();
- bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
- unsigned Amount = I->getOperand(0).getImm();
- if (Amount)
- adjustStackPointer(Amount, IsDestroy, MF, MBB,
- TII, I, DL);
- MBB.erase(I);
+ assert(!I->getOperand(0).getImm() && hasFP(MF) &&
+ "Call frame pseudos should only be used for dynamic stack adjustment");
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
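+ // The pseudo itself doesn't adjust anything here; if this is the
+ // frame-destroy pseudo and a writeback is required, publish SP32 back to
+ // the __stack_pointer global before erasing the pseudo.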
+ if (I->getOpcode() == TII->getCallFrameDestroyOpcode() &&
+ needsSPWriteback(MF, *MF.getFrameInfo())) {
+ DebugLoc DL = I->getDebugLoc();
+ writeSPToMemory(WebAssembly::SP32, MF, MBB, I, I, DL);
+ }
+ return MBB.erase(I);
}
void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
@@ -127,49 +125,91 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
auto *MFI = MF.getFrameInfo();
assert(MFI->getCalleeSavedInfo().empty() &&
"WebAssembly should not have callee-saved registers");
- assert(!hasFP(MF) && "Functions needing frame pointers not yet supported");
+
+ if (!needsSP(MF, *MFI)) return;
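+ // Below: load the incoming __stack_pointer value, subtract the frame size
+ // if any, establish FP if needed, and write the new SP back when required.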
uint64_t StackSize = MFI->getStackSize();
- if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0))
- return;
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
DebugLoc DL;
- adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL);
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+ unsigned SPReg = MRI.createVirtualRegister(PtrRC);
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(MF.getPSVManager()
+ .getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOLoad, 4, 4);
+ // Load the SP value.
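+ // With no fixed-size locals the loaded value becomes SP32 directly;
+ // otherwise it lands in a scratch register and SP32 is computed by the
+ // subtraction below.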
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32),
+ StackSize ? SPReg : (unsigned)WebAssembly::SP32)
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero) // addr
+ .addImm(2) // p2align
+ .addMemOperand(LoadMMO);
+
+ if (StackSize) {
+ // Subtract the frame size
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
+ WebAssembly::SP32)
+ .addReg(SPReg)
+ .addReg(OffsetReg);
+ }
+ if (hasFP(MF)) {
+ // Unlike most conventional targets (where FP points to the saved FP),
+ // FP points to the bottom of the fixed-size locals, so we can use positive
+ // offsets in load/store instructions.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY),
+ WebAssembly::FP32)
+ .addReg(WebAssembly::SP32);
+ }
+ if (StackSize && needsSPWriteback(MF, *MFI)) {
+ writeSPToMemory(WebAssembly::SP32, MF, MBB, InsertPt, InsertPt, DL);
+ }
}
void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- uint64_t StackSize = MF.getFrameInfo()->getStackSize();
- if (!StackSize)
- return;
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ auto *MFI = MF.getFrameInfo();
+ uint64_t StackSize = MFI->getStackSize();
+ if (!needsSP(MF, *MFI) || !needsSPWriteback(MF, *MFI)) return;
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
auto &MRI = MF.getRegInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
auto InsertPt = MBB.getFirstTerminator();
DebugLoc DL;
- if (InsertPt != MBB.end()) {
+ if (InsertPt != MBB.end())
DL = InsertPt->getDebugLoc();
+
+ // Restore the stack pointer. If we had fixed-size locals, add the offset
+ // subtracted in the prolog.
+ unsigned SPReg = 0;
+ MachineBasicBlock::iterator InsertAddr = InsertPt;
+ if (StackSize) {
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ InsertAddr =
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ // In the epilog we don't need to write the result back to the SP32 physreg
+ // because it won't be used again. We can use a stackified register instead.
+ SPReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
+ .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
+ .addReg(OffsetReg);
+ } else {
+ SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
}
- // Restore the stack pointer. Without FP its value is just SP32 - stacksize
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addImm(StackSize);
- auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32)
- .addReg(WebAssembly::SP32)
- .addReg(OffsetReg);
- // Re-use OffsetReg to hold the address of the stacktop
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addExternalSymbol(SPSymbol);
- auto *MMO = new MachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
- .addImm(0)
- .addReg(OffsetReg)
- .addReg(WebAssembly::SP32)
- .addMemOperand(MMO);
+ writeSPToMemory(SPReg, MF, MBB, InsertAddr, InsertPt, DL);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index 5f4708fe77ed..e20fc5df7443 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -19,18 +19,24 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
+class MachineFrameInfo;
class WebAssemblyFrameLowering final : public TargetFrameLowering {
-public:
+ public:
+ /// Size of the red zone for the user stack (leaf functions can use this much
+ /// space below the stack pointer without writing it back to memory).
+ // TODO: (ABI) Revisit and decide how large it should be.
+ static const size_t RedZoneSize = 128;
+
WebAssemblyFrameLowering()
: TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16,
/*LocalAreaOffset=*/0,
/*TransientStackAlignment=*/16,
/*StackRealignable=*/true) {}
- void
- eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// These methods insert prolog and epilog code into the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
@@ -38,8 +44,13 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ private:
+ bool needsSP(const MachineFunction &MF, const MachineFrameInfo &MFI) const;
+ bool needsSPWriteback(const MachineFunction &MF,
+ const MachineFrameInfo &MFI) const;
};
-} // end namespace llvm
+} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 3a03fa55b220..2f0f106ef5b7 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -20,6 +20,6 @@ HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
HANDLE_NODETYPE(Wrapper)
HANDLE_NODETYPE(BR_IF)
-HANDLE_NODETYPE(TABLESWITCH)
+HANDLE_NODETYPE(BR_TABLE)
// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 8390f797c43e..88c38b3602b9 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -54,7 +54,7 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
- SDNode *Select(SDNode *Node) override;
+ void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
@@ -67,7 +67,7 @@ private:
};
} // end anonymous namespace
-SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
+void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected.
DEBUG(errs() << "Selecting: ");
DEBUG(Node->dump(CurDAG));
@@ -77,11 +77,10 @@ SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
 // A few cases require custom selection.
- SDNode *ResNode = nullptr;
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
@@ -92,16 +91,7 @@ SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
}
// Select the default instruction.
- ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
-
- return ResNode;
+ SelectCode(Node);
}
bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand(
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index e9933b092988..9e7731997d58 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -26,7 +26,6 @@
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -35,61 +34,6 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
-namespace {
-// Diagnostic information for unimplemented or unsupported feature reporting.
-// TODO: This code is copied from BPF and AMDGPU; consider factoring it out
-// and sharing code.
-class DiagnosticInfoUnsupported final : public DiagnosticInfo {
-private:
- // Debug location where this diagnostic is triggered.
- DebugLoc DLoc;
- const Twine &Description;
- const Function &Fn;
- SDValue Value;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc,
- SDValue Value)
- : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()),
- Description(Desc), Fn(Fn), Value(Value) {}
-
- void print(DiagnosticPrinter &DP) const override {
- std::string Str;
- raw_string_ostream OS(Str);
-
- if (DLoc) {
- auto DIL = DLoc.get();
- StringRef Filename = DIL->getFilename();
- unsigned Line = DIL->getLine();
- unsigned Column = DIL->getColumn();
- OS << Filename << ':' << Line << ':' << Column << ' ';
- }
-
- OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n'
- << Description;
- if (Value)
- Value->print(OS);
- OS << '\n';
- OS.flush();
- DP << Str;
- }
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-int DiagnosticInfoUnsupported::KindID = 0;
-} // end anonymous namespace
-
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -116,6 +60,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
setOperationAction(ISD::JumpTable, MVTPtr, Custom);
+ setOperationAction(ISD::BlockAddress, MVTPtr, Custom);
+ setOperationAction(ISD::BRIND, MVT::Other, Custom);
// Take the default expansion for va_arg, va_copy, and va_end. There is no
// default action for va_start, so we do that custom.
@@ -148,7 +94,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i32, MVT::i64}) {
// Expand unavailable integer operations.
for (auto Op :
- {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS,
ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC,
ISD::SUBE}) {
@@ -167,6 +113,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::CopyToReg, MVT::Other, Custom);
// Expand these forms; we pattern-match the forms that we can handle in isel.
for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
@@ -204,13 +151,14 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
EVT VT) const {
unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1);
- if (BitWidth > 1 && BitWidth < 8)
- BitWidth = 8;
+ if (BitWidth > 1 && BitWidth < 8) BitWidth = 8;
if (BitWidth > 64) {
- BitWidth = 64;
+ // The shift will be lowered to a libcall, and compiler-rt libcalls expect
+ // the count to be an i32.
+ BitWidth = 32;
assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) &&
- "64-bit shift counts ought to be enough for anyone");
+ "32-bit shift counts ought to be enough for anyone");
}
MVT Result = MVT::getIntegerVT(BitWidth);
@@ -219,13 +167,13 @@ MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
return Result;
}
-const char *
-WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const {
+const char *WebAssemblyTargetLowering::getTargetNodeName(
+ unsigned Opcode) const {
switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
- case WebAssemblyISD::FIRST_NUMBER:
- break;
-#define HANDLE_NODETYPE(NODE) \
- case WebAssemblyISD::NODE: \
+ case WebAssemblyISD::FIRST_NUMBER:
+ break;
+#define HANDLE_NODETYPE(NODE) \
+ case WebAssemblyISD::NODE: \
return "WebAssemblyISD::" #NODE;
#include "WebAssemblyISD.def"
#undef HANDLE_NODETYPE
@@ -240,17 +188,17 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
// WebAssembly register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
- case 'r':
- assert(VT != MVT::iPTR && "Pointer MVT not expected here");
- if (VT.isInteger() && !VT.isVector()) {
- if (VT.getSizeInBits() <= 32)
- return std::make_pair(0U, &WebAssembly::I32RegClass);
- if (VT.getSizeInBits() <= 64)
- return std::make_pair(0U, &WebAssembly::I64RegClass);
- }
- break;
- default:
- break;
+ case 'r':
+ assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+ if (VT.isInteger() && !VT.isVector()) {
+ if (VT.getSizeInBits() <= 32)
+ return std::make_pair(0U, &WebAssembly::I32RegClass);
+ if (VT.getSizeInBits() <= 64)
+ return std::make_pair(0U, &WebAssembly::I64RegClass);
+ }
+ break;
+ default:
+ break;
}
}
@@ -274,17 +222,33 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// WebAssembly offsets are added as unsigned without wrapping. The
// isLegalAddressingMode gives us no way to determine if wrapping could be
// happening, so we approximate this by accepting only non-negative offsets.
- if (AM.BaseOffs < 0)
- return false;
+ if (AM.BaseOffs < 0) return false;
// WebAssembly has no scale register operands.
- if (AM.Scale != 0)
- return false;
+ if (AM.Scale != 0) return false;
// Everything else is legal.
return true;
}
+bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const {
+ // WebAssembly supports unaligned accesses, though loads and stores that
+ // perform them should declare it via the p2align attribute, and there
+ // may be a performance impact. We tell LLVM they're "fast" because
+ // for the kinds of things that LLVM uses this for (merging adjacent stores
+ // of constants, etc.), WebAssembly implementations will either want the
+ // unaligned access or they'll split anyway.
+ if (Fast) *Fast = true;
+ return true;
+}
+
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // The current thinking is that wasm engines will perform this optimization,
+ // so we can save on code size.
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Lowering private implementation.
//===----------------------------------------------------------------------===//
@@ -293,10 +257,10 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Lowering Code
//===----------------------------------------------------------------------===//
-static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) {
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
- DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue()));
+ DiagnosticInfoUnsupported(*MF.getFunction(), msg, DL.getDebugLoc()));
}
// Test whether the given calling convention is supported.
@@ -312,14 +276,14 @@ static bool CallingConvSupported(CallingConv::ID CallConv) {
CallConv == CallingConv::CXX_FAST_TLS;
}
-SDValue
-WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue WebAssemblyTargetLowering::LowerCall(
+ CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
MachineFunction &MF = DAG.getMachineFunction();
+ auto Layout = MF.getDataLayout();
CallingConv::ID CallConv = CLI.CallConv;
if (!CallingConvSupported(CallConv))
@@ -337,16 +301,15 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
fail(DL, DAG, "WebAssembly doesn't support tail call yet");
CLI.IsTailCall = false;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
-
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
if (Ins.size() > 1)
fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- for (const ISD::OutputArg &Out : Outs) {
- if (Out.Flags.isByVal())
- fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ for (unsigned i = 0; i < Outs.size(); ++i) {
+ const ISD::OutputArg &Out = Outs[i];
+ SDValue &OutVal = OutVals[i];
if (Out.Flags.isNest())
fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
if (Out.Flags.isInAlloca())
@@ -355,28 +318,41 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
if (Out.Flags.isInConsecutiveRegsLast())
fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
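+ // Byval arguments are lowered by making a caller-side copy in a fresh
+ // stack object and passing a pointer to that copy instead of the value.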
+ if (Out.Flags.isByVal() && Out.Flags.getByValSize() != 0) {
+ auto *MFI = MF.getFrameInfo();
+ int FI = MFI->CreateStackObject(Out.Flags.getByValSize(),
+ Out.Flags.getByValAlign(),
+ /*isSS=*/false);
+ SDValue SizeNode =
+ DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32);
+ SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ Chain = DAG.getMemcpy(
+ Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/false,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+ OutVal = FINode;
+ }
}
bool IsVarArg = CLI.IsVarArg;
unsigned NumFixedArgs = CLI.NumFixedArgs;
- auto PtrVT = getPointerTy(MF.getDataLayout());
+
+ auto PtrVT = getPointerTy(Layout);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (IsVarArg) {
- // Outgoing non-fixed arguments are placed at the top of the stack. First
- // compute their offsets and the total amount of argument stack space
- // needed.
+ // Outgoing non-fixed arguments are placed in a buffer. First
+ // compute their offsets and the total amount of buffer space needed.
for (SDValue Arg :
make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
EVT VT = Arg.getValueType();
assert(VT != MVT::iPTR && "Legalized args should be concrete");
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
- unsigned Offset =
- CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty),
- MF.getDataLayout().getABITypeAlignment(Ty));
+ unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
+ Layout.getABITypeAlignment(Ty));
CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
Offset, VT.getSimpleVT(),
CCValAssign::Full));
@@ -385,17 +361,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
- SDValue NB;
- if (NumBytes) {
- NB = DAG.getConstant(NumBytes, DL, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, NB, DL);
- }
-
- if (IsVarArg) {
+ SDValue FINode;
+ if (IsVarArg && NumBytes) {
// For non-fixed arguments, next emit stores to store the argument values
- // to the stack at the offsets computed above.
- SDValue SP = DAG.getCopyFromReg(
- Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT);
+ // to the stack buffer at the offsets computed above.
+ int FI = MF.getFrameInfo()->CreateStackObject(NumBytes,
+ Layout.getStackAlignment(),
+ /*isSS=*/false);
unsigned ValNo = 0;
SmallVector<SDValue, 8> Chains;
for (SDValue Arg :
@@ -403,14 +375,17 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
assert(ArgLocs[ValNo].getValNo() == ValNo &&
"ArgLocs should remain in order and only hold varargs args");
unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
- SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP,
+ FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, FINode,
DAG.getConstant(Offset, DL, PtrVT));
- Chains.push_back(DAG.getStore(Chain, DL, Arg, Add,
- MachinePointerInfo::getStack(MF, Offset),
- false, false, 0));
+ Chains.push_back(DAG.getStore(
+ Chain, DL, Arg, Add,
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), 0));
}
if (!Chains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ } else if (IsVarArg) {
+ FINode = DAG.getIntPtrConstant(0, DL);
}
// Compute the operands for the CALLn node.
@@ -422,8 +397,10 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
// isn't reliable.
Ops.append(OutVals.begin(),
IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end());
+ // Add a pointer to the vararg buffer.
+ if (IsVarArg) Ops.push_back(FINode);
- SmallVector<EVT, 8> Tys;
+ SmallVector<EVT, 8> InTys;
for (const auto &In : Ins) {
assert(!In.Flags.isByVal() && "byval is not valid for return values");
assert(!In.Flags.isNest() && "nest is not valid for return values");
@@ -436,13 +413,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
"WebAssembly hasn't implemented cons regs last return values");
// Ignore In.getOrigAlign() because all our arguments are passed in
// registers.
- Tys.push_back(In.VT);
+ InTys.push_back(In.VT);
}
- Tys.push_back(MVT::Other);
- SDVTList TyList = DAG.getVTList(Tys);
+ InTys.push_back(MVT::Other);
+ SDVTList InTyList = DAG.getVTList(InTys);
SDValue Res =
DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
- DL, TyList, Ops);
+ DL, InTyList, Ops);
if (Ins.empty()) {
Chain = Res;
} else {
@@ -450,11 +427,6 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = Res.getValue(1);
}
- if (NumBytes) {
- SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT);
- Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL);
- }
-
return Chain;
}
@@ -469,7 +441,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
SDValue WebAssemblyTargetLowering::LowerReturn(
SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
if (!CallingConvSupported(CallConv))
@@ -496,10 +468,11 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
}
SDValue WebAssemblyTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
if (!CallingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
@@ -509,8 +482,6 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
for (const ISD::InputArg &In : Ins) {
- if (In.Flags.isByVal())
- fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
if (In.Flags.isInAlloca())
fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
if (In.Flags.isNest())
@@ -528,11 +499,22 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
: DAG.getUNDEF(In.VT));
// Record the number and types of arguments.
- MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT);
+ MFI->addParam(In.VT);
}
- // Incoming varargs arguments are on the stack and will be accessed through
- // va_arg, so we don't need to do anything for them here.
+ // Varargs are copied into a buffer allocated by the caller, and a pointer to
+ // the buffer is passed as an argument.
+ if (IsVarArg) {
+ MVT PtrVT = getPointerTy(MF.getDataLayout());
+ unsigned VarargVreg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
+ MFI->setVarargBufferVreg(VarargVreg);
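+ // The buffer pointer arrives as one extra ARGUMENT after the fixed
+ // arguments (index Ins.size()), and is recorded as an extra param below.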
+ Chain = DAG.getCopyToReg(
+ Chain, DL, VarargVreg,
+ DAG.getNode(WebAssemblyISD::ARGUMENT, DL, PtrVT,
+ DAG.getTargetConstant(Ins.size(), DL, MVT::i32)));
+ MFI->addParam(PtrVT);
+ }
return Chain;
}
@@ -543,31 +525,85 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc DL(Op);
switch (Op.getOpcode()) {
- default:
- llvm_unreachable("unimplemented operation lowering");
- return SDValue();
- case ISD::FrameIndex:
- return LowerFrameIndex(Op, DAG);
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- case ISD::ExternalSymbol:
- return LowerExternalSymbol(Op, DAG);
- case ISD::JumpTable:
- return LowerJumpTable(Op, DAG);
- case ISD::BR_JT:
- return LowerBR_JT(Op, DAG);
- case ISD::VASTART:
- return LowerVASTART(Op, DAG);
+ default:
+ llvm_unreachable("unimplemented operation lowering");
+ return SDValue();
+ case ISD::FrameIndex:
+ return LowerFrameIndex(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::BR_JT:
+ return LowerBR_JT(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::BlockAddress:
+ case ISD::BRIND:
+ fail(DL, DAG, "WebAssembly hasn't implemented computed gotos");
+ return SDValue();
+ case ISD::RETURNADDR: // Probably nothing meaningful can be returned here.
+ fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address");
+ return SDValue();
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::CopyToReg:
+ return LowerCopyToReg(Op, DAG);
}
}
+SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(2);
+ if (isa<FrameIndexSDNode>(Src.getNode())) {
+ // CopyToReg nodes don't support FrameIndex operands. Other targets select
+ // the FI to some LEA-like instruction, but since we don't have that, we
+ // need to insert some kind of instruction that can take an FI operand and
+ // produces a value usable by CopyToReg (i.e. in a vreg). So insert a dummy
+ // copy_local between Op and its FI operand.
+ SDValue Chain = Op.getOperand(0);
+ SDLoc DL(Op);
+ unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+ EVT VT = Src.getValueType();
+ SDValue Copy(
+ DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_LOCAL_I32
+ : WebAssembly::COPY_LOCAL_I64,
+ DL, VT, Src),
+ 0);
+ return Op.getNode()->getNumValues() == 1
+ ? DAG.getCopyToReg(Chain, DL, Reg, Copy)
+ : DAG.getCopyToReg(Chain, DL, Reg, Copy, Op.getNumOperands() == 4
+ ? Op.getOperand(3)
+ : SDValue());
+ }
+ return SDValue();
+}
+
SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
int FI = cast<FrameIndexSDNode>(Op)->getIndex();
return DAG.getTargetFrameIndex(FI, Op.getValueType());
}
+SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Non-zero depths are not supported by WebAssembly currently. Use the
+ // legalizer's default expansion, which is to return 0 (what this function is
+ // documented to do).
+ if (Op.getConstantOperandVal(0) > 0)
+ return SDValue();
+
+ DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true);
+ EVT VT = Op.getValueType();
+ unsigned FP =
+ Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT);
+}
+
SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -582,9 +618,8 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
}
-SDValue
-WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue WebAssemblyTargetLowering::LowerExternalSymbol(
+ SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
const auto *ES = cast<ExternalSymbolSDNode>(Op);
EVT VT = Op.getValueType();
@@ -603,7 +638,7 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// There's no need for a Wrapper node because we always incorporate a jump
- // table operand into a TABLESWITCH instruction, rather than ever
+ // table operand into a BR_TABLE instruction, rather than ever
// materializing it in a register.
const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(),
@@ -625,16 +660,15 @@ SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs;
+ // Add an operand for each case.
+ for (auto MBB : MBBs) Ops.push_back(DAG.getBasicBlock(MBB));
+
 // TODO: For now, we just pick something arbitrary for a default case.
// We really want to sniff out the guard and put in the real default case (and
// delete the guard).
Ops.push_back(DAG.getBasicBlock(MBBs[0]));
- // Add an operand for each case.
- for (auto MBB : MBBs)
- Ops.push_back(DAG.getBasicBlock(MBB));
-
- return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops);
+ return DAG.getNode(WebAssemblyISD::BR_TABLE, DL, MVT::Other, Ops);
}
SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
@@ -642,16 +676,13 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
SDLoc DL(Op);
EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout());
- // The incoming non-fixed arguments are placed on the top of the stack, with
- // natural alignment, at the point of the call, so the base pointer is just
- // the current frame pointer.
- DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true);
- unsigned FP =
- Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
- SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT);
+ auto *MFI = DAG.getMachineFunction().getInfo<WebAssemblyFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+
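+ // va_list is just a pointer to the caller-allocated vararg buffer; store
+ // the buffer address (stashed in the vararg buffer vreg) into it.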
+ SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+ MFI->getVarargBufferVreg(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1),
+ MachinePointerInfo(SV), 0);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index e7232a042e12..5bc723028e63 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -29,17 +29,17 @@ enum NodeType : unsigned {
#undef HANDLE_NODETYPE
};
-} // end namespace WebAssemblyISD
+} // end namespace WebAssemblyISD
class WebAssemblySubtarget;
class WebAssemblyTargetMachine;
class WebAssemblyTargetLowering final : public TargetLowering {
-public:
+ public:
WebAssemblyTargetLowering(const TargetMachine &TM,
const WebAssemblySubtarget &STI);
-private:
+ private:
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
@@ -49,13 +49,16 @@ private:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
- std::pair<unsigned, const TargetRegisterClass *>
- getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint, MVT VT) const override;
+ std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint,
+ MVT VT) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+ bool *Fast) const override;
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -65,29 +68,31 @@ private:
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
SelectionDAG &DAG) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
// Custom lowering hooks.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
};
namespace WebAssembly {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
-} // end namespace WebAssembly
+} // end namespace WebAssembly
-} // end namespace llvm
+} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index fda95953db81..444e275c6ebf 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -16,12 +16,12 @@ let Defs = [ARGUMENTS] in {
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
// The condition operand is a boolean value which WebAssembly represents as i32.
-def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst),
+def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
[(brcond I32:$cond, bb:$dst)],
- "br_if \t$cond, $dst">;
+ "br_if \t$dst, $cond">;
let isCodeGenOnly = 1 in
-def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [],
- "br_unless\t$cond, $dst">;
+def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), [],
+ "br_unless\t$dst, $cond">;
let isBarrier = 1 in {
def BR : I<(outs), (ins bb_op:$dst),
[(br bb:$dst)],
@@ -32,27 +32,27 @@ def BR : I<(outs), (ins bb_op:$dst),
} // Defs = [ARGUMENTS]
def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
- (BR_IF I32:$cond, bb_op:$dst)>;
+ (BR_IF bb_op:$dst, I32:$cond)>;
def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
- (BR_UNLESS I32:$cond, bb_op:$dst)>;
+ (BR_UNLESS bb_op:$dst, I32:$cond)>;
let Defs = [ARGUMENTS] in {
// TODO: SelectionDAG's lowering insists on using a pointer as the index for
-// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
// currently.
// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops),
- [(WebAssemblytableswitch I32:$index, bb:$default)],
- "tableswitch\t$index, $default"> {
+def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
+ [(WebAssemblybr_table I32:$index)],
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
-def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops),
- [(WebAssemblytableswitch I64:$index, bb:$default)],
- "tableswitch\t$index, $default"> {
+def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
+ [(WebAssemblybr_table I64:$index)],
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
@@ -71,6 +71,10 @@ def END_LOOP : I<(outs), (ins), [], "end_loop">;
multiclass RETURN<WebAssemblyRegClass vt> {
def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
"return \t$val">;
+ // Equivalent to RETURN_#vt, for use at the end of a function when the wasm
+ // semantics are to return by falling off the end of the block.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
@@ -80,6 +84,10 @@ let isReturn = 1 in {
defm : RETURN<F32>;
defm : RETURN<F64>;
def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">;
+
+ // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
} // isReturn = 1
def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 5520c6de6732..64569720375c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -77,12 +77,12 @@ def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs),
+def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
[(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
- "f32.select\t$dst, $cond, $lhs, $rhs">;
-def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs),
+ "f32.select\t$dst, $lhs, $rhs, $cond">;
+def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
[(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
- "f64.select\t$dst, $cond, $lhs, $rhs">;
+ "f64.select\t$dst, $lhs, $rhs, $cond">;
} // Defs = [ARGUMENTS]
@@ -90,12 +90,12 @@ def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs),
// WebAssembly's select interprets any non-zero value as true, so we can fold
// a setne with 0 into a select.
def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs),
- (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>;
+ (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs),
- (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>;
+ (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
// And again, this time with seteq instead of setne and the arms reversed.
def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
- (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>;
+ (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
- (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>;
+ (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 028e9af0834f..2fd3eab99d78 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -15,6 +15,7 @@
#include "WebAssemblyInstrInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,16 +33,32 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
WebAssembly::ADJCALLSTACKUP),
RI(STI.getTargetTriple()) {}
+bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI, AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CONST_I32:
+ case WebAssembly::CONST_I64:
+ case WebAssembly::CONST_F32:
+ case WebAssembly::CONST_F64:
+ // isReallyTriviallyReMaterializableGeneric misses these because of the
+ // ARGUMENTS implicit def, so we manually override it here.
+ return true;
+ default:
+ return false;
+ }
+}
+
void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg,
+ const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// This method is called by post-RA expansion, which expects only pregs to
// exist. However we need to handle both here.
auto &MRI = MBB.getParent()->getRegInfo();
- const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg);
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(DestReg)
+ ? MRI.getRegClass(DestReg)
+ : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
unsigned CopyLocalOpcode;
if (RC == &WebAssembly::I32RegClass)
@@ -59,8 +76,23 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, KillSrc ? RegState::Kill : 0);
}
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // If the operands are stackified, we can't reorder them.
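+ // (Stackified operands are consumed directly off the wasm value stack, so
+ // their order is significant.)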
+ WebAssemblyFunctionInfo &MFI =
+ *MI.getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+ if (MFI.isVRegStackified(MI.getOperand(OpIdx1).getReg()) ||
+ MFI.isVRegStackified(MI.getOperand(OpIdx2).getReg()))
+ return nullptr;
+
+ // Otherwise use the default implementation.
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
// Branch analysis.
-bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -75,22 +107,22 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (HaveCond)
return true;
// If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(1).isMBB())
+ if (!MI.getOperand(0).isMBB())
return true;
Cond.push_back(MachineOperand::CreateImm(true));
- Cond.push_back(MI.getOperand(0));
- TBB = MI.getOperand(1).getMBB();
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
HaveCond = true;
break;
case WebAssembly::BR_UNLESS:
if (HaveCond)
return true;
// If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(1).isMBB())
+ if (!MI.getOperand(0).isMBB())
return true;
Cond.push_back(MachineOperand::CreateImm(false));
- Cond.push_back(MI.getOperand(0));
- TBB = MI.getOperand(1).getMBB();
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
HaveCond = true;
break;
case WebAssembly::BR:
@@ -133,7 +165,7 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+ const DebugLoc &DL) const {
if (Cond.empty()) {
if (!TBB)
return 0;
@@ -145,13 +177,11 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB,
assert(Cond.size() == 2 && "Expected a flag and a successor block");
if (Cond[0].getImm()) {
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF))
- .addOperand(Cond[1])
- .addMBB(TBB);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
} else {
BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
- .addOperand(Cond[1])
- .addMBB(TBB);
+ .addMBB(TBB)
+ .addOperand(Cond[1]);
}
if (!FBB)
return 1;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index 5ddd9b36f243..d93f958ca4cd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -34,18 +34,24 @@ public:
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
+
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 2e682a475471..4b319871cf16 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -30,7 +30,7 @@ def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
-def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
@@ -52,9 +52,9 @@ def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
SDT_WebAssemblyCall1,
[SDNPHasChain, SDNPVariadic]>;
-def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH",
- SDT_WebAssemblyTableswitch,
- [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
+ SDT_WebAssemblyBrTable,
+ [SDNPHasChain, SDNPVariadic]>;
def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
SDT_WebAssemblyArgument>;
def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
@@ -71,10 +71,17 @@ let OperandNamespace = "WebAssembly" in {
let OperandType = "OPERAND_BASIC_BLOCK" in
def bb_op : Operand<OtherVT>;
-let OperandType = "OPERAND_FPIMM" in {
+let OperandType = "OPERAND_FP32IMM" in
def f32imm_op : Operand<f32>;
+
+let OperandType = "OPERAND_FP64IMM" in
def f64imm_op : Operand<f64>;
-} // OperandType = "OPERAND_FPIMM"
+
+let OperandType = "OPERAND_P2ALIGN" in {
+def P2Align : Operand<i32> {
+ let PrintMethod = "printWebAssemblyP2AlignOperand";
+}
+} // OperandType = "OPERAND_P2ALIGN"
} // OperandNamespace = "WebAssembly"
@@ -101,15 +108,9 @@ defm : ARGUMENT<F64>;
let Defs = [ARGUMENTS] in {
// get_local and set_local are not generated by instruction selection; they
-// are implied by virtual register uses and defs in most contexts. However,
-// they are explicitly emitted for special purposes.
+// are implied by virtual register uses and defs.
multiclass LOCAL<WebAssemblyRegClass vt> {
- def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [],
- "get_local\t$res, $regno">;
- // TODO: set_local returns its operand value
- def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [],
- "set_local\t$regno, $src">;
-
+let hasSideEffects = 0 in {
// COPY_LOCAL is not an actual instruction in wasm, but since we allow
// get_local and set_local to be implicit, we can have a COPY_LOCAL which
// is actually a no-op because all the work is done in the implied
@@ -117,13 +118,21 @@ multiclass LOCAL<WebAssemblyRegClass vt> {
let isAsCheapAsAMove = 1 in
def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [],
"copy_local\t$res, $src">;
+
+ // TEE_LOCAL is similar to COPY_LOCAL, but writes two copies of its result.
+ // Typically this would be used to stackify one result and write the other
+ // result to a local.
+ let isAsCheapAsAMove = 1 in
+ def TEE_LOCAL_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+ "tee_local\t$res, $also, $src">;
+} // hasSideEffects = 0
}
defm : LOCAL<I32>;
defm : LOCAL<I64>;
defm : LOCAL<F32>;
defm : LOCAL<F64>;
-let isMoveImm = 1 in {
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm),
[(set I32:$res, imm:$imm)],
"i32.const\t$res, $imm">;
@@ -136,7 +145,7 @@ def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
[(set F64:$res, fpimm:$imm)],
"f64.const\t$res, $imm">;
-} // isMoveImm = 1
+} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 09e5eafb85e9..7eaa57bb217e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -36,6 +36,8 @@ defm XOR : BinaryInt<xor, "xor ">;
defm SHL : BinaryInt<shl, "shl ">;
defm SHR_U : BinaryInt<srl, "shr_u">;
defm SHR_S : BinaryInt<sra, "shr_s">;
+defm ROTL : BinaryInt<rotl, "rotl">;
+defm ROTR : BinaryInt<rotr, "rotr">;
let isCommutable = 1 in {
defm EQ : ComparisonInt<SETEQ, "eq ">;
@@ -54,22 +56,29 @@ defm CLZ : UnaryInt<ctlz, "clz ">;
defm CTZ : UnaryInt<cttz, "ctz ">;
defm POPCNT : UnaryInt<ctpop, "popcnt">;
+def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
+ [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+ "i32.eqz \t$dst, $src">;
+def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
+ [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+ "i64.eqz \t$dst, $src">;
+
} // Defs = [ARGUMENTS]
-// Expand the "don't care" operations to supported operations.
-def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>;
-def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>;
-def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>;
-def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>;
+// Optimize away an explicit mask on a rotate count.
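+// (The rotate count is interpreted modulo the bit width, so the 'and' is a
+// no-op.)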
+def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
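
These patterns are sound because a 32-bit (resp. 64-bit) rotate only depends on the count modulo 32 (resp. 64), so masking the count with 31 (resp. 63) first changes nothing. A small C++ sketch of the 32-bit case (illustrative only, not part of the patch):

    #include <cstdint>
    // Like wasm's i32.rotl, this rotate uses the count modulo 32, so
    // rotl32(x, n & 31) == rotl32(x, n) for any n; that is why the explicit
    // (and $rhs, 31) in the source pattern can simply be dropped.
    static uint32_t rotl32(uint32_t x, uint32_t n) {
      n &= 31;
      return n == 0 ? x : (x << n) | (x >> (32 - n));
    }
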
let Defs = [ARGUMENTS] in {
-def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs),
+def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
[(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
- "i32.select\t$dst, $cond, $lhs, $rhs">;
-def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs),
+ "i32.select\t$dst, $lhs, $rhs, $cond">;
+def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
[(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
- "i64.select\t$dst, $cond, $lhs, $rhs">;
+ "i64.select\t$dst, $lhs, $rhs, $cond">;
} // Defs = [ARGUMENTS]
@@ -77,12 +86,12 @@ def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs),
// WebAssembly's select interprets any non-zero value as true, so we can fold
// a setne with 0 into a select.
def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs),
- (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>;
+ (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs),
- (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>;
+ (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
// And again, this time with seteq instead of setne and the arms reversed.
def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
- (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>;
+ (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
- (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>;
+ (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
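
Put differently: because any non-zero condition counts as true, comparing the condition against zero before the select is redundant, and the raw condition value can feed the select directly (with the arms swapped for the seteq form). A minimal C++ model using the new lhs, rhs, cond operand order (illustrative only, not part of the patch):

    #include <cstdint>
    // Model of wasm select with the (lhs, rhs, cond) operand order used above:
    // a non-zero condition picks lhs, zero picks rhs.
    static int32_t wasmSelect(int32_t lhs, int32_t rhs, int32_t cond) {
      return cond != 0 ? lhs : rhs;
    }
    // select(x != 0, a, b)  ==  wasmSelect(a, b, x)
    // select(x == 0, a, b)  ==  wasmSelect(b, a, x)   // arms reversed
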
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index b39ac5212f87..521c664ca4ab 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -28,6 +28,18 @@ def regPlusImm : PatFrag<(ops node:$addr, node:$off),
(add node:$addr, node:$off),
[{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ APInt KnownZero0, KnownOne0;
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
// GlobalAddresses are conceptually unsigned values, so we can also fold them
// into immediate values as long as their offsets are non-negative.
def regPlusGA : PatFrag<(ops node:$addr, node:$off),
@@ -46,325 +58,392 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
let Defs = [ARGUMENTS] in {
// Basic load.
-def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load\t$dst, ${off}(${addr})">;
-def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load\t$dst, ${off}(${addr})">;
-def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [],
- "f32.load\t$dst, ${off}(${addr})">;
-def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [],
- "f64.load\t$dst, ${off}(${addr})">;
+def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "f32.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "f64.load\t$dst, ${off}(${addr})${p2align}">;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
-def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>;
-def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>;
-def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>;
-def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>;
+def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr, 0)>;
+def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr, 0)>;
+def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr, 0)>;
+def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr, 0)>;
// Select loads with a constant offset.
def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_I32 imm:$off, $addr)>;
+ (LOAD_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_I64 imm:$off, $addr)>;
+ (LOAD_I64 imm:$off, $addr, 0)>;
def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_F32 imm:$off, $addr)>;
+ (LOAD_F32 imm:$off, $addr, 0)>;
def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_F64 imm:$off, $addr)>;
+ (LOAD_F64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I64 imm:$off, $addr, 0)>;
+def : Pat<(f32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F32 imm:$off, $addr, 0)>;
+def : Pat<(f64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F64 imm:$off, $addr, 0)>;
def : Pat<(i32 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_I32 tglobaladdr:$off, $addr)>;
+ (LOAD_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_I64 tglobaladdr:$off, $addr)>;
+ (LOAD_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(f32 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_F32 tglobaladdr:$off, $addr)>;
+ (LOAD_F32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(f64 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_F64 tglobaladdr:$off, $addr)>;
+ (LOAD_F64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_I32 texternalsym:$off, $addr)>;
+ (LOAD_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_I64 texternalsym:$off, $addr)>;
+ (LOAD_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_F32 texternalsym:$off, $addr)>;
+ (LOAD_F32 texternalsym:$off, $addr, 0)>;
def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_F64 texternalsym:$off, $addr)>;
+ (LOAD_F64 texternalsym:$off, $addr, 0)>;
// Select loads with just a constant offset.
-def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>;
-def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_F32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_F64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_F32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_F32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_F64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_F64 texternalsym:$off, (CONST_I32 0), 0)>;
let Defs = [ARGUMENTS] in {
// Extending load.
-def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load8_s\t$dst, ${off}(${addr})">;
-def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load8_u\t$dst, ${off}(${addr})">;
-def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load16_s\t$dst, ${off}(${addr})">;
-def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load16_u\t$dst, ${off}(${addr})">;
-def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load8_s\t$dst, ${off}(${addr})">;
-def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load8_u\t$dst, ${off}(${addr})">;
-def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load16_s\t$dst, ${off}(${addr})">;
-def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load16_u\t$dst, ${off}(${addr})">;
-def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load32_s\t$dst, ${off}(${addr})">;
-def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load32_u\t$dst, ${off}(${addr})">;
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load8_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load8_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load16_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load16_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load8_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load8_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load16_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load16_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load32_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load32_u\t$dst, ${off}(${addr})${p2align}">;
} // Defs = [ARGUMENTS]
// Select extending loads with no constant offset.
-def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>;
-def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
-def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>;
-def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
-def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
-def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
-def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr, 0)>;
+def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr, 0)>;
+def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr, 0)>;
+def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr, 0)>;
// Select extending loads with a constant offset.
def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_S_I32 imm:$off, $addr)>;
+ (LOAD8_S_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I32 imm:$off, $addr)>;
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_S_I32 imm:$off, $addr)>;
+ (LOAD16_S_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I32 imm:$off, $addr)>;
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_S_I64 imm:$off, $addr)>;
+ (LOAD8_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I64 imm:$off, $addr)>;
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_S_I64 imm:$off, $addr)>;
+ (LOAD16_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I64 imm:$off, $addr)>;
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_S_I64 imm:$off, $addr)>;
+ (LOAD32_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_U_I64 imm:$off, $addr)>;
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_S_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_S_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_S_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_S_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_S_I32 texternalsym:$off, $addr)>;
+ (LOAD8_S_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I32 texternalsym:$off, $addr)>;
+ (LOAD8_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_S_I32 texternalsym:$off, $addr)>;
+ (LOAD16_S_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I32 texternalsym:$off, $addr)>;
+ (LOAD16_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_S_I64 texternalsym:$off, $addr)>;
+ (LOAD8_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I64 texternalsym:$off, $addr)>;
+ (LOAD8_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_S_I64 texternalsym:$off, $addr)>;
+ (LOAD16_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I64 texternalsym:$off, $addr)>;
+ (LOAD16_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_S_I64 texternalsym:$off, $addr)>;
+ (LOAD32_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_U_I64 texternalsym:$off, $addr)>;
+ (LOAD32_U_I64 texternalsym:$off, $addr, 0)>;
// Select extending loads with just a constant offset.
-def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 imm:$off)),
+ (LOAD8_S_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (zextloadi8 imm:$off)),
+ (LOAD8_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (sextloadi16 imm:$off)),
+ (LOAD16_S_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (zextloadi16 imm:$off)),
+ (LOAD16_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi8 imm:$off)),
+ (LOAD8_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi8 imm:$off)),
+ (LOAD8_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi16 imm:$off)),
+ (LOAD16_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi16 imm:$off)),
+ (LOAD16_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi32 imm:$off)),
+ (LOAD32_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi32 imm:$off)),
+ (LOAD32_U_I64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
// Resolve "don't care" extending loads to zero-extending loads. This is
// somewhat arbitrary, but zero-extending is conceptually simpler.
// Select "don't care" extending loads with no constant offset.
-def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
-def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
-def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
-def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
-def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr, 0)>;
+def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr, 0)>;
+def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr, 0)>;
// Select "don't care" extending loads with a constant offset.
def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I32 imm:$off, $addr)>;
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I32 imm:$off, $addr)>;
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I64 imm:$off, $addr)>;
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I64 imm:$off, $addr)>;
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_U_I64 imm:$off, $addr)>;
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (extloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I32 texternalsym:$off, $addr)>;
+ (LOAD8_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I32 texternalsym:$off, $addr)>;
+ (LOAD16_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I64 texternalsym:$off, $addr)>;
+ (LOAD8_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I64 texternalsym:$off, $addr)>;
+ (LOAD16_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_U_I64 texternalsym:$off, $addr)>;
+ (LOAD32_U_I64 texternalsym:$off, $addr, 0)>;
// Select "don't care" extending loads with just a constant offset.
-def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 imm:$off)),
+ (LOAD8_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (extloadi16 imm:$off)),
+ (LOAD16_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi8 imm:$off)),
+ (LOAD8_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi16 imm:$off)),
+ (LOAD16_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi32 imm:$off)),
+ (LOAD32_U_I64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
let Defs = [ARGUMENTS] in {
@@ -374,205 +453,232 @@ let Defs = [ARGUMENTS] in {
// instruction definition patterns that don't reference all of the output
// operands.
// Note: WebAssembly inverts SelectionDAG's usual operand order.
-def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store\t$dst, ${off}(${addr}), $val">;
-def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store\t$dst, ${off}(${addr}), $val">;
-def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [],
- "f32.store\t$dst, ${off}(${addr}), $val">;
-def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [],
- "f64.store\t$dst, ${off}(${addr}), $val">;
+def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, F32:$val), [],
+ "f32.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, F64:$val), [],
+ "f64.store\t$dst, ${off}(${addr})${p2align}, $val">;
} // Defs = [ARGUMENTS]
// Select stores with no constant offset.
-def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, 0, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, 0, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, 0, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, 0, F64:$val)>;
// Select stores with a constant offset.
def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F32 imm:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 imm:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F64 imm:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 imm:$off, I32:$addr, 0, F64:$val)>;
+def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F32 imm:$off, I32:$addr, 0, F32:$val)>;
+def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F64 imm:$off, I32:$addr, 0, F64:$val)>;
def : Pat<(store I32:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 tglobaladdr:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 tglobaladdr:$off, I32:$addr, 0, F64:$val)>;
def : Pat<(store I32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 texternalsym:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 texternalsym:$off, I32:$addr, 0, F64:$val)>;
// Select stores with just a constant offset.
def : Pat<(store I32:$val, imm:$off),
- (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, imm:$off),
- (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, imm:$off),
- (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 imm:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, imm:$off),
- (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 imm:$off, (CONST_I32 0), 0, F64:$val)>;
def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 tglobaladdr:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 tglobaladdr:$off, (CONST_I32 0), 0, F64:$val)>;
def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 texternalsym:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 texternalsym:$off, (CONST_I32 0), 0, F64:$val)>;
let Defs = [ARGUMENTS] in {
// Truncating store.
-def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store8\t$dst, ${off}(${addr}), $val">;
-def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store16\t$dst, ${off}(${addr}), $val">;
-def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store8\t$dst, ${off}(${addr}), $val">;
-def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store16\t$dst, ${off}(${addr}), $val">;
-def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store32\t$dst, ${off}(${addr}), $val">;
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store8\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store16\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store8\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store16\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store32\t$dst, ${off}(${addr})${p2align}, $val">;
} // Defs = [ARGUMENTS]
// Select truncating stores with no constant offset.
def : Pat<(truncstorei8 I32:$val, I32:$addr),
- (STORE8_I32 0, I32:$addr, I32:$val)>;
+ (STORE8_I32 0, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, I32:$addr),
- (STORE16_I32 0, I32:$addr, I32:$val)>;
+ (STORE16_I32 0, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, I32:$addr),
- (STORE8_I64 0, I32:$addr, I64:$val)>;
+ (STORE8_I64 0, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, I32:$addr),
- (STORE16_I64 0, I32:$addr, I64:$val)>;
+ (STORE16_I64 0, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, I32:$addr),
- (STORE32_I64 0, I32:$addr, I64:$val)>;
+ (STORE32_I64 0, I32:$addr, 0, I64:$val)>;
// Select truncating stores with a constant offset.
def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE32_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE32_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
// Select truncating stores with just a constant offset.
def : Pat<(truncstorei8 I32:$val, imm:$off),
- (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, imm:$off),
- (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, imm:$off),
- (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, imm:$off),
- (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, imm:$off),
- (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
let Defs = [ARGUMENTS] in {
-// Memory size.
-def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins),
- [(set I32:$dst, (int_wasm_memory_size))],
- "memory_size\t$dst">,
- Requires<[HasAddr32]>;
-def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins),
- [(set I64:$dst, (int_wasm_memory_size))],
- "memory_size\t$dst">,
- Requires<[HasAddr64]>;
+// Current memory size.
+def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins),
+ [(set I32:$dst, (int_wasm_current_memory))],
+ "current_memory\t$dst">,
+ Requires<[HasAddr32]>;
+def CURRENT_MEMORY_I64 : I<(outs I64:$dst), (ins),
+ [(set I64:$dst, (int_wasm_current_memory))],
+ "current_memory\t$dst">,
+ Requires<[HasAddr64]>;
// Grow memory.
def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index b009a4e054cc..af53f3db967b 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -16,9 +16,9 @@
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/Debug.h"
@@ -61,12 +61,12 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
auto &MRI = MF.getRegInfo();
for (auto &MBB : MF) {
- for (auto MII = MBB.begin(); MII != MBB.end(); ) {
+ for (auto MII = MBB.begin(); MII != MBB.end();) {
MachineInstr *MI = &*MII++;
if (MI->getOpcode() != WebAssembly::BR_UNLESS)
continue;
- unsigned Cond = MI->getOperand(0).getReg();
+ unsigned Cond = MI->getOperand(1).getReg();
bool Inverted = false;
// Attempt to invert the condition in place.
@@ -74,7 +74,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
assert(MRI.hasOneDef(Cond));
MachineInstr *Def = MRI.getVRegDef(Cond);
switch (Def->getOpcode()) {
- using namespace WebAssembly;
+ using namespace WebAssembly;
case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
@@ -106,15 +106,10 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// If we weren't able to invert the condition in place, insert an
// expression to invert it.
if (!Inverted) {
- unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- MFI.stackifyVReg(ZeroReg);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg)
- .addImm(0);
unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MFI.stackifyVReg(Tmp);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp)
- .addReg(Cond)
- .addReg(ZeroReg);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
+ .addReg(Cond);
Cond = Tmp;
Inverted = true;
}
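
The replaced sequence materialized a zero constant and compared against it; i32.eqz computes the same predicate in one instruction, since under the non-zero-is-true convention eqz is exactly logical negation. A small C++ sketch of the equivalence (illustrative only, not part of the patch):

    #include <cstdint>
    // Old form: CONST_I32 0 followed by EQ_I32 against that constant.
    static int32_t invertViaConstAndEq(int32_t cond) {
      const int32_t zero = 0;
      return cond == zero ? 1 : 0;
    }
    // New form: a single EQZ_I32 -- same result for every input.
    static int32_t invertViaEqz(int32_t cond) {
      return cond == 0 ? 1 : 0;
    }
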
@@ -123,8 +118,8 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// delete the br_unless.
assert(Inverted);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
- .addReg(Cond)
- .addOperand(MI->getOperand(1));
+ .addOperand(MI->getOperand(0))
+ .addReg(Cond);
MBB.erase(MI);
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 6a60280900a9..89f607d84b71 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -39,18 +39,24 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
/// - defined and used in LIFO order with other stack registers
BitVector VRegStackified;
- // One entry for each possible target reg. we expect it to be small.
- std::vector<unsigned> PhysRegs;
+ // A virtual register holding the pointer to the vararg buffer for vararg
+ // functions. It is created and set in TLI::LowerFormalArguments and read by
+ // TLI::LowerVASTART
+ unsigned VarargVreg = -1U;
-public:
- explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
- PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
- }
+ public:
+ explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
~WebAssemblyFunctionInfo() override;
void addParam(MVT VT) { Params.push_back(VT); }
const std::vector<MVT> &getParams() const { return Params; }
+ unsigned getVarargBufferVreg() const {
+ assert(VarargVreg != -1U && "Vararg vreg hasn't been set");
+ return VarargVreg;
+ }
+ void setVarargBufferVreg(unsigned Reg) { VarargVreg = Reg; }
+
static const unsigned UnusedReg = -1u;
void stackifyVReg(unsigned VReg) {
@@ -71,25 +77,15 @@ public:
WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
}
unsigned getWAReg(unsigned Reg) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
- return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
- }
- return PhysRegs[Reg];
- }
- // If new virtual registers are created after initWARegs has been called,
- // this function can be used to add WebAssembly register mappings for them.
- void addWAReg(unsigned VReg, unsigned WAReg) {
- assert(VReg = WARegs.size());
- WARegs.push_back(WAReg);
+ assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+ return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
}
- void addPReg(unsigned PReg, unsigned WAReg) {
- assert(PReg < WebAssembly::NUM_TARGET_REGS);
- assert(WAReg < -1U);
- PhysRegs[PReg] = WAReg;
+ // For a given stackified WAReg, return the id number to print with push/pop.
+ static unsigned getWARegStackId(unsigned Reg) {
+ assert(Reg & INT32_MIN);
+ return Reg & INT32_MAX;
}
- const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
};
} // end namespace llvm
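
The new getWARegStackId implies an encoding in which a stackified WebAssembly register is tagged by its sign bit and the low 31 bits carry the id printed with push/pop. A standalone C++ sketch of that encoding (the encode helper is hypothetical; only the decode mirrors the function above):

    #include <cassert>
    #include <cstdint>
    // Hypothetical encode helper paired with the decode shown in the patch:
    // stackified WARegs carry the sign bit, the low 31 bits are the id.
    static unsigned makeStackifiedWAReg(unsigned Id) {
      assert(Id <= 0x7fffffffu && "id must fit in 31 bits");
      return Id | 0x80000000u;
    }
    static unsigned stackIdOfWAReg(unsigned Reg) {
      assert((Reg & 0x80000000u) && "expected a stackified WAReg");
      return Reg & 0x7fffffffu;
    }
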
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
new file mode 100644
index 000000000000..473de7ddae7e
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -0,0 +1,105 @@
+//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize LiveIntervals for use in a post-RA context.
+//
+/// LiveIntervals normally runs before register allocation when the code is
+/// only recently lowered out of SSA form, so it's uncommon for registers to
+/// have multiple defs, and when they do, the defs are usually closely related.
+/// Later, after coalescing, tail duplication, and other optimizations, it's
+/// more common to see registers with multiple unrelated defs. This pass
+/// updates LiveIntervalAnalysis to distribute the value numbers across separate
+/// LiveIntervals.
+///
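
The PatFrag above lets an 'or' match where an 'add' is expected, relying on the identity that when two values have no set bits in common, or and add agree because no carries can occur; MaskedValueIsZero / computeKnownBits establish exactly that disjointness. A small C++ check of the identity (illustrative only, not part of the patch):

    #include <cstdint>
    // When the set bits of a and b are disjoint, a | b == a + b: each result
    // bit comes from exactly one operand, so the addition generates no carries.
    static bool orMatchesAdd(uint32_t a, uint32_t b) {
      if ((a & b) != 0)
        return false;             // overlapping bits: the identity is not claimed
      return (a | b) == (a + b);  // always true for disjoint bit patterns
    }
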
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-live-intervals"
+
+namespace {
+class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Optimize Live Intervals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyOptimizeLiveIntervals() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyOptimizeLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
+ return new WebAssemblyOptimizeLiveIntervals();
+}
+
+bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ assert(MRI.tracksLiveness() &&
+ "OptimizeLiveIntervals expects liveness");
+
+ // Split multiple-VN LiveIntervals into multiple LiveIntervals.
+ SmallVector<LiveInterval*, 4> SplitLIs;
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+
+ LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+ SplitLIs.clear();
+ }
+
+ // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF
+ // instructions to satisfy LiveIntervals' requirement that all uses be
+ // dominated by defs. Now that LiveIntervals has computed which of these
+ // defs are actually needed and which are dead, remove the dead ones.
+ for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII++;
+ if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
+ LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
+ LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+
+ return false;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
deleted file mode 100644
index d570d4266110..000000000000
--- a/lib/Target/WebAssembly/WebAssemblyPEI.cpp
+++ /dev/null
@@ -1,1066 +0,0 @@
-//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is responsible for finalizing the functions frame layout, saving
-// callee saved registers, and for emitting prolog & epilog code for the
-// function.
-//
-// This pass must be run after register allocation. After this pass is
-// executed, it is illegal to construct MO_FrameIndex operands.
-//
-// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does
-// not assert that all virtual registers are gone (because WebAssembly currently
-// uses virtual rather than physical registers), and only runs
-// MRI.clearVirtRegs() if scavenging happened (which it never does). It also
-// uses a different class name so it can be registered via INITIALIZE_PASS.
-// It is otherwise unmodified, so any changes to the target-independent PEI
-// can be easily applied.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include <climits>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pei"
-namespace llvm {
-void initializeWasmPEIPass(PassRegistry&);
-}
-namespace {
-class WasmPEI : public MachineFunctionPass {
-public:
- static char ID;
- WasmPEI() : MachineFunctionPass(ID) {
- initializeWasmPEIPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
- /// frame indexes with appropriate references.
- ///
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
-private:
- RegScavenger *RS;
-
- // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
- // stack frame indexes.
- unsigned MinCSFrameIndex, MaxCSFrameIndex;
-
- // Save and Restore blocks of the current function. Typically there is a
- // single save block, unless Windows EH funclets are involved.
- SmallVector<MachineBasicBlock *, 1> SaveBlocks;
- SmallVector<MachineBasicBlock *, 4> RestoreBlocks;
-
- // Flag to control whether to use the register scavenger to resolve
- // frame index materialization registers. Set according to
- // TRI->requiresFrameIndexScavenging() for the current function.
- bool FrameIndexVirtualScavenging;
-
- void calculateSets(MachineFunction &Fn);
- void calculateCallsInformation(MachineFunction &Fn);
- void assignCalleeSavedSpillSlots(MachineFunction &Fn,
- const BitVector &SavedRegs);
- void insertCSRSpillsAndRestores(MachineFunction &Fn);
- void calculateFrameObjectOffsets(MachineFunction &Fn);
- void replaceFrameIndices(MachineFunction &Fn);
- void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
- int &SPAdj);
- void scavengeFrameVirtualRegs(MachineFunction &Fn);
- void insertPrologEpilogCode(MachineFunction &Fn);
-};
-} // namespace
-
-char WasmPEI::ID = 0;
-
-namespace llvm {
-FunctionPass *createWebAssemblyPEI() {
- return new WasmPEI();
-}
-}
-
-static cl::opt<unsigned>
-WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
- cl::desc("Warn for stack size bigger than the given"
- " number"));
-
-INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog",
- "Wasm Prologue/Epilogue Insertion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog",
- "Wasm Prologue/Epilogue Insertion & Frame Finalization",
- false, false)
-
-STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
-STATISTIC(NumBytesStackSpace,
- "Number of bytes used for stack in all functions");
-
-void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addPreserved<MachineLoopInfo>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<StackProtector>();
- AU.addRequired<TargetPassConfig>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-/// Compute the set of return blocks
-void WasmPEI::calculateSets(MachineFunction &Fn) {
- const MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- // Even when we do not change any CSR, we still want to insert the
- // prologue and epilogue of the function.
- // So set the save points for those.
-
- // Use the points found by shrink-wrapping, if any.
- if (MFI->getSavePoint()) {
- SaveBlocks.push_back(MFI->getSavePoint());
- assert(MFI->getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
- // If RestoreBlock does not have any successor and is not a return block
- // then the end point is unreachable and we do not need to insert any
- // epilogue.
- if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
- RestoreBlocks.push_back(RestoreBlock);
- return;
- }
-
- // Save refs to entry and return blocks.
- SaveBlocks.push_back(&Fn.front());
- for (MachineBasicBlock &MBB : Fn) {
- if (MBB.isEHFuncletEntry())
- SaveBlocks.push_back(&MBB);
- if (MBB.isReturnBlock())
- RestoreBlocks.push_back(&MBB);
- }
-}
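
A minimal sketch of the placement decision calculateSets makes, using assumed toy block records rather than MachineBasicBlocks: honor a shrink-wrapping save point when one was computed, otherwise spill in the entry block and restore in every return block.

  #include <cstdio>
  #include <vector>

  struct Block { int ID; bool IsReturn; };

  int main() {
    std::vector<Block> Blocks = {{0, false}, {1, false}, {2, true}, {3, true}};
    int SavePoint = -1; // -1: shrink-wrapping found no better placement.

    std::vector<int> SaveBlocks, RestoreBlocks;
    if (SavePoint >= 0) {
      SaveBlocks.push_back(SavePoint);
      // A matching restore point would be pushed here as well.
    } else {
      SaveBlocks.push_back(Blocks.front().ID);
      for (const Block &B : Blocks)
        if (B.IsReturn)
          RestoreBlocks.push_back(B.ID);
    }

    for (int ID : SaveBlocks) std::printf("save in bb%d\n", ID);
    for (int ID : RestoreBlocks) std::printf("restore in bb%d\n", ID);
    return 0;
  }
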
-
-/// StackObjSet - A set of stack object indexes
-typedef SmallSetVector<int, 8> StackObjSet;
-
-/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
-/// frame indexes with appropriate references.
-///
-bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) {
- const Function* F = Fn.getFunction();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
-
- // LOCALMOD: assert removed from target-independent PEI
- //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
-
- RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
- FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
-
- // Calculate the MaxCallFrameSize and AdjustsStack variables for the
- // function's frame information. Also eliminates call frame pseudo
- // instructions.
- calculateCallsInformation(Fn);
-
- // Determine which of the registers in the callee save list should be saved.
- BitVector SavedRegs;
- TFI->determineCalleeSaves(Fn, SavedRegs, RS);
-
- // Insert spill code for any callee saved registers that are modified.
- assignCalleeSavedSpillSlots(Fn, SavedRegs);
-
- // Determine placement of CSR spill/restore code:
- // place all spills in the entry block, all restores in return blocks.
- calculateSets(Fn);
-
- // Add the code to save and restore the callee saved registers.
- if (!F->hasFnAttribute(Attribute::Naked))
- insertCSRSpillsAndRestores(Fn);
-
- // Allow the target machine to make final modifications to the function
- // before the frame layout is finalized.
- TFI->processFunctionBeforeFrameFinalized(Fn, RS);
-
- // Calculate actual frame offsets for all abstract stack objects...
- calculateFrameObjectOffsets(Fn);
-
- // Add prolog and epilog code to the function. This function is required
- // to align the stack frame as necessary for any stack variables or
- // called functions. Because of this, calculateCalleeSavedRegisters()
- // must be called before this function in order to set the AdjustsStack
- // and MaxCallFrameSize variables.
- if (!F->hasFnAttribute(Attribute::Naked))
- insertPrologEpilogCode(Fn);
-
- // Replace all MO_FrameIndex operands with physical register references
- // and actual offsets.
- //
- replaceFrameIndices(Fn);
-
- // If register scavenging is needed, as we've enabled doing it as a
- // post-pass, scavenge the virtual registers that frame index elimination
- // inserted.
- if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
- scavengeFrameVirtualRegs(Fn);
- // Clear any vregs created by virtual scavenging.
- // LOCALMOD: made this call conditional on scavengeFrameVirtualRegs() running
- Fn.getRegInfo().clearVirtRegs();
- }
-
- // Warn when the stack size exceeds the given limit.
- MachineFrameInfo *MFI = Fn.getFrameInfo();
- uint64_t StackSize = MFI->getStackSize();
- if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
- DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
- F->getContext().diagnose(DiagStackSize);
- }
-
- delete RS;
- SaveBlocks.clear();
- RestoreBlocks.clear();
- return true;
-}
-
-/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
-/// variables for the function's frame information and eliminate call frame
-/// pseudo instructions.
-void WasmPEI::calculateCallsInformation(MachineFunction &Fn) {
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- unsigned MaxCallFrameSize = 0;
- bool AdjustsStack = MFI->adjustsStack();
-
- // Get the function call frame set-up and tear-down instruction opcode
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
-
- // Early exit for targets which have no call frame setup/destroy pseudo
- // instructions.
- if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
- return;
-
- std::vector<MachineBasicBlock::iterator> FrameSDOps;
- for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
- " instructions should have a single immediate argument!");
- unsigned Size = I->getOperand(0).getImm();
- if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
- AdjustsStack = true;
- FrameSDOps.push_back(I);
- } else if (I->isInlineAsm()) {
- // Some inline asm's need a stack frame, as indicated by operand 1.
- unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
- if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
- AdjustsStack = true;
- }
-
- MFI->setAdjustsStack(AdjustsStack);
- MFI->setMaxCallFrameSize(MaxCallFrameSize);
-
- for (std::vector<MachineBasicBlock::iterator>::iterator
- i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
- MachineBasicBlock::iterator I = *i;
-
- // If call frames are not being included as part of the stack frame, and
- // the target doesn't indicate otherwise, remove the call frame pseudos
- // here. The sub/add sp instruction pairs are still inserted, but we don't
- // need to track the SP adjustment for frame index elimination.
- if (TFI->canSimplifyCallFramePseudos(Fn))
- TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
- }
-}
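
For the MaxCallFrameSize computation above, a small standalone sketch may help (the toy opcodes and sizes are assumptions, not LLVM types): every call-frame setup/destroy pseudo carries the size of the call frame it brackets, and the function-wide maximum plus an "adjusts stack" flag is all the frame layout needs.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  enum Opcode { CALLSEQ_START, CALLSEQ_END, OTHER };
  struct Inst { Opcode Op; unsigned Size; };

  int main() {
    std::vector<Inst> Body = {
        {CALLSEQ_START, 16}, {OTHER, 0}, {CALLSEQ_END, 16},
        {CALLSEQ_START, 32}, {OTHER, 0}, {CALLSEQ_END, 32}};

    unsigned MaxCallFrameSize = 0;
    bool AdjustsStack = false;
    for (const Inst &I : Body)
      if (I.Op == CALLSEQ_START || I.Op == CALLSEQ_END) {
        MaxCallFrameSize = std::max(MaxCallFrameSize, I.Size);
        AdjustsStack = true;
      }

    std::printf("MaxCallFrameSize=%u AdjustsStack=%d\n", MaxCallFrameSize,
                AdjustsStack);
    return 0;
  }
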
-
-void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F,
- const BitVector &SavedRegs) {
- // These are used to keep track of the callee-save area. Initialize them.
- MinCSFrameIndex = INT_MAX;
- MaxCSFrameIndex = 0;
-
- if (SavedRegs.empty())
- return;
-
- const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
-
- std::vector<CalleeSavedInfo> CSI;
- for (unsigned i = 0; CSRegs[i]; ++i) {
- unsigned Reg = CSRegs[i];
- if (SavedRegs.test(Reg))
- CSI.push_back(CalleeSavedInfo(Reg));
- }
-
- const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
- MachineFrameInfo *MFI = F.getFrameInfo();
- if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
- // If target doesn't implement this, use generic code.
-
- if (CSI.empty())
- return; // Early exit if no callee saved registers are modified!
-
- unsigned NumFixedSpillSlots;
- const TargetFrameLowering::SpillSlot *FixedSpillSlots =
- TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
-
- // Now that we know which registers need to be saved and restored, allocate
- // stack slots for them.
- for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
- I != E; ++I) {
- unsigned Reg = I->getReg();
- const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
-
- int FrameIdx;
- if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
- I->setFrameIdx(FrameIdx);
- continue;
- }
-
- // Check to see if this physreg must be spilled to a particular stack slot
- // on this target.
- const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
- while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
- FixedSlot->Reg != Reg)
- ++FixedSlot;
-
- if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
- // Nope, just spill it anywhere convenient.
- unsigned Align = RC->getAlignment();
- unsigned StackAlign = TFI->getStackAlignment();
-
- // We may not be able to satisfy the desired alignment specification of
- // the TargetRegisterClass if the stack alignment is smaller. Use the
- // min.
- Align = std::min(Align, StackAlign);
- FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
- if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
- if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
- } else {
- // Spill it to the stack where we must.
- FrameIdx =
- MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
- }
-
- I->setFrameIdx(FrameIdx);
- }
- }
-
- MFI->setCalleeSavedInfo(CSI);
-}
-
-/// Helper function to update the liveness information for the callee-saved
-/// registers.
-static void updateLiveness(MachineFunction &MF) {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- // Visited will contain all the basic blocks that are in the region
- // where the callee saved registers are alive:
- // - Anything that is not Save or Restore -> LiveThrough.
- // - Save -> LiveIn.
- // - Restore -> LiveOut.
- // The live-out is not attached to the block, so no need to keep
- // Restore in this set.
- SmallPtrSet<MachineBasicBlock *, 8> Visited;
- SmallVector<MachineBasicBlock *, 8> WorkList;
- MachineBasicBlock *Entry = &MF.front();
- MachineBasicBlock *Save = MFI->getSavePoint();
-
- if (!Save)
- Save = Entry;
-
- if (Entry != Save) {
- WorkList.push_back(Entry);
- Visited.insert(Entry);
- }
- Visited.insert(Save);
-
- MachineBasicBlock *Restore = MFI->getRestorePoint();
- if (Restore)
- // By construction Restore cannot be visited, otherwise it
- // means there exists a path to Restore that does not go
- // through Save.
- WorkList.push_back(Restore);
-
- while (!WorkList.empty()) {
- const MachineBasicBlock *CurBB = WorkList.pop_back_val();
- // By construction, the region that is after the save point is
- // dominated by the Save and post-dominated by the Restore.
- if (CurBB == Save && Save != Restore)
- continue;
- // Enqueue all the successors not already visited.
- // Those are by construction either before Save or after Restore.
- for (MachineBasicBlock *SuccBB : CurBB->successors())
- if (Visited.insert(SuccBB).second)
- WorkList.push_back(SuccBB);
- }
-
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- for (MachineBasicBlock *MBB : Visited) {
- MCPhysReg Reg = CSI[i].getReg();
- // Add the callee-saved register as live-in.
- // It's killed at the spill.
- if (!MBB->isLiveIn(Reg))
- MBB->addLiveIn(Reg);
- }
- }
-}
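
The worklist walk in updateLiveness can be hard to follow; here is a standalone sketch over a plain adjacency list (toy CFG, assumed data). It visits everything reachable from the entry and from the restore block without expanding past the save block, and the callee-saved registers are then marked live-in to exactly those blocks.

  #include <cstdio>
  #include <set>
  #include <vector>

  int main() {
    // bb0 -> bb1 -> bb2 -> bb3, with Save = bb1 and Restore = bb2.
    std::vector<std::vector<int>> Succs = {{1}, {2}, {3}, {}};
    int Entry = 0, Save = 1, Restore = 2;

    std::set<int> Visited;
    std::vector<int> WorkList;
    if (Entry != Save) {
      WorkList.push_back(Entry);
      Visited.insert(Entry);
    }
    Visited.insert(Save);
    WorkList.push_back(Restore);

    while (!WorkList.empty()) {
      int BB = WorkList.back();
      WorkList.pop_back();
      if (BB == Save && Save != Restore)
        continue; // Don't expand past the save point.
      for (int Succ : Succs[BB])
        if (Visited.insert(Succ).second)
          WorkList.push_back(Succ);
    }

    for (int BB : Visited)
      std::printf("CSRs are live-in to bb%d\n", BB);
    return 0;
  }
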
-
-/// insertCSRSpillsAndRestores - Insert spill and restore code for
-/// callee saved registers used in the function.
-///
-void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
- // Get callee saved register information.
- MachineFrameInfo *MFI = Fn.getFrameInfo();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- MFI->setCalleeSavedInfoValid(true);
-
- // Early exit if no callee saved registers are modified!
- if (CSI.empty())
- return;
-
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- MachineBasicBlock::iterator I;
-
- // Spill using target interface.
- for (MachineBasicBlock *SaveBlock : SaveBlocks) {
- I = SaveBlock->begin();
- if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- // Insert the spill to the stack frame.
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(),
- RC, TRI);
- }
- }
- // Update the live-in information of all the blocks up to the save point.
- updateLiveness(Fn);
- }
-
- // Restore using target interface.
- for (MachineBasicBlock *MBB : RestoreBlocks) {
- I = MBB->end();
-
- // Skip over all terminator instructions, which are part of the return
- // sequence.
- MachineBasicBlock::iterator I2 = I;
- while (I2 != MBB->begin() && (--I2)->isTerminator())
- I = I2;
-
- bool AtStart = I == MBB->begin();
- MachineBasicBlock::iterator BeforeI = I;
- if (!AtStart)
- --BeforeI;
-
- // Restore all registers immediately before the return and any
- // terminators that precede it.
- if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
- assert(I != MBB->begin() &&
- "loadRegFromStackSlot didn't insert any code!");
- // Insert in reverse order. loadRegFromStackSlot can insert
- // multiple instructions.
- if (AtStart)
- I = MBB->begin();
- else {
- I = BeforeI;
- ++I;
- }
- }
- }
- }
-}
-
-/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
-static inline void
-AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
- bool StackGrowsDown, int64_t &Offset,
- unsigned &MaxAlign, unsigned Skew) {
- // If the stack grows down, add the object size to find the lowest address.
- if (StackGrowsDown)
- Offset += MFI->getObjectSize(FrameIdx);
-
- unsigned Align = MFI->getObjectAlignment(FrameIdx);
-
- // If the alignment of this object is greater than that of the stack, then
- // increase the stack alignment to match.
- MaxAlign = std::max(MaxAlign, Align);
-
- // Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- if (StackGrowsDown) {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
- MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
- } else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
- MFI->setObjectOffset(FrameIdx, Offset);
- Offset += MFI->getObjectSize(FrameIdx);
- }
-}
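
A standalone sketch of the arithmetic AdjustStackOffset performs for a downward-growing stack, with a local roundUpToAlignment helper written to match the rounding used above (the object sizes and alignments are made up): add the object's size, round the running offset up to the object's alignment (honoring the skew), and record the object at -Offset.

  #include <cstdint>
  #include <cstdio>

  static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align,
                                     uint64_t Skew = 0) {
    Skew %= Align;
    return (Value + Align - 1 - Skew) / Align * Align + Skew;
  }

  int main() {
    struct Object { uint64_t Size, Align; };
    Object Objects[] = {{4, 4}, {8, 8}, {1, 1}, {16, 16}};

    uint64_t Offset = 0, MaxAlign = 1, Skew = 0;
    for (const Object &O : Objects) {
      Offset += O.Size;                    // Stack grows down: size first.
      if (O.Align > MaxAlign) MaxAlign = O.Align;
      Offset = roundUpToAlignment(Offset, O.Align, Skew);
      std::printf("object at SP[-%llu]\n", (unsigned long long)Offset);
    }
    std::printf("frame uses %llu bytes, max align %llu\n",
                (unsigned long long)Offset, (unsigned long long)MaxAlign);
    return 0;
  }

Running it on the four objects shown prints offsets at SP[-4], SP[-16], SP[-17] and SP[-48], i.e. 48 bytes of frame before the final StackAlign rounding.
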
-
-/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
-/// those required to be close to the Stack Protector) to stack offsets.
-static void
-AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
- SmallSet<int, 16> &ProtectedObjs,
- MachineFrameInfo *MFI, bool StackGrowsDown,
- int64_t &Offset, unsigned &MaxAlign, unsigned Skew) {
-
- for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
- E = UnassignedObjs.end(); I != E; ++I) {
- int i = *I;
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
- ProtectedObjs.insert(i);
- }
-}
-
-/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
-/// abstract stack objects.
-///
-void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- StackProtector *SP = &getAnalysis<StackProtector>();
-
- bool StackGrowsDown =
- TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
-
- // Loop over all of the stack objects, assigning sequential addresses...
- MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- // Start at the beginning of the local area.
- // The Offset is the distance from the stack top in the direction
- // of stack growth -- so it's always nonnegative.
- int LocalAreaOffset = TFI.getOffsetOfLocalArea();
- if (StackGrowsDown)
- LocalAreaOffset = -LocalAreaOffset;
- assert(LocalAreaOffset >= 0
- && "Local area offset should be in direction of stack growth");
- int64_t Offset = LocalAreaOffset;
-
- // Skew to be applied to alignment.
- unsigned Skew = TFI.getStackAlignmentSkew(Fn);
-
- // If there are fixed sized objects that are preallocated in the local area,
- // non-fixed objects can't be allocated right at the start of local area.
- // We currently don't support filling in holes in between fixed sized
- // objects, so we adjust 'Offset' to point to the end of last fixed sized
- // preallocated object.
- for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
- int64_t FixedOff;
- if (StackGrowsDown) {
- // The maximum distance from the stack pointer is at the lower address of
- // the object, which is given by the offset. For a downward-growing stack
- // the offset is negative, so we negate it to get the distance.
- FixedOff = -MFI->getObjectOffset(i);
- } else {
- // The maximum distance from the start pointer is at the upper
- // address of the object.
- FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
- }
- if (FixedOff > Offset) Offset = FixedOff;
- }
-
- // First assign frame offsets to stack objects that are used to spill
- // callee saved registers.
- if (StackGrowsDown) {
- for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
- // If the stack grows down, we need to add the size to find the lowest
- // address of the object.
- Offset += MFI->getObjectSize(i);
-
- unsigned Align = MFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- MFI->setObjectOffset(i, -Offset); // Set the computed offset
- }
- } else {
- int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
- for (int i = MaxCSFI; i >= MinCSFI ; --i) {
- unsigned Align = MFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- MFI->setObjectOffset(i, Offset);
- Offset += MFI->getObjectSize(i);
- }
- }
-
- unsigned MaxAlign = MFI->getMaxAlignment();
-
- // Make sure the special register scavenging spill slot is closest to the
- // incoming stack pointer if a frame pointer is required and is closer
- // to the incoming rather than the final stack pointer.
- const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
- bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
- TFI.isFPCloseToIncomingSP() &&
- RegInfo->useFPForScavengingIndex(Fn) &&
- !RegInfo->needsStackRealignment(Fn));
- if (RS && EarlyScavengingSlots) {
- SmallVector<int, 2> SFIs;
- RS->getScavengingFrameIndices(SFIs);
- for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
- IE = SFIs.end(); I != IE; ++I)
- AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- // FIXME: Once this is working, then enable flag will change to a target
- // check for whether the frame is large enough to want to use virtual
- // frame index registers. Functions which don't want/need this optimization
- // will continue to use the existing code path.
- if (MFI->getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI->getLocalFrameMaxAlign();
-
- // Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
-
- // Resolve offsets for objects in the local block.
- for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
- std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
- int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
- DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
- FIOffset << "]\n");
- MFI->setObjectOffset(Entry.first, FIOffset);
- }
- // Allocate the local block
- Offset += MFI->getLocalFrameSize();
-
- MaxAlign = std::max(Align, MaxAlign);
- }
-
- // Make sure that the stack protector comes before the local variables on the
- // stack.
- SmallSet<int, 16> ProtectedObjs;
- if (MFI->getStackProtectorIndex() >= 0) {
- StackObjSet LargeArrayObjs;
- StackObjSet SmallArrayObjs;
- StackObjSet AddrOfObjs;
-
- AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
- Offset, MaxAlign, Skew);
-
- // Assign large stack objects first.
- for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
- if (MFI->isObjectPreAllocated(i) &&
- MFI->getUseLocalStackAllocationBlock())
- continue;
- if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
- continue;
- if (RS && RS->isScavengingFrameIndex((int)i))
- continue;
- if (MFI->isDeadObjectIndex(i))
- continue;
- if (MFI->getStackProtectorIndex() == (int)i)
- continue;
-
- switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
- case StackProtector::SSPLK_None:
- continue;
- case StackProtector::SSPLK_SmallArray:
- SmallArrayObjs.insert(i);
- continue;
- case StackProtector::SSPLK_AddrOf:
- AddrOfObjs.insert(i);
- continue;
- case StackProtector::SSPLK_LargeArray:
- LargeArrayObjs.insert(i);
- continue;
- }
- llvm_unreachable("Unexpected SSPLayoutKind.");
- }
-
- AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- }
-
- // Then assign frame offsets to stack objects that are not used to spill
- // callee saved registers.
- for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
- if (MFI->isObjectPreAllocated(i) &&
- MFI->getUseLocalStackAllocationBlock())
- continue;
- if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
- continue;
- if (RS && RS->isScavengingFrameIndex((int)i))
- continue;
- if (MFI->isDeadObjectIndex(i))
- continue;
- if (MFI->getStackProtectorIndex() == (int)i)
- continue;
- if (ProtectedObjs.count(i))
- continue;
-
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- // Make sure the special register scavenging spill slot is closest to the
- // stack pointer.
- if (RS && !EarlyScavengingSlots) {
- SmallVector<int, 2> SFIs;
- RS->getScavengingFrameIndices(SFIs);
- for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
- IE = SFIs.end(); I != IE; ++I)
- AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- if (!TFI.targetHandlesStackFrameRounding()) {
- // If we have reserved argument space for call sites in the function
- // immediately on entry to the current function, count it as part of the
- // overall stack size.
- if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
- Offset += MFI->getMaxCallFrameSize();
-
- // Round up the size to a multiple of the alignment. If the function has
- // any calls or alloca's, align to the target's StackAlignment value to
- // ensure that the callee's frame or the alloca data is suitably aligned;
- // otherwise, for leaf functions, align to the TransientStackAlignment
- // value.
- unsigned StackAlign;
- if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
- (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
- StackAlign = TFI.getStackAlignment();
- else
- StackAlign = TFI.getTransientStackAlignment();
-
- // If the frame pointer is eliminated, all frame offsets will be relative to
- // SP not FP. Align to MaxAlign so this works.
- StackAlign = std::max(StackAlign, MaxAlign);
- Offset = RoundUpToAlignment(Offset, StackAlign, Skew);
- }
-
- // Update frame info to pretend that this is part of the stack...
- int64_t StackSize = Offset - LocalAreaOffset;
- MFI->setStackSize(StackSize);
- NumBytesStackSpace += StackSize;
-}
-
-/// insertPrologEpilogCode - Scan the function for modified callee saved
-/// registers, insert spill code for these callee saved registers, then add
-/// prolog and epilog code to the function.
-///
-void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
-
- // Add prologue to the function...
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.emitPrologue(Fn, *SaveBlock);
-
- // Add epilogue to restore the callee-save registers in each exiting block.
- for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
- TFI.emitEpilogue(Fn, *RestoreBlock);
-
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.inlineStackProbe(Fn, *SaveBlock);
-
- // Emit additional code that is required to support segmented stacks, if
- // we've been asked for it. This, when linked with a runtime with support
- // for segmented stacks (libgcc is one), will result in allocating stack
- // space in small chunks instead of one large contiguous block.
- if (Fn.shouldSplitStack()) {
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForSegmentedStacks(Fn, *SaveBlock);
- }
-
- // Emit additional code that is required to explicitly handle the stack in
- // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
- // approach is rather similar to that of Segmented Stacks, but it uses a
- // different conditional check and another BIF for allocating more stack
- // space.
- if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForHiPEPrologue(Fn, *SaveBlock);
-}
-
-/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
-/// register references and actual offsets.
-///
-void WasmPEI::replaceFrameIndices(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- if (!TFI.needsFrameIndexResolution(Fn)) return;
-
- // Store SPAdj at exit of a basic block.
- SmallVector<int, 8> SPState;
- SPState.resize(Fn.getNumBlockIDs());
- SmallPtrSet<MachineBasicBlock*, 8> Reachable;
-
- // Iterate over the reachable blocks in DFS order.
- for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
- DFI != DFE; ++DFI) {
- int SPAdj = 0;
- // Check the exit state of the DFS stack predecessor.
- if (DFI.getPathLength() >= 2) {
- MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
- assert(Reachable.count(StackPred) &&
- "DFS stack predecessor is already visited.\n");
- SPAdj = SPState[StackPred->getNumber()];
- }
- MachineBasicBlock *BB = *DFI;
- replaceFrameIndices(BB, Fn, SPAdj);
- SPState[BB->getNumber()] = SPAdj;
- }
-
- // Handle the unreachable blocks.
- for (auto &BB : Fn) {
- if (Reachable.count(&BB))
- // Already handled in DFS traversal.
- continue;
- int SPAdj = 0;
- replaceFrameIndices(&BB, Fn, SPAdj);
- }
-}
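
The DFS bookkeeping above threads the stack-pointer adjustment from each block to the blocks reached from it. A standalone sketch with a toy CFG (assumed data, not df_ext_iterator): each block starts from its DFS predecessor's exit SPAdj, applies its own call-frame adjustments, and stores its exit value for blocks visited later.

  #include <cstdio>
  #include <vector>

  struct Block { std::vector<int> Succs; int LocalSPAdjust; };

  static void dfs(int BB, int SPAdjIn, const std::vector<Block> &CFG,
                  std::vector<int> &ExitSPAdj, std::vector<bool> &Seen) {
    Seen[BB] = true;
    int SPAdj = SPAdjIn + CFG[BB].LocalSPAdjust; // frame setup/destroy etc.
    ExitSPAdj[BB] = SPAdj;
    for (int Succ : CFG[BB].Succs)
      if (!Seen[Succ])
        dfs(Succ, SPAdj, CFG, ExitSPAdj, Seen);
  }

  int main() {
    // bb0 sets up a 16-byte call frame; bb1 tears it down; bb2 is a return.
    std::vector<Block> CFG = {{{1}, 16}, {{2}, -16}, {{}, 0}};
    std::vector<int> ExitSPAdj(CFG.size(), 0);
    std::vector<bool> Seen(CFG.size(), false);

    dfs(0, 0, CFG, ExitSPAdj, Seen);
    for (unsigned BB = 0; BB < CFG.size(); ++BB)
      std::printf("bb%u exits with SPAdj=%d\n", BB, ExitSPAdj[BB]);
    return 0;
  }
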
-
-void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
- int &SPAdj) {
- assert(Fn.getSubtarget().getRegisterInfo() &&
- "getRegisterInfo() must be implemented!");
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
-
- if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
-
- bool InsideCallSequence = false;
-
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
-
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
- SPAdj += TII.getSPAdjust(I);
-
- MachineBasicBlock::iterator PrevI = BB->end();
- if (I != BB->begin()) PrevI = std::prev(I);
- TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
-
- // Visit the instructions created by eliminateCallFramePseudoInstr().
- if (PrevI == BB->end())
- I = BB->begin(); // The replaced instr was the first in the block.
- else
- I = std::next(PrevI);
- continue;
- }
-
- MachineInstr *MI = I;
- bool DoIncr = true;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (!MI->getOperand(i).isFI())
- continue;
-
- // Frame indices in debug values are encoded in a target independent
- // way with simply the frame index and offset rather than any
- // target-specific addressing mode.
- if (MI->isDebugValue()) {
- assert(i == 0 && "Frame indices can only appear as the first "
- "operand of a DBG_VALUE machine instruction");
- unsigned Reg;
- MachineOperand &Offset = MI->getOperand(1);
- Offset.setImm(Offset.getImm() +
- TFI->getFrameIndexReference(
- Fn, MI->getOperand(0).getIndex(), Reg));
- MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
- continue;
- }
-
- // TODO: This code should be commoned with the code for
- // PATCHPOINT. There's no good reason for the difference in
- // implementation other than historical accident. The only
- // remaining difference is the unconditional use of the stack
- // pointer as the base register.
- if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
- assert((!MI->isDebugValue() || i == 0) &&
- "Frame indicies can only appear as the first operand of a "
- "DBG_VALUE machine instruction");
- unsigned Reg;
- MachineOperand &Offset = MI->getOperand(i + 1);
- const unsigned refOffset =
- TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
- Reg);
-
- Offset.setImm(Offset.getImm() + refOffset);
- MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
- continue;
- }
-
- // Some instructions (e.g. inline asm instructions) can have
- // multiple frame indices and/or cause eliminateFrameIndex
- // to insert more than one instruction. We need the register
- // scavenger to go through all of these instructions so that
- // it can update its register information. We keep the
- // iterator at the point before insertion so that we can
- // revisit them in full.
- bool AtBeginning = (I == BB->begin());
- if (!AtBeginning) --I;
-
- // If this instruction has a FrameIndex operand, we need to
- // use that target machine register info object to eliminate
- // it.
- TRI.eliminateFrameIndex(MI, SPAdj, i,
- FrameIndexVirtualScavenging ? nullptr : RS);
-
- // Reset the iterator if we were at the beginning of the BB.
- if (AtBeginning) {
- I = BB->begin();
- DoIncr = false;
- }
-
- MI = nullptr;
- break;
- }
-
- // If we are looking at a call sequence, we need to keep track of
- // the SP adjustment made by each instruction in the sequence.
- // This includes both the frame setup/destroy pseudos (handled above),
- // as well as other instructions that have side effects w.r.t the SP.
- // Note that this must come after eliminateFrameIndex, because
- // if I itself referred to a frame index, we shouldn't count its own
- // adjustment.
- if (MI && InsideCallSequence)
- SPAdj += TII.getSPAdjust(MI);
-
- if (DoIncr && I != BB->end()) ++I;
-
- // Update register states.
- if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
- }
-}
-
-/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
-/// with physical registers. Use the register scavenger to find an
-/// appropriate register to use.
-///
-/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
-/// iterate over the vreg use list, which at this point only contains machine
-/// operands for which eliminateFrameIndex needs a new scratch reg.
-void
-WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
- // Run through the instructions and find any virtual registers.
- for (MachineFunction::iterator BB = Fn.begin(),
- E = Fn.end(); BB != E; ++BB) {
- RS->enterBasicBlock(&*BB);
-
- int SPAdj = 0;
-
- // The instruction stream may change in the loop, so check BB->end()
- // directly.
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
- // We might end up here again with a NULL iterator if we scavenged a
- // register for which we inserted spill code for definition by what was
- // originally the first instruction in BB.
- if (I == MachineBasicBlock::iterator(nullptr))
- I = BB->begin();
-
- MachineInstr *MI = I;
- MachineBasicBlock::iterator J = std::next(I);
- MachineBasicBlock::iterator P =
- I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
- : std::prev(I);
-
- // RS should process this instruction before we might scavenge at this
- // location. This is because we might be replacing a virtual register
- // defined by this instruction, and if so, registers killed by this
- // instruction are available, and defined registers are not.
- RS->forward(I);
-
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isReg()) {
- MachineOperand &MO = MI->getOperand(i);
- unsigned Reg = MO.getReg();
- if (Reg == 0)
- continue;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
-
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MI->getOperand(i).isDef() &&
- "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
- unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
-
- ++NumScavengedRegs;
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert (ScratchReg && "Missing scratch register!");
- Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
-
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS->setRegUsed(ScratchReg);
- }
- }
-
- // If the scavenger needed to use one of its spill slots, the
- // spill code will have been inserted in between I and J. This is a
- // problem because we need the spill code before I: Move I to just
- // prior to J.
- if (I != std::prev(J)) {
- BB->splice(J, &*BB, I);
-
- // Before we move I, we need to prepare the RS to visit I again.
- // Specifically, RS will assert if it sees uses of registers that
- // it believes are undefined. Because we have already processed
- // register kills in I, when it visits I again, it will believe that
- // those registers are undefined. To avoid this situation, unprocess
- // the instruction I.
- assert(RS->getCurrentPosition() == I &&
- "The register scavenger has an unexpected position");
- I = P;
- RS->unprocess(P);
- } else
- ++I;
- }
- }
-}
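
A standalone sketch of the scavenging step itself (toy register names and liveness, not RegScavenger): pick any member of the required register class that is not live at the use point; only if every candidate is live does a register have to be spilled to a scavenging frame index around the use.

  #include <cstdio>
  #include <set>
  #include <string>
  #include <vector>

  int main() {
    std::vector<const char *> I32Class = {"$1", "$2", "$3", "$4"};
    std::set<std::string> LiveHere = {"$1", "$3"};

    const char *Scratch = nullptr;
    for (const char *Reg : I32Class)
      if (!LiveHere.count(Reg)) {
        Scratch = Reg;
        break;
      }

    if (Scratch)
      std::printf("scavenged %s as the scratch register\n", Scratch);
    else
      std::printf("no free register: spill one to a scavenging slot\n");
    return 0;
  }
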
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 4ad6eed7385b..56d44e6466eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -12,14 +12,23 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-peephole"
+static cl::opt<bool> DisableWebAssemblyFallthroughReturnOpt(
+ "disable-wasm-fallthrough-return-opt", cl::Hidden,
+ cl::desc("WebAssembly: Disable fallthrough-return optimizations."),
+ cl::init(false));
+
namespace {
class WebAssemblyPeephole final : public MachineFunctionPass {
const char *getPassName() const override {
@@ -28,6 +37,7 @@ class WebAssemblyPeephole final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -44,11 +54,65 @@ FunctionPass *llvm::createWebAssemblyPeephole() {
return new WebAssemblyPeephole();
}
-bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+/// If desirable, rewrite NewReg to a drop register.
+static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
+ MachineOperand &MO, WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
bool Changed = false;
+ if (OldReg == NewReg) {
+ Changed = true;
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ MO.setReg(NewReg);
+ MO.setIsDead();
+ MFI.stackifyVReg(NewReg);
+ }
+ return Changed;
+}
+
+static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
+ const MachineFunction &MF,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo &TII,
+ unsigned FallthroughOpc,
+ unsigned CopyLocalOpc) {
+ if (DisableWebAssemblyFallthroughReturnOpt)
+ return false;
+ if (&MBB != &MF.back())
+ return false;
+ if (&MI != &MBB.back())
+ return false;
+
+ // If the operand isn't stackified, insert a COPY_LOCAL to read the operand
+ // and stackify it.
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (!MFI.isVRegStackified(Reg)) {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+ .addReg(Reg);
+ MO.setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ }
+
+ // Rewrite the return.
+ MI.setDesc(TII.get(FallthroughOpc));
+ return true;
+}
+
+bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Peephole **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const WebAssemblyTargetLowering &TLI =
+ *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+ auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ bool Changed = false;
for (auto &MBB : MF)
for (auto &MI : MBB)
@@ -66,20 +130,67 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE_I64: {
// Store instructions return their value operand. If we ended up using
// the same register for both, replace it with a dead def so that it
- // can use $discard instead.
+ // can use $drop instead.
MachineOperand &MO = MI.getOperand(0);
unsigned OldReg = MO.getReg();
- // TODO: Handle SP/physregs
- if (OldReg == MI.getOperand(3).getReg()
- && TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) {
- Changed = true;
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
- MO.setReg(NewReg);
- MO.setIsDead();
- MFI.stackifyVReg(NewReg);
- MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg);
+ unsigned NewReg =
+ MI.getOperand(WebAssembly::StoreValueOperandNo).getReg();
+ Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+ break;
+ }
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64: {
+ MachineOperand &Op1 = MI.getOperand(1);
+ if (Op1.isSymbol()) {
+ StringRef Name(Op1.getSymbolName());
+ if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+ Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+ Name == TLI.getLibcallName(RTLIB::MEMSET)) {
+ LibFunc::Func Func;
+ if (LibInfo.getLibFunc(Name, Func)) {
+ const auto &Op2 = MI.getOperand(2);
+ if (!Op2.isReg())
+ report_fatal_error("Peephole: call to builtin function with "
+ "wrong signature, not consuming reg");
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned OldReg = MO.getReg();
+ unsigned NewReg = Op2.getReg();
+
+ if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
+ report_fatal_error("Peephole: call to builtin function with "
+ "wrong signature, from/to mismatch");
+ Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+ }
+ }
}
+ break;
}
+ // Optimize away an explicit void return at the end of the function.
+ case WebAssembly::RETURN_I32:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
+ WebAssembly::COPY_LOCAL_I32);
+ break;
+ case WebAssembly::RETURN_I64:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
+ WebAssembly::COPY_LOCAL_I64);
+ break;
+ case WebAssembly::RETURN_F32:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
+ WebAssembly::COPY_LOCAL_F32);
+ break;
+ case WebAssembly::RETURN_F64:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
+ WebAssembly::COPY_LOCAL_F64);
+ break;
+ case WebAssembly::RETURN_VOID:
+ if (!DisableWebAssemblyFallthroughReturnOpt &&
+ &MBB == &MF.back() && &MI == &MBB.back())
+ MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+ break;
}
return Changed;
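
A standalone sketch of the fallthrough-return rewrite (toy opcodes, not the WebAssembly MC layer): when the last instruction of the last block is an explicit return, it can be turned into a fallthrough return, since the returned value is already on top of the implicit value stack at the end of the function.

  #include <cstdio>
  #include <vector>

  enum Opcode { ADD, RETURN_I32, FALLTHROUGH_RETURN_I32 };

  int main() {
    std::vector<std::vector<Opcode>> Blocks = {{ADD}, {ADD, RETURN_I32}};

    std::vector<Opcode> &LastBlock = Blocks.back();
    if (!LastBlock.empty() && LastBlock.back() == RETURN_I32) {
      LastBlock.back() = FALLTHROUGH_RETURN_I32;
      std::printf("rewrote explicit return to fallthrough return\n");
    }
    return 0;
  }
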
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
new file mode 100644
index 000000000000..30444ac598a4
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -0,0 +1,136 @@
+//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix up code to meet LiveInterval's requirements.
+///
+/// Some CodeGen passes don't preserve LiveIntervals' requirements, because
+/// they normally run after register allocation, where those requirements no
+/// longer matter. WebAssembly, however, runs LiveIntervals in a late pass.
+/// This pass transforms code to meet LiveIntervals' requirements; primarily,
+/// it ensures that all
+/// virtual register uses have definitions (IMPLICIT_DEF definitions if
+/// nothing else).
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-prepare-for-live-intervals"
+
+namespace {
+class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+ const char *getPassName() const override {
+ return "WebAssembly Prepare For LiveIntervals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPrepareForLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
+ return new WebAssemblyPrepareForLiveIntervals();
+}
+
+/// Test whether the given instruction is an ARGUMENT.
+static bool IsArgument(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Test whether the given register has an ARGUMENT def.
+static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+ for (auto &Def : MRI.def_instructions(Reg))
+ if (IsArgument(&Def))
+ return true;
+ return false;
+}
+
+bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
+    MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Prepare For LiveIntervals **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineBasicBlock &Entry = *MF.begin();
+
+ assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+ "LiveIntervals shouldn't be active yet!");
+
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF
+ // instructions. LiveIntervals requires that all paths to virtual register
+ // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to
+ // conservatively satisfy this.
+ //
+ // TODO: This is fairly heavy-handed; find a better approach.
+ //
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // Skip unused registers.
+ if (MRI.use_nodbg_empty(Reg))
+ continue;
+
+ // Skip registers that have an ARGUMENT definition.
+ if (HasArgumentDef(Reg, MRI))
+ continue;
+
+ BuildMI(Entry, Entry.begin(), DebugLoc(),
+ TII.get(WebAssembly::IMPLICIT_DEF), Reg);
+ Changed = true;
+ }
+
+ // Move ARGUMENT_* instructions to the top of the entry block, so that their
+ // liveness reflects the fact that these really are live-in values.
+ for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII++;
+ if (IsArgument(MI)) {
+ MI->removeFromParent();
+ Entry.insert(Entry.begin(), MI);
+ }
+ }
+
+ // Ok, we're now ready to run LiveIntervalAnalysis again.
+ MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+ return Changed;
+}
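
A standalone sketch of the IMPLICIT_DEF insertion (toy instruction records, not MachineInstrs). It is simplified relative to the pass above, which conservatively inserts a placeholder for every used register without an ARGUMENT def; the sketch only handles registers with no definition at all, but the shape is the same: collect defs and uses, then prepend a placeholder def for anything used without one.

  #include <cstdio>
  #include <set>
  #include <vector>

  struct Inst { std::vector<int> Defs, Uses; };

  int main() {
    // %0 is defined; %1 is used without any definition.
    std::vector<Inst> Entry = {{{0}, {}}, {{}, {0, 1}}};

    std::set<int> Defined, Used;
    for (const Inst &I : Entry) {
      for (int R : I.Defs) Defined.insert(R);
      for (int R : I.Uses) Used.insert(R);
    }

    std::vector<Inst> Placeholders;
    for (int R : Used)
      if (!Defined.count(R)) {
        Placeholders.push_back(Inst{{R}, {}}); // IMPLICIT_DEF %R
        std::printf("inserting IMPLICIT_DEF for %%%d\n", R);
      }
    Entry.insert(Entry.begin(), Placeholders.begin(), Placeholders.end());
    return 0;
  }
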
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 9ec66595d8da..dedd9108dfd5 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -66,7 +66,7 @@ static float computeWeight(const MachineRegisterInfo *MRI,
float weight = 0.0f;
for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg))
weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
- MO.getParent());
+ *MO.getParent());
return weight;
}
@@ -99,7 +99,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
if (MFI.isVRegStackified(VReg))
continue;
- // Skip unused registers, which can use $discard.
+ // Skip unused registers, which can use $drop.
if (MRI->use_empty(VReg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index f621db070b5b..4a8fd96f8324 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -18,8 +18,8 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/ADT/SCCIterator.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -61,7 +61,6 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo &FrameInfo = *MF.getFrameInfo();
MFI.initWARegs();
@@ -73,9 +72,13 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::ARGUMENT_I32:
case WebAssembly::ARGUMENT_I64:
case WebAssembly::ARGUMENT_F32:
- case WebAssembly::ARGUMENT_F64:
- MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm());
+ case WebAssembly::ARGUMENT_F64: {
+ int64_t Imm = MI.getOperand(1).getImm();
+ DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
+ << Imm << "\n");
+ MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
break;
+ }
default:
break;
}
@@ -84,26 +87,27 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
// Then assign regular WebAssembly registers for all remaining used
// virtual registers. TODO: Consider sorting the registers by frequency of
// use, to maximize usage of small immediate fields.
- unsigned NumArgRegs = MFI.getParams().size();
unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs();
unsigned NumStackRegs = 0;
- unsigned CurReg = 0;
+ // Start the numbering for locals after the arg regs
+ unsigned CurReg = MFI.getParams().size();
for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+ // Skip unused registers.
+ if (MRI.use_empty(VReg))
+ continue;
// Handle stackified registers.
if (MFI.isVRegStackified(VReg)) {
+ DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+ << (INT32_MIN | NumStackRegs) << "\n");
MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
continue;
}
- // Skip unused registers.
- if (MRI.use_empty(VReg))
- continue;
- if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg)
- MFI.setWAReg(VReg, NumArgRegs + CurReg++);
+ if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) {
+ DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+ MFI.setWAReg(VReg, CurReg++);
+ }
}
- // Allocate locals for used physical registers
- if (FrameInfo.getStackSize() > 0)
- MFI.addPReg(WebAssembly::SP32, CurReg++);
return true;
}
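
A standalone sketch of the numbering scheme this pass implements (plain arrays, made-up data): ARGUMENT registers keep WAReg numbers 0..N-1, remaining used locals are numbered from N upward, and stackified registers are tagged with the INT32_MIN bit instead of receiving a local slot.

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  struct VReg { bool Used, Stackified; int ArgIndex; }; // ArgIndex < 0: not an arg

  int main() {
    std::vector<VReg> VRegs = {
        {true, false, 0}, {true, false, 1},  // two ARGUMENTs
        {true, true, -1},                    // a stackified value
        {false, false, -1},                  // unused
        {true, false, -1}};                  // an ordinary local

    unsigned NumParams = 2, NumStackRegs = 0;
    unsigned CurReg = NumParams; // locals are numbered after the arguments
    for (unsigned I = 0; I < VRegs.size(); ++I) {
      const VReg &R = VRegs[I];
      if (!R.Used)
        continue;
      int WAReg;
      if (R.ArgIndex >= 0)
        WAReg = R.ArgIndex;
      else if (R.Stackified)
        WAReg = INT32_MIN | (int)NumStackRegs++;
      else
        WAReg = (int)CurReg++;
      std::printf("vreg %u -> WAReg %d\n", I, WAReg);
    }
    return 0;
  }

With the made-up inputs this prints 0 and 1 for the two arguments, INT32_MIN for the stackified value, nothing for the unused register, and 2 for the remaining local.
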
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 537c147e6142..0aa3b621da32 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -23,9 +23,12 @@
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
@@ -43,12 +46,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(MachineDominatorsID);
AU.addPreservedID(LiveVariablesID);
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -82,17 +86,197 @@ static void ImposeStackOrdering(MachineInstr *MI) {
/*isImp=*/true));
}
+// Determine whether a call to the callee referenced by
+// MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side
+// effects.
+static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
+ bool &Write, bool &Effects, bool &StackPointer) {
+ // All calls can use the stack pointer.
+ StackPointer = true;
+
+ const MachineOperand &MO = MI.getOperand(CalleeOpNo);
+ if (MO.isGlobal()) {
+ const Constant *GV = MO.getGlobal();
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (!GA->isInterposable())
+ GV = GA->getAliasee();
+
+ if (const Function *F = dyn_cast<Function>(GV)) {
+ if (!F->doesNotThrow())
+ Effects = true;
+ if (F->doesNotAccessMemory())
+ return;
+ if (F->onlyReadsMemory()) {
+ Read = true;
+ return;
+ }
+ }
+ }
+
+ // Assume the worst.
+ Write = true;
+ Read = true;
+ Effects = true;
+}
+
+// Determine whether MI reads memory, writes memory, has side effects,
+// and/or uses the __stack_pointer value.
+static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
+ bool &Write, bool &Effects, bool &StackPointer) {
+ assert(!MI.isPosition());
+ assert(!MI.isTerminator());
+
+ if (MI.isDebugValue())
+ return;
+
+ // Check for loads.
+ if (MI.mayLoad() && !MI.isInvariantLoad(&AA))
+ Read = true;
+
+ // Check for stores.
+ if (MI.mayStore()) {
+ Write = true;
+
+ // Check for stores to __stack_pointer.
+ for (auto MMO : MI.memoperands()) {
+ const MachinePointerInfo &MPI = MMO->getPointerInfo();
+ if (MPI.V.is<const PseudoSourceValue *>()) {
+ auto PSV = MPI.V.get<const PseudoSourceValue *>();
+ if (const ExternalSymbolPseudoSourceValue *EPSV =
+ dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+ if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+ StackPointer = true;
+ }
+ }
+ } else if (MI.hasOrderedMemoryRef()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+ case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+ case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+ case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+ case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+ case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+ case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+ case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+      // These instructions have hasUnmodeledSideEffects() returning true
+      // because they trap on overflow and invalid so they can't be arbitrarily
+      // moved; however, hasOrderedMemoryRef() interprets this plus their lack
+ // of memoperands as having a potential unknown memory reference.
+ break;
+ default:
+ // Record volatile accesses, unless it's a call, as calls are handled
+ // specially below.
+ if (!MI.isCall()) {
+ Write = true;
+ Effects = true;
+ }
+ break;
+ }
+ }
+
+ // Check for side effects.
+ if (MI.hasUnmodeledSideEffects()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+ case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+ case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+ case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+ case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+ case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+ case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+ case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+ // These instructions have hasUnmodeledSideEffects() returning true
+ // because they trap on overflow and invalid so they can't be arbitrarily
+      // moved; however, in the specific case of register stackifying, it is safe
+ // to move them because overflow and invalid are Undefined Behavior.
+ break;
+ default:
+ Effects = true;
+ break;
+ }
+ }
+
+ // Analyze calls.
+ if (MI.isCall()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID:
+ QueryCallee(MI, 0, Read, Write, Effects, StackPointer);
+ break;
+ case WebAssembly::CALL_I32: case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32: case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_INDIRECT_I32: case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32: case WebAssembly::CALL_INDIRECT_F64:
+ QueryCallee(MI, 1, Read, Write, Effects, StackPointer);
+ break;
+ default:
+ llvm_unreachable("unexpected call opcode");
+ }
+ }
+}
+
+// Test whether Def is safe and profitable to rematerialize.
+static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+ const WebAssemblyInstrInfo *TII) {
+ return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
+}
+
+// Identify the definition for this register at this point. This is a
+// generalization of MachineRegisterInfo::getUniqueVRegDef that uses
+// LiveIntervals to handle complex cases.
+static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals &LIS)
+{
+ // Most registers are in SSA form here so we try a quick MRI query first.
+ if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg))
+ return Def;
+
+ // MRI doesn't know what the Def is. Try asking LIS.
+ if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
+ LIS.getInstructionIndex(*Insert)))
+ return LIS.getInstructionFromIndex(ValNo->def);
+
+ return nullptr;
+}
+
+// Test whether Reg, as defined at Def, has exactly one use. This is a
+// generalization of MachineRegisterInfo::hasOneUse that uses LiveIntervals
+// to handle complex cases.
+static bool HasOneUse(unsigned Reg, MachineInstr *Def,
+ MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ // Most registers are in SSA form here so we try a quick MRI query first.
+ if (MRI.hasOneUse(Reg))
+ return true;
+
+ bool HasOne = false;
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoAt(
+ LIS.getInstructionIndex(*Def).getRegSlot());
+ assert(DefVNI);
+  for (const auto &I : MRI.use_nodbg_operands(Reg)) {
+ const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent()));
+ if (Result.valueIn() == DefVNI) {
+ if (!Result.isKill())
+ return false;
+ if (HasOne)
+ return false;
+ HasOne = true;
+ }
+ }
+ return HasOne;
+}
+
// Test whether it's safe to move Def to just before Insert.
// TODO: Compute memory dependencies in a way that doesn't require always
// walking the block.
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
- AliasAnalysis &AA, LiveIntervals &LIS,
- MachineRegisterInfo &MRI) {
+ AliasAnalysis &AA, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
assert(Def->getParent() == Insert->getParent());
- bool SawStore = false, SawSideEffects = false;
- MachineBasicBlock::const_iterator D(Def), I(Insert);
// Check for register dependencies.
for (const MachineOperand &MO : Def->operands()) {
@@ -106,6 +290,10 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
continue;
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
+ // from moving down, and we've already checked for that.
+ if (Reg == WebAssembly::ARGUMENTS)
+ continue;
// If the physical register is never modified, ignore it.
if (!MRI.isPhysRegModified(Reg))
continue;
@@ -114,24 +302,404 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
}
// Ask LiveIntervals whether moving this virtual register use or def to
- // Insert will change value numbers are seen.
+ // Insert will change which value numbers are seen.
+ //
+ // If the operand is a use of a register that is also defined in the same
+ // instruction, test that the newly defined value reaches the insert point,
+ // since the operand will be moving along with the def.
const LiveInterval &LI = LIS.getInterval(Reg);
- VNInfo *DefVNI = MO.isDef() ?
- LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) :
- LI.getVNInfoBefore(LIS.getInstructionIndex(Def));
+ VNInfo *DefVNI =
+ (MO.isDef() || Def->definesRegister(Reg)) ?
+ LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot()) :
+ LI.getVNInfoBefore(LIS.getInstructionIndex(*Def));
assert(DefVNI && "Instruction input missing value number");
- VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert));
+ VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*Insert));
if (InsVNI && DefVNI != InsVNI)
return false;
}
- // Check for memory dependencies and side effects.
- for (--I; I != D; --I)
- SawSideEffects |= I->isSafeToMove(&AA, SawStore);
- return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) &&
- !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
+ bool Read = false, Write = false, Effects = false, StackPointer = false;
+ Query(*Def, AA, Read, Write, Effects, StackPointer);
+
+ // If the instruction does not access memory and has no side effects, it has
+ // no additional dependencies.
+ if (!Read && !Write && !Effects && !StackPointer)
+ return true;
+
+ // Scan through the intervening instructions between Def and Insert.
+ MachineBasicBlock::const_iterator D(Def), I(Insert);
+ for (--I; I != D; --I) {
+ bool InterveningRead = false;
+ bool InterveningWrite = false;
+ bool InterveningEffects = false;
+ bool InterveningStackPointer = false;
+ Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ InterveningStackPointer);
+ if (Effects && InterveningEffects)
+ return false;
+ if (Read && InterveningWrite)
+ return false;
+ if (Write && (InterveningRead || InterveningWrite))
+ return false;
+ if (StackPointer && InterveningStackPointer)
+ return false;
+ }
+
+ return true;
+}
+
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+ const MachineBasicBlock &MBB,
+ const MachineRegisterInfo &MRI,
+ const MachineDominatorTree &MDT,
+ LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI) {
+ const LiveInterval &LI = LIS.getInterval(Reg);
+
+ const MachineInstr *OneUseInst = OneUse.getParent();
+ VNInfo *OneUseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*OneUseInst));
+
+ for (const MachineOperand &Use : MRI.use_operands(Reg)) {
+ if (&Use == &OneUse)
+ continue;
+
+ const MachineInstr *UseInst = Use.getParent();
+ VNInfo *UseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*UseInst));
+
+ if (UseVNI != OneUseVNI)
+ continue;
+
+ if (UseInst == OneUseInst) {
+ // Another use in the same instruction. We need to ensure that the one
+ // selected use happens "before" it.
+ if (&OneUse > &Use)
+ return false;
+ } else {
+ // Test that the use is dominated by the one selected use.
+ while (!MDT.dominates(OneUseInst, UseInst)) {
+ // Actually, dominating is over-conservative. Test that the use would
+ // happen after the one selected use in the stack evaluation order.
+ //
+ // This is needed as a consequence of using implicit get_locals for
+ // uses and implicit set_locals for defs.
+ if (UseInst->getDesc().getNumDefs() == 0)
+ return false;
+ const MachineOperand &MO = UseInst->getOperand(0);
+ if (!MO.isReg())
+ return false;
+ unsigned DefReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
+ !MFI.isVRegStackified(DefReg))
+ return false;
+ assert(MRI.hasOneUse(DefReg));
+ const MachineOperand &NewUse = *MRI.use_begin(DefReg);
+ const MachineInstr *NewUseInst = NewUse.getParent();
+ if (NewUseInst == OneUseInst) {
+ if (&OneUse > &NewUse)
+ return false;
+ break;
+ }
+ UseInst = NewUseInst;
+ }
+ }
+ }
+ return true;
+}
+
+/// Get the appropriate tee_local opcode for the given register class.
+static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return WebAssembly::TEE_LOCAL_I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return WebAssembly::TEE_LOCAL_I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return WebAssembly::TEE_LOCAL_F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return WebAssembly::TEE_LOCAL_F64;
+ llvm_unreachable("Unexpected register class");
+}
+
+// Shrink LI to its uses, cleaning up LI.
+static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
+ if (LIS.shrinkToUses(&LI)) {
+ SmallVector<LiveInterval*, 4> SplitLIs;
+ LIS.splitSeparateComponents(LI, SplitLIs);
+ }
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
+ MachineInstr *Def,
+ MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
+ DEBUG(dbgs() << "Move for single use: "; Def->dump());
+
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ if (MRI.hasOneDef(Reg) && MRI.hasOneUse(Reg)) {
+ // No one else is using this register for anything so we can just stackify
+ // it in place.
+ MFI.stackifyVReg(Reg);
+ } else {
+ // The register may have unrelated uses or defs; create a new register for
+ // just our one def and use so that we can stackify it.
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ Def->getOperand(0).setReg(NewReg);
+ Op.setReg(NewReg);
+
+ // Tell LiveIntervals about the new register.
+ LIS.createAndComputeVirtRegInterval(NewReg);
+
+ // Tell LiveIntervals about the changes to the old register.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
+ LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
+ /*RemoveDeadValNo=*/true);
+
+ MFI.stackifyVReg(NewReg);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ }
+
+ ImposeStackOrdering(Def);
+ return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+static MachineInstr *RematerializeCheapDef(
+ unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
+ DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+ DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+ Op.setReg(NewReg);
+ MachineInstr *Clone = &*std::prev(Insert);
+ LIS.InsertMachineInstrInMaps(*Clone);
+ LIS.createAndComputeVirtRegInterval(NewReg);
+ MFI.stackifyVReg(NewReg);
+ ImposeStackOrdering(Clone);
+
+ DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+
+ // Shrink the interval.
+ bool IsDead = MRI.use_empty(Reg);
+ if (!IsDead) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ ShrinkToUses(LI, LIS);
+ IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot());
+ }
+
+ // If that was the last use of the original, delete the original.
+ if (IsDead) {
+ DEBUG(dbgs() << " - Deleting original\n");
+ SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+ LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+ LIS.removeInterval(Reg);
+ LIS.RemoveMachineInstrFromMaps(Def);
+ Def.eraseFromParent();
+ }
+
+ return Clone;
}
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee_local to satisfy the rest of the uses. As an illustration,
+/// rewrite this:
+///
+/// Reg = INST ... // Def
+/// INST ..., Reg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// to this:
+///
+/// DefReg = INST ... // Def (to become the new Insert)
+/// TeeReg, Reg = TEE_LOCAL_... DefReg
+/// INST ..., TeeReg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// with DefReg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+static MachineInstr *MoveAndTeeForMultiUse(
+ unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+ DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+
+ // Move Def into place.
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ // Create the Tee and attach the registers.
+ const auto *RegClass = MRI.getRegClass(Reg);
+ unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+ unsigned DefReg = MRI.createVirtualRegister(RegClass);
+ MachineOperand &DefMO = Def->getOperand(0);
+ MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+ TII->get(GetTeeLocalOpcode(RegClass)), TeeReg)
+ .addReg(Reg, RegState::Define)
+ .addReg(DefReg, getUndefRegState(DefMO.isDead()));
+ Op.setReg(TeeReg);
+ DefMO.setReg(DefReg);
+ SlotIndex TeeIdx = LIS.InsertMachineInstrInMaps(*Tee).getRegSlot();
+ SlotIndex DefIdx = LIS.getInstructionIndex(*Def).getRegSlot();
+
+ // Tell LiveIntervals we moved the original vreg def from Def to Tee.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LiveInterval::iterator I = LI.FindSegmentContaining(DefIdx);
+ VNInfo *ValNo = LI.getVNInfoAt(DefIdx);
+ I->start = TeeIdx;
+ ValNo->def = TeeIdx;
+ ShrinkToUses(LI, LIS);
+
+ // Finish stackifying the new regs.
+ LIS.createAndComputeVirtRegInterval(TeeReg);
+ LIS.createAndComputeVirtRegInterval(DefReg);
+ MFI.stackifyVReg(DefReg);
+ MFI.stackifyVReg(TeeReg);
+ ImposeStackOrdering(Def);
+ ImposeStackOrdering(Tee);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+ return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+class TreeWalkerState {
+ typedef MachineInstr::mop_iterator mop_iterator;
+ typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+ typedef iterator_range<mop_reverse_iterator> RangeTy;
+ SmallVector<RangeTy, 4> Worklist;
+
+public:
+ explicit TreeWalkerState(MachineInstr *Insert) {
+ const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ bool Done() const { return Worklist.empty(); }
+
+ MachineOperand &Pop() {
+ RangeTy &Range = Worklist.back();
+ MachineOperand &Op = *Range.begin();
+ Range = drop_begin(Range, 1);
+ if (Range.begin() == Range.end())
+ Worklist.pop_back();
+ assert((Worklist.empty() ||
+ Worklist.back().begin() != Worklist.back().end()) &&
+ "Empty ranges shouldn't remain in the worklist");
+ return Op;
+ }
+
+ /// Push Instr's operands onto the stack to be visited.
+ void PushOperands(MachineInstr *Instr) {
+ const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ /// Some of Instr's operands are on the top of the stack; remove them and
+ /// re-insert them starting from the beginning (because we've commuted them).
+ void ResetTopOperands(MachineInstr *Instr) {
+ assert(HasRemainingOperands(Instr) &&
+ "Reseting operands should only be done when the instruction has "
+ "an operand still on the stack");
+ Worklist.back() = reverse(Instr->explicit_uses());
+ }
+
+ /// Test whether Instr has operands remaining to be visited at the top of
+ /// the stack.
+ bool HasRemainingOperands(const MachineInstr *Instr) const {
+ if (Worklist.empty())
+ return false;
+ const RangeTy &Range = Worklist.back();
+ return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+ }
+
+ /// Test whether the given register is present on the stack, indicating an
+ /// operand in the tree that we haven't visited yet. Moving a definition of
+ /// Reg to a point in the tree after that would change its value.
+ ///
+ /// This is needed as a consequence of using implicit get_locals for
+ /// uses and implicit set_locals for defs.
+ bool IsOnStack(unsigned Reg) const {
+ for (const RangeTy &Range : Worklist)
+ for (const MachineOperand &MO : Range)
+ if (MO.isReg() && MO.getReg() == Reg)
+ return true;
+ return false;
+ }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+ /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tentative
+  /// state where we've commuted the operands of the current instruction and are
+  /// revisiting it, and the declined state where we've reverted the operands
+ /// back to their original order and will no longer commute it further.
+ bool TentativelyCommuting;
+ bool Declined;
+
+ /// During the tentative state, these hold the operand indices of the commuted
+ /// operands.
+ unsigned Operand0, Operand1;
+
+public:
+ CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+ /// Stackification for an operand was not successful due to ordering
+ /// constraints. If possible, and if we haven't already tried it and declined
+ /// it, commute Insert's operands and prepare to revisit it.
+ void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+ const WebAssemblyInstrInfo *TII) {
+ if (TentativelyCommuting) {
+ assert(!Declined &&
+ "Don't decline commuting until you've finished trying it");
+ // Commuting didn't help. Revert it.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TentativelyCommuting = false;
+ Declined = true;
+ } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+ Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
+ // Tentatively commute the operands and try again.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TreeWalker.ResetTopOperands(Insert);
+ TentativelyCommuting = true;
+ Declined = false;
+ }
+ }
+ }
+
+ /// Stackification for some operand was successful. Reset to the default
+ /// state.
+ void Reset() {
+ TentativelyCommuting = false;
+ Declined = false;
+ }
+};
+} // end anonymous namespace
+
bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Register Stackifying **********\n"
"********** Function: "
@@ -140,7 +708,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
// Walk the instructions from the bottom up. Currently we don't look past
@@ -151,33 +722,37 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// iterating over it and the end iterator may change.
for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
MachineInstr *Insert = &*MII;
- // Don't nest anything inside a phi.
- if (Insert->getOpcode() == TargetOpcode::PHI)
- break;
-
// Don't nest anything inside an inline asm, because we don't have
// constraints for $push inputs.
if (Insert->getOpcode() == TargetOpcode::INLINEASM)
- break;
+ continue;
+
+ // Ignore debugging intrinsics.
+ if (Insert->getOpcode() == TargetOpcode::DBG_VALUE)
+ continue;
// Iterate through the inputs in reverse order, since we'll be pulling
// operands off the stack in LIFO order.
- bool AnyStackified = false;
- for (MachineOperand &Op : reverse(Insert->uses())) {
+ CommutingState Commuting;
+ TreeWalkerState TreeWalker(Insert);
+ while (!TreeWalker.Done()) {
+ MachineOperand &Op = TreeWalker.Pop();
+
// We're only interested in explicit virtual register operands.
- if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+ if (!Op.isReg())
continue;
unsigned Reg = Op.getReg();
-
- // Only consider registers with a single definition.
- // TODO: Eventually we may relax this, to stackify phi transfers.
- MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
- if (!Def)
+ assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+ assert(!Op.isImplicit() &&
+ "explicit_uses() should only iterate over explicit operands");
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
- // There's no use in nesting implicit defs inside anything.
- if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ // Identify the definition for this register at this point. Most
+ // registers are in SSA form here so we try a quick MRI query first.
+ MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS);
+ if (!Def)
continue;
// Don't nest an INLINE_ASM def into anything, because we don't have
@@ -185,10 +760,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (Def->getOpcode() == TargetOpcode::INLINEASM)
continue;
- // Don't nest PHIs inside of anything.
- if (Def->getOpcode() == TargetOpcode::PHI)
- continue;
-
// Argument instructions represent live-in registers and not real
// instructions.
if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
@@ -197,38 +768,53 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Def->getOpcode() == WebAssembly::ARGUMENT_F64)
continue;
- // Single-use expression trees require defs that have one use.
- // TODO: Eventually we'll relax this, to take advantage of set_local
- // returning its result.
- if (!MRI.hasOneUse(Reg))
- continue;
-
- // For now, be conservative and don't look across block boundaries.
- // TODO: Be more aggressive?
- if (Def->getParent() != &MBB)
+ // Decide which strategy to take. Prefer to move a single-use value
+ // over cloning it, and prefer cloning over introducing a tee_local.
+ // For moving, we require the def to be in the same block as the use;
+ // this makes things simpler (LiveIntervals' handleMove function only
+ // supports intra-block moves) and it's MachineSink's job to catch all
+ // the sinking opportunities anyway.
+ bool SameBlock = Def->getParent() == &MBB;
+ bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI) &&
+ !TreeWalker.IsOnStack(Reg);
+ if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) {
+ Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+ } else if (ShouldRematerialize(*Def, AA, TII)) {
+ Insert =
+ RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+ LIS, MFI, MRI, TII, TRI);
+ } else if (CanMove &&
+ OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
+ Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+ MRI, TII);
+ } else {
+ // We failed to stackify the operand. If the problem was ordering
+ // constraints, Commuting may be able to help.
+ if (!CanMove && SameBlock)
+ Commuting.MaybeCommute(Insert, TreeWalker, TII);
+ // Proceed to the next operand.
continue;
+ }
- // Don't move instructions that have side effects or memory dependencies
- // or other complications.
- if (!IsSafeToMove(Def, Insert, AA, LIS, MRI))
- continue;
+ // We stackified an operand. Add the defining instruction's operands to
+ // the worklist stack now to continue to build an ever deeper tree.
+ Commuting.Reset();
+ TreeWalker.PushOperands(Insert);
+ }
+ // If we stackified any operands, skip over the tree to start looking for
+ // the next instruction we can build a tree on.
+ if (Insert != &*MII) {
+ ImposeStackOrdering(&*MII);
+ MII = std::prev(
+ llvm::make_reverse_iterator(MachineBasicBlock::iterator(Insert)));
Changed = true;
- AnyStackified = true;
- // Move the def down and nest it in the current instruction.
- MBB.splice(Insert, &MBB, Def);
- LIS.handleMove(Def);
- MFI.stackifyVReg(Reg);
- ImposeStackOrdering(Def);
- Insert = Def;
}
- if (AnyStackified)
- ImposeStackOrdering(&*MII);
}
}
- // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
- // so that it never looks like a use-before-def.
+ // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere so
+ // that it never looks like a use-before-def.
if (Changed) {
MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
for (MachineBasicBlock &MBB : MF)
@@ -236,30 +822,30 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
}
#ifndef NDEBUG
- // Verify that pushes and pops are performed in FIFO order.
+ // Verify that pushes and pops are performed in LIFO order.
SmallVector<unsigned, 0> Stack;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
if (!MO.isReg())
continue;
- unsigned VReg = MO.getReg();
-
- // Don't stackify physregs like SP or FP.
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
- continue;
+ unsigned Reg = MO.getReg();
- if (MFI.isVRegStackified(VReg)) {
+ if (MFI.isVRegStackified(Reg)) {
if (MO.isDef())
- Stack.push_back(VReg);
+ Stack.push_back(Reg);
else
- assert(Stack.pop_back_val() == VReg);
+ assert(Stack.pop_back_val() == Reg &&
+ "Register stack pop should be paired with a push");
}
}
}
// TODO: Generalize this code to support keeping values on the stack across
// basic block boundaries.
- assert(Stack.empty());
+ assert(Stack.empty() &&
+ "Register stack pushes and pops should be balanced");
}
#endif
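The dependence check in IsSafeToMove and Query above boils down to four booleans per instruction and a small conflict matrix. The following is a standalone sketch of that rule, not part of the patch; the struct and function names are invented for illustration:

    // Effects summary for one instruction, mirroring the four flags that
    // Query() computes in WebAssemblyRegStackify.cpp.
    struct EffectsSummary {
      bool Read = false;         // may read memory
      bool Write = false;        // may write memory
      bool Effects = false;      // has other unmodeled side effects
      bool StackPointer = false; // may touch __stack_pointer
    };

    // True if an instruction summarized by 'Moving' can be moved past an
    // intervening instruction summarized by 'Between'.
    bool mayMovePast(const EffectsSummary &Moving, const EffectsSummary &Between) {
      if (Moving.Effects && Between.Effects)
        return false; // keep side effects ordered
      if (Moving.Read && Between.Write)
        return false; // don't move a load past a store
      if (Moving.Write && (Between.Read || Between.Write))
        return false; // don't move a store past any memory access
      if (Moving.StackPointer && Between.StackPointer)
        return false; // keep __stack_pointer accesses ordered
      return true;
    }

IsSafeToMove applies this check to every instruction between Def and the insertion point, so a Def with none of the four flags set is always movable within the block once the register dependencies have been checked.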
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 90d8dda530ba..239fe89b7ef9 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -52,43 +52,74 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
}
void WebAssemblyRegisterInfo::eliminateFrameIndex(
- MachineBasicBlock::iterator II, int SPAdj,
- unsigned FIOperandNum, RegScavenger * /*RS*/) const {
+ MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
+ RegScavenger * /*RS*/) const {
assert(SPAdj == 0);
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- const MachineFrameInfo& MFI = *MF.getFrameInfo();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
- if (MI.mayLoadOrStore()) {
- // If this is a load or store, make it relative to SP and fold the frame
- // offset directly in.
+ // If this is the address operand of a load or store, make it relative to SP
+ // and fold the frame offset directly in.
+ if (MI.mayLoadOrStore() && FIOperandNum == WebAssembly::MemOpAddressOperandNo) {
assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0);
int64_t Offset = MI.getOperand(1).getImm() + FrameOffset;
- if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) {
- // If this happens the program is invalid, but better to error here than
- // generate broken code.
- report_fatal_error("Memory offset field overflow");
+ if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
+ MI.getOperand(FIOperandNum - 1).setImm(Offset);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+ return;
}
- MI.getOperand(1).setImm(Offset);
- MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
- } else {
- // Otherwise create an i32.add SP, offset and make it the operand.
- auto &MRI = MF.getRegInfo();
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ }
+
+ // If this is an address being added to a constant, fold the frame offset
+ // into the constant.
+ if (MI.getOpcode() == WebAssembly::ADD_I32) {
+ MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
+ if (OtherMO.isReg()) {
+ unsigned OtherMOReg = OtherMO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+ MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
+ // TODO: For now we just opportunistically do this in the case where
+ // the CONST_I32 happens to have exactly one def and one use. We
+ // should generalize this to optimize in more cases.
+ if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+ MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
+ MachineOperand &ImmMO = Def->getOperand(1);
+ ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+ return;
+ }
+ }
+ }
+ }
+
+ // Otherwise create an i32.add SP, offset and make it the operand.
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg)
+ unsigned FIRegOperand = WebAssembly::SP32;
+ if (FrameOffset) {
+ // Create i32.add SP, offset and make it the operand.
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+ OffsetOp)
.addImm(FrameOffset);
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg)
+ FIRegOperand = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+ FIRegOperand)
.addReg(WebAssembly::SP32)
- .addReg(OffsetReg);
- MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false);
+ .addReg(OffsetOp);
}
+ MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
}
unsigned
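For a concrete feel for the offset folding in eliminateFrameIndex above, here is a walkthrough with assumed numbers (illustrative values only, not taken from the patch):

    #include <cstdint>

    // Load/store case: the frame offset is folded straight into the memory
    // operand's offset immediate, relative to SP32.
    int64_t foldedLoadOffset() {
      int64_t StackSize = 32;                         // MFI.getStackSize()
      int64_t ObjectOffset = 16;                      // MFI.getObjectOffset(FI)
      int64_t ExistingImm = 4;                        // offset already on the load
      int64_t FrameOffset = StackSize + ObjectOffset; // 48
      return ExistingImm + FrameOffset;               // 52, the new immediate
    }

If the folded offset doesn't fit in 32 bits, or the frame index isn't the address operand of a load or store, the code above instead tries to fold the offset into a CONST_I32 feeding an ADD_I32, and finally falls back to materializing SP32 + offset explicitly.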
diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
new file mode 100644
index 000000000000..11bda47eac56
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -0,0 +1,97 @@
+//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that replaces physical registers with
+/// virtual registers.
+///
+/// LLVM expects certain physical registers, such as a stack pointer. However,
+/// WebAssembly doesn't actually have such physical registers. This pass is run
+/// once LLVM no longer needs these registers, and replaces them with virtual
+/// registers, so they can participate in register stackifying and coloring in
+/// the normal way.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-replace-phys-regs"
+
+namespace {
+class WebAssemblyReplacePhysRegs final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyReplacePhysRegs() : MachineFunctionPass(ID) {}
+
+private:
+ const char *getPassName() const override {
+ return "WebAssembly Replace Physical Registers";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyReplacePhysRegs::ID = 0;
+FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
+ return new WebAssemblyReplacePhysRegs();
+}
+
+bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Replace Physical Registers **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+ bool Changed = false;
+
+ assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+ "LiveIntervals shouldn't be active yet!");
+ // We don't preserve SSA or liveness.
+ MRI.leaveSSA();
+ MRI.invalidateLiveness();
+
+ for (unsigned PReg = WebAssembly::NoRegister + 1;
+ PReg < WebAssembly::NUM_TARGET_REGS; ++PReg) {
+ // Skip fake registers that are never used explicitly.
+ if (PReg == WebAssembly::EXPR_STACK || PReg == WebAssembly::ARGUMENTS)
+ continue;
+
+ // Replace explicit uses of the physical register with a virtual register.
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg);
+ unsigned VReg = WebAssembly::NoRegister;
+ for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E; ) {
+ MachineOperand &MO = *I++;
+ if (!MO.isImplicit()) {
+ if (VReg == WebAssembly::NoRegister)
+ VReg = MRI.createVirtualRegister(RC);
+ MO.setReg(VReg);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 13d96671276d..533c66b7a22f 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -9,18 +9,18 @@
///
/// \file
/// \brief This file defines the WebAssembly subclass for
-/// TargetSelectionDAGInfo.
+/// SelectionDAGTargetInfo.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-class WebAssemblySelectionDAGInfo final : public TargetSelectionDAGInfo {
+class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
public:
~WebAssemblySelectionDAGInfo() override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
new file mode 100644
index 000000000000..4ebea68c58a5
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -0,0 +1,114 @@
+//=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file sets the p2align operands on load and store instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-set-p2align-operands"
+
+namespace {
+class WebAssemblySetP2AlignOperands final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblySetP2AlignOperands() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "WebAssembly Set p2align Operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblySetP2AlignOperands::ID = 0;
+FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
+ return new WebAssemblySetP2AlignOperands();
+}
+
+bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Set p2align Operands **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_F32:
+ case WebAssembly::LOAD_F64:
+ case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F64:
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE32_I64: {
+ assert(MI.getOperand(3).getImm() == 0 &&
+ "ISel should set p2align operands to 0");
+ assert(MI.hasOneMemOperand() &&
+ "Load and store instructions have exactly one mem operand");
+ assert((*MI.memoperands_begin())->getSize() ==
+ (UINT64_C(1)
+ << WebAssembly::GetDefaultP2Align(MI.getOpcode())) &&
+ "Default p2align value should be natural");
+ assert(MI.getDesc().OpInfo[3].OperandType ==
+ WebAssembly::OPERAND_P2ALIGN &&
+ "Load and store instructions should have a p2align operand");
+ uint64_t P2Align = Log2_64((*MI.memoperands_begin())->getAlignment());
+
+ // WebAssembly does not currently support supernatural alignment.
+ P2Align = std::min(
+ P2Align, uint64_t(WebAssembly::GetDefaultP2Align(MI.getOpcode())));
+
+ MI.getOperand(3).setImm(P2Align);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
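The alignment rule above can be restated in a few lines; this is a sketch with a hypothetical helper name, not part of the patch. p2align is the log2 of the memory operand's actual alignment, clamped to the access's natural alignment (GetDefaultP2Align), since WebAssembly has no use for a "supernatural" alignment hint:

    #include <algorithm>
    #include <cstdint>

    // E.g. a 4-byte-aligned i32 load gets p2align 2; an 8-byte-aligned i32 load
    // is still clamped to 2, its natural alignment.
    uint64_t computeP2Align(uint64_t AlignInBytes, uint64_t NaturalP2Align) {
      uint64_t P2Align = 0;
      while (P2Align + 1 < 64 &&
             (UINT64_C(1) << (P2Align + 1)) <= AlignInBytes)
        ++P2Align;                          // floor(log2(AlignInBytes))
      return std::min(P2Align, NaturalP2Align);
    }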
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 4e08b2b079eb..1e9a773ae628 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -17,12 +17,19 @@
/// potentially also exposing the store to register stackifying. These both can
/// reduce get_local/set_local traffic.
///
+/// This pass also performs this optimization for memcpy, memmove, and memset
+/// calls: the LLVM intrinsics for these return void, so they can't use the
+/// returned attribute and consequently aren't handled by the OptimizeReturned
+/// pass.
+///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -49,6 +56,10 @@ public:
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -63,17 +74,127 @@ FunctionPass *llvm::createWebAssemblyStoreResults() {
return new WebAssemblyStoreResults();
}
+// Replace uses of FromReg with ToReg if they are dominated by MI.
+static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
+ unsigned FromReg, unsigned ToReg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ bool Changed = false;
+
+ LiveInterval *FromLI = &LIS.getInterval(FromReg);
+ LiveInterval *ToLI = &LIS.getInterval(ToReg);
+
+ SlotIndex FromIdx = LIS.getInstructionIndex(MI).getRegSlot();
+ VNInfo *FromVNI = FromLI->getVNInfoAt(FromIdx);
+
+ SmallVector<SlotIndex, 4> Indices;
+
+ for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
+ MachineOperand &O = *I++;
+ MachineInstr *Where = O.getParent();
+
+ // Check that MI dominates the instruction in the normal way.
+ if (&MI == Where || !MDT.dominates(&MI, Where))
+ continue;
+
+ // If this use gets a different value, skip it.
+ SlotIndex WhereIdx = LIS.getInstructionIndex(*Where);
+ VNInfo *WhereVNI = FromLI->getVNInfoAt(WhereIdx);
+ if (WhereVNI && WhereVNI != FromVNI)
+ continue;
+
+ // Make sure ToReg isn't clobbered before it gets there.
+ VNInfo *ToVNI = ToLI->getVNInfoAt(WhereIdx);
+ if (ToVNI && ToVNI != FromVNI)
+ continue;
+
+ Changed = true;
+ DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+ << MI << "\n");
+ O.setReg(ToReg);
+
+ // If the store's def was previously dead, it is no longer.
+ if (!O.isUndef()) {
+ MI.getOperand(0).setIsDead(false);
+
+ Indices.push_back(WhereIdx.getRegSlot());
+ }
+ }
+
+ if (Changed) {
+ // Extend ToReg's liveness.
+ LIS.extendToIndices(*ToLI, Indices);
+
+ // Shrink FromReg's liveness.
+ LIS.shrinkToUses(FromLI);
+
+ // If we replaced all dominated uses, FromReg is now killed at MI.
+ if (!FromLI->liveAt(FromIdx.getDeadSlot()))
+ MI.addRegisterKilled(FromReg,
+ MBB.getParent()->getSubtarget<WebAssemblySubtarget>()
+ .getRegisterInfo());
+ }
+
+ return Changed;
+}
+
+static bool optimizeStore(MachineBasicBlock &MBB, MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ unsigned ToReg = MI.getOperand(0).getReg();
+ unsigned FromReg = MI.getOperand(WebAssembly::StoreValueOperandNo).getReg();
+ return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+}
+
+static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS,
+ const WebAssemblyTargetLowering &TLI,
+ const TargetLibraryInfo &LibInfo) {
+ MachineOperand &Op1 = MI.getOperand(1);
+ if (!Op1.isSymbol())
+ return false;
+
+ StringRef Name(Op1.getSymbolName());
+ bool callReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+ Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+ Name == TLI.getLibcallName(RTLIB::MEMSET);
+ if (!callReturnsInput)
+ return false;
+
+ LibFunc::Func Func;
+ if (!LibInfo.getLibFunc(Name, Func))
+ return false;
+
+ unsigned FromReg = MI.getOperand(2).getReg();
+ unsigned ToReg = MI.getOperand(0).getReg();
+ if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
+ report_fatal_error("Store results: call to builtin function with wrong "
+ "signature, from/to mismatch");
+ return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+}
+
bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
DEBUG({
dbgs() << "********** Store Results **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
- const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ const WebAssemblyTargetLowering &TLI =
+ *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+ const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
bool Changed = false;
- assert(MRI.isSSA() && "StoreResults depends on SSA form");
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
for (auto &MBB : MF) {
DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
@@ -90,33 +211,12 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE_F64:
case WebAssembly::STORE_I32:
case WebAssembly::STORE_I64:
- unsigned ToReg = MI.getOperand(0).getReg();
- unsigned FromReg = MI.getOperand(3).getReg();
- for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
- MachineOperand &O = *I++;
- MachineInstr *Where = O.getParent();
- if (Where->getOpcode() == TargetOpcode::PHI) {
- // PHIs use their operands on their incoming CFG edges rather than
- // in their parent blocks. Get the basic block paired with this use
- // of FromReg and check that MI's block dominates it.
- MachineBasicBlock *Pred =
- Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB();
- if (!MDT.dominates(&MBB, Pred))
- continue;
- } else {
- // For a non-PHI, check that MI dominates the instruction in the
- // normal way.
- if (&MI == Where || !MDT.dominates(&MI, Where))
- continue;
- }
- Changed = true;
- DEBUG(dbgs() << "Setting operand " << O << " in " << *Where
- << " from " << MI << "\n");
- O.setReg(ToReg);
- // If the store's def was previously dead, it is no longer. But the
- // dead flag shouldn't be set yet.
- assert(!MI.getOperand(0).isDead() && "Dead flag set on store result");
- }
+ Changed |= optimizeStore(MBB, MI, MRI, MDT, LIS);
+ break;
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
+ break;
}
}
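To illustrate the property the header comment and optimizeCall rely on (a sketch, not part of the patch): memcpy, memmove, and memset return their destination argument, so after the call the result holds the same value as the pointer that was passed in, and later uses of that pointer can be redirected to the call result.

    #include <cstddef>
    #include <cstring>

    void example(char *dst, const char *src, std::size_t n) {
      void *ret = std::memcpy(dst, src, n);  // ret is guaranteed to equal dst
      // Any use of 'dst' dominated by the call can be rewritten to use 'ret'
      // instead, which is the forwarding ReplaceDominatedUses performs for
      // CALL_I32/CALL_I64 above.
      static_cast<char *>(ret)[0] = '\0';    // same byte as dst[0]
    }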
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index cb2d5a63a19f..ce39051b0555 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -13,9 +13,9 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssemblyInstrInfo.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblySubtarget.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -45,5 +45,11 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
-bool WebAssemblySubtarget::enableMachineScheduler() const { return true; }
+bool WebAssemblySubtarget::enableMachineScheduler() const {
+ // Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
+ // enableMachineSchedDefaultSched overridden, it appears to have an overall
+ // negative effect for the kinds of register optimizations we're doing.
+ return false;
+}
+
bool WebAssemblySubtarget::useAA() const { return true; }
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b290b4bf7440..32154af3c1c2 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
@@ -39,16 +39,23 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
// WebAssembly Lowering public interface.
//===----------------------------------------------------------------------===//
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::PIC_;
+ return *RM;
+}
+
/// Create a WebAssembly architecture model.
///
WebAssemblyTargetMachine::WebAssemblyTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: LLVMTargetMachine(T,
TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
: "e-m:e-p:32:32-i64:64-n32:64-S128",
- TT, CPU, FS, Options, RM, CM, OL),
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+ CM, OL),
TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
// WebAssembly type-checks expressions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
@@ -58,9 +65,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
initAsmInfo();
- // We need a reducible CFG, so disable some optimizations which tend to
- // introduce irreducibility.
- setRequiresStructuredCFG(true);
+ // Note that we don't use setRequiresStructuredCFG(true). It disables
+  // optimizations that we're ok with, and want, such as critical edge
+ // splitting and tail merging.
}
WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
@@ -103,9 +110,8 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
- bool addILPOpts() override;
- void addPreRegAlloc() override;
void addPostRegAlloc() override;
+ bool addGCPasses() override { return false; }
void addPreEmitPass() override;
};
} // end anonymous namespace
@@ -140,7 +146,8 @@ void WebAssemblyPassConfig::addIRPasses() {
addPass(createAtomicExpandPass(TM));
// Optimize "returned" function attributes.
- addPass(createWebAssemblyOptimizeReturned());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyOptimizeReturned());
TargetPassConfig::addIRPasses();
}
@@ -153,58 +160,75 @@ bool WebAssemblyPassConfig::addInstSelector() {
// so that we can fix up the ARGUMENT instructions before anything else
// sees them in the wrong place.
addPass(createWebAssemblyArgumentMove());
+  // Set the p2align operands. This information is present during ISel; however,
+ // it's inconvenient to collect. Collect it now, and update the immediate
+ // operands.
+ addPass(createWebAssemblySetP2AlignOperands());
return false;
}
-bool WebAssemblyPassConfig::addILPOpts() {
- (void)TargetPassConfig::addILPOpts();
- return true;
-}
-
-void WebAssemblyPassConfig::addPreRegAlloc() {
- TargetPassConfig::addPreRegAlloc();
-
- // Prepare store instructions for register stackifying.
- addPass(createWebAssemblyStoreResults());
-}
-
void WebAssemblyPassConfig::addPostRegAlloc() {
// TODO: The following CodeGen passes don't currently support code containing
// virtual registers. Consider removing their restrictions and re-enabling
// them.
- //
- // We use our own PrologEpilogInserter which is very slightly modified to
- // tolerate virtual registers.
- disablePass(&PrologEpilogCodeInserterID);
- // Fails with: should be run after register allocation.
- disablePass(&MachineCopyPropagationID);
- // Mark registers as representing wasm's expression stack.
- addPass(createWebAssemblyRegStackify());
+ // Has no asserts of its own, but was not written to handle virtual regs.
+ disablePass(&ShrinkWrapID);
- // Run the register coloring pass to reduce the total number of registers.
- addPass(createWebAssemblyRegColoring());
+ // These functions all require the AllVRegsAllocated property.
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PatchableFunctionID);
TargetPassConfig::addPostRegAlloc();
-
- // Run WebAssembly's version of the PrologEpilogInserter. Target-independent
- // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run
- // PEI before ShrinkWrap but otherwise in the same position in the order.
- addPass(createWebAssemblyPEI());
}
void WebAssemblyPassConfig::addPreEmitPass() {
TargetPassConfig::addPreEmitPass();
+ // Now that we have a prologue and epilogue and all frame indices are
+ // rewritten, eliminate SP and FP. This allows them to be stackified,
+ // colored, and numbered with the rest of the registers.
+ addPass(createWebAssemblyReplacePhysRegs());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ // LiveIntervals isn't commonly run this late. Re-establish preconditions.
+ addPass(createWebAssemblyPrepareForLiveIntervals());
+
+ // Depend on LiveIntervals and perform some optimizations on it.
+ addPass(createWebAssemblyOptimizeLiveIntervals());
+
+ // Prepare store instructions for register stackifying.
+ addPass(createWebAssemblyStoreResults());
+
+ // Mark registers as representing wasm's expression stack. This is a key
+ // code-compression technique in WebAssembly. We run this pass (and
+ // StoreResults above) very late, so that it sees as much code as possible,
+ // including code emitted by PEI and expanded by late tail duplication.
+ addPass(createWebAssemblyRegStackify());
+
+ // Run the register coloring pass to reduce the total number of registers.
+ // This runs after stackification so that it doesn't consider registers
+ // that become stackified.
+ addPass(createWebAssemblyRegColoring());
+ }
+
+ // Eliminate multiple-entry loops.
+ addPass(createWebAssemblyFixIrreducibleControlFlow());
+
// Put the CFG in structured form; insert BLOCK and LOOP markers.
addPass(createWebAssemblyCFGStackify());
// Lower br_unless into br_if.
addPass(createWebAssemblyLowerBrUnless());
+ // Perform the very last peephole optimizations on the code.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyPeephole());
+
// Create a mapping from LLVM CodeGen virtual registers to wasm registers.
addPass(createWebAssemblyRegNumbering());
-
- // Perform the very last peephole optimizations on the code.
- addPass(createWebAssemblyPeephole());
}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 3226edcdc614..52a2ef78736a 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -28,7 +28,7 @@ class WebAssemblyTargetMachine final : public LLVMTargetMachine {
public:
WebAssemblyTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~WebAssemblyTargetMachine() override;
@@ -44,6 +44,8 @@ public:
/// \brief Get the TargetIRAnalysis for this target.
TargetIRAnalysis getTargetIRAnalysis() override;
+
+ bool usesPhysRegsForPEI() const override { return false; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 356631711921..bf546dab5fbb 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,3 +25,59 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TargetTransformInfo::PSK_FastHardware;
}
+
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
+ unsigned Result = BaseT::getNumberOfRegisters(Vector);
+
+ // For SIMD, use at least 16 registers, as a rough guess.
+ if (Vector)
+ Result = std::max(Result, 16u);
+
+ return Result;
+}
+
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+ if (Vector && getST()->hasSIMD128())
+ return 128;
+
+ return 64;
+}
+
+unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+
+ unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
+ Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ switch (Opcode) {
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ // SIMD128's shifts currently only accept a scalar shift count. For each
+ // element, we'll need to extract, op, insert. The following is a rough
+      // approximation.
+ if (Opd2Info != TTI::OK_UniformValue &&
+ Opd2Info != TTI::OK_UniformConstantValue)
+ Cost = VTy->getNumElements() *
+ (TargetTransformInfo::TCC_Basic +
+ getArithmeticInstrCost(Opcode, VTy->getElementType()) +
+ TargetTransformInfo::TCC_Basic);
+ break;
+ }
+ }
+ return Cost;
+}
+
+unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
+ unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+
+ // SIMD128's insert/extract currently only take constant indices.
+ if (Index == -1u)
+ return Cost + 25 * TargetTransformInfo::TCC_Expensive;
+
+ return Cost;
+}
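The shift special-case above is a plain scalarization estimate: when the shift amount is not uniform, each lane pays an extract, the scalar shift, and an insert. A minimal standalone sketch of that arithmetic, assuming a helper name and unit costs (standing in for TCC_Basic) that are not part of the patch:

// Illustrative only: NumElements lanes, each paying extract + scalar op + insert.
unsigned estimateNonUniformVectorShiftCost(unsigned NumElements,
                                           unsigned ScalarShiftCost) {
  const unsigned ExtractCost = 1; // TargetTransformInfo::TCC_Basic
  const unsigned InsertCost = 1;  // TargetTransformInfo::TCC_Basic
  return NumElements * (ExtractCost + ScalarShiftCost + InsertCost);
}

// For example, a <4 x i32> shl by a non-uniform vector amount with a unit
// scalar shift cost comes out to 4 * (1 + 1 + 1) = 12.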
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 26dc388cc922..fe99e96eb3b8 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -61,7 +61,15 @@ public:
/// \name Vector TTI Implementations
/// @{
- // TODO: Implement Vector TTI for WebAssembly
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
/// @}
};
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 91b3fff05dca..f07400021dc1 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -1,253 +1,15 @@
# Tests which are known to fail from the GCC torture test suite.
-# Core dump.
-920908-1.c
-pr38151.c
-va-arg-22.c
-
-# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed.
-struct-ret-1.c
-va-arg-11.c
-va-arg-21.c
-va-arg-24.c
-va-arg-trap-1.c
-
-# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed.
-20000815-1.c
-20010129-1.c
-930628-1.c
-980707-1.c
-
-# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed.
-20030914-2.c
-20040703-1.c
-20081117-1.c
-920625-1.c
-931004-11.c
-931004-13.c
-980223.c
-bitfld-5.c
-complex-7.c
-pr38969.c
-pr51323.c
-pr52129.c
-pr57130.c
-
-# These were previously "Cannot select FrameIndex." Now most of them fail
-# because they contain call frame pseudos (e.g. call a vararg func),
-# frame pointers, or similar. This list will be updated again soon.
-20000519-1.c
-20000706-4.c
-20000706-5.c
-20000801-2.c
-20000801-4.c
-20011126-2.c
-
-20020529-1.c
-20021024-1.c
-
-20030828-1.c
-20030914-1.c
-
+# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
20040302-1.c
-20040625-1.c
-20040823-1.c
-
-20041113-1.c
-
-20041214-1.c
-
-20050826-2.c
-
-20071213-1.c
-
-20080506-2.c
-20080519-1.c
-
-20081103-1.c
-20090113-1.c
-20090113-2.c
-20090113-3.c
-
-20090623-1.c
-
-920501-6.c
-920501-8.c
-920726-1.c
-930518-1.c
-
-931004-10.c
-931004-12.c
-931004-14.c
-931004-2.c
-931004-4.c
-931004-6.c
-931004-8.c
-
-980205.c
-980608-1.c
-980709-1.c
-980716-1.c
-990127-1.c
-
-991216-2.c
-
-#cbrt.c
-complex-5.c
-complex-6.c
-
-enum-3.c
-fprintf-chk-1.c
-frame-address.c
-loop-15.c
-loop-ivopts-2.c
-mayalias-3.c
-
-multi-ix.c
-
-pr20466-1.c
-
-
-pr28778.c
-pr28982b.c
-
-pr30778.c
-pr31448-2.c
-pr31448.c
-
-pr33870-1.c
-pr33870.c
-
-pr38051.c
-
-pr39100.c
-
-pr39339.c
-
-pr43987.c
-
-pr44575.c
-
-pr44942.c
-pr46309.c
-pr47538.c
-pr47925.c
-
-pr49390.c
-pr49419.c
-
-#pr51877.c
-
-#pr52979-1.c
-#pr52979-2.c
-pr53645-2.c
-pr53645.c
-
-pr56205.c
-
-pr56866.c
-
-pr57876.c
-pr58277-1.c
-
-pr59643.c
-
-printf-chk-1.c
-pta-field-1.c
-pta-field-2.c
-
-stdarg-1.c
-stdarg-2.c
-stdarg-3.c
-stdarg-4.c
-strct-stdarg-1.c
-strct-varg-1.c
-
-va-arg-1.c
-va-arg-10.c
-va-arg-12.c
-va-arg-13.c
-va-arg-14.c
-va-arg-15.c
-va-arg-16.c
-va-arg-17.c
-va-arg-18.c
-va-arg-19.c
-va-arg-2.c
-va-arg-20.c
-va-arg-23.c
-va-arg-26.c
-va-arg-4.c
-va-arg-5.c
-va-arg-6.c
-va-arg-7.c
-va-arg-8.c
-va-arg-9.c
-va-arg-pack-1.c
-vfprintf-1.c
-vfprintf-chk-1.c
-vprintf-1.c
-vprintf-chk-1.c
-
-# Cannot select callseq_end.
-20040811-1.c
-pr43220.c
-vla-dealloc-1.c
-
-# Cannot select brind.
20071210-1.c
920501-4.c
920501-5.c
-
-# Cannot select BlockAddress.
comp-goto-1.c
980526-1.c
990208-1.c
-# WebAssembly hasn't implemented byval arguments.
-20000412-3.c
-20000419-1.c
-20000706-1.c
-20000706-2.c
-20000707-1.c
-20000717-1.c
-20000717-5.c
-20000808-1.c
-20010605-2.c
-20011113-1.c
-20020215-1.c
-20020810-1.c
-20021118-1.c
-20040707-1.c
-20040709-1.c
-20040709-2.c
-20041201-1.c
-20050713-1.c
-20070614-1.c
-920908-2.c
-921112-1.c
-921117-1.c
-921123-2.c
-921204-1.c
-930126-1.c
-930208-1.c
-931004-5.c
-931004-9.c
-931031-1.c
-950607-2.c
-960416-1.c
-990525-1.c
-991118-1.c
-bf64-1.c
-complex-1.c
-complex-2.c
-pr15262-2.c
-pr20621-1.c
-pr23135.c
-pr30185.c
-pr42248.c
-
-# unimplemented operation lowering.
+# WebAssembly hasn't implemented (and may never implement) __builtin_return_address
20010122-1.c
20030323-1.c
20030811-1.c
@@ -255,7 +17,6 @@ pr17377.c
# Error: invalid output constraint '=t' in asm.
990413-2.c
-990826-0.c
# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
built-in-setjmp.c
@@ -300,10 +61,9 @@ pr51447.c
20070919-1.c
align-nest.c
pr41935.c
-20050107-1.c
-20050119-1.c
-20050119-2.c
920302-1.c
920501-3.c
920728-1.c
pr28865.c
+widechar-2.c
+pr41463.c
diff --git a/lib/Target/X86/AsmParser/Makefile b/lib/Target/X86/AsmParser/Makefile
deleted file mode 100644
index f834dfc300a1..000000000000
--- a/lib/Target/X86/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86AsmParser
-
-# Hack: we need to include 'main' X86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index 09cc53a8e6d3..c38a7d1dd44d 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86AsmInstrumentation.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Operand.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
@@ -18,9 +18,9 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 4d8ffac1a82b..4e0ad8bfe1f1 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -11,7 +11,6 @@
#include "X86AsmInstrumentation.h"
#include "X86AsmParserCommon.h"
#include "X86Operand.h"
-#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -24,12 +23,12 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -683,9 +682,14 @@ private:
std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
- void AddDefaultSrcDestOperands(
- OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
- std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+ bool IsSIReg(unsigned Reg);
+ unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
+ void
+ AddDefaultSrcDestOperands(OperandVector &Operands,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+ bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
+ OperandVector &FinalOperands);
std::unique_ptr<X86Operand> ParseOperand();
std::unique_ptr<X86Operand> ParseATTOperand();
std::unique_ptr<X86Operand> ParseIntelOperand();
@@ -747,11 +751,6 @@ private:
bool OmitRegisterFromClobberLists(unsigned RegNo) override;
- /// doSrcDstMatch - Returns true if operands are matching in their
- /// word size (%si and %di, %esi and %edi, etc.). Order depends on
- /// the parsing mode (Intel vs. AT&T).
- bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2);
-
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
/// \return \c true if no parsing errors occurred, \c false otherwise.
@@ -867,27 +866,6 @@ static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
return false;
}
-bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2)
-{
- // Return true and let a normal complaint about bogus operands happen.
- if (!Op1.isMem() || !Op2.isMem())
- return true;
-
- // Actually these might be the other way round if Intel syntax is
- // being used. It doesn't matter.
- unsigned diReg = Op1.Mem.BaseReg;
- unsigned siReg = Op2.Mem.BaseReg;
-
- if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg))
- return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg);
- if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg))
- return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg);
- if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg))
- return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg);
- // Again, return true and let another error happen.
- return true;
-}
-
bool X86AsmParser::ParseRegister(unsigned &RegNo,
SMLoc &StartLoc, SMLoc &EndLoc) {
MCAsmParser &Parser = getParser();
@@ -929,10 +907,16 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (RegNo == X86::RIZ ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
- X86II::isX86_64ExtendedReg(RegNo))
+ X86II::isX86_64ExtendedReg(RegNo) ||
+ X86II::is32ExtendedReg(RegNo))
return Error(StartLoc, "register %"
+ Tok.getString() + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
+ } else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
+ if (X86II::is32ExtendedReg(RegNo))
+ return Error(StartLoc, "register %"
+ + Tok.getString() + " is only available with AVX512",
+ SMRange(StartLoc, EndLoc));
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
@@ -1025,6 +1009,33 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
Loc, Loc, 0);
}
+bool X86AsmParser::IsSIReg(unsigned Reg) {
+ switch (Reg) {
+ default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!");
+ case X86::RSI:
+ case X86::ESI:
+ case X86::SI:
+ return true;
+ case X86::RDI:
+ case X86::EDI:
+ case X86::DI:
+ return false;
+ }
+}
+
+unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg,
+ bool IsSIReg) {
+ switch (RegClassID) {
+ default: llvm_unreachable("Unexpected register class");
+ case X86::GR64RegClassID:
+ return IsSIReg ? X86::RSI : X86::RDI;
+ case X86::GR32RegClassID:
+ return IsSIReg ? X86::ESI : X86::EDI;
+ case X86::GR16RegClassID:
+ return IsSIReg ? X86::SI : X86::DI;
+ }
+}
+
void X86AsmParser::AddDefaultSrcDestOperands(
OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
@@ -1038,6 +1049,88 @@ void X86AsmParser::AddDefaultSrcDestOperands(
}
}
+bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
+ OperandVector &FinalOperands) {
+
+ if (OrigOperands.size() > 1) {
+    // Check that sizes match; OrigOperands also contains the instruction name
+ assert(OrigOperands.size() == FinalOperands.size() + 1 &&
+ "Operand size mismatch");
+
+ SmallVector<std::pair<SMLoc, std::string>, 2> Warnings;
+ // Verify types match
+ int RegClassID = -1;
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i) {
+ X86Operand &OrigOp = static_cast<X86Operand &>(*OrigOperands[i + 1]);
+ X86Operand &FinalOp = static_cast<X86Operand &>(*FinalOperands[i]);
+
+ if (FinalOp.isReg() &&
+ (!OrigOp.isReg() || FinalOp.getReg() != OrigOp.getReg()))
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ if (FinalOp.isMem()) {
+
+ if (!OrigOp.isMem())
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ unsigned OrigReg = OrigOp.Mem.BaseReg;
+ unsigned FinalReg = FinalOp.Mem.BaseReg;
+
+        // If we've already encountered a register class, make sure all register
+ // bases are of the same register class
+ if (RegClassID != -1 &&
+ !X86MCRegisterClasses[RegClassID].contains(OrigReg)) {
+ return Error(OrigOp.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(OrigReg))
+ RegClassID = X86::GR64RegClassID;
+ else if (X86MCRegisterClasses[X86::GR32RegClassID].contains(OrigReg))
+ RegClassID = X86::GR32RegClassID;
+ else if (X86MCRegisterClasses[X86::GR16RegClassID].contains(OrigReg))
+ RegClassID = X86::GR16RegClassID;
+ else
+ // Unexpected register class type
+ // Return false and let a normal complaint about bogus operands happen
+ return false;
+
+ bool IsSI = IsSIReg(FinalReg);
+ FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI);
+
+ if (FinalReg != OrigReg) {
+ std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI";
+ Warnings.push_back(std::make_pair(
+ OrigOp.getStartLoc(),
+ "memory operand is only for determining the size, " + RegName +
+ " will be used for the location"));
+ }
+
+ FinalOp.Mem.Size = OrigOp.Mem.Size;
+ FinalOp.Mem.SegReg = OrigOp.Mem.SegReg;
+ FinalOp.Mem.BaseReg = FinalReg;
+ }
+ }
+
+    // Produce warnings only if all the operands passed the adjustment; this
+    // prevents legal cases like "movsd (%rax), %xmm0" from mistakenly
+    // producing warnings
+ for (auto &WarningMsg : Warnings) {
+ Warning(WarningMsg.first, WarningMsg.second);
+ }
+
+ // Remove old operands
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.pop_back();
+ }
+ // OrigOperands.append(FinalOperands.begin(), FinalOperands.end());
+ for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+ OrigOperands.push_back(std::move(FinalOperands[i]));
+
+ return false;
+}
+
std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
@@ -1301,7 +1394,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
return ErrorOperand(BracLoc, "Expected '[' token!");
Parser.Lex(); // Eat '['
- SMLoc StartInBrac = Tok.getLoc();
+ SMLoc StartInBrac = Parser.getTok().getLoc();
// Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
// may have already parsed an immediate displacement before the bracketed
// expression.
@@ -1330,7 +1423,10 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
// Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC
// will in fact do global lookup the field name inside all global typedefs,
// but we don't emulate that.
- if (Tok.getString().find('.') != StringRef::npos) {
+ if ((Parser.getTok().getKind() == AsmToken::Identifier ||
+ Parser.getTok().getKind() == AsmToken::Dot ||
+ Parser.getTok().getKind() == AsmToken::Real) &&
+ Parser.getTok().getString().find('.') != StringRef::npos) {
const MCExpr *NewDisp;
if (ParseIntelDotOperator(Disp, NewDisp))
return nullptr;
@@ -2087,22 +2183,36 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
unsigned ComparisonCode = StringSwitch<unsigned>(
PatchedName.slice(CCIdx, PatchedName.size() - 2))
.Case("eq", 0x00)
+ .Case("eq_oq", 0x00)
.Case("lt", 0x01)
+ .Case("lt_os", 0x01)
.Case("le", 0x02)
+ .Case("le_os", 0x02)
.Case("unord", 0x03)
+ .Case("unord_q", 0x03)
.Case("neq", 0x04)
+ .Case("neq_uq", 0x04)
.Case("nlt", 0x05)
+ .Case("nlt_us", 0x05)
.Case("nle", 0x06)
+ .Case("nle_us", 0x06)
.Case("ord", 0x07)
+ .Case("ord_q", 0x07)
/* AVX only from here */
.Case("eq_uq", 0x08)
.Case("nge", 0x09)
+ .Case("nge_us", 0x09)
.Case("ngt", 0x0A)
+ .Case("ngt_us", 0x0A)
.Case("false", 0x0B)
+ .Case("false_oq", 0x0B)
.Case("neq_oq", 0x0C)
.Case("ge", 0x0D)
+ .Case("ge_os", 0x0D)
.Case("gt", 0x0E)
+ .Case("gt_os", 0x0E)
.Case("true", 0x0F)
+ .Case("true_uq", 0x0F)
.Case("eq_os", 0x10)
.Case("lt_oq", 0x11)
.Case("le_oq", 0x12)
@@ -2196,6 +2306,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "repne" || Name == "repnz" ||
Name == "rex64" || Name == "data16";
+ bool CurlyAsEndOfStatement = false;
// This does the actual operand parsing. Don't parse any more if we have a
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
@@ -2223,7 +2334,12 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
break;
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
+  // In MS inline asm curly braces mark the beginning/end of a block,
+  // so they should be interpreted as end of statement
+ CurlyAsEndOfStatement =
+ isParsingIntelSyntax() && isParsingInlineAsm() &&
+ (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
return ErrorAndEatStatement(getLexer().getLoc(),
"unexpected token in argument list");
}
@@ -2232,6 +2348,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (getLexer().is(AsmToken::EndOfStatement) ||
(isPrefix && getLexer().is(AsmToken::Slash)))
Parser.Lex();
+ else if (CurlyAsEndOfStatement)
+ // Add an actual EndOfStatement before the curly brace
+ Info.AsmRewrites->emplace_back(AOK_EndOfStatement,
+ getLexer().getTok().getLoc(), 0);
// This is for gas compatibility and cannot be done in td.
// Adding "p" for some floating point with no argument.
@@ -2247,10 +2367,11 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
}
- // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
+ // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
// "outb %al, %dx". Out doesn't take a memory form, but this is a widely
// documented form in various unofficial manuals, so a lot of code uses it.
- if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
+ if ((Name == "outb" || Name == "outsb" || Name == "outw" || Name == "outsw" ||
+ Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands.back();
if (Op.isMem() && Op.Mem.SegReg == 0 &&
@@ -2261,8 +2382,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
}
}
- // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
- if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
+ // Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
+ if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
+ Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
if (Op.isMem() && Op.Mem.SegReg == 0 &&
@@ -2274,84 +2396,92 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
+ bool HadVerifyError = false;
+
// Append default arguments to "ins[bwld]"
- if (Name.startswith("ins") && Operands.size() == 1 &&
- (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) {
- AddDefaultSrcDestOperands(Operands,
+ if (Name.startswith("ins") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
+ (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" ||
+ Name == "ins")) {
+
+ AddDefaultSrcDestOperands(TmpOperands,
X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
}
// Append default arguments to "outs[bwld]"
- if (Name.startswith("outs") && Operands.size() == 1 &&
+ if (Name.startswith("outs") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
(Name == "outsb" || Name == "outsw" || Name == "outsl" ||
- Name == "outsd" )) {
- AddDefaultSrcDestOperands(Operands,
- DefaultMemSIOperand(NameLoc),
+ Name == "outsd" || Name == "outs")) {
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
}
// Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
// values of $SIREG according to the mode. It would be nice if this
// could be achieved with InstAlias in the tables.
- if (Name.startswith("lods") && Operands.size() == 1 &&
+ if (Name.startswith("lods") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
(Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
- Name == "lodsl" || Name == "lodsd" || Name == "lodsq"))
- Operands.push_back(DefaultMemSIOperand(NameLoc));
+ Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) {
+ TmpOperands.push_back(DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
// Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
// values of $DIREG according to the mode. It would be nice if this
// could be achieved with InstAlias in the tables.
- if (Name.startswith("stos") && Operands.size() == 1 &&
+ if (Name.startswith("stos") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
(Name == "stos" || Name == "stosb" || Name == "stosw" ||
- Name == "stosl" || Name == "stosd" || Name == "stosq"))
- Operands.push_back(DefaultMemDIOperand(NameLoc));
+ Name == "stosl" || Name == "stosd" || Name == "stosq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
// Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
// values of $DIREG according to the mode. It would be nice if this
// could be achieved with InstAlias in the tables.
- if (Name.startswith("scas") && Operands.size() == 1 &&
+ if (Name.startswith("scas") &&
+ (Operands.size() == 1 || Operands.size() == 2) &&
(Name == "scas" || Name == "scasb" || Name == "scasw" ||
- Name == "scasl" || Name == "scasd" || Name == "scasq"))
- Operands.push_back(DefaultMemDIOperand(NameLoc));
+ Name == "scasl" || Name == "scasd" || Name == "scasq")) {
+ TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
// Add default SI and DI operands to "cmps[bwlq]".
if (Name.startswith("cmps") &&
+ (Operands.size() == 1 || Operands.size() == 3) &&
(Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
- if (Operands.size() == 1) {
- AddDefaultSrcDestOperands(Operands,
- DefaultMemDIOperand(NameLoc),
- DefaultMemSIOperand(NameLoc));
- } else if (Operands.size() == 3) {
- X86Operand &Op = (X86Operand &)*Operands[1];
- X86Operand &Op2 = (X86Operand &)*Operands[2];
- if (!doSrcDstMatch(Op, Op2))
- return Error(Op.getStartLoc(),
- "mismatching source and destination index registers");
- }
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemDIOperand(NameLoc),
+ DefaultMemSIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
}
// Add default SI and DI operands to "movs[bwlq]".
- if ((Name.startswith("movs") &&
- (Name == "movs" || Name == "movsb" || Name == "movsw" ||
- Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
- (Name.startswith("smov") &&
- (Name == "smov" || Name == "smovb" || Name == "smovw" ||
- Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
- if (Operands.size() == 1) {
- if (Name == "movsd")
- Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
- AddDefaultSrcDestOperands(Operands,
- DefaultMemSIOperand(NameLoc),
- DefaultMemDIOperand(NameLoc));
- } else if (Operands.size() == 3) {
- X86Operand &Op = (X86Operand &)*Operands[1];
- X86Operand &Op2 = (X86Operand &)*Operands[2];
- if (!doSrcDstMatch(Op, Op2))
- return Error(Op.getStartLoc(),
- "mismatching source and destination index registers");
- }
+ if (((Name.startswith("movs") &&
+ (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+ Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+ (Name.startswith("smov") &&
+ (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+ Name == "smovl" || Name == "smovd" || Name == "smovq"))) &&
+ (Operands.size() == 1 || Operands.size() == 3)) {
+ if (Name == "movsd" && Operands.size() == 1)
+ Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+ DefaultMemDIOperand(NameLoc));
+ HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+ }
+
+  // Check if we encountered an error for one of the string instructions
+ if (HadVerifyError) {
+ return HadVerifyError;
}
// FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to
@@ -2387,64 +2517,22 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
+ // Transforms "xlat mem8" into "xlatb"
+ if ((Name == "xlat" || Name == "xlatb") && Operands.size() == 2) {
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isMem8()) {
+ Warning(Op1.getStartLoc(), "memory operand is only for determining the "
+ "size, (R|E)BX will be used for the location");
+ Operands.pop_back();
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("xlatb");
+ }
+ }
+
return false;
}
bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
- switch (Inst.getOpcode()) {
- default: return false;
- case X86::VMOVZPQILo2PQIrr:
- case X86::VMOVAPDrr:
- case X86::VMOVAPDYrr:
- case X86::VMOVAPSrr:
- case X86::VMOVAPSYrr:
- case X86::VMOVDQArr:
- case X86::VMOVDQAYrr:
- case X86::VMOVDQUrr:
- case X86::VMOVDQUYrr:
- case X86::VMOVUPDrr:
- case X86::VMOVUPDYrr:
- case X86::VMOVUPSrr:
- case X86::VMOVUPSYrr: {
- if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
- !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
- return false;
-
- unsigned NewOpc;
- switch (Inst.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
- case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
- case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
- case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
- case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
- case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
- case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
- case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
- case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
- case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
- case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
- case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
- case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
- }
- Inst.setOpcode(NewOpc);
- return true;
- }
- case X86::VMOVSDrr:
- case X86::VMOVSSrr: {
- if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
- !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
- return false;
- unsigned NewOpc;
- switch (Inst.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
- case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
- }
- Inst.setOpcode(NewOpc);
- return true;
- }
- }
+ return false;
}
static const char *getSubtargetFeatureName(uint64_t Val);
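The new IsSIReg/GetSIDIForRegClass/VerifyAndAdjustOperands trio replaces doSrcDstMatch: rather than rejecting string instructions whose explicit memory operands use unexpected base registers, the parser derives the register class of the written base, canonicalizes it to the (R|E)SI/(R|E)DI pair of that class, keeps only the size and segment the user wrote, and warns that the memory operand merely determines the size. A self-contained sketch of that canonicalization, with an enum and helper invented for the illustration (not the parser's real types):

#include <cassert>

// Hypothetical register and width enums for the sketch only.
enum Reg { SI, ESI, RSI, DI, EDI, RDI };
enum Width { W16, W32, W64 };

// Map (operand width, SI-or-DI) to the canonical index register, mirroring
// the role of GetSIDIForRegClass above.
static Reg canonicalIndexReg(Width W, bool WantSI) {
  switch (W) {
  case W64: return WantSI ? RSI : RDI;
  case W32: return WantSI ? ESI : EDI;
  case W16: return WantSI ? SI : DI;
  }
  return SI; // unreachable
}

int main() {
  // "movsl (%esi), (%edi)" keeps its bases as written; an unexpected base
  // such as %eax would be rewritten to the canonical pair and trigger the
  // "memory operand is only for determining the size" warning.
  assert(canonicalIndexReg(W32, /*WantSI=*/true) == ESI);
  assert(canonicalIndexReg(W64, /*WantSI=*/false) == RDI);
  return 0;
}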
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index 54538c804a03..c45a3f14ef11 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -10,6 +10,8 @@
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#include "llvm/Support/MathExtras.h"
+
namespace llvm {
inline bool isImmSExti16i8Value(uint64_t Value) {
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 7ec02408ffa4..a04c2f5c84a5 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -233,46 +233,47 @@ struct X86Operand : public MCParsedAsmOperand {
bool isMem512() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 512);
}
+ bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
+ }
- bool isMemVX32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ bool isMem64_RC128() const {
+ return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15);
+ }
+ bool isMem128_RC128() const {
+ return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15);
}
- bool isMemVX32X() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+ bool isMem128_RC256() const {
+ return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15);
}
- bool isMemVY32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ bool isMem256_RC128() const {
+ return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15);
}
- bool isMemVY32X() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+ bool isMem256_RC256() const {
+ return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15);
+ }
+
+ bool isMem64_RC128X() const {
+ return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM31);
}
- bool isMemVX64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ bool isMem128_RC128X() const {
+ return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM31);
}
- bool isMemVX64X() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31;
+ bool isMem128_RC256X() const {
+ return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
- bool isMemVY64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ bool isMem256_RC128X() const {
+ return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM31);
}
- bool isMemVY64X() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31;
+ bool isMem256_RC256X() const {
+ return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
- bool isMemVZ32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ bool isMem512_RC256X() const {
+ return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
- bool isMemVZ64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ bool isMem512_RC512() const {
+ return isMem512() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
}
bool isAbsMem() const {
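The renamed predicates encode a memory operand as the pair (access width, index-register class), so each one is just an isMemNN() check combined with an index-register range check. A toy sketch of the same composition, with a register numbering made up for the illustration:

// Pretend numbering for the sketch only: XMM0..XMM31 = 0..31.
struct MemOperand {
  unsigned SizeInBits;
  unsigned IndexReg;
  bool isMem256() const { return SizeInBits == 0 || SizeInBits == 256; }
  bool isIndexInRange(unsigned Lo, unsigned Hi) const {
    return IndexReg >= Lo && IndexReg <= Hi;
  }
  // A 256-bit memory access whose index register is a 128-bit XMM register,
  // i.e. the shape the isMem256_RC128 predicate above matches.
  bool isMem256_RC128() const { return isMem256() && isIndexInRange(0, 15); }
};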
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 55949155da9e..894090f78977 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -17,6 +17,9 @@ set(sources
X86CallFrameOptimization.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
+ X86FixupBWInsts.cpp
+ X86FixupLEAs.cpp
+ X86FixupSetCC.cpp
X86FloatingPoint.cpp
X86FrameLowering.cpp
X86ISelDAGToDAG.cpp
@@ -24,6 +27,7 @@ set(sources
X86InstrInfo.cpp
X86MCInstLower.cpp
X86MachineFunctionInfo.cpp
+ X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
@@ -33,9 +37,8 @@ set(sources
X86TargetObjectFile.cpp
X86TargetTransformInfo.cpp
X86VZeroUpper.cpp
- X86FixupLEAs.cpp
+ X86WinAllocaExpander.cpp
X86WinEHState.cpp
- X86OptimizeLEAs.cpp
)
add_llvm_target(X86CodeGen ${sources})
diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile
deleted file mode 100644
index 51e7b828cf2a..000000000000
--- a/lib/Target/X86/Disassembler/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-##===- lib/Target/X86/Disassembler/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86Disassembler
-
-# Hack: we need to include 'main' x86 target directory to grab private headers.
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
-
-.PHONY: $(PROJ_SRC_DIR)/X86DisassemblerDecoder.c
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ce8fcf164668..008dead5d0a5 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -10,14 +10,74 @@
// This file is part of the X86 Disassembler.
// It contains code to translate the data produced by the decoder into
// MCInsts.
-// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode up in. Look up the opcode to get
+// a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+// the main opcode. This is orthogonal from its meaning (an GPR or an XMM
+//    the main opcode. This is orthogonal to its meaning (a GPR or an XMM
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
//
//===----------------------------------------------------------------------===//
-#include "X86Disassembler.h"
#include "X86DisassemblerDecoder.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -31,13 +91,6 @@ using namespace llvm::X86Disassembler;
#define DEBUG_TYPE "x86-disassembler"
-#define GET_REGINFO_ENUM
-#include "X86GenRegisterInfo.inc"
-#define GET_INSTRINFO_ENUM
-#include "X86GenInstrInfo.inc"
-#define GET_SUBTARGETINFO_ENUM
-#include "X86GenSubtargetInfo.inc"
-
void llvm::X86Disassembler::Debug(const char *file, unsigned line,
const char *s) {
dbgs() << file << ":" << line << ": " << s;
@@ -67,14 +120,34 @@ namespace X86 {
};
}
-extern Target TheX86_32Target, TheX86_64Target;
-
}
static bool translateInstruction(MCInst &target,
InternalInstruction &source,
const MCDisassembler *Dis);
+namespace {
+
+/// Generic disassembler for all X86 platforms. All that each platform class
+/// should have to do is subclass the constructor and provide a different
+/// disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+ std::unique_ptr<const MCInstrInfo> MII;
+public:
+ X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ std::unique_ptr<const MCInstrInfo> MII);
+public:
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const override;
+
+private:
+ DisassemblerMode fMode;
+};
+
+}
+
X86GenericDisassembler::X86GenericDisassembler(
const MCSubtargetInfo &STI,
MCContext &Ctx,
@@ -826,7 +899,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_R64:
case TYPE_Rv:
case TYPE_MM64:
- case TYPE_XMM:
case TYPE_XMM32:
case TYPE_XMM64:
case TYPE_XMM128:
@@ -911,14 +983,6 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
return translateMaskRegister(mcInst, insn.writemask);
CASE_ENCODING_RM:
return translateRM(mcInst, operand, insn, Dis);
- case ENCODING_CB:
- case ENCODING_CW:
- case ENCODING_CD:
- case ENCODING_CP:
- case ENCODING_CO:
- case ENCODING_CT:
- debug("Translation of code offsets isn't supported.");
- return true;
case ENCODING_IB:
case ENCODING_IW:
case ENCODING_ID:
@@ -997,7 +1061,7 @@ static MCDisassembler *createX86Disassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
- return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
+ return new X86GenericDisassembler(STI, Ctx, std::move(MII));
}
extern "C" void LLVMInitializeX86Disassembler() {
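For orientation, the decode sequence that the relocated comment block describes (steps 1-7) can be summarized as a skeleton. Every type and helper below is a placeholder invented for this sketch, not a real decoder entry point:

#include <cstdint>
#include <vector>

struct Context {};                 // step 1 result: attribute bitmask -> context
struct ModRMDecision { bool NeedsModRM = false; };
struct InstrUID { unsigned Id = 0; };
struct Operand {};
struct MCInstLike { InstrUID Id; std::vector<Operand> Ops; };

static Context contextFromPrefixes(const uint8_t *, size_t) { return {}; }
static unsigned readOpcode(const uint8_t *, size_t) { return 0x90; }
static ModRMDecision lookupDecision(Context, unsigned) { return {}; }
static uint8_t readModRM(const uint8_t *) { return 0; }
static InstrUID resolveUID(ModRMDecision, uint8_t) { return {}; }
static std::vector<Operand> readOperands(InstrUID, const uint8_t *) { return {}; }

MCInstLike decodeOne(const uint8_t *Bytes, size_t Len) {
  Context C = contextFromPrefixes(Bytes, Len);          // 1. prefixes -> context
  unsigned Opc = readOpcode(Bytes, Len);                // 2. 1-byte / 0f / 0f38 / 0f3a opcode
  ModRMDecision D = lookupDecision(C, Opc);             // 3. ClassDecision -> OpcodeDecision -> ModRMDecision
  uint8_t ModRM = D.NeedsModRM ? readModRM(Bytes) : 0;  // 4. consume ModR/M only if required
  InstrUID Id = resolveUID(D, ModRM);                   // 5. unique instruction ID
  std::vector<Operand> Ops = readOperands(Id, Bytes);   // 6. decode each operand per its encoding/type
  return {Id, Ops};                                     // 7. translate into the client's MCInst
}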
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
deleted file mode 100644
index d7f426b2641d..000000000000
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ /dev/null
@@ -1,112 +0,0 @@
-//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
-// 64-bit X86 instruction sets. The main decode sequence for an assembly
-// instruction in this disassembler is:
-//
-// 1. Read the prefix bytes and determine the attributes of the instruction.
-// These attributes, recorded in enum attributeBits
-// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
-// provides a mapping from bitmasks to contexts, which are represented by
-// enum InstructionContext (ibid.).
-//
-// 2. Read the opcode, and determine what kind of opcode it is. The
-// disassembler distinguishes four kinds of opcodes, which are enumerated in
-// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
-// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
-// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
-//
-// 3. Depending on the opcode type, look in one of four ClassDecision structures
-// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
-// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get
-// a ModRMDecision (ibid.).
-//
-// 4. Some instructions, such as escape opcodes or extended opcodes, or even
-// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
-// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
-// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
-// ModR/M byte is required and how to interpret it.
-//
-// 5. After resolving the ModRMDecision, the disassembler has a unique ID
-// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
-// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
-// meanings of its operands.
-//
-// 6. For each operand, its encoding is an entry from OperandEncoding
-// (X86DisassemblerDecoderCommon.h) and its type is an entry from
-// OperandType (ibid.). The encoding indicates how to read it from the
-// instruction; the type indicates how to interpret the value once it has
-// been read. For example, a register operand could be stored in the R/M
-// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
-// the main opcode. This is orthogonal from its meaning (an GPR or an XMM
-// register, for instance). Given this information, the operands can be
-// extracted and interpreted.
-//
-// 7. As the last step, the disassembler translates the instruction information
-// and operands into a format understandable by the client - in this case, an
-// MCInst for use by the MC infrastructure.
-//
-// The disassembler is broken broadly into two parts: the table emitter that
-// emits the instruction decode tables discussed above during compilation, and
-// the disassembler itself. The table emitter is documented in more detail in
-// utils/TableGen/X86DisassemblerEmitter.h.
-//
-// X86Disassembler.h contains the public interface for the disassembler,
-// adhering to the MCDisassembler interface.
-// X86Disassembler.cpp contains the code responsible for step 7, and for
-// invoking the decoder to execute steps 1-6.
-// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
-// table emitter and the disassembler.
-// X86DisassemblerDecoder.h contains the public interface of the decoder,
-// factored out into C for possible use by other projects.
-// X86DisassemblerDecoder.c contains the source code of the decoder, which is
-// responsible for steps 1-6.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
-#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
-
-#include "X86DisassemblerDecoderCommon.h"
-#include "llvm/MC/MCDisassembler.h"
-
-namespace llvm {
-
-class MCInst;
-class MCInstrInfo;
-class MCSubtargetInfo;
-class MemoryObject;
-class raw_ostream;
-
-namespace X86Disassembler {
-
-/// Generic disassembler for all X86 platforms. All each platform class should
-/// have to do is subclass the constructor, and provide a different
-/// disassemblerMode value.
-class X86GenericDisassembler : public MCDisassembler {
- std::unique_ptr<const MCInstrInfo> MII;
-public:
- X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
- std::unique_ptr<const MCInstrInfo> MII);
-public:
- DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
- ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &vStream,
- raw_ostream &cStream) const override;
-
-private:
- DisassemblerMode fMode;
-};
-
-} // namespace X86Disassembler
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 040143b15587..b0a150ab564d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -53,7 +53,6 @@ struct ContextDecision {
#define debug(s) do { } while (0)
#endif
-
/*
* contextForAttrs - Client for the instruction context table. Takes a set of
* attributes and returns the appropriate decode context.
@@ -276,8 +275,6 @@ static void dbgprintf(struct InternalInstruction* insn,
va_end(ap);
insn->dlog(insn->dlogArg, buffer);
-
- return;
}
/*
@@ -1453,10 +1450,10 @@ static int readModRM(struct InternalInstruction* insn) {
}
#define GENERIC_FIXUP_FUNC(name, base, prefix) \
- static uint8_t name(struct InternalInstruction *insn, \
- OperandType type, \
- uint8_t index, \
- uint8_t *valid) { \
+ static uint16_t name(struct InternalInstruction *insn, \
+ OperandType type, \
+ uint8_t index, \
+ uint8_t *valid) { \
*valid = 1; \
switch (type) { \
default: \
@@ -1485,7 +1482,6 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_XMM128: \
case TYPE_XMM64: \
case TYPE_XMM32: \
- case TYPE_XMM: \
return prefix##_XMM0 + index; \
case TYPE_VK1: \
case TYPE_VK2: \
@@ -1507,6 +1503,10 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_DR0 + index; \
case TYPE_CONTROLREG: \
return prefix##_CR0 + index; \
+ case TYPE_BNDR: \
+ if (index > 3) \
+ *valid = 0; \
+ return prefix##_BND0 + index; \
} \
}
@@ -1763,14 +1763,6 @@ static int readOperands(struct InternalInstruction* insn) {
if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
break;
- case ENCODING_CB:
- case ENCODING_CW:
- case ENCODING_CD:
- case ENCODING_CP:
- case ENCODING_CO:
- case ENCODING_CT:
- dbgprintf(insn, "We currently don't hande code-offset encodings");
- return -1;
case ENCODING_IB:
if (sawRegImm) {
/* Saw a register immediate so don't read again and instead split the
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 28a628e5066b..24d24a265b49 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -369,6 +369,12 @@ namespace X86Disassembler {
ENTRY(CR14) \
ENTRY(CR15)
+#define REGS_BOUND \
+ ENTRY(BND0) \
+ ENTRY(BND1) \
+ ENTRY(BND2) \
+ ENTRY(BND3)
+
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
@@ -391,6 +397,7 @@ namespace X86Disassembler {
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
+ REGS_BOUND \
ENTRY(RIP)
/// \brief All possible values of the base field for effective-address
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 301db72feafb..0a835b876d90 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -352,12 +352,6 @@ enum ModRMDecisionType {
ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
- ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
- ENUM_ENTRY(ENCODING_CW, "2-byte") \
- ENUM_ENTRY(ENCODING_CD, "4-byte") \
- ENUM_ENTRY(ENCODING_CP, "6-byte") \
- ENUM_ENTRY(ENCODING_CO, "8-byte") \
- ENUM_ENTRY(ENCODING_CT, "10-byte") \
ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
ENUM_ENTRY(ENCODING_IW, "2-byte") \
ENUM_ENTRY(ENCODING_ID, "4-byte") \
@@ -436,14 +430,11 @@ enum OperandEncoding {
ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
- ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \
- "2 = SS, 3 = DS, 4 = FS, 5 = GS") \
ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \
ENUM_ENTRY(TYPE_M64FP, "64-bit") \
ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
- ENUM_ENTRY(TYPE_XMM, "XMM register operand") \
ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
ENUM_ENTRY(TYPE_XMM64, "8-byte") \
ENUM_ENTRY(TYPE_XMM128, "16-byte") \
@@ -456,7 +447,6 @@ enum OperandEncoding {
ENUM_ENTRY(TYPE_VK16, "16-bit") \
ENUM_ENTRY(TYPE_VK32, "32-bit") \
ENUM_ENTRY(TYPE_VK64, "64-bit") \
- ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
diff --git a/lib/Target/X86/InstPrinter/Makefile b/lib/Target/X86/InstPrinter/Makefile
deleted file mode 100644
index c82aa330a20c..000000000000
--- a/lib/Target/X86/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86AsmPrinter
-
-# Hack: we need to include 'main' x86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index b4c0bc4cd4d9..3a5d056888a1 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -25,7 +25,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
-#include <map>
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -166,17 +165,25 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegName(O, Op.getReg());
} else if (Op.isImm()) {
- // Print X86 immediates as signed values.
- O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm())
- << markup(">");
+ // Print immediates as signed values.
+ int64_t Imm = Op.getImm();
+ O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+ // TODO: This should be in a helper function in the base class, so it can
+ // be used by other printers.
// If there are no instruction-specific comments, add a comment clarifying
// the hex value of the immediate operand when it isn't in the range
// [-256,255].
- if (CommentStream && !HasCustomInstComment &&
- (Op.getImm() > 255 || Op.getImm() < -256))
- *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
-
+ if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+ // Don't print unnecessary hex sign bits.
+ if (Imm == (int16_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+ else if (Imm == (int32_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+ else
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+ }
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << markup("<imm:") << '$';
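The reworked immediate comment picks the narrowest width whose sign-extension reproduces the value, so small negative immediates no longer print a long run of leading F digits. A standalone reproduction of that choice (printImmComment is a name made up for this sketch):

#include <cinttypes>
#include <cstdio>

static void printImmComment(int64_t Imm) {
  if (Imm <= 255 && Imm >= -256)
    return;                                   // small values get no comment
  if (Imm == (int16_t)Imm)
    std::printf("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
  else if (Imm == (int32_t)Imm)
    std::printf("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
  else
    std::printf("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
}

int main() {
  printImmComment(-300);        // imm = 0xFED4        (fits in 16 bits)
  printImmComment(-70000);      // imm = 0xFFFEEE90    (needs 32 bits)
  printImmComment(1LL << 40);   // imm = 0x10000000000 (needs the full 64 bits)
  return 0;
}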
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 73f654cba38c..f5379566b619 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -21,6 +21,143 @@
using namespace llvm;
+#define CASE_SSE_INS_COMMON(Inst, src) \
+ case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src:
+
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##k:
+
+#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \
+ CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
+
+#define CASE_MOVDUP(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_MOVDUP(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_MOVDUP(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_PMOVZX(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_PMOVZX(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_PMOVZX(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_UNPCK(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_UNPCK(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_UNPCK(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_SHUF(Inst, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, suf) \
+ CASE_AVX_INS_COMMON(Inst, , suf) \
+ CASE_AVX_INS_COMMON(Inst, Y, suf) \
+ CASE_SSE_INS_COMMON(Inst, suf)
+
+#define CASE_MASK_SHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_MASKZ_SHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_VPERMILPI(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z128, src##i) \
+ CASE_AVX_INS_COMMON(Inst, , src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERMILPI(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_MASKZ_VPERMILPI(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_VPERM(Inst, src) \
+ CASE_AVX512_INS_COMMON(Inst, Z, src##i) \
+ CASE_AVX512_INS_COMMON(Inst, Z256, src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERM(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_MASKZ_VPERM(Inst, src) \
+ CASE_MASKZ_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASKZ_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_VSHUF(Inst, src) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASK_VSHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASKZ_VSHUF(Inst, src) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@@ -41,159 +178,184 @@ static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
}
-/// \brief Extracts the src/dst types for a given zero extension instruction.
-/// \note While the number of elements in DstVT type correct, the
-/// number in the SrcVT type is expanded to fill the src xmm register and the
-/// upper elements may not be included in the dst xmm/ymm register.
-static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
+/// \brief Extracts the dst type for a given zero extension instruction.
+static MVT getZeroExtensionResultType(const MCInst *MI) {
switch (MI->getOpcode()) {
default:
llvm_unreachable("Unknown zero extension instruction");
- // i8 zero extension
- case X86::PMOVZXBWrm:
- case X86::PMOVZXBWrr:
- case X86::VPMOVZXBWrm:
- case X86::VPMOVZXBWrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v8i16;
- break;
- case X86::VPMOVZXBWYrm:
- case X86::VPMOVZXBWYrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v16i16;
- break;
- case X86::PMOVZXBDrm:
- case X86::PMOVZXBDrr:
- case X86::VPMOVZXBDrm:
- case X86::VPMOVZXBDrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v4i32;
- break;
- case X86::VPMOVZXBDYrm:
- case X86::VPMOVZXBDYrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v8i32;
- break;
- case X86::PMOVZXBQrm:
- case X86::PMOVZXBQrr:
- case X86::VPMOVZXBQrm:
- case X86::VPMOVZXBQrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v2i64;
- break;
- case X86::VPMOVZXBQYrm:
- case X86::VPMOVZXBQYrr:
- SrcVT = MVT::v16i8;
- DstVT = MVT::v4i64;
- break;
- // i16 zero extension
- case X86::PMOVZXWDrm:
- case X86::PMOVZXWDrr:
- case X86::VPMOVZXWDrm:
- case X86::VPMOVZXWDrr:
- SrcVT = MVT::v8i16;
- DstVT = MVT::v4i32;
- break;
- case X86::VPMOVZXWDYrm:
- case X86::VPMOVZXWDYrr:
- SrcVT = MVT::v8i16;
- DstVT = MVT::v8i32;
- break;
- case X86::PMOVZXWQrm:
- case X86::PMOVZXWQrr:
- case X86::VPMOVZXWQrm:
- case X86::VPMOVZXWQrr:
- SrcVT = MVT::v8i16;
- DstVT = MVT::v2i64;
- break;
- case X86::VPMOVZXWQYrm:
- case X86::VPMOVZXWQYrr:
- SrcVT = MVT::v8i16;
- DstVT = MVT::v4i64;
- break;
- // i32 zero extension
- case X86::PMOVZXDQrm:
- case X86::PMOVZXDQrr:
- case X86::VPMOVZXDQrm:
- case X86::VPMOVZXDQrr:
- SrcVT = MVT::v4i32;
- DstVT = MVT::v2i64;
- break;
- case X86::VPMOVZXDQYrm:
- case X86::VPMOVZXDQYrr:
- SrcVT = MVT::v4i32;
- DstVT = MVT::v4i64;
- break;
+ // zero extension to i16
+ CASE_PMOVZX(PMOVZXBW, m)
+ CASE_PMOVZX(PMOVZXBW, r)
+ return getRegOperandVectorVT(MI, MVT::i16, 0);
+ // zero extension to i32
+ CASE_PMOVZX(PMOVZXBD, m)
+ CASE_PMOVZX(PMOVZXBD, r)
+ CASE_PMOVZX(PMOVZXWD, m)
+ CASE_PMOVZX(PMOVZXWD, r)
+ return getRegOperandVectorVT(MI, MVT::i32, 0);
+ // zero extension to i64
+ CASE_PMOVZX(PMOVZXBQ, m)
+ CASE_PMOVZX(PMOVZXBQ, r)
+ CASE_PMOVZX(PMOVZXWQ, m)
+ CASE_PMOVZX(PMOVZXWQ, r)
+ CASE_PMOVZX(PMOVZXDQ, m)
+ CASE_PMOVZX(PMOVZXDQ, r)
+ return getRegOperandVectorVT(MI, MVT::i64, 0);
}
}
-#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
- case X86::V##Inst##Suffix##src: \
- case X86::V##Inst##Suffix##src##k: \
- case X86::V##Inst##Suffix##src##kz:
-
-#define CASE_SSE_INS_COMMON(Inst, src) \
- case X86::Inst##src:
-
-#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
- case X86::V##Inst##Suffix##src:
+/// Wraps the destination register name with AVX512 mask/maskz filtering.
+static std::string getMaskName(const MCInst *MI, const char *DestName,
+ const char *(*getRegName)(unsigned)) {
+ std::string OpMaskName(DestName);
-#define CASE_MOVDUP(Inst, src) \
- CASE_MASK_INS_COMMON(Inst, Z, r##src) \
- CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
- CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
- CASE_AVX_INS_COMMON(Inst, , r##src) \
- CASE_AVX_INS_COMMON(Inst, Y, r##src) \
- CASE_SSE_INS_COMMON(Inst, r##src) \
-
-#define CASE_UNPCK(Inst, src) \
- CASE_MASK_INS_COMMON(Inst, Z, r##src) \
- CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
- CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
- CASE_AVX_INS_COMMON(Inst, , r##src) \
- CASE_AVX_INS_COMMON(Inst, Y, r##src) \
- CASE_SSE_INS_COMMON(Inst, r##src) \
-
-#define CASE_SHUF(Inst, src) \
- CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
- CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
- CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \
- CASE_AVX_INS_COMMON(Inst, , r##src##i) \
- CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \
- CASE_SSE_INS_COMMON(Inst, r##src##i) \
-
-#define CASE_VPERM(Inst, src) \
- CASE_MASK_INS_COMMON(Inst, Z, src##i) \
- CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
- CASE_MASK_INS_COMMON(Inst, Z128, src##i) \
- CASE_AVX_INS_COMMON(Inst, , src##i) \
- CASE_AVX_INS_COMMON(Inst, Y, src##i) \
+ bool MaskWithZero = false;
+ const char *MaskRegName = nullptr;
-#define CASE_VSHUF(Inst, src) \
- CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
- CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
- CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
- CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \
-
-/// \brief Extracts the types and if it has memory operand for a given
-/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
-static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
- HasMemOp = false;
switch (MI->getOpcode()) {
default:
- llvm_unreachable("Unknown VSHUF64x2 family instructions.");
+ return OpMaskName;
+ CASE_MASKZ_MOVDUP(MOVDDUP, m)
+ CASE_MASKZ_MOVDUP(MOVDDUP, r)
+ CASE_MASKZ_MOVDUP(MOVSHDUP, m)
+ CASE_MASKZ_MOVDUP(MOVSHDUP, r)
+ CASE_MASKZ_MOVDUP(MOVSLDUP, m)
+ CASE_MASKZ_MOVDUP(MOVSLDUP, r)
+ CASE_MASKZ_PMOVZX(PMOVZXBD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBD, r)
+ CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBQ, r)
+ CASE_MASKZ_PMOVZX(PMOVZXBW, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBW, r)
+ CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXDQ, r)
+ CASE_MASKZ_PMOVZX(PMOVZXWD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWD, r)
+ CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWQ, r)
+ CASE_MASKZ_UNPCK(PUNPCKHBW, m)
+ CASE_MASKZ_UNPCK(PUNPCKHBW, r)
+ CASE_MASKZ_UNPCK(PUNPCKHWD, m)
+ CASE_MASKZ_UNPCK(PUNPCKHWD, r)
+ CASE_MASKZ_UNPCK(PUNPCKHDQ, m)
+ CASE_MASKZ_UNPCK(PUNPCKHDQ, r)
+ CASE_MASKZ_UNPCK(PUNPCKLBW, m)
+ CASE_MASKZ_UNPCK(PUNPCKLBW, r)
+ CASE_MASKZ_UNPCK(PUNPCKLWD, m)
+ CASE_MASKZ_UNPCK(PUNPCKLWD, r)
+ CASE_MASKZ_UNPCK(PUNPCKLDQ, m)
+ CASE_MASKZ_UNPCK(PUNPCKLDQ, r)
+ CASE_MASKZ_UNPCK(UNPCKHPD, m)
+ CASE_MASKZ_UNPCK(UNPCKHPD, r)
+ CASE_MASKZ_UNPCK(UNPCKHPS, m)
+ CASE_MASKZ_UNPCK(UNPCKHPS, r)
+ CASE_MASKZ_UNPCK(UNPCKLPD, m)
+ CASE_MASKZ_UNPCK(UNPCKLPD, r)
+ CASE_MASKZ_UNPCK(UNPCKLPS, m)
+ CASE_MASKZ_UNPCK(UNPCKLPS, r)
+ CASE_MASKZ_SHUF(PALIGNR, r)
+ CASE_MASKZ_SHUF(PALIGNR, m)
+ CASE_MASKZ_SHUF(SHUFPD, m)
+ CASE_MASKZ_SHUF(SHUFPD, r)
+ CASE_MASKZ_SHUF(SHUFPS, m)
+ CASE_MASKZ_SHUF(SHUFPS, r)
+ CASE_MASKZ_VPERMILPI(PERMILPD, m)
+ CASE_MASKZ_VPERMILPI(PERMILPD, r)
+ CASE_MASKZ_VPERMILPI(PERMILPS, m)
+ CASE_MASKZ_VPERMILPI(PERMILPS, r)
+ CASE_MASKZ_VPERMILPI(PSHUFD, m)
+ CASE_MASKZ_VPERMILPI(PSHUFD, r)
+ CASE_MASKZ_VPERMILPI(PSHUFHW, m)
+ CASE_MASKZ_VPERMILPI(PSHUFHW, r)
+ CASE_MASKZ_VPERMILPI(PSHUFLW, m)
+ CASE_MASKZ_VPERMILPI(PSHUFLW, r)
+ CASE_MASKZ_VPERM(PERMPD, m)
+ CASE_MASKZ_VPERM(PERMPD, r)
+ CASE_MASKZ_VPERM(PERMQ, m)
+ CASE_MASKZ_VPERM(PERMQ, r)
+ CASE_MASKZ_VSHUF(64X2, m)
+ CASE_MASKZ_VSHUF(64X2, r)
+ CASE_MASKZ_VSHUF(32X4, m)
+ CASE_MASKZ_VSHUF(32X4, r)
+ MaskWithZero = true;
+ MaskRegName = getRegName(MI->getOperand(1).getReg());
break;
- CASE_VSHUF(64X2, m)
- HasMemOp = true; // FALL THROUGH.
- CASE_VSHUF(64X2, r)
- VT = getRegOperandVectorVT(MI, MVT::i64, 0);
- break;
- CASE_VSHUF(32X4, m)
- HasMemOp = true; // FALL THROUGH.
- CASE_VSHUF(32X4, r)
- VT = getRegOperandVectorVT(MI, MVT::i32, 0);
+ CASE_MASK_MOVDUP(MOVDDUP, m)
+ CASE_MASK_MOVDUP(MOVDDUP, r)
+ CASE_MASK_MOVDUP(MOVSHDUP, m)
+ CASE_MASK_MOVDUP(MOVSHDUP, r)
+ CASE_MASK_MOVDUP(MOVSLDUP, m)
+ CASE_MASK_MOVDUP(MOVSLDUP, r)
+ CASE_MASK_PMOVZX(PMOVZXBD, m)
+ CASE_MASK_PMOVZX(PMOVZXBD, r)
+ CASE_MASK_PMOVZX(PMOVZXBQ, m)
+ CASE_MASK_PMOVZX(PMOVZXBQ, r)
+ CASE_MASK_PMOVZX(PMOVZXBW, m)
+ CASE_MASK_PMOVZX(PMOVZXBW, r)
+ CASE_MASK_PMOVZX(PMOVZXDQ, m)
+ CASE_MASK_PMOVZX(PMOVZXDQ, r)
+ CASE_MASK_PMOVZX(PMOVZXWD, m)
+ CASE_MASK_PMOVZX(PMOVZXWD, r)
+ CASE_MASK_PMOVZX(PMOVZXWQ, m)
+ CASE_MASK_PMOVZX(PMOVZXWQ, r)
+ CASE_MASK_UNPCK(PUNPCKHBW, m)
+ CASE_MASK_UNPCK(PUNPCKHBW, r)
+ CASE_MASK_UNPCK(PUNPCKHWD, m)
+ CASE_MASK_UNPCK(PUNPCKHWD, r)
+ CASE_MASK_UNPCK(PUNPCKHDQ, m)
+ CASE_MASK_UNPCK(PUNPCKHDQ, r)
+ CASE_MASK_UNPCK(PUNPCKLBW, m)
+ CASE_MASK_UNPCK(PUNPCKLBW, r)
+ CASE_MASK_UNPCK(PUNPCKLWD, m)
+ CASE_MASK_UNPCK(PUNPCKLWD, r)
+ CASE_MASK_UNPCK(PUNPCKLDQ, m)
+ CASE_MASK_UNPCK(PUNPCKLDQ, r)
+ CASE_MASK_UNPCK(UNPCKHPD, m)
+ CASE_MASK_UNPCK(UNPCKHPD, r)
+ CASE_MASK_UNPCK(UNPCKHPS, m)
+ CASE_MASK_UNPCK(UNPCKHPS, r)
+ CASE_MASK_UNPCK(UNPCKLPD, m)
+ CASE_MASK_UNPCK(UNPCKLPD, r)
+ CASE_MASK_UNPCK(UNPCKLPS, m)
+ CASE_MASK_UNPCK(UNPCKLPS, r)
+ CASE_MASK_SHUF(PALIGNR, r)
+ CASE_MASK_SHUF(PALIGNR, m)
+ CASE_MASK_SHUF(SHUFPD, m)
+ CASE_MASK_SHUF(SHUFPD, r)
+ CASE_MASK_SHUF(SHUFPS, m)
+ CASE_MASK_SHUF(SHUFPS, r)
+ CASE_MASK_VPERMILPI(PERMILPD, m)
+ CASE_MASK_VPERMILPI(PERMILPD, r)
+ CASE_MASK_VPERMILPI(PERMILPS, m)
+ CASE_MASK_VPERMILPI(PERMILPS, r)
+ CASE_MASK_VPERMILPI(PSHUFD, m)
+ CASE_MASK_VPERMILPI(PSHUFD, r)
+ CASE_MASK_VPERMILPI(PSHUFHW, m)
+ CASE_MASK_VPERMILPI(PSHUFHW, r)
+ CASE_MASK_VPERMILPI(PSHUFLW, m)
+ CASE_MASK_VPERMILPI(PSHUFLW, r)
+ CASE_MASK_VPERM(PERMPD, m)
+ CASE_MASK_VPERM(PERMPD, r)
+ CASE_MASK_VPERM(PERMQ, m)
+ CASE_MASK_VPERM(PERMQ, r)
+ CASE_MASK_VSHUF(64X2, m)
+ CASE_MASK_VSHUF(64X2, r)
+ CASE_MASK_VSHUF(32X4, m)
+ CASE_MASK_VSHUF(32X4, r)
+ MaskRegName = getRegName(MI->getOperand(2).getReg());
break;
}
+
+ // MASK: zmmX {%kY}
+ OpMaskName += " {%";
+ OpMaskName += MaskRegName;
+ OpMaskName += "}";
+
+ // MASKZ: zmmX {%kY} {z}
+ if (MaskWithZero)
+ OpMaskName += " {z}";
+
+ return OpMaskName;
}
//===----------------------------------------------------------------------===//
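[A small self-contained illustration, not LLVM code and with made-up names, of the decoration the new getMaskName() applies when the destination is written under an AVX512 mask: the comment destination gains a " {%kN}" suffix, plus " {z}" for the zeroing (kz) forms.]

    #include <string>

    // Build "zmm0 {%k1}" for merge-masking, "zmm0 {%k1} {z}" for zero-masking.
    static std::string decorateMaskedDest(std::string Dest,
                                          const std::string &MaskReg,
                                          bool MaskWithZero) {
      Dest += " {%" + MaskReg + "}";
      if (MaskWithZero)
        Dest += " {z}";
      return Dest;
    }
    // decorateMaskedDest("zmm0", "k1", false) -> "zmm0 {%k1}"
    // decorateMaskedDest("zmm0", "k1", true)  -> "zmm0 {%k1} {z}"
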
@@ -208,6 +370,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// If this is a shuffle operation, the switch should fill in this state.
SmallVector<int, 8> ShuffleMask;
const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+ unsigned NumOperands = MI->getNumOperands();
+ bool RegForm = false;
switch (MI->getOpcode()) {
default:
@@ -222,9 +386,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::BLENDPDrmi:
case X86::VBLENDPDrmi:
case X86::VBLENDPDYrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -238,9 +402,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::BLENDPSrmi:
case X86::VBLENDPSrmi:
case X86::VBLENDPSYrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -254,9 +418,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PBLENDWrmi:
case X86::VPBLENDWrmi:
case X86::VPBLENDWYrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -268,9 +432,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::VPBLENDDrmi:
case X86::VPBLENDDYrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -278,14 +442,16 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
+ case X86::VINSERTPSzrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::INSERTPSrm:
case X86::VINSERTPSrm:
+ case X86::VINSERTPSzrm:
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -307,8 +473,40 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeMOVHLPSMask(2, ShuffleMask);
break;
+ case X86::MOVHPDrm:
+ case X86::VMOVHPDrm:
+ case X86::VMOVHPDZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+ break;
+
+ case X86::MOVHPSrm:
+ case X86::VMOVHPSrm:
+ case X86::VMOVHPSZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+ break;
+
+ case X86::MOVLPDrm:
+ case X86::VMOVLPDrm:
+ case X86::VMOVLPDZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+ break;
+
+ case X86::MOVLPSrm:
+ case X86::VMOVLPSrm:
+ case X86::VMOVLPSZ128rm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+ break;
+
CASE_MOVDUP(MOVSLDUP, r)
- Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
// FALL THROUGH.
CASE_MOVDUP(MOVSLDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
@@ -316,7 +514,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
CASE_MOVDUP(MOVSHDUP, r)
- Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
// FALL THROUGH.
CASE_MOVDUP(MOVSHDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
@@ -324,7 +522,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
CASE_MOVDUP(MOVDDUP, r)
- Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
// FALL THROUGH.
CASE_MOVDUP(MOVDDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
@@ -334,83 +532,80 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSLLDQri:
case X86::VPSLLDQri:
case X86::VPSLLDQYri:
+ case X86::VPSLLDQZ128rr:
+ case X86::VPSLLDQZ256rr:
+ case X86::VPSLLDQZ512rr:
Src1Name = getRegName(MI->getOperand(1).getReg());
+ case X86::VPSLLDQZ128rm:
+ case X86::VPSLLDQZ256rm:
+ case X86::VPSLLDQZ512rm:
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
case X86::PSRLDQri:
case X86::VPSRLDQri:
case X86::VPSRLDQYri:
+ case X86::VPSRLDQZ128rr:
+ case X86::VPSRLDQZ256rr:
+ case X86::VPSRLDQZ512rr:
Src1Name = getRegName(MI->getOperand(1).getReg());
+ case X86::VPSRLDQZ128rm:
+ case X86::VPSRLDQZ256rm:
+ case X86::VPSRLDQZ512rm:
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
- case X86::PALIGNR128rr:
- case X86::VPALIGNR128rr:
- case X86::VPALIGNR256rr:
- Src1Name = getRegName(MI->getOperand(2).getReg());
+ CASE_SHUF(PALIGNR, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
// FALL THROUGH.
- case X86::PALIGNR128rm:
- case X86::VPALIGNR128rm:
- case X86::VPALIGNR256rm:
- Src2Name = getRegName(MI->getOperand(1).getReg());
+ CASE_SHUF(PALIGNR, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
- case X86::PSHUFDri:
- case X86::VPSHUFDri:
- case X86::VPSHUFDYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_SHUF(PSHUFD, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- case X86::PSHUFDmi:
- case X86::VPSHUFDmi:
- case X86::VPSHUFDYmi:
+ CASE_SHUF(PSHUFD, mi)
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
- case X86::PSHUFHWri:
- case X86::VPSHUFHWri:
- case X86::VPSHUFHWYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_SHUF(PSHUFHW, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- case X86::PSHUFHWmi:
- case X86::VPSHUFHWmi:
- case X86::VPSHUFHWYmi:
+ CASE_SHUF(PSHUFHW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
- case X86::PSHUFLWri:
- case X86::VPSHUFLWri:
- case X86::VPSHUFLWYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_SHUF(PSHUFLW, ri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- case X86::PSHUFLWmi:
- case X86::VPSHUFLWmi:
- case X86::VPSHUFLWYmi:
+ CASE_SHUF(PSHUFLW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -419,9 +614,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::MMX_PSHUFWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(MVT::v4i16,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
break;
@@ -435,188 +630,204 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKHBW, r)
case X86::MMX_PUNPCKHBWirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKHBW, m)
case X86::MMX_PUNPCKHBWirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKHWD, r)
case X86::MMX_PUNPCKHWDirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKHWD, m)
case X86::MMX_PUNPCKHWDirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKHDQ, r)
case X86::MMX_PUNPCKHDQirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKHDQ, m)
case X86::MMX_PUNPCKHDQirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKHQDQ, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKHQDQ, m)
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKLBW, r)
case X86::MMX_PUNPCKLBWirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKLBW, m)
case X86::MMX_PUNPCKLBWirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKLWD, r)
case X86::MMX_PUNPCKLWDirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKLWD, m)
case X86::MMX_PUNPCKLWDirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKLDQ, r)
case X86::MMX_PUNPCKLDQirr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKLDQ, m)
case X86::MMX_PUNPCKLDQirm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
break;
CASE_UNPCK(PUNPCKLQDQ, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(PUNPCKLQDQ, m)
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
break;
- CASE_SHUF(SHUFPD, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ CASE_SHUF(SHUFPD, rri)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
// FALL THROUGH.
- CASE_SHUF(SHUFPD, m)
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ CASE_SHUF(SHUFPD, rmi)
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_SHUF(SHUFPS, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ CASE_SHUF(SHUFPS, rri)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
// FALL THROUGH.
- CASE_SHUF(SHUFPS, m)
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ CASE_SHUF(SHUFPS, rmi)
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_VSHUF(64X2, r)
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ // FALL THROUGH.
CASE_VSHUF(64X2, m)
+ decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
CASE_VSHUF(32X4, r)
- CASE_VSHUF(32X4, m) {
- MVT VT;
- bool HasMemOp;
- unsigned NumOp = MI->getNumOperands();
- getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
- decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
+ Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ // FALL THROUGH.
+ CASE_VSHUF(32X4, m)
+ decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- if (HasMemOp) {
- assert((NumOp >= 8) && "Expected at least 8 operands!");
- Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
- } else {
- assert((NumOp >= 4) && "Expected at least 4 operands!");
- Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
- Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
- }
break;
- }
CASE_UNPCK(UNPCKLPD, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(UNPCKLPD, m)
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_UNPCK(UNPCKLPS, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(UNPCKLPS, m)
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_UNPCK(UNPCKHPD, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(UNPCKHPD, m)
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
CASE_UNPCK(UNPCKHPS, r)
- Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ RegForm = true;
// FALL THROUGH.
CASE_UNPCK(UNPCKHPS, m)
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_VPERM(PERMILPS, r)
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_VPERMILPI(PERMILPS, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- CASE_VPERM(PERMILPS, m)
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ CASE_VPERMILPI(PERMILPS, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_VPERM(PERMILPD, r)
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_VPERMILPI(PERMILPD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- CASE_VPERM(PERMILPD, m)
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ CASE_VPERMILPI(PERMILPD, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -628,44 +839,58 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPERM2F128rm:
case X86::VPERM2I128rm:
// For instruction comments purpose, assume the 256-bit vector is v4i64.
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ if (MI->getOperand(NumOperands - 1).isImm())
DecodeVPERM2X128Mask(MVT::v4i64,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VPERMQYri:
- case X86::VPERMPDYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_VPERM(PERMPD, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
// FALL THROUGH.
- case X86::VPERMQYmi:
- case X86::VPERMPDYmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeVPERMMask(MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ CASE_VPERM(PERMPD, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_VPERM(PERMQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ // FALL THROUGH.
+ CASE_VPERM(PERMQ, m)
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVSDrr:
case X86::VMOVSDrr:
+ case X86::VMOVSDZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::MOVSDrm:
case X86::VMOVSDrm:
+ case X86::VMOVSDZrm:
DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::MOVSSrr:
case X86::VMOVSSrr:
+ case X86::VMOVSSZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::MOVSSrm:
case X86::VMOVSSrm:
+ case X86::VMOVSSZrm:
DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -681,6 +906,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVZQI2PQIrm:
case X86::MOVZPQILo2PQIrm:
case X86::VMOVQI2PQIrm:
+ case X86::VMOVQI2PQIZrm:
case X86::VMOVZQI2PQIrm:
case X86::VMOVZPQILo2PQIrm:
case X86::VMOVZPQILo2PQIZrm:
@@ -690,6 +916,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVDI2PDIrm:
case X86::VMOVDI2PDIrm:
+ case X86::VMOVDI2PDIZrm:
DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -717,49 +944,41 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(2).getReg());
break;
- case X86::PMOVZXBWrr:
- case X86::PMOVZXBDrr:
- case X86::PMOVZXBQrr:
- case X86::PMOVZXWDrr:
- case X86::PMOVZXWQrr:
- case X86::PMOVZXDQrr:
- case X86::VPMOVZXBWrr:
- case X86::VPMOVZXBDrr:
- case X86::VPMOVZXBQrr:
- case X86::VPMOVZXWDrr:
- case X86::VPMOVZXWQrr:
- case X86::VPMOVZXDQrr:
- case X86::VPMOVZXBWYrr:
- case X86::VPMOVZXBDYrr:
- case X86::VPMOVZXBQYrr:
- case X86::VPMOVZXWDYrr:
- case X86::VPMOVZXWQYrr:
- case X86::VPMOVZXDQYrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ case X86::VBROADCASTF128:
+ case X86::VBROADCASTI128:
+ DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXBW, r)
+ CASE_PMOVZX(PMOVZXBD, r)
+ CASE_PMOVZX(PMOVZXBQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ // FALL THROUGH.
+ CASE_PMOVZX(PMOVZXBW, m)
+ CASE_PMOVZX(PMOVZXBD, m)
+ CASE_PMOVZX(PMOVZXBQ, m)
+ DecodeZeroExtendMask(MVT::i8, getZeroExtensionResultType(MI), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXWD, r)
+ CASE_PMOVZX(PMOVZXWQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
// FALL THROUGH.
- case X86::PMOVZXBWrm:
- case X86::PMOVZXBDrm:
- case X86::PMOVZXBQrm:
- case X86::PMOVZXWDrm:
- case X86::PMOVZXWQrm:
- case X86::PMOVZXDQrm:
- case X86::VPMOVZXBWrm:
- case X86::VPMOVZXBDrm:
- case X86::VPMOVZXBQrm:
- case X86::VPMOVZXWDrm:
- case X86::VPMOVZXWQrm:
- case X86::VPMOVZXDQrm:
- case X86::VPMOVZXBWYrm:
- case X86::VPMOVZXBDYrm:
- case X86::VPMOVZXBQYrm:
- case X86::VPMOVZXWDYrm:
- case X86::VPMOVZXWQYrm:
- case X86::VPMOVZXDQYrm: {
- MVT SrcVT, DstVT;
- getZeroExtensionTypes(MI, SrcVT, DstVT);
- DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
- DestName = getRegName(MI->getOperand(0).getReg());
- } break;
+ CASE_PMOVZX(PMOVZXWD, m)
+ CASE_PMOVZX(PMOVZXWQ, m)
+ DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ CASE_PMOVZX(PMOVZXDQ, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ // FALL THROUGH.
+ CASE_PMOVZX(PMOVZXDQ, m)
+ DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
}
// The only comments we decode are shuffles, so give up if we were unable to
@@ -768,7 +987,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
return false;
if (!DestName) DestName = Src1Name;
- OS << (DestName ? DestName : "mem") << " = ";
+ OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = ";
// If the two sources are the same, canonicalize the input elements to be
// from the first src so that we get larger element spans.
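[The repeated NumOperands - (RegForm ? 3 : 7) and (RegForm ? 2 : 6) indexing above counts operands from the end of the MCInst because an x86 memory reference expands to five operands (base, scale, index, displacement, segment). A hypothetical helper capturing that arithmetic, under the simplifying assumption of a two-source shuffle with an optional trailing immediate:]

    // Not LLVM code: a model of the "count from the end" operand indexing.
    // In the register form the operands are [dst, (mask,) src1, src2(, imm)];
    // in the memory form src2 is replaced by 5 memory operands, pushing src1
    // 4 slots further from the end of the list.
    static unsigned src1IndexFromEnd(unsigned NumOperands, bool RegForm,
                                     bool HasImmediate) {
      unsigned FromEnd = HasImmediate ? 3 : 2; // src2 (and imm) follow src1
      if (!RegForm)
        FromEnd += 4;                          // 5 memory operands minus src2
      return NumOperands - FromEnd;
    }
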
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 129c28d804ef..33df9ec7dcde 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -7,6 +7,4 @@ add_llvm_library(LLVMX86Desc
X86ELFObjectWriter.cpp
X86WinCOFFStreamer.cpp
X86WinCOFFObjectWriter.cpp
- X86MachORelocationInfo.cpp
- X86ELFRelocationInfo.cpp
)
diff --git a/lib/Target/X86/MCTargetDesc/Makefile b/lib/Target/X86/MCTargetDesc/Makefile
deleted file mode 100644
index b19774ee379e..000000000000
--- a/lib/Target/X86/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/X86/TargetDesc/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86Desc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 135c32bf8c3b..e77a0dc9bc27 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -21,7 +21,7 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
@@ -43,8 +43,11 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
return 1;
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
case X86::reloc_global_offset_table:
case FK_SecRel_4:
case FK_Data_4:
@@ -72,7 +75,8 @@ class X86AsmBackend : public MCAsmBackend {
const uint64_t MaxNopLength;
public:
X86AsmBackend(const Target &T, StringRef CPU)
- : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 7 : 15) {
+ : MCAsmBackend(), CPU(CPU),
+ MaxNopLength((CPU == "slm" || CPU == "lakemont") ? 7 : 15) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
@@ -86,10 +90,14 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
- { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
- { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
- { "reloc_signed_4byte", 0, 4 * 8, 0},
- { "reloc_global_offset_table", 0, 4 * 8, 0}
+ {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_signed_4byte", 0, 32, 0},
+ {"reloc_signed_4byte_relax", 0, 32, 0},
+ {"reloc_global_offset_table", 0, 32, 0},
+ {"reloc_global_offset_table8", 0, 64, 0},
};
if (Kind < FirstTargetFixupKind)
@@ -124,38 +132,57 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override;
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
} // end anonymous namespace
-static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
+ unsigned Op = Inst.getOpcode();
switch (Op) {
default:
return Op;
-
- case X86::JAE_1: return X86::JAE_4;
- case X86::JA_1: return X86::JA_4;
- case X86::JBE_1: return X86::JBE_4;
- case X86::JB_1: return X86::JB_4;
- case X86::JE_1: return X86::JE_4;
- case X86::JGE_1: return X86::JGE_4;
- case X86::JG_1: return X86::JG_4;
- case X86::JLE_1: return X86::JLE_4;
- case X86::JL_1: return X86::JL_4;
- case X86::JMP_1: return X86::JMP_4;
- case X86::JNE_1: return X86::JNE_4;
- case X86::JNO_1: return X86::JNO_4;
- case X86::JNP_1: return X86::JNP_4;
- case X86::JNS_1: return X86::JNS_4;
- case X86::JO_1: return X86::JO_4;
- case X86::JP_1: return X86::JP_4;
- case X86::JS_1: return X86::JS_4;
+ case X86::JAE_1:
+ return (is16BitMode) ? X86::JAE_2 : X86::JAE_4;
+ case X86::JA_1:
+ return (is16BitMode) ? X86::JA_2 : X86::JA_4;
+ case X86::JBE_1:
+ return (is16BitMode) ? X86::JBE_2 : X86::JBE_4;
+ case X86::JB_1:
+ return (is16BitMode) ? X86::JB_2 : X86::JB_4;
+ case X86::JE_1:
+ return (is16BitMode) ? X86::JE_2 : X86::JE_4;
+ case X86::JGE_1:
+ return (is16BitMode) ? X86::JGE_2 : X86::JGE_4;
+ case X86::JG_1:
+ return (is16BitMode) ? X86::JG_2 : X86::JG_4;
+ case X86::JLE_1:
+ return (is16BitMode) ? X86::JLE_2 : X86::JLE_4;
+ case X86::JL_1:
+ return (is16BitMode) ? X86::JL_2 : X86::JL_4;
+ case X86::JMP_1:
+ return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
+ case X86::JNE_1:
+ return (is16BitMode) ? X86::JNE_2 : X86::JNE_4;
+ case X86::JNO_1:
+ return (is16BitMode) ? X86::JNO_2 : X86::JNO_4;
+ case X86::JNP_1:
+ return (is16BitMode) ? X86::JNP_2 : X86::JNP_4;
+ case X86::JNS_1:
+ return (is16BitMode) ? X86::JNS_2 : X86::JNS_4;
+ case X86::JO_1:
+ return (is16BitMode) ? X86::JO_2 : X86::JO_4;
+ case X86::JP_1:
+ return (is16BitMode) ? X86::JP_2 : X86::JP_4;
+ case X86::JS_1:
+ return (is16BitMode) ? X86::JS_2 : X86::JS_4;
}
}
-static unsigned getRelaxedOpcodeArith(unsigned Op) {
+static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
+ unsigned Op = Inst.getOpcode();
switch (Op) {
default:
return Op;
@@ -239,20 +266,20 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) {
}
}
-static unsigned getRelaxedOpcode(unsigned Op) {
- unsigned R = getRelaxedOpcodeArith(Op);
- if (R != Op)
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
+ unsigned R = getRelaxedOpcodeArith(Inst);
+ if (R != Inst.getOpcode())
return R;
- return getRelaxedOpcodeBranch(Op);
+ return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
- // Branches can always be relaxed.
- if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+ // Branches can always be relaxed in either mode.
+ if (getRelaxedOpcodeBranch(Inst, false) != Inst.getOpcode())
return true;
// Check if this instruction is ever relaxable.
- if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+ if (getRelaxedOpcodeArith(Inst) == Inst.getOpcode())
return false;
@@ -275,9 +302,12 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
// FIXME: Can tblgen help at all here to verify there aren't other instructions
// we can relax?
-void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+void X86AsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
// The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
- unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+ bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode);
if (RelaxedOp == Inst.getOpcode()) {
SmallString<256> Tmp;
@@ -405,6 +435,14 @@ public:
, Is64Bit(is64Bit) {
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
+ return StringSwitch<Optional<MCFixupKind>>(Name)
+ .Case("dir32", FK_Data_4)
+ .Case("secrel32", FK_SecRel_4)
+ .Case("secidx", FK_SecRel_2)
+ .Default(MCAsmBackend::getFixupKind(Name));
+ }
+
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86WinCOFFObjectWriter(OS, Is64Bit);
}
@@ -803,7 +841,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
if (TheTriple.isOSBinFormatMachO())
return new DarwinX86_32AsmBackend(T, MRI, CPU);
- if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, false, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
@@ -826,7 +864,7 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
}
- if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
+ if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
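[A toy, self-contained model of the mode-dependent branch relaxation introduced above; the opcode names here are stand-ins, not X86's enum. A 1-byte-displacement jump widens to its 2-byte form in 16-bit mode and to its 4-byte form otherwise, while arithmetic relaxation stays mode-independent.]

    #include <cstdio>

    enum ToyOpcode { JMP_1, JMP_2, JMP_4, JE_1, JE_2, JE_4 };

    // Widen a short branch according to the current code-size mode.
    static ToyOpcode relaxBranch(ToyOpcode Op, bool Is16BitMode) {
      switch (Op) {
      default:    return Op; // not a relaxable branch
      case JMP_1: return Is16BitMode ? JMP_2 : JMP_4;
      case JE_1:  return Is16BitMode ? JE_2 : JE_4;
      }
    }

    int main() {
      std::printf("%d %d\n", relaxBranch(JMP_1, true), relaxBranch(JMP_1, false));
      return 0;
    }
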
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 9ff85b9154f8..b4195176f904 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -186,11 +186,6 @@ namespace X86II {
/// dllimport linkage on windows.
MO_DLLIMPORT,
- /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the "FOO$stub" symbol. This is used for calls
- /// and jumps to external functions on Tiger and earlier.
- MO_DARWIN_STUB,
-
/// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
/// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
/// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
@@ -201,12 +196,6 @@ namespace X86II {
/// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
MO_DARWIN_NONLAZY_PIC_BASE,
- /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
- /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
- /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
- /// stub.
- MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
-
/// MO_TLVP - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
@@ -667,7 +656,7 @@ namespace X86II {
/// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
/// counted as one operand.
///
- inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ inline int getMemoryOperandNo(uint64_t TSFlags) {
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
bool HasMemOp4 = TSFlags & X86II::MemOp4;
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
@@ -734,12 +723,12 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
inline bool isX86_64ExtendedReg(unsigned RegNo) {
- if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
- (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
- (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
- (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
- (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
- (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+ if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM15) ||
+ (RegNo >= X86::XMM24 && RegNo <= X86::XMM31) ||
+ (RegNo >= X86::YMM8 && RegNo <= X86::YMM15) ||
+ (RegNo >= X86::YMM24 && RegNo <= X86::YMM31) ||
+ (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM15) ||
+ (RegNo >= X86::ZMM24 && RegNo <= X86::ZMM31))
return true;
switch (RegNo) {
@@ -762,9 +751,9 @@ namespace X86II {
/// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
/// registers? e.g. zmm21, etc.
static inline bool is32ExtendedReg(unsigned RegNo) {
- return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
- (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
- (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+ return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) ||
+ (RegNo >= X86::YMM16 && RegNo <= X86::YMM31) ||
+ (RegNo >= X86::ZMM16 && RegNo <= X86::ZMM31));
}
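[The register-range rewrite above is behavior-preserving on a contiguous register enum; it simply states each span inclusively (XMM8..XMM15, XMM24..XMM31) instead of comparing against the register just below the range. A hypothetical helper showing the inclusive form:]

    // Illustrative only: inclusive range test over a contiguous register enum.
    static bool inRegRange(unsigned RegNo, unsigned First, unsigned Last) {
      return First <= RegNo && RegNo <= Last;
    }
    // The extended-XMM check would then read:
    //   inRegRange(RegNo, X86::XMM8, X86::XMM15) ||
    //   inRegRange(RegNo, X86::XMM24, X86::XMM31)
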
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 736c39dfb6f1..da69da51df10 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -9,6 +9,8 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCValue.h"
@@ -25,8 +27,8 @@ namespace {
~X86ELFObjectWriter() override;
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
}
@@ -56,6 +58,7 @@ static X86_64RelType getType64(unsigned Kind,
case FK_Data_8:
return RT64_64;
case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel)
return RT64_32S;
return RT64_32;
@@ -66,6 +69,8 @@ static X86_64RelType getType64(unsigned Kind,
case FK_Data_4:
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
case X86::reloc_riprel_4byte_movq_load:
return RT64_32;
case FK_PCRel_2:
@@ -77,8 +82,16 @@ static X86_64RelType getType64(unsigned Kind,
}
}
-static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier,
- X86_64RelType Type, bool IsPCRel) {
+static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_32)
+ Ctx.reportError(Loc,
+ "32 bit reloc applied to a field with a different size");
+}
+
+static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_64RelType Type, bool IsPCRel,
+ unsigned Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -146,21 +159,38 @@ static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier,
case RT64_8:
llvm_unreachable("Unimplemented");
}
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_X86_64_TLSDESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_X86_64_GOTPC32_TLSDESC;
case MCSymbolRefExpr::VK_TLSGD:
- assert(Type == RT64_32);
+ checkIs32(Ctx, Loc, Type);
return ELF::R_X86_64_TLSGD;
case MCSymbolRefExpr::VK_GOTTPOFF:
- assert(Type == RT64_32);
+ checkIs32(Ctx, Loc, Type);
return ELF::R_X86_64_GOTTPOFF;
case MCSymbolRefExpr::VK_TLSLD:
- assert(Type == RT64_32);
+ checkIs32(Ctx, Loc, Type);
return ELF::R_X86_64_TLSLD;
case MCSymbolRefExpr::VK_PLT:
- assert(Type == RT64_32);
+ checkIs32(Ctx, Loc, Type);
return ELF::R_X86_64_PLT32;
case MCSymbolRefExpr::VK_GOTPCREL:
- assert(Type == RT64_32);
- return ELF::R_X86_64_GOTPCREL;
+ checkIs32(Ctx, Loc, Type);
+ // Older versions of ld.bfd/ld.gold/lld
+ // do not support GOTPCRELX/REX_GOTPCRELX,
+ // and we want to keep back-compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_X86_64_GOTPCREL;
+ switch (Kind) {
+ default:
+ return ELF::R_X86_64_GOTPCREL;
+ case X86::reloc_riprel_4byte_relax:
+ return ELF::R_X86_64_GOTPCRELX;
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ return ELF::R_X86_64_REX_GOTPCRELX;
+ }
}
}
@@ -181,8 +211,10 @@ static X86_32RelType getType32(X86_64RelType T) {
llvm_unreachable("unexpected relocation type!");
}
-static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier,
- X86_32RelType Type, bool IsPCRel) {
+static unsigned getRelocType32(MCContext &Ctx,
+ MCSymbolRefExpr::VariantKind Modifier,
+ X86_32RelType Type, bool IsPCRel,
+ unsigned Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -197,7 +229,15 @@ static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier,
}
case MCSymbolRefExpr::VK_GOT:
assert(Type == RT32_32);
- return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32;
+ if (IsPCRel)
+ return ELF::R_386_GOTPC;
+ // Older versions of ld.bfd/ld.gold/lld do not support R_386_GOT32X and we
+ // want to maintain compatibility.
+ if (!Ctx.getAsmInfo()->canRelaxRelocations())
+ return ELF::R_386_GOT32;
+
+ return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X
+ : ELF::R_386_GOT32;
case MCSymbolRefExpr::VK_GOTOFF:
assert(Type == RT32_32);
assert(!IsPCRel);
@@ -240,17 +280,18 @@ static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier,
}
}
-unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
- X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel);
+ unsigned Kind = Fixup.getKind();
+ X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
if (getEMachine() == ELF::EM_X86_64)
- return getRelocType64(Modifier, Type, IsPCRel);
+ return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
"Unsupported ELF machine type.");
- return getRelocType32(Modifier, getType32(Type), IsPCRel);
+ return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
}
MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS,
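[A condensed sketch, illustrative rather than the writer's actual interface, of the GOT-relative relocation choice the 64-bit path now makes: emit plain R_X86_64_GOTPCREL unless the assembler may relax relocations and the fixup kind marks the access as relaxable, in which case REX-prefixed loads (including the movq-load fixup) get R_X86_64_REX_GOTPCRELX and the rest get R_X86_64_GOTPCRELX.]

    // Not LLVM's API: a reduced model of the GOTPCREL relocation selection.
    enum GotReloc { GOTPCREL, GOTPCRELX, REX_GOTPCRELX };

    static GotReloc pickGotReloc(bool CanRelaxRelocations, bool RelaxableFixup,
                                 bool RexEncodedLoad) {
      if (!CanRelaxRelocations || !RelaxableFixup)
        return GOTPCREL;
      return RexEncodedLoad ? REX_GOTPCRELX : GOTPCRELX;
    }
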
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
deleted file mode 100644
index ddb764facdbf..000000000000
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Support/ELF.h"
-
-using namespace llvm;
-using namespace object;
-using namespace ELF;
-
-namespace {
-class X86_64ELFRelocationInfo : public MCRelocationInfo {
-public:
- X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
-
- const MCExpr *createExprForRelocation(RelocationRef Rel) override {
- uint64_t RelType = Rel.getType();
- elf_symbol_iterator SymI = Rel.getSymbol();
-
- ErrorOr<StringRef> SymNameOrErr = SymI->getName();
- if (std::error_code EC = SymNameOrErr.getError())
- report_fatal_error(EC.message());
- StringRef SymName = *SymNameOrErr;
-
- ErrorOr<uint64_t> SymAddr = SymI->getAddress();
- if (std::error_code EC = SymAddr.getError())
- report_fatal_error(EC.message());
- uint64_t SymSize = SymI->getSize();
- int64_t Addend = *ELFRelocationRef(Rel).getAddend();
-
- MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
- // FIXME: check that the value is actually the same.
- if (!Sym->isVariable())
- Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx));
-
- const MCExpr *Expr = nullptr;
- // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
- bool hasAddend = false;
-
- // The AMD64 SysV ABI says:
- // A: the addend used to compute the value of the relocatable field.
- // B: the base address at which a shared object has been loaded into memory
- // during execution. Generally, a shared object is built with a 0 base
- // virtual address, but the execution address will be different.
- // G: the offset into the global offset table at which the relocation
- // entry's symbol will reside during execution.
- // GOT: the address of the global offset table.
- // L: the place (section offset or address) of the Procedure Linkage Table
- // entry for a symbol.
- // P: the place (section offset or address) of the storage unit being
- // relocated (computed using r_offset).
- // S: the value of the symbol whose index resides in the relocation entry.
- // Z: the size of the symbol whose index resides in the relocation entry.
-
- switch(RelType) {
- case R_X86_64_NONE:
- case R_X86_64_COPY:
- // none
- break;
- case R_X86_64_64:
- case R_X86_64_16:
- case R_X86_64_8:
- // S + A
- case R_X86_64_32:
- case R_X86_64_32S:
- // S + A (We don't care about the result not fitting in 32 bits.)
- case R_X86_64_PC32:
- case R_X86_64_PC16:
- case R_X86_64_PC8:
- case R_X86_64_PC64:
- // S + A - P (P/pcrel is implicit)
- hasAddend = true;
- Expr = MCSymbolRefExpr::create(Sym, Ctx);
- break;
- case R_X86_64_GOT32:
- case R_X86_64_GOT64:
- case R_X86_64_GOTPC32:
- case R_X86_64_GOTPC64:
- case R_X86_64_GOTPLT64:
- // G + A
- hasAddend = true;
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
- break;
- case R_X86_64_PLT32:
- // L + A - P -> S@PLT + A
- hasAddend = true;
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
- break;
- case R_X86_64_GLOB_DAT:
- case R_X86_64_JUMP_SLOT:
- // S
- Expr = MCSymbolRefExpr::create(Sym, Ctx);
- break;
- case R_X86_64_GOTPCREL:
- case R_X86_64_GOTPCREL64:
- // G + GOT + A - P -> S@GOTPCREL + A
- hasAddend = true;
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
- break;
- case R_X86_64_GOTOFF64:
- // S + A - GOT
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
- break;
- case R_X86_64_PLTOFF64:
- // L + A - GOT
- break;
- case R_X86_64_SIZE32:
- case R_X86_64_SIZE64:
- // Z + A
- Expr = MCConstantExpr::create(SymSize, Ctx);
- break;
- default:
- Expr = MCSymbolRefExpr::create(Sym, Ctx);
- break;
- }
- if (Expr && hasAddend && Addend != 0)
- Expr = MCBinaryExpr::createAdd(Expr,
- MCConstantExpr::create(Addend, Ctx),
- Ctx);
- return Expr;
- }
-};
-} // End unnamed namespace
-
-/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo.
-MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
- // We only handle x86-64 for now.
- return new X86_64ELFRelocationInfo(Ctx);
-}
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 4899900dcef9..dfdc9ec29aec 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -17,9 +17,15 @@ namespace X86 {
enum Fixups {
reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq
+ reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable
+ // instruction
+ reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable
+ // instruction with rex prefix
reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4
// this will be sign extended at
// runtime.
+ reloc_signed_4byte_relax, // like reloc_signed_4byte, but
+ // in a relaxable instruction.
reloc_global_offset_table, // 32-bit, relative to the start
// of the instruction. Used only
// for _GLOBAL_OFFSET_TABLE_.
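The new kinds are handled exactly like the existing rip-relative kinds everywhere except relocation selection; the Mach-O writer's isFixupKindRIPRel further down in this patch is one example. A minimal standalone mirror of that classification, using placeholder enum values (the real kinds are offset by FirstTargetFixupKind, which is omitted here):

#include <cassert>

enum X86Fixup {
  reloc_riprel_4byte,
  reloc_riprel_4byte_movq_load,
  reloc_riprel_4byte_relax,
  reloc_riprel_4byte_relax_rex,
  reloc_signed_4byte,
  reloc_signed_4byte_relax,
  reloc_global_offset_table
};

// All four rip-relative kinds are classified alike; only relocation
// selection cares which relaxable form produced the fixup.
static bool isRipRelFixup(unsigned Kind) {
  return Kind == reloc_riprel_4byte ||
         Kind == reloc_riprel_4byte_movq_load ||
         Kind == reloc_riprel_4byte_relax ||
         Kind == reloc_riprel_4byte_relax_rex;
}

int main() {
  assert(isRipRelFixup(reloc_riprel_4byte_relax_rex));
  assert(!isRipRelFixup(reloc_signed_4byte_relax));
  return 0;
}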
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index fc0b0f89e23d..b7c56cec2db8 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -35,7 +35,7 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
clEnumValEnd));
static cl::opt<bool>
-MarkedJTDataRegions("mark-data-regions", cl::init(false),
+MarkedJTDataRegions("mark-data-regions", cl::init(true),
cl::desc("Mark code section jump table data regions."),
cl::Hidden);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index dfab6ec10775..96c2e81c332a 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -76,36 +76,16 @@ public:
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
}
- // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
- // 0-7 and the difference between the 2 groups is given by the REX prefix.
- // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
- // in 1's complement form, example:
- //
- // ModRM field => XMM9 => 1
- // VEX.VVVV => XMM9 => ~9
- //
- // See table 4-35 of Intel AVX Programming Reference for details.
- unsigned char getVEXRegisterEncoding(const MCInst &MI,
- unsigned OpNum) const {
- unsigned SrcReg = MI.getOperand(OpNum).getReg();
- unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
- if (X86II::isX86_64ExtendedReg(SrcReg))
- SrcRegNum |= 8;
-
- // The registers represented through VEX_VVVV should
- // be encoded in 1's complement form.
- return (~SrcRegNum) & 0xf;
+ unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(
+ MI.getOperand(OpNum).getReg());
}
- unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
- unsigned OpNum) const {
- assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
- "Invalid mask register as write-mask!");
- unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
- return MaskRegNum;
+ bool isX86_64ExtendedReg(const MCInst &MI, unsigned OpNum) const {
+ return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
}
- void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+ void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
OS << (char)C;
++CurByte;
}
@@ -125,8 +105,8 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
int ImmOffset = 0) const;
- inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
- unsigned RM) {
+ inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
return RM | (RegOpcode << 3) | (Mod << 6);
}
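ModRMByte packs the three ModRM fields into fixed bit positions: mod in bits 7:6, reg/opcode in 5:3, r/m in 2:0. A quick self-contained check of the same packing:

#include <cassert>
#include <cstdint>

// Same field layout as ModRMByte above: mod[7:6] | reg[5:3] | rm[2:0].
static uint8_t modRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM fields out of range");
  return RM | (RegOpcode << 3) | (Mod << 6);
}

int main() {
  // mod=3 (register-direct), reg=2, rm=1 -> 0b11'010'001 == 0xD1.
  assert(modRM(3, 2, 1) == 0xD1);
  // mod=0, rm=5 is the disp32 / rip-relative form used elsewhere in this file.
  assert(modRM(0, 0, 5) == 0x05);
  return 0;
}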
@@ -142,11 +122,9 @@ public:
EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
}
-
- void EmitMemModRMByte(const MCInst &MI, unsigned Op,
- unsigned RegOpcodeField,
- uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
+ void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
+ uint64_t TSFlags, bool Rex, unsigned &CurByte,
+ raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -160,10 +138,12 @@ public:
void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
const MCInst &MI, raw_ostream &OS) const;
- void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
const MCInst &MI, const MCInstrDesc &Desc,
- const MCSubtargetInfo &STI,
- raw_ostream &OS) const;
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
+
+ uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+ int MemOperand, const MCInstrDesc &Desc) const;
};
} // end anonymous namespace
@@ -177,7 +157,7 @@ MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
/// isDisp8 - Return true if this signed displacement fits in an 8-bit
/// sign-extended field.
static bool isDisp8(int Value) {
- return Value == (signed char)Value;
+ return Value == (int8_t)Value;
}
/// isCDisp8 - Return true if this signed displacement fits in an 8-bit
@@ -198,7 +178,7 @@ static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
if (Value & Mask) // Unaligned offset
return false;
Value /= (int)CD8_Scale;
- bool Ret = (Value == (signed char)Value);
+ bool Ret = (Value == (int8_t)Value);
if (Ret)
CValue = Value;
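Both displacement checks come down to an int8_t round-trip; the EVEX compressed form (disp8*N) additionally requires the displacement to be a multiple of the scale before the quotient is tested. A minimal sketch, with an explicit Scale parameter standing in for the CD8_Scale factor decoded from TSFlags:

#include <cassert>
#include <cstdint>

// Plain 8-bit signed displacement test, as in isDisp8 above.
static bool fitsDisp8(int Value) {
  return Value == (int8_t)Value;
}

// Compressed displacement: the encoded byte is Value / Scale, so the
// displacement must be Scale-aligned and the quotient must fit in 8 bits.
static bool fitsCompressedDisp8(int Value, int Scale, int &CValue) {
  if (Value % Scale != 0)      // unaligned offset
    return false;
  Value /= Scale;
  if (!fitsDisp8(Value))
    return false;
  CValue = Value;
  return true;
}

int main() {
  assert(fitsDisp8(-128) && fitsDisp8(127) && !fitsDisp8(128));
  int C;
  assert(fitsCompressedDisp8(512, 64, C) && C == 8);   // 512 == 64 * 8
  assert(!fitsCompressedDisp8(100, 64, C));            // not 64-aligned
  return 0;
}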
@@ -231,6 +211,10 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
(IndexReg.getReg() != 0 &&
X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
return true;
+ if (BaseReg.getReg() == X86::EIP) {
+ assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
+ return true;
+ }
return false;
}
@@ -343,7 +327,9 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
// the start of the field, not the end of the field.
if (FixupKind == FK_PCRel_4 ||
FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
- FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
ImmOffset -= 4;
if (FixupKind == FK_PCRel_2)
ImmOffset -= 2;
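The -4 bias exists because a generic PC-relative fixup is resolved against the start of the 32-bit field, while rip-relative addressing is measured from the end of the instruction. A worked example, assuming nothing follows the 4-byte displacement field:

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical layout: instruction starts at P, the 4-byte disp field
  // starts at F = P + 3, and the instruction ends at F + 4 (the next RIP).
  uint64_t P = 0x1000, F = P + 3, T = 0x2040;     // T = target address

  // A generic PC-relative fixup resolves relative to the field start:
  int64_t pcrelFromField = (int64_t)(T - F);
  // The CPU adds the displacement to the *next* instruction's address:
  int64_t dispNeeded = (int64_t)(T - (F + 4));

  // The -4 bias applied to rip-relative fixup kinds bridges the two.
  assert(pcrelFromField - 4 == dispNeeded);
  return 0;
}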
@@ -359,12 +345,12 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
EmitConstant(0, Size, CurByte, OS);
}
-void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned RegOpcodeField,
- uint64_t TSFlags, unsigned &CurByte,
- raw_ostream &OS,
+ uint64_t TSFlags, bool Rex,
+ unsigned &CurByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const{
+ const MCSubtargetInfo &STI) const {
const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
@@ -373,18 +359,38 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
// Handle %rip relative addressing.
- if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ if (BaseReg == X86::RIP ||
+ BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
- unsigned FixupKind = X86::reloc_riprel_4byte;
-
+ unsigned Opcode = MI.getOpcode();
// movq loads are handled with a special relocation form which allows the
// linker to eliminate some loads for GOT references which end up in the
// same linkage unit.
- if (MI.getOpcode() == X86::MOV64rm)
- FixupKind = X86::reloc_riprel_4byte_movq_load;
+ unsigned FixupKind = [=]() {
+ switch (Opcode) {
+ default:
+ return X86::reloc_riprel_4byte;
+ case X86::MOV64rm:
+ assert(Rex);
+ return X86::reloc_riprel_4byte_movq_load;
+ case X86::CALL64m:
+ case X86::JMP64m:
+ case X86::TEST64rm:
+ case X86::ADC64rm:
+ case X86::ADD64rm:
+ case X86::AND64rm:
+ case X86::CMP64rm:
+ case X86::OR64rm:
+ case X86::SBB64rm:
+ case X86::SUB64rm:
+ case X86::XOR64rm:
+ return Rex ? X86::reloc_riprel_4byte_relax_rex
+ : X86::reloc_riprel_4byte_relax;
+ }
+ }();
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
@@ -510,8 +516,11 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
- CurByte, OS, Fixups);
+ unsigned Opcode = MI.getOpcode();
+ unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
+ : X86::reloc_signed_4byte;
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ Fixups);
return;
}
@@ -603,26 +612,26 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 1: Same as REX_R=0 (must be 1 in 32-bit mode)
// 0: Same as REX_R=1 (64 bit mode only)
//
- unsigned char VEX_R = 0x1;
- unsigned char EVEX_R2 = 0x1;
+ uint8_t VEX_R = 0x1;
+ uint8_t EVEX_R2 = 0x1;
// VEX_X: equivalent to REX.X, only used when a
// register is used for index in SIB Byte.
//
// 1: Same as REX.X=0 (must be 1 in 32-bit mode)
// 0: Same as REX.X=1 (64-bit mode only)
- unsigned char VEX_X = 0x1;
+ uint8_t VEX_X = 0x1;
// VEX_B:
//
// 1: Same as REX_B=0 (ignored in 32-bit mode)
// 0: Same as REX_B=1 (64 bit mode only)
//
- unsigned char VEX_B = 0x1;
+ uint8_t VEX_B = 0x1;
// VEX_W: opcode specific (use like REX.W, or used for
// opcode extension, or ignored, depending on the opcode byte)
- unsigned char VEX_W = 0;
+ uint8_t VEX_W = (TSFlags & X86II::VEX_W) ? 1 : 0;
// VEX_5M (VEX m-mmmmm field):
//
@@ -634,20 +643,31 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b01000: XOP map select - 08h instructions with imm byte
// 0b01001: XOP map select - 09h instructions with no imm byte
// 0b01010: XOP map select - 0Ah instructions with imm dword
- unsigned char VEX_5M = 0;
+ uint8_t VEX_5M;
+ switch (TSFlags & X86II::OpMapMask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::TB: VEX_5M = 0x1; break; // 0F
+ case X86II::T8: VEX_5M = 0x2; break; // 0F 38
+ case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
+ case X86II::XOP8: VEX_5M = 0x8; break;
+ case X86II::XOP9: VEX_5M = 0x9; break;
+ case X86II::XOPA: VEX_5M = 0xA; break;
+ }
// VEX_4V (VEX vvvv field): a register specifier
// (in 1's complement form) or 1111 if unused.
- unsigned char VEX_4V = 0xf;
- unsigned char EVEX_V2 = 0x1;
+ uint8_t VEX_4V = 0xf;
+ uint8_t EVEX_V2 = 0x1;
- // VEX_L (Vector Length):
+ // EVEX_L2/VEX_L (Vector Length):
//
- // 0: scalar or 128-bit vector
- // 1: 256-bit vector
+ // L2 L
+ // 0 0: scalar or 128-bit vector
+ // 0 1: 256-bit vector
+ // 1 0: 512-bit vector
//
- unsigned char VEX_L = 0;
- unsigned char EVEX_L2 = 0;
+ uint8_t VEX_L = (TSFlags & X86II::VEX_L) ? 1 : 0;
+ uint8_t EVEX_L2 = (TSFlags & X86II::EVEX_L2) ? 1 : 0;
// VEX_PP: opcode extension providing equivalent
// functionality of a SIMD prefix
@@ -657,56 +677,32 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b10: F3
// 0b11: F2
//
- unsigned char VEX_PP = 0;
+ uint8_t VEX_PP;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ default: llvm_unreachable("Invalid op prefix!");
+ case X86II::PS: VEX_PP = 0x0; break; // none
+ case X86II::PD: VEX_PP = 0x1; break; // 66
+ case X86II::XS: VEX_PP = 0x2; break; // F3
+ case X86II::XD: VEX_PP = 0x3; break; // F2
+ }
// EVEX_U
- unsigned char EVEX_U = 1; // Always '1' so far
+ uint8_t EVEX_U = 1; // Always '1' so far
// EVEX_z
- unsigned char EVEX_z = 0;
+ uint8_t EVEX_z = (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) ? 1 : 0;
// EVEX_b
- unsigned char EVEX_b = 0;
+ uint8_t EVEX_b = (TSFlags & X86II::EVEX_B) ? 1 : 0;
// EVEX_rc
- unsigned char EVEX_rc = 0;
+ uint8_t EVEX_rc = 0;
// EVEX_aaa
- unsigned char EVEX_aaa = 0;
+ uint8_t EVEX_aaa = 0;
bool EncodeRC = false;
- if (TSFlags & X86II::VEX_W)
- VEX_W = 1;
-
- if (TSFlags & X86II::VEX_L)
- VEX_L = 1;
- if (TSFlags & X86II::EVEX_L2)
- EVEX_L2 = 1;
-
- if (HasEVEX_K && (TSFlags & X86II::EVEX_Z))
- EVEX_z = 1;
-
- if ((TSFlags & X86II::EVEX_B))
- EVEX_b = 1;
-
- switch (TSFlags & X86II::OpPrefixMask) {
- default: break; // VEX_PP already correct
- case X86II::PD: VEX_PP = 0x1; break; // 66
- case X86II::XS: VEX_PP = 0x2; break; // F3
- case X86II::XD: VEX_PP = 0x3; break; // F2
- }
-
- switch (TSFlags & X86II::OpMapMask) {
- default: llvm_unreachable("Invalid prefix!");
- case X86II::TB: VEX_5M = 0x1; break; // 0F
- case X86II::T8: VEX_5M = 0x2; break; // 0F 38
- case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
- case X86II::XOP8: VEX_5M = 0x8; break;
- case X86II::XOP9: VEX_5M = 0x9; break;
- case X86II::XOPA: VEX_5M = 0xA; break;
- }
-
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = X86II::getOperandBias(Desc);
@@ -721,38 +717,30 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MemAddr, src1(VEX_4V), src2(ModR/M)
// MemAddr, src1(ModR/M), imm8
//
- if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
- X86::AddrBaseReg).getReg()))
- VEX_B = 0x0;
- if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
- X86::AddrIndexReg).getReg()))
- VEX_X = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
- X86::AddrIndexReg).getReg()))
- EVEX_V2 = 0x0;
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
CurOp += X86::AddrNumOperands;
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
- const MCOperand &MO = MI.getOperand(CurOp);
- if (MO.isReg()) {
- if (X86II::isX86_64ExtendedReg(MO.getReg()))
- VEX_R = 0x0;
- if (X86II::is32ExtendedReg(MO.getReg()))
- EVEX_R2 = 0x0;
- }
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
break;
}
- case X86II::MRMSrcMem:
+ case X86II::MRMSrcMem: {
// MRMSrcMem instructions forms:
// src1(ModR/M), MemAddr
// src1(ModR/M), src2(VEX_4V), MemAddr
@@ -762,31 +750,25 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_R = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_R2 = 0x0;
- CurOp++;
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
- if (X86II::isX86_64ExtendedReg(
- MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
- VEX_B = 0x0;
- if (X86II::isX86_64ExtendedReg(
- MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
- VEX_X = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
- X86::AddrIndexReg).getReg()))
- EVEX_V2 = 0x0;
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
if (HasVEX_4VOp3)
// Instruction format for 4VOp3:
@@ -794,8 +776,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// CurOp points to start of the MemoryOperand,
// it skips TIED_TO operands if exist, then increments past src1.
// CurOp + X86::AddrNumOperands will point to src3.
- VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+ VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
break;
+ }
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -804,24 +787,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MemAddr
// src1(VEX_4V), MemAddr
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
-
- if (X86II::isX86_64ExtendedReg(
- MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
- VEX_B = 0x0;
- if (X86II::isX86_64ExtendedReg(
- MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
- VEX_X = 0x0;
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
break;
}
- case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg: {
// MRMSrcReg instructions forms:
// dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M), src1(ModR/M)
@@ -830,32 +810,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_R = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_R2 = 0x0;
- CurOp++;
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
if (HasMemOp4) // Skip second register source (encoded in I8IMM)
CurOp++;
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_B = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_X = 0x0;
- CurOp++;
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
if (EVEX_b) {
if (HasEVEX_RC) {
unsigned RcOperand = NumOps-1;
@@ -865,55 +840,52 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EncodeRC = true;
}
break;
- case X86II::MRMDestReg:
+ }
+ case X86II::MRMDestReg: {
// MRMDestReg instructions forms:
// dst(ModR/M), src(ModR/M)
// dst(ModR/M), src(ModR/M), imm8
// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_B = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_X = 0x0;
- CurOp++;
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_R = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_R2 = 0x0;
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
if (EVEX_b)
EncodeRC = true;
break;
+ }
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r:
+ case X86II::MRM6r: case X86II::MRM7r: {
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+ EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
if (HasEVEX_K)
- EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ EVEX_aaa = getX86RegEncoding(MI, CurOp++);
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_B = 0x0;
- if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_X = 0x0;
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
break;
}
+ }
if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
// VEX opcode prefix can have 2 or 3 bytes
@@ -931,7 +903,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// +-----+ +--------------+ +-------------------+
// | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
// +-----+ +--------------+ +-------------------+
- unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+ uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
// Can we use the 2 byte VEX prefix?
if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
@@ -954,8 +926,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
assert((VEX_5M & 0x3) == VEX_5M
&& "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
- VEX_5M &= 0x3;
-
EmitByte(0x62, CurByte, OS);
EmitByte((VEX_R << 7) |
(VEX_X << 6) |
@@ -968,26 +938,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_PP, CurByte, OS);
if (EncodeRC)
EmitByte((EVEX_z << 7) |
- (EVEX_rc << 5) |
- (EVEX_b << 4) |
- (EVEX_V2 << 3) |
- EVEX_aaa, CurByte, OS);
+ (EVEX_rc << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
else
EmitByte((EVEX_z << 7) |
- (EVEX_L2 << 6) |
- (VEX_L << 5) |
- (EVEX_b << 4) |
- (EVEX_V2 << 3) |
- EVEX_aaa, CurByte, OS);
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
}
}
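The prefix bytes emitted above are pure bit packing over fields that are already stored in their encoded (inverted) form, as the VEX_R/VEX_X/VEX_B comments note. Below is a self-contained model of the 3-byte VEX layout only, following the RXB | m-mmmm and W | vvvv | L | pp diagram above; it is a sketch of the bit packing, not LLVM's emitter:

#include <cassert>
#include <cstdint>

struct Vex3 { uint8_t B0, B1, B2; };

// Fields are passed already encoded: R/X/B are 1 when the REX bit would be 0,
// and vvvv is the one's complement of the register number, as above.
static Vex3 packVex3(uint8_t R, uint8_t X, uint8_t B, uint8_t mmmmm,
                     uint8_t W, uint8_t vvvv, uint8_t L, uint8_t pp) {
  Vex3 V;
  V.B0 = 0xC4;                                   // 3-byte VEX escape
  V.B1 = (R << 7) | (X << 6) | (B << 5) | mmmmm; // RXB + map select
  V.B2 = (W << 7) | (vvvv << 3) | (L << 2) | pp; // W + vvvv + L + pp
  return V;
}

int main() {
  // vvvv encodes XMM9 in one's complement: ~9 & 0xf == 6.
  uint8_t vvvv = (~9u) & 0xf;
  // Unused R/X/B (all 1), 0F map (mmmmm = 1), no W, 128-bit, prefix 66 (pp=1).
  Vex3 V = packVex3(1, 1, 1, 1, 0, vvvv, 0, 1);
  assert(V.B0 == 0xC4 && V.B1 == 0xE1 && V.B2 == 0x31);
  return 0;
}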
/// DetermineREXPrefix - Determine if the MCInst has to be encoded with an X86-64
/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
/// size, and 3) use of X86-64 extended registers.
-static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
- const MCInstrDesc &Desc) {
- unsigned REX = 0;
+uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+ int MemOperand,
+ const MCInstrDesc &Desc) const {
+ uint8_t REX = 0;
bool UsesHighByteReg = false;
if (TSFlags & X86II::REX_W)
@@ -996,13 +967,10 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
if (MI.getNumOperands() == 0) return REX;
unsigned NumOps = MI.getNumOperands();
- // FIXME: MCInst should explicitize the two-addrness.
- bool isTwoAddr = NumOps > 1 &&
- Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+ unsigned CurOp = X86II::getOperandBias(Desc);
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
- unsigned i = isTwoAddr ? 1 : 0;
- for (; i != NumOps; ++i) {
+ for (unsigned i = CurOp; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
@@ -1016,65 +984,44 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
switch (TSFlags & X86II::FormMask) {
+ case X86II::AddRegFrm:
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
case X86II::MRMSrcReg:
- if (MI.getOperand(0).isReg() &&
- X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
- REX |= 1 << 2; // set REX.R
- i = isTwoAddr ? 2 : 1;
- for (; i != NumOps; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
- REX |= 1 << 0; // set REX.B
- }
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
break;
case X86II::MRMSrcMem: {
- if (MI.getOperand(0).isReg() &&
- X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
- REX |= 1 << 2; // set REX.R
- unsigned Bit = 0;
- i = isTwoAddr ? 2 : 1;
- for (; i != NumOps; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg()) {
- if (X86II::isX86_64ExtendedReg(MO.getReg()))
- REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
- Bit++;
- }
- }
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
break;
}
+ case X86II::MRMDestReg:
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMem:
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
- case X86II::MRMDestMem: {
- unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
- i = isTwoAddr ? 1 : 0;
- if (NumOps > e && MI.getOperand(e).isReg() &&
- X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
- REX |= 1 << 2; // set REX.R
- unsigned Bit = 0;
- for (; i != e; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg()) {
- if (X86II::isX86_64ExtendedReg(MO.getReg()))
- REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
- Bit++;
- }
- }
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
break;
- }
- default:
- if (MI.getOperand(0).isReg() &&
- X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
- REX |= 1 << 0; // set REX.B
- i = isTwoAddr ? 2 : 1;
- for (unsigned e = NumOps; i != e; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
- REX |= 1 << 2; // set REX.R
- }
+ case X86II::MRMXr:
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
break;
}
if (REX && UsesHighByteReg)
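The shifts above place each flag at its architectural REX position (W=3, R=2, X=1, B=0), and the prefix byte ultimately emitted is 0x40 | REX. A standalone check of that layout:

#include <cassert>
#include <cstdint>

// REX prefix layout: 0100 W R X B.
static uint8_t rexByte(bool W, bool R, bool X, bool B) {
  return 0x40 | (W << 3) | (R << 2) | (X << 1) | (B << 0);
}

int main() {
  assert(rexByte(true, false, false, false) == 0x48);  // REX.W only
  assert(rexByte(true, true, false, true) == 0x4D);    // W + extended reg/base
  assert(rexByte(false, false, false, false) == 0x40); // bare REX for SPL/BPL/SIL/DIL
  return 0;
}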
@@ -1101,16 +1048,18 @@ void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
}
}
-/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+/// Emit all instruction prefixes prior to the opcode.
///
/// MemOperand is the operand # of the start of a memory operand if present. If
/// Not present, it is -1.
-void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+///
+/// Returns true if a REX prefix was used.
+bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
const MCSubtargetInfo &STI,
raw_ostream &OS) const {
-
+ bool Ret = false;
// Emit the operand size opcode prefix as needed.
if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
: X86II::OpSize16))
@@ -1135,8 +1084,10 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Handle REX prefix.
// FIXME: Can this come before F2 etc to simplify emission?
if (is64BitMode(STI)) {
- if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+ if (uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
EmitByte(0x40 | REX, CurByte, OS);
+ Ret = true;
+ }
}
// 0x0F escape code must be emitted just before the opcode.
@@ -1156,6 +1107,7 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EmitByte(0x3A, CurByte, OS);
break;
}
+ return Ret;
}
void X86MCCodeEmitter::
@@ -1183,14 +1135,18 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
bool HasMemOp4 = TSFlags & X86II::MemOp4;
- const unsigned MemOp4_I8IMMOperand = 2;
+ bool HasVEX_I8IMM = TSFlags & X86II::VEX_I8IMM;
+ assert((!HasMemOp4 || HasVEX_I8IMM) && "MemOp4 should imply VEX_I8IMM");
// It uses the EVEX.aaa field?
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+ // Used if a register is encoded in bits 7:4 of the immediate.
+ unsigned I8RegNum = 0;
+
// Determine where the memory operand starts, if present.
- int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
if (MemoryOperand != -1) MemoryOperand += CurOp;
// Emit segment override opcode prefix as needed.
@@ -1226,19 +1182,20 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (need_address_override)
EmitByte(0x67, CurByte, OS);
+ bool Rex = false;
if (Encoding == 0)
- EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+ Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
else
EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
- unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+ uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
if (TSFlags & X86II::Has3DNow0F0FOpcode)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
- unsigned SrcRegNum = 0;
- switch (TSFlags & X86II::FormMask) {
- default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+ uint64_t Form = TSFlags & X86II::FormMask;
+ switch (Form) {
+ default: errs() << "FORM: " << Form << "\n";
llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
case X86II::Pseudo:
llvm_unreachable("Pseudo instruction shouldn't be emitted");
@@ -1315,12 +1272,12 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
break;
- case X86II::MRMDestReg:
+ case X86II::MRMDestReg: {
EmitByte(BaseOpcode, CurByte, OS);
- SrcRegNum = CurOp + 1;
+ unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
- SrcRegNum++;
+ ++SrcRegNum;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1329,71 +1286,68 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
CurOp = SrcRegNum + 1;
break;
-
- case X86II::MRMDestMem:
+ }
+ case X86II::MRMDestMem: {
EmitByte(BaseOpcode, CurByte, OS);
- SrcRegNum = CurOp + X86::AddrNumOperands;
+ unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasEVEX_K) // Skip writemask
- SrcRegNum++;
+ ++SrcRegNum;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- EmitMemModRMByte(MI, CurOp,
- GetX86RegNum(MI.getOperand(SrcRegNum)),
- TSFlags, CurByte, OS, Fixups, STI);
+ emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
+ Rex, CurByte, OS, Fixups, STI);
CurOp = SrcRegNum + 1;
break;
-
- case X86II::MRMSrcReg:
+ }
+ case X86II::MRMSrcReg: {
EmitByte(BaseOpcode, CurByte, OS);
- SrcRegNum = CurOp + 1;
+ unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
- SrcRegNum++;
+ ++SrcRegNum;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
- ++SrcRegNum;
+ if (HasMemOp4) // Capture 2nd src (which is encoded in I8IMM)
+ I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
EmitRegModRMByte(MI.getOperand(SrcRegNum),
GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
-
- // 2 operands skipped with HasMemOp4, compensate accordingly
- CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ CurOp = SrcRegNum + 1;
if (HasVEX_4VOp3)
++CurOp;
+ if (!HasMemOp4 && HasVEX_I8IMM)
+ I8RegNum = getX86RegEncoding(MI, CurOp++);
// do not count the rounding control operand
if (HasEVEX_RC)
- NumOps--;
+ --NumOps;
break;
-
+ }
case X86II::MRMSrcMem: {
- int AddrOperands = X86::AddrNumOperands;
unsigned FirstMemOp = CurOp+1;
- if (HasEVEX_K) { // Skip writemask
- ++AddrOperands;
+ if (HasEVEX_K) // Skip writemask
++FirstMemOp;
- }
- if (HasVEX_4V) {
- ++AddrOperands;
+ if (HasVEX_4V)
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
- }
- if (HasMemOp4) // Skip second register source (encoded in I8IMM)
- ++FirstMemOp;
+
+ if (HasMemOp4) // Capture second register source (encoded in I8IMM)
+ I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
EmitByte(BaseOpcode, CurByte, OS);
- EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
- TSFlags, CurByte, OS, Fixups, STI);
- CurOp += AddrOperands + 1;
+ emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, Rex, CurByte, OS, Fixups, STI);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
if (HasVEX_4VOp3)
++CurOp;
+ if (!HasMemOp4 && HasVEX_I8IMM)
+ I8RegNum = getX86RegEncoding(MI, CurOp++);
break;
}
@@ -1407,7 +1361,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasEVEX_K) // Skip writemask
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
- uint64_t Form = TSFlags & X86II::FormMask;
EmitRegModRMByte(MI.getOperand(CurOp++),
(Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
CurByte, OS);
@@ -1424,9 +1377,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasEVEX_K) // Skip writemask
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
- uint64_t Form = TSFlags & X86II::FormMask;
- EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
- TSFlags, CurByte, OS, Fixups, STI);
+ emitMemModRMByte(MI, CurOp,
+ (Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
+ Rex, CurByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
}
@@ -1453,38 +1406,27 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
case X86II::MRM_FF:
EmitByte(BaseOpcode, CurByte, OS);
-
- uint64_t Form = TSFlags & X86II::FormMask;
EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
break;
}
- // If there is a remaining operand, it must be a trailing immediate. Emit it
- // according to the right size for the instruction. Some instructions
- // (SSE4a extrq and insertq) have two trailing immediates.
- while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ if (HasVEX_I8IMM) {
// The last source register of a 4-operand instruction in AVX is encoded
// in bits[7:4] of an immediate byte.
- if (TSFlags & X86II::VEX_I8IMM) {
- const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
- : CurOp);
- ++CurOp;
- unsigned RegNum = GetX86RegNum(MO) << 4;
- if (X86II::isX86_64ExtendedReg(MO.getReg()))
- RegNum |= 1 << 7;
- // If there is an additional 5th operand it must be an immediate, which
- // is encoded in bits[3:0]
- if (CurOp != NumOps) {
- const MCOperand &MIMM = MI.getOperand(CurOp++);
- if (MIMM.isImm()) {
- unsigned Val = MIMM.getImm();
- assert(Val < 16 && "Immediate operand value out of range");
- RegNum |= Val;
- }
- }
- EmitImmediate(MCOperand::createImm(RegNum), MI.getLoc(), 1, FK_Data_1,
- CurByte, OS, Fixups);
- } else {
+ assert(I8RegNum < 16 && "Register encoding out of range");
+ I8RegNum <<= 4;
+ if (CurOp != NumOps) {
+ unsigned Val = MI.getOperand(CurOp++).getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ I8RegNum |= Val;
+ }
+ EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
+ CurByte, OS, Fixups);
+ } else {
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 53a6550acdd5..311a8d677eea 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "InstPrinter/X86IntelInstPrinter.h"
#include "X86MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -66,12 +65,59 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
return DWARFFlavour::X86_32_Generic;
}
-void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
// FIXME: TableGen these.
- for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
unsigned SEH = MRI->getEncodingValue(Reg);
MRI->mapLLVMRegToSEHReg(Reg, SEH);
}
+
+ // These CodeView registers are numbered sequentially starting at value 1.
+ static const MCPhysReg LowCVRegs[] = {
+ X86::AL, X86::CL, X86::DL, X86::BL, X86::AH, X86::CH,
+ X86::DH, X86::BH, X86::AX, X86::CX, X86::DX, X86::BX,
+ X86::SP, X86::BP, X86::SI, X86::DI, X86::EAX, X86::ECX,
+ X86::EDX, X86::EBX, X86::ESP, X86::EBP, X86::ESI, X86::EDI,
+ };
+ unsigned CVLowRegStart = 1;
+ for (unsigned I = 0; I < array_lengthof(LowCVRegs); ++I)
+ MRI->mapLLVMRegToCVReg(LowCVRegs[I], I + CVLowRegStart);
+
+ MRI->mapLLVMRegToCVReg(X86::EFLAGS, 34);
+
+ // The x87 registers start at 128 and are numbered sequentially.
+ unsigned FP0Start = 128;
+ for (unsigned I = 0; I < 8; ++I)
+ MRI->mapLLVMRegToCVReg(X86::FP0 + I, FP0Start + I);
+
+ // The low 8 XMM registers start at 154 and are numbered sequentially.
+ unsigned CVXMM0Start = 154;
+ for (unsigned I = 0; I < 8; ++I)
+ MRI->mapLLVMRegToCVReg(X86::XMM0 + I, CVXMM0Start + I);
+
+ // The high 8 XMM registers start at 252 and are numbered sequentially.
+ unsigned CVXMM8Start = 252;
+ for (unsigned I = 0; I < 8; ++I)
+ MRI->mapLLVMRegToCVReg(X86::XMM8 + I, CVXMM8Start + I);
+
+ // FIXME: XMM16 and above from AVX512 not yet documented.
+
+ // AMD64 registers start at 324 and count up.
+ unsigned CVX64RegStart = 324;
+ static const MCPhysReg CVX64Regs[] = {
+ X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::RAX, X86::RBX,
+ X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBP, X86::RSP,
+ X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13,
+ X86::R14, X86::R15, X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::R12B, X86::R13B, X86::R14B, X86::R15B, X86::R8W, X86::R9W,
+ X86::R10W, X86::R11W, X86::R12W, X86::R13W, X86::R14W, X86::R15W,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D, X86::R12D, X86::R13D,
+ X86::R14D, X86::R15D, X86::YMM0, X86::YMM1, X86::YMM2, X86::YMM3,
+ X86::YMM4, X86::YMM5, X86::YMM6, X86::YMM7, X86::YMM8, X86::YMM9,
+ X86::YMM10, X86::YMM11, X86::YMM12, X86::YMM13, X86::YMM14, X86::YMM15,
+ };
+ for (unsigned I = 0; I < array_lengthof(CVX64Regs); ++I)
+ MRI->mapLLVMRegToCVReg(CVX64Regs[I], CVX64RegStart + I);
}
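Each register class is mapped to a contiguous block of CodeView numbers starting at the bases given above, so the arithmetic is just base plus index. A trivial model using those bases; the indices here are positions within each table, not LLVM register numbers:

#include <cassert>

// CodeView number = block base + index within the block, per the code above.
static unsigned cvRegNum(unsigned Base, unsigned IndexInBlock) {
  return Base + IndexInBlock;
}

int main() {
  assert(cvRegNum(128, 0) == 128); // FP0: x87 block starts at 128
  assert(cvRegNum(154, 3) == 157); // XMM3: low XMM block starts at 154
  assert(cvRegNum(252, 0) == 252); // XMM8: high XMM block starts at 252
  assert(cvRegNum(324, 4) == 328); // RAX: 5th entry of the AMD64 block
  return 0;
}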
MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
@@ -105,7 +151,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false),
X86_MC::getDwarfRegFlavour(TT, true), RA);
- X86_MC::InitLLVM2SEHRegisterMapping(X);
+ X86_MC::initLLVMToSEHAndCVRegMapping(X);
return X;
}
@@ -152,53 +198,16 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
-
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
bool is64Bit = TT.getArch() == Triple::x86_64;
- if (RM == Reloc::Default) {
- // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
- // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
- // use static relocation model by default.
- if (TT.isOSDarwin()) {
- if (is64Bit)
- RM = Reloc::PIC_;
- else
- RM = Reloc::DynamicNoPIC;
- } else if (TT.isOSWindows() && is64Bit)
- RM = Reloc::PIC_;
- else
- RM = Reloc::Static;
- }
-
- // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
- // is defined as a model for code which may be used in static or dynamic
- // executables but not necessarily a shared library. On X86-32 we just
- // compile in -static mode, in x86-64 we use PIC.
- if (RM == Reloc::DynamicNoPIC) {
- if (is64Bit)
- RM = Reloc::PIC_;
- else if (!TT.isOSDarwin())
- RM = Reloc::Static;
- }
-
- // If we are on Darwin, disallow static relocation model in X86-64 mode, since
- // the Mach-O file format doesn't support it.
- if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
- RM = Reloc::PIC_;
-
// For static codegen, if we're not already set, use Small codegen.
if (CM == CodeModel::Default)
CM = CodeModel::Small;
else if (CM == CodeModel::JITDefault)
// 64-bit JIT places everything in the same buffer except external funcs.
CM = is64Bit ? CodeModel::Large : CodeModel::Small;
-
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
@@ -215,10 +224,6 @@ static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
MCContext &Ctx) {
- if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64)
- return createX86_64MachORelocationInfo(Ctx);
- else if (TheTriple.isOSBinFormatELF())
- return createX86_64ELFRelocationInfo(Ctx);
// Default to the stock relocation info.
return llvm::createMCRelocationInfo(TheTriple, Ctx);
}
@@ -234,7 +239,7 @@ extern "C" void LLVMInitializeX86TargetMC() {
RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
// Register the MC codegen info.
- RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo);
+ RegisterMCAdjustCodeGenOptsFn Y(*T, adjustCodeGenOpts);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 2d2836ff07c5..ca4f0d3e17d5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/DataTypes.h"
#include <string>
@@ -26,7 +27,6 @@ class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCRelocationInfo;
-class MCStreamer;
class Target;
class Triple;
class StringRef;
@@ -56,7 +56,7 @@ std::string ParseX86Triple(const Triple &TT);
unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
-void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
/// Create an X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
@@ -93,12 +93,6 @@ MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
bool Is64Bit);
-/// Construct X86-64 Mach-O relocation info.
-MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
-
-/// Construct X86-64 ELF relocation info.
-MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
-
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
/// Aborts on error.
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
deleted file mode 100644
index 9bfe999424fa..000000000000
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Object/MachO.h"
-
-using namespace llvm;
-using namespace object;
-using namespace MachO;
-
-namespace {
-class X86_64MachORelocationInfo : public MCRelocationInfo {
-public:
- X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
-
- const MCExpr *createExprForRelocation(RelocationRef Rel) override {
- const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObject());
-
- uint64_t RelType = Rel.getType();
- symbol_iterator SymI = Rel.getSymbol();
-
- ErrorOr<StringRef> SymNameOrErr = SymI->getName();
- if (std::error_code EC = SymNameOrErr.getError())
- report_fatal_error(EC.message());
- StringRef SymName = *SymNameOrErr;
- uint64_t SymAddr = SymI->getValue();
-
- any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
- bool isPCRel = Obj->getAnyRelocationPCRel(RE);
-
- MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
- // FIXME: check that the value is actually the same.
- if (!Sym->isVariable())
- Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx));
- const MCExpr *Expr = nullptr;
-
- switch(RelType) {
- case X86_64_RELOC_TLV:
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
- break;
- case X86_64_RELOC_SIGNED_4:
- Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
- MCConstantExpr::create(4, Ctx),
- Ctx);
- break;
- case X86_64_RELOC_SIGNED_2:
- Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
- MCConstantExpr::create(2, Ctx),
- Ctx);
- break;
- case X86_64_RELOC_SIGNED_1:
- Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
- MCConstantExpr::create(1, Ctx),
- Ctx);
- break;
- case X86_64_RELOC_GOT_LOAD:
- Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
- break;
- case X86_64_RELOC_GOT:
- Expr = MCSymbolRefExpr::create(Sym, isPCRel ?
- MCSymbolRefExpr::VK_GOTPCREL :
- MCSymbolRefExpr::VK_GOT,
- Ctx);
- break;
- case X86_64_RELOC_SUBTRACTOR:
- {
- Rel.moveNext();
- any_relocation_info RENext =
- Obj->getRelocation(Rel.getRawDataRefImpl());
-
- // X86_64_SUBTRACTOR must be followed by a relocation of type
- // X86_64_RELOC_UNSIGNED.
- // NOTE: Scattered relocations don't exist on x86_64.
- unsigned RType = Obj->getAnyRelocationType(RENext);
- if (RType != X86_64_RELOC_UNSIGNED)
- report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
- "X86_64_RELOC_SUBTRACTOR.");
-
- const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx);
-
- symbol_iterator RSymI = Rel.getSymbol();
- uint64_t RSymAddr = RSymI->getValue();
- ErrorOr<StringRef> RSymName = RSymI->getName();
- if (std::error_code EC = RSymName.getError())
- report_fatal_error(EC.message());
-
- MCSymbol *RSym = Ctx.getOrCreateSymbol(*RSymName);
- if (!RSym->isVariable())
- RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx));
-
- const MCExpr *RHS = MCSymbolRefExpr::create(RSym, Ctx);
-
- Expr = MCBinaryExpr::createSub(LHS, RHS, Ctx);
- break;
- }
- default:
- Expr = MCSymbolRefExpr::create(Sym, Ctx);
- break;
- }
- return Expr;
- }
-};
-} // End unnamed namespace
-
-/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
-MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
- return new X86_64MachORelocationInfo(Ctx);
-}
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 191ebeac7265..297926ddcfda 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -73,7 +73,9 @@ public:
static bool isFixupKindRIPRel(unsigned Kind) {
return Kind == X86::reloc_riprel_4byte ||
- Kind == X86::reloc_riprel_4byte_movq_load;
+ Kind == X86::reloc_riprel_4byte_movq_load ||
+ Kind == X86::reloc_riprel_4byte_relax ||
+ Kind == X86::reloc_riprel_4byte_relax_rex;
}
static unsigned getFixupKindLog2Size(unsigned Kind) {
@@ -87,8 +89,11 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case FK_PCRel_4:
// FIXME: Remove these!!!
case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
case FK_Data_4: return 2;
case FK_Data_8: return 3;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index bd1bc9943b6d..33376b6d1b90 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -53,11 +53,16 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
return COFF::IMAGE_REL_AMD64_REL32;
case FK_Data_4:
case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
return COFF::IMAGE_REL_AMD64_ADDR32NB;
+ if (Modifier == MCSymbolRefExpr::VK_SECREL)
+ return COFF::IMAGE_REL_AMD64_SECREL;
return COFF::IMAGE_REL_AMD64_ADDR32;
case FK_Data_8:
return COFF::IMAGE_REL_AMD64_ADDR64;
@@ -76,8 +81,11 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
return COFF::IMAGE_REL_I386_REL32;
case FK_Data_4:
case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
return COFF::IMAGE_REL_I386_DIR32NB;
+ if (Modifier == MCSymbolRefExpr::VK_SECREL)
+ return COFF::IMAGE_REL_I386_SECREL;
return COFF::IMAGE_REL_I386_DIR32;
case FK_SecRel_2:
return COFF::IMAGE_REL_I386_SECTION;
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
deleted file mode 100644
index e518fecf044f..000000000000
--- a/lib/Target/X86/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMX86CodeGen
-TARGET = X86
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = X86GenRegisterInfo.inc X86GenInstrInfo.inc \
- X86GenAsmWriter.inc X86GenAsmMatcher.inc \
- X86GenAsmWriter1.inc X86GenDAGISel.inc \
- X86GenDisassemblerTables.inc X86GenFastISel.inc \
- X86GenCallingConv.inc X86GenSubtargetInfo.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index bcfdf0bc56b2..09626e13849d 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -170,7 +170,7 @@ generated for it. The primary issue with the result is that it doesn't do any
of the optimizations which are possible if we know the address of a va_list
in the current function is never taken:
1. We shouldn't spill the XMM registers because we only call va_arg with "int".
-2. It would be nice if we could scalarrepl the va_list.
+2. It would be nice if we could sroa the va_list.
3. Probably overkill, but it'd be cool if we could peel off the first five
iterations of the loop.
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 19a183201755..799157c926e6 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -50,8 +50,8 @@ Some isel ideas:
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
Sequencing of Instructions".
-4. Scheduling for reduced register pressure. E.g. "Minimum Register
- Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
and other related papers.
http://citeseer.ist.psu.edu/govindarajan01minimum.html
@@ -73,7 +73,7 @@ It appears icc use push for parameter passing. Need to investigate.
//===---------------------------------------------------------------------===//
The instruction selector sometimes misses folding a load into a compare. The
-pattern is written as (cmp reg, (load p)). Because the compare isn't
+pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
diff --git a/lib/Target/X86/TargetInfo/Makefile b/lib/Target/X86/TargetInfo/Makefile
deleted file mode 100644
index ee91982df0c8..000000000000
--- a/lib/Target/X86/TargetInfo/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86Info
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/Utils/Makefile b/lib/Target/X86/Utils/Makefile
deleted file mode 100644
index 1df6f0f561d4..000000000000
--- a/lib/Target/X86/Utils/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/X86/Utils/Makefile -----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMX86Utils
-
-# Hack: we need to include 'main' x86 target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 619f7c8d25df..18f71675437b 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86ShuffleDecode.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineValueType.h"
//===----------------------------------------------------------------------===//
@@ -44,6 +45,17 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
}
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((Idx + Len) <= NumElts && "Insertion out of range");
+
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i);
+ for (unsigned i = 0; i != Len; ++i)
+ ShuffleMask[Idx + i] = NumElts + i;
+}
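Illustrative usage, not part of the patch: assuming an LLVM translation unit that already pulls in X86ShuffleDecode.h, SmallVector.h and MachineValueType.h, inserting the low two elements of a second v8i32 operand at element index 2 decodes as follows (indices of 8 and above name the second source).

  SmallVector<int, 8> Mask;
  DecodeInsertElementMask(MVT::v8i32, /*Idx=*/2, /*Len=*/2, Mask);
  // Mask is now <0, 1, 8, 9, 4, 5, 6, 7>.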
+
// <3,1> or <6,7,2,3>
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
for (unsigned i = NElts / 2; i != NElts; ++i)
@@ -263,6 +275,25 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
}
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = DstVT.getVectorNumElements();
+ ShuffleMask.append(NumElts, 0);
+}
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert(SrcVT.getScalarType() == DstVT.getScalarType() &&
+ "Non matching vector element types");
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned Scale = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
+
+ for (unsigned i = 0; i != Scale; ++i)
+ for (unsigned j = 0; j != NumElts; ++j)
+ ShuffleMask.push_back(j);
+}
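A quick sketch of what the two new broadcast decoders produce, illustrative only and under the same assumptions as the sketch above.

  SmallVector<int, 8> Mask;
  DecodeVectorBroadcast(MVT::v4f32, Mask);                 // <0, 0, 0, 0>
  Mask.clear();
  DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, Mask);  // <0, 1, 2, 3, 0, 1, 2, 3>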
+
/// \brief Decode a shuffle packed values at 128-bit granularity
/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
/// immediate mask into a shuffle mask.
@@ -303,9 +334,9 @@ void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
ShuffleMask.push_back(M);
continue;
}
- // For AVX vectors with 32 bytes the base of the shuffle is the half of
- // the vector we're inside.
- int Base = i < 16 ? 0 : 16;
+ // For 256/512-bit vectors the base of the shuffle is the 128-bit
+ // subvector we're inside.
+ int Base = (i / 16) * 16;
// If the high bit (7) of the byte is set, the element is zeroed.
if (M & (1 << 7))
ShuffleMask.push_back(SM_SentinelZero);
@@ -331,23 +362,62 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-/// No VT provided since it only works on 256-bit, 4 element vectors.
-void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- for (unsigned i = 0; i != 4; ++i) {
- ShuffleMask.push_back((Imm >> (2 * i)) & 3);
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
+
+ // VPPERM Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation
+ //
+ // Permute Operation:
+ // 0 - Source byte (no logical operation).
+ // 1 - Invert source byte.
+ // 2 - Bit reverse of source byte.
+ // 3 - Bit reverse of inverted source byte.
+ // 4 - 00h (zero - fill).
+ // 5 - FFh (ones - fill).
+ // 6 - Most significant bit of source byte replicated in all bit positions.
+ // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (M == (uint64_t)SM_SentinelUndef) {
+ ShuffleMask.push_back(M);
+ continue;
+ }
+
+ uint64_t PermuteOp = (M >> 5) & 0x7;
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
+ }
+
+ uint64_t Index = M & 0x1F;
+ ShuffleMask.push_back((int)Index);
}
}
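Illustrative only, under the same assumptions as the earlier sketches: a few representative VPPERM selector bytes and what the decoder above turns them into (indices 16-31 name the second source).

  SmallVector<int, 16> Mask;
  uint64_t Raw[16] = {0x01, 0x14, 0x9F, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0};
  DecodeVPPERMMask(Raw, Mask);
  // Mask[0] == 1   (op 0: byte 1 of the first source)
  // Mask[1] == 20  (op 0: byte 4 of the second source)
  // Mask[2] == SM_SentinelZero  (op 4: zero fill)
  // A selector using any other op (1-3, 5-7) clears the whole mask instead.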
-void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ (VT.getScalarSizeInBits() == 64) && "Unexpected vector value type");
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned l = 0; l != NumElts; l += 4)
+ for (unsigned i = 0; i != 4; ++i)
+ ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
+}
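Illustrative only: with the new VT parameter the same 8-bit immediate is applied per group of four 64-bit elements, so a 512-bit VPERMQ repeats the pattern in both halves.

  SmallVector<int, 8> Mask;
  DecodeVPERMMask(MVT::v8i64, /*Imm=*/0x1B, Mask);
  // 0x1B reverses each group of four: Mask is <3, 2, 1, 0, 7, 6, 5, 4>.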
+
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
+ SmallVectorImpl<int> &Mask) {
unsigned NumDstElts = DstVT.getVectorNumElements();
- unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
+ unsigned SrcScalarBits = SrcScalarVT.getSizeInBits();
unsigned DstScalarBits = DstVT.getScalarSizeInBits();
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
- assert(SrcVT.getVectorNumElements() >= NumDstElts &&
- "Too many zero extension lanes");
for (unsigned i = 0; i != NumDstElts; i++) {
Mask.push_back(i);
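Note the changed first parameter: callers now pass the scalar type of the source rather than a full source vector type, which is also why the old assert tying the mask to a source element count was dropped. A hypothetical call for a PMOVZXBW-style extension, for illustration only:

  SmallVector<int, 16> Mask;
  DecodeZeroExtendMask(MVT::i8, MVT::v8i16, Mask);  // Scale = 16 / 8 = 2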
@@ -445,18 +515,78 @@ void DecodeINSERTQIMask(int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = VT.getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+ assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
+ "Unexpected vector size");
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ M = (EltSize == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+ unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
+ ShuffleMask.push_back((int)(LaneOffset + M));
+ }
+}
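Illustrative only, same assumptions as above: a variable VPERMILPS mask on a 256-bit vector, showing that each selector can only pick within its own 128-bit lane.

  SmallVector<int, 8> Mask;
  uint64_t Raw[8] = {0, 1, 2, 3, 3, 2, 1, 0};
  DecodeVPERMILPMask(MVT::v8f32, Raw, Mask);
  // The second lane's picks are offset by 4: Mask is <0, 1, 2, 3, 7, 6, 5, 4>.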
+
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VecSize = VT.getSizeInBits();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned NumLanes = VecSize / 128;
+ unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+ assert((VecSize == 128 || VecSize == 256) &&
+ "Unexpected vector size");
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+
+ for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ uint64_t Selector = RawMask[i];
+ unsigned MatchBit = (Selector >> 3) & 0x1;
+
+ // M2Z[0:1] MatchBit
+ // 0Xb X Source selected by Selector index.
+ // 10b 0 Source selected by Selector index.
+ // 10b 1 Zero.
+ // 11b 0 Zero.
+ // 11b 1 Source selected by Selector index.
+ if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ unsigned Index = i & ~(NumEltsPerLane - 1);
+ if (EltSize == 64)
+ Index += (Selector >> 1) & 0x1;
+ else
+ Index += Selector & 0x3;
+
+ unsigned SrcOffset = (Selector >> 2) & 1;
+ ShuffleMask.push_back((int)(SrcOffset * VT.getVectorNumElements() + Index));
+ }
+}
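Illustrative only: a PD-style example of how the M2Z field interacts with each selector's match bit. All selectors here keep the source-select bit clear, so every surviving element comes from the first source.

  SmallVector<int, 4> Mask;
  uint64_t Raw[4] = {0x2, 0x0, 0x8, 0x2};
  DecodeVPERMIL2PMask(MVT::v4f64, /*M2Z=*/0x2, Raw, Mask);
  // M2Z == 10b zeroes every element whose MatchBit (bit 3) is set:
  // Mask is <1, 0, SM_SentinelZero, 3>.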
+
void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- for (int i = 0, e = RawMask.size(); i < e; ++i) {
- uint64_t M = RawMask[i];
+ uint64_t EltMaskSize = RawMask.size() - 1;
+ for (auto M : RawMask) {
+ M &= EltMaskSize;
ShuffleMask.push_back((int)M);
}
}
void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
- for (int i = 0, e = RawMask.size(); i < e; ++i) {
- uint64_t M = RawMask[i];
+ uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
+ for (auto M : RawMask) {
+ M &= EltMaskSize;
ShuffleMask.push_back((int)M);
}
}
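Illustrative only: the new masking wraps out-of-range constant indices into the valid element range, which differs between the one-source and two-source forms.

  SmallVector<int, 8> Mask;
  uint64_t Raw[8] = {11, 0, 1, 2, 3, 4, 5, 6};
  DecodeVPERMVMask(Raw, Mask);   // Mask[0] == 3  (11 & 7, single source)
  Mask.clear();
  DecodeVPERMV3Mask(Raw, Mask);  // Mask[0] == 11 (11 & 15, element 3 of source 2)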
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 72db6a81912b..dc21c19752c3 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -16,23 +16,31 @@
#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/ArrayRef.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
+template <typename T> class ArrayRef;
class MVT;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+/// Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-// <3,1> or <6,7,2,3>
+/// Insert the bottom Len elements from a second source into a vector
+/// starting at element Idx.
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <3,1> or <6,7,2,3>
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-// <0,2> or <0,1,4,5>
+/// Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
@@ -47,74 +55,104 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// Decodes the shuffle masks for pshufhw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// Decodes the shuffle masks for pshuflw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decodes a PSWAPD 3DNow! instruction.
+/// Decodes a PSWAPD 3DNow! instruction.
void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
-/// the type of the vector allowing it to handle different datatypes and vector
-/// widths.
+/// Decodes the shuffle masks for shufp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// and punpckh*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
+/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// and punpckl*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
+/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a PSHUFB mask from a raw array of constants such as from
/// BUILD_VECTOR.
void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a BLEND immediate mask into a shuffle mask.
+/// Decode a BLEND immediate mask into a shuffle mask.
void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a shuffle packed values at 128-bit granularity
+/// Decode a shuffle of packed values at 128-bit granularity
/// immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
-/// No VT provided since it only works on 256-bit, 4 element vectors.
-void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// Decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a zero extension instruction as a shuffle mask.
-void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
+/// Decode a VPPERM mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+/// This can only decode basic masks (permutes + zeros), not any of the other
+/// operations that VPPERM can perform.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
+/// Decode a move lower and zero upper instruction as a shuffle mask.
void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a scalar float move instruction as a shuffle mask.
+/// Decode a scalar float move instruction as a shuffle mask.
void DecodeScalarMoveMask(MVT VT, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+/// Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
void DecodeEXTRQIMask(int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+/// Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
void DecodeINSERTQIMask(int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 01e65b89f480..23d6c7120a4b 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -21,6 +21,7 @@ namespace llvm {
class FunctionPass;
class ImmutablePass;
+class PassRegistry;
class X86TargetMachine;
/// This pass converts a legalized DAG into a X86-specific DAG, ready for
@@ -58,6 +59,12 @@ FunctionPass *createX86FixupLEAs();
/// recalculations.
FunctionPass *createX86OptimizeLEAs();
+/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
+FunctionPass *createX86FixupSetCC();
+
+/// Return a pass that expands WinAlloca pseudo-instructions.
+FunctionPass *createX86WinAllocaExpander();
+
/// Return a pass that optimizes the code-size of x86 call sequences. This is
/// done by replacing esp-relative movs with pushes.
FunctionPass *createX86CallFrameOptimization();
@@ -72,6 +79,14 @@ FunctionPass *createX86WinEHStatePass();
/// must run after prologue/epilogue insertion and before lowering
/// the MachineInstr to MC.
FunctionPass *createX86ExpandPseudoPass();
+
+/// Return a Machine IR pass that selectively replaces
+/// certain byte and word instructions by equivalent 32 bit instructions,
+/// in order to eliminate partial register usage, false dependences on
+/// the upper portions of registers, and to save code size.
+FunctionPass *createX86FixupBWInsts();
+
+void initializeFixupBWInstPassPass(PassRegistry &);
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 8902a8534256..8267a84518fc 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -31,6 +31,9 @@ def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
// X86 Subtarget features
//===----------------------------------------------------------------------===//
+def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
+ "Enable X87 float instructions">;
+
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
@@ -125,6 +128,9 @@ def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
+def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+ "true",
+ "Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
"Enable AVX-512 Doubleword and Quadword Instructions",
[FeatureAVX512]>;
@@ -134,6 +140,12 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
"Enable AVX-512 Vector Length eXtensions",
[FeatureAVX512]>;
+def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
+ "Enable AVX-512 Vector Bit Manipulation Instructions",
+ [FeatureAVX512]>;
+def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
+ "Enable AVX-512 Integer Fused Multiply-Add",
+ [FeatureAVX512]>;
def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
"Enable protection keys">;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
@@ -186,6 +198,8 @@ def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
"Support LAHF and SAHF instructions">;
+def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
+ "Enable MONITORX/MWAITX timer functionality">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -199,6 +213,20 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
+ "Invalidate Process-Context Identifier">;
+def FeatureVMFUNC : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
+ "VM Functions">;
+def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true",
+ "Supervisor Mode Access Protection">;
+def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
+ "Enable Software Guard Extensions">;
+def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
+ "Flush A Cache Line Optimized">;
+def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
+ "Enable Persistent Commit">;
+def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
+ "Cache Line Write Back">;
// TODO: This feature ought to be renamed.
// What it really refers to are CPUs for which certain instructions
// (which ones besides the example below?) are microcoded.
@@ -216,6 +244,11 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
+// On at least some AMD processors, there is no performance hazard to writing
+// only the lower parts of a YMM register without clearing the upper part.
+def FeatureFastPartialYMMWrite
+ : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
+ "true", "Partial writes to YMM registers are fast">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -231,37 +264,57 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-def : Proc<"generic", [FeatureSlowUAMem16]>;
-def : Proc<"i386", [FeatureSlowUAMem16]>;
-def : Proc<"i486", [FeatureSlowUAMem16]>;
-def : Proc<"i586", [FeatureSlowUAMem16]>;
-def : Proc<"pentium", [FeatureSlowUAMem16]>;
-def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"i686", [FeatureSlowUAMem16]>;
-def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>;
-def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV,
- FeatureFXSR]>;
-def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR]>;
-def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
- FeatureFXSR]>;
-def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureSlowBTMem]>;
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureCMOV, FeatureFXSR]>;
+def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR]>;
+def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified. This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Intel Quark.
+def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
- [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
- FeatureSlowBTMem]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureSlowBTMem]>;
// NetBurst.
-def : Proc<"prescott",
- [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
- FeatureSlowBTMem]>;
-def : Proc<"nocona", [
+def : ProcessorModel<"prescott", GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
+ FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSE3,
@@ -272,6 +325,7 @@ def : Proc<"nocona", [
// Intel Core 2 Solo/Duo.
def : ProcessorModel<"core2", SandyBridgeModel, [
+ FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSSE3,
@@ -281,6 +335,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureLAHFSAHF
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
+ FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSE41,
@@ -293,6 +348,7 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
// Atom CPUs.
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
ProcIntelAtom,
+ FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSSE3,
@@ -313,6 +369,7 @@ def : BonnellProc<"atom">; // Pin the generic name to the baseline.
class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
ProcIntelSLM,
+ FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
@@ -334,6 +391,7 @@ def : SilvermontProc<"slm">; // Legacy alias.
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
@@ -348,6 +406,7 @@ def : NehalemProc<"corei7">;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+ FeatureX87,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
@@ -360,15 +419,24 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
]>;
def : WestmereProc<"westmere">;
+class ProcessorFeatures<list<SubtargetFeature> Inherited,
+ list<SubtargetFeature> NewFeatures> {
+ list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+}
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> ProcFeatures,
+ list<SubtargetFeature> OtherFeatures> :
+ ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
+
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
-class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+def SNBFeatures : ProcessorFeatures<[], [
+ FeatureX87,
FeatureMMX,
FeatureAVX,
FeatureFXSR,
FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureSlowUAMem32,
FeaturePOPCNT,
FeatureAES,
FeaturePCLMUL,
@@ -376,198 +444,166 @@ class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureXSAVEOPT,
FeatureLAHFSAHF
]>;
+
+class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+ SNBFeatures.Value, [
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32
+]>;
def : SandyBridgeProc<"sandybridge">;
def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
-class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureSlowUAMem32,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
+def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
FeatureRDRAND,
FeatureF16C,
- FeatureFSGSBase,
- FeatureLAHFSAHF
+ FeatureFSGSBase
+]>;
+
+class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+ IVBFeatures.Value, [
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32
]>;
def : IvyBridgeProc<"ivybridge">;
def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
-class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureMMX,
+def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX2,
- FeatureFXSR,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureRDRAND,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureMOVBE,
- FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
FeatureFMA,
+ FeatureLZCNT,
+ FeatureMOVBE,
+ FeatureINVPCID,
+ FeatureVMFUNC,
FeatureRTM,
FeatureHLE,
- FeatureSlowIncDec,
- FeatureLAHFSAHF
+ FeatureSlowIncDec
]>;
+
+class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
+ HSWFeatures.Value, []>;
def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.
-class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureMMX,
- FeatureAVX2,
- FeatureFXSR,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeatureRTM,
- FeatureHLE,
+def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
FeatureRDSEED,
- FeatureSlowIncDec,
- FeatureLAHFSAHF
+ FeatureSMAP
]>;
+class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
+ BDWFeatures.Value, []>;
def : BroadwellProc<"broadwell">;
+def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureSGX,
+ FeatureCLFLUSHOPT
+]>;
+
+// FIXME: define SKL model
+class SkylakeClientProc<string Name> : ProcModel<Name, HaswellModel,
+ SKLFeatures.Value, []>;
+def : SkylakeClientProc<"skylake">;
+
// FIXME: define KNL model
-class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureMMX,
+class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
+ IVBFeatures.Value, [
FeatureAVX512,
- FeatureFXSR,
FeatureERI,
FeatureCDI,
FeaturePFI,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
+ FeaturePREFETCHWT1,
+ FeatureADX,
+ FeatureRDSEED,
FeatureMOVBE,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA,
- FeatureRTM,
- FeatureHLE,
- FeatureSlowIncDec,
- FeatureMPX,
- FeatureLAHFSAHF
+ FeatureFMA
]>;
def : KnightsLandingProc<"knl">;
-// FIXME: define SKX model
-class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureMMX,
+def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeatureAVX512,
- FeatureFXSR,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeatureRTM,
- FeatureHLE,
- FeatureADX,
- FeatureRDSEED,
- FeatureSlowIncDec,
- FeatureMPX,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureLAHFSAHF
+ FeaturePCOMMIT,
+ FeatureCLWB
]>;
-def : SkylakeProc<"skylake">;
-def : SkylakeProc<"skx">; // Legacy alias.
+// FIXME: define SKX model
+class SkylakeServerProc<string Name> : ProcModel<Name, HaswellModel,
+ SKXFeatures.Value, []>;
+def : SkylakeServerProc<"skylake-avx512">;
+def : SkylakeServerProc<"skx">; // Legacy alias.
+
+def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
+ FeatureVBMI,
+ FeatureIFMA,
+ FeatureSHA
+]>;
+
+class CannonlakeProc<string Name> : ProcModel<Name, HaswellModel,
+ CNLFeatures.Value, []>;
+def : CannonlakeProc<"cannonlake">;
// AMD CPUs.
-def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA,
+def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA,
+def : Proc<"athlon-tbird", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
- FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
- FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
- FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+def : Proc<"athlon-4", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
+def : Proc<"athlon-xp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
+def : Proc<"athlon-mp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
- FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
-def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
- FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
+def : Proc<"k8", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-fx", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+ Feature3DNowA, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"amdfam10", [FeatureX87, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+ FeatureLAHFSAHF]>;
+def : Proc<"barcelona", [FeatureX87, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+ FeatureLAHFSAHF]>;
// Bobcat
def : Proc<"btver1", [
+ FeatureX87,
FeatureMMX,
FeatureSSSE3,
FeatureSSE4A,
@@ -576,13 +612,13 @@ def : Proc<"btver1", [
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
- FeatureXSAVE,
FeatureSlowSHLD,
FeatureLAHFSAHF
]>;
// Jaguar
def : ProcessorModel<"btver2", BtVer2Model, [
+ FeatureX87,
FeatureMMX,
FeatureAVX,
FeatureFXSR,
@@ -599,11 +635,13 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastPartialYMMWrite
]>;
// Bulldozer
def : Proc<"bdver1", [
+ FeatureX87,
FeatureXOP,
FeatureFMA4,
FeatureCMPXCHG16B,
@@ -622,6 +660,7 @@ def : Proc<"bdver1", [
]>;
// Piledriver
def : Proc<"bdver2", [
+ FeatureX87,
FeatureXOP,
FeatureFMA4,
FeatureCMPXCHG16B,
@@ -645,6 +684,7 @@ def : Proc<"bdver2", [
// Steamroller
def : Proc<"bdver3", [
+ FeatureX87,
FeatureXOP,
FeatureFMA4,
FeatureCMPXCHG16B,
@@ -670,6 +710,7 @@ def : Proc<"bdver3", [
// Excavator
def : Proc<"bdver4", [
+ FeatureX87,
FeatureMMX,
FeatureAVX2,
FeatureFXSR,
@@ -689,15 +730,17 @@ def : Proc<"bdver4", [
FeatureFMA,
FeatureXSAVEOPT,
FeatureFSGSBase,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMWAITX
]>;
-def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>;
+def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
-def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>;
+def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE1, FeatureFXSR]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -710,8 +753,8 @@ def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>;
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel,
- [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit,
- FeatureSlowBTMem ]>;
+ [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR,
+ Feature64Bit, FeatureSlowBTMem ]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 2170e62e30fd..67e51f1e9194 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -17,7 +17,6 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineValueType.h"
@@ -28,6 +27,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
@@ -50,6 +50,9 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<X86Subtarget>();
SMShadowTracker.startFunction(MF);
+ CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+ *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegisterInfo(),
+ MF.getContext()));
SetupMachineFunction(MF);
@@ -66,6 +69,9 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Emit the rest of the function body.
EmitFunctionBody();
+ // Emit the XRay table for this function.
+ EmitXRayTable();
+
// We didn't modify anything.
return false;
}
@@ -85,11 +91,8 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
const GlobalValue *GV = MO.getGlobal();
MCSymbol *GVSym;
- if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
- GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub");
- else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
- MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
- MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
GVSym = P.getSymbol(GV);
@@ -107,21 +110,6 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
- } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
- MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
- Sym);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
- } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
- MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
}
// If the name begins with a dollar-sign, enclose it in parens. We do this
@@ -145,7 +133,6 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
- case X86II::MO_DARWIN_STUB:
// These affect the name of the symbol, not any suffix.
break;
case X86II::MO_GOT_ABSOLUTE_ADDRESS:
@@ -155,7 +142,6 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
O << '-';
P.MF->getPICBaseSymbol()->print(O, P.MAI);
break;
@@ -294,7 +280,7 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
unsigned Op, raw_ostream &O,
const char *Modifier = nullptr) {
- assert(isMem(MI, Op) && "Invalid memory reference!");
+ assert(isMem(*MI, Op) && "Invalid memory reference!");
const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
if (Segment.getReg()) {
printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
@@ -535,6 +521,12 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
}
}
OutStreamer->EmitSyntaxDirective();
+
+ // If there is no module-level inline asm and we're in 16-bit mode,
+ // prefix assembly with .code16.
+ bool is16 = TT.getEnvironment() == Triple::CODE16;
+ if (M.getModuleInlineAsm().empty() && is16)
+ OutStreamer->EmitAssemblerFlag(MCAF_Code16);
}
static void
@@ -568,8 +560,9 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
const DataLayout &DL = MF->getDataLayout();
SectionKind Kind = CPE.getSectionKind(&DL);
const Constant *C = CPE.Val.ConstVal;
+ unsigned Align = CPE.Alignment;
if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C))) {
+ getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
if (MCSymbol *Sym = S->getCOMDATSymbol()) {
if (Sym->isUndefined())
OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
@@ -593,30 +586,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// Output stubs for dynamically-linked functions.
MachineModuleInfoMachO::SymbolListTy Stubs;
- Stubs = MMIMacho.GetFnStubList();
- if (!Stubs.empty()) {
- MCSection *TheSection = OutContext.getMachOSection(
- "__IMPORT", "__jump_table",
- MachO::S_SYMBOL_STUBS | MachO::S_ATTR_SELF_MODIFYING_CODE |
- MachO::S_ATTR_PURE_INSTRUCTIONS,
- 5, SectionKind::getMetadata());
- OutStreamer->SwitchSection(TheSection);
-
- for (const auto &Stub : Stubs) {
- // L_foo$stub:
- OutStreamer->EmitLabel(Stub.first);
- // .indirect_symbol _foo
- OutStreamer->EmitSymbolAttribute(Stub.second.getPointer(),
- MCSA_IndirectSymbol);
- // hlt; hlt; hlt; hlt; hlt hlt = 0xf4.
- const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
- OutStreamer->EmitBytes(StringRef(HltInsts, 5));
- }
-
- Stubs.clear();
- OutStreamer->AddBlankLine();
- }
-
// Output stubs for external and common global variables.
Stubs = MMIMacho.GetGVStubList();
if (!Stubs.empty()) {
@@ -632,20 +601,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->AddBlankLine();
}
- Stubs = MMIMacho.GetHiddenGVStubList();
- if (!Stubs.empty()) {
- MCSection *TheSection = OutContext.getMachOSection(
- "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
- SectionKind::getMetadata());
- OutStreamer->SwitchSection(TheSection);
-
- for (auto &Stub : Stubs)
- emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
-
- Stubs.clear();
- OutStreamer->AddBlankLine();
- }
-
SM.serializeToStackMapSection();
FM.serializeToFaultMapSection();
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 9c8bd98dbade..dcb7b5a3466f 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -29,6 +29,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget;
StackMaps SM;
FaultMaps FM;
+ std::unique_ptr<MCCodeEmitter> CodeEmitter;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
// It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -40,10 +41,11 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// few instruction bytes to cover the shadow are NOPs used for padding.
class StackMapShadowTracker {
public:
- StackMapShadowTracker(TargetMachine &TM);
- ~StackMapShadowTracker();
- void startFunction(MachineFunction &MF);
- void count(MCInst &Inst, const MCSubtargetInfo &STI);
+ void startFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ }
+ void count(MCInst &Inst, const MCSubtargetInfo &STI,
+ MCCodeEmitter *CodeEmitter);
// Called to signal the start of a shadow of RequiredSize bytes.
void reset(unsigned RequiredSize) {
@@ -56,21 +58,40 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// to emit any necessary padding-NOPs.
void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
private:
- TargetMachine &TM;
const MachineFunction *MF;
- std::unique_ptr<MCCodeEmitter> CodeEmitter;
- bool InShadow;
+ bool InShadow = false;
// RequiredShadowSize holds the length of the shadow specified in the most
// recently encountered STACKMAP instruction.
// CurrentShadowSize counts the number of bytes encoded since the most
// recently encountered STACKMAP, stopping when that number is greater than
// or equal to RequiredShadowSize.
- unsigned RequiredShadowSize, CurrentShadowSize;
+ unsigned RequiredShadowSize = 0, CurrentShadowSize = 0;
};
StackMapShadowTracker SMShadowTracker;
+ // This describes the kind of sled we're storing in the XRay table.
+ enum class SledKind : uint8_t {
+ FUNCTION_ENTER = 0,
+ FUNCTION_EXIT = 1,
+ TAIL_CALL = 2,
+ };
+
+ // The table will contain these structs that point to the sled, the function
+ // containing the sled, and what kind of sled (and whether they should always
+ // be instrumented).
+ struct XRayFunctionEntry {
+ const MCSymbol *Sled;
+ const MCSymbol *Function;
+ SledKind Kind;
+ bool AlwaysInstrument;
+ const class Function *Fn;
+ };
+
+ // All the sleds to be emitted.
+ std::vector<XRayFunctionEntry> Sleds;
+
// All instructions emitted by the X86AsmPrinter should use this helper
// method.
//
@@ -82,14 +103,26 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
- public:
- explicit X86AsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this),
- SMShadowTracker(TM) {}
+ // XRay-specific lowering for X86.
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+ X86MCInstLower &MCIL);
+ void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+ // Helper function that emits the XRay sleds we've collected for a particular
+ // function.
+ void EmitXRayTable();
+
+ // Helper function to record a given XRay sled.
+ void recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind);
+public:
+ explicit X86AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
const char *getPassName() const override {
return "X86 Assembly / Object Emitter";
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index fc6ee1752f1f..b16fa76c73fa 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -10,9 +10,9 @@
// This file defines a pass that optimizes call sequences on x86.
// Currently, it converts movs of function parameters onto the stack into
// pushes. This is beneficial for two main reasons:
-// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
// 2) It is possible to push memory arguments directly. So, if the
-// the transformation is preformed pre-reg-alloc, it can help relieve
+// transformation is performed pre-reg-alloc, it can help relieve
// register pressure.
//
//===----------------------------------------------------------------------===//
@@ -21,8 +21,8 @@
#include "X86.h"
#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -55,7 +55,7 @@ private:
struct CallContext {
CallContext()
: FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
- MovVector(4, nullptr), NoStackParams(false), UsePush(false){}
+ MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
// Iterator referring to the frame setup instruction
MachineBasicBlock::iterator FrameSetup;
@@ -75,7 +75,7 @@ private:
// True if this call site has no stack parameters
bool NoStackParams;
- // True of this callsite can use push instructions
+ // True if this call site can use push instructions
bool UsePush;
};
@@ -88,7 +88,7 @@ private:
void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, CallContext &Context);
- bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+ void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
unsigned Reg);
@@ -105,12 +105,14 @@ private:
const TargetInstrInfo *TII;
const X86FrameLowering *TFL;
const X86Subtarget *STI;
- const MachineRegisterInfo *MRI;
+ MachineRegisterInfo *MRI;
+ unsigned SlotSize;
+ unsigned Log2SlotSize;
static char ID;
};
char X86CallFrameOptimization::ID = 0;
-}
+} // end anonymous namespace
FunctionPass *llvm::createX86CallFrameOptimization() {
return new X86CallFrameOptimization();
@@ -123,22 +125,19 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
if (NoX86CFOpt.getValue())
return false;
- // We currently only support call sequences where *all* parameters.
- // are passed on the stack.
- // No point in running this in 64-bit mode, since some arguments are
- // passed in-register in all common calling conventions, so the pattern
- // we're looking for will never match.
- if (STI->is64Bit())
- return false;
-
// We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
// in the compact unwind encoding that Darwin uses. So, bail if there
// is a danger of that being generated.
- if (STI->isTargetDarwin() &&
- (!MF.getMMI().getLandingPads().empty() ||
+ if (STI->isTargetDarwin() &&
+ (!MF.getMMI().getLandingPads().empty() ||
(MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
return false;
+ // It is not valid to change the stack pointer outside the prolog/epilog
+ // on 64-bit Windows.
+ if (STI->isTargetWin64())
+ return false;
+
// You would expect straight-line code between call-frame setup and
// call-frame destroy. You would be wrong. There are circumstances (e.g.
// CMOV_GR8 expansion of a select that feeds a function call!) where we can
@@ -169,10 +168,10 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
return true;
}
-// Check whether this trasnformation is profitable for a particular
+// Check whether this transformation is profitable for a particular
// function - in terms of code size.
-bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
- ContextVector &CallSeqVector) {
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+ ContextVector &CallSeqVector) {
// This transformation is always a win when we do not expect to have
// a reserved call frame. Under other circumstances, it may be either
// a win or a loss, and requires a heuristic.
@@ -180,10 +179,6 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
if (CannotReserveFrame)
return true;
- // Don't do this when not optimizing for size.
- if (!MF.getFunction()->optForSize())
- return false;
-
unsigned StackAlign = TFL->getStackAlignment();
int64_t Advantage = 0;
@@ -206,16 +201,16 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
// We can use pushes. First, account for the fixed costs.
// We'll need a add after the call.
Advantage -= 3;
- // If we have to realign the stack, we'll also need and sub before
+ // If we have to realign the stack, we'll also need a sub before
if (CC.ExpectedDist % StackAlign)
Advantage -= 3;
// Now, for each push, we save ~3 bytes. For small constants, we actually
// save more (up to 5 bytes), but 3 should be a good approximation.
- Advantage += (CC.ExpectedDist / 4) * 3;
+ Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
}
}
- return (Advantage >= 0);
+ return Advantage >= 0;
}
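
For a rough sense of the size heuristic above, here is a minimal standalone sketch of the push branch, assuming a 4-byte slot size and a 16-byte stack alignment; the helper name is hypothetical and not part of the pass:

    #include <cstdint>

    // Approximate per-call-site byte savings, mirroring the accounting above:
    // one 3-byte "add esp, N" after the call, one extra 3-byte "sub" if the
    // call frame is not a multiple of the stack alignment, and ~3 bytes saved
    // for every mov-to-stack that becomes a push.
    int64_t pushAdvantage(int64_t ExpectedDist, unsigned StackAlign = 16,
                          unsigned SlotSize = 4) {
      int64_t Advantage = -3;
      if (ExpectedDist % StackAlign)
        Advantage -= 3;
      Advantage += (ExpectedDist / SlotSize) * 3;
      return Advantage;
    }

    // Example: 16 bytes of stack parameters -> pushAdvantage(16) == -3 + 4*3 == 9.
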
bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
@@ -224,6 +219,12 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
TFL = STI->getFrameLowering();
MRI = &MF.getRegInfo();
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+ SlotSize = RegInfo.getSlotSize();
+ assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
+ Log2SlotSize = Log2_32(SlotSize);
+
if (!isLegal(MF))
return false;
@@ -233,20 +234,23 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
ContextVector CallSeqVector;
- for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode) {
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.getOpcode() == FrameSetupOpcode) {
CallContext Context;
- collectCallInfo(MF, *BB, I, Context);
+ collectCallInfo(MF, MBB, MI, Context);
CallSeqVector.push_back(Context);
}
if (!isProfitable(MF, CallSeqVector))
return false;
- for (auto CC : CallSeqVector)
- if (CC.UsePush)
- Changed |= adjustCallSequence(MF, CC);
+ for (auto CC : CallSeqVector) {
+ if (CC.UsePush) {
+ adjustCallSequence(MF, CC);
+ Changed = true;
+ }
+ }
return Changed;
}
@@ -260,7 +264,8 @@ X86CallFrameOptimization::classifyInstruction(
// The instructions we actually care about are movs onto the stack
int Opcode = MI->getOpcode();
- if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
+ if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr ||
+ Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
return Convert;
// Not all calling conventions have only stack MOVs between the stack
@@ -315,8 +320,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
CallContext &Context) {
// Check that this particular call sequence is amenable to the
// transformation.
- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- STI->getRegisterInfo());
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
// We expect to enter this at the beginning of a call sequence
@@ -326,7 +331,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// How much do we adjust the stack? This puts an upper bound on
// the number of parameters actually passed on it.
- unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+ unsigned int MaxAdjust =
+ FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
// A zero adjustment means no stack parameters
if (!MaxAdjust) {
@@ -340,19 +346,19 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
while (I->getOpcode() == X86::LEA32r)
++I;
- // We expect a copy instruction here.
- // TODO: The copy instruction is a lowering artifact.
- // We should also support a copy-less version, where the stack
- // pointer is used directly.
- if (!I->isCopy() || !I->getOperand(0).isReg())
- return;
- Context.SPCopy = I++;
-
- unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();
+ unsigned StackPtr = RegInfo.getStackRegister();
+ // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
+ // register here. If it's there, use that virtual register as stack pointer
+ // instead.
+ if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() &&
+ I->getOperand(1).getReg() == StackPtr) {
+ Context.SPCopy = &*I++;
+ StackPtr = Context.SPCopy->getOperand(0).getReg();
+ }
// Scan the call setup sequence for the pattern we're looking for.
- // We only handle a simple case - a sequence of MOV32mi or MOV32mr
- // instructions, that push a sequence of 32-bit values onto the stack, with
+ // We only handle a simple case - a sequence of store instructions that
+ // push a sequence of stack-slot-aligned values onto the stack, with
// no gaps between them.
if (MaxAdjust > 4)
Context.MovVector.resize(MaxAdjust, nullptr);
@@ -367,9 +373,9 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
continue;
}
- // We know the instruction is a MOV32mi/MOV32mr.
+ // We know the instruction has a supported store opcode.
// We only want movs of the form:
- // movl imm/r32, k(%esp)
+ // mov imm/reg, k(%StackPtr)
// If we run into something else, bail.
// Note that AddrBaseReg may, counter to its name, not be a register,
// but rather a frame index.
@@ -390,9 +396,9 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
"Negative stack displacement when passing parameters");
// We really don't want to consider the unaligned case.
- if (StackDisp % 4)
+ if (StackDisp & (SlotSize - 1))
return;
- StackDisp /= 4;
+ StackDisp >>= Log2SlotSize;
assert((size_t)StackDisp < Context.MovVector.size() &&
"Function call has more parameters than the stack is adjusted for.");
@@ -400,7 +406,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// If the same stack slot is being filled twice, something's fishy.
if (Context.MovVector[StackDisp] != nullptr)
return;
- Context.MovVector[StackDisp] = I;
+ Context.MovVector[StackDisp] = &*I;
for (const MachineOperand &MO : I->uses()) {
if (!MO.isReg())
@@ -418,14 +424,14 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
if (I == MBB.end() || !I->isCall())
return;
- Context.Call = I;
+ Context.Call = &*I;
if ((++I)->getOpcode() != FrameDestroyOpcode)
return;
// Now, go through the vector, and see that we don't have any gaps,
- // but only a series of 32-bit MOVs.
+ // but only a series of MOVs.
auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
- for (; MMI != MME; ++MMI, Context.ExpectedDist += 4)
+ for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
if (*MMI == nullptr)
break;
@@ -440,10 +446,9 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
Context.UsePush = true;
- return;
}
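
The displacement bookkeeping in collectCallInfo above reduces to a small mapping; a hypothetical standalone sketch, assuming a power-of-two SlotSize (4 on 32-bit targets, 8 on 64-bit):

    #include <cassert>
    #include <cstdint>
    #include <optional>

    // Map a store's stack displacement to its MovVector slot, as done above.
    // The pass asserts displacements are non-negative; an unaligned one makes
    // it give up on the call site, otherwise the slot is Disp / SlotSize.
    std::optional<unsigned> slotIndexForDisp(int64_t StackDisp, unsigned SlotSize) {
      assert(StackDisp >= 0 && "Negative stack displacement");
      if (StackDisp & (SlotSize - 1))
        return std::nullopt;
      return unsigned(StackDisp / SlotSize);
    }

    // With SlotSize == 4, displacements 0, 4, 8 land in slots 0, 1, 2; a store
    // at displacement 6 makes the pass give up on the whole call site.
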
-bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
const CallContext &Context) {
// Ok, we can in fact do the transformation for this call.
// Do not remove the FrameSetup instruction, but adjust the parameters.
@@ -453,15 +458,21 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
DebugLoc DL = FrameSetup->getDebugLoc();
+ bool Is64Bit = STI->is64Bit();
// Now, iterate through the vector in reverse order, and replace the movs
// with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
// replace uses.
- for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+ for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
MachineBasicBlock::iterator Push = nullptr;
- if (MOV->getOpcode() == X86::MOV32mi) {
- unsigned PushOpcode = X86::PUSHi32;
+ unsigned PushOpcode;
+ switch (MOV->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case X86::MOV32mi:
+ case X86::MOV64mi32:
+ PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
// If the operand is a small (8-bit) immediate, we can use a
// PUSH instruction with a shorter encoding.
// Note that isImm() may fail even though this is a MOVmi, because
@@ -469,13 +480,27 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
if (PushOp.isImm()) {
int64_t Val = PushOp.getImm();
if (isInt<8>(Val))
- PushOpcode = X86::PUSH32i8;
+ PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
- .addOperand(PushOp);
- } else {
+ .addOperand(PushOp);
+ break;
+ case X86::MOV32mr:
+ case X86::MOV64mr:
unsigned int Reg = PushOp.getReg();
+ // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
+ // in preparation for the PUSH64. The upper 32 bits can be undef.
+ if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+ unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+ .addReg(UndefReg)
+ .addOperand(PushOp)
+ .addImm(X86::sub_32bit);
+ }
+
// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.
bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
@@ -484,7 +509,8 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// conservative about that.
MachineInstr *DefMov = nullptr;
if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
- Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
+ PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
@@ -492,33 +518,34 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
DefMov->eraseFromParent();
} else {
- Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
- .addReg(Reg)
- .getInstr();
+ PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addReg(Reg)
+ .getInstr();
}
+ break;
}
// For debugging, when using SP-based CFA, we need to adjust the CFA
// offset after each push.
// TODO: This is needed only if we require precise CFA.
if (!TFL->hasFP(MF))
- TFL->BuildCFI(MBB, std::next(Push), DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, 4));
+ TFL->BuildCFI(
+ MBB, std::next(Push), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
MBB.erase(MOV);
}
// The stack-pointer copy is no longer used in the call sequences.
// There should not be any other users, but we can't commit to that, so:
- if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+ if (Context.SPCopy && MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
Context.SPCopy->eraseFromParent();
// Once we've done this, we need to make sure PEI doesn't assume a reserved
// frame.
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
FuncInfo->setHasPushSequences(true);
-
- return true;
}
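
The opcode choice in the loop above boils down to a small decision table; a hypothetical standalone sketch (the returned strings name the X86 push opcodes used in the switch, everything else is illustrative):

    #include <cstdint>

    enum class SrcKind { Imm, Reg, FoldedLoad };

    // Small immediates get the 8-bit push form, other immediates the 32-bit
    // form, plain registers a register push, and foldable loads a
    // push-from-memory, each in its 32- or 64-bit flavour.
    const char *pickPushOpcode(SrcKind Kind, int64_t Imm, bool Is64Bit) {
      switch (Kind) {
      case SrcKind::Imm:
        if (Imm >= -128 && Imm <= 127)
          return Is64Bit ? "PUSH64i8" : "PUSH32i8";
        return Is64Bit ? "PUSH64i32" : "PUSHi32";
      case SrcKind::Reg:
        return Is64Bit ? "PUSH64r" : "PUSH32r";
      case SrcKind::FoldedLoad:
        return Is64Bit ? "PUSH64rmm" : "PUSH32rmm";
      }
      return nullptr;
    }
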
MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
@@ -540,19 +567,20 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
if (!MRI->hasOneNonDBGUse(Reg))
return nullptr;
- MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+ MachineInstr &DefMI = *MRI->getVRegDef(Reg);
// Make sure the def is a MOV from memory.
- // If the def is an another block, give up.
- if (DefMI->getOpcode() != X86::MOV32rm ||
- DefMI->getParent() != FrameSetup->getParent())
+ // If the def is in another block, give up.
+ if ((DefMI.getOpcode() != X86::MOV32rm &&
+ DefMI.getOpcode() != X86::MOV64rm) ||
+ DefMI.getParent() != FrameSetup->getParent())
return nullptr;
// Make sure we don't have any instructions between DefMI and the
// push that make folding the load illegal.
- for (auto I = DefMI; I != FrameSetup; ++I)
+ for (MachineBasicBlock::iterator I = DefMI; I != FrameSetup; ++I)
if (I->isLoadFoldBarrier())
return nullptr;
- return DefMI;
+ return &DefMI;
}
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index ed2e88067168..4cb62b56bce4 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -162,6 +162,9 @@ def RetCC_X86_64_C : CallingConv<[
// MMX vector types are always returned in XMM0.
CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
CCDelegateTo<RetCC_X86Common>
]>;
@@ -192,6 +195,24 @@ def RetCC_X86_64_WebKit_JS : CallingConv<[
CCIfType<[i64], CCAssignToReg<[RAX]>>
]>;
+def RetCC_X86_64_Swift : CallingConv<[
+ // For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[i1], CCPromoteToType<i8>>,
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX, R8]>>,
+
+ // XMM0, XMM1, XMM2 and XMM3 can be used to return FP values.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
// X86-64 AnyReg return-value convention. No explicit register is specified for
// the return-value. The register allocator is allowed and expected to choose
// any free register.
@@ -234,6 +255,9 @@ def RetCC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+ // Handle Swift calls.
+ CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
+
// Handle explicit CC selection
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
@@ -273,6 +297,16 @@ def CC_X86_64_C : CallingConv<[
CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
CCIfNest<CCAssignToReg<[R10]>>,
+ // Pass SwiftSelf in a callee saved register.
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>,
+
+ // A SwiftError is passed in R12.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+ // For Swift Calling Convention, pass sret in %RAX.
+ CCIfCC<"CallingConv::Swift",
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+
// The first 6 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
@@ -770,6 +804,9 @@ def CC_X86_64_Intr : CallingConv<[
// This is the root argument convention for the X86-32 backend.
def CC_X86_32 : CallingConv<[
+ // X86_INTR calling convention is valid on the MCU target and should override the
+ // MCU calling convention. Thus, this should be checked before isTargetMCU().
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
@@ -777,7 +814,6 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
- CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
// Otherwise, drop to normal X86-32 CC
CCDelegateTo<CC_X86_32_C>
@@ -819,6 +855,8 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
+
def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
@@ -852,15 +890,23 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
(sequence "XMM%u", 0, 15))>;
def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
- EDI, ESP)>;
+ EDI)>;
def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
(sequence "XMM%u", 0, 7))>;
-
-def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP,
- (sequence "XMM%u", 16, 31))>;
-def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP,
- (sequence "YMM%u", 0, 31)),
+def CSR_32_AllRegs_AVX : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "YMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "ZMM%u", 0, 7),
+ (sequence "K%u", 0, 7))>;
+
+def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "YMM%u", 0, 15)),
(sequence "XMM%u", 0, 15))>;
+def CSR_64_AllRegs_AVX512 : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+ (sequence "ZMM%u", 0, 31),
+ (sequence "K%u", 0, 7)),
+ (sequence "XMM%u", 0, 15))>;
// Standard C + YMM6-15
def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index a09d06519376..093fed7276f7 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -44,10 +44,16 @@ public:
const X86Subtarget *STI;
const X86InstrInfo *TII;
const X86RegisterInfo *TRI;
+ const X86MachineFunctionInfo *X86FI;
const X86FrameLowering *X86FL;
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "X86 pseudo instruction expansion pass";
}
@@ -83,11 +89,18 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Adjust stack pointer.
int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+ // Incorporate the retaddr area.
+ Offset = StackAdj - MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
- if (StackAdj) {
+ if (Offset) {
// Check for possible merge with preceding ADD instruction.
- StackAdj += X86FL->mergeSPUpdates(MBB, MBBI, true);
- X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+ X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
}
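
A small worked example of the Offset computation above, with hypothetical values: if the function reserved 8 bytes below the return address for a tail call, getTCReturnAddrDelta() is -8, so even a TCRETURN with StackAdjust 0 still bumps the stack pointer by 8:

    #include <cassert>

    // Mirrors the adjustment above: MaxTCDelta is never positive, so
    // subtracting it folds the reserved return-address area into the update.
    int tcReturnSPAdjust(int StackAdj, int MaxTCDelta) {
      assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
      int Offset = StackAdj - MaxTCDelta;
      assert(Offset >= 0 && "Offset should never be negative");
      return Offset; // e.g. tcReturnSPAdjust(0, -8) == 8
    }
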
// Jump to label or value in register.
@@ -121,8 +134,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addReg(JumpTarget.getReg(), RegState::Kill);
}
- MachineInstr *NewMI = std::prev(MBBI);
- NewMI->copyImplicitOps(*MBBI->getParent()->getParent(), MBBI);
+ MachineInstr &NewMI = *std::prev(MBBI);
+ NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -152,6 +165,32 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
+ case X86::RET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ MachineInstrBuilder MIB;
+ if (StackAdj == 0) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
+ } else if (isUInt<16>(StackAdj)) {
+ MIB = BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(StackAdj);
+ } else {
+ assert(!STI->is64Bit() &&
+ "shouldn't need to do this for x86_64 targets!");
+ // A ret can only handle immediates as big as 2**16-1. If we need to pop
+ // off bytes before the return address, we must do it manually.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+ BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
+ MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
+ }
+ for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
+ MIB.addOperand(MBBI->getOperand(I));
+ MBB.erase(MBBI);
+ return true;
+ }
case X86::EH_RESTORE: {
// Restore ESP and EBP, and optionally ESI if required.
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
@@ -160,6 +199,38 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualcmpxchg Addr
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(6);
+ unsigned SaveRbx = MBBI->getOperand(7).getReg();
+
+ unsigned ActualInArg =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
+ InArg.isKill());
+ // Create the actual instruction.
+ unsigned ActualOpc =
+ Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
+ // Copy the operands related to the address.
+ for (unsigned Idx = 1; Idx < 6; ++Idx)
+ NewInstr->addOperand(MBBI->getOperand(Idx));
+ // Finally, restore the value of RBX.
+ TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
+ /*SrcIsKill*/ true);
+
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
}
llvm_unreachable("Previous switch has a fallthrough?");
}
@@ -184,6 +255,7 @@ bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
+ X86FI = MF.getInfo<X86MachineFunctionInfo>();
X86FL = STI->getFrameLowering();
bool Modified = false;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index f48b47934e03..dfe3c80be21d 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -22,7 +22,6 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -30,6 +29,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
@@ -82,7 +82,8 @@ public:
#include "X86GenFastISel.inc"
private:
- bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
+ bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
+ const DebugLoc &DL);
bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
unsigned &ResultReg, unsigned Alignment = 1);
@@ -347,6 +348,11 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
+ bool HasSSE41 = Subtarget->hasSSE41();
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX2 = Subtarget->hasAVX2();
+ bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
@@ -372,7 +378,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
break;
case MVT::f32:
if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
@@ -381,7 +387,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
break;
case MVT::f64:
if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
@@ -392,29 +398,91 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
// No f80 support yet.
return false;
case MVT::v4f32:
- if (Alignment >= 16)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm;
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+ Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm;
+ Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
RC = &X86::VR128RegClass;
break;
case MVT::v2f64:
- if (Alignment >= 16)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm;
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+ Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm;
+ Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
RC = &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- if (Alignment >= 16)
- Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm;
+ if (IsNonTemporal && Alignment >= 16)
+ Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ else if (Alignment >= 16)
+ Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
else
- Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm;
+ Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
RC = &X86::VR128RegClass;
break;
+ case MVT::v8f32:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = X86::VMOVNTDQAYrm;
+ else
+ Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
+ RC = &X86::VR256RegClass;
+ break;
+ case MVT::v4f64:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = X86::VMOVNTDQAYrm;
+ else
+ Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
+ RC = &X86::VR256RegClass;
+ break;
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ assert(HasAVX);
+ if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+ Opc = X86::VMOVNTDQAYrm;
+ else
+ Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
+ RC = &X86::VR256RegClass;
+ break;
+ case MVT::v16f32:
+ assert(Subtarget->hasAVX512());
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
+ RC = &X86::VR512RegClass;
+ break;
+ case MVT::v8f64:
+ assert(Subtarget->hasAVX512());
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
+ RC = &X86::VR512RegClass;
+ break;
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ assert(Subtarget->hasAVX512());
+ // Note: There are a lot more choices based on type with AVX-512, but
+ // there's really no advantage when the load isn't masked.
+ if (IsNonTemporal && Alignment >= 64)
+ Opc = X86::VMOVNTDQAZrm;
+ else
+ Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
+ RC = &X86::VR512RegClass;
+ break;
}
ResultReg = createResultReg(RC);
@@ -507,12 +575,70 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
else
Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
} else
- Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ break;
+ case MVT::v8f32:
+ assert(HasAVX);
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr;
+ else
+ Opc = X86::VMOVUPSYmr;
+ break;
+ case MVT::v4f64:
+ assert(HasAVX);
+ if (Aligned) {
+ Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr;
+ } else
+ Opc = X86::VMOVUPDYmr;
+ break;
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ assert(HasAVX);
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr;
+ else
+ Opc = X86::VMOVDQUYmr;
+ break;
+ case MVT::v16f32:
+ assert(Subtarget->hasAVX512());
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
+ else
+ Opc = X86::VMOVUPSZmr;
+ break;
+ case MVT::v8f64:
+ assert(Subtarget->hasAVX512());
+ if (Aligned) {
+ Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
+ } else
+ Opc = X86::VMOVUPDZmr;
+ break;
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ assert(Subtarget->hasAVX512());
+ // Note: There are a lot more choices based on type with AVX-512, but
+ // there's really no advantage when the store isn't masked.
+ if (Aligned)
+ Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
+ else
+ Opc = X86::VMOVDQU64Zmr;
break;
}
+ const MCInstrDesc &Desc = TII.get(Opc);
+ // Some of the instructions in the previous switch use FR128 instead
+ // of FR32 for ValReg. Make sure the register we feed the instruction
+ // matches its register class constraints.
+ // Note: A copy from FR32 to FR128 is fine; they are the same
+ // registers behind the scenes, which is why this did not trigger
+ // any bugs before.
+ ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
if (MMO)
MIB->addMemOperand(*FuncInfo.MF, MMO);
@@ -598,7 +724,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
AM.GV = GV;
// Allow the subtarget to classify the global.
- unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
// If this reference is relative to the pic base, set it now.
if (isGlobalRelativeToPICBase(GVFlags)) {
@@ -831,9 +957,8 @@ redo_gep:
// our address and just match the value instead of completely failing.
AM = SavedAM;
- for (SmallVectorImpl<const Value *>::reverse_iterator
- I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
- if (handleConstantAddresses(*I, AM))
+ for (const Value *I : reverse(GEPs))
+ if (handleConstantAddresses(I, AM))
return true;
return false;
@@ -938,10 +1063,8 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
// base and index registers are unused.
assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
- } else if (Subtarget->isPICStyleStubPIC()) {
- AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
- } else if (Subtarget->isPICStyleGOT()) {
- AM.GVOpFlags = X86II::MO_GOTOFF;
+ } else {
+ AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
}
return true;
@@ -972,6 +1095,21 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (S->isAtomic())
return false;
+ const Value *PtrV = I->getOperand(1);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
const Value *Val = S->getValueOperand();
const Value *Ptr = S->getPointerOperand();
@@ -1002,6 +1140,10 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ if (TLI.supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
if (TLI.supportSplitCSR(FuncInfo.MF))
return false;
@@ -1009,14 +1151,14 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
CC != CallingConv::X86_FastCall &&
- CC != CallingConv::X86_64_SysV)
+ CC != CallingConv::X86_StdCall &&
+ CC != CallingConv::X86_ThisCall &&
+ CC != CallingConv::X86_64_SysV &&
+ CC != CallingConv::X86_64_Win64)
return false;
- if (Subtarget->isCallingConvWin64(CC))
- return false;
-
- // Don't handle popping bytes on return for now.
- if (X86MFInfo->getBytesToPopOnReturn() != 0)
+ // Don't handle popping bytes if they don't fit the ret's immediate.
+ if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
@@ -1101,11 +1243,14 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
}
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (F.hasStructRetAttr()) {
+ if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
@@ -1116,9 +1261,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
}
// Now emit the RET.
- MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ MachineInstrBuilder MIB;
+ if (X86MFInfo->getBytesToPopOnReturn()) {
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
+ .addImm(X86MFInfo->getBytesToPopOnReturn());
+ } else {
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ }
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
@@ -1133,6 +1284,21 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
if (LI->isAtomic())
return false;
+ const Value *SV = I->getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return false;
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return false;
+ }
+ }
+
MVT VT;
if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
return false;
@@ -1204,8 +1370,8 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
}
}
-bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
- EVT VT, DebugLoc CurDbgLoc) {
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
+ const DebugLoc &CurDbgLoc) {
unsigned Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
@@ -1244,6 +1410,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
+ if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512())
+ return false;
+
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
@@ -2294,8 +2463,10 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// register class VR128 by method 'constrainOperandRegClass' which is
// directly called by 'fastEmitInst_ri'.
// Instruction VCVTPS2PHrr takes an extra immediate operand which is
- // used to provide rounding control.
- InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0);
+ // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
+ // It's consistent with the other FP instructions, which are usually
+ // controlled by MXCSR.
+ InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
// Move the lower 32-bits of ResultReg to another register of class GR32.
ResultReg = createResultReg(&X86::GR32RegClass);
@@ -2477,7 +2648,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
// is not generated by FastISel yet.
// FIXME: Update this code once tablegen can handle it.
- static const unsigned SqrtOpc[2][2] = {
+ static const uint16_t SqrtOpc[2][2] = {
{X86::SQRTSSr, X86::VSQRTSSr},
{X86::SQRTSDr, X86::VSQRTSDr}
};
@@ -2577,7 +2748,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
unsigned ResultReg = 0;
// Check if we have an immediate version.
if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
- static const unsigned Opc[2][4] = {
+ static const uint16_t Opc[2][4] = {
{ X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
{ X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
};
@@ -2607,9 +2778,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
// it manually.
if (BaseOpc == X86ISD::UMUL && !ResultReg) {
- static const unsigned MULOpc[] =
+ static const uint16_t MULOpc[] =
{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
- static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
// First copy the first operand into RAX, which is an implicit input to
// the X86::MUL*r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2618,7 +2789,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
- static const unsigned MULOpc[] =
+ static const uint16_t MULOpc[] =
{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
if (VT == MVT::i8) {
// Copy the first operand into AL, which is an implicit input to the
@@ -2671,7 +2842,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!isTypeLegal(RetTy, VT))
return false;
- static const unsigned CvtOpc[2][2][2] = {
+ static const uint16_t CvtOpc[2][2][2] = {
{ { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
{ X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
{ { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
@@ -2742,6 +2913,8 @@ bool X86FastISel::fastLowerArguments() {
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
@@ -2809,9 +2982,9 @@ bool X86FastISel::fastLowerArguments() {
return true;
}
-static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
- CallingConv::ID CC,
- ImmutableCallSite *CS) {
+static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
+ CallingConv::ID CC,
+ ImmutableCallSite *CS) {
if (Subtarget->is64Bit())
return 0;
if (Subtarget->getTargetTriple().isOSMSVCRT())
@@ -2849,7 +3022,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::WebKit_JS:
+ case CallingConv::Swift:
case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
case CallingConv::X86_64_Win64:
case CallingConv::X86_64_SysV:
break;
@@ -2873,10 +3049,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (CLI.CS && CLI.CS->hasInAllocaArgument())
return false;
- // Fast-isel doesn't know about callee-pop yet.
- if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
- TM.Options.GuaranteedTailCallOpt))
- return false;
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isSwiftError())
+ return false;
SmallVector<MVT, 16> OutVTs;
SmallVector<unsigned, 16> ArgRegs;
@@ -2964,6 +3139,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CCValAssign::SExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
+
+ if (ArgVT.SimpleTy == MVT::i1)
+ return false;
+
bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
@@ -2973,6 +3152,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CCValAssign::ZExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
+
+ // Handle zero-extension from i1 to i8, which is common.
+ if (ArgVT.SimpleTy == MVT::i1) {
+ // Set the high bits to zero.
+ ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
+ ArgVT = MVT::i8;
+
+ if (ArgReg == 0)
+ return false;
+ }
+
bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
ArgVT, ArgReg);
assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
@@ -3113,25 +3303,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
// See if we need any target-specific flags on the GV operand.
- unsigned char OpFlags = 0;
-
- // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
- // external symbols most go through the PLT in PIC mode. If the symbol
- // has hidden or protected visibility, or if it is static or local, then
- // we don't need to use the PLT - we can directly call it.
- if (Subtarget->isTargetELF() &&
- TM.getRelocationModel() == Reloc::PIC_ &&
- GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- !GV->isStrongDefinitionForLinker() &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- }
+ unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
+ // Ignore NonLazyBind attribute in FastISel
+ if (OpFlags == X86II::MO_GOTPCREL)
+ OpFlags = 0;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
if (Symbol)
@@ -3157,7 +3332,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_END
unsigned NumBytesForCalleeToPop =
- computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
+ X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+ TM.Options.GuaranteedTailCallOpt)
+ ? NumBytes // Callee pops everything.
+ : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesForCalleeToPop);
@@ -3398,17 +3576,13 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- unsigned char OpFlag = 0;
- if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
+ if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- } else if (Subtarget->isPICStyleGOT()) {
- OpFlag = X86II::MO_GOTOFF;
+ else if (OpFlag == X86II::MO_GOTOFF)
PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- } else if (Subtarget->isPICStyleRIPRel() &&
- TM.getCodeModel() == CodeModel::Small) {
+ else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
PICBase = X86::RIP;
- }
// Create the load from the constant pool.
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
@@ -3572,7 +3746,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
AM.getFullAddress(AddrOps);
MachineInstr *Result = XII.foldMemoryOperandImpl(
- *FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+ *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
/*AllowCommute=*/true);
if (!Result)
return false;
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
new file mode 100644
index 000000000000..90e758dc2e02
--- /dev/null
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -0,0 +1,371 @@
+//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that looks through the machine instructions
+/// late in the compilation, and finds byte or word instructions that
+/// can be profitably replaced with 32 bit instructions that give equivalent
+/// results for the bits of the results that are used. There are two possible
+/// reasons to do this.
+///
+/// One reason is to avoid false-dependences on the upper portions
+/// of the registers. Only instructions that have a destination register
+/// which is not in any of the source registers can be affected by this.
+/// Any instruction where one of the source registers is also the destination
+/// register is unaffected, because it has a true dependence on the source
+/// register already. So, this consideration primarily affects load
+/// instructions and register-to-register moves. It would
+/// seem like cmov(s) would also be affected, but because of the way cmov is
+/// really implemented by most machines as reading both the destination and
+/// source registers, and then "merging" the two based on a condition,
+/// it really already should be considered as having a true dependence on the
+/// destination register as well.
+///
+/// The other reason to do this is for potential code size savings. Word
+/// operations need an extra override byte compared to their 32 bit
+/// versions. So this can convert many word operations to their larger
+/// size, saving a byte in encoding. This could introduce partial register
+/// dependences where none existed however. As an example take:
+/// orw ax, $0x1000
+/// addw ax, $3
+/// now if this were to get transformed into
+/// orw ax, $0x1000
+/// addl eax, $3
+/// because the addl encodes shorter than the addw, this would introduce
+/// a use of a register that was only partially written earlier. On older
+/// Intel processors this can be quite a performance penalty, so this should
+/// probably only be done when it can be proven that a new partial dependence
+/// wouldn't be created, or when you know a newer processor is being
+/// targeted, or when optimizing for minimum code size.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
+#define FIXUPBW_NAME "x86-fixup-bw-insts"
+
+#define DEBUG_TYPE FIXUPBW_NAME
+
+// Option to allow this optimization pass to have fine-grained control.
+// The pass is enabled by default; this flag allows it to be turned off, for
+// example when investigating its effect on existing lit tests.
+static cl::opt<bool>
+ FixupBWInsts("fixup-byte-word-insts",
+ cl::desc("Change byte and word instructions to larger sizes"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+class FixupBWInstPass : public MachineFunctionPass {
+ /// Loop over all of the instructions in the basic block replacing applicable
+ /// byte or word instructions with better alternatives.
+ void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ /// This sets the \p SuperDestReg to the 32 bit super reg of the original
+ /// destination register of the MachineInstr passed in. It returns true if
+ /// that super register is dead just prior to \p OrigMI, and false if not.
+ bool getSuperRegDestIfDead(MachineInstr *OrigMI,
+ unsigned &SuperDestReg) const;
+
+ /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
+ /// register if it is safe to do so. Return the replacement instruction if
+ /// OK, otherwise return nullptr.
+ MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const;
+
+ /// Change the MachineInstr \p MI into the equivalent 32-bit copy if it is
+ /// safe to do so. Return the replacement instruction if OK, otherwise return
+ /// nullptr.
+ MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+
+ // Change the MachineInstr \p MI into an equivalent 32 bit instruction if
+ // possible. Return the replacement instruction if OK, return nullptr
+ // otherwise. Set WasCandidate to true or false depending on whether the
+ // MI was a candidate for this sort of transformation.
+ MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
+ bool &WasCandidate) const;
+public:
+ static char ID;
+
+ const char *getPassName() const override {
+ return FIXUPBW_DESC;
+ }
+
+ FixupBWInstPass() : MachineFunctionPass(ID) {
+ initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
+ // guide some heuristics.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Loop over all of the basic blocks, replacing byte and word instructions by
+ /// equivalent 32 bit instructions where performance or code size can be
+ /// improved.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+private:
+ MachineFunction *MF;
+
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII;
+
+ /// Local member for function's OptForSize attribute.
+ bool OptForSize;
+
+ /// Machine loop info used for guiding some heuristics.
+ MachineLoopInfo *MLI;
+
+ /// Register Liveness information after the current instruction.
+ LivePhysRegs LiveRegs;
+};
+char FixupBWInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
+
+FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
+
+bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
+ if (!FixupBWInsts || skipFunction(*MF.getFunction()))
+ return false;
+
+ this->MF = &MF;
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ OptForSize = MF.getFunction()->optForSize();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ LiveRegs.init(&TII->getRegisterInfo());
+
+ DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+
+ // Process all basic blocks.
+ for (auto &MBB : MF)
+ processBasicBlock(MF, MBB);
+
+ DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+
+ return true;
+}
+
+// TODO: This method of analysis can miss some legal cases, because the
+// super-register could be live into the address expression for a memory
+// reference for the instruction, and still be killed/last used by the
+// instruction. However, the existing query interfaces don't seem to
+// easily allow that to be checked.
+//
+// What we'd really like to know is whether after OrigMI, the
+// only portion of SuperDestReg that is alive is the portion that
+// was the destination register of OrigMI.
+bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
+ unsigned &SuperDestReg) const {
+ auto *TRI = &TII->getRegisterInfo();
+
+ unsigned OrigDestReg = OrigMI->getOperand(0).getReg();
+ SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
+
+ const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
+
+ // Make sure that the sub-register that this instruction has as its
+ // destination is the lowest order sub-register of the super-register.
+ // If it isn't, then the register isn't really dead even if the
+ // super-register is considered dead.
+ if (SubRegIdx == X86::sub_8bit_hi)
+ return false;
+
+ if (LiveRegs.contains(SuperDestReg))
+ return false;
+
+ if (SubRegIdx == X86::sub_8bit) {
+ // In the case of byte registers, we also have to check that the upper
+ // byte register is also dead. That is considered to be independent of
+ // whether the super-register is dead.
+ unsigned UpperByteReg =
+ getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
+
+ if (LiveRegs.contains(UpperByteReg))
+ return false;
+ }
+
+ return true;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
+ MachineInstr *MI) const {
+ unsigned NewDestReg;
+
+ // We are going to try to rewrite this load to a larger zero-extending
+ // load. This is safe if all portions of the 32 bit super-register
+ // of the original destination register, except for the original destination
+ // register, are dead. getSuperRegDestIfDead checks that.
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ // Safe to change the instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
+ assert(MI->getNumExplicitOperands() == 2);
+ auto &OldDest = MI->getOperand(0);
+ auto &OldSrc = MI->getOperand(1);
+
+ unsigned NewDestReg;
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
+
+ // This is only correct if we access the same subregister index: otherwise,
+ // we could try to replace "movb %ah, %al" with "movl %eax, %eax".
+ auto *TRI = &TII->getRegisterInfo();
+ if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
+ TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
+ return nullptr;
+
+ // Safe to change the instruction.
+ // Don't set src flags, as we don't know if we're also killing the superreg.
+ // However, the superregister might not be defined; make it explicit that
+ // we don't care about the higher bits by reading it as Undef, and adding
+ // an imp-use on the original subregister.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(X86::MOV32rr), NewDestReg)
+ .addReg(NewSrcReg, RegState::Undef)
+ .addReg(OldSrc.getReg(), RegState::Implicit);
+
+ // Drop imp-defs/uses that would be redundant with the new def/use.
+ for (auto &Op : MI->implicit_operands())
+ if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
+ MIB.addOperand(Op);
+
+ return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceInstr(
+ MachineInstr *MI, MachineBasicBlock &MBB,
+ bool &WasCandidate) const {
+ MachineInstr *NewMI = nullptr;
+ WasCandidate = false;
+
+ // See if this is an instruction of the type we are currently looking for.
+ switch (MI->getOpcode()) {
+
+ case X86::MOV8rm:
+ // Only replace 8 bit loads with the zero extending versions if
+ // in an innermost loop and not optimizing for size. This takes
+ // an extra byte to encode, and provides limited performance upside.
+ if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
+ if (ML->begin() == ML->end() && !OptForSize) {
+ NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI);
+ WasCandidate = true;
+ }
+ }
+ break;
+
+ case X86::MOV16rm:
+ // Always try to replace a 16 bit load with a 32 bit zero-extending load.
+ // Code size is the same, and there is sometimes a perf advantage
+ // from eliminating a false dependence on the upper portion of
+ // the register.
+ NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI);
+ WasCandidate = true;
+ break;
+
+ case X86::MOV8rr:
+ case X86::MOV16rr:
+ // Always try to replace 8/16 bit copies with a 32 bit copy.
+ // Code size is either less (16) or equal (8), and there is sometimes a
+ // perf advantage from eliminating a false dependence on the upper portion
+ // of the register.
+ NewMI = tryReplaceCopy(MI);
+ WasCandidate = true;
+ break;
+
+ default:
+ // nothing to do here.
+ break;
+ }
+
+ return NewMI;
+}
+
+void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+
+ // This algorithm doesn't delete the instructions it is replacing
+ // right away. By leaving the existing instructions in place, the
+ // register liveness information doesn't change, which keeps the
+ // analysis more accurate than if the replaced instructions
+ // were immediately removed.
+ //
+ // This algorithm always creates a replacement instruction
+ // and notes that and the original in a data structure, until the
+ // whole BB has been analyzed. This keeps the replacement instructions
+ // from making it seem as if the larger register might be live.
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements;
+
+ // Start computing liveness for this block. We iterate from the end to be able
+ // to update this for each instruction.
+ LiveRegs.clear();
+ // We run after PEI, so we need to AddPristinesAndCSRs.
+ LiveRegs.addLiveOuts(MBB);
+
+ bool WasCandidate = false;
+
+ for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
+ MachineInstr *MI = &*I;
+
+ MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate);
+
+ // Add this to replacements if it was a candidate, even if NewMI is
+ // nullptr. We will revisit that in a bit.
+ if (WasCandidate) {
+ MIReplacements.push_back(std::make_pair(MI, NewMI));
+ }
+
+ // We're done with this instruction, update liveness for the next one.
+ LiveRegs.stepBackward(*MI);
+ }
+
+ while (!MIReplacements.empty()) {
+ MachineInstr *MI = MIReplacements.back().first;
+ MachineInstr *NewMI = MIReplacements.back().second;
+ MIReplacements.pop_back();
+ if (NewMI) {
+ MBB.insert(MI, NewMI);
+ MBB.erase(MI);
+ }
+ }
+}
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 1dd69e8a6a5f..013ee249a60f 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -92,6 +92,12 @@ public:
/// if needed and when possible.
bool runOnMachineFunction(MachineFunction &MF) override;
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
private:
MachineFunction *MF;
const X86InstrInfo *TII; // Machine instruction info.
@@ -104,22 +110,22 @@ char FixupLEAPass::ID = 0;
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
- MachineInstr *MI = MBBI;
- MachineInstr *NewMI;
- switch (MI->getOpcode()) {
+ MachineInstr &MI = *MBBI;
+ switch (MI.getOpcode()) {
case X86::MOV32rr:
case X86::MOV64rr: {
- const MachineOperand &Src = MI->getOperand(1);
- const MachineOperand &Dest = MI->getOperand(0);
- NewMI = BuildMI(*MF, MI->getDebugLoc(),
- TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r
- : X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src)
- .addImm(1)
- .addReg(0)
- .addImm(0)
- .addReg(0);
+ const MachineOperand &Src = MI.getOperand(1);
+ const MachineOperand &Dest = MI.getOperand(0);
+ MachineInstr *NewMI =
+ BuildMI(*MF, MI.getDebugLoc(),
+ TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
+ : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0);
MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;
}
@@ -135,7 +141,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
- if (!MI->getOperand(2).isImm()) {
+ if (!MI.getOperand(2).isImm()) {
// convertToThreeAddress will call getImm()
// which requires isImm() to be true
return nullptr;
@@ -143,19 +149,22 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
break;
case X86::ADD16rr:
case X86::ADD16rr_DB:
- if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+ if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) {
// if src1 != src2, then convertToThreeAddress will
// need to create a Virtual register, which we cannot do
// after register allocation.
return nullptr;
}
}
- return TII->convertToThreeAddress(MFI, MBBI, nullptr);
+ return TII->convertToThreeAddress(MFI, MI, nullptr);
}
FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ if (skipFunction(*Func.getFunction()))
+ return false;
+
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
@@ -178,10 +187,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
FixupLEAPass::RegUsageState
FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
- for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
- MachineOperand &opnd = MI->getOperand(i);
+ for (unsigned int i = 0; i < MI.getNumOperands(); ++i) {
+ MachineOperand &opnd = MI.getOperand(i);
if (opnd.isReg() && opnd.getReg() == p.getReg()) {
if (opnd.isDef())
return RU_Write;
@@ -227,10 +236,10 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return CurInst;
}
InstrDistance += TII->getInstrLatency(
- MF->getSubtarget().getInstrItineraryData(), CurInst);
+ MF->getSubtarget().getInstrItineraryData(), *CurInst);
Found = getPreviousInstr(CurInst, MFI);
}
- return nullptr;
+ return MachineBasicBlock::iterator();
}
static inline bool isLEA(const int opcode) {
@@ -241,28 +250,28 @@ static inline bool isLEA(const int opcode) {
/// isLEASimpleIncOrDec - Does this LEA have one these forms:
/// lea %reg, 1(%reg)
/// lea %reg, -1(%reg)
-static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) {
- unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg();
- unsigned DstReg = LEA->getOperand(0).getReg();
+static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) {
+ unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg();
+ unsigned DstReg = LEA.getOperand(0).getReg();
unsigned AddrDispOp = 1 + X86::AddrDisp;
return SrcReg == DstReg &&
- LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
- LEA->getOperand(AddrDispOp).isImm() &&
- (LEA->getOperand(AddrDispOp).getImm() == 1 ||
- LEA->getOperand(AddrDispOp).getImm() == -1);
+ LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
+ LEA.getOperand(AddrDispOp).isImm() &&
+ (LEA.getOperand(AddrDispOp).getImm() == 1 ||
+ LEA.getOperand(AddrDispOp).getImm() == -1);
}
bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) const {
- MachineInstr *MI = I;
- int Opcode = MI->getOpcode();
+ MachineInstr &MI = *I;
+ int Opcode = MI.getOpcode();
if (!isLEA(Opcode))
return false;
if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
int NewOpcode;
- bool isINC = MI->getOperand(4).getImm() == 1;
+ bool isINC = MI.getOperand(4).getImm() == 1;
switch (Opcode) {
case X86::LEA16r:
NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
@@ -277,9 +286,9 @@ bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
}
MachineInstr *NewMI =
- BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1));
+ BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1));
MFI->erase(I);
I = static_cast<MachineBasicBlock::iterator>(NewMI);
return true;
@@ -290,17 +299,16 @@ bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
// Process a load, store, or LEA instruction.
- MachineInstr *MI = I;
- int opcode = MI->getOpcode();
- const MCInstrDesc &Desc = MI->getDesc();
- int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
+ MachineInstr &MI = *I;
+ const MCInstrDesc &Desc = MI.getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags);
if (AddrOffset >= 0) {
AddrOffset += X86II::getOperandBias(Desc);
- MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg);
if (p.isReg() && p.getReg() != X86::ESP) {
seekLEAFixup(p, I, MFI);
}
- MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg);
if (q.isReg() && q.getReg() != X86::ESP) {
seekLEAFixup(q, I, MFI);
}
@@ -311,7 +319,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
- if (MBI) {
+ if (MBI != MachineBasicBlock::iterator()) {
MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
@@ -328,19 +336,19 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
- MachineInstr *MI = I;
- const int opcode = MI->getOpcode();
+ MachineInstr &MI = *I;
+ const int opcode = MI.getOpcode();
if (!isLEA(opcode))
return;
- if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
+ if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
return;
- const unsigned DstR = MI->getOperand(0).getReg();
- const unsigned SrcR1 = MI->getOperand(1).getReg();
- const unsigned SrcR2 = MI->getOperand(3).getReg();
+ const unsigned DstR = MI.getOperand(0).getReg();
+ const unsigned SrcR1 = MI.getOperand(1).getReg();
+ const unsigned SrcR2 = MI.getOperand(3).getReg();
if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
return;
- if (MI->getOperand(2).getImm() > 1)
+ if (MI.getOperand(2).getImm() > 1)
return;
int addrr_opcode, addri_opcode;
switch (opcode) {
@@ -363,12 +371,12 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI->getOperand(0);
+ const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode))
+ const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
+ const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
.addOperand(Dst)
.addOperand(Src1)
.addOperand(Src2);
@@ -376,12 +384,12 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
- if (MI->getOperand(4).getImm() != 0) {
- const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode))
+ if (MI.getOperand(4).getImm() != 0) {
+ const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
+ NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
.addOperand(Dst)
.addOperand(SrcR)
- .addImm(MI->getOperand(4).getImm());
+ .addImm(MI.getOperand(4).getImm());
MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp
new file mode 100644
index 000000000000..fb317da95355
--- /dev/null
+++ b/lib/Target/X86/X86FixupSetCC.cpp
@@ -0,0 +1,186 @@
+//===---- X86FixupSetCC.cpp - fix zero-extension of setcc patterns -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that fixes zero-extension of setcc patterns.
+// X86 setcc instructions are modeled to have no input arguments, and a single
+// GR8 output argument. This is consistent with other similar instructions
+// (e.g. movb), but means it is impossible to directly generate a setcc into
+// the lower GR8 of a specified GR32.
+// This means that ISel must select (zext (setcc)) into something like
+// seta %al; movzbl %al, %eax.
+// Unfortunately, this can cause a stall due to the partial register write
+// performed by the setcc. Instead, we can use:
+// xor %eax, %eax; seta %al
+// This both avoids the stall and is shorter to encode.
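+//
+// A sketch of the rewrite performed below (register numbers and register
+// classes are illustrative only):
+//   %zero:gr32 = MOV32r0                 ; inserted before the EFLAGS def
+//   ...                                  ; the instruction defining EFLAGS
+//   %cc:gr8 = SETAr                      ; the original setcc
+//   %res:gr32 = INSERT_SUBREG %zero, %cc, sub_8bit
+//   ...                                  ; former users of the MOVZX32rr8 use %res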
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-setcc"
+
+STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
+
+namespace {
+class X86FixupSetCCPass : public MachineFunctionPass {
+public:
+ X86FixupSetCCPass() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "X86 Fixup SetCC"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ // Find the preceding instruction that imp-defs eflags.
+ MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,
+ MachineBasicBlock::reverse_iterator MI);
+
+ // Return true if MI imp-uses eflags.
+ bool impUsesFlags(MachineInstr *MI);
+
+ // Return true if this is the opcode of a SetCC instruction with a register
+ // output.
+ bool isSetCCr(unsigned Opcode);
+
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+
+ enum { SearchBound = 16 };
+
+ static char ID;
+};
+
+char X86FixupSetCCPass::ID = 0;
+}
+
+FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
+
+bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86::SETOr:
+ case X86::SETNOr:
+ case X86::SETBr:
+ case X86::SETAEr:
+ case X86::SETEr:
+ case X86::SETNEr:
+ case X86::SETBEr:
+ case X86::SETAr:
+ case X86::SETSr:
+ case X86::SETNSr:
+ case X86::SETPr:
+ case X86::SETNPr:
+ case X86::SETLr:
+ case X86::SETGEr:
+ case X86::SETLEr:
+ case X86::SETGr:
+ return true;
+ }
+}
+
+// We expect the instruction *immediately* before the setcc to imp-def
+// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
+// scheduling, look backwards until we hit the beginning of the
+// basic-block, or a small bound (to avoid quadratic behavior).
+MachineInstr *
+X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
+ MachineBasicBlock::reverse_iterator MI) {
+ auto MBBStart = MBB->instr_rend();
+ for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
+ for (auto &Op : MI->implicit_operands())
+ if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))
+ return &*MI;
+
+ return nullptr;
+}
+
+bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
+ for (auto &Op : MI->implicit_operands())
+ if ((Op.getReg() == X86::EFLAGS) && (Op.isUse()))
+ return true;
+
+ return false;
+}
+
+bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+ SmallVector<MachineInstr*, 4> ToErase;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ // Find a setcc that is used by a zext.
+ // This doesn't have to be the only use, the transformation is safe
+ // regardless.
+ if (!isSetCCr(MI.getOpcode()))
+ continue;
+
+ MachineInstr *ZExt = nullptr;
+ for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+ if (Use.getOpcode() == X86::MOVZX32rr8)
+ ZExt = &Use;
+
+ if (!ZExt)
+ continue;
+
+ // Find the preceding instruction that imp-defs eflags.
+ MachineInstr *FlagsDefMI = findFlagsImpDef(
+ MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));
+ if (!FlagsDefMI)
+ continue;
+
+ // We'd like to put something that clobbers eflags directly before
+ // FlagsDefMI. This can't hurt anything after FlagsDefMI, because
+ // it, itself, by definition, clobbers eflags. But it may happen that
+ // FlagsDefMI also *uses* eflags, in which case the transformation is
+ // invalid.
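+ // For example (illustrative): if FlagsDefMI were an ADC32rr, it both reads
+ // and writes EFLAGS, and the zero idiom we insert before it (MOV32r0, which
+ // expands to an EFLAGS-clobbering XOR) would destroy the carry it consumes.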
+ if (impUsesFlags(FlagsDefMI))
+ continue;
+
+ ++NumSubstZexts;
+ Changed = true;
+
+ // On 32-bit, we need to be careful to force an ABCD register.
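+ // (Only EAX/EBX/ECX/EDX have an addressable low 8-bit subregister in
+ // 32-bit mode, which the INSERT_SUBREG of sub_8bit below relies on.)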
+ const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
+ ? &X86::GR32RegClass
+ : &X86::GR32_ABCDRegClass;
+ unsigned ZeroReg = MRI->createVirtualRegister(RC);
+ unsigned InsertReg = MRI->createVirtualRegister(RC);
+
+ // Initialize a register with 0. This must go before the eflags def
+ BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+ ZeroReg);
+
+ // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
+ // the setcc result into the low byte of the zeroed register.
+ BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+ TII->get(X86::INSERT_SUBREG), InsertReg)
+ .addReg(ZeroReg)
+ .addReg(MI.getOperand(0).getReg())
+ .addImm(X86::sub_8bit);
+ MRI->replaceRegWith(ZExt->getOperand(0).getReg(), InsertReg);
+ ToErase.push_back(ZExt);
+ }
+ }
+
+ for (auto &I : ToErase)
+ I->eraseFromParent();
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 97bb8ab653a6..55c1bff2bc18 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -76,6 +76,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override { return "X86 FP Stackifier"; }
private:
@@ -222,7 +227,8 @@ namespace {
++NumFXCH;
}
- void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ void duplicateToTop(unsigned RegNo, unsigned AsReg,
+ MachineBasicBlock::iterator I) {
DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
unsigned STReg = getSTReg(RegNo);
pushReg(AsReg); // New register on top of stack
@@ -257,6 +263,7 @@ namespace {
bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
void handleCall(MachineBasicBlock::iterator &I);
+ void handleReturn(MachineBasicBlock::iterator &I);
void handleZeroArgFP(MachineBasicBlock::iterator &I);
void handleOneArgFP(MachineBasicBlock::iterator &I);
void handleOneArgFPRW(MachineBasicBlock::iterator &I);
@@ -266,9 +273,9 @@ namespace {
void handleSpecialFP(MachineBasicBlock::iterator &I);
// Check if a COPY instruction is using FP registers.
- static bool isFPCopy(MachineInstr *MI) {
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ static bool isFPCopy(MachineInstr &MI) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
return X86::RFP80RegClass.contains(DstReg) ||
X86::RFP80RegClass.contains(SrcReg);
@@ -367,21 +374,21 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
setupBlockStack();
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
- MachineInstr *MI = I;
- uint64_t Flags = MI->getDesc().TSFlags;
+ MachineInstr &MI = *I;
+ uint64_t Flags = MI.getDesc().TSFlags;
unsigned FPInstClass = Flags & X86II::FPTypeMask;
- if (MI->isInlineAsm())
+ if (MI.isInlineAsm())
FPInstClass = X86II::SpecialFP;
- if (MI->isCopy() && isFPCopy(MI))
+ if (MI.isCopy() && isFPCopy(MI))
FPInstClass = X86II::SpecialFP;
- if (MI->isImplicitDef() &&
- X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+ if (MI.isImplicitDef() &&
+ X86::RFP80RegClass.contains(MI.getOperand(0).getReg()))
FPInstClass = X86II::SpecialFP;
- if (MI->isCall())
+ if (MI.isCall())
FPInstClass = X86II::SpecialFP;
if (FPInstClass == X86II::NotFP)
@@ -389,16 +396,16 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
MachineInstr *PrevMI = nullptr;
if (I != BB.begin())
- PrevMI = std::prev(I);
+ PrevMI = &*std::prev(I);
++NumFP; // Keep track of # of pseudo instrs
- DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+ DEBUG(dbgs() << "\nFPInst:\t" << MI);
// Get dead variables list now because the MI pointer may be deleted as part
// of processing!
SmallVector<unsigned, 8> DeadRegs;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDead())
DeadRegs.push_back(MO.getReg());
}
@@ -427,20 +434,22 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
}
// Print out all of the instructions expanded to if -debug
- DEBUG(
- MachineBasicBlock::iterator PrevI(PrevMI);
+ DEBUG({
+ MachineBasicBlock::iterator PrevI = PrevMI;
if (I == PrevI) {
dbgs() << "Just deleted pseudo instruction\n";
} else {
MachineBasicBlock::iterator Start = I;
// Rewind to first instruction newly inserted.
- while (Start != BB.begin() && std::prev(Start) != PrevI) --Start;
+ while (Start != BB.begin() && std::prev(Start) != PrevI)
+ --Start;
dbgs() << "Inserted instructions:\n\t";
Start->print(dbgs());
- while (++Start != std::next(I)) {}
+ while (++Start != std::next(I)) {
+ }
}
dumpStack();
- );
+ });
(void)PrevMI;
Changed = true;
@@ -779,8 +788,8 @@ static const TableEntry PopTable[] = {
/// instruction if it was modified in place.
///
void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
- MachineInstr* MI = I;
- DebugLoc dl = MI->getDebugLoc();
+ MachineInstr &MI = *I;
+ const DebugLoc &dl = MI.getDebugLoc();
ASSERT_SORTED(PopTable);
if (StackTop == 0)
report_fatal_error("Cannot pop empty stack!");
@@ -943,15 +952,102 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
pushReg(N - I - 1);
}
+/// If RET has an FP register use operand, pass the first one in ST(0) and
+/// the second one in ST(1).
+void FPS::handleReturn(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+ unsigned LiveMask = 0;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI.killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+ LiveMask |= (1 << getFPReg(Op));
+
+ // Remove the operand so that later passes don't see it.
+ MI.RemoveOperand(i);
+ --i;
+ --e;
+ }
+
+ // We may have been carrying spurious live-ins, so make sure only the
+ // returned registers are left live.
+ adjustLiveRegs(LiveMask, MI);
+ if (!LiveMask) return; // Quick check to see if any are possible.
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0) &&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = ScratchFPReg;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ /// Okay we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ /// in ST(1). In this case, emit an fxch.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ /// ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+}
+
/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
///
void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
- MachineInstr *MI = I;
- unsigned DestReg = getFPReg(MI->getOperand(0));
+ MachineInstr &MI = *I;
+ unsigned DestReg = getFPReg(MI.getOperand(0));
// Change from the pseudo instruction to the concrete instruction.
- MI->RemoveOperand(0); // Remove the explicit ST(0) operand
- MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+ MI.RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
// Result gets pushed on the stack.
pushReg(DestReg);
@@ -960,14 +1056,14 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
/// handleOneArgFP - fst <mem>, ST(0)
///
void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
- MachineInstr *MI = I;
- unsigned NumOps = MI->getDesc().getNumOperands();
+ MachineInstr &MI = *I;
+ unsigned NumOps = MI.getDesc().getNumOperands();
assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
"Can only handle fst* & ftst instructions!");
// Is this the last use of the source register?
- unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
- bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+ unsigned Reg = getFPReg(MI.getOperand(NumOps - 1));
+ bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
// FISTP64m is strange because there isn't a non-popping version.
// If we have one _and_ we don't want to pop the operand, duplicate the value
@@ -975,34 +1071,31 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
// always ok.
// Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
//
- if (!KillsSrc &&
- (MI->getOpcode() == X86::IST_Fp64m32 ||
- MI->getOpcode() == X86::ISTT_Fp16m32 ||
- MI->getOpcode() == X86::ISTT_Fp32m32 ||
- MI->getOpcode() == X86::ISTT_Fp64m32 ||
- MI->getOpcode() == X86::IST_Fp64m64 ||
- MI->getOpcode() == X86::ISTT_Fp16m64 ||
- MI->getOpcode() == X86::ISTT_Fp32m64 ||
- MI->getOpcode() == X86::ISTT_Fp64m64 ||
- MI->getOpcode() == X86::IST_Fp64m80 ||
- MI->getOpcode() == X86::ISTT_Fp16m80 ||
- MI->getOpcode() == X86::ISTT_Fp32m80 ||
- MI->getOpcode() == X86::ISTT_Fp64m80 ||
- MI->getOpcode() == X86::ST_FpP80m)) {
+ if (!KillsSrc && (MI.getOpcode() == X86::IST_Fp64m32 ||
+ MI.getOpcode() == X86::ISTT_Fp16m32 ||
+ MI.getOpcode() == X86::ISTT_Fp32m32 ||
+ MI.getOpcode() == X86::ISTT_Fp64m32 ||
+ MI.getOpcode() == X86::IST_Fp64m64 ||
+ MI.getOpcode() == X86::ISTT_Fp16m64 ||
+ MI.getOpcode() == X86::ISTT_Fp32m64 ||
+ MI.getOpcode() == X86::ISTT_Fp64m64 ||
+ MI.getOpcode() == X86::IST_Fp64m80 ||
+ MI.getOpcode() == X86::ISTT_Fp16m80 ||
+ MI.getOpcode() == X86::ISTT_Fp32m80 ||
+ MI.getOpcode() == X86::ISTT_Fp64m80 ||
+ MI.getOpcode() == X86::ST_FpP80m)) {
duplicateToTop(Reg, ScratchFPReg, I);
} else {
moveToTop(Reg, I); // Move to the top of the stack...
}
// Convert from the pseudo instruction to the concrete instruction.
- MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
- MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
-
- if (MI->getOpcode() == X86::IST_FP64m ||
- MI->getOpcode() == X86::ISTT_FP16m ||
- MI->getOpcode() == X86::ISTT_FP32m ||
- MI->getOpcode() == X86::ISTT_FP64m ||
- MI->getOpcode() == X86::ST_FP80m) {
+ MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+ if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m ||
+ MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m ||
+ MI.getOpcode() == X86::ST_FP80m) {
if (StackTop == 0)
report_fatal_error("Stack empty??");
--StackTop;
@@ -1021,15 +1114,15 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
/// R1 = fadd R2, [mem]
///
void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
#ifndef NDEBUG
- unsigned NumOps = MI->getDesc().getNumOperands();
+ unsigned NumOps = MI.getDesc().getNumOperands();
assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
#endif
// Is this the last use of the source register?
- unsigned Reg = getFPReg(MI->getOperand(1));
- bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+ unsigned Reg = getFPReg(MI.getOperand(1));
+ bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
if (KillsSrc) {
// If this is the last use of the source register, just make sure it's on
@@ -1038,17 +1131,17 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
if (StackTop == 0)
report_fatal_error("Stack cannot be empty!");
--StackTop;
- pushReg(getFPReg(MI->getOperand(0)));
+ pushReg(getFPReg(MI.getOperand(0)));
} else {
// If this is not the last use of the source register, _copy_ it to the top
// of the stack.
- duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ duplicateToTop(Reg, getFPReg(MI.getOperand(0)), I);
}
// Change from the pseudo instruction to the concrete instruction.
- MI->RemoveOperand(1); // Drop the source operand.
- MI->RemoveOperand(0); // Drop the destination operand.
- MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+ MI.RemoveOperand(1); // Drop the source operand.
+ MI.RemoveOperand(0); // Drop the destination operand.
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
}
@@ -1132,16 +1225,16 @@ static const TableEntry ReverseSTiTable[] = {
void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
- unsigned NumOperands = MI->getDesc().getNumOperands();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
- unsigned Dest = getFPReg(MI->getOperand(0));
- unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
- unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
- bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
- bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
- DebugLoc dl = MI->getDebugLoc();
+ unsigned Dest = getFPReg(MI.getOperand(0));
+ unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+ unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+ bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+ DebugLoc dl = MI.getDebugLoc();
unsigned TOS = getStackEntry(0);
@@ -1198,14 +1291,14 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
InstTable = ReverseSTiTable;
}
- int Opcode = Lookup(InstTable, MI->getOpcode());
+ int Opcode = Lookup(InstTable, MI.getOpcode());
assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
// NotTOS - The register which is not on the top of stack...
unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
// Replace the old instruction with a new instruction
- MBB->remove(I++);
+ MBB->remove(&*I++);
I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
// If both operands are killed, pop one off of the stack in addition to
@@ -1221,7 +1314,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
assert(UpdatedSlot < StackTop && Dest < 7);
Stack[UpdatedSlot] = Dest;
RegMap[Dest] = UpdatedSlot;
- MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+ MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction
}
/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
@@ -1230,23 +1323,23 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
- unsigned NumOperands = MI->getDesc().getNumOperands();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
- unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
- unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
- bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
- bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+ unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+ bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
// Make sure the first operand is on the top of stack, the other one can be
// anywhere.
moveToTop(Op0, I);
// Change from the pseudo instruction to the concrete instruction.
- MI->getOperand(0).setReg(getSTReg(Op1));
- MI->RemoveOperand(1);
- MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+ MI.getOperand(0).setReg(getSTReg(Op1));
+ MI.RemoveOperand(1);
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
// If any of the operands are killed by this instruction, free them.
if (KillsOp0) freeStackSlotAfter(I, Op0);
@@ -1258,21 +1351,21 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
/// instructions require that the first operand is at the top of the stack, but
/// otherwise don't modify the stack at all.
void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
- MachineInstr *MI = I;
+ MachineInstr &MI = *I;
- unsigned Op0 = getFPReg(MI->getOperand(0));
- unsigned Op1 = getFPReg(MI->getOperand(2));
- bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ unsigned Op0 = getFPReg(MI.getOperand(0));
+ unsigned Op1 = getFPReg(MI.getOperand(2));
+ bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
// The first operand *must* be on the top of the stack.
moveToTop(Op0, I);
// Change the second operand to the stack register that the operand is in.
// Change from the pseudo instruction to the concrete instruction.
- MI->RemoveOperand(0);
- MI->RemoveOperand(1);
- MI->getOperand(0).setReg(getSTReg(Op1));
- MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+ MI.RemoveOperand(0);
+ MI.RemoveOperand(1);
+ MI.getOperand(0).setReg(getSTReg(Op1));
+ MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
// If we kill the second operand, make sure to pop it from the stack.
if (Op0 != Op1 && KillsOp1) {
@@ -1287,20 +1380,25 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
/// instructions.
///
void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
- MachineInstr *MI = Inst;
+ MachineInstr &MI = *Inst;
- if (MI->isCall()) {
+ if (MI.isCall()) {
handleCall(Inst);
return;
}
- switch (MI->getOpcode()) {
+ if (MI.isReturn()) {
+ handleReturn(Inst);
+ return;
+ }
+
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unknown SpecialFP instruction!");
case TargetOpcode::COPY: {
// We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
- const MachineOperand &MO1 = MI->getOperand(1);
- const MachineOperand &MO0 = MI->getOperand(0);
- bool KillsSrc = MI->killsRegister(MO1.getReg());
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const MachineOperand &MO0 = MI.getOperand(0);
+ bool KillsSrc = MI.killsRegister(MO1.getReg());
// FP <- FP copy.
unsigned DstFP = getFPReg(MO0);
@@ -1322,9 +1420,9 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
case TargetOpcode::IMPLICIT_DEF: {
// All FP registers must be explicitly defined, so load a 0 instead.
- unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+ unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0));
+ BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
pushReg(Reg);
break;
}
@@ -1368,14 +1466,14 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
SmallSet<unsigned, 1> FRegIdx;
unsigned RCID;
- for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
- i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
- unsigned Flags = MI->getOperand(i).getImm();
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI.getNumOperands();
+ i != e && MI.getOperand(i).isImm(); i += 1 + NumOps) {
+ unsigned Flags = MI.getOperand(i).getImm();
NumOps = InlineAsm::getNumOperandRegisters(Flags);
if (NumOps != 1)
continue;
- const MachineOperand &MO = MI->getOperand(i + 1);
+ const MachineOperand &MO = MI.getOperand(i + 1);
if (!MO.isReg())
continue;
unsigned STReg = MO.getReg() - X86::FP0;
@@ -1408,24 +1506,24 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
}
if (STUses && !isMask_32(STUses))
- MI->emitError("fixed input regs must be last on the x87 stack");
+ MI.emitError("fixed input regs must be last on the x87 stack");
unsigned NumSTUses = countTrailingOnes(STUses);
// Defs must be contiguous from the stack top. ST0-STn.
if (STDefs && !isMask_32(STDefs)) {
- MI->emitError("output regs must be last on the x87 stack");
+ MI.emitError("output regs must be last on the x87 stack");
STDefs = NextPowerOf2(STDefs) - 1;
}
unsigned NumSTDefs = countTrailingOnes(STDefs);
// So must the clobbered stack slots. ST0-STm, m >= n.
if (STClobbers && !isMask_32(STDefs | STClobbers))
- MI->emitError("clobbers must be last on the x87 stack");
+ MI.emitError("clobbers must be last on the x87 stack");
// Popped inputs are the ones that are also clobbered or defined.
unsigned STPopped = STUses & (STDefs | STClobbers);
if (STPopped && !isMask_32(STPopped))
- MI->emitError("implicitly popped regs must be last on the x87 stack");
+ MI.emitError("implicitly popped regs must be last on the x87 stack");
unsigned NumSTPopped = countTrailingOnes(STPopped);
DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
@@ -1434,9 +1532,9 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
#ifndef NDEBUG
// If any input operand uses constraint "f", all output register
// constraints must be early-clobber defs.
- for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I)
if (FRegIdx.count(I)) {
- assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+ assert((1 << getFPReg(MI.getOperand(I)) & STDefs) == 0 &&
"Operands with constraint \"f\" cannot overlap with defs");
}
#endif
@@ -1444,8 +1542,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
// Collect all FP registers (register operands with constraints "t", "u",
// and "f") to kill after the instruction.
unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
unsigned FPReg = getFPReg(Op);
@@ -1470,8 +1568,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
DEBUG({dbgs() << "Before asm: "; dumpStack();});
// With the stack layout fixed, rewrite the FP registers.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
@@ -1508,94 +1606,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
// Don't delete the inline asm!
return;
}
-
- case X86::RETQ:
- case X86::RETL:
- case X86::RETIL:
- case X86::RETIQ:
- // If RET has an FP register use operand, pass the first one in ST(0) and
- // the second one in ST(1).
-
- // Find the register operands.
- unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
- unsigned LiveMask = 0;
-
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI->getOperand(i);
- if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
- continue;
- // FP Register uses must be kills unless there are two uses of the same
- // register, in which case only one will be a kill.
- assert(Op.isUse() &&
- (Op.isKill() || // Marked kill.
- getFPReg(Op) == FirstFPRegOp || // Second instance.
- MI->killsRegister(Op.getReg())) && // Later use is marked kill.
- "Ret only defs operands, and values aren't live beyond it");
-
- if (FirstFPRegOp == ~0U)
- FirstFPRegOp = getFPReg(Op);
- else {
- assert(SecondFPRegOp == ~0U && "More than two fp operands!");
- SecondFPRegOp = getFPReg(Op);
- }
- LiveMask |= (1 << getFPReg(Op));
-
- // Remove the operand so that later passes don't see it.
- MI->RemoveOperand(i);
- --i, --e;
- }
-
- // We may have been carrying spurious live-ins, so make sure only the returned
- // registers are left live.
- adjustLiveRegs(LiveMask, MI);
- if (!LiveMask) return; // Quick check to see if any are possible.
-
- // There are only four possibilities here:
- // 1) we are returning a single FP value. In this case, it has to be in
- // ST(0) already, so just declare success by removing the value from the
- // FP Stack.
- if (SecondFPRegOp == ~0U) {
- // Assert that the top of stack contains the right FP register.
- assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
- "Top of stack not the right register for RET!");
-
- // Ok, everything is good, mark the value as not being on the stack
- // anymore so that our assertion about the stack being empty at end of
- // block doesn't fire.
- StackTop = 0;
- return;
- }
-
- // Otherwise, we are returning two values:
- // 2) If returning the same value for both, we only have one thing in the FP
- // stack. Consider: RET FP1, FP1
- if (StackTop == 1) {
- assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
- "Stack misconfiguration for RET!");
-
- // Duplicate the TOS so that we return it twice. Just pick some other FPx
- // register to hold it.
- unsigned NewReg = ScratchFPReg;
- duplicateToTop(FirstFPRegOp, NewReg, MI);
- FirstFPRegOp = NewReg;
- }
-
- /// Okay we know we have two different FPx operands now:
- assert(StackTop == 2 && "Must have two values live!");
-
- /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
- /// in ST(1). In this case, emit an fxch.
- if (getStackEntry(0) == SecondFPRegOp) {
- assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
- moveToTop(FirstFPRegOp, MI);
- }
-
- /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
- /// ST(1). Just remove both from our understanding of the stack and return.
- assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
- assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
- StackTop = 0;
- return;
}
Inst = MBB->erase(Inst); // Remove the pseudo instruction
@@ -1614,7 +1624,7 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const {
MBB.getParent()->getSubtarget().getRegisterInfo();
LivePhysRegs LPR(TRI);
- LPR.addLiveOuts(&MBB);
+ LPR.addLiveOuts(MBB);
for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
I != E; ++I) {
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index f5ffe0cf7e88..03d925692adf 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -159,6 +159,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
unsigned Opc = MBBI->getOpcode();
switch (Opc) {
default: return 0;
+ case TargetOpcode::PATCHABLE_RET:
+ case X86::RET:
case X86::RETL:
case X86::RETQ:
case X86::RETIL:
@@ -314,8 +316,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
}
MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
- int64_t Offset, bool InEpilogue) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
assert(Offset != 0 && "zero offset stack adjustment requested");
// On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
@@ -374,16 +376,33 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
unsigned Opc = PI->getOpcode();
int Offset = 0;
+ if (!doMergeWithPrevious && NI != MBB.end() &&
+ NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
+ // Don't merge with the next instruction if it has CFI.
+ return Offset;
+ }
+
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
- Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr){
+ assert(PI->getOperand(1).getReg() == StackPtr);
Offset += PI->getOperand(2).getImm();
MBB.erase(PI);
if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+ PI->getOperand(0).getReg() == StackPtr &&
+ PI->getOperand(1).getReg() == StackPtr &&
+ PI->getOperand(2).getImm() == 1 &&
+ PI->getOperand(3).getReg() == X86::NoRegister &&
+ PI->getOperand(5).getReg() == X86::NoRegister) {
+ // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
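+ // For example (illustrative): "lea 16(%esp), %esp" contributes +16 to the
+ // merged stack adjustment.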
+ Offset += PI->getOperand(4).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
+ assert(PI->getOperand(1).getReg() == StackPtr);
Offset -= PI->getOperand(2).getImm();
MBB.erase(PI);
if (!doMergeWithPrevious) MBBI = NI;
@@ -393,18 +412,18 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
}
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- MCCFIInstruction CFIInst) const {
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ const MCCFIInstruction &CFIInst) const {
MachineFunction &MF = *MBB.getParent();
unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst);
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
-void
-X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const {
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -429,7 +448,7 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL,
+ const DebugLoc &DL,
bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
if (STI.isTargetWindowsCoreCLR()) {
@@ -457,6 +476,8 @@ void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
}
if (ChkStkStub != nullptr) {
+ assert(!ChkStkStub->isBundled() &&
+ "Not expecting bundled instructions here");
MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
assert(std::prev(MBBI).operator==(ChkStkStub) &&
"MBBI expected after __chkstk_stub.");
@@ -467,8 +488,8 @@ void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
}
MachineInstr *X86FrameLowering::emitStackProbeInline(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
assert(STI.is64Bit() && "different expansion needed for 32 bit");
assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
@@ -679,12 +700,12 @@ MachineInstr *X86FrameLowering::emitStackProbeInline(
// Possible TODO: physreg liveness for InProlog case.
- return ContinueMBBI;
+ return &*ContinueMBBI;
}
MachineInstr *X86FrameLowering::emitStackProbeCall(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
unsigned CallOp;
@@ -743,19 +764,19 @@ MachineInstr *X86FrameLowering::emitStackProbeCall(
ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
}
- return MBBI;
+ return &*MBBI;
}
MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
assert(InProlog && "ChkStkStub called outside prolog!");
BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
.addExternalSymbol("__chkstk_stub");
- return MBBI;
+ return &*MBBI;
}
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -786,7 +807,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con
void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, unsigned Reg,
+ const DebugLoc &DL, unsigned Reg,
uint64_t MaxAlign) const {
uint64_t Val = -MaxAlign;
unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
@@ -950,6 +971,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
+ X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
MFI->setStackSize(StackSize);
}
@@ -1009,7 +1031,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Callee-saved registers are pushed on stack before the stack is realigned.
if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
- NumBytes = RoundUpToAlignment(NumBytes, MaxAlign);
+ NumBytes = alignTo(NumBytes, MaxAlign);
// Get the offset of the stack slot for the EBP register, which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -1130,7 +1152,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// virtual memory manager are allocated in correct sequence.
uint64_t AlignedNumBytes = NumBytes;
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
- AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
+ AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
// Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB);
@@ -1260,7 +1282,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
- const MachineInstr *FrameInstr = &*MBBI;
+ const MachineInstr &FrameInstr = *MBBI;
++MBBI;
if (NeedsWinCFI) {
@@ -1360,6 +1382,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (PushedRegs)
emitCalleeSavedFrameMoves(MBB, MBBI, DL);
}
+
+ // An x86 interrupt handling function cannot assume anything about the
+ // direction flag (DF in the EFLAGS register). Clear this flag by emitting a
+ // "cld" instruction in the prologue of each interrupt handler function.
+ //
+ // FIXME: Emit the "cld" instruction only in these cases:
+ // 1. The interrupt handler uses any of the "rep" instructions.
+ // 2. The interrupt handler calls another function.
+ //
+ if (Fn->getCallingConv() == CallingConv::X86_INTR)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+ .setMIFlag(MachineInstr::FrameSetup);
}
bool X86FrameLowering::canUseLEAForSPInEpilogue(
@@ -1373,8 +1407,8 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue(
return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
}
-static bool isFuncletReturnInstr(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case X86::CATCHRET:
case X86::CLEANUPRET:
return true;
@@ -1400,11 +1434,10 @@ static bool isFuncletReturnInstr(MachineInstr *MI) {
unsigned
X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
- // getFrameIndexReferenceFromSP has an out ref parameter for the stack
- // pointer register; pass a dummy that we ignore
unsigned SPReg;
- int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg);
- assert(Offset >= 0);
+ int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
+ /*IgnoreSPUpdates*/ true);
+ assert(Offset >= 0 && SPReg == TRI->getStackRegister());
return static_cast<unsigned>(Offset);
}
@@ -1429,18 +1462,25 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
// RBP is not included in the callee saved register block. After pushing RBP,
// everything is 16 byte aligned. Everything we allocate before an outgoing
// call must also be 16 byte aligned.
- unsigned FrameSizeMinusRBP =
- RoundUpToAlignment(CSSize + UsedSize, getStackAlignment());
+ unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
return FrameSizeMinusRBP - CSSize;
}
+static bool isTailCallOpcode(unsigned Opc) {
+ return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+ Opc == X86::TCRETURNmi ||
+ Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 ||
+ Opc == X86::TCRETURNmi64;
+}
+
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ unsigned RetOpcode = MBBI->getOpcode();
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
@@ -1453,7 +1493,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWinCFI =
IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
- bool IsFunclet = isFuncletReturnInstr(MBBI);
+ bool IsFunclet = isFuncletReturnInstr(*MBBI);
MachineBasicBlock *TargetMBB = nullptr;
// Get the number of bytes to allocate from the FrameInfo.
@@ -1490,7 +1530,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Callee-saved registers were pushed on stack before the stack was
// realigned.
if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
- NumBytes = RoundUpToAlignment(FrameSize, MaxAlign);
+ NumBytes = alignTo(FrameSize, MaxAlign);
// Pop EBP.
BuildMI(MBB, MBBI, DL,
@@ -1589,15 +1629,17 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWinCFI)
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
- // Add the return addr area delta back since we are not tail calling.
- int Offset = -1 * X86FI->getTCReturnAddrDelta();
- assert(Offset >= 0 && "TCDelta should never be positive");
- if (Offset) {
- MBBI = MBB.getFirstTerminator();
+ if (!isTailCallOpcode(RetOpcode)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int Offset = -1 * X86FI->getTCReturnAddrDelta();
+ assert(Offset >= 0 && "TCDelta should never be positive");
+ if (Offset) {
+ MBBI = MBB.getFirstTerminator();
- // Check for possible merge with preceding ADD instruction.
- Offset += mergeSPUpdates(MBB, MBBI, true);
- emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, true);
+ emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ }
}
}
@@ -1689,58 +1731,61 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
-// Simplified from getFrameIndexReference keeping only StackPointer cases
-int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
+int
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ bool IgnoreSPUpdates) const {
+
const MachineFrameInfo *MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
const uint64_t StackSize = MFI->getStackSize();
- {
-#ifndef NDEBUG
- // LLVM arranges the stack as follows:
- // ...
- // ARG2
- // ARG1
- // RETADDR
- // PUSH RBP <-- RBP points here
- // PUSH CSRs
- // ~~~~~~~ <-- possible stack realignment (non-win64)
- // ...
- // STACK OBJECTS
- // ... <-- RSP after prologue points here
- // ~~~~~~~ <-- possible stack realignment (win64)
- //
- // if (hasVarSizedObjects()):
- // ... <-- "base pointer" (ESI/RBX) points here
- // DYNAMIC ALLOCAS
- // ... <-- RSP points here
- //
- // Case 1: In the simple case of no stack realignment and no dynamic
- // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
- // with fixed offsets from RSP.
- //
- // Case 2: In the case of stack realignment with no dynamic allocas, fixed
- // stack objects are addressed with RBP and regular stack objects with RSP.
- //
- // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
- // to address stack arguments for outgoing calls and nothing else. The "base
- // pointer" points to local variables, and RBP points to fixed objects.
- //
- // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
- // answer we give is relative to the SP after the prologue, and not the
- // SP in the middle of the function.
-
- assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) ||
- STI.isTargetWin64()) &&
- "offset from fixed object to SP is not static");
-
- // We don't handle tail calls, and shouldn't be seeing them either.
- int TailCallReturnAddrDelta =
- MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
- assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
-#endif
- }
+ // LLVM arranges the stack as follows:
+ // ...
+ // ARG2
+ // ARG1
+ // RETADDR
+ // PUSH RBP <-- RBP points here
+ // PUSH CSRs
+ // ~~~~~~~ <-- possible stack realignment (non-win64)
+ // ...
+ // STACK OBJECTS
+ // ... <-- RSP after prologue points here
+ // ~~~~~~~ <-- possible stack realignment (win64)
+ //
+ // if (hasVarSizedObjects()):
+ // ... <-- "base pointer" (ESI/RBX) points here
+ // DYNAMIC ALLOCAS
+ // ... <-- RSP points here
+ //
+ // Case 1: In the simple case of no stack realignment and no dynamic
+ // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
+ // with fixed offsets from RSP.
+ //
+ // Case 2: In the case of stack realignment with no dynamic allocas, fixed
+ // stack objects are addressed with RBP and regular stack objects with RSP.
+ //
+ // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
+ // to address stack arguments for outgoing calls and nothing else. The "base
+ // pointer" points to local variables, and RBP points to fixed objects.
+ //
+ // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
+ // answer we give is relative to the SP after the prologue, and not the
+ // SP in the middle of the function.
+
+ if (MFI->isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
+ !STI.isTargetWin64())
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+  // If !hasReservedCallFrame, the function might have SP adjustments in the
+  // body. So, even though the offset is statically known, it depends on where
+  // we are in the function.
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF))
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ // We don't handle tail calls, and shouldn't be seeing them either.
+ assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
+ "we don't handle this case!");
// Fill in FrameReg output argument.
FrameReg = TRI->getStackRegister();
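
  [Illustrative aside, not part of the patch] The cases described in the long comment above reduce to two early-outs before the SP-relative answer is computed. A compact decision sketch; canAnswerFromSP is a made-up helper, and its booleans correspond one-to-one to the checks in the hunk (isFixedObjectIndex, needsStackRealignment, isTargetWin64, hasReservedCallFrame, IgnoreSPUpdates).

    static bool canAnswerFromSP(bool IsFixedObject, bool NeedsRealign, bool IsWin64,
                                bool HasReservedCallFrame, bool IgnoreSPUpdates) {
      // Cases 2 and 3: once the stack is realigned (except on Win64), fixed
      // objects are only addressable off RBP, so defer to the FP-based path.
      if (IsFixedObject && NeedsRealign && !IsWin64)
        return false;
      // Without a reserved call frame, SP moves inside the body, so the offset
      // is only meaningful when the caller explicitly ignores those updates.
      if (!IgnoreSPUpdates && !HasReservedCallFrame)
        return false;
      return true; // the offset from the post-prologue SP is static
    }
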
@@ -1851,16 +1896,37 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
return true;
// Push GPRs. It increases frame size.
+ const MachineFunction &MF = *MBB.getParent();
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(Reg);
- BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool isLiveIn = MRI.isLiveIn(Reg);
+ if (!isLiveIn)
+ MBB.addLiveIn(Reg);
+
+ // Decide whether we can add a kill flag to the use.
+ bool CanKill = !isLiveIn;
+ // Check if any subregister is live-in
+ if (CanKill) {
+ for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
+ if (MRI.isLiveIn(*AReg)) {
+ CanKill = false;
+ break;
+ }
+ }
+ }
+
+ // Do not set a kill flag on values that are also marked as live-in. This
+    // happens with the @llvm.returnaddress intrinsic and with arguments
+ // passed in callee saved registers.
+ // Omitting the kill flags is conservatively correct even if the live-in
+ // is not used after all.
+ BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
.setMIFlag(MachineInstr::FrameSetup);
}
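
  [Illustrative aside, not part of the patch] The kill-flag logic above has one subtlety worth isolating: the push may only mark the register killed if neither it nor any aliasing (sub- or super-) register is a function live-in. A self-contained sketch of that predicate; canKillOnSpill is a hypothetical helper that mirrors the inline loop, using headers as they exist in this tree.

    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    static bool canKillOnSpill(const MachineRegisterInfo &MRI,
                               const TargetRegisterInfo &TRI, unsigned Reg) {
      // Walk Reg and every register aliasing it (e.g. EAX/AX/AL when pushing RAX).
      for (MCRegAliasIterator AI(Reg, &TRI, /*IncludeSelf=*/true); AI.isValid(); ++AI)
        if (MRI.isLiveIn(*AI))
          return false; // a live-in value must stay available; omit the kill flag
      return true;
    }
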
@@ -1891,7 +1957,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (CSI.empty())
return false;
- if (isFuncletReturnInstr(MI) && STI.isOSWindows()) {
+ if (isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
// Don't restore CSRs in 32-bit EH funclets. Matches
// spillCalleeSavedRegisters.
if (STI.is32Bit())
@@ -2250,11 +2316,33 @@ void X86FrameLowering::adjustForSegmentedStacks(
checkMBB->addSuccessor(allocMBB);
checkMBB->addSuccessor(&PrologueMBB);
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
+/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
+/// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
+/// to fields it needs, through a named metadata node "hipe.literals" containing
+/// name-value pairs.
+static unsigned getHiPELiteral(
+ NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
+ for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
+ MDNode *Node = HiPELiteralsMD->getOperand(i);
+ if (Node->getNumOperands() != 2) continue;
+ MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
+ ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
+ if (!NodeName || !NodeVal) continue;
+ ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
+ if (ValConst && NodeName->getString() == LiteralName) {
+ return ValConst->getZExtValue();
+ }
+ }
+
+ report_fatal_error("HiPE literal " + LiteralName
+ + " required but not provided");
+}
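
  [Illustrative aside, not part of the patch] getHiPELiteral() assumes the Erlang front end attaches each runtime parameter as a two-operand tuple under !hipe.literals. A sketch of the expected module metadata (values are examples, echoing the constants the patch removes below: 0x90 and 24 leaf words) and of the lookups the prologue performs later.

    // Expected IR shape (shown as comments; example values only):
    //   !hipe.literals = !{!0, !1, !2}
    //   !0 = !{!"P_NSP_LIMIT",      i32 144}   ; 0x90
    //   !1 = !{!"AMD64_LEAF_WORDS", i32 24}
    //   !2 = !{!"X86_LEAF_WORDS",   i32 24}
    NamedMDNode *MD = M.getNamedMetadata("hipe.literals");        // M: the llvm::Module
    unsigned SPLimitOffset = getHiPELiteral(MD, "P_NSP_LIMIT");   // -> 144 in this example
    unsigned LeafWords = getHiPELiteral(MD, "AMD64_LEAF_WORDS");  // -> 24
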
+
/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of hybrid stack/heap architecture.
@@ -2280,7 +2368,14 @@ void X86FrameLowering::adjustForHiPEPrologue(
assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
// HiPE-specific values
- const unsigned HipeLeafWords = 24;
+ NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
+ ->getNamedMetadata("hipe.literals");
+ if (!HiPELiteralsMD)
+ report_fatal_error(
+ "Can't generate HiPE prologue without runtime parameters");
+ const unsigned HipeLeafWords
+ = getHiPELiteral(HiPELiteralsMD,
+ Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
const unsigned Guaranteed = HipeLeafWords * SlotSize;
unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
@@ -2300,15 +2395,13 @@ void X86FrameLowering::adjustForHiPEPrologue(
if (MFI->hasCalls()) {
unsigned MoreStackForCalls = 0;
- for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
- MBBI != MBBE; ++MBBI)
- for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
- MI != ME; ++MI) {
- if (!MI->isCall())
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (!MI.isCall())
continue;
// Get callee operand.
- const MachineOperand &MO = MI->getOperand(0);
+ const MachineOperand &MO = MI.getOperand(0);
// Only take account of global function calls (no closures etc.).
if (!MO.isGlobal())
@@ -2334,6 +2427,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
MoreStackForCalls = std::max(MoreStackForCalls,
(HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
}
+ }
MaxStack += MoreStackForCalls;
}
@@ -2353,20 +2447,19 @@ void X86FrameLowering::adjustForHiPEPrologue(
unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
unsigned LEAop, CMPop, CALLop;
+ SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
if (Is64Bit) {
SPReg = X86::RSP;
PReg = X86::RBP;
LEAop = X86::LEA64r;
CMPop = X86::CMP64rm;
CALLop = X86::CALL64pcrel32;
- SPLimitOffset = 0x90;
} else {
SPReg = X86::ESP;
PReg = X86::EBP;
LEAop = X86::LEA32r;
CMPop = X86::CMP32rm;
CALLop = X86::CALLpcrel32;
- SPLimitOffset = 0x4c;
}
ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
@@ -2395,13 +2488,15 @@ void X86FrameLowering::adjustForHiPEPrologue(
incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
incStackMBB->addSuccessor(incStackMBB, {1, 100});
}
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
MF.verify();
#endif
}
bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const {
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ int Offset) const {
if (Offset <= 0)
return false;
@@ -2440,7 +2535,8 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
bool IsDef = false;
for (const MachineOperand &MO : Prev->implicit_operands()) {
- if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) {
+ if (MO.isReg() && MO.isDef() &&
+ TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
IsDef = true;
break;
}
@@ -2468,7 +2564,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
return true;
}
-void X86FrameLowering::
+MachineBasicBlock::iterator X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
bool reserveCallFrame = hasReservedCallFrame(MF);
@@ -2488,7 +2584,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
unsigned StackAlign = getStackAlignment();
- Amount = RoundUpToAlignment(Amount, StackAlign);
+ Amount = alignTo(Amount, StackAlign);
MachineModuleInfo &MMI = MF.getMMI();
const Function *Fn = MF.getFunction();
@@ -2512,7 +2608,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
if (Amount == 0)
- return;
+ return I;
// Factor out the amount that gets handled inside the sequence
// (Pushes of argument for frame setup, callee pops for frame destroy)
@@ -2525,13 +2621,23 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
BuildCFI(MBB, I, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
- if (Amount) {
- // Add Amount to SP to destroy a frame, and subtract to setup.
- int Offset = isDestroy ? Amount : -Amount;
-
- if (!(Fn->optForMinSize() &&
- adjustStackWithPops(MBB, I, DL, Offset)))
- BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false);
+ // Add Amount to SP to destroy a frame, or subtract to setup.
+ int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+ int64_t CfaAdjustment = -StackAdjustment;
+
+ if (StackAdjustment) {
+    // Merge with any previous or following adjustment instruction. Note: the
+    // instructions merged here carry no CFI of their own, so their stack
+    // adjustments do not feed into CfaAdjustment.
+ StackAdjustment += mergeSPUpdates(MBB, I, true);
+ StackAdjustment += mergeSPUpdates(MBB, I, false);
+
+ if (StackAdjustment) {
+ if (!(Fn->optForMinSize() &&
+ adjustStackWithPops(MBB, I, DL, StackAdjustment)))
+ BuildStackAdjustment(MBB, I, DL, StackAdjustment,
+ /*InEpilogue=*/false);
+ }
}
if (DwarfCFI && !hasFP(MF)) {
@@ -2541,18 +2647,16 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// CFI only for EH purposes or for debugging. EH only requires the CFA
// offset to be correct at each call site, while for debugging we want
// it to be more precise.
- int CFAOffset = Amount;
+
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
-
- if (CFAOffset) {
- CFAOffset = isDestroy ? -CFAOffset : CFAOffset;
- BuildCFI(MBB, I, DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset));
+ if (CfaAdjustment) {
+ BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
+ nullptr, CfaAdjustment));
}
}
- return;
+ return I;
}
if (isDestroy && InternalAmt) {
@@ -2562,11 +2666,20 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We are not tracking the stack pointer adjustment by the callee, so make
// sure we restore the stack pointer immediately after the call, there may
// be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator CI = I;
MachineBasicBlock::iterator B = MBB.begin();
- while (I != B && !std::prev(I)->isCall())
- --I;
- BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false);
+ while (CI != B && !std::prev(CI)->isCall())
+ --CI;
+ BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
}
+
+ return I;
+}
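
  [Illustrative aside, not part of the patch] The rewritten adjustment path above folds neighbouring SP updates before deciding how to materialize the change. A worked example under an assumed instruction stream:

    // Destroying a 16-byte call frame next to an adjacent, CFI-less SP update:
    //   ADJCALLSTACKUP 16, 0        ; isDestroy, Amount = 16
    //   addl $8, %esp               ; following update, absorbed by mergeSPUpdates
    // StackAdjustment = 16 + 8 = 24 -> one "addl $24, %esp" (or POPs under
    // optForMinSize), while CfaAdjustment stays at -16, because it was computed
    // before the merge and the merged add never carried CFI of its own.
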
+
+bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+ assert(MBB.getParent() && "Block is not attached to a function!");
+ const MachineFunction &MF = *MBB.getParent();
+ return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
}
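
  [Illustrative aside, not part of the patch] The new canUseAsPrologue() hook rejects blocks with EFLAGS live-in only when the frame needs realignment. The reason is that realignment is emitted as an AND on the stack pointer, which defines (and so clobbers) EFLAGS; the snippet below sketches how BuildStackAlignAND typically materializes that, assuming MBB, MBBI, DL, TII and MaxAlign are in scope.

    // Align RSP down to a MaxAlign boundary in the prologue (x86-64 case):
    BuildMI(MBB, MBBI, DL, TII.get(X86::AND64ri32), X86::RSP)
        .addReg(X86::RSP)
        .addImm(-int64_t(MaxAlign));   // implicitly defines EFLAGS
    // Any EFLAGS value live into the candidate prologue block would be
    // destroyed here, so such a block cannot host the prologue.
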
bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
@@ -2604,7 +2717,7 @@ bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, bool RestoreSP) const {
+ const DebugLoc &DL, bool RestoreSP) const {
assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
assert(STI.is32Bit() && !Uses64BitFramePtr &&
@@ -2664,6 +2777,150 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
return MBBI;
}
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct X86FrameSortingObject {
+ bool IsValid = false; // true if we care about this Object.
+ unsigned ObjectIndex = 0; // Index of Object into MFI list.
+ unsigned ObjectSize = 0; // Size of Object in bytes.
+ unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+ unsigned ObjectNumUses = 0; // Object static number of uses.
+};
+
+// The comparison function we use for std::stable_sort to order our local
+// stack symbols. The current algorithm is to use an estimated
+// "density". This takes into consideration the size and number of
+// uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list.
+struct X86FrameSortingComparator {
+ inline bool operator()(const X86FrameSortingObject &A,
+ const X86FrameSortingObject &B) {
+ uint64_t DensityAScaled, DensityBScaled;
+
+ // For consistency in our comparison, all invalid objects are placed
+ // at the end. This also allows us to stop walking when we hit the
+ // first invalid item after it's all sorted.
+ if (!A.IsValid)
+ return false;
+ if (!B.IsValid)
+ return true;
+
+ // The density is calculated by doing :
+ // (double)DensityA = A.ObjectNumUses / A.ObjectSize
+ // (double)DensityB = B.ObjectNumUses / B.ObjectSize
+ // Since this approach may cause inconsistencies in
+ // the floating point <, >, == comparisons, depending on the floating
+ // point model with which the compiler was built, we're going
+ // to scale both sides by multiplying with
+ // A.ObjectSize * B.ObjectSize. This ends up factoring away
+ // the division and, with it, the need for any floating point
+ // arithmetic.
+ DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+ static_cast<uint64_t>(B.ObjectSize);
+ DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+ static_cast<uint64_t>(A.ObjectSize);
+
+ // If the two densities are equal, prioritize highest alignment
+ // objects. This allows for similar alignment objects
+ // to be packed together (given the same density).
+ // There's room for improvement here, also, since we can pack
+ // similar alignment (different density) objects next to each
+ // other to save padding. This will also require further
+ // complexity/iterations, and the overall gain isn't worth it,
+ // in general. Something to keep in mind, though.
+ if (DensityAScaled == DensityBScaled)
+ return A.ObjectAlignment < B.ObjectAlignment;
+
+ return DensityAScaled < DensityBScaled;
+ }
+};
+} // namespace
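
  [Illustrative aside, not part of the patch] The cross-multiplication in the comparator avoids floating-point density entirely. A short worked example with made-up objects, showing that the scaled comparison orders exactly as uses/size would:

    // A: 5 uses, 16 bytes  -> density 5/16 = 0.3125
    // B: 1 use,   4 bytes  -> density 1/4  = 0.25
    // DensityAScaled = A.ObjectNumUses * B.ObjectSize = 5 * 4  = 20
    // DensityBScaled = B.ObjectNumUses * A.ObjectSize = 1 * 16 = 16
    // operator()(A, B) returns 20 < 16, i.e. false, so A sorts after B: the
    // denser object ends up later in the list, matching the floating-point
    // order without any division.
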
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+void X86FrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Don't waste time if there's nothing to do.
+ if (ObjectsToAllocate.empty())
+ return;
+
+ // Create an array of all MFI objects. We won't need all of these
+ // objects, but we're going to create a full array of them to make
+ // it easier to index into when we're counting "uses" down below.
+ // We want to be able to easily/cheaply access an object by simply
+ // indexing into it, instead of having to search for it every time.
+ std::vector<X86FrameSortingObject> SortingObjects(MFI->getObjectIndexEnd());
+
+ // Walk the objects we care about and mark them as such in our working
+ // struct.
+ for (auto &Obj : ObjectsToAllocate) {
+ SortingObjects[Obj].IsValid = true;
+ SortingObjects[Obj].ObjectIndex = Obj;
+ SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj);
+ // Set the size.
+ int ObjectSize = MFI->getObjectSize(Obj);
+ if (ObjectSize == 0)
+ // Variable size. Just use 4.
+ SortingObjects[Obj].ObjectSize = 4;
+ else
+ SortingObjects[Obj].ObjectSize = ObjectSize;
+ }
+
+ // Count the number of uses for each object.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ // Check to see if it's a local stack symbol.
+ if (!MO.isFI())
+ continue;
+ int Index = MO.getIndex();
+ // Check to see if it falls within our range, and is tagged
+ // to require ordering.
+ if (Index >= 0 && Index < MFI->getObjectIndexEnd() &&
+ SortingObjects[Index].IsValid)
+ SortingObjects[Index].ObjectNumUses++;
+ }
+ }
+ }
+
+  // Sort the objects using X86FrameSortingComparator (see its comment for
+  // details on the ordering).
+ std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
+ X86FrameSortingComparator());
+
+ // Now modify the original list to represent the final order that
+ // we want. The order will depend on whether we're going to access them
+ // from the stack pointer or the frame pointer. For SP, the list should
+  // end with the objects we want placed at the smaller offsets.
+  // For FP, the order should be flipped.
+ int i = 0;
+ for (auto &Obj : SortingObjects) {
+ // All invalid items are sorted at the end, so it's safe to stop.
+ if (!Obj.IsValid)
+ break;
+ ObjectsToAllocate[i++] = Obj.ObjectIndex;
+ }
+
+ // Flip it if we're accessing off of the FP.
+ if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+ std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
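
  [Illustrative aside, not part of the patch] Putting the comparator and the final reversal together, a small invented allocation shows what the ordering means for offsets:

    // After the stable sort, least-dense objects come first: [C, B, A]
    //   SP-relative frame: A (the densest) lands nearest 0(%rsp), so the most
    //     frequently referenced object gets the shortest displacement encodings.
    //   FP-relative frame (no realignment): locals sit at negative RBP offsets,
    //     so the list is reversed to keep A closest to the frame pointer instead.
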
+
+
unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
// RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
unsigned Offset = 16;
@@ -2691,14 +2948,30 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// were no fixed objects, use offset -SlotSize, which is immediately after the
// return address. Fixed objects have negative frame indices.
MachineFrameInfo *MFI = MF.getFrameInfo();
+ WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
int64_t MinFixedObjOffset = -SlotSize;
for (int I = MFI->getObjectIndexBegin(); I < 0; ++I)
MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I));
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ int FrameIndex = H.CatchObj.FrameIndex;
+ if (FrameIndex != INT_MAX) {
+ // Ensure alignment.
+ unsigned Align = MFI->getObjectAlignment(FrameIndex);
+ MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
+ MinFixedObjOffset -= MFI->getObjectSize(FrameIndex);
+ MFI->setObjectOffset(FrameIndex, MinFixedObjOffset);
+ }
+ }
+ }
+
+ // Ensure alignment.
+ MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
int UnwindHelpFI =
MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
- MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI;
+ EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// Store -2 into UnwindHelp on function entry. We have to scan forwards past
// other frame setup instructions.
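
  [Illustrative aside, not part of the patch] The catch-object placement above works with negative, downward-growing fixed offsets, so the alignment step subtracts the remainder rather than adding it. A worked instance with assumed numbers:

    // Suppose MinFixedObjOffset = -20 and a catch object needs Align = 8, Size = 16:
    //   MinFixedObjOffset -= std::abs(-20) % 8;   // -20 - 4  = -24 (now 8-aligned)
    //   MinFixedObjOffset -= 16;                  // -24 - 16 = -40 (catch object offset)
    // UnwindHelp is then carved out one slot below the lowest fixed object:
    //   UnwindHelpOffset = MinFixedObjOffset - SlotSize;   // -40 - 8 = -48 on x86-64
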
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 3ab41b4a5789..4a01014ee545 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -52,8 +52,8 @@ public:
/// the number of bytes to probe in RAX/EAX. Returns instruction just
/// after the expansion.
MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- bool InProlog) const;
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
/// Replace a StackProbe inline-stub with the actual probe code inline.
void inlineStackProbe(MachineFunction &MF,
@@ -61,7 +61,7 @@ public:
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const;
+ const DebugLoc &DL) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -100,12 +100,13 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ unsigned &FrameReg,
+ bool IgnoreSPUpdates) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
@@ -127,6 +128,16 @@ public:
/// Check that LEA can be used on SP in an epilogue sequence for \p MF.
bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+ /// Check whether or not the given \p MBB can be used as a prologue
+ /// for the target.
+ /// The prologue will be inserted first in this basic block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+  /// As soon as the target enables shrink-wrapping without overriding
+ /// this method, we assume that each basic block is a valid
+ /// prologue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
  /// Check whether or not the given \p MBB can be used as an epilogue
/// for the target.
/// The epilogue will be inserted before the first terminator of that block.
@@ -137,6 +148,13 @@ public:
/// Returns true if the target will correctly handle shrink wrapping.
bool enableShrinkWrapping(const MachineFunction &MF) const override;
+ /// Order the symbols in the local stack.
+ /// We want to place the local stack objects in some sort of sensible order.
+ /// The heuristic we use is to try and pack them according to static number
+ /// of uses and size in order to minimize code size.
+ void orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
/// convertArgMovsToPushes - This method tries to convert a call sequence
/// that uses sub and mov instructions to put the argument onto the stack
/// into a series of pushes.
@@ -148,14 +166,14 @@ public:
/// Wraps up getting a CFI index and building a MachineInstr for it.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc DL, MCCFIInstruction CFIInst) const;
+ const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
/// Sets up EBP and optionally ESI based on the incoming EBP value. Only
/// needed for 32-bit. Used in funclet prologues and at catchret destinations.
MachineBasicBlock::iterator
restoreWin32EHStackPointers(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- bool RestoreSP = false) const;
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool RestoreSP = false) const;
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
@@ -163,34 +181,35 @@ private:
/// Emit target stack probe as a call to a helper function
MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, bool InProlog) const;
+ const DebugLoc &DL, bool InProlog) const;
/// Emit target stack probe as an inline sequence.
MachineInstr *emitStackProbeInline(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, bool InProlog) const;
+ const DebugLoc &DL, bool InProlog) const;
/// Emit a stub to later inline the target stack probe.
MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, bool InProlog) const;
+ const DebugLoc &DL,
+ bool InProlog) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
unsigned Reg, uint64_t MaxAlign) const;
/// Make small positive stack adjustments using POPs.
bool adjustStackWithPops(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
int Offset) const;
/// Adjusts the stack pointer using LEA, SUB, or ADD.
MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL, int64_t Offset,
+ const DebugLoc &DL, int64_t Offset,
bool InEpilogue) const;
unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 868ae4e19e55..7d53b3db6175 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -157,9 +157,13 @@ namespace {
/// performance.
bool OptForSize;
+ /// If true, selector should try to optimize for minimum code size.
+ bool OptForMinSize;
+
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), OptForSize(false),
+ OptForMinSize(false) {}
const char *getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@@ -192,9 +196,8 @@ namespace {
#include "X86GenDAGISel.inc"
private:
- SDNode *Select(SDNode *N) override;
- SDNode *selectGather(SDNode *N, unsigned Opc);
- SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT);
+ void Select(SDNode *N) override;
+ bool tryGather(SDNode *N, unsigned Opc);
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -238,7 +241,7 @@ namespace {
void emitSpecialCodeForMain();
- inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL,
+ inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
@@ -323,7 +326,7 @@ namespace {
// types.
if (User->getNumOperands() != 2)
continue;
-
+
// Immediates that are used for offsets as part of stack
// manipulation should be left alone. These are typically
// used to indicate SP offsets for argument passing and
@@ -357,12 +360,12 @@ namespace {
}
/// Return a target constant with the specified value of type i8.
- inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
+ inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
}
/// Return a target constant with the specified value, of type i32.
- inline SDValue getI32Imm(unsigned Imm, SDLoc DL) {
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
@@ -531,8 +534,10 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
- // OptForSize is used in pattern predicates that isel is matching.
+ // OptFor[Min]Size are used in pattern predicates that isel is matching.
OptForSize = MF->getFunction()->optForSize();
+ OptForMinSize = MF->getFunction()->optForMinSize();
+ assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
@@ -545,7 +550,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
(N->getOpcode() == X86ISD::TC_RETURN &&
// Only does this if load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
- getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
+ !getTargetMachine().isPositionIndependent())))) {
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
@@ -624,13 +629,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
- SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
- N->getOperand(0),
- MemTmp, MachinePointerInfo(), MemVT,
- false, false, 0);
+ SDValue Store =
+ CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT);
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(),
- MemVT, false, false, false, 0);
+ MachinePointerInfo(), MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havok on the dag because
@@ -657,7 +660,7 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() {
CLI.setChain(CurDAG->getRoot())
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
- std::move(Args), 0);
+ std::move(Args));
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
CurDAG->setRoot(Result.second);
@@ -714,7 +717,7 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// For more information see http://people.redhat.com/drepper/tls.pdf
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
- Subtarget->isTargetLinux())
+ Subtarget->isTargetGlibc())
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -722,6 +725,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
case 257:
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
return false;
+ // Address space 258 is not handled here, because it is not used to
+ // address TLS areas.
}
return true;
@@ -1419,11 +1424,13 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
return false;
X86ISelAddressMode AM;
unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
- // AddrSpace 256 -> GS, 257 -> FS.
+ // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
if (AddrSpace == 256)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == 257)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ if (AddrSpace == 258)
+ AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
SDLoc DL(N);
Base = Mgs->getBasePtr();
@@ -1468,11 +1475,13 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
- // AddrSpace 256 -> GS, 257 -> FS.
+ // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
if (AddrSpace == 256)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == 257)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+ if (AddrSpace == 258)
+ AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
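
  [Illustrative aside, not part of the patch] Address spaces 256-258 are x86-specific conventions for segment-relative memory. A hedged source-level illustration: __seg_gs and __seg_fs are Clang extensions mapping to address spaces 256 and 257, while 258 has no named qualifier and is reached through an explicit address_space attribute.

    // GS-relative load: LLVM IR address space 256, selected with a %gs override.
    int load_gs(int __seg_gs *p) { return *p; }
    // FS-relative load: address space 257 -> %fs override.
    int load_fs(int __seg_fs *q) { return *q; }
    // SS-relative (address space 258) is honoured for ordinary data accesses,
    // but the TLS address formation above deliberately skips it.
    typedef int __attribute__((address_space(258))) ss_int;
    int load_ss(ss_int *r) { return *r; }
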
if (matchAddress(N, AM))
@@ -1569,10 +1578,12 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
+ // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
+ SDLoc DL(N);
+
if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
return false;
- SDLoc DL(N);
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
if (RN && RN->getReg() == 0)
Base = CurDAG->getRegister(0, MVT::i64);
@@ -1612,6 +1623,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
SDValue &Segment) {
X86ISelAddressMode AM;
+  // Save the DL and VT before calling matchAddress; it can invalidate N.
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
// segments.
SDValue Copy = AM.Segment;
@@ -1622,7 +1637,6 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
assert (T == AM.Segment);
AM.Segment = Copy;
- MVT VT = N.getSimpleValueType();
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase)
if (AM.Base_Reg.getNode())
@@ -1662,7 +1676,7 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
if (Complexity <= 2)
return false;
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1713,295 +1727,6 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
-/// Atomic opcode table
-///
-enum AtomicOpc {
- ADD,
- SUB,
- INC,
- DEC,
- OR,
- AND,
- XOR,
- AtomicOpcEnd
-};
-
-enum AtomicSz {
- ConstantI8,
- I8,
- SextConstantI16,
- ConstantI16,
- I16,
- SextConstantI32,
- ConstantI32,
- I32,
- SextConstantI64,
- ConstantI64,
- I64,
- AtomicSzEnd
-};
-
-static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
- {
- X86::LOCK_ADD8mi,
- X86::LOCK_ADD8mr,
- X86::LOCK_ADD16mi8,
- X86::LOCK_ADD16mi,
- X86::LOCK_ADD16mr,
- X86::LOCK_ADD32mi8,
- X86::LOCK_ADD32mi,
- X86::LOCK_ADD32mr,
- X86::LOCK_ADD64mi8,
- X86::LOCK_ADD64mi32,
- X86::LOCK_ADD64mr,
- },
- {
- X86::LOCK_SUB8mi,
- X86::LOCK_SUB8mr,
- X86::LOCK_SUB16mi8,
- X86::LOCK_SUB16mi,
- X86::LOCK_SUB16mr,
- X86::LOCK_SUB32mi8,
- X86::LOCK_SUB32mi,
- X86::LOCK_SUB32mr,
- X86::LOCK_SUB64mi8,
- X86::LOCK_SUB64mi32,
- X86::LOCK_SUB64mr,
- },
- {
- 0,
- X86::LOCK_INC8m,
- 0,
- 0,
- X86::LOCK_INC16m,
- 0,
- 0,
- X86::LOCK_INC32m,
- 0,
- 0,
- X86::LOCK_INC64m,
- },
- {
- 0,
- X86::LOCK_DEC8m,
- 0,
- 0,
- X86::LOCK_DEC16m,
- 0,
- 0,
- X86::LOCK_DEC32m,
- 0,
- 0,
- X86::LOCK_DEC64m,
- },
- {
- X86::LOCK_OR8mi,
- X86::LOCK_OR8mr,
- X86::LOCK_OR16mi8,
- X86::LOCK_OR16mi,
- X86::LOCK_OR16mr,
- X86::LOCK_OR32mi8,
- X86::LOCK_OR32mi,
- X86::LOCK_OR32mr,
- X86::LOCK_OR64mi8,
- X86::LOCK_OR64mi32,
- X86::LOCK_OR64mr,
- },
- {
- X86::LOCK_AND8mi,
- X86::LOCK_AND8mr,
- X86::LOCK_AND16mi8,
- X86::LOCK_AND16mi,
- X86::LOCK_AND16mr,
- X86::LOCK_AND32mi8,
- X86::LOCK_AND32mi,
- X86::LOCK_AND32mr,
- X86::LOCK_AND64mi8,
- X86::LOCK_AND64mi32,
- X86::LOCK_AND64mr,
- },
- {
- X86::LOCK_XOR8mi,
- X86::LOCK_XOR8mr,
- X86::LOCK_XOR16mi8,
- X86::LOCK_XOR16mi,
- X86::LOCK_XOR16mr,
- X86::LOCK_XOR32mi8,
- X86::LOCK_XOR32mi,
- X86::LOCK_XOR32mr,
- X86::LOCK_XOR64mi8,
- X86::LOCK_XOR64mi32,
- X86::LOCK_XOR64mr,
- }
-};
-
-// Return the target constant operand for atomic-load-op and do simple
-// translations, such as from atomic-load-add to lock-sub. The return value is
-// one of the following 3 cases:
-// + target-constant, the operand could be supported as a target constant.
-// + empty, the operand is not needed any more with the new op selected.
-// + non-empty, otherwise.
-static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
- SDLoc dl,
- enum AtomicOpc &Op, MVT NVT,
- SDValue Val,
- const X86Subtarget *Subtarget) {
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
- int64_t CNVal = CN->getSExtValue();
- // Quit if not 32-bit imm.
- if ((int32_t)CNVal != CNVal)
- return Val;
- // Quit if INT32_MIN: it would be negated as it is negative and overflow,
- // producing an immediate that does not fit in the 32 bits available for
- // an immediate operand to sub. However, it still fits in 32 bits for the
- // add (since it is not negated) so we can return target-constant.
- if (CNVal == INT32_MIN)
- return CurDAG->getTargetConstant(CNVal, dl, NVT);
- // For atomic-load-add, we could do some optimizations.
- if (Op == ADD) {
- // Translate to INC/DEC if ADD by 1 or -1.
- if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) {
- Op = (CNVal == 1) ? INC : DEC;
- // No more constant operand after being translated into INC/DEC.
- return SDValue();
- }
- // Translate to SUB if ADD by negative value.
- if (CNVal < 0) {
- Op = SUB;
- CNVal = -CNVal;
- }
- }
- return CurDAG->getTargetConstant(CNVal, dl, NVT);
- }
-
- // If the value operand is single-used, try to optimize it.
- if (Op == ADD && Val.hasOneUse()) {
- // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
- if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
- Op = SUB;
- return Val.getOperand(1);
- }
- // A special case for i16, which needs truncating as, in most cases, it's
- // promoted to i32. We will translate
- // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
- if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
- Val.getOperand(0).getOpcode() == ISD::SUB &&
- X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
- Op = SUB;
- Val = Val.getOperand(0);
- return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
- Val.getOperand(1));
- }
- }
-
- return Val;
-}
-
-SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) {
- if (Node->hasAnyUseOfValue(0))
- return nullptr;
-
- SDLoc dl(Node);
-
- // Optimize common patterns for __sync_or_and_fetch and similar arith
- // operations where the result is not used. This allows us to use the "lock"
- // version of the arithmetic instruction.
- SDValue Chain = Node->getOperand(0);
- SDValue Ptr = Node->getOperand(1);
- SDValue Val = Node->getOperand(2);
- SDValue Base, Scale, Index, Disp, Segment;
- if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
- return nullptr;
-
- // Which index into the table.
- enum AtomicOpc Op;
- switch (Node->getOpcode()) {
- default:
- return nullptr;
- case ISD::ATOMIC_LOAD_OR:
- Op = OR;
- break;
- case ISD::ATOMIC_LOAD_AND:
- Op = AND;
- break;
- case ISD::ATOMIC_LOAD_XOR:
- Op = XOR;
- break;
- case ISD::ATOMIC_LOAD_ADD:
- Op = ADD;
- break;
- }
-
- Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget);
- bool isUnOp = !Val.getNode();
- bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
-
- unsigned Opc = 0;
- switch (NVT.SimpleTy) {
- default: return nullptr;
- case MVT::i8:
- if (isCN)
- Opc = AtomicOpcTbl[Op][ConstantI8];
- else
- Opc = AtomicOpcTbl[Op][I8];
- break;
- case MVT::i16:
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = AtomicOpcTbl[Op][SextConstantI16];
- else
- Opc = AtomicOpcTbl[Op][ConstantI16];
- } else
- Opc = AtomicOpcTbl[Op][I16];
- break;
- case MVT::i32:
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = AtomicOpcTbl[Op][SextConstantI32];
- else
- Opc = AtomicOpcTbl[Op][ConstantI32];
- } else
- Opc = AtomicOpcTbl[Op][I32];
- break;
- case MVT::i64:
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = AtomicOpcTbl[Op][SextConstantI64];
- else if (i64immSExt32(Val.getNode()))
- Opc = AtomicOpcTbl[Op][ConstantI64];
- else
- llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith");
- } else
- Opc = AtomicOpcTbl[Op][I64];
- break;
- }
-
- assert(Opc != 0 && "Invalid arith lock transform!");
-
- // Building the new node.
- SDValue Ret;
- if (isUnOp) {
- SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain };
- Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
- } else {
- SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain };
- Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
- }
-
- // Copying the MachineMemOperand.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
-
- // We need to have two outputs as that is what the original instruction had.
- // So we add a dummy, undefined output. This is safe as we checked first
- // that no-one uses our output anyway.
- SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- dl, NVT), 0);
- SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, dl).getNode();
-}
-
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// or OF bits to be accurate.
static bool hasNoSignedComparisonUses(SDNode *N) {
@@ -2168,7 +1893,7 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
}
/// Customized ISel for GATHER operations.
-SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) {
+bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
// Operands of Gather: VSrc, Base, VIdx, VMask, Scale
SDValue Chain = Node->getOperand(0);
SDValue VSrc = Node->getOperand(2);
@@ -2177,7 +1902,7 @@ SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) {
SDValue VMask = Node->getOperand(5);
ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
if (!Scale)
- return nullptr;
+ return false;
SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
MVT::Other);
@@ -2196,10 +1921,11 @@ SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) {
// of ResNode.
ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
- return ResNode;
+ CurDAG->RemoveDeadNode(Node);
+ return true;
}
-SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
@@ -2210,7 +1936,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
- return nullptr; // Already selected.
+ return; // Already selected.
}
switch (Opcode) {
@@ -2229,10 +1955,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
- ReplaceUses(SDValue(Node, 0), Brind);
+ ReplaceNode(Node, Brind.getNode());
SelectCode(ZextTarget.getNode());
SelectCode(Brind.getNode());
- return nullptr;
+ return;
}
break;
}
@@ -2278,17 +2004,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
}
- SDNode *RetVal = selectGather(Node, Opc);
- if (RetVal)
- // We already called ReplaceUses inside SelectGather.
- return nullptr;
+ if (tryGather(Node, Opc))
+ return;
break;
}
}
break;
}
case X86ISD::GlobalBaseReg:
- return getGlobalBaseReg();
+ ReplaceNode(Node, getGlobalBaseReg());
+ return;
case X86ISD::SHRUNKBLEND: {
// SHRUNKBLEND selects like a regular VSELECT.
@@ -2298,18 +2023,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 0), VSelect);
SelectCode(VSelect.getNode());
// We already called ReplaceUses.
- return nullptr;
+ return;
}
- case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_ADD: {
- SDNode *RetVal = selectAtomicLoadArith(Node, NVT);
- if (RetVal)
- return RetVal;
- break;
- }
case ISD::AND:
case ISD::OR:
case ISD::XOR: {
@@ -2387,10 +2103,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
if (ShlVal == 1)
- return CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
- SDValue(New, 0));
- return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
- getI8Imm(ShlVal, dl));
+ CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
+ SDValue(New, 0));
+ else
+ CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+ getI8Imm(ShlVal, dl));
+ return;
}
case X86ISD::UMUL8:
case X86ISD::SMUL8: {
@@ -2406,9 +2124,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = {N1, InFlag};
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
- ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
- ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
- return nullptr;
+ ReplaceNode(Node, CNode);
+ return;
}
case X86ISD::UMUL: {
@@ -2431,10 +2148,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = {N1, InFlag};
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
- ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
- ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
- ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
- return nullptr;
+ ReplaceNode(Node, CNode);
+ return;
}
case ISD::SMUL_LOHI:
@@ -2506,24 +2221,32 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (foldedLoad) {
SDValue Chain;
+ MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
ResHi = SDValue(CNode, 0);
ResLo = SDValue(CNode, 1);
Chain = SDValue(CNode, 2);
InFlag = SDValue(CNode, 3);
} else {
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
Chain = SDValue(CNode, 0);
InFlag = SDValue(CNode, 1);
}
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
+ // Record the mem-refs
+ LoadSDNode *LoadNode = cast<LoadSDNode>(N1);
+ if (LoadNode) {
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LoadNode->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ }
} else {
SDValue Ops[] = { N1, InFlag };
if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
@@ -2583,7 +2306,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
}
- return nullptr;
+ return;
}
case ISD::SDIVREM:
@@ -2767,7 +2490,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 1), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
- return nullptr;
+ return;
}
case X86ISD::CMP:
@@ -2825,7 +2548,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return nullptr;
+ return;
}
// For example, "testl %eax, $2048" to "testb %ah, $8".
@@ -2862,7 +2585,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return nullptr;
+ return;
}
// For example, "testl %eax, $32776" to "testw %ax, $32776".
@@ -2885,7 +2608,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return nullptr;
+ return;
}
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
@@ -2908,7 +2631,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
- return nullptr;
+ return;
}
}
break;
@@ -2959,21 +2682,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
-
- return Result;
+ CurDAG->RemoveDeadNode(Node);
+ return;
}
}
- SDNode *ResNode = SelectCode(Node);
-
- DEBUG(dbgs() << "=> ";
- if (ResNode == nullptr || ResNode == Node)
- Node->dump(CurDAG);
- else
- ResNode->dump(CurDAG);
- dbgs() << '\n');
-
- return ResNode;
+ SelectCode(Node);
}
bool X86DAGToDAGISel::
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dd9966f9e179..e54711195900 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -71,9 +71,10 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
- : TargetLowering(TM), Subtarget(&STI) {
- X86ScalarSSEf64 = Subtarget->hasSSE2();
- X86ScalarSSEf32 = Subtarget->hasSSE1();
+ : TargetLowering(TM), Subtarget(STI) {
+ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+ X86ScalarSSEf64 = Subtarget.hasSSE2();
+ X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the TargetLowering object.
@@ -86,24 +87,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
- if (Subtarget->isAtom())
+ if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
- else if (Subtarget->is64Bit())
+ else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
- if (Subtarget->hasSlowDivide32())
+ if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
- if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
+ if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 16);
}
- if (Subtarget->isTargetKnownWindowsMSVC()) {
+ if (Subtarget.isTargetKnownWindowsMSVC()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
@@ -117,11 +118,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
- } else if (Subtarget->isTargetWindowsGNU()) {
+ } else if (Subtarget.isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
@@ -134,7 +135,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
@@ -164,14 +165,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
- if (Subtarget->is64Bit()) {
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
// f32/f64 are legal, f80 is custom.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
else
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -185,8 +186,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
- if (!Subtarget->useSoftFloat()) {
- // SSE has no i16 to fp conversion, only i32
+ if (!Subtarget.useSoftFloat()) {
+ // SSE has no i16 to fp conversion, only i32.
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
@@ -205,7 +206,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
- if (!Subtarget->useSoftFloat()) {
+ if (!Subtarget.useSoftFloat()) {
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
@@ -231,8 +232,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
- if (Subtarget->is64Bit()) {
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
@@ -240,9 +241,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (!Subtarget.useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
- if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
@@ -260,12 +261,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
- } else if (!Subtarget->is64Bit())
+ } else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
@@ -295,72 +296,43 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
- setOperationAction(ISD::BR_CC , MVT::f32, Expand);
- setOperationAction(ISD::BR_CC , MVT::f64, Expand);
- setOperationAction(ISD::BR_CC , MVT::f80, Expand);
- setOperationAction(ISD::BR_CC , MVT::f128, Expand);
- setOperationAction(ISD::BR_CC , MVT::i8, Expand);
- setOperationAction(ISD::BR_CC , MVT::i16, Expand);
- setOperationAction(ISD::BR_CC , MVT::i32, Expand);
- setOperationAction(ISD::BR_CC , MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f128, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
- if (Subtarget->is64Bit())
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
+ MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
- if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
- // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
- // is. We should promote the value to 64-bits to solve this.
- // This is what the CRT headers do - `fmodf` is an inline header
- // function casting to f64 and calling `fmod`.
- setOperationAction(ISD::FREM , MVT::f32 , Promote);
- } else {
- setOperationAction(ISD::FREM , MVT::f32 , Expand);
- }
-
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
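  // For reference, a minimal sketch of the CRT-style inline that the removed
  // MSVC comment above describes (assumed shape, not taken from this patch):
  //
  //   static __inline float fmodf(float X, float Y) {
  //     return (float)fmod((double)X, (double)Y);   // promote f32 -> f64
  //   }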
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
- setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
- if (Subtarget->hasBMI()) {
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- } else {
+ setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
+ }
}
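  // The promotion helper used above is assumed to fold the old two-call
  // pattern into a single call (a sketch of the expected behaviour, not part
  // of this patch):
  //
  //   void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
  //     setOperationAction(Opc, OrigVT, Promote);
  //     AddPromotedToType(Opc, OrigVT, DestVT);
  //   }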
- if (Subtarget->hasLZCNT()) {
+ if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
- setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
- AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
@@ -368,7 +340,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
@@ -377,7 +349,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -395,45 +367,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
- if (Subtarget->hasPOPCNT()) {
+ if (Subtarget.hasPOPCNT()) {
setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
- if (!Subtarget->hasMOVBE())
+ if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
- setOperationAction(ISD::SELECT , MVT::i8 , Custom);
- setOperationAction(ISD::SELECT , MVT::i16 , Custom);
- setOperationAction(ISD::SELECT , MVT::i32 , Custom);
- setOperationAction(ISD::SELECT , MVT::f32 , Custom);
- setOperationAction(ISD::SELECT , MVT::f64 , Custom);
- setOperationAction(ISD::SELECT , MVT::f80 , Custom);
- setOperationAction(ISD::SELECT , MVT::f128 , Custom);
- setOperationAction(ISD::SETCC , MVT::i8 , Custom);
- setOperationAction(ISD::SETCC , MVT::i16 , Custom);
- setOperationAction(ISD::SETCC , MVT::i32 , Custom);
- setOperationAction(ISD::SETCC , MVT::f32 , Custom);
- setOperationAction(ISD::SETCC , MVT::f64 , Custom);
- setOperationAction(ISD::SETCC , MVT::f80 , Custom);
- setOperationAction(ISD::SETCC , MVT::f128 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i8 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i16 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::SELECT , MVT::i64 , Custom);
- setOperationAction(ISD::SETCC , MVT::i64 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i64 , Custom);
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SETCCE, VT, Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
@@ -444,34 +405,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// LLVM/Clang supports zero-cost DWARF exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
- setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
- setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
- setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
- setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
- setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
- setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
- setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
- setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
- setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
- setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::ConstantPool , VT, Custom);
+ setOperationAction(ISD::JumpTable , VT, Custom);
+ setOperationAction(ISD::GlobalAddress , VT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
+ setOperationAction(ISD::ExternalSymbol , VT, Custom);
+ setOperationAction(ISD::BlockAddress , VT, Custom);
}
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
- setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
- setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
- setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
- setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
- setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SHL_PARTS, VT, Custom);
+ setOperationAction(ISD::SRA_PARTS, VT, Custom);
+ setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
@@ -480,16 +438,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
- if (Subtarget->hasCmpxchg16b()) {
+ if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
- if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
- !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
+ if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
+ !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
+ TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
@@ -505,14 +468,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::VAARG , MVT::Other, Custom);
- setOperationAction(ISD::VACOPY , MVT::Other, Custom);
- } else {
- // TargetInfo::CharPtrBuiltinVaList
- setOperationAction(ISD::VAARG , MVT::Other, Expand);
- setOperationAction(ISD::VACOPY , MVT::Other, Expand);
- }
+ bool Is64Bit = Subtarget.is64Bit();
+ setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -523,41 +481,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
- if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
+ if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
addRegisterClass(MVT::f64, &X86::FR64RegClass);
- // Use ANDPD to simulate FABS.
- setOperationAction(ISD::FABS , MVT::f64, Custom);
- setOperationAction(ISD::FABS , MVT::f32, Custom);
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS, VT, Custom);
- // Use XORP to simulate FNEG.
- setOperationAction(ISD::FNEG , MVT::f64, Custom);
- setOperationAction(ISD::FNEG , MVT::f32, Custom);
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG, VT, Custom);
- // Use ANDPD and ORPD to simulate FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
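  // Rough scalar model of the bitmask tricks named in the comments above
  // (illustrative sketch only; constants assume IEEE-754 binary32):
  //
  //   uint32_t Bits;  memcpy(&Bits, &X, sizeof(Bits));
  //   Bits & 0x7FFFFFFFu;    // FABS: clear the sign bit (ANDPS/ANDPD)
  //   Bits ^ 0x80000000u;    // FNEG: flip the sign bit  (XORPS/XORPD)
  //   (MagBits & 0x7FFFFFFFu) | (SignBits & 0x80000000u);  // FCOPYSIGN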
- // Lower this to FGETSIGNx86 plus an AND.
+ // Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- // We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-
// Expand FP immediates into loads from the stack, except for the special
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
- } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
+ } else if (UseX87 && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -592,24 +546,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
- setOperationAction(ISD::UNDEF, MVT::f64, Expand);
- setOperationAction(ISD::UNDEF, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ setOperationAction(ISD::UNDEF, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
@@ -626,8 +577,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87, except f128 in MMX.
- if (!Subtarget->useSoftFloat()) {
- if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+ if (UseX87) {
+ if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, &X86::FR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
@@ -680,38 +631,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+ // Some FP actions are always expanded for vector types.
+ for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
+ MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ }
+
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
- setOperationAction(ISD::ADD , VT, Expand);
- setOperationAction(ISD::SUB , VT, Expand);
- setOperationAction(ISD::FADD, VT, Expand);
- setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::FSUB, VT, Expand);
- setOperationAction(ISD::MUL , VT, Expand);
- setOperationAction(ISD::FMUL, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
- setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
- setOperationAction(ISD::LOAD, VT, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
- setOperationAction(ISD::FABS, VT, Expand);
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FMA, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
- setOperationAction(ISD::FSQRT, VT, Expand);
- setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -723,24 +672,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
- setOperationAction(ISD::SHL, VT, Expand);
- setOperationAction(ISD::SRA, VT, Expand);
- setOperationAction(ISD::SRL, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
- setOperationAction(ISD::FLOG, VT, Expand);
- setOperationAction(ISD::FLOG2, VT, Expand);
- setOperationAction(ISD::FLOG10, VT, Expand);
- setOperationAction(ISD::FEXP, VT, Expand);
- setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
@@ -750,7 +688,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
- setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
@@ -774,35 +711,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
- if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
- // MMX-sized vectors (other than x86mmx) are expected to be expanded
- // into smaller operations.
- for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
- setOperationAction(ISD::MULHS, MMXTy, Expand);
- setOperationAction(ISD::AND, MMXTy, Expand);
- setOperationAction(ISD::OR, MMXTy, Expand);
- setOperationAction(ISD::XOR, MMXTy, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
- setOperationAction(ISD::SELECT, MMXTy, Expand);
- setOperationAction(ISD::BITCAST, MMXTy, Expand);
- }
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
-
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
- setOperationAction(ISD::FADD, MVT::v4f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -811,7 +729,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
@@ -821,27 +739,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
- setOperationAction(ISD::ADD, MVT::v16i8, Legal);
- setOperationAction(ISD::ADD, MVT::v8i16, Legal);
- setOperationAction(ISD::ADD, MVT::v4i32, Legal);
- setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
- setOperationAction(ISD::SUB, MVT::v16i8, Legal);
- setOperationAction(ISD::SUB, MVT::v8i16, Legal);
- setOperationAction(ISD::SUB, MVT::v4i32, Legal);
- setOperationAction(ISD::SUB, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
- setOperationAction(ISD::FADD, MVT::v2f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
@@ -870,10 +777,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
// ISD::CTTZ v2i64 - scalarization is faster.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
- // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -899,37 +802,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
}
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+ for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+
+ if (VT == MVT::v2i64 && !Subtarget.is64Bit())
+ continue;
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v2i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v2i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
}
// Custom lower v2i64 and v2f64 selects.
- setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
@@ -942,7 +836,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
// sequence to convert from v2i32 to v2f32.
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
@@ -954,9 +848,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+ for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ // ISD::CTLZ v4i32 - scalarization is faster.
+ // ISD::CTLZ v2i64 - scalarization is faster.
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
@@ -1004,66 +924,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
- // i8 and i16 vectors are custom because the source register and source
- // source memory operand types are not the same width. f32 vectors are
- // custom since the immediate controlling the insert encodes additional
- // information.
+    // i8 vectors are custom because the source register and source
+    // memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
-
- // FIXME: these should be Legal, but that's only for the case where
- // the index is constant. For now custom expand to deal with that.
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
- }
}
- if (Subtarget->hasSSE2()) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
-
- setOperationAction(ISD::SRL, MVT::v8i16, Custom);
- setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+ if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::SHL, MVT::v8i16, Custom);
- setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
- setOperationAction(ISD::SRA, MVT::v8i16, Custom);
- setOperationAction(ISD::SRA, MVT::v16i8, Custom);
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- setOperationAction(ISD::SRL, MVT::v2i64, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+ // XOP can efficiently perform BITREVERSE with VPPERM.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
- setOperationAction(ISD::SHL, MVT::v2i64, Custom);
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SRA, MVT::v2i64, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
}
- if (Subtarget->hasXOP()) {
- setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
- setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
- setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
- setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
- setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
- setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
- setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
- setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
- }
+ if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ bool HasInt256 = Subtarget.hasInt256();
- if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
@@ -1071,35 +953,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
- setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
- setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
-
- setOperationAction(ISD::FADD, MVT::v8f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
- setOperationAction(ISD::FABS, MVT::v8f32, Custom);
-
- setOperationAction(ISD::FADD, MVT::v4f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
- setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
- setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+ for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ }
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
@@ -1117,14 +979,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
- setOperationAction(ISD::SRL, MVT::v16i16, Custom);
- setOperationAction(ISD::SRL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SHL, MVT::v16i16, Custom);
- setOperationAction(ISD::SHL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SRA, MVT::v16i16, Custom);
- setOperationAction(ISD::SRA, MVT::v32i8, Custom);
+ for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
@@ -1147,63 +1006,57 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
+
+ // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
+ // as we end up splitting the 256-bit vectors.
+ for (auto VT : { MVT::v32i8, MVT::v16i16 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
+
+ if (HasInt256)
+ for (auto VT : { MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::CTPOP, MVT::v32i8, Custom);
- setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
-
- setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
- setOperationAction(ISD::CTTZ, MVT::v16i16, Custom);
- setOperationAction(ISD::CTTZ, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
-
- if (Subtarget->hasAnyFMA()) {
- setOperationAction(ISD::FMA, MVT::v8f32, Legal);
- setOperationAction(ISD::FMA, MVT::v4f64, Legal);
- setOperationAction(ISD::FMA, MVT::v4f32, Legal);
- setOperationAction(ISD::FMA, MVT::v2f64, Legal);
- setOperationAction(ISD::FMA, MVT::f32, Legal);
- setOperationAction(ISD::FMA, MVT::f64, Legal);
- }
-
- if (Subtarget->hasInt256()) {
- setOperationAction(ISD::ADD, MVT::v4i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i32, Legal);
- setOperationAction(ISD::ADD, MVT::v16i16, Legal);
- setOperationAction(ISD::ADD, MVT::v32i8, Legal);
-
- setOperationAction(ISD::SUB, MVT::v4i64, Legal);
- setOperationAction(ISD::SUB, MVT::v8i32, Legal);
- setOperationAction(ISD::SUB, MVT::v16i16, Legal);
- setOperationAction(ISD::SUB, MVT::v32i8, Legal);
-
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Legal);
- setOperationAction(ISD::MUL, MVT::v16i16, Legal);
- setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
-
- setOperationAction(ISD::SMAX, MVT::v32i8, Legal);
- setOperationAction(ISD::SMAX, MVT::v16i16, Legal);
- setOperationAction(ISD::SMAX, MVT::v8i32, Legal);
- setOperationAction(ISD::UMAX, MVT::v32i8, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v8i32, Legal);
- setOperationAction(ISD::SMIN, MVT::v32i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v16i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i32, Legal);
- setOperationAction(ISD::UMIN, MVT::v32i8, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v8i32, Legal);
+ if (Subtarget.hasAnyFMA()) {
+ for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
+ MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::FMA, VT, Legal);
+ }
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
+ }
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+ }
+
+ if (HasInt256) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
@@ -1223,62 +1076,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
- } else {
- setOperationAction(ISD::ADD, MVT::v4i64, Custom);
- setOperationAction(ISD::ADD, MVT::v8i32, Custom);
- setOperationAction(ISD::ADD, MVT::v16i16, Custom);
- setOperationAction(ISD::ADD, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SUB, MVT::v4i64, Custom);
- setOperationAction(ISD::SUB, MVT::v8i32, Custom);
- setOperationAction(ISD::SUB, MVT::v16i16, Custom);
- setOperationAction(ISD::SUB, MVT::v32i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Custom);
- setOperationAction(ISD::MUL, MVT::v16i16, Custom);
- setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v32i8, Custom);
- setOperationAction(ISD::SMAX, MVT::v16i16, Custom);
- setOperationAction(ISD::SMAX, MVT::v8i32, Custom);
- setOperationAction(ISD::UMAX, MVT::v32i8, Custom);
- setOperationAction(ISD::UMAX, MVT::v16i16, Custom);
- setOperationAction(ISD::UMAX, MVT::v8i32, Custom);
- setOperationAction(ISD::SMIN, MVT::v32i8, Custom);
- setOperationAction(ISD::SMIN, MVT::v16i16, Custom);
- setOperationAction(ISD::SMIN, MVT::v8i32, Custom);
- setOperationAction(ISD::UMIN, MVT::v32i8, Custom);
- setOperationAction(ISD::UMIN, MVT::v16i16, Custom);
- setOperationAction(ISD::UMIN, MVT::v8i32, Custom);
}
// In the customized shift lowering, the legal cases in AVX2 will be
// recognized.
- setOperationAction(ISD::SRL, MVT::v4i64, Custom);
- setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
- setOperationAction(ISD::SHL, MVT::v4i64, Custom);
- setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
- setOperationAction(ISD::SRA, MVT::v4i64, Custom);
- setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v4f32, MVT::v2f64 }) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
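    // For example (illustrative only): (v4f32 (extract_subvector (v8f32 %src), 4))
    // produces the upper 128-bit half of a 256-bit source, which is why the
    // 128-bit result types above need Custom handling here.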
// Custom lower several nodes for 256-bit types.
- for (MVT VT : MVT::vector_valuetypes()) {
- if (VT.getScalarSizeInBits() >= 32) {
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- }
- // Extract subvector is special because the value type
- // (result) is 128-bit but the source is 256-bit wide.
- if (VT.is128BitVector()) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- }
- // Do not attempt to custom lower other non-256-bit vectors
- if (!VT.is256BitVector())
- continue;
-
+ for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1289,25 +1112,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
- if (Subtarget->hasInt256())
+ if (HasInt256)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v4i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v4i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
}
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
@@ -1320,19 +1138,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
-
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ }
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::SETCCE, MVT::i1, Custom);
@@ -1343,29 +1156,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::i1, Custom);
setOperationAction(ISD::ADD, MVT::i1, Custom);
setOperationAction(ISD::MUL, MVT::i1, Custom);
- setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
- setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
- setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
- setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
-
- setOperationAction(ISD::FADD, MVT::v16f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
- setOperationAction(ISD::FABS, MVT::v16f32, Custom);
-
- setOperationAction(ISD::FADD, MVT::v8f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
- setOperationAction(ISD::FABS, MVT::v8f64, Custom);
- setOperationAction(ISD::FMA, MVT::v8f64, Legal);
- setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+
+ for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
+ MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
+ MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
+ setTruncStoreAction(VT, MaskVT, Custom);
+ }
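    // Worked example of the loop above (illustrative only): for VT = v8i64 the
    // computed MaskVT is v8i1, so extending loads from a v8i1 mask into v8i64
    // and the matching v8i64 -> v8i1 truncating store are custom-lowered.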
+
+ for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
+ }
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
@@ -1389,7 +1195,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
- if (Subtarget->hasVLX()){
+ if (Subtarget.hasVLX()){
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
@@ -1412,15 +1218,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
- if (Subtarget->hasDQI()) {
- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
-
+ setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
+ if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
@@ -1431,7 +1236,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
}
}
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1440,7 +1245,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+
+      // FIXME: These extending loads are also available on SSE/AVX2; add the
+      // relevant patterns.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
}
+
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
@@ -1453,20 +1273,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget->hasDQI()) {
+ if (Subtarget.hasDQI()) {
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
}
- setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
+ for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ }
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
@@ -1501,139 +1318,115 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v16i32, Legal);
-
- setOperationAction(ISD::SUB, MVT::v8i64, Legal);
- setOperationAction(ISD::SUB, MVT::v16i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v16i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v8i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v16i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v8i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v16i1, Expand);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
- setOperationAction(ISD::SRL, MVT::v8i64, Custom);
- setOperationAction(ISD::SRL, MVT::v16i32, Custom);
-
- setOperationAction(ISD::SHL, MVT::v8i64, Custom);
- setOperationAction(ISD::SHL, MVT::v16i32, Custom);
-
- setOperationAction(ISD::SRA, MVT::v8i64, Custom);
- setOperationAction(ISD::SRA, MVT::v16i32, Custom);
-
- setOperationAction(ISD::AND, MVT::v8i64, Legal);
- setOperationAction(ISD::OR, MVT::v8i64, Legal);
- setOperationAction(ISD::XOR, MVT::v8i64, Legal);
- setOperationAction(ISD::AND, MVT::v16i32, Legal);
- setOperationAction(ISD::OR, MVT::v16i32, Legal);
- setOperationAction(ISD::XOR, MVT::v16i32, Legal);
+ for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
- if (Subtarget->hasCDI()) {
+ if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
} else {
setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
}
- } // Subtarget->hasCDI()
- if (Subtarget->hasDQI()) {
- setOperationAction(ISD::MUL, MVT::v2i64, Legal);
- setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasDQI()) {
+ if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ }
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
// Custom lower several nodes.
- for (MVT VT : MVT::vector_valuetypes()) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (EltSize == 1) {
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
- }
- if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
- }
- // Extract subvector is special because the value type
- // (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector()) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- }
- if (VT.getVectorElementType() == MVT::i1)
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
-
- // Do not attempt to custom lower other non-512-bit vectors
- if (!VT.is512BitVector())
- continue;
-
- if (EltSize >= 32) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::MGATHER, VT, Legal);
- setOperationAction(ISD::MSCATTER, VT, Custom);
- }
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ // Extract subvector is special because the value type
+ // (result) is 256-bit but the source is 512-bit wide.
+ // 128-bit was made Custom under AVX1.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
+ MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
}// has AVX-512
- if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
- setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v64i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v32i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v64i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v32i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
- setOperationAction(ISD::ADD, MVT::v32i16, Legal);
- setOperationAction(ISD::ADD, MVT::v64i8, Legal);
- setOperationAction(ISD::SUB, MVT::v32i16, Legal);
- setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
@@ -1646,12 +1439,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
@@ -1667,6 +1463,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
@@ -1679,36 +1480,59 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
- if (Subtarget->hasVLX())
+ if (Subtarget.hasVLX())
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- if (Subtarget->hasCDI()) {
+ LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
+ for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Action);
+ setOperationAction(ISD::MSTORE, VT, Action);
+ }
+
+ if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand);
}
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
-
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v8i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v8i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+
+ setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
+ }
+
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ if (Subtarget.hasVLX()) {
+      // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
+ setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
+ }
}
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ setOperationAction(ISD::ADD, MVT::v2i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v4i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v4i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v4i1, Expand);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
@@ -1721,31 +1545,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ }
- setOperationAction(ISD::AND, MVT::v8i32, Legal);
- setOperationAction(ISD::OR, MVT::v8i32, Legal);
- setOperationAction(ISD::XOR, MVT::v8i32, Legal);
- setOperationAction(ISD::AND, MVT::v4i32, Legal);
- setOperationAction(ISD::OR, MVT::v4i32, Legal);
- setOperationAction(ISD::XOR, MVT::v4i32, Legal);
- setOperationAction(ISD::SRA, MVT::v2i64, Custom);
- setOperationAction(ISD::SRA, MVT::v4i64, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
- setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ }
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- if (!Subtarget->is64Bit()) {
+ if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
@@ -1757,7 +1578,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
- if (VT == MVT::i64 && !Subtarget->is64Bit())
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
@@ -1768,7 +1589,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, VT, Custom);
}
- if (!Subtarget->is64Bit()) {
+ if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
@@ -1776,10 +1597,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into one node or libcall if possible.
- if (Subtarget->hasSinCos()) {
+ if (Subtarget.hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// For MacOSX, we don't want the normal expansion of a libcall to sincos.
// We want to issue a libcall to __sincos_stret to avoid memory traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
@@ -1787,7 +1608,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (Subtarget->isTargetWin64()) {
+ if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
@@ -1796,6 +1617,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
+ // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
+ // is. We should promote the value to 64-bits to solve this.
+ // This is what the CRT headers do - `fmodf` is an inline header
+ // function casting to f64 and calling `fmod`.
+ if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
+ for (ISD::NodeType Op :
+ {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
+ ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+
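As background for the comment above: the promotion mirrors what the CRT itself does for the f32 math routines. A minimal sketch, assuming the usual inline-wrapper trick (illustrative only, not the Microsoft header source):

    #include <math.h>

    // Sketch: 32-bit MSVC effectively implements fmodf by widening to double
    // and calling fmod, so promoting the f32 operation to f64 in the backend
    // reproduces the library behaviour. The same reasoning covers the other
    // f32 libcalls listed above (sin, cos, pow, ...), hence the loop over Ops.
    static inline float fmodf_sketch(float X, float Y) {
      return static_cast<float>(fmod(static_cast<double>(X),
                                     static_cast<double>(Y)));
    }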
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1827,13 +1659,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
- computeRegisterProperties(Subtarget->getRegisterInfo());
+ computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
@@ -1843,9 +1674,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemmoveOptSize = 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- // A predictable cmov does not hurt on an in-order CPU.
- // FIXME: Use a CPU attribute to trigger this, not a CPU model.
- PredictableSelectIsExpensive = !Subtarget->isAtom();
+ // An out-of-order CPU can speculatively execute past a predictable branch,
+ // but a conditional move could be stalled by an expensive earlier operation.
+ PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
@@ -1854,7 +1685,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
- return Subtarget->isTargetMachO() && Subtarget->is64Bit();
+ return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
TargetLoweringBase::LegalizeTypeAction
@@ -1867,24 +1698,25 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
-EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
+ return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
const unsigned NumElts = VVT.getVectorNumElements();
- const MVT EltVT = VVT.getVectorElementType();
+ MVT EltVT = VVT.getVectorElementType();
if (VVT.is512BitVector()) {
- if (Subtarget->hasAVX512())
+ if (Subtarget.hasAVX512())
if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
EltVT == MVT::f32 || EltVT == MVT::f64)
switch(NumElts) {
case 8: return MVT::v8i1;
case 16: return MVT::v16i1;
}
- if (Subtarget->hasBWI())
+ if (Subtarget.hasBWI())
if (EltVT == MVT::i8 || EltVT == MVT::i16)
switch(NumElts) {
case 32: return MVT::v32i1;
@@ -1892,23 +1724,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
}
}
- if (VVT.is256BitVector() || VVT.is128BitVector()) {
- if (Subtarget->hasVLX())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 2: return MVT::v2i1;
- case 4: return MVT::v4i1;
- case 8: return MVT::v8i1;
- }
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- case 32: return MVT::v32i1;
- }
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
+ return MVT::getVectorVT(MVT::i1, NumElts);
+
+ if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
+ EVT LegalVT = getTypeToTransformTo(Context, VT);
+ EltVT = LegalVT.getVectorElementType().getSimpleVT();
}
+
+ if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
+ switch(NumElts) {
+ case 2: return MVT::v2i1;
+ case 4: return MVT::v4i1;
+ case 8: return MVT::v8i1;
+ }
}
return VT.changeVectorElementTypeToInteger();
@@ -1945,7 +1774,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
@@ -1954,7 +1783,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
}
unsigned Align = 4;
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
@@ -1977,35 +1806,40 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
- if ((!IsMemset || ZeroMemset) &&
- !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
- (!Subtarget->isUnalignedMem16Slow() ||
+ (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
- if (Size >= 32) {
- // FIXME: Check if unaligned 32-byte accesses are slow.
- if (Subtarget->hasInt256())
- return MVT::v8i32;
- if (Subtarget->hasFp256())
- return MVT::v8f32;
+ // FIXME: Check if unaligned 32-byte accesses are slow.
+ if (Size >= 32 && Subtarget.hasAVX()) {
+ // Although this isn't a well-supported type for AVX1, we'll let
+ // legalization and shuffle lowering produce the optimal codegen. If we
+ // choose an optimal type with a vector element larger than a byte,
+ // getMemsetStores() may create an intermediate splat (using an integer
+ // multiply) before we splat as a vector.
+ return MVT::v32i8;
}
- if (Subtarget->hasSSE2())
- return MVT::v4i32;
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE2())
+ return MVT::v16i8;
+ // TODO: Can SSE1 handle a byte vector?
+ if (Subtarget.hasSSE1())
return MVT::v4f32;
- } else if (!MemcpyStrSrc && Size >= 8 &&
- !Subtarget->is64Bit() &&
- Subtarget->hasSSE2()) {
+ } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
+ !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
+ // Also, do not use f64 to lower memset unless this is a memset of zeros.
+ // The gymnastics of splatting a byte value into an XMM register and then
+ // only using 8-byte stores (because this is a CPU with slow unaligned
+ // 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
- if (Subtarget->is64Bit() && Size >= 8)
+ if (Subtarget.is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
}
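A worked example of the new type choice (an assumption from reading the logic above, not part of the patch): a 64-byte memcpy on an AVX-capable target with sufficiently aligned or fast-unaligned operands is now lowered with MVT::v32i8, i.e. a pair of 32-byte vector loads and stores, where the old code would have picked v8i32 or v8f32.

    // Hypothetical illustration; the lowering type is decided by
    // getOptimalMemOpType() above, not by anything in this snippet.
    void copy64(char *Dst, const char *Src) {
      __builtin_memcpy(Dst, Src, 64); // expected: two 32-byte vector moves on AVX
    }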
@@ -2030,10 +1864,10 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
*Fast = true;
break;
case 128:
- *Fast = !Subtarget->isUnalignedMem16Slow();
+ *Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
- *Fast = !Subtarget->isUnalignedMem32Slow();
+ *Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
@@ -2048,8 +1882,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT())
+ if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
@@ -2057,15 +1890,14 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
}
bool X86TargetLowering::useSoftFloat() const {
- return Subtarget->useSoftFloat();
+ return Subtarget.useSoftFloat();
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
- assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT());
+ assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
@@ -2075,7 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
@@ -2089,7 +1921,7 @@ const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
- if (Subtarget->isPICStyleRIPRel())
+ if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
@@ -2105,7 +1937,7 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
- RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+ RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
@@ -2121,47 +1953,76 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
return std::make_pair(RRC, Cost);
}
-bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
- unsigned &Offset) const {
- if (!Subtarget->isTargetLinux())
- return false;
+unsigned X86TargetLowering::getAddressSpace() const {
+ if (Subtarget.is64Bit())
+ return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
+ return 256;
+}
- if (Subtarget->is64Bit()) {
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- Offset = 0x28;
- if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
- AddressSpace = 256;
- else
- AddressSpace = 257;
- } else {
- // %gs:0x14 on i386
- Offset = 0x14;
- AddressSpace = 256;
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc has a special slot for the stack guard in tcbhead_t, use it instead
+ // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (!Subtarget.isTargetGlibc())
+ return TargetLowering::getIRStackGuard(IRB);
+
+ // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+ // %gs:0x14 on i386
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ unsigned AddressSpace = getAddressSpace();
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
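For reference, the constants used above come from the glibc TCB layout; the following is a rough sketch of the x86-64 structure (an approximation of sysdeps/x86_64/nptl/tls.h, shown only to explain the 0x28 offset — on i386 the slot sits at 0x14, and address spaces 256/257 are how the X86 backend models %gs- and %fs-relative addressing, as returned by getAddressSpace()):

    // Abridged, approximate layout; not the verbatim glibc header.
    struct tcbhead_t_sketch {
      void *tcb;                   // 0x00
      void *dtv;                   // 0x08
      void *self;                  // 0x10
      int multiple_threads;        // 0x18
      int gscope_flag;             // 0x1c
      unsigned long sysinfo;       // 0x20
      unsigned long stack_guard;   // 0x28  <- read via %fs:0x28 by the stack protector
      unsigned long pointer_guard; // 0x30
    };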
+void X86TargetLowering::insertSSPDeclarations(Module &M) const {
+ // MSVC CRT provides functionalities for stack protection.
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal("__security_cookie",
+ Type::getInt8PtrTy(M.getContext()));
+
+ // MSVC CRT has a function to validate security cookie.
+ auto *SecurityCheckCookie = cast<Function>(
+ M.getOrInsertFunction("__security_check_cookie",
+ Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()), nullptr));
+ SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
+ SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ return;
}
- return true;
+ // glibc has a special slot for the stack guard.
+ if (Subtarget.isTargetGlibc())
+ return;
+ TargetLowering::insertSSPDeclarations(M);
+}
+
+Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
+ // MSVC CRT has a global variable holding security cookie.
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
+ return M.getGlobalVariable("__security_cookie");
+ return TargetLowering::getSDagStackGuard(M);
+}
+
+Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
+ return M.getFunction("__security_check_cookie");
+ return TargetLowering::getSSPStackGuardCheck(M);
}
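The three hooks above wire the stack protector to the interface the MSVC CRT ships. Roughly, and only as a sketch of the runtime side rather than a copy of Microsoft's headers, the CRT provides a global cookie plus a __fastcall checker, which is why the code marks the callee X86_FastCall with an InReg first argument:

    // Sketch of the assumed CRT interface (MSVC-only keywords). The real CRT
    // declares the cookie as uintptr_t; i8* is simply how the code above
    // models it at the IR level.
    extern "C" void *__security_cookie;
    extern "C" void __fastcall __security_check_cookie(void *StackCookie);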
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
+ if (!Subtarget.isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
unsigned AddressSpace, Offset;
- if (Subtarget->is64Bit()) {
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- Offset = 0x48;
- if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
- AddressSpace = 256;
- else
- AddressSpace = 257;
- } else {
- // %gs:0x24 on i386
- Offset = 0x24;
- AddressSpace = 256;
- }
+ // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+ // %gs:0x24 on i386
+ Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ AddressSpace = getAddressSpace();
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
@@ -2194,11 +2055,11 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
}
SDValue
-X86TargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
@@ -2214,10 +2075,10 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
- MVT::i16));
+ MVT::i32));
// Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue ValToCopy = OutVals[i];
@@ -2244,14 +2105,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
if (ValVT == MVT::f64 &&
- (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -2269,7 +2130,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
@@ -2277,7 +2138,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
@@ -2288,6 +2149,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
@@ -2298,11 +2162,30 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
- SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ // When we have both sret and another return value, we should use the
+ // original Chain stored in RetOps[0], instead of the current Chain updated
+  // in the above loop. If we only have sret, RetOps[0] equals Chain.
+
+ // For the case of sret and another return value, we have
+ // Chain_0 at the function entry
+ // Chain_1 = getCopyToReg(Chain_0) in the above loop
+ // If we use Chain_1 in getCopyFromReg, we will have
+ // Val = getCopyFromReg(Chain_1)
+ // Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+ // getCopyToReg(Chain_0) will be glued together with
+ // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+  // in Unit B, and we will have a cyclic dependency between Unit A and Unit B:
+ // Data dependency from Unit B to Unit A due to usage of Val in
+ // getCopyToReg(Chain_1, Val)
+ // Chain dependency from Unit A to Unit B
+
+ // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
+ SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
- = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+ = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
@@ -2312,7 +2195,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
- const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -2337,9 +2220,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
- if (N->getNumValues() != 1)
- return false;
- if (!N->hasNUsesOfValue(1, 0))
+ if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
@@ -2375,15 +2256,19 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
return true;
}
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
- ISD::NodeType ExtendKind) const {
- MVT ReturnMVT;
- // TODO: Is this also valid on 32-bit?
- if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT = MVT::i32;
+
+ bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
+ if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
+ // The ABI does not require i1, i8 or i16 to be extended.
+ //
+ // On Darwin, there is code in the wild relying on Clang's old behaviour of
+ // always extending i8/i16 return values, so keep doing that for now.
+ // (PR26665).
ReturnMVT = MVT::i8;
- else
- ReturnMVT = MVT::i32;
+ }
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
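A small illustration of the effect (an assumption about the resulting ABI behaviour, not stated in the patch): for a callee like the one below, a non-Darwin x86 target no longer has to extend the i16 result to 32 bits before returning, while Darwin keeps the old always-extend behaviour for compatibility with existing binaries (PR26665).

    // Hypothetical example; the interesting part is the return-value width,
    // which getTypeForExtReturn() above now leaves at 16 bits off Darwin.
    short bump(short x) {
      return static_cast<short>(x + 1);
    }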
@@ -2392,16 +2277,14 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
-SDValue
-X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue X86TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool Is64Bit = Subtarget->is64Bit();
+ bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -2413,7 +2296,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
@@ -2422,6 +2305,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
+ if (!Subtarget.hasX87())
+ report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
@@ -2492,10 +2377,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
-static SDValue
-CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
- ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -2549,13 +2433,11 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
}
SDValue
-X86TargetLowering::LowerMemArgument(SDValue Chain,
- CallingConv::ID CallConv,
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
- MachineFrameInfo *MFI,
- unsigned i) const {
+ MachineFrameInfo *MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
@@ -2602,6 +2484,14 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
+
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI->setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI->setObjectSExt(FI, true);
+ }
+
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI->setObjectOffset(FI, Offset);
@@ -2610,8 +2500,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem ?
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
}
@@ -2619,10 +2508,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
- const X86Subtarget *Subtarget) {
- assert(Subtarget->is64Bit());
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
- if (Subtarget->isCallingConvWin64(CallConv)) {
+ if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
@@ -2638,9 +2527,9 @@ static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
- const X86Subtarget *Subtarget) {
- assert(Subtarget->is64Bit());
- if (Subtarget->isCallingConvWin64(CallConv)) {
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
+ if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
@@ -2650,10 +2539,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
const Function *Fn = MF.getFunction();
bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
- bool isSoftFloat = Subtarget->useSoftFloat();
+ bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
- if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+ if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
@@ -2667,21 +2556,21 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Function* Fn = MF.getFunction();
+ const Function *Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
- Subtarget->isTargetCygMing() &&
+ Subtarget.isTargetCygMing() &&
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo *MFI = MF.getFrameInfo();
- bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
@@ -2778,13 +2667,18 @@ SDValue X86TargetLowering::LowerFormalArguments(
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect)
- ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
- MachinePointerInfo(), false, false, false, 0);
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // Swift calling convention does not require we copy the sret argument
+ // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
+ if (CallConv == CallingConv::Swift)
+ continue;
+
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
@@ -2819,7 +2713,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
// Figure out if XMM registers are in use.
- assert(!(Subtarget->useSoftFloat() &&
+ assert(!(Subtarget.useSoftFloat() &&
Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
@@ -2831,7 +2725,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
- assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
@@ -2865,7 +2759,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
- // they may be loaded by deferencing the result of va_next.
+ // they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
@@ -2884,8 +2778,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
+ FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
@@ -2913,13 +2806,13 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget->hasAVX512() &&
+ if (Subtarget.hasAVX512() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
- else if (Subtarget->hasAVX())
+ else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
- else if (Subtarget->hasSSE2())
+ else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
@@ -2960,8 +2853,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget->getTargetTriple().isOSMSVCRT() &&
- argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn)
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2987,7 +2880,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
- // space to accomodate holding this slot at the correct offset).
+ // space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
@@ -2996,12 +2889,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
return Chain;
}
-SDValue
-X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
- SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
@@ -3011,24 +2903,20 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
return DAG.getStore(
Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
-SDValue
-X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
- SDValue &OutRetAddr, SDValue Chain,
- bool IsTailCall, bool Is64Bit,
- int FPDiff, SDLoc dl) const {
+SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
+ SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
- OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
- false, false, false, 0);
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
return SDValue(OutRetAddr.getNode(), 1);
}
@@ -3037,7 +2925,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
- int FPDiff, SDLoc dl) {
+ int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -3047,21 +2935,20 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), NewReturnAddrFI),
- false, false, 0);
+ DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
@@ -3079,9 +2966,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU());
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
@@ -3092,7 +2979,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget->isPICStyleGOT() &&
+ if (Subtarget.isPICStyleGOT() &&
!MF.getTarget().Options.GuaranteedTailCallOpt) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
@@ -3195,7 +3082,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -3238,8 +3125,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
break;
}
@@ -3273,7 +3159,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
- if (Subtarget->isPICStyleGOT()) {
+ if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
@@ -3314,7 +3200,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
- assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
@@ -3377,8 +3263,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, 0));
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
@@ -3416,70 +3301,29 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// non-JIT mode.
const GlobalValue *GV = G->getGlobal();
if (!GV->hasDLLImportStorageClass()) {
- unsigned char OpFlags = 0;
- bool ExtraLoad = false;
- unsigned WrapperKind = ISD::DELETED_NODE;
-
- // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
- // external symbols most go through the PLT in PIC mode. If the symbol
- // has hidden or protected visibility, or if it is static or local, then
- // we don't need to use the PLT - we can directly call it.
- if (Subtarget->isTargetELF() &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
- GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- !GV->isStrongDefinitionForLinker() &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
- cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
- // If the function is marked as non-lazy, generate an indirect call
- // which loads from the GOT directly. This avoids runtime overhead
- // at the cost of eager binding (and one extra byte of encoding).
- OpFlags = X86II::MO_GOTPCREL;
- WrapperKind = X86ISD::WrapperRIP;
- ExtraLoad = true;
- }
+ unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
Callee = DAG.getTargetGlobalAddress(
GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
- // Add a wrapper if needed.
- if (WrapperKind != ISD::DELETED_NODE)
+ if (OpFlags == X86II::MO_GOTPCREL) {
+ // Add a wrapper.
Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- // Add extra indirection if needed.
- if (ExtraLoad)
+ getPointerTy(DAG.getDataLayout()), Callee);
+ // Add extra indirection
Callee = DAG.getLoad(
getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
- false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- unsigned char OpFlags = 0;
-
- // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
- // external symbols should go through the PLT.
- if (Subtarget->isTargetELF() &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- }
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ unsigned char OpFlags =
+ Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
Callee = DAG.getTargetExternalSymbol(
S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
- } else if (Subtarget->isTarget64BitILP32() &&
+ } else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
@@ -3552,7 +3396,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget->getTargetTriple().isOSMSVCRT() &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
@@ -3562,6 +3406,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
+ if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
+    // No need to reset the stack after the call if the call doesn't return.
+    // To keep the MI verifier happy, we'll pretend the callee does it for us.
+ NumBytesForCalleeToPop = NumBytes;
+ }
+
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
@@ -3614,8 +3464,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
@@ -3636,8 +3486,28 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
- const X86InstrInfo *TII) {
+ const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ if (Op == ISD::TRUNCATE) {
+ const SDValue &TruncInput = Arg.getOperand(0);
+ if (TruncInput.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+ Arg.getValueType()) {
+ Arg = TruncInput.getOperand(0);
+ continue;
+ }
+ }
+ break;
+ }
+
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
@@ -3647,7 +3517,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Def)
return false;
if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(Def, FI))
+ if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
@@ -3682,7 +3552,20 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
assert(FI != INT_MAX);
if (!MFI->isFixedObjectIndex(FI))
return false;
- return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+
+ if (Offset != MFI->getObjectOffset(FI))
+ return false;
+
+ if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
+ // If the argument location is wider than the argument type, check that any
+ // extension flags match.
+ if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
+ Flags.isSExt() != MFI->isObjectSExt(FI)) {
+ return false;
+ }
+ }
+
+ return Bytes == MFI->getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
@@ -3708,8 +3591,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
- bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
- bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -3728,7 +3611,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3739,6 +3622,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
+ LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
@@ -3746,8 +3630,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
return false;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3767,8 +3650,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -3777,34 +3659,17 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ RetCC_X86, RetCC_X86))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
-
- SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
-
- if (RVLocs1.size() != RVLocs2.size())
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
unsigned StackArgsSize = 0;
@@ -3815,8 +3680,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
@@ -3830,7 +3694,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3839,26 +3703,25 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
- MFI, MRI, TII))
+ MFI, MRI, TII, VA))
return false;
}
}
}
+ bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
- if (!Subtarget->is64Bit() &&
- ((!isa<GlobalAddressSDNode>(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee)) ||
- DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+ if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
- unsigned MaxInRegs =
- (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3874,10 +3737,14 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
}
}
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
}
bool CalleeWillPop =
- X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+ X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
@@ -3923,6 +3790,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
@@ -3935,16 +3804,30 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
+ case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
+ case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VZEXT_MOVL:
return true;
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+static bool isTargetShuffleVariableMask(unsigned Opcode) {
+ switch (Opcode) {
+ default: return false;
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMILPV:
+ return true;
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3959,7 +3842,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3978,7 +3861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -4047,17 +3930,20 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
- default: llvm_unreachable("Invalid integer condition!");
- case X86::COND_E: return true;
- case X86::COND_G: return false;
- case X86::COND_GE: return false;
- case X86::COND_L: return false;
- case X86::COND_LE: return false;
- case X86::COND_NE: return true;
- case X86::COND_B: return true;
- case X86::COND_A: return true;
- case X86::COND_BE: return true;
- case X86::COND_AE: return true;
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E:
+ case X86::COND_NE:
+ case X86::COND_B:
+ case X86::COND_A:
+ case X86::COND_BE:
+ case X86::COND_AE:
+ return true;
+ case X86::COND_G:
+ case X86::COND_GE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ return false;
}
}
@@ -4080,8 +3966,9 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
-static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
- SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+ bool isFP, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
@@ -4181,24 +4068,50 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!IntrData)
return false;
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.readMem = false;
+ Info.writeMem = false;
+ Info.vol = false;
+ Info.offset = 0;
+
switch (IntrData->Type) {
- case LOADA:
- case LOADU: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(I.getType());
+ case EXPAND_FROM_MEM: {
Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
- Info.vol = false;
+ Info.memVT = MVT::getVT(I.getType());
+ Info.align = 1;
Info.readMem = true;
- Info.writeMem = false;
- return true;
+ break;
}
- default:
+ case COMPRESS_TO_MEM: {
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
+ Info.align = 1;
+ Info.writeMem = true;
break;
}
+ case TRUNCATE_TO_MEM_VI8:
+ case TRUNCATE_TO_MEM_VI16:
+ case TRUNCATE_TO_MEM_VI32: {
+ Info.ptrVal = I.getArgOperand(0);
+ MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
+ MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
+ ScalarVT = MVT::i8;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
+ ScalarVT = MVT::i16;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
+ ScalarVT = MVT::i32;
+
+ Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
+ Info.align = 1;
+ Info.writeMem = true;
+ break;
+ }
+ default:
+ return false;
+ }
- return false;
+ return true;
}
/// Returns true if the target can instruction select the
@@ -4246,12 +4159,24 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
- return Subtarget->hasBMI();
+ return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
- return Subtarget->hasLZCNT();
+ return Subtarget.hasLZCNT();
+}
+
+bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+ if (!Subtarget.hasBMI())
+ return false;
+
+ // There are only 32-bit and 64-bit forms for 'andn'.
+ EVT VT = Y.getValueType();
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ return true;
}
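
Editorial aside (not part of the patch): BMI's ANDN computes ~X & Y in a single instruction, and only 32- and 64-bit forms exist, which is what the hook above checks for. A minimal standalone model of the operation, assuming nothing beyond standard C++:

#include <cstdint>
#include <cassert>

// What the BMI ANDN instruction computes, for the two supported widths.
static uint32_t andn32(uint32_t X, uint32_t Y) { return ~X & Y; }
static uint64_t andn64(uint64_t X, uint64_t Y) { return ~X & Y; }

int main() {
  assert(andn32(0x0F0F0F0Fu, 0xFFFFFFFFu) == 0xF0F0F0F0u);
  assert(andn64(1, 3) == 2);
  return 0;
}
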
/// Return true if every element in Mask, beginning
@@ -4269,11 +4194,26 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (!isUndefOrInRange(M, Low, Hi))
+ return false;
+ return true;
+}
+
/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return (Val < 0 || Val == CmpVal);
}
+/// Val is either the undef or zero sentinel value.
+static bool isUndefOrZero(int Val) {
+ return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
+}
+
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
@@ -4285,6 +4225,17 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size), or is undef or is zero.
+static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
+ if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
+ return false;
+ return true;
+}
+
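
Editorial sketch (not part of the patch): the helpers above treat negative mask values as sentinels. A minimal standalone model of that convention, using plain ints as stand-ins for LLVM's shuffle-mask types and assuming the usual sentinel values:

#include <cassert>

// Stand-ins for LLVM's shuffle-mask sentinels (assumed values).
enum : int { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

static bool isUndefOrZero(int Val) {
  return Val == SM_SentinelUndef || Val == SM_SentinelZero;
}

static bool isUndefOrInRange(int Val, int Low, int Hi) {
  // Undef/zero sentinels always match; otherwise Val must lie in [Low, Hi).
  return Val < 0 || (Val >= Low && Val < Hi);
}

int main() {
  // Mask for a 4-element shuffle over two 4-element inputs: lane 1 of the
  // first input, a known-zero lane, an undef lane, lane 0 of the second input.
  int Mask[4] = {1, SM_SentinelZero, SM_SentinelUndef, 4};
  assert(isUndefOrZero(Mask[1]) && isUndefOrZero(Mask[2]));
  assert(isUndefOrInRange(Mask[0], 0, 4) && !isUndefOrInRange(Mask[3], 0, 4));
  return 0;
}
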
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instruction that extract 128 or 256 bit vectors
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
@@ -4399,9 +4350,8 @@ bool X86::isZeroNode(SDValue Elt) {
// Build a vector of constants
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
-static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
- SelectionDAG &DAG,
- SDLoc dl, bool IsMask = false) {
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
+ const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4424,63 +4374,40 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
- SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
+ SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
/// Returns a vector of specified type with all zero elements.
-static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
- assert(VT.isVector() && "Expected a vector type");
-
- // Always build SSE zero vectors as <4 x i32> bitcasted
- // to their dest type. This ensures they get CSE'd.
+static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
+ VT.getVectorElementType() == MVT::i1) &&
+ "Unexpected vector type");
+
+ // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd. But if the integer type is not
+ // available, use a floating-point +0.0 instead.
SDValue Vec;
- if (VT.is128BitVector()) { // SSE
- if (Subtarget->hasSSE2()) { // SSE2
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- } else { // SSE1
- SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
- }
- } else if (VT.is256BitVector()) { // AVX
- if (Subtarget->hasInt256()) { // AVX2
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
- } else {
- // 256-bit logic and arithmetic instructions in AVX are all
- // floating-point, no support for integer ops. Emit fp zeroed vectors.
- SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
- }
- } else if (VT.is512BitVector()) { // AVX-512
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
+ Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.getVectorElementType() == MVT::i1) {
-
- assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
- && "Unexpected vector type");
- assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
- && "Unexpected vector type");
- SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
- SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- } else
- llvm_unreachable("Unexpected vector type");
-
+ assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
+ "Unexpected vector type");
+ assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
+ "Unexpected vector type");
+ Vec = DAG.getConstant(0, dl, VT);
+ } else {
+ unsigned Num32BitElts = VT.getSizeInBits() / 32;
+ Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
+ }
return DAG.getBitcast(VT, Vec);
}
-static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl,
- unsigned vectorWidth) {
+static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
EVT VT = Vec.getValueType();
@@ -4490,7 +4417,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return DAG.getUNDEF(ResultVT);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
@@ -4503,8 +4430,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -4516,27 +4443,27 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+ return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+ return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
-static SDValue InsertSubVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl, unsigned vectorWidth) {
+static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl,
+ unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting UNDEF is Result
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
@@ -4560,8 +4487,8 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -4570,7 +4497,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
// extend the subvector to the size of the result vector. Make sure that
// we are not recursing on that node by checking for undef here.
if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- Result.getOpcode() != ISD::UNDEF) {
+ !Result.isUndef()) {
EVT ResultVT = Result.getValueType();
SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
SDValue Undef = DAG.getUNDEF(ResultVT);
@@ -4607,17 +4534,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return DAG.getBitcast(ResultVT, Vec256);
}
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Insert i1-subvector to i1-vector.
-static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
+static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
@@ -4647,43 +4575,71 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
+  // Extend to a natively supported kshift width.
+ MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ MVT WideOpVT = OpVT;
+ if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
+ WideOpVT = MinVT;
+
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(OpVT);
- SDValue WideSubVec =
- DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
- if (Vec.isUndef())
- return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ SDValue Undef = DAG.getUNDEF(WideOpVT);
+ SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, SubVec, ZeroIdx);
+
+  // Extract the sub-vector if required.
+ auto ExtractSubVec = [&](SDValue V) {
+ return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+ OpVT, V, ZeroIdx);
+ };
+
+ if (Vec.isUndef()) {
+ if (IdxVal != 0) {
+ SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ }
+ return ExtractSubVec(WideSubVec);
+ }
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
- DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
+ return ExtractSubVec(Vec);
}
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- // Merge them together
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+    // Merge them together; SubVec should be zero extended.
+ WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ SubVec, ZeroIdx);
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+ return ExtractSubVec(Vec);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+ return ExtractSubVec(Vec);
}
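
Editorial sketch (not part of the patch): the shift-and-OR insertion above is easier to follow on a plain integer bitmask. The model below treats a v16i1 mask register as a uint16_t; where the patch clears the destination lanes with kshift pairs (VSHLI/VSRLI), the sketch uses an explicit lane mask, which is equivalent on an integer:

#include <cstdint>
#include <cassert>

// Insert the low SubBits bits of Sub at bit position Idx of Vec, zeroing the
// destination lanes first -- the same effect as the shift/OR sequence above.
static uint16_t insertSubMask(uint16_t Vec, uint16_t Sub, unsigned Idx,
                              unsigned SubBits) {
  uint16_t LaneMask = (uint16_t)(((1u << SubBits) - 1) << Idx);
  uint16_t Widened = (uint16_t)((Sub & ((1u << SubBits) - 1)) << Idx);
  return (uint16_t)((Vec & ~LaneMask) | Widened);
}

int main() {
  // Insert a 2-lane submask 0b11 at lane 2 of 0x00F1, giving 0x00FD.
  assert(insertSubMask(0x00F1, 0x3, 2, 2) == 0x00FD);
  return 0;
}
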
// Subvector should be inserted in the middle - use shuffle
+ WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
SmallVector<int, 64> Mask;
for (unsigned i = 0; i < NumElems; ++i)
Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
@@ -4695,103 +4651,206 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
-static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+ const SDLoc &dl) {
+ SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}
-static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+ const SDLoc &dl) {
+ SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
- assert(VT.isVector() && "Expected a vector type");
+static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected a 128/256/512-bit vector type");
- SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
+ APInt Ones = APInt::getAllOnesValue(32);
+ unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec;
- if (VT.is512BitVector()) {
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
- } else if (VT.is256BitVector()) {
- if (Subtarget->hasInt256()) { // AVX2
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
- } else { // AVX
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- }
- } else if (VT.is128BitVector()) {
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- } else
- llvm_unreachable("Unexpected vector type");
-
+ if (!Subtarget.hasInt256() && NumElts == 8) {
+ Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
+ Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+ } else {
+ Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+ }
return DAG.getBitcast(VT, Vec);
}
/// Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
- SDValue V2) {
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ assert(VT.is128BitVector() && "Expected a 128-bit vector type");
unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
+ SmallVector<int, 8> Mask(NumElems);
for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
- Mask.push_back(i);
- Mask.push_back(i + NumElems);
+ Mask[i * 2] = i;
+ Mask[i * 2 + 1] = i + NumElems;
}
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
- SDValue V2) {
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ assert(VT.is128BitVector() && "Expected a 128-bit vector type");
unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
+ SmallVector<int, 8> Mask(NumElems);
for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
- Mask.push_back(i + Half);
- Mask.push_back(i + NumElems + Half);
+ Mask[i * 2] = i + Half;
+ Mask[i * 2 + 1] = i + NumElems + Half;
}
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
-static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 16> MaskVec;
- for (unsigned i = 0; i != NumElems; ++i)
+ int NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec(NumElems);
+ for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
- MaskVec.push_back(i == Idx ? NumElems : i);
- return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
+ MaskVec[i] = (i == Idx) ? NumElems : i;
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
+}
+
+static SDValue peekThroughBitcasts(SDValue V) {
+ while (V.getNode() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ return V;
+}
+
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+ unsigned MaskEltSizeInBits,
+ SmallVectorImpl<uint64_t> &RawMask) {
+ MaskNode = peekThroughBitcasts(MaskNode);
+
+ MVT VT = MaskNode.getSimpleValueType();
+ assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+
+ // Split an APInt element into MaskEltSizeInBits sized pieces and
+ // insert into the shuffle mask.
+ auto SplitElementToMask = [&](APInt Element) {
+ // Note that this is x86 and so always little endian: the low byte is
+ // the first byte of the mask.
+ int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+ for (int i = 0; i < Split; ++i) {
+ APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
+ Element = Element.lshr(MaskEltSizeInBits);
+ RawMask.push_back(RawElt.getZExtValue());
+ }
+ };
+
+ if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
+ if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
+ return false;
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
+ const APInt &MaskElement = CN->getAPIntValue();
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
+ RawMask.push_back(RawElt.getZExtValue());
+ }
+ }
+ return false;
+ }
+
+ if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
+ MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
+
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ return false;
+ unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+
+ SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
+ SplitElementToMask(CN->getAPIntValue());
+ RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+ return true;
+ }
+ return false;
+ }
+
+ if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // We can always decode if the buildvector is all zero constants,
+ // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
+ if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
+ RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
+ return true;
+ }
+
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ return false;
+
+ for (SDValue Op : MaskNode->ops()) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
+ SplitElementToMask(CN->getAPIntValue());
+ else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
+ SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
+ else
+ return false;
+ }
+
+ return true;
+}
+
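
Editorial sketch (not part of the patch): SplitElementToMask above breaks a wide constant into MaskEltSizeInBits-sized pieces, low piece first, since x86 is little endian. A plain-integer stand-in for the APInt logic:

#include <cstdint>
#include <vector>
#include <cassert>

// Split a scalar constant into EltBits-sized mask elements, low piece first.
static std::vector<uint64_t> splitElement(uint64_t Element, unsigned EltBits,
                                          unsigned ScalarBits) {
  std::vector<uint64_t> Raw;
  for (unsigned i = 0; i != ScalarBits / EltBits; ++i) {
    Raw.push_back(Element & ((1ULL << EltBits) - 1));
    Element >>= EltBits;
  }
  return Raw;
}

int main() {
  // A 32-bit scalar viewed as four 8-bit mask elements, low byte first.
  assert((splitElement(0x80402010ULL, 8, 32) ==
          std::vector<uint64_t>{0x10, 0x20, 0x40, 0x80}));
  return 0;
}
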
+static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
+ MaskNode = peekThroughBitcasts(MaskNode);
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return nullptr;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return nullptr;
+
+ return dyn_cast<Constant>(MaskCP->getConstVal());
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
-/// Returns true if the Mask could be calculated. Sets IsUnary to true if only
-/// uses one source. Note that this will set IsUnary for shuffles which use a
-/// single input multiple times, and in those cases it will
-/// adjust the mask to only have indices within that single input.
+/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
+/// operands in \p Ops, and returns true.
+/// Sets \p IsUnary to true if only one source is used. Note that this will set
+/// IsUnary for shuffles which use a single input multiple times, and in those
+/// cases it will adjust the mask to only have indices within that single input.
+/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+ SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
+ assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
+ assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
+
IsUnary = false;
bool IsFakeUnary = false;
switch(N->getOpcode()) {
@@ -4826,9 +4885,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
+ case X86ISD::VSHLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VSRLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -4845,70 +4917,51 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
+ case X86ISD::VZEXT_MOVL:
+ DecodeZeroMoveLowMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VBROADCAST: {
+ // We only decode broadcasts of same-sized vectors at the moment.
+ if (N->getOperand(0).getValueType() == VT) {
+ DecodeVectorBroadcast(VT, Mask);
+ IsUnary = true;
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMILPV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMILPMask(VT, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPERMILPMask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
case X86ISD::PSHUFB: {
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(0);
-
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() &&
- "Can't produce a non-vector with a build_vector!");
- if (!VT.isInteger())
- return false;
-
- int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
-
- SmallVector<uint64_t, 32> RawMask;
- for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF) {
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- continue;
- }
- auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
- if (!CN)
- return false;
- APInt MaskElement = CN->getAPIntValue();
-
- // We now have to decode the element which could be any integer size and
- // extract each byte of it.
- for (int j = 0; j < NumBytesPerElement; ++j) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
- MaskElement = MaskElement.lshr(8);
- }
- }
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodePSHUFBMask(RawMask, Mask);
break;
}
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodePSHUFBMask(C, Mask);
break;
}
-
return false;
}
case X86ISD::VPERMI:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -4937,110 +4990,63 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::MOVLPS:
// Not yet implemented
return false;
+ case X86ISD::VPERMIL2: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SDValue MaskNode = N->getOperand(2);
+ SDValue CtrlNode = N->getOperand(3);
+ if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
+ unsigned CtrlImm = CtrlOp->getZExtValue();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+ break;
+ }
+ }
+ return false;
+ }
+ case X86ISD::VPPERM: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ SDValue MaskNode = N->getOperand(2);
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+ DecodeVPPERMMask(RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPPERMMask(C, Mask);
+ break;
+ }
+ return false;
+ }
case X86ISD::VPERMV: {
IsUnary = true;
+ // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
+ Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(0);
-
- unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
SmallVector<uint64_t, 32> RawMask;
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- assert(MaskNode.getSimpleValueType().isInteger() &&
- MaskNode.getSimpleValueType().getVectorNumElements() ==
- VT.getVectorNumElements());
-
- for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF)
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- else if (isa<ConstantSDNode>(Op)) {
- APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
- RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
- } else
- return false;
- }
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMVMask(RawMask, Mask);
break;
}
- if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
- unsigned NumEltsInMask = MaskNode->getNumOperands();
- MaskNode = MaskNode->getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
- APInt MaskEltValue = CN->getAPIntValue();
- for (unsigned i = 0; i < NumEltsInMask; ++i)
- RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
- DecodeVPERMVMask(RawMask, Mask);
- break;
- }
- // It may be a scalar load
- }
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodeVPERMVMask(C, VT, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
- IsUnary = false;
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
+ Ops.push_back(N->getOperand(0));
+ Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(1);
-
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- assert(MaskNode.getSimpleValueType().isInteger() &&
- MaskNode.getSimpleValueType().getVectorNumElements() ==
- VT.getVectorNumElements());
-
- SmallVector<uint64_t, 32> RawMask;
- unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
-
- for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF)
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- else {
- auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
- if (!CN)
- return false;
- APInt MaskElement = CN->getAPIntValue();
- RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
- }
- }
- DecodeVPERMV3Mask(RawMask, Mask);
- break;
- }
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodeVPERMV3Mask(C, VT, Mask);
break;
}
@@ -5055,8 +5061,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
- if (std::any_of(Mask.begin(), Mask.end(),
- [](int M){ return M == SM_SentinelZero; }))
+ if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
@@ -5067,6 +5072,123 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
if (M >= (int)Mask.size())
M -= Mask.size();
+ // If we didn't already add operands in the opcode-specific code, default to
+ // adding 1 or 2 operands starting at 0.
+ if (Ops.empty()) {
+ Ops.push_back(N->getOperand(0));
+ if (!IsUnary || IsFakeUnary)
+ Ops.push_back(N->getOperand(1));
+ }
+
+ return true;
+}
+
+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that are known to be zero
+/// (not just zeroable) from their inputs.
+/// Returns true if the target shuffle mask was decoded.
+static bool setTargetShuffleZeroElements(SDValue N,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops) {
+ bool IsUnary;
+ if (!isTargetShuffle(N.getOpcode()))
+ return false;
+ if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
+ Mask, IsUnary))
+ return false;
+
+ SDValue V1 = Ops[0];
+ SDValue V2 = IsUnary ? V1 : Ops[1];
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+
+ // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+ if (M < 0)
+ continue;
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // We are referencing an UNDEF input.
+ if (V.isUndef()) {
+ Mask[i] = SM_SentinelUndef;
+ continue;
+ }
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+    // If the BUILD_VECTOR has fewer elements than the mask, the (larger) source
+ // element must be UNDEF/ZERO.
+ // TODO: Is it worth testing the individual bits of a constant?
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ Mask[i] = SM_SentinelUndef;
+ else if (X86::isZeroNode(Op))
+ Mask[i] = SM_SentinelZero;
+ continue;
+ }
+
+    // If the BUILD_VECTOR has more elements than the mask, all the (smaller) source
+ // elements must be all UNDEF or all ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ Mask[i] = SM_SentinelUndef;
+ else if (AllZero)
+ Mask[i] = SM_SentinelZero;
+ continue;
+ }
+ }
+
+ return true;
+}
+
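
Editorial note (not part of the patch): the two scaling branches above map a shuffle-mask index onto a BUILD_VECTOR whose element count differs from the mask size. A rough standalone model of that index arithmetic, assuming the element counts divide evenly as the code requires:

#include <cassert>

// The mask has Size elements; the BUILD_VECTOR feeding it has NumOps operands.
// Compute the range [Lo, Hi) of BUILD_VECTOR operands that mask index M reads:
// a single operand when the source elements are wider, several when narrower.
static void sourceOperandRange(int Size, int NumOps, int M, int &Lo, int &Hi) {
  if (Size % NumOps == 0) { // wider source elements
    int Scale = Size / NumOps;
    Lo = M / Scale;
    Hi = Lo + 1;
  } else {                  // narrower source elements
    assert(NumOps % Size == 0);
    int Scale = NumOps / Size;
    Lo = M * Scale;
    Hi = Lo + Scale;
  }
}

int main() {
  int Lo, Hi;
  // A 16-element byte mask index 5 over a 4-operand build_vector: operand 1.
  sourceOperandRange(16, 4, 5, Lo, Hi);
  assert(Lo == 1 && Hi == 2);
  // A 4-element mask index 1 over a 16-operand build_vector: operands [4, 8).
  sourceOperandRange(4, 16, 1, Lo, Hi);
  assert(Lo == 4 && Hi == 8);
  return 0;
}
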
+/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
+/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
+/// remaining input indices in case we now have a unary shuffle and adjust the
+/// Op0/Op1 inputs accordingly.
+/// Returns true if the target shuffle mask was decoded.
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+ SmallVectorImpl<int> &Mask) {
+ SmallVector<SDValue, 2> Ops;
+ if (!setTargetShuffleZeroElements(Op, Mask, Ops))
+ return false;
+
+ int NumElts = Mask.size();
+ bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
+ return 0 <= Idx && Idx < NumElts;
+ });
+ bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
+ [NumElts](int Idx) { return NumElts <= Idx; });
+
+ Op0 = Op0InUse ? Ops[0] : SDValue();
+ Op1 = Op1InUse ? Ops[1] : SDValue();
+
+ // We're only using Op1 - commute the mask and inputs.
+ if (!Op0InUse && Op1InUse) {
+ for (int &M : Mask)
+ if (NumElts <= M)
+ M -= NumElts;
+ Op0 = Op1;
+ Op1 = SDValue();
+ }
+
return true;
}
@@ -5097,19 +5219,24 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = V.getSimpleValueType();
+ MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
- if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
+ if (Elt == SM_SentinelZero)
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
if (Elt == SM_SentinelUndef)
- return DAG.getUNDEF(ShufVT.getVectorElementType());
+ return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
- SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
@@ -5138,7 +5265,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget* Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 8)
return SDValue();
@@ -5148,7 +5275,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
bool First = true;
// SSE4.1 - use PINSRB to insert each byte directly.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
if (isNonZero) {
@@ -5208,7 +5335,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget* Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 4)
return SDValue();
@@ -5237,13 +5364,13 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
- Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+ Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
@@ -5296,12 +5423,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
// Let the shuffle legalizer deal with blend operations.
SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
- V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
- return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
@@ -5326,9 +5453,9 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
- V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
- V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
unsigned ZMask = Zeroable.to_ulong();
@@ -5342,11 +5469,11 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
}
/// Return a vector logical shift node.
-static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
- unsigned NumBits, SelectionDAG &DAG,
- const TargetLowering &TLI, SDLoc dl) {
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
+ SelectionDAG &DAG, const TargetLowering &TLI,
+ const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- MVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
@@ -5355,8 +5482,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
-static SDValue
-LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
+ SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load. And if
// the address is "base + cst" see if the cst can be "absorbed" into
@@ -5418,12 +5545,11 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(StartOffset),
- false, false, false, 0);
+ LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
- return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
@@ -5433,55 +5559,103 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
-/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
-///
-/// FIXME: we'd also like to handle the case where the last elements are zero
-/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
-/// There's even a handy isZeroNode for that purpose.
+/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
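// Illustrative trace of the example above, assuming VT == MVT::v4i32: the
// element scan builds LoadMask = {1,1,0,0}, ZeroMask = {0,0,1,0} and
// UndefMask = {0,0,0,1}, giving FirstLoadedElt = 0 and LastLoadedElt = 1.
// The two loads are consecutive and span 64 bits, so the VZEXT_LOAD path
// below emits an X86ISD::VZEXT_LOAD of i64 into v2i64 and bitcasts the
// result back to v4i32.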
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
- LoadSDNode *LDBase = nullptr;
- unsigned LastLoadedElt = -1U;
+ int LastLoadedElt = -1;
+ SmallBitVector LoadMask(NumElems, false);
+ SmallBitVector ZeroMask(NumElems, false);
+ SmallBitVector UndefMask(NumElems, false);
- // For each element in the initializer, see if we've found a load or an undef.
- // If we don't find an initial load element, or later load elements are
- // non-consecutive, bail out.
+ // For each element in the initializer, see if we've found a load, zero or an
+ // undef.
for (unsigned i = 0; i < NumElems; ++i) {
- SDValue Elt = Elts[i];
- // Look through a bitcast.
- if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
- Elt = Elt.getOperand(0);
- if (!Elt.getNode() ||
- (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (!Elt.getNode())
return SDValue();
- if (!LDBase) {
- if (Elt.getNode()->getOpcode() == ISD::UNDEF)
- return SDValue();
- LDBase = cast<LoadSDNode>(Elt.getNode());
- LastLoadedElt = i;
- continue;
- }
- if (Elt.getOpcode() == ISD::UNDEF)
- continue;
- LoadSDNode *LD = cast<LoadSDNode>(Elt);
- EVT LdVT = Elt.getValueType();
- // Each loaded element must be the correct fractional portion of the
- // requested vector load.
- if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
- return SDValue();
- if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
+ if (Elt.isUndef())
+ UndefMask[i] = true;
+ else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
+ ZeroMask[i] = true;
+ else if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ LoadMask[i] = true;
+ LastLoadedElt = i;
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
+ } else
return SDValue();
- LastLoadedElt = i;
}
+ assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ "Incomplete element masks");
+
+ // Handle Special Cases - all undef or undef/zero.
+ if (UndefMask.count() == NumElems)
+ return DAG.getUNDEF(VT);
+
+ // FIXME: Should we return this as a BUILD_VECTOR instead?
+ if ((ZeroMask | UndefMask).count() == NumElems)
+ return VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ int FirstLoadedElt = LoadMask.find_first();
+ SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
+ LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
+ EVT LDBaseVT = EltBase.getValueType();
+
+ // Consecutive loads can contain UNDEFs but not ZERO elements.
+ // Consecutive loads with UNDEFs and ZERO elements require an
+ // additional shuffle stage to clear the ZERO elements.
+ bool IsConsecutiveLoad = true;
+ bool IsConsecutiveLoadWithZeros = true;
+ for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
+ if (LoadMask[i]) {
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ if (!DAG.areNonVolatileConsecutiveLoads(
+ LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
+ i - FirstLoadedElt)) {
+ IsConsecutiveLoad = false;
+ IsConsecutiveLoadWithZeros = false;
+ break;
+ }
+ } else if (ZeroMask[i]) {
+ IsConsecutiveLoad = false;
+ }
+ }
+
+ auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+ auto MMOFlags = LDBase->getMemOperand()->getFlags();
+ assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
+ "Cannot merge volatile loads.");
+ SDValue NewLd =
+ DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
+ };
+ // LOAD - all consecutive load/undefs (must start/end with a load).
// If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer. If we found
- // consecutive loads for the low half, generate a vzext_load node.
- if (LastLoadedElt == NumElems - 1) {
+ // load of the entire vector width starting at the base pointer.
+ // If the vector contains zeros, then attempt to shuffle those elements.
+ if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
assert(LDBase && "Did not find base load for merging consecutive loads");
EVT EltVT = LDBase->getValueType(0);
// Ensure that the input vector size for the merged loads matches the
@@ -5489,72 +5663,93 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
return SDValue();
- if (isAfterLegalize &&
- !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+ if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
- SDValue NewLd = SDValue();
+ if (IsConsecutiveLoad)
+ return CreateLoad(VT, LDBase);
+
+ // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
+ // vector and a zero vector to clear out the zero elements.
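// Worked example (illustrative, assuming VT == MVT::v4i32 with elements
// <load a, zero, load a+8, load a+12>): LoadMask = {1,0,1,1} and
// ZeroMask = {0,1,0,0}, so IsConsecutiveLoadWithZeros holds while
// IsConsecutiveLoad does not. ClearMask becomes {0, 5, 2, 3}, i.e. lane 1
// is taken from the zero vector after the full-width load is emitted.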
+ if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ SmallVector<int, 4> ClearMask(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (ZeroMask[i])
+ ClearMask[i] = i + NumElems;
+ else if (LoadMask[i])
+ ClearMask[i] = i;
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
+ }
+ }
+
+ int LoadSize =
+ (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+
+ // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+ if (TLI.isTypeLegal(VecVT)) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
- NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(), LDBase->isVolatile(),
- LDBase->isNonTemporal(), LDBase->isInvariant(),
- LDBase->getAlignment());
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+ // and update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
+ return DAG.getBitcast(VT, ResNode);
}
-
- return NewLd;
}
- //TODO: The code below fires only for for loading the low v2i32 / v2f32
- //of a v4i32 / v4f32. It's probably worth generalizing.
- EVT EltVT = VT.getVectorElementType();
- if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
- DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
- SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
- SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
- LDBase->getPointerInfo(),
- LDBase->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as LDBase in
- // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
- // update uses of LDBase's output chain to use the TokenFactor.
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
+ // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
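// For instance (illustrative, assuming VT == MVT::v4f32), the build_vector
// <load a, zero, zero, undef> has a single 32-bit load at element 0, so
// LoadSize == 32: the scalar load is reused directly, placed into a vector
// with SCALAR_TO_VECTOR, and VZEXT_MOVL then zeroes the upper elements.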
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
+ if (TLI.isTypeLegal(VecVT)) {
+ SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
+ : DAG.getBitcast(VecSVT, EltBase);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+ return DAG.getBitcast(VT, V);
}
-
- return DAG.getBitcast(VT, ResNode);
}
+
return SDValue();
}
-/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
-/// to generate a splat value for the following cases:
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
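// Illustrative example: with AVX, a v8f32 splat BUILD_VECTOR whose operands
// are all the same scalar f32 load can become a single X86ISD::VBROADCAST of
// that load, while a constant splat may instead be materialized as a
// constant-pool load that is then broadcast (see the AVX2/OptForSize
// handling below).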
-static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
MVT VT = Op.getSimpleValueType();
@@ -5604,12 +5799,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
Sc.getOpcode() != ISD::BUILD_VECTOR) {
- if (!Subtarget->hasInt256())
+ if (!Subtarget.hasInt256())
return SDValue();
// Use the register form of the broadcast instruction available on AVX2.
if (VT.getSizeInBits() >= 256)
- Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ Sc = extract128BitVector(Sc, 0, DAG, dl);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
}
@@ -5622,7 +5817,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// Constants may have multiple users.
// AVX-512 has register version of the broadcast
- bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+ bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
Ld.getValueType().getSizeInBits() >= 32;
if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
!hasRegVer))
@@ -5647,7 +5842,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
- if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
+ if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
@@ -5656,7 +5851,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
+ (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -5671,8 +5866,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, Alignment);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5681,7 +5876,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
- if (!IsLoad && Subtarget->hasInt256() &&
+ if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
@@ -5690,12 +5885,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (Subtarget->hasVLX() && ScalarSize == 64))
+ (Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
- if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5801,7 +5996,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
- SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
unsigned Idx = InsertIndices[i];
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
@@ -5818,7 +6013,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
uint64_t Immediate = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
- if (In.getOpcode() != ISD::UNDEF)
+ if (!In.isUndef())
Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
}
SDLoc dl(Op);
@@ -5835,17 +6030,11 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
- if (ISD::isBuildVectorAllZeros(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- }
+ if (ISD::isBuildVectorAllZeros(Op.getNode()))
+ return DAG.getTargetConstant(0, dl, VT);
- if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- }
+ if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ return DAG.getTargetConstant(1, dl, VT);
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
@@ -5864,7 +6053,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
- if (In.getOpcode() == ISD::UNDEF)
+ if (In.isUndef())
continue;
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
@@ -5872,7 +6061,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
HasConstElts = true;
}
- if (SplatIdx == -1)
+ if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
@@ -5903,7 +6092,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0, dl));
}
- for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
+ for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
@@ -5948,7 +6137,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
- if (Op->getOpcode() == ISD::UNDEF) {
+ if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
@@ -5978,13 +6167,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
- if (V0.getOpcode() == ISD::UNDEF) {
+ if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
- if (V1.getOpcode() == ISD::UNDEF) {
+ if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
@@ -6041,37 +6230,35 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
- EVT VT = V0.getValueType();
- assert(VT.is256BitVector() && VT == V1.getValueType() &&
+ MVT VT = V0.getSimpleValueType();
+ assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
- SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
- SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
- SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
- SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
- EVT NewVT = V0_LO.getValueType();
+ SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
+ MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
- if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+ if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
- if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+ if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
- if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
- V1_LO->getOpcode() != ISD::UNDEF))
+ if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
- if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
- V1_HI->getOpcode() != ISD::UNDEF))
+ if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
@@ -6081,10 +6268,10 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT VT = BV->getSimpleValueType(0);
- if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
return SDValue();
SDLoc DL(BV);
@@ -6142,12 +6329,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
SubFound = true;
// Update InVec0 and InVec1.
- if (InVec0.getOpcode() == ISD::UNDEF) {
+ if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return SDValue();
}
- if (InVec1.getOpcode() == ISD::UNDEF) {
+ if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return SDValue();
@@ -6174,8 +6361,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
- InVec1.getOpcode() != ISD::UNDEF)
+ if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
return SDValue();
@@ -6183,7 +6369,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = BV->getSimpleValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -6193,11 +6379,11 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
// Count the number of UNDEF operands in the build_vector in input.
for (unsigned i = 0, e = Half; i != e; ++i)
- if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
- if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
// Early exit if this is either a build_vector of all UNDEFs or all the
@@ -6207,14 +6393,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
SDLoc DL(BV);
SDValue InVec0, InVec1;
- if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
- } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
// Try to match an SSSE3 integer HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
@@ -6223,7 +6409,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
}
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
@@ -6232,18 +6418,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
SDValue InVec2, InVec3;
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
// Try to match an AVX2 horizontal add/sub of signed integers.
@@ -6253,17 +6435,13 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
@@ -6271,7 +6449,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (CanFold) {
// Fold this build_vector into a single horizontal add/sub.
// Do this only if the target has AVX2.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
// Do not try to expand this build_vector into a pair of horizontal
@@ -6289,7 +6467,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
}
if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
- VT == MVT::v16i16) && Subtarget->hasAVX()) {
+ VT == MVT::v16i16) && Subtarget.hasAVX()) {
unsigned X86Opcode;
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
@@ -6318,39 +6496,101 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return SDValue();
}
-SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
+/// If a BUILD_VECTOR's source elements all apply the same bit operation and
+/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
+/// just apply the bit operation to the vectors.
+/// NOTE: It's not in our interest to start making a general purpose vectorizer
+/// from this, but enough scalar bit operations are created by the later
+/// legalization + scalarization stages to need basic support.
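// Sketch of the transform (illustrative): a v4i32 BUILD_VECTOR of
// (xor a, 1), (xor b, 1), (xor c, 1), (xor d, 1) becomes
// (xor (build_vector a, b, c, d), (build_vector 1, 1, 1, 1)),
// provided the vector XOR is legal or promotable for v4i32.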
+static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- MVT ExtVT = VT.getVectorElementType();
- unsigned NumElems = Op.getNumOperands();
+ unsigned NumElems = VT.getVectorNumElements();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // Generate vectors for predicate vectors.
- if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
- return LowerBUILD_VECTORvXi1(Op, DAG);
+ // Check that all elements have the same opcode.
+ // TODO: Should we allow UNDEFS and if so how many?
+ unsigned Opcode = Op.getOperand(0).getOpcode();
+ for (unsigned i = 1; i < NumElems; ++i)
+ if (Opcode != Op.getOperand(i).getOpcode())
+ return SDValue();
- // Vectors containing all zeros can be matched by pxor and xorps later
+ // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ switch (Opcode) {
+ default:
+ return SDValue();
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ if (!TLI.isOperationLegalOrPromote(Opcode, VT))
+ return SDValue();
+ break;
+ }
+
+ SmallVector<SDValue, 4> LHSElts, RHSElts;
+ for (SDValue Elt : Op->ops()) {
+ SDValue LHS = Elt.getOperand(0);
+ SDValue RHS = Elt.getOperand(1);
+
+ // We expect the canonicalized RHS operand to be the constant.
+ if (!isa<ConstantSDNode>(RHS))
+ return SDValue();
+ LHSElts.push_back(LHS);
+ RHSElts.push_back(RHS);
+ }
+
+ SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
+ SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+}
+
+/// Create a vector constant without a load. SSE/AVX provide the bare minimum
+/// functionality to do this, so it's all zeros, all ones, or some derivation
+/// that is cheap to calculate.
+static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
- return getZeroVector(VT, Subtarget, DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, DL);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
- if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+ if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- if (!VT.is512BitVector())
- return getOnesVector(VT, Subtarget, DAG, dl);
+ return getOnesVector(VT, Subtarget, DAG, DL);
}
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ MVT ExtVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Generate vectors for predicate vectors.
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
+ if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
+ return VectorConstant;
+
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
return AddSub;
@@ -6358,6 +6598,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return HorizontalOp;
if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
+ return BitOp;
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -6368,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SmallSet<SDValue, 8> Values;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.getOpcode() == ISD::UNDEF)
+ if (Elt.isUndef())
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
@@ -6397,7 +6639,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
- if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+ if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle SSE only.
@@ -6422,7 +6664,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
- (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
@@ -6439,16 +6681,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// it to i32 first.
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- if (VT.is256BitVector()) {
- if (Subtarget->hasAVX()) {
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+ if (VT.getSizeInBits() >= 256) {
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ if (Subtarget.hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
} else {
// Without AVX, we need to extend to a 128-bit vector and then
// insert into the 256-bit vector.
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
+ Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
}
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
@@ -6504,28 +6747,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- // For AVX-length vectors, see if we can use a vector load to get all of the
- // elements, otherwise build the individual 128-bit pieces and use
+ // See if we can use a vector load to get all of the elements.
+ if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ return LD;
+ }
+
+ // For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
-
- // Check for a build vector of consecutive loads.
- if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
- return LD;
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
- makeArrayRef(&V[0], NumElems/2));
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
- makeArrayRef(&V[NumElems / 2], NumElems/2));
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
- return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
- return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -6557,30 +6802,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
- SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
+ SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
- V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
- V[i] = V[i*2]; // Must be a zero vector.
+ Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
- V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
- V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
- V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
@@ -6593,32 +6838,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
- return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
if (Values.size() > 1 && VT.is128BitVector()) {
- // Check for a build vector of consecutive loads.
- for (unsigned i = 0; i < NumElems; ++i)
- V[i] = Op.getOperand(i);
-
- // Check for elements which are consecutive loads.
- if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
- return LD;
-
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Result;
- if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
- if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
@@ -6628,11 +6865,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
+ SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
- if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
- V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ if (!Op.getOperand(i).isUndef())
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
- V[i] = DAG.getUNDEF(VT);
+ Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
@@ -6642,20 +6880,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned EltStride = NumElems >> 1;
while (EltStride != 0) {
for (unsigned i = 0; i < EltStride; ++i) {
- // If V[i+EltStride] is undef and this is the first round of mixing,
+ // If Ops[i+EltStride] is undef and this is the first round of mixing,
// then it is safe to just drop this shuffle: V[i] is already in the
// right place, the one element (since it's the first round) being
// inserted as undef can be dropped. This isn't safe for successive
// rounds because they will permute elements within both vectors.
- if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+ if (Ops[i+EltStride].isUndef() &&
EltStride == NumElems/2)
continue;
- V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
}
EltStride >>= 1;
}
- return V[0];
+ return Ops[0];
}
return SDValue();
}
@@ -6673,21 +6911,23 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
if (ResVT.is256BitVector())
- return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
SDValue V3 = Op.getOperand(2);
SDValue V4 = Op.getOperand(3);
- return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
- Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
+ return concat256BitVectors(
+ concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
+ concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
+ NumElems, DAG, dl);
}
- return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
@@ -6764,7 +7004,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
@@ -6800,24 +7040,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] != -1 && Mask[i] != i)
- return false;
- return true;
-}
-
-/// \brief Helper function to classify a mask as a single-input mask.
-///
-/// This isn't a generic single-input test because in the vector shuffle
-/// lowering we canonicalize single inputs to be the first input operand. This
-/// means we can more quickly test for a single input by only checking whether
-/// an input from the second operand exists. We also assume that the size of
-/// mask corresponds to the size of the input vectors which isn't true in the
-/// fully general case.
-static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
- for (int M : Mask)
- if (M >= (int)Mask.size())
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] >= 0 && Mask[i] != i)
return false;
+ }
return true;
}
@@ -6835,22 +7062,22 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
-/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
-/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
-/// *not* suitable for use with existing 128-bit shuffles as it will contain
-/// entries from both V1 and V2 inputs to the wider mask.
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
- SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
- RepeatedMask.resize(LaneSize, -1);
+/// suitable for use with existing 128-bit shuffles as entries from the second
+/// vector have been remapped to [LaneSize, 2*LaneSize).
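// For example (illustrative): the v8i32 mask <0,9,2,11,4,13,6,15> repeats
// across its two 128-bit lanes (LaneSize == 4) and yields RepeatedMask =
// <0,5,2,7>, where 5 and 7 refer to elements of the second vector.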
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
@@ -6860,17 +7087,55 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
- if (RepeatedMask[i % LaneSize] == -1)
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
+ : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
- RepeatedMask[i % LaneSize] =
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
- else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
+/// Test whether a shuffle mask is equivalent within each 128-bit lane.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a shuffle mask is equivalent within each 256-bit lane.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
+}
+
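// Widen a shuffle mask to a finer element granularity; e.g. (illustrative)
// Scale == 2 turns <1,-1,3,-1> into <2,3,-1,-1,6,7,-1,-1>, with sentinel
// values simply repeated for each scaled element.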
+static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &ScaledMask) {
+ assert(0 < Scale && "Unexpected scaling factor");
+ int NumElts = Mask.size();
+ ScaledMask.assign(NumElts * Scale, -1);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+
+ // Repeat sentinel values in every mask element.
+ if (M < 0) {
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = M;
+ continue;
+ }
+
+ // Scale mask element and increment across each mask element.
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+ }
+}
+
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -6893,8 +7158,9 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
- for (int i = 0; i < Size; ++i)
- if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
@@ -6902,6 +7168,32 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
+}
+
+ return true;
+}
+
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in both.
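// Illustrative examples: <-1, 2, SM_SentinelZero, 3> matches an expected mask
// of <0, 2, SM_SentinelZero, 3> because the undef element is ignored, whereas
// <SM_SentinelZero, 1> does not match <0, 1> since a zero sentinel must be
// present in both masks.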
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
+ return false;
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+ return false;
+ else if (Mask[i] != ExpectedMask[i])
+ return false;
return true;
}
@@ -6914,8 +7206,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
- SelectionDAG &DAG) {
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
@@ -6923,11 +7214,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
- Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
- Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
- Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
- Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
- return DAG.getConstant(Imm, DL, MVT::i8);
+ Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
+ return Imm;
+}
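// Worked example (illustrative): Mask <3,1,2,0> encodes as
// 3 | (1 << 2) | (2 << 4) | (0 << 6) == 0x27, while an all-undef mask
// falls back to the identity immediate 0xE4 (<0,1,2,3>).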
+
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+ SelectionDAG &DAG) {
+ return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// \brief Compute whether each element of a shuffle is zeroable.
@@ -6941,15 +7237,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
SmallBitVector Zeroable(Mask.size(), false);
-
- while (V1.getOpcode() == ISD::BITCAST)
- V1 = V1->getOperand(0);
- while (V2.getOpcode() == ISD::BITCAST)
- V2 = V2->getOperand(0);
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ int VectorSizeInBits = V1.getValueType().getSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Mask.size();
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
@@ -6958,38 +7255,119 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
continue;
}
- // If this is an index into a build_vector node (which has the same number
- // of elements), dig out the input value and use it.
+ // Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
- SDValue Input = V.getOperand(M % Size);
- // The UNDEF opcode check really should be dead code here, but not quite
- // worth asserting on (it isn't invalid, just unexpected).
- if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
- Zeroable[i] = true;
+ // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+ // the (larger) source element must be UNDEF/ZERO.
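// Worked example (illustrative): for a v4i32 shuffle whose input is a bitcast
// of a v2i64 build_vector <0, 0x00000001FFFFFFFF>, Scale == 2; mask elements
// 0 and 1 read the i64 zero and are zeroable, while element 2 reads the low
// 32 bits (0xFFFFFFFF) and element 3 the high 32 bits (0x1), so neither is
// zeroable.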
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef() || X86::isZeroNode(Op))
+ Zeroable[i] = true;
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ }
+ continue;
+ }
+
+ // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllZeroable = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
+ }
+ Zeroable[i] = AllZeroable;
+ continue;
+ }
}
return Zeroable;
}
+/// Try to lower a shuffle with a single PSHUFB of V1.
+/// This is only possible if V2 is unused (at all, or only for zero elements).
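// Byte-mask construction sketch (illustrative): for VT == MVT::v8i16, a mask
// element M == 3 in word i expands to byte selectors {6, 7} at bytes
// {2*i, 2*i+1}; a zeroable word becomes {0x80, 0x80} and an undef word
// becomes undef bytes.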
+static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ const int NumBytes = VT.getSizeInBits() / 8;
+ const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+ assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+ (Subtarget.hasAVX2() && VT.is256BitVector()) ||
+ (Subtarget.hasBWI() && VT.is512BitVector()));
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
+ // Sign bit set in i8 mask means zero element.
+ SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / NumEltBytes];
+ if (M < 0) {
+ PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+ if (Zeroable[i / NumEltBytes]) {
+ PSHUFBMask[i] = ZeroMask;
+ continue;
+ }
+ // Only allow V1.
+ if (M >= Size)
+ return SDValue();
+
+ // PSHUFB can't cross lanes; ensure this doesn't happen.
+ if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
+ return SDValue();
+
+ M = M % LaneSize;
+ M = M * NumEltBytes + (i % NumEltBytes);
+ PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
+ }
+
+ MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
+ DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
-static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
- SmallVector<int, 8> Unpckl;
- SmallVector<int, 8> Unpckh;
+ SmallVector<int, 8> Unpckl(NumElts);
+ SmallVector<int, 8> Unpckh(NumElts);
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
int HiPos = LoPos + NumEltsInLane / 2;
- Unpckl.push_back(LoPos);
- Unpckh.push_back(HiPos);
+ Unpckl[i] = LoPos;
+ Unpckh[i] = HiPos;
}
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
@@ -7013,7 +7391,7 @@ static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
MVT EltVT = VT.getVectorElementType();
@@ -7044,7 +7422,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
- SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
V = DAG.getNode(VT.isFloatingPoint()
? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
DL, VT, V, VMask);
@@ -7056,7 +7434,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
-static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
@@ -7067,12 +7445,12 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
- SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
// We have to cast V2 around.
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
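
The bit-blend fallback reduces to plain mask arithmetic: build an all-ones/all-zeros mask per element, AND it with V1, AND its complement with V2, and OR the results. A minimal scalar model over 64-bit lanes, for illustration only (the DAG code performs the complement side with the X86 AND-NOT pattern on vector types):

#include <cstdint>
#include <vector>

// Standalone sketch of a bit blend: Select[i] == true takes the element from
// A, otherwise from B. The vector lowering does the same with AND/ANDN/OR.
std::vector<uint64_t> bitBlend(const std::vector<uint64_t> &A,
                               const std::vector<uint64_t> &B,
                               const std::vector<bool> &Select) {
  std::vector<uint64_t> R(A.size());
  for (size_t i = 0; i != A.size(); ++i) {
    uint64_t M = Select[i] ? ~uint64_t(0) : 0; // All-ones or all-zeros mask.
    R[i] = (A[i] & M) | (B[i] & ~M);
  }
  return R;
}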
@@ -7088,9 +7466,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
@@ -7153,13 +7531,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
case MVT::v4i64:
case MVT::v8i32:
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
// FALLTHROUGH
case MVT::v2i64:
case MVT::v4i32:
// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
// that instruction.
- if (Subtarget->hasAVX2()) {
+ if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
@@ -7184,14 +7562,14 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
}
case MVT::v16i16: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
- if (RepeatedMask[i] >= 16)
+ if (RepeatedMask[i] >= 8)
BlendMask |= 1u << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
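
For the PBLENDW/BLENDI paths, the immediate is one bit per element: bit i set means "take element i from V2". A standalone sketch of packing such an immediate from a (possibly lane-repeated) mask, assuming the mask has already been verified to be a blend:

#include <cassert>
#include <cstdint>
#include <vector>

// Standalone sketch: pack a blend mask into an 8-bit immediate. Mask[i] < Size
// selects element i of V1, Mask[i] >= Size selects element i of V2; negative
// entries are undef and may pick either input.
uint8_t buildBlendImmediate(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  assert(Size <= 8 && "Immediate only has 8 selection bits");
  uint8_t Imm = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)       // Element comes from V2.
      Imm |= uint8_t(1u << i);
  return Imm;
}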
@@ -7200,7 +7578,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
- assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
+ assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
@@ -7235,10 +7613,9 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
- return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- BlendVT, VSELECTMask),
- V1, V2));
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
}
default:
@@ -7251,8 +7628,8 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
-static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2,
+static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// We build up the blend mask while checking whether a blend is a viable way
@@ -7266,7 +7643,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
- if (BlendMask[Mask[i] % Size] == -1)
+ if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
@@ -7285,8 +7662,8 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
- SDValue V1,
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
+ MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
@@ -7335,10 +7712,10 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2,
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
@@ -7357,9 +7734,8 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue Lo, Hi;
for (int l = 0; l < NumElts; l += NumLaneElts) {
for (int i = 0; i < NumLaneElts; ++i) {
- if (Mask[l + i] == -1)
+ if (Mask[l + i] < 0)
continue;
- assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
// Get the mod-Size index and lane correct it.
int LaneIdx = (Mask[l + i] % NumElts) - l;
@@ -7411,19 +7787,22 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
else if (!Hi)
Hi = Lo;
+ // Cast the inputs to i8 vector of correct length to match PALIGNR or
+ // PSLLDQ/PSRLDQ.
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getBitcast(ByteVT, Lo);
+ Hi = DAG.getBitcast(ByteVT, Hi);
+
// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int Scale = 16 / NumLaneElts;
// SSSE3 targets can use the palignr instruction.
- if (Subtarget->hasSSSE3()) {
- // Cast the inputs to i8 vector of correct length to match PALIGNR.
- MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
- Lo = DAG.getBitcast(AlignVT, Lo);
- Hi = DAG.getBitcast(AlignVT, Hi);
-
+ if (Subtarget.hasSSSE3()) {
+ assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
+ "512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
@@ -7431,21 +7810,19 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
+ assert(ByteVT == MVT::v16i8 &&
+ "SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;
- // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
- Lo = DAG.getBitcast(MVT::v2i64, Lo);
- Hi = DAG.getBitcast(MVT::v2i64, Hi);
-
- SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+ SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
- SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+ SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
- DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+ DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
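
The SSE2 fallback above is a per-lane byte manipulation: shift one input left by (16 - R) bytes, shift the other right by R bytes, and OR them, which produces a byte rotation across the pair. A small standalone model of a single 16-byte lane (array indices play the role of byte positions; it mirrors only the VSHLDQ/VSRLDQ/OR formula visible above):

#include <array>
#include <cstdint>

// Standalone model of the SSE2 rotate fallback for one 16-byte lane:
// (Lo << (16 - R) bytes) | (Hi >> R bytes), matching the VSHLDQ/VSRLDQ pair.
std::array<uint8_t, 16> rotatePair(const std::array<uint8_t, 16> &Lo,
                                   const std::array<uint8_t, 16> &Hi, int R) {
  std::array<uint8_t, 16> Result{};
  for (int i = 0; i < 16; ++i) {
    uint8_t L = (i >= 16 - R) ? Lo[i - (16 - R)] : 0; // Byte shift left.
    uint8_t H = (i + R < 16) ? Hi[i + R] : 0;         // Byte shift right.
    Result[i] = L | H;                                // One side is zero.
  }
  return Result;
}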
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
@@ -7471,8 +7848,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -7510,7 +7888,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
- MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
@@ -7526,7 +7905,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
- for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left))
@@ -7539,7 +7919,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -7679,8 +8059,8 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
- ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+ ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
@@ -7713,14 +8093,20 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2; a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+ InputV = ShuffleOffset(InputV);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ if (VT.is256BitVector())
+ InputV = extract128BitVector(InputV, 0, DAG, DL);
+
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
return DAG.getBitcast(VT, InputV);
}
@@ -7752,33 +8138,33 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
- if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
- SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(LoIdx, DL, MVT::i8)));
+ SDValue Lo = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
!SafeOffset(Offset + 1))
- return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+ return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
- SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(HiIdx, DL, MVT::i8)));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ SDValue Hi = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
- if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+ if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
@@ -7787,10 +8173,9 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
- return DAG.getBitcast(VT,
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v16i8, PSHUFBMask)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
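
For the PSHUFB-based zero extension, the control mask places each source byte at the start of its widened element and fills the remaining bytes with the 0x80 "write zero" sentinel. A standalone sketch, simplified to the Offset == 0 case (an assumption here, since the offset checks live outside this excerpt):

#include <array>
#include <cstdint>

// Standalone sketch: PSHUFB mask that zero-extends the low 16/Scale i8
// elements of a v16i8 by a factor of Scale (Scale = 2, 4, 8 or 16). Byte i of
// the result is source byte i/Scale when i starts a widened element, and zero
// (0x80 sentinel) otherwise.
std::array<uint8_t, 16> zextPSHUFBMask(int Scale) {
  std::array<uint8_t, 16> MaskBytes{};
  for (int i = 0; i < 16; ++i)
    MaskBytes[i] = (i % Scale == 0) ? uint8_t(i / Scale) : uint8_t(0x80);
  return MaskBytes;
}
// For Scale == 4 this produces {0,0x80,0x80,0x80, 1,0x80,0x80,0x80, ...},
// i.e. a v16i8 -> v4i32 zero extension of the first four bytes.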
// If we are extending from an offset, ensure we start on a boundary that
@@ -7837,8 +8222,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
@@ -7858,7 +8243,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
- if (M == -1)
+ if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
@@ -7960,8 +8345,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
+ V = peekThroughBitcasts(V);
+
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
@@ -7974,7 +8359,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
- return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
+ return DAG.getBitcast(EltVT, S);
}
return SDValue();
@@ -7985,9 +8370,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
+ V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
@@ -7996,8 +8379,8 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -8054,7 +8437,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
// This is essentially a special case blend operation, but if we have
// general purpose blend operations, they are always faster. Bail and let
// the rest of the lowering handle these as blends.
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
@@ -8082,9 +8465,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
- V2 = DAG.getBitcast(MVT::v2i64, V2);
+ V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
- X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+ X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
DAG.getDataLayout(), VT)));
@@ -8094,15 +8477,15 @@ static SDValue lowerVectorShuffleAsElementInsertion(
return V2;
}
-/// \brief Try to lower broadcast of a single - truncated - integer element,
+/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
-static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
- int BroadcastIdx,
- const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
+ SDValue V0, int BroadcastIdx,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX2() &&
+ assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
EVT EltVT = VT.getVectorElementType();
@@ -8153,38 +8536,57 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
-static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasAVX())
- return SDValue();
- if (VT.isInteger() && !Subtarget->hasAVX2())
+ if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+ (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+ (Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
+ // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
+ // broadcasting from a register requires AVX2.
+ unsigned NumElts = Mask.size();
+ unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+ bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
- for (int M : Mask)
- if (M >= 0 && BroadcastIdx == -1)
- BroadcastIdx = M;
- else if (M >= 0 && M != BroadcastIdx)
- return SDValue();
+ for (int i = 0; i != (int)NumElts; ++i) {
+ SmallVector<int, 8> BroadcastMask(NumElts, i);
+ if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
+ BroadcastIdx = i;
+ break;
+ }
+ }
+ if (BroadcastIdx < 0)
+ return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
+ case ISD::BITCAST: {
+ SDValue VSrc = V.getOperand(0);
+ MVT SrcVT = VSrc.getSimpleValueType();
+ if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
+ break;
+ V = VSrc;
+ continue;
+ }
case ISD::CONCAT_VECTORS: {
int OperandSize = Mask.size() / V.getNumOperands();
V = V.getOperand(BroadcastIdx / OperandSize);
BroadcastIdx %= OperandSize;
continue;
}
-
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
@@ -8219,45 +8621,76 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
MVT BroadcastVT = VT;
// Peek through any bitcast (only useful for loads).
- SDValue BC = V;
- while (BC.getOpcode() == ISD::BITCAST)
- BC = BC.getOperand(0);
+ SDValue BC = peekThroughBitcasts(V);
// Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
- // If the scalar isn't a load, we can't broadcast from it in AVX1.
- // Only AVX2 has register broadcasts.
- if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+ // If we can't broadcast from a register, check that the input is a load.
+ if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
+ if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+ Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+ }
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(BC);
SDValue BaseAddr = Ld->getOperand(1);
- EVT AddrVT = BaseAddr.getValueType();
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, DL, AddrVT, BaseAddr,
- DAG.getConstant(Offset, DL, AddrVT));
+ SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
- } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
- // We can't broadcast from a vector register without AVX2, and we can only
- // broadcast from the zero-element of a vector register.
+ } else if (!BroadcastFromReg) {
+ // We can't broadcast from a vector register.
return SDValue();
+ } else if (BroadcastIdx != 0) {
+ // We can only broadcast from the zero-element of a vector register,
+ // but it can be advantageous to broadcast from the zero-element of a
+ // subvector.
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+
+ // Only broadcast the zero-element of a 128-bit subvector.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (((BroadcastIdx * EltSize) % 128) != 0)
+ return SDValue();
+
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
+ DAG.getIntPtrConstant(BroadcastIdx, DL));
}
- V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V);
- return DAG.getBitcast(VT, V);
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ DAG.getBitcast(MVT::f64, V));
+
+ // Bitcast back to the same scalar type as BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ } else {
+ SrcVT = BroadcastVT.getScalarType();
+ }
+ V = DAG.getBitcast(SrcVT, V);
+ }
+
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
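
The new BroadcastIdx != 0 path only fires when the desired element happens to be element 0 of some 128-bit subvector, since VBROADCAST reads only the zero element of its (sub)register. A standalone sketch of that eligibility test, mirroring the arithmetic above (the DAG code additionally bails out for v4i64/v4f64, where VPERMQ/VPERMPD already handle the cross-lane case):

#include <cstdint>

// Standalone sketch of the subvector-broadcast check above: broadcasting from
// element BroadcastIdx of a 256/512-bit vector goes through a 128-bit
// subvector extract, which only works if the element starts a 128-bit chunk.
bool canBroadcastViaSubvector(unsigned VectorBits, unsigned EltSizeInBits,
                              unsigned BroadcastIdx) {
  if (VectorBits != 256 && VectorBits != 512)
    return false;                              // Only wide vectors benefit.
  return (BroadcastIdx * EltSizeInBits) % 128 == 0;
}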
// Check whether we can use INSERTPS to perform the shuffle. We only use
@@ -8266,16 +8699,14 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
- assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
unsigned ZMask = 0;
int V1DstIndex = -1;
int V2DstIndex = -1;
@@ -8295,8 +8726,8 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
}
// We can only insert a single non-zeroable element.
- if (V1DstIndex != -1 || V2DstIndex != -1)
- return SDValue();
+ if (V1DstIndex >= 0 || V2DstIndex >= 0)
+ return false;
if (Mask[i] < 4) {
// V1 input out of place for insertion.
@@ -8308,13 +8739,13 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
}
// Don't bother if we have no (non-zeroable) element for insertion.
- if (V1DstIndex == -1 && V2DstIndex == -1)
- return SDValue();
+ if (V1DstIndex < 0 && V2DstIndex < 0)
+ return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned V2SrcIndex = 0;
- if (V1DstIndex != -1) {
+ if (V1DstIndex >= 0) {
// If we have a V1 input out of place, we use V1 as the V2 element insertion
// and don't use the original V2 at all.
V2SrcIndex = Mask[V1DstIndex];
@@ -8329,11 +8760,25 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
if (!V1UsedInPlace)
V1 = DAG.getUNDEF(MVT::v4f32);
- unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ // Encode the InsertPS immediate from the source/destination indices and zero mask.
+ InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ // Attempt to match the insertps pattern.
+ unsigned InsertPSMask;
+ if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ return SDValue();
// Insert the V2 element into the desired position.
- SDLoc DL(Op);
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
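
The INSERTPS immediate built above packs three fields into one byte: the source element of the inserted operand in bits [7:6], the destination slot in bits [5:4], and a 4-bit zero mask in bits [3:0]. A small standalone encoder, for illustration:

#include <cassert>
#include <cstdint>

// Standalone sketch of the INSERTPS immediate encoding used above:
// bits [7:6] = source element of V2, bits [5:4] = destination element,
// bits [3:0] = result elements to force to zero.
uint8_t encodeInsertPSImm(unsigned SrcElt, unsigned DstElt, unsigned ZeroMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "Fields out of range");
  return uint8_t((SrcElt << 6) | (DstElt << 4) | ZeroMask);
}
// Example: inserting element 2 of V2 into slot 1 with no zeroing gives
// encodeInsertPSImm(2, 1, 0) == 0x90.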
@@ -8347,29 +8792,30 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
- assert(!isSingleInputShuffleMask(Mask) &&
+ assert(VT.is128BitVector() &&
+ "This routine only works on 128-bit vectors.");
+ assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
- int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
- return M >= 0 && M % Size < Size / 2;
- });
- int NumHiInputs = std::count_if(
- Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+ int NumLoInputs =
+ count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+ int NumHiInputs =
+ count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
- auto TryUnpack = [&](MVT UnpackVT, int Scale) {
- SmallVector<int, 32> V1Mask(Mask.size(), -1);
- SmallVector<int, 32> V2Mask(Mask.size(), -1);
+ auto TryUnpack = [&](int ScalarSize, int Scale) {
+ SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+ SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
@@ -8401,6 +8847,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
@@ -8412,15 +8859,10 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
// We try each unpack from the largest to the smallest to try and find one
// that fits this mask.
- int OrigNumElements = VT.getVectorNumElements();
int OrigScalarSize = VT.getScalarSizeInBits();
- for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
- int Scale = ScalarSize / OrigScalarSize;
- int NumElements = OrigNumElements / Scale;
- MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
- if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+ if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
- }
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
@@ -8434,8 +8876,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
- SmallVector<int, 32> PermMask;
- PermMask.assign(Size, -1);
+ SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
@@ -8461,28 +8902,25 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
-static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
- if (isSingleInputShuffleMask(Mask)) {
- // Use low duplicate instructions for masks that match their pattern.
- if (Subtarget->hasSSE3())
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
- return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
- if (Subtarget->hasAVX()) {
+ if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
@@ -8521,7 +8959,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
@@ -8542,21 +8980,18 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
-static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -8576,28 +9011,29 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a blend of two PACKUS operations an the blend aligns with the
- // low and half halves, we can just merge the PACKUS operations. This is
- // particularly important as it lets us merge shuffles that this routine itself
- // creates.
+ // If we have a blend of two same-type PACKUS operations and the blend aligns
+ // with the low and high halves, we can just merge the PACKUS operations.
+ // This is particularly important as it lets us merge shuffles that this
+ // routine itself creates.
auto GetPackNode = [](SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
+ V = peekThroughBitcasts(V);
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
};
if (SDValue V1Pack = GetPackNode(V1))
- if (SDValue V2Pack = GetPackNode(V2))
- return DAG.getBitcast(MVT::v2i64,
- DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
- Mask[0] == 0 ? V1Pack.getOperand(0)
- : V1Pack.getOperand(1),
- Mask[1] == 2 ? V2Pack.getOperand(0)
- : V2Pack.getOperand(1)));
+ if (SDValue V2Pack = GetPackNode(V2)) {
+ EVT PackVT = V1Pack.getValueType();
+ if (PackVT == V2Pack.getValueType())
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+ }
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
@@ -8614,7 +9050,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
@@ -8627,7 +9063,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget->hasSSSE3())
+ if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
@@ -8655,12 +9091,16 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
- if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
- if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
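
Restated outside of the DAG, the test above checks that each half of the 4-element mask draws from a single input, since SHUFPS selects its two low result elements from one operand and its two high result elements from the other. A standalone version of the same check:

#include <array>

// Standalone sketch of isSingleSHUFPSMask: each result half must read from
// only one input (entries < 4 come from V1, entries >= 4 from V2, and -1 is
// undef, compatible with either).
bool isSingleSHUFPSMaskSketch(const std::array<int, 4> &Mask) {
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}
// e.g. {0, 2, 5, 7} is a single SHUFPS, {0, 5, 2, 7} is not.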
@@ -8671,14 +9111,13 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
-static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index =
@@ -8689,7 +9128,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
// the low bit.
int V2AdjIndex = V2Index ^ 1;
- if (Mask[V2AdjIndex] == -1) {
+ if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
@@ -8761,35 +9200,31 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
- if (Subtarget->hasSSE3()) {
+ if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
- if (Subtarget->hasAVX()) {
+ if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
@@ -8812,13 +9247,13 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
@@ -8827,6 +9262,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return BlendPerm;
}
+ // Use low/high mov instructions.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
@@ -8840,15 +9281,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
-static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
@@ -8858,13 +9296,12 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return ZExt;
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -8884,8 +9321,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
@@ -8896,7 +9333,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
@@ -8913,7 +9350,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget->hasSSSE3())
+ if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
@@ -8957,8 +9394,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
- SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
@@ -8987,6 +9424,26 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+ // If we are splatting two values from one half, one into each destination
+ // half, then we can shuffle that half so each value is splatted into a dword,
+ // and then splat those dwords into their respective halves.
+ auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
+ int DOffset) {
+ int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
+ int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
+ V = DAG.getNode(ShufWOp, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getBitcast(PSHUFDVT, V);
+ V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ return DAG.getBitcast(VT, V);
+ };
+
+ if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
+ return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
+ if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
+ return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
+
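
The SplatHalfs trick above is easiest to see on a plain 8-element array. The following standalone model covers the low-half (PSHUFLW + PSHUFD, DOffset == 0) case only; LoInput and HiInput are taken as indices into the low half, which is an assumption of this sketch:

#include <array>

// Standalone model of SplatHalfs: splat word LoInput across the low four
// words of the result and word HiInput across the high four.
std::array<int, 8> splatHalfsModel(const std::array<int, 8> &V, int LoInput,
                                   int HiInput) {
  // Step 1: PSHUFLW {Lo, Lo, Hi, Hi} pairs each value up within the low half.
  std::array<int, 8> T = V;
  T[0] = T[1] = V[LoInput % 4];
  T[2] = T[3] = V[HiInput % 4];
  // Step 2: PSHUFD {0, 0, 1, 1} broadcasts dword 0 (the Lo pair) to the low
  // half of the result and dword 1 (the Hi pair) to the high half.
  return {T[0], T[1], T[0], T[1], T[2], T[3], T[2], T[3]};
}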
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
@@ -9096,9 +9553,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
- if (M != -1 && M == FixIdx)
+ if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
- else if (M != -1 && M == FixFreeIdx)
+ else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
@@ -9123,9 +9580,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
- if (M != -1 && M/2 == ADWord)
+ if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
- else if (M != -1 && M/2 == BDWord)
+ else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
@@ -9194,7 +9651,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
- return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+ return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
@@ -9213,7 +9670,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
- if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
@@ -9234,7 +9691,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
}
// Map the input's dword into the correct half.
- if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
@@ -9280,17 +9737,17 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
- SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
- SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
- } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
- SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
@@ -9304,7 +9761,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
- assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+ assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
@@ -9338,8 +9795,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
}
// Now hoist the DWord down to the right half.
- int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
- assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
@@ -9367,11 +9824,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
- assert(std::count_if(LoMask.begin(), LoMask.end(),
- [](int M) { return M >= 4; }) == 0 &&
+ assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
- assert(std::count_if(HiMask.begin(), HiMask.end(),
- [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
@@ -9390,11 +9845,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
return V;
}
-/// \brief Helper to form a PSHUFB-based shuffle+blend.
-static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG, bool &V1InUse,
- bool &V2InUse) {
+/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
+/// blend if only one input is used.
+static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -9404,7 +9859,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
int Size = Mask.size();
int Scale = 16 / Size;
for (int i = 0; i < 16; ++i) {
- if (Mask[i / Scale] == -1) {
+ if (Mask[i / Scale] < 0) {
V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
const int ZeroMask = 0x80;
@@ -9425,11 +9880,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V1),
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V2),
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+ DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
// If we need shuffled inputs from both, blend the two.
SDValue V;
@@ -9454,42 +9909,31 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
-static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
- MutableArrayRef<int> Mask(MaskStorage);
-
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- auto isV1 = [](int M) { return M >= 0 && M < 8; };
- (void)isV1;
- auto isV2 = [](int M) { return M >= 8; };
-
- int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+ int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9502,21 +9946,24 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return Rotate;
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
- Subtarget, DAG);
+ // Make a copy of the mask so it can be modified.
+ SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
+ MutableMask, Subtarget,
+ DAG);
}
- assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+ assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
- if (Subtarget->hasSSE4A())
+ if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
return V;
@@ -9528,7 +9975,7 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
@@ -9552,16 +9999,17 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
+ // Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
- if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+ if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
- V1InUse, V2InUse);
+ return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
@@ -9591,10 +10039,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
- // Figure out whether we're looping over two inputs or just one.
- bool IsSingleInput = isSingleInputShuffleMask(Mask);
-
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
@@ -9611,7 +10057,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
for (int i = 0, e = Mask.size(); i < e; ++i) {
// Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
- if (Mask[i] == -1)
+ if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
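A worked example for the check above: a two-input v16i8 mask {0,2,4,...,30} reads source element 2*i at every position, so one round of mask-then-PACKUS reproduces it (N = 1), while {0,4,...,28,0,4,...,28} needs two rounds (N = 2). The sketch below is an assumption-level re-derivation in plain C++, since only part of the function body is visible in this hunk:

#include <cstdio>
#include <vector>

// Returns N in 1..3 if result element i always reads source element i * 2^N
// (modulo the one- or two-input width), i.e. the shuffle is N rounds of
// "mask the high parts, then PACKUS"; returns 0 otherwise.
static int droppingEvenRounds(const std::vector<int> &Mask, bool IsSingleInput) {
  int Modulus = (int)Mask.size() * (IsSingleInput ? 1 : 2);
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0, e = (int)Mask.size(); i != e && Viable; ++i)
      if (Mask[i] >= 0 && Mask[i] % Modulus != (i << N) % Modulus)
        Viable = false;
    if (Viable)
      return N;
  }
  return 0;
}

int main() {
  std::vector<int> M1, M2;
  for (int i = 0; i < 16; ++i) M1.push_back(2 * i);        // {0,2,...,30}
  for (int i = 0; i < 16; ++i) M2.push_back((4 * i) % 32); // {0,4,...,28,0,...}
  std::printf("%d %d\n", droppingEvenRounds(M1, false),    // prints "1 2"
              droppingEvenRounds(M2, false));
  return 0;
}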
@@ -9645,20 +10091,17 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
-static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -9672,18 +10115,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
- if (Subtarget->hasSSE4A())
+ if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
return V;
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
@@ -9696,7 +10138,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
- if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
+ if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
@@ -9734,7 +10176,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
- while (j < je && PreDupI16Shuffle[j] != -1)
+ while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
@@ -9759,10 +10201,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
- if (Mask[i] != -1) {
+ if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
- if (PostDupI16Shuffle[i / 2] == -1)
+ if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
@@ -9799,18 +10241,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
- SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
- DAG, V1InUse, V2InUse);
+ SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Blend;
@@ -9848,11 +10290,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
- if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+ bool IsSingleInput = V2.isUndef();
+ if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
- bool IsSingleInput = isSingleInputShuffleMask(Mask);
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
@@ -9907,7 +10349,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
- DAG.getConstant(0x00FF, DL, MVT::v8i16));
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
@@ -9938,22 +10380,23 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
@@ -9971,21 +10414,22 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
+ WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
// If both elements are undef, its trivial.
if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
- WidenedMask.push_back(SM_SentinelUndef);
+ WidenedMask[i/2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
- WidenedMask.push_back(Mask[i + 1] / 2);
+ WidenedMask[i/2] = Mask[i + 1] / 2;
continue;
}
if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask.push_back(Mask[i] / 2);
+ WidenedMask[i/2] = Mask[i] / 2;
continue;
}
@@ -9993,7 +10437,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
(Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
- WidenedMask.push_back(SM_SentinelZero);
+ WidenedMask[i/2] = SM_SentinelZero;
continue;
}
return false;
@@ -10002,7 +10446,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask.push_back(Mask[i] / 2);
+ WidenedMask[i/2] = Mask[i] / 2;
continue;
}
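The widening rules above boil down to: each adjacent pair of mask entries must be all-undef, all-zeroable, or an aligned even/odd pair (one side may be undef). A standalone plain-C++ sketch using stand-in sentinel values rather than the LLVM SM_Sentinel constants:

#include <cstdio>
#include <vector>

constexpr int Undef = -1;  // stand-in for SM_SentinelUndef
constexpr int Zero = -2;   // stand-in for SM_SentinelZero

static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
  Wide.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == Undef && Hi == Undef) { Wide[i / 2] = Undef; continue; }
    if (Lo == Undef && Hi >= 0 && Hi % 2 == 1) { Wide[i / 2] = Hi / 2; continue; }
    if (Hi == Undef && Lo >= 0 && Lo % 2 == 0) { Wide[i / 2] = Lo / 2; continue; }
    if (Lo == Zero || Hi == Zero) {
      if ((Lo == Zero || Lo == Undef) && (Hi == Zero || Hi == Undef)) {
        Wide[i / 2] = Zero;
        continue;
      }
      return false;  // a zero paired with a real element cannot be widened
    }
    if (Lo >= 0 && Lo % 2 == 0 && Lo + 1 == Hi) { Wide[i / 2] = Lo / 2; continue; }
    return false;    // misaligned or split pair
  }
  return true;
}

int main() {
  std::vector<int> Mask = {2, 3, Undef, Undef, 4, 5, Zero, Zero}, Wide;
  if (widenMask(Mask, Wide))       // prints: 1 -1 2 -2
    for (int M : Wide)
      std::printf("%d ", M);
  std::printf("\n");
  return 0;
}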
@@ -10020,7 +10464,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
-static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
@@ -10039,8 +10483,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V->getOperand(0);
+ V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
@@ -10063,8 +10506,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
- LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
- HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
+ HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
@@ -10077,7 +10520,9 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
- SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
@@ -10085,21 +10530,15 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
UseHiV2 = true;
else
UseLoV2 = true;
- V2BlendMask.push_back(M - NumElements);
- V1BlendMask.push_back(-1);
- BlendMask.push_back(SplitNumElements + i);
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
- V2BlendMask.push_back(-1);
- V1BlendMask.push_back(M);
- BlendMask.push_back(i);
- } else {
- V2BlendMask.push_back(-1);
- V1BlendMask.push_back(-1);
- BlendMask.push_back(-1);
+ V1BlendMask[i] = M;
+ BlendMask[i] = i;
}
}
@@ -10151,12 +10590,12 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
-static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
+static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
SelectionDAG &DAG) {
- assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
- "lower single-input shuffles as it "
- "could then recurse on itself.");
+ assert(!V2.isUndef() && "This routine must not be used to lower single-input "
+ "shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
@@ -10166,12 +10605,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
- if (V2BroadcastIdx == -1)
+ if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
- if (V1BroadcastIdx == -1)
+ if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
@@ -10210,54 +10649,51 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
- int LaneSize = Mask.size() / 2;
+ int Size = Mask.size();
+ int LaneSize = Size / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
- if (isSingleInputShuffleMask(Mask)) {
- SmallVector<int, 32> FlippedBlendMask;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- FlippedBlendMask.push_back(
- Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
- ? Mask[i]
- : Mask[i] % LaneSize +
- (i / LaneSize) * LaneSize + Size));
-
- // Flip the vector, and blend the results which should now be in-lane. The
- // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
- // 5 for the high source. The value 3 selects the high half of source 2 and
- // the value 2 selects the low half of source 2. We only use source 2 to
- // allow folding it into a memory operand.
- unsigned PERMMask = 3 | 2 << 4;
- SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
- V1, DAG.getConstant(PERMMask, DL, MVT::i8));
- return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
- }
-
- // This now reduces to two single-input shuffles of V1 and V2 which at worst
- // will be handled by the above logic and a blend of the results, much like
- // other patterns in AVX.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+ assert(V2.isUndef() &&
+ "This last part of this routine only works on single input shuffles");
+
+ SmallVector<int, 32> FlippedBlendMask(Size);
+ for (int i = 0; i < Size; ++i)
+ FlippedBlendMask[i] =
+ Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+ ? Mask[i]
+ : Mask[i] % LaneSize +
+ (i / LaneSize) * LaneSize + Size);
+
+ // Flip the vector, and blend the results which should now be in-lane. The
+ // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+ // 5 for the high source. The value 3 selects the high half of source 2 and
+ // the value 2 selects the low half of source 2. We only use source 2 to
+ // allow folding it into a memory operand.
+ unsigned PERMMask = 3 | 2 << 4;
+ SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+ V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
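To make the immediate above concrete: VPERM2X128 picks each 128-bit half of its result with a 2-bit selector, 0/1 for the low/high half of source 1 and 2/3 for the low/high half of source 2 (the zeroing bits 3 and 7 are not modelled here). A standalone plain-C++ model showing that imm = 3 | (2 << 4) with source 2 = V1 yields the lane-swapped copy of V1 that the code then blends back in-lane:

#include <array>
#include <cstdio>

using Lane = std::array<int, 4>;   // one 128-bit lane of a v8i32 vector
using Vec  = std::array<Lane, 2>;  // {low lane, high lane}

static Vec vperm2x128(const Vec &Src1, const Vec &Src2, unsigned Imm) {
  auto pick = [&](unsigned Sel) -> Lane {
    const Vec &Src = (Sel & 2) ? Src2 : Src1;  // bit 1: which source
    return Src[Sel & 1];                       // bit 0: which half of it
  };
  Vec Result;
  Result[0] = pick(Imm & 3);         // low half of the result
  Result[1] = pick((Imm >> 4) & 3);  // high half of the result
  return Result;
}

int main() {
  Vec Undef = {};                                     // contents irrelevant
  Vec V1 = {Lane{0, 1, 2, 3}, Lane{4, 5, 6, 7}};
  Vec Flipped = vperm2x128(Undef, V1, 3 | (2 << 4));  // {{4,5,6,7},{0,1,2,3}}
  for (const Lane &L : Flipped)
    for (int E : L)
      std::printf("%d ", E);
  std::printf("\n");
  return 0;
}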
/// \brief Handle lowering 2-lane 128-bit shuffles.
-static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
@@ -10278,6 +10714,10 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -10349,10 +10789,9 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(!isSingleInputShuffleMask(Mask) &&
- "This is only useful with multiple inputs.");
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!V2.isUndef() && "This is only useful with multiple inputs.");
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
@@ -10361,10 +10800,8 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
// check whether the in-128-bit lane shuffles share a repeating pattern.
- SmallVector<int, 4> Lanes;
- Lanes.resize(NumLanes, -1);
- SmallVector<int, 4> InLaneMask;
- InLaneMask.resize(LaneSize, -1);
+ SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
+ SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
@@ -10392,8 +10829,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
// First shuffle the lanes into place.
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
VT.getSizeInBits() / 64);
- SmallVector<int, 8> LaneMask;
- LaneMask.resize(NumLanes * 2, -1);
+ SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
for (int i = 0; i < NumLanes; ++i)
if (Lanes[i] >= 0) {
LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
@@ -10408,8 +10844,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
// Now do a simple shuffle that isn't lane crossing.
- SmallVector<int, 8> NewMask;
- NewMask.resize(Size, -1);
+ SmallVector<int, 8> NewMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
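The decomposition above factors the mask into a per-destination-lane source lane plus one shared in-lane pattern, which is what allows a 64-bit lane shuffle followed by a non-crossing shuffle. A standalone plain-C++ sketch of that factoring for a single-input case (the real code also tracks lanes drawn from the second input):

#include <cstdio>
#include <vector>

int main() {
  // v8i32-style single-input mask; LaneSize = 4 (128 bits / 32-bit elements).
  std::vector<int> Mask = {6, 7, 4, 5, 2, 3, 0, 1};
  const int LaneSize = 4, Size = (int)Mask.size();
  std::vector<int> Lanes(Size / LaneSize, -1);  // destination lane -> source lane
  std::vector<int> InLane(LaneSize, -1);        // shared in-lane pattern
  bool OK = true;
  for (int i = 0; i < Size && OK; ++i) {
    if (Mask[i] < 0)
      continue;
    int SrcLane = Mask[i] / LaneSize, LocalM = Mask[i] % LaneSize;
    int DstLane = i / LaneSize, Local = i % LaneSize;
    if (Lanes[DstLane] >= 0 && Lanes[DstLane] != SrcLane)
      OK = false;                               // lane fed by two source lanes
    if (InLane[Local] >= 0 && InLane[Local] != LocalM)
      OK = false;                               // in-lane pattern not shared
    Lanes[DstLane] = SrcLane;
    InLane[Local] = LocalM;
  }
  if (OK) {
    // Factors into: permute lanes {1,0}, then shuffle {2,3,0,1} within lanes.
    std::printf("lanes:");
    for (int L : Lanes)
      std::printf(" %d", L);
    std::printf("  in-lane:");
    for (int M : InLane)
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}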
@@ -10422,11 +10857,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
-static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+ assert(VT.is256BitVector() && "Expected 256-bit vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
@@ -10457,21 +10893,16 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
- // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
- if (UndefLower && Subtarget->hasAVX2() &&
- (VT == MVT::v4f64 || VT == MVT::v4i64))
- return SDValue();
-
- // If the shuffle only uses the lower halves of the input operands,
+ // If the shuffle only uses two of the four halves of the input operands,
// then extract them and perform the 'half' shuffle at half width.
// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
int HalfIdx1 = -1, HalfIdx2 = -1;
- SmallVector<int, 8> HalfMask;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
unsigned Offset = UndefLower ? HalfNumElts : 0;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + Offset];
if (M < 0) {
- HalfMask.push_back(M);
+ HalfMask[i] = M;
continue;
}
@@ -10479,23 +10910,18 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
- // Only shuffle using the lower halves of the inputs.
- // TODO: Investigate usefulness of shuffling with upper halves.
- if (HalfIdx != 0 && HalfIdx != 2)
- return SDValue();
-
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
// We can shuffle with up to 2 half vectors, set the new 'half'
// shuffle mask accordingly.
- if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
- HalfMask.push_back(HalfElt);
+ if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
+ HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
- if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
- HalfMask.push_back(HalfElt + HalfNumElts);
+ if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
+ HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
@@ -10505,6 +10931,33 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
}
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+ // Only shuffle the halves of the inputs when useful.
+ int NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ int NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+
+ // uuuuXXXX - don't extract uppers just to insert again.
+ if (UndefLower && NumUpperHalves != 0)
+ return SDValue();
+
+ // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
+ if (UndefUpper && NumUpperHalves == 2)
+ return SDValue();
+
+ // AVX2 - XXXXuuuu - always extract lowers.
+ if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+ // AVX2 supports variable 32-bit element cross-lane shuffles.
+ if (VT == MVT::v8f32 || VT == MVT::v8i32) {
+ // XXXXuuuu - don't extract lowers and uppers.
+ if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
+ return SDValue();
+ }
+ }
+
auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
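The half-index bookkeeping above encodes each mask element's source as one of four halves (0 = low V1, 1 = high V1, 2 = low V2, 3 = high V2) and succeeds only when at most two distinct halves are referenced. A standalone plain-C++ sketch of the shown loop on a v8f32-style mask whose upper half is undef (illustration only, not the LLVM code):

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, HalfNumElts = NumElts / 2;  // v8f32-style shuffle
  // Upper half of the result is undef, so only the low half needs lowering.
  std::vector<int> Mask = {0, 1, 12, 13, -1, -1, -1, -1};
  const int Offset = 0;  // 0 because the *upper* half is the undef one

  int HalfIdx1 = -1, HalfIdx2 = -1;  // 0=lo(V1) 1=hi(V1) 2=lo(V2) 3=hi(V2)
  std::vector<int> HalfMask(HalfNumElts, -1);
  bool OK = true;
  for (int i = 0; i < HalfNumElts && OK; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }
    int HalfIdx = M / HalfNumElts;  // which of the four input halves
    int HalfElt = M % HalfNumElts;  // index within that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      OK = false;  // would need a third distinct half: give up
    }
  }
  if (OK)  // prints: halves 0,3  mask 0 1 4 5
    std::printf("halves %d,%d  mask %d %d %d %d\n", HalfIdx1, HalfIdx2,
                HalfMask[0], HalfMask[1], HalfMask[2], HalfMask[3]);
  return 0;
}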
@@ -10536,7 +10989,177 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
return true;
}
-static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+/// Handle case where shuffle sources are coming from the same 128-bit lane and
+/// every lane can be represented as the same repeating mask - allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
+ // On AVX2 we may be able to just shuffle the lowest elements and then
+ // broadcast the result.
+ if (Subtarget.hasAVX2()) {
+ for (unsigned BroadcastSize : {16, 32, 64}) {
+ if (BroadcastSize <= VT.getScalarSizeInBits())
+ continue;
+ int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+ // Attempt to match a repeating pattern every NumBroadcastElts,
+ // accounting for UNDEFs, but only referencing the lowest 128-bit
+ // lane of the inputs.
+ auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ int &R = RepeatMask[j];
+ if (0 != ((M % NumElts) / NumLaneElts))
+ return false;
+ if (0 <= R && R != M)
+ return false;
+ R = M;
+ }
+ return true;
+ };
+
+ SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+ if (!FindRepeatingBroadcastMask(RepeatMask))
+ continue;
+
+ // Shuffle the (lowest) repeated elements in place for broadcast.
+ SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+ // Shuffle the actual broadcast.
+ SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j)
+ BroadcastMask[i + j] = j;
+ return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+ BroadcastMask);
+ }
+ }
+
+ // Bail if the shuffle mask doesn't cross 128-bit lanes.
+ if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ // Bail if we already have a repeated lane shuffle mask.
+ SmallVector<int, 8> RepeatedShuffleMask;
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+ return SDValue();
+
+ // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+ // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
+ int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+ int NumSubLanes = NumLanes * SubLaneScale;
+ int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+ // Check that all the sources are coming from the same lane and see if we can
+ // form a repeating shuffle mask (local to each sub-lane). At the same time,
+ // determine the source sub-lane for each destination sub-lane.
+ int TopSrcSubLane = -1;
+ SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+ SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+
+ for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+ // Extract the sub-lane mask, check that it all comes from the same lane
+ // and normalize the mask entries to come from the first lane.
+ int SrcLane = -1;
+ SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumLaneElts;
+ if ((0 <= SrcLane) && (SrcLane != Lane))
+ return SDValue();
+ SrcLane = Lane;
+ int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
+ SubLaneMask[Elt] = LocalM;
+ }
+
+ // Whole sub-lane is UNDEF.
+ if (SrcLane < 0)
+ continue;
+
+ // Attempt to match against the candidate repeated sub-lane masks.
+ for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+ auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ if (M1[i] < 0 || M2[i] < 0)
+ continue;
+ if (M1[i] != M2[i])
+ return false;
+ }
+ return true;
+ };
+
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+ if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+ continue;
+
+ // Merge the sub-lane mask into the matching repeated sub-lane mask.
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ int M = SubLaneMask[i];
+ if (M < 0)
+ continue;
+ assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+ "Unexpected mask element");
+ RepeatedSubLaneMask[i] = M;
+ }
+
+ // Track the topmost source sub-lane - by setting the remaining to UNDEF
+ // we can greatly simplify shuffle matching.
+ int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+ TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+ Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+ break;
+ }
+
+ // Bail if we failed to find a matching repeated sub-lane mask.
+ if (Dst2SrcSubLanes[DstSubLane] < 0)
+ return SDValue();
+ }
+ assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+ "Unexpected source lane");
+
+ // Create a repeating shuffle mask for the entire vector.
+ SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+ for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+ int Lane = SubLane / SubLaneScale;
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = RepeatedSubLaneMask[Elt];
+ if (M < 0)
+ continue;
+ int Idx = (SubLane * NumSubLaneElts) + Elt;
+ RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+ }
+ }
+ SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+ // Shuffle each source sub-lane to its destination.
+ SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+ int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+ if (SrcSubLane < 0)
+ continue;
+ for (int j = 0; j != NumSubLaneElts; ++j)
+ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+ }
+
+ return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+ SubLaneMask);
+}
+
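The AVX2 broadcast fast path at the top of the function above can be seen end-to-end on a small case: if the mask repeats the same low-lane pattern every NumBroadcastElts elements, the shuffle splits into one shuffle that places the pattern in the lowest elements plus a broadcast-style splat. A standalone plain-C++ sketch of that screening and mask construction (illustration only, not the SelectionDAG code):

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, NumLaneElts = 4;  // v8i32 inside a 256-bit vector
  const int NumBroadcastElts = 2;          // candidate 64-bit broadcast unit
  std::vector<int> Mask = {1, 0, 1, 0, 1, 0, 1, 0};

  std::vector<int> RepeatMask(NumElts, -1), BroadcastMask(NumElts, -1);
  bool OK = true;
  for (int i = 0; i < NumElts && OK; i += NumBroadcastElts)
    for (int j = 0; j < NumBroadcastElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if ((M % NumElts) / NumLaneElts != 0) {          // must stay in the lowest lane
        OK = false;
        break;
      }
      if (RepeatMask[j] >= 0 && RepeatMask[j] != M) {  // must repeat exactly
        OK = false;
        break;
      }
      RepeatMask[j] = M;
    }
  if (OK) {
    for (int i = 0; i < NumElts; i += NumBroadcastElts)
      for (int j = 0; j < NumBroadcastElts; ++j)
        BroadcastMask[i + j] = j;
    // RepeatMask    = {1,0,-1,...}: place the pair in the lowest 64 bits.
    // BroadcastMask = {0,1,0,1,...}: splat that 64-bit pair across the vector.
    std::printf("repeat:");
    for (int M : RepeatMask)
      std::printf(" %d", M);
    std::printf("  broadcast:");
    for (int M : BroadcastMask)
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}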
+static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -10571,25 +11194,24 @@ static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
- return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
- DAG);
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG))
+ return V;
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
@@ -10597,7 +11219,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
- // Non-half-crossing single input shuffles can be lowerid with an
+ // Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
@@ -10606,10 +11228,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// With AVX2 we have direct support for this permutation.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG);
@@ -10629,19 +11257,25 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
- isShuffleMaskInputInPlace(1, Mask))))
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
Mask, DAG);
@@ -10653,59 +11287,53 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
-static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
- return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
- DAG);
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
- // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
- // use lower latency instructions that will operate on both 128-bit lanes.
- SmallVector<int, 2> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
- if (isSingleInputShuffleMask(Mask)) {
- int PSHUFDMask[] = {-1, -1, -1, -1};
- for (int i = 0; i < 2; ++i)
- if (RepeatedMask[i] >= 0) {
- PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
- PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
- }
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use lower latency instructions that will operate on both lanes.
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
- }
- // AVX2 provides a direct instruction for permuting a single input across
- // lanes.
- if (isSingleInputShuffleMask(Mask))
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -10717,7 +11345,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
@@ -10732,14 +11360,12 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10747,7 +11373,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10759,12 +11385,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
@@ -10774,30 +11400,30 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
- // have already handled any direct blends. We also need to squash the
- // repeated mask into a simulated v4f32 mask.
- for (int i = 0; i < 4; ++i)
- if (RepeatedMask[i] >= 8)
- RepeatedMask[i] -= 4;
+ // have already handled any direct blends.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes, use the variable mask to VPERMILPS.
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
: DAG.getConstant(Mask[i], DL, MVT::i32);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
- return DAG.getNode(
- X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
- if (Subtarget->hasAVX2())
- return DAG.getNode(
- X86ISD::VPERMV, DL, MVT::v8f32,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10812,7 +11438,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG);
@@ -10824,16 +11450,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
-static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -10847,7 +11471,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10857,7 +11481,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
@@ -10868,24 +11492,30 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
+ // Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
: DAG.getConstant(Mask[i], DL, MVT::i32);
- return DAG.getNode(
- X86ISD::VPERMV, DL, MVT::v8i32,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -10903,16 +11533,14 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
-static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -10922,7 +11550,7 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10936,8 +11564,8 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -10945,7 +11573,13 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (isSingleInputShuffleMask(Mask)) {
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@@ -10960,26 +11594,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
-
- SDValue PSHUFBMask[32];
- for (int i = 0; i < 16; ++i) {
- if (Mask[i] == -1) {
- PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
- continue;
- }
-
- int M = i < 8 ? Mask[i] : Mask[i] - 8;
- assert(M >= 0 && M < 8 && "Invalid single-input mask!");
- PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
- PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
- }
- return DAG.getBitcast(MVT::v16i16,
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
- DAG.getBitcast(MVT::v32i8, V1),
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v32i8, PSHUFBMask)));
}
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10994,16 +11614,14 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
-static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -11013,7 +11631,7 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -11027,8 +11645,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -11036,25 +11654,21 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (isSingleInputShuffleMask(Mask)) {
- // There are no generalized cross-lane shuffle operations available on i8
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
- SDValue PSHUFBMask[32];
- for (int i = 0; i < 32; ++i)
- PSHUFBMask[i] =
- Mask[i] < 0
- ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
- MVT::i8);
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
+ DAG);
- return DAG.getNode(
- X86ISD::PSHUFB, DL, MVT::v32i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
- }
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -11071,19 +11685,14 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
-
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
- int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
- return M >= NumElts;
- });
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
@@ -11101,11 +11710,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
- if (VT.isInteger() && !Subtarget->hasAVX2()) {
+ if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
- if (ElementBits < 32)
- // No floating point type available, decompose into 128-bit vectors.
+ if (ElementBits < 32) {
+ // No floating point type available; if we can't use the bit operations
+ // for masking/blending then decompose into 128-bit vectors.
+ if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ return V;
+ if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
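The lowerVectorShuffleAsBitMask/lowerVectorShuffleAsBitBlend helpers tried above are not part of this excerpt; the sketch below only illustrates the bit-blend idea they are named after, under the assumption that each result element is taken whole from the same position of either input. Function and variable names are illustrative.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Scalar model of lowering a per-element blend with bit operations: when
// every mask entry selects either element i of V1 or element i of V2, the
// shuffle is equivalent to (V1 & ~M) | (V2 & M) with an all-ones or all-zeros
// mask per element, which only needs PAND/PANDN/POR and works for any element
// width, including the sub-32-bit cases handled above.
template <std::size_t N>
static std::array<uint16_t, N> bitBlend(const std::array<uint16_t, N> &V1,
                                        const std::array<uint16_t, N> &V2,
                                        const std::array<int, N> &Mask) {
  std::array<uint16_t, N> Out{};
  for (std::size_t i = 0; i < N; ++i) {
    uint16_t M = (Mask[i] >= int(N)) ? 0xffff : 0x0000; // element-wide mask
    Out[i] = uint16_t((V1[i] & ~M) | (V2[i] & M));
  }
  return Out;
}

int main() {
  std::array<uint16_t, 4> V1 = {1, 2, 3, 4}, V2 = {10, 20, 30, 40};
  std::array<int, 4> Mask = {0, 5, 2, 7}; // take V2 in lanes 1 and 3
  std::array<uint16_t, 4> Out = bitBlend(V1, V2, Mask);
  std::printf("%u %u %u %u\n", unsigned(Out[0]), unsigned(Out[1]),
              unsigned(Out[2]), unsigned(Out[3])); // prints "1 20 3 40"
  return 0;
}
```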
@@ -11116,17 +11731,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -11134,21 +11749,37 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
-static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
- ArrayRef<int> Mask,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// To handle 256 bit vector requires VLX and most probably
// function lowerV2X128VectorShuffle() is better solution.
- assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
+ assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+ // Ensure elements come from the same Op.
+ int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+ if (WidenedMask[i] == SM_SentinelUndef)
+ continue;
+
+ SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
+ unsigned OpIndex = (i < Size/2) ? 0 : 1;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+ }
+
// Form a 128-bit permutation.
// Convert the 64-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
@@ -11156,19 +11787,16 @@ static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
unsigned ControlBitsNum = WidenedMask.size() / 2;
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- if (WidenedMask[i] == SM_SentinelZero)
- return SDValue();
-
// Use first element in place of undef mask.
Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
}
- return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getConstant(PermMask, DL, MVT::i8));
}
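As a side note on the immediate built just above: for a 512-bit vector widened to four 128-bit chunks, ControlBitsNum is 2 and each widened-mask entry contributes two bits to the VSHUF64x2 control byte. A minimal standalone sketch of that packing, with invented names:

```cpp
#include <cstdio>

// Build the 8-bit VSHUF64x2-style immediate from a widened mask with four
// 128-bit chunk selectors (values 0..7, where 4..7 pick chunks of the second
// source). Each selector contributes two bits: (value % 4) << (i * 2).
// Undef entries (encoded here as -1) reuse chunk 0, as in the lowering above.
static unsigned shuf128Immediate(const int WidenedMask[4]) {
  unsigned PermMask = 0;
  const unsigned ControlBitsNum = 2; // WidenedMask.size() / 2 for 4 chunks
  for (int i = 0; i < 4; ++i) {
    int Imm = (WidenedMask[i] < 0) ? 0 : WidenedMask[i];
    PermMask |= unsigned(Imm % 4) << (i * ControlBitsNum);
  }
  return PermMask;
}

int main() {
  // Take chunks 0 and 1 of the first source, then chunks 0 and 1 of the
  // second source (selectors 4 and 5): immediate 0b01000100 == 0x44.
  int Mask[4] = {0, 1, 4, 5};
  std::printf("0x%02x\n", shuf128Immediate(Mask)); // prints 0x44
  return 0;
}
```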
-static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -11178,23 +11806,43 @@ static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (V2.isUndef()) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
+ ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
+ ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
+ DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+ }
+
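The VPERMILPD immediate assembled above uses one bit per element: bit i is set exactly when result element i takes the odd double of its own 128-bit lane. A small standalone sketch of the same computation (illustrative names, not the LLVM code):

```cpp
#include <cstdio>

// Build the VPERMILPD immediate for a v8f64 single-source shuffle whose mask
// never crosses a 128-bit lane: each result element i picks either the low or
// the high double of its own two-element lane, and bit i of the immediate is
// set when it picks the high one (mask value 2*(i/2) + 1).
static unsigned vpermilpdImmediate(const int Mask[8]) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i) {
    int LaneHigh = 2 * (i / 2) + 1; // the odd element of lane i/2
    Imm |= unsigned(Mask[i] == LaneHigh) << i;
  }
  return Imm;
}

int main() {
  // Swap the two doubles inside every 128-bit lane: {1,0,3,2,5,4,7,6}.
  int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
  std::printf("0x%02x\n", vpermilpdImmediate(Mask)); // prints 0x55
  return 0;
}
```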
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Shuf128;
@@ -11203,42 +11851,90 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (SDValue Op =
+ lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Op;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
- return Unpck;
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
+
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return Unpck;
+
+ // Otherwise, fall back to a SHUFPS sequence.
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+ }
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
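Several of the paths above hand a repeated 4-element mask to getV4X86ShuffleImm8ForMask. That helper's body is outside this excerpt, but the classic PSHUFD/SHUFPS immediate it produces packs two bits per destination element; the sketch below shows that encoding under the assumption that undef entries default to 0.

```cpp
#include <cstdio>

// Encode a 4-element shuffle mask into the 8-bit immediate used by
// PSHUFD/SHUFPS/VPERMILPS-style instructions: two bits per destination
// element, element i stored at bit position 2*i. Undef entries (-1) are
// treated as 0 here, which is one common convention.
static unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = (Mask[i] < 0) ? 0 : Mask[i];
    Imm |= unsigned(M & 0x3) << (2 * i);
  }
  return Imm;
}

int main() {
  // Broadcast element 2 to all four positions: immediate 0b10101010 == 0xaa.
  int Mask[4] = {2, 2, 2, 2};
  std::printf("0x%02x\n", shuffleImm8(Mask)); // prints 0xaa
  return 0;
}
```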
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Shuf128;
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use lower latency instructions that will operate on all four
+ // 128-bit lanes.
+ SmallVector<int, 2> Repeated128Mask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
+ return DAG.getBitcast(
+ MVT::v8i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+ DAG.getBitcast(MVT::v16i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ SmallVector<int, 4> Repeated256Mask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
+ getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+ }
+
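scaleShuffleMask, used above to turn the 2 x i64 repeated lane mask into a 4 x i32 PSHUFD mask, is not shown in this excerpt. Assuming the usual widening semantics (each coarse selector expands to Scale consecutive fine selectors, undef stays undef), a minimal sketch looks like this:

```cpp
#include <cstdio>
#include <vector>

// Widen a shuffle mask by a scale factor: each selector m of the coarse mask
// expands into Scale consecutive selectors of the fine mask (m*Scale + j),
// so a 2 x i64 in-lane mask becomes a 4 x i32 PSHUFD-style mask. Undef
// entries (-1) expand into undef entries.
static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  Scaled.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int j = 0; j < Scale; ++j)
      Scaled.push_back(M < 0 ? -1 : M * Scale + j);
  return Scaled;
}

int main() {
  // Swap the two 64-bit halves of each 128-bit lane: {1, 0} -> {2, 3, 0, 1}.
  std::vector<int> Fine = scaleMask(2, {1, 0});
  for (int M : Fine)
    std::printf("%d ", M); // prints "2 3 0 1"
  std::printf("\n");
  return 0;
}
```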
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
@@ -11247,49 +11943,111 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
- return Unpck;
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient instructions that mirror the shuffles across the four 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
-static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
- assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+ assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (V2.isUndef()) {
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v32 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+ }
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
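The single-input v32i16 path above hinges on is128BitLaneRepeatedShuffleMask, whose body is not in this excerpt. For a single-source mask the check amounts to: every result element stays inside its own lane and, once indices are rebased to lane 0, all lanes use the same pattern. A hedged sketch of that test, restricted to single-source masks:

```cpp
#include <cstdio>
#include <initializer_list>
#include <vector>

// Sketch of an is128BitLaneRepeatedShuffleMask-style test for single-source
// masks. Undef entries (-1) match anything. The real helper also handles
// two-source masks; that part is not modelled here.
static bool isLaneRepeatedMask(int LaneSize, const std::vector<int> &Mask,
                               std::vector<int> &Repeated) {
  Repeated.assign(LaneSize, -1);
  for (int i = 0, Size = int(Mask.size()); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Lane = i / LaneSize;
    int Local = Mask[i] - Lane * LaneSize; // rebase into lane 0
    if (Local < 0 || Local >= LaneSize)
      return false;                        // element crosses a lane boundary
    int &R = Repeated[i % LaneSize];
    if (R < 0)
      R = Local;                           // first lane defines the pattern
    else if (R != Local)
      return false;                        // some lane disagrees
  }
  return true;
}

int main() {
  // v32i16-style mask with 8-element lanes: every lane repeats {1,0,3,2,5,4,7,6}.
  std::vector<int> Mask;
  for (int Lane = 0; Lane < 4; ++Lane)
    for (int M : {1, 0, 3, 2, 5, 4, 7, 6})
      Mask.push_back(Lane * 8 + M);
  std::vector<int> Repeated;
  std::printf("%d\n", isLaneRepeatedMask(8, Mask, Repeated) ? 1 : 0); // prints 1
  return 0;
}
```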
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
-static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
- assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+ assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
@@ -11300,61 +12058,50 @@ static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
- // Dispatch to each element type for lowering. If we don't have supprot for
+ // Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- if (Subtarget->hasBWI())
- return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- if (Subtarget->hasBWI())
- return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
-
- // Otherwise fall back on splitting.
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to SIMD
// vector, shuffle and then truncate it back.
-static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
switch (VT.SimpleTy) {
@@ -11405,7 +12152,7 @@ static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
@@ -11413,14 +12160,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
- SDLoc dl(Op);
+ SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
- bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsUndef = V1.isUndef();
+ bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
@@ -11440,7 +12187,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
- return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// We actually see shuffles that are entirely re-arrangements of a set of
@@ -11448,7 +12195,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// simple ones. Directly lower these as a buildvector of zeros.
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
if (Zeroable.all())
- return getZeroVector(VT, Subtarget, DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
@@ -11467,12 +12214,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
- VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
- for (int M : SVOp->getMask())
+ for (int M : Mask)
if (M < 0)
++NumUndefElements;
else if (M < NumElements)
@@ -11486,6 +12233,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
if (NumV2Elements > NumV1Elements)
return DAG.getCommutedVectorShuffle(*SVOp);
+ assert(NumV1Elements > 0 && "No V1 indices");
+ assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
+
// When the number of V1 and V2 elements is the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
@@ -11493,28 +12243,28 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
- for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
- if (LowV2Elements > LowV1Elements) {
+ if (LowV2Elements > LowV1Elements)
return DAG.getCommutedVectorShuffle(*SVOp);
- } else if (LowV2Elements == LowV1Elements) {
+ if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
- for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
- if (SVOp->getMask()[i] >= NumElements)
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
SumV2Indices += i;
- else if (SVOp->getMask()[i] >= 0)
+ else if (Mask[i] >= 0)
SumV1Indices += i;
- if (SumV2Indices < SumV1Indices) {
+ if (SumV2Indices < SumV1Indices)
return DAG.getCommutedVectorShuffle(*SVOp);
- } else if (SumV2Indices == SumV1Indices) {
+ if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
- for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
- if (SVOp->getMask()[i] >= NumElements)
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
- else if (SVOp->getMask()[i] >= 0)
+ else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return DAG.getCommutedVectorShuffle(*SVOp);
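The chain of tie-breakers above decides whether to commute V1 and V2 before dispatching to the width-specific lowerings. Distilled into a standalone predicate (the real code performs the commute via DAG.getCommutedVectorShuffle rather than returning a flag), it reads roughly as follows; names are invented for the example:

```cpp
#include <cstdio>
#include <vector>

// Decide whether to swap the two shuffle inputs so that later lowering only
// has to reason about V1-heavy masks. Mirrors the tie-breaking chain above:
// prefer the order with more V1 elements, then more V1 elements in the low
// half, then the smaller sum of positions used by V1, then fewer odd
// positions used by V1. Returns true when V1 and V2 should be commuted.
static bool shouldCommute(const std::vector<int> &Mask, int NumElements) {
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask)
    if (M >= NumElements) ++NumV2;
    else if (M >= 0)      ++NumV1;
  if (NumV2 != NumV1)
    return NumV2 > NumV1;

  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i < NumElements / 2; ++i)
    if (Mask[i] >= NumElements) ++LowV2;
    else if (Mask[i] >= 0)      ++LowV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1;

  int SumV1 = 0, SumV2 = 0;
  for (int i = 0, Size = int(Mask.size()); i < Size; ++i)
    if (Mask[i] >= NumElements) SumV2 += i;
    else if (Mask[i] >= 0)      SumV1 += i;
  if (SumV2 != SumV1)
    return SumV2 < SumV1;

  int OddV1 = 0, OddV2 = 0;
  for (int i = 0, Size = int(Mask.size()); i < Size; ++i)
    if (Mask[i] >= NumElements) OddV2 += i % 2;
    else if (Mask[i] >= 0)      OddV1 += i % 2;
  return OddV2 < OddV1;
}

int main() {
  // <4 x i32> shuffle taking elements {4, 5} of V2 into the low half and
  // {0, 1} of V1 into the high half: commuting puts V1 uses in the low half.
  std::printf("%d\n", shouldCommute({4, 5, 0, 1}, 4) ? 1 : 0); // prints 1
  return 0;
}
```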
@@ -11524,69 +12274,23 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- llvm_unreachable("Unimplemented!");
-}
+ return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
-// This function assumes its argument is a BUILD_VECTOR of constants or
-// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
-// true.
-static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
- unsigned &MaskValue) {
- MaskValue = 0;
- unsigned NumElems = BuildVector->getNumOperands();
-
- // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- // We don't handle the >2 lanes case right now.
- unsigned NumLanes = (NumElems - 1) / 8 + 1;
- if (NumLanes > 2)
- return false;
-
- unsigned NumElemsInLane = NumElems / NumLanes;
-
- // Blend for v16i16 should be symmetric for the both lanes.
- for (unsigned i = 0; i < NumElemsInLane; ++i) {
- SDValue EltCond = BuildVector->getOperand(i);
- SDValue SndLaneEltCond =
- (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
-
- int Lane1Cond = -1, Lane2Cond = -1;
- if (isa<ConstantSDNode>(EltCond))
- Lane1Cond = !isNullConstant(EltCond);
- if (isa<ConstantSDNode>(SndLaneEltCond))
- Lane2Cond = !isNullConstant(SndLaneEltCond);
-
- unsigned LaneMask = 0;
- if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
- // Lane1Cond != 0, means we want the first argument.
- // Lane1Cond == 0, means we want the second argument.
- // The encoding of this argument is 0 for the first argument, 1
- // for the second. Therefore, invert the condition.
- LaneMask = !Lane1Cond << i;
- else if (Lane1Cond < 0)
- LaneMask = !Lane2Cond << i;
- else
- return false;
-
- MaskValue |= LaneMask;
- if (NumLanes == 2)
- MaskValue |= LaneMask << NumElemsInLane;
- }
- return true;
+ llvm_unreachable("Unimplemented!");
}
/// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
@@ -11624,7 +12328,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return BlendOp;
// Variable blends are only legal from SSE4.1 onward.
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
// Only some types will be legal on some subtargets. If we can emit a legal
@@ -11637,7 +12341,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return Op;
return SDValue();
@@ -11645,7 +12349,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
case MVT::v8i16:
case MVT::v16i16:
// AVX-512 BWI and VLX features support VSELECT with i16 elements.
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
return Op;
// FIXME: We should custom lower this by fixing the condition and using i8
@@ -11723,7 +12427,7 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
- assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
@@ -11737,10 +12441,15 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- const TargetRegisterClass* rc = getRegClassFor(VecVT);
- if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
- rc = getRegClassFor(MVT::v16i1);
- unsigned MaxSift = rc->getSize()*8 - 1;
+ if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
+ // Use kshiftlw/rw instruction.
+ VecVT = MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
+ DAG.getUNDEF(VecVT),
+ Vec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ unsigned MaxSift = VecVT.getVectorNumElements() - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
@@ -11762,7 +12471,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (!isa<ConstantSDNode>(Idx)) {
if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+ (VecVT.is256BitVector() && Subtarget.hasInt256() &&
VecVT.getVectorElementType().getSizeInBits() == 32)) {
MVT MaskEltVT =
@@ -11782,13 +12491,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
@@ -11803,38 +12512,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(VecVT.is128BitVector() && "Unexpected vector length");
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
MVT VT = Op.getSimpleValueType();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
- SDValue Vec = Op.getOperand(0);
- if (isNullConstant(Op.getOperand(1)))
+ if (IdxVal == 0)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Vec),
- Op.getOperand(1)));
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
// Transform it so it matches pextrw, which produces a 32-bit result.
MVT EltVT = MVT::i32;
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
- Op.getOperand(0), Op.getOperand(1));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
if (VT.getSizeInBits() == 32) {
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- if (Idx == 0)
+ if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
- MVT VVT = Op.getOperand(0).getSimpleValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
- DAG.getUNDEF(VVT), Mask);
+ int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
@@ -11843,16 +12547,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
- if (isNullConstant(Op.getOperand(1)))
+ if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- MVT VVT = Op.getOperand(0).getSimpleValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
- DAG.getUNDEF(VVT), Mask);
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
@@ -11886,7 +12588,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if (IdxVal)
EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return EltInVec;
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
@@ -11895,6 +12597,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG);
@@ -11908,6 +12611,19 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
+ // If we are clearing out an element, we do this more efficiently with a
+ // blend shuffle than a costly integer insertion.
+ // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+ // TODO: pre-SSE41 targets will tend to use bit masking - this could still
+ // be beneficial if we are inserting several zeros and can combine the masks.
+ if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
+ SmallVector<int, 8> ClearMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ClearMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ }
+
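The ClearMask built above turns "insert a zero at index IdxVal" into a blend against an all-zero vector: every element reads from the original operand except the cleared slot, which reads from the zero vector (mask indices >= NumElts select the second shuffle operand). A small model of the mask construction, with illustrative names:

```cpp
#include <cstdio>
#include <vector>

// Model of the "insert zero via blend" trick above: shuffle the original
// vector against an all-zero vector with a mask that reads element i from the
// source, except at the cleared index, where it reads from the zero vector.
static std::vector<int> clearElementMask(unsigned NumElts, unsigned IdxVal) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(int(i == IdxVal ? i + NumElts : i));
  return Mask;
}

int main() {
  // Clearing element 2 of a 4-element vector gives the mask {0, 1, 6, 3}.
  for (int M : clearElementMask(4, 2))
    std::printf("%d ", M); // prints "0 1 6 3"
  std::printf("\n");
  return 0;
}
```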
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -11917,8 +12633,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
- if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
- (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
N2 = DAG.getIntPtrConstant(1, dl);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
@@ -11926,7 +12642,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
// Get the desired 128-bit vector chunk.
- SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+ SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
@@ -11938,11 +12654,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
DAG.getConstant(IdxIn128, dl, MVT::i32));
// Insert the changed part back into the bigger vector
- return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+ return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
unsigned Opc;
if (VT == MVT::v8i16) {
@@ -12026,7 +12742,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
- return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
if (OpVT == MVT::v1i64 &&
@@ -12042,7 +12758,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
-static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
@@ -12051,15 +12767,15 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
MVT ResVT = Op.getSimpleValueType();
MVT InVT = In.getSimpleValueType();
- if (Subtarget->hasFp256()) {
+ if (Subtarget.hasFp256()) {
if (ResVT.is128BitVector() &&
(InVT.is256BitVector() || InVT.is512BitVector()) &&
isa<ConstantSDNode>(Idx)) {
- return Extract128BitVector(In, IdxVal, DAG, dl);
+ return extract128BitVector(In, IdxVal, DAG, dl);
}
if (ResVT.is256BitVector() && InVT.is512BitVector() &&
isa<ConstantSDNode>(Idx)) {
- return Extract256BitVector(In, IdxVal, DAG, dl);
+ return extract256BitVector(In, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -12068,9 +12784,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
-static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
SDLoc dl(Op);
@@ -12094,16 +12810,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through a bitcast to get to the load.
- if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
- SubVec2 = SubVec2.getOperand(0);
-
+ // If needed, look through bitcasts to get to the load.
+ SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
bool Fast;
unsigned Alignment = FirstLd->getAlignment();
unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget->getTargetLowering();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {
SDValue Ops[] = { SubVec2, SubVec };
@@ -12116,13 +12829,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
SubVecVT.is128BitVector())
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
if (OpVT.getVectorElementType() == MVT::i1)
- return Insert1BitVector(Op, DAG);
+ return insert1BitVector(Op, DAG, Subtarget);
return SDValue();
}
@@ -12139,17 +12852,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
@@ -12171,17 +12880,13 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
@@ -12203,22 +12908,14 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel)) {
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
- OpFlag = X86II::MO_GOTPCREL;
+ if (Subtarget.isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- } else if (Subtarget->isPICStyleGOT()) {
- OpFlag = X86II::MO_GOT;
- } else if (Subtarget->isPICStyleStubPIC()) {
- OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
- } else if (Subtarget->isPICStyleStubNoDynamic()) {
- OpFlag = X86II::MO_DARWIN_NONLAZY;
- }
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
@@ -12227,8 +12924,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
- if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
- !Subtarget->is64Bit()) {
+ if (isPositionIndependent() && !Subtarget.is64Bit()) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
@@ -12238,8 +12934,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -12248,7 +12943,7 @@ SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
- Subtarget->ClassifyBlockAddressReference();
+ Subtarget.classifyBlockAddressReference();
CodeModel::Model M = DAG.getTarget().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
@@ -12256,7 +12951,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
@@ -12271,13 +12966,12 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
- int64_t Offset, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+ const SDLoc &dl, int64_t Offset,
+ SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
- unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+ unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
@@ -12290,7 +12984,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
@@ -12306,8 +13000,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
@@ -12429,7 +13122,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
- MachinePointerInfo(Ptr), false, false, false, 0);
+ MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
@@ -12464,8 +13157,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
@@ -12478,45 +13170,40 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- // Cygwin uses emutls.
- // FIXME: It may be EmulatedTLS-generic also for X86-Android.
- if (Subtarget->isTargetWindowsCygwin())
+ if (DAG.getTarget().Options.EmulatedTLS)
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool PositionIndependent = isPositionIndependent();
- if (Subtarget->isTargetELF()) {
- if (DAG.getTarget().Options.EmulatedTLS)
- return LowerToTLSEmulatedModel(GA, DAG);
+ if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
- Subtarget->is64Bit());
+ Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
- DAG.getTarget().getRelocationModel() ==
- Reloc::PIC_);
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+ PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
- unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+ unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
- !Subtarget->is64Bit();
+ bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
@@ -12540,9 +13227,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
- Chain =
- DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
- DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true),
+ Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -12550,12 +13237,13 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
- unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
- if (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsGNU()) {
+ if (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium() ||
+ Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -12573,21 +13261,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
- Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+ Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray = Subtarget->is64Bit()
+ SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
- : (Subtarget->isTargetWindowsGNU()
+ : (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
- DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
- false, false, 0);
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
@@ -12595,13 +13282,11 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
- MachinePointerInfo(), MVT::i32, false, false,
- false, 0);
+ MachinePointerInfo(), MVT::i32);
else
- IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
- false, false, 0);
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
@@ -12611,8 +13296,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
- res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
- false, 0);
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -12628,7 +13312,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("TLS not implemented for this target.");
}
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
@@ -12711,13 +13395,13 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
return Op;
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- Subtarget->is64Bit()) {
+ Subtarget.is64Bit()) {
return Op;
}
SDValue ValueToStore = Op.getOperand(0);
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- !Subtarget->is64Bit())
+ !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
@@ -12730,8 +13414,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(
DAG.getEntryNode(), dl, ValueToStore, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
- false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
@@ -12789,14 +13472,13 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
}
return Result;
}
-// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SelectionDAG &DAG) const {
// This algorithm is not obvious. Here is what we're trying to output:
@@ -12837,20 +13519,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget->hasSSE3()) {
+ if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
@@ -12865,7 +13547,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
DAG.getIntPtrConstant(0, dl));
}
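The constant pool entries loaded through CPIdx0/CPIdx1 are defined outside this excerpt, so the sketch below only shows the classic split-and-bias technique that this unpack/FSUB/FHADD sequence is generally built on: glue the low 32 bits under the exponent of 2^52 and the high 32 bits under the exponent of 2^84, subtract those biases, and add the halves. Treat it as an assumed reconstruction, not a transcript of the lowering.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of the classic u64 -> f64 expansion: 2^52 + lo and
// 2^84 + hi * 2^32 are formed exactly by bit-pasting, the biases are
// subtracted exactly, and the final add performs the single rounding.
static double u64ToDouble(uint64_t V) {
  uint64_t LoBits = (uint64_t(0x43300000) << 32) | (V & 0xffffffffu); // 2^52 + lo
  uint64_t HiBits = (uint64_t(0x45300000) << 32) | (V >> 32);         // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  Lo -= 0x1.0p52;  // remove the 2^52 bias
  Hi -= 0x1.0p84;  // remove the 2^84 bias
  return Hi + Lo;  // the haddpd-style recombination step
}

int main() {
  uint64_t V = 0x8000000000000001ull; // not representable via a signed convert
  std::printf("%.17g\n%.17g\n", u64ToDouble(V), double(V)); // same value twice
  return 0;
}
```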
-// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -12945,10 +13627,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
- unsigned NumElts = VecIntVT.getVectorNumElements();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
- assert(NumElts <= 8 && "The size of the constant array must be fixed");
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
@@ -12958,24 +13638,12 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// -- v >> 16
// Create the splat vector for 0x4b000000.
- SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
- SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
- CstLow, CstLow, CstLow, CstLow};
- SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstLowArray[0], NumElts));
+ SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
- SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
- SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
- CstHigh, CstHigh, CstHigh, CstHigh};
- SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstHighArray[0], NumElts));
+ SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
- SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
- SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
- CstShift, CstShift, CstShift, CstShift};
- SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstShiftArray[0], NumElts));
+ SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
@@ -12997,9 +13665,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
} else {
- SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
- SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
- CstMask, CstMask, CstMask);
+ SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
@@ -13009,12 +13675,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
}
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
- SDValue CstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
- SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
- CstFAdd, CstFAdd, CstFAdd, CstFAdd};
- SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
- makeArrayRef(&CstFAddArray[0], NumElts));
+ SDValue VecCstFAdd = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
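
A scalar sketch of the same split-and-bias trick that lowerUINT_TO_FP_vXi32 applies per lane (illustrative only, not part of the patch; the helper name is hypothetical):

    #include <cstdint>
    #include <cstring>

    // uint32 -> float via two biased halves, mirroring the comments above:
    //   lo = (v & 0xffff) | 0x4b000000   // 2^23 + (v & 0xffff)
    //   hi = (v >> 16)    | 0x53000000   // 2^39 + (v >> 16) * 2^16
    float UIntToFloat(uint32_t V) {
      uint32_t LoBits = (V & 0xffffu) | 0x4b000000u;
      uint32_t HiBits = (V >> 16)     | 0x53000000u;
      float Lo, Hi;
      std::memcpy(&Lo, &LoBits, sizeof(Lo));
      std::memcpy(&Hi, &HiBits, sizeof(Hi));
      // fhi = hi - (0x1.0p39f + 0x1.0p23f); result = fhi + flo.
      return (Hi - (0x1.0p39f + 0x1.0p23f)) + Lo;
    }
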
@@ -13045,10 +13707,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
}
case MVT::v4i32:
case MVT::v8i32:
- return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+ return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v16i8:
case MVT::v16i16:
- assert(Subtarget->hasAVX512());
+ assert(Subtarget.hasAVX512());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
@@ -13072,8 +13734,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
- if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
- (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
+ if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+ (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
@@ -13083,34 +13745,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
- SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
+ SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo(),
- false, false, 0);
+ StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MachinePointerInfo(),
- false, false, 0);
+ OffsetSlot, MachinePointerInfo());
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
return Fild;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Op.getOperand(0);
- if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit())
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
- StackSlot, MachinePointerInfo(),
- false, false, 0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+ MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -13149,7 +13807,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- false, false, false, 4);
+ /* Alignment = */ 4);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
@@ -13186,10 +13844,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned &&
DstTy == MVT::i64 &&
- (!Subtarget->is64Bit() ||
+ (!Subtarget.is64Bit() ||
!isScalarFPTypeInSSEReg(TheVT));
- if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
+ if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -13204,7 +13862,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (DstTy == MVT::i32 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
- if (Subtarget->is64Bit() &&
+ if (Subtarget.is64Bit() &&
DstTy == MVT::i64 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
@@ -13280,8 +13938,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(MF, SSFI), false,
- false, 0);
+ MachinePointerInfo::getFixedStack(MF, SSFI));
SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
SDValue Ops[] = {
Chain, StackSlot, DAG.getValueType(TheVT)
@@ -13309,18 +13966,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
FistOps, DstTy, MMO);
- SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
- MachinePointerInfo(),
- false, false, false, 0);
- SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
- DAG.getConstant(4, DL, PtrVT));
+ SDValue Low32 =
+ DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
+ SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
- SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue High32 =
+ DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// Join High32 and Low32 into a 64-bit result.
// (High32 << 32) | Low32
Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
@@ -13347,7 +14001,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
@@ -13374,7 +14028,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
return SDValue();
- if (Subtarget->hasInt256())
+ if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
@@ -13393,41 +14047,46 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
assert(InVT.getVectorElementType() == MVT::i1);
- MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+
+ // Extend VT if the input is a 256-bit or 128-bit vector and VLX is not supported.
+ MVT ExtVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
SDValue One =
DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
- if (VT.is512BitVector())
- return V;
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
+ SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ if (VT == ExtVT)
+ return SelectedVal;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
-static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256())
+ if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
return SDValue();
}
-static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
@@ -13437,7 +14096,7 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
- if (Subtarget->hasFp256())
+ if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
@@ -13447,50 +14106,32 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
- assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type.");
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
- // Shift LSB to MSB and use VPMOVB2M - SKX.
+ // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
- if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M
- ((InVT.is256BitVector() || InVT.is128BitVector()) &&
- InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
- Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M
- // Shift packed bytes not supported natively, bitcast to dword
- MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
- DAG.getBitcast(ExtVT, In),
- DAG.getConstant(ShiftInx, DL, ExtVT));
- ShiftNode = DAG.getBitcast(InVT, ShiftNode);
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
- }
- if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M
- ((InVT.is256BitVector() || InVT.is128BitVector()) &&
- InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
- Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M
-
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
- DAG.getConstant(ShiftInx, DL, InVT));
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
- }
-
- // Shift LSB to MSB, extend if necessary and use TESTM.
- unsigned NumElts = InVT.getVectorNumElements();
- if (InVT.getSizeInBits() < 512 &&
- (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
- !Subtarget->hasVLX())) {
- assert((NumElts == 8 || NumElts == 16) && "Unexected vector type.");
-
- // TESTD/Q should be used (if BW supported we use CVT2MASK above),
- // so vector should be extended to packed dword/qword.
+ if (InVT.getScalarSizeInBits() <= 16) {
+ if (Subtarget.hasBWI()) {
+ // legal, will go to VPMOVB2M, VPMOVW2M
+ // Shifting packed bytes is not supported natively; bitcast to words.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+ // Use TESTD/Q: extend the vector to packed dword/qword.
+ assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ "Unexpected vector type.");
+ unsigned NumElts = InVT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
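
The "shift LSB to MSB" step above can be pictured with a single byte lane (a sketch, not part of the patch): only bit 0 matters for the i1 truncation, so it is moved into the sign position where VPMOVB2M/TESTD-style instructions read it.

    #include <cstdint>

    // One lane of the mask-truncation idiom: move the LSB into the MSB and
    // test the sign bit, which is what VPMOVB2M / TESTD collect per element.
    bool LsbToMaskBit(uint8_t Lane) {
      uint8_t Shifted = uint8_t(Lane << 7); // LSB now in the sign position
      return (Shifted & 0x80u) != 0;
    }
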
@@ -13523,16 +14164,16 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
// word to byte only under BWI
- if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
+ if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
- if (Subtarget->hasInt256()) {
+ if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
@@ -13553,7 +14194,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
- if (Subtarget->hasInt256()) {
+ if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
SmallVector<SDValue,32> pshufbMask;
@@ -13569,13 +14210,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned j = 0; j < 8; ++j)
pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
+ SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- &ShufMask[0]);
+ ShufMask);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -13611,7 +14252,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (!VT.is128BitVector() || !InVT.is256BitVector())
return SDValue();
- assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+ assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
@@ -13621,7 +14262,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), &MaskVec[0]);
+ DAG.getUNDEF(NVT), MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
@@ -13639,9 +14280,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
+ MachinePointerInfo());
// The node is the result.
return FIST;
@@ -13658,9 +14298,8 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
+ MachinePointerInfo());
// The node is the result.
return FIST;
@@ -13736,10 +14375,9 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask =
- DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, Alignment);
+ SDValue Mask = DAG.getLoad(
+ LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
@@ -13807,7 +14445,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mask1 =
DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
if (!IsF128)
Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
@@ -13833,7 +14471,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Val =
DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (!isa<ConstantFPSDNode>(Op0)) {
if (!IsF128)
@@ -13852,18 +14490,25 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
- SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
- DAG.getConstant(1, dl, VT));
- return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
+ MVT OpVT = N0.getSimpleValueType();
+ assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
+ "Unexpected type for FGETSIGN");
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
+ MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
+ Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
+ Res = DAG.getZExtOrTrunc(Res, dl, VT);
+ Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
+ return Res;
}
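
For reference, the MOVMSK-based FGETSIGN lowering above reduces, for one scalar element, to extracting the sign bit; an illustrative sketch (not part of the patch, hypothetical helper name):

    #include <cstdint>
    #include <cstring>

    // MOVMSK gathers the sign bits of the vector lanes; AND with 1 keeps element 0.
    int FGetSign(float X) {
      uint32_t Bits;
      std::memcpy(&Bits, &X, sizeof(Bits));
      return (Bits >> 31) & 1;
    }
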
// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
if (!Op->hasOneUse())
@@ -13969,9 +14614,27 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
+// Emit KTEST instruction for bit vectors on AVX-512
+static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Op.getOpcode() == ISD::BITCAST) {
+ auto hasKTEST = [&](MVT VT) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
+ (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
+ };
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType().getSimpleVT();
+ if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
+ hasKTEST(Op0VT))
+ return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
+ }
+ return SDValue();
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1) {
SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
@@ -14014,10 +14677,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
// Emit a CMP with 0, which is the TEST pattern.
- //if (Op.getValueType() == MVT::i1)
- // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
- // DAG.getConstant(0, MVT::i1));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
@@ -14071,14 +14734,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->isOne() && !Subtarget->slowIncDec()) {
+ if (C->isOne() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
+ if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
@@ -14106,18 +14769,26 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
if (!Mask.isSignedIntN(32)) // Avoid large immediates.
break;
- SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
- DAG.getConstant(Mask, dl, VT));
- DAG.ReplaceAllUsesWith(Op, New);
- Op = New;
+ Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, dl, VT));
}
break;
case ISD::AND:
- // If the primary and result isn't used, don't bother using X86ISD::AND,
+ // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
- if (!hasNonFlagsUse(Op))
- break;
+ if (!hasNonFlagsUse(Op)) {
+ SDValue Op0 = ArithOp->getOperand(0);
+ SDValue Op1 = ArithOp->getOperand(1);
+ EVT VT = ArithOp.getValueType();
+ bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
+ bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+
+ // But if we can combine this into an ANDN operation, then create an AND
+ // now and allow it to be pattern matched into an ANDN.
+ if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
+ break;
+ }
// FALL THROUGH
case ISD::SUB:
case ISD::OR:
@@ -14137,8 +14808,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
- SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
- if (EFLAGS.getNode())
+ if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
Opcode = X86ISD::OR;
@@ -14190,11 +14860,15 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
}
}
- if (Opcode == 0)
+ if (Opcode == 0) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
+
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
-
+ }
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
@@ -14206,7 +14880,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG);
@@ -14215,13 +14889,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Do the comparison at i32 if it's smaller, besides the Atom case.
- // This avoids subregister aliasing issues. Keep the smaller reference
- // if we're optimizing for size, however, as that'll allow better folding
- // of memory operations.
- if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+ // Only promote the compare up to i32 if it is a 16-bit operation
+ // with an immediate; 16-bit immediates are to be avoided.
+ if ((Op0.getValueType() == MVT::i16 &&
+ (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
!DAG.getMachineFunction().getFunction()->optForMinSize() &&
- !Subtarget->isAtom()) {
+ !Subtarget.isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
@@ -14241,7 +14914,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
- if (Subtarget->hasCMov() ||
+ if (Subtarget.hasCMov() ||
Cmp.getOpcode() != X86ISD::CMP ||
!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
!Cmp.getOperand(1).getValueType().isFloatingPoint())
@@ -14259,7 +14932,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
+ assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
@@ -14279,10 +14952,10 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
+ if (VT == MVT::f32 && Subtarget.hasSSE1())
RecipOp = "sqrtf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()))
RecipOp = "vec-sqrtf";
else
return SDValue();
@@ -14311,10 +14984,10 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
+ if (VT == MVT::f32 && Subtarget.hasSSE1())
RecipOp = "divf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()))
RecipOp = "vec-divf";
else
return SDValue();
@@ -14337,10 +15010,9 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
-/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
-/// if it's possible.
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -14353,19 +15025,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
- // If we looked past a truncate, check that it's only truncating away
- // known zeros.
- unsigned BitWidth = Op0.getValueSizeInBits();
- unsigned AndBitWidth = And.getValueSizeInBits();
- if (BitWidth > AndBitWidth) {
- APInt Zeros, Ones;
- DAG.computeKnownBits(Op0, Zeros, Ones);
- if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
- return SDValue();
- }
- LHS = Op1;
- RHS = Op0.getOperand(1);
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ APInt Zeros, Ones;
+ DAG.computeKnownBits(Op0, Zeros, Ones);
+ if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ return SDValue();
}
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
@@ -14407,8 +15079,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
-/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
-/// mask CMPs.
+/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
+/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1) {
unsigned SSECC;
@@ -14452,8 +15124,8 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
return SSECC;
}
-// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
-// ones, and then concatenate the result back.
+/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
+/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -14466,13 +15138,13 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+ SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+ SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType();
@@ -14525,16 +15197,15 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
}
}
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
- Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
@@ -14568,8 +15239,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
-static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
-{
+static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
+ SelectionDAG &DAG) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
if (!BV)
return SDValue();
@@ -14592,10 +15263,10 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
+ return DAG.getBuildVector(VT, dl, ULTOp1);
}
-static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -14611,32 +15282,59 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
- unsigned Opc = X86ISD::CMPP;
- if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ unsigned Opc;
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
- }
- // In the two special cases we can't handle, emit two comparisons.
+ } else {
+ Opc = X86ISD::CMPP;
+ // The SSE/AVX packed FP comparison nodes are defined with a
+ // floating-point vector result that matches the operand type. This allows
+ // them to work with an SSE1 target (integer vector types are not legal).
+ VT = Op0.getSimpleValueType();
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+ // available.
+ SDValue Cmp;
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
if (SSECC == 8) {
+ // LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (SetCCOpcode == ISD::SETUEQ) {
- CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
+ static_cast<unsigned>(ISD::OR);
} else {
assert(SetCCOpcode == ISD::SETONE);
- CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
+ static_cast<unsigned>(ISD::AND);
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
- return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ } else {
+ // Handle all other FP comparisons here.
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
}
- // Handle all other FP comparisons here.
- return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+ // result type of SETCC. The bitcast is expected to be optimized away
+ // during combining/isel.
+ if (Opc == X86ISD::CMPP)
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ return Cmp;
}
MVT VTOp0 = Op0.getSimpleValueType();
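
The SETUEQ/SETONE split described above has a simple scalar analogue (sketch only, not part of the patch): UEQ is "unordered or equal" and ONE is "ordered and not equal".

    // Scalar reading of the two-compare combination (x != x tests for NaN):
    bool SetUEQ(float A, float B) { return (A != A || B != B) || A == B; } // UNORD || EQ
    bool SetONE(float A, float B) { return (A == A && B == B) && A != B; } // ORD && NEQ
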
@@ -14665,38 +15363,38 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
- assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
+ assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// Break 256-bit integer vector compare into smaller ones.
- if (VT.is256BitVector() && !Subtarget->hasInt256())
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
+ // Operands are boolean (vectors of i1)
MVT OpVT = Op1.getSimpleValueType();
if (OpVT.getVectorElementType() == MVT::i1)
return LowerBoolVSETCC_AVX512(Op, DAG);
- bool MaskResult = (VT.getVectorElementType() == MVT::i1);
- if (Subtarget->hasAVX512()) {
- if (Op1.getSimpleValueType().is512BitVector() ||
- (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
- (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
- return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
-
+ // The result is boolean, but operands are int/float
+ if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements in KNL.
- // We are not talking about 512-bit operands in this case, these
- // types are illegal.
- if (MaskResult &&
- (OpVT.getVectorElementType().getSizeInBits() < 32 &&
- OpVT.getVectorElementType().getSizeInBits() >= 8))
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+ // In this case use SSE compare
+ bool UseAVX512Inst =
+ (OpVT.is512BitVector() ||
+ OpVT.getVectorElementType().getSizeInBits() >= 32 ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX()));
+
+ if (UseAVX512Inst)
+ return LowerIntVSETCC_AVX512(Op, DAG);
+
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
}
// Lower using XOP integer comparisons.
if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (SetCCOpcode) {
@@ -14748,8 +15446,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
bool hasMinMax =
- (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
- || (Subtarget->hasSSE2() && (VET == MVT::i8));
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget.hasSSE2() && (VET == MVT::i8));
if (hasMinMax) {
switch (SetCCOpcode) {
@@ -14761,7 +15459,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
}
- bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
if (!MinMax && hasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
@@ -14775,10 +15473,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// beneficial because the constant in the register is no longer
// clobbered as the destination, so it can be hoisted out of a loop.
// Only do this pre-AVX since vpcmp* is no longer destructive.
- if (Subtarget->hasAVX())
+ if (Subtarget.hasAVX())
break;
- SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
- if (ULEOp1.getNode()) {
+ if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
Op1 = ULEOp1;
Subus = true; Invert = false; Swap = false;
}
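
The PSUBUS special case above relies on the identity that, for unsigned values, a <= b exactly when the saturating difference a - b is zero; a per-lane sketch (illustrative, not part of the patch):

    #include <cstdint>

    // One byte lane of the PSUBUSB + PCMPEQB idiom.
    uint8_t SubUS(uint8_t A, uint8_t B) { return A > B ? uint8_t(A - B) : 0; }
    bool ULE(uint8_t A, uint8_t B) { return SubUS(A, B) == 0; }
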
@@ -14801,8 +15498,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
- if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
- assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+ if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
+ assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
@@ -14817,8 +15514,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
} else {
SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
- SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
- Sign, Zero, Sign, Zero);
+ SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
@@ -14843,10 +15539,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getBitcast(VT, Result);
}
- if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
- assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+ assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
@@ -14899,7 +15595,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
&& "SetCC type must be 8-bit or 1-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -14914,8 +15610,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
+ NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+ }
return NewSetCC;
}
}
@@ -14937,16 +15636,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(CCode, dl, MVT::i8),
Op0.getOperand(1));
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ }
return SetCC;
}
}
- if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-
- ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
- return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isOneConstant(Op1)) {
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ }
+ if (!isNullConstant(Op1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
+ return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
+ }
}
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
@@ -14958,8 +15664,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ }
return SetCC;
}
@@ -14978,12 +15687,15 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
- if (Op.getSimpleValueType() == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ if (Op.getSimpleValueType() == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return SetCC;
}
-// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+/// Return true if the opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
@@ -15009,14 +15721,23 @@ static bool isX86LogicalCmp(SDValue Op) {
return false;
}
-static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+/// Returns the "condition" node, which may be wrapped in a "truncate".
+/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
+static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
- return false;
+ return V;
SDValue VOp0 = V.getOperand(0);
+ if (VOp0.getOpcode() == ISD::AssertZext &&
+ V.getValueSizeInBits() ==
+ cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
+ return VOp0.getOperand(0);
+
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
- return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+ if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
+ return V.getOperand(0);
+ return V;
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -15032,15 +15753,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
- (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+ ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
int SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (SSECC != 8) {
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
@@ -15062,7 +15783,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
- if (Subtarget->hasAVX() &&
+ if (Subtarget.hasAVX() &&
!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
@@ -15122,8 +15843,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (Cond.getOpcode() == ISD::SETCC) {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
@@ -15240,8 +15960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
// Look past the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -15302,7 +16021,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
@@ -15313,22 +16032,22 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+ (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget->hasBWI() && VT.is512BitVector() &&
+ ((Subtarget.hasBWI() && VT.is512BitVector() &&
VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+ ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
- ((Subtarget->hasDQI() && VT.is512BitVector() &&
+ ((Subtarget.hasDQI() && VT.is512BitVector() &&
VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
@@ -15352,25 +16071,35 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
}
static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getSizeInBits() == InVT.getSizeInBits());
+ MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
- assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());
+ assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
- if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
+ if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
+ !(VT.is256BitVector() && Subtarget.hasInt256()))
+ return SDValue();
SDLoc dl(Op);
+ // For 256-bit vectors, we only need the lower (128-bit) half of the input.
+ if (VT.is256BitVector())
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+ MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
+ In, DAG.getIntPtrConstant(0, dl));
+
// SSE41 targets can use the pmovsx* instructions directly.
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
@@ -15407,7 +16136,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
return SDValue();
}
-static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
@@ -15422,7 +16151,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
(VT != MVT::v16i16 || InVT != MVT::v16i8))
return SDValue();
- if (Subtarget->hasInt256())
+ if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// Optimize vectors in AVX mode
@@ -15441,13 +16170,13 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask1[i] = i;
- SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+ SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
SmallVector<int,8> ShufMask2(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask2[i] = i + NumElems/2;
- SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements()/2);
@@ -15458,6 +16187,157 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+// Lower a truncating store. We need a special lowering for vXi1 vectors.
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+ SDLoc dl(St);
+ EVT MemVT = St->getMemoryVT();
+ assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
+ assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+ "Expected truncstore of i1 vector");
+
+ SDValue Op = St->getValue();
+ MVT OpVT = Op.getValueType().getSimpleVT();
+ unsigned NumElts = OpVT.getVectorNumElements();
+ if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ NumElts == 16) {
+ // Truncate and store - everything is legal
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
+ if (MemVT.getSizeInBits() < 8)
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+ St->getMemOperand());
+ }
+
+ // Only a subset of AVX-512 is available; assume we have just AVX-512F.
+ if (NumElts <= 8) {
+ if (NumElts < 8) {
+ // Extend to 8-elts vector
+ MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
+ DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
+ }
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
+ return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+ St->getMemOperand());
+ }
+ // v32i8
+ assert(OpVT == MVT::v32i8 && "Unexpected operand type");
+ // Divide the vector into 2 parts and store each part separately
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+ DAG.getIntPtrConstant(0, dl));
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
+ SDValue BasePtr = St->getBasePtr();
+ SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
+ St->getMemOperand());
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+ DAG.getIntPtrConstant(16, dl));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
+
+ SDValue BasePtrHi =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+ SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
+ BasePtrHi, St->getMemOperand());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
+}
+
+static SDValue LowerExtended1BitVectorLoad(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc dl(Ld);
+ EVT MemVT = Ld->getMemoryVT();
+ assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
+ "Expected i1 vector load");
+ unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
+ ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ MVT VT = Op.getValueType().getSimpleVT();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ NumElts == 16) {
+ // Load and extend - everything is legal
+ if (NumElts < 8) {
+ SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+ // Finally, do a normal sign-extend to the desired register.
+ return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
+ }
+
+ if (NumElts <= 8) {
+ // Only a subset of AVX-512 is available; assume we have just AVX-512F.
+ unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
+ SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
+ SDValue BitVec = DAG.getBitcast(MaskVT, Load);
+
+ if (NumElts == 8)
+ return DAG.getNode(ExtOpcode, dl, VT, BitVec);
+
+ // Take care of v4i1 and v2i1: extend to eight elements and extract the low subvector.
+
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert(VT == MVT::v32i8 && "Unexpected extload type");
+
+ SmallVector<SDValue, 2> Chains;
+
+ SDValue BasePtr = Ld->getBasePtr();
+ SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ Chains.push_back(LoadLo.getValue(1));
+
+ SDValue BasePtrHi =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+ SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+ BasePtrHi,
+ Ld->getMemOperand());
+ Chains.push_back(LoadHi.getValue(1));
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+
+ SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
+ SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+}
+
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -15465,7 +16345,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
@@ -15473,11 +16353,14 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
"We only custom lower integer vector sext loads.");
// Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
+ if (MemVT.getScalarType() == MVT::i1)
+ return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -15492,7 +16375,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
// The only way in which we have a legal 256-bit vector result but not the
// integer 256-bit operations needed to directly lower a sextload is if we
// have AVX1 but not AVX2. In that case, we can always emit a sextload to
@@ -15508,8 +16391,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
"it must be a legal 128-bit vector "
"type!");
Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
} else {
assert(MemSz < 128 &&
"Can't extend a type wider than 128 bits to a 256 bit vector!");
@@ -15522,9 +16405,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
Load =
DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
}
// Replace chain users with the new chain.
@@ -15592,8 +16474,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Perform a single load.
SDValue ScalarLoad =
DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getAlignment(), Ld->getMemOperand()->getFlags());
Chains.push_back(ScalarLoad.getValue(1));
// Create the first element type using SCALAR_TO_VECTOR in order to avoid
// another round of DAGCombining.
@@ -15615,7 +16496,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
@@ -15637,7 +16518,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
ShuffleVec[i * SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
// Bitcast to the requested type.
Shuff = DAG.getBitcast(RegVT, Shuff);
@@ -15645,9 +16526,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
return Shuff;
}
-// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
-// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
-// from the AND / OR.
+/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
+/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
@@ -15658,8 +16538,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Op.getOperand(1).hasOneUse());
}
-// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
-// 1 and that the SETCC node has a single use.
+/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
+/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
@@ -15692,8 +16572,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
@@ -15917,8 +16796,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
// Look past the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -15951,7 +16829,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
+ bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack;
SDLoc dl(Op);
@@ -15966,7 +16844,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
- bool Is64Bit = Subtarget->is64Bit();
+ bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
@@ -15975,13 +16853,10 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
- EVT VT = Node->getValueType(0);
- SDValue Tmp3 = Node->getOperand(2);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
- unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
@@ -15995,12 +16870,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// The 64-bit implementation of segmented stacks needs to clobber both r10
// and r11. This makes it impossible to use it along with nested parameters.
const Function *F = MF.getFunction();
-
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I)
- if (I->hasNestAttr())
+ for (const auto &A : F->args()) {
+ if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
+ }
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
@@ -16009,16 +16883,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
- SDValue Flag;
- const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
-
- Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
- Flag = Chain.getValue(1);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
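+ // Pass the allocation size straight to WIN_ALLOCA and record its use on the function info.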
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
+ MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
-
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -16047,13 +16916,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
- if (!Subtarget->is64Bit() ||
- Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
+ if (!Subtarget.is64Bit() ||
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
// __va_list_tag:
@@ -16064,45 +16933,45 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
- SDValue Store = DAG.getStore(Op.getOperand(0), DL,
- DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
- DL, MVT::i32),
- FIN, MachinePointerInfo(SV), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
- Store = DAG.getStore(Op.getOperand(0), DL,
- DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
- MVT::i32),
- FIN, MachinePointerInfo(SV, 4), false, false, 0);
+ FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
- MachinePointerInfo(SV, 8),
- false, false, 0);
+ Store =
+ DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
- Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
+ Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
- SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL, RSFIN, FIN,
+ MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->is64Bit() &&
+ assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNode()->getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
@@ -16132,9 +17001,9 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!Subtarget->useSoftFloat() &&
+ assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
- Subtarget->hasSSE1());
+ Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
@@ -16153,19 +17022,15 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Chain = VAARG.getValue(1);
// Load the next argument and return it
- return DAG.getLoad(ArgVT, dl,
- Chain,
- VAARG,
- MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
-static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
- assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
- if (Subtarget->isCallingConvWin64(
+ assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction()->getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
@@ -16183,9 +17048,9 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
-// getTargetVShiftByConstNode - Handle vector element shifts where the shift
-// amount is a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
+/// Handle vector element shifts where the shift amount is a constant.
+/// Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
@@ -16214,11 +17079,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
ConstantSDNode *ND;
switch(Opc) {
- default: llvm_unreachable(nullptr);
+ default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16230,7 +17095,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
case X86ISD::VSRLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16242,7 +17107,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
case X86ISD::VSRAI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16253,16 +17118,16 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
break;
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
-// getTargetVShiftNode - Handle vector element shifts where the shift amount
-// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
+/// Handle vector element shifts where the shift amount may or may not be a
+/// constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
@@ -16288,7 +17153,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
// Let the shuffle legalizer expand this shift amount node.
SDValue Op0 = ShAmt.getOperand(0);
Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
- ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+ ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
} else {
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
@@ -16301,7 +17166,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
ShOps.push_back(DAG.getUNDEF(SVT));
MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+ ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
@@ -16316,8 +17181,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getTargetConstant(1, dl, MaskVT);
@@ -16330,9 +17195,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
}
- if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
if (MaskVT == MVT::v64i1) {
- assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode, a bitcast of i64 is illegal; split it into two i32 halves instead.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
@@ -16368,7 +17233,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
@@ -16393,13 +17258,14 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
+ case ISD::FP_TO_FP16:
// We can't use ISD::VSELECT here because it is not always "Legal"
// for the destination type. For example vpmovqb requires only AVX512
// and a vselect that can operate on byte elements requires BWI.
OpcodeSelect = X86ISD::SELECT;
break;
}
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
@@ -16413,7 +17279,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (isAllOnesConstant(Mask))
return Op;
@@ -16429,7 +17295,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
@@ -16495,7 +17361,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -16706,6 +17572,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case VPERM_2OP_MASK : {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ // Swap Src1 and Src2 in the node creation
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
+ Mask, PassThru, Subtarget, DAG);
+ }
case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:{
// Src2 is the PassThru
@@ -16764,6 +17640,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case FMA_OP_SCALAR_MASK:
+ case FMA_OP_SCALAR_MASK3:
+ case FMA_OP_SCALAR_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // Choose the pass-through value for masked-off elements.
+ if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
+ PassThru = Src3;
+ else
+ PassThru = Src1;
+
+ SDValue Rnd = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
case TERLOG_OP_MASK:
case TERLOG_OP_MASKZ: {
SDValue Src1 = Op.getOperand(1);
@@ -16879,49 +17779,76 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
MVT::i1),
Subtarget, DAG);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
- DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
- DAG.getValueType(MVT::i1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
- assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
- SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, dl, MVT::i8), Cond);
+ SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+ SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
+ SDValue SetCC;
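+ // Map the FP condition onto EFLAGS tests of the COMI result; LT/LE reuse the
+ // GT/GE tests on the operand-swapped comparison.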
+ switch (CC) {
+ case ISD::SETEQ: { // (ZF = 1 and PF = 0)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
+ SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NP, dl, MVT::i8),
+ Comi);
+ SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
+ break;
+ }
+ case ISD::SETNE: { // (ZF = 0 or PF = 1)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
+ SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_P, dl, MVT::i8),
+ Comi);
+ SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
+ break;
+ }
+ case ISD::SETGT: // (CF = 0 and ZF = 0)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
+ break;
+ case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
+ break;
+ }
+ case ISD::SETGE: // CF = 0
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
+ break;
+ case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
+ break;
+ default:
+ llvm_unreachable("Unexpected illegal condition!");
+ }
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDValue CC = Op.getOperand(3);
+ unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
SDValue Sae = Op.getOperand(4);
- auto ComiType = TranslateX86ConstCondToX86CC(CC);
- // choose between ordered and unordered (comi/ucomi)
- unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
- SDValue Cond;
- if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
- Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+
+ SDValue FCmp;
+ if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
+ return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
- case VSHIFT_MASK:
- return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
- Op.getSimpleValueType(),
- Op.getOperand(1),
- Op.getOperand(2), DAG),
- Op.getOperand(4), Op.getOperand(3), Subtarget,
- DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
@@ -16940,14 +17867,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
- case BLEND: {
- SDValue Mask = Op.getOperand(3);
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
- Op.getOperand(2));
- }
case KUNPCK: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
@@ -16960,6 +17879,35 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case FIXUPIMMS:
+ case FIXUPIMMS_MASKZ:
+ case FIXUPIMM:
+ case FIXUPIMM_MASKZ:{
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Imm = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
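+ // Merge-masking forms pass Src1 through; the *_MASKZ forms use a zero vector.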
+ SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
+ Src1 : getZeroVector(VT, Subtarget, DAG, dl);
+ // These intrinsics come in two forms, with and without a rounding mode.
+ // If the intrinsic has a rounding-mode operand (7 operands in total), use it;
+ // otherwise default to the current rounding direction.
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ }
case CONVERT_TO_MASK: {
MVT SrcVT = Op.getOperand(1).getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
@@ -16995,6 +17943,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
subVec, subVec, immVal),
Mask, Passthru, Subtarget, DAG);
}
+ case BRCST32x2_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ assert((VT.getScalarType() == MVT::i32 ||
+ VT.getScalarType() == MVT::f32) && "Unexpected type!");
+ // Bitcast Src to packed 64-bit elements; the instruction broadcasts pairs of 32-bit elements.
+ MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
+ MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
+ Src = DAG.getBitcast(BitcastVT, Src);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
default:
break;
}
@@ -17082,7 +18045,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -17163,6 +18126,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ auto &Context = MF.getMMI().getContext();
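+ // Reference the GCC_except_table<N> label that the EH streamer emits for this
+ // function's LSDA.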
+ MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+ Twine(MF.getFunctionNumber()));
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
+ }
+
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
@@ -17192,7 +18165,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
@@ -17206,7 +18179,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget * Subtarget) {
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@@ -17217,7 +18190,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.getOpcode() == ISD::UNDEF)
+ if (Src.isUndef())
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -17237,7 +18210,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -17255,18 +18228,19 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
//SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
-// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
-// read performance monitor counters (x86_rdpmc).
-static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
- SelectionDAG &DAG, const X86Subtarget *Subtarget,
- SmallVectorImpl<SDValue> &Results) {
+/// Handles the lowering of builtin intrinsics that read performance monitor
+/// counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
@@ -17279,7 +18253,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
// Reads the content of a 64-bit performance counter and returns it in the
// registers EDX:EAX.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
@@ -17290,7 +18264,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
}
Chain = HI.getValue(1);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// The EAX register is loaded with the low-order 32 bits. The EDX register
// is loaded with the supported high-order bits of the counter.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
@@ -17307,12 +18281,13 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
Results.push_back(Chain);
}
-// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
-// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
-// also used to custom lower READCYCLECOUNTER nodes.
-static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
- SelectionDAG &DAG, const X86Subtarget *Subtarget,
- SmallVectorImpl<SDValue> &Results) {
+/// Handles the lowering of builtin intrinsics that read the time stamp counter
+/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
+/// READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
@@ -17320,7 +18295,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
@@ -17341,10 +18316,10 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
// Explicitly store the content of ECX at the location passed in input
// to the 'rdtscp' intrinsic.
Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
}
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// The EDX register is loaded with the high-order 32 bits of the MSR, and
// the EAX register is loaded with the low-order 32 bits.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
@@ -17361,7 +18336,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
Results.push_back(Chain);
}
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
@@ -17388,44 +18363,25 @@ static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
return Chain;
}
-/// \brief Lower intrinsics for TRUNCATE_TO_MEM case
-/// return truncate Store/MaskedStore Node
-static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
- SelectionDAG &DAG,
- MVT ElementType) {
- SDLoc dl(Op);
- SDValue Mask = Op.getOperand(4);
- SDValue DataToTruncate = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
+static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
+ SDValue EHGuard = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EHGuard only live in functions using WinEH");
- MVT VT = DataToTruncate.getSimpleValueType();
- MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
-
- if (isAllOnesConstant(Mask)) // return just a truncate store
- return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
- MachinePointerInfo(), SVT, false, false,
- SVT.getScalarSizeInBits()/8);
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
- // are extracted by EXTRACT_SUBVECTOR.
- SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl));
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, SVT.getStoreSize(),
- SVT.getScalarSizeInBits()/8);
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
+ EHInfo->EHGuardFrameIndex = FINode->getIndex();
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr,
- VMask, SVT, MMO, true);
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
}
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -17433,6 +18389,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
if (!IntrData) {
if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
return MarkEHRegistrationNode(Op, DAG);
+ if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+ return MarkEHGuard(Op, DAG);
if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
@@ -17491,7 +18449,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
- Scale, Chain, *Subtarget);
+ Scale, Chain, Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
@@ -17504,7 +18462,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
- *Subtarget);
+ Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
@@ -17532,7 +18490,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
}
// ADC/ADCX/SBB
case ADX: {
- SmallVector<SDValue, 2> Results;
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
@@ -17540,13 +18497,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
Op.getOperand(4), GenCF.getValue(1));
SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
- Op.getOperand(5), MachinePointerInfo(),
- false, false, 0);
+ Op.getOperand(5), MachinePointerInfo());
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86::COND_B, dl, MVT::i8),
Res.getValue(1));
- Results.push_back(SetCC);
- Results.push_back(Store);
+ SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
case COMPRESS_TO_MEM: {
@@ -17554,48 +18509,45 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue DataToCompress = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
-
MVT VT = DataToCompress.getSimpleValueType();
+
+ MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+ assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
if (isAllOnesConstant(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
- MachinePointerInfo(), false, false,
- VT.getScalarSizeInBits()/8);
+ MemIntr->getMemOperand());
SDValue Compressed =
getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
Mask, DAG.getUNDEF(VT), Subtarget, DAG);
return DAG.getStore(Chain, dl, Compressed, Addr,
- MachinePointerInfo(), false, false,
- VT.getScalarSizeInBits()/8);
+ MemIntr->getMemOperand());
}
case TRUNCATE_TO_MEM_VI8:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8);
case TRUNCATE_TO_MEM_VI16:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16);
- case TRUNCATE_TO_MEM_VI32:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
- case EXPAND_FROM_MEM: {
+ case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
- SDValue PassThru = Op.getOperand(3);
+ SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
- MVT VT = Op.getSimpleValueType();
- if (isAllOnesConstant(Mask)) // return just a load
- return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
- false, VT.getScalarSizeInBits()/8);
+ MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+ assert(MemIntr && "Expected MemIntrinsicSDNode!");
- SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
- false, false, false,
- VT.getScalarSizeInBits()/8);
+ EVT VT = MemIntr->getMemoryVT();
- SDValue Results[] = {
- getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
- Mask, PassThru, Subtarget, DAG), Chain};
- return DAG.getMergeValues(Results, dl);
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
+ MemIntr->getMemOperand());
+
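+ // Otherwise build the i1 mask and emit a masked truncating store with the
+ // intrinsic's memory operand.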
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
+ MemIntr->getMemOperand(), true);
}
- case LOADU:
- case LOADA: {
+ case EXPAND_FROM_MEM: {
SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
@@ -17605,13 +18557,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
+ MemIntr->getMemOperand());
+
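+ // The load is always emitted; an all-ones mask returns it directly, otherwise
+ // it feeds the expand node below.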
if (isAllOnesConstant(Mask)) // return just a load
- return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+ return DataToExpand;
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
- MemIntr->getMemOperand(), ISD::NON_EXTLOAD);
+ SDValue Results[] = {
+ getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
+ Mask, PassThru, Subtarget, DAG), Chain};
+ return DAG.getMergeValues(Results, dl);
}
}
}
@@ -17630,25 +18585,24 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, PtrVT,
- FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+ MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+ MachinePointerInfo());
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI->setFrameAddressIsTaken(true);
@@ -17678,8 +18632,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -17687,7 +18640,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const MachineFunction &MF = DAG.getMachineFunction();
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -17703,7 +18656,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
@@ -17720,23 +18673,27 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
- return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
- return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
+ return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
- return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
+bool X86TargetLowering::needsFixedCatchObjects() const {
+ return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -17746,7 +18703,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -17758,8 +18715,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
- Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
- false, false, 0);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
@@ -17769,6 +18725,16 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
+ // If the subtarget is not 64bit, we may need the global base reg
+ // after isel expand pseudo, i.e., after CGBR pass ran.
+ // Therefore, ask for the GlobalBaseReg now, so that the pass
+ // inserts the code for us in case we need it.
+ // Otherwise, we will end up in a situation where we will
+ // reference a virtual register that is not defined!
+ if (!Subtarget.is64Bit()) {
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
+ }
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
@@ -17781,6 +18747,13 @@ SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
}
+SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
+ Op.getOperand(0));
+}
+
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
@@ -17794,9 +18767,9 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
@@ -17812,14 +18785,13 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
- OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
- MachinePointerInfo(TrmpAddr, 2),
- false, false, 2);
+ OutChains[1] =
+ DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
+ /* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
@@ -17827,29 +18799,26 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr, 10),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
- OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 12),
- false, false, 2);
+ OutChains[3] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
+ /* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr, 20),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
- Addr, MachinePointerInfo(TrmpAddr, 22),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
@@ -17909,29 +18878,28 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
- OutChains[0] = DAG.getStore(Root, dl,
- DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
- Trmp, MachinePointerInfo(TrmpAddr),
- false, false, 0);
+ OutChains[0] =
+ DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
- OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 1),
- false, false, 1);
+ OutChains[1] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
+ /* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
- false, false, 1);
+ /* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
- OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
- MachinePointerInfo(TrmpAddr, 6),
- false, false, 1);
+ OutChains[3] =
+ DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
+ /* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -17959,7 +18927,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -17979,8 +18947,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
- SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue CWD =
+ DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
@@ -18014,6 +18982,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
@@ -18044,8 +19013,8 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
- Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
- Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
+ Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
+ Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
@@ -18064,51 +19033,112 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
-static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- MVT OpVT = VT;
- unsigned NumBits = VT.getSizeInBits();
- SDLoc dl(Op);
+ int NumElts = VT.getVectorNumElements();
+ int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+ MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
- if (VT.isVector() && Subtarget->hasAVX512())
- return LowerVectorCTLZ_AVX512(Op, DAG);
+ // Per-nibble leading zero PSHUFB lookup table.
+ const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+ /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+ /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
- Op = Op.getOperand(0);
- if (VT == MVT::i8) {
- // Zero extend to i32 since there is not an i8 bsr.
- OpVT = MVT::i32;
- Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumBytes; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+
+ // Begin by bitcasting the input to a byte vector, then split those bytes
+ // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+ // If the hi input nibble is zero then we add both results together, otherwise
+ // we just take the hi result (by masking the lo result to zero before the
+ // add).
+ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+ SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+ SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+ Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+ SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+ // Merge the result from vXi8 back to VT, working on the lo/hi halves
+ // of the current vector width in the same way we did for the nibbles.
+ // If the upper half of the input element is zero then add the halves'
+ // leading zero counts together, otherwise just use the upper half's.
+ // Double the width of the result until we are at target width.
+ while (CurrVT != VT) {
+ int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+ int CurrNumElts = CurrVT.getVectorNumElements();
+ MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+ MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+ SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+ // Check if the upper half of the input element is zero.
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getBitcast(NextVT, HiZ);
+
+ // Move the upper/lower halves to the lower bits as we'll be extending to
+ // NextVT. Mask the lower result to zero if HiZ is true and add the results
+ // together.
+ SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+ SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+ SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+ R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+ Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+ CurrVT = NextVT;
}
- // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
- SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+ return Res;
+}
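A scalar model of the per-nibble lookup above may help; this is only an illustrative sketch (ctlz8ViaNibbleLUT is a made-up name) showing why masking the lo result with the hi-is-zero test and adding reproduces the byte-wide count. The while loop that follows applies the same upper-half-is-zero test at each doubled element width.

#include <cstdint>

static unsigned ctlz8ViaNibbleLUT(uint8_t X) {
  // Same table as LUT[] above: leading zeros within a 4-bit nibble.
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = LUT[X >> 4];  // count for the high nibble
  unsigned Lo = LUT[X & 0xF]; // count for the low nibble
  // If the high nibble is zero its count is 4 and the low count is added;
  // otherwise the low count is masked away - the PSHUFB/AND/ADD above.
  return (X >> 4) ? Hi : Hi + Lo;
}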
- // If src is zero (i.e. bsr sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue Op0 = Op.getOperand(0);
- // Finally xor with NumBits-1.
- Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
- DAG.getConstant(NumBits - 1, dl, OpVT));
+ if (Subtarget.hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
- if (VT == MVT::i8)
- Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
- return Op;
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 128-bit vector, perform ctlz and concat the result.
+ SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+ DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+ }
+
+ assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+ return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
-static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- EVT OpVT = VT;
+ MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
+
+ if (VT.isVector())
+ return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -18117,11 +19147,22 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
- // Issue a bsr (scan bits in reverse).
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
- // And xor with NumBits-1.
+ if (Opc == ISD::CTLZ) {
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+ }
+
+ // Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
@@ -18136,8 +19177,6 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
if (VT.isVector()) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
SDValue N0 = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
@@ -18146,8 +19185,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
// cttz_undef(x) = (width - 1) - ctlz(lsb)
- if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
- TLI.isOperationLegal(ISD::CTLZ, VT)) {
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
DAG.getNode(ISD::CTLZ, dl, VT, LSB));
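The identity in the comment above can be checked with a scalar sketch (illustrative only; __builtin_clz is the GCC/Clang builtin, and X must be non-zero, matching CTTZ_ZERO_UNDEF):

#include <cstdint>

static unsigned cttzViaCtlz(uint32_t X) {
  uint32_t LSB = X & (0u - X);    // isolate the lowest set bit (modular negate)
  return 31 - __builtin_clz(LSB); // == __builtin_ctz(X) for X != 0
}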
@@ -18176,8 +19214,8 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
-// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
-// ones, and then concatenate the result back.
+/// Break a 256-bit integer operation into two new 128-bit ones and then
+/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -18189,13 +19227,42 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+ SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+ SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+/// Break a 512-bit integer operation into two new 256-bit ones and then
+/// concatenate the result back.
+static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ assert(VT.is512BitVector() && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -18232,7 +19299,7 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -18241,28 +19308,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget->hasInt256())
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
- // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
- // pairs, multiply and truncate.
- if (VT == MVT::v16i8 || VT == MVT::v32i8) {
- if (Subtarget->hasInt256()) {
- if (VT == MVT::v32i8) {
- MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
- SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
- SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
- SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
- SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
- SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
- DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
- }
+ // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
+ // vector pairs, multiply and truncate.
+ if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
+ if (Subtarget.hasInt256()) {
+ // For 512-bit vectors, split into 256-bit vectors to allow the
+ // sign-extension to occur.
+ if (VT == MVT::v64i8)
+ return Lower512IntArith(Op, DAG);
+
+ // For 256-bit vectors, split into 128-bit vectors to allow the
+ // sign-extension to occur. We don't need this on AVX512BW as we can
+ // safely sign-extend to v32i16.
+ if (VT == MVT::v32i8 && !Subtarget.hasBWI())
+ return Lower256IntArith(Op, DAG);
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
@@ -18278,7 +19343,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
} else {
@@ -18294,7 +19359,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Extract the hi parts and sign extend to i16
SDValue AHi, BHi;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
@@ -18322,7 +19387,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
- assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
@@ -18386,8 +19451,122 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
+static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ // Only i8 vectors should need custom lowering after this.
+ assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ "Unsupported vector type");
+
+ // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
+ // logical shift down the upper half and pack back to i8.
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
+ // and then ashr/lshr the upper bits down to the lower bits before multiply.
+ unsigned Opcode = Op.getOpcode();
+ unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
+ unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+
+ // AVX2 implementations - extend xmm subvectors to ymm.
+ if (Subtarget.hasInt256()) {
+ SDValue Lo = DAG.getIntPtrConstant(0, dl);
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+
+ if (VT == MVT::v32i8) {
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
+ ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
+ BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
+ AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
+ Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
+ // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
+ const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23};
+ const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31};
+ return DAG.getNode(X86ISD::PACKUS, dl, VT,
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
+ }
+
+ SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
+ SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
+ SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and zero/sign extend to i16.
+ SDValue ALo, BLo;
+ if (Subtarget.hasSSE41()) {
+ ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+ BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and zero/sign extend to i16.
+ SDValue AHi, BHi;
+ if (Subtarget.hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
+ // pack back to v16i8.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
+ RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
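A per-lane scalar sketch of the MULH lowering above (hypothetical helpers, for illustration only): widen each i8 pair to i16, multiply, and keep the upper byte; the vector code does the same per lane and then packs the upper bytes back together with PACKUS.

#include <cstdint>

static uint8_t mulhu8(uint8_t A, uint8_t B) {
  return uint8_t((uint16_t(A) * uint16_t(B)) >> 8); // ISD::MULHU per lane
}

static int8_t mulhs8(int8_t A, int8_t B) {
  return int8_t((int16_t(A) * int16_t(B)) >> 8);    // ISD::MULHS per lane
}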
+
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetWin64() && "Unexpected target");
+ assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
@@ -18415,8 +19594,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
Entry.Node = StackPtr;
- InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
- false, false, 16);
+ InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
+ MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.isSExt = false;
@@ -18431,21 +19610,39 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args), 0)
+ Callee, std::move(Args))
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
-static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);
- assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
- (VT == MVT::v8i32 && Subtarget->hasInt256()));
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
+ SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
+ SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
+ SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
+ SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
+ SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
+ SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
+ };
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
@@ -18461,16 +19658,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
// <a|b|c|d> => <b|undef|d|undef>
- SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// <e|f|g|h> => <f|undef|h|undef>
- SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
- (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
@@ -18494,7 +19693,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
- if (IsSigned && !Subtarget->hasSSE41()) {
+ if (IsSigned && !Subtarget.hasSSE41()) {
SDValue ShAmt = DAG.getConstant(
31, dl,
DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
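The fix-up this hunk sets up rests on a standard identity, shown here as an illustrative scalar sketch with a hypothetical helper (not the DAG code itself): the signed high half is the unsigned high half minus each operand masked by the other's sign.

#include <cstdint>

static uint32_t mulhsFromMulhu(uint32_t A, uint32_t B) {
  uint32_t HU = uint32_t((uint64_t(A) * uint64_t(B)) >> 32); // unsigned high half
  uint32_t T1 = uint32_t(int32_t(A) >> 31) & B; // B where A is negative, else 0
  uint32_t T2 = uint32_t(int32_t(B) >> 31) & A; // A where B is negative, else 0
  return HU - T1 - T2;                          // high half of the signed product
}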
@@ -18515,19 +19714,19 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
-static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() &&
- (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
+ (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget->hasInt256());
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget->hasVLX() ||
+ bool AShift = LShift && (Subtarget.hasVLX() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -18535,24 +19734,24 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
-bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
- if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
+ if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
- if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
+ if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget->hasVLX())
+ if (VT.is512BitVector() || Subtarget.hasVLX())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -18561,7 +19760,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
@@ -18611,12 +19810,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
- if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
+ if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 ||
- (Subtarget->hasInt256() && VT == MVT::v32i8) ||
+ (Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -18628,11 +19827,16 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ if (VT.is512BitVector()) {
+ assert(VT == MVT::v64i8 && "Unexpected element type!");
+ SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
+ }
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
- if (VT == MVT::v16i8 && Subtarget->hasXOP())
+ if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
@@ -18668,8 +19872,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
+ if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
+ (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
@@ -18726,7 +19930,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
@@ -18746,7 +19950,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
// Check if this build_vector node is doing a splat.
// If so, then set BaseShAmt equal to the splat value.
BaseShAmt = BV->getSplatValue();
- if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+ if (BaseShAmt && BaseShAmt.isUndef())
BaseShAmt = SDValue();
} else {
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
@@ -18787,7 +19991,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
+ if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
@@ -18808,15 +20012,16 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
assert(VT.isVector() && "Custom lowering only for vector shifts!");
- assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
+ assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
@@ -18829,7 +20034,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
- if (Subtarget->hasXOP() &&
+ if (Subtarget.hasXOP() &&
(VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
@@ -18856,7 +20061,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_BIT, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
- if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) &&
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Op.getOpcode() == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
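A scalar check of the transform quoted in the comment above (illustrative only; Amt must be in [0, 63], the same constraint as the hardware shift):

#include <cstdint>

static uint64_t ashr64ViaLshr(uint64_t R, unsigned Amt) {
  uint64_t M = 0x8000000000000000ULL >> Amt; // M = lshr(SIGN_BIT, Amt)
  uint64_t L = R >> Amt;                     // lshr(R, Amt)
  return (L ^ M) - M; // sign-extends from bit 63-Amt: equals ashr(R, Amt)
}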
@@ -18869,10 +20074,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
- if (Op.getOpcode() == ISD::SHL &&
+ if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
(VT == MVT::v8i16 || VT == MVT::v4i32 ||
- (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
- ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ (Subtarget.hasInt256() && VT == MVT::v16i16))) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
@@ -18881,7 +20085,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
for (unsigned i=0; i !=NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF) {
+ if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
@@ -18895,7 +20099,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ SDValue BV = DAG.getBuildVector(VT, dl, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
}
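Per lane, the rewrite above is just the shift-to-multiply identity; a minimal scalar illustration with a hypothetical helper, assuming the constant shift amount is less than the element width:

#include <cstdint>

static uint16_t shlViaMul(uint16_t X, unsigned C) {
  return uint16_t(X * (uint16_t(1) << C)); // same low 16 bits as X << C
}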
@@ -18922,15 +20126,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
// the vector shift into four scalar shifts plus four pairs of vector
// insert/extract.
- if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
- ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
unsigned TargetOpcode = X86ISD::MOVSS;
bool CanBeSimplified;
// The splat value for the first packed shift (the 'X' from the example).
SDValue Amt1 = Amt->getOperand(0);
// The splat value for the second packed shift (the 'Y' from the example).
- SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
- Amt->getOperand(2);
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
// See if it is possible to replace this node with a sequence of
// two shifts followed by a MOVSS/MOVSD
@@ -18991,7 +20193,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (VT == MVT::v4i32) {
unsigned Opc = Op.getOpcode();
SDValue Amt0, Amt1, Amt2, Amt3;
- if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
@@ -19031,14 +20233,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
if (VT == MVT::v16i8 ||
- (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
@@ -19141,7 +20343,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// It's worth extending once and using the v8i32 shifts for 16-bit types, but
// the extra overheads to get from v16i8 to v8i32 make the existing SSE
// solution better.
- if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+ if (Subtarget.hasInt256() && VT == MVT::v8i16) {
MVT ExtVT = MVT::v8i32;
unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
@@ -19151,13 +20353,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
- if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
+ if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
- SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
- SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
@@ -19172,10 +20374,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (VT == MVT::v8i16) {
unsigned ShiftOpcode = Op->getOpcode();
+ // If we have a constant shift amount, the non-SSE41 path is best as
+ // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
+ bool UseSSE41 = Subtarget.hasSSE41() &&
+ !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget->hasSSE41()) {
+ if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
@@ -19192,7 +20399,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
- if (Subtarget->hasSSE41()) {
+ if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
@@ -19231,43 +20438,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.is256BitVector()) {
- unsigned NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- // Extract the two vectors
- SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
- SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
-
- // Recreate the shift amount vectors
- SDValue Amt1, Amt2;
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- // Constant shift amount
- SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
- ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
- ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
-
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
- } else {
- // Variable shift amount
- Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
- Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
- }
-
- // Issue new vector shifts for the smaller types
- V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
- V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
- }
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
return SDValue();
}
-static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -19275,7 +20452,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
SDValue Amt = Op.getOperand(1);
assert(VT.isVector() && "Custom lowering only for vector rotates!");
- assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+ assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
@@ -19363,6 +20540,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_O, DL, MVT::i32),
SDValue(Sum.getNode(), 2));
+ if (N->getValueType(1) == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -19372,10 +20554,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
SDValue SetCC =
- DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(Cond, DL, MVT::i32),
SDValue(Sum.getNode(), 1));
+ if (N->getValueType(1) == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
@@ -19387,9 +20574,9 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget->hasCmpxchg16b();
+ return Subtarget.hasCmpxchg16b();
else
return false;
}
@@ -19409,7 +20596,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
@@ -19446,16 +20633,9 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
-static bool hasMFENCE(const X86Subtarget& Subtarget) {
- // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
- // no-sse2). There isn't any reason to disable it if the target processor
- // supports it.
- return Subtarget.hasSSE2() || Subtarget.is64Bit();
-}
-
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
@@ -19483,7 +20663,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. A mfence flushes the store buffer,
// making the optimization clearly correct.
- // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
+ // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
@@ -19492,7 +20672,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- if (!hasMFENCE(*Subtarget))
+ if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
@@ -19512,7 +20692,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
return Loaded;
}
-static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
@@ -19522,8 +20702,9 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
- if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
- if (hasMFENCE(*Subtarget))
+ if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+ FenceScope == CrossThread) {
+ if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
@@ -19545,7 +20726,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
-static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -19557,7 +20738,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
- assert(Subtarget->is64Bit() && "Node not type legal!");
+ assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
@@ -19587,14 +20768,14 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
-static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (DstVT != MVT::f64)
// This conversion needs to be expanded.
return SDValue();
@@ -19614,7 +20795,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
DAG.getIntPtrConstant(i, dl)));
} else {
- assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() &&
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
DAG.getIntPtrConstant(0, dl)));
@@ -19627,14 +20808,14 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+ SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
DAG.getIntPtrConstant(0, dl));
}
- assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
- Subtarget->hasMMX() && "Unexpected custom BITCAST");
+ assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
+ Subtarget.hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
"Unexpected custom BITCAST");
@@ -19657,12 +20838,11 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
- int NumElts = VT.getVectorNumElements();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
@@ -19713,16 +20893,15 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
// right by 8. It is important to shift as i16s as i8 vector shift isn't
// directly supported.
- SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
- SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ SDValue ShifterV = DAG.getConstant(8, DL, VT);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
- return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
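A scalar model of the i16 case above (illustrative only): the shl/byte-add/srl sequence leaves each 16-bit lane holding the sum of its two bytes, and doing the add byte-wise guarantees no carry leaks between lanes.

#include <cstdint>

static uint16_t sumBytesOfI16(uint16_t V) {
  uint8_t Lo = uint8_t(V);
  uint8_t Hi = uint8_t(V >> 8);
  return uint8_t(Lo + Hi); // shl-by-8, add as i8s, srl-by-8 per lane
}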
-static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
- const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
@@ -19750,17 +20929,14 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
int NumByteElts = VecSize / 8;
MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
SDValue In = DAG.getBitcast(ByteVecVT, Op);
- SmallVector<SDValue, 16> LUTVec;
+ SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumByteElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
- SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
- SmallVector<SDValue, 16> Mask0F(NumByteElts,
- DAG.getConstant(0x0F, DL, MVT::i8));
- SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+ SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
+ SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// High nibbles
- SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
- SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+ SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
// Low nibbles
@@ -19781,8 +20957,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
- const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is128BitVector() &&
@@ -19801,19 +20977,13 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
MVT VT = V.getSimpleValueType();
- SmallVector<SDValue, 32> Shifters(
- VT.getVectorNumElements(),
- DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
- return DAG.getNode(OpCode, DL, VT, V,
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+ SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
+ return DAG.getNode(OpCode, DL, VT, V, ShifterV);
};
auto GetMask = [&](SDValue V, APInt Mask) {
MVT VT = V.getSimpleValueType();
- SmallVector<SDValue, 32> Masks(
- VT.getVectorNumElements(),
- DAG.getConstant(Mask, DL, VT.getVectorElementType()));
- return DAG.getNode(ISD::AND, DL, VT, V,
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+ SDValue MaskV = DAG.getConstant(Mask, DL, VT);
+ return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
};
// We don't want to incur the implicit masks required to SRL vNi8 vectors on
@@ -19852,27 +21022,38 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
DAG);
}
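For context, the GetShift/GetMask helpers above serve the usual shift-and-mask population count; a scalar sketch of that classic bit-math (illustrative only, not lifted from this function) for one 32-bit lane:

#include <cstdint>

static unsigned popcount32Bitmath(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit partial sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
  return (V * 0x01010101u) >> 24;                   // add the four bytes
}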
-static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- // FIXME: Need to add AVX-512 support here!
- assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
- if (!Subtarget->hasSSSE3()) {
+ if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
- if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
unsigned NumElems = VT.getVectorNumElements();
// Extract each 128-bit vector, compute pop count and concat the result.
- SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+ SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+ LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
+ }
+
+ if (VT.is512BitVector() && !Subtarget.hasBWI()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 256-bit vector, compute pop count and concat the result.
+ SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
@@ -19882,26 +21063,184 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
-static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- EVT T = Node->getValueType(0);
- SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
- DAG.getConstant(0, dl, T), Node->getOperand(2));
- return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), negOp,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ // For scalars, it's still beneficial to transfer to/from the SIMD unit to
+ // perform the BITREVERSE.
+ if (!VT.isVector()) {
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+ Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ MVT SVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector()) {
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+
+ MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
+ }
+
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitreverse lowering supported.");
+
+ // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+ // fold the BSWAP of multi-byte elements into the same shuffle.
+ // It's best to shuffle using the second operand as this will implicitly allow
+ // memory folding for multiple vectors.
+ SmallVector<SDValue, 16> MaskElts;
+ for (int i = 0; i != NumElts; ++i) {
+ for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+ int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+ int PermuteByte = SourceByte | (2 << 5);
+ MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+ }
+ }
+
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
+ SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+ Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+ Res, Mask);
+ return DAG.getBitcast(VT, Res);
+}
+
+static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget.hasXOP())
+ return LowerBITREVERSE_XOP(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
+ // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+ Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
+ Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
+ // two nibbles, and a PSHUFB lookup finds the bitreverse of each
+ // 0-15 value (moved to the other nibble).
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
+
+ const int LoLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
+ /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
+ /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
+ /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
+ const int HiLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
+ /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
+ /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
+ /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
+
+ SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
+ HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
+ }
+
+ SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
+ SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+}
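A scalar equivalent of the two PSHUFB lookups above (illustrative; bitreverse8ViaLUT is a made-up name): the low nibble indexes a table of reversed values placed in the high nibble, the high nibble indexes one placed in the low nibble, and OR-ing the two reverses the byte.

#include <cstdint>

static uint8_t bitreverse8ViaLUT(uint8_t X) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0,
                                    0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
                                    0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A,
                                    0x06, 0x0E, 0x01, 0x09, 0x05, 0x0D,
                                    0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[X & 0xF] | HiLUT[X >> 4];
}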
+
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+ unsigned NewOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::ATOMIC_LOAD_ADD:
+ NewOpc = X86ISD::LADD;
+ break;
+ case ISD::ATOMIC_LOAD_SUB:
+ NewOpc = X86ISD::LSUB;
+ break;
+ case ISD::ATOMIC_LOAD_OR:
+ NewOpc = X86ISD::LOR;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ NewOpc = X86ISD::LXOR;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ NewOpc = X86ISD::LAND;
+ break;
+ default:
+ llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+ }
+
+ MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+ return DAG.getMemIntrinsicNode(
+ NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
+ {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+ /*MemVT=*/N->getSimpleValueType(0), MMO);
+}
+
+/// Lower atomic_load_ops into LOCK-prefixed operations.
+static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ unsigned Opc = N->getOpcode();
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
+ // can only be lowered when the result is unused. They should have already
+ // been transformed into a cmpxchg loop in AtomicExpand.
+ if (N->hasAnyUseOfValue(0)) {
+ // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+ // select LXADD if LOCK_SUB can't be selected.
+ if (Opc == ISD::ATOMIC_LOAD_SUB) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
+ RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+ RHS, AN->getMemOperand(), AN->getOrdering(),
+ AN->getSynchScope());
+ }
+ assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+ "Used AtomicRMW ops other than Add should have been expanded!");
+ return N;
+ }
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+ // RAUW the chain, but don't worry about the result, as it's unused.
+ assert(!N->hasAnyUseOfValue(0));
+ DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
+ return SDValue();
}
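
// Editor's note (not part of this patch): the sub -> add rewrite above relies
// on the identity fetch_sub(p, v) == fetch_add(p, -v) under two's-complement
// wraparound, which lets ISel fall back to XADD when a locked SUB form cannot
// be used. A minimal sketch (uses std::atomic from <atomic>; function name
// illustrative only):
static unsigned fetchSubViaAdd(std::atomic<unsigned> &A, unsigned V) {
  return A.fetch_add(0u - V); // returns the same old value as A.fetch_sub(V)
}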
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
@@ -19914,7 +21253,8 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
// FIXME: On 32-bit, store -> fist or movq would be more efficient
// (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
- if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ if (cast<AtomicSDNode>(Node)->getOrdering() ==
+ AtomicOrdering::SequentiallyConsistent ||
!DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
cast<AtomicSDNode>(Node)->getMemoryVT(),
@@ -19955,9 +21295,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+ assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
@@ -19991,7 +21331,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -20051,7 +21391,7 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ return DAG.getBuildVector(NVT, dl, Ops);
}
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
@@ -20059,9 +21399,9 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
InOp, DAG.getIntPtrConstant(0, dl));
}
-static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
// X86 scatter kills mask register, so its type should be added to
@@ -20110,7 +21450,7 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
}
unsigned NumElts = VT.getVectorNumElements();
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
  // AVX512F supports only 512-bit vectors. Either the data or the index
  // should be 512 bits wide. If both the index and data are 256-bit, but
@@ -20150,68 +21490,78 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
- return SDValue(NewScatter.getNode(), 0);
+ return SDValue(NewScatter.getNode(), 1);
}
-static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
- if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
- !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
- // This operation is legal for targets with VLX, but without
- // VLX the vector should be widened to 512 bit
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
- MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
- SDValue Src0 = N->getSrc0();
- Src0 = ExtendToType(Src0, WideDataVT, DAG);
- Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, Src0,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType());
-
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewLoad.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
- return DAG.getMergeValues(RetOps, dl);
- }
- return Op;
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked load op.");
+
+ assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked load op.");
+
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ SDValue Src0 = N->getSrc0();
+ Src0 = ExtendToType(Src0, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+ N->getBasePtr(), Mask, Src0,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType());
+
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
}
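
// Editor's illustrative sketch (not part of this patch): the widening done
// above for an AVX-512 target without VLX, shown with example types that are
// not taken from this diff:
//   %m  : v8i1 mask, %s0 : v8i32 pass-through, result : v8i32
//   widen:   mask -> v16i1 (upper lanes forced to zero),
//            pass-through -> v16i32 (upper lanes undef)
//   load:    one legal 512-bit masked load of v16i32
//   narrow:  extract_subvector of lanes 0..7 back to v8i32
// The masked-store lowering just below mirrors this: widen data and mask,
// emit a 512-bit masked store, no extract needed.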
-static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
- if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
- !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
- // This operation is legal for targets with VLX, but without
- // VLX the vector should be widened to 512 bit
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
- MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
- DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
- Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
- N->isTruncatingStore());
- }
- return Op;
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked store op.");
+
+ assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked store op.");
+
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+ Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->isTruncatingStore());
}
-static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
@@ -20226,7 +21576,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
  // AVX512F supports only 512-bit vectors. Either the data or the index
  // should be 512 bits wide. If both the index and data are 256-bit, but
@@ -20314,8 +21664,7 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
return NOOP;
}
-/// LowerOperation - Provide custom lowering hooks for some operations.
-///
+/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
@@ -20323,8 +21672,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
- case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
- case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
@@ -20377,14 +21731,18 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
- case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG);
- case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
@@ -20417,11 +21775,34 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
}
}
-/// ReplaceNodeResults - Replace a node with an illegal result type
-/// with a new node built out of custom code.
+/// Places new result values for the node in Results (their number
+/// and types must exactly match those of the original return values of
+/// the node), or leaves Results empty, which indicates that the node is not
+/// to be custom lowered after all.
+void X86TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ if (!Res.getNode())
+ return;
+
+ assert((N->getNumValues() <= Res->getNumValues()) &&
+ "Lowering returned the wrong number of results!");
+
+ // Places new result values based on the result number of N.
+ // In some cases (LowerSINT_TO_FP for example) Res has more result values
+ // than the original node; the chain (the last value) should then be dropped.
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
+/// Replace a node with an illegal result type with a new node built out of
+/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
@@ -20432,15 +21813,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
llvm_unreachable("Do not know how to custom type legalize this operation!");
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG by expanding vectors.
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
auto InVT = N->getValueType(0);
auto InVTSize = InVT.getSizeInBits();
const unsigned RegSize =
(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
- assert((!Subtarget->hasAVX512() || RegSize < 512) &&
+ assert((!Subtarget.hasAVX512() || RegSize < 512) &&
"512-bit vector requires AVX512");
- assert((!Subtarget->hasAVX2() || RegSize < 256) &&
+ assert((!Subtarget.hasAVX2() || RegSize < 256) &&
"256-bit vector requires AVX2");
auto ElemVT = InVT.getVectorElementType();
@@ -20503,24 +21884,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
if (StackSlot.getNode())
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
- MachinePointerInfo(),
- false, false, false, 0));
+ Results.push_back(
+ DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
else
Results.push_back(FIST);
}
return;
}
case ISD::UINT_TO_FP: {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (N->getOperand(0).getValueType() != MVT::v2i32 ||
N->getValueType(0) != MVT::v2f32)
return;
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
- SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
- MVT::f64);
- SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
@@ -20588,20 +21967,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
- Regs64bit ? X86::RBX : X86::EBX,
- swapInL, cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
- Regs64bit ? X86::RCX : X86::ECX,
- swapInH, swapInL.getValue(1));
- SDValue Ops[] = { swapInH.getValue(0),
- N->getOperand(1),
- swapInH.getValue(1) };
+ swapInH =
+ DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, cpInH.getValue(1));
+ // If the current function needs the base pointer, RBX,
+ // we shouldn't use cmpxchg directly: lowering that instruction will clobber
+ // RBX, and since RBX will then be a reserved register the register
+ // allocator will not make sure its value is properly saved and restored
+ // around this live range.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
- X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ // ISel prefers the LCMPXCHG64 variant.
+ // If the assert below fires, that is no longer the case, and we need to
+ // teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX. This is a
+ // matter of accepting i64 input for that pseudo and restoring into a
+ // register of the right width in the expand pseudo. Everything else
+ // should just work.
+ assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
+ "Saving only half of the RBX");
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
+ : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
+ SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ HalfT, swapInH.getValue(1));
+ SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
+ RBXSave,
+ /*Glue*/ RBXSave.getValue(2)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ } else {
+ unsigned Opcode =
+ Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX, swapInL,
+ swapInH.getValue(1));
+ SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+ swapInL.getValue(1)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ }
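
// Editor's illustrative sketch (not part of this patch): why RBX needs this
// special handling. CMPXCHG8B/CMPXCHG16B implicitly consume EDX:EAX / RDX:RAX
// (expected value) and ECX:EBX / RCX:RBX (desired value), so when RBX doubles
// as the base pointer the SAVE_RBX pseudo has to bracket the instruction
// roughly like:
//   mov  %rbx, %tmp            ; preserve the base pointer
//   mov  <desired_lo>, %rbx
//   lock cmpxchg16b (%ptr)
//   mov  %tmp, %rbx            ; restore the base pointer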
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -20639,7 +22047,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
break;
}
case ISD::BITCAST: {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0)->getValueType(0);
@@ -20666,7 +22074,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
ToVecInt, DAG.getIntPtrConstant(i, dl)));
- Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+ Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
}
}
}
@@ -20703,7 +22111,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
- case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -20724,7 +22131,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::ADDUS: return "X86ISD::ADDUS";
@@ -20742,7 +22148,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -20750,6 +22158,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
+ case X86ISD::EH_SJLJ_SETUP_DISPATCH:
+ return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
@@ -20757,6 +22167,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
+ case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
+ return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
+ case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
+ return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
+ case X86ISD::LADD: return "X86ISD::LADD";
+ case X86ISD::LSUB: return "X86ISD::LSUB";
+ case X86ISD::LOR: return "X86ISD::LOR";
+ case X86ISD::LXOR: return "X86ISD::LXOR";
+ case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
@@ -20778,8 +22197,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
+ case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
@@ -20802,6 +22223,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
@@ -20842,6 +22264,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
@@ -20852,8 +22275,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::MFENCE: return "X86ISD::MFENCE";
- case X86ISD::SFENCE: return "X86ISD::SFENCE";
- case X86ISD::LFENCE: return "X86ISD::LFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
@@ -20866,6 +22287,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
+ case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
@@ -20878,6 +22300,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
+ case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
@@ -20898,6 +22322,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::ADDS: return "X86ISD::ADDS";
case X86ISD::SUBS: return "X86ISD::SUBS";
case X86ISD::AVG: return "X86ISD::AVG";
@@ -20908,26 +22333,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
+ case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
+ case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
}
return nullptr;
}
-// isLegalAddressingMode - Return true if the addressing mode represented
-// by AM is legal for this target, for a load/store of the specified type.
+/// Return true if the addressing mode represented by AM is legal for this
+/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
- Reloc::Model R = getTargetMachine().getRelocationModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
- unsigned GVFlags =
- Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+ unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
@@ -20939,8 +22365,8 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
- if ((M != CodeModel::Small || R != Reloc::Static) &&
- Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ if ((M != CodeModel::Small || isPositionIndependent()) &&
+ Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
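
// Editor's illustrative sketch (not part of this patch): the addressing mode
// validated here has the general shape
//   BaseReg + Scale * IndexReg + Disp (+ BaseGV), Scale in {1, 2, 4, 8},
// with Disp limited to a sign-extended 32-bit immediate, as checked above.
// A hypothetical fold this legality check enables (names illustrative only):
static int loadElement(int *P, long I) {
  return P[I + 4]; // typically selected as something like movl 16(%rdi,%rsi,4), %eax
}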
@@ -20977,7 +22403,7 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
// variable shifts just as cheap as scalar ones.
- if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
@@ -21026,12 +22452,12 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
- return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
- return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -21062,7 +22488,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- if (!Subtarget->hasAnyFMA())
+ if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
@@ -21086,8 +22512,8 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
-/// isShuffleMaskLegal - Targets can use this to indicate that they only
-/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// Targets can use this to indicate that they only support *some*
+/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
@@ -21121,9 +22547,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
-static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
@@ -21167,21 +22593,21 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
// sinkMBB:
// EAX is live into the sinkMBB
sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(X86::EAX);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return sinkMBB;
}
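
// Editor's illustrative sketch (not part of this patch): the RTM pattern the
// blocks above implement, expressed with the <immintrin.h> intrinsics
// (compile with -mrtm). EAX carries either _XBEGIN_STARTED or the abort
// status, which is what the final COPY out of EAX materializes.
#include <immintrin.h>
static unsigned tryTransactional(int &X) {
  unsigned Status = _xbegin();   // maps onto the xbegin/fallback CFG above
  if (Status == _XBEGIN_STARTED) {
    ++X;                         // transactional region
    _xend();
  }
  return Status;
}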
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
-static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
@@ -21193,32 +22619,31 @@ static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
}
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- unsigned NumArgs = MI->getNumOperands();
+ unsigned NumArgs = MI.getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- if (MI->hasOneMemOperand())
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ if (MI.hasOneMemOperand())
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- BuildMI(*BB, MI, dl,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::XMM0);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
-static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
@@ -21230,93 +22655,90 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- unsigned NumArgs = MI->getNumOperands(); // remove the results
+ unsigned NumArgs = MI.getNumOperands(); // remove the results
for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- if (MI->hasOneMemOperand())
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ if (MI.hasOneMemOperand())
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- BuildMI(*BB, MI, dl,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::ECX);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
-static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// insert input VAL into EAX
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI->getOperand(0).getReg());
+ .addReg(MI.getOperand(0).getReg());
// insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
- .addReg(X86::ECX)
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
// insert zero to EDX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
- .addReg(X86::EDX)
- .addReg(X86::EDX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
+
// insert WRPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
-static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
- .addReg(X86::ECX)
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
// insert RDPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::EAX);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::EAX);
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
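
// Editor's note (not part of this patch): the sequences built by the two
// helpers above, assuming the documented PKU convention that WRPKRU requires
// ECX = 0 and EDX = 0 and that the PKRU value travels through EAX:
//   WRPKRU pseudo:  eax = val ; ecx = 0 ; edx = 0 ; wrpkru
//   RDPKRU pseudo:  ecx = 0 ; rdpkru ; result = eax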
-static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget,
+ unsigned Opc) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
- unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI->getOperand(ValOps).getReg());
+ .addReg(MI.getOperand(ValOps).getReg());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
- .addReg(MI->getOperand(ValOps+1).getReg());
+ .addReg(MI.getOperand(ValOps + 1).getReg());
// The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+ BuildMI(*BB, MI, dl, TII->get(Opc));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
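
// Editor's note (not part of this patch): MONITOR-style instructions take the
// linear address in RAX/EAX, extension flags in ECX, and hints in EDX, which
// is why the pseudo's operands are marshalled into those fixed registers
// above; the Opc parameter presumably lets the same helper serve related
// monitor variants.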
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
@@ -21328,31 +22750,31 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
- assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+ assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
- unsigned DestReg = MI->getOperand(0).getReg();
- MachineOperand &Base = MI->getOperand(1);
- MachineOperand &Scale = MI->getOperand(2);
- MachineOperand &Index = MI->getOperand(3);
- MachineOperand &Disp = MI->getOperand(4);
- MachineOperand &Segment = MI->getOperand(5);
- unsigned ArgSize = MI->getOperand(6).getImm();
- unsigned ArgMode = MI->getOperand(7).getImm();
- unsigned Align = MI->getOperand(8).getImm();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ MachineOperand &Base = MI.getOperand(1);
+ MachineOperand &Scale = MI.getOperand(2);
+ MachineOperand &Index = MI.getOperand(3);
+ MachineOperand &Disp = MI.getOperand(4);
+ MachineOperand &Segment = MI.getOperand(5);
+ unsigned ArgSize = MI.getOperand(6).getImm();
+ unsigned ArgMode = MI.getOperand(7).getImm();
+ unsigned Align = MI.getOperand(8).getImm();
// Memory Reference
- assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
@@ -21521,7 +22943,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
- assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+ assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
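  // Editor's note (not part of this patch): e.g. with align = 8 and
  // addr = 0x1005, aligned_addr = (0x1005 + 7) & ~7 = 0x1008.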
@@ -21563,15 +22985,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
}
// Erase the pseudo instruction
- MI->eraseFromParent();
+ MI.eraseFromParent();
return endMBB;
}
-MachineBasicBlock *
-X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them,
@@ -21602,14 +23022,14 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned CountReg = MI->getOperand(0).getReg();
- int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
- int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+ unsigned CountReg = MI.getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
- if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
+ if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -21618,29 +23038,29 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
- assert((MI->getNumOperands() <= 3 ||
- !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
- MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
- && "Expected last argument to be EFLAGS");
- unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ assert((MI.getNumOperands() <= 3 ||
+ !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
+ MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
+ "Expected last argument to be EFLAGS");
+ unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
- for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
+ for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
- .addFrameIndex(RegSaveFrameIndex)
- .addImm(/*Scale=*/1)
- .addReg(/*IndexReg=*/0)
- .addImm(/*Disp=*/Offset)
- .addReg(/*Segment=*/0)
- .addReg(MI->getOperand(i).getReg())
- .addMemOperand(MMO);
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI.getOperand(i).getReg())
+ .addMemOperand(MMO);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
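
// Editor's note (not part of this patch): in the SysV x86-64 calling
// convention the caller of a varargs function passes in %al the number of
// vector registers used for arguments, which is why the count tested above
// arrives in a register (CountReg) rather than as an immediate.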
@@ -21684,8 +23104,8 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with
// a conditional jump around it.
-static bool isCMOVPseudo(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isCMOVPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_GR8:
@@ -21715,10 +23135,10 @@ static bool isCMOVPseudo(MachineInstr *MI) {
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -21837,8 +23257,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// retq
//
MachineInstr *CascadedCMOV = nullptr;
- MachineInstr *LastCMOV = MI;
- X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ MachineInstr *LastCMOV = &MI;
+ X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
@@ -21849,8 +23269,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition.
- while (NextMIIt != BB->end() &&
- isCMOVPseudo(NextMIIt) &&
+ while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
@@ -21860,10 +23279,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == MI.
- if (LastCMOV == MI &&
- NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
- NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
- NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() &&
+ if (LastCMOV == &MI && NextMIIt != BB->end() &&
+ NextMIIt->getOpcode() == MI.getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
CascadedCMOV = &*NextMIIt;
}
@@ -21885,7 +23304,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
@@ -21976,12 +23395,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
if (CascadedCMOV) {
- MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
DL, TII->get(TargetOpcode::COPY),
CascadedCMOV->getOperand(0).getReg())
- .addReg(MI->getOperand(0).getReg());
+ .addReg(MI.getOperand(0).getReg());
CascadedCMOV->eraseFromParent();
}
@@ -21993,7 +23412,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Combine the following atomic floating-point modification pattern:
// a.store(reg OP a.load(acquire), release)
@@ -22002,52 +23421,55 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
// movss %xmm, (%gpr)
// Or sd equivalent for 64-bit operations.
unsigned MOp, FOp;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
- case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
- case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ case X86::RELEASE_FADD32mr:
+ FOp = X86::ADDSSrm;
+ MOp = X86::MOVSSmr;
+ break;
+ case X86::RELEASE_FADD64mr:
+ FOp = X86::ADDSDrm;
+ MOp = X86::MOVSDmr;
+ break;
}
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- MachineOperand MSrc = MI->getOperand(0);
- unsigned VSrc = MI->getOperand(5).getReg();
- const MachineOperand &Disp = MI->getOperand(3);
- MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
- bool hasDisp = Disp.isGlobal() || Disp.isImm();
- if (hasDisp && MSrc.isReg())
- MSrc.setIsKill(false);
- MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
- .addOperand(/*Base=*/MSrc)
- .addImm(/*Scale=*/1)
- .addReg(/*Index=*/0)
- .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
- .addReg(0);
- MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
- MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
- .addReg(VSrc)
- .addOperand(/*Base=*/MSrc)
- .addImm(/*Scale=*/1)
- .addReg(/*Index=*/0)
- .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
- .addReg(/*Segment=*/0);
- MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ unsigned ValOpIdx = X86::AddrNumOperands;
+ unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(FOp),
+ MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+ .addReg(VSrc);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand &Operand = MI.getOperand(i);
+ // Clear any kill flags on register operands as we'll create a second
+ // instruction using the same address operands.
+ if (Operand.isReg())
+ Operand.setIsKill(false);
+ MIB.addOperand(Operand);
+ }
+ MachineInstr *FOpMI = MIB;
+ MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI.getOperand(i));
+ MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
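
// Editor's illustrative sketch (not part of this patch): the source-level
// pattern the RELEASE_FADD pseudos come from, matching the comment at the
// top of this function; std::atomic<float> from <atomic> is assumed only for
// illustration.
static void atomicFAdd(std::atomic<float> &A, float V) {
  A.store(A.load(std::memory_order_acquire) + V, std::memory_order_release);
  // Per the comment above, expected to select:
  //   addss (%mem), %xmm ; movss %xmm, (%mem)
}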
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
- const bool Is64Bit = Subtarget->is64Bit();
- const bool IsLP64 = Subtarget->isTarget64BitLP64();
+ const bool Is64Bit = Subtarget.is64Bit();
+ const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
@@ -22077,11 +23499,12 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
- bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
- tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
- SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
- sizeVReg = MI->getOperand(1).getReg(),
- physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI.getOperand(1).getReg(),
+ physSPReg =
+ IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
@@ -22113,7 +23536,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -22156,43 +23579,33 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
- MI->getOperand(0).getReg())
- .addReg(mallocPtrVReg).addMBB(mallocMBB)
- .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+ MI.getOperand(0).getReg())
+ .addReg(mallocPtrVReg)
+ .addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg)
+ .addMBB(bumpMBB);
// Delete the original pseudo instruction.
- MI->eraseFromParent();
+ MI.eraseFromParent();
// And we're done.
return continueMBB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- assert(!Subtarget->isTargetMachO());
- DebugLoc DL = MI->getDebugLoc();
- MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
- *BB->getParent(), *BB, MI, DL, false);
- MachineBasicBlock *ResumeBB = ResumeMI->getParent();
- MI->eraseFromParent(); // The pseudo instruction is gone now.
- return ResumeBB;
-}
-
-MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
- if (!Subtarget->is32Bit())
+ if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
@@ -22203,7 +23616,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
- MI->getOperand(0).setMBB(RestoreMBB);
+ MI.getOperand(0).setMBB(RestoreMBB);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
@@ -22212,37 +23625,37 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const Constant *PerFn = MF->getFunction()->getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
- if (IsSEH && Subtarget->is32Bit()) {
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ if (IsSEH && Subtarget.is32Bit()) {
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into calls
  // inside MC; therefore, without the two markers, shrink-wrapping
  // may push the prologue/epilogue past them.
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -22257,86 +23670,89 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
- assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
- assert(MI->getOperand(3).isGlobal() && "This should be a global");
+ assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- Subtarget->is64Bit() ?
- Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
- Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
- if (Subtarget->is64Bit()) {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV64rm), X86::RDI)
- .addReg(X86::RIP)
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ Subtarget.is64Bit() ?
+ Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+ if (Subtarget.is64Bit()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
- } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV32rm), X86::EAX)
- .addReg(0)
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ } else if (!isPositionIndependent()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV32rm), X86::EAX)
- .addReg(TII->getGlobalBaseReg(F))
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(TII->getGlobalBaseReg(F))
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
-X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
- DstReg = MI->getOperand(CurOp++).getReg();
+ DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(RC->hasType(MVT::i32) && "Invalid destination!");
unsigned mainDstReg = MRI.createVirtualRegister(RC);
@@ -22384,16 +23800,15 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
- Reloc::Model RM = MF->getTarget().getRelocationModel();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
- (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+ !isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
@@ -22406,7 +23821,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
- .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
@@ -22415,9 +23830,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.addOperand(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -22428,7 +23843,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -22447,7 +23862,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
- Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
@@ -22461,21 +23876,21 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
-X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
@@ -22485,7 +23900,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -22500,41 +23915,275 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(i), LabelOffset);
+ MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(i), SPOffset);
+ MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
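
The setjmp/longjmp expansions above address three pointer-sized slots in the buffer operand: the frame pointer at the base, the resume label at LabelOffset (one pointer store size), and the stack pointer at SPOffset, which is defined in an elided part of the hunk. A minimal standalone sketch of that assumed layout (plain C++, not LLVM code; the struct and field names are illustrative only):

struct SjLjBuffer {
  void *FramePtr;   // slot 0: reloaded first ("Reload FP")
  void *ResumeAddr; // slot 1: written by setjmp at LabelOffset, reloaded as "IP"
  void *StackPtr;   // slot 2: reloaded at SPOffset before the indirect jump
};

// Three consecutive pointer-sized slots, matching LabelOffset = 1 * store size.
static_assert(sizeof(SjLjBuffer) == 3 * sizeof(void *),
              "buffer is three pointer-sized slots");
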
+void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
+
+ unsigned Op = 0;
+ unsigned VR = 0;
+
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ !isPositionIndependent();
+
+ if (UseImmLabel) {
+ Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ } else {
+ const TargetRegisterClass *TRC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ VR = MRI->createVirtualRegister(TRC);
+ Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+
+ /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
+
+ if (Subtarget.is64Bit())
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB)
+ .addReg(0);
+ else
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
+ .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
+ addFrameReference(MIB, FI, 36);
+ if (UseImmLabel)
+ MIB.addMBB(DispatchBB);
+ else
+ MIB.addReg(VR);
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ MachineModuleInfo *MMI = &MF->getMMI();
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ int FI = MFI->getFunctionContextIndex();
+
+ // Get a mapping of the call site numbers to all of the landing pads they're
+ // associated with.
+ DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+ unsigned MaxCSNum = 0;
+ for (auto &MBB : *MF) {
+ if (!MBB.isEHPad())
+ continue;
+
+ MCSymbol *Sym = nullptr;
+ for (const auto &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+
+ assert(MI.isEHLabel() && "expected EH_LABEL");
+ Sym = MI.getOperand(0).getMCSymbol();
+ break;
+ }
+
+ if (!MMI->hasCallSiteLandingPad(Sym))
+ continue;
+
+ for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
+ CallSiteNumToLPad[CSI].push_back(&MBB);
+ MaxCSNum = std::max(MaxCSNum, CSI);
+ }
+ }
+
+ // Get an ordered list of the machine basic blocks for the jump table.
+ std::vector<MachineBasicBlock *> LPadList;
+ SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+ LPadList.reserve(CallSiteNumToLPad.size());
+
+ for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+ for (auto &LP : CallSiteNumToLPad[CSI]) {
+ LPadList.push_back(LP);
+ InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+ }
+ }
+
+ assert(!LPadList.empty() &&
+ "No landing pad destinations for the dispatch jump table!");
+
+ // Create the MBBs for the dispatch code.
+
+ // Shove the dispatch's address into the return slot in the function context.
+ MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+ DispatchBB->setIsEHPad(true);
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(TrapBB, DL, TII->get(X86::TRAP));
+ DispatchBB->addSuccessor(TrapBB);
+
+ MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(DispContBB);
+
+ // Insert MBBs.
+ MF->push_back(DispatchBB);
+ MF->push_back(DispContBB);
+ MF->push_back(TrapBB);
+
+ // Insert code into the entry block that creates and registers the function
+ // context.
+ SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
+
+ // Create the jump table and associated information
+ MachineJumpTableInfo *JTI =
+ MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
+ unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
+ const X86RegisterInfo &RI = XII->getRegisterInfo();
+
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ if (RI.hasBasePointer(*MF)) {
+ const bool FPIs64Bit =
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setRestoreBasePointer(MF);
+
+ unsigned FP = RI.getFrameRegister(*MF);
+ unsigned BP = RI.getBaseRegister();
+ unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
+ MFI->getRestoreBasePointerOffset())
+ .addRegMask(RI.getNoPreservedMask());
+ } else {
+ BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
+ .addRegMask(RI.getNoPreservedMask());
+ }
+
+ unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
+ 4);
+ BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
+ .addReg(IReg)
+ .addImm(LPadList.size());
+ BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
+
+ unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
+ .addReg(IReg)
+ .addImm(1);
+ BuildMI(DispContBB, DL,
+ TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
+ .addReg(0)
+ .addImm(Subtarget.is64Bit() ? 8 : 4)
+ .addReg(JReg)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+
+ // Add the jump table entries as successors to the MBB.
+ SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+ for (auto &LP : LPadList)
+ if (SeenMBBs.insert(LP).second)
+ DispContBB->addSuccessor(LP);
+
+ // N.B. the order the invoke BBs are processed in doesn't matter here.
+ SmallVector<MachineBasicBlock *, 64> MBBLPads;
+ const MCPhysReg *SavedRegs =
+ Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ for (MachineBasicBlock *MBB : InvokeBBs) {
+ // Remove the landing pad successor from the invoke block and replace it
+ // with the new dispatch block.
+ // Keep a copy of Successors since it's modified inside the loop.
+ SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+ MBB->succ_rend());
+ // FIXME: Avoid quadratic complexity.
+ for (auto MBBS : Successors) {
+ if (MBBS->isEHPad()) {
+ MBB->removeSuccessor(MBBS);
+ MBBLPads.push_back(MBBS);
+ }
+ }
+
+ MBB->addSuccessor(DispatchBB);
+
+ // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents instructions
+    // from being moved to before the EH block, where they would never be
+    // executed.
+ for (auto &II : reverse(*MBB)) {
+ if (!II.isCall())
+ continue;
+
+ DenseMap<unsigned, bool> DefRegs;
+ for (auto &MOp : II.operands())
+ if (MOp.isReg())
+ DefRegs[MOp.getReg()] = true;
+
+ MachineInstrBuilder MIB(*MF, &II);
+ for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
+ unsigned Reg = SavedRegs[RI];
+ if (!DefRegs[Reg])
+ MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ break;
+ }
+ }
+
+ // Mark all former landing pads as non-landing pads. The dispatch is the only
+ // landing pad now.
+ for (auto &LP : MBBLPads)
+ LP->setIsEHPad(false);
+
+ // The instruction is gone now.
+ MI.eraseFromParent();
+ return BB;
+}
+
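
The dispatch block assembled above is a bounds-checked indirect branch on the call-site index kept in the SjLj function context: compare the loaded index against the number of landing pads, trap on overflow, otherwise subtract one and jump through the jump table. A standalone sketch of that control flow (plain C++; the function and parameter names are illustrative, not LLVM API):

#include <cstdlib>

// Conceptual equivalent of DispatchBB -> TrapBB / DispContBB above.
void sjljDispatch(unsigned CallSiteIndex, void (*const LandingPads[])(),
                  unsigned NumLandingPads) {
  if (CallSiteIndex > NumLandingPads)   // CMP32ri + JA_1 to the TRAP block
    std::abort();
  LandingPads[CallSiteIndex - 1]();     // SUB32ri by 1, then JMP32m/JMP64m via the jump table
}
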
// Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer
// to remove extra copies in the loop.
// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
MachineBasicBlock *
-X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- MachineOperand &AddendOp = MI->getOperand(3);
+ MachineOperand &AddendOp = MI.getOperand(3);
// Bail out early if the addend isn't a register - we can't switch these.
if (!AddendOp.isReg())
@@ -22565,55 +24214,120 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
assert(AddendDef.getOperand(i).isReg());
MachineOperand PHISrcOp = AddendDef.getOperand(i);
MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
- if (&PHISrcInst == MI) {
+ if (&PHISrcInst == &MI) {
// Found a matching instruction.
unsigned NewFMAOpc = 0;
- switch (MI->getOpcode()) {
- case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
- case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
- case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
- case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
- case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
- case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
- case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
- case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
- case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
- case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
- case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
- case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
- case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
- case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
- case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
- case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
- case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
- case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
- case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
- case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
-
- case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
- case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
- case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
- case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
- case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
- case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
- case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
- case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
- case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
- case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
- case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
- case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
- default: llvm_unreachable("Unrecognized FMA variant.");
+ switch (MI.getOpcode()) {
+ case X86::VFMADDPDr213r:
+ NewFMAOpc = X86::VFMADDPDr231r;
+ break;
+ case X86::VFMADDPSr213r:
+ NewFMAOpc = X86::VFMADDPSr231r;
+ break;
+ case X86::VFMADDSDr213r:
+ NewFMAOpc = X86::VFMADDSDr231r;
+ break;
+ case X86::VFMADDSSr213r:
+ NewFMAOpc = X86::VFMADDSSr231r;
+ break;
+ case X86::VFMSUBPDr213r:
+ NewFMAOpc = X86::VFMSUBPDr231r;
+ break;
+ case X86::VFMSUBPSr213r:
+ NewFMAOpc = X86::VFMSUBPSr231r;
+ break;
+ case X86::VFMSUBSDr213r:
+ NewFMAOpc = X86::VFMSUBSDr231r;
+ break;
+ case X86::VFMSUBSSr213r:
+ NewFMAOpc = X86::VFMSUBSSr231r;
+ break;
+ case X86::VFNMADDPDr213r:
+ NewFMAOpc = X86::VFNMADDPDr231r;
+ break;
+ case X86::VFNMADDPSr213r:
+ NewFMAOpc = X86::VFNMADDPSr231r;
+ break;
+ case X86::VFNMADDSDr213r:
+ NewFMAOpc = X86::VFNMADDSDr231r;
+ break;
+ case X86::VFNMADDSSr213r:
+ NewFMAOpc = X86::VFNMADDSSr231r;
+ break;
+ case X86::VFNMSUBPDr213r:
+ NewFMAOpc = X86::VFNMSUBPDr231r;
+ break;
+ case X86::VFNMSUBPSr213r:
+ NewFMAOpc = X86::VFNMSUBPSr231r;
+ break;
+ case X86::VFNMSUBSDr213r:
+ NewFMAOpc = X86::VFNMSUBSDr231r;
+ break;
+ case X86::VFNMSUBSSr213r:
+ NewFMAOpc = X86::VFNMSUBSSr231r;
+ break;
+ case X86::VFMADDSUBPDr213r:
+ NewFMAOpc = X86::VFMADDSUBPDr231r;
+ break;
+ case X86::VFMADDSUBPSr213r:
+ NewFMAOpc = X86::VFMADDSUBPSr231r;
+ break;
+ case X86::VFMSUBADDPDr213r:
+ NewFMAOpc = X86::VFMSUBADDPDr231r;
+ break;
+ case X86::VFMSUBADDPSr213r:
+ NewFMAOpc = X86::VFMSUBADDPSr231r;
+ break;
+
+ case X86::VFMADDPDr213rY:
+ NewFMAOpc = X86::VFMADDPDr231rY;
+ break;
+ case X86::VFMADDPSr213rY:
+ NewFMAOpc = X86::VFMADDPSr231rY;
+ break;
+ case X86::VFMSUBPDr213rY:
+ NewFMAOpc = X86::VFMSUBPDr231rY;
+ break;
+ case X86::VFMSUBPSr213rY:
+ NewFMAOpc = X86::VFMSUBPSr231rY;
+ break;
+ case X86::VFNMADDPDr213rY:
+ NewFMAOpc = X86::VFNMADDPDr231rY;
+ break;
+ case X86::VFNMADDPSr213rY:
+ NewFMAOpc = X86::VFNMADDPSr231rY;
+ break;
+ case X86::VFNMSUBPDr213rY:
+ NewFMAOpc = X86::VFNMSUBPDr231rY;
+ break;
+ case X86::VFNMSUBPSr213rY:
+ NewFMAOpc = X86::VFNMSUBPSr231rY;
+ break;
+ case X86::VFMADDSUBPDr213rY:
+ NewFMAOpc = X86::VFMADDSUBPDr231rY;
+ break;
+ case X86::VFMADDSUBPSr213rY:
+ NewFMAOpc = X86::VFMADDSUBPSr231rY;
+ break;
+ case X86::VFMSUBADDPDr213rY:
+ NewFMAOpc = X86::VFMSUBADDPDr231rY;
+ break;
+ case X86::VFMSUBADDPSr213rY:
+ NewFMAOpc = X86::VFMSUBADDPSr231rY;
+ break;
+ default:
+ llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineInstrBuilder MIB =
- BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(1));
+ BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(3))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(1));
MBB->insert(MachineBasicBlock::iterator(MI), MIB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
}
}
@@ -22621,9 +24335,9 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
}
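
The 213-to-231 rewrite above only reorders operands: in the FMA3 naming scheme the digits pick which sources are multiplied and which is added, so 213 computes dst = src2 * src1 + src3 while 231 computes dst = src2 * src3 + src1, and swapping operands 1 and 3 (the addOperand(0), (3), (2), (1) sequence) preserves the value while tying the destination to the accumulator. A scalar sketch of that equivalence (plain C++, not LLVM code):

#include <cassert>

// Scalar model of the two operand forms used by emitFMA3Instr.
double fma213(double src1, double src2, double src3) { return src2 * src1 + src3; }
double fma231(double src1, double src2, double src3) { return src2 * src3 + src1; }

int main() {
  double s1 = 2.0, s2 = 3.0, s3 = 5.0;
  // Swapping operands 1 and 3 turns the 213 form into an equal-valued 231 form.
  assert(fma213(s1, s2, s3) == fma231(s3, s2, s1));
  return 0;
}
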
MachineBasicBlock *
-X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
@@ -22641,8 +24355,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
- case X86::WIN_ALLOCA:
- return EmitLoweredWinAlloca(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
@@ -22679,31 +24391,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
- DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned PushF =
- MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
- unsigned Pop =
- MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
- BuildMI(*BB, MI, DL, TII->get(PushF));
- BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
-
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
+ // Permit reads of the FLAGS register without it being defined.
+ // This intrinsic exists to read external processor state in flags, such as
+ // the trap flag, interrupt flag, and direction flag, none of which are
+ // modeled by the backend.
+ Push->getOperand(2).setIsUndef();
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
+
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
- DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned Push =
- MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
- MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
- BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
@@ -22721,8 +24437,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
@@ -22750,7 +24466,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Get the X86 opcode to use.
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
@@ -22763,35 +24479,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
- X86AddressMode AM;
- MachineOperand &Op = MI->getOperand(0);
- if (Op.isReg()) {
- AM.BaseType = X86AddressMode::RegBase;
- AM.Base.Reg = Op.getReg();
- } else {
- AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = Op.getIndex();
- }
- Op = MI->getOperand(1);
- if (Op.isImm())
- AM.Scale = Op.getImm();
- Op = MI->getOperand(2);
- if (Op.isImm())
- AM.IndexReg = Op.getImm();
- Op = MI->getOperand(3);
- if (Op.isGlobal()) {
- AM.GV = Op.getGlobal();
- } else {
- AM.Disp = Op.getImm();
- }
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
- .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+ .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), CWFrameIdx);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// String/text processing lowering.
@@ -22803,9 +24499,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
case X86::VPCMPESTRM128MEM:
- assert(Subtarget->hasSSE42() &&
+ assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
+ return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -22816,21 +24512,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIREG:
case X86::PCMPESTRIMEM:
case X86::VPCMPESTRIMEM:
- assert(Subtarget->hasSSE42() &&
+ assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
+ return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, Subtarget);
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
+ case X86::MONITORX:
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
// PKU feature
case X86::WRPKRU:
- return EmitWRPKRU(MI, BB, Subtarget);
+ return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
- return EmitRDPKRU(MI, BB, Subtarget);
+ return emitRDPKRU(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
+ return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -22846,6 +24544,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+ case X86::Int_eh_sjlj_setup_dispatch:
+ return EmitSjLjDispatchBlock(MI, BB);
+
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
@@ -22888,6 +24589,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VFMSUBADDPDr213rY:
case X86::VFMSUBADDPSr213rY:
return emitFMA3Instr(MI, BB);
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ unsigned BasePtr =
+ MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ return BB;
+ }
}
}
@@ -22930,33 +24639,9 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::SETCC:
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned NumLoBits = 0;
- switch (IntId) {
- default: break;
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx2_pmovmskb: {
- // High bits of movmskp{s|d}, pmovmskb are known zero.
- switch (IntId) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
- case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
- case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
- case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
- case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
- case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
- case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
- }
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
- break;
- }
- }
+ case X86ISD::MOVMSK: {
+ unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
break;
}
}
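
MOVMSK packs one bit per vector element into the low bits of a scalar result, so every bit at or above the element count is known zero; the rewrite replaces the old per-intrinsic table with that single rule. A small sketch of the known-zero mask for a 32-bit result (plain C++):

#include <cstdint>

// Known-zero bits of an i32 MOVMSK result with NumElts source elements.
uint32_t movmskKnownZero(unsigned NumElts) {
  // e.g. v4f32 MOVMSKPS: 4 live bits, so bits 4..31 are known zero (0xFFFFFFF0).
  return NumElts >= 32 ? 0u : ~((1u << NumElts) - 1u);
}
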
@@ -22974,8 +24659,8 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
}
-/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
-/// node is a GlobalAddress + offset.
+/// Returns true (and the GlobalValue and the offset) if the node is a
+/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
const GlobalValue* &GA,
int64_t &Offset) const {
@@ -22989,11 +24674,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// Performs shuffle combines for 256-bit vectors.
/// FIXME: This could be expanded to support 512 bit vectors as well.
-static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget* Subtarget) {
+static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
@@ -23014,8 +24699,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// RESULT: V + zero extended
//
if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
- V2.getOperand(1).getOpcode() != ISD::UNDEF ||
- V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
return SDValue();
if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
@@ -23060,195 +24744,556 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// Emit a zeroed vector and insert the desired subvector on its
// first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+ SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
return SDValue();
}
+// Attempt to match a combined shuffle mask against supported unary shuffle
+// instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT) {
+ bool FloatDomain = SrcVT.isFloatingPoint() ||
+ (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
+
+ // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
+ if (!FloatDomain && SrcVT.is128BitVector() &&
+ isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ ShuffleVT = MVT::v2i64;
+ return true;
+ }
+
+ // Check if we have SSE3 which will let us use MOVDDUP etc. The
+  // instructions are no slower than UNPCKLPD but have the option to
+ // fold the input operand into even an unaligned memory load.
+ if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v2f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ if (SrcVT.is256BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v4f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v8f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v8f32;
+ return true;
+ }
+ }
+
+ if (SrcVT.is512BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX512() &&
+ "AVX512 required for 512-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v8f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v16f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v16f32;
+ return true;
+ }
+ }
+
+ // Attempt to match against broadcast-from-vector.
+ if (Subtarget.hasAVX2()) {
+ unsigned NumElts = Mask.size();
+ SmallVector<int, 64> BroadcastMask(NumElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
+ ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
+ : MVT::getIntegerVT(EltSize);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
+ Shuffle = X86ISD::VBROADCAST;
+ return true;
+ }
+ }
+
+ return false;
+}
+
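
The masks matched above are just the element-repeat patterns of the corresponding instructions: MOVDDUP repeats the low 64-bit element, MOVSLDUP repeats the even 32-bit elements, and MOVSHDUP the odd ones. A standalone sketch of what the {0, 0}, {0, 0, 2, 2} and {1, 1, 3, 3} masks mean (plain C++, not LLVM code):

#include <array>

using V2d = std::array<double, 2>;
using V4f = std::array<float, 4>;

// out[i] = in[mask[i]] for the masks matched in matchUnaryVectorShuffle.
V2d movddup(const V2d &In) { return {In[0], In[0]}; }                // mask {0, 0}
V4f movsldup(const V4f &In) { return {In[0], In[0], In[2], In[2]}; } // mask {0, 0, 2, 2}
V4f movshdup(const V4f &In) { return {In[1], In[1], In[3], In[3]}; } // mask {1, 1, 3, 3}
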
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+  // Ensure the mask doesn't contain any zero elements.
+ for (int M : Mask) {
+ if (M == SM_SentinelZero)
+ return false;
+ assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
+ "Expected unary shuffle");
+ }
+
+ unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+ // Handle PSHUFLW/PSHUFHW repeated patterns.
+ if (MaskScalarSizeInBits == 16) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ ArrayRef<int> LoMask(Mask.data() + 0, 4);
+ ArrayRef<int> HiMask(Mask.data() + 4, 4);
+
+ // PSHUFLW: permute lower 4 elements only.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ Shuffle = X86ISD::PSHUFLW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(LoMask);
+ return true;
+ }
+
+ // PSHUFHW: permute upper 4 elements only.
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ // Offset the HiMask so that we can create the shuffle immediate.
+ int OffsetHiMask[4];
+ for (int i = 0; i != 4; ++i)
+ OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+ Shuffle = X86ISD::PSHUFHW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+ return true;
+ }
+
+ return false;
+ }
+ return false;
+ }
+
+ // We only support permutation of 32/64 bit elements after this.
+ if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
+ return false;
+
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ bool FloatDomain = SrcVT.isFloatingPoint();
+ if (FloatDomain && !Subtarget.hasAVX())
+ return false;
+
+ // Pre-AVX2 we must use float shuffles on 256-bit vectors.
+ if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
+ FloatDomain = true;
+
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // VPERMILPD can permute with a non-repeating shuffle.
+ if (FloatDomain && MaskScalarSizeInBits == 64) {
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
+ return true;
+ }
+
+ // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
+ SmallVector<int, 4> RepeatedMask;
+ if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
+ return false;
+
+ // Narrow the repeated mask for 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ scaleShuffleMask(2, RepeatedMask, WordMask);
+
+ Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+ ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+}
+
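
The PermuteImm values above are the usual two-bits-per-element selectors; getV4X86ShuffleImm is defined outside this hunk, so the packing shown here is an assumption about its behaviour, with element 0 in the low bits. A sketch under that assumption (plain C++):

#include <cassert>

// Assumed packing of four 0..3 selectors into an 8-bit shuffle immediate.
unsigned packV4ShuffleImm(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(M[i] >= 0 && M[i] < 4 && "selector out of range");
    Imm |= unsigned(M[i]) << (2 * i);
  }
  return Imm; // identity {0, 1, 2, 3} -> 0xE4
}
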
+// Attempt to match a combined unary shuffle mask against supported binary
+// shuffle instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ unsigned &Shuffle, MVT &ShuffleVT) {
+ bool FloatDomain = SrcVT.isFloatingPoint();
+
+ if (SrcVT.is128BitVector()) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ Shuffle = X86ISD::MOVLHPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ Shuffle = X86ISD::MOVHLPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
+ isTargetShuffleEquivalent(
+ Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
+ isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+ 13, 14, 14, 15, 15})) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+ return true;
+ }
+ }
+
+ return false;
+}
+
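
The {0, 0, 1, 1} and {2, 2, 3, 3} matches above rely on the interleave semantics of UNPCKL/UNPCKH with both inputs equal. A small sketch of those semantics for 4 x float (plain C++, not LLVM code):

#include <array>

using V4f = std::array<float, 4>;

// UNPCKLPS/UNPCKHPS interleave the low/high halves of the two inputs; with
// A == B this yields the {0, 0, 1, 1} and {2, 2, 3, 3} patterns matched above.
V4f unpcklps(const V4f &A, const V4f &B) { return {A[0], B[0], A[1], B[1]}; }
V4f unpckhps(const V4f &A, const V4f &B) { return {A[2], B[2], A[3], B[3]}; }
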
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
-/// This is the leaf of the recursive combinine below. When we have found some
+/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
-static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
- int Depth, bool HasPSHUFB, SelectionDAG &DAG,
+static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
+ ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
- assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
+ const X86Subtarget &Subtarget) {
+ assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
// Find the operand that enters the chain. Note that multiple uses are OK
// here, we're not going to remove the operand we find.
- SDValue Input = Op.getOperand(0);
- while (Input.getOpcode() == ISD::BITCAST)
- Input = Input.getOperand(0);
+ Input = peekThroughBitcasts(Input);
MVT VT = Input.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
SDLoc DL(Root);
- if (Mask.size() == 1) {
- int Index = Mask[0];
- assert((Index >= 0 || Index == SM_SentinelUndef ||
- Index == SM_SentinelZero) &&
- "Invalid shuffle index found!");
-
- // We may end up with an accumulated mask of size 1 as a result of
- // widening of shuffle operands (see function canWidenShuffleElements).
- // If the only shuffle index is equal to SM_SentinelZero then propagate
- // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle
- // mask, and therefore the entire chain of shuffles can be folded away.
- if (Index == SM_SentinelZero)
- DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
- else
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
- /*AddTo*/ true);
+ SDValue Res;
+
+ unsigned NumBaseMaskElts = BaseMask.size();
+ if (NumBaseMaskElts == 1) {
+ assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ /*AddTo*/ true);
return true;
}
- // Use the float domain if the operand type is a floating point type.
- bool FloatDomain = VT.isFloatingPoint();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
- // For floating point shuffles, we don't have free copies in the shuffle
- // instructions or the ability to load as part of the instruction, so
- // canonicalize their shuffles to UNPCK or MOV variants.
- //
- // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
- // vectors because it can have a load folded into it that UNPCK cannot. This
- // doesn't preclude something switching to the shorter encoding post-RA.
- //
- // FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.is128BitVector()) {
- if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
- bool Lo = Mask.equals({0, 0});
- unsigned Shuffle;
- MVT ShuffleVT;
- // Check if we have SSE3 which will let us use MOVDDUP. That instruction
- // is no slower than UNPCKLPD but has the option to fold the input operand
- // into even an unaligned memory load.
- if (Lo && Subtarget->hasSSE3()) {
- Shuffle = X86ISD::MOVDDUP;
- ShuffleVT = MVT::v2f64;
- } else {
- // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
- // than the UNPCK variants.
- Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
- ShuffleVT = MVT::v4f32;
- }
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- if (Shuffle == X86ISD::MOVDDUP)
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
- else
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
- /*AddTo*/ true);
- return true;
- }
- if (Subtarget->hasSSE3() &&
- (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
- bool Lo = Mask.equals({0, 0, 2, 2});
- unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
- MVT ShuffleVT = MVT::v4f32;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+  // Don't combine if we are an AVX512/EVEX target and the mask element size
+ // is different from the root element size - this would prevent writemasks
+ // from being reused.
+ // TODO - this currently prevents all lane shuffles from occurring.
+ // TODO - check for writemasks usage instead of always preventing combining.
+ // TODO - attempt to narrow Mask back to writemask size.
+ if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
+ (RootSizeInBits == 512 ||
+ (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
+ return false;
+ }
+
+ // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+
+ // Handle 128-bit lane shuffles of 256-bit vectors.
+ if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
+ !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+ return false; // Nothing to do!
+ MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
+ : MVT::v4i64);
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
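
The VPERM2X128 immediate built above uses one nibble per destination 128-bit lane: the low bits select the source lane and bit 3 of the nibble zeroes it, which is exactly what the 0x8 : (BaseMask[i] & 1) expressions encode. A standalone mirror of that computation (plain C++):

// BaseMask holds two 128-bit lane selectors; a negative selector means
// "zero this lane", encoded by setting bit 3 of the corresponding nibble.
unsigned vperm2x128Imm(int Lane0, int Lane1) {
  unsigned Imm = 0;
  Imm |= (Lane0 < 0 ? 0x8u : unsigned(Lane0 & 1)) << 0;
  Imm |= (Lane1 < 0 ? 0x8u : unsigned(Lane1 & 1)) << 4;
  return Imm; // e.g. swapping the halves, {1, 0}, gives 0x01
}
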
+ // For masks that have been widened to 128-bit elements or more,
+ // narrow back down to 64-bit elements.
+ SmallVector<int, 64> Mask;
+ if (BaseMaskEltSizeInBits > 64) {
+ assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
+ int MaskScale = BaseMaskEltSizeInBits / 64;
+ scaleShuffleMask(MaskScale, BaseMask, Mask);
+ } else {
+ Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ }
+
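
scaleShuffleMask is defined elsewhere in this file; the assumption here is that it expands each wide-element index into MaskScale consecutive narrow-element indices and simply repeats sentinels, which is what lets the rest of the combine work at 64-bit (or finer) granularity. A sketch under that assumption (plain C++):

#include <vector>

// Assumed behaviour of scaleShuffleMask(Scale, WideMask, NarrowMask).
std::vector<int> scaleMask(int Scale, const std::vector<int> &Wide) {
  std::vector<int> Narrow;
  for (int M : Wide)
    for (int i = 0; i < Scale; ++i)
      Narrow.push_back(M < 0 ? M : M * Scale + i); // sentinels stay as-is
  return Narrow; // e.g. Scale = 2, {1, 0} -> {2, 3, 0, 1}
}
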
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
+
+ // Determine the effective mask value type.
+ bool FloatDomain =
+ (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
+ (32 <= MaskEltSizeInBits);
+ MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+ : MVT::getIntegerVT(MaskEltSizeInBits);
+ MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+
+ // Attempt to match the mask against known shuffle patterns.
+ MVT ShuffleVT;
+ unsigned Shuffle, PermuteImm;
+
+ if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Attempt to blend with zero.
+ if (NumMaskElts <= 8 &&
+ ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
+ (Subtarget.hasAVX() && VT.is256BitVector()))) {
+ // Convert VT to a type compatible with X86ISD::BLENDI.
+ // TODO - add 16i16 support (requires lane duplication).
+ MVT ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
+
+ if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
+ /*Low*/ 0) &&
+ NumMaskElts <= ShuffleVT.getVectorNumElements()) {
+ unsigned BlendMask = 0;
+ unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
+ unsigned MaskRatio = ShuffleSize / NumMaskElts;
+
+ if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
+ return false;
+
+ for (unsigned i = 0; i != ShuffleSize; ++i)
+ if (Mask[i / MaskRatio] < 0)
+ BlendMask |= 1u << i;
+
+ SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
- bool Lo = Mask.equals({0, 0, 1, 1});
- unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
- MVT ShuffleVT = MVT::v4f32;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
+ }
+
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+ (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ unsigned InsertPSMask;
+ SDValue V1 = Input, V2 = Input;
+ if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+ Zeroable, Mask, DAG)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
}
- // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
- // variants as none of these have single-instruction variants that are
- // superior to the UNPCK formulation.
- if (!FloatDomain && VT.is128BitVector() &&
- (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
- Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
- Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
- Mask.equals(
- {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
- bool Lo = Mask[0] == 0;
- unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- MVT ShuffleVT;
- switch (Mask.size()) {
- case 8:
- ShuffleVT = MVT::v8i16;
- break;
- case 16:
- ShuffleVT = MVT::v16i8;
- break;
- default:
- llvm_unreachable("Impossible mask size!");
- };
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
- /*AddTo*/ true);
- return true;
- }
-
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
return false;
- // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
- // can replace them with a single PSHUFB instruction profitably. Intel's
- // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
- // in practice PSHUFB tends to be *very* fast so we're more aggressive.
- if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
+ return false;
+
+ bool MaskContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+  // If we have a single input shuffle with different shuffle patterns in the
+  // 128-bit lanes, use the variable mask form of VPERMILPS.
+ // TODO Combine other mask types at higher depths.
+ if (HasVariableMask && !MaskContainsZeros &&
+ ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+ SmallVector<SDValue, 16> VPermIdx;
+ for (int M : Mask) {
+ SDValue Idx =
+ M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+ VPermIdx.push_back(Idx);
+ }
+ MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+ SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
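
The variable-mask path above builds one i32 selector per float element, reduced modulo 4 because VPERMILPS only permutes within each 128-bit lane. A standalone sketch of the control vector it constructs (plain C++; -1 stands in for an undef index):

#include <vector>

// Per-element VPERMILPS control, mirroring the VPermIdx loop above.
std::vector<int> vpermilpsControl(const std::vector<int> &Mask) {
  std::vector<int> Idx;
  for (int M : Mask)
    Idx.push_back(M < 0 ? -1 : M % 4); // undef stays undef, else in-lane index
  return Idx; // e.g. {3, 2, 1, 0, 7, 6, 5, 4} -> {3, 2, 1, 0, 3, 2, 1, 0}
}
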
+ // If we have 3 or more shuffle instructions or a chain involving a variable
+ // mask, we can replace them with a single PSHUFB instruction profitably.
+  // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
+  // instructions, but in practice PSHUFB tends to be *very* fast so we're
+  // more aggressive.
+ if ((Depth >= 3 || HasVariableMask) &&
+ ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (VT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (VT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = VT.getSizeInBits() / 8;
- int Ratio = NumBytes / Mask.size();
+ int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
- if (Mask[i / Ratio] == SM_SentinelUndef) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
- int M = Mask[i / Ratio] != SM_SentinelZero
- ? Ratio * Mask[i / Ratio] + i % Ratio
- : 255;
+ if (M == SM_SentinelZero) {
+ PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ assert ((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
- Op = DAG.getBitcast(ByteVT, Input);
- DCI.AddToWorklist(Op.getNode());
- SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
+ Res = DAG.getBitcast(ByteVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
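// A minimal standalone sketch of the element-to-byte mask expansion used for
// PSHUFB above, assuming a hypothetical v4i32-style mask over a 16-byte vector;
// kSentinelUndef/kSentinelZero are local stand-ins for the SM_Sentinel* values.
#include <cstdio>
#include <vector>

static const int kSentinelUndef = -1;
static const int kSentinelZero = -2;

int main() {
  std::vector<int> Mask = {2, kSentinelZero, 0, kSentinelUndef};
  int NumBytes = 16;
  int Ratio = NumBytes / (int)Mask.size(); // bytes per mask element (4 here)
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == kSentinelUndef)
      std::printf("byte %2d: undef\n", i);
    else if (M == kSentinelZero)
      std::printf("byte %2d: 255 (high bit set, PSHUFB zeroes the byte)\n", i);
    else
      std::printf("byte %2d: source byte %d\n", i, Ratio * M + i % Ratio);
  }
  return 0;
}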
@@ -23288,10 +25333,10 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
ArrayRef<int> RootMask,
- int Depth, bool HasPSHUFB,
+ int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
if (Depth > 8)
@@ -23310,13 +25355,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
- if (!isTargetShuffle(Op.getOpcode()))
- return false;
+ // Extract target shuffle mask and resolve sentinels and inputs.
+ SDValue Input0, Input1;
SmallVector<int, 16> OpMask;
- bool IsUnary;
- bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
- // We only can combine unary shuffles which we can decode the mask for.
- if (!HaveMask || !IsUnary)
+ if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
return false;
assert(VT.getVectorNumElements() == OpMask.size() &&
@@ -23327,6 +25369,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
+ int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
assert(((RootRatio == 1 && OpRatio == 1) ||
@@ -23334,13 +25377,13 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
"Must not have a ratio for both incoming and op masks!");
SmallVector<int, 16> Mask;
- Mask.reserve(std::max(OpMask.size(), RootMask.size()));
+ Mask.reserve(MaskWidth);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
- for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
+ for (int i = 0; i < MaskWidth; ++i) {
int RootIdx = i / RootRatio;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
@@ -23362,45 +25405,56 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
RootMaskedIdx % OpRatio);
}
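// A minimal standalone sketch of the RootRatio/OpRatio merge performed in the
// loop above, restricted to the simple unary case with no sentinel elements;
// composeMasks and the example masks are hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<int> composeMasks(const std::vector<int> &RootMask,
                                     const std::vector<int> &OpMask) {
  int MaskWidth = (int)std::max(OpMask.size(), RootMask.size());
  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
  std::vector<int> Mask(MaskWidth);
  for (int i = 0; i < MaskWidth; ++i) {
    // Widen the root index to the operand mask's granularity.
    int RootMaskedIdx = RootMask[i / RootRatio] * RootRatio + i % RootRatio;
    // Route it through the operand's own mask.
    int OpIdx = RootMaskedIdx / OpRatio;
    Mask[i] = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
  }
  return Mask;
}

int main() {
  // A v4i32-style root mask applied on top of a v8i16-style operand mask.
  std::vector<int> Root = {1, 0, 3, 2};
  std::vector<int> Op = {0, 1, 2, 3, 6, 7, 4, 5};
  for (int M : composeMasks(Root, Op))
    std::printf("%d ", M); // prints: 2 3 0 1 4 5 6 7
  std::printf("\n");
  return 0;
}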
- // See if we can recurse into the operand to combine more things.
- switch (Op.getOpcode()) {
- case X86ISD::PSHUFB:
- HasPSHUFB = true;
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- if (Op.getOperand(0).hasOneUse() &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ // Handle the all undef/zero cases early.
+ if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
+ DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
+ return true;
+ }
+ if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
+ // TODO - should we handle the mixed zero/undef case as well? Just returning
+    // a zero mask will lose information on undef elements, possibly reducing
+ // future combine possibilities.
+ DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
+ Subtarget, DAG, SDLoc(Root)));
+ return true;
+ }
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- assert(Op.getOperand(0) == Op.getOperand(1) &&
- "We only combine unary shuffles!");
- // We can't check for single use, we have to check that this shuffle is the
- // only user.
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ int MaskSize = Mask.size();
+ bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
+ [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
+ bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
+ [MaskSize](int Idx) { return MaskSize <= Idx; });
+
+ // At the moment we can only combine unary shuffle mask cases.
+ if (UseInput0 && UseInput1)
+ return false;
+ else if (UseInput1) {
+ std::swap(Input0, Input1);
+ ShuffleVectorSDNode::commuteMask(Mask);
}
+ assert(Input0 && "Shuffle with no inputs detected");
+
+ HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+
+ // See if we can recurse into Input0 (if it's a target shuffle).
+ if (Op->isOnlyUserOf(Input0.getNode()) &&
+ combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
+ HasVariableMask, DAG, DCI, Subtarget))
+ return true;
+
// Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with squential pairs of
+ // to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 16> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
- WidenedMask.clear();
}
- return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
- Subtarget);
+ return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
+ DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
@@ -23410,8 +25464,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
+ SmallVector<SDValue, 2> Ops;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary);
+ bool HaveMask =
+ getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
@@ -23647,9 +25703,9 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
}
/// \brief Try to combine x86 target specific shuffles.
-static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
@@ -23681,8 +25737,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
auto Op0 = N.getOperand(0);
auto Op1 = N.getOperand(1);
- if (Op0.getOpcode() == ISD::UNDEF &&
- Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
unsigned NumElts = VT.getVectorNumElements();
@@ -23719,6 +25774,129 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}
+ // Attempt to merge blend(insertps(x,y),zero).
+ if (V0.getOpcode() == X86ISD::INSERTPS ||
+ V1.getOpcode() == X86ISD::INSERTPS) {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+
+ // Determine which elements are known to be zero.
+ SmallVector<int, 8> TargetMask;
+ SmallVector<SDValue, 2> BlendOps;
+ if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
+ return SDValue();
+
+      // Helper function to take an inner insertps node and attempt to
+      // merge the blend-with-zero into its zero mask.
+ auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
+ if (V.getOpcode() != X86ISD::INSERTPS)
+ return SDValue();
+ SDValue Op0 = V.getOperand(0);
+ SDValue Op1 = V.getOperand(1);
+ SDValue Op2 = V.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+
+        // Check each element of the blend node's target mask - it must either
+        // be zeroable (and update the zero mask) or select the element from
+        // the inner insertps node.
+ for (int i = 0; i != 4; ++i)
+ if (TargetMask[i] < 0)
+ InsertPSMask |= (1u << i);
+ else if (TargetMask[i] != (i + Offset))
+ return SDValue();
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ };
+
+ if (SDValue V = MergeInsertPSAndBlend(V0, 0))
+ return V;
+ if (SDValue V = MergeInsertPSAndBlend(V1, 4))
+ return V;
+ }
+ return SDValue();
+ }
+ case X86ISD::INSERTPS: {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ SDValue Op2 = N.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
+ unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
+ unsigned ZeroMask = InsertPSMask & 0xF;
+
+ // If we zero out all elements from Op0 then we don't need to reference it.
+ if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // If we zero out the element from Op1 then we don't need to reference it.
+ if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // Attempt to merge insertps Op1 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask1;
+ SmallVector<SDValue, 2> Ops1;
+ if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
+ int M = TargetMask1[SrcIdx];
+ if (isUndefOrZero(M)) {
+ // Zero/UNDEF insertion - zero out element and remove dependency.
+ InsertPSMask |= (1u << DstIdx);
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+ // Update insertps mask srcidx and reference the source input directly.
+ assert(0 <= M && M < 8 && "Shuffle index out of range");
+ InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
+ Op1 = Ops1[M < 4 ? 0 : 1];
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+
+ // Attempt to merge insertps Op0 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask0;
+ SmallVector<SDValue, 2> Ops0;
+ if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
+ return SDValue();
+
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ int M = TargetMask0[i];
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (isUndefOrZero(M)) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+      // The input vector element must be in place (from either input).
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
+
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
+
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
return SDValue();
}
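// A minimal standalone sketch of the INSERTPS control-byte layout the case above
// relies on: bits [7:6] select the source element, bits [5:4] the destination
// element, and bits [3:0] the zero mask; the immediate used here is hypothetical.
#include <cstdio>

struct InsertPSImm { unsigned SrcIdx, DstIdx, ZeroMask; };

static InsertPSImm decodeInsertPS(unsigned Imm) {
  return {(Imm >> 6) & 0x3, (Imm >> 4) & 0x3, Imm & 0xF};
}

int main() {
  unsigned Imm = (2u << 6) | (1u << 4) | 0x8; // src elt 2 -> dst elt 1, zero lane 3
  InsertPSImm D = decodeInsertPS(Imm);
  std::printf("imm=0x%02x src=%u dst=%u zeromask=0x%x\n", Imm, D.SrcIdx, D.DstIdx,
              D.ZeroMask);
  return 0;
}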
default:
@@ -23814,12 +25992,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
- if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
return SDValue();
// We only handle target-independent shuffles.
@@ -23865,13 +26043,10 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}
-/// PerformShuffleCombine - Performs several different shuffle combines.
-static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// Don't create instructions with illegal types after legalize types has run.
@@ -23886,9 +26061,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return AddSub;
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
+ if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
- return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+ return combineShuffle256(N, DAG, DCI, Subtarget);
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
@@ -23903,8 +26078,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
// potentially need to be further expanded (or custom lowered) into a
// less optimal sequence of dag nodes.
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
- N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
- N0.getOpcode() == ISD::BITCAST) {
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
SDValue BC0 = N0.getOperand(0);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
@@ -23936,7 +26115,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
- return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
}
}
}
@@ -23952,9 +26131,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return LD;
if (isTargetShuffle(N->getOpcode())) {
- SDValue Shuffle =
- PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
- if (Shuffle.getNode())
+ if (SDValue Shuffle =
+ combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
@@ -23973,8 +26151,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
-/// specific shuffle of a load can be folded into a single element load.
+/// Check if a vector extract from a target-specific shuffle of a load can be
+/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
@@ -24012,9 +26190,10 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleOps;
bool UnaryShuffle;
if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
- ShuffleMask, UnaryShuffle))
+ ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
@@ -24029,12 +26208,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return DAG.getUNDEF(EltVT);
assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
- : InVec.getOperand(1);
+ SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
+ : ShuffleOps[1];
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getNumOperands() > 1 &&
- InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses =
+ (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
@@ -24068,18 +26247,16 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
- : InVec.getOperand(1);
- Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
- InVec.getOperand(0), Shuffle,
- &ShuffleMask[0]);
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
+ Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
+ ShuffleMask);
Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -24108,8 +26285,8 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
- if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
- (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+ if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64)) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getOperand(0).getValueType() == VT) {
@@ -24121,13 +26298,12 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
-/// generation and convert it from being a bunch of shuffles and extracts
-/// into a somewhat faster sequence. For i686, the best sequence is apparently
-/// storing the value and loading scalars back, while for x64 we should
-/// use 64-bit extracts and shifts.
-static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+/// Detect vector gather/scatter index generation and convert it from being a
+/// bunch of shuffles and extracts into a somewhat faster sequence.
+/// For i686, the best sequence is apparently storing the value and loading
+/// scalars back, while for x64 we should use 64-bit extracts and shifts.
+static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
@@ -24136,25 +26312,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32) {
+ InputVector.getValueType() == MVT::v2i32 &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) == 0) {
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
// The bitcast source is a direct mmx result.
- SDValue MMXSrc = InputVector.getNode()->getOperand(0);
if (MMXSrc.getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
-
- // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
- if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
- MMXSrc.getValueType() == MVT::i64) {
- SDValue MMXSrcOp = MMXSrc.getOperand(0);
- if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
- MMXSrcOp.getValueType() == MVT::v1i64 &&
- MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0), MMXSrcOp.getOperand(0));
- }
+ return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
EVT VT = N->getValueType(0);
@@ -24236,7 +26401,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
EVT ElementType = InputVector.getValueType().getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
@@ -24251,10 +26416,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
// Load the scalar.
- Vals[i] = DAG.getLoad(ElementType, dl, Ch,
- ScalarAddr, MachinePointerInfo(),
- false, false, false, 0);
-
+ Vals[i] =
+ DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
}
}
@@ -24272,55 +26435,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue
-transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc dl(N);
- SDValue Cond = N->getOperand(0);
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
-
- if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
- SDValue CondSrc = Cond->getOperand(0);
- if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
- Cond = CondSrc->getOperand(0);
- }
-
- if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
- return SDValue();
-
- // A vselect where all conditions and data are constants can be optimized into
- // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
- if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
- ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
- return SDValue();
-
- unsigned MaskValue = 0;
- if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
- return SDValue();
-
- MVT VT = N->getSimpleValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> ShuffleMask(NumElems, -1);
- for (unsigned i = 0; i < NumElems; ++i) {
- // Be sure we emit undef where we can.
- if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
- ShuffleMask[i] = -1;
- else
- ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
- }
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
- return SDValue();
- return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
-}
-
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
-/// nodes.
-static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Do target-specific dag combines on SELECT and VSELECT nodes.
+static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
@@ -24337,8 +26455,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
- (Subtarget->hasSSE2() ||
- (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+ (Subtarget.hasSSE2() ||
+ (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
@@ -24476,7 +26594,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
EVT CondVT = Cond.getValueType();
- if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+ if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
@@ -24487,7 +26605,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16) &&
- !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
+ !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
DCI.AddToWorklist(Cond.getNode());
return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
@@ -24625,8 +26743,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
- ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
- (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
@@ -24730,25 +26848,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
// condition so that the blends can use the high bit of each element and use
@@ -24780,10 +26879,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
- if (VT.is128BitVector() && !Subtarget->hasSSE41())
+ if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
- if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
+ if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -24837,6 +26936,73 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Combine:
+/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
+/// to:
+/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
+ SelectionDAG &DAG) {
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
+
+ // This only applies to variations of the common case:
+ // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
+ // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
+ // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
+ // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
+ // Using the proper condcodes (see below), overflow is checked for.
+
+ // FIXME: We can generalize both constraints:
+ // - XOR/OR/AND (if they were made to survive AtomicExpand)
+ // - LHS != 1
+ // if the result is compared.
+
+ SDValue CmpLHS = Cmp.getOperand(0);
+ SDValue CmpRHS = Cmp.getOperand(1);
+
+ if (!CmpLHS.hasOneUse())
+ return SDValue();
+
+ auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
+ if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+ return SDValue();
+
+ const unsigned Opc = CmpLHS.getOpcode();
+
+ if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
+ return SDValue();
+
+ SDValue OpRHS = CmpLHS.getOperand(2);
+ auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
+ if (!OpRHSC)
+ return SDValue();
+
+ APInt Addend = OpRHSC->getAPIntValue();
+ if (Opc == ISD::ATOMIC_LOAD_SUB)
+ Addend = -Addend;
+
+ if (CC == X86::COND_S && Addend == 1)
+ CC = X86::COND_LE;
+ else if (CC == X86::COND_NS && Addend == 1)
+ CC = X86::COND_G;
+ else if (CC == X86::COND_G && Addend == -1)
+ CC = X86::COND_GE;
+ else if (CC == X86::COND_LE && Addend == -1)
+ CC = X86::COND_L;
+ else
+ return SDValue();
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+ DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+ return LockOp;
+}
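// A minimal standalone check of the scalar identities behind the condition-code
// rewrites above; it only exercises the non-overflowing range, while the combine
// itself additionally relies on the x86 condition codes consulting OF.
#include <cassert>

int main() {
  for (long long x = -1000; x <= 1000; ++x) {
    assert((x < 0) == (x + 1 <= 0));  // COND_S  -> COND_LE after adding 1
    assert((x >= 0) == (x + 1 > 0));  // COND_NS -> COND_G  after adding 1
    assert((x > 0) == (x - 1 >= 0));  // COND_G  -> COND_GE after subtracting 1
    assert((x <= 0) == (x - 1 < 0));  // COND_LE -> COND_L  after subtracting 1
  }
  return 0;
}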
+
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
@@ -24853,10 +27019,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
- // Quit if not CMP and SUB with its value result used.
- if (Cmp.getOpcode() != X86ISD::CMP &&
- (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
- return SDValue();
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
@@ -24890,6 +27056,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AssertZext ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
@@ -24897,7 +27064,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
- if (OpIdx == -1)
+ if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
@@ -25008,10 +27175,20 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
return true;
}
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG) {
+ if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+ return R;
+ return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
-static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
// If the flag operand isn't dead, don't touch this CMOV.
@@ -25034,15 +27211,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
- SDValue Flags;
-
- Flags = checkBoolTestSetCCCombine(Cond, CC);
- if (Flags.getNode() &&
- // Extra check as FCMOV only supports a subset of X86 cond.
- (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
- SDValue Ops[] = { FalseOp, TrueOp,
- DAG.getConstant(CC, DL, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ // Try to simplify the EFLAGS and condition code operands.
+ // We can't always do this as FCMOV only supports a subset of X86 cond.
+ if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+ if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+ SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+ Flags};
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ }
}
// If this is a select between two integer constants, try to do some
@@ -25218,11 +27394,216 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformMulCombine - Optimize a single multiply with constant into two
-/// in order to implement it with two cheaper instructions, e.g.
-/// LEA + SHL, LEA + LEA.
-static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+ EVT VT = N->getOperand(0).getValueType();
+ if (VT.getScalarSizeInBits() != 32)
+ return false;
+
+ assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+ unsigned SignBits[2] = {1, 1};
+ bool IsPositive[2] = {false, false};
+ for (unsigned i = 0; i < 2; i++) {
+ SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
+    // compute the sign bits for it separately.
+ if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+ // For anyextend, it is safe to assume an appropriate number of leading
+ // sign/zero bits.
+ if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+ SignBits[i] = 25;
+ else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+ MVT::i16)
+ SignBits[i] = 17;
+ else
+ return false;
+ IsPositive[i] = true;
+ } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+      // All the operands of BUILD_VECTOR need to be integer constants.
+      // Find the smallest value range to which all the operands belong.
+ SignBits[i] = 32;
+ IsPositive[i] = true;
+ for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+ if (SubOp.isUndef())
+ continue;
+ auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+ if (!CN)
+ return false;
+ APInt IntVal = CN->getAPIntValue();
+ if (IntVal.isNegative())
+ IsPositive[i] = false;
+ SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+ }
+ } else {
+ SignBits[i] = DAG.ComputeNumSignBits(Opd);
+ if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+ IsPositive[i] = true;
+ }
+ }
+
+ bool AllPositive = IsPositive[0] && IsPositive[1];
+ unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
+  // When both ranges are within [-128, 127], use MULS8 mode.
+  if (MinSignBits >= 25)
+    Mode = MULS8;
+  // When both ranges are within [0, 255], use MULU8 mode.
+  else if (AllPositive && MinSignBits >= 24)
+    Mode = MULU8;
+  // When both ranges are within [-32768, 32767], use MULS16 mode.
+  else if (MinSignBits >= 17)
+    Mode = MULS16;
+  // When both ranges are within [0, 65535], use MULU16 mode.
+  else if (AllPositive && MinSignBits >= 16)
+    Mode = MULU16;
+ else
+ return false;
+ return true;
+}
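// A minimal standalone sketch of the sign-bit thresholds used above, restricted
// to scalar compile-time constants; numSignBits roughly mirrors
// APInt::getNumSignBits and the kMUL* names are local stand-ins.
#include <algorithm>
#include <cstdint>
#include <cstdio>

enum ShrinkModeSketch { kMULS8, kMULU8, kMULS16, kMULU16, kNone };

static unsigned numSignBits(int32_t V) {
  uint32_t U = (uint32_t)V, Sign = U >> 31;
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
    ++N; // count leading bits equal to the sign bit
  return N;
}

static ShrinkModeSketch selectMode(int32_t A, int32_t B) {
  unsigned MinSignBits = std::min(numSignBits(A), numSignBits(B));
  bool AllPositive = A >= 0 && B >= 0;
  if (MinSignBits >= 25) return kMULS8;                 // both in [-128, 127]
  if (AllPositive && MinSignBits >= 24) return kMULU8;  // both in [0, 255]
  if (MinSignBits >= 17) return kMULS16;                // both in [-32768, 32767]
  if (AllPositive && MinSignBits >= 16) return kMULU16; // both in [0, 65535]
  return kNone;
}

int main() {
  std::printf("%d %d %d %d\n", selectMode(100, -100), selectMode(200, 10),
              selectMode(-30000, 5), selectMode(60000, 2));
  // prints: 0 1 2 3 (MULS8, MULU8, MULS16, MULU16)
  return 0;
}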
+
+/// When the operands of a vector mul are extended from smaller size values,
+/// like i8 and i16, the type of the mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+/// %2 = sext/zext <N x i8> %1 to <N x i32>
+/// %4 = sext/zext <N x i8> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+/// %2 = zext/sext <N x i16> %1 to <N x i32>
+/// %4 = zext/sext <N x i16> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+  // pmulld is supported since SSE4.1. It is better to use pmulld
+  // instead of pmullw+pmulhw.
+ if (Subtarget.hasSSE41())
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(N, DAG, Mode))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getOperand(0).getValueType();
+ unsigned RegSize = 128;
+ MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+ EVT ReducedVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+ // Shrink the operands of mul.
+ SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+ SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+ if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8) {
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
+ } else {
+ MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
+
+      // Repack the lower and higher parts of the mul result into a wider
+      // result.
+      // Generate a shuffle functioning as punpcklwd.
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+      // Generate a shuffle functioning as punpckhwd.
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+ }
+ } else {
+    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+    // to legalize the mul explicitly: implicit legalization of a <4 x i16> to
+    // <4 x i32> mul sometimes involves unnecessary unpack instructions, which
+    // we avoid by explicitly extending <4 x i16> to <8 x i16> (concatenating
+    // the <4 x i16> val with <4 x i16> undef).
+ //
+ // Legalize the operands of mul.
+ SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+ DAG.getUNDEF(ReducedVT));
+ Ops[0] = NewN0;
+ NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+ Ops[0] = NewN1;
+ NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+ if (Mode == MULU8 || Mode == MULS8) {
+ // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+ // part is needed.
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+ // convert the type of mul result to VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG,
+ DL, ResVT, Mul);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else {
+ // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+ // MULU16/MULS16, both parts are needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ OpsVT, NewN0, NewN1);
+
+      // Repack the lower and higher parts of the mul result into a wider
+      // result. Make sure the type of the mul result is VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+ Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+}
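// A minimal standalone check of the repacking step above for the unsigned case:
// for 16-bit operands, the full 32-bit product is exactly the PMULLW lane
// interleaved with the PMULHUW lane (the signed PMULHW case is analogous).
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a = 0; a < 65536; a += 257)
    for (uint32_t b = 0; b < 65536; b += 263) {
      uint32_t Full = a * b;
      uint16_t Lo = (uint16_t)Full;         // pmullw lane
      uint16_t Hi = (uint16_t)(Full >> 16); // pmulhuw lane
      assert(Full == ((uint32_t)Lo | ((uint32_t)Hi << 16)));
    }
  return 0;
}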
+
+/// Optimize a single multiply with constant into two operations in order to
+/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
+static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalize() && VT.isVector())
+ return reduceVMULWidth(N, DAG, Subtarget);
+
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
@@ -25230,7 +27611,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
@@ -25307,7 +27687,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
@@ -25320,7 +27700,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- APInt ShAmt = N1C->getAPIntValue();
+ const APInt &ShAmt = N1C->getAPIntValue();
Mask = Mask.shl(ShAmt);
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -25367,7 +27747,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
@@ -25424,11 +27804,11 @@ static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
- (!Subtarget->hasInt256() ||
+ (!Subtarget.hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
@@ -25436,7 +27816,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
- APInt ShiftAmt = AmtSplat->getAPIntValue();
+ const APInt &ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount =
VT.getSimpleVT().getVectorElementType().getSizeInBits();
@@ -25451,16 +27831,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformShiftCombine - Combine shifts.
-static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (N->getOpcode() == ISD::SHL)
- if (SDValue V = PerformSHLCombine(N, DAG))
+ if (SDValue V = combineShiftLeft(N, DAG))
return V;
if (N->getOpcode() == ISD::SRA)
- if (SDValue V = PerformSRACombine(N, DAG))
+ if (SDValue V = combineShiftRightAlgebraic(N, DAG))
return V;
// Try to fold this logical shift into a zero vector.
@@ -25471,17 +27850,17 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
-// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
-// and friends. Likewise for OR -> CMPNEQSS.
-static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
+/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
+/// OR -> CMPNEQSS.
+static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
@@ -25530,7 +27909,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
@@ -25547,7 +27926,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
- if (is64BitFP && !Subtarget->is64Bit()) {
+ if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones of all zeroes, we don't need all the
@@ -25574,34 +27953,47 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
-/// so it can be folded inside ANDNP.
-static bool CanFoldXORWithAllOnes(const SDNode *N) {
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::AND);
+
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
- // Match direct AllOnes for 128 and 256-bit vectors
- if (ISD::isBuildVectorAllOnes(N))
- return true;
+ if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
+ VT != MVT::v8i64 && VT != MVT::v16i32 &&
+ VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
+ return SDValue();
- // Look through a bit convert.
- if (N->getOpcode() == ISD::BITCAST)
- N = N->getOperand(0).getNode();
-
- // Sometimes the operand may come from a insert_subvector building a 256-bit
- // allones vector
- if (VT.is256BitVector() &&
- N->getOpcode() == ISD::INSERT_SUBVECTOR) {
- SDValue V1 = N->getOperand(0);
- SDValue V2 = N->getOperand(1);
-
- if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V1.getOperand(0).getOpcode() == ISD::UNDEF &&
- ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
- ISD::isBuildVectorAllOnes(V2.getNode()))
- return true;
- }
+ // Canonicalize XOR to the left.
+ if (N1.getOpcode() == ISD::XOR)
+ std::swap(N0, N1);
- return false;
+ if (N0.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+
+ N01 = peekThroughBitcasts(N01);
+
+ // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
+ // insert_subvector building a 256-bit AllOnes vector.
+ if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
+ if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
+ return SDValue();
+
+ SDValue V1 = N01->getOperand(0);
+ SDValue V2 = N01->getOperand(1);
+ if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
+ !V1.getOperand(0).isUndef() ||
+ !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
+ !ISD::isBuildVectorAllOnes(V2.getNode()))
+ return SDValue();
+ }
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -25610,7 +28002,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
// some of the transition sequences.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
@@ -25660,8 +28052,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
if (RHSConstSplat) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
SDValue(RHSConstSplat, 0));
- SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
+ N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
@@ -25687,9 +28078,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
}
}
-static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
@@ -25705,8 +28096,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// The other side of the AND should be a splat of 2^C, where C
// is the number of bits in the source type.
- if (N1.getOpcode() == ISD::BITCAST)
- N1 = N1.getOperand(0);
+ N1 = peekThroughBitcasts(N1);
if (N1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
@@ -25715,10 +28105,11 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
EVT SrcType = Shuffle->getValueType(0);
// We expect a single-source shuffle
- if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ if (!Shuffle->getOperand(1)->isUndef())
return SDValue();
unsigned SrcSize = SrcType.getScalarSizeInBits();
+ unsigned NumElems = SrcType.getVectorNumElements();
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
@@ -25742,7 +28133,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// the source and dest type.
unsigned ZextRatio = ResSize / SrcSize;
bool IsZext = true;
- for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ for (unsigned i = 0; i != NumElems; ++i) {
if (i % ZextRatio) {
if (Shuffle->getMaskElt(i) > 0) {
// Expected undef
@@ -25765,8 +28156,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
// (instead of undef) where the k elements come from the zero vector.
SmallVector<int, 8> Mask;
- unsigned NumElems = SrcType.getVectorNumElements();
- for (unsigned i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
if (i % ZextRatio)
Mask.push_back(NumElems);
else
@@ -25781,7 +28171,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
unsigned FPOpcode = ISD::DELETED_NODE;
if (N->getOpcode() == ISD::AND)
FPOpcode = X86ISD::FAND;
@@ -25798,8 +28188,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- ((Subtarget->hasSSE1() && VT == MVT::i32) ||
- (Subtarget->hasSSE2() && VT == MVT::i64))) {
+ ((Subtarget.hasSSE1() && VT == MVT::i32) ||
+ (Subtarget.hasSSE2() && VT == MVT::i64))) {
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
@@ -25812,21 +28202,63 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+
+ // TODO: Use AssertSext to mark any nodes that have the property of producing
+ // all-ones or all-zeros. Then check for that node rather than particular
+ // opcodes.
+ if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ return SDValue();
+
+ // The existence of the PCMP node guarantees that we have the required SSE2 or
+ // AVX2 for a shift of this vector type, but there is no vector shift by
+  // immediate for byte elements (there is no PSRLB). 512-bit vectors use the
+ // masked compare nodes, so they should not make it here.
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
+ unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
+ if (VT0 != VT1 || EltBitWidth == 8)
+ return SDValue();
+
+ assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+ return DAG.getBitcast(N->getValueType(0), Shift);
+}
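// A minimal standalone check of the scalar identity behind the combine above:
// when a lane is all-ones or all-zeros (as PCMPEQ/PCMPGT guarantee), masking
// with 1 and logically shifting right by (bits - 1) give the same result.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Lanes[] = {0xFFFFFFFFu, 0x0u};
  for (uint32_t L : Lanes)
    assert((L & 1u) == (L >> 31));
  return 0;
}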
+
+static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
+ if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
return Zext;
- if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+ return R;
+
+ if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ return ShiftRight;
+
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -25834,143 +28266,176 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
- if (VT == MVT::i32 || VT == MVT::i64) {
- // Check for BEXTR.
- if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
- (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
- ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (MaskNode && ShiftNode) {
- uint64_t Mask = MaskNode->getZExtValue();
- uint64_t Shift = ShiftNode->getZExtValue();
- if (isMask_64(Mask)) {
- uint64_t MaskSize = countPopulation(Mask);
- if (Shift + MaskSize <= VT.getSizeInBits())
- return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
- DAG.getConstant(Shift | (MaskSize << 8), DL,
- VT));
- }
- }
- } // BEXTR
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+ if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
return SDValue();
+ if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (MaskNode && ShiftNode) {
+ uint64_t Mask = MaskNode->getZExtValue();
+ uint64_t Shift = ShiftNode->getZExtValue();
+ if (isMask_64(Mask)) {
+ uint64_t MaskSize = countPopulation(Mask);
+ if (Shift + MaskSize <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Shift | (MaskSize << 8), DL,
+ VT));
+ }
}
+ return SDValue();
+}
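// A minimal sketch (illustration only) of the BEXTR pattern matched above:
// ((X >> Shift) & Mask) with a contiguous low-bit Mask is a bit-field extract,
// and the control operand packs Shift into bits [7:0] and popcount(Mask) into
// bits [15:8], matching the Shift | (MaskSize << 8) constant built above.
#include <cassert>
#include <cstdint>

// Software model of the BEXTR instruction's semantics.
static uint64_t bextrModel(uint64_t X, uint64_t Control) {
  unsigned Start = Control & 0xff;
  unsigned Len = (Control >> 8) & 0xff;
  uint64_t Shifted = Start < 64 ? (X >> Start) : 0;
  return Len >= 64 ? Shifted : (Shifted & ((1ULL << Len) - 1));
}

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t Shift = 12;
  uint64_t Mask = 0xffff;                  // isMask_64(Mask), MaskSize == 16
  uint64_t Control = Shift | (16ULL << 8); // the immediate the combine emits
  assert(((X >> Shift) & Mask) == bextrModel(X, Control));
  return 0;
}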
- // Want to form ANDNP nodes:
- // 1) In the hopes of then easily combining them with OR and AND nodes
- // to form PBLEND/PSIGN.
- // 2) To match ANDN packed intrinsics
- if (VT != MVT::v2i64 && VT != MVT::v4i64)
+// Try to fold:
+// (or (and (m, y), (pandn m, x)))
+// into:
+// (vselect m, x, y)
+// As a special case, try to fold:
+// (or (and (m, (sub 0, x)), (pandn m, x)))
+// into:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ return SDValue();
+ assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
+
+ // Canonicalize pandn to RHS
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
- // Check LHS for vnot
- if (N0.getOpcode() == ISD::XOR &&
- //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
- CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+ SDValue Mask = N1.getOperand(0);
+ SDValue X = N1.getOperand(1);
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
- // Check RHS for vnot
- if (N1.getOpcode() == ISD::XOR &&
- //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
- CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (!Y.getNode())
+ return SDValue();
- return SDValue();
+ // Validate that X, Y, and Mask are bitcasts, and see through them.
+ Mask = peekThroughBitcasts(Mask);
+ X = peekThroughBitcasts(X);
+ Y = peekThroughBitcasts(Y);
+
+ EVT MaskVT = Mask.getValueType();
+
+ // Validate that the Mask operand is a vector sra node.
+ // FIXME: What should we do for byte elements, since there is a
+ // psignb/pblendvb but no psrai.b?
+ unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned SraAmt = ~0;
+ if (Mask.getOpcode() == ISD::SRA) {
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
+ if ((SraAmt + 1) != EltBits)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Try to match:
+ // (or (and (M, (sub 0, X)), (pandn M, X)))
+ // which is a special case of vselect:
+ // (vselect M, (sub 0, X), X)
+ // Per:
+ // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+ // We know that, if fNegate is 0 or 1:
+ // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+ //
+ // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+ // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+ // ( M ? -X : X) == ((X ^ M ) + (M & 1))
+ // This lets us transform our vselect to:
+ // (add (xor X, M), (and M, 1))
+ // And further to:
+ // (sub (xor X, M), M)
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+
+ if (V) {
+ assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ }
+ }
+
+ // PBLENDVB is only available on SSE 4.1.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ return DAG.getBitcast(VT, Mask);
}
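// A minimal sketch (illustration only) of the conditional-negate identity used
// above: with a mask lane M that is all-ones or all-zeros,
// (M ? -X : X) == (X ^ M) - M, and swapping the SUB operands handles the
// mirrored (M ? X : -X) case noted for PR 27251.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t MaskLanes[] = {0, -1}; // an sra/pcmp-produced lane is 0 or all-ones
  const int32_t Values[] = {0, 7, -42, INT32_MIN + 1, INT32_MAX};
  for (int32_t M : MaskLanes)
    for (int32_t X : Values) {
      assert((M ? -X : X) == (X ^ M) - M); // (sub (xor X, M), M)
      assert((M ? X : -X) == M - (X ^ M)); // swapped operands for the mirrored case
    }
  return 0;
}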
-static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- // look for psign/blend
- if (VT == MVT::v2i64 || VT == MVT::v4i64) {
- if (!Subtarget->hasSSSE3() ||
- (VT == MVT::v4i64 && !Subtarget->hasInt256()))
- return SDValue();
-
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
- std::swap(N0, N1);
- // or (and (m, y), (pandn m, x))
- if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
- SDValue Mask = N1.getOperand(0);
- SDValue X = N1.getOperand(1);
- SDValue Y;
- if (N0.getOperand(0) == Mask)
- Y = N0.getOperand(1);
- if (N0.getOperand(1) == Mask)
- Y = N0.getOperand(0);
-
- // Check to see if the mask appeared in both the AND and ANDNP and
- if (!Y.getNode())
- return SDValue();
-
- // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
- // Look through mask bitcast.
- if (Mask.getOpcode() == ISD::BITCAST)
- Mask = Mask.getOperand(0);
- if (X.getOpcode() == ISD::BITCAST)
- X = X.getOperand(0);
- if (Y.getOpcode() == ISD::BITCAST)
- Y = Y.getOperand(0);
-
- EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
- unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI) {
- SDValue SraC = Mask.getOperand(1);
- SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
- }
- if ((SraAmt + 1) != EltBits)
- return SDValue();
-
- SDLoc DL(N);
-
- // Now we know we at least have a plendvb with the mask val. See if
- // we can form a psignb/w/d.
- // psign = x.type == y.type == mask.type && y = sub(0, x);
- if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
- ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
- X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
- assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
- "Unsupported VT for PSIGN");
- Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
- return DAG.getBitcast(VT, Mask);
- }
- // PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
-
- X = DAG.getBitcast(BlendVT, X);
- Y = DAG.getBitcast(BlendVT, Y);
- Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
- return DAG.getBitcast(VT, Mask);
- }
- }
-
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
@@ -25982,7 +28447,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
- if (!OptForSize && Subtarget->isSHLDSlow())
+ if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
@@ -26040,7 +28505,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
}
// Generate NEG and CMOV for integer abs.
-static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Since X86 does not have CMOV for 8-bit integer, we don't convert
@@ -26073,13 +28538,14 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Try to turn tests against the signbit in the form of:
-// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
-// into:
-// SETGT(X, -1)
+/// Try to turn tests against the signbit in the form of:
+/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+/// into:
+/// SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
- // This is only worth doing if the output type is i8.
- if (N->getValueType(0) != MVT::i8)
+ // This is only worth doing if the output type is i8 or i1.
+ EVT ResultType = N->getValueType(0);
+ if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -26114,22 +28580,78 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
- SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), ResultType);
+ SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ if (SetCCResultType != ResultType)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
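// A minimal sketch (illustration only) of the scalar identity folded above:
// xor(trunc(srl(X, BitWidth-1)), 1) is 1 exactly when X is non-negative, which
// is the same predicate as setgt(X, -1).
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Values[] = {0, 1, 123, INT32_MAX, -1, -123, INT32_MIN};
  for (int32_t X : Values) {
    uint8_t Folded = (uint8_t)(((uint32_t)X >> 31) ^ 1); // XOR(TRUNCATE(SRL(X, 31)), 1)
    uint8_t SetGT = (X > -1) ? 1 : 0;                    // SETGT(X, -1)
    assert(Folded == SetGT);
  }
  return 0;
}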
-static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// pcmpgt X, -1
+///
+/// This should be called before type legalization because the pattern may not
+/// persist after that.
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isSimple())
+ return SDValue();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
+ case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
+ }
+
+ // There must be a shift right algebraic before the xor, and the xor must be a
+ // 'not' operation.
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
+ if (!ShiftBV)
+ return SDValue();
+
+ EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+ auto *ShiftAmt = ShiftBV->getConstantSplatNode();
+ if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ return SDValue();
+
+ // Create a greater-than comparison against -1. We don't use the more obvious
+ // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
+ return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+}
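// A minimal per-lane sketch (illustration only) of the vector fold above:
// not(sra(X, EltBits-1)) is an all-ones lane exactly when X > -1, which is what
// pcmpgt X, -1 produces (assuming arithmetic >> on signed int, as on x86).
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Values[] = {0, 5, INT32_MAX, -1, -5, INT32_MIN};
  for (int32_t X : Values) {
    int32_t NotSra = ~(X >> 31);        // xor (sra X, 31), -1
    int32_t Pcmpgt = (X > -1) ? -1 : 0; // pcmpgt X, -1: all-ones or all-zeros lane
    assert(NotSra == Pcmpgt);
  }
  return 0;
}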
+
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
- if (Subtarget->hasCMov())
- if (SDValue RV = performIntegerAbsCombine(N, DAG))
+ if (Subtarget.hasCMov())
+ if (SDValue RV = combineIntegerAbs(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
@@ -26142,7 +28664,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, SDLoc DL) {
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
if (!VT.isVector() || !VT.isSimple())
return SDValue();
EVT InVT = In.getValueType();
@@ -26159,10 +28682,12 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
- if (Subtarget->hasAVX512()) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+ if (Subtarget.hasAVX512()) {
if (VT.getSizeInBits() > 512)
return SDValue();
- } else if (Subtarget->hasAVX2()) {
+ } else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
return SDValue();
} else {
@@ -26221,10 +28746,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
- SDValue One = DAG.getConstant(1, DL, InScalarVT);
- SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
- SmallVector<SDValue, 8>(NumElems, One));
- Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+ SDValue VecOnes = DAG.getConstant(1, DL, InVT);
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1]);
@@ -26258,10 +28781,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();
}
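// A minimal sketch (illustration only) of the averaging identity this pattern
// detector looks for: truncating ((zext(a) + zext(b) + 1) >> 1) back to the
// narrow type matches the rounding-up average that pavgb/pavgw compute.
#include <cassert>
#include <cstdint>

static uint8_t avgRoundUp(uint8_t A, uint8_t B) {
  // Widen, add 1 for rounding, halve, truncate -- the matched DAG pattern.
  return (uint8_t)(((uint16_t)A + (uint16_t)B + 1) >> 1);
}

int main() {
  assert(avgRoundUp(0, 0) == 0);
  assert(avgRoundUp(1, 2) == 2);       // rounds up on odd sums
  assert(avgRoundUp(0, 255) == 128);
  assert(avgRoundUp(255, 255) == 255); // the wider add avoids overflow
  return 0;
}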
-/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
-static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
@@ -26283,41 +28805,180 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment =
- DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems/2);
- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Alignment);
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- std::min(16U, Alignment));
+ SDValue Load1 =
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ Alignment, Ld->getMemOperand()->getFlags());
+
+ Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
+ SDValue Load2 =
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1),
Load2.getValue(1));
SDValue NewVec = DAG.getUNDEF(RegVT);
- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
return DCI.CombineTo(N, NewVec, TF, true);
}
return SDValue();
}
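// A minimal memory-level sketch (illustration only) of the split performed
// above for slow unaligned 256-bit loads: two 16-byte loads inserted at element
// offsets 0 and NumElems/2 reproduce the single wide load byte-for-byte.
#include <array>
#include <cassert>
#include <cstring>

int main() {
  std::array<unsigned char, 32> Mem;
  for (int i = 0; i < 32; ++i)
    Mem[i] = (unsigned char)i;

  std::array<unsigned char, 32> Wide;  // the original 32-byte load
  std::memcpy(Wide.data(), Mem.data(), 32);

  std::array<unsigned char, 32> Split = {}; // Load1 at Ptr, Load2 at Ptr + 16
  std::memcpy(Split.data(), Mem.data(), 16);
  std::memcpy(Split.data() + 16, Mem.data() + 16, 16);

  assert(Wide == Split);
  return 0;
}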
-/// PerformMLOADCombine - Resolve extending loads
-static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// If V is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(SDValue V) {
+ // This needs to be a build vector of booleans.
+ // TODO: Checking for the i1 type matches the IR definition for the mask,
+ // but the mask check could be loosened to i8 or other types. That might
+ // also require checking more than 'allOnesValue'; eg, the x86 HW
+ // instructions only require that the MSB is set for each mask element.
+ // The ISD::MSTORE comments/definition do not specify how the mask operand
+ // is formatted.
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+ return -1;
+
+ int TrueIndex = -1;
+ unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ const SDValue &Op = BV->getOperand(i);
+ if (Op.isUndef())
+ continue;
+ auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+ if (!ConstNode)
+ return -1;
+ if (ConstNode->getAPIntValue().isAllOnesValue()) {
+ // If we already found a one, this is too many.
+ if (TrueIndex >= 0)
+ return -1;
+ TrueIndex = i;
+ }
+ }
+ return TrueIndex;
+}
+
+/// Given a masked memory load/store operation, return true if it has one mask
+/// bit set. If it has one mask bit set, then also return the memory address of
+/// the scalar element to load/store, the vector index to insert/extract that
+/// scalar element, and the alignment for the scalar memory access.
+static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
+ SelectionDAG &DAG, SDValue &Addr,
+ SDValue &Index, unsigned &Alignment) {
+ int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
+ if (TrueMaskElt < 0)
+ return false;
+
+ // Get the address of the one scalar element that is specified by the mask
+ // using the appropriate offset from the base pointer.
+ EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+ Addr = MaskedOp->getBasePtr();
+ if (TrueMaskElt != 0) {
+ unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
+ Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
+ }
+
+ Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
+ Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
+ return true;
+}
+
+/// If exactly one element of the mask is set for a non-extending masked load,
+/// it is a scalar load and vector insert.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue
+reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ unsigned Alignment;
+ if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
+ return SDValue();
+
+ // Load the one scalar element that is specified by the mask using the
+ // appropriate offset from the base pointer.
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Load =
+ DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
+ Alignment, ML->getMemOperand()->getFlags());
+
+ // Insert the loaded element into the appropriate place in the vector.
+ SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
+ Load, VecIndex);
+ return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
+}
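// A minimal scalar model (illustration only) of the rewrite above: when exactly
// one mask element is true, a masked load equals one scalar load from
// BasePtr + TrueIndex * sizeof(elt), inserted into the pass-through vector.
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Mem = {10, 20, 30, 40};
  std::array<int, 4> Src0 = {-1, -2, -3, -4};             // pass-through operand
  std::array<bool, 4> Mask = {false, false, true, false}; // exactly one true bit

  // Reference semantics of the masked load.
  std::array<int, 4> MaskedLoad = Src0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i])
      MaskedLoad[i] = Mem[i];

  // The combined form: one scalar load + insert at the single true index.
  std::array<int, 4> ScalarForm = Src0;
  ScalarForm[2] = Mem[2]; // load at offset 2 * sizeof(int), insert at index 2

  assert(MaskedLoad == ScalarForm);
  return 0;
}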
+
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
+ return SDValue();
+
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+
+ // If we are loading the first and last elements of a vector, it is safe and
+ // always faster to load the whole vector. Replace the masked load with a
+ // vector load and select.
+ unsigned NumElts = VT.getVectorNumElements();
+ BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+ bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
+ bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
+ if (LoadFirstElt && LoadLastElt) {
+ SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+ ML->getMemOperand());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
+ return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
+ }
+
+ // Convert a masked load with a constant mask into a masked load and a select.
+ // This allows the select operation to use a faster kind of select instruction
+ // (for example, vblendvps -> vblendps).
+
+ // Don't try this if the pass-through operand is already undefined. That would
+ // cause an infinite loop because that's what we're about to create.
+ if (ML->getSrc0().isUndef())
+ return SDValue();
+
+ // The new masked load has an undef pass-through operand. The select uses the
+ // original pass-through operand.
+ SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+ ML->getMask(), DAG.getUNDEF(VT),
+ ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getExtensionType());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
+
+ return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
+}
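// A minimal lane-level model (illustration only) of the blend form above: with
// a constant mask, select(Mask, full_vector_load, Src0) yields the same lanes
// as the masked load; having the first and last elements set is what makes
// touching the whole vector's memory safe.
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Mem = {10, 20, 30, 40};
  std::array<int, 4> Src0 = {-1, -2, -3, -4};
  std::array<bool, 4> Mask = {true, false, true, true}; // first and last are set

  std::array<int, 4> MaskedLoad = Src0;   // reference masked-load semantics
  for (int i = 0; i < 4; ++i)
    if (Mask[i])
      MaskedLoad[i] = Mem[i];

  std::array<int, 4> VecLd = Mem;         // plain vector load of all elements
  std::array<int, 4> Blend;               // select(Mask, VecLd, Src0)
  for (int i = 0; i < 4; ++i)
    Blend[i] = Mask[i] ? VecLd[i] : Src0[i];

  assert(MaskedLoad == Blend);
  return 0;
}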
+
+static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
+ if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+ return ScalarLoad;
+ // TODO: Do some AVX512 subsets benefit from this transform?
+ if (!Subtarget.hasAVX512())
+ if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
+ return Blend;
+ }
+
if (Mld->getExtensionType() != ISD::SEXTLOAD)
return SDValue();
+ // Resolve extending loads.
EVT VT = Mld->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
@@ -26326,21 +28987,21 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getVectorElementType().getSizeInBits();
unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
- // From, To sizes and ElemCount must be pow of two
+ // From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
- // Create a type on which we perform the shuffle
+ // Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- // Convert Src0 value
+ // Convert Src0 value.
SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
- if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ if (!Mld->getSrc0().isUndef()) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
@@ -26349,13 +29010,13 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
- DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
}
- // Prepare the new mask
+ // Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
- // Mask and original value have the same type
+ // Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
@@ -26364,9 +29025,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
- &ShuffleVec[0]);
- }
- else {
+ ShuffleVec);
+ } else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
@@ -26390,13 +29050,41 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
-/// PerformMSTORECombine - Resolve truncating stores
-static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+
+/// If exactly one element of the mask is set for a non-truncating masked store,
+/// it is a vector extract and scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+ SelectionDAG &DAG) {
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ unsigned Alignment;
+ if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
+ return SDValue();
+
+ // Extract the one scalar element that is actually being stored.
+ SDLoc DL(MS);
+ EVT VT = MS->getValue().getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+ MS->getValue(), VecIndex);
+
+ // Store that element at the appropriate offset from the base pointer.
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
+ Alignment, MS->getMemOperand()->getFlags());
+}
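// A minimal scalar model (illustration only) of the store-side rewrite above:
// with exactly one true mask element, a masked store is one extract plus one
// scalar store at the corresponding offset; all other memory is untouched.
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Val = {10, 20, 30, 40};              // vector being stored
  std::array<bool, 4> Mask = {false, true, false, false}; // exactly one true bit

  std::array<int, 4> MemMasked = {-1, -2, -3, -4};
  std::array<int, 4> MemScalar = MemMasked;

  // Reference semantics of the masked store.
  for (int i = 0; i < 4; ++i)
    if (Mask[i])
      MemMasked[i] = Val[i];

  // The combined form: extract element 1, store it at BasePtr + 1 * sizeof(int).
  MemScalar[1] = Val[1];

  assert(MemMasked == MemScalar);
  return 0;
}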
+
+static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (!Mst->isTruncatingStore())
- return SDValue();
+ return reduceMaskedStoreToScalarStore(Mst, DAG);
+ // Resolve truncating stores.
EVT VT = Mst->getValue().getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
@@ -26415,7 +29103,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
if (TLI.isTruncStoreLegal(VT, StVT))
return SDValue();
- // From, To sizes and ElemCount must be pow of two
+ // From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
@@ -26426,7 +29114,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
- // Create a type on which we perform the shuffle
+ // Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
@@ -26443,12 +29131,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
- &ShuffleVec[0]);
+ ShuffleVec);
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
- // Mask and original value have the same type
+ // Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
@@ -26456,9 +29144,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
- &ShuffleVec[0]);
- }
- else {
+ ShuffleVec);
+ } else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
@@ -26479,9 +29166,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
Mst->getBasePtr(), NewMask, StVT,
Mst->getMemOperand(), false);
}
-/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
-static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+
+static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
@@ -26496,26 +29183,24 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AddressSpace, Alignment, &Fast) && !Fast) {
+ AddressSpace, Alignment, &Fast) &&
+ !Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
- SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+ SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
+ SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
- SDValue Stride =
- DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
-
- SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), Alignment);
- SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(),
- std::min(16U, Alignment));
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+
+ SDValue Ch0 =
+ DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
+ Alignment, St->getMemOperand()->getFlags());
+ SDValue Ch1 =
+ DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
+ std::min(16U, Alignment), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
@@ -26526,12 +29211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
- SDValue Avg =
- detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
- if (Avg.getNode())
+ if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+ Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
@@ -26543,7 +29227,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
// In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegal(VT, StVT))
+ if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
return SDValue();
// From, To sizes and ElemCount must be pow of two
@@ -26573,7 +29257,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
- &ShuffleVec[0]);
+ ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -26595,8 +29279,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
- TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
@@ -26604,10 +29286,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i, dl));
- SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Ch =
+ DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
Chains.push_back(Ch);
}
@@ -26626,9 +29308,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
- !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT.isVector() ||
- (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -26667,58 +29349,49 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
- if (Subtarget->is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ if (Subtarget.is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
SDValue NewChain = NewLd.getValue(1);
- if (TokenFactorIndex != -1) {
+ if (TokenFactorIndex >= 0) {
Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
- St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(),
- St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
// Otherwise, lower to two pairs of 32-bit loads / stores.
SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
- DAG.getConstant(4, LdDL, MVT::i32));
+ SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
- Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(),
- MinAlign(Ld->getAlignment(), 4));
+ MinAlign(Ld->getAlignment(), 4),
+ Ld->getMemOperand()->getFlags());
SDValue NewChain = LoLd.getValue(1);
- if (TokenFactorIndex != -1) {
+ if (TokenFactorIndex >= 0) {
Ops.push_back(LoLd);
Ops.push_back(HiLd);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
- HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
- DAG.getConstant(4, StDL, MVT::i32));
-
- SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
- St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(),
- St->getAlignment());
- SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
- St->getPointerInfo().getWithOffset(4),
- St->isVolatile(),
- St->isNonTemporal(),
- MinAlign(St->getAlignment(), 4));
+ HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
+
+ SDValue LoSt =
+ DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ SDValue HiSt = DAG.getStore(
+ NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
+ MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
@@ -26728,7 +29401,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
- if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
@@ -26738,8 +29411,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
return SDValue();
@@ -26798,14 +29471,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
SDValue A, B;
SmallVector<int, 16> LMask(NumElts);
if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!LHS.getOperand(0).isUndef())
A = LHS.getOperand(0);
- if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ if (!LHS.getOperand(1).isUndef())
B = LHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), LMask.begin());
} else {
- if (LHS.getOpcode() != ISD::UNDEF)
+ if (!LHS.isUndef())
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask[i] = i;
@@ -26816,14 +29489,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
SDValue C, D;
SmallVector<int, 16> RMask(NumElts);
if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!RHS.getOperand(0).isUndef())
C = RHS.getOperand(0);
- if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ if (!RHS.getOperand(1).isUndef())
D = RHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), RMask.begin());
} else {
- if (RHS.getOpcode() != ISD::UNDEF)
+ if (!RHS.isUndef())
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask[i] = i;
@@ -26871,33 +29544,22 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
return true;
}
-/// Do target-specific dag combines on floating point adds.
-static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- // Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, true))
- return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
- return SDValue();
-}
-
-/// Do target-specific dag combines on floating point subs.
-static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+/// Do target-specific dag combines on floating-point adds/subs.
+static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ bool IsFadd = N->getOpcode() == ISD::FADD;
+ assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
- // Try to synthesize horizontal subs from subs of shuffles.
- if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, false))
- return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
+ // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+ if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, IsFadd)) {
+ auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+ }
return SDValue();
}
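// A minimal sketch (illustration only) of the shuffle pattern isHorizontalBinOp
// accepts, assuming the usual haddps semantics (sums of adjacent pairs): if LHS
// gathers the even lanes of <A, B> and RHS the odd lanes, then the vertical
// LHS + RHS is exactly HADD(A, B).
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};

  std::array<float, 4> Lhs = {A[0], A[2], B[0], B[2]}; // shuffle: even lanes of A,B
  std::array<float, 4> Rhs = {A[1], A[3], B[1], B[3]}; // shuffle: odd lanes of A,B

  std::array<float, 4> Hadd = {A[0] + A[1], A[2] + A[3],
                               B[0] + B[1], B[2] + B[3]}; // haddps A, B

  for (int i = 0; i < 4; ++i)
    assert(Lhs[i] + Rhs[i] == Hadd[i]); // the vertical FADD becomes FHADD
  return 0;
}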
@@ -26916,13 +29578,11 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
// First, use mask to unset all bits that won't appear in the result.
assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
"OutSVT can only be either i8 or i16.");
- SDValue MaskVal =
- DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
- SDValue MaskVec = DAG.getNode(
- ISD::BUILD_VECTOR, DL, InVT,
- SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+ APInt Mask =
+ APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
+ SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
for (auto &Reg : Regs)
- Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+ Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
MVT UnpackedVT, PackedVT;
if (OutSVT == MVT::i8) {
@@ -26938,7 +29598,7 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
j < e; j *= 2, RegNum /= 2) {
for (unsigned i = 0; i < RegNum; i++)
- Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+ Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
for (unsigned i = 0; i < RegNum / 2; i++)
Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
Regs[i * 2 + 1]);
@@ -26990,7 +29650,7 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
@@ -27005,7 +29665,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
- if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
@@ -27016,7 +29676,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
// SSSE3's pshufb results in less instructions in the cases below.
- if (Subtarget->hasSSSE3() && NumElems == 8 &&
+ if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
@@ -27026,20 +29686,17 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
// Split a long vector into vectors of legal type.
unsigned RegNum = InVT.getSizeInBits() / 128;
SmallVector<SDValue, 8> SubVec(RegNum);
- if (InSVT == MVT::i32) {
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(i * 4, DL));
- } else {
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(i * 2, DL));
- }
+ unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
+ EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
- // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+ DAG.getIntPtrConstant(i * NumSubRegElts, DL));
+
+ // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
- if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+ if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
else if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
@@ -27047,20 +29704,30 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
+
// Try to detect AVG pattern first.
- SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
- Subtarget, SDLoc(N));
- if (Avg.getNode())
+ if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // The bitcast source is a direct mmx result.
+ // Detect a truncation to i32 of a bitcast from x86mmx.
+ if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
+ }
+
return combineVectorTruncation(N, DAG, Subtarget);
}
/// Do target-specific dag combines on floating point negations.
-static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
SDValue Arg = N->getOperand(0);
@@ -27074,7 +29741,7 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
- Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+ Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
@@ -27102,17 +29769,17 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
- if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+ if (VT.is512BitVector() && !Subtarget.hasDQI()) {
// VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
// These logic operations may be executed in the integer domain.
SDLoc dl(N);
MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
- SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+ SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+ SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
@@ -27122,13 +29789,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
- return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+ return DAG.getBitcast(VT, IntOp);
}
return SDValue();
}
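// A minimal sketch (illustration only) of why the integer-domain fallback above
// is safe: bitwise logic on the raw float bit pattern, done with integer ops
// after a bitcast, yields the same bits as the FP logic op would.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float A = 3.25f;
  uint32_t Bits, SignBit = 0x80000000u;
  std::memcpy(&Bits, &A, sizeof(A));  // bitcast f32 -> i32
  uint32_t Xored = Bits ^ SignBit;    // integer XOR standing in for XORPS
  float Result;
  std::memcpy(&Result, &Xored, sizeof(Result)); // bitcast i32 -> f32
  assert(Result == -3.25f);           // same bits, so the same FP value
  return 0;
}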
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
-static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
@@ -27145,7 +29812,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
-static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// Only perform optimizations if UnsafeMath is used.
@@ -27165,9 +29832,9 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(0), N->getOperand(1));
}
-static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- if (Subtarget->useSoftFloat())
+static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat())
return SDValue();
// TODO: Check for global or instruction-level "nnan". In that case, we
@@ -27176,9 +29843,9 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
// should be an optional swap and FMAX/FMIN.
EVT VT = N->getValueType(0);
- if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
- (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
// This takes at least 3 instructions, so favor a library call when operating
@@ -27222,8 +29889,8 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -27238,8 +29905,8 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FANDN nodes
-static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -27253,9 +29920,8 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
-static SDValue PerformBTCombine(SDNode *N,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
SDValue Op1 = N->getOperand(1);
if (Op1.hasOneUse()) {
@@ -27272,21 +29938,19 @@ static SDValue PerformBTCombine(SDNode *N,
return SDValue();
}
-static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
- SDValue Op = N->getOperand(0);
- if (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
+static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = peekThroughBitcasts(N->getOperand(0));
EVT VT = N->getValueType(0), OpVT = Op.getValueType();
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+ return DAG.getBitcast(VT, Op);
}
return SDValue();
}
-static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
@@ -27307,7 +29971,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
// EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
- if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+ if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
@@ -27325,7 +29989,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
/// to combine math ops, use an LEA, or use a complex addressing mode. This can
/// eliminate extend, add, and shift instructions.
static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
// TODO: This should be valid for other integer types.
EVT VT = Sext->getValueType(0);
if (VT != MVT::i64)
@@ -27397,14 +30061,106 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
return R.getValue(1);
}
-static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
+/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
+/// with UNDEFs) of the input to vectors of the same size as the target type
+/// which then extends the lowest elements.
+static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InVT = N0.getValueType();
EVT InSVT = InVT.getScalarType();
+
+ // Input type must be a vector and we must be extending legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ // On AVX2+ targets, if the input/output types are both legal then we will be
+ // able to use SIGN_EXTEND/ZERO_EXTEND directly.
+ if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
+ EVT InVT = N.getValueType();
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+ Size / InVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
+ DAG.getUNDEF(InVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ };
+
+ // If target-size is less than 128-bits, extend to a type that would extend
+ // to 128 bits, extend that and extract the original target vector.
+ if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
+ unsigned Scale = 128 / VT.getSizeInBits();
+ EVT ExVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+ SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+ SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
+ // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
+ // Also use this if we don't have SSE41, to let the legalizer do its job.
+ if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget.hasInt256())) {
+ SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
+ return Opcode == ISD::SIGN_EXTEND
+ ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
+ : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+ }
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendVecSize(DL, SrcVec, 128);
+ SrcVec = Opcode == ISD::SIGN_EXTEND
+ ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
+ : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ }
+
+ return SDValue();
+}
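As an aside on what *_EXTEND_VECTOR_INREG ends up as: on SSE4.1 these nodes lower to the pmovsx/pmovzx family, which widens only the lowest elements of a 128-bit source. The standalone sketch below (illustration only, not the DAG combine above; the input values are made up) shows that behaviour with the corresponding intrinsics.

  // pmovsxwd / pmovzxwd widen the low 4 x i16 of a 128-bit register to
  // 4 x i32 -- the semantics the *_EXTEND_VECTOR_INREG nodes lower to.
  #include <immintrin.h>   // requires -msse4.1
  #include <cstdint>
  #include <cstdio>

  int main() {
    alignas(16) int16_t src[8] = {-1, 2, -3, 4, 100, 200, 300, 400};
    __m128i v = _mm_load_si128(reinterpret_cast<const __m128i *>(src));
    __m128i sext = _mm_cvtepi16_epi32(v); // sign-extend low 4 elements
    __m128i zext = _mm_cvtepu16_epi32(v); // zero-extend low 4 elements
    alignas(16) int32_t s[4], z[4];
    _mm_store_si128(reinterpret_cast<__m128i *>(s), sext);
    _mm_store_si128(reinterpret_cast<__m128i *>(z), zext);
    for (int i = 0; i != 4; ++i)
      std::printf("sext %d  zext %d\n", s[i], z[i]);
    return 0;
  }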
+
+static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = N0.getValueType();
SDLoc DL(N);
if (SDValue DivRem8 = getDivRem8(N, DAG))
@@ -27414,70 +30170,16 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
- if (VT.isVector() && Subtarget->hasSSE2()) {
- auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
- EVT InVT = N.getValueType();
- EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
- Size / InVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
- DAG.getUNDEF(InVT));
- Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
- };
-
- // If target-size is less than 128-bits, extend to a type that would extend
- // to 128 bits, extend that and extract the original target vector.
- if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- unsigned Scale = 128 / VT.getSizeInBits();
- EVT ExVT =
- EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
- SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
- SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
- // which ensures lowering to X86ISD::VSEXT (pmovsx*).
- if (VT.getSizeInBits() == 128 &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- SDValue ExOp = ExtendVecSize(DL, N0, 128);
- return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
- }
-
- // On pre-AVX2 targets, split into 128-bit nodes of
- // ISD::SIGN_EXTEND_VECTOR_INREG.
- if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- unsigned NumVecs = VT.getSizeInBits() / 128;
- unsigned NumSubElts = 128 / SVT.getSizeInBits();
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
- EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
-
- SmallVector<SDValue, 8> Opnds;
- for (unsigned i = 0, Offset = 0; i != NumVecs;
- ++i, Offset += NumSubElts) {
- SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
- DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, 128);
- SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
- Opnds.push_back(SrcVec);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- }
- }
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
- if (Subtarget->hasAVX() && VT.is256BitVector())
+ if (Subtarget.hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -27487,8 +30189,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -27497,7 +30199,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
@@ -27526,9 +30228,9 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, dl, VT, A, B, C);
}
-static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
@@ -27563,6 +30265,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -27573,10 +30278,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Optimize x == -y --> x+y == 0
-// x != -y --> x+y != 0
-static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+/// Optimize x == -y --> x+y == 0
+/// x != -y --> x+y != 0
+static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -27631,10 +30336,15 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
+ // via legalization because v4i32 is not a legal type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
+ return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
return SDValue();
}
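A minimal check of the identity this combine starts from: x == -y exactly when x + y == 0, which holds under two's-complement wraparound. The ranges below are arbitrary sample values, not part of the lowering code.

  #include <cassert>
  #include <cstdint>

  int main() {
    // x == -y  <=>  x + y == 0 under modular (two's-complement) arithmetic.
    for (uint32_t x = 0; x != 1024; ++x)
      for (uint32_t y = 0; y != 1024; ++y)
        assert((x == (0u - y)) == (x + y == 0u));
    return 0;
  }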
-static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
// Gather and Scatter instructions use k-registers for masks. The type of
// the masks is v*i1. So the mask will be truncated anyway.
@@ -27648,11 +30358,11 @@ static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// Helper function of combineX86SetCC. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
- MVT VT) {
+static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
+ SelectionDAG &DAG, MVT VT) {
if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
@@ -27667,9 +30377,9 @@ static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
}
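The point of preferring sbb here is that it produces a 0 or all-ones value directly, so no extra zero-extension is needed before using it as a mask. A rough scalar analogue of the value MaterializeSETB builds (illustrative only):

  #include <cstdint>
  #include <cstdio>

  // Scalar analogue of setb -> "sbb reg,reg": a borrow condition becomes a
  // 0 or all-ones mask rather than a 0/1 value.
  static uint32_t borrow_mask(uint32_t a, uint32_t b) {
    return 0u - static_cast<uint32_t>(a < b); // all ones if a < b, else 0
  }

  int main() {
    std::printf("%08x %08x\n", borrow_mask(1, 2), borrow_mask(2, 1));
    return 0;
  }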
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -27698,7 +30408,8 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
if (CC == X86::COND_B)
return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
- if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ // Try to simplify the EFLAGS and condition code operands.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
@@ -27706,28 +30417,28 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Optimize branch condition evaluation.
-//
-static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Optimize branch condition evaluation.
+static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Dest = N->getOperand(1);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
- if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ // Try to simplify the EFLAGS and condition code operands.
+ // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+ // RAUW them under us.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
- return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
- Flags);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+ N->getOperand(1), Cond, Flags);
}
return SDValue();
}
-static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
- SelectionDAG &DAG) {
+static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
+ SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
@@ -27772,8 +30483,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}
-static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -27797,11 +30508,11 @@ static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
- if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
+ if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
@@ -27822,18 +30533,18 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
- if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
+ if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
- // This transformation is not supported if the result type is f16
- if (VT == MVT::f16)
+ // This transformation is not supported if the result type is f16 or f128.
+ if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !Subtarget->is64Bit() && LdVT == MVT::i64) {
- SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ !Subtarget.is64Bit() && LdVT == MVT::i64) {
+ SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
@@ -27843,8 +30554,8 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
-static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
- X86TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
+ X86TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
@@ -27868,10 +30579,10 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// fold (add Y, (sete X, 0)) -> adc 0, Y
-// (add Y, (setne X, 0)) -> sbb -1, Y
-// (sub (sete X, 0), Y) -> sbb 0, Y
-// (sub (setne X, 0), Y) -> adc -1, Y
+/// fold (add Y, (sete X, 0)) -> adc 0, Y
+/// (add Y, (setne X, 0)) -> sbb -1, Y
+/// (sub (sete X, 0), Y) -> sbb 0, Y
+/// (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
@@ -27909,24 +30620,163 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}
-/// PerformADDCombine - Do target-specific dag combines on integer adds.
-static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (!VT.isVector() || !VT.isSimple() ||
+ !(VT.getVectorElementType() == MVT::i32))
+ return SDValue();
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+
+ // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ if (VT.getSizeInBits() / 4 > RegSize)
+ return SDValue();
+
+ // Detect the following pattern:
+ //
+ // 1: %2 = zext <N x i8> %0 to <N x i32>
+ // 2: %3 = zext <N x i8> %1 to <N x i32>
+ // 3: %4 = sub nsw <N x i32> %2, %3
+ // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+ // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
+ // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+ // 7: %8 = add nsw <N x i32> %7, %vec.phi
+ //
+ // The last instruction must be a reduction add. Instructions 3-6 form an
+ // ABSDIFF pattern.
+
+ // The two operands of reduction add are from PHI and a select-op as in line 7
+ // above.
+ SDValue SelectOp, Phi;
+ if (Op0.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op0;
+ Phi = Op1;
+ } else if (Op1.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op1;
+ Phi = Op0;
+ } else
+ return SDValue();
+
+ // Check the condition of the select instruction is greater-than.
+ SDValue SetCC = SelectOp->getOperand(0);
+ if (SetCC.getOpcode() != ISD::SETCC)
+ return SDValue();
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ if (CC != ISD::SETGT)
+ return SDValue();
+
+ Op0 = SelectOp->getOperand(1);
+ Op1 = SelectOp->getOperand(2);
+
+ // The second operand of SelectOp, Op1, is the negation of the first operand
+ // Op0, which is implemented as 0 - Op0.
+ if (!(Op1.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
+ Op1.getOperand(1) == Op0))
+ return SDValue();
+
+ // The first operand of SetCC is the first operand of SelectOp, which is the
+ // difference between two input vectors.
+ if (SetCC.getOperand(0) != Op0)
+ return SDValue();
+
+ // The second operand of > comparison can be either -1 or 0.
+ if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+ return SDValue();
+
+ // The first operand of SelectOp is the difference between two input vectors.
+ if (Op0.getOpcode() != ISD::SUB)
+ return SDValue();
+
+ Op1 = Op0.getOperand(1);
+ Op0 = Op0.getOperand(0);
+
+ // Check if the operands of the diff are zero-extended from vectors of i8.
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+ Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+ Op1.getOpcode() != ISD::ZERO_EXTEND ||
+ Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+ return SDValue();
+
+ // SAD pattern detected. Now build a SAD instruction and an addition for
+ // reduction. Note that the number of elements of the result of SAD is less
+ // than the number of elements of its input. Therefore, we can only update
+ // part of the elements in the reduction vector.
+
+ // Legalize the type of the inputs of PSADBW.
+ EVT InVT = Op0.getOperand(0).getValueType();
+ if (InVT.getSizeInBits() <= 128)
+ RegSize = 128;
+ else if (InVT.getSizeInBits() <= 256)
+ RegSize = 256;
+
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
+ Ops[0] = Op0.getOperand(0);
+ MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+ Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ Ops[0] = Op1.getOperand(0);
+ Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+ // The output of PSADBW is a vector of i64.
+ MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+ SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
+
+ // We need to turn the vector of i64 into a vector of i32.
+ // If the reduction vector is at least as wide as the psadbw result, just
+ // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
+ // anyway.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ if (VT.getSizeInBits() >= ResVT.getSizeInBits())
+ Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+ else
+ Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
+
+ if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+ // Update part of the elements of the reduction vector. This is done by first
+ // extracting a sub-vector from it, updating this sub-vector, and inserting
+ // it back.
+ SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+}
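For orientation, the IR pattern listed in the comments above corresponds to the scalar computation below: zero-extend two u8 inputs, take the absolute difference, and accumulate, which is the horizontal sum psadbw produces per 8-byte group. This is a plain C++ model of the math with assumed sample data, not the SelectionDAG code.

  #include <cstdint>
  #include <cstdio>

  // Scalar model of the SAD reduction detectSADPattern matches:
  // |a[i] - b[i]| over u8 inputs, accumulated into a 32-bit sum.
  static uint32_t sad(const uint8_t *a, const uint8_t *b, unsigned n) {
    uint32_t acc = 0;
    for (unsigned i = 0; i != n; ++i) {
      int32_t d = int32_t(a[i]) - int32_t(b[i]); // zext to i32, then sub
      acc += d > 0 ? d : -d;                     // select on icmp sgt (abs)
    }
    return acc;
  }

  int main() {
    uint8_t a[16] = {1, 5, 9, 200, 3, 7, 0, 255, 4, 4, 4, 4, 10, 20, 30, 40};
    uint8_t b[16] = {2, 5, 1, 100, 9, 7, 8,   0, 4, 5, 6, 7, 40, 30, 20, 10};
    std::printf("sad = %u\n", sad(a, b, 16));
    return 0;
  }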
+
+static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ if (Flags->hasVectorReduction()) {
+ if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
+ return Sad;
+ }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -27950,30 +30800,44 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
EVT VT = N->getValueType(0);
- if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
-/// performVZEXTCombine - Performs build vector combines
-static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N->getSimpleValueType(0);
+ MVT SVT = VT.getVectorElementType();
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
- // (vzext (bitcast (vzext (x)) -> (vzext x)
- SDValue V = Op;
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
+ // Perform any constant folding.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ SmallVector<SDValue, 4> Vals;
+ for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ SDValue OpElt = Op.getOperand(i);
+ if (OpElt.getOpcode() == ISD::UNDEF) {
+ Vals.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
+ assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
+ Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
+ Vals.push_back(DAG.getConstant(Cst, DL, SVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
+ }
+ // (vzext (bitcast (vzext (x)) -> (vzext x)
+ SDValue V = peekThroughBitcasts(Op);
if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
MVT InnerVT = V.getSimpleValueType();
MVT InnerEltVT = InnerVT.getVectorElementType();
@@ -28022,61 +30886,111 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
+static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ MVT VT = RHS.getSimpleValueType();
+ SDLoc DL(N);
+
+ auto *C = dyn_cast<ConstantSDNode>(RHS);
+ if (!C || C->getZExtValue() != 1)
+ return SDValue();
+
+ RHS = DAG.getConstant(-1, DL, VT);
+ MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+ return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, LHS, RHS}, VT, MMO);
+}
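The (LSUB p, 1) -> (LADD p, -1) canonicalization is value-preserving because a locked subtract of one and a locked add of minus one are the same read-modify-write. A quick standalone sanity check at the C++ level (example values only):

  #include <atomic>
  #include <cassert>

  int main() {
    std::atomic<int> x(10), y(10);
    x.fetch_sub(1);  // locked subtract of 1
    y.fetch_add(-1); // locked add of -1; same resulting value
    assert(x.load() == 9 && y.load() == 9);
    return 0;
  }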
+
+// TEST (AND a, b), (AND a, b) -> TEST a, b
+static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ return DAG.getNode(X86ISD::TESTM, DL, VT,
+ Op0->getOperand(0), Op0->getOperand(1));
+}
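The TESTM fold relies on AND being idempotent: (a & b) & (a & b) equals a & b, so testing the AND against itself is the same as testing a against b. A trivial exhaustive check over 8-bit values:

  #include <cassert>

  int main() {
    for (int a = 0; a != 256; ++a)
      for (int b = 0; b != 256; ++b)
        assert((((a & b) & (a & b)) != 0) == ((a & b) != 0));
    return 0;
  }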
+
+static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ if (N->getOperand(0) == N->getOperand(1)) {
+ if (N->getOpcode() == X86ISD::PCMPEQ)
+ return getOnesVector(VT, Subtarget, DAG, DL);
+ if (N->getOpcode() == X86ISD::PCMPGT)
+ return getZeroVector(VT, Subtarget, DAG, DL);
+ }
+
+ return SDValue();
+}
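These folds follow directly from the instruction semantics: pcmpeq of a value with itself sets every lane to all ones, and a signed pcmpgt of a value with itself yields zero. A short SSE2 intrinsics illustration with made-up data:

  #include <emmintrin.h>   // SSE2
  #include <cstdint>
  #include <cstdio>

  int main() {
    alignas(16) int32_t v[4] = {1, -2, 3, 0x7fffffff};
    __m128i x = _mm_load_si128(reinterpret_cast<const __m128i *>(v));
    __m128i eq = _mm_cmpeq_epi32(x, x); // every lane equal -> all ones
    __m128i gt = _mm_cmpgt_epi32(x, x); // nothing exceeds itself -> zero
    alignas(16) int32_t e[4], g[4];
    _mm_store_si128(reinterpret_cast<__m128i *>(e), eq);
    _mm_store_si128(reinterpret_cast<__m128i *>(g), gt);
    for (int i = 0; i != 4; ++i)
      std::printf("eq %08x  gt %08x\n", static_cast<unsigned>(e[i]),
                  static_cast<unsigned>(g[i]));
    return 0;
  }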
+
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
- case ISD::EXTRACT_VECTOR_ELT:
- return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+ case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT:
- case X86ISD::SHRUNKBLEND:
- return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
- case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
- case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
- case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
- case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
- case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
+ case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
+ case ISD::ADD: return combineAdd(N, DAG, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADC: return combineADC(N, DAG, DCI);
+ case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL:
case ISD::SRA:
- case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
- case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
- case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
- case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
- case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
- case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
- case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
- case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
- case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
- case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
- case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
+ case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
+ case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::FADD:
+ case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
+ case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
- case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
+ case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
- case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG,
- Subtarget);
- case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
- case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
- case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
- case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+ case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
+ case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
+ case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
+ case X86ISD::BT: return combineBT(N, DAG, DCI);
+ case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG);
case ISD::ANY_EXTEND:
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND_INREG:
- return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
- case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
- case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
- case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
+ case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
@@ -28086,23 +31000,36 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
- case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
- case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
+ case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
- case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG);
+ case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
+ case X86ISD::TESTM: return combineTestM(N, DAG);
+ case X86ISD::PCMPEQ:
+ case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
}
return SDValue();
}
-/// isTypeDesirableForOp - Return true if the target has native support for
-/// the specified value type and it is 'desirable' to use the type for the
-/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
-/// instruction encodings are longer and some i16 instructions are slow.
+/// Return true if the target has native support for the specified value type
+/// and it is 'desirable' to use the type for the given node type. e.g. On x86
+/// i16 is legal, but undesirable since i16 instruction encodings are longer and
+/// some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
@@ -28140,9 +31067,9 @@ bool X86TargetLowering::hasCopyImplyingStackAdjustment(
[](const MachineInstr &RI) { return RI.isCopy(); });
}
-/// IsDesirableToPromoteOp - This method query the target whether it is
-/// beneficial for dag combiner to promote the specified node. If true, it
-/// should return the desired promotion type by reference.
+/// This method queries the target whether it is beneficial for dag combiner to
+/// promote the specified node. If true, it should return the desired promotion
+/// type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
if (VT != MVT::i16)
@@ -28152,23 +31079,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
bool Commute = false;
switch (Op.getOpcode()) {
default: break;
- case ISD::LOAD: {
- LoadSDNode *LD = cast<LoadSDNode>(Op);
- // If the non-extending load has a single use and it's not live out, then it
- // might be folded.
- if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
- Op.hasOneUse()*/) {
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI) {
- // The only case where we'd want to promote LOAD (rather then it being
- // promoted as an operand is when it's only use is liveout.
- if (UI->getOpcode() != ISD::CopyToReg)
- return false;
- }
- }
- Promote = true;
- break;
- }
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
@@ -28250,7 +31160,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
- std::string AsmStr = IA->getAsmString();
+ const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
@@ -28323,8 +31233,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
+/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
@@ -28403,13 +31312,13 @@ TargetLowering::ConstraintWeight
weight = CW_SpecificReg;
break;
case 'y':
- if (type->isX86_MMXTy() && Subtarget->hasMMX())
+ if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'x':
case 'Y':
- if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
- ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
weight = CW_Register;
break;
case 'I':
@@ -28471,25 +31380,25 @@ TargetLowering::ConstraintWeight
return weight;
}
-/// LowerXConstraint - try to replace an X constraint, which matches anything,
-/// with another that has more specific requirements based on the type of the
-/// corresponding operand.
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget->hasSSE2())
+ if (Subtarget.hasSSE2())
return "Y";
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
-/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
-/// vector. If it is invalid, don't add anything to Ops.
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
@@ -28532,7 +31441,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
- (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
@@ -28605,7 +31514,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
- if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+ if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
@@ -28639,8 +31548,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
- if (isGlobalStubReference(
- Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
+ if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
return;
Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
@@ -28656,6 +31564,65 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
+/// Check if \p RC is a general purpose register class.
+/// I.e., GR* or one of their variants.
+static bool isGRClass(const TargetRegisterClass &RC) {
+ switch (RC.getID()) {
+ case X86::GR8RegClassID:
+ case X86::GR8_ABCD_LRegClassID:
+ case X86::GR8_ABCD_HRegClassID:
+ case X86::GR8_NOREXRegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR16_ABCDRegClassID:
+ case X86::GR16_NOREXRegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR32_ABCDRegClassID:
+ case X86::GR32_TCRegClassID:
+ case X86::GR32_NOREXRegClassID:
+ case X86::GR32_NOAXRegClassID:
+ case X86::GR32_NOSPRegClassID:
+ case X86::GR32_NOREX_NOSPRegClassID:
+ case X86::GR32_ADRegClassID:
+ case X86::GR64RegClassID:
+ case X86::GR64_ABCDRegClassID:
+ case X86::GR64_TCRegClassID:
+ case X86::GR64_TCW64RegClassID:
+ case X86::GR64_NOREXRegClassID:
+ case X86::GR64_NOSPRegClassID:
+ case X86::GR64_NOREX_NOSPRegClassID:
+ case X86::LOW32_ADDR_ACCESSRegClassID:
+ case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Check if \p RC is a vector register class.
+/// I.e., FR* / VR* or one of their variants.
+static bool isFRClass(const TargetRegisterClass &RC) {
+ switch (RC.getID()) {
+ case X86::FR32RegClassID:
+ case X86::FR32XRegClassID:
+ case X86::FR64RegClassID:
+ case X86::FR64XRegClassID:
+ case X86::FR128RegClassID:
+ case X86::VR64RegClassID:
+ case X86::VR128RegClassID:
+ case X86::VR128LRegClassID:
+ case X86::VR128HRegClassID:
+ case X86::VR128XRegClassID:
+ case X86::VR256RegClassID:
+ case X86::VR256LRegClassID:
+ case X86::VR256HRegClassID:
+ case X86::VR256XRegClassID:
+ case X86::VR512RegClassID:
+ return true;
+ default:
+ return false;
+ }
+}
+
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -28670,7 +31637,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
@@ -28698,7 +31665,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
- if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
@@ -28706,7 +31673,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || !Subtarget->is64Bit())
+ if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
@@ -28718,13 +31685,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
- if (!Subtarget->hasMMX()) break;
+ if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget->hasSSE2()) break;
+ if (!Subtarget.hasSSE2()) break;
// FALL THROUGH.
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
- if (!Subtarget->hasSSE1()) break;
+ if (!Subtarget.hasSSE1()) break;
switch (VT.SimpleTy) {
default: break;
@@ -28817,8 +31784,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// return "eax". This should even work for things like getting 64bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
- if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
- Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
+ // The generic code will match the first register class that contains the
+ // given register. Thus, based on the ordering of the TableGen'd file,
+ // the "plain" GR classes might not come first.
+ // Therefore, use a helper method.
+ if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
@@ -28834,11 +31804,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.first = 0;
Res.second = nullptr;
}
- } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
- Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
- Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
- Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
- Class == &X86::VR512RegClass) {
+ } else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
@@ -28907,7 +31873,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
@@ -28919,12 +31885,12 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b67958a9c498..d826f1ec3e05 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -75,7 +75,7 @@ namespace llvm {
///
CALL,
- /// This operation implements the lowering for readcyclecounter
+ /// This operation implements the lowering for readcyclecounter.
RDTSC_DAG,
/// X86 Read Time-Stamp Counter and Processor ID.
@@ -106,10 +106,6 @@ namespace llvm {
/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
FSETCC,
- /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
- /// result in an integer GPR. Needs masking for scalar result.
- FGETSIGNx86,
-
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
/// flag operand produced by a CMP or TEST instruction. It also writes a
@@ -191,9 +187,6 @@ namespace llvm {
/// Bitwise Logical AND NOT of Packed FP values.
ANDNP,
- /// Copy integer sign.
- PSIGN,
-
/// Blend where the selector is an immediate.
BLENDI,
@@ -214,30 +207,31 @@ namespace llvm {
FMIN_RND,
FSQRT_RND,
- // FP vector get exponent
+ // FP vector get exponent.
FGETEXP_RND,
- // Extract Normalized Mantissas
+ // Extract Normalized Mantissas.
VGETMANT,
- // FP Scale
+ // FP Scale.
SCALEF,
+ SCALEFS,
+
// Integer add/sub with unsigned saturation.
ADDUS,
SUBUS,
+
// Integer add/sub with signed saturation.
ADDS,
SUBS,
- // Unsigned Integer average
+
+ // Unsigned Integer average.
AVG,
- /// Integer horizontal add.
- HADD,
- /// Integer horizontal sub.
+ /// Integer horizontal add/sub.
+ HADD,
HSUB,
- /// Floating point horizontal add.
+ /// Floating point horizontal add/sub.
FHADD,
-
- /// Floating point horizontal sub.
FHSUB,
// Integer absolute value
@@ -256,7 +250,8 @@ namespace llvm {
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
-
+ FRSQRTS, FRCPS,
+
// Thread Local Storage.
TLSADDR,
@@ -277,6 +272,9 @@ namespace llvm {
// SjLj exception handling longjmp.
EH_SJLJ_LONGJMP,
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
/// Tail call return. See X86TargetLowering::LowerCall for
/// the list of operands.
TC_RETURN,
@@ -286,7 +284,6 @@ namespace llvm {
// Vector integer zero-extend.
VZEXT,
-
// Vector integer signed-extend.
VSEXT,
@@ -313,6 +310,11 @@ namespace llvm {
// Vector shift elements
VSHL, VSRL, VSRA,
+ // Vector variable shift right arithmetic.
+ // Unlike ISD::SRA, if the shift count is greater than the element size,
+ // the sign bit is used to fill the destination data element.
+ VSRAV,
+
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
@@ -327,6 +329,8 @@ namespace llvm {
// Vector integer comparisons, the result is in a mask vector.
PCMPEQM, PCMPGTM,
+ MULTISHIFT,
+
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
@@ -338,11 +342,13 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
- BEXTR, // Bit field extract
+ // Bit field extract.
+ BEXTR,
- UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+ // LOW, HI, FLAGS = umul LHS, RHS.
+ UMUL,
- // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS
+ // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
SMUL8, UMUL8,
// 8-bit divrem that zero-extend the high result (AH).
@@ -352,6 +358,9 @@ namespace llvm {
// X86-specific multiply by immediate.
MUL_IMM,
+ // Vector sign bit extraction.
+ MOVMSK,
+
// Vector bitwise comparisons.
PTEST,
@@ -362,22 +371,23 @@ namespace llvm {
TESTM,
TESTNM,
- // OR/AND test for masks
+ // OR/AND test for masks.
KORTEST,
KTEST,
// Several flavors of instructions with vector shuffle behaviors.
+ // Saturated signed/unsigned packing.
PACKSS,
PACKUS,
- // Intra-lane alignr
+ // Intra-lane alignr.
PALIGNR,
- // AVX512 inter-lane alignr
+ // AVX512 inter-lane alignr.
VALIGN,
PSHUFD,
PSHUFHW,
PSHUFLW,
SHUFP,
- //Shuffle Packed Values at 128-bit granularity
+ //Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
MOVSHDUP,
@@ -393,61 +403,82 @@ namespace llvm {
UNPCKH,
VPERMILPV,
VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
VPERMV3,
+
+ // 3-op Variable Permute overwriting the index (VPERMI2).
+ // Res = VPERMIV3 V0, MaskV, V1
VPERMIV3,
- VPERMI,
- VPERM2X128,
- // Bitwise ternary logic
+
+ // Bitwise ternary logic.
VPTERNLOG,
- // Fix Up Special Packed Float32/64 values
+ // Fix Up Special Packed Float32/64 values.
VFIXUPIMM,
- // Range Restriction Calculation For Packed Pairs of Float32/64 values
+ VFIXUPIMMS,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
VRANGE,
- // Reduce - Perform Reduction Transformation on scalar\packed FP
+ // Reduce - Perform Reduction Transformation on scalar/packed FP.
VREDUCE,
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
VRNDSCALE,
- // VFPCLASS - Tests Types Of a FP Values for packed types.
- VFPCLASS,
- // VFPCLASSS - Tests Types Of a FP Values for scalar types.
- VFPCLASSS,
- // Broadcast scalar to vector
+ // Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+
+ // Broadcast scalar to vector.
VBROADCAST,
- // Broadcast mask to vector
+ // Broadcast mask to vector.
VBROADCASTM,
- // Broadcast subvector to vector
+ // Broadcast subvector to vector.
SUBV_BROADCAST,
- // Insert/Extract vector element
+
+ // Insert/Extract vector element.
VINSERT,
VEXTRACT,
/// SSE4A Extraction and Insertion.
EXTRQI, INSERTQI,
- // XOP variable/immediate rotations
+ // XOP variable/immediate rotations.
VPROT, VPROTI,
- // XOP arithmetic/logical shifts
+ // XOP arithmetic/logical shifts.
VPSHA, VPSHL,
- // XOP signed/unsigned integer comparisons
+ // XOP signed/unsigned integer comparisons.
VPCOM, VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
- // Vector multiply packed unsigned doubleword integers
+ // Vector multiply packed unsigned doubleword integers.
PMULUDQ,
- // Vector multiply packed signed doubleword integers
+ // Vector multiply packed signed doubleword integers.
PMULDQ,
- // Vector Multiply Packed UnsignedIntegers with Round and Scale
+ // Vector Multiply Packed UnsignedIntegers with Round and Scale.
MULHRS,
- // Multiply and Add Packed Integers
+
+ // Multiply and Add Packed Integers.
VPMADDUBSW, VPMADDWD,
- // FMA nodes
+ VPMADD52L, VPMADD52H,
+
+ // FMA nodes.
FMADD,
FNMADD,
FMSUB,
FNMSUB,
FMADDSUB,
FMSUBADD,
- // FMA with rounding mode
+
+ // FMA with rounding mode.
FMADD_RND,
FNMADD_RND,
FMSUB_RND,
@@ -455,17 +486,20 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
- // Compress and expand
+ // Compress and expand.
COMPRESS,
EXPAND,
- //Convert Unsigned/Integer to Scalar Floating-Point Value
- //with rounding mode
+ // Convert Unsigned/Integer to Scalar Floating-Point Value
+ // with rounding mode.
SINT_TO_FP_RND,
UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
FP_TO_SINT_RND, FP_TO_UINT_RND,
+ // Scalar float/double to signed/unsigned integer.
+ SCALAR_FP_TO_SINT_RND, SCALAR_FP_TO_UINT_RND,
+
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
@@ -478,11 +512,9 @@ namespace llvm {
// falls back to heap allocation if not.
SEG_ALLOCA,
- // Memory barrier
+ // Memory barriers.
MEMBARRIER,
MFENCE,
- SFENCE,
- LFENCE,
// Store FP status word into i16 register.
FNSTSW16r,
@@ -497,19 +529,26 @@ namespace llvm {
// indicate whether it is valid in CF.
RDSEED,
+ // SSE42 string comparisons.
PCMPISTRI,
PCMPESTRI,
// Test if in transactional execution.
XTEST,
- // ERI instructions
+ // ERI instructions.
RSQRT28, RCP28, EXP2,
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
+ LCMPXCHG8_SAVE_EBX_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD, LSUB, LOR, LXOR, LAND,
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
@@ -551,10 +590,10 @@ namespace llvm {
VAARG_64
// WARNING: Do not add anything in the end unless you want the node to
- // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
- // thought as target memory ops!
+ // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+ // opcodes will be treated as target memory ops!
};
- }
+ } // end namespace X86ISD
/// Define some predicates that are used for node matching.
namespace X86 {
@@ -606,13 +645,12 @@ namespace llvm {
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement = true);
-
/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool isCalleePop(CallingConv::ID CallingConv,
- bool is64Bit, bool IsVarArg, bool TailCallOpt);
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
- }
+ } // end namespace X86
//===--------------------------------------------------------------------===//
// X86 Implementation of the TargetLowering interface
@@ -679,13 +717,20 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ /// Places new result values for the node in Results (their number
+ /// and types must exactly match those of the original return values of
+ /// the node), or leaves Results empty, which indicates that the node is not
+ /// to be custom lowered after all.
+ void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
-
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
/// Return true if the target has native support for
@@ -705,9 +750,8 @@ namespace llvm {
bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
-
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
@@ -716,6 +760,12 @@ namespace llvm {
bool isCheapToSpeculateCtlz() const override;
+ bool hasBitPreservingFPLogic(EVT VT) const override {
+ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+ }
+
+ bool hasAndNotCompare(SDValue Y) const override;
+
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -914,16 +964,21 @@ namespace llvm {
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ virtual bool needsFixedCatchObjects() const override;
+
/// This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
- /// Return true if the target stores stack protector cookies at a fixed
- /// offset in some non-standard address space, and populates the address
- /// space and offset as appropriate.
- bool getStackCookieLocation(unsigned &AddressSpace,
- unsigned &Offset) const override;
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+ bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+ Value *getSDagStackGuard(const Module &M) const override;
+ Value *getSSPStackGuardCheck(const Module &M) const override;
/// Return true if the target stores SafeStack pointer at a fixed offset in
/// some non-standard address space, and populates the address space and
@@ -935,21 +990,24 @@ namespace llvm {
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
- bool useLoadStackGuardNode() const override;
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool supportSwiftError() const override {
+ return true;
+ }
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const override;
private:
- /// Keep a pointer to the X86Subtarget around so that we can
+ /// Keep a reference to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
- const X86Subtarget *Subtarget;
+ const X86Subtarget &Subtarget;
/// Select between SSE or x87 floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -969,16 +1027,15 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue LowerMemArgument(SDValue Chain,
- CallingConv::ID CallConv,
+ SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
- SDLoc dl, SelectionDAG &DAG,
- const CCValAssign &VA, MachineFrameInfo *MFI,
- unsigned i) const;
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
@@ -997,12 +1054,15 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
- SDValue Chain, bool IsTailCall, bool Is64Bit,
- int FPDiff, SDLoc dl) const;
+ SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff,
+ const SDLoc &dl) const;
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
+ unsigned getAddressSpace(void) const;
+
std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool isSigned,
bool isReplace) const;
@@ -1017,7 +1077,7 @@ namespace llvm {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+ SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
int64_t Offset, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -1030,8 +1090,8 @@ namespace llvm {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToBT(SDValue And, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
+ SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1046,6 +1106,7 @@ namespace llvm {
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
@@ -1053,19 +1114,17 @@ namespace llvm {
SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -1080,8 +1139,8 @@ namespace llvm {
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
- EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
- ISD::NodeType ExtendKind) const override;
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
@@ -1101,57 +1160,60 @@ namespace llvm {
bool needsCmpXchgNb(Type *MemType) const;
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
+
// Utility function to emit the low-level va_arg code for X86-64.
- MachineBasicBlock *EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *
+ EmitVAARG64WithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
- MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
- MachineInstr *BInstr,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *
+ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+ MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
- MachineBasicBlock *BB) const;
-
- MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI,
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI,
+ MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr *MI,
+ MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
- SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl,
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const;
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent, for use with the given x86 condition code.
- SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl,
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const;
/// Convert a comparison if required by the subtarget.
@@ -1173,7 +1235,7 @@ namespace llvm {
namespace X86 {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
- }
-}
+ } // end namespace X86
+} // end namespace llvm
-#endif // X86ISELLOWERING_H
+#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 6f0199b015cd..de4129f86541 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -30,6 +30,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// Corresponding write-mask register class.
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+ // The mask VT.
+ ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
+ "v" # NumElts # "i1"));
+
// The GPR register class that can hold the write mask. Use GR8 for fewer
// than 8 elements. Use shift-right and equal to work around the lack of
// !lt in tablegen.
@@ -95,6 +99,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
"v" # NumElts # "f" # EltSize,
VTName)));
+ ValueType IntVT = !cast<ValueType>(
+ !if (!eq (!srl(EltSize,5),0),
+ VTName,
+ !if (!eq(TypeVariantName, "f"),
+ "v" # NumElts # "i" # EltSize,
+ VTName)));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -238,12 +248,12 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
InstrItinClass itin = NoItinerary,
- bit IsCommutable = 0> :
+ bit IsCommutable = 0, SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
"$src0 = $dst", itin, IsCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
@@ -258,8 +268,8 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
- "$src0 = $dst", itin, IsCommutable>;
+ (X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
+ X86selects, "$src0 = $dst", itin, IsCommutable>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -301,7 +311,8 @@ multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (X86select _.KRCWM:$mask, RHS, _.RC:$src1)>;
+ (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
+ X86selects>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
@@ -363,119 +374,58 @@ multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
AttSrcAsm, IntelSrcAsm, [],[]>;
// Bitcasts between 512-bit vector types. Return the original type since
-// no instruction is needed for the conversion
-let Predicates = [HasAVX512] in {
- def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
- def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
- def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
- def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
- def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
- def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
- def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
- def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
- def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
- def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
- def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
- def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
- def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
- def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
- def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
- def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
- def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
- def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
- def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
- def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
- def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
- def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
- def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
- def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
- def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
-
- def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
- def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
- def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
- def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
- def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
- def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
- def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
- def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
- def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
- def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
- def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
- def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
- def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
- def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
- def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
-
-// Bitcasts between 256-bit vector types. Return the original type since
-// no instruction is needed for the conversion
- def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>;
- def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>;
- def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>;
- def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
- def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>;
- def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>;
- def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>;
- def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>;
- def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>;
- def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
- def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>;
- def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>;
- def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>;
- def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>;
- def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
- def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>;
- def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>;
- def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>;
- def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>;
- def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
- def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>;
- def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
- def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>;
- def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>;
- def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>;
- def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>;
- def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>;
- def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>;
- def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>;
- def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
-}
-
-//
-// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros.
-//
-
+// no instruction is needed for the conversion.
+def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+
+// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasAVX512] in {
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
- [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+ [(set VR512:$dst, (v16i32 immAllZerosV))]>;
+def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
-let Predicates = [HasAVX512] in {
-def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
-def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
-def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, (v4i32 immAllZerosV))]>;
+def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
+ [(set VR256X:$dst, (v8i32 immAllZerosV))]>;
}
//===----------------------------------------------------------------------===//
@@ -483,7 +433,7 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
//
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
PatFrag vinsert_insert> {
- let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ let ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
@@ -492,7 +442,6 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
(From.VT From.RC:$src2),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
- let mayLoad = 1 in
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
@@ -615,19 +564,9 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 VECTOR EXTRACT
//---
-multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From,
- X86VectorVTInfo To> {
- // A subvector extract from the first vector position is
- // a subregister copy that needs no instruction.
- def NAME # To.NumElts:
- Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))),
- (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>;
-}
-
multiclass vextract_for_size<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
- PatFrag vextract_extract> :
- vextract_for_size_first_position_lowering<From, To> {
+ PatFrag vextract_extract> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
// use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
@@ -640,21 +579,22 @@ multiclass vextract_for_size<int Opcode,
[(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
(iPTR imm)))]>,
AVX512AIi8Base, EVEX;
- let mayStore = 1 in {
- def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
- (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2),
- "vextract" # To.EltTypeName # "x" # To.NumElts #
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX;
-
- def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs),
- (ins To.MemOp:$dst, To.KRCWM:$mask,
- From.RC:$src1, i32u8imm:$src2),
- "vextract" # To.EltTypeName # "x" # To.NumElts #
- "\t{$src2, $src1, $dst {${mask}}|"
- "$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K, EVEX;
- }//mayStore = 1
+ def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst|$dst, $src1, $idx}",
+ [(store (To.VT (vextract_extract:$idx
+ (From.VT From.RC:$src1), (iPTR imm))),
+ addr:$dst)]>, EVEX;
+
+ let mayStore = 1, hasSideEffects = 0 in
+ def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, To.KRCWM:$mask,
+ From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$idx, $src1, $dst {${mask}}|"
+ "$dst {${mask}}, $src1, $idx}",
+ []>, EVEX_K, EVEX;
}
// Intrinsic call with masking.
@@ -688,14 +628,17 @@ multiclass vextract_for_size<int Opcode,
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vextract_extract,
- SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> :
- vextract_for_size_first_position_lowering<From, To> {
-
- let Predicates = p in
+ SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> {
+ let Predicates = p in {
def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
(To.VT (!cast<Instruction>(InstrStr#"rr")
From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
+ def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
+ }
}
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
@@ -756,6 +699,12 @@ defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+// Codegen pattern with the alternative types extract VEC128 from VEC256
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+
// Codegen pattern with the alternative types extract VEC128 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
@@ -767,46 +716,76 @@ defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+// A 128-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+ (v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+ (v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>;
+
+// A 256-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>;
+def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>;
+
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
// A 128-bit subvector insert to the first 512-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- sub_ymm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
- (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- sub_ymm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- sub_ymm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- sub_ymm)>;
-
-def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+
+// A 256-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)),
+def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)),
+def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+}
// vextractps - extract 32 bits from XMM
-def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
EVEX;
-def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
@@ -815,90 +794,107 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
//---
+// broadcast with a scalar argument.
+multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+
+ let isCodeGenOnly = 1 in {
+ def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}",
+ [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX;
+
+ let Constraints = "$src0 = $dst" in
+ def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+ OpcodeStr#"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+ [(set DestInfo.RC:$dst,
+ (vselect DestInfo.KRCWM:$mask,
+ (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ DestInfo.RC:$src0))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K;
+
+ def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+ (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+ OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ [(set DestInfo.RC:$dst,
+ (vselect DestInfo.KRCWM:$mask,
+ (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ DestInfo.ImmAllZerosV))]>,
+ Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ;
+ } // let isCodeGenOnly = 1 in
+}
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
-
+ let ExeDomain = DestInfo.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
T8PD, EVEX;
- let mayLoad = 1 in
- defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
- (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src)))>,
- T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
-}
+ defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast
+ (SrcInfo.ScalarLdFrag addr:$src)))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
+ }
-multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr,
+ def : Pat<(DestInfo.VT (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
+ DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
+ let AddedComplexity = 30 in
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)
+ DestInfo.KRCWM:$mask, addr:$src)>;
+}
+
+multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
EVEX_V256;
}
}
-let ExeDomain = SSEPackedSingle in {
- defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss",
- avx512vl_f32_info>;
- let Predicates = [HasVLX] in {
- defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss",
- v4f32x_info, v4f32x_info>, EVEX_V128;
- }
-}
-
-let ExeDomain = SSEPackedDouble in {
- defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd",
- avx512vl_f64_info>, VEX_W;
-}
-
-// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument.
-// Later, we can canonize broadcast instructions before ISel phase and
-// eliminate additional patterns on ISel.
-// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar
-// representations of source
-multiclass avx512_broadcast_pat<string InstName, SDNode OpNode,
- X86VectorVTInfo _, RegisterClass SrcRC_v,
- RegisterClass SrcRC_s> {
- def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))),
- (!cast<Instruction>(InstName##"r")
- (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
-
- let AddedComplexity = 30 in {
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)),
- (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask,
- (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
- def : Pat<(_.VT(vselect _.KRCWM:$mask,
- (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)),
- (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask,
- (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>;
+ let Predicates = [HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
+ EVEX_V128;
}
}
-
-defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info,
- VR128X, FR32X>;
-defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info,
- VR128X, FR64X>;
-
-let Predicates = [HasVLX] in {
- defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast,
- v8f32x_info, VR128X, FR32X>;
- defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast,
- v4f32x_info, VR128X, FR32X>;
- defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast,
- v4f64x_info, VR128X, FR64X>;
-}
-
-def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSZm addr:$src)>;
-def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDZm addr:$src)>;
+defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
+ avx512vl_f32_info>;
+defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
+ avx512vl_f64_info>, VEX_W;
def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
(VBROADCASTSSZm addr:$src)>;
@@ -907,9 +903,10 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
RegisterClass SrcRC> {
- defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins SrcRC:$src), "vpbroadcast"##_.Suffix,
- "$src", "$src", []>, T8PD, EVEX;
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src),
+ "vpbroadcast"##_.Suffix, "$src", "$src",
+ (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
}
multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
@@ -922,10 +919,18 @@ multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
}
}
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32,
+let isCodeGenOnly = 1 in {
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32,
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
HasBWI>;
+}
+let isAsmParserOnly = 1 in {
+ defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+ GR32, HasBWI>;
+ defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+ GR32, HasBWI>;
+}
defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
HasAVX512>;
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
@@ -933,27 +938,9 @@ defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
(VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
-
def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
(VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
-def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
- (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
- (VPBROADCASTQrZr GR64:$src)>;
-
-def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
- (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
- (VPBROADCASTQrZr GR64:$src)>;
-
-def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
- (v16i32 immAllZerosV), (i16 GR16:$mask))),
- (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
-def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
- (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
- (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
-
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
@@ -992,12 +979,11 @@ defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
- let mayLoad = 1 in
- defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
- (_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
- AVX5128IBase, EVEX;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (X86SubVBroadcast
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ AVX5128IBase, EVEX;
}
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
@@ -1044,45 +1030,29 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
-multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _Dst, X86VectorVTInfo _Src,
- SDNode OpNode = X86SubVBroadcast> {
-
- defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>,
- T8PD, EVEX;
- let mayLoad = 1 in
- defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (_Dst.VT (OpNode
- (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>,
- T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>;
-}
-
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _> {
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
- defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
- defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,
EVEX_V256;
}
multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _> :
- avx512_common_broadcast_32x2<opc, OpcodeStr, _> {
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
+ avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
- defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128,
- X86SubV32x2Broadcast>, EVEX_V128;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,
+ EVEX_V128;
}
defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
- avx512vl_i32_info>;
+ avx512vl_i32_info, avx512vl_i64_info>;
defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
- avx512vl_f32_info>;
+ avx512vl_f32_info, avx512vl_f64_info>;
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
@@ -1094,14 +1064,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
(VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
- (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
-def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
- (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
//---
@@ -1112,7 +1074,7 @@ multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
[(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
}
-multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
let Predicates = [HasCDI] in
defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
@@ -1138,7 +1100,6 @@ let Constraints = "$src1 = $dst" in {
(_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
AVX5128IBase;
- let mayLoad = 1 in
defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -1149,7 +1110,7 @@ let Constraints = "$src1 = $dst" in {
}
multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
- let mayLoad = 1, Constraints = "$src1 = $dst" in
+ let Constraints = "$src1 = $dst" in
defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
@@ -1178,13 +1139,14 @@ multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
}
}
-multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo Idx> {
- let Predicates = [HasBWI] in
+ AVX512VLVectorVTInfo Idx,
+ Predicate Prd> {
+ let Predicates = [Prd] in
defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
Idx.info512>, EVEX_V512;
- let Predicates = [HasBWI, HasVLX] in {
+ let Predicates = [Prd, HasVLX] in {
defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
Idx.info128>, EVEX_V128;
defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
@@ -1196,8 +1158,12 @@ defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w",
- avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
@@ -1213,7 +1179,6 @@ let Constraints = "$src1 = $dst" in {
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
AVX5128IBase;
- let mayLoad = 1 in
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -1224,7 +1189,7 @@ let Constraints = "$src1 = $dst" in {
}
multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
- let mayLoad = 1, Constraints = "$src1 = $dst" in
+ let Constraints = "$src1 = $dst" in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
@@ -1253,13 +1218,14 @@ multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
}
}
-multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo Idx> {
- let Predicates = [HasBWI] in
+ AVX512VLVectorVTInfo Idx,
+ Predicate Prd> {
+ let Predicates = [Prd] in
defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
Idx.info512>, EVEX_V512;
- let Predicates = [HasBWI, HasVLX] in {
+ let Predicates = [Prd, HasVLX] in {
defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
Idx.info128>, EVEX_V128;
defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
@@ -1271,8 +1237,12 @@ defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w",
- avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w",
+ avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b",
+ avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ EVEX_CD8<8, CD8VF>;
defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
@@ -1283,6 +1253,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
//
multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
+ let hasSideEffects = 0 in
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
@@ -1292,14 +1263,16 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
- (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K;
+ [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
+ let hasSideEffects = 0 in
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ;
- let mayLoad = 1 in {
+ let mayLoad = 1, hasSideEffects = 0 in
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
@@ -1309,16 +1282,17 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (_.VT _.RC:$src1)))]>,
EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
}
- }
}
multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
@@ -1327,10 +1301,12 @@ multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
+ [(set _.RC:$dst,(vselect _.KRCWM:$mask,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1)))]>,
EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
@@ -1373,7 +1349,7 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
(v8f32 VR256X:$src2))),
(EXTRACT_SUBREG
@@ -1404,15 +1380,14 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)>, EVEX_4V;
- let mayLoad = 1 in
- defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
@@ -1432,7 +1407,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
"$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
@@ -1454,16 +1429,15 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
_.FRC:$src2,
imm:$cc))],
IIC_SSE_ALU_F32S_RR>, EVEX_4V;
- let mayLoad = 1 in
- def rm : AVX512Ii8<0xC2, MRMSrcMem,
- (outs _.KRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2),
- imm:$cc))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs _.KRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
}
}
@@ -1481,7 +1455,6 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
- let mayLoad = 1 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -1495,7 +1468,6 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
- let mayLoad = 1 in
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -1510,7 +1482,6 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> :
avx512_icmp_packed<opc, OpcodeStr, OpNode, _> {
- let mayLoad = 1 in {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
@@ -1529,7 +1500,6 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
- }
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -1612,7 +1582,6 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
- let mayLoad = 1 in
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
@@ -1631,7 +1600,6 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
- let mayLoad = 1 in
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
@@ -1774,25 +1742,23 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> {
(_.VT _.RC:$src2),
imm:$cc)>;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)>;
-
- defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,EVEX_B;
- }
+ defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)>;
+
+ defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>,EVEX_B;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
@@ -1888,10 +1854,10 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
- let mayLoad = 1, AddedComplexity = 20 in {
+ let AddedComplexity = 20 in {
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
@@ -1903,7 +1869,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
}
@@ -1924,51 +1890,49 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
- let mayLoad = 1 in {
- def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode
- (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>;
- def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
- (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
- def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst|$dst, ${src1}"
- ##_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(OpNode
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>,EVEX_B;
- def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
- _.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>,
- EVEX_B, EVEX_K;
- }
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst|$dst, ${src1}"
+ ##_.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+ def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
+ _.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>,
+ EVEX_B, EVEX_K;
}
multiclass avx512_vector_fpclass_all<string OpcodeStr,
- AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
string broadcast>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
broadcast>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
@@ -1981,9 +1945,9 @@ multiclass avx512_vector_fpclass_all<string OpcodeStr,
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
- defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
+ defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
- defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
+ defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W;
defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
@@ -2003,18 +1967,15 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, X86MemOperand x86memop> {
- let hasSideEffects = 0 in {
- def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
- let mayLoad = 1 in
- def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (vvt (load addr:$src)))]>;
- let mayStore = 1 in
- def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store KRC:$src, addr:$dst)]>;
- }
+ let hasSideEffects = 0 in
+ def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (vvt (load addr:$src)))]>;
+ def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store KRC:$src, addr:$dst)]>;
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
@@ -2043,9 +2004,6 @@ let Predicates = [HasBWI] in {
VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
VEX, XD;
-}
-
-let Predicates = [HasBWI] in {
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
@@ -2058,12 +2016,20 @@ let Predicates = [HasDQI] in {
(KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>;
def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
(EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>;
+ def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVBrk VK8:$src)>;
+ def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVBrk VK8:$src)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
(KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
(EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+ def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (KMOVWrk VK16:$src)>;
+ def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (KMOVWrk VK16:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>;
@@ -2085,20 +2051,45 @@ let Predicates = [HasDQI] in {
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
def : Pat<(store VK2:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
+ def : Pat<(store VK1:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+
+ def : Pat<(v2i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
+ def : Pat<(v4i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
}
let Predicates = [HasAVX512, NoDQI] in {
- def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
- (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
- def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
- (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+ def : Pat<(store VK1:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+ sub_8bit))>;
+ def : Pat<(store VK2:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
+ sub_8bit))>;
+ def : Pat<(store VK4:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
+ sub_8bit))>;
+ def : Pat<(store VK8:$src, addr:$dst),
+ (MOV8mr addr:$dst,
+ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+ sub_8bit))>;
+
+ def : Pat<(v8i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
+ def : Pat<(v2i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
+ def : Pat<(v4i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
}
+
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
def : Pat<(i1 (load addr:$src)),
- (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0),
- (MOV8rm addr:$src), sub_8bit)),
- (i16 1)), VK1)>;
+ (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
}
@@ -2107,51 +2098,71 @@ let Predicates = [HasBWI] in {
(KMOVDmk addr:$dst, VK32:$src)>;
def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
(KMOVDkm addr:$src)>;
-}
-let Predicates = [HasBWI] in {
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
(KMOVQmk addr:$dst, VK64:$src)>;
def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
(KMOVQkm addr:$src)>;
}
+def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
+}]>;
+
let Predicates = [HasAVX512] in {
def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit),
- (i32 1))), VK1)>;
+ (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)),
+ sub_16bit)), VK1)>;
+
+ def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))),
+ (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>;
+ (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)),
+ sub_16bit)), VK1)>;
+
+ def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
+ (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
def : Pat<(i1 (trunc (i8 GR8:$src))),
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
- VK1)>;
+ (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri8 $src, (i8 1)),
+ sub_8bit)), VK1)>;
+
+ def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))),
+ (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
+
def : Pat<(i1 (trunc (i16 GR16:$src))),
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
- VK1)>;
+ (COPY_TO_REGCLASS (AND16ri GR16:$src, (i16 1)), VK1)>;
+
+ def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))),
+ (COPY_TO_REGCLASS $src, VK1)>;
def : Pat<(i32 (zext VK1:$src)),
- (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+ (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
+ sub_16bit))>;
+
def : Pat<(i32 (anyext VK1:$src)),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
+ (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
+ sub_16bit))>;
def : Pat<(i8 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri (KMOVWrk
- (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+ (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>;
+
def : Pat<(i8 (anyext VK1:$src)),
- (EXTRACT_SUBREG
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
+ (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>;
def : Pat<(i64 (zext VK1:$src)),
- (AND64ri8 (SUBREG_TO_REG (i64 0),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+ (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
+ sub_16bit))>;
+
+ def : Pat<(i64 (anyext VK1:$src)),
+ (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
+ sub_16bit))>;
+
def : Pat<(i16 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
- sub_16bit)>;
+ (COPY_TO_REGCLASS $src, GR16)>;
+
+ def : Pat<(i16 (anyext VK1:$src)),
+ (i16 (COPY_TO_REGCLASS $src, GR16))>;
}
def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
(COPY_TO_REGCLASS VK1:$src, VK16)>;
@@ -2166,17 +2177,24 @@ def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
(COPY_TO_REGCLASS VK1:$src, VK64)>;
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
// With AVX-512 only, an 8-bit mask is promoted to a 16-bit mask.
let Predicates = [HasAVX512, NoDQI] in {
// GR from/to 8-bit mask without native support
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
(COPY_TO_REGCLASS
- (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>;
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), VK8)>;
def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
(EXTRACT_SUBREG
(KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
sub_8bit)>;
+ def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>;
+ def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>;
}
let Predicates = [HasAVX512] in {
@@ -2419,7 +2437,6 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
VEX, TAPD, VEX_W;
- let Predicates = [HasDQI] in
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
VEX, TAPD;
}
@@ -2456,82 +2473,61 @@ let Predicates = [HasAVX512] in {
def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
}
-def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
- (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
-def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
- (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+// Patterns for kmask insert_subvector/extract_subvector to/from index=0
+multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
+ RegisterClass RC, ValueType VT> {
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+ (subVT (COPY_TO_REGCLASS RC:$src, subRC))>;
-def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
- (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+ (VT (COPY_TO_REGCLASS subRC:$src, RC))>;
+}
-def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
- (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK2, v2i1, VK64, v64i1>;
-def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
- (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK4, v4i1, VK64, v64i1>;
-def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
- (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK8, v8i1, VK64, v64i1>;
-def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
- (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
-def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
- (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
-
-def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
- (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
-
-def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
- (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>;
-
-def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
- (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
-def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
- (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
-
-def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
- (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>;
-def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
- (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>;
-def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
- (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>;
-def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
- (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>;
-
-def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
- (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>;
-def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
- (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>;
-def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
- (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>;
-def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
- (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>;
-def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))),
- (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>;
+defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
+def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
+ (v2i1 (COPY_TO_REGCLASS
+ (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
+ VK2))>;
+def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))),
+ (v4i1 (COPY_TO_REGCLASS
+ (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
+ VK4))>;
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+ (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
+ (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
(v8i1 (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16),
(I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
-def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
- (v8i1 (COPY_TO_REGCLASS
- (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16),
- (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
-
def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))),
(v4i1 (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16),
(I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
-
-def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))),
- (v4i1 (COPY_TO_REGCLASS
- (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16),
- (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
-
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
@@ -2539,7 +2535,8 @@ def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))),
multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag ld_frag, PatFrag mload,
- bit IsReMaterializable = 1> {
+ bit IsReMaterializable = 1,
+ SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
@@ -2547,7 +2544,10 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
- "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>,
+ "${dst} {${mask}} {z}, $src}"),
+ [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ (_.VT _.RC:$src),
+ _.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable,
@@ -2562,11 +2562,11 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
"${dst} {${mask}}, $src1}"),
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src1),
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K;
- let mayLoad = 1, SchedRW = [WriteLoad] in
+ let SchedRW = [WriteLoad] in
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -2576,7 +2576,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(_.VT (bitconvert (ld_frag addr:$src1))),
(_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
}
- let mayLoad = 1, SchedRW = [WriteLoad] in
+ let SchedRW = [WriteLoad] in
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
@@ -2615,22 +2615,27 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd,
- bit IsReMaterializable = 1> {
+ bit IsReMaterializable = 1,
+ SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
- masked_load_unaligned, IsReMaterializable>, EVEX_V512;
+ masked_load_unaligned, IsReMaterializable,
+ SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
- masked_load_unaligned, IsReMaterializable>, EVEX_V256;
+ masked_load_unaligned, IsReMaterializable,
+ SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
- masked_load_unaligned, IsReMaterializable>, EVEX_V128;
+ masked_load_unaligned, IsReMaterializable,
+ SelectOprr>, EVEX_V128;
}
}
multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag, PatFrag mstore> {
+ let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
[], _.ExeDomain>, EVEX;
@@ -2644,8 +2649,8 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
[], _.ExeDomain>, EVEX, EVEX_KZ;
+ }
- let mayStore = 1 in {
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;
@@ -2653,7 +2658,6 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K;
- }
def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
@@ -2699,32 +2703,16 @@ defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>,
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
+ 1, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
PS, EVEX_CD8<32, CD8VF>;
-defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>,
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0,
+ null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
- GR16:$mask),
- (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
- VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
- GR8:$mask),
- (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
- VR512:$src)>;
-
-def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src),
- GR16:$mask),
- (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
- VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
- GR8:$mask),
- (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
- VR512:$src)>;
-
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
@@ -2743,50 +2731,159 @@ defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
-defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>,
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+ 1, null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
-defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>,
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+ 1, null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
-def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
- (v16i32 immAllZerosV), GR16:$mask)),
- (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
-
-def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
- (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
- (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
-
-def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
- GR16:$mask),
- (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
- VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
- GR8:$mask),
- (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
- VR512:$src)>;
-
-let AddedComplexity = 20 in {
-def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
- (bc_v8i64 (v16i32 immAllZerosV)))),
- (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>;
-
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
- (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+ (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
VK8), VR512:$src)>;
-def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
- (v16i32 immAllZerosV))),
- (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>;
-
def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
(v16i32 VR512:$src))),
- (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+ (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
+// These patterns exist to prevent the above patterns from introducing a second
+// mask inversion when one already exists.
+def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
+ (bc_v8i64 (v16i32 immAllZerosV)),
+ (v8i64 VR512:$src))),
+ (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
+def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
+ (v16i32 immAllZerosV),
+ (v16i32 VR512:$src))),
+ (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+
+let Predicates = [HasVLX] in {
+ // Special patterns for storing subvector extracts of lower 128-bits of 256.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+ // Special patterns for storing subvector extracts of lower 128-bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+ // Special patterns for storing subvector extracts of lower 256-bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+ def : Pat<(alignedstore (v4f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore (v8f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore (v4i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore (v8i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore (v16i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(alignedstore (v32i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+
+ def : Pat<(store (v4f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v8f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v4i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v8i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v16i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+ def : Pat<(store (v32i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
}
+
// Move Int Doubleword to Packed Double Int
//
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
@@ -2910,45 +3007,43 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar <string asm, SDNode OpNode,
+multiclass avx512_move_scalar <string asm, SDNode OpNode,
X86VectorVTInfo _> {
- defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
- asm, "$src2, $src1","$src1, $src2",
+ asm, "$src2, $src1","$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2))),
IIC_SSE_MOV_S_RR>, EVEX_4V;
- let Constraints = "$src1 = $dst" , mayLoad = 1 in
+ let Constraints = "$src1 = $dst" in
defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
- (outs _.RC:$dst),
+ (outs _.RC:$dst),
(ins _.ScalarMemOp:$src),
asm,"$src","$src",
- (_.VT (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src)))))>, EVEX;
let isCodeGenOnly = 1 in {
- def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
(scalar_to_vector _.FRC:$src2))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
- let mayLoad = 1 in
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
_.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
}
- let mayStore = 1 in {
- def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
- EVEX;
- def mrk: AVX512PI<0x11, MRMDestMem, (outs),
- (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
- !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
- } // mayStore
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
+ EVEX;
+ let mayStore = 1 in
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
@@ -2957,11 +3052,11 @@ defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
-def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
@@ -2969,11 +3064,13 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+let hasSideEffects = 0 in
defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
"vmovss.s", "$src2, $src1", "$src1, $src2", []>,
XS, EVEX_4V, VEX_LIG;
+let hasSideEffects = 0 in
defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
(outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
"vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
@@ -3037,6 +3134,22 @@ let Predicates = [HasAVX512] in {
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 512-bit types
+ def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
@@ -3064,9 +3177,6 @@ let Predicates = [HasAVX512] in {
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
- def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
@@ -3138,14 +3248,21 @@ def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
EVEX_CD8<8, CD8VT8>;
let Predicates = [HasAVX512] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
+
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
+
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+ }
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
- (VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
- (VMOVDI2PDIZrr GR32:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
@@ -3157,15 +3274,18 @@ let Predicates = [HasAVX512] in {
(VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVZPQILo2PQIZrm addr:$src)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+ // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+ def : Pat<(v8i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
}
def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
@@ -3190,66 +3310,112 @@ let SchedRW = [WriteLoad] in {
SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;
- let Predicates = [HasAVX512, HasVLX] in {
+ let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
- (ins i256mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}", [],
- SSEPackedInt>, EVEX, T8PD, EVEX_V256,
- EVEX_CD8<64, CD8VF>;
+ (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+ EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
- (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}", [],
- SSEPackedInt>, EVEX, T8PD, EVEX_V128,
- EVEX_CD8<64, CD8VF>;
+ (ins i128mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
+ SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+ EVEX_CD8<64, CD8VF>;
}
}
-multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
- ValueType OpVT, RegisterClass RC, X86MemOperand memop,
- Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
- let SchedRW = [WriteStore], mayStore = 1,
- AddedComplexity = 400 in
- def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ PatFrag st_frag = alignednontemporalstore,
+ InstrItinClass itin = IIC_SSE_MOVNT> {
+ let SchedRW = [WriteStore], AddedComplexity = 400 in
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
+ [(st_frag (_.VT _.RC:$src), addr:$dst)],
+ _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
}
-multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
- string elty, string elsz, string vsz512,
- string vsz256, string vsz128, Domain d,
- Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
- let Predicates = [prd] in
- defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
- !cast<ValueType>("v"##vsz512##elty##elsz), VR512,
- !cast<X86MemOperand>(elty##"512mem"), d, itin>,
- EVEX_V512;
-
- let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
- !cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
- !cast<X86MemOperand>(elty##"256mem"), d, itin>,
- EVEX_V256;
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
- defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
- !cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
- !cast<X86MemOperand>(elty##"128mem"), d, itin>,
- EVEX_V128;
- }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
+
+let Predicates = [HasAVX512], AddedComplexity = 400 in {
+ def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
+ (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+
+ def : Pat<(v8f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v8i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+ def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZrm addr:$src)>;
+}
+
+let Predicates = [HasVLX], AddedComplexity = 400 in {
+ def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
+ (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+ def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ256rm addr:$src)>;
+
+ def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
+ (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
+ def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAZ128rm addr:$src)>;
}
-defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
- "i", "64", "8", "4", "2", SSEPackedInt,
- HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
-
-defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
- "f", "64", "8", "4", "2", SSEPackedDouble,
- HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
- "f", "32", "16", "8", "4", SSEPackedSingle,
- HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
-
//===----------------------------------------------------------------------===//
// AVX-512 - Integer arithmetic
//
@@ -3263,30 +3429,28 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
- let mayLoad = 1 in
- defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
}
multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, OpndItins itins,
bit IsCommutable = 0> :
avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
- let mayLoad = 1 in
- defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V, EVEX_B;
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
}
multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3380,7 +3544,8 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
SDNode OpNode,X86VectorVTInfo _Src,
- X86VectorVTInfo _Dst, bit IsCommutable = 0> {
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
+ bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1","$src1, $src2",
@@ -3389,26 +3554,24 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Src.VT _Src.RC:$src2))),
itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
- let mayLoad = 1 in {
- defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V;
-
- defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
- OpcodeStr,
- "${src2}"##_Dst.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Dst.BroadcastStr,
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Dst.VT (X86VBroadcast
- (_Dst.ScalarLdFrag addr:$src2)))))),
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2)))),
itins.rm>,
- AVX512BIBase, EVEX_4V, EVEX_B;
- }
+ AVX512BIBase, EVEX_4V;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Brdct.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Dst.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Brdct.VT (X86VBroadcast
+ (_Brdct.ScalarLdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
}
defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
@@ -3439,39 +3602,46 @@ defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
SSE_INTALU_ITINS_P, HasBWI, 1>;
multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
- SDNode OpNode, bit IsCommutable = 0> {
-
- defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
- v16i32_info, v8i64_info, IsCommutable>,
- EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
- let Predicates = [HasVLX] in {
+ AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ v8i64_info, IsCommutable>,
+ EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+ let Predicates = [HasVLX, prd] in {
defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
- v8i32x_info, v4i64x_info, IsCommutable>,
- EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ v4i64x_info, IsCommutable>,
+ EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
- v4i32x_info, v2i64x_info, IsCommutable>,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ v2i64x_info, IsCommutable>,
EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
}
}
defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
- X86pmuldq, 1>,T8PD;
+ avx512vl_i32_info, avx512vl_i64_info,
+ X86pmuldq, HasAVX512, 1>,T8PD;
defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
- X86pmuludq, 1>;
+ avx512vl_i32_info, avx512vl_i64_info,
+ X86pmuludq, HasAVX512, 1>;
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
+ avx512vl_i8_info, avx512vl_i8_info,
+ X86multishift, HasVBMI, 0>, T8PD;
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
- let mayLoad = 1 in {
- defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
- OpcodeStr,
- "${src2}"##_Src.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Src.BroadcastStr,
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2))))))>,
- EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
- }
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr,
+ "${src2}"##_Src.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_Src.BroadcastStr,
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Src.VT (X86VBroadcast
+ (_Src.ScalarLdFrag addr:$src2))))))>,
+ EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
}
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
@@ -3484,23 +3654,22 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
- let mayLoad = 1 in {
- defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2))))>,
- EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
- }
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (bitconvert (_Src.LdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
}
multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
+ let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
v32i16_info>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
v32i16_info>, EVEX_V512;
- let Predicates = [HasVLX] in {
+ let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
v16i16x_info>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
@@ -3513,9 +3682,10 @@ multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
}
multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
+ let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
v64i8_info>, EVEX_V512;
- let Predicates = [HasVLX] in {
+ let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
v32i8x_info>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
@@ -3526,9 +3696,10 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
SDNode OpNode, AVX512VLVectorVTInfo _Src,
AVX512VLVectorVTInfo _Dst> {
+ let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
_Dst.info512>, EVEX_V512;
- let Predicates = [HasVLX] in {
+ let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
_Dst.info256>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
@@ -3536,17 +3707,15 @@ multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
}
}
-let Predicates = [HasBWI] in {
- defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD;
- defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD;
- defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W;
- defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W;
+defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase;
+defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, AVX5128IBase;
+defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase;
+defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
- defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
- avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
- defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
- avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
-}
+defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+ avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
@@ -3603,7 +3772,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
itins.rr, IsCommutable>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(VecNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
@@ -3620,7 +3789,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rr>;
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
}
}
@@ -3677,8 +3846,41 @@ defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S
defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>;
defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>;
-defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>;
+defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+
+// MIN/MAX nodes are commutable under "unsafe-fp-math". In that case we use
+// X86fminc and X86fmaxc instead of X86fmin and X86fmax.
+multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
+  let isCodeGenOnly = 1, isCommutable = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr>;
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ }
+}
+defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
+ SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
+ SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+
+defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
+ SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+
+defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
+ SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
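On the comment above: x86 MINSS/MAXSS return the second source operand whenever the operands are unordered (and for the +0.0/-0.0 pair), so the operation is not commutative in general; only when fast-math allows NaN and signed-zero ordering to be ignored can the operands be swapped, which is what the commutable X86fminc/X86fmaxc nodes model. A minimal scalar sketch of that asymmetry (illustrative helper name, standard C only):

#include <math.h>
#include <stdio.h>

/* Scalar model of MINSS: every comparison involving NaN is false, so the
 * second operand is returned and min(NaN, x) differs from min(x, NaN). */
static float minss_model(float a, float b) {
  return (a < b) ? a : b;
}

int main(void) {
  float qnan = nanf("");
  printf("minss(NaN, 1.0) = %f\n", minss_model(qnan, 1.0f)); /* 1.000000 */
  printf("minss(1.0, NaN) = %f\n", minss_model(1.0f, qnan)); /* nan */
  return 0;
}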
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, bit IsCommutable> {
@@ -3686,19 +3888,17 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V;
- let mayLoad = 1 in {
- defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V;
- defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
- EVEX_4V, EVEX_B;
- }//let mayLoad = 1
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))>,
+ EVEX_4V, EVEX_B;
}
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
@@ -3721,16 +3921,18 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- bit IsCommutable = 0> {
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
+ }
// Define only if AVX512VL feature is present.
- let Predicates = [HasVLX] in {
+ let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
@@ -3760,24 +3962,26 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>,
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>,
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
-let Predicates = [HasDQI] in {
- defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>;
- defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>;
- defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>;
- defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>;
+let isCodeGenOnly = 1 in {
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, 1>;
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, 1>;
}
+defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, HasDQI, 1>;
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, HasDQI, 0>;
+defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, HasDQI, 1>;
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, HasDQI, 1>;
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
@@ -3785,19 +3989,17 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
- let mayLoad = 1 in {
- defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
- defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
- EVEX_4V, EVEX_B;
- }//let mayLoad = 1
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
+ EVEX_4V, EVEX_B;
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3806,26 +4008,26 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
- let mayLoad = 1 in {
- defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>;
- }//let mayLoad = 1
+ defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1,
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
}
-multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>,
+ defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>,
+ defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
@@ -3840,7 +4042,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD;
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
@@ -3848,12 +4050,12 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD;
multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
EVEX_4V;
- let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -3865,7 +4067,6 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let mayLoad = 1 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -3874,8 +4075,22 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.ScalarLdFrag addr:$src2))))>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
+
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is not available.
+multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _, string Suffix> {
+ def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME # Suffix # "Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src1, _.SubRegIdx),
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src2, _.SubRegIdx)),
+ _.KRC))>;
+}
+
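These patterns are a widening trick: insert the 128/256-bit value into an undefined 512-bit register, run the 512-bit test, and keep only the mask bits for the original lanes. A rough C-level analogue, assuming only the standard AVX-512F intrinsics (helper name illustrative):

#include <immintrin.h>

/* vptestmd on a 256-bit input without AVX512VL: the upper 256 bits are
 * undefined after the cast, but their mask bits are simply discarded. */
__mmask8 testm_epi32_256(__m256i a, __m256i b) {
  __m512i wa = _mm512_castsi256_si512(a);
  __m512i wb = _mm512_castsi256_si512(b);
  return (__mmask8)_mm512_test_epi32_mask(wa, wb);
}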
multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
+ AVX512VLVectorVTInfo _, string Suffix> {
let Predicates = [HasAVX512] in
defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
@@ -3886,13 +4101,17 @@ multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
}
+ let Predicates = [HasAVX512, NoVLX] in {
+ defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
+ defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
+ }
}
multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
- avx512vl_i32_info>;
+ avx512vl_i32_info, "D">;
defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
- avx512vl_i64_info>, VEX_W;
+ avx512vl_i64_info, "Q">, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
@@ -3914,6 +4133,14 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
EVEX_V128;
}
+
+ let Predicates = [HasAVX512, NoVLX] in {
+ defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
+ defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
+ defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
+ defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
+ }
+
}
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
@@ -3924,13 +4151,6 @@ multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string Opcode
defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
-def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
- (v16i32 VR512:$src2), (i16 -1))),
- (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>;
-
-def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1),
- (v8i64 VR512:$src2), (i8 -1))),
- (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
@@ -3942,7 +4162,6 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
SSE_INTSHIFT_ITINS_P.rr>;
- let mayLoad = 1 in
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -3953,7 +4172,6 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
- let mayLoad = 1 in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
@@ -4073,7 +4291,6 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
- let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4085,7 +4302,6 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let mayLoad = 1 in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -4117,20 +4333,20 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
avx512vl_i64_info>, VEX_W;
}
-// Use 512bit version to implement 128/256 bit in case NoVLX.
+// Use 512bit version to implement 128/256 bit in case NoVLX.
multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
let Predicates = [HasBWI, NoVLX] in {
- def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
(_.info256.VT _.info256.RC:$src2))),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG
(!cast<Instruction>(NAME#"WZrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
- def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
(_.info128.VT _.info128.RC:$src2))),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG
(!cast<Instruction>(NAME#"WZrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
@@ -4155,9 +4371,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
avx512_var_shift_w<0x12, "vpsllvw", shl>,
avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
avx512_var_shift_w<0x11, "vpsravw", sra>,
avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+let isCodeGenOnly = 1 in
+ defm VPSRAV_Int : avx512_var_shift_types<0x46, "vpsrav", X86vsrav>,
+ avx512_var_shift_w<0x11, "vpsravw", X86vsrav>;
+
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
avx512_var_shift_w<0x10, "vpsrlvw", srl>,
avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
@@ -4193,8 +4414,24 @@ multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
VTInfo.info256>, EVEX_V256;
}
+multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
+ Predicate prd, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [prd] in
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512 ;
+ let Predicates = [HasVLX, prd] in {
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256 ;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128 ;
+ }
+}
-defm VPERM : avx512_var_shift_w<0x8D, "vpermw", X86VPermv>;
+defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
+ avx512vl_i16_info>, VEX_W;
+defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
+ avx512vl_i8_info>;
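VPERMW and VPERMB are full cross-lane element permutes (words under BWI, bytes under VBMI): each destination element is selected from anywhere in the source by the corresponding index element. A minimal usage sketch with the matching Intel intrinsic (illustrative helper name):

#include <immintrin.h>

/* Each destination byte comes from anywhere in the 64-byte source, picked by
 * the low 6 bits of the corresponding index byte (requires AVX512VBMI). */
__m512i permute_bytes(__m512i idx, __m512i src) {
  return _mm512_permutexvar_epi8(idx, src);
}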
defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
avx512vl_i32_info>;
@@ -4212,7 +4449,7 @@ defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
X86VPermi, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
//===----------------------------------------------------------------------===//
-// AVX-512 - VPERMIL
+// AVX-512 - VPERMIL
//===----------------------------------------------------------------------===//
multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
@@ -4223,24 +4460,22 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1,
(Ctrl.VT Ctrl.RC:$src2)))>,
T8PD, EVEX_4V;
- let mayLoad = 1 in {
- defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (_.VT (OpNode
- _.RC:$src1,
- (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
- T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
- defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (_.VT (OpNode
- _.RC:$src1,
- (Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2)))))>,
- T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
- }//let mayLoad = 1
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (X86VBroadcast
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
@@ -4326,16 +4561,15 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let mayLoad = 1 in
- def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, f64mem:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (OpNode _.RC:$src1,
- (_.VT (bitconvert
- (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
- IIC_SSE_MOV_LH>, EVEX_4V;
+ def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1,
+ (_.VT (bitconvert
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
}
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
@@ -4377,11 +4611,10 @@ let Predicates = [HasAVX512] in {
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
-let mayStore = 1 in {
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
(bc_v2f64 (v4f32 VR128X:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
@@ -4389,28 +4622,28 @@ def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)),
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (v2f64 VR128X:$src),
+ [(store (f64 (extractelt (v2f64 VR128X:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
-}
+
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
- def : Pat<(store (f64 (vector_extract
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
@@ -4442,21 +4675,19 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
AVX512FMA3Base;
- let mayLoad = 1 in {
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3),
- OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
- AVX512FMA3Base;
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ AVX512FMA3Base;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
- !strconcat("$src2, ${src3}", _.BroadcastStr ),
- (OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
- AVX512FMA3Base, EVEX_B;
- }
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B;
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4509,21 +4740,19 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
AVX512FMA3Base;
- let mayLoad = 1 in {
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3),
- OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
- AVX512FMA3Base;
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ AVX512FMA3Base;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src2,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
- }
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src2,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4575,21 +4804,19 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
AVX512FMA3Base;
- let mayLoad = 1 in {
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.MemOp:$src2),
- OpcodeStr, "$src2, $src3", "$src3, $src2",
- (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>,
- AVX512FMA3Base;
-
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.ScalarMemOp:$src2),
- OpcodeStr, "${src2}"##_.BroadcastStr##", $src3",
- "$src3, ${src2}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src1,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- _.RC:$src3))>, AVX512FMA3Base, EVEX_B;
- }
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.MemOp:$src2),
+ OpcodeStr, "$src2, $src3", "$src3, $src2",
+ (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src3, _.ScalarMemOp:$src2),
+ OpcodeStr, "${src2}"##_.BroadcastStr##", $src3",
+ "$src3, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode _.RC:$src1,
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ _.RC:$src3))>, AVX512FMA3Base, EVEX_B;
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4641,10 +4868,9 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base;
- let mayLoad = 1 in
- defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base;
+ defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
@@ -4657,12 +4883,11 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[RHS_r]>;
- let mayLoad = 1 in
- def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>;
+ def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [RHS_m]>;
}// isCodeGenOnly = 1
}
}// Constraints = "$src1 = $dst"
@@ -4672,9 +4897,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string SUFF> {
defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
- (_.VT (OpNode _.RC:$src2, _.RC:$src1,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))),
+ (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
+ (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -4683,10 +4908,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3))))>;
defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)),
- (_.VT (OpNode _.RC:$src2,
+ (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnd _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1)),
+ _.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -4695,10 +4920,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)),
- (_.VT (OpNode _.RC:$src1,
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnd _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
- _.RC:$src2)),
+ _.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
@@ -4725,6 +4950,53 @@ defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
//===----------------------------------------------------------------------===//
+// AVX-512 IFMA: Packed Multiply of Unsigned 52-bit Integers and Add the Low/High 52 Bits of the Product
+//===----------------------------------------------------------------------===//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ AVX512FMA3Base;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ AVX512FMA3Base;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (OpNode _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ AVX512FMA3Base, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasIFMA] in {
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasIFMA] in {
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+
+defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
+ avx512vl_i64_info>, VEX_W;
+defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
+ avx512vl_i64_info>, VEX_W;
+
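Per-lane semantics of the two instructions above, sketched in scalar C (helper names are illustrative; unsigned __int128 is a GCC/Clang extension used only to model the 104-bit product): the low 52 bits of each source are multiplied, and either the low or the high 52 bits of the product are added to the 64-bit accumulator.

#include <stdint.h>

static uint64_t madd52lo(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t m52 = (1ULL << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & m52) * (b & m52);
  return acc + (uint64_t)(p & m52);     /* VPMADD52LUQ */
}

static uint64_t madd52hi(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t m52 = (1ULL << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & m52) * (b & m52);
  return acc + (uint64_t)(p >> 52);     /* VPMADD52HUQ */
}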
+//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from sign integer to float/double
//===----------------------------------------------------------------------===//
@@ -4848,54 +5120,65 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from float/double to integer
//===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC,
- RegisterClass DstRC, Intrinsic Int,
- Operand memop, ComplexPattern mem_cpat, string asm> {
- let hasSideEffects = 0, Predicates = [HasAVX512] in {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
+ X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
+ let Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
- def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
- !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>,
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG;
+ def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC;
- let mayLoad = 1 in
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
- } // hasSideEffects = 0, Predicates = [HasAVX512]
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode
+ (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+ (i32 FROUND_CURRENT)))]>,
+ EVEX, VEX_LIG;
+ } // Predicates = [HasAVX512]
}
// Convert float/double to signed/unsigned int 32/64
-defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
- ssmem, sse_load_f32, "cvtss2si">,
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
+ X86cvts2si, "cvtss2si">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
- int_x86_sse_cvtss2si64,
- ssmem, sse_load_f32, "cvtss2si">,
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
+ X86cvts2si, "cvtss2si">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
- int_x86_avx512_cvtss2usi,
- ssmem, sse_load_f32, "cvtss2usi">,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
+ X86cvts2usi, "cvtss2usi">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
- int_x86_avx512_cvtss2usi64, ssmem,
- sse_load_f32, "cvtss2usi">, XS, VEX_W,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
+ X86cvts2usi, "cvtss2usi">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
-defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
- sdmem, sse_load_f64, "cvtsd2si">,
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
+ X86cvts2si, "cvtsd2si">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
- int_x86_sse2_cvtsd2si64,
- sdmem, sse_load_f64, "cvtsd2si">,
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
+ X86cvts2si, "cvtsd2si">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
- int_x86_avx512_cvtsd2usi,
- sdmem, sse_load_f64, "cvtsd2usi">,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
+ X86cvts2usi, "cvtsd2usi">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
- int_x86_avx512_cvtsd2usi64, sdmem,
- sse_load_f64, "cvtsd2usi">, XD, VEX_W,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
+ X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
+// The SSE versions of these instructions are disabled for AVX512.
+// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
+ (VCVTSS2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
+ (VCVTSS2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
+ (VCVTSD2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
+ (VCVTSD2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+} // HasAVX512
+
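A small illustration of the scalar conversions in this section, assuming only the standard SSE/AVX-512F intrinsics: vcvtss2si rounds according to the current rounding mode, vcvttss2si always truncates toward zero, and AVX-512 adds unsigned counterparts such as vcvtss2usi.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128 x = _mm_set_ss(2.75f);
  printf("cvtss2si(2.75)  = %d\n", _mm_cvtss_si32(x));  /* 3, round to nearest */
  printf("cvttss2si(2.75) = %d\n", _mm_cvttss_si32(x)); /* 2, truncate toward zero */
  printf("cvtss2usi(2.75) = %u\n", _mm_cvtss_u32(x));   /* 3, unsigned, AVX-512F */
  return 0;
}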
let isCodeGenOnly = 1 , Predicates = [HasAVX512] in {
defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
@@ -4910,14 +5193,14 @@ let isCodeGenOnly = 1 , Predicates = [HasAVX512] in {
int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
- defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x7B, GR32, VR128X,
int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
SSE_CVT_Scalar, 0>, XD, EVEX_4V;
} // isCodeGenOnly = 1, Predicates = [HasAVX512]
// Convert float/double to signed/unsigned int 32/64 with truncation
-multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
- X86VectorVTInfo _DstRC, SDNode OpNode,
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
SDNode OpNodeRnd>{
let Predicates = [HasAVX512] in {
def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
@@ -4926,56 +5209,56 @@ let Predicates = [HasAVX512] in {
def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[]>, EVEX, EVEX_B;
- def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src),
+ def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
EVEX;
- let isCodeGenOnly = 1,hasSideEffects = 0 in {
+ let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
(i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
- (i32 FROUND_NO_EXC)))]>,
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_NO_EXC)))]>,
EVEX,VEX_LIG , EVEX_B;
- let mayLoad = 1 in
- def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
(ins _SrcRC.MemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[]>, EVEX, VEX_LIG;
- } // isCodeGenOnly = 1, hasSideEffects = 0
+ } // isCodeGenOnly = 1
} //HasAVX512
}
-defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
- fp_to_sint,X86cvttss2IntRnd>,
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
+ fp_to_sint,X86cvtts2IntRnd>,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
- fp_to_sint,X86cvttss2IntRnd>,
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
+ fp_to_sint,X86cvtts2IntRnd>,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
- fp_to_sint,X86cvttsd2IntRnd>,
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
+ fp_to_sint,X86cvtts2IntRnd>,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
- fp_to_sint,X86cvttsd2IntRnd>,
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
+ fp_to_sint,X86cvtts2IntRnd>,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
- fp_to_uint,X86cvttss2UIntRnd>,
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
+ fp_to_uint,X86cvtts2UIntRnd>,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
- fp_to_uint,X86cvttss2UIntRnd>,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
+ fp_to_uint,X86cvtts2UIntRnd>,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
- fp_to_uint,X86cvttsd2UIntRnd>,
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint,X86cvtts2UIntRnd>,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
- fp_to_uint,X86cvttsd2UIntRnd>,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint,X86cvtts2UIntRnd>,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
@@ -4994,17 +5277,17 @@ let Predicates = [HasAVX512] in {
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode> {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode (_Src.VT _Src.RC:$src1),
- (_Src.VT _Src.RC:$src2)))>,
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode (_Src.VT _Src.RC:$src1),
- (_Src.VT (scalar_to_vector
- (_Src.ScalarLdFrag addr:$src2)))))>,
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_Src.VT (scalar_to_vector
+ (_Src.ScalarLdFrag addr:$src2)))))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
@@ -5012,9 +5295,9 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
(i32 FROUND_NO_EXC)))>,
EVEX_4V, VEX_LIG, EVEX_B;
@@ -5024,15 +5307,15 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
EVEX_B, EVEX_RC;
}
-multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86VectorVTInfo _src,
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
@@ -5042,22 +5325,22 @@ multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNo
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86VectorVTInfo _src,
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
}
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
X86froundRnd, f64x_info, f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
X86fpextRnd,f32x_info, f64x_info >;
-def : Pat<(f64 (fextend FR32X:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+def : Pat<(f64 (fextend FR32X:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
(COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fextend (loadf32 addr:$src))),
@@ -5069,12 +5352,12 @@ def : Pat<(f64 (extloadf32 addr:$src)),
Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
Requires<[HasAVX512, OptForSpeed]>;
-def : Pat<(f32 (fround FR64X:$src)),
- (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+def : Pat<(f32 (fround FR64X:$src)),
+ (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
(COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
@@ -5097,7 +5380,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _Src.MemOp:$src), OpcodeStr,
+ (ins _Src.ScalarMemOp:$src), OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
(X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
@@ -5405,59 +5688,59 @@ defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
X86VUintToFpRnd>, XD,
EVEX_CD8<32, CD8VF>;
-defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int,
- X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
+ X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int,
- X86cvtpd2IntRnd>, XD, VEX_W,
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
+ X86cvtp2IntRnd>, XD, VEX_W,
EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt,
- X86cvtps2UIntRnd>,
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt,
- X86cvtpd2UIntRnd>, VEX_W,
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
-defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int,
- X86cvtpd2IntRnd>, VEX_W,
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
+ X86cvtp2IntRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int,
- X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
+ X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
-defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt,
- X86cvtpd2UIntRnd>, VEX_W,
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt,
- X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
+ X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
- X86VFpToSlongRnd>, VEX_W,
+ X86VFpToSintRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint,
- X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86VFpToSintRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
- X86VFpToUlongRnd>, VEX_W,
+ X86VFpToUintRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint,
- X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86VFpToUintRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
- X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+ X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
- X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+ X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
- X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+ X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
- X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+ X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
@@ -5468,6 +5751,10 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+ (v8f64 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
@@ -5491,18 +5778,16 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag> {
defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
"vcvtph2ps", "$src", "$src",
(X86cvtph2ps (_src.VT _src.RC:$src),
(i32 FROUND_CURRENT))>, T8PD;
- let hasSideEffects = 0, mayLoad = 1 in {
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
- (i32 FROUND_CURRENT))>, T8PD;
- }
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
+ (i32 FROUND_CURRENT))>, T8PD;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
@@ -5515,44 +5800,45 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
let Predicates = [HasAVX512] in {
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
- avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
- defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
}
-multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop> {
defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
- (ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>, AVX512AIi8Base;
- let hasSideEffects = 0, mayStore = 1 in {
- def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2), (i32 FROUND_CURRENT) )),
- addr:$dst)]>;
- def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K;
- }
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT)),
+ NoItinerary, 0, X86select>, AVX512AIi8Base;
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2), (i32 FROUND_CURRENT) )),
+ addr:$dst)]>;
+ let hasSideEffects = 0, mayStore = 1 in
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
- (ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC)),
+ NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
}
let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
@@ -5571,7 +5857,7 @@ multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
string OpcodeStr> {
def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
- [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
+ [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
(i32 FROUND_NO_EXC)))],
IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
Sched<[WriteFAdd]>;
@@ -5623,18 +5909,16 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ let AddedComplexity = 20 , Predicates = [HasAVX512] in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
- let mayLoad = 1 in {
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
- }
}
}
@@ -5653,18 +5937,16 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
- let mayLoad = 1 in {
- defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
- defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
- EVEX, T8PD, EVEX_B;
- }
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, T8PD, EVEX_B;
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -5710,7 +5992,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
@@ -5724,7 +6006,7 @@ multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
EVEX_CD8<64, CD8VT1>, VEX_W;
}
-let hasSideEffects = 0, Predicates = [HasERI] in {
+let Predicates = [HasERI] in {
defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
}
@@ -5746,7 +6028,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(i32 FROUND_CURRENT))>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src), OpcodeStr,
+ (ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
@@ -5783,7 +6065,7 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
-let Predicates = [HasERI], hasSideEffects = 0 in {
+let Predicates = [HasERI] in {
defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX;
@@ -5805,19 +6087,17 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX;
- let mayLoad = 1 in {
- defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX;
+ defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (OpNode (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX;
- defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
- EVEX, EVEX_B;
- }
+ defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ (OpNode (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX, EVEX_B;
}
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
@@ -5862,14 +6142,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 FROUND_CURRENT))>;
- let mayLoad = 1 in
- defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
- "$src2, $src1", "$src1, $src2",
- (OpNodeRnd (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>;
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
@@ -5879,7 +6158,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(i32 imm:$rc))>,
EVEX_B, EVEX_RC;
- let isCodeGenOnly = 1 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
@@ -5940,9 +6219,9 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
- let mayLoad = 1 in
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
@@ -6022,7 +6301,7 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
DestInfo.KRCWM:$mask ,
SrcInfo.RC:$src1)>;
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 1, hasSideEffects = 0 in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
@@ -6032,7 +6311,7 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>, EVEX, EVEX_K;
- }//mayStore = 1
+ }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
}
multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
@@ -6223,23 +6502,21 @@ def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
}
multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
- X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{
-
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
+ X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
EVEX;
- let mayLoad = 1 in {
- defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
- (ins x86memop:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (LdFrag addr:$src))>,
- EVEX;
- }
+ defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins x86memop:$src), OpcodeStr ,"$src", "$src",
+ (DestInfo.VT (LdFrag addr:$src))>,
+ EVEX;
}
-multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
@@ -6257,7 +6534,8 @@ multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
@@ -6275,7 +6553,8 @@ multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
@@ -6293,7 +6572,8 @@ multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
@@ -6311,7 +6591,8 @@ multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
@@ -6329,7 +6610,8 @@ multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
@@ -6355,7 +6637,6 @@ defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
-
defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
@@ -6363,6 +6644,47 @@ defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+// EXTLOAD patterns, implemented using vpmovzx
+multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
+ X86VectorVTInfo From, PatFrag LdFrag> {
+ def : Pat<(To.VT (LdFrag addr:$src)),
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
+ def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
+ To.KRC:$mask, addr:$src)>;
+ def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
+ addr:$src)>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+ defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
+}
+let Predicates = [HasBWI] in {
+ defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
+}
+let Predicates = [HasVLX, HasAVX512] in {
+ defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
+ defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
+}
+let Predicates = [HasAVX512] in {
+ defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
+ defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
+ defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
+}
+
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -6383,34 +6705,34 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
- vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W;
+ vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
- vz64mem, mgatherv8i64>, EVEX_V512, VEX_W;
+ vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W;
+ vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
- vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W;
+ vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W;
+ vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+ vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
}
}
multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz32mem,
+ defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz64mem,
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vy32xmem, mgatherv8i32>, EVEX_V256;
+ vy256xmem, mgatherv8i32>, EVEX_V256;
defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vy64xmem, mgatherv4i64>, EVEX_V256;
+ vy128xmem, mgatherv4i64>, EVEX_V256;
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx32xmem, mgatherv4i32>, EVEX_V128;
+ vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
vx64xmem, mgatherv2i64>, EVEX_V128;
}
@@ -6440,34 +6762,34 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
- vy32xmem, mscatterv8i32>, EVEX_V512, VEX_W;
+ vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
- vz64mem, mscatterv8i64>, EVEX_V512, VEX_W;
+ vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vx32xmem, mscatterv4i32>, EVEX_V256, VEX_W;
+ vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
- vy64xmem, mscatterv4i64>, EVEX_V256, VEX_W;
+ vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx32xmem, mscatterv4i32>, EVEX_V128, VEX_W;
+ vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+ vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
}
}
multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz32mem,
+ defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz64mem,
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vy32xmem, mscatterv8i32>, EVEX_V256;
+ vy256xmem, mscatterv8i32>, EVEX_V256;
defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vy64xmem, mscatterv4i64>, EVEX_V256;
+ vy128xmem, mscatterv4i64>, EVEX_V256;
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx32xmem, mscatterv4i32>, EVEX_V128;
+ vx128xmem, mscatterv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
vx64xmem, mscatterv2i64>, EVEX_V128;
}
@@ -6489,79 +6811,57 @@ multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeSt
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
- VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
- VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
- VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
- VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
- VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
- VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
- VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
- VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
- VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
- VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
- VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
- VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
- VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+ VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
- VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
- VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+ VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
- VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(store VK1:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
- sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
-
-def : Pat<(store VK8:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
- sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
-
-def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
- (truncstore node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
-}]>;
-
-def : Pat<(truncstorei1 GR8:$src, addr:$dst),
- (MOV8mr addr:$dst, GR8:$src)>;
-
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
!strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
@@ -6593,22 +6893,38 @@ multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
-def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+}
+
+// Use the 512-bit version to implement 128/256-bit in the NoVLX case.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
+ X86VectorVTInfo _> {
+
+ def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
+ (_.KVT (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"Zrr")
+ (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+ _.RC:$src, _.SubRegIdx)),
+ _.KRC))>;
}
multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
-let Predicates = [prd] in
- defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
- EVEX_V512;
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
+ EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
- EVEX_V256;
+ EVEX_V256;
defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
- EVEX_V128;
+ EVEX_V128;
+ }
+ let Predicates = [prd, NoVLX] in {
+ defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
+ defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
}
}
@@ -6631,7 +6947,7 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
- let mayStore = 1 in {
+ let mayStore = 1, hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
@@ -6644,7 +6960,6 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
(_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)),
addr:$dst)]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
- }
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -6673,7 +6988,6 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
- let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand (_.VT (bitconvert
@@ -6708,25 +7022,23 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
X86VectorVTInfo _>{
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, $src1", "$src2, $src2",
- (OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
(i32 FROUND_CURRENT))>;
- defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr##", $src2",
- (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>, EVEX_B;
- }
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr##", $src2",
+ (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, EVEX_B;
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
@@ -6769,23 +7081,21 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
- defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>, EVEX_B;
- }
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>, EVEX_B;
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -6799,14 +7109,13 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
(i8 imm:$src3)))>;
- let mayLoad = 1 in
- defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
- (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
- (SrcInfo.VT (bitconvert
- (SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3)))>;
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT (bitconvert
+ (SrcInfo.LdFrag addr:$src2))),
+ (i8 imm:$src3)))>;
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -6816,14 +7125,13 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>:
avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
- let mayLoad = 1 in
- defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ (i8 imm:$src3))>, EVEX_B;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -6839,22 +7147,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 imm:$src3),
+ (i32 FROUND_CURRENT))>;
- let isAsmParserOnly = 1 in {
- defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- []>;
- }
+ let isAsmParserOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
+ defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ []>;
}
}
@@ -6940,19 +7246,6 @@ multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
}
-defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd",
- avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VFIXUPIMMPS : avx512_common_fp_sae_packed_imm<"vfixupimmps",
- avx512vl_f32_info, 0x54, X86VFixupimm, HasAVX512>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-
-defm VFIXUPIMMSD: avx512_common_fp_sae_scalar_imm<"vfixupimmsd", f64x_info,
- 0x55, X86VFixupimm, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info,
- 0x55, X86VFixupimm, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
@@ -7043,7 +7336,7 @@ defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
-multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{
+multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{
let Predicates = p in
def NAME#_.VTName#rri:
Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
@@ -7051,18 +7344,18 @@ multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{
_.RC:$src1, _.RC:$src2, imm:$imm)>;
}
-multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>:
- avx512_vpalign_lowering<_.info512, [HasBWI]>,
- avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>,
- avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>;
+multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>:
+ avx512_vpalignr_lowering<_.info512, [HasBWI]>,
+ avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>,
+ avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>;
-defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
avx512vl_i8_info, avx512vl_i8_info>,
- avx512_vpalign_lowering_common<avx512vl_i16_info>,
- avx512_vpalign_lowering_common<avx512vl_i32_info>,
- avx512_vpalign_lowering_common<avx512vl_f32_info>,
- avx512_vpalign_lowering_common<avx512vl_i64_info>,
- avx512_vpalign_lowering_common<avx512vl_f64_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i16_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i32_info>,
+ avx512_vpalignr_lowering_common<avx512vl_f32_info>,
+ avx512_vpalignr_lowering_common<avx512vl_i64_info>,
+ avx512_vpalignr_lowering_common<avx512vl_f64_info>,
EVEX_CD8<8, CD8VF>;
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
@@ -7075,25 +7368,23 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1", "$src1",
(_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
- let mayLoad = 1 in
- defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src1), OpcodeStr,
- "$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
- EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1), OpcodeStr,
+ "$src1", "$src1",
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> :
avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
- let mayLoad = 1 in
- defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src1), OpcodeStr,
- "${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr,
- (_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))))>,
- EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1), OpcodeStr,
+ "${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr,
+ (_.VT (OpNode (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))))>,
+ EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -7185,12 +7476,11 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
- let mayLoad = 1 in
- defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src)))))>,
- EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>;
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -7221,8 +7511,8 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
-defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>;
-defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>;
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512>;
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
SSE_INTALU_ITINS_P, HasBWI>;
@@ -7248,14 +7538,13 @@ defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let mayStore = 1 in
- def mr : AVX512Ii8<opc, MRMDestMem, (outs),
- (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
- imm:$src2)))),
- addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+ def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
+ imm:$src2)))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
@@ -7280,6 +7569,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
EVEX, PD;
+ let hasSideEffects = 0 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -7299,13 +7589,12 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
EVEX, TAPD;
- let mayStore = 1 in
- def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
- (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (extractelt (_.VT _.RC:$src1),
- imm:$src2),addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+ def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (_.VT _.RC:$src1),
+ imm:$src2),addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
}
}
@@ -7380,33 +7669,33 @@ multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
- let mayLoad = 1 in
- def rm : AVX512<opc, MRMm,
- (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode
- (_.LdFrag addr:$src1), (i8 imm:$src2))))]>;
+ def rm : AVX512<opc, MRMm,
+ (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i8 imm:$src2))))]>;
}
-multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr, Predicate prd>{
let Predicates = [prd] in
- defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v8i64_info>, EVEX_V512;
+ defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v4i64x_info>, EVEX_V256;
- defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v2i64x_info>, EVEX_V128;
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v16i8x_info>, EVEX_V128;
}
}
-defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
HasBWI>, AVX512PDIi8Base, EVEX_4V;
-defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
HasBWI>, AVX512PDIi8Base, EVEX_4V;
-multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86VectorVTInfo _dst,
X86VectorVTInfo _src>{
def rr : AVX512BI<opc, MRMSrcReg,
@@ -7415,17 +7704,16 @@ multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT _src.RC:$src2))))]>;
- let mayLoad = 1 in
- def rm : AVX512BI<opc, MRMSrcMem,
- (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _dst.RC:$dst,(_dst.VT
- (OpNode (_src.VT _src.RC:$src1),
- (_src.VT (bitconvert
- (_src.LdFrag addr:$src2))))))]>;
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT (bitconvert
+ (_src.LdFrag addr:$src2))))))]>;
}
-multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
@@ -7438,7 +7726,7 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
}
}
-defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
HasBWI>, EVEX_4V;
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -7446,30 +7734,28 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Constraints = "$src1 = $dst" in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
- OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3",
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
(i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
- OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4))>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
- defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
- OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr##", $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4))>, EVEX_B,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
- }
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (bitconvert (_.LdFrag addr:$src3))),
+ (i8 imm:$src4))>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i8 imm:$src4))>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}// Constraints = "$src1 = $dst"
}
@@ -7485,3 +7771,109 @@ multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - FixupImm
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst" in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT _.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT (bitconvert (_.LdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ } // Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+let Constraints = "$src1 = $dst" in {
+ defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.IntVT _.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+ }
+}
+
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
+ let Constraints = "$src1 = $dst" , Predicates = [HasAVX512] in {
+ defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+
+ defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ "$src2, $src3, {sae}, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+ defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+ OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT (scalar_to_vector
+ (_src3VT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4),
+ (i32 FROUND_CURRENT))>;
+ }
+}
+
+multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+ avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>,
+ AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ }
+}
+
+defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ f32x_info, v4i32x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+ f64x_info, v2i64x_info>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index 787f15bc628e..bcea6fa80350 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -83,6 +83,34 @@ struct X86AddressMode {
}
};
+/// Compute the addressing mode from a machine instruction starting with the
+/// given operand.
+static inline X86AddressMode getAddressFromInstr(MachineInstr *MI,
+ unsigned Operand) {
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(Operand);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(Operand + 1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(Operand + 2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(Operand + 3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ return AM;
+}
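// Illustrative sketch only (not part of the patch): one way the new helper can
// be combined with addFullAddress(), defined later in this header, when
// re-emitting a pseudo's memory operands on a replacement instruction.
// `NewOpc` and `MemOpIdx` are hypothetical placeholders.
static inline void rewriteMemOperand(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const TargetInstrInfo *TII,
                                     unsigned NewOpc, unsigned MemOpIdx) {
  MachineInstr *MI = &*MBBI;
  // Recover the base (register or frame index), scale, index register and
  // displacement/global from the pseudo's memory operands.
  X86AddressMode AM = getAddressFromInstr(MI, MemOpIdx);
  // Re-emit the full address on the new instruction.
  addFullAddress(BuildMI(MBB, MBBI, MI->getDebugLoc(), TII->get(NewOpc)), AM);
}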
+
/// addDirectMem - This function is used to add a direct memory reference to the
/// current instruction -- that is, a dereference of an address in a register,
/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
@@ -151,7 +179,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
MachineFunction &MF = *MI->getParent()->getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
const MCInstrDesc &MCID = MI->getDesc();
- unsigned Flags = 0;
+ auto Flags = MachineMemOperand::MONone;
if (MCID.mayLoad())
Flags |= MachineMemOperand::MOLoad;
if (MCID.mayStore())
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index c709c8aca9fa..925f4efb5aa9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -99,18 +99,6 @@ def VAARG_64 : I<0, Pseudo,
(X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
(implicit EFLAGS)]>;
-// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
-// targets. These calls are needed to probe the stack when allocating more than
-// 4k bytes in one go. Touching the stack at 4K increments is necessary to
-// ensure that the guard pages used by the OS virtual memory manager are
-// allocated in correct sequence.
-// The main point of having separate instruction are extra unmodelled effects
-// (compared to ordinary calls) like stack pointer change.
-
-let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
- def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
- "# dynamic stack allocation",
- [(X86WinAlloca)]>;
// When using segmented stacks these are lowered into instructions which first
// check if the current stacklet has enough free memory. If it does, memory is
@@ -132,6 +120,27 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
Requires<[In64BitMode]>;
}
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls), such as the stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR32:$size)]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+ "# dynamic stack allocation",
+ [(X86WinAlloca GR64:$size)]>,
+ Requires<[In64BitMode]>;
+
+
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
//
@@ -250,7 +259,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1 in
+ isPseudo = 1, AddedComplexity = 20 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
@@ -263,7 +272,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
}
let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
- AddedComplexity = 1 in {
+ AddedComplexity = 15 in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@@ -278,6 +287,17 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+// FIXME: Add itinerary class and Schedule.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+ [(set GR32:$dst, i32immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+ [(set GR64:$dst, i64immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
@@ -479,10 +499,13 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
[(X86TLSCall addr:$sym)]>,
Requires<[Not64BitMode]>;
-// For x86_64, the address of the thunk is passed in %rdi, on return
-// the address of the variable is in %rax. All other registers are preserved.
+// For x86_64, the address of the thunk is passed in %rdi, but the
+// pseudo directly uses the symbol, so we do not add an implicit use of
+// %rdi. The lowering will do the right thing with RDI.
+// On return the address of the variable is in %rax. All other
+// registers are preserved.
let Defs = [RAX, EFLAGS],
- Uses = [RSP, RDI],
+ Uses = [RSP],
usesCustomInserter = 1 in
def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLSCall_64",
@@ -568,7 +591,7 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
// ImmOpc8 corresponds to the mi8 version of the instruction
// ImmMod corresponds to the instruction format of the mi and mi8 versions
multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
- Format ImmMod, string mnemonic> {
+ Format ImmMod, SDPatternOperator Op, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
@@ -577,106 +600,124 @@ def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
+ IIC_ALU_NONMEM>, LOCK;
+
def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
+ IIC_ALU_NONMEM>, OpSize16, LOCK;
+
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
+ IIC_ALU_NONMEM>, OpSize32, LOCK;
+
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
+ IIC_ALU_NONMEM>, LOCK;
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
!strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
+ IIC_ALU_MEM>, LOCK;
def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
+ IIC_ALU_MEM>, OpSize16, LOCK;
def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
+ IIC_ALU_MEM>, OpSize32, LOCK;
def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
+ IIC_ALU_MEM>, LOCK;
def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
+ IIC_ALU_MEM>, OpSize16, LOCK;
+
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
+ IIC_ALU_MEM>, OpSize32, LOCK;
+
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
!strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
+ IIC_ALU_MEM>, LOCK;
}
}
-defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
-defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
-defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
-defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
-defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
+defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
-// Optimized codegen when the non-memory output is not used.
multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
- string mnemonic> {
+ int Increment, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
- SchedRW = [WriteALULd, WriteRMW] in {
-
+ SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in {
def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
!strconcat(mnemonic, "{b}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))],
+ IIC_UNARY_MEM>, LOCK;
def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
!strconcat(mnemonic, "{w}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize16, LOCK;
+ [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))],
+ IIC_UNARY_MEM>, OpSize16, LOCK;
def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
!strconcat(mnemonic, "{l}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize32, LOCK;
+ [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))],
+ IIC_UNARY_MEM>, OpSize32, LOCK;
def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
!strconcat(mnemonic, "{q}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
+ [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))],
+ IIC_UNARY_MEM>, LOCK;
}
}
-defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
-defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, 1, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;
// Atomic compare and swap.
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
@@ -719,6 +760,38 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
IIC_CMPX_LOCK_8B>;
}
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such a situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register allocator will not fix the clobbering of
+// RBX that happens when setting the arguments for the instruction.
+//
+// Unlike the actual related instruction, we mark that this one
+// defines EBX (instead of using EBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding EBX is ebx_input.
+//
+// The additional argument, $ebx_save, is a temporary register used to
+// save the value of RBX across the actual instruction.
+//
+// To make sure the register assigned to $ebx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
+ Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+def LCMPXCHG8B_SAVE_EBX :
+ I<0, Pseudo, (outs GR32:$dst),
+ (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
+ !strconcat("cmpxchg8b", "\t$ptr"),
+ [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
+ GR32:$ebx_save))],
+ IIC_CMPX_LOCK_8B>;
+}
+
+
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
@@ -726,6 +799,20 @@ defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
IIC_CMPX_LOCK_16B>, REX_W;
}
+// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
+ usesCustomInserter = 1 in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
+ !strconcat("cmpxchg16b", "\t$ptr"),
+ [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
+ GR64:$rbx_save))],
+ IIC_CMPX_LOCK_16B>;
+}
+
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
@@ -926,6 +1013,18 @@ def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
// DAG Pattern Matching Rules
//===----------------------------------------------------------------------===//
+// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
+// binary size compared to a regular MOV, but it introduces an unnecessary
+// load, so is not suitable for regular or optsize functions.
+let Predicates = [OptForMinSize] in {
+def : Pat<(store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+}
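// Rough size comparison motivating the patterns above (32-bit operand size,
// plain [eax] base; byte counts are the standard ModRM encodings and are shown
// here only as an illustration, they are not emitted by this patch):
//   mov dword ptr [eax], 0      ; C7 00 00 00 00 00  (6 bytes)
//   and dword ptr [eax], 0      ; 83 20 00           (3 bytes)
//   or  dword ptr [eax], -1     ; 83 08 FF           (3 bytes)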
+
// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
@@ -994,22 +1093,22 @@ def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
// for MOV64mi32 should handle this sort of thing.
def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tconstpool:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tjumptable:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tglobaladdr:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
(MOV64mi32 addr:$dst, texternalsym:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
(MOV64mi32 addr:$dst, mcsym:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tblockaddress:$src)>,
- Requires<[NearData, IsStatic]>;
+ Requires<[NearData, IsNotPIC]>;
def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
@@ -1139,12 +1238,13 @@ defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
// zextload bool -> zextload byte
-def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>;
-def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>;
-def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>;
+// An i1 is stored in one byte in zero-extended form;
+// the upper bits are cleaned up before the store.
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(zextloadi64i1 addr:$src),
- (SUBREG_TO_REG (i64 0),
- (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>;
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
@@ -1305,7 +1405,7 @@ def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
-def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
// To avoid needing to materialize an immediate in a register, use a 32-bit and
@@ -1450,6 +1550,10 @@ def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
sub_8bit_hi)>,
Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
sub_8bit_hi)>,
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 8c351a51c460..bb5f9117f032 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -22,21 +22,21 @@
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ "ret{l}", [], IIC_RET>, OpSize32,
Requires<[Not64BitMode]>;
def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ "ret{q}", [], IIC_RET>, OpSize32,
Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize16;
def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{l}\t$amt",
- [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ [], IIC_RET_IMM>, OpSize32,
Requires<[Not64BitMode]>;
def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{q}\t$amt",
- [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ [], IIC_RET_IMM>, OpSize32,
Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt",
@@ -64,8 +64,8 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
IIC_IRET>, Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
- def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>;
-
+ def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
+ def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
}
// Unconditional branches.
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 03ae21125b0e..078dab41502a 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -326,7 +326,7 @@ def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
-def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
@@ -334,15 +334,15 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
-def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
-def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
-def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">;
+def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
-def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
// Floating point cmovs.
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index e2fa295c0230..5183adc834b1 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -845,7 +845,7 @@ class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA]>;
+ VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 643286324e25..ea54f049ec7a 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -35,7 +35,7 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
SDTCisFP<1>, SDTCisVT<3, i8>,
SDTCisVec<1>]>;
def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
@@ -60,9 +60,8 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>;
-def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
+def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
@@ -72,7 +71,6 @@ def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>;
def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
-//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
SDTCisVT<1, v4i32>]>>;
@@ -95,9 +93,9 @@ def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
def X86andnp : SDNode<"X86ISD::ANDNP",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
-def X86psign : SDNode<"X86ISD::PSIGN",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+def X86multishift : SDNode<"X86ISD::MULTISHIFT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
SDTCisPtrTy<2>]>>;
@@ -137,46 +135,39 @@ def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>;
def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>;
def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
-def X86trunc : SDNode<"X86ISD::TRUNC",
- SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisOpSmallerThanOp<0, 1>]>>;
def X86vfpext : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>,
- SDTCisOpSmallerThanOp<1, 0>]>>;
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0, 1>]>>;
def X86vfpround: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>,
- SDTCisOpSmallerThanOp<0, 1>]>>;
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisSameSizeAs<0, 1>]>>;
def X86fround: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
- SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f64>,
- SDTCisOpSmallerThanOp<0, 1>]>>;
+ SDTCisSameSizeAs<0, 2>]>>;
def X86froundRnd: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
- SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f64>,
- SDTCisOpSmallerThanOp<0, 1>,
- SDTCisInt<3>]>>;
+ SDTCisSameSizeAs<0, 2>,
+ SDTCisVT<3, i32>]>>;
def X86fpext : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
- SDTCVecEltisVT<0, f64>,
- SDTCVecEltisVT<1, f32>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f32>,
- SDTCisOpSmallerThanOp<1, 0>]>>;
+ SDTCisSameSizeAs<0, 2>]>>;
def X86fpextRnd : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
- SDTCVecEltisVT<0, f64>,
- SDTCVecEltisVT<1, f32>,
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f32>,
- SDTCisOpSmallerThanOp<1, 0>,
- SDTCisInt<3>]>>;
+ SDTCisSameSizeAs<0, 2>,
+ SDTCisVT<3, i32>]>>;
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
@@ -221,6 +212,8 @@ def X86vsra : SDNode<"X86ISD::VSRA",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>]>>;
+def X86vsrav : SDNode<"X86ISD::VSRAV" , SDTIntShiftOp>;
+
def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
@@ -250,10 +243,24 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU",
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisVT<3, i8>]>>;
+def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameSizeAs<0,3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i8>]>>;
+def X86vpperm : SDNode<"X86ISD::VPPERM",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
+
+def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
+ SDTCisSameNumEltsAs<0, 1>]>;
+
def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>;
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>;
@@ -264,15 +271,22 @@ def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
-def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
- SDTCisVec<1>, SDTCisSameAs<2, 1>,
- SDTCVecEltisVT<0, i1>,
- SDTCisSameNumEltsAs<0, 1>]>>;
-def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
- SDTCisVec<1>, SDTCisSameAs<2, 1>,
- SDTCVecEltisVT<0, i1>,
- SDTCisSameNumEltsAs<0, 1>]>>;
-def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
+def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
+def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
+
+def X86movmsk : SDNode<"X86ISD::MOVMSK",
+ SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
+
+def X86select : SDNode<"X86ISD::SELECT",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<1, i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>]>>;
+
+def X86selects : SDNode<"X86ISD::SELECT",
+ SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>]>>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
@@ -308,9 +322,16 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>;
+ SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisInt<3>,
+ SDTCisSameSizeAs<0, 3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisInt<2>, SDTCisInt<3>]>;
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -324,21 +345,16 @@ def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVT<4, i8>]>;
def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
- SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>;
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
- SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]>;
+ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
- SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>;
-def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisVT<2, i32>]>;
-def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisVT<3, i32>]>;
-def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
+ SDTCisVT<4, i32>]>;
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
@@ -405,7 +421,8 @@ def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
-def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>;
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
+def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
@@ -422,10 +439,6 @@ def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSubVecOfVec<1, 0>]>, []>;
-// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2.
-def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST",
- SDTypeProfile<1, 1, [SDTCisVec<0>,
- SDTCisSameAs<0,1>]>, []>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
@@ -446,11 +459,12 @@ def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
+def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
-def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", SDTFPBinOpRound>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
-def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>;
+def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", SDTFPBinOpRound>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
@@ -466,15 +480,18 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>;
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
-def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
-def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;
-def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>;
-def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>;
-def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>;
-def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>;
-def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>;
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
+def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", SDTFPBinOpRound>;
+def X86rcp28s : SDNode<"X86ISD::RCP28", SDTFPBinOpRound>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALE", SDTFPBinOpImmRound>;
+def X86Reduces : SDNode<"X86ISD::VREDUCE", SDTFPBinOpImmRound>;
+def X86GetMants : SDNode<"X86ISD::VGETMANT", SDTFPBinOpImmRound>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -496,90 +513,62 @@ def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>,
SDTCisVT<3, i32>]>;
-def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+ SDTCisInt<0>, SDTCisFP<1>]>;
-def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
-def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>,
- SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>;
def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+ SDTCisInt<0>, SDTCisFP<1>,
+ SDTCisVT<2, i32>]>;
def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
- SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>;
+ SDTCisVec<1>, SDTCisVT<2, i32>]>;
def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCVecEltisVT<1, i32>,
- SDTCisInt<2>]>;
-def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCVecEltisVT<1, i64>,
- SDTCisInt<2>]>;
-
-def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<1>, SDTCVecEltisVT<0, i32>,
- SDTCisInt<2>]>;
-def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<1>, SDTCVecEltisVT<0, i64>,
- SDTCisInt<2>]>;
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
// Scalar
def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
-def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>;
-def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>;
-def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>;
-def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>;
+def X86cvtts2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>;
+def X86cvtts2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+
+def X86cvts2si : SDNode<"X86ISD::SCALAR_FP_TO_SINT_RND", SDTSFloatToIntRnd>;
+def X86cvts2usi : SDNode<"X86ISD::SCALAR_FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+
// Vector with rounding mode
// cvtt fp-to-int stuff
-def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>;
-def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>;
-def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>;
-def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>;
+def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTFloatToIntRnd>;
+def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTFloatToIntRnd>;
def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>;
def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>;
-def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>;
-def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>;
// cvt fp-to-int stuff
-def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>;
-def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>;
-def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>;
-def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>;
+def X86cvtp2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>;
+def X86cvtp2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>;
// Vector without rounding mode
-def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>;
-def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
-def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>;
-def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>;
+def X86cvtp2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>;
+def X86cvtp2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCVecEltisVT<0, f32>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>,
- SDTCisFP<0>,
SDTCisVT<2, i32>]> >;
def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCVecEltisVT<0, i16>,
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
SDTCVecEltisVT<1, f32>,
- SDTCisFP<1>, SDTCisVT<2, i32>,
+ SDTCisVT<2, i32>,
SDTCisVT<3, i32>]> >;
def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>,
- SDTCVecEltisVT<0, f64>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
SDTCVecEltisVT<1, f32>,
SDTCisOpSmallerThanOp<1, 0>,
SDTCisVT<2, i32>]>>;
def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>,
- SDTCVecEltisVT<0, f32>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
@@ -602,13 +591,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
def sdmem : Operand<v2f64> {
let PrintMethod = "printf64mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem64AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -674,11 +663,6 @@ def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
-// Like 'X86vzload', but always requires 128-bit vector alignment.
-def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
- return cast<MemSDNode>(N)->getAlignment() >= 16;
-}]>;
-
// Like 'load', but always requires 256-bit vector alignment.
def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 32;
@@ -982,9 +966,9 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
return isa<MaskedLoadSDNode>(N);
}]>;
-// masked store fragments.
+// Masked store fragments.
// X86mstore can't be implemented in core DAG files because some targets
-// doesn't support vector type ( llvm-tblgen will fail)
+// do not support vector types (llvm-tblgen will fail).
def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_store node:$src1, node:$src2, node:$src3), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 246804e34289..1672b3855b79 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -18,11 +18,13 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,7 +38,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
-#include <limits>
using namespace llvm;
@@ -57,6 +58,17 @@ static cl::opt<bool>
ReMatPICStubLoad("remat-pic-stub-load",
cl::desc("Re-materialize load from stub in PIC mode"),
cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+PartialRegUpdateClearance("partial-reg-update-clearance",
+ cl::desc("Clearance between two register writes "
+ "for inserting XOR to avoid partial "
+ "register update"),
+ cl::init(64), cl::Hidden);
+static cl::opt<unsigned>
+UndefRegClearance("undef-reg-clearance",
+ cl::desc("How many idle instructions we would like before "
+ "certain undef register reads"),
+ cl::init(64), cl::Hidden);
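// Usage note (illustrative, not part of the patch): both knobs are hidden
// cl::opts, so they can still be overridden on the command line of tools that
// link the X86 backend, e.g. with hypothetical values:
//   llc -mtriple=x86_64-unknown-linux-gnu \
//       -partial-reg-update-clearance=32 -undef-reg-clearance=128 foo.ll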
enum {
// Select which memory operand is being unfolded.
@@ -105,7 +117,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
- X86::CATCHRET),
+ X86::CATCHRET,
+ (STI.is64Bit() ? X86::RETQ : X86::RETL)),
Subtarget(STI), RI(STI.getTargetTriple()) {
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
@@ -804,50 +817,54 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::TZMSK64rr, X86::TZMSK64rm, 0 },
// AVX-512 foldable instructions
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
- { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
- { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
- { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
+ { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
+ { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
+ { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
+ { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
+ { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
+ { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
// AVX-512 foldable instructions (256-bit versions)
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
+ { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
+ { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
+ { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
+ { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
+ { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
+ { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ // AVX-512 foldable instructions (128-bit versions)
+ { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
+ { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
+ { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
+ { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
+ { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
+ { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
{ X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
@@ -998,6 +1015,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
{ X86::MINSSrr, X86::MINSSrm, 0 },
{ X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
@@ -1023,7 +1041,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
{ X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
{ X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
- { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },
+ { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
{ X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
@@ -1073,9 +1091,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PORrr, X86::PORrm, TB_ALIGN_16 },
{ X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
{ X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
- { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
- { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
- { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
+ { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
+ { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
{ X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
{ X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
@@ -1298,6 +1316,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
+ { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
@@ -1319,7 +1338,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
{ X86::VPADDWrr, X86::VPADDWrm, 0 },
- { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
+ { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
{ X86::VPANDNrr, X86::VPANDNrm, 0 },
{ X86::VPANDrr, X86::VPANDrm, 0 },
{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
@@ -1371,9 +1390,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPORrr, X86::VPORrm, 0 },
{ X86::VPSADBWrr, X86::VPSADBWrm, 0 },
{ X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
- { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
- { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
- { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
+ { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
+ { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
{ X86::VPSLLDrr, X86::VPSLLDrm, 0 },
{ X86::VPSLLQrr, X86::VPSLLQrm, 0 },
{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },
@@ -1475,7 +1494,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
{ X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
{ X86::VPADDWYrr, X86::VPADDWYrm, 0 },
- { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 },
+ { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
{ X86::VPANDNYrr, X86::VPANDNYrm, 0 },
{ X86::VPANDYrr, X86::VPANDYrm, 0 },
{ X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
@@ -1526,9 +1545,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPORYrr, X86::VPORYrm, 0 },
{ X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
{ X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
- { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
- { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
- { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
+ { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
+ { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
{ X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
{ X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
{ X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
@@ -1540,6 +1559,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 },
+ { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
@@ -1600,8 +1621,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
// XOP foldable instructions
- { X86::VPCMOVrr, X86::VPCMOVmr, 0 },
- { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
+ { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
@@ -1626,7 +1647,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
{ X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
{ X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
- { X86::VPPERMrr, X86::VPPERMmr, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
{ X86::VPROTBrr, X86::VPROTBrm, 0 },
{ X86::VPROTDrr, X86::VPROTDrm, 0 },
{ X86::VPROTQrr, X86::VPROTQrm, 0 },
@@ -1659,12 +1680,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions
{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 },
+ { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 },
{ X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
{ X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 },
{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 },
{ X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 },
+ { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 },
{ X86::VMINPSZrr, X86::VMINPSZrm, 0 },
{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
{ X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
@@ -1902,13 +1939,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
// XOP foldable instructions
- { X86::VPCMOVrr, X86::VPCMOVrm, 0 },
- { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 },
+ { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
+ { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
- { X86::VPPERMrr, X86::VPPERMrm, 0 },
+ { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
// AVX-512 VPERMI instructions with 3 source operands.
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
@@ -2025,7 +2062,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
void
X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
- unsigned RegOp, unsigned MemOp, unsigned Flags) {
+ uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
if ((Flags & TB_NO_FORWARD) == 0) {
assert(!R2MTable.count(RegOp) && "Duplicate entry!");
R2MTable[RegOp] = std::make_pair(MemOp, Flags);
@@ -2085,19 +2122,19 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
return false;
}
-int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
- const MachineFunction *MF = MI->getParent()->getParent();
+int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
- if (MI->getOpcode() == getCallFrameSetupOpcode() ||
- MI->getOpcode() == getCallFrameDestroyOpcode()) {
+ if (MI.getOpcode() == getCallFrameSetupOpcode() ||
+ MI.getOpcode() == getCallFrameDestroyOpcode()) {
unsigned StackAlign = TFI->getStackAlignment();
- int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
- StackAlign;
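+    // Round the call-frame size up to the stack alignment before applying it.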
+ int SPAdj =
+ (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
- SPAdj -= MI->getOperand(1).getImm();
+ SPAdj -= MI.getOperand(1).getImm();
- if (MI->getOpcode() == getCallFrameSetupOpcode())
+ if (MI.getOpcode() == getCallFrameSetupOpcode())
return SPAdj;
else
return -SPAdj;
@@ -2106,8 +2143,8 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
// To know whether a call adjusts the stack, we need information
// that is bound to the following ADJCALLSTACKUP pseudo.
// Look for the next ADJCALLSTACKUP that follows the call.
- if (MI->isCall()) {
- const MachineBasicBlock* MBB = MI->getParent();
+ if (MI.isCall()) {
+ const MachineBasicBlock *MBB = MI.getParent();
auto I = ++MachineBasicBlock::const_iterator(MI);
for (auto E = MBB->end(); I != E; ++I) {
if (I->getOpcode() == getCallFrameDestroyOpcode() ||
@@ -2125,7 +2162,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
// Currently handle only PUSHes we can reasonably expect to see
// in call sequences
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return 0;
case X86::PUSH32i8:
@@ -2134,21 +2171,27 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
case X86::PUSH32rmr:
case X86::PUSHi32:
return 4;
+ case X86::PUSH64i8:
+ case X86::PUSH64r:
+ case X86::PUSH64rmm:
+ case X86::PUSH64rmr:
+ case X86::PUSH64i32:
+ return 8;
}
}
/// Return true and the FrameIndex if the specified
/// operand and following operands form a reference to the stack frame.
-bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const {
- if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
- MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
- MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
- MI->getOperand(Op+X86::AddrDisp).isImm() &&
- MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
- MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
- MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
- FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
+ if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
+ MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(Op + X86::AddrDisp).isImm() &&
+ MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
+ MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
+ MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
+ FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
return true;
}
return false;
@@ -2166,13 +2209,19 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::MOVSSrm:
case X86::MOVSDrm:
case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
case X86::MOVDQArm:
+ case X86::MOVDQUrm:
case X86::VMOVSSrm:
case X86::VMOVSDrm:
case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
@@ -2181,8 +2230,42 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
case X86::VMOVAPSZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
return true;
}
}
@@ -2198,40 +2281,80 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::MOVSSmr:
case X86::MOVSDmr:
case X86::MOVAPSmr:
+ case X86::MOVUPSmr:
case X86::MOVAPDmr:
+ case X86::MOVUPDmr:
case X86::MOVDQAmr:
+ case X86::MOVDQUmr:
case X86::VMOVSSmr:
case X86::VMOVSDmr:
case X86::VMOVAPSmr:
+ case X86::VMOVUPSmr:
case X86::VMOVAPDmr:
+ case X86::VMOVUPDmr:
case X86::VMOVDQAmr:
+ case X86::VMOVDQUmr:
case X86::VMOVUPSYmr:
case X86::VMOVAPSYmr:
case X86::VMOVUPDYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQUYmr:
case X86::VMOVDQAYmr:
+ case X86::VMOVSSZmr:
+ case X86::VMOVSDZmr:
case X86::VMOVUPSZmr:
+ case X86::VMOVUPSZ128mr:
+ case X86::VMOVUPSZ256mr:
case X86::VMOVAPSZmr:
+ case X86::VMOVAPSZ128mr:
+ case X86::VMOVAPSZ256mr:
+ case X86::VMOVUPDZmr:
+ case X86::VMOVUPDZ128mr:
+ case X86::VMOVUPDZ256mr:
+ case X86::VMOVAPDZmr:
+ case X86::VMOVAPDZ128mr:
+ case X86::VMOVAPDZ256mr:
+ case X86::VMOVDQA32Zmr:
+ case X86::VMOVDQA32Z128mr:
+ case X86::VMOVDQA32Z256mr:
+ case X86::VMOVDQU32Zmr:
+ case X86::VMOVDQU32Z128mr:
+ case X86::VMOVDQU32Z256mr:
+ case X86::VMOVDQA64Zmr:
+ case X86::VMOVDQA64Z128mr:
+ case X86::VMOVDQA64Z256mr:
+ case X86::VMOVDQU64Zmr:
+ case X86::VMOVDQU64Z128mr:
+ case X86::VMOVDQU64Z256mr:
+ case X86::VMOVDQU8Zmr:
+ case X86::VMOVDQU8Z128mr:
+ case X86::VMOVDQU8Z256mr:
+ case X86::VMOVDQU16Zmr:
+ case X86::VMOVDQU16Z128mr:
+ case X86::VMOVDQU16Z256mr:
case X86::MMX_MOVD64mr:
case X86::MMX_MOVQ64mr:
case X86::MMX_MOVNTQmr:
+ case X86::KMOVBmk:
+ case X86::KMOVWmk:
+ case X86::KMOVDmk:
+ case X86::KMOVQmk:
return true;
}
return false;
}
-unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI->getOpcode()))
- if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
- return MI->getOperand(0).getReg();
+ if (isFrameLoadOpcode(MI.getOpcode()))
+ if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+ return MI.getOperand(0).getReg();
return 0;
}
-unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameLoadOpcode(MI->getOpcode())) {
+ if (isFrameLoadOpcode(MI.getOpcode())) {
unsigned Reg;
if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
return Reg;
@@ -2242,18 +2365,18 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
return 0;
}
-unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI->getOpcode()))
- if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+ if (isFrameStoreOpcode(MI.getOpcode()))
+ if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
isFrameOperand(MI, 0, FrameIndex))
- return MI->getOperand(X86::AddrNumOperands).getReg();
+ return MI.getOperand(X86::AddrNumOperands).getReg();
return 0;
}
-unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- if (isFrameStoreOpcode(MI->getOpcode())) {
+ if (isFrameStoreOpcode(MI.getOpcode())) {
unsigned Reg;
if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
return Reg;
@@ -2281,10 +2404,9 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool
-X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA) const {
- switch (MI->getOpcode()) {
+bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
default: break;
case X86::MOV8rm:
case X86::MOV16rm:
@@ -2345,18 +2467,18 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::VMOVUPSZ256rm:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
- if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
- MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
- MI->getOperand(1+X86::AddrIndexReg).isReg() &&
- MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
- MI->isInvariantLoad(AA)) {
- unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+ if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
+ MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ MI.isInvariantLoad(AA)) {
+ unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
- if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
+ if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
return false;
- const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return regIsPICBase(BaseReg, MRI);
}
@@ -2365,18 +2487,18 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::LEA32r:
case X86::LEA64r: {
- if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
- MI->getOperand(1+X86::AddrIndexReg).isReg() &&
- MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
- !MI->getOperand(1+X86::AddrDisp).isReg()) {
+ if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+ MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+ MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ !MI.getOperand(1 + X86::AddrDisp).isReg()) {
// lea fi#, lea GV, etc. are all rematerializable.
- if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
+ if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
return true;
- unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
+ unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
- const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return regIsPICBase(BaseReg, MRI);
}
@@ -2469,10 +2591,10 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
+ const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = false;
- for (const MachineOperand &MO : Orig->operands()) {
+ for (const MachineOperand &MO : Orig.operands()) {
if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
ClobbersEFLAGS = true;
break;
@@ -2483,7 +2605,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
int Value;
- switch (Orig->getOpcode()) {
+ switch (Orig.getOpcode()) {
case X86::MOV32r0: Value = 0; break;
case X86::MOV32r1: Value = 1; break;
case X86::MOV32r_1: Value = -1; break;
@@ -2491,22 +2613,23 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
llvm_unreachable("Unexpected instruction!");
}
- DebugLoc DL = Orig->getDebugLoc();
- BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
- .addImm(Value);
+ const DebugLoc &DL = Orig.getDebugLoc();
+ BuildMI(MBB, I, DL, get(X86::MOV32ri))
+ .addOperand(Orig.getOperand(0))
+ .addImm(Value);
} else {
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
MBB.insert(I, MI);
}
- MachineInstr *NewMI = std::prev(I);
- NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+ MachineInstr &NewMI = *std::prev(I);
+ NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
}
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
-bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef() &&
MO.getReg() == X86::EFLAGS && !MO.isDead()) {
return true;
@@ -2516,11 +2639,11 @@ bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const {
}
/// Return the truncated shift count of a machine operand.
-inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
unsigned ShiftAmtOperandIdx) {
// The shift count is six bits with the REX.W prefix and five bits without.
- unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
- unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+ unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+ unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
return Imm & ShiftCountMask;
}
@@ -2535,11 +2658,11 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
return ShAmt < 4 && ShAmt > 0;
}
-bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
- unsigned Opc, bool AllowSP,
- unsigned &NewSrc, bool &isKill, bool &isUndef,
+bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP, unsigned &NewSrc,
+ bool &isKill, bool &isUndef,
MachineOperand &ImplicitOp) const {
- MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFunction &MF = *MI.getParent()->getParent();
const TargetRegisterClass *RC;
if (AllowSP) {
RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
@@ -2571,7 +2694,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
MachineBasicBlock::LivenessQueryResult LQR =
- MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+ MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
switch (LQR) {
case MachineBasicBlock::LQR_Unknown:
@@ -2579,7 +2702,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
// formation.
return false;
case MachineBasicBlock::LQR_Live:
- isKill = MI->killsRegister(SrcReg);
+ isKill = MI.killsRegister(SrcReg);
isUndef = false;
break;
default:
@@ -2592,9 +2715,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(TargetOpcode::COPY))
- .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
.addOperand(Src);
// Which is obviously going to be dead after we're done with it.
@@ -2609,16 +2731,14 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
/// LEA to form 3-address code by promoting to a 32-bit superregister and then
/// truncating back down to a 16-bit subregister.
-MachineInstr *
-X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
- MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
- MachineInstr *MI = MBBI;
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isKill = MI->getOperand(1).isKill();
+MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
+ unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
+ LiveVariables *LV) const {
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ unsigned Dest = MI.getOperand(0).getReg();
+ unsigned Src = MI.getOperand(1).getReg();
+ bool isDead = MI.getOperand(0).isDead();
+ bool isKill = MI.getOperand(1).isKill();
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
@@ -2638,19 +2758,19 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
// leal -65(%rdx), %esi
// But testing has shown this *does* help performance in 64-bit mode (at
// least on modern x86 machines).
- BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
MachineInstr *InsMI =
- BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
- .addReg(leaInReg, RegState::Define, X86::sub_16bit)
- .addReg(Src, getKillRegState(isKill));
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+ .addReg(Src, getKillRegState(isKill));
- MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
- get(Opc), leaOutReg);
+ MachineInstrBuilder MIB =
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
case X86::SHL16ri: {
- unsigned ShAmt = MI->getOperand(2).getImm();
- MIB.addReg(0).addImm(1 << ShAmt)
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0).addImm(1ULL << ShAmt)
.addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
break;
}
@@ -2664,12 +2784,12 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
- addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+ addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
break;
case X86::ADD16rr:
case X86::ADD16rr_DB: {
- unsigned Src2 = MI->getOperand(2).getReg();
- bool isKill2 = MI->getOperand(2).isKill();
+ unsigned Src2 = MI.getOperand(2).getReg();
+ bool isKill2 = MI.getOperand(2).isKill();
unsigned leaInReg2 = 0;
MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
@@ -2683,33 +2803,32 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
      // we'll be shifting and then extracting the lower 16 bits.
- BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
- InsMI2 =
- BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
- .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
- .addReg(Src2, getKillRegState(isKill2));
+ BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+ InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+ .addReg(Src2, getKillRegState(isKill2));
addRegReg(MIB, leaInReg, true, leaInReg2, true);
}
if (LV && isKill2 && InsMI2)
- LV->replaceKillInstruction(Src2, MI, InsMI2);
+ LV->replaceKillInstruction(Src2, MI, *InsMI2);
break;
}
}
MachineInstr *NewMI = MIB;
MachineInstr *ExtMI =
- BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+ BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
if (LV) {
// Update live variables
LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
if (isKill)
- LV->replaceKillInstruction(Src, MI, InsMI);
+ LV->replaceKillInstruction(Src, MI, *InsMI);
if (isDead)
- LV->replaceKillInstruction(Dest, MI, ExtMI);
+ LV->replaceKillInstruction(Dest, MI, *ExtMI);
}
return ExtMI;
@@ -2727,20 +2846,17 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
///
MachineInstr *
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
- MachineInstr *MI = MBBI;
-
+ MachineInstr &MI, LiveVariables *LV) const {
  // The following opcodes also set the condition code register(s). Only
  // convert them to an equivalent LEA if the condition code register defs
  // are dead!
if (hasLiveCondCodeDef(MI))
return nullptr;
- MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFunction &MF = *MI.getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
- const MachineOperand &Dest = MI->getOperand(0);
- const MachineOperand &Src = MI->getOperand(1);
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
MachineInstr *NewMI = nullptr;
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
@@ -2749,11 +2865,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
bool DisableLEA16 = true;
bool is64Bit = Subtarget.is64Bit();
- unsigned MIOpc = MI->getOpcode();
+ unsigned MIOpc = MI.getOpcode();
switch (MIOpc) {
default: return nullptr;
case X86::SHL64ri: {
- assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
@@ -2763,13 +2879,17 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
&X86::GR64_NOSPRegClass))
return nullptr;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest)
- .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addOperand(Src)
+ .addImm(0)
+ .addReg(0);
break;
}
case X86::SHL32ri: {
- assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
@@ -2783,11 +2903,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg, isKill, isUndef, ImplicitOp))
return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest)
- .addReg(0).addImm(1 << ShAmt)
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addImm(0).addReg(0);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addImm(0)
+ .addReg(0);
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
NewMI = MIB;
@@ -2795,20 +2918,25 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
case X86::SHL16ri: {
- assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+ : nullptr;
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addReg(0)
+ .addImm(1ULL << ShAmt)
+ .addOperand(Src)
+ .addImm(0)
+ .addReg(0);
break;
}
case X86::INC64r:
case X86::INC32r: {
- assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill, isUndef;
@@ -2818,9 +2946,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg, isKill, isUndef, ImplicitOp))
return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest)
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg,
+ getKillRegState(isKill) | getUndefRegState(isUndef));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
@@ -2829,15 +2959,17 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::INC16r:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
- assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest).addOperand(Src), 1);
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ 1);
break;
case X86::DEC64r:
case X86::DEC32r: {
- assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
@@ -2848,9 +2980,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg, isKill, isUndef, ImplicitOp))
return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest)
- .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) |
+ getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
@@ -2860,17 +2993,19 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::DEC16r:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
- assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest).addOperand(Src), -1);
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ -1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD32rr:
case X86::ADD32rr_DB: {
- assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc;
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
Opc = X86::LEA64r;
@@ -2884,7 +3019,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg, isKill, isUndef, ImplicitOp))
return nullptr;
- const MachineOperand &Src2 = MI->getOperand(2);
+ const MachineOperand &Src2 = MI.getOperand(2);
bool isKill2, isUndef2;
unsigned SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
@@ -2892,8 +3027,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg2, isKill2, isUndef2, ImplicitOp2))
return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
if (ImplicitOp2.getReg() != 0)
@@ -2906,45 +3041,46 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI->getOperand(3).setIsUndef(isUndef2);
if (LV && Src2.isKill())
- LV->replaceKillInstruction(SrcReg2, MI, NewMI);
+ LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
break;
}
case X86::ADD16rr:
case X86::ADD16rr_DB: {
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
- assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- unsigned Src2 = MI->getOperand(2).getReg();
- bool isKill2 = MI->getOperand(2).isKill();
- NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest),
- Src.getReg(), Src.isKill(), Src2, isKill2);
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI.getOperand(2).getReg();
+ bool isKill2 = MI.getOperand(2).isKill();
+ NewMI = addRegReg(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
- bool isUndef = MI->getOperand(1).isUndef();
- bool isUndef2 = MI->getOperand(2).isUndef();
+ bool isUndef = MI.getOperand(1).isUndef();
+ bool isUndef2 = MI.getOperand(2).isUndef();
NewMI->getOperand(1).setIsUndef(isUndef);
NewMI->getOperand(3).setIsUndef(isUndef2);
if (LV && isKill2)
- LV->replaceKillInstruction(Src2, MI, NewMI);
+ LV->replaceKillInstruction(Src2, MI, *NewMI);
break;
}
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
- assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest).addOperand(Src),
- MI->getOperand(2).getImm());
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ MI.getOperand(2).getImm());
break;
case X86::ADD32ri:
case X86::ADD32ri8:
case X86::ADD32ri_DB:
case X86::ADD32ri8_DB: {
- assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill, isUndef;
@@ -2954,13 +3090,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg, isKill, isUndef, ImplicitOp))
return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest)
- .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) |
+ getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
- NewMI = addOffset(MIB, MI->getOperand(2).getImm());
+ NewMI = addOffset(MIB, MI.getOperand(2).getImm());
break;
}
case X86::ADD16ri:
@@ -2968,12 +3105,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
if (DisableLEA16)
- return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
- assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest).addOperand(Src),
- MI->getOperand(2).getImm());
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest)
+ .addOperand(Src),
+ MI.getOperand(2).getImm());
break;
}
@@ -2981,12 +3119,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (LV) { // Update live variables
if (Src.isKill())
- LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+ LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
if (Dest.isDead())
- LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
+ LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
}
- MFI->insert(MBBI, NewMI); // Insert the new inst
+ MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
return NewMI;
}
@@ -3142,11 +3280,16 @@ static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
llvm_unreachable("Opcode not handled by the switch");
}
-MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
- bool NewMI,
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
- switch (MI->getOpcode()) {
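+  // Commute on a fresh clone when the caller requests a new instruction
+  // (NewMI); otherwise commute MI in place.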
+ auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+ if (NewMI)
+ return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+ return MI;
+ };
+
+ switch (MI.getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
@@ -3155,7 +3298,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
unsigned Opc;
unsigned Size;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
@@ -3164,15 +3307,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
}
- unsigned Amt = MI->getOperand(3).getImm();
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->setDesc(get(Opc));
- MI->getOperand(3).setImm(Size-Amt);
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ unsigned Amt = MI.getOperand(3).getImm();
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(3).setImm(Size - Amt);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
@@ -3186,7 +3326,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
unsigned Mask;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::BLENDPDrri: Mask = 0x03; break;
case X86::BLENDPSrri: Mask = 0x0F; break;
@@ -3201,29 +3341,23 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::VPBLENDWYrri: Mask = 0xFF; break;
}
// Only the least significant bits of Imm are used.
- unsigned Imm = MI->getOperand(3).getImm() & Mask;
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->getOperand(3).setImm(Mask ^ Imm);
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ unsigned Imm = MI.getOperand(3).getImm() & Mask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Mask ^ Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
- unsigned Imm = MI->getOperand(3).getImm();
+ unsigned Imm = MI.getOperand(3).getImm();
unsigned Src1Hi = Imm & 0x01;
unsigned Src2Hi = Imm & 0x10;
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ auto &WorkingMI = cloneIfNew(MI);
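+    // With the sources swapped, each half-select bit must move to the other
+    // operand's position in the immediate.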
+ WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
case X86::CMPPDrri:
case X86::CMPPSrri:
@@ -3233,17 +3367,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::VCMPPSYrri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
default:
return nullptr;
@@ -3254,7 +3383,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::VPCOMQri: case X86::VPCOMUQri:
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
- unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
case 0x00: Imm = 0x02; break; // LT -> GT
case 0x01: Imm = 0x03; break; // LE -> GE
@@ -3267,13 +3396,21 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
default:
break;
}
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->getOperand(3).setImm(Imm);
- return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr: {
+ // Flip permute source immediate.
+    // Imm & 0x02: the low result lane selects Op1.lo/hi if set, else Op0.lo/hi.
+    // Imm & 0x20: the high result lane selects Op1.lo/hi if set, else Op0.lo/hi.
+ unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
+ auto &WorkingMI = cloneIfNew(MI);
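+    // Commuting the sources means each result lane must now select the other
+    // operand, so flip both operand-select bits (0x02 and 0x20) together.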
+ WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
@@ -3292,7 +3429,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
@@ -3343,31 +3480,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
}
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->setDesc(get(Opc));
- // Fallthrough intended.
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
default:
- if (isFMA3(MI->getOpcode())) {
+ if (isFMA3(MI.getOpcode())) {
unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
if (Opc == 0)
return nullptr;
- if (NewMI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MI = MF.CloneMachineInstr(MI);
- NewMI = false;
- }
- MI->setDesc(get(Opc));
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
+
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
-bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
+bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
@@ -3402,12 +3535,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
- unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
+ unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
// The commuted operands must have different registers.
// Otherwise, the commute transformation does not change anything and
// is useless then.
- if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
+ if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
break;
}
@@ -3427,14 +3560,13 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
}
-unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
- unsigned SrcOpIdx1,
- unsigned SrcOpIdx2) const {
- unsigned Opc = MI->getOpcode();
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+ MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const {
+ unsigned Opc = MI.getOpcode();
  // Define the array that holds FMA opcodes in groups
  // of 3 forms (132, 213, 231).
- static const unsigned RegularOpcodeGroups[][3] = {
+ static const uint16_t RegularOpcodeGroups[][3] = {
{ X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
{ X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
{ X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
@@ -3508,7 +3640,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
  // Define the array that holds FMA*_Int opcodes in groups
  // of 3 forms (132, 213, 231).
- static const unsigned IntrinOpcodeGroups[][3] = {
+ static const uint16_t IntrinOpcodeGroups[][3] = {
{ X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
{ X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
{ X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
@@ -3539,7 +3671,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
isFMA3(Opc, &IsIntrinOpcode);
size_t GroupsNum;
- const unsigned (*OpcodeGroups)[3];
+ const uint16_t (*OpcodeGroups)[3];
if (IsIntrinOpcode) {
GroupsNum = array_lengthof(IntrinOpcodeGroups);
OpcodeGroups = IntrinOpcodeGroups;
@@ -3548,7 +3680,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
OpcodeGroups = RegularOpcodeGroups;
}
- const unsigned *FoundOpcodesGroup = nullptr;
+ const uint16_t *FoundOpcodesGroup = nullptr;
size_t FormIndex;
// Look for the input opcode in the corresponding opcodes table.
@@ -3616,34 +3748,33 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
return FoundOpcodesGroup[FormIndex];
}
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
- switch (MI->getOpcode()) {
- case X86::CMPPDrri:
- case X86::CMPPSrri:
- case X86::VCMPPDrri:
- case X86::VCMPPSrri:
- case X86::VCMPPDYrri:
- case X86::VCMPPSYrri: {
- // Float comparison can be safely commuted for
- // Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI->getOperand(3).getImm() & 0x7;
- switch (Imm) {
- case 0x00: // EQUAL
- case 0x03: // UNORDERED
- case 0x04: // NOT EQUAL
- case 0x07: // ORDERED
- // The indices of the commutable operands are 1 and 2.
- // Assign them to the returned operand indices here.
- return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
- }
- return false;
+ switch (MI.getOpcode()) {
+ case X86::CMPPDrri:
+ case X86::CMPPSrri:
+ case X86::VCMPPDrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSYrri: {
+ // Float comparison can be safely commuted for
+ // Ordered/Unordered/Equal/NotEqual tests
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ // The indices of the commutable operands are 1 and 2.
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}
- default:
- if (isFMA3(MI->getOpcode()))
- return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
- return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ }
+ default:
+ if (isFMA3(MI.getOpcode()))
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
return false;
}
@@ -3791,6 +3922,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
case X86::COND_NP: return X86::COND_P;
case X86::COND_O: return X86::COND_NO;
case X86::COND_NO: return X86::COND_O;
+ case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
+ case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
}
}
@@ -3887,17 +4020,38 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
}
}
-bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
- if (!MI->isTerminator()) return false;
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator()) return false;
// Conditional branch is a special case.
- if (MI->isBranch() && !MI->isBarrier())
+ if (MI.isBranch() && !MI.isBarrier())
return true;
- if (!MI->isPredicable())
+ if (!MI.isPredicable())
return true;
return !isPredicated(MI);
}
+// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
+// not be a fallthrough MBB now due to layout changes). Return nullptr if the
+// fallthrough MBB cannot be identified.
+static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
+ MachineBasicBlock *TBB) {
+ // Look for non-EHPad successors other than TBB. If we find exactly one, it
+ // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
+ // and fallthrough MBB. If we find more than one, we cannot identify the
+ // fallthrough MBB and should return nullptr.
+ MachineBasicBlock *FallthroughBB = nullptr;
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
+ continue;
+    // Return nullptr if we find more than one fallthrough successor.
+ if (FallthroughBB && FallthroughBB != TBB)
+ return nullptr;
+ FallthroughBB = *SI;
+ }
+ return FallthroughBB;
+}
+
bool X86InstrInfo::AnalyzeBranchImpl(
MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -3914,7 +4068,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
// Working from the bottom, when we see a non-terminator instruction, we're
// done.
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
break;
// A terminator that isn't a branch can't easily be handled by this
@@ -4000,7 +4154,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
FBB = TBB;
TBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(BranchCode));
- CondBranches.push_back(I);
+ CondBranches.push_back(&*I);
continue;
}
@@ -4010,41 +4164,56 @@ bool X86InstrInfo::AnalyzeBranchImpl(
assert(Cond.size() == 1);
assert(TBB);
- // Only handle the case where all conditional branches branch to the same
- // destination.
- if (TBB != I->getOperand(0).getMBB())
- return true;
-
// If the conditions are the same, we can leave them alone.
X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
- if (OldBranchCode == BranchCode)
+ auto NewTBB = I->getOperand(0).getMBB();
+ if (OldBranchCode == BranchCode && TBB == NewTBB)
continue;
// If they differ, see if they fit one of the known patterns. Theoretically,
// we could handle more patterns here, but we shouldn't expect to see them
// if instruction selection has done a reasonable job.
- if ((OldBranchCode == X86::COND_NP &&
- BranchCode == X86::COND_E) ||
- (OldBranchCode == X86::COND_E &&
- BranchCode == X86::COND_NP))
- BranchCode = X86::COND_NP_OR_E;
- else if ((OldBranchCode == X86::COND_P &&
- BranchCode == X86::COND_NE) ||
- (OldBranchCode == X86::COND_NE &&
- BranchCode == X86::COND_P))
+ if (TBB == NewTBB &&
+ ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
BranchCode = X86::COND_NE_OR_P;
- else
+ } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
+ if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
+ return true;
+
+ // X86::COND_E_AND_NP usually has two different branch destinations.
+ //
+ // JP B1
+ // JE B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+ // Here this condition branches to B2 only if NP && E. It has another
+ // equivalent form:
+ //
+ // JNE B1
+ // JNP B2
+ // JMP B1
+ // B1:
+ // B2:
+ //
+      // Similarly, it branches to B2 only if E && NP, which is why this
+      // condition is named COND_E_AND_NP.
+ BranchCode = X86::COND_E_AND_NP;
+ } else
return true;
// Update the MachineOperand.
Cond[0].setImm(BranchCode);
- CondBranches.push_back(I);
+ CondBranches.push_back(&*I);
}
return false;
}
-bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -4053,7 +4222,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
}
-bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineBranchPredicate &MBP,
bool AllowModify) const {
using namespace std::placeholders;
@@ -4142,10 +4311,11 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return Count;
}
-unsigned
-X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
@@ -4158,17 +4328,13 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return 1;
}
+ // If FBB is null, it is implied to be a fall-through block.
+ bool FallThru = FBB == nullptr;
+
// Conditional branch.
unsigned Count = 0;
X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
switch (CC) {
- case X86::COND_NP_OR_E:
- // Synthesize NP_OR_E with two branches.
- BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
- ++Count;
- BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB);
- ++Count;
- break;
case X86::COND_NE_OR_P:
// Synthesize NE_OR_P with two branches.
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
@@ -4176,13 +4342,26 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
++Count;
break;
+ case X86::COND_E_AND_NP:
+ // Use the next block of MBB as FBB if it is null.
+ if (FBB == nullptr) {
+ FBB = getFallThroughMBB(&MBB, TBB);
+ assert(FBB && "MBB cannot be the last block in function when the false "
+ "body is a fall-through.");
+ }
+ // Synthesize COND_E_AND_NP with two branches.
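+    // JNE jumps to FBB unless ZF is set; only on E do we fall through to JNP,
+    // so TBB is reached exactly when E && NP.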
+ BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
+ ++Count;
+ BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
+ ++Count;
+ break;
default: {
unsigned Opc = GetCondBranchFromCond(CC);
BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
++Count;
}
}
- if (FBB) {
+ if (!FallThru) {
// Two-way Conditional branch. Insert the second branch.
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
++Count;
@@ -4228,15 +4407,16 @@ canInsertSelect(const MachineBasicBlock &MBB,
}
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- assert(Cond.size() == 1 && "Invalid Cond array");
- unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
- MRI.getRegClass(DstReg)->getSize(),
- false/*HasMemoryOperand*/);
- BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+ MRI.getRegClass(DstReg)->getSize(),
+ false /*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
}
/// Test if the given register is a physical h register.
@@ -4258,16 +4438,18 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
- return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
- X86::MOVPQIto64rr);
+ return HasAVX512 ? X86::VMOVPQIto64Zrr :
+ HasAVX ? X86::VMOVPQIto64rr :
+ X86::MOVPQIto64rr;
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
return X86::MMX_MOVD64from64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128XRegClass.contains(DestReg))
- return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
- X86::MOV64toPQIrr);
+ return HasAVX512 ? X86::VMOV64toPQIZrr :
+ HasAVX ? X86::VMOV64toPQIrr :
+ X86::MOV64toPQIrr;
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
return X86::MMX_MOVD64to64rr;
@@ -4276,22 +4458,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(FR32) -> DestReg(GR32)
// SrcReg(GR32) -> DestReg(FR32)
- if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
+ if (X86::GR32RegClass.contains(DestReg) &&
+ X86::FR32XRegClass.contains(SrcReg))
// Copy from a FR32 register to a GR32 register.
- return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
+ return HasAVX512 ? X86::VMOVSS2DIZrr :
+ HasAVX ? X86::VMOVSS2DIrr :
+ X86::MOVSS2DIrr;
- if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ if (X86::FR32XRegClass.contains(DestReg) &&
+ X86::GR32RegClass.contains(SrcReg))
// Copy from a GR32 register to a FR32 register.
- return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+ return HasAVX512 ? X86::VMOVDI2SSZrr :
+ HasAVX ? X86::VMOVDI2SSrr :
+ X86::MOVDI2SSrr;
return 0;
}
+static bool isMaskRegClass(const TargetRegisterClass *RC) {
+ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+ return X86::VK16RegClass.hasSubClassEq(RC);
+}
+
static bool MaskRegClassContains(unsigned Reg) {
- return X86::VK8RegClass.contains(Reg) ||
- X86::VK16RegClass.contains(Reg) ||
- X86::VK32RegClass.contains(Reg) ||
- X86::VK64RegClass.contains(Reg) ||
- X86::VK1RegClass.contains(Reg);
+  // All KMASK register classes hold the same k registers; any one will do.
+ return X86::VK16RegClass.contains(Reg);
}
static bool GRRegClassContains(unsigned Reg) {
@@ -4338,13 +4528,22 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
if (Subtarget.hasBWI())
if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
return Opc;
- if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
- X86::VR256XRegClass.contains(DestReg, SrcReg) ||
- X86::VR512RegClass.contains(DestReg, SrcReg)) {
- DestReg = get512BitSuperRegister(DestReg);
- SrcReg = get512BitSuperRegister(SrcReg);
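+  // With VLX, 128- and 256-bit registers can be copied directly; without it,
+  // widen both registers to their 512-bit super-registers and copy those.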
+ if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.hasVLX())
+ return X86::VMOVAPSZ128rr;
+ DestReg = get512BitSuperRegister(DestReg);
+ SrcReg = get512BitSuperRegister(SrcReg);
+ return X86::VMOVAPSZrr;
+ }
+ if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.hasVLX())
+ return X86::VMOVAPSZ256rr;
+ DestReg = get512BitSuperRegister(DestReg);
+ SrcReg = get512BitSuperRegister(SrcReg);
+ return X86::VMOVAPSZrr;
+ }
+ if (X86::VR512RegClass.contains(DestReg, SrcReg))
return X86::VMOVAPSZrr;
- }
if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
return X86::KMOVWkk;
if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
@@ -4359,9 +4558,9 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
}
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
@@ -4455,22 +4654,33 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// first frame index.
// See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
-
- bool AXDead = (Reg == AX) ||
- (MachineBasicBlock::LQR_Dead ==
- MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
- if (!AXDead) {
- // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may
- // actually be dead. This is not a problem for correctness as we are just
- // (unnecessarily) saving+restoring a dead register. However the
- // MachineVerifier expects operands that read from dead registers
- // to be marked with the "undef" flag.
- // An example of this can be found in
- // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and
- // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using
- // -verify-machineinstrs.
- BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MBB.computeRegisterLiveness(TRI, AX, MI);
+ // We do not want to save and restore AX if we do not have to.
+ // Moreover, if we do so while AX is dead, we would need to set
+ // an undef flag on the use of AX, otherwise the verifier will
+ // complain that we read an undef value.
+ // We do not want to change the behavior of the machine verifier
+ // as it is usually wrong to read an undef value.
+ if (MachineBasicBlock::LQR_Unknown == LQR) {
+ LivePhysRegs LPR(TRI);
+ LPR.addLiveOuts(MBB);
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MI) {
+ --I;
+ LPR.stepBackward(*I);
+ }
+ // AX contains the topmost register in the aliasing hierarchy.
+ // It may not be live, but one of its aliases may be.
+ for (MCRegAliasIterator AI(AX, TRI, true);
+ AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
+ LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
+ : MachineBasicBlock::LQR_Dead;
}
+ bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
if (FromEFLAGS) {
BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
BuildMI(MBB, MI, DL, get(X86::LAHF));
@@ -4493,15 +4703,28 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Cannot emit physreg copy instruction");
}
+static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
+ bool load) {
+ switch (RC->getSize()) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 2:
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
+ case 4:
+ return load ? X86::KMOVDkm : X86::KMOVDmk;
+ case 8:
+ return load ? X86::KMOVQkm : X86::KMOVQmk;
+ }
+}
+
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI,
bool load) {
if (STI.hasAVX512()) {
- if (X86::VK8RegClass.hasSubClassEq(RC) ||
- X86::VK16RegClass.hasSubClassEq(RC))
- return load ? X86::KMOVWkm : X86::KMOVWmk;
+ if (isMaskRegClass(RC))
+ return getLoadStoreMaskRegOpcode(RC, load);
if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
@@ -4554,25 +4777,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
assert((X86::VR128RegClass.hasSubClassEq(RC) ||
X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
+ if (X86::VR128RegClass.hasSubClassEq(RC)) {
+ if (isStackAligned)
+ return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
+ : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
+ : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ }
+ assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ?
- (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
- (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr;
else
- return load ?
- (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
- (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr;
}
case 32:
assert((X86::VR256RegClass.hasSubClassEq(RC) ||
X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
+ if (X86::VR256RegClass.hasSubClassEq(RC)) {
+ if (isStackAligned)
+ return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+ else
+ return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+ }
+ assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+ return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr;
else
- return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+ return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr;
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+ assert(STI.hasVLX() && "Using 512-bit register requires AVX512");
if (isStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
@@ -4580,25 +4816,29 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
}
-bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg,
- unsigned &Offset,
+bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const {
- const MCInstrDesc &Desc = MemOp->getDesc();
- int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode());
+ const MCInstrDesc &Desc = MemOp.getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
if (MemRefBegin < 0)
return false;
MemRefBegin += X86II::getOperandBias(Desc);
- BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg();
- if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+ MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+ if (!BaseMO.isReg()) // Can be an MO_FrameIndex
+ return false;
+
+ BaseReg = BaseMO.getReg();
+ if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
return false;
- if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+ if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
X86::NoRegister)
return false;
- const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp);
+ const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
// Displacement can be symbolic
if (!DispMO.isImm())
@@ -4606,8 +4846,8 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg,
Offset = DispMO.getImm();
- return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
- X86::NoRegister);
+ return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
+ X86::NoRegister;
}
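
For reference, a minimal standalone sketch (plain C++, not LLVM code; names are illustrative) of the addressing-mode assumption behind getMemOpBaseRegImmOfs: an X86 memory operand encodes Base + Scale * Index + Disp, and the function only reports a (BaseReg, Offset) pair when the scale is 1 and no index register is present, i.e. when the address really is Base + Disp.

#include <cassert>
#include <cstdint>

// Illustrative model of an X86 addressing mode, not LLVM code.
static uint64_t effectiveAddress(uint64_t Base, unsigned Scale, uint64_t Index,
                                 int64_t Disp) {
  return Base + Scale * Index + Disp;
}

int main() {
  // With Scale == 1 and no index register the address collapses to
  // Base + Disp, the only form getMemOpBaseRegImmOfs reports.
  uint64_t Base = 0x1000;
  assert(effectiveAddress(Base, /*Scale=*/1, /*Index=*/0, /*Disp=*/16) ==
         Base + 16);
  return 0;
}
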
static unsigned getStoreRegOpcode(unsigned SrcReg,
@@ -4697,10 +4937,10 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
NewMIs.push_back(MIB);
}
-bool X86InstrInfo::
-analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
- int &CmpMask, int &CmpValue) const {
- switch (MI->getOpcode()) {
+bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI.getOpcode()) {
default: break;
case X86::CMP64ri32:
case X86::CMP64ri8:
@@ -4709,17 +4949,17 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
- SrcReg = MI->getOperand(0).getReg();
+ SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(1).getImm();
+ CmpValue = MI.getOperand(1).getImm();
return true;
// A SUB can be used to perform comparison.
case X86::SUB64rm:
case X86::SUB32rm:
case X86::SUB16rm:
case X86::SUB8rm:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = 0;
@@ -4728,8 +4968,8 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr:
- SrcReg = MI->getOperand(1).getReg();
- SrcReg2 = MI->getOperand(2).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
@@ -4740,17 +4980,17 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
- SrcReg = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(2).getImm();
+ CmpValue = MI.getOperand(2).getImm();
return true;
case X86::CMP64rr:
case X86::CMP32rr:
case X86::CMP16rr:
case X86::CMP8rr:
- SrcReg = MI->getOperand(0).getReg();
- SrcReg2 = MI->getOperand(1).getReg();
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
@@ -4758,8 +4998,9 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
case X86::TEST16rr:
case X86::TEST32rr:
case X86::TEST64rr:
- SrcReg = MI->getOperand(0).getReg();
- if (MI->getOperand(1).getReg() != SrcReg) return false;
+ SrcReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(1).getReg() != SrcReg)
+ return false;
// Compare against zero.
SrcReg2 = 0;
CmpMask = ~0;
@@ -4775,47 +5016,40 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
/// This function can be extended later on.
/// SrcReg, SrcRegs: register operands for FlagI.
/// ImmValue: immediate for FlagI if it takes an immediate.
-inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
unsigned SrcReg2, int ImmValue,
- MachineInstr *OI) {
- if (((FlagI->getOpcode() == X86::CMP64rr &&
- OI->getOpcode() == X86::SUB64rr) ||
- (FlagI->getOpcode() == X86::CMP32rr &&
- OI->getOpcode() == X86::SUB32rr)||
- (FlagI->getOpcode() == X86::CMP16rr &&
- OI->getOpcode() == X86::SUB16rr)||
- (FlagI->getOpcode() == X86::CMP8rr &&
- OI->getOpcode() == X86::SUB8rr)) &&
- ((OI->getOperand(1).getReg() == SrcReg &&
- OI->getOperand(2).getReg() == SrcReg2) ||
- (OI->getOperand(1).getReg() == SrcReg2 &&
- OI->getOperand(2).getReg() == SrcReg)))
+ MachineInstr &OI) {
+ if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
+ (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
+ (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
+ (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
+ ((OI.getOperand(1).getReg() == SrcReg &&
+ OI.getOperand(2).getReg() == SrcReg2) ||
+ (OI.getOperand(1).getReg() == SrcReg2 &&
+ OI.getOperand(2).getReg() == SrcReg)))
return true;
- if (((FlagI->getOpcode() == X86::CMP64ri32 &&
- OI->getOpcode() == X86::SUB64ri32) ||
- (FlagI->getOpcode() == X86::CMP64ri8 &&
- OI->getOpcode() == X86::SUB64ri8) ||
- (FlagI->getOpcode() == X86::CMP32ri &&
- OI->getOpcode() == X86::SUB32ri) ||
- (FlagI->getOpcode() == X86::CMP32ri8 &&
- OI->getOpcode() == X86::SUB32ri8) ||
- (FlagI->getOpcode() == X86::CMP16ri &&
- OI->getOpcode() == X86::SUB16ri) ||
- (FlagI->getOpcode() == X86::CMP16ri8 &&
- OI->getOpcode() == X86::SUB16ri8) ||
- (FlagI->getOpcode() == X86::CMP8ri &&
- OI->getOpcode() == X86::SUB8ri)) &&
- OI->getOperand(1).getReg() == SrcReg &&
- OI->getOperand(2).getImm() == ImmValue)
+ if (((FlagI.getOpcode() == X86::CMP64ri32 &&
+ OI.getOpcode() == X86::SUB64ri32) ||
+ (FlagI.getOpcode() == X86::CMP64ri8 &&
+ OI.getOpcode() == X86::SUB64ri8) ||
+ (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
+ (FlagI.getOpcode() == X86::CMP32ri8 &&
+ OI.getOpcode() == X86::SUB32ri8) ||
+ (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
+ (FlagI.getOpcode() == X86::CMP16ri8 &&
+ OI.getOpcode() == X86::SUB16ri8) ||
+ (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
+ OI.getOperand(1).getReg() == SrcReg &&
+ OI.getOperand(2).getImm() == ImmValue)
return true;
return false;
}
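
The underlying assumption, shown with a deliberately simplified flag model (not LLVM code and not a complete EFLAGS emulation): CMP performs the same subtraction as SUB and sets the flags identically, it merely discards the result, so an earlier SUB on the same operands already left the flags the CMP would compute.

#include <cassert>
#include <cstdint>

// Simplified model of the flags a 32-bit subtraction produces (ZF/SF/CF only).
struct Flags { bool ZF, SF, CF; };

static Flags flagsOfSub(uint32_t A, uint32_t B) {
  uint32_t R = A - B;
  return {R == 0, (R >> 31) != 0, A < B};
}

int main() {
  uint32_t A = 5, B = 9;
  uint32_t SubResult = A - B;        // SUB32rr keeps the result...
  Flags FromSub = flagsOfSub(A, B);  // ...and sets the flags.
  Flags FromCmp = flagsOfSub(A, B);  // CMP32rr sets the same flags, no result.
  assert(FromSub.ZF == FromCmp.ZF && FromSub.SF == FromCmp.SF &&
         FromSub.CF == FromCmp.CF);
  (void)SubResult;
  return 0;
}
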
/// Check whether the definition can be converted
/// to remove a comparison against zero.
-inline static bool isDefConvertible(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+inline static bool isDefConvertible(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return false;
// The shift instructions only modify ZF if their shift count is non-zero.
@@ -4899,8 +5133,8 @@ inline static bool isDefConvertible(MachineInstr *MI) {
}
/// Check whether the use can be converted to remove a comparison against zero.
-static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
case X86::LZCNT16rr: case X86::LZCNT16rm:
case X86::LZCNT32rr: case X86::LZCNT32rm:
@@ -4920,13 +5154,13 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
/// Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
-bool X86InstrInfo::
-optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
- int CmpMask, int CmpValue,
- const MachineRegisterInfo *MRI) const {
+bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask,
+ int CmpValue,
+ const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
unsigned NewOpcode = 0;
- switch (CmpInstr->getOpcode()) {
+ switch (CmpInstr.getOpcode()) {
default: break;
case X86::SUB64ri32:
case X86::SUB64ri8:
@@ -4943,10 +5177,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr: {
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
// There is no use of the destination register, we can replace SUB with CMP.
- switch (CmpInstr->getOpcode()) {
+ switch (CmpInstr.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
@@ -4964,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
}
- CmpInstr->setDesc(get(NewOpcode));
- CmpInstr->RemoveOperand(0);
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.RemoveOperand(0);
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
@@ -4983,7 +5217,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// If we are comparing against zero, check whether we can use MI to update
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
- if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
+ if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
return false;
// If we have a use of the source register between the def and our compare
@@ -4991,19 +5225,20 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// right way.
bool ShouldUpdateCC = false;
X86::CondCode NewCC = X86::COND_INVALID;
- if (IsCmpZero && !isDefConvertible(MI)) {
+ if (IsCmpZero && !isDefConvertible(*MI)) {
// Scan forward from the use until we hit the use we're looking for or the
// compare instruction.
for (MachineBasicBlock::iterator J = MI;; ++J) {
// Do we have a convertible instruction?
- NewCC = isUseDefConvertible(J);
+ NewCC = isUseDefConvertible(*J);
if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
J->getOperand(1).getReg() == SrcReg) {
assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
ShouldUpdateCC = true; // Update CC later on.
// This is not a def of SrcReg, but still a def of EFLAGS. Keep going
// with the new def.
- MI = Def = J;
+ Def = J;
+ MI = &*Def;
break;
}
@@ -5024,29 +5259,29 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// otherwise, RE is the rend of the basic block.
MachineBasicBlock::reverse_iterator
RI = MachineBasicBlock::reverse_iterator(I),
- RE = CmpInstr->getParent() == MI->getParent() ?
- MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
- CmpInstr->getParent()->rend();
+ RE = CmpInstr.getParent() == MI->getParent()
+ ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */
+ : CmpInstr.getParent()->rend();
MachineInstr *Movr0Inst = nullptr;
for (; RI != RE; ++RI) {
- MachineInstr *Instr = &*RI;
+ MachineInstr &Instr = *RI;
// Check whether CmpInstr can be made redundant by the current instruction.
if (!IsCmpZero &&
isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
- Sub = Instr;
+ Sub = &Instr;
break;
}
- if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
- Instr->readsRegister(X86::EFLAGS, TRI)) {
+ if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr.readsRegister(X86::EFLAGS, TRI)) {
// This instruction modifies or uses EFLAGS.
// MOV32r0 etc. are implemented with xor which clobbers condition code.
// They are safe to move up, if the definition to EFLAGS is dead and
// earlier instructions do not read or write EFLAGS.
- if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
- Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
- Movr0Inst = Instr;
+ if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+ Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = &Instr;
continue;
}
@@ -5068,7 +5303,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// live-out.
bool IsSafe = false;
SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
- MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+ MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
for (++I; I != E; ++I) {
const MachineInstr &Instr = *I;
bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
@@ -5159,7 +5394,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// If EFLAGS is not killed nor re-defined, we should check whether it is
// live-out. If it is live-out, do not optimize.
if ((IsCmpZero || IsSwapped) && !IsSafe) {
- MachineBasicBlock *MBB = CmpInstr->getParent();
+ MachineBasicBlock *MBB = CmpInstr.getParent();
for (MachineBasicBlock *Successor : MBB->successors())
if (Successor->isLiveIn(X86::EFLAGS))
return false;
@@ -5199,7 +5434,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
}
assert(i != e && "Unable to locate a def EFLAGS operand");
- CmpInstr->eraseFromParent();
+ CmpInstr.eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
for (auto &Op : OpsToUpdate)
@@ -5211,14 +5446,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
/// operand at the use. We fold the load instructions if load defines a virtual
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
-MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
if (FoldAsLoadDefReg == 0)
return nullptr;
// To be conservative, if there exists another load, clear the load candidate.
- if (MI->mayLoad()) {
+ if (MI.mayLoad()) {
FoldAsLoadDefReg = 0;
return nullptr;
}
@@ -5233,8 +5468,8 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
// Collect information about virtual register operands of MI.
unsigned SrcOperandId = 0;
bool FoundSrcOperand = false;
- for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
@@ -5251,7 +5486,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
return nullptr;
// Check whether we can fold the def into SrcOperandId.
- if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) {
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
@@ -5313,6 +5548,60 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
return true;
}
+bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ int64_t Imm = MIB->getOperand(1).getImm();
+ assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+ MachineBasicBlock::iterator I = MIB.getInstr();
+
+ int StackAdjustment;
+
+ if (Subtarget.is64Bit()) {
+ assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+ MIB->getOpcode() == X86::MOV32ImmSExti8);
+
+ // Can't use push/pop lowering if the function might write to the red zone.
+ X86MachineFunctionInfo *X86FI =
+ MBB.getParent()->getInfo<X86MachineFunctionInfo>();
+ if (X86FI->getUsesRedZone()) {
+ MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri
+ : X86::MOV64ri));
+ return true;
+ }
+
+ // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
+ // widen the register if necessary.
+ StackAdjustment = 8;
+ BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
+ MIB->setDesc(get(X86::POP64r));
+ MIB->getOperand(0)
+ .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
+ } else {
+ assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+ StackAdjustment = 4;
+ BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
+ MIB->setDesc(get(X86::POP32r));
+ }
+
+ // Build CFI if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsDwarfCFI =
+ !IsWin64Prologue &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+ bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
+ if (EmitCFI) {
+ TFL->BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
+ TFL->BuildCFI(MBB, std::next(I), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
+ }
+
+ return true;
+}
+
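
For intuition, a minimal standalone sketch (plain C++, not part of the patch) of the value this push/pop expansion materializes: MOV32ImmSExti8 and MOV64ImmSExti8 carry an 8-bit immediate, and PUSH32i8/PUSH64i8 sign-extend it to the full register width before POP32r/POP64r reloads it into the destination.

#include <cassert>
#include <cstdint>

int main() {
  int8_t Imm = -1; // the 8-bit immediate carried by MOV32ImmSExti8/MOV64ImmSExti8
  // What POP32r leaves in a 32-bit register after PUSH32i8 sign-extends Imm.
  uint32_t Reg32 = static_cast<uint32_t>(static_cast<int32_t>(Imm));
  // What POP64r leaves in a 64-bit register after PUSH64i8 sign-extends Imm.
  uint64_t Reg64 = static_cast<uint64_t>(static_cast<int64_t>(Imm));
  assert(Reg32 == 0xffffffffu);
  assert(Reg64 == 0xffffffffffffffffull);
  return 0;
}
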
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
// code sequence is needed for other targets.
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@@ -5322,9 +5611,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
unsigned Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
- unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
@@ -5335,16 +5624,19 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
-bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
- switch (MI->getOpcode()) {
+ MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+ switch (MI.getOpcode()) {
case X86::MOV32r0:
return Expand2AddrUndef(MIB, get(X86::XOR32rr));
case X86::MOV32r1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
case X86::MOV32r_1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+ case X86::MOV32ImmSExti8:
+ case X86::MOV64ImmSExti8:
+ return ExpandMOVImmSExti8(MIB);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -5360,17 +5652,30 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::AVX_SET0:
assert(HasAVX && "AVX not supported");
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+ case X86::AVX512_128_SET0:
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
+ case X86::AVX512_256_SET0:
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX512_512_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ MIB->setDesc(get(X86::VPTERNLOGDZrri));
+ // VPTERNLOGD needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
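
A small standalone sketch of why 0xff yields all ones (this models the ternary-logic immediate as an 8-entry truth table, an assumption about the encoding rather than code from the patch): the immediate is indexed by the bit triple taken from the three sources, and 0xff maps every combination to 1, so the undef inputs do not matter.

#include <cassert>
#include <cstdint>

// Model of one bit position of a three-input ternary-logic operation.
static unsigned ternlogBit(unsigned A, unsigned B, unsigned C, uint8_t Imm) {
  unsigned Idx = (A << 2) | (B << 1) | C; // index into the 8-entry truth table
  return (Imm >> Idx) & 1;
}

int main() {
  for (unsigned A = 0; A < 2; ++A)
    for (unsigned B = 0; B < 2; ++B)
      for (unsigned C = 0; C < 2; ++C)
        assert(ternlogBit(A, B, C, 0xff) == 1); // all ones for any inputs
  return 0;
}
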
case X86::TEST8ri_NOREX:
- MI->setDesc(get(X86::TEST8ri));
+ MI.setDesc(get(X86::TEST8ri));
return true;
case X86::MOV32ri64:
- MI->setDesc(get(X86::MOV32ri));
+ MI.setDesc(get(X86::MOV32ri));
return true;
// KNL does not recognize dependency-breaking idioms for mask registers,
@@ -5422,23 +5727,23 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- MachineInstr *MI,
+ MachineInstr &MI,
const TargetInstrInfo &TII) {
// Create the base instruction with the memory operand as the first part.
// Omit the implicit operands, something BuildMI can't do.
- MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
- MI->getDebugLoc(), true);
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
addOperands(MIB, MOs);
// Loop over the rest of the ri operands, converting them over.
- unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ unsigned NumOps = MI.getDesc().getNumOperands() - 2;
for (unsigned i = 0; i != NumOps; ++i) {
- MachineOperand &MO = MI->getOperand(i+2);
+ MachineOperand &MO = MI.getOperand(i + 2);
MIB.addOperand(MO);
}
- for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
MIB.addOperand(MO);
}
@@ -5451,15 +5756,15 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
unsigned OpNo, ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- MachineInstr *MI, const TargetInstrInfo &TII,
+ MachineInstr &MI, const TargetInstrInfo &TII,
int PtrOffset = 0) {
// Omit the implicit operands, something BuildMI can't do.
- MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
- MI->getDebugLoc(), true);
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (i == OpNo) {
assert(MO.isReg() && "Expected to fold into reg operand!");
addOperands(MIB, MOs, PtrOffset);
@@ -5477,35 +5782,35 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- MachineInstr *MI) {
+ MachineInstr &MI) {
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
- MI->getDebugLoc(), TII.get(Opcode));
+ MI.getDebugLoc(), TII.get(Opcode));
addOperands(MIB, MOs);
return MIB.addImm(0);
}
MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
- MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
// Attempt to convert the load of inserted vector into a fold load
// of a single float.
if (OpNum == 2) {
- unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
unsigned ZMask = Imm & 15;
unsigned DstIdx = (Imm >> 4) & 3;
unsigned SrcIdx = (Imm >> 6) & 3;
- unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size <= RCSize && 4 <= Align) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
- (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
- : X86::INSERTPSrm);
+ (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+ : X86::INSERTPSrm);
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
@@ -5513,17 +5818,34 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
}
}
break;
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ // Move the upper 64-bits of the second operand to the lower 64-bits.
+ // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
+ // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
+ if (OpNum == 2) {
+ unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size <= RCSize && 8 <= Align) {
+ unsigned NewOpCode =
+ (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
+ : X86::MOVLPSrm);
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
+ return NewMI;
+ }
+ }
+ break;
};
return nullptr;
}
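
A standalone sketch (plain C++, not LLVM code) of why the MOVHLPS fold above is sound: taking the upper 64 bits of a 128-bit load is equivalent to doing a 64-bit load at offset 8 from the same address, which is exactly what (V)MOVLPSrm with PtrOffset == 8 performs.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char Mem[16];
  for (int I = 0; I < 16; ++I)
    Mem[I] = static_cast<unsigned char>(I);

  // The full 16-byte load, as (V)MOVHLPSrr's second source would see it.
  uint64_t Halves[2];
  std::memcpy(Halves, Mem, 16);

  // The folded form: a 64-bit load at offset 8, i.e. (V)MOVLPSrm, PtrOffset 8.
  uint64_t Folded;
  std::memcpy(&Folded, Mem + 8, 8);

  assert(Halves[1] == Folded); // upper half of the wide load == narrow load
  return 0;
}
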
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align, bool AllowCommute) const {
const DenseMap<unsigned,
- std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
+ std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
@@ -5531,19 +5853,19 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
- (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
- MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
- MI->getOpcode() == X86::PUSH64r))
+ (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
+ MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
+ MI.getOpcode() == X86::PUSH64r))
return nullptr;
- unsigned NumOps = MI->getDesc().getNumOperands();
- bool isTwoAddr = NumOps > 1 &&
- MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+ unsigned NumOps = MI.getDesc().getNumOperands();
+ bool isTwoAddr =
+ NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
// FIXME: AsmPrinter doesn't know how to handle
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
- if (MI->getOpcode() == X86::ADD32ri &&
- MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ if (MI.getOpcode() == X86::ADD32ri &&
+ MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
return nullptr;
MachineInstr *NewMI = nullptr;
@@ -5556,14 +5878,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
- if (isTwoAddr && NumOps >= 2 && OpNum < 2 &&
- MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg() &&
- MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (OpNum == 0) {
- if (MI->getOpcode() == X86::MOV32r0) {
+ if (MI.getOpcode() == X86::MOV32r0) {
NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
if (NewMI)
return NewMI;
@@ -5583,8 +5904,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// If table selected...
if (OpcodeTablePtr) {
// Find the Opcode to fuse
- DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
- OpcodeTablePtr->find(MI->getOpcode());
+ auto I = OpcodeTablePtr->find(MI.getOpcode());
if (I != OpcodeTablePtr->end()) {
unsigned Opcode = I->second.first;
unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
@@ -5592,7 +5912,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
- unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size < RCSize) {
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
@@ -5601,7 +5921,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
- if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
NarrowToMOV32rm = true;
@@ -5632,14 +5952,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (AllowCommute) {
unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
- bool HasDef = MI->getDesc().getNumDefs();
- unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
- unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
- unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+ bool HasDef = MI.getDesc().getNumDefs();
+ unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
bool Tied1 =
- 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied2 =
- 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
@@ -5653,7 +5973,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Unable to commute.
return nullptr;
}
- if (CommutedMI != MI) {
+ if (CommutedMI != &MI) {
// New instruction. We can't fold from this.
CommutedMI->eraseFromParent();
return nullptr;
@@ -5672,7 +5992,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Unable to commute.
return nullptr;
}
- if (UncommutedMI != MI) {
+ if (UncommutedMI != &MI) {
// New instruction. It doesn't need to be kept.
UncommutedMI->eraseFromParent();
return nullptr;
@@ -5684,8 +6004,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
}
// No fusion
- if (PrintFailedFusing && !MI->isCopy())
- dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI;
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
return nullptr;
}
@@ -5723,6 +6043,10 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::CVTSS2SDrm:
case X86::Int_CVTSS2SDrr:
case X86::Int_CVTSS2SDrm:
+ case X86::MOVHPDrm:
+ case X86::MOVHPSrm:
+ case X86::MOVLPDrm:
+ case X86::MOVLPSrm:
case X86::RCPSSr:
case X86::RCPSSm:
case X86::RCPSSr_Int:
@@ -5753,27 +6077,27 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
/// Inform the ExeDepsFix pass how many idle
/// instructions we would like before a partial register update.
-unsigned X86InstrInfo::
-getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
return 0;
// If MI is marked as reading Reg, the partial register update is wanted.
- const MachineOperand &MO = MI->getOperand(0);
+ const MachineOperand &MO = MI.getOperand(0);
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+ if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
- if (MI->readsRegister(Reg, TRI))
+ if (MI.readsRegister(Reg, TRI))
return 0;
}
- // If any of the preceding 16 instructions are reading Reg, insert a
- // dependency breaking instruction. The magic number is based on a few
- // Nehalem experiments.
- return 16;
+ // If any instructions in the clearance range are reading Reg, insert a
+ // dependency breaking instruction, which is inexpensive and is likely to
+ // be hidden in other instructions' cycles.
+ return PartialRegUpdateClearance;
}
// Return true for any instruction that copies the high bits of the first source
@@ -5847,59 +6171,61 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
///
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
/// high bits that are passed-through are not live.
-unsigned X86InstrInfo::
-getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
- const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI->getOpcode()))
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI.getOpcode()))
return 0;
// Set the OpNum parameter to the first source operand.
OpNum = 1;
- const MachineOperand &MO = MI->getOperand(OpNum);
+ const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
- // Use the same magic number as getPartialRegUpdateClearance.
- return 16;
+ return UndefRegClearance;
}
return 0;
}
-void X86InstrInfo::
-breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- unsigned Reg = MI->getOperand(OpNum).getReg();
+void X86InstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI.getOperand(OpNum).getReg();
// If MI kills this register, the false dependence is already broken.
- if (MI->killsRegister(Reg, TRI))
+ if (MI.killsRegister(Reg, TRI))
return;
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
- .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
- MI->addRegisterKilled(Reg, TRI, true);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
} else if (X86::VR256RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.
unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
- .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
- .addReg(Reg, RegState::ImplicitDefine);
- MI->addRegisterKilled(Reg, TRI, true);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
}
}
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
+MachineInstr *
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex, LiveIntervals *LIS) const {
// Check switch flag
if (NoFusing)
return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -5913,7 +6239,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
@@ -5925,8 +6251,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (Size < RCSize)
return nullptr;
// Change to CMPXXri r, 0 first.
- MI->setDesc(get(NewOpc));
- MI->getOperand(1).ChangeToImmediate(0);
+ MI.setDesc(get(NewOpc));
+ MI.getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
return nullptr;
@@ -5957,15 +6283,16 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
+ RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
// destination register is wider than 32 bits (4 bytes), and its user
// instruction isn't scalar (SS).
switch (UserOpc) {
- case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
- case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
- case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
- case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
@@ -5978,15 +6305,16 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
}
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
+ RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
// destination register is wider than 64 bits (8 bytes), and its user
// instruction isn't scalar (SD).
switch (UserOpc) {
- case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
- case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
- case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
- case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
@@ -6003,36 +6331,43 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
// If loading from a FrameIndex, fold directly from the FrameIndex.
- unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
- if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
- return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
+ return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
}
// Check switch flag
if (NoFusing) return nullptr;
// Avoid partial register update stalls unless optimizing for size.
- if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
// Determine the alignment of the load.
unsigned Alignment = 0;
- if (LoadMI->hasOneMemOperand())
- Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+ if (LoadMI.hasOneMemOperand())
+ Alignment = (*LoadMI.memoperands_begin())->getAlignment();
else
- switch (LoadMI->getOpcode()) {
+ switch (LoadMI.getOpcode()) {
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ Alignment = 64;
+ break;
case X86::AVX2_SETALLONES:
case X86::AVX_SET0:
+ case X86::AVX512_256_SET0:
Alignment = 32;
break;
case X86::V_SET0:
case X86::V_SETALLONES:
+ case X86::AVX512_128_SET0:
Alignment = 16;
break;
case X86::FsFLD0SD:
@@ -6046,7 +6381,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
}
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
@@ -6054,22 +6389,26 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
}
// Change to CMPXXri r, 0 first.
- MI->setDesc(get(NewOpc));
- MI->getOperand(1).ChangeToImmediate(0);
+ MI.setDesc(get(NewOpc));
+ MI.getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
return nullptr;
// Make sure the subregisters match.
// Otherwise we risk changing the size of the load.
- if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
+ if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
return nullptr;
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
- switch (LoadMI->getOpcode()) {
+ switch (LoadMI.getOpcode()) {
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
case X86::AVX_SET0:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SD:
case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -6082,7 +6421,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
+ if (MF.getTarget().isPositionIndependent()) {
if (Subtarget.is64Bit())
PICBase = X86::RIP;
else
@@ -6096,17 +6435,21 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Create a constant-pool entry.
MachineConstantPool &MCP = *MF.getConstantPool();
Type *Ty;
- unsigned Opc = LoadMI->getOpcode();
+ unsigned Opc = LoadMI.getOpcode();
if (Opc == X86::FsFLD0SS)
Ty = Type::getFloatTy(MF.getFunction()->getContext());
else if (Opc == X86::FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
- else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
+ else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
+ Opc == X86::AVX512_256_SET0)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
- bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
+ Opc == X86::AVX512_512_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -6120,12 +6463,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
break;
}
default: {
- if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
+ if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.
- MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
- LoadMI->operands_begin() + NumOps);
+ MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
+ LoadMI.operands_begin() + NumOps);
break;
}
}
@@ -6133,11 +6476,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
-bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
- MemOp2RegOpTable.find(MI->getOpcode());
+bool X86InstrInfo::unfoldMemoryOperand(
+ MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
+ bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
+ auto I = MemOp2RegOpTable.find(MI.getOpcode());
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
@@ -6154,8 +6496,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const MCInstrDesc &MCID = get(Opc);
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
// TODO: Check if 32-byte or greater accesses are slow too?
- if (!MI->hasOneMemOperand() &&
- RC == &X86::VR128RegClass &&
+ if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
@@ -6165,8 +6506,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
SmallVector<MachineOperand,2> BeforeOps;
SmallVector<MachineOperand,2> AfterOps;
SmallVector<MachineOperand,4> ImpOps;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
if (i >= Index && i < Index + X86::AddrNumOperands)
AddrOps.push_back(Op);
else if (Op.isReg() && Op.isImplicit())
@@ -6179,10 +6520,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the load instruction.
if (UnfoldLoad) {
- std::pair<MachineInstr::mmo_iterator,
- MachineInstr::mmo_iterator> MMOs =
- MF.extractLoadMemRefs(MI->memoperands_begin(),
- MI->memoperands_end());
+ std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end());
loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
if (UnfoldStore) {
// Address operands cannot be marked isKill.
@@ -6195,7 +6534,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
}
// Emit the data processing instruction.
- MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, DataMI);
if (FoldedStore)
@@ -6248,10 +6587,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the store instruction.
if (UnfoldStore) {
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
- std::pair<MachineInstr::mmo_iterator,
- MachineInstr::mmo_iterator> MMOs =
- MF.extractStoreMemRefs(MI->memoperands_begin(),
- MI->memoperands_end());
+ std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end());
storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
}
@@ -6264,8 +6601,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (!N->isMachineOpcode())
return false;
- DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
- MemOp2RegOpTable.find(N->getMachineOpcode());
+ auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
@@ -6371,8 +6707,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex) const {
- DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
- MemOp2RegOpTable.find(Opc);
+ auto I = MemOp2RegOpTable.find(Opc);
if (I == MemOp2RegOpTable.end())
return 0;
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
@@ -6411,6 +6746,7 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
// AVX load instructions
@@ -6421,13 +6757,52 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
+ // AVX512 load instructions
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
break;
}
switch (Opc2) {
@@ -6448,6 +6823,7 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
// AVX load instructions
@@ -6458,13 +6834,52 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
+ // AVX512 load instructions
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU64Zrm:
+ case X86::KMOVBkm:
+ case X86::KMOVWkm:
+ case X86::KMOVDkm:
+ case X86::KMOVQkm:
break;
}
@@ -6540,8 +6955,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
- MachineInstr *Second) const {
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const {
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
@@ -6554,7 +6969,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
FuseInc
} FuseKind;
- switch(Second->getOpcode()) {
+ switch (Second.getOpcode()) {
default:
return false;
case X86::JE_1:
@@ -6580,7 +6995,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
FuseKind = FuseTest;
break;
}
- switch (First->getOpcode()) {
+ switch (First.getOpcode()) {
default:
return false;
case X86::TEST8rr:
@@ -6703,8 +7118,6 @@ bool X86InstrInfo::
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
- if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
- return true;
Cond[0].setImm(GetOppositeBranchCondition(CC));
return false;
}
@@ -6827,29 +7240,29 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
}
std::pair<uint16_t, uint16_t>
-X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
- uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+ uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
bool hasAVX2 = Subtarget.hasAVX2();
uint16_t validDomains = 0;
- if (domain && lookup(MI->getOpcode(), domain))
+ if (domain && lookup(MI.getOpcode(), domain))
validDomains = 0xe;
- else if (domain && lookupAVX2(MI->getOpcode(), domain))
+ else if (domain && lookupAVX2(MI.getOpcode(), domain))
validDomains = hasAVX2 ? 0xe : 0x6;
return std::make_pair(domain, validDomains);
}
-void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
- uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
- const uint16_t *table = lookup(MI->getOpcode(), dom);
+ const uint16_t *table = lookup(MI.getOpcode(), dom);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
- table = lookupAVX2(MI->getOpcode(), dom);
+ table = lookupAVX2(MI.getOpcode(), dom);
}
assert(table && "Cannot change domain");
- MI->setDesc(get(table[Domain-1]));
+ MI.setDesc(get(table[Domain - 1]));
}
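A minimal sketch, not part of the patch, of how a consumer such as the execution-dependency-fix pass might interpret the pair returned by getExecutionDomain(). It assumes the bitmask convention implied by the 0xe/0x6 constants above (bit i set means domain i is selectable); the helper name pickDomain is illustrative only.

// Sketch only: decode the (domain, validDomains) pair produced above.
#include <cstdint>
#include <utility>

unsigned pickDomain(std::pair<uint16_t, uint16_t> DomInfo, unsigned Preferred) {
  uint16_t Current = DomInfo.first;  // domain encoded in TSFlags, 0 if none
  uint16_t Valid = DomInfo.second;   // bit i set => domain i may be selected
  if (!Current || !Valid)
    return Current;                  // instruction is not domain-flexible
  if (Valid & (1u << Preferred))
    return Preferred;                // preferred domain is legal, switch to it
  return Current;                    // otherwise leave the instruction alone
}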
/// Return the noop instruction to use for a noop.
@@ -6886,6 +7299,10 @@ unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
+ case X86::DIVPDrm:
+ case X86::DIVPDrr:
+ case X86::DIVPSrm:
+ case X86::DIVPSrr:
case X86::DIVSDrm:
case X86::DIVSDrm_Int:
case X86::DIVSDrr:
@@ -6907,6 +7324,14 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::SQRTSSr:
case X86::SQRTSSr_Int:
// AVX instructions with high latency
+ case X86::VDIVPDrm:
+ case X86::VDIVPDrr:
+ case X86::VDIVPDYrm:
+ case X86::VDIVPDYrr:
+ case X86::VDIVPSrm:
+ case X86::VDIVPSrr:
+ case X86::VDIVPSYrm:
+ case X86::VDIVPSYrr:
case X86::VDIVSDrm:
case X86::VDIVSDrm_Int:
case X86::VDIVSDrr:
@@ -6917,55 +7342,277 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVSSrr_Int:
case X86::VSQRTPDm:
case X86::VSQRTPDr:
+ case X86::VSQRTPDYm:
+ case X86::VSQRTPDYr:
case X86::VSQRTPSm:
case X86::VSQRTPSr:
+ case X86::VSQRTPSYm:
+ case X86::VSQRTPSYr:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ // AVX512 instructions with high latency
+ case X86::VDIVPDZ128rm:
+ case X86::VDIVPDZ128rmb:
+ case X86::VDIVPDZ128rmbk:
+ case X86::VDIVPDZ128rmbkz:
+ case X86::VDIVPDZ128rmk:
+ case X86::VDIVPDZ128rmkz:
+ case X86::VDIVPDZ128rr:
+ case X86::VDIVPDZ128rrk:
+ case X86::VDIVPDZ128rrkz:
+ case X86::VDIVPDZ256rm:
+ case X86::VDIVPDZ256rmb:
+ case X86::VDIVPDZ256rmbk:
+ case X86::VDIVPDZ256rmbkz:
+ case X86::VDIVPDZ256rmk:
+ case X86::VDIVPDZ256rmkz:
+ case X86::VDIVPDZ256rr:
+ case X86::VDIVPDZ256rrk:
+ case X86::VDIVPDZ256rrkz:
+ case X86::VDIVPDZrb:
+ case X86::VDIVPDZrbk:
+ case X86::VDIVPDZrbkz:
+ case X86::VDIVPDZrm:
+ case X86::VDIVPDZrmb:
+ case X86::VDIVPDZrmbk:
+ case X86::VDIVPDZrmbkz:
+ case X86::VDIVPDZrmk:
+ case X86::VDIVPDZrmkz:
+ case X86::VDIVPDZrr:
+ case X86::VDIVPDZrrk:
+ case X86::VDIVPDZrrkz:
+ case X86::VDIVPSZ128rm:
+ case X86::VDIVPSZ128rmb:
+ case X86::VDIVPSZ128rmbk:
+ case X86::VDIVPSZ128rmbkz:
+ case X86::VDIVPSZ128rmk:
+ case X86::VDIVPSZ128rmkz:
+ case X86::VDIVPSZ128rr:
+ case X86::VDIVPSZ128rrk:
+ case X86::VDIVPSZ128rrkz:
+ case X86::VDIVPSZ256rm:
+ case X86::VDIVPSZ256rmb:
+ case X86::VDIVPSZ256rmbk:
+ case X86::VDIVPSZ256rmbkz:
+ case X86::VDIVPSZ256rmk:
+ case X86::VDIVPSZ256rmkz:
+ case X86::VDIVPSZ256rr:
+ case X86::VDIVPSZ256rrk:
+ case X86::VDIVPSZ256rrkz:
+ case X86::VDIVPSZrb:
+ case X86::VDIVPSZrbk:
+ case X86::VDIVPSZrbkz:
+ case X86::VDIVPSZrm:
+ case X86::VDIVPSZrmb:
+ case X86::VDIVPSZrmbk:
+ case X86::VDIVPSZrmbkz:
+ case X86::VDIVPSZrmk:
+ case X86::VDIVPSZrmkz:
+ case X86::VDIVPSZrr:
+ case X86::VDIVPSZrrk:
+ case X86::VDIVPSZrrkz:
+ case X86::VDIVSDZrm:
+ case X86::VDIVSDZrr:
+ case X86::VDIVSDZrm_Int:
+ case X86::VDIVSDZrm_Intk:
+ case X86::VDIVSDZrm_Intkz:
+ case X86::VDIVSDZrr_Int:
+ case X86::VDIVSDZrr_Intk:
+ case X86::VDIVSDZrr_Intkz:
+ case X86::VDIVSDZrrb:
+ case X86::VDIVSDZrrbk:
+ case X86::VDIVSDZrrbkz:
+ case X86::VDIVSSZrm:
+ case X86::VDIVSSZrr:
+ case X86::VDIVSSZrm_Int:
+ case X86::VDIVSSZrm_Intk:
+ case X86::VDIVSSZrm_Intkz:
+ case X86::VDIVSSZrr_Int:
+ case X86::VDIVSSZrr_Intk:
+ case X86::VDIVSSZrr_Intkz:
+ case X86::VDIVSSZrrb:
+ case X86::VDIVSSZrrbk:
+ case X86::VDIVSSZrrbkz:
+ case X86::VSQRTPDZ128m:
+ case X86::VSQRTPDZ128mb:
+ case X86::VSQRTPDZ128mbk:
+ case X86::VSQRTPDZ128mbkz:
+ case X86::VSQRTPDZ128mk:
+ case X86::VSQRTPDZ128mkz:
+ case X86::VSQRTPDZ128r:
+ case X86::VSQRTPDZ128rk:
+ case X86::VSQRTPDZ128rkz:
+ case X86::VSQRTPDZ256m:
+ case X86::VSQRTPDZ256mb:
+ case X86::VSQRTPDZ256mbk:
+ case X86::VSQRTPDZ256mbkz:
+ case X86::VSQRTPDZ256mk:
+ case X86::VSQRTPDZ256mkz:
+ case X86::VSQRTPDZ256r:
+ case X86::VSQRTPDZ256rk:
+ case X86::VSQRTPDZ256rkz:
case X86::VSQRTPDZm:
+ case X86::VSQRTPDZmb:
+ case X86::VSQRTPDZmbk:
+ case X86::VSQRTPDZmbkz:
+ case X86::VSQRTPDZmk:
+ case X86::VSQRTPDZmkz:
case X86::VSQRTPDZr:
+ case X86::VSQRTPDZrb:
+ case X86::VSQRTPDZrbk:
+ case X86::VSQRTPDZrbkz:
+ case X86::VSQRTPDZrk:
+ case X86::VSQRTPDZrkz:
+ case X86::VSQRTPSZ128m:
+ case X86::VSQRTPSZ128mb:
+ case X86::VSQRTPSZ128mbk:
+ case X86::VSQRTPSZ128mbkz:
+ case X86::VSQRTPSZ128mk:
+ case X86::VSQRTPSZ128mkz:
+ case X86::VSQRTPSZ128r:
+ case X86::VSQRTPSZ128rk:
+ case X86::VSQRTPSZ128rkz:
+ case X86::VSQRTPSZ256m:
+ case X86::VSQRTPSZ256mb:
+ case X86::VSQRTPSZ256mbk:
+ case X86::VSQRTPSZ256mbkz:
+ case X86::VSQRTPSZ256mk:
+ case X86::VSQRTPSZ256mkz:
+ case X86::VSQRTPSZ256r:
+ case X86::VSQRTPSZ256rk:
+ case X86::VSQRTPSZ256rkz:
case X86::VSQRTPSZm:
+ case X86::VSQRTPSZmb:
+ case X86::VSQRTPSZmbk:
+ case X86::VSQRTPSZmbkz:
+ case X86::VSQRTPSZmk:
+ case X86::VSQRTPSZmkz:
case X86::VSQRTPSZr:
+ case X86::VSQRTPSZrb:
+ case X86::VSQRTPSZrbk:
+ case X86::VSQRTPSZrbkz:
+ case X86::VSQRTPSZrk:
+ case X86::VSQRTPSZrkz:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
+ case X86::VSQRTSDZm_Intk:
+ case X86::VSQRTSDZm_Intkz:
case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZr_Intk:
+ case X86::VSQRTSDZr_Intkz:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZrb_Intk:
+ case X86::VSQRTSDZrb_Intkz:
+ case X86::VSQRTSSZm:
case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSSZm_Intk:
+ case X86::VSQRTSSZm_Intkz:
case X86::VSQRTSSZr:
- case X86::VSQRTSSZm:
- case X86::VDIVSDZrm:
- case X86::VDIVSDZrr:
- case X86::VDIVSSZrm:
- case X86::VDIVSSZrr:
-
- case X86::VGATHERQPSZrm:
- case X86::VGATHERQPDZrm:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZr_Intk:
+ case X86::VSQRTSSZr_Intkz:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZrb_Intk:
+ case X86::VSQRTSSZrb_Intkz:
+
+ case X86::VGATHERDPDYrm:
+ case X86::VGATHERDPDZ128rm:
+ case X86::VGATHERDPDZ256rm:
case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPDrm:
+ case X86::VGATHERDPSYrm:
+ case X86::VGATHERDPSZ128rm:
+ case X86::VGATHERDPSZ256rm:
case X86::VGATHERDPSZrm:
- case X86::VPGATHERQDZrm:
- case X86::VPGATHERQQZrm:
+ case X86::VGATHERDPSrm:
+ case X86::VGATHERPF0DPDm:
+ case X86::VGATHERPF0DPSm:
+ case X86::VGATHERPF0QPDm:
+ case X86::VGATHERPF0QPSm:
+ case X86::VGATHERPF1DPDm:
+ case X86::VGATHERPF1DPSm:
+ case X86::VGATHERPF1QPDm:
+ case X86::VGATHERPF1QPSm:
+ case X86::VGATHERQPDYrm:
+ case X86::VGATHERQPDZ128rm:
+ case X86::VGATHERQPDZ256rm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERQPDrm:
+ case X86::VGATHERQPSYrm:
+ case X86::VGATHERQPSZ128rm:
+ case X86::VGATHERQPSZ256rm:
+ case X86::VGATHERQPSZrm:
+ case X86::VGATHERQPSrm:
+ case X86::VPGATHERDDYrm:
+ case X86::VPGATHERDDZ128rm:
+ case X86::VPGATHERDDZ256rm:
case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDDrm:
+ case X86::VPGATHERDQYrm:
+ case X86::VPGATHERDQZ128rm:
+ case X86::VPGATHERDQZ256rm:
case X86::VPGATHERDQZrm:
- case X86::VSCATTERQPDZmr:
- case X86::VSCATTERQPSZmr:
+ case X86::VPGATHERDQrm:
+ case X86::VPGATHERQDYrm:
+ case X86::VPGATHERQDZ128rm:
+ case X86::VPGATHERQDZ256rm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQDrm:
+ case X86::VPGATHERQQYrm:
+ case X86::VPGATHERQQZ128rm:
+ case X86::VPGATHERQQZ256rm:
+ case X86::VPGATHERQQZrm:
+ case X86::VPGATHERQQrm:
+ case X86::VSCATTERDPDZ128mr:
+ case X86::VSCATTERDPDZ256mr:
case X86::VSCATTERDPDZmr:
+ case X86::VSCATTERDPSZ128mr:
+ case X86::VSCATTERDPSZ256mr:
case X86::VSCATTERDPSZmr:
- case X86::VPSCATTERQDZmr:
- case X86::VPSCATTERQQZmr:
+ case X86::VSCATTERPF0DPDm:
+ case X86::VSCATTERPF0DPSm:
+ case X86::VSCATTERPF0QPDm:
+ case X86::VSCATTERPF0QPSm:
+ case X86::VSCATTERPF1DPDm:
+ case X86::VSCATTERPF1DPSm:
+ case X86::VSCATTERPF1QPDm:
+ case X86::VSCATTERPF1QPSm:
+ case X86::VSCATTERQPDZ128mr:
+ case X86::VSCATTERQPDZ256mr:
+ case X86::VSCATTERQPDZmr:
+ case X86::VSCATTERQPSZ128mr:
+ case X86::VSCATTERQPSZ256mr:
+ case X86::VSCATTERQPSZmr:
+ case X86::VPSCATTERDDZ128mr:
+ case X86::VPSCATTERDDZ256mr:
case X86::VPSCATTERDDZmr:
+ case X86::VPSCATTERDQZ128mr:
+ case X86::VPSCATTERDQZ256mr:
case X86::VPSCATTERDQZmr:
+ case X86::VPSCATTERQDZ128mr:
+ case X86::VPSCATTERQDZ256mr:
+ case X86::VPSCATTERQDZmr:
+ case X86::VPSCATTERQQZ128mr:
+ case X86::VPSCATTERQQZ256mr:
+ case X86::VPSCATTERQQZmr:
return true;
}
}
-bool X86InstrInfo::
-hasHighOperandLatency(const TargetSchedModel &SchedModel,
- const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const {
- return isHighLatencyDef(DefMI->getOpcode());
+bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+ const MachineRegisterInfo *MRI,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ return isHighLatencyDef(DefMI.getOpcode());
}
bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
@@ -7014,12 +7661,119 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::PANDrr:
case X86::PORrr:
case X86::PXORrr:
+ case X86::ANDPDrr:
+ case X86::ANDPSrr:
+ case X86::ORPDrr:
+ case X86::ORPSrr:
+ case X86::XORPDrr:
+ case X86::XORPSrr:
+ case X86::PADDBrr:
+ case X86::PADDWrr:
+ case X86::PADDDrr:
+ case X86::PADDQrr:
case X86::VPANDrr:
case X86::VPANDYrr:
+ case X86::VPANDDZ128rr:
+ case X86::VPANDDZ256rr:
+ case X86::VPANDDZrr:
+ case X86::VPANDQZ128rr:
+ case X86::VPANDQZ256rr:
+ case X86::VPANDQZrr:
case X86::VPORrr:
case X86::VPORYrr:
+ case X86::VPORDZ128rr:
+ case X86::VPORDZ256rr:
+ case X86::VPORDZrr:
+ case X86::VPORQZ128rr:
+ case X86::VPORQZ256rr:
+ case X86::VPORQZrr:
case X86::VPXORrr:
case X86::VPXORYrr:
+ case X86::VPXORDZ128rr:
+ case X86::VPXORDZ256rr:
+ case X86::VPXORDZrr:
+ case X86::VPXORQZ128rr:
+ case X86::VPXORQZ256rr:
+ case X86::VPXORQZrr:
+ case X86::VANDPDrr:
+ case X86::VANDPSrr:
+ case X86::VANDPDYrr:
+ case X86::VANDPSYrr:
+ case X86::VANDPDZ128rr:
+ case X86::VANDPSZ128rr:
+ case X86::VANDPDZ256rr:
+ case X86::VANDPSZ256rr:
+ case X86::VANDPDZrr:
+ case X86::VANDPSZrr:
+ case X86::VORPDrr:
+ case X86::VORPSrr:
+ case X86::VORPDYrr:
+ case X86::VORPSYrr:
+ case X86::VORPDZ128rr:
+ case X86::VORPSZ128rr:
+ case X86::VORPDZ256rr:
+ case X86::VORPSZ256rr:
+ case X86::VORPDZrr:
+ case X86::VORPSZrr:
+ case X86::VXORPDrr:
+ case X86::VXORPSrr:
+ case X86::VXORPDYrr:
+ case X86::VXORPSYrr:
+ case X86::VXORPDZ128rr:
+ case X86::VXORPSZ128rr:
+ case X86::VXORPDZ256rr:
+ case X86::VXORPSZ256rr:
+ case X86::VXORPDZrr:
+ case X86::VXORPSZrr:
+ case X86::KADDBrr:
+ case X86::KADDWrr:
+ case X86::KADDDrr:
+ case X86::KADDQrr:
+ case X86::KANDBrr:
+ case X86::KANDWrr:
+ case X86::KANDDrr:
+ case X86::KANDQrr:
+ case X86::KORBrr:
+ case X86::KORWrr:
+ case X86::KORDrr:
+ case X86::KORQrr:
+ case X86::KXORBrr:
+ case X86::KXORWrr:
+ case X86::KXORDrr:
+ case X86::KXORQrr:
+ case X86::VPADDBrr:
+ case X86::VPADDWrr:
+ case X86::VPADDDrr:
+ case X86::VPADDQrr:
+ case X86::VPADDBYrr:
+ case X86::VPADDWYrr:
+ case X86::VPADDDYrr:
+ case X86::VPADDQYrr:
+ case X86::VPADDBZ128rr:
+ case X86::VPADDWZ128rr:
+ case X86::VPADDDZ128rr:
+ case X86::VPADDQZ128rr:
+ case X86::VPADDBZ256rr:
+ case X86::VPADDWZ256rr:
+ case X86::VPADDDZ256rr:
+ case X86::VPADDQZ256rr:
+ case X86::VPADDBZrr:
+ case X86::VPADDWZrr:
+ case X86::VPADDDZrr:
+ case X86::VPADDQZrr:
+ case X86::VPMULLWrr:
+ case X86::VPMULLWYrr:
+ case X86::VPMULLWZ128rr:
+ case X86::VPMULLWZ256rr:
+ case X86::VPMULLWZrr:
+ case X86::VPMULLDrr:
+ case X86::VPMULLDYrr:
+ case X86::VPMULLDZ128rr:
+ case X86::VPMULLDZ256rr:
+ case X86::VPMULLDZrr:
+ case X86::VPMULLQZ128rr:
+ case X86::VPMULLQZ256rr:
+ case X86::VPMULLQZrr:
// Normal min/max instructions are not commutative because of NaN and signed
// zero semantics, but these are. Thus, there's no need to check for global
// relaxed math; the instructions themselves have the properties we need.
@@ -7035,14 +7789,30 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VMAXCPSrr:
case X86::VMAXCPDYrr:
case X86::VMAXCPSYrr:
+ case X86::VMAXCPDZ128rr:
+ case X86::VMAXCPSZ128rr:
+ case X86::VMAXCPDZ256rr:
+ case X86::VMAXCPSZ256rr:
+ case X86::VMAXCPDZrr:
+ case X86::VMAXCPSZrr:
case X86::VMAXCSDrr:
case X86::VMAXCSSrr:
+ case X86::VMAXCSDZrr:
+ case X86::VMAXCSSZrr:
case X86::VMINCPDrr:
case X86::VMINCPSrr:
case X86::VMINCPDYrr:
case X86::VMINCPSYrr:
+ case X86::VMINCPDZ128rr:
+ case X86::VMINCPSZ128rr:
+ case X86::VMINCPDZ256rr:
+ case X86::VMINCPSZ256rr:
+ case X86::VMINCPDZrr:
+ case X86::VMINCPSZrr:
case X86::VMINCSDrr:
case X86::VMINCSSrr:
+ case X86::VMINCSDZrr:
+ case X86::VMINCSSZrr:
return true;
case X86::ADDPDrr:
case X86::ADDPSrr:
@@ -7056,14 +7826,30 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VADDPSrr:
case X86::VADDPDYrr:
case X86::VADDPSYrr:
+ case X86::VADDPDZ128rr:
+ case X86::VADDPSZ128rr:
+ case X86::VADDPDZ256rr:
+ case X86::VADDPSZ256rr:
+ case X86::VADDPDZrr:
+ case X86::VADDPSZrr:
case X86::VADDSDrr:
case X86::VADDSSrr:
+ case X86::VADDSDZrr:
+ case X86::VADDSSZrr:
case X86::VMULPDrr:
case X86::VMULPSrr:
case X86::VMULPDYrr:
case X86::VMULPSYrr:
+ case X86::VMULPDZ128rr:
+ case X86::VMULPSZ128rr:
+ case X86::VMULPDZ256rr:
+ case X86::VMULPSZ256rr:
+ case X86::VMULPDZrr:
+ case X86::VMULPSZrr:
case X86::VMULSDrr:
case X86::VMULSSrr:
+ case X86::VMULSDZrr:
+ case X86::VMULSSZrr:
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
default:
return false;
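A tiny standalone illustration, not taken from the patch, of why the FP add/mul cases above are gated on UnsafeFPMath: reassociating floating-point operations can change the rounded result, so they are only treated as associative when relaxed math is allowed.

#include <cstdio>

int main() {
  double a = 1e20, b = -1e20, c = 1.0;
  // (a + b) + c == 1, but a + (b + c) == 0 because b + c rounds back to b.
  std::printf("%g vs %g\n", (a + b) + c, a + (b + c));
  return 0;
}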
@@ -7135,10 +7921,8 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{MO_NTPOFF, "x86-ntpoff"},
{MO_GOTNTPOFF, "x86-gotntpoff"},
{MO_DLLIMPORT, "x86-dllimport"},
- {MO_DARWIN_STUB, "x86-darwin-stub"},
{MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
{MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
- {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
{MO_TLVP, "x86-tlvp"},
{MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
{MO_SECREL, "x86-secrel"}};
@@ -7163,7 +7947,7 @@ namespace {
return false;
// Only emit a global base reg in PIC mode.
- if (TM->getRelocationModel() != Reloc::PIC_)
+ if (!TM->isPositionIndependent())
return false;
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -7223,7 +8007,10 @@ namespace {
LDTLSCleanup() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
// No point folding accesses if there isn't at least two.
return false;
@@ -7249,9 +8036,9 @@ namespace {
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
if (TLSBaseAddrReg)
- I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
else
- I = SetRegister(I, &TLSBaseAddrReg);
+ I = SetRegister(*I, &TLSBaseAddrReg);
Changed = true;
break;
default:
@@ -7270,29 +8057,29 @@ namespace {
// Replace the TLS_base_addr instruction I with a copy from
// TLSBaseAddrReg, returning the new instruction.
- MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
unsigned TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineFunction *MF = I.getParent()->getParent();
const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
const bool is64Bit = STI.is64Bit();
const X86InstrInfo *TII = STI.getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to RAX/EAX.
- MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- is64Bit ? X86::RAX : X86::EAX)
- .addReg(TLSBaseAddrReg);
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
// Erase the TLS_base_addr instruction.
- I->eraseFromParent();
+ I.eraseFromParent();
return Copy;
}
// Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
- MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
- MachineFunction *MF = I->getParent()->getParent();
+ MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I.getParent()->getParent();
const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
const bool is64Bit = STI.is64Bit();
const X86InstrInfo *TII = STI.getInstrInfo();
@@ -7304,11 +8091,11 @@ namespace {
: &X86::GR32RegClass);
// Insert a copy from RAX/EAX to TLSBaseAddrReg.
- MachineInstr *Next = I->getNextNode();
- MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- *TLSBaseAddrReg)
- .addReg(is64Bit ? X86::RAX : X86::EAX);
+ MachineInstr *Next = I.getNextNode();
+ MachineInstr *Copy =
+ BuildMI(*I.getParent(), Next, I.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
return Copy;
}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index edd09d617595..858f35d1cbf0 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -23,60 +23,61 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
+ class MachineInstrBuilder;
class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
- enum CondCode {
- COND_A = 0,
- COND_AE = 1,
- COND_B = 2,
- COND_BE = 3,
- COND_E = 4,
- COND_G = 5,
- COND_GE = 6,
- COND_L = 7,
- COND_LE = 8,
- COND_NE = 9,
- COND_NO = 10,
- COND_NP = 11,
- COND_NS = 12,
- COND_O = 13,
- COND_P = 14,
- COND_S = 15,
- LAST_VALID_COND = COND_S,
-
- // Artificial condition codes. These are used by AnalyzeBranch
- // to indicate a block terminated with two conditional branches to
- // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
- // which can't be represented on x86 with a single condition. These
- // are never used in MachineInstrs.
- COND_NE_OR_P,
- COND_NP_OR_E,
-
- COND_INVALID
- };
-
- // Turn condition code into conditional branch opcode.
- unsigned GetCondBranchFromCond(CondCode CC);
-
- /// \brief Return a set opcode for the given condition and whether it has
- /// a memory operand.
- unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
-
- /// \brief Return a cmov opcode for the given condition, register size in
- /// bytes, and operand type.
- unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand = false);
-
- // Turn CMov opcode into condition code.
- CondCode getCondFromCMovOpc(unsigned Opc);
-
- /// GetOppositeBranchCondition - Return the inverse of the specified cond,
- /// e.g. turning COND_E to COND_NE.
- CondCode GetOppositeBranchCondition(CondCode CC);
+enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+ LAST_VALID_COND = COND_S,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches that together
+ // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs and are inverses of one another.
+ COND_NE_OR_P,
+ COND_E_AND_NP,
+
+ COND_INVALID
+};
+
+// Turn condition code into conditional branch opcode.
+unsigned GetCondBranchFromCond(CondCode CC);
+
+/// \brief Return a set opcode for the given condition and whether it has
+/// a memory operand.
+unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+/// \brief Return a cmov opcode for the given condition, register size in
+/// bytes, and operand type.
+unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand = false);
+
+// Turn CMov opcode into condition code.
+CondCode getCondFromCMovOpc(unsigned Opc);
+
+/// GetOppositeBranchCondition - Return the inverse of the specified cond,
+/// e.g. turning COND_E to COND_NE.
+CondCode GetOppositeBranchCondition(CondCode CC);
} // end namespace X86;
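A hedged sketch of the inversion relationship documented in the comment above; the real mapping lives in GetOppositeBranchCondition in X86InstrInfo.cpp, only a few representative cases are shown, and the function name invertForIllustration is hypothetical.

// Sketch: assumes the X86::CondCode enumerators declared above are in scope.
X86::CondCode invertForIllustration(X86::CondCode CC) {
  switch (CC) {
  case X86::COND_E:        return X86::COND_NE;
  case X86::COND_NE:       return X86::COND_E;
  case X86::COND_NE_OR_P:  return X86::COND_E_AND_NP; // the two compound
  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;  // codes invert into
  default:                 return X86::COND_INVALID;  // each other
  }
}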
@@ -89,7 +90,6 @@ inline static bool isGlobalStubReference(unsigned char TargetFlag) {
case X86II::MO_GOT: // normal GOT reference.
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
return true;
default:
return false;
@@ -105,7 +105,6 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
case X86II::MO_GOT: // isPICStyleGOT: other global.
case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
case X86II::MO_TLVP: // ??? Pretty sure..
return true;
default:
@@ -119,23 +118,24 @@ inline static bool isScale(const MachineOperand &MO) {
MO.getImm() == 4 || MO.getImm() == 8);
}
-inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
- if (MI->getOperand(Op).isFI()) return true;
- return Op+X86::AddrSegmentReg <= MI->getNumOperands() &&
- MI->getOperand(Op+X86::AddrBaseReg).isReg() &&
- isScale(MI->getOperand(Op+X86::AddrScaleAmt)) &&
- MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
- (MI->getOperand(Op+X86::AddrDisp).isImm() ||
- MI->getOperand(Op+X86::AddrDisp).isGlobal() ||
- MI->getOperand(Op+X86::AddrDisp).isCPI() ||
- MI->getOperand(Op+X86::AddrDisp).isJTI());
+inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
+ if (MI.getOperand(Op).isFI())
+ return true;
+ return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
+ isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
+ MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+ (MI.getOperand(Op + X86::AddrDisp).isImm() ||
+ MI.getOperand(Op + X86::AddrDisp).isGlobal() ||
+ MI.getOperand(Op + X86::AddrDisp).isCPI() ||
+ MI.getOperand(Op + X86::AddrDisp).isJTI());
}
-inline static bool isMem(const MachineInstr *MI, unsigned Op) {
- if (MI->getOperand(Op).isFI()) return true;
- return Op+X86::AddrNumOperands <= MI->getNumOperands() &&
- MI->getOperand(Op+X86::AddrSegmentReg).isReg() &&
- isLeaMem(MI, Op);
+inline static bool isMem(const MachineInstr &MI, unsigned Op) {
+ if (MI.getOperand(Op).isFI())
+ return true;
+ return Op + X86::AddrNumOperands <= MI.getNumOperands() &&
+ MI.getOperand(Op + X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op);
}
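For reference, a standalone sketch of the five-operand layout that isLeaMem and isMem walk through via X86::AddrBaseReg .. X86::AddrSegmentReg; the values mirror the X86 address operand order, but the enum itself is illustrative and not part of this patch.

// Illustrative model of an X86 memory reference: five consecutive operands.
enum X86AddrOperandSketch {
  SketchAddrBaseReg = 0,     // base register or frame index
  SketchAddrScaleAmt = 1,    // scale immediate: 1, 2, 4 or 8 (see isScale)
  SketchAddrIndexReg = 2,    // index register
  SketchAddrDisp = 3,        // displacement: imm, global, CPI or JTI
  SketchAddrSegmentReg = 4,  // segment register (the SEGMENT_REG operand)
  SketchAddrNumOperands = 5  // isMem requires Op + this many operands to exist
};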
class X86InstrInfo final : public X86GenInstrInfo {
@@ -146,7 +146,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
///
typedef DenseMap<unsigned,
- std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+ std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
RegOp2MemOpTableType RegOp2MemOpTable2Addr;
RegOp2MemOpTableType RegOp2MemOpTable0;
RegOp2MemOpTableType RegOp2MemOpTable1;
@@ -157,12 +157,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
typedef DenseMap<unsigned,
- std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+ std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
MemOp2RegOpTableType MemOp2RegOpTable;
static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
- unsigned RegOp, unsigned MemOp, unsigned Flags);
+ uint16_t RegOp, uint16_t MemOp, uint16_t Flags);
virtual void anchor();
@@ -184,7 +184,7 @@ public:
/// getSPAdjust - This returns the stack pointer adjustment made by
/// this instruction. For x86, we need to handle more complex call
/// sequences involving PUSHes.
- int getSPAdjust(const MachineInstr *MI) const override;
+ int getSPAdjust(const MachineInstr &MI) const override;
/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
/// extension instruction. That is, it's like a copy where it's legal for the
@@ -196,27 +196,27 @@ public:
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
- unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
- unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
+ const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
/// Given an operand within a MachineInstr, insert preceding code to put it
@@ -227,10 +227,10 @@ public:
///
/// Reference parameters are set to indicate how caller should add this
/// operand to the LEA instruction.
- bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
- unsigned LEAOpcode, bool AllowSP,
- unsigned &NewSrc, bool &isKill,
- bool &isUndef, MachineOperand &ImplicitOp) const;
+ bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
+ bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
@@ -243,7 +243,7 @@ public:
/// performed, otherwise it returns the new instruction.
///
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
+ MachineInstr &MI,
LiveVariables *LV) const override;
/// Returns true iff the routine could find two commutable operands in the
@@ -261,7 +261,7 @@ public:
/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
- bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
/// Returns true if the routine could find two commutable operands
@@ -286,8 +286,7 @@ public:
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- bool findFMA3CommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
+ bool findFMA3CommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const;
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
@@ -300,37 +299,35 @@ public:
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
- unsigned SrcOpIdx1,
+ unsigned getFMA3OpcodeToCommuteOperands(MachineInstr &MI, unsigned SrcOpIdx1,
unsigned SrcOpIdx2) const;
// Branch analysis.
- bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
- unsigned &Offset,
+ bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+ int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
- bool AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+ bool analyzeBranchPredicate(MachineBasicBlock &MBB,
TargetInstrInfo::MachineBranchPredicate &MBP,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int&, int&, int&) const override;
- void insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+ unsigned FalseReg) const override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -358,7 +355,7 @@ public:
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
- bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
@@ -366,25 +363,27 @@ public:
/// folding and return true, otherwise it should return false. If it folds
/// the instruction, it is likely that the MachineInstruction the iterator
/// references has been changed.
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex,
+ LiveIntervals *LIS = nullptr) const override;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- MachineInstr *LoadMI) const override;
+ MachineInstr *foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+ LiveIntervals *LIS = nullptr) const override;
/// unfoldMemoryOperand - Separate a single instruction which folded a load or
/// a store or a load and a store into two or more instruction. If this is
/// possible, returns true as well as the new instructions by reference.
- bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr*> &NewMIs) const override;
+ bool
+ unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
+ bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const override;
bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
SmallVectorImpl<SDNode*> &NewNodes) const override;
@@ -419,8 +418,8 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(MachineInstr* First,
- MachineInstr *Second) const override;
+ bool shouldScheduleAdjacent(MachineInstr &First,
+ MachineInstr &Second) const override;
void getNoopForMachoTarget(MCInst &NopInst) const override;
@@ -440,7 +439,7 @@ public:
/// True if MI has a condition code def, e.g. EFLAGS, that is
/// not marked dead.
- bool hasLiveCondCodeDef(MachineInstr *MI) const;
+ bool hasLiveCondCodeDef(MachineInstr &MI) const;
/// getGlobalBaseReg - Return a virtual register initialized with the
/// the global base register value. Output instructions required to
@@ -449,19 +448,19 @@ public:
unsigned getGlobalBaseReg(MachineFunction *MF) const;
std::pair<uint16_t, uint16_t>
- getExecutionDomain(const MachineInstr *MI) const override;
+ getExecutionDomain(const MachineInstr &MI) const override;
- void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
+ void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
unsigned
- getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const override;
- unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
+ unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const override;
- void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
@@ -480,10 +479,10 @@ public:
bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
unsigned UseIdx) const override;
-
+
bool useMachineCombiner() const override {
return true;
}
@@ -501,14 +500,14 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
@@ -519,7 +518,7 @@ public:
/// defined by the load we are trying to fold. DefMI returns the machine
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
- MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
@@ -542,19 +541,19 @@ protected:
/// non-commutable operands.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
- MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned CommuteOpIdx1,
unsigned CommuteOpIdx2) const override;
private:
- MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
- MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
+ MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineInstr &MI,
+ LiveVariables *LV) const;
/// Handles memory folding for special case instructions, for instance those
/// requiring custom manipulation of the address.
- MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
@@ -562,8 +561,11 @@ private:
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
- bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
+
+ /// Expand the MOVImmSExti8 pseudo-instructions.
+ bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 9c8339a841c9..b19a8f3306aa 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -71,10 +71,18 @@ def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
+ [SDTCisVT<0, i64>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
-def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
-def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisInt<2>]>;
+
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
@@ -104,6 +112,8 @@ def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
@@ -116,10 +126,6 @@ def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
-def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
- [SDNPHasChain]>;
-def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER,
- [SDNPHasChain]>;
def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
@@ -153,6 +159,14 @@ def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
+ SDTX86caspairSaveEbx8,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
+ SDTX86caspairSaveRbx16,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -214,6 +228,9 @@ def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
+ SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect]>;
def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -237,12 +254,28 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
+def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
-def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
+ [SDNPHasChain, SDNPOutGlue]>;
def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
[SDNPHasChain]>;
@@ -263,7 +296,7 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
def X86MemAsmOperand : AsmOperandClass {
let Name = "Mem";
}
-let RenderMethod = "addMemOperands" in {
+let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
@@ -273,16 +306,19 @@ let RenderMethod = "addMemOperands" in {
def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
// Gather mem operands
- def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; }
- def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; }
- def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; }
- def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; }
- def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; }
- def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; }
- def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; }
- def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; }
- def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; }
- def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; }
+ def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; }
+ def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
+ def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
+ def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
+ def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }
+
+ def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; }
+ def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
+ def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
+ def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
+ def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
+ def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+ def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
}
def X86AbsMemAsmOperand : AsmOperandClass {
@@ -293,7 +329,7 @@ def X86AbsMemAsmOperand : AsmOperandClass {
class X86MemOperand<string printMethod,
AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
let PrintMethod = printMethod;
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
let ParserMatchClass = parserMatchClass;
let OperandType = "OPERAND_MEMORY";
}
@@ -302,7 +338,7 @@ class X86MemOperand<string printMethod,
class X86VMemOperand<RegisterClass RC, string printMethod,
AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
- let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
}
def anymem : X86MemOperand<"printanymem">;
@@ -329,17 +365,19 @@ def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
// Gather mem operands
-def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>;
-def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>;
-def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>;
-def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>;
-
-def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>;
-def vx64xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX64XOperand>;
-def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>;
-def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>;
-def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>;
-def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>;
+def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
+def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
// of a plain GPR, so that it doesn't potentially require a REX prefix.
@@ -348,7 +386,8 @@ def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
def i8mem_NOREX : Operand<iPTR> {
let PrintMethod = "printi8mem";
- let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
+ SEGMENT_REG);
let ParserMatchClass = X86Mem8AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -363,7 +402,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<4>;
def i32mem_TC : Operand<i32> {
let PrintMethod = "printi32mem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
- i32imm, i8imm);
+ i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -374,7 +413,7 @@ def i32mem_TC : Operand<i32> {
def i64mem_TC : Operand<i64> {
let PrintMethod = "printi64mem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
- ptr_rc_tailcall, i32imm, i8imm);
+ ptr_rc_tailcall, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem64AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -494,7 +533,7 @@ let RenderMethod = "addMemOffsOperands" in {
class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
- let MIOperandInfo = (ops ptr_rc, i8imm);
+ let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
}
class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
@@ -514,7 +553,7 @@ def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
class X86MemOffsOperand<Operand immOperand, string printMethod,
AsmOperandClass parserMatchClass>
: X86MemOperand<printMethod, parserMatchClass> {
- let MIOperandInfo = (ops immOperand, i8imm);
+ let MIOperandInfo = (ops immOperand, SEGMENT_REG);
}
def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
@@ -681,14 +720,14 @@ def i64i32imm_pcrel : Operand<i64> {
def lea64_32mem : Operand<i32> {
let PrintMethod = "printanymem";
- let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
// Memory operands that use 64-bit pointers in both ILP32 and LP64.
def lea64mem : Operand<i64> {
let PrintMethod = "printanymem";
- let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
@@ -728,6 +767,8 @@ def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
+def TruePredicate : Predicate<"true">;
+
def HasCMov : Predicate<"Subtarget->hasCMov()">;
def NoCMov : Predicate<"!Subtarget->hasCMov()">;
@@ -773,7 +814,7 @@ def HasVLX : Predicate<"Subtarget->hasVLX()">,
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
-def PKU : Predicate<"!Subtarget->hasPKU()">;
+def PKU : Predicate<"Subtarget->hasPKU()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
@@ -795,6 +836,10 @@ def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
+ AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
+def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
+ AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasHLE : Predicate<"Subtarget->hasHLE()">;
def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
@@ -804,6 +849,7 @@ def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
+def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
@@ -822,6 +868,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">,
AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
+ "Subtarget->getFrameLowering()->hasFP(*MF)">;
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -832,15 +880,16 @@ def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
"TM.getCodeModel() != CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
-def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
-def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
def OptForSize : Predicate<"OptForSize">;
+def OptForMinSize : Predicate<"OptForMinSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
-def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
@@ -871,12 +920,6 @@ def X86_COND_O : PatLeaf<(i8 13)>;
def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
def X86_COND_S : PatLeaf<(i8 15)>;
-// Predicate used to help when pattern matching LZCNT/TZCNT.
-def X86_COND_E_OR_NE : ImmLeaf<i8, [{
- return (Imm == X86::COND_E) || (Imm == X86::COND_NE);
-}]>;
-
-
def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
@@ -1042,6 +1085,10 @@ def LEAVE64 : I<0xC9, RawFrm,
// Miscellaneous Instructions.
//
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+ def Int_eh_sjlj_setup_dispatch
+ : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;
+
let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
@@ -1092,12 +1139,12 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
SchedRW = [WriteRMW], Defs = [ESP] in {
- let Uses = [ESP, EFLAGS] in
+ let Uses = [ESP] in
def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
[(set GR32:$dst, (int_x86_flags_read_u32))]>,
Requires<[Not64BitMode]>;
- let Uses = [RSP, EFLAGS] in
+ let Uses = [RSP] in
def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
[(set GR64:$dst, (int_x86_flags_read_u64))]>,
Requires<[In64BitMode]>;
@@ -1253,28 +1300,28 @@ def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
let SchedRW = [WriteMicrocoded] in {
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src),
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
"movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
-def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src),
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
"movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
-def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src),
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
"movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
-def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src),
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
"movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
}
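// The string instructions here (and STOS, the MOV-with-offset forms, and INS
// below) keep their index operands under (ins) only: they produce no register
// result, and the implicit ESI/EDI updates are already modeled by Defs/Uses,
// which is presumably why the operands were moved out of (outs).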
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins),
+def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
"stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins),
+def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
"stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins),
+def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
"stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in
-def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins),
+def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
"stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
@@ -1402,30 +1449,30 @@ def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
}
let mayStore = 1 in {
let Uses = [AL] in
-def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins),
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
"mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
let Uses = [AX] in
-def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins),
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
"mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
OpSize16, AdSize32;
let Uses = [EAX] in
-def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins),
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
"mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
OpSize32, AdSize32;
let Uses = [RAX] in
-def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins),
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
"mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
AdSize32;
let Uses = [AL] in
-def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins),
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
"mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
let Uses = [AX] in
-def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins),
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
"mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
OpSize16, AdSize16;
let Uses = [EAX] in
-def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins),
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
"mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
OpSize32, AdSize16;
}
@@ -1451,17 +1498,17 @@ def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
let mayStore = 1 in {
let Uses = [AL] in
-def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins),
+def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
"movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
let Uses = [AX] in
-def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins),
+def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
"movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
let Uses = [EAX] in
-def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins),
+def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
"movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
AdSize64;
let Uses = [RAX] in
-def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins),
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
"movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
}
} // hasSideEffects = 0
@@ -1951,11 +1998,11 @@ def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
-def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins),
+def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
"insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
-def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins),
+def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
"insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
-def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins),
+def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
"ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
}
}
@@ -2124,46 +2171,6 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
(implicit EFLAGS)]>, XS;
}
-let Predicates = [HasLZCNT] in {
- def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
- (X86cmp GR16:$src, (i16 0))),
- (LZCNT16rr GR16:$src)>;
- def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
- (X86cmp GR32:$src, (i32 0))),
- (LZCNT32rr GR32:$src)>;
- def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
- (X86cmp GR64:$src, (i64 0))),
- (LZCNT64rr GR64:$src)>;
- def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE),
- (X86cmp GR16:$src, (i16 0))),
- (LZCNT16rr GR16:$src)>;
- def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE),
- (X86cmp GR32:$src, (i32 0))),
- (LZCNT32rr GR32:$src)>;
- def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE),
- (X86cmp GR64:$src, (i64 0))),
- (LZCNT64rr GR64:$src)>;
-
- def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
- (X86cmp (loadi16 addr:$src), (i16 0))),
- (LZCNT16rm addr:$src)>;
- def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
- (X86cmp (loadi32 addr:$src), (i32 0))),
- (LZCNT32rm addr:$src)>;
- def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
- (X86cmp (loadi64 addr:$src), (i64 0))),
- (LZCNT64rm addr:$src)>;
- def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi16 addr:$src), (i16 0))),
- (LZCNT16rm addr:$src)>;
- def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi32 addr:$src), (i32 0))),
- (LZCNT32rm addr:$src)>;
- def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi64 addr:$src), (i64 0))),
- (LZCNT64rm addr:$src)>;
-}
-
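// The removed patterns above (and the matching TZCNT ones in the BMI section
// below) selected LZCNT/TZCNT for a ctlz/cttz paired with a zero test;
// presumably that combine is now performed earlier, during DAG
// lowering/combining, instead of by isel patterns.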
//===----------------------------------------------------------------------===//
// BMI Instructions
//
@@ -2240,46 +2247,6 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
-let Predicates = [HasBMI] in {
- def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE),
- (X86cmp GR16:$src, (i16 0))),
- (TZCNT16rr GR16:$src)>;
- def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE),
- (X86cmp GR32:$src, (i32 0))),
- (TZCNT32rr GR32:$src)>;
- def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE),
- (X86cmp GR64:$src, (i64 0))),
- (TZCNT64rr GR64:$src)>;
- def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE),
- (X86cmp GR16:$src, (i16 0))),
- (TZCNT16rr GR16:$src)>;
- def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE),
- (X86cmp GR32:$src, (i32 0))),
- (TZCNT32rr GR32:$src)>;
- def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE),
- (X86cmp GR64:$src, (i64 0))),
- (TZCNT64rr GR64:$src)>;
-
- def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE),
- (X86cmp (loadi16 addr:$src), (i16 0))),
- (TZCNT16rm addr:$src)>;
- def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE),
- (X86cmp (loadi32 addr:$src), (i32 0))),
- (TZCNT32rm addr:$src)>;
- def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE),
- (X86cmp (loadi64 addr:$src), (i64 0))),
- (TZCNT64rm addr:$src)>;
- def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi16 addr:$src), (i16 0))),
- (TZCNT16rm addr:$src)>;
- def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi32 addr:$src), (i32 0))),
- (TZCNT32rm addr:$src)>;
- def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE),
- (X86cmp (loadi64 addr:$src), (i64 0))),
- (TZCNT64rm addr:$src)>;
-}
-
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
@@ -2440,22 +2407,34 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
//===----------------------------------------------------------------------===//
// MONITORX/MWAITX Instructions
//
-let SchedRW = [WriteSystem] in {
-let Uses = [EAX, ECX, EDX] in
-def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [],
- IIC_SSE_MONITOR>, TB;
-let Uses = [ECX, EAX, EBX] in
-def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>,
- TB;
+let SchedRW = [ WriteSystem ] in {
+ let usesCustomInserter = 1 in {
+ def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+ [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[ HasMWAITX ]>;
+ }
+
+ let Uses = [ EAX, ECX, EDX ] in {
+ def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
+ TB, Requires<[ HasMWAITX ]>;
+ }
+
+ let Uses = [ ECX, EAX, EBX ] in {
+ def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
+ [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
+ TB, Requires<[ HasMWAITX ]>;
+ }
} // SchedRW
-def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>;
-def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>;
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
+ Requires<[ Not64BitMode ]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
+ Requires<[ In64BitMode ]>;
def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
- Requires<[Not64BitMode]>;
+ Requires<[ Not64BitMode ]>;
def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
- Requires<[In64BitMode]>;
+ Requires<[ In64BitMode ]>;
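// Rough usage sketch (illustration only, not taken from this patch): the
// MONITORX pseudo matches the IR intrinsic form, e.g.
//   call void @llvm.x86.monitorx(i8* %p, i32 %ext, i32 %hints)
// while MONITORXrrr/MWAITXrrr are the real encodings with the fixed
// EAX/ECX/EDX (respectively ECX/EAX/EBX) register convention listed in Uses.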
//===----------------------------------------------------------------------===//
// CLZERO Instruction
@@ -2535,7 +2514,7 @@ let Predicates = [HasTBM] in {
//
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflushopt\t$src", []>, PD;
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
@@ -2781,6 +2760,11 @@ def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
// stos aliases. Accept the source being omitted because it's implicit in
// the mnemonic, or the mnemonic suffix being omitted because it's implicit
@@ -2793,6 +2777,11 @@ def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
// scas aliases. Accept the destination being omitted because it's implicit
// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
@@ -2805,6 +2794,24 @@ def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// cmps aliases. Accept the mnemonic suffix being omitted because it's
+// implicit in the destination.
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// movs aliases. Accept the mnemonic suffix being omitted because it's
+// implicit in the destination.
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
// div and idiv aliases for explicit A register.
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
@@ -2892,8 +2899,8 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
-def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
@@ -2917,6 +2924,18 @@ def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i3
def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0>;
+
+// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0>;
+
// inb %dx -> inb %al, %dx
def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
@@ -2929,12 +2948,12 @@ def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
-def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
-def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>;
-def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
-def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
// Force mov without a suffix with a segment and mem to prefer the 'l' form of
// the move. All segment/mem forms are equivalent; this has the shortest
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 83f9b1409f61..8d70691714dd 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -282,7 +282,7 @@ def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
- (outs i64mem:$dst), (ins VR64:$src),
+ (outs), (ins i64mem:$dst, VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index 71ab97374dd6..309f601d1fce 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -55,10 +55,10 @@ def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX]>;
-def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src),
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, Not64BitMode]>;
-def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 6a7c45665e9c..9a98f5cac2ee 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -382,75 +382,71 @@ def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
-let Predicates = [HasSSE2] in {
- def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
- def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
-}
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
+def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
-let Predicates = [HasAVX] in {
- def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
- def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
- def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
- def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
- def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
- def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
-}
+def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
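// Dropping the HasSSE2/HasAVX guards around these bitcast patterns appears
// safe because a bitconvert emits no code and the VR128/VR256 register classes
// are only usable when the corresponding features are enabled anyway.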
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
@@ -472,16 +468,13 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+let Predicates = [NoVLX] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
-def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
@@ -489,39 +482,9 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8f32 immAllZerosV))]>;
-}
-
-let Predicates = [HasAVX] in
- def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
-
-let Predicates = [HasAVX2] in {
- def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
- def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
- def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
- def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
-}
-
-// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
-// VPXOR instruction writes zero to its upper part, it's safe build zeros.
-let Predicates = [HasAVX1Only] in {
-def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
-def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
- (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
-
-def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
-def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
- (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
-
-def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
-def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
- (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
-
-def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
-def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
- (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+ [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
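// V_SET0/AVX_SET0 are now limited to NoVLX targets, presumably because
// AVX-512VL provides its own zero idiom; keying AVX_SET0 on v8i32 (and the
// V_SET0 pattern on v4i32) leaves one canonical all-zeros type, with other
// types reaching it through bitcasts.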
// We set canFoldAsLoad because this can be converted to a constant-pool
@@ -649,15 +612,14 @@ let Predicates = [UseAVX] in {
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
- def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
@@ -771,11 +733,6 @@ let Predicates = [UseSSE2] in {
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
}
- // Extract and store.
- def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
-
// Shuffle with MOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
@@ -800,6 +757,13 @@ let Predicates = [UseSSE2] in {
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//
@@ -937,10 +901,24 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
-def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
- (VMOVUPDYmr addr:$dst, VR256:$src)>;
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+ (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+ (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+ (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+ (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+ (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+ (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+ (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+ (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
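// Informal encoding example for the swapped-operand aliases: for
//   vmovaps %xmm8, %xmm0
// the normal rr form puts %xmm8 in ModRM.r/m and needs VEX.B, forcing a 3-byte
// VEX prefix, while the _REV form puts %xmm8 in ModRM.reg, which only needs
// VEX.R and still fits the 2-byte VEX prefix. The same reasoning applies to
// the vmovss/vmovsd and vmovdqa/vmovdqu aliases elsewhere in this file.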
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
@@ -978,20 +956,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
IIC_SSE_MOVU_P_RR>;
}
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
- (VMOVUPDmr addr:$dst, VR128:$src)>;
-}
-
-let Predicates = [UseSSE1] in
- def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
- (MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [UseSSE2] in
- def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
- (MOVUPDmr addr:$dst, VR128:$src)>;
-
// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
// 128-bit load/store
@@ -1004,18 +968,10 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v2i64 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
// 256-bit load/store
def : Pat<(alignedloadv4i64 addr:$src),
@@ -1026,18 +982,10 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v8i32 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v16i16 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v32i8 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
// Special patterns for storing subvector extracts of lower 128-bits
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
@@ -1080,6 +1028,28 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ // 128-bit load/store
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+}
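// The v8i16/v16i8 and v16i16/v32i8 store patterns sit under NoVLX_Or_NoBWI:
// once AVX-512BW+VL are available those element types presumably get their own
// EVEX instructions, so the VMOVAPS/VMOVUPS forms should only be used in their
// absence.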
+
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
@@ -2039,35 +2009,24 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
- (loadv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+ [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+ [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2137,14 +2096,10 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
@@ -2170,30 +2125,24 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ [], IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ [], IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
@@ -2204,24 +2153,17 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
let Predicates = [HasAVX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, Sched<[WriteCvtI2FLd]>;
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
- Sched<[WriteCvtI2F]>;
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvtdq2_pd_256
- (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
- Sched<[WriteCvtI2FLd]>;
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
- Sched<[WriteCvtI2F]>;
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+ []>, VEX, VEX_L, Sched<[WriteCvtI2F]>;
}
let hasSideEffects = 0, mayLoad = 1 in
@@ -2229,8 +2171,7 @@ def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
// AVX register conversion intrinsics
@@ -2239,6 +2180,8 @@ let Predicates = [HasAVX] in {
(VCVTDQ2PDrr VR128:$src)>;
def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
(VCVTDQ2PDrm addr:$src)>;
+ def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (VCVTDQ2PDrm addr:$src)>;
def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
(VCVTDQ2PDYrr VR128:$src)>;
@@ -2252,6 +2195,8 @@ let Predicates = [HasSSE2] in {
(CVTDQ2PDrr VR128:$src)>;
def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
(CVTDQ2PDrm addr:$src)>;
+ def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (CVTDQ2PDrm addr:$src)>;
} // Predicates = [HasSSE2]
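// The cvttps2dq/cvtps2pd/cvtdq2pd definitions above no longer carry intrinsic
// patterns; selection appears to go through target nodes instead (see the
// X86cvtdq2pd patterns here), with an extra pattern added for the 64-bit
// scalar_to_vector load form of the source.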
// Convert packed double to packed single
@@ -2553,36 +2498,36 @@ let Constraints = "$src1 = $dst" in {
}
let Predicates = [HasAVX] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
(VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
(VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
(VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
let Predicates = [UseSSE1] in {
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
let Predicates = [UseSSE2] in {
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
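// The X86cmpp patterns now expect a floating-point result type matching the
// operands (v4f32, v2f64, ...), which suggests the X86ISD::CMPP node was
// re-typed to produce an FP-typed all-ones/all-zeros mask rather than an
// integer vector.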
@@ -2763,58 +2708,30 @@ let Predicates = [HasAVX1Only] in {
//===----------------------------------------------------------------------===//
/// sse12_extr_sign_mask - sse 1 & 2 packed FP sign-mask extraction
-multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
- Domain d> {
+multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
+ string asm, Domain d> {
def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+ [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
Sched<[WriteVecLogic]>;
}
let Predicates = [HasAVX] in {
- defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
- "movmskps", SSEPackedSingle>, PS, VEX;
- defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
- "movmskpd", SSEPackedDouble>, PD, VEX;
- defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
- "movmskps", SSEPackedSingle>, PS,
- VEX, VEX_L;
- defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
- "movmskpd", SSEPackedDouble>, PD,
- VEX, VEX_L;
-
- def : Pat<(i32 (X86fgetsign FR32:$src)),
- (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(i64 (X86fgetsign FR32:$src)),
- (SUBREG_TO_REG (i64 0),
- (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
- def : Pat<(i32 (X86fgetsign FR64:$src)),
- (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(i64 (X86fgetsign FR64:$src)),
- (SUBREG_TO_REG (i64 0),
- (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+ SSEPackedSingle>, PS, VEX;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
+ SSEPackedSingle>, PS, VEX, VEX_L;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
+ SSEPackedDouble>, PD, VEX, VEX_L;
}
-defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
SSEPackedSingle>, PS;
-defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
SSEPackedDouble>, PD;
-def : Pat<(i32 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
- Requires<[UseSSE1]>;
-def : Pat<(i64 (X86fgetsign FR32:$src)),
- (SUBREG_TO_REG (i64 0),
- (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
- Requires<[UseSSE1]>;
-def : Pat<(i32 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
- Requires<[UseSSE2]>;
-def : Pat<(i64 (X86fgetsign FR64:$src)),
- (SUBREG_TO_REG (i64 0),
- (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
- Requires<[UseSSE2]>;
-
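// sse12_extr_sign_mask is now parameterized by a ValueType and matches the
// X86movmsk node instead of per-width intrinsics; the removed X86fgetsign
// patterns are presumably covered by lowering fgetsign to X86movmsk elsewhere.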
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
@@ -3695,16 +3612,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]
-let Predicates = [HasAVX2, NoVLX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
-}
-let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
@@ -3713,12 +3628,14 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQmr addr:$dst, VR128:$src)>;
}
-def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>;
-def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>;
-def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
+}
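// The 256-bit VMOVNTDQY nontemporal-store patterns moved from HasAVX2 to
// HasAVX (the 256-bit store itself presumably only needs AVX), and the SSE2
// MOVNTDQ patterns are now explicitly gated on UseSSE2 so they do not fire
// when the AVX forms are available.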
} // AddedComplexity
@@ -3760,6 +3677,8 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins),
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
+// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
+// to include any 64-bit target.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
"sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
PS, Requires<[HasSSE1]>;
@@ -3768,11 +3687,9 @@ def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
"mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
- TB, Requires<[HasSSE2]>;
+ TB, Requires<[HasMFence]>;
} // SchedRW
-def : Pat<(X86SFence), (SFENCE)>;
-def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
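// MFENCE is now gated on the HasMFence predicate introduced earlier rather
// than on SSE2 directly, presumably so that any 64-bit target can emit it; the
// TODO above suggests sfence/lfence may eventually get the same treatment.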
//===----------------------------------------------------------------------===//
@@ -3920,15 +3837,16 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
} // ExeDomain = SSEPackedInt
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
- (VMOVDQUmr addr:$dst, VR128:$src)>;
- def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
- (VMOVDQUYmr addr:$dst, VR256:$src)>;
-}
-let Predicates = [UseSSE2] in
-def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
- (MOVDQUmr addr:$dst, VR128:$src)>;
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+ (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+ (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+ (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+ (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
@@ -3985,7 +3903,7 @@ let Predicates = [HasAVX2] in
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, RegisterClass RC,
- ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+ ValueType DstVT, ValueType SrcVT,
PatFrag ld_frag, ShiftOpndItins itins,
bit Is2Addr = 1> {
// src2 is always 128-bit
@@ -4002,7 +3920,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode RC:$src1,
- (bc_frag (ld_frag addr:$src2)))))], itins.rm>,
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))], itins.rm>,
Sched<[WriteVecShiftLd, ReadAfterLd]>;
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
@@ -4046,6 +3964,14 @@ defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
@@ -4060,6 +3986,10 @@ defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
SSE_INTALU_ITINS_P, 0, NoVLX>;
defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
@@ -4078,26 +4008,14 @@ defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
// Intrinsic forms
-defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
- int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
-defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
- int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
-defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
- int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
-defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
- int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
-defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
- int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
-defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
- int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
-let Predicates = [HasAVX2] in
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V, VEX_L;
@@ -4105,11 +4023,11 @@ let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
-let Predicates = [HasAVX2] in
+let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
@@ -4123,33 +4041,33 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
let Predicates = [HasAVX, NoVLX] in {
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ VR128, v4i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
+ VR128, v2i64, v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ VR128, v4i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
+ VR128, v2i64, v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
+ VR128, v4i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX, NoVLX]
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ VR128, v8i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ VR128, v8i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ VR128, v8i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
@@ -4161,46 +4079,46 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] ,
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
"vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
+ (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
VEX_4V;
def VPSRLDQri : PDIi8<0x73, MRM3r,
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
"vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
+ (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
VEX_4V;
// PSRADQri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
let Predicates = [HasAVX2, NoVLX] in {
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ VR256, v8i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
+ VR256, v4i64, v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ VR256, v8i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
+ VR256, v4i64, v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
+ VR256, v8i32, v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
}// Predicates = [HasAVX2, NoVLX]
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ VR256, v16i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ VR256, v16i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ VR256, v16i16, v8i16, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
}// Predicates = [HasAVX2, NoVLX_Or_NoBWI]
@@ -4211,43 +4129,43 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 ,
(outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
"vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
- (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
+ (v32i8 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
VEX_4V, VEX_L;
def VPSRLDQYri : PDIi8<0x73, MRM3r,
(outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
"vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
- (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
+ (v32i8 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
VEX_4V, VEX_L;
// PSRADQYri doesn't exist in SSE[1-3].
} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ VR128, v8i16, v8i16, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ VR128, v4i32, v4i32, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
+ VR128, v2i64, v2i64, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ VR128, v8i16, v8i16, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ VR128, v4i32, v4i32, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, bc_v2i64, memopv2i64,
+ VR128, v2i64, v2i64, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, bc_v8i16, memopv2i64,
+ VR128, v8i16, v8i16, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, bc_v4i32, memopv2i64,
+ VR128, v4i32, v4i32, memopv2i64,
SSE_INTSHIFT_ITINS_P>;
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
@@ -4256,13 +4174,13 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
"pslldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
+ (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
IIC_SSE_INTSHDQ_P_RI>;
def PSRLDQri : PDIi8<0x73, MRM3r,
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
"psrldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
+ (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
IIC_SSE_INTSHDQ_P_RI>;
// PSRADQri doesn't exist in SSE[1-3].
}
@@ -4273,17 +4191,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
//===---------------------------------------------------------------------===//
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1, NoVLX>;
+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0, NoVLX>;
+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
@@ -4291,8 +4209,8 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
- SDNode OpNode> {
-let Predicates = [HasAVX] in {
+ SDNode OpNode, Predicate prd> {
+let Predicates = [HasAVX, prd] in {
def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4310,7 +4228,7 @@ let Predicates = [HasAVX] in {
Sched<[WriteShuffleLd]>;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, prd] in {
def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4348,9 +4266,11 @@ let Predicates = [UseSSE2] in {
}
} // ExeDomain = SSEPackedInt
-defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
-defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
-defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+ NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+ NoVLX_Or_NoBWI>, XD;
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
@@ -4372,8 +4292,8 @@ let Predicates = [UseSSE2] in {
let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
@@ -4390,13 +4310,13 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
- (OutVT (OpNode VR128:$src1,
- (bc_frag (ld_frag addr:$src2)))))]>,
+ (OutVT (OpNode (ArgVT VR128:$src1),
+ (bitconvert (ld_frag addr:$src2)))))]>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ ValueType ArgVT, SDNode OpNode> {
def Yrr : PDI<opc, MRMSrcReg,
(outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr,
@@ -4409,14 +4329,14 @@ multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OutVT (OpNode VR256:$src1,
- (bc_frag (loadv4i64 addr:$src2)))))]>,
+ (OutVT (OpNode (ArgVT VR256:$src1),
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
- PatFrag ld_frag, bit Is2Addr = 1> {
+ ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ bit Is2Addr = 1> {
def rr : SS48I<opc, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
@@ -4433,13 +4353,13 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
- (OutVT (OpNode VR128:$src1,
- (bc_frag (ld_frag addr:$src2)))))]>,
+ (OutVT (OpNode (ArgVT VR128:$src1),
+ (bitconvert (ld_frag addr:$src2)))))]>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ ValueType ArgVT, SDNode OpNode> {
def Yrr : SS48I<opc, MRMSrcReg,
(outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr,
@@ -4452,47 +4372,46 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OutVT (OpNode VR256:$src1,
- (bc_frag (loadv4i64 addr:$src2)))))]>,
+ (OutVT (OpNode (ArgVT VR256:$src1),
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
- bc_v8i16, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
- bc_v4i32, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
- bc_v8i16, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
- bc_v4i32, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
}
-let Predicates = [HasAVX2] in {
- defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
- bc_v16i16>, VEX_4V, VEX_L;
- defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
- bc_v8i32>, VEX_4V, VEX_L;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
+ VEX_4V, VEX_L;
+ defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
+ VEX_4V, VEX_L;
- defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
- bc_v16i16>, VEX_4V, VEX_L;
- defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
- bc_v8i32>, VEX_4V, VEX_L;
+ defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
+ VEX_4V, VEX_L;
+ defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
+ VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
- bc_v8i16, memopv2i64>;
+ memopv2i64>;
defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
- bc_v4i32, memopv2i64>;
+ memopv2i64>;
defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
- bc_v8i16, memopv2i64>;
+ memopv2i64>;
- let Predicates = [HasSSE41] in
defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
- bc_v4i32, memopv2i64>;
+ memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4502,8 +4421,7 @@ let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
@@ -4516,14 +4434,14 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (OpNode VR128:$src1,
- (bc_frag (ld_frag addr:$src2))))],
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (bitconvert (ld_frag addr:$src2)))))],
IIC_SSE_UNPCK>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, PatFrag bc_frag> {
+ SDNode OpNode> {
def Yrr : PDI<opc, MRMSrcReg,
(outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -4532,72 +4450,72 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
def Yrm : PDI<opc, MRMSrcMem,
(outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (OpNode VR256:$src1,
- (bc_frag (loadv4i64 addr:$src2))))]>,
+ [(set VR256:$dst, (vt (OpNode VR256:$src1,
+ (bitconvert (loadv4i64 addr:$src2)))))]>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
- bc_v16i8, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
- bc_v8i16, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
- bc_v16i8, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
- bc_v8i16, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
- bc_v4i32, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
- bc_v2i64, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
- bc_v4i32, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
- bc_v2i64, loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
- bc_v32i8>, VEX_4V, VEX_L;
- defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
- bc_v16i16>, VEX_4V, VEX_L;
- defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
- bc_v32i8>, VEX_4V, VEX_L;
- defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
- bc_v16i16>, VEX_4V, VEX_L;
+ defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
+ VEX_4V, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
- bc_v8i32>, VEX_4V, VEX_L;
- defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
- bc_v4i64>, VEX_4V, VEX_L;
- defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
- bc_v8i32>, VEX_4V, VEX_L;
- defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
- bc_v4i64>, VEX_4V, VEX_L;
+ defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
+ VEX_4V, VEX_L;
+ defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
+ VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
- bc_v16i8, memopv2i64>;
+ memopv2i64>;
defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
- bc_v8i16, memopv2i64>;
+ memopv2i64>;
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
- bc_v4i32, memopv2i64>;
+ memopv2i64>;
defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
- bc_v2i64, memopv2i64>;
+ memopv2i64>;
defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
- bc_v16i8, memopv2i64>;
+ memopv2i64>;
defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
- bc_v8i16, memopv2i64>;
+ memopv2i64>;
defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
- bc_v4i32, memopv2i64>;
+ memopv2i64>;
defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
- bc_v2i64, memopv2i64>;
+ memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4661,20 +4579,20 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
IIC_SSE_MOVMSK>, VEX;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+ [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
VEX, VEX_L;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
IIC_SSE_MOVMSK>;
} // ExeDomain = SSEPackedInt
@@ -5022,16 +4940,14 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
+// Aliases to help the assembler pick two-byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
+ (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
+
//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
//
-let Predicates = [HasAVX] in
-def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
- (VMOVPQI2QImr addr:$dst, VR128:$src)>;
-let Predicates = [UseSSE2] in
-def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
- (MOVPQI2QImr addr:$dst, VR128:$src)>;
-
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -5058,6 +4974,8 @@ let Predicates = [UseAVX], AddedComplexity = 20 in {
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
}
let Predicates = [UseSSE2], AddedComplexity = 20 in {
@@ -5066,13 +4984,6 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}
-let Predicates = [HasAVX] in {
-def : Pat<(v4i64 (alignedX86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
-def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
-}
-
//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
@@ -5442,38 +5353,36 @@ let Constraints = "$src1 = $dst" in {
-/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
-multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
- PatFrag ld_frag> {
+multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode, PatFrag ld_frag> {
def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
- Sched<[WriteVecALU]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))],
+ IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId128
- (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>,
- Sched<[WriteVecALULd]>;
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
+ IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}
-/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
-multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId256> {
+multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
+ SDNode OpNode> {
def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (IntId256 VR256:$src))]>,
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
Sched<[WriteVecALU]>;
def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (IntId256
- (bitconvert (loadv4i64 addr:$src))))]>,
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
Sched<[WriteVecALULd]>;
}
@@ -5487,14 +5396,15 @@ def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
-let Predicates = [HasAVX] in {
- defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128,
- loadv2i64>, VEX;
- defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128,
- loadv2i64>, VEX;
- defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128,
- loadv2i64>, VEX;
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
+}
+let Predicates = [HasAVX] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
@@ -5509,14 +5419,15 @@ let Predicates = [HasAVX] in {
(VPABSDrr128 VR128:$src)>;
}
-let Predicates = [HasAVX2] in {
- defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
- int_x86_avx2_pabs_b>, VEX, VEX_L;
- defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
- int_x86_avx2_pabs_w>, VEX, VEX_L;
- defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
- int_x86_avx2_pabs_d>, VEX, VEX_L;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
+}
+let Predicates = [HasAVX2] in {
def : Pat<(xor
(bc_v4i64 (v32i1sextv32i8)),
(bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
@@ -5531,14 +5442,11 @@ let Predicates = [HasAVX2] in {
(VPABSDrr256 VR256:$src)>;
}
-defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128,
- memopv2i64>;
-defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128,
- memopv2i64>;
-defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128,
- memopv2i64>;
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
-let Predicates = [HasSSSE3] in {
+let Predicates = [UseSSSE3] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
@@ -5659,15 +5567,15 @@ let isCommutable = 0 in {
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
- defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
- loadv2i64, i128mem,
- SSE_PSIGN, 0>, VEX_4V;
- defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
- loadv2i64, i128mem,
- SSE_PSIGN, 0>, VEX_4V;
- defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
- loadv2i64, i128mem,
- SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
+ int_x86_ssse3_psign_b_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
+ int_x86_ssse3_psign_w_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
+ int_x86_ssse3_psign_d_128,
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V;
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
loadv2i64, i128mem,
SSE_PSHUFB, 0>, VEX_4V;
@@ -5700,15 +5608,12 @@ let isCommutable = 0 in {
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
- loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
- loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
- loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+ WriteVecALU>, VEX_4V, VEX_L;
+ defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+ WriteVecALU>, VEX_4V, VEX_L;
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
loadv4i64, i256mem,
SSE_PSHUFB, 0>, VEX_4V, VEX_L;
@@ -5738,12 +5643,12 @@ let isCommutable = 0 in {
memopv2i64, i128mem, SSE_PHADDSUBW>;
defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
memopv2i64, i128mem, SSE_PHADDSUBD>;
- defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
- memopv2i64, i128mem, SSE_PSIGN>;
- defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
- memopv2i64, i128mem, SSE_PSIGN>;
- defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
- memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
+ SSE_PSIGN, memopv2i64>;
+ defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
+ SSE_PSIGN, memopv2i64>;
+ defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
+ SSE_PSIGN, memopv2i64>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
memopv2i64, i128mem, SSE_PSHUFB>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
@@ -5767,7 +5672,7 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
- def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -5775,7 +5680,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
let mayLoad = 1 in
- def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -5787,13 +5692,13 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
- def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+ def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteShuffle]>;
let mayLoad = 1 in
- def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+ def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2, u8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5802,43 +5707,43 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX] in
- defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+ defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
- defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+ defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
- defm PALIGN : ssse3_palignr<"palignr">;
+ defm PALIGNR : ssse3_palignr<"palignr">;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
+ (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
+ (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
//===---------------------------------------------------------------------===//
@@ -5855,6 +5760,7 @@ def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
TB, Requires<[HasSSE3]>;
+
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
[(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
@@ -5890,45 +5796,48 @@ multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
X86MemOperand MemOp, X86MemOperand MemYOp,
OpndItins SSEItins, OpndItins AVXItins,
- OpndItins AVX2Itins> {
+ OpndItins AVX2Itins, Predicate prd> {
defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
- let Predicates = [HasAVX, NoVLX] in
+ let Predicates = [HasAVX, prd] in
defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
VR128, VR128, AVXItins>, VEX;
- let Predicates = [HasAVX2, NoVLX] in
+ let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
VR256, VR128, AVX2Itins>, VEX, VEX_L;
}
-multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
- X86MemOperand MemOp, X86MemOperand MemYOp> {
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+ X86MemOperand MemYOp, Predicate prd> {
defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
MemOp, MemYOp,
SSE_INTALU_ITINS_SHUFF_P,
DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED>;
+ DEFAULT_ITINS_SHUFFLESCHED, prd>;
defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
!strconcat("pmovzx", OpcodeStr),
MemOp, MemYOp,
SSE_INTALU_ITINS_SHUFF_P,
DEFAULT_ITINS_SHUFFLESCHED,
- DEFAULT_ITINS_SHUFFLESCHED>;
+ DEFAULT_ITINS_SHUFFLESCHED, prd>;
}
-defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>;
-defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>;
-defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>;
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
-defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>;
-defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>;
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
-defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>;
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
// Register-Register patterns
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
@@ -5941,26 +5850,14 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
(!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
-
- // On AVX2, we also support 256bit inputs.
- def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))),
- (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
- def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))),
- (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
- def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))),
- (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
- def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))),
- (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
- def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))),
- (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
-
- def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))),
- (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ }
// Simple Register-Memory patterns
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
@@ -5973,8 +5870,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ }
// AVX2 Register-Memory patterns
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
@@ -5983,7 +5882,8 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
-
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
@@ -6028,18 +5928,20 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+ }
}
-let Predicates = [HasAVX2, NoVLX] in {
- defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
- defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
-}
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
SDNode ExtOp, PatFrag ExtLoad16> {
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
@@ -6052,9 +5954,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
(!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
-
+ }
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
@@ -6067,7 +5972,8 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-
+ }
+ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
@@ -6078,7 +5984,8 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-
+ }
+ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
@@ -6127,12 +6034,11 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+ }
}
-let Predicates = [HasAVX, NoVLX] in {
- defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
- defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
-}
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
let Predicates = [UseSSE41] in {
defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
@@ -6859,63 +6765,67 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasAVX, NoVLX] in {
- defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
- loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
+ defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+ VR128, loadv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
- VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
- loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
+ defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+ VR256, loadv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -7238,14 +7148,12 @@ let Predicates = [UseAVX] in {
// on targets where they have equal performance. These were changed to use
// blends because blends have better throughput on SandyBridge and Haswell, but
// movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41] in {
+let Predicates = [UseSSE41], AddedComplexity = 15 in {
// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
}
@@ -7316,13 +7224,14 @@ let Predicates = [UseSSE41] in {
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
+let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteLoad] in {
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
VEX;
-let Predicates = [HasAVX2] in
+let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
@@ -7332,6 +7241,35 @@ def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW
+let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+ def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
+}
+
+} // AddedComplexity
+
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//
@@ -7815,14 +7753,24 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
VR128:$mask))]>, XD;
}
+// Non-temporal (unaligned) scalar stores.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let mayStore = 1, SchedRW = [WriteStore] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movntss\t{$src, $dst|$dst, $src}",
- [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+ "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movntsd\t{$src, $dst|$dst, $src}",
- [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
-}
+ "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+} // SchedRW
+
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+ (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+ (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+} // AddedComplexity
+} // HasSSE4A
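// Editorial sketch (not part of the patch): with the movnt intrinsic patterns
// replaced by generic nontemporalstore patterns, a scalar store tagged with
// !nontemporal metadata should now select MOVNTSS/MOVNTSD directly; the
// COPY_TO_REGCLASS in the patterns above bridges the FR32/FR64 value into the
// VR128 operand the instructions expect. Roughly, assuming the usual metadata
// form of the time (typed pointers, !nontemporal carrying an i32 1):
//
//   define void @nt_store(float* %p, float %v) {
//     store float %v, float* %p, align 4, !nontemporal !0
//     ret void
//   }
//   !0 = !{i32 1}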
//===----------------------------------------------------------------------===//
// AVX Instructions
@@ -7848,24 +7796,24 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
Sched<[Sched]>, VEX;
-let ExeDomain = SSEPackedSingle in {
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
f32mem, v4f32, loadf32, WriteLoad>;
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
f32mem, v8f32, loadf32,
WriteFShuffleLd>, VEX_L;
}
-let ExeDomain = SSEPackedDouble in
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
v4f64, loadf64, WriteFShuffleLd>, VEX_L;
-let ExeDomain = SSEPackedSingle in {
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
v4f32, v4f32, WriteFShuffle>;
def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
-let ExeDomain = SSEPackedDouble in
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
@@ -7977,7 +7925,7 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
}
// AVX1 patterns
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
@@ -8015,20 +7963,20 @@ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v32i8 VR256:$src1),
(EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
}
@@ -8078,45 +8026,45 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
X86MemOperand x86memop_i, PatFrag i_frag,
- Intrinsic IntVar, ValueType vt> {
- def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
- Sched<[WriteFShuffle]>;
- def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop_i:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (IntVar RC:$src1,
- (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
-
+ ValueType f_vt, ValueType i_vt> {
let Predicates = [HasAVX, NoVLX] in {
+ def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+ Sched<[WriteFShuffle]>;
+ def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop_i:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
+ (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+ [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffle]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+ (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffleLd]>;
}// Predicates = [HasAVX, NoVLX]
}
let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
- loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+ loadv2i64, v4f32, v4i32>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+ loadv4i64, v8f32, v8i32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
- loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+ loadv2i64, v2f64, v2i64>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+ loadv4i64, v4f64, v4i64>, VEX_L;
}
let Predicates = [HasAVX, NoVLX] in {
@@ -8158,6 +8106,7 @@ def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -8276,9 +8225,14 @@ let Predicates = [HasF16C] in {
// Patterns for matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C] in {
+ // Use MXCSR.RC for rounding instead of explicitly specifying the default
+ // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+ // configurations we support (the default). However, falling back to MXCSR is
+ // more consistent with other instructions, which are always controlled by it.
+ // The MXCSR fallback is encoded as 0b100 in the immediate.
def : Pat<(fp_to_f16 FR32:$src),
(i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
- (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
+ (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
(f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
@@ -8286,7 +8240,7 @@ let Predicates = [HasF16C] in {
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
(f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
- (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
+ (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
}
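For reference, the 0 -> 4 immediate change above is the rounding-control field of VCVTPS2PH: 0 selects round-to-nearest-even explicitly, while 4 (0b100) defers to MXCSR.RC. A minimal C sketch of the same choice from the intrinsics side (illustrative only, not part of this patch; assumes clang/gcc <immintrin.h> built with -mf16c, and the helper name is made up):

#include <immintrin.h>

// _MM_FROUND_CUR_DIRECTION is 0x04, i.e. the same "use MXCSR.RC" encoding
// that the patterns above now put into the VCVTPS2PH immediate.
static unsigned short f32_to_f16(float x) {
  return _cvtss_sh(x, _MM_FROUND_CUR_DIRECTION);  // imm8 == 4
}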
//===----------------------------------------------------------------------===//
@@ -8387,49 +8341,54 @@ let Predicates = [HasAVX2] in {
def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
(VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
sub_xmm)))>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
- let AddedComplexity = 20 in {
+ let AddedComplexity = 20 in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ }
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
+ def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+ def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+
+ def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+ def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+}
+let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
- def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
- def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
- def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-
- def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
- VR128))>;
- def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
- VR128))>;
-
- def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
- VR128))>;
- def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
- VR128))>;
-
- // The patterns for VPBROADCASTD are not needed because they would match
- // the exact same thing as VBROADCASTSS patterns.
-
- def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
- // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
- }
+ // The patterns for VPBROADCASTD are not needed because they would match
+ // the exact same thing as VBROADCASTSS patterns.
+
+ def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
}
// AVX1 broadcast patterns
@@ -8442,11 +8401,15 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
}
-let Predicates = [HasAVX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
- let AddedComplexity = 20 in {
+let Predicates = [HasAVX], AddedComplexity = 20 in {
// 128bit broadcasts:
+ def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
@@ -8468,12 +8431,9 @@ let Predicates = [HasAVX] in {
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
- }
- def : Pat<(v2f64 (X86VBroadcast f64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
}
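A rough C-level picture of the "fallback" case those patterns cover (illustrative only; the helper name is made up and this is not part of the patch): when the loaded scalar has another user, the load cannot be folded into a broadcast-from-memory instruction, so the register forms above (VBROADCASTSSrr under AVX2, the VPSHUFD/VINSERTF128 sequences under AVX1) are selected instead.

#include <immintrin.h>

// The scalar load has a second user (*out), so a broadcast-from-memory
// instruction cannot swallow it; a register broadcast/shuffle is used instead.
static __m256 bcast_and_copy(const float *p, float *out) {
  float x = *p;
  *out = x;
  return _mm256_set1_ps(x);
}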
//===----------------------------------------------------------------------===//
@@ -8482,21 +8442,23 @@ let Predicates = [HasAVX] in {
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT, X86FoldableSchedWrite Sched> {
- def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
- Sched<[Sched]>, VEX_4V, VEX_L;
- def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OpVT (X86VPermv VR256:$src1,
- (bitconvert (mem_frag addr:$src2)))))]>,
- Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ Sched<[Sched]>, VEX_4V, VEX_L;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermv VR256:$src1,
+ (bitconvert (mem_frag addr:$src2)))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
+ }
}
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
@@ -8505,21 +8467,23 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT, X86FoldableSchedWrite Sched> {
- def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
- Sched<[Sched]>, VEX, VEX_L;
- def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins i256mem:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>,
- Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
+ let Predicates = [HasAVX2, NoVLX] in {
+ def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ Sched<[Sched]>, VEX, VEX_L;
+ def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OpVT (X86VPermi (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
+ }
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
@@ -8531,6 +8495,7 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
//
+let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -8631,7 +8596,7 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
Sched<[WriteStore]>, VEX, VEX_L;
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTI128rr
(v4i64 VR256:$src1),
@@ -8703,116 +8668,42 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskstore_q,
int_x86_avx2_maskstore_q_256>, VEX_W;
-def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
- (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
- (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
- (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
- (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
- (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
- (bc_v8f32 (v8i32 immAllZerosV)))),
- (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
- (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
- VR256:$mask)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
- (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
- (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
- (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
- VR256:$mask)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
- (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
- (bc_v4f32 (v4i32 immAllZerosV)))),
- (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
- (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
- VR128:$mask)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
- (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
- (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
- (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
- VR128:$mask)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
- (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
- (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
- (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
- (v4f64 immAllZerosV))),
- (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
- (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
- VR256:$mask)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
- (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
- (bc_v4i64 (v8i32 immAllZerosV)))),
- (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
- (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
- VR256:$mask)>;
-
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
- (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
- (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
- (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
- (v2f64 immAllZerosV))),
- (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
- (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
- VR128:$mask)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
- (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
- (bc_v2i64 (v4i32 immAllZerosV)))),
- (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
- (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
- VR128:$mask)>;
-
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+ ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+ // masked store
+ def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
+ (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+ // masked load
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ (VT (bitconvert (ZeroVT immAllZerosV))))),
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+ (!cast<Instruction>(BlendStr#"rr")
+ RC:$src0,
+ (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+ RC:$mask)>;
+}
+let Predicates = [HasAVX] in {
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+}
+let Predicates = [HasAVX1Only] in {
+ // Integer (i32/i64) masked load/store are not supported on AVX1; use the ps/pd versions.
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX2] in {
+ defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
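The maskmov_lowering multiclass above compresses the long list of hand-written patterns it replaces; the only non-obvious case is the third one, where a masked load with a non-zero, non-undef pass-through needs an extra blend because the hardware masked load zeroes the unselected lanes. A C sketch of that lowering (illustrative only; the helper name is made up, assumes <immintrin.h> built with -mavx):

#include <immintrin.h>

// masked_load with a pass-through value lowers to maskload + blendv:
// lanes whose mask sign bit is set come from memory, the rest from src0.
static __m256 masked_load_with_passthru(const float *p, __m256i mask, __m256 src0) {
  __m256 loaded = _mm256_maskload_ps(p, mask);                        // VMASKMOVPSYrm
  return _mm256_blendv_ps(src0, loaded, _mm256_castsi256_ps(mask));   // VBLENDVPSYrr
}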
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
@@ -8852,6 +8743,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+ let isCodeGenOnly = 1 in
+ defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
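The new VPSRAVD_Int variant pairs with the IntrinsicsInfo.h change further down (avx2_psrav_d now maps to X86ISD::VSRAV instead of ISD::SRA), presumably because the instruction is defined for per-element counts of 32 or more (the lane becomes all sign bits) while a generic IR ashr with such a count is not. A small C illustration (not part of the patch; the helper name is made up, assumes <immintrin.h> built with -mavx2):

#include <immintrin.h>

// vpsravd handles out-of-range per-element counts (lane 3 here) by filling
// the lane with its sign bit, so the intrinsic cannot simply be mapped to a
// plain arithmetic-shift node.
static __m128i srav_demo(__m128i v) {
  const __m128i counts = _mm_setr_epi32(1, 8, 31, 64);
  return _mm_srav_epi32(v, counts);
}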
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
@@ -8869,22 +8762,22 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
[]>, VEX_4VOp3, VEX_L;
}
-let mayLoad = 1, Constraints
+let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
}
let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
}
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index a97d1e5c86d0..6667bd2aec4a 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -174,11 +174,11 @@ def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
-def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
-def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
-def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
@@ -248,7 +248,7 @@ def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
"str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
"str{q}\t$dst", [], IIC_STR>, TB;
-def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
"str{w}\t$dst", [], IIC_STR>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
@@ -339,9 +339,11 @@ def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+ Requires<[Not64BitMode]>;
def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+ Requires<[Not64BitMode]>;
def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
"lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
@@ -351,9 +353,11 @@ def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
"lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+ Requires<[Not64BitMode]>;
def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+ Requires<[Not64BitMode]>;
def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
"lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
@@ -385,21 +389,21 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
// Descriptor-table support instructions
let SchedRW = [WriteSystem] in {
-def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
"sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
"sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins),
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaque80mem:$dst),
"sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
-def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
"sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
-def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
"sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
-def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins),
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
"sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
"sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
-def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
"sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
@@ -408,7 +412,7 @@ def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
-def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
@@ -450,7 +454,7 @@ def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
"smsw{q}\t$dst", [], IIC_SMSW>, TB;
// For memory operands, there is only a 16-bit form
-def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
"smsw{w}\t$dst", [], IIC_SMSW>, TB;
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
@@ -558,7 +562,7 @@ let usesCustomInserter = 1 in {
[(set GR32:$dst, (int_x86_rdpkru))]>;
}
-let Defs = [EAX, EDX], Uses = [ECX] in
+let Defs = [EAX, EDX], Uses = [ECX] in
def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
let Uses = [EAX, ECX, EDX] in
def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 79afe9a65409..2ea27a934b47 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -41,13 +41,13 @@ def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmptrld\t$vmcs", []>, PS;
-def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
"vmptrst\t$vmcs", []>, TB;
-def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 4cb2304e464d..f49917b80f36 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -222,123 +222,199 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
}
-// Instruction where either second or third source can be memory
-multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
- def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
- XOP_4V, VEX_I8IMM;
- def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2,
- (bitconvert (loadv2i64 addr:$src3))))]>,
- XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
- def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>,
- XOP_4V, VEX_I8IMM;
+multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
+ def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V, VEX_I8IMM;
+ def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+ def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ (vt128 VR128:$src3))))]>,
+ XOP_4V, VEX_I8IMM;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
}
let ExeDomain = SSEPackedInt in {
- defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
- defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+ defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>;
}
-multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
- def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
- XOP_4V, VEX_I8IMM, VEX_L;
- def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int VR256:$src1, VR256:$src2,
- (bitconvert (loadv4i64 addr:$src3))))]>,
- XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
- def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
- VR256:$src3))]>,
- XOP_4V, VEX_I8IMM, VEX_L;
+// Instruction where either second or third source can be memory
+multiclass xop4op_int<bits<8> opc, string OpcodeStr,
+ Intrinsic Int128, Intrinsic Int256> {
+ // 128-bit Instruction
+ def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
+ XOP_4V, VEX_I8IMM;
+ def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2,
+ (bitconvert (loadv2i64 addr:$src3))))]>,
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+ def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+ VR128:$src3))]>,
+ XOP_4V, VEX_I8IMM;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+
+ // 256-bit Instruction
+ def rrrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
+ XOP_4V, VEX_I8IMM, VEX_L;
+ def rrmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2,
+ (bitconvert (loadv4i64 addr:$src3))))]>,
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+ def rmrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
+ VR256:$src3))]>,
+ XOP_4V, VEX_I8IMM, VEX_L;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrrY_REV : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
}
-let ExeDomain = SSEPackedInt in
- defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+let ExeDomain = SSEPackedInt in {
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov",
+ int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+}
let Predicates = [HasXOP] in {
def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
(X86andnp VR128:$src3, VR128:$src2))),
- (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
(X86andnp VR256:$src3, VR256:$src2))),
- (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+ (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
}
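The two Pat entries above recognize the generic bitwise-select idiom and turn it into a single VPCMOV. A C sketch of the idiom being matched (illustrative only; the helper name is made up, assumes <immintrin.h>):

#include <immintrin.h>

// (sel & a) | (~sel & b): bits of 'a' where the selector bit is 1, bits of
// 'b' where it is 0. With XOP enabled this whole expression becomes one
// vpcmov (VPCMOVrrr after the rename above).
static __m128i bitwise_select(__m128i sel, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_and_si128(sel, a), _mm_andnot_si128(sel, b));
}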
-multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
- Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128, ValueType vt256,
+ ValueType id128, ValueType id256,
+ PatFrag ld_128, PatFrag ld_256> {
def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (id128 VR128:$src3), (i8 imm:$src4))))]>;
def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ (id128 (bitconvert (loadv2i64 addr:$src3))),
+ (i8 imm:$src4))))]>,
VEX_W, MemOp4;
def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (ld_128 addr:$src2))),
+ (id128 VR128:$src3), (i8 imm:$src4))))]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ []>, VEX_W, MemOp4;
+
def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+ (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+ (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
- VEX_W, MemOp4, VEX_L;
+ (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+ (id256 (bitconvert (loadv4i64 addr:$src3))),
+ (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
- VEX_L;
+ (vt256 (OpNode (vt256 VR256:$src1),
+ (vt256 (bitconvert (ld_256 addr:$src2))),
+ (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rrY_REV : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+ !strconcat(OpcodeStr,
+ "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+ []>, VEX_W, MemOp4, VEX_L;
}
let ExeDomain = SSEPackedDouble in
- defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
- int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
+ defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
+ v2i64, v4i64, loadv2f64, loadv4f64>;
let ExeDomain = SSEPackedSingle in
- defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
- int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
+ defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
+ v4i32, v8i32, loadv4f32, loadv8f32>;
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index b525d5eb60a7..b647d11e3866 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -14,31 +14,36 @@
#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+
namespace llvm {
-enum IntrinsicType {
+enum IntrinsicType : uint16_t {
INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
- CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM,
+ CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
- FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
- VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+ FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
+ FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
+ VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
- COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
+ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+ EXPAND_FROM_MEM, INSERT_SUBVEC,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
};
struct IntrinsicData {
- unsigned Id;
+ uint16_t Id;
IntrinsicType Type;
- unsigned Opc0;
- unsigned Opc1;
+ uint16_t Opc0;
+ uint16_t Opc1;
bool operator<(const IntrinsicData &RHS) const {
return Id < RHS.Id;
@@ -61,6 +66,14 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
@@ -77,14 +90,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
@@ -143,18 +148,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
- X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
@@ -223,7 +216,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
-
X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
@@ -242,7 +234,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
/*
* Find Intrinsic data by intrinsic ID
*/
-static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
+static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 };
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain),
@@ -258,49 +250,51 @@ static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
+ X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
+ X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
- X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
- X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -315,8 +309,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
@@ -327,6 +321,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
@@ -380,50 +376,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
- X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
- X86ISD::FMADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
- X86ISD::FMADDSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
- X86ISD::FMSUB_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
- X86ISD::FMSUBADD_RND),
-
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
- X86ISD::FNMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
@@ -449,38 +401,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK,
- X86ISD::SUBV_BROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK,
- X86ISD::SUBV_BROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
@@ -493,12 +417,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK,
- X86ISD::SUBV_BROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK,
- X86ISD::SUBV_BROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK,
- X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
+ X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
@@ -773,6 +697,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
@@ -873,28 +805,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FMIN, X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FMIN, X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
- X86ISD::MOVDDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
- X86ISD::MOVDDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
- X86ISD::MOVDDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
X86ISD::MOVSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
X86ISD::MOVSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSHDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSHDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSHDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSLDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSLDUP, 0),
- X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK,
- X86ISD::MOVSLDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
@@ -961,54 +875,64 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::PALIGNR, 0),
- X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::PALIGNR, 0),
- X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::PALIGNR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_q_128, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_q_256, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pand_q_512, INTR_TYPE_2OP_MASK, ISD::AND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_d_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_d_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_d_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
- X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
+ X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
@@ -1273,36 +1197,36 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
+ X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK,
X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, INTR_TYPE_2OP_MASK,
X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK,
X86ISD::PMULUDQ, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_d_128, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_d_256, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_d_512, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
@@ -1315,44 +1239,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK,
X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_d_128, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_d_256, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_d_512, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufh_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFHW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufh_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFHW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufh_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFHW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufl_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFLW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufl_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFLW, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshufl_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFLW, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
@@ -1360,57 +1266,53 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
@@ -1418,8 +1320,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
@@ -1456,60 +1356,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
@@ -1549,9 +1395,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
X86ISD::SCALEF, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEF, 0),
+ X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEF, 0),
+ X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
@@ -1616,30 +1462,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKH, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
- X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK,
- X86ISD::UNPCKL, 0),
X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::VALIGN, 0),
X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK,
@@ -1673,6 +1495,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1730,18 +1554,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK,
- X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK,
X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK,
@@ -1784,12 +1602,92 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
+ X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86ISD::FMADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
+ X86ISD::FMSUB_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+ X86ISD::FMSUBADD_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+ X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
+ X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
+ X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
@@ -1811,6 +1709,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1850,41 +1750,57 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
+ X86ISD::VPERMV3, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
+ X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
+ X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
- X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0),
- X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
+ X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
@@ -1895,29 +1811,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
- X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
- X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
- X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
- X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
- X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP,
+ X86ISD::SCALAR_FP_TO_UINT_RND, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
@@ -1942,6 +1859,24 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
+ X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
+ X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
+ X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
+ X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
+ X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
+ X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1950,22 +1885,21 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+ X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
- X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
- X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
- X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1982,6 +1916,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+ X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
@@ -1997,48 +1933,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
- X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
- X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
- X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
- X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
- X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
- X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
- X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
- X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
- X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
- X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
- X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
- X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
- X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
- X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
@@ -2047,6 +1952,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0),
@@ -2069,7 +1979,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
* Retrieve data for Intrinsic without chain.
* Return nullptr if intrinsic is not defined in the table.
*/
-static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
+static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
std::end(IntrinsicsWithoutChain),
@@ -2093,96 +2003,6 @@ static void verifyIntrinsicTables() {
std::end(IntrinsicsWithChain)) &&
"Intrinsic data tables should have unique entries");
}
-
-// X86 specific compare constants.
-// They must be kept in synch with avxintrin.h
-#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
-#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
-#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
-#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
-#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
-#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
-#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
-#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
-#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
-#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
-#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
-#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
-#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
-#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
-#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
-#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
-#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
-#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
-#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
-#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */
-#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
-#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
-#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
-#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */
-#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
-#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
-#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
-#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
-#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
-#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
-#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
-#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */
-
-/*
-* Get comparison modifier from _mm_comi_round_sd/ss intrinsic
-* Return tuple <isOrdered, X86 condcode>
-*/
-static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) {
- ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm);
- unsigned IntImm = CImm->getZExtValue();
- // On a floating point condition, the flags are set as follows:
- // ZF PF CF op
- // 0 | 0 | 0 | X > Y
- // 0 | 0 | 1 | X < Y
- // 1 | 0 | 0 | X == Y
- // 1 | 1 | 1 | unordered
- switch (IntImm) {
- default: llvm_unreachable("Invalid floating point compare value for Comi!");
- case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling)
- case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling)
- return std::make_tuple(true, X86::COND_E);
- case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling)
- case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling)
- return std::make_tuple(false , X86::COND_E);
- case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling)
- case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling)
- return std::make_tuple(true, X86::COND_B);
- case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling)
- case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling)
- return std::make_tuple(false , X86::COND_B);
- case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling)
- case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling)
- return std::make_tuple(true, X86::COND_BE);
- case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling)
- case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling)
- return std::make_tuple(false, X86::COND_BE);
- case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling)
- case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling)
- return std::make_tuple(true, X86::COND_A);
- case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling)
- case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling)
- return std::make_tuple(false, X86::COND_A);
- case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling)
- case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling)
- return std::make_tuple(true, X86::COND_AE);
- case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling)
- case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling)
- return std::make_tuple(false, X86::COND_AE);
- case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling)
- case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling)
- return std::make_tuple(true, X86::COND_NE);
- case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling)
- case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling)
- return std::make_tuple(false, X86::COND_NE);
- }
-}
-
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index e1ca558f0f2c..906e3427b2ff 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -20,6 +20,7 @@
#include "Utils/X86ShuffleDecode.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -35,9 +36,15 @@
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
using namespace llvm;
namespace {
@@ -72,47 +79,33 @@ private:
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI);
-namespace llvm {
- X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM)
- : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {}
-
- X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {}
-
- void
- X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) {
- MF = &F;
- CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
- *MF->getSubtarget().getInstrInfo(),
- *MF->getSubtarget().getRegisterInfo(), MF->getContext()));
- }
-
- void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
- const MCSubtargetInfo &STI) {
- if (InShadow) {
- SmallString<256> Code;
- SmallVector<MCFixup, 4> Fixups;
- raw_svector_ostream VecOS(Code);
- CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
- CurrentShadowSize += Code.size();
- if (CurrentShadowSize >= RequiredShadowSize)
- InShadow = false; // The shadow is big enough. Stop counting.
- }
+void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCCodeEmitter *CodeEmitter) {
+ if (InShadow) {
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+ CurrentShadowSize += Code.size();
+ if (CurrentShadowSize >= RequiredShadowSize)
+ InShadow = false; // The shadow is big enough. Stop counting.
}
+}
- void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
+void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
- if (InShadow && CurrentShadowSize < RequiredShadowSize) {
- InShadow = false;
- EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
- MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
- }
+ if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+ InShadow = false;
+ EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+ MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
}
+}
- void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
- SMShadowTracker.count(Inst, getSubtargetInfo());
- }
-} // end llvm namespace
+void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
+}
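The shadow bookkeeping above is easier to follow in isolation: instructions emitted while InShadow is set grow CurrentShadowSize, and any shortfall against RequiredShadowSize is later filled with nops. Below is a minimal standalone sketch of just that accounting; the entry-point names (reset, paddingNeeded) are invented for the sketch and do not match the real class.

#include <cstdio>

// Standalone sketch of the stackmap-shadow accounting; not the LLVM class.
struct ShadowTracker {
  bool InShadow = false;
  unsigned RequiredShadowSize = 0;
  unsigned CurrentShadowSize = 0;

  // A stackmap requests a shadow of `Required` bytes after it.
  void reset(unsigned Required) {
    InShadow = true;
    RequiredShadowSize = Required;
    CurrentShadowSize = 0;
  }

  // Every subsequently emitted instruction counts toward the shadow.
  void count(unsigned InstSizeInBytes) {
    if (!InShadow)
      return;
    CurrentShadowSize += InstSizeInBytes;
    if (CurrentShadowSize >= RequiredShadowSize)
      InShadow = false; // The shadow is big enough. Stop counting.
  }

  // Bytes of nop padding still needed to honor the shadow.
  unsigned paddingNeeded() const {
    return InShadow ? RequiredShadowSize - CurrentShadowSize : 0;
  }
};

int main() {
  ShadowTracker T;
  T.reset(8); // stackmap with an 8-byte shadow requirement
  T.count(5); // a 5-byte instruction follows it
  std::printf("nop bytes needed: %u\n", T.paddingNeeded()); // prints 3
}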
X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
X86AsmPrinter &asmprinter)
@@ -140,12 +133,8 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
// Handle dllimport linkage.
Name += "__imp_";
break;
- case X86II::MO_DARWIN_STUB:
- Suffix = "$stub";
- break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
Suffix = "$non_lazy_ptr";
break;
}
@@ -153,8 +142,6 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
if (!Suffix.empty())
Name += DL.getPrivateGlobalPrefix();
- unsigned PrefixLen = Name.size();
-
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
AsmPrinter.getNameWithPrefix(Name, GV);
@@ -164,14 +151,11 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
assert(Suffix.empty());
Sym = MO.getMBB()->getSymbol();
}
- unsigned OrigLen = Name.size() - PrefixLen;
Name += Suffix;
if (!Sym)
Sym = Ctx.getOrCreateSymbol(Name);
- StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
-
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
@@ -189,36 +173,6 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
}
break;
}
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
- MachineModuleInfoImpl::StubValueTy &StubSym =
- getMachOMMI().getHiddenGVStubEntry(Sym);
- if (!StubSym.getPointer()) {
- assert(MO.isGlobal() && "Extern symbol not handled yet");
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
- }
- break;
- }
- case X86II::MO_DARWIN_STUB: {
- MachineModuleInfoImpl::StubValueTy &StubSym =
- getMachOMMI().getFnStubEntry(Sym);
- if (StubSym.getPointer())
- return Sym;
-
- if (MO.isGlobal()) {
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
- } else {
- StubSym =
- MachineModuleInfoImpl::
- StubValueTy(Ctx.getOrCreateSymbol(OrigName), false);
- }
- break;
- }
}
return Sym;
@@ -237,7 +191,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
// These affect the name of the symbol, not any suffix.
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
- case X86II::MO_DARWIN_STUB:
break;
case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
@@ -265,14 +218,13 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
- case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
// Subtract the pic base.
Expr = MCBinaryExpr::createSub(Expr,
MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
Ctx);
if (MO.isJTI()) {
- assert(MAI.doesSetDirectiveSuppressesReloc());
+ assert(MAI.doesSetDirectiveSuppressReloc());
// If .set directive is supported, use it to reduce the number of
// relocations the assembler will generate for differences between
// local labels. This is only safe when the symbols are in the same
@@ -653,50 +605,81 @@ ReSimplify:
// MOV64ao8, MOV64o8a
// XCHG16ar, XCHG32ar, XCHG64ar
case X86::MOV8mr_NOREX:
- case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break;
+ case X86::MOV8mr:
case X86::MOV8rm_NOREX:
- case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break;
- case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break;
- case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break;
- case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
- case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
-
- case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break;
- case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break;
- case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break;
- case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break;
- case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break;
- case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break;
- case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break;
- case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break;
- case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break;
- case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break;
- case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break;
- case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break;
- case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break;
- case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break;
- case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break;
- case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break;
- case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break;
- case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break;
- case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break;
- case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break;
- case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break;
- case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break;
- case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break;
- case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break;
- case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break;
- case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break;
- case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break;
- case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break;
- case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break;
- case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
- case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
- case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
- case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break;
- case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break;
- case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
- case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+ case X86::MOV8rm:
+ case X86::MOV16mr:
+ case X86::MOV16rm:
+ case X86::MOV32mr:
+ case X86::MOV32rm: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
+ case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
+ case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
+ case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
+ case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
+ }
+ SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
+ break;
+ }
+
+ case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
+ case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
+ case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
+ case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
+ case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
+ case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
+ case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
+ case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
+ case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
+ case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
+ case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
+ case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
+ case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
+ case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
+ case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
+ case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
+ case X86::AND8ri: NewOpc = X86::AND8i8; break;
+ case X86::AND16ri: NewOpc = X86::AND16i16; break;
+ case X86::AND32ri: NewOpc = X86::AND32i32; break;
+ case X86::AND64ri32: NewOpc = X86::AND64i32; break;
+ case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
+ case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
+ case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
+ case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
+ case X86::OR8ri: NewOpc = X86::OR8i8; break;
+ case X86::OR16ri: NewOpc = X86::OR16i16; break;
+ case X86::OR32ri: NewOpc = X86::OR32i32; break;
+ case X86::OR64ri32: NewOpc = X86::OR64i32; break;
+ case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
+ case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
+ case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
+ case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
+ case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
+ case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
+ case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
+ case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
+ case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
+ case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
+ case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
+ case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
+ case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
+ case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
+ case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
+ case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
+ }
+ SimplifyShortImmForm(OutMI, NewOpc);
+ break;
+ }
// Try to shrink some forms of movsx.
case X86::MOVSX16rr8:
@@ -785,55 +768,77 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
.addExpr(tlsRef));
}
-/// \brief Emit the optimal amount of multi-byte nops on X86.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) {
+/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// bytes. Return the size of the nop emitted.
+static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+ const MCSubtargetInfo &STI) {
// This works only for 64bit. For 32bit we have to do additional checking if
// the CPU supports multi-byte nops.
assert(Is64Bit && "EmitNops only supports X86-64");
- while (NumBytes) {
- unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
- Opc = IndexReg = Displacement = SegmentReg = 0;
- BaseReg = X86::RAX; ScaleVal = 1;
- switch (NumBytes) {
- case 0: llvm_unreachable("Zero nops?"); break;
- case 1: NumBytes -= 1; Opc = X86::NOOP; break;
- case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break;
- case 3: NumBytes -= 3; Opc = X86::NOOPL; break;
- case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break;
- case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8;
- IndexReg = X86::RAX; break;
- case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break;
- case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512;
- IndexReg = X86::RAX; break;
- case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; break;
- default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512;
- IndexReg = X86::RAX; SegmentReg = X86::CS; break;
- }
- unsigned NumPrefixes = std::min(NumBytes, 5U);
- NumBytes -= NumPrefixes;
- for (unsigned i = 0; i != NumPrefixes; ++i)
- OS.EmitBytes("\x66");
+ unsigned NopSize;
+ unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+ Opc = IndexReg = Displacement = SegmentReg = 0;
+ BaseReg = X86::RAX;
+ ScaleVal = 1;
+ switch (NumBytes) {
+ case 0: llvm_unreachable("Zero nops?"); break;
+ case 1: NopSize = 1; Opc = X86::NOOP; break;
+ case 2: NopSize = 2; Opc = X86::XCHG16ar; break;
+ case 3: NopSize = 3; Opc = X86::NOOPL; break;
+ case 4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
+ case 5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
+ case 8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ case 9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+ }
- switch (Opc) {
- default: llvm_unreachable("Unexpected opcode"); break;
- case X86::NOOP:
- OS.EmitInstruction(MCInstBuilder(Opc), STI);
- break;
- case X86::XCHG16ar:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
- break;
- case X86::NOOPL:
- case X86::NOOPW:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg)
- .addImm(ScaleVal).addReg(IndexReg)
- .addImm(Displacement).addReg(SegmentReg), STI);
- break;
- }
- } // while (NumBytes)
+ unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
+ NopSize += NumPrefixes;
+ for (unsigned i = 0; i != NumPrefixes; ++i)
+ OS.EmitBytes("\x66");
+
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ break;
+ case X86::NOOP:
+ OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ break;
+ case X86::XCHG16ar:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ break;
+ case X86::NOOPL:
+ case X86::NOOPW:
+ OS.EmitInstruction(MCInstBuilder(Opc)
+ .addReg(BaseReg)
+ .addImm(ScaleVal)
+ .addReg(IndexReg)
+ .addImm(Displacement)
+ .addReg(SegmentReg),
+ STI);
+ break;
+ }
+ assert(NopSize <= NumBytes && "We overemitted?");
+ return NopSize;
+}
+
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+ const MCSubtargetInfo &STI) {
+ unsigned NopsToEmit = NumBytes;
+ (void)NopsToEmit;
+ while (NumBytes) {
+ NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
+ assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
+ }
}
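The greedy decomposition performed by EmitNops is worth seeing with concrete numbers: a single EmitNop covers at most a 10-byte base encoding plus up to five 0x66 prefixes (15 bytes), and the loop repeats until the request is exhausted. The standalone sketch below mirrors only that size bookkeeping, not the MC-layer emission; emitNopSize is an invented helper.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <initializer_list>

// Mirrors only the size bookkeeping of EmitNop: the largest base encoding is
// 10 bytes, and up to five 0x66 prefixes can be prepended, for 15 bytes max.
static unsigned emitNopSize(unsigned NumBytes) {
  unsigned NopSize = std::min(NumBytes, 10u);
  unsigned NumPrefixes = std::min(NumBytes - NopSize, 5u);
  return NopSize + NumPrefixes;
}

int main() {
  for (unsigned NumBytes : {1u, 9u, 11u, 17u, 32u}) {
    unsigned Remaining = NumBytes, Emitted = 0;
    while (Remaining) {
      unsigned N = emitNopSize(Remaining);
      assert(N && N <= Remaining && "overemitted");
      Remaining -= N;
      ++Emitted;
    }
    std::printf("%2u bytes -> %u nop instruction(s)\n", NumBytes, Emitted);
  }
}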
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
@@ -891,10 +896,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
- // FAULTING_LOAD_OP <def>, <handler label>, <load opcode>, <load operands>
+ // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
unsigned LoadDefRegister = MI.getOperand(0).getReg();
- MCSymbol *HandlerLabel = MI.getOperand(1).getMCSymbol();
+ MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
unsigned LoadOpcode = MI.getOperand(2).getImm();
unsigned LoadOperandsBeginIdx = 3;
@@ -915,6 +920,43 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
}
+void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // PATCHABLE_OP minsize, opcode, operands
+
+ unsigned MinSize = MI.getOperand(0).getImm();
+ unsigned Opcode = MI.getOperand(1).getImm();
+
+ MCInst MCI;
+ MCI.setOpcode(Opcode);
+ for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ MCI.addOperand(MaybeOperand.getValue());
+
+ SmallString<256> Code;
+ SmallVector<MCFixup, 4> Fixups;
+ raw_svector_ostream VecOS(Code);
+ CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
+
+ if (Code.size() < MinSize) {
+ if (MinSize == 2 && Opcode == X86::PUSH64r) {
+ // This is an optimization that lets us get away without emitting a nop in
+ // many cases.
+ //
+ // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
+ // bytes too, so the check on MinSize is important.
+ MCI.setOpcode(X86::PUSH64rmr);
+ } else {
+ unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
+ getSubtargetInfo());
+ assert(NopSize == MinSize && "Could not implement MinSize!");
+ (void) NopSize;
+ }
+ }
+
+ OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
+}
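The MinSize handling above boils down to a three-way choice: keep the natural encoding if it is already wide enough to be patched over, otherwise prefer a longer encoding of the same operation (the PUSH64r to PUSH64rmr swap), and only then emit a MinSize-byte nop before the instruction. A hedged standalone sketch of that decision; the enum and helper are invented names, and "HasLongerForm" generalizes the single PUSH64r special case.

#include <cstdio>

enum class Action { KeepAsIs, UseLongerEncoding, PadWithNop };

// Sketch of the PATCHABLE_OP decision only; sizes come from the code emitter.
static Action choose(unsigned EncodedSize, unsigned MinSize, bool HasLongerForm) {
  if (EncodedSize >= MinSize)
    return Action::KeepAsIs;          // already wide enough to patch over
  if (HasLongerForm)
    return Action::UseLongerEncoding; // e.g. PUSH64r -> PUSH64rmr above
  return Action::PadWithNop;          // MinSize-byte nop, then the instruction
}

int main() {
  std::printf("%d\n", (int)choose(1, 2, true));  // 1: use the longer encoding
  std::printf("%d\n", (int)choose(2, 2, true));  // 0: e.g. PUSH64r %R9 is 2 bytes
  std::printf("%d\n", (int)choose(1, 2, false)); // 2: pad with a 2-byte nop
}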
+
// Lower a stackmap of the form:
// <id>, <shadowBytes>, ...
void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
@@ -982,14 +1024,107 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
getSubtargetInfo());
}
+void X86AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
+ SledKind Kind) {
+ auto Fn = MI.getParent()->getParent()->getFunction();
+ auto Attr = Fn->getFnAttribute("function-instrument");
+ bool AlwaysInstrument =
+ Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+ Sleds.emplace_back(
+ XRayFunctionEntry{Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn});
+}
+
+void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // We want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // .palign 2, ...
+ // jmp .tmpN
+ // # 9 bytes worth of noops
+ // .tmpN
+ //
+ // We need the 9 bytes because at runtime, we'd be patching over the full 11
+ // bytes with the following pattern:
+ //
+ // mov %r10, <function id, 32-bit> // 6 bytes
+ // call <relative offset, 32-bits> // 5 bytes
+ //
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitLabel(CurSled);
+ OutStreamer->EmitCodeAlignment(4);
+ auto Target = OutContext.createTempSymbol();
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+ // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBytes("\xeb\x09");
+ EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+ OutStreamer->EmitLabel(Target);
+ recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
+}
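The sled arithmetic in the comment above can be checked mechanically: the 2-byte short jmp plus 9 bytes of nops must exactly cover the 6-byte mov-immediate and the 5-byte near call that the runtime patches in. A trivial standalone check under those assumed instruction sizes:

// Assumed sizes: jmp rel8 = 2, nop padding = 9, mov r10 imm32 = 6, call rel32 = 5.
static_assert(2 + 9 == 6 + 5, "entry sled exactly covers the patched sequence");

int main() { return 0; }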
+
+void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ // Since PATCHABLE_RET takes the opcode of the return statement as an
+ // argument, we use that to emit the correct form of the RET that we want.
+ // i.e. when we see this:
+ //
+ // PATCHABLE_RET X86::RET ...
+ //
+ // We should emit the RET followed by sleds.
+ //
+ // .Lxray_sled_N:
+ // ret # or equivalent instruction
+ // # 10 bytes worth of noops
+ //
+ // This just makes sure that the alignment for the next instruction is 2.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitLabel(CurSled);
+ unsigned OpCode = MI.getOperand(0).getImm();
+ MCInst Ret;
+ Ret.setOpcode(OpCode);
+ for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ Ret.addOperand(MaybeOperand.getValue());
+ OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
+ EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
+ recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
+}
+
+void X86AsmPrinter::EmitXRayTable() {
+ if (Sleds.empty())
+ return;
+ if (Subtarget->isTargetELF()) {
+ auto *Section = OutContext.getELFSection(
+ "xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_GROUP | ELF::SHF_MERGE, 0,
+ CurrentFnSym->getName());
+ auto PrevSection = OutStreamer->getCurrentSectionOnly();
+ OutStreamer->SwitchSection(Section);
+ for (const auto &Sled : Sleds) {
+ OutStreamer->EmitSymbolValue(Sled.Sled, 8);
+ OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
+ auto Kind = static_cast<uint8_t>(Sled.Kind);
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Kind), 1));
+ OutStreamer->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+ OutStreamer->EmitZeros(14);
+ }
+ OutStreamer->SwitchSection(PrevSection);
+ }
+ Sleds.clear();
+}
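Each iteration of the loop above writes 8 + 8 + 1 + 1 + 14 = 32 bytes into the xray_instr_map section. A standalone sketch of the implied record shape; the field names are invented for illustration, since the emitter only ever writes raw bytes:

#include <cstdint>

struct XRaySledEntry {
  uint64_t SledAddress;     // EmitSymbolValue(Sled.Sled, 8)
  uint64_t FunctionAddress; // EmitSymbolValue(CurrentFnSym, 8)
  uint8_t Kind;             // 1 byte
  uint8_t AlwaysInstrument; // 1 byte
  uint8_t Padding[14];      // EmitZeros(14)
};

static_assert(sizeof(XRaySledEntry) == 32, "8 + 8 + 1 + 1 + 14 bytes per sled");

int main() { return 0; }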
+
// Returns instruction preceding MBBI in MachineFunction.
// If MBBI is the first instruction of the first basic block, returns null.
static MachineBasicBlock::const_iterator
PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
const MachineBasicBlock *MBB = MBBI->getParent();
while (MBBI == MBB->begin()) {
- if (MBB == MBB->getParent()->begin())
- return nullptr;
+ if (MBB == &MBB->getParent()->front())
+ return MachineBasicBlock::const_iterator();
MBB = MBB->getPrevNode();
MBBI = MBB->end();
}
@@ -1018,7 +1153,8 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
}
static std::string getShuffleComment(const MachineOperand &DstOp,
- const MachineOperand &SrcOp,
+ const MachineOperand &SrcOp1,
+ const MachineOperand &SrcOp2,
ArrayRef<int> Mask) {
std::string Comment;
@@ -1031,40 +1167,51 @@ static std::string getShuffleComment(const MachineOperand &DstOp,
return X86ATTInstPrinter::getRegisterName(RegNum);
};
+ // TODO: Add support for specifying an AVX512 style mask register in the comment.
StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
- StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem";
+ StringRef Src1Name =
+ SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
+ StringRef Src2Name =
+ SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
+
+ // If both sources are the same (a one-source shuffle), adjust the mask so all elements print against a single source.
+ SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
+ if (Src1Name == Src2Name)
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
+ if (ShuffleMask[i] >= e)
+ ShuffleMask[i] -= e;
raw_string_ostream CS(Comment);
CS << DstName << " = ";
- bool NeedComma = false;
- bool InSrc = false;
- for (int M : Mask) {
- // Wrap up any prior entry...
- if (M == SM_SentinelZero && InSrc) {
- InSrc = false;
- CS << "]";
- }
- if (NeedComma)
+ for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
CS << ",";
- else
- NeedComma = true;
-
- // Print this shuffle...
- if (M == SM_SentinelZero) {
+ if (ShuffleMask[i] == SM_SentinelZero) {
CS << "zero";
- } else {
- if (!InSrc) {
- InSrc = true;
- CS << SrcName << "[";
- }
- if (M == SM_SentinelUndef)
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)e;
+ CS << (isSrc1 ? Src1Name : Src2Name) << '[';
+
+ bool IsFirst = true;
+ while (i != e && ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)e) == isSrc1) {
+ if (!IsFirst)
+ CS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
CS << "u";
else
- CS << M;
+ CS << ShuffleMask[i] % (int)e;
+ ++i;
}
+ CS << ']';
+ --i; // For loop increments element #.
}
- if (InSrc)
- CS << "]";
CS.flush();
return Comment;
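To make the span printing concrete, here is a standalone approximation of getShuffleComment; the sentinel constants and register names are stand-ins for the LLVM definitions, and only the string formatting is reproduced:

#include <cstdio>
#include <string>
#include <vector>

static const int kUndef = -1; // stands in for SM_SentinelUndef
static const int kZero = -2;  // stands in for SM_SentinelZero

static std::string shuffleComment(const std::string &Dst,
                                  const std::string &Src1,
                                  const std::string &Src2,
                                  std::vector<int> Mask) {
  int e = (int)Mask.size();
  // Single-source shuffles (both names match) print against one register.
  if (Src1 == Src2)
    for (int &M : Mask)
      if (M >= e)
        M -= e;

  std::string CS = Dst + " = ";
  for (int i = 0; i != e; ++i) {
    if (i != 0)
      CS += ",";
    if (Mask[i] == kZero) {
      CS += "zero";
      continue;
    }
    // Print a span of consecutive elements taken from the same source.
    bool isSrc1 = Mask[i] < e;
    CS += (isSrc1 ? Src1 : Src2) + "[";
    bool First = true;
    while (i != e && Mask[i] != kZero && (Mask[i] < e) == isSrc1) {
      if (!First)
        CS += ",";
      First = false;
      CS += (Mask[i] == kUndef) ? std::string("u") : std::to_string(Mask[i] % e);
      ++i;
    }
    CS += "]";
    --i; // the for loop increments the element index
  }
  return CS;
}

int main() {
  // Lanes 0-1 from xmm1, lane 2 zeroed, lane 3 from the second source.
  std::printf("%s\n",
              shuffleComment("xmm0", "xmm1", "xmm2", {0, 1, kZero, 7}).c_str());
  // Prints: xmm0 = xmm1[0,1],zero,xmm2[3]
}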
@@ -1202,12 +1349,21 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::FAULTING_LOAD_OP:
return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+ case TargetOpcode::PATCHABLE_OP:
+ return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*MI);
case TargetOpcode::PATCHPOINT:
return LowerPATCHPOINT(*MI, MCInstLowering);
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_RET:
+ return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
@@ -1254,7 +1410,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::SEH_Epilogue: {
MachineBasicBlock::const_iterator MBBI(MI);
// Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) {
+ for (MBBI = PrevCrossBBInst(MBBI);
+ MBBI != MachineBasicBlock::const_iterator();
+ MBBI = PrevCrossBBInst(MBBI)) {
// Conservatively assume that pseudo instructions don't emit code and keep
// looking for a call. We may emit an unnecessary nop in some cases.
if (!MBBI->isPseudo()) {
@@ -1313,14 +1471,38 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodePSHUFBMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
}
break;
}
- case X86::VPERMILPSrm:
+
case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZrm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() > 5 &&
+ "We should always have at least 5 operands!");
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ const MachineOperand &MaskOp = MI->getOperand(5);
+
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 8> Mask;
+ DecodeVPERMILPMask(C, 64, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMILPSrm:
case X86::VPERMILPSYrm:
- case X86::VPERMILPDYrm: {
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() > 5 &&
@@ -1329,18 +1511,63 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &SrcOp = MI->getOperand(1);
const MachineOperand &MaskOp = MI->getOperand(5);
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPERMILPMask(C, 32, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
+ }
+ break;
+ }
+
+ case X86::VPERMIL2PDrm:
+ case X86::VPERMIL2PSrm:
+ case X86::VPERMIL2PDrmY:
+ case X86::VPERMIL2PSrmY: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() > 7 &&
+ "We should always have at least 7 operands!");
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp1 = MI->getOperand(1);
+ const MachineOperand &SrcOp2 = MI->getOperand(2);
+ const MachineOperand &MaskOp = MI->getOperand(6);
+ const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
+
+ if (!CtrlOp.isImm())
+ break;
+
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break;
- case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break;
+ case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
+ case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+ }
+
+ if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+ SmallVector<int, 16> Mask;
+ DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+ if (!Mask.empty())
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask));
}
+ break;
+ }
+
+ case X86::VPPERMrrm: {
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ assert(MI->getNumOperands() > 6 &&
+ "We should always have at least 6 operands!");
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp1 = MI->getOperand(1);
+ const MachineOperand &SrcOp2 = MI->getOperand(2);
+ const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
- DecodeVPERMILPMask(C, ElSize, Mask);
+ DecodeVPPERMMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+ OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask));
}
break;
}
@@ -1413,7 +1640,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
CS << CI->getZExtValue();
} else {
// print multi-word constant as (w0,w1)
- auto Val = CI->getValue();
+ const auto &Val = CI->getValue();
CS << "(";
for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
if (i > 0)
@@ -1446,7 +1673,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// is at the end of the shadow.
if (MI->isCall()) {
 // Count the size of the call towards the shadow
- SMShadowTracker.count(TmpInst, getSubtargetInfo());
+ SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
// Then flush the shadow so that we fill with nops before the call, not
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 00515dde5568..d517d82537a7 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineValueType.h"
-#include <vector>
namespace llvm {
@@ -96,6 +95,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// copies.
bool IsSplitCSR = false;
+ /// True if this function uses the red zone.
+ bool UsesRedZone = false;
+
+ /// True if this function has WIN_ALLOCA instructions.
+ bool HasWinAlloca = false;
+
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
/// that must be forwarded to every musttail call.
@@ -167,6 +172,12 @@ public:
bool isSplitCSR() const { return IsSplitCSR; }
void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+ bool getUsesRedZone() const { return UsesRedZone; }
+ void setUsesRedZone(bool V) { UsesRedZone = V; }
+
+ bool hasWinAlloca() const { return HasWinAlloca; }
+ void setHasWinAlloca(bool v) { HasWinAlloca = v; }
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 45cc0aef1d93..4da0fddda395 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// This file defines the pass that performs some optimizations with LEA
-// instructions in order to improve code size.
+// instructions in order to improve performance and code size.
// Currently, it does two things:
// 1) If there are two LEA instructions calculating addresses which only differ
// by displacement inside a basic block, one of them is removed.
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
@@ -35,13 +36,186 @@ using namespace llvm;
#define DEBUG_TYPE "x86-optimize-LEAs"
-static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden,
- cl::desc("X86: Enable LEA optimizations."),
- cl::init(false));
+static cl::opt<bool>
+ DisableX86LEAOpt("disable-x86-lea-opt", cl::Hidden,
+ cl::desc("X86: Disable LEA optimizations."),
+ cl::init(false));
STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
+class MemOpKey;
+
+/// \brief Returns a hash table key based on memory operands of \p MI. The
+/// number of the first memory operand of \p MI is specified through \p N.
+static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N);
+
+/// \brief Returns true if two machine operands are identical and they are not
+/// physical registers.
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+/// \brief Returns true if two address displacement operands are of the same
+/// type and use the same symbol/index/address regardless of the offset.
+static bool isSimilarDispOp(const MachineOperand &MO1,
+ const MachineOperand &MO2);
+
+/// \brief Returns true if the instruction is LEA.
+static inline bool isLEA(const MachineInstr &MI);
+
+/// A key based on instruction's memory operands.
+class MemOpKey {
+public:
+ MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
+ const MachineOperand *Index, const MachineOperand *Segment,
+ const MachineOperand *Disp)
+ : Disp(Disp) {
+ Operands[0] = Base;
+ Operands[1] = Scale;
+ Operands[2] = Index;
+ Operands[3] = Segment;
+ }
+
+ bool operator==(const MemOpKey &Other) const {
+ // Addresses' bases, scales, indices and segments must be identical.
+ for (int i = 0; i < 4; ++i)
+ if (!isIdenticalOp(*Operands[i], *Other.Operands[i]))
+ return false;
+
+ // Addresses' displacements don't have to be exactly the same. It only
+ // matters that they use the same symbol/index/address. Immediates' or
+ // offsets' differences will be taken care of during instruction
+ // substitution.
+ return isSimilarDispOp(*Disp, *Other.Disp);
+ }
+
+ // Address' base, scale, index and segment operands.
+ const MachineOperand *Operands[4];
+
+ // Address' displacement operand.
+ const MachineOperand *Disp;
+};
+
+/// Provide DenseMapInfo for MemOpKey.
+namespace llvm {
+template <> struct DenseMapInfo<MemOpKey> {
+ typedef DenseMapInfo<const MachineOperand *> PtrInfo;
+
+ static inline MemOpKey getEmptyKey() {
+ return MemOpKey(PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+ PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+ PtrInfo::getEmptyKey());
+ }
+
+ static inline MemOpKey getTombstoneKey() {
+ return MemOpKey(PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+ PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+ PtrInfo::getTombstoneKey());
+ }
+
+ static unsigned getHashValue(const MemOpKey &Val) {
+ // Checking any field of MemOpKey is enough to determine if the key is
+ // empty or tombstone.
+ assert(Val.Disp != PtrInfo::getEmptyKey() && "Cannot hash the empty key");
+ assert(Val.Disp != PtrInfo::getTombstoneKey() &&
+ "Cannot hash the tombstone key");
+
+ hash_code Hash = hash_combine(*Val.Operands[0], *Val.Operands[1],
+ *Val.Operands[2], *Val.Operands[3]);
+
+ // If the address displacement is an immediate, it should not affect the
+ // hash so that memory operands which differ only by immediate displacement
+ // would have the same hash. If the address displacement is something else,
+ // we should reflect symbol/index/address in the hash.
+ switch (Val.Disp->getType()) {
+ case MachineOperand::MO_Immediate:
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ Hash = hash_combine(Hash, Val.Disp->getIndex());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ Hash = hash_combine(Hash, Val.Disp->getSymbolName());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ Hash = hash_combine(Hash, Val.Disp->getGlobal());
+ break;
+ case MachineOperand::MO_BlockAddress:
+ Hash = hash_combine(Hash, Val.Disp->getBlockAddress());
+ break;
+ case MachineOperand::MO_MCSymbol:
+ Hash = hash_combine(Hash, Val.Disp->getMCSymbol());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ Hash = hash_combine(Hash, Val.Disp->getMBB());
+ break;
+ default:
+ llvm_unreachable("Invalid address displacement operand");
+ }
+
+ return (unsigned)Hash;
+ }
+
+ static bool isEqual(const MemOpKey &LHS, const MemOpKey &RHS) {
+ // Checking any field of MemOpKey is enough to determine if the key is
+ // empty or tombstone.
+ if (RHS.Disp == PtrInfo::getEmptyKey())
+ return LHS.Disp == PtrInfo::getEmptyKey();
+ if (RHS.Disp == PtrInfo::getTombstoneKey())
+ return LHS.Disp == PtrInfo::getTombstoneKey();
+ return LHS == RHS;
+ }
+};
+}
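Keeping immediate displacements out of both the hash and the equality test means that LEAs, loads and stores which differ only by a constant offset land in the same bucket, and the offset difference is reconciled later during substitution. A simplified standalone analog using std::unordered_map; reducing operands to strings is purely an assumption of the sketch.

#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for MemOpKey: base/index/segment/scale must match
// exactly, while the displacement only has to be "similar".
struct Key {
  std::string Base, Index, Segment;
  int Scale;
  bool DispIsImm;         // any two immediate displacements are similar
  std::string DispSymbol; // only meaningful when !DispIsImm

  bool operator==(const Key &O) const {
    return Base == O.Base && Index == O.Index && Segment == O.Segment &&
           Scale == O.Scale && DispIsImm == O.DispIsImm &&
           (DispIsImm || DispSymbol == O.DispSymbol);
  }
};

struct KeyHash {
  size_t operator()(const Key &K) const {
    size_t H = std::hash<std::string>()(K.Base + "|" + K.Index + "|" + K.Segment) ^
               std::hash<int>()(K.Scale);
    // Like getHashValue above: an immediate displacement does not feed the hash.
    if (!K.DispIsImm)
      H ^= std::hash<std::string>()(K.DispSymbol);
    return H;
  }
};

int main() {
  std::unordered_map<Key, std::vector<int64_t>, KeyHash> Buckets;
  Buckets[{"rdi", "", "", 1, true, ""}].push_back(16);    // lea 16(%rdi)
  Buckets[{"rdi", "", "", 1, true, ""}].push_back(44);    // lea 44(%rdi), same bucket
  Buckets[{"rip", "", "", 1, false, "sym"}].push_back(8); // lea sym+8(%rip)
  std::printf("buckets: %zu, rdi bucket: %zu entries\n", Buckets.size(),
              Buckets[{"rdi", "", "", 1, true, ""}].size());
  // Prints: buckets: 2, rdi bucket: 2 entries
}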
+
+static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
+ assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+ "The instruction must be a LEA, a load or a store");
+ return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg),
+ &MI.getOperand(N + X86::AddrScaleAmt),
+ &MI.getOperand(N + X86::AddrIndexReg),
+ &MI.getOperand(N + X86::AddrSegmentReg),
+ &MI.getOperand(N + X86::AddrDisp));
+}
+
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ return MO1.isIdenticalTo(MO2) &&
+ (!MO1.isReg() ||
+ !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+}
+
+#ifndef NDEBUG
+static bool isValidDispOp(const MachineOperand &MO) {
+ return MO.isImm() || MO.isCPI() || MO.isJTI() || MO.isSymbol() ||
+ MO.isGlobal() || MO.isBlockAddress() || MO.isMCSymbol() || MO.isMBB();
+}
+#endif
+
+static bool isSimilarDispOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ assert(isValidDispOp(MO1) && isValidDispOp(MO2) &&
+ "Address displacement operand is not valid");
+ return (MO1.isImm() && MO2.isImm()) ||
+ (MO1.isCPI() && MO2.isCPI() && MO1.getIndex() == MO2.getIndex()) ||
+ (MO1.isJTI() && MO2.isJTI() && MO1.getIndex() == MO2.getIndex()) ||
+ (MO1.isSymbol() && MO2.isSymbol() &&
+ MO1.getSymbolName() == MO2.getSymbolName()) ||
+ (MO1.isGlobal() && MO2.isGlobal() &&
+ MO1.getGlobal() == MO2.getGlobal()) ||
+ (MO1.isBlockAddress() && MO2.isBlockAddress() &&
+ MO1.getBlockAddress() == MO2.getBlockAddress()) ||
+ (MO1.isMCSymbol() && MO2.isMCSymbol() &&
+ MO1.getMCSymbol() == MO2.getMCSymbol()) ||
+ (MO1.isMBB() && MO2.isMBB() && MO1.getMBB() == MO2.getMBB());
+}
+
+static inline bool isLEA(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
namespace {
class OptimizeLEAPass : public MachineFunctionPass {
public:
@@ -55,51 +229,43 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
private:
+ typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
+
/// \brief Returns a distance between two instructions inside one basic block.
 /// A negative result means that the instructions occur in reverse order.
int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
/// \brief Choose the best \p LEA instruction from the \p List to replace
/// address calculation in \p MI instruction. Return the address displacement
- /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift
- /// and \p Dist.
+ /// and the distance between \p MI and the chosen \p BestLEA in
+ /// \p AddrDispShift and \p Dist.
bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
- const MachineInstr &MI, MachineInstr *&LEA,
+ const MachineInstr &MI, MachineInstr *&BestLEA,
int64_t &AddrDispShift, int &Dist);
- /// \brief Returns true if two machine operand are identical and they are not
- /// physical registers.
- bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2);
-
- /// \brief Returns true if the instruction is LEA.
- bool isLEA(const MachineInstr &MI);
+ /// \brief Returns the difference between addresses' displacements of \p MI1
+ /// and \p MI2. The numbers of the first memory operands for the instructions
+ /// are specified through \p N1 and \p N2.
+ int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2) const;
/// \brief Returns true if the \p Last LEA instruction can be replaced by the
/// \p First. The difference between displacements of the addresses calculated
/// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
/// replacement of the \p Last LEA's uses with the \p First's def register.
bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
- int64_t &AddrDispShift);
-
- /// \brief Returns true if two instructions have memory operands that only
- /// differ by displacement. The numbers of the first memory operands for both
- /// instructions are specified through \p N1 and \p N2. The address
- /// displacement is returned through AddrDispShift.
- bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
- const MachineInstr &MI2, unsigned N2,
- int64_t &AddrDispShift);
+ int64_t &AddrDispShift) const;
/// \brief Find all LEA instructions in the basic block. Also, assign position
/// numbers to all instructions in the basic block to speed up calculation of
/// distance between them.
- void findLEAs(const MachineBasicBlock &MBB,
- SmallVectorImpl<MachineInstr *> &List);
+ void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
/// \brief Removes redundant address calculations.
- bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List);
+ bool removeRedundantAddrCalc(MemOpMap &LEAs);
/// \brief Removes LEAs which calculate similar addresses.
- bool removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List);
+ bool removeRedundantLEAs(MemOpMap &LEAs);
DenseMap<const MachineInstr *, unsigned> InstrPos;
@@ -137,22 +303,20 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
// 4) The LEA should be as close to MI as possible, and prior to it if
// possible.
bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
- const MachineInstr &MI, MachineInstr *&LEA,
+ const MachineInstr &MI,
+ MachineInstr *&BestLEA,
int64_t &AddrDispShift, int &Dist) {
const MachineFunction *MF = MI.getParent()->getParent();
const MCInstrDesc &Desc = MI.getDesc();
- int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) +
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
X86II::getOperandBias(Desc);
- LEA = nullptr;
+ BestLEA = nullptr;
// Loop over all LEA instructions.
for (auto DefMI : List) {
- int64_t AddrDispShiftTemp = 0;
-
- // Compare instructions memory operands.
- if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp))
- continue;
+ // Get new address displacement.
+ int64_t AddrDispShiftTemp = getAddrDispShift(MI, MemOpNo, *DefMI, 1);
// Make sure address displacement fits 4 bytes.
if (!isInt<32>(AddrDispShiftTemp))
@@ -174,14 +338,14 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
int DistTemp = calcInstrDist(*DefMI, MI);
assert(DistTemp != 0 &&
"The distance between two different instructions cannot be zero");
- if (DistTemp > 0 || LEA == nullptr) {
+ if (DistTemp > 0 || BestLEA == nullptr) {
 // Do not update the returned LEA if the current one provides a displacement
 // which fits in 1 byte while the new candidate does not.
- if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+ if (BestLEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
isInt<8>(AddrDispShift))
continue;
- LEA = DefMI;
+ BestLEA = DefMI;
AddrDispShift = AddrDispShiftTemp;
Dist = DistTemp;
}
@@ -191,20 +355,28 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
break;
}
- return LEA != nullptr;
-}
-
-bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1,
- const MachineOperand &MO2) {
- return MO1.isIdenticalTo(MO2) &&
- (!MO1.isReg() ||
- !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ return BestLEA != nullptr;
}
-bool OptimizeLEAPass::isLEA(const MachineInstr &MI) {
- unsigned Opcode = MI.getOpcode();
- return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
- Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+// Get the difference between the addresses' displacements of the two
+// instructions \p MI1 and \p MI2. The numbers of the first memory operands are
+// passed through \p N1 and \p N2.
+int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2,
+ unsigned N2) const {
+ const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
+ const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
+
+ assert(isSimilarDispOp(Op1, Op2) &&
+ "Address displacement operands are not compatible");
+
+ // After the assert above we can be sure that both operands are of the same
+ // valid type and use the same symbol/index/address, so the displacement
+ // shift calculation is straightforward.
+ if (Op1.isJTI())
+ return 0;
+ return Op1.isImm() ? Op1.getImm() - Op2.getImm()
+ : Op1.getOffset() - Op2.getOffset();
}
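A worked example of how the shift computed here is consumed when a later LEA is folded into an earlier one (the numbers are illustrative):

#include <cassert>
#include <cstdint>

// Mirrors the Op1.isImm() path: shift = Last's displacement - First's.
static int64_t addrDispShift(int64_t LastDisp, int64_t FirstDisp) {
  return LastDisp - FirstDisp;
}

int main() {
  // First:  lea 8(%rdi), %rax    (kept)
  // Last:   lea 24(%rdi), %rcx   (removed; shift = 24 - 8 = 16)
  int64_t Shift = addrDispShift(24, 8);
  // A use that loaded from 4(%rcx) is rewritten to load from (4 + 16)(%rax).
  assert(Shift == 16 && 4 + Shift == 20);
  return 0;
}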
// Check that the Last LEA can be replaced by the First LEA. To be so,
@@ -215,13 +387,12 @@ bool OptimizeLEAPass::isLEA(const MachineInstr &MI) {
// register is used only as address base.
bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
const MachineInstr &Last,
- int64_t &AddrDispShift) {
+ int64_t &AddrDispShift) const {
assert(isLEA(First) && isLEA(Last) &&
"The function works only with LEA instructions");
- // Compare instructions' memory operands.
- if (!isSimilarMemOp(Last, 1, First, 1, AddrDispShift))
- return false;
+ // Get new address displacement.
+ AddrDispShift = getAddrDispShift(Last, 1, First, 1);
// Make sure that LEA def registers belong to the same class. There may be
// instructions (like MOV8mr_NOREX) which allow a limited set of registers to
@@ -239,7 +410,7 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
// Get the number of the first memory operand.
const MCInstrDesc &Desc = MI.getDesc();
- int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode());
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
// If the use instruction has no memory operand - the LEA is not
// replaceable.
@@ -270,36 +441,7 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
return true;
}
-// Check if MI1 and MI2 have memory operands which represent addresses that
-// differ only by displacement.
-bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
- const MachineInstr &MI2, unsigned N2,
- int64_t &AddrDispShift) {
- // Address base, scale, index and segment operands must be identical.
- static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt,
- X86::AddrIndexReg, X86::AddrSegmentReg};
- for (auto &N : IdenticalOpNums)
- if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N)))
- return false;
-
- // Address displacement operands may differ by a constant.
- const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp);
- const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp);
- if (!isIdenticalOp(*Op1, *Op2)) {
- if (Op1->isImm() && Op2->isImm())
- AddrDispShift = Op1->getImm() - Op2->getImm();
- else if (Op1->isGlobal() && Op2->isGlobal() &&
- Op1->getGlobal() == Op2->getGlobal())
- AddrDispShift = Op1->getOffset() - Op2->getOffset();
- else
- return false;
- }
-
- return true;
-}
-
-void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
- SmallVectorImpl<MachineInstr *> &List) {
+void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
unsigned Pos = 0;
for (auto &MI : MBB) {
// Assign the position number to the instruction. Note that we are going to
@@ -310,24 +452,22 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
InstrPos[&MI] = Pos += 2;
if (isLEA(MI))
- List.push_back(const_cast<MachineInstr *>(&MI));
+ LEAs[getMemOpKey(MI, 1)].push_back(const_cast<MachineInstr *>(&MI));
}
}
// Try to find load and store instructions which recalculate addresses already
// calculated by some LEA and replace their memory operands with its def
// register.
-bool OptimizeLEAPass::removeRedundantAddrCalc(
- const SmallVectorImpl<MachineInstr *> &List) {
+bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
bool Changed = false;
- assert(List.size() > 0);
- MachineBasicBlock *MBB = List[0]->getParent();
+ assert(!LEAs.empty());
+ MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent();
// Process all instructions in basic block.
for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
MachineInstr &MI = *I++;
- unsigned Opcode = MI.getOpcode();
// Instruction must be load or store.
if (!MI.mayLoadOrStore())
@@ -335,7 +475,7 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(
// Get the number of the first memory operand.
const MCInstrDesc &Desc = MI.getDesc();
- int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode);
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
// If instruction has no memory operand - skip it.
if (MemOpNo < 0)
@@ -347,7 +487,8 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(
MachineInstr *DefMI;
int64_t AddrDispShift;
int Dist;
- if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist))
+ if (!chooseBestLEA(LEAs[getMemOpKey(MI, MemOpNo)], MI, DefMI, AddrDispShift,
+ Dist))
continue;
// If LEA occurs before current instruction, we can freely replace
@@ -362,9 +503,10 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(
InstrPos[DefMI] = InstrPos[&MI] - 1;
// Make sure the instructions' position numbers are sane.
- assert(((InstrPos[DefMI] == 1 && DefMI == MBB->begin()) ||
+ assert(((InstrPos[DefMI] == 1 &&
+ MachineBasicBlock::iterator(DefMI) == MBB->begin()) ||
InstrPos[DefMI] >
- InstrPos[std::prev(MachineBasicBlock::iterator(DefMI))]) &&
+ InstrPos[&*std::prev(MachineBasicBlock::iterator(DefMI))]) &&
"Instruction positioning is broken");
}
@@ -393,75 +535,78 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(
}
// Try to find similar LEAs in the list and replace one with another.
-bool
-OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) {
+bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
bool Changed = false;
- // Loop over all LEA pairs.
- auto I1 = List.begin();
- while (I1 != List.end()) {
- MachineInstr &First = **I1;
- auto I2 = std::next(I1);
- while (I2 != List.end()) {
- MachineInstr &Last = **I2;
- int64_t AddrDispShift;
-
- // LEAs should be in occurence order in the list, so we can freely
- // replace later LEAs with earlier ones.
- assert(calcInstrDist(First, Last) > 0 &&
- "LEAs must be in occurence order in the list");
-
- // Check that the Last LEA instruction can be replaced by the First.
- if (!isReplaceable(First, Last, AddrDispShift)) {
- ++I2;
- continue;
- }
-
- // Loop over all uses of the Last LEA and update their operands. Note that
- // the correctness of this has already been checked in the isReplaceable
- // function.
- for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
- UE = MRI->use_end();
- UI != UE;) {
- MachineOperand &MO = *UI++;
- MachineInstr &MI = *MO.getParent();
-
- // Get the number of the first memory operand.
- const MCInstrDesc &Desc = MI.getDesc();
- int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) +
- X86II::getOperandBias(Desc);
-
- // Update address base.
- MO.setReg(First.getOperand(0).getReg());
-
- // Update address disp.
- MachineOperand *Op = &MI.getOperand(MemOpNo + X86::AddrDisp);
- if (Op->isImm())
- Op->setImm(Op->getImm() + AddrDispShift);
- else if (Op->isGlobal())
- Op->setOffset(Op->getOffset() + AddrDispShift);
- else
- llvm_unreachable("Invalid address displacement operand");
+ // Loop over all entries in the table.
+ for (auto &E : LEAs) {
+ auto &List = E.second;
+
+ // Loop over all LEA pairs.
+ auto I1 = List.begin();
+ while (I1 != List.end()) {
+ MachineInstr &First = **I1;
+ auto I2 = std::next(I1);
+ while (I2 != List.end()) {
+ MachineInstr &Last = **I2;
+ int64_t AddrDispShift;
+
+ // LEAs should be in occurrence order in the list, so we can freely
+ // replace later LEAs with earlier ones.
+ assert(calcInstrDist(First, Last) > 0 &&
+ "LEAs must be in occurrence order in the list");
+
+ // Check that the Last LEA instruction can be replaced by the First.
+ if (!isReplaceable(First, Last, AddrDispShift)) {
+ ++I2;
+ continue;
+ }
+
+ // Loop over all uses of the Last LEA and update their operands. Note
+ // that the correctness of this has already been checked in the
+ // isReplaceable function.
+ for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
+ UE = MRI->use_end();
+ UI != UE;) {
+ MachineOperand &MO = *UI++;
+ MachineInstr &MI = *MO.getParent();
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo =
+ X86II::getMemoryOperandNo(Desc.TSFlags) +
+ X86II::getOperandBias(Desc);
+
+ // Update address base.
+ MO.setReg(First.getOperand(0).getReg());
+
+ // Update address disp.
+ MachineOperand &Op = MI.getOperand(MemOpNo + X86::AddrDisp);
+ if (Op.isImm())
+ Op.setImm(Op.getImm() + AddrDispShift);
+ else if (!Op.isJTI())
+ Op.setOffset(Op.getOffset() + AddrDispShift);
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(First.getOperand(0).getReg());
+
+ ++NumRedundantLEAs;
+ DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
+
+ // By this moment, all of the Last LEA's uses must be replaced. So we
+ // can freely remove it.
+ assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+ "The LEA's def register must have no uses");
+ Last.eraseFromParent();
+
+ // Erase removed LEA from the list.
+ I2 = List.erase(I2);
+
+ Changed = true;
}
-
- // Since we can possibly extend register lifetime, clear kill flags.
- MRI->clearKillFlags(First.getOperand(0).getReg());
-
- ++NumRedundantLEAs;
- DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
-
- // By this moment, all of the Last LEA's uses must be replaced. So we can
- // freely remove it.
- assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
- "The LEA's def register must have no uses");
- Last.eraseFromParent();
-
- // Erase removed LEA from the list.
- I2 = List.erase(I2);
-
- Changed = true;
+ ++I1;
}
- ++I1;
}
return Changed;
@@ -470,8 +615,7 @@ OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) {
bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- // Perform this optimization only if we care about code size.
- if (!EnableX86LEAOpt || !MF.getFunction()->optForSize())
+ if (DisableX86LEAOpt || skipFunction(*MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
@@ -480,7 +624,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
// Process all basic blocks.
for (auto &MBB : MF) {
- SmallVector<MachineInstr *, 16> LEAs;
+ MemOpMap LEAs;
InstrPos.clear();
// Find all LEA instructions in basic block.
@@ -490,13 +634,13 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
if (LEAs.empty())
continue;
- // Remove redundant LEA instructions. The optimization may have a negative
- // effect on performance, so do it only for -Oz.
- if (MF.getFunction()->optForMinSize())
- Changed |= removeRedundantLEAs(LEAs);
+ // Remove redundant LEA instructions.
+ Changed |= removeRedundantLEAs(LEAs);
- // Remove redundant address calculations.
- Changed |= removeRedundantAddrCalc(LEAs);
+ // Remove redundant address calculations. Do it only for -Os/-Oz since only
+ // a code size gain is expected from this part of the pass.
+ if (MF.getFunction()->optForSize())
+ Changed |= removeRedundantAddrCalc(LEAs);
}
return Changed;
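
As a rough illustration of the new bucketing (a sketch, not from this patch): getMemOpKey itself is not shown in this hunk, so the key shape below — everything in the address except the displacement — is an assumption based on how the buckets are consumed above, and the toy LeaAddr struct stands in for real MachineInstr operands.

#include <cstdint>
#include <map>
#include <tuple>
#include <vector>

// Hypothetical LEA address summary; the real pass reads MachineInstr operands.
struct LeaAddr {
  unsigned Base, Scale, Index, Segment;
  int64_t Disp;
};

// Assumed key: the full address minus the displacement, so two LEAs land in
// the same bucket exactly when they can differ only by a displacement shift.
using MemOpKey = std::tuple<unsigned, unsigned, unsigned, unsigned>;
using MemOpMap = std::map<MemOpKey, std::vector<const LeaAddr *>>;

static MemOpKey getMemOpKey(const LeaAddr &A) {
  return {A.Base, A.Scale, A.Index, A.Segment};
}

// Mirrors findLEAs/removeRedundantLEAs: candidates are found with one lookup
// per key instead of comparing every LEA against every other one.
static MemOpMap groupLEAs(const std::vector<LeaAddr> &LEAs) {
  MemOpMap Buckets;
  for (const LeaAddr &A : LEAs)
    Buckets[getMemOpKey(A)].push_back(&A);
  return Buckets;
}

int main() {
  std::vector<LeaAddr> LEAs = {{/*rdi*/ 7, 1, 0, 0, 8}, {7, 1, 0, 0, 24}};
  // Both LEAs share a bucket; only their displacements (8 vs. 24) differ.
  return groupLEAs(LEAs).size() == 1 ? 0 : 1;
}
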
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 0f425e28fa7d..62a9aafc2cf3 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -55,6 +55,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
const char *getPassName() const override {
return "X86 Atom pad short functions";
}
@@ -93,6 +98,9 @@ FunctionPass *llvm::createX86PadShortFunctions() {
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
if (MF.getFunction()->optForSize()) {
return false;
}
@@ -179,13 +187,11 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
unsigned int CyclesToEnd = 0;
- for (MachineBasicBlock::iterator MBBI = MBB->begin();
- MBBI != MBB->end(); ++MBBI) {
- MachineInstr *MI = MBBI;
+ for (MachineInstr &MI : *MBB) {
// Mark basic blocks with a return instruction. Calls to other
// functions do not count because the called function will be padded,
// if necessary.
- if (MI->isReturn() && !MI->isCall()) {
+ if (MI.isReturn() && !MI.isCall()) {
VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
Cycles += CyclesToEnd;
return true;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 274b56688558..86750633aecc 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -52,7 +52,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
X86_MC::getDwarfRegFlavour(TT, false),
X86_MC::getDwarfRegFlavour(TT, true),
(TT.isArch64Bit() ? X86::RIP : X86::EIP)) {
- X86_MC::InitLLVM2SEHRegisterMapping(this);
+ X86_MC::initLLVMToSEHAndCVRegMapping(this);
// Cache some information.
Is64Bit = TT.isArch64Bit();
@@ -162,10 +162,23 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
case 0: // Normal GPRs.
if (Subtarget.isTarget64BitLP64())
return &X86::GR64RegClass;
+ // If the target is 64-bit but we have been told to use 32-bit addresses,
+ // we can still use a 64-bit register as long as we know the high bits
+ // are zero.
+ // Reflect that in the returned register class.
+ if (Is64Bit) {
+ // When the target also allows a 64-bit frame pointer and we do have a
+ // frame, it is fine to use it for the address accesses as well.
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
+ ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
+ : &X86::LOW32_ADDR_ACCESSRegClass;
+ }
return &X86::GR32RegClass;
case 1: // Normal GPRs except the stack pointer (for encoding reasons).
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
+ // NOSP does not contain RIP, so no special case here.
return &X86::GR32_NOSPRegClass;
case 2: // NOREX GPRs.
if (Subtarget.isTarget64BitLP64())
@@ -174,6 +187,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOREX_NOSPRegClass;
+ // NOSP does not contain RIP, so no special case here.
return &X86::GR32_NOREX_NOSPRegClass;
case 4: // Available for tailcall (not callee-saved GPRs).
return getGPRsForTailCall(MF);
@@ -280,15 +294,19 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_SaveList;
case CallingConv::X86_INTR:
if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_SaveList;
if (HasAVX)
return CSR_64_AllRegs_AVX_SaveList;
- else
- return CSR_64_AllRegs_SaveList;
+ return CSR_64_AllRegs_SaveList;
} else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_SaveList;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_SaveList;
if (HasSSE)
return CSR_32_AllRegs_SSE_SaveList;
- else
- return CSR_32_AllRegs_SaveList;
+ return CSR_32_AllRegs_SaveList;
}
default:
break;
@@ -299,6 +317,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_Win64_SaveList;
if (CallsEHReturn)
return CSR_64EHRet_SaveList;
+ if (Subtarget.getTargetLowering()->supportSwiftError() &&
+ MF->getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_64_SwiftError_SaveList;
return CSR_64_SaveList;
}
if (CallsEHReturn)
@@ -366,18 +388,22 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_RegMask;
case CallingConv::X86_INTR:
if (Is64Bit) {
+ if (HasAVX512)
+ return CSR_64_AllRegs_AVX512_RegMask;
if (HasAVX)
return CSR_64_AllRegs_AVX_RegMask;
- else
- return CSR_64_AllRegs_RegMask;
+ return CSR_64_AllRegs_RegMask;
} else {
+ if (HasAVX512)
+ return CSR_32_AllRegs_AVX512_RegMask;
+ if (HasAVX)
+ return CSR_32_AllRegs_AVX_RegMask;
if (HasSSE)
return CSR_32_AllRegs_SSE_RegMask;
- else
- return CSR_32_AllRegs_RegMask;
+ return CSR_32_AllRegs_RegMask;
}
- default:
- break;
+ default:
+ break;
}
// Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
@@ -385,6 +411,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_RegMask;
+ if (Subtarget.getTargetLowering()->supportSwiftError() &&
+ MF.getFunction()->getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_64_SwiftError_RegMask;
return CSR_64_RegMask;
}
return CSR_32_RegMask;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 56f0d9352d30..373f9b4c65f2 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -226,14 +226,14 @@ let SubRegIndices = [sub_ymm] in {
}
// Mask Registers, used by AVX-512 instructions.
-def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
-def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
-def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
-def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
-def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
-def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
-def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
-def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, 95, 95]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, 96, 96]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, 97, 97]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, 98, 98]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, 99, 99]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
// Floating point stack registers. These don't map one-to-one to the FP
// pseudo registers, but we still mark them as aliasing FP registers. That
@@ -415,6 +415,26 @@ def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
(and GR64_NOREX, GR64_NOSP)>;
+// Register classes used for ABIs that use 32-bit address accesses,
+// while using the whole x86_64 ISA.
+
+// In such cases, it is fine to use RIP as we are sure the 32 high
+// bits are not set. We do not need variants for NOSP as RIP is not
+// allowed there.
+// RIP is not spilled anywhere for now, so stick to 32-bit alignment
+// to save on memory space.
+// FIXME: We could allow all 64-bit registers, but we would need
+// something to check that the 32 high bits are not set,
+// which we do not have right now.
+def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
+
+// When RBP is used as a base pointer in a 32-bit address environment,
+// it is also safe to use the full register to access addresses.
+// Since RBP will never be spilled, stick to 32-bit alignment to save
+// on memory consumption.
+def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
+ (add LOW32_ADDR_ACCESS, RBP)>;
+
// A class to support the 'A' assembler constraint: EAX then EDX.
def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
@@ -451,6 +471,17 @@ def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
+// Special classes that help the assembly parser choose some alternate
+// instructions to favor 2-byte VEX encodings.
+def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (sequence "XMM%u", 0, 7)>;
+def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (sequence "XMM%u", 8, 15)>;
+def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 7)>;
+def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 8, 15)>;
+
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
let CopyCost = -1; // Don't allow copying of status registers.
@@ -477,18 +508,18 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
-def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
-def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
-def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
-def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
+def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
+def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
+def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
-def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
-def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
-def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
-def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
+def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
+def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index a261356afe6a..35257f89100c 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -364,6 +364,8 @@ def IIC_SSE_PALIGNRR : InstrItinClass;
def IIC_SSE_PALIGNRM : InstrItinClass;
def IIC_SSE_MWAIT : InstrItinClass;
def IIC_SSE_MONITOR : InstrItinClass;
+def IIC_SSE_MWAITX : InstrItinClass;
+def IIC_SSE_MONITORX : InstrItinClass;
def IIC_SSE_PREFETCH : InstrItinClass;
def IIC_SSE_PAUSE : InstrItinClass;
@@ -633,13 +635,22 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// The GenericModel contains no instruction itineraries.
-def GenericModel : SchedMachineModel {
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
let IssueWidth = 4;
let MicroOpBufferSize = 32;
let LoadLatency = 4;
let HighLatency = 10;
let PostRAScheduler = 0;
+ let CompleteModel = 0;
+}
+
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+ let PostRAScheduler = 1;
}
include "X86ScheduleAtom.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 4c559c9c1798..a5b440182aa9 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -544,6 +544,7 @@ def AtomModel : SchedMachineModel {
// simple loops, expand by a small factor to hide the backedge cost.
let LoopMicroOpBufferSize = 10;
let PostRAScheduler = 1;
+ let CompleteModel = 0;
let Itineraries = AtomItineraries;
}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index b1a01614b4a1..d02859b3dcbd 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -25,7 +25,7 @@ using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
bool X86SelectionDAGInfo::isBaseRegConflictPossible(
- SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const {
+ SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
// We cannot use TRI->hasBasePointer() until *after* we select all basic
// blocks. Legalization may introduce new stack temporaries with large
// alignment requirements. Fall back to generic code if there are any
@@ -45,7 +45,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
}
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
- SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -54,8 +54,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
#ifndef NDEBUG
// If the base register might conflict with our physical registers, bail out.
- const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
- X86::ECX, X86::EAX, X86::EDI};
+ const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+ X86::ECX, X86::EAX, X86::EDI};
assert(!isBaseRegConflictPossible(DAG, ClobberSet));
#endif
@@ -87,8 +87,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args),
- 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -195,7 +194,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
}
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
- SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// This requires the copy size to be a constant, preferably
@@ -222,8 +221,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
return SDValue();
// If the base register might conflict with our physical registers, bail out.
- const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
- X86::ECX, X86::ESI, X86::EDI};
+ const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+ X86::ECX, X86::ESI, X86::EDI};
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index 961bd8c8d5ef..f4a285a5f916 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -7,14 +7,15 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the X86 subclass for TargetSelectionDAGInfo.
+// This file defines the X86 subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
@@ -22,27 +23,24 @@ class X86TargetLowering;
class X86TargetMachine;
class X86Subtarget;
-class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
/// Returns true if it is possible for the base register to conflict with the
/// given set of clobbers for a memory intrinsic.
bool isBaseRegConflictPossible(SelectionDAG &DAG,
- ArrayRef<unsigned> ClobberSet) const;
+ ArrayRef<MCPhysReg> ClobberSet) const;
public:
explicit X86SelectionDAGInfo() = default;
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index ef16c5bdbfd8..1adc92cfda63 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -40,24 +40,43 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512);
#endif
- // This is a straightforward byte vector.
- if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
- int NumElements = MaskTy->getVectorNumElements();
- ShuffleMask.reserve(NumElements);
+ if (!MaskTy->isVectorTy())
+ return;
+ int NumElts = MaskTy->getVectorNumElements();
+
+ Type *EltTy = MaskTy->getVectorElementType();
+ if (!EltTy->isIntegerTy())
+ return;
- for (int i = 0; i < NumElements; ++i) {
+ // The shuffle mask requires a byte vector - decode cases with
+ // wider elements as well.
+ unsigned BitWidth = cast<IntegerType>(EltTy)->getBitWidth();
+ if ((BitWidth % 8) != 0)
+ return;
+
+ int Scale = BitWidth / 8;
+ int NumBytes = NumElts * Scale;
+ ShuffleMask.reserve(NumBytes);
+
+ for (int i = 0; i != NumElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.append(Scale, SM_SentinelUndef);
+ continue;
+ }
+
+ APInt APElt = cast<ConstantInt>(COp)->getValue();
+ for (int j = 0; j != Scale; ++j) {
// For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
// lane of the vector we're inside.
- int Base = i & ~0xf;
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ int Base = ((i * Scale) + j) & ~0xf;
+
+ uint64_t Element = APElt.getLoBits(8).getZExtValue();
+ APElt = APElt.lshr(8);
+
// If the high bit (7) of the byte is set, the element is zeroed.
if (Element & (1 << 7))
ShuffleMask.push_back(SM_SentinelZero);
@@ -68,7 +87,8 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
}
}
}
- // TODO: Handle funny-looking vectors too.
+
+ assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size");
}
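
A rough, self-contained illustration of the widened decode (not from this patch): each constant element of BitWidth bits contributes Scale = BitWidth/8 mask bytes, bytes with bit 7 set become zeroing sentinels, and the rest index into the current 16-byte lane. The Base + low-nibble indexing follows the unchanged tail of the loop that this hunk does not show, so treat that detail as an assumption; -1 stands in for SM_SentinelUndef-free SM_SentinelZero.

#include <cstdint>
#include <vector>

// Decode one PSHUFB control element of Scale bytes into mask entries.
static void decodePSHUFBElt(uint64_t Elt, int EltIdx, int Scale,
                            std::vector<int> &Mask) {
  for (int j = 0; j != Scale; ++j) {
    int Base = ((EltIdx * Scale) + j) & ~0xf; // start of the 16-byte lane
    uint64_t Byte = Elt & 0xff;
    Elt >>= 8;
    if (Byte & 0x80)
      Mask.push_back(-1);                  // high bit set: byte is zeroed
    else
      Mask.push_back(Base + (Byte & 0xf)); // in-lane byte index
  }
}

int main() {
  // A <4 x i32> mask whose first element is 0x80020100 decodes, byte by byte,
  // to indices 0, 1, 2 and a zeroed byte.
  std::vector<int> Mask;
  decodePSHUFBElt(0x80020100u, /*EltIdx=*/0, /*Scale=*/4, Mask);
  return (Mask == std::vector<int>{0, 1, 2, -1}) ? 0 : 1;
}
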
void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
@@ -84,9 +104,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
- unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ if (ElSize != 32 && ElSize != 64)
+ return;
- if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512)
return;
// Only support vector types.
@@ -99,14 +121,15 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
return;
// Support any element type from byte up to element size.
- // This is necesary primarily because 64-bit elements get split to 32-bit
+ // This is necessary primarily because 64-bit elements get split to 32-bit
// in the constant pool on 32-bit target.
unsigned EltTySize = VecEltTy->getIntegerBitWidth();
if (EltTySize < 8 || EltTySize > ElSize)
return;
unsigned NumElements = MaskTySize / ElSize;
- assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8 ||
+ NumElements == 16) &&
"Unexpected number of vector elements.");
ShuffleMask.reserve(NumElements);
unsigned NumElementsPerLane = 128 / ElSize;
@@ -133,12 +156,154 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
// TODO: Handle funny-looking vectors too.
}
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ if (MaskTySize != 128 && MaskTySize != 256)
+ return;
+
+ // Only support vector types.
+ if (!MaskTy->isVectorTy())
+ return;
+
+ // Make sure its an integer type.
+ Type *VecEltTy = MaskTy->getVectorElementType();
+ if (!VecEltTy->isIntegerTy())
+ return;
+
+ // Support any element type from byte up to element size.
+ // This is necessary primarily because 64-bit elements get split to 32-bit
+ // in the constant pool on 32-bit target.
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth();
+ if (EltTySize < 8 || EltTySize > ElSize)
+ return;
+
+ unsigned NumElements = MaskTySize / ElSize;
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ "Unexpected number of vector elements.");
+ ShuffleMask.reserve(NumElements);
+ unsigned NumElementsPerLane = 128 / ElSize;
+ unsigned Factor = ElSize / EltTySize;
+
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i * Factor);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ uint64_t Selector = cast<ConstantInt>(COp)->getZExtValue();
+ unsigned MatchBit = (Selector >> 3) & 0x1;
+
+ // M2Z[0:1] MatchBit
+ // 0Xb X Source selected by Selector index.
+ // 10b 0 Source selected by Selector index.
+ // 10b 1 Zero.
+ // 11b 0 Zero.
+ // 11b 1 Source selected by Selector index.
+ if ((M2Z & 0x2) != 0u && MatchBit != (M2Z & 0x1)) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ int Index = i & ~(NumElementsPerLane - 1);
+ if (ElSize == 64)
+ Index += (Selector >> 1) & 0x1;
+ else
+ Index += Selector & 0x3;
+
+ int Src = (Selector >> 2) & 0x1;
+ Index += Src * NumElements;
+ ShuffleMask.push_back(Index);
+ }
+
+ // TODO: Handle funny-looking vectors too.
+}
+
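
The M2Z handling above reduces to a two-line predicate; a minimal sketch (not from this patch) with the truth table encoded as assertions. Whenever the predicate is true, the decoder emits SM_SentinelZero; otherwise the selector index is used.

#include <cassert>

// Mirrors the VPERMIL2 zeroing rule: when bit 1 of M2Z is set, an element is
// zeroed iff its selector's MatchBit disagrees with bit 0 of M2Z.
static bool isZeroedByM2Z(unsigned M2Z, unsigned MatchBit) {
  return (M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1);
}

int main() {
  assert(!isZeroedByM2Z(0x0, 0) && !isZeroedByM2Z(0x0, 1)); // 0Xb: never zero
  assert(!isZeroedByM2Z(0x2, 0) &&  isZeroedByM2Z(0x2, 1)); // 10b
  assert( isZeroedByM2Z(0x3, 0) && !isZeroedByM2Z(0x3, 1)); // 11b
  return 0;
}
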
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ assert(MaskTy->getPrimitiveSizeInBits() == 128);
+
+ // Only support vector types.
+ if (!MaskTy->isVectorTy())
+ return;
+
+ // Make sure its an integer type.
+ Type *VecEltTy = MaskTy->getVectorElementType();
+ if (!VecEltTy->isIntegerTy())
+ return;
+
+ // The shuffle mask requires a byte vector - decode cases with
+ // wider elements as well.
+ unsigned BitWidth = cast<IntegerType>(VecEltTy)->getBitWidth();
+ if ((BitWidth % 8) != 0)
+ return;
+
+ int NumElts = MaskTy->getVectorNumElements();
+ int Scale = BitWidth / 8;
+ int NumBytes = NumElts * Scale;
+ ShuffleMask.reserve(NumBytes);
+
+ for (int i = 0; i != NumElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.append(Scale, SM_SentinelUndef);
+ continue;
+ }
+
+ // VPPERM Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation
+ //
+ // Permute Operation:
+ // 0 - Source byte (no logical operation).
+ // 1 - Invert source byte.
+ // 2 - Bit reverse of source byte.
+ // 3 - Bit reverse of inverted source byte.
+ // 4 - 00h (zero - fill).
+ // 5 - FFh (ones - fill).
+ // 6 - Most significant bit of source byte replicated in all bit positions.
+ // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+ APInt MaskElt = cast<ConstantInt>(COp)->getValue();
+ for (int j = 0; j != Scale; ++j) {
+ APInt Index = MaskElt.getLoBits(5);
+ APInt PermuteOp = MaskElt.lshr(5).getLoBits(3);
+ MaskElt = MaskElt.lshr(8);
+
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
+ }
+ ShuffleMask.push_back((int)Index.getZExtValue());
+ }
+ }
+
+ assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size");
+}
+
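
For a single VPPERM control byte the decode above boils down to the following sketch (not from this patch). std::nullopt plays the role of the ShuffleMask.clear() bail-out for the invert/bit-reverse/fill operations that have no shuffle-mask equivalent, and -1 stands in for SM_SentinelZero.

#include <cassert>
#include <optional>

// Bits [4:0]: source byte index (0-31), bits [7:5]: permute operation.
static std::optional<int> decodeVPPERMByte(unsigned Ctrl) {
  unsigned Index = Ctrl & 0x1f;
  unsigned Op = (Ctrl >> 5) & 0x7;
  if (Op == 4)
    return -1;                      // zero fill
  if (Op != 0)
    return std::nullopt;            // inverted/reversed/replicated: give up
  return static_cast<int>(Index);   // plain byte copy
}

int main() {
  assert(*decodeVPPERMByte(0x13) == 19); // op 0: copy source byte 19
  assert(*decodeVPPERMByte(0x85) == -1); // op 4: zero fill
  assert(!decodeVPPERMByte(0x25));       // op 1 (invert): undecodable
  return 0;
}
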
void DecodeVPERMVMask(const Constant *C, MVT VT,
SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
if (MaskTy->isVectorTy()) {
unsigned NumElements = MaskTy->getVectorNumElements();
if (NumElements == VT.getVectorNumElements()) {
+ unsigned EltMaskSize = Log2_64(NumElements);
for (unsigned i = 0; i < NumElements; ++i) {
Constant *COp = C->getAggregateElement(i);
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
@@ -148,9 +313,9 @@ void DecodeVPERMVMask(const Constant *C, MVT VT,
if (isa<UndefValue>(COp))
ShuffleMask.push_back(SM_SentinelUndef);
else {
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- Element &= (1 << NumElements) - 1;
- ShuffleMask.push_back(Element);
+ APInt Element = cast<ConstantInt>(COp)->getValue();
+ Element = Element.getLoBits(EltMaskSize);
+ ShuffleMask.push_back(Element.getZExtValue());
}
}
}
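
For example, with a v8i32 VPERMD mask EltMaskSize is log2(8) = 3, so a constant element of 11 now decodes to lane 3 (11 mod 8); the old (1 << NumElements) - 1 mask kept eight bits and would have produced an out-of-range index of 11.
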
@@ -171,6 +336,7 @@ void DecodeVPERMV3Mask(const Constant *C, MVT VT,
Type *MaskTy = C->getType();
unsigned NumElements = MaskTy->getVectorNumElements();
if (NumElements == VT.getVectorNumElements()) {
+ unsigned EltMaskSize = Log2_64(NumElements * 2);
for (unsigned i = 0; i < NumElements; ++i) {
Constant *COp = C->getAggregateElement(i);
if (!COp) {
@@ -180,9 +346,9 @@ void DecodeVPERMV3Mask(const Constant *C, MVT VT,
if (isa<UndefValue>(COp))
ShuffleMask.push_back(SM_SentinelUndef);
else {
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- Element &= (1 << NumElements*2) - 1;
- ShuffleMask.push_back(Element);
+ APInt Element = cast<ConstantInt>(COp)->getValue();
+ Element = Element.getLoBits(EltMaskSize);
+ ShuffleMask.push_back(Element.getZExtValue());
}
}
}
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index bcf46322c8cd..d2565b849807 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -25,18 +25,25 @@ namespace llvm {
class Constant;
class MVT;
-/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+/// Decode a PSHUFB mask from an IR-level vector constant.
void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+/// Decode a VPERMILP variable mask from an IR-level vector constant.
void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+/// Decode a VPERMILP2 variable mask from an IR-level vector constant.
+void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM variable mask from an IR-level vector constant.
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
void DecodeVPERMVMask(const Constant *C, MVT VT,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
void DecodeVPERMV3Mask(const Constant *C, MVT VT,
SmallVectorImpl<int> &ShuffleMask);
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 8ef08c960f0b..8f77682d2276 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -46,105 +46,99 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
/// Classify a blockaddress reference for the current subtarget according to how
/// we should reference it in a non-pcrel context.
-unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
- if (isPICStyleGOT()) // 32-bit ELF targets.
- return X86II::MO_GOTOFF;
-
- if (isPICStyleStubPIC()) // Darwin/32 in PIC mode.
- return X86II::MO_PIC_BASE_OFFSET;
-
- // Direct static reference to label.
- return X86II::MO_NO_FLAG;
+unsigned char X86Subtarget::classifyBlockAddressReference() const {
+ return classifyLocalReference(nullptr);
}
/// Classify a global variable reference for the current subtarget according to
/// how we should reference it in a non-pcrel context.
-unsigned char X86Subtarget::
-ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
- // DLLImport only exists on windows, it is implemented as a load from a
- // DLLIMPORT stub.
- if (GV->hasDLLImportStorageClass())
- return X86II::MO_DLLIMPORT;
+unsigned char
+X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
+ return classifyGlobalReference(GV, *GV->getParent());
+}
- bool isDef = GV->isStrongDefinitionForLinker();
+unsigned char
+X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
+ // In 64-bit mode we can use %rip addressing for anything local.
+ if (is64Bit())
+ return X86II::MO_NO_FLAG;
- // X86-64 in PIC mode.
- if (isPICStyleRIPRel()) {
- // Large model never uses stubs.
- if (TM.getCodeModel() == CodeModel::Large)
- return X86II::MO_NO_FLAG;
+ // If this is for a position dependent executable, the static linker can
+ // figure it out.
+ if (!isPositionIndependent())
+ return X86II::MO_NO_FLAG;
- if (isTargetDarwin()) {
- // If symbol visibility is hidden, the extra load is not needed if
- // target is x86-64 or the symbol is definitely defined in the current
- // translation unit.
- if (GV->hasDefaultVisibility() && !isDef)
- return X86II::MO_GOTPCREL;
- } else if (!isTargetWin64()) {
- assert(isTargetELF() && "Unknown rip-relative target");
+ // The COFF dynamic linker just patches the executable sections.
+ if (isTargetCOFF())
+ return X86II::MO_NO_FLAG;
- // Extra load is needed for all externally visible.
- if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
- return X86II::MO_GOTPCREL;
- }
+ if (isTargetDarwin()) {
+ // 32-bit Mach-O has no relocation for a-b if a is undefined, even if
+ // b is in the section that is being relocated.
+ // This means we have to use a load even for GVs that are known to be
+ // local to the dso.
+ if (GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
- return X86II::MO_NO_FLAG;
+ return X86II::MO_PIC_BASE_OFFSET;
}
- if (isPICStyleGOT()) { // 32-bit ELF targets.
- // Extra load is needed for all externally visible.
- if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
- return X86II::MO_GOTOFF;
- return X86II::MO_GOT;
- }
+ return X86II::MO_GOTOFF;
+}
- if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
- // Determine whether we have a stub reference and/or whether the reference
- // is relative to the PIC base or not.
+unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const {
+ // Large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (isDef)
- return X86II::MO_PIC_BASE_OFFSET;
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return classifyLocalReference(GV);
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ if (isTargetCOFF())
+ return X86II::MO_DLLIMPORT;
- // If symbol visibility is hidden, we have a stub for common symbol
- // references and external declarations.
- if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) {
- // Hidden $non_lazy_ptr reference.
- return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
- }
+ if (is64Bit())
+ return X86II::MO_GOTPCREL;
- // Otherwise, no stub.
- return X86II::MO_PIC_BASE_OFFSET;
+ if (isTargetDarwin()) {
+ if (!isPositionIndependent())
+ return X86II::MO_DARWIN_NONLAZY;
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
}
- if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
- // Determine whether we have a stub reference.
+ return X86II::MO_GOT;
+}
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (isDef)
- return X86II::MO_NO_FLAG;
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
+ return classifyGlobalFunctionReference(GV, *GV->getParent());
+}
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return X86II::MO_DARWIN_NONLAZY;
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const {
+ if (TM.shouldAssumeDSOLocal(M, GV))
+ return X86II::MO_NO_FLAG;
+
+ assert(!isTargetCOFF());
- // Otherwise, no stub.
+ if (isTargetELF())
+ return X86II::MO_PLT;
+
+ if (is64Bit()) {
+ auto *F = dyn_cast_or_null<Function>(GV);
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind))
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids runtime overhead
+ // at the cost of eager binding (and one extra byte of encoding).
+ return X86II::MO_GOTPCREL;
return X86II::MO_NO_FLAG;
}
- // Direct static reference to global.
return X86II::MO_NO_FLAG;
}
-
/// This function returns the name of a function which has an interface like
/// the non-standard bzero function, if such a function exists on the
/// current subtarget and it is considered preferable over memset with zero
@@ -165,7 +159,7 @@ bool X86Subtarget::hasSinCos() const {
}
/// Return true if the subtarget allows calls to immediate address.
-bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+bool X86Subtarget::isLegalToCallImmediateAddr() const {
// FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
// but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
// the following check for Win32 should be removed.
@@ -227,18 +221,19 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
- // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
stackAlignment = StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
- In64BitMode)
+ isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
}
void X86Subtarget::initializeEnvironment() {
X86SSELevel = NoSSE;
X863DNowLevel = NoThreeDNow;
+ HasX87 = false;
HasCMov = false;
HasX86_64 = false;
HasPOPCNT = false;
@@ -261,6 +256,8 @@ void X86Subtarget::initializeEnvironment() {
HasLZCNT = false;
HasBMI = false;
HasBMI2 = false;
+ HasVBMI = false;
+ HasIFMA = false;
HasRTM = false;
HasHLE = false;
HasERI = false;
@@ -275,6 +272,7 @@ void X86Subtarget::initializeEnvironment() {
HasPRFCHW = false;
HasRDSEED = false;
HasLAHFSAHF = false;
+ HasMWAITX = false;
HasMPX = false;
IsBTMemSlow = false;
IsSHLDSlow = false;
@@ -283,6 +281,7 @@ void X86Subtarget::initializeEnvironment() {
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
+ HasFastPartialYMMWrite = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
@@ -303,11 +302,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
-X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, const X86TargetMachine &TM,
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const X86TargetMachine &TM,
unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
- PICStyle(PICStyles::None), TargetTriple(TT),
+ PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
@@ -317,24 +316,16 @@ X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU,
TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
- if (TM.getRelocationModel() == Reloc::Static) {
- // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+ if (!isPositionIndependent())
setPICStyle(PICStyles::None);
- } else if (is64Bit()) {
- // PIC in 64 bit mode is always rip-rel.
+ else if (is64Bit())
setPICStyle(PICStyles::RIPRel);
- } else if (isTargetCOFF()) {
+ else if (isTargetCOFF())
setPICStyle(PICStyles::None);
- } else if (isTargetDarwin()) {
- if (TM.getRelocationModel() == Reloc::PIC_)
- setPICStyle(PICStyles::StubPIC);
- else {
- assert(TM.getRelocationModel() == Reloc::DynamicNoPIC);
- setPICStyle(PICStyles::StubDynamicNoPIC);
- }
- } else if (isTargetELF()) {
+ else if (isTargetDarwin())
+ setPICStyle(PICStyles::StubPIC);
+ else if (isTargetELF())
setPICStyle(PICStyles::GOT);
- }
}
bool X86Subtarget::enableEarlyIfConversion() const {
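
As a concrete illustration of the restructured logic (assuming shouldAssumeDSOLocal behaves as upstream): on x86-64 ELF built as position-independent code, a call to an undefined function is classified MO_PLT and a data reference to an undefined global gets MO_GOTPCREL, while anything the TargetMachine can prove DSO-local gets MO_NO_FLAG; on i386 Darwin in PIC mode the same undefined data reference gets MO_DARWIN_NONLAZY_PIC_BASE.
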
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 13d1026dcaa0..a274b797ca8f 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -35,11 +35,10 @@ class TargetMachine;
///
namespace PICStyles {
enum Style {
- StubPIC, // Used on i386-darwin in -fPIC mode.
- StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
- GOT, // Used on many 32-bit unices in -fPIC mode.
- RIPRel, // Used on X86-64 when not in -static mode.
- None // Set when in -static mode (not PIC or DynamicNoPIC mode).
+ StubPIC, // Used on i386-darwin in pic mode.
+ GOT, // Used on 32-bit ELF targets when in pic mode.
+ RIPRel, // Used on X86-64 when in pic mode.
+ None // Set when not in pic mode.
};
}
@@ -64,12 +63,17 @@ protected:
/// Which PIC style to use
PICStyles::Style PICStyle;
+ const TargetMachine &TM;
+
/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
X86SSEEnum X86SSELevel;
/// MMX, 3DNow, 3DNow Athlon, or none supported.
X863DNowEnum X863DNowLevel;
+ /// True if the processor supports X87 instructions.
+ bool HasX87;
+
/// True if this processor has conditional move instructions
/// (generally pentium pro+).
bool HasCMov;
@@ -134,6 +138,12 @@ protected:
/// Processor has BMI2 instructions.
bool HasBMI2;
+ /// Processor has VBMI instructions.
+ bool HasVBMI;
+
+ /// Processor has Integer Fused Multiply Add
+ bool HasIFMA;
+
/// Processor has RTM instructions.
bool HasRTM;
@@ -155,6 +165,12 @@ protected:
/// Processor has LAHF/SAHF instructions.
bool HasLAHFSAHF;
+ /// Processor has MONITORX/MWAITX instructions.
+ bool HasMWAITX;
+
+ /// Processor has Prefetch with intent to Write instruction
+ bool HasPFPREFETCHWT1;
+
/// True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -179,6 +195,10 @@ protected:
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;
+ /// True if there is no performance penalty to writing only the lower parts
+ /// of a YMM register without clearing the upper part.
+ bool HasFastPartialYMMWrite;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
@@ -226,9 +246,30 @@ protected:
/// Processor has PKU extenstions
bool HasPKU;
- /// Processot supports MPX - Memory Protection Extensions
+ /// Processor supports MPX - Memory Protection Extensions
bool HasMPX;
+ /// Processor supports Invalidate Process-Context Identifier
+ bool HasInvPCId;
+
+ /// Processor has VM Functions
+ bool HasVMFUNC;
+
+ /// Processor has Supervisor Mode Access Protection
+ bool HasSMAP;
+
+ /// Processor has Software Guard Extensions
+ bool HasSGX;
+
+ /// Processor supports Flush Cache Line instruction
+ bool HasCLFLUSHOPT;
+
+ /// Processor has Persistent Commit feature
+ bool HasPCOMMIT;
+
+ /// Processor supports Cache Line Write Back instruction
+ bool HasCLWB;
+
/// Use software floating point for code generation.
bool UseSoftFloat;
@@ -271,7 +312,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
- X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+ X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM, unsigned StackAlignOverride);
const X86TargetLowering *getTargetLowering() const override {
@@ -336,6 +377,7 @@ public:
PICStyles::Style getPICStyle() const { return PICStyle; }
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+ bool hasX87() const { return HasX87; }
bool hasCMov() const { return HasCMov; }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
@@ -374,6 +416,8 @@ public:
bool hasLZCNT() const { return HasLZCNT; }
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
+ bool hasVBMI() const { return HasVBMI; }
+ bool hasIFMA() const { return HasIFMA; }
bool hasRTM() const { return HasRTM; }
bool hasHLE() const { return HasHLE; }
bool hasADX() const { return HasADX; }
@@ -381,6 +425,7 @@ public:
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
+ bool hasMWAITX() const { return HasMWAITX; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
@@ -388,6 +433,7 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
@@ -408,6 +454,11 @@ public:
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
+ /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ /// no-sse2). There isn't any reason to disable it if the target processor
+ /// supports it.
+ bool hasMFence() const { return hasSSE2() || is64Bit(); }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
@@ -421,6 +472,8 @@ public:
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
+ bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
@@ -463,7 +516,6 @@ public:
return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
}
- bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
@@ -471,13 +523,7 @@ public:
return PICStyle == PICStyles::StubPIC;
}
- bool isPICStyleStubNoDynamic() const {
- return PICStyle == PICStyles::StubDynamicNoPIC;
- }
- bool isPICStyleStubAny() const {
- return PICStyle == PICStyles::StubDynamicNoPIC ||
- PICStyle == PICStyles::StubPIC;
- }
+ bool isPositionIndependent() const { return TM.isPositionIndependent(); }
bool isCallingConvWin64(CallingConv::ID CC) const {
switch (CC) {
@@ -502,18 +548,25 @@ public:
}
}
- /// ClassifyGlobalReference - Classify a global variable reference for the
- /// current subtarget according to how we should reference it in a non-pcrel
- /// context.
- unsigned char ClassifyGlobalReference(const GlobalValue *GV,
- const TargetMachine &TM)const;
+ /// Classify a global variable reference for the current subtarget according
+ /// to how we should reference it in a non-pcrel context.
+ unsigned char classifyLocalReference(const GlobalValue *GV) const;
+
+ unsigned char classifyGlobalReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+ /// Classify a global function reference for the current subtarget.
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const Module &M) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
/// Classify a blockaddress reference for the current subtarget according to
/// how we should reference it in a non-pcrel context.
- unsigned char ClassifyBlockAddressReference() const;
+ unsigned char classifyBlockAddressReference() const;
/// Return true if the subtarget allows calls to immediate address.
- bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+ bool isLegalToCallImmediateAddr() const;
/// This function returns the name of a function which has an interface
/// like the non-standard bzero function, if such a function exists on
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 0e7e4c0c84a9..50c9c25a27c0 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -16,6 +16,7 @@
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/CommandLine.h"
@@ -39,6 +40,7 @@ extern "C" void LLVMInitializeX86Target() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeWinEHStatePassPass(PR);
+ initializeFixupBWInstPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -73,17 +75,22 @@ static std::string computeDataLayout(const Triple &TT) {
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
Ret += "-i64:64";
+ else if (TT.isOSIAMCU())
+ Ret += "-i64:32-f64:32";
else
Ret += "-f64:32:64";
// Some ABIs align long double to 128 bits, others to 32.
- if (TT.isOSNaCl())
+ if (TT.isOSNaCl() || TT.isOSIAMCU())
; // No f80
else if (TT.isArch64Bit() || TT.isOSDarwin())
Ret += "-f80:128";
else
Ret += "-f80:32";
+ if (TT.isOSIAMCU())
+ Ret += "-f128:32";
+
// The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
if (TT.isArch64Bit())
Ret += "-n8:16:32:64";
@@ -91,7 +98,7 @@ static std::string computeDataLayout(const Triple &TT) {
Ret += "-n8:16:32";
// The stack is aligned to 32 bits on some ABIs and 128 bits on others.
- if (!TT.isArch64Bit() && TT.isOSWindows())
+ if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU())
Ret += "-a:0:32-S32";
else
Ret += "-S128";
@@ -99,22 +106,60 @@ static std::string computeDataLayout(const Triple &TT) {
return Ret;
}
-/// X86TargetMachine ctor - Create an X86 target.
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+ Optional<Reloc::Model> RM) {
+ bool is64Bit = TT.getArch() == Triple::x86_64;
+ if (!RM.hasValue()) {
+ // Darwin defaults to PIC in 64-bit mode and dynamic-no-pic in 32-bit mode.
+ // Win64 requires rip-relative addressing, thus we force it to PIC. Otherwise
+ // we use the static relocation model by default.
+ if (TT.isOSDarwin()) {
+ if (is64Bit)
+ return Reloc::PIC_;
+ return Reloc::DynamicNoPIC;
+ }
+ if (TT.isOSWindows() && is64Bit)
+ return Reloc::PIC_;
+ return Reloc::Static;
+ }
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+ // executables but not necessarily a shared library. On X86-32 we just
+ // compile in -static mode; on x86-64 we use PIC.
+ if (*RM == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ return Reloc::PIC_;
+ if (!TT.isOSDarwin())
+ return Reloc::Static;
+ }
+
+ // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+ // the Mach-O file format doesn't support it.
+ if (*RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
+ return Reloc::PIC_;
+
+ return *RM;
+}
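
For context, a hedged sketch (not part of this patch) of how a client now constructs the target machine: because the relocation model parameter is an Optional, passing llvm::None lets getEffectiveRelocModel() above pick the per-platform default. The sketch assumes the LLVM 3.9-era TargetRegistry API; the function name and "generic" CPU choice are illustrative only.

#include "llvm/ADT/Optional.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <string>

// Sketch only: RM = llvm::None means "use the target's default reloc model".
llvm::TargetMachine *createDefaultX86TM(const std::string &TripleStr) {
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();

  std::string Error;
  const llvm::Target *T = llvm::TargetRegistry::lookupTarget(TripleStr, Error);
  if (!T)
    return nullptr;

  llvm::TargetOptions Options;
  return T->createTargetMachine(TripleStr, /*CPU=*/"generic", /*Features=*/"",
                                Options, /*RM=*/llvm::None);
}
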
+
+/// Create an X86 target.
///
X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
- OL),
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
// Windows stack unwinder gets confused when execution flow "falls through"
// after a call to a 'noreturn' function.
// To prevent that, we emit a trap for 'unreachable' IR instructions.
// (which on X86, happens to be the 'ud2' instruction)
- if (Subtarget.isTargetWin64())
+ // On PS4, the "return address" of a 'noreturn' call must still be within
+ // the calling function, and TrapUnreachable is an easy way to get that.
+ if (Subtarget.isTargetWin64() || Subtarget.isTargetPS4())
this->Options.TrapUnreachable = true;
// By default (and when -ffast-math is on), enable estimate codegen for
@@ -137,12 +182,17 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString()
+ : (StringRef)TargetCPU;
+ StringRef FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString()
+ : (StringRef)TargetFS;
+
+ SmallString<512> Key;
+ Key.reserve(CPU.size() + FS.size());
+ Key += CPU;
+ Key += FS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
@@ -150,14 +200,15 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// it as a key for the subtarget since that can be the only difference
// between two functions.
bool SoftFloat =
- F.hasFnAttribute("use-soft-float") &&
F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.
if (SoftFloat)
- FS += FS.empty() ? "+soft-float" : ",+soft-float";
+ Key += FS.empty() ? "+soft-float" : ",+soft-float";
- auto &I = SubtargetMap[CPU + FS];
+ FS = Key.substr(CPU.size());
+
+ auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
@@ -234,7 +285,6 @@ bool X86PassConfig::addInstSelector() {
addPass(createCleanupLocalDynamicTLSPass());
addPass(createX86GlobalBaseRegPass());
-
return false;
}
@@ -254,10 +304,13 @@ bool X86PassConfig::addPreISel() {
}
void X86PassConfig::addPreRegAlloc() {
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86FixupSetCC());
addPass(createX86OptimizeLEAs());
+ addPass(createX86CallFrameOptimization());
+ }
- addPass(createX86CallFrameOptimization());
+ addPass(createX86WinAllocaExpander());
}
void X86PassConfig::addPostRegAlloc() {
@@ -274,6 +327,7 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86IssueVZeroUpperPass());
if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 262955698e44..4734a44315a9 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -30,8 +30,9 @@ class X86TargetMachine final : public LLVMTargetMachine {
public:
X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options, Reloc::Model RM,
- CodeModel::Model CM, CodeGenOpt::Level OL);
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
~X86TargetMachine() override;
const X86Subtarget *getSubtargetImpl(const Function &F) const override;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 782768d0ab16..d664cff5f2c1 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -73,53 +73,30 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
InitializeELF(TM.Options.UseInitArray);
}
-const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
- const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const {
- // We are looking for the difference of two symbols, need a subtraction
- // operation.
- const SubOperator *Sub = dyn_cast<SubOperator>(CE);
- if (!Sub)
- return nullptr;
-
- // Symbols must first be numbers before we can subtract them, we need to see a
- // ptrtoint on both subtraction operands.
- const PtrToIntOperator *SubLHS =
- dyn_cast<PtrToIntOperator>(Sub->getOperand(0));
- const PtrToIntOperator *SubRHS =
- dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
- if (!SubLHS || !SubRHS)
- return nullptr;
-
+const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS, Mangler &Mang,
+ const TargetMachine &TM) const {
// Our symbols should exist in address space zero, cowardly no-op if
// otherwise.
- if (SubLHS->getPointerAddressSpace() != 0 ||
- SubRHS->getPointerAddressSpace() != 0)
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0)
return nullptr;
// Both ptrtoint instructions must wrap global objects:
// - Only global variables are eligible for image relative relocations.
// - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
- const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand());
- const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
- if (!GOLHS || !GVRHS)
- return nullptr;
-
// We expect __ImageBase to be a global variable without a section, externally
// defined.
//
// It should look something like this: @__ImageBase = external constant i8
- if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
- !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
- GVRHS->hasSection())
- return nullptr;
-
- // An image-relative, thread-local, symbol makes no sense.
- if (GOLHS->isThreadLocal())
+ if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
+ LHS->isThreadLocal() || RHS->isThreadLocal() ||
+ RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
+ cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
return nullptr;
- return MCSymbolRefExpr::create(TM.getSymbol(GOLHS, Mang),
- MCSymbolRefExpr::VK_COFF_IMGREL32,
- getContext());
+ return MCSymbolRefExpr::create(
+ TM.getSymbol(LHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
}
static std::string APIntToHexString(const APInt &AI) {
@@ -154,16 +131,34 @@ static std::string scalarConstantToHexString(const Constant *C) {
}
MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
if (Kind.isMergeableConst() && C) {
const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ |
COFF::IMAGE_SCN_LNK_COMDAT;
std::string COMDATSymName;
- if (Kind.isMergeableConst4() || Kind.isMergeableConst8())
- COMDATSymName = "__real@" + scalarConstantToHexString(C);
- else if (Kind.isMergeableConst16())
- COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+ if (Kind.isMergeableConst4()) {
+ if (Align <= 4) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Align = 4;
+ }
+ } else if (Kind.isMergeableConst8()) {
+ if (Align <= 8) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Align = 8;
+ }
+ } else if (Kind.isMergeableConst16()) {
+ if (Align <= 16) {
+ COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+ Align = 16;
+ }
+ } else if (Kind.isMergeableConst32()) {
+ if (Align <= 32) {
+ COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
+ Align = 32;
+ }
+ }
if (!COMDATSymName.empty())
return getContext().getCOFFSection(".rdata", Characteristics, Kind,
@@ -171,5 +166,5 @@ MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
COFF::IMAGE_COMDAT_SELECT_ANY);
}
- return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C);
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
}
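
The alignment handling above can be summarized in a small standalone sketch (the helper below is hypothetical, not LLVM API): a mergeable constant only gets a __real@/__xmm@/__ymm@ COMDAT name when the requested alignment does not exceed its natural size, and the alignment is then rounded up to that size; anything over-aligned or oddly sized falls back to the generic constant-pool path.

#include <cstdio>
#include <string>

// Illustrative only: mirrors the bucketing in getSectionForConstant() above.
static std::string comdatPrefixFor(unsigned SizeInBytes, unsigned &Align) {
  if (SizeInBytes == 4 && Align <= 4)   { Align = 4;  return "__real@"; }
  if (SizeInBytes == 8 && Align <= 8)   { Align = 8;  return "__real@"; }
  if (SizeInBytes == 16 && Align <= 16) { Align = 16; return "__xmm@"; }
  if (SizeInBytes == 32 && Align <= 32) { Align = 32; return "__ymm@"; }
  return ""; // over-aligned or unusual size: no mergeable COMDAT name
}

int main() {
  unsigned Align = 8;
  std::printf("%s align=%u\n", comdatPrefixFor(8, Align).c_str(), Align);
  Align = 64; // an over-aligned 32-byte constant is not merged
  std::printf("'%s' align=%u\n", comdatPrefixFor(32, Align).c_str(), Align);
  return 0;
}
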
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 6b2448cc9de6..2e703f1494fa 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -40,6 +40,11 @@ namespace llvm {
/// \brief This implementation is used for X86 ELF targets that don't
/// have a further specialization.
class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ X86ELFTargetObjectFile() {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
+ }
+
/// \brief Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
@@ -53,13 +58,15 @@ namespace llvm {
/// \brief This implementation is used for Windows targets on x86 and x86-64.
class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
const MCExpr *
- getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
- const TargetMachine &TM) const override;
+ lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
/// \brief Given a mergeable constant with the specified size and relocation
/// information, return a section that it should be placed in.
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C) const override;
+ const Constant *C,
+ unsigned &Align) const override;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 2e7bbb208743..f44a8c662028 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -532,21 +532,24 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// potential massive combinations (elem_num x src_type x dst_type).
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
- { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
+ // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
+ // 256-bit wide vectors.
+
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
@@ -560,43 +563,46 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// v16i1 -> v16i32 - load + broadcast
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
-
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
- { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
- { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
@@ -608,20 +614,20 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -639,66 +645,69 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
// The generic code to compute the scalar overhead is currently broken.
// Workaround this limitation by estimating the scalarization overhead
// here. We have roughly 10 instructions per scalar element.
// Multiply that by the vector width.
// FIXME: remove that when PR19268 is fixed.
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
// This node is expanded into scalarized operations but BasicTTI is overly
// optimistic estimating its cost. It computes 3 per element (one
// vector-extract, one scalar conversion and one vector-insert). The
@@ -706,89 +715,104 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// should be factored in too. Inflating the cost per element by 1.
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+
+ { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
+ { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
// These are somewhat magic numbers justified by looking at the output of
// Intel's IACA, running some kernels and making sure when we take
// legalization into account the throughput will be overestimated.
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- // There are faster sequences for float conversions.
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
};
std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
@@ -859,13 +883,17 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::SETCC, MVT::v2f64, 1 },
{ ISD::SETCC, MVT::v4f32, 1 },
{ ISD::SETCC, MVT::v2i64, 1 },
- { ISD::SETCC, MVT::v4i32, 1 },
- { ISD::SETCC, MVT::v8i16, 1 },
- { ISD::SETCC, MVT::v16i8, 1 },
};
static const CostTblEntry AVX1CostTbl[] = {
@@ -908,12 +936,112 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
+int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ static const CostTblEntry XOPCostTbl[] = {
+ { ISD::BITREVERSE, MVT::v4i64, 4 },
+ { ISD::BITREVERSE, MVT::v8i32, 4 },
+ { ISD::BITREVERSE, MVT::v16i16, 4 },
+ { ISD::BITREVERSE, MVT::v32i8, 4 },
+ { ISD::BITREVERSE, MVT::v2i64, 1 },
+ { ISD::BITREVERSE, MVT::v4i32, 1 },
+ { ISD::BITREVERSE, MVT::v8i16, 1 },
+ { ISD::BITREVERSE, MVT::v16i8, 1 },
+ { ISD::BITREVERSE, MVT::i64, 3 },
+ { ISD::BITREVERSE, MVT::i32, 3 },
+ { ISD::BITREVERSE, MVT::i16, 3 },
+ { ISD::BITREVERSE, MVT::i8, 3 }
+ };
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v4i64, 5 },
+ { ISD::BITREVERSE, MVT::v8i32, 5 },
+ { ISD::BITREVERSE, MVT::v16i16, 5 },
+ { ISD::BITREVERSE, MVT::v32i8, 5 },
+ { ISD::BSWAP, MVT::v4i64, 1 },
+ { ISD::BSWAP, MVT::v8i32, 1 },
+ { ISD::BSWAP, MVT::v16i16, 1 }
+ };
+ static const CostTblEntry AVX1CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v4i64, 10 },
+ { ISD::BITREVERSE, MVT::v8i32, 10 },
+ { ISD::BITREVERSE, MVT::v16i16, 10 },
+ { ISD::BITREVERSE, MVT::v32i8, 10 },
+ { ISD::BSWAP, MVT::v4i64, 4 },
+ { ISD::BSWAP, MVT::v8i32, 4 },
+ { ISD::BSWAP, MVT::v16i16, 4 }
+ };
+ static const CostTblEntry SSSE3CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v2i64, 5 },
+ { ISD::BITREVERSE, MVT::v4i32, 5 },
+ { ISD::BITREVERSE, MVT::v8i16, 5 },
+ { ISD::BITREVERSE, MVT::v16i8, 5 },
+ { ISD::BSWAP, MVT::v2i64, 1 },
+ { ISD::BSWAP, MVT::v4i32, 1 },
+ { ISD::BSWAP, MVT::v8i16, 1 }
+ };
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::BSWAP, MVT::v2i64, 7 },
+ { ISD::BSWAP, MVT::v4i32, 7 },
+ { ISD::BSWAP, MVT::v8i16, 7 }
+ };
+
+ unsigned ISD = ISD::DELETED_NODE;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::bitreverse:
+ ISD = ISD::BITREVERSE;
+ break;
+ case Intrinsic::bswap:
+ ISD = ISD::BSWAP;
+ break;
+ }
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ MVT MTy = LT.second;
+
+ // Attempt to lookup cost.
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+}
+
+int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Value *> Args, FastMathFlags FMF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+}
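
Both overloads follow the same pattern used throughout this file; the standalone sketch below illustrates it with plain structs standing in for CostTblEntry/CostTableLookup and invented values: legalize the type, probe the most feature-specific table first, and scale the per-operation cost by the legalization factor (LT.first).

#include <cstdio>

struct CostEntry { int ISD; int MVT; int Cost; };

// Linear lookup over a small table, like CostTableLookup.
template <unsigned N>
static const CostEntry *lookup(const CostEntry (&Tbl)[N], int ISD, int MVT) {
  for (const CostEntry &E : Tbl)
    if (E.ISD == ISD && E.MVT == MVT)
      return &E;
  return nullptr;
}

int main() {
  enum { BSWAP = 1, v4i32 = 7 };                       // illustrative IDs
  static const CostEntry SSSE3Tbl[] = {{BSWAP, v4i32, 1}};
  static const CostEntry SSE2Tbl[]  = {{BSWAP, v4i32, 7}};
  bool HasSSSE3 = true;
  int LegalizationFactor = 1; // LT.first: number of legal-type ops required

  int Cost = -1;
  if (HasSSSE3)                                        // newest feature first
    if (const CostEntry *E = lookup(SSSE3Tbl, BSWAP, v4i32))
      Cost = LegalizationFactor * E->Cost;
  if (Cost < 0)                                        // fall back to SSE2
    if (const CostEntry *E = lookup(SSE2Tbl, BSWAP, v4i32))
      Cost = LegalizationFactor * E->Cost;
  std::printf("bswap v4i32 cost: %d\n", Cost);
  return 0;
}
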
+
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
+ Type *ScalarType = Val->getScalarType();
+
if (Index != -1U) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -927,11 +1055,17 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
Index = Index % Width;
// Floating point scalars are already located in index #0.
- if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ if (ScalarType->isFloatingPointTy() && Index == 0)
return 0;
}
- return BaseT::getVectorInstrCost(Opcode, Val, Index);
+ // Add to the base cost if we know that the extracted element of a vector is
+ // destined to be moved to and used in the integer register file.
+ int RegisterFileMoveCost = 0;
+ if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
+ RegisterFileMoveCost = 1;
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
@@ -983,10 +1117,10 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
// Each load/store unit costs 1.
int Cost = LT.first * 1;
- // On Sandybridge 256bit load/stores are double pumped
- // (but not on Haswell).
- if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
- Cost*=2;
+ // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
+ // proxy for a double-pumped AVX memory interface such as on Sandybridge.
+ if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
+ Cost *= 2;
return Cost;
}
@@ -1001,14 +1135,14 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
+ VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
(Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
- Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
@@ -1171,7 +1305,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
- // We need at least one instruction to materialze the constant.
+ // We need at least one instruction to materialize the constant.
return std::max(1, Cost);
}
@@ -1314,7 +1448,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (IndexSize < 64 || !GEP)
return IndexSize;
-
+
unsigned NumOfVarIndices = 0;
Value *Ptrs = GEP->getPointerOperand();
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
@@ -1339,7 +1473,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
DL.getPointerSizeInBits();
- Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
+ Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
@@ -1374,10 +1508,10 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
int MaskUnpackCost = 0;
if (VariableMask) {
VectorType *MaskTy =
- VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
+ VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost =
- getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
+ getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
@@ -1438,7 +1572,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- return (DataWidth >= 32 && ST->hasAVX2());
+ return (DataWidth >= 32 && ST->hasAVX()) ||
+ (DataWidth >= 8 && ST->hasBWI());
}
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index adb745e912d1..ab8046bb9fd4 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -80,6 +80,11 @@ public:
bool VariableMask, unsigned Alignment);
int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+ int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Type *> Tys, FastMathFlags FMF);
+ int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Value *> Args, FastMathFlags FMF);
+
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
int getIntImmCost(int64_t);
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 6925b272b4a5..9320e1e2226f 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -38,6 +38,10 @@ namespace {
VZeroUpperInserter() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
const char *getPassName() const override {return "X86 vzeroupper inserter";}
private:
@@ -80,6 +84,7 @@ namespace {
BlockStateMap BlockStates;
DirtySuccessorsWorkList DirtySuccessors;
bool EverMadeChange;
+ bool IsX86INTR;
const TargetInstrInfo *TII;
static char ID;
@@ -122,10 +127,9 @@ static bool clobbersAllYmmRegs(const MachineOperand &MO) {
return true;
}
-static bool hasYmmReg(MachineInstr *MI) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+static bool hasYmmReg(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
return true;
if (!MO.isReg())
continue;
@@ -137,12 +141,10 @@ static bool hasYmmReg(MachineInstr *MI) {
return false;
}
-/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
-/// instruction.
-static bool callClobbersAnyYmmReg(MachineInstr *MI) {
- assert(MI->isCall() && "Can only be called on call instructions.");
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+/// Check if any YMM register will be clobbered by this instruction.
+static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+ assert(MI.isCall() && "Can only be called on call instructions.");
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isRegMask())
continue;
for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
@@ -153,16 +155,16 @@ static bool callClobbersAnyYmmReg(MachineInstr *MI) {
return false;
}
-// Insert a vzeroupper instruction before I.
+/// Insert a vzeroupper instruction before I.
void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
- MachineBasicBlock &MBB) {
+ MachineBasicBlock &MBB) {
DebugLoc dl = I->getDebugLoc();
BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
++NumVZU;
EverMadeChange = true;
}
-// Add MBB to the DirtySuccessors list if it hasn't already been added.
+/// Add MBB to the DirtySuccessors list if it hasn't already been added.
void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
DirtySuccessors.push_back(&MBB);
@@ -170,21 +172,29 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
}
}
-/// processBasicBlock - Loop over all of the instructions in the basic block,
-/// inserting vzeroupper instructions before function calls.
+/// Loop over all of the instructions in the basic block, inserting vzeroupper
+/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
- // Start by assuming that the block PASS_THROUGH, which implies no unguarded
+ // Start by assuming that the block is PASS_THROUGH, which implies no unguarded
// calls.
BlockExitState CurState = PASS_THROUGH;
BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
- MachineInstr *MI = I;
- bool isControlFlow = MI->isCall() || MI->isReturn();
+ for (MachineInstr &MI : MBB) {
+ // No need for vzeroupper before iret in an interrupt handler function;
+ // the epilogue will restore YMM registers if needed.
+ bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
+ bool IsControlFlow = MI.isCall() || MI.isReturn();
+
+ // An existing VZERO* instruction resets the state.
+ if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
+ CurState = EXITS_CLEAN;
+ continue;
+ }
// Shortcut: don't need to check regular instructions in dirty state.
- if (!isControlFlow && CurState == EXITS_DIRTY)
+ if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
continue;
if (hasYmmReg(MI)) {
@@ -196,7 +206,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// Check for control-flow out of the current function (which might
// indirectly execute SSE instructions).
- if (!isControlFlow)
+ if (!IsControlFlow || IsReturnFromX86INTR)
continue;
// If the call won't clobber any YMM register, skip it as well. It usually
@@ -204,22 +214,21 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// standard calling convention is not used (RegMask is not used to mark
// register clobbered and register usage (def/imp-def/use) is well-defined
// and explicitly specified.
- if (MI->isCall() && !callClobbersAnyYmmReg(MI))
+ if (MI.isCall() && !callClobbersAnyYmmReg(MI))
continue;
- // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
- // registers. This instruction has zero latency. In addition, the processor
- // changes back to Clean state, after which execution of Intel SSE
- // instructions or Intel AVX instructions has no transition penalty. Add
- // the VZEROUPPER instruction before any function call/return that might
- // execute SSE code.
+ // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // registers. In addition, the processor changes back to Clean state, after
+ // which execution of SSE instructions or AVX instructions has no transition
+ // penalty. Add the VZEROUPPER instruction before any function call/return
+ // that might execute SSE code.
// FIXME: In some cases, we may want to move the VZEROUPPER into a
// predecessor block.
if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
// other YMM may appear before other subsequent calls or even before
// the end of the BB.
- insertVZeroUpper(I, MBB);
+ insertVZeroUpper(MI, MBB);
CurState = EXITS_CLEAN;
} else if (CurState == PASS_THROUGH) {
// If this block is currently in pass-through state and we encounter a
@@ -227,7 +236,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// block has successors that exit dirty. Record the location of the call,
// and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
// It will be inserted later if necessary.
- BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
CurState = EXITS_CLEAN;
}
}
@@ -244,15 +253,16 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
BlockStates[MBB.getNumber()].ExitState = CurState;
}
-/// runOnMachineFunction - Loop over all of the basic blocks, inserting
-/// vzeroupper instructions before function calls.
+/// Loop over all of the basic blocks, inserting vzeroupper instructions before
+/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasAVX512())
+ if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
+ IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
@@ -284,12 +294,12 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
processBasicBlock(MBB);
- // If any YMM regs are live in to this function, add the entry block to the
+ // If any YMM regs are live-in to this function, add the entry block to the
// DirtySuccessors list
if (FnHasLiveInYmm)
addDirtySuccessor(MF.front());
- // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
+ // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
// vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
// through PASS_THROUGH blocks.
while (!DirtySuccessors.empty()) {
@@ -302,16 +312,14 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
if (BBState.FirstUnguardedCall != MBB.end())
insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
- // If this successor was a pass-through block then it is now dirty, and its
+ // If this successor was a pass-through block, then it is now dirty. Its
// successors need to be added to the worklist (if they haven't been
// already).
if (BBState.ExitState == PASS_THROUGH) {
DEBUG(dbgs() << "MBB #" << MBB.getNumber()
<< " was Pass-through, is now Dirty-out.\n");
- for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
- SE = MBB.succ_end();
- SI != SE; ++SI)
- addDirtySuccessor(**SI);
+ for (MachineBasicBlock *Succ : MBB.successors())
+ addDirtySuccessor(*Succ);
}
}
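
For readers unfamiliar with why this pass exists, here is a hedged illustration (not part of the change; the "SSE-only" callee is a stand-in, and the file is assumed to be compiled with AVX enabled) of the kind of mixed AVX/SSE code it guards: after the 256-bit work the upper YMM halves are dirty, so the pass inserts vzeroupper before the call to avoid the AVX-to-SSE transition penalty in the callee.

#include <immintrin.h>

// Stand-in for a routine from a library built without AVX (SSE only).
void sseOnlyConsumer(const float *p) {
  volatile float x = _mm_cvtss_f32(_mm_load_ss(p)); // plain SSE instruction
  (void)x;
}

void scaleEight(float *dst, const float *src) {
  __m256 v = _mm256_loadu_ps(src);                  // YMM state becomes dirty
  _mm256_storeu_ps(dst, _mm256_mul_ps(v, _mm256_set1_ps(2.0f)));
  // The pass inserts 'vzeroupper' right here, before the call; written by
  // hand the equivalent would be _mm256_zeroupper().
  sseOnlyConsumer(dst);
}
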
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
new file mode 100644
index 000000000000..cc82074e685f
--- /dev/null
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -0,0 +1,294 @@
+//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands WinAlloca pseudo-instructions.
+//
+// It performs a conservative analysis to determine whether each allocation
+// falls within a region of the stack that is safe to use, or whether stack
+// probes must be emitted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86WinAllocaExpander : public MachineFunctionPass {
+public:
+ X86WinAllocaExpander() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Strategies for lowering a WinAlloca.
+ enum Lowering { TouchAndSub, Sub, Probe };
+
+ /// Deterministic-order map from WinAlloca instruction to desired lowering.
+ typedef MapVector<MachineInstr*, Lowering> LoweringMap;
+
+ /// Compute which lowering to use for each WinAlloca instruction.
+ void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings);
+
+ /// Get the appropriate lowering based on current offset and amount.
+ Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount);
+
+ /// Lower a WinAlloca instruction.
+ void lower(MachineInstr* MI, Lowering L);
+
+ MachineRegisterInfo *MRI;
+ const X86Subtarget *STI;
+ const TargetInstrInfo *TII;
+ const X86RegisterInfo *TRI;
+ unsigned StackPtr;
+ unsigned SlotSize;
+ int64_t StackProbeSize;
+
+ const char *getPassName() const override { return "X86 WinAlloca Expander"; }
+ static char ID;
+};
+
+char X86WinAllocaExpander::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86WinAllocaExpander() {
+ return new X86WinAllocaExpander();
+}
+
+/// Return the allocation amount for a WinAlloca instruction, or -1 if unknown.
+static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
+ assert(MI->getOpcode() == X86::WIN_ALLOCA_32 ||
+ MI->getOpcode() == X86::WIN_ALLOCA_64);
+ assert(MI->getOperand(0).isReg());
+
+ unsigned AmountReg = MI->getOperand(0).getReg();
+ MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
+
+ // Look through copies.
+ while (Def && Def->isCopy() && Def->getOperand(1).isReg())
+ Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg());
+
+ if (!Def ||
+ (Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) ||
+ !Def->getOperand(1).isImm())
+ return -1;
+
+ return Def->getOperand(1).getImm();
+}
+
+X86WinAllocaExpander::Lowering
+X86WinAllocaExpander::getLowering(int64_t CurrentOffset,
+ int64_t AllocaAmount) {
+ // For a non-constant amount or a large amount, we have to probe.
+ if (AllocaAmount < 0 || AllocaAmount > StackProbeSize)
+ return Probe;
+
+ // If it fits within the safe region of the stack, just subtract.
+ if (CurrentOffset + AllocaAmount <= StackProbeSize)
+ return Sub;
+
+ // Otherwise, touch the current tip of the stack, then subtract.
+ return TouchAndSub;
+}
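
A few concrete inputs make the decision rule clearer. This is a standalone re-statement of getLowering() with the default 4096-byte probe size, for illustration only (not LLVM code):

#include <cassert>
#include <cstdint>

enum Lowering { TouchAndSub, Sub, Probe };

static Lowering pick(int64_t CurrentOffset, int64_t Amount,
                     int64_t StackProbeSize = 4096) {
  if (Amount < 0 || Amount > StackProbeSize)
    return Probe;                         // unknown size or larger than a page
  if (CurrentOffset + Amount <= StackProbeSize)
    return Sub;                           // stays inside the already-touched page
  return TouchAndSub;                     // touch the stack tip, then subtract
}

int main() {
  assert(pick(0, 64) == Sub);             // small alloca right after a call/push
  assert(pick(4000, 512) == TouchAndSub); // would step past the guard page
  assert(pick(0, 8192) == Probe);         // needs the full stack-probe sequence
  assert(pick(0, -1) == Probe);           // non-constant amount
  return 0;
}
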
+
+static bool isPushPop(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ case X86::PUSH64i8:
+ case X86::PUSH64r:
+ case X86::PUSH64rmm:
+ case X86::PUSH64rmr:
+ case X86::PUSH64i32:
+ case X86::POP32r:
+ case X86::POP64r:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
+ LoweringMap &Lowerings) {
+ // Do a one-pass reverse post-order walk of the CFG to conservatively estimate
+ // the offset between the stack pointer and the lowest touched part of the
+ // stack, and use that to decide how to lower each WinAlloca instruction.
+
+ // Initialize OutOffset[B], the stack offset at exit from B, to something big.
+ DenseMap<MachineBasicBlock *, int64_t> OutOffset;
+ for (MachineBasicBlock &MBB : MF)
+ OutOffset[&MBB] = INT32_MAX;
+
+ // Note: we don't know the offset at the start of the entry block since the
+ // prologue hasn't been inserted yet, and how much that will adjust the stack
+ // pointer depends on register spills, which have not been computed yet.
+
+ // Compute the reverse post-order.
+ ReversePostOrderTraversal<MachineFunction*> RPO(&MF);
+
+ for (MachineBasicBlock *MBB : RPO) {
+ int64_t Offset = -1;
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ Offset = std::max(Offset, OutOffset[Pred]);
+ if (Offset == -1) Offset = INT32_MAX;
+
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == X86::WIN_ALLOCA_32 ||
+ MI.getOpcode() == X86::WIN_ALLOCA_64) {
+ // A WinAlloca moves StackPtr, and potentially touches it.
+ int64_t Amount = getWinAllocaAmount(&MI, MRI);
+ Lowering L = getLowering(Offset, Amount);
+ Lowerings[&MI] = L;
+ switch (L) {
+ case Sub:
+ Offset += Amount;
+ break;
+ case TouchAndSub:
+ Offset = Amount;
+ break;
+ case Probe:
+ Offset = 0;
+ break;
+ }
+ } else if (MI.isCall() || isPushPop(MI)) {
+ // Calls, pushes and pops touch the tip of the stack.
+ Offset = 0;
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKUP32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKUP64) {
+ Offset -= MI.getOperand(0).getImm();
+ } else if (MI.getOpcode() == X86::ADJCALLSTACKDOWN32 ||
+ MI.getOpcode() == X86::ADJCALLSTACKDOWN64) {
+ Offset += MI.getOperand(0).getImm();
+ } else if (MI.modifiesRegister(StackPtr, TRI)) {
+ // Any other modification of SP means we've lost track of it.
+ Offset = INT32_MAX;
+ }
+ }
+
+ OutOffset[MBB] = Offset;
+ }
+}
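
The merge rule at the top of the loop can be seen in isolation in the toy sketch below (block names are made up): the offset at a block entry is the worst case, i.e. the maximum, over its predecessors' exit offsets, and INT32_MAX stands in for "unknown, assume the worst", which is also what a block with no visited predecessor (such as the entry block) receives.

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Exit offsets of the two predecessors of a join block.
  std::map<std::string, int64_t> OutOffset = {
      {"then", 0},      // ended with a call: the stack tip was just touched
      {"else", 4000}};  // ended 4000 bytes above the lowest touched slot
  std::vector<std::string> Preds = {"then", "else"};

  int64_t Offset = -1;
  for (const auto &P : Preds)
    Offset = std::max(Offset, OutOffset[P]);
  if (Offset == -1)
    Offset = INT32_MAX;  // no predecessor information: assume the worst

  std::printf("offset at the join block: %lld\n", (long long)Offset);
  return 0;
}
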
+
+static unsigned getSubOpcode(bool Is64Bit, int64_t Amount) {
+ if (Is64Bit)
+ return isInt<8>(Amount) ? X86::SUB64ri8 : X86::SUB64ri32;
+ return isInt<8>(Amount) ? X86::SUB32ri8 : X86::SUB32ri;
+}
+
+void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator I = *MI;
+
+ int64_t Amount = getWinAllocaAmount(MI, MRI);
+ if (Amount == 0) {
+ MI->eraseFromParent();
+ return;
+ }
+
+ bool Is64Bit = STI->is64Bit();
+ assert(SlotSize == 4 || SlotSize == 8);
+ unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX;
+
+ switch (L) {
+ case TouchAndSub:
+ assert(Amount >= SlotSize);
+
+ // Use a push to touch the top of the stack.
+ BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(RegA, RegState::Undef);
+ Amount -= SlotSize;
+ if (!Amount)
+ break;
+
+ // Fall through to make any remaining adjustment.
+ case Sub:
+ assert(Amount > 0);
+ if (Amount == SlotSize) {
+ // Use push to save size.
+ BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(RegA, RegState::Undef);
+ } else {
+ // Sub.
+ BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ }
+ break;
+ case Probe:
+ // The probe lowering expects the amount in RAX/EAX.
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+ .addReg(MI->getOperand(0).getReg());
+
+ // Do the probe.
+ STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+ /*InPrologue=*/false);
+ break;
+ }
+
+ unsigned AmountReg = MI->getOperand(0).getReg();
+ MI->eraseFromParent();
+
+ // Delete the definition of AmountReg, possibly walking a chain of copies.
+ for (;;) {
+ if (!MRI->use_empty(AmountReg))
+ break;
+ MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg);
+ if (!AmountDef)
+ break;
+ if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg())
+ AmountReg = AmountDef->getOperand(1).getReg();
+ AmountDef->eraseFromParent();
+ break;
+ }
+}
+
+bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getInfo<X86MachineFunctionInfo>()->hasWinAlloca())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ StackPtr = TRI->getStackRegister();
+ SlotSize = TRI->getSlotSize();
+
+ StackProbeSize = 4096;
+ if (MF.getFunction()->hasFnAttribute("stack-probe-size")) {
+ MF.getFunction()
+ ->getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ }
+
+ LoweringMap Lowerings;
+ computeLowerings(MF, Lowerings);
+ for (auto &P : Lowerings)
+ lower(P.first, P.second);
+
+ return true;
+}
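
As a hedged usage illustration (not from the patch; helper names are invented), this is the kind of Windows-targeted source that reaches the pass: a dynamic stack allocation becomes a WinAlloca pseudo, a small constant size like the first case is expected to lower to a plain SUB of the stack pointer, and a runtime-sized or page-crossing allocation keeps the full stack-probe sequence.

#include <malloc.h>   // MSVC/MinGW _alloca
#include <cstring>

static void use(const char *) {}

void smallDynamicBuffer() {
  // Constant 64-byte request, well under the 4096-byte probe size.
  char *buf = static_cast<char *>(_alloca(64));
  std::memset(buf, 0, 64);
  use(buf);
}

void runtimeSizedBuffer(size_t n) {
  // Non-constant request: getWinAllocaAmount() returns -1, so this one
  // is lowered with the stack-probe call.
  char *buf = static_cast<char *>(_alloca(n));
  std::memset(buf, 0, n);
  use(buf);
}
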
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index dce94a9e9ef7..99387edef99a 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -15,33 +15,32 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include <deque>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define DEBUG_TYPE "winehstate"
-namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); }
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
namespace {
+const int OverdefinedState = INT_MIN;
+
class WinEHStatePass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
@@ -68,12 +67,20 @@ private:
void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
void unlinkExceptionRegistration(IRBuilder<> &Builder);
void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
- void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State);
+ void insertStateNumberStore(Instruction *IP, int State);
Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
Function *generateLSDAInEAXThunk(Function *ParentFunc);
+ bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
+ void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
+ Value *State);
+ int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, BasicBlock *BB);
+ int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, CallSite CS);
+
// Module-level type getters.
Type *getEHLinkRegistrationType();
Type *getSEHRegistrationType();
@@ -84,20 +91,23 @@ private:
StructType *EHLinkRegistrationTy = nullptr;
StructType *CXXEHRegistrationTy = nullptr;
StructType *SEHRegistrationTy = nullptr;
- Function *FrameRecover = nullptr;
- Function *FrameAddress = nullptr;
- Function *FrameEscape = nullptr;
+ Constant *SetJmp3 = nullptr;
+ Constant *CxxLongjmpUnwind = nullptr;
// Per-function state
EHPersonality Personality = EHPersonality::Unknown;
Function *PersonalityFn = nullptr;
+ bool UseStackGuard = false;
+ int ParentBaseState;
+ Constant *SehLongjmpUnwind = nullptr;
+ Constant *Cookie = nullptr;
/// The stack allocation containing all EH data, including the link in the
/// fs:00 chain and the current state.
AllocaInst *RegNode = nullptr;
- /// Struct type of RegNode. Used for GEPing.
- Type *RegNodeTy = nullptr;
+ // The allocation containing the EH security guard.
+ AllocaInst *EHGuardNode = nullptr;
/// The index of the state field of RegNode.
int StateFieldIndex = ~0U;
@@ -116,9 +126,6 @@ INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
bool WinEHStatePass::doInitialization(Module &M) {
TheModule = &M;
- FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
- FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover);
- FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress);
return false;
}
@@ -128,9 +135,10 @@ bool WinEHStatePass::doFinalization(Module &M) {
EHLinkRegistrationTy = nullptr;
CXXEHRegistrationTy = nullptr;
SEHRegistrationTy = nullptr;
- FrameEscape = nullptr;
- FrameRecover = nullptr;
- FrameAddress = nullptr;
+ SetJmp3 = nullptr;
+ CxxLongjmpUnwind = nullptr;
+ SehLongjmpUnwind = nullptr;
+ Cookie = nullptr;
return false;
}
@@ -164,6 +172,13 @@ bool WinEHStatePass::runOnFunction(Function &F) {
if (!HasPads)
return false;
+ Type *Int8PtrType = Type::getInt8PtrTy(TheModule->getContext());
+ SetJmp3 = TheModule->getOrInsertFunction(
+ "_setjmp3", FunctionType::get(
+ Type::getInt32Ty(TheModule->getContext()),
+ {Int8PtrType, Type::getInt32Ty(TheModule->getContext())},
+ /*isVarArg=*/true));
+
// Disable frame pointer elimination in this function.
// FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
// use an arbitrary register?
@@ -182,6 +197,10 @@ bool WinEHStatePass::runOnFunction(Function &F) {
// Reset per-function state.
PersonalityFn = nullptr;
Personality = EHPersonality::Unknown;
+ UseStackGuard = false;
+ RegNode = nullptr;
+ EHGuardNode = nullptr;
+
return true;
}
@@ -256,9 +275,14 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
assert(Personality == EHPersonality::MSVC_CXX ||
Personality == EHPersonality::MSVC_X86SEH);
- StringRef PersonalityName = PersonalityFn->getName();
+ // Struct type of RegNode. Used for GEPing.
+ Type *RegNodeTy;
+
IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
Type *Int8PtrType = Builder.getInt8PtrTy();
+ Type *Int32Ty = Builder.getInt32Ty();
+ Type *VoidTy = Builder.getVoidTy();
+
if (Personality == EHPersonality::MSVC_CXX) {
RegNodeTy = getCXXEHRegistrationType();
RegNode = Builder.CreateAlloca(RegNodeTy);
@@ -268,42 +292,71 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -1
StateFieldIndex = 2;
- insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1);
+ ParentBaseState = -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
// Handler = __ehhandler$F
Function *Trampoline = generateLSDAInEAXThunk(F);
Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
linkExceptionRegistration(Builder, Trampoline);
+
+ CxxLongjmpUnwind = TheModule->getOrInsertFunction(
+ "__CxxLongjmpUnwind",
+ FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
+ cast<Function>(CxxLongjmpUnwind->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
} else if (Personality == EHPersonality::MSVC_X86SEH) {
// If _except_handler4 is in use, some additional guard checks and prologue
// stuff is required.
- bool UseStackGuard = (PersonalityName == "_except_handler4");
+ StringRef PersonalityName = PersonalityFn->getName();
+ UseStackGuard = (PersonalityName == "_except_handler4");
+
+ // Allocate local structures.
RegNodeTy = getSEHRegistrationType();
RegNode = Builder.CreateAlloca(RegNodeTy);
+ if (UseStackGuard)
+ EHGuardNode = Builder.CreateAlloca(Int32Ty);
+
// SavedESP = llvm.stacksave()
Value *SP = Builder.CreateCall(
Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -2 / -1
StateFieldIndex = 4;
- insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(),
- UseStackGuard ? -2 : -1);
+ ParentBaseState = UseStackGuard ? -2 : -1;
+ insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
// ScopeTable = llvm.x86.seh.lsda(F)
- Value *FI8 = Builder.CreateBitCast(F, Int8PtrType);
- Value *LSDA = Builder.CreateCall(
- Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
- Type *Int32Ty = Type::getInt32Ty(TheModule->getContext());
+ Value *LSDA = emitEHLSDA(Builder, F);
LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
// If using _except_handler4, xor the address of the table with
// __security_cookie.
if (UseStackGuard) {
- Value *Cookie =
- TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
- Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+ Cookie = TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie, "cookie");
LSDA = Builder.CreateXor(LSDA, Val);
}
Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+
+ // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
+ if (UseStackGuard) {
+ Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+ Value *FrameAddr = Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+ Builder.getInt32(0), "frameaddr");
+ Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
+ FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
+ Builder.CreateStore(FrameAddrI32, EHGuardNode);
+ }
+
+ // Register the exception handler.
Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
linkExceptionRegistration(Builder, PersonalityFn);
+
+ SehLongjmpUnwind = TheModule->getOrInsertFunction(
+ UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
+ FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
+ /*isVarArg=*/false));
+ cast<Function>(SehLongjmpUnwind->stripPointerCasts())
+ ->setCallingConv(CallingConv::X86_StdCall);
} else {
llvm_unreachable("unexpected personality function");
}
@@ -398,15 +451,203 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
Builder.CreateStore(Next, FSZero);
}
+// Calls to setjmp(p) are lowered to _setjmp3(p, 0) by the frontend.
+// The idea behind _setjmp3 is that it takes an optional number of personality
+// specific parameters to indicate how to restore the personality-specific frame
+// state when longjmp is initiated. Typically, the current TryLevel is saved.
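+//
+// For illustration (the exact operands are assembled in rewriteSetJmpCallSite
+// below), a rewritten call roughly takes one of these shapes:
+//   C++ EH:            _setjmp3(buf, 3, __CxxLongjmpUnwind, State, LSDA)
+//   _except_handler4:  _setjmp3(buf, 3, _seh_longjmp_unwind4, State, &__security_cookie)
+//   _except_handler3:  _setjmp3(buf, 2, _seh_longjmp_unwind, State)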
+void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
+ CallSite CS, Value *State) {
+ // Don't rewrite calls with a weird number of arguments.
+ if (CS.getNumArgOperands() != 2)
+ return;
+
+ Instruction *Inst = CS.getInstruction();
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ SmallVector<Value *, 3> OptionalArgs;
+ if (Personality == EHPersonality::MSVC_CXX) {
+ OptionalArgs.push_back(CxxLongjmpUnwind);
+ OptionalArgs.push_back(State);
+ OptionalArgs.push_back(emitEHLSDA(Builder, &F));
+ } else if (Personality == EHPersonality::MSVC_X86SEH) {
+ OptionalArgs.push_back(SehLongjmpUnwind);
+ OptionalArgs.push_back(State);
+ if (UseStackGuard)
+ OptionalArgs.push_back(Cookie);
+ } else {
+ llvm_unreachable("unhandled personality!");
+ }
+
+ SmallVector<Value *, 5> Args;
+ Args.push_back(
+ Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+ Args.push_back(Builder.getInt32(OptionalArgs.size()));
+ Args.append(OptionalArgs.begin(), OptionalArgs.end());
+
+ CallSite NewCS;
+ if (CS.isCall()) {
+ auto *CI = cast<CallInst>(Inst);
+ CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
+ NewCI->setTailCallKind(CI->getTailCallKind());
+ NewCS = NewCI;
+ } else {
+ auto *II = cast<InvokeInst>(Inst);
+ NewCS = Builder.CreateInvoke(
+ SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(CS.getAttributes());
+ NewCS->setDebugLoc(CS->getDebugLoc());
+
+ Instruction *NewInst = NewCS.getInstruction();
+ NewInst->takeName(Inst);
+ Inst->replaceAllUsesWith(NewInst);
+ Inst->eraseFromParent();
+}
+
+// Figure out what state we should assign calls in this block.
+int WinEHStatePass::getBaseStateForBB(
+ DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+ BasicBlock *BB) {
+ int BaseState = ParentBaseState;
+ auto &BBColors = BlockColors[BB];
+
+ assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
+ return BaseState;
+}
+
+// Calculate the state a call-site is in.
+int WinEHStatePass::getStateForCallSite(
+ DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+ CallSite CS) {
+ if (auto *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ // Look up the state number of the EH pad this unwinds to.
+ assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+ return FuncInfo.InvokeStateMap[II];
+ }
+  // Possibly throwing call instructions have no actions to take after
+  // an unwind. Ensure they are in the base state of their enclosing funclet.
+ return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+}
+
+// Calculate the intersection of all the FinalStates for a BasicBlock's
+// predecessors.
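+//
+// For example, if every predecessor ends in state 1, this block can be assumed
+// to start in state 1 and no extra store is needed before its first call-site;
+// if the predecessors end in different (or unknown) states, the block is
+// reported as OverdefinedState and a store must re-establish the state.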
+static int getPredState(DenseMap<BasicBlock *, int> &FinalStates, Function &F,
+ int ParentBaseState, BasicBlock *BB) {
+ // The entry block has no predecessors but we know that the prologue always
+ // sets us up with a fixed state.
+ if (&F.getEntryBlock() == BB)
+ return ParentBaseState;
+
+ // This is an EH Pad, conservatively report this basic block as overdefined.
+ if (BB->isEHPad())
+ return OverdefinedState;
+
+ int CommonState = OverdefinedState;
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // We didn't manage to get a state for one of these predecessors,
+ // conservatively report this basic block as overdefined.
+ auto PredEndState = FinalStates.find(PredBB);
+ if (PredEndState == FinalStates.end())
+ return OverdefinedState;
+
+ // This code is reachable via exceptional control flow,
+ // conservatively report this basic block as overdefined.
+ if (isa<CatchReturnInst>(PredBB->getTerminator()))
+ return OverdefinedState;
+
+ int PredState = PredEndState->second;
+ assert(PredState != OverdefinedState &&
+ "overdefined BBs shouldn't be in FinalStates");
+ if (CommonState == OverdefinedState)
+ CommonState = PredState;
+
+ // At least two predecessors have different FinalStates,
+ // conservatively report this basic block as overdefined.
+ if (CommonState != PredState)
+ return OverdefinedState;
+ }
+
+ return CommonState;
+}
+
+// Calculate the intersection of all the InitialStates for a BasicBlock's
+// successors.
+static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
+ int ParentBaseState, BasicBlock *BB) {
+ // This block rejoins normal control flow,
+ // conservatively report this basic block as overdefined.
+ if (isa<CatchReturnInst>(BB->getTerminator()))
+ return OverdefinedState;
+
+ int CommonState = OverdefinedState;
+ for (BasicBlock *SuccBB : successors(BB)) {
+    // We didn't manage to get a state for one of these successors,
+ // conservatively report this basic block as overdefined.
+ auto SuccStartState = InitialStates.find(SuccBB);
+ if (SuccStartState == InitialStates.end())
+ return OverdefinedState;
+
+ // This is an EH Pad, conservatively report this basic block as overdefined.
+ if (SuccBB->isEHPad())
+ return OverdefinedState;
+
+ int SuccState = SuccStartState->second;
+    assert(SuccState != OverdefinedState &&
+           "overdefined BBs shouldn't be in InitialStates");
+ if (CommonState == OverdefinedState)
+ CommonState = SuccState;
+
+ // At least two successors have different InitialStates,
+ // conservatively report this basic block as overdefined.
+ if (CommonState != SuccState)
+ return OverdefinedState;
+ }
+
+ return CommonState;
+}
+
+bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
+ CallSite CS) {
+ if (!CS)
+ return false;
+
+ // If the function touches memory, it needs a state store.
+ if (isAsynchronousEHPersonality(Personality))
+ return !CS.doesNotAccessMemory();
+
+ // If the function throws, it needs a state store.
+ return !CS.doesNotThrow();
+}
+
void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
// Mark the registration node. The backend needs to know which alloca it is so
// that it can recover the original frame pointer.
- IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator()));
+ IRBuilder<> Builder(RegNode->getNextNode());
Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
Builder.CreateCall(
Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
{RegNodeI8});
+ if (EHGuardNode) {
+ IRBuilder<> Builder(EHGuardNode->getNextNode());
+ Value *EHGuardNodeI8 =
+ Builder.CreateBitCast(EHGuardNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard),
+ {EHGuardNodeI8});
+ }
+
// Calculate state numbers.
if (isAsynchronousEHPersonality(Personality))
calculateSEHStateNumbers(&F, FuncInfo);
@@ -415,42 +656,141 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
// Iterate all the instructions and emit state number stores.
DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
- for (BasicBlock &BB : F) {
- // Figure out what state we should assign calls in this block.
- int BaseState = -1;
- auto &BBColors = BlockColors[&BB];
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // InitialStates yields the state of the first call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> InitialStates;
+ // FinalStates yields the state of the last call-site for a BasicBlock.
+ DenseMap<BasicBlock *, int> FinalStates;
+ // Worklist used to revisit BasicBlocks with indeterminate
+ // Initial/Final-States.
+ std::deque<BasicBlock *> Worklist;
+ // Fill in InitialStates and FinalStates for BasicBlocks with call-sites.
+ for (BasicBlock *BB : RPOT) {
+ int InitialState = OverdefinedState;
+ int FinalState;
+ if (&F.getEntryBlock() == BB)
+ InitialState = FinalState = ParentBaseState;
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!isStateStoreNeeded(Personality, CS))
+ continue;
+
+ int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ if (InitialState == OverdefinedState)
+ InitialState = State;
+ FinalState = State;
+ }
+ // No call-sites in this basic block? That's OK, we will come back to these
+ // in a later pass.
+ if (InitialState == OverdefinedState) {
+ Worklist.push_back(BB);
+ continue;
+ }
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " InitialState=" << InitialState << '\n');
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " FinalState=" << FinalState << '\n');
+ InitialStates.insert({BB, InitialState});
+ FinalStates.insert({BB, FinalState});
+ }
- assert(BBColors.size() == 1 &&
- "multi-color BB not removed by preparation");
+ // Try to fill-in InitialStates and FinalStates which have no call-sites.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.front();
+ Worklist.pop_front();
+ // This BasicBlock has already been figured out, nothing more we can do.
+ if (InitialStates.count(BB) != 0)
+ continue;
+
+ int PredState = getPredState(FinalStates, F, ParentBaseState, BB);
+ if (PredState == OverdefinedState)
+ continue;
+
+    // We successfully inferred this BasicBlock's state via its predecessors;
+    // enqueue its successors to see if we can infer their states.
+ InitialStates.insert({BB, PredState});
+ FinalStates.insert({BB, PredState});
+ for (BasicBlock *SuccBB : successors(BB))
+ Worklist.push_back(SuccBB);
+ }
+
+ // Try to hoist stores from successors.
+ for (BasicBlock *BB : RPOT) {
+ int SuccState = getSuccState(InitialStates, F, ParentBaseState, BB);
+ if (SuccState == OverdefinedState)
+ continue;
+
+ // Update our FinalState to reflect the common InitialState of our
+ // successors.
+ FinalStates.insert({BB, SuccState});
+ }
+
+ // Finally, insert state stores before call-sites which transition us to a new
+ // state.
+ for (BasicBlock *BB : RPOT) {
+ auto &BBColors = BlockColors[BB];
BasicBlock *FuncletEntryBB = BBColors.front();
- if (auto *FuncletPad =
- dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
- auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
- if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
- BaseState = BaseStateI->second;
+ if (isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI()))
+ continue;
+
+ int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
+ DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+ << " PrevState=" << PrevState << '\n');
+
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!isStateStoreNeeded(Personality, CS))
+ continue;
+
+ int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ if (State != PrevState)
+ insertStateNumberStore(&I, State);
+ PrevState = State;
}
- for (Instruction &I : BB) {
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- // Possibly throwing call instructions have no actions to take after
- // an unwind. Ensure they are in the -1 state.
- if (CI->doesNotThrow())
- continue;
- insertStateNumberStore(RegNode, CI, BaseState);
- } else if (auto *II = dyn_cast<InvokeInst>(&I)) {
- // Look up the state number of the landingpad this unwinds to.
- assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
- int State = FuncInfo.InvokeStateMap[II];
- insertStateNumberStore(RegNode, II, State);
- }
+ // We might have hoisted a state store into this block, emit it now.
+ auto EndState = FinalStates.find(BB);
+ if (EndState != FinalStates.end())
+ if (EndState->second != PrevState)
+ insertStateNumberStore(BB->getTerminator(), EndState->second);
+ }
+
+ SmallVector<CallSite, 1> SetJmp3CallSites;
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!CS)
+ continue;
+ if (CS.getCalledValue()->stripPointerCasts() !=
+ SetJmp3->stripPointerCasts())
+ continue;
+
+ SetJmp3CallSites.push_back(CS);
+ }
+ }
+
+ for (CallSite CS : SetJmp3CallSites) {
+ auto &BBColors = BlockColors[CS->getParent()];
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
+
+ IRBuilder<> Builder(CS.getInstruction());
+ Value *State;
+ if (InCleanup) {
+ Value *StateField =
+ Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+ State = Builder.CreateLoad(StateField);
+ } else {
+ State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
}
+ rewriteSetJmpCallSite(Builder, F, CS, State);
}
}
-void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode,
- Instruction *IP, int State) {
+void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
IRBuilder<> Builder(IP);
Value *StateField =
- Builder.CreateStructGEP(RegNodeTy, ParentRegNode, StateFieldIndex);
+ Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
Builder.CreateStore(Builder.getInt32(State), StateField);
}
diff --git a/lib/Target/XCore/Disassembler/Makefile b/lib/Target/XCore/Disassembler/Makefile
deleted file mode 100644
index 4caffdd1da6a..000000000000
--- a/lib/Target/XCore/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/XCore/Disassembler/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMXCoreDisassembler
-
-# Hack: we need to include 'main' XCore target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index aaf267af5311..2e8f762458a7 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -15,7 +15,7 @@
#include "XCore.h"
#include "XCoreRegisterInfo.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/XCore/InstPrinter/Makefile b/lib/Target/XCore/InstPrinter/Makefile
deleted file mode 100644
index 1c1c61299c39..000000000000
--- a/lib/Target/XCore/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/XCore/AsmPrinter/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMXCoreAsmPrinter
-
-# Hack: we need to include 'main' xcore target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/MCTargetDesc/Makefile b/lib/Target/XCore/MCTargetDesc/Makefile
deleted file mode 100644
index de61543bfe9c..000000000000
--- a/lib/Target/XCore/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/XCore/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMXCoreDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index b4085835f285..63ca1e7d4646 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -15,7 +15,6 @@
#include "InstPrinter/XCoreInstPrinter.h"
#include "XCoreMCAsmInfo.h"
#include "XCoreTargetStreamer.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -62,22 +61,13 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createXCoreMCCodeGenInfo(const Triple &TT,
- Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- MCCodeGenInfo *X = new MCCodeGenInfo();
- if (RM == Reloc::Default) {
- RM = Reloc::Static;
- }
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+ CodeModel::Model &CM) {
if (CM == CodeModel::Default) {
CM = CodeModel::Small;
}
if (CM != CodeModel::Small && CM != CodeModel::Large)
report_fatal_error("Target only supports CodeModel Small or Large");
-
- X->initMCCodeGenInfo(RM, CM, OL);
- return X;
}
static MCInstPrinter *createXCoreMCInstPrinter(const Triple &T,
@@ -137,8 +127,8 @@ extern "C" void LLVMInitializeXCoreTargetMC() {
RegisterMCAsmInfoFn X(TheXCoreTarget, createXCoreMCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheXCoreTarget,
- createXCoreMCCodeGenInfo);
+ TargetRegistry::registerMCAdjustCodeGenOpts(TheXCoreTarget,
+ adjustCodeGenOpts);
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
diff --git a/lib/Target/XCore/Makefile b/lib/Target/XCore/Makefile
deleted file mode 100644
index 92ddc8860876..000000000000
--- a/lib/Target/XCore/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/XCore/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMXCoreCodeGen
-TARGET = XCore
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = XCoreGenRegisterInfo.inc XCoreGenInstrInfo.inc \
- XCoreGenAsmWriter.inc \
- XCoreGenDAGISel.inc XCoreGenCallingConv.inc \
- XCoreGenDisassemblerTables.inc XCoreGenSubtargetInfo.inc
-
-DIRS = Disassembler InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/XCore/TargetInfo/Makefile b/lib/Target/XCore/TargetInfo/Makefile
deleted file mode 100644
index f8a409517497..000000000000
--- a/lib/Target/XCore/TargetInfo/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/XCore/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMXCoreInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index b00cdd5040eb..be66e6cb8124 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -93,8 +93,7 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
assert( ( GV->hasExternalLinkage() || GV->hasWeakLinkage() ||
GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() ) &&
"Unexpected linkage");
- if (ArrayType *ATy = dyn_cast<ArrayType>(
- cast<PointerType>(GV->getType())->getElementType())) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(GV->getValueType())) {
MCSymbol *SymGlob = OutContext.getOrCreateSymbol(
Twine(Sym->getName() + StringRef(".globound")));
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index ae493de083b8..75a2eb0fdd2f 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -58,10 +58,9 @@ static bool CompareSSIOffset(const StackSlotInfo& a, const StackSlotInfo& b) {
return a.Offset < b.Offset;
}
-
static void EmitDefCfaRegister(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc dl,
- const TargetInstrInfo &TII,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
MachineModuleInfo *MMI, unsigned DRegNum) {
unsigned CFIIndex = MMI->addFrameInst(
MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum));
@@ -70,8 +69,8 @@ static void EmitDefCfaRegister(MachineBasicBlock &MBB,
}
static void EmitDefCfaOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc dl,
- const TargetInstrInfo &TII,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
MachineModuleInfo *MMI, int Offset) {
unsigned CFIIndex =
MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
@@ -80,7 +79,7 @@ static void EmitDefCfaOffset(MachineBasicBlock &MBB,
}
static void EmitCfiOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
const TargetInstrInfo &TII, MachineModuleInfo *MMI,
unsigned DRegNum, int Offset) {
unsigned CFIIndex = MMI->addFrameInst(
@@ -96,7 +95,7 @@ static void EmitCfiOffset(MachineBasicBlock &MBB,
/// \param OffsetFromTop the spill offset from the top of the frame.
/// \param [in,out] Adjusted the current SP offset from the top of the frame.
static void IfNeededExtSP(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
const TargetInstrInfo &TII, MachineModuleInfo *MMI,
int OffsetFromTop, int &Adjusted, int FrameSize,
bool emitFrameMoves) {
@@ -120,7 +119,7 @@ static void IfNeededExtSP(MachineBasicBlock &MBB,
/// \param [in,out] RemainingAdj the current SP offset from the top of the
/// frame.
static void IfNeededLDAWSP(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
const TargetInstrInfo &TII, int OffsetFromTop,
int &RemainingAdj) {
while (OffsetFromTop < RemainingAdj - MaxImmU16) {
@@ -173,8 +172,9 @@ static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
-static MachineMemOperand *
-getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) {
+static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
+ int FrameIndex,
+ MachineMemOperand::Flags flags) {
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
@@ -187,10 +187,11 @@ getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) {
/// Restore clobbered registers with their spill slot value.
/// The SP will be adjusted at the same time, thus the SpillList must be ordered
/// with the largest (negative) offsets first.
-static void
-RestoreSpillList(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- DebugLoc dl, const TargetInstrInfo &TII, int &RemainingAdj,
- SmallVectorImpl<StackSlotInfo> &SpillList) {
+static void RestoreSpillList(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &dl, const TargetInstrInfo &TII,
+ int &RemainingAdj,
+ SmallVectorImpl<StackSlotInfo> &SpillList) {
for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
@@ -482,9 +483,9 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
-void XCoreFrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator XCoreFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
if (!hasReservedCallFrame(MF)) {
// Turn the adjcallstackdown instruction into 'extsp <amt>' and the
@@ -528,7 +529,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
}
}
- MBB.erase(I);
+ return MBB.erase(I);
}
void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF,
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 69c71adc8d3f..8729d2208bb2 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -41,8 +41,8 @@ namespace llvm {
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
bool hasFP(const MachineFunction &MF) const override;
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index 77292c4f8f52..bd6baef3271e 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -28,6 +27,10 @@ namespace {
XCoreFTAOElim() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &Fn) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
const char *getPassName() const override {
return "XCore FRAME_TO_ARGS_OFFSET Elimination";
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 9f61c84cd445..ce25cbcfd124 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -25,7 +25,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -42,12 +41,12 @@ namespace {
XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(TM, OptLevel) {}
- SDNode *Select(SDNode *N) override;
- SDNode *SelectBRIND(SDNode *N);
+ void Select(SDNode *N) override;
+ bool tryBRIND(SDNode *N);
/// getI32Imm - Return a target constant with the specified value, of type
/// i32.
- inline SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+ inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
}
@@ -70,14 +69,14 @@ namespace {
const char *getPassName() const override {
return "XCore DAG->DAG Pattern Instruction Selection";
- }
-
+ }
+
// Include the pieces autogenerated from the target description.
#include "XCoreGenDAGISel.inc"
};
} // end anonymous namespace
-/// createXCoreISelDag - This pass converts a legalized DAG into a
+/// createXCoreISelDag - This pass converts a legalized DAG into a
/// XCore-specific DAG, ready for instruction scheduling.
///
FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
@@ -130,7 +129,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
return false;
}
-SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
+void XCoreDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
@@ -140,8 +139,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
// Transformation function: get the size of a mask
// Look for the first non-zero bit
SDValue MskSize = getI32Imm(32 - countLeadingZeros((uint32_t)Val), dl);
- return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
- MVT::i32, MskSize);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+ MVT::i32, MskSize));
+ return;
}
else if (!isUInt<16>(Val)) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
@@ -155,57 +155,64 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
MachineMemOperand::MOLoad, 4, 4);
cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
- return node;
+ ReplaceNode(N, node);
+ return;
}
break;
}
case XCoreISD::LADD: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
N->getOperand(2) };
- return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case XCoreISD::LSUB: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
N->getOperand(2) };
- return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case XCoreISD::MACCU: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
N->getOperand(2), N->getOperand(3) };
- return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case XCoreISD::MACCS: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
N->getOperand(2), N->getOperand(3) };
- return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case XCoreISD::LMUL: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
N->getOperand(2), N->getOperand(3) };
- return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case XCoreISD::CRC8: {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
- return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32,
- Ops);
+ ReplaceNode(N, CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32,
+ MVT::i32, Ops));
+ return;
}
case ISD::BRIND:
- if (SDNode *ResNode = SelectBRIND(N))
- return ResNode;
+ if (tryBRIND(N))
+ return;
break;
// Other cases are autogenerated.
}
- return SelectCode(N);
+ SelectCode(N);
}
/// Given a chain return a new chain where any appearance of Old is replaced
/// by New. There must be at most one instruction between Old and Chain and
-/// this instruction must be a TokenFactor. Returns an empty SDValue if
+/// this instruction must be a TokenFactor. Returns an empty SDValue if
/// these conditions don't hold.
static SDValue
replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
@@ -229,16 +236,16 @@ replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Ops);
}
-SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
+bool XCoreDAGToDAGISel::tryBRIND(SDNode *N) {
SDLoc dl(N);
// (brind (int_xcore_checkevent (addr)))
SDValue Chain = N->getOperand(0);
SDValue Addr = N->getOperand(1);
if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
- return nullptr;
+ return false;
unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
if (IntNo != Intrinsic::xcore_checkevent)
- return nullptr;
+ return false;
SDValue nextAddr = Addr->getOperand(2);
SDValue CheckEventChainOut(Addr.getNode(), 1);
if (!CheckEventChainOut.use_empty()) {
@@ -250,7 +257,7 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
CheckEventChainIn);
if (!NewChain.getNode())
- return nullptr;
+ return false;
Chain = NewChain;
}
// Enable events on the thread using setsr 1 and then disable them immediately
@@ -266,8 +273,10 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
constOne, Glue), 0);
if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
- return CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
- nextAddr->getOperand(0), Glue);
+ CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+ nextAddr->getOperand(0), Glue);
+ return true;
}
- return CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+ CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+ return true;
}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 105b2cfb1be6..6f6ac3bd5f4c 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -110,8 +110,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -156,7 +154,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
// Atomic operations
// We request a fence for ATOMIC_* instructions, to reduce them to Monotonic.
// As we are always Sequential Consistent, an ATOMIC_FENCE becomes a no OP.
- setInsertFencesForAtomic(true);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
@@ -257,11 +254,11 @@ SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
// FIXME there is no actual debug info here
SDLoc dl(GA);
- if (GV->getType()->getElementType()->isFunctionTy())
+ if (GV->getValueType()->isFunctionTy())
return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
const auto *GVar = dyn_cast<GlobalVariable>(GV);
- if ((GV->hasSection() && StringRef(GV->getSection()).startswith(".cp.")) ||
+ if ((GV->hasSection() && GV->getSection().startswith(".cp.")) ||
(GVar && GVar->isConstant() && GV->hasLocalLinkage()))
return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
@@ -272,7 +269,7 @@ static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL)
if (XTL.getTargetMachine().getCodeModel() == CodeModel::Small)
return true;
- Type *ObjType = GV->getType()->getPointerElementType();
+ Type *ObjType = GV->getValueType();
if (!ObjType->isSized())
return false;
@@ -309,8 +306,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
Type::getInt8Ty(*DAG.getContext()), GA, Idx);
SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
- DAG.getEntryNode(), CP, MachinePointerInfo(), false,
- false, false, 0);
+ DAG.getEntryNode(), CP, MachinePointerInfo());
}
}
@@ -371,17 +367,15 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
ScaledIndex);
}
-SDValue XCoreTargetLowering::
-lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
- int64_t Offset, SelectionDAG &DAG) const
-{
+SDValue XCoreTargetLowering::lowerLoadWordFromAlignedBasePlusOffset(
+ const SDLoc &DL, SDValue Chain, SDValue Base, int64_t Offset,
+ SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
if ((Offset & 0x3) == 0) {
- return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo(), false,
- false, false, 0);
+ return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo());
}
// Lower to pair of consecutive word aligned loads plus some bit shifting.
- int32_t HighOffset = RoundUpToAlignment(Offset, 4);
+ int32_t HighOffset = alignTo(Offset, 4);
int32_t LowOffset = HighOffset - 4;
SDValue LowAddr, HighAddr;
if (GlobalAddressSDNode *GASD =
@@ -399,10 +393,8 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, DL, MVT::i32);
SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, DL, MVT::i32);
- SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo(),
- false, false, false, 0);
- SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo());
+ SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo());
SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
@@ -462,17 +454,16 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
if (LD->getAlignment() == 2) {
- SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain,
- BasePtr, LD->getPointerInfo(), MVT::i16,
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), 2);
+ SDValue Low =
+ DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr,
+ LD->getPointerInfo(), MVT::i16,
+ /* Alignment = */ 2, LD->getMemOperand()->getFlags());
SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(2, DL, MVT::i32));
- SDValue High = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
- HighAddr,
- LD->getPointerInfo().getWithOffset(2),
- MVT::i16, LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(), 2);
+ SDValue High =
+ DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, HighAddr,
+ LD->getPointerInfo().getWithOffset(2), MVT::i16,
+ /* Alignment = */ 2, LD->getMemOperand()->getFlags());
SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
DAG.getConstant(16, DL, MVT::i32));
SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
@@ -496,7 +487,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
CallingConv::C, IntPtrTy,
DAG.getExternalSymbol("__misaligned_load",
getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ops[] = { CallResult.first, CallResult.second };
@@ -529,16 +520,14 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
SDValue Low = Value;
SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
DAG.getConstant(16, dl, MVT::i32));
- SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
- ST->getPointerInfo(), MVT::i16,
- ST->isVolatile(), ST->isNonTemporal(),
- 2);
+ SDValue StoreLow = DAG.getTruncStore(
+ Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16,
+ /* Alignment = */ 2, ST->getMemOperand()->getFlags());
SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
DAG.getConstant(2, dl, MVT::i32));
- SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
- ST->getPointerInfo().getWithOffset(2),
- MVT::i16, ST->isVolatile(),
- ST->isNonTemporal(), 2);
+ SDValue StoreHigh = DAG.getTruncStore(
+ Chain, dl, High, HighAddr, ST->getPointerInfo().getWithOffset(2),
+ MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
}
@@ -559,7 +548,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__misaligned_store",
getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0);
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
@@ -725,11 +714,9 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Unknown operand to lower!");
- if (N->getOpcode() == ISD::ADD) {
- SDValue Result = TryExpandADDWithMul(N, DAG);
- if (Result.getNode())
+ if (N->getOpcode() == ISD::ADD)
+ if (SDValue Result = TryExpandADDWithMul(N, DAG))
return Result;
- }
SDLoc dl(N);
@@ -774,19 +761,17 @@ LowerVAARG(SDValue Op, SelectionDAG &DAG) const
EVT PtrVT = VAListPtr.getValueType();
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc dl(Node);
- SDValue VAList = DAG.getLoad(PtrVT, dl, InChain,
- VAListPtr, MachinePointerInfo(SV),
- false, false, false, 0);
+ SDValue VAList =
+ DAG.getLoad(PtrVT, dl, InChain, VAListPtr, MachinePointerInfo(SV));
// Increment the pointer, VAList, to the next vararg
SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAList,
DAG.getIntPtrConstant(VT.getSizeInBits() / 8,
dl));
// Store the incremented VAList to the legalized pointer
InChain = DAG.getStore(VAList.getValue(1), dl, nextPtr, VAListPtr,
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
// Load the actual argument out of the pointer VAList
- return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo());
}
SDValue XCoreTargetLowering::
@@ -799,7 +784,7 @@ LowerVASTART(SDValue Op, SelectionDAG &DAG) const
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32);
return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1),
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
}
SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -832,9 +817,9 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
int FI = XFI->createLRSpillSlot(MF);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
- return DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN,
- MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0);
+ return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+ DAG.getEntryNode(), FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
}
SDValue XCoreTargetLowering::
@@ -915,33 +900,31 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
SDValue Addr = Trmp;
SDLoc dl(Op);
- OutChains[0] = DAG.getStore(Chain, dl,
- DAG.getConstant(0x0a3cd805, dl, MVT::i32), Addr,
- MachinePointerInfo(TrmpAddr), false, false, 0);
+ OutChains[0] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(4, dl, MVT::i32));
- OutChains[1] = DAG.getStore(Chain, dl,
- DAG.getConstant(0xd80456c0, dl, MVT::i32), Addr,
- MachinePointerInfo(TrmpAddr, 4), false, false, 0);
+ OutChains[1] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0xd80456c0, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr, 4));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(8, dl, MVT::i32));
- OutChains[2] = DAG.getStore(Chain, dl,
- DAG.getConstant(0x27fb0a3c, dl, MVT::i32), Addr,
- MachinePointerInfo(TrmpAddr, 8), false, false, 0);
+ OutChains[2] =
+ DAG.getStore(Chain, dl, DAG.getConstant(0x27fb0a3c, dl, MVT::i32), Addr,
+ MachinePointerInfo(TrmpAddr, 8));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(12, dl, MVT::i32));
- OutChains[3] = DAG.getStore(Chain, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 12), false, false,
- 0);
+ OutChains[3] =
+ DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(16, dl, MVT::i32));
- OutChains[4] = DAG.getStore(Chain, dl, FPtr, Addr,
- MachinePointerInfo(TrmpAddr, 16), false, false,
- 0);
+ OutChains[4] =
+ DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 16));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -973,29 +956,30 @@ SDValue XCoreTargetLowering::
LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
AtomicSDNode *N = cast<AtomicSDNode>(Op);
assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
- assert(N->getOrdering() <= Monotonic &&
- "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+ assert((N->getOrdering() == AtomicOrdering::Unordered ||
+ N->getOrdering() == AtomicOrdering::Monotonic) &&
+ "setInsertFencesForAtomic(true) expects unordered / monotonic");
if (N->getMemoryVT() == MVT::i32) {
if (N->getAlignment() < 4)
report_fatal_error("atomic load must be aligned");
return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
N->getChain(), N->getBasePtr(), N->getPointerInfo(),
- N->isVolatile(), N->isNonTemporal(), N->isInvariant(),
- N->getAlignment(), N->getAAInfo(), N->getRanges());
+ N->getAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo(), N->getRanges());
}
if (N->getMemoryVT() == MVT::i16) {
if (N->getAlignment() < 2)
report_fatal_error("atomic load must be aligned");
return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
N->getBasePtr(), N->getPointerInfo(), MVT::i16,
- N->isVolatile(), N->isNonTemporal(),
- N->isInvariant(), N->getAlignment(), N->getAAInfo());
+ N->getAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
}
if (N->getMemoryVT() == MVT::i8)
return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
N->getBasePtr(), N->getPointerInfo(), MVT::i8,
- N->isVolatile(), N->isNonTemporal(),
- N->isInvariant(), N->getAlignment(), N->getAAInfo());
+ N->getAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
return SDValue();
}
@@ -1003,29 +987,29 @@ SDValue XCoreTargetLowering::
LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
AtomicSDNode *N = cast<AtomicSDNode>(Op);
assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
- assert(N->getOrdering() <= Monotonic &&
- "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+ assert((N->getOrdering() == AtomicOrdering::Unordered ||
+ N->getOrdering() == AtomicOrdering::Monotonic) &&
+ "setInsertFencesForAtomic(true) expects unordered / monotonic");
if (N->getMemoryVT() == MVT::i32) {
if (N->getAlignment() < 4)
report_fatal_error("atomic store must be aligned");
- return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(),
- N->getBasePtr(), N->getPointerInfo(),
- N->isVolatile(), N->isNonTemporal(),
- N->getAlignment(), N->getAAInfo());
+ return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(), N->getBasePtr(),
+ N->getPointerInfo(), N->getAlignment(),
+ N->getMemOperand()->getFlags(), N->getAAInfo());
}
if (N->getMemoryVT() == MVT::i16) {
if (N->getAlignment() < 2)
report_fatal_error("atomic store must be aligned");
return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
N->getBasePtr(), N->getPointerInfo(), MVT::i16,
- N->isVolatile(), N->isNonTemporal(),
- N->getAlignment(), N->getAAInfo());
+ N->getAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
}
if (N->getMemoryVT() == MVT::i8)
return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
N->getBasePtr(), N->getPointerInfo(), MVT::i8,
- N->isVolatile(), N->isNonTemporal(),
- N->getAlignment(), N->getAAInfo());
+ N->getAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
return SDValue();
}
@@ -1071,11 +1055,10 @@ XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers / memory locations.
-static SDValue
-LowerCallResult(SDValue Chain, SDValue InFlag,
- const SmallVectorImpl<CCValAssign> &RVLocs,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) {
+static SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<CCValAssign> &RVLocs,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
SmallVector<std::pair<int, unsigned>, 4> ResultMemLocs;
// Copy results out of physical registers.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
@@ -1118,15 +1101,12 @@ LowerCallResult(SDValue Chain, SDValue InFlag,
/// regs to (physical regs)/(stack frame), CALLSEQ_START and
/// CALLSEQ_END are emitted.
/// TODO: isTailCall, sret.
-SDValue
-XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue XCoreTargetLowering::LowerCCCCallTo(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -1256,15 +1236,10 @@ namespace {
}
/// XCore formal arguments implementation
-SDValue
-XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue XCoreTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv)
{
default:
@@ -1280,15 +1255,10 @@ XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
/// virtual registers and generate load operations for
/// arguments placed on the stack.
/// TODO: sret
-SDValue
-XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg>
- &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue XCoreTargetLowering::LowerCCCArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
@@ -1333,7 +1303,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
{
#ifndef NDEBUG
errs() << "LowerFormalArguments Unhandled argument type: "
- << RegVT.getSimpleVT().SimpleTy << "\n";
+ << RegVT.getEVTString() << "\n";
#endif
llvm_unreachable(nullptr);
}
@@ -1362,8 +1332,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
//from this parameter
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(MF, FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI));
}
const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
ArgData.push_back(ADP);
@@ -1395,8 +1364,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
// Move argument from virt reg -> stack
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(), false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
MemOps.push_back(Store);
}
} else {
@@ -1463,11 +1432,11 @@ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
}
SDValue
-XCoreTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+XCoreTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
XCoreFunctionInfo *XFI =
DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
@@ -1514,8 +1483,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
MemOpChains.push_back(DAG.getStore(
Chain, dl, OutVals[i], FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, 0));
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
// Transform all store nodes into one single node because
@@ -1551,11 +1519,11 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
//===----------------------------------------------------------------------===//
MachineBasicBlock *
-XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
- assert((MI->getOpcode() == XCore::SELECT_CC) &&
+ DebugLoc dl = MI.getDebugLoc();
+ assert((MI.getOpcode() == XCore::SELECT_CC) &&
"Unexpected instr type to insert");
// To "insert" a SELECT_CC instruction, we actually have to insert the diamond
@@ -1588,7 +1556,8 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB->addSuccessor(sinkMBB);
BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
- .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
// copy0MBB:
// %FalseValue = ...
@@ -1602,12 +1571,13 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
- BuildMI(*BB, BB->begin(), dl,
- TII.get(XCore::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ BuildMI(*BB, BB->begin(), dl, TII.get(XCore::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(copy0MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
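
The hunks above migrate XCore's load/store lowering from the old SelectionDAG overloads that took separate isVolatile/isNonTemporal/isInvariant booleans to the overloads that take an alignment plus the flags already carried by the node's MachineMemOperand. A minimal sketch of the new calling pattern, assuming an LLVM tree of roughly this vintage; the helper name is illustrative and not part of the patch:

// Sketch only: lower a naturally aligned i32 atomic load as a plain load,
// forwarding the memory-operand flags instead of individual booleans.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue lowerAlignedAtomicLoad(AtomicSDNode *N, SelectionDAG &DAG) {
  assert(N->getMemoryVT() == MVT::i32 && N->getAlignment() >= 4);
  return DAG.getLoad(MVT::i32, SDLoc(N), N->getChain(), N->getBasePtr(),
                     N->getPointerInfo(), N->getAlignment(),
                     N->getMemOperand()->getFlags(), N->getAAInfo(),
                     N->getRanges());
}
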
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index b6f09ff418b5..41813bbb8156 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -119,8 +119,8 @@ namespace llvm {
const char *getTargetNodeName(unsigned Opcode) const override;
MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const override;
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS) const override;
@@ -144,11 +144,10 @@ namespace llvm {
const XCoreSubtarget &Subtarget;
// Lower Operand helpers
- SDValue LowerCCCArguments(SDValue Chain,
- CallingConv::ID CallConv,
+ SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
@@ -156,13 +155,14 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
SelectionDAG &DAG) const;
- SDValue lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain,
- SDValue Base, int64_t Offset,
+ SDValue lowerLoadWordFromAlignedBasePlusOffset(const SDLoc &DL,
+ SDValue Chain, SDValue Base,
+ int64_t Offset,
SelectionDAG &DAG) const;
// Lower Operand specifics
@@ -206,29 +206,28 @@ namespace llvm {
unsigned Depth = 0) const override;
SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
LLVMContext &Context) const override;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return true;
+ }
};
}
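
The new shouldInsertFencesForAtomic hook replaces the old setInsertFencesForAtomic(true) flag that the asserts in LowerATOMIC_LOAD/STORE refer to, and AtomicOrdering is now a scoped enum, so orderings are compared explicitly instead of with "<= Monotonic". A hedged sketch of the pattern, with a hypothetical target class name and assuming the usual LLVM headers are in scope:

// Sketch only (hypothetical MyTargetLowering): request IR-level fences and
// accept only the orderings that can remain after AtomicExpand has run.
bool MyTargetLowering::shouldInsertFencesForAtomic(const Instruction *) const {
  return true; // the middle end emits the fences; lowering sees relaxed ops
}

static bool isRelaxedEnough(AtomicOrdering AO) {
  return AO == AtomicOrdering::Unordered || AO == AtomicOrdering::Monotonic;
}
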
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index e4129aee9479..e0b3e7153da9 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -60,17 +60,16 @@ static bool isZeroImm(const MachineOperand &op) {
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
-unsigned
-XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const{
- int Opcode = MI->getOpcode();
+unsigned XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ int Opcode = MI.getOpcode();
if (Opcode == XCore::LDWFI)
{
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2))))
- {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if ((MI.getOperand(1).isFI()) && // is a stack slot
+ (MI.getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI.getOperand(2)))) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
}
return 0;
@@ -81,18 +80,16 @@ XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) con
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
-unsigned
-XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- int Opcode = MI->getOpcode();
+unsigned XCoreInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ int Opcode = MI.getOpcode();
if (Opcode == XCore::STWFI)
{
- if ((MI->getOperand(1).isFI()) && // is a stack slot
- (MI->getOperand(2).isImm()) && // the imm is zero
- (isZeroImm(MI->getOperand(2))))
- {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
+ if ((MI.getOperand(1).isFI()) && // is a stack slot
+ (MI.getOperand(2).isImm()) && // the imm is zero
+ (isZeroImm(MI.getOperand(2)))) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
}
return 0;
@@ -190,24 +187,24 @@ static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
/// Note that RemoveBranch and InsertBranch must be implemented to support
/// cases where this method returns success.
///
-bool
-XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
MachineInstr *LastInst = I;
// If there is only one terminator instruction, process it.
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (IsBRU(LastInst->getOpcode())) {
TBB = LastInst->getOperand(0).getMBB();
return false;
@@ -230,8 +227,7 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineInstr *SecondLastInst = I;
// If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() &&
- isUnpredicatedTerminator(--I))
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
@@ -273,11 +269,11 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
return true;
}
-unsigned
-XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond,
- DebugLoc DL)const{
+unsigned XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 2 || Cond.size() == 0) &&
@@ -330,9 +326,9 @@ XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
}
void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
bool GRDest = XCore::GRRegsRegClass.contains(DestReg);
bool GRSrc = XCore::GRRegsRegClass.contains(SrcReg);
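
XCoreInstrInfo.cpp follows the tree-wide switch of TargetInstrInfo hooks from MachineInstr pointers to references (and DebugLoc parameters to const references). A short sketch of the resulting style, using a hypothetical target class and opcode name:

// Sketch only: the reference-based form of a stack-slot predicate.
unsigned MyInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (MI.getOpcode() != MyTarget::LOAD_FROM_FI) // hypothetical opcode
    return 0;
  if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
      MI.getOperand(2).getImm() == 0) {
    FrameIndex = MI.getOperand(1).getIndex();
    return MI.getOperand(0).getReg();
  }
  return 0;
}
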
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index b958c361f5a2..783bc6bab5d5 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -39,7 +39,7 @@ public:
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlot - If the specified machine instruction is a direct
@@ -47,23 +47,22 @@ public:
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
- unsigned isStoreToStackSlot(const MachineInstr *MI,
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index f0b720151b17..5cc51cd7a992 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -76,7 +76,7 @@ createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
static Instruction *
createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
- IRBuilder<true,NoFolder> Builder(Instr);
+ IRBuilder<NoFolder> Builder(Instr);
unsigned OpCode = CE->getOpcode();
switch (OpCode) {
case Instruction::GetElementPtr: {
@@ -179,7 +179,6 @@ static bool isZeroLengthArray(Type *Ty) {
bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
Module *M = GV->getParent();
- LLVMContext &Ctx = M->getContext();
if (!GV->isThreadLocal())
return false;
@@ -189,7 +188,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
return false;
// Create replacement global.
- ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
+ ArrayType *NewType = createLoweredType(GV->getValueType());
Constant *NewInitializer = nullptr;
if (GV->hasInitializer())
NewInitializer = createLoweredInitializer(NewType,
@@ -210,11 +209,8 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
Function *GetID = Intrinsic::getDeclaration(GV->getParent(),
Intrinsic::xcore_getid);
Value *ThreadID = Builder.CreateCall(GetID, {});
- SmallVector<Value *, 2> Indices;
- Indices.push_back(Constant::getNullValue(Type::getInt64Ty(Ctx)));
- Indices.push_back(ThreadID);
- Value *Addr =
- Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, Indices);
+ Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV,
+ {Builder.getInt64(0), ThreadID});
U->replaceUsesOfWith(GV, Addr);
}
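
The XCoreLowerThreadLocal changes track two unrelated API updates: IRBuilder now takes the constant folder as its only template argument (so IRBuilder<true, NoFolder> becomes IRBuilder<NoFolder>), and GEP indices can be passed inline, which lets the per-thread slot address be built without a scratch SmallVector. A sketch of the new shape, assuming headers of this LLVM vintage; the function name is illustrative:

// Sketch only: compute &NewGV[0][ThreadID] with the updated IRBuilder API.
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/NoFolder.h"
using namespace llvm;

static Value *perThreadSlot(GlobalVariable *NewGV, Value *ThreadID,
                            Instruction *InsertPt) {
  IRBuilder<NoFolder> Builder(InsertPt);
  return Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV,
                                   {Builder.getInt64(0), ThreadID});
}
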
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 40568d124de0..61fbf0dc3d2c 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -16,13 +16,10 @@ using namespace llvm;
#define DEBUG_TYPE "xcore-selectiondag-info"
-SDValue XCoreSelectionDAGInfo::
-EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
- SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
- bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const
-{
+SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
unsigned SizeBitWidth = Size.getValueType().getSizeInBits();
// Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
if (!AlwaysInline && (Align & 3) == 0 &&
@@ -42,7 +39,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__memcpy_4",
TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args), 0)
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 77b3527d77e3..7cd0d8216e91 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -7,30 +7,27 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the XCore subclass for TargetSelectionDAGInfo.
+// This file defines the XCore subclass for SelectionDAGTargetInfo.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
class XCoreTargetMachine;
-class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
+class XCoreSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
-
- SDValue
- EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
- bool AlwaysInline,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const override;
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
};
}
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 4a79dac0bed9..c3eab802f815 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -15,21 +15,29 @@
#include "XCoreTargetTransformInfo.h"
#include "XCore.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-/// XCoreTargetMachine ctor - Create an ILP32 architecture model
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::Static;
+ return *RM;
+}
+
+/// Create an ILP32 architecture model
///
XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(
T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
- TT, CPU, FS, Options, RM, CM, OL),
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<XCoreTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
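
Target machine constructors now receive Optional<Reloc::Model>, and each target maps "not specified" to its own default before handing the value to the LLVMTargetMachine base class; the patch picks Reloc::Static for XCore. A hedged restatement of the idiom, assuming the usual headers:

// Sketch only: resolve an unset relocation model to the target's default.
static Reloc::Model resolveRelocModel(Optional<Reloc::Model> RM) {
  return RM.hasValue() ? *RM : Reloc::Static; // static is the XCore default
}
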
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index a8addfc3e429..4bd25bc8776c 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -25,7 +25,7 @@ class XCoreTargetMachine : public LLVMTargetMachine {
public:
XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~XCoreTargetMachine() override;
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index aa16ecc148db..abe1deddc56e 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -122,7 +122,7 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
if (Kind.isMergeableConst8()) return MergeableConst8Section;
if (Kind.isMergeableConst16()) return MergeableConst16Section;
}
- Type *ObjType = GV->getType()->getPointerElementType();
+ Type *ObjType = GV->getValueType();
auto &DL = GV->getParent()->getDataLayout();
if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() ||
DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) {
@@ -145,8 +145,10 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
report_fatal_error("Target does not support TLS or Common sections");
}
-MCSection *XCoreTargetObjectFile::getSectionForConstant(
- const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+MCSection *XCoreTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+ SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const {
if (Kind.isMergeableConst4()) return MergeableConst4Section;
if (Kind.isMergeableConst8()) return MergeableConst8Section;
if (Kind.isMergeableConst16()) return MergeableConst16Section;
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 6701c661a73e..c129d757dea3 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -34,7 +34,8 @@ static const unsigned CodeModelLargeSize = 256;
const TargetMachine &TM) const override;
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
- const Constant *C) const override;
+ const Constant *C,
+ unsigned &Align) const override;
};
} // end namespace llvm
diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt
index e0b81907c7fb..4a55dd9c04b8 100644
--- a/lib/Transforms/Hello/CMakeLists.txt
+++ b/lib/Transforms/Hello/CMakeLists.txt
@@ -15,4 +15,6 @@ add_llvm_loadable_module( LLVMHello
DEPENDS
intrinsics_gen
+ PLUGIN_TOOL
+ opt
)
diff --git a/lib/Transforms/Hello/Makefile b/lib/Transforms/Hello/Makefile
deleted file mode 100644
index f1e31489d3c9..000000000000
--- a/lib/Transforms/Hello/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-##===- lib/Transforms/Hello/Makefile -----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMHello
-LOADABLE_MODULE = 1
-USEDLIBS =
-
-# If we don't need RTTI or EH, there's no reason to export anything
-# from the hello plugin.
-ifneq ($(REQUIRES_RTTI), 1)
-ifneq ($(REQUIRES_EH), 1)
-EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/Hello.exports
-endif
-endif
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 0e05129b5261..0716a3a9cbe9 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
@@ -68,6 +69,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
CallGraphSCCPass::getAnalysisUsage(AU);
}
@@ -78,19 +80,8 @@ namespace {
initializeArgPromotionPass(*PassRegistry::getPassRegistry());
}
- /// A vector used to hold the indices of a single GEP instruction
- typedef std::vector<uint64_t> IndicesVector;
-
private:
- bool isDenselyPacked(Type *type, const DataLayout &DL);
- bool canPaddingBeAccessed(Argument *Arg);
- CallGraphNode *PromoteArguments(CallGraphNode *CGN);
- bool isSafeToPromoteArgument(Argument *Arg, bool isByVal,
- AAResults &AAR) const;
- CallGraphNode *DoPromotion(Function *F,
- SmallPtrSetImpl<Argument*> &ArgsToPromote,
- SmallPtrSetImpl<Argument*> &ByValArgsToTransform);
-
+
using llvm::Pass::doInitialization;
bool doInitialization(CallGraph &CG) override;
/// The maximum number of elements to expand, or 0 for unlimited.
@@ -98,6 +89,21 @@ namespace {
};
}
+/// A vector used to hold the indices of a single GEP instruction
+typedef std::vector<uint64_t> IndicesVector;
+
+static CallGraphNode *
+PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
+ function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements);
+static bool isDenselyPacked(Type *type, const DataLayout &DL);
+static bool canPaddingBeAccessed(Argument *Arg);
+static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR,
+ unsigned MaxElements);
+static CallGraphNode *
+DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG);
+
char ArgPromotion::ID = 0;
INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
"Promote 'by reference' arguments to scalars", false, false)
@@ -111,16 +117,19 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
return new ArgPromotion(maxElements);
}
-bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+static bool runImpl(CallGraphSCC &SCC, CallGraph &CG,
+ function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements) {
bool Changed = false, LocalChange;
do { // Iterate until we stop promoting from this SCC.
LocalChange = false;
// Attempt to promote arguments from all functions in this SCC.
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
- if (CallGraphNode *CGN = PromoteArguments(*I)) {
+ for (CallGraphNode *OldNode : SCC) {
+ if (CallGraphNode *NewNode =
+ PromoteArguments(OldNode, CG, AARGetter, MaxElements)) {
LocalChange = true;
- SCC.ReplaceNode(*I, CGN);
+ SCC.ReplaceNode(OldNode, NewNode);
}
}
Changed |= LocalChange; // Remember that we changed something.
@@ -129,8 +138,30 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
return Changed;
}
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ // We compute dedicated AA results for each function in the SCC as needed. We
+ // use a lambda referencing external objects so that they live long enough to
+ // be queried, but we re-use them each time.
+ Optional<BasicAAResult> BAR;
+ Optional<AAResults> AAR;
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ BAR.emplace(createLegacyPMBasicAAResult(*this, F));
+ AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
+ return *AAR;
+ };
+
+ return runImpl(SCC, CG, AARGetter, maxElements);
+}
+
/// \brief Checks if a type could have padding bytes.
-bool ArgPromotion::isDenselyPacked(Type *type, const DataLayout &DL) {
+static bool isDenselyPacked(Type *type, const DataLayout &DL) {
// There is no size information, so be conservative.
if (!type->isSized())
@@ -166,7 +197,7 @@ bool ArgPromotion::isDenselyPacked(Type *type, const DataLayout &DL) {
}
/// \brief Checks if the padding bytes of an argument could be accessed.
-bool ArgPromotion::canPaddingBeAccessed(Argument *arg) {
+static bool canPaddingBeAccessed(Argument *arg) {
assert(arg->hasByValAttr());
@@ -207,7 +238,10 @@ bool ArgPromotion::canPaddingBeAccessed(Argument *arg) {
/// example, all callers are direct). If safe to promote some arguments, it
/// calls the DoPromotion method.
///
-CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
+static CallGraphNode *
+PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
+ function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements) {
Function *F = CGN->getFunction();
// Make sure that it is local to this module.
@@ -242,20 +276,13 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
const DataLayout &DL = F->getParent()->getDataLayout();
- // We need to manually construct BasicAA directly in order to disable its use
- // of other function analyses.
- BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F));
-
- // Construct our own AA results for this function. We do this manually to
- // work around the limitations of the legacy pass manager.
- AAResults AAR(createLegacyPMAAResults(*this, *F, BAR));
+ AAResults &AAR = AARGetter(*F);
// Check to see which arguments are promotable. If an argument is promotable,
// add it to ArgsToPromote.
SmallPtrSet<Argument*, 8> ArgsToPromote;
SmallPtrSet<Argument*, 8> ByValArgsToTransform;
- for (unsigned i = 0, e = PointerArgs.size(); i != e; ++i) {
- Argument *PtrArg = PointerArgs[i];
+ for (Argument *PtrArg : PointerArgs) {
Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
// Replace sret attribute with noalias. This reduces register pressure by
@@ -285,10 +312,10 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
(isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
if (isSafeToPromote) {
if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- if (maxElements > 0 && STy->getNumElements() > maxElements) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
DEBUG(dbgs() << "argpromotion disable promoting argument '"
<< PtrArg->getName() << "' because it would require adding more"
- << " than " << maxElements << " arguments to the function.\n");
+ << " than " << MaxElements << " arguments to the function.\n");
continue;
}
@@ -302,7 +329,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
}
// Safe to transform, don't even bother trying to "promote" it.
- // Passing the elements as a scalar will allow scalarrepl to hack on
+ // Passing the elements as a scalar will allow sroa to hack on
// the new alloca we introduce.
if (AllSimple) {
ByValArgsToTransform.insert(PtrArg);
@@ -328,7 +355,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
}
// Otherwise, see if we can promote the pointer to its value.
- if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR))
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
+ MaxElements))
ArgsToPromote.insert(PtrArg);
}
@@ -336,7 +364,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
return nullptr;
- return DoPromotion(F, ArgsToPromote, ByValArgsToTransform);
+ return DoPromotion(F, ArgsToPromote, ByValArgsToTransform, CG);
}
/// AllCallersPassInValidPointerForArgument - Return true if we can prove that
@@ -364,8 +392,7 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
/// elements in Prefix is the same as the corresponding elements in Longer.
///
/// This means it also returns true when Prefix and Longer are equal!
-static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
- const ArgPromotion::IndicesVector &Longer) {
+static bool IsPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
if (Prefix.size() > Longer.size())
return false;
return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
@@ -373,9 +400,9 @@ static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool PrefixIn(const ArgPromotion::IndicesVector &Indices,
- std::set<ArgPromotion::IndicesVector> &Set) {
- std::set<ArgPromotion::IndicesVector>::iterator Low;
+static bool PrefixIn(const IndicesVector &Indices,
+ std::set<IndicesVector> &Set) {
+ std::set<IndicesVector>::iterator Low;
Low = Set.upper_bound(Indices);
if (Low != Set.begin())
Low--;
@@ -392,9 +419,9 @@ static bool PrefixIn(const ArgPromotion::IndicesVector &Indices,
/// is already a prefix of Indices in Safe, Indices are implicitly marked safe
/// already. Furthermore, any indices that Indices is itself a prefix of, are
/// removed from Safe (since they are implicitly safe because of Indices now).
-static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
- std::set<ArgPromotion::IndicesVector> &Safe) {
- std::set<ArgPromotion::IndicesVector>::iterator Low;
+static void MarkIndicesSafe(const IndicesVector &ToMark,
+ std::set<IndicesVector> &Safe) {
+ std::set<IndicesVector>::iterator Low;
Low = Safe.upper_bound(ToMark);
// Guard against the case where Safe is empty
if (Low != Safe.begin())
@@ -415,9 +442,9 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
Low = Safe.insert(Low, ToMark);
++Low;
// If we were a prefix of longer index list(s), remove those
- std::set<ArgPromotion::IndicesVector>::iterator End = Safe.end();
+ std::set<IndicesVector>::iterator End = Safe.end();
while (Low != End && IsPrefix(ToMark, *Low)) {
- std::set<ArgPromotion::IndicesVector>::iterator Remove = Low;
+ std::set<IndicesVector>::iterator Remove = Low;
++Low;
Safe.erase(Remove);
}
@@ -428,9 +455,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
/// This method limits promotion of aggregates to only promote up to three
/// elements of the aggregate in order to avoid exploding the number of
/// arguments passed in.
-bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
- bool isByValOrInAlloca,
- AAResults &AAR) const {
+static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
+ AAResults &AAR, unsigned MaxElements) {
typedef std::set<IndicesVector> GEPIndicesSet;
// Quick exit for unused arguments
@@ -518,7 +544,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// TODO: This runs the above loop over and over again for dead GEPs
// Couldn't we just increment the UI iterator earlier and erase the
// use?
- return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR);
+ return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR,
+ MaxElements);
}
// Ensure that all of the indices are constants.
@@ -552,10 +579,10 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// to make sure that we aren't promoting too many elements. If so, nothing
// to do.
if (ToPromote.find(Operands) == ToPromote.end()) {
- if (maxElements > 0 && ToPromote.size() == maxElements) {
+ if (MaxElements > 0 && ToPromote.size() == MaxElements) {
DEBUG(dbgs() << "argpromotion not promoting argument '"
<< Arg->getName() << "' because it would require adding more "
- << "than " << maxElements << " arguments to the function.\n");
+ << "than " << MaxElements << " arguments to the function.\n");
// We limit aggregate promotion to only promoting up to a fixed number
// of elements of the aggregate.
return false;
@@ -575,10 +602,9 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// blocks we know to be transparent to the load.
SmallPtrSet<BasicBlock*, 16> TranspBlocks;
- for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
+ for (LoadInst *Load : Loads) {
// Check to see if the load is invalidated from the start of the block to
// the load itself.
- LoadInst *Load = Loads[i];
BasicBlock *BB = Load->getParent();
MemoryLocation Loc = MemoryLocation::get(Load);
@@ -604,9 +630,9 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
/// DoPromotion - This method actually performs the promotion of the specified
/// arguments, and returns the new function. At this point, we know that it's
/// safe to do so.
-CallGraphNode *ArgPromotion::DoPromotion(Function *F,
- SmallPtrSetImpl<Argument*> &ArgsToPromote,
- SmallPtrSetImpl<Argument*> &ByValArgsToTransform) {
+static CallGraphNode *
+DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) {
// Start by computing a new prototype for the function, which is the same as
// the old function, but has modified arguments.
@@ -700,12 +726,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
}
// Add a parameter to the function for each element passed in.
- for (ScalarizeTable::iterator SI = ArgIndices.begin(),
- E = ArgIndices.end(); SI != E; ++SI) {
+ for (const auto &ArgIndex : ArgIndices) {
// not allowed to dereference ->begin() if size() is 0
Params.push_back(GetElementPtrInst::getIndexedType(
cast<PointerType>(I->getType()->getScalarType())->getElementType(),
- SI->second));
+ ArgIndex.second));
assert(Params.back());
}
@@ -745,10 +770,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
F->getParent()->getFunctionList().insert(F->getIterator(), NF);
NF->takeName(F);
- // Get the callgraph information that we need to update to reflect our
- // changes.
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-
// Get a new callgraph node for NF.
CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
@@ -800,27 +821,25 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Store the Value* version of the indices in here, but declare it now
// for reuse.
std::vector<Value*> Ops;
- for (ScalarizeTable::iterator SI = ArgIndices.begin(),
- E = ArgIndices.end(); SI != E; ++SI) {
+ for (const auto &ArgIndex : ArgIndices) {
Value *V = *AI;
- LoadInst *OrigLoad = OriginalLoads[std::make_pair(&*I, SI->second)];
- if (!SI->second.empty()) {
- Ops.reserve(SI->second.size());
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
Type *ElTy = V->getType();
- for (IndicesVector::const_iterator II = SI->second.begin(),
- IE = SI->second.end();
- II != IE; ++II) {
+ for (unsigned long II : ArgIndex.second) {
// Use i32 to index structs, and i64 for others (pointers/arrays).
// This satisfies GEP constraints.
Type *IdxTy = (ElTy->isStructTy() ?
Type::getInt32Ty(F->getContext()) :
Type::getInt64Ty(F->getContext()));
- Ops.push_back(ConstantInt::get(IdxTy, *II));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
// Keep track of the type we're currently indexing.
- ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
}
// And create a GEP to extract those indices.
- V = GetElementPtrInst::Create(SI->first, V, Ops,
+ V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
V->getName() + ".idx", Call);
Ops.clear();
}
@@ -852,15 +871,18 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
AttributesVec.push_back(AttributeSet::get(Call->getContext(),
CallPAL.getFnAttributes()));
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, "", Call);
+ Args, OpBundles, "", Call);
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
AttributesVec));
} else {
- New = CallInst::Create(NF, Args, "", Call);
+ New = CallInst::Create(NF, Args, OpBundles, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
AttributesVec));
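
The structural move in ArgumentPromotion is that the former member functions become free functions parameterised on a CallGraph and an AARGetter callback, so the same runImpl can later be reused outside the legacy pass manager. The callback built in runOnSCC re-creates BasicAA and the aggregated AAResults per function; a sketch of that pattern as it appears above, valid only inside a legacy pass member function where *this is the Pass:

// Sketch only: lazily build per-function alias analysis results inside a
// legacy CallGraphSCCPass. Each call replaces the previous results, so the
// returned reference must not be cached across functions.
Optional<BasicAAResult> BAR;
Optional<AAResults> AAR;
auto AARGetter = [&](Function &F) -> AAResults & {
  BAR.emplace(createLegacyPMBasicAAResult(*this, F));
  AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
  return *AAR;
};
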
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 351b88fe2aa0..d6782c738cbe 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -19,7 +19,7 @@ add_llvm_library(LLVMipo
Inliner.cpp
Internalize.cpp
LoopExtractor.cpp
- LowerBitSets.cpp
+ LowerTypeTests.cpp
MergeFunctions.cpp
PartialInlining.cpp
PassManagerBuilder.cpp
@@ -27,6 +27,7 @@ add_llvm_library(LLVMipo
SampleProfile.cpp
StripDeadPrototypes.cpp
StripSymbols.cpp
+ WholeProgramDevirt.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index 0aa49d6fde01..d75ed206ad23 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -17,7 +17,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ConstantMerge.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -28,41 +28,13 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
#define DEBUG_TYPE "constmerge"
STATISTIC(NumMerged, "Number of global constants merged");
-namespace {
- struct ConstantMerge : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- ConstantMerge() : ModulePass(ID) {
- initializeConstantMergePass(*PassRegistry::getPassRegistry());
- }
-
- // For this pass, process all of the globals in the module, eliminating
- // duplicate constants.
- bool runOnModule(Module &M) override;
-
- // Return true iff we can determine the alignment of this global variable.
- bool hasKnownAlignment(GlobalVariable *GV) const;
-
- // Return the alignment of the global, including converting the default
- // alignment to a concrete value.
- unsigned getAlignment(GlobalVariable *GV) const;
-
- };
-}
-
-char ConstantMerge::ID = 0;
-INITIALIZE_PASS(ConstantMerge, "constmerge",
- "Merge Duplicate Global Constants", false, false)
-
-ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
-
-
-
/// Find values that are marked as llvm.used.
static void FindUsedValues(GlobalVariable *LLVMUsed,
SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
@@ -85,18 +57,17 @@ static bool IsBetterCanonical(const GlobalVariable &A,
if (A.hasLocalLinkage() && !B.hasLocalLinkage())
return false;
- return A.hasUnnamedAddr();
+ return A.hasGlobalUnnamedAddr();
}
-unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+static unsigned getAlignment(GlobalVariable *GV) {
unsigned Align = GV->getAlignment();
if (Align)
return Align;
return GV->getParent()->getDataLayout().getPreferredAlignment(GV);
}
-bool ConstantMerge::runOnModule(Module &M) {
-
+static bool mergeConstants(Module &M) {
// Find all the globals that are marked "used". These cannot be merged.
SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
@@ -181,11 +152,11 @@ bool ConstantMerge::runOnModule(Module &M) {
if (!Slot || Slot == GV)
continue;
- if (!Slot->hasUnnamedAddr() && !GV->hasUnnamedAddr())
+ if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr())
continue;
- if (!GV->hasUnnamedAddr())
- Slot->setUnnamedAddr(false);
+ if (!GV->hasGlobalUnnamedAddr())
+ Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
// Make all uses of the duplicate constant use the canonical version.
Replacements.push_back(std::make_pair(GV, Slot));
@@ -220,3 +191,34 @@ bool ConstantMerge::runOnModule(Module &M) {
Replacements.clear();
}
}
+
+PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
+ if (!mergeConstants(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct ConstantMergeLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantMergeLegacyPass() : ModulePass(ID) {
+ initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // For this pass, process all of the globals in the module, eliminating
+ // duplicate constants.
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ return mergeConstants(M);
+ }
+};
+}
+
+char ConstantMergeLegacyPass::ID = 0;
+INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
+ "Merge Duplicate Global Constants", false, false)
+
+ModulePass *llvm::createConstantMergePass() {
+ return new ConstantMergeLegacyPass();
+}
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index 5bbb7513005c..58731eaf6e30 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Statistic.h"
@@ -30,13 +30,14 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
#define DEBUG_TYPE "cross-dso-cfi"
-STATISTIC(TypeIds, "Number of unique type identifiers");
+STATISTIC(NumTypeIds, "Number of unique type identifiers");
namespace {
@@ -46,13 +47,10 @@ struct CrossDSOCFI : public ModulePass {
initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
}
- Module *M;
MDNode *VeryLikelyWeights;
- ConstantInt *extractBitSetTypeId(MDNode *MD);
- void buildCFICheck();
-
- bool doInitialization(Module &M) override;
+ ConstantInt *extractNumericTypeId(MDNode *MD);
+ void buildCFICheck(Module &M);
bool runOnModule(Module &M) override;
};
@@ -65,18 +63,10 @@ char CrossDSOCFI::ID = 0;
ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
-bool CrossDSOCFI::doInitialization(Module &Mod) {
- M = &Mod;
- VeryLikelyWeights =
- MDBuilder(M->getContext()).createBranchWeights((1U << 20) - 1, 1);
-
- return false;
-}
-
-/// extractBitSetTypeId - Extracts TypeId from a hash-based bitset MDNode.
-ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) {
+/// Extracts a numeric type identifier from an MDNode containing type metadata.
+ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
// This check excludes vtables for classes inside anonymous namespaces.
- auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+ auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
if (!TM)
return nullptr;
auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
@@ -84,68 +74,63 @@ ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) {
// We are looking for i64 constants.
if (C->getBitWidth() != 64) return nullptr;
- // Sanity check.
- auto FM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(1));
- // Can be null if a function was removed by an optimization.
- if (FM) {
- auto F = dyn_cast<Function>(FM->getValue());
- // But can never be a function declaration.
- assert(!F || !F->isDeclaration());
- (void)F; // Suppress unused variable warning in the no-asserts build.
- }
return C;
}
/// buildCFICheck - emits __cfi_check for the current module.
-void CrossDSOCFI::buildCFICheck() {
+void CrossDSOCFI::buildCFICheck(Module &M) {
// FIXME: verify that __cfi_check ends up near the end of the code section,
- // but before the jump slots created in LowerBitSets.
- llvm::DenseSet<uint64_t> BitSetIds;
- NamedMDNode *BitSetNM = M->getNamedMetadata("llvm.bitsets");
-
- if (BitSetNM)
- for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I)
- if (ConstantInt *TypeId = extractBitSetTypeId(BitSetNM->getOperand(I)))
- BitSetIds.insert(TypeId->getZExtValue());
-
- LLVMContext &Ctx = M->getContext();
- Constant *C = M->getOrInsertFunction(
- "__cfi_check",
- FunctionType::get(
- Type::getVoidTy(Ctx),
- {Type::getInt64Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))},
- false));
+ // but before the jump slots created in LowerTypeTests.
+ llvm::DenseSet<uint64_t> TypeIds;
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalObject &GO : M.global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ // Sanity check. GO must not be a function declaration.
+ assert(!isa<Function>(&GO) || !cast<Function>(&GO)->isDeclaration());
+
+ if (ConstantInt *TypeId = extractNumericTypeId(Type))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
+ LLVMContext &Ctx = M.getContext();
+ Constant *C = M.getOrInsertFunction(
+ "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr);
Function *F = dyn_cast<Function>(C);
F->setAlignment(4096);
auto args = F->arg_begin();
- Argument &CallSiteTypeId = *(args++);
+ Value &CallSiteTypeId = *(args++);
CallSiteTypeId.setName("CallSiteTypeId");
- Argument &Addr = *(args++);
+ Value &Addr = *(args++);
Addr.setName("Addr");
+ Value &CFICheckFailData = *(args++);
+ CFICheckFailData.setName("CFICheckFailData");
assert(args == F->arg_end());
BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
- BasicBlock *TrapBB = BasicBlock::Create(Ctx, "trap", F);
- IRBuilder<> IRBTrap(TrapBB);
- Function *TrapFn = Intrinsic::getDeclaration(M, Intrinsic::trap);
- llvm::CallInst *TrapCall = IRBTrap.CreateCall(TrapFn);
- TrapCall->setDoesNotReturn();
- TrapCall->setDoesNotThrow();
- IRBTrap.CreateUnreachable();
+ BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
+ IRBuilder<> IRBFail(TrapBB);
+ Constant *CFICheckFailFn = M.getOrInsertFunction(
+ "__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx),
+ Type::getInt8PtrTy(Ctx), nullptr);
+ IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
+ IRBFail.CreateBr(ExitBB);
- BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
IRBuilder<> IRBExit(ExitBB);
IRBExit.CreateRetVoid();
IRBuilder<> IRB(BB);
- SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, BitSetIds.size());
- for (uint64_t TypeId : BitSetIds) {
+ SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
+ for (uint64_t TypeId : TypeIds) {
ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
IRBuilder<> IRBTest(TestBB);
- Function *BitsetTestFn =
- Intrinsic::getDeclaration(M, Intrinsic::bitset_test);
+ Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
Value *Test = IRBTest.CreateCall(
BitsetTestFn, {&Addr, MetadataAsValue::get(
@@ -154,13 +139,26 @@ void CrossDSOCFI::buildCFICheck() {
BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
SI->addCase(CaseTypeId, TestBB);
- ++TypeIds;
+ ++NumTypeIds;
}
}
bool CrossDSOCFI::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ VeryLikelyWeights =
+ MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
return false;
- buildCFICheck();
+ buildCFICheck(M);
return true;
}
+
+PreservedAnalyses CrossDSOCFIPass::run(Module &M, AnalysisManager<Module> &AM) {
+ CrossDSOCFI Impl;
+ bool Changed = Impl.runOnModule(M);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
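
CrossDSOCFI now discovers type ids by walking the !type metadata attached to global objects instead of the retired llvm.bitsets named metadata; operand 1 of each !type node carries the 64-bit id used by the cross-DSO scheme. A sketch of that collection loop, mirroring the rewritten buildCFICheck above; the helper name is illustrative:

// Sketch only: gather the numeric ids attached as !type metadata.
static void collectCrossDsoTypeIds(Module &M, DenseSet<uint64_t> &TypeIds) {
  SmallVector<MDNode *, 2> Types;
  for (GlobalObject &GO : M.global_objects()) {
    Types.clear();
    GO.getMetadata(LLVMContext::MD_type, Types);
    for (MDNode *Type : Types) {
      auto *VAM = dyn_cast<ValueAsMetadata>(Type->getOperand(1));
      if (!VAM)
        continue;
      if (auto *Id = dyn_cast_or_null<ConstantInt>(VAM->getValue()))
        if (Id->getBitWidth() == 64)
          TypeIds.insert(Id->getZExtValue());
    }
  }
}
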
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4de3d95ab11d..c8c895b18796 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -17,8 +17,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -35,8 +34,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <map>
#include <set>
#include <tuple>
using namespace llvm;
@@ -51,77 +50,6 @@ namespace {
/// DAE - The dead argument elimination pass.
///
class DAE : public ModulePass {
- public:
-
- /// Struct that represents (part of) either a return value or a function
- /// argument. Used so that arguments and return values can be used
- /// interchangeably.
- struct RetOrArg {
- RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx),
- IsArg(IsArg) {}
- const Function *F;
- unsigned Idx;
- bool IsArg;
-
- /// Make RetOrArg comparable, so we can put it into a map.
- bool operator<(const RetOrArg &O) const {
- return std::tie(F, Idx, IsArg) < std::tie(O.F, O.Idx, O.IsArg);
- }
-
- /// Make RetOrArg comparable, so we can easily iterate the multimap.
- bool operator==(const RetOrArg &O) const {
- return F == O.F && Idx == O.Idx && IsArg == O.IsArg;
- }
-
- std::string getDescription() const {
- return (Twine(IsArg ? "Argument #" : "Return value #") + utostr(Idx) +
- " of function " + F->getName()).str();
- }
- };
-
- /// Liveness enum - During our initial pass over the program, we determine
- /// that things are either alive or maybe alive. We don't mark anything
- /// explicitly dead (even if we know they are), since anything not alive
- /// with no registered uses (in Uses) will never be marked alive and will
- /// thus become dead in the end.
- enum Liveness { Live, MaybeLive };
-
- /// Convenience wrapper
- RetOrArg CreateRet(const Function *F, unsigned Idx) {
- return RetOrArg(F, Idx, false);
- }
- /// Convenience wrapper
- RetOrArg CreateArg(const Function *F, unsigned Idx) {
- return RetOrArg(F, Idx, true);
- }
-
- typedef std::multimap<RetOrArg, RetOrArg> UseMap;
- /// This maps a return value or argument to any MaybeLive return values or
- /// arguments it uses. This allows the MaybeLive values to be marked live
- /// when any of its users is marked live.
- /// For example (indices are left out for clarity):
- /// - Uses[ret F] = ret G
- /// This means that F calls G, and F returns the value returned by G.
- /// - Uses[arg F] = ret G
- /// This means that some function calls G and passes its result as an
- /// argument to F.
- /// - Uses[ret F] = arg F
- /// This means that F returns one of its own arguments.
- /// - Uses[arg F] = arg G
- /// This means that G calls F and passes one of its own (G's) arguments
- /// directly to F.
- UseMap Uses;
-
- typedef std::set<RetOrArg> LiveSet;
- typedef std::set<const Function*> LiveFuncSet;
-
- /// This set contains all values that have been determined to be live.
- LiveSet LiveValues;
- /// This set contains all values that are cannot be changed in any way.
- LiveFuncSet LiveFunctions;
-
- typedef SmallVector<RetOrArg, 5> UseVector;
-
protected:
// DAH uses this to specify a different ID.
explicit DAE(char &ID) : ModulePass(ID) {}
@@ -132,25 +60,16 @@ namespace {
initializeDAEPass(*PassRegistry::getPassRegistry());
}
- bool runOnModule(Module &M) override;
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ DeadArgumentEliminationPass DAEP(ShouldHackArguments());
+ ModuleAnalysisManager DummyMAM;
+ PreservedAnalyses PA = DAEP.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
virtual bool ShouldHackArguments() const { return false; }
-
- private:
- Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
- Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses,
- unsigned RetValNum = -1U);
- Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
-
- void SurveyFunction(const Function &F);
- void MarkValue(const RetOrArg &RA, Liveness L,
- const UseVector &MaybeLiveUses);
- void MarkLive(const RetOrArg &RA);
- void MarkLive(const Function &F);
- void PropagateLiveness(const RetOrArg &RA);
- bool RemoveDeadStuffFromFunction(Function *F);
- bool DeleteDeadVarargs(Function &Fn);
- bool RemoveDeadArgumentsFromCallers(Function &Fn);
};
}
@@ -183,7 +102,7 @@ ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
/// llvm.vastart is never called, the varargs list is dead for the function.
-bool DAE::DeleteDeadVarargs(Function &Fn) {
+bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
@@ -200,9 +119,9 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
// Okay, we know we can transform this function if safe. Scan its body
// looking for calls marked musttail or calls to llvm.vastart.
- for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- CallInst *CI = dyn_cast<CallInst>(I);
+ for (BasicBlock &BB : Fn) {
+ for (Instruction &I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
continue;
if (CI->isMustTailCall())
@@ -229,6 +148,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
// Create the new function body and insert it into the module...
Function *NF = Function::Create(NFTy, Fn.getLinkage());
NF->copyAttributesFrom(&Fn);
+ NF->setComdat(Fn.getComdat());
Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
NF->takeName(&Fn);
@@ -257,14 +177,17 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
}
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, "", Call);
+ Args, OpBundles, "", Call);
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
cast<InvokeInst>(New)->setAttributes(PAL);
} else {
- New = CallInst::Create(NF, Args, "", Call);
+ New = CallInst::Create(NF, Args, OpBundles, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(PAL);
if (cast<CallInst>(Call)->isTailCall())
@@ -316,8 +239,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
/// arguments that are unused, and changes the caller parameters to be undefined
/// instead.
-bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
-{
+bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
// We cannot change the arguments if this TU does not define the function or
// if the linker may choose a function body from another TU, even if the
// nominal linkage indicates that other copies of the function have the same
@@ -329,7 +251,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
// %v = load i32 %p
// ret void
// }
- if (!Fn.isStrongDefinitionForLinker())
+ if (!Fn.hasExactDefinition())
return false;
// Functions with local linkage should already have been handled, except the
@@ -409,7 +331,9 @@ static Type *getRetComponentType(const Function *F, unsigned Idx) {
/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
/// liveness of Use.
-DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
+ UseVector &MaybeLiveUses) {
// We're live if our use or its Function is already marked as live.
if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
return Live;
@@ -428,8 +352,9 @@ DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
/// RetValNum is the return value number to use when this use is used in a
/// return instruction. This is used in the recursion, you should always leave
/// it at 0.
-DAE::Liveness DAE::SurveyUse(const Use *U,
- UseVector &MaybeLiveUses, unsigned RetValNum) {
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
+ unsigned RetValNum) {
const User *V = U->getUser();
if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
// The value is returned from a function. It's only live when the
@@ -442,13 +367,14 @@ DAE::Liveness DAE::SurveyUse(const Use *U,
// We might be live, depending on the liveness of Use.
return MarkIfNotLive(Use, MaybeLiveUses);
} else {
- DAE::Liveness Result = MaybeLive;
+ DeadArgumentEliminationPass::Liveness Result = MaybeLive;
for (unsigned i = 0; i < NumRetVals(F); ++i) {
RetOrArg Use = CreateRet(F, i);
// We might be live, depending on the liveness of Use. If any
// sub-value is live, then the entire value is considered live. This
// is a conservative choice, and better tracking is possible.
- DAE::Liveness SubResult = MarkIfNotLive(Use, MaybeLiveUses);
+ DeadArgumentEliminationPass::Liveness SubResult =
+ MarkIfNotLive(Use, MaybeLiveUses);
if (Result != Live)
Result = SubResult;
}
@@ -514,7 +440,9 @@ DAE::Liveness DAE::SurveyUse(const Use *U,
/// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If
/// the result is Live, MaybeLiveUses might be modified but its content should
/// be ignored (since it might not be complete).
-DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) {
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUses(const Value *V,
+ UseVector &MaybeLiveUses) {
// Assume it's dead (which will only hold if there are no uses at all..).
Liveness Result = MaybeLive;
// Check each use.
@@ -534,7 +462,7 @@ DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) {
// We consider arguments of non-internal functions to be intrinsically alive as
// well as arguments to functions which have their "address taken".
//
-void DAE::SurveyFunction(const Function &F) {
+void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
// Functions with inalloca parameters are expecting args in a particular
// register and memory layout.
if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) {
@@ -570,12 +498,13 @@ void DAE::SurveyFunction(const Function &F) {
return;
}
- if (!F.hasLocalLinkage() && (!ShouldHackArguments() || F.isIntrinsic())) {
+ if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
MarkLive(F);
return;
}
- DEBUG(dbgs() << "DAE - Inspecting callers for fn: " << F.getName() << "\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
// Keep track of the number of live retvals, so we can skip checks once all
// of them turn out to be live.
unsigned NumLiveRetVals = 0;
@@ -637,7 +566,8 @@ void DAE::SurveyFunction(const Function &F) {
for (unsigned i = 0; i != RetCount; ++i)
MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
- DEBUG(dbgs() << "DAE - Inspecting args for fn: " << F.getName() << "\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
// Now, check all of our arguments.
unsigned i = 0;
@@ -669,17 +599,16 @@ void DAE::SurveyFunction(const Function &F) {
/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
/// such that RA will be marked live if any use in MaybeLiveUses gets marked
/// live later on.
-void DAE::MarkValue(const RetOrArg &RA, Liveness L,
- const UseVector &MaybeLiveUses) {
+void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses) {
switch (L) {
case Live: MarkLive(RA); break;
case MaybeLive:
{
// Note any uses of this value, so this return value can be
// marked live whenever one of the uses becomes live.
- for (UseVector::const_iterator UI = MaybeLiveUses.begin(),
- UE = MaybeLiveUses.end(); UI != UE; ++UI)
- Uses.insert(std::make_pair(*UI, RA));
+ for (const auto &MaybeLiveUse : MaybeLiveUses)
+ Uses.insert(std::make_pair(MaybeLiveUse, RA));
break;
}
}
@@ -689,8 +618,9 @@ void DAE::MarkValue(const RetOrArg &RA, Liveness L,
/// changed in any way. Additionally,
/// mark any values that are used as this function's parameters or by its return
/// values (according to Uses) live as well.
-void DAE::MarkLive(const Function &F) {
- DEBUG(dbgs() << "DAE - Intrinsically live fn: " << F.getName() << "\n");
+void DeadArgumentEliminationPass::MarkLive(const Function &F) {
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
// Mark the function as live.
LiveFunctions.insert(&F);
// Mark all arguments as live.
@@ -704,20 +634,21 @@ void DAE::MarkLive(const Function &F) {
/// MarkLive - Mark the given return value or argument as live. Additionally,
/// mark any values that are used by this value (according to Uses) live as
/// well.
-void DAE::MarkLive(const RetOrArg &RA) {
+void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
if (LiveFunctions.count(RA.F))
return; // Function was already marked Live.
if (!LiveValues.insert(RA).second)
return; // We were already marked Live.
- DEBUG(dbgs() << "DAE - Marking " << RA.getDescription() << " live\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
PropagateLiveness(RA);
}
/// PropagateLiveness - Given that RA is a live value, propagate its liveness
/// to any other values it uses (according to Uses).
-void DAE::PropagateLiveness(const RetOrArg &RA) {
+void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
// We don't use upper_bound (or equal_range) here, because our recursive call
// to ourselves is likely to cause the upper_bound (which is the first value
// not belonging to RA) to become erased and the iterator invalidated.
@@ -736,7 +667,7 @@ void DAE::PropagateLiveness(const RetOrArg &RA) {
// that are not in LiveValues. Transform the function and all of the callees of
// the function to not have these arguments and return values.
//
-bool DAE::RemoveDeadStuffFromFunction(Function *F) {
+bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Don't modify fully live functions
if (LiveFunctions.count(F))
return false;
@@ -777,8 +708,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
}
} else {
++NumArgumentsEliminated;
- DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
- << ") from " << F->getName() << "\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
+ << " (" << I->getName() << ") from " << F->getName()
+ << "\n");
}
}
@@ -821,8 +753,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
NewRetIdxs[i] = RetTypes.size() - 1;
} else {
++NumRetValsEliminated;
- DEBUG(dbgs() << "DAE - Removing return value " << i << " from "
- << F->getName() << "\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << i << " from " << F->getName() << "\n");
}
}
if (RetTypes.size() > 1) {
@@ -882,6 +814,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Create the new function body and insert it into the module...
Function *NF = Function::Create(NFTy, F->getLinkage());
NF->copyAttributesFrom(F);
+ NF->setComdat(F->getComdat());
NF->setAttributes(NewPAL);
// Insert the new function before the old function, so we won't be processing
// it again.
@@ -950,14 +883,17 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Reconstruct the AttributesList based on the vector we constructed.
AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, "", Call->getParent());
+ Args, OpBundles, "", Call->getParent());
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
cast<InvokeInst>(New)->setAttributes(NewCallPAL);
} else {
- New = CallInst::Create(NF, Args, "", Call);
+ New = CallInst::Create(NF, Args, OpBundles, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
cast<CallInst>(New)->setAttributes(NewCallPAL);
if (cast<CallInst>(Call)->isTailCall())
@@ -1045,8 +981,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// If we change the return value of the function we must rewrite any return
// instructions. Check this now.
if (F->getReturnType() != NF->getReturnType())
- for (Function::iterator BB = NF->begin(), E = NF->end(); BB != E; ++BB)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ for (BasicBlock &BB : *NF)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
Value *RetVal;
if (NFTy->getReturnType()->isVoidTy()) {
@@ -1081,7 +1017,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Replace the return instruction with one returning the new return
// value (possibly 0 if we became void).
ReturnInst::Create(F->getContext(), RetVal, RI);
- BB->getInstList().erase(RI);
+ BB.getInstList().erase(RI);
}
// Patch the pointer to LLVM function in debug info descriptor.
@@ -1093,14 +1029,15 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
return true;
}
-bool DAE::runOnModule(Module &M) {
+PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
+ ModuleAnalysisManager &) {
bool Changed = false;
// First pass: Do a simple check to see if any functions can have their "..."
// removed. We can do this if they never call va_start. This loop cannot be
// fused with the next loop, because deleting a function invalidates
// information computed while surveying other functions.
- DEBUG(dbgs() << "DAE - Deleting dead varargs\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
Function &F = *I++;
if (F.getFunctionType()->isVarArg())
@@ -1111,7 +1048,7 @@ bool DAE::runOnModule(Module &M) {
// We assume all arguments are dead unless proven otherwise (allowing us to
// determine that dead arguments passed into recursive functions are dead).
//
- DEBUG(dbgs() << "DAE - Determining liveness\n");
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
for (auto &F : M)
SurveyFunction(F);
@@ -1129,5 +1066,7 @@ bool DAE::runOnModule(Module &M) {
for (auto &F : M)
Changed |= RemoveDeadArgumentsFromCallers(F);
- return Changed;
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
}
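
Note on the liveness scheme above: the Uses multimap (now declared alongside DeadArgumentEliminationPass in its header) maps a return value or argument to the MaybeLive values it uses, and MarkLive/PropagateLiveness walk those edges so that marking one value live transitively marks everything it uses. A minimal standalone sketch of that propagation, using toy integer value ids rather than the LLVM RetOrArg type:

#include <map>
#include <set>
#include <vector>

// Toy version of the DAE liveness propagation: Uses maps a value to the
// MaybeLive values it uses, so marking a value live transitively marks
// live everything reachable from it through Uses.
using ValueId = int;

static void markLive(ValueId V, const std::multimap<ValueId, ValueId> &Uses,
                     std::set<ValueId> &LiveValues) {
  std::vector<ValueId> Worklist{V};
  while (!Worklist.empty()) {
    ValueId Cur = Worklist.back();
    Worklist.pop_back();
    if (!LiveValues.insert(Cur).second)
      continue; // Already live; nothing new to propagate.
    auto Range = Uses.equal_range(Cur);
    for (auto I = Range.first; I != Range.second; ++I)
      Worklist.push_back(I->second); // Everything Cur uses becomes live.
  }
}

With Uses = {1 -> 2, 2 -> 3}, markLive(1, Uses, LiveValues) leaves LiveValues as {1, 2, 3}; the real pass performs the same walk over RetOrArg nodes, with the extra rule that a function in LiveFunctions pins all of its arguments and return values live.
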
diff --git a/lib/Transforms/IPO/ElimAvailExtern.cpp b/lib/Transforms/IPO/ElimAvailExtern.cpp
index af313a6b001d..98c4b1740306 100644
--- a/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -13,10 +13,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Pass.h"
using namespace llvm;
@@ -26,30 +27,7 @@ using namespace llvm;
STATISTIC(NumFunctions, "Number of functions removed");
STATISTIC(NumVariables, "Number of global variables removed");
-namespace {
-struct EliminateAvailableExternally : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- EliminateAvailableExternally() : ModulePass(ID) {
- initializeEliminateAvailableExternallyPass(
- *PassRegistry::getPassRegistry());
- }
-
- // run - Do the EliminateAvailableExternally pass on the specified module,
- // optionally updating the specified callgraph to reflect the changes.
- //
- bool runOnModule(Module &M) override;
-};
-}
-
-char EliminateAvailableExternally::ID = 0;
-INITIALIZE_PASS(EliminateAvailableExternally, "elim-avail-extern",
- "Eliminate Available Externally Globals", false, false)
-
-ModulePass *llvm::createEliminateAvailableExternallyPass() {
- return new EliminateAvailableExternally();
-}
-
-bool EliminateAvailableExternally::runOnModule(Module &M) {
+static bool eliminateAvailableExternally(Module &M) {
bool Changed = false;
// Drop initializers of available externally global variables.
@@ -82,3 +60,37 @@ bool EliminateAvailableExternally::runOnModule(Module &M) {
return Changed;
}
+
+PreservedAnalyses
+EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
+ if (!eliminateAvailableExternally(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct EliminateAvailableExternallyLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the EliminateAvailableExternally pass on the specified module,
+ // optionally updating the specified callgraph to reflect the changes.
+ //
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ return eliminateAvailableExternally(M);
+ }
+};
+}
+
+char EliminateAvailableExternallyLegacyPass::ID = 0;
+INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternallyLegacyPass();
+}
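
The ElimAvailExtern rewrite above, like the DeadArgumentElimination one before it, follows the same porting recipe: the transform lives behind the new pass manager entry point (or a shared static helper), while a thin legacy ModulePass wrapper checks skipModule() and delegates to it (DAE even invokes the new pass with a throwaway ModuleAnalysisManager). The following is a hedged sketch of that shape for a hypothetical FooPass, with the registration boilerplate omitted; it illustrates the pattern and is not code from this patch:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;

// Hypothetical new-PM pass; in-tree passes declare this in a header under
// llvm/Transforms/ and wire it into the pass registries.
struct FooPass : PassInfoMixin<FooPass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
};

// Shared transform body used by both pass manager interfaces.
static bool runFoo(Module &M) {
  bool Changed = false;
  // ... perform the actual rewrite on M here ...
  return Changed;
}

PreservedAnalyses FooPass::run(Module &M, ModuleAnalysisManager &) {
  return runFoo(M) ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

namespace {
struct FooLegacyPass : public ModulePass {
  static char ID;
  FooLegacyPass() : ModulePass(ID) {}
  bool runOnModule(Module &M) override {
    if (skipModule(M)) // Honor optnone/opt-bisect handling in the legacy PM.
      return false;
    return runFoo(M);
  }
};
}
char FooLegacyPass::ID = 0;

The INITIALIZE_PASS/initialize*Pass plumbing and create*Pass factory shown in the hunks above complete the legacy registration; only the wrapper shape is sketched here.
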
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 1a3b9253d72f..479fd182598a 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -68,6 +68,9 @@ namespace {
: ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
// Visit the global inline asm.
if (!deleteStuff)
M.setModuleInlineAsm("");
@@ -101,20 +104,20 @@ namespace {
}
// Visit the Functions.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ for (Function &F : M) {
bool Delete =
- deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration();
+ deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
if (!Delete) {
- if (I->hasAvailableExternallyLinkage())
+ if (F.hasAvailableExternallyLinkage())
continue;
}
- makeVisible(*I, Delete);
+ makeVisible(F, Delete);
if (Delete) {
// Make this a declaration and drop its comdat.
- I->deleteBody();
- I->setComdat(nullptr);
+ F.deleteBody();
+ F.setComdat(nullptr);
}
}
@@ -128,7 +131,7 @@ namespace {
makeVisible(*CurI, Delete);
if (Delete) {
- Type *Ty = CurI->getType()->getElementType();
+ Type *Ty = CurI->getValueType();
CurI->removeFromParent();
llvm::Value *Declaration;
diff --git a/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 6df044762cf4..968712138208 100644
--- a/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -80,7 +80,8 @@ static void addForcedAttributes(Function &F) {
}
}
-PreservedAnalyses ForceFunctionAttrsPass::run(Module &M) {
+PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &) {
if (ForceAttributes.empty())
return PreservedAnalyses::all();
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 527fdd1885a4..fff544085414 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -13,6 +13,7 @@
///
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -52,38 +53,6 @@ typedef SmallSetVector<Function *, 8> SCCNodeSet;
}
namespace {
-struct PostOrderFunctionAttrs : public CallGraphSCCPass {
- static char ID; // Pass identification, replacement for typeid
- PostOrderFunctionAttrs() : CallGraphSCCPass(ID) {
- initializePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
-private:
- TargetLibraryInfo *TLI;
-};
-}
-
-char PostOrderFunctionAttrs::ID = 0;
-INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrs, "functionattrs",
- "Deduce function attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PostOrderFunctionAttrs, "functionattrs",
- "Deduce function attributes", false, false)
-
-Pass *llvm::createPostOrderFunctionAttrsPass() { return new PostOrderFunctionAttrs(); }
-
-namespace {
/// The three kinds of memory access relevant to 'readonly' and
/// 'readnone' attributes.
enum MemoryAccessKind {
@@ -100,9 +69,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
// Already perfect!
return MAK_ReadNone;
- // Definitions with weak linkage may be overridden at linktime with
- // something that writes memory, so treat them like declarations.
- if (F.isDeclaration() || F.mayBeOverridden()) {
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the comment
+ // on GlobalValue::isDefinitionExact for more details.
+ if (!F.hasExactDefinition()) {
if (AliasAnalysis::onlyReadsMemory(MRB))
return MAK_ReadOnly;
@@ -119,8 +89,12 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
// Detect these now, skipping to the next instruction if one is found.
CallSite CS(cast<Value>(I));
if (CS) {
- // Ignore calls to functions in the same SCC.
- if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+ // Ignore calls to functions in the same SCC, as long as the call sites
+ // don't have operand bundles. Calls with operand bundles are allowed to
+ // have memory effects not described by the memory effects of the call
+ // target.
+ if (!CS.hasOperandBundles() && CS.getCalledFunction() &&
+ SCCNodes.count(CS.getCalledFunction()))
continue;
FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS);
@@ -311,8 +285,7 @@ struct ArgumentUsesTracker : public CaptureTracker {
}
Function *F = CS.getCalledFunction();
- if (!F || F->isDeclaration() || F->mayBeOverridden() ||
- !SCCNodes.count(F)) {
+ if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
Captured = true;
return true;
}
@@ -490,6 +463,11 @@ determinePointerReadAttrs(Argument *A,
}
case Instruction::Load:
+ // A volatile load has side effects beyond what readonly can be relied
+ // upon.
+ if (cast<LoadInst>(I)->isVolatile())
+ return Attribute::None;
+
IsRead = true;
break;
@@ -517,9 +495,10 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
// Check each function in turn, determining which pointer arguments are not
// captured.
for (Function *F : SCCNodes) {
- // Definitions with weak linkage may be overridden at linktime with
- // something that captures pointers, so treat them like declarations.
- if (F->isDeclaration() || F->mayBeOverridden())
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
continue;
// Functions that are readonly (or readnone) and nounwind and don't return
@@ -557,12 +536,9 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
// then it must be calling into another function in our SCC. Save
// its particulars for Argument-SCC analysis later.
ArgumentGraphNode *Node = AG[&*A];
- for (SmallVectorImpl<Argument *>::iterator
- UI = Tracker.Uses.begin(),
- UE = Tracker.Uses.end();
- UI != UE; ++UI) {
- Node->Uses.push_back(AG[*UI]);
- if (*UI != A)
+ for (Argument *Use : Tracker.Uses) {
+ Node->Uses.push_back(AG[Use]);
+ if (Use != &*A)
HasNonLocalUses = true;
}
}
@@ -627,17 +603,15 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
// Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
// quickly looking up whether a given Argument is in this ArgumentSCC.
- for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) {
- ArgumentSCCNodes.insert((*I)->Definition);
+ for (ArgumentGraphNode *I : ArgumentSCC) {
+ ArgumentSCCNodes.insert(I->Definition);
}
for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
I != E && !SCCCaptured; ++I) {
ArgumentGraphNode *N = *I;
- for (SmallVectorImpl<ArgumentGraphNode *>::iterator UI = N->Uses.begin(),
- UE = N->Uses.end();
- UI != UE; ++UI) {
- Argument *A = (*UI)->Definition;
+ for (ArgumentGraphNode *Use : N->Uses) {
+ Argument *A = Use->Definition;
if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
continue;
SCCCaptured = true;
@@ -703,8 +677,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
/// doesn't alias any other pointer visible to the caller.
static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
SmallSetVector<Value *, 8> FlowsToReturn;
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
+ for (BasicBlock &BB : *F)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
FlowsToReturn.insert(Ret->getReturnValue());
for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
@@ -772,9 +746,10 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
if (F->doesNotAlias(0))
continue;
- // Definitions with weak linkage may be overridden at linktime, so
- // treat them like declarations.
- if (F->isDeclaration() || F->mayBeOverridden())
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
return false;
// We annotate noalias return values, which are only applicable to
@@ -807,7 +782,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
/// \p Speculative based on whether the returned conclusion is a speculative
/// conclusion due to SCC calls.
static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
- const TargetLibraryInfo &TLI, bool &Speculative) {
+ bool &Speculative) {
assert(F->getReturnType()->isPointerTy() &&
"nonnull only meaningful on pointer types");
Speculative = false;
@@ -821,7 +796,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
Value *RetVal = FlowsToReturn[i];
// If this value is locally known to be non-null, we're good
- if (isKnownNonNull(RetVal, &TLI))
+ if (isKnownNonNull(RetVal))
continue;
// Otherwise, we need to look upwards since we can't make any local
@@ -870,8 +845,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
}
/// Deduce nonnull attributes for the SCC.
-static bool addNonNullAttrs(const SCCNodeSet &SCCNodes,
- const TargetLibraryInfo &TLI) {
+static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// Speculative that all functions in the SCC return only nonnull
// pointers. We may refute this as we analyze functions.
bool SCCReturnsNonNull = true;
@@ -886,9 +860,10 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes,
Attribute::NonNull))
continue;
- // Definitions with weak linkage may be overridden at linktime, so
- // treat them like declarations.
- if (F->isDeclaration() || F->mayBeOverridden())
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
return false;
// We annotate nonnull return values, which are only applicable to
@@ -897,7 +872,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes,
continue;
bool Speculative = false;
- if (isReturnNonNull(F, SCCNodes, TLI, Speculative)) {
+ if (isReturnNonNull(F, SCCNodes, Speculative)) {
if (!Speculative) {
// Mark the function eagerly since we may discover a function
// which prevents us from speculating about the entire SCC
@@ -930,6 +905,49 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes,
return MadeChange;
}
+/// Remove the convergent attribute from all functions in the SCC if every
+/// callsite within the SCC is not convergent (except for calls to functions
+/// within the SCC). Returns true if changes were made.
+static bool removeConvergentAttrs(const SCCNodeSet &SCCNodes) {
+ // For every function in SCC, ensure that either
+ // * it is not convergent, or
+ // * we can remove its convergent attribute.
+ bool HasConvergentFn = false;
+ for (Function *F : SCCNodes) {
+ if (!F->isConvergent()) continue;
+ HasConvergentFn = true;
+
+ // Can't remove convergent from function declarations.
+ if (F->isDeclaration()) return false;
+
+ // Can't remove convergent if any of our functions has a convergent call to a
+ // function not in the SCC.
+ for (Instruction &I : instructions(*F)) {
+ CallSite CS(&I);
+ // Bail if CS is a convergent call to a function not in the SCC.
+ if (CS && CS.isConvergent() &&
+ SCCNodes.count(CS.getCalledFunction()) == 0)
+ return false;
+ }
+ }
+
+ // If the SCC doesn't have any convergent functions, we have nothing to do.
+ if (!HasConvergentFn) return false;
+
+ // If we got here, all of the calls the SCC makes to functions not in the SCC
+ // are non-convergent. Therefore all of the SCC's functions can also be made
+ // non-convergent. We'll remove the attr from the callsites in
+ // InstCombineCalls.
+ for (Function *F : SCCNodes) {
+ if (!F->isConvergent()) continue;
+
+ DEBUG(dbgs() << "Removing convergent attr from fn " << F->getName()
+ << "\n");
+ F->setNotConvergent();
+ }
+ return true;
+}
+
static bool setDoesNotRecurse(Function &F) {
if (F.doesNotRecurse())
return false;
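
The removeConvergentAttrs comments above describe a purely SCC-local rule: convergent may be stripped only when no convergent function in the SCC is a bare declaration and no convergent call site escapes the SCC. A small standalone model of that decision, with toy types standing in for llvm::Function and CallSite:

#include <set>
#include <vector>

// Toy model of the rule implemented by removeConvergentAttrs(): convergent
// may be stripped from an SCC only if no convergent function in the SCC is
// a declaration and no convergent call site targets a function outside the
// SCC. Types here are illustrative only.
struct ToyFunction {
  bool IsConvergent;
  bool IsDeclaration;
  // Callees of the convergent call sites in this function's body.
  std::vector<const ToyFunction *> ConvergentCallees;
};

static bool canStripConvergent(const std::set<const ToyFunction *> &SCC) {
  bool HasConvergentFn = false;
  for (const ToyFunction *F : SCC) {
    if (!F->IsConvergent)
      continue;
    HasConvergentFn = true;
    if (F->IsDeclaration)
      return false; // Can't rewrite a declaration.
    for (const ToyFunction *Callee : F->ConvergentCallees)
      if (!SCC.count(Callee))
        return false; // Convergent behavior escapes the SCC.
  }
  return HasConvergentFn; // Only report a change if there was work to do.
}

In the pass itself the same check is followed by F->setNotConvergent() on every convergent function in the SCC, with the call-site attributes cleaned up later in InstCombineCalls, as the comment above notes.
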
@@ -938,56 +956,129 @@ static bool setDoesNotRecurse(Function &F) {
return true;
}
-static bool addNoRecurseAttrs(const CallGraphSCC &SCC) {
+static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
// Try and identify functions that do not recurse.
// If the SCC contains multiple nodes we know for sure there is recursion.
- if (!SCC.isSingular())
+ if (SCCNodes.size() != 1)
return false;
- const CallGraphNode *CGN = *SCC.begin();
- Function *F = CGN->getFunction();
+ Function *F = *SCCNodes.begin();
if (!F || F->isDeclaration() || F->doesNotRecurse())
return false;
// If all of the calls in F are identifiable and are to norecurse functions, F
// is norecurse. This check also detects self-recursion, as F is not currently
// marked norecurse, so any call from F to F will not be marked norecurse.
- if (std::all_of(CGN->begin(), CGN->end(),
- [](const CallGraphNode::CallRecord &CR) {
- Function *F = CR.second->getFunction();
- return F && F->doesNotRecurse();
- }))
- // Function calls a potentially recursive function.
- return setDoesNotRecurse(*F);
-
- // Nothing else we can deduce usefully during the postorder traversal.
- return false;
+ for (Instruction &I : instructions(*F))
+ if (auto CS = CallSite(&I)) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee == F || !Callee->doesNotRecurse())
+ // Function calls a potentially recursive function.
+ return false;
+ }
+
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function cannot
+ // recurse.
+ return setDoesNotRecurse(*F);
}
-bool PostOrderFunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- bool Changed = false;
+PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C).getManager();
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
+ // We pass a lambda into functions to wire them up to the analysis manager
+ // for getting function analyses.
auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
+ return FAM.getResult<AAManager>(F);
};
+ // Fill SCCNodes with the elements of the SCC. Also track whether there are
+ // any external or opt-none nodes that will prevent us from optimizing any
+ // part of the SCC.
+ SCCNodeSet SCCNodes;
+ bool HasUnknownCall = false;
+ for (LazyCallGraph::Node &N : C) {
+ Function &F = N.getFunction();
+ if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ // Treat any function we're trying not to optimize as if it were an
+ // indirect call and omit it from the node set used below.
+ HasUnknownCall = true;
+ continue;
+ }
+ // Track whether any functions in this SCC have an unknown call edge.
+ // Note: if this is ever a performance hit, we can common it with
+ // subsequent routines which also do scans over the instructions of the
+ // function.
+ if (!HasUnknownCall)
+ for (Instruction &I : instructions(F))
+ if (auto CS = CallSite(&I))
+ if (!CS.getCalledFunction()) {
+ HasUnknownCall = true;
+ break;
+ }
+
+ SCCNodes.insert(&F);
+ }
+
+ bool Changed = false;
+ Changed |= addReadAttrs(SCCNodes, AARGetter);
+ Changed |= addArgumentAttrs(SCCNodes);
+
+ // If we have no external nodes participating in the SCC, we can deduce some
+ // more precise attributes as well.
+ if (!HasUnknownCall) {
+ Changed |= addNoAliasAttrs(SCCNodes);
+ Changed |= addNonNullAttrs(SCCNodes);
+ Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= addNoRecurseAttrs(SCCNodes);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+namespace {
+struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
+ initializePostOrderFunctionAttrsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char PostOrderFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs",
+ "Deduce function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "functionattrs",
+ "Deduce function attributes", false, false)
+
+Pass *llvm::createPostOrderFunctionAttrsLegacyPass() { return new PostOrderFunctionAttrsLegacyPass(); }
+
+template <typename AARGetterT>
+static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
+ bool Changed = false;
+
// Fill SCCNodes with the elements of the SCC. Used for quickly looking up
// whether a given CallGraphNode is in this SCC. Also track whether there are
// any external or opt-none nodes that will prevent us from optimizing any
// part of the SCC.
SCCNodeSet SCCNodes;
bool ExternalNode = false;
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
- Function *F = (*I)->getFunction();
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
// External node or function we're trying not to optimize - we both avoid
// transform them and avoid leveraging information they provide.
@@ -1005,28 +1096,37 @@ bool PostOrderFunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
// more precise attributes as well.
if (!ExternalNode) {
Changed |= addNoAliasAttrs(SCCNodes);
- Changed |= addNonNullAttrs(SCCNodes, *TLI);
+ Changed |= addNonNullAttrs(SCCNodes);
+ Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= addNoRecurseAttrs(SCCNodes);
}
- Changed |= addNoRecurseAttrs(SCC);
return Changed;
}
+bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+
+ // We compute dedicated AA results for each function in the SCC as needed. We
+ // use a lambda referencing external objects so that they live long enough to
+ // be queried, but we re-use them each time.
+ Optional<BasicAAResult> BAR;
+ Optional<AAResults> AAR;
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ BAR.emplace(createLegacyPMBasicAAResult(*this, F));
+ AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
+ return *AAR;
+ };
+
+ return runImpl(SCC, AARGetter);
+}
+
namespace {
-/// A pass to do RPO deduction and propagation of function attributes.
-///
-/// This pass provides a general RPO or "top down" propagation of
-/// function attributes. For a few (rare) cases, we can deduce significantly
-/// more about function attributes by working in RPO, so this pass
-/// provides the compliment to the post-order pass above where the majority of
-/// deduction is performed.
-// FIXME: Currently there is no RPO CGSCC pass structure to slide into and so
-// this is a boring module pass, but eventually it should be an RPO CGSCC pass
-// when such infrastructure is available.
-struct ReversePostOrderFunctionAttrs : public ModulePass {
+struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
- ReversePostOrderFunctionAttrs() : ModulePass(ID) {
- initializeReversePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry());
+ ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override;
@@ -1034,19 +1134,20 @@ struct ReversePostOrderFunctionAttrs : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<CallGraphWrapperPass>();
+ AU.addPreserved<CallGraphWrapperPass>();
}
};
}
-char ReversePostOrderFunctionAttrs::ID = 0;
-INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs",
"Deduce function attributes in RPO", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs",
"Deduce function attributes in RPO", false, false)
Pass *llvm::createReversePostOrderFunctionAttrsPass() {
- return new ReversePostOrderFunctionAttrs();
+ return new ReversePostOrderFunctionAttrsLegacyPass();
}
static bool addNoRecurseAttrsTopDown(Function &F) {
@@ -1078,7 +1179,7 @@ static bool addNoRecurseAttrsTopDown(Function &F) {
return setDoesNotRecurse(F);
}
-bool ReversePostOrderFunctionAttrs::runOnModule(Module &M) {
+static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
// We only have a post-order SCC traversal (because SCCs are inherently
// discovered in post-order), so we accumulate them in a vector and then walk
// it in reverse. This is simpler than using the RPO iterator infrastructure
@@ -1086,7 +1187,6 @@ bool ReversePostOrderFunctionAttrs::runOnModule(Module &M) {
// graph. We can also cheat egregiously because we're primarily interested in
// synthesizing norecurse and so we can only save the singular SCCs as SCCs
// with multiple functions in them will clearly be recursive.
- auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
SmallVector<Function *, 16> Worklist;
for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
if (I->size() != 1)
@@ -1104,3 +1204,24 @@ bool ReversePostOrderFunctionAttrs::runOnModule(Module &M) {
return Changed;
}
+
+bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ return deduceFunctionAttributeInRPO(M, CG);
+}
+
+PreservedAnalyses
+ReversePostOrderFunctionAttrsPass::run(Module &M, AnalysisManager<Module> &AM) {
+ auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+ bool Changed = deduceFunctionAttributeInRPO(M, CG);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
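
Both norecurse paths in this file reduce to the same local rule: a function that sits alone in its SCC cannot recurse if every direct callee is known, is not the function itself, and is already norecurse (addNoRecurseAttrs applies this bottom-up over the function's instructions, while addNoRecurseAttrsTopDown applies a complementary check in RPO). A rough standalone model of the bottom-up rule, with toy call-graph types rather than LLVM's:

#include <set>
#include <vector>

// Toy model of the bottom-up norecurse deduction in addNoRecurseAttrs():
// a function alone in its SCC cannot recurse if every direct callee is
// known, is not the function itself, and is already norecurse.
struct ToyFunction {
  bool NoRecurse = false;
  // nullptr stands in for an unknown (indirect) callee.
  std::vector<const ToyFunction *> Callees;
};

static bool deduceNoRecurse(ToyFunction &F,
                            const std::set<const ToyFunction *> &SCC) {
  if (SCC.size() != 1 || F.NoRecurse)
    return false; // Multi-node SCCs are recursive by construction.
  for (const ToyFunction *Callee : F.Callees)
    if (!Callee || Callee == &F || !Callee->NoRecurse)
      return false; // Possibly recursive call.
  F.NoRecurse = true;
  return true;
}

With SCC = {F} and F calling only functions already marked norecurse (and never itself), deduceNoRecurse flips F to norecurse, mirroring the setDoesNotRecurse(*F) call above.
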
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 5e0df9505119..c9d075e76325 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -13,329 +13,670 @@
#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
-#include "llvm/Object/FunctionIndexObjectFile.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
-#include <map>
+#define DEBUG_TYPE "function-import"
using namespace llvm;
-#define DEBUG_TYPE "function-import"
+STATISTIC(NumImported, "Number of functions imported");
/// Limit on instruction count of imported functions.
static cl::opt<unsigned> ImportInstrLimit(
"import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
cl::desc("Only import functions with less than N instructions"));
+static cl::opt<float>
+ ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
+ cl::Hidden, cl::value_desc("x"),
+ cl::desc("As we import functions, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
+ cl::desc("Print imported functions"));
+
+// Temporary allows the function import pass to disable always linking
+// referenced discardable symbols.
+static cl::opt<bool>
+ DontForceImportReferencedDiscardableSymbols("disable-force-link-odr",
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EnableImportMetadata(
+ "enable-import-metadata", cl::init(
+#if !defined(NDEBUG)
+ true /*Enabled with asserts.*/
+#else
+ false
+#endif
+ ),
+ cl::Hidden, cl::desc("Enable import metadata like 'thinlto_src_module'"));
+
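
The interaction of import-instr-limit and import-instr-evolution-factor above is geometric: every level of transitive importing shrinks the instruction budget by the factor before the newly imported function's own callees are considered. A minimal numeric sketch, assuming the defaults shown (limit 100, factor 0.7) and a straight per-level multiplication:

#include <cstdio>

// Effective import threshold by import depth, assuming the defaults shown
// above: -import-instr-limit=100 and -import-instr-evolution-factor=0.7.
int main() {
  double Threshold = 100.0;  // import-instr-limit
  const double Factor = 0.7; // import-instr-evolution-factor
  for (unsigned Depth = 0; Depth < 4; ++Depth) {
    std::printf("depth %u: threshold ~%.0f instructions\n", Depth, Threshold);
    Threshold *= Factor; // Shrink the budget before visiting the callees of
                         // the functions imported at this depth.
  }
  return 0;
}

This prints thresholds of roughly 100, 70, 49, and 34: a direct callee may be up to 100 instructions, but a callee three imports deep only qualifies when it is considerably smaller, which helps keep the transitive import closure small.
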
// Load lazily a module from \p FileName in \p Context.
static std::unique_ptr<Module> loadFile(const std::string &FileName,
LLVMContext &Context) {
SMDiagnostic Err;
DEBUG(dbgs() << "Loading '" << FileName << "'\n");
- // Metadata isn't loaded or linked until after all functions are
- // imported, after which it will be materialized and linked.
+ // Metadata isn't loaded until functions are imported, to minimize
+ // the memory overhead.
std::unique_ptr<Module> Result =
getLazyIRFileModule(FileName, Err, Context,
/* ShouldLazyLoadMetadata = */ true);
if (!Result) {
Err.print("function-import", errs());
- return nullptr;
+ report_fatal_error("Abort");
}
return Result;
}
namespace {
-/// Helper to load on demand a Module from file and cache it for subsequent
-/// queries. It can be used with the FunctionImporter.
-class ModuleLazyLoaderCache {
- /// Cache of lazily loaded module for import.
- StringMap<std::unique_ptr<Module>> ModuleMap;
- /// Retrieve a Module from the cache or lazily load it on demand.
- std::function<std::unique_ptr<Module>(StringRef FileName)> createLazyModule;
+// Return true if the Summary describes a GlobalValue that can be externally
+// referenced, i.e. it does not need renaming (linkage is not local) or renaming
+// is possible (does not have a section for instance).
+static bool canBeExternallyReferenced(const GlobalValueSummary &Summary) {
+ if (!Summary.needsRenaming())
+ return true;
-public:
- /// Create the loader, Module will be initialized in \p Context.
- ModuleLazyLoaderCache(std::function<
- std::unique_ptr<Module>(StringRef FileName)> createLazyModule)
- : createLazyModule(createLazyModule) {}
-
- /// Retrieve a Module from the cache or lazily load it on demand.
- Module &operator()(StringRef FileName);
-
- std::unique_ptr<Module> takeModule(StringRef FileName) {
- auto I = ModuleMap.find(FileName);
- assert(I != ModuleMap.end());
- std::unique_ptr<Module> Ret = std::move(I->second);
- ModuleMap.erase(I);
- return Ret;
- }
-};
+ if (Summary.hasSection())
+ // Can't rename a global that needs renaming if it has a section.
+ return false;
-// Get a Module for \p FileName from the cache, or load it lazily.
-Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) {
- auto &Module = ModuleMap[Identifier];
- if (!Module)
- Module = createLazyModule(Identifier);
- return *Module;
+ return true;
}
-} // anonymous namespace
-/// Walk through the instructions in \p F looking for external
-/// calls not already in the \p CalledFunctions set. If any are
-/// found they are added to the \p Worklist for importing.
-static void findExternalCalls(const Module &DestModule, Function &F,
- const FunctionInfoIndex &Index,
- StringSet<> &CalledFunctions,
- SmallVector<StringRef, 64> &Worklist) {
- // We need to suffix internal function calls imported from other modules,
- // prepare the suffix ahead of time.
- std::string Suffix;
- if (F.getParent() != &DestModule)
- Suffix =
- (Twine(".llvm.") +
- Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str();
-
- for (auto &BB : F) {
- for (auto &I : BB) {
- if (isa<CallInst>(I)) {
- auto CalledFunction = cast<CallInst>(I).getCalledFunction();
- // Insert any new external calls that have not already been
- // added to set/worklist.
- if (!CalledFunction || !CalledFunction->hasName())
- continue;
- // Ignore intrinsics early
- if (CalledFunction->isIntrinsic()) {
- assert(CalledFunction->getIntrinsicID() != 0);
- continue;
- }
- auto ImportedName = CalledFunction->getName();
- auto Renamed = (ImportedName + Suffix).str();
- // Rename internal functions
- if (CalledFunction->hasInternalLinkage()) {
- ImportedName = Renamed;
- }
- auto It = CalledFunctions.insert(ImportedName);
- if (!It.second) {
- // This is a call to a function we already considered, skip.
- continue;
- }
- // Ignore functions already present in the destination module
- auto *SrcGV = DestModule.getNamedValue(ImportedName);
- if (SrcGV) {
- if (GlobalAlias *SGA = dyn_cast<GlobalAlias>(SrcGV))
- SrcGV = SGA->getBaseObject();
- assert(isa<Function>(SrcGV) && "Name collision during import");
- if (!cast<Function>(SrcGV)->isDeclaration()) {
- DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring "
- << ImportedName << " already in DestinationModule\n");
- continue;
- }
+// Return true if \p GUID describes a GlobalValue that can be externally
+// referenced, i.e. it does not need renaming (linkage is not local) or
+// renaming is possible (does not have a section for instance).
+static bool canBeExternallyReferenced(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID GUID) {
+ auto Summaries = Index.findGlobalValueSummaryList(GUID);
+ if (Summaries == Index.end())
+ return true;
+ if (Summaries->second.size() != 1)
+ // If there are multiple globals with this GUID, then we know it is
+ // not a local symbol, and it is necessarily externally referenced.
+ return true;
+
+ // We don't need to check for the module path, because if it can't be
+ // externally referenced and we call it, it is necessarily in the same
+ // module.
+ return canBeExternallyReferenced(**Summaries->second.begin());
+}
+
+// Return true if the global described by \p Summary can be imported in another
+// module.
+static bool eligibleForImport(const ModuleSummaryIndex &Index,
+ const GlobalValueSummary &Summary) {
+ if (!canBeExternallyReferenced(Summary))
+ // Can't import a global that needs renaming if it has a section, for instance.
+ // FIXME: we may be able to import it by copying it without promotion.
+ return false;
+
+ // Check references (and potential calls) in the same module. If the current
+ // value references a global that can't be externally referenced it is not
+ // eligible for import.
+ bool AllRefsCanBeExternallyReferenced =
+ llvm::all_of(Summary.refs(), [&](const ValueInfo &VI) {
+ return canBeExternallyReferenced(Index, VI.getGUID());
+ });
+ if (!AllRefsCanBeExternallyReferenced)
+ return false;
+
+ if (auto *FuncSummary = dyn_cast<FunctionSummary>(&Summary)) {
+ bool AllCallsCanBeExternallyReferenced = llvm::all_of(
+ FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) {
+ return canBeExternallyReferenced(Index, Edge.first.getGUID());
+ });
+ if (!AllCallsCanBeExternallyReferenced)
+ return false;
+ }
+ return true;
+}
+
+/// Given a list of possible callee implementation for a call site, select one
+/// that fits the \p Threshold.
+///
+/// FIXME: select "best" instead of first that fits. But what is "best"?
+/// - The smallest: more likely to be inlined.
+/// - The one with the least outgoing edges (already well optimized).
+/// - One from a module already being imported from in order to reduce the
+/// number of source modules parsed/linked.
+/// - One that has PGO data attached.
+/// - [insert your fancy metric here]
+static const GlobalValueSummary *
+selectCallee(const ModuleSummaryIndex &Index,
+ const GlobalValueSummaryList &CalleeSummaryList,
+ unsigned Threshold) {
+ auto It = llvm::find_if(
+ CalleeSummaryList,
+ [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
+ auto *GVSummary = SummaryPtr.get();
+ if (GlobalValue::isInterposableLinkage(GVSummary->linkage()))
+ // There is no point in importing these, we can't inline them
+ return false;
+ if (auto *AS = dyn_cast<AliasSummary>(GVSummary)) {
+ GVSummary = &AS->getAliasee();
+ // Alias can't point to "available_externally". However, when we import
+ // linkOnceODR the linkage does not change. So we import the alias
+ // and aliasee only in this case.
+ // FIXME: we should import alias as available_externally *function*,
+ // the destination module does not need to know it is an alias.
+ if (!GlobalValue::isLinkOnceODRLinkage(GVSummary->linkage()))
+ return false;
}
- Worklist.push_back(It.first->getKey());
- DEBUG(dbgs() << DestModule.getModuleIdentifier()
- << ": Adding callee for : " << ImportedName << " : "
- << F.getName() << "\n");
- }
- }
+ auto *Summary = cast<FunctionSummary>(GVSummary);
+
+ if (Summary->instCount() > Threshold)
+ return false;
+
+ if (!eligibleForImport(Index, *Summary))
+ return false;
+
+ return true;
+ });
+ if (It == CalleeSummaryList.end())
+ return nullptr;
+
+ return cast<GlobalValueSummary>(It->get());
+}
+
+/// Return the summary for the function \p GUID that fits the \p Threshold, or
+/// null if there's no match.
+static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID,
+ unsigned Threshold,
+ const ModuleSummaryIndex &Index) {
+ auto CalleeSummaryList = Index.findGlobalValueSummaryList(GUID);
+ if (CalleeSummaryList == Index.end())
+ return nullptr; // This function does not have a summary
+ return selectCallee(Index, CalleeSummaryList->second, Threshold);
+}
+
+/// Mark the global \p GUID as exported by module \p ExportModulePath if found in
+/// this module. If it is a GlobalVariable, we also mark any referenced global
+/// in the current module as exported.
+static void exportGlobalInModule(const ModuleSummaryIndex &Index,
+ StringRef ExportModulePath,
+ GlobalValue::GUID GUID,
+ FunctionImporter::ExportSetTy &ExportList) {
+ auto FindGlobalSummaryInModule =
+ [&](GlobalValue::GUID GUID) -> GlobalValueSummary *{
+ auto SummaryList = Index.findGlobalValueSummaryList(GUID);
+ if (SummaryList == Index.end())
+ // This global does not have a summary; it is not part of the ThinLTO
+ // process.
+ return nullptr;
+ auto SummaryIter = llvm::find_if(
+ SummaryList->second,
+ [&](const std::unique_ptr<GlobalValueSummary> &Summary) {
+ return Summary->modulePath() == ExportModulePath;
+ });
+ if (SummaryIter == SummaryList->second.end())
+ return nullptr;
+ return SummaryIter->get();
+ };
+
+ auto *Summary = FindGlobalSummaryInModule(GUID);
+ if (!Summary)
+ return;
+ // We found it in the current module, mark as exported
+ ExportList.insert(GUID);
+
+ auto GVS = dyn_cast<GlobalVarSummary>(Summary);
+ if (!GVS)
+ return;
+ // FunctionImportGlobalProcessing::doPromoteLocalToGlobal() will always
+ // trigger importing the initializer for `constant unnamed addr` globals that
+ // are referenced. We conservatively export all the referenced symbols for
+ // every global to work around this, so that the ExportList is accurate.
+ // FIXME: with an "isConstant" flag in the summary we could be more targeted.
+ for (auto &Ref : GVS->refs()) {
+ auto GUID = Ref.getGUID();
+ auto *RefSummary = FindGlobalSummaryInModule(GUID);
+ if (RefSummary)
+ // Found a ref in the current module, mark it as exported
+ ExportList.insert(GUID);
}
}
-// Helper function: given a worklist and an index, will process all the worklist
-// and decide what to import based on the summary information.
-//
-// Nothing is actually imported, functions are materialized in their source
-// module and analyzed there.
-//
-// \p ModuleToFunctionsToImportMap is filled with the set of Function to import
-// per Module.
-static void GetImportList(Module &DestModule,
- SmallVector<StringRef, 64> &Worklist,
- StringSet<> &CalledFunctions,
- std::map<StringRef, DenseSet<const GlobalValue *>>
- &ModuleToFunctionsToImportMap,
- const FunctionInfoIndex &Index,
- ModuleLazyLoaderCache &ModuleLoaderCache) {
- while (!Worklist.empty()) {
- auto CalledFunctionName = Worklist.pop_back_val();
- DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for "
- << CalledFunctionName << "\n");
-
- // Try to get a summary for this function call.
- auto InfoList = Index.findFunctionInfoList(CalledFunctionName);
- if (InfoList == Index.end()) {
- DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for "
- << CalledFunctionName << " Ignoring.\n");
+using EdgeInfo = std::pair<const FunctionSummary *, unsigned /* Threshold */>;
+
+/// Compute the list of functions to import for a given caller. Mark these
+/// imported functions and the symbols they reference in their source module as
+/// exported from their source module.
+static void computeImportForFunction(
+ const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
+ unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
+ SmallVectorImpl<EdgeInfo> &Worklist,
+ FunctionImporter::ImportMapTy &ImportsForModule,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ for (auto &Edge : Summary.calls()) {
+ auto GUID = Edge.first.getGUID();
+ DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n");
+
+ if (DefinedGVSummaries.count(GUID)) {
+ DEBUG(dbgs() << "ignored! Target already in destination module.\n");
continue;
}
- assert(!InfoList->second.empty() && "No summary, error at import?");
-
- // Comdat can have multiple entries, FIXME: what do we do with them?
- auto &Info = InfoList->second[0];
- assert(Info && "Nullptr in list, error importing summaries?\n");
-
- auto *Summary = Info->functionSummary();
- if (!Summary) {
- // FIXME: in case we are lazyloading summaries, we can do it now.
- DEBUG(dbgs() << DestModule.getModuleIdentifier()
- << ": Missing summary for " << CalledFunctionName
- << ", error at import?\n");
- llvm_unreachable("Missing summary");
- }
- if (Summary->instCount() > ImportInstrLimit) {
- DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of "
- << CalledFunctionName << " with " << Summary->instCount()
- << " instructions (limit " << ImportInstrLimit << ")\n");
+ auto *CalleeSummary = selectCallee(GUID, Threshold, Index);
+ if (!CalleeSummary) {
+ DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
continue;
}
-
- // Get the module path from the summary.
- auto ModuleIdentifier = Summary->modulePath();
- DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing "
- << CalledFunctionName << " from " << ModuleIdentifier << "\n");
-
- auto &SrcModule = ModuleLoaderCache(ModuleIdentifier);
-
- // The function that we will import!
- GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName);
-
- if (!SGV) {
- // The destination module is referencing function using their renamed name
- // when importing a function that was originally local in the source
- // module. The source module we have might not have been renamed so we try
- // to remove the suffix added during the renaming to recover the original
- // name in the source module.
- std::pair<StringRef, StringRef> Split =
- CalledFunctionName.split(".llvm.");
- SGV = SrcModule.getNamedValue(Split.first);
- assert(SGV && "Can't find function to import in source module");
+ // "Resolve" the summary, traversing an alias if necessary.
+ const FunctionSummary *ResolvedCalleeSummary;
+ if (isa<AliasSummary>(CalleeSummary)) {
+ ResolvedCalleeSummary = cast<FunctionSummary>(
+ &cast<AliasSummary>(CalleeSummary)->getAliasee());
+ assert(
+ GlobalValue::isLinkOnceODRLinkage(ResolvedCalleeSummary->linkage()) &&
+ "Unexpected alias to a non-linkonceODR in import list");
+ } else
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert(ResolvedCalleeSummary->instCount() <= Threshold &&
+ "selectCallee() didn't honor the threshold");
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto &ProcessedThreshold = ImportsForModule[ExportModulePath][GUID];
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back to
+ // the worklist with the new threshold.
+ if (ProcessedThreshold && ProcessedThreshold >= Threshold) {
+ DEBUG(dbgs() << "ignored! Target was already seen with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
}
- if (!SGV) {
- report_fatal_error(Twine("Can't load function '") + CalledFunctionName +
- "' in Module '" + SrcModule.getModuleIdentifier() +
- "', error in the summary?\n");
+ // Mark this function as imported in this module, with the current Threshold
+ ProcessedThreshold = Threshold;
+
+ // Make exports in the source module.
+ if (ExportLists) {
+ auto &ExportList = (*ExportLists)[ExportModulePath];
+ ExportList.insert(GUID);
+ // Mark all functions and globals referenced by this function as exported
+ // to the outside if they are defined in the same source module.
+ for (auto &Edge : ResolvedCalleeSummary->calls()) {
+ auto CalleeGUID = Edge.first.getGUID();
+ exportGlobalInModule(Index, ExportModulePath, CalleeGUID, ExportList);
+ }
+ for (auto &Ref : ResolvedCalleeSummary->refs()) {
+ auto GUID = Ref.getGUID();
+ exportGlobalInModule(Index, ExportModulePath, GUID, ExportList);
+ }
}
- Function *F = dyn_cast<Function>(SGV);
- if (!F && isa<GlobalAlias>(SGV)) {
- auto *SGA = dyn_cast<GlobalAlias>(SGV);
- F = dyn_cast<Function>(SGA->getBaseObject());
- CalledFunctionName = F->getName();
- }
- assert(F && "Imported Function is ... not a Function");
-
- // We cannot import weak_any functions/aliases without possibly affecting
- // the order they are seen and selected by the linker, changing program
- // semantics.
- if (SGV->hasWeakAnyLinkage()) {
- DEBUG(dbgs() << DestModule.getModuleIdentifier()
- << ": Ignoring import request for weak-any "
- << (isa<Function>(SGV) ? "function " : "alias ")
- << CalledFunctionName << " from "
- << SrcModule.getModuleIdentifier() << "\n");
+ // Insert the newly imported function into the worklist.
+ Worklist.push_back(std::make_pair(ResolvedCalleeSummary, Threshold));
+ }
+}
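
The loop above is a depth-first walk of the call graph: a callee is only imported if its instruction count fits under the current threshold, and the threshold is rescaled before the callee's own calls are considered. The following is a self-contained sketch of that heuristic using only standard-library containers; ToyFunc, computeImports, the factor 0.7 and the toy call graph are illustrative assumptions, not the LLVM API.

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyFunc {
  unsigned InstCount;
  std::vector<std::string> Callees;
};

// Decide what to import starting from `Root`: walk callees depth-first and
// scale the instruction-count threshold at each level, mirroring the Worklist
// loop in the patch.
static void computeImports(const std::map<std::string, ToyFunc> &CallGraph,
                           const std::string &Root, unsigned Threshold,
                           double Factor,
                           std::map<std::string, unsigned> &Imports) {
  std::vector<std::pair<std::string, unsigned>> Worklist;
  Worklist.emplace_back(Root, Threshold);
  while (!Worklist.empty()) {
    std::pair<std::string, unsigned> Cur = Worklist.back();
    Worklist.pop_back();
    for (const std::string &Callee : CallGraph.at(Cur.first).Callees) {
      const ToyFunc &F = CallGraph.at(Callee);
      if (F.InstCount > Cur.second)
        continue; // too big for the current threshold
      unsigned &Seen = Imports[Callee];
      if (Seen >= Cur.second)
        continue; // already handled with an equal or higher threshold
      Seen = Cur.second;
      // Callees of an imported function are visited with a scaled threshold.
      Worklist.emplace_back(Callee,
                            static_cast<unsigned>(Cur.second * Factor));
    }
  }
}

int main() {
  std::map<std::string, ToyFunc> CG = {{"main", {10, {"a", "b"}}},
                                       {"a", {20, {"c"}}},
                                       {"b", {500, {}}},
                                       {"c", {30, {}}}};
  std::map<std::string, unsigned> Imports;
  computeImports(CG, "main", /*Threshold=*/100, /*Factor=*/0.7, Imports);
  for (const auto &I : Imports)
    std::printf("import %s (threshold %u)\n", I.first.c_str(), I.second);
  return 0;
}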
+
+/// Given the list of globals defined in a module, compute the list of imports
+/// as well as the list of "exports", i.e. the list of symbols referenced from
+/// another module (that may require promotion).
+static void ComputeImportForModule(
+ const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportsForModule,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ // The worklist contains the functions imported into this module, whose
+ // callees we will analyze and possibly import further down the call graph.
+ SmallVector<EdgeInfo, 128> Worklist;
+
+ // Populate the worklist with the imports for the functions defined in the
+ // current module.
+ for (auto &GVSummary : DefinedGVSummaries) {
+ auto *Summary = GVSummary.second;
+ if (auto *AS = dyn_cast<AliasSummary>(Summary))
+ Summary = &AS->getAliasee();
+ auto *FuncSummary = dyn_cast<FunctionSummary>(Summary);
+ if (!FuncSummary)
+ // Skip import for global variables
continue;
- }
+ DEBUG(dbgs() << "Initialize import for " << GVSummary.first << "\n");
+ computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
+ DefinedGVSummaries, Worklist, ImportsForModule,
+ ExportLists);
+ }
- // Add the function to the import list
- auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()];
- Entry.insert(F);
+ while (!Worklist.empty()) {
+ auto FuncInfo = Worklist.pop_back_val();
+ auto *Summary = FuncInfo.first;
+ auto Threshold = FuncInfo.second;
// Process the newly imported functions and add callees to the worklist.
- F->materialize();
- findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist);
+ // Adjust the threshold for the next level of the call graph.
+ Threshold = Threshold * ImportInstrFactor;
+
+ computeImportForFunction(*Summary, Index, Threshold, DefinedGVSummaries,
+ Worklist, ImportsForModule, ExportLists);
}
}
+} // anonymous namespace
+
+/// Compute all the import and export for every module using the Index.
+void llvm::ComputeCrossModuleImport(
+ const ModuleSummaryIndex &Index,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+ // For each module that has functions defined, compute the import/export lists.
+ for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
+ auto &ImportsForModule = ImportLists[DefinedGVSummaries.first()];
+ DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
+ ComputeImportForModule(DefinedGVSummaries.second, Index, ImportsForModule,
+ &ExportLists);
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
+ for (auto &ModuleImports : ImportLists) {
+ auto ModName = ModuleImports.first();
+ auto &Exports = ExportLists[ModName];
+ DEBUG(dbgs() << "* Module " << ModName << " exports " << Exports.size()
+ << " functions. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
+ for (auto &Src : ModuleImports.second) {
+ auto SrcModName = Src.first();
+ DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
+ << SrcModName << "\n");
+ }
+ }
+#endif
+}
+
+/// Compute all the imports for the given module in the Index.
+void llvm::ComputeCrossModuleImportForModule(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+
+ // Collect the list of functions this module defines.
+ // GUID -> Summary
+ GVSummaryMapTy FunctionSummaryMap;
+ Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
+
+ // Compute the import list for this module.
+ DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ ComputeImportForModule(FunctionSummaryMap, Index, ImportList);
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
+ for (auto &Src : ImportList) {
+ auto SrcModName = Src.first();
+ DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
+ << SrcModName << "\n");
+ }
+#endif
+}
+
+/// Compute the set of summaries needed for a ThinLTO backend compilation of
+/// \p ModulePath.
+void llvm::gatherImportedSummariesForModule(
+ StringRef ModulePath,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ const StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ // Include all summaries from the importing module.
+ ModuleToSummariesForIndex[ModulePath] =
+ ModuleToDefinedGVSummaries.lookup(ModulePath);
+ auto ModuleImports = ImportLists.find(ModulePath);
+ if (ModuleImports != ImportLists.end()) {
+ // Include summaries for imports.
+ for (auto &ILI : ModuleImports->second) {
+ auto &SummariesForIndex = ModuleToSummariesForIndex[ILI.first()];
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ILI.first());
+ for (auto &GI : ILI.second) {
+ const auto &DS = DefinedGVSummaries.find(GI.first);
+ assert(DS != DefinedGVSummaries.end() &&
+ "Expected a defined summary for imported global value");
+ SummariesForIndex[GI.first] = DS->second;
+ }
+ }
+ }
+}
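
gatherImportedSummariesForModule above collects, per module, the summaries a single ThinLTO backend needs: the importing module's own definitions plus, for each source module, the definitions imported from it. A rough standard-library model of that gathering step follows; ToySummaryMap, gatherNeededSummaries and the use of plain strings instead of GUIDs/summaries are assumptions of the sketch.

#include <map>
#include <set>
#include <string>

// module path -> set of symbol names (stand-in for the GUID -> summary maps).
using ToySummaryMap = std::map<std::string, std::set<std::string>>;

// Collect what a backend compile of `ModulePath` needs: everything the module
// defines itself, plus every imported symbol grouped by its defining module.
static ToySummaryMap
gatherNeededSummaries(const std::string &ModulePath,
                      const ToySummaryMap &Defined,      // module -> defined names
                      const ToySummaryMap &ImportList) { // src module -> imported names
  ToySummaryMap Needed;
  auto DefIt = Defined.find(ModulePath);
  if (DefIt != Defined.end())
    Needed[ModulePath] = DefIt->second;
  for (const auto &Src : ImportList)
    for (const auto &Name : Src.second)
      Needed[Src.first].insert(Name);
  return Needed;
}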
+
+/// Emit the files \p ModulePath will import from into \p OutputFilename.
+std::error_code llvm::EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const StringMap<FunctionImporter::ImportMapTy> &ImportLists) {
+ auto ModuleImports = ImportLists.find(ModulePath);
+ std::error_code EC;
+ raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None);
+ if (EC)
+ return EC;
+ if (ModuleImports != ImportLists.end())
+ for (auto &ILI : ModuleImports->second)
+ ImportsOS << ILI.first() << "\n";
+ return std::error_code();
+}
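
EmitImportsFiles writes a plain text file with one source-module path per line. The consumer side is not part of this patch; a hypothetical reader for such a file, shown only for illustration, could look like this.

#include <fstream>
#include <set>
#include <string>

// Read an imports file as written by EmitImportsFiles: one source-module path
// per line. The result is the set of modules whose output must be available
// to the backend compile (e.g. in a distributed build).
static std::set<std::string> readImportsFile(const std::string &Path) {
  std::set<std::string> Modules;
  std::ifstream In(Path);
  std::string Line;
  while (std::getline(In, Line))
    if (!Line.empty())
      Modules.insert(Line);
  return Modules;
}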
+
+/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
+void llvm::thinLTOResolveWeakForLinkerModule(
+ Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto updateLinkage = [&](GlobalValue &GV) {
+ if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
+ return;
+ // See if the global summary analysis computed a new resolved linkage.
+ const auto &GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end())
+ return;
+ auto NewLinkage = GS->second->linkage();
+ if (NewLinkage == GV.getLinkage())
+ return;
+ DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
+ << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GV.setLinkage(NewLinkage);
+ };
+
+ // Process functions, globals, and aliases now.
+ for (auto &GV : TheModule)
+ updateLinkage(GV);
+ for (auto &GV : TheModule.globals())
+ updateLinkage(GV);
+ for (auto &GV : TheModule.aliases())
+ updateLinkage(GV);
+}
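
thinLTOResolveWeakForLinkerModule replays, on the IR, linkage decisions already made at the summary level for weak-for-linker symbols. Below is a minimal sketch of the same pattern over toy data; ToyLinkage, resolveWeakForLinker and the flat string-keyed maps are assumptions, whereas the real code walks GlobalValues and the summary index.

#include <map>
#include <string>

enum class ToyLinkage { WeakAny, WeakODR, LinkOnceODR, External, Internal };

static bool isWeakForLinker(ToyLinkage L) {
  return L == ToyLinkage::WeakAny || L == ToyLinkage::WeakODR ||
         L == ToyLinkage::LinkOnceODR;
}

// Apply the linkage resolved by the whole-program summary analysis to every
// weak-for-linker symbol defined in the module.
static void resolveWeakForLinker(std::map<std::string, ToyLinkage> &ModuleSyms,
                                 const std::map<std::string, ToyLinkage> &Resolved) {
  for (auto &Sym : ModuleSyms) {
    if (!isWeakForLinker(Sym.second))
      continue;
    auto It = Resolved.find(Sym.first);
    if (It == Resolved.end() || It->second == Sym.second)
      continue;
    Sym.second = It->second; // apply the prevailing-copy decision
  }
}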
+
+/// Run internalization on \p TheModule based on summary analysis.
+void llvm::thinLTOInternalizeModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals) {
+ // Parse inline ASM and collect the list of symbols that are not defined in
+ // the current module.
+ StringSet<> AsmUndefinedRefs;
+ object::IRObjectFile::CollectAsmUndefinedRefs(
+ Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(),
+ [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
+ if (Flags & object::BasicSymbolRef::SF_Undefined)
+ AsmUndefinedRefs.insert(Name);
+ });
+
+ // Declare a callback for the internalize pass that decides, for every
+ // candidate GlobalValue, whether it must be preserved or can be internalized.
+ auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // Can't be internalized if referenced in inline asm.
+ if (AsmUndefinedRefs.count(GV.getName()))
+ return true;
+
+ // Lookup the linkage recorded in the summaries during global analysis.
+ const auto &GS = DefinedGlobals.find(GV.getGUID());
+ GlobalValue::LinkageTypes Linkage;
+ if (GS == DefinedGlobals.end()) {
+ // Must have been promoted (possibly conservatively). Find original
+ // name so that we can access the correct summary and see if it can
+ // be internalized again.
+ // FIXME: Eventually we should control promotion instead of promoting
+ // and internalizing again.
+ StringRef OrigName =
+ ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
+ std::string OrigId = GlobalValue::getGlobalIdentifier(
+ OrigName, GlobalValue::InternalLinkage,
+ TheModule.getSourceFileName());
+ const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ if (GS == DefinedGlobals.end()) {
+ // Also check the original non-promoted non-globalized name. In some
+ // cases a preempted weak value is linked in as a local copy because
+ // it is referenced by an alias (IRLinker::linkGlobalValueProto).
+ // In that case, since it was originally not a local value, it was
+ // recorded in the index using the original name.
+ // FIXME: This may not be needed once PR27866 is fixed.
+ const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ assert(GS != DefinedGlobals.end());
+ Linkage = GS->second->linkage();
+ } else {
+ Linkage = GS->second->linkage();
+ }
+ } else
+ Linkage = GS->second->linkage();
+ return !GlobalValue::isLocalLinkage(Linkage);
+ };
+
+ // FIXME: See if we can just internalize directly here via linkage changes
+ // based on the index, rather than invoking internalizeModule.
+ llvm::internalizeModule(TheModule, MustPreserveGV);
+}
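
The MustPreserveGV callback above reduces to two checks: preserve anything referenced from inline asm, and preserve anything whose summary-resolved linkage is not local. A stripped-down illustration with standard types follows; the names and the conservative fallback when no summary is found are assumptions of this sketch.

#include <map>
#include <set>
#include <string>

enum class ToyLinkage { Internal, External };

// A symbol may be internalized only if it is not referenced from inline asm
// and the linkage recorded for it by the summary analysis is already local.
static bool mustPreserve(const std::string &Name,
                         const std::set<std::string> &AsmUndefinedRefs,
                         const std::map<std::string, ToyLinkage> &Resolved) {
  if (AsmUndefinedRefs.count(Name))
    return true;
  auto It = Resolved.find(Name);
  if (It == Resolved.end())
    return true; // no summary information: conservatively preserve
  return It->second != ToyLinkage::Internal;
}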
+
// Automatically import functions in Module \p DestModule based on the summaries
// index.
//
-// The current implementation imports every called functions that exists in the
-// summaries index.
-bool FunctionImporter::importFunctions(Module &DestModule) {
+bool FunctionImporter::importFunctions(
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList,
+ bool ForceImportReferencedDiscardableSymbols) {
DEBUG(dbgs() << "Starting import for Module "
<< DestModule.getModuleIdentifier() << "\n");
unsigned ImportedCount = 0;
- /// First step is collecting the called external functions.
- StringSet<> CalledFunctions;
- SmallVector<StringRef, 64> Worklist;
- for (auto &F : DestModule) {
- if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone))
- continue;
- findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist);
- }
- if (Worklist.empty())
- return false;
-
- /// Second step: for every call to an external function, try to import it.
-
// Linker that will be used for importing function
Linker TheLinker(DestModule);
-
- // Map of Module -> List of Function to import from the Module
- std::map<StringRef, DenseSet<const GlobalValue *>>
- ModuleToFunctionsToImportMap;
-
- // Analyze the summaries and get the list of functions to import by
- // populating ModuleToFunctionsToImportMap
- ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader);
- GetImportList(DestModule, Worklist, CalledFunctions,
- ModuleToFunctionsToImportMap, Index, ModuleLoaderCache);
- assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList");
-
- StringMap<std::unique_ptr<DenseMap<unsigned, MDNode *>>>
- ModuleToTempMDValsMap;
-
// Do the actual import of functions now, one Module at a time
- for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) {
+ std::set<StringRef> ModuleNameOrderedList;
+ for (auto &FunctionsToImportPerModule : ImportList) {
+ ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
+ }
+ for (auto &Name : ModuleNameOrderedList) {
// Get the module for the import
- auto &FunctionsToImport = FunctionsToImportPerModule.second;
- std::unique_ptr<Module> SrcModule =
- ModuleLoaderCache.takeModule(FunctionsToImportPerModule.first);
+ const auto &FunctionsToImportPerModule = ImportList.find(Name);
+ assert(FunctionsToImportPerModule != ImportList.end());
+ std::unique_ptr<Module> SrcModule = ModuleLoader(Name);
assert(&DestModule.getContext() == &SrcModule->getContext() &&
"Context mismatch");
- // Save the mapping of value ids to temporary metadata created when
- // importing this function. If we have already imported from this module,
- // add new temporary metadata to the existing mapping.
- auto &TempMDVals = ModuleToTempMDValsMap[SrcModule->getModuleIdentifier()];
- if (!TempMDVals)
- TempMDVals = llvm::make_unique<DenseMap<unsigned, MDNode *>>();
+ // If modules were created with lazy metadata loading, materialize it
+ // now, before linking it (otherwise this will be a noop).
+ SrcModule->materializeMetadata();
+ UpgradeDebugInfo(*SrcModule);
+
+ auto &ImportGUIDs = FunctionsToImportPerModule->second;
+ // Find the globals to import
+ DenseSet<const GlobalValue *> GlobalsToImport;
+ for (Function &F : *SrcModule) {
+ if (!F.hasName())
+ continue;
+ auto GUID = F.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function " << GUID
+ << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ F.materialize();
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ F.setMetadata(
+ "thinlto_src_module",
+ llvm::MDNode::get(
+ DestModule.getContext(),
+ {llvm::MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(&F);
+ }
+ }
+ for (GlobalVariable &GV : SrcModule->globals()) {
+ if (!GV.hasName())
+ continue;
+ auto GUID = GV.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global " << GUID
+ << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ GV.materialize();
+ GlobalsToImport.insert(&GV);
+ }
+ }
+ for (GlobalAlias &GA : SrcModule->aliases()) {
+ if (!GA.hasName())
+ continue;
+ auto GUID = GA.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias " << GUID
+ << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ // An alias can't point to an "available_externally" definition. However,
+ // when we import a linkonce_odr definition its linkage does not change, so
+ // we import the alias and its aliasee only in that case. This has been
+ // handled by computeImportForFunction().
+ GlobalObject *GO = GA.getBaseObject();
+ assert(GO->hasLinkOnceODRLinkage() &&
+ "Unexpected alias to a non-linkonceODR in import list");
+#ifndef NDEBUG
+ if (!GlobalsToImport.count(GO))
+ DEBUG(dbgs() << " alias triggers importing aliasee " << GO->getGUID()
+ << " " << GO->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+#endif
+ GO->materialize();
+ GlobalsToImport.insert(GO);
+ GA.materialize();
+ GlobalsToImport.insert(&GA);
+ }
+ }
// Link in the specified functions.
- if (TheLinker.linkInModule(std::move(SrcModule), Linker::Flags::None,
- &Index, &FunctionsToImport, TempMDVals.get()))
+ if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport))
+ return true;
+
+ if (PrintImports) {
+ for (const auto *GV : GlobalsToImport)
+ dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
+ << " from " << SrcModule->getSourceFileName() << "\n";
+ }
+
+ // Instruct the linker that the client will take care of linkonce resolution
+ unsigned Flags = Linker::Flags::None;
+ if (!ForceImportReferencedDiscardableSymbols)
+ Flags |= Linker::Flags::DontForceLinkLinkonceODR;
+
+ if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport))
report_fatal_error("Function Import: link error");
- ImportedCount += FunctionsToImport.size();
+ ImportedCount += GlobalsToImport.size();
}
- // Now link in metadata for all modules from which we imported functions.
- for (StringMapEntry<std::unique_ptr<DenseMap<unsigned, MDNode *>>> &SME :
- ModuleToTempMDValsMap) {
- // Load the specified source module.
- auto &SrcModule = ModuleLoaderCache(SME.getKey());
- // The modules were created with lazy metadata loading. Materialize it
- // now, before linking it.
- SrcModule.materializeMetadata();
- UpgradeDebugInfo(SrcModule);
-
- // Link in all necessary metadata from this module.
- if (TheLinker.linkInMetadata(SrcModule, SME.getValue().get()))
- return false;
- }
+ NumImported += ImportedCount;
DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
<< DestModule.getModuleIdentifier() << "\n");
@@ -355,11 +696,11 @@ static void diagnosticHandler(const DiagnosticInfo &DI) {
OS << '\n';
}
-/// Parse the function index out of an IR file and return the function
+/// Parse the summary index out of an IR file and return the summary
/// index object if found, or nullptr if not.
-static std::unique_ptr<FunctionInfoIndex>
-getFunctionIndexForFile(StringRef Path, std::string &Error,
- DiagnosticHandlerFunction DiagnosticHandler) {
+static std::unique_ptr<ModuleSummaryIndex> getModuleSummaryIndexForFile(
+ StringRef Path, std::string &Error,
+ const DiagnosticHandlerFunction &DiagnosticHandler) {
std::unique_ptr<MemoryBuffer> Buffer;
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getFile(Path);
@@ -368,9 +709,9 @@ getFunctionIndexForFile(StringRef Path, std::string &Error,
return nullptr;
}
Buffer = std::move(BufferOrErr.get());
- ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr =
- object::FunctionIndexObjectFile::create(Buffer->getMemBufferRef(),
- DiagnosticHandler);
+ ErrorOr<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
+ object::ModuleSummaryIndexObjectFile::create(Buffer->getMemBufferRef(),
+ DiagnosticHandler);
if (std::error_code EC = ObjOrErr.getError()) {
Error = EC.message();
return nullptr;
@@ -381,32 +722,34 @@ getFunctionIndexForFile(StringRef Path, std::string &Error,
namespace {
/// Pass that performs cross-module function import provided a summary file.
class FunctionImportPass : public ModulePass {
- /// Optional function summary index to use for importing, otherwise
+ /// Optional module summary index to use for importing, otherwise
/// the summary-file option must be specified.
- const FunctionInfoIndex *Index;
+ const ModuleSummaryIndex *Index;
public:
/// Pass identification, replacement for typeid
static char ID;
/// Specify pass name for debug output
- const char *getPassName() const override {
- return "Function Importing";
- }
+ const char *getPassName() const override { return "Function Importing"; }
- explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr)
+ explicit FunctionImportPass(const ModuleSummaryIndex *Index = nullptr)
: ModulePass(ID), Index(Index) {}
bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
if (SummaryFile.empty() && !Index)
report_fatal_error("error: -function-import requires -summary-file or "
"file from frontend\n");
- std::unique_ptr<FunctionInfoIndex> IndexPtr;
+ std::unique_ptr<ModuleSummaryIndex> IndexPtr;
if (!SummaryFile.empty()) {
if (Index)
report_fatal_error("error: -summary-file and index from frontend\n");
std::string Error;
- IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler);
+ IndexPtr =
+ getModuleSummaryIndexForFile(SummaryFile, Error, diagnosticHandler);
if (!IndexPtr) {
errs() << "Error loading file '" << SummaryFile << "': " << Error
<< "\n";
@@ -415,9 +758,14 @@ public:
Index = IndexPtr.get();
}
- // First we need to promote to global scope and rename any local values that
+ // First step is collecting the import list.
+ FunctionImporter::ImportMapTy ImportList;
+ ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+ ImportList);
+
+ // Next we need to promote to global scope and rename any local values that
// are potentially exported to other modules.
- if (renameModuleForThinLTO(M, Index)) {
+ if (renameModuleForThinLTO(M, *Index, nullptr)) {
errs() << "Error renaming module\n";
return false;
}
@@ -427,7 +775,8 @@ public:
return loadFile(Identifier, M.getContext());
};
FunctionImporter Importer(*Index, ModuleLoader);
- return Importer.importFunctions(M);
+ return Importer.importFunctions(
+ M, ImportList, !DontForceImportReferencedDiscardableSymbols);
}
};
} // anonymous namespace
@@ -439,7 +788,7 @@ INITIALIZE_PASS_END(FunctionImportPass, "function-import",
"Summary Based Function Import", false, false)
namespace llvm {
-Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) {
+Pass *createFunctionImportPass(const ModuleSummaryIndex *Index = nullptr) {
return new FunctionImportPass(Index);
}
}
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 9b276ed28e2e..4c74698a1b61 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -15,15 +15,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Pass.h"
#include <unordered_map>
using namespace llvm;
@@ -31,32 +32,41 @@ using namespace llvm;
STATISTIC(NumAliases , "Number of global aliases removed");
STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumIFuncs, "Number of indirect functions removed");
STATISTIC(NumVariables, "Number of global variables removed");
namespace {
- struct GlobalDCE : public ModulePass {
+ class GlobalDCELegacyPass : public ModulePass {
+ public:
static char ID; // Pass identification, replacement for typeid
- GlobalDCE() : ModulePass(ID) {
- initializeGlobalDCEPass(*PassRegistry::getPassRegistry());
+ GlobalDCELegacyPass() : ModulePass(ID) {
+ initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
}
// run - Do the GlobalDCE pass on the specified module, optionally updating
// the specified callgraph to reflect the changes.
//
- bool runOnModule(Module &M) override;
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ ModuleAnalysisManager DummyMAM;
+ auto PA = Impl.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
private:
- SmallPtrSet<GlobalValue*, 32> AliveGlobals;
- SmallPtrSet<Constant *, 8> SeenConstants;
- std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ GlobalDCEPass Impl;
+ };
+}
- /// GlobalIsNeeded - mark the specific global value as needed, and
- /// recursively mark anything that it uses as also needed.
- void GlobalIsNeeded(GlobalValue *GV);
- void MarkUsedGlobalsAsNeeded(Constant *C);
+char GlobalDCELegacyPass::ID = 0;
+INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
+ "Dead Global Elimination", false, false)
- bool RemoveUnusedGlobalValue(GlobalValue &GV);
- };
+// Public interface to the GlobalDCEPass.
+ModulePass *llvm::createGlobalDCEPass() {
+ return new GlobalDCELegacyPass();
}
/// Returns true if F contains only a single "ret" instruction.
@@ -68,13 +78,7 @@ static bool isEmptyFunction(Function *F) {
return RI.getReturnValue() == nullptr;
}
-char GlobalDCE::ID = 0;
-INITIALIZE_PASS(GlobalDCE, "globaldce",
- "Dead Global Elimination", false, false)
-
-ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
-
-bool GlobalDCE::runOnModule(Module &M) {
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
bool Changed = false;
// Remove empty functions from the global ctors list.
@@ -92,21 +96,14 @@ bool GlobalDCE::runOnModule(Module &M) {
ComdatMembers.insert(std::make_pair(C, &GA));
// Loop over the module, adding globals which are obviously necessary.
- for (Function &F : M) {
- Changed |= RemoveUnusedGlobalValue(F);
- // Functions with external linkage are needed if they have a body
- if (!F.isDeclaration() && !F.hasAvailableExternallyLinkage())
- if (!F.isDiscardableIfUnused())
- GlobalIsNeeded(&F);
- }
-
- for (GlobalVariable &GV : M.globals()) {
- Changed |= RemoveUnusedGlobalValue(GV);
+ for (GlobalObject &GO : M.global_objects()) {
+ Changed |= RemoveUnusedGlobalValue(GO);
+ // Functions with external linkage are needed if they have a body.
// Externally visible & appending globals are needed, if they have an
// initializer.
- if (!GV.isDeclaration() && !GV.hasAvailableExternallyLinkage())
- if (!GV.isDiscardableIfUnused())
- GlobalIsNeeded(&GV);
+ if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage())
+ if (!GO.isDiscardableIfUnused())
+ GlobalIsNeeded(&GO);
}
for (GlobalAlias &GA : M.aliases()) {
@@ -116,6 +113,13 @@ bool GlobalDCE::runOnModule(Module &M) {
GlobalIsNeeded(&GA);
}
+ for (GlobalIFunc &GIF : M.ifuncs()) {
+ Changed |= RemoveUnusedGlobalValue(GIF);
+ // Externally visible ifuncs are needed.
+ if (!GIF.isDiscardableIfUnused())
+ GlobalIsNeeded(&GIF);
+ }
+
// Now that all globals which are needed are in the AliveGlobals set, we loop
// through the program, deleting those which are not alive.
//
@@ -150,6 +154,14 @@ bool GlobalDCE::runOnModule(Module &M) {
GA.setAliasee(nullptr);
}
+ // The third pass drops targets of ifuncs which are dead...
+ std::vector<GlobalIFunc*> DeadIFuncs;
+ for (GlobalIFunc &GIF : M.ifuncs())
+ if (!AliveGlobals.count(&GIF)) {
+ DeadIFuncs.push_back(&GIF);
+ GIF.setResolver(nullptr);
+ }
+
if (!DeadFunctions.empty()) {
// Now that all interferences have been dropped, delete the actual objects
// themselves.
@@ -180,17 +192,29 @@ bool GlobalDCE::runOnModule(Module &M) {
Changed = true;
}
+ // Now delete any dead ifuncs.
+ if (!DeadIFuncs.empty()) {
+ for (GlobalIFunc *GIF : DeadIFuncs) {
+ RemoveUnusedGlobalValue(*GIF);
+ M.getIFuncList().erase(GIF);
+ }
+ NumIFuncs += DeadIFuncs.size();
+ Changed = true;
+ }
+
// Make sure that all memory is released
AliveGlobals.clear();
SeenConstants.clear();
ComdatMembers.clear();
- return Changed;
+ if (Changed)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
}
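
The pass above is a mark-and-sweep over the module: globals that are not discardable-if-unused act as roots, everything transitively referenced from a root is marked alive, and the remainder is deleted. A compact standalone model of the marking phase follows; ToyGlobal and deadGlobals are illustrative names, and the real pass additionally tracks comdat members and drops bodies, aliasees and resolvers before erasing.

#include <map>
#include <set>
#include <string>
#include <vector>

struct ToyGlobal {
  bool Discardable;              // discardable-if-unused (e.g. unused internal)
  std::vector<std::string> Refs; // globals referenced by this one
};

// Mark every non-discardable global and everything it transitively references,
// then report the remaining globals as dead.
static std::set<std::string>
deadGlobals(const std::map<std::string, ToyGlobal> &M) {
  std::set<std::string> Alive;
  std::vector<std::string> Worklist;
  for (const auto &G : M)
    if (!G.second.Discardable)
      Worklist.push_back(G.first);
  while (!Worklist.empty()) {
    std::string Name = Worklist.back();
    Worklist.pop_back();
    if (!Alive.insert(Name).second)
      continue; // already marked alive
    for (const std::string &Ref : M.at(Name).Refs)
      if (M.count(Ref))
        Worklist.push_back(Ref); // ignore references to mere declarations
  }
  std::set<std::string> Dead;
  for (const auto &G : M)
    if (!Alive.count(G.first))
      Dead.insert(G.first);
  return Dead;
}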
/// GlobalIsNeeded - mark the specific global value as needed, and
/// recursively mark anything that it uses as also needed.
-void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
+void GlobalDCEPass::GlobalIsNeeded(GlobalValue *G) {
// If the global is already in the set, no need to reprocess it.
if (!AliveGlobals.insert(G).second)
return;
@@ -205,9 +229,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
// referenced by the initializer to the alive set.
if (GV->hasInitializer())
MarkUsedGlobalsAsNeeded(GV->getInitializer());
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(G)) {
- // The target of a global alias is needed.
- MarkUsedGlobalsAsNeeded(GA->getAliasee());
+ } else if (GlobalIndirectSymbol *GIS = dyn_cast<GlobalIndirectSymbol>(G)) {
+ // The target of a global alias or ifunc is needed.
+ MarkUsedGlobalsAsNeeded(GIS->getIndirectSymbol());
} else {
// Otherwise this must be a function object. We have to scan the body of
// the function looking for constants and global values which are used as
@@ -228,7 +252,7 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
}
}
-void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
+void GlobalDCEPass::MarkUsedGlobalsAsNeeded(Constant *C) {
if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
return GlobalIsNeeded(GV);
@@ -248,7 +272,7 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
// so, nuke it. This will reduce the reference count on the global value, which
// might make it deader.
//
-bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) {
+bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
if (GV.use_empty())
return false;
GV.removeDeadConstantUsers();
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index fd7736905fe8..310c29275faf 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -40,11 +40,11 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
-#include <deque>
using namespace llvm;
#define DEBUG_TYPE "globalopt"
@@ -65,46 +65,6 @@ STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
-namespace {
- struct GlobalOpt : public ModulePass {
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- }
- static char ID; // Pass identification, replacement for typeid
- GlobalOpt() : ModulePass(ID) {
- initializeGlobalOptPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- private:
- bool OptimizeFunctions(Module &M);
- bool OptimizeGlobalVars(Module &M);
- bool OptimizeGlobalAliases(Module &M);
- bool deleteIfDead(GlobalValue &GV);
- bool processGlobal(GlobalValue &GV);
- bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS);
- bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
-
- bool isPointerValueDeadOnEntryToFunction(const Function *F,
- GlobalValue *GV);
-
- TargetLibraryInfo *TLI;
- SmallSet<const Comdat *, 8> NotDiscardableComdats;
- };
-}
-
-char GlobalOpt::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
- "Global Variable Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(GlobalOpt, "globalopt",
- "Global Variable Optimizer", false, false)
-
-ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
-
/// Is this global variable possibly used by a leak checker as a root? If so,
/// we might not really want to eliminate the stores to it.
static bool isLeakCheckerRoot(GlobalVariable *GV) {
@@ -120,7 +80,7 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) {
return false;
SmallVector<Type *, 4> Types;
- Types.push_back(cast<PointerType>(GV->getType())->getElementType());
+ Types.push_back(GV->getValueType());
unsigned Limit = 20;
do {
@@ -329,7 +289,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
// we already know what the result of any load from that GEP is.
// TODO: Handle splats.
if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
- SubInit = Constant::getNullValue(GEP->getType()->getElementType());
+ SubInit = Constant::getNullValue(GEP->getResultElementType());
}
Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI);
@@ -475,7 +435,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
if (!GlobalUsersSafeToSRA(GV))
return nullptr;
- assert(GV->hasLocalLinkage() && !GV->isConstant());
+ assert(GV->hasLocalLinkage());
Constant *Init = GV->getInitializer();
Type *Ty = Init->getType();
@@ -499,6 +459,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
GV->getThreadLocalMode(),
GV->getType()->getAddressSpace());
NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
Globals.push_back(NGV);
NewGlobals.push_back(NGV);
@@ -533,6 +494,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
GV->getThreadLocalMode(),
GV->getType()->getAddressSpace());
NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
Globals.push_back(NGV);
NewGlobals.push_back(NGV);
@@ -867,9 +829,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
}
Constant *RepValue = NewGV;
- if (NewGV->getType() != GV->getType()->getElementType())
- RepValue = ConstantExpr::getBitCast(RepValue,
- GV->getType()->getElementType());
+ if (NewGV->getType() != GV->getValueType())
+ RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
// If there is a comparison against null, we will insert a global bool to
// keep track of whether the global was initialized yet or not.
@@ -1283,6 +1244,9 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
std::vector<Value*> FieldGlobals;
std::vector<Value*> FieldMallocs;
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
unsigned AS = GV->getType()->getPointerAddressSpace();
for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
Type *FieldTy = STy->getElementType(FieldNo);
@@ -1292,6 +1256,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
*GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
nullptr, GV->getThreadLocalMode());
+ NGV->copyAttributesFrom(GV);
FieldGlobals.push_back(NGV);
unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
@@ -1300,7 +1265,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Type *IntPtrTy = DL.getIntPtrType(CI->getType());
Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
ConstantInt::get(IntPtrTy, TypeSize),
- NElems, nullptr,
+ NElems, OpBundles, nullptr,
CI->getName() + ".f" + Twine(FieldNo));
FieldMallocs.push_back(NMI);
new StoreInst(NMI, NGV, CI);
@@ -1359,7 +1324,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Cmp, NullPtrBlock);
// Fill in FreeBlock.
- CallInst::CreateFree(GVVal, BI);
+ CallInst::CreateFree(GVVal, OpBundles, BI);
new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
FreeBlock);
BranchInst::Create(NextBlock, FreeBlock);
@@ -1397,8 +1362,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
// Insert a store of null into each global.
for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
- Constant *Null = Constant::getNullValue(PT->getElementType());
+ Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
+ Constant *Null = Constant::getNullValue(ValTy);
new StoreInst(Null, FieldGlobals[i], SI);
}
// Erase the original store.
@@ -1500,7 +1465,7 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
// into multiple malloc'd arrays, one for each field. This is basically
// SRoA for malloc'd memory.
- if (Ordering != NotAtomic)
+ if (Ordering != AtomicOrdering::NotAtomic)
return false;
// If this is an allocation of a fixed size array of structs, analyze as a
@@ -1525,9 +1490,11 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
- Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy,
- AllocSize, NumElements,
- nullptr, CI->getName());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Instruction *Malloc =
+ CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
+ OpBundles, nullptr, CI->getName());
Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
CI->replaceAllUsesWith(Cast);
CI->eraseFromParent();
@@ -1583,7 +1550,7 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
/// boolean and select between the two values whenever it is used. This exposes
/// the values to other scalar optimizations.
static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
- Type *GVElType = GV->getType()->getElementType();
+ Type *GVElType = GV->getValueType();
// If GVElType is already i1, it is already shrunk. If the type of the GV is
// an FP value, pointer or vector, don't do this optimization because a select
@@ -1611,6 +1578,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
GV->getName()+".b",
GV->getThreadLocalMode(),
GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
Constant *InitVal = GV->getInitializer();
@@ -1679,7 +1647,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
return true;
}
-bool GlobalOpt::deleteIfDead(GlobalValue &GV) {
+static bool deleteIfDead(GlobalValue &GV,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
GV.removeDeadConstantUsers();
if (!GV.isDiscardableIfUnused())
@@ -1703,36 +1672,9 @@ bool GlobalOpt::deleteIfDead(GlobalValue &GV) {
return true;
}
-/// Analyze the specified global variable and optimize it if possible. If we
-/// make a change, return true.
-bool GlobalOpt::processGlobal(GlobalValue &GV) {
- // Do more involved optimizations if the global is internal.
- if (!GV.hasLocalLinkage())
- return false;
-
- GlobalStatus GS;
-
- if (GlobalStatus::analyzeGlobal(&GV, GS))
- return false;
-
- bool Changed = false;
- if (!GS.IsCompared && !GV.hasUnnamedAddr()) {
- GV.setUnnamedAddr(true);
- NumUnnamed++;
- Changed = true;
- }
-
- auto *GVar = dyn_cast<GlobalVariable>(&GV);
- if (!GVar)
- return Changed;
-
- if (GVar->isConstant() || !GVar->hasInitializer())
- return Changed;
-
- return processInternalGlobal(GVar, GS) || Changed;
-}
-
-bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) {
+static bool isPointerValueDeadOnEntryToFunction(
+ const Function *F, GlobalValue *GV,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
// Find all uses of GV. We expect them all to be in F, and if we can't
// identify any of the uses we bail out.
//
@@ -1776,8 +1718,7 @@ bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalVal
// of them are known not to depend on the value of the global at the function
// entry point. We do this by ensuring that every load is dominated by at
// least one store.
- auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F))
- .getDomTree();
+ auto &DT = LookupDomTree(*const_cast<Function *>(F));
// The below check is quadratic. Check we're not going to do too many tests.
// FIXME: Even though this will always have worst-case quadratic time, we
@@ -1866,8 +1807,9 @@ static void makeAllConstantUsesInstructions(Constant *C) {
/// Analyze the specified global variable and optimize
/// it if possible. If we make a change, return true.
-bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
- const GlobalStatus &GS) {
+static bool processInternalGlobal(
+ GlobalVariable *GV, const GlobalStatus &GS, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
auto &DL = GV->getParent()->getDataLayout();
// If this is a first class global and has only one accessing function and
// this function is non-recursive, we replace the global with a local alloca
@@ -1879,16 +1821,17 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
// If the global is in different address space, don't bring it to stack.
if (!GS.HasMultipleAccessingFunctions &&
GS.AccessingFunction &&
- GV->getType()->getElementType()->isSingleValueType() &&
+ GV->getValueType()->isSingleValueType() &&
GV->getType()->getAddressSpace() == 0 &&
!GV->isExternallyInitialized() &&
allNonInstructionUsersCanBeMadeInstructions(GV) &&
GS.AccessingFunction->doesNotRecurse() &&
- isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) {
+ isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
+ LookupDomTree)) {
DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
- Type *ElemTy = GV->getType()->getElementType();
+ Type *ElemTy = GV->getValueType();
// FIXME: Pass Global's alignment when globals have alignment
AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr,
GV->getName(), &FirstI);
@@ -1896,7 +1839,7 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
new StoreInst(GV->getInitializer(), Alloca, &FirstI);
makeAllConstantUsesInstructions(GV);
-
+
GV->replaceAllUsesWith(Alloca);
GV->eraseFromParent();
++NumLocalized;
@@ -1926,7 +1869,8 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
}
return Changed;
- } else if (GS.StoredType <= GlobalStatus::InitializerStored) {
+ }
+ if (GS.StoredType <= GlobalStatus::InitializerStored) {
DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
GV->setConstant(true);
@@ -1939,15 +1883,18 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
<< "all users and delete global!\n");
GV->eraseFromParent();
++NumDeleted;
+ return true;
}
+ // Fall through to the next check; see if we can optimize further.
++NumMarked;
- return true;
- } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+ }
+ if (!GV->getInitializer()->getType()->isSingleValueType()) {
const DataLayout &DL = GV->getParent()->getDataLayout();
if (SRAGlobal(GV, DL))
return true;
- } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
+ }
+ if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
// If the initial value for the global was an undef value, and if only
// one other value was stored into it, we can just change the
// initializer to be the stored value, then delete all stores to the
@@ -1978,7 +1925,7 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
// Otherwise, if the global was not a boolean, we can shrink it to be a
// boolean.
if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
- if (GS.Ordering == NotAtomic) {
+ if (GS.Ordering == AtomicOrdering::NotAtomic) {
if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
++NumShrunkToBool;
return true;
@@ -1990,6 +1937,44 @@ bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,
return false;
}
+/// Analyze the specified global variable and optimize it if possible. If we
+/// make a change, return true.
+static bool
+processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ if (GV.getName().startswith("llvm."))
+ return false;
+
+ GlobalStatus GS;
+
+ if (GlobalStatus::analyzeGlobal(&GV, GS))
+ return false;
+
+ bool Changed = false;
+ if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
+ auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
+ : GlobalValue::UnnamedAddr::Local;
+ if (NewUnnamedAddr != GV.getUnnamedAddr()) {
+ GV.setUnnamedAddr(NewUnnamedAddr);
+ NumUnnamed++;
+ Changed = true;
+ }
+ }
+
+ // Do more involved optimizations if the global is internal.
+ if (!GV.hasLocalLinkage())
+ return Changed;
+
+ auto *GVar = dyn_cast<GlobalVariable>(&GV);
+ if (!GVar)
+ return Changed;
+
+ if (GVar->isConstant() || !GVar->hasInitializer())
+ return Changed;
+
+ return processInternalGlobal(GVar, GS, TLI, LookupDomTree) || Changed;
+}
+
/// Walk all of the direct calls of the specified function, changing them to
/// FastCC.
static void ChangeCalleesToFastCall(Function *F) {
@@ -2034,7 +2019,10 @@ static bool isProfitableToMakeFastCC(Function *F) {
return CC == CallingConv::C || CC == CallingConv::X86_ThisCall;
}
-bool GlobalOpt::OptimizeFunctions(Module &M) {
+static bool
+OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
bool Changed = false;
// Optimize functions.
for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
@@ -2043,12 +2031,12 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
F->setLinkage(GlobalValue::InternalLinkage);
- if (deleteIfDead(*F)) {
+ if (deleteIfDead(*F, NotDiscardableComdats)) {
Changed = true;
continue;
}
- Changed |= processGlobal(*F);
+ Changed |= processGlobal(*F, TLI, LookupDomTree);
if (!F->hasLocalLinkage())
continue;
@@ -2075,7 +2063,10 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
return Changed;
}
-bool GlobalOpt::OptimizeGlobalVars(Module &M) {
+static bool
+OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
bool Changed = false;
for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
@@ -2093,148 +2084,16 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
GV->setInitializer(New);
}
- if (deleteIfDead(*GV)) {
+ if (deleteIfDead(*GV, NotDiscardableComdats)) {
Changed = true;
continue;
}
- Changed |= processGlobal(*GV);
+ Changed |= processGlobal(*GV, TLI, LookupDomTree);
}
return Changed;
}
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL);
-
-/// Return true if the specified constant can be handled by the code generator.
-/// We don't want to generate something like:
-/// void *X = &X/42;
-/// because the code generator doesn't have a relocation that can handle that.
-///
-/// This function should be called if C was not found (but just got inserted)
-/// in SimpleConstants to avoid having to rescan the same constants all the
-/// time.
-static bool
-isSimpleEnoughValueToCommitHelper(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // Simple global addresses are supported, do not allow dllimport or
- // thread-local globals.
- if (auto *GV = dyn_cast<GlobalValue>(C))
- return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
-
- // Simple integer, undef, constant aggregate zero, etc are all supported.
- if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
- return true;
-
- // Aggregate values are safe if all their elements are.
- if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
- isa<ConstantVector>(C)) {
- for (Value *Op : C->operands())
- if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
- return false;
- return true;
- }
-
- // We don't know exactly what relocations are allowed in constant expressions,
- // so we allow &global+constantoffset, which is safe and uniformly supported
- // across targets.
- ConstantExpr *CE = cast<ConstantExpr>(C);
- switch (CE->getOpcode()) {
- case Instruction::BitCast:
- // Bitcast is fine if the casted value is fine.
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::IntToPtr:
- case Instruction::PtrToInt:
- // int <=> ptr is fine if the int type is the same size as the
- // pointer type.
- if (DL.getTypeSizeInBits(CE->getType()) !=
- DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- // GEP is fine if it is simple + constant offset.
- case Instruction::GetElementPtr:
- for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
- if (!isa<ConstantInt>(CE->getOperand(i)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::Add:
- // We allow simple+cst.
- if (!isa<ConstantInt>(CE->getOperand(1)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
- }
- return false;
-}
-
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // If we already checked this constant, we win.
- if (!SimpleConstants.insert(C).second)
- return true;
- // Check the constant.
- return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
-}
-
-
-/// Return true if this constant is simple enough for us to understand. In
-/// particular, if it is a cast to anything other than from one pointer type to
-/// another pointer type, we punt. We basically just support direct accesses to
-/// globals and GEP's of globals. This should be kept up to date with
-/// CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C) {
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
- return false;
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce linkage or external globals.
- return GV->hasUniqueInitializer();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- // Handle a constantexpr gep.
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0)) &&
- cast<GEPOperator>(CE)->isInBounds()) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- if (!GV->hasUniqueInitializer())
- return false;
-
- // The first index must be zero.
- ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
- if (!CI || !CI->isZero()) return false;
-
- // The remaining indices must be compile-time known integers within the
- // notional bounds of the corresponding static array types.
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
-
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
-
- // A constantexpr bitcast from a pointer to another pointer is a no-op,
- // and we know how to evaluate it by moving the bitcast from the pointer
- // operand to the value operand.
- } else if (CE->getOpcode() == Instruction::BitCast &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
- }
- }
-
- return false;
-}
-
/// Evaluate a piece of a constantexpr store into a global initializer. This
/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
/// GEP operands of Addr [0, OpNo) have been stepped into.
@@ -2298,533 +2157,10 @@ static void CommitValueTo(Constant *Val, Constant *Addr) {
GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
}
-namespace {
-
-/// This class evaluates LLVM IR, producing the Constant representing each SSA
-/// instruction. Changes to global variables are stored in a mapping that can
-/// be iterated over after the evaluation is complete. Once an evaluation call
-/// fails, the evaluation object should not be reused.
-class Evaluator {
-public:
- Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI)
- : DL(DL), TLI(TLI) {
- ValueStack.emplace_back();
- }
-
- ~Evaluator() {
- for (auto &Tmp : AllocaTmps)
- // If there are still users of the alloca, the program is doing something
- // silly, e.g. storing the address of the alloca somewhere and using it
- // later. Since this is undefined, we'll just make it be null.
- if (!Tmp->use_empty())
- Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));
- }
-
- /// Evaluate a call to function F, returning true if successful, false if we
- /// can't evaluate it. ActualArgs contains the formal arguments for the
- /// function.
- bool EvaluateFunction(Function *F, Constant *&RetVal,
- const SmallVectorImpl<Constant*> &ActualArgs);
-
- /// Evaluate all instructions in block BB, returning true if successful, false
- /// if we can't evaluate it. NewBB returns the next BB that control flows
- /// into, or null upon return.
- bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB);
-
- Constant *getVal(Value *V) {
- if (Constant *CV = dyn_cast<Constant>(V)) return CV;
- Constant *R = ValueStack.back().lookup(V);
- assert(R && "Reference to an uncomputed value!");
- return R;
- }
-
- void setVal(Value *V, Constant *C) {
- ValueStack.back()[V] = C;
- }
-
- const DenseMap<Constant*, Constant*> &getMutatedMemory() const {
- return MutatedMemory;
- }
-
- const SmallPtrSetImpl<GlobalVariable*> &getInvariants() const {
- return Invariants;
- }
-
-private:
- Constant *ComputeLoadResult(Constant *P);
-
- /// As we compute SSA register values, we store their contents here. The back
- /// of the deque contains the current function and the stack contains the
- /// values in the calling frames.
- std::deque<DenseMap<Value*, Constant*>> ValueStack;
-
- /// This is used to detect recursion. In pathological situations we could hit
- /// exponential behavior, but at least there is nothing unbounded.
- SmallVector<Function*, 4> CallStack;
-
- /// For each store we execute, we update this map. Loads check this to get
- /// the most up-to-date value. If evaluation is successful, this state is
- /// committed to the process.
- DenseMap<Constant*, Constant*> MutatedMemory;
-
- /// To 'execute' an alloca, we create a temporary global variable to represent
- /// its body. This vector is needed so we can delete the temporary globals
- /// when we are done.
- SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps;
-
- /// These global variables have been marked invariant by the static
- /// constructor.
- SmallPtrSet<GlobalVariable*, 8> Invariants;
-
- /// These are constants we have checked and know to be simple enough to live
- /// in a static initializer of a global.
- SmallPtrSet<Constant*, 8> SimpleConstants;
-
- const DataLayout &DL;
- const TargetLibraryInfo *TLI;
-};
-
-} // anonymous namespace
-
-/// Return the value that would be computed by a load from P after the stores
-/// reflected by 'memory' have been performed. If we can't decide, return null.
-Constant *Evaluator::ComputeLoadResult(Constant *P) {
- // If this memory location has been recently stored, use the stored value: it
- // is the most up-to-date.
- DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P);
- if (I != MutatedMemory.end()) return I->second;
-
- // Access it.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (GV->hasDefinitiveInitializer())
- return GV->getInitializer();
- return nullptr;
- }
-
- // Handle a constantexpr getelementptr.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- if (GV->hasDefinitiveInitializer())
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
- }
-
- return nullptr; // don't know how to evaluate.
-}
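The lookup order in ComputeLoadResult is what lets the evaluator observe its own simulated stores: MutatedMemory is consulted before the global's static initializer. A minimal sketch of that round trip, assuming it runs inside the Evaluator (so MutatedMemory and ComputeLoadResult are in scope) and using a hypothetical GlobalVariable *GV with an i32 initializer:

// Hedged sketch; GV is a hypothetical i32 global, not a name from this patch.
Type *Int32Ty = Type::getInt32Ty(GV->getContext());
MutatedMemory[GV] = ConstantInt::get(Int32Ty, 42);  // simulated store of 42
Constant *Loaded = ComputeLoadResult(GV);           // yields the 42, not GV->getInitializer()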
-
-/// Evaluate all instructions in block BB, returning true if successful, false
-/// if we can't evaluate it. NextBB returns the next BB that control flows into,
-/// or null upon return.
-bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
- BasicBlock *&NextBB) {
- // This is the main evaluation loop.
- while (1) {
- Constant *InstResult = nullptr;
-
- DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
-
- if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (!SI->isSimple()) {
- DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
- Constant *Ptr = getVal(SI->getOperand(1));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
- DEBUG(dbgs() << "; To: " << *Ptr << "\n");
- }
- if (!isSimpleEnoughPointerToCommit(Ptr)) {
- // If this is too complex for us to commit, reject it.
- DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
- return false;
- }
-
- Constant *Val = getVal(SI->getOperand(0));
-
- // If this might be too difficult for the backend to handle (e.g. the addr
- // of one global variable divided by another) then we can't commit it.
- if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
- DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
- << "\n");
- return false;
- }
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- if (CE->getOpcode() == Instruction::BitCast) {
- DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
- // If we're evaluating a store through a bitcast, then we need
- // to pull the bitcast off the pointer type and push it onto the
- // stored value.
- Ptr = CE->getOperand(0);
-
- Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
-
- // In order to push the bitcast onto the stored value, a bitcast
- // from NewTy to Val's type must be legal. If it's not, we can try
- // introspecting NewTy to find a legal conversion.
- while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
- // If NewTy is a struct, we can convert the pointer to the struct
- // into a pointer to its first member.
- // FIXME: This could be extended to support arrays as well.
- if (StructType *STy = dyn_cast<StructType>(NewTy)) {
- NewTy = STy->getTypeAtIndex(0U);
-
- IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32);
- Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
- Constant * const IdxList[] = {IdxZero, IdxZero};
-
- Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
-
- // If we can't improve the situation by introspecting NewTy,
- // we have to give up.
- } else {
- DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
- return false;
- }
- }
-
- // If we found compatible types, go ahead and push the bitcast
- // onto the stored value.
- Val = ConstantExpr::getBitCast(Val, NewTy);
-
- DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
- }
- }
-
- MutatedMemory[Ptr] = Val;
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
- InstResult = ConstantExpr::get(BO->getOpcode(),
- getVal(BO->getOperand(0)),
- getVal(BO->getOperand(1)));
- DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
- << "\n");
- } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
- InstResult = ConstantExpr::getCompare(CI->getPredicate(),
- getVal(CI->getOperand(0)),
- getVal(CI->getOperand(1)));
- DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
- << "\n");
- } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
- InstResult = ConstantExpr::getCast(CI->getOpcode(),
- getVal(CI->getOperand(0)),
- CI->getType());
- DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
- << "\n");
- } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
- InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
- getVal(SI->getOperand(1)),
- getVal(SI->getOperand(2)));
- DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
- << "\n");
- } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
- InstResult = ConstantExpr::getExtractValue(
- getVal(EVI->getAggregateOperand()), EVI->getIndices());
- DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
- << "\n");
- } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
- InstResult = ConstantExpr::getInsertValue(
- getVal(IVI->getAggregateOperand()),
- getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
- DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
- << "\n");
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
- Constant *P = getVal(GEP->getOperand(0));
- SmallVector<Constant*, 8> GEPOps;
- for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
- i != e; ++i)
- GEPOps.push_back(getVal(*i));
- InstResult =
- ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
- cast<GEPOperator>(GEP)->isInBounds());
- DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
- << "\n");
- } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
-
- if (!LI->isSimple()) {
- DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
-
- Constant *Ptr = getVal(LI->getOperand(0));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
- DEBUG(dbgs() << "Found a constant pointer expression, constant "
- "folding: " << *Ptr << "\n");
- }
- InstResult = ComputeLoadResult(Ptr);
- if (!InstResult) {
- DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
- "\n");
- return false; // Could not evaluate load.
- }
-
- DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
- } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
- if (AI->isArrayAllocation()) {
- DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
- return false; // Cannot handle array allocs.
- }
- Type *Ty = AI->getType()->getElementType();
- AllocaTmps.push_back(
- make_unique<GlobalVariable>(Ty, false, GlobalValue::InternalLinkage,
- UndefValue::get(Ty), AI->getName()));
- InstResult = AllocaTmps.back().get();
- DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
- } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
- CallSite CS(&*CurInst);
-
- // Debug info can safely be ignored here.
- if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
- DEBUG(dbgs() << "Ignoring debug info.\n");
- ++CurInst;
- continue;
- }
-
- // Cannot handle inline asm.
- if (isa<InlineAsm>(CS.getCalledValue())) {
- DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
- return false;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
- if (MSI->isVolatile()) {
- DEBUG(dbgs() << "Can not optimize a volatile memset " <<
- "intrinsic.\n");
- return false;
- }
- Constant *Ptr = getVal(MSI->getDest());
- Constant *Val = getVal(MSI->getValue());
- Constant *DestVal = ComputeLoadResult(getVal(Ptr));
- if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
- // This memset is a no-op.
- DEBUG(dbgs() << "Ignoring no-op memset.\n");
- ++CurInst;
- continue;
- }
- }
-
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
- ++CurInst;
- continue;
- }
-
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- // We don't insert an entry into Values, as it doesn't have a
- // meaningful return value.
- if (!II->use_empty()) {
- DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n");
- return false;
- }
- ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
- Value *PtrArg = getVal(II->getArgOperand(1));
- Value *Ptr = PtrArg->stripPointerCasts();
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
- Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
- if (!Size->isAllOnesValue() &&
- Size->getValue().getLimitedValue() >=
- DL.getTypeStoreSize(ElemTy)) {
- Invariants.insert(GV);
- DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
- << "\n");
- } else {
- DEBUG(dbgs() << "Found a global var, but can not treat it as an "
- "invariant.\n");
- }
- }
- // Continue even if we do nothing.
- ++CurInst;
- continue;
- } else if (II->getIntrinsicID() == Intrinsic::assume) {
- DEBUG(dbgs() << "Skipping assume intrinsic.\n");
- ++CurInst;
- continue;
- }
-
- DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
- return false;
- }
-
- // Resolve function pointers.
- Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
- if (!Callee || Callee->mayBeOverridden()) {
- DEBUG(dbgs() << "Can not resolve function pointer.\n");
- return false; // Cannot resolve.
- }
-
- SmallVector<Constant*, 8> Formals;
- for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
- Formals.push_back(getVal(*i));
-
- if (Callee->isDeclaration()) {
- // If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
- InstResult = C;
- DEBUG(dbgs() << "Constant folded function call. Result: " <<
- *InstResult << "\n");
- } else {
- DEBUG(dbgs() << "Can not constant fold function call.\n");
- return false;
- }
- } else {
- if (Callee->getFunctionType()->isVarArg()) {
- DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
- return false;
- }
-
- Constant *RetVal = nullptr;
- // Execute the call, if successful, use the return value.
- ValueStack.emplace_back();
- if (!EvaluateFunction(Callee, RetVal, Formals)) {
- DEBUG(dbgs() << "Failed to evaluate function.\n");
- return false;
- }
- ValueStack.pop_back();
- InstResult = RetVal;
-
- if (InstResult) {
- DEBUG(dbgs() << "Successfully evaluated function. Result: " <<
- InstResult << "\n\n");
- } else {
- DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
- }
- }
- } else if (isa<TerminatorInst>(CurInst)) {
- DEBUG(dbgs() << "Found a terminator instruction.\n");
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
- if (BI->isUnconditional()) {
- NextBB = BI->getSuccessor(0);
- } else {
- ConstantInt *Cond =
- dyn_cast<ConstantInt>(getVal(BI->getCondition()));
- if (!Cond) return false; // Cannot determine.
-
- NextBB = BI->getSuccessor(!Cond->getZExtValue());
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
- ConstantInt *Val =
- dyn_cast<ConstantInt>(getVal(SI->getCondition()));
- if (!Val) return false; // Cannot determine.
- NextBB = SI->findCaseValue(Val).getCaseSuccessor();
- } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
- Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
- if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
- NextBB = BA->getBasicBlock();
- else
- return false; // Cannot determine.
- } else if (isa<ReturnInst>(CurInst)) {
- NextBB = nullptr;
- } else {
- // invoke, unwind, resume, unreachable.
- DEBUG(dbgs() << "Can not handle terminator.");
- return false; // Cannot handle this terminator.
- }
-
- // We succeeded at evaluating this block!
- DEBUG(dbgs() << "Successfully evaluated block.\n");
- return true;
- } else {
- // Did not know how to evaluate this!
- DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
- "\n");
- return false;
- }
-
- if (!CurInst->use_empty()) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
- InstResult = ConstantFoldConstantExpression(CE, DL, TLI);
-
- setVal(&*CurInst, InstResult);
- }
-
- // If we just processed an invoke, we finished evaluating the block.
- if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
- NextBB = II->getNormalDest();
- DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
- return true;
- }
-
- // Advance program counter.
- ++CurInst;
- }
-}
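The store-through-bitcast handling above is the trickiest step of EvaluateBlock: instead of rejecting a store whose pointer operand is a constant bitcast, the cast is peeled off the pointer and pushed onto the stored value, descending into a struct's first member when the direct cast would not be lossless. A hedged sketch of the final rewrite, where CE, Val and MutatedMemory stand in for the locals of the loop above:

// For a store of an i8* value through bitcast (i32** @g to i8**), the key stays
// @g and the stored value is re-cast instead, so MutatedMemory maps @g to the
// i32*-typed cast of Val.
Constant *BarePtr = CE->getOperand(0);                                        // @g : i32**
Type *PointeeTy   = cast<PointerType>(BarePtr->getType())->getElementType();  // i32*
MutatedMemory[BarePtr] = ConstantExpr::getBitCast(Val, PointeeTy);            // push the cast onto the value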
-
-/// Evaluate a call to function F, returning true if successful, false if we
-/// can't evaluate it. ActualArgs contains the actual argument values for the
-/// function.
-bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
- const SmallVectorImpl<Constant*> &ActualArgs) {
- // Check to see if this function is already executing (recursion). If so,
- // bail out. TODO: we might want to accept limited recursion.
- if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
- return false;
-
- CallStack.push_back(F);
-
- // Initialize arguments to the incoming values specified.
- unsigned ArgNo = 0;
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
- ++AI, ++ArgNo)
- setVal(&*AI, ActualArgs[ArgNo]);
-
- // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
- // we can only evaluate any one basic block at most once. This set keeps
- // track of what we have executed so we can detect recursive cases etc.
- SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
-
- // CurBB - The current basic block we're evaluating.
- BasicBlock *CurBB = &F->front();
-
- BasicBlock::iterator CurInst = CurBB->begin();
-
- while (1) {
- BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
- DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
-
- if (!EvaluateBlock(CurInst, NextBB))
- return false;
-
- if (!NextBB) {
- // Successfully running until there's no next block means that we found
-      // the return. Fill in the return value and pop the call stack.
- ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
- if (RI->getNumOperands())
- RetVal = getVal(RI->getOperand(0));
- CallStack.pop_back();
- return true;
- }
-
- // Okay, we succeeded in evaluating this control flow. See if we have
- // executed the new block before. If so, we have a looping function,
- // which we cannot evaluate in reasonable time.
- if (!ExecutedBlocks.insert(NextBB).second)
- return false; // looped!
-
- // Okay, we have never been in this block before. Check to see if there
- // are any PHI nodes. If so, evaluate them with information about where
- // we came from.
- PHINode *PN = nullptr;
- for (CurInst = NextBB->begin();
- (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
- setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
-
- // Advance to the next block.
- CurBB = NextBB;
- }
-}
-
/// Evaluate static constructors in the function, if we can. Return true if we
/// can, false otherwise.
static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
+ TargetLibraryInfo *TLI) {
// Call the function.
Evaluator Eval(DL, TLI);
Constant *RetValDummy;
@@ -2838,10 +2174,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
<< F->getName() << "' to " << Eval.getMutatedMemory().size()
<< " stores.\n");
- for (DenseMap<Constant*, Constant*>::const_iterator I =
- Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end();
- I != E; ++I)
- CommitValueTo(I->second, I->first);
+ for (const auto &I : Eval.getMutatedMemory())
+ CommitValueTo(I.second, I.first);
for (GlobalVariable *GV : Eval.getInvariants())
GV->setConstant(true);
}
@@ -2850,8 +2184,9 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
}
static int compareNames(Constant *const *A, Constant *const *B) {
- return (*A)->stripPointerCasts()->getName().compare(
- (*B)->stripPointerCasts()->getName());
+ Value *AStripped = (*A)->stripPointerCastsNoFollowAliases();
+ Value *BStripped = (*B)->stripPointerCastsNoFollowAliases();
+ return AStripped->getName().compare(BStripped->getName());
}
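The switch to stripPointerCastsNoFollowAliases matters when an llvm.used entry is a GlobalAlias: plain stripPointerCasts also looks through aliases, so the entry would be sorted under its aliasee's name rather than its own. A hedged illustration, assuming Entry is a used-list element that is (a cast of) an alias named "al" whose aliasee is the global "g":

StringRef Followed = Entry->stripPointerCasts()->getName();                // "g"  (walks through the alias)
StringRef Kept     = Entry->stripPointerCastsNoFollowAliases()->getName(); // "al" (stops at the alias)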
static void setUsedInitializer(GlobalVariable &V,
@@ -2995,7 +2330,9 @@ static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
return true;
}
-bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
+static bool
+OptimizeGlobalAliases(Module &M,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
bool Changed = false;
LLVMUsed Used(M);
@@ -3010,13 +2347,13 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
J->setLinkage(GlobalValue::InternalLinkage);
- if (deleteIfDead(*J)) {
+ if (deleteIfDead(*J, NotDiscardableComdats)) {
Changed = true;
continue;
}
// If the aliasee may change at link time, nothing can be done - bail out.
- if (J->mayBeOverridden())
+ if (J->isInterposable())
continue;
Constant *Aliasee = J->getAliasee();
@@ -3064,23 +2401,16 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
}
static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::cxa_atexit))
+ LibFunc::Func F = LibFunc::cxa_atexit;
+ if (!TLI->has(F))
return nullptr;
- Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit));
-
+ Function *Fn = M.getFunction(TLI->getName(F));
if (!Fn)
return nullptr;
- FunctionType *FTy = Fn->getFunctionType();
-
- // Checking that the function has the right return type, the right number of
- // parameters and that they all have pointer types should be enough.
- if (!FTy->getReturnType()->isIntegerTy() ||
- FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
+ // Make sure that the function has the correct prototype.
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc::cxa_atexit)
return nullptr;
return Fn;
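The rewritten check relies on the TargetLibraryInfo overload that takes the Function itself and validates its prototype, replacing the hand-rolled parameter-type checks deleted above. For contrast, with Fn and TLI as in the surrounding code:

LibFunc::Func LF;
bool NameOnly    = TLI->getLibFunc(Fn->getName(), LF); // matches by name alone
bool TypeChecked = TLI->getLibFunc(*Fn, LF);           // additionally rejects a mismatched prototype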
@@ -3132,7 +2462,7 @@ static bool cxxDtorIsEmpty(const Function &Fn,
return false;
}
-bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
/// Itanium C++ ABI p3.3.5:
///
/// After constructing a global (or local static) object, that will require
@@ -3179,12 +2509,11 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
return Changed;
}
-bool GlobalOpt::runOnModule(Module &M) {
+static bool optimizeGlobalsInModule(
+ Module &M, const DataLayout &DL, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ SmallSet<const Comdat *, 8> NotDiscardableComdats;
bool Changed = false;
-
- auto &DL = M.getDataLayout();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-
bool LocalChange = true;
while (LocalChange) {
LocalChange = false;
@@ -3204,7 +2533,8 @@ bool GlobalOpt::runOnModule(Module &M) {
NotDiscardableComdats.insert(C);
// Delete functions that are trivially dead, ccc -> fastcc
- LocalChange |= OptimizeFunctions(M);
+ LocalChange |=
+ OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats);
// Optimize global_ctors list.
LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
@@ -3212,10 +2542,11 @@ bool GlobalOpt::runOnModule(Module &M) {
});
// Optimize non-address-taken globals.
- LocalChange |= OptimizeGlobalVars(M);
+ LocalChange |= OptimizeGlobalVars(M, TLI, LookupDomTree,
+ NotDiscardableComdats);
// Resolve aliases, when possible.
- LocalChange |= OptimizeGlobalAliases(M);
+ LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
// Try to remove trivial global destructors if they are not removed
// already.
@@ -3232,3 +2563,53 @@ bool GlobalOpt::runOnModule(Module &M) {
return Changed;
}
+PreservedAnalyses GlobalOptPass::run(Module &M, AnalysisManager<Module> &AM) {
+ auto &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
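For context, a minimal sketch of driving the new pass-manager entry point added here; the explicit analysis registration is an assumption about the caller (opt normally sets this up through PassBuilder) and is not part of the patch:

FunctionAnalysisManager FAM;
ModuleAnalysisManager MAM;
FAM.registerPass([&] { return DominatorTreeAnalysis(); });
FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
MAM.registerPass([&] { return TargetLibraryAnalysis(); });
MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });

ModulePassManager MPM;
MPM.addPass(GlobalOptPass());
MPM.run(M, MAM);   // M is the Module being optimized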
+
+namespace {
+struct GlobalOptLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ GlobalOptLegacyPass() : ModulePass(ID) {
+ initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto &DL = M.getDataLayout();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+}
+
+char GlobalOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() {
+ return new GlobalOptLegacyPass();
+}
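The legacy path keeps working through the factory kept above; a minimal sketch with the legacy pass manager (Module M assumed to exist already):

legacy::PassManager PM;
PM.add(createGlobalOptimizerPass());  // schedules GlobalOptLegacyPass
PM.run(M);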
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index af541d155254..916135e33cd5 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -41,44 +41,14 @@ namespace {
}
bool runOnModule(Module &M) override;
- private:
- bool PropagateConstantsIntoArguments(Function &F);
- bool PropagateConstantReturn(Function &F);
};
}
-char IPCP::ID = 0;
-INITIALIZE_PASS(IPCP, "ipconstprop",
- "Interprocedural constant propagation", false, false)
-
-ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
-
-bool IPCP::runOnModule(Module &M) {
- bool Changed = false;
- bool LocalChange = true;
-
- // FIXME: instead of using smart algorithms, we just iterate until we stop
- // making changes.
- while (LocalChange) {
- LocalChange = false;
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
- if (!I->isDeclaration()) {
- // Delete any klingons.
- I->removeDeadConstantUsers();
- if (I->hasLocalLinkage())
- LocalChange |= PropagateConstantsIntoArguments(*I);
- Changed |= PropagateConstantReturn(*I);
- }
- Changed |= LocalChange;
- }
- return Changed;
-}
-
/// PropagateConstantsIntoArguments - Look at all uses of the specified
/// function. If all uses are direct call sites, and all pass a particular
/// constant in for an argument, propagate that constant in as the argument.
///
-bool IPCP::PropagateConstantsIntoArguments(Function &F) {
+static bool PropagateConstantsIntoArguments(Function &F) {
if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit.
// For each argument, keep track of its constant value and whether it is a
@@ -157,13 +127,14 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) {
// Additionally if a function always returns one of its arguments directly,
// callers will be updated to use the value they pass in directly instead of
// using the return value.
-bool IPCP::PropagateConstantReturn(Function &F) {
+static bool PropagateConstantReturn(Function &F) {
if (F.getReturnType()->isVoidTy())
return false; // No return value.
- // If this function could be overridden later in the link stage, we can't
- // propagate information about its results into callers.
- if (F.mayBeOverridden())
+ // We can infer and propagate the return value only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F.isDefinitionExact())
return false;
// Check to see if this function returns a constant.
@@ -176,8 +147,8 @@ bool IPCP::PropagateConstantReturn(Function &F) {
RetVals.push_back(UndefValue::get(F.getReturnType()));
unsigned NumNonConstant = 0;
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ for (BasicBlock &BB : F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
// Already found conflicting return values?
Value *RV = RetVals[i];
@@ -277,3 +248,33 @@ bool IPCP::PropagateConstantReturn(Function &F) {
if (MadeChange) ++NumReturnValProped;
return MadeChange;
}
+
+char IPCP::ID = 0;
+INITIALIZE_PASS(IPCP, "ipconstprop",
+ "Interprocedural constant propagation", false, false)
+
+ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
+
+bool IPCP::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+ bool LocalChange = true;
+
+ // FIXME: instead of using smart algorithms, we just iterate until we stop
+ // making changes.
+ while (LocalChange) {
+ LocalChange = false;
+ for (Function &F : M)
+ if (!F.isDeclaration()) {
+ // Delete any klingons.
+ F.removeDeadConstantUsers();
+ if (F.hasLocalLinkage())
+ LocalChange |= PropagateConstantsIntoArguments(F);
+ Changed |= PropagateConstantReturn(F);
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index 89629cf06e08..3507eba81b2f 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -18,31 +18,32 @@
#include "llvm/InitializePasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
using namespace llvm;
void llvm::initializeIPO(PassRegistry &Registry) {
initializeArgPromotionPass(Registry);
- initializeConstantMergePass(Registry);
+ initializeConstantMergeLegacyPassPass(Registry);
initializeCrossDSOCFIPass(Registry);
initializeDAEPass(Registry);
initializeDAHPass(Registry);
initializeForceFunctionAttrsLegacyPassPass(Registry);
- initializeGlobalDCEPass(Registry);
- initializeGlobalOptPass(Registry);
+ initializeGlobalDCELegacyPassPass(Registry);
+ initializeGlobalOptLegacyPassPass(Registry);
initializeIPCPPass(Registry);
initializeAlwaysInlinerPass(Registry);
initializeSimpleInlinerPass(Registry);
initializeInferFunctionAttrsLegacyPassPass(Registry);
- initializeInternalizePassPass(Registry);
+ initializeInternalizeLegacyPassPass(Registry);
initializeLoopExtractorPass(Registry);
initializeBlockExtractorPassPass(Registry);
initializeSingleLoopExtractorPass(Registry);
- initializeLowerBitSetsPass(Registry);
+ initializeLowerTypeTestsPass(Registry);
initializeMergeFunctionsPass(Registry);
- initializePartialInlinerPass(Registry);
- initializePostOrderFunctionAttrsPass(Registry);
- initializeReversePostOrderFunctionAttrsPass(Registry);
+ initializePartialInlinerLegacyPassPass(Registry);
+ initializePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
initializePruneEHPass(Registry);
initializeStripDeadPrototypesLegacyPassPass(Registry);
initializeStripSymbolsPass(Registry);
@@ -50,9 +51,10 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeStripDeadDebugInfoPass(Registry);
initializeStripNonDebugSymbolsPass(Registry);
initializeBarrierNoopPass(Registry);
- initializeEliminateAvailableExternallyPass(Registry);
- initializeSampleProfileLoaderPass(Registry);
+ initializeEliminateAvailableExternallyLegacyPassPass(Registry);
+ initializeSampleProfileLoaderLegacyPassPass(Registry);
initializeFunctionImportPassPass(Registry);
+ initializeWholeProgramDevirtPass(Registry);
}
void LLVMInitializeIPO(LLVMPassRegistryRef R) {
@@ -72,7 +74,7 @@ void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
}
void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPostOrderFunctionAttrsPass());
+ unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
}
void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
@@ -104,10 +106,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
}
void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
- std::vector<const char *> Export;
- if (AllButMain)
- Export.push_back("main");
- unwrap(PM)->add(createInternalizePass(Export));
+ auto PreserveMain = [=](const GlobalValue &GV) {
+ return AllButMain && GV.getName() == "main";
+ };
+ unwrap(PM)->add(createInternalizePass(PreserveMain));
}
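The C API shim above now builds on the callback form of createInternalizePass, which takes an arbitrary must-preserve predicate rather than a fixed list of names. A hedged sketch of a custom predicate (the "exported_" prefix and the legacy pass manager setup are illustrative assumptions):

legacy::PassManager PM;
PM.add(createInternalizePass([](const GlobalValue &GV) {
  // Keep main and anything following a project-specific export convention.
  return GV.getName() == "main" || GV.getName().startswith("exported_");
}));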
void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 4295a7595c29..ab2d2bd8b02a 100644
--- a/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -8,7 +8,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/Function.h"
@@ -16,937 +15,27 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;
#define DEBUG_TYPE "inferattrs"
-STATISTIC(NumReadNone, "Number of functions inferred as readnone");
-STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
-STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
-STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
-STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
-STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
-STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
-STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
-
-static bool setDoesNotAccessMemory(Function &F) {
- if (F.doesNotAccessMemory())
- return false;
- F.setDoesNotAccessMemory();
- ++NumReadNone;
- return true;
-}
-
-static bool setOnlyReadsMemory(Function &F) {
- if (F.onlyReadsMemory())
- return false;
- F.setOnlyReadsMemory();
- ++NumReadOnly;
- return true;
-}
-
-static bool setOnlyAccessesArgMemory(Function &F) {
- if (F.onlyAccessesArgMemory())
- return false;
-  F.setOnlyAccessesArgMemory();
- ++NumArgMemOnly;
- return true;
-}
-
-
-static bool setDoesNotThrow(Function &F) {
- if (F.doesNotThrow())
- return false;
- F.setDoesNotThrow();
- ++NumNoUnwind;
- return true;
-}
-
-static bool setDoesNotCapture(Function &F, unsigned n) {
- if (F.doesNotCapture(n))
- return false;
- F.setDoesNotCapture(n);
- ++NumNoCapture;
- return true;
-}
-
-static bool setOnlyReadsMemory(Function &F, unsigned n) {
- if (F.onlyReadsMemory(n))
- return false;
- F.setOnlyReadsMemory(n);
- ++NumReadOnlyArg;
- return true;
-}
-
-static bool setDoesNotAlias(Function &F, unsigned n) {
- if (F.doesNotAlias(n))
- return false;
- F.setDoesNotAlias(n);
- ++NumNoAlias;
- return true;
-}
-
-static bool setNonNull(Function &F, unsigned n) {
- assert((n != AttributeSet::ReturnIndex ||
- F.getReturnType()->isPointerTy()) &&
- "nonnull applies only to pointers");
- if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
- return false;
- F.addAttribute(n, Attribute::NonNull);
- ++NumNonNull;
- return true;
-}
-
-/// Analyze the name and prototype of the given function and set any applicable
-/// attributes.
-///
-/// Returns true if any attributes were set and false otherwise.
-static bool inferPrototypeAttributes(Function &F,
- const TargetLibraryInfo &TLI) {
- if (F.hasFnAttribute(Attribute::OptimizeNone))
- return false;
-
- FunctionType *FTy = F.getFunctionType();
- LibFunc::Func TheLibFunc;
- if (!(TLI.getLibFunc(F.getName(), TheLibFunc) && TLI.has(TheLibFunc)))
- return false;
-
- bool Changed = false;
- switch (TheLibFunc) {
- case LibFunc::strlen:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::strchr:
- case LibFunc::strrchr:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isIntegerTy())
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
- case LibFunc::strcat:
- case LibFunc::strncat:
- case LibFunc::strncpy:
- case LibFunc::stpncpy:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::strxfrm:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::strcmp: // 0,1
- case LibFunc::strspn: // 0,1
- case LibFunc::strncmp: // 0,1
- case LibFunc::strcspn: // 0,1
- case LibFunc::strcoll: // 0,1
- case LibFunc::strcasecmp: // 0,1
- case LibFunc::strncasecmp: //
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::strstr:
- case LibFunc::strpbrk:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::strtok:
- case LibFunc::strtok_r:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::scanf:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::setbuf:
- case LibFunc::setvbuf:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::strdup:
- case LibFunc::strndup:
- if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::stat:
- case LibFunc::statvfs:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::sscanf:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::sprintf:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::snprintf:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 3);
- return Changed;
- case LibFunc::setitimer:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::system:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- // May throw; "system" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::malloc:
- if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::memcmp:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::memchr:
- case LibFunc::memrchr:
- if (FTy->getNumParams() != 3)
- return false;
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc::modf:
- case LibFunc::modff:
- case LibFunc::modfl:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::memcpy:
- case LibFunc::memccpy:
- case LibFunc::memmove:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::memalign:
- if (!FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::mkdir:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::mktime:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::realloc:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::read:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
- return false;
- // May throw; "read" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::rewind:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::rmdir:
- case LibFunc::remove:
- case LibFunc::realpath:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::rename:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::readlink:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::write:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
- return false;
- // May throw; "write" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::bcopy:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::bcmp:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::bzero:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::calloc:
- if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::chmod:
- case LibFunc::chown:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::ctermid:
- case LibFunc::clearerr:
- case LibFunc::closedir:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::atoi:
- case LibFunc::atol:
- case LibFunc::atof:
- case LibFunc::atoll:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::access:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::fopen:
- if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::fdopen:
- if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::feof:
- case LibFunc::free:
- case LibFunc::fseek:
- case LibFunc::ftell:
- case LibFunc::fgetc:
- case LibFunc::fseeko:
- case LibFunc::ftello:
- case LibFunc::fileno:
- case LibFunc::fflush:
- case LibFunc::fclose:
- case LibFunc::fsetpos:
- case LibFunc::flockfile:
- case LibFunc::funlockfile:
- case LibFunc::ftrylockfile:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::ferror:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F);
- return Changed;
- case LibFunc::fputc:
- case LibFunc::fstat:
- case LibFunc::frexp:
- case LibFunc::frexpf:
- case LibFunc::frexpl:
- case LibFunc::fstatvfs:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::fgets:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 3);
- return Changed;
- case LibFunc::fread:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(3)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 4);
- return Changed;
- case LibFunc::fwrite:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(3)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 4);
- return Changed;
- case LibFunc::fputs:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::fscanf:
- case LibFunc::fprintf:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::fgetpos:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::getc:
- case LibFunc::getlogin_r:
- case LibFunc::getc_unlocked:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::getenv:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::gets:
- case LibFunc::getchar:
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc::getitimer:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::getpwnam:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::ungetc:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::uname:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::unlink:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::unsetenv:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::utime:
- case LibFunc::utimes:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::putc:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::puts:
- case LibFunc::printf:
- case LibFunc::perror:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::pread:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
- return false;
- // May throw; "pread" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::pwrite:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
- return false;
- // May throw; "pwrite" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::putchar:
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc::popen:
- if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::pclose:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::vscanf:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::vsscanf:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::vfscanf:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::valloc:
- if (!FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::vprintf:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::vfprintf:
- case LibFunc::vsprintf:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::vsnprintf:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 3);
- return Changed;
- case LibFunc::open:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
- return false;
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::opendir:
- if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::tmpfile:
- if (!FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::times:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::htonl:
- case LibFunc::htons:
- case LibFunc::ntohl:
- case LibFunc::ntohs:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAccessMemory(F);
- return Changed;
- case LibFunc::lstat:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::lchown:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::qsort:
- if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
- return false;
- // May throw; places call through function pointer.
- Changed |= setDoesNotCapture(F, 4);
- return Changed;
- case LibFunc::dunder_strdup:
- case LibFunc::dunder_strndup:
- if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::dunder_strtok_r:
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::under_IO_getc:
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::under_IO_putc:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::dunder_isoc99_scanf:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::stat64:
- case LibFunc::lstat64:
- case LibFunc::statvfs64:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::dunder_isoc99_sscanf:
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::fopen64:
- if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc::fseeko64:
- case LibFunc::ftello64:
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc::tmpfile64:
- if (!FTy->getReturnType()->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- return Changed;
- case LibFunc::fstat64:
- case LibFunc::fstatvfs64:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return false;
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::open64:
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
- return false;
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc::gettimeofday:
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return false;
- // Currently some platforms have the restrict keyword on the arguments to
- // gettimeofday. To be conservative, do not add noalias to gettimeofday's
- // arguments.
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
-
- case LibFunc::Znwj: // new(unsigned int)
- case LibFunc::Znwm: // new(unsigned long)
- case LibFunc::Znaj: // new[](unsigned int)
- case LibFunc::Znam: // new[](unsigned long)
- case LibFunc::msvc_new_int: // new(unsigned int)
- case LibFunc::msvc_new_longlong: // new(unsigned long long)
- case LibFunc::msvc_new_array_int: // new[](unsigned int)
- case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
- if (FTy->getNumParams() != 1)
- return false;
- // Operator new always returns a nonnull noalias pointer
- Changed |= setNonNull(F, AttributeSet::ReturnIndex);
- Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
- return Changed;
-
- //TODO: add LibFunc entries for:
- //case LibFunc::memset_pattern4:
- //case LibFunc::memset_pattern8:
- case LibFunc::memset_pattern16:
- if (FTy->isVarArg() || FTy->getNumParams() != 3 ||
- !isa<PointerType>(FTy->getParamType(0)) ||
- !isa<PointerType>(FTy->getParamType(1)) ||
- !isa<IntegerType>(FTy->getParamType(2)))
- return false;
-
- Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
-
- default:
- // FIXME: It'd be really nice to cover all the library functions we're
- // aware of here.
- return false;
- }
-}
-
static bool inferAllPrototypeAttributes(Module &M,
const TargetLibraryInfo &TLI) {
bool Changed = false;
for (Function &F : M.functions())
- // We only infer things using the prototype if the definition isn't around
- // to analyze directly.
- if (F.isDeclaration())
- Changed |= inferPrototypeAttributes(F, TLI);
+ // We only infer things using the prototype and the name; we don't need
+ // definitions.
+ if (F.isDeclaration() && !F.hasFnAttribute(Attribute::OptimizeNone))
+ Changed |= inferLibFuncAttributes(F, TLI);
return Changed;
}
PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
- AnalysisManager<Module> *AM) {
- auto &TLI = AM->getResult<TargetLibraryAnalysis>(M);
+ AnalysisManager<Module> &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
if (!inferAllPrototypeAttributes(M, TLI))
// If we didn't infer anything, preserve all analyses.
@@ -970,6 +59,9 @@ struct InferFunctionAttrsLegacyPass : public ModulePass {
}
bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
return inferAllPrototypeAttributes(M, TLI);
}
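
The hunk above also switches the new-pass-manager entry point to take AnalysisManager<Module> by reference. As a rough illustration of how that entry point is exercised (a minimal sketch, not part of this patch; the lambda-builder form of registerPass is an assumption about the pass-manager API of this vintage):

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
    using namespace llvm;

    static void runInferFunctionAttrs(Module &M) {
      ModuleAnalysisManager MAM;
      // The pass asks the analysis manager for TargetLibraryAnalysis, so it
      // must be registered before the pass runs.
      MAM.registerPass([] { return TargetLibraryAnalysis(); });
      InferFunctionAttrsPass P;
      PreservedAnalyses PA = P.run(M, MAM); // AnalysisManager now by reference
      (void)PA;
    }
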
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 1704bfea0b86..cb1ab95ec2af 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -37,16 +38,17 @@ namespace {
class AlwaysInliner : public Inliner {
public:
- // Use extremely low threshold.
- AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true) {
+ AlwaysInliner() : Inliner(ID, /*InsertLifetime*/ true) {
initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
}
- AlwaysInliner(bool InsertLifetime)
- : Inliner(ID, -2000000000, InsertLifetime) {
+ AlwaysInliner(bool InsertLifetime) : Inliner(ID, InsertLifetime) {
initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
}
+ /// Main run interface method. We override here to avoid calling skipSCC().
+ bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
static char ID; // Pass identification, replacement for typeid
InlineCost getInlineCost(CallSite CS) override;
@@ -64,6 +66,7 @@ INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 45609f891ed8..2aa650bd219d 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
@@ -38,14 +39,20 @@ namespace {
/// inliner pass and the always inliner pass. The two passes use different cost
/// analyses to determine when to inline.
class SimpleInliner : public Inliner {
+ // This field is populated based on one of the following:
+ // * optimization or size-optimization levels,
+ // * the --inline-threshold flag, or
+ // * a user specified value.
+ int DefaultThreshold;
public:
- SimpleInliner() : Inliner(ID) {
+ SimpleInliner()
+ : Inliner(ID), DefaultThreshold(llvm::getDefaultInlineThreshold()) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
- SimpleInliner(int Threshold)
- : Inliner(ID, Threshold, /*InsertLifetime*/ true) {
+ explicit SimpleInliner(int Threshold)
+ : Inliner(ID), DefaultThreshold(Threshold) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
@@ -54,7 +61,7 @@ public:
InlineCost getInlineCost(CallSite CS) override {
Function *Callee = CS.getCalledFunction();
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
- return llvm::getInlineCost(CS, getInlineThreshold(CS), TTI, ACT);
+ return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT, PSI);
}
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -64,17 +71,6 @@ private:
TargetTransformInfoWrapperPass *TTIWP;
};
-static int computeThresholdFromOptLevels(unsigned OptLevel,
-                                         unsigned SizeOptLevel) {
-  if (OptLevel > 2)      // -O3 and above
-    return 275;
-  if (SizeOptLevel == 1) // -Os
-    return 75;
-  if (SizeOptLevel == 2) // -Oz
-    return 25;
-  return 225;            // default for -O1/-O2
-}
-
} // end anonymous namespace
char SimpleInliner::ID = 0;
@@ -82,6 +78,7 @@ INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
"Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(SimpleInliner, "inline",
@@ -96,7 +93,7 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
unsigned SizeOptLevel) {
return new SimpleInliner(
- computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+ llvm::computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
}
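
Usage sketch for the two factory overloads above (illustration only). The numeric defaults are the ones encoded by the helper removed from this file, now reached through llvm::computeThresholdFromOptLevels: -O3 gives 275, -Os gives 75, -Oz gives 25, anything else 225:

    #include "llvm/Transforms/IPO.h"
    using namespace llvm;

    // Threshold derived from the optimization / size-optimization levels.
    Pass *OptLevelInliner = createFunctionInliningPass(/*OptLevel=*/3,
                                                       /*SizeOptLevel=*/0);
    // Or an explicit threshold, bypassing the derivation entirely.
    Pass *FixedInliner = createFunctionInliningPass(/*Threshold=*/100);
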
bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index bbe5f8761d5f..79535ca49780 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -21,6 +20,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
@@ -28,9 +28,9 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -47,40 +47,19 @@ STATISTIC(NumMergedAllocas, "Number of allocas merged together");
// if those would be more profitable and blocked inline steps.
STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
-static cl::opt<int>
-InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
- cl::desc("Control the amount of inlining to perform (default = 225)"));
-
-static cl::opt<int>
-HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325),
- cl::desc("Threshold for inlining functions with inline hint"));
-
-// We instroduce this threshold to help performance of instrumentation based
-// PGO before we actually hook up inliner with analysis passes such as BPI and
-// BFI.
-static cl::opt<int>
-ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225),
- cl::desc("Threshold for inlining functions with cold attribute"));
-
-// Threshold to use when optsize is specified (and there is no -inline-limit).
-const int OptSizeThreshold = 75;
+Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
-Inliner::Inliner(char &ID)
- : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {
-}
-
-Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
- : CallGraphSCCPass(ID),
- InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? InlineLimit
- : Threshold),
- InsertLifetime(InsertLifetime) {}
+Inliner::Inliner(char &ID, bool InsertLifetime)
+ : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
/// For this class, we declare that we require and preserve the call graph.
/// If the derived class implements this method, it should
/// always explicitly call the implementation here.
void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
CallGraphSCCPass::getAnalysisUsage(AU);
}
@@ -243,67 +222,6 @@ static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,
return true;
}
-unsigned Inliner::getInlineThreshold(CallSite CS) const {
- int Threshold = InlineThreshold; // -inline-threshold or else selected by
- // overall opt level
-
- // If -inline-threshold is not given, listen to the optsize attribute when it
- // would decrease the threshold.
- Function *Caller = CS.getCaller();
- bool OptSize = Caller && !Caller->isDeclaration() &&
- // FIXME: Use Function::optForSize().
- Caller->hasFnAttribute(Attribute::OptimizeForSize);
- if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
- OptSizeThreshold < Threshold)
- Threshold = OptSizeThreshold;
-
- Function *Callee = CS.getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- return Threshold;
-
- // If profile information is available, use that to adjust threshold of hot
- // and cold functions.
- // FIXME: The heuristic used below for determining hotness and coldness are
- // based on preliminary SPEC tuning and may not be optimal. Replace this with
- // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
- uint64_t FunctionCount = 0, MaxFunctionCount = 0;
- bool HasPGOCounts = false;
- if (Callee->getEntryCount() &&
- Callee->getParent()->getMaximumFunctionCount()) {
- HasPGOCounts = true;
- FunctionCount = Callee->getEntryCount().getValue();
- MaxFunctionCount =
- Callee->getParent()->getMaximumFunctionCount().getValue();
- }
-
- // Listen to the inlinehint attribute or profile based hotness information
- // when it would increase the threshold and the caller does not need to
- // minimize its size.
- bool InlineHint =
- Callee->hasFnAttribute(Attribute::InlineHint) ||
- (HasPGOCounts &&
- FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
- if (InlineHint && HintThreshold > Threshold &&
- !Caller->hasFnAttribute(Attribute::MinSize))
- Threshold = HintThreshold;
-
- // Listen to the cold attribute or profile based coldness information
- // when it would decrease the threshold.
- bool ColdCallee =
- Callee->hasFnAttribute(Attribute::Cold) ||
- (HasPGOCounts &&
- FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
- // Command line argument for InlineLimit will override the default
- // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
- // do not use the default cold threshold even if it is smaller.
- if ((InlineLimit.getNumOccurrences() == 0 ||
- ColdThreshold.getNumOccurrences() > 0) && ColdCallee &&
- ColdThreshold < Threshold)
- Threshold = ColdThreshold;
-
- return Threshold;
-}
-
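
For reference, the hot/cold classification being deleted here can be restated as the small sketch below; the 0.3 and 0.01 factors and the 325/225 defaults come from the removed code, and the replacement routes this kind of decision through ProfileSummaryInfo instead:

    #include <cstdint>

    // Sketch restating the removed heuristic, not part of the new logic.
    static bool wasConsideredHot(uint64_t EntryCount, uint64_t MaxFunctionCount) {
      return EntryCount >= (uint64_t)(0.3 * (double)MaxFunctionCount);
    }
    static bool wasConsideredCold(uint64_t EntryCount, uint64_t MaxFunctionCount) {
      return EntryCount <= (uint64_t)(0.01 * (double)MaxFunctionCount);
    }
    // E.g. with MaxFunctionCount = 10000: a callee with entry count 3500 got
    // the inlinehint threshold (325 by default); one with entry count 50 got
    // the cold threshold (225 by default, i.e. no change unless
    // -inlinecold-threshold was lowered).
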
static void emitAnalysis(CallSite CS, const Twine &Msg) {
Function *Caller = CS.getCaller();
LLVMContext &Ctx = Caller->getContext();
@@ -311,6 +229,76 @@ static void emitAnalysis(CallSite CS, const Twine &Msg) {
emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg);
}
+bool Inliner::shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
+ int &TotalSecondaryCost) {
+
+ // For now we only handle local or inline functions.
+ if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
+ return false;
+ // Try to detect the case where the current inlining candidate caller (call
+ // it B) is a static or linkonce-ODR function and is an inlining candidate
+ // elsewhere, and the current candidate callee (call it C) is large enough
+ // that inlining it into B would make B too big to inline later. In these
+ // circumstances it may be best not to inline C into B, but to inline B into
+ // its callers.
+ //
+ // This only applies to static and linkonce-ODR functions because those are
+ // expected to be available for inlining in the translation units where they
+ // are used. Thus we will always have the opportunity to make local inlining
+ // decisions. Importantly the linkonce-ODR linkage covers inline functions
+ // and templates in C++.
+ //
+ // FIXME: All of this logic should be sunk into getInlineCost. It relies on
+ // the internal implementation of the inline cost metrics rather than
+ // treating them as truly abstract units etc.
+ TotalSecondaryCost = 0;
+ // The candidate cost to be imposed upon the current function.
+ int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
+ // This bool tracks what happens if we do NOT inline C into B.
+ bool callerWillBeRemoved = Caller->hasLocalLinkage();
+ // This bool tracks what happens if we DO inline C into B.
+ bool inliningPreventsSomeOuterInline = false;
+ for (User *U : Caller->users()) {
+ CallSite CS2(U);
+
+ // If this isn't a call to Caller (it could be some other sort
+ // of reference) skip it. Such references will prevent the caller
+ // from being removed.
+ if (!CS2 || CS2.getCalledFunction() != Caller) {
+ callerWillBeRemoved = false;
+ continue;
+ }
+
+ InlineCost IC2 = getInlineCost(CS2);
+ ++NumCallerCallersAnalyzed;
+ if (!IC2) {
+ callerWillBeRemoved = false;
+ continue;
+ }
+ if (IC2.isAlways())
+ continue;
+
+ // See if inlining or original callsite would erase the cost delta of
+ // this callsite. We subtract off the penalty for the call instruction,
+ // which we would be deleting.
+ if (IC2.getCostDelta() <= CandidateCost) {
+ inliningPreventsSomeOuterInline = true;
+ TotalSecondaryCost += IC2.getCost();
+ }
+ }
+ // If all outer calls to Caller would get inlined, the cost for the last
+ // one is set very low by getInlineCost, in anticipation that Caller will
+ // be removed entirely. We did not account for this above unless there
+ // is only one caller of Caller.
+ if (callerWillBeRemoved && !Caller->use_empty())
+ TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
+
+ if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
+ return true;
+
+ return false;
+}
+
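
A toy numeric walk-through of the deferral rule implemented above; all numbers are invented, and CallPenalty = 25 is an assumption about InlineConstants rather than something stated in this hunk:

    #include <cassert>

    int main() {
      const int CallPenalty = 25;                    // assumed value
      int CCost = 120;                               // cost of inlining C into B
      int CandidateCost = CCost - (CallPenalty + 1); // 94: what C would add to B
      // One call site of B in some outer caller A:
      int OuterCostDelta = 80, OuterCost = 150;
      bool BlocksOuterInline = OuterCostDelta <= CandidateCost;    // 80 <= 94
      int TotalSecondaryCost = BlocksOuterInline ? OuterCost : 0;  // 150
      // Defer only if the blocked outer inlining is cheaper than inlining C now.
      bool Defer = BlocksOuterInline && TotalSecondaryCost < CCost; // 150 < 120
      assert(!Defer && "here, inlining C into B still wins");
      return 0;
    }
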
/// Return true if the inliner should attempt to inline at the given CallSite.
bool Inliner::shouldInline(CallSite CS) {
InlineCost IC = getInlineCost(CS);
@@ -342,77 +330,17 @@ bool Inliner::shouldInline(CallSite CS) {
Twine(IC.getCostDelta() + IC.getCost()) + ")");
return false;
}
-
- // Try to detect the case where the current inlining candidate caller (call
- // it B) is a static or linkonce-ODR function and is an inlining candidate
- // elsewhere, and the current candidate callee (call it C) is large enough
- // that inlining it into B would make B too big to inline later. In these
- // circumstances it may be best not to inline C into B, but to inline B into
- // its callers.
- //
- // This only applies to static and linkonce-ODR functions because those are
- // expected to be available for inlining in the translation units where they
- // are used. Thus we will always have the opportunity to make local inlining
- // decisions. Importantly the linkonce-ODR linkage covers inline functions
- // and templates in C++.
- //
- // FIXME: All of this logic should be sunk into getInlineCost. It relies on
- // the internal implementation of the inline cost metrics rather than
- // treating them as truly abstract units etc.
- if (Caller->hasLocalLinkage() || Caller->hasLinkOnceODRLinkage()) {
- int TotalSecondaryCost = 0;
- // The candidate cost to be imposed upon the current function.
- int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
- // This bool tracks what happens if we do NOT inline C into B.
- bool callerWillBeRemoved = Caller->hasLocalLinkage();
- // This bool tracks what happens if we DO inline C into B.
- bool inliningPreventsSomeOuterInline = false;
- for (User *U : Caller->users()) {
- CallSite CS2(U);
-
- // If this isn't a call to Caller (it could be some other sort
- // of reference) skip it. Such references will prevent the caller
- // from being removed.
- if (!CS2 || CS2.getCalledFunction() != Caller) {
- callerWillBeRemoved = false;
- continue;
- }
- InlineCost IC2 = getInlineCost(CS2);
- ++NumCallerCallersAnalyzed;
- if (!IC2) {
- callerWillBeRemoved = false;
- continue;
- }
- if (IC2.isAlways())
- continue;
-
- // See if inlining or original callsite would erase the cost delta of
- // this callsite. We subtract off the penalty for the call instruction,
- // which we would be deleting.
- if (IC2.getCostDelta() <= CandidateCost) {
- inliningPreventsSomeOuterInline = true;
- TotalSecondaryCost += IC2.getCost();
- }
- }
- // If all outer calls to Caller would get inlined, the cost for the last
- // one is set very low by getInlineCost, in anticipation that Caller will
- // be removed entirely. We did not account for this above unless there
- // is only one caller of Caller.
- if (callerWillBeRemoved && !Caller->use_empty())
- TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
-
- if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
- DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() <<
- " Cost = " << IC.getCost() <<
- ", outer Cost = " << TotalSecondaryCost << '\n');
- emitAnalysis(
- CS, Twine("Not inlining. Cost of inlining " +
- CS.getCalledFunction()->getName() +
- " increases the cost of inlining " +
- CS.getCaller()->getName() + " in other contexts"));
- return false;
- }
+ int TotalSecondaryCost = 0;
+ if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost)) {
+ DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
+ emitAnalysis(CS, Twine("Not inlining. Cost of inlining " +
+ CS.getCalledFunction()->getName() +
+ " increases the cost of inlining " +
+ CS.getCaller()->getName() + " in other contexts"));
+ return false;
}
DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
@@ -440,8 +368,15 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
}
bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return inlineCalls(SCC);
+}
+
+bool Inliner::inlineCalls(CallGraphSCC &SCC) {
CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
ACT = &getAnalysis<AssumptionCacheTracker>();
+ PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(CG.getModule());
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SmallPtrSet<Function*, 8> SCCFunctions;
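
A sketch of the analysis wiring this file now relies on, shown as a hypothetical stand-alone legacy CGSCC pass (MyInlinerLikePass is not real; the required analyses and the getPSI call mirror the lines added above):

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/CallGraph.h"
    #include "llvm/Analysis/CallGraphSCCPass.h"
    #include "llvm/Analysis/ProfileSummaryInfo.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    using namespace llvm;

    namespace {
    struct MyInlinerLikePass : public CallGraphSCCPass { // hypothetical
      static char ID;
      MyInlinerLikePass() : CallGraphSCCPass(ID) {}
      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addRequired<AssumptionCacheTracker>();
        AU.addRequired<ProfileSummaryInfoWrapperPass>();
        AU.addRequired<TargetLibraryInfoWrapperPass>();
        CallGraphSCCPass::getAnalysisUsage(AU);
      }
      bool runOnSCC(CallGraphSCC &SCC) override {
        CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
        ProfileSummaryInfo *PSI =
            getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(CG.getModule());
        (void)PSI; // forwarded to llvm::getInlineCost(CS, Threshold, TTI, ACT, PSI)
        (void)SCC;
        return false;
      }
    };
    char MyInlinerLikePass::ID = 0;
    } // end anonymous namespace
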
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index 21bb5d000bc7..8c5c6f77077c 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
//
// This pass loops over all of the functions and variables in the input module.
-// If the function or variable is not in the list of external names given to
-// the pass it is marked as internal.
+// If the function or variable does not need to be preserved according to the
+// client supplied callback, it is marked as internal.
//
// This transformation would not be legal in a regular compilation, but it gets
// extra information from the linker about what is safe.
@@ -19,98 +19,77 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <fstream>
#include <set>
using namespace llvm;
#define DEBUG_TYPE "internalize"
-STATISTIC(NumAliases , "Number of aliases internalized");
+STATISTIC(NumAliases, "Number of aliases internalized");
STATISTIC(NumFunctions, "Number of functions internalized");
-STATISTIC(NumGlobals , "Number of global vars internalized");
+STATISTIC(NumGlobals, "Number of global vars internalized");
// APIFile - A file which contains a list of symbols that should not be marked
// external.
static cl::opt<std::string>
-APIFile("internalize-public-api-file", cl::value_desc("filename"),
- cl::desc("A file containing list of symbol names to preserve"));
+ APIFile("internalize-public-api-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of symbol names to preserve"));
// APIList - A list of symbols that should not be marked internal.
static cl::list<std::string>
-APIList("internalize-public-api-list", cl::value_desc("list"),
- cl::desc("A list of symbol names to preserve"),
- cl::CommaSeparated);
+ APIList("internalize-public-api-list", cl::value_desc("list"),
+ cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
namespace {
- class InternalizePass : public ModulePass {
- std::set<std::string> ExternalNames;
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit InternalizePass();
- explicit InternalizePass(ArrayRef<const char *> ExportList);
- void LoadFile(const char *Filename);
- bool maybeInternalize(GlobalValue &GV,
- const std::set<const Comdat *> &ExternalComdats);
- void checkComdatVisibility(GlobalValue &GV,
- std::set<const Comdat *> &ExternalComdats);
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<CallGraphWrapperPass>();
- }
- };
-} // end anonymous namespace
-
-char InternalizePass::ID = 0;
-INITIALIZE_PASS(InternalizePass, "internalize",
- "Internalize Global Symbols", false, false)
-
-InternalizePass::InternalizePass() : ModulePass(ID) {
- initializeInternalizePassPass(*PassRegistry::getPassRegistry());
- if (!APIFile.empty()) // If a filename is specified, use it.
- LoadFile(APIFile.c_str());
- ExternalNames.insert(APIList.begin(), APIList.end());
-}
-
-InternalizePass::InternalizePass(ArrayRef<const char *> ExportList)
- : ModulePass(ID) {
- initializeInternalizePassPass(*PassRegistry::getPassRegistry());
- for(ArrayRef<const char *>::const_iterator itr = ExportList.begin();
- itr != ExportList.end(); itr++) {
- ExternalNames.insert(*itr);
+// Helper that loads the list of symbols to preserve from a file and exposes
+// it as a functor for internalization.
+class PreserveAPIList {
+public:
+ PreserveAPIList() {
+ if (!APIFile.empty())
+ LoadFile(APIFile);
+ ExternalNames.insert(APIList.begin(), APIList.end());
}
-}
-void InternalizePass::LoadFile(const char *Filename) {
- // Load the APIFile...
- std::ifstream In(Filename);
- if (!In.good()) {
- errs() << "WARNING: Internalize couldn't load file '" << Filename
- << "'! Continuing as if it's empty.\n";
- return; // Just continue as if the file were empty
+ bool operator()(const GlobalValue &GV) {
+ return ExternalNames.count(GV.getName());
}
- while (In) {
- std::string Symbol;
- In >> Symbol;
- if (!Symbol.empty())
- ExternalNames.insert(Symbol);
+
+private:
+ // Contains the set of symbols loaded from file
+ StringSet<> ExternalNames;
+
+ void LoadFile(StringRef Filename) {
+ // Load the APIFile...
+ std::ifstream In(Filename.data());
+ if (!In.good()) {
+ errs() << "WARNING: Internalize couldn't load file '" << Filename
+ << "'! Continuing as if it's empty.\n";
+ return; // Just continue as if the file were empty
+ }
+ while (In) {
+ std::string Symbol;
+ In >> Symbol;
+ if (!Symbol.empty())
+ ExternalNames.insert(Symbol);
+ }
}
-}
+};
+} // end anonymous namespace
-static bool isExternallyVisible(const GlobalValue &GV,
- const std::set<std::string> &ExternalNames) {
+bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
// Function must be defined here
if (GV.isDeclaration())
return true;
@@ -123,15 +102,17 @@ static bool isExternallyVisible(const GlobalValue &GV,
if (GV.hasDLLExportStorageClass())
return true;
- // Marked to keep external?
- if (!GV.hasLocalLinkage() && ExternalNames.count(GV.getName()))
+ // Already local; nothing to do.
+ if (GV.hasLocalLinkage())
+ return false;
+
+ // Check some special cases
+ if (AlwaysPreserved.count(GV.getName()))
return true;
- return false;
+ return MustPreserveGV(GV);
}
-// Internalize GV if it is possible to do so, i.e. it is not externally visible
-// and is not a member of an externally visible comdat.
bool InternalizePass::maybeInternalize(
GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) {
if (Comdat *C = GV.getComdat()) {
@@ -148,7 +129,7 @@ bool InternalizePass::maybeInternalize(
if (GV.hasLocalLinkage())
return false;
- if (isExternallyVisible(GV, ExternalNames))
+ if (shouldPreserveGV(GV))
return false;
}
@@ -165,13 +146,12 @@ void InternalizePass::checkComdatVisibility(
if (!C)
return;
- if (isExternallyVisible(GV, ExternalNames))
+ if (shouldPreserveGV(GV))
ExternalComdats.insert(C);
}
-bool InternalizePass::runOnModule(Module &M) {
- CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>();
- CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
+bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
+ bool Changed = false;
CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
SmallPtrSet<GlobalValue *, 8> Used;
@@ -198,13 +178,14 @@ bool InternalizePass::runOnModule(Module &M) {
// conservative, we internalize symbols in llvm.compiler.used, but we
// keep llvm.compiler.used so that the symbol is not deleted by llvm.
for (GlobalValue *V : Used) {
- ExternalNames.insert(V->getName());
+ AlwaysPreserved.insert(V->getName());
}
// Mark all functions not in the api as internal.
for (Function &I : M) {
if (!maybeInternalize(I, ExternalComdats))
continue;
+ Changed = true;
if (ExternalNode)
// Remove a callgraph edge from the external node to this function.
@@ -217,53 +198,97 @@ bool InternalizePass::runOnModule(Module &M) {
// Never internalize the llvm.used symbol. It is used to implement
// attribute((used)).
// FIXME: Shouldn't this just filter on llvm.metadata section??
- ExternalNames.insert("llvm.used");
- ExternalNames.insert("llvm.compiler.used");
+ AlwaysPreserved.insert("llvm.used");
+ AlwaysPreserved.insert("llvm.compiler.used");
// Never internalize anchors used by the machine module info, else the info
// won't find them. (see MachineModuleInfo.)
- ExternalNames.insert("llvm.global_ctors");
- ExternalNames.insert("llvm.global_dtors");
- ExternalNames.insert("llvm.global.annotations");
+ AlwaysPreserved.insert("llvm.global_ctors");
+ AlwaysPreserved.insert("llvm.global_dtors");
+ AlwaysPreserved.insert("llvm.global.annotations");
// Never internalize symbols code-gen inserts.
// FIXME: We should probably add this (and the __stack_chk_guard) via some
// type of call-back in CodeGen.
- ExternalNames.insert("__stack_chk_fail");
- ExternalNames.insert("__stack_chk_guard");
+ AlwaysPreserved.insert("__stack_chk_fail");
+ AlwaysPreserved.insert("__stack_chk_guard");
// Mark all global variables with initializers that are not in the api as
// internal as well.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- if (!maybeInternalize(*I, ExternalComdats))
+ for (auto &GV : M.globals()) {
+ if (!maybeInternalize(GV, ExternalComdats))
continue;
+ Changed = true;
++NumGlobals;
- DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
+ DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
}
// Mark all aliases that are not in the api as internal as well.
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- if (!maybeInternalize(*I, ExternalComdats))
+ for (auto &GA : M.aliases()) {
+ if (!maybeInternalize(GA, ExternalComdats))
continue;
+ Changed = true;
++NumAliases;
- DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
+ DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
}
- // We do not keep track of whether this pass changed the module because
- // it adds unnecessary complexity:
- // 1) This pass will generally be near the start of the pass pipeline, so
- // there will be no analyses to invalidate.
- // 2) This pass will most likely end up changing the module and it isn't worth
- // worrying about optimizing the case where the module is unchanged.
- return true;
+ return Changed;
}
-ModulePass *llvm::createInternalizePass() { return new InternalizePass(); }
+InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
+
+PreservedAnalyses InternalizePass::run(Module &M, AnalysisManager<Module> &AM) {
+ if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
+
+namespace {
+class InternalizeLegacyPass : public ModulePass {
+ // Client-supplied callback to control whether a symbol must be preserved.
+ std::function<bool(const GlobalValue &)> MustPreserveGV;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
+
+ InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
+ : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
+ initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ CallGraphWrapperPass *CGPass =
+ getAnalysisIfAvailable<CallGraphWrapperPass>();
+ CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
+ return internalizeModule(M, MustPreserveGV, CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+}
+
+char InternalizeLegacyPass::ID = 0;
+INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
+ "Internalize Global Symbols", false, false)
+
+ModulePass *llvm::createInternalizePass() {
+ return new InternalizeLegacyPass();
+}
-ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) {
- return new InternalizePass(ExportList);
+ModulePass *llvm::createInternalizePass(
+ std::function<bool(const GlobalValue &)> MustPreserveGV) {
+ return new InternalizeLegacyPass(std::move(MustPreserveGV));
}
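
Usage sketch for the callback-based factory above; the export list and the wrapper function are hypothetical, while createInternalizePass(std::function<bool(const GlobalValue &)>) is the overload introduced here:

    #include "llvm/IR/GlobalValue.h"
    #include "llvm/Pass.h"
    #include "llvm/Transforms/IPO.h"
    #include <set>
    #include <string>

    llvm::ModulePass *makeInternalizePass(std::set<std::string> Exported) {
      return llvm::createInternalizePass(
          [Exported](const llvm::GlobalValue &GV) {
            // Preserve anything the client exported; internalize the rest.
            return Exported.count(GV.getName().str()) > 0;
          });
    }
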
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt
index b5410f5f7757..bc3df98d504c 100644
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
name = IPO
parent = Transforms
library_name = ipo
-required_libraries = Analysis Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize
+required_libraries = Analysis Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 3c6a7bb7a17a..f898c3b5a935 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -81,7 +81,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
// Only visit top-level loops.
@@ -249,6 +249,9 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
}
bool BlockExtractorPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
std::set<BasicBlock*> TranslatedBlocksToNotExtract;
for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
BasicBlock *BB = BlocksToNotExtract[i];
@@ -272,15 +275,13 @@ bool BlockExtractorPass::runOnModule(Module &M) {
std::string &FuncName = BlocksToNotExtractByName.back().first;
std::string &BlockName = BlocksToNotExtractByName.back().second;
- for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
- Function &F = *FI;
+ for (Function &F : M) {
if (F.getName() != FuncName) continue;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- BasicBlock &BB = *BI;
+ for (BasicBlock &BB : F) {
if (BB.getName() != BlockName) continue;
- TranslatedBlocksToNotExtract.insert(&*BI);
+ TranslatedBlocksToNotExtract.insert(&BB);
}
}
@@ -290,18 +291,18 @@ bool BlockExtractorPass::runOnModule(Module &M) {
// Now that we know which blocks to not extract, figure out which ones we WANT
// to extract.
std::vector<BasicBlock*> BlocksToExtract;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- SplitLandingPadPreds(&*F);
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- if (!TranslatedBlocksToNotExtract.count(&*BB))
- BlocksToExtract.push_back(&*BB);
+ for (Function &F : M) {
+ SplitLandingPadPreds(&F);
+ for (BasicBlock &BB : F)
+ if (!TranslatedBlocksToNotExtract.count(&BB))
+ BlocksToExtract.push_back(&BB);
}
- for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) {
+ for (BasicBlock *BlockToExtract : BlocksToExtract) {
SmallVector<BasicBlock*, 2> BlocksToExtractVec;
- BlocksToExtractVec.push_back(BlocksToExtract[i]);
+ BlocksToExtractVec.push_back(BlockToExtract);
if (const InvokeInst *II =
- dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator()))
+ dyn_cast<InvokeInst>(BlockToExtract->getTerminator()))
BlocksToExtractVec.push_back(II->getUnwindDest());
CodeExtractor(BlocksToExtractVec).extractCodeRegion();
}
diff --git a/lib/Transforms/IPO/LowerBitSets.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 7b515745c312..36089f0a8801 100644
--- a/lib/Transforms/IPO/LowerBitSets.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1,4 +1,4 @@
-//===-- LowerBitSets.cpp - Bitset lowering pass ---------------------------===//
+//===-- LowerTypeTests.cpp - type metadata lowering pass ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,12 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass lowers bitset metadata and calls to the llvm.bitset.test intrinsic.
-// See http://llvm.org/docs/LangRef.html#bitsets for more information.
+// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// See http://llvm.org/docs/TypeMetadata.html for more information.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO/LowerBitSets.h"
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Statistic.h"
@@ -33,17 +33,18 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
+using namespace lowertypetests;
-#define DEBUG_TYPE "lowerbitsets"
+#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
-STATISTIC(NumBitSetCallsLowered, "Number of bitset calls lowered");
-STATISTIC(NumBitSetDisjointSets, "Number of disjoint sets of bitsets");
+STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
+STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
static cl::opt<bool> AvoidReuse(
- "lowerbitsets-avoid-reuse",
+ "lowertypetests-avoid-reuse",
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
@@ -203,10 +204,10 @@ struct ByteArrayInfo {
Constant *Mask;
};
-struct LowerBitSets : public ModulePass {
+struct LowerTypeTests : public ModulePass {
static char ID;
- LowerBitSets() : ModulePass(ID) {
- initializeLowerBitSetsPass(*PassRegistry::getPassRegistry());
+ LowerTypeTests() : ModulePass(ID) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
Module *M;
@@ -221,105 +222,68 @@ struct LowerBitSets : public ModulePass {
IntegerType *Int64Ty;
IntegerType *IntPtrTy;
- // The llvm.bitsets named metadata.
- NamedMDNode *BitSetNM;
-
- // Mapping from bitset identifiers to the call sites that test them.
- DenseMap<Metadata *, std::vector<CallInst *>> BitSetTestCallSites;
+ // Mapping from type identifiers to the call sites that test them.
+ DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites;
std::vector<ByteArrayInfo> ByteArrayInfos;
BitSetInfo
- buildBitSet(Metadata *BitSet,
+ buildBitSet(Metadata *TypeId,
const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
ByteArrayInfo *createByteArray(BitSetInfo &BSI);
void allocateByteArrays();
Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI,
Value *BitOffset);
- void lowerBitSetCalls(ArrayRef<Metadata *> BitSets,
- Constant *CombinedGlobalAddr,
- const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
+ void
+ lowerTypeTestCalls(ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
Value *
lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
Constant *CombinedGlobal,
const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
- void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> BitSets,
+ void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalVariable *> Globals);
unsigned getJumpTableEntrySize();
Type *getJumpTableEntryType();
Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest,
unsigned Distance);
- void verifyBitSetMDNode(MDNode *Op);
- void buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets,
+ void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
+ void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
ArrayRef<Function *> Functions);
- void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> BitSets,
+ void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
ArrayRef<GlobalObject *> Globals);
- bool buildBitSets();
- bool eraseBitSetMetadata();
-
- bool doInitialization(Module &M) override;
+ bool lower();
bool runOnModule(Module &M) override;
};
} // anonymous namespace
-INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets",
- "Lower bitset metadata", false, false)
-INITIALIZE_PASS_END(LowerBitSets, "lowerbitsets",
- "Lower bitset metadata", false, false)
-char LowerBitSets::ID = 0;
-
-ModulePass *llvm::createLowerBitSetsPass() { return new LowerBitSets; }
-
-bool LowerBitSets::doInitialization(Module &Mod) {
- M = &Mod;
- const DataLayout &DL = Mod.getDataLayout();
-
- Triple TargetTriple(M->getTargetTriple());
- LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
- Arch = TargetTriple.getArch();
- ObjectFormat = TargetTriple.getObjectFormat();
+INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
+ false)
+char LowerTypeTests::ID = 0;
- Int1Ty = Type::getInt1Ty(M->getContext());
- Int8Ty = Type::getInt8Ty(M->getContext());
- Int32Ty = Type::getInt32Ty(M->getContext());
- Int32PtrTy = PointerType::getUnqual(Int32Ty);
- Int64Ty = Type::getInt64Ty(M->getContext());
- IntPtrTy = DL.getIntPtrType(M->getContext(), 0);
+ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; }
- BitSetNM = M->getNamedMetadata("llvm.bitsets");
-
- BitSetTestCallSites.clear();
-
- return false;
-}
-
-/// Build a bit set for BitSet using the object layouts in
+/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
-BitSetInfo LowerBitSets::buildBitSet(
- Metadata *BitSet,
+BitSetInfo LowerTypeTests::buildBitSet(
+ Metadata *TypeId,
const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
BitSetBuilder BSB;
- // Compute the byte offset of each element of this bitset.
- if (BitSetNM) {
- for (MDNode *Op : BitSetNM->operands()) {
- if (Op->getOperand(0) != BitSet || !Op->getOperand(1))
- continue;
- Constant *OpConst =
- cast<ConstantAsMetadata>(Op->getOperand(1))->getValue();
- if (auto GA = dyn_cast<GlobalAlias>(OpConst))
- OpConst = GA->getAliasee();
- auto OpGlobal = dyn_cast<GlobalObject>(OpConst);
- if (!OpGlobal)
+ // Compute the byte offset of each address associated with this type
+ // identifier.
+ SmallVector<MDNode *, 2> Types;
+ for (auto &GlobalAndOffset : GlobalLayout) {
+ Types.clear();
+ GlobalAndOffset.first->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ if (Type->getOperand(1) != TypeId)
continue;
uint64_t Offset =
- cast<ConstantInt>(cast<ConstantAsMetadata>(Op->getOperand(2))
+ cast<ConstantInt>(cast<ConstantAsMetadata>(Type->getOperand(0))
->getValue())->getZExtValue();
-
- Offset += GlobalLayout.find(OpGlobal)->second;
-
- BSB.addOffset(Offset);
+ BSB.addOffset(GlobalAndOffset.second + Offset);
}
}
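
The loop above reads the new per-global !type attachments, where each node is a pair { byte offset, type identifier } replacing the old module-level llvm.bitsets nodes. A self-contained sketch of that access pattern, mirroring the accessors used in this patch:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/GlobalObject.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    static void forEachTypeAttachment(GlobalObject &GO) {
      SmallVector<MDNode *, 2> Types;
      GO.getMetadata(LLVMContext::MD_type, Types);
      for (MDNode *Type : Types) {
        Metadata *TypeId = Type->getOperand(1); // the type identifier
        uint64_t Offset =
            cast<ConstantInt>(
                cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
                ->getZExtValue();               // offset of the address point
        (void)TypeId;
        (void)Offset;
      }
    }
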
@@ -341,7 +305,7 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
}
-ByteArrayInfo *LowerBitSets::createByteArray(BitSetInfo &BSI) {
+ByteArrayInfo *LowerTypeTests::createByteArray(BitSetInfo &BSI) {
// Create globals to stand in for byte arrays and masks. These never actually
// get initialized, we RAUW and erase them later in allocateByteArrays() once
// we know the offset and mask to use.
@@ -360,7 +324,7 @@ ByteArrayInfo *LowerBitSets::createByteArray(BitSetInfo &BSI) {
return BAI;
}
-void LowerBitSets::allocateByteArrays() {
+void LowerTypeTests::allocateByteArrays() {
std::stable_sort(ByteArrayInfos.begin(), ByteArrayInfos.end(),
[](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
return BAI1.BitSize > BAI2.BitSize;
@@ -413,8 +377,8 @@ void LowerBitSets::allocateByteArrays() {
/// Build a test that bit BitOffset is set in BSI, where
/// BitSetGlobal is a global containing the bits in BSI.
-Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,
- ByteArrayInfo *&BAI, Value *BitOffset) {
+Value *LowerTypeTests::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,
+ ByteArrayInfo *&BAI, Value *BitOffset) {
if (BSI.BitSize <= 64) {
// If the bit set is sufficiently small, we can avoid a load by bit testing
// a constant.
@@ -454,9 +418,9 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,
}
}
-/// Lower a llvm.bitset.test call to its implementation. Returns the value to
+/// Lower a llvm.type.test call to its implementation. Returns the value to
/// replace the call with.
-Value *LowerBitSets::lowerBitSetCall(
+Value *LowerTypeTests::lowerBitSetCall(
CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
Constant *CombinedGlobalIntAddr,
const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
@@ -524,10 +488,10 @@ Value *LowerBitSets::lowerBitSetCall(
return P;
}
-/// Given a disjoint set of bitsets and globals, layout the globals, build the
-/// bit sets and lower the llvm.bitset.test calls.
-void LowerBitSets::buildBitSetsFromGlobalVariables(
- ArrayRef<Metadata *> BitSets, ArrayRef<GlobalVariable *> Globals) {
+/// Given a disjoint set of type identifiers and globals, lay out the globals,
+/// build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTests::buildBitSetsFromGlobalVariables(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalVariable *> Globals) {
// Build a new global with the combined contents of the referenced globals.
// This global is a struct whose even-indexed elements contain the original
// contents of the referenced globals and whose odd-indexed elements contain
@@ -544,7 +508,7 @@ void LowerBitSets::buildBitSetsFromGlobalVariables(
// Cap at 128 was found experimentally to have a good data/instruction
// overhead tradeoff.
if (Padding > 128)
- Padding = RoundUpToAlignment(InitSize, 128) - InitSize;
+ Padding = alignTo(InitSize, 128) - InitSize;
GlobalInits.push_back(
ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
@@ -565,7 +529,7 @@ void LowerBitSets::buildBitSetsFromGlobalVariables(
// Multiply by 2 to account for padding elements.
GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
- lowerBitSetCalls(BitSets, CombinedGlobal, GlobalLayout);
+ lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
// Build aliases pointing to offsets into the combined global for each
// global from which we built the combined global, and replace references
@@ -591,19 +555,19 @@ void LowerBitSets::buildBitSetsFromGlobalVariables(
}
}
-void LowerBitSets::lowerBitSetCalls(
- ArrayRef<Metadata *> BitSets, Constant *CombinedGlobalAddr,
+void LowerTypeTests::lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
Constant *CombinedGlobalIntAddr =
ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy);
- // For each bitset in this disjoint set...
- for (Metadata *BS : BitSets) {
+ // For each type identifier in this disjoint set...
+ for (Metadata *TypeId : TypeIds) {
// Build the bitset.
- BitSetInfo BSI = buildBitSet(BS, GlobalLayout);
+ BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
DEBUG({
- if (auto BSS = dyn_cast<MDString>(BS))
- dbgs() << BSS->getString() << ": ";
+ if (auto MDS = dyn_cast<MDString>(TypeId))
+ dbgs() << MDS->getString() << ": ";
else
dbgs() << "<unnamed>: ";
BSI.print(dbgs());
@@ -611,9 +575,9 @@ void LowerBitSets::lowerBitSetCalls(
ByteArrayInfo *BAI = nullptr;
- // Lower each call to llvm.bitset.test for this bitset.
- for (CallInst *CI : BitSetTestCallSites[BS]) {
- ++NumBitSetCallsLowered;
+ // Lower each call to llvm.type.test for this type identifier.
+ for (CallInst *CI : TypeTestCallSites[TypeId]) {
+ ++NumTypeTestCallsLowered;
Value *Lowered =
lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout);
CI->replaceAllUsesWith(Lowered);
@@ -622,39 +586,32 @@ void LowerBitSets::lowerBitSetCalls(
}
}
-void LowerBitSets::verifyBitSetMDNode(MDNode *Op) {
- if (Op->getNumOperands() != 3)
+void LowerTypeTests::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
+ if (Type->getNumOperands() != 2)
report_fatal_error(
- "All operands of llvm.bitsets metadata must have 3 elements");
- if (!Op->getOperand(1))
- return;
-
- auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1));
- if (!OpConstMD)
- report_fatal_error("Bit set element must be a constant");
- auto OpGlobal = dyn_cast<GlobalObject>(OpConstMD->getValue());
- if (!OpGlobal)
- return;
+ "All operands of type metadata must have 2 elements");
- if (OpGlobal->isThreadLocal())
+ if (GO->isThreadLocal())
report_fatal_error("Bit set element may not be thread-local");
- if (OpGlobal->hasSection())
- report_fatal_error("Bit set element may not have an explicit section");
+ if (isa<GlobalVariable>(GO) && GO->hasSection())
+ report_fatal_error(
+ "A member of a type identifier may not have an explicit section");
- if (isa<GlobalVariable>(OpGlobal) && OpGlobal->isDeclarationForLinker())
- report_fatal_error("Bit set global var element must be a definition");
+ if (isa<GlobalVariable>(GO) && GO->isDeclarationForLinker())
+ report_fatal_error(
+ "A global var member of a type identifier must be a definition");
- auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2));
+ auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
if (!OffsetConstMD)
- report_fatal_error("Bit set element offset must be a constant");
+ report_fatal_error("Type offset must be a constant");
auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
if (!OffsetInt)
- report_fatal_error("Bit set element offset must be an integer constant");
+ report_fatal_error("Type offset must be an integer constant");
}
static const unsigned kX86JumpTableEntrySize = 8;
-unsigned LowerBitSets::getJumpTableEntrySize() {
+unsigned LowerTypeTests::getJumpTableEntrySize() {
if (Arch != Triple::x86 && Arch != Triple::x86_64)
report_fatal_error("Unsupported architecture for jump tables");
@@ -665,8 +622,9 @@ unsigned LowerBitSets::getJumpTableEntrySize() {
// consists of an instruction sequence containing a relative branch to Dest. The
// constant will be laid out at address Src+(Len*Distance) where Len is the
// target-specific jump table entry size.
-Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest,
- unsigned Distance) {
+Constant *LowerTypeTests::createJumpTableEntry(GlobalObject *Src,
+ Function *Dest,
+ unsigned Distance) {
if (Arch != Triple::x86 && Arch != Triple::x86_64)
report_fatal_error("Unsupported architecture for jump tables");
@@ -693,7 +651,7 @@ Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest,
return ConstantStruct::getAnon(Fields, /*Packed=*/true);
}
-Type *LowerBitSets::getJumpTableEntryType() {
+Type *LowerTypeTests::getJumpTableEntryType() {
if (Arch != Triple::x86 && Arch != Triple::x86_64)
report_fatal_error("Unsupported architecture for jump tables");
@@ -702,10 +660,10 @@ Type *LowerBitSets::getJumpTableEntryType() {
/*Packed=*/true);
}
-/// Given a disjoint set of bitsets and functions, build a jump table for the
-/// functions, build the bit sets and lower the llvm.bitset.test calls.
-void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets,
- ArrayRef<Function *> Functions) {
+/// Given a disjoint set of type identifiers and functions, build a jump table
+/// for the functions, build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTests::buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<Function *> Functions) {
// Unlike the global bitset builder, the function bitset builder cannot
// re-arrange functions in a particular order and base its calculations on the
// layout of the functions' entry points, as we have no idea how large a
@@ -719,8 +677,7 @@ void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets,
// verification done inside the module.
//
// In more concrete terms, suppose we have three functions f, g, h which are
- // members of a single bitset, and a function foo that returns their
- // addresses:
+ // of the same type, and a function foo that returns their addresses:
//
// f:
// mov 0, %eax
@@ -803,7 +760,7 @@ void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets,
JumpTable->setSection(ObjectFormat == Triple::MachO
? "__TEXT,__text,regular,pure_instructions"
: ".text");
- lowerBitSetCalls(BitSets, JumpTable, GlobalLayout);
+ lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
// Build aliases pointing to offsets into the jump table, and replace
// references to the original functions with references to the aliases.
@@ -838,39 +795,32 @@ void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets,
ConstantArray::get(JumpTableType, JumpTableEntries));
}
-void LowerBitSets::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> BitSets, ArrayRef<GlobalObject *> Globals) {
- llvm::DenseMap<Metadata *, uint64_t> BitSetIndices;
- llvm::DenseMap<GlobalObject *, uint64_t> GlobalIndices;
- for (unsigned I = 0; I != BitSets.size(); ++I)
- BitSetIndices[BitSets[I]] = I;
- for (unsigned I = 0; I != Globals.size(); ++I)
- GlobalIndices[Globals[I]] = I;
-
- // For each bitset, build a set of indices that refer to globals referenced by
- // the bitset.
- std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size());
- if (BitSetNM) {
- for (MDNode *Op : BitSetNM->operands()) {
- // Op = { bitset name, global, offset }
- if (!Op->getOperand(1))
- continue;
- auto I = BitSetIndices.find(Op->getOperand(0));
- if (I == BitSetIndices.end())
- continue;
-
- auto OpGlobal = dyn_cast<GlobalObject>(
- cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());
- if (!OpGlobal)
- continue;
- BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]);
+void LowerTypeTests::buildBitSetsFromDisjointSet(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalObject *> Globals) {
+ llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices;
+ for (unsigned I = 0; I != TypeIds.size(); ++I)
+ TypeIdIndices[TypeIds[I]] = I;
+
+ // For each type identifier, build a set of indices that refer to members of
+ // the type identifier.
+ std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
+ SmallVector<MDNode *, 2> Types;
+ unsigned GlobalIndex = 0;
+ for (GlobalObject *GO : Globals) {
+ Types.clear();
+ GO->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ // Type = { offset, type identifier }
+ unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
+ TypeMembers[TypeIdIndex].insert(GlobalIndex);
}
+ GlobalIndex++;
}
// Order the sets of indices by size. The GlobalLayoutBuilder works best
// when given small index sets first.
std::stable_sort(
- BitSetMembers.begin(), BitSetMembers.end(),
+ TypeMembers.begin(), TypeMembers.end(),
[](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) {
return O1.size() < O2.size();
});
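
For orientation, here is a minimal sketch of the IR the loop above walks; the global and the type identifier strings are hypothetical, and each !type attachment pairs a byte offset (operand 0) with a type identifier (operand 1), matching the "Type = { offset, type identifier }" note above:

    ; sketch only: two hypothetical type identifiers applying at different offsets
    @vt = internal unnamed_addr constant [2 x i8*] zeroinitializer, !type !0, !type !1
    !0 = !{i64 0, !"_ZTS1A"}
    !1 = !{i64 8, !"_ZTS1B"}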
@@ -879,7 +829,7 @@ void LowerBitSets::buildBitSetsFromDisjointSet(
// fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
// close together as possible.
GlobalLayoutBuilder GLB(Globals.size());
- for (auto &&MemSet : BitSetMembers)
+ for (auto &&MemSet : TypeMembers)
GLB.addFragment(MemSet);
// Build the bitsets from this disjoint set.
@@ -891,13 +841,13 @@ void LowerBitSets::buildBitSetsFromDisjointSet(
for (auto &&Offset : F) {
auto GV = dyn_cast<GlobalVariable>(Globals[Offset]);
if (!GV)
- report_fatal_error(
- "Bit set may not contain both global variables and functions");
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
*OGI++ = GV;
}
}
- buildBitSetsFromGlobalVariables(BitSets, OrderedGVs);
+ buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs);
} else {
// Build a vector of functions with the computed layout.
std::vector<Function *> OrderedFns(Globals.size());
@@ -906,102 +856,97 @@ void LowerBitSets::buildBitSetsFromDisjointSet(
for (auto &&Offset : F) {
auto Fn = dyn_cast<Function>(Globals[Offset]);
if (!Fn)
- report_fatal_error(
- "Bit set may not contain both global variables and functions");
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
*OFI++ = Fn;
}
}
- buildBitSetsFromFunctions(BitSets, OrderedFns);
+ buildBitSetsFromFunctions(TypeIds, OrderedFns);
}
}
-/// Lower all bit sets in this module.
-bool LowerBitSets::buildBitSets() {
- Function *BitSetTestFunc =
- M->getFunction(Intrinsic::getName(Intrinsic::bitset_test));
- if (!BitSetTestFunc)
+/// Lower all type tests in this module.
+bool LowerTypeTests::lower() {
+ Function *TypeTestFunc =
+ M->getFunction(Intrinsic::getName(Intrinsic::type_test));
+ if (!TypeTestFunc || TypeTestFunc->use_empty())
return false;
- // Equivalence class set containing bitsets and the globals they reference.
- // This is used to partition the set of bitsets in the module into disjoint
- // sets.
+ // Equivalence class set containing type identifiers and the globals that
+ // reference them. This is used to partition the set of type identifiers in
+ // the module into disjoint sets.
typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>>
GlobalClassesTy;
GlobalClassesTy GlobalClasses;
- // Verify the bitset metadata and build a mapping from bitset identifiers to
- // their last observed index in BitSetNM. This will used later to
- // deterministically order the list of bitset identifiers.
- llvm::DenseMap<Metadata *, unsigned> BitSetIdIndices;
- if (BitSetNM) {
- for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) {
- MDNode *Op = BitSetNM->getOperand(I);
- verifyBitSetMDNode(Op);
- BitSetIdIndices[Op->getOperand(0)] = I;
+ // Verify the type metadata and build a mapping from type identifiers to their
+ // last observed index in the list of globals. This will be used later to
+ // deterministically order the list of type identifiers.
+ llvm::DenseMap<Metadata *, unsigned> TypeIdIndices;
+ unsigned I = 0;
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalObject &GO : M->global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ verifyTypeMDNode(&GO, Type);
+ TypeIdIndices[cast<MDNode>(Type)->getOperand(1)] = ++I;
}
}
- for (const Use &U : BitSetTestFunc->uses()) {
+ for (const Use &U : TypeTestFunc->uses()) {
auto CI = cast<CallInst>(U.getUser());
auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
if (!BitSetMDVal)
report_fatal_error(
- "Second argument of llvm.bitset.test must be metadata");
+ "Second argument of llvm.type.test must be metadata");
auto BitSet = BitSetMDVal->getMetadata();
- // Add the call site to the list of call sites for this bit set. We also use
- // BitSetTestCallSites to keep track of whether we have seen this bit set
- // before. If we have, we don't need to re-add the referenced globals to the
- // equivalence class.
- std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator,
- bool> Ins =
- BitSetTestCallSites.insert(
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeTestCallSites to keep track of whether we have seen this
+ // type identifier before. If we have, we don't need to re-add the
+ // referenced globals to the equivalence class.
+ std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool>
+ Ins = TypeTestCallSites.insert(
std::make_pair(BitSet, std::vector<CallInst *>()));
Ins.first->second.push_back(CI);
if (!Ins.second)
continue;
- // Add the bitset to the equivalence class.
+ // Add the type identifier to the equivalence class.
GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet);
GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
- if (!BitSetNM)
- continue;
-
- // Add the referenced globals to the bitset's equivalence class.
- for (MDNode *Op : BitSetNM->operands()) {
- if (Op->getOperand(0) != BitSet || !Op->getOperand(1))
- continue;
-
- auto OpGlobal = dyn_cast<GlobalObject>(
- cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());
- if (!OpGlobal)
- continue;
-
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal)));
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalObject &GO : M->global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types)
+ if (Type->getOperand(1) == BitSet)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(&GO)));
}
}
if (GlobalClasses.empty())
return false;
- // Build a list of disjoint sets ordered by their maximum BitSetNM index
- // for determinism.
+ // Build a list of disjoint sets ordered by their maximum global index for
+ // determinism.
std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
E = GlobalClasses.end();
I != E; ++I) {
if (!I->isLeader()) continue;
- ++NumBitSetDisjointSets;
+ ++NumTypeIdDisjointSets;
unsigned MaxIndex = 0;
for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
MI != GlobalClasses.member_end(); ++MI) {
if ((*MI).is<Metadata *>())
- MaxIndex = std::max(MaxIndex, BitSetIdIndices[MI->get<Metadata *>()]);
+ MaxIndex = std::max(MaxIndex, TypeIdIndices[MI->get<Metadata *>()]);
}
Sets.emplace_back(I, MaxIndex);
}
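
A call site handled by the use loop above looks roughly like the sketch below (the pointer and type identifier names are hypothetical); the second argument must be metadata naming a type identifier, which is what the MetadataAsValue check enforces:

    ; sketch only: querying membership in a hypothetical type identifier
    declare i1 @llvm.type.test(i8*, metadata)

    define i1 @is_a(i8* %vtable) {
      %ok = call i1 @llvm.type.test(i8* %vtable, metadata !"_ZTS1A")
      ret i1 %ok
    }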
@@ -1013,26 +958,26 @@ bool LowerBitSets::buildBitSets() {
// For each disjoint set we found...
for (const auto &S : Sets) {
- // Build the list of bitsets in this disjoint set.
- std::vector<Metadata *> BitSets;
+ // Build the list of type identifiers in this disjoint set.
+ std::vector<Metadata *> TypeIds;
std::vector<GlobalObject *> Globals;
for (GlobalClassesTy::member_iterator MI =
GlobalClasses.member_begin(S.first);
MI != GlobalClasses.member_end(); ++MI) {
if ((*MI).is<Metadata *>())
- BitSets.push_back(MI->get<Metadata *>());
+ TypeIds.push_back(MI->get<Metadata *>());
else
Globals.push_back(MI->get<GlobalObject *>());
}
- // Order bitsets by BitSetNM index for determinism. This ordering is stable
- // as there is a one-to-one mapping between metadata and indices.
- std::sort(BitSets.begin(), BitSets.end(), [&](Metadata *M1, Metadata *M2) {
- return BitSetIdIndices[M1] < BitSetIdIndices[M2];
+ // Order type identifiers by global index for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and indices.
+ std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
+ return TypeIdIndices[M1] < TypeIdIndices[M2];
});
- // Lower the bitsets in this disjoint set.
- buildBitSetsFromDisjointSet(BitSets, Globals);
+ // Build bitsets for this disjoint set.
+ buildBitSetsFromDisjointSet(TypeIds, Globals);
}
allocateByteArrays();
@@ -1040,16 +985,36 @@ bool LowerBitSets::buildBitSets() {
return true;
}
-bool LowerBitSets::eraseBitSetMetadata() {
- if (!BitSetNM)
- return false;
+// Initialization helper shared by the old and the new PM.
+static void init(LowerTypeTests *LTT, Module &M) {
+ LTT->M = &M;
+ const DataLayout &DL = M.getDataLayout();
+ Triple TargetTriple(M.getTargetTriple());
+ LTT->LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
+ LTT->Arch = TargetTriple.getArch();
+ LTT->ObjectFormat = TargetTriple.getObjectFormat();
+ LTT->Int1Ty = Type::getInt1Ty(M.getContext());
+ LTT->Int8Ty = Type::getInt8Ty(M.getContext());
+ LTT->Int32Ty = Type::getInt32Ty(M.getContext());
+ LTT->Int32PtrTy = PointerType::getUnqual(LTT->Int32Ty);
+ LTT->Int64Ty = Type::getInt64Ty(M.getContext());
+ LTT->IntPtrTy = DL.getIntPtrType(M.getContext(), 0);
+ LTT->TypeTestCallSites.clear();
+}
- M->eraseNamedMetadata(BitSetNM);
- return true;
+bool LowerTypeTests::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ init(this, M);
+ return lower();
}
-bool LowerBitSets::runOnModule(Module &M) {
- bool Changed = buildBitSets();
- Changed |= eraseBitSetMetadata();
- return Changed;
+PreservedAnalyses LowerTypeTestsPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+ LowerTypeTests Impl;
+ init(&Impl, M);
+ bool Changed = Impl.lower();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
}
diff --git a/lib/Transforms/IPO/Makefile b/lib/Transforms/IPO/Makefile
deleted file mode 100644
index 5c42374139aa..000000000000
--- a/lib/Transforms/IPO/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/IPO/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMipo
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 8a209a18c540..fe653a75ddb5 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -89,13 +89,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Hashing.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -112,6 +109,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include <vector>
using namespace llvm;
@@ -189,7 +187,7 @@ public:
private:
/// Test whether two basic blocks have equivalent behaviour.
- int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR);
+ int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR) const;
/// Constants comparison.
/// Its analog to lexicographical comparison between hypothetical numbers
@@ -293,11 +291,11 @@ private:
/// look at their particular properties (bit-width for vectors, and
/// address space for pointers).
/// If these properties are equal - compare their contents.
- int cmpConstants(const Constant *L, const Constant *R);
+ int cmpConstants(const Constant *L, const Constant *R) const;
/// Compares two global values by number. Uses the GlobalNumbersState to
/// identify the same globals across function calls.
- int cmpGlobalValues(GlobalValue *L, GlobalValue *R);
+ int cmpGlobalValues(GlobalValue *L, GlobalValue *R) const;
/// Assign or look up previously assigned numbers for the two values, and
/// return whether the numbers are equal. Numbers are assigned in the order
@@ -317,11 +315,11 @@ private:
/// then left value is greater.
/// In other words, we compare serial numbers; for more details
/// see comments for sn_mapL and sn_mapR.
- int cmpValues(const Value *L, const Value *R);
+ int cmpValues(const Value *L, const Value *R) const;
/// Compare two Instructions for equivalence, similar to
- /// Instruction::isSameOperationAs but with modifications to the type
- /// comparison.
+ /// Instruction::isSameOperationAs.
+ ///
/// Stages are listed in "most significant stage first" order:
/// On each stage below, we do comparison between some left and right
/// operation parts. If parts are non-equal, we assign parts comparison
@@ -339,8 +337,9 @@ private:
/// For example, for Load it would be:
/// 6.1.Load: volatile (as boolean flag)
/// 6.2.Load: alignment (as integer numbers)
- /// 6.3.Load: synch-scope (as integer numbers)
- /// 6.4.Load: range metadata (as integer numbers)
+ /// 6.3.Load: ordering (as underlying enum class value)
+ /// 6.4.Load: synch-scope (as integer numbers)
+ /// 6.5.Load: range metadata (as integer ranges)
/// At this stage it's better to see the code, since it's not more than 10-15
/// lines for a particular instruction, and could change sometimes.
int cmpOperations(const Instruction *L, const Instruction *R) const;
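
As a concrete instance of stage 6.3 above (function and value names are hypothetical): two loads that agree on everything except their atomic ordering must compare unequal, which is what the cmpOrderings helper added further down makes explicit.

    ; sketch only: identical loads apart from the ordering operand
    define void @orderings(i32* %p) {
      %a = load atomic i32, i32* %p acquire, align 4
      %b = load atomic i32, i32* %p seq_cst, align 4
      ret void
    }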
@@ -353,8 +352,9 @@ private:
/// 3. Pointer operand type (using cmpType method).
/// 4. Number of operands.
/// 5. Compare operands, using cmpValues method.
- int cmpGEPs(const GEPOperator *GEPL, const GEPOperator *GEPR);
- int cmpGEPs(const GetElementPtrInst *GEPL, const GetElementPtrInst *GEPR) {
+ int cmpGEPs(const GEPOperator *GEPL, const GEPOperator *GEPR) const;
+ int cmpGEPs(const GetElementPtrInst *GEPL,
+ const GetElementPtrInst *GEPR) const {
return cmpGEPs(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
}
@@ -401,12 +401,13 @@ private:
int cmpTypes(Type *TyL, Type *TyR) const;
int cmpNumbers(uint64_t L, uint64_t R) const;
+ int cmpOrderings(AtomicOrdering L, AtomicOrdering R) const;
int cmpAPInts(const APInt &L, const APInt &R) const;
int cmpAPFloats(const APFloat &L, const APFloat &R) const;
int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const;
int cmpMem(StringRef L, StringRef R) const;
int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
- int cmpRangeMetadata(const MDNode* L, const MDNode* R) const;
+ int cmpRangeMetadata(const MDNode *L, const MDNode *R) const;
int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const;
// The two functions undergoing comparison.
@@ -445,7 +446,7 @@ private:
/// But, we are still not able to compare operands of PHI nodes, since those
/// could be operands from further BBs we didn't scan yet.
/// So it's impossible to use dominance properties in general.
- DenseMap<const Value*, int> sn_mapL, sn_mapR;
+ mutable DenseMap<const Value*, int> sn_mapL, sn_mapR;
// The global state we will use
GlobalNumberState* GlobalNumbers;
@@ -477,6 +478,12 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
return 0;
}
+int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
+ if ((int)L < (int)R) return -1;
+ if ((int)L > (int)R) return 1;
+ return 0;
+}
+
int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
return Res;
@@ -538,8 +545,8 @@ int FunctionComparator::cmpAttrs(const AttributeSet L,
return 0;
}
-int FunctionComparator::cmpRangeMetadata(const MDNode* L,
- const MDNode* R) const {
+int FunctionComparator::cmpRangeMetadata(const MDNode *L,
+ const MDNode *R) const {
if (L == R)
return 0;
if (!L)
@@ -547,7 +554,7 @@ int FunctionComparator::cmpRangeMetadata(const MDNode* L,
if (!R)
return 1;
// Range metadata is a sequence of numbers. Make sure they are the same
- // sequence.
+ // sequence.
// TODO: Note that as this is metadata, it is possible to drop and/or merge
// this data when considering functions to merge. Thus this comparison would
// return 0 (i.e. equivalent), but merging would become more complicated
@@ -557,8 +564,8 @@ int FunctionComparator::cmpRangeMetadata(const MDNode* L,
if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
return Res;
for (size_t I = 0; I < L->getNumOperands(); ++I) {
- ConstantInt* LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
- ConstantInt* RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
+ ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
+ ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
return Res;
}
@@ -596,7 +603,8 @@ int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L,
/// type.
/// 2. Compare constant contents.
/// For more details see declaration comments.
-int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
+int FunctionComparator::cmpConstants(const Constant *L,
+ const Constant *R) const {
Type *TyL = L->getType();
Type *TyR = R->getType();
@@ -793,7 +801,7 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
}
}
-int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue* R) {
+int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R));
}
@@ -898,9 +906,9 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
int FunctionComparator::cmpOperations(const Instruction *L,
const Instruction *R) const {
// Differences from Instruction::isSameOperationAs:
- // * replace type comparison with calls to isEquivalentType.
- // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top
- // * because of the above, we don't test for the tail bit on calls later on
+ // * replace type comparison with calls to cmpTypes.
+ // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
+ // * because of the above, we don't test for the tail bit on calls later on.
if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
return Res;
@@ -914,15 +922,6 @@ int FunctionComparator::cmpOperations(const Instruction *L,
R->getRawSubclassOptionalData()))
return Res;
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
- if (int Res = cmpTypes(AI->getAllocatedType(),
- cast<AllocaInst>(R)->getAllocatedType()))
- return Res;
- if (int Res =
- cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment()))
- return Res;
- }
-
// We have two instructions of identical opcode and #operands. Check to see
// if all operands are the same type
for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
@@ -932,6 +931,12 @@ int FunctionComparator::cmpOperations(const Instruction *L,
}
// Check special state that is a part of some instructions.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
+ if (int Res = cmpTypes(AI->getAllocatedType(),
+ cast<AllocaInst>(R)->getAllocatedType()))
+ return Res;
+ return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ }
if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
return Res;
@@ -939,7 +944,7 @@ int FunctionComparator::cmpOperations(const Instruction *L,
cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
return Res;
if (int Res =
- cmpNumbers(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
+ cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
return Res;
if (int Res =
cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
@@ -955,7 +960,7 @@ int FunctionComparator::cmpOperations(const Instruction *L,
cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
return Res;
if (int Res =
- cmpNumbers(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
+ cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
return Res;
return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope());
}
@@ -996,6 +1001,7 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
return Res;
}
+ return 0;
}
if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
ArrayRef<unsigned> LIndices = EVI->getIndices();
@@ -1009,11 +1015,10 @@ int FunctionComparator::cmpOperations(const Instruction *L,
}
if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
if (int Res =
- cmpNumbers(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
+ cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
return Res;
return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope());
}
-
if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
if (int Res = cmpNumbers(CXI->isVolatile(),
cast<AtomicCmpXchgInst>(R)->isVolatile()))
@@ -1021,11 +1026,13 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpNumbers(CXI->isWeak(),
cast<AtomicCmpXchgInst>(R)->isWeak()))
return Res;
- if (int Res = cmpNumbers(CXI->getSuccessOrdering(),
- cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
+ if (int Res =
+ cmpOrderings(CXI->getSuccessOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
return Res;
- if (int Res = cmpNumbers(CXI->getFailureOrdering(),
- cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
+ if (int Res =
+ cmpOrderings(CXI->getFailureOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
return Res;
return cmpNumbers(CXI->getSynchScope(),
cast<AtomicCmpXchgInst>(R)->getSynchScope());
@@ -1037,19 +1044,30 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpNumbers(RMWI->isVolatile(),
cast<AtomicRMWInst>(R)->isVolatile()))
return Res;
- if (int Res = cmpNumbers(RMWI->getOrdering(),
+ if (int Res = cmpOrderings(RMWI->getOrdering(),
cast<AtomicRMWInst>(R)->getOrdering()))
return Res;
return cmpNumbers(RMWI->getSynchScope(),
cast<AtomicRMWInst>(R)->getSynchScope());
}
+ if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
+ const PHINode *PNR = cast<PHINode>(R);
+ // Ensure that in addition to the incoming values being identical
+ // (checked by the caller of this function), the incoming blocks
+ // are also identical.
+ for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
+ if (int Res =
+ cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
+ return Res;
+ }
+ }
return 0;
}
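
The loop above compares incoming blocks position by position. To see why this matters, here is a sketch of two functions (entirely hypothetical) that are identical except for which predecessor each phi value flows from; they compute different results and therefore must not be merged:

    ; sketch only: @f returns 0 when %c is true, @g returns 1
    define i32 @f(i1 %c) {
    entry:
      br i1 %c, label %t, label %e
    t:
      br label %m
    e:
      br label %m
    m:
      %r = phi i32 [ 0, %t ], [ 1, %e ]
      ret i32 %r
    }

    define i32 @g(i1 %c) {
    entry:
      br i1 %c, label %t, label %e
    t:
      br label %m
    e:
      br label %m
    m:
      %r = phi i32 [ 0, %e ], [ 1, %t ]
      ret i32 %r
    }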
// Determine whether two GEP operations perform the same underlying arithmetic.
// Read method declaration comments for more details.
int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
- const GEPOperator *GEPR) {
+ const GEPOperator *GEPR) const {
unsigned int ASL = GEPL->getPointerAddressSpace();
unsigned int ASR = GEPR->getPointerAddressSpace();
@@ -1106,7 +1124,7 @@ int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
/// this is the first time the values are seen, they're added to the mapping so
/// that we will detect mismatches on next use.
/// See comments in declaration for more details.
-int FunctionComparator::cmpValues(const Value *L, const Value *R) {
+int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
// Catch self-reference case.
if (L == FnL) {
if (R == FnR)
@@ -1149,7 +1167,7 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) {
}
// Test whether two basic blocks have equivalent behaviour.
int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
- const BasicBlock *BBR) {
+ const BasicBlock *BBR) const {
BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
@@ -1186,7 +1204,8 @@ int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
}
}
- ++InstL, ++InstR;
+ ++InstL;
+ ++InstR;
} while (InstL != InstLE && InstR != InstRE);
if (InstL != InstLE && InstR == InstRE)
@@ -1249,7 +1268,7 @@ int FunctionComparator::compare() {
// functions, then takes each block from each terminator in order. As an
// artifact, this also means that unreachable blocks are ignored.
SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
- SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1.
+ SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
FnLBBs.push_back(&FnL->getEntryBlock());
FnRBBs.push_back(&FnR->getEntryBlock());
@@ -1517,6 +1536,9 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
}
bool MergeFunctions::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
bool Changed = false;
// All functions in the module, ordered by hash. Functions with a unique
@@ -1555,28 +1577,12 @@ bool MergeFunctions::runOnModule(Module &M) {
DEBUG(dbgs() << "size of module: " << M.size() << '\n');
DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
- // Insert only strong functions and merge them. Strong function merging
- // always deletes one of them.
- for (std::vector<WeakVH>::iterator I = Worklist.begin(),
- E = Worklist.end(); I != E; ++I) {
- if (!*I) continue;
- Function *F = cast<Function>(*I);
- if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
- !F->mayBeOverridden()) {
- Changed |= insert(F);
- }
- }
-
- // Insert only weak functions and merge them. By doing these second we
- // create thunks to the strong function when possible. When two weak
- // functions are identical, we create a new strong function with two weak
- // weak thunks to it which are identical but not mergable.
- for (std::vector<WeakVH>::iterator I = Worklist.begin(),
- E = Worklist.end(); I != E; ++I) {
- if (!*I) continue;
- Function *F = cast<Function>(*I);
- if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
- F->mayBeOverridden()) {
+ // Insert functions and merge them.
+ for (WeakVH &I : Worklist) {
+ if (!I)
+ continue;
+ Function *F = cast<Function>(I);
+ if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
Changed |= insert(F);
}
}
@@ -1631,7 +1637,7 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
// Replace G with an alias to F if possible, or else a thunk to F. Deletes G.
void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
- if (HasGlobalAliases && G->hasUnnamedAddr()) {
+ if (HasGlobalAliases && G->hasGlobalUnnamedAddr()) {
if (G->hasExternalLinkage() || G->hasLocalLinkage() ||
G->hasWeakLinkage()) {
writeAlias(F, G);
@@ -1645,7 +1651,7 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
// Helper for writeThunk,
// Selects proper bitcast operation,
// but a bit simpler than CastInst::getCastOpcode.
-static Value *createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) {
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
Type *SrcTy = V->getType();
if (SrcTy->isStructTy()) {
assert(DestTy->isStructTy());
@@ -1673,7 +1679,7 @@ static Value *createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) {
// Replace G with a simple tail call to bitcast(F). Also replace direct uses
// of G with bitcast(F). Deletes G.
void MergeFunctions::writeThunk(Function *F, Function *G) {
- if (!G->mayBeOverridden()) {
+ if (!G->isInterposable()) {
// Redirect direct callers of G to F.
replaceDirectCallers(G, F);
}
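
From here the body of G is discarded and regenerated as a forwarding thunk; the end state looks roughly like the sketch below (signatures and names are hypothetical; when the prototypes differ, createCast inserts the necessary casts around the arguments and return value):

    ; sketch only: @G forwards to the kept copy @F
    declare i32 @F(i32)

    define i32 @G(i32 %x) {
      %r = tail call i32 @F(i32 %x)
      ret i32 %r
    }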
@@ -1688,7 +1694,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
G->getParent());
BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG);
- IRBuilder<false> Builder(BB);
+ IRBuilder<> Builder(BB);
SmallVector<Value *, 16> Args;
unsigned i = 0;
@@ -1734,8 +1740,8 @@ void MergeFunctions::writeAlias(Function *F, Function *G) {
// Merge two equivalent functions. Upon completion, Function G is deleted.
void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
- if (F->mayBeOverridden()) {
- assert(G->mayBeOverridden());
+ if (F->isInterposable()) {
+ assert(G->isInterposable());
// Make them both thunks to the same internal function.
Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
@@ -1816,20 +1822,16 @@ bool MergeFunctions::insert(Function *NewFunction) {
// important when operating on more than one module independently to prevent
// cycles of thunks calling each other when the modules are linked together.
//
- // When one function is weak and the other is strong there is an order imposed
- // already. We process strong functions before weak functions.
- if ((OldF.getFunc()->mayBeOverridden() && NewFunction->mayBeOverridden()) ||
- (!OldF.getFunc()->mayBeOverridden() && !NewFunction->mayBeOverridden()))
- if (OldF.getFunc()->getName() > NewFunction->getName()) {
- // Swap the two functions.
- Function *F = OldF.getFunc();
- replaceFunctionInTree(*Result.first, NewFunction);
- NewFunction = F;
- assert(OldF.getFunc() != F && "Must have swapped the functions.");
- }
-
- // Never thunk a strong function to a weak function.
- assert(!OldF.getFunc()->mayBeOverridden() || NewFunction->mayBeOverridden());
+ // First of all, we process strong functions before weak functions.
+ if ((OldF.getFunc()->isInterposable() && !NewFunction->isInterposable()) ||
+ (OldF.getFunc()->isInterposable() == NewFunction->isInterposable() &&
+ OldF.getFunc()->getName() > NewFunction->getName())) {
+ // Swap the two functions.
+ Function *F = OldF.getFunc();
+ replaceFunctionInTree(*Result.first, NewFunction);
+ NewFunction = F;
+ assert(OldF.getFunc() != F && "Must have swapped the functions.");
+ }
DEBUG(dbgs() << " " << OldF.getFunc()->getName()
<< " == " << NewFunction->getName() << '\n');
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 0c5c84bbccab..49c44173491e 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -12,13 +12,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;
@@ -28,27 +29,34 @@ using namespace llvm;
STATISTIC(NumPartialInlined, "Number of functions partially inlined");
namespace {
- struct PartialInliner : public ModulePass {
- void getAnalysisUsage(AnalysisUsage &AU) const override { }
- static char ID; // Pass identification, replacement for typeid
- PartialInliner() : ModulePass(ID) {
- initializePartialInlinerPass(*PassRegistry::getPassRegistry());
- }
+struct PartialInlinerLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ PartialInlinerLegacyPass() : ModulePass(ID) {
+ initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnModule(Module& M) override;
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ ModuleAnalysisManager DummyMAM;
+ auto PA = Impl.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
- private:
- Function* unswitchFunction(Function* F);
+private:
+ PartialInlinerPass Impl;
};
}
-char PartialInliner::ID = 0;
-INITIALIZE_PASS(PartialInliner, "partial-inliner",
- "Partial Inliner", false, false)
+char PartialInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner",
+ false, false)
-ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); }
+ModulePass *llvm::createPartialInliningPass() {
+ return new PartialInlinerLegacyPass();
+}
-Function* PartialInliner::unswitchFunction(Function* F) {
+Function *PartialInlinerPass::unswitchFunction(Function *F) {
// First, verify that this function is an unswitching candidate...
BasicBlock *entryBlock = &F->front();
BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator());
@@ -71,10 +79,8 @@ Function* PartialInliner::unswitchFunction(Function* F) {
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
- Function* duplicateFunction = CloneFunction(F, VMap,
- /*ModuleLevelChanges=*/false);
+ Function* duplicateFunction = CloneFunction(F, VMap);
duplicateFunction->setLinkage(GlobalValue::InternalLinkage);
- F->getParent()->getFunctionList().push_back(duplicateFunction);
BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]);
BasicBlock* newReturnBlock = cast<BasicBlock>(VMap[returnBlock]);
BasicBlock* newNonReturnBlock = cast<BasicBlock>(VMap[nonReturnBlock]);
@@ -112,11 +118,10 @@ Function* PartialInliner::unswitchFunction(Function* F) {
// Gather up the blocks that we're going to extract.
std::vector<BasicBlock*> toExtract;
toExtract.push_back(newNonReturnBlock);
- for (Function::iterator FI = duplicateFunction->begin(),
- FE = duplicateFunction->end(); FI != FE; ++FI)
- if (&*FI != newEntryBlock && &*FI != newReturnBlock &&
- &*FI != newNonReturnBlock)
- toExtract.push_back(&*FI);
+ for (BasicBlock &BB : *duplicateFunction)
+ if (&BB != newEntryBlock && &BB != newReturnBlock &&
+ &BB != newNonReturnBlock)
+ toExtract.push_back(&BB);
// The CodeExtractor needs a dominator tree.
DominatorTree DT;
@@ -131,11 +136,10 @@ Function* PartialInliner::unswitchFunction(Function* F) {
// Inline the top-level if test into all callers.
std::vector<User *> Users(duplicateFunction->user_begin(),
duplicateFunction->user_end());
- for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end();
- UI != UE; ++UI)
- if (CallInst *CI = dyn_cast<CallInst>(*UI))
+ for (User *User : Users)
+ if (CallInst *CI = dyn_cast<CallInst>(User))
InlineFunction(CI, IFI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI))
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
InlineFunction(II, IFI);
// Ditch the duplicate, since we're done with it, and rewrite all remaining
@@ -148,13 +152,13 @@ Function* PartialInliner::unswitchFunction(Function* F) {
return extractedFunction;
}
-bool PartialInliner::runOnModule(Module& M) {
+PreservedAnalyses PartialInlinerPass::run(Module &M, ModuleAnalysisManager &) {
std::vector<Function*> worklist;
worklist.reserve(M.size());
- for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI)
- if (!FI->use_empty() && !FI->isDeclaration())
- worklist.push_back(&*FI);
-
+ for (Function &F : M)
+ if (!F.use_empty() && !F.isDeclaration())
+ worklist.push_back(&F);
+
bool changed = false;
while (!worklist.empty()) {
Function* currFunc = worklist.back();
@@ -178,6 +182,8 @@ bool PartialInliner::runOnModule(Module& M) {
}
}
-
- return changed;
+
+ if (changed)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
}
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index faada9c2a7db..cf5b76dc365b 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -16,23 +16,27 @@
#include "llvm-c/Transforms/PassManagerBuilder.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CFLAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/FunctionInfo.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
@@ -58,10 +62,6 @@ static cl::opt<bool> ExtraVectorizerPasses(
"extra-vectorizer-passes", cl::init(false), cl::Hidden,
cl::desc("Run cleanup optimization passes after vectorization."));
-static cl::opt<bool> UseNewSROA("use-new-sroa",
- cl::init(true), cl::Hidden,
- cl::desc("Enable the new, experimental SROA pass"));
-
static cl::opt<bool>
RunLoopRerolling("reroll-loops", cl::Hidden,
cl::desc("Run the loop rerolling pass"));
@@ -80,9 +80,19 @@ RunSLPAfterLoopVectorization("run-slp-after-loop-vectorization",
cl::desc("Run the SLP vectorizer (and BB vectorizer) after the Loop "
"vectorizer instead of before"));
-static cl::opt<bool> UseCFLAA("use-cfl-aa",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental CFL alias analysis"));
+// Experimental option to use CFL-AA
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
+static cl::opt<CFLAAType>
+ UseCFLAA("use-cfl-aa", cl::init(CFLAAType::None), cl::Hidden,
+ cl::desc("Enable the new, experimental CFL alias analysis"),
+ cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
+ clEnumValN(CFLAAType::Steensgaard, "steens",
+ "Enable unification-based CFL-AA"),
+ clEnumValN(CFLAAType::Andersen, "anders",
+ "Enable inclusion-based CFL-AA"),
+ clEnumValN(CFLAAType::Both, "both",
+ "Enable both variants of CFL-aa"),
+ clEnumValEnd));
static cl::opt<bool>
EnableMLSM("mlsm", cl::init(true), cl::Hidden,
@@ -92,25 +102,44 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
-static cl::opt<bool> EnableLoopDistribute(
- "enable-loop-distribute", cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental LoopDistribution Pass"));
-
static cl::opt<bool> EnableNonLTOGlobalsModRef(
"enable-non-lto-gmr", cl::init(true), cl::Hidden,
cl::desc(
"Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline."));
static cl::opt<bool> EnableLoopLoadElim(
- "enable-loop-load-elim", cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental LoopLoadElimination Pass"));
+ "enable-loop-load-elim", cl::init(true), cl::Hidden,
+ cl::desc("Enable the LoopLoadElimination Pass"));
+
+static cl::opt<std::string> RunPGOInstrGen(
+ "profile-generate", cl::init(""), cl::Hidden,
+ cl::desc("Enable generation phase of PGO instrumentation and specify the "
+ "path of profile data file"));
+
+static cl::opt<std::string> RunPGOInstrUse(
+ "profile-use", cl::init(""), cl::Hidden, cl::value_desc("filename"),
+ cl::desc("Enable use phase of PGO instrumentation and specify the path "
+ "of profile data file"));
+
+static cl::opt<bool> UseLoopVersioningLICM(
+ "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
+ cl::desc("Enable the experimental Loop Versioning LICM pass"));
+
+static cl::opt<bool>
+ DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
+ cl::desc("Disable pre-instrumentation inliner"));
+
+static cl::opt<int> PreInlineThreshold(
+ "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
+ cl::desc("Control the amount of inlining in pre-instrumentation inliner "
+ "(default = 75)"));
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
LibraryInfo = nullptr;
Inliner = nullptr;
- FunctionIndex = nullptr;
+ ModuleSummary = nullptr;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
BBVectorize = RunBBVectorization;
@@ -123,6 +152,10 @@ PassManagerBuilder::PassManagerBuilder() {
VerifyOutput = false;
MergeFunctions = false;
PrepareForLTO = false;
+ PGOInstrGen = RunPGOInstrGen;
+ PGOInstrUse = RunPGOInstrUse;
+ PrepareForThinLTO = false;
+ PerformThinLTO = false;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -137,11 +170,11 @@ static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
void PassManagerBuilder::addGlobalExtension(
PassManagerBuilder::ExtensionPointTy Ty,
PassManagerBuilder::ExtensionFn Fn) {
- GlobalExtensions->push_back(std::make_pair(Ty, Fn));
+ GlobalExtensions->push_back(std::make_pair(Ty, std::move(Fn)));
}
void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
- Extensions.push_back(std::make_pair(Ty, Fn));
+ Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
}
void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
@@ -156,15 +189,34 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
void PassManagerBuilder::addInitialAliasAnalysisPasses(
legacy::PassManagerBase &PM) const {
+ switch (UseCFLAA) {
+ case CFLAAType::Steensgaard:
+ PM.add(createCFLSteensAAWrapperPass());
+ break;
+ case CFLAAType::Andersen:
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ case CFLAAType::Both:
+ PM.add(createCFLSteensAAWrapperPass());
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
// Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
// BasicAliasAnalysis wins if they disagree. This is intended to help
// support "obvious" type-punning idioms.
- if (UseCFLAA)
- PM.add(createCFLAAWrapperPass());
PM.add(createTypeBasedAAWrapperPass());
PM.add(createScopedNoAliasAAWrapperPass());
}
+void PassManagerBuilder::addInstructionCombiningPass(
+ legacy::PassManagerBase &PM) const {
+ bool ExpensiveCombines = OptLevel > 2;
+ PM.add(createInstructionCombiningPass(ExpensiveCombines));
+}
+
void PassManagerBuilder::populateFunctionPassManager(
legacy::FunctionPassManager &FPM) {
addExtensionsToPM(EP_EarlyAsPossible, FPM);
@@ -178,94 +230,50 @@ void PassManagerBuilder::populateFunctionPassManager(
addInitialAliasAnalysisPasses(FPM);
FPM.add(createCFGSimplificationPass());
- if (UseNewSROA)
- FPM.add(createSROAPass());
- else
- FPM.add(createScalarReplAggregatesPass());
+ FPM.add(createSROAPass());
FPM.add(createEarlyCSEPass());
+ FPM.add(createGVNHoistPass());
FPM.add(createLowerExpectIntrinsicPass());
}
-void PassManagerBuilder::populateModulePassManager(
- legacy::PassManagerBase &MPM) {
- // Allow forcing function attributes as a debugging and tuning aid.
- MPM.add(createForceFunctionAttrsLegacyPass());
-
- // If all optimizations are disabled, just run the always-inline pass and,
- // if enabled, the function merging pass.
- if (OptLevel == 0) {
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
- }
-
- // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
- // creates a CGSCC pass manager, but we don't want to add extensions into
- // that pass manager. To prevent this we insert a no-op module pass to reset
- // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
- // builds. The function merging pass is
- if (MergeFunctions)
- MPM.add(createMergeFunctionsPass());
- else if (!GlobalExtensions->empty() || !Extensions.empty())
- MPM.add(createBarrierNoopPass());
-
- addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+// Do PGO instrumentation generation or use pass as the option specified.
+void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
+ if (PGOInstrGen.empty() && PGOInstrUse.empty())
return;
- }
-
- // Add LibraryInfo if we have some.
- if (LibraryInfo)
- MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- addInitialAliasAnalysisPasses(MPM);
-
- if (!DisableUnitAtATime) {
- // Infer attributes about declarations if possible.
- MPM.add(createInferFunctionAttrsLegacyPass());
-
- addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
-
- MPM.add(createIPSCCPPass()); // IP SCCP
- MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
- // Promote any localized global vars
- MPM.add(createPromoteMemoryToRegisterPass());
-
- MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
-
- MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+ // Perform the preinline and cleanup passes for O1 and above.
+ // And avoid doing them if optimizing for size.
+ if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) {
+ // Create preinline pass.
+ MPM.add(createFunctionInliningPass(PreInlineThreshold));
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
}
-
- if (EnableNonLTOGlobalsModRef)
- // We add a module alias analysis pass here. In part due to bugs in the
- // analysis infrastructure this "works" in that the analysis stays alive
- // for the entire SCC pass run below.
- MPM.add(createGlobalsAAWrapperPass());
-
- // Start of CallGraph SCC passes.
- if (!DisableUnitAtATime)
- MPM.add(createPruneEHPass()); // Remove dead EH info
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
+ if (!PGOInstrGen.empty()) {
+ MPM.add(createPGOInstrumentationGenLegacyPass());
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ Options.InstrProfileOutput = PGOInstrGen;
+ MPM.add(createInstrProfilingLegacyPass(Options));
}
- if (!DisableUnitAtATime)
- MPM.add(createPostOrderFunctionAttrsPass());
- if (OptLevel > 2)
- MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
-
+ if (!PGOInstrUse.empty())
+ MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse));
+}
+void PassManagerBuilder::addFunctionSimplificationPasses(
+ legacy::PassManagerBase &MPM) {
// Start of function pass.
// Break up aggregate allocas, using SSAUpdater.
- if (UseNewSROA)
- MPM.add(createSROAPass());
- else
- MPM.add(createScalarReplAggregatesPass(-1, false));
+ MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ // Speculative execution if the target has divergent branches; otherwise nop.
+ MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
MPM.add(createJumpThreadingPass()); // Thread jumps.
MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+ // Combine silly seq's
+ addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
@@ -276,7 +284,7 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLICMPass()); // Hoist loop invariants
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
@@ -303,7 +311,7 @@ void PassManagerBuilder::populateModulePassManager(
// Run instcombine after redundancy elimination to exploit opportunities
// opened up by them.
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createJumpThreadingPass()); // Thread jumps
MPM.add(createCorrelatedValuePropagationPass());
@@ -320,7 +328,7 @@ void PassManagerBuilder::populateModulePassManager(
if (BBVectorize) {
MPM.add(createBBVectorizePass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
if (OptLevel > 1 && UseGVNAfterVectorization)
MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
@@ -338,18 +346,99 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createInstructionCombiningPass()); // Clean up after everything.
+ // Clean up after everything.
+ addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
+}
+
+void PassManagerBuilder::populateModulePassManager(
+ legacy::PassManagerBase &MPM) {
+ // Allow forcing function attributes as a debugging and tuning aid.
+ MPM.add(createForceFunctionAttrsLegacyPass());
+
+ // If all optimizations are disabled, just run the always-inline pass and,
+ // if enabled, the function merging pass.
+ if (OptLevel == 0) {
+ addPGOInstrPasses(MPM);
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
+ // creates a CGSCC pass manager, but we don't want to add extensions into
+ // that pass manager. To prevent this we insert a no-op module pass to reset
+ // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
+ // builds. The function merging pass, when enabled, acts as that barrier.
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+ else if (!GlobalExtensions->empty() || !Extensions.empty())
+ MPM.add(createBarrierNoopPass());
+
+ addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+ return;
+ }
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ addInitialAliasAnalysisPasses(MPM);
+
+ if (!DisableUnitAtATime) {
+ // Infer attributes about declarations if possible.
+ MPM.add(createInferFunctionAttrsLegacyPass());
+
+ addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+ // Promote any localized global vars.
+ MPM.add(createPromoteMemoryToRegisterPass());
+
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+ }
+
+ if (!PerformThinLTO) {
+ /// PGO instrumentation is added during the compile phase for ThinLTO; do
+ /// not run it a second time.
+ addPGOInstrPasses(MPM);
+ }
+
+ // Indirect call promotion that promotes intra-module targets only.
+ MPM.add(createPGOIndirectCallPromotionLegacyPass());
+
+ if (EnableNonLTOGlobalsModRef)
+ // We add a module alias analysis pass here. In part due to bugs in the
+ // analysis infrastructure this "works" in that the analysis stays alive
+ // for the entire SCC pass run below.
+ MPM.add(createGlobalsAAWrapperPass());
+
+ // Start of CallGraph SCC passes.
+ if (!DisableUnitAtATime)
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+ if (!DisableUnitAtATime)
+ MPM.add(createPostOrderFunctionAttrsLegacyPass());
+ if (OptLevel > 2)
+ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+
+ addFunctionSimplificationPasses(MPM);
// FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
// pass manager that we are specifically trying to avoid. To prevent this
// we must insert a no-op module pass to reset the pass manager.
MPM.add(createBarrierNoopPass());
- if (!DisableUnitAtATime)
- MPM.add(createReversePostOrderFunctionAttrsPass());
-
- if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO) {
+ if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO &&
+ !PrepareForThinLTO)
// Remove avail extern fns and globals definitions if we aren't
// compiling an object file for later LTO. For LTO we want to preserve
// these so they are eligible for inlining at link-time. Note if they
@@ -360,6 +449,34 @@ void PassManagerBuilder::populateModulePassManager(
// globals referenced by available external functions dead
// and saves running remaining passes on the eliminated functions.
MPM.add(createEliminateAvailableExternallyPass());
+
+ if (!DisableUnitAtATime)
+ MPM.add(createReversePostOrderFunctionAttrsPass());
+
+ // If we are planning to perform ThinLTO later, let's not bloat the code with
+ // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
+ // during ThinLTO and perform the rest of the optimizations afterward.
+ if (PrepareForThinLTO) {
+ // Reduce the size of the IR as much as possible.
+ MPM.add(createGlobalOptimizerPass());
+ // Rename anonymous functions so that they can be exported in the summary.
+ MPM.add(createNameAnonFunctionPass());
+ return;
+ }
+
+ if (PerformThinLTO)
+ // Optimize globals now when performing ThinLTO; this enables more
+ // optimizations later.
+ MPM.add(createGlobalOptimizerPass());
+
+ // Schedule LoopVersioningLICM after inlining is over, because at that point
+ // we may see more accurate aliasing. Running it too early could also prevent
+ // further inlining due to the increase in code size. By placing it just
+ // after inlining, the optimizations that run later can benefit from the
+ // no-alias assumption in the cloned loop.
+ if (UseLoopVersioningLICM) {
+ MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
+ MPM.add(createLICMPass()); // Hoist loop invariants
}
if (EnableNonLTOGlobalsModRef)
@@ -391,9 +508,10 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
// Distribute loops to allow partial vectorization. I.e. isolate dependences
- // into separate loop that would otherwise inhibit vectorization.
- if (EnableLoopDistribute)
- MPM.add(createLoopDistributePass());
+ // into a separate loop that would otherwise inhibit vectorization. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+ MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false));
MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
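
The loop metadata mentioned above takes roughly the following shape; the function is a hypothetical sketch, and the llvm.loop.distribute.enable spelling used here is the conventional form of the flag referred to loosely as llvm.loop.distribute=true in the comment:

    ; sketch only: a loop explicitly opted in to distribution via its metadata
    define void @opt_in(i32* %a, i32* %b, i64 %n) {
    entry:
      br label %loop
    loop:
      %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
      %pa = getelementptr inbounds i32, i32* %a, i64 %i
      store i32 0, i32* %pa
      %pb = getelementptr inbounds i32, i32* %b, i64 %i
      store i32 1, i32* %pb
      %i.next = add nuw nsw i64 %i, 1
      %done = icmp eq i64 %i.next, %n
      br i1 %done, label %exit, label %loop, !llvm.loop !0
    exit:
      ret void
    }

    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.distribute.enable", i1 true}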
@@ -407,7 +525,7 @@ void PassManagerBuilder::populateModulePassManager(
// on -O1 and no #pragma is found). Would be good to have these two passes
// as function calls, so that we can only pass them when the vectorizer
// changed the code.
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
if (OptLevel > 1 && ExtraVectorizerPasses) {
// At higher optimization levels, try to clean up any runtime overlap and
// alignment checks inserted by the vectorizer. We want to track correlated
@@ -417,11 +535,11 @@ void PassManagerBuilder::populateModulePassManager(
// dead (or speculatable) control flows or more combining opportunities.
MPM.add(createEarlyCSEPass());
MPM.add(createCorrelatedValuePropagationPass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
MPM.add(createLICMPass());
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
}
if (RunSLPAfterLoopVectorization) {
@@ -434,7 +552,7 @@ void PassManagerBuilder::populateModulePassManager(
if (BBVectorize) {
MPM.add(createBBVectorizePass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
addExtensionsToPM(EP_Peephole, MPM);
if (OptLevel > 1 && UseGVNAfterVectorization)
MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
@@ -449,19 +567,22 @@ void PassManagerBuilder::populateModulePassManager(
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
MPM.add(createLoopUnrollPass()); // Unroll small loops
// LoopUnroll may generate some redundancy to clean up.
- MPM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(MPM);
// Runtime unrolling will introduce a runtime check in the loop prologue. If
// the unrolled loop is an inner loop, then the prologue will be inside the
// outer loop. LICM pass can help to promote the runtime check out if the
// checked value is loop invariant.
MPM.add(createLICMPass());
+
+ // Get rid of LCSSA nodes.
+ MPM.add(createInstructionSimplifierPass());
}
// After vectorization and unrolling, assume intrinsics may tell us more
@@ -487,11 +608,15 @@ void PassManagerBuilder::populateModulePassManager(
}
void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ PM.add(createGlobalDCEPass());
+
// Provide AliasAnalysis services for optimizations.
addInitialAliasAnalysisPasses(PM);
- if (FunctionIndex)
- PM.add(createFunctionImportPass(FunctionIndex));
+ if (ModuleSummary)
+ PM.add(createFunctionImportPass(ModuleSummary));
// Allow forcing function attributes as a debugging and tuning aid.
PM.add(createForceFunctionAttrsLegacyPass());
@@ -499,14 +624,32 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// Infer attributes about declarations if possible.
PM.add(createInferFunctionAttrsLegacyPass());
- // Propagate constants at call sites into the functions they call. This
- // opens opportunities for globalopt (and inlining) by substituting function
- // pointers passed as arguments to direct uses of functions.
- PM.add(createIPSCCPPass());
+ if (OptLevel > 1) {
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+ // This two-step promotion saves compile time; for LTO, it should produce
+ // the same result as doing all of the promotion here.
+ PM.add(createPGOIndirectCallPromotionLegacyPass(true));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ PM.add(createIPSCCPPass());
+ }
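For illustration only (not from the patch), the kind of opportunity IPSCCP opens up here looks like this in C++: once the function-pointer argument is proven constant at every call site, the indirect call becomes a direct call that globalopt and the inliner can then work on.

    static int step(int x) { return x + 1; }
    static int apply(int (*fp)(int), int v) { return fp(v); } // fp is always &step
    int useApply() { return apply(step, 41); }                // fp(v) can become step(v)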
- // Now that we internalized some globals, see if we can hack on them!
- PM.add(createPostOrderFunctionAttrsPass());
+ // Infer attributes about definitions. The readnone attribute in particular is
+ // required for virtual constant propagation.
+ PM.add(createPostOrderFunctionAttrsLegacyPass());
PM.add(createReversePostOrderFunctionAttrsPass());
+
+ // Apply whole-program devirtualization and virtual constant propagation.
+ PM.add(createWholeProgramDevirtPass());
+
+ // That's all we need at opt level 1.
+ if (OptLevel == 1)
+ return;
+
+ // Now that we internalized some globals, see if we can hack on them!
PM.add(createGlobalOptimizerPass());
// Promote any localized global vars.
PM.add(createPromoteMemoryToRegisterPass());
@@ -522,7 +665,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// simplification opportunities, and both can propagate functions through
// function pointers. When this happens, we often have to resolve varargs
// calls, etc, so let instcombine do this.
- PM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
// Inline small functions
@@ -544,18 +687,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createArgumentPromotionPass());
// The IPO passes may leave cruft around. Clean up after them.
- PM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass());
// Break up allocas
- if (UseNewSROA)
- PM.add(createSROAPass());
- else
- PM.add(createScalarReplAggregatesPass());
+ PM.add(createSROAPass());
// Run a few AA driven optimizations here and now, to cleanup the code.
- PM.add(createPostOrderFunctionAttrsPass()); // Add nocapture.
+ PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
PM.add(createLICMPass()); // Hoist loop invariants.
@@ -573,15 +713,20 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
if (EnableLoopInterchange)
PM.add(createLoopInterchangePass());
+ if (!DisableUnrollLoops)
+ PM.add(createSimpleLoopUnrollPass()); // Unroll small loops
PM.add(createLoopVectorizePass(true, LoopVectorize));
+ // The vectorizer may have significantly shortened a loop body; unroll again.
+ if (!DisableUnrollLoops)
+ PM.add(createLoopUnrollPass());
// Now that we've optimized loops (in particular loop induction variables),
// we may have exposed more scalar opportunities. Run parts of the scalar
// optimizer again at this point.
- PM.add(createInstructionCombiningPass()); // Initial cleanup
+ addInstructionCombiningPass(PM); // Initial cleanup
PM.add(createCFGSimplificationPass()); // if-convert
PM.add(createSCCPPass()); // Propagate exposed constants
- PM.add(createInstructionCombiningPass()); // Clean up again
+ addInstructionCombiningPass(PM); // Clean up again
PM.add(createBitTrackingDCEPass());
// More scalar chains could be vectorized due to more alias information
@@ -597,7 +742,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createLoadCombinePass());
// Cleanup and simplify the code after the scalar optimizations.
- PM.add(createInstructionCombiningPass());
+ addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass());
@@ -620,6 +765,23 @@ void PassManagerBuilder::addLateLTOOptimizationPasses(
PM.add(createMergeFunctionsPass());
}
+void PassManagerBuilder::populateThinLTOPassManager(
+ legacy::PassManagerBase &PM) {
+ PerformThinLTO = true;
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ if (ModuleSummary)
+ PM.add(createFunctionImportPass(ModuleSummary));
+
+ populateModulePassManager(PM);
+
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+ PerformThinLTO = false;
+}
+
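A minimal usage sketch of the new entry point (illustrative only; the header names, the OptLevel choice, and the surrounding setup are assumptions, while the ModuleSummary member is taken from the code above):

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"
    using namespace llvm;

    void buildThinLTOPipeline(Module &M, ModuleSummaryIndex &Index) {
      legacy::PassManager PM;
      PassManagerBuilder PMB;
      PMB.OptLevel = 2;
      PMB.ModuleSummary = &Index; // enables the function import step above
      PMB.populateThinLTOPassManager(PM);
      PM.run(M);
    }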
void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
if (LibraryInfo)
PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
@@ -627,17 +789,17 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
if (VerifyInput)
PM.add(createVerifierPass());
- if (OptLevel > 1)
+ if (OptLevel != 0)
addLTOOptimizationPasses(PM);
// Create a function that performs CFI checks for cross-DSO calls with targets
// in the current module.
PM.add(createCrossDSOCFIPass());
- // Lower bit sets to globals. This pass supports Clang's control flow
- // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI
- // is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerBitSetsPass());
+ // Lower type metadata and the type.test intrinsic. This pass supports Clang's
+ // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
+ // link time if CFI is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerTypeTestsPass());
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 22a95fa03f7c..2aa3fa55cefd 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -16,7 +16,6 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/CallGraph.h"
@@ -48,10 +47,10 @@ namespace {
// runOnSCC - Analyze the SCC, performing the transformation if possible.
bool runOnSCC(CallGraphSCC &SCC) override;
- bool SimplifyFunction(Function *F);
- void DeleteBasicBlock(BasicBlock *BB);
};
}
+static bool SimplifyFunction(Function *F, CallGraph &CG);
+static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG);
char PruneEH::ID = 0;
INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
@@ -62,22 +61,20 @@ INITIALIZE_PASS_END(PruneEH, "prune-eh",
Pass *llvm::createPruneEHPass() { return new PruneEH(); }
-
-bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
SmallPtrSet<CallGraphNode *, 8> SCCNodes;
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
bool MadeChange = false;
// Fill SCCNodes with the elements of the SCC. Used for quickly
// looking up whether a given CallGraphNode is in this SCC.
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
- SCCNodes.insert(*I);
+ for (CallGraphNode *I : SCC)
+ SCCNodes.insert(I);
// First pass, scan all of the functions in the SCC, simplifying them
// according to what we know.
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
- if (Function *F = (*I)->getFunction())
- MadeChange |= SimplifyFunction(F);
+ for (CallGraphNode *I : SCC)
+ if (Function *F = I->getFunction())
+ MadeChange |= SimplifyFunction(F, CG);
// Next, check to see if any callees might throw or if there are any external
// functions in this SCC: if so, we cannot prune any functions in this SCC.
@@ -93,7 +90,10 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
if (!F) {
SCCMightUnwind = true;
SCCMightReturn = true;
- } else if (F->isDeclaration() || F->mayBeOverridden()) {
+ } else if (F->isDeclaration() || F->isInterposable()) {
+ // Note: isInterposable (as opposed to hasExactDefinition) is fine above,
+ // since we're not inferring new attributes here, only using existing
+ // function attributes that are assumed to be correct.
SCCMightUnwind |= !F->doesNotThrow();
SCCMightReturn |= !F->doesNotReturn();
} else {
@@ -153,8 +153,8 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
// If the SCC doesn't unwind or doesn't throw, note this fact.
if (!SCCMightUnwind || !SCCMightReturn)
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
- Function *F = (*I)->getFunction();
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
F->addFnAttr(Attribute::NoUnwind);
@@ -167,22 +167,30 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
}
}
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+ for (CallGraphNode *I : SCC) {
// Convert any invoke instructions to non-throwing functions in this node
// into call instructions with a branch. This makes the exception blocks
// dead.
- if (Function *F = (*I)->getFunction())
- MadeChange |= SimplifyFunction(F);
+ if (Function *F = I->getFunction())
+ MadeChange |= SimplifyFunction(F, CG);
}
return MadeChange;
}
+bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ return runImpl(SCC, CG);
+}
+
+
// SimplifyFunction - Given information about callees, simplify the specified
// function if we have invokes to non-unwinding functions or code after calls to
// no-return functions.
-bool PruneEH::SimplifyFunction(Function *F) {
+static bool SimplifyFunction(Function *F, CallGraph &CG) {
bool MadeChange = false;
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
@@ -192,7 +200,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
// If the unwind block is now dead, nuke it.
if (pred_empty(UnwindBlock))
- DeleteBasicBlock(UnwindBlock); // Delete the new BB.
+ DeleteBasicBlock(UnwindBlock, CG); // Delete the new BB.
++NumRemoved;
MadeChange = true;
@@ -211,7 +219,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
BB->getInstList().pop_back();
new UnreachableInst(BB->getContext(), &*BB);
- DeleteBasicBlock(New); // Delete the new BB.
+ DeleteBasicBlock(New, CG); // Delete the new BB.
MadeChange = true;
++NumUnreach;
break;
@@ -224,9 +232,8 @@ bool PruneEH::SimplifyFunction(Function *F) {
/// DeleteBasicBlock - remove the specified basic block from the program,
/// updating the callgraph to reflect any now-obsolete edges due to calls that
/// exist in the BB.
-void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
+static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG) {
assert(pred_empty(BB) && "BB is not dead!");
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
Instruction *TokenInst = nullptr;
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 928d92ef9d12..39de108edc06 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -22,10 +22,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/SampleProfile.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Constants.h"
@@ -35,6 +37,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
@@ -76,16 +79,6 @@ static cl::opt<double> SampleProfileHotThreshold(
"sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
cl::desc("Inlined functions that account for more than N% of all samples "
"collected in the parent function, will be inlined again."));
-static cl::opt<double> SampleProfileGlobalHotThreshold(
- "sample-profile-global-hot-threshold", cl::init(30), cl::value_desc("N"),
- cl::desc("Top-level functions that account for more than N% of all samples "
- "collected in the profile, will be marked as hot for the inliner "
- "to consider."));
-static cl::opt<double> SampleProfileGlobalColdThreshold(
- "sample-profile-global-cold-threshold", cl::init(0.5), cl::value_desc("N"),
- cl::desc("Top-level functions that account for less than N% of all samples "
- "collected in the profile, will be marked as cold for the inliner "
- "to consider."));
namespace {
typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap;
@@ -100,30 +93,19 @@ typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>
/// This pass reads profile data from the file specified by
/// -sample-profile-file and annotates every affected function with the
/// profile information found in that file.
-class SampleProfileLoader : public ModulePass {
+class SampleProfileLoader {
public:
- // Class identification, replacement for typeinfo
- static char ID;
-
SampleProfileLoader(StringRef Name = SampleProfileFile)
- : ModulePass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Reader(),
+ : DT(nullptr), PDT(nullptr), LI(nullptr), ACT(nullptr), Reader(),
Samples(nullptr), Filename(Name), ProfileIsValid(false),
- TotalCollectedSamples(0) {
- initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
- }
+ TotalCollectedSamples(0) {}
- bool doInitialization(Module &M) override;
+ bool doInitialization(Module &M);
+ bool runOnModule(Module &M);
+ void setACT(AssumptionCacheTracker *A) { ACT = A; }
void dump() { Reader->dump(); }
- const char *getPassName() const override { return "Sample profile pass"; }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
-
protected:
bool runOnFunction(Function &F);
unsigned getFunctionLoc(Function &F);
@@ -133,14 +115,12 @@ protected:
const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
bool inlineHotFunctions(Function &F);
- bool emitInlineHints(Function &F);
void printEdgeWeight(raw_ostream &OS, Edge E);
void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
bool computeBlockWeights(Function &F);
void findEquivalenceClasses(Function &F);
- void findEquivalencesFor(BasicBlock *BB1,
- SmallVector<BasicBlock *, 8> Descendants,
+ void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
DominatorTreeBase<BasicBlock> *DomTree);
void propagateWeights(Function &F);
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
@@ -163,10 +143,10 @@ protected:
EdgeWeightMap EdgeWeights;
/// \brief Set of visited blocks during propagation.
- SmallPtrSet<const BasicBlock *, 128> VisitedBlocks;
+ SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
/// \brief Set of visited edges during propagation.
- SmallSet<Edge, 128> VisitedEdges;
+ SmallSet<Edge, 32> VisitedEdges;
/// \brief Equivalence classes for block weights.
///
@@ -181,6 +161,8 @@ protected:
std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT;
std::unique_ptr<LoopInfo> LI;
+ AssumptionCacheTracker *ACT;
+
/// \brief Predecessors for each basic block in the CFG.
BlockEdgeMap Predecessors;
@@ -206,6 +188,32 @@ protected:
uint64_t TotalCollectedSamples;
};
+class SampleProfileLoaderLegacyPass : public ModulePass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
+ SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile)
+ : ModulePass(ID), SampleLoader(Name) {
+ initializeSampleProfileLoaderLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void dump() { SampleLoader.dump(); }
+
+ bool doInitialization(Module &M) override {
+ return SampleLoader.doInitialization(M);
+ }
+ const char *getPassName() const override { return "Sample profile pass"; }
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+private:
+ SampleProfileLoader SampleLoader;
+};
+
class SampleCoverageTracker {
public:
SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
@@ -285,7 +293,6 @@ bool callsiteIsHot(const FunctionSamples *CallerFS,
(double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
return PercentSamples >= SampleProfileHotThreshold;
}
-
}
/// Mark as used the sample record for the given function samples at
@@ -445,7 +452,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
/// \returns the weight of \p Inst.
ErrorOr<uint64_t>
SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
- DebugLoc DLoc = Inst.getDebugLoc();
+ const DebugLoc &DLoc = Inst.getDebugLoc();
if (!DLoc)
return std::error_code();
@@ -453,6 +460,11 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
if (!FS)
return std::error_code();
+ // Ignore all dbg_value intrinsics.
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (II && II->getIntrinsicID() == Intrinsic::dbg_value)
+ return std::error_code();
+
const DILocation *DIL = DLoc;
unsigned Lineno = DLoc.getLine();
unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine();
@@ -476,6 +488,13 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
<< Inst << " (line offset: " << Lineno - HeaderLineno << "."
<< DIL->getDiscriminator() << " - weight: " << R.get()
<< ")\n");
+ } else {
+ // If a call instruction was inlined in the profile but is not inlined
+ // here, the inlined callsite has no samples, so the call instruction
+ // should have a count of 0.
+ const CallInst *CI = dyn_cast<CallInst>(&Inst);
+ if (CI && findCalleeFunctionSamples(*CI))
+ R = 0;
}
return R;
}
@@ -490,19 +509,22 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) const {
/// \returns the weight for \p BB.
ErrorOr<uint64_t>
SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const {
- bool Found = false;
- uint64_t Weight = 0;
+ DenseMap<uint64_t, uint64_t> CM;
for (auto &I : BB->getInstList()) {
const ErrorOr<uint64_t> &R = getInstWeight(I);
- if (R && R.get() >= Weight) {
- Weight = R.get();
- Found = true;
+ if (R) CM[R.get()]++;
+ }
+ if (CM.size() == 0) return std::error_code();
+ uint64_t W = 0, C = 0;
+ for (const auto &C_W : CM) {
+ if (C_W.second == W) {
+ C = std::max(C, C_W.first);
+ } else if (C_W.second > W) {
+ C = C_W.first;
+ W = C_W.second;
}
}
- if (Found)
- return Weight;
- else
- return std::error_code();
+ return C;
}
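A standalone sketch of the selection rule above (illustrative, not the pass code): the block weight becomes the most frequent instruction weight, with ties broken toward the larger weight; for example, instruction weights {10, 10, 0, 7} yield a block weight of 10.

    #include <cstdint>
    #include <map>
    #include <vector>

    static uint64_t pickBlockWeight(const std::vector<uint64_t> &InstWeights) {
      std::map<uint64_t, uint64_t> Count; // weight -> number of instructions
      for (uint64_t W : InstWeights)
        ++Count[W];
      uint64_t BestWeight = 0, BestCount = 0; // the pass instead returns an
                                              // error when no weight is known
      for (const auto &P : Count)
        if (P.second > BestCount ||
            (P.second == BestCount && P.first > BestWeight)) {
          BestWeight = P.first;
          BestCount = P.second;
        }
      return BestWeight;
    }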
/// \brief Compute and store the weights of every basic block.
@@ -549,19 +571,12 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const {
if (!SP)
return nullptr;
- Function *CalleeFunc = Inst.getCalledFunction();
- if (!CalleeFunc) {
- return nullptr;
- }
-
- StringRef CalleeName = CalleeFunc->getName();
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return nullptr;
- return FS->findFunctionSamplesAt(
- CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()),
- DIL->getDiscriminator(), CalleeName));
+ return FS->findFunctionSamplesAt(LineLocation(
+ getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator()));
}
/// \brief Get the FunctionSamples for an instruction.
@@ -575,22 +590,17 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const {
/// \returns the FunctionSamples pointer to the inlined instance.
const FunctionSamples *
SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
- SmallVector<CallsiteLocation, 10> S;
+ SmallVector<LineLocation, 10> S;
const DILocation *DIL = Inst.getDebugLoc();
if (!DIL) {
return Samples;
}
- StringRef CalleeName;
- for (const DILocation *DIL = Inst.getDebugLoc(); DIL;
- DIL = DIL->getInlinedAt()) {
+ for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
DISubprogram *SP = DIL->getScope()->getSubprogram();
if (!SP)
return nullptr;
- if (!CalleeName.empty()) {
- S.push_back(CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()),
- DIL->getDiscriminator(), CalleeName));
- }
- CalleeName = SP->getLinkageName();
+ S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()),
+ DIL->getDiscriminator()));
}
if (S.size() == 0)
return Samples;
@@ -601,63 +611,6 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
return FS;
}
-/// \brief Emit an inline hint if \p F is globally hot or cold.
-///
-/// If \p F consumes a significant fraction of samples (indicated by
-/// SampleProfileGlobalHotThreshold), apply the InlineHint attribute for the
-/// inliner to consider the function hot.
-///
-/// If \p F consumes a small fraction of samples (indicated by
-/// SampleProfileGlobalColdThreshold), apply the Cold attribute for the inliner
-/// to consider the function cold.
-///
-/// FIXME - This setting of inline hints is sub-optimal. Instead of marking a
-/// function globally hot or cold, we should be annotating individual callsites.
-/// This is not currently possible, but work on the inliner will eventually
-/// provide this ability. See http://reviews.llvm.org/D15003 for details and
-/// discussion.
-///
-/// \returns True if either attribute was applied to \p F.
-bool SampleProfileLoader::emitInlineHints(Function &F) {
- if (TotalCollectedSamples == 0)
- return false;
-
- uint64_t FunctionSamples = Samples->getTotalSamples();
- double SamplesPercent =
- (double)FunctionSamples / (double)TotalCollectedSamples * 100.0;
-
- // If the function collected more samples than the hot threshold, mark
- // it globally hot.
- if (SamplesPercent >= SampleProfileGlobalHotThreshold) {
- F.addFnAttr(llvm::Attribute::InlineHint);
- std::string Msg;
- raw_string_ostream S(Msg);
- S << "Applied inline hint to globally hot function '" << F.getName()
- << "' with " << format("%.2f", SamplesPercent)
- << "% of samples (threshold: "
- << format("%.2f", SampleProfileGlobalHotThreshold.getValue()) << "%)";
- S.flush();
- emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg);
- return true;
- }
-
- // If the function collected fewer samples than the cold threshold, mark
- // it globally cold.
- if (SamplesPercent <= SampleProfileGlobalColdThreshold) {
- F.addFnAttr(llvm::Attribute::Cold);
- std::string Msg;
- raw_string_ostream S(Msg);
- S << "Applied cold hint to globally cold function '" << F.getName()
- << "' with " << format("%.2f", SamplesPercent)
- << "% of samples (threshold: "
- << format("%.2f", SampleProfileGlobalColdThreshold.getValue()) << "%)";
- S.flush();
- emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg);
- return true;
- }
-
- return false;
-}
/// \brief Iteratively inline hot callsites of a function.
///
@@ -685,7 +638,7 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
}
}
for (auto CI : CIS) {
- InlineFunctionInfo IFI;
+ InlineFunctionInfo IFI(nullptr, ACT);
Function *CalledFunction = CI->getCalledFunction();
DebugLoc DLoc = CI->getDebugLoc();
uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples();
@@ -731,7 +684,7 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
/// with blocks from \p BB1's dominator tree, then
/// this is the post-dominator tree, and vice versa.
void SampleProfileLoader::findEquivalencesFor(
- BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,
+ BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
DominatorTreeBase<BasicBlock> *DomTree) {
const BasicBlock *EC = EquivalenceClass[BB1];
uint64_t Weight = BlockWeights[EC];
@@ -859,23 +812,31 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {
// edge is unknown (see setEdgeOrBlockWeight).
for (unsigned i = 0; i < 2; i++) {
uint64_t TotalWeight = 0;
- unsigned NumUnknownEdges = 0;
- Edge UnknownEdge, SelfReferentialEdge;
+ unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
+ Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
if (i == 0) {
// First, visit all predecessor edges.
+ NumTotalEdges = Predecessors[BB].size();
for (auto *Pred : Predecessors[BB]) {
Edge E = std::make_pair(Pred, BB);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
if (E.first == E.second)
SelfReferentialEdge = E;
}
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(Predecessors[BB][0], BB);
+ }
} else {
// On the second round, visit all successor edges.
+ NumTotalEdges = Successors[BB].size();
for (auto *Succ : Successors[BB]) {
Edge E = std::make_pair(BB, Succ);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
}
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(BB, Successors[BB][0]);
+ }
}
// After visiting all the edges, there are three cases that we
@@ -904,18 +865,24 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {
if (NumUnknownEdges <= 1) {
uint64_t &BBWeight = BlockWeights[EC];
if (NumUnknownEdges == 0) {
- // If we already know the weight of all edges, the weight of the
- // basic block can be computed. It should be no larger than the sum
- // of all edge weights.
- if (TotalWeight > BBWeight) {
- BBWeight = TotalWeight;
+ if (!VisitedBlocks.count(EC)) {
+ // If we already know the weight of all edges, the weight of the
+ // basic block can be computed. It should be no larger than the sum
+ // of all edge weights.
+ if (TotalWeight > BBWeight) {
+ BBWeight = TotalWeight;
+ Changed = true;
+ DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
+ }
+ } else if (NumTotalEdges == 1 &&
+ EdgeWeights[SingleEdge] < BlockWeights[EC]) {
+ // If there is only one edge for the visited basic block, use the
+ // block weight to adjust the edge weight if the edge weight is smaller.
+ EdgeWeights[SingleEdge] = BlockWeights[EC];
Changed = true;
- DEBUG(dbgs() << "All edge weights for " << BB->getName()
- << " known. Set weight for block: ";
- printBlockWeight(dbgs(), BB););
}
- if (VisitedBlocks.insert(EC).second)
- Changed = true;
} else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
// If there is a single unknown edge and the block has been
// visited, then we can compute E's weight.
@@ -1020,6 +987,19 @@ void SampleProfileLoader::propagateWeights(Function &F) {
MDBuilder MDB(Ctx);
for (auto &BI : F) {
BasicBlock *BB = &BI;
+
+ if (BlockWeights[BB]) {
+ for (auto &I : BB->getInstList()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (!isa<IntrinsicInst>(&I)) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(BlockWeights[BB]);
+ CI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(Weights));
+ }
+ }
+ }
+ }
TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 1)
continue;
@@ -1084,7 +1064,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
/// \returns the line number where \p F is defined. If it returns 0,
/// it means that there is no debug information available for \p F.
unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
- if (DISubprogram *S = getDISubprogram(&F))
+ if (DISubprogram *S = F.getSubprogram())
return S->getLine();
// If the start of \p F is missing, emit a diagnostic to inform the user
@@ -1165,8 +1145,6 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
<< ": " << getFunctionLoc(F) << "\n");
- Changed |= emitInlineHints(F);
-
Changed |= inlineHotFunctions(F);
// Compute basic block weights.
@@ -1190,7 +1168,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileRecordCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
- getDISubprogram(&F)->getFilename(), getFunctionLoc(F),
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
Twine(Used) + " of " + Twine(Total) + " available profile records (" +
Twine(Coverage) + "%) were applied",
DS_Warning));
@@ -1203,7 +1181,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileSampleCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
- getDISubprogram(&F)->getFilename(), getFunctionLoc(F),
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
Twine(Coverage) + "%) were applied",
DS_Warning));
@@ -1212,12 +1190,12 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
return Changed;
}
-char SampleProfileLoader::ID = 0;
-INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile",
- "Sample Profile loader", false, false)
-INITIALIZE_PASS_DEPENDENCY(AddDiscriminators)
-INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",
- "Sample Profile loader", false, false)
+char SampleProfileLoaderLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
bool SampleProfileLoader::doInitialization(Module &M) {
auto &Ctx = M.getContext();
@@ -1233,11 +1211,11 @@ bool SampleProfileLoader::doInitialization(Module &M) {
}
ModulePass *llvm::createSampleProfileLoaderPass() {
- return new SampleProfileLoader(SampleProfileFile);
+ return new SampleProfileLoaderLegacyPass(SampleProfileFile);
}
ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
- return new SampleProfileLoader(Name);
+ return new SampleProfileLoaderLegacyPass(Name);
}
bool SampleProfileLoader::runOnModule(Module &M) {
@@ -1254,12 +1232,33 @@ bool SampleProfileLoader::runOnModule(Module &M) {
clearFunctionData();
retval |= runOnFunction(F);
}
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
return retval;
}
+bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
+ // FIXME: pass in AssumptionCache correctly for the new pass manager.
+ SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>());
+ return SampleLoader.runOnModule(M);
+}
+
bool SampleProfileLoader::runOnFunction(Function &F) {
+ F.setEntryCount(0);
Samples = Reader->getSamplesFor(F);
if (!Samples->empty())
return emitAnnotations(F);
return false;
}
+
+PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+
+ SampleProfileLoader SampleLoader(SampleProfileFile);
+
+ SampleLoader.doInitialization(M);
+
+ if (!SampleLoader.runOnModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp
index c94cc7c74a89..3c3c5dd19d1f 100644
--- a/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -53,7 +53,8 @@ static bool stripDeadPrototypes(Module &M) {
return MadeChange;
}
-PreservedAnalyses StripDeadPrototypesPass::run(Module &M) {
+PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
+ ModuleAnalysisManager &) {
if (stripDeadPrototypes(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
@@ -69,6 +70,9 @@ public:
*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
return stripDeadPrototypes(M);
}
};
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 46f352f7f9f1..fd250366cef2 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -21,7 +21,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -216,11 +215,11 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
I->setName(""); // Internal symbols can't participate in linkage
}
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
- if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
- I->setName(""); // Internal symbols can't participate in linkage
- StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
+ for (Function &I : M) {
+ if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
+ if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
+ I.setName(""); // Internal symbols can't participate in linkage
+ StripSymtab(I.getValueSymbolTable(), PreserveDbgInfo);
}
// Remove all names from types.
@@ -230,6 +229,9 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
}
bool StripSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
bool Changed = false;
Changed |= StripDebugInfo(M);
if (!OnlyDebugInfo)
@@ -238,10 +240,15 @@ bool StripSymbols::runOnModule(Module &M) {
}
bool StripNonDebugSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
return StripSymbolNames(M, true);
}
bool StripDebugDeclare::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
Function *Declare = M.getFunction("llvm.dbg.declare");
std::vector<Constant*> DeadConstants;
@@ -287,6 +294,9 @@ bool StripDebugDeclare::runOnModule(Module &M) {
/// optimized away by the optimizer. This special pass removes debug info for
/// such symbols.
bool StripDeadDebugInfo::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
bool Changed = false;
LLVMContext &C = M.getContext();
@@ -312,20 +322,6 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
}
for (DICompileUnit *DIC : F.compile_units()) {
- // Create our live subprogram list.
- bool SubprogramChange = false;
- for (DISubprogram *DISP : DIC->getSubprograms()) {
- // Make sure we visit each subprogram only once.
- if (!VisitedSet.insert(DISP).second)
- continue;
-
- // If the function referenced by DISP is not null, the function is live.
- if (LiveSPs.count(DISP))
- LiveSubprograms.push_back(DISP);
- else
- SubprogramChange = true;
- }
-
// Create our live global variable list.
bool GlobalVariableChange = false;
for (DIGlobalVariable *DIG : DIC->getGlobalVariables()) {
@@ -341,14 +337,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
GlobalVariableChange = true;
}
- // If we found dead subprograms or global variables, replace the current
- // subprogram list/global variable list with our new live subprogram/global
- // variable list.
- if (SubprogramChange) {
- DIC->replaceSubprograms(MDTuple::get(C, LiveSubprograms));
- Changed = true;
- }
-
+ // If we found dead global variables, replace the current global
+ // variable list with our new live global variable list.
if (GlobalVariableChange) {
DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
Changed = true;
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
new file mode 100644
index 000000000000..53eb4e2c9076
--- /dev/null
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -0,0 +1,843 @@
+//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements whole program optimization of virtual calls in cases
+// where we know (via !type metadata) that the list of callees is fixed. This
+// includes the following:
+// - Single implementation devirtualization: if a virtual call has a single
+// possible callee, replace all calls with a direct call to that callee.
+// - Virtual constant propagation: if the virtual function's return type is an
+// integer <=64 bits and all possible callees are readnone, for each class and
+// each list of constant arguments: evaluate the function, store the return
+// value alongside the virtual table, and rewrite each virtual call as a load
+// from the virtual table.
+// - Uniform return value optimization: if the conditions for virtual constant
+// propagation hold and each function returns the same constant value, replace
+// each virtual call with that constant.
+// - Unique return value optimization for i1 return values: if the conditions
+// for virtual constant propagation hold and a single vtable's function
+// returns 0, or a single vtable's function returns 1, replace each virtual
+// call with a comparison of the vptr against that vtable's address.
+//
+//===----------------------------------------------------------------------===//
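As a source-level illustration (not part of this file), and assuming the front end emits the !type metadata this pass consumes, the cases above map onto ordinary C++ hierarchies like this:

    struct Shape {
      virtual int sides() const = 0; // every override returns a readnone
                                     // constant: virtual const prop applies
      virtual void draw() const = 0; // if only one override exists in the
                                     // whole program: single-impl devirt
    };
    struct Square : Shape {
      int sides() const override { return 4; }
      void draw() const override {}
    };
    int countSides(const Shape *s) { return s->sides(); } // may become a
                                         // constant or a load from the vtable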
+
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <set>
+
+using namespace llvm;
+using namespace wholeprogramdevirt;
+
+#define DEBUG_TYPE "wholeprogramdevirt"
+
+// Find the minimum offset that we may store a value of size Size bits at. If
+// IsAfter is set, look for an offset after the object, otherwise look for an
+// offset before the object.
+uint64_t
+wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
+ bool IsAfter, uint64_t Size) {
+ // Find a minimum offset taking into account only vtable sizes.
+ uint64_t MinByte = 0;
+ for (const VirtualCallTarget &Target : Targets) {
+ if (IsAfter)
+ MinByte = std::max(MinByte, Target.minAfterBytes());
+ else
+ MinByte = std::max(MinByte, Target.minBeforeBytes());
+ }
+
+ // Build a vector of arrays of bytes covering, for each target, a slice of the
+ // used region (see AccumBitVector::BytesUsed in
+ // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
+ // this aligns the used regions to start at MinByte.
+ //
+ // In this example, A, B and C are vtables, # is a byte already allocated for
+ // a virtual function pointer, AAAA... (etc.) are the used regions for the
+ // vtables and Offset(X) is the value computed for the Offset variable below
+ // for X.
+ //
+ // Offset(A)
+ // | |
+ // |MinByte
+ // A: ################AAAAAAAA|AAAAAAAA
+ // B: ########BBBBBBBBBBBBBBBB|BBBB
+ // C: ########################|CCCCCCCCCCCCCCCC
+ // | Offset(B) |
+ //
+ // This code produces the slices of A, B and C that appear after the divider
+ // at MinByte.
+ std::vector<ArrayRef<uint8_t>> Used;
+ for (const VirtualCallTarget &Target : Targets) {
+ ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
+ : Target.TM->Bits->Before.BytesUsed;
+ uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
+ : MinByte - Target.minBeforeBytes();
+
+ // Disregard used regions that are smaller than Offset. These are
+ // effectively all-free regions that do not need to be checked.
+ if (VTUsed.size() > Offset)
+ Used.push_back(VTUsed.slice(Offset));
+ }
+
+ if (Size == 1) {
+ // Find a free bit in each member of Used.
+ for (unsigned I = 0;; ++I) {
+ uint8_t BitsUsed = 0;
+ for (auto &&B : Used)
+ if (I < B.size())
+ BitsUsed |= B[I];
+ if (BitsUsed != 0xff)
+ return (MinByte + I) * 8 +
+ countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
+ }
+ } else {
+ // Find a free (Size/8) byte region in each member of Used.
+ // FIXME: see if alignment helps.
+ for (unsigned I = 0;; ++I) {
+ for (auto &&B : Used) {
+ unsigned Byte = 0;
+ while ((I + Byte) < B.size() && Byte < (Size / 8)) {
+ if (B[I + Byte])
+ goto NextI;
+ ++Byte;
+ }
+ }
+ return (MinByte + I) * 8;
+ NextI:;
+ }
+ }
+}
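A small worked example of the Size == 1 case (illustrative numbers): with MinByte == 16 and used slices {0xff, 0x0f} and {0xff, 0xff, 0x00}, bytes 0 and 1 are jointly fully occupied (their ORs are 0xff), so the loop stops at I == 2 where BitsUsed == 0x00 and returns (16 + 2) * 8 + 0 == 144, the first free bit position in that region.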
+
+void wholeprogramdevirt::setBeforeReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = -(AllocBefore / 8 + 1);
+ else
+ OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
+ OffsetBit = AllocBefore % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setBeforeBit(AllocBefore);
+ else
+ Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
+ }
+}
+
+void wholeprogramdevirt::setAfterReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = AllocAfter / 8;
+ else
+ OffsetByte = (AllocAfter + 7) / 8;
+ OffsetBit = AllocAfter % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setAfterBit(AllocAfter);
+ else
+ Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
+ }
+}
+
+VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
+ : Fn(Fn), TM(TM),
+ IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()) {}
+
+namespace {
+
+// A slot in a set of virtual tables. The TypeID identifies the set of virtual
+// tables, and the ByteOffset is the offset in bytes from the address point to
+// the virtual function pointer.
+struct VTableSlot {
+ Metadata *TypeID;
+ uint64_t ByteOffset;
+};
+
+}
+
+namespace llvm {
+
+template <> struct DenseMapInfo<VTableSlot> {
+ static VTableSlot getEmptyKey() {
+ return {DenseMapInfo<Metadata *>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlot getTombstoneKey() {
+ return {DenseMapInfo<Metadata *>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlot &I) {
+ return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlot &LHS,
+ const VTableSlot &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+}
+
+namespace {
+
+// A virtual call site. VTable is the loaded virtual table pointer, and CS is
+// the indirect virtual call.
+struct VirtualCallSite {
+ Value *VTable;
+ CallSite CS;
+
+ // If non-null, this field points to the associated unsafe use count stored in
+ // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
+ // of that field for details.
+ unsigned *NumUnsafeUses;
+
+ void emitRemark() {
+ Function *F = CS.getCaller();
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F,
+ CS.getInstruction()->getDebugLoc(),
+ "devirtualized call");
+ }
+
+ void replaceAndErase(Value *New) {
+ emitRemark();
+ CS->replaceAllUsesWith(New);
+ if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ BranchInst::Create(II->getNormalDest(), CS.getInstruction());
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ }
+ CS->eraseFromParent();
+ // This use is no longer unsafe.
+ if (NumUnsafeUses)
+ --*NumUnsafeUses;
+ }
+};
+
+struct DevirtModule {
+ Module &M;
+ IntegerType *Int8Ty;
+ PointerType *Int8PtrTy;
+ IntegerType *Int32Ty;
+
+ MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots;
+
+ // This map keeps track of the number of "unsafe" uses of a loaded function
+ // pointer. The key is the associated llvm.type.test intrinsic call generated
+ // by this pass. An unsafe use is one that calls the loaded function pointer
+ // directly. Every time we eliminate an unsafe use (for example, by
+ // devirtualizing it or by applying virtual constant propagation), we
+ // decrement the value stored in this map. If a value reaches zero, we can
+ // eliminate the type check by RAUWing the associated llvm.type.test call with
+ // true.
+ std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
+
+ DevirtModule(Module &M)
+ : M(M), Int8Ty(Type::getInt8Ty(M.getContext())),
+ Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
+ Int32Ty(Type::getInt32Ty(M.getContext())) {}
+
+ void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc);
+ void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
+
+ void buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ bool
+ tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos,
+ uint64_t ByteOffset);
+ bool trySingleImplDevirt(ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites);
+ bool tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<ConstantInt *> Args);
+ bool tryUniformRetValOpt(IntegerType *RetType,
+ ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites);
+ bool tryUniqueRetValOpt(unsigned BitWidth,
+ ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites);
+ bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<VirtualCallSite> CallSites);
+
+ void rebuildGlobal(VTableBits &B);
+
+ bool run();
+};
+
+struct WholeProgramDevirt : public ModulePass {
+ static char ID;
+ WholeProgramDevirt() : ModulePass(ID) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return DevirtModule(M).run();
+ }
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+char WholeProgramDevirt::ID = 0;
+
+ModulePass *llvm::createWholeProgramDevirtPass() {
+ return new WholeProgramDevirt;
+}
+
+PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (!DevirtModule(M).run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
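A usage sketch for the new pass manager (illustrative; analysis registration via PassBuilder and any pipeline integration are omitted and assumed):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
    using namespace llvm;

    void runWPDOnly(Module &M) {
      ModuleAnalysisManager MAM; // analyses would normally be registered here
      ModulePassManager MPM;
      MPM.addPass(WholeProgramDevirtPass());
      MPM.run(M, MAM);
    }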
+
+void DevirtModule::buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ DenseMap<GlobalVariable *, VTableBits *> GVToBits;
+ Bits.reserve(M.getGlobalList().size());
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (Types.empty())
+ continue;
+
+ VTableBits *&BitsPtr = GVToBits[&GV];
+ if (!BitsPtr) {
+ Bits.emplace_back();
+ Bits.back().GV = &GV;
+ Bits.back().ObjectSize =
+ M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
+ BitsPtr = &Bits.back();
+ }
+
+ for (MDNode *Type : Types) {
+ auto TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert({BitsPtr, Offset});
+ }
+ }
+}
+
+bool DevirtModule::tryFindVirtualCallTargets(
+ std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
+ for (const TypeMemberInfo &TM : TypeMemberInfos) {
+ if (!TM.Bits->GV->isConstant())
+ return false;
+
+ auto Init = dyn_cast<ConstantArray>(TM.Bits->GV->getInitializer());
+ if (!Init)
+ return false;
+ ArrayType *VTableTy = Init->getType();
+
+ uint64_t ElemSize =
+ M.getDataLayout().getTypeAllocSize(VTableTy->getElementType());
+ uint64_t GlobalSlotOffset = TM.Offset + ByteOffset;
+ if (GlobalSlotOffset % ElemSize != 0)
+ return false;
+
+ unsigned Op = GlobalSlotOffset / ElemSize;
+ if (Op >= Init->getNumOperands())
+ return false;
+
+ auto Fn = dyn_cast<Function>(Init->getOperand(Op)->stripPointerCasts());
+ if (!Fn)
+ return false;
+
+ // We can disregard __cxa_pure_virtual as a possible call target, as
+ // calls to pure virtuals are UB.
+ if (Fn->getName() == "__cxa_pure_virtual")
+ continue;
+
+ TargetsForSlot.push_back({Fn, &TM});
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+bool DevirtModule::trySingleImplDevirt(
+ ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ Function *TheFn = TargetsForSlot[0].Fn;
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target.Fn)
+ return false;
+
+ // If so, update each call site to call that implementation directly.
+ for (auto &&VCallSite : CallSites) {
+ VCallSite.emitRemark();
+ VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CS.getCalledValue()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ return true;
+}
+
+bool DevirtModule::tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<ConstantInt *> Args) {
+ // Evaluate each function and store the result in each target's RetVal
+ // field.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->arg_size() != Args.size() + 1)
+ return false;
+ for (unsigned I = 0; I != Args.size(); ++I)
+ if (Target.Fn->getFunctionType()->getParamType(I + 1) !=
+ Args[I]->getType())
+ return false;
+
+ Evaluator Eval(M.getDataLayout(), nullptr);
+ SmallVector<Constant *, 2> EvalArgs;
+ EvalArgs.push_back(
+ Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
+ EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end());
+ Constant *RetVal;
+ if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
+ !isa<ConstantInt>(RetVal))
+ return false;
+ Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
+ }
+ return true;
+}
+
+bool DevirtModule::tryUniformRetValOpt(
+ IntegerType *RetType, ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites) {
+ // Uniform return value optimization. If all functions return the same
+ // constant, replace all calls with that constant.
+ uint64_t TheRetVal = TargetsForSlot[0].RetVal;
+ for (const VirtualCallTarget &Target : TargetsForSlot)
+ if (Target.RetVal != TheRetVal)
+ return false;
+
+ auto TheRetValConst = ConstantInt::get(RetType, TheRetVal);
+ for (auto Call : CallSites)
+ Call.replaceAndErase(TheRetValConst);
+ return true;
+}
+
+bool DevirtModule::tryUniqueRetValOpt(
+ unsigned BitWidth, ArrayRef<VirtualCallTarget> TargetsForSlot,
+ MutableArrayRef<VirtualCallSite> CallSites) {
+ // IsOne controls whether we look for a 0 or a 1.
+ auto tryUniqueRetValOptFor = [&](bool IsOne) {
+ const TypeMemberInfo *UniqueMember = nullptr;
+ for (const VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.RetVal == (IsOne ? 1 : 0)) {
+ if (UniqueMember)
+ return false;
+ UniqueMember = Target.TM;
+ }
+ }
+
+ // We should have found a unique member or bailed out by now. We already
+ // checked for a uniform return value in tryUniformRetValOpt.
+ assert(UniqueMember);
+
+ // Replace each call with the comparison.
+ for (auto &&Call : CallSites) {
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy);
+ OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset);
+ Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ Call.VTable, OneAddr);
+ Call.replaceAndErase(Cmp);
+ }
+ return true;
+ };
+
+ if (BitWidth == 1) {
+ if (tryUniqueRetValOptFor(true))
+ return true;
+ if (tryUniqueRetValOptFor(false))
+ return true;
+ }
+ return false;
+}
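Concretely (an illustration, not code from this file): if only one class's override of a bool-returning slot yields true, a virtual call such as p->isSpecial() can be rewritten as a comparison of p's vtable pointer against the address of that one vtable, leaving no call or load behind; isSpecial and the class involved are hypothetical names.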
+
+bool DevirtModule::tryVirtualConstProp(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<VirtualCallSite> CallSites) {
+ // This only works if the function returns an integer.
+ auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
+ if (!RetType)
+ return false;
+ unsigned BitWidth = RetType->getBitWidth();
+ if (BitWidth > 64)
+ return false;
+
+ // Make sure that each function does not access memory, takes at least one
+ // argument, does not use its first argument (which we assume is 'this'),
+ // and has the same return type.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() ||
+ !Target.Fn->arg_begin()->use_empty() ||
+ Target.Fn->getReturnType() != RetType)
+ return false;
+ }
+
+ // Group call sites by the list of constant arguments they pass.
+ // The comparator ensures deterministic ordering.
+ struct ByAPIntValue {
+ bool operator()(const std::vector<ConstantInt *> &A,
+ const std::vector<ConstantInt *> &B) const {
+ return std::lexicographical_compare(
+ A.begin(), A.end(), B.begin(), B.end(),
+ [](ConstantInt *AI, ConstantInt *BI) {
+ return AI->getValue().ult(BI->getValue());
+ });
+ }
+ };
+ std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>,
+ ByAPIntValue>
+ VCallSitesByConstantArg;
+ for (auto &&VCallSite : CallSites) {
+ std::vector<ConstantInt *> Args;
+ if (VCallSite.CS.getType() != RetType)
+ continue;
+ for (auto &&Arg :
+ make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) {
+ if (!isa<ConstantInt>(Arg))
+ break;
+ Args.push_back(cast<ConstantInt>(&Arg));
+ }
+ if (Args.size() + 1 != VCallSite.CS.arg_size())
+ continue;
+
+ VCallSitesByConstantArg[Args].push_back(VCallSite);
+ }
+
+ for (auto &&CSByConstantArg : VCallSitesByConstantArg) {
+ if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
+ continue;
+
+ if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second))
+ continue;
+
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second))
+ continue;
+
+ // Find an allocation offset in bits in all vtables associated with the
+ // type.
+ uint64_t AllocBefore =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
+ uint64_t AllocAfter =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
+
+ // Calculate the total amount of padding needed to store a value at both
+ // ends of the object.
+ uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
+ for (auto &&Target : TargetsForSlot) {
+ TotalPaddingBefore += std::max<int64_t>(
+ (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
+ TotalPaddingAfter += std::max<int64_t>(
+ (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
+ }
+
+ // If the amount of padding is too large, give up.
+ // FIXME: do something smarter here.
+ if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
+ continue;
+
+ // Calculate the offset to the value as a (possibly negative) byte offset
+ // and (if applicable) a bit offset, and store the values in the targets.
+ int64_t OffsetByte;
+ uint64_t OffsetBit;
+ if (TotalPaddingBefore <= TotalPaddingAfter)
+ setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
+ OffsetBit);
+ else
+ setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
+ OffsetBit);
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ for (auto Call : CSByConstantArg.second) {
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte);
+ if (BitWidth == 1) {
+ Value *Bits = B.CreateLoad(Addr);
+ Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase(IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase(Val);
+ }
+ }
+ }
+ return true;
+}
+
+void DevirtModule::rebuildGlobal(VTableBits &B) {
+ if (B.Before.Bytes.empty() && B.After.Bytes.empty())
+ return;
+
+ // Align each byte array to pointer width.
+ unsigned PointerSize = M.getDataLayout().getPointerSize();
+ B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), PointerSize));
+ B.After.Bytes.resize(alignTo(B.After.Bytes.size(), PointerSize));
+
+ // Before was stored in reverse order; flip it now.
+ for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
+ std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
+
+ // Build an anonymous global containing the before bytes, followed by the
+ // original initializer, followed by the after bytes.
+ auto NewInit = ConstantStruct::getAnon(
+ {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
+ B.GV->getInitializer(),
+ ConstantDataArray::get(M.getContext(), B.After.Bytes)});
+ auto NewGV =
+ new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
+ GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
+ NewGV->setSection(B.GV->getSection());
+ NewGV->setComdat(B.GV->getComdat());
+
+ // Copy the original vtable's metadata to the anonymous global, adjusting
+ // offsets as required.
+ NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
+
+ // Build an alias named after the original global, pointing at the second
+ // element (the original initializer).
+ auto Alias = GlobalAlias::create(
+ B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
+ ConstantExpr::getGetElementPtr(
+ NewInit->getType(), NewGV,
+ ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, 1)}),
+ &M);
+ Alias->setVisibility(B.GV->getVisibility());
+ Alias->takeName(B.GV);
+
+ B.GV->replaceAllUsesWith(Alias);
+ B.GV->eraseFromParent();
+}
+
+void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
+ Function *AssumeFunc) {
+ // Find all virtual calls via a virtual table pointer %p under an assumption
+ // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
+ // points to a member of the type identifier %md. Group calls by (type ID,
+ // offset) pair (effectively the identity of the virtual function) and store
+ // to CallSlots.
+ DenseSet<Value *> SeenPtrs;
+ for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ // Search for virtual calls based on %p and add them to DevirtCalls.
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<CallInst *, 1> Assumes;
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+
+ // If we found any, add them to CallSlots. Only do this if we haven't seen
+ // the vtable pointer before, as it may have been CSE'd with pointers from
+ // other call sites, and we don't want to process call sites multiple times.
+ if (!Assumes.empty()) {
+ Metadata *TypeId =
+ cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
+ Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
+ if (SeenPtrs.insert(Ptr).second) {
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].push_back(
+ {CI->getArgOperand(0), Call.CS, nullptr});
+ }
+ }
+ }
+
+ // We no longer need the assumes or the type test.
+ for (auto Assume : Assumes)
+ Assume->eraseFromParent();
+ // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
+ // may use the vtable argument later.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+ }
+}
+
+void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
+ Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ for (auto I = TypeCheckedLoadFunc->use_begin(),
+ E = TypeCheckedLoadFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Offset = CI->getArgOperand(1);
+ Value *TypeIdValue = CI->getArgOperand(2);
+ Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<Instruction *, 1> LoadedPtrs;
+ SmallVector<Instruction *, 1> Preds;
+ bool HasNonCallUses = false;
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI);
+
+ // Start by generating "pessimistic" code that explicitly loads the function
+ // pointer from the vtable and performs the type check. If possible, we will
+ // eliminate the load and the type check later.
+
+ // If possible, only generate the load at the point where it is used.
+ // This helps avoid unnecessary spills.
+ IRBuilder<> LoadB(
+ (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
+ Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
+ Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
+ Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
+
+ for (Instruction *LoadedPtr : LoadedPtrs) {
+ LoadedPtr->replaceAllUsesWith(LoadedValue);
+ LoadedPtr->eraseFromParent();
+ }
+
+ // Likewise for the type test.
+ IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
+ CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
+
+ for (Instruction *Pred : Preds) {
+ Pred->replaceAllUsesWith(TypeTestCall);
+ Pred->eraseFromParent();
+ }
+
+ // We have already erased any extractvalue instructions that refer to the
+ // intrinsic call, but the intrinsic may have other non-extractvalue uses
+ // (although this is unlikely). In that case, explicitly build a pair and
+ // RAUW it.
+ if (!CI->use_empty()) {
+ Value *Pair = UndefValue::get(CI->getType());
+ IRBuilder<> B(CI);
+ Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
+ Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
+ CI->replaceAllUsesWith(Pair);
+ }
+
+ // The number of unsafe uses is initially the number of uses.
+ auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
+ NumUnsafeUses = DevirtCalls.size();
+
+ // If the function pointer has a non-call user, we cannot eliminate the type
+ // check, as one of those users may eventually call the pointer. Increment
+ // the unsafe use count to make sure it cannot reach zero.
+ if (HasNonCallUses)
+ ++NumUnsafeUses;
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].push_back(
+ {Ptr, Call.CS, &NumUnsafeUses});
+ }
+
+ CI->eraseFromParent();
+ }
+}
+
+bool DevirtModule::run() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+
+ if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ AssumeFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ if (TypeTestFunc && AssumeFunc)
+ scanTypeTestUsers(TypeTestFunc, AssumeFunc);
+
+ if (TypeCheckedLoadFunc)
+ scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+
+ // Rebuild type metadata into a map for easy lookup.
+ std::vector<VTableBits> Bits;
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
+ buildTypeIdentifierMap(Bits, TypeIdMap);
+ if (TypeIdMap.empty())
+ return true;
+
+ // For each (type, offset) pair:
+ bool DidVirtualConstProp = false;
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<VirtualCallTarget> TargetsForSlot;
+ if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+ S.first.ByteOffset))
+ continue;
+
+ if (trySingleImplDevirt(TargetsForSlot, S.second))
+ continue;
+
+ DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second);
+ }
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the type test by replacing it with true.
+ if (TypeCheckedLoadFunc) {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+ }
+
+ // Rebuild each global we touched as part of virtual constant propagation to
+ // include the before and after bytes.
+ if (DidVirtualConstProp)
+ for (VTableBits &B : Bits)
+ rebuildGlobal(B);
+
+ return true;
+}
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 6f49399f57bf..221a22007173 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -58,7 +58,6 @@ namespace {
// operators inevitably call FAddendCoef's constructor which is not cheap.
void operator=(const FAddendCoef &A);
void operator+=(const FAddendCoef &A);
- void operator-=(const FAddendCoef &A);
void operator*=(const FAddendCoef &S);
bool isOne() const { return isInt() && IntVal == 1; }
@@ -123,11 +122,18 @@ namespace {
bool isConstant() const { return Val == nullptr; }
bool isZero() const { return Coeff.isZero(); }
- void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
- void set(const APFloat& Coefficient, Value *V)
- { Coeff.set(Coefficient); Val = V; }
- void set(const ConstantFP* Coefficient, Value *V)
- { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+ void set(short Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const APFloat &Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const ConstantFP *Coefficient, Value *V) {
+ Coeff.set(Coefficient->getValueAPF());
+ Val = V;
+ }
void negate() { Coeff.negate(); }
@@ -272,27 +278,6 @@ void FAddendCoef::operator+=(const FAddendCoef &That) {
T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
}
-void FAddendCoef::operator-=(const FAddendCoef &That) {
- enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
- if (isInt() == That.isInt()) {
- if (isInt())
- IntVal -= That.IntVal;
- else
- getFpVal().subtract(That.getFpVal(), RndMode);
- return;
- }
-
- if (isInt()) {
- const APFloat &T = That.getFpVal();
- convertToFpType(T.getSemantics());
- getFpVal().subtract(T, RndMode);
- return;
- }
-
- APFloat &T = getFpVal();
- T.subtract(createAPFloatFromInt(T.getSemantics(), IntVal), RndMode);
-}
-
void FAddendCoef::operator*=(const FAddendCoef &That) {
if (That.isOne())
return;
@@ -321,8 +306,6 @@ void FAddendCoef::operator*=(const FAddendCoef &That) {
APFloat::rmNearestTiesToEven);
else
F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
-
- return;
}
void FAddendCoef::negate() {
@@ -716,10 +699,9 @@ Value *FAddCombine::createNaryFAdd
bool LastValNeedNeg = false;
// Iterate the addends, creating fadd/fsub using adjacent two addends.
- for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
- I != E; I++) {
+ for (const FAddend *Opnd : Opnds) {
bool NeedNeg;
- Value *V = createAddendVal(**I, NeedNeg);
+ Value *V = createAddendVal(*Opnd, NeedNeg);
if (!LastVal) {
LastVal = V;
LastValNeedNeg = NeedNeg;
@@ -808,9 +790,7 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
unsigned NegOpndNum = 0;
// Adjust the number of instructions needed to emit the N-ary add.
- for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
- I != E; I++) {
- const FAddend *Opnd = *I;
+ for (const FAddend *Opnd : Opnds) {
if (Opnd->isConstant())
continue;
@@ -1052,22 +1032,26 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ const APInt *Val;
+ if (match(RHS, m_APInt(Val))) {
// X + (signbit) --> X ^ signbit
- const APInt &Val = CI->getValue();
- if (Val.isSignBit())
+ if (Val->isSignBit())
return BinaryOperator::CreateXor(LHS, RHS);
+ }
+ // FIXME: Use the match above instead of dyn_cast to allow these transforms
+ // for splat vectors.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// See if SimplifyDemandedBits can simplify this. This handles stuff like
// (X & 254)+1 -> (X&254)|1
if (SimplifyDemandedInstructionBits(I))
@@ -1157,7 +1141,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return BinaryOperator::CreateSub(LHS, V);
if (Value *V = checkForNegativeOperand(I, Builder))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// A+B --> A|B iff A and B have no bits set in common.
if (haveNoCommonBitsSet(LHS, RHS, DL, AC, &I, DT))
@@ -1169,6 +1153,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return BinaryOperator::CreateSub(SubOne(CRHS), X);
}
+ // FIXME: We already did a check for ConstantInt RHS above this.
+ // FIXME: Is this pattern covered by another fold? No regression tests fail on
+ // removal.
if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
// (X & FF00) + xx00 -> (X+xx00) & FF00
Value *X;
@@ -1317,11 +1304,11 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V =
SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (isa<Constant>(RHS)) {
if (isa<PHINode>(LHS))
@@ -1415,7 +1402,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (I.hasUnsafeAlgebra()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
}
return Changed ? &I : nullptr;
@@ -1493,15 +1480,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (A*B)-(A*C) -> A*(B-C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// If this is a 'B = x-(-A)', change to B = x+A.
if (Value *V = dyn_castNegVal(Op1)) {
@@ -1667,13 +1654,13 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
match(Op1, m_PtrToInt(m_Value(RHSOp))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
- return ReplaceInstUsesWith(I, Res);
+ return replaceInstUsesWith(I, Res);
// trunc(p)-trunc(q) -> trunc(p-q)
if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
- return ReplaceInstUsesWith(I, Res);
+ return replaceInstUsesWith(I, Res);
bool Changed = false;
if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) {
@@ -1692,11 +1679,11 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V =
SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// fsub nsz 0, X ==> fsub nsz -0.0, X
if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) {
@@ -1736,7 +1723,7 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (I.hasUnsafeAlgebra()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
}
return nullptr;
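
The visitAdd hunk above moves the sign-bit fold to m_APInt so it can also fire for splat vector constants. A quick standalone check of the identity it relies on (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // X + signbit == X ^ signbit: adding the top bit cannot carry into any
    // lower bit, and a carry out of the top bit is discarded anyway.
    int main() {
      const uint32_t SignBit = 0x80000000u;
      for (uint32_t I = 0; I != 100000; ++I) {
        uint32_t X = I * 2654435761u; // arbitrary sample of 32-bit values
        assert((X + SignBit) == (X ^ SignBit));
      }
      return 0;
    }
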
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 76cefd97cd8f..1a6459b3d689 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -39,30 +39,29 @@ static inline Value *dyn_castNotVal(Value *V) {
}
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
-/// a three bit mask. It also returns whether it is an ordered predicate by
-/// reference.
-static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {
- isOrdered = false;
- switch (CC) {
- case FCmpInst::FCMP_ORD: isOrdered = true; return 0; // 000
- case FCmpInst::FCMP_UNO: return 0; // 000
- case FCmpInst::FCMP_OGT: isOrdered = true; return 1; // 001
- case FCmpInst::FCMP_UGT: return 1; // 001
- case FCmpInst::FCMP_OEQ: isOrdered = true; return 2; // 010
- case FCmpInst::FCMP_UEQ: return 2; // 010
- case FCmpInst::FCMP_OGE: isOrdered = true; return 3; // 011
- case FCmpInst::FCMP_UGE: return 3; // 011
- case FCmpInst::FCMP_OLT: isOrdered = true; return 4; // 100
- case FCmpInst::FCMP_ULT: return 4; // 100
- case FCmpInst::FCMP_ONE: isOrdered = true; return 5; // 101
- case FCmpInst::FCMP_UNE: return 5; // 101
- case FCmpInst::FCMP_OLE: isOrdered = true; return 6; // 110
- case FCmpInst::FCMP_ULE: return 6; // 110
- // True -> 7
- default:
- // Not expecting FCMP_FALSE and FCMP_TRUE;
- llvm_unreachable("Unexpected FCmp predicate!");
- }
+/// a four bit mask.
+static unsigned getFCmpCode(FCmpInst::Predicate CC) {
+ assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ // Take advantage of the bit pattern of FCmpInst::Predicate here.
+ // U L G E
+ static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
+ static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
+ static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
+ static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
+ static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
+ static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
+ static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
+ static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
+ static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
+ static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
+ static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
+ static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
+ static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
+ static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
+ static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
+ static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
+ return CC;
}
/// This is the complement of getICmpCode, which turns an opcode and two
@@ -78,26 +77,16 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
}
/// This is the complement of getFCmpCode, which turns an opcode and two
-/// operands into either a FCmp instruction. isordered is passed in to determine
-/// which kind of predicate to use in the new fcmp instruction.
-static Value *getFCmpValue(bool isordered, unsigned code,
- Value *LHS, Value *RHS,
+/// operands into either a FCmp instruction, or a true/false constant.
+static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
InstCombiner::BuilderTy *Builder) {
- CmpInst::Predicate Pred;
- switch (code) {
- default: llvm_unreachable("Illegal FCmp code!");
- case 0: Pred = isordered ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; break;
- case 1: Pred = isordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; break;
- case 2: Pred = isordered ? FCmpInst::FCMP_OEQ : FCmpInst::FCMP_UEQ; break;
- case 3: Pred = isordered ? FCmpInst::FCMP_OGE : FCmpInst::FCMP_UGE; break;
- case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break;
- case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break;
- case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break;
- case 7:
- if (!isordered)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
- Pred = FCmpInst::FCMP_ORD; break;
- }
+ const auto Pred = static_cast<FCmpInst::Predicate>(Code);
+ assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ if (Pred == FCmpInst::FCMP_FALSE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+ if (Pred == FCmpInst::FCMP_TRUE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
return Builder->CreateFCmp(Pred, LHS, RHS);
}
@@ -243,7 +232,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
if (CI->getValue() == ShlMask)
// Masking out bits that the shift already masks.
- return ReplaceInstUsesWith(TheAnd, Op); // No need for the and.
+ return replaceInstUsesWith(TheAnd, Op); // No need for the and.
if (CI != AndRHS) { // Reducing bits set in and.
TheAnd.setOperand(1, CI);
@@ -263,7 +252,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
if (CI->getValue() == ShrMask)
// Masking out bits that the shift already masks.
- return ReplaceInstUsesWith(TheAnd, Op);
+ return replaceInstUsesWith(TheAnd, Op);
if (CI != AndRHS) {
TheAnd.setOperand(1, CI); // Reduce bits set in and cst.
@@ -465,11 +454,9 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
if (CCst && CCst->isZero()) {
// if C is zero, then both A and B qualify as mask
result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_Mask_AllZeroes |
FoldMskICmp_AMask_Mixed |
FoldMskICmp_BMask_Mixed)
: (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_Mask_NotAllZeroes |
FoldMskICmp_AMask_NotMixed |
FoldMskICmp_BMask_NotMixed));
if (icmp_abit)
@@ -666,7 +653,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
if (!ICmpInst::isEquality(RHSCC))
return 0;
- // Look for ANDs in on the right side of the RHS icmp.
+ // Look for ANDs on the right side of the RHS icmp.
if (!ok && R2->getType()->isIntegerTy()) {
if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
R11 = R2;
@@ -694,9 +681,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
B = L21; C = L1;
}
- unsigned left_type = getTypeOfMaskedICmp(A, B, C, LHSCC);
- unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC);
- return left_type & right_type;
+ unsigned LeftType = getTypeOfMaskedICmp(A, B, C, LHSCC);
+ unsigned RightType = getTypeOfMaskedICmp(A, D, E, RHSCC);
+ return LeftType & RightType;
}
/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
@@ -705,9 +692,9 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
llvm::InstCombiner::BuilderTy *Builder) {
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
- unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
+ unsigned Mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
LHSCC, RHSCC);
- if (mask == 0) return nullptr;
+ if (Mask == 0) return nullptr;
assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
"foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
@@ -723,48 +710,48 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// input and output).
// In most cases we're going to produce an EQ for the "&&" case.
- ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
if (!IsAnd) {
// Convert the masking analysis into its equivalent with negated
// comparisons.
- mask = conjugateICmpMask(mask);
+ Mask = conjugateICmpMask(Mask);
}
- if (mask & FoldMskICmp_Mask_AllZeroes) {
+ if (Mask & FoldMskICmp_Mask_AllZeroes) {
// (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
// -> (icmp eq (A & (B|D)), 0)
- Value *newOr = Builder->CreateOr(B, D);
- Value *newAnd = Builder->CreateAnd(A, newOr);
- // we can't use C as zero, because we might actually handle
+ Value *NewOr = Builder->CreateOr(B, D);
+ Value *NewAnd = Builder->CreateAnd(A, NewOr);
+ // We can't use C as zero because we might actually handle
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // with B and D, having a single bit set
- Value *zero = Constant::getNullValue(A->getType());
- return Builder->CreateICmp(NEWCC, newAnd, zero);
+ // with B and D, having a single bit set.
+ Value *Zero = Constant::getNullValue(A->getType());
+ return Builder->CreateICmp(NewCC, NewAnd, Zero);
}
- if (mask & FoldMskICmp_BMask_AllOnes) {
+ if (Mask & FoldMskICmp_BMask_AllOnes) {
// (icmp eq (A & B), B) & (icmp eq (A & D), D)
// -> (icmp eq (A & (B|D)), (B|D))
- Value *newOr = Builder->CreateOr(B, D);
- Value *newAnd = Builder->CreateAnd(A, newOr);
- return Builder->CreateICmp(NEWCC, newAnd, newOr);
+ Value *NewOr = Builder->CreateOr(B, D);
+ Value *NewAnd = Builder->CreateAnd(A, NewOr);
+ return Builder->CreateICmp(NewCC, NewAnd, NewOr);
}
- if (mask & FoldMskICmp_AMask_AllOnes) {
+ if (Mask & FoldMskICmp_AMask_AllOnes) {
// (icmp eq (A & B), A) & (icmp eq (A & D), A)
// -> (icmp eq (A & (B&D)), A)
- Value *newAnd1 = Builder->CreateAnd(B, D);
- Value *newAnd = Builder->CreateAnd(A, newAnd1);
- return Builder->CreateICmp(NEWCC, newAnd, A);
+ Value *NewAnd1 = Builder->CreateAnd(B, D);
+ Value *NewAnd2 = Builder->CreateAnd(A, NewAnd1);
+ return Builder->CreateICmp(NewCC, NewAnd2, A);
}
// Remaining cases assume at least that B and D are constant, and depend on
- // their actual values. This isn't strictly, necessary, just a "handle the
+ // their actual values. This isn't strictly necessary, just a "handle the
// easy cases for now" decision.
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
if (!BCst) return nullptr;
ConstantInt *DCst = dyn_cast<ConstantInt>(D);
if (!DCst) return nullptr;
- if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+ if (Mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
// (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
@@ -777,7 +764,7 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (mask & FoldMskICmp_AMask_NotAllOnes) {
+ if (Mask & FoldMskICmp_AMask_NotAllOnes) {
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
// Only valid if one of the masks is a superset of the other (check "B|D" is
@@ -789,7 +776,7 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (mask & FoldMskICmp_BMask_Mixed) {
+ if (Mask & FoldMskICmp_BMask_Mixed) {
// (icmp eq (A & B), C) & (icmp eq (A & D), E)
// We already know that B & C == C && D & E == E.
// If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
@@ -797,26 +784,26 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// contradict, then we can transform to
// -> (icmp eq (A & (B|D)), (C|E))
// Currently, we only handle the case of B, C, D, and E being constant.
- // we can't simply use C and E, because we might actually handle
+ // We can't simply use C and E because we might actually handle
// (icmp ne (A & B), B) & (icmp eq (A & D), D)
- // with B and D, having a single bit set
+ // with B and D, having a single bit set.
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
if (!CCst) return nullptr;
ConstantInt *ECst = dyn_cast<ConstantInt>(E);
if (!ECst) return nullptr;
- if (LHSCC != NEWCC)
+ if (LHSCC != NewCC)
CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (RHSCC != NEWCC)
+ if (RHSCC != NewCC)
ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
- // if there is a conflict we should actually return a false for the
- // whole construct
+ // If there is a conflict, we should actually return a false for the
+ // whole construct.
if (((BCst->getValue() & DCst->getValue()) &
(CCst->getValue() ^ ECst->getValue())) != 0)
return ConstantInt::get(LHS->getType(), !IsAnd);
- Value *newOr1 = Builder->CreateOr(B, D);
- Value *newOr2 = ConstantExpr::getOr(CCst, ECst);
- Value *newAnd = Builder->CreateAnd(A, newOr1);
- return Builder->CreateICmp(NEWCC, newAnd, newOr2);
+ Value *NewOr1 = Builder->CreateOr(B, D);
+ Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
+ Value *NewAnd = Builder->CreateAnd(A, NewOr1);
+ return Builder->CreateICmp(NewCC, NewAnd, NewOr2);
}
return nullptr;
}
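
The FoldMskICmp_Mask_AllZeroes case above can be checked exhaustively on narrow integers. A standalone sketch (the masks B and D are arbitrary; not part of the patch):

    #include <cassert>
    #include <cstdint>

    // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) -> (icmp eq (A & (B|D)), 0)
    int main() {
      const uint8_t B = 0x0C, D = 0x30; // arbitrary illustrative masks
      for (unsigned A = 0; A != 256; ++A) {
        bool Original = ((A & B) == 0) && ((A & D) == 0);
        bool Folded = (A & (B | D)) == 0;
        assert(Original == Folded);
      }
      return 0;
    }
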
@@ -915,15 +902,10 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
if (LHSCst == RHSCst && LHSCC == RHSCC) {
// (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
- // where C is a power of 2
- if (LHSCC == ICmpInst::ICMP_ULT &&
- LHSCst->getValue().isPowerOf2()) {
- Value *NewOr = Builder->CreateOr(Val, Val2);
- return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
- }
-
+ // where C is a power of 2 or
// (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) {
+ if ((LHSCC == ICmpInst::ICMP_ULT && LHSCst->getValue().isPowerOf2()) ||
+ (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero())) {
Value *NewOr = Builder->CreateOr(Val, Val2);
return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
}
@@ -975,16 +957,6 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
return nullptr;
- // Make a constant range that's the intersection of the two icmp ranges.
- // If the intersection is empty, we know that the result is false.
- ConstantRange LHSRange =
- ConstantRange::makeAllowedICmpRegion(LHSCC, LHSCst->getValue());
- ConstantRange RHSRange =
- ConstantRange::makeAllowedICmpRegion(RHSCC, RHSCst->getValue());
-
- if (LHSRange.intersectWith(RHSRange).isEmptySet())
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
-
// We can't fold (ugt x, C) & (sgt x, C2).
if (!PredicatesFoldable(LHSCC, RHSCC))
return nullptr;
@@ -1124,6 +1096,29 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+ Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+ Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+ FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+
+ // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+ // Suppose the relation between x and y is R, where R is one of
+ // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
+ // testing the desired relations.
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) && bool(R & CC1)
+ // = bool((R & CC0) & (R & CC1))
+ // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS)
+ return getFCmpValue(getFCmpCode(Op0CC) & getFCmpCode(Op1CC), Op0LHS, Op0RHS,
+ Builder);
+
if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
RHS->getPredicate() == FCmpInst::FCMP_ORD) {
if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
@@ -1147,56 +1142,6 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return nullptr;
}
- Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
- Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
- FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
-
-
- if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
- // Swap RHS operands to match LHS.
- Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
- std::swap(Op1LHS, Op1RHS);
- }
-
- if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
- // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
- if (Op0CC == Op1CC)
- return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
- if (Op0CC == FCmpInst::FCMP_FALSE || Op1CC == FCmpInst::FCMP_FALSE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- if (Op0CC == FCmpInst::FCMP_TRUE)
- return RHS;
- if (Op1CC == FCmpInst::FCMP_TRUE)
- return LHS;
-
- bool Op0Ordered;
- bool Op1Ordered;
- unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
- unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
- // uno && ord -> false
- if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- if (Op1Pred == 0) {
- std::swap(LHS, RHS);
- std::swap(Op0Pred, Op1Pred);
- std::swap(Op0Ordered, Op1Ordered);
- }
- if (Op0Pred == 0) {
- // uno && ueq -> uno && (uno || eq) -> uno
- // ord && olt -> ord && (ord && lt) -> olt
- if (!Op0Ordered && (Op0Ordered == Op1Ordered))
- return LHS;
- if (Op0Ordered && (Op0Ordered == Op1Ordered))
- return RHS;
-
- // uno && oeq -> uno && (ord && eq) -> false
- if (!Op0Ordered)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- // ord && ueq -> ord && (uno || eq) -> oeq
- return getFCmpValue(true, Op1Pred, Op0LHS, Op0RHS, Builder);
- }
- }
-
return nullptr;
}
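
As a standalone restatement of the predicate-bitmask trick used by getFCmpCode/getFCmpValue above (the enum values mirror the static_asserts in this patch; the example itself is illustrative and not part of the patch):

    #include <cassert>

    // U L G E encoding of the fcmp predicates; AND'ing (resp. OR'ing) two codes
    // gives the code of the conjunction (resp. disjunction) of the comparisons.
    enum FCmpCode : unsigned {
      FC_FALSE = 0, FC_OEQ = 1, FC_OGT = 2, FC_OGE = 3, FC_OLT = 4, FC_OLE = 5,
      FC_ONE = 6, FC_ORD = 7, FC_UNO = 8, FC_UEQ = 9, FC_UGT = 10, FC_UGE = 11,
      FC_ULT = 12, FC_ULE = 13, FC_UNE = 14, FC_TRUE = 15
    };

    int main() {
      assert((FC_OLT & FC_OGT) == FC_FALSE); // (olt x,y) && (ogt x,y) --> false
      assert((FC_OLE & FC_OGE) == FC_OEQ);   // (ole x,y) && (oge x,y) --> oeq
      assert((FC_ORD & FC_UNE) == FC_ONE);   // (ord x,y) && (une x,y) --> one
      assert((FC_UNO | FC_OEQ) == FC_UEQ);   // (uno x,y) || (oeq x,y) --> ueq
      return 0;
    }
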
@@ -1248,19 +1193,131 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
return nullptr;
}
+Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
+ auto LogicOpc = I.getOpcode();
+ assert((LogicOpc == Instruction::And || LogicOpc == Instruction::Or ||
+ LogicOpc == Instruction::Xor) &&
+ "Unexpected opcode for bitwise logic folding");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ CastInst *Cast0 = dyn_cast<CastInst>(Op0);
+ if (!Cast0)
+ return nullptr;
+
+ // This must be a cast from an integer or integer vector source type to allow
+ // transformation of the logic operation to the source type.
+ Type *DestTy = I.getType();
+ Type *SrcTy = Cast0->getSrcTy();
+ if (!SrcTy->isIntOrIntVectorTy())
+ return nullptr;
+
+ // If one operand is a bitcast and the other is a constant, move the logic
+ // operation ahead of the bitcast. That is, do the logic operation in the
+ // original type. This can eliminate useless bitcasts and allow normal
+ // combines that would otherwise be impeded by the bitcast. Canonicalization
+ // ensures that if there is a constant operand, it will be the second operand.
+ Value *BC = nullptr;
+ Constant *C = nullptr;
+ if ((match(Op0, m_BitCast(m_Value(BC))) && match(Op1, m_Constant(C)))) {
+ Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy);
+ Value *NewOp = Builder->CreateBinOp(LogicOpc, BC, NewConstant, I.getName());
+ return CastInst::CreateBitOrPointerCast(NewOp, DestTy);
+ }
+
+ CastInst *Cast1 = dyn_cast<CastInst>(Op1);
+ if (!Cast1)
+ return nullptr;
+
+ // Both operands of the logic operation are casts. The casts must be of the
+ // same type for reduction.
+ auto CastOpcode = Cast0->getOpcode();
+ if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
+ return nullptr;
+
+ Value *Cast0Src = Cast0->getOperand(0);
+ Value *Cast1Src = Cast1->getOperand(0);
+
+ // fold (logic (cast A), (cast B)) -> (cast (logic A, B))
+
+ // Only do this if the casts both really cause code to be generated.
+ if ((!isa<ICmpInst>(Cast0Src) || !isa<ICmpInst>(Cast1Src)) &&
+ ShouldOptimizeCast(CastOpcode, Cast0Src, DestTy) &&
+ ShouldOptimizeCast(CastOpcode, Cast1Src, DestTy)) {
+ Value *NewOp = Builder->CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
+ I.getName());
+ return CastInst::Create(CastOpcode, NewOp, DestTy);
+ }
+
+ // For now, only 'and'/'or' have optimizations after this.
+ if (LogicOpc == Instruction::Xor)
+ return nullptr;
+
+ // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
+ if (ICmp0 && ICmp1) {
+ Value *Res = LogicOpc == Instruction::And ? FoldAndOfICmps(ICmp0, ICmp1)
+ : FoldOrOfICmps(ICmp0, ICmp1, &I);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
+ FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
+ if (FCmp0 && FCmp1) {
+ Value *Res = LogicOpc == Instruction::And ? FoldAndOfFCmps(FCmp0, FCmp1)
+ : FoldOrOfFCmps(FCmp0, FCmp1);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Canonicalize SExt or Not to the LHS
+ if (match(Op1, m_SExt(m_Value())) || match(Op1, m_Not(m_Value()))) {
+ std::swap(Op0, Op1);
+ }
+
+ // Fold (and (sext bool to A), B) --> (select bool, B, 0)
+ Value *X = nullptr;
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarType()->isIntegerTy(1)) {
+ Value *Zero = Constant::getNullValue(Op1->getType());
+ return SelectInst::Create(X, Op1, Zero);
+ }
+
+ // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
+ if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
+ X->getType()->getScalarType()->isIntegerTy(1)) {
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ return SelectInst::Create(X, Zero, Op1);
+ }
+
+ return nullptr;
+}
+
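
A standalone check of the two folds implemented by foldBoolSextMaskToSelect above (values are arbitrary; not part of the patch):

    #include <cassert>
    #include <cstdint>

    // (and (sext bool), B)  --> select bool, B, 0
    // (and ~(sext bool), B) --> select bool, 0, B
    int main() {
      const int32_t B = 0x5A5A5A5A;
      for (int Cond = 0; Cond <= 1; ++Cond) {
        int32_t Sext = Cond ? -1 : 0; // sext i1 to i32
        assert((Sext & B) == (Cond ? B : 0));
        assert((~Sext & B) == (Cond ? 0 : B));
      }
      return 0;
    }
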
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (A|B)&(A|C) -> A|(B&C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -1268,7 +1325,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return &I;
if (Value *V = SimplifyBSwap(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
const APInt &AndRHSMask = AndRHS->getValue();
@@ -1399,8 +1456,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
{
Value *tmpOp0 = Op0;
Value *tmpOp1 = Op1;
- if (Op0->hasOneUse() &&
- match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_Value(B))))) {
if (A == Op1 || B == Op1 ) {
tmpOp1 = Op0;
tmpOp0 = Op1;
@@ -1408,12 +1464,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
}
}
- if (tmpOp1->hasOneUse() &&
- match(tmpOp1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (match(tmpOp1, m_OneUse(m_Xor(m_Value(A), m_Value(B))))) {
if (B == tmpOp0) {
std::swap(A, B);
}
- // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if
+ // Notice that the pattern (A&(~B)) is actually (A&(-1^B)), so if
// A is originally -1 (or a vector of -1 and undefs), then we enter
// an endless loop. By checking that A is non-constant we ensure that
// we will never get to the loop.
@@ -1458,7 +1513,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
if (Value *Res = FoldAndOfICmps(LHS, RHS))
- return ReplaceInstUsesWith(I, Res);
+ return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
// number of 'and' instructions might have to be created.
@@ -1466,18 +1521,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
if (Value *Res = FoldAndOfICmps(LHS, Cmp))
- return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+ return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
if (Value *Res = FoldAndOfICmps(LHS, Cmp))
- return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+ return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
if (Value *Res = FoldAndOfICmps(Cmp, RHS))
- return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+ return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
if (Value *Res = FoldAndOfICmps(Cmp, RHS))
- return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+ return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
}
@@ -1485,92 +1540,46 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = FoldAndOfFCmps(LHS, RHS))
- return ReplaceInstUsesWith(I, Res);
-
-
- if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
- Value *Op0COp = Op0C->getOperand(0);
- Type *SrcTy = Op0COp->getType();
- // fold (and (cast A), (cast B)) -> (cast (and A, B))
- if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) {
- if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ?
- SrcTy == Op1C->getOperand(0)->getType() &&
- SrcTy->isIntOrIntVectorTy()) {
- Value *Op1COp = Op1C->getOperand(0);
-
- // Only do this if the casts both really cause code to be generated.
- if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) &&
- ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) {
- Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName());
- return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
- }
+ return replaceInstUsesWith(I, Res);
- // If this is and(cast(icmp), cast(icmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
- if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
- if (Value *Res = FoldAndOfICmps(LHS, RHS))
- return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
-
- // If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp))
- if (Value *Res = FoldAndOfFCmps(LHS, RHS))
- return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
- }
- }
+ if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
+ return CastedAnd;
- // If we are masking off the sign bit of a floating-point value, convert
- // this to the canonical fabs intrinsic call and cast back to integer.
- // The backend should know how to optimize fabs().
- // TODO: This transform should also apply to vectors.
- ConstantInt *CI;
- if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() &&
- match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) {
- Module *M = I.getModule();
- Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy);
- Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs");
- return CastInst::CreateBitOrPointerCast(Call, I.getType());
- }
- }
+ if (Instruction *Select = foldBoolSextMaskToSelect(I))
+ return Select;
- {
- Value *X = nullptr;
- bool OpsSwapped = false;
- // Canonicalize SExt or Not to the LHS
- if (match(Op1, m_SExt(m_Value())) ||
- match(Op1, m_Not(m_Value()))) {
- std::swap(Op0, Op1);
- OpsSwapped = true;
- }
+ return Changed ? &I : nullptr;
+}
- // Fold (and (sext bool to A), B) --> (select bool, B, 0)
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarType()->isIntegerTy(1)) {
- Value *Zero = Constant::getNullValue(Op1->getType());
- return SelectInst::Create(X, Op1, Zero);
- }
+/// Given an OR instruction, check to see if this is a bswap idiom. If so,
+/// insert the new intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
- if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
- X->getType()->getScalarType()->isIntegerTy(1)) {
- Value *Zero = Constant::getNullValue(Op0->getType());
- return SelectInst::Create(X, Zero, Op1);
- }
+ // Look through zero extends.
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
+ Op0 = Ext->getOperand(0);
- if (OpsSwapped)
- std::swap(Op0, Op1);
- }
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
+ Op1 = Ext->getOperand(0);
- return Changed ? &I : nullptr;
-}
+ // (A | B) | C and A | (B | C) -> bswap if possible.
+ bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
+ match(Op1, m_Or(m_Value(), m_Value()));
+
+ // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible.
+ bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+ match(Op1, m_LogicalShift(m_Value(), m_Value()));
+
+ // (A & B) | (C & D) -> bswap if possible.
+ bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
+ match(Op1, m_And(m_Value(), m_Value()));
+
+ if (!OrOfOrs && !OrOfShifts && !OrOfAnds)
+ return nullptr;
-/// Given an OR instruction, check to see if this is a bswap or bitreverse
-/// idiom. If so, insert the new intrinsic and return it.
-Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
SmallVector<Instruction*, 4> Insts;
- if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))
+ if (!recognizeBSwapOrBitReverseIdiom(&I, true, false, Insts))
return nullptr;
Instruction *LastInst = Insts.pop_back_val();
LastInst->removeFromParent();
@@ -1580,28 +1589,89 @@ Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
return LastInst;
}
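
The or-of-shifts / or-of-ands shapes that MatchBSwap now pre-filters for correspond to the usual hand-written byte swap, which recognizeBSwapOrBitReverseIdiom then turns into the bswap intrinsic. A standalone sketch of such an idiom (not part of the patch):

    #include <cassert>
    #include <cstdint>

    // A manual 32-bit byte swap written as an or-of-shifts-and-masks, the kind
    // of tree the matcher recognizes and replaces with llvm.bswap.i32.
    static uint32_t ManualBSwap(uint32_t X) {
      return (X >> 24) | ((X >> 8) & 0x0000FF00u) |
             ((X << 8) & 0x00FF0000u) | (X << 24);
    }

    int main() {
      assert(ManualBSwap(0x11223344u) == 0x44332211u);
      assert(ManualBSwap(0xDEADBEEFu) == 0xEFBEADDEu);
      return 0;
    }
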
-/// We have an expression of the form (A&C)|(B&D). Check if A is (cond?-1:0)
-/// and either B or D is ~(cond?-1,0) or (cond?0,-1), then we can simplify this
-/// expression to "cond ? C : D or B".
-static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
- Value *C, Value *D) {
- // If A is not a select of -1/0, this cannot match.
- Value *Cond = nullptr;
- if (!match(A, m_SExt(m_Value(Cond))) ||
- !Cond->getType()->isIntegerTy(1))
+/// If all elements of two constant vectors are 0/-1 and inverses, return true.
+static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
+ unsigned NumElts = C1->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *EltC1 = C1->getAggregateElement(i);
+ Constant *EltC2 = C2->getAggregateElement(i);
+ if (!EltC1 || !EltC2)
+ return false;
+
+ // One element must be all ones, and the other must be all zeros.
+ // FIXME: Allow undef elements.
+ if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
+ (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
+ return false;
+ }
+ return true;
+}
+
+/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
+/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
+/// B, it can be used as the condition operand of a select instruction.
+static Value *getSelectCondition(Value *A, Value *B,
+ InstCombiner::BuilderTy &Builder) {
+ // If these are scalars or vectors of i1, A can be used directly.
+ Type *Ty = A->getType();
+ if (match(A, m_Not(m_Specific(B))) && Ty->getScalarType()->isIntegerTy(1))
+ return A;
+
+ // If A and B are sign-extended, look through the sexts to find the booleans.
+ Value *Cond;
+ if (match(A, m_SExt(m_Value(Cond))) &&
+ Cond->getType()->getScalarType()->isIntegerTy(1) &&
+ match(B, m_CombineOr(m_Not(m_SExt(m_Specific(Cond))),
+ m_SExt(m_Not(m_Specific(Cond))))))
+ return Cond;
+
+ // All scalar (and most vector) possibilities should be handled now.
+ // Try more matches that only apply to non-splat constant vectors.
+ if (!Ty->isVectorTy())
return nullptr;
- // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
- if (match(D, m_Not(m_SExt(m_Specific(Cond)))))
- return SelectInst::Create(Cond, C, B);
- if (match(D, m_SExt(m_Not(m_Specific(Cond)))))
- return SelectInst::Create(Cond, C, B);
-
- // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
- if (match(B, m_Not(m_SExt(m_Specific(Cond)))))
- return SelectInst::Create(Cond, C, D);
- if (match(B, m_SExt(m_Not(m_Specific(Cond)))))
- return SelectInst::Create(Cond, C, D);
+ // If both operands are constants, see if the constants are inverse bitmasks.
+ Constant *AC, *BC;
+ if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
+ areInverseVectorBitmasks(AC, BC))
+ return ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
+
+ // If both operands are xor'd with constants using the same sexted boolean
+ // operand, see if the constants are inverse bitmasks.
+ if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
+ match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
+ Cond->getType()->getScalarType()->isIntegerTy(1) &&
+ areInverseVectorBitmasks(AC, BC)) {
+ AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
+ return Builder.CreateXor(Cond, AC);
+ }
+ return nullptr;
+}
+
+/// We have an expression of the form (A & C) | (B & D). Try to simplify this
+/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
+ InstCombiner::BuilderTy &Builder) {
+ // The potential condition of the select may be bitcasted. In that case, look
+ // through its bitcast and the corresponding bitcast of the 'not' condition.
+ Type *OrigType = A->getType();
+ Value *SrcA, *SrcB;
+ if (match(A, m_OneUse(m_BitCast(m_Value(SrcA)))) &&
+ match(B, m_OneUse(m_BitCast(m_Value(SrcB))))) {
+ A = SrcA;
+ B = SrcB;
+ }
+
+ if (Value *Cond = getSelectCondition(A, B, Builder)) {
+ // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ // The bitcasts will either all exist or all not exist. The builder will
+ // not create unnecessary casts if the types already match.
+ Value *BitcastC = Builder.CreateBitCast(C, A->getType());
+ Value *BitcastD = Builder.CreateBitCast(D, A->getType());
+ Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+ return Builder.CreateBitCast(Select, OrigType);
+ }
+
return nullptr;
}
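
A standalone check of the core identity behind getSelectCondition/matchSelectFromAndOr above, on a sign-extended scalar boolean (values are arbitrary; not part of the patch):

    #include <cassert>
    #include <cstdint>

    // (sext(cond) & C) | (~sext(cond) & D) == cond ? C : D
    int main() {
      const int32_t C = 0x1234, D = -77; // arbitrary illustrative values
      for (int Cond = 0; Cond <= 1; ++Cond) {
        int32_t Mask = Cond ? -1 : 0; // sext i1 to i32: all-ones or all-zeros
        int32_t OrForm = (Mask & C) | (~Mask & D);
        int32_t SelectForm = Cond ? C : D;
        assert(OrForm == SelectForm);
      }
      return 0;
    }
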
@@ -1940,6 +2010,27 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+ Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+ Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+ FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+
+ // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
+ // This is a similar transformation to the one in FoldAndOfFCmps.
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) || bool(R & CC1)
+ // = bool((R & CC0) | (R & CC1))
+ // = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS)
+ return getFCmpValue(getFCmpCode(Op0CC) | getFCmpCode(Op1CC), Op0LHS, Op0RHS,
+ Builder);
+
if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
RHS->getPredicate() == FCmpInst::FCMP_UNO &&
LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
@@ -1964,35 +2055,6 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return nullptr;
}
- Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
- Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
- FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
-
- if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
- // Swap RHS operands to match LHS.
- Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
- std::swap(Op1LHS, Op1RHS);
- }
- if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
- // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
- if (Op0CC == Op1CC)
- return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
- if (Op0CC == FCmpInst::FCMP_TRUE || Op1CC == FCmpInst::FCMP_TRUE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
- if (Op0CC == FCmpInst::FCMP_FALSE)
- return RHS;
- if (Op1CC == FCmpInst::FCMP_FALSE)
- return LHS;
- bool Op0Ordered;
- bool Op1Ordered;
- unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
- unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
- if (Op0Ordered == Op1Ordered) {
- // If both are ordered or unordered, return a new fcmp with
- // or'ed predicates.
- return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder);
- }
- }
return nullptr;
}
@@ -2062,14 +2124,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (A&B)|(A&C) -> A&(B|C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -2077,7 +2139,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return &I;
if (Value *V = SimplifyBSwap(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
ConstantInt *C1 = nullptr; Value *X = nullptr;
@@ -2111,23 +2173,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return NV;
}
+ // Given an OR instruction, check to see if this is a bswap.
+ if (Instruction *BSwap = MatchBSwap(I))
+ return BSwap;
+
Value *A = nullptr, *B = nullptr;
ConstantInt *C1 = nullptr, *C2 = nullptr;
- // (A | B) | C and A | (B | C) -> bswap if possible.
- bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
- match(Op1, m_Or(m_Value(), m_Value()));
- // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible.
- bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
- match(Op1, m_LogicalShift(m_Value(), m_Value()));
- // (A & B) | (C & D) -> bswap if possible.
- bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
- match(Op1, m_And(m_Value(), m_Value()));
-
- if (OrOfOrs || OrOfShifts || OrOfAnds)
- if (Instruction *BSwap = MatchBSwapOrBitReverse(I))
- return BSwap;
-
// (X^C)|Y -> (X|Y)^C iff Y&C == 0
if (Op0->hasOneUse() &&
match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
@@ -2207,18 +2259,27 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
}
}
- // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) -> C0 ? A : B, and commuted variants.
- // Don't do this for vector select idioms, the code generator doesn't handle
- // them well yet.
- if (!I.getType()->isVectorTy()) {
- if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
- return Match;
- if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
- return Match;
- if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
- return Match;
- if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
- return Match;
+ // Don't try to form a select if it's unlikely that we'll get rid of at
+ // least one of the operands. A select is generally more expensive than the
+ // 'or' that it is replacing.
+ if (Op0->hasOneUse() || Op1->hasOneUse()) {
+ // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
+ if (Value *V = matchSelectFromAndOr(A, C, B, D, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(A, C, D, B, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, B, D, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, D, B, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, A, C, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, C, A, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, A, C, *Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, C, A, *Builder))
+ return replaceInstUsesWith(I, V);
}
// ((A&~B)|(~A&B)) -> A^B
@@ -2342,7 +2403,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
- return ReplaceInstUsesWith(I, Res);
+ return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
// number of 'or' instructions might have to be created.
@@ -2350,18 +2411,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
- return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+ return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
- return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X));
+ return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
- return ReplaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+ return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
- return ReplaceInstUsesWith(I, Builder->CreateOr(Res, X));
+ return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
}
@@ -2369,48 +2430,17 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = FoldOrOfFCmps(LHS, RHS))
- return ReplaceInstUsesWith(I, Res);
-
- // fold (or (cast A), (cast B)) -> (cast (or A, B))
- if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
- CastInst *Op1C = dyn_cast<CastInst>(Op1);
- if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
- Type *SrcTy = Op0C->getOperand(0)->getType();
- if (SrcTy == Op1C->getOperand(0)->getType() &&
- SrcTy->isIntOrIntVectorTy()) {
- Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0);
-
- if ((!isa<ICmpInst>(Op0COp) || !isa<ICmpInst>(Op1COp)) &&
- // Only do this if the casts both really cause code to be
- // generated.
- ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) &&
- ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) {
- Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName());
- return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
- }
+ return replaceInstUsesWith(I, Res);
- // If this is or(cast(icmp), cast(icmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
- if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
- if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
- return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
-
- // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp))
- if (Value *Res = FoldOrOfFCmps(LHS, RHS))
- return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
- }
- }
- }
+ if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
+ return CastedOr;
- // or(sext(A), B) -> A ? -1 : B where A is an i1
- // or(A, sext(B)) -> B ? -1 : A where B is an i1
- if (match(Op0, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+ // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->getScalarType()->isIntegerTy(1))
return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
- if (match(Op1, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->getScalarType()->isIntegerTy(1))
return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
// Note: If we've gotten to the point of visiting the outer OR, then the
@@ -2447,14 +2477,14 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (A&B)^(A&C) -> A&(B^C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
@@ -2462,7 +2492,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return &I;
if (Value *V = SimplifyBSwap(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Is this a ~ operation?
if (Value *NotOp = dyn_castNotVal(&I)) {
@@ -2731,29 +2761,14 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
bool isSigned = LHS->isSigned() || RHS->isSigned();
- return ReplaceInstUsesWith(I,
+ return replaceInstUsesWith(I,
getNewICmpValue(isSigned, Code, Op0, Op1,
Builder));
}
}
- // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
- if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
- if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
- if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
- Type *SrcTy = Op0C->getOperand(0)->getType();
- if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() &&
- // Only do this if the casts both really cause code to be generated.
- ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0),
- I.getType()) &&
- ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0),
- I.getType())) {
- Value *NewOp = Builder->CreateXor(Op0C->getOperand(0),
- Op1C->getOperand(0), I.getName());
- return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
- }
- }
- }
+ if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
+ return CastedXor;
return Changed ? &I : nullptr;
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 090245d1b22c..8acff91345d6 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -14,6 +14,7 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
@@ -29,8 +30,8 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
-/// getPromotedType - Return the specified type promoted as it would be to pass
-/// though a va_arg area.
+/// Return the specified type promoted as it would be to pass through a va_arg
+/// area.
static Type *getPromotedType(Type *Ty) {
if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
if (ITy->getBitWidth() < 32)
@@ -39,8 +40,8 @@ static Type *getPromotedType(Type *Ty) {
return Ty;
}
-/// reduceToSingleValueType - Given an aggregate type which ultimately holds a
-/// single scalar element, like {{{type}}} or [1 x type], return type.
+/// Given an aggregate type which ultimately holds a single scalar element,
+/// like {{{type}}} or [1 x type], return type.
static Type *reduceToSingleValueType(Type *T) {
while (!T->isSingleValueType()) {
if (StructType *STy = dyn_cast<StructType>(T)) {
@@ -60,6 +61,23 @@ static Type *reduceToSingleValueType(Type *T) {
return T;
}
+/// Return a constant boolean vector that has true elements in all positions
+/// where the input constant data vector has an element with the sign bit set.
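+///
+/// For example (values chosen purely for illustration), an input of
+/// <4 x i32> <i32 -1, i32 1, i32 -7, i32 0> yields
+/// <4 x i1> <i1 true, i1 false, i1 true, i1 false>.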
+static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
+ SmallVector<Constant *, 32> BoolVec;
+ IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
+ for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
+ Constant *Elt = V->getElementAsConstant(I);
+ assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
+ "Unexpected constant data vector element type");
+ bool Sign = V->getElementType()->isIntegerTy()
+ ? cast<ConstantInt>(Elt)->isNegative()
+ : cast<ConstantFP>(Elt)->isNegative();
+ BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
+ }
+ return ConstantVector::get(BoolVec);
+}
+
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, AC, DT);
unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, AC, DT);
@@ -197,7 +215,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
return nullptr;
}
-static Value *SimplifyX86immshift(const IntrinsicInst &II,
+static Value *simplifyX86immShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
bool LogicalShift = false;
bool ShiftLeft = false;
@@ -307,83 +325,216 @@ static Value *SimplifyX86immshift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
-static Value *SimplifyX86extend(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder,
- bool SignExtend) {
- VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType());
- VectorType *DstTy = cast<VectorType>(II.getType());
- unsigned NumDstElts = DstTy->getNumElements();
-
- // Extract a subvector of the first NumDstElts lanes and sign/zero extend.
- SmallVector<int, 8> ShuffleMask;
- for (int i = 0; i != (int)NumDstElts; ++i)
- ShuffleMask.push_back(i);
-
- Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0),
- UndefValue::get(SrcTy), ShuffleMask);
- return SignExtend ? Builder.CreateSExt(SV, DstTy)
- : Builder.CreateZExt(SV, DstTy);
-}
-
-static Value *SimplifyX86insertps(const IntrinsicInst &II,
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
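+//
+// A rough sketch (names and constants are illustrative, not from the tests):
+//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
+//                                              <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+// can become the generic shift
+//   %r = lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>
+// because every per-element shift amount is a known, in-range constant.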
+static Value *simplifyX86varShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
- if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- VectorType *VecTy = cast<VectorType>(II.getType());
- assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
-
- // The immediate permute control byte looks like this:
- // [3:0] - zero mask for each 32-bit lane
- // [5:4] - select one 32-bit destination lane
- // [7:6] - select one 32-bit source lane
-
- uint8_t Imm = CInt->getZExtValue();
- uint8_t ZMask = Imm & 0xf;
- uint8_t DestLane = (Imm >> 4) & 0x3;
- uint8_t SourceLane = (Imm >> 6) & 0x3;
-
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
-
- // If all zero mask bits are set, this was just a weird way to
- // generate a zero vector.
- if (ZMask == 0xf)
- return ZeroVector;
-
- // Initialize by passing all of the first source bits through.
- int ShuffleMask[4] = { 0, 1, 2, 3 };
-
- // We may replace the second operand with the zero vector.
- Value *V1 = II.getArgOperand(1);
-
- if (ZMask) {
- // If the zero mask is being used with a single input or the zero mask
- // overrides the destination lane, this is a shuffle with the zero vector.
- if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
- (ZMask & (1 << DestLane))) {
- V1 = ZeroVector;
- // We may still move 32-bits of the first source vector from one lane
- // to another.
- ShuffleMask[DestLane] = SourceLane;
- // The zero mask may override the previous insert operation.
- for (unsigned i = 0; i < 4; ++i)
- if ((ZMask >> i) & 0x1)
- ShuffleMask[i] = i + 4;
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ return nullptr;
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!CShift)
+ return nullptr;
+
+ auto Vec = II.getArgOperand(0);
+ auto VT = cast<VectorType>(II.getType());
+ auto SVT = VT->getVectorElementType();
+ int NumElts = VT->getNumElements();
+ int BitWidth = SVT->getIntegerBitWidth();
+
+ // Collect each element's shift amount.
+ // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+ bool AnyOutOfRange = false;
+ SmallVector<int, 8> ShiftAmts;
+ for (int I = 0; I < NumElts; ++I) {
+ auto *CElt = CShift->getAggregateElement(I);
+ if (CElt && isa<UndefValue>(CElt)) {
+ ShiftAmts.push_back(-1);
+ continue;
+ }
+
+ auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+ if (!COp)
+ return nullptr;
+
+ // Handle out of range shifts.
+ // If LogicalShift - set to BitWidth (special case).
+ // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+ APInt ShiftVal = COp->getValue();
+ if (ShiftVal.uge(BitWidth)) {
+ AnyOutOfRange = LogicalShift;
+ ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+ continue;
+ }
+
+ ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+ }
+
+ // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
+ // An arithmetic shift should only hit this case if the amounts are all UNDEF.
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+ if (llvm::all_of(ShiftAmts, OutOfRange)) {
+ SmallVector<Constant *, 8> ConstantVec;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0) {
+ ConstantVec.push_back(UndefValue::get(SVT));
} else {
- // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
- return nullptr;
+ assert(LogicalShift && "Logical shift expected");
+ ConstantVec.push_back(ConstantInt::getNullValue(SVT));
}
- } else {
- // Replace the selected destination lane with the selected source lane.
- ShuffleMask[DestLane] = SourceLane + 4;
}
+ return ConstantVector::get(ConstantVec);
+ }
- return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
+ // We can't use a generic logical shift when only some of the shift amounts
+ // are out of range.
+ if (AnyOutOfRange)
+ return nullptr;
+
+ // Build the shift amount constant vector.
+ SmallVector<Constant *, 8> ShiftVecAmts;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0)
+ ShiftVecAmts.push_back(UndefValue::get(SVT));
+ else
+ ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
}
- return nullptr;
+ auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg = II.getArgOperand(0);
+ Type *ResTy = II.getType();
+ Type *ArgTy = Arg->getType();
+
+ // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+ if (isa<UndefValue>(Arg))
+ return Constant::getNullValue(ResTy);
+
+ // We can't easily peek through x86_mmx types.
+ if (!ArgTy->isVectorTy())
+ return nullptr;
+
+ auto *C = dyn_cast<Constant>(Arg);
+ if (!C)
+ return nullptr;
+
+ // Extract signbits of the vector input and pack into integer result.
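+ // As an illustration (constant chosen arbitrarily): for a <4 x float> input
+ // of <float -1.0, float 1.0, float -2.0, float 3.0>, lanes 0 and 2 are
+ // negative, so the packed result is i32 5 (0b0101).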
+ APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
+ for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
+ auto *COp = C->getAggregateElement(I);
+ if (!COp)
+ return nullptr;
+ if (isa<UndefValue>(COp))
+ continue;
+
+ auto *CInt = dyn_cast<ConstantInt>(COp);
+ auto *CFp = dyn_cast<ConstantFP>(COp);
+ if (!CInt && !CFp)
+ return nullptr;
+
+ if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
+ Result.setBit(I);
+ }
+
+ return Constant::getIntegerValue(ResTy, Result);
+}
+
+static Value *simplifyX86insertps(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+ // The immediate permute control byte looks like this:
+ // [3:0] - zero mask for each 32-bit lane
+ // [5:4] - select one 32-bit destination lane
+ // [7:6] - select one 32-bit source lane
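+ //
+ // For example (immediate chosen only for illustration), Imm = 0x10 selects
+ // source lane 0 and destination lane 1 with an empty zero mask, so the
+ // result is <a[0], b[0], a[2], a[3]> for inputs a and b.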
+
+ uint8_t Imm = CInt->getZExtValue();
+ uint8_t ZMask = Imm & 0xf;
+ uint8_t DestLane = (Imm >> 4) & 0x3;
+ uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // If all zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (ZMask == 0xf)
+ return ZeroVector;
+
+ // Initialize by passing all of the first source bits through.
+ uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32-bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
-static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0,
+static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
ConstantInt *CILength, ConstantInt *CIIndex,
InstCombiner::BuilderTy &Builder) {
auto LowConstantHighUndef = [&](uint64_t Val) {
@@ -476,7 +627,7 @@ static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0,
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
-static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
+static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
APInt APLength, APInt APIndex,
InstCombiner::BuilderTy &Builder) {
@@ -571,74 +722,211 @@ static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
return nullptr;
}
-/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
-/// source vectors, unless a zero bit is set. If a zero bit is set,
-/// then ignore that half of the mask and clear that half of the vector.
-static Value *SimplifyX86vperm2(const IntrinsicInst &II,
+/// Attempt to convert pshufb* to shufflevector if the mask is constant.
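+///
+/// For illustration (control bytes invented, not taken from the tests): a
+/// control byte of 0x83 has its sign bit set, so the corresponding result
+/// byte becomes zero; a control byte of 0x03 selects byte 3 from the same
+/// 128-bit lane of the source.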
+static Value *simplifyX86pshufb(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
- if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- VectorType *VecTy = cast<VectorType>(II.getType());
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned NumElts = VecTy->getNumElements();
+ assert((NumElts == 16 || NumElts == 32) &&
+ "Unexpected number of elements in shuffle mask!");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[32] = {NULL};
+
+ // Each byte in the shuffle control mask forms an index to permute the
+ // corresponding byte in the destination operand.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
- // The immediate permute control byte looks like this:
- // [1:0] - select 128 bits from sources for low half of destination
- // [2] - ignore
- // [3] - zero low half of destination
- // [5:4] - select 128 bits from sources for high half of destination
- // [6] - ignore
- // [7] - zero high half of destination
+ int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
- uint8_t Imm = CInt->getZExtValue();
+ // If the most significant bit (bit[7]) of each byte of the shuffle
+ // control mask is set, then zero is written in the result byte.
+ // The zero vector is in the right-hand side of the resulting
+ // shufflevector.
- bool LowHalfZero = Imm & 0x08;
- bool HighHalfZero = Imm & 0x80;
+ // The value of each index for the high 128-bit lane is the least
+ // significant 4 bits of the respective shuffle control byte.
+ Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ }
- // If both zero mask bits are set, this was just a weird way to
- // generate a zero vector.
- if (LowHalfZero && HighHalfZero)
- return ZeroVector;
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = Constant::getNullValue(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
- // If 0 or 1 zero mask bits are set, this is a simple shuffle.
- unsigned NumElts = VecTy->getNumElements();
- unsigned HalfSize = NumElts / 2;
- SmallVector<int, 8> ShuffleMask(NumElts);
+/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
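+///
+/// For illustration (mask value invented): with vpermilvar.pd, a mask element
+/// of 2 has bit 1 set and therefore selects element 1 of its 128-bit lane,
+/// which becomes shuffle index 1 after the shift-down performed below.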
+static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+ assert(NumElts == 8 || NumElts == 4 || NumElts == 2);
- // The high bit of the selection field chooses the 1st or 2nd operand.
- bool LowInputSelect = Imm & 0x02;
- bool HighInputSelect = Imm & 0x20;
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[8] = {NULL};
- // The low bit of the selection field chooses the low or high half
- // of the selected operand.
- bool LowHalfSelect = Imm & 0x01;
- bool HighHalfSelect = Imm & 0x10;
+ // The intrinsics only read one or two bits, clear the rest.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
- // Determine which operand(s) are actually in use for this instruction.
- Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
- Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
- // If needed, replace operands based on zero mask.
- V0 = LowHalfZero ? ZeroVector : V0;
- V1 = HighHalfZero ? ZeroVector : V1;
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(2);
- // Permute low half of result.
- unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
- for (unsigned i = 0; i < HalfSize; ++i)
- ShuffleMask[i] = StartIndex + i;
+ // The PD variants use bit 1 to select the per-lane element index, so
+ // shift down to convert to generic shuffle mask index.
+ if (II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
+ II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
+ Index = Index.lshr(1);
- // Permute high half of result.
- StartIndex = HighHalfSelect ? HalfSize : 0;
- StartIndex += NumElts;
- for (unsigned i = 0; i < HalfSize; ++i)
- ShuffleMask[i + HalfSize] = StartIndex + i;
+ // The _256 variants are a bit trickier since the mask bits always index
+ // into the corresponding 128-bit half. In order to convert to a generic
+ // shuffle, we have to make that explicit.
+ if ((II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
+ II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) &&
+ ((NumElts / 2) <= I)) {
+ Index += APInt(32, NumElts / 2);
+ }
- return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
}
- return nullptr;
+
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = UndefValue::get(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
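+///
+/// For illustration (mask value invented): vpermd reads only the low 3 bits
+/// of each 32-bit mask element, so a mask element of 9 selects source
+/// element 1 (9 & 7).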
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned Size = VecTy->getNumElements();
+ assert(Size == 8 && "Unexpected shuffle mask size");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[8] = {NULL};
+
+ for (unsigned I = 0; I < Size; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
+
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(3);
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ }
+
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = UndefValue::get(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
+/// source vectors, unless a zero bit is set. If a zero bit is set,
+/// then ignore that half of the mask and clear that half of the vector.
+static Value *simplifyX86vperm2(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
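+//
+// For example (immediate chosen only for illustration), Imm = 0x31 selects
+// the high half of the first source for the low half of the result and the
+// high half of the second source for the high half of the result; for
+// <8 x float> operands that is the shuffle mask <4,5,6,7,12,13,14,15>.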
+
+ uint8_t Imm = CInt->getZExtValue();
+
+ bool LowHalfZero = Imm & 0x08;
+ bool HighHalfZero = Imm & 0x80;
+
+ // If both zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (LowHalfZero && HighHalfZero)
+ return ZeroVector;
+
+ // If 0 or 1 zero mask bits are set, this is a simple shuffle.
+ unsigned NumElts = VecTy->getNumElements();
+ unsigned HalfSize = NumElts / 2;
+ SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+
+ // The high bit of the selection field chooses the 1st or 2nd operand.
+ bool LowInputSelect = Imm & 0x02;
+ bool HighInputSelect = Imm & 0x20;
+
+ // The low bit of the selection field chooses the low or high half
+ // of the selected operand.
+ bool LowHalfSelect = Imm & 0x01;
+ bool HighHalfSelect = Imm & 0x10;
+
+ // Determine which operand(s) are actually in use for this instruction.
+ Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+ Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+
+ // If needed, replace operands based on zero mask.
+ V0 = LowHalfZero ? ZeroVector : V0;
+ V1 = HighHalfZero ? ZeroVector : V1;
+
+ // Permute low half of result.
+ unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i] = StartIndex + i;
+
+ // Permute high half of result.
+ StartIndex = HighHalfSelect ? HalfSize : 0;
+ StartIndex += NumElts;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i + HalfSize] = StartIndex + i;
+
+ return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
}
/// Decode XOP integer vector comparison intrinsics.
-static Value *SimplifyX86vpcom(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder, bool IsSigned) {
+static Value *simplifyX86vpcom(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder,
+ bool IsSigned) {
if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
uint64_t Imm = CInt->getZExtValue() & 0x7;
VectorType *VecTy = cast<VectorType>(II.getType());
@@ -667,21 +955,296 @@ static Value *SimplifyX86vpcom(const IntrinsicInst &II,
return ConstantInt::getSigned(VecTy, -1); // TRUE
}
- if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1)))
+ if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0),
+ II.getArgOperand(1)))
return Builder.CreateSExtOrTrunc(Cmp, VecTy);
}
return nullptr;
}
-/// visitCallInst - CallInst simplification. This mostly only handles folding
-/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
-/// the heavy lifting.
-///
+static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+
+ // fmin(x, x) -> x
+ if (Arg0 == Arg1)
+ return Arg0;
+
+ const auto *C1 = dyn_cast<ConstantFP>(Arg1);
+
+ // fmin(x, nan) -> x
+ if (C1 && C1->isNaN())
+ return Arg0;
+
+ // This is the value because if undef were NaN, we would return the other
+ // value and cannot return a NaN unless both operands are.
+ //
+ // fmin(undef, x) -> x
+ if (isa<UndefValue>(Arg0))
+ return Arg1;
+
+ // fmin(x, undef) -> x
+ if (isa<UndefValue>(Arg1))
+ return Arg0;
+
+ Value *X = nullptr;
+ Value *Y = nullptr;
+ if (II.getIntrinsicID() == Intrinsic::minnum) {
+ // fmin(x, fmin(x, y)) -> fmin(x, y)
+ // fmin(y, fmin(x, y)) -> fmin(x, y)
+ if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
+ if (Arg0 == X || Arg0 == Y)
+ return Arg1;
+ }
+
+ // fmin(fmin(x, y), x) -> fmin(x, y)
+ // fmin(fmin(x, y), y) -> fmin(x, y)
+ if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
+ if (Arg1 == X || Arg1 == Y)
+ return Arg0;
+ }
+
+ // TODO: fmin(nnan x, inf) -> x
+ // TODO: fmin(nnan ninf x, flt_max) -> x
+ if (C1 && C1->isInfinity()) {
+ // fmin(x, -inf) -> -inf
+ if (C1->isNegative())
+ return Arg1;
+ }
+ } else {
+ assert(II.getIntrinsicID() == Intrinsic::maxnum);
+ // fmax(x, fmax(x, y)) -> fmax(x, y)
+ // fmax(y, fmax(x, y)) -> fmax(x, y)
+ if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
+ if (Arg0 == X || Arg0 == Y)
+ return Arg1;
+ }
+
+ // fmax(fmax(x, y), x) -> fmax(x, y)
+ // fmax(fmax(x, y), y) -> fmax(x, y)
+ if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
+ if (Arg1 == X || Arg1 == Y)
+ return Arg0;
+ }
+
+ // TODO: fmax(nnan x, -inf) -> x
+ // TODO: fmax(nnan ninf x, -flt_max) -> x
+ if (C1 && C1->isInfinity()) {
+ // fmax(x, inf) -> inf
+ if (!C1->isNegative())
+ return Arg1;
+ }
+ }
+ return nullptr;
+}
+
+static bool maskIsAllOneOrUndef(Value *Mask) {
+ auto *ConstMask = dyn_cast<Constant>(Mask);
+ if (!ConstMask)
+ return false;
+ if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
+ return true;
+ for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
+ ++I) {
+ if (auto *MaskElt = ConstMask->getAggregateElement(I))
+ if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
+ continue;
+ return false;
+ }
+ return true;
+}
+
+static Value *simplifyMaskedLoad(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // If the mask is all ones or undefs, this is a plain vector load of the 1st
+ // argument.
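+ // Sketch (operands are illustrative; the mangled intrinsic name is elided):
+ //   llvm.masked.load(%p, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 1>, %passthru)
+ // is simply "load <4 x i32>, <4 x i32>* %p, align 4".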
+ if (maskIsAllOneOrUndef(II.getArgOperand(2))) {
+ Value *LoadPtr = II.getArgOperand(0);
+ unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
+ return Builder.CreateAlignedLoad(LoadPtr, Alignment, "unmaskedload");
+ }
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, this instruction does nothing.
+ if (ConstMask->isNullValue())
+ return IC.eraseInstFromFunction(II);
+
+ // If the mask is all ones, this is a plain vector store of the 1st argument.
+ if (ConstMask->isAllOnesValue()) {
+ Value *StorePtr = II.getArgOperand(1);
+ unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
+ return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
+ }
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
+ // If the mask is all zeros, return the "passthru" argument of the gather.
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ if (ConstMask && ConstMask->isNullValue())
+ return IC.replaceInstUsesWith(II, II.getArgOperand(3));
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
+ // If the mask is all zeros, a scatter does nothing.
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (ConstMask && ConstMask->isNullValue())
+ return IC.eraseInstFromFunction(II);
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
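+//
+// Roughly (constants are illustrative): a constant x86 mask of
+// <i32 -1, i32 0, i32 -1, i32 0> becomes the generic bool mask
+// <i1 true, i1 false, i1 true, i1 false>, and the resulting llvm.masked.load
+// uses a zero vector as its pass-through value.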
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Special case a zero mask since that's not a ConstantDataVector.
+ // This masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return nullptr;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Special case a zero mask since that's not a ConstantDataVector:
+ // this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return false;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ IC.Builder->CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+}
+
+// Returns true iff the 2 intrinsics have the same operands, limiting the
+// comparison to the first NumOperands.
+static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
+ unsigned NumOperands) {
+ assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
+ assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
+ for (unsigned i = 0; i < NumOperands; i++)
+ if (I.getArgOperand(i) != E.getArgOperand(i))
+ return false;
+ return true;
+}
+
+// Remove trivially empty start/end intrinsic ranges, i.e. a start
+// immediately followed by an end (ignoring debuginfo or other
+// start/end intrinsics in between). As this handles only the most trivial
+// cases, tracking the nesting level is not needed:
+//
+// call @llvm.foo.start(i1 0) ; &I
+// call @llvm.foo.start(i1 0)
+// call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
+// call @llvm.foo.end(i1 0)
+static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
+ unsigned EndID, InstCombiner &IC) {
+ assert(I.getIntrinsicID() == StartID &&
+ "Start intrinsic does not have expected ID");
+ BasicBlock::iterator BI(I), BE(I.getParent()->end());
+ for (++BI; BI != BE; ++BI) {
+ if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
+ if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
+ continue;
+ if (E->getIntrinsicID() == EndID &&
+ haveSameOperands(I, *E, E->getNumArgOperands())) {
+ IC.eraseInstFromFunction(*E);
+ IC.eraseInstFromFunction(I);
+ return true;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
+Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
+ removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
+ removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
+ return nullptr;
+}
+
+/// CallInst simplification. This mostly only handles folding of intrinsic
+/// instructions. For normal calls, it allows visitCallSite to do the heavy
+/// lifting.
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
auto Args = CI.arg_operands();
if (Value *V = SimplifyCall(CI.getCalledValue(), Args.begin(), Args.end(), DL,
TLI, DT, AC))
- return ReplaceInstUsesWith(CI, V);
+ return replaceInstUsesWith(CI, V);
if (isFreeCall(&CI, TLI))
return visitFree(CI);
@@ -705,7 +1268,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// memmove/cpy/set of zero bytes is a noop.
if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
if (NumBytes->isNullValue())
- return EraseInstFromFunction(CI);
+ return eraseInstFromFunction(CI);
if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
if (CI->getZExtValue() == 1) {
@@ -738,7 +1301,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())
- return EraseInstFromFunction(CI);
+ return eraseInstFromFunction(CI);
}
// If we can determine a pointer alignment that is bigger than currently
@@ -754,19 +1317,30 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
- auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth)
- {
+ auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
APInt UndefElts(Width, 0);
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
+ auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getHighBitsSet(Width, DemandedWidth);
+ return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::objectsize: {
uint64_t Size;
- if (getObjectSize(II->getArgOperand(0), Size, DL, TLI))
- return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
+ if (getObjectSize(II->getArgOperand(0), Size, DL, TLI)) {
+ APInt APSize(II->getType()->getIntegerBitWidth(), Size);
+ // Equality check to be sure that `Size` can fit in a value of type
+ // `II->getType()`
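+ // (for example, an i32 result type cannot represent a 2^33-byte object;
+ // the truncating APInt constructor would change the value, so the fold is
+ // skipped in that case).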
+ if (APSize == Size)
+ return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), APSize));
+ }
return nullptr;
}
case Intrinsic::bswap: {
@@ -775,7 +1349,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// bswap(bswap(x)) -> x
if (match(IIOperand, m_BSwap(m_Value(X))))
- return ReplaceInstUsesWith(CI, X);
+ return replaceInstUsesWith(CI, X);
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
@@ -794,18 +1368,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// bitreverse(bitreverse(x)) -> x
if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X))))
- return ReplaceInstUsesWith(CI, X);
+ return replaceInstUsesWith(CI, X);
break;
}
+ case Intrinsic::masked_load:
+ if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, *Builder))
+ return replaceInstUsesWith(CI, SimplifiedMaskedOp);
+ break;
+ case Intrinsic::masked_store:
+ return simplifyMaskedStore(*II, *this);
+ case Intrinsic::masked_gather:
+ return simplifyMaskedGather(*II, *this);
+ case Intrinsic::masked_scatter:
+ return simplifyMaskedScatter(*II, *this);
+
case Intrinsic::powi:
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
// powi(x, 0) -> 1.0
if (Power->isZero())
- return ReplaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0));
+ return replaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0));
// powi(x, 1) -> x
if (Power->isOne())
- return ReplaceInstUsesWith(CI, II->getArgOperand(0));
+ return replaceInstUsesWith(CI, II->getArgOperand(0));
// powi(x, -1) -> 1/x
if (Power->isAllOnesValue())
return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
@@ -825,7 +1410,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
unsigned TrailingZeros = KnownOne.countTrailingZeros();
APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros));
if ((Mask & KnownZero) == Mask)
- return ReplaceInstUsesWith(CI, ConstantInt::get(IT,
+ return replaceInstUsesWith(CI, ConstantInt::get(IT,
APInt(BitWidth, TrailingZeros)));
}
@@ -843,7 +1428,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
unsigned LeadingZeros = KnownOne.countLeadingZeros();
APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros));
if ((Mask & KnownZero) == Mask)
- return ReplaceInstUsesWith(CI, ConstantInt::get(IT,
+ return replaceInstUsesWith(CI, ConstantInt::get(IT,
APInt(BitWidth, LeadingZeros)));
}
@@ -882,84 +1467,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::maxnum: {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
-
- // fmin(x, x) -> x
- if (Arg0 == Arg1)
- return ReplaceInstUsesWith(CI, Arg0);
-
- const ConstantFP *C0 = dyn_cast<ConstantFP>(Arg0);
- const ConstantFP *C1 = dyn_cast<ConstantFP>(Arg1);
-
- // Canonicalize constants into the RHS.
- if (C0 && !C1) {
+ // Canonicalize constants to the RHS.
+ if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) {
II->setArgOperand(0, Arg1);
II->setArgOperand(1, Arg0);
return II;
}
-
- // fmin(x, nan) -> x
- if (C1 && C1->isNaN())
- return ReplaceInstUsesWith(CI, Arg0);
-
- // This is the value because if undef were NaN, we would return the other
- // value and cannot return a NaN unless both operands are.
- //
- // fmin(undef, x) -> x
- if (isa<UndefValue>(Arg0))
- return ReplaceInstUsesWith(CI, Arg1);
-
- // fmin(x, undef) -> x
- if (isa<UndefValue>(Arg1))
- return ReplaceInstUsesWith(CI, Arg0);
-
- Value *X = nullptr;
- Value *Y = nullptr;
- if (II->getIntrinsicID() == Intrinsic::minnum) {
- // fmin(x, fmin(x, y)) -> fmin(x, y)
- // fmin(y, fmin(x, y)) -> fmin(x, y)
- if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
- if (Arg0 == X || Arg0 == Y)
- return ReplaceInstUsesWith(CI, Arg1);
- }
-
- // fmin(fmin(x, y), x) -> fmin(x, y)
- // fmin(fmin(x, y), y) -> fmin(x, y)
- if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
- if (Arg1 == X || Arg1 == Y)
- return ReplaceInstUsesWith(CI, Arg0);
- }
-
- // TODO: fmin(nnan x, inf) -> x
- // TODO: fmin(nnan ninf x, flt_max) -> x
- if (C1 && C1->isInfinity()) {
- // fmin(x, -inf) -> -inf
- if (C1->isNegative())
- return ReplaceInstUsesWith(CI, Arg1);
- }
- } else {
- assert(II->getIntrinsicID() == Intrinsic::maxnum);
- // fmax(x, fmax(x, y)) -> fmax(x, y)
- // fmax(y, fmax(x, y)) -> fmax(x, y)
- if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
- if (Arg0 == X || Arg0 == Y)
- return ReplaceInstUsesWith(CI, Arg1);
- }
-
- // fmax(fmax(x, y), x) -> fmax(x, y)
- // fmax(fmax(x, y), y) -> fmax(x, y)
- if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
- if (Arg1 == X || Arg1 == Y)
- return ReplaceInstUsesWith(CI, Arg0);
- }
-
- // TODO: fmax(nnan x, -inf) -> x
- // TODO: fmax(nnan ninf x, -flt_max) -> x
- if (C1 && C1->isInfinity()) {
- // fmax(x, inf) -> inf
- if (!C1->isNegative())
- return ReplaceInstUsesWith(CI, Arg1);
- }
- }
+ if (Value *V = simplifyMinnumMaxnum(*II))
+ return replaceInstUsesWith(*II, V);
break;
}
case Intrinsic::ppc_altivec_lvx:
@@ -1041,19 +1556,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
- case Intrinsic::x86_sse_storeu_ps:
- case Intrinsic::x86_sse2_storeu_pd:
- case Intrinsic::x86_sse2_storeu_dq:
- // Turn X86 storeu -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
- 16) {
- Type *OpPtrTy =
- PointerType::getUnqual(II->getArgOperand(1)->getType());
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
- return new StoreInst(II->getArgOperand(1), Ptr);
- }
- break;
-
case Intrinsic::x86_vcvtph2ps_128:
case Intrinsic::x86_vcvtph2ps_256: {
auto Arg = II->getArgOperand(0);
@@ -1070,12 +1572,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Constant folding: Convert to generic half to single conversion.
if (isa<ConstantAggregateZero>(Arg))
- return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
+ return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
if (isa<ConstantDataVector>(Arg)) {
auto VectorHalfAsShorts = Arg;
if (RetWidth < ArgWidth) {
- SmallVector<int, 8> SubVecMask;
+ SmallVector<uint32_t, 8> SubVecMask;
for (unsigned i = 0; i != RetWidth; ++i)
SubVecMask.push_back((int)i);
VectorHalfAsShorts = Builder->CreateShuffleVector(
@@ -1087,7 +1589,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
auto VectorHalfs =
Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType);
auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType);
- return ReplaceInstUsesWith(*II, VectorFloats);
+ return replaceInstUsesWith(*II, VectorFloats);
}
// We only use the lowest lanes of the argument.
@@ -1117,6 +1619,107 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ if (Value *V = simplifyX86movmsk(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+ if (MadeChange)
+ return II;
+ break;
+ }
+
+ case Intrinsic::x86_sse_add_ss:
+ case Intrinsic::x86_sse_sub_ss:
+ case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_div_ss:
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_add_sd:
+ case Intrinsic::x86_sse2_sub_sd:
+ case Intrinsic::x86_sse2_mul_sd:
+ case Intrinsic::x86_sse2_div_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ // These intrinsics only demand the lowest element of the second input
+ // vector.
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg1->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // These intrinsics demand the upper elements of the first input vector and
+ // the lowest element of the second input vector.
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsHigh(Arg0, VWidth, VWidth - 1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+ if (MadeChange)
+ return II;
+ break;
+ }
+
// Constant fold ashr( <A x Bi>, Ci ).
// Constant fold lshr( <A x Bi>, Ci ).
// Constant fold shl( <A x Bi>, Ci ).
@@ -1136,8 +1739,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
- if (Value *V = SimplifyX86immshift(*II, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86immShift(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_sse2_psra_d:
@@ -1156,8 +1759,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
case Intrinsic::x86_avx2_psll_w: {
- if (Value *V = SimplifyX86immshift(*II, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86immShift(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
// operand to compute the shift amount.
@@ -1173,35 +1776,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::x86_avx2_pmovsxbd:
- case Intrinsic::x86_avx2_pmovsxbq:
- case Intrinsic::x86_avx2_pmovsxbw:
- case Intrinsic::x86_avx2_pmovsxdq:
- case Intrinsic::x86_avx2_pmovsxwd:
- case Intrinsic::x86_avx2_pmovsxwq:
- if (Value *V = SimplifyX86extend(*II, *Builder, true))
- return ReplaceInstUsesWith(*II, V);
- break;
-
- case Intrinsic::x86_sse41_pmovzxbd:
- case Intrinsic::x86_sse41_pmovzxbq:
- case Intrinsic::x86_sse41_pmovzxbw:
- case Intrinsic::x86_sse41_pmovzxdq:
- case Intrinsic::x86_sse41_pmovzxwd:
- case Intrinsic::x86_sse41_pmovzxwq:
- case Intrinsic::x86_avx2_pmovzxbd:
- case Intrinsic::x86_avx2_pmovzxbq:
- case Intrinsic::x86_avx2_pmovzxbw:
- case Intrinsic::x86_avx2_pmovzxdq:
- case Intrinsic::x86_avx2_pmovzxwd:
- case Intrinsic::x86_avx2_pmovzxwq:
- if (Value *V = SimplifyX86extend(*II, *Builder, false))
- return ReplaceInstUsesWith(*II, V);
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ if (Value *V = simplifyX86varShift(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_sse41_insertps:
- if (Value *V = SimplifyX86insertps(*II, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86insertps(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_sse4a_extrq: {
@@ -1223,19 +1814,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
: nullptr;
// Attempt to simplify to a constant, shuffle vector or EXTRQI call.
- if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
+ return replaceInstUsesWith(*II, V);
// EXTRQ only uses the lowest 64-bits of the first 128-bit vector
// operands and the lowest 16-bits of the second.
+ bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
II->setArgOperand(0, V);
- return II;
+ MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
II->setArgOperand(1, V);
- return II;
+ MadeChange = true;
}
+ if (MadeChange)
+ return II;
break;
}
@@ -1252,8 +1846,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));
// Attempt to simplify to a constant or shuffle vector.
- if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
+ return replaceInstUsesWith(*II, V);
// EXTRQI only uses the lowest 64-bits of the first 128-bit vector
// operand.
@@ -1281,11 +1875,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Attempt to simplify to a constant, shuffle vector or INSERTQI call.
if (CI11) {
- APInt V11 = CI11->getValue();
+ const APInt &V11 = CI11->getValue();
APInt Len = V11.zextOrTrunc(6);
APInt Idx = V11.lshr(8).zextOrTrunc(6);
- if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
+ return replaceInstUsesWith(*II, V);
}
// INSERTQ only uses the lowest 64-bits of the first 128-bit vector
@@ -1317,21 +1911,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (CILength && CIIndex) {
APInt Len = CILength->getValue().zextOrTrunc(6);
APInt Idx = CIIndex->getValue().zextOrTrunc(6);
- if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
+ return replaceInstUsesWith(*II, V);
}
// INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
// operands.
+ bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
II->setArgOperand(0, V);
- return II;
+ MadeChange = true;
}
-
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
II->setArgOperand(1, V);
- return II;
+ MadeChange = true;
}
+ if (MadeChange)
+ return II;
break;
}
@@ -1352,143 +1948,87 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// fold (blend A, A, Mask) -> A
if (Op0 == Op1)
- return ReplaceInstUsesWith(CI, Op0);
+ return replaceInstUsesWith(CI, Op0);
// Zero Mask - select 1st argument.
if (isa<ConstantAggregateZero>(Mask))
- return ReplaceInstUsesWith(CI, Op0);
+ return replaceInstUsesWith(CI, Op0);
// Constant Mask - select 1st/2nd argument lane based on top bit of mask.
- if (auto C = dyn_cast<ConstantDataVector>(Mask)) {
- auto Tyi1 = Builder->getInt1Ty();
- auto SelectorType = cast<VectorType>(Mask->getType());
- auto EltTy = SelectorType->getElementType();
- unsigned Size = SelectorType->getNumElements();
- unsigned BitWidth =
- EltTy->isFloatTy()
- ? 32
- : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth());
- assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) &&
- "Wrong arguments for variable blend intrinsic");
- SmallVector<Constant *, 32> Selectors;
- for (unsigned I = 0; I < Size; ++I) {
- // The intrinsics only read the top bit
- uint64_t Selector;
- if (BitWidth == 8)
- Selector = C->getElementAsInteger(I);
- else
- Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue();
- Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1)));
- }
- auto NewSelector = ConstantVector::get(Selectors);
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
+ Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
}
break;
}
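Note on the blendv change above: with a constant mask, each lane contributes only its sign bit, so a negative lane selects the second operand and anything else selects the first, which is exactly a per-lane select. The helper getNegativeIsTrueBoolVec is named by the patch but its body is not shown in this hunk; the snippet below is a standalone sketch of that sign-bit rule over plain integers, not the LLVM API.

    #include <cstdint>
    #include <vector>

    // Standalone model: convert a constant blend mask into per-lane booleans.
    // True (sign bit set) means "take the second operand" in the final select.
    std::vector<bool> negativeIsTrueBoolVec(const std::vector<int32_t> &Mask) {
      std::vector<bool> Sel;
      Sel.reserve(Mask.size());
      for (int32_t Lane : Mask)
        Sel.push_back(Lane < 0); // only the top bit of each lane matters
      return Sel;
    }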
case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b: {
- // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant.
- auto *V = II->getArgOperand(1);
- auto *VTy = cast<VectorType>(V->getType());
- unsigned NumElts = VTy->getNumElements();
- assert((NumElts == 16 || NumElts == 32) &&
- "Unexpected number of elements in shuffle mask!");
- // Initialize the resulting shuffle mask to all zeroes.
- uint32_t Indexes[32] = {0};
-
- if (auto *Mask = dyn_cast<ConstantDataVector>(V)) {
- // Each byte in the shuffle control mask forms an index to permute the
- // corresponding byte in the destination operand.
- for (unsigned I = 0; I < NumElts; ++I) {
- int8_t Index = Mask->getElementAsInteger(I);
- // If the most significant bit (bit[7]) of each byte of the shuffle
- // control mask is set, then zero is written in the result byte.
- // The zero vector is in the right-hand side of the resulting
- // shufflevector.
-
- // The value of each index is the least significant 4 bits of the
- // shuffle control byte.
- Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
- }
- } else if (!isa<ConstantAggregateZero>(V))
- break;
-
- // The value of each index for the high 128-bit lane is the least
- // significant 4 bits of the respective shuffle control byte.
- for (unsigned I = 16; I < NumElts; ++I)
- Indexes[I] += I & 0xF0;
-
- auto NewC = ConstantDataVector::get(V->getContext(),
- makeArrayRef(Indexes, NumElts));
- auto V1 = II->getArgOperand(0);
- auto V2 = Constant::getNullValue(II->getType());
- auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
- return ReplaceInstUsesWith(CI, Shuffle);
- }
+ case Intrinsic::x86_avx2_pshuf_b:
+ if (Value *V = simplifyX86pshufb(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
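The pshufb fold is now factored out into simplifyX86pshufb, but the deleted lines above still document the rule: each control byte supplies a 4-bit element index, a set sign bit writes zero, and on the 256-bit form each half shuffles only within its own 128-bit lane. Below is a standalone model of that index computation (ordinary C++, not the LLVM helper), assuming the same convention that indices >= NumElts select from the all-zero second vector.

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Turn a constant pshufb control mask into shufflevector-style indices.
    template <std::size_t NumElts> // 16 for SSE, 32 for AVX2
    std::array<uint32_t, NumElts>
    pshufbMaskToShuffle(const std::array<int8_t, NumElts> &Mask) {
      std::array<uint32_t, NumElts> Idx{};
      for (std::size_t I = 0; I < NumElts; ++I) {
        int8_t C = Mask[I];
        // Sign bit set -> write zero (index into the all-zero second vector);
        // otherwise the low four bits pick the source byte.
        Idx[I] = (C < 0) ? NumElts : static_cast<uint32_t>(C & 0xF);
        // The high 128-bit lane of the AVX2 form only shuffles within itself.
        if (I >= 16)
          Idx[I] += I & 0xF0;
      }
      return Idx;
    }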
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx_vpermilvar_pd:
- case Intrinsic::x86_avx_vpermilvar_pd_256: {
- // Convert vpermil* to shufflevector if the mask is constant.
- Value *V = II->getArgOperand(1);
- unsigned Size = cast<VectorType>(V->getType())->getNumElements();
- assert(Size == 8 || Size == 4 || Size == 2);
- uint32_t Indexes[8];
- if (auto C = dyn_cast<ConstantDataVector>(V)) {
- // The intrinsics only read one or two bits, clear the rest.
- for (unsigned I = 0; I < Size; ++I) {
- uint32_t Index = C->getElementAsInteger(I) & 0x3;
- if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
- II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
- Index >>= 1;
- Indexes[I] = Index;
- }
- } else if (isa<ConstantAggregateZero>(V)) {
- for (unsigned I = 0; I < Size; ++I)
- Indexes[I] = 0;
- } else {
- break;
- }
- // The _256 variants are a bit trickier since the mask bits always index
- // into the corresponding 128 half. In order to convert to a generic
- // shuffle, we have to make that explicit.
- if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
- II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) {
- for (unsigned I = Size / 2; I < Size; ++I)
- Indexes[I] += Size / 2;
- }
- auto NewC =
- ConstantDataVector::get(V->getContext(), makeArrayRef(Indexes, Size));
- auto V1 = II->getArgOperand(0);
- auto V2 = UndefValue::get(V1->getType());
- auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
- return ReplaceInstUsesWith(CI, Shuffle);
- }
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ if (Value *V = simplifyX86vpermilvar(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ if (Value *V = simplifyX86vpermv(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
case Intrinsic::x86_avx_vperm2f128_pd_256:
case Intrinsic::x86_avx_vperm2f128_ps_256:
case Intrinsic::x86_avx_vperm2f128_si_256:
case Intrinsic::x86_avx2_vperm2i128:
- if (Value *V = SimplifyX86vperm2(*II, *Builder))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86vperm2(*II, *Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx_maskload_ps:
+ case Intrinsic::x86_avx_maskload_pd:
+ case Intrinsic::x86_avx_maskload_ps_256:
+ case Intrinsic::x86_avx_maskload_pd_256:
+ case Intrinsic::x86_avx2_maskload_d:
+ case Intrinsic::x86_avx2_maskload_q:
+ case Intrinsic::x86_avx2_maskload_d_256:
+ case Intrinsic::x86_avx2_maskload_q_256:
+ if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::x86_sse2_maskmov_dqu:
+ case Intrinsic::x86_avx_maskstore_ps:
+ case Intrinsic::x86_avx_maskstore_pd:
+ case Intrinsic::x86_avx_maskstore_ps_256:
+ case Intrinsic::x86_avx_maskstore_pd_256:
+ case Intrinsic::x86_avx2_maskstore_d:
+ case Intrinsic::x86_avx2_maskstore_q:
+ case Intrinsic::x86_avx2_maskstore_d_256:
+ case Intrinsic::x86_avx2_maskstore_q_256:
+ if (simplifyX86MaskedStore(*II, *this))
+ return nullptr;
break;
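The new maskload/maskstore cases defer to simplifyX86MaskedLoad / simplifyX86MaskedStore, whose bodies are outside this hunk, so the sketch below only models the x86 semantics those helpers build on (my reading of the ISA, not the patch's code): each mask element contributes its sign bit, so a constant all-zero mask makes a maskload produce zero and a maskstore a no-op, while a mask with every sign bit set degenerates to an ordinary load or store.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Standalone model of AVX maskload: lane I is read only if the sign bit of
    // Mask[I] is set; inactive lanes come back as zero.
    std::vector<int32_t> maskLoadModel(const int32_t *Ptr,
                                       const std::vector<int32_t> &Mask) {
      std::vector<int32_t> Result(Mask.size(), 0);
      for (std::size_t I = 0; I < Mask.size(); ++I)
        if (Mask[I] < 0) // sign bit set -> lane is active
          Result[I] = Ptr[I];
      return Result;
      // Constant all-zero mask          -> zero vector, no memory touched.
      // Mask with every sign bit set    -> equivalent to a plain vector load.
    }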
case Intrinsic::x86_xop_vpcomb:
case Intrinsic::x86_xop_vpcomd:
case Intrinsic::x86_xop_vpcomq:
case Intrinsic::x86_xop_vpcomw:
- if (Value *V = SimplifyX86vpcom(*II, *Builder, true))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86vpcom(*II, *Builder, true))
+ return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_xop_vpcomub:
case Intrinsic::x86_xop_vpcomud:
case Intrinsic::x86_xop_vpcomuq:
case Intrinsic::x86_xop_vpcomuw:
- if (Value *V = SimplifyX86vpcom(*II, *Builder, false))
- return ReplaceInstUsesWith(*II, V);
+ if (Value *V = simplifyX86vpcom(*II, *Builder, false))
+ return replaceInstUsesWith(*II, V);
break;
case Intrinsic::ppc_altivec_vperm:
@@ -1585,7 +2125,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Handle mul by zero first:
if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
- return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+ return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
}
// Check for constant LHS & RHS - in this case we just simplify.
@@ -1597,7 +2137,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
- return ReplaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
+ return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
}
// Couldn't simplify - canonicalize constant to the RHS.
@@ -1615,7 +2155,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::AMDGPU_rcp: {
+ case Intrinsic::amdgcn_rcp: {
if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1.0);
@@ -1624,18 +2164,43 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Only do this if it was exact and therefore not dependent on the
// rounding mode.
if (Status == APFloat::opOK)
- return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
}
break;
}
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp: {
+ Value *Src = II->getArgOperand(0);
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ int Exp;
+ APFloat Significand = frexp(C->getValueAPF(), Exp,
+ APFloat::rmNearestTiesToEven);
+
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) {
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
+ Significand));
+ }
+
+ // Match instruction special case behavior.
+ if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
+ Exp = 0;
+
+ return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
+ }
+
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+ break;
+ }
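The new amdgcn_frexp_mant / amdgcn_frexp_exp folding uses APFloat's frexp and then forces the exponent to 0 for NaN and infinity to match the instruction. For readers unfamiliar with the frexp contract, here is a quick standalone check using the C library (std::frexp, not the APFloat API):

    #include <cassert>
    #include <cmath>

    int main() {
      int Exp = 0;
      // frexp splits x into Mant * 2^Exp with |Mant| in [0.5, 1).
      double Mant = std::frexp(24.0, &Exp);
      assert(Mant == 0.75 && Exp == 5); // 24 = 0.75 * 2^5
      // The patch additionally pins Exp to 0 when the input is NaN or Inf,
      // mirroring the hardware special case noted in the code above.
      return 0;
    }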
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
if (SS->getIntrinsicID() == Intrinsic::stacksave) {
if (&*++SS->getIterator() == II)
- return EraseInstFromFunction(CI);
+ return eraseInstFromFunction(CI);
}
}
@@ -1653,8 +2218,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
// If there is a stackrestore below this one, remove this one.
if (II->getIntrinsicID() == Intrinsic::stackrestore)
- return EraseInstFromFunction(CI);
- // Otherwise, ignore the intrinsic.
+ return eraseInstFromFunction(CI);
+
+ // Bail if we cross over an intrinsic with side effects, such as
+ // llvm.stacksave, llvm.read_register, or llvm.setjmp.
+ if (II->mayHaveSideEffects()) {
+ CannotRemove = true;
+ break;
+ }
} else {
// If we found a non-intrinsic call, we can't remove the stack
// restore.
@@ -1668,42 +2239,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// are no allocas or calls between the restore and the return, nuke the
// restore.
if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
- return EraseInstFromFunction(CI);
+ return eraseInstFromFunction(CI);
break;
}
- case Intrinsic::lifetime_start: {
- // Remove trivially empty lifetime_start/end ranges, i.e. a start
- // immediately followed by an end (ignoring debuginfo or other
- // lifetime markers in between).
- BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end();
- for (++BI; BI != BE; ++BI) {
- if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) {
- if (isa<DbgInfoIntrinsic>(LTE) ||
- LTE->getIntrinsicID() == Intrinsic::lifetime_start)
- continue;
- if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) {
- if (II->getOperand(0) == LTE->getOperand(0) &&
- II->getOperand(1) == LTE->getOperand(1)) {
- EraseInstFromFunction(*LTE);
- return EraseInstFromFunction(*II);
- }
- continue;
- }
- }
- break;
- }
+ case Intrinsic::lifetime_start:
+ if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
+ Intrinsic::lifetime_end, *this))
+ return nullptr;
break;
- }
case Intrinsic::assume: {
+ Value *IIOperand = II->getArgOperand(0);
+ // Remove an assume if it is immediately followed by an identical assume.
+ if (match(II->getNextNode(),
+ m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ return eraseInstFromFunction(CI);
+
// Canonicalize assume(a && b) -> assume(a); assume(b);
// Note: New assumption intrinsics created here are registered by
// the InstCombineIRInserter object.
- Value *IIOperand = II->getArgOperand(0), *A, *B,
- *AssumeIntrinsic = II->getCalledValue();
+ Value *AssumeIntrinsic = II->getCalledValue(), *A, *B;
if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
Builder->CreateCall(AssumeIntrinsic, A, II->getName());
Builder->CreateCall(AssumeIntrinsic, B, II->getName());
- return EraseInstFromFunction(*II);
+ return eraseInstFromFunction(*II);
}
// assume(!(a || b)) -> assume(!a); assume(!b);
if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
@@ -1711,7 +2269,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->getName());
Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(B),
II->getName());
- return EraseInstFromFunction(*II);
+ return eraseInstFromFunction(*II);
}
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
@@ -1728,7 +2286,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (isValidAssumeForContext(II, LI, DT)) {
MDNode *MD = MDNode::get(II->getContext(), None);
LI->setMetadata(LLVMContext::MD_nonnull, MD);
- return EraseInstFromFunction(*II);
+ return eraseInstFromFunction(*II);
}
}
// TODO: apply nonnull return attributes to calls and invokes
@@ -1739,7 +2297,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
APInt KnownZero(1, 0), KnownOne(1, 0);
computeKnownBits(IIOperand, KnownZero, KnownOne, 0, II);
if (KnownOne.isAllOnesValue())
- return EraseInstFromFunction(*II);
+ return eraseInstFromFunction(*II);
break;
}
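Two things change in the assume handling: back-to-back identical assumes are now deduplicated, and the existing splits of assume(a && b) and assume(!(a || b)) into two separate assumes are kept. The second split is just De Morgan's law; a tiny exhaustive check of that identity, for the record:

    #include <cassert>

    int main() {
      // !(a || b) holds exactly when both !a and !b hold, which is why the
      // combined assume can be split into two independent ones.
      for (bool A : {false, true})
        for (bool B : {false, true})
          assert(!(A || B) == (!A && !B));
      return 0;
    }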
@@ -1748,46 +2306,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// facts about the relocate value, while being careful to
// preserve relocation semantics.
Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr();
- auto *GCRelocateType = cast<PointerType>(II->getType());
// Remove the relocation if unused, note that this check is required
// to prevent the cases below from looping forever.
if (II->use_empty())
- return EraseInstFromFunction(*II);
+ return eraseInstFromFunction(*II);
// Undef is undef, even after relocation.
// TODO: provide a hook for this in GCStrategy. This is clearly legal for
// most practical collectors, but there was discussion in the review thread
// about whether it was legal for all possible collectors.
- if (isa<UndefValue>(DerivedPtr)) {
- // gc_relocate is uncasted. Use undef of gc_relocate's type to replace it.
- return ReplaceInstUsesWith(*II, UndefValue::get(GCRelocateType));
- }
+ if (isa<UndefValue>(DerivedPtr))
+ // Use undef of gc_relocate's type to replace it.
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
- // The relocation of null will be null for most any collector.
- // TODO: provide a hook for this in GCStrategy. There might be some weird
- // collector this property does not hold for.
- if (isa<ConstantPointerNull>(DerivedPtr)) {
- // gc_relocate is uncasted. Use null-pointer of gc_relocate's type to replace it.
- return ReplaceInstUsesWith(*II, ConstantPointerNull::get(GCRelocateType));
- }
+ if (auto *PT = dyn_cast<PointerType>(II->getType())) {
+ // The relocation of null will be null for most any collector.
+ // TODO: provide a hook for this in GCStrategy. There might be some
+ // weird collector this property does not hold for.
+ if (isa<ConstantPointerNull>(DerivedPtr))
+ // Use null-pointer of gc_relocate's type to replace it.
+ return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
- // isKnownNonNull -> nonnull attribute
- if (isKnownNonNullAt(DerivedPtr, II, DT, TLI))
- II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
-
- // isDereferenceablePointer -> deref attribute
- if (isDereferenceablePointer(DerivedPtr, DL)) {
- if (Argument *A = dyn_cast<Argument>(DerivedPtr)) {
- uint64_t Bytes = A->getDereferenceableBytes();
- II->addDereferenceableAttr(AttributeSet::ReturnIndex, Bytes);
- }
+ // isKnownNonNull -> nonnull attribute
+ if (isKnownNonNullAt(DerivedPtr, II, DT))
+ II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
}
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
// Canonicalize on the type from the uses to the defs
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
+ break;
}
}
@@ -1800,8 +2350,8 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
return visitCallSite(&II);
}
-/// isSafeToEliminateVarargsCast - If this cast does not affect the value
-/// passed through the varargs area, we can eliminate the use of the cast.
+/// If this cast does not affect the value passed through the varargs area, we
+/// can eliminate the use of the cast.
static bool isSafeToEliminateVarargsCast(const CallSite CS,
const DataLayout &DL,
const CastInst *const CI,
@@ -1833,26 +2383,22 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
return true;
}
-// Try to fold some different type of calls here.
-// Currently we're only working with the checking functions, memcpy_chk,
-// mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
-// strcat_chk and strncat_chk.
Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
if (!CI->getCalledFunction()) return nullptr;
auto InstCombineRAUW = [this](Instruction *From, Value *With) {
- ReplaceInstUsesWith(*From, With);
+ replaceInstUsesWith(*From, With);
};
LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW);
if (Value *With = Simplifier.optimizeCall(CI)) {
++NumSimplified;
- return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
+ return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
}
return nullptr;
}
-static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
+static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
// Strip off at most one level of pointer casts, looking for an alloca. This
// is good enough in practice and simpler than handling any number of casts.
Value *Underlying = TrampMem->stripPointerCasts();
@@ -1891,7 +2437,7 @@ static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
return InitTrampoline;
}
-static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
Value *TrampMem) {
// Visit all the previous instructions in the basic block, and try to find a
// init.trampoline which has a direct path to the adjust.trampoline.
@@ -1913,7 +2459,7 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
// call to llvm.init.trampoline if the call to the trampoline can be optimized
// to a direct call to a function. Otherwise return NULL.
//
-static IntrinsicInst *FindInitTrampoline(Value *Callee) {
+static IntrinsicInst *findInitTrampoline(Value *Callee) {
Callee = Callee->stripPointerCasts();
IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
if (!AdjustTramp ||
@@ -1922,15 +2468,14 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) {
Value *TrampMem = AdjustTramp->getOperand(0);
- if (IntrinsicInst *IT = FindInitTrampolineFromAlloca(TrampMem))
+ if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
return IT;
- if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem))
+ if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
return IT;
return nullptr;
}
-// visitCallSite - Improvements for call and invoke instructions.
-//
+/// Improvements for call and invoke instructions.
Instruction *InstCombiner::visitCallSite(CallSite CS) {
if (isAllocLikeFn(CS.getInstruction(), TLI))
@@ -1945,8 +2490,9 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
- if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) &&
- isKnownNonNullAt(V, CS.getInstruction(), DT, TLI))
+ if (V->getType()->isPointerTy() &&
+ !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ isKnownNonNullAt(V, CS.getInstruction(), DT))
Indices.push_back(ArgNo + 1);
ArgNo++;
}
@@ -1968,7 +2514,16 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
return nullptr;
- if (Function *CalleeF = dyn_cast<Function>(Callee))
+ if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+ // Remove the convergent attr on calls when the callee is not convergent.
+ if (CS.isConvergent() && !CalleeF->isConvergent() &&
+ !CalleeF->isIntrinsic()) {
+ DEBUG(dbgs() << "Removing convergent attr from instr "
+ << CS.getInstruction() << "\n");
+ CS.setNotConvergent();
+ return CS.getInstruction();
+ }
+
// If the call and callee calling conventions don't match, this call must
// be unreachable, as the call is undefined.
if (CalleeF->getCallingConv() != CS.getCallingConv() &&
@@ -1983,9 +2538,9 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// If OldCall does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!OldCall->getType()->isVoidTy())
- ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+ replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
if (isa<CallInst>(OldCall))
- return EraseInstFromFunction(*OldCall);
+ return eraseInstFromFunction(*OldCall);
// We cannot remove an invoke, because it would change the CFG, just
// change the callee to a null pointer.
@@ -1993,12 +2548,13 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
Constant::getNullValue(CalleeF->getType()));
return nullptr;
}
+ }
if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
// If CS does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!CS.getInstruction()->getType()->isVoidTy())
- ReplaceInstUsesWith(*CS.getInstruction(),
+ replaceInstUsesWith(*CS.getInstruction(),
UndefValue::get(CS.getInstruction()->getType()));
if (isa<InvokeInst>(CS.getInstruction())) {
@@ -2013,10 +2569,10 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
CS.getInstruction());
- return EraseInstFromFunction(*CS.getInstruction());
+ return eraseInstFromFunction(*CS.getInstruction());
}
- if (IntrinsicInst *II = FindInitTrampoline(Callee))
+ if (IntrinsicInst *II = findInitTrampoline(Callee))
return transformCallThroughTrampoline(CS, II);
PointerType *PTy = cast<PointerType>(Callee->getType());
@@ -2048,15 +2604,14 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
Instruction *I = tryOptimizeCall(CI);
// If we changed something return the result, etc. Otherwise let
// the fallthrough check.
- if (I) return EraseInstFromFunction(*I);
+ if (I) return eraseInstFromFunction(*I);
}
return Changed ? CS.getInstruction() : nullptr;
}
-// transformConstExprCastCall - If the callee is a constexpr cast of a function,
-// attempt to move the cast to the arguments of the call/invoke.
-//
+/// If the callee is a constexpr cast of a function, attempt to move the cast to
+/// the arguments of the call/invoke.
bool InstCombiner::transformConstExprCastCall(CallSite CS) {
Function *Callee =
dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
@@ -2316,7 +2871,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
if (!Caller->use_empty())
- ReplaceInstUsesWith(*Caller, NV);
+ replaceInstUsesWith(*Caller, NV);
else if (Caller->hasValueHandle()) {
if (OldRetTy == NV->getType())
ValueHandleBase::ValueIsRAUWd(Caller, NV);
@@ -2326,14 +2881,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
ValueHandleBase::ValueIsDeleted(Caller);
}
- EraseInstFromFunction(*Caller);
+ eraseInstFromFunction(*Caller);
return true;
}
-// transformCallThroughTrampoline - Turn a call to a function created by
-// init_trampoline / adjust_trampoline intrinsic pair into a direct call to the
-// underlying function.
-//
+/// Turn a call to a function created by init_trampoline / adjust_trampoline
+/// intrinsic pair into a direct call to the underlying function.
Instruction *
InstCombiner::transformCallThroughTrampoline(CallSite CS,
IntrinsicInst *Tramp) {
@@ -2351,8 +2904,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
"transformCallThroughTrampoline called with incorrect CallSite.");
Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
- PointerType *NestFPTy = cast<PointerType>(NestF->getType());
- FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
+ FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
const AttributeSet &NestAttrs = NestF->getAttributes();
if (!NestAttrs.isEmpty()) {
@@ -2412,7 +2964,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Idx + (Idx >= NestIdx), B));
}
- ++Idx, ++I;
+ ++Idx;
+ ++I;
} while (1);
}
@@ -2446,7 +2999,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original type.
NewTypes.push_back(*I);
- ++Idx, ++I;
+ ++Idx;
+ ++I;
} while (1);
}
@@ -2461,15 +3015,18 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
const AttributeSet &NewPAL =
AttributeSet::get(FTy->getContext(), NewAttrs);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
Instruction *NewCaller;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
NewCaller = InvokeInst::Create(NewCallee,
II->getNormalDest(), II->getUnwindDest(),
- NewArgs);
+ NewArgs, OpBundles);
cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
} else {
- NewCaller = CallInst::Create(NewCallee, NewArgs);
+ NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles);
if (cast<CallInst>(Caller)->isTailCall())
cast<CallInst>(NewCaller)->setTailCall();
cast<CallInst>(NewCaller)->
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0f01d183b1ad..20556157188f 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -149,9 +149,9 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
// New is the allocation instruction, pointer typed. AI is the original
// allocation instruction, also pointer typed. Thus, cast to use is BitCast.
Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
- ReplaceInstUsesWith(AI, NewCast);
+ replaceInstUsesWith(AI, NewCast);
}
- return ReplaceInstUsesWith(CI, New);
+ return replaceInstUsesWith(CI, New);
}
/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
@@ -508,7 +508,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
" to avoid cast: " << CI << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
- return ReplaceInstUsesWith(CI, Res);
+ return replaceInstUsesWith(CI, Res);
}
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
@@ -532,7 +532,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// If the shift amount is larger than the size of A, then the result is
// known to be zero because all the input bits got shifted out.
if (Cst->getZExtValue() >= ASize)
- return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy));
+ return replaceInstUsesWith(CI, Constant::getNullValue(DestTy));
// Since we're doing an lshr and a zero extend, and know that the shift
// amount is smaller than ASize, it is always safe to do the shift in A's
@@ -606,7 +606,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
In = Builder->CreateXor(In, One, In->getName() + ".not");
}
- return ReplaceInstUsesWith(CI, In);
+ return replaceInstUsesWith(CI, In);
}
// zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
@@ -636,7 +636,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
Constant *Res = ConstantInt::get(Type::getInt1Ty(CI.getContext()),
isNE);
Res = ConstantExpr::getZExt(Res, CI.getType());
- return ReplaceInstUsesWith(CI, Res);
+ return replaceInstUsesWith(CI, Res);
}
uint32_t ShAmt = KnownZeroMask.logBase2();
@@ -654,7 +654,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
}
if (CI.getType() == In->getType())
- return ReplaceInstUsesWith(CI, In);
+ return replaceInstUsesWith(CI, In);
return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
}
}
@@ -694,7 +694,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
Result = Builder->CreateXor(Result, ConstantInt::get(ITy, 1));
Result->takeName(ICI);
- return ReplaceInstUsesWith(CI, Result);
+ return replaceInstUsesWith(CI, Result);
}
}
}
@@ -872,7 +872,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
APInt::getHighBitsSet(DestBitSize,
DestBitSize-SrcBitsKept),
0, &CI))
- return ReplaceInstUsesWith(CI, Res);
+ return replaceInstUsesWith(CI, Res);
// We need to emit an AND to clear the high bits.
Constant *C = ConstantInt::get(Res->getType(),
@@ -986,7 +986,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
if (Pred == ICmpInst::ICMP_SGT)
In = Builder->CreateNot(In, In->getName()+".not");
- return ReplaceInstUsesWith(CI, In);
+ return replaceInstUsesWith(CI, In);
}
}
@@ -1009,7 +1009,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
Value *V = Pred == ICmpInst::ICMP_NE ?
ConstantInt::getAllOnesValue(CI.getType()) :
ConstantInt::getNullValue(CI.getType());
- return ReplaceInstUsesWith(CI, V);
+ return replaceInstUsesWith(CI, V);
}
if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
@@ -1041,7 +1041,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
}
if (CI.getType() == In->getType())
- return ReplaceInstUsesWith(CI, In);
+ return replaceInstUsesWith(CI, In);
return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
}
}
@@ -1137,7 +1137,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
if (KnownZero) {
Value *ZExt = Builder->CreateZExt(Src, DestTy);
- return ReplaceInstUsesWith(CI, ZExt);
+ return replaceInstUsesWith(CI, ZExt);
}
// Attempt to extend the entire input expression tree to the destination
@@ -1158,7 +1158,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// If the high bits are already filled with sign bit, just replace this
// cast with the result.
if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
- return ReplaceInstUsesWith(CI, Res);
+ return replaceInstUsesWith(CI, Res);
// We need to emit a shl + ashr to do the sign extend.
Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
@@ -1400,8 +1400,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
Function *Overload = Intrinsic::getDeclaration(
CI.getModule(), II->getIntrinsicID(), IntrinsicType);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+
Value *Args[] = { InnerTrunc };
- return CallInst::Create(Overload, Args, II->getName());
+ return CallInst::Create(Overload, Args, OpBundles, II->getName());
}
}
}
@@ -1451,7 +1454,7 @@ Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) {
if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits())
return new TruncInst(SrcI, FITy);
if (SrcTy == FITy)
- return ReplaceInstUsesWith(FI, SrcI);
+ return replaceInstUsesWith(FI, SrcI);
return new BitCastInst(SrcI, FITy);
}
return nullptr;
@@ -1796,7 +1799,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// Get rid of casts from one type to the same type. These are useless and can
// be replaced by the operand.
if (DestTy == Src->getType())
- return ReplaceInstUsesWith(CI, Src);
+ return replaceInstUsesWith(CI, Src);
if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
PointerType *SrcPTy = cast<PointerType>(SrcTy);
@@ -1811,6 +1814,13 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
return V;
+ // When the type pointed to is not sized, the cast cannot be
+ // turned into a gep.
+ Type *PointeeType =
+ cast<PointerType>(Src->getType()->getScalarType())->getElementType();
+ if (!PointeeType->isSized())
+ return nullptr;
+
// If the source and destination are pointers, and this cast is equivalent
// to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
// This can enhance SROA and other transforms that want type-safe pointers.
@@ -1854,7 +1864,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// assemble the elements of the vector manually. Try to rip the code out
// and replace it with insertelements.
if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
- return ReplaceInstUsesWith(CI, V);
+ return replaceInstUsesWith(CI, V);
}
}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d9311a343ead..bfd73f4bbac5 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -13,18 +13,19 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;
using namespace PatternMatch;
@@ -55,8 +56,8 @@ static bool HasAddOverflow(ConstantInt *Result,
return Result->getValue().slt(In1->getValue());
}
-/// AddWithOverflow - Compute Result = In1+In2, returning true if the result
-/// overflowed for this type.
+/// Compute Result = In1+In2, returning true if the result overflowed for this
+/// type.
static bool AddWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getAdd(In1, In2);
@@ -90,8 +91,8 @@ static bool HasSubOverflow(ConstantInt *Result,
return Result->getValue().sgt(In1->getValue());
}
-/// SubWithOverflow - Compute Result = In1-In2, returning true if the result
-/// overflowed for this type.
+/// Compute Result = In1-In2, returning true if the result overflowed for this
+/// type.
static bool SubWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getSub(In1, In2);
@@ -113,13 +114,21 @@ static bool SubWithOverflow(Constant *&Result, Constant *In1,
IsSigned);
}
-/// isSignBitCheck - Given an exploded icmp instruction, return true if the
-/// comparison only checks the sign bit. If it only checks the sign bit, set
-/// TrueIfSigned if the result of the comparison is true when the input value is
-/// signed.
-static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+/// Given an icmp instruction, return true if any use of this comparison is a
+/// branch on sign bit comparison.
+static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) {
+ for (auto *U : I.users())
+ if (isa<BranchInst>(U))
+ return isSignBit;
+ return false;
+}
+
+/// Given an exploded icmp instruction, return true if the comparison only
+/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the
+/// result of the comparison is true when the input value is signed.
+static bool isSignBitCheck(ICmpInst::Predicate Pred, ConstantInt *RHS,
bool &TrueIfSigned) {
- switch (pred) {
+ switch (Pred) {
case ICmpInst::ICMP_SLT: // True if LHS s< 0
TrueIfSigned = true;
return RHS->isZero();
@@ -145,21 +154,21 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
/// Returns true if the exploded icmp can be expressed as a signed comparison
/// to zero and updates the predicate accordingly.
/// The signedness of the comparison is preserved.
-static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) {
- if (!ICmpInst::isSigned(pred))
+static bool isSignTest(ICmpInst::Predicate &Pred, const ConstantInt *RHS) {
+ if (!ICmpInst::isSigned(Pred))
return false;
if (RHS->isZero())
- return ICmpInst::isRelational(pred);
+ return ICmpInst::isRelational(Pred);
if (RHS->isOne()) {
- if (pred == ICmpInst::ICMP_SLT) {
- pred = ICmpInst::ICMP_SLE;
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SLE;
return true;
}
} else if (RHS->isAllOnesValue()) {
- if (pred == ICmpInst::ICMP_SGT) {
- pred = ICmpInst::ICMP_SGE;
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Pred = ICmpInst::ICMP_SGE;
return true;
}
}
@@ -167,19 +176,18 @@ static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) {
return false;
}
-// isHighOnes - Return true if the constant is of the form 1+0+.
-// This is the same as lowones(~X).
+/// Return true if the constant is of the form 1+0+. This is the same as
+/// lowones(~X).
static bool isHighOnes(const ConstantInt *CI) {
return (~CI->getValue() + 1).isPowerOf2();
}
-/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
-/// set of known zero and one bits, compute the maximum and minimum values that
-/// could have the specified known zero and known one bits, returning them in
-/// min/max.
-static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
- const APInt& KnownOne,
- APInt& Min, APInt& Max) {
+/// Given a signed integer type and a set of known zero and one bits, compute
+/// the maximum and minimum values that could have the specified known zero and
+/// known one bits, returning them in Min/Max.
+static void ComputeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+ const APInt &KnownOne,
+ APInt &Min, APInt &Max) {
assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
KnownZero.getBitWidth() == Min.getBitWidth() &&
KnownZero.getBitWidth() == Max.getBitWidth() &&
@@ -197,10 +205,9 @@ static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
}
}
-// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
-// a set of known zero and one bits, compute the maximum and minimum values that
-// could have the specified known zero and known one bits, returning them in
-// min/max.
+/// Given an unsigned integer type and a set of known zero and one bits, compute
+/// the maximum and minimum values that could have the specified known zero and
+/// known one bits, returning them in Min/Max.
static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
const APInt &KnownOne,
APInt &Min, APInt &Max) {
@@ -216,14 +223,14 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
Max = KnownOne|UnknownBits;
}
-/// FoldCmpLoadFromIndexedGlobal - Called we see this pattern:
+/// This is called when we see this pattern:
/// cmp pred (load (gep GV, ...)), cmpcst
-/// where GV is a global variable with a constant initializer. Try to simplify
-/// this into some simple computation that does not need the load. For example
+/// where GV is a global variable with a constant initializer. Try to simplify
+/// this into some simple computation that does not need the load. For example
/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
///
/// If AndCst is non-null, then the loaded value is masked with that constant
-/// before doing the comparison. This handles cases like "A[i]&4 == 0".
+/// before doing the comparison. This handles cases like "A[i]&4 == 0".
Instruction *InstCombiner::
FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
CmpInst &ICI, ConstantInt *AndCst) {
@@ -401,7 +408,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
if (SecondTrueElement != Overdefined) {
// None true -> false.
if (FirstTrueElement == Undefined)
- return ReplaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder->getFalse());
Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
@@ -421,7 +428,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
if (SecondFalseElement != Overdefined) {
// None false -> true.
if (FirstFalseElement == Undefined)
- return ReplaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder->getTrue());
Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
@@ -492,12 +499,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
return nullptr;
}
-/// EvaluateGEPOffsetExpression - Return a value that can be used to compare
-/// the *offset* implied by a GEP to zero. For example, if we have &A[i], we
-/// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can
-/// be complex, and scales are involved. The above expression would also be
-/// legal to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).
-/// This later form is less amenable to optimization though, and we are allowed
+/// Return a value that can be used to compare the *offset* implied by a GEP to
+/// zero. For example, if we have &A[i], we want to return 'i' for
+/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
+/// are involved. The above expression would also be legal to codegen as
+/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This latter form is less amenable to optimization though, and we are allowed
/// to generate the first by knowing that pointer arithmetic doesn't overflow.
///
/// If we can't emit an optimized form for this expression, this returns null.
@@ -595,8 +602,323 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset");
}
-/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
-/// else. At this point we know that the GEP is on the LHS of the comparison.
+/// Returns true if we can rewrite Start as a GEP with pointer Base
+/// and some integer offset. The nodes that need to be re-written
+/// for this transformation will be added to Explored.
+static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ SmallVector<Value *, 16> WorkList(1, Start);
+ Explored.insert(Base);
+
+ // The following traversal gives us an order which can be used
+ // when doing the final transformation. Since in the final
+ // transformation we create the PHI replacement instructions first,
+ // we don't have to get them in any particular order.
+ //
+ // However, for other instructions we will have to traverse the
+ // operands of an instruction first, which means that we have to
+ // do a post-order traversal.
+ while (!WorkList.empty()) {
+ SetVector<PHINode *> PHIs;
+
+ while (!WorkList.empty()) {
+ if (Explored.size() >= 100)
+ return false;
+
+ Value *V = WorkList.back();
+
+ if (Explored.count(V) != 0) {
+ WorkList.pop_back();
+ continue;
+ }
+
+ if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
+ !isa<GEPOperator>(V) && !isa<PHINode>(V))
+ // We've found some value that we can't explore which is different from
+ // the base. Therefore we can't do this transformation.
+ return false;
+
+ if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
+ auto *CI = dyn_cast<CastInst>(V);
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ if (Explored.count(CI->getOperand(0)) == 0)
+ WorkList.push_back(CI->getOperand(0));
+ }
+
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ // We're limiting the GEP to having one index. This will preserve
+ // the original pointer type. We could handle more cases in the
+ // future.
+ if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
+ GEP->getType() != Start->getType())
+ return false;
+
+ if (Explored.count(GEP->getOperand(0)) == 0)
+ WorkList.push_back(GEP->getOperand(0));
+ }
+
+ if (WorkList.back() == V) {
+ WorkList.pop_back();
+ // We've finished visiting this node, mark it as such.
+ Explored.insert(V);
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // We cannot transform PHIs on unsplittable basic blocks.
+ if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
+ return false;
+ Explored.insert(PN);
+ PHIs.insert(PN);
+ }
+ }
+
+ // Explore the PHI nodes further.
+ for (auto *PN : PHIs)
+ for (Value *Op : PN->incoming_values())
+ if (Explored.count(Op) == 0)
+ WorkList.push_back(Op);
+ }
+
+ // Make sure that we can do this. Since we can't insert GEPs in a basic
+ // block before a PHI node, we can't easily do this transformation if
+ // we have PHI node users of transformed instructions.
+ for (Value *Val : Explored) {
+ for (Value *Use : Val->uses()) {
+
+ auto *PHI = dyn_cast<PHINode>(Use);
+ auto *Inst = dyn_cast<Instruction>(Val);
+
+ if (Inst == Base || Inst == PHI || !Inst || !PHI ||
+ Explored.count(PHI) == 0)
+ continue;
+
+ if (PHI->getParent() == Inst->getParent())
+ return false;
+ }
+ }
+ return true;
+}
+
+// Sets the appropriate insert point on Builder where we can add
+// a replacement Instruction for V (if that is possible).
+static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
+ bool Before = true) {
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
+ return;
+ }
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!Before)
+ I = &*std::next(I->getIterator());
+ Builder.SetInsertPoint(I);
+ return;
+ }
+ if (auto *A = dyn_cast<Argument>(V)) {
+ // Set the insertion point in the entry block.
+ BasicBlock &Entry = A->getParent()->getEntryBlock();
+ Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
+ return;
+ }
+ // Otherwise, this is a constant and we don't need to set a new
+ // insertion point.
+ assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
+}
+
+/// Returns a re-written value of Start as an indexed GEP using Base as a
+/// pointer.
+static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ // Perform all the substitutions. This is a bit tricky because we can
+ // have cycles in our use-def chains.
+ // 1. Create the PHI nodes without any incoming values.
+ // 2. Create all the other values.
+ // 3. Add the edges for the PHI nodes.
+ // 4. Emit GEPs to get the original pointers.
+ // 5. Remove the original instructions.
+ Type *IndexType = IntegerType::get(
+ Base->getContext(), DL.getPointerTypeSizeInBits(Start->getType()));
+
+ DenseMap<Value *, Value *> NewInsts;
+ NewInsts[Base] = ConstantInt::getNullValue(IndexType);
+
+ // Create the new PHI nodes, without adding any incoming values.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // Create empty phi nodes. This avoids cyclic dependencies when creating
+ // the remaining instructions.
+ if (auto *PHI = dyn_cast<PHINode>(Val))
+ NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
+ PHI->getName() + ".idx", PHI);
+ }
+ IRBuilder<> Builder(Base->getContext());
+
+ // Create all the other instructions.
+ for (Value *Val : Explored) {
+
+ if (NewInsts.find(Val) != NewInsts.end())
+ continue;
+
+ if (auto *CI = dyn_cast<CastInst>(Val)) {
+ NewInsts[CI] = NewInsts[CI->getOperand(0)];
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
+ Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
+ : GEP->getOperand(1);
+ setInsertionPoint(Builder, GEP);
+ // Indices might need to be sign extended. GEPs will magically do
+ // this, but we need to do it ourselves here.
+ if (Index->getType()->getScalarSizeInBits() !=
+ NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
+ Index = Builder.CreateSExtOrTrunc(
+ Index, NewInsts[GEP->getOperand(0)]->getType(),
+ GEP->getOperand(0)->getName() + ".sext");
+ }
+
+ auto *Op = NewInsts[GEP->getOperand(0)];
+ if (isa<ConstantInt>(Op) && dyn_cast<ConstantInt>(Op)->isZero())
+ NewInsts[GEP] = Index;
+ else
+ NewInsts[GEP] = Builder.CreateNSWAdd(
+ Op, Index, GEP->getOperand(0)->getName() + ".add");
+ continue;
+ }
+ if (isa<PHINode>(Val))
+ continue;
+
+ llvm_unreachable("Unexpected instruction type");
+ }
+
+ // Add the incoming values to the PHI nodes.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // All the instructions have been created, we can now add edges to the
+ // phi nodes.
+ if (auto *PHI = dyn_cast<PHINode>(Val)) {
+ PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+ Value *NewIncoming = PHI->getIncomingValue(I);
+
+ if (NewInsts.find(NewIncoming) != NewInsts.end())
+ NewIncoming = NewInsts[NewIncoming];
+
+ NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
+ }
+ }
+ }
+
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+
+ // Depending on the type, for external users we have to emit
+ // a GEP or a GEP + ptrtoint.
+ setInsertionPoint(Builder, Val, false);
+
+ // If required, create an inttoptr instruction for Base.
+ Value *NewBase = Base;
+ if (!Base->getType()->isPointerTy())
+ NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
+ Start->getName() + "to.ptr");
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Start->getType()->getPointerElementType(), NewBase,
+ makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+
+ if (!Val->getType()->isPointerTy()) {
+ Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
+ Val->getName() + ".conv");
+ GEP = Cast;
+ }
+ Val->replaceAllUsesWith(GEP);
+ }
+
+ return NewInsts[Start];
+}
+
+/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
+/// the input Value as a constant indexed GEP. Returns a pair containing
+/// the GEP's Pointer and Index.
+static std::pair<Value *, Value *>
+getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+ Type *IndexType = IntegerType::get(V->getContext(),
+ DL.getPointerTypeSizeInBits(V->getType()));
+
+ Constant *Index = ConstantInt::getNullValue(IndexType);
+ while (true) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ // We accept only inbounds GEPs here to exclude the possibility of
+ // overflow.
+ if (!GEP->isInBounds())
+ break;
+ if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
+ GEP->getType() == V->getType()) {
+ V = GEP->getOperand(0);
+ Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
+ Index = ConstantExpr::getAdd(
+ Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+ continue;
+ }
+ break;
+ }
+ if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ break;
+ }
+ return {V, Index};
+}
+
+/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
+/// We can look through PHIs, GEPs and casts in order to determine a common base
+/// between GEPLHS and RHS.
+static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ const DataLayout &DL) {
+ if (!GEPLHS->hasAllConstantIndices())
+ return nullptr;
+
+ Value *PtrBase, *Index;
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+
+ // The set of nodes that will take part in this transformation.
+ SetVector<Value *> Nodes;
+
+ if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ return nullptr;
+
+ // We know we can re-write this as
+ // (gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)
+ // Since we've only looked through inbounds GEPs, we know that we
+ // can't have overflow on either side. We can therefore re-write
+ // this as:
+ // OFFSET1 cmp OFFSET2
+ Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+
+ // rewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
+ // GEP having PtrBase as the pointer base, and has returned in NewRHS the
+ // offset. Since Index is the offset of LHS to the base pointer, we will now
+ // compare the offsets instead of comparing the pointers.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
+}
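To make the new indexed-compare path concrete: transformToIndexedCompare fires when a constant-indexed GEP is compared against a value that can be traced back to the same base pointer through inbounds GEPs, PHIs and no-op casts, and it then compares integer offsets instead of pointers. A hypothetical source-level shape that lowers to such IR (illustration only, not taken from the patch or its tests):

    // The exit test "P != Buf + 16" compares a constant-indexed GEP of Buf
    // against a pointer PHI that is also derived from Buf; rewriting both
    // sides as offsets from Buf turns the test into an integer compare of the
    // induction offset against 16.
    int sumFirst16(const int *Buf) {
      int Sum = 0;
      for (const int *P = Buf; P != Buf + 16; ++P)
        Sum += *P;
      return Sum;
    }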
+
+/// Fold comparisons between a GEP instruction and something else. At this point
+/// we know that the GEP is on the LHS of the comparison.
Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond,
Instruction &I) {
@@ -670,12 +992,13 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond),
LOffset, ROffset);
- return ReplaceInstUsesWith(I, Cmp);
+ return replaceInstUsesWith(I, Cmp);
}
// Otherwise, the base pointers are different and the indices are
- // different, bail out.
- return nullptr;
+ // different. Try to convert this to an indexed compare by looking through
+ // PHIs/casts.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
}
// If one of the GEPs has all zero indices, recurse.
@@ -706,7 +1029,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
if (NumDifferences == 0) // SAME GEP?
- return ReplaceInstUsesWith(I, // No comparison is needed here.
+ return replaceInstUsesWith(I, // No comparison is needed here.
Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond)));
else if (NumDifferences == 1 && GEPsInBounds) {
@@ -727,7 +1050,10 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
}
}
- return nullptr;
+
+ // Try to convert this to an indexed compare by looking through PHIs/casts as a
+ // last resort.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
}
Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
@@ -802,12 +1128,12 @@ Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca,
}
Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
- return ReplaceInstUsesWith(
+ return replaceInstUsesWith(
ICI,
ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
}
-/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
+/// Fold "icmp pred (X+CI), X".
Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
Value *X, ConstantInt *CI,
ICmpInst::Predicate Pred) {
@@ -855,8 +1181,8 @@ Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
}
-/// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS
-/// and CmpRHS are both known to be integer constants.
+/// Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS and CmpRHS are
+/// both known to be integer constants.
Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
ConstantInt *DivRHS) {
ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
@@ -898,8 +1224,8 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
// Get the ICmp opcode
ICmpInst::Predicate Pred = ICI.getPredicate();
- /// If the division is known to be exact, then there is no remainder from the
- /// divide, so the covered range size is unit, otherwise it is the divisor.
+ // If the division is known to be exact, then there is no remainder from the
+ // divide, so the covered range size is unit, otherwise it is the divisor.
ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS;
// Figure out the interval that is being checked. For example, a comparison
@@ -973,46 +1299,46 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
default: llvm_unreachable("Unhandled icmp opcode!");
case ICmpInst::ICMP_EQ:
if (LoOverflow && HiOverflow)
- return ReplaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder->getFalse());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
ICmpInst::ICMP_UGE, X, LoBound);
if (LoOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
ICmpInst::ICMP_ULT, X, HiBound);
- return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+ return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
DivIsSigned, true));
case ICmpInst::ICMP_NE:
if (LoOverflow && HiOverflow)
- return ReplaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder->getTrue());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
ICmpInst::ICMP_ULT, X, LoBound);
if (LoOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
ICmpInst::ICMP_UGE, X, HiBound);
- return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+ return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
DivIsSigned, false));
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_SLT:
if (LoOverflow == +1) // Low bound is greater than input range.
- return ReplaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder->getTrue());
if (LoOverflow == -1) // Low bound is less than input range.
- return ReplaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder->getFalse());
return new ICmpInst(Pred, X, LoBound);
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT:
if (HiOverflow == +1) // High bound greater than input range.
- return ReplaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder->getFalse());
if (HiOverflow == -1) // High bound less than input range.
- return ReplaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder->getTrue());
if (Pred == ICmpInst::ICMP_UGT)
return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
}
}
-/// FoldICmpShrCst - Handle "icmp(([al]shr X, cst1), cst2)".
+/// Handle "icmp(([al]shr X, cst1), cst2)".
Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
ConstantInt *ShAmt) {
const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue();
@@ -1077,7 +1403,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero.
bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
Constant *Cst = Builder->getInt1(IsICMP_NE);
- return ReplaceInstUsesWith(ICI, Cst);
+ return replaceInstUsesWith(ICI, Cst);
}
// Otherwise, check to see if the bits shifted out are known to be zero.
@@ -1098,7 +1424,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
return nullptr;
}
-/// FoldICmpCstShrCst - Handle "(icmp eq/ne (ashr/lshr const2, A), const1)" ->
+/// Handle "(icmp eq/ne (ashr/lshr const2, A), const1)" ->
/// (icmp eq/ne A, Log2(const2/const1)) ->
/// (icmp eq/ne A, Log2(const2) - Log2(const1)).
Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
@@ -1109,7 +1435,7 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
auto getConstant = [&I, this](bool IsTrue) {
if (I.getPredicate() == I.ICMP_NE)
IsTrue = !IsTrue;
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
+ return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
};
auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
@@ -1118,8 +1444,8 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
return new ICmpInst(Pred, LHS, RHS);
};
- APInt AP1 = CI1->getValue();
- APInt AP2 = CI2->getValue();
+ const APInt &AP1 = CI1->getValue();
+ const APInt &AP2 = CI2->getValue();
// Don't bother doing any work for cases which InstSimplify handles.
if (AP2 == 0)
@@ -1163,7 +1489,7 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
return getConstant(false);
}
-/// FoldICmpCstShlCst - Handle "(icmp eq/ne (shl const2, A), const1)" ->
+/// Handle "(icmp eq/ne (shl const2, A), const1)" ->
/// (icmp eq/ne A, TrailingZeros(const1) - TrailingZeros(const2)).
Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
ConstantInt *CI1,
@@ -1173,7 +1499,7 @@ Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
auto getConstant = [&I, this](bool IsTrue) {
if (I.getPredicate() == I.ICMP_NE)
IsTrue = !IsTrue;
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
+ return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
};
auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
@@ -1182,8 +1508,8 @@ Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
return new ICmpInst(Pred, LHS, RHS);
};
- APInt AP1 = CI1->getValue();
- APInt AP2 = CI2->getValue();
+ const APInt &AP1 = CI1->getValue();
+ const APInt &AP2 = CI2->getValue();
// Don't bother doing any work for cases which InstSimplify handles.
if (AP2 == 0)
@@ -1208,8 +1534,7 @@ Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
return getConstant(false);
}
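
The doc comment on FoldICmpCstShlCst above reduces "(icmp eq/ne (shl const2, A), const1)" to a comparison of A against a difference of trailing-zero counts. A minimal standalone sanity check of that arithmetic for one sample constant pair (the harness and the constants are illustrative; the general preconditions are enforced by the parts of the function elided from this hunk):

// Check: (C2 << A) == C1 iff A == ctz(C1) - ctz(C2), for C2 = 4 and C1 = 32.
#include <cassert>
#include <cstdint>

static unsigned ctz32(uint32_t V) {
  unsigned N = 0;
  while (!(V & 1)) { V >>= 1; ++N; }  // V is a nonzero power of two here
  return N;
}

int main() {
  const uint32_t C2 = 4, C1 = 32;
  const unsigned Expected = ctz32(C1) - ctz32(C2);  // 5 - 2 == 3
  for (unsigned A = 0; A < 32; ++A)
    assert(((C2 << A) == C1) == (A == Expected));
  return 0;
}
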
-/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
-///
+/// Handle "icmp (instr, intcst)".
Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Instruction *LHSI,
ConstantInt *RHS) {
@@ -1412,9 +1737,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// As a special case, check to see if this means that the
// result is always true or false now.
if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
- return ReplaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder->getFalse());
if (ICI.getPredicate() == ICmpInst::ICMP_NE)
- return ReplaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder->getTrue());
} else {
ICI.setOperand(1, NewCst);
Constant *NewAndCst;
@@ -1674,7 +1999,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if (Comp != RHS) {// Comparing against a bit that we know is zero.
bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
Constant *Cst = Builder->getInt1(IsICMP_NE);
- return ReplaceInstUsesWith(ICI, Cst);
+ return replaceInstUsesWith(ICI, Cst);
}
// If the shift is NUW, then it is just shifting out zeros, no need for an
@@ -1764,8 +2089,28 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
break;
}
- case Instruction::SDiv:
case Instruction::UDiv:
+ if (ConstantInt *DivLHS = dyn_cast<ConstantInt>(LHSI->getOperand(0))) {
+ Value *X = LHSI->getOperand(1);
+ const APInt &C1 = RHS->getValue();
+ const APInt &C2 = DivLHS->getValue();
+ assert(C2 != 0 && "udiv 0, X should have been simplified already.");
+ // (icmp ugt (udiv C2, X), C1) -> (icmp ule X, C2/(C1+1))
+ if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
+ assert(!C1.isMaxValue() &&
+ "icmp ugt X, UINT_MAX should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_ULE, X,
+ ConstantInt::get(X->getType(), C2.udiv(C1 + 1)));
+ }
+ // (icmp ult (udiv C2, X), C1) -> (icmp ugt X, C2/C1)
+ if (ICI.getPredicate() == ICmpInst::ICMP_ULT) {
+ assert(C1 != 0 && "icmp ult X, 0 should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), C2.udiv(C1)));
+ }
+ }
+ // fall-through
+ case Instruction::SDiv:
// Fold: icmp pred ([us]div X, C1), C2 -> range test
// Fold this div into the comparison, producing a range check.
// Determine, based on the divide type, what the range is being
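
A brute-force check of the two new udiv-by-variable folds added just above, using 8-bit values and the sample constants C2 = 200, C1 = 3 (which satisfy the asserted preconditions: C2 != 0, C1 != 0, C1 != UINT_MAX); the constants and the harness are illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C2 = 200, C1 = 3;
  for (unsigned X = 1; X <= 255; ++X) {  // X == 0 would be udiv by zero
    // (icmp ugt (udiv C2, X), C1) -> (icmp ule X, C2/(C1+1))
    assert((C2 / X > C1) == (X <= C2 / (C1 + 1)));
    // (icmp ult (udiv C2, X), C1) -> (icmp ugt X, C2/C1)
    assert((C2 / X < C1) == (X > C2 / C1));
  }
  return 0;
}
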
@@ -1895,27 +2240,30 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
break;
case Instruction::Xor:
- // For the xor case, we can xor two constants together, eliminating
- // the explicit xor.
- if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- ConstantExpr::getXor(RHS, BOC));
- } else if (RHSV == 0) {
- // Replace ((xor A, B) != 0) with (A != B)
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- BO->getOperand(1));
+ if (BO->hasOneUse()) {
+ if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ ConstantExpr::getXor(RHS, BOC));
+ } else if (RHSV == 0) {
+ // Replace ((xor A, B) != 0) with (A != B)
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ BO->getOperand(1));
+ }
}
break;
case Instruction::Sub:
- // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants.
- if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) {
- if (BO->hasOneUse())
+ if (BO->hasOneUse()) {
+ if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) {
+ // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants.
return new ICmpInst(ICI.getPredicate(), BO->getOperand(1),
- ConstantExpr::getSub(BOp0C, RHS));
- } else if (RHSV == 0) {
- // Replace ((sub A, B) != 0) with (A != B)
- return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
- BO->getOperand(1));
+ ConstantExpr::getSub(BOp0C, RHS));
+ } else if (RHSV == 0) {
+ // Replace ((sub A, B) != 0) with (A != B)
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ BO->getOperand(1));
+ }
}
break;
case Instruction::Or:
@@ -1924,7 +2272,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
Constant *NotCI = ConstantExpr::getNot(RHS);
if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
- return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
+ return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
+
+ // Comparing if all bits outside of a constant mask are set?
+ // Replace (X | C) == -1 with (X & ~C) == ~C.
+ // This removes the -1 constant.
+ if (BO->hasOneUse() && RHS->isAllOnesValue()) {
+ Constant *NotBOC = ConstantExpr::getNot(BOC);
+ Value *And = Builder->CreateAnd(BO->getOperand(0), NotBOC);
+ return new ICmpInst(ICI.getPredicate(), And, NotBOC);
+ }
}
break;
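
A brute-force check, over all 8-bit values, of the equality identities behind the Xor, Sub and Or cases above; the one-use tests in the patch are profitability constraints and do not affect these identities, and the constants below are arbitrary samples:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C = 0x5A, RHS = 0x21, C0 = 17, AllOnes = 0xFF;
  const uint8_t NotC = uint8_t(~C);
  for (unsigned VA = 0; VA < 256; ++VA) {
    const uint8_t A = uint8_t(VA);
    for (unsigned VB = 0; VB < 256; ++VB) {
      const uint8_t B = uint8_t(VB);
      assert((uint8_t(A ^ B) != 0) == (A != B));  // (xor A, B) != 0 <=> A != B
      assert((uint8_t(A - B) != 0) == (A != B));  // (sub A, B) != 0 <=> A != B
    }
    // (xor A, C) == RHS  <=>  A == (RHS xor C)
    assert((uint8_t(A ^ C) == RHS) == (A == uint8_t(RHS ^ C)));
    // (sub C0, A) == RHS  <=>  A == (C0 - RHS)   (modular arithmetic)
    assert((uint8_t(C0 - A) == RHS) == (A == uint8_t(C0 - RHS)));
    // (or A, C) == -1  <=>  (A & ~C) == ~C       (the new fold above)
    assert((uint8_t(A | C) == AllOnes) == (uint8_t(A & NotC) == NotC));
  }
  return 0;
}
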
@@ -1933,7 +2290,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// If bits are being compared against that are and'd out, then the
// comparison can never succeed!
if ((RHSV & ~BOC->getValue()) != 0)
- return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
+ return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
// If we have ((X & C) == C), turn it into ((X & C) != 0).
if (RHS == BOC && RHSV.isPowerOf2())
@@ -2013,11 +2370,10 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
return nullptr;
}
-/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst).
-/// We only handle extending casts so far.
-///
-Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
- const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
+/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so
+/// far.
+Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICmp) {
+ const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0));
Value *LHSCIOp = LHSCI->getOperand(0);
Type *SrcTy = LHSCIOp->getType();
Type *DestTy = LHSCI->getType();
@@ -2028,7 +2384,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
if (LHSCI->getOpcode() == Instruction::PtrToInt &&
DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
Value *RHSOp = nullptr;
- if (PtrToIntOperator *RHSC = dyn_cast<PtrToIntOperator>(ICI.getOperand(1))) {
+ if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
Value *RHSCIOp = RHSC->getOperand(0);
if (RHSCIOp->getType()->getPointerAddressSpace() ==
LHSCIOp->getType()->getPointerAddressSpace()) {
@@ -2037,11 +2393,12 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
if (LHSCIOp->getType() != RHSOp->getType())
RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType());
}
- } else if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1)))
+ } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+ }
if (RHSOp)
- return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp);
}
// The code below only handles extension cast instructions, so far.
@@ -2051,9 +2408,9 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
return nullptr;
bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
- bool isSignedCmp = ICI.isSigned();
+ bool isSignedCmp = ICmp.isSigned();
- if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
+ if (auto *CI = dyn_cast<CastInst>(ICmp.getOperand(1))) {
// Not an extension from the same type?
RHSCIOp = CI->getOperand(0);
if (RHSCIOp->getType() != LHSCIOp->getType())
@@ -2065,50 +2422,51 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
return nullptr;
// Deal with equality cases early.
- if (ICI.isEquality())
- return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);
// A signed comparison of sign extended values simplifies into a
// signed comparison.
if (isSignedCmp && isSignedExt)
- return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);
// The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
+ return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
}
- // If we aren't dealing with a constant on the RHS, exit early
- ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1));
- if (!CI)
+ // If we aren't dealing with a constant on the RHS, exit early.
+ auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
+ if (!C)
return nullptr;
// Compute the constant that would happen if we truncated to SrcTy then
- // reextended to DestTy.
- Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy);
- Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(),
- Res1, DestTy);
+ // re-extended to DestTy.
+ Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);
// If the re-extended constant didn't change...
- if (Res2 == CI) {
+ if (Res2 == C) {
// Deal with equality cases early.
- if (ICI.isEquality())
- return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);
// A signed comparison of sign extended values simplifies into a
// signed comparison.
if (isSignedExt && isSignedCmp)
- return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);
// The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, Res1);
+ return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1);
}
- // The re-extended constant changed so the constant cannot be represented
- // in the shorter type. Consequently, we cannot emit a simple comparison.
+ // The re-extended constant changed, partly changed (in the case of a vector),
+ // or could not be determined to be equal (in the case of a constant
+ // expression), so the constant cannot be represented in the shorter type.
+ // Consequently, we cannot emit a simple comparison.
// All the cases that fold to true or false will have already been handled
// by SimplifyICmpInst, so only deal with the tricky case.
- if (isSignedCmp || !isSignedExt)
+ if (isSignedCmp || !isSignedExt || !isa<ConstantInt>(C))
return nullptr;
// Evaluate the comparison for LT (we invert for GT below). LE and GE cases
@@ -2117,17 +2475,17 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
// We're performing an unsigned comp with a sign extended value.
// This is true if the input is >= 0. [aka >s -1]
Constant *NegOne = Constant::getAllOnesValue(SrcTy);
- Value *Result = Builder->CreateICmpSGT(LHSCIOp, NegOne, ICI.getName());
+ Value *Result = Builder->CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName());
// Finally, return the value computed.
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT)
- return ReplaceInstUsesWith(ICI, Result);
+ if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
+ return replaceInstUsesWith(ICmp, Result);
- assert(ICI.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
+ assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
return BinaryOperator::CreateNot(Result);
}
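
The tail of visitICmpInstWithCastAndCast handles an unsigned compare of a sign-extended value against a constant that does not survive the trunc/re-extend round trip; the result then depends only on the sign of the narrow value. A brute-force check with i8/i32 widths and the sample constant 1000 (both choices are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 1000;  // trunc to i8 and sext back yields 0xFFFFFFE8 != C
  for (int V = -128; V <= 127; ++V) {
    const int8_t Narrow = int8_t(V);
    const uint32_t Extended = uint32_t(int32_t(Narrow));  // sext i8 -> i32
    const bool UnsignedLT = Extended < C;                 // icmp ult (sext x), C
    const bool NonNegative = Narrow > -1;                 // icmp sgt x, -1
    assert(UnsignedLT == NonNegative);
  }
  return 0;
}
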
-/// ProcessUGT_ADDCST_ADD - The caller has matched a pattern of the form:
+/// The caller has matched a pattern of the form:
/// I = icmp ugt (add (add A, B), CI2), CI1
/// If this is of the form:
/// sum = a + b
@@ -2207,7 +2565,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// The inner add was the result of the narrow add, zero extended to the
// wider type. Replace it with the result computed by the intrinsic.
- IC.ReplaceInstUsesWith(*OrigAdd, ZExt);
+ IC.replaceInstUsesWith(*OrigAdd, ZExt);
// The original icmp gets replaced with the overflow value.
return ExtractValueInst::Create(Call, 1, "sadd.overflow");
@@ -2491,7 +2849,7 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
continue;
if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
- IC.ReplaceInstUsesWith(*TI, Mul);
+ IC.replaceInstUsesWith(*TI, Mul);
else
TI->setOperand(0, Mul);
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
@@ -2503,7 +2861,7 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Instruction *Zext =
cast<Instruction>(Builder->CreateZExt(ShortAnd, BO->getType()));
IC.Worklist.Add(Zext);
- IC.ReplaceInstUsesWith(*BO, Zext);
+ IC.replaceInstUsesWith(*BO, Zext);
} else {
llvm_unreachable("Unexpected Binary operation");
}
@@ -2545,9 +2903,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
return ExtractValueInst::Create(Call, 1);
}
-// DemandedBitsLHSMask - When performing a comparison against a constant,
-// it is possible that not all the bits in the LHS are demanded. This helper
-// method computes the mask that IS demanded.
+/// When performing a comparison against a constant, it is possible that not all
+/// the bits in the LHS are demanded. This helper method computes the mask that
+/// IS demanded.
static APInt DemandedBitsLHSMask(ICmpInst &I,
unsigned BitWidth, bool isSignCheck) {
if (isSignCheck)
@@ -2656,9 +3014,7 @@ bool InstCombiner::dominatesAllUses(const Instruction *DI,
return true;
}
-///
-/// true when the instruction sequence within a block is select-cmp-br.
-///
+/// Return true when the instruction sequence within a block is select-cmp-br.
static bool isChainSelectCmpBranch(const SelectInst *SI) {
const BasicBlock *BB = SI->getParent();
if (!BB)
@@ -2672,7 +3028,6 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
return true;
}
-///
/// \brief True when a select result is replaced by one of its operands
/// in select-icmp sequence. This will eventually result in the elimination
/// of the select.
@@ -2738,6 +3093,63 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
return false;
}
+/// If we have an icmp le or icmp ge instruction with a constant operand, turn
+/// it into the appropriate icmp lt or icmp gt instruction. This transform
+/// allows them to be folded in visitICmpInst.
+static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
+ ICmpInst::Predicate Pred = I.getPredicate();
+ if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE)
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (!Op1C)
+ return nullptr;
+
+ // Check if the constant operand can be safely incremented/decremented without
+ // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled
+ // the edge cases for us, so we just assert on them. For vectors, we must
+ // handle the edge cases.
+ Type *Op1Type = Op1->getType();
+ bool IsSigned = I.isSigned();
+ bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE);
+ auto *CI = dyn_cast<ConstantInt>(Op1C);
+ if (CI) {
+ // A <= MAX -> TRUE ; A >= MIN -> TRUE
+ assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned));
+ } else if (Op1Type->isVectorTy()) {
+ // TODO? If the edge cases for vectors were guaranteed to be handled as they
+ // are for scalar, we could remove the min/max checks. However, to do that,
+ // we would have to use insertelement/shufflevector to replace edge values.
+ unsigned NumElts = Op1Type->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = Op1C->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+ // Bail out if we can't determine if this constant is min/max or if we
+ // know that this constant is min/max.
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || (IsLE ? CI->isMaxValue(IsSigned) : CI->isMinValue(IsSigned)))
+ return nullptr;
+ }
+ } else {
+ // ConstantExpr?
+ return nullptr;
+ }
+
+ // Increment or decrement the constant and set the new comparison predicate:
+ // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT
+ Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true);
+ CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGT;
+ NewPred = IsSigned ? ICmpInst::getSignedPredicate(NewPred) : NewPred;
+ return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
+}
+
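
A brute-force check of the canonicalization above for sample non-edge constants: "<= C" becomes "< C+1" and ">= C" becomes "> C-1", in both the unsigned and signed domains (the edge constants are excluded, matching the function's asserts and vector checks):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t UC = 42;  // neither 0 nor UINT8_MAX, so +1/-1 cannot wrap
  const int8_t SC = -5;   // neither INT8_MIN nor INT8_MAX
  for (int V = 0; V < 256; ++V) {
    const uint8_t U = uint8_t(V);
    const int8_t S = int8_t(V);
    assert((U <= UC) == (U < uint8_t(UC + 1)));  // ULE -> ULT
    assert((U >= UC) == (U > uint8_t(UC - 1)));  // UGE -> UGT
    assert((S <= SC) == (S < int8_t(SC + 1)));   // SLE -> SLT
    assert((S >= SC) == (S > int8_t(SC - 1)));   // SGE -> SGT
  }
  return 0;
}
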
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2748,8 +3160,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
if (Op0Cplxity < Op1Cplxity ||
- (Op0Cplxity == Op1Cplxity &&
- swapMayExposeCSEOpportunities(Op0, Op1))) {
+ (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
I.swapOperands();
std::swap(Op0, Op1);
Changed = true;
@@ -2757,12 +3168,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Value *V =
SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC, &I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// comparing -val or val with non-zero is the same as just comparing val
// ie, abs(val) != 0 -> val != 0
- if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero()))
- {
+ if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
Value *Cond, *SelectTrue, *SelectFalse;
if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
m_Value(SelectFalse)))) {
@@ -2780,47 +3190,50 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Type *Ty = Op0->getType();
// icmp's with boolean values can always be turned into bitwise operations
- if (Ty->isIntegerTy(1)) {
+ if (Ty->getScalarType()->isIntegerTy(1)) {
switch (I.getPredicate()) {
default: llvm_unreachable("Invalid icmp instruction!");
- case ICmpInst::ICMP_EQ: { // icmp eq i1 A, B -> ~(A^B)
- Value *Xor = Builder->CreateXor(Op0, Op1, I.getName()+"tmp");
+ case ICmpInst::ICMP_EQ: { // icmp eq i1 A, B -> ~(A^B)
+ Value *Xor = Builder->CreateXor(Op0, Op1, I.getName() + "tmp");
return BinaryOperator::CreateNot(Xor);
}
- case ICmpInst::ICMP_NE: // icmp eq i1 A, B -> A^B
+ case ICmpInst::ICMP_NE: // icmp ne i1 A, B -> A^B
return BinaryOperator::CreateXor(Op0, Op1);
case ICmpInst::ICMP_UGT:
std::swap(Op0, Op1); // Change icmp ugt -> icmp ult
// FALL THROUGH
- case ICmpInst::ICMP_ULT:{ // icmp ult i1 A, B -> ~A & B
- Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
+ case ICmpInst::ICMP_ULT:{ // icmp ult i1 A, B -> ~A & B
+ Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
return BinaryOperator::CreateAnd(Not, Op1);
}
case ICmpInst::ICMP_SGT:
std::swap(Op0, Op1); // Change icmp sgt -> icmp slt
// FALL THROUGH
case ICmpInst::ICMP_SLT: { // icmp slt i1 A, B -> A & ~B
- Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
+ Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
return BinaryOperator::CreateAnd(Not, Op0);
}
case ICmpInst::ICMP_UGE:
std::swap(Op0, Op1); // Change icmp uge -> icmp ule
// FALL THROUGH
- case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
- Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
+ case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
+ Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
return BinaryOperator::CreateOr(Not, Op1);
}
case ICmpInst::ICMP_SGE:
std::swap(Op0, Op1); // Change icmp sge -> icmp sle
// FALL THROUGH
- case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
- Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
+ case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
+ Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
return BinaryOperator::CreateOr(Not, Op0);
}
}
}
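
A truth-table check of the i1 icmp-to-bitwise rewrites above; the signed view of an i1 treats true as -1, which is why the signed orderings mirror the unsigned ones (scalar case only; the patch also admits vectors of i1 via getScalarType):

#include <cassert>

int main() {
  for (int AI = 0; AI < 2; ++AI) {
    for (int BI = 0; BI < 2; ++BI) {
      const bool A = AI, B = BI;
      const int SA = A ? -1 : 0, SB = B ? -1 : 0;  // signed value of an i1
      assert((A == B) == !(A ^ B));                // eq  -> ~(A ^ B)
      assert((A != B) == ((A ^ B) != 0));          // ne  ->  A ^ B
      assert((A <  B) == (!A && B));               // ult -> ~A & B
      assert((SA < SB) == (A && !B));              // slt ->  A & ~B
      assert((A <= B) == (!A || B));               // ule -> ~A | B
      assert((SA <= SB) == (A || !B));             // sle ->  A | ~B
    }
  }
  return 0;
}
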
+ if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
+ return NewICmp;
+
unsigned BitWidth = 0;
if (Ty->isIntOrIntVectorTy())
BitWidth = Ty->getScalarSizeInBits();
@@ -2853,6 +3266,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return Res;
}
+ // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
+ if (CI->isZero() && I.getPredicate() == ICmpInst::ICMP_SGT)
+ if (auto *SI = dyn_cast<SelectInst>(Op0)) {
+ SelectPatternResult SPR = matchSelectPattern(SI, A, B);
+ if (SPR.Flavor == SPF_SMIN) {
+ if (isKnownPositive(A, DL))
+ return new ICmpInst(I.getPredicate(), B, CI);
+ if (isKnownPositive(B, DL))
+ return new ICmpInst(I.getPredicate(), A, CI);
+ }
+ }
+
+
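
A brute-force check of the new smin fold above: when one operand is known positive, the sign of smin(A, B) is decided entirely by the other operand (the value ranges below are arbitrary samples):

#include <algorithm>
#include <cassert>

int main() {
  for (int A = 1; A <= 64; ++A) {  // A is the known-positive operand
    for (int B = -64; B <= 64; ++B)
      assert((std::min(A, B) > 0) == (B > 0));  // (smin(PosA, B) >s 0) <=> (B >s 0)
  }
  return 0;
}
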
// The following transforms are only 'worth it' if the only user of the
// subtraction is the icmp.
if (Op0->hasOneUse()) {
@@ -2882,30 +3308,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(ICmpInst::ICMP_SLE, A, B);
}
- // If we have an icmp le or icmp ge instruction, turn it into the
- // appropriate icmp lt or icmp gt instruction. This allows us to rely on
- // them being folded in the code below. The SimplifyICmpInst code has
- // already handled the edge cases for us, so we just assert on them.
- switch (I.getPredicate()) {
- default: break;
- case ICmpInst::ICMP_ULE:
- assert(!CI->isMaxValue(false)); // A <=u MAX -> TRUE
- return new ICmpInst(ICmpInst::ICMP_ULT, Op0,
- Builder->getInt(CI->getValue()+1));
- case ICmpInst::ICMP_SLE:
- assert(!CI->isMaxValue(true)); // A <=s MAX -> TRUE
- return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
- Builder->getInt(CI->getValue()+1));
- case ICmpInst::ICMP_UGE:
- assert(!CI->isMinValue(false)); // A >=u MIN -> TRUE
- return new ICmpInst(ICmpInst::ICMP_UGT, Op0,
- Builder->getInt(CI->getValue()-1));
- case ICmpInst::ICMP_SGE:
- assert(!CI->isMinValue(true)); // A >=s MIN -> TRUE
- return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
- Builder->getInt(CI->getValue()-1));
- }
-
if (I.isEquality()) {
ConstantInt *CI2;
if (match(Op0, m_AShr(m_ConstantInt(CI2), m_Value(A))) ||
@@ -2925,6 +3327,42 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// bits, if it is a sign bit comparison, it only demands the sign bit.
bool UnusedBit;
isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
+
+ // Canonicalize icmp instructions based on dominating conditions.
+ BasicBlock *Parent = I.getParent();
+ BasicBlock *Dom = Parent->getSinglePredecessor();
+ auto *BI = Dom ? dyn_cast<BranchInst>(Dom->getTerminator()) : nullptr;
+ ICmpInst::Predicate Pred;
+ BasicBlock *TrueBB, *FalseBB;
+ ConstantInt *CI2;
+ if (BI && match(BI, m_Br(m_ICmp(Pred, m_Specific(Op0), m_ConstantInt(CI2)),
+ TrueBB, FalseBB)) &&
+ TrueBB != FalseBB) {
+ ConstantRange CR = ConstantRange::makeAllowedICmpRegion(I.getPredicate(),
+ CI->getValue());
+ ConstantRange DominatingCR =
+ (Parent == TrueBB)
+ ? ConstantRange::makeExactICmpRegion(Pred, CI2->getValue())
+ : ConstantRange::makeExactICmpRegion(
+ CmpInst::getInversePredicate(Pred), CI2->getValue());
+ ConstantRange Intersection = DominatingCR.intersectWith(CR);
+ ConstantRange Difference = DominatingCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return replaceInstUsesWith(I, Builder->getFalse());
+ if (Difference.isEmptySet())
+ return replaceInstUsesWith(I, Builder->getTrue());
+ // Canonicalizing a sign bit comparison that gets used in a branch
+ // pessimizes codegen by generating a branch-on-zero instruction instead
+ // of a test-and-branch, so we avoid canonicalizing in such situations
+ // because the test-and-branch instruction has better branch displacement
+ // than the compare-and-branch instruction.
+ if (!isBranchOnSignBitCheck(I, isSignBit) && !I.isEquality()) {
+ if (auto *AI = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Builder->getInt(*AI));
+ if (auto *AD = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Builder->getInt(*AD));
+ }
+ }
}
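
A brute-force illustration of the new dominating-condition canonicalization above: on the true edge of "br (icmp ult X, 2)" the value is confined to {0, 1}, so a later "icmp ugt X, 0" collapses to an equality with the single element of the intersected range, while comparisons whose intersection or difference with that range is empty become constants (the specific constants are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V < 256; ++V) {
    const uint8_t X = uint8_t(V);
    if (!(X < 2))
      continue;                    // keep only values reaching the true edge
    assert((X > 0) == (X == 1));   // single-element intersection -> icmp eq
    assert(X < 20);                // empty difference   -> fold to true
    assert(!(X > 50));             // empty intersection -> fold to false
  }
  return 0;
}
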
// See if we can fold the comparison based on range information we can get
@@ -2975,7 +3413,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
default: llvm_unreachable("Unknown icmp opcode!");
case ICmpInst::ICMP_EQ: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
// If all bits are known zero except for one, then we know at most one
// bit is set. If the comparison is against zero, then this is a check
@@ -3019,7 +3457,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
case ICmpInst::ICMP_NE: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
// If all bits are known zero except for one, then we know at most one
// bit is set. If the comparison is against zero, then this is a check
@@ -3063,9 +3501,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
case ICmpInst::ICMP_ULT:
if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
@@ -3081,9 +3519,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
break;
case ICmpInst::ICMP_UGT:
if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
@@ -3100,9 +3538,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
break;
case ICmpInst::ICMP_SLT:
if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
@@ -3113,9 +3551,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
break;
case ICmpInst::ICMP_SGT:
if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
@@ -3128,30 +3566,30 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
case ICmpInst::ICMP_SGE:
assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
break;
case ICmpInst::ICMP_SLE:
assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
break;
case ICmpInst::ICMP_UGE:
assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
break;
case ICmpInst::ICMP_ULE:
assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
break;
}
@@ -3179,12 +3617,22 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// See if we are doing a comparison between a constant and an instruction that
// can be folded into the comparison.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ Value *A = nullptr, *B = nullptr;
// Since the RHS is a ConstantInt (CI), if the left hand side is an
// instruction, see if that instruction also has constants so that the
// instruction can be folded into the icmp
if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
return Res;
+
+ // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
+ if (I.isEquality() && CI->isZero() &&
+ match(Op0, m_UDiv(m_Value(A), m_Value(B)))) {
+ ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_EQ
+ ? ICmpInst::ICMP_UGT
+ : ICmpInst::ICMP_ULE;
+ return new ICmpInst(Pred, B, A);
+ }
}
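
A brute-force check of the new "(icmp eq/ne (udiv A, B), 0)" fold above: for unsigned operands with a nonzero divisor, the quotient is zero exactly when the divisor is larger than the dividend:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 1; B < 256; ++B) {  // B == 0 would be udiv by zero
      assert((A / B == 0) == (B > A));    // eq 0 -> icmp ugt B, A
      assert((A / B != 0) == (B <= A));   // ne 0 -> icmp ule B, A
    }
  }
  return 0;
}
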
// Handle icmp with constant (but not simple integer constant) RHS
@@ -3354,10 +3802,14 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// Analyze the case when either Op0 or Op1 is an add instruction.
// Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Add)
- A = BO0->getOperand(0), B = BO0->getOperand(1);
- if (BO1 && BO1->getOpcode() == Instruction::Add)
- C = BO1->getOperand(0), D = BO1->getOperand(1);
+ if (BO0 && BO0->getOpcode() == Instruction::Add) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Add) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
// icmp (X+cst) < 0 --> X < -cst
if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
@@ -3474,11 +3926,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// Analyze the case when either Op0 or Op1 is a sub instruction.
// Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
- A = nullptr; B = nullptr; C = nullptr; D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Sub)
- A = BO0->getOperand(0), B = BO0->getOperand(1);
- if (BO1 && BO1->getOpcode() == Instruction::Sub)
- C = BO1->getOperand(0), D = BO1->getOperand(1);
+ A = nullptr;
+ B = nullptr;
+ C = nullptr;
+ D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
// icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
if (A == Op1 && NoOp0WrapProblem)
@@ -3525,9 +3984,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
default: break;
case ICmpInst::ICMP_EQ:
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
case ICmpInst::ICMP_NE:
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
@@ -3654,8 +4113,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Constant *Overflow;
if (OptimizeOverflowCheck(OCF_UNSIGNED_ADD, A, B, *AddI, Result,
Overflow)) {
- ReplaceInstUsesWith(*AddI, Result);
- return ReplaceInstUsesWith(I, Overflow);
+ replaceInstUsesWith(*AddI, Result);
+ return replaceInstUsesWith(I, Overflow);
}
}
@@ -3834,7 +4293,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return Changed ? &I : nullptr;
}
-/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
+/// Fold fcmp ([us]itofp x, cst) if possible.
Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
Instruction *LHSI,
Constant *RHSC) {
@@ -3864,10 +4323,10 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getFalse());
assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
- return ReplaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getTrue());
}
}
@@ -3933,9 +4392,9 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
Pred = ICmpInst::ICMP_NE;
break;
case FCmpInst::FCMP_ORD:
- return ReplaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getTrue());
case FCmpInst::FCMP_UNO:
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getFalse());
}
// Now we know that the APFloat is a normal number, zero or inf.
@@ -3953,8 +4412,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
Pred == ICmpInst::ICMP_SLE)
- return ReplaceInstUsesWith(I, Builder->getTrue());
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getFalse());
}
} else {
// If the RHS value is > UnsignedMax, fold the comparison. This handles
@@ -3965,8 +4424,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
Pred == ICmpInst::ICMP_ULE)
- return ReplaceInstUsesWith(I, Builder->getTrue());
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getFalse());
}
}
@@ -3978,8 +4437,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
Pred == ICmpInst::ICMP_SGE)
- return ReplaceInstUsesWith(I, Builder->getTrue());
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getFalse());
}
} else {
// See if the RHS value is < UnsignedMin.
@@ -3989,8 +4448,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
Pred == ICmpInst::ICMP_UGE)
- return ReplaceInstUsesWith(I, Builder->getTrue());
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getFalse());
}
}
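
The remaining cases of FoldFCmp_IntToFP_Cst, in the hunk that follows, rest on the observation that comparing an exactly representable integer, once converted to floating point, against a non-integral constant is equivalent to an integer comparison against a suitably rounded bound. A spot check over the i8 range with the sample constant 4.4:

#include <cassert>

int main() {
  for (int I = -128; I <= 127; ++I) {  // every i8 value converts exactly
    const double D = double(I);
    assert((D <= 4.4) == (I <= 4));    // (float)int <= 4.4 --> int <= 4
    assert((D <  4.4) == (I <  5));    // (float)int <  4.4 --> int <= 4
    assert((D >  4.4) == (I >  4));    // (float)int >  4.4 --> int >  4
    assert((D >= -4.4) == (I >= -4));  // (float)int >= -4.4 --> int >= -4
  }
  return 0;
}
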
@@ -4012,14 +4471,14 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
switch (Pred) {
default: llvm_unreachable("Unexpected integer comparison!");
case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
- return ReplaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getTrue());
case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getFalse());
case ICmpInst::ICMP_ULE:
// (float)int <= 4.4 --> int <= 4
// (float)int <= -4.4 --> false
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getFalse());
break;
case ICmpInst::ICMP_SLE:
// (float)int <= 4.4 --> int <= 4
@@ -4031,7 +4490,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
// (float)int < -4.4 --> false
// (float)int < 4.4 --> int <= 4
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder->getFalse());
Pred = ICmpInst::ICMP_ULE;
break;
case ICmpInst::ICMP_SLT:
@@ -4044,7 +4503,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
// (float)int > 4.4 --> int > 4
// (float)int > -4.4 --> true
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getTrue());
break;
case ICmpInst::ICMP_SGT:
// (float)int > 4.4 --> int > 4
@@ -4056,7 +4515,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
// (float)int >= -4.4 --> true
// (float)int >= 4.4 --> int > 4
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder->getTrue());
Pred = ICmpInst::ICMP_UGT;
break;
case ICmpInst::ICMP_SGE:
@@ -4089,7 +4548,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1,
I.getFastMathFlags(), DL, TLI, DT, AC, &I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
if (Op0 == Op1) {
@@ -4208,39 +4667,33 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
break;
CallInst *CI = cast<CallInst>(LHSI);
- const Function *F = CI->getCalledFunction();
- if (!F)
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
+ if (IID != Intrinsic::fabs)
break;
// Various optimization for fabs compared with zero.
- LibFunc::Func Func;
- if (F->getIntrinsicID() == Intrinsic::fabs ||
- (TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- (Func == LibFunc::fabs || Func == LibFunc::fabsf ||
- Func == LibFunc::fabsl))) {
- switch (I.getPredicate()) {
- default:
- break;
- // fabs(x) < 0 --> false
- case FCmpInst::FCMP_OLT:
- return ReplaceInstUsesWith(I, Builder->getFalse());
- // fabs(x) > 0 --> x != 0
- case FCmpInst::FCMP_OGT:
- return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
- // fabs(x) <= 0 --> x == 0
- case FCmpInst::FCMP_OLE:
- return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
- // fabs(x) >= 0 --> !isnan(x)
- case FCmpInst::FCMP_OGE:
- return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
- // fabs(x) == 0 --> x == 0
- // fabs(x) != 0 --> x != 0
- case FCmpInst::FCMP_OEQ:
- case FCmpInst::FCMP_UEQ:
- case FCmpInst::FCMP_ONE:
- case FCmpInst::FCMP_UNE:
- return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), RHSC);
- }
+ switch (I.getPredicate()) {
+ default:
+ break;
+ // fabs(x) < 0 --> false
+ case FCmpInst::FCMP_OLT:
+ llvm_unreachable("handled by SimplifyFCmpInst");
+ // fabs(x) > 0 --> x != 0
+ case FCmpInst::FCMP_OGT:
+ return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
+ // fabs(x) <= 0 --> x == 0
+ case FCmpInst::FCMP_OLE:
+ return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
+ // fabs(x) >= 0 --> !isnan(x)
+ case FCmpInst::FCMP_OGE:
+ return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
+ // fabs(x) == 0 --> x == 0
+ // fabs(x) != 0 --> x != 0
+ case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_UNE:
+ return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), RHSC);
}
}
}
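
A spot check of the fabs-versus-zero rewrites kept above, written with C++'s ordered comparisons (which, like the O* predicates, are false whenever a NaN is involved); the sample values are illustrative:

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double Samples[] = {0.0, -0.0, 1.5, -1.5,
                            std::numeric_limits<double>::infinity(),
                            -std::numeric_limits<double>::infinity(),
                            std::numeric_limits<double>::quiet_NaN()};
  for (double X : Samples) {
    const double F = std::fabs(X);
    assert((F >  0.0) == (!std::isnan(X) && X != 0.0));  // fabs(x) > 0  --> x != 0 (ordered)
    assert((F <= 0.0) == (X == 0.0));                    // fabs(x) <= 0 --> x == 0
    assert((F >= 0.0) == !std::isnan(X));                // fabs(x) >= 0 --> !isnan(x)
    assert((F == 0.0) == (X == 0.0));                    // fabs(x) == 0 --> x == 0
    assert((F != 0.0) == (X != 0.0));                    // fabs(x) != 0 --> x != 0
  }
  return 0;
}
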
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index e4e506509d39..aa421ff594fb 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -138,7 +138,7 @@ IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
/// \brief An IRBuilder inserter that adds new instructions to the instcombine
/// worklist.
class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
- : public IRBuilderDefaultInserter<true> {
+ : public IRBuilderDefaultInserter {
InstCombineWorklist &Worklist;
AssumptionCache *AC;
@@ -148,7 +148,7 @@ public:
void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
BasicBlock::iterator InsertPt) const {
- IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
+ IRBuilderDefaultInserter::InsertHelper(I, Name, BB, InsertPt);
Worklist.Add(I);
using namespace llvm::PatternMatch;
@@ -171,12 +171,14 @@ public:
/// \brief An IRBuilder that automatically inserts new instructions into the
/// worklist.
- typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
+ typedef IRBuilder<TargetFolder, InstCombineIRInserter> BuilderTy;
BuilderTy *Builder;
private:
// Mode in which we are running the combiner.
const bool MinimizeSize;
+ /// Enable combines that trigger rarely but are costly in compile time.
+ const bool ExpensiveCombines;
AliasAnalysis *AA;
@@ -195,11 +197,12 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
- bool MinimizeSize, AliasAnalysis *AA,
+ bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
AssumptionCache *AC, TargetLibraryInfo *TLI,
DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
- AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
+ ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
+ DL(DL), LI(LI), MadeIRChange(false) {}
/// \brief Run the combiner over the entire worklist until it is empty.
///
@@ -327,6 +330,8 @@ public:
Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
Instruction *visitExtractValueInst(ExtractValueInst &EV);
Instruction *visitLandingPadInst(LandingPadInst &LI);
+ Instruction *visitVAStartInst(VAStartInst &I);
+ Instruction *visitVACopyInst(VACopyInst &I);
// visitInstruction - Specify what to return for unhandled instructions...
Instruction *visitInstruction(Instruction &I) { return nullptr; }
@@ -390,6 +395,7 @@ private:
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
+ Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
public:
/// \brief Inserts an instruction \p New before instruction \p Old
@@ -417,7 +423,7 @@ public:
/// replaceable with another preexisting expression. Here we add all uses of
/// I to the worklist, replace all uses of I with the new value, then return
/// I, so that the inst combiner will know that I was modified.
- Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+ Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
// If there are no uses to replace, then we return nullptr to indicate that
// no changes were made to the program.
if (I.use_empty()) return nullptr;
@@ -451,16 +457,16 @@ public:
/// When dealing with an instruction that has side effects or produces a void
/// value, we can't rely on DCE to delete the instruction. Instead, visit
/// methods should return the value returned by this function.
- Instruction *EraseInstFromFunction(Instruction &I) {
+ Instruction *eraseInstFromFunction(Instruction &I) {
DEBUG(dbgs() << "IC: ERASE " << I << '\n');
assert(I.use_empty() && "Cannot erase instruction that is used!");
// Make sure that we reprocess all operands now that we reduced their
// use counts.
if (I.getNumOperands() < 8) {
- for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
- if (Instruction *Op = dyn_cast<Instruction>(*i))
- Worklist.Add(Op);
+ for (Use &Operand : I.operands())
+ if (auto *Inst = dyn_cast<Instruction>(Operand))
+ Worklist.Add(Inst);
}
Worklist.Remove(&I);
I.eraseFromParent();
@@ -515,12 +521,12 @@ private:
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth,
Instruction *CxtI);
- bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero,
+ bool SimplifyDemandedBits(Use &U, const APInt &DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth = 0);
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
/// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
- APInt DemandedMask, APInt &KnownZero,
+ const APInt &DemandedMask, APInt &KnownZero,
APInt &KnownOne);
/// \brief Tries to simplify operands to an integer instruction based on its
@@ -556,7 +562,7 @@ private:
Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned,
bool Inside);
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
- Instruction *MatchBSwapOrBitReverse(BinaryOperator &I);
+ Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
Instruction *SimplifyMemSet(MemSetInst *MI);
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index dd2889de405e..d312983ed51b 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -205,11 +205,11 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
// Now make everything use the getelementptr instead of the original
// allocation.
- return IC.ReplaceInstUsesWith(AI, GEP);
+ return IC.replaceInstUsesWith(AI, GEP);
}
if (isa<UndefValue>(AI.getArraySize()))
- return IC.ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+ return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
@@ -271,7 +271,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
EntryAI->setAlignment(MaxAlign);
if (AI.getType() != EntryAI->getType())
return new BitCastInst(EntryAI, AI.getType());
- return ReplaceInstUsesWith(AI, EntryAI);
+ return replaceInstUsesWith(AI, EntryAI);
}
}
}
@@ -291,12 +291,12 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
- EraseInstFromFunction(*ToDelete[i]);
+ eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
Constant *Cast
= ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
- Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
- EraseInstFromFunction(*Copy);
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
++NumGlobalCopies;
return NewI;
}
@@ -326,7 +326,8 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
LoadInst *NewLoad = IC.Builder->CreateAlignedLoad(
IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
- LI.getAlignment(), LI.getName() + Suffix);
+ LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
MDBuilder MDB(NewLoad->getContext());
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
@@ -398,7 +399,8 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
StoreInst *NewStore = IC.Builder->CreateAlignedStore(
V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
- SI.getAlignment());
+ SI.getAlignment(), SI.isVolatile());
+ NewStore->setAtomic(SI.getOrdering(), SI.getSynchScope());
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
MDNode *N = MDPair.second;
@@ -438,7 +440,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
return NewStore;
}
-/// \brief Combine loads to match the type of value their uses after looking
+/// \brief Combine loads to match the type of their uses' value after looking
/// through intervening bitcasts.
///
/// The core idea here is that if the result of a load is used in an operation,
@@ -456,9 +458,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
/// later. However, it is risky in case some backend or other part of LLVM is
/// relying on the exact type loaded to select appropriate atomic operations.
static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // loads here but it isn't clear that this is important.
- if (!LI.isSimple())
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic loads here but it isn't clear that this is important.
+ if (!LI.isUnordered())
return nullptr;
if (LI.use_empty())
@@ -486,7 +488,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
auto *SI = cast<StoreInst>(*UI++);
IC.Builder->SetInsertPoint(SI);
combineStoreToNewValue(IC, *SI, NewLoad);
- IC.EraseInstFromFunction(*SI);
+ IC.eraseInstFromFunction(*SI);
}
assert(LI.use_empty() && "Failed to remove all users of the load!");
// Return the old load so the combiner can delete it safely.
@@ -503,7 +505,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
if (CI->isNoopCast(DL)) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
CI->replaceAllUsesWith(NewLoad);
- IC.EraseInstFromFunction(*CI);
+ IC.eraseInstFromFunction(*CI);
return &LI;
}
}
@@ -523,16 +525,17 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
if (!T->isAggregateType())
return nullptr;
+ StringRef Name = LI.getName();
assert(LI.getAlignment() && "Alignment must be set at this point");
if (auto *ST = dyn_cast<StructType>(T)) {
// If the struct only has one element, we unpack.
- unsigned Count = ST->getNumElements();
- if (Count == 1) {
+ auto NumElements = ST->getNumElements();
+ if (NumElements == 1) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
".unpack");
- return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, LI.getName()));
+ return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
}
// We don't want to break loads with padding here as we'd lose
@@ -542,38 +545,67 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
if (SL->hasPadding())
return nullptr;
- auto Name = LI.getName();
- SmallString<16> LoadName = Name;
- LoadName += ".unpack";
- SmallString<16> EltName = Name;
- EltName += ".elt";
+ auto Align = LI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(ST);
+
auto *Addr = LI.getPointerOperand();
- Value *V = UndefValue::get(T);
- auto *IdxType = Type::getInt32Ty(ST->getContext());
+ auto *IdxType = Type::getInt32Ty(T->getContext());
auto *Zero = ConstantInt::get(IdxType, 0);
- for (unsigned i = 0; i < Count; i++) {
+
+ Value *V = UndefValue::get(T);
+ for (unsigned i = 0; i < NumElements; i++) {
Value *Indices[2] = {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
- auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
- LoadName);
+ auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+ auto *L = IC.Builder->CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
V = IC.Builder->CreateInsertValue(V, L, i);
}
V->setName(Name);
- return IC.ReplaceInstUsesWith(LI, V);
+ return IC.replaceInstUsesWith(LI, V);
}
if (auto *AT = dyn_cast<ArrayType>(T)) {
- // If the array only have one element, we unpack.
- if (AT->getNumElements() == 1) {
- LoadInst *NewLoad = combineLoadToNewType(IC, LI, AT->getElementType(),
- ".unpack");
- return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, LI.getName()));
+ auto *ET = AT->getElementType();
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
+ return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
}
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(ET);
+ auto Align = LI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder->CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder->CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
+ Name + ".unpack");
+ V = IC.Builder->CreateInsertValue(V, L, i);
+ Offset += EltSize;
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
}
return nullptr;
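
The per-element loads created above use MinAlign(Align, Offset) so that each element claims only the alignment both the original access and its byte offset can guarantee, i.e. the largest power of two dividing both. A minimal sketch with a local stand-in for that helper (minAlign below is illustrative, not LLVM's implementation):

#include <cassert>
#include <cstdint>

static uint64_t minAlign(uint64_t Align, uint64_t Offset) {
  const uint64_t Bits = Align | Offset;
  return Bits & (~Bits + 1);  // lowest set bit == largest common power of two
}

int main() {
  // A 16-byte aligned struct laid out as { i32, i32, i64 }: offsets 0, 4, 8.
  assert(minAlign(16, 0) == 16);  // first element keeps the full alignment
  assert(minAlign(16, 4) == 4);   // the i32 at offset 4 is only 4-byte aligned
  assert(minAlign(16, 8) == 8);   // the i64 at offset 8 is 8-byte aligned
  return 0;
}
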
@@ -610,7 +642,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
}
if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
return false;
Worklist.push_back(GA->getAliasee());
continue;
@@ -638,7 +670,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
return false;
- uint64_t InitSize = DL.getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
if (InitSize > MaxSize)
return false;
continue;
@@ -695,10 +727,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
return false;
SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
- Type *AllocTy = GetElementPtrInst::getIndexedType(
- cast<PointerType>(GEPI->getOperand(0)->getType()->getScalarType())
- ->getElementType(),
- Ops);
+ Type *AllocTy =
+ GetElementPtrInst::getIndexedType(GEPI->getSourceElementType(), Ops);
if (!AllocTy || !AllocTy->isSized())
return false;
const DataLayout &DL = IC.getDataLayout();
@@ -781,10 +811,6 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return &LI;
}
- // None of the following transforms are legal for volatile/atomic loads.
- // FIXME: Some of it is okay for atomic loads; needs refactoring.
- if (!LI.isSimple()) return nullptr;
-
if (Instruction *Res = unpackLoadToAggregate(*this, LI))
return Res;
@@ -793,10 +819,12 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
BasicBlock::iterator BBI(LI);
AAMDNodes AATags;
+ bool IsLoadCSE = false;
if (Value *AvailableVal =
- FindAvailableLoadedValue(Op, LI.getParent(), BBI,
- DefMaxInstsToScan, AA, &AATags)) {
- if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) {
+ FindAvailableLoadedValue(&LI, LI.getParent(), BBI,
+ DefMaxInstsToScan, AA, &AATags, &IsLoadCSE)) {
+ if (IsLoadCSE) {
+ LoadInst *NLI = cast<LoadInst>(AvailableVal);
unsigned KnownIDs[] = {
LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias, LLVMContext::MD_range,
@@ -807,11 +835,15 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
combineMetadata(NLI, &LI, KnownIDs);
};
- return ReplaceInstUsesWith(
+ return replaceInstUsesWith(
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
LI.getName() + ".cast"));
}
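  // As an illustration (hypothetical snippet): when the backwards scan finds an
  // earlier load of the same pointer, e.g.
  //   %a = load i32, i32* %p
  //   %b = load i32, i32* %p
  // %b is replaced by %a and the intersectable metadata of the two loads
  // (tbaa, range, alias scopes, ...) is combined onto %a; a value forwarded
  // from a store needs no such metadata merge, which is what the IsLoadCSE
  // flag distinguishes.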
+ // None of the following transforms are legal for volatile/ordered atomic
+ // loads. Most of them do apply for unordered atomics.
+ if (!LI.isUnordered()) return nullptr;
+
// load(gep null, ...) -> unreachable
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
const Value *GEPI0 = GEPI->getOperand(0);
@@ -823,7 +855,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// CFG.
new StoreInst(UndefValue::get(LI.getType()),
Constant::getNullValue(Op->getType()), &LI);
- return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
}
}
@@ -836,7 +868,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// unreachable instruction directly because we cannot modify the CFG.
new StoreInst(UndefValue::get(LI.getType()),
Constant::getNullValue(Op->getType()), &LI);
- return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
}
if (Op->hasOneUse()) {
@@ -853,14 +885,17 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
// load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
unsigned Align = LI.getAlignment();
- if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align) &&
- isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align)) {
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
SI->getOperand(1)->getName()+".val");
LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
SI->getOperand(2)->getName()+".val");
+ assert(LI.isUnordered() && "implied by above");
V1->setAlignment(Align);
+ V1->setAtomic(LI.getOrdering(), LI.getSynchScope());
V2->setAlignment(Align);
+ V2->setAtomic(LI.getOrdering(), LI.getSynchScope());
return SelectInst::Create(SI->getCondition(), V1, V2);
}
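      // A minimal sketch of the fold above (hypothetical operands):
      //   %addr = select i1 %c, i32* %a, i32* %b
      //   %v = load i32, i32* %addr, align 4
      // becomes, when both %a and %b are safe to load unconditionally:
      //   %a.val = load i32, i32* %a, align 4
      //   %b.val = load i32, i32* %b, align 4
      //   %v = select i1 %c, i32 %a.val, i32 %b.val
      // and any unordered atomic ordering on the original load is copied to
      // the two new loads.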
@@ -882,6 +917,61 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return nullptr;
}
+/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+/// %E0 = extractelement <2 x double> %U, i32 0
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0
+/// %E1 = extractelement <2 x double> %U, i32 1
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
+static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
+ Value *U = nullptr;
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+ if (!E)
+ return nullptr;
+ auto *W = E->getVectorOperand();
+ if (!U)
+ U = W;
+ else if (U != W)
+ return nullptr;
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+ return nullptr;
+ V = IV->getAggregateOperand();
+ }
+  if (!isa<UndefValue>(V) || !U)
+ return nullptr;
+
+ auto *UT = cast<VectorType>(U->getType());
+ auto *VT = V->getType();
+ // Check that types UT and VT are bitwise isomorphic.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
+ return nullptr;
+ }
+ if (auto *AT = dyn_cast<ArrayType>(VT)) {
+ if (AT->getNumElements() != UT->getNumElements())
+ return nullptr;
+ } else {
+ auto *ST = cast<StructType>(VT);
+ if (ST->getNumElements() != UT->getNumElements())
+ return nullptr;
+ for (const auto *EltT : ST->elements()) {
+ if (EltT != UT->getElementType())
+ return nullptr;
+ }
+ }
+ return U;
+}
+
/// \brief Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
@@ -903,9 +993,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
/// the store instruction as otherwise there is no way to signal whether it was
/// combined or not: IC.EraseInstFromFunction returns a null pointer.
static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // stores here but it isn't clear that this is important.
- if (!SI.isSimple())
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic stores here but it isn't clear that this is important.
+ if (!SI.isUnordered())
return false;
Value *V = SI.getValueOperand();
@@ -917,8 +1007,13 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
return true;
}
- // FIXME: We should also canonicalize loads of vectors when their elements are
- // cast to other types.
+ if (Value *U = likeBitCastFromVector(IC, V)) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
+
+ // FIXME: We should also canonicalize stores of vectors when their elements
+ // are cast to other types.
return false;
}
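// As a sketch of the new case (hypothetical values): a chain such as
//   %e0 = extractelement <2 x double> %u, i32 0
//   %v0 = insertvalue [2 x double] undef, double %e0, 0
//   %e1 = extractelement <2 x double> %u, i32 1
//   %v1 = insertvalue [2 x double] %v0, double %e1, 1
//   store [2 x double] %v1, [2 x double]* %p
// is recognized by likeBitCastFromVector as a layout-preserving "bitcast" of
// %u, so the store is rewritten to store %u itself through a suitably cast
// pointer.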
@@ -950,11 +1045,16 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
if (SL->hasPadding())
return false;
+ auto Align = SI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(ST);
+
SmallString<16> EltName = V->getName();
EltName += ".elt";
auto *Addr = SI.getPointerOperand();
SmallString<16> AddrName = Addr->getName();
AddrName += ".repack";
+
auto *IdxType = Type::getInt32Ty(ST->getContext());
auto *Zero = ConstantInt::get(IdxType, 0);
for (unsigned i = 0; i < Count; i++) {
@@ -962,9 +1062,11 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName);
+ auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ AddrName);
auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
- IC.Builder->CreateStore(Val, Ptr);
+ auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+ IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
}
return true;
@@ -972,11 +1074,43 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
if (auto *AT = dyn_cast<ArrayType>(T)) {
    // If the array only has one element, we unpack.
- if (AT->getNumElements() == 1) {
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
V = IC.Builder->CreateExtractValue(V, 0);
combineStoreToNewValue(IC, SI, V);
return true;
}
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(AT->getElementType());
+ auto Align = SI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder->CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
+ auto EltAlign = MinAlign(Align, Offset);
+ IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ Offset += EltSize;
+ }
+
+ return true;
}
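  // For a concrete (hypothetical) example of the loop above, a store
  //   store [4 x i16] %val, [4 x i16]* %p, align 8
  // is split into four element stores at offsets 0, 2, 4 and 6 with
  // alignments MinAlign(8, offset) = 8, 2, 4 and 2:
  //   %p.repack = getelementptr inbounds [4 x i16], [4 x i16]* %p, i64 0, i64 0
  //   %val.elt = extractvalue [4 x i16] %val, 0
  //   store i16 %val.elt, i16* %p.repack, align 8
  // and likewise for the remaining indices.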
return false;
@@ -1017,7 +1151,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// Try to canonicalize the stored type.
if (combineStoreToValueType(*this, SI))
- return EraseInstFromFunction(SI);
+ return eraseInstFromFunction(SI);
// Attempt to improve the alignment.
unsigned KnownAlign = getOrEnforceKnownAlignment(
@@ -1033,7 +1167,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// Try to canonicalize the stored type.
if (unpackStoreToAggregate(*this, SI))
- return EraseInstFromFunction(SI);
+ return eraseInstFromFunction(SI);
// Replace GEP indices if possible.
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
@@ -1049,11 +1183,11 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// alloca dead.
if (Ptr->hasOneUse()) {
if (isa<AllocaInst>(Ptr))
- return EraseInstFromFunction(SI);
+ return eraseInstFromFunction(SI);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
if (isa<AllocaInst>(GEP->getOperand(0))) {
if (GEP->getOperand(0)->hasOneUse())
- return EraseInstFromFunction(SI);
+ return eraseInstFromFunction(SI);
}
}
}
@@ -1079,7 +1213,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
SI.getOperand(1))) {
++NumDeadStore;
++BBI;
- EraseInstFromFunction(*PrevSI);
+ eraseInstFromFunction(*PrevSI);
continue;
}
break;
@@ -1091,7 +1225,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
assert(SI.isUnordered() && "can't eliminate ordering operation");
- return EraseInstFromFunction(SI);
+ return eraseInstFromFunction(SI);
}
// Otherwise, this is a load from some other location. Stores before it
@@ -1116,11 +1250,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// store undef, Ptr -> noop
if (isa<UndefValue>(Val))
- return EraseInstFromFunction(SI);
-
- // The code below needs to be audited and adjusted for unordered atomics
- if (!SI.isSimple())
- return nullptr;
+ return eraseInstFromFunction(SI);
// If this store is the last instruction in the basic block (possibly
// excepting debug info instructions), and if the block ends with an
@@ -1147,6 +1277,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
/// into a phi node with a store in the successor.
///
bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+ assert(SI.isUnordered() &&
+         "this code has not been audited for the volatile or ordered store case");
+
BasicBlock *StoreBB = SI.getParent();
// Check to see if the successor block has exactly two incoming edges. If
@@ -1268,7 +1401,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
}
// Nuke the old stores.
- EraseInstFromFunction(SI);
- EraseInstFromFunction(*OtherStore);
+ eraseInstFromFunction(SI);
+ eraseInstFromFunction(*OtherStore);
return true;
}
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 160792b0a000..788097f33f12 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -45,28 +45,28 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
// (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
// inexact. Similarly for <<.
- if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
- if (I->isLogicalShift() &&
- isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0,
- IC.getAssumptionCache(), &CxtI,
- IC.getDominatorTree())) {
- // We know that this is an exact/nuw shift and that the input is a
- // non-zero context as well.
- if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
- I->setOperand(0, V2);
- MadeChange = true;
- }
+ BinaryOperator *I = dyn_cast<BinaryOperator>(V);
+ if (I && I->isLogicalShift() &&
+ isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0,
+ IC.getAssumptionCache(), &CxtI,
+ IC.getDominatorTree())) {
+ // We know that this is an exact/nuw shift and that the input is a
+ // non-zero context as well.
+ if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
+ I->setOperand(0, V2);
+ MadeChange = true;
+ }
- if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
- I->setIsExact();
- MadeChange = true;
- }
+ if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+ I->setIsExact();
+ MadeChange = true;
+ }
- if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
- I->setHasNoUnsignedWrap();
- MadeChange = true;
- }
+ if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+ I->setHasNoUnsignedWrap();
+ MadeChange = true;
}
+ }
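  // For intuition (hypothetical values): this function is only called when V
  // is known to be non-zero in this context, so for
  //   %v = lshr i32 %pow2, %n   ; %pow2 known to be a power of two
  // no set bit can have been shifted out (the result would otherwise be zero),
  // and the lshr can be marked 'exact'; the analogous shl case gains 'nuw'.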
// TODO: Lots more we could do here:
// If V is a phi node, we can call this on each of its operands.
@@ -177,13 +177,13 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyUsingDistributiveLaws(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// X * -1 == 0 - X
if (match(Op1, m_AllOnes())) {
@@ -323,7 +323,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
if (SDiv->isExact()) {
if (Op1BO == Op1C)
- return ReplaceInstUsesWith(I, Op0BO);
+ return replaceInstUsesWith(I, Op0BO);
return BinaryOperator::CreateNeg(Op0BO);
}
@@ -374,10 +374,13 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
Value *BoolCast = nullptr, *OtherOp = nullptr;
- if (MaskedValueIsZero(Op0, Negative2, 0, &I))
- BoolCast = Op0, OtherOp = Op1;
- else if (MaskedValueIsZero(Op1, Negative2, 0, &I))
- BoolCast = Op1, OtherOp = Op0;
+ if (MaskedValueIsZero(Op0, Negative2, 0, &I)) {
+ BoolCast = Op0;
+ OtherOp = Op1;
+ } else if (MaskedValueIsZero(Op1, Negative2, 0, &I)) {
+ BoolCast = Op1;
+ OtherOp = Op0;
+ }
if (BoolCast) {
Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()),
@@ -536,14 +539,14 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (isa<Constant>(Op0))
std::swap(Op0, Op1);
if (Value *V =
SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
bool AllowReassociate = I.hasUnsafeAlgebra();
@@ -574,7 +577,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
// Try to simplify "MDC * Constant"
if (isFMulOrFDivWithConstant(Op0))
if (Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
Instruction *FAddSub = dyn_cast<Instruction>(Op0);
@@ -612,11 +615,22 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
}
}
- // sqrt(X) * sqrt(X) -> X
- if (AllowReassociate && (Op0 == Op1))
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0))
- if (II->getIntrinsicID() == Intrinsic::sqrt)
- return ReplaceInstUsesWith(I, II->getOperand(0));
+ if (Op0 == Op1) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
+ // sqrt(X) * sqrt(X) -> X
+ if (AllowReassociate && II->getIntrinsicID() == Intrinsic::sqrt)
+ return replaceInstUsesWith(I, II->getOperand(0));
+
+ // fabs(X) * fabs(X) -> X * X
+ if (II->getIntrinsicID() == Intrinsic::fabs) {
+ Instruction *FMulVal = BinaryOperator::CreateFMul(II->getOperand(0),
+ II->getOperand(0),
+ I.getName());
+ FMulVal->copyFastMathFlags(&I);
+ return FMulVal;
+ }
+ }
+ }
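  // A sketch of the two same-operand folds (hypothetical value %x):
  //   %s = call double @llvm.sqrt.f64(double %x)
  //   %m = fmul fast double %s, %s   ; reassociation allowed -> replaced by %x
  //   %f = call double @llvm.fabs.f64(double %x)
  //   %n = fmul double %f, %f        ; -> fmul double %x, %x, with the
  //                                  ;    original fmul's FMF copied over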
// Under unsafe algebra do:
// X * log2(0.5*Y) = X*log2(Y) - X
@@ -641,7 +655,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Value *FMulVal = Builder->CreateFMul(OpX, Log2);
Value *FSub = Builder->CreateFSub(FMulVal, OpX);
FSub->takeName(&I);
- return ReplaceInstUsesWith(I, FSub);
+ return replaceInstUsesWith(I, FSub);
}
}
@@ -661,7 +675,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (N1) {
Value *FMul = Builder->CreateFMul(N0, N1);
FMul->takeName(&I);
- return ReplaceInstUsesWith(I, FMul);
+ return replaceInstUsesWith(I, FMul);
}
if (Opnd0->hasOneUse()) {
@@ -669,7 +683,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Value *T = Builder->CreateFMul(N0, Opnd1);
Value *Neg = Builder->CreateFNeg(T);
Neg->takeName(&I);
- return ReplaceInstUsesWith(I, Neg);
+ return replaceInstUsesWith(I, Neg);
}
}
@@ -698,7 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Value *R = Builder->CreateFMul(T, Y);
R->takeName(&I);
- return ReplaceInstUsesWith(I, R);
+ return replaceInstUsesWith(I, R);
}
}
}
@@ -1043,10 +1057,10 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
@@ -1116,27 +1130,43 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Handle the integer div common cases
if (Instruction *Common = commonIDivTransforms(I))
return Common;
- // sdiv X, -1 == -X
- if (match(Op1, m_AllOnes()))
- return BinaryOperator::CreateNeg(Op0);
+ const APInt *Op1C;
+ if (match(Op1, m_APInt(Op1C))) {
+ // sdiv X, -1 == -X
+ if (Op1C->isAllOnesValue())
+ return BinaryOperator::CreateNeg(Op0);
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- // sdiv X, C --> ashr exact X, log2(C)
- if (I.isExact() && RHS->getValue().isNonNegative() &&
- RHS->getValue().isPowerOf2()) {
- Value *ShAmt = llvm::ConstantInt::get(RHS->getType(),
- RHS->getValue().exactLogBase2());
+ // sdiv exact X, C --> ashr exact X, log2(C)
+ if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) {
+ Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2());
return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName());
}
+
+ // If the dividend is sign-extended and the constant divisor is small enough
+ // to fit in the source type, shrink the division to the narrower type:
+ // (sext X) sdiv C --> sext (X sdiv C)
+ Value *Op0Src;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
+ Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
+
+ // In the general case, we need to make sure that the dividend is not the
+ // minimum signed value because dividing that by -1 is UB. But here, we
+ // know that the -1 divisor case is already handled above.
+
+ Constant *NarrowDivisor =
+ ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
+ Value *NarrowOp = Builder->CreateSDiv(Op0Src, NarrowDivisor);
+ return new SExtInst(NarrowOp, Op0->getType());
+ }
}
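  // A small sketch of the narrowing transform (hypothetical types/constant):
  //   %s = sext i16 %x to i32
  //   %d = sdiv i32 %s, 42          ; %s has one use, 42 fits in i16
  // becomes
  //   %d1 = sdiv i16 %x, 42
  //   %d = sext i16 %d1 to i32
  // The -1 divisor was already folded to a negation above, so the narrow sdiv
  // cannot hit the INT_MIN / -1 overflow case.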
if (Constant *RHS = dyn_cast<Constant>(Op1)) {
@@ -1214,11 +1244,11 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
@@ -1363,8 +1393,17 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
} else if (isa<PHINode>(Op0I)) {
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
+ using namespace llvm::PatternMatch;
+ const APInt *Op1Int;
+ if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
+ (I.getOpcode() == Instruction::URem ||
+ !Op1Int->isMinSignedValue())) {
+ // FoldOpIntoPhi will speculate instructions to the end of the PHI's
+ // predecessor blocks, so do this only if we know the srem or urem
+ // will not fault.
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
+ }
}
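    // For example (hypothetical constant), a rem such as
    //   %r = srem i32 %phi, 7
    // may be folded into the phi's predecessor blocks because a speculated
    // srem by the constant 7 can never fault; a zero divisor, for instance,
    // keeps the fold from firing.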
// See if we can fold away this rem instruction.
@@ -1380,10 +1419,10 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Instruction *common = commonIRemTransforms(I))
return common;
@@ -1405,7 +1444,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
if (match(Op0, m_One())) {
Value *Cmp = Builder->CreateICmpNE(Op1, Op0);
Value *Ext = Builder->CreateZExt(Cmp, I.getType());
- return ReplaceInstUsesWith(I, Ext);
+ return replaceInstUsesWith(I, Ext);
}
return nullptr;
@@ -1415,10 +1454,10 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Handle the integer rem common cases
if (Instruction *Common = commonIRemTransforms(I))
@@ -1490,11 +1529,11 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
// Handle cases involving: rem X, (select Cond, Y, Z)
if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index f1aa98b5e359..79a4912332ff 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -15,8 +15,11 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
@@ -32,15 +35,6 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
Type *LHSType = LHSVal->getType();
Type *RHSType = RHSVal->getType();
- bool isNUW = false, isNSW = false, isExact = false;
- if (OverflowingBinaryOperator *BO =
- dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
- isNUW = BO->hasNoUnsignedWrap();
- isNSW = BO->hasNoSignedWrap();
- } else if (PossiblyExactOperator *PEO =
- dyn_cast<PossiblyExactOperator>(FirstInst))
- isExact = PEO->isExact();
-
// Scan to see if all operands are the same opcode, and all have one use.
for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
@@ -56,13 +50,6 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
return nullptr;
- if (isNUW)
- isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
- if (isNSW)
- isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- if (isExact)
- isExact = cast<PossiblyExactOperator>(I)->isExact();
-
// Keep track of which operand needs a phi node.
if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
@@ -121,9 +108,12 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
BinaryOperator *NewBinOp =
BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
- if (isNUW) NewBinOp->setHasNoUnsignedWrap();
- if (isNSW) NewBinOp->setHasNoSignedWrap();
- if (isExact) NewBinOp->setIsExact();
+
+ NewBinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ NewBinOp->andIRFlags(PN.getIncomingValue(i));
+
NewBinOp->setDebugLoc(FirstInst->getDebugLoc());
return NewBinOp;
}
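  // The IR-flag handling above intersects flags across all incoming values.
  // Sketch (hypothetical incoming instructions):
  //   %a = add nuw nsw i32 %x, %c   ; from one predecessor
  //   %b = add nuw i32 %y, %c       ; from the other
  // The merged add created for the phi keeps only nuw, the flag common to
  // both incoming operations.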
@@ -494,7 +484,6 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
// code size and simplifying code.
Constant *ConstantOp = nullptr;
Type *CastSrcTy = nullptr;
- bool isNUW = false, isNSW = false, isExact = false;
if (isa<CastInst>(FirstInst)) {
CastSrcTy = FirstInst->getOperand(0)->getType();
@@ -511,14 +500,6 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
if (!ConstantOp)
return FoldPHIArgBinOpIntoPHI(PN);
-
- if (OverflowingBinaryOperator *BO =
- dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
- isNUW = BO->hasNoUnsignedWrap();
- isNSW = BO->hasNoSignedWrap();
- } else if (PossiblyExactOperator *PEO =
- dyn_cast<PossiblyExactOperator>(FirstInst))
- isExact = PEO->isExact();
} else {
return nullptr; // Cannot fold this operation.
}
@@ -534,13 +515,6 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
} else if (I->getOperand(1) != ConstantOp) {
return nullptr;
}
-
- if (isNUW)
- isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
- if (isNSW)
- isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- if (isExact)
- isExact = cast<PossiblyExactOperator>(I)->isExact();
}
// Okay, they are all the same operation. Create a new PHI node of the
@@ -581,9 +555,11 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
- if (isNUW) BinOp->setHasNoUnsignedWrap();
- if (isNSW) BinOp->setHasNoSignedWrap();
- if (isExact) BinOp->setIsExact();
+ BinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ BinOp->andIRFlags(PN.getIncomingValue(i));
+
BinOp->setDebugLoc(FirstInst->getDebugLoc());
return BinOp;
}
@@ -641,6 +617,16 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
return true;
}
+/// Return an existing non-zero constant if this phi node has one, otherwise
+/// return constant 1.
+static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
+  assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
+ for (Value *V : PN.operands())
+ if (auto *ConstVA = dyn_cast<ConstantInt>(V))
+ if (!ConstVA->isZeroValue())
+ return ConstVA;
+ return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
+}
namespace {
struct PHIUsageRecord {
@@ -768,7 +754,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// If we have no users, they must be all self uses, just nuke the PHI.
if (PHIUsers.empty())
- return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+ return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
// If this phi node is transformable, create new PHIs for all the pieces
// extracted out of it. First, sort the users by their offset and size.
@@ -864,22 +850,22 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
}
// Replace the use of this piece with the PHI node.
- ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+ replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
}
// Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
// with undefs.
Value *Undef = UndefValue::get(FirstPhi.getType());
for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- ReplaceInstUsesWith(*PHIsToSlice[i], Undef);
- return ReplaceInstUsesWith(FirstPhi, Undef);
+ replaceInstUsesWith(*PHIsToSlice[i], Undef);
+ return replaceInstUsesWith(FirstPhi, Undef);
}
// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(PN, V);
+ return replaceInstUsesWith(PN, V);
if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN))
return Result;
@@ -905,7 +891,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
PotentiallyDeadPHIs.insert(&PN);
if (DeadPHICycle(PU, PotentiallyDeadPHIs))
- return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
}
// If this phi has a single use, and if that use just computes a value for
@@ -917,7 +903,30 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
if (PHIUser->hasOneUse() &&
(isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
PHIUser->user_back() == &PN) {
- return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+ // When a PHI is used only to be compared with zero, it is safe to replace
+    // an incoming value that is known to be non-zero with any non-zero constant.
+ // For example, in the code below, the incoming value %v can be replaced
+ // with any non-zero constant based on the fact that the PHI is only used to
+ // be compared with zero and %v is a known non-zero value:
+ // %v = select %cond, 1, 2
+ // %p = phi [%v, BB] ...
+ // icmp eq, %p, 0
+ auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
+ // FIXME: To be simple, handle only integer type for now.
+ if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
+ match(CmpInst->getOperand(1), m_Zero())) {
+ ConstantInt *NonZeroConst = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
+ Value *VA = PN.getIncomingValue(i);
+ if (isKnownNonZero(VA, DL, 0, AC, CtxI, DT)) {
+ if (!NonZeroConst)
+ NonZeroConst = GetAnyNonZeroConstInt(PN);
+ PN.setIncomingValue(i, NonZeroConst);
+ }
+ }
}
}
@@ -951,7 +960,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
if (InValNo == NumIncomingVals) {
SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
- return ReplaceInstUsesWith(PN, NonPhiInVal);
+ return replaceInstUsesWith(PN, NonPhiInVal);
}
}
}
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 51219bcb0b7b..d7eed790e2ab 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -116,25 +116,41 @@ static Constant *GetSelectFoldableConstant(Instruction *I) {
}
}
-/// Here we have (select c, TI, FI), and we know that TI and FI
-/// have the same opcode and only one use each. Try to simplify this.
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
- if (TI->getNumOperands() == 1) {
- // If this is a non-volatile load or a cast from the same type,
- // merge.
- if (TI->isCast()) {
- Type *FIOpndTy = FI->getOperand(0)->getType();
- if (TI->getOperand(0)->getType() != FIOpndTy)
+ // If this is a cast from the same type, merge.
+ if (TI->getNumOperands() == 1 && TI->isCast()) {
+ Type *FIOpndTy = FI->getOperand(0)->getType();
+ if (TI->getOperand(0)->getType() != FIOpndTy)
+ return nullptr;
+
+ // The select condition may be a vector. We may only change the operand
+ // type if the vector width remains the same (and matches the condition).
+ Type *CondTy = SI.getCondition()->getType();
+ if (CondTy->isVectorTy()) {
+ if (!FIOpndTy->isVectorTy())
return nullptr;
- // The select condition may be a vector. We may only change the operand
- // type if the vector width remains the same (and matches the condition).
- Type *CondTy = SI.getCondition()->getType();
- if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() ||
- CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
+ if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
return nullptr;
- } else {
- return nullptr; // unknown unary op.
+
+ // TODO: If the backend knew how to deal with casts better, we could
+ // remove this limitation. For now, there's too much potential to create
+ // worse codegen by promoting the select ahead of size-altering casts
+ // (PR28160).
+ //
+ // Note that ValueTracking's matchSelectPattern() looks through casts
+ // without checking 'hasOneUse' when it matches min/max patterns, so this
+ // transform may end up happening anyway.
+ if (TI->getOpcode() != Instruction::BitCast &&
+ (!TI->hasOneUse() || !FI->hasOneUse()))
+ return nullptr;
+
+ } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+ // TODO: The one-use restrictions for a scalar select could be eased if
+ // the fold of a select in visitLoadInst() was enhanced to match a pattern
+ // that includes a cast.
+ return nullptr;
}
// Fold this by inserting a select from the input values.
@@ -144,8 +160,13 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
TI->getType());
}
- // Only handle binary operators here.
- if (!isa<BinaryOperator>(TI))
+ // TODO: This function ends awkwardly in unreachable - fix to be more normal.
+
+ // Only handle binary operators with one-use here. As with the cast case
+ // above, it may be possible to relax the one-use constraint, but that needs
+  // to be examined carefully since it may not reduce the total number of
+ // instructions.
+ if (!isa<BinaryOperator>(TI) || !TI->hasOneUse() || !FI->hasOneUse())
return nullptr;
// Figure out if the operations have any operands in common.
@@ -231,12 +252,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
FalseVal, NewSel);
- if (isa<PossiblyExactOperator>(BO))
- BO->setIsExact(TVI_BO->isExact());
- if (isa<OverflowingBinaryOperator>(BO)) {
- BO->setHasNoUnsignedWrap(TVI_BO->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(TVI_BO->hasNoSignedWrap());
- }
+ BO->copyIRFlags(TVI_BO);
return BO;
}
}
@@ -266,12 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
TrueVal, NewSel);
- if (isa<PossiblyExactOperator>(BO))
- BO->setIsExact(FVI_BO->isExact());
- if (isa<OverflowingBinaryOperator>(BO)) {
- BO->setHasNoUnsignedWrap(FVI_BO->hasNoUnsignedWrap());
- BO->setHasNoSignedWrap(FVI_BO->hasNoSignedWrap());
- }
+ BO->copyIRFlags(FVI_BO);
return BO;
}
}
@@ -353,7 +364,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
/// %1 = icmp ne i32 %x, 0
/// %2 = select i1 %1, i32 %0, i32 32
/// \code
-///
+///
/// into:
/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
@@ -519,10 +530,10 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
// Check if we can express the operation with a single or.
if (C2->isAllOnesValue())
- return ReplaceInstUsesWith(SI, Builder->CreateOr(AShr, C1));
+ return replaceInstUsesWith(SI, Builder->CreateOr(AShr, C1));
Value *And = Builder->CreateAnd(AShr, C2->getValue()-C1->getValue());
- return ReplaceInstUsesWith(SI, Builder->CreateAdd(And, C1));
+ return replaceInstUsesWith(SI, Builder->CreateAdd(And, C1));
}
}
}
@@ -585,15 +596,15 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
V = Builder->CreateOr(X, *Y);
if (V)
- return ReplaceInstUsesWith(SI, V);
+ return replaceInstUsesWith(SI, V);
}
}
if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder))
- return ReplaceInstUsesWith(SI, V);
+ return replaceInstUsesWith(SI, V);
if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
- return ReplaceInstUsesWith(SI, V);
+ return replaceInstUsesWith(SI, V);
return Changed ? &SI : nullptr;
}
@@ -642,11 +653,14 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
Value *A, Value *B,
Instruction &Outer,
SelectPatternFlavor SPF2, Value *C) {
+ if (Outer.getType() != Inner->getType())
+ return nullptr;
+
if (C == A || C == B) {
// MAX(MAX(A, B), B) -> MAX(A, B)
// MIN(MIN(a, b), a) -> MIN(a, b)
if (SPF1 == SPF2)
- return ReplaceInstUsesWith(Outer, Inner);
+ return replaceInstUsesWith(Outer, Inner);
// MAX(MIN(a, b), a) -> a
// MIN(MAX(a, b), a) -> a
@@ -654,14 +668,14 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
(SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
(SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
(SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
- return ReplaceInstUsesWith(Outer, C);
+ return replaceInstUsesWith(Outer, C);
}
if (SPF1 == SPF2) {
if (ConstantInt *CB = dyn_cast<ConstantInt>(B)) {
if (ConstantInt *CC = dyn_cast<ConstantInt>(C)) {
- APInt ACB = CB->getValue();
- APInt ACC = CC->getValue();
+ const APInt &ACB = CB->getValue();
+ const APInt &ACC = CC->getValue();
// MIN(MIN(A, 23), 97) -> MIN(A, 23)
// MAX(MAX(A, 97), 23) -> MAX(A, 97)
@@ -669,7 +683,7 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
(SPF1 == SPF_SMIN && ACB.sle(ACC)) ||
(SPF1 == SPF_UMAX && ACB.uge(ACC)) ||
(SPF1 == SPF_SMAX && ACB.sge(ACC)))
- return ReplaceInstUsesWith(Outer, Inner);
+ return replaceInstUsesWith(Outer, Inner);
// MIN(MIN(A, 97), 23) -> MIN(A, 23)
// MAX(MAX(A, 23), 97) -> MAX(A, 97)
@@ -687,7 +701,7 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
// ABS(ABS(X)) -> ABS(X)
// NABS(NABS(X)) -> NABS(X)
if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
- return ReplaceInstUsesWith(Outer, Inner);
+ return replaceInstUsesWith(Outer, Inner);
}
// ABS(NABS(X)) -> ABS(X)
@@ -697,7 +711,7 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
SelectInst *SI = cast<SelectInst>(Inner);
Value *NewSI = Builder->CreateSelect(
SI->getCondition(), SI->getFalseValue(), SI->getTrueValue());
- return ReplaceInstUsesWith(Outer, NewSI);
+ return replaceInstUsesWith(Outer, NewSI);
}
auto IsFreeOrProfitableToInvert =
@@ -742,7 +756,7 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB);
Value *NewOuter = Builder->CreateNot(generateMinMaxSelectPattern(
Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC));
- return ReplaceInstUsesWith(Outer, NewOuter);
+ return replaceInstUsesWith(Outer, NewOuter);
}
return nullptr;
@@ -823,76 +837,156 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
return V;
}
+/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
+/// This is even legal for FP.
+static Instruction *foldAddSubSelect(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ Instruction *AddOp = nullptr, *SubOp = nullptr;
+ if ((TI->getOpcode() == Instruction::Sub &&
+ FI->getOpcode() == Instruction::Add) ||
+ (TI->getOpcode() == Instruction::FSub &&
+ FI->getOpcode() == Instruction::FAdd)) {
+ AddOp = FI;
+ SubOp = TI;
+ } else if ((FI->getOpcode() == Instruction::Sub &&
+ TI->getOpcode() == Instruction::Add) ||
+ (FI->getOpcode() == Instruction::FSub &&
+ TI->getOpcode() == Instruction::FAdd)) {
+ AddOp = TI;
+ SubOp = FI;
+ }
+
+ if (AddOp) {
+ Value *OtherAddOp = nullptr;
+ if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+ OtherAddOp = AddOp->getOperand(1);
+ } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+ OtherAddOp = AddOp->getOperand(0);
+ }
+
+ if (OtherAddOp) {
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
+ if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ NegInst->setFastMathFlags(Flags);
+ }
+ } else {
+ NegVal = Builder.CreateNeg(SubOp->getOperand(1));
+ }
+
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
+ SI.getName() + ".p");
+
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ Instruction *RI =
+ BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ return RI;
+ } else
+ return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+ }
+ }
+ return nullptr;
+}
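// A short sketch of the fold (hypothetical operands):
//   %t = add i32 %x, %y
//   %f = sub i32 %x, %z
//   %r = select i1 %c, i32 %t, i32 %f
// becomes
//   %neg = sub i32 0, %z
//   %r.p = select i1 %c, i32 %y, i32 %neg
//   %r = add i32 %x, %r.p
// In the FP form an fneg is used instead and the intersection of the original
// fadd/fsub fast-math flags is applied to the new instructions.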
+
Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
Value *FalseVal = SI.getFalseValue();
+ Type *SelType = SI.getType();
if (Value *V =
SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(SI, V);
+ return replaceInstUsesWith(SI, V);
- if (SI.getType()->isIntegerTy(1)) {
- if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
- if (C->getZExtValue()) {
- // Change: A = select B, true, C --> A = or B, C
- return BinaryOperator::CreateOr(CondVal, FalseVal);
- }
+ if (SelType->getScalarType()->isIntegerTy(1) &&
+ TrueVal->getType() == CondVal->getType()) {
+ if (match(TrueVal, m_One())) {
+ // Change: A = select B, true, C --> A = or B, C
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ }
+ if (match(TrueVal, m_Zero())) {
// Change: A = select B, false, C --> A = and !B, C
- Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+ Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
return BinaryOperator::CreateAnd(NotCond, FalseVal);
}
- if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
- if (!C->getZExtValue()) {
- // Change: A = select B, C, false --> A = and B, C
- return BinaryOperator::CreateAnd(CondVal, TrueVal);
- }
+ if (match(FalseVal, m_Zero())) {
+ // Change: A = select B, C, false --> A = and B, C
+ return BinaryOperator::CreateAnd(CondVal, TrueVal);
+ }
+ if (match(FalseVal, m_One())) {
// Change: A = select B, C, true --> A = or !B, C
- Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+ Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
return BinaryOperator::CreateOr(NotCond, TrueVal);
}
- // select a, b, a -> a&b
- // select a, a, b -> a|b
+ // select a, a, b -> a | b
+ // select a, b, a -> a & b
if (CondVal == TrueVal)
return BinaryOperator::CreateOr(CondVal, FalseVal);
if (CondVal == FalseVal)
return BinaryOperator::CreateAnd(CondVal, TrueVal);
- // select a, ~a, b -> (~a)&b
- // select a, b, ~a -> (~a)|b
+ // select a, ~a, b -> (~a) & b
+ // select a, b, ~a -> (~a) | b
if (match(TrueVal, m_Not(m_Specific(CondVal))))
return BinaryOperator::CreateAnd(TrueVal, FalseVal);
if (match(FalseVal, m_Not(m_Specific(CondVal))))
return BinaryOperator::CreateOr(TrueVal, FalseVal);
}
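  // With the scalar-type check above, these folds now also apply to vector
  // selects of i1 vectors, e.g. (hypothetical operands):
  //   %r = select <2 x i1> %b, <2 x i1> <i1 true, i1 true>, <2 x i1> %c
  // becomes
  //   %r = or <2 x i1> %b, %c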
- // Selecting between two integer constants?
- if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
- if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
- // select C, 1, 0 -> zext C to int
- if (FalseValC->isZero() && TrueValC->getValue() == 1)
- return new ZExtInst(CondVal, SI.getType());
-
- // select C, -1, 0 -> sext C to int
- if (FalseValC->isZero() && TrueValC->isAllOnesValue())
- return new SExtInst(CondVal, SI.getType());
-
- // select C, 0, 1 -> zext !C to int
- if (TrueValC->isZero() && FalseValC->getValue() == 1) {
- Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
- return new ZExtInst(NotCond, SI.getType());
- }
+ // Selecting between two integer or vector splat integer constants?
+ //
+ // Note that we don't handle a scalar select of vectors:
+ // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
+ // because that may need 3 instructions to splat the condition value:
+ // extend, insertelement, shufflevector.
+ if (CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
+ // select C, 1, 0 -> zext C to int
+ if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
+ return new ZExtInst(CondVal, SelType);
+
+ // select C, -1, 0 -> sext C to int
+ if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
+ return new SExtInst(CondVal, SelType);
+
+ // select C, 0, 1 -> zext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
+ Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ return new ZExtInst(NotCond, SelType);
+ }
- // select C, 0, -1 -> sext !C to int
- if (TrueValC->isZero() && FalseValC->isAllOnesValue()) {
- Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
- return new SExtInst(NotCond, SI.getType());
- }
+ // select C, 0, -1 -> sext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
+ Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ return new SExtInst(NotCond, SelType);
+ }
+ }
+ if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+ if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal))
if (Value *V = foldSelectICmpAnd(SI, TrueValC, FalseValC, Builder))
- return ReplaceInstUsesWith(SI, V);
- }
+ return replaceInstUsesWith(SI, V);
// See if we are selecting two values based on a comparison of the two values.
if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
@@ -907,7 +1001,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
!CFPt->getValueAPF().isZero()) ||
((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
!CFPf->getValueAPF().isZero()))
- return ReplaceInstUsesWith(SI, FalseVal);
+ return replaceInstUsesWith(SI, FalseVal);
}
// Transform (X une Y) ? X : Y -> X
if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
@@ -919,7 +1013,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
!CFPt->getValueAPF().isZero()) ||
((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
!CFPf->getValueAPF().isZero()))
- return ReplaceInstUsesWith(SI, TrueVal);
+ return replaceInstUsesWith(SI, TrueVal);
}
// Canonicalize to use ordered comparisons by swapping the select
@@ -950,7 +1044,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
!CFPt->getValueAPF().isZero()) ||
((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
!CFPf->getValueAPF().isZero()))
- return ReplaceInstUsesWith(SI, FalseVal);
+ return replaceInstUsesWith(SI, FalseVal);
}
// Transform (X une Y) ? Y : X -> Y
if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
@@ -962,7 +1056,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
!CFPt->getValueAPF().isZero()) ||
((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
!CFPf->getValueAPF().isZero()))
- return ReplaceInstUsesWith(SI, TrueVal);
+ return replaceInstUsesWith(SI, TrueVal);
}
// Canonicalize to use ordered comparisons by swapping the select
@@ -991,77 +1085,18 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
return Result;
- if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
- if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
- if (TI->hasOneUse() && FI->hasOneUse()) {
- Instruction *AddOp = nullptr, *SubOp = nullptr;
-
- // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
- if (TI->getOpcode() == FI->getOpcode())
- if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
- return IV;
-
- // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))). This is
- // even legal for FP.
- if ((TI->getOpcode() == Instruction::Sub &&
- FI->getOpcode() == Instruction::Add) ||
- (TI->getOpcode() == Instruction::FSub &&
- FI->getOpcode() == Instruction::FAdd)) {
- AddOp = FI; SubOp = TI;
- } else if ((FI->getOpcode() == Instruction::Sub &&
- TI->getOpcode() == Instruction::Add) ||
- (FI->getOpcode() == Instruction::FSub &&
- TI->getOpcode() == Instruction::FAdd)) {
- AddOp = TI; SubOp = FI;
- }
+ if (Instruction *Add = foldAddSubSelect(SI, *Builder))
+ return Add;
- if (AddOp) {
- Value *OtherAddOp = nullptr;
- if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
- OtherAddOp = AddOp->getOperand(1);
- } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
- OtherAddOp = AddOp->getOperand(0);
- }
-
- if (OtherAddOp) {
- // So at this point we know we have (Y -> OtherAddOp):
- // select C, (add X, Y), (sub X, Z)
- Value *NegVal; // Compute -Z
- if (SI.getType()->isFPOrFPVectorTy()) {
- NegVal = Builder->CreateFNeg(SubOp->getOperand(1));
- if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- NegInst->setFastMathFlags(Flags);
- }
- } else {
- NegVal = Builder->CreateNeg(SubOp->getOperand(1));
- }
-
- Value *NewTrueOp = OtherAddOp;
- Value *NewFalseOp = NegVal;
- if (AddOp != TI)
- std::swap(NewTrueOp, NewFalseOp);
- Value *NewSel =
- Builder->CreateSelect(CondVal, NewTrueOp,
- NewFalseOp, SI.getName() + ".p");
-
- if (SI.getType()->isFPOrFPVectorTy()) {
- Instruction *RI =
- BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
-
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- return RI;
- } else
- return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
- }
- }
- }
+ // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (TI && FI && TI->getOpcode() == FI->getOpcode())
+ if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+ return IV;
// See if we can fold the select into one of our operands.
- if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {
+ if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
return FoldI;
@@ -1073,7 +1108,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (SelectPatternResult::isMinOrMax(SPF)) {
// Canonicalize so that type casts are outside select patterns.
if (LHS->getType()->getPrimitiveSizeInBits() !=
- SI.getType()->getPrimitiveSizeInBits()) {
+ SelType->getPrimitiveSizeInBits()) {
CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered);
Value *Cmp;
@@ -1088,8 +1123,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *NewSI = Builder->CreateCast(CastOp,
Builder->CreateSelect(Cmp, LHS, RHS),
- SI.getType());
- return ReplaceInstUsesWith(SI, NewSI);
+ SelType);
+ return replaceInstUsesWith(SI, NewSI);
}
}
@@ -1132,7 +1167,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
: Builder->CreateICmpULT(NewLHS, NewRHS);
Value *NewSI =
Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS));
- return ReplaceInstUsesWith(SI, NewSI);
+ return replaceInstUsesWith(SI, NewSI);
}
}
}
@@ -1195,18 +1230,36 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return &SI;
}
- if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) {
+ if (VectorType* VecTy = dyn_cast<VectorType>(SelType)) {
unsigned VWidth = VecTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) {
if (V != &SI)
- return ReplaceInstUsesWith(SI, V);
+ return replaceInstUsesWith(SI, V);
return &SI;
}
if (isa<ConstantAggregateZero>(CondVal)) {
- return ReplaceInstUsesWith(SI, FalseVal);
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+ }
+
+ // See if we can determine the result of this select based on a dominating
+ // condition.
+ BasicBlock *Parent = SI.getParent();
+ if (BasicBlock *Dom = Parent->getSinglePredecessor()) {
+ auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
+ if (PBI && PBI->isConditional() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
+ (PBI->getSuccessor(0) == Parent || PBI->getSuccessor(1) == Parent)) {
+ bool CondIsFalse = PBI->getSuccessor(1) == Parent;
+ Optional<bool> Implication = isImpliedCondition(
+ PBI->getCondition(), SI.getCondition(), DL, CondIsFalse);
+ if (Implication) {
+ Value *V = *Implication ? TrueVal : FalseVal;
+ return replaceInstUsesWith(SI, V);
+ }
}
}
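  // A sketch of the dominating-condition fold (hypothetical CFG): if the
  // select's block has the single predecessor terminator
  //   br i1 %cond, label %then, label %else
  // and is reached through %then, then a select in it whose condition is
  // implied by %cond being true, e.g.
  //   %r = select i1 %cond, i32 %a, i32 %b
  // simplifies directly to %a (and to %b along the %else edge).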
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 0c7defa5fff8..08e16a7ee1af 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -55,6 +55,51 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
return nullptr;
}
+/// Return true if we can simplify two logical (either left or right) shifts
+/// that have constant shift amounts.
+static bool canEvaluateShiftedShift(unsigned FirstShiftAmt,
+ bool IsFirstShiftLeft,
+ Instruction *SecondShift, InstCombiner &IC,
+ Instruction *CxtI) {
+ assert(SecondShift->isLogicalShift() && "Unexpected instruction type");
+
+ // We need constant shifts.
+ auto *SecondShiftConst = dyn_cast<ConstantInt>(SecondShift->getOperand(1));
+ if (!SecondShiftConst)
+ return false;
+
+ unsigned SecondShiftAmt = SecondShiftConst->getZExtValue();
+ bool IsSecondShiftLeft = SecondShift->getOpcode() == Instruction::Shl;
+
+ // We can always fold shl(c1) + shl(c2) -> shl(c1+c2).
+ // We can always fold lshr(c1) + lshr(c2) -> lshr(c1+c2).
+ if (IsFirstShiftLeft == IsSecondShiftLeft)
+ return true;
+
+ // We can always fold lshr(c) + shl(c) -> and(c2).
+ // We can always fold shl(c) + lshr(c) -> and(c2).
+ if (FirstShiftAmt == SecondShiftAmt)
+ return true;
+
+ unsigned TypeWidth = SecondShift->getType()->getScalarSizeInBits();
+
+ // If the 2nd shift is bigger than the 1st, we can fold:
+ // lshr(c1) + shl(c2) -> shl(c3) + and(c4) or
+ // shl(c1) + lshr(c2) -> lshr(c3) + and(c4),
+ // but it isn't profitable unless we know the and'd out bits are already zero.
+ // Also check that the 2nd shift is valid (less than the type width) or we'll
+ // crash trying to produce the bit mask for the 'and'.
+ if (SecondShiftAmt > FirstShiftAmt && SecondShiftAmt < TypeWidth) {
+ unsigned MaskShift = IsSecondShiftLeft ? TypeWidth - SecondShiftAmt
+ : SecondShiftAmt - FirstShiftAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, FirstShiftAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(SecondShift->getOperand(0), Mask, 0, CxtI))
+ return true;
+ }
+
+ return false;
+}
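// For intuition (hypothetical i32 constants): with an outer request to shift
// right by 3, the following "second" shifts can be evaluated pre-shifted:
//   shl i32 %x, 3    ; equal amounts, opposite direction -> becomes an 'and'
//   lshr i32 %x, 5   ; same direction -> folds to a single lshr by 8
//   shl i32 %x, 7    ; larger, opposite direction -> only if the bits that the
//                    ; eventual 'and' would clear are already known zero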
+
/// See if we can compute the specified value, but shifted
/// logically to the left or right by some number of bits. This should return
/// true if the expression can be computed for the same cost as the current
@@ -67,7 +112,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
/// where the client will ask if E can be computed shifted right by 64-bits. If
/// this succeeds, the GetShiftedValue function will be called to produce the
/// value.
-static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
+static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
InstCombiner &IC, Instruction *CxtI) {
// We can always evaluate constants shifted.
if (isa<Constant>(V))
@@ -81,8 +126,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// the value which means that we don't care if the shift has multiple uses.
// TODO: Handle opposite shift by exact value.
ConstantInt *CI = nullptr;
- if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
- (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
+ if ((IsLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
+ (!IsLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
if (CI->getZExtValue() == NumBits) {
// TODO: Check that the input bits are already zero with MaskedValueIsZero
#if 0
@@ -111,64 +156,19 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
case Instruction::Or:
case Instruction::Xor:
    // Bitwise operators can all be arbitrarily evaluated shifted.
- return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC, I) &&
- CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC, I);
-
- case Instruction::Shl: {
- // We can often fold the shift into shifts-by-a-constant.
- CI = dyn_cast<ConstantInt>(I->getOperand(1));
- if (!CI) return false;
-
- // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
- if (isLeftShift) return true;
+ return CanEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ CanEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
- // We can always turn shl(c)+shr(c) -> and(c2).
- if (CI->getValue() == NumBits) return true;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
- unsigned TypeWidth = I->getType()->getScalarSizeInBits();
-
- // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
- // profitable unless we know the and'd out bits are already zero.
- if (CI->getZExtValue() > NumBits) {
- unsigned LowBits = TypeWidth - CI->getZExtValue();
- if (IC.MaskedValueIsZero(I->getOperand(0),
- APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits,
- 0, CxtI))
- return true;
- }
-
- return false;
- }
- case Instruction::LShr: {
- // We can often fold the shift into shifts-by-a-constant.
- CI = dyn_cast<ConstantInt>(I->getOperand(1));
- if (!CI) return false;
-
- // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
- if (!isLeftShift) return true;
-
- // We can always turn lshr(c)+shl(c) -> and(c2).
- if (CI->getValue() == NumBits) return true;
-
- unsigned TypeWidth = I->getType()->getScalarSizeInBits();
-
- // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
- // profitable unless we know the and'd out bits are already zero.
- if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) {
- unsigned LowBits = CI->getZExtValue() - NumBits;
- if (IC.MaskedValueIsZero(I->getOperand(0),
- APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits,
- 0, CxtI))
- return true;
- }
-
- return false;
- }
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
- return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift,
- IC, SI) &&
- CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC, SI);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ return CanEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ CanEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
}
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -176,8 +176,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!CanEvaluateShifted(IncValue, NumBits, isLeftShift,
- IC, PN))
+ if (!CanEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
return false;
return true;
}
@@ -257,6 +256,8 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
BO->setHasNoSignedWrap(false);
return BO;
}
+ // FIXME: This is almost identical to the SHL case. Refactor both cases into
+ // a helper function.
case Instruction::LShr: {
BinaryOperator *BO = cast<BinaryOperator>(I);
unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
@@ -340,7 +341,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
- return ReplaceInstUsesWith(
+ return replaceInstUsesWith(
I, GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this, DL));
}
@@ -356,7 +357,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
if (BO->getOpcode() == Instruction::Mul && isLeftShift)
if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
return BinaryOperator::CreateMul(BO->getOperand(0),
- ConstantExpr::getShl(BOOp, Op1));
+ ConstantExpr::getShl(BOOp, Op1));
// Try to fold constant and into select arguments.
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
@@ -573,7 +574,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// saturates.
if (AmtSum >= TypeBits) {
if (I.getOpcode() != Instruction::AShr)
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ return replaceInstUsesWith(I, Constant::getNullValue(I.getType()));
AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr.
}
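
The saturation above is easiest to see on concrete 32-bit values: once the combined shift amount reaches the type width, a logical shift chain is simply zero, while an arithmetic shift chain can only keep smearing the sign bit, which is what shifting by width-1 does. A small check (each individual C++ shift stays under 32 to avoid undefined behaviour; arithmetic right shift of negative values is assumed, which mainstream compilers provide and C++20 guarantees):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t U = 0xDEADBEEF;
      int32_t S = -12345;

      // lshr by 20 twice: total 40 >= 32, so the result is simply zero.
      assert(((U >> 20) >> 20) == 0u);

      // ashr by 20 twice: saturates to ashr by 31, i.e. all sign bits.
      assert(((S >> 20) >> 20) == (S >> 31));
      assert((S >> 31) == -1); // negative input -> all ones
    }
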
@@ -694,12 +695,12 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V =
SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(),
I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Instruction *V = commonShiftTransforms(I))
return V;
@@ -710,11 +711,11 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
MaskedValueIsZero(I.getOperand(0),
- APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt),
- 0, &I)) {
- I.setHasNoUnsignedWrap();
- return &I;
- }
+ APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt), 0,
+ &I)) {
+ I.setHasNoUnsignedWrap();
+ return &I;
+ }
// If the shifted out value is all signbits, this is a NSW shift.
if (!I.hasNoSignedWrap() &&
@@ -736,11 +737,11 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
@@ -780,11 +781,11 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
@@ -813,8 +814,8 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt),
- 0, &I)){
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
+ 0, &I)) {
I.setIsExact();
return &I;
}
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 743d51483ea1..f3268d2c3471 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -22,10 +22,9 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
-/// ShrinkDemandedConstant - Check to see if the specified operand of the
-/// specified instruction is a constant integer. If so, check to see if there
-/// are any bits set in the constant that are not demanded. If so, shrink the
-/// constant and return true.
+/// Check to see if the specified operand of the specified instruction is a
+/// constant integer. If so, check to see if there are any bits set in the
+/// constant that are not demanded. If so, shrink the constant and return true.
static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
APInt Demanded) {
assert(I && "No instruction?");
@@ -49,9 +48,8 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
-/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
-/// SimplifyDemandedBits knows about. See if the instruction has any
-/// properties that allow us to simplify its operands.
+/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
+/// the instruction has any properties that allow us to simplify its operands.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
@@ -61,14 +59,14 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
0, &Inst);
if (!V) return false;
if (V == &Inst) return true;
- ReplaceInstUsesWith(Inst, V);
+ replaceInstUsesWith(Inst, V);
return true;
}
-/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
-/// specified instruction operand if possible, updating it in place. It returns
-/// true if it made any change and false otherwise.
-bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
+/// This form of SimplifyDemandedBits simplifies the specified instruction
+/// operand if possible, updating it in place. It returns true if it made any
+/// change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
APInt &KnownZero, APInt &KnownOne,
unsigned Depth) {
auto *UserI = dyn_cast<Instruction>(U.getUser());
@@ -80,21 +78,22 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
}
-/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
-/// value based on the demanded bits. When this function is called, it is known
-/// that only the bits set in DemandedMask of the result of V are ever used
-/// downstream. Consequently, depending on the mask and V, it may be possible
-/// to replace V with a constant or one of its operands. In such cases, this
-/// function does the replacement and returns true. In all other cases, it
-/// returns false after analyzing the expression and setting KnownOne and known
-/// to be one in the expression. KnownZero contains all the bits that are known
-/// to be zero in the expression. These are provided to potentially allow the
-/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
-/// the expression. KnownOne and KnownZero always follow the invariant that
-/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
-/// the bits in KnownOne and KnownZero may only be accurate for those bits set
-/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
-/// and KnownOne must all be the same.
+/// This function attempts to replace V with a simpler value based on the
+/// demanded bits. When this function is called, it is known that only the bits
+/// set in DemandedMask of the result of V are ever used downstream.
+/// Consequently, depending on the mask and V, it may be possible to replace V
+/// with a constant or one of its operands. In such cases, this function does
+/// the replacement and returns true. In all other cases, it returns false after
+/// analyzing the expression and setting KnownOne to all the bits that are
+/// known to be one in the expression. KnownZero contains all the bits that are
+/// known to be zero in the expression. These are provided to potentially allow
+/// the caller (which might recursively be SimplifyDemandedBits itself) to
+/// simplify the expression.
+/// KnownOne and KnownZero always follow the invariant that:
+/// KnownOne & KnownZero == 0.
+/// That is, a bit can't be both 1 and 0. Note that the bits in KnownOne and
+/// KnownZero may only be accurate for those bits set in DemandedMask. Note also
+/// that the bitwidth of V, DemandedMask, KnownZero and KnownOne must all be the
+/// same.
///
/// This returns null if it did not change anything and it permits no
/// simplification. This returns V itself if it did some simplification of V's
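
A toy model makes the KnownOne/KnownZero bookkeeping described in the comment above concrete: the two masks stay disjoint, and they combine across an 'and' by taking the union of known-zero bits and the intersection of known-one bits. This is a sketch only, not the InstCombine API (the struct and helper names are invented for illustration):

    #include <cassert>
    #include <cstdint>

    struct KnownBits16 {
      uint16_t Zero = 0; // bits known to be 0
      uint16_t One = 0;  // bits known to be 1
      bool valid() const { return (Zero & One) == 0; }
    };

    // For (a & b): a bit is known 0 if it is known 0 in either input, and
    // known 1 only if it is known 1 in both.
    KnownBits16 knownAnd(KnownBits16 A, KnownBits16 B) {
      return {static_cast<uint16_t>(A.Zero | B.Zero),
              static_cast<uint16_t>(A.One & B.One)};
    }

    int main() {
      KnownBits16 A{0x00FF, 0xFF00}; // high byte known one, low byte known zero
      KnownBits16 B{0x0F0F, 0x1010};
      KnownBits16 R = knownAnd(A, B);
      assert(A.valid() && B.valid() && R.valid()); // the invariant holds
      assert(R.One == 0x1000);                     // known 1 in both inputs
      assert((R.Zero & 0x00FF) == 0x00FF);         // low byte still known zero
    }
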
@@ -768,6 +767,34 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// TODO: Could compute known zero/one bits based on the input.
break;
}
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // MOVMSK copies the vector elements' sign bits to the low bits
+ // and zeros the high bits.
+ unsigned ArgWidth;
+ if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+ ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+ } else {
+ auto Arg = II->getArgOperand(0);
+ auto ArgType = cast<VectorType>(Arg->getType());
+ ArgWidth = ArgType->getNumElements();
+ }
+
+    // If we don't need any of the low bits, return zero; we know that
+    // DemandedMask is non-zero already.
+ APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+ if (DemandedElts == 0)
+ return ConstantInt::getNullValue(VTy);
+
+ // We know that the upper bits are set to zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - ArgWidth);
+ return nullptr;
+ }
case Intrinsic::x86_sse42_crc32_64_64:
KnownZero = APInt::getHighBitsSet(64, 32);
return nullptr;
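
The MOVMSK handling added above only needs the semantics stated in its comment: each vector element contributes its sign bit to one low bit of the result, and everything above ArgWidth is zero, so high demanded bits can be discarded. A scalar C++ model of the 4-lane ps variant (the helper name is invented; this is not the intrinsic itself):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    uint32_t movmskPs(const float (&v)[4]) {
      uint32_t mask = 0;
      for (int i = 0; i < 4; ++i) {
        uint32_t bits;
        std::memcpy(&bits, &v[i], sizeof(bits)); // raw IEEE-754 bits
        mask |= (bits >> 31) << i;               // copy the sign bit to bit i
      }
      return mask; // only bits 0..3 can be set; bits 4..31 are known zero
    }

    int main() {
      float v[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
      assert(movmskPs(v) == 0x5u);        // lanes 0 and 2 have the sign bit set
      assert((movmskPs(v) & ~0xFu) == 0); // the high bits are always zero
    }
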
@@ -802,7 +829,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
/// not successful.
Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
- Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
+ Instruction *Shl,
+ const APInt &DemandedMask,
+ APInt &KnownZero,
+ APInt &KnownOne) {
const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
@@ -865,10 +895,10 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
return nullptr;
}
-/// SimplifyDemandedVectorElts - The specified value produces a vector with
-/// any number of elements. DemandedElts contains the set of elements that are
-/// actually used by the caller. This method analyzes which elements of the
-/// operand are undef and returns that information in UndefElts.
+/// The specified value produces a vector with any number of elements.
+/// DemandedElts contains the set of elements that are actually used by the
+/// caller. This method analyzes which elements of the operand are undef and
+/// returns that information in UndefElts.
///
/// If the information about demanded elements can be used to simplify the
/// operation, the operation is simplified, then the resultant value is
@@ -876,7 +906,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts,
unsigned Depth) {
- unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
+ unsigned VWidth = V->getType()->getVectorNumElements();
APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
@@ -1179,16 +1209,42 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
switch (II->getIntrinsicID()) {
default: break;
- // Binary vector operations that work column-wise. A dest element is a
- // function of the corresponding input elements from the two inputs.
+ // Unary scalar-as-vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ case Intrinsic::x86_sse_sqrt_ss:
+ case Intrinsic::x86_sse2_sqrt_sd:
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+    // If the lowest element of a scalar op isn't used then use Arg0.
+ return II->getArgOperand(0);
+    // TODO: If only the low element is used, lower SQRT to FSQRT (with
+    // rounding/exception checks).
+ break;
+
+ // Binary scalar-as-vector operations that work column-wise. A dest element
+ // is a function of the corresponding input elements from the two inputs.
+ case Intrinsic::x86_sse_add_ss:
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_div_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_add_sd:
case Intrinsic::x86_sse2_sub_sd:
case Intrinsic::x86_sse2_mul_sd:
+ case Intrinsic::x86_sse2_div_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1201,11 +1257,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
if (DemandedElts == 1) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::x86_sse_add_ss:
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_div_ss:
+ case Intrinsic::x86_sse2_add_sd:
case Intrinsic::x86_sse2_sub_sd:
case Intrinsic::x86_sse2_mul_sd:
- // TODO: Lower MIN/MAX/ABS/etc
+ case Intrinsic::x86_sse2_div_sd:
+ // TODO: Lower MIN/MAX/etc.
Value *LHS = II->getArgOperand(0);
Value *RHS = II->getArgOperand(1);
// Extract the element as scalars.
@@ -1216,6 +1276,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_sse_add_ss:
+ case Intrinsic::x86_sse2_add_sd:
+ TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS,
+ II->getName()), *II);
+ break;
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse2_sub_sd:
TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
@@ -1226,6 +1291,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS,
II->getName()), *II);
break;
+ case Intrinsic::x86_sse_div_ss:
+ case Intrinsic::x86_sse2_div_sd:
+ TmpV = InsertNewInstWith(BinaryOperator::CreateFDiv(LHS, RHS,
+ II->getName()), *II);
+ break;
}
Instruction *New =
@@ -1238,6 +1308,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
}
+  // If the lowest element of a scalar op isn't used then use Arg0.
+ if (DemandedElts.getLoBits(1) != 1)
+ return II->getArgOperand(0);
+
// Output elements are undefined if both are undefined. Consider things
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
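
The *_ss/*_sd folds added in this file rest on one property of those intrinsics: only lane 0 is computed and the remaining lanes are passed through from the first operand. That is why a call whose low element is unused can be replaced by Arg0, and a call whose only demanded element is lane 0 can be lowered to a scalar FAdd/FSub/FMul/FDiv. A toy C++ model of the add_ss case (names invented for illustration):

    #include <array>
    #include <cassert>

    using Vec4 = std::array<float, 4>;

    Vec4 addSs(const Vec4 &a, const Vec4 &b) {
      Vec4 r = a;         // lanes 1..3 come straight from the first operand
      r[0] = a[0] + b[0]; // only lane 0 is actually computed
      return r;
    }

    int main() {
      Vec4 a{1.0f, 10.0f, 20.0f, 30.0f};
      Vec4 b{2.0f, 99.0f, 99.0f, 99.0f};
      Vec4 r = addSs(a, b);
      assert(r[0] == 3.0f);                                 // the scalar work
      assert(r[1] == a[1] && r[2] == a[2] && r[3] == a[3]); // pass-through lanes
    }
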
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index bc4c0ebae790..a76138756148 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -62,21 +62,31 @@ static bool cheapToScalarize(Value *V, bool isConstant) {
return false;
}
-// If we have a PHI node with a vector type that has only 2 uses: feed
+// If we have a PHI node with a vector type that is only used to feed
// itself and be an operand of extractelement at a constant location,
// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
- // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
- if (!PN->hasNUses(2))
- return nullptr;
+ SmallVector<Instruction *, 2> Extracts;
+ // The users we want the PHI to have are:
+ // 1) The EI ExtractElement (we already know this)
+ // 2) Possibly more ExtractElements with the same index.
+ // 3) Another operand, which will feed back into the PHI.
+ Instruction *PHIUser = nullptr;
+ for (auto U : PN->users()) {
+ if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
+ if (EI.getIndexOperand() == EU->getIndexOperand())
+ Extracts.push_back(EU);
+ else
+ return nullptr;
+ } else if (!PHIUser) {
+ PHIUser = cast<Instruction>(U);
+ } else {
+ return nullptr;
+ }
+ }
- // If so, it's known at this point that one operand is PHI and the other is
- // an extractelement node. Find the PHI user that is not the extractelement
- // node.
- auto iu = PN->user_begin();
- Instruction *PHIUser = dyn_cast<Instruction>(*iu);
- if (PHIUser == cast<Instruction>(&EI))
- PHIUser = cast<Instruction>(*(++iu));
+ if (!PHIUser)
+ return nullptr;
// Verify that this PHI user has one use, which is the PHI itself,
// and that it is a binary operation which is cheap to scalarize.
@@ -106,7 +116,8 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
B0->getOperand(opId)->getName() + ".Elt"),
*B0);
Value *newPHIUser = InsertNewInstWith(
- BinaryOperator::Create(B0->getOpcode(), scalarPHI, Op), *B0);
+ BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
+ scalarPHI, Op, B0), *B0);
scalarPHI->addIncoming(newPHIUser, inBB);
} else {
// Scalarize PHI input:
@@ -125,19 +136,23 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
scalarPHI->addIncoming(newEI, inBB);
}
}
- return ReplaceInstUsesWith(EI, scalarPHI);
+
+ for (auto E : Extracts)
+ replaceInstUsesWith(*E, scalarPHI);
+
+ return &EI;
}
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
if (Value *V = SimplifyExtractElementInst(
EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(EI, V);
+ return replaceInstUsesWith(EI, V);
// If vector val is constant with all elements the same, replace EI with
// that element. We handle a known element # below.
if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
if (cheapToScalarize(C, false))
- return ReplaceInstUsesWith(EI, C->getAggregateElement(0U));
+ return replaceInstUsesWith(EI, C->getAggregateElement(0U));
// If extracting a specified index from the vector, see if we can recursively
// find a previously computed scalar that was inserted into the vector.
@@ -193,12 +208,13 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Value *newEI1 =
Builder->CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
EI.getName()+".rhs");
- return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1);
+ return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(),
+ newEI0, newEI1, BO);
}
} else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
// Extracting the inserted element?
if (IE->getOperand(2) == EI.getOperand(1))
- return ReplaceInstUsesWith(EI, IE->getOperand(1));
+ return replaceInstUsesWith(EI, IE->getOperand(1));
// If the inserted and extracted elements are constants, they must not
// be the same value, extract from the pre-inserted value instead.
if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) {
@@ -216,7 +232,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
SVI->getOperand(0)->getType()->getVectorNumElements();
if (SrcIdx < 0)
- return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
if (SrcIdx < (int)LHSWidth)
Src = SVI->getOperand(0);
else {
@@ -417,7 +433,7 @@ static void replaceExtractElements(InsertElementInst *InsElt,
continue;
auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
NewExt->insertAfter(WideVec);
- IC.ReplaceInstUsesWith(*OldExt, NewExt);
+ IC.replaceInstUsesWith(*OldExt, NewExt);
}
}
@@ -546,7 +562,7 @@ Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
}
if (IsRedundant)
- return ReplaceInstUsesWith(I, I.getOperand(0));
+ return replaceInstUsesWith(I, I.getOperand(0));
return nullptr;
}
@@ -557,7 +573,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
// Inserting an undef or into an undefined place, remove this.
if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
- ReplaceInstUsesWith(IE, VecOp);
+ replaceInstUsesWith(IE, VecOp);
// If the inserted element was extracted from some other vector, and if the
// indexes are constant, try to turn this into a shufflevector operation.
@@ -571,15 +587,15 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
- return ReplaceInstUsesWith(IE, VecOp);
+ return replaceInstUsesWith(IE, VecOp);
if (InsertedIdx >= NumInsertVectorElts) // Out of range insert.
- return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+ return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
// If we are extracting a value from a vector, then inserting it right
// back into the same place, just use the input vector.
if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
- return ReplaceInstUsesWith(IE, VecOp);
+ return replaceInstUsesWith(IE, VecOp);
// If this insertelement isn't used by some other insertelement, turn it
// (and any insertelements it points to), into one big shuffle.
@@ -605,7 +621,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
if (V != &IE)
- return ReplaceInstUsesWith(IE, V);
+ return replaceInstUsesWith(IE, V);
return &IE;
}
@@ -910,7 +926,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// Undefined shuffle mask -> undefined value.
if (isa<UndefValue>(SVI.getOperand(2)))
- return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+ return replaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
@@ -918,7 +934,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
if (V != &SVI)
- return ReplaceInstUsesWith(SVI, V);
+ return replaceInstUsesWith(SVI, V);
LHS = SVI.getOperand(0);
RHS = SVI.getOperand(1);
MadeChange = true;
@@ -933,7 +949,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// shuffle(undef,undef,mask) -> undef.
Value *Result = (VWidth == LHSWidth)
? LHS : UndefValue::get(SVI.getType());
- return ReplaceInstUsesWith(SVI, Result);
+ return replaceInstUsesWith(SVI, Result);
}
// Remap any references to RHS to use LHS.
@@ -967,13 +983,13 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
recognizeIdentityMask(Mask, isLHSID, isRHSID);
// Eliminate identity shuffles.
- if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
- if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+ if (isLHSID) return replaceInstUsesWith(SVI, LHS);
+ if (isRHSID) return replaceInstUsesWith(SVI, RHS);
}
if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) {
Value *V = EvaluateInDifferentElementOrder(LHS, Mask);
- return ReplaceInstUsesWith(SVI, V);
+ return replaceInstUsesWith(SVI, V);
}
// SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
@@ -1060,7 +1076,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
// The shufflevector isn't being replaced: the bitcast that used it
// is. InstCombine will visit the newly-created instructions.
- ReplaceInstUsesWith(*BC, Ext);
+ replaceInstUsesWith(*BC, Ext);
MadeChange = true;
}
}
@@ -1251,8 +1267,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// corresponding argument.
bool isLHSID, isRHSID;
recognizeIdentityMask(newMask, isLHSID, isRHSID);
- if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS);
- if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS);
+ if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS);
+ if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS);
return MadeChange ? &SVI : nullptr;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 903a0b5f5400..51c3262b5d14 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -39,7 +39,9 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
@@ -76,6 +78,10 @@ STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc , "Number of reassociations");
+static cl::opt<bool>
+EnableExpensiveCombines("expensive-combines",
+ cl::desc("Enable expensive instruction combines"));
+
Value *InstCombiner::EmitGEPOffset(User *GEP) {
return llvm::EmitGEPOffset(Builder, DL, GEP);
}
@@ -120,33 +126,23 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
// all other opcodes, the function conservatively returns false.
static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- if (!OBO || !OBO->hasNoSignedWrap()) {
+ if (!OBO || !OBO->hasNoSignedWrap())
return false;
- }
// We reason about Add and Sub Only.
Instruction::BinaryOps Opcode = I.getOpcode();
- if (Opcode != Instruction::Add &&
- Opcode != Instruction::Sub) {
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return false;
- }
-
- ConstantInt *CB = dyn_cast<ConstantInt>(B);
- ConstantInt *CC = dyn_cast<ConstantInt>(C);
- if (!CB || !CC) {
+ const APInt *BVal, *CVal;
+ if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
return false;
- }
- const APInt &BVal = CB->getValue();
- const APInt &CVal = CC->getValue();
bool Overflow = false;
-
- if (Opcode == Instruction::Add) {
- BVal.sadd_ov(CVal, Overflow);
- } else {
- BVal.ssub_ov(CVal, Overflow);
- }
+ if (Opcode == Instruction::Add)
+ BVal->sadd_ov(*CVal, Overflow);
+ else
+ BVal->ssub_ov(*CVal, Overflow);
return !Overflow;
}
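
The rewritten MaintainNoSignedWrap keeps the nsw flag only when B op C provably does not overflow in the signed domain, which is what APInt::sadd_ov/ssub_ov report. A stand-alone analogue using the __builtin_*_overflow intrinsics available in GCC and Clang (sketch only; the helper name is invented):

    #include <cassert>
    #include <cstdint>

    bool addKeepsNoSignedWrap(int32_t B, int32_t C) {
      int32_t Unused;
      // __builtin_add_overflow returns true on overflow, so invert it:
      // no signed overflow means the nsw flag can be preserved.
      return !__builtin_add_overflow(B, C, &Unused);
    }

    int main() {
      assert(addKeepsNoSignedWrap(100, 200));      // fits easily, keep nsw
      assert(!addKeepsNoSignedWrap(INT32_MAX, 1)); // would wrap, drop nsw
    }
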
@@ -166,6 +162,49 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
I.setFastMathFlags(FMF);
}
+/// Combine constant operands of associative operations either before or after a
+/// cast to eliminate one of the associative operations:
+/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
+/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
+static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) {
+ auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
+ if (!Cast || !Cast->hasOneUse())
+ return false;
+
+ // TODO: Enhance logic for other casts and remove this check.
+ auto CastOpcode = Cast->getOpcode();
+ if (CastOpcode != Instruction::ZExt)
+ return false;
+
+ // TODO: Enhance logic for other BinOps and remove this check.
+ auto AssocOpcode = BinOp1->getOpcode();
+ if (AssocOpcode != Instruction::Xor && AssocOpcode != Instruction::And &&
+ AssocOpcode != Instruction::Or)
+ return false;
+
+ auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
+ if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
+ return false;
+
+ Constant *C1, *C2;
+ if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
+ !match(BinOp2->getOperand(1), m_Constant(C2)))
+ return false;
+
+ // TODO: This assumes a zext cast.
+  // E.g., if it was a trunc, we'd cast C1 to the source type because casting C2
+ // to the destination type might lose bits.
+
+ // Fold the constants together in the destination type:
+ // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
+ Type *DestTy = C1->getType();
+ Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
+ Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
+ Cast->setOperand(0, BinOp2->getOperand(0));
+ BinOp1->setOperand(1, FoldedC);
+ return true;
+}
+
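
The constant folding performed by simplifyAssocCastAssoc is easy to spot-check for the zext case it currently handles, because zext distributes over and/or/xor. A concrete C++ check with arbitrary values (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t X = 0xA7;
      uint8_t C2 = 0x5C;
      uint32_t C1 = 0x12340000;

      // Original shape: (op (zext (op X, C2)), C1)
      uint32_t Before = C1 ^ static_cast<uint32_t>(static_cast<uint8_t>(X ^ C2));
      // Folded shape: (op (zext X), FoldedC) with FoldedC = C1 op zext(C2)
      uint32_t FoldedC = C1 ^ static_cast<uint32_t>(C2);
      uint32_t After = static_cast<uint32_t>(X) ^ FoldedC;

      assert(Before == After);
    }
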
/// This performs a few simplifications for operators that are associative or
/// commutative:
///
@@ -253,6 +292,12 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
}
if (I.isAssociative() && I.isCommutative()) {
+ if (simplifyAssocCastAssoc(&I)) {
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+
// Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
if (Op0 && Op0->getOpcode() == Opcode) {
Value *A = Op0->getOperand(0);
@@ -919,10 +964,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
Instruction *User = cast<Instruction>(*UI++);
if (User == &I) continue;
- ReplaceInstUsesWith(*User, NewPN);
- EraseInstFromFunction(*User);
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
}
- return ReplaceInstUsesWith(I, NewPN);
+ return replaceInstUsesWith(I, NewPN);
}
/// Given a pointer type and a constant offset, determine whether or not there
@@ -1334,8 +1379,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
- if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AC))
- return ReplaceInstUsesWith(GEP, V);
+ if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops, DL, TLI, DT, AC))
+ return replaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -1349,19 +1394,18 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
++I, ++GTI) {
// Skip indices into struct types.
- SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
- if (!SeqTy)
+ if (isa<StructType>(*GTI))
continue;
// Index type should have the same width as IntPtr
Type *IndexTy = (*I)->getType();
Type *NewIndexType = IndexTy->isVectorTy() ?
VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy;
-
+
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
- if (SeqTy->getElementType()->isSized() &&
- DL.getTypeAllocSize(SeqTy->getElementType()) == 0)
+ Type *EltTy = GTI.getIndexedType();
+ if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0)
if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) {
*I = Constant::getNullValue(NewIndexType);
MadeChange = true;
@@ -1393,7 +1437,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (Op1 == &GEP)
return nullptr;
- signed DI = -1;
+ int DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
@@ -1405,7 +1449,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return nullptr;
// Keep track of the type as we walk the GEP.
- Type *CurTy = Op1->getOperand(0)->getType()->getScalarType();
+ Type *CurTy = nullptr;
for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
@@ -1436,7 +1480,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Sink down a layer of the type for the next iteration.
if (J > 0) {
- if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
+ if (J == 1) {
+ CurTy = Op1->getSourceElementType();
+ } else if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
} else {
CurTy = nullptr;
@@ -1565,8 +1611,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
DL.getPointerSizeInBits(AS)) {
- Type *PtrTy = GEP.getPointerOperandType();
- Type *Ty = PtrTy->getPointerElementType();
+ Type *Ty = GEP.getSourceElementType();
uint64_t TyAllocSize = DL.getTypeAllocSize(Ty);
bool Matched = false;
@@ -1629,9 +1674,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
//
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
- PointerType *CPTy = cast<PointerType>(PtrOp->getType());
if (ArrayType *CATy =
- dyn_cast<ArrayType>(CPTy->getElementType())) {
+ dyn_cast<ArrayType>(GEP.getSourceElementType())) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
// -> GEP i8* X, ...
@@ -1688,7 +1732,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
Type *SrcElTy = StrippedPtrTy->getElementType();
- Type *ResElTy = PtrOp->getType()->getPointerElementType();
+ Type *ResElTy = GEP.getSourceElementType();
if (SrcElTy->isArrayTy() &&
DL.getTypeAllocSize(SrcElTy->getArrayElementType()) ==
DL.getTypeAllocSize(ResElTy)) {
@@ -1822,7 +1866,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (I != BCI) {
I->takeName(BCI);
BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
- ReplaceInstUsesWith(*BCI, I);
+ replaceInstUsesWith(*BCI, I);
}
return &GEP;
}
@@ -1844,7 +1888,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
: Builder->CreateGEP(nullptr, Operand, NewIndices);
if (NGEP->getType() == GEP.getType())
- return ReplaceInstUsesWith(GEP, NGEP);
+ return replaceInstUsesWith(GEP, NGEP);
NGEP->takeName(&GEP);
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
@@ -1857,6 +1901,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return nullptr;
}
+static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
+ Instruction *AI) {
+ if (isa<ConstantPointerNull>(V))
+ return true;
+ if (auto *LI = dyn_cast<LoadInst>(V))
+ return isa<GlobalVariable>(LI->getPointerOperand());
+ // Two distinct allocations will never be equal.
+  // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking
+  // through bitcasts of V can cause the result below to be true even when AI
+  // and V (e.g. a bitcast chain i8* -> i32* -> i8* of AI) are the same
+  // allocation.
+ return isAllocLikeFn(V, TLI) && V != AI;
+}
+
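
isNeverEqualToUnescapedAlloc encodes the same intuition one has at the source level: the address of a fresh allocation that has not escaped cannot compare equal to null, to a pointer loaded from a global, or to a different allocation. A hedged C++ illustration (the real check of course reasons about IR values, not C++ objects):

    #include <cassert>

    int *G = nullptr; // stands in for a pointer read from a global variable

    int main() {
      int *A = new int(1); // the allocation being examined
      int *B = new int(2); // a distinct allocation

      assert(A != nullptr); // eq/ne against null always folds
      assert(A != B);       // two distinct allocations are never equal
      assert(A != G);       // A never escaped, so a loaded pointer can't be it

      delete A;
      delete B;
    }
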
static bool
isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
const TargetLibraryInfo *TLI) {
@@ -1881,7 +1939,12 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
case Instruction::ICmp: {
ICmpInst *ICI = cast<ICmpInst>(I);
// We can fold eq/ne comparisons with null to false/true, respectively.
- if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1)))
+ // We also fold comparisons in some conditions provided the alloc has
+ // not escaped (see isNeverEqualToUnescapedAlloc).
+ if (!ICI->isEquality())
+ return false;
+ unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
+ if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
return false;
Users.emplace_back(I);
continue;
@@ -1941,23 +2004,40 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
SmallVector<WeakVH, 64> Users;
if (isAllocSiteRemovable(&MI, Users, TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- Instruction *I = cast_or_null<Instruction>(&*Users[i]);
- if (!I) continue;
+      // Lower all @llvm.objectsize calls first because they may use a
+      // bitcast/GEP of the alloca we are removing.
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ uint64_t Size;
+ if (!getObjectSize(II->getArgOperand(0), Size, DL, TLI)) {
+ ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
+ Size = CI->isZero() ? -1ULL : 0;
+ }
+ replaceInstUsesWith(*I, ConstantInt::get(I->getType(), Size));
+ eraseInstFromFunction(*I);
+ Users[i] = nullptr; // Skip examining in the next loop.
+ }
+ }
+ }
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
- ReplaceInstUsesWith(*C,
+ replaceInstUsesWith(*C,
ConstantInt::get(Type::getInt1Ty(C->getContext()),
C->isFalseWhenEqual()));
} else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
- ReplaceInstUsesWith(*I, UndefValue::get(I->getType()));
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == Intrinsic::objectsize) {
- ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
- uint64_t DontKnow = CI->isZero() ? -1ULL : 0;
- ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow));
- }
+ replaceInstUsesWith(*I, UndefValue::get(I->getType()));
}
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
}
if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
@@ -1967,7 +2047,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
None, "", II->getParent());
}
- return EraseInstFromFunction(MI);
+ return eraseInstFromFunction(MI);
}
return nullptr;
}
@@ -2038,13 +2118,13 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
// Insert a new store to null because we cannot modify the CFG here.
Builder->CreateStore(ConstantInt::getTrue(FI.getContext()),
UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
- return EraseInstFromFunction(FI);
+ return eraseInstFromFunction(FI);
}
// If we have 'free null' delete the instruction. This can happen in stl code
// when lots of inlining happens.
if (isa<ConstantPointerNull>(Op))
- return EraseInstFromFunction(FI);
+ return eraseInstFromFunction(FI);
// If we optimize for code size, try to move the call to free before the null
// test so that simplify cfg can remove the empty block and dead code
@@ -2145,6 +2225,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned LeadingKnownOnes = KnownOne.countLeadingOnes();
// Compute the number of leading bits we can ignore.
+ // TODO: A better way to determine this would use ComputeNumSignBits().
for (auto &C : SI.cases()) {
LeadingKnownZeros = std::min(
LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
@@ -2154,17 +2235,15 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes);
- // Truncate the condition operand if the new type is equal to or larger than
- // the largest legal integer type. We need to be conservative here since
- // x86 generates redundant zero-extension instructions if the operand is
- // truncated to i8 or i16.
+ // Shrink the condition operand if the new type is smaller than the old type.
+ // This may produce a non-standard type for the switch, but that's ok because
+ // the backend should extend back to a legal type for the target.
bool TruncCond = false;
- if (NewWidth > 0 && BitWidth > NewWidth &&
- NewWidth >= DL.getLargestLegalIntTypeSize()) {
+ if (NewWidth > 0 && NewWidth < BitWidth) {
TruncCond = true;
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Builder->SetInsertPoint(&SI);
- Value *NewCond = Builder->CreateTrunc(SI.getCondition(), Ty, "trunc");
+ Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc");
SI.setCondition(NewCond);
for (auto &C : SI.cases())
@@ -2172,28 +2251,27 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
SI.getContext(), C.getCaseValue()->getValue().trunc(NewWidth)));
}
- if (Instruction *I = dyn_cast<Instruction>(Cond)) {
- if (I->getOpcode() == Instruction::Add)
- if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
- // change 'switch (X+4) case 1:' into 'switch (X) case -3'
- // Skip the first item since that's the default case.
- for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
- i != e; ++i) {
- ConstantInt* CaseVal = i.getCaseValue();
- Constant *LHS = CaseVal;
- if (TruncCond)
- LHS = LeadingKnownZeros
- ? ConstantExpr::getZExt(CaseVal, Cond->getType())
- : ConstantExpr::getSExt(CaseVal, Cond->getType());
- Constant* NewCaseVal = ConstantExpr::getSub(LHS, AddRHS);
- assert(isa<ConstantInt>(NewCaseVal) &&
- "Result of expression should be constant");
- i.setValue(cast<ConstantInt>(NewCaseVal));
- }
- SI.setCondition(I->getOperand(0));
- Worklist.Add(I);
- return &SI;
+ ConstantInt *AddRHS = nullptr;
+ if (match(Cond, m_Add(m_Value(), m_ConstantInt(AddRHS)))) {
+ Instruction *I = cast<Instruction>(Cond);
+ // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
+ for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e;
+ ++i) {
+ ConstantInt *CaseVal = i.getCaseValue();
+ Constant *LHS = CaseVal;
+ if (TruncCond) {
+ LHS = LeadingKnownZeros
+ ? ConstantExpr::getZExt(CaseVal, Cond->getType())
+ : ConstantExpr::getSExt(CaseVal, Cond->getType());
}
+ Constant *NewCaseVal = ConstantExpr::getSub(LHS, AddRHS);
+ assert(isa<ConstantInt>(NewCaseVal) &&
+ "Result of expression should be constant");
+ i.setValue(cast<ConstantInt>(NewCaseVal));
+ }
+ SI.setCondition(I->getOperand(0));
+ Worklist.Add(I);
+ return &SI;
}
return TruncCond ? &SI : nullptr;
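
The rewritten 'switch (X+4)' fold relies on a simple equivalence: X + K == CaseVal exactly when X == CaseVal - K, so the add can be removed from the condition by rebasing every case value. A quick C++ check of that equivalence (function names invented for illustration):

    #include <cassert>

    int classifyOld(int x) {
      switch (x + 4) {  // 'switch (X+4) case 1:'
      case 1: return 100;
      case 7: return 200;
      default: return 0;
      }
    }

    int classifyNew(int x) {
      switch (x) {      // '... into switch (X) case -3' (and case 3)
      case -3: return 100;
      case 3: return 200;
      default: return 0;
      }
    }

    int main() {
      for (int x = -10; x <= 10; ++x)
        assert(classifyOld(x) == classifyNew(x));
    }
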
@@ -2203,11 +2281,11 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Value *Agg = EV.getAggregateOperand();
if (!EV.hasIndices())
- return ReplaceInstUsesWith(EV, Agg);
+ return replaceInstUsesWith(EV, Agg);
if (Value *V =
SimplifyExtractValueInst(Agg, EV.getIndices(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(EV, V);
+ return replaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
@@ -2233,7 +2311,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
// %C = extractvalue { i32, { i32 } } %B, 1, 0
// with "i32 42"
- return ReplaceInstUsesWith(EV, IV->getInsertedValueOperand());
+ return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
if (exti == exte) {
// The extract list is a prefix of the insert list. i.e. replace
// %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
@@ -2273,8 +2351,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
case Intrinsic::sadd_with_overflow:
if (*EV.idx_begin() == 0) { // Normal result.
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
- EraseInstFromFunction(*II);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
return BinaryOperator::CreateAdd(LHS, RHS);
}
@@ -2290,8 +2368,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
case Intrinsic::ssub_with_overflow:
if (*EV.idx_begin() == 0) { // Normal result.
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
- EraseInstFromFunction(*II);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
return BinaryOperator::CreateSub(LHS, RHS);
}
break;
@@ -2299,8 +2377,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
case Intrinsic::smul_with_overflow:
if (*EV.idx_begin() == 0) { // Normal result.
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
- EraseInstFromFunction(*II);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
return BinaryOperator::CreateMul(LHS, RHS);
}
break;
@@ -2330,8 +2408,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Value *GEP = Builder->CreateInBoundsGEP(L->getType(),
L->getPointerOperand(), Indices);
// Returning the load directly will cause the main loop to insert it in
- // the wrong spot, so use ReplaceInstUsesWith().
- return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP));
+ // the wrong spot, so use replaceInstUsesWith().
+ return replaceInstUsesWith(EV, Builder->CreateLoad(GEP));
}
// We could simplify extracts from other values. Note that nested extracts may
// already be simplified implicitly by the above: extract (extract (insert) )
@@ -2348,8 +2426,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
switch (Personality) {
case EHPersonality::GNU_C:
- // The GCC C EH personality only exists to support cleanups, so it's not
- // clear what the semantics of catch clauses are.
+ case EHPersonality::GNU_C_SjLj:
+ case EHPersonality::Rust:
+    // The GCC C EH and Rust personalities only exist to support cleanups, so
+ // it's not clear what the semantics of catch clauses are.
return false;
case EHPersonality::Unknown:
return false;
@@ -2358,6 +2438,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
// match foreign exceptions (or didn't, before gcc-4.7).
return false;
case EHPersonality::GNU_CXX:
+ case EHPersonality::GNU_CXX_SjLj:
case EHPersonality::GNU_ObjC:
case EHPersonality::MSVC_X86SEH:
case EHPersonality::MSVC_Win64SEH:
@@ -2701,12 +2782,15 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
&DestBlock->getParent()->getEntryBlock())
return false;
+ // Do not sink into catchswitch blocks.
+ if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
+ return false;
+
// Do not sink convergent call instructions.
if (auto *CI = dyn_cast<CallInst>(I)) {
if (CI->isConvergent())
return false;
}
-
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
@@ -2731,7 +2815,7 @@ bool InstCombiner::run() {
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, TLI)) {
DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
++NumDeadInst;
MadeIRChange = true;
continue;
@@ -2744,17 +2828,17 @@ bool InstCombiner::run() {
DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
// Add operands to the worklist.
- ReplaceInstUsesWith(*I, C);
+ replaceInstUsesWith(*I, C);
++NumConstProp;
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
}
}
- // In general, it is possible for computeKnownBits to determine all bits in a
- // value even when the operands are not all constants.
- if (!I->use_empty() && I->getType()->isIntegerTy()) {
+ // In general, it is possible for computeKnownBits to determine all bits in
+ // a value even when the operands are not all constants.
+ if (ExpensiveCombines && !I->use_empty() && I->getType()->isIntegerTy()) {
unsigned BitWidth = I->getType()->getScalarSizeInBits();
APInt KnownZero(BitWidth, 0);
APInt KnownOne(BitWidth, 0);
@@ -2765,9 +2849,9 @@ bool InstCombiner::run() {
" from: " << *I << '\n');
// Add operands to the worklist.
- ReplaceInstUsesWith(*I, C);
+ replaceInstUsesWith(*I, C);
++NumConstProp;
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
}
@@ -2800,6 +2884,7 @@ bool InstCombiner::run() {
if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
+ DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since sinking
// can expose opportunities for its *operands*, add them to the
@@ -2852,7 +2937,7 @@ bool InstCombiner::run() {
InstParent->getInstList().insert(InsertPos, Result);
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
} else {
#ifndef NDEBUG
DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
@@ -2862,7 +2947,7 @@ bool InstCombiner::run() {
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
if (isInstructionTriviallyDead(I, TLI)) {
- EraseInstFromFunction(*I);
+ eraseInstFromFunction(*I);
} else {
Worklist.Add(I);
Worklist.AddUsersToWorkList(*I);
@@ -3002,35 +3087,20 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
// Do a depth-first traversal of the function, populate the worklist with
// the reachable instructions. Ignore blocks that are not reachable. Keep
// track of which blocks we visit.
- SmallPtrSet<BasicBlock *, 64> Visited;
+ SmallPtrSet<BasicBlock *, 32> Visited;
MadeIRChange |=
AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI);
// Do a quick scan over the function. If we find any blocks that are
// unreachable, remove any instructions inside of them. This prevents
// the instcombine code from having to deal with some bad special cases.
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (Visited.count(&*BB))
+ for (BasicBlock &BB : F) {
+ if (Visited.count(&BB))
continue;
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
- while (EndInst != BB->begin()) {
- // Delete the next to last instruction.
- Instruction *Inst = &*--EndInst->getIterator();
- if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
- EndInst = Inst;
- continue;
- }
- if (!isa<DbgInfoIntrinsic>(Inst)) {
- ++NumDeadInst;
- MadeIRChange = true;
- }
- Inst->eraseFromParent();
- }
+ unsigned NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
+ MadeIRChange |= NumDeadInstInBB > 0;
+ NumDeadInst += NumDeadInstInBB;
}
return MadeIRChange;
@@ -3040,12 +3110,14 @@ static bool
combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
AliasAnalysis *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, DominatorTree &DT,
+ bool ExpensiveCombines = true,
LoopInfo *LI = nullptr) {
auto &DL = F.getParent()->getDataLayout();
+ ExpensiveCombines |= EnableExpensiveCombines;
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
- IRBuilder<true, TargetFolder, InstCombineIRInserter> Builder(
+ IRBuilder<TargetFolder, InstCombineIRInserter> Builder(
F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, &AC));
// Lower dbg.declare intrinsics otherwise their value may be clobbered
@@ -3059,14 +3131,11 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = false;
- if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
- Changed = true;
+ bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
- InstCombiner IC(Worklist, &Builder, F.optForMinSize(),
+ InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, &AC, &TLI, &DT, DL, LI);
- if (IC.run())
- Changed = true;
+ Changed |= IC.run();
if (!Changed)
break;
@@ -3076,45 +3145,26 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
}
PreservedAnalyses InstCombinePass::run(Function &F,
- AnalysisManager<Function> *AM) {
- auto &AC = AM->getResult<AssumptionAnalysis>(F);
- auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
+ AnalysisManager<Function> &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto *LI = AM->getCachedResult<LoopAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
// FIXME: The AliasAnalysis is not yet supported in the new pass manager
- if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, LI))
+ if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT,
+ ExpensiveCombines, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
// Mark all the analyses that instcombine updates as preserved.
- // FIXME: Need a way to preserve CFG analyses here!
+ // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
return PA;
}
-namespace {
-/// \brief The legacy pass manager's instcombine pass.
-///
-/// This is a basic whole-function wrapper around the instcombine utility. It
-/// will try to combine all instructions in the function.
-class InstructionCombiningPass : public FunctionPass {
- InstCombineWorklist Worklist;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- InstructionCombiningPass() : FunctionPass(ID) {
- initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-};
-}
-
void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
@@ -3122,11 +3172,13 @@ void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
bool InstructionCombiningPass::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
// Required analyses.
@@ -3139,7 +3191,8 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, LI);
+ return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT,
+ ExpensiveCombines, LI);
}
char InstructionCombiningPass::ID = 0;
@@ -3162,6 +3215,6 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
initializeInstructionCombiningPassPass(*unwrap(R));
}
-FunctionPass *llvm::createInstructionCombiningPass() {
- return new InstructionCombiningPass();
+FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) {
+ return new InstructionCombiningPass(ExpensiveCombines);
}
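A hedged usage sketch of the new boolean parameter to the legacy-PM factory (the include for the declaration is an assumption and may differ in your tree; everything else follows directly from the signature added above):

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h" // assumed location of createInstructionCombiningPass

    void addInstCombine(llvm::legacy::FunctionPassManager &FPM,
                        bool ExpensiveCombines) {
      // Passing false skips the combines guarded by the new flag, unless the
      // EnableExpensiveCombines cl::opt forces them back on.
      FPM.add(llvm::createInstructionCombiningPass(ExpensiveCombines));
    }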
diff --git a/lib/Transforms/InstCombine/Makefile b/lib/Transforms/InstCombine/Makefile
deleted file mode 100644
index 0c488e78b6d9..000000000000
--- a/lib/Transforms/InstCombine/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/InstCombine/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMInstCombine
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index a9df5e5898ae..43d1b377f858 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -13,14 +13,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -48,6 +45,7 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -65,17 +63,23 @@ using namespace llvm;
static const uint64_t kDefaultShadowScale = 3;
static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
-static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
+static const uint64_t kIOSShadowOffset64 = 0x120200000;
+static const uint64_t kIOSSimShadowOffset32 = 1ULL << 30;
+static const uint64_t kIOSSimShadowOffset64 = kDefaultShadowOffset64;
static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G.
static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
+static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+// TODO(wwchrome): Experimental for asan Win64, may change.
+static const uint64_t kWindowsShadowOffset64 = 0x1ULL << 45; // 32TB.
static const size_t kMinStackMallocSize = 1 << 6; // 64B
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
@@ -89,11 +93,15 @@ static const char *const kAsanReportErrorTemplate = "__asan_report_";
static const char *const kAsanRegisterGlobalsName = "__asan_register_globals";
static const char *const kAsanUnregisterGlobalsName =
"__asan_unregister_globals";
+static const char *const kAsanRegisterImageGlobalsName =
+ "__asan_register_image_globals";
+static const char *const kAsanUnregisterImageGlobalsName =
+ "__asan_unregister_image_globals";
static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
static const char *const kAsanInitName = "__asan_init";
static const char *const kAsanVersionCheckName =
- "__asan_version_mismatch_check_v6";
+ "__asan_version_mismatch_check_v8";
static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
@@ -101,13 +109,16 @@ static const int kMaxAsanStackMallocSizeClass = 10;
static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kODRGenPrefix = "__odr_asan_gen_";
static const char *const kSanCovGenPrefix = "__sancov_gen_";
static const char *const kAsanPoisonStackMemoryName =
"__asan_poison_stack_memory";
static const char *const kAsanUnpoisonStackMemoryName =
"__asan_unpoison_stack_memory";
+static const char *const kAsanGlobalsRegisteredFlagName =
+ "__asan_globals_registered";
-static const char *const kAsanOptionDetectUAR =
+static const char *const kAsanOptionDetectUseAfterReturn =
"__asan_option_detect_stack_use_after_return";
static const char *const kAsanAllocaPoison = "__asan_alloca_poison";
@@ -154,8 +165,11 @@ static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
cl::Hidden, cl::init(true));
static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
- cl::desc("Check return-after-free"),
+ cl::desc("Check stack-use-after-return"),
cl::Hidden, cl::init(true));
+static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
+ cl::desc("Check stack-use-after-scope"),
+ cl::Hidden, cl::init(false));
// This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals",
cl::desc("Handle global objects"), cl::Hidden,
@@ -192,10 +206,14 @@ static cl::opt<bool> ClSkipPromotableAllocas(
// These flags allow changing the shadow mapping.
// The shadow mapping looks like
-// Shadow = (Mem >> scale) + (1 << offset_log)
+// Shadow = (Mem >> scale) + offset
static cl::opt<int> ClMappingScale("asan-mapping-scale",
cl::desc("scale of asan shadow mapping"),
cl::Hidden, cl::init(0));
+static cl::opt<unsigned long long> ClMappingOffset(
+ "asan-mapping-offset",
+ cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"), cl::Hidden,
+ cl::init(0));
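Plugging the defaults into the corrected formula above gives the familiar Linux/x86-64 mapping; a sketch using the constants defined earlier in this file (Scale = kDefaultShadowScale = 3, Offset = kSmallX86_64ShadowOffset):

    // Shadow = (Mem >> Scale) + Offset
    uint64_t shadowFor(uint64_t Mem) {
      return (Mem >> 3) + 0x7fff8000ULL; // one shadow byte per 8-byte granule
    }
    // -asan-mapping-scale and the new -asan-mapping-offset override Scale/Offset.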
// Optimization flags. Not user visible, used mostly for testing
// and benchmarking the tool.
@@ -211,11 +229,6 @@ static cl::opt<bool> ClOptStack(
"asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
cl::Hidden, cl::init(false));
-static cl::opt<bool> ClCheckLifetime(
- "asan-check-lifetime",
- cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), cl::Hidden,
- cl::init(false));
-
static cl::opt<bool> ClDynamicAllocaStack(
"asan-stack-dynamic-alloca",
cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
@@ -226,6 +239,19 @@ static cl::opt<uint32_t> ClForceExperiment(
cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
cl::init(0));
+static cl::opt<bool>
+ ClUsePrivateAliasForGlobals("asan-use-private-alias",
+ cl::desc("Use private aliases for global"
+ " variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseMachOGlobalsSection("asan-globals-live-support",
+ cl::desc("Use linker features to support dead "
+ "code stripping of globals "
+ "(Mach-O only)"),
+ cl::Hidden, cl::init(false));
+
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
cl::init(0));
@@ -334,11 +360,13 @@ struct ShadowMapping {
static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
bool IsKasan) {
bool IsAndroid = TargetTriple.isAndroid();
- bool IsIOS = TargetTriple.isiOS();
+ bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
bool IsFreeBSD = TargetTriple.isOSFreeBSD();
bool IsLinux = TargetTriple.isOSLinux();
bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
TargetTriple.getArch() == llvm::Triple::ppc64le;
+ bool IsSystemZ = TargetTriple.getArch() == llvm::Triple::systemz;
+ bool IsX86 = TargetTriple.getArch() == llvm::Triple::x86;
bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips ||
TargetTriple.getArch() == llvm::Triple::mipsel;
@@ -359,7 +387,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset32;
else if (IsIOS)
- Mapping.Offset = kIOSShadowOffset32;
+ // If we're targeting iOS and x86, the binary is built for iOS simulator.
+ Mapping.Offset = IsX86 ? kIOSSimShadowOffset32 : kIOSShadowOffset32;
else if (IsWindows)
Mapping.Offset = kWindowsShadowOffset32;
else
@@ -367,6 +396,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
} else { // LongSize == 64
if (IsPPC64)
Mapping.Offset = kPPC64_ShadowOffset64;
+ else if (IsSystemZ)
+ Mapping.Offset = kSystemZ_ShadowOffset64;
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset64;
else if (IsLinux && IsX86_64) {
@@ -374,8 +405,13 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
Mapping.Offset = kLinuxKasan_ShadowOffset64;
else
Mapping.Offset = kSmallX86_64ShadowOffset;
+ } else if (IsWindows && IsX86_64) {
+ Mapping.Offset = kWindowsShadowOffset64;
} else if (IsMIPS64)
Mapping.Offset = kMIPS64_ShadowOffset64;
+ else if (IsIOS)
+ // If we're targeting iOS and x86, the binary is built for iOS simulator.
+ Mapping.Offset = IsX86_64 ? kIOSSimShadowOffset64 : kIOSShadowOffset64;
else if (IsAArch64)
Mapping.Offset = kAArch64_ShadowOffset64;
else
@@ -383,14 +419,20 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
}
Mapping.Scale = kDefaultShadowScale;
- if (ClMappingScale) {
+ if (ClMappingScale.getNumOccurrences() > 0) {
Mapping.Scale = ClMappingScale;
}
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ Mapping.Offset = ClMappingOffset;
+ }
+
// OR-ing the shadow offset is more efficient (at least on x86) if the offset
// is a power of two, but on ppc64 we have to use add since the shadow
- // offset is not necessary 1/8-th of the address space.
- Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64
+ // offset is not necessarily 1/8th of the address space. On SystemZ,
+ // we could OR the constant in a single instruction, but it's more
+ // efficient to load it once and use indexed addressing.
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ
&& !(Mapping.Offset & (Mapping.Offset - 1));
return Mapping;
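The OrShadowOffset choice boils down to which of these two lowerings the instrumentation may emit; a sketch (constants illustrative):

    uint64_t shadowWithAdd(uint64_t Mem, uint64_t Off) { return (Mem >> 3) + Off; }
    uint64_t shadowWithOr (uint64_t Mem, uint64_t Off) { return (Mem >> 3) | Off; }
    // The OR form is only chosen when Off is a power of two (the
    // !(Offset & (Offset - 1)) test above) and the target is not AArch64,
    // PPC64 or SystemZ, where an ADD or indexed addressing works out better.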
@@ -404,9 +446,11 @@ static size_t RedzoneSizeForScale(int MappingScale) {
/// AddressSanitizer: instrument the code in module to find memory bugs.
struct AddressSanitizer : public FunctionPass {
- explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false)
+ explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
+ bool UseAfterScope = false)
: FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover) {
+ Recover(Recover || ClRecover),
+ UseAfterScope(UseAfterScope || ClUseAfterScope) {
initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
}
const char *getPassName() const override {
@@ -417,19 +461,20 @@ struct AddressSanitizer : public FunctionPass {
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
+ uint64_t ArraySize = 1;
+ if (AI->isArrayAllocation()) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(AI->getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
Type *Ty = AI->getAllocatedType();
uint64_t SizeInBytes =
AI->getModule()->getDataLayout().getTypeAllocSize(Ty);
- return SizeInBytes;
+ return SizeInBytes * ArraySize;
}
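The effect of the new array-allocation handling, on a hypothetical example:

    // Hypothetical IR:
    //   %p = alloca i32, i64 4        ; constant array alloca
    // old result: DL.getTypeAllocSize(i32) == 4 bytes
    // new result: 4 (element size) * 4 (array size) == 16 bytes,
    // so redzones and poisoning now cover the whole allocation.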
/// Check if we want (and can) handle this alloca.
bool isInterestingAlloca(AllocaInst &AI);
- // Check if we have dynamic alloca.
- bool isDynamicAlloca(AllocaInst &AI) const {
- return AI.isArrayAllocation() || !AI.isStaticAlloca();
- }
-
/// If it is an interesting memory access, return the PointerOperand
/// and set IsWrite/Alignment. Otherwise return nullptr.
Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
@@ -483,6 +528,7 @@ struct AddressSanitizer : public FunctionPass {
int LongSize;
bool CompileKernel;
bool Recover;
+ bool UseAfterScope;
Type *IntptrTy;
ShadowMapping Mapping;
DominatorTree *DT;
@@ -519,6 +565,7 @@ class AddressSanitizerModule : public ModulePass {
bool InstrumentGlobals(IRBuilder<> &IRB, Module &M);
bool ShouldInstrumentGlobal(GlobalVariable *G);
+ bool ShouldUseMachOGlobalsSection() const;
void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
size_t MinRedzoneSizeForGlobal() const {
@@ -536,6 +583,8 @@ class AddressSanitizerModule : public ModulePass {
Function *AsanUnpoisonGlobals;
Function *AsanRegisterGlobals;
Function *AsanUnregisterGlobals;
+ Function *AsanRegisterImageGlobals;
+ Function *AsanUnregisterImageGlobals;
};
// Stack poisoning does not play well with exception handling.
@@ -680,7 +729,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
}
StackAlignment = std::max(StackAlignment, AI.getAlignment());
- if (ASan.isDynamicAlloca(AI))
+ if (!AI.isStaticAlloca())
DynamicAllocaVec.push_back(&AI);
else
AllocaVec.push_back(&AI);
@@ -692,7 +741,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
Intrinsic::ID ID = II.getIntrinsicID();
if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
- if (!ClCheckLifetime) return;
+ if (!ASan.UseAfterScope)
+ return;
if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)
return;
// Found lifetime intrinsic, add ASan instrumentation if necessary.
@@ -707,7 +757,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
return;
// Find alloca instruction that corresponds to llvm.lifetime argument.
AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
- if (!AI) return;
+ if (!AI || !ASan.isInterestingAlloca(*AI))
+ return;
bool DoPoison = (ID == Intrinsic::lifetime_end);
AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
AllocaPoisonCallVec.push_back(APC);
@@ -760,9 +811,10 @@ INITIALIZE_PASS_END(
"AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
false)
FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
- bool Recover) {
+ bool Recover,
+ bool UseAfterScope) {
assert(!CompileKernel || Recover);
- return new AddressSanitizer(CompileKernel, Recover);
+ return new AddressSanitizer(CompileKernel, Recover, UseAfterScope);
}
char AddressSanitizerModule::ID = 0;
@@ -792,7 +844,7 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
GlobalVariable *GV =
new GlobalVariable(M, StrConst->getType(), true,
GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
- if (AllowMerging) GV->setUnnamedAddr(true);
+ if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
return GV;
}
@@ -809,13 +861,23 @@ static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
auto GV = new GlobalVariable(M, LocStruct->getType(), true,
GlobalValue::PrivateLinkage, LocStruct,
kAsanGenPrefix);
- GV->setUnnamedAddr(true);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
return GV;
}
-static bool GlobalWasGeneratedByAsan(GlobalVariable *G) {
- return G->getName().find(kAsanGenPrefix) == 0 ||
- G->getName().find(kSanCovGenPrefix) == 0;
+/// \brief Check if \p G has been created by a trusted compiler pass.
+static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
+ // Do not instrument asan globals.
+ if (G->getName().startswith(kAsanGenPrefix) ||
+ G->getName().startswith(kSanCovGenPrefix) ||
+ G->getName().startswith(kODRGenPrefix))
+ return true;
+
+ // Do not instrument gcov counter arrays.
+ if (G->getName() == "__llvm_gcov_ctr")
+ return true;
+
+ return false;
}
Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
@@ -858,7 +920,7 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) {
bool IsInteresting =
(AI.getAllocatedType()->isSized() &&
// alloca() may be called with 0 size, ignore it.
- getAllocaSizeInBytes(&AI) > 0 &&
+ ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(&AI) > 0) &&
// We are only interested in allocas not promotable to registers.
// Promotable allocas are common under -O0.
(!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
@@ -907,6 +969,14 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
PtrOperand = XCHG->getPointerOperand();
}
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ if (PtrOperand) {
+ Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return nullptr;
+ }
+
// Treat memory accesses to promotable allocas as non-interesting since they
// will not cause memory violations. This greatly speeds up the instrumented
// executable at -O0.
@@ -948,9 +1018,9 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
IRBuilder<> IRB(I);
Function *F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
- for (int i = 0; i < 2; i++) {
- if (Param[i]->getType()->isPointerTy())
- Param[i] = IRB.CreatePointerCast(Param[i], IntptrTy);
+ for (Value *&i : Param) {
+ if (i->getType()->isPointerTy())
+ i = IRB.CreatePointerCast(i, IntptrTy);
}
IRB.CreateCall(F, Param);
}
@@ -1048,7 +1118,7 @@ Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
Value *ShadowValue,
uint32_t TypeSize) {
- size_t Granularity = 1 << Mapping.Scale;
+ size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
// Addr & (Granularity - 1)
Value *LastAccessedByte =
IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
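For orientation, with the default Scale of 3 the slow path reduces to roughly the following check (the comparison against the shadow byte happens just after this hunk, so treat the predicate as a paraphrase rather than a quote):

    // Granularity = 1 << 3 = 8 bytes of application memory per shadow byte.
    bool slowPathReports(uint64_t Addr, uint64_t AccessBytes, int8_t Shadow) {
      uint64_t LastAccessedByte = (Addr & 7) + AccessBytes - 1;
      // A non-zero shadow byte k means only the first k bytes of the granule
      // are addressable.
      return (int64_t)LastAccessedByte >= Shadow;
    }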
@@ -1091,7 +1161,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
- size_t Granularity = 1 << Mapping.Scale;
+ size_t Granularity = 1ULL << Mapping.Scale;
TerminatorInst *CrashTerm = nullptr;
if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
@@ -1184,13 +1254,13 @@ void AddressSanitizerModule::createInitializerPoisonCalls(
}
bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
- Type *Ty = cast<PointerType>(G->getType())->getElementType();
+ Type *Ty = G->getValueType();
DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
if (GlobalsMD.get(G).IsBlacklisted) return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
- if (GlobalWasGeneratedByAsan(G)) return false; // Our own global.
+ if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
// Touch only those globals that will not be defined in other modules.
// Don't handle ODR linkage types and COMDATs since other modules may be built
// without ASan.
@@ -1207,12 +1277,12 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
if (G->hasSection()) {
- StringRef Section(G->getSection());
+ StringRef Section = G->getSection();
// Globals from llvm.metadata aren't emitted, do not instrument them.
if (Section == "llvm.metadata") return false;
// Do not instrument globals from special LLVM sections.
- if (Section.find("__llvm") != StringRef::npos) return false;
+ if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
// Do not instrument function pointers to initialization and termination
// routines: dynamic linker will not properly handle redzones.
@@ -1271,8 +1341,29 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
return true;
}
+// On Mach-O platforms, we emit global metadata in a separate section of the
+// binary in order to allow the linker to properly dead strip. This is only
+// supported on recent versions of ld64.
+bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const {
+ if (!ClUseMachOGlobalsSection)
+ return false;
+
+ if (!TargetTriple.isOSBinFormatMachO())
+ return false;
+
+ if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
+ return true;
+ if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
+ return true;
+ if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
+ return true;
+
+ return false;
+}
+
void AddressSanitizerModule::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
+
// Declare our poisoning and unpoisoning functions.
AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
@@ -1280,6 +1371,7 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr));
AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
+
// Declare functions that register/unregister globals.
AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
@@ -1288,6 +1380,18 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
IntptrTy, IntptrTy, nullptr));
AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
+
+ // Declare the functions that find globals in a shared object and then invoke
+ // the (un)register function on them.
+ AsanRegisterImageGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanRegisterImageGlobalsName,
+ IRB.getVoidTy(), IntptrTy, nullptr));
+ AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanUnregisterImageGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnregisterImageGlobalsName,
+ IRB.getVoidTy(), IntptrTy, nullptr));
+ AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
}
// This function replaces all global variables with new variables that have
@@ -1313,10 +1417,11 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
// const char *module_name;
// size_t has_dynamic_init;
// void *source_location;
+ // size_t odr_indicator;
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy, nullptr);
SmallVector<Constant *, 16> Initializers(n);
bool HasDynamicallyInitializedGlobals = false;
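Written out as a plain struct, one entry of that metadata array now looks roughly like this (field names are illustrative; the order follows the Initializers built further down, with odr_indicator as the new trailing field):

    struct AsanGlobalMetadataSketch {
      uintptr_t beg;                // the instrumented global (or its local alias)
      uintptr_t size;               // original size
      uintptr_t size_with_redzone;  // size + right redzone
      uintptr_t name;               // const char *
      uintptr_t module_name;        // const char *
      uintptr_t has_dynamic_init;
      uintptr_t source_location;    // void *
      uintptr_t odr_indicator;      // new in this patch
    };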
@@ -1332,14 +1437,14 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
GlobalVariable *G = GlobalsToChange[i];
auto MD = GlobalsMD.get(G);
+ StringRef NameForGlobal = G->getName();
// Create string holding the global name (use global name from metadata
// if it's available, otherwise just write the name of global variable).
GlobalVariable *Name = createPrivateGlobalForString(
- M, MD.Name.empty() ? G->getName() : MD.Name,
+ M, MD.Name.empty() ? NameForGlobal : MD.Name,
/*AllowMerging*/ true);
- PointerType *PtrTy = cast<PointerType>(G->getType());
- Type *Ty = PtrTy->getElementType();
+ Type *Ty = G->getValueType();
uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
uint64_t MinRZ = MinRedzoneSizeForGlobal();
// MinRZ <= RZ <= kMaxGlobalRedzone
@@ -1384,41 +1489,125 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
SourceLoc = ConstantInt::get(IntptrTy, 0);
}
+ Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
+ GlobalValue *InstrumentedGlobal = NewGlobal;
+
+ bool CanUsePrivateAliases = TargetTriple.isOSBinFormatELF();
+ if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
+ // Create local alias for NewGlobal to avoid crash on ODR between
+ // instrumented and non-instrumented libraries.
+ auto *GA = GlobalAlias::create(GlobalValue::InternalLinkage,
+ NameForGlobal + M.getName(), NewGlobal);
+
+ // With local aliases, we need to provide another externally visible
+ // symbol __odr_asan_XXX to detect ODR violation.
+ auto *ODRIndicatorSym =
+ new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ kODRGenPrefix + NameForGlobal, nullptr,
+ NewGlobal->getThreadLocalMode());
+
+ // Set meaningful attributes for indicator symbol.
+ ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
+ ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
+ ODRIndicatorSym->setAlignment(1);
+ ODRIndicator = ODRIndicatorSym;
+ InstrumentedGlobal = GA;
+ }
+
Initializers[i] = ConstantStruct::get(
- GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
+ GlobalStructTy,
+ ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
ConstantInt::get(IntptrTy, SizeInBytes),
ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
- ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, nullptr);
+ ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
}
- ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
- GlobalVariable *AllGlobals = new GlobalVariable(
- M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
+
+ GlobalVariable *AllGlobals = nullptr;
+ GlobalVariable *RegisteredFlag = nullptr;
+
+ // On recent Mach-O platforms, we emit the global metadata in a way that
+ // allows the linker to properly strip dead globals.
+ if (ShouldUseMachOGlobalsSection()) {
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // Common linkage lets us coalesce these needle flags, defined once per
+ // object file, so that there's only one per shared library.
+ RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+
+ // We also emit a structure which binds the liveness of the global
+ // variable to the metadata struct.
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, GlobalStructTy, false, GlobalVariable::InternalLinkage,
+ Initializers[i], "");
+ Metadata->setSection("__DATA,__asan_globals,regular");
+ Metadata->setAlignment(1); // don't leave padding in between
+
+ auto LivenessBinder = ConstantStruct::get(LivenessTy,
+ Initializers[i]->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy),
+ nullptr);
+ GlobalVariable *Liveness = new GlobalVariable(
+ M, LivenessTy, false, GlobalVariable::InternalLinkage,
+ LivenessBinder, "");
+ Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+ }
+ } else {
+ // On all other platforms, we just emit an array of global metadata
+ // structures.
+ ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
+ AllGlobals = new GlobalVariable(
+ M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
+ }
// Create calls for poisoning before initializers run and unpoisoning after.
if (HasDynamicallyInitializedGlobals)
createInitializerPoisonCalls(M, ModuleName);
- IRB.CreateCall(AsanRegisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, n)});
- // We also need to unregister globals at the end, e.g. when a shared library
+ // Create a call to register the globals with the runtime.
+ if (ShouldUseMachOGlobalsSection()) {
+ IRB.CreateCall(AsanRegisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+ } else {
+ IRB.CreateCall(AsanRegisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, n)});
+ }
+
+ // We also need to unregister globals at the end, e.g., when a shared library
// gets closed.
Function *AsanDtorFunction =
Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
- IRB_Dtor.CreateCall(AsanUnregisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, n)});
+
+ if (ShouldUseMachOGlobalsSection()) {
+ IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+ } else {
+ IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, n)});
+ }
+
appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
DEBUG(dbgs() << M);
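Summarizing the Mach-O path as emitted artifacts (a sketch of what the code above produces, not literal compiler output):

    // Per instrumented global G:
    //   __DATA,__asan_globals               : one metadata struct for G (internal, align 1)
    //   __DATA,__asan_liveness,live_support : { (uintptr_t)&G, (uintptr_t)&metadata }
    //                                         so the metadata is stripped iff G is
    // Per image:
    //   common global __asan_globals_registered            (RegisteredFlag)
    //   registration call: __asan_register_image_globals(&__asan_globals_registered)
    //   generated dtor:    __asan_unregister_image_globals(&__asan_globals_registered)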
@@ -1467,7 +1656,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
- const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex);
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
@@ -1608,6 +1797,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
bool IsWrite;
unsigned Alignment;
uint64_t TypeSize;
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// Fill the set of memory operations to instrument.
for (auto &BB : F) {
@@ -1636,6 +1827,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
TempsToInstrument.clear();
if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction());
}
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
continue;
}
ToInstrument.push_back(&Inst);
@@ -1648,8 +1841,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
CompileKernel ||
(ClInstrumentationWithCallsThreshold >= 0 &&
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
const DataLayout &DL = F.getParent()->getDataLayout();
ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(),
/*RoundToAlign=*/true);
@@ -1713,12 +1904,15 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
}
- AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
- AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ if (ASan.UseAfterScope) {
+ AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, nullptr));
+ AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, nullptr));
+ }
+
AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
AsanAllocasUnpoisonFunc =
@@ -1825,13 +2019,21 @@ void FunctionStackPoisoner::poisonStack() {
assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0);
// Insert poison calls for lifetime intrinsics for alloca.
- bool HavePoisonedAllocas = false;
+ bool HavePoisonedStaticAllocas = false;
for (const auto &APC : AllocaPoisonCallVec) {
assert(APC.InsBefore);
assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ bool IsDynamicAlloca = !(*APC.AI).isStaticAlloca();
+ if (!ClInstrumentAllocas && IsDynamicAlloca)
+ continue;
+
IRBuilder<> IRB(APC.InsBefore);
poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
- HavePoisonedAllocas |= APC.DoPoison;
+ // Dynamic allocas will be unpoisoned unconditionally below in
+ // unpoisonDynamicAllocas.
+ // Flag that we need to unpoison static allocas.
+ HavePoisonedStaticAllocas |= (APC.DoPoison && !IsDynamicAlloca);
}
if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) {
@@ -1846,7 +2048,7 @@ void FunctionStackPoisoner::poisonStack() {
int StackMallocIdx = -1;
DebugLoc EntryDebugLocation;
- if (auto SP = getDISubprogram(&F))
+ if (auto SP = F.getSubprogram())
EntryDebugLocation = DebugLoc::get(SP->getScopeLine(), 0, SP);
Instruction *InsBefore = AllocaVec[0];
@@ -1878,7 +2080,7 @@ void FunctionStackPoisoner::poisonStack() {
// i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
size_t MinHeaderSize = ASan.LongSize / 2;
ASanStackFrameLayout L;
- ComputeASanStackFrameLayout(SVD, 1UL << Mapping.Scale, MinHeaderSize, &L);
+ ComputeASanStackFrameLayout(SVD, 1ULL << Mapping.Scale, MinHeaderSize, &L);
DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n");
uint64_t LocalStackSize = L.FrameSize;
bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
@@ -1904,13 +2106,13 @@ void FunctionStackPoisoner::poisonStack() {
// ? __asan_stack_malloc_N(LocalStackSize)
// : nullptr;
// void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
- Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal(
- kAsanOptionDetectUAR, IRB.getInt32Ty());
- Value *UARIsEnabled =
- IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
+ Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
+ kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
+ Value *UseAfterReturnIsEnabled =
+ IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUseAfterReturn),
Constant::getNullValue(IRB.getInt32Ty()));
Instruction *Term =
- SplitBlockAndInsertIfThen(UARIsEnabled, InsBefore, false);
+ SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
IRBuilder<> IRBIf(Term);
IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
StackMallocIdx = StackMallocSizeClass(LocalStackSize);
@@ -1920,7 +2122,7 @@ void FunctionStackPoisoner::poisonStack() {
ConstantInt::get(IntptrTy, LocalStackSize));
IRB.SetInsertPoint(InsBefore);
IRB.SetCurrentDebugLocation(EntryDebugLocation);
- FakeStack = createPHI(IRB, UARIsEnabled, FakeStackValue, Term,
+ FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
ConstantInt::get(IntptrTy, 0));
Value *NoFakeStack =
@@ -1977,6 +2179,16 @@ void FunctionStackPoisoner::poisonStack() {
Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true);
+ auto UnpoisonStack = [&](IRBuilder<> &IRB) {
+ if (HavePoisonedStaticAllocas) {
+ // If we poisoned some allocas in llvm.lifetime analysis,
+ // unpoison whole stack frame now.
+ poisonAlloca(LocalStackBase, LocalStackSize, IRB, false);
+ } else {
+ poisonRedZones(L.ShadowBytes, IRB, ShadowBase, false);
+ }
+ };
+
// (Un)poison the stack before all ret instructions.
for (auto Ret : RetVec) {
IRBuilder<> IRBRet(Ret);
@@ -2021,13 +2233,9 @@ void FunctionStackPoisoner::poisonStack() {
}
IRBuilder<> IRBElse(ElseTerm);
- poisonRedZones(L.ShadowBytes, IRBElse, ShadowBase, false);
- } else if (HavePoisonedAllocas) {
- // If we poisoned some allocas in llvm.lifetime analysis,
- // unpoison whole stack frame now.
- poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
+ UnpoisonStack(IRBElse);
} else {
- poisonRedZones(L.ShadowBytes, IRBRet, ShadowBase, false);
+ UnpoisonStack(IRBRet);
}
}
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index fd3dfd9af033..d4c8369fa9d3 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -36,7 +36,7 @@ STATISTIC(ChecksAdded, "Bounds checks added");
STATISTIC(ChecksSkipped, "Bounds checks skipped");
STATISTIC(ChecksUnable, "Bounds checks unable to add");
-typedef IRBuilder<true, TargetFolder> BuilderTy;
+typedef IRBuilder<TargetFolder> BuilderTy;
namespace {
struct BoundsChecking : public FunctionPass {
@@ -185,9 +185,8 @@ bool BoundsChecking::runOnFunction(Function &F) {
}
bool MadeChange = false;
- for (std::vector<Instruction*>::iterator i = WorkList.begin(),
- e = WorkList.end(); i != e; ++i) {
- Inst = *i;
+ for (Instruction *i : WorkList) {
+ Inst = i;
Builder->SetInsertPoint(Inst);
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h
index c47fdbf68996..3cd7351cad62 100644
--- a/lib/Transforms/Instrumentation/CFGMST.h
+++ b/lib/Transforms/Instrumentation/CFGMST.h
@@ -21,7 +21,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <string>
#include <utility>
#include <vector>
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index cae1e5af7ac7..57a569b3791e 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -4,12 +4,13 @@ add_llvm_library(LLVMInstrumentation
DataFlowSanitizer.cpp
GCOVProfiling.cpp
MemorySanitizer.cpp
+ IndirectCallPromotion.cpp
Instrumentation.cpp
InstrProfiling.cpp
PGOInstrumentation.cpp
- SafeStack.cpp
SanitizerCoverage.cpp
ThreadSanitizer.cpp
+ EfficiencySanitizer.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index d459fc50d136..b34d5b8c45a7 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -134,7 +134,7 @@ namespace {
StringRef GetGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
- Type *GType = G.getType()->getElementType();
+ Type *GType = G.getValueType();
// For now we support blacklisting struct types only.
if (StructType *SGType = dyn_cast<StructType>(GType)) {
if (!SGType->isLiteral())
@@ -166,7 +166,7 @@ class DFSanABIList {
if (isIn(*GA.getParent(), Category))
return true;
- if (isa<FunctionType>(GA.getType()->getElementType()))
+ if (isa<FunctionType>(GA.getValueType()))
return SCL->inSection("fun", GA.getName(), Category);
return SCL->inSection("global", GA.getName(), Category) ||
@@ -791,25 +791,20 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
}
}
- for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
- e = FnsToInstrument.end();
- i != e; ++i) {
- if (!*i || (*i)->isDeclaration())
+ for (Function *i : FnsToInstrument) {
+ if (!i || i->isDeclaration())
continue;
- removeUnreachableBlocks(**i);
+ removeUnreachableBlocks(*i);
- DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i));
+ DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
// DFSanVisitor may create new basic blocks, which confuses df_iterator.
// Build a copy of the list before iterating over it.
- llvm::SmallVector<BasicBlock *, 4> BBList(
- depth_first(&(*i)->getEntryBlock()));
+ llvm::SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
- for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(),
- e = BBList.end();
- i != e; ++i) {
- Instruction *Inst = &(*i)->front();
+ for (BasicBlock *i : BBList) {
+ Instruction *Inst = &i->front();
while (1) {
// DFSanVisitor may split the current basic block, changing the current
// instruction's next pointer and moving the next instruction to the
@@ -1066,11 +1061,10 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
SmallVector<Value *, 2> Objs;
GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout());
bool AllConstants = true;
- for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end();
- i != e; ++i) {
- if (isa<Function>(*i) || isa<BlockAddress>(*i))
+ for (Value *Obj : Objs) {
+ if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
continue;
- if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant())
+ if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
continue;
AllConstants = false;
@@ -1412,10 +1406,6 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
if (F == DFSF.DFS.DFSanVarargWrapperFn)
return;
- assert(!(cast<FunctionType>(
- CS.getCalledValue()->getType()->getPointerElementType())->isVarArg() &&
- dyn_cast<InvokeInst>(CS.getInstruction())));
-
IRBuilder<> IRB(CS.getInstruction());
DenseMap<Value *, Function *>::iterator i =
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
new file mode 100644
index 000000000000..fb80f87369f9
--- /dev/null
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -0,0 +1,901 @@
+//===-- EfficiencySanitizer.cpp - performance tuner -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners
+// that detects multiple performance issues via separate sub-tools.
+//
+// The instrumentation phase is straightforward:
+// - Take action on every memory access: either inlined instrumentation,
+// or inserted calls to our run-time library.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Turn mem{set,cpy,move} intrinsics into library calls.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "esan"
+
+// The tool type must be just one of these ClTool* options, as the tools
+// cannot be combined due to shadow memory constraints.
+static cl::opt<bool>
+ ClToolCacheFrag("esan-cache-frag", cl::init(false),
+ cl::desc("Detect data cache fragmentation"), cl::Hidden);
+static cl::opt<bool>
+ ClToolWorkingSet("esan-working-set", cl::init(false),
+ cl::desc("Measure the working set size"), cl::Hidden);
+// Each new tool will get its own opt flag here.
+// These are converted to EfficiencySanitizerOptions for use
+// in the code.
+
+static cl::opt<bool> ClInstrumentLoadsAndStores(
+ "esan-instrument-loads-and-stores", cl::init(true),
+ cl::desc("Instrument loads and stores"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+ "esan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+static cl::opt<bool> ClInstrumentFastpath(
+ "esan-instrument-fastpath", cl::init(true),
+ cl::desc("Instrument fastpath"), cl::Hidden);
+static cl::opt<bool> ClAuxFieldInfo(
+ "esan-aux-field-info", cl::init(true),
+ cl::desc("Generate binary with auxiliary struct field information"),
+ cl::Hidden);
+
+// Experiments show that the performance difference can be 2x or more,
+// and accuracy loss is typically negligible, so we turn this on by default.
+static cl::opt<bool> ClAssumeIntraCacheLine(
+ "esan-assume-intra-cache-line", cl::init(true),
+ cl::desc("Assume each memory access touches just one cache line, for "
+ "better performance but with a potential loss of accuracy."),
+ cl::Hidden);
+
+STATISTIC(NumInstrumentedLoads, "Number of instrumented loads");
+STATISTIC(NumInstrumentedStores, "Number of instrumented stores");
+STATISTIC(NumFastpaths, "Number of instrumented fastpaths");
+STATISTIC(NumAccessesWithIrregularSize,
+ "Number of accesses with a size outside our targeted callout sizes");
+STATISTIC(NumIgnoredStructs, "Number of ignored structs");
+STATISTIC(NumIgnoredGEPs, "Number of ignored GEP instructions");
+STATISTIC(NumInstrumentedGEPs, "Number of instrumented GEP instructions");
+STATISTIC(NumAssumedIntraCacheLine,
+ "Number of accesses assumed to be intra-cache-line");
+
+static const uint64_t EsanCtorAndDtorPriority = 0;
+static const char *const EsanModuleCtorName = "esan.module_ctor";
+static const char *const EsanModuleDtorName = "esan.module_dtor";
+static const char *const EsanInitName = "__esan_init";
+static const char *const EsanExitName = "__esan_exit";
+
+// We need to specify the tool to the runtime earlier than
+// the ctor is called in some cases, so we set a global variable.
+static const char *const EsanWhichToolName = "__esan_which_tool";
+
+// We must keep these Shadow* constants consistent with the esan runtime.
+// FIXME: Try to place these shadow constants, the names of the __esan_*
+// interface functions, and the ToolType enum into a header shared between
+// llvm and compiler-rt.
+static const uint64_t ShadowMask = 0x00000fffffffffffull;
+static const uint64_t ShadowOffs[3] = { // Indexed by scale
+ 0x0000130000000000ull,
+ 0x0000220000000000ull,
+ 0x0000440000000000ull,
+};
+// This array is indexed by the ToolType enum.
+static const int ShadowScale[] = {
+ 0, // ESAN_None.
+ 2, // ESAN_CacheFrag: 4B:1B, so 4 to 1 == >>2.
+ 6, // ESAN_WorkingSet: 64B:1B, so 64 to 1 == >>6.
+};
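A quick sanity check on what those scales mean in practice (arithmetic only; the actual app-to-shadow translation is defined later in this file):

    // Cache-frag tool: Scale = 2 (4 app bytes : 1 shadow byte), so a 4 KiB page
    // needs 4096 >> 2 = 1024 shadow bytes.
    // Working-set tool: Scale = 6 (64 app bytes : 1 shadow byte, one cache line),
    // so the same page needs 4096 >> 6 = 64 shadow bytes.
    size_t shadowBytesFor(size_t AppBytes, int Scale) { return AppBytes >> Scale; }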
+
+// MaxStructCounterNameSize is a soft size limit to avoid insanely long
+// names for those extremely large structs.
+static const unsigned MaxStructCounterNameSize = 512;
+
+namespace {
+
+static EfficiencySanitizerOptions
+OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
+ if (ClToolCacheFrag)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+ else if (ClToolWorkingSet)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_WorkingSet;
+
+ // Direct opt invocation with no params will have the default ESAN_None.
+ // We run the default tool in that case.
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_None)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+
+ return Options;
+}
+
+// Create a constant for Str so that we can pass it to the run-time lib.
+static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ // We use private linkage for module-local strings. If they can be merged
+ // with another one, we set the unnamed_addr attribute.
+ GlobalVariable *GV =
+ new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst, "");
+ if (AllowMerging)
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
+ return GV;
+}
+
+/// EfficiencySanitizer: instrument each module to find performance issues.
+class EfficiencySanitizer : public ModulePass {
+public:
+ EfficiencySanitizer(
+ const EfficiencySanitizerOptions &Opts = EfficiencySanitizerOptions())
+ : ModulePass(ID), Options(OverrideOptionsFromCL(Opts)) {}
+ const char *getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnModule(Module &M) override;
+ static char ID;
+
+private:
+ bool initOnModule(Module &M);
+ void initializeCallbacks(Module &M);
+ bool shouldIgnoreStructType(StructType *StructTy);
+ void createStructCounterName(
+ StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr);
+ void createCacheFragAuxGV(
+ Module &M, const DataLayout &DL, StructType *StructTy,
+ GlobalVariable *&TypeNames, GlobalVariable *&Offsets, GlobalVariable *&Size);
+ GlobalVariable *createCacheFragInfoGV(Module &M, const DataLayout &DL,
+ Constant *UnitName);
+ Constant *createEsanInitToolInfoArg(Module &M, const DataLayout &DL);
+ void createDestructor(Module &M, Constant *ToolInfoArg);
+ bool runOnFunction(Function &F, Module &M);
+ bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL);
+ bool instrumentMemIntrinsic(MemIntrinsic *MI);
+ bool instrumentGetElementPtr(Instruction *I, Module &M);
+ bool insertCounterUpdate(Instruction *I, StructType *StructTy,
+ unsigned CounterIdx);
+ unsigned getFieldCounterIdx(StructType *StructTy) {
+ return 0;
+ }
+ unsigned getArrayCounterIdx(StructType *StructTy) {
+ return StructTy->getNumElements();
+ }
+ unsigned getStructCounterSize(StructType *StructTy) {
+ // The struct counter array includes:
+ // - one counter for each struct field,
+ // - one counter for the struct access within an array.
+ return (StructTy->getNumElements()/*field*/ + 1/*array*/);
+ }
+ bool shouldIgnoreMemoryAccess(Instruction *I);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ Value *appToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool instrumentFastpath(Instruction *I, const DataLayout &DL, bool IsStore,
+ Value *Addr, unsigned Alignment);
+ // Each tool has its own fastpath routine:
+ bool instrumentFastpathCacheFrag(Instruction *I, const DataLayout &DL,
+ Value *Addr, unsigned Alignment);
+ bool instrumentFastpathWorkingSet(Instruction *I, const DataLayout &DL,
+ Value *Addr, unsigned Alignment);
+
+ EfficiencySanitizerOptions Options;
+ LLVMContext *Ctx;
+ Type *IntptrTy;
+ // Our slowpath involves callouts to the runtime library.
+ // Access sizes are powers of two: 1, 2, 4, 8, 16.
+ static const size_t NumberOfAccessSizes = 5;
+ Function *EsanAlignedLoad[NumberOfAccessSizes];
+ Function *EsanAlignedStore[NumberOfAccessSizes];
+ Function *EsanUnalignedLoad[NumberOfAccessSizes];
+ Function *EsanUnalignedStore[NumberOfAccessSizes];
+ // For irregular sizes of any alignment:
+ Function *EsanUnalignedLoadN, *EsanUnalignedStoreN;
+ Function *MemmoveFn, *MemcpyFn, *MemsetFn;
+ Function *EsanCtorFunction;
+ Function *EsanDtorFunction;
+ // Remember the counter variable for each struct type to avoid
+ // recomputing the variable name later during instrumentation.
+ std::map<Type *, GlobalVariable *> StructTyMap;
+};
+} // namespace
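To make the per-struct counter layout above concrete (illustrative struct, not from the patch):

    // struct S { int a; char b; double c; };   // 3 fields
    // Counters[0..2] : per-field access counters   (getFieldCounterIdx(S) == 0)
    // Counters[3]    : accesses to S as an array element
    //                  (getArrayCounterIdx(S) == getNumElements() == 3)
    // getStructCounterSize(S) == 3 + 1 == 4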
+
+char EfficiencySanitizer::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ EfficiencySanitizer, "esan",
+ "EfficiencySanitizer: finds performance issues.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ EfficiencySanitizer, "esan",
+ "EfficiencySanitizer: finds performance issues.", false, false)
+
+const char *EfficiencySanitizer::getPassName() const {
+ return "EfficiencySanitizer";
+}
+
+void EfficiencySanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+ModulePass *
+llvm::createEfficiencySanitizerPass(const EfficiencySanitizerOptions &Options) {
+ return new EfficiencySanitizer(Options);
+}
+
+void EfficiencySanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ // Initialize the callbacks.
+ for (size_t Idx = 0; Idx < NumberOfAccessSizes; ++Idx) {
+ const unsigned ByteSize = 1U << Idx;
+ std::string ByteSizeStr = utostr(ByteSize);
+ // We'll inline the most common (i.e., aligned and frequent sizes)
+ // load + store instrumentation: these callouts are for the slowpath.
+ SmallString<32> AlignedLoadName("__esan_aligned_load" + ByteSizeStr);
+ EsanAlignedLoad[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ SmallString<32> AlignedStoreName("__esan_aligned_store" + ByteSizeStr);
+ EsanAlignedStore[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ SmallString<32> UnalignedLoadName("__esan_unaligned_load" + ByteSizeStr);
+ EsanUnalignedLoad[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ SmallString<32> UnalignedStoreName("__esan_unaligned_store" + ByteSizeStr);
+ EsanUnalignedStore[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ }
+ EsanUnalignedLoadN = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__esan_unaligned_loadN", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ EsanUnalignedStoreN = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__esan_unaligned_storeN", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ MemmoveFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ MemcpyFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ MemsetFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy, nullptr));
+}
+
+bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
+ if (StructTy == nullptr || StructTy->isOpaque() /* no struct body */)
+ return true;
+ return false;
+}
+
+void EfficiencySanitizer::createStructCounterName(
+ StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr) {
+ // Append NumFields and field type ids to avoid struct conflicts
+ // with the same name but different fields.
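+ // For example, a named struct with three fields might yield a counter name
+ // like "struct.Foo#3#<id2>#<id1>#<id0>", where "Foo" is a placeholder and
+ // the <id> values stand for the fields' Type::getTypeID() numbers.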
+ if (StructTy->hasName())
+ NameStr += StructTy->getName();
+ else
+ NameStr += "struct.anon";
+ // The final StructCounterName may grow larger than MaxStructCounterNameSize:
+ // we always append #NumFields and at least one field type id.
+ // Append #NumFields.
+ NameStr += "#";
+ Twine(StructTy->getNumElements()).toVector(NameStr);
+ // Append struct field type ids in the reverse order.
+ for (int i = StructTy->getNumElements() - 1; i >= 0; --i) {
+ NameStr += "#";
+ Twine(StructTy->getElementType(i)->getTypeID()).toVector(NameStr);
+ if (NameStr.size() >= MaxStructCounterNameSize)
+ break;
+ }
+ if (StructTy->isLiteral()) {
+ // End with # for literal struct.
+ NameStr += "#";
+ }
+}
+
+// Create global variables with auxiliary information (e.g., struct field size,
+// offset, and type name) for better user reports.
+void EfficiencySanitizer::createCacheFragAuxGV(
+ Module &M, const DataLayout &DL, StructType *StructTy,
+ GlobalVariable *&TypeName, GlobalVariable *&Offset,
+ GlobalVariable *&Size) {
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ auto *Int32Ty = Type::getInt32Ty(*Ctx);
+ // FieldTypeName.
+ auto *TypeNameArrayTy = ArrayType::get(Int8PtrTy, StructTy->getNumElements());
+ TypeName = new GlobalVariable(M, TypeNameArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> TypeNameVec;
+ // FieldOffset.
+ auto *OffsetArrayTy = ArrayType::get(Int32Ty, StructTy->getNumElements());
+ Offset = new GlobalVariable(M, OffsetArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> OffsetVec;
+ // FieldSize
+ auto *SizeArrayTy = ArrayType::get(Int32Ty, StructTy->getNumElements());
+ Size = new GlobalVariable(M, SizeArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> SizeVec;
+ for (unsigned i = 0; i < StructTy->getNumElements(); ++i) {
+ Type *Ty = StructTy->getElementType(i);
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ Ty->print(StrOS);
+ TypeNameVec.push_back(
+ ConstantExpr::getPointerCast(
+ createPrivateGlobalForString(M, StrOS.str(), true),
+ Int8PtrTy));
+ OffsetVec.push_back(
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getElementOffset(i)));
+ SizeVec.push_back(ConstantInt::get(Int32Ty,
+ DL.getTypeAllocSize(Ty)));
+ }
+ TypeName->setInitializer(ConstantArray::get(TypeNameArrayTy, TypeNameVec));
+ Offset->setInitializer(ConstantArray::get(OffsetArrayTy, OffsetVec));
+ Size->setInitializer(ConstantArray::get(SizeArrayTy, SizeVec));
+}
+
+// Create the global variable for the cache-fragmentation tool.
+GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
+ Module &M, const DataLayout &DL, Constant *UnitName) {
+ assert(Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag);
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ auto *Int8PtrPtrTy = Int8PtrTy->getPointerTo();
+ auto *Int32Ty = Type::getInt32Ty(*Ctx);
+ auto *Int32PtrTy = Type::getInt32PtrTy(*Ctx);
+ auto *Int64Ty = Type::getInt64Ty(*Ctx);
+ auto *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+ // This structure should be kept consistent with the StructInfo struct
+ // in the runtime library.
+ // struct StructInfo {
+ // const char *StructName;
+ // u32 Size;
+ // u32 NumFields;
+ // u32 *FieldOffset; // auxiliary struct field info.
+ // u32 *FieldSize; // auxiliary struct field info.
+ // const char **FieldTypeName; // auxiliary struct field info.
+ // u64 *FieldCounters;
+ // u64 *ArrayCounter;
+ // };
+ auto *StructInfoTy =
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
+ // This structure should be kept consistent with the CacheFragInfo struct
+ // in the runtime library.
+ // struct CacheFragInfo {
+ // const char *UnitName;
+ // u32 NumStructs;
+ // StructInfo *Structs;
+ // };
+ auto *CacheFragInfoTy =
+ StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+
+ std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
+ unsigned NumStructs = 0;
+ SmallVector<Constant *, 16> Initializers;
+
+ for (auto &StructTy : Vec) {
+ if (shouldIgnoreStructType(StructTy)) {
+ ++NumIgnoredStructs;
+ continue;
+ }
+ ++NumStructs;
+
+ // StructName.
+ SmallString<MaxStructCounterNameSize> CounterNameStr;
+ createStructCounterName(StructTy, CounterNameStr);
+ GlobalVariable *StructCounterName = createPrivateGlobalForString(
+ M, CounterNameStr, /*AllowMerging*/true);
+
+ // Counters.
+ // We create the counter array with StructCounterName and weak linkage
+ // so that the structs with the same name and layout from different
+ // compilation units will be merged into one.
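+ // The array presumably holds one u64 counter per field plus a trailing
+ // counter for whole-struct (array) accesses; getFieldCounterIdx and
+ // getArrayCounterIdx select into this layout.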
+ auto *CounterArrayTy = ArrayType::get(Int64Ty,
+ getStructCounterSize(StructTy));
+ GlobalVariable *Counters =
+ new GlobalVariable(M, CounterArrayTy, false,
+ GlobalVariable::WeakAnyLinkage,
+ ConstantAggregateZero::get(CounterArrayTy),
+ CounterNameStr);
+
+ // Remember the counter variable for each struct type.
+ StructTyMap.insert(std::pair<Type *, GlobalVariable *>(StructTy, Counters));
+
+ // We pass the field type name array, offset array, and size array to
+ // the runtime for better reporting.
+ GlobalVariable *TypeName = nullptr, *Offset = nullptr, *Size = nullptr;
+ if (ClAuxFieldInfo)
+ createCacheFragAuxGV(M, DL, StructTy, TypeName, Offset, Size);
+
+ Constant *FieldCounterIdx[2];
+ FieldCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
+ FieldCounterIdx[1] = ConstantInt::get(Int32Ty,
+ getFieldCounterIdx(StructTy));
+ Constant *ArrayCounterIdx[2];
+ ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
+ ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
+ getArrayCounterIdx(StructTy));
+ Initializers.push_back(
+ ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
+ ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
+ ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
+ ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx),
+ nullptr));
+ }
+ // Structs.
+ Constant *StructInfo;
+ if (NumStructs == 0) {
+ StructInfo = ConstantPointerNull::get(StructInfoPtrTy);
+ } else {
+ auto *StructInfoArrayTy = ArrayType::get(StructInfoTy, NumStructs);
+ StructInfo = ConstantExpr::getPointerCast(
+ new GlobalVariable(M, StructInfoArrayTy, false,
+ GlobalVariable::InternalLinkage,
+ ConstantArray::get(StructInfoArrayTy, Initializers)),
+ StructInfoPtrTy);
+ }
+
+ auto *CacheFragInfoGV = new GlobalVariable(
+ M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
+ ConstantStruct::get(CacheFragInfoTy,
+ UnitName,
+ ConstantInt::get(Int32Ty, NumStructs),
+ StructInfo,
+ nullptr));
+ return CacheFragInfoGV;
+}
+
+// Create the tool-specific argument passed to EsanInit and EsanExit.
+Constant *EfficiencySanitizer::createEsanInitToolInfoArg(Module &M,
+ const DataLayout &DL) {
+ // This structure contains tool-specific information about each compilation
+ // unit (module) and is passed to the runtime library.
+ GlobalVariable *ToolInfoGV = nullptr;
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ // Compilation unit name.
+ auto *UnitName = ConstantExpr::getPointerCast(
+ createPrivateGlobalForString(M, M.getModuleIdentifier(), true),
+ Int8PtrTy);
+
+ // Create the tool-specific variable.
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag)
+ ToolInfoGV = createCacheFragInfoGV(M, DL, UnitName);
+
+ if (ToolInfoGV != nullptr)
+ return ConstantExpr::getPointerCast(ToolInfoGV, Int8PtrTy);
+
+ // Return a null pointer if no tool-specific variable was created.
+ return ConstantPointerNull::get(Int8PtrTy);
+}
+
+void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ EsanDtorFunction = Function::Create(FunctionType::get(Type::getVoidTy(*Ctx),
+ false),
+ GlobalValue::InternalLinkage,
+ EsanModuleDtorName, &M);
+ ReturnInst::Create(*Ctx, BasicBlock::Create(*Ctx, "", EsanDtorFunction));
+ IRBuilder<> IRB_Dtor(EsanDtorFunction->getEntryBlock().getTerminator());
+ Function *EsanExit = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(EsanExitName, IRB_Dtor.getVoidTy(),
+ Int8PtrTy, nullptr));
+ EsanExit->setLinkage(Function::ExternalLinkage);
+ IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});
+ appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
+}
+
+bool EfficiencySanitizer::initOnModule(Module &M) {
+ Ctx = &M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ IRBuilder<> IRB(M.getContext());
+ IntegerType *OrdTy = IRB.getInt32Ty();
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ IntptrTy = DL.getIntPtrType(M.getContext());
+ // Create the variable passed to EsanInit and EsanExit.
+ Constant *ToolInfoArg = createEsanInitToolInfoArg(M, DL);
+ // Constructor
+ // We specify the tool type both in the EsanWhichToolName global
+ // and as an arg to the init routine as a sanity check.
+ std::tie(EsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, EsanModuleCtorName, EsanInitName, /*InitArgTypes=*/{OrdTy, Int8PtrTy},
+ /*InitArgs=*/{
+ ConstantInt::get(OrdTy, static_cast<int>(Options.ToolType)),
+ ToolInfoArg});
+ appendToGlobalCtors(M, EsanCtorFunction, EsanCtorAndDtorPriority);
+
+ createDestructor(M, ToolInfoArg);
+
+ new GlobalVariable(M, OrdTy, true,
+ GlobalValue::WeakAnyLinkage,
+ ConstantInt::get(OrdTy,
+ static_cast<int>(Options.ToolType)),
+ EsanWhichToolName);
+
+ return true;
+}
+
+Value *EfficiencySanitizer::appToShadow(Value *Shadow, IRBuilder<> &IRB) {
+ // Shadow = ((App & Mask) + Offs) >> Scale
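+ // For the working-set tool, Scale is 6 (see instrumentFastpathWorkingSet),
+ // so each 64-byte cache line of application memory maps to one shadow byte.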
+ Shadow = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowMask));
+ uint64_t Offs;
+ int Scale = ShadowScale[Options.ToolType];
+ if (Scale <= 2)
+ Offs = ShadowOffs[Scale];
+ else
+ Offs = ShadowOffs[0] << Scale;
+ Shadow = IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Offs));
+ if (Scale > 0)
+ Shadow = IRB.CreateLShr(Shadow, Scale);
+ return Shadow;
+}
+
+bool EfficiencySanitizer::shouldIgnoreMemoryAccess(Instruction *I) {
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ // We'd like to know about cache fragmentation in vtable accesses and
+ // constant data references, so we do not currently ignore anything.
+ return false;
+ } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+ // TODO: the instrumentation disturbs the data layout on the stack, so we
+ // may want to add an option to ignore stack references (if we can
+ // distinguish them) to reduce overhead.
+ }
+ // TODO(bruening): future tools will be returning true for some cases.
+ return false;
+}
+
+bool EfficiencySanitizer::runOnModule(Module &M) {
+ bool Res = initOnModule(M);
+ initializeCallbacks(M);
+ for (auto &F : M) {
+ Res |= runOnFunction(F, M);
+ }
+ return Res;
+}
+
+bool EfficiencySanitizer::runOnFunction(Function &F, Module &M) {
+ // This is required to prevent instrumenting the call to __esan_init from
+ // within the module constructor.
+ if (&F == EsanCtorFunction)
+ return false;
+ SmallVector<Instruction *, 8> LoadsAndStores;
+ SmallVector<Instruction *, 8> MemIntrinCalls;
+ SmallVector<Instruction *, 8> GetElementPtrs;
+ bool Res = false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if ((isa<LoadInst>(Inst) || isa<StoreInst>(Inst) ||
+ isa<AtomicRMWInst>(Inst) || isa<AtomicCmpXchgInst>(Inst)) &&
+ !shouldIgnoreMemoryAccess(&Inst))
+ LoadsAndStores.push_back(&Inst);
+ else if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
+ else if (isa<GetElementPtrInst>(Inst))
+ GetElementPtrs.push_back(&Inst);
+ else if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ }
+ }
+
+ if (ClInstrumentLoadsAndStores) {
+ for (auto Inst : LoadsAndStores) {
+ Res |= instrumentLoadOrStore(Inst, DL);
+ }
+ }
+
+ if (ClInstrumentMemIntrinsics) {
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
+ }
+
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ for (auto Inst : GetElementPtrs) {
+ Res |= instrumentGetElementPtr(Inst, M);
+ }
+ }
+
+ return Res;
+}
+
+bool EfficiencySanitizer::instrumentLoadOrStore(Instruction *I,
+ const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ bool IsStore;
+ Value *Addr;
+ unsigned Alignment;
+ if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+ IsStore = false;
+ Alignment = Load->getAlignment();
+ Addr = Load->getPointerOperand();
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+ IsStore = true;
+ Alignment = Store->getAlignment();
+ Addr = Store->getPointerOperand();
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ IsStore = true;
+ Alignment = 0;
+ Addr = RMW->getPointerOperand();
+ } else if (AtomicCmpXchgInst *Xchg = dyn_cast<AtomicCmpXchgInst>(I)) {
+ IsStore = true;
+ Alignment = 0;
+ Addr = Xchg->getPointerOperand();
+ } else
+ llvm_unreachable("Unsupported mem access type");
+
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSizeBytes = DL.getTypeStoreSizeInBits(OrigTy) / 8;
+ Value *OnAccessFunc = nullptr;
+
+ // Convert 0 to the default alignment.
+ if (Alignment == 0)
+ Alignment = DL.getPrefTypeAlignment(OrigTy);
+
+ if (IsStore)
+ NumInstrumentedStores++;
+ else
+ NumInstrumentedLoads++;
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
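+ // For example, when the fastpath below does not apply, an aligned 8-byte
+ // load ends up calling __esan_aligned_load8, while an access of irregular
+ // size (Idx < 0) falls back to the generic __esan_unaligned_loadN or
+ // __esan_unaligned_storeN callout.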
+ if (Idx < 0) {
+ OnAccessFunc = IsStore ? EsanUnalignedStoreN : EsanUnalignedLoadN;
+ IRB.CreateCall(OnAccessFunc,
+ {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ ConstantInt::get(IntptrTy, TypeSizeBytes)});
+ } else {
+ if (ClInstrumentFastpath &&
+ instrumentFastpath(I, DL, IsStore, Addr, Alignment)) {
+ NumFastpaths++;
+ return true;
+ }
+ if (Alignment == 0 || (Alignment % TypeSizeBytes) == 0)
+ OnAccessFunc = IsStore ? EsanAlignedStore[Idx] : EsanAlignedLoad[Idx];
+ else
+ OnAccessFunc = IsStore ? EsanUnalignedStore[Idx] : EsanUnalignedLoad[Idx];
+ IRB.CreateCall(OnAccessFunc,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ }
+ return true;
+}
+
+// It's simplest to replace the memset/memmove/memcpy intrinsics with
+// calls that the runtime library intercepts.
+// Our pass is late enough that calls should not turn back into intrinsics.
+bool EfficiencySanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ bool Res = false;
+ if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ MemsetFn,
+ {IRB.CreatePointerCast(MI->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getArgOperand(2), IntptrTy, false)});
+ MI->eraseFromParent();
+ Res = true;
+ } else if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemCpyInst>(MI) ? MemcpyFn : MemmoveFn,
+ {IRB.CreatePointerCast(MI->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getArgOperand(2), IntptrTy, false)});
+ MI->eraseFromParent();
+ Res = true;
+ } else
+ llvm_unreachable("Unsupported mem intrinsic type");
+ return Res;
+}
+
+bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
+ GetElementPtrInst *GepInst = dyn_cast<GetElementPtrInst>(I);
+ bool Res = false;
+ if (GepInst == nullptr || GepInst->getNumIndices() == 1) {
+ ++NumIgnoredGEPs;
+ return false;
+ }
+ Type *SourceTy = GepInst->getSourceElementType();
+ StructType *StructTy;
+ ConstantInt *Idx;
+ // Check if the GEP calculates an address from a struct array.
+ if (isa<StructType>(SourceTy)) {
+ StructTy = cast<StructType>(SourceTy);
+ Idx = dyn_cast<ConstantInt>(GepInst->getOperand(1));
+ if ((Idx == nullptr || Idx->getSExtValue() != 0) &&
+ !shouldIgnoreStructType(StructTy) && StructTyMap.count(StructTy) != 0)
+ Res |= insertCounterUpdate(I, StructTy, getArrayCounterIdx(StructTy));
+ }
+ // Iterate over all indices (except the first and the last) within the GEP
+ // instruction to handle possible nested struct field address calculations.
+ for (unsigned i = 1; i < GepInst->getNumIndices(); ++i) {
+ SmallVector<Value *, 8> IdxVec(GepInst->idx_begin(),
+ GepInst->idx_begin() + i);
+ Type *Ty = GetElementPtrInst::getIndexedType(SourceTy, IdxVec);
+ unsigned CounterIdx = 0;
+ if (isa<ArrayType>(Ty)) {
+ ArrayType *ArrayTy = cast<ArrayType>(Ty);
+ StructTy = dyn_cast<StructType>(ArrayTy->getElementType());
+ if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
+ continue;
+ // Use the last counter, which tracks struct array accesses.
+ CounterIdx = getArrayCounterIdx(StructTy);
+ } else if (isa<StructType>(Ty)) {
+ StructTy = cast<StructType>(Ty);
+ if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
+ continue;
+ // Get the StructTy's subfield index.
+ Idx = cast<ConstantInt>(GepInst->getOperand(i+1));
+ assert(Idx->getSExtValue() >= 0 &&
+ Idx->getSExtValue() < StructTy->getNumElements());
+ CounterIdx = getFieldCounterIdx(StructTy) + Idx->getSExtValue();
+ }
+ Res |= insertCounterUpdate(I, StructTy, CounterIdx);
+ }
+ if (Res)
+ ++NumInstrumentedGEPs;
+ else
+ ++NumIgnoredGEPs;
+ return Res;
+}
+
+bool EfficiencySanitizer::insertCounterUpdate(Instruction *I,
+ StructType *StructTy,
+ unsigned CounterIdx) {
+ GlobalVariable *CounterArray = StructTyMap[StructTy];
+ if (CounterArray == nullptr)
+ return false;
+ IRBuilder<> IRB(I);
+ Constant *Indices[2];
+ // Xref http://llvm.org/docs/LangRef.html#i-getelementptr and
+ // http://llvm.org/docs/GetElementPtr.html.
+ // The first index of the GEP instruction steps through the first operand,
+ // i.e., the array itself.
+ Indices[0] = ConstantInt::get(IRB.getInt32Ty(), 0);
+ // The second index is the index within the array.
+ Indices[1] = ConstantInt::get(IRB.getInt32Ty(), CounterIdx);
+ Constant *Counter =
+ ConstantExpr::getGetElementPtr(
+ ArrayType::get(IRB.getInt64Ty(), getStructCounterSize(StructTy)),
+ CounterArray, Indices);
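+ // The sequence below is effectively Counters[CounterIdx]++, emitted as a
+ // plain (non-atomic) load, add, and store.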
+ Value *Load = IRB.CreateLoad(Counter);
+ IRB.CreateStore(IRB.CreateAdd(Load, ConstantInt::get(IRB.getInt64Ty(), 1)),
+ Counter);
+ return true;
+}
+
+int EfficiencySanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ // The store size in bits is always a multiple of 8.
+ uint32_t TypeSizeBytes = DL.getTypeStoreSizeInBits(OrigTy) / 8;
+ if (TypeSizeBytes != 1 && TypeSizeBytes != 2 && TypeSizeBytes != 4 &&
+ TypeSizeBytes != 8 && TypeSizeBytes != 16) {
+ // Irregular sizes do not have per-size call targets.
+ NumAccessesWithIrregularSize++;
+ return -1;
+ }
+ size_t Idx = countTrailingZeros(TypeSizeBytes);
+ assert(Idx < NumberOfAccessSizes);
+ return Idx;
+}
+
+bool EfficiencySanitizer::instrumentFastpath(Instruction *I,
+ const DataLayout &DL, bool IsStore,
+ Value *Addr, unsigned Alignment) {
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ return instrumentFastpathCacheFrag(I, DL, Addr, Alignment);
+ } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+ return instrumentFastpathWorkingSet(I, DL, Addr, Alignment);
+ }
+ return false;
+}
+
+bool EfficiencySanitizer::instrumentFastpathCacheFrag(Instruction *I,
+ const DataLayout &DL,
+ Value *Addr,
+ unsigned Alignment) {
+ // Do nothing.
+ return true; // Return true to avoid slowpath instrumentation.
+}
+
+bool EfficiencySanitizer::instrumentFastpathWorkingSet(
+ Instruction *I, const DataLayout &DL, Value *Addr, unsigned Alignment) {
+ assert(ShadowScale[Options.ToolType] == 6); // The code below assumes this
+ IRBuilder<> IRB(I);
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ // Bail to the slowpath if the access might touch multiple cache lines.
+ // An access aligned to its size is guaranteed to be intra-cache-line.
+ // getMemoryAccessFuncIndex has already ruled out a size larger than 16
+ // and thus larger than a cache line for platforms this tool targets
+ // (and our shadow memory setup assumes 64-byte cache lines).
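+ // For example, a 4-byte access at a 4-byte-aligned address cannot straddle
+ // a 64-byte line, whereas an 8-byte access of unknown alignment might, so
+ // the latter takes the slowpath unless ClAssumeIntraCacheLine is set.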
+ assert(TypeSize <= 128);
+ if (!(TypeSize == 8 ||
+ (Alignment % (TypeSize / 8)) == 0)) {
+ if (ClAssumeIntraCacheLine)
+ ++NumAssumedIntraCacheLine;
+ else
+ return false;
+ }
+
+ // We inline instrumentation to set the corresponding shadow bits for
+ // each cache line touched by the application. Here we handle a single
+ // load or store for which we have already ruled out touching more than
+ // one cache line, so we simply update the shadow memory for that one
+ // cache line.
+ // Our shadow memory model is fine with races when manipulating shadow values.
+ // We generate the following code:
+ //
+ // const char BitMask = 0x81;
+ // char *ShadowAddr = appToShadow(AppAddr);
+ // if ((*ShadowAddr & BitMask) != BitMask)
+ //     *ShadowAddr |= BitMask;
+ //
+ Value *AddrPtr = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *ShadowPtr = appToShadow(AddrPtr, IRB);
+ Type *ShadowTy = IntegerType::get(*Ctx, 8U);
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ // The bottom bit is used for the current sampling period's working set.
+ // The top bit is used for the total working set. We set both on each
+ // memory access, if they are not already set.
+ Value *ValueMask = ConstantInt::get(ShadowTy, 0x81); // 10000001B
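+ // The middle six shadow bits are reserved for the runtime library (see the
+ // comment on the OR below), so the mask only touches bit 0 and bit 7.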
+
+ Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+ // The AND and CMP will be turned into a TEST instruction by the compiler.
+ Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask);
+ TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
+ // FIXME: do I need to call SetCurrentDebugLocation?
+ IRB.SetInsertPoint(CmpTerm);
+ // We use OR to set the shadow bits to avoid corrupting the middle 6 bits,
+ // which are used by the runtime library.
+ Value *NewVal = IRB.CreateOr(OldValue, ValueMask);
+ IRB.CreateStore(NewVal, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+ IRB.SetInsertPoint(I);
+
+ return true;
+}
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index ffde7f8d9bae..b4070b602768 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
@@ -35,6 +34,8 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/GCOVProfiler.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <memory>
@@ -68,86 +69,93 @@ GCOVOptions GCOVOptions::getDefault() {
}
namespace {
- class GCOVFunction;
+class GCOVFunction;
+
+class GCOVProfiler {
+public:
+ GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
+ GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {
+ assert((Options.EmitNotes || Options.EmitData) &&
+ "GCOVProfiler asked to do nothing?");
+ ReversedVersion[0] = Options.Version[3];
+ ReversedVersion[1] = Options.Version[2];
+ ReversedVersion[2] = Options.Version[1];
+ ReversedVersion[3] = Options.Version[0];
+ ReversedVersion[4] = '\0';
+ }
+ bool runOnModule(Module &M);
+
+private:
+ // Create the .gcno files for the Module based on DebugInfo.
+ void emitProfileNotes();
+
+ // Modify the program to track transitions along edges and call into the
+ // profiling runtime to emit .gcda files when run.
+ bool emitProfileArcs();
+
+ // Get pointers to the functions in the runtime library.
+ Constant *getStartFileFunc();
+ Constant *getIncrementIndirectCounterFunc();
+ Constant *getEmitFunctionFunc();
+ Constant *getEmitArcsFunc();
+ Constant *getSummaryInfoFunc();
+ Constant *getEndFileFunc();
+
+ // Create or retrieve an i32 state value that is used to represent the
+ // pred block number for certain non-trivial edges.
+ GlobalVariable *getEdgeStateValue();
+
+ // Produce a table of pointers to counters, by predecessor and successor
+ // block number.
+ GlobalVariable *buildEdgeLookupTable(Function *F, GlobalVariable *Counter,
+ const UniqueVector<BasicBlock *> &Preds,
+ const UniqueVector<BasicBlock *> &Succs);
+
+ // Add the function to write out all our counters to the global destructor
+ // list.
+ Function *
+ insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ void insertIndirectCounterIncrement();
+
+ std::string mangleName(const DICompileUnit *CU, const char *NewStem);
- class GCOVProfiler : public ModulePass {
- public:
- static char ID;
- GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : ModulePass(ID), Options(Opts) {
- assert((Options.EmitNotes || Options.EmitData) &&
- "GCOVProfiler asked to do nothing?");
- ReversedVersion[0] = Options.Version[3];
- ReversedVersion[1] = Options.Version[2];
- ReversedVersion[2] = Options.Version[1];
- ReversedVersion[3] = Options.Version[0];
- ReversedVersion[4] = '\0';
- initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
- }
- const char *getPassName() const override {
- return "GCOV Profiler";
- }
+ GCOVOptions Options;
- private:
- bool runOnModule(Module &M) override;
-
- // Create the .gcno files for the Module based on DebugInfo.
- void emitProfileNotes();
-
- // Modify the program to track transitions along edges and call into the
- // profiling runtime to emit .gcda files when run.
- bool emitProfileArcs();
-
- // Get pointers to the functions in the runtime library.
- Constant *getStartFileFunc();
- Constant *getIncrementIndirectCounterFunc();
- Constant *getEmitFunctionFunc();
- Constant *getEmitArcsFunc();
- Constant *getSummaryInfoFunc();
- Constant *getDeleteWriteoutFunctionListFunc();
- Constant *getDeleteFlushFunctionListFunc();
- Constant *getEndFileFunc();
-
- // Create or retrieve an i32 state value that is used to represent the
- // pred block number for certain non-trivial edges.
- GlobalVariable *getEdgeStateValue();
-
- // Produce a table of pointers to counters, by predecessor and successor
- // block number.
- GlobalVariable *buildEdgeLookupTable(Function *F,
- GlobalVariable *Counter,
- const UniqueVector<BasicBlock *>&Preds,
- const UniqueVector<BasicBlock*>&Succs);
-
- // Add the function to write out all our counters to the global destructor
- // list.
- Function *insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*,
- MDNode*> >);
- Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
- void insertIndirectCounterIncrement();
-
- std::string mangleName(const DICompileUnit *CU, const char *NewStem);
-
- GCOVOptions Options;
-
- // Reversed, NUL-terminated copy of Options.Version.
- char ReversedVersion[5];
- // Checksum, produced by hash of EdgeDestinations
- SmallVector<uint32_t, 4> FileChecksums;
-
- Module *M;
- LLVMContext *Ctx;
- SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
- DenseMap<DISubprogram *, Function *> FnMap;
- };
+ // Reversed, NUL-terminated copy of Options.Version.
+ char ReversedVersion[5];
+ // Checksum, produced by hash of EdgeDestinations
+ SmallVector<uint32_t, 4> FileChecksums;
+
+ Module *M;
+ LLVMContext *Ctx;
+ SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
+};
+
+class GCOVProfilerLegacyPass : public ModulePass {
+public:
+ static char ID;
+ GCOVProfilerLegacyPass()
+ : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
+ GCOVProfilerLegacyPass(const GCOVOptions &Opts)
+ : ModulePass(ID), Profiler(Opts) {
+ initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ const char *getPassName() const override { return "GCOV Profiler"; }
+
+ bool runOnModule(Module &M) override { return Profiler.runOnModule(M); }
+
+private:
+ GCOVProfiler Profiler;
+};
}
-char GCOVProfiler::ID = 0;
-INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
+char GCOVProfilerLegacyPass::ID = 0;
+INITIALIZE_PASS(GCOVProfilerLegacyPass, "insert-gcov-profiling",
"Insert instrumentation for GCOV profiling", false, false)
ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
- return new GCOVProfiler(Options);
+ return new GCOVProfilerLegacyPass(Options);
}
static StringRef getFunctionName(const DISubprogram *SP) {
@@ -257,10 +265,9 @@ namespace {
void writeOut() {
uint32_t Len = 3;
SmallVector<StringMapEntry<GCOVLines *> *, 32> SortedLinesByFile;
- for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
- E = LinesByFile.end(); I != E; ++I) {
- Len += I->second->length();
- SortedLinesByFile.push_back(&*I);
+ for (auto &I : LinesByFile) {
+ Len += I.second->length();
+ SortedLinesByFile.push_back(&I);
}
writeBytes(LinesTag, 4);
@@ -272,10 +279,8 @@ namespace {
StringMapEntry<GCOVLines *> *RHS) {
return LHS->getKey() < RHS->getKey();
});
- for (SmallVectorImpl<StringMapEntry<GCOVLines *> *>::iterator
- I = SortedLinesByFile.begin(), E = SortedLinesByFile.end();
- I != E; ++I)
- (*I)->getValue()->writeOut();
+ for (auto &I : SortedLinesByFile)
+ I->getValue()->writeOut();
write(0);
write(0);
}
@@ -450,28 +455,32 @@ bool GCOVProfiler::runOnModule(Module &M) {
this->M = &M;
Ctx = &M.getContext();
- FnMap.clear();
- for (Function &F : M) {
- if (DISubprogram *SP = F.getSubprogram())
- FnMap[SP] = &F;
- }
-
if (Options.EmitNotes) emitProfileNotes();
if (Options.EmitData) return emitProfileArcs();
return false;
}
-static bool functionHasLines(Function *F) {
+PreservedAnalyses GCOVProfilerPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+
+ GCOVProfiler Profiler(GCOVOpts);
+
+ if (!Profiler.runOnModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool functionHasLines(Function &F) {
// Check whether this function actually has any source lines. Not only
// do these waste space, they also can crash gcov.
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- for (BasicBlock::iterator I = BB->begin(), IE = BB->end();
- I != IE; ++I) {
+ for (auto &BB : F) {
+ for (auto &I : BB) {
// Debug intrinsic locations correspond to the location of the
// declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(I)) continue;
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
- const DebugLoc &Loc = I->getDebugLoc();
+ const DebugLoc &Loc = I.getDebugLoc();
if (!Loc)
continue;
@@ -504,27 +513,27 @@ void GCOVProfiler::emitProfileNotes() {
std::string EdgeDestinations;
unsigned FunctionIdent = 0;
- for (auto *SP : CU->getSubprograms()) {
- Function *F = FnMap[SP];
- if (!F) continue;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ if (!SP) continue;
if (!functionHasLines(F)) continue;
// gcov expects every function to start with an entry block that has a
// single successor, so split the entry block to make sure of that.
- BasicBlock &EntryBlock = F->getEntryBlock();
+ BasicBlock &EntryBlock = F.getEntryBlock();
BasicBlock::iterator It = EntryBlock.begin();
while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
++It;
EntryBlock.splitBasicBlock(It);
- Funcs.push_back(make_unique<GCOVFunction>(SP, F, &out, FunctionIdent++,
+ Funcs.push_back(make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++,
Options.UseCfgChecksum,
Options.ExitBlockBeforeBody));
GCOVFunction &Func = *Funcs.back();
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- GCOVBlock &Block = Func.getBlock(&*BB);
- TerminatorInst *TI = BB->getTerminator();
+ for (auto &BB : F) {
+ GCOVBlock &Block = Func.getBlock(&BB);
+ TerminatorInst *TI = BB.getTerminator();
if (int successors = TI->getNumSuccessors()) {
for (int i = 0; i != successors; ++i) {
Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
@@ -534,13 +543,12 @@ void GCOVProfiler::emitProfileNotes() {
}
uint32_t Line = 0;
- for (BasicBlock::iterator I = BB->begin(), IE = BB->end();
- I != IE; ++I) {
+ for (auto &I : BB) {
// Debug intrinsic locations correspond to the location of the
// declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(I)) continue;
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
- const DebugLoc &Loc = I->getDebugLoc();
+ const DebugLoc &Loc = I.getDebugLoc();
if (!Loc)
continue;
@@ -581,16 +589,15 @@ bool GCOVProfiler::emitProfileArcs() {
bool Result = false;
bool InsertIndCounterIncrCode = false;
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
- for (auto *SP : CU->getSubprograms()) {
- Function *F = FnMap[SP];
- if (!F) continue;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ if (!SP) continue;
if (!functionHasLines(F)) continue;
if (!Result) Result = true;
unsigned Edges = 0;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
+ for (auto &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
if (isa<ReturnInst>(TI))
++Edges;
else
@@ -610,12 +617,12 @@ bool GCOVProfiler::emitProfileArcs() {
UniqueVector<BasicBlock *> ComplexEdgeSuccs;
unsigned Edge = 0;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
+ for (auto &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
if (Successors) {
if (Successors == 1) {
- IRBuilder<> Builder(&*BB->getFirstInsertionPt());
+ IRBuilder<> Builder(&*BB.getFirstInsertionPt());
Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
Edge);
Value *Count = Builder.CreateLoad(Counter);
@@ -626,16 +633,13 @@ bool GCOVProfiler::emitProfileArcs() {
Value *Sel = Builder.CreateSelect(BI->getCondition(),
Builder.getInt64(Edge),
Builder.getInt64(Edge + 1));
- SmallVector<Value *, 2> Idx;
- Idx.push_back(Builder.getInt64(0));
- Idx.push_back(Sel);
- Value *Counter = Builder.CreateInBoundsGEP(Counters->getValueType(),
- Counters, Idx);
+ Value *Counter = Builder.CreateInBoundsGEP(
+ Counters->getValueType(), Counters, {Builder.getInt64(0), Sel});
Value *Count = Builder.CreateLoad(Counter);
Count = Builder.CreateAdd(Count, Builder.getInt64(1));
Builder.CreateStore(Count, Counter);
} else {
- ComplexEdgePreds.insert(&*BB);
+ ComplexEdgePreds.insert(&BB);
for (int i = 0; i != Successors; ++i)
ComplexEdgeSuccs.insert(TI->getSuccessor(i));
}
@@ -646,7 +650,7 @@ bool GCOVProfiler::emitProfileArcs() {
if (!ComplexEdgePreds.empty()) {
GlobalVariable *EdgeTable =
- buildEdgeLookupTable(F, Counters,
+ buildEdgeLookupTable(&F, Counters,
ComplexEdgePreds, ComplexEdgeSuccs);
GlobalVariable *EdgeState = getEdgeStateValue();
@@ -679,7 +683,7 @@ bool GCOVProfiler::emitProfileArcs() {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
"__llvm_gcov_init", M);
- F->setUnnamedAddr(true);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
F->setLinkage(GlobalValue::InternalLinkage);
F->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
@@ -732,8 +736,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
EdgeTable[i] = NullValue;
unsigned Edge = 0;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
+ for (BasicBlock &BB : *F) {
+ TerminatorInst *TI = BB.getTerminator();
int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) {
for (int i = 0; i != Successors; ++i) {
@@ -742,7 +746,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
Edge + i);
EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) +
- (Preds.idFor(&*BB) - 1)] = cast<Constant>(Counter);
+ (Preds.idFor(&BB) - 1)] = cast<Constant>(Counter);
}
}
Edge += Successors;
@@ -754,7 +758,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
ConstantArray::get(EdgeTableTy,
makeArrayRef(&EdgeTable[0],TableSize)),
"__llvm_gcda_edge_table");
- EdgeTableGV->setUnnamedAddr(true);
+ EdgeTableGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
return EdgeTableGV;
}
@@ -805,16 +809,6 @@ Constant *GCOVProfiler::getSummaryInfoFunc() {
return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
}
-Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
-}
-
-Constant *GCOVProfiler::getDeleteFlushFunctionListFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy);
-}
-
Constant *GCOVProfiler::getEndFileFunc() {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
@@ -828,7 +822,7 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
ConstantInt::get(Type::getInt32Ty(*Ctx),
0xffffffff),
"__llvm_gcov_global_state_pred");
- GV->setUnnamedAddr(true);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
}
return GV;
}
@@ -840,7 +834,7 @@ Function *GCOVProfiler::insertCounterWriteout(
if (!WriteoutF)
WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
"__llvm_gcov_writeout", M);
- WriteoutF->setUnnamedAddr(true);
+ WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
WriteoutF->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
WriteoutF->addFnAttr(Attribute::NoRedZone);
@@ -884,7 +878,7 @@ Function *GCOVProfiler::insertCounterWriteout(
GlobalVariable *GV = CountersBySP[j].first;
unsigned Arcs =
- cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
+ cast<ArrayType>(GV->getValueType())->getNumElements();
Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs),
Builder.CreateConstGEP2_64(GV, 0, 0)});
}
@@ -900,7 +894,7 @@ Function *GCOVProfiler::insertCounterWriteout(
void GCOVProfiler::insertIndirectCounterIncrement() {
Function *Fn =
cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
- Fn->setUnnamedAddr(true);
+ Fn->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
Fn->setLinkage(GlobalValue::InternalLinkage);
Fn->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
@@ -957,7 +951,7 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
"__llvm_gcov_flush", M);
else
FlushF->setLinkage(GlobalValue::InternalLinkage);
- FlushF->setUnnamedAddr(true);
+ FlushF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
FlushF->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
FlushF->addFnAttr(Attribute::NoRedZone);
@@ -972,11 +966,9 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
Builder.CreateCall(WriteoutF, {});
// Zero out the counters.
- for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
- I = CountersBySP.begin(), E = CountersBySP.end();
- I != E; ++I) {
- GlobalVariable *GV = I->first;
- Constant *Null = Constant::getNullValue(GV->getType()->getElementType());
+ for (const auto &I : CountersBySP) {
+ GlobalVariable *GV = I.first;
+ Constant *Null = Constant::getNullValue(GV->getValueType());
Builder.CreateStore(Null, GV);
}
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
new file mode 100644
index 000000000000..202b94b19c4c
--- /dev/null
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -0,0 +1,661 @@
+//===-- IndirectCallPromotion.cpp - Promote indirect calls to direct calls ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that promotes indirect calls to
+// conditional direct calls when the indirect-call value profile metadata is
+// available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom"
+
+STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
+STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+
+// Command line option to disable indirect-call promotion, defaulting to
+// false. This is for debugging purposes only.
+static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
+ cl::desc("Disable indirect call promotion"));
+
+// Set the cutoff value for the promotion. If the value is other than 0, we
+// stop the transformation once the total number of promotions equals the cutoff
+// value.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of promotions for this compilation"));
+
+// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Skip callsites up to this number for this compilation"));
+
+// Set if the pass is called in LTO optimization. The difference in LTO mode
+// is that the pass won't prefix the source module name to the internal
+// linkage symbols.
+static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in LTO "
+ "mode"));
+
+// If the option is set to true, only call instructions will be considered for
+// transformation -- invoke instructions will be ignored.
+static cl::opt<bool>
+ ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion for call instructions "
+ "only"));
+
+// If the option is set to true, only invoke instructions will be considered for
+// transformation -- call instructions will be ignored.
+static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
+ cl::Hidden,
+ cl::desc("Run indirect-call promotion for "
+ "invoke instruction only"));
+
+// Dump the function level IR if the transformation happened in this
+// function. For debug use only.
+static cl::opt<bool>
+ ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
+ cl::desc("Dump IR after transformation happens"));
+
+namespace {
+class PGOIndirectCallPromotionLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false)
+ : ModulePass(ID), InLTO(InLTO) {
+ initializePGOIndirectCallPromotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ const char *getPassName() const override {
+ return "PGOIndirectCallPromotion";
+ }
+
+private:
+ bool runOnModule(Module &M) override;
+
+ // Whether this pass is called in LTO mode. If so, we need special handling
+ // of the PGOFuncName for static variables due to LTO's internalization.
+ bool InLTO;
+};
+} // end anonymous namespace
+
+char PGOIndirectCallPromotionLegacyPass::ID = 0;
+INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect calls to "
+ "direct calls.",
+ false, false)
+
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO);
+}
+
+namespace {
+// The main class that promotes indirect calls to conditional direct calls.
+class ICallPromotionFunc {
+private:
+ Function &F;
+ Module *M;
+
+ // Symtab that maps indirect-call profile values to function names and
+ // definitions.
+ InstrProfSymtab *Symtab;
+
+ enum TargetStatus {
+ OK, // Should be able to promote.
+ NotAvailableInModule, // Cannot find the target in current module.
+ ReturnTypeMismatch, // Return type mismatch b/w target and indirect-call.
+ NumArgsMismatch, // Number of arguments does not match.
+ ArgTypeMismatch // Type mismatch in the arguments (cannot bitcast).
+ };
+
+ // Test if we can legally promote this indirect call to a direct call of
+ // Target.
+ TargetStatus isPromotionLegal(Instruction *Inst, uint64_t Target,
+ Function *&F);
+
+ // A struct that records the direct target and its call count.
+ struct PromotionCandidate {
+ Function *TargetFunction;
+ uint64_t Count;
+ PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ };
+
+ // Check if the indirect-call call site should be promoted and return the
+ // candidate targets to promote. Inst is the candidate indirect call, ValueDataRef
+ // contains the array of value profile data for profiled targets,
+ // TotalCount is the total profiled count of call executions, and
+ // NumCandidates is the number of candidate entries in ValueDataRef.
+ std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
+ Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates);
+
+ // Main function that transforms Inst (either an indirect-call instruction or
+ // an invoke instruction) into a conditional call to F. This is like:
+ // if (Inst.CalledValue == F)
+ // F(...);
+ // else
+ // Inst(...);
+ // end
+ // TotalCount is the profile count value that the instruction executes.
+ // Count is the profile count value that F is the target function.
+ // These two values are being used to update the branch weight.
+ void promote(Instruction *Inst, Function *F, uint64_t Count,
+ uint64_t TotalCount);
+
+ // Promote a list of targets for one indirect-call callsite. Return
+ // the number of promotions.
+ uint32_t tryToPromote(Instruction *Inst,
+ const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount);
+
+ static const char *StatusToString(const TargetStatus S) {
+ switch (S) {
+ case OK:
+ return "OK to promote";
+ case NotAvailableInModule:
+ return "Cannot find the target";
+ case ReturnTypeMismatch:
+ return "Return type mismatch";
+ case NumArgsMismatch:
+ return "The number of arguments mismatch";
+ case ArgTypeMismatch:
+ return "Argument Type mismatch";
+ }
+ llvm_unreachable("Should not reach here");
+ }
+
+ // Noncopyable
+ ICallPromotionFunc(const ICallPromotionFunc &other) = delete;
+ ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
+
+public:
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab)
+ : F(Func), M(Modu), Symtab(Symtab) {
+ }
+ bool processFunction();
+};
+} // end anonymous namespace
+
+ICallPromotionFunc::TargetStatus
+ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
+ Function *&TargetFunction) {
+ Function *DirectCallee = Symtab->getFunction(Target);
+ if (DirectCallee == nullptr)
+ return NotAvailableInModule;
+ // Check the return type.
+ Type *CallRetType = Inst->getType();
+ if (!CallRetType->isVoidTy()) {
+ Type *FuncRetType = DirectCallee->getReturnType();
+ if (FuncRetType != CallRetType &&
+ !CastInst::isBitCastable(FuncRetType, CallRetType))
+ return ReturnTypeMismatch;
+ }
+
+ // Check if the arguments are compatible with the parameters
+ FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
+ unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
+ CallSite CS(Inst);
+ unsigned ArgNum = CS.arg_size();
+
+ if (ParamNum != ArgNum && !DirectCalleeType->isVarArg())
+ return NumArgsMismatch;
+
+ for (unsigned I = 0; I < ParamNum; ++I) {
+ Type *PTy = DirectCalleeType->getFunctionParamType(I);
+ Type *ATy = CS.getArgument(I)->getType();
+ if (PTy == ATy)
+ continue;
+ if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy))
+ return ArgTypeMismatch;
+ }
+
+ DEBUG(dbgs() << " #" << NumOfPGOICallPromotion << " Promote the icall to "
+ << Symtab->getFuncName(Target) << "\n");
+ TargetFunction = DirectCallee;
+ return OK;
+}
+
+// Indirect-call promotion heuristic. The direct targets are sorted based on
+// the count. Stop at the first target that is not promoted.
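+// For instance (illustrative numbers), with profiled targets {A: 900, B: 80,
+// C: 20} out of TotalCount 1000, A is considered first; if B is then rejected,
+// C is never examined because the loop breaks on the first failure.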
+std::vector<ICallPromotionFunc::PromotionCandidate>
+ICallPromotionFunc::getPromotionCandidatesForCallSite(
+ Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates) {
+ std::vector<PromotionCandidate> Ret;
+
+ DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
+ NumOfPGOICallsites++;
+ if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
+ DEBUG(dbgs() << " Skip: User options.\n");
+ return Ret;
+ }
+
+ for (uint32_t I = 0; I < NumCandidates; I++) {
+ uint64_t Count = ValueDataRef[I].Count;
+ assert(Count <= TotalCount);
+ uint64_t Target = ValueDataRef[I].Value;
+ DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
+
+ if (ICPInvokeOnly && dyn_cast<CallInst>(Inst)) {
+ DEBUG(dbgs() << " Not promote: User options.\n");
+ break;
+ }
+ if (ICPCallOnly && dyn_cast<InvokeInst>(Inst)) {
+ DEBUG(dbgs() << " Not promote: User option.\n");
+ break;
+ }
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ break;
+ }
+ Function *TargetFunction = nullptr;
+ TargetStatus Status = isPromotionLegal(Inst, Target, TargetFunction);
+ if (Status != OK) {
+ StringRef TargetFuncName = Symtab->getFuncName(Target);
+ const char *Reason = StatusToString(Status);
+ DEBUG(dbgs() << " Not promote: " << Reason << "\n");
+ emitOptimizationRemarkMissed(
+ F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
+ Twine("Cannot promote indirect call to ") +
+ (TargetFuncName.empty() ? Twine(Target) : Twine(TargetFuncName)) +
+ Twine(" with count of ") + Twine(Count) + ": " + Reason);
+ break;
+ }
+ Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ TotalCount -= Count;
+ }
+ return Ret;
+}
+
+// Create a diamond structure for If_Then_Else. Also update the profile
+// count. Do the fix-up for the invoke instruction.
+static void createIfThenElse(Instruction *Inst, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount,
+ BasicBlock **DirectCallBB,
+ BasicBlock **IndirectCallBB,
+ BasicBlock **MergeBB) {
+ CallSite CS(Inst);
+ Value *OrigCallee = CS.getCalledValue();
+
+ IRBuilder<> BBBuilder(Inst);
+ LLVMContext &Ctx = Inst->getContext();
+ Value *BCI1 =
+ BBBuilder.CreateBitCast(OrigCallee, Type::getInt8PtrTy(Ctx), "");
+ Value *BCI2 =
+ BBBuilder.CreateBitCast(DirectCallee, Type::getInt8PtrTy(Ctx), "");
+ Value *PtrCmp = BBBuilder.CreateICmpEQ(BCI1, BCI2, "");
+
+ uint64_t ElseCount = TotalCount - Count;
+ uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
+ uint64_t Scale = calculateCountScale(MaxCount);
+ MDBuilder MDB(Inst->getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCmp, Inst, &ThenTerm, &ElseTerm,
+ BranchWeights);
+ *DirectCallBB = ThenTerm->getParent();
+ (*DirectCallBB)->setName("if.true.direct_targ");
+ *IndirectCallBB = ElseTerm->getParent();
+ (*IndirectCallBB)->setName("if.false.orig_indirect");
+ *MergeBB = Inst->getParent();
+ (*MergeBB)->setName("if.end.icp");
+
+ // Special handling of invoke instructions.
+ InvokeInst *II = dyn_cast<InvokeInst>(Inst);
+ if (!II)
+ return;
+
+ // We don't need branch instructions for invoke.
+ ThenTerm->eraseFromParent();
+ ElseTerm->eraseFromParent();
+
+ // Add a jump from MergeBB to the NormalDest. This is needed for the newly
+ // created direct invoke, as its NormalDest will be fixed up to MergeBB.
+ BranchInst::Create(II->getNormalDest(), *MergeBB);
+}
+
+// Check whether BB contains a PHI that has the call result Inst as an
+// incoming value.
+static bool getCallRetPHINode(BasicBlock *BB, Instruction *Inst) {
+ BasicBlock *From = Inst->getParent();
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(From);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ if (dyn_cast<Instruction>(V) == Inst)
+ return true;
+ }
+ return false;
+}
+
+// This method fixes up PHI nodes in BB where BB is the UnwindDest of an
+// invoke instruction. In BB, there may be PHIs with incoming block being
+// OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
+// instructions to its own BB, OrigBB is no longer the predecessor block of BB.
+// Instead two new predecessors are added: IndirectCallBB and DirectCallBB,
+// so the PHI node's incoming BBs need to be fixed up accordingly.
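+// Concretely, the incoming entry from OrigBB is retargeted to DirectCallBB,
+// and a duplicate incoming value is added for IndirectCallBB, matching the
+// two new predecessors.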
+static void fixupPHINodeForUnwind(Instruction *Inst, BasicBlock *BB,
+ BasicBlock *OrigBB,
+ BasicBlock *IndirectCallBB,
+ BasicBlock *DirectCallBB) {
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(OrigBB);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ PHI->addIncoming(V, IndirectCallBB);
+ PHI->setIncomingBlock(IX, DirectCallBB);
+ }
+}
+
+// This method fixes up PHI nodes in BB where BB is the NormalDest of an
+// invoke instruction. In BB, there may be PHIs with incoming block being
+// OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
+// instructions to its own BB, a new incoming edge will be added to the original
+// NormalDstBB from the IndirectCallBB.
+static void fixupPHINodeForNormalDest(Instruction *Inst, BasicBlock *BB,
+ BasicBlock *OrigBB,
+ BasicBlock *IndirectCallBB,
+ Instruction *NewInst) {
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(OrigBB);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ if (dyn_cast<Instruction>(V) == Inst) {
+ PHI->setIncomingBlock(IX, IndirectCallBB);
+ PHI->addIncoming(NewInst, OrigBB);
+ continue;
+ }
+ PHI->addIncoming(V, IndirectCallBB);
+ }
+}
+
+// Add a bitcast instruction to the direct-call return value if needed.
+static Instruction *insertCallRetCast(const Instruction *Inst,
+ Instruction *DirectCallInst,
+ Function *DirectCallee) {
+ if (Inst->getType()->isVoidTy())
+ return DirectCallInst;
+
+ Type *CallRetType = Inst->getType();
+ Type *FuncRetType = DirectCallee->getReturnType();
+ if (FuncRetType == CallRetType)
+ return DirectCallInst;
+
+ BasicBlock *InsertionBB;
+ if (CallInst *CI = dyn_cast<CallInst>(DirectCallInst))
+ InsertionBB = CI->getParent();
+ else
+ InsertionBB = (dyn_cast<InvokeInst>(DirectCallInst))->getNormalDest();
+
+ return (new BitCastInst(DirectCallInst, CallRetType, "",
+ InsertionBB->getTerminator()));
+}
+
+// Create a DirectCall instruction in the DirectCallBB.
+// Parameter Inst is the indirect-call (invoke) instruction.
+// DirectCallee is the decl of the direct-call (invoke) target.
+// DirectCallBB is the BB into which the direct-call (invoke) instruction is inserted.
+// MergeBB is the bottom BB of the if-then-else diamond after the
+// transformation. For an invoke instruction, the edges from DirectCallBB and
+// IndirectCallBB to MergeBB are removed before this call (during
+// createIfThenElse).
+static Instruction *createDirectCallInst(const Instruction *Inst,
+ Function *DirectCallee,
+ BasicBlock *DirectCallBB,
+ BasicBlock *MergeBB) {
+ Instruction *NewInst = Inst->clone();
+ if (CallInst *CI = dyn_cast<CallInst>(NewInst)) {
+ CI->setCalledFunction(DirectCallee);
+ CI->mutateFunctionType(DirectCallee->getFunctionType());
+ } else {
+ // Must be an invoke instruction. Direct invoke's normal destination is
+ // fixed up to MergeBB. MergeBB is the place where return cast is inserted.
+ // Also since IndirectCallBB does not have an edge to MergeBB, there is no
+ // need to insert new PHIs into MergeBB.
+ InvokeInst *II = dyn_cast<InvokeInst>(NewInst);
+ assert(II);
+ II->setCalledFunction(DirectCallee);
+ II->mutateFunctionType(DirectCallee->getFunctionType());
+ II->setNormalDest(MergeBB);
+ }
+
+ DirectCallBB->getInstList().insert(DirectCallBB->getFirstInsertionPt(),
+ NewInst);
+
+ // Clear the value profile data.
+ NewInst->setMetadata(LLVMContext::MD_prof, 0);
+ CallSite NewCS(NewInst);
+ FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
+ unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
+ for (unsigned I = 0; I < ParamNum; ++I) {
+ Type *ATy = NewCS.getArgument(I)->getType();
+ Type *PTy = DirectCalleeType->getParamType(I);
+ if (ATy != PTy) {
+ BitCastInst *BI = new BitCastInst(NewCS.getArgument(I), PTy, "", NewInst);
+ NewCS.setArgument(I, BI);
+ }
+ }
+
+ return insertCallRetCast(Inst, NewInst, DirectCallee);
+}
+
+// Create a PHI to unify the return values of calls.
+static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
+ Function *DirectCallee) {
+ if (Inst->getType()->isVoidTy())
+ return;
+
+ BasicBlock *RetValBB = CallResult->getParent();
+
+ BasicBlock *PHIBB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CallResult))
+ RetValBB = II->getNormalDest();
+
+ PHIBB = RetValBB->getSingleSuccessor();
+ if (getCallRetPHINode(PHIBB, Inst))
+ return;
+
+ PHINode *CallRetPHI = PHINode::Create(Inst->getType(), 0);
+ PHIBB->getInstList().push_front(CallRetPHI);
+ Inst->replaceAllUsesWith(CallRetPHI);
+ CallRetPHI->addIncoming(Inst, Inst->getParent());
+ CallRetPHI->addIncoming(CallResult, RetValBB);
+}
+
+// This function does the actual indirect-call promotion transformation:
+// For an indirect-call like:
+// Ret = (*Foo)(Args);
+// It transforms to:
+// if (Foo == DirectCallee)
+// Ret1 = DirectCallee(Args);
+// else
+// Ret2 = (*Foo)(Args);
+// Ret = phi(Ret1, Ret2);
+// It adds type casts for the args that do not match the parameters and for the
+// return value. Branch weights metadata is also updated.
+void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount) {
+ assert(DirectCallee != nullptr);
+ BasicBlock *BB = Inst->getParent();
+ // Just to suppress the non-debug build warning.
+ (void)BB;
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+
+ BasicBlock *DirectCallBB, *IndirectCallBB, *MergeBB;
+ createIfThenElse(Inst, DirectCallee, Count, TotalCount, &DirectCallBB,
+ &IndirectCallBB, &MergeBB);
+
+ Instruction *NewInst =
+ createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
+
+ // Move Inst from MergeBB to IndirectCallBB.
+ Inst->removeFromParent();
+ IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
+ Inst);
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Inst)) {
+ // At this point, the original indirect invoke instruction has the original
+ // UnwindDest and NormalDest. For the direct invoke instruction, the
+ // NormalDest points to MergeBB, and MergeBB jumps to the original
+ // NormalDest. MergeBB might have a new bitcast instruction for the return
+ // value. The PHIs still live in the original NormalDest. Since we now have two
+ // incoming edges to NormalDest and UnwindDest, we have to do some fixups.
+ //
+ // UnwindDest will not use the return value, so no return-value fixup is needed there.
+ fixupPHINodeForUnwind(Inst, II->getUnwindDest(), MergeBB, IndirectCallBB,
+ DirectCallBB);
+ // We don't need to add a PHI operand for DirectCallBB in NormalDest,
+ // since the direct invoke's NormalDest has been redirected to MergeBB.
+ fixupPHINodeForNormalDest(Inst, II->getNormalDest(), MergeBB,
+ IndirectCallBB, NewInst);
+ }
+
+ insertCallRetPHI(Inst, NewInst, DirectCallee);
+
+ DEBUG(dbgs() << "\n== Basic Blocks After ==\n");
+ DEBUG(dbgs() << *BB << *DirectCallBB << *IndirectCallBB << *MergeBB << "\n");
+
+ emitOptimizationRemark(
+ F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
+ Twine("Promote indirect call to ") + DirectCallee->getName() +
+ " with count " + Twine(Count) + " out of " + Twine(TotalCount));
+}
+
+// Promote indirect-call to conditional direct-call for one callsite.
+uint32_t ICallPromotionFunc::tryToPromote(
+ Instruction *Inst, const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount) {
+ uint32_t NumPromoted = 0;
+
+ for (auto &C : Candidates) {
+ uint64_t Count = C.Count;
+ promote(Inst, C.TargetFunction, Count, TotalCount);
+ assert(TotalCount >= Count);
+ TotalCount -= Count;
+ NumOfPGOICallPromotion++;
+ NumPromoted++;
+ }
+ return NumPromoted;
+}
+
+// Traverse all the indirect-call callsites and use the value profile
+// annotations to perform indirect-call promotion.
+bool ICallPromotionFunc::processFunction() {
+ bool Changed = false;
+ ICallPromotionAnalysis ICallAnalysis;
+ for (auto &I : findIndirectCallSites(F)) {
+ uint32_t NumVals, NumCandidates;
+ uint64_t TotalCount;
+ auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
+ I, NumVals, TotalCount, NumCandidates);
+ if (!NumCandidates)
+ continue;
+ auto PromotionCandidates = getPromotionCandidatesForCallSite(
+ I, ICallProfDataRef, TotalCount, NumCandidates);
+ uint32_t NumPromoted = tryToPromote(I, PromotionCandidates, TotalCount);
+ if (NumPromoted == 0)
+ continue;
+
+ Changed = true;
+ // Adjust the MD.prof metadata. First delete the old one.
+ I->setMetadata(LLVMContext::MD_prof, 0);
+ // If all promoted, we don't need the MD.prof metadata.
+ if (TotalCount == 0 || NumPromoted == NumVals)
+ continue;
+ // Otherwise we need to write the un-promoted records back.
+ annotateValueSite(*M, *I, ICallProfDataRef.slice(NumPromoted), TotalCount,
+ IPVK_IndirectCallTarget, NumCandidates);
+ }
+ return Changed;
+}
+
+// A wrapper function that does the actual work.
+static bool promoteIndirectCalls(Module &M, bool InLTO) {
+ if (DisableICP)
+ return false;
+ InstrProfSymtab Symtab;
+ Symtab.create(M, InLTO);
+ bool Changed = false;
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ continue;
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab);
+ bool FuncChanged = ICallPromotion.processFunction();
+ if (ICPDUMPAFTER && FuncChanged) {
+ DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ Changed |= FuncChanged;
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ break;
+ }
+ }
+ return Changed;
+}
+
+bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
+ // The command-line option takes priority in deciding InLTO mode.
+ return promoteIndirectCalls(M, InLTO | ICPLTOMode);
+}
+
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, AnalysisManager<Module> &AM) {
+ if (!promoteIndirectCalls(M, InLTO | ICPLTOMode))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
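
For reference, a minimal sketch (not part of the patch above) of driving the new pass-manager entry point shown here; the header location and the default constructor of PGOIndirectCallPromotion are assumptions and may need adjusting to the actual declaration:

// Illustrative sketch only -- not part of this patch.
// Assumes PGOIndirectCallPromotion is declared in
// llvm/Transforms/PGOInstrumentation.h with a default constructor.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/PGOInstrumentation.h"

using namespace llvm;

static PreservedAnalyses runIndirectCallPromotion(Module &M) {
  AnalysisManager<Module> MAM;   // normally populated by a PassBuilder
  ModulePassManager MPM;
  MPM.addPass(PGOIndirectCallPromotion());
  return MPM.run(M, MAM);
}
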
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 28483e7e9b69..b11c6be696f3 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -13,12 +13,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/InstrProfiling.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -27,121 +27,112 @@ using namespace llvm;
namespace {
-class InstrProfiling : public ModulePass {
+cl::opt<bool> DoNameCompression("enable-name-compression",
+ cl::desc("Enable name string compression"),
+ cl::init(true));
+
+cl::opt<bool> ValueProfileStaticAlloc(
+ "vp-static-alloc",
+ cl::desc("Do static counter allocation for value profiler"),
+ cl::init(true));
+cl::opt<double> NumCountersPerValueSite(
+ "vp-counters-per-site",
+ cl::desc("The average number of profile counters allocated "
+ "per value profiling site."),
+ // This is set to a very small value because in real programs, only
+ // a very small percentage of value sites have non-zero targets, e.g., 1/30.
+ // For those sites with non-zero profile, the average number of targets
+ // is usually smaller than 2.
+ cl::init(1.0));
+
+class InstrProfilingLegacyPass : public ModulePass {
+ InstrProfiling InstrProf;
+
public:
static char ID;
-
- InstrProfiling() : ModulePass(ID) {}
-
- InstrProfiling(const InstrProfOptions &Options)
- : ModulePass(ID), Options(Options) {}
-
+ InstrProfilingLegacyPass() : ModulePass(ID), InstrProf() {}
+ InstrProfilingLegacyPass(const InstrProfOptions &Options)
+ : ModulePass(ID), InstrProf(Options) {}
const char *getPassName() const override {
return "Frontend instrumentation-based coverage lowering";
}
- bool runOnModule(Module &M) override;
+ bool runOnModule(Module &M) override { return InstrProf.run(M); }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
+};
-private:
- InstrProfOptions Options;
- Module *M;
- typedef struct PerFunctionProfileData {
- uint32_t NumValueSites[IPVK_Last+1];
- GlobalVariable* RegionCounters;
- GlobalVariable* DataVar;
- PerFunctionProfileData() : RegionCounters(nullptr), DataVar(nullptr) {
- memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last+1));
- }
- } PerFunctionProfileData;
- DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap;
- std::vector<Value *> UsedVars;
-
- bool isMachO() const {
- return Triple(M->getTargetTriple()).isOSBinFormatMachO();
- }
-
- /// Get the section name for the counter variables.
- StringRef getCountersSection() const {
- return getInstrProfCountersSectionName(isMachO());
- }
-
- /// Get the section name for the name variables.
- StringRef getNameSection() const {
- return getInstrProfNameSectionName(isMachO());
- }
-
- /// Get the section name for the profile data variables.
- StringRef getDataSection() const {
- return getInstrProfDataSectionName(isMachO());
- }
-
- /// Get the section name for the coverage mapping data.
- StringRef getCoverageSection() const {
- return getInstrProfCoverageSectionName(isMachO());
- }
-
- /// Count the number of instrumented value sites for the function.
- void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins);
-
- /// Replace instrprof_value_profile with a call to runtime library.
- void lowerValueProfileInst(InstrProfValueProfileInst *Ins);
-
- /// Replace instrprof_increment with an increment of the appropriate value.
- void lowerIncrement(InstrProfIncrementInst *Inc);
+} // anonymous namespace
- /// Force emitting of name vars for unused functions.
- void lowerCoverageData(GlobalVariable *CoverageNamesVar);
+PreservedAnalyses InstrProfiling::run(Module &M, AnalysisManager<Module> &AM) {
+ if (!run(M))
+ return PreservedAnalyses::all();
- /// Get the region counters for an increment, creating them if necessary.
- ///
- /// If the counter array doesn't yet exist, the profile data variables
- /// referring to them will also be created.
- GlobalVariable *getOrCreateRegionCounters(InstrProfIncrementInst *Inc);
+ return PreservedAnalyses::none();
+}
- /// Emit runtime registration functions for each profile data variable.
- void emitRegistration();
+char InstrProfilingLegacyPass::ID = 0;
+INITIALIZE_PASS(InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false,
+ false)
- /// Emit the necessary plumbing to pull in the runtime initialization.
- void emitRuntimeHook();
+ModulePass *
+llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options) {
+ return new InstrProfilingLegacyPass(Options);
+}
- /// Add uses of our data variables and runtime hook.
- void emitUses();
+bool InstrProfiling::isMachO() const {
+ return Triple(M->getTargetTriple()).isOSBinFormatMachO();
+}
- /// Create a static initializer for our data, on platforms that need it,
- /// and for any profile output file that was specified.
- void emitInitialization();
-};
+/// Get the section name for the counter variables.
+StringRef InstrProfiling::getCountersSection() const {
+ return getInstrProfCountersSectionName(isMachO());
+}
-} // anonymous namespace
+/// Get the section name for the name variables.
+StringRef InstrProfiling::getNameSection() const {
+ return getInstrProfNameSectionName(isMachO());
+}
-char InstrProfiling::ID = 0;
-INITIALIZE_PASS(InstrProfiling, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false,
- false)
+/// Get the section name for the profile data variables.
+StringRef InstrProfiling::getDataSection() const {
+ return getInstrProfDataSectionName(isMachO());
+}
-ModulePass *llvm::createInstrProfilingPass(const InstrProfOptions &Options) {
- return new InstrProfiling(Options);
+/// Get the section name for the coverage mapping data.
+StringRef InstrProfiling::getCoverageSection() const {
+ return getInstrProfCoverageSectionName(isMachO());
}
-bool InstrProfiling::runOnModule(Module &M) {
+bool InstrProfiling::run(Module &M) {
bool MadeChange = false;
this->M = &M;
+ NamesVar = nullptr;
+ NamesSize = 0;
ProfileDataMap.clear();
UsedVars.clear();
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
- for (Function &F : M)
+ for (Function &F : M) {
+ InstrProfIncrementInst *FirstProfIncInst = nullptr;
for (BasicBlock &BB : F)
- for (auto I = BB.begin(), E = BB.end(); I != E;)
- if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I++))
+ for (auto I = BB.begin(), E = BB.end(); I != E; I++)
+ if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
computeNumValueSiteCounts(Ind);
+ else if (FirstProfIncInst == nullptr)
+ FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
+
+ // Value profiling intrinsic lowering requires per-function profile data
+ // variable to be created first.
+ if (FirstProfIncInst != nullptr)
+ static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
+ }
for (Function &F : M)
for (BasicBlock &BB : F)
@@ -157,7 +148,7 @@ bool InstrProfiling::runOnModule(Module &M) {
}
if (GlobalVariable *CoverageNamesVar =
- M.getNamedGlobal(getCoverageNamesVarName())) {
+ M.getNamedGlobal(getCoverageUnusedNamesVarName())) {
lowerCoverageData(CoverageNamesVar);
MadeChange = true;
}
@@ -165,6 +156,8 @@ bool InstrProfiling::runOnModule(Module &M) {
if (!MadeChange)
return false;
+ emitVNodes();
+ emitNameData();
emitRegistration();
emitRuntimeHook();
emitUses();
@@ -204,7 +197,7 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
GlobalVariable *Name = Ind->getName();
auto It = ProfileDataMap.find(Name);
assert(It != ProfileDataMap.end() && It->second.DataVar &&
- "value profiling detected in function with no counter incerement");
+ "value profiling detected in function with no counter incerement");
GlobalVariable *DataVar = It->second.DataVar;
uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
@@ -213,9 +206,9 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Index += It->second.NumValueSites[Kind];
IRBuilder<> Builder(Ind);
- Value* Args[3] = {Ind->getTargetValue(),
- Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
- Builder.getInt32(Index)};
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
Ind->replaceAllUsesWith(
Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args));
Ind->eraseFromParent();
@@ -243,9 +236,8 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
assert(isa<GlobalVariable>(V) && "Missing reference to function name");
GlobalVariable *Name = cast<GlobalVariable>(V);
- // Move the name variable to the right section.
- Name->setSection(getNameSection());
- Name->setAlignment(1);
+ Name->setLinkage(GlobalValue::PrivateLinkage);
+ ReferencedNames.push_back(Name);
}
}
@@ -261,22 +253,77 @@ static inline bool shouldRecordFunctionAddr(Function *F) {
if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
!F->hasAvailableExternallyLinkage())
return true;
+ // Prohibit function address recording if the function is both internal and
+ // COMDAT. This avoids the profile data variable referencing internal symbols
+ // in COMDAT.
+ if (F->hasLocalLinkage() && F->hasComdat())
+ return false;
// Check uses of this function for other than direct calls or invokes to it.
- return F->hasAddressTaken();
+ // Inline virtual functions have linkonce_odr linkage. When a key method
+ // exists, the vtable will only be emitted in the TU where the key method
+ // is defined. In a TU where the vtable is not available, the function won't
+ // be 'address taken'. If its address is not recorded here, the profile data
+ // with the missing address may be picked by the linker, leading to missing
+ // indirect-call target info.
+ return F->hasAddressTaken() || F->hasLinkOnceLinkage();
+}
+
+static inline bool needsComdatForCounter(Function &F, Module &M) {
+
+ if (F.hasComdat())
+ return true;
+
+ Triple TT(M.getTargetTriple());
+ if (!TT.isOSBinFormatELF())
+ return false;
+
+ // See createPGOFuncNameVar for more details. To avoid link errors, profile
+ // counters for functions with available_externally linkage need to be changed
+ // to linkonce linkage. On ELF-based systems, this leads to weak symbols being
+ // created. Without using comdat, duplicate entries won't be removed by the
+ // linker, leading to increased data segment size and raw profile size. Even
+ // worse, since the counter referenced from the per-function profile data object
+ // will be resolved to the common strong definition, the profile counts for
+ // available_externally functions will end up being duplicated in the raw profile
+ // data. This can result in a distorted profile, as the counts of those duplicates
+ // will be accumulated by the profile merger.
+ GlobalValue::LinkageTypes Linkage = F.getLinkage();
+ if (Linkage != GlobalValue::ExternalWeakLinkage &&
+ Linkage != GlobalValue::AvailableExternallyLinkage)
+ return false;
+
+ return true;
}
-static inline Comdat *getOrCreateProfileComdat(Module &M,
+static inline Comdat *getOrCreateProfileComdat(Module &M, Function &F,
InstrProfIncrementInst *Inc) {
+ if (!needsComdatForCounter(F, M))
+ return nullptr;
+
// COFF format requires a COMDAT section to have a key symbol with the same
- // name. The linker targeting COFF also requires that the COMDAT section
+ // name. The linker targeting COFF also requires that the COMDAT
// a section is associated to must precede the associating section. For this
- // reason, we must choose the name var's name as the name of the comdat.
+ // reason, we must choose the counter var's name as the name of the comdat.
StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF()
- ? getInstrProfNameVarPrefix()
+ ? getInstrProfCountersVarPrefix()
: getInstrProfComdatPrefix());
return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix)));
}
+static bool needsRuntimeRegistrationOfSectionRange(const Module &M) {
+ // Don't do this for Darwin. compiler-rt uses linker magic.
+ if (Triple(M.getTargetTriple()).isOSDarwin())
+ return false;
+
+ // Use linker script magic to get data/cnts/name start/end.
+ if (Triple(M.getTargetTriple()).isOSLinux() ||
+ Triple(M.getTargetTriple()).isOSFreeBSD() ||
+ Triple(M.getTargetTriple()).isPS4CPU())
+ return false;
+
+ return true;
+}
+
GlobalVariable *
InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
GlobalVariable *NamePtr = Inc->getName();
@@ -294,11 +341,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// linking.
Function *Fn = Inc->getParent()->getParent();
Comdat *ProfileVarsComdat = nullptr;
- if (Fn->hasComdat())
- ProfileVarsComdat = getOrCreateProfileComdat(*M, Inc);
- NamePtr->setSection(getNameSection());
- NamePtr->setAlignment(1);
- NamePtr->setComdat(ProfileVarsComdat);
+ ProfileVarsComdat = getOrCreateProfileComdat(*M, *Fn, Inc);
uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
LLVMContext &Ctx = M->getContext();
@@ -314,27 +357,51 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
CounterPtr->setAlignment(8);
CounterPtr->setComdat(ProfileVarsComdat);
- // Create data variable.
auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ // Statically allocate the array of pointers to value profile nodes for
+ // the current function.
+ Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
+ if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(*M)) {
+
+ uint64_t NS = 0;
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ NS += PD.NumValueSites[Kind];
+ if (NS) {
+ ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
+
+ auto *ValuesVar =
+ new GlobalVariable(*M, ValuesTy, false, NamePtr->getLinkage(),
+ Constant::getNullValue(ValuesTy),
+ getVarName(Inc, getInstrProfValuesVarPrefix()));
+ ValuesVar->setVisibility(NamePtr->getVisibility());
+ ValuesVar->setSection(getInstrProfValuesSectionName(isMachO()));
+ ValuesVar->setAlignment(8);
+ ValuesVar->setComdat(ProfileVarsComdat);
+ ValuesPtrExpr =
+ ConstantExpr::getBitCast(ValuesVar, llvm::Type::getInt8PtrTy(Ctx));
+ }
+ }
+
+ // Create data variable.
auto *Int16Ty = Type::getInt16Ty(Ctx);
- auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last+1);
+ auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
Type *DataTypes[] = {
- #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
- #include "llvm/ProfileData/InstrProfData.inc"
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
};
auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
- Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) ?
- ConstantExpr::getBitCast(Fn, Int8PtrTy) :
- ConstantPointerNull::get(Int8PtrTy);
+ Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
+ ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
+ : ConstantPointerNull::get(Int8PtrTy);
- Constant *Int16ArrayVals[IPVK_Last+1];
+ Constant *Int16ArrayVals[IPVK_Last + 1];
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
Constant *DataVals[] = {
- #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
- #include "llvm/ProfileData/InstrProfData.inc"
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
};
auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(),
ConstantStruct::get(DataTy, DataVals),
@@ -350,28 +417,99 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// Mark the data variable as used so that it isn't stripped out.
UsedVars.push_back(Data);
+ // Now that the linkage set by the FE has been passed to the data and counter
+ // variables, reset the Name variable's linkage and visibility to private so that
+ // it can be removed later by the compiler.
+ NamePtr->setLinkage(GlobalValue::PrivateLinkage);
+ // Collect the referenced names to be used by emitNameData.
+ ReferencedNames.push_back(NamePtr);
return CounterPtr;
}
-void InstrProfiling::emitRegistration() {
- // Don't do this for Darwin. compiler-rt uses linker magic.
- if (Triple(M->getTargetTriple()).isOSDarwin())
+void InstrProfiling::emitVNodes() {
+ if (!ValueProfileStaticAlloc)
return;
- // Use linker script magic to get data/cnts/name start/end.
- if (Triple(M->getTargetTriple()).isOSLinux() ||
- Triple(M->getTargetTriple()).isOSFreeBSD())
+ // For now only support this on platforms that do
+ // not require runtime registration to discover
+ // named section start/end.
+ if (needsRuntimeRegistrationOfSectionRange(*M))
+ return;
+
+ size_t TotalNS = 0;
+ for (auto &PD : ProfileDataMap) {
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ TotalNS += PD.second.NumValueSites[Kind];
+ }
+
+ if (!TotalNS)
+ return;
+
+ uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
+// Heuristic for small programs with very few total value sites.
+// The default value of vp-counters-per-site is chosen based on
+// the observation that large apps usually have a low percentage
+// of value sites that actually have any profile data, and thus
+// the average number of counters per site is low. For small
+// apps with very few sites, this may not be true. Bump up the
+// number of counters in this case.
+#define INSTR_PROF_MIN_VAL_COUNTS 10
+ if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
+ NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
+
+ auto &Ctx = M->getContext();
+ Type *VNodeTypes[] = {
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
+
+ ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
+ auto *VNodesVar = new GlobalVariable(
+ *M, VNodesTy, false, llvm::GlobalValue::PrivateLinkage,
+ Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
+ VNodesVar->setSection(getInstrProfVNodesSectionName(isMachO()));
+ UsedVars.push_back(VNodesVar);
+}
+
+void InstrProfiling::emitNameData() {
+ std::string UncompressedData;
+
+ if (ReferencedNames.empty())
+ return;
+
+ std::string CompressedNameStr;
+ if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
+ DoNameCompression)) {
+ llvm::report_fatal_error(toString(std::move(E)), false);
+ }
+
+ auto &Ctx = M->getContext();
+ auto *NamesVal = llvm::ConstantDataArray::getString(
+ Ctx, StringRef(CompressedNameStr), false);
+ NamesVar = new llvm::GlobalVariable(*M, NamesVal->getType(), true,
+ llvm::GlobalValue::PrivateLinkage,
+ NamesVal, getInstrProfNamesVarName());
+ NamesSize = CompressedNameStr.size();
+ NamesVar->setSection(getNameSection());
+ UsedVars.push_back(NamesVar);
+}
+
+void InstrProfiling::emitRegistration() {
+ if (!needsRuntimeRegistrationOfSectionRange(*M))
return;
// Construct the function.
auto *VoidTy = Type::getVoidTy(M->getContext());
auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+ auto *Int64Ty = Type::getInt64Ty(M->getContext());
auto *RegisterFTy = FunctionType::get(VoidTy, false);
auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
getInstrProfRegFuncsName(), M);
- RegisterF->setUnnamedAddr(true);
- if (Options.NoRedZone) RegisterF->addFnAttr(Attribute::NoRedZone);
+ RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ if (Options.NoRedZone)
+ RegisterF->addFnAttr(Attribute::NoRedZone);
auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
auto *RuntimeRegisterF =
@@ -380,7 +518,20 @@ void InstrProfiling::emitRegistration() {
IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
for (Value *Data : UsedVars)
- IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+ if (Data != NamesVar)
+ IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+
+ if (NamesVar) {
+ Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
+ auto *NamesRegisterTy =
+ FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
+ auto *NamesRegisterF =
+ Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfNamesRegFuncName(), M);
+ IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
+ IRB.getInt64(NamesSize)});
+ }
+
IRB.CreateRetVoid();
}
@@ -392,7 +543,8 @@ void InstrProfiling::emitRuntimeHook() {
return;
// If the module's provided its own runtime, we don't need to do anything.
- if (M->getGlobalVariable(getInstrProfRuntimeHookVarName())) return;
+ if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
+ return;
// Declare an external variable that will pull in the runtime initialization.
auto *Int32Ty = Type::getInt32Ty(M->getContext());
@@ -405,8 +557,11 @@ void InstrProfiling::emitRuntimeHook() {
GlobalValue::LinkOnceODRLinkage,
getInstrProfRuntimeHookVarUseFuncName(), M);
User->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone);
+ if (Options.NoRedZone)
+ User->addFnAttr(Attribute::NoRedZone);
User->setVisibility(GlobalValue::HiddenVisibility);
+ if (Triple(M->getTargetTriple()).supportsCOMDAT())
+ User->setComdat(M->getOrInsertComdat(User->getName()));
IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
auto *Load = IRB.CreateLoad(Var);
@@ -448,16 +603,18 @@ void InstrProfiling::emitInitialization() {
std::string InstrProfileOutput = Options.InstrProfileOutput;
Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName());
- if (!RegisterF && InstrProfileOutput.empty()) return;
+ if (!RegisterF && InstrProfileOutput.empty())
+ return;
// Create the initialization function.
auto *VoidTy = Type::getVoidTy(M->getContext());
auto *F = Function::Create(FunctionType::get(VoidTy, false),
GlobalValue::InternalLinkage,
getInstrProfInitFuncName(), M);
- F->setUnnamedAddr(true);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
F->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone) F->addFnAttr(Attribute::NoRedZone);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
// Add the basic block and the necessary calls.
IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
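
As a side note on the static value-profile counter sizing introduced in emitVNodes() above, a small standalone sketch of the arithmetic (the helper name is made up; the literal 10 mirrors INSTR_PROF_MIN_VAL_COUNTS):

// Illustrative only: mirrors the counter-sizing heuristic in emitVNodes().
#include <algorithm>
#include <cstdint>

static uint64_t sizeValueProfileCounters(uint64_t TotalNS,
                                         double CountersPerSite = 1.0) {
  uint64_t NumCounters = TotalNS * CountersPerSite;
  // Small programs with very few value sites get a bump so that a handful
  // of hot sites is not starved of counters (INSTR_PROF_MIN_VAL_COUNTS == 10).
  if (NumCounters < 10)
    NumCounters = std::max(10, (int)NumCounters * 2);
  return NumCounters;
}

// e.g. TotalNS = 3  -> 3 counters requested, bumped to max(10, 6) = 10
//      TotalNS = 40 -> 40 counters, no bump
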
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index a05a5fa09f9a..2963d08752c4 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -59,15 +59,16 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializeAddressSanitizerPass(Registry);
initializeAddressSanitizerModulePass(Registry);
initializeBoundsCheckingPass(Registry);
- initializeGCOVProfilerPass(Registry);
- initializePGOInstrumentationGenPass(Registry);
- initializePGOInstrumentationUsePass(Registry);
- initializeInstrProfilingPass(Registry);
+ initializeGCOVProfilerLegacyPassPass(Registry);
+ initializePGOInstrumentationGenLegacyPassPass(Registry);
+ initializePGOInstrumentationUseLegacyPassPass(Registry);
+ initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializeInstrProfilingLegacyPassPass(Registry);
initializeMemorySanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
initializeSanitizerCoverageModulePass(Registry);
initializeDataFlowSanitizerPass(Registry);
- initializeSafeStackPass(Registry);
+ initializeEfficiencySanitizerPass(Registry);
}
/// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/Makefile b/lib/Transforms/Instrumentation/Makefile
deleted file mode 100644
index 6cbc7a9cd88a..000000000000
--- a/lib/Transforms/Instrumentation/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/Instrumentation/Makefile -------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMInstrumentation
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 34aaa7f27d6e..970f9ab86e82 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -91,7 +91,6 @@
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -109,9 +108,9 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -191,6 +190,12 @@ static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
cl::desc("Insert checks for constant shadow values"),
cl::Hidden, cl::init(false));
+// This is off by default because of a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool> ClWithComdat("msan-with-comdat",
+ cl::desc("Place MSan constructors in comdat sections"),
+ cl::Hidden, cl::init(false));
+
static const char *const kMsanModuleCtorName = "msan.module_ctor";
static const char *const kMsanInitName = "__msan_init";
@@ -312,6 +317,9 @@ class MemorySanitizer : public FunctionPass {
TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
WarningFn(nullptr) {}
const char *getPassName() const override { return "MemorySanitizer"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
bool runOnFunction(Function &F) override;
bool doInitialization(Module &M) override;
static char ID; // Pass identification, replacement for typeid.
@@ -374,13 +382,18 @@ class MemorySanitizer : public FunctionPass {
friend struct VarArgAMD64Helper;
friend struct VarArgMIPS64Helper;
friend struct VarArgAArch64Helper;
+ friend struct VarArgPowerPC64Helper;
};
} // anonymous namespace
char MemorySanitizer::ID = 0;
-INITIALIZE_PASS(MemorySanitizer, "msan",
- "MemorySanitizer: detects uninitialized reads.",
- false, false)
+INITIALIZE_PASS_BEGIN(
+ MemorySanitizer, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ MemorySanitizer, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false, false)
FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) {
return new MemorySanitizer(TrackOrigins);
@@ -540,8 +553,14 @@ bool MemorySanitizer::doInitialization(Module &M) {
createSanitizerCtorAndInitFunctions(M, kMsanModuleCtorName, kMsanInitName,
/*InitArgTypes=*/{},
/*InitArgs=*/{});
+ if (ClWithComdat) {
+ Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
+ MsanCtorFunction->setComdat(MsanCtorComdat);
+ appendToGlobalCtors(M, MsanCtorFunction, 0, MsanCtorFunction);
+ } else {
+ appendToGlobalCtors(M, MsanCtorFunction, 0);
+ }
- appendToGlobalCtors(M, MsanCtorFunction, 0);
if (TrackOrigins)
new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
@@ -591,7 +610,7 @@ CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
if (TypeSize <= 8) return 0;
- return Log2_32_Ceil(TypeSize / 8);
+ return Log2_32_Ceil((TypeSize + 7) / 8);
}
/// This class does all the work for a given function. Store and Load
@@ -606,6 +625,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
ValueMap<Value*, Value*> ShadowMap, OriginMap;
std::unique_ptr<VarArgHelper> VAHelper;
+ const TargetLibraryInfo *TLI;
// The following flags disable parts of MSan instrumentation based on
// blacklist contents and command-line options.
@@ -623,7 +643,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
: Shadow(S), Origin(O), OrigIns(I) { }
};
SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
- SmallVector<Instruction*, 16> StoreList;
+ SmallVector<StoreInst *, 16> StoreList;
MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
: F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
@@ -635,6 +655,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// FIXME: Consider using SpecialCaseList to specify a list of functions that
// must always return fully initialized values. For now, we hardcode "main".
CheckReturnValue = SanitizeFunction && (F.getName() == "main");
+ TLI = &MS.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DEBUG(if (!InsertChecks)
dbgs() << "MemorySanitizer is not inserting checks into '"
@@ -731,26 +752,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void materializeStores(bool InstrumentWithCalls) {
- for (auto Inst : StoreList) {
- StoreInst &SI = *dyn_cast<StoreInst>(Inst);
-
- IRBuilder<> IRB(&SI);
- Value *Val = SI.getValueOperand();
- Value *Addr = SI.getPointerOperand();
- Value *Shadow = SI.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
+ for (StoreInst *SI : StoreList) {
+ IRBuilder<> IRB(SI);
+ Value *Val = SI->getValueOperand();
+ Value *Addr = SI->getPointerOperand();
+ Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
StoreInst *NewSI =
- IRB.CreateAlignedStore(Shadow, ShadowPtr, SI.getAlignment());
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, SI->getAlignment());
DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
(void)NewSI;
- if (ClCheckAccessAddress) insertShadowCheck(Addr, &SI);
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, SI);
- if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering()));
+ if (SI->isAtomic())
+ SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
- if (MS.TrackOrigins && !SI.isAtomic())
- storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI.getAlignment(),
+ if (MS.TrackOrigins && !SI->isAtomic())
+ storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI->getAlignment(),
InstrumentWithCalls);
}
}
@@ -1142,7 +1163,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(A, getCleanOrigin());
}
}
- ArgOffset += RoundUpToAlignment(Size, kShadowTLSAlignment);
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
}
assert(*ShadowPtr && "Could not find shadow for an argument");
return *ShadowPtr;
@@ -1210,34 +1231,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
switch (a) {
- case NotAtomic:
- return NotAtomic;
- case Unordered:
- case Monotonic:
- case Release:
- return Release;
- case Acquire:
- case AcquireRelease:
- return AcquireRelease;
- case SequentiallyConsistent:
- return SequentiallyConsistent;
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
+ return AtomicOrdering::Release;
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
}
llvm_unreachable("Unknown ordering");
}
AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
switch (a) {
- case NotAtomic:
- return NotAtomic;
- case Unordered:
- case Monotonic:
- case Acquire:
- return Acquire;
- case Release:
- case AcquireRelease:
- return AcquireRelease;
- case SequentiallyConsistent:
- return SequentiallyConsistent;
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ return AtomicOrdering::Acquire;
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
}
llvm_unreachable("Unknown ordering");
}
@@ -1603,7 +1624,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
if (ConstantInt *Elt =
dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
- APInt V = Elt->getValue();
+ const APInt &V = Elt->getValue();
APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
Elements.push_back(ConstantInt::get(EltTy, V2));
} else {
@@ -1613,7 +1634,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ShadowMul = ConstantVector::get(Elements);
} else {
if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
- APInt V = Elt->getValue();
+ const APInt &V = Elt->getValue();
APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
ShadowMul = ConstantInt::get(Ty, V2);
} else {
@@ -2123,6 +2144,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return CreateShadowCast(IRB, S2, T, /* Signed */ true);
}
+ // Given a vector, extract its first element, and return all
+ // zeroes if it is zero, and all ones otherwise.
+ Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
+ Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
Type *T = S->getType();
assert(T->isVectorTy());
@@ -2270,15 +2299,39 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // \brief Instrument compare-packed intrinsic.
+ // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
+ // all-ones shadow.
+ void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ResTy = getShadowTy(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = IRB.CreateSExt(
+ IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Instrument compare-scalar intrinsic.
+ // This handles both cmp* intrinsics which return the result in the first
+ // element of a vector, and comi* which return the result as i32.
+ void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case llvm::Intrinsic::bswap:
handleBswap(I);
break;
- case llvm::Intrinsic::x86_avx512_cvtsd2usi64:
- case llvm::Intrinsic::x86_avx512_cvtsd2usi:
- case llvm::Intrinsic::x86_avx512_cvtss2usi64:
- case llvm::Intrinsic::x86_avx512_cvtss2usi:
+ case llvm::Intrinsic::x86_avx512_vcvtsd2usi64:
+ case llvm::Intrinsic::x86_avx512_vcvtsd2usi32:
+ case llvm::Intrinsic::x86_avx512_vcvtss2usi64:
+ case llvm::Intrinsic::x86_avx512_vcvtss2usi32:
case llvm::Intrinsic::x86_avx512_cvttss2usi64:
case llvm::Intrinsic::x86_avx512_cvttss2usi:
case llvm::Intrinsic::x86_avx512_cvttsd2usi64:
@@ -2303,8 +2356,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case llvm::Intrinsic::x86_sse_cvttss2si:
handleVectorConvertIntrinsic(I, 1);
break;
- case llvm::Intrinsic::x86_sse2_cvtdq2pd:
- case llvm::Intrinsic::x86_sse2_cvtps2pd:
case llvm::Intrinsic::x86_sse_cvtps2pi:
case llvm::Intrinsic::x86_sse_cvttps2pi:
handleVectorConvertIntrinsic(I, 2);
@@ -2413,6 +2464,43 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorPmaddIntrinsic(I, 16);
break;
+ case llvm::Intrinsic::x86_sse_cmp_ss:
+ case llvm::Intrinsic::x86_sse2_cmp_sd:
+ case llvm::Intrinsic::x86_sse_comieq_ss:
+ case llvm::Intrinsic::x86_sse_comilt_ss:
+ case llvm::Intrinsic::x86_sse_comile_ss:
+ case llvm::Intrinsic::x86_sse_comigt_ss:
+ case llvm::Intrinsic::x86_sse_comige_ss:
+ case llvm::Intrinsic::x86_sse_comineq_ss:
+ case llvm::Intrinsic::x86_sse_ucomieq_ss:
+ case llvm::Intrinsic::x86_sse_ucomilt_ss:
+ case llvm::Intrinsic::x86_sse_ucomile_ss:
+ case llvm::Intrinsic::x86_sse_ucomigt_ss:
+ case llvm::Intrinsic::x86_sse_ucomige_ss:
+ case llvm::Intrinsic::x86_sse_ucomineq_ss:
+ case llvm::Intrinsic::x86_sse2_comieq_sd:
+ case llvm::Intrinsic::x86_sse2_comilt_sd:
+ case llvm::Intrinsic::x86_sse2_comile_sd:
+ case llvm::Intrinsic::x86_sse2_comigt_sd:
+ case llvm::Intrinsic::x86_sse2_comige_sd:
+ case llvm::Intrinsic::x86_sse2_comineq_sd:
+ case llvm::Intrinsic::x86_sse2_ucomieq_sd:
+ case llvm::Intrinsic::x86_sse2_ucomilt_sd:
+ case llvm::Intrinsic::x86_sse2_ucomile_sd:
+ case llvm::Intrinsic::x86_sse2_ucomigt_sd:
+ case llvm::Intrinsic::x86_sse2_ucomige_sd:
+ case llvm::Intrinsic::x86_sse2_ucomineq_sd:
+ handleVectorCompareScalarIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_sse_cmp_ps:
+ case llvm::Intrinsic::x86_sse2_cmp_pd:
+ // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
+ // generates reasonable-looking IR that fails in the backend with "Do not
+ // know how to split the result of this operator!".
+ handleVectorComparePackedIntrinsic(I);
+ break;
+
default:
if (!handleUnknownIntrinsic(I))
visitInstruction(I);
@@ -2450,6 +2538,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
AttributeSet::FunctionIndex,
B));
}
+
+ maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
}
IRBuilder<> IRB(&I);
@@ -2498,7 +2588,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
(void)Store;
assert(Size != 0 && Store != nullptr);
DEBUG(dbgs() << " Param:" << *Store << "\n");
- ArgOffset += RoundUpToAlignment(Size, 8);
+ ArgOffset += alignTo(Size, 8);
}
DEBUG(dbgs() << " done with call args\n");
@@ -2811,14 +2901,19 @@ struct VarArgAMD64Helper : public VarArgHelper {
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
if (IsByVal) {
// ByVal arguments always go to the overflow area.
+ // Fixed arguments passed through the overflow area will be stepped
+ // over by va_start, so don't count them towards the offset.
+ if (IsFixed)
+ continue;
assert(A->getType()->isPointerTy());
Type *RealTy = A->getType()->getPointerElementType();
uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
- OverflowOffset += RoundUpToAlignment(ArgSize, 8);
+ OverflowOffset += alignTo(ArgSize, 8);
IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
ArgSize, kShadowTLSAlignment);
} else {
@@ -2838,10 +2933,16 @@ struct VarArgAMD64Helper : public VarArgHelper {
FpOffset += 16;
break;
case AK_Memory:
+ if (IsFixed)
+ continue;
uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
- OverflowOffset += RoundUpToAlignment(ArgSize, 8);
+ OverflowOffset += alignTo(ArgSize, 8);
}
+ // Take fixed arguments into account for GpOffset and FpOffset,
+ // but don't actually store shadows for them.
+ if (IsFixed)
+ continue;
IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
}
@@ -2952,20 +3053,22 @@ struct VarArgMIPS64Helper : public VarArgHelper {
void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
unsigned VAArgOffset = 0;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin() +
+ CS.getFunctionType()->getNumParams(), End = CS.arg_end();
ArgIt != End; ++ArgIt) {
+ llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
Value *A = *ArgIt;
Value *Base;
uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
-#if defined(__MIPSEB__) || defined(MIPSEB)
- // Adjusting the shadow for argument with size < 8 to match the placement
- // of bits in big endian system
- if (ArgSize < 8)
- VAArgOffset += (8 - ArgSize);
-#endif
+ if (TargetTriple.getArch() == llvm::Triple::mips64) {
+ // Adjust the shadow for arguments with size < 8 to match the placement
+ // of bits on a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset);
VAArgOffset += ArgSize;
- VAArgOffset = RoundUpToAlignment(VAArgOffset, 8);
+ VAArgOffset = alignTo(VAArgOffset, 8);
IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
@@ -3038,13 +3141,13 @@ struct VarArgMIPS64Helper : public VarArgHelper {
/// \brief AArch64-specific implementation of VarArgHelper.
struct VarArgAArch64Helper : public VarArgHelper {
- static const unsigned kAArch64GrArgSize = 56;
+ static const unsigned kAArch64GrArgSize = 64;
static const unsigned kAArch64VrArgSize = 128;
static const unsigned AArch64GrBegOffset = 0;
static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
// Make VR space aligned to 16 bytes.
- static const unsigned AArch64VrBegOffset = AArch64GrEndOffset + 8;
+ static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+ kAArch64VrArgSize;
static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
@@ -3089,9 +3192,11 @@ struct VarArgAArch64Helper : public VarArgHelper {
unsigned OverflowOffset = AArch64VAEndOffset;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
+ unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
ArgKind AK = classifyArgument(A);
if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
AK = AK_Memory;
@@ -3108,11 +3213,19 @@ struct VarArgAArch64Helper : public VarArgHelper {
VrOffset += 16;
break;
case AK_Memory:
+ // Don't count fixed arguments in the overflow area - va_start will
+ // skip right over them.
+ if (IsFixed)
+ continue;
uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
- OverflowOffset += RoundUpToAlignment(ArgSize, 8);
+ OverflowOffset += alignTo(ArgSize, 8);
break;
}
+ // Count Gp/Vr fixed arguments to their respective offsets, but don't
+ // bother to actually store a shadow.
+ if (IsFixed)
+ continue;
IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
Constant *OverflowSize =
@@ -3271,6 +3384,163 @@ struct VarArgAArch64Helper : public VarArgHelper {
}
};
+/// \brief PowerPC64-specific implementation of VarArgHelper.
+struct VarArgPowerPC64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+ VAArgSize(nullptr) {}
+
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ // For PowerPC, we need to deal with alignment of stack arguments -
+ // they are mostly aligned to 8 bytes, but vectors and i128 arrays
+ // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes,
+ // and QPX vectors are aligned to 32 bytes. For that reason, we
+ // compute the current offset from the stack pointer (which is always properly
+ // aligned) and the offset for the first vararg, then subtract them.
+ unsigned VAArgBase;
+ llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
+ // Parameter save area starts at 48 bytes from frame pointer for ABIv1,
+ // and 32 bytes for ABIv2. This is usually determined by target
+ // endianness, but in theory could be overridden by a function attribute.
+ // For simplicity, we ignore it here (it'd only matter for QPX vectors).
+ if (TargetTriple.getArch() == llvm::Triple::ppc64)
+ VAArgBase = 48;
+ else
+ VAArgBase = 32;
+ unsigned VAArgOffset = VAArgBase;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+ bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ if (IsByVal) {
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = A->getType()->getPointerElementType();
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ uint64_t ArgAlign = CS.getParamAlignment(ArgNo + 1);
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (!IsFixed) {
+ Value *Base = getShadowPtrForVAArgument(RealTy, IRB,
+ VAArgOffset - VAArgBase);
+ IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
+ ArgSize, kShadowTLSAlignment);
+ }
+ VAArgOffset += alignTo(ArgSize, 8);
+ } else {
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ uint64_t ArgAlign = 8;
+ if (A->getType()->isArrayTy()) {
+ // Arrays are aligned to element size, except for long double
+ // arrays, which are aligned to 8 bytes.
+ Type *ElementTy = A->getType()->getArrayElementType();
+ if (!ElementTy->isPPC_FP128Ty())
+ ArgAlign = DL.getTypeAllocSize(ElementTy);
+ } else if (A->getType()->isVectorTy()) {
+ // Vectors are naturally aligned.
+ ArgAlign = DL.getTypeAllocSize(A->getType());
+ }
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (DL.isBigEndian()) {
+ // Adjust the shadow for arguments with size < 8 to match the placement
+ // of bits on a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ if (!IsFixed) {
+ Base = getShadowPtrForVAArgument(A->getType(), IRB,
+ VAArgOffset - VAArgBase);
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ }
+ if (IsFixed)
+ VAArgBase = VAArgOffset;
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
+ VAArgOffset - VAArgBase);
+ // Here we use VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating
+ // a new class member, i.e., it is the total size of all VarArgs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// \brief Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ Type::getInt64PtrTy(*MS.C));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr =
+ MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+ }
+ }
+};
+
/// \brief A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper {
VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
@@ -3297,6 +3567,9 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
return new VarArgMIPS64Helper(Func, Msan, Visitor);
else if (TargetTriple.getArch() == llvm::Triple::aarch64)
return new VarArgAArch64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == llvm::Triple::ppc64 ||
+ TargetTriple.getArch() == llvm::Triple::ppc64le)
+ return new VarArgPowerPC64Helper(Func, Msan, Visitor);
else
return new VarArgNoOpHelper(Func, Msan, Visitor);
}
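
The PowerPC64 helper above places each variadic argument's shadow at an offset derived from the argument's size and alignment, with a big-endian adjustment for values narrower than 8 bytes. The following stand-alone sketch (not MSan code; the names are invented for illustration) models only that offset arithmetic:

#include <cassert>
#include <cstdint>

// Toy model of the slot computation in VarArgPowerPC64Helper: arguments are
// laid out at 8-byte (or larger, if naturally aligned) slots, and on
// big-endian targets a sub-8-byte value is shifted to the high end of its
// slot so the shadow bits line up with the value bits.
static uint64_t roundUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static uint64_t nextVAArgShadowSlot(uint64_t &Offset, uint64_t ArgSize,
                                    uint64_t ArgAlign, bool IsBigEndian) {
  if (ArgAlign < 8)
    ArgAlign = 8;
  Offset = roundUp(Offset, ArgAlign);      // start of this argument's slot
  uint64_t Slot = Offset;
  if (IsBigEndian && ArgSize < 8)
    Slot += 8 - ArgSize;                   // match the placement of the value bits
  Offset = roundUp(Offset + ArgSize, 8);   // next slot stays 8-byte aligned
  return Slot;
}

int main() {
  uint64_t Offset = 0;
  // A 4-byte int on big-endian ppc64 lands in the high half of its slot.
  assert(nextVAArgShadowSlot(Offset, 4, 8, /*IsBigEndian=*/true) == 4);
  // A 16-byte vector is naturally aligned to its own size.
  assert(nextVAArgShadowSlot(Offset, 16, 16, /*IsBigEndian=*/true) == 16);
  return 0;
}
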
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 4b59b93b325f..f54d8ad48146 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -25,9 +25,12 @@
//
// This file contains two passes:
// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
-// count profile, and
+// count profile, and generates the instrumentation for indirect call
+// profiling.
// (2) Pass PGOInstrumentationUse which reads the edge count profile and
-// annotates the branch weights.
+// annotates the branch weights. It also reads the indirect call value
+// profiling records and annotates the indirect call instructions.
+//
// To get the precise counter information, these two passes need to be invoked at
// the same compilation point (so they see the same IR). For pass
// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
@@ -45,14 +48,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
#include "CFGMST.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -62,10 +67,13 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/ProfileCommon.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/JamCRC.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
#include <string>
#include <utility>
#include <vector>
@@ -81,6 +89,7 @@ STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
// Command line option to specify the file to read profile from. This is
// mainly used for testing.
@@ -90,13 +99,37 @@ static cl::opt<std::string>
cl::desc("Specify the path of profile data file. This is"
"mainly for test purpose."));
+// Command line option to disable value profiling. The default is false:
+// i.e. value profiling is enabled by default. This is for debugging purposes.
+static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable Value Profiling"));
+
+// Command line option to set the maximum number of VP annotations to write to
+// the metadata for a single indirect call callsite.
+static cl::opt<unsigned> MaxNumAnnotations(
+ "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of annotations for a single indirect "
+ "call callsite"));
+
+// Command line option to enable/disable the warning about missing profile
+// information.
+static cl::opt<bool> NoPGOWarnMissing("no-pgo-warn-missing", cl::init(false),
+ cl::Hidden);
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data.
+static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false),
+ cl::Hidden);
+
namespace {
-class PGOInstrumentationGen : public ModulePass {
+class PGOInstrumentationGenLegacyPass : public ModulePass {
public:
static char ID;
- PGOInstrumentationGen() : ModulePass(ID) {
- initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry());
+ PGOInstrumentationGenLegacyPass() : ModulePass(ID) {
+ initializePGOInstrumentationGenLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
const char *getPassName() const override {
@@ -111,16 +144,17 @@ private:
}
};
-class PGOInstrumentationUse : public ModulePass {
+class PGOInstrumentationUseLegacyPass : public ModulePass {
public:
static char ID;
// Provide the profile filename as the parameter.
- PGOInstrumentationUse(std::string Filename = "")
- : ModulePass(ID), ProfileFileName(Filename) {
+ PGOInstrumentationUseLegacyPass(std::string Filename = "")
+ : ModulePass(ID), ProfileFileName(std::move(Filename)) {
if (!PGOTestProfileFile.empty())
ProfileFileName = PGOTestProfileFile;
- initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry());
+ initializePGOInstrumentationUseLegacyPassPass(
+ *PassRegistry::getPassRegistry());
}
const char *getPassName() const override {
@@ -129,37 +163,36 @@ public:
private:
std::string ProfileFileName;
- std::unique_ptr<IndexedInstrProfReader> PGOReader;
- bool runOnModule(Module &M) override;
+ bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BlockFrequencyInfoWrapperPass>();
}
};
} // end anonymous namespace
-char PGOInstrumentationGen::ID = 0;
-INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen",
+char PGOInstrumentationGenLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
"PGO instrumentation.", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen",
+INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
"PGO instrumentation.", false, false)
-ModulePass *llvm::createPGOInstrumentationGenPass() {
- return new PGOInstrumentationGen();
+ModulePass *llvm::createPGOInstrumentationGenLegacyPass() {
+ return new PGOInstrumentationGenLegacyPass();
}
-char PGOInstrumentationUse::ID = 0;
-INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use",
+char PGOInstrumentationUseLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
"Read PGO instrumentation profile.", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use",
+INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
"Read PGO instrumentation profile.", false, false)
-ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) {
- return new PGOInstrumentationUse(Filename.str());
+ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename) {
+ return new PGOInstrumentationUseLegacyPass(Filename.str());
}
namespace {
@@ -225,7 +258,7 @@ public:
// Dump edges and BB information.
void dumpInfo(std::string Str = "") const {
MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
- Twine(FunctionHash) + "\t" + Str);
+ Twine(FunctionHash) + "\t" + Str);
}
FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false,
@@ -247,7 +280,7 @@ public:
if (CreateGlobalVar)
FuncNameVar = createPGOFuncNameVar(F, FuncName);
- };
+ }
};
// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
@@ -305,7 +338,7 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
return InstrBB;
}
-// Visit all edge and instrument the edges not in MST.
+// Visit all edges and instrument the edges not in the MST, and do value profiling.
// Critical edges will be split.
static void instrumentOneFunc(Function &F, Module *M,
BranchProbabilityInfo *BPI,
@@ -318,6 +351,7 @@ static void instrumentOneFunc(Function &F, Module *M,
}
uint32_t I = 0;
+ Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
for (auto &E : FuncInfo.MST.AllEdges) {
BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get());
if (!InstrBB)
@@ -326,13 +360,34 @@ static void instrumentOneFunc(Function &F, Module *M,
IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
assert(Builder.GetInsertPoint() != InstrBB->end() &&
"Cannot get the Instrumentation point");
- Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
{llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
Builder.getInt32(I++)});
}
+
+ if (DisableValueProfiling)
+ return;
+
+ unsigned NumIndirectCallSites = 0;
+ for (auto &I : findIndirectCallSites(F)) {
+ CallSite CS(I);
+ Value *Callee = CS.getCalledValue();
+ DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
+ << NumIndirectCallSites << "\n");
+ IRBuilder<> Builder(I);
+ assert(Builder.GetInsertPoint() != I->getParent()->end() &&
+ "Cannot get the Instrumentation point");
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash),
+ Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
+ Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),
+ Builder.getInt32(NumIndirectCallSites++)});
+ }
+ NumOfPGOICall += NumIndirectCallSites;
}
// This class represents a CFG edge in profile use compilation.
@@ -352,7 +407,8 @@ struct PGOUseEdge : public PGOEdge {
const std::string infoString() const {
if (!CountValid)
return PGOEdge::infoString();
- return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue)).str();
+ return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
+ .str();
}
};
@@ -399,6 +455,33 @@ static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
}
class PGOUseFunc {
+public:
+ PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr,
+ BlockFrequencyInfo *BFI = nullptr)
+ : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI),
+ FreqAttr(FFA_Normal) {}
+
+ // Read counts for the instrumented BB from profile.
+ bool readCounters(IndexedInstrProfReader *PGOReader);
+
+ // Populate the counts for all BBs.
+ void populateCounters();
+
+ // Set the branch weights based on the count values.
+ void setBranchWeights();
+
+ // Annotate the indirect call sites.
+ void annotateIndirectCallSites();
+
+ // The hotness of the function from the profile count.
+ enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
+
+ // Return the function hotness from the profile.
+ FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+
+ // Return the profile record for this function.
+ InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+
private:
Function &F;
Module *M;
@@ -414,6 +497,12 @@ private:
// compilation.
uint64_t ProgramMaxCount;
+ // ProfileRecord for this function.
+ InstrProfRecord ProfileRecord;
+
+ // Function hotness info derived from profile.
+ FuncFreqAttr FreqAttr;
+
// Find the Instrumented BB and set the value.
void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
@@ -427,7 +516,7 @@ private:
// Set the hot/cold inline hints based on the count values.
// FIXME: This function should be removed once the functionality in
// the inliner is implemented.
- void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+ void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
if (ProgramMaxCount == 0)
return;
// Threshold of the hot functions.
@@ -435,24 +524,10 @@ private:
// Threshold of the cold functions.
const BranchProbability ColdFunctionThreshold(2, 10000);
if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount))
- F.addFnAttr(llvm::Attribute::InlineHint);
+ FreqAttr = FFA_Hot;
else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount))
- F.addFnAttr(llvm::Attribute::Cold);
+ FreqAttr = FFA_Cold;
}
-
-public:
- PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr,
- BlockFrequencyInfo *BFI = nullptr)
- : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {}
-
- // Read counts for the instrumented BB from profile.
- bool readCounters(IndexedInstrProfReader *PGOReader);
-
- // Populate the counts for all BBs.
- void populateCounters();
-
- // Set the branch weights based on the count values.
- void setBranchWeights();
};
// Visit all the edges and assign the count value for the instrumented
@@ -511,21 +586,32 @@ void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
// Return true if the profile are successfully read, and false on errors.
bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
auto &Ctx = M->getContext();
- ErrorOr<InstrProfRecord> Result =
+ Expected<InstrProfRecord> Result =
PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
- if (std::error_code EC = Result.getError()) {
- if (EC == instrprof_error::unknown_function)
- NumOfPGOMissing++;
- else if (EC == instrprof_error::hash_mismatch ||
- EC == llvm::instrprof_error::malformed)
- NumOfPGOMismatch++;
-
- std::string Msg = EC.message() + std::string(" ") + F.getName().str();
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ if (Error E = Result.takeError()) {
+ handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+ auto Err = IPE.get();
+ bool SkipWarning = false;
+ if (Err == instrprof_error::unknown_function) {
+ NumOfPGOMissing++;
+ SkipWarning = NoPGOWarnMissing;
+ } else if (Err == instrprof_error::hash_mismatch ||
+ Err == instrprof_error::malformed) {
+ NumOfPGOMismatch++;
+ SkipWarning = NoPGOWarnMismatch;
+ }
+
+ if (SkipWarning)
+ return;
+
+ std::string Msg = IPE.message() + std::string(" ") + F.getName().str();
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ });
return false;
}
- std::vector<uint64_t> &CountFromProfile = Result.get().Counts;
+ ProfileRecord = std::move(Result.get());
+ std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
NumOfPGOFunc++;
DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
@@ -605,16 +691,17 @@ void PGOUseFunc::populateCounters() {
}
DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+#ifndef NDEBUG
// Assert every BB has a valid counter.
+ for (auto &BB : F)
+ assert(getBBInfo(&BB).CountValid && "BB count is not valid");
+#endif
uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+ F.setEntryCount(FuncEntryCount);
uint64_t FuncMaxCount = FuncEntryCount;
- for (auto &BB : F) {
- assert(getBBInfo(&BB).CountValid && "BB count is not valid");
- uint64_t Count = getBBInfo(&BB).CountValue;
- if (Count > FuncMaxCount)
- FuncMaxCount = Count;
- }
- applyFunctionAttributes(FuncEntryCount, FuncMaxCount);
+ for (auto &BB : F)
+ FuncMaxCount = std::max(FuncMaxCount, getBBInfo(&BB).CountValue);
+ markFunctionAttributes(FuncEntryCount, FuncMaxCount);
DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
@@ -642,7 +729,7 @@ void PGOUseFunc::setBranchWeights() {
const PGOUseEdge *E = BBCountInfo.OutEdges[s];
const BasicBlock *SrcBB = E->SrcBB;
const BasicBlock *DestBB = E->DestBB;
- if (DestBB == 0)
+ if (DestBB == nullptr)
continue;
unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
uint64_t EdgeCount = E->CountValue;
@@ -663,56 +750,204 @@ void PGOUseFunc::setBranchWeights() {
dbgs() << "\n";);
}
}
+
+// Traverse all the indirect callsites and annotate the instructions.
+void PGOUseFunc::annotateIndirectCallSites() {
+ if (DisableValueProfiling)
+ return;
+
+ // Create the PGOFuncName meta data.
+ createPGOFuncNameMetadata(F, FuncInfo.FuncName);
+
+ unsigned IndirectCallSiteIndex = 0;
+ auto IndirectCallSites = findIndirectCallSites(F);
+ unsigned NumValueSites =
+ ProfileRecord.getNumValueSites(IPVK_IndirectCallTarget);
+ if (NumValueSites != IndirectCallSites.size()) {
+ std::string Msg =
+ std::string("Inconsistent number of indirect call sites: ") +
+ F.getName().str();
+ auto &Ctx = M->getContext();
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ return;
+ }
+
+ for (auto &I : IndirectCallSites) {
+ DEBUG(dbgs() << "Read one indirect call instrumentation: Index="
+ << IndirectCallSiteIndex << " out of " << NumValueSites
+ << "\n");
+ annotateValueSite(*M, *I, ProfileRecord, IPVK_IndirectCallTarget,
+ IndirectCallSiteIndex, MaxNumAnnotations);
+ IndirectCallSiteIndex++;
+ }
+}
} // end anonymous namespace
-bool PGOInstrumentationGen::runOnModule(Module &M) {
+// Create a COMDAT variable IR_LEVEL_PROF_VARNAME to make the runtime
+// aware that this is an IR-level profile so that it can set the version flag.
+static void createIRLevelProfileFlagVariable(Module &M) {
+ Type *IntTy64 = Type::getInt64Ty(M.getContext());
+ uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF);
+ auto IRLevelVersionVariable = new GlobalVariable(
+ M, IntTy64, true, GlobalVariable::ExternalLinkage,
+ Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)),
+ INSTR_PROF_QUOTE(IR_LEVEL_PROF_VERSION_VAR));
+ IRLevelVersionVariable->setVisibility(GlobalValue::DefaultVisibility);
+ Triple TT(M.getTargetTriple());
+ if (!TT.supportsCOMDAT())
+ IRLevelVersionVariable->setLinkage(GlobalValue::WeakAnyLinkage);
+ else
+ IRLevelVersionVariable->setComdat(M.getOrInsertComdat(
+ StringRef(INSTR_PROF_QUOTE(IR_LEVEL_PROF_VERSION_VAR))));
+}
+
+static bool InstrumentAllFunctions(
+ Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
+ createIRLevelProfileFlagVariable(M);
for (auto &F : M) {
if (F.isDeclaration())
continue;
- BranchProbabilityInfo *BPI =
- &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI());
- BlockFrequencyInfo *BFI =
- &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI());
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
instrumentOneFunc(F, &M, BPI, BFI);
}
return true;
}
-static void setPGOCountOnFunc(PGOUseFunc &Func,
- IndexedInstrProfReader *PGOReader) {
- if (Func.readCounters(PGOReader)) {
- Func.populateCounters();
- Func.setBranchWeights();
- }
+bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ return InstrumentAllFunctions(M, LookupBPI, LookupBFI);
}
-bool PGOInstrumentationUse::runOnModule(Module &M) {
+PreservedAnalyses PGOInstrumentationGen::run(Module &M,
+ AnalysisManager<Module> &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool annotateAllFunctions(
+ Module &M, StringRef ProfileFileName,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
DEBUG(dbgs() << "Read in profile counters: ");
auto &Ctx = M.getContext();
// Read the counter array from file.
auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
- if (std::error_code EC = ReaderOrErr.getError()) {
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message()));
+ if (Error E = ReaderOrErr.takeError()) {
+ handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
+ });
return false;
}
- PGOReader = std::move(ReaderOrErr.get());
+ std::unique_ptr<IndexedInstrProfReader> PGOReader =
+ std::move(ReaderOrErr.get());
if (!PGOReader) {
Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
- "Cannot get PGOReader"));
+ StringRef("Cannot get PGOReader")));
+ return false;
+ }
+ // TODO: might need to change the warning once the clang option is finalized.
+ if (!PGOReader->isIRLevelProfile()) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ ProfileFileName.data(), "Not an IR level instrumentation profile"));
return false;
}
+ std::vector<Function *> HotFunctions;
+ std::vector<Function *> ColdFunctions;
for (auto &F : M) {
if (F.isDeclaration())
continue;
- BranchProbabilityInfo *BPI =
- &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI());
- BlockFrequencyInfo *BFI =
- &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI());
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
PGOUseFunc Func(F, &M, BPI, BFI);
- setPGOCountOnFunc(Func, PGOReader.get());
+ if (!Func.readCounters(PGOReader.get()))
+ continue;
+ Func.populateCounters();
+ Func.setBranchWeights();
+ Func.annotateIndirectCallSites();
+ PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
+ if (FreqAttr == PGOUseFunc::FFA_Cold)
+ ColdFunctions.push_back(&F);
+ else if (FreqAttr == PGOUseFunc::FFA_Hot)
+ HotFunctions.push_back(&F);
+ }
+ M.setProfileSummary(PGOReader->getSummary().getMD(M.getContext()));
+ // Set function hotness attribute from the profile.
+ // We have to apply these attributes at the end because their presence
+ // can affect the BranchProbabilityInfo of any callers, resulting in an
+ // inconsistent MST between prof-gen and prof-use.
+ for (auto &F : HotFunctions) {
+ F->addFnAttr(llvm::Attribute::InlineHint);
+ DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
+ }
+ for (auto &F : ColdFunctions) {
+ F->addFnAttr(llvm::Attribute::Cold);
+ DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() << "\n");
}
+
return true;
}
+
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename)
+ : ProfileFileName(std::move(Filename)) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+}
+
+PreservedAnalyses PGOInstrumentationUse::run(Module &M,
+ AnalysisManager<Module> &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
+}
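
annotateAllFunctions() above defers the hot/cold attributes until every function has been visited, but the classification itself is a simple comparison against fractions of the program-wide maximum count. A minimal sketch of that decision follows; the hot threshold is an assumed placeholder (only the cold threshold, 2/10000, appears in the hunks above), and the function name is invented:

#include <cstdint>
#include <iostream>

enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };

// Toy restatement of markFunctionAttributes(): a function is hot when its
// entry count reaches the hot fraction of the program maximum, and cold when
// even its hottest block stays below the cold fraction.
FuncFreqAttr classifyFunction(uint64_t EntryCount, uint64_t MaxCount,
                              uint64_t ProgramMaxCount) {
  if (ProgramMaxCount == 0)
    return FFA_Normal;
  uint64_t HotCutoff = ProgramMaxCount * 1 / 100;     // assumed threshold
  uint64_t ColdCutoff = ProgramMaxCount * 2 / 10000;  // 2/10000 from the diff
  if (EntryCount >= HotCutoff)
    return FFA_Hot;
  if (MaxCount <= ColdCutoff)
    return FFA_Cold;
  return FFA_Normal;
}

int main() {
  // Entry count 5000 against a program maximum of 100000 -> hot (prints 2).
  std::cout << classifyFunction(5000, 5000, 100000) << "\n";
  // A function whose hottest block ran 10 times out of 100000 -> cold (1).
  std::cout << classifyFunction(1, 10, 100000) << "\n";
  return 0;
}
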
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 09de7a2cda2b..7d404473655d 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -28,13 +28,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
@@ -45,6 +47,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -53,22 +56,28 @@ using namespace llvm;
#define DEBUG_TYPE "sancov"
-static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init";
-static const char *const kSanCovName = "__sanitizer_cov";
-static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check";
-static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";
-static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";
-static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";
-static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp";
-static const char *const kSanCovTraceSwitch = "__sanitizer_cov_trace_switch";
-static const char *const kSanCovModuleCtorName = "sancov.module_ctor";
-static const uint64_t kSanCtorAndDtorPriority = 2;
-
-static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level",
- cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
- "3: all blocks and critical edges, "
- "4: above plus indirect calls"),
- cl::Hidden, cl::init(0));
+static const char *const SanCovModuleInitName = "__sanitizer_cov_module_init";
+static const char *const SanCovName = "__sanitizer_cov";
+static const char *const SanCovWithCheckName = "__sanitizer_cov_with_check";
+static const char *const SanCovIndirCallName = "__sanitizer_cov_indir_call16";
+static const char *const SanCovTracePCIndirName =
+ "__sanitizer_cov_trace_pc_indir";
+static const char *const SanCovTraceEnterName =
+ "__sanitizer_cov_trace_func_enter";
+static const char *const SanCovTraceBBName =
+ "__sanitizer_cov_trace_basic_block";
+static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc";
+static const char *const SanCovTraceCmpName = "__sanitizer_cov_trace_cmp";
+static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
+static const char *const SanCovModuleCtorName = "sancov.module_ctor";
+static const uint64_t SanCtorAndDtorPriority = 2;
+
+static cl::opt<int> ClCoverageLevel(
+ "sanitizer-coverage-level",
+ cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+ "3: all blocks and critical edges, "
+ "4: above plus indirect calls"),
+ cl::Hidden, cl::init(0));
static cl::opt<unsigned> ClCoverageBlockThreshold(
"sanitizer-coverage-block-threshold",
@@ -82,12 +91,21 @@ static cl::opt<bool>
"callbacks at every basic block"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool>
ClExperimentalCMPTracing("sanitizer-coverage-experimental-trace-compares",
cl::desc("Experimental tracing of CMP and similar "
"instructions"),
cl::Hidden, cl::init(false));
+static cl::opt<bool>
+ ClPruneBlocks("sanitizer-coverage-prune-blocks",
+ cl::desc("Reduce the number of instrumented blocks"),
+ cl::Hidden, cl::init(true));
+
// Experimental 8-bit counters used as an additional search heuristic during
// coverage-guided fuzzing.
// The counters are not thread-friendly:
@@ -131,22 +149,28 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.TraceBB |= ClExperimentalTracing;
Options.TraceCmp |= ClExperimentalCMPTracing;
Options.Use8bitCounters |= ClUse8bitCounters;
+ Options.TracePC |= ClExperimentalTracePC;
return Options;
}
class SanitizerCoverageModule : public ModulePass {
- public:
+public:
SanitizerCoverageModule(
const SanitizerCoverageOptions &Options = SanitizerCoverageOptions())
- : ModulePass(ID), Options(OverrideFromCL(Options)) {}
+ : ModulePass(ID), Options(OverrideFromCL(Options)) {
+ initializeSanitizerCoverageModulePass(*PassRegistry::getPassRegistry());
+ }
bool runOnModule(Module &M) override;
bool runOnFunction(Function &F);
- static char ID; // Pass identification, replacement for typeid
- const char *getPassName() const override {
- return "SanitizerCoverageModule";
+ static char ID; // Pass identification, replacement for typeid
+ const char *getPassName() const override { return "SanitizerCoverageModule"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
}
- private:
+private:
void InjectCoverageForIndirectCalls(Function &F,
ArrayRef<Instruction *> IndirCalls);
void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
@@ -162,8 +186,8 @@ class SanitizerCoverageModule : public ModulePass {
}
Function *SanCovFunction;
Function *SanCovWithCheckFunction;
- Function *SanCovIndirCallFunction;
- Function *SanCovTraceEnter, *SanCovTraceBB;
+ Function *SanCovIndirCallFunction, *SanCovTracePCIndir;
+ Function *SanCovTraceEnter, *SanCovTraceBB, *SanCovTracePC;
Function *SanCovTraceCmpFunction;
Function *SanCovTraceSwitchFunction;
InlineAsm *EmptyAsm;
@@ -178,7 +202,7 @@ class SanitizerCoverageModule : public ModulePass {
SanitizerCoverageOptions Options;
};
-} // namespace
+} // namespace
bool SanitizerCoverageModule::runOnModule(Module &M) {
if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
@@ -195,28 +219,32 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
Int64Ty = IRB.getInt64Ty();
SanCovFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kSanCovName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy, nullptr));
SanCovWithCheckFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
+ SanCovTracePCIndir = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy, nullptr));
SanCovIndirCallFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
+ SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
SanCovTraceCmpFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr));
+ SanCovTraceCmpName, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr));
SanCovTraceSwitchFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kSanCovTraceSwitch, VoidTy, Int64Ty, Int64PtrTy, nullptr));
+ SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy, nullptr));
// We insert an empty inline asm after cov callbacks to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
+ SanCovTracePC = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTracePCName, VoidTy, nullptr));
SanCovTraceEnter = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy, nullptr));
SanCovTraceBB = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr));
+ M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy, nullptr));
// At this point we create a dummy array of guards because we don't
// know how many elements we will need.
@@ -243,7 +271,6 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
-
// Replace the dummy array with the real one.
GuardArray->replaceAllUsesWith(
IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
@@ -252,13 +279,12 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
GlobalVariable *RealEightBitCounterArray;
if (Options.Use8bitCounters) {
// Make sure the array is 16-aligned.
- static const int kCounterAlignment = 16;
- Type *Int8ArrayNTy =
- ArrayType::get(Int8Ty, RoundUpToAlignment(N, kCounterAlignment));
+ static const int CounterAlignment = 16;
+ Type *Int8ArrayNTy = ArrayType::get(Int8Ty, alignTo(N, CounterAlignment));
RealEightBitCounterArray = new GlobalVariable(
M, Int8ArrayNTy, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(Int8ArrayNTy), "__sancov_gen_cov_counter");
- RealEightBitCounterArray->setAlignment(kCounterAlignment);
+ RealEightBitCounterArray->setAlignment(CounterAlignment);
EightBitCounterArray->replaceAllUsesWith(
IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy));
EightBitCounterArray->eraseFromParent();
@@ -271,26 +297,64 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
new GlobalVariable(M, ModNameStrConst->getType(), true,
GlobalValue::PrivateLinkage, ModNameStrConst);
- Function *CtorFunc;
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, kSanCovModuleCtorName, kSanCovModuleInitName,
- {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy},
- {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
- ConstantInt::get(IntptrTy, N),
- Options.Use8bitCounters
- ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)
- : Constant::getNullValue(Int8PtrTy),
- IRB.CreatePointerCast(ModuleName, Int8PtrTy)});
+ if (!Options.TracePC) {
+ Function *CtorFunc;
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, SanCovModuleCtorName, SanCovModuleInitName,
+ {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy},
+ {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
+ ConstantInt::get(IntptrTy, N),
+ Options.Use8bitCounters
+ ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)
+ : Constant::getNullValue(Int8PtrTy),
+ IRB.CreatePointerCast(ModuleName, Int8PtrTy)});
+
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+
+ return true;
+}
+
+// True if block has successors and it dominates all of them.
+static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
+ if (succ_begin(BB) == succ_end(BB))
+ return false;
+
+ for (const BasicBlock *SUCC : make_range(succ_begin(BB), succ_end(BB))) {
+ if (!DT->dominates(BB, SUCC))
+ return false;
+ }
+
+ return true;
+}
- appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority);
+// True if block has predecessors and it postdominates all of them.
+static bool isFullPostDominator(const BasicBlock *BB,
+ const PostDominatorTree *PDT) {
+ if (pred_begin(BB) == pred_end(BB))
+ return false;
+
+ for (const BasicBlock *PRED : make_range(pred_begin(BB), pred_end(BB))) {
+ if (!PDT->dominates(BB, PRED))
+ return false;
+ }
return true;
}
+static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
+                                  const DominatorTree *DT,
+                                  const PostDominatorTree *PDT) {
+ if (!ClPruneBlocks || &F.getEntryBlock() == BB)
+ return true;
+
+ return !(isFullDominator(BB, DT) || isFullPostDominator(BB, PDT));
+}
+
bool SanitizerCoverageModule::runOnFunction(Function &F) {
- if (F.empty()) return false;
+ if (F.empty())
+ return false;
if (F.getName().find(".module_ctor") != std::string::npos)
- return false; // Should not instrument sanitizer init functions.
+ return false; // Should not instrument sanitizer init functions.
// Don't instrument functions using SEH for now. Splitting basic blocks like
// we do for coverage breaks WinEHPrepare.
// FIXME: Remove this when SEH no longer uses landingpad pattern matching.
@@ -299,12 +363,19 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
return false;
if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
SplitAllCriticalEdges(F);
- SmallVector<Instruction*, 8> IndirCalls;
- SmallVector<BasicBlock*, 16> AllBlocks;
- SmallVector<Instruction*, 8> CmpTraceTargets;
- SmallVector<Instruction*, 8> SwitchTraceTargets;
+ SmallVector<Instruction *, 8> IndirCalls;
+ SmallVector<BasicBlock *, 16> BlocksToInstrument;
+ SmallVector<Instruction *, 8> CmpTraceTargets;
+ SmallVector<Instruction *, 8> SwitchTraceTargets;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ const PostDominatorTree *PDT =
+ &getAnalysis<PostDominatorTreeWrapperPass>(F).getPostDomTree();
+
for (auto &BB : F) {
- AllBlocks.push_back(&BB);
+ if (shouldInstrumentBlock(F, &BB, DT, PDT))
+ BlocksToInstrument.push_back(&BB);
for (auto &Inst : BB) {
if (Options.IndirectCalls) {
CallSite CS(&Inst);
@@ -319,7 +390,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
}
}
}
- InjectCoverage(F, AllBlocks);
+
+ InjectCoverage(F, BlocksToInstrument);
InjectCoverageForIndirectCalls(F, IndirCalls);
InjectTraceForCmp(F, CmpTraceTargets);
InjectTraceForSwitch(F, SwitchTraceTargets);
@@ -346,28 +418,34 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F,
// On every indirect call we call a run-time function
// __sanitizer_cov_indir_call* with two parameters:
// - callee address,
-// - global cache array that contains kCacheSize pointers (zero-initialized).
+// - global cache array that contains CacheSize pointers (zero-initialized).
// The cache is used to speed up recording the caller-callee pairs.
// The address of the caller is passed implicitly via caller PC.
-// kCacheSize is encoded in the name of the run-time function.
+// CacheSize is encoded in the name of the run-time function.
void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
Function &F, ArrayRef<Instruction *> IndirCalls) {
- if (IndirCalls.empty()) return;
- const int kCacheSize = 16;
- const int kCacheAlignment = 64; // Align for better performance.
- Type *Ty = ArrayType::get(IntptrTy, kCacheSize);
+ if (IndirCalls.empty())
+ return;
+ const int CacheSize = 16;
+ const int CacheAlignment = 64; // Align for better performance.
+ Type *Ty = ArrayType::get(IntptrTy, CacheSize);
for (auto I : IndirCalls) {
IRBuilder<> IRB(I);
CallSite CS(I);
Value *Callee = CS.getCalledValue();
- if (isa<InlineAsm>(Callee)) continue;
+ if (isa<InlineAsm>(Callee))
+ continue;
GlobalVariable *CalleeCache = new GlobalVariable(
*F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(Ty), "__sancov_gen_callee_cache");
- CalleeCache->setAlignment(kCacheAlignment);
- IRB.CreateCall(SanCovIndirCallFunction,
- {IRB.CreatePointerCast(Callee, IntptrTy),
- IRB.CreatePointerCast(CalleeCache, IntptrTy)});
+ CalleeCache->setAlignment(CacheAlignment);
+ if (Options.TracePC)
+ IRB.CreateCall(SanCovTracePCIndir,
+ IRB.CreatePointerCast(Callee, IntptrTy));
+ else
+ IRB.CreateCall(SanCovIndirCallFunction,
+ {IRB.CreatePointerCast(Callee, IntptrTy),
+ IRB.CreatePointerCast(CalleeCache, IntptrTy)});
}
}
@@ -376,7 +454,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
void SanitizerCoverageModule::InjectTraceForSwitch(
- Function &F, ArrayRef<Instruction *> SwitchTraceTargets) {
+ Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
for (auto I : SwitchTraceTargets) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
IRBuilder<> IRB(I);
@@ -391,7 +469,7 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
if (Cond->getType()->getScalarSizeInBits() <
Int64Ty->getScalarSizeInBits())
Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
- for (auto It: SI->cases()) {
+ for (auto It : SI->cases()) {
Constant *C = It.getCaseValue();
if (C->getType()->getScalarSizeInBits() <
Int64Ty->getScalarSizeInBits())
@@ -409,15 +487,15 @@ void SanitizerCoverageModule::InjectTraceForSwitch(
}
}
-
void SanitizerCoverageModule::InjectTraceForCmp(
- Function &F, ArrayRef<Instruction *> CmpTraceTargets) {
+ Function &, ArrayRef<Instruction *> CmpTraceTargets) {
for (auto I : CmpTraceTargets) {
if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
IRBuilder<> IRB(ICMP);
Value *A0 = ICMP->getOperand(0);
Value *A1 = ICMP->getOperand(1);
- if (!A0->getType()->isIntegerTy()) continue;
+ if (!A0->getType()->isIntegerTy())
+ continue;
uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
// __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1);
IRB.CreateCall(
@@ -430,8 +508,8 @@ void SanitizerCoverageModule::InjectTraceForCmp(
}
void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) {
- I->setMetadata(
- I->getModule()->getMDKindID("nosanitize"), MDNode::get(*C, None));
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
}
void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
@@ -448,7 +526,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
bool IsEntryBB = &BB == &F.getEntryBlock();
DebugLoc EntryLoc;
if (IsEntryBB) {
- if (auto SP = getDISubprogram(&F))
+ if (auto SP = F.getSubprogram())
EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP);
// Keep static allocas and llvm.localescape calls in the entry block. Even
// if we aren't splitting the block, it's nice for allocas to be before
@@ -465,16 +543,20 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
- if (Options.TraceBB) {
+ if (Options.TracePC) {
+ IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
+ IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ } else if (Options.TraceBB) {
IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
} else if (UseCalls) {
IRB.CreateCall(SanCovWithCheckFunction, GuardP);
} else {
LoadInst *Load = IRB.CreateLoad(GuardP);
- Load->setAtomic(Monotonic);
+ Load->setAtomic(AtomicOrdering::Monotonic);
Load->setAlignment(4);
SetNoSanitizeMetadata(Load);
- Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
+ Value *Cmp =
+ IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
Instruction *Ins = SplitBlockAndInsertIfThen(
Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
IRB.SetInsertPoint(Ins);
@@ -499,9 +581,16 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
}
char SanitizerCoverageModule::ID = 0;
-INITIALIZE_PASS(SanitizerCoverageModule, "sancov",
- "SanitizerCoverage: TODO."
- "ModulePass", false, false)
+INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov",
+ "SanitizerCoverage: TODO."
+ "ModulePass",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SanitizerCoverageModule, "sancov",
+ "SanitizerCoverage: TODO."
+ "ModulePass",
+ false, false)
ModulePass *llvm::createSanitizerCoverageModulePass(
const SanitizerCoverageOptions &Options) {
return new SanitizerCoverageModule(Options);
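
shouldInstrumentBlock() above prunes blocks whose coverage would be redundant: roughly speaking, a block that dominates all of its successors, or postdominates all of its predecessors, has its execution implied by its neighbours' coverage, so instrumenting it adds little extra signal. The sketch below (not LLVM code; the dominance facts for the tiny diamond CFG are hard-coded rather than computed) shows the predicate applied to a simple if/else diamond:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Toy model of the pruning rule: keep the entry block, drop any block that has
// successors and dominates all of them, or has predecessors and postdominates
// all of them.
struct Block {
  std::string Name;
  bool HasSuccs, HasPreds;
  bool DominatesAllSuccs, PostDominatesAllPreds;
};

bool shouldInstrument(const Block &B, bool IsEntry) {
  if (IsEntry)
    return true;
  bool FullDom = B.HasSuccs && B.DominatesAllSuccs;
  bool FullPostDom = B.HasPreds && B.PostDominatesAllPreds;
  return !(FullDom || FullPostDom);
}

int main() {
  //    entry
  //    /   \
  //  then  else
  //    \   /
  //    merge
  std::vector<Block> CFG = {
      {"entry", true, false, true, false},  // kept only because it is the entry
      {"then", true, true, false, false},   // instrumented
      {"else", true, true, false, false},   // instrumented
      {"merge", false, true, false, true},  // pruned: postdominates its preds
  };
  for (std::size_t I = 0; I < CFG.size(); ++I)
    std::cout << CFG[I].Name << ": "
              << (shouldInstrument(CFG[I], I == 0) ? "instrument" : "skip")
              << "\n";
  return 0;
}
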
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 9331e1d2b3fd..dcb62d3ed1b5 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -36,11 +37,13 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -81,6 +84,7 @@ namespace {
struct ThreadSanitizer : public FunctionPass {
ThreadSanitizer() : FunctionPass(ID) {}
const char *getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnFunction(Function &F) override;
bool doInitialization(Module &M) override;
static char ID; // Pass identification, replacement for typeid.
@@ -121,7 +125,13 @@ struct ThreadSanitizer : public FunctionPass {
} // namespace
char ThreadSanitizer::ID = 0;
-INITIALIZE_PASS(ThreadSanitizer, "tsan",
+INITIALIZE_PASS_BEGIN(
+ ThreadSanitizer, "tsan",
+ "ThreadSanitizer: detects data races.",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ ThreadSanitizer, "tsan",
"ThreadSanitizer: detects data races.",
false, false)
@@ -129,6 +139,10 @@ const char *ThreadSanitizer::getPassName() const {
return "ThreadSanitizer";
}
+void ThreadSanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
FunctionPass *llvm::createThreadSanitizerPass() {
return new ThreadSanitizer();
}
@@ -243,6 +257,37 @@ static bool isVtableAccess(Instruction *I) {
return false;
}
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation. The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(Value *Addr) {
+ // Peel off GEPs and BitCasts.
+ Addr = Addr->stripInBoundsOffsets();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->hasSection()) {
+ StringRef SectionName = GV->getSection();
+ // Check if the global is in the PGO counters section.
+ if (SectionName.endswith(getInstrProfCountersSectionName(
+ /*AddSegment=*/false)))
+ return false;
+ }
+
+ // Check if the global is in a GCOV counter array.
+ if (GV->getName().startswith("__llvm_gcov_ctr"))
+ return false;
+ }
+
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ if (Addr) {
+ Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return false;
+ }
+
+ return true;
+}
+
bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
// If this is a GEP, just analyze its pointer operand.
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
@@ -281,14 +326,17 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
const DataLayout &DL) {
SmallSet<Value*, 8> WriteTargets;
// Iterate from the end.
- for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(),
- E = Local.rend(); It != E; ++It) {
- Instruction *I = *It;
+ for (Instruction *I : reverse(Local)) {
if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
- WriteTargets.insert(Store->getPointerOperand());
+ Value *Addr = Store->getPointerOperand();
+ if (!shouldInstrumentReadWriteFromAddress(Addr))
+ continue;
+ WriteTargets.insert(Addr);
} else {
LoadInst *Load = cast<LoadInst>(I);
Value *Addr = Load->getPointerOperand();
+ if (!shouldInstrumentReadWriteFromAddress(Addr))
+ continue;
if (WriteTargets.count(Addr)) {
// We will write to this temp, so no reason to analyze the read.
NumOmittedReadsBeforeWrite++;
@@ -344,6 +392,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
bool HasCalls = false;
bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// Traverse all instructions, collect loads/stores/returns, check for calls.
for (auto &BB : F) {
@@ -355,6 +405,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
else if (isa<ReturnInst>(Inst))
RetVec.push_back(&Inst);
else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
if (isa<MemIntrinsic>(Inst))
MemIntrinCalls.push_back(&Inst);
HasCalls = true;
@@ -456,14 +508,16 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
uint32_t v = 0;
switch (ord) {
- case NotAtomic: llvm_unreachable("unexpected atomic ordering!");
- case Unordered: // Fall-through.
- case Monotonic: v = 0; break;
- // case Consume: v = 1; break; // Not specified yet.
- case Acquire: v = 2; break;
- case Release: v = 3; break;
- case AcquireRelease: v = 4; break;
- case SequentiallyConsistent: v = 5; break;
+ case AtomicOrdering::NotAtomic:
+ llvm_unreachable("unexpected atomic ordering!");
+ case AtomicOrdering::Unordered: // Fall-through.
+ case AtomicOrdering::Monotonic: v = 0; break;
+ // Not specified yet:
+ // case AtomicOrdering::Consume: v = 1; break;
+ case AtomicOrdering::Acquire: v = 2; break;
+ case AtomicOrdering::Release: v = 3; break;
+ case AtomicOrdering::AcquireRelease: v = 4; break;
+ case AtomicOrdering::SequentiallyConsistent: v = 5; break;
}
return IRB->getInt32(v);
}
@@ -496,6 +550,11 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
return false;
}
+static Value *createIntOrPtrToIntCast(Value *V, Type* Ty, IRBuilder<> &IRB) {
+ return isa<PointerType>(V->getType()) ?
+ IRB.CreatePtrToInt(V, Ty) : IRB.CreateIntCast(V, Ty, false);
+}
+
// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
// standards. For background see C++11 standard. A slightly older, publicly
// available draft of the standard (not entirely up-to-date, but close enough
@@ -517,9 +576,16 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
Type *PtrTy = Ty->getPointerTo();
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
createOrdering(&IRB, LI->getOrdering())};
- CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], Args);
- ReplaceInstWithInst(I, C);
-
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ if (Ty == OrigTy) {
+ Instruction *C = CallInst::Create(TsanAtomicLoad[Idx], Args);
+ ReplaceInstWithInst(I, C);
+ } else {
+ // We are loading a pointer, so we need to cast the return value.
+ Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
+ Instruction *Cast = CastInst::Create(Instruction::IntToPtr, C, OrigTy);
+ ReplaceInstWithInst(I, Cast);
+ }
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
Value *Addr = SI->getPointerOperand();
int Idx = getMemoryAccessFuncIndex(Addr, DL);
@@ -530,7 +596,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateIntCast(SI->getValueOperand(), Ty, false),
+ createIntOrPtrToIntCast(SI->getValueOperand(), Ty, IRB),
createOrdering(&IRB, SI->getOrdering())};
CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
ReplaceInstWithInst(I, C);
@@ -560,15 +626,26 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
const unsigned BitSize = ByteSize * 8;
Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
+ Value *CmpOperand =
+ createIntOrPtrToIntCast(CASI->getCompareOperand(), Ty, IRB);
+ Value *NewOperand =
+ createIntOrPtrToIntCast(CASI->getNewValOperand(), Ty, IRB);
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
- IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+ CmpOperand,
+ NewOperand,
createOrdering(&IRB, CASI->getSuccessOrdering()),
createOrdering(&IRB, CASI->getFailureOrdering())};
CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
- Value *Success = IRB.CreateICmpEQ(C, CASI->getCompareOperand());
+ Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
+ Value *OldVal = C;
+ Type *OrigOldValTy = CASI->getNewValOperand()->getType();
+ if (Ty != OrigOldValTy) {
+ // The value is a pointer, so we need to cast the return value.
+ OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
+ }
- Value *Res = IRB.CreateInsertValue(UndefValue::get(CASI->getType()), C, 0);
+ Value *Res =
+ IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
Res = IRB.CreateInsertValue(Res, Success, 1);
I->replaceAllUsesWith(Res);
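
The createOrdering() switch rewritten above maps LLVM's scoped AtomicOrdering enumerators onto the small integers the __tsan_atomic* runtime expects. A stand-alone restatement of that table (not the LLVM API; the enum here is local to the example):

#include <cassert>

enum class Ordering {
  NotAtomic, Unordered, Monotonic, Consume, Acquire,
  Release, AcquireRelease, SequentiallyConsistent
};

// Same mapping as createOrdering(): Unordered/Monotonic -> 0, Acquire -> 2,
// Release -> 3, AcquireRelease -> 4, SequentiallyConsistent -> 5. Consume (1)
// is not emitted yet, and NotAtomic never reaches the instrumentation.
int toTsanMemoryOrder(Ordering O) {
  switch (O) {
  case Ordering::Unordered:
  case Ordering::Monotonic:              return 0;
  case Ordering::Acquire:                return 2;
  case Ordering::Release:                return 3;
  case Ordering::AcquireRelease:         return 4;
  case Ordering::SequentiallyConsistent: return 5;
  case Ordering::NotAtomic:
  case Ordering::Consume:                return -1; // not expected here
  }
  return -1;
}

int main() {
  assert(toTsanMemoryOrder(Ordering::Acquire) == 2);
  assert(toTsanMemoryOrder(Ordering::SequentiallyConsistent) == 5);
  return 0;
}
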
diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile
deleted file mode 100644
index c390517d07cd..000000000000
--- a/lib/Transforms/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- lib/Transforms/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello ObjCARC
-
-include $(LEVEL)/Makefile.config
-
-# No support for plugins on windows targets
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW Minix))
- PARALLEL_DIRS := $(filter-out Hello, $(PARALLEL_DIRS))
-endif
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Transforms/ObjCARC/BlotMapVector.h b/lib/Transforms/ObjCARC/BlotMapVector.h
index d6439b698418..ef075bdccbfe 100644
--- a/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -31,7 +31,7 @@ public:
const_iterator begin() const { return Vector.begin(); }
const_iterator end() const { return Vector.end(); }
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
~BlotMapVector() {
assert(Vector.size() >= Map.size()); // May differ due to blotting.
for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 8e042d47ee6e..8cc1232b18ca 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -24,6 +24,7 @@
#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
namespace llvm {
class BasicBlock;
diff --git a/lib/Transforms/ObjCARC/Makefile b/lib/Transforms/ObjCARC/Makefile
deleted file mode 100644
index 2a34e21714f1..000000000000
--- a/lib/Transforms/ObjCARC/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/ObjCARC/Makefile ---------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMObjCARCOpts
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp
index d860723bb460..688dd12c408a 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -17,7 +17,6 @@
#include "llvm-c/Core.h"
#include "llvm-c/Initialization.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
namespace llvm {
class PassRegistry;
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index 5fd45b00af17..f02b75f0b456 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -24,7 +24,6 @@
#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 969e77c1f888..b2c62a0e8eeb 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -70,7 +70,7 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
/// possibly produce autoreleases.
bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
if (const Function *Callee = CS.getCalledFunction()) {
- if (Callee->isDeclaration() || Callee->mayBeOverridden())
+ if (!Callee->hasExactDefinition())
return true;
for (const BasicBlock &BB : *Callee) {
for (const Instruction &I : BB)
@@ -132,6 +132,9 @@ bool ObjCARCAPElim::runOnModule(Module &M) {
if (!ModuleHasARC(M))
return false;
+ if (skipModule(M))
+ return false;
+
// Find the llvm.global_ctors variable, as the first step in
// identifying the global constructors. In theory, unnecessary autorelease
// pools could occur anywhere, but in practice it's pretty rare. Global
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 1cdf5689f42a..11e2d03e17d9 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -66,7 +66,7 @@ namespace {
/// The inline asm string to insert between calls and RetainRV calls to make
/// the optimization work on targets which need it.
- const MDString *RetainRVMarker;
+ const MDString *RVInstMarker;
/// The set of inserted objc_storeStrong calls. If at the end of walking the
/// function we have found no alloca instructions, these calls can be marked
@@ -201,6 +201,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
// Get the location associated with Load.
MemoryLocation Loc = MemoryLocation::get(Load);
+ auto *LocPtr = Loc.Ptr->stripPointerCasts();
// Walk down to find the store and the release, which may be in either order.
for (auto I = std::next(BasicBlock::iterator(Load)),
@@ -261,7 +262,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
// Then make sure that the pointer we are storing to is Ptr. If so, we
// found our Store!
- if (Store->getPointerOperand() == Loc.Ptr)
+ if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
continue;
// Otherwise, we have an unknown store to some other ptr that clobbers
@@ -423,20 +424,20 @@ bool ObjCARCContract::tryToPeepholeInstruction(
return false;
// If we succeed in our optimization, fall through.
// FALLTHROUGH
- case ARCInstKind::RetainRV: {
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV: {
// If we're compiling for a target which needs a special inline-asm
- // marker to do the retainAutoreleasedReturnValue optimization,
- // insert it now.
- if (!RetainRVMarker)
+ // marker to do the return value optimization, insert it now.
+ if (!RVInstMarker)
return false;
BasicBlock::iterator BBI = Inst->getIterator();
BasicBlock *InstParent = Inst->getParent();
- // Step up to see if the call immediately precedes the RetainRV call.
+ // Step up to see if the call immediately precedes the RV call.
// If it's an invoke, we have to cross a block boundary. And we have
// to carefully dodge no-op instructions.
do {
- if (&*BBI == InstParent->begin()) {
+ if (BBI == InstParent->begin()) {
BasicBlock *Pred = InstParent->getSinglePredecessor();
if (!Pred)
goto decline_rv_optimization;
@@ -447,14 +448,14 @@ bool ObjCARCContract::tryToPeepholeInstruction(
} while (IsNoopInstruction(&*BBI));
if (&*BBI == GetArgRCIdentityRoot(Inst)) {
- DEBUG(dbgs() << "Adding inline asm marker for "
- "retainAutoreleasedReturnValue optimization.\n");
+ DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
Changed = true;
- InlineAsm *IA =
- InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
- /*isVarArg=*/false),
- RetainRVMarker->getString(),
- /*Constraints=*/"", /*hasSideEffects=*/true);
+ InlineAsm *IA = InlineAsm::get(
+ FunctionType::get(Type::getVoidTy(Inst->getContext()),
+ /*isVarArg=*/false),
+ RVInstMarker->getString(),
+ /*Constraints=*/"", /*hasSideEffects=*/true);
CallInst::Create(IA, "", Inst);
}
decline_rv_optimization:
@@ -605,7 +606,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
cast<GEPOperator>(Arg)->hasAllZeroIndices())
Arg = cast<GEPOperator>(Arg)->getPointerOperand();
else if (isa<GlobalAlias>(Arg) &&
- !cast<GlobalAlias>(Arg)->mayBeOverridden())
+ !cast<GlobalAlias>(Arg)->isInterposable())
Arg = cast<GlobalAlias>(Arg)->getAliasee();
else
break;
@@ -650,15 +651,15 @@ bool ObjCARCContract::doInitialization(Module &M) {
EP.init(&M);
- // Initialize RetainRVMarker.
- RetainRVMarker = nullptr;
+ // Initialize RVInstMarker.
+ RVInstMarker = nullptr;
if (NamedMDNode *NMD =
M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
if (NMD->getNumOperands() == 1) {
const MDNode *N = NMD->getOperand(0);
if (N->getNumOperands() == 1)
if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
- RetainRVMarker = S;
+ RVInstMarker = S;
}
return false;
diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index 53c19c39f97f..bb6a0a0e73db 100644
--- a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -24,7 +24,6 @@
//===----------------------------------------------------------------------===//
#include "ObjCARC.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index f0ee6e2be487..a6907b56cf45 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -889,6 +889,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
Inst->getParent(), Inst,
DependingInstructions, Visited, PA);
break;
+ case ARCInstKind::ClaimRV:
case ARCInstKind::RetainRV:
case ARCInstKind::AutoreleaseRV:
// Don't move these; the RV optimization depends on the autoreleaseRV
@@ -1459,17 +1460,13 @@ bool ObjCARCOpt::Visit(Function &F,
// Use reverse-postorder on the reverse CFG for bottom-up.
bool BottomUpNestingDetected = false;
- for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
- ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
- I != E; ++I)
- BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
+ for (BasicBlock *BB : reverse(ReverseCFGPostOrder))
+ BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
// Use reverse-postorder for top-down.
bool TopDownNestingDetected = false;
- for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
- PostOrder.rbegin(), E = PostOrder.rend();
- I != E; ++I)
- TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
+ for (BasicBlock *BB : reverse(PostOrder))
+ TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
return TopDownNestingDetected && BottomUpNestingDetected;
}
@@ -1554,9 +1551,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
unsigned NewCount = 0;
bool FirstRelease = true;
for (;;) {
- for (SmallVectorImpl<Instruction *>::const_iterator
- NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
- Instruction *NewRetain = *NI;
+ for (Instruction *NewRetain : NewRetains) {
auto It = Retains.find(NewRetain);
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
@@ -1630,9 +1625,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
if (NewReleases.empty()) break;
// Back the other way.
- for (SmallVectorImpl<Instruction *>::const_iterator
- NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
- Instruction *NewRelease = *NI;
+ for (Instruction *NewRelease : NewReleases) {
auto It = Releases.find(NewRelease);
assert(It != Releases.end());
const RRInfo &NewReleaseRRI = It->second;
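
The two rewritten loops above lean on a standard graph fact: visiting a stored post-order sequence in reverse yields a reverse post-order, so every block is seen before its CFG successors (and, when the post-order was computed on the reverse CFG, after them, giving the bottom-up walk). A minimal standalone C++ sketch of that fact on a made-up diamond CFG -- the block names and the toy graph are illustrative and nothing here uses LLVM:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using CFG = std::map<std::string, std::vector<std::string>>;

// Classic DFS post-order: a block is emitted only after all of its successors.
static void postOrder(const CFG &G, const std::string &BB,
                      std::set<std::string> &Seen,
                      std::vector<std::string> &Out) {
  if (!Seen.insert(BB).second)
    return;
  for (const std::string &Succ : G.at(BB))
    postOrder(G, Succ, Seen, Out);
  Out.push_back(BB);
}

int main() {
  // entry -> {then, else} -> exit
  CFG G = {{"entry", {"then", "else"}},
           {"then", {"exit"}},
           {"else", {"exit"}},
           {"exit", {}}};
  std::set<std::string> Seen;
  std::vector<std::string> PO;
  postOrder(G, "entry", Seen, PO);

  std::cout << "post-order:";
  for (const std::string &BB : PO)
    std::cout << ' ' << BB;                 // exit then else entry
  std::cout << "\nreverse post-order:";
  for (auto It = PO.rbegin(); It != PO.rend(); ++It)
    std::cout << ' ' << *It;                // entry else then exit
  std::cout << '\n';
  return 0;
}
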
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 590a52da6b19..0eed0240c741 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -22,10 +22,12 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -33,22 +35,70 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
+static void collectLiveScopes(const DILocalScope &LS,
+ SmallPtrSetImpl<const Metadata *> &AliveScopes) {
+ if (!AliveScopes.insert(&LS).second)
+ return;
+
+ if (isa<DISubprogram>(LS))
+ return;
+
+ // Tail-recurse through the scope chain.
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()), AliveScopes);
+}
+
+static void collectLiveScopes(const DILocation &DL,
+ SmallPtrSetImpl<const Metadata *> &AliveScopes) {
+ // Even though DILocations are not scopes, shove them into AliveScopes so we
+ // don't revisit them.
+ if (!AliveScopes.insert(&DL).second)
+ return;
+
+ // Collect live scopes from the scope chain.
+ collectLiveScopes(*DL.getScope(), AliveScopes);
+
+ // Tail-recurse through the inlined-at chain.
+ if (const DILocation *IA = DL.getInlinedAt())
+ collectLiveScopes(*IA, AliveScopes);
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+static bool isInstrumentsConstant(Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
static bool aggressiveDCE(Function& F) {
- SmallPtrSet<Instruction*, 128> Alive;
+ SmallPtrSet<Instruction*, 32> Alive;
SmallVector<Instruction*, 128> Worklist;
// Collect the set of "root" instructions that are known live.
for (Instruction &I : instructions(F)) {
- if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
- I.mayHaveSideEffects()) {
+ if (isa<TerminatorInst>(I) || I.isEHPad() || I.mayHaveSideEffects()) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ continue;
Alive.insert(&I);
Worklist.push_back(&I);
}
}
- // Propagate liveness backwards to operands.
+ // Propagate liveness backwards to operands. Keep track of live debug info
+ // scopes.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
while (!Worklist.empty()) {
Instruction *Curr = Worklist.pop_back_val();
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = Curr->getDebugLoc())
+ collectLiveScopes(*DL, AliveScopes);
+
for (Use &OI : Curr->operands()) {
if (Instruction *Inst = dyn_cast<Instruction>(OI))
if (Alive.insert(Inst).second)
@@ -61,10 +111,30 @@ static bool aggressiveDCE(Function& F) {
// value of the function, and may therefore be deleted safely.
// NOTE: We reuse the Worklist vector here for memory efficiency.
for (Instruction &I : instructions(F)) {
- if (!Alive.count(&I)) {
- Worklist.push_back(&I);
- I.dropAllReferences();
+ // Check if the instruction is alive.
+ if (Alive.count(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // Fallthrough and drop the intrinsic.
+ DEBUG({
+ // If intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (Alive.count(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ });
}
+
+ // Prepare to delete.
+ Worklist.push_back(&I);
+ I.dropAllReferences();
}
for (Instruction *&I : Worklist) {
@@ -75,10 +145,14 @@ static bool aggressiveDCE(Function& F) {
return !Worklist.empty();
}
-PreservedAnalyses ADCEPass::run(Function &F) {
- if (aggressiveDCE(F))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &) {
+ if (!aggressiveDCE(F))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
namespace {
@@ -89,7 +163,7 @@ struct ADCELegacyPass : public FunctionPass {
}
bool runOnFunction(Function& F) override {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
return aggressiveDCE(F);
}
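
The aggressiveDCE routine above is a classic mark-and-sweep over SSA: seed the worklist with instructions that are live for reasons other than their value (terminators, EH pads, side effects), walk operands backwards to mark everything they feed on, and drop the rest. A minimal standalone C++ sketch of that scheme on a toy four-instruction "function" -- the Instr struct and the instruction strings are made up, and the debug-scope and value-profiling handling of the real pass is omitted:

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Instr {
  std::string Name;
  bool IsRoot = false;            // terminator / EH pad / side effects
  std::vector<Instr *> Operands;  // values this instruction uses
};

int main() {
  Instr A{"a = 1 + 2"};
  Instr B{"b = a * 3", false, {&A}};
  Instr C{"c = a << 1 (no users)", false, {&A}};
  Instr R{"ret b", /*IsRoot=*/true, {&B}};
  std::vector<Instr *> Func = {&A, &B, &C, &R};

  std::set<Instr *> Alive;
  std::vector<Instr *> Worklist;
  for (Instr *I : Func)           // 1. collect the root instructions
    if (I->IsRoot) {
      Alive.insert(I);
      Worklist.push_back(I);
    }

  while (!Worklist.empty()) {     // 2. propagate liveness to operands
    Instr *Cur = Worklist.back();
    Worklist.pop_back();
    for (Instr *Op : Cur->Operands)
      if (Alive.insert(Op).second)
        Worklist.push_back(Op);
  }

  for (Instr *I : Func)           // 3. everything unmarked is dead
    if (!Alive.count(I))
      std::cout << "dead: " << I->Name << '\n';  // reports only C
  return 0;
}
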
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 4b721d38adba..7f8b8ce91e79 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -18,6 +18,7 @@
#define AA_NAME "alignment-from-assumptions"
#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -25,13 +26,11 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
@@ -67,18 +66,7 @@ struct AlignmentFromAssumptions : public FunctionPass {
AU.addPreserved<ScalarEvolutionWrapperPass>();
}
- // For memory transfers, we need a common alignment for both the source and
- // destination. If we have a new alignment for only one operand of a transfer
- // instruction, save it in these maps. If we reach the other operand through
- // another assumption later, then we may change the alignment at that point.
- DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
-
- ScalarEvolution *SE;
- DominatorTree *DT;
-
- bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
- const SCEV *&OffSCEV);
- bool processAssumption(CallInst *I);
+ AlignmentFromAssumptionsPass Impl;
};
}
@@ -209,9 +197,10 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
return 0;
}
-bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
- Value *&AAPtr, const SCEV *&AlignSCEV,
- const SCEV *&OffSCEV) {
+bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+ Value *&AAPtr,
+ const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
// An alignment assume must be a statement about the least-significant
// bits of the pointer being zero, possibly with some offset.
ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
@@ -302,7 +291,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
return true;
}
-bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
+bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
Value *AAPtr;
const SCEV *AlignSCEV, *OffSCEV;
if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
@@ -411,14 +400,26 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
}
bool AlignmentFromAssumptions::runOnFunction(Function &F) {
- bool Changed = false;
+ if (skipFunction(F))
+ return false;
+
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return Impl.runImpl(F, AC, SE, DT);
+}
+
+bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
+ ScalarEvolution *SE_,
+ DominatorTree *DT_) {
+ SE = SE_;
+ DT = DT_;
NewDestAlignments.clear();
NewSrcAlignments.clear();
+ bool Changed = false;
for (auto &AssumeVH : AC.assumptions())
if (AssumeVH)
Changed |= processAssumption(cast<CallInst>(AssumeVH));
@@ -426,3 +427,20 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) {
return Changed;
}
+PreservedAnalyses
+AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ bool Changed = runImpl(F, AC, &SE, &DT);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<AAManager>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
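
The extractAlignmentInfo comment above ("a statement about the least-significant bits of the pointer being zero, possibly with some offset") boils down to a simple predicate on the pointer's integer value. A standalone C++ sketch of that predicate, with made-up buffer, alignment, and offset values -- this only shows the arithmetic the pass reasons about, not the llvm.assume pattern matching itself:

#include <cassert>
#include <cstdint>

// True iff P is Align-aligned (Align a power of two) after subtracting Offset,
// i.e. the low log2(Align) bits of (P - Offset) are zero.
static bool isAlignedWithOffset(const void *P, uint64_t Align, uint64_t Offset) {
  return ((reinterpret_cast<uintptr_t>(P) - Offset) & (Align - 1)) == 0;
}

int main() {
  alignas(32) static char Buf[64];
  assert(isAlignedWithOffset(Buf, 32, 0));      // base is 32-byte aligned
  assert(isAlignedWithOffset(Buf + 8, 32, 8));  // same fact, stated with offset 8
  assert(!isAlignedWithOffset(Buf + 8, 32, 0)); // not 32-byte aligned by itself
  return 0;
}
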
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index cb9b8b6fffc8..4f6225f4c7b0 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -14,11 +14,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/BDCE.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -27,6 +27,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "bdce"
@@ -34,35 +35,7 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed (unused)");
STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
-namespace {
-struct BDCE : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BDCE() : FunctionPass(ID) {
- initializeBDCEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function& F) override;
-
- void getAnalysisUsage(AnalysisUsage& AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DemandedBits>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-}
-
-char BDCE::ID = 0;
-INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DemandedBits)
-INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
- false, false)
-
-bool BDCE::runOnFunction(Function& F) {
- if (skipOptnoneFunction(F))
- return false;
- DemandedBits &DB = getAnalysis<DemandedBits>();
-
+static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
SmallVector<Instruction*, 128> Worklist;
bool Changed = false;
for (Instruction &I : instructions(F)) {
@@ -96,7 +69,44 @@ bool BDCE::runOnFunction(Function& F) {
return Changed;
}
-FunctionPass *llvm::createBitTrackingDCEPass() {
- return new BDCE();
+PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ if (!bitTrackingDCE(F, DB))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
+namespace {
+struct BDCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCELegacyPass() : FunctionPass(ID) {
+ initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ return bitTrackingDCE(F, DB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
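
Bit-Tracking DCE removes instructions whose demanded bits are empty: if a consumer only ever looks at the low bits of a value, producers that only affect the high bits are dead. A standalone C++ check of the underlying arithmetic fact for addition (carries only propagate upward), over arbitrary made-up values -- the real pass gets this information from the DemandedBits analysis rather than recomputing it:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t B = 0; B < 1000; B += 37) {
    for (uint32_t C = 0; C < 1000; C += 53) {
      uint32_t Full  = (B + C) & 0xFF;                    // only the low byte is demanded
      uint32_t Trunc = ((B & 0xFF) + (C & 0xFF)) & 0xFF;  // high bits of B and C ignored
      assert(Full == Trunc);  // carries never flow from high bits into low bits
    }
  }
  return 0;
}
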
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index a0ddbd085206..9f04344b8b0a 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -10,13 +10,16 @@ add_llvm_library(LLVMScalarOpts
EarlyCSE.cpp
FlattenCFGPass.cpp
Float2Int.cpp
+ GuardWidening.cpp
GVN.cpp
+ GVNHoist.cpp
InductiveRangeCheckElimination.cpp
IndVarSimplify.cpp
JumpThreading.cpp
LICM.cpp
LoadCombine.cpp
LoopDeletion.cpp
+ LoopDataPrefetch.cpp
LoopDistribute.cpp
LoopIdiomRecognize.cpp
LoopInstSimplify.cpp
@@ -24,11 +27,14 @@ add_llvm_library(LLVMScalarOpts
LoopLoadElimination.cpp
LoopRerollPass.cpp
LoopRotation.cpp
+ LoopSimplifyCFG.cpp
LoopStrengthReduce.cpp
LoopUnrollPass.cpp
LoopUnswitch.cpp
+ LoopVersioningLICM.cpp
LowerAtomic.cpp
LowerExpectIntrinsic.cpp
+ LowerGuardIntrinsic.cpp
MemCpyOptimizer.cpp
MergedLoadStoreMotion.cpp
NaryReassociate.cpp
@@ -40,7 +46,6 @@ add_llvm_library(LLVMScalarOpts
SCCP.cpp
SROA.cpp
Scalar.cpp
- ScalarReplAggregates.cpp
Scalarizer.cpp
SeparateConstOffsetFromGEP.cpp
SimplifyCFGPass.cpp
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 84f7f5fff5b5..913e939c2bd4 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -33,20 +33,20 @@
// %0 = load i64* inttoptr (i64 big_constant to i64*)
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include <tuple>
using namespace llvm;
+using namespace consthoist;
#define DEBUG_TYPE "consthoist"
@@ -54,75 +54,12 @@ STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
STATISTIC(NumConstantsRebased, "Number of constants rebased");
namespace {
-struct ConstantUser;
-struct RebasedConstantInfo;
-
-typedef SmallVector<ConstantUser, 8> ConstantUseListType;
-typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType;
-
-/// \brief Keeps track of the user of a constant and the operand index where the
-/// constant is used.
-struct ConstantUser {
- Instruction *Inst;
- unsigned OpndIdx;
-
- ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { }
-};
-
-/// \brief Keeps track of a constant candidate and its uses.
-struct ConstantCandidate {
- ConstantUseListType Uses;
- ConstantInt *ConstInt;
- unsigned CumulativeCost;
-
- ConstantCandidate(ConstantInt *ConstInt)
- : ConstInt(ConstInt), CumulativeCost(0) { }
-
- /// \brief Add the user to the use list and update the cost.
- void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) {
- CumulativeCost += Cost;
- Uses.push_back(ConstantUser(Inst, Idx));
- }
-};
-
-/// \brief This represents a constant that has been rebased with respect to a
-/// base constant. The difference to the base constant is recorded in Offset.
-struct RebasedConstantInfo {
- ConstantUseListType Uses;
- Constant *Offset;
-
- RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
- : Uses(std::move(Uses)), Offset(Offset) { }
-};
-
-/// \brief A base constant and all its rebased constants.
-struct ConstantInfo {
- ConstantInt *BaseConstant;
- RebasedConstantListType RebasedConstants;
-};
-
/// \brief The constant hoisting pass.
-class ConstantHoisting : public FunctionPass {
- typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType;
- typedef std::vector<ConstantCandidate> ConstCandVecType;
-
- const TargetTransformInfo *TTI;
- DominatorTree *DT;
- BasicBlock *Entry;
-
- /// Keeps track of constant candidates found in the function.
- ConstCandVecType ConstCandVec;
-
- /// Keep track of cast instructions we already cloned.
- SmallDenseMap<Instruction *, Instruction *> ClonedCastMap;
-
- /// These are the final constants we decided to hoist.
- SmallVector<ConstantInfo, 8> ConstantVec;
+class ConstantHoistingLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr),
- Entry(nullptr) {
- initializeConstantHoistingPass(*PassRegistry::getPassRegistry());
+ ConstantHoistingLegacyPass() : FunctionPass(ID) {
+ initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &Fn) override;
@@ -135,67 +72,36 @@ public:
AU.addRequired<TargetTransformInfoWrapperPass>();
}
-private:
- /// \brief Initialize the pass.
- void setup(Function &Fn) {
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
- Entry = &Fn.getEntryBlock();
- }
+ void releaseMemory() override { Impl.releaseMemory(); }
- /// \brief Cleanup.
- void cleanup() {
- ConstantVec.clear();
- ClonedCastMap.clear();
- ConstCandVec.clear();
-
- TTI = nullptr;
- DT = nullptr;
- Entry = nullptr;
- }
-
- Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
- Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const;
- void collectConstantCandidates(ConstCandMapType &ConstCandMap,
- Instruction *Inst, unsigned Idx,
- ConstantInt *ConstInt);
- void collectConstantCandidates(ConstCandMapType &ConstCandMap,
- Instruction *Inst);
- void collectConstantCandidates(Function &Fn);
- void findAndMakeBaseConstant(ConstCandVecType::iterator S,
- ConstCandVecType::iterator E);
- void findBaseConstants();
- void emitBaseConstants(Instruction *Base, Constant *Offset,
- const ConstantUser &ConstUser);
- bool emitBaseConstants();
- void deleteDeadCastInst() const;
- bool optimizeConstants(Function &Fn);
+private:
+ ConstantHoistingPass Impl;
};
}
-char ConstantHoisting::ID = 0;
-INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
- false, false)
+char ConstantHoistingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
- false, false)
+INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
FunctionPass *llvm::createConstantHoistingPass() {
- return new ConstantHoisting();
+ return new ConstantHoistingLegacyPass();
}
/// \brief Perform the constant hoisting optimization for the given function.
-bool ConstantHoisting::runOnFunction(Function &Fn) {
- if (skipOptnoneFunction(Fn))
+bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
+ if (skipFunction(Fn))
return false;
DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
- setup(Fn);
-
- bool MadeChange = optimizeConstants(Fn);
+ bool MadeChange = Impl.runImpl(
+ Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock());
if (MadeChange) {
DEBUG(dbgs() << "********** Function after Constant Hoisting: "
@@ -204,15 +110,13 @@ bool ConstantHoisting::runOnFunction(Function &Fn) {
}
DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
- cleanup();
-
return MadeChange;
}
/// \brief Find the constant materialization insertion point.
-Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,
- unsigned Idx) const {
+Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
// If the operand is a cast instruction, then we have to materialize the
// constant before the cast instruction.
if (Idx != ~0U) {
@@ -237,8 +141,8 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,
}
/// \brief Find an insertion point that dominates all uses.
-Instruction *ConstantHoisting::
-findConstantInsertionPoint(const ConstantInfo &ConstInfo) const {
+Instruction *ConstantHoistingPass::findConstantInsertionPoint(
+ const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
// Collect all basic blocks.
SmallPtrSet<BasicBlock *, 8> BBs;
@@ -272,10 +176,9 @@ findConstantInsertionPoint(const ConstantInfo &ConstInfo) const {
/// The operand at index Idx is not necessarily the constant integer itself. It
/// could also be a cast instruction or a constant expression that uses the
// constant integer.
-void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
- Instruction *Inst,
- unsigned Idx,
- ConstantInt *ConstInt) {
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt) {
unsigned Cost;
// Ask the target about the cost of materializing the constant for the given
// instruction and operand index.
@@ -309,8 +212,8 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
/// \brief Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
-void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
- Instruction *Inst) {
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
// Skip all cast instructions. They are visited indirectly later on.
if (Inst->isCast())
return;
@@ -320,6 +223,18 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
if (isa<InlineAsm>(Call->getCalledValue()))
return;
+ // Switch cases must remain constant, and if the value being tested is
+ // constant the entire thing should disappear.
+ if (isa<SwitchInst>(Inst))
+ return;
+
+ // Static allocas (constant size in the entry block) are handled by
+ // prologue/epilogue insertion so they're free anyway. We definitely don't
+ // want to make them non-constant.
+ auto AI = dyn_cast<AllocaInst>(Inst);
+ if (AI && AI->isStaticAlloca())
+ return;
+
// Scan all operands.
for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
Value *Opnd = Inst->getOperand(Idx);
@@ -363,25 +278,116 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
/// \brief Collect all integer constants in the function that cannot be folded
/// into an instruction itself.
-void ConstantHoisting::collectConstantCandidates(Function &Fn) {
+void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
ConstCandMapType ConstCandMap;
for (BasicBlock &BB : Fn)
for (Instruction &Inst : BB)
collectConstantCandidates(ConstCandMap, &Inst);
}
-/// \brief Find the base constant within the given range and rebase all other
-/// constants with respect to the base constant.
-void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
- ConstCandVecType::iterator E) {
- auto MaxCostItr = S;
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt operator- does not like that). If the value cannot be
+// represented in uint64 we return an empty Optional. This is then interpreted
+// as meaning the value is not in range.
+static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
+{
+ llvm::Optional<APInt> Res = None;
+ unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+ V1.getBitWidth() : V2.getBitWidth();
+ uint64_t LimVal1 = V1.getLimitedValue();
+ uint64_t LimVal2 = V2.getLimitedValue();
+
+ if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+ return Res;
+
+ uint64_t Diff = LimVal1 - LimVal2;
+ return APInt(BW, Diff, true);
+}
+
+// From a list of constants, one needs to be picked as the base and the other
+// constants will be transformed into an offset from that base constant. The
+// question is which one makes the best base. For example, consider these
+// constants and their number of uses:
+//
+//  Constants | 2 | 4 | 12 | 42 |
+//  NumUses   | 3 | 2 |  8 |  7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..20 are most optimally
+// supported by the architecture; then selecting constant 2 is most optimal
+// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
+// range 0..20, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
+// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant, the range of the offsets is a very important
+// factor that we take into account here. This algorithm calculates a total
+// cost for selecting a constant as the base and subtracts a penalty for
+// immediates that are out of range. It has quadratic complexity, so we call
+// this function only when we're optimising for size and there are fewer than
+// 100 constants; otherwise we fall back to the straightforward algorithm,
+// which does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E,
+ ConstCandVecType::iterator &MaxCostItr) {
unsigned NumUses = 0;
- // Use the constant that has the maximum cost as base constant.
+
+ if (!Entry->getParent()->optForSize() || std::distance(S, E) > 100) {
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ MaxCostItr = ConstCand;
+ }
+ return NumUses;
+ }
+
+ DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ int MaxCost = -1;
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ auto Value = ConstCand->ConstInt->getValue();
+ Type *Ty = ConstCand->ConstInt->getType();
+ int Cost = 0;
NumUses += ConstCand->Uses.size();
- if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+
+ for (auto User : ConstCand->Uses) {
+ unsigned Opcode = User.Inst->getOpcode();
+ unsigned OpndIdx = User.OpndIdx;
+ Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
+ DEBUG(dbgs() << "Cost: " << Cost << "\n");
+
+ for (auto C2 = S; C2 != E; ++C2) {
+ llvm::Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
+ if (Diff) {
+ const int ImmCosts =
+ TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
+ Cost -= ImmCosts;
+ DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
+ }
+ }
+ }
+ DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ if (Cost > MaxCost) {
+ MaxCost = Cost;
MaxCostItr = ConstCand;
+ DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
+ }
}
+ return NumUses;
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoistingPass::findAndMakeBaseConstant(
+ ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
+ auto MaxCostItr = S;
+ unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
// Don't hoist constants that have only one use.
if (NumUses <= 1)
@@ -404,7 +410,7 @@ void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
/// \brief Finds and combines constant candidates that can be easily
/// rematerialized with an add from a common base constant.
-void ConstantHoisting::findBaseConstants() {
+void ConstantHoistingPass::findBaseConstants() {
// Sort the constants by value and type. This invalidates the mapping!
std::sort(ConstCandVec.begin(), ConstCandVec.end(),
[](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
@@ -466,8 +472,9 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
/// \brief Emit materialization code for all rebased constants and update their
/// users.
-void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
- const ConstantUser &ConstUser) {
+void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
+ Constant *Offset,
+ const ConstantUser &ConstUser) {
Instruction *Mat = Base;
if (Offset) {
Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
@@ -538,7 +545,7 @@ void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
/// \brief Hoist and hide the base constant behind a bitcast and emit
/// materialization code for derived constants.
-bool ConstantHoisting::emitBaseConstants() {
+bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
for (auto const &ConstInfo : ConstantVec) {
// Hoist and hide the base constant behind a bitcast.
@@ -572,14 +579,18 @@ bool ConstantHoisting::emitBaseConstants() {
/// \brief Check all cast instructions we made a copy of and remove them if they
/// have no more users.
-void ConstantHoisting::deleteDeadCastInst() const {
+void ConstantHoistingPass::deleteDeadCastInst() const {
for (auto const &I : ClonedCastMap)
if (I.first->use_empty())
I.first->eraseFromParent();
}
/// \brief Optimize expensive integer constants in the given function.
-bool ConstantHoisting::optimizeConstants(Function &Fn) {
+bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
+ DominatorTree &DT, BasicBlock &Entry) {
+ this->TTI = &TTI;
+ this->DT = &DT;
+ this->Entry = &Entry;
// Collect all constant candidates.
collectConstantCandidates(Fn);
@@ -604,3 +615,14 @@ bool ConstantHoisting::optimizeConstants(Function &Fn) {
return MadeChange;
}
+
+PreservedAnalyses ConstantHoistingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runImpl(F, TTI, DT, F.getEntryBlock()))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ return PreservedAnalyses::none();
+}
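
The long comment above maximizeConstantsInRange is easiest to see with the numbers written out. Below is a standalone C++ sketch that, for each candidate base, simply counts how many uses end up with an offset inside a made-up foldable range of 0..20 (the same range as the worked example in the comment). The real pass instead weighs TTI immediate code-size costs, so treat this as an illustration of why the offset range matters, not as the pass's cost model:

#include <cstdint>
#include <iostream>
#include <vector>

struct Cand { int64_t Value; unsigned NumUses; };

int main() {
  // Constants and use counts copied from the comment above.
  std::vector<Cand> Cands = {{2, 3}, {4, 2}, {12, 8}, {42, 7}};
  const int64_t Lo = 0, Hi = 20;  // offsets the target folds for free (example)

  int64_t BestBase = 0;
  unsigned BestInRange = 0;
  for (const Cand &Base : Cands) {
    unsigned InRange = 0;
    for (const Cand &C : Cands) {
      int64_t Off = C.Value - Base.Value;
      if (Off >= Lo && Off <= Hi)
        InRange += C.NumUses;
    }
    std::cout << "base " << Base.Value << ": " << InRange << " uses in range\n";
    if (InRange > BestInRange) {
      BestInRange = InRange;
      BestBase = Base.Value;
    }
  }
  // Prints 13 in-range uses for base 2 but only 8 for base 12, so 2 wins even
  // though 12 has the most uses on its own.
  std::cout << "best base: " << BestBase << '\n';
  return 0;
}
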
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index c974ebb9456f..88172d19fe5a 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -61,11 +61,14 @@ FunctionPass *llvm::createConstantPropagationPass() {
}
bool ConstantPropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
// Initialize the worklist to all of the instructions ready to process...
std::set<Instruction*> WorkList;
- for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
- WorkList.insert(&*i);
- }
+ for (Instruction &I: instructions(&F))
+ WorkList.insert(&I);
+
bool Changed = false;
const DataLayout &DL = F.getParent()->getDataLayout();
TargetLibraryInfo *TLI =
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 686bd4071104..c0fed0533392 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -35,22 +36,11 @@ STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
+STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumSRems, "Number of srem converted to urem");
namespace {
class CorrelatedValuePropagation : public FunctionPass {
- LazyValueInfo *LVI;
-
- bool processSelect(SelectInst *SI);
- bool processPHI(PHINode *P);
- bool processMemAccess(Instruction *I);
- bool processCmp(CmpInst *C);
- bool processSwitch(SwitchInst *SI);
- bool processCallSite(CallSite CS);
-
- /// Return a constant value for V usable at At and everything it
- /// dominates. If no such Constant can be found, return nullptr.
- Constant *getConstantAt(Value *V, Instruction *At);
-
public:
static char ID;
CorrelatedValuePropagation(): FunctionPass(ID) {
@@ -60,7 +50,7 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LazyValueInfo>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
@@ -69,7 +59,7 @@ namespace {
char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
@@ -78,7 +68,7 @@ Pass *llvm::createCorrelatedValuePropagationPass() {
return new CorrelatedValuePropagation();
}
-bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
+static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
@@ -101,7 +91,7 @@ bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
return true;
}
-bool CorrelatedValuePropagation::processPHI(PHINode *P) {
+static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
bool Changed = false;
BasicBlock *BB = P->getParent();
@@ -169,7 +159,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
return Changed;
}
-bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
+static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
Value *Pointer = nullptr;
if (LoadInst *L = dyn_cast<LoadInst>(I))
Pointer = L->getPointerOperand();
@@ -186,11 +176,11 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
return true;
}
-/// processCmp - See if LazyValueInfo's ability to exploit edge conditions,
-/// or range information is sufficient to prove this comparison. Even for
-/// local conditions, this can sometimes prove conditions instcombine can't by
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove this comparison. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
-bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
+static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
Value *Op0 = C->getOperand(0);
Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
if (!Op1) return false;
@@ -218,14 +208,14 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
return true;
}
-/// processSwitch - Simplify a switch instruction by removing cases which can
-/// never fire. If the uselessness of a case could be determined locally then
-/// constant propagation would already have figured it out. Instead, walk the
-/// predecessors and statically evaluate cases based on information available
-/// on that edge. Cases that cannot fire no matter what the incoming edge can
-/// safely be removed. If a case fires on every incoming edge then the entire
-/// switch can be removed and replaced with a branch to the case destination.
-bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
+/// Simplify a switch instruction by removing cases which can never fire. If the
+/// uselessness of a case could be determined locally then constant propagation
+/// would already have figured it out. Instead, walk the predecessors and
+/// statically evaluate cases based on information available on that edge. Cases
+/// that cannot fire no matter what the incoming edge can safely be removed. If
+/// a case fires on every incoming edge then the entire switch can be removed
+/// and replaced with a branch to the case destination.
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
@@ -304,16 +294,18 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
return Changed;
}
-/// processCallSite - Infer nonnull attributes for the arguments at the
-/// specified callsite.
-bool CorrelatedValuePropagation::processCallSite(CallSite CS) {
+/// Infer nonnull attributes for the arguments at the specified callsite.
+static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
SmallVector<unsigned, 4> Indices;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
-
+ // Try to mark pointer typed parameters as non-null. We skip the
+ // relatively expensive analysis for constants which are obviously either
+ // null or non-null to start with.
if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ !isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
CS.getInstruction()) == LazyValueInfo::False)
@@ -334,7 +326,62 @@ bool CorrelatedValuePropagation::processCallSite(CallSite CS) {
return true;
}
-Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) {
+// Helper function to rewrite srem and sdiv. As a policy choice, we choose not
+// to waste compile time on anything where the operands are local defs. While
+// LVI can sometimes reason about such cases, it's not its primary purpose.
+static bool hasLocalDefs(BinaryOperator *SDI) {
+ for (Value *O : SDI->operands()) {
+ auto *I = dyn_cast<Instruction>(O);
+ if (I && I->getParent() == SDI->getParent())
+ return true;
+ }
+ return false;
+}
+
+static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ for (Value *O : SDI->operands()) {
+ auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
+ if (Result != LazyValueInfo::True)
+ return false;
+ }
+ return true;
+}
+
+static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSRems;
+ auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove that both operands of this SDiv are
+/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSDivs;
+ auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At->getParent(), At))
return C;
@@ -357,44 +404,45 @@ Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) {
ConstantInt::getFalse(C->getContext());
}
-bool CorrelatedValuePropagation::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- LVI = &getAnalysis<LazyValueInfo>();
-
+static bool runImpl(Function &F, LazyValueInfo *LVI) {
bool FnChanged = false;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ for (BasicBlock &BB : F) {
bool BBChanged = false;
- for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
Instruction *II = &*BI++;
switch (II->getOpcode()) {
case Instruction::Select:
- BBChanged |= processSelect(cast<SelectInst>(II));
+ BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II));
+ BBChanged |= processPHI(cast<PHINode>(II), LVI);
break;
case Instruction::ICmp:
case Instruction::FCmp:
- BBChanged |= processCmp(cast<CmpInst>(II));
+ BBChanged |= processCmp(cast<CmpInst>(II), LVI);
break;
case Instruction::Load:
case Instruction::Store:
- BBChanged |= processMemAccess(II);
+ BBChanged |= processMemAccess(II, LVI);
break;
case Instruction::Call:
case Instruction::Invoke:
- BBChanged |= processCallSite(CallSite(II));
+ BBChanged |= processCallSite(CallSite(II), LVI);
+ break;
+ case Instruction::SRem:
+ BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::SDiv:
+ BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
}
}
- Instruction *Term = FI->getTerminator();
+ Instruction *Term = BB.getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term));
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
@@ -404,7 +452,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
auto *RetVal = RI->getReturnValue();
if (!RetVal) break; // handle "ret void"
if (isa<Constant>(RetVal)) break; // nothing to do
- if (auto *C = getConstantAt(RetVal, RI)) {
+ if (auto *C = getConstantAt(RetVal, RI, LVI)) {
++NumReturns;
RI->replaceUsesOfWith(RetVal, C);
BBChanged = true;
@@ -417,3 +465,28 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return FnChanged;
}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ return runImpl(F, LVI);
+}
+
+PreservedAnalyses
+CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, LVI);
+
+ // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<LazyValueAnalysis>(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
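
processSDiv and processSRem above rely on a plain arithmetic fact: when both operands are non-negative (and the divisor is non-zero), signed and unsigned division and remainder produce identical bit patterns, so an sdiv/srem can be rewritten as udiv/urem once LVI proves both operands are >= 0. A standalone C++ spot-check of that fact over a grid of made-up values:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t A = 0; A < 300; A += 7) {
    for (int32_t B = 1; B < 300; B += 11) {
      uint32_t UA = static_cast<uint32_t>(A), UB = static_cast<uint32_t>(B);
      assert(static_cast<uint32_t>(A / B) == UA / UB);  // sdiv == udiv
      assert(static_cast<uint32_t>(A % B) == UA % UB);  // srem == urem
    }
  }
  return 0;
}
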
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index b67c3c7742fd..f73809d9f045 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -16,13 +16,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/DCE.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -41,7 +42,7 @@ namespace {
initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
}
bool runOnBasicBlock(BasicBlock &BB) override {
- if (skipOptnoneFunction(BB))
+ if (skipBasicBlock(BB))
return false;
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
@@ -71,28 +72,6 @@ Pass *llvm::createDeadInstEliminationPass() {
return new DeadInstElimination();
}
-
-namespace {
- //===--------------------------------------------------------------------===//
- // DeadCodeElimination pass implementation
- //
- struct DCE : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- DCE() : FunctionPass(ID) {
- initializeDCEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
- };
-}
-
-char DCE::ID = 0;
-INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
-
static bool DCEInstruction(Instruction *I,
SmallSetVector<Instruction *, 16> &WorkList,
const TargetLibraryInfo *TLI) {
@@ -121,13 +100,7 @@ static bool DCEInstruction(Instruction *I,
return false;
}
-bool DCE::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
-
+static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
bool MadeChange = false;
SmallSetVector<Instruction *, 16> WorkList;
// Iterate over the original function, only adding insts to the worklist
@@ -150,7 +123,38 @@ bool DCE::runOnFunction(Function &F) {
return MadeChange;
}
-FunctionPass *llvm::createDeadCodeEliminationPass() {
- return new DCE();
+PreservedAnalyses DCEPass::run(Function &F, AnalysisManager<Function> &AM) {
+ if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+struct DCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCELegacyPass() : FunctionPass(ID) {
+ initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+
+ return eliminateDeadCode(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
}
+char DCELegacyPass::ID = 0;
+INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCELegacyPass();
+}
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 36ad0a5f7b91..ed58a87ae1a8 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -15,7 +15,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
@@ -34,9 +35,12 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <map>
using namespace llvm;
#define DEBUG_TYPE "dse"
@@ -44,90 +48,35 @@ using namespace llvm;
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
STATISTIC(NumFastStores, "Number of stores deleted");
STATISTIC(NumFastOther , "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
-namespace {
- struct DSE : public FunctionPass {
- AliasAnalysis *AA;
- MemoryDependenceAnalysis *MD;
- DominatorTree *DT;
- const TargetLibraryInfo *TLI;
-
- static char ID; // Pass identification, replacement for typeid
- DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) {
- initializeDSEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
- return false;
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- MD = &getAnalysis<MemoryDependenceAnalysis>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial-overwrite tracking in DSE"));
- bool Changed = false;
- for (BasicBlock &I : F)
- // Only check non-dead blocks. Dead blocks may have strange pointer
- // cycles that will confuse alias analysis.
- if (DT->isReachableFromEntry(&I))
- Changed |= runOnBasicBlock(I);
-
- AA = nullptr; MD = nullptr; DT = nullptr;
- return Changed;
- }
-
- bool runOnBasicBlock(BasicBlock &BB);
- bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI);
- bool HandleFree(CallInst *F);
- bool handleEndBlock(BasicBlock &BB);
- void RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
- SmallSetVector<Value *, 16> &DeadStackObjects,
- const DataLayout &DL);
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MemoryDependenceAnalysis>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceAnalysis>();
- }
- };
-}
-
-char DSE::ID = 0;
-INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
-
-FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
-/// DeleteDeadInstruction - Delete this instruction. Before we do, go through
-/// and zero out all the operands of this instruction. If any of them become
-/// dead, delete them and the computation tree that feeds them.
-///
+/// Delete this instruction. Before we do, go through and zero out all the
+/// operands of this instruction. If any of them become dead, delete them and
+/// the computation tree that feeds them.
/// If ValueSet is non-null, remove any deleted instructions from it as well.
-///
-static void DeleteDeadInstruction(Instruction *I,
- MemoryDependenceAnalysis &MD,
- const TargetLibraryInfo &TLI,
- SmallSetVector<Value*, 16> *ValueSet = nullptr) {
+static void
+deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
+ MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ SmallSetVector<Value *, 16> *ValueSet = nullptr) {
SmallVector<Instruction*, 32> NowDeadInsts;
NowDeadInsts.push_back(I);
--NumFastOther;
+ // Keeping the iterator straight is a pain, so we let this routine tell the
+ // caller what the next instruction is after we're done mucking about.
+ BasicBlock::iterator NewIter = *BBI;
+
// Before we touch this instruction, remove it from memdep!
do {
Instruction *DeadInst = NowDeadInsts.pop_back_val();
@@ -150,15 +99,19 @@ static void DeleteDeadInstruction(Instruction *I,
NowDeadInsts.push_back(OpI);
}
- DeadInst->eraseFromParent();
+
+ if (NewIter == DeadInst->getIterator())
+ NewIter = DeadInst->eraseFromParent();
+ else
+ DeadInst->eraseFromParent();
if (ValueSet) ValueSet->remove(DeadInst);
} while (!NowDeadInsts.empty());
+ *BBI = NewIter;
}
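The iterator handling above is easy to get wrong, so here is a minimal standalone sketch of the same idea using a plain std::list as an illustrative stand-in for LLVM's instruction list (the name eraseKeepingCursor is invented; this is not the pass's own code): erase() returns the iterator to the following element, and the caller's cursor is only overwritten when the erased element is the one it points at.

#include <list>

// Erase *Dead from Insts while keeping the caller's Cursor valid.
static void eraseKeepingCursor(std::list<int> &Insts,
                               std::list<int>::iterator &Cursor,
                               std::list<int>::iterator Dead) {
  if (Cursor == Dead)
    Cursor = Insts.erase(Dead); // Cursor now names the next element (or end()).
  else
    Insts.erase(Dead);          // Cursor still points at a live element.
}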
-
-/// hasMemoryWrite - Does this instruction write some memory? This only returns
-/// true for things that we can analyze with other helpers below.
+/// Does this instruction write some memory? This only returns true for things
+/// that we can analyze with other helpers below.
static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
if (isa<StoreInst>(I))
return true;
@@ -176,30 +129,23 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
}
if (auto CS = CallSite(I)) {
if (Function *F = CS.getCalledFunction()) {
- if (TLI.has(LibFunc::strcpy) &&
- F->getName() == TLI.getName(LibFunc::strcpy)) {
+ StringRef FnName = F->getName();
+ if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy))
return true;
- }
- if (TLI.has(LibFunc::strncpy) &&
- F->getName() == TLI.getName(LibFunc::strncpy)) {
+ if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy))
return true;
- }
- if (TLI.has(LibFunc::strcat) &&
- F->getName() == TLI.getName(LibFunc::strcat)) {
+ if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat))
return true;
- }
- if (TLI.has(LibFunc::strncat) &&
- F->getName() == TLI.getName(LibFunc::strncat)) {
+ if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat))
return true;
- }
}
}
return false;
}
-/// getLocForWrite - Return a Location stored to by the specified instruction.
-/// If isRemovable returns true, this function and getLocForRead completely
-/// describe the memory operations for this instruction.
+/// Return a Location stored to by the specified instruction. If isRemovable
+/// returns true, this function and getLocForRead completely describe the memory
+/// operations for this instruction.
static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return MemoryLocation::get(SI);
@@ -228,8 +174,8 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
}
}
-/// getLocForRead - Return the location read by the specified "hasMemoryWrite"
-/// instruction if any.
+/// Return the location read by the specified "hasMemoryWrite" instruction if
+/// any.
static MemoryLocation getLocForRead(Instruction *Inst,
const TargetLibraryInfo &TLI) {
assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
@@ -241,9 +187,8 @@ static MemoryLocation getLocForRead(Instruction *Inst,
return MemoryLocation();
}
-
-/// isRemovable - If the value of this instruction and the memory it writes to
-/// is unused, may we delete this instruction?
+/// If the value of this instruction and the memory it writes to is unused, may
+/// we delete this instruction?
static bool isRemovable(Instruction *I) {
// Don't remove volatile/atomic stores.
if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -275,9 +220,9 @@ static bool isRemovable(Instruction *I) {
}
-/// isShortenable - Returns true if this instruction can be safely shortened in
+/// Returns true if the end of this instruction can be safely shortened in
/// length.
-static bool isShortenable(Instruction *I) {
+static bool isShortenableAtTheEnd(Instruction *I) {
// Don't shorten stores for now
if (isa<StoreInst>(I))
return false;
@@ -288,6 +233,7 @@ static bool isShortenable(Instruction *I) {
case Intrinsic::memset:
case Intrinsic::memcpy:
// Do shorten memory intrinsics.
+ // FIXME: Add memmove if it's also safe to transform.
return true;
}
}
@@ -297,7 +243,16 @@ static bool isShortenable(Instruction *I) {
return false;
}
-/// getStoredPointerOperand - Return the pointer that is being written to.
+/// Returns true if the beginning of this instruction can be safely shortened
+/// in length.
+static bool isShortenableAtTheBeginning(Instruction *I) {
+ // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
+ // easily done by offsetting the source address.
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return II && II->getIntrinsicID() == Intrinsic::memset;
+}
+
+/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->getPointerOperand();
@@ -327,46 +282,45 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
- enum OverwriteResult
- {
- OverwriteComplete,
- OverwriteEnd,
- OverwriteUnknown
- };
+enum OverwriteResult {
+ OverwriteBegin,
+ OverwriteComplete,
+ OverwriteEnd,
+ OverwriteUnknown
+};
}
-/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location
-/// completely overwrites a store to the 'Earlier' location.
-/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
-/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined
+typedef DenseMap<Instruction *,
+ std::map<int64_t, int64_t>> InstOverlapIntervalsTy;
+
+/// Return 'OverwriteComplete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
+/// the 'Earlier' location is completely overwritten by 'Later',
+/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
+/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
const TargetLibraryInfo &TLI,
- int64_t &EarlierOff, int64_t &LaterOff) {
+ int64_t &EarlierOff, int64_t &LaterOff,
+ Instruction *DepWrite,
+ InstOverlapIntervalsTy &IOL) {
+ // If we don't know the sizes of either access, then we can't do a comparison.
+ if (Later.Size == MemoryLocation::UnknownSize ||
+ Earlier.Size == MemoryLocation::UnknownSize)
+ return OverwriteUnknown;
+
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
// If the start pointers are the same, we just have to compare sizes to see if
// the later store was larger than the earlier store.
if (P1 == P2) {
- // If we don't know the sizes of either access, then we can't do a
- // comparison.
- if (Later.Size == MemoryLocation::UnknownSize ||
- Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
-
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
return OverwriteComplete;
}
- // Otherwise, we have to have size information, and the later store has to be
- // larger than the earlier one.
- if (Later.Size == MemoryLocation::UnknownSize ||
- Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
-
// Check to see if the later store is to the entire object (either a global,
// an alloca, or a byval/inalloca argument). If so, then it clearly
// overwrites any other store to the same object.
@@ -416,8 +370,68 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
return OverwriteComplete;
- // The other interesting case is if the later store overwrites the end of
- // the earlier store
+ // We may now overlap, although the overlap is not complete. There might also
+ // be other incomplete overlaps, and together, they might cover the complete
+ // earlier write.
+ // Note: The correctness of this logic depends on the fact that this function
+ // is never called with a DepWrite when there are any intervening reads.
+ if (EnablePartialOverwriteTracking &&
+ LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= EarlierOff) {
+
+ // Insert our part of the overlap into the map.
+ auto &IM = IOL[DepWrite];
+ DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
+ LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+
+ // Make sure that we only insert non-overlapping intervals and combine
+ // adjacent intervals. The intervals are stored in the map with the ending
+ // offset as the key (in the half-open sense) and the starting offset as
+ // the value.
+ int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size;
+
+ // Find any intervals ending at, or after, LaterIntStart which start
+ // before LaterIntEnd.
+ auto ILI = IM.lower_bound(LaterIntStart);
+ if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // This existing interval is overlapped with the current store somewhere
+ // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // intervals and adjusting our start and end.
+ LaterIntStart = std::min(LaterIntStart, ILI->second);
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+
+ // Continue erasing and adjusting our end in case other previous
+ // intervals are also overlapped with the current store.
+ //
+ // |--- earlier 1 ---| |--- earlier 2 ---|
+ // |------- later---------|
+ //
+ while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ assert(ILI->second > LaterIntStart && "Unexpected interval");
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+ }
+ }
+
+ IM[LaterIntEnd] = LaterIntStart;
+
+ ILI = IM.begin();
+ if (ILI->second <= EarlierOff &&
+ ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
+ DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
+ EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) <<
+ ") Composite Later [" <<
+ ILI->second << ", " << ILI->first << ")\n");
+ ++NumCompletePartials;
+ return OverwriteComplete;
+ }
+ }
+
+ // Another interesting case is if the later store overwrites the end of the
+ // earlier store.
//
// |--earlier--|
// |-- later --|
@@ -429,11 +443,25 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))
return OverwriteEnd;
+ // Finally, we also need to check if the later store overwrites the beginning
+ // of the earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to move the destination address and trim the size
+ // of earlier to avoid generating writes to addresses which will definitely
+ // be overwritten later.
+ if (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff) {
+ assert(int64_t(LaterOff + Later.Size) < int64_t(EarlierOff + Earlier.Size) &&
+ "Expect to be handled as OverwriteComplete");
+ return OverwriteBegin;
+ }
// Otherwise, they don't completely overlap.
return OverwriteUnknown;
}
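The partial-overwrite bookkeeping above boils down to maintaining a set of disjoint half-open intervals in a std::map keyed by end offset, with the start offset as the value. The sketch below restates just that merge-and-check step over plain integers; the names (insertLaterInterval, earlierFullyCovered) and the example offsets are invented for illustration and are not part of the pass.

#include <algorithm>
#include <cstdint>
#include <map>

// Key = end offset (half-open), value = start offset, as in the pass.
using OverlapIntervalsTy = std::map<int64_t, int64_t>;

// Record that a later store covered [Start, End) of the earlier write,
// merging with any existing intervals it overlaps or touches.
static void insertLaterInterval(OverlapIntervalsTy &IM, int64_t Start,
                                int64_t End) {
  auto ILI = IM.lower_bound(Start); // first interval ending at or after Start
  while (ILI != IM.end() && ILI->second <= End) {
    Start = std::min(Start, ILI->second);
    End = std::max(End, ILI->first);
    ILI = IM.erase(ILI);
  }
  IM[End] = Start;
}

// After merging, the interval with the smallest end offset is the only one
// that can span the whole earlier write [EarlierOff, EarlierOff + EarlierSize).
static bool earlierFullyCovered(const OverlapIntervalsTy &IM,
                                int64_t EarlierOff, int64_t EarlierSize) {
  if (IM.empty())
    return false;
  auto ILI = IM.begin();
  return ILI->second <= EarlierOff &&
         ILI->first >= EarlierOff + EarlierSize;
}

// Example: an earlier 8-byte store at offset 0, killed by two 4-byte stores:
//   OverlapIntervalsTy IM;
//   insertLaterInterval(IM, 0, 4);
//   insertLaterInterval(IM, 4, 8);
//   earlierFullyCovered(IM, 0, 8); // true -> OverwriteComplete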
-/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a
+/// If 'Inst' might be a self read (i.e. a noop copy of a
/// memory region into an identical pointer) then it doesn't actually make its
/// input dead in the traditional sense. Consider this case:
///
@@ -478,192 +506,13 @@ static bool isPossibleSelfRead(Instruction *Inst,
}
-//===----------------------------------------------------------------------===//
-// DSE Pass
-//===----------------------------------------------------------------------===//
-
-bool DSE::runOnBasicBlock(BasicBlock &BB) {
- const DataLayout &DL = BB.getModule()->getDataLayout();
- bool MadeChange = false;
-
- // Do a top-down walk on the BB.
- for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
- Instruction *Inst = &*BBI++;
-
- // Handle 'free' calls specially.
- if (CallInst *F = isFreeCall(Inst, TLI)) {
- MadeChange |= HandleFree(F);
- continue;
- }
-
- // If we find something that writes memory, get its memory dependence.
- if (!hasMemoryWrite(Inst, *TLI))
- continue;
-
- // If we're storing the same value back to a pointer that we just
- // loaded from, then the store can be removed.
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-
- auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) {
- // DeleteDeadInstruction can delete the current instruction. Save BBI
- // in case we need it.
- WeakVH NextInst(&*BBI);
-
- DeleteDeadInstruction(DeadInst, *MD, *TLI);
-
- if (!NextInst) // Next instruction deleted.
- BBI = BB.begin();
- else if (BBI != BB.begin()) // Revisit this instruction if possible.
- --BBI;
- ++NumRedundantStores;
- MadeChange = true;
- };
-
- if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
- if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- isRemovable(SI) &&
- MemoryIsNotModifiedBetween(DepLoad, SI)) {
-
- DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n "
- << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n');
-
- RemoveDeadInstAndUpdateBBI(SI);
- continue;
- }
- }
-
- // Remove null stores into the calloc'ed objects
- Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
-
- if (StoredConstant && StoredConstant->isNullValue() &&
- isRemovable(SI)) {
- Instruction *UnderlyingPointer = dyn_cast<Instruction>(
- GetUnderlyingObject(SI->getPointerOperand(), DL));
-
- if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
- MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) {
- DEBUG(dbgs()
- << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
- << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
-
- RemoveDeadInstAndUpdateBBI(SI);
- continue;
- }
- }
- }
-
- MemDepResult InstDep = MD->getDependency(Inst);
-
- // Ignore any store where we can't find a local dependence.
- // FIXME: cross-block DSE would be fun. :)
- if (!InstDep.isDef() && !InstDep.isClobber())
- continue;
-
- // Figure out what location is being stored to.
- MemoryLocation Loc = getLocForWrite(Inst, *AA);
-
- // If we didn't get a useful location, fail.
- if (!Loc.Ptr)
- continue;
-
- while (InstDep.isDef() || InstDep.isClobber()) {
- // Get the memory clobbered by the instruction we depend on. MemDep will
- // skip any instructions that 'Loc' clearly doesn't interact with. If we
- // end up depending on a may- or must-aliased load, then we can't optimize
- // away the store and we bail out. However, if we depend on on something
- // that overwrites the memory location we *can* potentially optimize it.
- //
- // Find out what memory location the dependent instruction stores.
- Instruction *DepWrite = InstDep.getInst();
- MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
- // If we didn't get a useful location, or if it isn't a size, bail out.
- if (!DepLoc.Ptr)
- break;
-
- // If we find a write that is a) removable (i.e., non-volatile), b) is
- // completely obliterated by the store to 'Loc', and c) which we know that
- // 'Inst' doesn't load from, then we can remove it.
- if (isRemovable(DepWrite) &&
- !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
- int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR =
- isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
- if (OR == OverwriteComplete) {
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
- << *DepWrite << "\n KILLER: " << *Inst << '\n');
-
- // Delete the store and now-dead instructions that feed it.
- DeleteDeadInstruction(DepWrite, *MD, *TLI);
- ++NumFastStores;
- MadeChange = true;
-
- // DeleteDeadInstruction can delete the current instruction in loop
- // cases, reset BBI.
- BBI = Inst->getIterator();
- if (BBI != BB.begin())
- --BBI;
- break;
- } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
- // TODO: base this on the target vector size so that if the earlier
- // store was too small to get vector writes anyway then its likely
- // a good idea to shorten it
- // Power of 2 vector writes are probably always a bad idea to optimize
- // as any store/memset/memcpy is likely using vector instructions so
- // shortening it to not vector size is likely to be slower
- MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
- unsigned DepWriteAlign = DepIntrinsic->getAlignment();
- if (llvm::isPowerOf2_64(InstWriteOffset) ||
- ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
-
- DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: "
- << *DepWrite << "\n KILLER (offset "
- << InstWriteOffset << ", "
- << DepLoc.Size << ")"
- << *Inst << '\n');
-
- Value* DepWriteLength = DepIntrinsic->getLength();
- Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
- InstWriteOffset -
- DepWriteOffset);
- DepIntrinsic->setLength(TrimmedLength);
- MadeChange = true;
- }
- }
- }
-
- // If this is a may-aliased store that is clobbering the store value, we
- // can keep searching past it for another must-aliased pointer that stores
- // to the same location. For example, in:
- // store -> P
- // store -> Q
- // store -> P
- // we can remove the first store to P even though we don't know if P and Q
- // alias.
- if (DepWrite == &BB.front()) break;
-
- // Can't look past this instruction if it might read 'Loc'.
- if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
- break;
-
- InstDep = MD->getPointerDependencyFrom(Loc, false,
- DepWrite->getIterator(), &BB);
- }
- }
-
- // If this block ends in a return, unwind, or unreachable, all allocas are
- // dead at its end, which means stores to them are also dead.
- if (BB.getTerminator()->getNumSuccessors() == 0)
- MadeChange |= handleEndBlock(BB);
-
- return MadeChange;
-}
-
/// Returns true if the memory which is accessed by the second instruction is not
/// modified between the first and the second instruction.
/// Precondition: Second instruction must be dominated by the first
/// instruction.
-bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI,
- Instruction *SecondI) {
+static bool memoryIsNotModifiedBetween(Instruction *FirstI,
+ Instruction *SecondI,
+ AliasAnalysis *AA) {
SmallVector<BasicBlock *, 16> WorkList;
SmallPtrSet<BasicBlock *, 8> Visited;
BasicBlock::iterator FirstBBI(FirstI);
@@ -718,7 +567,7 @@ bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI,
/// Find all blocks that will unconditionally lead to the block BB and append
/// them to F.
-static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
BasicBlock *BB, DominatorTree *DT) {
for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
BasicBlock *Pred = *I;
@@ -732,9 +581,11 @@ static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
}
}
-/// HandleFree - Handle frees of entire structures whose dependency is a store
+/// Handle frees of entire structures whose dependency is a store
/// to a field of that structure.
-bool DSE::HandleFree(CallInst *F) {
+static bool handleFree(CallInst *F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
bool MadeChange = false;
MemoryLocation Loc = MemoryLocation(F->getOperand(0));
@@ -761,10 +612,9 @@ bool DSE::HandleFree(CallInst *F) {
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
break;
- auto Next = ++Dependency->getIterator();
-
- // DCE instructions only used to calculate that store
- DeleteDeadInstruction(Dependency, *MD, *TLI);
+ // DCE instructions only used to calculate that store.
+ BasicBlock::iterator BBI(Dependency);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI);
++NumFastStores;
MadeChange = true;
@@ -773,23 +623,53 @@ bool DSE::HandleFree(CallInst *F) {
// s[0] = 0;
// s[1] = 0; // This has just been deleted.
// free(s);
- Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB);
+ Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
}
if (Dep.isNonLocal())
- FindUnconditionalPreds(Blocks, BB, DT);
+ findUnconditionalPreds(Blocks, BB, DT);
}
return MadeChange;
}
-/// handleEndBlock - Remove dead stores to stack-allocated locations in the
-/// function end block. Ex:
+/// Check to see if the specified location may alias any of the stack objects in
+/// the DeadStackObjects set. If so, they become live because the location is
+/// being loaded.
+static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
+ SmallSetVector<Value *, 16> &DeadStackObjects,
+ const DataLayout &DL, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI) {
+ const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+
+ // A constant can't be in the dead pointer set.
+ if (isa<Constant>(UnderlyingPointer))
+ return;
+
+ // If the kill pointer can be easily reduced to an alloca, don't bother doing
+ // extraneous AA queries.
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+ DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
+ return;
+ }
+
+ // Remove objects that could alias LoadedLoc.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the loaded location could alias the stack location.
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ });
+}
+
+/// Remove dead stores to stack-allocated locations in the function end block.
+/// Ex:
/// %A = alloca i32
/// ...
/// store i32 1, i32* %A
/// ret void
-bool DSE::handleEndBlock(BasicBlock &BB) {
+static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD,
+ const TargetLibraryInfo *TLI) {
bool MadeChange = false;
// Keep track of all of the stack objects that are dead at the end of the
@@ -828,15 +708,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Stores to stack values are valid candidates for removal.
bool AllDead = true;
- for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
- E = Pointers.end(); I != E; ++I)
- if (!DeadStackObjects.count(*I)) {
+ for (Value *Pointer : Pointers)
+ if (!DeadStackObjects.count(Pointer)) {
AllDead = false;
break;
}
if (AllDead) {
- Instruction *Dead = &*BBI++;
+ Instruction *Dead = &*BBI;
DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
<< *Dead << "\n Objects: ";
@@ -849,7 +728,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
dbgs() << '\n');
// DCE instructions only used to calculate that store.
- DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects);
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, &DeadStackObjects);
++NumFastStores;
MadeChange = true;
continue;
@@ -858,8 +737,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(&*BBI, TLI)) {
- Instruction *Inst = &*BBI++;
- DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects);
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
continue;
@@ -873,7 +751,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
}
if (auto CS = CallSite(&*BBI)) {
- // Remove allocation function calls from the list of dead stack objects;
+ // Remove allocation function calls from the list of dead stack objects;
// there can't be any references before the definition.
if (isAllocLikeFn(&*BBI, TLI))
DeadStackObjects.remove(&*BBI);
@@ -900,6 +778,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
continue;
}
+ // We can remove the dead stores, irrespective of the fence and its ordering
+ // (release/acquire/seq_cst). Fences only constrain the ordering of
+ // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(*BBI))
+ continue;
+
MemoryLocation LoadedLoc;
// If we encounter a use of the pointer, it is no longer considered dead
@@ -922,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Remove any allocas from the DeadPointer set that are loaded, as this
// makes any stores above the access live.
- RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL);
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
// If all of the allocas were clobbered by the access then we're not going
// to find anything else to process.
@@ -933,29 +819,285 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
return MadeChange;
}
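A C-level illustration of the fence reasoning in handleEndBlock above, using a release atomic store as the fence-like operation; the function and names are invented purely for this example. Because the local never escapes, no other thread can legitimately observe it, so the store to it stays dead even though an ordering operation follows it.

#include <atomic>

void publishDone(std::atomic<bool> &Done) {
  [[maybe_unused]] int Local = 42; // non-escaping stack slot, never read again
  // The release store below only orders stores that are otherwise visible to
  // other threads; it does not publish &Local, so the store above is still
  // dead and DSE may remove it.
  Done.store(true, std::memory_order_release);
}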
-/// RemoveAccessedObjects - Check to see if the specified location may alias any
-/// of the stack objects in the DeadStackObjects set. If so, they become live
-/// because the location is being loaded.
-void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
- SmallSetVector<Value *, 16> &DeadStackObjects,
- const DataLayout &DL) {
- const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
+ AliasAnalysis *AA, MemoryDependenceResults *MD,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ // Must be a store instruction.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ if (!SI)
+ return false;
- // A constant can't be in the dead pointer set.
- if (isa<Constant>(UnderlyingPointer))
- return;
+ // If we're storing the same value back to a pointer that we just loaded from,
+ // then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
- // If the kill pointer can be easily reduced to an alloca, don't bother doing
- // extraneous AA queries.
- if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
- DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
- return;
+ DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI);
+ ++NumRedundantStores;
+ return true;
+ }
}
- // Remove objects that could alias LoadedLoc.
- DeadStackObjects.remove_if([&](Value *I) {
- // See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
- return !AA->isNoAlias(StackLoc, LoadedLoc);
- });
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+ if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
+ Instruction *UnderlyingPointer =
+ dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+ memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
+ DEBUG(
+ dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ // A map of interval maps representing partially-overwritten value parts.
+ InstOverlapIntervalsTy IOL;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(&*BBI, TLI)) {
+ MadeChange |= handleFree(F, AA, MD, DT, TLI);
+ // Increment BBI after handleFree has potentially deleted instructions.
+ // This ensures we maintain a valid iterator.
+ ++BBI;
+ continue;
+ }
+
+ Instruction *Inst = &*BBI++;
+
+ // Check to see if Inst writes to memory. If not, continue.
+ if (!hasMemoryWrite(Inst, *TLI))
+ continue;
+
+ // eliminateNoopStore will update the iterator, if necessary.
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
+ MemoryLocation Loc = getLocForWrite(Inst, *AA);
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ // If we didn't get a useful location, or if it isn't a size, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) which we know that
+ // 'Inst' doesn't load from, then we can remove it.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
+ OverwriteResult OR =
+ isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
+ DepWrite, IOL);
+ if (OR == OverwriteComplete) {
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
+ << *DepWrite << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // We erased DepWrite; start over.
+ InstDep = MD->getDependency(Inst);
+ continue;
+ } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OverwriteBegin &&
+ isShortenableAtTheBeginning(DepWrite)))) {
+ // TODO: base this on the target vector size so that if the earlier
+ // store was too small to get vector writes anyway then it's likely a
+ // good idea to shorten it.
+ // Power-of-2 vector writes are probably always a bad idea to optimize,
+ // as any store/memset/memcpy is likely using vector instructions, so
+ // shortening it to a non-vector-sized write is likely to be slower.
+ MemIntrinsic *DepIntrinsic = cast<MemIntrinsic>(DepWrite);
+ unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+ bool IsOverwriteEnd = (OR == OverwriteEnd);
+ if (!IsOverwriteEnd)
+ InstWriteOffset = int64_t(InstWriteOffset + Loc.Size);
+
+ if ((llvm::isPowerOf2_64(InstWriteOffset) &&
+ DepWriteAlign <= InstWriteOffset) ||
+ ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
+
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *DepWrite << "\n KILLER (offset "
+ << InstWriteOffset << ", " << DepLoc.Size << ")"
+ << *Inst << '\n');
+
+ int64_t NewLength =
+ IsOverwriteEnd
+ ? InstWriteOffset - DepWriteOffset
+ : DepLoc.Size - (InstWriteOffset - DepWriteOffset);
+
+ Value *DepWriteLength = DepIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(DepWriteLength->getType(), NewLength);
+ DepIntrinsic->setLength(TrimmedLength);
+
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (InstWriteOffset - DepWriteOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(DepWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ DepIntrinsic->getRawDest(), Indices, "", DepWrite);
+ DepIntrinsic->setDest(NewDestGEP);
+ }
+ MadeChange = true;
+ }
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, false,
+ DepWrite->getIterator(), &BB);
+ }
+ }
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI);
+
+ return MadeChange;
+}
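The OverwriteEnd/OverwriteBegin shortening above is ordinary offset arithmetic once the offsets are known. The helper below is a hedged restatement over plain integers (TrimResult and trimEarlierWrite are invented names; the real code mutates the mem intrinsic's length and, for the begin case, builds an in-bounds GEP for the new destination).

#include <cstdint>

struct TrimResult {
  int64_t NewLength;  // new length for the earlier memset/memcpy
  int64_t DestAdjust; // bytes to advance the destination (begin case only)
};

// EarlierOff/EarlierSize describe the earlier write. For the end case,
// LaterBoundary is the offset at which the later write begins; for the begin
// case, it is the offset at which the later write ends (both relative to the
// same base as EarlierOff).
static TrimResult trimEarlierWrite(bool IsOverwriteEnd, int64_t EarlierOff,
                                   int64_t EarlierSize, int64_t LaterBoundary) {
  if (IsOverwriteEnd)
    // Keep only the prefix the later write leaves untouched.
    return {LaterBoundary - EarlierOff, 0};
  // OverwriteBegin: drop the overwritten prefix and move the destination past it.
  int64_t OffsetMoved = LaterBoundary - EarlierOff;
  return {EarlierSize - OffsetMoved, OffsetMoved};
}

// Example: a 32-byte memset at offset 0 whose first 8 bytes are overwritten
// later: trimEarlierWrite(false, 0, 32, 8) -> {24, 8}.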
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ // Only check non-dead blocks. Dead blocks may have strange pointer
+ // cycles that will confuse alias analysis.
+ if (DT->isReachableFromEntry(&BB))
+ MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ AliasAnalysis *AA = &AM.getResult<AAManager>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
+ const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!eliminateDeadStores(F, AA, MD, DT, TLI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+namespace {
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryDependenceResults *MD =
+ &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return eliminateDeadStores(F, AA, MD, DT, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+};
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
}
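After this port, DSE can be reached either through createDeadStoreEliminationPass() under the legacy pass manager or through DSEPass under the new one. The snippet below is a minimal, hedged sketch of the new-PM route: it assumes the caller already has a Function and leans on PassBuilder::registerFunctionAnalyses to provide the analyses DSEPass::run queries (AA, dominators, memdep, TLI). It is setup boilerplate written for illustration, not part of this patch.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
using namespace llvm;

static void runDSEOnFunction(Function &F) {
  FunctionAnalysisManager FAM;
  PassBuilder PB;
  PB.registerFunctionAnalyses(FAM); // registers the standard function analyses

  FunctionPassManager FPM;
  FPM.addPass(DSEPass());
  FPM.run(F, FAM);
}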
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 7ef062e71ff3..9d0ef42e0396 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -16,8 +16,8 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -40,6 +40,7 @@ using namespace llvm::PatternMatch;
STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
@@ -97,15 +98,6 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
std::swap(LHS, RHS);
- if (isa<OverflowingBinaryOperator>(BinOp)) {
- // Hash the overflow behavior
- unsigned Overflow =
- BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
- BinOp->hasNoUnsignedWrap() *
- OverflowingBinaryOperator::NoUnsignedWrap;
- return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
- }
-
return hash_combine(BinOp->getOpcode(), LHS, RHS);
}
@@ -152,7 +144,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
if (LHSI->getOpcode() != RHSI->getOpcode())
return false;
- if (LHSI->isIdenticalTo(RHSI))
+ if (LHSI->isIdenticalToWhenDefined(RHSI))
return true;
// If we're not strictly identical, we still might be a commutable instruction
@@ -164,15 +156,6 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
"same opcode, but different instruction type?");
BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
- // Check overflow attributes
- if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
- assert(isa<OverflowingBinaryOperator>(RHSBinOp) &&
- "same opcode, but different operator type?");
- if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
- LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
- return false;
- }
-
// Commuted equality
return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
@@ -296,16 +279,18 @@ public:
/// present the table; it is the responsibility of the consumer to inspect
/// the atomicity/volatility if needed.
struct LoadValue {
- Value *Data;
+ Instruction *DefInst;
unsigned Generation;
int MatchingId;
bool IsAtomic;
+ bool IsInvariant;
LoadValue()
- : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
- LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
- bool IsAtomic)
- : Data(Data), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic) {}
+ : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false),
+ IsInvariant(false) {}
+ LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic, bool IsInvariant)
+ : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
};
typedef RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<Value *, LoadValue>>
@@ -318,7 +303,8 @@ public:
/// values.
///
/// It uses the same generation count as loads.
- typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+ typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>
+ CallHTType;
CallHTType AvailableCalls;
/// \brief This is the current generation of the memory value.
@@ -354,7 +340,7 @@ private:
// Contains all the needed information to create a stack for doing a depth
  // first traversal of the tree. This includes scopes for values, loads, and
// calls as well as the generation. There is a child iterator so that the
- // children do not need to be store spearately.
+ // children do not need to be stored separately.
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
@@ -446,7 +432,12 @@ private:
return true;
}
-
+ bool isInvariantLoad() const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ return false;
+ }
+
bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
return (getPointerOperand() == Inst.getPointerOperand() &&
getMatchingId() == Inst.getMatchingId());
@@ -500,6 +491,7 @@ private:
}
bool EarlyCSE::processNode(DomTreeNode *Node) {
+ bool Changed = false;
BasicBlock *BB = Node->getBlock();
// If this block has a single predecessor, then the predecessor is the parent
@@ -513,7 +505,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If this node has a single predecessor which ends in a conditional branch,
// we can infer the value of the branch condition given that we took this
- // path. We need the single predeccesor to ensure there's not another path
+ // path. We need the single predecessor to ensure there's not another path
// which reaches this block where the condition might hold a different
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
@@ -530,9 +522,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
<< CondInst->getName() << "' as " << *ConditionalConstant
<< " in " << BB->getName() << "\n");
- // Replace all dominated uses with the known value
- replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
- BasicBlockEdge(Pred, BB));
+ // Replace all dominated uses with the known value.
+ if (unsigned Count =
+ replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
+ BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP = NumCSECVP + Count;
+ }
}
/// LastStore - Keep track of the last non-volatile store that we saw... for
@@ -541,7 +537,6 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// stores which can occur in bitfield code among other things.
Instruction *LastStore = nullptr;
- bool Changed = false;
const DataLayout &DL = BB->getModule()->getDataLayout();
// See if any instructions in the block can be eliminated. If so, do it. If
@@ -567,15 +562,38 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
+ if (auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
+ // The condition we're on guarding here is true for all dominated
+ // locations.
+ if (SimpleValue::canHandle(CondI))
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
+
+ // Guard intrinsics read all memory, but don't write any memory.
+ // Accordingly, don't update the generation but consume the last store (to
+ // avoid an incorrect DSE).
+ LastStore = nullptr;
+ continue;
+ }
+
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
- Inst->replaceAllUsesWith(V);
- Inst->eraseFromParent();
- Changed = true;
- ++NumSimplify;
- continue;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ Inst->eraseFromParent();
+ Changed = true;
+ }
+ if (Changed) {
+ ++NumSimplify;
+ continue;
+ }
}
// If this is a simple instruction that we can value number, process it.
@@ -583,6 +601,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// See if the instruction has an available value. If so, use it.
if (Value *V = AvailableValues.lookup(Inst)) {
DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ if (auto *I = dyn_cast<Instruction>(V))
+ I->andIRFlags(Inst);
Inst->replaceAllUsesWith(V);
Inst->eraseFromParent();
Changed = true;
@@ -606,18 +626,25 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
}
// If we have an available version of this load, and if it is the right
- // generation, replace this instruction.
+ // generation or the load is known to be from an invariant location,
+ // replace this instruction.
+ //
+ // A dominating invariant load implies that the location loaded from is
+ // unchanging beginning at the point of the invariant load, so the load
+ // we're CSE'ing _away_ does not need to be invariant, only the available
+ // load we're CSE'ing _to_ does.
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
- if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
+ if (InVal.DefInst != nullptr &&
+ (InVal.Generation == CurrentGeneration || InVal.IsInvariant) &&
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing loads with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic()) {
- Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
+ Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.Data << '\n');
+ << " to: " << *InVal.DefInst << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
Inst->eraseFromParent();
@@ -631,7 +658,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic()));
+ MemInst.isAtomic(), MemInst.isInvariantLoad()));
LastStore = nullptr;
continue;
}
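The load-reuse test above packs several conditions into one expression; the predicate below restates it over a simplified struct. CachedLoad and canReuseCachedLoad are invented names for this sketch, and the real pass works on its own LoadValue and ParseMemoryInst types.

// Simplified stand-in for EarlyCSE's LoadValue bookkeeping.
struct CachedLoad {
  bool Valid = false;       // DefInst != nullptr in the real code
  unsigned Generation = 0;  // memory generation when the load was recorded
  int MatchingId = -1;      // intrinsic matching id
  bool IsAtomic = false;
  bool IsInvariant = false; // carried !invariant.load metadata
};

// Reuse is allowed when the cached load is from the current memory generation
// *or* was an invariant load (its location never changes after that point),
// the matching ids agree, the current load has no ordering we cannot honor,
// and atomicity is not weakened.
static bool canReuseCachedLoad(const CachedLoad &InVal,
                               unsigned CurrentGeneration, int MatchingId,
                               bool IsVolatile, bool IsUnordered,
                               bool IsAtomic) {
  return InVal.Valid &&
         (InVal.Generation == CurrentGeneration || InVal.IsInvariant) &&
         InVal.MatchingId == MatchingId &&
         !IsVolatile && IsUnordered &&
         InVal.IsAtomic >= IsAtomic;
}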
@@ -649,7 +676,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (CallValue::canHandle(Inst)) {
// If we have an available version of this call, and if it is the right
// generation, replace this instruction.
- std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst);
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
<< " to: " << *InVal.first << '\n');
@@ -663,7 +690,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Otherwise, remember that we have this instruction.
AvailableCalls.insert(
- Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
+ Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
continue;
}
@@ -673,7 +700,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// to advance the generation. We do need to prevent DSE across the fence,
// but that's handled above.
if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
- if (FI->getOrdering() == Release) {
+ if (FI->getOrdering() == AtomicOrdering::Release) {
assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
continue;
}
@@ -685,8 +712,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// the store originally was.
if (MemInst.isValid() && MemInst.isStore()) {
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
- if (InVal.Data &&
- InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) &&
+ if (InVal.DefInst &&
+ InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
InVal.Generation == CurrentGeneration &&
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
@@ -743,7 +770,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic()));
+ MemInst.isAtomic(), /*IsInvariant=*/false));
// Remember that this was the last unordered store we saw for DSE. We
// don't yet handle DSE on ordered or volatile stores since we don't
@@ -818,11 +845,11 @@ bool EarlyCSE::run() {
}
PreservedAnalyses EarlyCSEPass::run(Function &F,
- AnalysisManager<Function> *AM) {
- auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM->getResult<TargetIRAnalysis>(F);
- auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM->getResult<AssumptionAnalysis>(F);
+ AnalysisManager<Function> &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
EarlyCSE CSE(TLI, TTI, DT, AC);
@@ -833,6 +860,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
// FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
return PA;
}
@@ -853,7 +881,7 @@ public:
}
bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp
index 7f5d78656b50..7aa6dc6992b6 100644
--- a/lib/Transforms/Scalar/Float2Int.cpp
+++ b/lib/Transforms/Scalar/Float2Int.cpp
@@ -13,15 +13,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "float2int"
+
+#include "llvm/Transforms/Scalar/Float2Int.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -53,41 +51,31 @@ MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
"(default=64)"));
namespace {
- struct Float2Int : public FunctionPass {
+ struct Float2IntLegacyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
- Float2Int() : FunctionPass(ID) {
- initializeFloat2IntPass(*PassRegistry::getPassRegistry());
+ Float2IntLegacyPass() : FunctionPass(ID) {
+ initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return Impl.runImpl(F);
}
- bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addPreserved<GlobalsAAWrapperPass>();
}
- void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots);
- ConstantRange seen(Instruction *I, ConstantRange R);
- ConstantRange badRange();
- ConstantRange unknownRange();
- ConstantRange validateRange(ConstantRange R);
- void walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots);
- void walkForwards();
- bool validateAndTransform();
- Value *convert(Instruction *I, Type *ToTy);
- void cleanup();
-
- MapVector<Instruction*, ConstantRange > SeenInsts;
- SmallPtrSet<Instruction*,8> Roots;
- EquivalenceClasses<Instruction*> ECs;
- MapVector<Instruction*, Value*> ConvertedInsts;
- LLVMContext *Ctx;
+ private:
+ Float2IntPass Impl;
};
}
-char Float2Int::ID = 0;
-INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false)
+char Float2IntLegacyPass::ID = 0;
+INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
// Given a FCmp predicate, return a matching ICmp predicate if one
// exists, otherwise return BAD_ICMP_PREDICATE.
@@ -129,7 +117,7 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
// Find the roots - instructions that convert from the FP domain to
// integer domain.
-void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
+void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
for (auto &I : instructions(F)) {
if (isa<VectorType>(I.getType()))
continue;
@@ -149,7 +137,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
}
// Helper - mark I as having been traversed, having range R.
-ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) {
+ConstantRange Float2IntPass::seen(Instruction *I, ConstantRange R) {
DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
if (SeenInsts.find(I) != SeenInsts.end())
SeenInsts.find(I)->second = R;
@@ -159,13 +147,13 @@ ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) {
}
// Helper - get a range representing a poison value.
-ConstantRange Float2Int::badRange() {
+ConstantRange Float2IntPass::badRange() {
return ConstantRange(MaxIntegerBW + 1, true);
}
-ConstantRange Float2Int::unknownRange() {
+ConstantRange Float2IntPass::unknownRange() {
return ConstantRange(MaxIntegerBW + 1, false);
}
-ConstantRange Float2Int::validateRange(ConstantRange R) {
+ConstantRange Float2IntPass::validateRange(ConstantRange R) {
if (R.getBitWidth() > MaxIntegerBW + 1)
return badRange();
return R;
@@ -185,7 +173,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) {
// Breadth-first walk of the use-def graph; determine the set of nodes
// we care about and eagerly determine if some of them are poisonous.
-void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
while (!Worklist.empty()) {
Instruction *I = Worklist.back();
@@ -246,8 +234,8 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
// Walk forwards down the list of seen instructions, so we visit defs before
// uses.
-void Float2Int::walkForwards() {
- for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) {
+void Float2IntPass::walkForwards() {
+ for (auto &It : reverse(SeenInsts)) {
if (It.second != unknownRange())
continue;
@@ -318,7 +306,7 @@ void Float2Int::walkForwards() {
// Instead, we ask APFloat to round itself to an integral value - this
// preserves sign-of-zero - then compare the result with the original.
//
- APFloat F = CF->getValueAPF();
+ const APFloat &F = CF->getValueAPF();
// First, weed out obviously incorrect values. Non-finite numbers
// can't be represented and neither can negative zero, unless
@@ -357,7 +345,7 @@ void Float2Int::walkForwards() {
}
// If there is a valid transform to be done, do it.
-bool Float2Int::validateAndTransform() {
+bool Float2IntPass::validateAndTransform() {
bool MadeChange = false;
// Iterate over every disjoint partition of the def-use graph.
@@ -439,7 +427,7 @@ bool Float2Int::validateAndTransform() {
return MadeChange;
}
-Value *Float2Int::convert(Instruction *I, Type *ToTy) {
+Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
if (ConvertedInsts.find(I) != ConvertedInsts.end())
// Already converted this instruction.
return ConvertedInsts[I];
@@ -511,15 +499,12 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) {
}
// Perform dead code elimination on the instructions we just modified.
-void Float2Int::cleanup() {
- for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend()))
+void Float2IntPass::cleanup() {
+ for (auto &I : reverse(ConvertedInsts))
I.first->eraseFromParent();
}
-bool Float2Int::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
+bool Float2IntPass::runImpl(Function &F) {
DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
// Clear out all state.
ECs = EquivalenceClasses<Instruction*>();
@@ -540,4 +525,17 @@ bool Float2Int::runOnFunction(Function &F) {
return Modified;
}
-FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); }
+namespace llvm {
+FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
+
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
+ if (!runImpl(F))
+ return PreservedAnalyses::all();
+ else {
+ // FIXME: This should also 'preserve the CFG'.
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+}
+} // End namespace llvm
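The Float2Int change above follows the usual two-entry-point pattern: the legacy createFloat2IntPass() wrapper and the new-pass-manager Float2IntPass::run() both funnel into one shared runImpl(). Below is a minimal sketch of that pattern only — ExamplePass and ExampleLegacyPass are made-up names, and the INITIALIZE_PASS/skip boilerplate a real legacy pass carries is deliberately omitted.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"

namespace {
// Hedged sketch: one shared runImpl() serving both pass-manager interfaces,
// mirroring Float2IntPass / Float2IntLegacyPass above. Names are illustrative.
struct ExamplePass : llvm::PassInfoMixin<ExamplePass> {
  bool runImpl(llvm::Function &F) {
    return false; // the actual transformation would live here
  }
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &) {
    return runImpl(F) ? llvm::PreservedAnalyses::none()
                      : llvm::PreservedAnalyses::all();
  }
};

struct ExampleLegacyPass : llvm::FunctionPass {
  static char ID;
  ExamplePass Impl;
  ExampleLegacyPass() : llvm::FunctionPass(ID) {}
  bool runOnFunction(llvm::Function &F) override { return Impl.runImpl(F); }
};
char ExampleLegacyPass::ID = 0;
} // end anonymous namespace

Keeping the logic in runImpl() leaves both wrappers as thin adapters whose only job is translating analysis inputs and preservation results.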
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index a028b8c444ba..a35a1062cbcd 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -15,7 +15,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
@@ -44,7 +44,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,6 +52,7 @@
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <vector>
using namespace llvm;
+using namespace llvm::gvn;
using namespace PatternMatch;
#define DEBUG_TYPE "gvn"
@@ -74,106 +74,167 @@ static cl::opt<uint32_t>
MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
cl::desc("Max recurse depth (default = 1000)"));
-//===----------------------------------------------------------------------===//
-// ValueTable Class
-//===----------------------------------------------------------------------===//
-
-/// This class holds the mapping between values and value numbers. It is used
-/// as an efficient mechanism to determine the expression-wise equivalence of
-/// two values.
-namespace {
- struct Expression {
- uint32_t opcode;
- Type *type;
- SmallVector<uint32_t, 4> varargs;
+struct llvm::GVN::Expression {
+ uint32_t opcode;
+ Type *type;
+ SmallVector<uint32_t, 4> varargs;
- Expression(uint32_t o = ~2U) : opcode(o) { }
+ Expression(uint32_t o = ~2U) : opcode(o) {}
- bool operator==(const Expression &other) const {
- if (opcode != other.opcode)
- return false;
- if (opcode == ~0U || opcode == ~1U)
- return true;
- if (type != other.type)
- return false;
- if (varargs != other.varargs)
- return false;
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ if (opcode == ~0U || opcode == ~1U)
return true;
- }
-
- friend hash_code hash_value(const Expression &Value) {
- return hash_combine(Value.opcode, Value.type,
- hash_combine_range(Value.varargs.begin(),
- Value.varargs.end()));
- }
- };
+ if (type != other.type)
+ return false;
+ if (varargs != other.varargs)
+ return false;
+ return true;
+ }
- class ValueTable {
- DenseMap<Value*, uint32_t> valueNumbering;
- DenseMap<Expression, uint32_t> expressionNumbering;
- AliasAnalysis *AA;
- MemoryDependenceAnalysis *MD;
- DominatorTree *DT;
-
- uint32_t nextValueNumber;
-
- Expression create_expression(Instruction* I);
- Expression create_cmp_expression(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS);
- Expression create_extractvalue_expression(ExtractValueInst* EI);
- uint32_t lookup_or_add_call(CallInst* C);
- public:
- ValueTable() : nextValueNumber(1) { }
- uint32_t lookup_or_add(Value *V);
- uint32_t lookup(Value *V) const;
- uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred,
- Value *LHS, Value *RHS);
- bool exists(Value *V) const;
- void add(Value *V, uint32_t num);
- void clear();
- void erase(Value *v);
- void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
- AliasAnalysis *getAliasAnalysis() const { return AA; }
- void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
- void setDomTree(DominatorTree* D) { DT = D; }
- uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
- void verifyRemoved(const Value *) const;
- };
-}
+ friend hash_code hash_value(const Expression &Value) {
+ return hash_combine(
+ Value.opcode, Value.type,
+ hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+ }
+};
namespace llvm {
-template <> struct DenseMapInfo<Expression> {
- static inline Expression getEmptyKey() {
- return ~0U;
- }
+template <> struct DenseMapInfo<GVN::Expression> {
+ static inline GVN::Expression getEmptyKey() { return ~0U; }
- static inline Expression getTombstoneKey() {
- return ~1U;
- }
+ static inline GVN::Expression getTombstoneKey() { return ~1U; }
- static unsigned getHashValue(const Expression e) {
+ static unsigned getHashValue(const GVN::Expression &e) {
using llvm::hash_value;
return static_cast<unsigned>(hash_value(e));
}
- static bool isEqual(const Expression &LHS, const Expression &RHS) {
+ static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
return LHS == RHS;
}
};
+} // End llvm namespace.
+
+/// Represents a particular available value that we know how to materialize.
+/// Materialization of an AvailableValue never fails. An AvailableValue is
+/// implicitly associated with a rematerialization point which is the
+/// location of the instruction from which it was formed.
+struct llvm::gvn::AvailableValue {
+ enum ValType {
+ SimpleVal, // A simple offsetted value that is accessed.

+ LoadVal, // A value produced by a load.
+ MemIntrin, // A memory intrinsic which is loaded from.
+ UndefVal // An UndefValue representing a value from a dead block (which
+ // is not yet physically removed from the CFG).
+ };
-}
+ /// V - The value that is live out of the block.
+ PointerIntPair<Value *, 2, ValType> Val;
+
+ /// Offset - The byte offset in Val that is interesting for the load query.
+ unsigned Offset;
+
+ static AvailableValue get(Value *V, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(V);
+ Res.Val.setInt(SimpleVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(MI);
+ Res.Val.setInt(MemIntrin);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(LI);
+ Res.Val.setInt(LoadVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getUndef() {
+ AvailableValue Res;
+ Res.Val.setPointer(nullptr);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
+ bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+ bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+ bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+ Value *getSimpleValue() const {
+ assert(isSimpleValue() && "Wrong accessor");
+ return Val.getPointer();
+ }
+
+ LoadInst *getCoercedLoadValue() const {
+ assert(isCoercedLoadValue() && "Wrong accessor");
+ return cast<LoadInst>(Val.getPointer());
+ }
+
+ MemIntrinsic *getMemIntrinValue() const {
+ assert(isMemIntrinValue() && "Wrong accessor");
+ return cast<MemIntrinsic>(Val.getPointer());
+ }
+
+ /// Emit code at the specified insertion point to adjust the value defined
+ /// here to the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
+ GVN &gvn) const;
+};
+
+/// Represents an AvailableValue which can be rematerialized at the end of
+/// the associated BasicBlock.
+struct llvm::gvn::AvailableValueInBlock {
+ /// BB - The basic block in question.
+ BasicBlock *BB;
+
+ /// AV - The actual available value
+ AvailableValue AV;
+
+ static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.AV = std::move(AV);
+ return Res;
+ }
+
+ static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+ unsigned Offset = 0) {
+ return get(BB, AvailableValue::get(V, Offset));
+ }
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ return get(BB, AvailableValue::getUndef());
+ }
+
+ /// Emit code at the end of this block to adjust the value defined here to
+ /// the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
+ return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
+ }
+};
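Both structs above pack their payload pointer and the four-way ValType tag into a single PointerIntPair, so the discriminator rides in the pointer's spare low bits instead of a separate field. A self-contained sketch of that tagged-pointer idiom follows; the Payload type and its tags are illustrative, not GVN's own.

#include "llvm/ADT/PointerIntPair.h"
#include <cassert>

// Hedged sketch of the PointerIntPair idiom used by AvailableValue above:
// a pointer and a small discriminator packed into one word.
struct Payload {
  enum Kind { Simple, Widened };            // fits easily in the 2 spare bits
  llvm::PointerIntPair<int *, 2, Kind> Val; // pointer + tag share storage

  static Payload getSimple(int *P) {
    Payload R;
    R.Val.setPointer(P);
    R.Val.setInt(Simple);
    return R;
  }
  bool isSimple() const { return Val.getInt() == Simple; }
  int *getSimple() const {
    assert(isSimple() && "wrong accessor");
    return Val.getPointer();
  }
};

The factory-function style (get, getMI, getLoad, getUndef) keeps every constructed value in a consistent tag/pointer/offset state, which is what lets materialization assert instead of re-validating.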
//===----------------------------------------------------------------------===//
// ValueTable Internal Functions
//===----------------------------------------------------------------------===//
-Expression ValueTable::create_expression(Instruction *I) {
+GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
Expression e;
e.type = I->getType();
e.opcode = I->getOpcode();
for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
OI != OE; ++OI)
- e.varargs.push_back(lookup_or_add(*OI));
+ e.varargs.push_back(lookupOrAdd(*OI));
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
// of their operands get the same value number by sorting the operand value
@@ -201,15 +262,15 @@ Expression ValueTable::create_expression(Instruction *I) {
return e;
}
-Expression ValueTable::create_cmp_expression(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
+GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Not a comparison!");
Expression e;
e.type = CmpInst::makeCmpResultType(LHS->getType());
- e.varargs.push_back(lookup_or_add(LHS));
- e.varargs.push_back(lookup_or_add(RHS));
+ e.varargs.push_back(lookupOrAdd(LHS));
+ e.varargs.push_back(lookupOrAdd(RHS));
// Sort the operand value numbers so x<y and y>x get the same value number.
if (e.varargs[0] > e.varargs[1]) {
@@ -220,7 +281,7 @@ Expression ValueTable::create_cmp_expression(unsigned Opcode,
return e;
}
-Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
+GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
assert(EI && "Not an ExtractValueInst?");
Expression e;
e.type = EI->getType();
@@ -252,8 +313,8 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
// Intrinsic recognized. Grab its args to finish building the expression.
assert(I->getNumArgOperands() == 2 &&
"Expect two args for recognised intrinsics.");
- e.varargs.push_back(lookup_or_add(I->getArgOperand(0)));
- e.varargs.push_back(lookup_or_add(I->getArgOperand(1)));
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(0)));
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(1)));
return e;
}
}
@@ -263,7 +324,7 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
e.opcode = EI->getOpcode();
for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
OI != OE; ++OI)
- e.varargs.push_back(lookup_or_add(*OI));
+ e.varargs.push_back(lookupOrAdd(*OI));
for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
II != IE; ++II)
@@ -276,20 +337,32 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
// ValueTable External Functions
//===----------------------------------------------------------------------===//
+GVN::ValueTable::ValueTable() : nextValueNumber(1) {}
+GVN::ValueTable::ValueTable(const ValueTable &Arg)
+ : valueNumbering(Arg.valueNumbering),
+ expressionNumbering(Arg.expressionNumbering), AA(Arg.AA), MD(Arg.MD),
+ DT(Arg.DT), nextValueNumber(Arg.nextValueNumber) {}
+GVN::ValueTable::ValueTable(ValueTable &&Arg)
+ : valueNumbering(std::move(Arg.valueNumbering)),
+ expressionNumbering(std::move(Arg.expressionNumbering)),
+ AA(std::move(Arg.AA)), MD(std::move(Arg.MD)), DT(std::move(Arg.DT)),
+ nextValueNumber(std::move(Arg.nextValueNumber)) {}
+GVN::ValueTable::~ValueTable() {}
+
/// add - Insert a value into the table with a specified value number.
-void ValueTable::add(Value *V, uint32_t num) {
+void GVN::ValueTable::add(Value *V, uint32_t num) {
valueNumbering.insert(std::make_pair(V, num));
}
-uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
+uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
if (AA->doesNotAccessMemory(C)) {
- Expression exp = create_expression(C);
+ Expression exp = createExpr(C);
uint32_t &e = expressionNumbering[exp];
if (!e) e = nextValueNumber++;
valueNumbering[C] = e;
return e;
} else if (AA->onlyReadsMemory(C)) {
- Expression exp = create_expression(C);
+ Expression exp = createExpr(C);
uint32_t &e = expressionNumbering[exp];
if (!e) {
e = nextValueNumber++;
@@ -318,21 +391,21 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
}
for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
- uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
if (c_vn != cd_vn) {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
}
}
- uint32_t v = lookup_or_add(local_cdep);
+ uint32_t v = lookupOrAdd(local_cdep);
valueNumbering[C] = v;
return v;
}
// Non-local case.
- const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
MD->getNonLocalCallDependency(CallSite(C));
// FIXME: Move the checking logic to MemDep!
CallInst* cdep = nullptr;
@@ -372,15 +445,15 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
return nextValueNumber++;
}
for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
- uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
if (c_vn != cd_vn) {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
}
}
- uint32_t v = lookup_or_add(cdep);
+ uint32_t v = lookupOrAdd(cdep);
valueNumbering[C] = v;
return v;
@@ -391,11 +464,11 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
}
/// Returns true if a value number exists for the specified value.
-bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
/// lookup_or_add - Returns the value number for the specified value, assigning
/// it a new number if it did not have one before.
-uint32_t ValueTable::lookup_or_add(Value *V) {
+uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
if (VI != valueNumbering.end())
return VI->second;
@@ -409,7 +482,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
Expression exp;
switch (I->getOpcode()) {
case Instruction::Call:
- return lookup_or_add_call(cast<CallInst>(I));
+ return lookupOrAddCall(cast<CallInst>(I));
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -448,10 +521,10 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
case Instruction::ShuffleVector:
case Instruction::InsertValue:
case Instruction::GetElementPtr:
- exp = create_expression(I);
+ exp = createExpr(I);
break;
case Instruction::ExtractValue:
- exp = create_extractvalue_expression(cast<ExtractValueInst>(I));
+ exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
break;
default:
valueNumbering[V] = nextValueNumber;
@@ -466,7 +539,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
/// Returns the value number of the specified value. Fails if
/// the value has not yet been numbered.
-uint32_t ValueTable::lookup(Value *V) const {
+uint32_t GVN::ValueTable::lookup(Value *V) const {
DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
assert(VI != valueNumbering.end() && "Value not numbered?");
return VI->second;
@@ -476,30 +549,30 @@ uint32_t ValueTable::lookup(Value *V) const {
/// assigning it a new number if it did not have one before. Useful when
/// we deduced the result of a comparison, but don't immediately have an
/// instruction realizing that comparison to hand.
-uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
- Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS);
+uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
uint32_t& e = expressionNumbering[exp];
if (!e) e = nextValueNumber++;
return e;
}
/// Remove all entries from the ValueTable.
-void ValueTable::clear() {
+void GVN::ValueTable::clear() {
valueNumbering.clear();
expressionNumbering.clear();
nextValueNumber = 1;
}
/// Remove a value from the value numbering.
-void ValueTable::erase(Value *V) {
+void GVN::ValueTable::erase(Value *V) {
valueNumbering.erase(V);
}
/// verifyRemoved - Verify that the value is removed from all internal data
/// structures.
-void ValueTable::verifyRemoved(const Value *V) const {
+void GVN::ValueTable::verifyRemoved(const Value *V) const {
for (DenseMap<Value*, uint32_t>::const_iterator
I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
assert(I->first != V && "Inst still occurs in value numbering map!");
@@ -510,251 +583,26 @@ void ValueTable::verifyRemoved(const Value *V) const {
// GVN Pass
//===----------------------------------------------------------------------===//
-namespace {
- class GVN;
- struct AvailableValueInBlock {
- /// BB - The basic block in question.
- BasicBlock *BB;
- enum ValType {
- SimpleVal, // A simple offsetted value that is accessed.
- LoadVal, // A value produced by a load.
- MemIntrin, // A memory intrinsic which is loaded from.
- UndefVal // A UndefValue representing a value from dead block (which
- // is not yet physically removed from the CFG).
- };
-
- /// V - The value that is live out of the block.
- PointerIntPair<Value *, 2, ValType> Val;
-
- /// Offset - The byte offset in Val that is interesting for the load query.
- unsigned Offset;
-
- static AvailableValueInBlock get(BasicBlock *BB, Value *V,
- unsigned Offset = 0) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.Val.setPointer(V);
- Res.Val.setInt(SimpleVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
- unsigned Offset = 0) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.Val.setPointer(MI);
- Res.Val.setInt(MemIntrin);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
- unsigned Offset = 0) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.Val.setPointer(LI);
- Res.Val.setInt(LoadVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValueInBlock getUndef(BasicBlock *BB) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.Val.setPointer(nullptr);
- Res.Val.setInt(UndefVal);
- Res.Offset = 0;
- return Res;
- }
-
- bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
- bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
- bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
- bool isUndefValue() const { return Val.getInt() == UndefVal; }
-
- Value *getSimpleValue() const {
- assert(isSimpleValue() && "Wrong accessor");
- return Val.getPointer();
- }
-
- LoadInst *getCoercedLoadValue() const {
- assert(isCoercedLoadValue() && "Wrong accessor");
- return cast<LoadInst>(Val.getPointer());
- }
-
- MemIntrinsic *getMemIntrinValue() const {
- assert(isMemIntrinValue() && "Wrong accessor");
- return cast<MemIntrinsic>(Val.getPointer());
- }
-
- /// Emit code into this block to adjust the value defined here to the
- /// specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const;
- };
-
- class GVN : public FunctionPass {
- bool NoLoads;
- MemoryDependenceAnalysis *MD;
- DominatorTree *DT;
- const TargetLibraryInfo *TLI;
- AssumptionCache *AC;
- SetVector<BasicBlock *> DeadBlocks;
-
- ValueTable VN;
-
- /// A mapping from value numbers to lists of Value*'s that
- /// have that value number. Use findLeader to query it.
- struct LeaderTableEntry {
- Value *Val;
- const BasicBlock *BB;
- LeaderTableEntry *Next;
- };
- DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
- BumpPtrAllocator TableAllocator;
-
- // Block-local map of equivalent values to their leader, does not
- // propagate to any successors. Entries added mid-block are applied
- // to the remaining instructions in the block.
- SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap;
- SmallVector<Instruction*, 8> InstrsToErase;
-
- typedef SmallVector<NonLocalDepResult, 64> LoadDepVect;
- typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect;
- typedef SmallVector<BasicBlock*, 64> UnavailBlkVect;
-
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit GVN(bool noloads = false)
- : FunctionPass(ID), NoLoads(noloads), MD(nullptr) {
- initializeGVNPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- /// This removes the specified instruction from
- /// our various maps and marks it for deletion.
- void markInstructionForDeletion(Instruction *I) {
- VN.erase(I);
- InstrsToErase.push_back(I);
- }
-
- DominatorTree &getDominatorTree() const { return *DT; }
- AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
- MemoryDependenceAnalysis &getMemDep() const { return *MD; }
- private:
- /// Push a new Value to the LeaderTable onto the list for its value number.
- void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
- LeaderTableEntry &Curr = LeaderTable[N];
- if (!Curr.Val) {
- Curr.Val = V;
- Curr.BB = BB;
- return;
- }
-
- LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
- Node->Val = V;
- Node->BB = BB;
- Node->Next = Curr.Next;
- Curr.Next = Node;
- }
-
- /// Scan the list of values corresponding to a given
- /// value number, and remove the given instruction if encountered.
- void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
- LeaderTableEntry* Prev = nullptr;
- LeaderTableEntry* Curr = &LeaderTable[N];
-
- while (Curr && (Curr->Val != I || Curr->BB != BB)) {
- Prev = Curr;
- Curr = Curr->Next;
- }
-
- if (!Curr)
- return;
-
- if (Prev) {
- Prev->Next = Curr->Next;
- } else {
- if (!Curr->Next) {
- Curr->Val = nullptr;
- Curr->BB = nullptr;
- } else {
- LeaderTableEntry* Next = Curr->Next;
- Curr->Val = Next->Val;
- Curr->BB = Next->BB;
- Curr->Next = Next->Next;
- }
- }
- }
-
- // List of critical edges to be split between iterations.
- SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
-
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- if (!NoLoads)
- AU.addRequired<MemoryDependenceAnalysis>();
- AU.addRequired<AAResultsWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
-
- // Helper functions of redundant load elimination
- bool processLoad(LoadInst *L);
- bool processNonLocalLoad(LoadInst *L);
- bool processAssumeIntrinsic(IntrinsicInst *II);
- void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
- AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks);
- bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks);
-
- // Other helper routines
- bool processInstruction(Instruction *I);
- bool processBlock(BasicBlock *BB);
- void dump(DenseMap<uint32_t, Value*> &d);
- bool iterateOnFunction(Function &F);
- bool performPRE(Function &F);
- bool performScalarPRE(Instruction *I);
- bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
- unsigned int ValNo);
- Value *findLeader(const BasicBlock *BB, uint32_t num);
- void cleanupGlobalSets();
- void verifyRemoved(const Instruction *I) const;
- bool splitCriticalEdges();
- BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ);
- bool replaceOperandsWithConsts(Instruction *I) const;
- bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
- bool DominatesByEdge);
- bool processFoldableCondBr(BranchInst *BI);
- void addDeadBlock(BasicBlock *BB);
- void assignValNumForDeadCode();
- };
-
- char GVN::ID = 0;
-}
-
-// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoLoads) {
- return new GVN(NoLoads);
+PreservedAnalyses GVN::run(Function &F, AnalysisManager<Function> &AM) {
+ // FIXME: The order of evaluation of these 'getResult' calls is very
+ // significant! Re-ordering these variables will cause GVN, when run alone, to
+ // be less effective! We should fix memdep and basic-aa to not exhibit this
+ // behavior, but until then don't change the order here.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
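Since GVN::run above pulls AssumptionAnalysis, DominatorTreeAnalysis, TargetLibraryAnalysis, AAManager and MemoryDependenceAnalysis from the FunctionAnalysisManager, driving the new-PM GVN outside opt only needs those analyses registered. A minimal sketch, assuming PassBuilder's stock registerFunctionAnalyses() covers them (that helper is not part of this diff):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/GVN.h"

using namespace llvm;

// Hedged sketch: run the new-pass-manager GVN on a single function.
static void runGVN(Function &F) {
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM); // assumed to register the analyses
                                    // that GVN::run requests above
  FunctionPassManager FPM;
  FPM.addPass(GVN());
  PreservedAnalyses PA = FPM.run(F, FAM);
  (void)PA; // DominatorTreeAnalysis / GlobalsAA preserved on change,
            // everything preserved when nothing changed, per GVN::run.
}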
-INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
@@ -764,7 +612,6 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
-#endif
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -875,38 +722,45 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
IRBuilder<> &IRB,
const DataLayout &DL) {
- if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL))
- return nullptr;
+ assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+
+ if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
+ StoredVal = ConstantFoldConstantExpression(CExpr, DL);
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
- uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy);
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
// If the store and reload are the same size, we can always reuse it.
- if (StoreSize == LoadSize) {
+ if (StoredValSize == LoadedValSize) {
// Pointer to Pointer -> use bitcast.
if (StoredValTy->getScalarType()->isPointerTy() &&
- LoadedTy->getScalarType()->isPointerTy())
- return IRB.CreateBitCast(StoredVal, LoadedTy);
+ LoadedTy->getScalarType()->isPointerTy()) {
+ StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
+ }
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->getScalarType()->isPointerTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->getScalarType()->isPointerTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
- if (StoredValTy != TypeToCastTo)
- StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
+ }
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
+ if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
+ StoredVal = ConstantFoldConstantExpression(CExpr, DL);
return StoredVal;
}
@@ -914,7 +768,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// If the loaded value is smaller than the available value, then we can
// extract out a piece from it. If the available value is too small, then we
// can't do anything.
- assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
+ assert(StoredValSize >= LoadedValSize &&
+ "CanCoerceMustAliasedValueToLoad fail");
// Convert source pointers to integers, which can be manipulated.
if (StoredValTy->getScalarType()->isPointerTy()) {
@@ -924,29 +779,35 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
// Convert vectors and fp to integer, which can be manipulated.
if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
}
// If this is a big-endian system, we need to shift the value down to the low
// bits so that a truncate will work.
if (DL.isBigEndian()) {
- StoredVal = IRB.CreateLShr(StoredVal, StoreSize - LoadSize, "tmp");
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
+ DL.getTypeStoreSizeInBits(LoadedTy);
+ StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
}
// Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
- if (LoadedTy == NewIntTy)
- return StoredVal;
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
+ else
+ // Otherwise, bitcast.
+ StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
+ }
- // If the result is a pointer, inttoptr.
- if (LoadedTy->getScalarType()->isPointerTy())
- return IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
+ if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal))
+ StoredVal = ConstantFoldConstantExpression(CExpr, DL);
- // Otherwise, bitcast.
- return IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
+ return StoredVal;
}
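On a big-endian target the bytes a narrower load observes are the most significant bits of the wider stored integer, which is why the branch above shifts right by the difference in store sizes before truncating. A tiny arithmetic illustration of that shift (plain host C++, values chosen only for the example):

#include <cassert>
#include <cstdint>

// Hedged illustration of the big-endian adjustment above: an i8 load at the
// start of an i64 store reads the stored value's most significant byte on a
// big-endian machine, so shift right by (64 - 8) bits before truncating.
int main() {
  const uint64_t Stored = 0x1122334455667788ULL; // BE memory bytes: 11 22 .. 88
  const uint64_t ShiftAmt = 64 - 8;              // StoreBits - LoadBits
  const uint8_t Loaded = static_cast<uint8_t>(Stored >> ShiftAmt);
  assert(Loaded == 0x11 && "first byte in big-endian layout");
  return 0;
}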
/// This function is called when we have a
@@ -1067,10 +928,15 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+ unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
LoadBase, LoadOffs, LoadSize, DepLI);
if (Size == 0) return -1;
+ // Check non-obvious conditions enforced by MDA which we rely on for being
+ // able to materialize this potentially available value
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
}
@@ -1117,7 +983,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, DL))
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
return Offset;
return -1;
}
@@ -1173,9 +1039,9 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
const DataLayout &DL = SrcVal->getModule()->getDataLayout();
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
// widen SrcVal out to a larger load.
- unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- if (Offset+LoadSize > SrcValSize) {
+ if (Offset+LoadSize > SrcValStoreSize) {
assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
// If we have a load/load clobber an DepLI can be widened to cover this
@@ -1207,8 +1073,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
// system, we need to shift down to get the relevant bits.
Value *RV = NewLoad;
if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV,
- NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
RV = Builder.CreateTrunc(RV, SrcVal->getType());
SrcVal->replaceAllUsesWith(RV);
@@ -1279,7 +1144,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, DL);
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
}
@@ -1294,7 +1159,8 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (ValuesPerBlock.size() == 1 &&
gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
LI->getParent())) {
- assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
+ assert(!ValuesPerBlock[0].AV.isUndefValue() &&
+ "Dead BB dominate this block");
return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
}
@@ -1316,15 +1182,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
}
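When more than one block supplies a value, ConstructSSAForLoadSet hands the per-block materializations to SSAUpdater and lets it place the PHIs; the call at the end of the hunk above is the final step of that pattern. A minimal sketch of the SSAUpdater usage, where the helper and its argument list are placeholders rather than GVN code:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <utility>

using namespace llvm;

// Hedged sketch of the SSAUpdater pattern behind ConstructSSAForLoadSet:
// register one available value per block, then ask for the value at the
// load and let SSAUpdater insert whatever PHI nodes are required.
static Value *joinAvailableValues(
    LoadInst *LI, ArrayRef<std::pair<BasicBlock *, Value *>> Avail) {
  SSAUpdater Updater;
  Updater.Initialize(LI->getType(), LI->getName());
  for (const auto &BBAndVal : Avail)
    if (!Updater.HasValueForBlock(BBAndVal.first))
      Updater.AddAvailableValue(BBAndVal.first, BBAndVal.second);
  return Updater.GetValueInMiddleOfBlock(LI->getParent());
}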
-Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
- GVN &gvn) const {
+Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
+ Instruction *InsertPt,
+ GVN &gvn) const {
Value *Res;
Type *LoadTy = LI->getType();
const DataLayout &DL = LI->getModule()->getDataLayout();
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL);
+ Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
@@ -1335,16 +1202,15 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
if (Load->getType() == LoadTy && Offset == 0) {
Res = Load;
} else {
- Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(),
- gvn);
-
+ Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
+
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
<< *Res << '\n' << "\n\n\n");
}
} else if (isMemIntrinValue()) {
Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
- BB->getTerminator(), DL);
+ InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
<< *Res << '\n' << "\n\n\n");
@@ -1353,6 +1219,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
return UndefValue::get(LoadTy);
}
+ assert(Res && "failed to materialize?");
return Res;
}
@@ -1362,7 +1229,134 @@ static bool isLifetimeStart(const Instruction *Inst) {
return false;
}
-void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
+
+ assert((DepInfo.isDef() || DepInfo.isClobber()) &&
+ "expected a local dependence");
+ assert(LI->isUnordered() && "rules below are incorrect for ordered access");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ if (DepInfo.isClobber()) {
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
+ int Offset =
+ AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ if (Offset != -1) {
+ Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
+ return true;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the later with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+ // If this is a clobber and L is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
+ int Offset =
+ AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ Res = AvailableValue::getLoad(DepLI, Offset);
+ return true;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+ if (Address && !LI->isAtomic()) {
+ int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::getMI(DepMI, Offset);
+ return true;
+ }
+ }
+ }
+ // Nothing known about this clobber, have to be conservative
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';
+ );
+ return false;
+ }
+ assert(DepInfo.isDef() && "follows from above");
+
+ Instruction *DepInst = DepInfo.getInst();
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ Res = AvailableValue::get(UndefValue::get(LI->getType()));
+ return true;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
+ return true;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
+ // different types if we have to. If the stored value is larger or equal to
+ // the loaded value, we can reuse it.
+ if (S->getValueOperand()->getType() != LI->getType() &&
+ !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (S->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::get(S->getValueOperand());
+ return true;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ // If the stored value is larger or equal to the loaded value, we can reuse
+ // it.
+ if (LD->getType() != LI->getType() &&
+ !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LD->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::getLoad(LD);
+ return true;
+ }
+
+ // Unknown def - must be conservative
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';
+ );
+ return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
AvailValInBlkVect &ValuesPerBlock,
UnavailBlkVect &UnavailableBlocks) {
@@ -1371,7 +1365,6 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
// dependencies that produce an unknown value for the load (such as a call
// that could potentially clobber the load).
unsigned NumDeps = Deps.size();
- const DataLayout &DL = LI->getModule()->getDataLayout();
for (unsigned i = 0, e = NumDeps; i != e; ++i) {
BasicBlock *DepBB = Deps[i].getBB();
MemDepResult DepInfo = Deps[i].getResult();
@@ -1388,122 +1381,28 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
continue;
}
- if (DepInfo.isClobber()) {
- // The address being loaded in this non-local block may not be the same as
- // the pointer operand of the load if PHI translation occurs. Make sure
- // to consider the right address.
- Value *Address = Deps[i].getAddress();
-
- // If the dependence is to a store that writes to a superset of the bits
- // read by the load, we can extract the bits we need for the load from the
- // stored value.
- if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
- if (Address) {
- int Offset =
- AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
- if (Offset != -1) {
- ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
- DepSI->getValueOperand(),
- Offset));
- continue;
- }
- }
- }
-
- // Check to see if we have something like this:
- // load i32* P
- // load i8* (P+1)
- // if we have this, replace the later with an extraction from the former.
- if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
- // If this is a clobber and L is the first instruction in its block, then
- // we have the first instruction in the entry block.
- if (DepLI != LI && Address) {
- int Offset =
- AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
-
- if (Offset != -1) {
- ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
- Offset));
- continue;
- }
- }
- }
-
- // If the clobbering value is a memset/memcpy/memmove, see if we can
- // forward a value on from it.
- if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
- if (Address) {
- int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
- DepMI, DL);
- if (Offset != -1) {
- ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
- Offset));
- continue;
- }
- }
- }
-
- UnavailableBlocks.push_back(DepBB);
- continue;
- }
-
- // DepInfo.isDef() here
-
- Instruction *DepInst = DepInfo.getInst();
-
- // Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- // Loading immediately after lifetime begin -> undef.
- isLifetimeStart(DepInst)) {
- ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
- UndefValue::get(LI->getType())));
- continue;
- }
-
- // Loading from calloc (which zero initializes memory) -> zero
- if (isCallocLikeFn(DepInst, TLI)) {
- ValuesPerBlock.push_back(AvailableValueInBlock::get(
- DepBB, Constant::getNullValue(LI->getType())));
- continue;
- }
-
- if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
- // Reject loads and stores that are to the same address but are of
- // different types if we have to.
- if (S->getValueOperand()->getType() != LI->getType()) {
- // If the stored value is larger or equal to the loaded value, we can
- // reuse it.
- if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
- LI->getType(), DL)) {
- UnavailableBlocks.push_back(DepBB);
- continue;
- }
- }
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
+ // Subtlety: because we know this was a non-local dependency, we know
+ // it's safe to materialize anywhere between the instruction within
+ // DepInfo and the end of its block.
ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
- S->getValueOperand()));
- continue;
- }
-
- if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
- // If the types mismatch and we can't handle it, reject reuse of the load.
- if (LD->getType() != LI->getType()) {
- // If the stored value is larger or equal to the loaded value, we can
- // reuse it.
- if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) {
- UnavailableBlocks.push_back(DepBB);
- continue;
- }
- }
- ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
- continue;
+ std::move(AV)));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
}
-
- UnavailableBlocks.push_back(DepBB);
}
+
+ assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
+ "post condition violation");
}
-bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
UnavailBlkVect &UnavailableBlocks) {
// Okay, we have *some* definitions of the value. This means that the value
// is available in some of our (transitive) predecessors. Lets think about
@@ -1661,16 +1560,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// parent's availability map. However, in doing so, we risk getting into
// ordering issues. If a block hasn't been processed yet, we would be
// marking a value as AVAIL-IN, which isn't what we intend.
- VN.lookup_or_add(I);
+ VN.lookupOrAdd(I);
}
for (const auto &PredLoad : PredLoads) {
BasicBlock *UnavailablePred = PredLoad.first;
Value *LoadPtr = PredLoad.second;
- Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
- LI->getAlignment(),
- UnavailablePred->getTerminator());
+ auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre",
+ LI->isVolatile(), LI->getAlignment(),
+ LI->getOrdering(), LI->getSynchScope(),
+ UnavailablePred->getTerminator());
// Transfer the old load's AA tags to the new load.
AAMDNodes Tags;
@@ -1682,6 +1582,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
+ NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
// Transfer DebugLoc.
NewLoad->setDebugLoc(LI->getDebugLoc());
@@ -1846,30 +1748,29 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
}
static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- BinaryOperator *Op = dyn_cast<BinaryOperator>(I);
- BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl);
- if (Op && ReplOp)
- ReplOp->andIRFlags(Op);
-
- if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarantees the execution of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
-
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- static const unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group};
- combineMetadata(ReplInst, I, KnownIDs);
- }
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
}
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
@@ -1883,7 +1784,8 @@ bool GVN::processLoad(LoadInst *L) {
if (!MD)
return false;
- if (!L->isSimple())
+ // This code hasn't been audited for ordered or volatile memory access
+ if (!L->isUnordered())
return false;
if (L->use_empty()) {
@@ -1893,84 +1795,14 @@ bool GVN::processLoad(LoadInst *L) {
// ... to a pointer that has been loaded from before...
MemDepResult Dep = MD->getDependency(L);
- const DataLayout &DL = L->getModule()->getDataLayout();
-
- // If we have a clobber and target data is around, see if this is a clobber
- // that we can fix up through code synthesis.
- if (Dep.isClobber()) {
- // Check to see if we have something like this:
- // store i32 123, i32* %P
- // %A = bitcast i32* %P to i8*
- // %B = gep i8* %A, i32 1
- // %C = load i8* %B
- //
- // We could do that by recognizing if the clobber instructions are obviously
- // a common base + constant offset, and if the previous store (or memset)
- // completely covers this load. This sort of thing can happen in bitfield
- // access code.
- Value *AvailVal = nullptr;
- if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
- int Offset = AnalyzeLoadFromClobberingStore(
- L->getType(), L->getPointerOperand(), DepSI);
- if (Offset != -1)
- AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
- L->getType(), L, DL);
- }
-
- // Check to see if we have something like this:
- // load i32* P
- // load i8* (P+1)
- // if we have this, replace the later with an extraction from the former.
- if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) {
- // If this is a clobber and L is the first instruction in its block, then
- // we have the first instruction in the entry block.
- if (DepLI == L)
- return false;
-
- int Offset = AnalyzeLoadFromClobberingLoad(
- L->getType(), L->getPointerOperand(), DepLI, DL);
- if (Offset != -1)
- AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
- }
-
- // If the clobbering value is a memset/memcpy/memmove, see if we can forward
- // a value on from it.
- if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
- int Offset = AnalyzeLoadFromClobberingMemInst(
- L->getType(), L->getPointerOperand(), DepMI, DL);
- if (Offset != -1)
- AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL);
- }
-
- if (AvailVal) {
- DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
- << *AvailVal << '\n' << *L << "\n\n\n");
-
- // Replace the load!
- L->replaceAllUsesWith(AvailVal);
- if (AvailVal->getType()->getScalarType()->isPointerTy())
- MD->invalidateCachedPointerInfo(AvailVal);
- markInstructionForDeletion(L);
- ++NumGVNLoad;
- return true;
- }
-
- // If the value isn't available, don't do anything!
- DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load ";
- L->printAsOperand(dbgs());
- Instruction *I = Dep.getInst();
- dbgs() << " is clobbered by " << *I << '\n';
- );
- return false;
- }
// If it is defined in another block, try harder.
if (Dep.isNonLocal())
return processNonLocalLoad(L);
- if (!Dep.isDef()) {
+ // Only handle the local case below
+ if (!Dep.isDef() && !Dep.isClobber()) {
+ // This might be a NonFuncLocal or an Unknown
DEBUG(
// fast print dep, using operator<< on instruction is too slow.
dbgs() << "GVN: load ";
@@ -1980,86 +1812,18 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
- Instruction *DepInst = Dep.getInst();
- if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
- Value *StoredVal = DepSI->getValueOperand();
-
- // The store and load are to a must-aliased pointer, but they may not
- // actually have the same type. See if we know how to reuse the stored
- // value (depending on its type).
- if (StoredVal->getType() != L->getType()) {
- IRBuilder<> Builder(L);
- StoredVal =
- CoerceAvailableValueToLoadType(StoredVal, L->getType(), Builder, DL);
- if (!StoredVal)
- return false;
-
- DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
- << '\n' << *L << "\n\n\n");
- }
-
- // Remove it!
- L->replaceAllUsesWith(StoredVal);
- if (StoredVal->getType()->getScalarType()->isPointerTy())
- MD->invalidateCachedPointerInfo(StoredVal);
- markInstructionForDeletion(L);
- ++NumGVNLoad;
- return true;
- }
-
- if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
- Value *AvailableVal = DepLI;
-
- // The loads are of a must-aliased pointer, but they may not actually have
- // the same type. See if we know how to reuse the previously loaded value
- // (depending on its type).
- if (DepLI->getType() != L->getType()) {
- IRBuilder<> Builder(L);
- AvailableVal =
- CoerceAvailableValueToLoadType(DepLI, L->getType(), Builder, DL);
- if (!AvailableVal)
- return false;
-
- DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
- << "\n" << *L << "\n\n\n");
- }
-
- // Remove it!
- patchAndReplaceAllUsesWith(L, AvailableVal);
- if (DepLI->getType()->getScalarType()->isPointerTy())
- MD->invalidateCachedPointerInfo(DepLI);
- markInstructionForDeletion(L);
- ++NumGVNLoad;
- return true;
- }
-
- // If this load really doesn't depend on anything, then we must be loading an
- // undef value. This can happen when loading for a fresh allocation with no
- // intervening stores, for example.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
- L->replaceAllUsesWith(UndefValue::get(L->getType()));
- markInstructionForDeletion(L);
- ++NumGVNLoad;
- return true;
- }
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
+ Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
- // If this load occurs either right after a lifetime begin,
- // then the loaded value is undefined.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- L->replaceAllUsesWith(UndefValue::get(L->getType()));
- markInstructionForDeletion(L);
- ++NumGVNLoad;
- return true;
- }
- }
-
- // If this load follows a calloc (which zero initializes memory),
- // then the loaded value is zero
- if (isCallocLikeFn(DepInst, TLI)) {
- L->replaceAllUsesWith(Constant::getNullValue(L->getType()));
+ // Replace the load!
+ patchAndReplaceAllUsesWith(L, AvailableValue);
markInstructionForDeletion(L);
++NumGVNLoad;
+ // Tell MDA to reexamine the reused pointer since we might have more
+ // information after forwarding it.
+ if (MD && AvailableValue->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(AvailableValue);
return true;
}
@@ -2105,9 +1869,8 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
// GVN runs all such loops have preheaders, which means that Dst will have
// been changed to have only one predecessor, namely Src.
const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- const BasicBlock *Src = E.getStart();
- assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
- (void)Src;
+ assert((!Pred || Pred == E.getStart()) &&
+ "No edge between these basic blocks!");
return Pred != nullptr;
}
@@ -2133,7 +1896,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
/// The given values are known to be equal in every block
/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
/// 'RHS' everywhere in the scope. Returns whether a change was made.
-/// If DominatesByEdge is false, then it means that it is dominated by Root.End.
+/// If DominatesByEdge is false, then it means that we will propagate the RHS
+/// value starting from the end of Root.Start.
bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
bool DominatesByEdge) {
SmallVector<std::pair<Value*, Value*>, 4> Worklist;
@@ -2141,7 +1905,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
bool Changed = false;
// For speed, compute a conservative fast approximation to
// DT->dominates(Root, Root.getEnd());
- bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+ const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
while (!Worklist.empty()) {
std::pair<Value*, Value*> Item = Worklist.pop_back_val();
@@ -2164,12 +1928,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// right-hand side, ensure the longest lived term is on the right-hand side,
// so the shortest lived term will be replaced by the longest lived.
// This tends to expose more simplifications.
- uint32_t LVN = VN.lookup_or_add(LHS);
+ uint32_t LVN = VN.lookupOrAdd(LHS);
if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
(isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
// Move the 'oldest' value to the right-hand side, using the value number
// as a proxy for age.
- uint32_t RVN = VN.lookup_or_add(RHS);
+ uint32_t RVN = VN.lookupOrAdd(RHS);
if (LVN < RVN) {
std::swap(LHS, RHS);
LVN = RVN;
@@ -2195,7 +1959,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
unsigned NumReplacements =
DominatesByEdge
? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
- : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd());
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
Changed |= NumReplacements > 0;
NumGVNEqProp += NumReplacements;
@@ -2245,7 +2009,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// Floating point -0.0 and 0.0 compare equal, so we can only
// propagate values if we know that we have a constant and that
// its value is non-zero.
-
+
// FIXME: We should do this optimization if 'no signed zeros' is
// applicable via an instruction-level fast-math-flag or some other
// indicator that relaxed FP semantics are being used.
@@ -2253,7 +2017,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
Worklist.push_back(std::make_pair(Op0, Op1));
}
-
+
// If "A >= B" is known true, replace "A < B" with false everywhere.
CmpInst::Predicate NotPred = Cmp->getInversePredicate();
Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
@@ -2261,7 +2025,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// out the value number that it would have and use that to find an
// appropriate instruction (if any).
uint32_t NextNum = VN.getNextUnusedValueNumber();
- uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
// If the number we were assigned was brand new then there is no point in
// looking for an instruction realizing it: there cannot be one!
if (Num < NextNum) {
@@ -2271,7 +2035,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
DominatesByEdge
? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
: replaceDominatedUsesWith(NotCmp, NotVal, *DT,
- Root.getEnd());
+ Root.getStart());
Changed |= NumReplacements > 0;
NumGVNEqProp += NumReplacements;
}
@@ -2303,12 +2067,21 @@ bool GVN::processInstruction(Instruction *I) {
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
const DataLayout &DL = I->getModule()->getDataLayout();
if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
- I->replaceAllUsesWith(V);
- if (MD && V->getType()->getScalarType()->isPointerTy())
- MD->invalidateCachedPointerInfo(V);
- markInstructionForDeletion(I);
- ++NumGVNSimpl;
- return true;
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ markInstructionForDeletion(I);
+ Changed = true;
+ }
+ if (Changed) {
+ if (MD && V->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(V);
+ ++NumGVNSimpl;
+ return true;
+ }
}
if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
@@ -2319,7 +2092,7 @@ bool GVN::processInstruction(Instruction *I) {
if (processLoad(LI))
return true;
- unsigned Num = VN.lookup_or_add(LI);
+ unsigned Num = VN.lookupOrAdd(LI);
addToLeaderTable(Num, LI, LI->getParent());
return false;
}
@@ -2383,7 +2156,7 @@ bool GVN::processInstruction(Instruction *I) {
return false;
uint32_t NextNum = VN.getNextUnusedValueNumber();
- unsigned Num = VN.lookup_or_add(I);
+ unsigned Num = VN.lookupOrAdd(I);
// Allocations are always uniquely numbered, so we can save time and memory
// by fast failing them.
@@ -2422,18 +2195,16 @@ bool GVN::processInstruction(Instruction *I) {
}
/// runOnFunction - This is the main transformation entry point for a function.
-bool GVN::runOnFunction(Function& F) {
- if (skipOptnoneFunction(F))
- return false;
-
- if (!NoLoads)
- MD = &getAnalysis<MemoryDependenceAnalysis>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults());
- VN.setMemDep(MD);
+bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD) {
+ AC = &RunAC;
+ DT = &RunDT;
VN.setDomTree(DT);
+ TLI = &RunTLI;
+ VN.setAliasAnalysis(&RunAA);
+ MD = RunMD;
+ VN.setMemDep(MD);
bool Changed = false;
bool ShouldContinue = true;
@@ -2476,7 +2247,7 @@ bool GVN::runOnFunction(Function& F) {
cleanupGlobalSets();
// Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
- // iteration.
+ // iteration.
DeadBlocks.clear();
return Changed;
@@ -2576,8 +2347,6 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
}
bool GVN::performScalarPRE(Instruction *CurInst) {
- SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
-
if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
@@ -2608,8 +2377,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
unsigned NumWithout = 0;
BasicBlock *PREPred = nullptr;
BasicBlock *CurrentBlock = CurInst->getParent();
- predMap.clear();
+ SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
for (BasicBlock *P : predecessors(CurrentBlock)) {
// We're not interested in PRE where the block is its
// own predecessor, or in blocks with predecessors
@@ -2702,7 +2471,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
DEBUG(verifyRemoved(CurInst));
CurInst->eraseFromParent();
++NumGVNInstr;
-
+
return true;
}
@@ -2825,7 +2594,7 @@ void GVN::addDeadBlock(BasicBlock *BB) {
SmallVector<BasicBlock *, 8> Dom;
DT->getDescendants(D, Dom);
DeadBlocks.insert(Dom.begin(), Dom.end());
-
+
// Figure out the dominance-frontier(D).
for (BasicBlock *B : Dom) {
for (BasicBlock *S : successors(B)) {
@@ -2883,13 +2652,13 @@ void GVN::addDeadBlock(BasicBlock *BB) {
// If the given branch is recognized as a foldable branch (i.e. conditional
// branch with constant condition), it will perform following analyses and
// transformation.
-// 1) If the dead out-coming edge is a critical-edge, split it. Let
+// 1) If the dead out-coming edge is a critical-edge, split it. Let
// R be the target of the dead out-coming edge.
// 1) Identify the set of dead blocks implied by the branch's dead outcoming
// edge. The result of this step will be {X| X is dominated by R}
// 2) Identify those blocks which have at least one dead predecessor. The
// result of this step will be dominance-frontier(R).
-// 3) Update the PHIs in DF(R) by replacing the operands corresponding to
+// 3) Update the PHIs in DF(R) by replacing the operands corresponding to
// dead blocks with "UndefVal" in the hope that these PHIs will be optimized away.
//
// Return true iff *NEW* dead code is found.
@@ -2905,8 +2674,8 @@ bool GVN::processFoldableCondBr(BranchInst *BI) {
if (!Cond)
return false;
- BasicBlock *DeadRoot = Cond->getZExtValue() ?
- BI->getSuccessor(1) : BI->getSuccessor(0);
+ BasicBlock *DeadRoot =
+ Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
if (DeadBlocks.count(DeadRoot))
return false;
@@ -2924,8 +2693,62 @@ bool GVN::processFoldableCondBr(BranchInst *BI) {
void GVN::assignValNumForDeadCode() {
for (BasicBlock *BB : DeadBlocks) {
for (Instruction &Inst : *BB) {
- unsigned ValNum = VN.lookup_or_add(&Inst);
+ unsigned ValNum = VN.lookupOrAdd(&Inst);
addToLeaderTable(ValNum, &Inst, BB);
}
}
}
+
+class llvm::gvn::GVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit GVNLegacyPass(bool NoLoads = false)
+ : FunctionPass(ID), NoLoads(NoLoads) {
+ initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return Impl.runImpl(
+ F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ NoLoads ? nullptr
+ : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (!NoLoads)
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+private:
+ bool NoLoads;
+ GVN Impl;
+};
+
+char GVNLegacyPass::ID = 0;
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+ return new GVNLegacyPass(NoLoads);
+}
+
+INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
new file mode 100644
index 000000000000..cce1db3874b7
--- /dev/null
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -0,0 +1,825 @@
+//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists expressions from branches to a common dominator. It uses
+// GVN (global value numbering) to discover expressions computing the same
+// values. The primary goal is to reduce the code size, and in some cases to
+// shorten the critical path (by exposing more ILP).
+// Hoisting may hurt performance in some cases. To mitigate that, hoisting
+// is disabled in the following cases:
+// 1. Scalars across calls.
+// 2. GEPs when the corresponding load/store cannot be hoisted.
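+//
+// A purely illustrative sketch of the transformation (not a test case from
+// this patch; the names are made up): given
+//
+//   if (c) { a = *p + 1; ... } else { b = *p + 1; ... }
+//
+// both branches compute the same value number for '*p + 1', so the load and
+// the add can be hoisted to the common dominator:
+//
+//   t = *p + 1;
+//   if (c) { a = t; ... } else { b = t; ... }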
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-hoist"
+
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
+STATISTIC(NumLoadsRemoved, "Number of loads removed");
+STATISTIC(NumStoresHoisted, "Number of stores hoisted");
+STATISTIC(NumStoresRemoved, "Number of stores removed");
+STATISTIC(NumCallsHoisted, "Number of calls hoisted");
+STATISTIC(NumCallsRemoved, "Number of calls removed");
+
+static cl::opt<int>
+ MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
+ cl::desc("Max number of instructions to hoist "
+ "(default unlimited = -1)"));
+static cl::opt<int> MaxNumberOfBBSInPath(
+ "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
+ cl::desc("Max number of basic blocks on the path between "
+ "hoisting locations (default = 4, unlimited = -1)"));
+
+namespace {
+
+// Provides a sorting function based on the execution order of two instructions.
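+// For instance (illustrative), if DFSNumber maps the parent block of A to 1
+// and the parent block of B to 3, then A is considered to execute before B
+// and sorts first; ties within a single block fall back to the order of the
+// instructions in that block.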
+struct SortByDFSIn {
+private:
+ DenseMap<const BasicBlock *, unsigned> &DFSNumber;
+
+public:
+ SortByDFSIn(DenseMap<const BasicBlock *, unsigned> &D) : DFSNumber(D) {}
+
+ // Returns true when A executes before B.
+ bool operator()(const Instruction *A, const Instruction *B) const {
+ // FIXME: libc++ has a std::sort() algorithm that will call the compare
+ // function on the same element. Once PR20837 is fixed and some more years
+ // pass by and all the buildbots have moved to a corrected std::sort(),
+ // enable the following assert:
+ //
+ // assert(A != B);
+
+ const BasicBlock *BA = A->getParent();
+ const BasicBlock *BB = B->getParent();
+ unsigned NA = DFSNumber[BA];
+ unsigned NB = DFSNumber[BB];
+ if (NA < NB)
+ return true;
+ if (NA == NB) {
+ // Sort them in the order they occur in the same basic block.
+ BasicBlock::const_iterator AI(A), BI(B);
+ return std::distance(AI, BI) < 0;
+ }
+ return false;
+ }
+};
+
+// A map from a pair of VNs to all the instructions with those VNs.
+typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>>
+ VNtoInsns;
+// An invalid value number, used when inserting a single value number into
+// VNtoInsns.
+enum : unsigned { InvalidVN = ~2U };
+
+// Records all scalar instructions that are candidates for code hoisting.
+class InsnInfo {
+ VNtoInsns VNtoScalars;
+
+public:
+ // Inserts I and its value number in VNtoScalars.
+ void insert(Instruction *I, GVN::ValueTable &VN) {
+ // Scalar instruction.
+ unsigned V = VN.lookupOrAdd(I);
+ VNtoScalars[{V, InvalidVN}].push_back(I);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoScalars; }
+};
+
+// Records all load instructions that are candidates for code hoisting.
+class LoadInfo {
+ VNtoInsns VNtoLoads;
+
+public:
+ // Insert Load and the value number of its memory address in VNtoLoads.
+ void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ if (Load->isSimple()) {
+ unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
+ VNtoLoads[{V, InvalidVN}].push_back(Load);
+ }
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoLoads; }
+};
+
+// Records all store instructions that are candidates for code hoisting.
+class StoreInfo {
+ VNtoInsns VNtoStores;
+
+public:
+ // Insert the Store and a hash number of the store address and the stored
+ // value in VNtoStores.
+ void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ if (!Store->isSimple())
+ return;
+ // Hash the store address and the stored value.
+ Value *Ptr = Store->getPointerOperand();
+ Value *Val = Store->getValueOperand();
+ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoStores; }
+};
+
+// Records all call instructions that are candidates for code hoisting.
+class CallInfo {
+ VNtoInsns VNtoCallsScalars;
+ VNtoInsns VNtoCallsLoads;
+ VNtoInsns VNtoCallsStores;
+
+public:
+ // Insert Call and its value numbering in one of the VNtoCalls* containers.
+ void insert(CallInst *Call, GVN::ValueTable &VN) {
+ // A call that doesNotAccessMemory is handled as a Scalar,
+ // onlyReadsMemory will be handled as a Load instruction,
+ // all other calls will be handled as stores.
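+ // For example (illustrative), a call to a function marked 'readnone'
+ // (doesNotAccessMemory) is recorded with the scalars, a 'readonly' call
+ // (onlyReadsMemory) with the loads, and any other call with the stores.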
+ unsigned V = VN.lookupOrAdd(Call);
+ auto Entry = std::make_pair(V, InvalidVN);
+
+ if (Call->doesNotAccessMemory())
+ VNtoCallsScalars[Entry].push_back(Call);
+ else if (Call->onlyReadsMemory())
+ VNtoCallsLoads[Entry].push_back(Call);
+ else
+ VNtoCallsStores[Entry].push_back(Call);
+ }
+
+ const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
+
+ const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
+
+ const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
+};
+
+typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet;
+typedef SmallVector<Instruction *, 4> SmallVecInsn;
+typedef SmallVectorImpl<Instruction *> SmallVecImplInsn;
+
+// This pass hoists common computations across branches sharing a common
+// dominator. The primary goal is to reduce the code size, and in some
+// cases to shorten the critical path (by exposing more ILP).
+class GVNHoist {
+public:
+ GVN::ValueTable VN;
+ DominatorTree *DT;
+ AliasAnalysis *AA;
+ MemoryDependenceResults *MD;
+ const bool OptForMinSize;
+ DenseMap<const BasicBlock *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
+ MemorySSA *MSSA;
+ int HoistedCtr;
+
+ enum InsKind { Unknown, Scalar, Load, Store };
+
+ GVNHoist(DominatorTree *Dt, AliasAnalysis *Aa, MemoryDependenceResults *Md,
+ bool OptForMinSize)
+ : DT(Dt), AA(Aa), MD(Md), OptForMinSize(OptForMinSize), HoistedCtr(0) {}
+
+ // Return true when there is exception handling in BB.
+ bool hasEH(const BasicBlock *BB) {
+ auto It = BBSideEffects.find(BB);
+ if (It != BBSideEffects.end())
+ return It->second;
+
+ if (BB->isEHPad() || BB->hasAddressTaken()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ if (BB->getTerminator()->mayThrow()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ BBSideEffects[BB] = false;
+ return false;
+ }
+
+ // Return true when all paths from A to the end of the function pass through
+ // either B or C.
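+ // For example (illustrative), with edges A -> {B, C}, B -> D and C -> D,
+ // every path from A to the function exit passes through B or C, so
+ // hoistingFromAllPaths(A, B, C) is true; if A had an extra successor E that
+ // reaches the exit without going through B or C, hoisting from all paths
+ // would no longer hold and the function is expected to return false.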
+ bool hoistingFromAllPaths(const BasicBlock *A, const BasicBlock *B,
+ const BasicBlock *C) {
+ // We fully copy the WL in order to be able to remove items from it.
+ SmallPtrSet<const BasicBlock *, 2> WL;
+ WL.insert(B);
+ WL.insert(C);
+
+ for (auto It = df_begin(A), E = df_end(A); It != E;) {
+ // There exists a path from A to the exit of the function if we are still
+ // iterating in DF traversal and we removed all blocks from the work
+ // list.
+ if (WL.empty())
+ return false;
+
+ const BasicBlock *BB = *It;
+ if (WL.erase(BB)) {
+ // Stop DFS traversal when BB is in the work list.
+ It.skipChildren();
+ continue;
+ }
+
+ // Check for end of function, calls that do not return, etc.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+ return false;
+
+ // Increment DFS traversal when not skipping children.
+ ++It;
+ }
+
+ return true;
+ }
+
+ // Return true when I1 appears before I2 in the instructions of BB.
+ bool firstInBB(BasicBlock *BB, const Instruction *I1, const Instruction *I2) {
+ for (Instruction &I : *BB) {
+ if (&I == I1)
+ return true;
+ if (&I == I2)
+ return false;
+ }
+
+ llvm_unreachable("I1 and I2 not found in BB");
+ }
+ // Return true when there are users of Def in BB.
+ bool hasMemoryUseOnPath(MemoryAccess *Def, const BasicBlock *BB,
+ const Instruction *OldPt) {
+ const BasicBlock *DefBB = Def->getBlock();
+ const BasicBlock *OldBB = OldPt->getParent();
+
+ for (User *U : Def->users())
+ if (auto *MU = dyn_cast<MemoryUse>(U)) {
+ BasicBlock *UBB = MU->getBlock();
+ // Only analyze uses in BB.
+ if (BB != UBB)
+ continue;
+
+ // A use in the same block as the Def is on the path.
+ if (UBB == DefBB) {
+ assert(MSSA->locallyDominates(Def, MU) && "def not dominating use");
+ return true;
+ }
+
+ if (UBB != OldBB)
+ return true;
+
+ // It is only harmful to hoist when the use is before OldPt.
+ if (firstInBB(UBB, MU->getMemoryInst(), OldPt))
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return true when there is exception handling or there are loads of memory
+ // Def between OldPt and NewPt.
+
+ // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
+ // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+ // initialized to -1 which is unlimited.
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, const Instruction *OldPt,
+ MemoryAccess *Def, int &NBBsOnAllPaths) {
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = OldPt->getParent();
+ assert(DT->dominates(NewBB, OldBB) && "invalid path");
+ assert(DT->dominates(Def->getBlock(), NewBB) &&
+ "def does not dominate new hoisting point");
+
+ // Walk all basic blocks reachable in depth-first iteration on the inverse
+ // CFG from OldBB to NewBB. These blocks are all the blocks that may be
+ // executed between the execution of NewBB and OldBB. Hoisting an expression
+ // from OldBB into NewBB has to be safe on all execution paths.
+ for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
+ if (*I == NewBB) {
+ // Stop traversal when reaching HoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(*I))
+ return true;
+
+ // Check that we do not move a store past loads.
+ if (hasMemoryUseOnPath(Def, *I, OldPt))
+ return true;
+
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
+
+ // Return true when there is exception handling between HoistPt and BB.
+ // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
+ // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+ // initialized to -1 which is unlimited.
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+ int &NBBsOnAllPaths) {
+ assert(DT->dominates(HoistPt, BB) && "Invalid path");
+
+ // Walk all basic blocks reachable in depth-first iteration on
+ // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
+ // blocks that may be executed between the execution of NewHoistPt and
+ // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
+ // on all execution paths.
+ for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
+ if (*I == HoistPt) {
+ // Stop traversal when reaching NewHoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(*I))
+ return true;
+
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
+
+ // Return true when it is safe to hoist a memory load or store U from OldPt
+ // to NewPt.
+ bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
+ MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
+
+ // In place hoisting is safe.
+ if (NewPt == OldPt)
+ return true;
+
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *UBB = U->getBlock();
+
+ // Check for dependences on the Memory SSA.
+ MemoryAccess *D = U->getDefiningAccess();
+ BasicBlock *DBB = D->getBlock();
+ if (DT->properlyDominates(NewBB, DBB))
+ // Cannot move the load or store to NewBB above its definition in DBB.
+ return false;
+
+ if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
+ if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
+ if (firstInBB(DBB, NewPt, UD->getMemoryInst()))
+ // Cannot move the load or store to NewPt above its definition in D.
+ return false;
+
+ // Check for unsafe hoistings due to side effects.
+ if (K == InsKind::Store) {
+ if (hasEHOrLoadsOnPath(NewPt, OldPt, D, NBBsOnAllPaths))
+ return false;
+ } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
+ return false;
+
+ if (UBB == NewBB) {
+ if (DT->properlyDominates(DBB, NewBB))
+ return true;
+ assert(UBB == DBB);
+ assert(MSSA->locallyDominates(D, U));
+ }
+
+ // No side effects: it is safe to hoist.
+ return true;
+ }
+
+ // Return true when it is safe to hoist scalar instructions from BB1 and BB2
+ // to HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB1,
+ const BasicBlock *BB2, int &NBBsOnAllPaths) {
+ // Check that the hoisted expression is needed on all paths. When HoistBB
+ // already contains an instruction to be hoisted, the expression is needed
+ // on all paths. Enable scalar hoisting at -Oz as it is safe to hoist
+ // scalars to a place where they are partially needed.
+ if (!OptForMinSize && BB1 != HoistBB &&
+ !hoistingFromAllPaths(HoistBB, BB1, BB2))
+ return false;
+
+ if (hasEHOnPath(HoistBB, BB1, NBBsOnAllPaths) ||
+ hasEHOnPath(HoistBB, BB2, NBBsOnAllPaths))
+ return false;
+
+ // Safe to hoist scalars from BB1 and BB2 to HoistBB.
+ return true;
+ }
+
+ // Each element of a hoisting list contains the basic block to hoist to and
+ // a list of instructions to be hoisted.
+ typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo;
+ typedef SmallVector<HoistingPointInfo, 4> HoistingPointList;
+
+ // Partition InstructionsToHoist into a set of candidates which can share a
+ // common hoisting point. The partitions are collected in HPL. K indicates
+ // whether the instructions in InstructionsToHoist are scalars, loads, or
+ // stores.
+ void partitionCandidates(SmallVecImplInsn &InstructionsToHoist,
+ HoistingPointList &HPL, InsKind K) {
+ // No need to sort for two instructions.
+ if (InstructionsToHoist.size() > 2) {
+ SortByDFSIn Pred(DFSNumber);
+ std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred);
+ }
+
+ int NBBsOnAllPaths = MaxNumberOfBBSInPath;
+
+ SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
+ SmallVecImplInsn::iterator Start = II;
+ Instruction *HoistPt = *II;
+ BasicBlock *HoistBB = HoistPt->getParent();
+ MemoryUseOrDef *UD;
+ if (K != InsKind::Scalar)
+ UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(HoistPt));
+
+ for (++II; II != InstructionsToHoist.end(); ++II) {
+ Instruction *Insn = *II;
+ BasicBlock *BB = Insn->getParent();
+ BasicBlock *NewHoistBB;
+ Instruction *NewHoistPt;
+
+ if (BB == HoistBB) {
+ NewHoistBB = HoistBB;
+ NewHoistPt = firstInBB(BB, Insn, HoistPt) ? Insn : HoistPt;
+ } else {
+ NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
+ if (NewHoistBB == BB)
+ NewHoistPt = Insn;
+ else if (NewHoistBB == HoistBB)
+ NewHoistPt = HoistPt;
+ else
+ NewHoistPt = NewHoistBB->getTerminator();
+ }
+
+ if (K == InsKind::Scalar) {
+ if (safeToHoistScalar(NewHoistBB, HoistBB, BB, NBBsOnAllPaths)) {
+ // Extend HoistPt to NewHoistPt.
+ HoistPt = NewHoistPt;
+ HoistBB = NewHoistBB;
+ continue;
+ }
+ } else {
+ // When NewHoistBB already contains an instruction to be hoisted, the
+ // expression is needed on all paths.
+ // Check that the hoisted expression is needed on all paths: it is
+ // unsafe to hoist loads to a place where there may be a path not
+ // loading from the same address: for instance there may be a branch on
+ // which the address of the load may not be initialized.
+ if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
+ hoistingFromAllPaths(NewHoistBB, HoistBB, BB)) &&
+ // Also check that it is safe to move the load or store from HoistPt
+ // to NewHoistPt, and from Insn to NewHoistPt.
+ safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NBBsOnAllPaths) &&
+ safeToHoistLdSt(NewHoistPt, Insn,
+ cast<MemoryUseOrDef>(MSSA->getMemoryAccess(Insn)),
+ K, NBBsOnAllPaths)) {
+ // Extend HoistPt to NewHoistPt.
+ HoistPt = NewHoistPt;
+ HoistBB = NewHoistBB;
+ continue;
+ }
+ }
+
+ // At this point it is not safe to extend the current hoisting to
+ // NewHoistPt: save the hoisting list so far.
+ if (std::distance(Start, II) > 1)
+ HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+
+ // Start over from BB.
+ Start = II;
+ if (K != InsKind::Scalar)
+ UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(*Start));
+ HoistPt = Insn;
+ HoistBB = BB;
+ NBBsOnAllPaths = MaxNumberOfBBSInPath;
+ }
+
+ // Save the last partition.
+ if (std::distance(Start, II) > 1)
+ HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+ }
+
+ // Initialize HPL from Map.
+ void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
+ InsKind K) {
+ for (const auto &Entry : Map) {
+ if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold)
+ return;
+
+ const SmallVecInsn &V = Entry.second;
+ if (V.size() < 2)
+ continue;
+
+ // Compute the insertion point and the list of expressions to be hoisted.
+ SmallVecInsn InstructionsToHoist;
+ for (auto I : V)
+ if (!hasEH(I->getParent()))
+ InstructionsToHoist.push_back(I);
+
+ if (!InstructionsToHoist.empty())
+ partitionCandidates(InstructionsToHoist, HPL, K);
+ }
+ }
+
+ // Return true when all operands of I are available at insertion point
+ // HoistPt. When limiting the number of hoisted expressions, one could hoist
+ // a load without hoisting its access function. So before hoisting any
+ // expression, make sure that all its operands are available at insert point.
+ bool allOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt))
+ return false;
+
+ return true;
+ }
+
+ Instruction *firstOfTwo(Instruction *I, Instruction *J) const {
+ for (Instruction &I1 : *I->getParent())
+ if (&I1 == I || &I1 == J)
+ return &I1;
+ llvm_unreachable("Both I and J must be from same BB");
+ }
+
+ // Replace the use of From with To in Insn.
+ void replaceUseWith(Instruction *Insn, Value *From, Value *To) const {
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ if (U.getUser() == Insn) {
+ U.set(To);
+ return;
+ }
+ }
+ llvm_unreachable("should replace exactly once");
+ }
+
+ bool makeOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt) const {
+ // Check whether the GEP of a ld/st can be synthesized at HoistPt.
+ GetElementPtrInst *Gep = nullptr;
+ Instruction *Val = nullptr;
+ if (auto *Ld = dyn_cast<LoadInst>(Repl))
+ Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
+ if (auto *St = dyn_cast<StoreInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
+ Val = dyn_cast<Instruction>(St->getValueOperand());
+ // Check that the stored value is available.
+ if (Val) {
+ if (isa<GetElementPtrInst>(Val)) {
+ // Check whether we can compute the GEP at HoistPt.
+ if (!allOperandsAvailable(Val, HoistPt))
+ return false;
+ } else if (!DT->dominates(Val->getParent(), HoistPt))
+ return false;
+ }
+ }
+
+ // Check whether we can compute the Gep at HoistPt.
+ if (!Gep || !allOperandsAvailable(Gep, HoistPt))
+ return false;
+
+ // Copy the gep before moving the ld/st.
+ Instruction *ClonedGep = Gep->clone();
+ ClonedGep->insertBefore(HoistPt->getTerminator());
+ replaceUseWith(Repl, Gep, ClonedGep);
+
+ // Also copy Val when it is a GEP.
+ if (Val && isa<GetElementPtrInst>(Val)) {
+ Instruction *ClonedVal = Val->clone();
+ ClonedVal->insertBefore(HoistPt->getTerminator());
+ replaceUseWith(Repl, Val, ClonedVal);
+ }
+
+ return true;
+ }
+
+ std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) {
+ unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
+ for (const HoistingPointInfo &HP : HPL) {
+ // Find out whether we already have one of the instructions in HoistPt,
+ // in which case we do not have to move it.
+ BasicBlock *HoistPt = HP.first;
+ const SmallVecInsn &InstructionsToHoist = HP.second;
+ Instruction *Repl = nullptr;
+ for (Instruction *I : InstructionsToHoist)
+ if (I->getParent() == HoistPt) {
+ // If there are two instructions in HoistPt to be hoisted in place:
+ // update Repl to be the first one, such that we can rename the uses
+ // of the second based on the first.
+ Repl = !Repl ? I : firstOfTwo(Repl, I);
+ }
+
+ if (Repl) {
+ // Repl is already in HoistPt: it remains in place.
+ assert(allOperandsAvailable(Repl, HoistPt) &&
+ "instruction depends on operands that are not available");
+ } else {
+ // When we do not find Repl in HoistPt, select the first in the list
+ // and move it to HoistPt.
+ Repl = InstructionsToHoist.front();
+
+ // We can move Repl in HoistPt only when all operands are available.
+ // The order in which hoistings are done may influence the availability
+ // of operands.
+ if (!allOperandsAvailable(Repl, HoistPt) &&
+ !makeOperandsAvailable(Repl, HoistPt))
+ continue;
+ Repl->moveBefore(HoistPt->getTerminator());
+ }
+
+ if (isa<LoadInst>(Repl))
+ ++NL;
+ else if (isa<StoreInst>(Repl))
+ ++NS;
+ else if (isa<CallInst>(Repl))
+ ++NC;
+ else // Scalar
+ ++NI;
+
+ // Remove and rename all other instructions.
+ for (Instruction *I : InstructionsToHoist)
+ if (I != Repl) {
+ ++NR;
+ if (isa<LoadInst>(Repl))
+ ++NumLoadsRemoved;
+ else if (isa<StoreInst>(Repl))
+ ++NumStoresRemoved;
+ else if (isa<CallInst>(Repl))
+ ++NumCallsRemoved;
+ I->replaceAllUsesWith(Repl);
+ I->eraseFromParent();
+ }
+ }
+
+ NumHoisted += NL + NS + NC + NI;
+ NumRemoved += NR;
+ NumLoadsHoisted += NL;
+ NumStoresHoisted += NS;
+ NumCallsHoisted += NC;
+ return {NI, NL + NC + NS};
+ }
+
+ // Hoist all expressions. Returns the number of scalars hoisted and the
+ // number of non-scalars hoisted.
+ std::pair<unsigned, unsigned> hoistExpressions(Function &F) {
+ InsnInfo II;
+ LoadInfo LI;
+ StoreInfo SI;
+ CallInfo CI;
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ for (Instruction &I1 : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I1))
+ LI.insert(Load, VN);
+ else if (auto *Store = dyn_cast<StoreInst>(&I1))
+ SI.insert(Store, VN);
+ else if (auto *Call = dyn_cast<CallInst>(&I1)) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
+ if (isa<DbgInfoIntrinsic>(Intr) ||
+ Intr->getIntrinsicID() == Intrinsic::assume)
+ continue;
+ }
+ if (Call->mayHaveSideEffects()) {
+ if (!OptForMinSize)
+ break;
+ // We may continue hoisting across calls which write to memory.
+ if (Call->mayThrow())
+ break;
+ }
+ CI.insert(Call, VN);
+ } else if (OptForMinSize || !isa<GetElementPtrInst>(&I1))
+ // Do not hoist scalars past calls that may write to memory because
+ // that could result in spills later. geps are handled separately.
+ // TODO: We can relax this for targets like AArch64 as they have more
+ // registers than X86.
+ II.insert(&I1, VN);
+ }
+ }
+
+ HoistingPointList HPL;
+ computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
+ computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
+ return hoist(HPL);
+ }
+
+ bool run(Function &F) {
+ VN.setDomTree(DT);
+ VN.setAliasAnalysis(AA);
+ VN.setMemDep(MD);
+ bool Res = false;
+
+ unsigned I = 0;
+ for (const BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ DFSNumber.insert({BB, ++I});
+
+ // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+ while (1) {
+ // FIXME: only compute MemorySSA once. We need to update the analysis in
+ // the same time as transforming the code.
+ MemorySSA M(F, AA, DT);
+ MSSA = &M;
+
+ auto HoistStat = hoistExpressions(F);
+ if (HoistStat.first + HoistStat.second == 0) {
+ return Res;
+ }
+ if (HoistStat.second > 0) {
+ // To address a limitation of the current GVN, we need to rerun the
+ // hoisting after we hoisted loads in order to be able to hoist all
+ // scalars dependent on the hoisted loads. Same for stores.
+ VN.clear();
+ }
+ Res = true;
+ }
+
+ return Res;
+ }
+};
+
+class GVNHoistLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNHoistLegacyPass() : FunctionPass(ID) {
+ initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+
+ GVNHoist G(&DT, &AA, &MD, F.optForMinSize());
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+} // namespace
+
+PreservedAnalyses GVNHoistPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+
+ GVNHoist G(&DT, &AA, &MD, F.optForMinSize());
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+char GVNHoistLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+
+FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
new file mode 100644
index 000000000000..7686e65efed9
--- /dev/null
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -0,0 +1,691 @@
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass. The semantics of the
+// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
+// more often than it did before the transform. This optimization is called
+// "widening" and can be used to hoist and common runtime checks in situations
+// like these:
+//
+// %cmp0 = 7 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// %cmp1 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+// ...
+//
+// =>
+//
+// %cmp0 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward. It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress. It hasn't been tuned to be "production
+// ready" yet. It is known to have quadratic running time and will not scale
+// to large numbers of guards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+namespace {
+
+class GuardWideningImpl {
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ LoopInfo &LI;
+
+ /// The set of guards whose conditions have been widened into dominating
+ /// guards.
+ SmallVector<IntrinsicInst *, 16> EliminatedGuards;
+
+ /// The set of guards which have been widened to include conditions from
+ /// other guards.
+ DenseSet<IntrinsicInst *> WidenedGuards;
+
+ /// Try to eliminate guard \p Guard by widening it into an earlier dominating
+ /// guard. \p DFSI is the DFS iterator on the dominator tree that is
+ /// currently visiting the block containing \p Guard, and \p GuardsPerBlock
+ /// maps BasicBlocks to the set of guards seen in that block.
+ bool eliminateGuardViaWidening(
+ IntrinsicInst *Guard, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsPerBlock);
+
+ /// Used to keep track of which widening potential is more effective.
+ enum WideningScore {
+ /// Don't widen.
+ WS_IllegalOrNegative,
+
+ /// Widening is performance neutral as far as the cycles spent in check
+ /// conditions goes (but can still help, e.g., code layout, having less
+ /// deopt state).
+ WS_Neutral,
+
+ /// Widening is profitable.
+ WS_Positive,
+
+ /// Widening is very profitable. Not significantly different from \c
+ /// WS_Positive, except by the order.
+ WS_VeryPositive
+ };
+
+ static StringRef scoreTypeToString(WideningScore WS);
+
+ /// Compute the score for widening the condition in \p DominatedGuard
+ /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in
+ /// \p DominatingGuardLoop).
+ WideningScore computeWideningScore(IntrinsicInst *DominatedGuard,
+ Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard,
+ Loop *DominatingGuardLoop);
+
+ /// Helper to check if \p V can be hoisted to \p InsertPos.
+ bool isAvailableAt(Value *V, Instruction *InsertPos) {
+ SmallPtrSet<Instruction *, 8> Visited;
+ return isAvailableAt(V, InsertPos, Visited);
+ }
+
+ bool isAvailableAt(Value *V, Instruction *InsertPos,
+ SmallPtrSetImpl<Instruction *> &Visited);
+
+ /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
+ /// isAvailableAt returned true.
+ void makeAvailableAt(Value *V, Instruction *InsertPos);
+
+ /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
+ /// to generate an expression computing the logical AND of \p Cond0 and \p
+ /// Cond1. Return true if the expression computing the AND is only as
+ /// expensive as computing one of the two. If \p InsertPt is non-null then
+ /// actually generate the resulting expression, make it available at \p
+ /// InsertPt and return it in \p Result (else no change to the IR is made).
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+ Value *&Result);
+
+ /// Represents a range check of the form \c Base + \c Offset u< \c Length,
+ /// with the constraint that \c Length is not negative. \c CheckInst is the
+ /// pre-existing instruction in the IR that computes the result of this range
+ /// check.
+ class RangeCheck {
+ Value *Base;
+ ConstantInt *Offset;
+ Value *Length;
+ ICmpInst *CheckInst;
+
+ public:
+ explicit RangeCheck(Value *Base, ConstantInt *Offset, Value *Length,
+ ICmpInst *CheckInst)
+ : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
+
+ void setBase(Value *NewBase) { Base = NewBase; }
+ void setOffset(ConstantInt *NewOffset) { Offset = NewOffset; }
+
+ Value *getBase() const { return Base; }
+ ConstantInt *getOffset() const { return Offset; }
+ const APInt &getOffsetValue() const { return getOffset()->getValue(); }
+ Value *getLength() const { return Length; }
+ ICmpInst *getCheckInst() const { return CheckInst; }
+
+ void print(raw_ostream &OS, bool PrintTypes = false) {
+ OS << "Base: ";
+ Base->printAsOperand(OS, PrintTypes);
+ OS << " Offset: ";
+ Offset->printAsOperand(OS, PrintTypes);
+ OS << " Length: ";
+ Length->printAsOperand(OS, PrintTypes);
+ }
+
+ LLVM_DUMP_METHOD void dump() {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ };
+
+ /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and
+ /// append them to \p Checks. Returns true on success, may clobber \c Checks
+ /// on failure.
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
+ SmallPtrSet<Value *, 8> Visited;
+ return parseRangeChecks(CheckCond, Checks, Visited);
+ }
+
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+ /// Combine the checks in \p Checks into a smaller set of checks and append
+ /// them into \p CombinedChecks. Return true on success (i.e. all of checks
+ /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
+ /// and \p CombinedChecks on success and on failure.
+ bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
+ SmallVectorImpl<RangeCheck> &CombinedChecks);
+
+ /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+ /// computing only one of the two expressions?
+ bool isWideningCondProfitable(Value *Cond0, Value *Cond1) {
+ Value *ResultUnused;
+ return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused);
+ }
+
+ /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to
+ /// whatever it is already checking).
+ void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) {
+ Value *Result;
+ widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result);
+ ToWiden->setArgOperand(0, Result);
+ }
+
+public:
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
+ LoopInfo &LI)
+ : DT(DT), PDT(PDT), LI(LI) {}
+
+ /// The entry point for this pass.
+ bool run();
+};
+
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+ GuardWideningPass Impl;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ return GuardWideningImpl(
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+}
+
+bool GuardWideningImpl::run() {
+ using namespace llvm::PatternMatch;
+
+ DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
+ bool Changed = false;
+
+ for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ DFI != DFE; ++DFI) {
+ auto *BB = (*DFI)->getBlock();
+ auto &CurrentList = GuardsInBlock[BB];
+
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ CurrentList.push_back(cast<IntrinsicInst>(&I));
+
+ for (auto *II : CurrentList)
+ Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
+ }
+
+ for (auto *II : EliminatedGuards)
+ if (!WidenedGuards.count(II))
+ II->eraseFromParent();
+
+ return Changed;
+}
+
+bool GuardWideningImpl::eliminateGuardViaWidening(
+ IntrinsicInst *GuardInst, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsInBlock) {
+ IntrinsicInst *BestSoFar = nullptr;
+ auto BestScoreSoFar = WS_IllegalOrNegative;
+ auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent());
+
+ // In the set of dominating guards, find the one we can merge GuardInst with
+ // for the most profit.
+ for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+ auto *CurBB = DFSI.getPath(i)->getBlock();
+ auto *CurLoop = LI.getLoopFor(CurBB);
+ assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+ const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+ auto I = GuardsInCurBB.begin();
+ auto E = GuardsInCurBB.end();
+
+#ifndef NDEBUG
+ {
+ unsigned Index = 0;
+ for (auto &I : *CurBB) {
+ if (Index == GuardsInCurBB.size())
+ break;
+ if (GuardsInCurBB[Index] == &I)
+ Index++;
+ }
+ assert(Index == GuardsInCurBB.size() &&
+ "Guards expected to be in order!");
+ }
+#endif
+
+ assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?");
+
+ if (i == (e - 1)) {
+ // Corner case: make sure we're only looking at guards strictly dominating
+ // GuardInst when visiting GuardInst->getParent().
+ auto NewEnd = std::find(I, E, GuardInst);
+ assert(NewEnd != E && "GuardInst not in its own block?");
+ E = NewEnd;
+ }
+
+ for (auto *Candidate : make_range(I, E)) {
+ auto Score =
+ computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
+ DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
+ if (Score > BestScoreSoFar) {
+ BestScoreSoFar = Score;
+ BestSoFar = Candidate;
+ }
+ }
+ }
+
+ if (BestScoreSoFar == WS_IllegalOrNegative) {
+ DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ return false;
+ }
+
+ assert(BestSoFar != GuardInst && "Should have never visited same guard!");
+ assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
+
+ DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ widenGuard(BestSoFar, GuardInst->getArgOperand(0));
+ GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
+ EliminatedGuards.push_back(GuardInst);
+ WidenedGuards.insert(BestSoFar);
+ return true;
+}
+
+GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
+ IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) {
+ bool HoistingOutOfLoop = false;
+
+ if (DominatingGuardLoop != DominatedGuardLoop) {
+ if (DominatingGuardLoop &&
+ !DominatingGuardLoop->contains(DominatedGuardLoop))
+ return WS_IllegalOrNegative;
+
+ HoistingOutOfLoop = true;
+ }
+
+ if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
+ return WS_IllegalOrNegative;
+
+ bool HoistingOutOfIf =
+ !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
+
+ if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
+ DominatingGuard->getArgOperand(0)))
+ return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+ if (HoistingOutOfLoop)
+ return WS_Positive;
+
+ return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
+ SmallPtrSetImpl<Instruction *> &Visited) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+ return true;
+
+ if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+ Inst->mayReadFromMemory())
+ return false;
+
+ Visited.insert(Inst);
+
+ // We only want to go _up_ the dominance chain when recursing.
+ assert(!isa<PHINode>(Loc) &&
+ "PHIs should return false for isSafeToSpeculativelyExecute");
+ assert(DT.isReachableFromEntry(Inst->getParent()) &&
+ "We did a DFS from the block entry!");
+ return all_of(Inst->operands(),
+ [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc))
+ return;
+
+ assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+ !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+ for (Value *Op : Inst->operands())
+ makeAvailableAt(Op, Loc);
+
+ Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+ Instruction *InsertPt, Value *&Result) {
+ using namespace llvm::PatternMatch;
+
+ {
+ // L >u C0 && L >u C1 -> L >u max(C0, C1)
+ ConstantInt *RHS0, *RHS1;
+ Value *LHS;
+ ICmpInst::Predicate Pred0, Pred1;
+ if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+ match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+
+ ConstantRange CR0 =
+ ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
+ ConstantRange CR1 =
+ ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
+
+ // SubsetIntersect is a subset of the actual mathematical intersection of
+ // CR0 and CR1, while SupersetIntersect is a superset of the actual
+ // mathematical intersection. If these two ConstantRanges are equal, then
+ // we know we were able to represent the actual mathematical intersection
+ // of CR0 and CR1, and can use the same to generate an icmp instruction.
+ //
+ // Given what we're doing here and the semantics of guards, it would
+ // actually be correct to just use SubsetIntersect, but that may be too
+ // aggressive in cases we care about.
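+ //
+ // Worked example (illustrative): for the conditions "x u< 7" and "x u< 9",
+ // CR0 is [0, 7) and CR1 is [0, 9). SubsetIntersect comes out as [0, 7) and
+ // so does SupersetIntersect, so the two ranges agree and the pair of
+ // conditions folds into the single check "x u< 7".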
+ auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
+ auto SupersetIntersect = CR0.intersectWith(CR1);
+
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (SubsetIntersect == SupersetIntersect &&
+ SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
+ }
+ }
+ }
+
+ {
+ SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
+ if (parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
+ combineRangeChecks(Checks, CombinedChecks)) {
+ if (InsertPt) {
+ Result = nullptr;
+ for (auto &RC : CombinedChecks) {
+ makeAvailableAt(RC.getCheckInst(), InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
+ InsertPt);
+ else
+ Result = RC.getCheckInst();
+ }
+
+ Result->setName("wide.chk");
+ }
+ return true;
+ }
+ }
+
+ // Base case -- just logical-and the two conditions together.
+
+ if (InsertPt) {
+ makeAvailableAt(Cond0, InsertPt);
+ makeAvailableAt(Cond1, InsertPt);
+
+ Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+ }
+
+ // We were not able to compute Cond0 AND Cond1 for the price of one.
+ return false;
+}
+
+bool GuardWideningImpl::parseRangeChecks(
+ Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!Visited.insert(CheckCond).second)
+ return true;
+
+ using namespace llvm::PatternMatch;
+
+ {
+ Value *AndLHS, *AndRHS;
+ if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
+ return parseRangeChecks(AndLHS, Checks) &&
+ parseRangeChecks(AndRHS, Checks);
+ }
+
+ auto *IC = dyn_cast<ICmpInst>(CheckCond);
+ if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
+ (IC->getPredicate() != ICmpInst::ICMP_ULT &&
+ IC->getPredicate() != ICmpInst::ICMP_UGT))
+ return false;
+
+ Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
+ if (IC->getPredicate() == ICmpInst::ICMP_UGT)
+ std::swap(CmpLHS, CmpRHS);
+
+ auto &DL = IC->getModule()->getDataLayout();
+
+ GuardWideningImpl::RangeCheck Check(
+ CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
+ CmpRHS, IC);
+
+ if (!isKnownNonNegative(Check.getLength(), DL))
+ return false;
+
+ // What we have in \c Check now is a correct interpretation of \p CheckCond.
+ // Try to see if we can move some constant offsets into the \c Offset field.
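+ //
+ // For example (illustrative), for the condition "(x + 5) u< len" with len
+ // known to be non-negative, Check starts out as {Base: x + 5, Offset: 0,
+ // Length: len} and the loop below rewrites it to {Base: x, Offset: 5,
+ // Length: len}.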
+
+ bool Changed;
+ auto &Ctx = CheckCond->getContext();
+
+ do {
+ Value *OpLHS;
+ ConstantInt *OpRHS;
+ Changed = false;
+
+#ifndef NDEBUG
+ auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
+ assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
+ "Unreachable instruction?");
+#endif
+
+ if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ } else if (match(Check.getBase(),
+ m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ computeKnownBits(OpLHS, KnownZero, KnownOne, DL);
+ if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ }
+ }
+ } while (Changed);
+
+ Checks.push_back(Check);
+ return true;
+}
+
+bool GuardWideningImpl::combineRangeChecks(
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) {
+ unsigned OldCount = Checks.size();
+ while (!Checks.empty()) {
+ // Pick all of the range checks with a specific base and length, and try to
+ // merge them.
+ Value *CurrentBase = Checks.front().getBase();
+ Value *CurrentLength = Checks.front().getLength();
+
+ SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
+
+ auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
+ return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
+ };
+
+ std::copy_if(Checks.begin(), Checks.end(),
+ std::back_inserter(CurrentChecks), IsCurrentCheck);
+ Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
+
+ assert(CurrentChecks.size() != 0 && "We know we have at least one!");
+
+ if (CurrentChecks.size() < 3) {
+ RangeChecksOut.insert(RangeChecksOut.end(), CurrentChecks.begin(),
+ CurrentChecks.end());
+ continue;
+ }
+
+ // CurrentChecks.size() will typically be 3 here, but so far there has been
+ // no need to hard-code that fact.
+
+ std::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
+ return LHS.getOffsetValue().slt(RHS.getOffsetValue());
+ });
+
+    // Note: std::sort does not invalidate iterators into CurrentChecks.
+
+ ConstantInt *MinOffset = CurrentChecks.front().getOffset(),
+ *MaxOffset = CurrentChecks.back().getOffset();
+
+ unsigned BitWidth = MaxOffset->getValue().getBitWidth();
+ if ((MaxOffset->getValue() - MinOffset->getValue())
+ .ugt(APInt::getSignedMinValue(BitWidth)))
+ return false;
+
+ APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
+ const APInt &HighOffset = MaxOffset->getValue();
+ auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
+ return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
+ };
+
+ if (MaxDiff.isMinValue() ||
+ !std::all_of(std::next(CurrentChecks.begin()), CurrentChecks.end(),
+ OffsetOK))
+ return false;
+
+ // We have a series of f+1 checks as:
+ //
+ // I+k_0 u< L ... Chk_0
+    //   I+k_1 u< L ... Chk_1
+    //   ...
+    //   I+k_f u< L ... Chk_f
+ //
+ // with forall i in [0,f): k_f-k_i u< k_f-k_0 ... Precond_0
+ // k_f-k_0 u< INT_MIN+k_f ... Precond_1
+ // k_f != k_0 ... Precond_2
+ //
+ // Claim:
+    //   Chk_0 AND Chk_f implies all the other checks
+ //
+ // Informal proof sketch:
+ //
+ // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
+ // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
+ // thus I+k_f is the greatest unsigned value in that range.
+ //
+    // This, combined with Chk_f, shows that everything in that range is u< L.
+    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
+    // lie in [I+k_0,I+k_f], thus proving our claim.
+ //
+ // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
+ // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
+ // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
+ // range by definition, and the latter case is impossible:
+ //
+ // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
+ // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ //
+    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
+    // would have to be at least >u INT_MIN.
+
+ RangeChecksOut.emplace_back(CurrentChecks.front());
+ RangeChecksOut.emplace_back(CurrentChecks.back());
+ }
+
+ assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
+ return RangeChecksOut.size() != OldCount;
+}
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ bool Changed = GuardWideningImpl(DT, PDT, LI).run();
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+ switch (WS) {
+ case WS_IllegalOrNegative:
+ return "IllegalOrNegative";
+ case WS_Neutral:
+ return "Neutral";
+ case WS_Positive:
+ return "Positive";
+ case WS_VeryPositive:
+ return "VeryPositive";
+ }
+
+ llvm_unreachable("Fully covered switch above!");
+}
+
+char GuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+ return new GuardWideningLegacyPass();
+}
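
Aside (illustration only, not part of the patch): the comment in combineRangeChecks above argues that once the range checks share a base and length and their sorted offsets are known not to wrap, keeping only the checks with the smallest and largest offsets subsumes the rest. The standalone C++ sketch below replays that argument with plain 32-bit unsigned arithmetic; Base, Length and the offset list are made-up values chosen for the example.

// Standalone sketch: widening "Base+k_i u< Length" checks down to the two
// extreme offsets, assuming the sorted offsets do not wrap.
#include <cassert>
#include <cstdint>
#include <vector>

// One range check of the form "Base + Offset u< Length" in 32-bit unsigned
// arithmetic (wrapping on overflow, as in the IR).
static bool checkHolds(uint32_t Base, uint32_t Offset, uint32_t Length) {
  return static_cast<uint32_t>(Base + Offset) < Length;
}

int main() {
  const uint32_t Base = 100, Length = 120;
  const std::vector<uint32_t> Offsets = {3, 5, 9, 17}; // k_0 ... k_f, sorted

  // The widened guard evaluates only the first and last check.
  const bool Widened = checkHolds(Base, Offsets.front(), Length) &&
                       checkHolds(Base, Offsets.back(), Length);

  // Under the no-wrap precondition, the widened guard implies every
  // individual check; verify that exhaustively for this example.
  for (uint32_t K : Offsets)
    assert(!Widened || checkHolds(Base, K, Length));
  return 0;
}
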
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index ec5e15f0b8f8..542cf38e43bb 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -24,13 +24,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -69,9 +70,6 @@ static cl::opt<bool> VerifyIndvars(
"verify-indvars", cl::Hidden,
cl::desc("Verify the ScalarEvolution result after running indvars"));
-static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,
- cl::desc("Reduce live induction variables."));
-
enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl };
static cl::opt<ReplaceExitVal> ReplaceExitValue(
@@ -87,42 +85,16 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue(
namespace {
struct RewritePhi;
-class IndVarSimplify : public LoopPass {
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- TargetLibraryInfo *TLI;
+class IndVarSimplify {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout &DL;
+ TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
SmallVector<WeakVH, 16> DeadInsts;
- bool Changed;
-public:
-
- static char ID; // Pass identification, replacement for typeid
- IndVarSimplify()
- : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) {
- initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreservedID(LoopSimplifyID);
- AU.addPreservedID(LCSSAID);
- AU.setPreservesCFG();
- }
-
-private:
- void releaseMemory() override {
- DeadInsts.clear();
- }
+ bool Changed = false;
bool isValidRewrite(Value *FromVal, Value *ToVal);
@@ -133,6 +105,7 @@ private:
bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+ void rewriteFirstIterationLoopExitValues(Loop *L);
Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
PHINode *IndVar, SCEVExpander &Rewriter);
@@ -141,22 +114,15 @@ private:
Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
Instruction *InsertPt, Type *Ty);
-};
-}
-char IndVarSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
- "Induction Variable Simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_END(IndVarSimplify, "indvars",
- "Induction Variable Simplification", false, false)
+public:
+ IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ const DataLayout &DL, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI)
+ : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {}
-Pass *llvm::createIndVarSimplifyPass() {
- return new IndVarSimplify();
+ bool run(Loop *L);
+};
}
/// Return true if the SCEV expansion generated by the rewriter can replace the
@@ -504,10 +470,9 @@ struct RewritePhi {
unsigned Ith; // Ith incoming value.
Value *Val; // Exit value after expansion.
bool HighCost; // High Cost when expansion.
- bool SafePhi; // LCSSASafePhiForRAUW.
- RewritePhi(PHINode *P, unsigned I, Value *V, bool H, bool S)
- : PN(P), Ith(I), Val(V), HighCost(H), SafePhi(S) {}
+ RewritePhi(PHINode *P, unsigned I, Value *V, bool H)
+ : PN(P), Ith(I), Val(V), HighCost(H) {}
};
}
@@ -550,9 +515,7 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
// Find all values that are computed inside the loop, but used outside of it.
// Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
// the exit blocks of the loop to find them.
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBB = ExitBlocks[i];
-
+ for (BasicBlock *ExitBB : ExitBlocks) {
// If there are no PHI nodes in this exit block, then no values defined
// inside the loop are used on this path, skip it.
PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
@@ -560,29 +523,13 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
unsigned NumPreds = PN->getNumIncomingValues();
- // We would like to be able to RAUW single-incoming value PHI nodes. We
- // have to be certain this is safe even when this is an LCSSA PHI node.
- // While the computed exit value is no longer varying in *this* loop, the
- // exit block may be an exit block for an outer containing loop as well,
- // the exit value may be varying in the outer loop, and thus it may still
- // require an LCSSA PHI node. The safe case is when this is
- // single-predecessor PHI node (LCSSA) and the exit block containing it is
- // part of the enclosing loop, or this is the outer most loop of the nest.
- // In either case the exit value could (at most) be varying in the same
- // loop body as the phi node itself. Thus if it is in turn used outside of
- // an enclosing loop it will only be via a separate LCSSA node.
- bool LCSSASafePhiForRAUW =
- NumPreds == 1 &&
- (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB));
-
// Iterate over all of the PHI nodes.
BasicBlock::iterator BBI = ExitBB->begin();
while ((PN = dyn_cast<PHINode>(BBI++))) {
if (PN->use_empty())
continue; // dead use, don't replace it
- // SCEV only supports integer expressions for now.
- if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy())
+ if (!SE->isSCEVable(PN->getType()))
continue;
// It's necessary to tell ScalarEvolution about this explicitly so that
@@ -669,8 +616,7 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
}
// Collect all the candidate PHINodes to be rewritten.
- RewritePhiSet.push_back(
- RewritePhi(PN, i, ExitVal, HighCost, LCSSASafePhiForRAUW));
+ RewritePhiSet.emplace_back(PN, i, ExitVal, HighCost);
}
}
}
@@ -699,9 +645,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
if (isInstructionTriviallyDead(Inst, TLI))
DeadInsts.push_back(Inst);
- // If we determined that this PHI is safe to replace even if an LCSSA
- // PHI, do so.
- if (Phi.SafePhi) {
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
PN->replaceAllUsesWith(ExitVal);
PN->eraseFromParent();
}
@@ -712,6 +658,80 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
Rewriter.clearInsertPoint();
}
+//===---------------------------------------------------------------------===//
+// rewriteFirstIterationLoopExitValues: Rewrite loop exit values when we know
+// the exit is taken on the first iteration.
+//===---------------------------------------------------------------------===//
+
+/// Check to see if this loop has loop invariant conditions which lead to loop
+/// exits. If so, we know that if the exit path is taken, it is at the first
+/// loop iteration. This lets us predict exit values of PHI nodes that live in
+/// the loop header.
+void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
+ // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm(*DT));
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ auto *LoopHeader = L->getHeader();
+ assert(LoopHeader && "Invalid loop");
+
+ for (auto *ExitBB : ExitBlocks) {
+ BasicBlock::iterator BBI = ExitBB->begin();
+ // If there are no more PHI nodes in this exit block, then no more
+ // values defined inside the loop are used on this path.
+ while (auto *PN = dyn_cast<PHINode>(BBI++)) {
+ for (unsigned IncomingValIdx = 0, E = PN->getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN->getIncomingBlock(IncomingValIdx);
+
+        // We currently only support loop exits from the loop header. If the
+        // incoming block is not the loop header, we would need to recursively
+        // check that all conditions on the path from the loop header are loop
+        // invariant. Additional support might be added in the future.
+ if (IncomingBB != LoopHeader)
+ continue;
+
+ // Get condition that leads to the exit path.
+ auto *TermInst = IncomingBB->getTerminator();
+
+ Value *Cond = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ // Must be a conditional branch, otherwise the block
+ // should not be in the loop.
+ Cond = BI->getCondition();
+ } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
+ Cond = SI->getCondition();
+ else
+ continue;
+
+ if (!L->isLoopInvariant(Cond))
+ continue;
+
+ auto *ExitVal =
+ dyn_cast<PHINode>(PN->getIncomingValue(IncomingValIdx));
+
+ // Only deal with PHIs.
+ if (!ExitVal)
+ continue;
+
+ // If ExitVal is a PHI on the loop header, then we know its
+ // value along this exit because the exit can only be taken
+ // on the first iteration.
+ auto *LoopPreheader = L->getLoopPreheader();
+ assert(LoopPreheader && "Invalid loop");
+ int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
+ if (PreheaderIdx != -1) {
+ assert(ExitVal->getParent() == LoopHeader &&
+ "ExitVal must be in loop header");
+ PN->setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
+ }
+ }
+ }
+ }
+}
+
/// Check whether it is possible to delete the loop after rewriting exit
/// value. If it is possible, ignore ReplaceExitValue and do rewriting
/// aggressively.
@@ -1240,6 +1260,12 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
if (UsePhi->getNumOperands() != 1)
truncateIVUse(DU, DT, LI);
else {
+ // Widening the PHI requires us to insert a trunc. The logical place
+ // for this trunc is in the same BB as the PHI. This is not possible if
+ // the BB is terminated by a catchswitch.
+ if (isa<CatchSwitchInst>(UsePhi->getParent()->getTerminator()))
+ return nullptr;
+
PHINode *WidePhi =
PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
UsePhi);
@@ -1317,8 +1343,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// Reuse the IV increment that SCEVExpander created as long as it dominates
// NarrowUse.
Instruction *WideUse = nullptr;
- if (WideAddRec == WideIncExpr
- && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+ if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
WideUse = WideInc;
else {
WideUse = cloneIVUser(DU, WideAddRec);
@@ -1355,8 +1380,7 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
if (!Widened.insert(NarrowUser).second)
continue;
- NarrowIVUsers.push_back(
- NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative));
+ NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef, NeverNegative);
}
}
@@ -1391,9 +1415,10 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
// An AddRec must have loop-invariant operands. Since this AddRec is
// materialized by a loop header phi, the expression cannot have any post-loop
// operands, so they must dominate the loop header.
- assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
- SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
- && "Loop header phi recurrence inputs do not dominate the loop");
+ assert(
+ SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+ SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) &&
+ "Loop header phi recurrence inputs do not dominate the loop");
// The rewriter provides a value for the desired IV expression. This may
// either find an existing phi or materialize a new one. Either way, we
@@ -1463,8 +1488,6 @@ public:
: SE(SCEV), TTI(TTI), IVPhi(IV) {
DT = DTree;
WI.NarrowIV = IVPhi;
- if (ReduceLiveIVs)
- setSplitOverflowIntrinsics();
}
// Implement the interface used by simplifyUsersOfIV.
@@ -1729,6 +1752,7 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
const SCEV *BestInit = nullptr;
BasicBlock *LatchBlock = L->getLoopLatch();
assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
PHINode *Phi = cast<PHINode>(I);
@@ -1747,8 +1771,7 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
// AR may be wider than BECount. With eq/ne tests overflow is immaterial.
// AR may not be a narrower type, or we may never exit.
uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
- if (PhiWidth < BCWidth ||
- !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth))
+ if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
continue;
const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
@@ -1767,8 +1790,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
// the loop test. In this case we assume that performing LFTR could not
// increase the number of undef users.
if (ICmpInst *Cond = getLoopTest(L)) {
- if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
- && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+ if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) &&
+ Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
continue;
}
}
@@ -1810,9 +1833,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
// finds a valid pointer IV. Sign extend BECount in order to materialize a
// GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
// the existing GEPs whenever possible.
- if (IndVar->getType()->isPointerTy()
- && !IVCount->getType()->isPointerTy()) {
-
+ if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) {
// IVOffset will be the new GEP offset that is interpreted by GEP as a
// signed value. IVCount on the other hand represents the loop trip count,
// which is an unsigned value. FindLoopCounter only allows induction
@@ -1833,13 +1854,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
// We could handle pointer IVs other than i8*, but we need to compensate for
// gep index scaling. See canExpandBackedgeTakenCount comments.
assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
- cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
- && "unit stride pointer IV must be i8*");
+ cast<PointerType>(GEPBase->getType())
+ ->getElementType())->isOne() &&
+ "unit stride pointer IV must be i8*");
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
- }
- else {
+ } else {
// In any other case, convert both IVInit and IVCount to integers before
    // comparing. This may result in SCEV expansion of pointers, but in practice
// SCEV will fold the pointer arithmetic away as such:
@@ -1913,8 +1934,9 @@ linearFunctionTestReplace(Loop *L,
}
Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
- assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy()
- && "genLoopLimit missed a cast");
+ assert(ExitCnt->getType()->isPointerTy() ==
+ IndVar->getType()->isPointerTy() &&
+ "genLoopLimit missed a cast");
// Insert a new icmp_ne or icmp_eq instruction before the branch.
BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
@@ -2074,9 +2096,9 @@ void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
// IndVarSimplify driver. Manage several subpasses of IV simplification.
//===----------------------------------------------------------------------===//
-bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipOptnoneFunction(L))
- return false;
+bool IndVarSimplify::run(Loop *L) {
+ // We need (and expect!) the incoming loop to be in LCSSA.
+ assert(L->isRecursivelyLCSSAForm(*DT) && "LCSSA required to run indvars!");
// If LoopSimplify form is not available, stay out of trouble. Some notes:
// - LSR currently only supports LoopSimplify-form loops. Indvars'
@@ -2089,18 +2111,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!L->isLoopSimplifyForm())
return false;
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TLI = TLIP ? &TLIP->getTLI() : nullptr;
- auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
- TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- DeadInsts.clear();
- Changed = false;
-
// If there are any floating-point recurrences, attempt to
// transform them to use integer recurrences.
rewriteNonIntegerIVs(L);
@@ -2172,6 +2182,11 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// loop may be sunk below the loop to reduce register pressure.
sinkUnusedInvariants(L);
+ // rewriteFirstIterationLoopExitValues does not rely on the computation of
+ // trip count and therefore can further simplify exit values in addition to
+ // rewriteLoopExitValues.
+ rewriteFirstIterationLoopExitValues(L);
+
// Clean up dead instructions.
Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
@@ -2197,3 +2212,69 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
+
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+
+ assert((LI && SE && DT) &&
+ "Analyses required for indvarsimplify not available!");
+
+ // Optional analyses.
+ auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
+ auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
+
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ if (!IVS.run(&L))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+struct IndVarSimplifyLegacyPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ IndVarSimplifyLegacyPass() : LoopPass(ID) {
+ initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ return IVS.run(L);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char IndVarSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplifyLegacyPass();
+}
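
Aside (illustration only, not part of the patch): rewriteFirstIterationLoopExitValues, added above, relies on a single observation: an exit taken from the loop header under a loop-invariant condition can only be taken on the first iteration, so a header PHI observed on that exit path still holds its preheader value. The sketch below is a source-level C++ analogue with invented function names; it only checks that the equivalent source-level rewrite preserves behaviour, it is not the pass itself.

#include <cassert>

// Loop whose header exit condition is loop-invariant. The value of I seen on
// the early-exit path is the header PHI's exit value.
static int original(int N, bool Invariant) {
  int I = 0;                 // header PHI: starts at 0 coming from the preheader
  for (; I < N; ++I) {
    if (Invariant)           // loop-invariant exit condition tested in the header
      break;                 // exit can only be taken on the first iteration
  }
  return I;
}

// Same loop after the rewrite: the exit value on the invariant exit path is
// replaced by the PHI's initial (preheader) value.
static int rewritten(int N, bool Invariant) {
  int I = 0;
  for (; I < N; ++I) {
    if (Invariant)
      return 0;              // initial value substituted for the exit value
  }
  return I;
}

int main() {
  for (int N = 0; N < 4; ++N)
    for (int Inv = 0; Inv <= 1; ++Inv)
      assert(original(N, Inv != 0) == rewritten(N, Inv != 0));
  return 0;
}
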
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index dea61f6ff3d7..ec7f09a2d598 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -67,7 +67,6 @@
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <array>
using namespace llvm;
@@ -114,24 +113,22 @@ class InductiveRangeCheck {
RANGE_CHECK_UNKNOWN = (unsigned)-1
};
- static const char *rangeCheckKindToStr(RangeCheckKind);
+ static StringRef rangeCheckKindToStr(RangeCheckKind);
- const SCEV *Offset;
- const SCEV *Scale;
- Value *Length;
- BranchInst *Branch;
- RangeCheckKind Kind;
+ const SCEV *Offset = nullptr;
+ const SCEV *Scale = nullptr;
+ Value *Length = nullptr;
+ Use *CheckUse = nullptr;
+ RangeCheckKind Kind = RANGE_CHECK_UNKNOWN;
static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
ScalarEvolution &SE, Value *&Index,
Value *&Length);
- static InductiveRangeCheck::RangeCheckKind
- parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition,
- const SCEV *&Index, Value *&UpperLimit);
-
- InductiveRangeCheck() :
- Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+ static void
+ extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
public:
const SCEV *getOffset() const { return Offset; }
@@ -150,9 +147,9 @@ public:
Length->print(OS);
else
OS << "(null)";
- OS << "\n Branch: ";
- getBranch()->print(OS);
- OS << "\n";
+ OS << "\n CheckUse: ";
+ getCheckUse()->getUser()->print(OS);
+ OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -161,7 +158,7 @@ public:
}
#endif
- BranchInst *getBranch() const { return Branch; }
+ Use *getCheckUse() const { return CheckUse; }
  /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
/// R.getEnd() sle R.getBegin(), then R denotes the empty range.
@@ -180,8 +177,6 @@ public:
const SCEV *getEnd() const { return End; }
};
- typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
-
/// This is the value the condition of the branch needs to evaluate to for the
/// branch to take the hot successor (see (1) above).
bool getPassingDirection() { return true; }
@@ -190,19 +185,20 @@ public:
/// check is redundant and can be constant-folded away. The induction
/// variable is not required to be the canonical {0,+,1} induction variable.
Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
- const SCEVAddRecExpr *IndVar,
- IRBuilder<> &B) const;
-
- /// Create an inductive range check out of BI if possible, else return
- /// nullptr.
- static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
- Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo &BPI);
+ const SCEVAddRecExpr *IndVar) const;
+
+ /// Parse out a set of inductive range checks from \p BI and append them to \p
+ /// Checks.
+ ///
+ /// NB! There may be conditions feeding into \p BI that aren't inductive range
+ /// checks, and hence don't end up in \p Checks.
+ static void
+ extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks);
};
class InductiveRangeCheckElimination : public LoopPass {
- InductiveRangeCheck::AllocatorTy Allocator;
-
public:
static char ID;
InductiveRangeCheckElimination() : LoopPass(ID) {
@@ -211,11 +207,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -226,15 +219,12 @@ char InductiveRangeCheckElimination::ID = 0;
INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
"Inductive range check elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
"Inductive range check elimination", false, false)
-const char *InductiveRangeCheck::rangeCheckKindToStr(
+StringRef InductiveRangeCheck::rangeCheckKindToStr(
InductiveRangeCheck::RangeCheckKind RCK) {
switch (RCK) {
case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
@@ -253,11 +243,9 @@ const char *InductiveRangeCheck::rangeCheckKindToStr(
llvm_unreachable("unknown range check type!");
}
-/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI`
-/// cannot
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
/// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set
-/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value
-/// being
+/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value being
/// range checked, and set `Length` to the upper limit `Index` is being range
/// checked with if (and only if) the range check type is stronger or equal to
/// RANGE_CHECK_UPPER.
@@ -327,106 +315,89 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
llvm_unreachable("default clause returns!");
}
-/// Parses an arbitrary condition into a range check. `Length` is set only if
-/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger.
-InductiveRangeCheck::RangeCheckKind
-InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE,
- Value *Condition, const SCEV *&Index,
- Value *&Length) {
+void InductiveRangeCheck::extractRangeChecksFromCond(
+ Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
using namespace llvm::PatternMatch;
- Value *A = nullptr;
- Value *B = nullptr;
-
- if (match(Condition, m_And(m_Value(A), m_Value(B)))) {
- Value *IndexA = nullptr, *IndexB = nullptr;
- Value *LengthA = nullptr, *LengthB = nullptr;
- ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B);
-
- if (!ICmpA || !ICmpB)
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
-
- auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA);
- auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB);
-
- if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN ||
- RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
-
- if (IndexA != IndexB)
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
-
- if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB)
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
-
- Index = SE.getSCEV(IndexA);
- if (isa<SCEVCouldNotCompute>(Index))
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+ Value *Condition = ConditionUse.get();
+ if (!Visited.insert(Condition).second)
+ return;
- Length = LengthA == nullptr ? LengthB : LengthA;
+ if (match(Condition, m_And(m_Value(), m_Value()))) {
+ SmallVector<InductiveRangeCheck, 8> SubChecks;
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
+ SubChecks, Visited);
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
+ SubChecks, Visited);
+
+ if (SubChecks.size() == 2) {
+ // Handle a special case where we know how to merge two checks separately
+ // checking the upper and lower bounds into a full range check.
+ const auto &RChkA = SubChecks[0];
+ const auto &RChkB = SubChecks[1];
+ if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) &&
+ RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) {
+
+ // If RChkA.Kind == RChkB.Kind then we just found two identical checks.
+ // But if one of them is a RANGE_CHECK_LOWER and the other is a
+ // RANGE_CHECK_UPPER (only possibility if they're different) then
+ // together they form a RANGE_CHECK_BOTH.
+ SubChecks[0].Kind =
+ (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind);
+ SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length;
+ SubChecks[0].CheckUse = &ConditionUse;
+
+ // We updated one of the checks in place, now erase the other.
+ SubChecks.pop_back();
+ }
+ }
- return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB);
+ Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end());
+ return;
}
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
- Value *IndexVal = nullptr;
-
- auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length);
+ ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
+ if (!ICI)
+ return;
- if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+ Value *Length = nullptr, *Index;
+ auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length);
+ if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+ return;
- Index = SE.getSCEV(IndexVal);
- if (isa<SCEVCouldNotCompute>(Index))
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+ const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
- return RCKind;
- }
+ if (!IsAffineIndex)
+ return;
- return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+ InductiveRangeCheck IRC;
+ IRC.Length = Length;
+ IRC.Offset = IndexAddRec->getStart();
+ IRC.Scale = IndexAddRec->getStepRecurrence(SE);
+ IRC.CheckUse = &ConditionUse;
+ IRC.Kind = RCKind;
+ Checks.push_back(IRC);
}
-
-InductiveRangeCheck *
-InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
- Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo &BPI) {
+void InductiveRangeCheck::extractRangeChecksFromBranch(
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks) {
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
- return nullptr;
+ return;
BranchProbability LikelyTaken(15, 16);
- if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken)
- return nullptr;
-
- Value *Length = nullptr;
- const SCEV *IndexSCEV = nullptr;
-
- auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(),
- IndexSCEV, Length);
-
- if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
- return nullptr;
-
- assert(IndexSCEV && "contract with SplitRangeCheckCondition!");
- assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) &&
- "contract with SplitRangeCheckCondition!");
-
- const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV);
- bool IsAffineIndex =
- IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+ if (BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ return;
- if (!IsAffineIndex)
- return nullptr;
-
- InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck;
- IRC->Length = Length;
- IRC->Offset = IndexAddRec->getStart();
- IRC->Scale = IndexAddRec->getStepRecurrence(SE);
- IRC->Branch = BI;
- IRC->Kind = RCKind;
- return IRC;
+ SmallPtrSet<Value *, 8> Visited;
+ InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
+ Checks, Visited);
}
namespace {
@@ -666,7 +637,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin());
+ BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
if (!LatchBr || LatchBr->isUnconditional()) {
FailureReason = "latch terminator not conditional branch";
return None;
@@ -792,7 +763,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(&*Preheader->rbegin());
+ IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateAdd(RightValue, One);
}
@@ -814,7 +785,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(&*Preheader->rbegin());
+ IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateSub(RightValue, One);
}
}
@@ -833,7 +804,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
const DataLayout &DL = Preheader->getModule()->getDataLayout();
Value *IndVarStartV =
SCEVExpander(SE, DL, "irce")
- .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin());
+ .expandCodeFor(IndVarStart, IndVarTy, Preheader->getTerminator());
IndVarStartV->setName("indvar.start");
LoopStructure Result;
@@ -947,7 +918,7 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
for (Instruction &I : *ClonedBB)
RemapInstruction(&I, Result.Map,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
// Exit blocks will now have one more predecessor and their PHI nodes need
// to be edited to reflect that. No phi nodes need to be introduced because
@@ -1055,7 +1026,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
&*BBInsertLocation);
- BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());
+ BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
bool Increasing = LS.IndVarIncreasing;
IRBuilder<> B(PreheaderJump);
@@ -1305,9 +1276,8 @@ bool LoopConstrainer::run() {
/// in which the range check can be safely elided. If it cannot compute such a
/// range, returns None.
Optional<InductiveRangeCheck::Range>
-InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
- const SCEVAddRecExpr *IndVar,
- IRBuilder<> &) const {
+InductiveRangeCheck::computeSafeIterationSpace(
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const {
// IndVar is of the form "A + B * I" (where "I" is the canonical induction
// variable, that may or may not exist as a real llvm::Value in the loop) and
// this inductive range check is a range check on the "C + D * I" ("C" is
@@ -1375,7 +1345,7 @@ InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
static Optional<InductiveRangeCheck::Range>
IntersectRange(ScalarEvolution &SE,
const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2, IRBuilder<> &B) {
+ const InductiveRangeCheck::Range &R2) {
if (!R1.hasValue())
return R2;
auto &R1Value = R1.getValue();
@@ -1392,6 +1362,9 @@ IntersectRange(ScalarEvolution &SE,
}
bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
if (L->getBlocks().size() >= LoopSizeCutoff) {
DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
return false;
@@ -1404,17 +1377,15 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
}
LLVMContext &Context = Preheader->getContext();
- InductiveRangeCheck::AllocatorTy IRCAlloc;
- SmallVector<InductiveRangeCheck *, 16> RangeChecks;
+ SmallVector<InductiveRangeCheck, 16> RangeChecks;
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
BranchProbabilityInfo &BPI =
getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
for (auto BBI : L->getBlocks())
if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
- if (InductiveRangeCheck *IRC =
- InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI))
- RangeChecks.push_back(IRC);
+ InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
+ RangeChecks);
if (RangeChecks.empty())
return false;
@@ -1423,8 +1394,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
OS << "irce: looking at loop "; L->print(OS);
OS << "irce: loop has " << RangeChecks.size()
<< " inductive range checks: \n";
- for (InductiveRangeCheck *IRC : RangeChecks)
- IRC->print(OS);
+ for (InductiveRangeCheck &IRC : RangeChecks)
+ IRC.print(OS);
};
DEBUG(PrintRecognizedRangeChecks(dbgs()));
@@ -1450,14 +1421,14 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
Optional<InductiveRangeCheck::Range> SafeIterRange;
Instruction *ExprInsertPt = Preheader->getTerminator();
- SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate;
+ SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
IRBuilder<> B(ExprInsertPt);
- for (InductiveRangeCheck *IRC : RangeChecks) {
- auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B);
+ for (InductiveRangeCheck &IRC : RangeChecks) {
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar);
if (Result.hasValue()) {
auto MaybeSafeIterRange =
- IntersectRange(SE, SafeIterRange, Result.getValue(), B);
+ IntersectRange(SE, SafeIterRange, Result.getValue());
if (MaybeSafeIterRange.hasValue()) {
RangeChecksToEliminate.push_back(IRC);
SafeIterRange = MaybeSafeIterRange.getValue();
@@ -1487,11 +1458,11 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
// Optimize away the now-redundant range checks.
- for (InductiveRangeCheck *IRC : RangeChecksToEliminate) {
- ConstantInt *FoldedRangeCheck = IRC->getPassingDirection()
+ for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
? ConstantInt::getTrue(Context)
: ConstantInt::getFalse(Context);
- IRC->getBranch()->setCondition(FoldedRangeCheck);
+ IRC.getCheckUse()->set(FoldedRangeCheck);
}
}
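
Aside (illustration only, not part of the patch): extractRangeChecksFromCond above merges a lower-bound and an upper-bound check on the same {Offset, Scale} pair by OR-ing their RangeCheckKind values. The sketch below assumes the conventional one-bit-per-direction encoding for the kinds (only RANGE_CHECK_UNKNOWN is visible in this hunk, so the other values are an assumption) and shows why the OR yields RANGE_CHECK_BOTH for complementary checks and is a no-op for two identical ones.

#include <cassert>

// Assumed kind encoding, mirroring the shape of the enum in the file
// (the exact values are an assumption here).
enum RangeCheckKind : unsigned {
  RANGE_CHECK_LOWER = 1,                                   // 0 s<= Index
  RANGE_CHECK_UPPER = 2,                                   // Index s< Length
  RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
  RANGE_CHECK_UNKNOWN = (unsigned)-1
};

int main() {
  // Two sub-checks on the same induction expression, one per direction.
  RangeCheckKind A = RANGE_CHECK_LOWER;
  RangeCheckKind B = RANGE_CHECK_UPPER;
  assert(static_cast<RangeCheckKind>(A | B) == RANGE_CHECK_BOTH);

  // If both sub-checks have the same kind, the OR is a no-op -- the
  // "two identical checks" case mentioned in the comment above.
  assert(static_cast<RangeCheckKind>(A | A) == A);
  return 0;
}
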
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index dcdcfed66e64..b9e717cf763e 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -11,31 +11,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -46,6 +40,7 @@
#include <algorithm>
#include <memory>
using namespace llvm;
+using namespace jumpthreading;
#define DEBUG_TYPE "jump-threading"
@@ -66,17 +61,6 @@ ImplicationSearchThreshold(
cl::init(3), cl::Hidden);
namespace {
- // These are at global scope so static functions can use them too.
- typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
- typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy;
-
- // This is used to keep track of what kind of constant we're currently hoping
- // to find.
- enum ConstantPreference {
- WantInteger,
- WantBlockAddress
- };
-
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
/// predecessors of the block can be proven to always jump to one of the
@@ -94,89 +78,31 @@ namespace {
/// revectored to the false side of the second if.
///
class JumpThreading : public FunctionPass {
- TargetLibraryInfo *TLI;
- LazyValueInfo *LVI;
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
- bool HasProfileData;
-#ifdef NDEBUG
- SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
-#else
- SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders;
-#endif
- DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
-
- unsigned BBDupThreshold;
-
- // RAII helper for updating the recursion stack.
- struct RecursionSetRemover {
- DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
- std::pair<Value*, BasicBlock*> ThePair;
-
- RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S,
- std::pair<Value*, BasicBlock*> P)
- : TheSet(S), ThePair(P) { }
-
- ~RecursionSetRemover() {
- TheSet.erase(ThePair);
- }
- };
+ JumpThreadingPass Impl;
+
public:
static char ID; // Pass identification
- JumpThreading(int T = -1) : FunctionPass(ID) {
- BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+ JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LazyValueInfo>();
- AU.addPreserved<LazyValueInfo>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
- void releaseMemory() override {
- BFI.reset();
- BPI.reset();
- }
-
- void FindLoopHeaders(Function &F);
- bool ProcessBlock(BasicBlock *BB);
- bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
- BasicBlock *SuccBB);
- bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &PredBBs);
-
- bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
- PredValueInfo &Result,
- ConstantPreference Preference,
- Instruction *CxtI = nullptr);
- bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference,
- Instruction *CxtI = nullptr);
-
- bool ProcessBranchOnPHI(PHINode *PN);
- bool ProcessBranchOnXOR(BinaryOperator *BO);
- bool ProcessImpliedCondition(BasicBlock *BB);
-
- bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
- bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
- bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
-
- private:
- BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
- const char *Suffix);
- void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB,
- BasicBlock *NewBB, BasicBlock *SuccBB);
+ void releaseMemory() override { Impl.releaseMemory(); }
};
}
char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -184,24 +110,72 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading",
// Public interface to the Jump Threading pass
FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
+JumpThreadingPass::JumpThreadingPass(int T) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+}
+
/// runOnFunction - Top level algorithm.
///
bool JumpThreading::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+ return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
+ std::move(BPI));
+}
+
+PreservedAnalyses JumpThreadingPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+ bool Changed =
+ runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
+
+ // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
+ // solution?
+ AM.invalidate<LazyValueAnalysis>(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ LazyValueInfo *LVI_, bool HasProfileData_,
+ std::unique_ptr<BlockFrequencyInfo> BFI_,
+ std::unique_ptr<BranchProbabilityInfo> BPI_) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- LVI = &getAnalysis<LazyValueInfo>();
+ TLI = TLI_;
+ LVI = LVI_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
- HasProfileData = F.getEntryCount().hasValue();
+ HasProfileData = HasProfileData_;
if (HasProfileData) {
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ BPI = std::move(BPI_);
+ BFI = std::move(BFI_);
}
// Remove unreachable blocks from function as they may result in infinite
@@ -245,10 +219,13 @@ bool JumpThreading::runOnFunction(Function &F) {
// Can't thread an unconditional jump, but if the block is "almost
// empty", we can replace uses of it with uses of the successor and make
// this dead.
+ // We should not eliminate the loop header either, because eliminating
+ // a loop header might later prevent LoopSimplify from transforming nested
+ // loops into simplified form.
if (BI && BI->isUnconditional() &&
BB != &BB->getParent()->getEntryBlock() &&
// If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator()) {
+ BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
// Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
// block, we have to make sure it isn't in the LoopHeaders set. We
// reinsert afterward if needed.
@@ -361,7 +338,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
///
-void JumpThreading::FindLoopHeaders(Function &F) {
+void JumpThreadingPass::FindLoopHeaders(Function &F) {
SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
FindFunctionBackedges(F, Edges);
@@ -395,10 +372,9 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
///
/// This returns true if there were any known values.
///
-bool JumpThreading::
-ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
- ConstantPreference Preference,
- Instruction *CxtI) {
+bool JumpThreadingPass::ComputeValueKnownInPredecessors(
+ Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference, Instruction *CxtI) {
// This method walks up use-def chains recursively. Because of this, we could
// get into an infinite loop going around loops in the use-def chain. To
// prevent this, keep track of what (value, block) pairs we've already visited
@@ -415,7 +391,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
for (BasicBlock *Pred : predecessors(BB))
Result.push_back(std::make_pair(KC, Pred));
- return true;
+ return !Result.empty();
}
// If V is a non-instruction value, or an instruction in a different block,
@@ -465,6 +441,25 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
return !Result.empty();
}
+  // Handle Cast instructions. Only look through a Cast when the source operand
+  // is a PHI or Cmp and the source type is i1, to limit compile time.
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Source = CI->getOperand(0);
+ if (!Source->getType()->isIntegerTy(1))
+ return false;
+ if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
+ return false;
+ ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Convert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
+
+ return true;
+ }
+
PredValueInfoTy LHSVals, RHSVals;
// Handle some boolean conditions.
@@ -705,7 +700,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
/// ProcessBlock - If there are any predecessors whose control can be threaded
/// through to a successor, transform them now.
-bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
if (pred_empty(BB) &&
@@ -889,7 +884,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
return false;
}
-bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
+bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -903,12 +898,17 @@ bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
- if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB)
+ if (!PBI || !PBI->isConditional())
+ return false;
+ if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
return false;
- if (isImpliedCondition(PBI->getCondition(), Cond, DL)) {
- BI->getSuccessor(1)->removePredecessor(BB);
- BranchInst::Create(BI->getSuccessor(0), BI);
+ bool FalseDest = PBI->getSuccessor(1) == CurrentBB;
+ Optional<bool> Implication =
+ isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest);
+ if (Implication) {
+ BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
+ BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
BI->eraseFromParent();
return true;
}
@@ -923,9 +923,9 @@ bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
/// interlaced with other jump threading tasks.
-bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
- // Don't hack volatile/atomic loads.
- if (!LI->isSimple()) return false;
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+ // Don't hack volatile and ordered loads.
+ if (!LI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
@@ -952,10 +952,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
BasicBlock::iterator BBIt(LI);
if (Value *AvailableVal =
- FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) {
+ FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
- //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
@@ -994,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
AAMDNodes ThisAATags;
- Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
+ Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
DefMaxInstsToScan,
nullptr, &ThisAATags);
if (!PredAvailable) {
@@ -1056,9 +1055,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
- LI->getAlignment(),
- UnavailablePred->getTerminator());
+ LoadInst *NewVal =
+ new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
+ LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
+ UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1100,8 +1100,6 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
PN->addIncoming(PredV, I->first);
}
- //cerr << "PRE: " << *LI << *PN << "\n";
-
LI->replaceAllUsesWith(PN);
LI->eraseFromParent();
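For context, a minimal source-level sketch of the partial redundancy this routine removes (illustrative; the function is invented): the load in the join block is already available along the edge that just stored to the pointer, so a compensating load is inserted in the other predecessor (now preserving any atomic ordering of the original) and the load itself becomes a PHI.

    int partiallyRedundant(int *p, bool c) {
      if (c)
        *p = 42;  // the value of *p is known along this edge
      return *p;  // partially redundant: only the other predecessor needs a load
    }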
@@ -1171,9 +1169,9 @@ FindMostPopularDest(BasicBlock *BB,
return MostPopularDest;
}
-bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference,
- Instruction *CxtI) {
+bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+ ConstantPreference Preference,
+ Instruction *CxtI) {
// If threading this would thread across a loop header, don't even try to
// thread the edge.
if (LoopHeaders.count(BB))
@@ -1279,7 +1277,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
/// a PHI node in the current block. See if there are any simplifications we
/// can do based on inputs to the phi node.
///
-bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
+bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
BasicBlock *BB = PN->getParent();
// TODO: We could make use of this to do it once for blocks with common PHI
@@ -1309,7 +1307,7 @@ bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
/// a xor instruction in the current block. See if there are any
/// simplifications we can do based on inputs to the xor.
///
-bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
BasicBlock *BB = BO->getParent();
// If either the LHS or RHS of the xor is a constant, don't do this
@@ -1437,9 +1435,9 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
/// ThreadEdge - We have decided that it is safe and profitable to factor the
/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
/// across BB. Transform the IR to reflect this change.
-bool JumpThreading::ThreadEdge(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock*> &PredBBs,
- BasicBlock *SuccBB) {
+bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
@@ -1593,9 +1591,9 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
/// Create a new basic block that will be the predecessor of BB and successor of
/// all blocks in Preds. When profile data is available, update the frequency of
/// this new block.
-BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
- ArrayRef<BasicBlock *> Preds,
- const char *Suffix) {
+BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
// Collect the frequencies of all predecessors of BB, which will be used to
// update the edge weight on BB->SuccBB.
BlockFrequency PredBBFreq(0);
@@ -1615,10 +1613,10 @@ BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
-void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
- BasicBlock *BB,
- BasicBlock *NewBB,
- BasicBlock *SuccBB) {
+void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
if (!HasProfileData)
return;
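A small worked example of the scaling described above, using assumed numbers rather than real profile data: if Freq(BB->SuccBB) is 200 and the threaded edge PredBB->BB contributes 50, the branch weight on BB->SuccBB is scaled by 1 - 50/200 = 0.75.

    #include <cstdint>
    // scaleSuccWeight(80, 50, 200) == 60, i.e. the old weight scaled by 0.75.
    uint64_t scaleSuccWeight(uint64_t OldWeight, uint64_t FreqPredToBB,
                             uint64_t FreqBBToSucc) {
      return OldWeight - OldWeight * FreqPredToBB / FreqBBToSucc;
    }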
@@ -1679,8 +1677,8 @@ void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
/// If we can duplicate the contents of BB up into PredBB do so now, this
/// improves the odds that the branch will be on an analyzable instruction like
/// a compare.
-bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &PredBBs) {
+bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
assert(!PredBBs.empty() && "Can't handle an empty set");
// If BB is a loop header, then duplicating this block outside the loop would
@@ -1750,13 +1748,18 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
// phi translation.
if (Value *IV =
SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
- delete New;
ValueMapping[&*BI] = IV;
+ if (!New->mayHaveSideEffects()) {
+ delete New;
+ New = nullptr;
+ }
} else {
+ ValueMapping[&*BI] = New;
+ }
+ if (New) {
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
- ValueMapping[&*BI] = New;
}
}
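The reshuffled logic above keeps a duplicated instruction whose value simplifies but which may still have side effects; a hedged source-level analogue (names invented):

    // The multiply folds to 0, but the volatile read it contains is a side
    // effect that must still be emitted when the block is duplicated.
    volatile int Counter;
    int simplifiesButHasSideEffect() {
      int v = Counter * 0; // value is known to be 0; the volatile load remains
      return v;
    }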
@@ -1829,7 +1832,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
///
/// And expand the select into a branch structure if one of its arms allows %c
/// to be folded. This later enables threading from bb1 over bb2.
-bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
@@ -1907,7 +1910,7 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
/// select if the associated PHI has at least one constant. If the unfolded
/// select is not jump-threaded, it will be folded again in the later
/// optimizations.
-bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB))
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 8923ff74253c..2c0a70e44f57 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -30,15 +30,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -56,183 +60,173 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "licm"
-STATISTIC(NumSunk , "Number of instructions sunk out of loop");
-STATISTIC(NumHoisted , "Number of instructions hoisted out of loop");
+STATISTIC(NumSunk, "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
-STATISTIC(NumPromoted , "Number of memory locations promoted to registers");
+STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
static cl::opt<bool>
-DisablePromotion("disable-licm-promotion", cl::Hidden,
- cl::desc("Disable memory promotion in LICM pass"));
+ DisablePromotion("disable-licm-promotion", cl::Hidden,
+ cl::desc("Disable memory promotion in LICM pass"));
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LICMSafetyInfo *SafetyInfo);
-static bool hoist(Instruction &I, BasicBlock *Preheader);
+ const LoopSafetyInfo *SafetyInfo);
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo);
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
const Loop *CurLoop, AliasSetTracker *CurAST,
- const LICMSafetyInfo *SafetyInfo);
-static bool isGuaranteedToExecute(const Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LICMSafetyInfo *SafetyInfo);
+ const LoopSafetyInfo *SafetyInfo);
static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
const DominatorTree *DT,
- const TargetLibraryInfo *TLI,
const Loop *CurLoop,
- const LICMSafetyInfo *SafetyInfo,
+ const LoopSafetyInfo *SafetyInfo,
const Instruction *CtxI = nullptr);
static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
- const AAMDNodes &AAInfo,
+ const AAMDNodes &AAInfo,
AliasSetTracker *CurAST);
static Instruction *
CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
const LoopInfo *LI,
- const LICMSafetyInfo *SafetyInfo);
+ const LoopSafetyInfo *SafetyInfo);
static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
DominatorTree *DT, TargetLibraryInfo *TLI,
Loop *CurLoop, AliasSetTracker *CurAST,
- LICMSafetyInfo *SafetyInfo);
+ LoopSafetyInfo *SafetyInfo);
namespace {
- struct LICM : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
- LICM() : LoopPass(ID) {
- initializeLICMPass(*PassRegistry::getPassRegistry());
- }
+struct LoopInvariantCodeMotion {
+ bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, ScalarEvolution *SE, bool DeleteAST);
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
+ DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
+ return LoopToAliasSetMap;
+ }
+
+private:
+ DenseMap<Loop *, AliasSetTracker *> LoopToAliasSetMap;
- using llvm::Pass::doFinalization;
+ AliasSetTracker *collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA);
+};
+
+struct LegacyLICMPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LegacyLICMPass() : LoopPass(ID) {
+ initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
+ }
- bool doFinalization() override {
- assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
return false;
- }
- private:
- AliasAnalysis *AA; // Current AliasAnalysis information
- LoopInfo *LI; // Current LoopInfo
- DominatorTree *DT; // Dominator Tree for the current Loop.
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ return LICM.runOnLoop(L,
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ SE ? &SE->getSE() : nullptr, false);
+ }
- TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
- // State that is updated as we process loops.
- bool Changed; // Set to true when we change anything.
- BasicBlock *Preheader; // The preheader block of the current loop...
- Loop *CurLoop; // The current loop we are working on...
- AliasSetTracker *CurAST; // AliasSet information for the current loop...
- DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
+ using llvm::Pass::doFinalization;
- /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
- void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
- Loop *L) override;
+ bool doFinalization() override {
+ assert(LICM.getLoopToAliasSetMap().empty() &&
+ "Didn't free loop alias sets");
+ return false;
+ }
- /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
- /// set.
- void deleteAnalysisValue(Value *V, Loop *L) override;
+private:
+ LoopInvariantCodeMotion LICM;
- /// Simple Analysis hook. Delete loop L from alias set map.
- void deleteAnalysisLoop(Loop *L) override;
- };
+ /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) override;
+
+ /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+ /// set.
+ void deleteAnalysisValue(Value *V, Loop *L) override;
+
+ /// Simple Analysis hook. Delete loop L from alias set map.
+ void deleteAnalysisLoop(Loop *L) override;
+};
+}
+
+PreservedAnalyses LICMPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *AA = FAM.getCachedResult<AAManager>(*F);
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
+ auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+ assert((AA && LI && DT && TLI && SE) && "Analyses for LICM not available");
+
+ LoopInvariantCodeMotion LICM;
+
+ if (!LICM.runOnLoop(&L, AA, LI, DT, TLI, SE, true))
+ return PreservedAnalyses::all();
+
+ // FIXME: There is no setPreservesCFG in the new PM. When that becomes
+ // available, it should be used here.
+ return getLoopPassPreservedAnalyses();
}
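For readers skimming the pass-manager plumbing, a minimal source-level reminder of the transform this file implements (plain illustrative C++, independent of either pass manager):

    void scaleAll(int *a, int n, int base, int scale) {
      for (int i = 0; i < n; ++i) {
        int k = base * scale; // loop-invariant: LICM hoists the multiply into
        a[i] *= k;            //   the preheader so it executes once per entry
      }
    }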
-char LICM::ID = 0;
-INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+char LegacyLICMPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
+ false)
-Pass *llvm::createLICMPass() { return new LICM(); }
+Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
/// Hoist expressions out of the specified loop. Note, alias info for inner
/// loop is not preserved so it is not a good idea to run LICM multiple
/// times on one loop.
+/// We should delete the AST for inner loops in the new pass manager to avoid
+/// memory leaks.
///
-bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipOptnoneFunction(L))
- return false;
-
- Changed = false;
-
- // Get our Loop and Alias Analysis information...
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
+ LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI,
+ ScalarEvolution *SE, bool DeleteAST) {
+ bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
- CurAST = new AliasSetTracker(*AA);
- // Collect Alias info from subloops.
- for (Loop *InnerL : L->getSubLoops()) {
- AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL];
- assert(InnerAST && "Where is my AST?");
-
- // What if InnerLoop was modified by other passes ?
- CurAST->add(*InnerAST);
-
- // Once we've incorporated the inner loop's AST into ours, we don't need the
- // subloop's anymore.
- delete InnerAST;
- LoopToAliasSetMap.erase(InnerL);
- }
-
- CurLoop = L;
+ AliasSetTracker *CurAST = collectAliasInfoForLoop(L, LI, AA);
// Get the preheader block to move instructions into...
- Preheader = L->getLoopPreheader();
-
- // Loop over the body of this loop, looking for calls, invokes, and stores.
- // Because subloops have already been incorporated into AST, we skip blocks in
- // subloops.
- //
- for (BasicBlock *BB : L->blocks()) {
- if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops.
- CurAST->add(*BB); // Incorporate the specified basic block
- }
+ BasicBlock *Preheader = L->getLoopPreheader();
// Compute loop safety information.
- LICMSafetyInfo SafetyInfo;
- computeLICMSafetyInfo(&SafetyInfo, CurLoop);
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, L);
// We want to visit all of the instructions in this loop... that are not parts
// of our subloops (they have already had their invariants hoisted out of
@@ -245,11 +239,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// instructions, we perform another pass to hoist them out of the loop.
//
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop,
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
CurAST, &SafetyInfo);
if (Preheader)
- Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI,
- CurLoop, CurAST, &SafetyInfo);
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ CurAST, &SafetyInfo);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
@@ -260,9 +254,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Loop over all of the alias sets in the tracker object.
for (AliasSet &AS : *CurAST)
- Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts,
- PIC, LI, DT, CurLoop,
- CurAST, &SafetyInfo);
+ Changed |= promoteLoopAccessesToScalars(
+ AS, ExitBlocks, InsertPts, PIC, LI, DT, TLI, L, CurAST, &SafetyInfo);
// Once we have promoted values across the loop body we have to recursively
// reform LCSSA as any nested loop may now have values defined within the
@@ -271,8 +264,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// SSAUpdater strategy during promotion that was LCSSA aware and reformed
// it as it went.
if (Changed) {
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr);
+ formLCSSARecursively(*L, *DT, LI, SE);
}
}
@@ -283,50 +275,49 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
"Parent loop not left in LCSSA form after LICM!");
- // Clear out loops state information for the next iteration
- CurLoop = nullptr;
- Preheader = nullptr;
-
// If this loop is nested inside of another one, save the alias information
// for when we process the outer loop.
- if (L->getParentLoop())
+ if (L->getParentLoop() && !DeleteAST)
LoopToAliasSetMap[L] = CurAST;
else
delete CurAST;
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(L);
return Changed;
}
/// Walk the specified region of the CFG (defined by all blocks dominated by
-/// the specified block, and that are in the current loop) in reverse depth
+/// the specified block, and that are in the current loop) in reverse depth
/// first order w.r.t the DominatorTree. This allows us to visit uses before
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
// Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr &&
- DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
- SafetyInfo != nullptr && "Unexpected input to sinkRegion");
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to sinkRegion");
- // Set changed as false.
- bool Changed = false;
- // Get basic block
BasicBlock *BB = N->getBlock();
// If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB)) return Changed;
+ if (!CurLoop->contains(BB))
+ return false;
// We are processing blocks in reverse dfo, so process children first.
- const std::vector<DomTreeNode*> &Children = N->getChildren();
+ bool Changed = false;
+ const std::vector<DomTreeNode *> &Children = N->getChildren();
for (DomTreeNode *Child : Children)
Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (inSubLoop(BB,CurLoop,LI)) return Changed;
+ if (inSubLoop(BB, CurLoop, LI))
+ return Changed;
- for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
Instruction &I = *--II;
// If the instruction is dead, we would try to sink it because it isn't used
@@ -361,21 +352,23 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
///
bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
// Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr &&
- DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
- SafetyInfo != nullptr && "Unexpected input to hoistRegion");
- // Set changed as false.
- bool Changed = false;
- // Get basic block
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to hoistRegion");
+
BasicBlock *BB = N->getBlock();
+
// If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB)) return Changed;
+ if (!CurLoop->contains(BB))
+ return false;
+
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
+ bool Changed = false;
if (!inSubLoop(BB, CurLoop, LI))
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
Instruction &I = *II++;
// Try constant folding this instruction. If all the operands are
// constants, it is technically hoistable, but it would be better to just
@@ -396,12 +389,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
//
if (CurLoop->hasLoopInvariantOperands(&I) &&
canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) &&
- isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
- CurLoop->getLoopPreheader()->getTerminator()))
- Changed |= hoist(I, CurLoop->getLoopPreheader());
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo,
+ CurLoop->getLoopPreheader()->getTerminator()))
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo);
}
- const std::vector<DomTreeNode*> &Children = N->getChildren();
+ const std::vector<DomTreeNode *> &Children = N->getChildren();
for (DomTreeNode *Child : Children)
Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
return Changed;
@@ -410,7 +404,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
/// Computes loop safety information, checks loop body & header
/// for the possibility of may throw exception.
///
-void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
assert(CurLoop != nullptr && "CurLoop can't be null");
BasicBlock *Header = CurLoop->getHeader();
// Setting default safety values.
@@ -419,15 +413,17 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
// Iterate over header and compute safety info.
for (BasicBlock::iterator I = Header->begin(), E = Header->end();
(I != E) && !SafetyInfo->HeaderMayThrow; ++I)
- SafetyInfo->HeaderMayThrow |= I->mayThrow();
-
+ SafetyInfo->HeaderMayThrow |=
+ !isGuaranteedToTransferExecutionToSuccessor(&*I);
+
SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
- // Iterate over loop instructions and compute safety info.
- for (Loop::block_iterator BB = CurLoop->block_begin(),
- BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB)
+ // Iterate over loop instructions and compute safety info.
+ for (Loop::block_iterator BB = CurLoop->block_begin(),
+ BBE = CurLoop->block_end();
+ (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
(I != E) && !SafetyInfo->MayThrow; ++I)
- SafetyInfo->MayThrow |= I->mayThrow();
+ SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
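The switch above from mayThrow() to !isGuaranteedToTransferExecutionToSuccessor() is deliberately broader: a call can be non-throwing and still never return. A hedged source-level example (the callee is hypothetical):

    #include <cstdlib>
    void maybeExit(int i) noexcept {
      if (i == 42)
        std::exit(0); // never throws, but does not reach its successor
    }
    void body(int *p, int n) {
      for (int i = 0; i < n; ++i) {
        maybeExit(i); // may not transfer execution to the next instruction
        *p = i;       // therefore not guaranteed to execute on every iteration
      }
    }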
// Compute funclet colors if we might sink/hoist in a function with a funclet
// personality routine.
@@ -443,11 +439,11 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
///
bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
- return false; // Don't hoist volatile/atomic loads!
+ return false; // Don't hoist volatile/atomic loads!
// Loads from constant memory are always safe to move, even if they end up
// in the same alias set as something that ends up being modified.
@@ -499,7 +495,8 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
break;
}
}
- if (!FoundMod) return true;
+ if (!FoundMod)
+ return true;
}
// FIXME: This should use mod/ref information to see if we can hoist or
@@ -518,9 +515,8 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
// TODO: Plumb the context instruction through to make hoisting and sinking
// more powerful. Hoisting of loads already works due to the special casing
- // above.
- return isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
- nullptr);
+ // above.
+ return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo, nullptr);
}
/// Returns true if a PHINode is trivially replaceable with an
@@ -541,7 +537,7 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
/// blocks of the loop.
///
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LICMSafetyInfo *SafetyInfo) {
+ const LoopSafetyInfo *SafetyInfo) {
const auto &BlockColors = SafetyInfo->BlockColors;
for (const User *U : I.users()) {
const Instruction *UI = cast<Instruction>(U);
@@ -588,7 +584,7 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
static Instruction *
CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
const LoopInfo *LI,
- const LICMSafetyInfo *SafetyInfo) {
+ const LoopSafetyInfo *SafetyInfo) {
Instruction *New;
if (auto *CI = dyn_cast<CallInst>(&I)) {
const auto &BlockColors = SafetyInfo->BlockColors;
@@ -621,7 +617,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
}
ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
- if (!I.getName().empty()) New->setName(I.getName() + ".le");
+ if (!I.getName().empty())
+ New->setName(I.getName() + ".le");
// Build LCSSA PHI nodes for any in-loop operands. Note that this is
// particularly cheap because we can rip off the PHI node that we're
@@ -652,18 +649,20 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
///
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
const Loop *CurLoop, AliasSetTracker *CurAST,
- const LICMSafetyInfo *SafetyInfo) {
+ const LoopSafetyInfo *SafetyInfo) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
bool Changed = false;
- if (isa<LoadInst>(I)) ++NumMovedLoads;
- else if (isa<CallInst>(I)) ++NumMovedCalls;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
++NumSunk;
Changed = true;
#ifndef NDEBUG
SmallVector<BasicBlock *, 32> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
ExitBlocks.end());
#endif
@@ -717,18 +716,30 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
/// When an instruction is found to only use loop invariant operands that
/// is safe to hoist, this instruction is called to do the dirty work.
///
-static bool hoist(Instruction &I, BasicBlock *Preheader) {
- DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
- << I << "\n");
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ auto *Preheader = CurLoop->getLoopPreheader();
+ DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
+
+ // Metadata can be dependent on conditions we are hoisting above.
+ // Conservatively strip all metadata on the instruction unless we were
+ // guaranteed to execute I if we entered the loop, in which case the metadata
+ // is valid in the loop preheader.
+ if (I.hasMetadataOtherThanDebugLoc() &&
+ // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
+ // time in isGuaranteedToExecute if we don't actually have anything to
+ // drop. It is a compile time optimization, not required for correctness.
+ !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo))
+ I.dropUnknownNonDebugMetadata();
+
// Move the new node to the Preheader, before its terminator.
I.moveBefore(Preheader->getTerminator());
- // Metadata can be dependent on the condition we are hoisting above.
- // Conservatively strip all metadata on the instruction.
- I.dropUnknownNonDebugMetadata();
-
- if (isa<LoadInst>(I)) ++NumMovedLoads;
- else if (isa<CallInst>(I)) ++NumMovedCalls;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
++NumHoisted;
return true;
}
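The refined rule above strips metadata only when the instruction was not guaranteed to execute inside the loop. A rough analogue of why that matters (illustrative; the "metadata" stands for facts such as !nonnull or !range derived at the original program point):

    // If the load of *q were hoisted into the preheader, an annotation derived
    // under the guard (e.g. "q is non-null here") would no longer be justified,
    // because the preheader runs even when 'ok' never becomes true.
    int Sink;
    void guardedLoad(int *q, int n, bool ok) {
      for (int i = 0; i < n; ++i)
        if (ok)
          Sink = *q; // conditional: any such metadata must be dropped on hoist
    }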
@@ -736,134 +747,91 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) {
/// Only sink or hoist an instruction if it is not a trapping instruction,
/// or if the instruction is known not to trap when moved to the preheader,
/// or if it is a trapping instruction and is guaranteed to execute.
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
const DominatorTree *DT,
- const TargetLibraryInfo *TLI,
const Loop *CurLoop,
- const LICMSafetyInfo *SafetyInfo,
+ const LoopSafetyInfo *SafetyInfo,
const Instruction *CtxI) {
- if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI))
+ if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
return true;
return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
}
-static bool isGuaranteedToExecute(const Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LICMSafetyInfo * SafetyInfo) {
-
- // We have to check to make sure that the instruction dominates all
- // of the exit blocks. If it doesn't, then there is a path out of the loop
- // which does not execute this instruction, so we can't hoist it.
-
- // If the instruction is in the header block for the loop (which is very
- // common), it is always guaranteed to dominate the exit blocks. Since this
- // is a common case, and can save some work, check it now.
- if (Inst.getParent() == CurLoop->getHeader())
- // If there's a throw in the header block, we can't guarantee we'll reach
- // Inst.
- return !SafetyInfo->HeaderMayThrow;
-
- // Somewhere in this loop there is an instruction which may throw and make us
- // exit the loop.
- if (SafetyInfo->MayThrow)
- return false;
-
- // Get the exit blocks for the current loop.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- CurLoop->getExitBlocks(ExitBlocks);
-
- // Verify that the block dominates each of the exit blocks of the loop.
- for (BasicBlock *ExitBlock : ExitBlocks)
- if (!DT->dominates(Inst.getParent(), ExitBlock))
- return false;
-
- // As a degenerate case, if the loop is statically infinite then we haven't
- // proven anything since there are no exit blocks.
- if (ExitBlocks.empty())
- return false;
-
- return true;
-}
-
namespace {
- class LoopPromoter : public LoadAndStorePromoter {
- Value *SomePtr; // Designated pointer to store to.
- SmallPtrSetImpl<Value*> &PointerMustAliases;
- SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
- SmallVectorImpl<Instruction*> &LoopInsertPts;
- PredIteratorCache &PredCache;
- AliasSetTracker &AST;
- LoopInfo &LI;
- DebugLoc DL;
- int Alignment;
- AAMDNodes AATags;
-
- Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (Loop *L = LI.getLoopFor(I->getParent()))
- if (!L->contains(BB)) {
- // We need to create an LCSSA PHI node for the incoming value and
- // store that.
- PHINode *PN =
- PHINode::Create(I->getType(), PredCache.size(BB),
- I->getName() + ".lcssa", &BB->front());
- for (BasicBlock *Pred : PredCache.get(BB))
- PN->addIncoming(I, Pred);
- return PN;
- }
- return V;
- }
+class LoopPromoter : public LoadAndStorePromoter {
+ Value *SomePtr; // Designated pointer to store to.
+ SmallPtrSetImpl<Value *> &PointerMustAliases;
+ SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
+ SmallVectorImpl<Instruction *> &LoopInsertPts;
+ PredIteratorCache &PredCache;
+ AliasSetTracker &AST;
+ LoopInfo &LI;
+ DebugLoc DL;
+ int Alignment;
+ AAMDNodes AATags;
- public:
- LoopPromoter(Value *SP,
- ArrayRef<const Instruction *> Insts,
- SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA,
- SmallVectorImpl<BasicBlock *> &LEB,
- SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
- AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- const AAMDNodes &AATags)
- : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
- LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(dl), Alignment(alignment), AATags(AATags) {}
-
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction*> &) const override {
- Value *Ptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- Ptr = LI->getOperand(0);
- else
- Ptr = cast<StoreInst>(I)->getPointerOperand();
- return PointerMustAliases.count(Ptr);
- }
+ Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Loop *L = LI.getLoopFor(I->getParent()))
+ if (!L->contains(BB)) {
+ // We need to create an LCSSA PHI node for the incoming value and
+ // store that.
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
+ I->getName() + ".lcssa", &BB->front());
+ for (BasicBlock *Pred : PredCache.get(BB))
+ PN->addIncoming(I, Pred);
+ return PN;
+ }
+ return V;
+ }
- void doExtraRewritesBeforeFinalDeletion() const override {
- // Insert stores after in the loop exit blocks. Each exit block gets a
- // store of the live-out values that feed them. Since we've already told
- // the SSA updater about the defs in the loop and the preheader
- // definition, it is all set and we can start using it.
- for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = LoopExitBlocks[i];
- Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
- Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
- Instruction *InsertPos = LoopInsertPts[i];
- StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
- NewSI->setAlignment(Alignment);
- NewSI->setDebugLoc(DL);
- if (AATags) NewSI->setAAMetadata(AATags);
- }
- }
+public:
+ LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+ SmallPtrSetImpl<Value *> &PMA,
+ SmallVectorImpl<BasicBlock *> &LEB,
+ SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+ AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
+ const AAMDNodes &AATags)
+ : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+ LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+ LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+
+ bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &) const override {
+ Value *Ptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+ return PointerMustAliases.count(Ptr);
+ }
- void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
- // Update alias analysis.
- AST.copyValue(LI, V);
+ void doExtraRewritesBeforeFinalDeletion() const override {
+ // Insert stores in the loop exit blocks. Each exit block gets a
+ // store of the live-out values that feed them. Since we've already told
+ // the SSA updater about the defs in the loop and the preheader
+ // definition, it is all set and we can start using it.
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = LoopExitBlocks[i];
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+ Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+ Instruction *InsertPos = LoopInsertPts[i];
+ StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ NewSI->setAlignment(Alignment);
+ NewSI->setDebugLoc(DL);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
}
- void instructionDeleted(Instruction *I) const override {
- AST.deleteValue(I);
- }
- };
+ }
+
+ void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+ // Update alias analysis.
+ AST.copyValue(LI, V);
+ }
+ void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
+};
} // end anon namespace
/// Try to promote memory values to scalars by sinking stores out of the
@@ -871,32 +839,28 @@ namespace {
/// the stores in the loop, looking for stores to Must pointers which are
/// loop invariant.
///
-bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
- SmallVectorImpl<BasicBlock*>&ExitBlocks,
- SmallVectorImpl<Instruction*>&InsertPts,
- PredIteratorCache &PIC, LoopInfo *LI,
- DominatorTree *DT, Loop *CurLoop,
- AliasSetTracker *CurAST,
- LICMSafetyInfo * SafetyInfo) {
+bool llvm::promoteLoopAccessesToScalars(
+ AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
+ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
// Verify inputs.
- assert(LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && CurAST != nullptr &&
- SafetyInfo != nullptr &&
+ assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
+ CurAST != nullptr && SafetyInfo != nullptr &&
"Unexpected Input to promoteLoopAccessesToScalars");
- // Initially set Changed status to false.
- bool Changed = false;
+
// We can promote this alias set if it has a store, if it is a "Must" alias
// set, if the pointer is loop invariant, and if we are not eliminating any
// volatile loads or stores.
if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
- return Changed;
+ return false;
assert(!AS.empty() &&
"Must alias set should have at least one pointer element in it!");
Value *SomePtr = AS.begin()->getValue();
- BasicBlock * Preheader = CurLoop->getLoopPreheader();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
// It isn't safe to promote a load/store from the loop if the load/store is
// conditional. For example, turning:
@@ -909,12 +873,27 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
//
// is not safe, because *P may only be valid to access if 'c' is true.
//
+ // The safety property divides into two parts:
+ // 1) The memory may not be dereferenceable on entry to the loop. In this
+ // case, we can't insert the required load in the preheader.
+ // 2) The memory model does not allow us to insert a store along any dynamic
+ // path which did not originally have one.
+ //
// It is safe to promote P if all uses are direct load/stores and if at
// least one is guaranteed to be executed.
bool GuaranteedToExecute = false;
- SmallVector<Instruction*, 64> LoopUses;
- SmallPtrSet<Value*, 4> PointerMustAliases;
+ // It is also safe to promote P if we can prove that speculating a load into
+ // the preheader is safe (i.e. proving dereferenceability on all
+ // paths through the loop), and that the memory can be proven thread local
+ // (so that the memory model requirement doesn't apply.) We first establish
+ // the former, and then run a capture analysis below to establish the latter.
+ // We can use any access within the alias set to prove dereferenceability
+ // since they're all must alias.
+ bool CanSpeculateLoad = false;
+
+ SmallVector<Instruction *, 64> LoopUses;
+ SmallPtrSet<Value *, 4> PointerMustAliases;
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
@@ -922,11 +901,32 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
AAMDNodes AATags;
bool HasDedicatedExits = CurLoop->hasDedicatedExits();
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify,
+ // so make sure we catch that. An additional load may be generated in the
+ // preheader for SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!HasDedicatedExits || !Preheader)
+ return false;
+
+ const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
+ if (SafetyInfo->MayThrow) {
+ // If a loop can throw, we have to insert a store along each unwind edge.
+ // That said, we can't actually make the unwind edge explicit. Therefore,
+ // we have to prove that the store is dead along the unwind edge.
+ //
+ // Currently, this code just special-cases alloca instructions.
+ if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL)))
+ return false;
+ }
+
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes. While we are at it, collect alignment and AA info.
- for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
- Value *ASIV = ASI->getValue();
+ bool Changed = false;
+ for (const auto &ASI : AS) {
+ Value *ASIV = ASI.getValue();
PointerMustAliases.insert(ASIV);
// Check that all of the pointers in the alias set have the same type. We
@@ -947,6 +947,10 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
assert(!Load->isVolatile() && "AST broken");
if (!Load->isSimple())
return Changed;
+
+ if (!GuaranteedToExecute && !CanSpeculateLoad)
+ CanSpeculateLoad = isSafeToExecuteUnconditionally(
+ *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator());
} else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
// Stores *of* the pointer are not interesting, only stores *to* the
// pointer.
@@ -955,13 +959,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
assert(!Store->isVolatile() && "AST broken");
if (!Store->isSimple())
return Changed;
- // Don't sink stores from loops without dedicated block exits. Exits
- // containing indirect branches are not transformed by loop simplify,
- // make sure we catch that. An additional load may be generated in the
- // preheader for SSA updater, so also avoid sinking when no preheader
- // is available.
- if (!HasDedicatedExits || !Preheader)
- return Changed;
// Note that we only check GuaranteedToExecute inside the store case
// so that we do not introduce stores where they did not exist before
@@ -972,16 +969,22 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
// instruction will be executed, update the alignment.
// Larger is better, with the exception of 0 being the best alignment.
unsigned InstAlignment = Store->getAlignment();
- if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
+ if ((InstAlignment > Alignment || InstAlignment == 0) &&
+ Alignment != 0) {
if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
GuaranteedToExecute = true;
Alignment = InstAlignment;
}
+ } else if (!GuaranteedToExecute) {
+ GuaranteedToExecute =
+ isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo);
+ }
- if (!GuaranteedToExecute)
- GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
- CurLoop, SafetyInfo);
-
+ if (!GuaranteedToExecute && !CanSpeculateLoad) {
+ CanSpeculateLoad = isDereferenceableAndAlignedPointer(
+ Store->getPointerOperand(), Store->getAlignment(), MDL,
+ Preheader->getTerminator(), DT);
+ }
} else
return Changed; // Not a load or store.
@@ -997,8 +1000,17 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
}
}
- // If there isn't a guaranteed-to-execute instruction, we can't promote.
- if (!GuaranteedToExecute)
+ // Check legality per comment above. Otherwise, we can't promote.
+ bool PromotionIsLegal = GuaranteedToExecute;
+ if (!PromotionIsLegal && CanSpeculateLoad) {
+ // If this is a thread local location, then we can insert stores along
+ // paths which originally didn't have them without violating the memory
+ // model.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ PromotionIsLegal =
+ isAllocLikeFn(Object, TLI) && !PointerMayBeCaptured(Object, true, true);
+ }
+ if (!PromotionIsLegal)
return Changed;
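A compact source-level illustration of the two legality paths above (illustrative only): a conditional store to possibly shared memory cannot be promoted, because promotion would introduce a store on paths that never had one, while the same pattern on a provably thread-local, dereferenceable object is fine.

    void noPromotion(int *p, int n, bool c) {
      for (int i = 0; i < n; ++i)
        if (c)
          *p += 1; // store not guaranteed to execute and *p may be shared
    }

    int promotable(int n, bool c) {
      int local = 0; // alloca-like, never captured: thread-local
      for (int i = 0; i < n; ++i)
        if (c)
          local += 1; // safe to keep in a register and store back once
      return local;
    }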
// Figure out the loop exits and their insertion points, if this is the
@@ -1017,7 +1029,8 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
return Changed;
// Otherwise, this is safe to promote, lets do it!
- DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+ DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
Changed = true;
++NumPromoted;
@@ -1028,20 +1041,19 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
DebugLoc DL = LoopUses[0]->getDebugLoc();
// We use the SSAUpdater interface to insert phi nodes as required.
- SmallVector<PHINode*, 16> NewPHIs;
+ SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
- LoopPromoter Promoter(SomePtr, LoopUses, SSA,
- PointerMustAliases, ExitBlocks,
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
- LoadInst *PreheaderLoad =
- new LoadInst(SomePtr, SomePtr->getName()+".promoted",
- Preheader->getTerminator());
+ LoadInst *PreheaderLoad = new LoadInst(
+ SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
- if (AATags) PreheaderLoad->setAAMetadata(AATags);
+ if (AATags)
+ PreheaderLoad->setAAMetadata(AATags);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
// Rewrite all the loads in the loop and remember all the definitions from
@@ -1055,10 +1067,67 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
return Changed;
}
+/// Returns an owning pointer to an alias set which incorporates aliasing info
+/// from L and all subloops of L.
+/// FIXME: In the new pass manager there are no helper functions to handle loop
+/// analysis such as cloneBasicBlockAnalysis, so the AST needs to be recomputed
+/// from scratch for every loop. Hook up with the helper functions when
+/// available in the new pass manager to avoid redundant computation.
+AliasSetTracker *
+LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA) {
+ AliasSetTracker *CurAST = nullptr;
+ SmallVector<Loop *, 4> RecomputeLoops;
+ for (Loop *InnerL : L->getSubLoops()) {
+ auto MapI = LoopToAliasSetMap.find(InnerL);
+ // If the AST for this inner loop is missing it may have been merged into
+ // some other loop's AST and then that loop unrolled, and so we need to
+ // recompute it.
+ if (MapI == LoopToAliasSetMap.end()) {
+ RecomputeLoops.push_back(InnerL);
+ continue;
+ }
+ AliasSetTracker *InnerAST = MapI->second;
+
+ if (CurAST != nullptr) {
+ // What if InnerLoop was modified by other passes ?
+ CurAST->add(*InnerAST);
+
+ // Once we've incorporated the inner loop's AST into ours, we don't need
+ // the subloop's anymore.
+ delete InnerAST;
+ } else {
+ CurAST = InnerAST;
+ }
+ LoopToAliasSetMap.erase(MapI);
+ }
+ if (CurAST == nullptr)
+ CurAST = new AliasSetTracker(*AA);
+
+ auto mergeLoop = [&](Loop *L) {
+ // Loop over the body of this loop, looking for calls, invokes, and stores.
+ // Because subloops have already been incorporated into AST, we skip blocks
+ // in subloops.
+ for (BasicBlock *BB : L->blocks())
+ if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops.
+ CurAST->add(*BB); // Incorporate the specified basic block
+ };
+
+ // Add everything from the sub loops that are no longer directly available.
+ for (Loop *InnerL : RecomputeLoops)
+ mergeLoop(InnerL);
+
+ // And merge in this loop.
+ mergeLoop(L);
+
+ return CurAST;
+}
+
/// Simple analysis hook. Clone alias set info.
///
-void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
- AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+void LegacyLICMPass::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
if (!AST)
return;
@@ -1067,8 +1136,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
/// Simple Analysis hook. Delete value V from alias set
///
-void LICM::deleteAnalysisValue(Value *V, Loop *L) {
- AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+void LegacyLICMPass::deleteAnalysisValue(Value *V, Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
if (!AST)
return;
@@ -1077,21 +1146,20 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) {
/// Simple Analysis hook. Delete value L from alias set map.
///
-void LICM::deleteAnalysisLoop(Loop *L) {
- AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+void LegacyLICMPass::deleteAnalysisLoop(Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
if (!AST)
return;
delete AST;
- LoopToAliasSetMap.erase(L);
+ LICM.getLoopToAliasSetMap().erase(L);
}
-
/// Return true if the body of this loop may store into the memory
/// location pointed to by V.
///
static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
- const AAMDNodes &AAInfo,
+ const AAMDNodes &AAInfo,
AliasSetTracker *CurAST) {
// Check to see if any of the basic blocks in CurLoop invalidate *V.
return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
@@ -1104,4 +1172,3 @@ static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
return LI->getLoopFor(BB) != CurLoop;
}
-
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 1648878b0628..dfe51a4ce44c 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -35,10 +35,12 @@ using namespace llvm;
STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
STATISTIC(NumLoadsCombined, "Number of loads combined");
+#define LDCOMBINE_NAME "Combine Adjacent Loads"
+
namespace {
struct PointerOffsetPair {
Value *Pointer;
- uint64_t Offset;
+ APInt Offset;
};
struct LoadPOPPair {
@@ -63,12 +65,16 @@ public:
using llvm::Pass::doInitialization;
bool doInitialization(Function &) override;
bool runOnBasicBlock(BasicBlock &BB) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
- const char *getPassName() const override { return "LoadCombine"; }
+ const char *getPassName() const override { return LDCOMBINE_NAME; }
static char ID;
- typedef IRBuilder<true, TargetFolder> BuilderTy;
+ typedef IRBuilder<TargetFolder> BuilderTy;
private:
BuilderTy *Builder;
@@ -87,22 +93,25 @@ bool LoadCombine::doInitialization(Function &F) {
}
PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
+ auto &DL = LI.getModule()->getDataLayout();
+
PointerOffsetPair POP;
POP.Pointer = LI.getPointerOperand();
- POP.Offset = 0;
+ unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace());
+ POP.Offset = APInt(BitWidth, 0);
+
while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
- auto &DL = LI.getModule()->getDataLayout();
- unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType());
- APInt Offset(BitWidth, 0);
- if (GEP->accumulateConstantOffset(DL, Offset))
- POP.Offset += Offset.getZExtValue();
- else
+ APInt LastOffset = POP.Offset;
+ if (!GEP->accumulateConstantOffset(DL, POP.Offset)) {
// Can't handle GEPs with variable indices.
+ POP.Offset = LastOffset;
return POP;
+ }
POP.Pointer = GEP->getPointerOperand();
- } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer))
+ } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) {
POP.Pointer = BC->getOperand(0);
+ }
}
return POP;
}
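The move from uint64_t offsets to a pointer-width APInt compared with slt/sgt matters for negative GEP offsets; a standalone illustration of the wraparound the old code was exposed to (plain C++ with assumed values, not APInt itself):

    #include <cstdint>
    // The byte offset of p[-1], stored unsigned, wraps to 2^64 - 4 and sorts
    // *after* p[0]; a signed comparison gives the intended order.
    bool unsignedOrderIsWrong() {
      uint64_t offA = static_cast<uint64_t>(-4); // p[-1] as a raw byte offset
      uint64_t offB = 0;                         // p[0]
      return offA < offB;                        // false -- wrong order
    }
    bool signedOrderIsRight() {
      int64_t offA = -4, offB = 0;
      return offA < offB;                        // true -- intended order
    }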
@@ -115,8 +124,8 @@ bool LoadCombine::combineLoads(
continue;
std::sort(Loads.second.begin(), Loads.second.end(),
[](const LoadPOPPair &A, const LoadPOPPair &B) {
- return A.POP.Offset < B.POP.Offset;
- });
+ return A.POP.Offset.slt(B.POP.Offset);
+ });
if (aggregateLoads(Loads.second))
Combined = true;
}
@@ -132,28 +141,31 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
LoadInst *BaseLoad = nullptr;
SmallVector<LoadPOPPair, 8> AggregateLoads;
bool Combined = false;
- uint64_t PrevOffset = -1ull;
+ bool ValidPrevOffset = false;
+ APInt PrevOffset;
uint64_t PrevSize = 0;
for (auto &L : Loads) {
- if (PrevOffset == -1ull) {
+ if (ValidPrevOffset == false) {
BaseLoad = L.Load;
PrevOffset = L.POP.Offset;
PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
L.Load->getType());
AggregateLoads.push_back(L);
+ ValidPrevOffset = true;
continue;
}
if (L.Load->getAlignment() > BaseLoad->getAlignment())
continue;
- if (L.POP.Offset > PrevOffset + PrevSize) {
+ APInt PrevEnd = PrevOffset + PrevSize;
+ if (L.POP.Offset.sgt(PrevEnd)) {
// No other load will be combinable
if (combineLoads(AggregateLoads))
Combined = true;
AggregateLoads.clear();
- PrevOffset = -1;
+ ValidPrevOffset = false;
continue;
}
- if (L.POP.Offset != PrevOffset + PrevSize)
+ if (L.POP.Offset != PrevEnd)
// This load starts before the end of the last load, i.e. the two overlap.
// FIXME: We may want to handle this case.
continue;
@@ -199,7 +211,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
Value *Ptr = Builder->CreateConstGEP1_64(
Builder->CreatePointerCast(Loads[0].POP.Pointer,
Builder->getInt8PtrTy(AddressSpace)),
- Loads[0].POP.Offset);
+ Loads[0].POP.Offset.getSExtValue());
LoadInst *NewLoad = new LoadInst(
Builder->CreatePointerCast(
Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize),
@@ -212,7 +224,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
Value *V = Builder->CreateExtractInteger(
L.Load->getModule()->getDataLayout(), NewLoad,
cast<IntegerType>(L.Load->getType()),
- L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
+ (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract");
L.Load->replaceAllUsesWith(V);
}
@@ -221,12 +233,12 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
}
bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
- if (skipOptnoneFunction(BB))
+ if (skipBasicBlock(BB))
return false;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- IRBuilder<true, TargetFolder> TheBuilder(
+ IRBuilder<TargetFolder> TheBuilder(
BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
Builder = &TheBuilder;
@@ -260,23 +272,12 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
return Combined;
}
-void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
-
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
char LoadCombine::ID = 0;
BasicBlockPass *llvm::createLoadCombinePass() {
return new LoadCombine();
}
-INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",
- false, false)
+INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",
- false, false)
-
+INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
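
For reference, a minimal standalone sketch of the offset walk that getPointerOffsetPair() now performs with APInt arithmetic. This is not taken from the patch: it assumes using namespace llvm plus the usual headers (llvm/IR/Instructions.h, llvm/IR/DataLayout.h, utility), and it accumulates into a temporary offset instead of restoring the previous one on failure.

    // Strip bitcasts and constant GEPs from Ptr, accumulating the byte offset.
    static std::pair<Value *, APInt> stripConstantOffsets(Value *Ptr,
                                                          const DataLayout &DL) {
      unsigned BitWidth =
          DL.getPointerSizeInBits(Ptr->getType()->getPointerAddressSpace());
      APInt Offset(BitWidth, 0);
      for (;;) {
        if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
          APInt GEPOffset(BitWidth, 0);
          // Give up on GEPs with variable indices, exactly as the pass does.
          if (!GEP->accumulateConstantOffset(DL, GEPOffset))
            break;
          Offset += GEPOffset;
          Ptr = GEP->getPointerOperand();
        } else if (auto *BC = dyn_cast<BitCastInst>(Ptr)) {
          Ptr = BC->getOperand(0); // bitcasts do not change the address
        } else {
          break;
        }
      }
      return {Ptr, Offset};
    }
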
diff --git a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index e3a35d5df358..66b59d27dfde 100644
--- a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
+++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -1,4 +1,4 @@
-//===-------- PPCLoopDataPrefetch.cpp - Loop Data Prefetching Pass --------===//
+//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "ppc-loop-data-prefetch"
-#include "PPC.h"
+#define DEBUG_TYPE "loop-data-prefetch"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Statistic.h"
@@ -27,6 +26,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -41,30 +41,35 @@ using namespace llvm;
// By default, we limit this to creating 16 PHIs (which is a little over half
// of the allocatable register set).
static cl::opt<bool>
-PrefetchWrites("ppc-loop-prefetch-writes", cl::Hidden, cl::init(false),
+PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
cl::desc("Prefetch write addresses"));
-// This seems like a reasonable default for the BG/Q (this pass is enabled, by
-// default, only on the BG/Q).
static cl::opt<unsigned>
-PrefDist("ppc-loop-prefetch-distance", cl::Hidden, cl::init(300),
- cl::desc("The loop prefetch distance"));
+ PrefetchDistance("prefetch-distance",
+ cl::desc("Number of instructions to prefetch ahead"),
+ cl::Hidden);
static cl::opt<unsigned>
-CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
- cl::desc("The loop prefetch cache line size"));
+ MinPrefetchStride("min-prefetch-stride",
+ cl::desc("Min stride to add prefetches"), cl::Hidden);
+
+static cl::opt<unsigned> MaxPrefetchIterationsAhead(
+ "max-prefetch-iters-ahead",
+ cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
+
+STATISTIC(NumPrefetches, "Number of prefetches inserted");
namespace llvm {
- void initializePPCLoopDataPrefetchPass(PassRegistry&);
+ void initializeLoopDataPrefetchPass(PassRegistry&);
}
namespace {
- class PPCLoopDataPrefetch : public FunctionPass {
+ class LoopDataPrefetch : public FunctionPass {
public:
static char ID; // Pass ID, replacement for typeid
- PPCLoopDataPrefetch() : FunctionPass(ID) {
- initializePPCLoopDataPrefetchPass(*PassRegistry::getPassRegistry());
+ LoopDataPrefetch() : FunctionPass(ID) {
+ initializeLoopDataPrefetchPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -80,9 +85,32 @@ namespace {
}
bool runOnFunction(Function &F) override;
- bool runOnLoop(Loop *L);
private:
+ bool runOnLoop(Loop *L);
+
+ /// \brief Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
+ unsigned getMinPrefetchStride() {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride();
+ }
+
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
+
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
+
AssumptionCache *AC;
LoopInfo *LI;
ScalarEvolution *SE;
@@ -91,35 +119,61 @@ namespace {
};
}
-char PPCLoopDataPrefetch::ID = 0;
-INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
- "PPC Loop Data Prefetch", false, false)
+char LoopDataPrefetch::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetch, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
- "PPC Loop Data Prefetch", false, false)
+INITIALIZE_PASS_END(LoopDataPrefetch, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
-FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPrefetch(); }
+FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
+
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+ unsigned TargetMinStride = getMinPrefetchStride();
+ // No need to check if any stride is acceptable.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If MinStride is set, don't prefetch unless we can ensure that stride is
+ // larger.
+ if (!ConstStride)
+ return false;
+
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
+bool LoopDataPrefetch::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
-bool PPCLoopDataPrefetch::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DL = &F.getParent()->getDataLayout();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ // If PrefetchDistance is not set, don't run the pass. This gives an
+ // opportunity for targets to run this pass for selected subtargets only
+ // (whose TTI sets PrefetchDistance).
+ if (getPrefetchDistance() == 0)
+ return false;
+ assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
+
bool MadeChange = false;
- for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I)
- for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L)
+ for (Loop *I : *LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
MadeChange |= runOnLoop(*L);
return MadeChange;
}
-bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
+bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
// Only prefetch in the inner-most loop
@@ -135,7 +189,7 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
I != IE; ++I) {
// If the loop already has prefetches, then assume that the user knows
- // what he or she is doing and don't add any more.
+ // what they are doing and don't add any more.
for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
J != JE; ++J)
if (CallInst *CI = dyn_cast<CallInst>(J))
@@ -149,10 +203,18 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
if (!LoopSize)
LoopSize = 1;
- unsigned ItersAhead = PrefDist/LoopSize;
+ unsigned ItersAhead = getPrefetchDistance() / LoopSize;
if (!ItersAhead)
ItersAhead = 1;
+ if (ItersAhead > getMaxPrefetchIterationsAhead())
+ return MadeChange;
+
+ Function *F = L->getHeader()->getParent();
+ DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << F->getName() << ": " << *L);
+
SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I) {
@@ -182,18 +244,21 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
if (!LSCEVAddRec)
continue;
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(LSCEVAddRec))
+ continue;
+
// We don't want to double prefetch individual cache lines. If this load
// is known to be within one cache line of some other load that has
// already been prefetched, then don't prefetch this one as well.
bool DupPref = false;
- for (SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>,
- 16>::iterator K = PrefLoads.begin(), KE = PrefLoads.end();
- K != KE; ++K) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, K->second);
+ for (const auto &PrefLoad : PrefLoads) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
- if (PD < (int64_t) CacheLineSize) {
+ if (PD < (int64_t) TTI->getCacheLineSize()) {
DupPref = true;
break;
}
@@ -223,6 +288,12 @@ bool PPCLoopDataPrefetch::runOnLoop(Loop *L) {
{PrefPtrValue,
ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F,
+ MemI->getDebugLoc(), "prefetched memory access");
+
MadeChange = true;
}
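
The pass is now driven entirely by TargetTransformInfo: it bails out early unless getPrefetchDistance() is non-zero, and the cl::opt flags above only override the TTI values when given on the command line. A hedged sketch of the numbers involved, using a hypothetical stand-in for a target's TTI hooks (real targets return these values from their <Target>TTIImpl):

    // Hypothetical prefetch parameters a subtarget might advertise.
    struct XYZPrefetchParams {
      unsigned getCacheLineSize() const { return 64; }
      unsigned getPrefetchDistance() const { return 300; }
      unsigned getMinPrefetchStride() const { return 1; }
      unsigned getMaxPrefetchIterationsAhead() const { return 8; }
    };

With those values, a loop whose body costs roughly 40 instructions is prefetched ItersAhead = 300 / 40 = 7 iterations ahead (clamped to at least 1), and the loop is skipped entirely if ItersAhead exceeds the 8-iteration cap.
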
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 7b1940b48c31..19b2f89555c2 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -14,75 +14,28 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-delete"
STATISTIC(NumDeleted, "Number of loops deleted");
-namespace {
- class LoopDeletion : public LoopPass {
- public:
- static char ID; // Pass ID, replacement for typeid
- LoopDeletion() : LoopPass(ID) {
- initializeLoopDeletionPass(*PassRegistry::getPassRegistry());
- }
-
- // Possibly eliminate loop L if it is dead.
- bool runOnLoop(Loop *L, LPPassManager &) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
-
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreservedID(LoopSimplifyID);
- AU.addPreservedID(LCSSAID);
- }
-
- private:
- bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader);
-
- };
-}
-
-char LoopDeletion::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
- "Delete dead loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
- "Delete dead loops", false, false)
-
-Pass *llvm::createLoopDeletionPass() {
- return new LoopDeletion();
-}
-
/// isLoopDead - Determines if a loop is dead. This assumes that we've already
/// checked for unique exit and exiting blocks, and that the code is in LCSSA
/// form.
-bool LoopDeletion::isLoopDead(Loop *L,
- SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader) {
+bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &exitingBlocks,
+ SmallVectorImpl<BasicBlock *> &exitBlocks,
+ bool &Changed, BasicBlock *Preheader) {
BasicBlock *exitBlock = exitBlocks[0];
// Make sure that all PHI entries coming from the loop are loop invariant.
@@ -91,6 +44,8 @@ bool LoopDeletion::isLoopDead(Loop *L,
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
BasicBlock::iterator BI = exitBlock->begin();
+ bool AllEntriesInvariant = true;
+ bool AllOutgoingValuesSame = true;
while (PHINode *P = dyn_cast<PHINode>(BI)) {
Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
@@ -98,27 +53,37 @@ bool LoopDeletion::isLoopDead(Loop *L,
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
- for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
- if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
- return false;
- }
+ AllOutgoingValuesSame =
+ all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) {
+ return incoming == P->getIncomingValueForBlock(BB);
+ });
+
+ if (!AllOutgoingValuesSame)
+ break;
if (Instruction *I = dyn_cast<Instruction>(incoming))
- if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
- return false;
+ if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
+ AllEntriesInvariant = false;
+ break;
+ }
++BI;
}
+ if (Changed)
+ SE.forgetLoopDispositions(L);
+
+ if (!AllEntriesInvariant || !AllOutgoingValuesSame)
+ return false;
+
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
// marked volatile. This could be made more aggressive by using aliasing
// information to identify readonly and readnone calls.
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
- for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
- BI != BE; ++BI) {
- if (BI->mayHaveSideEffects())
+ for (Instruction &I : **LI) {
+ if (I.mayHaveSideEffects())
return false;
}
}
@@ -126,15 +91,15 @@ bool LoopDeletion::isLoopDead(Loop *L,
return true;
}
-/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
-/// observable behavior of the program other than finite running time. Note
-/// we do ensure that this never remove a loop that might be infinite, as doing
-/// so could change the halting/non-halting nature of a program.
-/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
-/// in order to make various safety checks work.
-bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
- if (skipOptnoneFunction(L))
- return false;
+/// Remove dead loops, by which we mean loops that do not impact the observable
+/// behavior of the program other than finite running time. Note we do ensure
+/// that this never removes a loop that might be infinite, as doing so could
+/// change the halting/non-halting nature of a program. NOTE: This entire
+/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
+/// various safety checks work.
+bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &loopInfo) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
// We can only remove the loop if there is a preheader that we can
// branch from after removing it.
@@ -151,10 +116,10 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
if (L->begin() != L->end())
return false;
- SmallVector<BasicBlock*, 4> exitingBlocks;
+ SmallVector<BasicBlock *, 4> exitingBlocks;
L->getExitingBlocks(exitingBlocks);
- SmallVector<BasicBlock*, 4> exitBlocks;
+ SmallVector<BasicBlock *, 4> exitBlocks;
L->getUniqueExitBlocks(exitBlocks);
// We require that the loop only have a single exit block. Otherwise, we'd
@@ -166,12 +131,11 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
return Changed;
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S))
return Changed;
@@ -208,16 +172,14 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
// Update the dominator tree and remove the instructions and blocks that will
// be deleted from the reference counting scheme.
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
// Move all of the block's children to be children of the preheader, which
// allows us to remove the domtree entry for the block.
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
- for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(),
- DE = ChildNodes.end(); DI != DE; ++DI) {
- DT.changeImmediateDominator(*DI, DT[preheader]);
+ for (DomTreeNode *ChildNode : ChildNodes) {
+ DT.changeImmediateDominator(ChildNode, DT[preheader]);
}
ChildNodes.clear();
@@ -238,8 +200,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
// Finally, the blocks from loopinfo. This has to happen late because
// otherwise our loop iterators won't work.
- LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SmallPtrSet<BasicBlock*, 8> blocks;
+
+ SmallPtrSet<BasicBlock *, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
loopInfo.removeBlock(BB);
@@ -252,3 +214,56 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
return Changed;
}
+
+PreservedAnalyses LoopDeletionPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto &DT = *FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ auto &SE = *FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+ auto &LI = *FAM.getCachedResult<LoopAnalysis>(*F);
+
+ bool Changed = runImpl(&L, DT, SE, LI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopDeletionLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletionLegacyPass() : LoopPass(ID) {
+ initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopDeletionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
+
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ LoopDeletionPass Impl;
+ return Impl.runImpl(L, DT, SE, loopInfo);
+}
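
Neither the legacy wrapper nor the new-PM pass changes what qualifies as a dead loop: the body must be free of side effects, every value live out of the single exit block must be loop-invariant, and the maximum backedge-taken count must be computable so a potentially infinite loop is never removed. A hypothetical C input that, after mem2reg and loop canonicalization, passes all of those checks:

    // Nothing observable escapes the loop and the trip count is known, so
    // deleting it only removes finite running time.
    int waste_time(int start) {
      for (int i = 0; i < 1000; ++i) {
        int tmp = start + i; // dead after the loop; never stored or returned
        (void)tmp;
      }
      return start;
    }
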
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 3d3cf3e2890b..7eca28ed2bb7 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -22,12 +22,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPassManager.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -60,6 +65,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed for Loop "
"Distribution"));
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+// Note that the initial value for this depends on whether the pass is invoked
+// directly or from the optimization pipeline.
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"));
+
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
@@ -170,7 +188,7 @@ public:
// Delete the instructions backwards, as it has a reduced likelihood of
// having to update as many def-use and use-def chains.
- for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
+ for (auto *Inst : reverse(Unused)) {
if (!Inst->use_empty())
Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
Inst->eraseFromParent();
@@ -571,121 +589,39 @@ private:
AccessesType Accesses;
};
-/// \brief The pass class.
-class LoopDistribute : public FunctionPass {
+/// \brief The actual class performing the per-loop work.
+class LoopDistributeForLoop {
public:
- LoopDistribute() : FunctionPass(ID) {
- initializeLoopDistributePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LAA = &getAnalysis<LoopAccessAnalysis>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist)
- Changed |= processLoop(L);
-
- // Process each loop nest in the function.
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- static char ID;
-
-private:
- /// \brief Filter out checks between pointers from the same partition.
- ///
- /// \p PtrToPartition contains the partition number for pointers. Partition
- /// number -1 means that the pointer is used in multiple partitions. In this
- /// case we can't safely omit the check.
- SmallVector<RuntimePointerChecking::PointerCheck, 4>
- includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
- const SmallVectorImpl<int> &PtrToPartition,
- const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
-
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
-
- return Checks;
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
}
/// \brief Try to distribute an inner-most loop.
- bool processLoop(Loop *L) {
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->empty() && "Only process inner loops.");
DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
<< "\" checking " << *L << "\n");
BasicBlock *PH = L->getLoopPreheader();
- if (!PH) {
- DEBUG(dbgs() << "Skipping; no preheader");
- return false;
- }
- if (!L->getExitBlock()) {
- DEBUG(dbgs() << "Skipping; multiple exit blocks");
- return false;
- }
- // LAA will check that we only have a single exiting block.
+ if (!PH)
+ return fail("no preheader");
+ if (!L->getExitBlock())
+ return fail("multiple exit blocks");
- const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+ // LAA will check that we only have a single exiting block.
+ LAI = &GetLAA(*L);
// Currently, we only distribute to isolate the part of the loop with
// dependence cycles to enable partial vectorization.
- if (LAI.canVectorizeMemory()) {
- DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
- return false;
- }
- auto *Dependences = LAI.getDepChecker().getDependences();
- if (!Dependences || Dependences->empty()) {
- DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
- return false;
- }
+ if (LAI->canVectorizeMemory())
+ return fail("memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("no unsafe dependences to isolate");
InstPartitionContainer Partitions(L, LI, DT);
@@ -708,7 +644,7 @@ private:
// NumUnsafeDependencesActive > 0 indicates this situation and in this case
// we just keep assigning to the same cyclic partition until
// NumUnsafeDependencesActive reaches 0.
- const MemoryDepChecker &DepChecker = LAI.getDepChecker();
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
*Dependences);
@@ -738,14 +674,14 @@ private:
DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
@@ -757,15 +693,15 @@ private:
DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
<< Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
}
// Don't distribute the loop if we need too many SCEV run-time checks.
- const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
- if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
- DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
- return false;
- }
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("too many SCEV run-time checks needed.\n");
DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions set up the reverse mapping from
@@ -779,19 +715,20 @@ private:
SplitBlock(PH, PH->getTerminator(), DT, LI);
// If we need run-time checks, version the loop now.
- auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
- const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
const auto &AllChecks = RtPtrChecking->getChecks();
auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
RtPtrChecking);
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
- LoopVersioning LVer(LAI, L, LI, DT, SE, false);
+ DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
LVer.setAliasChecks(std::move(Checks));
- LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
+ LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
}
// Create identical copies of the original loop for each partition and hook
@@ -810,27 +747,244 @@ private:
}
++NumLoopsDistributed;
+ // Report the success.
+ emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
+ "distributed loop");
return true;
}
+ /// \brief Provide failure diagnostics, then \return false.
+ bool fail(llvm::StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With -Rpass-missed, report that distribution failed.
+ ORE->emitOptimizationRemarkMissed(
+ LDIST_NAME, L,
+ "loop not distributed: use -Rpass-analysis=loop-distribute for more "
+ "info");
+
+ // With -Rpass-analysis, report why. This is on by default if distribution
+ // was requested explicitly.
+ emitOptimizationRemarkAnalysis(
+ Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
+ : LDIST_NAME,
+ *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// \brief Return whether distribution was forced to be enabled/disabled for the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// \brief Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4>
+ includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// \brief Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
// Analyses used.
LoopInfo *LI;
- LoopAccessAnalysis *LAA;
+ const LoopAccessInfo *LAI;
DominatorTree *DT;
ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA,
+ bool ProcessAllLoops) {
+ // Build up a worklist of inner-loops to vectorize. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(ProcessAllLoops))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+/// \brief The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
+ /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
+ /// default. We use this to keep LoopDistribution off by default when invoked
+ /// from the optimization pipeline but on when invoked explicitly from opt.
+ LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true)
+ : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
+ // The default is set by the caller.
+ if (EnableLoopDistribute.getNumOccurrences() > 0)
+ ProcessAllLoops = EnableLoopDistribute;
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ static char ID;
+
+private:
+ /// \brief Whether distribution should be on in this function. The per-loop
+ /// pragma can override this.
+ bool ProcessAllLoops;
};
} // anonymous namespace
-char LoopDistribute::ID;
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ // FIXME: This does not currently match the behavior from the old PM.
+ // ProcessAllLoops with the old PM defaults to true when invoked from opt and
+ // false when invoked from the optimization pipeline.
+ bool ProcessAllLoops = false;
+ if (EnableLoopDistribute.getNumOccurrences() > 0)
+ ProcessAllLoops = EnableLoopDistribute;
+
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ return LAM.getResult<LoopAccessAnalysis>(L);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
static const char ldist_name[] = "Loop Distribution";
-INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
namespace llvm {
-FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
+ return new LoopDistributeLegacy(ProcessAllLoopsByDefault);
+}
}
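
Because distribution stays off by default in the pipeline, the per-loop metadata is the intended opt-in. A hypothetical source-level example (the clang pragma spelling is assumed here; the pass itself only inspects the llvm.loop.distribute.enable metadata via setForced()):

    void f(int *A, int *B, int *C, int *D, int *E, int n) {
    #pragma clang loop distribute(enable)
      for (int i = 0; i < n; i++) {
        A[i + 1] = A[i] + B[i]; // carries a dependence cycle
        C[i] = D[i] * E[i];     // independent; can be split into its own loop
      }
    }

If a loop forced this way cannot be distributed, fail() above emits a missed-optimization remark, an analysis remark with the reason, and a warning-level diagnostic.
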
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 4521640e3947..1468676a3543 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -26,22 +26,21 @@
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
-// We should enhance the memset/memcpy recognition to handle multiple stores in
-// the loop. This would handle things like:
-// void foo(_Complex float *P)
-// for (i) { __real__(*P) = 0; __imag__(*P) = 0; }
-//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -55,7 +54,10 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-idiom"
@@ -65,7 +67,7 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
namespace {
-class LoopIdiomRecognize : public LoopPass {
+class LoopIdiomRecognize {
Loop *CurLoop;
AliasAnalysis *AA;
DominatorTree *DT;
@@ -76,39 +78,21 @@ class LoopIdiomRecognize : public LoopPass {
const DataLayout *DL;
public:
- static char ID;
- explicit LoopIdiomRecognize() : LoopPass(ID) {
- initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
- }
+ explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI),
+ DL(DL) {}
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
+ bool runOnLoop(Loop *L);
private:
typedef SmallVector<StoreInst *, 8> StoreList;
- StoreList StoreRefsForMemset;
+ typedef MapVector<Value *, StoreList> StoreListMap;
+ StoreListMap StoreRefsForMemset;
+ StoreListMap StoreRefsForMemsetPattern;
StoreList StoreRefsForMemcpy;
bool HasMemset;
bool HasMemsetPattern;
@@ -122,14 +106,18 @@ private:
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
- bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy);
- bool processLoopStore(StoreInst *SI, const SCEV *BECount);
+ bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern,
+ bool &ForMemcpy);
+ bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
+ bool ForMemset);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
unsigned StoreAlignment, Value *StoredVal,
- Instruction *TheStore, const SCEVAddRecExpr *Ev,
- const SCEV *BECount, bool NegStride);
+ Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ const SCEVAddRecExpr *Ev, const SCEV *BECount,
+ bool NegStride);
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
/// @}
@@ -145,38 +133,82 @@ private:
/// @}
};
+class LoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+ explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+ return LIR.runOnLoop(L);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
} // End anonymous namespace.
-char LoopIdiomRecognize::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L,
+ AnalysisManager<Loop> &AM) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ // Use getCachedResult because a loop pass cannot trigger a function analysis.
+ auto *AA = FAM.getCachedResult<AAManager>(*F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+ auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
+ const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+ assert((AA && DT && LI && SE && TLI && TTI && DL) &&
+ "Analyses for Loop Idiom Recognition not available");
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+ if (!LIR.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
- false, false)
+INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
-Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
-/// deleteDeadInstruction - Delete this instruction. Before we do, go through
-/// and zero out all the operands of this instruction. If any of them become
-/// dead, delete them and the computation tree that feeds them.
-///
-static void deleteDeadInstruction(Instruction *I,
- const TargetLibraryInfo *TLI) {
- SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end());
+static void deleteDeadInstruction(Instruction *I) {
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
- for (Value *Op : Operands)
- RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
}
//===----------------------------------------------------------------------===//
@@ -185,10 +217,7 @@ static void deleteDeadInstruction(Instruction *I,
//
//===----------------------------------------------------------------------===//
-bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipOptnoneFunction(L))
- return false;
-
+bool LoopIdiomRecognize::runOnLoop(Loop *L) {
CurLoop = L;
// If the loop could not be converted to canonical form, it must have an
// indirectbr in it, just give up.
@@ -200,15 +229,6 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
if (Name == "memset" || Name == "memcpy")
return false;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *CurLoop->getHeader()->getParent());
- DL = &CurLoop->getHeader()->getModule()->getDataLayout();
-
HasMemset = TLI->has(LibFunc::memset);
HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
HasMemcpy = TLI->has(LibFunc::memcpy);
@@ -240,6 +260,14 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
<< CurLoop->getHeader()->getName() << "\n");
bool MadeChange = false;
+
+ // The following transforms hoist stores/memsets into the loop pre-header.
+ // Give up if the loop has instructions that may throw.
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, CurLoop);
+ if (SafetyInfo.MayThrow)
+ return MadeChange;
+
// Scan all the blocks in the loop that are not in subloops.
for (auto *BB : CurLoop->getBlocks()) {
// Ignore blocks in subloops.
@@ -258,9 +286,9 @@ static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
return (unsigned)SizeInBits >> 3;
}
-static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) {
+static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
- return ConstStride->getAPInt().getZExtValue();
+ return ConstStride->getAPInt();
}
/// getMemSetPatternValue - If a strided store of the specified value is safe to
@@ -305,11 +333,15 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
}
bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
- bool &ForMemcpy) {
+ bool &ForMemsetPattern, bool &ForMemcpy) {
// Don't touch volatile stores.
if (!SI->isSimple())
return false;
+ // Avoid merging nontemporal stores.
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
@@ -353,7 +385,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
// It looks like we can use PatternValue!
- ForMemset = true;
+ ForMemsetPattern = true;
return true;
}
@@ -361,7 +393,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
if (HasMemcpy) {
// Check to see if the stride matches the size of the store. If so, then we
// know that every byte is touched in the loop.
- unsigned Stride = getStoreStride(StoreEv);
+ APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = getStoreSizeInBytes(SI, DL);
if (StoreSize != Stride && StoreSize != -Stride)
return false;
@@ -393,6 +425,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
StoreRefsForMemset.clear();
+ StoreRefsForMemsetPattern.clear();
StoreRefsForMemcpy.clear();
for (Instruction &I : *BB) {
StoreInst *SI = dyn_cast<StoreInst>(&I);
@@ -400,15 +433,22 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
continue;
bool ForMemset = false;
+ bool ForMemsetPattern = false;
bool ForMemcpy = false;
// Make sure this is a strided store with a constant stride.
- if (!isLegalStore(SI, ForMemset, ForMemcpy))
+ if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy))
continue;
// Save the store locations.
- if (ForMemset)
- StoreRefsForMemset.push_back(SI);
- else if (ForMemcpy)
+ if (ForMemset) {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemset[Ptr].push_back(SI);
+ } else if (ForMemsetPattern) {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemsetPattern[Ptr].push_back(SI);
+ } else if (ForMemcpy)
StoreRefsForMemcpy.push_back(SI);
}
}
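
A hypothetical input the new grouping targets: collectStores() now buckets candidate stores by their GetUnderlyingObject() base so that the pairing logic further down can merge several adjacent per-iteration stores into one call.

    struct Pair { int a, b; };
    // Both fields are zeroed each iteration; the two stores share base P, are
    // consecutive, and together touch every byte, so they can form one memset.
    void zero_pairs(struct Pair *P, int n) {
      for (int i = 0; i < n; i++) {
        P[i].a = 0;
        P[i].b = 0;
      }
    }
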
@@ -430,9 +470,14 @@ bool LoopIdiomRecognize::runOnLoopBlock(
// Look for store instructions, which may be optimized to memset/memcpy.
collectStores(BB);
- // Look for a single store which can be optimized into a memset.
- for (auto &SI : StoreRefsForMemset)
- MadeChange |= processLoopStore(SI, BECount);
+ // Look for a single store or sets of stores with a common base, which can be
+ // optimized into a memset (memset_pattern). The latter most commonly happens
+ // with structs and hand-unrolled loops.
+ for (auto &SL : StoreRefsForMemset)
+ MadeChange |= processLoopStores(SL.second, BECount, true);
+
+ for (auto &SL : StoreRefsForMemsetPattern)
+ MadeChange |= processLoopStores(SL.second, BECount, false);
// Optimize the store into a memcpy, if it feeds a similarly strided load.
for (auto &SI : StoreRefsForMemcpy)
@@ -458,26 +503,144 @@ bool LoopIdiomRecognize::runOnLoopBlock(
return MadeChange;
}
-/// processLoopStore - See if this store can be promoted to a memset.
-bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
- assert(SI->isSimple() && "Expected only non-volatile stores.");
+/// processLoopStores - See if the given store(s) can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
+ const SCEV *BECount,
+ bool ForMemset) {
+ // Try to find consecutive stores that can be transformed into memsets.
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = SL.size(); i < e; ++i) {
+ assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
+
+ Value *FirstStoredVal = SL[i]->getValueOperand();
+ Value *FirstStorePtr = SL[i]->getPointerOperand();
+ const SCEVAddRecExpr *FirstStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
+ APInt FirstStride = getStoreStride(FirstStoreEv);
+ unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);
+
+ // See if we can optimize just this store in isolation.
+ if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
+ Heads.insert(SL[i]);
+ continue;
+ }
- Value *StoredVal = SI->getValueOperand();
- Value *StorePtr = SI->getPointerOperand();
+ Value *FirstSplatValue = nullptr;
+ Constant *FirstPatternValue = nullptr;
- // Check to see if the stride matches the size of the store. If so, then we
- // know that every byte is touched in the loop.
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- unsigned Stride = getStoreStride(StoreEv);
- unsigned StoreSize = getStoreSizeInBytes(SI, DL);
- if (StoreSize != Stride && StoreSize != -Stride)
- return false;
+ if (ForMemset)
+ FirstSplatValue = isBytewiseValue(FirstStoredVal);
+ else
+ FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
+
+ assert((FirstSplatValue || FirstPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ IndexQueue.clear();
+    // If a store has multiple consecutive store candidates, search the Stores
+    // array according to the sequence: from i+1 to e, then from i-1 to 0.
+    // This is because pairing with the immediately succeeding or preceding
+    // candidate usually creates the best chance to find a memset opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
+ Value *SecondStorePtr = SL[k]->getPointerOperand();
+ const SCEVAddRecExpr *SecondStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
+ APInt SecondStride = getStoreStride(SecondStoreEv);
+
+ if (FirstStride != SecondStride)
+ continue;
- bool NegStride = StoreSize == -Stride;
+ Value *SecondStoredVal = SL[k]->getValueOperand();
+ Value *SecondSplatValue = nullptr;
+ Constant *SecondPatternValue = nullptr;
+
+ if (ForMemset)
+ SecondSplatValue = isBytewiseValue(SecondStoredVal);
+ else
+ SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
+
+ assert((SecondSplatValue || SecondPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+ if (ForMemset) {
+ if (FirstSplatValue != SecondSplatValue)
+ continue;
+ } else {
+ if (FirstPatternValue != SecondPatternValue)
+ continue;
+ }
+ Tails.insert(SL[k]);
+ Heads.insert(SL[i]);
+ ConsecutiveChain[SL[i]] = SL[k];
+ break;
+ }
+ }
+ }
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we transformed so that we don't visit the same store twice.
+ SmallPtrSet<Value *, 16> TransformedStores;
+ bool Changed = false;
+
+  // For each store that starts a chain without also continuing one:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to transform it.
+ SmallPtrSet<Instruction *, 8> AdjacentStores;
+ StoreInst *I = *it;
+
+ StoreInst *HeadStore = I;
+ unsigned StoreSize = 0;
+
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (TransformedStores.count(I))
+ break;
+ AdjacentStores.insert(I);
+
+ StoreSize += getStoreSizeInBytes(I, DL);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
- // See if we can optimize just this store in isolation.
- return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
- StoredVal, SI, StoreEv, BECount, NegStride);
+ Value *StoredVal = HeadStore->getValueOperand();
+ Value *StorePtr = HeadStore->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+
+ // Check to see if the stride matches the size of the stores. If so, then
+ // we know that every byte is touched in the loop.
+ if (StoreSize != Stride && StoreSize != -Stride)
+ continue;
+
+ bool NegStride = StoreSize == -Stride;
+
+ if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
+ StoredVal, HeadStore, AdjacentStores, StoreEv,
+ BECount, NegStride)) {
+ TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
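// To make the head/tail bookkeeping above concrete, consider a hand-unrolled
// loop of the following shape (the values and the unroll factor of four are
// assumed for illustration):
//
//   for (unsigned i = 0; i != N; ++i) {
//     p[4 * i + 0] = 0;
//     p[4 * i + 1] = 0;
//     p[4 * i + 2] = 0;
//     p[4 * i + 3] = 0;
//   }
//
// With 4-byte stores and a 16-byte per-iteration stride, the quadratic search
// links store 0 -> 1 -> 2 -> 3 in ConsecutiveChain, inserts stores 0..2 into
// Heads and stores 1..3 into Tails. The only head that is not also a tail is
// store 0; walking its chain accumulates StoreSize == 16, which matches the
// stride, so the whole group is handed to processLoopStridedStore at once.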
/// processLoopMemSet - See if this memset can be promoted to a large memset.
@@ -488,7 +651,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
return false;
// If we're not allowed to hack on memset, we fail.
- if (!TLI->has(LibFunc::memset))
+ if (!HasMemset)
return false;
Value *Pointer = MSI->getDest();
@@ -507,11 +670,12 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
// Check to see if the stride matches the size of the memset. If so, then we
// know that every byte is touched in the loop.
- const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
- // TODO: Could also handle negative stride here someday, that will require the
- // validity check in mayLoopAccessLocation to be updated though.
- if (!Stride || MSI->getLength() != Stride->getValue())
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
return false;
// Verify that the memset value is loop invariant. If not, we can't promote
@@ -520,18 +684,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
return false;
+ SmallPtrSet<Instruction *, 1> MSIs;
+ MSIs.insert(MSI);
+ bool NegStride = SizeInBytes == -Stride;
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
- MSI->getAlignment(), SplatValue, MSI, Ev,
- BECount, /*NegStride=*/false);
+ MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
+ BECount, NegStride);
}
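// A sketch of a loop that the new negative-stride handling admits; the buffer
// name and element type are assumptions for illustration:
//
//   int buf[256];
//   for (int i = 255; i >= 0; --i)
//     memset(&buf[i], 0, sizeof(int));
//
// Each memset writes sizeof(int) bytes while the destination AddRec steps by
// -sizeof(int), so SizeInBytes == -Stride holds, NegStride is set, and
// getStartForNegStride rebases the start so that one large memset covering
// the whole range can be emitted in the preheader.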
/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
-static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
- const SCEV *BECount, unsigned StoreSize,
- AliasAnalysis &AA,
- Instruction *IgnoredStore) {
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &IgnoredStores) {
// Get the location that may be stored across the loop. Since the access is
// strided positively through memory, we say that the modified location starts
// at the pointer and has infinite size.
@@ -550,8 +718,9 @@ static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
++BI)
- for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
- if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access))
+ for (Instruction &I : **BI)
+ if (IgnoredStores.count(&I) == 0 &&
+ (AA.getModRefInfo(&I, StoreLoc) & Access))
return true;
return false;
@@ -574,7 +743,8 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
- Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
+ Value *StoredVal, Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool NegStride) {
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
@@ -609,7 +779,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *BasePtr =
Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
- *AA, TheStore)) {
+ *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
@@ -644,13 +814,14 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *MSP =
M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+ inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
  // a constant array of 16 bytes. Plop the value into a mergeable global.
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
- GV->setUnnamedAddr(true); // Ok to merge these.
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
GV->setAlignment(16);
Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
@@ -662,7 +833,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(TheStore, TLI);
+ for (auto *I : Stores)
+ deleteDeadInstruction(I);
++NumMemSet;
return true;
}
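// For reference, memset_pattern16 is the Darwin libc helper whose C
// declaration is shown below; the call built above passes the expanded base
// pointer, a pointer to the 16-byte pattern global, and the byte count:
//
//   void memset_pattern16(void *b, const void *pattern16, size_t len);
//
// Its availability is presumably gated by TargetLibraryInfo, in the same
// spirit as the HasMemset check in processLoopMemSet above.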
@@ -676,7 +848,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- unsigned Stride = getStoreStride(StoreEv);
+ APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = getStoreSizeInBytes(SI, DL);
bool NegStride = StoreSize == -Stride;
@@ -714,8 +886,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *StoreBasePtr = Expander.expandCodeFor(
StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+ SmallPtrSet<Instruction *, 1> Stores;
+ Stores.insert(SI);
if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
- StoreSize, *AA, SI)) {
+ StoreSize, *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
@@ -735,7 +909,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
- *AA, SI)) {
+ *AA, Stores)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
@@ -769,7 +943,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(SI, TLI);
+ deleteDeadInstruction(SI);
++NumMemCpy;
return true;
}
@@ -993,7 +1167,7 @@ bool LoopIdiomRecognize::recognizePopcount() {
}
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
- DebugLoc DL) {
+ const DebugLoc &DL) {
Value *Ops[] = {Val};
Type *Tys[] = {Val->getType()};
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index b4102fe9ba34..629cb87d7a91 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -11,88 +11,43 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loop-instsimplify"
STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-namespace {
- class LoopInstSimplify : public LoopPass {
- public:
- static char ID; // Pass ID, replacement for typeid
- LoopInstSimplify() : LoopPass(ID) {
- initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop*, LPPassManager&) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addPreservedID(LCSSAID);
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
- };
-}
-
-char LoopInstSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-
-Pass *llvm::createLoopInstSimplifyPass() {
- return new LoopInstSimplify();
-}
-
-bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipOptnoneFunction(L))
- return false;
-
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
+static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ AssumptionCache *AC,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<BasicBlock *, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
- SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
// The bit we are stealing from the pointer represents whether this basic
// block is the header of a subloop, in which case we only process its phis.
- typedef PointerIntPair<BasicBlock*, 1> WorklistItem;
+ typedef PointerIntPair<BasicBlock *, 1> WorklistItem;
SmallVector<WorklistItem, 16> VisitStack;
- SmallPtrSet<BasicBlock*, 32> Visited;
+ SmallPtrSet<BasicBlock *, 32> Visited;
bool Changed = false;
bool LocalChanged;
@@ -122,7 +77,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// Don't bother simplifying unused instructions.
if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
+ Value *V = SimplifyInstruction(I, DL, TLI, DT, AC);
if (V && LI->replacementPreservesLCSSAForm(I, V)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
@@ -133,14 +88,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
++NumSimplified;
}
}
- bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
- if (res) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove
- // more than one instruction, so simply incrementing the
- // iterator does not work. When instructions get deleted
- // re-iterate instead.
- BI = BB->begin(); BE = BB->end();
- LocalChanged |= res;
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
+ // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ LocalChanged = true;
}
if (IsSubloopHeader && !isa<PHINode>(I))
@@ -148,8 +102,10 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
}
// Add all successors to the worklist, except for loop exit blocks and the
- // bodies of subloops. We visit the headers of loops so that we can process
- // their phis, but we contract the rest of the subloop body and only follow
+  // bodies of subloops. We visit the headers of loops so that we can process
+  // their phis, but we contract the rest of the subloop body and only follow
// edges leading back to the original loop.
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
++SI) {
@@ -158,11 +114,11 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
continue;
const Loop *SuccLoop = LI->getLoopFor(SuccBB);
- if (SuccLoop && SuccLoop->getHeader() == SuccBB
- && L->contains(SuccLoop)) {
+ if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
+ L->contains(SuccLoop)) {
VisitStack.push_back(WorklistItem(SuccBB, true));
- SmallVector<BasicBlock*, 8> SubLoopExitBlocks;
+ SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
SuccLoop->getExitBlocks(SubLoopExitBlocks);
for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
@@ -174,8 +130,8 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
continue;
}
- bool IsExitBlock = std::binary_search(ExitBlocks.begin(),
- ExitBlocks.end(), SuccBB);
+ bool IsExitBlock =
+ std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
if (IsExitBlock)
continue;
@@ -193,3 +149,68 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
+
+namespace {
+class LoopInstSimplifyLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopInstSimplifyLegacyPass() : LoopPass(ID) {
+ initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L,
+ AnalysisManager<Loop> &AM) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ // Use getCachedResult because Loop pass cannot trigger a function analysis.
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
+ const auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
+ assert((LI && AC && TLI) && "Analyses for Loop Inst Simplify not available");
+
+ if (!SimplifyLoopInst(&L, DT, LI, AC, TLI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopInstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+ return new LoopInstSimplifyLegacyPass();
+}
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 4295235a3f36..9241ec365277 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -15,7 +15,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
@@ -72,7 +71,7 @@ void printDepMatrix(CharMatrix &DepMatrix) {
#endif
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
- Loop *L, DependenceAnalysis *DA) {
+ Loop *L, DependenceInfo *DI) {
typedef SmallVector<Value *, 16> ValueVector;
ValueVector MemInstr;
@@ -117,7 +116,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
continue;
if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
continue;
- if (auto D = DA->depends(Src, Des, true)) {
+ if (auto D = DI->depends(Src, Des, true)) {
DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
<< "\n");
if (D->isFlow()) {
@@ -404,12 +403,9 @@ public:
private:
void splitInnerLoopLatch(Instruction *);
- void splitOuterLoopLatch();
void splitInnerLoopHeader();
bool adjustLoopLinks();
void adjustLoopPreheaders();
- void adjustOuterLoopPreheader();
- void adjustInnerLoopPreheader();
bool adjustLoopBranches();
void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
BasicBlock *NewPred);
@@ -430,11 +426,11 @@ struct LoopInterchange : public FunctionPass {
static char ID;
ScalarEvolution *SE;
LoopInfo *LI;
- DependenceAnalysis *DA;
+ DependenceInfo *DI;
DominatorTree *DT;
bool PreserveLCSSA;
LoopInterchange()
- : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {
+ : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
}
@@ -443,15 +439,18 @@ struct LoopInterchange : public FunctionPass {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DependenceAnalysis>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
}
bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DA = &getAnalysis<DependenceAnalysis>();
+ DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
@@ -472,8 +471,7 @@ struct LoopInterchange : public FunctionPass {
}
bool isComputableLoopNest(LoopVector LoopList) {
- for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
- Loop *L = *I;
+ for (Loop *L : LoopList) {
const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (ExitCountOuter == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "Couldn't compute Backedge count\n");
@@ -491,7 +489,7 @@ struct LoopInterchange : public FunctionPass {
return true;
}
- unsigned selectLoopForInterchange(LoopVector LoopList) {
+ unsigned selectLoopForInterchange(const LoopVector &LoopList) {
// TODO: Add a better heuristic to select the loop to be interchanged based
// on the dependence matrix. Currently we select the innermost loop.
return LoopList.size() - 1;
@@ -515,7 +513,7 @@ struct LoopInterchange : public FunctionPass {
<< "\n");
if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
- OuterMostLoop, DA)) {
+ OuterMostLoop, DI)) {
DEBUG(dbgs() << "Populating Dependency matrix failed\n");
return false;
}
@@ -813,7 +811,6 @@ bool LoopInterchangeLegality::currentLimitations() {
// A[j+1][i+2] = A[j][i]+k;
// }
// }
- bool FoundInduction = false;
Instruction *InnerIndexVarInc = nullptr;
if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
InnerIndexVarInc =
@@ -829,17 +826,17 @@ bool LoopInterchangeLegality::currentLimitations() {
// we do not have any instruction between the induction variable and branch
// instruction.
- for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
- I != E && !FoundInduction; ++I) {
- if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+ bool FoundInduction = false;
+ for (const Instruction &I : reverse(*InnerLoopLatch)) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
continue;
- const Instruction &Ins = *I;
  // We found an instruction. If this is not the induction variable then it is not
// safe to split this loop latch.
- if (!Ins.isIdenticalTo(InnerIndexVarInc))
+ if (!I.isIdenticalTo(InnerIndexVarInc))
return true;
- else
- FoundInduction = true;
+
+ FoundInduction = true;
+ break;
}
  // The loop latch ended and we didn't find the induction variable; return as
  // a current limitation.
@@ -903,8 +900,7 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
BadOrder = GoodOrder = 0;
for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
BI != BE; ++BI) {
- for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
- const Instruction &Ins = *I;
+ for (Instruction &Ins : **BI) {
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
unsigned NumOp = GEP->getNumOperands();
bool FoundInnerInduction = false;
@@ -1073,13 +1069,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
}
-void LoopInterchangeTransform::splitOuterLoopLatch() {
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch;
- OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock,
- OuterLoopLatch->getFirstNonPHI(), DT, LI);
-}
-
void LoopInterchangeTransform::splitInnerLoopHeader() {
// Split the inner loop header out. Here make sure that the reduction PHI's
@@ -1102,8 +1091,7 @@ void LoopInterchangeTransform::splitInnerLoopHeader() {
PHI->replaceAllUsesWith(V);
PHIVec.push_back((PHI));
}
- for (auto I = PHIVec.begin(), E = PHIVec.end(); I != E; ++I) {
- PHINode *P = *I;
+ for (PHINode *P : PHIVec) {
P->eraseFromParent();
}
} else {
@@ -1124,20 +1112,6 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
FromBB->getTerminator()->getIterator());
}
-void LoopInterchangeTransform::adjustOuterLoopPreheader() {
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
-
- moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator());
-}
-
-void LoopInterchangeTransform::adjustInnerLoopPreheader() {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *OuterHeader = OuterLoop->getHeader();
-
- moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator());
-}
-
void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
BasicBlock *OldPred,
BasicBlock *NewPred) {
@@ -1234,8 +1208,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
PHINode *LcssaPhi = cast<PHINode>(I);
LcssaVec.push_back(LcssaPhi);
}
- for (auto I = LcssaVec.begin(), E = LcssaVec.end(); I != E; ++I) {
- PHINode *P = *I;
+ for (PHINode *P : LcssaVec) {
Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
P->replaceAllUsesWith(Incoming);
P->eraseFromParent();
@@ -1294,11 +1267,11 @@ char LoopInterchange::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 1064d088514d..f29228c7659e 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <forward_list>
@@ -61,7 +62,8 @@ struct StoreToLoadForwardingCandidate {
/// \brief Return true if the dependence from the store to the load has a
/// distance of one. E.g. A[i+1] = A[i]
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const {
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
+ Loop *L) const {
Value *LoadPtr = Load->getPointerOperand();
Value *StorePtr = Store->getPointerOperand();
Type *LoadPtrType = LoadPtr->getType();
@@ -72,6 +74,13 @@ struct StoreToLoadForwardingCandidate {
LoadType == StorePtr->getType()->getPointerElementType() &&
"Should be a known dependence");
+ // Currently we only support accesses with unit stride. FIXME: we should be
+  // able to handle non-unit strides as well, as long as the stride is equal to
+ // the dependence distance.
+ if (getPtrStride(PSE, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, StorePtr, L) != 1)
+ return false;
+
auto &DL = Load->getParent()->getModule()->getDataLayout();
unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
@@ -83,7 +92,7 @@ struct StoreToLoadForwardingCandidate {
auto *Dist = cast<SCEVConstant>(
PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
const APInt &Val = Dist->getAPInt();
- return Val.abs() == TypeByteSize;
+ return Val == TypeByteSize;
}
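// A minimal sketch of a candidate that still qualifies under the tightened
// checks above (unit stride on both pointers, a distance of exactly plus one
// element); the array names are illustrative:
//
//   for (unsigned i = 0; i + 1 < N; ++i) {
//     A[i + 1] = B[i] + 1; // store, unit stride
//     C[i] = A[i];         // load, unit stride, one element behind the store
//   }
//
// Here StorePtrSCEV - LoadPtrSCEV is exactly TypeByteSize, so the value stored
// in iteration i can be forwarded to the load in iteration i + 1. A negative
// distance, which the old Val.abs() comparison accepted, is now rejected.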
Value *getLoadPtr() const { return Load->getPointerOperand(); }
@@ -110,12 +119,17 @@ bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
});
}
+/// \brief Return true if the load is not executed on all paths in the loop.
+static bool isLoadConditional(LoadInst *Load, Loop *L) {
+ return Load->getParent() != L->getHeader();
+}
+
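// A small sketch of the case the predicate above guards against; the names
// are illustrative:
//
//   for (unsigned i = 0; i + 1 < N; ++i) {
//     A[i + 1] = B[i];
//     if (cond[i])
//       C[i] = A[i]; // load lives outside the loop header
//   }
//
// Because the load is not in the header it may execute on only some paths, so
// hoisting its iteration-0 instance into the preheader could introduce a load
// the original program never performed.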
/// \brief The per-loop class that does most of the work.
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
- : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {}
+ : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
/// \brief Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
@@ -162,6 +176,12 @@ public:
auto *Load = dyn_cast<LoadInst>(Destination);
if (!Load)
continue;
+
+      // Only propagate the value if the pointer types match.
+ if (Store->getPointerOperand()->getType() !=
+ Load->getPointerOperand()->getType())
+ continue;
+
Candidates.emplace_front(Load, Store);
}
@@ -219,12 +239,12 @@ public:
if (OtherCand == nullptr)
continue;
- // Handle the very basic of case when the two stores are in the same
- // block so deciding which one forwards is easy. The later one forwards
- // as long as they both have a dependence distance of one to the load.
+ // Handle the very basic case when the two stores are in the same block
+ // so deciding which one forwards is easy. The later one forwards as
+ // long as they both have a dependence distance of one to the load.
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE) &&
- OtherCand->isDependenceDistanceOfOne(PSE)) {
+ Cand.isDependenceDistanceOfOne(PSE, L) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L)) {
// They are in the same block, the later one will forward to the load.
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
OtherCand = &Cand;
@@ -429,14 +449,21 @@ public:
unsigned NumForwarding = 0;
for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
DEBUG(dbgs() << "Candidate " << Cand);
+
      // Make sure that the stored value is available everywhere in the loop in
// the next iteration.
if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
continue;
+ // If the load is conditional we can't hoist its 0-iteration instance to
+ // the preheader because that would make it unconditional. Thus we would
+ // access a memory location that the original loop did not access.
+ if (isLoadConditional(Cand.Load, L))
+ continue;
+
// Check whether the SCEV difference is the same as the induction step,
// thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE))
+ if (!Cand.isDependenceDistanceOfOne(PSE, L))
continue;
++NumForwarding;
@@ -459,18 +486,25 @@ public:
return false;
}
- if (LAI.PSE.getUnionPredicate().getComplexity() >
+ if (LAI.getPSE().getUnionPredicate().getComplexity() >
LoadElimSCEVCheckThreshold) {
DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
return false;
}
- // Point of no-return, start the transformation. First, version the loop if
- // necessary.
- if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) {
+ if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (L->getHeader()->getParent()->optForSize()) {
+ DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
+ return false;
+ }
+
+ // Point of no-return, start the transformation. First, version the loop
+ // if necessary.
+
LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
LV.setAliasChecks(std::move(Checks));
- LV.setSCEVChecks(LAI.PSE.getUnionPredicate());
+ LV.setSCEVChecks(LAI.getPSE().getUnionPredicate());
LV.versionLoop();
}
@@ -508,8 +542,11 @@ public:
}
bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Build up a worklist of inner-loops to vectorize. This is necessary as the
@@ -526,7 +563,7 @@ public:
// Now walk the identified inner loops.
bool Changed = false;
for (Loop *L : Worklist) {
- const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+ const LoopAccessInfo &LAI = LAA->getInfo(L);
// The actual work is performed by LoadEliminationForLoop.
LoadEliminationForLoop LEL(L, LI, LAI, DT);
Changed |= LEL.processLoop();
@@ -537,9 +574,10 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessAnalysis>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
@@ -554,9 +592,10 @@ static const char LLE_name[] = "Loop Load Elimination";
INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
namespace llvm {
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 27c2d8824df0..d2f1b66076a6 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -14,7 +14,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -128,9 +128,8 @@ NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
namespace {
enum IterationLimits {
- /// The maximum number of iterations that we'll try and reroll. This
- /// has to be less than 25 in order to fit into a SmallBitVector.
- IL_MaxRerollIterations = 16,
+ /// The maximum number of iterations that we'll try and reroll.
+ IL_MaxRerollIterations = 32,
/// The bitvector index used by loop induction variables and other
/// instructions that belong to all iterations.
IL_All,
@@ -147,13 +146,8 @@ namespace {
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
}
protected:
@@ -169,6 +163,9 @@ namespace {
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> IVToIncMap;
+ // For loop with multiple induction variable, remember the one used only to
+ // control the loop.
+ Instruction *LoopControlIV;
// A chain of isomorphic instructions, identified by a single-use PHI
// representing a reduction. Only the last value may be used outside the
@@ -356,9 +353,11 @@ namespace {
ScalarEvolution *SE, AliasAnalysis *AA,
TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
bool PreserveLCSSA,
- DenseMap<Instruction *, int64_t> &IncrMap)
+ DenseMap<Instruction *, int64_t> &IncrMap,
+ Instruction *LoopCtrlIV)
: Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
- PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {}
+ PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
+ LoopControlIV(LoopCtrlIV) {}
/// Stage 1: Find all the DAG roots for the induction variable.
bool findRoots();
@@ -370,7 +369,7 @@ namespace {
void replace(const SCEV *IterCount);
protected:
- typedef MapVector<Instruction*, SmallBitVector> UsesTy;
+ typedef MapVector<Instruction*, BitVector> UsesTy;
bool findRootsRecursive(Instruction *IVU,
SmallInstructionSet SubsumedInsts);
@@ -396,6 +395,8 @@ namespace {
bool instrDependsOn(Instruction *I,
UsesTy::iterator Start,
UsesTy::iterator End);
+ void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
+ void updateNonLoopCtrlIncr();
LoopReroll *Parent;
@@ -426,8 +427,18 @@ namespace {
UsesTy Uses;
// Map between induction variable and its increment
DenseMap<Instruction *, int64_t> &IVToIncMap;
+ Instruction *LoopControlIV;
};
+ // Check if it is a compare-like instruction whose user is a branch
+ bool isCompareUsedByBranch(Instruction *I) {
+ auto *TI = I->getParent()->getTerminator();
+ if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
+ return false;
+ return I->hasOneUse() && TI->getOperand(0) == I;
+ };
+
+ bool isLoopControlIV(Loop *L, Instruction *IV);
void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
void collectPossibleReductions(Loop *L,
ReductionTracker &Reductions);
@@ -438,10 +449,7 @@ namespace {
char LoopReroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
@@ -460,6 +468,110 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
return false;
}
+static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
+ const SCEV *SCEVExpr,
+ Instruction &IV) {
+ const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
+
+  // If the step recurrence SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)),
+  // return c1.
+ if (!MulSCEV && IV.getType()->isPointerTy())
+ if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
+ const PointerType *PTy = cast<PointerType>(IV.getType());
+ Type *ElTy = PTy->getElementType();
+ const SCEV *SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
+ if (IncSCEV->getValue()->getValue().isNegative()) {
+ const SCEV *NewSCEV =
+ SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
+ return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
+ } else {
+ return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
+ }
+ }
+
+ if (!MulSCEV)
+ return nullptr;
+
+  // If the step recurrence SCEVExpr is c * sizeof(x), where c is a constant,
+  // return c.
+ const SCEVConstant *CIncSCEV = nullptr;
+ for (const SCEV *Operand : MulSCEV->operands()) {
+ if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
+ CIncSCEV = Constant;
+ } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
+ Type *AllocTy;
+ if (!Unknown->isSizeOf(AllocTy))
+ break;
+ } else {
+ return nullptr;
+ }
+ }
+ return CIncSCEV;
+}
+
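// A worked example of the factor computation above, with assumed element
// types: for a pointer induction variable such as
//
//   for (int *p = base; p != end; p += 4) {
//     p[0] += 1; p[1] += 1; p[2] += 1; p[3] += 1;
//   }
//
// the step of the pointer AddRec is 16 bytes (4 * sizeof(int)); dividing out
// sizeof(int) recovers the reroll factor 4, and the negative branch handles a
// pointer that walks backwards by a constant number of bytes per iteration.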
+// Check if an IV is only used to control the loop. There are two cases:
+// 1. It has a single use, which is the loop increment; the increment is only
+// used by the comparison and the PHI (possibly with an nsw sext in between),
+// and the comparison is only used by the branch.
+// 2. It is used by the loop increment and the comparison; the loop increment
+// is only used by the PHI, and the comparison is used only by the branch.
+bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
+ unsigned IVUses = IV->getNumUses();
+ if (IVUses != 2 && IVUses != 1)
+ return false;
+
+ for (auto *User : IV->users()) {
+ int32_t IncOrCmpUses = User->getNumUses();
+ bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
+
+ // User can only have one or two uses.
+ if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Case 1
+ if (IVUses == 1) {
+ // The only user must be the loop increment.
+ // The loop increment must have two uses.
+ if (IsCompInst || IncOrCmpUses != 2)
+ return false;
+ }
+
+ // Case 2
+ if (IVUses == 2 && IncOrCmpUses != 1)
+ return false;
+
+    // Each user of the IV must be a binary operation or a comparison.
+ if (auto *BO = dyn_cast<BinaryOperator>(User)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Loop Increment
+ // User of Loop Increment should be either PHI or CMP
+ for (auto *UU : User->users()) {
+ if (PHINode *PN = dyn_cast<PHINode>(UU)) {
+ if (PN != IV)
+ return false;
+ }
+ // Must be a CMP or an ext (of a value with nsw) then CMP
+ else {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // Skip SExt if we are extending an nsw value
+ // TODO: Allow ZExt too
+ if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 &&
+ isa<SExtInst>(UUser))
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ if (!isCompareUsedByBranch(UUser))
+ return false;
+ }
+ }
+ } else
+ return false;
+    // Compare: it can only have one use, and that use must be the branch.
+ } else if (!IsCompInst)
+ return false;
+ }
+ return true;
+}
+
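// A small sketch of case 1 above (names are illustrative): in a loop like
//
//   for (unsigned i = 0; i != n; ++i)
//     *p++ = 0;   // the body indexes memory through p, never through i
//
// the PHI for i has a single user, the increment; after loop rotation the
// increment feeds only the PHI and the exit compare, and the compare is
// consumed only by the back branch. Such an IV controls the loop without
// indexing memory, so it is remembered in LoopControlIV rather than offered
// as a reroll base.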
// Collect the list of loop induction variables with respect to which it might
// be possible to reroll the loop.
void LoopReroll::collectPossibleIVs(Loop *L,
@@ -469,7 +581,7 @@ void LoopReroll::collectPossibleIVs(Loop *L,
IE = Header->getFirstInsertionPt(); I != IE; ++I) {
if (!isa<PHINode>(I))
continue;
- if (!I->getType()->isIntegerTy())
+ if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
continue;
if (const SCEVAddRecExpr *PHISCEV =
@@ -478,15 +590,27 @@ void LoopReroll::collectPossibleIVs(Loop *L,
continue;
if (!PHISCEV->isAffine())
continue;
- if (const SCEVConstant *IncSCEV =
- dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
- const APInt &AInt = IncSCEV->getAPInt().abs();
+ const SCEVConstant *IncSCEV = nullptr;
+ if (I->getType()->isPointerTy())
+ IncSCEV =
+ getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
+ else
+ IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ if (IncSCEV) {
+ const APInt &AInt = IncSCEV->getValue()->getValue().abs();
if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
continue;
IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
<< "\n");
- PossibleIVs.push_back(&*I);
+
+ if (isLoopControlIV(L, &*I)) {
+ assert(!LoopControlIV && "Found two loop control only IV");
+ LoopControlIV = &(*I);
+ DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
+ << *PHISCEV << "\n");
+ } else
+ PossibleIVs.push_back(&*I);
}
}
}
@@ -611,9 +735,8 @@ void LoopReroll::DAGRootTracker::collectInLoopUserSet(
const SmallInstructionSet &Exclude,
const SmallInstructionSet &Final,
DenseSet<Instruction *> &Users) {
- for (SmallInstructionVector::const_iterator I = Roots.begin(),
- IE = Roots.end(); I != IE; ++I)
- collectInLoopUserSet(*I, Exclude, Final, Users);
+ for (Instruction *Root : Roots)
+ collectInLoopUserSet(Root, Exclude, Final, Users);
}
static bool isSimpleLoadStore(Instruction *I) {
@@ -651,10 +774,12 @@ static bool isSimpleArithmeticOp(User *IVU) {
static bool isLoopIncrement(User *U, Instruction *IV) {
BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
- if (!BO || BO->getOpcode() != Instruction::Add)
+
+ if ((BO && BO->getOpcode() != Instruction::Add) ||
+ (!BO && !isa<GetElementPtrInst>(U)))
return false;
- for (auto *UU : BO->users()) {
+ for (auto *UU : U->users()) {
PHINode *PN = dyn_cast<PHINode>(UU);
if (PN && PN == IV)
return true;
@@ -1031,6 +1156,33 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
Uses[I].set(IL_All);
}
+ // Make sure we mark loop-control-only PHIs as used in all iterations. See
+ // comment above LoopReroll::isLoopControlIV for more information.
+ BasicBlock *Header = L->getHeader();
+ if (LoopControlIV && LoopControlIV != IV) {
+ for (auto *U : LoopControlIV->users()) {
+ Instruction *IVUser = dyn_cast<Instruction>(U);
+ // IVUser could be loop increment or compare
+ Uses[IVUser].set(IL_All);
+ for (auto *UU : IVUser->users()) {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // UUser could be compare, PHI or branch
+ Uses[UUser].set(IL_All);
+ // Skip SExt
+ if (isa<SExtInst>(UUser)) {
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ Uses[UUser].set(IL_All);
+ }
+ // Is UUser a compare instruction?
+ if (UU->hasOneUse()) {
+ Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
+ if (BI == cast<BranchInst>(Header->getTerminator()))
+ Uses[BI].set(IL_All);
+ }
+ }
+ }
+ }
+
// Make sure all instructions in the loop are in one and only one
// set.
for (auto &KV : Uses) {
@@ -1272,61 +1424,136 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
++J;
}
- bool Negative = IVToIncMap[IV] < 0;
- const DataLayout &DL = Header->getModule()->getDataLayout();
- // We need to create a new induction variable for each different BaseInst.
- for (auto &DRS : RootSets) {
- // Insert the new induction variable.
- const SCEVAddRecExpr *RealIVSCEV =
- cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- const SCEV *Start = RealIVSCEV->getStart();
- const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr(
- Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L,
- SCEV::FlagAnyWrap));
- { // Limit the lifetime of SCEVExpander.
- SCEVExpander Expander(*SE, DL, "reroll");
- Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());
-
- for (auto &KV : Uses) {
- if (KV.second.find_first() == 0)
- KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
- }
+ bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+
+ if (HasTwoIVs) {
+ updateNonLoopCtrlIncr();
+ replaceIV(LoopControlIV, LoopControlIV, IterCount);
+ } else
+ // We need to create a new induction variable for each different BaseInst.
+ for (auto &DRS : RootSets)
+ // Insert the new induction variable.
+ replaceIV(DRS.BaseInst, IV, IterCount);
- if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
- // FIXME: Why do we need this check?
- if (Uses[BI].find_first() == IL_All) {
- const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
- // Iteration count SCEV minus 1
- const SCEV *ICMinus1SCEV = SE->getMinusSCEV(
- ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));
+// For non-loop-control IVs, we only need to update the last increment
+// with the right amount, and then we are done.
+void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
+ const SCEV *NewInc = nullptr;
+ for (auto *LoopInc : LoopIncs) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
+ const SCEVConstant *COp = nullptr;
+ if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ } else {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
+ if (!COp)
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ }
- Value *ICMinus1; // Iteration count minus 1
- if (isa<SCEVConstant>(ICMinus1SCEV)) {
- ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
- } else {
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader)
- Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ assert(COp && "Didn't find constant operand of LoopInc!\n");
- ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
- Preheader->getTerminator());
- }
+ const APInt &AInt = COp->getValue()->getValue();
+ const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
+ if (AInt.isNegative()) {
+ NewInc = SE->getNegativeSCEV(COp);
+ NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
+ NewInc = SE->getNegativeSCEV(NewInc);
+ } else
+ NewInc = SE->getUDivExpr(COp, ScaleSCEV);
+
+ LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
+ }
+}
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
- BI->setCondition(Cond);
+void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
+ Instruction *InstIV,
+ const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ int64_t Inc = IVToIncMap[InstIV];
+ bool NeedNewIV = InstIV == LoopControlIV;
+ bool Negative = !NeedNewIV && Inc < 0;
+
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
+ const SCEV *Start = RealIVSCEV->getStart();
+
+ if (NeedNewIV)
+ Start = SE->getConstant(Start->getType(), 0);
+
+ const SCEV *SizeOfExpr = nullptr;
+ const SCEV *IncrExpr =
+ SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
+ if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
+ Type *ElTy = PTy->getElementType();
+ SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
+ IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
+ }
+ const SCEV *NewIVSCEV =
+ SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
+
+ { // Limit the lifetime of SCEVExpander.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, InstIV->getType(), &Header->front());
+
+ for (auto &KV : Uses)
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(Inst, NewIV);
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ // FIXME: Why do we need this check?
+ if (Uses[BI].find_first() == IL_All) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+ if (NeedNewIV)
+ ICSCEV = SE->getMulExpr(IterCount,
+ SE->getConstant(IterCount->getType(), Scale));
+
+ // Iteration count SCEV minus or plus 1
+ const SCEV *MinusPlus1SCEV =
+ SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
+ if (Inst->getType()->isPointerTy()) {
+ assert(SizeOfExpr && "SizeOfExpr is not initialized");
+ MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
+ }
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
+ const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
+ // Iteration count minus 1
+ Instruction *InsertPtr = nullptr;
+ if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
+ InsertPtr = BI;
+ } else {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ InsertPtr = Preheader->getTerminator();
}
+
+ if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
+ (SE->getTypeSizeInBits(NewIV->getType()) <
+ SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
+ IRBuilder<> Builder(BI);
+ Builder.SetCurrentDebugLocation(BI->getDebugLoc());
+ NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
+ }
+ Value *ICMinusPlus1 = Expander.expandCodeFor(
+ ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
+
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
}
}
}
-
- SimplifyInstructionsInBlock(Header, TLI);
- DeleteDeadPHIs(Header, TLI);
}
// Validate the selected reductions. All iterations must have an isomorphic
@@ -1334,9 +1561,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
// entries must appear in order.
bool LoopReroll::ReductionTracker::validateSelected() {
// For a non-associative reduction, the chain entries must appear in order.
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
- RI != RIE; ++RI) {
- int i = *RI;
+ for (int i : Reds) {
int PrevIter = 0, BaseCount = 0, Count = 0;
for (Instruction *J : PossibleReds[i]) {
// Note that all instructions in the chain must have been found because
@@ -1380,9 +1605,7 @@ bool LoopReroll::ReductionTracker::validateSelected() {
void LoopReroll::ReductionTracker::replaceSelected() {
// Fixup reductions to refer to the last instruction associated with the
// first iteration (not the last).
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
- RI != RIE; ++RI) {
- int i = *RI;
+ for (int i : Reds) {
int j = 0;
for (int e = PossibleReds[i].size(); j != e; ++j)
if (PossibleRedIter[PossibleReds[i][j]] != 0) {
@@ -1396,9 +1619,8 @@ void LoopReroll::ReductionTracker::replaceSelected() {
Users.push_back(cast<Instruction>(U));
}
- for (SmallInstructionVector::iterator J = Users.begin(),
- JE = Users.end(); J != JE; ++J)
- (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ for (Instruction *User : Users)
+ User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
PossibleReds[i][j]);
}
}
@@ -1450,7 +1672,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
const SCEV *IterCount,
ReductionTracker &Reductions) {
DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
- IVToIncMap);
+ IVToIncMap, LoopControlIV);
if (!DAGRoots.findRoots())
return false;
@@ -1472,7 +1694,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
}
bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
@@ -1487,41 +1709,46 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
"] Loop %" << Header->getName() << " (" <<
L->getNumBlocks() << " block(s))\n");
- bool Changed = false;
-
// For now, we'll handle only single BB loops.
if (L->getNumBlocks() > 1)
- return Changed;
+ return false;
if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return Changed;
+ return false;
const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
+ DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
// First, we need to find the induction variable with respect to which we can
// reroll (there may be several possible options).
SmallInstructionVector PossibleIVs;
IVToIncMap.clear();
+ LoopControlIV = nullptr;
collectPossibleIVs(L, PossibleIVs);
if (PossibleIVs.empty()) {
DEBUG(dbgs() << "LRR: No possible IVs found\n");
- return Changed;
+ return false;
}
ReductionTracker Reductions;
collectPossibleReductions(L, Reductions);
+ bool Changed = false;
// For each possible IV, collect the associated possible set of 'root' nodes
// (i+1, i+2, etc.).
- for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
- IE = PossibleIVs.end(); I != IE; ++I)
- if (reroll(*I, L, Header, IterCount, Reductions)) {
+ for (Instruction *PossibleIV : PossibleIVs)
+ if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
Changed = true;
break;
}
+ DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+
+ // Trip count of L has changed so SE must be re-evaluated.
+ if (Changed)
+ SE->forgetLoop(L);
return Changed;
}
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 5e6c2da08cc3..7a06a25a7073 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -20,6 +20,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -32,20 +33,46 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
#define DEBUG_TYPE "loop-rotate"
-static cl::opt<unsigned>
-DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden,
- cl::desc("The default maximum header size for automatic loop rotation"));
+static cl::opt<unsigned> DefaultRotationThreshold(
+ "rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
STATISTIC(NumRotated, "Number of loops rotated");
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
+ }
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
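+
+// Usage sketch (illustrative only; it simply mirrors the two callers further
+// down in this patch): both the new-PM LoopRotatePass and the legacy pass
+// construct the helper with whatever analyses they have available and then
+// delegate to it, e.g.
+//   LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
+//   bool Changed = LR.processLoop(L);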
+
/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
/// old header into the preheader. If there were uses of the values produced by
/// these instructions that were outside of the loop, we have to insert PHI nodes
@@ -69,7 +96,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
if (OrigHeaderVal->use_empty())
continue;
- Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
// The value now exists in two versions: the initial value in the preheader
// and the loop "next" value in the original header.
@@ -79,7 +106,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Visit each use of the OrigHeader instruction.
for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end(); UI != UE; ) {
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
// Grab the use before incrementing the iterator.
Use &U = *UI;
@@ -108,6 +136,41 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Anything else can be handled by SSAUpdater.
SSA.RewriteUse(U);
}
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
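+ // Illustrative example (hypothetical IR, not part of the original patch):
+ // a llvm.dbg.value in the new preheader that still referred to the old
+ // header's %v is retargeted to the cloned preheader value, while one in a
+ // block where no definition is available is pointed at undef rather than
+ // forcing a new PHI node.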
+ LLVMContext &C = OrigHeader->getContext();
+ if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
+ if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
+ for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
+ // Grab the use before incrementing the iterator. Otherwise, altering
+ // the Use will invalidate the iterator.
+ Use &U = *UI++;
+ DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
+ if (!UserInst)
+ continue;
+
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = UserInst->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped, and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB; if not, substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
+ }
+ }
+ }
}
}
@@ -121,10 +184,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
/// rotation. LoopRotate should be repeatable and converge to a canonical
/// form. This property is satisfied because simplifying the loop latch can only
/// happen once across multiple invocations of the LoopRotate pass.
-static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE,
- bool SimplifiedLatch) {
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// If the loop has only one block then there is not much to rotate.
if (L->getBlocks().size() == 1)
return false;
@@ -162,7 +222,14 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
if (Metrics.notDuplicatable) {
DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: "; L->dump());
+ << " instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.convergent) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
return false;
}
if (Metrics.NumInsts > MaxHeaderSize)
@@ -225,10 +292,9 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
// executing in each iteration of the loop. This means it is safe to hoist
// something that might trap, but isn't safe to hoist something that reads
// memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) &&
- !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() &&
- !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) &&
- !isa<AllocaInst>(Inst)) {
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
Inst->moveBefore(LoopEntryBranch);
continue;
}
@@ -238,7 +304,7 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
// Eagerly remap the operands of the instruction.
RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifiable. This commonly occurs because the entry from PHI
@@ -248,13 +314,18 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
// in the map.
- delete C;
ValueMap[Inst] = V;
+ if (!C->mayHaveSideEffects()) {
+ delete C;
+ C = nullptr;
+ }
} else {
+ ValueMap[Inst] = C;
+ }
+ if (C) {
// Otherwise, stick the new instruction into the new block!
C->setName(Inst->getName());
C->insertBefore(LoopEntryBranch);
- ValueMap[Inst] = C;
}
}
@@ -280,7 +351,6 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
L->moveToHeader(NewHeader);
assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
// At this point, we've finished our major CFG changes. As part of cloning
// the loop into the preheader we've simplified instructions and the
// duplicated conditional branch may now be branching on a constant. If it is
@@ -291,8 +361,8 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
assert(PHBI->isConditional() && "Should be clone of BI condbr!");
if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero())
- != NewHeader) {
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
// The conditional branch can't be folded, handle the general case.
// Update DominatorTree to reflect the CFG change we just made. Then split
// edges as necessary to preserve LoopSimplify form.
@@ -329,18 +399,17 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
// be split.
SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
bool SplitLatchEdge = false;
- for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(),
- PE = ExitPreds.end();
- PI != PE; ++PI) {
+ for (BasicBlock *ExitPred : ExitPreds) {
// We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(*PI);
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
if (!PredLoop || PredLoop->contains(Exit))
continue;
- if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ if (isa<IndirectBrInst>(ExitPred->getTerminator()))
continue;
- SplitLatchEdge |= L->getLoopLatch() == *PI;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
BasicBlock *ExitSplit = SplitCriticalEdge(
- *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
ExitSplit->moveBefore(Exit);
}
assert(SplitLatchEdge &&
@@ -384,8 +453,8 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
}
}
- // If the dominator changed, this may have an effect on other
- // predecessors, continue until we reach a fixpoint.
+ // If the dominator changed, this may have an effect on other
+ // predecessors, continue until we reach a fixpoint.
} while (Changed);
}
}
@@ -432,7 +501,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
// GEPs are cheap if all indices are constant.
if (!cast<GEPOperator>(I)->hasAllConstantIndices())
return false;
- // fall-thru to increment case
+ // fall-thru to increment case
case Instruction::Add:
case Instruction::Sub:
case Instruction::And:
@@ -441,11 +510,10 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr: {
- Value *IVOpnd = !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1))
- ? I->getOperand(1)
- : nullptr;
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
if (!IVOpnd)
return false;
@@ -482,7 +550,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
/// canonical form so downstream passes can handle it.
///
/// I don't believe this invalidates SCEV.
-static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) {
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
if (!Latch || Latch->hasAddressTaken())
return false;
@@ -503,7 +571,7 @@ static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) {
return false;
DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
+ << LastExit->getName() << "\n");
// Hoist the instructions from Latch into LastExit.
LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
@@ -527,26 +595,19 @@ static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) {
return true;
}
-/// Rotate \c L as many times as possible. Return true if the loop is rotated
-/// at least once.
-static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI,
- AssumptionCache *AC, DominatorTree *DT,
- ScalarEvolution *SE) {
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
// Save the loop metadata.
MDNode *LoopMD = L->getLoopID();
// Simplify the loop latch before attempting to rotate the header
// upward. Rotation may not be needed if the loop tail can be folded into the
// loop exit.
- bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT);
+ bool SimplifiedLatch = simplifyLoopLatch(L);
- // One loop can be rotated multiple times.
- bool MadeChange = false;
- while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) {
- MadeChange = true;
- SimplifiedLatch = false;
- }
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
// Restore the loop metadata.
// NB! We presume LoopRotation DOESN'T ADD its own metadata.
@@ -556,15 +617,37 @@ static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
return MadeChange;
}
+LoopRotatePass::LoopRotatePass() {}
+
+PreservedAnalyses LoopRotatePass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
+ auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
+ assert((LI && TTI && AC) && "Analyses for loop rotation not available");
+
+ // Optional analyses.
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+ LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE);
+
+ bool Changed = LR.processLoop(&L);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
+
namespace {
-class LoopRotate : public LoopPass {
+class LoopRotateLegacyPass : public LoopPass {
unsigned MaxHeaderSize;
public:
static char ID; // Pass ID, replacement for typeid
- LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
- initializeLoopRotatePass(*PassRegistry::getPassRegistry());
+ LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+ initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
if (SpecifiedMaxHeaderSize == -1)
MaxHeaderSize = DefaultRotationThreshold;
else
@@ -573,24 +656,13 @@ public:
// LCSSA form makes instruction renaming easier.
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<AAResultsWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
+ getLoopAnalysisUsage(AU);
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
Function &F = *L->getHeader()->getParent();
@@ -601,24 +673,21 @@ public:
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-
- return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE);
+ LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
+ return LR.processLoop(L);
}
};
}
-char LoopRotate::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+char LoopRotateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
+ false)
Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
- return new LoopRotate(MaxHeaderSize);
+ return new LoopRotateLegacyPass(MaxHeaderSize);
}
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
new file mode 100644
index 000000000000..ec227932c09e
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -0,0 +1,114 @@
+//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
+// basic loop CFG cleanup, primarily to assist other loop passes. If you
+// encounter a noncanonical CFG construct that causes another loop pass to
+// perform suboptimally, this is the place to fix it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPassManager.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplifycfg"
+
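+// Illustrative example (hypothetical CFG, not from this patch): if a loop L
+// contains a block %a whose only successor is %b, and %a is %b's only
+// predecessor, simplifyLoopCFG below merges %a into %b via
+// MergeBasicBlockIntoOnlyPred, removing %a from LoopInfo (and moving the loop
+// header to %b first if %a was the header).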
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+ bool Changed = false;
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ SmallVector<WeakVH, 16> Blocks(L.blocks());
+
+ for (auto &Block : Blocks) {
+ // Attempt to merge blocks in the trivial case. Don't modify blocks which
+ // belong to other loops.
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
+ if (!Succ)
+ continue;
+
+ BasicBlock *Pred = Succ->getSinglePredecessor();
+ if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
+ continue;
+
+ // Pred is going to disappear, so we need to update the loop info.
+ if (L.getHeader() == Pred)
+ L.moveToHeader(Succ);
+ LI.removeBlock(Pred);
+ MergeBasicBlockIntoOnlyPred(Succ, &DT);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, AnalysisManager<Loop> &AM) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
+ assert((LI && DT) && "Analyses for LoopSimplifyCFG not available");
+
+ if (!simplifyLoopCFG(L, *DT, *LI))
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopSimplifyCFGLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
+ initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &) override {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ return simplifyLoopCFG(*L, DT, LI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopSimplifyCFGLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+
+Pass *llvm::createLoopSimplifyCFGPass() {
+ return new LoopSimplifyCFGLegacyPass();
+}
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index acfdec43d21a..77c77eb7d798 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -684,10 +684,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::prefetch:
- case Intrinsic::x86_sse_storeu_ps:
- case Intrinsic::x86_sse2_storeu_pd:
- case Intrinsic::x86_sse2_storeu_dq:
- case Intrinsic::x86_sse2_storel_dq:
if (II->getArgOperand(0) == OperandVal)
isAddress = true;
break;
@@ -704,18 +700,6 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = SI->getPointerAddressSpace();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
AccessTy.AddrSpace = LI->getPointerAddressSpace();
- } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- // Addressing modes can also be folded into prefetches and a variety
- // of intrinsics.
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::x86_sse_storeu_ps:
- case Intrinsic::x86_sse2_storeu_pd:
- case Intrinsic::x86_sse2_storeu_dq:
- case Intrinsic::x86_sse2_storel_dq:
- AccessTy.MemTy = II->getArgOperand(0)->getType();
- break;
- }
}
// All pointers have the same requirements, so canonicalize them to an
@@ -963,8 +947,8 @@ void Cost::RateRegister(const SCEV *Reg,
isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
++SetupCost;
- NumIVMuls += isa<SCEVMulExpr>(Reg) &&
- SE.hasComputableLoopEvolution(Reg, L);
+ NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ SE.hasComputableLoopEvolution(Reg, L);
}
/// Record this register in the set. If we haven't seen it before, rate
@@ -2752,34 +2736,31 @@ void LSRInstance::CollectChains() {
LatchPath.push_back(LoopHeader);
// Walk the instruction stream from the loop header to the loop latch.
- for (SmallVectorImpl<BasicBlock *>::reverse_iterator
- BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
- BBIter != BBEnd; ++BBIter) {
- for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
- I != E; ++I) {
+ for (BasicBlock *BB : reverse(LatchPath)) {
+ for (Instruction &I : *BB) {
// Skip instructions that weren't seen by IVUsers analysis.
- if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
continue;
// Ignore users that are part of a SCEV expression. This way we only
// consider leaf IV Users. This effectively rediscovers a portion of
// IVUsers analysis but in program order this time.
- if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))
+ if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
continue;
// Remove this instruction from any NearUsers set it may be in.
for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
ChainIdx < NChains; ++ChainIdx) {
- ChainUsersVec[ChainIdx].NearUsers.erase(&*I);
+ ChainUsersVec[ChainIdx].NearUsers.erase(&I);
}
// Search for operands that can be chained.
SmallPtrSet<Instruction*, 4> UniqueOperands;
- User::op_iterator IVOpEnd = I->op_end();
- User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
+ User::op_iterator IVOpEnd = I.op_end();
+ User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
while (IVOpIter != IVOpEnd) {
Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
if (UniqueOperands.insert(IVOpInst).second)
- ChainInstruction(&*I, IVOpInst, ChainUsersVec);
+ ChainInstruction(&I, IVOpInst, ChainUsersVec);
IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
}
} // Continue walking down the instructions.
@@ -4331,28 +4312,15 @@ BasicBlock::iterator
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
const SmallVectorImpl<Instruction *> &Inputs)
const {
+ Instruction *Tentative = &*IP;
for (;;) {
- const Loop *IPLoop = LI.getLoopFor(IP->getParent());
- unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
-
- BasicBlock *IDom;
- for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
- if (!Rung) return IP;
- Rung = Rung->getIDom();
- if (!Rung) return IP;
- IDom = Rung->getBlock();
-
- // Don't climb into a loop though.
- const Loop *IDomLoop = LI.getLoopFor(IDom);
- unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
- if (IDomDepth <= IPLoopDepth &&
- (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
- break;
- }
-
bool AllDominate = true;
Instruction *BetterPos = nullptr;
- Instruction *Tentative = IDom->getTerminator();
+ // Don't bother attempting to insert before a catchswitch; its basic block
+ // cannot contain other non-PHI instructions.
+ if (isa<CatchSwitchInst>(Tentative))
+ return IP;
+
for (Instruction *Inst : Inputs) {
if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
AllDominate = false;
@@ -4360,7 +4328,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
}
// Attempt to find an insert position in the middle of the block,
// instead of at the end, so that it can be used for other expansions.
- if (IDom == Inst->getParent() &&
+ if (Tentative->getParent() == Inst->getParent() &&
(!BetterPos || !DT.dominates(Inst, BetterPos)))
BetterPos = &*std::next(BasicBlock::iterator(Inst));
}
@@ -4370,6 +4338,26 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
IP = BetterPos->getIterator();
else
IP = Tentative->getIterator();
+
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP;
+ Rung = Rung->getIDom();
+ if (!Rung) return IP;
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break;
+ }
+
+ Tentative = IDom->getTerminator();
}
return IP;
@@ -4426,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
while (isa<PHINode>(IP)) ++IP;
// Ignore landingpad instructions.
- while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP;
+ while (IP->isEHPad()) ++IP;
// Ignore debug intrinsics.
while (isa<DbgInfoIntrinsic>(IP)) ++IP;
@@ -4961,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(IVUsers)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
@@ -4991,16 +4979,16 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
// Requiring LoopSimplify a second time here prevents IVUsers from running
// twice, since LoopSimplify was invalidated by running ScalarEvolution.
AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<IVUsers>();
- AU.addPreserved<IVUsers>();
+ AU.addRequired<IVUsersWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
- auto &IU = getAnalysis<IVUsers>();
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index ecef6dbe24e6..91af4a1922ce 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -12,13 +12,13 @@
// counts of loops easily.
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -31,8 +31,11 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <climits>
+#include <utility>
using namespace llvm;
@@ -43,40 +46,54 @@ static cl::opt<unsigned>
cl::desc("The baseline cost threshold for loop unrolling"));
static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
- "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden,
+ "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden,
cl::desc("The percentage of estimated dynamic cost which must be saved by "
"unrolling to allow unrolling up to the max threshold."));
static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
- "unroll-dynamic-cost-savings-discount", cl::Hidden,
+ "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden,
cl::desc("This is the amount discounted from the total unroll cost when "
"the unrolled form has a high dynamic cost savings (triggered by "
"the '-unroll-perecent-dynamic-cost-saved-threshold' flag)."));
static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
- "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+ "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
cl::desc("Don't allow loop unrolling to simulate more than this number of"
"iterations when checking full unroll profitability"));
-static cl::opt<unsigned>
-UnrollCount("unroll-count", cl::Hidden,
- cl::desc("Use this unroll count for all loops including those with "
- "unroll_count pragma values, for testing purposes"));
+static cl::opt<unsigned> UnrollCount(
+ "unroll-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
-static cl::opt<bool>
-UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
- cl::desc("Allows loops to be partially unrolled until "
- "-unroll-threshold loop size is reached."));
+static cl::opt<unsigned> UnrollMaxCount(
+ "unroll-max-count", cl::Hidden,
+ cl::desc("Set the max unroll count for partial and runtime unrolling, for"
+ "testing purposes"));
+
+static cl::opt<unsigned> UnrollFullMaxCount(
+ "unroll-full-max-count", cl::Hidden,
+ cl::desc(
+ "Set the max unroll count for full unrolling, for testing purposes"));
static cl::opt<bool>
-UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
- cl::desc("Unroll loops with run-time trip counts"));
+ UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
-static cl::opt<unsigned>
-PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll(full) or "
- "unroll_count pragma."));
+static cl::opt<bool> UnrollAllowRemainder(
+ "unroll-allow-remainder", cl::Hidden,
+ cl::desc("Allow generation of a loop remainder (extra iterations) "
+ "when unrolling a loop."));
+static cl::opt<bool>
+ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned> PragmaUnrollThreshold(
+ "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
+ "unroll_count pragma."));
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
@@ -88,26 +105,28 @@ static const unsigned NoThreshold = UINT_MAX;
static const unsigned DefaultUnrollRuntimeCount = 8;
/// Gather the various unrolling parameters based on the defaults, compiler
-/// flags, TTI overrides, pragmas, and user specified parameters.
+/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll,
- bool PragmaEnableUnroll, unsigned TripCount) {
+ Optional<bool> UserRuntime) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
UP.Threshold = 150;
- UP.PercentDynamicCostSavedThreshold = 20;
- UP.DynamicCostSavingsDiscount = 2000;
- UP.OptSizeThreshold = 50;
+ UP.PercentDynamicCostSavedThreshold = 50;
+ UP.DynamicCostSavingsDiscount = 100;
+ UP.OptSizeThreshold = 0;
UP.PartialThreshold = UP.Threshold;
- UP.PartialOptSizeThreshold = UP.OptSizeThreshold;
+ UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.MaxCount = UINT_MAX;
+ UP.FullUnrollMaxCount = UINT_MAX;
UP.Partial = false;
UP.Runtime = false;
+ UP.AllowRemainder = true;
UP.AllowExpensiveTripCount = false;
+ UP.Force = false;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, UP);
@@ -118,12 +137,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.PartialThreshold = UP.PartialOptSizeThreshold;
}
- // Apply unroll count pragmas
- if (PragmaCount)
- UP.Count = PragmaCount;
- else if (PragmaFullUnroll)
- UP.Count = TripCount;
-
// Apply any user values specified by cl::opt
if (UnrollThreshold.getNumOccurrences() > 0) {
UP.Threshold = UnrollThreshold;
@@ -134,10 +147,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UnrollPercentDynamicCostSavedThreshold;
if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
- if (UnrollCount.getNumOccurrences() > 0)
- UP.Count = UnrollCount;
+ if (UnrollMaxCount.getNumOccurrences() > 0)
+ UP.MaxCount = UnrollMaxCount;
+ if (UnrollFullMaxCount.getNumOccurrences() > 0)
+ UP.FullUnrollMaxCount = UnrollFullMaxCount;
if (UnrollAllowPartial.getNumOccurrences() > 0)
UP.Partial = UnrollAllowPartial;
+ if (UnrollAllowRemainder.getNumOccurrences() > 0)
+ UP.AllowRemainder = UnrollAllowRemainder;
if (UnrollRuntime.getNumOccurrences() > 0)
UP.Runtime = UnrollRuntime;
@@ -153,259 +170,42 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
if (UserRuntime.hasValue())
UP.Runtime = *UserRuntime;
- if (PragmaCount > 0 ||
- ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) {
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaTheshold value
- // which is larger than the default limits.
- if (UP.Threshold != NoThreshold)
- UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
- if (UP.PartialThreshold != NoThreshold)
- UP.PartialThreshold =
- std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
- }
-
return UP;
}
namespace {
-// This class is used to get an estimate of the optimization effects that we
-// could get from complete loop unrolling. It comes from the fact that some
-// loads might be replaced with concrete constant values and that could trigger
-// a chain of instruction simplifications.
-//
-// E.g. we might have:
-// int a[] = {0, 1, 0};
-// v = 0;
-// for (i = 0; i < 3; i ++)
-// v += b[i]*a[i];
-// If we completely unroll the loop, we would get:
-// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
-// Which then will be simplified to:
-// v = b[0]* 0 + b[1]* 1 + b[2]* 0
-// And finally:
-// v = b[1]
-class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {
- typedef InstVisitor<UnrolledInstAnalyzer, bool> Base;
- friend class InstVisitor<UnrolledInstAnalyzer, bool>;
- struct SimplifiedAddress {
- Value *Base = nullptr;
- ConstantInt *Offset = nullptr;
- };
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
+ Instruction *I;
+ int Iteration : 30;
+ unsigned IsFree : 1;
+ unsigned IsCounted : 1;
+};
-public:
- UnrolledInstAnalyzer(unsigned Iteration,
- DenseMap<Value *, Constant *> &SimplifiedValues,
- ScalarEvolution &SE)
- : SimplifiedValues(SimplifiedValues), SE(SE) {
- IterationNumber = SE.getConstant(APInt(64, Iteration));
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+ typedef DenseMapInfo<Instruction *> PtrInfo;
+ typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
+ static inline UnrolledInstState getEmptyKey() {
+ return {PtrInfo::getEmptyKey(), 0, 0, 0};
}
-
- // Allow access to the initial visit method.
- using Base::visit;
-
-private:
- /// \brief A cache of pointer bases and constant-folded offsets corresponding
- /// to GEP (or derived from GEP) instructions.
- ///
- /// In order to find the base pointer one needs to perform non-trivial
- /// traversal of the corresponding SCEV expression, so it's good to have the
- /// results saved.
- DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses;
-
- /// \brief SCEV expression corresponding to number of currently simulated
- /// iteration.
- const SCEV *IterationNumber;
-
- /// \brief A Value->Constant map for keeping values that we managed to
- /// constant-fold on the given iteration.
- ///
- /// While we walk the loop instructions, we build up and maintain a mapping
- /// of simplified values specific to this iteration. The idea is to propagate
- /// any special information we have about loads that can be replaced with
- /// constants after complete unrolling, and account for likely simplifications
- /// post-unrolling.
- DenseMap<Value *, Constant *> &SimplifiedValues;
-
- ScalarEvolution &SE;
-
- /// \brief Try to simplify instruction \param I using its SCEV expression.
- ///
- /// The idea is that some AddRec expressions become constants, which then
- /// could trigger folding of other instructions. However, that only happens
- /// for expressions whose start value is also constant, which isn't always the
- /// case. In another common and important case the start value is just some
- /// address (i.e. SCEVUnknown) - in this case we compute the offset and save
- /// it along with the base address instead.
- bool simplifyInstWithSCEV(Instruction *I) {
- if (!SE.isSCEVable(I->getType()))
- return false;
-
- const SCEV *S = SE.getSCEV(I);
- if (auto *SC = dyn_cast<SCEVConstant>(S)) {
- SimplifiedValues[I] = SC->getValue();
- return true;
- }
-
- auto *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (!AR)
- return false;
-
- const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE);
- // Check if the AddRec expression becomes a constant.
- if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) {
- SimplifiedValues[I] = SC->getValue();
- return true;
- }
-
- // Check if the offset from the base address becomes a constant.
- auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S));
- if (!Base)
- return false;
- auto *Offset =
- dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base));
- if (!Offset)
- return false;
- SimplifiedAddress Address;
- Address.Base = Base->getValue();
- Address.Offset = Offset->getValue();
- SimplifiedAddresses[I] = Address;
- return true;
+ static inline UnrolledInstState getTombstoneKey() {
+ return {PtrInfo::getTombstoneKey(), 0, 0, 0};
}
-
- /// Base case for the instruction visitor.
- bool visitInstruction(Instruction &I) {
- return simplifyInstWithSCEV(&I);
+ static inline unsigned getHashValue(const UnrolledInstState &S) {
+ return PairInfo::getHashValue({S.I, S.Iteration});
}
-
- /// Try to simplify binary operator I.
- ///
- /// TODO: Probably it's worth to hoist the code for estimating the
- /// simplifications effects to a separate class, since we have a very similar
- /// code in InlineCost already.
- bool visitBinaryOperator(BinaryOperator &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
-
- Value *SimpleV = nullptr;
- const DataLayout &DL = I.getModule()->getDataLayout();
- if (auto FI = dyn_cast<FPMathOperator>(&I))
- SimpleV =
- SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
- else
- SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
-
- if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
- SimplifiedValues[&I] = C;
-
- if (SimpleV)
- return true;
- return Base::visitBinaryOperator(I);
- }
-
- /// Try to fold load I.
- bool visitLoad(LoadInst &I) {
- Value *AddrOp = I.getPointerOperand();
-
- auto AddressIt = SimplifiedAddresses.find(AddrOp);
- if (AddressIt == SimplifiedAddresses.end())
- return false;
- ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset;
-
- auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);
- // We're only interested in loads that can be completely folded to a
- // constant.
- if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant())
- return false;
-
- ConstantDataSequential *CDS =
- dyn_cast<ConstantDataSequential>(GV->getInitializer());
- if (!CDS)
- return false;
-
- // We might have a vector load from an array. FIXME: for now we just bail
- // out in this case, but we should be able to resolve and simplify such
- // loads.
- if(!CDS->isElementTypeCompatible(I.getType()))
- return false;
-
- int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
- assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 &&
- "Unexpectedly large index value.");
- int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize;
- if (Index >= CDS->getNumElements()) {
- // FIXME: For now we conservatively ignore out of bound accesses, but
- // we're allowed to perform the optimization in this case.
- return false;
- }
-
- Constant *CV = CDS->getElementAsConstant(Index);
- assert(CV && "Constant expected.");
- SimplifiedValues[&I] = CV;
-
- return true;
- }
-
- bool visitCastInst(CastInst &I) {
- // Propagate constants through casts.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C =
- ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
-
- return Base::visitCastInst(I);
- }
-
- bool visitCmpInst(CmpInst &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-
- // First try to handle simplified comparisons.
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
-
- if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
- auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
- if (SimplifiedLHS != SimplifiedAddresses.end()) {
- auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
- if (SimplifiedRHS != SimplifiedAddresses.end()) {
- SimplifiedAddress &LHSAddr = SimplifiedLHS->second;
- SimplifiedAddress &RHSAddr = SimplifiedRHS->second;
- if (LHSAddr.Base == RHSAddr.Base) {
- LHS = LHSAddr.Offset;
- RHS = RHSAddr.Offset;
- }
- }
- }
- }
-
- if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
- if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
- if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
- }
-
- return Base::visitCmpInst(I);
+ static inline bool isEqual(const UnrolledInstState &LHS,
+ const UnrolledInstState &RHS) {
+ return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
}
};
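+// Illustrative note (not part of the original patch): only the (I, Iteration)
+// pair participates in hashing and equality above, so e.g.
+// isEqual({I, 3, /*IsFree=*/1, /*IsCounted=*/0}, {I, 3, 0, 1}) is true; the
+// two flag bits are payload that the analysis below updates in place through
+// the DenseSet iterator.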
-} // namespace
-
+}
namespace {
struct EstimatedUnrollCost {
@@ -441,18 +241,25 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
"The unroll iterations max is too large!");
+ // Only analyze inner loops. We can't properly estimate cost of nested loops
+ // and we won't visit inner loops again anyway.
+ if (!L->empty())
+ return None;
+
// Don't simulate loops with a big or unknown tripcount
if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
TripCount > UnrollMaxIterationsCountToAnalyze)
return None;
SmallSetVector<BasicBlock *, 16> BBWorklist;
+ SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
DenseMap<Value *, Constant *> SimplifiedValues;
SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
// The estimated cost of the unrolled form of the loop. We try to estimate
// this by simplifying as much as we can while computing the estimate.
int UnrolledCost = 0;
+
// We also track the estimated dynamic (that is, actually executed) cost in
// the rolled form. This helps identify cases when the savings from unrolling
// aren't just exposing dead control flows, but actual reduced dynamic
@@ -460,6 +267,97 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// unrolling.
int RolledDynamicCost = 0;
+ // We track the simplification of each instruction in each iteration. We use
+ // this to recursively merge costs into the unrolled cost on-demand so that
+ // we don't count the cost of any dead code. This is essentially a map from
+ // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
+ DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+ // A small worklist used to accumulate the cost of instructions from each
+ // observable and reachable root in the loop.
+ SmallVector<Instruction *, 16> CostWorklist;
+
+ // PHI-used worklist used between iterations while accumulating cost.
+ SmallVector<Instruction *, 4> PHIUsedList;
+
+ // Helper function to accumulate cost for instructions in the loop.
+ auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+ assert(Iteration >= 0 && "Cannot have a negative iteration!");
+ assert(CostWorklist.empty() && "Must start with an empty cost list");
+ assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+ CostWorklist.push_back(&RootI);
+ for (;; --Iteration) {
+ do {
+ Instruction *I = CostWorklist.pop_back_val();
+
+ // InstCostMap only uses I and Iteration as a key; the other two values
+ // don't matter here.
+ auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+ if (CostIter == InstCostMap.end())
+ // If an input to a PHI node comes from a dead path through the loop
+ // we may have no cost data for it here. What that actually means is
+ // that it is free.
+ continue;
+ auto &Cost = *CostIter;
+ if (Cost.IsCounted)
+ // Already counted this instruction.
+ continue;
+
+ // Mark that we are counting the cost of this instruction now.
+ Cost.IsCounted = true;
+
+ // If this is a PHI node in the loop header, just add it to the PHI set.
+ if (auto *PhiI = dyn_cast<PHINode>(I))
+ if (PhiI->getParent() == L->getHeader()) {
+ assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+ "inherently simplify during unrolling.");
+ if (Iteration == 0)
+ continue;
+
+ // Push the incoming value from the backedge into the PHI used list
+ // if it is an in-loop instruction. We'll use this to populate the
+ // cost worklist for the next iteration (as we count backwards).
+ if (auto *OpI = dyn_cast<Instruction>(
+ PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+ if (L->contains(OpI))
+ PHIUsedList.push_back(OpI);
+ continue;
+ }
+
+ // First accumulate the cost of this instruction.
+ if (!Cost.IsFree) {
+ UnrolledCost += TTI.getUserCost(I);
+ DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
+ << "): ");
+ DEBUG(I->dump());
+ }
+
+ // We must count the cost of every operand which is not free,
+ // recursively. If we reach a loop PHI node, simply add it to the set
+ // to be considered on the next iteration (backwards!).
+ for (Value *Op : I->operands()) {
+ // Check whether this operand is free due to being a constant or
+ // outside the loop.
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || !L->contains(OpI))
+ continue;
+
+ // Otherwise accumulate its cost.
+ CostWorklist.push_back(OpI);
+ }
+ } while (!CostWorklist.empty());
+
+ if (PHIUsedList.empty())
+ // We've exhausted the search.
+ break;
+
+ assert(Iteration > 0 &&
+ "Cannot track PHI-used values past the first iteration!");
+ CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+ PHIUsedList.clear();
+ }
+ };
+
// Ensure that we don't violate the loop structure invariants relied on by
// this analysis.
assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
@@ -502,7 +400,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
while (!SimplifiedInputValues.empty())
SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
- UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
BBWorklist.clear();
BBWorklist.insert(L->getHeader());
@@ -514,22 +412,32 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// it. We don't change the actual IR, just count optimization
// opportunities.
for (Instruction &I : *BB) {
- int InstCost = TTI.getUserCost(&I);
+ // Track this instruction's expected baseline cost when executing the
+ // rolled loop form.
+ RolledDynamicCost += TTI.getUserCost(&I);
// Visit the instruction to analyze its loop cost after unrolling,
- // and if the visitor returns false, include this instruction in the
- // unrolled cost.
- if (!Analyzer.visit(I))
- UnrolledCost += InstCost;
- else {
- DEBUG(dbgs() << " " << I
- << " would be simplified if loop is unrolled.\n");
- (void)0;
- }
+ // and if the visitor returns true, mark the instruction as free after
+ // unrolling and continue.
+ bool IsFree = Analyzer.visit(I);
+ bool Inserted = InstCostMap.insert({&I, (int)Iteration,
+ (unsigned)IsFree,
+ /*IsCounted*/ false}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+ if (IsFree)
+ continue;
- // Also track this instructions expected cost when executing the rolled
- // loop form.
- RolledDynamicCost += InstCost;
+ // If the instruction might have a side effect, recursively account for
+ // its cost and that of all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
+ // Can't properly model the cost of a call.
+ // FIXME: With a proper cost model we should be able to do it.
+ if (isa<CallInst>(&I))
+ return None;
// If unrolled body turns out to be too big, bail out.
if (UnrolledCost > MaxUnrolledLoopSize) {
@@ -545,42 +453,45 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
// Add in the live successors by first checking whether we have a terminator
// that may be simplified based on the values simplified by this call.
+ BasicBlock *KnownSucc = nullptr;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isConditional()) {
if (Constant *SimpleCond =
SimplifiedValues.lookup(BI->getCondition())) {
- BasicBlock *Succ = nullptr;
// Just take the first successor if condition is undef
if (isa<UndefValue>(SimpleCond))
- Succ = BI->getSuccessor(0);
- else
- Succ = BI->getSuccessor(
- cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
- if (L->contains(Succ))
- BBWorklist.insert(Succ);
- continue;
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (Constant *SimpleCond =
SimplifiedValues.lookup(SI->getCondition())) {
- BasicBlock *Succ = nullptr;
// Just take the first successor if condition is undef
if (isa<UndefValue>(SimpleCond))
- Succ = SI->getSuccessor(0);
- else
- Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond))
- .getCaseSuccessor();
- if (L->contains(Succ))
- BBWorklist.insert(Succ);
- continue;
+ KnownSucc = SI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
}
}
+ if (KnownSucc) {
+ if (L->contains(KnownSucc))
+ BBWorklist.insert(KnownSucc);
+ else
+ ExitWorklist.insert({BB, KnownSucc});
+ continue;
+ }
// Add BB's successors to the worklist.
for (BasicBlock *Succ : successors(BB))
if (L->contains(Succ))
BBWorklist.insert(Succ);
+ else
+ ExitWorklist.insert({BB, Succ});
+ AddCostRecursively(*TI, Iteration);
}
// If we found no optimization opportunities on the first iteration, we
@@ -591,6 +502,23 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
return None;
}
}
+
+ while (!ExitWorklist.empty()) {
+ BasicBlock *ExitingBB, *ExitBB;
+ std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+ for (Instruction &I : *ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (L->contains(OpI))
+ AddCostRecursively(*OpI, TripCount - 1);
+ }
+ }
+
DEBUG(dbgs() << "Analysis finished:\n"
<< "UnrolledCost: " << UnrolledCost << ", "
<< "RolledDynamicCost: " << RolledDynamicCost << "\n");
@@ -599,18 +527,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable,
+ bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
AssumptionCache *AC) {
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
CodeMetrics Metrics;
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues);
NumCalls = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
unsigned LoopSize = Metrics.NumInsts;
@@ -676,21 +604,22 @@ static unsigned UnrollCountPragmaValue(const Loop *L) {
// unrolling pass is run more than once (which it generally is).
static void SetLoopAlreadyUnrolled(Loop *L) {
MDNode *LoopID = L->getLoopID();
- if (!LoopID) return;
-
// First remove any existing loop unrolling metadata.
SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
MDs.push_back(nullptr);
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- bool IsUnrollMetadata = false;
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ bool IsUnrollMetadata = false;
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+ }
+ if (!IsUnrollMetadata)
+ MDs.push_back(LoopID->getOperand(i));
}
- if (!IsUnrollMetadata)
- MDs.push_back(LoopID->getOperand(i));
}
// Add unroll(disable) metadata to disable future unrolling.
@@ -737,9 +666,9 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold,
(int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <=
(int64_t)Threshold) {
DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the "
- "expected dynamic cost by " << PercentDynamicCostSaved
- << "% (threshold: " << PercentDynamicCostSavedThreshold
- << "%)\n"
+ "expected dynamic cost by "
+ << PercentDynamicCostSaved << "% (threshold: "
+ << PercentDynamicCostSavedThreshold << "%)\n"
<< " and the unrolled cost (" << UnrolledCost
<< ") is less than the max threshold ("
<< DynamicCostSavingsDiscount << ").\n");
@@ -758,82 +687,77 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold,
return false;
}
-static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo &TTI,
- AssumptionCache &AC, bool PreserveLCSSA,
- Optional<unsigned> ProvidedCount,
- Optional<unsigned> ProvidedThreshold,
- Optional<bool> ProvidedAllowPartial,
- Optional<bool> ProvidedRuntime) {
- BasicBlock *Header = L->getHeader();
- DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
- << "] Loop %" << Header->getName() << "\n");
+// Returns true if unroll count was set explicitly.
+// Calculates unroll count and writes it to UP.Count.
+static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
+ DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution *SE, unsigned TripCount,
+ unsigned TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+  // BEInsns represents the number of instructions optimized away when the
+  // "back edge" becomes a "fall through" in the unrolled loop.
+ // For now we count a conditional branch on a backedge and a comparison
+ // feeding it.
+ unsigned BEInsns = 2;
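+  // For example (numbers are illustrative only): with LoopSize = 10 and an
+  // unroll count of 4, the size estimate used below is
+  //   (10 - 2) * 4 + 2 = 34
+  // since the backedge branch and its compare appear once in the unrolled
+  // body rather than once per iteration.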
+ // Check for explicit Count.
+ // 1st priority is unroll count set by "unroll-count" option.
+ bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollCount;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold)
+ return true;
+ }
- if (HasUnrollDisablePragma(L)) {
- return false;
+ // 2nd priority is unroll count set by pragma.
+ unsigned PragmaCount = UnrollCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold)
+ return true;
}
bool PragmaFullUnroll = HasUnrollFullPragma(L);
- bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
- unsigned PragmaCount = UnrollCountPragmaValue(L);
- bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0;
-
- // Find trip count and trip multiple if count is not available
- unsigned TripCount = 0;
- unsigned TripMultiple = 1;
- // If there are multiple exiting blocks but one of them is the latch, use the
- // latch for the trip count estimation. Otherwise insist on a single exiting
- // block for the trip count estimation.
- BasicBlock *ExitingBlock = L->getLoopLatch();
- if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
- ExitingBlock = L->getExitingBlock();
- if (ExitingBlock) {
- TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ if (PragmaFullUnroll && TripCount != 0) {
+ UP.Count = TripCount;
+ if ((LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold)
+ return false;
}
- TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
- ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll,
- TripCount);
-
- unsigned Count = UP.Count;
- bool CountSetExplicitly = Count != 0;
- // Use a heuristic count if we didn't set anything explicitly.
- if (!CountSetExplicitly)
- Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount;
- if (TripCount && Count > TripCount)
- Count = TripCount;
+ bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
- unsigned NumInlineCandidates;
- bool notDuplicatable;
- unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
- DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ uint64_t UnrolledSize;
+ DebugLoc LoopLoc = L->getStartLoc();
+ Function *F = L->getHeader()->getParent();
+ LLVMContext &Ctx = F->getContext();
- // When computing the unrolled size, note that the conditional branch on the
- // backedge and the comparison feeding it are not replicated like the rest of
- // the loop body (which is why 2 is subtracted).
- uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
- if (notDuplicatable) {
- DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
- return false;
- }
- if (NumInlineCandidates != 0) {
- DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return false;
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaThreshold value
+ // which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
}
- // Given Count, TripCount and thresholds determine the type of
- // unrolling which is to be performed.
- enum { Full = 0, Partial = 1, Runtime = 2 };
- int Unrolling;
- if (TripCount && Count == TripCount) {
- Unrolling = Partial;
- // If the loop is really small, we don't need to run an expensive analysis.
+ // 3rd priority is full unroll count.
+  // Full unroll makes sense only when TripCount can be statically calculated.
+ // Also we need to check if we exceed FullUnrollMaxCount.
+ if (TripCount && TripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns;
if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
UnrolledSize, UnrolledSize)) {
- Unrolling = Full;
+ UP.Count = TripCount;
+ return ExplicitUnroll;
} else {
// The loop isn't that small, but we still can fully unroll it if that
// helps to remove a significant number of instructions.
@@ -845,99 +769,216 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
UP.PercentDynamicCostSavedThreshold,
UP.DynamicCostSavingsDiscount,
Cost->UnrolledCost, Cost->RolledDynamicCost)) {
- Unrolling = Full;
+ UP.Count = TripCount;
+ return ExplicitUnroll;
}
}
- } else if (TripCount && Count < TripCount) {
- Unrolling = Partial;
- } else {
- Unrolling = Runtime;
}
- // Reduce count based on the type of unrolling and the threshold values.
- unsigned OriginalCount = Count;
- bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime;
- // Don't unroll a runtime trip count loop with unroll full pragma.
- if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
- AllowRuntime = false;
- }
- if (Unrolling == Partial) {
- bool AllowPartial = PragmaEnableUnroll || UP.Partial;
- if (!AllowPartial && !CountSetExplicitly) {
+  // 4th priority is partial unrolling.
+  // Try partial unroll only when the TripCount can be statically calculated.
+ if (TripCount) {
+ if (UP.Count == 0)
+ UP.Count = TripCount;
+ UP.Partial |= ExplicitUnroll;
+ if (!UP.Partial) {
DEBUG(dbgs() << " will not try to unroll partially because "
<< "-unroll-allow-partial not given\n");
+ UP.Count = 0;
return false;
}
- if (UP.PartialThreshold != NoThreshold &&
- UnrolledSize > UP.PartialThreshold) {
+ if (UP.PartialThreshold != NoThreshold) {
// Reduce unroll count to be modulo of TripCount for partial unrolling.
- Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2);
- while (Count != 0 && TripCount % Count != 0)
- Count--;
- }
- } else if (Unrolling == Runtime) {
- if (!AllowRuntime && !CountSetExplicitly) {
- DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
- return false;
- }
- // Reduce unroll count to be the largest power-of-two factor of
- // the original count which satisfies the threshold limit.
- while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
- Count >>= 1;
- UnrolledSize = (LoopSize-2) * Count + 2;
+ UnrolledSize = (uint64_t)(LoopSize - BEInsns) * UP.Count + BEInsns;
+ if (UnrolledSize > UP.PartialThreshold)
+ UP.Count = (std::max(UP.PartialThreshold, 3u) - BEInsns) /
+ (LoopSize - BEInsns);
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ while (UP.Count != 0 && TripCount % UP.Count != 0)
+ UP.Count--;
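+      // For example (illustrative numbers only): with PartialThreshold = 150,
+      // LoopSize = 12, and TripCount = 20, the division above yields
+      // (150 - 2) / (12 - 2) = 14, and the loop above then lowers the count
+      // to 10, the largest value <= 14 that divides the trip count (assuming
+      // MaxCount does not clamp it first).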
+ if (UP.AllowRemainder && UP.Count <= 1) {
+        // If there is no Count that evenly divides TripCount, set Count to
+        // the largest power-of-two factor that satisfies the threshold limit.
+        // Since this creates a fixup (remainder) loop, only do this kind of
+        // unrolling if a remainder loop is allowed.
+ UP.Count = DefaultUnrollRuntimeCount;
+ UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
+ while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) {
+ UP.Count >>= 1;
+ UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
+ }
+ }
+ if (UP.Count < 2) {
+ if (PragmaEnableUnroll)
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "Unable to unroll loop as directed by unroll(enable) pragma "
+ "because unrolled size is too large.");
+ UP.Count = 0;
+ }
+ } else {
+ UP.Count = TripCount;
}
- if (Count > UP.MaxCount)
- Count = UP.MaxCount;
- DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
- }
-
- if (HasPragma) {
- if (PragmaCount != 0)
- // If loop has an unroll count pragma mark loop as unrolled to prevent
- // unrolling beyond that requested by the pragma.
- SetLoopAlreadyUnrolled(L);
-
- // Emit optimization remarks if we are unable to unroll the loop
- // as directed by a pragma.
- DebugLoc LoopLoc = L->getStartLoc();
- Function *F = Header->getParent();
- LLVMContext &Ctx = F->getContext();
- if ((PragmaCount > 0) && Count != OriginalCount) {
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to unroll loop the number of times directed by "
- "unroll_count pragma because unrolled size is too large.");
- } else if (PragmaFullUnroll && !TripCount) {
- emitOptimizationRemarkMissed(
- Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll(full) pragma "
- "because loop has a runtime trip count.");
- } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) {
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to unroll loop as directed by unroll(enable) pragma because "
+ "Unable to fully unroll loop as directed by unroll pragma because "
"unrolled size is too large.");
- } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
- Count != TripCount) {
+ return ExplicitUnroll;
+ }
+ assert(TripCount == 0 &&
+ "All cases when TripCount is constant should be covered here.");
+ if (PragmaFullUnroll)
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "Unable to fully unroll loop as directed by unroll(full) pragma "
+ "because loop has a runtime trip count.");
+
+ // 5th priority is runtime unrolling.
+ // Don't unroll a runtime trip count loop when it is disabled.
+ if (HasRuntimeUnrollDisablePragma(L)) {
+ UP.Count = 0;
+ return false;
+ }
+ // Reduce count based on the type of unrolling and the threshold values.
+ UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
+ if (!UP.Runtime) {
+ DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = DefaultUnrollRuntimeCount;
+ UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
+
+ // Reduce unroll count to be the largest power-of-two factor of
+ // the original count which satisfies the threshold limit.
+ while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) {
+ UP.Count >>= 1;
+ UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns;
+ }
+
+#ifndef NDEBUG
+ unsigned OrigCount = UP.Count;
+#endif
+
+ if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
+ while (UP.Count != 0 && TripMultiple % UP.Count != 0)
+ UP.Count >>= 1;
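+    // For instance, with TripMultiple = 12 and a current count of 8, the loop
+    // above reduces the count to 4, the largest power of two dividing 12.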
+    DEBUG(dbgs() << "Remainder loop is restricted (that could be architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from "
+ << OrigCount << " to " << UP.Count << ".\n");
+ if (PragmaCount > 0 && !UP.AllowRemainder)
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
- "Unable to fully unroll loop as directed by unroll pragma because "
- "unrolled size is too large.");
- }
+ Twine("Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+                "(that could be architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an unroll "
+ "count that divides the loop trip multiple of ") +
+ Twine(TripMultiple) + ". Unrolling instead " + Twine(UP.Count) +
+ " time(s).");
}
- if (Unrolling != Full && Count < 2) {
- // Partial unrolling by 1 is a nop. For full unrolling, a factor
- // of 1 makes sense because loop control can be eliminated.
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ if (UP.Count < 2)
+ UP.Count = 0;
+ return ExplicitUnroll;
+}
+
+static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, bool PreserveLCSSA,
+ Optional<unsigned> ProvidedCount,
+ Optional<unsigned> ProvidedThreshold,
+ Optional<bool> ProvidedAllowPartial,
+ Optional<bool> ProvidedRuntime) {
+ DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
+ << "] Loop %" << L->getHeader()->getName() << "\n");
+ if (HasUnrollDisablePragma(L)) {
return false;
}
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ unsigned LoopSize = ApproximateLoopSize(
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC);
+ DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (NotDuplicatable) {
+ DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return false;
+ }
+ if (NumInlineCandidates != 0) {
+ DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return false;
+ }
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
+ return false;
+ }
+
+ // Find trip count and trip multiple if count is not available
+ unsigned TripCount = 0;
+ unsigned TripMultiple = 1;
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+ ProvidedRuntime);
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
+  // operation. Therefore restrict the remainder loop (try unrolling without
+  // one).
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ if (Convergent)
+ UP.AllowRemainder = false;
+
+ bool IsCountSetExplicitly = computeUnrollCount(L, TTI, DT, LI, SE, TripCount,
+ TripMultiple, LoopSize, UP);
+ if (!UP.Count)
+ return false;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (TripCount && UP.Count > TripCount)
+ UP.Count = TripCount;
+
// Unroll the loop.
- if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount,
- TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA))
+ if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
+ UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC,
+ PreserveLCSSA))
return false;
+ // If loop has an unroll count pragma or unrolled by explicitly set count
+ // mark loop as unrolled to prevent unrolling beyond that requested.
+ if (IsCountSetExplicitly)
+ SetLoopAlreadyUnrolled(L);
return true;
}
@@ -948,8 +989,9 @@ public:
LoopUnroll(Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None)
- : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold),
- ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) {
+ : LoopPass(ID), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
@@ -959,7 +1001,7 @@ public:
Optional<bool> ProvidedRuntime;
bool runOnLoop(Loop *L, LPPassManager &) override {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
Function &F = *L->getHeader()->getParent();
@@ -982,35 +1024,19 @@ public:
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
- // If loop unroll does not preserve dom info then LCSSA pass on next
- // loop will receive invalid dom info.
- // For now, recreate dom info, if loop is unrolled.
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
+ // FIXME: Loop passes are required to preserve domtree, and for now we just
+ // recreate dom info if anything gets unrolled.
+ getLoopAnalysisUsage(AU);
}
};
}
char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 95d7f8a3beda..71980e85e8ca 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -55,6 +55,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <map>
#include <set>
@@ -64,6 +65,7 @@ using namespace llvm;
STATISTIC(NumBranches, "Number of branches unswitched");
STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards unswitched");
STATISTIC(NumSelects , "Number of selects unswitched");
STATISTIC(NumTrivial , "Number of unswitches that are trivial");
STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
@@ -187,6 +189,9 @@ namespace {
BasicBlock *loopHeader;
BasicBlock *loopPreheader;
+ bool SanitizeMemory;
+ LoopSafetyInfo SafetyInfo;
+
// LoopBlocks contains all of the basic blocks of the loop, including the
// preheader of the loop, the body of the loop, and the exit blocks of the
// loop, in that order.
@@ -211,17 +216,8 @@ namespace {
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
+ getLoopAnalysisUsage(AU);
}
private:
@@ -382,11 +378,9 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
char LoopUnswitch::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
@@ -396,7 +390,11 @@ Pass *llvm::createLoopUnswitchPass(bool Os) {
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ DenseMap<Value *, Value *> &Cache) {
+ auto CacheIt = Cache.find(Cond);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
  // We started analyzing a new instruction; increment the scanned instructions counter.
++TotalInsts;
@@ -411,8 +409,10 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
// TODO: Handle: br (VARIANT|INVARIANT).
// Hoist simple values out.
- if (L->makeLoopInvariant(Cond, Changed))
+ if (L->makeLoopInvariant(Cond, Changed)) {
+ Cache[Cond] = Cond;
return Cond;
+ }
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
if (BO->getOpcode() == Instruction::And ||
@@ -420,17 +420,29 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
// If either the left or right side is invariant, we can unswitch on this,
// which will cause the branch to go away in one loop and the condition to
// simplify in the other one.
- if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+ if (Value *LHS =
+ FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
+ Cache[Cond] = LHS;
return LHS;
- if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+ }
+ if (Value *RHS =
+ FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
+ Cache[Cond] = RHS;
return RHS;
+ }
}
+ Cache[Cond] = nullptr;
return nullptr;
}
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+ DenseMap<Value *, Value *> Cache;
+ return FindLIVLoopCondition(Cond, L, Changed, Cache);
+}
+
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
- if (skipOptnoneFunction(L))
+ if (skipLoop(L))
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
@@ -441,6 +453,10 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
currentLoop = L;
Function *F = currentLoop->getHeader()->getParent();
+ SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
+ if (SanitizeMemory)
+ computeLoopSafetyInfo(&SafetyInfo, L);
+
EnabledPGO = F->getEntryCount().hasValue();
if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
@@ -499,17 +515,34 @@ bool LoopUnswitch::processCurrentLoop() {
return true;
}
- // Do not unswitch loops containing convergent operations, as we might be
- // making them control dependent on the unswitch value when they were not
- // before.
- // FIXME: This could be refined to only bail if the convergent operation is
- // not already control-dependent on the unswitch value.
+ // Run through the instructions in the loop, keeping track of three things:
+ //
+ // - That we do not unswitch loops containing convergent operations, as we
+ // might be making them control dependent on the unswitch value when they
+ // were not before.
+ // FIXME: This could be refined to only bail if the convergent operation is
+ // not already control-dependent on the unswitch value.
+ //
+ // - That basic blocks in the loop contain invokes whose predecessor edges we
+ // cannot split.
+ //
+  // - The set of guard intrinsics encountered (these are non-terminator
+ // instructions that are also profitable to be unswitched).
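+  //
+  //   For example, a guard on a loop-invariant condition can be unswitched
+  //   into one loop version where the condition is known to hold (so the
+  //   guard is trivially satisfied) and one where it is known to fail.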
+
+ SmallVector<IntrinsicInst *, 4> Guards;
+
for (const auto BB : currentLoop->blocks()) {
for (auto &I : *BB) {
auto CS = CallSite(&I);
if (!CS) continue;
if (CS.hasFnAttr(Attribute::Convergent))
return false;
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ if (!II->getUnwindDest()->canSplitPredecessors())
+ return false;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
}
}
@@ -529,12 +562,36 @@ bool LoopUnswitch::processCurrentLoop() {
return false;
}
+ for (IntrinsicInst *Guard : Guards) {
+ Value *LoopCond =
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+ if (LoopCond &&
+ UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ // NB! Unswitching (if successful) could have erased some of the
+ // instructions in Guards leaving dangling pointers there. This is fine
+ // because we're returning now, and won't look at Guards again.
+ ++NumGuards;
+ return true;
+ }
+ }
+
// Loop over all of the basic blocks in the loop. If we find an interior
// block that is branching on a loop-invariant condition, we can unswitch this
// loop.
for (Loop::block_iterator I = currentLoop->block_begin(),
E = currentLoop->block_end(); I != E; ++I) {
TerminatorInst *TI = (*I)->getTerminator();
+
+ // Unswitching on a potentially uninitialized predicate is not
+ // MSan-friendly. Limit this to the cases when the original predicate is
+ // guaranteed to execute, to avoid creating a use-of-uninitialized-value
+ // in the code that did not have one.
+ // This is a workaround for the discrepancy between LLVM IR and MSan
+ // semantics. See PR28054 for more details.
+ if (SanitizeMemory &&
+ !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo))
+ continue;
+
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
// If this isn't branching on an invariant condition, we can't unswitch
// it.
@@ -628,8 +685,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
// Okay, everything after this looks good, check to make sure that this block
// doesn't include any side effects.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (I->mayHaveSideEffects())
+ for (Instruction &I : *BB)
+ if (I.mayHaveSideEffects())
return false;
return true;
@@ -679,8 +736,8 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
// Add all of the subloops to the new loop.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- CloneLoop(*I, &New, VM, LI, LPM);
+ for (Loop *I : *L)
+ CloneLoop(I, &New, VM, LI, LPM);
return &New;
}
@@ -1075,10 +1132,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// Rewrite the code to refer to itself.
for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
- for (BasicBlock::iterator I = NewBlocks[i]->begin(),
- E = NewBlocks[i]->end(); I != E; ++I)
- RemapInstruction(&*I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ for (Instruction &I : *NewBlocks[i])
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
// Rewrite the original preheader to select between versions of the loop.
BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
@@ -1180,9 +1236,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
Worklist.push_back(UI);
}
- for (std::vector<Instruction*>::iterator UI = Worklist.begin(),
- UE = Worklist.end(); UI != UE; ++UI)
- (*UI)->replaceUsesOfWith(LIC, Replacement);
+ for (Instruction *UI : Worklist)
+ UI->replaceUsesOfWith(LIC, Replacement);
SimplifyCode(Worklist, L);
return;
diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
new file mode 100644
index 000000000000..0ccf0af7165b
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -0,0 +1,571 @@
+//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When alias analysis is uncertain about the aliasing between any two accesses,
+// it will return MayAlias. This uncertainty from alias analysis restricts LICM
+// from proceeding further. In cases where alias analysis is uncertain we might
+// use loop versioning as an alternative.
+//
+// Loop Versioning will create a version of the loop with aggressive aliasing
+// assumptions in addition to the original with conservative (default) aliasing
+// assumptions. The version of the loop making aggressive aliasing assumptions
+// will have all the memory accesses marked as no-alias. These two versions of
+// loop will be preceded by a memory runtime check. This runtime check consists
+// of bound checks for all unique memory accessed in loop, and it ensures the
+// lack of memory aliasing. The result of the runtime check determines which of
+// the loop versions is executed: If the runtime check detects any memory
+// aliasing, then the original loop is executed. Otherwise, the version with
+// aggressive aliasing assumptions is used.
+//
+// Following are the top level steps:
+//
+// a) Perform LoopVersioningLICM's feasibility check.
+// b) If loop is a candidate for versioning then create a memory bound check,
+// by considering all the memory accesses in loop body.
+// c) Clone original loop and set all memory accesses as no-alias in new loop.
+// d) Set original loop & versioned loop as a branch target of the runtime check
+// result.
+//
+// It transforms loop as shown below:
+//
+// +----------------+
+// |Runtime Memcheck|
+// +----------------+
+// |
+// +----------+----------------+----------+
+// | |
+// +---------+----------+ +-----------+----------+
+// |Orig Loop Preheader | |Cloned Loop Preheader |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Body | |Cloned Loop Body |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Exit Block| |Cloned Loop Exit Block|
+// +--------------------+ +-----------+----------+
+// | |
+// +----------+--------------+-----------+
+// |
+// +-----+----+
+// |Join Block|
+// +----------+
+//
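+// As a motivating sketch (illustrative only), consider a loop such as:
+//
+//   for (i = 0; i < n; i++)
+//     a[i] = b[i] + *c;      // *c may alias a[i] according to alias analysis
+//
+// The load of *c cannot be hoisted as long as the store to a[i] may alias it.
+// In the versioned loop, guarded by the runtime check, the accesses are marked
+// no-alias and the load becomes hoistable by a later LICM run.
+//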
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "loop-versioning-licm"
+static const char* LICMVersioningMetaData =
+ "llvm.loop.licm_versioning.disable";
+
+using namespace llvm;
+
+/// Threshold for the minimum allowed percentage of possibly invariant
+/// instructions in a loop.
+static cl::opt<float>
+ LVInvarThreshold("licm-versioning-invariant-threshold",
+                     cl::desc("LoopVersioningLICM's minimum allowed percentage "
+ "of possible invariant instructions per loop"),
+ cl::init(25), cl::Hidden);
+
+/// Threshold for maximum allowed loop nest/depth
+static cl::opt<unsigned> LVLoopDepthThreshold(
+ "licm-versioning-max-depth-threshold",
+ cl::desc(
+ "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
+ cl::init(2), cl::Hidden);
+
+/// \brief Create MDNode for input string.
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+/// \brief Set input string into loop metadata by keeping other values intact.
+void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
+ unsigned V) {
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, retain it.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ MDs.push_back(Node);
+ }
+ }
+ // Add new metadata.
+ MDs.push_back(createStringMetadata(TheLoop, MDString, V));
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ TheLoop->setLoopID(NewLoopID);
+}
+
+namespace {
+struct LoopVersioningLICM : public LoopPass {
+ static char ID;
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ LoopVersioningLICM()
+ : LoopPass(ID), AA(nullptr), SE(nullptr), LI(nullptr), DT(nullptr),
+ TLI(nullptr), LAA(nullptr), LAI(nullptr), Changed(false),
+ Preheader(nullptr), CurLoop(nullptr), CurAST(nullptr),
+ LoopDepthThreshold(LVLoopDepthThreshold),
+ InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0),
+ InvariantCounter(0), IsReadOnlyLoop(true) {
+ initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
+ }
+
+ AliasAnalysis *AA; // Current AliasAnalysis information
+ ScalarEvolution *SE; // Current ScalarEvolution
+ LoopInfo *LI; // Current LoopInfo
+ DominatorTree *DT; // Dominator Tree for the current Loop.
+ TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
+ LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis
+ const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo
+
+ bool Changed; // Set to true when we change anything.
+ BasicBlock *Preheader; // The preheader block of the current loop.
+ Loop *CurLoop; // The current loop we are working on.
+ AliasSetTracker *CurAST; // AliasSet information for the current loop.
+ ValueToValueMap Strides;
+
+ unsigned LoopDepthThreshold; // Maximum loop nest threshold
+ float InvariantThreshold; // Minimum invariant threshold
+ unsigned LoadAndStoreCounter; // Counter to track num of load & store
+ unsigned InvariantCounter; // Counter to track num of invariant
+ bool IsReadOnlyLoop; // Read only loop marker.
+
+ bool isLegalForVersioning();
+ bool legalLoopStructure();
+ bool legalLoopInstructions();
+ bool legalLoopMemoryAccesses();
+ bool isLoopAlreadyVisited();
+ void setNoAliasToLoop(Loop *);
+ bool instructionSafeForVersioning(Instruction *);
+ const char *getPassName() const override { return "Loop Versioning"; }
+};
+}
+
+/// \brief Check the loop structure and confirm that it is suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopStructure() {
+ // Loop must have a preheader, if not return false.
+ if (!CurLoop->getLoopPreheader()) {
+ DEBUG(dbgs() << " loop preheader is missing\n");
+ return false;
+ }
+ // Loop should be innermost loop, if not return false.
+ if (CurLoop->getSubLoops().size()) {
+ DEBUG(dbgs() << " loop is not innermost\n");
+ return false;
+ }
+ // Loop should have a single backedge, if not return false.
+ if (CurLoop->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << " loop has multiple backedges\n");
+ return false;
+ }
+ // Loop must have a single exiting block, if not return false.
+ if (!CurLoop->getExitingBlock()) {
+    DEBUG(dbgs() << "    loop has multiple exiting blocks\n");
+ return false;
+ }
+  // We only handle bottom-tested loops, i.e. loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+ DEBUG(dbgs() << " loop is not bottom tested\n");
+ return false;
+ }
+ // Parallel loops must not have aliasing loop-invariant memory accesses.
+ // Hence we don't need to version anything in this case.
+ if (CurLoop->isAnnotatedParallel()) {
+ DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ return false;
+ }
+  // A loop depth greater than LoopDepthThreshold is not allowed.
+  if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    DEBUG(dbgs() << "    loop depth is greater than the threshold\n");
+ return false;
+ }
+ // Loop should have a dedicated exit block, if not return false.
+ if (!CurLoop->hasDedicatedExits()) {
+    DEBUG(dbgs() << "    loop does not have dedicated exit blocks\n");
+ return false;
+ }
+ // We need to be able to compute the loop trip count in order
+ // to generate the bound checks.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+ if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "    loop does not have a computable trip count\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check the memory accesses in the loop and confirm that they are
+/// suitable for LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+ bool HasMayAlias = false;
+ bool TypeSafety = false;
+ bool HasMod = false;
+ // Memory check:
+  // The transform phase will generate a versioned loop and also a runtime
+  // check to ensure the pointers are independent and don't alias.
+  // In the versioned variant of the loop, alias metadata asserts that all
+  // accesses are mutually independent.
+ //
+ // Pointers aliasing in alias domain are avoided because with multiple
+ // aliasing domains we may not be able to hoist potential loop invariant
+ // access out of the loop.
+ //
+ // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
+ // must alias set.
+ for (const auto &I : *CurAST) {
+ const AliasSet &AS = I;
+ // Skip Forward Alias Sets, as this should be ignored as part of
+ // the AliasSetTracker object.
+ if (AS.isForwardingAliasSet())
+ continue;
+    // With MustAlias, it's not worth adding a runtime bound check.
+ if (AS.isMustAlias())
+ return false;
+ Value *SomePtr = AS.begin()->getValue();
+ bool TypeCheck = true;
+ // Check for Mod & MayAlias
+ HasMayAlias |= AS.isMayAlias();
+ HasMod |= AS.isMod();
+ for (const auto &A : AS) {
+ Value *Ptr = A.getValue();
+      // The alias tracker should have pointers of the same data type.
+ TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+ }
+    // At least one alias tracker should have pointers of the same data type.
+ TypeSafety |= TypeCheck;
+ }
+  // Ensure that at least one alias set has pointers of the same type.
+ if (!TypeSafety) {
+ DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ return false;
+ }
+  // Ensure the loop body is not read-only.
+ if (!HasMod) {
+ DEBUG(dbgs() << " No memory modified in loop body\n");
+ return false;
+ }
+ // Make sure alias set has may alias case.
+  // Make sure at least one alias set has a may-alias case.
+  // If there is no memory-aliasing ambiguity, return false.
+ DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether an instruction is safe for loop versioning;
+/// returns true if it is safe, false otherwise.
+/// The following checks are performed:
+/// 1) All loads and stores in the loop body must be non-atomic and
+///    non-volatile.
+/// 2) Function calls must be safe, i.e. they must not access memory.
+/// 3) The loop body must not contain any instruction that may throw.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+ assert(I != nullptr && "Null instruction found!");
+ // Check function call safety
+ if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
+ DEBUG(dbgs() << " Unsafe call site found.\n");
+ return false;
+ }
+  // Avoid loops that may throw.
+ if (I->mayThrow()) {
+ DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ return false;
+ }
+  // If the current instruction is a load, make sure it is a simple load
+  // (non-atomic and non-volatile).
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld || !Ld->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple load.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = Ld->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+ }
+  // If the current instruction is a store, make sure it is a simple store
+  // (non-atomic and non-volatile).
+ else if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St || !St->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple store.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = St->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+
+ IsReadOnlyLoop = false;
+ }
+ return true;
+}
+
+/// \brief Check the loop instructions and confirm that they are suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+ // Resetting counters.
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ // Iterate over loop blocks and instructions of each block and check
+ // instruction safety.
+ for (auto *Block : CurLoop->getBlocks())
+ for (auto &Inst : *Block) {
+ // If instruction is unsafe just return false.
+ if (!instructionSafeForVersioning(&Inst))
+ return false;
+ }
+ // Get LoopAccessInfo from current loop.
+ LAI = &LAA->getInfo(CurLoop);
+ // Check LoopAccessInfo for need of runtime check.
+ if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+ DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ return false;
+ }
+  // Number of runtime-checks should be less than RuntimeMemoryCheckThreshold.
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ return false;
+ }
+ // Loop should have at least one invariant load or store instruction.
+ if (!InvariantCounter) {
+ DEBUG(dbgs() << " Invariant not found !!\n");
+ return false;
+ }
+ // Read only loop not allowed.
+ if (IsReadOnlyLoop) {
+ DEBUG(dbgs() << " Found a read-only loop!\n");
+ return false;
+ }
+  // Profitability check:
+  // Check that the invariant threshold is met.
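+  // For example, with the default InvariantThreshold of 25 (percent), a loop
+  // with 8 counted loads & stores needs at least 2 invariant ones to pass,
+  // since 2 * 100 >= 25 * 8.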
+ if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+ DEBUG(dbgs()
+          << "    Invariant loads & stores are less than the defined threshold\n");
+ DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
+ DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether the loop has already been visited, based on its
+/// metadata. Returns true if the loop was visited before, false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+ // Check LoopVersioningLICM metadata into loop
+ if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+ return true;
+ }
+ return false;
+}
+
+/// \brief Checks legality for LoopVersioningLICM by considering the following:
+/// a) loop structure legality, b) loop instruction legality,
+/// c) loop memory access legality.
+/// Returns true if legal, false otherwise.
+bool LoopVersioningLICM::isLegalForVersioning() {
+ DEBUG(dbgs() << "Loop: " << *CurLoop);
+ // Make sure not re-visiting same loop again.
+ if (isLoopAlreadyVisited()) {
+ DEBUG(
+ dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+ return false;
+ }
+  // Check loop structure legality.
+ if (!legalLoopStructure()) {
+ DEBUG(
+ dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop instruction legality.
+ if (!legalLoopInstructions()) {
+ DEBUG(dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop memory access legality.
+ if (!legalLoopMemoryAccesses()) {
+ DEBUG(dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+ // Loop versioning is feasible, return true.
+ DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ return true;
+}
+
+/// \brief Update loop with aggressive aliasing assumptions.
+/// It adds no-alias metadata to all pairs of memory operations, assuming the
+/// loop has no must-alias memory access pairs. Loops with must-aliasing
+/// memory accesses were already rejected by the LoopVersioningLICM legality
+/// checks.
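+/// Concretely, every memory access in the versioned loop is added to a single
+/// new alias scope and is simultaneously marked noalias against that same
+/// scope, so alias analysis concludes that any two such accesses do not alias.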
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+ // Get latch terminator instruction.
+ Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+ // Create alias scope domain.
+ MDBuilder MDB(I->getContext());
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+ StringRef Name = "LVAliasScope";
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ // Iterate over each instruction of loop.
+ // set no-alias for all load & store instructions.
+ for (auto *Block : CurLoop->getBlocks()) {
+ for (auto &Inst : *Block) {
+ // Only interested in instruction that may modify or read memory.
+ if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+ continue;
+ Scopes.push_back(NewScope);
+ NoAliases.push_back(NewScope);
+ // Set no-alias for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(Inst.getContext(), NoAliases)));
+ // set alias-scope for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Inst.getContext(), Scopes)));
+ }
+ }
+}
+
+bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+ Changed = false;
+ // Get Analysis information.
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ LAI = nullptr;
+ // Set Current Loop
+ CurLoop = L;
+ // Get the preheader block.
+ Preheader = L->getLoopPreheader();
+ // Initial allocation
+ CurAST = new AliasSetTracker(*AA);
+
+ // Loop over the body of this loop, construct AST.
+ for (auto *Block : L->getBlocks()) {
+ if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
+ CurAST->add(*Block); // Incorporate the specified basic block
+ }
+  // Check the feasibility of LoopVersioningLICM.
+  // If versioning is found to be feasible and beneficial, proceed;
+  // otherwise simply clean up and return.
+ if (isLegalForVersioning()) {
+ // Do loop versioning.
+ // Create memcheck for memory accessed inside loop.
+ // Clone original loop, and set blocks properly.
+ LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
+ LVer.versionLoop();
+ // Set Loop Versioning metaData for original loop.
+ addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+ // Set Loop Versioning metaData for version loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+ // Set "llvm.mem.parallel_loop_access" metaData to versioned loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(),
+ "llvm.mem.parallel_loop_access");
+ // Update version loop with aggressive aliasing assumption.
+ setNoAliasToLoop(LVer.getVersionedLoop());
+ Changed = true;
+ }
+ // Delete allocated memory.
+ delete CurAST;
+ return Changed;
+}
+
+char LoopVersioningLICM::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+
+Pass *llvm::createLoopVersioningLICMPass() { return new LoopVersioningLICM(); }
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 41511bcb7b04..08e60b16bedf 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -12,11 +12,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "loweratomic"
@@ -100,49 +101,74 @@ static bool LowerFenceInst(FenceInst *FI) {
}
static bool LowerLoadInst(LoadInst *LI) {
- LI->setAtomic(NotAtomic);
+ LI->setAtomic(AtomicOrdering::NotAtomic);
return true;
}
static bool LowerStoreInst(StoreInst *SI) {
- SI->setAtomic(NotAtomic);
+ SI->setAtomic(AtomicOrdering::NotAtomic);
return true;
}
-namespace {
- struct LowerAtomic : public BasicBlockPass {
- static char ID;
- LowerAtomic() : BasicBlockPass(ID) {
- initializeLowerAtomicPass(*PassRegistry::getPassRegistry());
- }
- bool runOnBasicBlock(BasicBlock &BB) override {
- if (skipOptnoneFunction(BB))
- return false;
- bool Changed = false;
- for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
- Instruction *Inst = &*DI++;
- if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
- Changed |= LowerFenceInst(FI);
- else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
- Changed |= LowerAtomicCmpXchgInst(CXI);
- else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
- Changed |= LowerAtomicRMWInst(RMWI);
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- if (LI->isAtomic())
- LowerLoadInst(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->isAtomic())
- LowerStoreInst(SI);
- }
- }
- return Changed;
+static bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE;) {
+ Instruction *Inst = &*DI++;
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ if (LI->isAtomic())
+ LowerLoadInst(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->isAtomic())
+ LowerStoreInst(SI);
}
+ }
+ return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ Changed |= runOnBasicBlock(BB);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+ if (lowerAtomics(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerAtomicLegacyPass() : FunctionPass(ID) {
+ initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ LowerAtomicPass Impl;
};
}
-char LowerAtomic::ID = 0;
-INITIALIZE_PASS(LowerAtomic, "loweratomic",
- "Lower atomic intrinsics to non-atomic form",
- false, false)
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+ "Lower atomic intrinsics to non-atomic form", false, false)
-Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); }
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 2ace902a7a1b..79f0db1163a4 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -34,12 +34,24 @@ using namespace llvm;
STATISTIC(ExpectIntrinsicsHandled,
"Number of 'expect' intrinsic instructions handled");
-static cl::opt<uint32_t>
-LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
- cl::desc("Weight of the branch likely to be taken (default = 64)"));
-static cl::opt<uint32_t>
-UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
- cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() was made explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it stands,
+// programmers may be using __builtin_expect() / llvm.expect to annotate that a
+// branch is likely or unlikely to be taken.
+//
+// There is a known dependency on this ratio in CodeGenPrepare when transforming
+// 'select' instructions. It may be worthwhile to hoist these values to some
+// shared space, so they can be used directly by other passes.
+
+static cl::opt<uint32_t> LikelyBranchWeight(
+ "likely-branch-weight", cl::Hidden, cl::init(2000),
+ cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+static cl::opt<uint32_t> UnlikelyBranchWeight(
+ "unlikely-branch-weight", cl::Hidden, cl::init(1),
+ cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
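+
+// For a rough sense of scale: with these defaults, a branch annotated as
+// likely gets branch weights of 2000 (taken) vs. 1 (not taken), which later
+// passes interpret as roughly a 2000 / 2001, i.e. ~99.95%, probability of
+// being taken.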
static bool handleSwitchExpect(SwitchInst &SI) {
CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
@@ -158,7 +170,8 @@ static bool lowerExpectIntrinsic(Function &F) {
return Changed;
}
-PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) {
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &) {
if (lowerExpectIntrinsic(F))
return PreservedAnalyses::none();
diff --git a/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
new file mode 100644
index 000000000000..57491007d014
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -0,0 +1,123 @@
+//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
+// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
+// be widened.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<uint32_t> PredicatePassBranchWeight(
+ "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
+ cl::desc("The probability of a guard failing is assumed to be the "
+ "reciprocal of this value (default = 1 << 20)"));
+
+namespace {
+struct LowerGuardIntrinsic : public FunctionPass {
+ static char ID;
+ LowerGuardIntrinsic() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static void MakeGuardControlFlowExplicit(Function *DeoptIntrinsic,
+ CallInst *CI) {
+ OperandBundleDef DeoptOB(*CI->getOperandBundle(LLVMContext::OB_deopt));
+ SmallVector<Value *, 4> Args(std::next(CI->arg_begin()), CI->arg_end());
+
+ auto *CheckBB = CI->getParent();
+ auto *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(CI->getArgOperand(0), CI, true);
+
+ auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ CheckBI->getSuccessor(0)->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+
+ if (auto *MD = CI->getMetadata(LLVMContext::MD_make_implicit))
+ CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
+
+ MDBuilder MDB(CI->getContext());
+ CheckBI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(PredicatePassBranchWeight, 1));
+
+ IRBuilder<> B(DeoptBlockTerm);
+ auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
+
+ if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
+ B.CreateRetVoid();
+ } else {
+ DeoptCall->setName("deoptcall");
+ B.CreateRet(DeoptCall);
+ }
+
+ DeoptCall->setCallingConv(CI->getCallingConv());
+ DeoptBlockTerm->eraseFromParent();
+}
+
+bool LowerGuardIntrinsic::runOnFunction(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_guard)
+ ToLower.push_back(CI);
+
+ if (ToLower.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *CI : ToLower) {
+ MakeGuardControlFlowExplicit(DeoptIntrinsic, CI);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
+
+char LowerGuardIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsic, "lower-guard-intrinsic",
+ "Lower the guard intrinsic to normal control flow", false,
+ false)
+
+Pass *llvm::createLowerGuardIntrinsicPass() {
+ return new LowerGuardIntrinsic();
+}
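
For reference, the control flow produced by this lowering is a conditional branch whose likely side continues into the guarded code and whose unlikely side calls the deoptimize intrinsic and returns, weighted (1 << 20):1 toward the guarded path. A hedged, LLVM-free sketch of that shape in plain C++; names are illustrative and __builtin_expect stands in for the !prof branch weights:

#include <cstdio>
#include <cstdlib>

[[noreturn]] static void deoptimize() {
  // Stand-in for the call to @llvm.experimental.deoptimize followed by a
  // return from the function.
  std::puts("deopt");
  std::exit(1);
}

static int guardedBody(int X) {
  bool Cond = (X >= 0);          // the guard condition
  if (__builtin_expect(Cond, 1)) // "guarded" successor, heavily favored
    return X + 1;                // normal fast path
  deoptimize();                  // "deopt" successor
}

int main() { return guardedBody(41) == 42 ? 0 : 1; }
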
diff --git a/lib/Transforms/Scalar/Makefile b/lib/Transforms/Scalar/Makefile
deleted file mode 100644
index cc42fd00ac7d..000000000000
--- a/lib/Transforms/Scalar/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/Scalar/Makefile ----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMScalarOpts
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 6b43b0f7a2ad..d64c658f8436 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -12,22 +12,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -184,7 +178,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// size. If so, check to see whether we will end up actually reducing the
// number of stores used.
unsigned Bytes = unsigned(End-Start);
- unsigned MaxIntSize = DL.getLargestLegalIntTypeSize();
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
if (MaxIntSize == 0)
MaxIntSize = 1;
unsigned NumPointerStores = Bytes / MaxIntSize;
@@ -301,19 +295,16 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
}
//===----------------------------------------------------------------------===//
-// MemCpyOpt Pass
+// MemCpyOptLegacyPass Pass
//===----------------------------------------------------------------------===//
namespace {
- class MemCpyOpt : public FunctionPass {
- MemoryDependenceAnalysis *MD;
- TargetLibraryInfo *TLI;
+ class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
public:
static char ID; // Pass identification, replacement for typeid
- MemCpyOpt() : FunctionPass(ID) {
- initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
- MD = nullptr;
- TLI = nullptr;
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -324,11 +315,11 @@ namespace {
AU.setPreservesCFG();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceAnalysis>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
}
// Helper functions
@@ -348,29 +339,30 @@ namespace {
bool iterateOnFunction(Function &F);
};
- char MemCpyOpt::ID = 0;
+ char MemCpyOptLegacyPass::ID = 0;
}
/// The public interface to this file...
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
-INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
false, false)
/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
/// together into a memcpy/memset.
-Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
- Value *StartPtr, Value *ByteVal) {
+Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr,
+ Value *ByteVal) {
const DataLayout &DL = StartInst->getModule()->getDataLayout();
// Okay, so we now have a single store that can be splatable. Scan to find
@@ -493,7 +485,93 @@ static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
return std::min(StoreAlign, LoadAlign);
}
-bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+// This method tries to lift a store instruction before position P.
+// It will lift the store and its operands, plus anything else that
+// may alias with them.
+// The method returns true if it was successful.
+static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) {
+ // If the store aliases this position, bail out early.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef)
+ return false;
+
+ // Keep track of the arguments of all instructions we plan to lift
+ // so we can make sure to lift them as well if appropriate.
+ DenseSet<Instruction*> Args;
+ if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
+ if (Ptr->getParent() == SI->getParent())
+ Args.insert(Ptr);
+
+ // Instructions to lift before P.
+ SmallVector<Instruction*, 8> ToLift;
+
+ // Memory locations of lifted instructions.
+ SmallVector<MemoryLocation, 8> MemLocs;
+ MemLocs.push_back(StoreLoc);
+
+ // Lifted callsites.
+ SmallVector<ImmutableCallSite, 8> CallSites;
+
+ for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
+ auto *C = &*I;
+
+ bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef;
+
+ bool NeedLift = false;
+ if (Args.erase(C))
+ NeedLift = true;
+ else if (MayAlias) {
+ NeedLift = std::any_of(MemLocs.begin(), MemLocs.end(),
+ [C, &AA](const MemoryLocation &ML) {
+ return AA.getModRefInfo(C, ML);
+ });
+
+ if (!NeedLift)
+ NeedLift = std::any_of(CallSites.begin(), CallSites.end(),
+ [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
+ }
+
+ if (!NeedLift)
+ continue;
+
+ if (MayAlias) {
+ if (auto CS = ImmutableCallSite(C)) {
+ // If we can't lift this before P, it's game over.
+ if (AA.getModRefInfo(P, CS) != MRI_NoModRef)
+ return false;
+
+ CallSites.push_back(CS);
+ } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
+ // If we can't lift this before P, it's game over.
+ auto ML = MemoryLocation::get(C);
+ if (AA.getModRefInfo(P, ML) != MRI_NoModRef)
+ return false;
+
+ MemLocs.push_back(ML);
+ } else
+ // We don't know how to lift this instruction.
+ return false;
+ }
+
+ ToLift.push_back(C);
+ for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
+ if (A->getParent() == SI->getParent())
+ Args.insert(A);
+ }
+
+ // We made it; now lift everything we collected before P.
+ for (auto *I : reverse(ToLift)) {
+ DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ I->moveBefore(P);
+ }
+
+ return true;
+}
+
+bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!SI->isSimple()) return false;
// Avoid merging nontemporal stores since the resulting
@@ -514,7 +592,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
auto *T = LI->getType();
if (T->isAggregateType()) {
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ AliasAnalysis &AA = LookupAliasAnalysis();
MemoryLocation LoadLoc = MemoryLocation::get(LI);
// We use alias analysis to check if an instruction may store to
@@ -522,26 +600,20 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// such an instruction is found, we try to promote there instead
// of at the store position.
Instruction *P = SI;
- for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
- I != E; ++I) {
- if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod))
- continue;
-
- // We found an instruction that may write to the loaded memory.
- // We can try to promote at this position instead of the store
- // position if nothing alias the store memory after this and the store
- // destination is not in the range.
- P = &*I;
- for (; I != E; ++I) {
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (&*I == SI->getOperand(1) ||
- AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
- P = nullptr;
- break;
- }
+ for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+ if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) {
+ P = &I;
+ break;
}
+ }
- break;
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+ // position if nothing aliases the store memory after this and the store
+ // destination is not in the range.
+ if (P && P != SI) {
+ if (!moveUp(AA, SI, P))
+ P = nullptr;
}
// If a valid insertion position is found, then we can promote
@@ -594,7 +666,9 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (C) {
// Check that nothing touches the dest of the "copy" between
// the call and the store.
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
+ bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
+ AliasAnalysis &AA = LookupAliasAnalysis();
MemoryLocation StoreLoc = MemoryLocation::get(SI);
for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
I != E; --I) {
@@ -602,6 +676,12 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
C = nullptr;
break;
}
+ // The store to dest may never happen if an exception can be thrown
+ // between the load and the store.
+ if (I->mayThrow() && !CpyDestIsLocal) {
+ C = nullptr;
+ break;
+ }
}
}
@@ -665,7 +745,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
return false;
}
-bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
// See if there is another memset or store neighboring this memset which
// allows us to widen out the memset to do a single larger store.
if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
@@ -681,10 +761,9 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
-bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
- Value *cpyDest, Value *cpySrc,
- uint64_t cpyLen, unsigned cpyAlign,
- CallInst *C) {
+bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
+ Value *cpySrc, uint64_t cpyLen,
+ unsigned cpyAlign, CallInst *C) {
// The general transformation to keep in mind is
//
// call @func(..., src, ...)
@@ -699,6 +778,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// src only holds uninitialized values at the moment of the call, meaning that
// the memcpy can be discarded rather than moved.
+ // Lifetime marks shouldn't be operated on.
+ if (Function *F = C->getCalledFunction())
+ if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+ return false;
+
// Deliberately get the source and destination with bitcasts stripped away,
// because we'll need to do type comparisons based on the underlying type.
CallSite CS(C);
@@ -734,6 +818,10 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (destSize < srcSize)
return false;
} else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+ // The store to dest may never happen if the call can throw.
+ if (C->mayThrow())
+ return false;
+
if (A->getDereferenceableBytes() < srcSize) {
// If the destination is an sret parameter then only accesses that are
// outside of the returned struct type can trap.
@@ -805,7 +893,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// Since we're changing the parameter to the callsite, we need to make sure
// that what would be the new parameter dominates the callsite.
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DominatorTree &DT = LookupDomTree();
if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
if (!DT.dominates(cpyDestInst, C))
return false;
@@ -814,7 +902,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// unexpected manner, for example via a global, which we deduce from
// the use analysis, we also need to know that it does not sneakily
// access dest. We rely on AA to figure this out for us.
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ AliasAnalysis &AA = LookupAliasAnalysis();
ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
// If necessary, perform additional analysis.
if (MR != MRI_NoModRef)
@@ -867,7 +955,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
+bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
+ MemCpyInst *MDep) {
// We can only transform memcpy's where the dest of one is the source of the
// other.
if (M->getSource() != MDep->getDest() || MDep->isVolatile())
@@ -888,7 +977,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
return false;
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ AliasAnalysis &AA = LookupAliasAnalysis();
// Verify that the copied-from memory doesn't change in between the two
// transfers. For example, in:
@@ -954,8 +1043,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
/// memcpy(dst, src, src_size);
/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
/// \endcode
-bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
+bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
// We can only transform memset/memcpy with the same destination.
if (MemSet->getDest() != MemCpy->getDest())
return false;
@@ -1019,8 +1108,8 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
/// When dst2_size <= dst1_size.
///
/// The \p MemCpy must have a Constant length.
-bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
// This only makes sense on memcpy(..., memset(...), ...).
if (MemSet->getRawDest() != MemCpy->getRawSource())
return false;
@@ -1043,7 +1132,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances). This allows later passes to remove the first memcpy
/// altogether.
-bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
// We can only optimize non-volatile memcpy's.
if (M->isVolatile()) return false;
@@ -1141,8 +1230,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
-bool MemCpyOpt::processMemMove(MemMoveInst *M) {
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
if (!TLI->has(LibFunc::memmove))
return false;
@@ -1152,7 +1241,8 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
MemoryLocation::getForSource(M)))
return false;
- DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+ DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
// If not, then we know we can transform this.
Type *ArgTys[3] = { M->getRawDest()->getType(),
@@ -1170,7 +1260,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
}
/// This is called on every byval argument in call sites.
-bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
// Find out what feeds this byval argument.
Value *ByValArg = CS.getArgument(ArgNo);
@@ -1202,10 +1292,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *CS->getParent()->getParent());
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC = LookupAssumptionCache();
+ DominatorTree &DT = LookupDomTree();
if (MDep->getAlignment() < ByValAlign &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
CS.getInstruction(), &AC, &DT) < ByValAlign)
@@ -1231,7 +1319,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
"tmpcast", CS.getInstruction());
- DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+ DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
<< " " << *MDep << "\n"
<< " " << *CS.getInstruction() << "\n");
@@ -1241,13 +1329,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
return true;
}
-/// Executes one iteration of MemCpyOpt.
-bool MemCpyOpt::iterateOnFunction(Function &F) {
+/// Executes one iteration of MemCpyOptPass.
+bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool MadeChange = false;
// Walk all instruction in the function.
- for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
// Avoid invalidating the iterator.
Instruction *I = &*BI++;
@@ -1269,7 +1357,8 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
// Reprocess the instruction if desired.
if (RepeatInstruction) {
- if (BI != BB->begin()) --BI;
+ if (BI != BB.begin())
+ --BI;
MadeChange = true;
}
}
@@ -1278,14 +1367,42 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
return MadeChange;
}
-/// This is the main transformation entry point for a function.
-bool MemCpyOpt::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
+PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto LookupAliasAnalysis = [&]() -> AliasAnalysis & {
+ return AM.getResult<AAManager>(F);
+ };
+ auto LookupAssumptionCache = [&]() -> AssumptionCache & {
+ return AM.getResult<AssumptionAnalysis>(F);
+ };
+ auto LookupDomTree = [&]() -> DominatorTree & {
+ return AM.getResult<DominatorTreeAnalysis>(F);
+ };
+
+ bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis,
+ LookupAssumptionCache, LookupDomTree);
+ if (!MadeChange)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+bool MemCpyOptPass::runImpl(
+ Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_,
+ std::function<AliasAnalysis &()> LookupAliasAnalysis_,
+ std::function<AssumptionCache &()> LookupAssumptionCache_,
+ std::function<DominatorTree &()> LookupDomTree_) {
bool MadeChange = false;
- MD = &getAnalysis<MemoryDependenceAnalysis>();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ MD = MD_;
+ TLI = TLI_;
+ LookupAliasAnalysis = std::move(LookupAliasAnalysis_);
+ LookupAssumptionCache = std::move(LookupAssumptionCache_);
+ LookupDomTree = std::move(LookupDomTree_);
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
@@ -1302,3 +1419,25 @@ bool MemCpyOpt::runOnFunction(Function &F) {
MD = nullptr;
return MadeChange;
}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
+ return getAnalysis<AAResultsWrapperPass>().getAAResults();
+ };
+ auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & {
+ return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ };
+ auto LookupDomTree = [this]() -> DominatorTree & {
+ return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ };
+
+ return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache,
+ LookupDomTree);
+}
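
One small but easy-to-miss change above is in MemsetRanges::isProfitableToUseMemset(): the largest legal integer type is now queried in bits and converted to bytes before estimating the store count. A standalone check of that arithmetic (illustrative only, not LLVM code; the helper name is mine):

#include <cstdio>

static unsigned estimateStores(unsigned Bytes, unsigned LargestLegalIntBits) {
  unsigned MaxIntSize = LargestLegalIntBits / 8; // bits -> bytes
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  // Mirrors "unsigned NumPointerStores = Bytes / MaxIntSize;" in the hunk above.
  return Bytes / MaxIntSize;
}

int main() {
  // 32 bytes covered with 64-bit (8-byte) stores -> 4 stores.
  std::printf("%u\n", estimateStores(32, 64));
  // A target with no legal integer type falls back to byte-sized stores.
  std::printf("%u\n", estimateStores(32, 0));
}
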
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index c812d618c16a..30261b755001 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -72,9 +72,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
@@ -82,51 +80,37 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <vector>
using namespace llvm;
#define DEBUG_TYPE "mldst-motion"
+namespace {
//===----------------------------------------------------------------------===//
// MergedLoadStoreMotion Pass
//===----------------------------------------------------------------------===//
+class MergedLoadStoreMotion {
+ MemoryDependenceResults *MD = nullptr;
+ AliasAnalysis *AA = nullptr;
-namespace {
-class MergedLoadStoreMotion : public FunctionPass {
- AliasAnalysis *AA;
- MemoryDependenceAnalysis *MD;
+ // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+ // where Size0 and Size1 are the #instructions on the two sides of
+ // the diamond. The constant chosen here is arbitrary. Compiler Time
+ // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+ const int MagicCompileTimeControl = 250;
public:
- static char ID; // Pass identification, replacement for typeid
- MergedLoadStoreMotion()
- : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) {
- initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
+ bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceAnalysis>();
- }
-
- // Helper routines
-
///
/// \brief Remove instruction from parent and update memory dependence
/// analysis.
@@ -135,9 +119,9 @@ private:
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
// Routines for hoisting loads
- bool isLoadHoistBarrierInRange(const Instruction& Start,
- const Instruction& End,
- LoadInst* LI);
+ bool isLoadHoistBarrierInRange(const Instruction &Start,
+ const Instruction &End, LoadInst *LI,
+ bool SafeToLoadUnconditionally);
LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
Instruction *ElseInst);
@@ -151,31 +135,8 @@ private:
const Instruction &End, MemoryLocation Loc);
bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
- // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
- // where Size0 and Size1 are the #instructions on the two sides of
- // the diamond. The constant chosen here is arbitrary. Compiler Time
- // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
- const int MagicCompileTimeControl;
};
-
-char MergedLoadStoreMotion::ID = 0;
-} // anonymous namespace
-
-///
-/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
-///
-FunctionPass *llvm::createMergedLoadStoreMotionPass() {
- return new MergedLoadStoreMotion();
-}
-
-INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
+} // end anonymous namespace
///
/// \brief Remove instruction from parent and update memory dependence analysis.
@@ -184,9 +145,9 @@ void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
// Notify the memory dependence analysis.
if (MD) {
MD->removeInstruction(Inst);
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
MD->invalidateCachedPointerInfo(LI->getPointerOperand());
- if (Inst->getType()->getScalarType()->isPointerTy()) {
+ if (Inst->getType()->isPtrOrPtrVectorTy()) {
MD->invalidateCachedPointerInfo(Inst);
}
}
@@ -198,10 +159,7 @@ void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
///
BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
- BranchInst *BI = (BranchInst *)(BB->getTerminator());
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
- return Tail;
+ return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
}
///
@@ -210,25 +168,22 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
if (!BB)
return false;
- if (!isa<BranchInst>(BB->getTerminator()))
- return false;
- if (BB->getTerminator()->getNumSuccessors() != 2)
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
return false;
- BranchInst *BI = (BranchInst *)(BB->getTerminator());
BasicBlock *Succ0 = BI->getSuccessor(0);
BasicBlock *Succ1 = BI->getSuccessor(1);
- if (!Succ0->getSinglePredecessor() ||
- Succ0->getTerminator()->getNumSuccessors() != 1)
+ if (!Succ0->getSinglePredecessor())
return false;
- if (!Succ1->getSinglePredecessor() ||
- Succ1->getTerminator()->getNumSuccessors() != 1)
+ if (!Succ1->getSinglePredecessor())
return false;
- BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
+ BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
+ BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
// Ignore triangles.
- if (Succ1->getTerminator()->getSuccessor(0) != Tail)
+ if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
return false;
return true;
}
@@ -240,9 +195,14 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
/// being loaded or protect against the load from happening
/// it is considered a hoist barrier.
///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
- const Instruction& End,
- LoadInst* LI) {
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
+ const Instruction &Start, const Instruction &End, LoadInst *LI,
+ bool SafeToLoadUnconditionally) {
+ if (!SafeToLoadUnconditionally)
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (!isGuaranteedToTransferExecutionToSuccessor(&Inst))
+ return true;
MemoryLocation Loc = MemoryLocation::get(LI);
return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
}
@@ -256,23 +216,28 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
///
LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
LoadInst *Load0) {
-
+ BasicBlock *BB0 = Load0->getParent();
+ BasicBlock *Head = BB0->getSinglePredecessor();
+ bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally(
+ Load0->getPointerOperand(), Load0->getAlignment(),
+ Load0->getModule()->getDataLayout(),
+ /*ScanFrom=*/Head->getTerminator());
for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
++BBI) {
Instruction *Inst = &*BBI;
      // Only merge and hoist loads when their result is used only in BB
- if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1))
+ auto *Load1 = dyn_cast<LoadInst>(Inst);
+ if (!Load1 || Inst->isUsedOutsideOfBlock(BB1))
continue;
- LoadInst *Load1 = dyn_cast<LoadInst>(Inst);
- BasicBlock *BB0 = Load0->getParent();
-
MemoryLocation Loc0 = MemoryLocation::get(Load0);
MemoryLocation Loc1 = MemoryLocation::get(Load1);
- if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) &&
- !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) &&
- !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) {
+ if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) &&
+ !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1,
+ SafeToLoadUnconditionally) &&
+ !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0,
+ SafeToLoadUnconditionally)) {
return Load1;
}
}
@@ -319,11 +284,10 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
///
bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
BasicBlock *Parent = I->getParent();
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i));
- if (Instr && Instr->getParent() == Parent)
- return false;
- }
+ for (Use &U : I->operands())
+ if (auto *Instr = dyn_cast<Instruction>(&U))
+ if (Instr->getParent() == Parent)
+ return false;
return true;
}
@@ -333,8 +297,8 @@ bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
LoadInst *L1) {
// Only one definition?
- Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
- Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
+ auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
@@ -345,8 +309,8 @@ bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
hoistInstruction(BB, A0, A1);
hoistInstruction(BB, L0, L1);
return true;
- } else
- return false;
+ }
+ return false;
}
///
@@ -358,7 +322,7 @@ bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
bool MergedLoads = false;
assert(isDiamondHead(BB));
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
BasicBlock *Succ0 = BI->getSuccessor(0);
BasicBlock *Succ1 = BI->getSuccessor(1);
// #Instructions in Succ1 for Compile Time Control
@@ -369,8 +333,8 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
Instruction *I = &*BBI;
++BBI;
- // Only move non-simple (atomic, volatile) loads.
- LoadInst *L0 = dyn_cast<LoadInst>(I);
+ // Don't move non-simple (atomic, volatile) loads.
+ auto *L0 = dyn_cast<LoadInst>(I);
if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
continue;
@@ -399,6 +363,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
const Instruction &End,
MemoryLocation Loc) {
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (Inst.mayThrow())
+ return true;
return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);
}
@@ -411,22 +379,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
StoreInst *Store0) {
DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
BasicBlock *BB0 = Store0->getParent();
- for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
- RBI != RBE; ++RBI) {
- Instruction *Inst = &*RBI;
-
- if (!isa<StoreInst>(Inst))
- continue;
-
- StoreInst *Store1 = cast<StoreInst>(Inst);
+ for (Instruction &Inst : reverse(*BB1)) {
+ auto *Store1 = dyn_cast<StoreInst>(&Inst);
+ if (!Store1)
+ continue;
MemoryLocation Loc0 = MemoryLocation::get(Store0);
MemoryLocation Loc1 = MemoryLocation::get(Store1);
if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
- !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
- BB1->back(), Loc1) &&
- !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
- BB0->back(), Loc0)) {
+ !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
return Store1;
}
}
@@ -439,17 +401,17 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
StoreInst *S1) {
// Create a phi if the values mismatch.
- PHINode *NewPN = nullptr;
Value *Opd1 = S0->getValueOperand();
Value *Opd2 = S1->getValueOperand();
- if (Opd1 != Opd2) {
- NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
- &BB->front());
- NewPN->addIncoming(Opd1, S0->getParent());
- NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->getScalarType()->isPointerTy())
- MD->invalidateCachedPointerInfo(NewPN);
- }
+ if (Opd1 == Opd2)
+ return nullptr;
+
+ auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+ &BB->front());
+ NewPN->addIncoming(Opd1, S0->getParent());
+ NewPN->addIncoming(Opd2, S1->getParent());
+ if (MD && NewPN->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
@@ -461,8 +423,8 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
StoreInst *S1) {
// Only one definition?
- Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
- Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
(A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
(A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
@@ -476,7 +438,7 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
S0->dropUnknownNonDebugMetadata();
// Create the new store to be inserted at the join point.
- StoreInst *SNew = (StoreInst *)(S0->clone());
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
Instruction *ANew = A0->clone();
SNew->insertBefore(&*InsertPt);
ANew->insertBefore(SNew);
@@ -484,9 +446,8 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
assert(S0->getParent() == A0->getParent());
assert(S1->getParent() == A1->getParent());
- PHINode *NewPN = getPHIOperand(BB, S0, S1);
// New PHI operand? Use it.
- if (NewPN)
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
SNew->setOperand(0, NewPN);
removeInstruction(S0);
removeInstruction(S1);
@@ -532,11 +493,9 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
Instruction *I = &*RBI;
++RBI;
- // Sink move non-simple (atomic, volatile) stores
- if (!isa<StoreInst>(I))
- continue;
- StoreInst *S0 = (StoreInst *)I;
- if (!S0->isSimple())
+ // Don't sink non-simple (atomic, volatile) stores.
+ auto *S0 = dyn_cast<StoreInst>(I);
+ if (!S0 || !S0->isSimple())
continue;
++NStores;
@@ -551,22 +510,18 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
// is likely stale at this point.
if (!Res)
break;
- else {
- RBI = Pred0->rbegin();
- RBE = Pred0->rend();
- DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
- }
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
}
}
return MergedStores;
}
-///
-/// \brief Run the transformation for each function
-///
-bool MergedLoadStoreMotion::runOnFunction(Function &F) {
- MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
+ AliasAnalysis &AA) {
+ this->MD = MD;
+ this->AA = &AA;
bool Changed = false;
DEBUG(dbgs() << "Instruction Merger\n");
@@ -585,3 +540,66 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) {
}
return Changed;
}
+
+namespace {
+class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) {
+ initializeMergedLoadStoreMotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// \brief Run the transformation for each function
+ ///
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ MergedLoadStoreMotion Impl;
+ auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
+ return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
+ getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MergedLoadStoreMotionLegacyPass::ID = 0;
+} // anonymous namespace
+
+///
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+ return new MergedLoadStoreMotionLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+
+PreservedAnalyses
+MergedLoadStoreMotionPass::run(Function &F, AnalysisManager<Function> &AM) {
+ MergedLoadStoreMotion Impl;
+ auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ if (!Impl.run(F, MD, AA))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
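
The refactored isDiamondHead() above replaces manual terminator/successor poking with getSingleSuccessor()/getSinglePredecessor(). The shape it accepts can be mirrored on a toy CFG; this is a sketch, not LLVM code:

#include <cstdio>
#include <vector>

struct Block {
  std::vector<Block *> Succs;
  std::vector<Block *> Preds;
};

static Block *singleSuccessor(Block *B) {
  return B->Succs.size() == 1 ? B->Succs[0] : nullptr;
}
static bool singlePredecessor(Block *B) { return B->Preds.size() == 1; }

static bool isDiamondHead(Block *BB) {
  if (!BB || BB->Succs.size() != 2) // needs a conditional two-way branch
    return false;
  Block *Succ0 = BB->Succs[0], *Succ1 = BB->Succs[1];
  if (!singlePredecessor(Succ0) || !singlePredecessor(Succ1))
    return false;
  Block *Succ0Succ = singleSuccessor(Succ0);
  Block *Succ1Succ = singleSuccessor(Succ1);
  // Ignore triangles: both sides must rejoin at the same tail block.
  return Succ0Succ && Succ1Succ && Succ0Succ == Succ1Succ;
}

int main() {
  Block Head, Left, Right, Tail;
  Head.Succs = {&Left, &Right};
  Left.Preds = {&Head};  Left.Succs = {&Tail};
  Right.Preds = {&Head}; Right.Succs = {&Tail};
  Tail.Preds = {&Left, &Right};
  std::printf("%d\n", isDiamondHead(&Head)); // prints 1
}
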
diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp
index c8f885e7eec5..ed754fa71025 100644
--- a/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -208,7 +208,7 @@ FunctionPass *llvm::createNaryReassociatePass() {
}
bool NaryReassociate::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -299,49 +299,18 @@ Instruction *NaryReassociate::tryReassociate(Instruction *I) {
}
}
-// FIXME: extract this method into TTI->getGEPCost.
static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI,
- const DataLayout *DL) {
- GlobalVariable *BaseGV = nullptr;
- int64_t BaseOffset = 0;
- bool HasBaseReg = false;
- int64_t Scale = 0;
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
- BaseGV = GV;
- else
- HasBaseReg = true;
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
- int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
- if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
- BaseOffset += ConstIdx->getSExtValue() * ElementSize;
- } else {
- // Needs scale register.
- if (Scale != 0) {
- // No addressing mode takes two scale registers.
- return false;
- }
- Scale = ElementSize;
- }
- } else {
- StructType *STy = cast<StructType>(*GTI);
- uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
- BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
- }
- }
-
- unsigned AddrSpace = GEP->getPointerAddressSpace();
- return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
- BaseOffset, HasBaseReg, Scale, AddrSpace);
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
}
Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) {
// Not worth reassociating GEP if it is foldable.
- if (isGEPFoldable(GEP, TTI, DL))
+ if (isGEPFoldable(GEP, TTI))
return nullptr;
gep_type_iterator GTI = gep_type_begin(*GEP);
@@ -434,7 +403,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
// NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
- Type *ElementType = GEP->getType()->getElementType();
+ Type *ElementType = GEP->getResultElementType();
uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
// Another less rare case: because I is not necessarily the last index of the
// GEP, the size of the type at the I-th index (IndexedSize) is not
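
The rewritten isGEPFoldable() delegates to TTI->getGEPCost() and compares the result against TCC_Free, replacing the hand-rolled addressing-mode check deleted above. For reference, the removed logic folded constant indices into a base offset and allowed at most one scaled (variable) index; a sketch of that computation with element sizes supplied directly instead of coming from DataLayout (illustrative, not LLVM code):

#include <cstdint>
#include <cstdio>
#include <vector>

struct Index {
  bool IsConstant;
  int64_t Value;       // constant index value (if IsConstant)
  int64_t ElementSize; // size in bytes of the indexed element
};

static bool foldableIntoAddressingMode(const std::vector<Index> &Indices,
                                       int64_t &BaseOffset, int64_t &Scale) {
  BaseOffset = 0;
  Scale = 0;
  for (const Index &I : Indices) {
    if (I.IsConstant) {
      BaseOffset += I.Value * I.ElementSize;
    } else {
      if (Scale != 0)
        return false; // no addressing mode takes two scale registers
      Scale = I.ElementSize;
    }
  }
  return true;
}

int main() {
  // Roughly &p[3].field[i] with 16-byte elements and 4-byte fields:
  std::vector<Index> Indices = {{true, 3, 16}, {false, 0, 4}};
  int64_t BaseOffset, Scale;
  if (foldableIntoAddressingMode(Indices, BaseOffset, Scale))
    std::printf("base+%lld, scale %lld\n", (long long)BaseOffset,
                (long long)Scale); // base+48, scale 4
}
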
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 9f26f78892c6..c4b3e3464f40 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -13,12 +13,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -26,85 +24,9 @@ using namespace llvm;
#define DEBUG_TYPE "partially-inline-libcalls"
-namespace {
- class PartiallyInlineLibCalls : public FunctionPass {
- public:
- static char ID;
-
- PartiallyInlineLibCalls() :
- FunctionPass(ID) {
- initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-
- private:
- /// Optimize calls to sqrt.
- bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
- BasicBlock &CurrBB, Function::iterator &BB);
- };
-
- char PartiallyInlineLibCalls::ID = 0;
-}
-
-INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
- "Partially inline calls to library functions", false, false)
-
-void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
-}
-
-bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
- bool Changed = false;
- Function::iterator CurrBB;
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
- CurrBB = BB++;
-
- for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
- II != IE; ++II) {
- CallInst *Call = dyn_cast<CallInst>(&*II);
- Function *CalledFunc;
-
- if (!Call || !(CalledFunc = Call->getCalledFunction()))
- continue;
-
- // Skip if function either has local linkage or is not a known library
- // function.
- LibFunc::Func LibFunc;
- if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
- continue;
-
- switch (LibFunc) {
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
- if (TTI->haveFastSqrt(Call->getType()) &&
- optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
- break;
- continue;
- default:
- continue;
- }
- Changed = true;
- break;
- }
- }
-
- return Changed;
-}
-
-bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
- Function *CalledFunc,
- BasicBlock &CurrBB,
- Function::iterator &BB) {
+static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB) {
  // There is no need to change the IR, since the backend will emit a sqrt
// instruction if the call has already been marked read-only.
if (Call->onlyReadsMemory())
@@ -158,6 +80,97 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
return true;
}
+static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ bool Changed = false;
+
+ Function::iterator CurrBB;
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ LibFunc::Func LibFunc;
+ if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+ !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ continue;
+
+ switch (LibFunc) {
+ case LibFunc::sqrtf:
+ case LibFunc::sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+PartiallyInlineLibCallsPass::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
+ initializePartiallyInlineLibCallsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runPartiallyInlineLibCalls(F, TLI, TTI);
+ }
+};
+}
+
+char PartiallyInlineLibCallsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
- return new PartiallyInlineLibCalls();
+ return new PartiallyInlineLibCallsLegacyPass();
}
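
optimizeSQRT() itself is unchanged apart from becoming a free function: the idea is to keep the library call only for the error-handling path and let the backend emit a hardware sqrt for the common case. A hedged plain-C++ sketch of that shape; the exact guard condition used by the pass may differ:

#include <cmath>
#include <cstdio>

static double partiallyInlinedSqrt(double X) {
  double Fast = __builtin_sqrt(X); // candidate for a plain hardware sqrt
  if (Fast != Fast)                // NaN, i.e. X was outside the domain
    return std::sqrt(X);           // slow path: the real libm call (sets errno)
  return Fast;
}

int main() {
  std::printf("%f\n", partiallyInlinedSqrt(2.0));  // 1.414214
  std::printf("%f\n", partiallyInlinedSqrt(-1.0)); // nan (via the slow path)
}
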
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index b56b35599120..e47b636348e3 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -49,45 +49,32 @@
//===----------------------------------------------------------------------===//
#include "llvm/Pass.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/ADT/SetOperations.h"
+
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/BasicBlock.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "safepoint-placement"
+
STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
-STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
-STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
-STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+STATISTIC(CallInLoop,
+ "Number of loops without safepoints due to calls in loop");
+STATISTIC(FiniteExecution,
+ "Number of loops without safepoints finite execution");
using namespace llvm;
@@ -108,9 +95,6 @@ static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
cl::init(false));
-// Print tracing output
-static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
-
namespace {
/// An analysis pass whose purpose is to identify each of the backedges in
@@ -138,8 +122,8 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
bool runOnLoop(Loop *);
void runOnLoopAndSubLoops(Loop *L) {
// Visit all the subloops
- for (auto I = L->begin(), E = L->end(); I != E; I++)
- runOnLoopAndSubLoops(*I);
+ for (Loop *I : *L)
+ runOnLoopAndSubLoops(I);
runOnLoop(L);
}
@@ -147,8 +131,8 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- for (auto I = LI->begin(), E = LI->end(); I != E; I++) {
- runOnLoopAndSubLoops(*I);
+ for (Loop *I : *LI) {
+ runOnLoopAndSubLoops(I);
}
return false;
}
@@ -200,13 +184,9 @@ static bool needsStatepoint(const CallSite &CS) {
if (call->isInlineAsm())
return false;
}
- if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) {
- return false;
- }
- return true;
-}
-static Value *ReplaceWithStatepoint(const CallSite &CS);
+ return !(isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS));
+}
/// Returns true if this loop is known to contain a call safepoint which
/// must unconditionally execute on any iteration of the loop which returns
@@ -278,43 +258,44 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
return /* not finite */ false;
}
-static void scanOneBB(Instruction *start, Instruction *end,
- std::vector<CallInst *> &calls,
- std::set<BasicBlock *> &seen,
- std::vector<BasicBlock *> &worklist) {
- for (BasicBlock::iterator itr(start);
- itr != start->getParent()->end() && itr != BasicBlock::iterator(end);
- itr++) {
- if (CallInst *CI = dyn_cast<CallInst>(&*itr)) {
- calls.push_back(CI);
- }
+static void scanOneBB(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen,
+ std::vector<BasicBlock *> &Worklist) {
+ for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
+ BBE1 = BasicBlock::iterator(End);
+ BBI != BBE0 && BBI != BBE1; BBI++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
+ Calls.push_back(CI);
+
// FIXME: This code does not handle invokes
- assert(!dyn_cast<InvokeInst>(&*itr) &&
+ assert(!isa<InvokeInst>(&*BBI) &&
"support for invokes in poll code needed");
+
// Only add the successor blocks if we reach the terminator instruction
// without encountering end first
- if (itr->isTerminator()) {
- BasicBlock *BB = itr->getParent();
+ if (BBI->isTerminator()) {
+ BasicBlock *BB = BBI->getParent();
for (BasicBlock *Succ : successors(BB)) {
- if (seen.count(Succ) == 0) {
- worklist.push_back(Succ);
- seen.insert(Succ);
+ if (Seen.insert(Succ).second) {
+ Worklist.push_back(Succ);
}
}
}
}
}
-static void scanInlinedCode(Instruction *start, Instruction *end,
- std::vector<CallInst *> &calls,
- std::set<BasicBlock *> &seen) {
- calls.clear();
- std::vector<BasicBlock *> worklist;
- seen.insert(start->getParent());
- scanOneBB(start, end, calls, seen, worklist);
- while (!worklist.empty()) {
- BasicBlock *BB = worklist.back();
- worklist.pop_back();
- scanOneBB(&*BB->begin(), end, calls, seen, worklist);
+
+static void scanInlinedCode(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen) {
+ Calls.clear();
+ std::vector<BasicBlock *> Worklist;
+ Seen.insert(Start->getParent());
+ scanOneBB(Start, End, Calls, Seen, Worklist);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
}
}
@@ -324,29 +305,27 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// Note: In common usage, there will be only one edge due to LoopSimplify
// having run sometime earlier in the pipeline, but this code must be correct
// w.r.t. loops with multiple backedges.
- BasicBlock *header = L->getHeader();
+ BasicBlock *Header = L->getHeader();
SmallVector<BasicBlock*, 16> LoopLatches;
L->getLoopLatches(LoopLatches);
- for (BasicBlock *pred : LoopLatches) {
- assert(L->contains(pred));
+ for (BasicBlock *Pred : LoopLatches) {
+ assert(L->contains(Pred));
// Make a policy decision about whether this loop needs a safepoint or
// not. Note that this is about unburdening the optimizer in loops, not
// avoiding the runtime cost of the actual safepoint.
if (!AllBackedges) {
- if (mustBeFiniteCountedLoop(L, SE, pred)) {
- if (TraceLSP)
- errs() << "skipping safepoint placement in finite loop\n";
+ if (mustBeFiniteCountedLoop(L, SE, Pred)) {
+ DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
FiniteExecution++;
continue;
}
if (CallSafepointsEnabled &&
- containsUnconditionalCallSafepoint(L, header, pred, *DT)) {
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) {
// Note: This is only semantically legal since we won't do any further
// IPO or inlining before the actual call insertion. If we did, we
// might later lose this call safepoint.
- if (TraceLSP)
- errs() << "skipping safepoint placement due to unconditional call\n";
+ DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
CallInLoop++;
continue;
}
@@ -360,14 +339,11 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
// Safepoint insertion would involve creating a new basic block (as the
// target of the current backedge) which does the safepoint (of all live
// variables) and branches to the true header
- TerminatorInst *term = pred->getTerminator();
+ TerminatorInst *Term = Pred->getTerminator();
- if (TraceLSP) {
- errs() << "[LSP] terminator instruction: ";
- term->dump();
- }
+ DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
- PollLocations.push_back(term);
+ PollLocations.push_back(Term);
}
return false;
@@ -411,27 +387,26 @@ static Instruction *findLocationForEntrySafepoint(Function &F,
// hasNextInstruction and nextInstruction are used to iterate
// through a "straight line" execution sequence.
- auto hasNextInstruction = [](Instruction *I) {
- if (!I->isTerminator()) {
+ auto HasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator())
return true;
- }
+
BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
return nextBB && (nextBB->getUniquePredecessor() != nullptr);
};
- auto nextInstruction = [&hasNextInstruction](Instruction *I) {
- assert(hasNextInstruction(I) &&
+ auto NextInstruction = [&](Instruction *I) {
+ assert(HasNextInstruction(I) &&
"first check if there is a next instruction!");
- if (I->isTerminator()) {
+
+ if (I->isTerminator())
return &I->getParent()->getUniqueSuccessor()->front();
- } else {
- return &*++I->getIterator();
- }
+ return &*++I->getIterator();
};
- Instruction *cursor = nullptr;
- for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor);
- cursor = nextInstruction(cursor)) {
+ Instruction *Cursor = nullptr;
+ for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
+ Cursor = NextInstruction(Cursor)) {
// We need to ensure a safepoint poll occurs before any 'real' call. The
// easiest way to ensure finite execution between safepoints in the face of
@@ -440,51 +415,17 @@ static Instruction *findLocationForEntrySafepoint(Function &F,
// which can grow the stack by an unbounded amount. This isn't required
// for GC semantics per se, but is a common requirement for languages
// which detect stack overflow via guard pages and then throw exceptions.
- if (auto CS = CallSite(cursor)) {
+ if (auto CS = CallSite(Cursor)) {
if (doesNotRequireEntrySafepointBefore(CS))
continue;
break;
}
}
- assert((hasNextInstruction(cursor) || cursor->isTerminator()) &&
+ assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
"either we stopped because of a call, or because of terminator");
- return cursor;
-}
-
-/// Identify the list of call sites which need to be have parseable state
-static void findCallSafepoints(Function &F,
- std::vector<CallSite> &Found /*rval*/) {
- assert(Found.empty() && "must be empty!");
- for (Instruction &I : instructions(F)) {
- Instruction *inst = &I;
- if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {
- CallSite CS(inst);
-
- // No safepoint needed or wanted
- if (!needsStatepoint(CS)) {
- continue;
- }
-
- Found.push_back(CS);
- }
- }
-}
-
-/// Implement a unique function which doesn't require we sort the input
-/// vector. Doing so has the effect of changing the output of a couple of
-/// tests in ways which make them less useful in testing fused safepoints.
-template <typename T> static void unique_unsorted(std::vector<T> &vec) {
- std::set<T> seen;
- std::vector<T> tmp;
- vec.reserve(vec.size());
- std::swap(tmp, vec);
- for (auto V : tmp) {
- if (seen.insert(V).second) {
- vec.push_back(V);
- }
- }
+ return Cursor;
}
static const char *const GCSafepointPollName = "gc.safepoint_poll";
@@ -514,24 +455,6 @@ static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
static bool enableCallSafepoints(Function &F) { return !NoCall; }
-// Normalize basic block to make it ready to be target of invoke statepoint.
-// Ensure that 'BB' does not have phi nodes. It may require spliting it.
-static BasicBlock *normalizeForInvokeSafepoint(BasicBlock *BB,
- BasicBlock *InvokeParent) {
- BasicBlock *ret = BB;
-
- if (!BB->getUniquePredecessor()) {
- ret = SplitBlockPredecessors(BB, InvokeParent, "");
- }
-
- // Now that 'ret' has unique predecessor we can safely remove all phi nodes
- // from it
- FoldSingleEntryPHINodes(ret);
- assert(!isa<PHINode>(ret->begin()));
-
- return ret;
-}
-
bool PlaceSafepoints::runOnFunction(Function &F) {
if (F.isDeclaration() || F.empty()) {
// This is a declaration, nothing to do. Must exit early to avoid crash in
@@ -549,13 +472,13 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
if (!shouldRewriteFunction(F))
return false;
- bool modified = false;
+ bool Modified = false;
// In various bits below, we rely on the fact that uses are reachable from
// defs. When there are basic blocks unreachable from the entry, dominance
// and reachability queries return nonsensical results. Thus, we preprocess
// the function to ensure these properties hold.
- modified |= removeUnreachableBlocks(F);
+ Modified |= removeUnreachableBlocks(F);
// STEP 1 - Insert the safepoint polling locations. We do not need to
// actually insert parse points yet. That will be done for all polls and
@@ -574,8 +497,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
// with for the moment.
legacy::FunctionPassManager FPM(F.getParent());
bool CanAssumeCallSafepoints = enableCallSafepoints(F);
- PlaceBackedgeSafepointsImpl *PBS =
- new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
FPM.add(PBS);
FPM.run(F);
@@ -603,7 +525,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
// The poll location must be the terminator of a loop latch block.
for (TerminatorInst *Term : PollLocations) {
// We are inserting a poll, the function is modified
- modified = true;
+ Modified = true;
if (SplitBackedge) {
// Split the backedge of the loop and insert the poll within that new
@@ -643,14 +565,13 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
}
if (enableEntrySafepoints(F)) {
- Instruction *Location = findLocationForEntrySafepoint(F, DT);
- if (!Location) {
- // policy choice not to insert?
- } else {
+ if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
PollsNeeded.push_back(Location);
- modified = true;
+ Modified = true;
NumEntrySafepoints++;
}
+ // TODO: else we should assert that there was, in fact, a policy choice to
+ // not insert an entry safepoint poll.
}
// Now that we've identified all the needed safepoint poll locations, insert
@@ -661,71 +582,8 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
RuntimeCalls.end());
}
- PollsNeeded.clear(); // make sure we don't accidentally use
- // The dominator tree has been invalidated by the inlining performed in the
- // above loop. TODO: Teach the inliner how to update the dom tree?
- DT.recalculate(F);
-
- if (enableCallSafepoints(F)) {
- std::vector<CallSite> Calls;
- findCallSafepoints(F, Calls);
- NumCallSafepoints += Calls.size();
- ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
- }
-
- // Unique the vectors since we can end up with duplicates if we scan the call
- // site for call safepoints after we add it for entry or backedge. The
- // only reason we need tracking at all is that some functions might have
- // polls but not call safepoints and thus we might miss marking the runtime
- // calls for the polls. (This is useful in test cases!)
- unique_unsorted(ParsePointNeeded);
-
- // Any parse point (no matter what source) will be handled here
-
- // We're about to start modifying the function
- if (!ParsePointNeeded.empty())
- modified = true;
-
- // Now run through and insert the safepoints, but do _NOT_ update or remove
- // any existing uses. We have references to live variables that need to
- // survive to the last iteration of this loop.
- std::vector<Value *> Results;
- Results.reserve(ParsePointNeeded.size());
- for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
- CallSite &CS = ParsePointNeeded[i];
-
- // For invoke statepoints we need to remove all phi nodes at the normal
- // destination block.
- // Reason for this is that we can place gc_result only after last phi node
- // in basic block. We will get malformed code after RAUW for the
- // gc_result if one of this phi nodes uses result from the invoke.
- if (InvokeInst *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) {
- normalizeForInvokeSafepoint(Invoke->getNormalDest(),
- Invoke->getParent());
- }
-
- Value *GCResult = ReplaceWithStatepoint(CS);
- Results.push_back(GCResult);
- }
- assert(Results.size() == ParsePointNeeded.size());
-
- // Adjust all users of the old call sites to use the new ones instead
- for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
- CallSite &CS = ParsePointNeeded[i];
- Value *GCResult = Results[i];
- if (GCResult) {
- // Can not RAUW for the invoke gc result in case of phi nodes preset.
- assert(CS.isCall() || !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
-
- // Replace all uses with the new call
- CS.getInstruction()->replaceAllUsesWith(GCResult);
- }
- // Now that we've handled all uses, remove the original call itself
- // Note: The insert point can't be the deleted instruction!
- CS.getInstruction()->eraseFromParent();
- }
- return modified;
+ return Modified;
}
char PlaceBackedgeSafepointsImpl::ID = 0;
@@ -763,191 +621,60 @@ InsertSafepointPoll(Instruction *InsertBefore,
auto *F = M->getFunction(GCSafepointPollName);
assert(F && "gc.safepoint_poll function is missing");
- assert(F->getType()->getElementType() ==
+ assert(F->getValueType() ==
FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
"gc.safepoint_poll declared with wrong type");
assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
// Record some information about the call site we're replacing
- BasicBlock::iterator before(PollCall), after(PollCall);
- bool isBegin(false);
- if (before == OrigBB->begin()) {
- isBegin = true;
- } else {
- before--;
- }
- after++;
- assert(after != OrigBB->end() && "must have successor");
+ BasicBlock::iterator Before(PollCall), After(PollCall);
+ bool IsBegin = false;
+ if (Before == OrigBB->begin())
+ IsBegin = true;
+ else
+ Before--;
- // do the actual inlining
+ After++;
+ assert(After != OrigBB->end() && "must have successor");
+
+ // Do the actual inlining
InlineFunctionInfo IFI;
bool InlineStatus = InlineFunction(PollCall, IFI);
assert(InlineStatus && "inline must succeed");
(void)InlineStatus; // suppress warning in release-asserts
- // Check post conditions
+ // Check post-conditions
assert(IFI.StaticAllocas.empty() && "can't have allocs");
- std::vector<CallInst *> calls; // new calls
- std::set<BasicBlock *> BBs; // new BBs + insertee
+ std::vector<CallInst *> Calls; // new calls
+ DenseSet<BasicBlock *> BBs; // new BBs + insertee
+
// Include only the newly inserted instructions. Note: begin may not be valid
// if we inserted to the beginning of the basic block
- BasicBlock::iterator start;
- if (isBegin) {
- start = OrigBB->begin();
- } else {
- start = before;
- start++;
- }
+ BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
// If your poll function includes an unreachable at the end, that's not
// valid. Bugpoint likes to create this, so check for it.
- assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+ assert(isPotentiallyReachable(&*Start, &*After) &&
"malformed poll function");
- scanInlinedCode(&*(start), &*(after), calls, BBs);
- assert(!calls.empty() && "slow path not found for safepoint poll");
+ scanInlinedCode(&*Start, &*After, Calls, BBs);
+ assert(!Calls.empty() && "slow path not found for safepoint poll");
// Record the fact we need a parsable state at the runtime call contained in
// the poll function. This is required so that the runtime knows how to
// parse the last frame when we actually take the safepoint (i.e. execute
// the slow path)
assert(ParsePointsNeeded.empty());
- for (size_t i = 0; i < calls.size(); i++) {
-
+ for (auto *CI : Calls) {
// No safepoint needed or wanted
- if (!needsStatepoint(calls[i])) {
+ if (!needsStatepoint(CI))
continue;
- }
// These are likely runtime calls. Should we assert that via calling
// convention or something?
- ParsePointsNeeded.push_back(CallSite(calls[i]));
- }
- assert(ParsePointsNeeded.size() <= calls.size());
-}
-
-/// Replaces the given call site (Call or Invoke) with a gc.statepoint
-/// intrinsic with an empty deoptimization arguments list. This does
-/// NOT do explicit relocation for GC support.
-static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) {
- assert(CS.getInstruction()->getModule() && "must be set");
-
- // TODO: technically, a pass is not allowed to get functions from within a
- // function pass since it might trigger a new function addition. Refactor
- // this logic out to the initialization of the pass. Doesn't appear to
- // matter in practice.
-
- // Then go ahead and use the builder do actually do the inserts. We insert
- // immediately before the previous instruction under the assumption that all
- // arguments will be available here. We can't insert afterwards since we may
- // be replacing a terminator.
- IRBuilder<> Builder(CS.getInstruction());
-
- // Note: The gc args are not filled in at this time, that's handled by
- // RewriteStatepointsForGC (which is currently under review).
-
- // Create the statepoint given all the arguments
- Instruction *Token = nullptr;
-
- uint64_t ID;
- uint32_t NumPatchBytes;
-
- AttributeSet OriginalAttrs = CS.getAttributes();
- Attribute AttrID =
- OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
- Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
- AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
-
- AttrBuilder AttrsToRemove;
- bool HasID = AttrID.isStringAttribute() &&
- !AttrID.getValueAsString().getAsInteger(10, ID);
-
- if (HasID)
- AttrsToRemove.addAttribute("statepoint-id");
- else
- ID = 0xABCDEF00;
-
- bool HasNumPatchBytes =
- AttrNumPatchBytes.isStringAttribute() &&
- !AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
-
- if (HasNumPatchBytes)
- AttrsToRemove.addAttribute("statepoint-num-patch-bytes");
- else
- NumPatchBytes = 0;
-
- OriginalAttrs = OriginalAttrs.removeAttributes(
- CS.getInstruction()->getContext(), AttributeSet::FunctionIndex,
- AttrsToRemove);
-
- if (CS.isCall()) {
- CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
- CallInst *Call = Builder.CreateGCStatepointCall(
- ID, NumPatchBytes, CS.getCalledValue(),
- makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None,
- "safepoint_token");
- Call->setTailCall(ToReplace->isTailCall());
- Call->setCallingConv(ToReplace->getCallingConv());
-
- // In case if we can handle this set of attributes - set up function
- // attributes directly on statepoint and return attributes later for
- // gc_result intrinsic.
- Call->setAttributes(OriginalAttrs.getFnAttributes());
-
- Token = Call;
-
- // Put the following gc_result and gc_relocate calls immediately after
- // the old call (which we're about to delete).
- assert(ToReplace->getNextNode() && "not a terminator, must have next");
- Builder.SetInsertPoint(ToReplace->getNextNode());
- Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
- } else if (CS.isInvoke()) {
- InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
-
- // Insert the new invoke into the old block. We'll remove the old one in a
- // moment at which point this will become the new terminator for the
- // original block.
- Builder.SetInsertPoint(ToReplace->getParent());
- InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
- ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(),
- ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()),
- None, None, "safepoint_token");
-
- Invoke->setCallingConv(ToReplace->getCallingConv());
-
- // In case if we can handle this set of attributes - set up function
- // attributes directly on statepoint and return attributes later for
- // gc_result intrinsic.
- Invoke->setAttributes(OriginalAttrs.getFnAttributes());
-
- Token = Invoke;
-
- // We'll insert the gc.result into the normal block
- BasicBlock *NormalDest = ToReplace->getNormalDest();
- // Can not insert gc.result in case of phi nodes preset.
- // Should have removed this cases prior to running this function
- assert(!isa<PHINode>(NormalDest->begin()));
- Instruction *IP = &*(NormalDest->getFirstInsertionPt());
- Builder.SetInsertPoint(IP);
- } else {
- llvm_unreachable("unexpect type of CallSite");
- }
- assert(Token);
-
- // Handle the return value of the original call - update all uses to use a
- // gc_result hanging off the statepoint node we just inserted
-
- // Only add the gc_result iff there is actually a used result
- if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
- std::string TakenName =
- CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
- CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), TakenName);
- GCResult->setAttributes(OriginalAttrs.getRetAttributes());
- return GCResult;
- } else {
- // No return value for the call.
- return nullptr;
+ ParsePointsNeeded.push_back(CallSite(CI));
}
+ assert(ParsePointsNeeded.size() <= Calls.size());
}
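
The scanOneBB/scanInlinedCode rewrite above keeps the same breadth-first walk over the freshly inlined poll code but switches the visited set to a DenseSet, so the membership test and the insertion become a single probe (Seen.insert(Succ).second is true only for a newly seen block). A minimal, self-contained sketch of that worklist pattern follows; collectReachable is a hypothetical helper and not part of this patch.

#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include <vector>

using namespace llvm;

// Hypothetical helper: visit every block reachable from Root exactly once.
static void collectReachable(BasicBlock *Root,
                             std::vector<BasicBlock *> &Out) {
  DenseSet<BasicBlock *> Seen;
  std::vector<BasicBlock *> Worklist;
  Seen.insert(Root);
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.back();
    Worklist.pop_back();
    Out.push_back(BB);
    for (BasicBlock *Succ : successors(BB))
      // insert() returns {iterator, inserted}; enqueue only on first sight.
      if (Seen.insert(Succ).second)
        Worklist.push_back(Succ);
  }
}
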
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index bcadd4e2bee6..b930a8fb7e99 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -20,7 +20,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
@@ -39,9 +39,11 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
+using namespace reassociate;
#define DEBUG_TYPE "reassociate"
@@ -49,17 +51,6 @@ STATISTIC(NumChanged, "Number of insts reassociated");
STATISTIC(NumAnnihil, "Number of expr tree annihilated");
STATISTIC(NumFactor , "Number of multiplies factored");
-namespace {
- struct ValueEntry {
- unsigned Rank;
- Value *Op;
- ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
- };
- inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
- return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start.
- }
-}
-
#ifndef NDEBUG
/// Print out the expression identified in the Ops list.
///
@@ -75,120 +66,35 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
}
#endif
-namespace {
- /// \brief Utility class representing a base and exponent pair which form one
- /// factor of some product.
- struct Factor {
- Value *Base;
- unsigned Power;
-
- Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
-
- /// \brief Sort factors in descending order by their power.
- struct PowerDescendingSorter {
- bool operator()(const Factor &LHS, const Factor &RHS) {
- return LHS.Power > RHS.Power;
- }
- };
-
- /// \brief Compare factors for equal powers.
- struct PowerEqual {
- bool operator()(const Factor &LHS, const Factor &RHS) {
- return LHS.Power == RHS.Power;
- }
- };
- };
-
- /// Utility class representing a non-constant Xor-operand. We classify
- /// non-constant Xor-Operands into two categories:
- /// C1) The operand is in the form "X & C", where C is a constant and C != ~0
- /// C2)
- /// C2.1) The operand is in the form of "X | C", where C is a non-zero
- /// constant.
- /// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
- /// operand as "E | 0"
- class XorOpnd {
- public:
- XorOpnd(Value *V);
-
- bool isInvalid() const { return SymbolicPart == nullptr; }
- bool isOrExpr() const { return isOr; }
- Value *getValue() const { return OrigVal; }
- Value *getSymbolicPart() const { return SymbolicPart; }
- unsigned getSymbolicRank() const { return SymbolicRank; }
- const APInt &getConstPart() const { return ConstPart; }
-
- void Invalidate() { SymbolicPart = OrigVal = nullptr; }
- void setSymbolicRank(unsigned R) { SymbolicRank = R; }
-
- // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
- // The purpose is twofold:
- // 1) Cluster together the operands sharing the same symbolic-value.
- // 2) Operand having smaller symbolic-value-rank is permuted earlier, which
- // could potentially shorten crital path, and expose more loop-invariants.
- // Note that values' rank are basically defined in RPO order (FIXME).
- // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
- // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
- // "z" in the order of X-Y-Z is better than any other orders.
- struct PtrSortFunctor {
- bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) {
- return LHS->getSymbolicRank() < RHS->getSymbolicRank();
- }
- };
- private:
- Value *OrigVal;
- Value *SymbolicPart;
- APInt ConstPart;
- unsigned SymbolicRank;
- bool isOr;
- };
-}
-
-namespace {
- class Reassociate : public FunctionPass {
- DenseMap<BasicBlock*, unsigned> RankMap;
- DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
- SetVector<AssertingVH<Instruction> > RedoInsts;
- bool MadeChange;
- public:
- static char ID; // Pass identification, replacement for typeid
- Reassociate() : FunctionPass(ID) {
- initializeReassociatePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- private:
- void BuildRankMap(Function &F);
- unsigned getRank(Value *V);
- void canonicalizeOperands(Instruction *I);
- void ReassociateExpression(BinaryOperator *I);
- void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
- Value *OptimizeExpression(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops);
- Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
- Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
- bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
- Value *&Res);
- bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
- APInt &ConstOpnd, Value *&Res);
- bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors);
- Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
- SmallVectorImpl<Factor> &Factors);
- Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
- Value *RemoveFactorFromExpression(Value *V, Value *Factor);
- void EraseInst(Instruction *I);
- void RecursivelyEraseDeadInsts(Instruction *I,
- SetVector<AssertingVH<Instruction>> &Insts);
- void OptimizeInst(Instruction *I);
- Instruction *canonicalizeNegConstExpr(Instruction *I);
- };
-}
+/// Utility class representing a non-constant Xor-operand. We classify
+/// non-constant Xor-Operands into two categories:
+/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+/// C2)
+/// C2.1) The operand is in the form of "X | C", where C is a non-zero
+/// constant.
+/// C2.2) Any operand E that doesn't fall into C1 or C2.1; we view such an
+/// operand as "E | 0"
+class llvm::reassociate::XorOpnd {
+public:
+ XorOpnd(Value *V);
+
+ bool isInvalid() const { return SymbolicPart == nullptr; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+};
XorOpnd::XorOpnd(Value *V) {
assert(!isa<ConstantInt>(V) && "No ConstantInt");
@@ -217,13 +123,6 @@ XorOpnd::XorOpnd(Value *V) {
isOr = true;
}
-char Reassociate::ID = 0;
-INITIALIZE_PASS(Reassociate, "reassociate",
- "Reassociate expressions", false, false)
-
-// Public interface to the Reassociate pass
-FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
-
/// Return true if V is an instruction of the specified opcode and if it
/// only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
@@ -246,7 +145,8 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
return nullptr;
}
-void Reassociate::BuildRankMap(Function &F) {
+void ReassociatePass::BuildRankMap(
+ Function &F, ReversePostOrderTraversal<Function *> &RPOT) {
unsigned i = 2;
// Assign distinct ranks to function arguments.
@@ -255,22 +155,19 @@ void Reassociate::BuildRankMap(Function &F) {
DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
}
- ReversePostOrderTraversal<Function*> RPOT(&F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
- E = RPOT.end(); I != E; ++I) {
- BasicBlock *BB = *I;
+ for (BasicBlock *BB : RPOT) {
unsigned BBRank = RankMap[BB] = ++i << 16;
// Walk the basic block, adding precomputed ranks for any instructions that
// we cannot move. This ensures that the ranks for these instructions are
// all different in the block.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (mayBeMemoryDependent(*I))
- ValueRankMap[&*I] = ++BBRank;
+ for (Instruction &I : *BB)
+ if (mayBeMemoryDependent(I))
+ ValueRankMap[&I] = ++BBRank;
}
}
-unsigned Reassociate::getRank(Value *V) {
+unsigned ReassociatePass::getRank(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
@@ -301,7 +198,7 @@ unsigned Reassociate::getRank(Value *V) {
}
// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
-void Reassociate::canonicalizeOperands(Instruction *I) {
+void ReassociatePass::canonicalizeOperands(Instruction *I) {
assert(isa<BinaryOperator>(I) && "Expected binary operator.");
assert(I->isCommutative() && "Expected commutative operator.");
@@ -711,8 +608,8 @@ static bool LinearizeExprTree(BinaryOperator *I,
/// Now that the operands for this expression tree are
/// linearized and optimized, emit them in-order.
-void Reassociate::RewriteExprTree(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
+void ReassociatePass::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
assert(Ops.size() > 1 && "Single values should be used directly!");
// Since our optimizations should never increase the number of operations, the
@@ -1095,7 +992,7 @@ static Value *EmitAddTreeOfValues(Instruction *I,
/// If V is an expression tree that is a multiplication sequence,
/// and if this sequence contains a multiply by Factor,
/// remove Factor from the tree and return the new tree.
-Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO)
return nullptr;
@@ -1129,7 +1026,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
}
} else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
- APFloat F1(FC1->getValueAPF());
+ const APFloat &F1 = FC1->getValueAPF();
APFloat F2(FC2->getValueAPF());
F2.changeSign();
if (F1.compare(F2) == APFloat::cmpEqual) {
@@ -1258,9 +1155,9 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
// If it was successful, true is returned, and the "R" and "C" is returned
// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
// and both "Res" and "ConstOpnd" remain unchanged.
-//
-bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
- APInt &ConstOpnd, Value *&Res) {
+//
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
// Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
@@ -1294,8 +1191,9 @@ bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
// via "Res" and "ConstOpnd", respectively (If the entire expression is
// evaluated to a constant, the Res is set to NULL); otherwise, false is
// returned, and both "Res" and "ConstOpnd" remain unchanged.
-bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
- APInt &ConstOpnd, Value *&Res) {
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ XorOpnd *Opnd2, APInt &ConstOpnd,
+ Value *&Res) {
Value *X = Opnd1->getSymbolicPart();
if (X != Opnd2->getSymbolicPart())
return false;
@@ -1369,8 +1267,8 @@ bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
/// to a single Value, it is returned, otherwise the Ops list is mutated as
/// necessary.
-Value *Reassociate::OptimizeXor(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
+Value *ReassociatePass::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
return V;
@@ -1405,7 +1303,19 @@ Value *Reassociate::OptimizeXor(Instruction *I,
// the same symbolic value cluster together. For instance, the input operand
// sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
// ("x | 123", "x & 789", "y & 456").
- std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+ //
+ // The purpose is twofold:
+ // 1) Cluster together the operands sharing the same symbolic-value.
+ // 2) Operands having a smaller symbolic-value-rank are permuted earlier,
+ // which could potentially shorten the critical path and expose more
+ // loop-invariants. Note that values' ranks are basically defined in RPO
+ // order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined
+ // earlier than Y, which is defined earlier than Z. Permuting "x | 1",
+ // "Y & 2", "z" in the order X-Y-Z is better than any other order.
+ std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
+ [](XorOpnd *LHS, XorOpnd *RHS) {
+ return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+ });
// Step 3: Combine adjacent operands
XorOpnd *PrevOpnd = nullptr;
@@ -1478,8 +1388,8 @@ Value *Reassociate::OptimizeXor(Instruction *I,
/// Optimize a series of operands to an 'add' instruction. This
/// optimizes based on identities. If it can be reduced to a single Value, it
/// is returned, otherwise the Ops list is mutated as necessary.
-Value *Reassociate::OptimizeAdd(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
+Value *ReassociatePass::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand lists looking for X and -X pairs. If we find any, we
// can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
// scan for any
@@ -1716,8 +1626,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
///
/// \returns Whether any factors have a power greater than one.
-bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors) {
+bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
// FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
// Compute the sum of powers of simplifiable factors.
unsigned FactorPowerSum = 0;
@@ -1763,7 +1673,10 @@ bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
// below our minimum of '4'.
assert(FactorPowerSum >= 4);
- std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+ std::stable_sort(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ });
return true;
}
@@ -1790,8 +1703,9 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
/// equal and the powers are sorted in decreasing order, compute the minimal
/// DAG of multiplies to compute the final product, and return that product
/// value.
-Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
- SmallVectorImpl<Factor> &Factors) {
+Value *
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors) {
assert(Factors[0].Power);
SmallVector<Value *, 4> OuterProduct;
for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
@@ -1822,7 +1736,9 @@ Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
// Unique factors with equal powers -- we've folded them into the first one's
// base.
Factors.erase(std::unique(Factors.begin(), Factors.end(),
- Factor::PowerEqual()),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }),
Factors.end());
// Iteratively collect the base of each factor with an add power into the
@@ -1845,8 +1761,8 @@ Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
return V;
}
-Value *Reassociate::OptimizeMul(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
+Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
// We can only optimize the multiplies when there is a chain of more than
// three, such that a balanced tree might require fewer total multiplies.
if (Ops.size() < 4)
@@ -1869,8 +1785,8 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I,
return nullptr;
}
-Value *Reassociate::OptimizeExpression(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
+Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
Constant *Cst = nullptr;
@@ -1930,7 +1846,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
// Remove dead instructions and if any operands are trivially dead add them to
// Insts so they will be removed as well.
-void Reassociate::RecursivelyEraseDeadInsts(
+void ReassociatePass::RecursivelyEraseDeadInsts(
Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
@@ -1945,7 +1861,7 @@ void Reassociate::RecursivelyEraseDeadInsts(
}
/// Zap the given instruction, adding interesting operands to the work list.
-void Reassociate::EraseInst(Instruction *I) {
+void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
@@ -1969,7 +1885,7 @@ void Reassociate::EraseInst(Instruction *I) {
// Canonicalize expressions of the following form:
// x + (-Constant * y) -> x - (Constant * y)
// x - (-Constant * y) -> x + (Constant * y)
-Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
+Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
if (!I->hasOneUse() || I->getType()->isVectorTy())
return nullptr;
@@ -2046,7 +1962,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
/// Inspect and optimize the given instruction. Note that erasing
/// instructions is not allowed.
-void Reassociate::OptimizeInst(Instruction *I) {
+void ReassociatePass::OptimizeInst(Instruction *I) {
// Only consider operations that we understand.
if (!isa<BinaryOperator>(I))
return;
@@ -2173,7 +2089,7 @@ void Reassociate::OptimizeInst(Instruction *I) {
ReassociateExpression(BO);
}
-void Reassociate::ReassociateExpression(BinaryOperator *I) {
+void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
SmallVector<RepeatedValue, 8> Tree;
@@ -2255,46 +2171,53 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) {
RewriteExprTree(I, Ops);
}
-bool Reassociate::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- // Calculate the rank map for F
- BuildRankMap(F);
+PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+ // Reassociate needs each instruction to have its operands already
+ // processed, so we first build an RPO traversal (RPOT) of the basic blocks
+ // so that when we process a basic block, all of its dominators have
+ // already been processed.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ BuildRankMap(F, RPOT);
MadeChange = false;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ for (BasicBlock *BI : RPOT) {
+ // Use a worklist to keep track of which instructions in this block still
+ // have to be processed: when redoing instructions, we only optimize right
+ // away those that will not be visited again later in this walk.
+ SmallSet<Instruction *, 8> Worklist;
+
+ // Insert all instructions in the BB
+ for (Instruction &I : *BI)
+ Worklist.insert(&I);
+
// Optimize every instruction in the basic block.
- for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; )
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;) {
+ // This instruction has been processed.
+ Worklist.erase(&*II);
if (isInstructionTriviallyDead(&*II)) {
EraseInst(&*II++);
} else {
OptimizeInst(&*II);
- assert(II->getParent() == BI && "Moved to a different block!");
+ assert(II->getParent() == &*BI && "Moved to a different block!");
++II;
}
- // Make a copy of all the instructions to be redone so we can remove dead
- // instructions.
- SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
- // Iterate over all instructions to be reevaluated and remove trivially dead
- // instructions. If any operand of the trivially dead instruction becomes
- // dead mark it for deletion as well. Continue this process until all
- // trivially dead instructions have been removed.
- while (!ToRedo.empty()) {
- Instruction *I = ToRedo.pop_back_val();
- if (isInstructionTriviallyDead(I))
- RecursivelyEraseDeadInsts(I, ToRedo);
- }
-
- // Now that we have removed dead instructions, we can reoptimize the
- // remaining instructions.
- while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.pop_back_val();
- if (isInstructionTriviallyDead(I))
- EraseInst(I);
- else
- OptimizeInst(I);
+ // If the above optimizations produced new instructions to optimize or
+ // made modifications which need to be redone, do them now if they won't
+ // be handled later.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.pop_back_val();
+ // Only optimize an instruction now if it will not be visited again later:
+ // skip anything still pending in this block's worklist and anything that
+ // lives in a block of higher rank, since those will be processed later.
+ if ((I->getParent() != BI || !Worklist.count(I)) &&
+ RankMap[I->getParent()] <= RankMap[BI]) {
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
}
}
@@ -2302,5 +2225,46 @@ bool Reassociate::runOnFunction(Function &F) {
RankMap.clear();
ValueRankMap.clear();
- return MadeChange;
+ if (MadeChange) {
+ // FIXME: This should also 'preserve the CFG'.
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+ class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+}
+
+char ReassociateLegacyPass::ID = 0;
+INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() {
+ return new ReassociateLegacyPass();
}
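
Both PartiallyInlineLibCalls (further up) and Reassociate are ported here to the same shape: the transform lives in a new-PM pass class, and a thin legacy FunctionPass wrapper forwards to it with a dummy FunctionAnalysisManager. A minimal sketch of that wrapper pattern follows; MyPass and MyLegacyPass are illustrative names only, not part of this patch.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"

using namespace llvm;

// New-PM side: the actual transform, usable from both pass managers.
// (Illustrative names only; mirrors the ReassociatePass shape above.)
struct MyPass : PassInfoMixin<MyPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    bool Changed = false;
    // ... transform F and set Changed ...
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

namespace {
// Legacy side: adapts the interface and honors optnone / opt-bisect.
struct MyLegacyPass : FunctionPass {
  static char ID;
  MyPass Impl;
  MyLegacyPass() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    FunctionAnalysisManager DummyFAM;
    return !Impl.run(F, DummyFAM).areAllPreserved();
  }
};
}

char MyLegacyPass::ID = 0;
// A real wrapper is additionally registered with INITIALIZE_PASS(...), as
// ReassociateLegacyPass is above.
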
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 915f89780c08..615029dd161b 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -68,7 +68,7 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
false, false)
bool RegToMem::runOnFunction(Function &F) {
- if (F.isDeclaration())
+ if (F.isDeclaration() || skipFunction(F))
return false;
// Insert all new allocas into entry block.
@@ -89,10 +89,9 @@ bool RegToMem::runOnFunction(Function &F) {
// Find the escaped instructions. But don't create stack slots for
// allocas in entry block.
std::list<Instruction*> WorkList;
- for (Function::iterator ibb = F.begin(), ibe = F.end();
- ibb != ibe; ++ibb)
- for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
- iib != iie; ++iib) {
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib) {
if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
valueEscapes(&*iib)) {
WorkList.push_front(&*iib);
@@ -101,25 +100,22 @@ bool RegToMem::runOnFunction(Function &F) {
// Demote escaped instructions
NumRegsDemoted += WorkList.size();
- for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
- ile = WorkList.end(); ilb != ile; ++ilb)
- DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+ for (Instruction *ilb : WorkList)
+ DemoteRegToStack(*ilb, false, AllocaInsertionPoint);
WorkList.clear();
// Find all phi's
- for (Function::iterator ibb = F.begin(), ibe = F.end();
- ibb != ibe; ++ibb)
- for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
- iib != iie; ++iib)
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib)
if (isa<PHINode>(iib))
WorkList.push_front(&*iib);
// Demote phi nodes
NumPhisDemoted += WorkList.size();
- for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
- ile = WorkList.end(); ilb != ile; ++ilb)
- DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+ for (Instruction *ilb : WorkList)
+ DemotePHIToStack(cast<PHINode>(ilb), AllocaInsertionPoint);
return true;
}
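
Reg2Mem's job, as the unchanged surrounding code shows, is to demote SSA values to stack slots via DemoteRegToStack/DemotePHIToStack so that cross-block values travel through memory. Purely as a conceptual sketch of what such a demotion means: the code below is not the DemoteRegToStack implementation the pass calls, and it ignores invokes and PHI users, which the real utility handles specially.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Conceptual sketch only: spill I to a stack slot and reload before each use.
static void demoteToStackSketch(Instruction &I, Instruction *AllocaPoint) {
  // Remember the existing uses before we add the spill store below.
  SmallVector<Use *, 8> Uses;
  for (Use &U : I.uses())
    Uses.push_back(&U);

  // One slot, placed at AllocaPoint (the role AllocaInsertionPoint plays in
  // the pass above).
  IRBuilder<> B(AllocaPoint);
  AllocaInst *Slot =
      B.CreateAlloca(I.getType(), nullptr, I.getName() + ".slot");

  // Spill the value once, right after it is defined.
  B.SetInsertPoint(I.getNextNode());
  B.CreateStore(&I, Slot);

  // Reload immediately before every pre-existing user.
  for (Use *U : Uses) {
    B.SetInsertPoint(cast<Instruction>(U->getUser()));
    U->set(B.CreateLoad(Slot, I.getName() + ".reload"));
  }
}
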
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index d77d5745e60c..bab39a32677f 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -14,7 +14,6 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/Statistic.h"
@@ -63,7 +62,7 @@ static cl::opt<unsigned>
RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
cl::init(6));
-#ifdef XDEBUG
+#ifdef EXPENSIVE_CHECKS
static bool ClobberNonLive = true;
#else
static bool ClobberNonLive = false;
@@ -72,19 +71,10 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
cl::location(ClobberNonLive),
cl::Hidden);
-static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden,
- cl::init(false));
static cl::opt<bool>
AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
cl::Hidden, cl::init(true));
-/// Should we split vectors of pointers into their individual elements? This
-/// is known to be buggy, but the alternate implementation isn't yet ready.
-/// This is purely to provide a debugging and dianostic hook until the vector
-/// split is replaced with vector relocations.
-static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden,
- cl::init(true));
-
namespace {
struct RewriteStatepointsForGC : public ModulePass {
static char ID; // Pass identification, replacement for typeid
@@ -141,24 +131,25 @@ ModulePass *llvm::createRewriteStatepointsForGCPass() {
INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
"Make relocations explicit at statepoints", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
"Make relocations explicit at statepoints", false, false)
namespace {
struct GCPtrLivenessData {
/// Values defined in this block.
- DenseMap<BasicBlock *, DenseSet<Value *>> KillSet;
+ MapVector<BasicBlock *, SetVector<Value *>> KillSet;
/// Values used in this block (and thus live); does not include values
/// killed within this block.
- DenseMap<BasicBlock *, DenseSet<Value *>> LiveSet;
+ MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
/// Values live into this basic block (i.e. used by any
/// instruction in this basic block or ones reachable from here)
- DenseMap<BasicBlock *, DenseSet<Value *>> LiveIn;
+ MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
/// Values live out of this basic block (i.e. live into
/// any successor block)
- DenseMap<BasicBlock *, DenseSet<Value *>> LiveOut;
+ MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
};
// The type of the internal cache used inside the findBasePointers family
@@ -171,9 +162,9 @@ struct GCPtrLivenessData {
// Generally, after the execution of a full findBasePointer call, only the
// base relation will remain. Internally, we add a mixture of the two
// types, then update all the second type to the first type
-typedef DenseMap<Value *, Value *> DefiningValueMapTy;
-typedef DenseSet<Value *> StatepointLiveSetTy;
-typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>>
+typedef MapVector<Value *, Value *> DefiningValueMapTy;
+typedef SetVector<Value *> StatepointLiveSetTy;
+typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>>
RematerializedValueMapTy;
struct PartiallyConstructedSafepointRecord {
@@ -181,7 +172,7 @@ struct PartiallyConstructedSafepointRecord {
StatepointLiveSetTy LiveSet;
/// Mapping from live pointers to a base-defining-value
- DenseMap<Value *, Value *> PointerToBase;
+ MapVector<Value *, Value *> PointerToBase;
/// The *new* gc.statepoint instruction itself. This produces the token
/// that normal path gc.relocates and the gc.result are tied to.
@@ -199,9 +190,8 @@ struct PartiallyConstructedSafepointRecord {
}
static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
- assert(UseDeoptBundles && "Should not be called otherwise!");
-
- Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt");
+ Optional<OperandBundleUse> DeoptBundle =
+ CS.getOperandBundle(LLVMContext::OB_deopt);
if (!DeoptBundle.hasValue()) {
assert(AllowStatepointWithNoDeoptInfo &&
@@ -229,7 +219,7 @@ static bool isGCPointerType(Type *T) {
// For the sake of this example GC, we arbitrarily pick addrspace(1) as our
// GC managed heap. We know that a pointer into this heap needs to be
// updated and that no other pointer does.
- return (1 == PT->getAddressSpace());
+ return PT->getAddressSpace() == 1;
return false;
}
@@ -260,8 +250,7 @@ static bool containsGCPtrType(Type *Ty) {
if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
return containsGCPtrType(AT->getElementType());
if (StructType *ST = dyn_cast<StructType>(Ty))
- return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
- containsGCPtrType);
+ return any_of(ST->subtypes(), containsGCPtrType);
return false;
}
@@ -273,19 +262,6 @@ static bool isUnhandledGCPointerType(Type *Ty) {
}
#endif
-static bool order_by_name(Value *a, Value *b) {
- if (a->hasName() && b->hasName()) {
- return -1 == a->getName().compare(b->getName());
- } else if (a->hasName() && !b->hasName()) {
- return true;
- } else if (!a->hasName() && b->hasName()) {
- return false;
- } else {
- // Better than nothing, but not stable
- return a < b;
- }
-}
-
// Return the name of the value suffixed with the provided value, or if the
// value didn't have a name, the default value specified.
static std::string suffixed_name_or(Value *V, StringRef Suffix,
@@ -297,30 +273,25 @@ static std::string suffixed_name_or(Value *V, StringRef Suffix,
// given instruction. The analysis is performed immediately before the
// given instruction. Values defined by that instruction are not considered
// live. Values used by that instruction are considered live.
-static void analyzeParsePointLiveness(
- DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData,
- const CallSite &CS, PartiallyConstructedSafepointRecord &result) {
- Instruction *inst = CS.getInstruction();
+static void
+analyzeParsePointLiveness(DominatorTree &DT,
+ GCPtrLivenessData &OriginalLivenessData, CallSite CS,
+ PartiallyConstructedSafepointRecord &Result) {
+ Instruction *Inst = CS.getInstruction();
StatepointLiveSetTy LiveSet;
- findLiveSetAtInst(inst, OriginalLivenessData, LiveSet);
+ findLiveSetAtInst(Inst, OriginalLivenessData, LiveSet);
if (PrintLiveSet) {
- // Note: This output is used by several of the test cases
- // The order of elements in a set is not stable, put them in a vec and sort
- // by name
- SmallVector<Value *, 64> Temp;
- Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end());
- std::sort(Temp.begin(), Temp.end(), order_by_name);
- errs() << "Live Variables:\n";
- for (Value *V : Temp)
+ dbgs() << "Live Variables:\n";
+ for (Value *V : LiveSet)
dbgs() << " " << V->getName() << " " << *V << "\n";
}
if (PrintLiveSetSize) {
- errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
- errs() << "Number live values: " << LiveSet.size() << "\n";
+ dbgs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+ dbgs() << "Number live values: " << LiveSet.size() << "\n";
}
- result.LiveSet = LiveSet;
+ Result.LiveSet = LiveSet;
}
static bool isKnownBaseResult(Value *V);
@@ -372,8 +343,10 @@ findBaseDefiningValueOfVector(Value *I) {
return BaseDefiningValueResult(I, true);
if (isa<Constant>(I))
- // Constant vectors consist only of constant pointers.
- return BaseDefiningValueResult(I, true);
+ // The base of a constant vector consists only of constant null pointers.
+ // For the reasoning, see the similar case inside 'findBaseDefiningValue'.
+ return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
+ true);
if (isa<LoadInst>(I))
return BaseDefiningValueResult(I, true);
@@ -415,14 +388,20 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
// We should have never reached here if this argument isn't an gc value
return BaseDefiningValueResult(I, true);
- if (isa<Constant>(I))
+ if (isa<Constant>(I)) {
// We assume that objects with a constant base (e.g. a global) can't move
// and don't need to be reported to the collector because they are always
- // live. All constants have constant bases. Besides global references, all
- // kinds of constants (e.g. undef, constant expressions, null pointers) can
- // be introduced by the inliner or the optimizer, especially on dynamically
- // dead paths. See e.g. test4 in constants.ll.
- return BaseDefiningValueResult(I, true);
+ // live. Besides global references, all kinds of constants (e.g. undef,
+ // constant expressions, null pointers) can be introduced by the inliner or
+ // the optimizer, especially on dynamically dead paths.
+ // Here we treat all of them as having a single null base. By doing this we
+ // try to avoid problems reporting various conflicts of the form
+ // "phi (const1, const2)" or "phi (const, regular gc ptr)".
+ // See the constant.ll file for relevant test cases.
+
+ return BaseDefiningValueResult(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
+ }
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Def = CI->stripPointerCasts();
@@ -570,30 +549,36 @@ class BDVState {
public:
enum Status { Unknown, Base, Conflict };
- BDVState(Status s, Value *b = nullptr) : status(s), base(b) {
- assert(status != Base || b);
+ BDVState() : Status(Unknown), BaseValue(nullptr) {}
+
+ explicit BDVState(Status Status, Value *BaseValue = nullptr)
+ : Status(Status), BaseValue(BaseValue) {
+ assert(Status != Base || BaseValue);
}
- explicit BDVState(Value *b) : status(Base), base(b) {}
- BDVState() : status(Unknown), base(nullptr) {}
- Status getStatus() const { return status; }
- Value *getBase() const { return base; }
+ explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
+
+ Status getStatus() const { return Status; }
+ Value *getBaseValue() const { return BaseValue; }
bool isBase() const { return getStatus() == Base; }
bool isUnknown() const { return getStatus() == Unknown; }
bool isConflict() const { return getStatus() == Conflict; }
- bool operator==(const BDVState &other) const {
- return base == other.base && status == other.status;
+ bool operator==(const BDVState &Other) const {
+ return BaseValue == Other.BaseValue && Status == Other.Status;
}
bool operator!=(const BDVState &other) const { return !(*this == other); }
LLVM_DUMP_METHOD
- void dump() const { print(dbgs()); dbgs() << '\n'; }
-
+ void dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+ }
+
void print(raw_ostream &OS) const {
- switch (status) {
+ switch (getStatus()) {
case Unknown:
OS << "U";
break;
@@ -604,13 +589,13 @@ public:
OS << "C";
break;
};
- OS << " (" << base << " - "
- << (base ? base->getName() : "nullptr") << "): ";
+ OS << " (" << getBaseValue() << " - "
+ << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
}
private:
- Status status;
- AssertingVH<Value> base; // non null only if status == base
+ Status Status;
+ AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
};
}
@@ -621,75 +606,50 @@ static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
}
#endif
-namespace {
-// Values of type BDVState form a lattice, and this is a helper
-// class that implementes the meet operation. The meat of the meet
-// operation is implemented in MeetBDVStates::pureMeet
-class MeetBDVStates {
-public:
- /// Initializes the currentResult to the TOP state so that if can be met with
- /// any other state to produce that state.
- MeetBDVStates() {}
-
- // Destructively meet the current result with the given BDVState
- void meetWith(BDVState otherState) {
- currentResult = meet(otherState, currentResult);
- }
+static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
+ switch (LHS.getStatus()) {
+ case BDVState::Unknown:
+ return RHS;
- BDVState getResult() const { return currentResult; }
+ case BDVState::Base:
+ assert(LHS.getBaseValue() && "can't be null");
+ if (RHS.isUnknown())
+ return LHS;
-private:
- BDVState currentResult;
-
- /// Perform a meet operation on two elements of the BDVState lattice.
- static BDVState meet(BDVState LHS, BDVState RHS) {
- assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) &&
- "math is wrong: meet does not commute!");
- BDVState Result = pureMeet(LHS, RHS);
- DEBUG(dbgs() << "meet of " << LHS << " with " << RHS
- << " produced " << Result << "\n");
- return Result;
- }
-
- static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) {
- switch (stateA.getStatus()) {
- case BDVState::Unknown:
- return stateB;
-
- case BDVState::Base:
- assert(stateA.getBase() && "can't be null");
- if (stateB.isUnknown())
- return stateA;
-
- if (stateB.isBase()) {
- if (stateA.getBase() == stateB.getBase()) {
- assert(stateA == stateB && "equality broken!");
- return stateA;
- }
- return BDVState(BDVState::Conflict);
+ if (RHS.isBase()) {
+ if (LHS.getBaseValue() == RHS.getBaseValue()) {
+ assert(LHS == RHS && "equality broken!");
+ return LHS;
}
- assert(stateB.isConflict() && "only three states!");
return BDVState(BDVState::Conflict);
-
- case BDVState::Conflict:
- return stateA;
}
- llvm_unreachable("only three states!");
+ assert(RHS.isConflict() && "only three states!");
+ return BDVState(BDVState::Conflict);
+
+ case BDVState::Conflict:
+ return LHS;
}
-};
+ llvm_unreachable("only three states!");
}
+// Values of type BDVState form a lattice, and this function implements the meet
+// operation.
+static BDVState meetBDVState(BDVState LHS, BDVState RHS) {
+ BDVState Result = meetBDVStateImpl(LHS, RHS);
+ assert(Result == meetBDVStateImpl(RHS, LHS) &&
+ "Math is wrong: meet does not commute!");
+ return Result;
+}
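Read as a lattice, the three states order with Unknown as top and Conflict as bottom, and meetBDVStateImpl computes the greatest lower bound of that order. Below is a minimal standalone sketch of the same rule, using a plain enum and an int in place of LLVM's BDVState and Value*; every name in it is illustrative only and not part of the pass.

#include <cassert>
#include <cstdio>

// Simplified model of the three-point lattice: Unknown is top, Conflict is
// bottom, and Base carries the base it stands for (an int instead of Value*).
enum class Status { Unknown, Base, Conflict };
struct State {
  Status S;
  int BaseId;
  bool operator==(const State &O) const { return S == O.S && BaseId == O.BaseId; }
};

static State meet(State L, State R) {
  if (L.S == Status::Unknown) return R;            // top meets x -> x
  if (R.S == Status::Unknown) return L;
  if (L.S == Status::Conflict || R.S == Status::Conflict)
    return {Status::Conflict, 0};                  // bottom absorbs everything
  // Both Base: the same base is preserved, distinct bases collapse to Conflict.
  return L.BaseId == R.BaseId ? L : State{Status::Conflict, 0};
}

int main() {
  State U{Status::Unknown, 0}, C{Status::Conflict, 0};
  State A{Status::Base, 1}, B{Status::Base, 2};
  assert(meet(U, A) == A);                  // Unknown is the identity
  assert(meet(A, A) == A);                  // equal bases stay Base
  assert(meet(A, B).S == Status::Conflict); // distinct bases conflict
  assert(meet(C, A).S == Status::Conflict); // Conflict is absorbing
  assert(meet(A, B) == meet(B, A));         // commutes, as the assert above requires
  std::puts("meet lattice behaves as expected");
}

Keeping the commutativity check inside an assert, as the patch does, costs nothing in release builds while documenting the algebraic property the fixed-point loop below depends on.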
-/// For a given value or instruction, figure out what base ptr it's derived
-/// from. For gc objects, this is simply itself. On success, returns a value
-/// which is the base pointer. (This is reliable and can be used for
-/// relocation.) On failure, returns nullptr.
-static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
- Value *def = findBaseOrBDV(I, cache);
+/// For a given value or instruction, figure out what base ptr it's derived from.
+/// For gc objects, this is simply itself. On success, returns a value which is
+/// the base pointer. (This is reliable and can be used for relocation.) On
+/// failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseOrBDV(I, Cache);
- if (isKnownBaseResult(def)) {
- return def;
- }
+ if (isKnownBaseResult(Def))
+ return Def;
// Here's the rough algorithm:
// - For every SSA value, construct a mapping to either an actual base
@@ -731,14 +691,14 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// one for which we don't already know a definite base value
/* scope */ {
SmallVector<Value*, 16> Worklist;
- Worklist.push_back(def);
- States.insert(std::make_pair(def, BDVState()));
+ Worklist.push_back(Def);
+ States.insert({Def, BDVState()});
while (!Worklist.empty()) {
Value *Current = Worklist.pop_back_val();
assert(!isKnownBaseResult(Current) && "why did it get added?");
auto visitIncomingValue = [&](Value *InVal) {
- Value *Base = findBaseOrBDV(InVal, cache);
+ Value *Base = findBaseOrBDV(InVal, Cache);
if (isKnownBaseResult(Base))
// Known bases won't need new instructions introduced and can be
// ignored safely
@@ -748,12 +708,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
if (States.insert(std::make_pair(Base, BDVState())).second)
Worklist.push_back(Base);
};
- if (PHINode *Phi = dyn_cast<PHINode>(Current)) {
- for (Value *InVal : Phi->incoming_values())
+ if (PHINode *PN = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : PN->incoming_values())
visitIncomingValue(InVal);
- } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) {
- visitIncomingValue(Sel->getTrueValue());
- visitIncomingValue(Sel->getFalseValue());
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(SI->getTrueValue());
+ visitIncomingValue(SI->getFalseValue());
} else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
visitIncomingValue(EE->getVectorOperand());
} else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
@@ -762,7 +722,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
} else {
// There is one class of instructions we know we don't handle.
assert(isa<ShuffleVectorInst>(Current));
- llvm_unreachable("unimplemented instruction case");
+ llvm_unreachable("Unimplemented instruction case");
}
}
}
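The scope above is a standard visited-set worklist closure: start from Def and enqueue an operand's base defining value only when States.insert(...) reports it as newly seen, so each node is processed once. A minimal standalone sketch of that idiom over a toy dependency graph follows; plain ints and std::map stand in for the pass's values and States map, and nothing here is code from the pass.

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Toy def graph: node -> the nodes it is defined in terms of.
  std::map<int, std::vector<int>> Inputs = {{1, {2, 3}}, {2, {3}}, {3, {}}};

  std::map<int, bool> Visited;  // plays the role of the States map
  std::vector<int> Worklist;

  Worklist.push_back(1);
  Visited.insert({1, true});
  while (!Worklist.empty()) {
    int Current = Worklist.back();
    Worklist.pop_back();
    for (int In : Inputs[Current])
      if (Visited.insert({In, true}).second)  // only enqueue newly seen nodes
        Worklist.push_back(In);
  }
  std::printf("visited %zu nodes\n", Visited.size());  // prints: visited 3 nodes
}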
@@ -784,12 +744,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
return I->second;
};
- bool progress = true;
- while (progress) {
+ bool Progress = true;
+ while (Progress) {
#ifndef NDEBUG
- const size_t oldSize = States.size();
+ const size_t OldSize = States.size();
#endif
- progress = false;
+ Progress = false;
// We're only changing values in this loop, so it is safe to keep iterators.
// Since this is computing a fixed point, the order of visit does not
// affect the result. TODO: We could use a worklist here and make this run
@@ -801,38 +761,39 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// Given an input value for the current instruction, return a BDVState
// instance which represents the BDV of that value.
auto getStateForInput = [&](Value *V) mutable {
- Value *BDV = findBaseOrBDV(V, cache);
+ Value *BDV = findBaseOrBDV(V, Cache);
return getStateForBDV(BDV);
};
- MeetBDVStates calculateMeet;
- if (SelectInst *select = dyn_cast<SelectInst>(BDV)) {
- calculateMeet.meetWith(getStateForInput(select->getTrueValue()));
- calculateMeet.meetWith(getStateForInput(select->getFalseValue()));
- } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) {
- for (Value *Val : Phi->incoming_values())
- calculateMeet.meetWith(getStateForInput(Val));
+ BDVState NewState;
+ if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
+ NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
+ } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : PN->incoming_values())
+ NewState = meetBDVState(NewState, getStateForInput(Val));
} else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
// The 'meet' for an extractelement is slightly trivial, but it's still
// useful in that it drives us to conflict if our input is.
- calculateMeet.meetWith(getStateForInput(EE->getVectorOperand()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
} else {
// Given there's an inherent type mismatch between the operands, this will
// *always* produce Conflict.
auto *IE = cast<InsertElementInst>(BDV);
- calculateMeet.meetWith(getStateForInput(IE->getOperand(0)));
- calculateMeet.meetWith(getStateForInput(IE->getOperand(1)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
}
- BDVState oldState = States[BDV];
- BDVState newState = calculateMeet.getResult();
- if (oldState != newState) {
- progress = true;
- States[BDV] = newState;
+ BDVState OldState = States[BDV];
+ if (OldState != NewState) {
+ Progress = true;
+ States[BDV] = NewState;
}
}
- assert(oldSize == States.size() &&
+ assert(OldSize == States.size() &&
"fixed point shouldn't be adding any new nodes to state");
}
@@ -842,7 +803,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
-
+
// Insert Phis for all conflicts
// TODO: adjust naming patterns to avoid this order of iteration dependency
for (auto Pair : States) {
@@ -856,14 +817,13 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// The problem is that we need to convert from a vector base to a scalar
// base for the particular index we're interested in.
if (State.isBase() && isa<ExtractElementInst>(I) &&
- isa<VectorType>(State.getBase()->getType())) {
+ isa<VectorType>(State.getBaseValue()->getType())) {
auto *EE = cast<ExtractElementInst>(I);
// TODO: In many cases, the new instruction is just EE itself. We should
// exploit this, but can't do it here since it would break the invariant
// about the BDV not being known to be a base.
- auto *BaseInst = ExtractElementInst::Create(State.getBase(),
- EE->getIndexOperand(),
- "base_ee", EE);
+ auto *BaseInst = ExtractElementInst::Create(
+ State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
States[I] = BDVState(BDVState::Base, BaseInst);
}
@@ -871,10 +831,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// Since we're joining a vector and scalar base, they can never be the
// same. As a result, we should always see insert element having reached
// the conflict state.
- if (isa<InsertElementInst>(I)) {
- assert(State.isConflict());
- }
-
+ assert(!isa<InsertElementInst>(I) || State.isConflict());
+
if (!State.isConflict())
continue;
@@ -887,12 +845,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
assert(NumPreds > 0 && "how did we reach here");
std::string Name = suffixed_name_or(I, ".base", "base_phi");
return PHINode::Create(I->getType(), NumPreds, Name, I);
- } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
// The undef will be replaced later
- UndefValue *Undef = UndefValue::get(Sel->getType());
+ UndefValue *Undef = UndefValue::get(SI->getType());
std::string Name = suffixed_name_or(I, ".base", "base_select");
- return SelectInst::Create(Sel->getCondition(), Undef,
- Undef, Name, Sel);
+ return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
std::string Name = suffixed_name_or(I, ".base", "base_ee");
@@ -906,7 +863,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
return InsertElementInst::Create(VecUndef, ScalarUndef,
IE->getOperand(2), Name, IE);
}
-
};
Instruction *BaseInst = MakeBaseInstPlaceholder(I);
// Add metadata marking this as a base value
@@ -921,24 +877,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// instruction to propagate the base of its BDV and have entered that newly
// introduced instruction into the state table. In either case, we are
// assured to be able to determine an instruction which produces its base
- // pointer.
+ // pointer.
auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
- Value *BDV = findBaseOrBDV(Input, cache);
+ Value *BDV = findBaseOrBDV(Input, Cache);
Value *Base = nullptr;
if (isKnownBaseResult(BDV)) {
Base = BDV;
} else {
// Either conflict or base.
assert(States.count(BDV));
- Base = States[BDV].getBase();
+ Base = States[BDV].getBaseValue();
}
- assert(Base && "can't be null");
+ assert(Base && "Can't be null");
// The cast is needed since base traversal may strip away bitcasts
- if (Base->getType() != Input->getType() &&
- InsertPt) {
- Base = new BitCastInst(Base, Input->getType(), "cast",
- InsertPt);
- }
+ if (Base->getType() != Input->getType() && InsertPt)
+ Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
return Base;
};
@@ -954,12 +907,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
if (!State.isConflict())
continue;
- if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) {
- PHINode *phi = cast<PHINode>(BDV);
- unsigned NumPHIValues = phi->getNumIncomingValues();
+ if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
+ PHINode *PN = cast<PHINode>(BDV);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
for (unsigned i = 0; i < NumPHIValues; i++) {
- Value *InVal = phi->getIncomingValue(i);
- BasicBlock *InBB = phi->getIncomingBlock(i);
+ Value *InVal = PN->getIncomingValue(i);
+ BasicBlock *InBB = PN->getIncomingBlock(i);
// If we've already seen InBB, add the same incoming value
// we added for it earlier. The IR verifier requires phi
@@ -970,22 +923,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// bitcasts (and hence two distinct values) as incoming
// values for the same basic block.
- int blockIndex = basephi->getBasicBlockIndex(InBB);
- if (blockIndex != -1) {
- Value *oldBase = basephi->getIncomingValue(blockIndex);
- basephi->addIncoming(oldBase, InBB);
-
+ int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
+ if (BlockIndex != -1) {
+ Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
+ BasePHI->addIncoming(OldBase, InBB);
+
#ifndef NDEBUG
Value *Base = getBaseForInput(InVal, nullptr);
- // In essence this assert states: the only way two
- // values incoming from the same basic block may be
- // different is by being different bitcasts of the same
- // value. A cleanup that remains TODO is changing
- // findBaseOrBDV to return an llvm::Value of the correct
- // type (and still remain pure). This will remove the
- // need to add bitcasts.
- assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() &&
- "sanity -- findBaseOrBDV should be pure!");
+ // In essence this assert states: the only way two values
+ // incoming from the same basic block may be different is by
+ // being different bitcasts of the same value. A cleanup
+ // that remains TODO is changing findBaseOrBDV to return an
+ // llvm::Value of the correct type (and still remain pure).
+ // This will remove the need to add bitcasts.
+ assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
+ "Sanity -- findBaseOrBDV should be pure!");
#endif
continue;
}
@@ -994,28 +946,25 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// need to insert a bitcast in the incoming block.
// TODO: Need to split critical edges if insertion is needed
Value *Base = getBaseForInput(InVal, InBB->getTerminator());
- basephi->addIncoming(Base, InBB);
+ BasePHI->addIncoming(Base, InBB);
}
- assert(basephi->getNumIncomingValues() == NumPHIValues);
- } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) {
- SelectInst *Sel = cast<SelectInst>(BDV);
- // Operand 1 & 2 are true, false path respectively. TODO: refactor to
- // something more safe and less hacky.
- for (int i = 1; i <= 2; i++) {
- Value *InVal = Sel->getOperand(i);
- // Find the instruction which produces the base for each input. We may
- // need to insert a bitcast.
- Value *Base = getBaseForInput(InVal, BaseSel);
- BaseSel->setOperand(i, Base);
- }
- } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) {
+ assert(BasePHI->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSI =
+ dyn_cast<SelectInst>(State.getBaseValue())) {
+ SelectInst *SI = cast<SelectInst>(BDV);
+
+ // Find the instruction which produces the base for each input.
+ // We may need to insert a bitcast.
+ BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
+ BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
+ } else if (auto *BaseEE =
+ dyn_cast<ExtractElementInst>(State.getBaseValue())) {
Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
// Find the instruction which produces the base for each input. We may
// need to insert a bitcast.
- Value *Base = getBaseForInput(InVal, BaseEE);
- BaseEE->setOperand(0, Base);
+ BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
} else {
- auto *BaseIE = cast<InsertElementInst>(State.getBase());
+ auto *BaseIE = cast<InsertElementInst>(State.getBaseValue());
auto *BdvIE = cast<InsertElementInst>(BDV);
auto UpdateOperand = [&](int OperandIdx) {
Value *InVal = BdvIE->getOperand(OperandIdx);
@@ -1025,69 +974,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
UpdateOperand(0); // vector operand
UpdateOperand(1); // scalar operand
}
-
- }
-
- // Now that we're done with the algorithm, see if we can optimize the
- // results slightly by reducing the number of new instructions needed.
- // Arguably, this should be integrated into the algorithm above, but
- // doing as a post process step is easier to reason about for the moment.
- DenseMap<Value *, Value *> ReverseMap;
- SmallPtrSet<Instruction *, 16> NewInsts;
- SmallSetVector<AssertingVH<Instruction>, 16> Worklist;
- // Note: We need to visit the states in a deterministic order. We uses the
- // Keys we sorted above for this purpose. Note that we are papering over a
- // bigger problem with the algorithm above - it's visit order is not
- // deterministic. A larger change is needed to fix this.
- for (auto Pair : States) {
- auto *BDV = Pair.first;
- auto State = Pair.second;
- Value *Base = State.getBase();
- assert(BDV && Base);
- assert(!isKnownBaseResult(BDV) && "why did it get added?");
- assert(isKnownBaseResult(Base) &&
- "must be something we 'know' is a base pointer");
- if (!State.isConflict())
- continue;
-
- ReverseMap[Base] = BDV;
- if (auto *BaseI = dyn_cast<Instruction>(Base)) {
- NewInsts.insert(BaseI);
- Worklist.insert(BaseI);
- }
- }
- auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI,
- Value *Replacement) {
- // Add users which are new instructions (excluding self references)
- for (User *U : BaseI->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- if (NewInsts.count(UI) && UI != BaseI)
- Worklist.insert(UI);
- // Then do the actual replacement
- NewInsts.erase(BaseI);
- ReverseMap.erase(BaseI);
- BaseI->replaceAllUsesWith(Replacement);
- assert(States.count(BDV));
- assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI);
- States[BDV] = BDVState(BDVState::Conflict, Replacement);
- BaseI->eraseFromParent();
- };
- const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout();
- while (!Worklist.empty()) {
- Instruction *BaseI = Worklist.pop_back_val();
- assert(NewInsts.count(BaseI));
- Value *Bdv = ReverseMap[BaseI];
- if (auto *BdvI = dyn_cast<Instruction>(Bdv))
- if (BaseI->isIdenticalTo(BdvI)) {
- DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n");
- ReplaceBaseInstWith(Bdv, BaseI, Bdv);
- continue;
- }
- if (Value *V = SimplifyInstruction(BaseI, DL)) {
- DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n");
- ReplaceBaseInstWith(Bdv, BaseI, V);
- continue;
- }
}
// Cache all of our results so we can cheaply reuse them
@@ -1095,25 +981,27 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// relation and one of the base pointer relation! FIXME
for (auto Pair : States) {
auto *BDV = Pair.first;
- Value *base = Pair.second.getBase();
- assert(BDV && base);
+ Value *Base = Pair.second.getBaseValue();
+ assert(BDV && Base);
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
- std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none";
DEBUG(dbgs() << "Updating base value cache"
- << " for: " << BDV->getName()
- << " from: " << fromstr
- << " to: " << base->getName() << "\n");
-
- if (cache.count(BDV)) {
- // Once we transition from the BDV relation being store in the cache to
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
+
+ if (Cache.count(BDV)) {
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+ // Once we transition from the BDV relation being stored in the Cache to
// the base relation being stored, it must be stable
- assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) &&
+ assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
"base relation should be stable");
}
- cache[BDV] = base;
+ Cache[BDV] = Base;
}
- assert(cache.count(def));
- return cache[def];
+ assert(Cache.count(Def));
+ return Cache[Def];
}
// For a set of live pointers (base and/or derived), identify the base
@@ -1133,15 +1021,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
// pointer was a base pointer.
static void
findBasePointers(const StatepointLiveSetTy &live,
- DenseMap<Value *, Value *> &PointerToBase,
+ MapVector<Value *, Value *> &PointerToBase,
DominatorTree *DT, DefiningValueMapTy &DVCache) {
- // For the naming of values inserted to be deterministic - which makes for
- // much cleaner and more stable tests - we need to assign an order to the
- // live values. DenseSets do not provide a deterministic order across runs.
- SmallVector<Value *, 64> Temp;
- Temp.insert(Temp.end(), live.begin(), live.end());
- std::sort(Temp.begin(), Temp.end(), order_by_name);
- for (Value *ptr : Temp) {
+ for (Value *ptr : live) {
Value *base = findBasePointer(ptr, DVCache);
assert(base && "failed to find base pointer");
PointerToBase[ptr] = base;
@@ -1149,41 +1031,24 @@ findBasePointers(const StatepointLiveSetTy &live,
DT->dominates(cast<Instruction>(base)->getParent(),
cast<Instruction>(ptr)->getParent())) &&
"The base we found better dominate the derived pointer");
-
- // If you see this trip and like to live really dangerously, the code should
- // be correct, just with idioms the verifier can't handle. You can try
- // disabling the verifier at your own substantial risk.
- assert(!isa<ConstantPointerNull>(base) &&
- "the relocation code needs adjustment to handle the relocation of "
- "a null pointer constant without causing false positives in the "
- "safepoint ir verifier.");
}
}
/// Find the required base pointers (and adjust the live set) for the given
/// parse point.
static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
- const CallSite &CS,
+ CallSite CS,
PartiallyConstructedSafepointRecord &result) {
- DenseMap<Value *, Value *> PointerToBase;
+ MapVector<Value *, Value *> PointerToBase;
findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
if (PrintBasePointers) {
- // Note: Need to print these in a stable order since this is checked in
- // some tests.
errs() << "Base Pairs (w/o Relocation):\n";
- SmallVector<Value *, 64> Temp;
- Temp.reserve(PointerToBase.size());
- for (auto Pair : PointerToBase) {
- Temp.push_back(Pair.first);
- }
- std::sort(Temp.begin(), Temp.end(), order_by_name);
- for (Value *Ptr : Temp) {
- Value *Base = PointerToBase[Ptr];
+ for (auto &Pair : PointerToBase) {
errs() << " derived ";
- Ptr->printAsOperand(errs(), false);
+ Pair.first->printAsOperand(errs(), false);
errs() << " base ";
- Base->printAsOperand(errs(), false);
+ Pair.second->printAsOperand(errs(), false);
errs() << "\n";
}
}
@@ -1194,7 +1059,7 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
/// Given an updated version of the dataflow liveness results, update the
/// liveset and base pointer maps for the call site CS.
static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- const CallSite &CS,
+ CallSite CS,
PartiallyConstructedSafepointRecord &result);
static void recomputeLiveInValues(
@@ -1206,8 +1071,7 @@ static void recomputeLiveInValues(
computeLiveInValues(DT, F, RevisedLivenessData);
for (size_t i = 0; i < records.size(); i++) {
struct PartiallyConstructedSafepointRecord &info = records[i];
- const CallSite &CS = toUpdate[i];
- recomputeLiveInValues(RevisedLivenessData, CS, info);
+ recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
}
}
@@ -1257,8 +1121,7 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
// These attributes control the generation of the gc.statepoint call /
// invoke itself; and once the gc.statepoint is in place, they're of no
// use.
- if (Attr.hasAttribute("statepoint-num-patch-bytes") ||
- Attr.hasAttribute("statepoint-id"))
+ if (isStatepointDirectiveAttr(Attr))
continue;
Ret = Ret.addAttributes(
@@ -1349,11 +1212,37 @@ namespace {
class DeferredReplacement {
AssertingVH<Instruction> Old;
AssertingVH<Instruction> New;
+ bool IsDeoptimize = false;
+
+ DeferredReplacement() {}
public:
- explicit DeferredReplacement(Instruction *Old, Instruction *New) :
- Old(Old), New(New) {
- assert(Old != New && "Not allowed!");
+ static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
+ assert(Old != New && Old && New &&
+ "Cannot RAUW equal values or to / from null!");
+
+ DeferredReplacement D;
+ D.Old = Old;
+ D.New = New;
+ return D;
+ }
+
+ static DeferredReplacement createDelete(Instruction *ToErase) {
+ DeferredReplacement D;
+ D.Old = ToErase;
+ return D;
+ }
+
+ static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
+#ifndef NDEBUG
+ auto *F = cast<CallInst>(Old)->getCalledFunction();
+ assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
+ "Only way to construct a deoptimize deferred replacement");
+#endif
+ DeferredReplacement D;
+ D.Old = Old;
+ D.IsDeoptimize = true;
+ return D;
}
/// Does the task represented by this instance.
@@ -1362,12 +1251,23 @@ public:
Instruction *NewI = New;
assert(OldI != NewI && "Disallowed at construction?!");
+ assert((!IsDeoptimize || !New) &&
+ "Deoptimize instrinsics are not replaced!");
Old = nullptr;
New = nullptr;
if (NewI)
OldI->replaceAllUsesWith(NewI);
+
+ if (IsDeoptimize) {
+ // Note: we've inserted instructions, so the call to llvm.deoptimize may
+ // not necessarily be followed by the matching return.
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
+ new UnreachableInst(RI->getContext(), RI);
+ RI->eraseFromParent();
+ }
+
OldI->eraseFromParent();
}
};
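The deferral exists because, as the comment further down notes, the original call cannot be RAUW'd or erased while later rewriting steps still hold raw pointers to it; the replacement is recorded now and applied once everything else is done. A minimal standalone sketch of that record-now, apply-later pattern is below, with a toy Node type standing in for llvm::Instruction; all names in it are illustrative only.

#include <cstdio>
#include <string>
#include <vector>

// Toy stand-in for llvm::Instruction; ReplacedBy/Erased model RAUW and erase.
struct Node { std::string Name; Node *ReplacedBy; bool Erased; };

// Record a replacement while rewriting; apply it only after rewriting is done.
class Deferred {
  Node *Old = nullptr;
  Node *New = nullptr; // null means "just erase Old"
public:
  static Deferred createRAUW(Node *O, Node *N) { Deferred D; D.Old = O; D.New = N; return D; }
  static Deferred createDelete(Node *O) { Deferred D; D.Old = O; return D; }
  void apply() {
    if (New) Old->ReplacedBy = New; // replaceAllUsesWith in the real pass
    Old->Erased = true;             // eraseFromParent in the real pass
  }
};

int main() {
  Node OldCall{"old.call", nullptr, false}, GCResult{"gc.result", nullptr, false};
  std::vector<Deferred> Replacements;
  Replacements.push_back(Deferred::createRAUW(&OldCall, &GCResult));
  // ... further rewriting that must still be able to inspect OldCall ...
  for (Deferred &R : Replacements)
    R.apply();
  std::printf("%s erased=%d\n", OldCall.Name.c_str(), (int)OldCall.Erased);
}

The deoptimize flavour in the patch adds only one extra step at apply time: swapping the return that follows the call for an unreachable before erasing the old call.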
@@ -1380,8 +1280,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
PartiallyConstructedSafepointRecord &Result,
std::vector<DeferredReplacement> &Replacements) {
assert(BasePtrs.size() == LiveVariables.size());
- assert((UseDeoptBundles || isStatepoint(CS)) &&
- "This method expects to be rewriting a statepoint");
// Then go ahead and use the builder do actually do the inserts. We insert
// immediately before the previous instruction under the assumption that all
@@ -1391,47 +1289,53 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
IRBuilder<> Builder(InsertBefore);
ArrayRef<Value *> GCArgs(LiveVariables);
- uint64_t StatepointID = 0xABCDEF00;
+ uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
uint32_t NumPatchBytes = 0;
uint32_t Flags = uint32_t(StatepointFlags::None);
- ArrayRef<Use> CallArgs;
- ArrayRef<Use> DeoptArgs;
+ ArrayRef<Use> CallArgs(CS.arg_begin(), CS.arg_end());
+ ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(CS);
ArrayRef<Use> TransitionArgs;
-
- Value *CallTarget = nullptr;
-
- if (UseDeoptBundles) {
- CallArgs = {CS.arg_begin(), CS.arg_end()};
- DeoptArgs = GetDeoptBundleOperands(CS);
- // TODO: we don't fill in TransitionArgs or Flags in this branch, but we
- // could have an operand bundle for that too.
- AttributeSet OriginalAttrs = CS.getAttributes();
-
- Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex,
- "statepoint-id");
- if (AttrID.isStringAttribute())
- AttrID.getValueAsString().getAsInteger(10, StatepointID);
-
- Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
- AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
- if (AttrNumPatchBytes.isStringAttribute())
- AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
-
- CallTarget = CS.getCalledValue();
- } else {
- // This branch will be gone soon, and we will soon only support the
- // UseDeoptBundles == true configuration.
- Statepoint OldSP(CS);
- StatepointID = OldSP.getID();
- NumPatchBytes = OldSP.getNumPatchBytes();
- Flags = OldSP.getFlags();
-
- CallArgs = {OldSP.arg_begin(), OldSP.arg_end()};
- DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()};
- TransitionArgs = {OldSP.gc_transition_args_begin(),
- OldSP.gc_transition_args_end()};
- CallTarget = OldSP.getCalledValue();
+ if (auto TransitionBundle =
+ CS.getOperandBundle(LLVMContext::OB_gc_transition)) {
+ Flags |= uint32_t(StatepointFlags::GCTransition);
+ TransitionArgs = TransitionBundle->Inputs;
+ }
+
+ // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
+ // with a return value, we lower them as never-returning calls to
+ // __llvm_deoptimize that are followed by unreachable to get better codegen.
+ bool IsDeoptimize = false;
+
+ StatepointDirectives SD =
+ parseStatepointDirectivesFromAttrs(CS.getAttributes());
+ if (SD.NumPatchBytes)
+ NumPatchBytes = *SD.NumPatchBytes;
+ if (SD.StatepointID)
+ StatepointID = *SD.StatepointID;
+
+ Value *CallTarget = CS.getCalledValue();
+ if (Function *F = dyn_cast<Function>(CallTarget)) {
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
+ // Calls to llvm.experimental.deoptimize are lowered to calls to the
+ // __llvm_deoptimize symbol. We want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ // Note: CallTarget can be a bitcast instruction of a symbol if there are
+ // calls to @llvm.experimental.deoptimize with different argument types in
+ // the same module. This is fine -- we assume the frontend knew what it
+ // was doing when generating this kind of IR.
+ CallTarget =
+ F->getParent()->getOrInsertFunction("__llvm_deoptimize", FTy);
+
+ IsDeoptimize = true;
+ }
}
// Create the statepoint given all the arguments
@@ -1514,7 +1418,13 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
}
assert(Token && "Should be set in one of the above branches!");
- if (UseDeoptBundles) {
+ if (IsDeoptimize) {
+ // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
+ // transform the tail-call like structure to a call to a void function
+ // followed by unreachable to get better codegen.
+ Replacements.push_back(
+ DeferredReplacement::createDeoptimizeReplacement(CS.getInstruction()));
+ } else {
Token->setName("statepoint_token");
if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
StringRef Name =
@@ -1528,24 +1438,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// llvm::Instruction. Instead, we defer the replacement and deletion to
// after the live sets have been made explicit in the IR, and we no longer
// have raw pointers to worry about.
- Replacements.emplace_back(CS.getInstruction(), GCResult);
+ Replacements.emplace_back(
+ DeferredReplacement::createRAUW(CS.getInstruction(), GCResult));
} else {
- Replacements.emplace_back(CS.getInstruction(), nullptr);
+ Replacements.emplace_back(
+ DeferredReplacement::createDelete(CS.getInstruction()));
}
- } else {
- assert(!CS.getInstruction()->hasNUsesOrMore(2) &&
- "only valid use before rewrite is gc.result");
- assert(!CS.getInstruction()->hasOneUse() ||
- isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin())));
-
- // Take the name of the original statepoint token if there was one.
- Token->takeName(CS.getInstruction());
-
- // Update the gc.result of the original statepoint (if any) to use the newly
- // inserted statepoint. This is safe to do here since the token can't be
- // considered a live reference.
- CS.getInstruction()->replaceAllUsesWith(Token);
- CS.getInstruction()->eraseFromParent();
}
Result.StatepointToken = Token;
@@ -1555,43 +1453,13 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
}
-namespace {
-struct NameOrdering {
- Value *Base;
- Value *Derived;
-
- bool operator()(NameOrdering const &a, NameOrdering const &b) {
- return -1 == a.Derived->getName().compare(b.Derived->getName());
- }
-};
-}
-
-static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec,
- SmallVectorImpl<Value *> &LiveVec) {
- assert(BaseVec.size() == LiveVec.size());
-
- SmallVector<NameOrdering, 64> Temp;
- for (size_t i = 0; i < BaseVec.size(); i++) {
- NameOrdering v;
- v.Base = BaseVec[i];
- v.Derived = LiveVec[i];
- Temp.push_back(v);
- }
-
- std::sort(Temp.begin(), Temp.end(), NameOrdering());
- for (size_t i = 0; i < BaseVec.size(); i++) {
- BaseVec[i] = Temp[i].Base;
- LiveVec[i] = Temp[i].Derived;
- }
-}
-
// Replace an existing gc.statepoint with a new one and a set of gc.relocates
// which make the relocations happening at this safepoint explicit.
//
// WARNING: Does not do any fixup to adjust users of the original live
// values. That's the caller's responsibility.
static void
-makeStatepointExplicit(DominatorTree &DT, const CallSite &CS,
+makeStatepointExplicit(DominatorTree &DT, CallSite CS,
PartiallyConstructedSafepointRecord &Result,
std::vector<DeferredReplacement> &Replacements) {
const auto &LiveSet = Result.LiveSet;
@@ -1609,11 +1477,6 @@ makeStatepointExplicit(DominatorTree &DT, const CallSite &CS,
}
assert(LiveVec.size() == BaseVec.size());
- // To make the output IR slightly more stable (for use in diffs), ensure a
- // fixed order of the values in the safepoint (by sorting the value name).
- // The order is otherwise meaningless.
- StabilizeOrder(BaseVec, LiveVec);
-
// Do the actual rewriting and delete the old statepoint
makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
}
@@ -1634,7 +1497,7 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
if (!Relocate)
continue;
- Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr());
+ Value *OriginalValue = Relocate->getDerivedPtr();
assert(AllocaMap.count(OriginalValue));
Value *Alloca = AllocaMap[OriginalValue];
@@ -1660,11 +1523,10 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
// Helper function for the "relocationViaAlloca". Similar to the
// "insertRelocationStores" but works for rematerialized values.
-static void
-insertRematerializationStores(
- RematerializedValueMapTy RematerializedValues,
- DenseMap<Value *, Value *> &AllocaMap,
- DenseSet<Value *> &VisitedLiveValues) {
+static void insertRematerializationStores(
+ const RematerializedValueMapTy &RematerializedValues,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
for (auto RematerializedValuePair: RematerializedValues) {
Instruction *RematerializedValue = RematerializedValuePair.first;
@@ -1691,9 +1553,8 @@ static void relocationViaAlloca(
// record initial number of (static) allocas; we'll check we have the same
// number when we get done.
int InitialAllocaNum = 0;
- for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
- I++)
- if (isa<AllocaInst>(*I))
+ for (Instruction &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
InitialAllocaNum++;
#endif
@@ -1777,8 +1638,7 @@ static void relocationViaAlloca(
auto InsertClobbersAt = [&](Instruction *IP) {
for (auto *AI : ToClobber) {
- auto AIType = cast<PointerType>(AI->getType());
- auto PT = cast<PointerType>(AIType->getElementType());
+ auto PT = cast<PointerType>(AI->getAllocatedType());
Constant *CPN = ConstantPointerNull::get(PT);
StoreInst *Store = new StoreInst(CPN, AI);
Store->insertBefore(IP);
@@ -1919,141 +1779,7 @@ static void findLiveReferences(
computeLiveInValues(DT, F, OriginalLivenessData);
for (size_t i = 0; i < records.size(); i++) {
struct PartiallyConstructedSafepointRecord &info = records[i];
- const CallSite &CS = toUpdate[i];
- analyzeParsePointLiveness(DT, OriginalLivenessData, CS, info);
- }
-}
-
-/// Remove any vector of pointers from the live set by scalarizing them over the
-/// statepoint instruction. Adds the scalarized pieces to the live set. It
-/// would be preferable to include the vector in the statepoint itself, but
-/// the lowering code currently does not handle that. Extending it would be
-/// slightly non-trivial since it requires a format change. Given how rare
-/// such cases are (for the moment?) scalarizing is an acceptable compromise.
-static void splitVectorValues(Instruction *StatepointInst,
- StatepointLiveSetTy &LiveSet,
- DenseMap<Value *, Value *>& PointerToBase,
- DominatorTree &DT) {
- SmallVector<Value *, 16> ToSplit;
- for (Value *V : LiveSet)
- if (isa<VectorType>(V->getType()))
- ToSplit.push_back(V);
-
- if (ToSplit.empty())
- return;
-
- DenseMap<Value *, SmallVector<Value *, 16>> ElementMapping;
-
- Function &F = *(StatepointInst->getParent()->getParent());
-
- DenseMap<Value *, AllocaInst *> AllocaMap;
- // First is normal return, second is exceptional return (invoke only)
- DenseMap<Value *, std::pair<Value *, Value *>> Replacements;
- for (Value *V : ToSplit) {
- AllocaInst *Alloca =
- new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI());
- AllocaMap[V] = Alloca;
-
- VectorType *VT = cast<VectorType>(V->getType());
- IRBuilder<> Builder(StatepointInst);
- SmallVector<Value *, 16> Elements;
- for (unsigned i = 0; i < VT->getNumElements(); i++)
- Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i)));
- ElementMapping[V] = Elements;
-
- auto InsertVectorReform = [&](Instruction *IP) {
- Builder.SetInsertPoint(IP);
- Builder.SetCurrentDebugLocation(IP->getDebugLoc());
- Value *ResultVec = UndefValue::get(VT);
- for (unsigned i = 0; i < VT->getNumElements(); i++)
- ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i],
- Builder.getInt32(i));
- return ResultVec;
- };
-
- if (isa<CallInst>(StatepointInst)) {
- BasicBlock::iterator Next(StatepointInst);
- Next++;
- Instruction *IP = &*(Next);
- Replacements[V].first = InsertVectorReform(IP);
- Replacements[V].second = nullptr;
- } else {
- InvokeInst *Invoke = cast<InvokeInst>(StatepointInst);
- // We've already normalized - check that we don't have shared destination
- // blocks
- BasicBlock *NormalDest = Invoke->getNormalDest();
- assert(!isa<PHINode>(NormalDest->begin()));
- BasicBlock *UnwindDest = Invoke->getUnwindDest();
- assert(!isa<PHINode>(UnwindDest->begin()));
- // Insert insert element sequences in both successors
- Instruction *IP = &*(NormalDest->getFirstInsertionPt());
- Replacements[V].first = InsertVectorReform(IP);
- IP = &*(UnwindDest->getFirstInsertionPt());
- Replacements[V].second = InsertVectorReform(IP);
- }
- }
-
- for (Value *V : ToSplit) {
- AllocaInst *Alloca = AllocaMap[V];
-
- // Capture all users before we start mutating use lists
- SmallVector<Instruction *, 16> Users;
- for (User *U : V->users())
- Users.push_back(cast<Instruction>(U));
-
- for (Instruction *I : Users) {
- if (auto Phi = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++)
- if (V == Phi->getIncomingValue(i)) {
- LoadInst *Load = new LoadInst(
- Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
- Phi->setIncomingValue(i, Load);
- }
- } else {
- LoadInst *Load = new LoadInst(Alloca, "", I);
- I->replaceUsesOfWith(V, Load);
- }
- }
-
- // Store the original value and the replacement value into the alloca
- StoreInst *Store = new StoreInst(V, Alloca);
- if (auto I = dyn_cast<Instruction>(V))
- Store->insertAfter(I);
- else
- Store->insertAfter(Alloca);
-
- // Normal return for invoke, or call return
- Instruction *Replacement = cast<Instruction>(Replacements[V].first);
- (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
- // Unwind return for invoke only
- Replacement = cast_or_null<Instruction>(Replacements[V].second);
- if (Replacement)
- (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
- }
-
- // apply mem2reg to promote alloca to SSA
- SmallVector<AllocaInst *, 16> Allocas;
- for (Value *V : ToSplit)
- Allocas.push_back(AllocaMap[V]);
- PromoteMemToReg(Allocas, DT);
-
- // Update our tracking of live pointers and base mappings to account for the
- // changes we just made.
- for (Value *V : ToSplit) {
- auto &Elements = ElementMapping[V];
-
- LiveSet.erase(V);
- LiveSet.insert(Elements.begin(), Elements.end());
- // We need to update the base mapping as well.
- assert(PointerToBase.count(V));
- Value *OldBase = PointerToBase[V];
- auto &BaseElements = ElementMapping[OldBase];
- PointerToBase.erase(V);
- assert(Elements.size() == BaseElements.size());
- for (unsigned i = 0; i < Elements.size(); i++) {
- Value *Elem = Elements[i];
- PointerToBase[Elem] = BaseElements[i];
- }
+ analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
}
}
@@ -2109,7 +1835,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
- Type *ValTy = GEP->getPointerOperandType()->getPointerElementType();
+ Type *ValTy = GEP->getSourceElementType();
Cost += TTI.getAddressComputationCost(ValTy);
// And cost of the GEP itself
@@ -2244,7 +1970,7 @@ static void rematerializeLiveValues(CallSite CS,
// Remove rematerializaed values from the live set
for (auto LiveValue: LiveValuesToBeDeleted) {
- Info.LiveSet.erase(LiveValue);
+ Info.LiveSet.remove(LiveValue);
}
}
@@ -2257,11 +1983,8 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
- for (CallSite CS : ToUpdate) {
- assert(CS.getInstruction()->getParent()->getParent() == &F);
- assert((UseDeoptBundles || isStatepoint(CS)) &&
- "expected to already be a deopt statepoint");
- }
+ for (CallSite CS : ToUpdate)
+ assert(CS.getInstruction()->getFunction() == &F);
#endif
// When inserting gc.relocates for invokes, we need to be able to insert at
@@ -2287,12 +2010,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
for (CallSite CS : ToUpdate) {
SmallVector<Value *, 64> DeoptValues;
- iterator_range<const Use *> DeoptStateRange =
- UseDeoptBundles
- ? iterator_range<const Use *>(GetDeoptBundleOperands(CS))
- : iterator_range<const Use *>(Statepoint(CS).vm_state_args());
-
- for (Value *Arg : DeoptStateRange) {
+ for (Value *Arg : GetDeoptBundleOperands(CS)) {
assert(!isUnhandledGCPointerType(Arg->getType()) &&
"support for FCA unimplemented");
if (isHandledGCPointerType(Arg->getType()))
@@ -2374,29 +2092,13 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
for (auto &Info : Records)
for (auto &BasePair : Info.PointerToBase)
if (isa<Constant>(BasePair.second))
- Info.LiveSet.erase(BasePair.first);
+ Info.LiveSet.remove(BasePair.first);
for (CallInst *CI : Holders)
CI->eraseFromParent();
Holders.clear();
- // Do a limited scalarization of any live at safepoint vector values which
- // contain pointers. This enables this pass to run after vectorization at
- // the cost of some possible performance loss. Note: This is known to not
- // handle updating of the side tables correctly which can lead to relocation
- // bugs when the same vector is live at multiple statepoints. We're in the
- // process of implementing the alternate lowering - relocating the
- // vector-of-pointers as first class item and updating the backend to
- // understand that - but that's not yet complete.
- if (UseVectorSplit)
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &Info = Records[i];
- Instruction *Statepoint = ToUpdate[i].getInstruction();
- splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet,
- Info.PointerToBase, DT);
- }
-
// In order to reduce the live set of a statepoint we might choose to rematerialize
// some values instead of relocating them. This is purely an optimization and
// does not influence correctness.
@@ -2592,13 +2294,9 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto NeedsRewrite = [](Instruction &I) {
- if (UseDeoptBundles) {
- if (ImmutableCallSite CS = ImmutableCallSite(&I))
- return !callsGCLeafFunction(CS);
- return false;
- }
-
- return isStatepoint(I);
+ if (ImmutableCallSite CS = ImmutableCallSite(&I))
+ return !callsGCLeafFunction(CS) && !isStatepoint(CS);
+ return false;
};
// Gather all the statepoints which need to be rewritten. Be careful to only
@@ -2682,15 +2380,12 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {
/// Compute the live-in set for the location rbegin starting from
/// the live-out set of the basic block
-static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
- BasicBlock::reverse_iterator rend,
- DenseSet<Value *> &LiveTmp) {
-
- for (BasicBlock::reverse_iterator ritr = rbegin; ritr != rend; ritr++) {
- Instruction *I = &*ritr;
-
+static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
+ BasicBlock::reverse_iterator End,
+ SetVector<Value *> &LiveTmp) {
+ for (auto &I : make_range(Begin, End)) {
// KILL/Def - Remove this definition from LiveIn
- LiveTmp.erase(I);
+ LiveTmp.remove(&I);
// Don't consider *uses* in PHI nodes, we handle their contribution to
// predecessor blocks when we seed the LiveOut sets
@@ -2698,7 +2393,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
continue;
// USE - Add to the LiveIn set for this instruction
- for (Value *V : I->operands()) {
+ for (Value *V : I.operands()) {
assert(!isUnhandledGCPointerType(V->getType()) &&
"support for FCA unimplemented");
if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
@@ -2718,24 +2413,24 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
}
}
-static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) {
-
+static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
for (BasicBlock *Succ : successors(BB)) {
- const BasicBlock::iterator E(Succ->getFirstNonPHI());
- for (BasicBlock::iterator I = Succ->begin(); I != E; I++) {
- PHINode *Phi = cast<PHINode>(&*I);
- Value *V = Phi->getIncomingValueForBlock(BB);
+ for (auto &I : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(BB);
assert(!isUnhandledGCPointerType(V->getType()) &&
"support for FCA unimplemented");
- if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
LiveTmp.insert(V);
- }
}
}
}
-static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
- DenseSet<Value *> KillSet;
+static SetVector<Value *> computeKillSet(BasicBlock *BB) {
+ SetVector<Value *> KillSet;
for (Instruction &I : *BB)
if (isHandledGCPointerType(I.getType()))
KillSet.insert(&I);
@@ -2745,7 +2440,7 @@ static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
#ifndef NDEBUG
/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
/// sanity check for the liveness computation.
-static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live,
+static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
TerminatorInst *TI, bool TermOkay = false) {
for (Value *V : Live) {
if (auto *I = dyn_cast<Instruction>(V)) {
@@ -2773,17 +2468,7 @@ static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
static void computeLiveInValues(DominatorTree &DT, Function &F,
GCPtrLivenessData &Data) {
-
- SmallSetVector<BasicBlock *, 200> Worklist;
- auto AddPredsToWorklist = [&](BasicBlock *BB) {
- // We use a SetVector so that we don't have duplicates in the worklist.
- Worklist.insert(pred_begin(BB), pred_end(BB));
- };
- auto NextItem = [&]() {
- BasicBlock *BB = Worklist.back();
- Worklist.pop_back();
- return BB;
- };
+ SmallSetVector<BasicBlock *, 32> Worklist;
// Seed the liveness for each individual block
for (BasicBlock &BB : F) {
@@ -2796,56 +2481,55 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,
assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
#endif
- Data.LiveOut[&BB] = DenseSet<Value *>();
+ Data.LiveOut[&BB] = SetVector<Value *>();
computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
Data.LiveIn[&BB] = Data.LiveSet[&BB];
- set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]);
- set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]);
+ Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
+ Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
if (!Data.LiveIn[&BB].empty())
- AddPredsToWorklist(&BB);
+ Worklist.insert(pred_begin(&BB), pred_end(&BB));
}
// Propagate that liveness until stable
while (!Worklist.empty()) {
- BasicBlock *BB = NextItem();
+ BasicBlock *BB = Worklist.pop_back_val();
- // Compute our new liveout set, then exit early if it hasn't changed
- // despite the contribution of our successor.
- DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+ // Compute our new liveout set, then exit early if it hasn't changed despite
+ // the contribution of our successor.
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
const auto OldLiveOutSize = LiveOut.size();
for (BasicBlock *Succ : successors(BB)) {
assert(Data.LiveIn.count(Succ));
- set_union(LiveOut, Data.LiveIn[Succ]);
+ LiveOut.set_union(Data.LiveIn[Succ]);
}
// assert OldLiveOut is a subset of LiveOut
if (OldLiveOutSize == LiveOut.size()) {
// If the sets are the same size, then we didn't actually add anything
- // when unioning our successors LiveIn Thus, the LiveIn of this block
+ // when unioning our successors' LiveIn. Thus, the LiveIn of this block
// hasn't changed.
continue;
}
Data.LiveOut[BB] = LiveOut;
// Apply the effects of this basic block
- DenseSet<Value *> LiveTmp = LiveOut;
- set_union(LiveTmp, Data.LiveSet[BB]);
- set_subtract(LiveTmp, Data.KillSet[BB]);
+ SetVector<Value *> LiveTmp = LiveOut;
+ LiveTmp.set_union(Data.LiveSet[BB]);
+ LiveTmp.set_subtract(Data.KillSet[BB]);
assert(Data.LiveIn.count(BB));
- const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB];
+ const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
// assert: OldLiveIn is a subset of LiveTmp
if (OldLiveIn.size() != LiveTmp.size()) {
Data.LiveIn[BB] = LiveTmp;
- AddPredsToWorklist(BB);
+ Worklist.insert(pred_begin(BB), pred_end(BB));
}
- } // while( !worklist.empty() )
+ } // while (!Worklist.empty())
#ifndef NDEBUG
// Sanity check our output against SSA properties. This helps catch any
// missing kills during the above iteration.
- for (BasicBlock &BB : F) {
+ for (BasicBlock &BB : F)
checkBasicSSA(DT, Data, BB);
- }
#endif
}
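The propagation loop above is the textbook backward liveness fixed point: LiveOut[BB] is the union of its successors' LiveIn, LiveIn[BB] is LiveOut[BB] unioned with LiveSet[BB] minus KillSet[BB], and predecessors are re-queued until nothing changes. A minimal standalone sketch over a three-block toy CFG follows; std::set and plain ints stand in for SetVector and Value*, so this is an illustration of the dataflow only, not the pass's data structures.

#include <cstdio>
#include <map>
#include <set>
#include <vector>

using ValueSet = std::set<int>;

struct Block {
  std::vector<int> Succs, Preds;
  ValueSet Gen;   // values used before any def in the block (LiveSet above)
  ValueSet Kill;  // values defined in the block (KillSet above)
};

static ValueSet unionOf(const ValueSet &A, const ValueSet &B) {
  ValueSet R = A; R.insert(B.begin(), B.end()); return R;
}

int main() {
  // CFG 0 -> 1 -> 2, where block 2 uses a value (7) defined in block 0.
  std::vector<Block> CFG(3);
  CFG[0].Succs = {1}; CFG[1].Preds = {0};
  CFG[1].Succs = {2}; CFG[2].Preds = {1};
  CFG[0].Kill = {7};
  CFG[2].Gen = {7};

  std::map<int, ValueSet> LiveIn, LiveOut;
  std::set<int> Worklist = {0, 1, 2};
  while (!Worklist.empty()) {
    int B = *Worklist.begin(); Worklist.erase(Worklist.begin());
    ValueSet Out;
    for (int S : CFG[B].Succs) Out = unionOf(Out, LiveIn[S]);
    ValueSet In = unionOf(Out, CFG[B].Gen);
    for (int K : CFG[B].Kill) In.erase(K);
    LiveOut[B] = Out;
    if (In != LiveIn[B]) {                 // grew: reprocess predecessors
      LiveIn[B] = In;
      Worklist.insert(CFG[B].Preds.begin(), CFG[B].Preds.end());
    }
  }
  std::printf("value 7 live into block 1: %d\n", (int)LiveIn[1].count(7)); // 1
}

Because the transfer function is monotone over a finite set of values, the worklist drains; switching the sets to SetVector in the patch keeps the visit and output order deterministic without changing this result.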
@@ -2856,7 +2540,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
// Note: The copy is intentional and required
assert(Data.LiveOut.count(BB));
- DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
// We want to handle the statepoint itself oddly. Its
// call result is not live (normal), nor are its arguments
@@ -2864,12 +2548,12 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
// specifically what we need to relocate
BasicBlock::reverse_iterator rend(Inst->getIterator());
computeLiveInValues(BB->rbegin(), rend, LiveOut);
- LiveOut.erase(Inst);
+ LiveOut.remove(Inst);
Out.insert(LiveOut.begin(), LiveOut.end());
}
static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- const CallSite &CS,
+ CallSite CS,
PartiallyConstructedSafepointRecord &Info) {
Instruction *Inst = CS.getInstruction();
StatepointLiveSetTy Updated;
@@ -2877,33 +2561,32 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
#ifndef NDEBUG
DenseSet<Value *> Bases;
- for (auto KVPair : Info.PointerToBase) {
+ for (auto KVPair : Info.PointerToBase)
Bases.insert(KVPair.second);
- }
#endif
+
// We may have base pointers which are now live that weren't before. We need
// to update the PointerToBase structure to reflect this.
for (auto V : Updated)
- if (!Info.PointerToBase.count(V)) {
- assert(Bases.count(V) && "can't find base for unexpected live value");
- Info.PointerToBase[V] = V;
+ if (Info.PointerToBase.insert({V, V}).second) {
+ assert(Bases.count(V) && "Can't find base for unexpected live value!");
continue;
}
#ifndef NDEBUG
- for (auto V : Updated) {
+ for (auto V : Updated)
assert(Info.PointerToBase.count(V) &&
- "must be able to find base for live value");
- }
+ "Must be able to find base for live value!");
#endif
// Remove any stale base mappings - this can happen since our liveness is
- // more precise then the one inherent in the base pointer analysis
+ // more precise than the one inherent in the base pointer analysis.
DenseSet<Value *> ToErase;
for (auto KVPair : Info.PointerToBase)
if (!Updated.count(KVPair.first))
ToErase.insert(KVPair.first);
- for (auto V : ToErase)
+
+ for (auto *V : ToErase)
Info.PointerToBase.erase(V);
#ifndef NDEBUG
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 8569e080873c..da700f18cdaf 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -17,15 +17,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -38,6 +38,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -57,8 +59,8 @@ namespace {
///
class LatticeVal {
enum LatticeValueTy {
- /// undefined - This LLVM Value has no known value yet.
- undefined,
+ /// unknown - This LLVM Value has no known value yet.
+ unknown,
/// constant - This LLVM Value has a specific constant value.
constant,
@@ -83,9 +85,9 @@ class LatticeVal {
}
public:
- LatticeVal() : Val(nullptr, undefined) {}
+ LatticeVal() : Val(nullptr, unknown) {}
- bool isUndefined() const { return getLatticeValue() == undefined; }
+ bool isUnknown() const { return getLatticeValue() == unknown; }
bool isConstant() const {
return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
}
@@ -112,7 +114,7 @@ public:
return false;
}
- if (isUndefined()) {
+ if (isUnknown()) {
Val.setInt(constant);
assert(V && "Marking constant with NULL");
Val.setPointer(V);
@@ -139,7 +141,7 @@ public:
}
void markForcedConstant(Constant *V) {
- assert(isUndefined() && "Can't force a defined value!");
+ assert(isUnknown() && "Can't force a defined value!");
Val.setInt(forcedconstant);
Val.setPointer(V);
}
@@ -228,7 +230,7 @@ public:
/// performing Interprocedural SCCP.
void TrackValueOfGlobalVariable(GlobalVariable *GV) {
// We only track the contents of scalar globals.
- if (GV->getType()->getElementType()->isSingleValueType()) {
+ if (GV->getValueType()->isSingleValueType()) {
LatticeVal &IV = TrackedGlobals[GV];
if (!isa<UndefValue>(GV->getInitializer()))
IV.markConstant(GV->getInitializer());
@@ -268,6 +270,18 @@ public:
return BBExecutable.count(BB);
}
+ std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
+ std::vector<LatticeVal> StructValues;
+ StructType *STy = dyn_cast<StructType>(V->getType());
+ assert(STy && "getStructLatticeValueFor() can be called only on structs");
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ auto I = StructValueState.find(std::make_pair(V, i));
+ assert(I != StructValueState.end() && "Value not in valuemap!");
+ StructValues.push_back(I->second);
+ }
+ return StructValues;
+ }
+
LatticeVal getLatticeValueFor(Value *V) const {
DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
assert(I != ValueState.end() && "V is not in valuemap!");
@@ -302,6 +316,13 @@ public:
}
private:
+ // pushToWorkList - Helper for markConstant/markForcedConstant
+ void pushToWorkList(LatticeVal &IV, Value *V) {
+ if (IV.isOverdefined())
+ return OverdefinedInstWorkList.push_back(V);
+ InstWorkList.push_back(V);
+ }
+
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
@@ -309,10 +330,7 @@ private:
void markConstant(LatticeVal &IV, Value *V, Constant *C) {
if (!IV.markConstant(C)) return;
DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
- if (IV.isOverdefined())
- OverdefinedInstWorkList.push_back(V);
- else
- InstWorkList.push_back(V);
+ pushToWorkList(IV, V);
}
void markConstant(Value *V, Constant *C) {
@@ -325,10 +343,7 @@ private:
LatticeVal &IV = ValueState[V];
IV.markForcedConstant(C);
DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
- if (IV.isOverdefined())
- OverdefinedInstWorkList.push_back(V);
- else
- InstWorkList.push_back(V);
+ pushToWorkList(IV, V);
}
@@ -348,14 +363,14 @@ private:
}
void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
- if (IV.isOverdefined() || MergeWithV.isUndefined())
+ if (IV.isOverdefined() || MergeWithV.isUnknown())
return; // Noop.
if (MergeWithV.isOverdefined())
- markOverdefined(IV, V);
- else if (IV.isUndefined())
- markConstant(IV, V, MergeWithV.getConstant());
- else if (IV.getConstant() != MergeWithV.getConstant())
- markOverdefined(IV, V);
+ return markOverdefined(IV, V);
+ if (IV.isUnknown())
+ return markConstant(IV, V, MergeWithV.getConstant());
+ if (IV.getConstant() != MergeWithV.getConstant())
+ return markOverdefined(IV, V);
}
void mergeInValue(Value *V, LatticeVal MergeWithV) {
@@ -378,7 +393,7 @@ private:
return LV; // Common case, already in the map.
if (Constant *C = dyn_cast<Constant>(V)) {
- // Undef values remain undefined.
+ // Undef values remain unknown.
if (!isa<UndefValue>(V))
LV.markConstant(C); // Constants are constant
}
@@ -409,7 +424,7 @@ private:
if (!Elt)
LV.markOverdefined(); // Unknown sort of constant.
else if (isa<UndefValue>(Elt))
- ; // Undef values remain undefined.
+ ; // Undef values remain unknown.
else
LV.markConstant(Elt); // Constants are constant.
}
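// Illustration (not part of the patch): the unknown -> constant -> overdefined
// lattice that the renamed predicates above walk, as a minimal standalone
// sketch. It deliberately ignores forcedconstant and the worklist plumbing;
// the 'long' payload stands in for the Constant* the real LatticeVal carries.
enum class State { Unknown, Constant, Overdefined };

struct Lattice {
  State S = State::Unknown;
  long C = 0; // stand-in for the Constant* payload

  // Values only ever move down the lattice, which bounds how many times any
  // value can change and so guarantees the solver terminates.
  void mergeIn(const Lattice &Other) {
    if (S == State::Overdefined || Other.S == State::Unknown)
      return;                       // nothing new to learn
    if (Other.S == State::Overdefined) {
      S = State::Overdefined;       // incoming conflict wins
      return;
    }
    if (S == State::Unknown) {
      S = State::Constant;          // adopt the incoming constant
      C = Other.C;
      return;
    }
    if (C != Other.C)
      S = State::Overdefined;       // two different constants disagree
  }
};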
@@ -537,7 +552,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
if (!CI) {
// Overdefined condition variables, and branches on unfoldable constant
// conditions, mean the branch could go either way.
- if (!BCValue.isUndefined())
+ if (!BCValue.isUnknown())
Succs[0] = Succs[1] = true;
return;
}
@@ -561,9 +576,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
LatticeVal SCValue = getValueState(SI->getCondition());
ConstantInt *CI = SCValue.getConstantInt();
- if (!CI) { // Overdefined or undefined condition?
+ if (!CI) { // Overdefined or unknown condition?
// All destinations are executable!
- if (!SCValue.isUndefined())
+ if (!SCValue.isUnknown())
Succs.assign(TI.getNumSuccessors(), true);
return;
}
@@ -607,7 +622,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
// undef conditions mean that neither edge is feasible yet.
ConstantInt *CI = BCValue.getConstantInt();
if (!CI)
- return !BCValue.isUndefined();
+ return !BCValue.isUnknown();
// Constant condition variables mean the branch can only go a single way.
return BI->getSuccessor(CI->isZero()) == To;
@@ -625,7 +640,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
ConstantInt *CI = SCValue.getConstantInt();
if (!CI)
- return !SCValue.isUndefined();
+ return !SCValue.isUnknown();
return SI->findCaseValue(CI).getCaseSuccessor() == To;
}
@@ -677,12 +692,12 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is overdefined.
- // If there are no executable operands, the PHI remains undefined.
+ // If there are no executable operands, the PHI remains unknown.
//
Constant *OperandVal = nullptr;
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
LatticeVal IV = getValueState(PN.getIncomingValue(i));
- if (IV.isUndefined()) continue; // Doesn't influence PHI node.
+ if (IV.isUnknown()) continue; // Doesn't influence PHI node.
if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
continue;
@@ -708,7 +723,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If we exited the loop, this means that the PHI node only has constant
// arguments that agree with each other(and OperandVal is the constant) or
// OperandVal is null because there are no defined incoming arguments. If
- // this is the case, the PHI remains undefined.
+ // this is the case, the PHI remains unknown.
//
if (OperandVal)
markConstant(&PN, OperandVal); // Acquire operand value
@@ -758,8 +773,9 @@ void SCCPSolver::visitCastInst(CastInst &I) {
if (OpSt.isOverdefined()) // Inherit overdefinedness of operand
markOverdefined(&I);
else if (OpSt.isConstant()) {
- Constant *C =
- ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType());
+ // Fold the constant as we build.
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpSt.getConstant(),
+ I.getType(), DL);
if (isa<UndefValue>(C))
return;
// Propagate constant value
@@ -829,7 +845,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
return markAnythingOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
- if (CondValue.isUndefined())
+ if (CondValue.isUnknown())
return;
if (ConstantInt *CondCB = CondValue.getConstantInt()) {
@@ -849,9 +865,9 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
TVal.getConstant() == FVal.getConstant())
return markConstant(&I, FVal.getConstant());
- if (TVal.isUndefined()) // select ?, undef, X -> X.
+ if (TVal.isUnknown()) // select ?, undef, X -> X.
return mergeInValue(&I, FVal);
- if (FVal.isUndefined()) // select ?, X, undef -> X.
+ if (FVal.isUnknown()) // select ?, X, undef -> X.
return mergeInValue(&I, TVal);
markOverdefined(&I);
}
@@ -890,7 +906,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
NonOverdefVal = &V2State;
if (NonOverdefVal) {
- if (NonOverdefVal->isUndefined()) {
+ if (NonOverdefVal->isUnknown()) {
// Could annihilate value.
if (I.getOpcode() == Instruction::And)
markConstant(IV, &I, Constant::getNullValue(I.getType()));
@@ -934,7 +950,7 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
return markConstant(IV, &I, C);
}
- // If operands are still undefined, wait for it to resolve.
+ // If operands are still unknown, wait for it to resolve.
if (!V1State.isOverdefined() && !V2State.isOverdefined())
return;
@@ -944,69 +960,16 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
// TODO : SCCP does not handle vectors properly.
return markOverdefined(&I);
-
-#if 0
- LatticeVal &ValState = getValueState(I.getOperand(0));
- LatticeVal &IdxState = getValueState(I.getOperand(1));
-
- if (ValState.isOverdefined() || IdxState.isOverdefined())
- markOverdefined(&I);
- else if(ValState.isConstant() && IdxState.isConstant())
- markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
- IdxState.getConstant()));
-#endif
}
void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
// TODO : SCCP does not handle vectors properly.
return markOverdefined(&I);
-#if 0
- LatticeVal &ValState = getValueState(I.getOperand(0));
- LatticeVal &EltState = getValueState(I.getOperand(1));
- LatticeVal &IdxState = getValueState(I.getOperand(2));
-
- if (ValState.isOverdefined() || EltState.isOverdefined() ||
- IdxState.isOverdefined())
- markOverdefined(&I);
- else if(ValState.isConstant() && EltState.isConstant() &&
- IdxState.isConstant())
- markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
- EltState.getConstant(),
- IdxState.getConstant()));
- else if (ValState.isUndefined() && EltState.isConstant() &&
- IdxState.isConstant())
- markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
- EltState.getConstant(),
- IdxState.getConstant()));
-#endif
}
void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
// TODO : SCCP does not handle vectors properly.
return markOverdefined(&I);
-#if 0
- LatticeVal &V1State = getValueState(I.getOperand(0));
- LatticeVal &V2State = getValueState(I.getOperand(1));
- LatticeVal &MaskState = getValueState(I.getOperand(2));
-
- if (MaskState.isUndefined() ||
- (V1State.isUndefined() && V2State.isUndefined()))
- return; // Undefined output if mask or both inputs undefined.
-
- if (V1State.isOverdefined() || V2State.isOverdefined() ||
- MaskState.isOverdefined()) {
- markOverdefined(&I);
- } else {
- // A mix of constant/undef inputs.
- Constant *V1 = V1State.isConstant() ?
- V1State.getConstant() : UndefValue::get(I.getType());
- Constant *V2 = V2State.isConstant() ?
- V2State.getConstant() : UndefValue::get(I.getType());
- Constant *Mask = MaskState.isConstant() ?
- MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
- markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
- }
-#endif
}
// Handle getelementptr instructions. If all operands are constants then we
@@ -1020,7 +983,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
LatticeVal State = getValueState(I.getOperand(i));
- if (State.isUndefined())
+ if (State.isUnknown())
return; // Operands are not resolved yet.
if (State.isOverdefined())
@@ -1066,7 +1029,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
return markAnythingOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
- if (PtrVal.isUndefined()) return; // The pointer is not resolved yet!
+ if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
LatticeVal &IV = ValueState[&I];
if (IV.isOverdefined()) return;
@@ -1094,7 +1057,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
}
// Transform load from a constant into a constant if possible.
- if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) {
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
if (isa<UndefValue>(C))
return;
return markConstant(IV, &I, C);
@@ -1127,7 +1090,7 @@ CallOverdefined:
AI != E; ++AI) {
LatticeVal State = getValueState(*AI);
- if (State.isUndefined())
+ if (State.isUnknown())
return; // Operands are not resolved yet.
if (State.isOverdefined())
return markOverdefined(I);
@@ -1275,11 +1238,11 @@ void SCCPSolver::Solve() {
/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
/// even if X isn't defined.
bool SCCPSolver::ResolvedUndefsIn(Function &F) {
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (!BBExecutable.count(&*BB))
+ for (BasicBlock &BB : F) {
+ if (!BBExecutable.count(&BB))
continue;
- for (Instruction &I : *BB) {
+ for (Instruction &I : BB) {
// Look for instructions which produce undef values.
if (I.getType()->isVoidTy()) continue;
@@ -1301,14 +1264,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// more precise than this but it isn't worth bothering.
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
LatticeVal &LV = getStructValueState(&I, i);
- if (LV.isUndefined())
+ if (LV.isUnknown())
markOverdefined(LV, &I);
}
continue;
}
LatticeVal &LV = getValueState(&I);
- if (!LV.isUndefined()) continue;
+ if (!LV.isUnknown()) continue;
// extractvalue is safe; check here because the argument is a struct.
if (isa<ExtractValueInst>(I))
@@ -1347,7 +1310,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::FDiv:
case Instruction::FRem:
// Floating-point binary operation: be conservative.
- if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
markForcedConstant(&I, Constant::getNullValue(ITy));
else
markOverdefined(&I);
@@ -1367,7 +1330,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Mul:
case Instruction::And:
// Both operands undef -> undef
- if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
break;
// undef * X -> 0. X could be zero.
// undef & X -> 0. X could be zero.
@@ -1376,7 +1339,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Or:
// Both operands undef -> undef
- if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
break;
// undef | X -> -1. X could be -1.
markForcedConstant(&I, Constant::getAllOnesValue(ITy));
@@ -1386,7 +1349,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// undef ^ undef -> 0; strictly speaking, this is not strictly
// necessary, but we try to be nice to people who expect this
// behavior in simple cases
- if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
+ if (Op0LV.isUnknown() && Op1LV.isUnknown()) {
markForcedConstant(&I, Constant::getNullValue(ITy));
return true;
}
@@ -1399,7 +1362,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::URem:
// X / undef -> undef. No change.
// X % undef -> undef. No change.
- if (Op1LV.isUndefined()) break;
+ if (Op1LV.isUnknown()) break;
// X / 0 -> undef. No change.
// X % 0 -> undef. No change.
@@ -1413,7 +1376,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::AShr:
// X >>a undef -> undef.
- if (Op1LV.isUndefined()) break;
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
// undef >>a X -> all ones
markForcedConstant(&I, Constant::getAllOnesValue(ITy));
@@ -1422,7 +1393,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Shl:
// X << undef -> undef.
// X >> undef -> undef.
- if (Op1LV.isUndefined()) break;
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
// undef << X -> 0
// undef >> X -> 0
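// Illustration (not part of the patch): the guard the two shift hunks above
// add, as a standalone predicate. Plain integers stand in for ConstantInt and
// the scalar bit width.
#include <cstdint>

// A shift of an N-bit value by N or more bits has no defined result, so the
// solver must not force a constant for "undef >>a C" / "undef << C" in that
// case; it leaves the value unknown instead (the added 'break' above).
bool canForceShiftResult(uint64_t ShiftAmt, unsigned BitWidth) {
  return ShiftAmt < BitWidth; // e.g. for i32, shifting by 32+ is undefined
}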
@@ -1431,13 +1410,13 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Select:
Op1LV = getValueState(I.getOperand(1));
// undef ? X : Y -> X or Y. There could be commonality between X/Y.
- if (Op0LV.isUndefined()) {
+ if (Op0LV.isUnknown()) {
if (!Op1LV.isConstant()) // Pick the constant one if there is any.
Op1LV = getValueState(I.getOperand(2));
- } else if (Op1LV.isUndefined()) {
+ } else if (Op1LV.isUnknown()) {
// c ? undef : undef -> undef. No change.
Op1LV = getValueState(I.getOperand(2));
- if (Op1LV.isUndefined())
+ if (Op1LV.isUnknown())
break;
// Otherwise, c ? undef : x -> x.
} else {
@@ -1487,17 +1466,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// Check to see if we have a branch or switch on an undefined value. If so
// we force the branch to go one way or the other to make the successor
// values live. It doesn't really matter which way we force it.
- TerminatorInst *TI = BB->getTerminator();
+ TerminatorInst *TI = BB.getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (!BI->isConditional()) continue;
- if (!getValueState(BI->getCondition()).isUndefined())
+ if (!getValueState(BI->getCondition()).isUnknown())
continue;
// If the input to SCCP is actually branch on undef, fix the undef to
// false.
if (isa<UndefValue>(BI->getCondition())) {
BI->setCondition(ConstantInt::getFalse(BI->getContext()));
- markEdgeExecutable(&*BB, TI->getSuccessor(1));
+ markEdgeExecutable(&BB, TI->getSuccessor(1));
return true;
}
@@ -1510,16 +1489,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- if (!SI->getNumCases())
- continue;
- if (!getValueState(SI->getCondition()).isUndefined())
+ if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
// If the input to SCCP is actually switch on undef, fix the undef to
// the first constant.
if (isa<UndefValue>(SI->getCondition())) {
SI->setCondition(SI->case_begin().getCaseValue());
- markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor());
+ markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
return true;
}
@@ -1531,75 +1508,53 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return false;
}
-
-namespace {
- //===--------------------------------------------------------------------===//
- //
- /// SCCP Class - This class uses the SCCPSolver to implement a per-function
- /// Sparse Conditional Constant Propagator.
- ///
- struct SCCP : public FunctionPass {
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- static char ID; // Pass identification, replacement for typeid
- SCCP() : FunctionPass(ID) {
- initializeSCCPPass(*PassRegistry::getPassRegistry());
+static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
+ Constant *Const = nullptr;
+ if (V->getType()->isStructTy()) {
+ std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
+ if (std::any_of(IVs.begin(), IVs.end(),
+ [](LatticeVal &LV) { return LV.isOverdefined(); }))
+ return false;
+ std::vector<Constant *> ConstVals;
+ StructType *ST = dyn_cast<StructType>(V->getType());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ LatticeVal V = IVs[i];
+ ConstVals.push_back(V.isConstant()
+ ? V.getConstant()
+ : UndefValue::get(ST->getElementType(i)));
}
+ Const = ConstantStruct::get(ST, ConstVals);
+ } else {
+ LatticeVal IV = Solver.getLatticeValueFor(V);
+ if (IV.isOverdefined())
+ return false;
+ Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ }
+ assert(Const && "Constant is nullptr here!");
+ DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
- // runOnFunction - Run the Sparse Conditional Constant Propagation
- // algorithm, and return true if the function was modified.
- //
- bool runOnFunction(Function &F) override;
- };
-} // end anonymous namespace
-
-char SCCP::ID = 0;
-INITIALIZE_PASS(SCCP, "sccp",
- "Sparse Conditional Constant Propagation", false, false)
-
-// createSCCPPass - This is the public interface to this file.
-FunctionPass *llvm::createSCCPPass() {
- return new SCCP();
+ // Replaces all of the uses of a variable with uses of the constant.
+ V->replaceAllUsesWith(Const);
+ return true;
}
-static void DeleteInstructionInBlock(BasicBlock *BB) {
- DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
- ++NumDeadBlocks;
-
- // Check to see if there are non-terminating instructions to delete.
- if (isa<TerminatorInst>(BB->begin()))
- return;
+static bool tryToReplaceInstWithConstant(SCCPSolver &Solver, Instruction *Inst,
+ bool shouldEraseFromParent) {
+ if (!tryToReplaceWithConstant(Solver, Inst))
+ return false;
- // Delete the instructions backwards, as it has a reduced likelihood of having
- // to update as many def-use and use-def chains.
- Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
- while (EndInst != BB->begin()) {
- // Delete the next to last instruction.
- Instruction *Inst = &*--EndInst->getIterator();
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- if (Inst->isEHPad()) {
- EndInst = Inst;
- continue;
- }
- BB->getInstList().erase(Inst);
- ++NumInstRemoved;
- }
+ // Delete the instruction if requested.
+ if (shouldEraseFromParent)
+ Inst->eraseFromParent();
+ return true;
}
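// Illustration (not part of the patch): the shape of the struct path in
// tryToReplaceWithConstant above, as a standalone sketch. std::string stands
// in for Constant*, and "undef" for UndefValue::get() of the element type.
#include <string>
#include <vector>

struct ElemVal {
  bool IsOverdefined = false;
  bool IsConstant = false;
  std::string C; // stand-in for the element's constant, when known
};

// Bail out if any element is overdefined; otherwise collect the per-element
// constants, padding still-unknown elements with undef. The caller would then
// form the aggregate constant (ConstantStruct in the real code) from Out.
bool buildStructConstant(const std::vector<ElemVal> &Elems,
                         std::vector<std::string> &Out) {
  for (const ElemVal &E : Elems)
    if (E.IsOverdefined)
      return false;
  for (const ElemVal &E : Elems)
    Out.push_back(E.IsConstant ? E.C : "undef");
  return true;
}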
-// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
// and return true if the function was modified.
//
-bool SCCP::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
+static bool runSCCP(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- const DataLayout &DL = F.getParent()->getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SCCPSolver Solver(DL, TLI);
// Mark the first block of the function as being executable.
@@ -1623,9 +1578,13 @@ bool SCCP::runOnFunction(Function &F) {
// delete their contents now. Note that we cannot actually delete the blocks,
// as we cannot modify the CFG of the function.
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (!Solver.isBlockExecutable(&*BB)) {
- DeleteInstructionInBlock(&*BB);
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+
+ ++NumDeadBlocks;
+ NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
+
MadeChanges = true;
continue;
}
@@ -1633,70 +1592,74 @@ bool SCCP::runOnFunction(Function &F) {
// Iterate over all of the instructions in a function, replacing them with
// constants if we have found them to be of constant values.
//
- for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+ for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
Instruction *Inst = &*BI++;
if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
continue;
- // TODO: Reconstruct structs from their elements.
- if (Inst->getType()->isStructTy())
- continue;
-
- LatticeVal IV = Solver.getLatticeValueFor(Inst);
- if (IV.isOverdefined())
- continue;
-
- Constant *Const = IV.isConstant()
- ? IV.getConstant() : UndefValue::get(Inst->getType());
- DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n');
-
- // Replaces all of the uses of a variable with uses of the constant.
- Inst->replaceAllUsesWith(Const);
-
- // Delete the instruction.
- Inst->eraseFromParent();
-
- // Hey, we just changed something!
- MadeChanges = true;
- ++NumInstRemoved;
+ if (tryToReplaceInstWithConstant(Solver, Inst,
+ true /* shouldEraseFromParent */)) {
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++NumInstRemoved;
+ }
}
}
return MadeChanges;
}
+PreservedAnalyses SCCPPass::run(Function &F, AnalysisManager<Function> &AM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
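// Illustration (not part of the patch): with a PreservedAnalyses-returning
// run() in place, the pass can be scheduled through the new pass manager.
// A hedged usage sketch; it assumes the usual PassBuilder/analysis-manager
// wiring exists elsewhere and uses the header added by this patch.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/SCCP.h"

void addSCCP(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::SCCPPass()); // per-function SCCP under the new PM
}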
+
namespace {
- //===--------------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+//
+/// SCCP Class - This class uses the SCCPSolver to implement a per-function
+/// Sparse Conditional Constant Propagator.
+///
+class SCCPLegacyPass : public FunctionPass {
+public:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ static char ID; // Pass identification, replacement for typeid
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
//
- /// IPSCCP Class - This class implements interprocedural Sparse Conditional
- /// Constant Propagation.
- ///
- struct IPSCCP : public ModulePass {
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
- static char ID;
- IPSCCP() : ModulePass(ID) {
- initializeIPSCCPPass(*PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override;
- };
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runSCCP(F, DL, TLI);
+ }
+};
} // end anonymous namespace
-char IPSCCP::ID = 0;
-INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
+char SCCPLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCP, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() {
- return new IPSCCP();
-}
+INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
static bool AddressIsTaken(const GlobalValue *GV) {
// Delete any dead constantexpr klingons.
@@ -1725,10 +1688,8 @@ static bool AddressIsTaken(const GlobalValue *GV) {
return false;
}
-bool IPSCCP::runOnModule(Module &M) {
- const DataLayout &DL = M.getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+static bool runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
SCCPSolver Solver(DL, TLI);
// AddressTakenFunctions - This set keeps track of the address-taken functions
@@ -1741,32 +1702,32 @@ bool IPSCCP::runOnModule(Module &M) {
// Loop over all functions, marking arguments to those with their addresses
// taken or that are external as overdefined.
//
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration())
+ for (Function &F : M) {
+ if (F.isDeclaration())
continue;
- // If this is a strong or ODR definition of this function, then we can
- // propagate information about its result into callsites of it.
- if (!F->mayBeOverridden())
- Solver.AddTrackedFunction(&*F);
+ // If this is an exact definition of this function, then we can propagate
+ // information about its result into callsites of it.
+ if (F.hasExactDefinition())
+ Solver.AddTrackedFunction(&F);
// If this function only has direct calls that we can see, we can track its
// arguments and return value aggressively, and can assume it is not called
// unless we see evidence to the contrary.
- if (F->hasLocalLinkage()) {
- if (AddressIsTaken(&*F))
- AddressTakenFunctions.insert(&*F);
+ if (F.hasLocalLinkage()) {
+ if (AddressIsTaken(&F))
+ AddressTakenFunctions.insert(&F);
else {
- Solver.AddArgumentTrackedFunction(&*F);
+ Solver.AddArgumentTrackedFunction(&F);
continue;
}
}
// Assume the function is called.
- Solver.MarkBlockExecutable(&F->front());
+ Solver.MarkBlockExecutable(&F.front());
// Assume nothing about the incoming arguments.
- for (Argument &AI : F->args())
+ for (Argument &AI : F.args())
Solver.markAnythingOverdefined(&AI);
}
@@ -1784,8 +1745,8 @@ bool IPSCCP::runOnModule(Module &M) {
DEBUG(dbgs() << "RESOLVING UNDEFS\n");
ResolvedUndefs = false;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
- ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+ for (Function &F : M)
+ ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
}
bool MadeChanges = false;
@@ -1795,79 +1756,47 @@ bool IPSCCP::runOnModule(Module &M) {
//
SmallVector<BasicBlock*, 512> BlocksToErase;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration())
+ for (Function &F : M) {
+ if (F.isDeclaration())
continue;
- if (Solver.isBlockExecutable(&F->front())) {
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
- AI != E; ++AI) {
- if (AI->use_empty() || AI->getType()->isStructTy()) continue;
-
- // TODO: Could use getStructLatticeValueFor to find out if the entire
- // result is a constant and replace it entirely if so.
-
- LatticeVal IV = Solver.getLatticeValueFor(&*AI);
- if (IV.isOverdefined()) continue;
-
- Constant *CST = IV.isConstant() ?
- IV.getConstant() : UndefValue::get(AI->getType());
- DEBUG(dbgs() << "*** Arg " << *AI << " = " << *CST <<"\n");
-
- // Replaces all of the uses of a variable with uses of the
- // constant.
- AI->replaceAllUsesWith(CST);
- ++IPNumArgsElimed;
+ if (Solver.isBlockExecutable(&F.front())) {
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
+ ++AI) {
+ if (AI->use_empty())
+ continue;
+ if (tryToReplaceWithConstant(Solver, &*AI))
+ ++IPNumArgsElimed;
}
}
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
- DeleteInstructionInBlock(&*BB);
- MadeChanges = true;
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
- TerminatorInst *TI = BB->getTerminator();
- for (BasicBlock *Succ : TI->successors()) {
- if (!Succ->empty() && isa<PHINode>(Succ->begin()))
- Succ->removePredecessor(&*BB);
- }
- if (!TI->use_empty())
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- TI->eraseFromParent();
- new UnreachableInst(M.getContext(), &*BB);
+ ++NumDeadBlocks;
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+
+ MadeChanges = true;
- if (&*BB != &F->front())
+ if (&*BB != &F.front())
BlocksToErase.push_back(&*BB);
continue;
}
for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
Instruction *Inst = &*BI++;
- if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
+ if (Inst->getType()->isVoidTy())
continue;
-
- // TODO: Could use getStructLatticeValueFor to find out if the entire
- // result is a constant and replace it entirely if so.
-
- LatticeVal IV = Solver.getLatticeValueFor(Inst);
- if (IV.isOverdefined())
- continue;
-
- Constant *Const = IV.isConstant()
- ? IV.getConstant() : UndefValue::get(Inst->getType());
- DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n');
-
- // Replaces all of the uses of a variable with uses of the
- // constant.
- Inst->replaceAllUsesWith(Const);
-
- // Delete the instruction.
- if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
- Inst->eraseFromParent();
-
- // Hey, we just changed something!
- MadeChanges = true;
- ++IPNumInstRemoved;
+ if (tryToReplaceInstWithConstant(
+ Solver, Inst,
+ !isa<CallInst>(Inst) &&
+ !isa<TerminatorInst>(Inst) /* shouldEraseFromParent */)) {
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++IPNumInstRemoved;
+ }
}
}
@@ -1918,7 +1847,7 @@ bool IPSCCP::runOnModule(Module &M) {
}
// Finally, delete the basic block.
- F->getBasicBlockList().erase(DeadBB);
+ F.getBasicBlockList().erase(DeadBB);
}
BlocksToErase.clear();
}
@@ -1937,18 +1866,17 @@ bool IPSCCP::runOnModule(Module &M) {
// TODO: Process multiple value ret instructions also.
const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
- for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
- E = RV.end(); I != E; ++I) {
- Function *F = I->first;
- if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
+ for (const auto &I : RV) {
+ Function *F = I.first;
+ if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
continue;
// We can only do this if we know that nothing else can call the function.
if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
continue;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+ for (BasicBlock &BB : *F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
if (!isa<UndefValue>(RI->getOperand(0)))
ReturnsToZap.push_back(RI);
}
@@ -1978,3 +1906,52 @@ bool IPSCCP::runOnModule(Module &M) {
return MadeChanges;
}
+
+PreservedAnalyses IPSCCPPass::run(Module &M, AnalysisManager<Module> &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!runIPSCCP(M, DL, &TLI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runIPSCCP(M, DL, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
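// Illustration (not part of the patch): both roads into IPSCCP after the
// restructuring at the end of this file. The pipeline setup around these
// calls is assumed, and the exact header declaring createIPSCCPPass() is an
// assumption as well.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/SCCP.h"

void addIPSCCPLegacy(llvm::legacy::PassManager &PM) {
  PM.add(llvm::createIPSCCPPass()); // wraps the IPSCCPLegacyPass above
}

void addIPSCCPNewPM(llvm::ModulePassManager &MPM) {
  MPM.addPass(llvm::IPSCCPPass()); // reaches runIPSCCP via run() above
}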
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index a7361b5fe083..7d33259c030b 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -55,8 +55,8 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#if __cplusplus >= 201103L && !defined(NDEBUG)
-// We only use this for a debug check in C++11
+#ifndef NDEBUG
+// We only use this for a debug check.
#include <random>
#endif
@@ -87,12 +87,13 @@ static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
cl::Hidden);
namespace {
-/// \brief A custom IRBuilder inserter which prefixes all names if they are
-/// preserved.
-template <bool preserveNames = true>
-class IRBuilderPrefixedInserter
- : public IRBuilderDefaultInserter<preserveNames> {
+/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// Assert builds.
+class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
std::string Prefix;
+ const Twine getNameWithPrefix(const Twine &Name) const {
+ return Name.isTriviallyEmpty() ? Name : Prefix + Name;
+ }
public:
void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
@@ -100,27 +101,13 @@ public:
protected:
void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
BasicBlock::iterator InsertPt) const {
- IRBuilderDefaultInserter<preserveNames>::InsertHelper(
- I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+ IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
+ InsertPt);
}
};
-// Specialization for not preserving the name is trivial.
-template <>
-class IRBuilderPrefixedInserter<false>
- : public IRBuilderDefaultInserter<false> {
-public:
- void SetNamePrefix(const Twine &P) {}
-};
-
/// \brief Provide a typedef for IRBuilder that drops names in release builds.
-#ifndef NDEBUG
-typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
- IRBuilderTy;
-#else
-typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
- IRBuilderTy;
-#endif
+using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
}
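// Illustration (not part of the patch): the name-prefixing rule the rewritten
// inserter above implements, as a standalone sketch with std::string in place
// of llvm::Twine (the Twine form avoids building temporary strings at all).
#include <string>

std::string nameWithPrefix(const std::string &Prefix, const std::string &Name) {
  return Name.empty() ? Name : Prefix + Name; // empty names stay empty
}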
namespace {
@@ -694,7 +681,7 @@ private:
// langref in a very strict sense. If we ever want to enable
// SROAStrictInbounds, this code should be factored cleanly into
// PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
- // by writing out the code here where we have tho underlying allocation
+ // by writing out the code here where we have the underlying allocation
// size readily available.
APInt GEPOffset = Offset;
const DataLayout &DL = GEPI.getModule()->getDataLayout();
@@ -1015,7 +1002,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
}),
Slices.end());
-#if __cplusplus >= 201103L && !defined(NDEBUG)
+#ifndef NDEBUG
if (SROARandomShuffleSlices) {
std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
std::shuffle(Slices.begin(), Slices.end(), MT);
@@ -1192,8 +1179,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
// If this pointer is always safe to load, or if we can prove that there
// is already a load in the block, then we can move the load to the pred
// block.
- if (isDereferenceablePointer(InVal, DL) ||
- isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI))
continue;
return false;
@@ -1262,8 +1248,6 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
Value *TValue = SI.getTrueValue();
Value *FValue = SI.getFalseValue();
const DataLayout &DL = SI.getModule()->getDataLayout();
- bool TDerefable = isDereferenceablePointer(TValue, DL);
- bool FDerefable = isDereferenceablePointer(FValue, DL);
for (User *U : SI.users()) {
LoadInst *LI = dyn_cast<LoadInst>(U);
@@ -1273,11 +1257,9 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
// Both operands to the select need to be dereferencable, either
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
- if (!TDerefable &&
- !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
+ if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
return false;
- if (!FDerefable &&
- !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
+ if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI))
return false;
}
@@ -1570,7 +1552,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
Ptr = cast<Operator>(Ptr)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
- if (GA->mayBeOverridden())
+ if (GA->isInterposable())
break;
Ptr = GA->getAliasee();
} else {
@@ -1653,8 +1635,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
OldTy = OldTy->getScalarType();
NewTy = NewTy->getScalarType();
if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
- if (NewTy->isPointerTy() && OldTy->isPointerTy())
- return true;
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
+ cast<PointerType>(OldTy)->getPointerAddressSpace();
+ }
if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
return true;
return false;
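// Illustration (not part of the patch): the tightened rule in canConvertValue
// above, as a standalone sketch. AddrSpace stands in for
// PointerType::getPointerAddressSpace(); pointers in different address spaces
// may differ in size or semantics, so only a same-address-space cast may be
// treated as a value-preserving no-op.
struct PtrTy { unsigned AddrSpace; };

bool canTreatPtrCastAsNoop(PtrTy Old, PtrTy New) {
  return Old.AddrSpace == New.AddrSpace;
}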
@@ -3123,9 +3107,14 @@ private:
void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
assert(Ty->isSingleValueType());
// Extract the single value and store it using the indices.
- Value *Store = IRB.CreateStore(
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
- IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
(void)Store;
DEBUG(dbgs() << " to: " << *Store << "\n");
}
@@ -3380,11 +3369,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
for (auto &P : AS.partitions()) {
for (Slice &S : P) {
Instruction *I = cast<Instruction>(S.getUse()->getUser());
- if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
- // If this was a load we have to track that it can't participate in any
- // pre-splitting!
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this is a load we have to track that it can't participate in any
+ // pre-splitting. If this is a store of a load we have to track that
+ // that load also can't participate in any pre-splitting.
if (auto *LI = dyn_cast<LoadInst>(I))
UnsplittableLoads.insert(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
+ UnsplittableLoads.insert(LI);
continue;
}
assert(P.endOffset() > S.beginOffset() &&
@@ -3411,9 +3404,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
Loads.push_back(LI);
- } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
- if (!SI ||
- S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
continue;
auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
if (!StoredLoad || !StoredLoad->isSimple())
@@ -3937,15 +3930,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Worklist.insert(NewAI);
}
} else {
- // If we can't promote the alloca, iterate on it to check for new
- // refinements exposed by splitting the current alloca. Don't iterate on an
- // alloca which didn't actually change and didn't get promoted.
- if (NewAI != &AI)
- Worklist.insert(NewAI);
-
// Drop any post-promotion work items if promotion didn't happen.
while (PostPromotionWorklist.size() > PPWOldSize)
PostPromotionWorklist.pop_back();
+
+ // We couldn't promote and we didn't create a new partition, so nothing
+ // happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
}
return NewAI;
@@ -4024,12 +4021,12 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
auto *Var = DbgDecl->getVariable();
auto *Expr = DbgDecl->getExpression();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
- bool IsSplit = Pieces.size() > 1;
+ uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
for (auto Piece : Pieces) {
// Create a piece expression describing the new partition or reuse AI's
// expression if there is only one partition.
auto *PieceExpr = Expr;
- if (IsSplit || Expr->isBitPiece()) {
+ if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
// If this alloca is already a scalar replacement of a larger aggregate,
// Piece.Offset describes the offset inside the scalar.
uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
@@ -4043,6 +4040,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
Size = std::min(Size, AbsEnd - Start);
}
PieceExpr = DIB.createBitPieceExpression(Start, Size);
+ } else {
+ assert(Pieces.size() == 1 &&
+ "partition is as large as original alloca");
}
// Remove any existing dbg.declare intrinsic describing the same alloca.
@@ -4237,14 +4237,19 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
PostPromotionWorklist.clear();
} while (!Worklist.empty());
+ if (!Changed)
+ return PreservedAnalyses::all();
+
// FIXME: Even when promoting allocas we should preserve some abstract set of
// CFG-specific analyses.
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
}
-PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) {
- return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F),
- AM->getResult<AssumptionAnalysis>(F));
+PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> &AM) {
+ return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F));
}
/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4260,7 +4265,7 @@ public:
initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
auto PA = Impl.runImpl(
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 52d477cc9573..f235b12e49cc 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
@@ -31,49 +32,52 @@ using namespace llvm;
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCELegacyPassPass(Registry);
- initializeBDCEPass(Registry);
+ initializeBDCELegacyPassPass(Registry);
initializeAlignmentFromAssumptionsPass(Registry);
- initializeConstantHoistingPass(Registry);
+ initializeConstantHoistingLegacyPassPass(Registry);
initializeConstantPropagationPass(Registry);
initializeCorrelatedValuePropagationPass(Registry);
- initializeDCEPass(Registry);
+ initializeDCELegacyPassPass(Registry);
initializeDeadInstEliminationPass(Registry);
initializeScalarizerPass(Registry);
- initializeDSEPass(Registry);
- initializeGVNPass(Registry);
+ initializeDSELegacyPassPass(Registry);
+ initializeGuardWideningLegacyPassPass(Registry);
+ initializeGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
+ initializeGVNHoistLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
- initializeIndVarSimplifyPass(Registry);
+ initializeIndVarSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
- initializeLICMPass(Registry);
- initializeLoopDeletionPass(Registry);
- initializeLoopAccessAnalysisPass(Registry);
- initializeLoopInstSimplifyPass(Registry);
+ initializeLegacyLICMPassPass(Registry);
+ initializeLoopDataPrefetchPass(Registry);
+ initializeLoopDeletionLegacyPassPass(Registry);
+ initializeLoopAccessLegacyAnalysisPass(Registry);
+ initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
- initializeLoopRotatePass(Registry);
+ initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
initializeLoopUnswitchPass(Registry);
- initializeLoopIdiomRecognizePass(Registry);
- initializeLowerAtomicPass(Registry);
+ initializeLoopVersioningLICMPass(Registry);
+ initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+ initializeLowerAtomicLegacyPassPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
- initializeMemCpyOptPass(Registry);
- initializeMergedLoadStoreMotionPass(Registry);
+ initializeLowerGuardIntrinsicPass(Registry);
+ initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergedLoadStoreMotionLegacyPassPass(Registry);
initializeNaryReassociatePass(Registry);
- initializePartiallyInlineLibCallsPass(Registry);
- initializeReassociatePass(Registry);
+ initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeReassociateLegacyPassPass(Registry);
initializeRegToMemPass(Registry);
initializeRewriteStatepointsForGCPass(Registry);
- initializeSCCPPass(Registry);
- initializeIPSCCPPass(Registry);
+ initializeSCCPLegacyPassPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
- initializeSROA_DTPass(Registry);
- initializeSROA_SSAUpPass(Registry);
initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
- initializeSinkingPass(Registry);
+ initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
initializeSpeculativeExecutionPass(Registry);
@@ -81,9 +85,11 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoadCombinePass(Registry);
initializePlaceBackedgeSafepointsImplPass(Registry);
initializePlaceSafepointsPass(Registry);
- initializeFloat2IntPass(Registry);
- initializeLoopDistributePass(Registry);
+ initializeFloat2IntLegacyPassPass(Registry);
+ initializeLoopDistributeLegacyPass(Registry);
initializeLoopLoadEliminationPass(Registry);
+ initializeLoopSimplifyCFGLegacyPassPass(Registry);
+ initializeLoopVersioningPassPass(Registry);
}
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
@@ -154,6 +160,10 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopRerollPass());
}
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
@@ -187,16 +197,16 @@ void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
}
void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScalarReplAggregatesPass());
+ unwrap(PM)->add(createSROAPass());
}
void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScalarReplAggregatesPass(-1, false));
+ unwrap(PM)->add(createSROAPass());
}
void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
int Threshold) {
- unwrap(PM)->add(createScalarReplAggregatesPass(Threshold));
+ unwrap(PM)->add(createSROAPass());
}
void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
@@ -227,6 +237,10 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createEarlyCSEPass());
}
+void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNHoistPass());
+}
+
void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createTypeBasedAAWrapperPass());
}
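// Illustration (not part of the patch): the Scalar.cpp hunks above keep the
// three LLVMAddScalarReplAggregatesPass* C entry points but route them all to
// createSROAPass(), so the threshold is accepted for compatibility and then
// ignored. A hedged usage sketch of the C API after this change.
#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

void addSROAViaCAPI(LLVMPassManagerRef PM) {
  LLVMAddScalarReplAggregatesPass(PM);                   // now schedules SROA
  LLVMAddScalarReplAggregatesPassWithThreshold(PM, 128); // 128 is ignored
}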
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
deleted file mode 100644
index 114d22ddf2e4..000000000000
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ /dev/null
@@ -1,2630 +0,0 @@
-//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation implements the well known scalar replacement of
-// aggregates transformation. This xform breaks up alloca instructions of
-// aggregate type (structure or array) into individual alloca instructions for
-// each member (if possible). Then, if possible, it transforms the individual
-// alloca instructions into nice clean scalar SSA form.
-//
-// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
-// often interact, especially for C++ programs. As such, iterating between
-// SRoA, then Mem2Reg until we run out of things to promote works well.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "scalarrepl"
-
-STATISTIC(NumReplaced, "Number of allocas broken up");
-STATISTIC(NumPromoted, "Number of allocas promoted");
-STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion");
-STATISTIC(NumConverted, "Number of aggregates converted to scalar");
-
-namespace {
-#define SROA SROA_
- struct SROA : public FunctionPass {
- SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
- : FunctionPass(ID), HasDomTree(hasDT) {
- if (T == -1)
- SRThreshold = 128;
- else
- SRThreshold = T;
- if (ST == -1)
- StructMemberThreshold = 32;
- else
- StructMemberThreshold = ST;
- if (AT == -1)
- ArrayElementThreshold = 8;
- else
- ArrayElementThreshold = AT;
- if (SLT == -1)
- // Do not limit the scalar integer load size if no threshold is given.
- ScalarLoadThreshold = -1;
- else
- ScalarLoadThreshold = SLT;
- }
-
- bool runOnFunction(Function &F) override;
-
- bool performScalarRepl(Function &F);
- bool performPromotion(Function &F);
-
- private:
- bool HasDomTree;
-
- /// DeadInsts - Keep track of instructions we have made dead, so that
- /// we can remove them after we are done working.
- SmallVector<Value*, 32> DeadInsts;
-
- /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
- /// information about the uses. All these fields are initialized to false
- /// and set to true when something is learned.
- struct AllocaInfo {
- /// The alloca to promote.
- AllocaInst *AI;
-
- /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
- /// looping and avoid redundant work.
- SmallPtrSet<PHINode*, 8> CheckedPHIs;
-
- /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
- bool isUnsafe : 1;
-
- /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
- bool isMemCpySrc : 1;
-
- /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
- bool isMemCpyDst : 1;
-
- /// hasSubelementAccess - This is true if a subelement of the alloca is
- /// ever accessed, or false if the alloca is only accessed with mem
- /// intrinsics or load/store that only access the entire alloca at once.
- bool hasSubelementAccess : 1;
-
- /// hasALoadOrStore - This is true if there are any loads or stores to it.
- /// The alloca may just be accessed with memcpy, for example, which would
- /// not set this.
- bool hasALoadOrStore : 1;
-
- explicit AllocaInfo(AllocaInst *ai)
- : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
- hasSubelementAccess(false), hasALoadOrStore(false) {}
- };
-
- /// SRThreshold - The maximum alloca size to considered for SROA.
- unsigned SRThreshold;
-
- /// StructMemberThreshold - The maximum number of members a struct can
- /// contain to be considered for SROA.
- unsigned StructMemberThreshold;
-
- /// ArrayElementThreshold - The maximum number of elements an array can
- /// have to be considered for SROA.
- unsigned ArrayElementThreshold;
-
- /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
- /// converting to scalar
- unsigned ScalarLoadThreshold;
-
- void MarkUnsafe(AllocaInfo &I, Instruction *User) {
- I.isUnsafe = true;
- DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n');
- }
-
- bool isSafeAllocaToScalarRepl(AllocaInst *AI);
-
- void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
- void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
- AllocaInfo &Info);
- void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
- void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
- Type *MemOpType, bool isStore, AllocaInfo &Info,
- Instruction *TheAccess, bool AllowWholeAccess);
- bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
- const DataLayout &DL);
- uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
- const DataLayout &DL);
-
- void DoScalarReplacement(AllocaInst *AI,
- std::vector<AllocaInst*> &WorkList);
- void DeleteDeadInstructions();
-
- void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
- uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
- AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts);
- void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts);
- bool ShouldAttemptScalarRepl(AllocaInst *AI);
- };
-
- // SROA_DT - SROA that uses DominatorTree.
- struct SROA_DT : public SROA {
- static char ID;
- public:
- SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
- SROA(T, true, ID, ST, AT, SLT) {
- initializeSROA_DTPass(*PassRegistry::getPassRegistry());
- }
-
- // getAnalysisUsage - This pass requires the DominatorTree and AssumptionCache
- // analyses, and we know it will not alter the CFG, so say so.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
- }
- };
-
- // SROA_SSAUp - SROA that uses SSAUpdater.
- struct SROA_SSAUp : public SROA {
- static char ID;
- public:
- SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
- SROA(T, false, ID, ST, AT, SLT) {
- initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
- }
-
- // getAnalysisUsage - This pass only requires the AssumptionCache analysis,
- // and we know it will not alter the CFG, so say so.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.setPreservesCFG();
- }
- };
-
-}
-
-char SROA_DT::ID = 0;
-char SROA_SSAUp::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
- "Scalar Replacement of Aggregates (DT)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
- "Scalar Replacement of Aggregates (DT)", false, false)
-
-INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
- "Scalar Replacement of Aggregates (SSAUp)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
- "Scalar Replacement of Aggregates (SSAUp)", false, false)
-
-// Public interface to the ScalarReplAggregates pass
-FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
- bool UseDomTree,
- int StructMemberThreshold,
- int ArrayElementThreshold,
- int ScalarLoadThreshold) {
- if (UseDomTree)
- return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
- ScalarLoadThreshold);
- return new SROA_SSAUp(Threshold, StructMemberThreshold,
- ArrayElementThreshold, ScalarLoadThreshold);
-}
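-
-// Illustrative usage sketch (hypothetical caller, not part of this file): a
-// legacy pass pipeline would typically add the DominatorTree-based variant
-// with all thresholds left at -1, which selects the defaults set in the SROA
-// constructor above (128, 32, 8, and an unlimited scalar load size):
-//
-//   llvm::legacy::PassManager PM;
-//   PM.add(llvm::createScalarReplAggregatesPass(-1, /*UseDomTree=*/true,
-//                                               -1, -1, -1));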
-
-
-//===----------------------------------------------------------------------===//
-// Convert To Scalar Optimization.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
-/// optimization, which scans the uses of an alloca and determines if it can
-/// rewrite it in terms of a single new alloca that can be mem2reg'd.
-class ConvertToScalarInfo {
- /// AllocaSize - The size of the alloca being considered in bytes.
- unsigned AllocaSize;
- const DataLayout &DL;
- unsigned ScalarLoadThreshold;
-
- /// IsNotTrivial - This is set to true if there is some access to the object
- /// which means that mem2reg can't promote it.
- bool IsNotTrivial;
-
- /// ScalarKind - Tracks the kind of alloca being considered for promotion,
- /// computed based on the uses of the alloca rather than the LLVM type system.
- enum {
- Unknown,
-
- // Accesses via GEPs that are consistent with element access of a vector
- // type. This will not be converted into a vector unless there is a later
- // access using an actual vector type.
- ImplicitVector,
-
- // Accesses via vector operations and GEPs that are consistent with the
- // layout of a vector type.
- Vector,
-
- // An integer bag-of-bits with bitwise operations for insertion and
- // extraction. Any combination of types can be converted into this kind
- // of scalar.
- Integer
- } ScalarKind;
-
- /// VectorTy - This tracks the vector type that we should promote the alloca
- /// to if it is possible to turn it into a vector. This starts out null; if
- /// it isn't possible to turn the alloca into a vector, ScalarKind is set to
- /// Integer instead.
- VectorType *VectorTy;
-
- /// HadNonMemTransferAccess - True if there is at least one access to the
- /// alloca that is not a MemTransferInst. We don't want to turn structs into
- /// large integers unless there is some potential for optimization.
- bool HadNonMemTransferAccess;
-
- /// HadDynamicAccess - True if some element of this alloca was dynamic.
- /// We don't yet have support for turning a dynamic access into a large
- /// integer.
- bool HadDynamicAccess;
-
-public:
- explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL,
- unsigned SLT)
- : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false),
- ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false),
- HadDynamicAccess(false) { }
-
- AllocaInst *TryConvert(AllocaInst *AI);
-
-private:
- bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
- void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
- bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
- void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
- Value *NonConstantIdx);
-
- Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
- uint64_t Offset, Value* NonConstantIdx,
- IRBuilder<> &Builder);
- Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
- uint64_t Offset, Value* NonConstantIdx,
- IRBuilder<> &Builder);
-};
-} // end anonymous namespace.
-
-
-/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
-/// rewrite it to be a new alloca which is mem2reg'able. This returns the new
-/// alloca if possible or null if not.
-AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
- // If we can't convert this scalar, or if mem2reg can trivially do it, bail
- // out.
- if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial)
- return nullptr;
-
- // If an alloca has only memset / memcpy uses, it may still have an Unknown
- // ScalarKind. Treat it as an Integer below.
- if (ScalarKind == Unknown)
- ScalarKind = Integer;
-
- if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
- ScalarKind = Integer;
-
- // If we were able to find a vector type that can handle this with
- // insert/extract elements, and if there was at least one use that had
- // a vector type, promote this to a vector. We don't want to promote
- // random stuff that doesn't use vectors (e.g. <9 x double>) because then
- // we just get a lot of insert/extracts. If at least one vector is
- // involved, then we probably really do have a union of vector/array.
- Type *NewTy;
- if (ScalarKind == Vector) {
- assert(VectorTy && "Missing type for vector scalar.");
- DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = "
- << *VectorTy << '\n');
- NewTy = VectorTy; // Use the vector type.
- } else {
- unsigned BitWidth = AllocaSize * 8;
-
- // Do not convert to scalar integer if the alloca size exceeds the
- // scalar load threshold.
- if (BitWidth > ScalarLoadThreshold)
- return nullptr;
-
- if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
- !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth))
- return nullptr;
- // Dynamic accesses on integers aren't yet supported. They need us to shift
- // by a dynamic amount which could be difficult to work out as we might not
- // know whether to use a left or right shift.
- if (ScalarKind == Integer && HadDynamicAccess)
- return nullptr;
-
- DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
- // Create and insert the integer alloca.
- NewTy = IntegerType::get(AI->getContext(), BitWidth);
- }
- AllocaInst *NewAI =
- new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front());
- ConvertUsesToScalar(AI, NewAI, 0, nullptr);
- return NewAI;
-}
-
-/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
-/// (VectorTy) so far at the offset specified by Offset (which is specified in
-/// bytes).
-///
-/// There are two cases we handle here:
-/// 1) A union of vector types of the same size and potentially its elements.
-/// Here we turn element accesses into insert/extract element operations.
-/// This promotes a <4 x float> with a store of float to the third element
-/// into a <4 x float> that uses insert element.
-/// 2) A fully general blob of memory, which we turn into some (potentially
-/// large) integer type with extract and insert operations where the loads
-/// and stores would mutate the memory. We mark this by setting ScalarKind
-/// to Integer.
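-///
-/// As a small illustration of case 1 (hypothetical IR, not from a test case):
-///   %A = alloca <4 x float>
-///   %P = getelementptr <4 x float>* %A, i32 0, i32 2
-///   store float %f, float* %P
-///   %V = load <4 x float>* %A
-/// The float store at byte offset 8 is consistent with an element of
-/// <4 x float>, and the full-width vector load upgrades ScalarKind to Vector,
-/// so the store can later be rewritten as an insertelement at index 2.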
-void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
- uint64_t Offset) {
- // If we already decided to turn this into a blob of integer memory, there is
- // nothing to be done.
- if (ScalarKind == Integer)
- return;
-
- // If this could be contributing to a vector, analyze it.
-
- // If the In type is a vector that is the same size as the alloca, see if it
- // matches the existing VecTy.
- if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
- if (MergeInVectorType(VInTy, Offset))
- return;
- } else if (In->isFloatTy() || In->isDoubleTy() ||
- (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
- isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
- // Full width accesses can be ignored, because they can always be turned
- // into bitcasts.
- unsigned EltSize = In->getPrimitiveSizeInBits()/8;
- if (EltSize == AllocaSize)
- return;
-
- // If we're accessing something that could be an element of a vector, see
- // if the implied vector agrees with what we already have and if Offset is
- // compatible with it.
- if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
- (!VectorTy || EltSize == VectorTy->getElementType()
- ->getPrimitiveSizeInBits()/8)) {
- if (!VectorTy) {
- ScalarKind = ImplicitVector;
- VectorTy = VectorType::get(In, AllocaSize/EltSize);
- }
- return;
- }
- }
-
- // Otherwise, we have a case that we can't handle with an optimized vector
- // form. We can still turn this into a large integer.
- ScalarKind = Integer;
-}
-
-/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
-/// returning true if the type was successfully merged and false otherwise.
-bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
- uint64_t Offset) {
- if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
- // If we're storing/loading a vector of the right size, allow it as a
- // vector. If this is the first vector we see, remember the type so that
- // we know the element size. If this is a subsequent access, ignore it
- // even if it is a differing type but the same size. Worst case we can
- // bitcast the resultant vectors.
- if (!VectorTy)
- VectorTy = VInTy;
- ScalarKind = Vector;
- return true;
- }
-
- return false;
-}
-
-/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all
-/// of its accesses to a single vector type, return true and set VectorTy to
-/// that type. If the accesses can only be handled by converting the alloca
-/// into a single promotable integer, return true with ScalarKind set to
-/// Integer. Further, if a use is not a completely trivial use that mem2reg
-/// could promote, set IsNotTrivial. Offset is the current offset from the
-/// base of the alloca being analyzed.
-///
-/// If we see at least one access to the value with an actual vector type,
-/// ScalarKind is set to Vector.
-bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
- Value* NonConstantIdx) {
- for (User *U : V->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
- // Don't break volatile loads.
- if (!LI->isSimple())
- return false;
- // Don't touch MMX operations.
- if (LI->getType()->isX86_MMXTy())
- return false;
- HadNonMemTransferAccess = true;
- MergeInTypeForLoadOrStore(LI->getType(), Offset);
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Storing the pointer, not into the value?
- if (SI->getOperand(0) == V || !SI->isSimple()) return false;
- // Don't touch MMX operations.
- if (SI->getOperand(0)->getType()->isX86_MMXTy())
- return false;
- HadNonMemTransferAccess = true;
- MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
- continue;
- }
-
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) {
- if (!onlyUsedByLifetimeMarkers(BCI))
- IsNotTrivial = true; // Can't be mem2reg'd.
- if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
- return false;
- continue;
- }
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) {
- // If this is a GEP with variable indices (other than a single trailing i32
- // index into a vector, handled below), we can't handle it.
- PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
- if (!PtrTy)
- return false;
-
- // Compute the offset that this GEP adds to the pointer.
- SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
- Value *GEPNonConstantIdx = nullptr;
- if (!GEP->hasAllConstantIndices()) {
- if (!isa<VectorType>(PtrTy->getElementType()))
- return false;
- if (NonConstantIdx)
- return false;
- GEPNonConstantIdx = Indices.pop_back_val();
- if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
- return false;
- HadDynamicAccess = true;
- } else
- GEPNonConstantIdx = NonConstantIdx;
- uint64_t GEPOffset = DL.getIndexedOffset(PtrTy,
- Indices);
- // See if all uses can be converted.
- if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
- return false;
- IsNotTrivial = true; // Can't be mem2reg'd.
- HadNonMemTransferAccess = true;
- continue;
- }
-
- // If this is a constant sized memset of a constant value (e.g. 0) we can
- // handle it.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) {
- // Store to dynamic index.
- if (NonConstantIdx)
- return false;
- // Store of constant value.
- if (!isa<ConstantInt>(MSI->getValue()))
- return false;
-
- // Store of constant size.
- ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength());
- if (!Len)
- return false;
-
- // If the size differs from the alloca, we can only convert the alloca to
- // an integer bag-of-bits.
- // FIXME: This should handle all of the cases that are currently accepted
- // as vector element insertions.
- if (Len->getZExtValue() != AllocaSize || Offset != 0)
- ScalarKind = Integer;
-
- IsNotTrivial = true; // Can't be mem2reg'd.
- HadNonMemTransferAccess = true;
- continue;
- }
-
- // If this is a memcpy or memmove into or out of the whole allocation, we
- // can handle it like a load or store of the scalar type.
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) {
- // Store to dynamic index.
- if (NonConstantIdx)
- return false;
- ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
- if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0)
- return false;
-
- IsNotTrivial = true; // Can't be mem2reg'd.
- continue;
- }
-
- // If this is a lifetime intrinsic, we can handle it.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- continue;
- }
- }
-
- // Otherwise, we cannot handle this!
- return false;
- }
-
- return true;
-}
-
-/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
-/// directly. This happens when we are converting an "integer union" to a
-/// single integer scalar, or when we are converting a "vector union" to a
-/// vector with insert/extractelement instructions.
-///
-/// Offset is an offset from the original alloca, in bits that need to be
-/// shifted to the right. By the end of this, there should be no uses of Ptr.
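-///
-/// For example (illustrative only), a load of an i16 at byte offset 2 of an
-/// alloca that was converted to a single i32 becomes, on a little-endian
-/// target:
-///   %tmp = load i32* %NewAI
-///   %shr = lshr i32 %tmp, 16
-///   %val = trunc i32 %shr to i16
-/// and the original load is replaced by %val.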
-void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
- uint64_t Offset,
- Value* NonConstantIdx) {
- while (!Ptr->use_empty()) {
- Instruction *User = cast<Instruction>(Ptr->user_back());
-
- if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
- ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
- CI->eraseFromParent();
- continue;
- }
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
- // Compute the offset that this GEP adds to the pointer.
- SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
- Value* GEPNonConstantIdx = nullptr;
- if (!GEP->hasAllConstantIndices()) {
- assert(!NonConstantIdx &&
- "Dynamic GEP reading from dynamic GEP unsupported");
- GEPNonConstantIdx = Indices.pop_back_val();
- } else
- GEPNonConstantIdx = NonConstantIdx;
- uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(),
- Indices);
- ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
- GEP->eraseFromParent();
- continue;
- }
-
- IRBuilder<> Builder(User);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- // The load is a bit extract from NewAI shifted right by Offset bits.
- Value *LoadedVal = Builder.CreateLoad(NewAI);
- Value *NewLoadVal
- = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
- NonConstantIdx, Builder);
- LI->replaceAllUsesWith(NewLoadVal);
- LI->eraseFromParent();
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- assert(SI->getOperand(0) != Ptr && "Consistency error!");
- Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
- Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
- NonConstantIdx, Builder);
- Builder.CreateStore(New, NewAI);
- SI->eraseFromParent();
-
- // If the load we just inserted is now dead, then the inserted store
- // overwrote the entire thing.
- if (Old->use_empty())
- Old->eraseFromParent();
- continue;
- }
-
- // If this is a constant sized memset of a constant value (e.g. 0) we can
- // transform it into a store of the expanded constant value.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
- assert(MSI->getRawDest() == Ptr && "Consistency error!");
- assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
- int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
- if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
- unsigned NumBytes = static_cast<unsigned>(SNumBytes);
- unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
-
- // Compute the value replicated the right number of times.
- APInt APVal(NumBytes*8, Val);
-
- // Splat the value if non-zero.
- if (Val)
- for (unsigned i = 1; i != NumBytes; ++i)
- APVal |= APVal << 8;
-
- Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
- Value *New = ConvertScalar_InsertValue(
- ConstantInt::get(User->getContext(), APVal),
- Old, Offset, nullptr, Builder);
- Builder.CreateStore(New, NewAI);
-
- // If the load we just inserted is now dead, then the memset overwrote
- // the entire thing.
- if (Old->use_empty())
- Old->eraseFromParent();
- }
- MSI->eraseFromParent();
- continue;
- }
-
- // If this is a memcpy or memmove into or out of the whole allocation, we
- // can handle it like a load or store of the scalar type.
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
- assert(Offset == 0 && "must be store to start of alloca");
- assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
-
- // If the source and destination both refer to the same alloca, then this is
- // a noop copy-to-self; just delete it. Otherwise, emit a load and store
- // as appropriate.
- AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0));
-
- if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) {
- // Dest must be OrigAI, change this to be a load from the original
- // pointer (bitcasted), then a store to our new alloca.
- assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
- Value *SrcPtr = MTI->getSource();
- PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
- PointerType* AIPTy = cast<PointerType>(NewAI->getType());
- if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
- AIPTy = PointerType::get(AIPTy->getElementType(),
- SPTy->getAddressSpace());
- }
- SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);
-
- LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
- SrcVal->setAlignment(MTI->getAlignment());
- Builder.CreateStore(SrcVal, NewAI);
- } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) {
- // Src must be OrigAI, change this to be a load from NewAI then a store
- // through the original dest pointer (bitcasted).
- assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
- LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
-
- PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
- PointerType* AIPTy = cast<PointerType>(NewAI->getType());
- if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
- AIPTy = PointerType::get(AIPTy->getElementType(),
- DPTy->getAddressSpace());
- }
- Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);
-
- StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
- NewStore->setAlignment(MTI->getAlignment());
- } else {
- // Noop transfer. Src == Dst
- }
-
- MTI->eraseFromParent();
- continue;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- // There's no need to preserve these, as the resulting alloca will be
- // converted to a register anyways.
- II->eraseFromParent();
- continue;
- }
- }
-
- llvm_unreachable("Unsupported operation!");
- }
-}
-
-/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
-/// or vector value FromVal, extracting the bits from the offset specified by
-/// Offset. This returns the value, which is of type ToType.
-///
-/// This happens when we are converting an "integer union" to a single
-/// integer scalar, or when we are converting a "vector union" to a vector with
-/// insert/extractelement instructions.
-///
-/// Offset is an offset from the original alloca, in bits that need to be
-/// shifted to the right.
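-///
-/// As a worked illustration (hypothetical values, little-endian target),
-/// extracting a float at bit offset 32 from an i64 bag-of-bits produces:
-///   %shr = lshr i64 %FromVal, 32
-///   %trunc = trunc i64 %shr to i32
-///   %res = bitcast i32 %trunc to float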
-Value *ConvertToScalarInfo::
-ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
- uint64_t Offset, Value* NonConstantIdx,
- IRBuilder<> &Builder) {
- // If the load is of the whole new alloca, no conversion is needed.
- Type *FromType = FromVal->getType();
- if (FromType == ToType && Offset == 0)
- return FromVal;
-
- // If the result alloca is a vector type, this is either an element
- // access or a bitcast to another vector type of the same size.
- if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
- unsigned FromTypeSize = DL.getTypeAllocSize(FromType);
- unsigned ToTypeSize = DL.getTypeAllocSize(ToType);
- if (FromTypeSize == ToTypeSize)
- return Builder.CreateBitCast(FromVal, ToType);
-
- // Otherwise it must be an element access.
- unsigned Elt = 0;
- if (Offset) {
- unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
- Elt = Offset/EltSize;
- assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
- }
- // Return the element extracted out of it.
- Value *Idx;
- if (NonConstantIdx) {
- if (Elt)
- Idx = Builder.CreateAdd(NonConstantIdx,
- Builder.getInt32(Elt),
- "dyn.offset");
- else
- Idx = NonConstantIdx;
- } else
- Idx = Builder.getInt32(Elt);
- Value *V = Builder.CreateExtractElement(FromVal, Idx);
- if (V->getType() != ToType)
- V = Builder.CreateBitCast(V, ToType);
- return V;
- }
-
- // If ToType is a first class aggregate, extract out each of the pieces and
- // use insertvalue's to form the FCA.
- if (StructType *ST = dyn_cast<StructType>(ToType)) {
- assert(!NonConstantIdx &&
- "Dynamic indexing into struct types not supported");
- const StructLayout &Layout = *DL.getStructLayout(ST);
- Value *Res = UndefValue::get(ST);
- for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
- Offset+Layout.getElementOffsetInBits(i),
- nullptr, Builder);
- Res = Builder.CreateInsertValue(Res, Elt, i);
- }
- return Res;
- }
-
- if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
- assert(!NonConstantIdx &&
- "Dynamic indexing into array types not supported");
- uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
- Value *Res = UndefValue::get(AT);
- for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
- Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
- Offset+i*EltSize, nullptr,
- Builder);
- Res = Builder.CreateInsertValue(Res, Elt, i);
- }
- return Res;
- }
-
- // Otherwise, this must be a union that was converted to an integer value.
- IntegerType *NTy = cast<IntegerType>(FromVal->getType());
-
- // If this is a big-endian system and the load is narrower than the
- // full alloca type, we need to do a shift to get the right bits.
- int ShAmt = 0;
- if (DL.isBigEndian()) {
- // On big-endian machines, the lowest bit is stored at the bit offset
- // from the pointer given by getTypeStoreSizeInBits. This matters for
- // integers with a bitwidth that is not a multiple of 8.
- ShAmt = DL.getTypeStoreSizeInBits(NTy) -
- DL.getTypeStoreSizeInBits(ToType) - Offset;
- } else {
- ShAmt = Offset;
- }
-
- // Note: ShAmt may be negative; a negative amount is handled by shifting the
- // other way (shl). We do this to support (e.g.) loads off the end of a
- // structure where only some bits are used.
- if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
- FromVal = Builder.CreateLShr(FromVal,
- ConstantInt::get(FromVal->getType(), ShAmt));
- else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
- FromVal = Builder.CreateShl(FromVal,
- ConstantInt::get(FromVal->getType(), -ShAmt));
-
- // Finally, unconditionally truncate the integer to the right width.
- unsigned LIBitWidth = DL.getTypeSizeInBits(ToType);
- if (LIBitWidth < NTy->getBitWidth())
- FromVal =
- Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
- LIBitWidth));
- else if (LIBitWidth > NTy->getBitWidth())
- FromVal =
- Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
- LIBitWidth));
-
- // If the result is an integer, this is a trunc or bitcast.
- if (ToType->isIntegerTy()) {
- // Should be done.
- } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
- // Just do a bitcast, we know the sizes match up.
- FromVal = Builder.CreateBitCast(FromVal, ToType);
- } else {
- // Otherwise must be a pointer.
- FromVal = Builder.CreateIntToPtr(FromVal, ToType);
- }
- assert(FromVal->getType() == ToType && "Didn't convert right?");
- return FromVal;
-}
-
-/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
-/// or vector value "Old" at the offset specified by Offset.
-///
-/// This happens when we are converting an "integer union" to a
-/// single integer scalar, or when we are converting a "vector union" to a
-/// vector with insert/extractelement instructions.
-///
-/// Offset is an offset from the original alloca, in bits that need to be
-/// shifted to the right.
-///
-/// NonConstantIdx is an index value if there was a GEP with a non-constant
-/// index value. If this is 0 then all GEPs used to find this insert address
-/// are constant.
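-///
-/// As a worked illustration (hypothetical values, little-endian target),
-/// inserting an i8 value at bit offset 8 into an existing i32 produces:
-///   %ext = zext i8 %SV to i32
-///   %shl = shl i32 %ext, 8
-///   %mask = and i32 %Old, -65281        ; clear bits 8..15 (~0xFF00)
-///   %ins = or i32 %mask, %shl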
-Value *ConvertToScalarInfo::
-ConvertScalar_InsertValue(Value *SV, Value *Old,
- uint64_t Offset, Value* NonConstantIdx,
- IRBuilder<> &Builder) {
- // Convert the stored type to the actual type, shift it left to insert
- // then 'or' into place.
- Type *AllocaType = Old->getType();
- LLVMContext &Context = Old->getContext();
-
- if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
- uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy);
- uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType());
-
- // Changing the whole vector with memset or with an access of a different
- // vector type?
- if (ValSize == VecSize)
- return Builder.CreateBitCast(SV, AllocaType);
-
- // Must be an element insertion.
- Type *EltTy = VTy->getElementType();
- if (SV->getType() != EltTy)
- SV = Builder.CreateBitCast(SV, EltTy);
- uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy);
- unsigned Elt = Offset/EltSize;
- Value *Idx;
- if (NonConstantIdx) {
- if (Elt)
- Idx = Builder.CreateAdd(NonConstantIdx,
- Builder.getInt32(Elt),
- "dyn.offset");
- else
- Idx = NonConstantIdx;
- } else
- Idx = Builder.getInt32(Elt);
- return Builder.CreateInsertElement(Old, SV, Idx);
- }
-
- // If SV is a first-class aggregate value, insert each value recursively.
- if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
- assert(!NonConstantIdx &&
- "Dynamic indexing into struct types not supported");
- const StructLayout &Layout = *DL.getStructLayout(ST);
- for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- Value *Elt = Builder.CreateExtractValue(SV, i);
- Old = ConvertScalar_InsertValue(Elt, Old,
- Offset+Layout.getElementOffsetInBits(i),
- nullptr, Builder);
- }
- return Old;
- }
-
- if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
- assert(!NonConstantIdx &&
- "Dynamic indexing into array types not supported");
- uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
- for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
- Value *Elt = Builder.CreateExtractValue(SV, i);
- Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr,
- Builder);
- }
- return Old;
- }
-
- // If SV is a float or vector, bitcast it to the appropriate integer type.
- // If it is a pointer, convert it with ptrtoint.
- unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType());
- unsigned DestWidth = DL.getTypeSizeInBits(AllocaType);
- unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType());
- unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType);
- if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
- SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
- else if (SV->getType()->isPointerTy())
- SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType()));
-
- // Zero extend or truncate the value if needed.
- if (SV->getType() != AllocaType) {
- if (SV->getType()->getPrimitiveSizeInBits() <
- AllocaType->getPrimitiveSizeInBits())
- SV = Builder.CreateZExt(SV, AllocaType);
- else {
- // Truncation may be needed if storing more than the alloca can hold
- // (undefined behavior).
- SV = Builder.CreateTrunc(SV, AllocaType);
- SrcWidth = DestWidth;
- SrcStoreWidth = DestStoreWidth;
- }
- }
-
- // If this is a big-endian system and the store is narrower than the
- // full alloca type, we need to do a shift to get the right bits.
- int ShAmt = 0;
- if (DL.isBigEndian()) {
- // On big-endian machines, the lowest bit is stored at the bit offset
- // from the pointer given by getTypeStoreSizeInBits. This matters for
- // integers with a bitwidth that is not a multiple of 8.
- ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
- } else {
- ShAmt = Offset;
- }
-
- // Note: ShAmt may be negative; a negative amount is handled by shifting the
- // other way (lshr). We do this to support (e.g.) stores off the end of a
- // structure where only some bits in the structure are set.
- APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
- if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
- SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
- Mask <<= ShAmt;
- } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
- SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
- Mask = Mask.lshr(-ShAmt);
- }
-
- // Mask out the bits we are about to insert from the old value, and or
- // in the new bits.
- if (SrcWidth != DestWidth) {
- assert(DestWidth > SrcWidth);
- Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
- SV = Builder.CreateOr(Old, SV, "ins");
- }
- return SV;
-}
-
-
-//===----------------------------------------------------------------------===//
-// SRoA Driver
-//===----------------------------------------------------------------------===//
-
-
-bool SROA::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- bool Changed = performPromotion(F);
-
- while (1) {
- bool LocalChange = performScalarRepl(F);
- if (!LocalChange) break; // No need to repromote if no scalarrepl
- Changed = true;
- LocalChange = performPromotion(F);
- if (!LocalChange) break; // No need to re-scalarrepl if no promotion
- }
-
- return Changed;
-}
-
-namespace {
-class AllocaPromoter : public LoadAndStorePromoter {
- AllocaInst *AI;
- DIBuilder *DIB;
- SmallVector<DbgDeclareInst *, 4> DDIs;
- SmallVector<DbgValueInst *, 4> DVIs;
-public:
- AllocaPromoter(ArrayRef<Instruction*> Insts, SSAUpdater &S,
- DIBuilder *DB)
- : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {}
-
- void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
- // Remember which alloca we're promoting (for isInstInList).
- this->AI = AI;
- if (auto *L = LocalAsMetadata::getIfExists(AI)) {
- if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) {
- for (User *U : DINode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
- }
- }
-
- LoadAndStorePromoter::run(Insts);
- AI->eraseFromParent();
- for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(),
- E = DDIs.end(); I != E; ++I) {
- DbgDeclareInst *DDI = *I;
- DDI->eraseFromParent();
- }
- for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(),
- E = DVIs.end(); I != E; ++I) {
- DbgValueInst *DVI = *I;
- DVI->eraseFromParent();
- }
- }
-
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction*> &Insts) const override {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getOperand(0) == AI;
- return cast<StoreInst>(I)->getPointerOperand() == AI;
- }
-
- void updateDebugInfo(Instruction *Inst) const override {
- for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
- E = DDIs.end(); I != E; ++I) {
- DbgDeclareInst *DDI = *I;
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
- ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
- }
- for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
- E = DVIs.end(); I != E; ++I) {
- DbgValueInst *DVI = *I;
- Value *Arg = nullptr;
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- // If an argument is zero- or sign-extended then use the argument directly.
- // The extension may be zapped by an optimization pass in the future.
- if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
- Arg = dyn_cast<Argument>(ZExt->getOperand(0));
- if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
- Arg = dyn_cast<Argument>(SExt->getOperand(0));
- if (!Arg)
- Arg = SI->getOperand(0);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- Arg = LI->getOperand(0);
- } else {
- continue;
- }
- DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(),
- DVI->getExpression(), DVI->getDebugLoc(),
- Inst);
- }
- }
-};
-} // end anon namespace
-
-/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
-/// subsequently loaded can be rewritten to load both input pointers and then
-/// select between the results, allowing the load of the alloca to be promoted.
-/// From this:
-/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// %V2 = load i32* %Other
-/// %V = select i1 %cond, i32 %V1, i32 %V2
-///
-/// We can do this to a select if its only uses are loads and if the operand to
-/// the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst *SI) {
- const DataLayout &DL = SI->getModule()->getDataLayout();
- bool TDerefable = isDereferenceablePointer(SI->getTrueValue(), DL);
- bool FDerefable = isDereferenceablePointer(SI->getFalseValue(), DL);
-
- for (User *U : SI->users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple()) return false;
-
- // Both operands to the select need to be dereferenceable, either absolutely
- // (e.g. allocas) or at this point because we can see other accesses to them.
- if (!TDerefable &&
- !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
- LI->getAlignment()))
- return false;
- if (!FDerefable &&
- !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
- LI->getAlignment()))
- return false;
- }
-
- return true;
-}
-
-/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
-/// subsequently loaded can be rewritten to load both input pointers in the pred
-/// blocks and then PHI the results, allowing the load of the alloca to be
-/// promoted.
-/// From this:
-/// %P2 = phi [i32* %Alloca, i32* %Other]
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// ...
-/// %V2 = load i32* %Other
-/// ...
-/// %V = phi [i32 %V1, i32 %V2]
-///
-/// We can do this to a PHI if its only uses are loads and if the operands to
-/// the PHI can be loaded unconditionally.
-static bool isSafePHIToSpeculate(PHINode *PN) {
- // For now, we can only do this promotion if the load is in the same block as
- // the PHI, and if there are no stores between the phi and load.
- // TODO: Allow recursive phi users.
- // TODO: Allow stores.
- BasicBlock *BB = PN->getParent();
- unsigned MaxAlign = 0;
- for (User *U : PN->users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple()) return false;
-
- // For now we only allow loads in the same block as the PHI. This is a
- // common case that happens when instcombine merges two loads through a PHI.
- if (LI->getParent() != BB) return false;
-
- // Ensure that there are no instructions between the PHI and the load that
- // could store.
- for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
- if (BBI->mayWriteToMemory())
- return false;
-
- MaxAlign = std::max(MaxAlign, LI->getAlignment());
- }
-
- const DataLayout &DL = PN->getModule()->getDataLayout();
-
- // Okay, we know that we have one or more loads in the same block as the PHI.
- // We can transform this if it is safe to push the loads into the predecessor
- // blocks. The only thing to watch out for is that we can't put a possibly
- // trapping load in the predecessor if it is a critical edge.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *Pred = PN->getIncomingBlock(i);
- Value *InVal = PN->getIncomingValue(i);
-
- // If the terminator of the predecessor has side-effects (an invoke),
- // there is no safe place to put a load in the predecessor.
- if (Pred->getTerminator()->mayHaveSideEffects())
- return false;
-
- // If the value is produced by the terminator of the predecessor
- // (an invoke), there is no valid place to put a load in the predecessor.
- if (Pred->getTerminator() == InVal)
- return false;
-
- // If the predecessor has a single successor, then the edge isn't critical.
- if (Pred->getTerminator()->getNumSuccessors() == 1)
- continue;
-
- // If this pointer is always safe to load, or if we can prove that there is
- // already a load in the block, then we can move the load to the pred block.
- if (isDereferenceablePointer(InVal, DL) ||
- isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign))
- continue;
-
- return false;
- }
-
- return true;
-}
-
-
-/// tryToMakeAllocaBePromotable - This returns true if the alloca only has
-/// direct (non-volatile) loads and stores to it. If the alloca is close but
-/// not quite there, this will transform the code to allow promotion. As such,
-/// it is a non-pure predicate.
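-///
-/// For instance (illustrative only), a constant-condition select such as:
-///   %P = select i1 true, i32* %AI, i32* %Other
-/// is folded here by replacing %P with %AI and re-running the analysis, while
-/// a select on a non-constant condition is only kept if
-/// isSafeSelectToSpeculate says its loads can be rewritten.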
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) {
- SetVector<Instruction*, SmallVector<Instruction*, 4>,
- SmallPtrSet<Instruction*, 4> > InstsToRewrite;
- for (User *U : AI->users()) {
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (!LI->isSimple())
- return false;
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == AI || !SI->isSimple())
- return false; // Don't allow a store OF the AI, only INTO the AI.
- continue;
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
- // If the condition being selected on is a constant, fold the select; yes,
- // this does (rarely) happen early on.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
- Value *Result = SI->getOperand(1+CI->isZero());
- SI->replaceAllUsesWith(Result);
- SI->eraseFromParent();
-
- // This is very rare and we just scrambled the use list of AI, start
- // over completely.
- return tryToMakeAllocaBePromotable(AI, DL);
- }
-
- // If it is safe to turn "load (select c, AI, ptr)" into a select of two
- // loads, then we can transform this by rewriting the select.
- if (!isSafeSelectToSpeculate(SI))
- return false;
-
- InstsToRewrite.insert(SI);
- continue;
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(U)) {
- if (PN->use_empty()) { // Dead PHIs can be stripped.
- InstsToRewrite.insert(PN);
- continue;
- }
-
- // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
- // in the pred blocks, then we can transform this by rewriting the PHI.
- if (!isSafePHIToSpeculate(PN))
- return false;
-
- InstsToRewrite.insert(PN);
- continue;
- }
-
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
- if (onlyUsedByLifetimeMarkers(BCI)) {
- InstsToRewrite.insert(BCI);
- continue;
- }
- }
-
- return false;
- }
-
- // If there are no instructions to rewrite, then all uses are load/stores and
- // we're done!
- if (InstsToRewrite.empty())
- return true;
-
- // If we have instructions that need to be rewritten for this to be
- // promotable, take care of it now.
- for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
- // This could only be a bitcast used by nothing but lifetime intrinsics.
- for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end();
- I != E;)
- cast<Instruction>(*I++)->eraseFromParent();
- BCI->eraseFromParent();
- continue;
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
- // Selects in InstsToRewrite only have load uses. Rewrite each as two
- // loads with a new select.
- while (!SI->use_empty()) {
- LoadInst *LI = cast<LoadInst>(SI->user_back());
-
- IRBuilder<> Builder(LI);
- LoadInst *TrueLoad =
- Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
- LoadInst *FalseLoad =
- Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
-
- // Transfer alignment and AA info if present.
- TrueLoad->setAlignment(LI->getAlignment());
- FalseLoad->setAlignment(LI->getAlignment());
-
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
- if (Tags) {
- TrueLoad->setAAMetadata(Tags);
- FalseLoad->setAAMetadata(Tags);
- }
-
- Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
- V->takeName(LI);
- LI->replaceAllUsesWith(V);
- LI->eraseFromParent();
- }
-
- // Now that all the loads are gone, the select is gone too.
- SI->eraseFromParent();
- continue;
- }
-
- // Otherwise, we have a PHI node which allows us to push the loads into the
- // predecessors.
- PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
- if (PN->use_empty()) {
- PN->eraseFromParent();
- continue;
- }
-
- Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
- PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
- PN->getName()+".ld", PN);
-
- // Get the AA tags and alignment to use from one of the loads. It doesn't
- // matter which one we pick, even if they happen to differ.
- LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
-
- AAMDNodes AATags;
- SomeLoad->getAAMetadata(AATags);
- unsigned Align = SomeLoad->getAlignment();
-
- // Rewrite all loads of the PN to use the new PHI.
- while (!PN->use_empty()) {
- LoadInst *LI = cast<LoadInst>(PN->user_back());
- LI->replaceAllUsesWith(NewPN);
- LI->eraseFromParent();
- }
-
- // Inject loads into all of the pred blocks. Keep track of which blocks we
- // insert them into in case we have multiple edges from the same block.
- DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *Pred = PN->getIncomingBlock(i);
- LoadInst *&Load = InsertedLoads[Pred];
- if (!Load) {
- Load = new LoadInst(PN->getIncomingValue(i),
- PN->getName() + "." + Pred->getName(),
- Pred->getTerminator());
- Load->setAlignment(Align);
- if (AATags) Load->setAAMetadata(AATags);
- }
-
- NewPN->addIncoming(Load, Pred);
- }
-
- PN->eraseFromParent();
- }
-
- ++NumAdjusted;
- return true;
-}
-
-bool SROA::performPromotion(Function &F) {
- std::vector<AllocaInst*> Allocas;
- const DataLayout &DL = F.getParent()->getDataLayout();
- DominatorTree *DT = nullptr;
- if (HasDomTree)
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-
- BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
- DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
- bool Changed = false;
- SmallVector<Instruction*, 64> Insts;
- while (1) {
- Allocas.clear();
-
- // Find allocas that are safe to promote, by looking at all instructions in
- // the entry node
- for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
- if (tryToMakeAllocaBePromotable(AI, DL))
- Allocas.push_back(AI);
-
- if (Allocas.empty()) break;
-
- if (HasDomTree)
- PromoteMemToReg(Allocas, *DT, nullptr, &AC);
- else {
- SSAUpdater SSA;
- for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
- AllocaInst *AI = Allocas[i];
-
- // Build list of instructions to promote.
- for (User *U : AI->users())
- Insts.push_back(cast<Instruction>(U));
- AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts);
- Insts.clear();
- }
- }
- NumPromoted += Allocas.size();
- Changed = true;
- }
-
- return Changed;
-}
-
-
-/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
-/// SROA. It must be a struct or array type with a small number of elements.
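-///
-/// With the default thresholds (StructMemberThreshold = 32,
-/// ArrayElementThreshold = 8), for example, an alloca of { i32, i32, float }
-/// is attempted, while an alloca of [64 x i32] is not.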
-bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
- Type *T = AI->getAllocatedType();
- // Do not promote any struct that has too many members.
- if (StructType *ST = dyn_cast<StructType>(T))
- return ST->getNumElements() <= StructMemberThreshold;
- // Do not promote any array that has too many elements.
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements() <= ArrayElementThreshold;
- return false;
-}
-
-// performScalarRepl - This is a simple worklist-driven algorithm that runs on
-// all of the alloca instructions in the entry block, removing those that can
-// be split into per-element allocas or converted to a single scalar value.
-//
-bool SROA::performScalarRepl(Function &F) {
- std::vector<AllocaInst*> WorkList;
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- // Scan the entry basic block, adding allocas to the worklist.
- BasicBlock &BB = F.getEntryBlock();
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
- if (AllocaInst *A = dyn_cast<AllocaInst>(I))
- WorkList.push_back(A);
-
- // Process the worklist
- bool Changed = false;
- while (!WorkList.empty()) {
- AllocaInst *AI = WorkList.back();
- WorkList.pop_back();
-
- // Handle dead allocas trivially. These can be formed by SROA'ing arrays
- // with unused elements.
- if (AI->use_empty()) {
- AI->eraseFromParent();
- Changed = true;
- continue;
- }
-
- // If this alloca is impossible for us to promote, reject it early.
- if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
- continue;
-
- // Check to see if we can perform the core SROA transformation. We cannot
- // transform the allocation instruction if it is an array allocation
- // (allocations OF arrays are ok though), and an allocation of a scalar
- // value cannot be decomposed at all.
- uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
-
- // Do not promote [0 x %struct].
- if (AllocaSize == 0) continue;
-
- // Do not promote any alloca whose size is too big.
- if (AllocaSize > SRThreshold) continue;
-
- // If the alloca looks like a good candidate for scalar replacement, and if
- // all its users can be transformed, then split up the aggregate into its
- // separate elements.
- if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
- DoScalarReplacement(AI, WorkList);
- Changed = true;
- continue;
- }
-
- // See if we can turn this aggregate value (potentially with casts) into a
- // simple scalar value that can be mem2reg'd into a register value.
- // IsNotTrivial tracks whether this is something that mem2reg could have
- // promoted itself. If so, we don't want to transform it needlessly. Note
- // that we can't just check based on the type: the alloca may be of an i32
- // but that has pointer arithmetic to set byte 3 of it or something.
- if (AllocaInst *NewAI =
- ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold)
- .TryConvert(AI)) {
- NewAI->takeName(AI);
- AI->eraseFromParent();
- ++NumConverted;
- Changed = true;
- continue;
- }
-
- // Otherwise, couldn't process this alloca.
- }
-
- return Changed;
-}
-
-/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
-/// predicate, do SROA now.
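-///
-/// For example (names illustrative), an alloca such as:
-///   %agg = alloca { i32, float }
-/// is split into one alloca per element:
-///   %agg.0 = alloca i32
-///   %agg.1 = alloca float
-/// and every use of %agg is then rewritten to target the element allocas.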
-void SROA::DoScalarReplacement(AllocaInst *AI,
- std::vector<AllocaInst*> &WorkList) {
- DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
- SmallVector<AllocaInst*, 32> ElementAllocas;
- if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
- ElementAllocas.reserve(ST->getNumContainedTypes());
- for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
- AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr,
- AI->getAlignment(),
- AI->getName() + "." + Twine(i), AI);
- ElementAllocas.push_back(NA);
- WorkList.push_back(NA); // Add to worklist for recursive processing
- }
- } else {
- ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
- ElementAllocas.reserve(AT->getNumElements());
- Type *ElTy = AT->getElementType();
- for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
- AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(),
- AI->getName() + "." + Twine(i), AI);
- ElementAllocas.push_back(NA);
- WorkList.push_back(NA); // Add to worklist for recursive processing
- }
- }
-
- // Now that we have created the new alloca instructions, rewrite all the
- // uses of the old alloca.
- RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
-
- // Now erase any instructions that were made dead while rewriting the alloca.
- DeleteDeadInstructions();
- AI->eraseFromParent();
-
- ++NumReplaced;
-}
-
-/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
-/// recursively including all their operands that become trivially dead.
-void SROA::DeleteDeadInstructions() {
- while (!DeadInsts.empty()) {
- Instruction *I = cast<Instruction>(DeadInsts.pop_back_val());
-
- for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
- if (Instruction *U = dyn_cast<Instruction>(*OI)) {
- // Zero out the operand and see if it becomes trivially dead.
- // (But, don't add allocas to the dead instruction list -- they are
- // already on the worklist and will be deleted separately.)
- *OI = nullptr;
- if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U))
- DeadInsts.push_back(U);
- }
-
- I->eraseFromParent();
- }
-}
-
-/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
-/// performing scalar replacement of alloca AI. The results are flagged in
-/// the Info parameter. Offset indicates the position within AI that is
-/// referenced by this instruction.
-void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
- AllocaInfo &Info) {
- const DataLayout &DL = I->getModule()->getDataLayout();
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
-
- if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
- isSafeForScalarRepl(BC, Offset, Info);
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
- uint64_t GEPOffset = Offset;
- isSafeGEP(GEPI, GEPOffset, Info);
- if (!Info.isUnsafe)
- isSafeForScalarRepl(GEPI, GEPOffset, Info);
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
- ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
- if (!Length || Length->isNegative())
- return MarkUnsafe(Info, User);
-
- isSafeMemAccess(Offset, Length->getZExtValue(), nullptr,
- U.getOperandNo() == 0, Info, MI,
- true /*AllowWholeAccess*/);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- if (!LI->isSimple())
- return MarkUnsafe(Info, User);
- Type *LIType = LI->getType();
- isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
- LI, true /*AllowWholeAccess*/);
- Info.hasALoadOrStore = true;
-
- } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- // Store is ok if storing INTO the pointer, not storing the pointer
- if (!SI->isSimple() || SI->getOperand(0) == I)
- return MarkUnsafe(Info, User);
-
- Type *SIType = SI->getOperand(0)->getType();
- isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
- SI, true /*AllowWholeAccess*/);
- Info.hasALoadOrStore = true;
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
- if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
- II->getIntrinsicID() != Intrinsic::lifetime_end)
- return MarkUnsafe(Info, User);
- } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
- isSafePHISelectUseForScalarRepl(User, Offset, Info);
- } else {
- return MarkUnsafe(Info, User);
- }
- if (Info.isUnsafe) return;
- }
-}
-
-
-/// isSafePHISelectUseForScalarRepl - If we see a PHI node or select using a
-/// pointer derived from the alloca, we can often still split the alloca into
-/// elements.
-/// This is useful if we have a large alloca where one element is phi'd
-/// together somewhere: we can SRoA and promote all the other elements even if
-/// we end up not being able to promote this one.
-///
-/// All we require is that the uses of the PHI do not index into other parts of
-/// the alloca. The most important use case for this is single loads and stores
-/// that are PHI'd together, which can happen due to code sinking.
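-///
-/// A typical pattern this accepts (illustrative IR):
-///   %p = phi i32* [ %elt, %then ], [ %elt, %else ]
-///   %v = load i32* %p
-/// where %elt is a pointer to a single element of the alloca (e.g. formed by
-/// a bitcast or an all-zero GEP); the load stays within that element, so the
-/// remaining elements can still be replaced.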
-void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
- AllocaInfo &Info) {
- // If we've already checked this PHI, don't do it again.
- if (PHINode *PN = dyn_cast<PHINode>(I))
- if (!Info.CheckedPHIs.insert(PN).second)
- return;
-
- const DataLayout &DL = I->getModule()->getDataLayout();
- for (User *U : I->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) {
- isSafePHISelectUseForScalarRepl(BC, Offset, Info);
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
- // Only allow "bitcast" GEPs for simplicity. We could generalize this,
- // but would have to prove that we're staying inside of an element being
- // promoted.
- if (!GEPI->hasAllZeroIndices())
- return MarkUnsafe(Info, UI);
- isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
- if (!LI->isSimple())
- return MarkUnsafe(Info, UI);
- Type *LIType = LI->getType();
- isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
- LI, false /*AllowWholeAccess*/);
- Info.hasALoadOrStore = true;
-
- } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Store is ok if storing INTO the pointer, not storing the pointer
- if (!SI->isSimple() || SI->getOperand(0) == I)
- return MarkUnsafe(Info, UI);
-
- Type *SIType = SI->getOperand(0)->getType();
- isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
- SI, false /*AllowWholeAccess*/);
- Info.hasALoadOrStore = true;
- } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) {
- isSafePHISelectUseForScalarRepl(UI, Offset, Info);
- } else {
- return MarkUnsafe(Info, UI);
- }
- if (Info.isUnsafe) return;
- }
-}
-
-/// isSafeGEP - Check if a GEP instruction can be handled for scalar
-/// replacement. It is safe when all the indices are constant, in-bounds
-/// references, and when the resulting offset corresponds to an element within
-/// the alloca type. The results are flagged in the Info parameter. Upon
-/// return, Offset is adjusted as specified by the GEP indices.
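-///
-/// For example (illustrative), given an alloca of { i32, [2 x float] }, a GEP
-/// with constant indices 0, 1, 1 adds 8 bytes to Offset (4 for the array
-/// field plus 4 for element 1), which corresponds to the second float
-/// element, so the GEP is considered safe.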
-void SROA::isSafeGEP(GetElementPtrInst *GEPI,
- uint64_t &Offset, AllocaInfo &Info) {
- gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
- if (GEPIt == E)
- return;
- bool NonConstant = false;
- unsigned NonConstantIdxSize = 0;
-
- // Walk through the GEP type indices, checking the types that this indexes
- // into.
- for (; GEPIt != E; ++GEPIt) {
- // Ignore struct elements, no extra checking needed for these.
- if ((*GEPIt)->isStructTy())
- continue;
-
- ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
- if (!IdxVal)
- return MarkUnsafe(Info, GEPI);
- }
-
- // Compute the offset due to this GEP and check if the alloca has a
- // component element at that offset.
- SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
- // If this GEP is non-constant then the last operand must have been a
- // dynamic index into a vector. Pop this now as it has no impact on the
- // constant part of the offset.
- if (NonConstant)
- Indices.pop_back();
-
- const DataLayout &DL = GEPI->getModule()->getDataLayout();
- Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
- if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize,
- DL))
- MarkUnsafe(Info, GEPI);
-}
-
-/// isHomogeneousAggregate - Check if type T is a struct or array containing
-/// elements of the same type (which is always true for arrays). If so,
-/// return true with NumElts and EltTy set to the number of elements and the
-/// element type, respectively.
-static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
- Type *&EltTy) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
- NumElts = AT->getNumElements();
- EltTy = (NumElts == 0 ? nullptr : AT->getElementType());
- return true;
- }
- if (StructType *ST = dyn_cast<StructType>(T)) {
- NumElts = ST->getNumContainedTypes();
- EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0));
- for (unsigned n = 1; n < NumElts; ++n) {
- if (ST->getContainedType(n) != EltTy)
- return false;
- }
- return true;
- }
- return false;
-}
-
-/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
-/// "homogeneous" aggregates with the same element type and number of elements.
-static bool isCompatibleAggregate(Type *T1, Type *T2) {
- if (T1 == T2)
- return true;
-
- unsigned NumElts1, NumElts2;
- Type *EltTy1, *EltTy2;
- if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
- isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
- NumElts1 == NumElts2 &&
- EltTy1 == EltTy2)
- return true;
-
- return false;
-}
-
-/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI
-/// alloca or has an offset and size that corresponds to a component element
-/// within it. The offset checked here may have been formed from a GEP with a
-/// pointer bitcasted to a different type.
-///
-/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
-/// unit. If false, it only allows accesses known to be in a single element.
-void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
- Type *MemOpType, bool isStore,
- AllocaInfo &Info, Instruction *TheAccess,
- bool AllowWholeAccess) {
- const DataLayout &DL = TheAccess->getModule()->getDataLayout();
- // Check if this is a load/store of the entire alloca.
- if (Offset == 0 && AllowWholeAccess &&
- MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) {
- // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
- // loads/stores (which are essentially the same as the MemIntrinsics with
- // regard to copying padding between elements). But, if an alloca is
- // flagged as both a source and destination of such operations, we'll need
- // to check later for padding between elements.
- if (!MemOpType || MemOpType->isIntegerTy()) {
- if (isStore)
- Info.isMemCpyDst = true;
- else
- Info.isMemCpySrc = true;
- return;
- }
- // This is also safe for references using a type that is compatible with
- // the type of the alloca, so that loads/stores can be rewritten using
- // insertvalue/extractvalue.
- if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
- Info.hasSubelementAccess = true;
- return;
- }
- }
- // Check if the offset/size correspond to a component within the alloca type.
- Type *T = Info.AI->getAllocatedType();
- if (TypeHasComponent(T, Offset, MemSize, DL)) {
- Info.hasSubelementAccess = true;
- return;
- }
-
- return MarkUnsafe(Info, TheAccess);
-}
-
-/// TypeHasComponent - Return true if T has a component type with the
-/// specified offset and size. If Size is zero, do not check the size.
-bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
- const DataLayout &DL) {
- Type *EltTy;
- uint64_t EltSize;
- if (StructType *ST = dyn_cast<StructType>(T)) {
- const StructLayout *Layout = DL.getStructLayout(ST);
- unsigned EltIdx = Layout->getElementContainingOffset(Offset);
- EltTy = ST->getContainedType(EltIdx);
- EltSize = DL.getTypeAllocSize(EltTy);
- Offset -= Layout->getElementOffset(EltIdx);
- } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
- EltTy = AT->getElementType();
- EltSize = DL.getTypeAllocSize(EltTy);
- if (Offset >= AT->getNumElements() * EltSize)
- return false;
- Offset %= EltSize;
- } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
- EltTy = VT->getElementType();
- EltSize = DL.getTypeAllocSize(EltTy);
- if (Offset >= VT->getNumElements() * EltSize)
- return false;
- Offset %= EltSize;
- } else {
- return false;
- }
- if (Offset == 0 && (Size == 0 || EltSize == Size))
- return true;
- // Check if the component spans multiple elements.
- if (Offset + Size > EltSize)
- return false;
- return TypeHasComponent(EltTy, Offset, Size, DL);
-}
-
-/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
-/// the instruction I, which references it, to use the separate elements.
-/// Offset indicates the position within AI that is referenced by this
-/// instruction.
-void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- const DataLayout &DL = I->getModule()->getDataLayout();
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
- Use &TheUse = *UI++;
- Instruction *User = cast<Instruction>(TheUse.getUser());
-
- if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
- RewriteBitCast(BC, AI, Offset, NewElts);
- continue;
- }
-
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
- RewriteGEP(GEPI, AI, Offset, NewElts);
- continue;
- }
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
- ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
- uint64_t MemSize = Length->getZExtValue();
- if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType()))
- RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
- // Otherwise the intrinsic can only touch a single element and the
- // address operand will be updated, so nothing else needs to be done.
- continue;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
- }
- continue;
- }
-
- if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- Type *LIType = LI->getType();
-
- if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
- // Replace:
- // %res = load { i32, i32 }* %alloc
- // with:
- // %load.0 = load i32* %alloc.0
- // %insert.0 = insertvalue { i32, i32 } undef, i32 %load.0, 0
- // %load.1 = load i32* %alloc.1
- // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
- // (Also works for arrays instead of structs)
- Value *Insert = UndefValue::get(LIType);
- IRBuilder<> Builder(LI);
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- Value *Load = Builder.CreateLoad(NewElts[i], "load");
- Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
- }
- LI->replaceAllUsesWith(Insert);
- DeadInsts.push_back(LI);
- } else if (LIType->isIntegerTy() &&
- DL.getTypeAllocSize(LIType) ==
- DL.getTypeAllocSize(AI->getAllocatedType())) {
- // If this is a load of the entire alloca to an integer, rewrite it.
- RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
- }
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- Value *Val = SI->getOperand(0);
- Type *SIType = Val->getType();
- if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
- // Replace:
- // store { i32, i32 } %val, { i32, i32 }* %alloc
- // with:
- // %val.0 = extractvalue { i32, i32 } %val, 0
- // store i32 %val.0, i32* %alloc.0
- // %val.1 = extractvalue { i32, i32 } %val, 1
- // store i32 %val.1, i32* %alloc.1
- // (Also works for arrays instead of structs)
- IRBuilder<> Builder(SI);
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
- Builder.CreateStore(Extract, NewElts[i]);
- }
- DeadInsts.push_back(SI);
- } else if (SIType->isIntegerTy() &&
- DL.getTypeAllocSize(SIType) ==
- DL.getTypeAllocSize(AI->getAllocatedType())) {
- // If this is a store of the entire alloca from an integer, rewrite it.
- RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
- }
- continue;
- }
-
- if (isa<SelectInst>(User) || isa<PHINode>(User)) {
- // If we have a PHI user of the alloca itself (as opposed to a GEP or
- // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to
- // the new pointer.
- if (!isa<AllocaInst>(I)) continue;
-
- assert(Offset == 0 && NewElts[0] &&
- "Direct alloca use should have a zero offset");
-
- // If we have a use of the alloca, we know the derived uses will be
- // utilizing just the first element of the scalarized result. Insert a
- // bitcast of the first alloca before the user as required.
- AllocaInst *NewAI = NewElts[0];
- BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
- NewAI->moveBefore(BCI);
- TheUse = BCI;
- continue;
- }
- }
-}
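For illustration only, not part of this patch: a minimal sketch of the rewrite described in the IR comments above, using hypothetical helper names. It mirrors the deleted loops that stitch a whole-aggregate load back together with insertvalue and split a whole-aggregate store with extractvalue.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Load each scalarized element and rebuild the aggregate with insertvalue,
  // as the deleted RewriteForScalarRepl did for compatible aggregate loads.
  static Value *loadScalarizedAggregate(Type *AggTy, ArrayRef<AllocaInst *> Elts,
                                        Instruction *InsertBefore) {
    IRBuilder<> Builder(InsertBefore);
    Value *Agg = UndefValue::get(AggTy);
    for (unsigned i = 0, e = Elts.size(); i != e; ++i) {
      Value *EltVal = Builder.CreateLoad(Elts[i], "load");
      Agg = Builder.CreateInsertValue(Agg, EltVal, i, "insert");
    }
    return Agg;
  }

  // The store direction is symmetric: peel each element off with extractvalue
  // and store it into the matching element alloca.
  static void storeScalarizedAggregate(Value *Val, ArrayRef<AllocaInst *> Elts,
                                       Instruction *InsertBefore) {
    IRBuilder<> Builder(InsertBefore);
    for (unsigned i = 0, e = Elts.size(); i != e; ++i)
      Builder.CreateStore(Builder.CreateExtractValue(Val, i), Elts[i]);
  }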
-
-/// RewriteBitCast - Update a bitcast reference to the alloca being replaced
-/// and recursively continue updating all of its uses.
-void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- RewriteForScalarRepl(BC, AI, Offset, NewElts);
- if (BC->getOperand(0) != AI)
- return;
-
- // The bitcast references the original alloca. Replace its uses with
- // references to the alloca containing offset zero (which is normally at
- // index zero, but might not be in cases involving structs with elements
- // of size zero).
- Type *T = AI->getAllocatedType();
- uint64_t EltOffset = 0;
- Type *IdxTy;
- uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy,
- BC->getModule()->getDataLayout());
- Instruction *Val = NewElts[Idx];
- if (Val->getType() != BC->getDestTy()) {
- Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
- Val->takeName(BC);
- }
- BC->replaceAllUsesWith(Val);
- DeadInsts.push_back(BC);
-}
-
-/// FindElementAndOffset - Return the index of the element containing Offset
-/// within the specified type, which must be either a struct or an array.
-/// Sets T to the type of the element and Offset to the offset within that
-/// element. IdxTy is set to the type of the index result to be used in a
-/// GEP instruction.
-uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
- const DataLayout &DL) {
- uint64_t Idx = 0;
-
- if (StructType *ST = dyn_cast<StructType>(T)) {
- const StructLayout *Layout = DL.getStructLayout(ST);
- Idx = Layout->getElementContainingOffset(Offset);
- T = ST->getContainedType(Idx);
- Offset -= Layout->getElementOffset(Idx);
- IdxTy = Type::getInt32Ty(T->getContext());
- return Idx;
- } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
- T = AT->getElementType();
- uint64_t EltSize = DL.getTypeAllocSize(T);
- Idx = Offset / EltSize;
- Offset -= Idx * EltSize;
- IdxTy = Type::getInt64Ty(T->getContext());
- return Idx;
- }
- VectorType *VT = cast<VectorType>(T);
- T = VT->getElementType();
- uint64_t EltSize = DL.getTypeAllocSize(T);
- Idx = Offset / EltSize;
- Offset -= Idx * EltSize;
- IdxTy = Type::getInt64Ty(T->getContext());
- return Idx;
-}
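Not part of the patch: a tiny standalone example of the index/offset split that the array and vector cases above perform; the concrete sizes are made up.

  #include <cstdint>
  #include <cstdio>

  int main() {
    // A 20-byte offset into an array of 8-byte elements lands in element 2,
    // at byte 4 within that element -- the same split FindElementAndOffset
    // does with DataLayout::getTypeAllocSize.
    uint64_t Offset = 20, EltSize = 8;
    uint64_t Idx = Offset / EltSize; // 2
    Offset -= Idx * EltSize;         // 4
    std::printf("Idx=%llu, Offset=%llu\n",
                (unsigned long long)Idx, (unsigned long long)Offset);
    return 0;
  }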
-
-/// RewriteGEP - Check if this GEP instruction moves the pointer across
-/// elements of the alloca that are being split apart, and if so, rewrite
-/// the GEP to be relative to the new element.
-void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- uint64_t OldOffset = Offset;
- const DataLayout &DL = GEPI->getModule()->getDataLayout();
- SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
- // If the GEP was dynamic then it must have been a dynamic vector lookup.
- // In this case, it must be the last GEP operand which is dynamic so keep that
- // aside until we've found the constant GEP offset, then add it back in at the
- // end.
- Value* NonConstantIdx = nullptr;
- if (!GEPI->hasAllConstantIndices())
- NonConstantIdx = Indices.pop_back_val();
- Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
-
- RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
-
- Type *T = AI->getAllocatedType();
- Type *IdxTy;
- uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL);
- if (GEPI->getOperand(0) == AI)
- OldIdx = ~0ULL; // Force the GEP to be rewritten.
-
- T = AI->getAllocatedType();
- uint64_t EltOffset = Offset;
- uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
-
- // If this GEP does not move the pointer across elements of the alloca
- // being split, then it does not need to be rewritten.
- if (Idx == OldIdx)
- return;
-
- Type *i32Ty = Type::getInt32Ty(AI->getContext());
- SmallVector<Value*, 8> NewArgs;
- NewArgs.push_back(Constant::getNullValue(i32Ty));
- while (EltOffset != 0) {
- uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
- NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
- }
- if (NonConstantIdx) {
- Type* GepTy = T;
- // This GEP has a dynamic index. We need to add "i32 0" to index through
- // any structs or arrays in the original type until we get to the vector
- // to index.
- while (!isa<VectorType>(GepTy)) {
- NewArgs.push_back(Constant::getNullValue(i32Ty));
- GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
- }
- NewArgs.push_back(NonConstantIdx);
- }
- Instruction *Val = NewElts[Idx];
- if (NewArgs.size() > 1) {
- Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
- Val->takeName(GEPI);
- }
- if (Val->getType() != GEPI->getType())
- Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
- GEPI->replaceAllUsesWith(Val);
- DeadInsts.push_back(GEPI);
-}
-
-/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
-/// to mark the lifetime of the scalarized memory.
-void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
- uint64_t Offset,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
- // Put matching lifetime markers on everything from Offset up to
- // Offset+OldSize.
- Type *AIType = AI->getAllocatedType();
- const DataLayout &DL = II->getModule()->getDataLayout();
- uint64_t NewOffset = Offset;
- Type *IdxTy;
- uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL);
-
- IRBuilder<> Builder(II);
- uint64_t Size = OldSize->getLimitedValue();
-
- if (NewOffset) {
- // Splice the first element and index 'NewOffset' bytes in. SROA will
- // split the alloca again later.
- unsigned AS = AI->getType()->getAddressSpace();
- Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS));
- V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset));
-
- IdxTy = NewElts[Idx]->getAllocatedType();
- uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset;
- if (EltSize > Size) {
- EltSize = Size;
- Size = 0;
- } else {
- Size -= EltSize;
- }
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize));
- else
- Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize));
- ++Idx;
- }
-
- for (; Idx != NewElts.size() && Size; ++Idx) {
- IdxTy = NewElts[Idx]->getAllocatedType();
- uint64_t EltSize = DL.getTypeAllocSize(IdxTy);
- if (EltSize > Size) {
- EltSize = Size;
- Size = 0;
- } else {
- Size -= EltSize;
- }
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- Builder.CreateLifetimeStart(NewElts[Idx],
- Builder.getInt64(EltSize));
- else
- Builder.CreateLifetimeEnd(NewElts[Idx],
- Builder.getInt64(EltSize));
- }
- DeadInsts.push_back(II);
-}
-
-/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
-/// Rewrite it to copy or set the elements of the scalarized memory.
-void
-SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
- AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- // If this is a memcpy/memmove, construct the other pointer as the
- // appropriate type. The "Other" pointer is the pointer that goes to memory
- // that doesn't have anything to do with the alloca that we are promoting. For
- // memset, this Value* stays null.
- Value *OtherPtr = nullptr;
- unsigned MemAlignment = MI->getAlignment();
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcpy
- if (Inst == MTI->getRawDest())
- OtherPtr = MTI->getRawSource();
- else {
- assert(Inst == MTI->getRawSource());
- OtherPtr = MTI->getRawDest();
- }
- }
-
- // If there is an other pointer, we want to convert it to the same pointer
- // type as AI has, so we can GEP through it safely.
- if (OtherPtr) {
- unsigned AddrSpace =
- cast<PointerType>(OtherPtr->getType())->getAddressSpace();
-
- // Remove bitcasts and all-zero GEPs from OtherPtr. This is an
- // optimization, but it's also required to detect the corner case where
- // both pointer operands are referencing the same memory, and where
- // OtherPtr may be a bitcast or GEP that is currently being rewritten. (This
- // function is only called for mem intrinsics that access the whole
- // aggregate, so non-zero GEPs are not an issue here.)
- OtherPtr = OtherPtr->stripPointerCasts();
-
- // Copying the alloca to itself is a no-op: just delete it.
- if (OtherPtr == AI || OtherPtr == NewElts[0]) {
- // This code will run twice for a no-op memcpy -- once for each operand.
- // Put only one reference to MI on the DeadInsts list.
- for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(),
- E = DeadInsts.end(); I != E; ++I)
- if (*I == MI) return;
- DeadInsts.push_back(MI);
- return;
- }
-
- // If the pointer is not the right type, insert a bitcast to the right
- // type.
- Type *NewTy =
- PointerType::get(AI->getType()->getElementType(), AddrSpace);
-
- if (OtherPtr->getType() != NewTy)
- OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
- }
-
- // Process each element of the aggregate.
- bool SROADest = MI->getRawDest() == Inst;
-
- Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
- const DataLayout &DL = MI->getModule()->getDataLayout();
-
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- // If this is a memcpy/memmove, emit a GEP of the other element address.
- Value *OtherElt = nullptr;
- unsigned OtherEltAlign = MemAlignment;
-
- if (OtherPtr) {
- Value *Idx[2] = { Zero,
- ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
- OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
- OtherPtr->getName()+"."+Twine(i),
- MI);
- uint64_t EltOffset;
- PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
- Type *OtherTy = OtherPtrTy->getElementType();
- if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
- EltOffset = DL.getStructLayout(ST)->getElementOffset(i);
- } else {
- Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
- EltOffset = DL.getTypeAllocSize(EltTy) * i;
- }
-
- // The alignment of the other pointer is the guaranteed alignment of the
- // element, which is affected by both the known alignment of the whole
- // mem intrinsic and the alignment of the element. If, for example, the
- // alignment of the memcpy is 32 but the element is at a 4-byte offset, then
- // the known alignment is just 4 bytes.
- OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
- }
-
- Value *EltPtr = NewElts[i];
- Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
-
- // If we got down to a scalar, insert a load or store as appropriate.
- if (EltTy->isSingleValueType()) {
- if (isa<MemTransferInst>(MI)) {
- if (SROADest) {
- // From Other to Alloca.
- Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
- new StoreInst(Elt, EltPtr, MI);
- } else {
- // From Alloca to Other.
- Value *Elt = new LoadInst(EltPtr, "tmp", MI);
- new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
- }
- continue;
- }
- assert(isa<MemSetInst>(MI));
-
- // If the stored element is zero (common case), just store a null
- // constant.
- Constant *StoreVal;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
- if (CI->isZero()) {
- StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0>
- } else {
- // If EltTy is a vector type, get the element type.
- Type *ValTy = EltTy->getScalarType();
-
- // Construct an integer with the right value.
- unsigned EltSize = DL.getTypeSizeInBits(ValTy);
- APInt OneVal(EltSize, CI->getZExtValue());
- APInt TotalVal(OneVal);
- // Set each byte.
- for (unsigned i = 0; 8*i < EltSize; ++i) {
- TotalVal = TotalVal.shl(8);
- TotalVal |= OneVal;
- }
-
- // Convert the integer value to the appropriate type.
- StoreVal = ConstantInt::get(CI->getContext(), TotalVal);
- if (ValTy->isPointerTy())
- StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
- else if (ValTy->isFloatingPointTy())
- StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
- assert(StoreVal->getType() == ValTy && "Type mismatch!");
-
- // If the requested value was a vector constant, create it.
- if (EltTy->isVectorTy()) {
- unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
- StoreVal = ConstantVector::getSplat(NumElts, StoreVal);
- }
- }
- new StoreInst(StoreVal, EltPtr, MI);
- continue;
- }
- // Otherwise, if we're storing a byte variable, use a memset call for
- // this element.
- }
-
- unsigned EltSize = DL.getTypeAllocSize(EltTy);
- if (!EltSize)
- continue;
-
- IRBuilder<> Builder(MI);
-
- // Finally, insert the meminst for this element.
- if (isa<MemSetInst>(MI)) {
- Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize,
- MI->isVolatile());
- } else {
- assert(isa<MemTransferInst>(MI));
- Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr
- Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr
-
- if (isa<MemCpyInst>(MI))
- Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile());
- else
- Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile());
- }
- }
- DeadInsts.push_back(MI);
-}
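Not part of the patch: the clamp in "OtherEltAlign = MinAlign(OtherEltAlign, EltOffset)" above computes the largest power of two that divides both values. A standalone stand-in with the same definition as llvm::MinAlign, plus the worked example from the comment:

  #include <cstdint>
  #include <cstdio>

  // Same definition as llvm::MinAlign in MathExtras.h: the lowest set bit of
  // (A | B), i.e. the largest power of two dividing both A and B.
  static uint64_t MinAlign(uint64_t A, uint64_t B) {
    return (A | B) & (1 + ~(A | B));
  }

  int main() {
    // A 32-byte-aligned memcpy touching an element at offset 4 can only
    // guarantee 4-byte alignment for that element.
    std::printf("%llu\n", (unsigned long long)MinAlign(32, 4)); // prints 4
    return 0;
  }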
-
-/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
-/// overwrites the entire allocation. Extract out the pieces of the stored
-/// integer and store them individually.
-void
-SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- // Extract each element out of the integer according to its structure offset
- // and store the element value to the individual alloca.
- Value *SrcVal = SI->getOperand(0);
- Type *AllocaEltTy = AI->getAllocatedType();
- const DataLayout &DL = SI->getModule()->getDataLayout();
- uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
-
- IRBuilder<> Builder(SI);
-
- // Handle tail padding by extending the operand
- if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
- SrcVal = Builder.CreateZExt(SrcVal,
- IntegerType::get(SI->getContext(), AllocaSizeBits));
-
- DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI
- << '\n');
-
- // There are two forms here: AI could be an array or struct. Both cases
- // have different ways to compute the element offset.
- if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
- const StructLayout *Layout = DL.getStructLayout(EltSTy);
-
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- // Get the number of bits to shift SrcVal to get the value.
- Type *FieldTy = EltSTy->getElementType(i);
- uint64_t Shift = Layout->getElementOffsetInBits(i);
-
- if (DL.isBigEndian())
- Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy);
-
- Value *EltVal = SrcVal;
- if (Shift) {
- Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
- EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
- }
-
- // Truncate down to an integer of the right size.
- uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
-
- // Ignore zero sized fields like {}, they obviously contain no data.
- if (FieldSizeBits == 0) continue;
-
- if (FieldSizeBits != AllocaSizeBits)
- EltVal = Builder.CreateTrunc(EltVal,
- IntegerType::get(SI->getContext(), FieldSizeBits));
- Value *DestField = NewElts[i];
- if (EltVal->getType() == FieldTy) {
- // Storing to an integer field of this size, just do it.
- } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
- // Bitcast to the right element type (for fp/vector values).
- EltVal = Builder.CreateBitCast(EltVal, FieldTy);
- } else {
- // Otherwise, bitcast the dest pointer (for aggregates).
- DestField = Builder.CreateBitCast(DestField,
- PointerType::getUnqual(EltVal->getType()));
- }
- new StoreInst(EltVal, DestField, SI);
- }
-
- } else {
- ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
- Type *ArrayEltTy = ATy->getElementType();
- uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
- uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy);
-
- uint64_t Shift;
-
- if (DL.isBigEndian())
- Shift = AllocaSizeBits-ElementOffset;
- else
- Shift = 0;
-
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- // Ignore zero sized fields like {}, they obviously contain no data.
- if (ElementSizeBits == 0) continue;
-
- Value *EltVal = SrcVal;
- if (Shift) {
- Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
- EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
- }
-
- // Truncate down to an integer of the right size.
- if (ElementSizeBits != AllocaSizeBits)
- EltVal = Builder.CreateTrunc(EltVal,
- IntegerType::get(SI->getContext(),
- ElementSizeBits));
- Value *DestField = NewElts[i];
- if (EltVal->getType() == ArrayEltTy) {
- // Storing to an integer field of this size, just do it.
- } else if (ArrayEltTy->isFloatingPointTy() ||
- ArrayEltTy->isVectorTy()) {
- // Bitcast to the right element type (for fp/vector values).
- EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
- } else {
- // Otherwise, bitcast the dest pointer (for aggregates).
- DestField = Builder.CreateBitCast(DestField,
- PointerType::getUnqual(EltVal->getType()));
- }
- new StoreInst(EltVal, DestField, SI);
-
- if (DL.isBigEndian())
- Shift -= ElementOffset;
- else
- Shift += ElementOffset;
- }
- }
-
- DeadInsts.push_back(SI);
-}
-
-/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
-/// an integer. Load the individual pieces to form the aggregate value.
-void
-SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
- SmallVectorImpl<AllocaInst *> &NewElts) {
- // Extract each element out of the NewElts according to its structure offset
- // and form the result value.
- Type *AllocaEltTy = AI->getAllocatedType();
- const DataLayout &DL = LI->getModule()->getDataLayout();
- uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
-
- DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
- << '\n');
-
- // There are two forms here: AI could be an array or struct. Both cases
- // have different ways to compute the element offset.
- const StructLayout *Layout = nullptr;
- uint64_t ArrayEltBitOffset = 0;
- if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
- Layout = DL.getStructLayout(EltSTy);
- } else {
- Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
- ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
- }
-
- Value *ResultVal =
- Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits));
-
- for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
- // Load the value from the alloca. If the NewElt is an aggregate, cast
- // the pointer to an integer of the same size before doing the load.
- Value *SrcField = NewElts[i];
- Type *FieldTy =
- cast<PointerType>(SrcField->getType())->getElementType();
- uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
-
- // Ignore zero sized fields like {}, they obviously contain no data.
- if (FieldSizeBits == 0) continue;
-
- IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
- FieldSizeBits);
- if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
- !FieldTy->isVectorTy())
- SrcField = new BitCastInst(SrcField,
- PointerType::getUnqual(FieldIntTy),
- "", LI);
- SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
-
- // If SrcField is a fp or vector of the right size but that isn't an
- // integer type, bitcast to an integer so we can shift it.
- if (SrcField->getType() != FieldIntTy)
- SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
-
- // Zero extend the field to be the same size as the final alloca so that
- // we can shift and insert it.
- if (SrcField->getType() != ResultVal->getType())
- SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
-
- // Determine the number of bits to shift SrcField.
- uint64_t Shift;
- if (Layout) // Struct case.
- Shift = Layout->getElementOffsetInBits(i);
- else // Array case.
- Shift = i*ArrayEltBitOffset;
-
- if (DL.isBigEndian())
- Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
-
- if (Shift) {
- Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
- SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
- }
-
- // Don't create an 'or x, 0' on the first iteration.
- if (!isa<Constant>(ResultVal) ||
- !cast<Constant>(ResultVal)->isNullValue())
- ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
- else
- ResultVal = SrcField;
- }
-
- // Handle tail padding by truncating the result
- if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
- ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
-
- LI->replaceAllUsesWith(ResultVal);
- DeadInsts.push_back(LI);
-}
-
-/// HasPadding - Return true if the specified type has any structure or
-/// alignment padding in between the elements that would be split apart
-/// by SROA; return false otherwise.
-static bool HasPadding(Type *Ty, const DataLayout &DL) {
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- Ty = ATy->getElementType();
- return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty);
- }
-
- // SROA currently handles only Arrays and Structs.
- StructType *STy = cast<StructType>(Ty);
- const StructLayout *SL = DL.getStructLayout(STy);
- unsigned PrevFieldBitOffset = 0;
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
-
- // Check to see if there is any padding between this element and the
- // previous one.
- if (i) {
- unsigned PrevFieldEnd =
- PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1));
- if (PrevFieldEnd < FieldBitOffset)
- return true;
- }
- PrevFieldBitOffset = FieldBitOffset;
- }
- // Check for tail padding.
- if (unsigned EltCount = STy->getNumElements()) {
- unsigned PrevFieldEnd = PrevFieldBitOffset +
- DL.getTypeSizeInBits(STy->getElementType(EltCount-1));
- if (PrevFieldEnd < SL->getSizeInBits())
- return true;
- }
- return false;
-}
-
- /// isSafeAllocaToScalarRepl - Check to see if the specified allocation of an
- /// aggregate can be broken down into elements. Return true if it is safe to
- /// do so, false otherwise.
-bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
- // Loop over the use list of the alloca. We can only transform it if all of
- // the users are safe to transform.
- AllocaInfo Info(AI);
-
- isSafeForScalarRepl(AI, 0, Info);
- if (Info.isUnsafe) {
- DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
- return false;
- }
-
- const DataLayout &DL = AI->getModule()->getDataLayout();
-
- // Okay, we know all the users are promotable. If the aggregate is a memcpy
- // source and destination, we have to be careful. In particular, the memcpy
- // could be moving around elements that live in structure padding of the LLVM
- // types, but may actually be used. In these cases, we refuse to promote the
- // struct.
- if (Info.isMemCpySrc && Info.isMemCpyDst &&
- HasPadding(AI->getAllocatedType(), DL))
- return false;
-
- // If the alloca never has an access to just *part* of it, but is accessed
- // via loads and stores, then we should use ConvertToScalarInfo to promote
- // the alloca instead of promoting each piece at a time and inserting fission
- // and fusion code.
- if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
- // If the struct/array just has one element, use basic SRoA.
- if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
- if (ST->getNumElements() > 1) return false;
- } else {
- if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
- return false;
- }
- }
-
- return true;
-}
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 054bacdc706b..aed4a4ad4d26 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -14,12 +14,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -253,6 +252,8 @@ bool Scalarizer::doInitialization(Module &M) {
}
bool Scalarizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
assert(Gathered.empty() && Scattered.empty());
for (BasicBlock &BB : F) {
for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
@@ -305,7 +306,11 @@ void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
ValueVector &SV = Scattered[Op];
if (!SV.empty()) {
for (unsigned I = 0, E = SV.size(); I != E; ++I) {
- Instruction *Old = cast<Instruction>(SV[I]);
+ Value *V = SV[I];
+ if (V == nullptr)
+ continue;
+
+ Instruction *Old = cast<Instruction>(V);
CV[I]->takeName(Old);
Old->replaceAllUsesWith(CV[I]);
Old->eraseFromParent();
@@ -334,13 +339,11 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
Op->getAllMetadataOtherThanDebugLoc(MDs);
for (unsigned I = 0, E = CV.size(); I != E; ++I) {
if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
- for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
- MI = MDs.begin(),
- ME = MDs.end();
- MI != ME; ++MI)
- if (canTransferMetadata(MI->first))
- New->setMetadata(MI->first, MI->second);
- New->setDebugLoc(Op->getDebugLoc());
+ for (const auto &MD : MDs)
+ if (canTransferMetadata(MD.first))
+ New->setMetadata(MD.first, MD.second);
+ if (Op->getDebugLoc() && !New->getDebugLoc())
+ New->setDebugLoc(Op->getDebugLoc());
}
}
}
@@ -646,10 +649,9 @@ bool Scalarizer::finish() {
// made to the Function.
if (Gathered.empty() && Scattered.empty())
return false;
- for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end();
- GMI != GME; ++GMI) {
- Instruction *Op = GMI->first;
- ValueVector &CV = *GMI->second;
+ for (const auto &GMI : Gathered) {
+ Instruction *Op = GMI.first;
+ ValueVector &CV = *GMI.second;
if (!Op->use_empty()) {
// The value is still needed, so recreate it using a series of
// InsertElements.
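The "if (skipFunction(F))" guard added to runOnFunction above is the standard prologue for legacy function passes in this patch; elsewhere in the same commit it replaces the older skipOptnoneFunction check. A hedged sketch of the shape, with a made-up pass name:

  #include "llvm/IR/Function.h"
  #include "llvm/Pass.h"
  using namespace llvm;

  namespace {
  // ExampleLegacyPass exists only to show the guard; every legacy function
  // pass touched by this patch starts runOnFunction the same way.
  struct ExampleLegacyPass : public FunctionPass {
    static char ID;
    ExampleLegacyPass() : FunctionPass(ID) {}
    bool runOnFunction(Function &F) override {
      if (skipFunction(F)) // replaces the old skipOptnoneFunction(F) check
        return false;
      return false; // a real pass would transform F here and report changes
    }
  };
  char ExampleLegacyPass::ID = 0;
  } // end anonymous namespace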
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 86a10d2a1612..d6ae186698c7 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -590,9 +590,9 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
distributeExtsAndCloneChain(UserChain.size() - 1);
// Remove all nullptrs (used to be s/zext) from UserChain.
unsigned NewSize = 0;
- for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
- if (*I != nullptr) {
- UserChain[NewSize] = *I;
+ for (User *I : UserChain) {
+ if (I != nullptr) {
+ UserChain[NewSize] = I;
NewSize++;
}
}
@@ -824,8 +824,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
// If we created a GEP with constant index, and the base is loop invariant,
// then we swap the first one with it, so LICM can move constant GEP out
// later.
- GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult);
- GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
+ GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
+ GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
swapGEPOperand(FirstGEP, SecondGEP);
@@ -911,7 +911,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*GEP->getParent()->getParent());
unsigned AddrSpace = GEP->getPointerAddressSpace();
- if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+ if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
/*BaseGV=*/nullptr, AccumulativeByteOffset,
/*HasBaseReg=*/true, /*Scale=*/0,
AddrSpace)) {
@@ -1018,7 +1018,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
// used with unsigned integers later.
int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
- DL->getTypeAllocSize(GEP->getType()->getElementType()));
+ DL->getTypeAllocSize(GEP->getResultElementType()));
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
// Very likely. As long as %gep is naturally aligned, the byte offset we
@@ -1064,7 +1064,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
}
bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
if (DisableSeparateConstOffsetFromGEP)
@@ -1075,8 +1075,8 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
bool Changed = false;
- for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
- for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;)
+ for (BasicBlock &B : F) {
+ for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
Changed |= splitGEP(GEP);
// No need to split GEP ConstantExprs because all its indices are constant
@@ -1162,8 +1162,8 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
}
void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
- for (auto &B : F) {
- for (auto &I : B) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
if (isInstructionTriviallyDead(&I)) {
std::string ErrMessage;
raw_string_ostream RSO(ErrMessage);
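Why the lowerToSingleIndexGEPs hunk above switches casts: dyn_cast<> requires a non-null argument (it asserts on null), while dyn_cast_or_null<> accepts a null pointer and simply returns null, which is what is needed when FirstResult or ResultPtr may not have been created. A minimal sketch with a hypothetical helper name:

  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  // Returns the value as a GEP if it is one, and tolerates V == nullptr.
  static GetElementPtrInst *asGEPOrNull(Value *V) {
    return dyn_cast_or_null<GetElementPtrInst>(V);
  }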
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 63c8836bf381..2d0a21d2c518 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,12 +21,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
@@ -37,8 +37,10 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "simplifycfg"
@@ -131,12 +133,19 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
unsigned BonusInstThreshold) {
bool Changed = false;
bool LocalChange = true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+ SmallPtrSet<BasicBlock *, 16> LoopHeaders;
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ LoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
+
while (LocalChange) {
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
LocalChange = true;
++NumSimpl;
}
@@ -178,14 +187,15 @@ SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
: BonusInstThreshold(BonusInstThreshold) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
- AnalysisManager<Function> *AM) {
- auto &TTI = AM->getResult<TargetIRAnalysis>(F);
- auto &AC = AM->getResult<AssumptionAnalysis>(F);
+ AnalysisManager<Function> &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
}
namespace {
@@ -196,15 +206,12 @@ struct CFGSimplifyPass : public FunctionPass {
CFGSimplifyPass(int T = -1,
std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(Ftor) {
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
- if (PredicateFtor && !PredicateFtor(F))
- return false;
-
- if (skipOptnoneFunction(F))
+ if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
return false;
AssumptionCache *AC =
@@ -234,6 +241,5 @@ INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
FunctionPass *
llvm::createCFGSimplificationPass(int Threshold,
std::function<bool(const Function &)> Ftor) {
- return new CFGSimplifyPass(Threshold, Ftor);
+ return new CFGSimplifyPass(Threshold, std::move(Ftor));
}
-
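The SimplifyCFGPass::run hunk above also fixes inverted preservation logic: the old code reported "none preserved" when nothing changed and "all preserved" when it rewrote the CFG. Not from the patch, a sketch of the corrected new-pass-manager convention with a hypothetical pass:

  #include "llvm/Analysis/GlobalsModRef.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  using namespace llvm;

  // ExamplePass is hypothetical: report "all preserved" only when the IR was
  // left untouched, otherwise name exactly what survives the rewrite.
  struct ExamplePass : PassInfoMixin<ExamplePass> {
    PreservedAnalyses run(Function &F, AnalysisManager<Function> &AM) {
      bool Changed = false; // a real pass would do its work here
      if (!Changed)
        return PreservedAnalyses::all();
      PreservedAnalyses PA;
      PA.preserve<GlobalsAA>(); // CFG rewrites do not invalidate GlobalsAA
      return PA;
    }
  };

Preserving GlobalsAA explicitly keeps the module-level alias results alive across the function transform instead of forcing a recomputation.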
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 64109b2df117..d9a296c63122 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -24,6 +24,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "sink"
@@ -31,50 +32,10 @@ using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
STATISTIC(NumSinkIter, "Number of sinking iterations");
-namespace {
- class Sinking : public FunctionPass {
- DominatorTree *DT;
- LoopInfo *LI;
- AliasAnalysis *AA;
-
- public:
- static char ID; // Pass identification
- Sinking() : FunctionPass(ID) {
- initializeSinkingPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
- private:
- bool ProcessBlock(BasicBlock &BB);
- bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
- bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
- bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
- };
-} // end anonymous namespace
-
-char Sinking::ID = 0;
-INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
-
-FunctionPass *llvm::createSinkingPass() { return new Sinking(); }
-
/// AllUsesDominatedByBlock - Return true if all uses of the specified value
/// occur in blocks dominated by the specified block.
-bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
- BasicBlock *BB) const {
+static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
+ DominatorTree &DT) {
// Ignoring debug uses is necessary so debug info doesn't affect the code.
// This may leave a referencing dbg_value in the original block, before
// the definition of the vreg. Dwarf generator handles this although the
@@ -90,71 +51,13 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
UseBlock = PN->getIncomingBlock(Num);
}
// Check that it dominates.
- if (!DT->dominates(BB, UseBlock))
+ if (!DT.dominates(BB, UseBlock))
return false;
}
return true;
}
-bool Sinking::runOnFunction(Function &F) {
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- bool MadeChange, EverMadeChange = false;
-
- do {
- MadeChange = false;
- DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
- // Process all basic blocks.
- for (Function::iterator I = F.begin(), E = F.end();
- I != E; ++I)
- MadeChange |= ProcessBlock(*I);
- EverMadeChange |= MadeChange;
- NumSinkIter++;
- } while (MadeChange);
-
- return EverMadeChange;
-}
-
-bool Sinking::ProcessBlock(BasicBlock &BB) {
- // Can't sink anything out of a block that has less than two successors.
- if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
-
- // Don't bother sinking code out of unreachable blocks. In addition to being
- // unprofitable, it can also lead to infinite looping, because in an
- // unreachable loop there may be nowhere to stop.
- if (!DT->isReachableFromEntry(&BB)) return false;
-
- bool MadeChange = false;
-
- // Walk the basic block bottom-up. Remember if we saw a store.
- BasicBlock::iterator I = BB.end();
- --I;
- bool ProcessedBegin = false;
- SmallPtrSet<Instruction *, 8> Stores;
- do {
- Instruction *Inst = &*I; // The instruction to sink.
-
- // Predecrement I (if it's not begin) so that it isn't invalidated by
- // sinking.
- ProcessedBegin = I == BB.begin();
- if (!ProcessedBegin)
- --I;
-
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
-
- if (SinkInstruction(Inst, Stores))
- ++NumSunk, MadeChange = true;
-
- // If we just processed the first instruction in the block, we're done.
- } while (!ProcessedBegin);
-
- return MadeChange;
-}
-
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
SmallPtrSetImpl<Instruction *> &Stores) {
if (Inst->mayWriteToMemory()) {
@@ -165,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
MemoryLocation Loc = MemoryLocation::get(L);
for (Instruction *S : Stores)
- if (AA->getModRefInfo(S, Loc) & MRI_Mod)
+ if (AA.getModRefInfo(S, Loc) & MRI_Mod)
return false;
}
@@ -173,11 +76,15 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
Inst->mayThrow())
return false;
- // Convergent operations cannot be made control-dependent on additional
- // values.
if (auto CS = CallSite(Inst)) {
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
if (CS.hasFnAttr(Attribute::Convergent))
return false;
+
+ for (Instruction *S : Stores)
+ if (AA.getModRefInfo(S, CS) & MRI_Mod)
+ return false;
}
return true;
@@ -185,8 +92,8 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
/// IsAcceptableTarget - Return true if it is possible to sink the instruction
/// in the specified basic block.
-bool Sinking::IsAcceptableTarget(Instruction *Inst,
- BasicBlock *SuccToSinkTo) const {
+static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
+ DominatorTree &DT, LoopInfo &LI) {
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");
@@ -212,25 +119,26 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
// We don't want to sink across a critical edge if we don't dominate the
// successor. We could be introducing calculations to new code paths.
- if (!DT->dominates(Inst->getParent(), SuccToSinkTo))
+ if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
return false;
// Don't sink instructions into a loop.
- Loop *succ = LI->getLoopFor(SuccToSinkTo);
- Loop *cur = LI->getLoopFor(Inst->getParent());
+ Loop *succ = LI.getLoopFor(SuccToSinkTo);
+ Loop *cur = LI.getLoopFor(Inst->getParent());
if (succ != nullptr && succ != cur)
return false;
}
// Finally, check that all the uses of the instruction are actually
// dominated by the candidate
- return AllUsesDominatedByBlock(Inst, SuccToSinkTo);
+ return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
}
/// SinkInstruction - Determine whether it is safe to sink the specified
/// instruction out of its current block into a successor.
-bool Sinking::SinkInstruction(Instruction *Inst,
- SmallPtrSetImpl<Instruction *> &Stores) {
+static bool SinkInstruction(Instruction *Inst,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
// Don't sink static alloca instructions. CodeGen assumes allocas outside the
// entry block are dynamically sized stack objects.
@@ -257,12 +165,12 @@ bool Sinking::SinkInstruction(Instruction *Inst,
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
// Look at all the postdominators and see if we can sink it in one.
- DomTreeNode *DTN = DT->getNode(Inst->getParent());
+ DomTreeNode *DTN = DT.getNode(Inst->getParent());
for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
I != E && SuccToSinkTo == nullptr; ++I) {
BasicBlock *Candidate = (*I)->getBlock();
if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
- IsAcceptableTarget(Inst, Candidate))
+ IsAcceptableTarget(Inst, Candidate, DT, LI))
SuccToSinkTo = Candidate;
}
@@ -270,7 +178,7 @@ bool Sinking::SinkInstruction(Instruction *Inst,
// decide which one we should sink to, if any.
for (succ_iterator I = succ_begin(Inst->getParent()),
E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
- if (IsAcceptableTarget(Inst, *I))
+ if (IsAcceptableTarget(Inst, *I, DT, LI))
SuccToSinkTo = *I;
}
@@ -288,3 +196,111 @@ bool Sinking::SinkInstruction(Instruction *Inst,
Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
return true;
}
+
+static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
+ AAResults &AA) {
+ // Can't sink anything out of a block that has less than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT.isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
+ LoopInfo &LI, AAResults &AA) {
+ bool MadeChange, EverMadeChange = false;
+
+ do {
+ MadeChange = false;
+ DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ // Process all basic blocks.
+ for (BasicBlock &I : F)
+ MadeChange |= ProcessBlock(I, DT, LI, AA);
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
+ return EverMadeChange;
+}
+
+PreservedAnalyses SinkingPass::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ if (!iterativelySinkInstructions(F, DT, LI, AA))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+namespace {
+ class SinkingLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification
+ SinkingLegacyPass() : FunctionPass(ID) {
+ initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ return iterativelySinkInstructions(F, DT, LI, AA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ };
+} // end anonymous namespace
+
+char SinkingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
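Not from the patch: a minimal sketch showing that existing clients are unaffected by the split into SinkingPass (new pass manager) and SinkingLegacyPass, because createSinkingPass() keeps its signature. The driver function is hypothetical.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/Scalar.h"
  using namespace llvm;

  // Run the sinking transform on a module through the legacy pass manager.
  static void runSinking(Module &M) {
    legacy::PassManager PM;
    PM.add(createSinkingPass()); // still returns the (now legacy) wrapper pass
    PM.run(M);
  }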
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 147d615488ff..9bf2d6206819 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -50,9 +50,19 @@
// aggressive speculation while counting on later passes to either capitalize on
// that or clean it up.
//
+// If the pass was created by calling
+// createSpeculativeExecutionIfHasBranchDivergencePass or the
+// -spec-exec-only-if-divergent-target option is present, this pass only has an
+// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
+// on other targets, it is a nop.
+//
+// This lets you include this pass unconditionally in the IR pass pipeline, but
+// only enable it for relevant targets.
+//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
@@ -83,19 +93,39 @@ static cl::opt<unsigned> SpecExecMaxNotHoisted(
"number of instructions that would not be speculatively executed "
"exceeds this limit."));
+static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
+ "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
+ cl::desc("Speculative execution is applied only to targets with divergent "
+ "branches, even if the pass was configured to apply only to all "
+ "targets."));
+
namespace {
+
class SpeculativeExecution : public FunctionPass {
public:
- static char ID;
- SpeculativeExecution(): FunctionPass(ID) {}
+ static char ID;
+ explicit SpeculativeExecution(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID),
+ OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
+ const char *getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
private:
bool runOnBasicBlock(BasicBlock &B);
bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock);
+ // If true, this pass is a nop unless the target architecture has branch
+ // divergence.
+ const bool OnlyIfDivergentTarget;
const TargetTransformInfo *TTI = nullptr;
};
} // namespace
@@ -105,17 +135,23 @@ INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution",
"Speculatively execute instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution",
- "Speculatively execute instructions", false, false)
+ "Speculatively execute instructions", false, false)
void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
bool SpeculativeExecution::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+ DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
+ return false;
+ }
bool Changed = false;
for (auto& B : F) {
@@ -240,4 +276,8 @@ FunctionPass *createSpeculativeExecutionPass() {
return new SpeculativeExecution();
}
+FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+ return new SpeculativeExecution(/* OnlyIfDivergentTarget = */ true);
+}
+
} // namespace llvm
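The new gating in runOnFunction above boils down to a single predicate. The sketch below restates it in isolation; shouldSpeculate and its boolean parameters are illustrative names, with the TTI->hasBranchDivergence() query reduced to a flag.

#include <cassert>

// Mirrors the early-exit added above: the pass proceeds only when it was not
// restricted to divergent targets, or when the target really has divergent
// branches.
static bool shouldSpeculate(bool OnlyIfDivergentTarget, bool HasBranchDivergence) {
  return !OnlyIfDivergentTarget || HasBranchDivergence;
}

int main() {
  assert(shouldSpeculate(false, false)); // unrestricted pass: always runs
  assert(shouldSpeculate(true, true));   // restricted pass on a divergent (GPU-like) target
  assert(!shouldSpeculate(true, false)); // restricted pass on a CPU-like target: a no-op
}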
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 1faa65eb3417..292d0400a516 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -57,8 +57,6 @@
// SLSR.
#include <vector>
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -76,6 +74,8 @@ using namespace PatternMatch;
namespace {
+static const unsigned UnknownAddressSpace = ~0u;
+
class StraightLineStrengthReduce : public FunctionPass {
public:
// SLSR candidate. Such a candidate must be in one of the forms described in
@@ -234,51 +234,22 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
Basis.CandidateKind == C.CandidateKind);
}
-// TODO: use TTI->getGEPCost.
static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI,
- const DataLayout *DL) {
- GlobalVariable *BaseGV = nullptr;
- int64_t BaseOffset = 0;
- bool HasBaseReg = false;
- int64_t Scale = 0;
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
- BaseGV = GV;
- else
- HasBaseReg = true;
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
- int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
- if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
- BaseOffset += ConstIdx->getSExtValue() * ElementSize;
- } else {
- // Needs scale register.
- if (Scale != 0) {
- // No addressing mode takes two scale registers.
- return false;
- }
- Scale = ElementSize;
- }
- } else {
- StructType *STy = cast<StructType>(*GTI);
- uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
- BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
- }
- }
-
- unsigned AddrSpace = GEP->getPointerAddressSpace();
- return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
- BaseOffset, HasBaseReg, Scale, AddrSpace);
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
}
// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
TargetTransformInfo *TTI) {
- return TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
- Index->getSExtValue());
+ // Index->getSExtValue() may crash if Index is wider than 64-bit.
+ return Index->getBitWidth() <= 64 &&
+ TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+ Index->getSExtValue(), UnknownAddressSpace);
}
bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
@@ -287,7 +258,7 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
if (C.CandidateKind == Candidate::Add)
return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
if (C.CandidateKind == Candidate::GEP)
- return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI, DL);
+ return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
return false;
}
@@ -533,13 +504,23 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
IndexExprs, GEP->isInBounds());
Value *ArrayIdx = GEP->getOperand(I);
uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
- factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ if (ArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if ArrayIdx is wider than the pointer size, because
+ // ArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ }
// When ArrayIdx is the sext of a value, we try to factor that value as
// well. Handling this case is important because array indices are
// typically sign-extended to the pointer size.
Value *TruncatedArrayIdx = nullptr;
- if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))))
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
+ TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
+ // because TruncatedArrayIdx is implicitly truncated to the pointer size.
factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+ }
IndexExprs[I - 1] = OrigIndexExpr;
}
@@ -567,10 +548,10 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
APInt ElementSize(
IndexOffset.getBitWidth(),
DL->getTypeAllocSize(
- cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
APInt Q, R;
APInt::sdivrem(IndexOffset, ElementSize, Q, R);
- if (R.getSExtValue() == 0)
+ if (R == 0)
IndexOffset = Q;
else
BumpWithUglyGEP = true;
@@ -578,10 +559,10 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
// Compute Bump = C - Basis = (i' - i) * S.
// Common case 1: if (i' - i) is 1, Bump = S.
- if (IndexOffset.getSExtValue() == 1)
+ if (IndexOffset == 1)
return C.Stride;
// Common case 2: if (i' - i) is -1, Bump = -S.
- if (IndexOffset.getSExtValue() == -1)
+ if (IndexOffset.isAllOnesValue())
return Builder.CreateNeg(C.Stride);
// Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
@@ -685,7 +666,7 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
}
bool StraightLineStrengthReduce::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
+ if (skipFunction(F))
return false;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 662513c7d8ae..e9ac39beae5a 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
@@ -161,6 +162,9 @@ public:
/// consist of a network of PHI nodes where the true incoming values expresses
/// breaks and the false values expresses continue states.
class StructurizeCFG : public RegionPass {
+ bool SkipUniformRegions;
+ DivergenceAnalysis *DA;
+
Type *Boolean;
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
@@ -232,11 +236,18 @@ class StructurizeCFG : public RegionPass {
void rebuildSSA();
+ bool hasOnlyUniformBranches(const Region *R);
+
public:
static char ID;
StructurizeCFG() :
- RegionPass(ID) {
+ RegionPass(ID), SkipUniformRegions(false) {
+ initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+ }
+
+ StructurizeCFG(bool SkipUniformRegions) :
+ RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
@@ -250,6 +261,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (SkipUniformRegions)
+ AU.addRequired<DivergenceAnalysis>();
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
@@ -264,6 +277,7 @@ char StructurizeCFG::ID = 0;
INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
@@ -297,11 +311,7 @@ void StructurizeCFG::orderNodes() {
for (RegionNode *RN : TempOrder) {
BasicBlock *BB = RN->getEntry();
Loop *Loop = LI->getLoopFor(BB);
- if (!LoopBlocks.count(Loop)) {
- LoopBlocks[Loop] = 1;
- continue;
- }
- LoopBlocks[Loop]++;
+ ++LoopBlocks[Loop];
}
unsigned CurrentLoopDepth = 0;
@@ -319,11 +329,11 @@ void StructurizeCFG::orderNodes() {
// the outer loop.
RNVector::iterator LoopI = I;
- while(LoopBlocks[CurrentLoop]) {
+ while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
LoopI++;
BasicBlock *LoopBB = (*LoopI)->getEntry();
if (LI->getLoopFor(LoopBB) == CurrentLoop) {
- LoopBlocks[CurrentLoop]--;
+ --BlockCount;
Order.push_back(*LoopI);
}
}
@@ -367,14 +377,8 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
/// \brief Invert the given condition
Value *StructurizeCFG::invert(Value *Condition) {
// First: Check if it's a constant
- if (Condition == BoolTrue)
- return BoolFalse;
-
- if (Condition == BoolFalse)
- return BoolTrue;
-
- if (Condition == BoolUndef)
- return BoolUndef;
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
// Second: If the condition is already inverted, return the original value
if (match(Condition, m_Not(m_Value(Condition))))
@@ -491,21 +495,21 @@ void StructurizeCFG::collectInfos() {
// Reset the visited nodes
Visited.clear();
- for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
- OI != OE; ++OI) {
+ for (RegionNode *RN : reverse(Order)) {
- DEBUG(dbgs() << "Visiting: " <<
- ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") <<
- (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+ DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
// Analyze all the conditions leading to a node
- gatherPredicates(*OI);
+ gatherPredicates(RN);
// Remember that we've seen this node
- Visited.insert((*OI)->getEntry());
+ Visited.insert(RN->getEntry());
// Find the last back edges
- analyzeLoops(*OI);
+ analyzeLoops(RN);
}
}
@@ -584,20 +588,18 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
/// \brief Add the real PHI value as soon as everything is set up
void StructurizeCFG::setPhiValues() {
SSAUpdater Updater;
- for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
- AI != AE; ++AI) {
+ for (const auto &AddedPhi : AddedPhis) {
- BasicBlock *To = AI->first;
- BBVector &From = AI->second;
+ BasicBlock *To = AddedPhi.first;
+ const BBVector &From = AddedPhi.second;
if (!DeletedPhis.count(To))
continue;
PhiMap &Map = DeletedPhis[To];
- for (PhiMap::iterator PI = Map.begin(), PE = Map.end();
- PI != PE; ++PI) {
+ for (const auto &PI : Map) {
- PHINode *Phi = PI->first;
+ PHINode *Phi = PI.first;
Value *Undef = UndefValue::get(Phi->getType());
Updater.Initialize(Phi->getType(), "");
Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
@@ -605,22 +607,20 @@ void StructurizeCFG::setPhiValues() {
NearestCommonDominator Dominator(DT);
Dominator.addBlock(To, false);
- for (BBValueVector::iterator VI = PI->second.begin(),
- VE = PI->second.end(); VI != VE; ++VI) {
+ for (const auto &VI : PI.second) {
- Updater.AddAvailableValue(VI->first, VI->second);
- Dominator.addBlock(VI->first);
+ Updater.AddAvailableValue(VI.first, VI.second);
+ Dominator.addBlock(VI.first);
}
if (!Dominator.wasResultExplicitMentioned())
Updater.AddAvailableValue(Dominator.getResult(), Undef);
- for (BBVector::iterator FI = From.begin(), FE = From.end();
- FI != FE; ++FI) {
+ for (BasicBlock *FI : From) {
- int Idx = Phi->getBasicBlockIndex(*FI);
+ int Idx = Phi->getBasicBlockIndex(FI);
assert(Idx != -1);
- Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI));
+ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
}
}
@@ -914,11 +914,48 @@ void StructurizeCFG::rebuildSSA() {
}
}
+bool StructurizeCFG::hasOnlyUniformBranches(const Region *R) {
+ for (const BasicBlock *BB : R->blocks()) {
+ const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!DA->isUniform(Br->getCondition()))
+ return false;
+ DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ }
+ return true;
+}
+
/// \brief Run the transformation for each region found
bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
if (R->isTopLevelRegion())
return false;
+ if (SkipUniformRegions) {
+ DA = &getAnalysis<DivergenceAnalysis>();
+ // TODO: We could probably be smarter here with how we handle sub-regions.
+ if (hasOnlyUniformBranches(R)) {
+ DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+
+ // Mark all direct child block terminators as having been treated as
+ // uniform. To account for a possible future in which non-uniform
+ // sub-regions are treated more cleverly, indirect children are not
+ // marked as uniform.
+ MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
+ Region::element_iterator E = R->element_end();
+ for (Region::element_iterator I = R->element_begin(); I != E; ++I) {
+ if (I->isSubRegion())
+ continue;
+
+ if (Instruction *Term = I->getEntry()->getTerminator())
+ Term->setMetadata("structurizecfg.uniform", MD);
+ }
+
+ return false;
+ }
+ }
+
Func = R->getEntry()->getParent();
ParentRegion = R;
@@ -947,7 +984,6 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
return true;
}
-/// \brief Create the pass
-Pass *llvm::createStructurizeCFGPass() {
- return new StructurizeCFG();
+Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
+ return new StructurizeCFG(SkipUniformRegions);
}
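hasOnlyUniformBranches above is the heart of the new skip-uniform-regions mode. The sketch below reduces it to its control-flow shape, with each branch represented by two flags instead of a BranchInst plus a DivergenceAnalysis query; BranchSketch and hasOnlyUniformBranchesSketch are illustrative names.

#include <cassert>
#include <vector>

struct BranchSketch { bool Conditional; bool UniformCondition; };

// A region is skippable only when every *conditional* branch in it has a
// uniform (non-divergent) condition; unconditional branches do not matter.
static bool hasOnlyUniformBranchesSketch(const std::vector<BranchSketch> &Branches) {
  for (const BranchSketch &Br : Branches) {
    if (!Br.Conditional)
      continue;
    if (!Br.UniformCondition)
      return false;
  }
  return true;
}

int main() {
  assert(hasOnlyUniformBranchesSketch({{false, false}, {true, true}}));
  assert(!hasOnlyUniformBranchesSketch({{true, false}}));
}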
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 4e84d72ae7bd..d5ff99750370 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -50,6 +50,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -85,64 +86,9 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of return duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-namespace {
- struct TailCallElim : public FunctionPass {
- const TargetTransformInfo *TTI;
-
- static char ID; // Pass identification, replacement for typeid
- TailCallElim() : FunctionPass(ID) {
- initializeTailCallElimPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- bool runOnFunction(Function &F) override;
-
- private:
- bool runTRE(Function &F);
- bool markTails(Function &F, bool &AllCallsAreTailCalls);
-
- CallInst *FindTRECandidate(Instruction *I,
- bool CannotTailCallElimCallsMarkedTail);
- bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
- BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail);
- bool FoldReturnAndProcessPred(BasicBlock *BB,
- ReturnInst *Ret, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail);
- bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail);
- bool CanMoveAboveCall(Instruction *I, CallInst *CI);
- Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
- };
-}
-
-char TailCallElim::ID = 0;
-INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
- "Tail Call Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
- "Tail Call Elimination", false, false)
-
-// Public interface to the TailCallElimination pass
-FunctionPass *llvm::createTailCallEliminationPass() {
- return new TailCallElim();
-}
-
-void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
/// \brief Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
-static bool CanTRE(Function &F) {
+static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
for (auto &BB : F) {
for (auto &I : BB) {
@@ -156,20 +102,6 @@ static bool CanTRE(Function &F) {
return true;
}
-bool TailCallElim::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
- return false;
-
- bool AllCallsAreTailCalls = false;
- bool Modified = markTails(F, AllCallsAreTailCalls);
- if (AllCallsAreTailCalls)
- Modified |= runTRE(F);
- return Modified;
-}
-
namespace {
struct AllocaDerivedValueTracker {
// Start at a root value and walk its use-def chain to mark calls that use the
@@ -250,7 +182,7 @@ struct AllocaDerivedValueTracker {
};
}
-bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
+static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
if (F.callsFunctionThatReturnsTwice())
return false;
AllCallsAreTailCalls = true;
@@ -385,63 +317,11 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
return Modified;
}
-bool TailCallElim::runTRE(Function &F) {
- // If this function is a varargs function, we won't be able to PHI the args
- // right, so don't even try to convert it...
- if (F.getFunctionType()->isVarArg()) return false;
-
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- BasicBlock *OldEntry = nullptr;
- bool TailCallsAreMarkedTail = false;
- SmallVector<PHINode*, 8> ArgumentPHIs;
- bool MadeChange = false;
-
- // If false, we cannot perform TRE on tail calls marked with the 'tail'
- // attribute, because doing so would cause the stack size to increase (real
- // TRE would deallocate variable sized allocas, TRE doesn't).
- bool CanTRETailMarkedCall = CanTRE(F);
-
- // Change any tail recursive calls to loops.
- //
- // FIXME: The code generator produces really bad code when an 'escaping
- // alloca' is changed from being a static alloca to being a dynamic alloca.
- // Until this is resolved, disable this transformation if that would ever
- // happen. This bug is PR962.
- for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
- BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB.
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
- bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall);
- if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
- Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
- TailCallsAreMarkedTail, ArgumentPHIs,
- !CanTRETailMarkedCall);
- MadeChange |= Change;
- }
- }
-
- // If we eliminated any tail recursions, it's possible that we inserted some
- // silly PHI nodes which just merge an initial value (the incoming operand)
- // with themselves. Check to see if we did and clean up our mess if so. This
- // occurs when a function passes an argument straight through to its tail
- // call.
- for (PHINode *PN : ArgumentPHIs) {
- // If the PHI Node is a dynamic constant, replace it with the value it is.
- if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
- PN->replaceAllUsesWith(PNV);
- PN->eraseFromParent();
- }
- }
-
- return MadeChange;
-}
-
-
/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
-bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
// FIXME: We can move load/store/call/free instructions above the call if the
// call does not mod/ref the memory location being processed.
if (I->mayHaveSideEffects()) // This also handles volatile loads.
@@ -454,9 +334,10 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
// does not write to memory and the load provably won't trap.
// FIXME: Writes to memory only matter if they may alias the pointer
// being loaded from.
+ const DataLayout &DL = L->getModule()->getDataLayout();
if (CI->mayWriteToMemory() ||
- !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
- L->getAlignment()))
+ !isSafeToLoadUnconditionally(L->getPointerOperand(),
+ L->getAlignment(), DL, L))
return false;
}
}
@@ -512,8 +393,8 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
Function *F = CI->getParent()->getParent();
Value *ReturnedValue = nullptr;
- for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator());
+ for (BasicBlock &BBI : *F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator());
if (RI == nullptr || RI == IgnoreRI) continue;
// We can only perform this transformation if the value returned is
@@ -534,8 +415,7 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
/// If the specified instruction can be transformed using accumulator recursion
/// elimination, return the constant which is the start of the accumulator
/// value. Otherwise return null.
-Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
- CallInst *CI) {
+static Value *canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
if (!I->isAssociative() || !I->isCommutative()) return nullptr;
assert(I->getNumOperands() == 2 &&
"Associative/commutative operations should have 2 args!");
@@ -555,15 +435,15 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
}
-static Instruction *FirstNonDbg(BasicBlock::iterator I) {
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
while (isa<DbgInfoIntrinsic>(I))
++I;
return &*I;
}
-CallInst*
-TailCallElim::FindTRECandidate(Instruction *TI,
- bool CannotTailCallElimCallsMarkedTail) {
+static CallInst *findTRECandidate(Instruction *TI,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI) {
BasicBlock *BB = TI->getParent();
Function *F = BB->getParent();
@@ -594,8 +474,8 @@ TailCallElim::FindTRECandidate(Instruction *TI,
// and disable this xform in this case, because the code generator will
// lower the call to fabs into inline code.
if (BB == &F->getEntryBlock() &&
- FirstNonDbg(BB->front().getIterator()) == CI &&
- FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ firstNonDbg(BB->front().getIterator()) == CI &&
+ firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
!TTI->isLoweredToCall(CI->getCalledFunction())) {
// A single-block function with just a call and a return. Check that
// the arguments match.
@@ -612,7 +492,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,
return CI;
}
-bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
@@ -636,14 +516,14 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
// Check that this is the case now.
BasicBlock::iterator BBI(CI);
for (++BBI; &*BBI != Ret; ++BBI) {
- if (CanMoveAboveCall(&*BBI, CI)) continue;
+ if (canMoveAboveCall(&*BBI, CI)) continue;
// If we can't move the instruction above the call, it might be because it
// is an associative and commutative operation that could be transformed
// using accumulator recursion elimination. Check to see if this is the
// case, and if so, remember the initial accumulator value for later.
if ((AccumulatorRecursionEliminationInitVal =
- CanTransformAccumulatorRecursion(&*BBI, CI))) {
+ canTransformAccumulatorRecursion(&*BBI, CI))) {
// Yes, this is accumulator recursion. Remember which instruction
// accumulates.
AccumulatorRecursionInstr = &*BBI;
@@ -773,8 +653,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
// Finally, rewrite any return instructions in the program to return the PHI
// node instead of the "initval" that they do currently. This loop will
// actually rewrite the return value we are destroying, but that's ok.
- for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+ for (BasicBlock &BBI : *F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator()))
RI->setOperand(0, AccPN);
++NumAccumAdded;
}
@@ -790,11 +670,12 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
return true;
}
-bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
- ReturnInst *Ret, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail) {
+static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI) {
bool Change = false;
// If the return block contains nothing but the return and PHI's,
@@ -813,7 +694,7 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
while (!UncondBranchPreds.empty()) {
BranchInst *BI = UncondBranchPreds.pop_back_val();
BasicBlock *Pred = BI->getParent();
- if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
+ if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
DEBUG(dbgs() << "FOLDING: " << *BB
<< "INTO UNCOND BRANCH PRED: " << *Pred);
ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
@@ -821,11 +702,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
// Cleanup: if all predecessors of BB have been eliminated by
// FoldReturnIntoUncondBranch, delete it. It is important to empty it,
// because the ret instruction in there is still using a value which
- // EliminateRecursiveTailCall will attempt to remove.
+ // eliminateRecursiveTailCall will attempt to remove.
if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
BB->eraseFromParent();
- EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+ eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
ArgumentPHIs,
CannotTailCallElimCallsMarkedTail);
++NumRetDuped;
@@ -836,16 +717,124 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
return Change;
}
-bool
-TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail) {
- CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
+static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI) {
+ CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
if (!CI)
return false;
- return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
+ return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
ArgumentPHIs,
CannotTailCallElimCallsMarkedTail);
}
+
+static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) {
+ if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ bool MadeChange = false;
+ bool AllCallsAreTailCalls = false;
+ MadeChange |= markTails(F, AllCallsAreTailCalls);
+ if (!AllCallsAreTailCalls)
+ return MadeChange;
+
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg())
+ return false;
+
+ BasicBlock *OldEntry = nullptr;
+ bool TailCallsAreMarkedTail = false;
+ SmallVector<PHINode*, 8> ArgumentPHIs;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (real
+ // TRE would deallocate variable sized allocas, TRE doesn't).
+ bool CanTRETailMarkedCall = canTRE(F);
+
+ // Change any tail recursive calls to loops.
+ //
+ // FIXME: The code generator produces really bad code when an 'escaping
+ // alloca' is changed from being a static alloca to being a dynamic alloca.
+ // Until this is resolved, disable this transformation if that would ever
+ // happen. This bug is PR962.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+ BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ bool Change =
+ processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+ Change =
+ foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ MadeChange |= Change;
+ }
+ }
+
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI Node is a dynamic constant, replace it with the value it is.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ return MadeChange;
+}
+
+namespace {
+struct TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return eliminateTailRecursion(
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
+ }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = eliminateTailRecursion(F, &TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
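For readers following the refactor above, the transformation itself is unchanged; the pair of functions below is only a source-level analogue (the pass rewrites LLVM IR, not C++): the tail call becomes a back edge, and the arguments become the loop-carried values that the inserted PHI nodes represent.

// Before: a tail-recursive function with an explicit accumulator.
static int facTailRecursive(int N, int Acc) {
  if (N <= 1)
    return Acc;
  return facTailRecursive(N - 1, Acc * N); // tail call
}

// After TRE: the recursion is a loop; the updates feed the back edge.
static int facAfterTRE(int N, int Acc) {
  for (;;) {
    if (N <= 1)
      return Acc;
    Acc *= N; // accumulator update (the "accumulator PHI")
    --N;      // argument update (an "argument PHI")
  }
}

int main() { return facTailRecursive(5, 1) == facAfterTRE(5, 1) ? 0 : 1; }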
diff --git a/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 409326eba401..7e50d4bb447e 100644
--- a/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -44,7 +44,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
else if (Size <= 512) Res = Size + 64;
else if (Size <= 4096) Res = Size + 128;
else Res = Size + 256;
- return RoundUpToAlignment(Res, Alignment);
+ return alignTo(Res, Alignment);
}
void
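The only change above is the rename from RoundUpToAlignment to alignTo, which keeps the round-up-to-a-multiple semantics. Below is a local restatement with a worked example; alignToSketch is an illustrative reimplementation of that documented behavior, not the LLVM helper itself.

#include <cassert>
#include <cstddef>

// Rounds Value up to the next multiple of Align (Align > 0).
static size_t alignToSketch(size_t Value, size_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  assert(alignToSketch(576, 32) == 576); // already a multiple of 32 (e.g. 512 + 64)
  assert(alignToSketch(577, 32) == 608); // rounded up to the next 32-byte boundary
}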
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 0262358fa3d5..d034905b6572 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -52,7 +52,9 @@
// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
@@ -72,20 +74,22 @@ using namespace llvm;
#define DEBUG_TYPE "add-discriminators"
namespace {
-struct AddDiscriminators : public FunctionPass {
+// The legacy pass of AddDiscriminators.
+struct AddDiscriminatorsLegacyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
- AddDiscriminators() : FunctionPass(ID) {
- initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry());
+ AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
+ initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
};
-}
-char AddDiscriminators::ID = 0;
-INITIALIZE_PASS_BEGIN(AddDiscriminators, "add-discriminators",
+} // end anonymous namespace
+
+char AddDiscriminatorsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
"Add DWARF path discriminators", false, false)
-INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators",
+INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
"Add DWARF path discriminators", false, false)
// Command line option to disable discriminator generation even in the
@@ -95,13 +99,9 @@ static cl::opt<bool> NoDiscriminators(
"no-discriminators", cl::init(false),
cl::desc("Disable generation of discriminator information."));
+// Create the legacy AddDiscriminatorsPass.
FunctionPass *llvm::createAddDiscriminatorsPass() {
- return new AddDiscriminators();
-}
-
-static bool hasDebugInfo(const Function &F) {
- DISubprogram *S = getDISubprogram(&F);
- return S != nullptr;
+ return new AddDiscriminatorsLegacyPass();
}
/// \brief Assign DWARF discriminators.
@@ -155,13 +155,13 @@ static bool hasDebugInfo(const Function &F) {
/// lexical block for I2 and all the instruction in B2 that share the same
/// file and line location as I2. This new lexical block will have a
/// different discriminator number than I1.
-bool AddDiscriminators::runOnFunction(Function &F) {
+static bool addDiscriminators(Function &F) {
// If the function has debug information, but the user has disabled
// discriminators, do nothing.
// Similarly, if the function has no debug info, do nothing.
// Finally, if this module is built with dwarf versions earlier than 4,
// do nothing (discriminator support is a DWARF 4 feature).
- if (NoDiscriminators || !hasDebugInfo(F) ||
+ if (NoDiscriminators || !F.getSubprogram() ||
F.getParent()->getDwarfVersion() < 4)
return false;
@@ -173,8 +173,11 @@ bool AddDiscriminators::runOnFunction(Function &F) {
typedef std::pair<StringRef, unsigned> Location;
typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap;
typedef DenseMap<Location, BBScopeMap> LocationBBMap;
+ typedef DenseMap<Location, unsigned> LocationDiscriminatorMap;
+ typedef DenseSet<Location> LocationSet;
LocationBBMap LBM;
+ LocationDiscriminatorMap LDM;
// Traverse all instructions in the function. If the source line location
// of the instruction appears in other basic block, assign a new
@@ -199,8 +202,7 @@ bool AddDiscriminators::runOnFunction(Function &F) {
auto *Scope = DIL->getScope();
auto *File =
Builder.createFile(DIL->getFilename(), Scope->getDirectory());
- NewScope = Builder.createLexicalBlockFile(
- Scope, File, DIL->computeNewDiscriminator());
+ NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
}
I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(),
NewScope, DIL->getInlinedAt()));
@@ -217,32 +219,40 @@ bool AddDiscriminators::runOnFunction(Function &F) {
// Sample base profile needs to distinguish different function calls within
// the same source line for correct profile annotation.
for (BasicBlock &B : F) {
- const DILocation *FirstDIL = NULL;
+ LocationSet CallLocations;
for (auto &I : B.getInstList()) {
CallInst *Current = dyn_cast<CallInst>(&I);
if (!Current || isa<DbgInfoIntrinsic>(&I))
continue;
DILocation *CurrentDIL = Current->getDebugLoc();
- if (FirstDIL) {
- if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() &&
- CurrentDIL->getFilename() == FirstDIL->getFilename()) {
- auto *Scope = FirstDIL->getScope();
- auto *File = Builder.createFile(FirstDIL->getFilename(),
- Scope->getDirectory());
- auto *NewScope = Builder.createLexicalBlockFile(
- Scope, File, FirstDIL->computeNewDiscriminator());
- Current->setDebugLoc(DILocation::get(
- Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope,
- CurrentDIL->getInlinedAt()));
- Changed = true;
- } else {
- FirstDIL = CurrentDIL;
- }
- } else {
- FirstDIL = CurrentDIL;
+ if (!CurrentDIL)
+ continue;
+ Location L =
+ std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
+ if (!CallLocations.insert(L).second) {
+ auto *Scope = CurrentDIL->getScope();
+ auto *File = Builder.createFile(CurrentDIL->getFilename(),
+ Scope->getDirectory());
+ auto *NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
+ Current->setDebugLoc(DILocation::get(Ctx, CurrentDIL->getLine(),
+ CurrentDIL->getColumn(), NewScope,
+ CurrentDIL->getInlinedAt()));
+ Changed = true;
}
}
}
return Changed;
}
+
+bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
+ return addDiscriminators(F);
+}
+PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ if (!addDiscriminators(F))
+ return PreservedAnalyses::all();
+
+ // FIXME: should be all()
+ return PreservedAnalyses::none();
+}
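The rewrite above replaces computeNewDiscriminator() with explicit bookkeeping: a per-(file, line) counter (LDM in the patch) hands out discriminators, and a per-block set (CallLocations) detects a second call on an already-seen location. Below is a standalone sketch of that bookkeeping with standard containers standing in for the LLVM DenseMap/DenseSet.

#include <cassert>
#include <map>
#include <set>
#include <string>
#include <utility>

using Location = std::pair<std::string, unsigned>;

// Mirrors "++LDM[L]" above: each (file, line) pair gets its own counter.
static unsigned nextDiscriminator(std::map<Location, unsigned> &LDM,
                                  const Location &L) {
  return ++LDM[L];
}

int main() {
  std::map<Location, unsigned> LDM;
  std::set<Location> CallLocations; // per-block, mirrors CallLocations above
  Location L{"foo.c", 42};

  assert(CallLocations.insert(L).second);  // first call at foo.c:42: left alone
  assert(!CallLocations.insert(L).second); // second call: needs a discriminator
  assert(nextDiscriminator(LDM, L) == 1);
  assert(nextDiscriminator(LDM, L) == 2);  // a third such call would get 2
}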
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 72db980cf572..b90349d3cdad 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -31,8 +31,6 @@
#include <algorithm>
using namespace llvm;
-/// DeleteDeadBlock - Delete the specified block, which must have no
-/// predecessors.
void llvm::DeleteDeadBlock(BasicBlock *BB) {
assert((pred_begin(BB) == pred_end(BB) ||
// Can delete self loop.
@@ -61,12 +59,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
BB->eraseFromParent();
}
-/// FoldSingleEntryPHINodes - We know that BB has one predecessor. If there are
-/// any single-entry PHI nodes in it, fold them away. This handles the case
-/// when all entries to the PHI nodes in a block are guaranteed equal, such as
-/// when the block has exactly one predecessor.
void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
- MemoryDependenceAnalysis *MemDep) {
+ MemoryDependenceResults *MemDep) {
if (!isa<PHINode>(BB->begin())) return;
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
@@ -82,11 +76,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
}
}
-
-/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
-/// is dead. Also recursively delete any operands that become dead as
-/// a result. This includes tracing the def-use list from the PHI to see if
-/// it is ultimately unused or if it reaches an unused cycle.
bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
// or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
@@ -103,11 +92,9 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
return Changed;
}
-/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
-/// if possible. The return value indicates success or failure.
bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
LoopInfo *LI,
- MemoryDependenceAnalysis *MemDep) {
+ MemoryDependenceResults *MemDep) {
// Don't merge away blocks who have their address taken.
if (BB->hasAddressTaken()) return false;
@@ -165,10 +152,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
if (DomTreeNode *DTN = DT->getNode(BB)) {
DomTreeNode *PredDTN = DT->getNode(PredBB);
SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
- for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(),
- DE = Children.end();
- DI != DE; ++DI)
- DT->changeImmediateDominator(*DI, PredDTN);
+ for (DomTreeNode *DI : Children)
+ DT->changeImmediateDominator(DI, PredDTN);
DT->eraseNode(BB);
}
@@ -183,9 +168,6 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
return true;
}
-/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
-/// with a value, then remove and delete the original instruction.
-///
void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
BasicBlock::iterator &BI, Value *V) {
Instruction &I = *BI;
@@ -200,11 +182,6 @@ void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
BI = BIL.erase(BI);
}
-
-/// ReplaceInstWithInst - Replace the instruction specified by BI with the
-/// instruction specified by I. The original instruction is deleted and BI is
-/// updated to point to the new instruction.
-///
void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
BasicBlock::iterator &BI, Instruction *I) {
assert(I->getParent() == nullptr &&
@@ -225,16 +202,11 @@ void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
BI = New;
}
-/// ReplaceInstWithInst - Replace the instruction specified by From with the
-/// instruction specified by To.
-///
void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
BasicBlock::iterator BI(From);
ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
}
-/// SplitEdge - Split the edge connecting specified block. Pass P must
-/// not be NULL.
BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
LoopInfo *LI) {
unsigned SuccNum = GetSuccessorNumber(BB, Succ);
@@ -266,8 +238,8 @@ unsigned
llvm::SplitAllCriticalEdges(Function &F,
const CriticalEdgeSplittingOptions &Options) {
unsigned NumBroken = 0;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
- TerminatorInst *TI = I->getTerminator();
+ for (BasicBlock &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
if (SplitCriticalEdge(TI, i, Options))
@@ -276,11 +248,6 @@ llvm::SplitAllCriticalEdges(Function &F,
return NumBroken;
}
-/// SplitBlock - Split the specified block at the specified instruction - every
-/// thing before SplitPt stays in Old and everything starting with SplitPt moves
-/// to a new block. The two blocks are joined by an unconditional branch and
-/// the loop info is updated.
-///
BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
DominatorTree *DT, LoopInfo *LI) {
BasicBlock::iterator SplitIt = SplitPt->getIterator();
@@ -297,22 +264,17 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
if (DT)
// Old dominates New. New node dominates all other nodes dominated by Old.
if (DomTreeNode *OldNode = DT->getNode(Old)) {
- std::vector<DomTreeNode *> Children;
- for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
- I != E; ++I)
- Children.push_back(*I);
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
DomTreeNode *NewNode = DT->addNewBlock(New, Old);
- for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
- E = Children.end(); I != E; ++I)
- DT->changeImmediateDominator(*I, NewNode);
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
}
return New;
}
-/// UpdateAnalysisInformation - Update DominatorTree, LoopInfo, and LCCSA
-/// analysis information.
+/// Update DominatorTree, LoopInfo, and LCCSA analysis information.
static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
ArrayRef<BasicBlock *> Preds,
DominatorTree *DT, LoopInfo *LI,
@@ -331,10 +293,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
// this split will affect loops.
bool IsLoopEntry = !!L;
bool SplitMakesNewLoopHeader = false;
- for (ArrayRef<BasicBlock *>::iterator i = Preds.begin(), e = Preds.end();
- i != e; ++i) {
- BasicBlock *Pred = *i;
-
+ for (BasicBlock *Pred : Preds) {
// If we need to preserve LCSSA, determine if any of the preds is a loop
// exit.
if (PreserveLCSSA)
@@ -362,9 +321,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
// loops enclose them, and select the most-nested loop which contains the
// loop containing the block being split.
Loop *InnermostPredLoop = nullptr;
- for (ArrayRef<BasicBlock*>::iterator
- i = Preds.begin(), e = Preds.end(); i != e; ++i) {
- BasicBlock *Pred = *i;
+ for (BasicBlock *Pred : Preds) {
if (Loop *PredLoop = LI->getLoopFor(Pred)) {
// Seek a loop which actually contains the block being split (to avoid
// adjacent loops).
@@ -388,8 +345,8 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
}
}
-/// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
-/// from NewBB. This also updates AliasAnalysis, if available.
+/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
+/// This also updates AliasAnalysis, if available.
static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
ArrayRef<BasicBlock *> Preds, BranchInst *BI,
bool HasLoopExit) {
@@ -456,21 +413,6 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
}
}
-/// SplitBlockPredecessors - This method introduces at least one new basic block
-/// into the function and moves some of the predecessors of BB to be
-/// predecessors of the new block. The new predecessors are indicated by the
-/// Preds array. The new block is given a suffix of 'Suffix'. Returns new basic
-/// block to which predecessors from Preds are now pointing.
-///
-/// If BB is a landingpad block then additional basicblock might be introduced.
-/// It will have suffix of 'Suffix'+".split_lp".
-/// See SplitLandingPadPredecessors for more details on this case.
-///
-/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
-/// LoopInfo, and LCCSA but no other analyses. In particular, it does not
-/// preserve LoopSimplify (because it's complicated to handle the case where one
-/// of the edges being split is an exit of a loop with other exits).
-///
BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix, DominatorTree *DT,
@@ -529,19 +471,6 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
return NewBB;
}
-/// SplitLandingPadPredecessors - This method transforms the landing pad,
-/// OrigBB, by introducing two new basic blocks into the function. One of those
-/// new basic blocks gets the predecessors listed in Preds. The other basic
-/// block gets the remaining predecessors of OrigBB. The landingpad instruction
-/// OrigBB is clone into both of the new basic blocks. The new blocks are given
-/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
-///
-/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
-/// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular,
-/// it does not preserve LoopSimplify (because it's complicated to handle the
-/// case where one of the edges being split is an exit of a loop with other
-/// exits).
-///
void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix1, const char *Suffix2,
@@ -603,9 +532,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
// Move the remaining edges from OrigBB to point to NewBB2.
- for (SmallVectorImpl<BasicBlock*>::iterator
- i = NewBB2Preds.begin(), e = NewBB2Preds.end(); i != e; ++i)
- (*i)->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+ for (BasicBlock *NewBB2Pred : NewBB2Preds)
+ NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
// Update DominatorTree, LoopInfo, and LCCSA analysis information.
HasLoopExit = false;
@@ -646,11 +574,6 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
}
}
-/// FoldReturnIntoUncondBranch - This method duplicates the specified return
-/// instruction into a predecessor which ends in an unconditional branch. If
-/// the return instruction returns a value defined by a PHI, propagate the
-/// right value into the return. It returns the new return instruction in the
-/// predecessor.
ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
BasicBlock *Pred) {
Instruction *UncondBranch = Pred->getTerminator();
@@ -689,31 +612,10 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
return cast<ReturnInst>(NewRet);
}
-/// SplitBlockAndInsertIfThen - Split the containing block at the
-/// specified instruction - everything before and including SplitBefore stays
-/// in the old basic block, and everything after SplitBefore is moved to a
-/// new block. The two blocks are connected by a conditional branch
-/// (with value of Cmp being the condition).
-/// Before:
-/// Head
-/// SplitBefore
-/// Tail
-/// After:
-/// Head
-/// if (Cond)
-/// ThenBlock
-/// SplitBefore
-/// Tail
-///
-/// If Unreachable is true, then ThenBlock ends with
-/// UnreachableInst, otherwise it branches to Tail.
-/// Returns the NewBasicBlock's terminator.
-
-TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
- Instruction *SplitBefore,
- bool Unreachable,
- MDNode *BranchWeights,
- DominatorTree *DT) {
+TerminatorInst *
+llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
+ bool Unreachable, MDNode *BranchWeights,
+ DominatorTree *DT, LoopInfo *LI) {
BasicBlock *Head = SplitBefore->getParent();
BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
TerminatorInst *HeadOldTerm = Head->getTerminator();
@@ -735,7 +637,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
- for (auto Child : Children)
+ for (DomTreeNode *Child : Children)
DT->changeImmediateDominator(Child, NewNode);
// Head dominates ThenBlock.
@@ -743,23 +645,15 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
}
}
+ if (LI) {
+ Loop *L = LI->getLoopFor(Head);
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
+
return CheckTerm;
}
-/// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen,
-/// but also creates the ElseBlock.
-/// Before:
-/// Head
-/// SplitBefore
-/// Tail
-/// After:
-/// Head
-/// if (Cond)
-/// ThenBlock
-/// else
-/// ElseBlock
-/// SplitBefore
-/// Tail
void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
TerminatorInst **ThenTerm,
TerminatorInst **ElseTerm,
@@ -781,15 +675,6 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
}
-/// GetIfCondition - Given a basic block (BB) with two predecessors,
-/// check to see if the merge at this block is due
-/// to an "if condition". If so, return the boolean condition that determines
-/// which entry into BB will be taken. Also, return by references the block
-/// that will be entered from if the condition is true, and the block that will
-/// be entered if the condition is false.
-///
-/// This does no checking to see if the true/false blocks have large or unsavory
-/// instructions in them.
Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
BasicBlock *&IfFalse) {
PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
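SplitBlockAndInsertIfThen now threads LoopInfo through so the new "then" and tail blocks are registered with the enclosing loop. Below is a hypothetical call site using the new overload, assuming it is compiled against the tree as of this patch; emitGuardedSlowPath and its parameters are illustrative and not part of the patch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Split the block at InsertPt and branch into a new block when Cond is true,
// keeping both the dominator tree and loop info consistent.
void emitGuardedSlowPath(Value *Cond, Instruction *InsertPt,
                         DominatorTree *DT, LoopInfo *LI) {
  TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(
      Cond, InsertPt, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT, LI);
  IRBuilder<> Builder(ThenTerm);
  // ...emit the instructions that should only execute when Cond is true...
}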
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 95825991cee9..49b646a041f5 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -76,11 +76,10 @@ FunctionPass *llvm::createBreakCriticalEdgesPass() {
// Implementation of the external critical edge manipulation functions
//===----------------------------------------------------------------------===//
-/// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form
-/// may require new PHIs in the new exit block. This function inserts the
-/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
-/// is the new loop exit block, and DestBB is the old loop exit, now the
-/// successor of SplitBB.
+/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
+/// exit block. This function inserts the new PHIs, as needed. Preds is a list
+/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
+/// the old loop exit, now the successor of SplitBB.
static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
BasicBlock *SplitBB,
BasicBlock *DestBB) {
@@ -112,25 +111,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
}
}
-/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
-/// split the critical edge. This will update DominatorTree information if it
-/// is available, thus calling this pass will not invalidate either of them.
-/// This returns the new block if the edge was split, null otherwise.
-///
-/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
-/// specified successor will be merged into the same critical edge block.
-/// This is most commonly interesting with switch instructions, which may
-/// have many edges to any one destination. This ensures that all edges to that
-/// dest go to one block instead of each going to a different block, but isn't
-/// the standard definition of a "critical edge".
-///
-/// It is invalid to call this function on a critical edge that starts at an
-/// IndirectBrInst. Splitting these edges will almost always create an invalid
-/// program because the address of the new block won't be the one that is jumped
-/// to.
-///
-BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
- const CriticalEdgeSplittingOptions &Options) {
+BasicBlock *
+llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+ const CriticalEdgeSplittingOptions &Options) {
if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
return nullptr;
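As background for the doc-comment move above: an edge is critical when neither endpoint can host new code on its own, which is why SplitCriticalEdge has to insert a fresh block. The sketch below is a conceptual restatement of that predicate only, not the LLVM implementation (which inspects the terminator's successors and the destination's predecessor list).

#include <cassert>
#include <cstddef>

struct BlockShape { std::size_t NumSuccessors, NumPredecessors; };

// The edge From->To is critical when From has several successors and To has
// several predecessors.
static bool isCriticalEdgeSketch(const BlockShape &From, const BlockShape &To) {
  return From.NumSuccessors > 1 && To.NumPredecessors > 1;
}

int main() {
  assert(isCriticalEdgeSketch({2, 1}, {1, 2}));  // conditional branch into a join block
  assert(!isCriticalEdgeSketch({1, 1}, {1, 2})); // single successor: not critical
}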
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 64b44a6b7919..f4260a9ff980 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -25,81 +26,742 @@
using namespace llvm;
-/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
-Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
+#define DEBUG_TYPE "build-libcalls"
+
+//- Infer Attributes ---------------------------------------------------------//
+
+STATISTIC(NumReadNone, "Number of functions inferred as readnone");
+STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
+STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
+STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
+STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
+STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull");
+
+static bool setDoesNotAccessMemory(Function &F) {
+ if (F.doesNotAccessMemory())
+ return false;
+ F.setDoesNotAccessMemory();
+ ++NumReadNone;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F) {
+ if (F.onlyReadsMemory())
+ return false;
+ F.setOnlyReadsMemory();
+ ++NumReadOnly;
+ return true;
+}
+
+static bool setOnlyAccessesArgMemory(Function &F) {
+ if (F.onlyAccessesArgMemory())
+ return false;
+ F.setOnlyAccessesArgMemory();
+ ++NumArgMemOnly;
+ return true;
+}
+
+static bool setDoesNotThrow(Function &F) {
+ if (F.doesNotThrow())
+ return false;
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ return true;
+}
+
+static bool setDoesNotCapture(Function &F, unsigned n) {
+ if (F.doesNotCapture(n))
+ return false;
+ F.setDoesNotCapture(n);
+ ++NumNoCapture;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F, unsigned n) {
+ if (F.onlyReadsMemory(n))
+ return false;
+ F.setOnlyReadsMemory(n);
+ ++NumReadOnlyArg;
+ return true;
+}
+
+static bool setDoesNotAlias(Function &F, unsigned n) {
+ if (F.doesNotAlias(n))
+ return false;
+ F.setDoesNotAlias(n);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setNonNull(Function &F, unsigned n) {
+ assert((n != AttributeSet::ReturnIndex ||
+ F.getReturnType()->isPointerTy()) &&
+ "nonnull applies only to pointers");
+ if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
+ return false;
+ F.addAttribute(n, Attribute::NonNull);
+ ++NumNonNull;
+ return true;
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+ LibFunc::Func TheLibFunc;
+ if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
+ return false;
+
+ bool Changed = false;
+ switch (TheLibFunc) {
+ case LibFunc::strlen:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::strchr:
+ case LibFunc::strrchr:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc::strtol:
+ case LibFunc::strtod:
+ case LibFunc::strtof:
+ case LibFunc::strtoul:
+ case LibFunc::strtoll:
+ case LibFunc::strtold:
+ case LibFunc::strtoull:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::strcpy:
+ case LibFunc::stpcpy:
+ case LibFunc::strcat:
+ case LibFunc::strncat:
+ case LibFunc::strncpy:
+ case LibFunc::stpncpy:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::strxfrm:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::strcmp: // 0,1
+ case LibFunc::strspn: // 0,1
+ case LibFunc::strncmp: // 0,1
+ case LibFunc::strcspn: // 0,1
+ case LibFunc::strcoll: // 0,1
+ case LibFunc::strcasecmp: // 0,1
+ case LibFunc::strncasecmp: // 0,1
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::strstr:
+ case LibFunc::strpbrk:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::strtok:
+ case LibFunc::strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::scanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::setbuf:
+ case LibFunc::setvbuf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::strdup:
+ case LibFunc::strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::stat:
+ case LibFunc::statvfs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::sscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::sprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::snprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 3);
+ Changed |= setOnlyReadsMemory(F, 3);
+ return Changed;
+ case LibFunc::setitimer:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 3);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::system:
+ // May throw; "system" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::malloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::memcmp:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::memchr:
+ case LibFunc::memrchr:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc::modf:
+ case LibFunc::modff:
+ case LibFunc::modfl:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::memcpy:
+ case LibFunc::memccpy:
+ case LibFunc::memmove:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::memcpy_chk:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc::memalign:
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::mkdir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::mktime:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::realloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::read:
+ // May throw; "read" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::rewind:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::rmdir:
+ case LibFunc::remove:
+ case LibFunc::realpath:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::rename:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::readlink:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::write:
+ // May throw; "write" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::bcopy:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::bcmp:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::bzero:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::calloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::chmod:
+ case LibFunc::chown:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::ctermid:
+ case LibFunc::clearerr:
+ case LibFunc::closedir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::atoi:
+ case LibFunc::atol:
+ case LibFunc::atof:
+ case LibFunc::atoll:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::access:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::fopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::fdopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::feof:
+ case LibFunc::free:
+ case LibFunc::fseek:
+ case LibFunc::ftell:
+ case LibFunc::fgetc:
+ case LibFunc::fseeko:
+ case LibFunc::ftello:
+ case LibFunc::fileno:
+ case LibFunc::fflush:
+ case LibFunc::fclose:
+ case LibFunc::fsetpos:
+ case LibFunc::flockfile:
+ case LibFunc::funlockfile:
+ case LibFunc::ftrylockfile:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::ferror:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc::fputc:
+ case LibFunc::fstat:
+ case LibFunc::frexp:
+ case LibFunc::frexpf:
+ case LibFunc::frexpl:
+ case LibFunc::fstatvfs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::fgets:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc::fread:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 4);
+ return Changed;
+ case LibFunc::fwrite:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 4);
+ // FIXME: readonly #1?
+ return Changed;
+ case LibFunc::fputs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::fscanf:
+ case LibFunc::fprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::fgetpos:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::getc:
+ case LibFunc::getlogin_r:
+ case LibFunc::getc_unlocked:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::getenv:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::gets:
+ case LibFunc::getchar:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc::getitimer:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::getpwnam:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::ungetc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::uname:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::unlink:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::unsetenv:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::utime:
+ case LibFunc::utimes:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::putc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::puts:
+ case LibFunc::printf:
+ case LibFunc::perror:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::pread:
+ // May throw; "pread" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::pwrite:
+ // May throw; "pwrite" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::putchar:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc::popen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::pclose:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::vscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::vsscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::vfscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::valloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::vprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::vfprintf:
+ case LibFunc::vsprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::vsnprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 3);
+ Changed |= setOnlyReadsMemory(F, 3);
+ return Changed;
+ case LibFunc::open:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::opendir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::tmpfile:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::times:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::htonl:
+ case LibFunc::htons:
+ case LibFunc::ntohl:
+ case LibFunc::ntohs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAccessMemory(F);
+ return Changed;
+ case LibFunc::lstat:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::lchown:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::qsort:
+ // May throw; places call through function pointer.
+ Changed |= setDoesNotCapture(F, 4);
+ return Changed;
+ case LibFunc::dunder_strdup:
+ case LibFunc::dunder_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::dunder_strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::under_IO_getc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::under_IO_putc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::dunder_isoc99_scanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::stat64:
+ case LibFunc::lstat64:
+ case LibFunc::statvfs64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::dunder_isoc99_sscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::fopen64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc::fseeko64:
+ case LibFunc::ftello64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc::tmpfile64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAlias(F, 0);
+ return Changed;
+ case LibFunc::fstat64:
+ case LibFunc::fstatvfs64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::open64:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc::gettimeofday:
+ // Currently some platforms have the restrict keyword on the arguments to
+ // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+ // arguments.
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc::Znwj: // new(unsigned int)
+ case LibFunc::Znwm: // new(unsigned long)
+ case LibFunc::Znaj: // new[](unsigned int)
+ case LibFunc::Znam: // new[](unsigned long)
+ case LibFunc::msvc_new_int: // new(unsigned int)
+ case LibFunc::msvc_new_longlong: // new(unsigned long long)
+ case LibFunc::msvc_new_array_int: // new[](unsigned int)
+ case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
+ // Operator new always returns a nonnull noalias pointer
+ Changed |= setNonNull(F, AttributeSet::ReturnIndex);
+ Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+ return Changed;
+ // TODO: add LibFunc entries for:
+ //case LibFunc::memset_pattern4:
+ //case LibFunc::memset_pattern8:
+ case LibFunc::memset_pattern16:
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ // int __nvvm_reflect(const char *)
+ case LibFunc::nvvm_reflect:
+ Changed |= setDoesNotAccessMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+
+ default:
+ // FIXME: It'd be really nice to cover all the library functions we're
+ // aware of here.
+ return false;
+ }
+}
+
+//- Emit LibCalls ------------------------------------------------------------//
+
+Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
unsigned AS = V->getType()->getPointerAddressSpace();
return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
}
-/// EmitStrLen - Emit a call to the strlen function to the builder, for the
-/// specified pointer. This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
+Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strlen))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[2];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
- AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+ Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrLen = M->getOrInsertFunction(
- "strlen", AttributeSet::get(M->getContext(), AS),
- DL.getIntPtrType(Context), B.getInt8PtrTy(), nullptr);
- CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+ Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
+ B.getInt8PtrTy(), nullptr);
+ inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
+ CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitStrChr - Emit a call to the strchr function to the builder, for the
-/// specified pointer and character. Ptr is required to be some pointer type,
-/// and the return value has 'i8*' type.
-Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strchr))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
- AttributeSet AS =
- AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+ Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
- Constant *StrChr = M->getOrInsertFunction("strchr",
- AttributeSet::get(M->getContext(),
- AS),
- I8Ptr, I8Ptr, I32Ty, nullptr);
+ Constant *StrChr =
+ M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty, nullptr);
+ inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
CallInst *CI = B.CreateCall(
- StrChr, {CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
+ StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
-Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strncmp))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[3];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
- AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+ Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *StrNCmp = M->getOrInsertFunction(
- "strncmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
- B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
+ Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), nullptr);
+ inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
CallInst *CI = B.CreateCall(
- StrNCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "strncmp");
+ StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -107,64 +769,46 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
return CI;
}
-/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
-/// specified pointer arguments.
-Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
if (!TLI->has(LibFunc::strcpy))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[2];
- AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrCpy = M->getOrInsertFunction(Name,
- AttributeSet::get(M->getContext(), AS),
- I8Ptr, I8Ptr, I8Ptr, nullptr);
+ Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, nullptr);
+ inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI =
- B.CreateCall(StrCpy, {CastToCStr(Dst, B), CastToCStr(Src, B)}, Name);
+ B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
-/// specified pointer arguments.
-Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
if (!TLI->has(LibFunc::strncpy))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[2];
- AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrNCpy = M->getOrInsertFunction(Name,
- AttributeSet::get(M->getContext(),
- AS),
- I8Ptr, I8Ptr, I8Ptr,
+ Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
Len->getType(), nullptr);
+ inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI = B.CreateCall(
- StrNCpy, {CastToCStr(Dst, B), CastToCStr(Src, B), Len}, "strncpy");
+ StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder.
-/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
-/// are pointers.
-Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcpy_chk))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Module *M = B.GetInsertBlock()->getModule();
AttributeSet AS;
AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
Attribute::NoUnwind);
@@ -173,30 +817,26 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
"__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
DL.getIntPtrType(Context), nullptr);
- Dst = CastToCStr(Dst, B);
- Src = CastToCStr(Src, B);
+ Dst = castToCStr(Dst, B);
+ Src = castToCStr(Src, B);
CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
-/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
-Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memchr))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS;
- Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
- AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
+ Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemChr = M->getOrInsertFunction(
- "memchr", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
- B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context), nullptr);
- CallInst *CI = B.CreateCall(MemChr, {CastToCStr(Ptr, B), Val, Len}, "memchr");
+ Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt32Ty(),
+ DL.getIntPtrType(Context), nullptr);
+ inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
+ CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -204,25 +844,19 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
return CI;
}
-/// EmitMemCmp - Emit a call to the memcmp function.
-Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcmp))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[3];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
- AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+ Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCmp = M->getOrInsertFunction(
- "memcmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
- B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
+ Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), nullptr);
+ inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
CallInst *CI = B.CreateCall(
- MemCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "memcmp");
+ MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -231,7 +865,8 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
}
/// Append a suffix to the function name according to the type of 'Op'.
-static void AppendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBuffer) {
+static void appendTypeSuffix(Value *Op, StringRef &Name,
+ SmallString<20> &NameBuffer) {
if (!Op->getType()->isDoubleTy()) {
NameBuffer += Name;
@@ -242,19 +877,14 @@ static void AppendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBu
Name = NameBuffer;
}
- return;
}
-/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
-/// 'floor'). This function is known to take a single of type matching 'Op' and
-/// returns one value with the same type. If 'Op' is a long double, 'l' is
-/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
-Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
const AttributeSet &Attrs) {
SmallString<20> NameBuffer;
- AppendTypeSuffix(Op, Name, NameBuffer);
+ appendTypeSuffix(Op, Name, NameBuffer);
- Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Module *M = B.GetInsertBlock()->getModule();
Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
Op->getType(), nullptr);
CallInst *CI = B.CreateCall(Callee, Op, Name);
@@ -265,19 +895,14 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
return CI;
}
-/// EmitBinaryFloatFnCall - Emit a call to the binary function named 'Name'
-/// (e.g. 'fmin'). This function is known to take type matching 'Op1' and 'Op2'
-/// and return one value with the same type. If 'Op1/Op2' are long double, 'l'
-/// is added as the suffix of name, if 'Op1/Op2' is a float, we add a 'f'
-/// suffix.
-Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
IRBuilder<> &B, const AttributeSet &Attrs) {
SmallString<20> NameBuffer;
- AppendTypeSuffix(Op1, Name, NameBuffer);
+ appendTypeSuffix(Op1, Name, NameBuffer);
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- Value *Callee = M->getOrInsertFunction(Name, Op1->getType(),
- Op1->getType(), Op2->getType(), nullptr);
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
+ Op2->getType(), nullptr);
CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
CI->setAttributes(Attrs);
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -286,14 +911,12 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
return CI;
}
-/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
-/// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B,
+Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::putchar))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Module *M = B.GetInsertBlock()->getModule();
Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
B.getInt32Ty(), nullptr);
CallInst *CI = B.CreateCall(PutChar,
@@ -308,54 +931,31 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B,
return CI;
}
-/// EmitPutS - Emit a call to the puts function. This assumes that Str is
-/// some pointer.
-Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B,
+Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::puts))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[2];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
-
- Value *PutS = M->getOrInsertFunction("puts",
- AttributeSet::get(M->getContext(), AS),
- B.getInt32Ty(),
- B.getInt8PtrTy(),
- nullptr);
- CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts");
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *PutS =
+ M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy(), nullptr);
+ inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
+ CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
return CI;
}
-/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
-/// an integer and File is a pointer to FILE.
-Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputc))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[2];
- AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
- Constant *F;
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
+ File->getType(), nullptr);
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction("fputc",
- AttributeSet::get(M->getContext(), AS),
- B.getInt32Ty(),
- B.getInt32Ty(), File->getType(),
- nullptr);
- else
- F = M->getOrInsertFunction("fputc",
- B.getInt32Ty(),
- B.getInt32Ty(),
- File->getType(), nullptr);
+ inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
"chari");
CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
@@ -365,66 +965,40 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
return CI;
}
-/// EmitFPutS - Emit a call to the puts function. Str is required to be a
-/// pointer and File is a pointer to FILE.
-Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
+Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputs))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[3];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
- AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ Module *M = B.GetInsertBlock()->getModule();
StringRef FPutsName = TLI->getName(LibFunc::fputs);
- Constant *F;
+ Constant *F = M->getOrInsertFunction(
+ FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr);
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FPutsName,
- AttributeSet::get(M->getContext(), AS),
- B.getInt32Ty(),
- B.getInt8PtrTy(),
- File->getType(), nullptr);
- else
- F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
- B.getInt8PtrTy(),
- File->getType(), nullptr);
- CallInst *CI = B.CreateCall(F, {CastToCStr(Str, B), File}, "fputs");
+ inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
CI->setCallingConv(Fn->getCallingConv());
return CI;
}
-/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
-/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
-Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
+Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fwrite))
return nullptr;
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeSet AS[3];
- AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
- AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture);
- AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
StringRef FWriteName = TLI->getName(LibFunc::fwrite);
- Constant *F;
+ Constant *F = M->getOrInsertFunction(
+ FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType(),
+ nullptr);
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(
- FWriteName, AttributeSet::get(M->getContext(), AS),
- DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context), File->getType(), nullptr);
- else
- F = M->getOrInsertFunction(FWriteName, DL.getIntPtrType(Context),
- B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context), File->getType(),
- nullptr);
+ inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
CallInst *CI =
- B.CreateCall(F, {CastToCStr(Ptr, B), Size,
+ B.CreateCall(F, {castToCStr(Ptr, B), Size,
ConstantInt::get(DL.getIntPtrType(Context), 1), File});
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
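
Taken together, the BuildLibCalls changes above rename the Emit*/CastToCStr helpers to lower-case emit*/castToCStr and let inferLibFuncAttributes attach the attributes that each call site previously built by hand with AttributeSet arrays. A minimal, hypothetical caller of the renamed API might look like this; the helper name and the way DL/TLI are obtained are illustrative, while emitStrNCmp and its parameter order come from the hunk above.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;

// Emit "strncmp(A, B, N)" ahead of InsertPt. Returns null when the target has
// no usable strncmp, mirroring the TLI guard inside emitStrNCmp.
static Value *compareFirstNBytes(Value *A, Value *B, uint64_t N,
                                 Instruction *InsertPt,
                                 const TargetLibraryInfo *TLI) {
  IRBuilder<> Builder(InsertPt);
  const DataLayout &DL = InsertPt->getModule()->getDataLayout();
  Value *Len = ConstantInt::get(DL.getIntPtrType(InsertPt->getContext()), N);
  return emitStrNCmp(A, B, Len, Builder, DL, TLI);
}
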
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 8308a9b69149..5aec0dce34db 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -11,7 +11,9 @@ add_llvm_library(LLVMTransformUtils
CodeExtractor.cpp
CtorUtils.cpp
DemoteRegToStack.cpp
+ Evaluator.cpp
FlattenCFG.cpp
+ FunctionImportUtils.cpp
GlobalStatus.cpp
InlineFunction.cpp
InstructionNamer.cpp
@@ -26,10 +28,13 @@ add_llvm_library(LLVMTransformUtils
LowerInvoke.cpp
LowerSwitch.cpp
Mem2Reg.cpp
+ MemorySSA.cpp
MetaRenamer.cpp
ModuleUtils.cpp
+ NameAnonFunctions.cpp
PromoteMemoryToRegister.cpp
SSAUpdater.cpp
+ SanitizerStats.cpp
SimplifyCFG.cpp
SimplifyIndVar.cpp
SimplifyInstructions.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 6454afb8bc42..c5ca56360fc8 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -119,6 +119,15 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
.addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex,
OldAttrs.getFnAttributes()));
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ OldFunc->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NewFunc->addMetadata(
+ MD.first,
+ *MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
+
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
// recursive functions into themselves.
@@ -163,65 +172,14 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
TypeMapper, Materializer);
}
-// Find the MDNode which corresponds to the subprogram data that described F.
-static DISubprogram *FindSubprogram(const Function *F,
- DebugInfoFinder &Finder) {
- for (DISubprogram *Subprogram : Finder.subprograms()) {
- if (Subprogram->describes(F))
- return Subprogram;
- }
- return nullptr;
-}
-
-// Add an operand to an existing MDNode. The new operand will be added at the
-// back of the operand list.
-static void AddOperand(DICompileUnit *CU, DISubprogramArray SPs,
- Metadata *NewSP) {
- SmallVector<Metadata *, 16> NewSPs;
- NewSPs.reserve(SPs.size() + 1);
- for (auto *SP : SPs)
- NewSPs.push_back(SP);
- NewSPs.push_back(NewSP);
- CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs));
-}
-
-// Clone the module-level debug info associated with OldFunc. The cloned data
-// will point to NewFunc instead.
-static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc,
- ValueToValueMapTy &VMap) {
- DebugInfoFinder Finder;
- Finder.processModule(*OldFunc->getParent());
-
- const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder);
- if (!OldSubprogramMDNode) return;
-
- auto *NewSubprogram =
- cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap));
- NewFunc->setSubprogram(NewSubprogram);
-
- for (auto *CU : Finder.compile_units()) {
- auto Subprograms = CU->getSubprograms();
- // If the compile unit's function list contains the old function, it should
- // also contain the new one.
- for (auto *SP : Subprograms) {
- if (SP == OldSubprogramMDNode) {
- AddOperand(CU, Subprograms, NewSubprogram);
- break;
- }
- }
- }
-}
-
-/// Return a copy of the specified function, but without
-/// embedding the function into another module. Also, any references specified
-/// in the VMap are changed to refer to their mapped value instead of the
-/// original one. If any of the arguments to the function are in the VMap,
-/// the arguments are deleted from the resultant function. The VMap is
-/// updated to include mappings from all of the instructions and basicblocks in
-/// the function from their old to new values.
+/// Return a copy of the specified function and add it to that function's
+/// module. Also, any references specified in the VMap are changed to refer to
+/// their mapped value instead of the original one. If any of the arguments to
+/// the function are in the VMap, the arguments are deleted from the resultant
+/// function. The VMap is updated to include mappings from all of the
+/// instructions and basic blocks in the function from their old to new values.
///
-Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
+Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
ClonedCodeInfo *CodeInfo) {
std::vector<Type*> ArgTypes;
@@ -237,7 +195,8 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,
ArgTypes, F->getFunctionType()->isVarArg());
// Create the new function...
- Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName());
+ Function *NewF =
+ Function::Create(FTy, F->getLinkage(), F->getName(), F->getParent());
// Loop over the arguments, copying the names of the mapped arguments over...
Function::arg_iterator DestI = NewF->arg_begin();
@@ -247,11 +206,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,
VMap[&I] = &*DestI++; // Add mapping to VMap
}
- if (ModuleLevelChanges)
- CloneDebugInfoMetadata(NewF, F, VMap);
-
SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo);
+ CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "",
+ CodeInfo);
+
return NewF;
}
@@ -338,9 +296,11 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
if (Value *MappedV = VMap.lookup(V))
V = MappedV;
- VMap[&*II] = V;
- delete NewInst;
- continue;
+ if (!NewInst->mayHaveSideEffects()) {
+ VMap[&*II] = V;
+ delete NewInst;
+ continue;
+ }
}
}
@@ -372,7 +332,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
// Or is a known constant in the caller...
if (!Cond) {
- Value *V = VMap[BI->getCondition()];
+ Value *V = VMap.lookup(BI->getCondition());
Cond = dyn_cast_or_null<ConstantInt>(V);
}
@@ -388,7 +348,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
// If switching on a value known constant in the caller.
ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
if (!Cond) { // Or known constant after constant prop in the callee...
- Value *V = VMap[SI->getCondition()];
+ Value *V = VMap.lookup(SI->getCondition());
Cond = dyn_cast_or_null<ConstantInt>(V);
}
if (Cond) { // Constant fold to uncond branch!
@@ -475,7 +435,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Defer PHI resolution until rest of function is resolved.
SmallVector<const PHINode*, 16> PHIToResolve;
for (const BasicBlock &BI : *OldFunc) {
- Value *V = VMap[&BI];
+ Value *V = VMap.lookup(&BI);
BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
if (!NewBB) continue; // Dead block.
@@ -519,7 +479,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
OPN = PHIToResolve[phino];
PHINode *PN = cast<PHINode>(VMap[OPN]);
for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
- Value *V = VMap[PN->getIncomingBlock(pred)];
+ Value *V = VMap.lookup(PN->getIncomingBlock(pred));
if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
Value *InVal = MapValue(PN->getIncomingValue(pred),
VMap,
@@ -529,7 +489,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
PN->setIncomingBlock(pred, MappedBlock);
} else {
PN->removeIncomingValue(pred, false);
- --pred, --e; // Revisit the next entry.
+ --pred; // Revisit the next entry.
+ --e;
}
}
}
@@ -558,10 +519,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// entries.
BasicBlock::iterator I = NewBB->begin();
for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(),
- E = PredCount.end(); PCI != E; ++PCI) {
- BasicBlock *Pred = PCI->first;
- for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove)
+ for (const auto &PCI : PredCount) {
+ BasicBlock *Pred = PCI.first;
+ for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
PN->removeIncomingValue(Pred, false);
}
}
@@ -684,7 +644,7 @@ void llvm::remapInstructionsInBlocks(
for (auto *BB : Blocks)
for (auto &Inst : *BB)
RemapInstruction(&Inst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
}
/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
@@ -697,6 +657,8 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
const Twine &NameSuffix, LoopInfo *LI,
DominatorTree *DT,
SmallVectorImpl<BasicBlock *> &Blocks) {
+ assert(OrigLoop->getSubLoops().empty() &&
+ "Loop to be cloned cannot have inner loop");
Function *F = OrigLoop->getHeader()->getParent();
Loop *ParentLoop = OrigLoop->getParentLoop();
@@ -727,13 +689,19 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
// Update LoopInfo.
NewLoop->addBasicBlockToLoop(NewBB, *LI);
- // Update DominatorTree.
- BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
- DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ // Add DominatorTree node. After seeing all blocks, update to correct IDom.
+ DT->addNewBlock(NewBB, NewPH);
Blocks.push_back(NewBB);
}
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
+ cast<BasicBlock>(VMap[IDomBB]));
+ }
+
// Move them physically from the end of the block list.
F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
NewPH);
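
With the signature change above, CloneFunction takes a non-const Function, drops the ModuleLevelChanges flag, and inserts the clone directly into F's own module; the old CloneDebugInfoMetadata path is removed in favor of the function-level metadata copy now done inside CloneFunctionInto. A hedged sketch of the new call site follows; the helper name is illustrative and CodeInfo is passed explicitly in case the header default differs in this revision.

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;

// Clone F next to the original inside the same module and give the copy a
// distinguishable name. VMap ends up mapping the old arguments and blocks to
// their clones.
static Function *cloneIntoSameModule(Function *F) {
  ValueToValueMapTy VMap;
  Function *NewF = CloneFunction(F, VMap, /*CodeInfo=*/nullptr);
  NewF->setName(F->getName() + ".clone");
  return NewF;
}
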
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index ab083353ece6..17e34c4ffa0f 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<Module> llvm::CloneModule(const Module *M,
std::unique_ptr<Module> llvm::CloneModule(
const Module *M, ValueToValueMapTy &VMap,
- std::function<bool(const GlobalValue *)> ShouldCloneDefinition) {
+ function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
// First off, we need to create the new module.
std::unique_ptr<Module> New =
llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());
@@ -53,7 +53,7 @@ std::unique_ptr<Module> llvm::CloneModule(
for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
I != E; ++I) {
GlobalVariable *GV = new GlobalVariable(*New,
- I->getType()->getElementType(),
+ I->getValueType(),
I->isConstant(), I->getLinkage(),
(Constant*) nullptr, I->getName(),
(GlobalVariable*) nullptr,
@@ -64,12 +64,11 @@ std::unique_ptr<Module> llvm::CloneModule(
}
// Loop over the functions in the module, making external functions as before
- for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
- Function *NF =
- Function::Create(cast<FunctionType>(I->getType()->getElementType()),
- I->getLinkage(), I->getName(), New.get());
- NF->copyAttributesFrom(&*I);
- VMap[&*I] = NF;
+ for (const Function &I : *M) {
+ Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),
+ I.getLinkage(), I.getName(), New.get());
+ NF->copyAttributesFrom(&I);
+ VMap[&I] = NF;
}
// Loop over the aliases in the module
@@ -109,6 +108,9 @@ std::unique_ptr<Module> llvm::CloneModule(
//
for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
I != E; ++I) {
+ if (I->isDeclaration())
+ continue;
+
GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]);
if (!ShouldCloneDefinition(&*I)) {
// Skip after setting the correct linkage for an external reference.
@@ -121,27 +123,31 @@ std::unique_ptr<Module> llvm::CloneModule(
// Similarly, copy over function bodies now...
//
- for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
- Function *F = cast<Function>(VMap[&*I]);
- if (!ShouldCloneDefinition(&*I)) {
+ for (const Function &I : *M) {
+ if (I.isDeclaration())
+ continue;
+
+ Function *F = cast<Function>(VMap[&I]);
+ if (!ShouldCloneDefinition(&I)) {
// Skip after setting the correct linkage for an external reference.
F->setLinkage(GlobalValue::ExternalLinkage);
+ // Personality function is not valid on a declaration.
+ F->setPersonalityFn(nullptr);
continue;
}
- if (!I->isDeclaration()) {
- Function::arg_iterator DestI = F->arg_begin();
- for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
- ++J) {
- DestI->setName(J->getName());
- VMap[&*J] = &*DestI++;
- }
-
- SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ Function::arg_iterator DestI = F->arg_begin();
+ for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
+ ++J) {
+ DestI->setName(J->getName());
+ VMap[&*J] = &*DestI++;
}
- if (I->hasPersonalityFn())
- F->setPersonalityFn(MapValue(I->getPersonalityFn(), VMap));
+ SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ if (I.hasPersonalityFn())
+ F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
}
// And aliases
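
The CloneModule changes above switch the callback parameter to function_ref, skip declarations when copying initializers and bodies, and clear the personality function on definitions that are turned back into declarations. A hypothetical caller of the callback overload is sketched below; the selection predicate and helper name are purely illustrative, while the CloneModule signature matches the hunk.

#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>
using namespace llvm;

// Clone M, but keep a definition only for globals whose name starts with
// "keep."; everything else becomes an external declaration in the clone.
static std::unique_ptr<Module> cloneSelected(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(&M, VMap, [](const GlobalValue *GV) {
    return GV->getName().startswith("keep.");
  });
}
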
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 823696d88e65..9f2181f87cee 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -77,15 +77,15 @@ static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
// Loop over the blocks, adding them to our set-vector, and aborting with an
// empty set if we encounter invalid blocks.
- for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) {
- if (!Result.insert(*I))
+ do {
+ if (!Result.insert(*BBBegin))
llvm_unreachable("Repeated basic blocks in extraction input");
- if (!isBlockValidForExtraction(**I)) {
+ if (!isBlockValidForExtraction(**BBBegin)) {
Result.clear();
return Result;
}
- }
+ } while (++BBBegin != BBEnd);
#ifndef NDEBUG
for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()),
@@ -159,23 +159,18 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
void CodeExtractor::findInputsOutputs(ValueSet &Inputs,
ValueSet &Outputs) const {
- for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(),
- E = Blocks.end();
- I != E; ++I) {
- BasicBlock *BB = *I;
-
+ for (BasicBlock *BB : Blocks) {
// If a used value is defined outside the region, it's an input. If an
// instruction is used outside the region, it's an output.
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
- II != IE; ++II) {
- for (User::op_iterator OI = II->op_begin(), OE = II->op_end();
- OI != OE; ++OI)
+ for (Instruction &II : *BB) {
+ for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE;
+ ++OI)
if (definedInCaller(Blocks, *OI))
Inputs.insert(*OI);
- for (User *U : II->users())
+ for (User *U : II.users())
if (!definedInRegion(Blocks, U)) {
- Outputs.insert(&*II);
+ Outputs.insert(&II);
break;
}
}
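
findInputsOutputs, now written with range-based loops, still classifies values defined outside the region as inputs and region instructions used outside it as outputs. A rough sketch of querying it is below; the CodeExtractor constructor form and the ValueSet typedef are assumptions about this revision's header, and only findInputsOutputs itself appears in the hunk above.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

// Print how many values flow into and out of a candidate extraction region.
static void reportRegionBoundary(ArrayRef<BasicBlock *> Region,
                                 DominatorTree &DT) {
  CodeExtractor CE(Region, &DT);           // assumed constructor shape
  CodeExtractor::ValueSet Inputs, Outputs; // assumed public typedef
  CE.findInputsOutputs(Inputs, Outputs);
  dbgs() << "region inputs: " << Inputs.size()
         << ", outputs: " << Outputs.size() << "\n";
}
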
@@ -263,25 +258,21 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
}
void CodeExtractor::splitReturnBlocks() {
- for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
- I != E; ++I)
- if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) {
+ for (BasicBlock *Block : Blocks)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
BasicBlock *New =
- (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret");
+ Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
if (DT) {
// Old dominates New. New node dominates all other nodes dominated
// by Old.
- DomTreeNode *OldNode = DT->getNode(*I);
- SmallVector<DomTreeNode*, 8> Children;
- for (DomTreeNode::iterator DI = OldNode->begin(), DE = OldNode->end();
- DI != DE; ++DI)
- Children.push_back(*DI);
+ DomTreeNode *OldNode = DT->getNode(Block);
+ SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
+ OldNode->end());
- DomTreeNode *NewNode = DT->addNewBlock(New, *I);
+ DomTreeNode *NewNode = DT->addNewBlock(New, Block);
- for (SmallVectorImpl<DomTreeNode *>::iterator I = Children.begin(),
- E = Children.end(); I != E; ++I)
- DT->changeImmediateDominator(*I, NewNode);
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
}
}
}
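
The splitReturnBlocks() hunk snapshots OldNode's children into a SmallVector before re-parenting them, because changeImmediateDominator() mutates the child list that would otherwise be under iteration. A self-contained sketch of the same pattern, with a toy tree node standing in for DomTreeNode:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct ToyNode {
      ToyNode *Parent = nullptr;
      std::vector<ToyNode *> Children;
    };

    static void changeParent(ToyNode *Child, ToyNode *NewParent) {
      ToyNode *Old = Child->Parent;
      Old->Children.erase(
          std::find(Old->Children.begin(), Old->Children.end(), Child));
      NewParent->Children.push_back(Child);
      Child->Parent = NewParent;
    }

    int main() {
      ToyNode Old, New, A, B;
      A.Parent = B.Parent = &Old;
      Old.Children = {&A, &B};

      // Copy first: iterating Old.Children directly while calling
      // changeParent would invalidate the iteration.
      std::vector<ToyNode *> Snapshot(Old.Children.begin(), Old.Children.end());
      for (ToyNode *C : Snapshot)
        changeParent(C, &New);

      assert(Old.Children.empty() && New.Children.size() == 2);
    }
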
@@ -310,28 +301,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
std::vector<Type*> paramTy;
// Add the types of the input values to the function's argument list
- for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end();
- i != e; ++i) {
- const Value *value = *i;
+ for (Value *value : inputs) {
DEBUG(dbgs() << "value used in func: " << *value << "\n");
paramTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
- for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end();
- I != E; ++I) {
- DEBUG(dbgs() << "instr used in func: " << **I << "\n");
+ for (Value *output : outputs) {
+ DEBUG(dbgs() << "instr used in func: " << *output << "\n");
if (AggregateArgs)
- paramTy.push_back((*I)->getType());
+ paramTy.push_back(output->getType());
else
- paramTy.push_back(PointerType::getUnqual((*I)->getType()));
+ paramTy.push_back(PointerType::getUnqual(output->getType()));
}
- DEBUG(dbgs() << "Function type: " << *RetTy << " f(");
- for (std::vector<Type*>::iterator i = paramTy.begin(),
- e = paramTy.end(); i != e; ++i)
- DEBUG(dbgs() << **i << ", ");
- DEBUG(dbgs() << ")\n");
+ DEBUG({
+ dbgs() << "Function type: " << *RetTy << " f(";
+ for (Type *i : paramTy)
+ dbgs() << *i << ", ";
+ dbgs() << ")\n";
+ });
StructType *StructTy;
if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
@@ -372,9 +361,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
RewriteVal = &*AI++;
std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end());
- for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end();
- use != useE; ++use)
- if (Instruction* inst = dyn_cast<Instruction>(*use))
+ for (User *use : Users)
+ if (Instruction *inst = dyn_cast<Instruction>(use))
if (Blocks.count(inst->getParent()))
inst->replaceUsesOfWith(inputs[i], RewriteVal);
}
@@ -429,19 +417,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
LLVMContext &Context = newFunction->getContext();
// Add inputs as params, or to be filled into the struct
- for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+ for (Value *input : inputs)
if (AggregateArgs)
- StructValues.push_back(*i);
+ StructValues.push_back(input);
else
- params.push_back(*i);
+ params.push_back(input);
// Create allocas for the outputs
- for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+ for (Value *output : outputs) {
if (AggregateArgs) {
- StructValues.push_back(*i);
+ StructValues.push_back(output);
} else {
AllocaInst *alloca =
- new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc",
+ new AllocaInst(output->getType(), nullptr, output->getName() + ".loc",
&codeReplacer->getParent()->front().front());
ReloadOutputs.push_back(alloca);
params.push_back(alloca);
@@ -522,9 +510,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
unsigned switchVal = 0;
- for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
- e = Blocks.end(); i != e; ++i) {
- TerminatorInst *TI = (*i)->getTerminator();
+ for (BasicBlock *Block : Blocks) {
+ TerminatorInst *TI = Block->getTerminator();
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
if (!Blocks.count(TI->getSuccessor(i))) {
BasicBlock *OldTarget = TI->getSuccessor(i);
@@ -576,10 +563,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
// Make sure we are looking at the original successor block, not
// at a newly inserted exit block, which won't be in the dominator
// info.
- for (std::map<BasicBlock*, BasicBlock*>::iterator I =
- ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I)
- if (DefBlock == I->second) {
- DefBlock = I->first;
+ for (const auto &I : ExitBlockMap)
+ if (DefBlock == I.second) {
+ DefBlock = I.first;
break;
}
@@ -677,13 +663,12 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
- for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
- e = Blocks.end(); i != e; ++i) {
+ for (BasicBlock *Block : Blocks) {
// Delete the basic block from the old function, and the list of blocks
- oldBlocks.remove(*i);
+ oldBlocks.remove(Block);
// Insert this basic block into the new function
- newBlocks.push_back(*i);
+ newBlocks.push_back(Block);
}
}
@@ -721,9 +706,9 @@ Function *CodeExtractor::extractCodeRegion() {
findInputsOutputs(inputs, outputs);
SmallPtrSet<BasicBlock *, 1> ExitBlocks;
- for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
- I != E; ++I)
- for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
+ for (BasicBlock *Block : Blocks)
+ for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+ ++SI)
if (!Blocks.count(*SI))
ExitBlocks.insert(*SI);
NumExitBlocks = ExitBlocks.size();
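
The final hunk collects exit blocks: every successor of a region block that is not itself in the region. A standalone sketch of that computation over a toy CFG (plain strings and a map, not llvm::BasicBlock):

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      std::map<std::string, std::vector<std::string>> Succs = {
          {"entry", {"body"}}, {"body", {"body", "exit"}}, {"exit", {}}};
      std::set<std::string> Region = {"entry", "body"};

      std::set<std::string> ExitBlocks;
      for (const std::string &BB : Region)
        for (const std::string &S : Succs[BB])
          if (!Region.count(S))
            ExitBlocks.insert(S);

      for (const std::string &E : ExitBlocks)
        std::cout << "exit block: " << E << "\n"; // prints "exit"
    }
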
diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp
new file mode 100644
index 000000000000..cd130abf4519
--- /dev/null
+++ b/lib/Transforms/Utils/Evaluator.cpp
@@ -0,0 +1,596 @@
+//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Function evaluator for LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "evaluator"
+
+using namespace llvm;
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL);
+
+/// Return true if the specified constant can be handled by the code generator.
+/// We don't want to generate something like:
+/// void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
+static bool
+isSimpleEnoughValueToCommitHelper(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // Simple global addresses are supported; do not allow dllimport or
+ // thread-local globals.
+ if (auto *GV = dyn_cast<GlobalValue>(C))
+ return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
+
+ // Simple integer, undef, constant aggregate zero, etc are all supported.
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
+ return true;
+
+ // Aggregate values are safe if all their elements are.
+ if (isa<ConstantAggregate>(C)) {
+ for (Value *Op : C->operands())
+ if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
+ return false;
+ return true;
+ }
+
+ // We don't know exactly what relocations are allowed in constant expressions,
+ // so we allow &global+constantoffset, which is safe and uniformly supported
+ // across targets.
+ ConstantExpr *CE = cast<ConstantExpr>(C);
+ switch (CE->getOpcode()) {
+ case Instruction::BitCast:
+ // Bitcast is fine if the casted value is fine.
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ // int <=> ptr is fine if the int type is the same size as the
+ // pointer type.
+ if (DL.getTypeSizeInBits(CE->getType()) !=
+ DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ // GEP is fine if it is simple + constant offset.
+ case Instruction::GetElementPtr:
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantInt>(CE->getOperand(i)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::Add:
+ // We allow simple+cst.
+ if (!isa<ConstantInt>(CE->getOperand(1)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+ }
+ return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // If we already checked this constant, we win.
+ if (!SimpleConstants.insert(C).second)
+ return true;
+ // Check the constant.
+ return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
+}
+
+/// Return true if this constant is simple enough for us to understand. In
+/// particular, if it is a cast to anything other than from one pointer type to
+/// another pointer type, we punt. We basically just support direct accesses to
+/// globals and GEP's of globals. This should be kept up to date with
+/// CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+ // Conservatively, avoid aggregate types. This is because we don't
+ // want to worry about them partially overlapping other stores.
+ if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+ return false;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ // Do not allow weak/*_odr/linkonce linkage or external globals.
+ return GV->hasUniqueInitializer();
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ // Handle a constantexpr gep.
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0)) &&
+ cast<GEPOperator>(CE)->isInBounds()) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ if (!GV->hasUniqueInitializer())
+ return false;
+
+ // The first index must be zero.
+ ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
+ if (!CI || !CI->isZero()) return false;
+
+ // The remaining indices must be compile-time known integers within the
+ // notional bounds of the corresponding static array types.
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+ // A constantexpr bitcast from a pointer to another pointer is a no-op,
+ // and we know how to evaluate it by moving the bitcast from the pointer
+ // operand to the value operand.
+ } else if (CE->getOpcode() == Instruction::BitCast &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+ }
+ }
+
+ return false;
+}
+
+/// Return the value that would be computed by a load from P after the stores
+/// reflected by 'memory' have been performed. If we can't decide, return null.
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
+ // If this memory location has been recently stored, use the stored value: it
+ // is the most up-to-date.
+ DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P);
+ if (I != MutatedMemory.end()) return I->second;
+
+ // Access it.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return nullptr;
+ }
+
+ // Handle a constantexpr getelementptr.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ if (GV->hasDefinitiveInitializer())
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ }
+
+ return nullptr; // don't know how to evaluate.
+}
+
+/// Evaluate all instructions in block BB, returning true if successful, false
+/// if we can't evaluate it. NewBB returns the next BB that control flows into,
+/// or null upon return.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+ BasicBlock *&NextBB) {
+ // This is the main evaluation loop.
+ while (1) {
+ Constant *InstResult = nullptr;
+
+ DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+ if (!SI->isSimple()) {
+ DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+ Constant *Ptr = getVal(SI->getOperand(1));
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+ DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ }
+ if (!isSimpleEnoughPointerToCommit(Ptr)) {
+ // If this is too complex for us to commit, reject it.
+ DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
+ return false;
+ }
+
+ Constant *Val = getVal(SI->getOperand(0));
+
+ // If this might be too difficult for the backend to handle (e.g. the addr
+ // of one global variable divided by another) then we can't commit it.
+ if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
+ DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
+ << "\n");
+ return false;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
+ // If we're evaluating a store through a bitcast, then we need
+ // to pull the bitcast off the pointer type and push it onto the
+ // stored value.
+ Ptr = CE->getOperand(0);
+
+ Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
+
+ // In order to push the bitcast onto the stored value, a bitcast
+ // from NewTy to Val's type must be legal. If it's not, we can try
+ // introspecting NewTy to find a legal conversion.
+ while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+ // If NewTy is a struct, we can convert the pointer to the struct
+ // into a pointer to its first member.
+ // FIXME: This could be extended to support arrays as well.
+ if (StructType *STy = dyn_cast<StructType>(NewTy)) {
+ NewTy = STy->getTypeAtIndex(0U);
+
+ IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32);
+ Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+ Constant * const IdxList[] = {IdxZero, IdxZero};
+
+ Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+
+ // If we can't improve the situation by introspecting NewTy,
+ // we have to give up.
+ } else {
+ DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
+ return false;
+ }
+ }
+
+ // If we found compatible types, go ahead and push the bitcast
+ // onto the stored value.
+ Val = ConstantExpr::getBitCast(Val, NewTy);
+
+ DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ }
+ }
+
+ MutatedMemory[Ptr] = Val;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+ InstResult = ConstantExpr::get(BO->getOpcode(),
+ getVal(BO->getOperand(0)),
+ getVal(BO->getOperand(1)));
+ DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+ InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+ getVal(CI->getOperand(0)),
+ getVal(CI->getOperand(1)));
+ DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+ InstResult = ConstantExpr::getCast(CI->getOpcode(),
+ getVal(CI->getOperand(0)),
+ CI->getType());
+ DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+ InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+ getVal(SI->getOperand(1)),
+ getVal(SI->getOperand(2)));
+ DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getExtractValue(
+ getVal(EVI->getAggregateOperand()), EVI->getIndices());
+ DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getInsertValue(
+ getVal(IVI->getAggregateOperand()),
+ getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
+ DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+ Constant *P = getVal(GEP->getOperand(0));
+ SmallVector<Constant*, 8> GEPOps;
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+ i != e; ++i)
+ GEPOps.push_back(getVal(*i));
+ InstResult =
+ ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
+ cast<GEPOperator>(GEP)->isInBounds());
+ DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
+ << "\n");
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+
+ if (!LI->isSimple()) {
+ DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+
+ Constant *Ptr = getVal(LI->getOperand(0));
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ Ptr = ConstantFoldConstantExpression(CE, DL, TLI);
+ DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: " << *Ptr << "\n");
+ }
+ InstResult = ComputeLoadResult(Ptr);
+ if (!InstResult) {
+ DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
+ return false; // Could not evaluate load.
+ }
+
+ DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+ if (AI->isArrayAllocation()) {
+ DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ return false; // Cannot handle array allocs.
+ }
+ Type *Ty = AI->getAllocatedType();
+ AllocaTmps.push_back(
+ make_unique<GlobalVariable>(Ty, false, GlobalValue::InternalLinkage,
+ UndefValue::get(Ty), AI->getName()));
+ InstResult = AllocaTmps.back().get();
+ DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+ CallSite CS(&*CurInst);
+
+ // Debug info can safely be ignored here.
+ if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+ DEBUG(dbgs() << "Ignoring debug info.\n");
+ ++CurInst;
+ continue;
+ }
+
+ // Cannot handle inline asm.
+ if (isa<InlineAsm>(CS.getCalledValue())) {
+ DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ return false;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+ if (MSI->isVolatile()) {
+ DEBUG(dbgs() << "Can not optimize a volatile memset " <<
+ "intrinsic.\n");
+ return false;
+ }
+ Constant *Ptr = getVal(MSI->getDest());
+ Constant *Val = getVal(MSI->getValue());
+ Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+ if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+ // This memset is a no-op.
+ DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ ++CurInst;
+ continue;
+ }
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ // We don't insert an entry into Values, as it doesn't have a
+ // meaningful return value.
+ if (!II->use_empty()) {
+ DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n");
+ return false;
+ }
+ ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+ Value *PtrArg = getVal(II->getArgOperand(1));
+ Value *Ptr = PtrArg->stripPointerCasts();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ Type *ElemTy = GV->getValueType();
+ if (!Size->isAllOnesValue() &&
+ Size->getValue().getLimitedValue() >=
+ DL.getTypeStoreSize(ElemTy)) {
+ Invariants.insert(GV);
+ DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
+ << "\n");
+ } else {
+ DEBUG(dbgs() << "Found a global var, but can not treat it as an "
+ "invariant.\n");
+ }
+ }
+ // Continue even if we do nothing.
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::assume) {
+ DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ return false;
+ }
+
+ // Resolve function pointers.
+ Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+ if (!Callee || Callee->isInterposable()) {
+ DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ return false; // Cannot resolve.
+ }
+
+ SmallVector<Constant*, 8> Formals;
+ for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
+ Formals.push_back(getVal(*i));
+
+ if (Callee->isDeclaration()) {
+ // If this is a function we can constant fold, do it.
+ if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
+ InstResult = C;
+ DEBUG(dbgs() << "Constant folded function call. Result: " <<
+ *InstResult << "\n");
+ } else {
+ DEBUG(dbgs() << "Can not constant fold function call.\n");
+ return false;
+ }
+ } else {
+ if (Callee->getFunctionType()->isVarArg()) {
+ DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ return false;
+ }
+
+ Constant *RetVal = nullptr;
+ // Execute the call, if successful, use the return value.
+ ValueStack.emplace_back();
+ if (!EvaluateFunction(Callee, RetVal, Formals)) {
+ DEBUG(dbgs() << "Failed to evaluate function.\n");
+ return false;
+ }
+ ValueStack.pop_back();
+ InstResult = RetVal;
+
+ if (InstResult) {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
+ } else {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+ }
+ }
+ } else if (isa<TerminatorInst>(CurInst)) {
+ DEBUG(dbgs() << "Found a terminator instruction.\n");
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+ if (BI->isUnconditional()) {
+ NextBB = BI->getSuccessor(0);
+ } else {
+ ConstantInt *Cond =
+ dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+ if (!Cond) return false; // Cannot determine.
+
+ NextBB = BI->getSuccessor(!Cond->getZExtValue());
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+ ConstantInt *Val =
+ dyn_cast<ConstantInt>(getVal(SI->getCondition()));
+ if (!Val) return false; // Cannot determine.
+ NextBB = SI->findCaseValue(Val).getCaseSuccessor();
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+ Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+ NextBB = BA->getBasicBlock();
+ else
+ return false; // Cannot determine.
+ } else if (isa<ReturnInst>(CurInst)) {
+ NextBB = nullptr;
+ } else {
+ // invoke, unwind, resume, unreachable.
+ DEBUG(dbgs() << "Can not handle terminator.");
+ return false; // Cannot handle this terminator.
+ }
+
+ // We succeeded at evaluating this block!
+ DEBUG(dbgs() << "Successfully evaluated block.\n");
+ return true;
+ } else {
+ // Did not know how to evaluate this!
+ DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
+ return false;
+ }
+
+ if (!CurInst->use_empty()) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
+ InstResult = ConstantFoldConstantExpression(CE, DL, TLI);
+
+ setVal(&*CurInst, InstResult);
+ }
+
+ // If we just processed an invoke, we finished evaluating the block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+ NextBB = II->getNormalDest();
+ DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ return true;
+ }
+
+ // Advance program counter.
+ ++CurInst;
+ }
+}
+
+/// Evaluate a call to function F, returning true if successful, false if we
+/// can't evaluate it. ActualArgs contains the actual argument values for the
+/// function.
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
+ const SmallVectorImpl<Constant*> &ActualArgs) {
+ // Check to see if this function is already executing (recursion). If so,
+ // bail out. TODO: we might want to accept limited recursion.
+ if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+ return false;
+
+ CallStack.push_back(F);
+
+ // Initialize arguments to the incoming values specified.
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++ArgNo)
+ setVal(&*AI, ActualArgs[ArgNo]);
+
+ // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+ // we can only evaluate any one basic block at most once. This set keeps
+ // track of what we have executed so we can detect recursive cases etc.
+ SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+ // CurBB - The current basic block we're evaluating.
+ BasicBlock *CurBB = &F->front();
+
+ BasicBlock::iterator CurInst = CurBB->begin();
+
+ while (1) {
+ BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
+ DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
+ if (!EvaluateBlock(CurInst, NextBB))
+ return false;
+
+ if (!NextBB) {
+ // Successfully running until there's no next block means that we found
+ // the return. Fill in the return value and pop the call stack.
+ ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+ if (RI->getNumOperands())
+ RetVal = getVal(RI->getOperand(0));
+ CallStack.pop_back();
+ return true;
+ }
+
+ // Okay, we succeeded in evaluating this control flow. See if we have
+ // executed the new block before. If so, we have a looping function,
+ // which we cannot evaluate in reasonable time.
+ if (!ExecutedBlocks.insert(NextBB).second)
+ return false; // looped!
+
+ // Okay, we have never been in this block before. Check to see if there
+ // are any PHI nodes. If so, evaluate them with information about where
+ // we came from.
+ PHINode *PN = nullptr;
+ for (CurInst = NextBB->begin();
+ (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+ setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+
+ // Advance to the next block.
+ CurBB = NextBB;
+ }
+}
+
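
Evaluator is driven by callers such as GlobalOpt's static-constructor evaluation. A hedged driver sketch follows; the Evaluator(DL, TLI) constructor and the getMutatedMemory() accessor are assumptions about Evaluator.h (which this diff does not show), while EvaluateFunction's signature is taken from the code above.

    // Hedged driver sketch; constructor and accessor names are assumptions.
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Transforms/Utils/Evaluator.h"
    using namespace llvm;

    static bool tryEvaluateNullaryFunction(Function *F, const DataLayout &DL,
                                           const TargetLibraryInfo *TLI) {
      Evaluator Eval(DL, TLI);         // assumed constructor from Evaluator.h
      Constant *RetVal = nullptr;
      SmallVector<Constant *, 1> Args; // no actual arguments
      if (!Eval.EvaluateFunction(F, RetVal, Args))
        return false;                  // could not evaluate statically
      // A real caller (e.g. GlobalOpt) would now commit the mutated memory
      // back into the global initializers; the accessor name is an assumption.
      for (const auto &KV : Eval.getMutatedMemory())
        (void)KV;
      return true;
    }
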
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
new file mode 100644
index 000000000000..fcb25baf3216
--- /dev/null
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -0,0 +1,243 @@
+//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionImportGlobalProcessing class, used
+// to perform the necessary global value handling for function importing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+using namespace llvm;
+
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV, DenseSet<const GlobalValue *> *GlobalsToImport) {
+
+ // For an alias, we tie the definition to the base object. Extract it and recurse.
+ if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
+ if (GA->hasWeakAnyLinkage())
+ return false;
+ const GlobalObject *GO = GA->getBaseObject();
+ if (!GO->hasLinkOnceODRLinkage())
+ return false;
+ return FunctionImportGlobalProcessing::doImportAsDefinition(
+ GO, GlobalsToImport);
+ }
+ // Only import the globals requested for importing.
+ if (GlobalsToImport->count(SGV))
+ return true;
+ // Otherwise no.
+ return false;
+}
+
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV) {
+ if (!isPerformingImport())
+ return false;
+ return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
+ GlobalsToImport);
+}
+
+bool FunctionImportGlobalProcessing::doPromoteLocalToGlobal(
+ const GlobalValue *SGV) {
+ assert(SGV->hasLocalLinkage());
+ // Both the imported references and the original local variable must
+ // be promoted.
+ if (!isPerformingImport() && !isModuleExporting())
+ return false;
+
+ // Local const variables never need to be promoted unless they are address
+ // taken. The imported uses can simply use the clone created in this module.
+ // For now we are conservative in determining which variables are not
+ // address taken by checking the unnamed addr flag. To be more aggressive,
+ // the address taken information must be checked earlier during parsing
+ // of the module and recorded in the summary index for use when importing
+ // from that module.
+ auto *GVar = dyn_cast<GlobalVariable>(SGV);
+ if (GVar && GVar->isConstant() && GVar->hasGlobalUnnamedAddr())
+ return false;
+
+ if (GVar && GVar->hasSection())
+ // Some sections like "__DATA,__cfstring" are "magic" and promotion is not
+ // allowed. Just disable promotion on any GVar with sections right now.
+ return false;
+
+ // Eventually we only need to promote functions in the exporting module that
+ // are referenced by a potentially exported function (i.e. one that is in the
+ // summary index).
+ return true;
+}
+
+std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV) {
+ // For locals that must be promoted to global scope, ensure that
+ // the promoted name uniquely identifies the copy in the original module,
+ // using the ID assigned during combined index creation. When importing,
+ // we rename all locals (not just those that are promoted) in order to
+ // avoid naming conflicts between locals imported from different modules.
+ if (SGV->hasLocalLinkage() &&
+ (doPromoteLocalToGlobal(SGV) || isPerformingImport()))
+ return ModuleSummaryIndex::getGlobalNameForLocal(
+ SGV->getName(),
+ ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
+ return SGV->getName();
+}
+
+GlobalValue::LinkageTypes
+FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV) {
+ // Any local variable that is referenced by an exported function needs
+ // to be promoted to global scope. Since we don't currently know which
+ // functions reference which local variables/functions, we must treat
+ // all as potentially exported if this module is exporting anything.
+ if (isModuleExporting()) {
+ if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV))
+ return GlobalValue::ExternalLinkage;
+ return SGV->getLinkage();
+ }
+
+ // Otherwise, if we aren't importing, no linkage change is needed.
+ if (!isPerformingImport())
+ return SGV->getLinkage();
+
+ switch (SGV->getLinkage()) {
+ case GlobalValue::ExternalLinkage:
+ // External definitions are converted to available_externally
+ // definitions upon import, so that they are available for inlining
+ // and/or optimization, but are turned into declarations later
+ // during the EliminateAvailableExternally pass.
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ // An imported external declaration stays external.
+ return SGV->getLinkage();
+
+ case GlobalValue::AvailableExternallyLinkage:
+ // An imported available_externally definition converts
+ // to external if imported as a declaration.
+ if (!doImportAsDefinition(SGV))
+ return GlobalValue::ExternalLinkage;
+ // An imported available_externally declaration stays that way.
+ return SGV->getLinkage();
+
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ // These both stay the same when importing the definition.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakAnyLinkage:
+ // Can't import weak_any definitions correctly, or we might change the
+ // program semantics, since the linker will pick the first weak_any
+ // definition and importing would change the order they are seen by the
+ // linker. The module linking caller needs to enforce this.
+ assert(!doImportAsDefinition(SGV));
+ // If imported as a declaration, it becomes external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakODRLinkage:
+ // For weak_odr linkage, there is a guarantee that all copies will be
+ // equivalent, so the issue described above for weak_any does not exist,
+ // and the definition can be imported. It can be treated similarly
+ // to an imported externally visible global value.
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+
+ case GlobalValue::AppendingLinkage:
+ // It would be incorrect to import an appending linkage variable,
+ // since it would cause global constructors/destructors to be
+ // executed multiple times. This should have already been handled
+ // by linkIfNeeded, and we will assert in shouldLinkFromSource
+ // if we try to import, so we simply return AppendingLinkage.
+ return GlobalValue::AppendingLinkage;
+
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ // If we are promoting the local to global scope, it is handled
+ // similarly to a normal externally visible global.
+ if (doPromoteLocalToGlobal(SGV)) {
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+ }
+ // A non-promoted imported local definition stays local.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+
+ case GlobalValue::ExternalWeakLinkage:
+ // External weak doesn't apply to definitions, must be a declaration.
+ assert(!doImportAsDefinition(SGV));
+ // Linkage stays external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::CommonLinkage:
+ // Linkage stays common on definitions.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+ }
+
+ llvm_unreachable("unknown linkage type");
+}
+
+void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+ if (GV.hasLocalLinkage() &&
+ (doPromoteLocalToGlobal(&GV) || isPerformingImport())) {
+ GV.setName(getName(&GV));
+ GV.setLinkage(getLinkage(&GV));
+ if (!GV.hasLocalLinkage())
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ } else
+ GV.setLinkage(getLinkage(&GV));
+
+ // Remove functions imported as available externally defs from comdats,
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
+ // The IRMover should not have placed any imported declarations in
+ // a comdat, so the only declaration that should be in a comdat
+ // at this point would be a definition imported as available_externally.
+ assert(GO->hasAvailableExternallyLinkage() &&
+ "Expected comdat on definition (possibly available external)");
+ GO->setComdat(nullptr);
+ }
+}
+
+void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
+ if (!moduleCanBeRenamedForThinLTO(M)) {
+ // We would have blocked importing from this module by suppressing index
+ // generation. We still may be able to import into this module though.
+ assert(!isPerformingImport() &&
+ "Should have blocked importing from module with local used in ASM");
+ return;
+ }
+
+ for (GlobalVariable &GV : M.globals())
+ processGlobalForThinLTO(GV);
+ for (Function &SF : M)
+ processGlobalForThinLTO(SF);
+ for (GlobalAlias &GA : M.aliases())
+ processGlobalForThinLTO(GA);
+}
+
+bool FunctionImportGlobalProcessing::run() {
+ processGlobalsForThinLTO();
+ return false;
+}
+
+bool llvm::renameModuleForThinLTO(
+ Module &M, const ModuleSummaryIndex &Index,
+ DenseSet<const GlobalValue *> *GlobalsToImport) {
+ FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
+ return ThinLTOProcessing.run();
+}
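
A hedged sketch of calling the renameModuleForThinLTO() entry point declared above. Passing a null GlobalsToImport is assumed, based on the isPerformingImport() checks earlier in the file, to mean the module is only being prepared for export; a ThinLTO importer would instead pass the set of globals it has decided to pull in.

    // Hedged sketch; assumes FunctionImportUtils.h declares everything needed
    // (renameModuleForThinLTO and ModuleSummaryIndex), and that a null
    // GlobalsToImport means "not importing, only promoting for export".
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/FunctionImportUtils.h"
    using namespace llvm;

    static void prepareModuleForThinLTOExport(Module &M,
                                              const ModuleSummaryIndex &Index) {
      // Promote and rename locals so cross-module references remain valid.
      renameModuleForThinLTO(M, Index, /*GlobalsToImport=*/nullptr);
    }
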
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
index 3893a752503b..266be41fbead 100644
--- a/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -20,11 +20,11 @@ using namespace llvm;
/// and release, then return AcquireRelease.
///
static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
- if (X == Acquire && Y == Release)
- return AcquireRelease;
- if (Y == Acquire && X == Release)
- return AcquireRelease;
- return (AtomicOrdering)std::max(X, Y);
+ if (X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release)
+ return AtomicOrdering::AcquireRelease;
+ if (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release)
+ return AtomicOrdering::AcquireRelease;
+ return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
}
/// It is safe to destroy a constant iff it is only used by constants itself.
@@ -105,7 +105,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
}
}
- if (StoredVal == GV->getInitializer()) {
+ if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
if (GS.StoredType < GlobalStatus::InitializerStored)
GS.StoredType = GlobalStatus::InitializerStored;
} else if (isa<LoadInst>(StoredVal) &&
@@ -185,4 +185,4 @@ GlobalStatus::GlobalStatus()
: IsCompared(false), IsLoaded(false), StoredType(NotStored),
StoredOnceValue(nullptr), AccessingFunction(nullptr),
HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
- Ordering(NotAtomic) {}
+ Ordering(AtomicOrdering::NotAtomic) {}
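
The strongerOrdering() hunk switches to the scoped AtomicOrdering enum without changing behavior: Acquire combined with Release yields AcquireRelease, otherwise the numerically larger ordering wins. A self-contained restatement with a stand-in enum (not llvm::AtomicOrdering, whose exact numeric values may differ):

    #include <algorithm>
    #include <cassert>

    enum class Ordering { NotAtomic, Unordered, Monotonic, Acquire, Release,
                          AcquireRelease, SequentiallyConsistent };

    static Ordering stronger(Ordering X, Ordering Y) {
      if ((X == Ordering::Acquire && Y == Ordering::Release) ||
          (Y == Ordering::Acquire && X == Ordering::Release))
        return Ordering::AcquireRelease;
      return (Ordering)std::max((unsigned)X, (unsigned)Y);
    }

    int main() {
      assert(stronger(Ordering::Acquire, Ordering::Release) ==
             Ordering::AcquireRelease);
      assert(stronger(Ordering::Monotonic, Ordering::Acquire) ==
             Ordering::Acquire);
      assert(stronger(Ordering::NotAtomic, Ordering::SequentiallyConsistent) ==
             Ordering::SequentiallyConsistent);
    }
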
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 79282a2a703b..1fbb19d2b8ad 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -427,6 +427,17 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
continue;
+ // We do not need to (and in fact, cannot) convert possibly throwing calls
+ // to @llvm.experimental.deoptimize (resp. @llvm.experimental.guard) into
+ // invokes. The caller's "segment" of the deoptimization continuation
+ // attached to the newly inlined @llvm.experimental.deoptimize
+ // (resp. @llvm.experimental.guard) call should contain the exception
+ // handling logic, if any.
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
+ F->getIntrinsicID() == Intrinsic::experimental_guard)
+ continue;
+
if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
// This call is nested inside a funclet. If that funclet has an unwind
// destination within the inlinee, then unwinding out of this call would
@@ -677,6 +688,34 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
UnwindDest->removePredecessor(InvokeBB);
}
+/// When inlining a call site that has !llvm.mem.parallel_loop_access metadata,
+/// that metadata should be propagated to all memory-accessing cloned
+/// instructions.
+static void PropagateParallelLoopAccessMetadata(CallSite CS,
+ ValueToValueMapTy &VMap) {
+ MDNode *M =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (!M)
+ return;
+
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ if (MDNode *PM = NI->getMetadata(LLVMContext::MD_mem_parallel_loop_access)) {
+ M = MDNode::concatenate(PM, M);
+ NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M);
+ } else if (NI->mayReadOrWriteMemory()) {
+ NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M);
+ }
+ }
+}
+
/// When inlining a function that contains noalias scope metadata,
/// this metadata needs to be cloned so that the inlined blocks
/// have different "unique scopes" at every call site. Were this not done, then
@@ -693,12 +732,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
// inter-procedural alias analysis passes. We can revisit this if it becomes
// an efficiency or overhead problem.
- for (Function::const_iterator I = CalledFunc->begin(), IE = CalledFunc->end();
- I != IE; ++I)
- for (BasicBlock::const_iterator J = I->begin(), JE = I->end(); J != JE; ++J) {
- if (const MDNode *M = J->getMetadata(LLVMContext::MD_alias_scope))
+ for (const BasicBlock &I : *CalledFunc)
+ for (const Instruction &J : I) {
+ if (const MDNode *M = J.getMetadata(LLVMContext::MD_alias_scope))
MD.insert(M);
- if (const MDNode *M = J->getMetadata(LLVMContext::MD_noalias))
+ if (const MDNode *M = J.getMetadata(LLVMContext::MD_noalias))
MD.insert(M);
}
@@ -720,20 +758,18 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
// the noalias scopes and the lists of those scopes.
SmallVector<TempMDTuple, 16> DummyNodes;
DenseMap<const MDNode *, TrackingMDNodeRef> MDMap;
- for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
- I != IE; ++I) {
+ for (const MDNode *I : MD) {
DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None));
- MDMap[*I].reset(DummyNodes.back().get());
+ MDMap[I].reset(DummyNodes.back().get());
}
// Create new metadata nodes to replace the dummy nodes, replacing old
// metadata references with either a dummy node or an already-created new
// node.
- for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
- I != IE; ++I) {
+ for (const MDNode *I : MD) {
SmallVector<Metadata *, 4> NewOps;
- for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) {
- const Metadata *V = (*I)->getOperand(i);
+ for (unsigned i = 0, ie = I->getNumOperands(); i != ie; ++i) {
+ const Metadata *V = I->getOperand(i);
if (const MDNode *M = dyn_cast<MDNode>(V))
NewOps.push_back(MDMap[M]);
else
@@ -741,7 +777,7 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
}
MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps);
- MDTuple *TempM = cast<MDTuple>(MDMap[*I]);
+ MDTuple *TempM = cast<MDTuple>(MDMap[I]);
assert(TempM->isTemporary() && "Expected temporary node");
TempM->replaceAllUsesWith(NewM);
@@ -801,10 +837,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
const Function *CalledFunc = CS.getCalledFunction();
SmallVector<const Argument *, 4> NoAliasArgs;
- for (const Argument &I : CalledFunc->args()) {
- if (I.hasNoAliasAttr() && !I.hasNUses(0))
- NoAliasArgs.push_back(&I);
- }
+ for (const Argument &Arg : CalledFunc->args())
+ if (Arg.hasNoAliasAttr() && !Arg.use_empty())
+ NoAliasArgs.push_back(&Arg);
if (NoAliasArgs.empty())
return;
@@ -885,17 +920,16 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
IsArgMemOnlyCall = true;
}
- for (ImmutableCallSite::arg_iterator AI = ICS.arg_begin(),
- AE = ICS.arg_end(); AI != AE; ++AI) {
+ for (Value *Arg : ICS.args()) {
// We need to check the underlying objects of all arguments, not just
// the pointer arguments, because we might be passing pointers as
// integers, etc.
// However, if we know that the call only accesses pointer arguments,
// then we only need to check the pointer arguments.
- if (IsArgMemOnlyCall && !(*AI)->getType()->isPointerTy())
+ if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
continue;
- PtrArgs.push_back(*AI);
+ PtrArgs.push_back(Arg);
}
}
@@ -913,9 +947,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
SmallVector<Metadata *, 4> Scopes, NoAliases;
SmallSetVector<const Argument *, 4> NAPtrArgs;
- for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) {
+ for (const Value *V : PtrArgs) {
SmallVector<Value *, 4> Objects;
- GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]),
+ GetUnderlyingObjects(const_cast<Value*>(V),
Objects, DL, /* LI = */ nullptr);
for (Value *O : Objects)
@@ -1228,7 +1262,8 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
/// Rebuild the entire inlined-at chain for this instruction so that the top of
/// the chain now is inlined-at the new call site.
static DebugLoc
-updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx,
+updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
+ LLVMContext &Ctx,
DenseMap<const DILocation *, DILocation *> &IANodes) {
SmallVector<DILocation *, 3> InlinedAtLocations;
DILocation *Last = InlinedAtNode;
@@ -1249,8 +1284,7 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx,
// Starting from the top, rebuild the nodes to point to the new inlined-at
// location (then rebuilding the rest of the chain behind it) and update the
// map of already-constructed inlined-at nodes.
- for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(),
- InlinedAtLocations.rend())) {
+ for (const DILocation *MD : reverse(InlinedAtLocations)) {
Last = IANodes[MD] = DILocation::getDistinct(
Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
}
@@ -1264,7 +1298,7 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx,
/// to encode location where these instructions are inlined.
static void fixupLineNumbers(Function *Fn, Function::iterator FI,
Instruction *TheCall) {
- DebugLoc TheCallDL = TheCall->getDebugLoc();
+ const DebugLoc &TheCallDL = TheCall->getDebugLoc();
if (!TheCallDL)
return;
@@ -1422,6 +1456,19 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
}
+ // Determine if we are dealing with a call in an EHPad which does not unwind
+ // to caller.
+ bool EHPadForCallUnwindsLocally = false;
+ if (CallSiteEHPad && CS.isCall()) {
+ UnwindDestMemoTy FuncletUnwindMap;
+ Value *CallSiteUnwindDestToken =
+ getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+
+ EHPadForCallUnwindsLocally =
+ CallSiteUnwindDestToken &&
+ !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
+ }
+
// Get an iterator to the last basic block in the function, which will have
// the new function inlined after it.
Function::iterator LastBlock = --Caller->end();
@@ -1552,6 +1599,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Add noalias metadata if necessary.
AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR);
+ // Propagate llvm.mem.parallel_loop_access if necessary.
+ PropagateParallelLoopAccessMetadata(CS, VMap);
+
// FIXME: We could register any cloned assumptions instead of clearing the
// whole function's cache.
if (IFI.ACT)
@@ -1602,7 +1652,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false);
}
- bool InlinedMustTailCalls = false;
+ bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
if (InlinedFunctionInfo.ContainsCalls) {
CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
if (CallInst *CI = dyn_cast<CallInst>(TheCall))
@@ -1615,6 +1665,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (!CI)
continue;
+ if (Function *F = CI->getCalledFunction())
+ InlinedDeoptimizeCalls |=
+ F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+
// We need to reduce the strength of any inlined tail calls. For
// musttail, we have to avoid introducing potential unbounded stack
// growth. For example, if functions 'f' and 'g' are mutually recursive
@@ -1677,11 +1731,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
builder.CreateLifetimeStart(AI, AllocaSize);
for (ReturnInst *RI : Returns) {
- // Don't insert llvm.lifetime.end calls between a musttail call and a
- // return. The return kills all local allocas.
+ // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
+ // call and a return. The return kills all local allocas.
if (InlinedMustTailCalls &&
RI->getParent()->getTerminatingMustTailCall())
continue;
+ if (InlinedDeoptimizeCalls &&
+ RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
}
}
@@ -1702,10 +1759,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Insert a call to llvm.stackrestore before any return instructions in the
// inlined function.
for (ReturnInst *RI : Returns) {
- // Don't insert llvm.stackrestore calls between a musttail call and a
- // return. The return will restore the stack pointer.
+ // Don't insert llvm.stackrestore calls between a musttail or deoptimize
+ // call and a return. The return will restore the stack pointer.
if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
continue;
+ if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
}
}
@@ -1758,7 +1817,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I);
else
NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I);
- NewInst->setDebugLoc(I->getDebugLoc());
NewInst->takeName(I);
I->replaceAllUsesWith(NewInst);
I->eraseFromParent();
@@ -1766,6 +1824,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
OpBundles.clear();
}
+ // It is problematic if the inlinee has a cleanupret which unwinds to
+ // caller and we inline it into a call site which doesn't unwind but into
+ // an EH pad that does. Such an edge must be dynamically unreachable.
+ // As such, we replace the cleanupret with unreachable.
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
+ if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
+ changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
+
Instruction *I = BB->getFirstNonPHI();
if (!I->isEHPad())
continue;
@@ -1781,6 +1847,64 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
}
+ if (InlinedDeoptimizeCalls) {
+ // We need to at least remove the deoptimizing returns from the Return set,
+ // so that the control flow from those returns does not get merged into the
+ // caller (but terminate it instead). If the caller's return type does not
+ // match the callee's return type, we also need to change the return type of
+ // the intrinsic.
+ if (Caller->getReturnType() == TheCall->getType()) {
+ auto NewEnd = remove_if(Returns, [](ReturnInst *RI) {
+ return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
+ });
+ Returns.erase(NewEnd, Returns.end());
+ } else {
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
+ Caller->getParent(), Intrinsic::experimental_deoptimize,
+ {Caller->getReturnType()});
+
+ for (ReturnInst *RI : Returns) {
+ CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
+ if (!DeoptCall) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+
+ // The calling convention on the deoptimize call itself may be bogus,
+ // since the code we're inlining may have undefined behavior (and may
+ // never actually execute at runtime); but all
+ // @llvm.experimental.deoptimize declarations have to have the same
+ // calling convention in a well-formed module.
+ auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
+ NewDeoptIntrinsic->setCallingConv(CallingConv);
+ auto *CurBB = RI->getParent();
+ RI->eraseFromParent();
+
+ SmallVector<Value *, 4> CallArgs(DeoptCall->arg_begin(),
+ DeoptCall->arg_end());
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ DeoptCall->getOperandBundlesAsDefs(OpBundles);
+ DeoptCall->eraseFromParent();
+ assert(!OpBundles.empty() &&
+ "Expected at least the deopt operand bundle");
+
+ IRBuilder<> Builder(CurBB);
+ CallInst *NewDeoptCall =
+ Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
+ NewDeoptCall->setCallingConv(CallingConv);
+ if (NewDeoptCall->getType()->isVoidTy())
+ Builder.CreateRetVoid();
+ else
+ Builder.CreateRet(NewDeoptCall);
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+ }
+
// Handle any inlined musttail call sites. In order for a new call site to be
// musttail, the source of the clone and the inlined call site must have been
// musttail. Therefore it's safe to return without merging control into the
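
One step in the new deoptimize handling above drops returns that end in a deoptimize call, so only the normal returns are merged back into the caller; it uses the standard erase/remove_if idiom. A standalone illustration with toy records standing in for ReturnInst:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct ToyReturn {
      bool EndsInDeoptCall;
    };

    int main() {
      std::vector<ToyReturn> Returns = {{false}, {true}, {false}, {true}};
      auto NewEnd =
          std::remove_if(Returns.begin(), Returns.end(),
                         [](const ToyReturn &R) { return R.EndsInDeoptCall; });
      Returns.erase(NewEnd, Returns.end());
      assert(Returns.size() == 2); // only the normal returns remain
    }
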
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index da890a297005..8a1973d1db05 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -37,13 +37,13 @@ namespace {
if (!AI->hasName() && !AI->getType()->isVoidTy())
AI->setName("arg");
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (!BB->hasName())
- BB->setName("bb");
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (!I->hasName() && !I->getType()->isVoidTy())
- I->setName("tmp");
+ for (BasicBlock &BB : F) {
+ if (!BB.hasName())
+ BB.setName("bb");
+
+ for (Instruction &I : BB)
+ if (!I.hasName() && !I.getType()->isVoidTy())
+ I.setName("tmp");
}
return true;
}
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
index 5687afa61e2a..5a90dcb033b2 100644
--- a/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -390,6 +390,8 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
Rem->getOperand(1), Builder);
+ // Check whether this is the insert point while Rem is still valid.
+ bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
Rem->replaceAllUsesWith(Remainder);
Rem->dropAllReferences();
Rem->eraseFromParent();
@@ -397,7 +399,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
// If we didn't actually generate an urem instruction, we're done
// This happens for example if the input were constant. In this case the
// Builder insertion point was unchanged
- if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked())
+ if (IsInsertPoint)
return true;
BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
@@ -446,6 +448,9 @@ bool llvm::expandDivision(BinaryOperator *Div) {
// Lower the code to unsigned division, and reset Div to point to the udiv.
Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
Div->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Div is still valid.
+ bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
Div->replaceAllUsesWith(Quotient);
Div->dropAllReferences();
Div->eraseFromParent();
@@ -453,7 +458,7 @@ bool llvm::expandDivision(BinaryOperator *Div) {
// If we didn't actually generate an udiv instruction, we're done
// This happens for example if the input were constant. In this case the
// Builder insertion point was unchanged
- if (Div == Builder.GetInsertPoint().getNodePtrUnchecked())
+ if (IsInsertPoint)
return true;
BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
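
Both IntegerDivision fixes capture the insert-point comparison while Rem/Div is still alive, because comparing against an iterator to an already-erased element is undefined behavior. A standalone sketch of the same pattern with std::list as a stand-in:

    #include <cassert>
    #include <iterator>
    #include <list>

    int main() {
      std::list<int> L = {1, 2, 3};
      std::list<int>::iterator InsertPoint = std::next(L.begin()); // points at 2
      std::list<int>::iterator Elem = std::next(L.begin());        // also 2

      bool IsInsertPoint = (Elem == InsertPoint); // check while Elem is valid
      L.erase(Elem); // Elem (and InsertPoint) are now invalidated

      assert(IsInsertPoint); // safe: the answer was captured before erasing
    }
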
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index b4b2e148dfbb..9658966779b9 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -27,10 +27,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -41,6 +42,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -52,154 +54,156 @@ STATISTIC(NumLCSSA, "Number of live out of a loop variables");
/// Return true if the specified block is in the list.
static bool isExitBlock(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (ExitBlocks[i] == BB)
- return true;
- return false;
+ return find(ExitBlocks, BB) != ExitBlocks.end();
}
-/// Given an instruction in the loop, check to see if it has any uses that are
-/// outside the current loop. If so, insert LCSSA PHI nodes and rewrite the
-/// uses.
-static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks,
- PredIteratorCache &PredCache, LoopInfo *LI) {
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+ DominatorTree &DT, LoopInfo &LI) {
SmallVector<Use *, 16> UsesToRewrite;
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ PredIteratorCache PredCache;
+ bool Changed = false;
- // Tokens cannot be used in PHI nodes, so we skip over them.
- // We can run into tokens which are live out of a loop with catchswitch
- // instructions in Windows EH if the catchswitch has one catchpad which
- // is inside the loop and another which is not.
- if (Inst.getType()->isTokenTy())
- return false;
+ while (!Worklist.empty()) {
+ UsesToRewrite.clear();
+ ExitBlocks.clear();
- BasicBlock *InstBB = Inst.getParent();
+ Instruction *I = Worklist.pop_back_val();
+ BasicBlock *InstBB = I->getParent();
+ Loop *L = LI.getLoopFor(InstBB);
+ L->getExitBlocks(ExitBlocks);
- for (Use &U : Inst.uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(U);
+ if (ExitBlocks.empty())
+ continue;
- if (InstBB != UserBB && !L.contains(UserBB))
- UsesToRewrite.push_back(&U);
- }
+ // Tokens cannot be used in PHI nodes, so we skip over them.
+ // We can run into tokens which are live out of a loop with catchswitch
+ // instructions in Windows EH if the catchswitch has one catchpad which
+ // is inside the loop and another which is not.
+ if (I->getType()->isTokenTy())
+ continue;
- // If there are no uses outside the loop, exit with no change.
- if (UsesToRewrite.empty())
- return false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = User->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(U);
- ++NumLCSSA; // We are applying the transformation
+ if (InstBB != UserBB && !L->contains(UserBB))
+ UsesToRewrite.push_back(&U);
+ }
- // Invoke instructions are special in that their result value is not available
- // along their unwind edge. The code below tests to see whether DomBB
- // dominates the value, so adjust DomBB to the normal destination block,
- // which is effectively where the value is first usable.
- BasicBlock *DomBB = Inst.getParent();
- if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))
- DomBB = Inv->getNormalDest();
+ // If there are no uses outside the loop, exit with no change.
+ if (UsesToRewrite.empty())
+ continue;
- DomTreeNode *DomNode = DT.getNode(DomBB);
+ ++NumLCSSA; // We are applying the transformation
- SmallVector<PHINode *, 16> AddedPHIs;
- SmallVector<PHINode *, 8> PostProcessPHIs;
+ // Invoke instructions are special in that their result value is not
+ // available along their unwind edge. The code below tests to see whether
+ // DomBB dominates the value, so adjust DomBB to the normal destination
+ // block, which is effectively where the value is first usable.
+ BasicBlock *DomBB = InstBB;
+ if (InvokeInst *Inv = dyn_cast<InvokeInst>(I))
+ DomBB = Inv->getNormalDest();
- SSAUpdater SSAUpdate;
- SSAUpdate.Initialize(Inst.getType(), Inst.getName());
+ DomTreeNode *DomNode = DT.getNode(DomBB);
- // Insert the LCSSA phi's into all of the exit blocks dominated by the
- // value, and add them to the Phi's map.
- for (BasicBlock *ExitBB : ExitBlocks) {
- if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
- continue;
+ SmallVector<PHINode *, 16> AddedPHIs;
+ SmallVector<PHINode *, 8> PostProcessPHIs;
- // If we already inserted something for this BB, don't reprocess it.
- if (SSAUpdate.HasValueForBlock(ExitBB))
- continue;
+ SSAUpdater SSAUpdate;
+ SSAUpdate.Initialize(I->getType(), I->getName());
- PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
- Inst.getName() + ".lcssa", &ExitBB->front());
+ // Insert the LCSSA phi's into all of the exit blocks dominated by the
+ // value, and add them to the Phi's map.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+ continue;
- // Add inputs from inside the loop for this PHI.
- for (BasicBlock *Pred : PredCache.get(ExitBB)) {
- PN->addIncoming(&Inst, Pred);
+ // If we already inserted something for this BB, don't reprocess it.
+ if (SSAUpdate.HasValueForBlock(ExitBB))
+ continue;
- // If the exit block has a predecessor not within the loop, arrange for
- // the incoming value use corresponding to that predecessor to be
- // rewritten in terms of a different LCSSA PHI.
- if (!L.contains(Pred))
- UsesToRewrite.push_back(
- &PN->getOperandUse(PN->getOperandNumForIncomingValue(
- PN->getNumIncomingValues() - 1)));
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(ExitBB),
+ I->getName() + ".lcssa", &ExitBB->front());
+
+ // Add inputs from inside the loop for this PHI.
+ for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+ PN->addIncoming(I, Pred);
+
+ // If the exit block has a predecessor not within the loop, arrange for
+ // the incoming value use corresponding to that predecessor to be
+ // rewritten in terms of a different LCSSA PHI.
+ if (!L->contains(Pred))
+ UsesToRewrite.push_back(
+ &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+ PN->getNumIncomingValues() - 1)));
+ }
+
+ AddedPHIs.push_back(PN);
+
+ // Remember that this phi makes the value alive in this block.
+ SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+ // LoopSimplify might fail to simplify some loops (e.g. when indirect
+ // branches are involved). In such situations, it might happen that an
+ // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
+ // create PHIs in such an exit block, we are also inserting PHIs into L2's
+ // header. This could break LCSSA form for L2 because these inserted PHIs
+ // can also have uses outside of L2. Remember all PHIs in such situations
+ // so we can revisit them later on. FIXME: Remove this if indirectbr support
+ // in LoopSimplify gets improved.

+ if (auto *OtherLoop = LI.getLoopFor(ExitBB))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(PN);
}
- AddedPHIs.push_back(PN);
-
- // Remember that this phi makes the value alive in this block.
- SSAUpdate.AddAvailableValue(ExitBB, PN);
-
- // LoopSimplify might fail to simplify some loops (e.g. when indirect
- // branches are involved). In such situations, it might happen that an exit
- // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
- // PHIs in such an exit block, we are also inserting PHIs into L2's header.
- // This could break LCSSA form for L2 because these inserted PHIs can also
- // have uses outside of L2. Remember all PHIs in such situation as to
- // revisit than later on. FIXME: Remove this if indirectbr support into
- // LoopSimplify gets improved.
- if (auto *OtherLoop = LI->getLoopFor(ExitBB))
- if (!L.contains(OtherLoop))
- PostProcessPHIs.push_back(PN);
- }
+ // Rewrite all uses outside the loop in terms of the new PHIs we just
+ // inserted.
+ for (Use *UseToRewrite : UsesToRewrite) {
+ // If this use is in an exit block, rewrite to use the newly inserted PHI.
+ // This is required for correctness because SSAUpdate doesn't handle uses
+ // in the same block. It assumes the PHI we inserted is at the end of the
+ // block.
+ Instruction *User = cast<Instruction>(UseToRewrite->getUser());
+ BasicBlock *UserBB = User->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(*UseToRewrite);
+
+ if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+ // Tell the VHs that the uses changed. This updates SCEV's caches.
+ if (UseToRewrite->get()->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front());
+ UseToRewrite->set(&UserBB->front());
+ continue;
+ }
- // Rewrite all uses outside the loop in terms of the new PHIs we just
- // inserted.
- for (Use *UseToRewrite : UsesToRewrite) {
- // If this use is in an exit block, rewrite to use the newly inserted PHI.
- // This is required for correctness because SSAUpdate doesn't handle uses in
- // the same block. It assumes the PHI we inserted is at the end of the
- // block.
- Instruction *User = cast<Instruction>(UseToRewrite->getUser());
- BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(*UseToRewrite);
-
- if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
- // Tell the VHs that the uses changed. This updates SCEV's caches.
- if (UseToRewrite->get()->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front());
- UseToRewrite->set(&UserBB->front());
- continue;
+ // Otherwise, do full PHI insertion.
+ SSAUpdate.RewriteUse(*UseToRewrite);
}
- // Otherwise, do full PHI insertion.
- SSAUpdate.RewriteUse(*UseToRewrite);
- }
+ // Post process PHI instructions that were inserted into another disjoint
+ // loop and update their exits properly.
+ for (auto *PostProcessPN : PostProcessPHIs) {
+ if (PostProcessPN->use_empty())
+ continue;
- // Post process PHI instructions that were inserted into another disjoint loop
- // and update their exits properly.
- for (auto *I : PostProcessPHIs) {
- if (I->use_empty())
- continue;
+ // Reprocess each PHI instruction.
+ Worklist.push_back(PostProcessPN);
+ }
- BasicBlock *PHIBB = I->getParent();
- Loop *OtherLoop = LI->getLoopFor(PHIBB);
- SmallVector<BasicBlock *, 8> EBs;
- OtherLoop->getExitBlocks(EBs);
- if (EBs.empty())
- continue;
+ // Remove PHI nodes that did not have any uses rewritten.
+ for (PHINode *PN : AddedPHIs)
+ if (PN->use_empty())
+ PN->eraseFromParent();
- // Recurse and re-process each PHI instruction. FIXME: we should really
- // convert this entire thing to a worklist approach where we process a
- // vector of instructions...
- processInstruction(*OtherLoop, *I, DT, EBs, PredCache, LI);
+ Changed = true;
}
-
- // Remove PHI nodes that did not have any uses rewritten.
- for (PHINode *PN : AddedPHIs)
- if (PN->use_empty())
- PN->eraseFromParent();
-
- return true;
+ return Changed;
}
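
The rewrite above turns the per-instruction helper into a public worklist API. A hedged usage sketch; the include path is an assumption (the formLCSSA entry points have historically been declared in LoopUtils.h):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h" // assumed home of formLCSSAForInstructions

// Queue every instruction of L and let the utility insert .lcssa PHIs and
// rewrite the uses that escape the loop; instructions with no escaping uses
// are simply skipped by its worklist loop.
static bool rewriteLiveOuts(llvm::Loop &L, llvm::DominatorTree &DT,
                            llvm::LoopInfo &LI) {
  llvm::SmallVector<llvm::Instruction *, 8> Worklist;
  for (llvm::BasicBlock *BB : L.blocks())
    for (llvm::Instruction &I : *BB)
      Worklist.push_back(&I);
  return llvm::formLCSSAForInstructions(Worklist, DT, LI);
}
```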
/// Return true if the specified block dominates at least
@@ -209,11 +213,9 @@ blockDominatesAnExit(BasicBlock *BB,
DominatorTree &DT,
const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
DomTreeNode *DomNode = DT.getNode(BB);
- for (BasicBlock *ExitBB : ExitBlocks)
- if (DT.dominates(DomNode, DT.getNode(ExitBB)))
- return true;
-
- return false;
+ return llvm::any_of(ExitBlocks, [&](BasicBlock *EB) {
+ return DT.dominates(DomNode, DT.getNode(EB));
+ });
}
bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
@@ -227,10 +229,10 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
if (ExitBlocks.empty())
return false;
- PredIteratorCache PredCache;
+ SmallVector<Instruction *, 8> Worklist;
// Look at all the instructions in the loop, checking to see if they have uses
- // outside the loop. If so, rewrite those uses.
+ // outside the loop. If so, put them into the worklist to rewrite those uses.
for (BasicBlock *BB : L.blocks()) {
// For large loops, avoid use-scanning by using dominance information: In
// particular, if a block does not dominate any of the loop exits, then none
@@ -246,9 +248,10 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
!isa<PHINode>(I.user_back())))
continue;
- Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI);
+ Worklist.push_back(&I);
}
}
+ Changed = formLCSSAForInstructions(Worklist, DT, *LI);
// If we modified the code, remove any caches about the loop from SCEV to
// avoid dangling entries.
@@ -274,11 +277,20 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
return Changed;
}
+/// Process all loops in the function, inner-most out.
+static bool formLCSSAOnAllLoops(LoopInfo *LI, DominatorTree &DT,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+ for (auto &L : *LI)
+ Changed |= formLCSSARecursively(*L, DT, LI, SE);
+ return Changed;
+}
+
namespace {
-struct LCSSA : public FunctionPass {
+struct LCSSAWrapperPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
- LCSSA() : FunctionPass(ID) {
- initializeLCSSAPass(*PassRegistry::getPassRegistry());
+ LCSSAWrapperPass() : FunctionPass(ID) {
+ initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
}
// Cached analysis information for the current function.
@@ -298,6 +310,7 @@ struct LCSSA : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreservedID(LoopSimplifyID);
AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<SCEVAAWrapperPass>();
@@ -305,30 +318,39 @@ struct LCSSA : public FunctionPass {
};
}
-char LCSSA::ID = 0;
-INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
+char LCSSAWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
-
-Pass *llvm::createLCSSAPass() { return new LCSSA(); }
-char &llvm::LCSSAID = LCSSA::ID;
+INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
+char &llvm::LCSSAID = LCSSAWrapperPass::ID;
-/// Process all loops in the function, inner-most out.
-bool LCSSA::runOnFunction(Function &F) {
- bool Changed = false;
+/// Transform \p F into loop-closed SSA form.
+bool LCSSAWrapperPass::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
SE = SEWP ? &SEWP->getSE() : nullptr;
- // Simplify each loop nest in the function.
- for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- Changed |= formLCSSARecursively(**I, *DT, LI, SE);
-
- return Changed;
+ return formLCSSAOnAllLoops(LI, *DT, SE);
}
+PreservedAnalyses LCSSAPass::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ if (!formLCSSAOnAllLoops(&LI, DT, SE))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ PreservedAnalyses PA;
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
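
With the legacy pass renamed to LCSSAWrapperPass and a new-PM LCSSAPass added, a new-pass-manager pipeline can schedule LCSSA formation directly. A minimal sketch, assuming the 2016-era PassBuilder / FunctionPassManager API:

```cpp
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/LCSSA.h"

// Run LCSSA formation over a single function under the new pass manager.
void runLCSSAOn(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM); // provides LoopAnalysis, DominatorTreeAnalysis, ...
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LCSSAPass());
  FPM.run(F, FAM);
}
```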
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index abc9b65f7a39..f1838d891466 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -42,11 +42,13 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "local"
@@ -148,9 +150,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
SmallVector<uint32_t, 8> Weights;
for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
++MD_i) {
- ConstantInt *CI =
- mdconst::dyn_extract<ConstantInt>(MD->getOperand(MD_i));
- assert(CI);
+ auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
Weights.push_back(CI->getValue().getZExtValue());
}
// Merge weight of this case to the default weight.
@@ -321,8 +321,12 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
II->getIntrinsicID() == Intrinsic::lifetime_end)
return isa<UndefValue>(II->getArgOperand(1));
- // Assumptions are dead if their condition is trivially true.
- if (II->getIntrinsicID() == Intrinsic::assume) {
+ // Assumptions are dead if their condition is trivially true. Guards on
+ // true are operationally no-ops. In the future we can consider more
+ // sophisticated tradeoffs for guards considering potential for check
+ // widening, but for now we keep things simple.
+ if (II->getIntrinsicID() == Intrinsic::assume ||
+ II->getIntrinsicID() == Intrinsic::experimental_guard) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
return !Cond->isZero();
@@ -452,14 +456,23 @@ simplifyAndDCEInstruction(Instruction *I,
if (Value *SimpleV = SimplifyInstruction(I, DL)) {
// Add the users to the worklist. CAREFUL: an instruction can use itself,
// in the case of a phi node.
- for (User *U : I->users())
- if (U != I)
+ for (User *U : I->users()) {
+ if (U != I) {
WorkList.insert(cast<Instruction>(U));
+ }
+ }
// Replace the instruction with its simplified value.
- I->replaceAllUsesWith(SimpleV);
- I->eraseFromParent();
- return true;
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(SimpleV);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
}
return false;
}
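
The hunk above splits the unconditional replace-and-erase into two independently guarded steps, so each half only happens when it applies and either alone counts as a change. A sketch of the resulting pattern, with SimpleV playing the role of SimplifyInstruction's result:

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Replace I's remaining uses with SimpleV, then erase I only if it is
// actually trivially dead; report whether anything changed.
static bool replaceThenMaybeErase(llvm::Instruction *I, llvm::Value *SimpleV,
                                  const llvm::TargetLibraryInfo *TLI) {
  bool Changed = false;
  if (!I->use_empty()) {
    I->replaceAllUsesWith(SimpleV);
    Changed = true;
  }
  if (llvm::isInstructionTriviallyDead(I, TLI)) {
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}
```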
@@ -486,7 +499,8 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
// Iterate over the original function, only adding insts to the worklist
// if they actually need to be revisited. This avoids having to pre-init
// the worklist with the entire function's worth of instructions.
- for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) {
+ for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
+ BI != E;) {
assert(!BI->isTerminator());
Instruction *I = &*BI;
++BI;
@@ -1025,7 +1039,8 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
///
/// See if there is a dbg.value intrinsic for DIVar before I.
-static bool LdStHasDebugValue(const DILocalVariable *DIVar, Instruction *I) {
+static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
+ Instruction *I) {
// Since we can't guarantee that the original dbg.declare intrinsic
// is removed by LowerDbgDeclare(), we need to make sure that we are
// not inserting the same dbg.value intrinsic over and over.
@@ -1035,7 +1050,8 @@ static bool LdStHasDebugValue(const DILocalVariable *DIVar, Instruction *I) {
if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI))
if (DVI->getValue() == I->getOperand(0) &&
DVI->getOffset() == 0 &&
- DVI->getVariable() == DIVar)
+ DVI->getVariable() == DIVar &&
+ DVI->getExpression() == DIExpr)
return true;
}
return false;
@@ -1049,9 +1065,6 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
- if (LdStHasDebugValue(DIVar, SI))
- return true;
-
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
Argument *ExtendedArg = nullptr;
@@ -1066,25 +1079,25 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
// to the alloca described by DDI, if its first operand is an extend,
// we're guaranteed that before extension, the value was narrower than
// the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> NewDIExpr;
+ SmallVector<uint64_t, 3> Ops;
unsigned PieceOffset = 0;
// If this already is a bit piece, we drop the bit piece from the expression
// and record the offset.
if (DIExpr->isBitPiece()) {
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
PieceOffset = DIExpr->getBitPieceOffset();
} else {
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
}
- NewDIExpr.push_back(dwarf::DW_OP_bit_piece);
- NewDIExpr.push_back(PieceOffset); //Offset
+ Ops.push_back(dwarf::DW_OP_bit_piece);
+ Ops.push_back(PieceOffset); // Offset
const DataLayout &DL = DDI->getModule()->getDataLayout();
- NewDIExpr.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar,
- Builder.createExpression(NewDIExpr),
- DDI->getDebugLoc(), SI);
- }
- else
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size
+ auto NewDIExpr = Builder.createExpression(Ops);
+ if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
+ Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
+ DDI->getDebugLoc(), SI);
+ } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
DDI->getDebugLoc(), SI);
return true;
@@ -1098,7 +1111,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
- if (LdStHasDebugValue(DIVar, LI))
+ if (LdStHasDebugValue(DIVar, DIExpr, LI))
return true;
// We are now tracking the loaded value instead of the address. In the
@@ -1140,12 +1153,14 @@ bool llvm::LowerDbgDeclare(Function &F) {
// the stack slot (and at a lexical-scope granularity). Later
// passes will attempt to elide the stack slot.
if (AI && !isArray(AI)) {
- for (User *U : AI->users())
- if (StoreInst *SI = dyn_cast<StoreInst>(U))
- ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
- else if (LoadInst *LI = dyn_cast<LoadInst>(U))
+ for (auto &AIUse : AI->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
// This is a call by-value or some other instruction that
// takes a pointer to the variable. Insert a *value*
// intrinsic that describes the alloca.
@@ -1157,6 +1172,7 @@ bool llvm::LowerDbgDeclare(Function &F) {
DIB.createExpression(NewDIExpr),
DDI->getDebugLoc(), CI);
}
+ }
DDI->eraseFromParent();
}
}
@@ -1175,6 +1191,38 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
return nullptr;
}
+static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) {
+ Expr.push_back(dwarf::DW_OP_deref);
+}
+
+static void DIExprAddOffset(SmallVectorImpl<uint64_t> &Expr, int Offset) {
+ if (Offset > 0) {
+ Expr.push_back(dwarf::DW_OP_plus);
+ Expr.push_back(Offset);
+ } else if (Offset < 0) {
+ Expr.push_back(dwarf::DW_OP_minus);
+ Expr.push_back(-Offset);
+ }
+}
+
+static DIExpression *BuildReplacementDIExpr(DIBuilder &Builder,
+ DIExpression *DIExpr, bool Deref,
+ int Offset) {
+ if (!Deref && !Offset)
+ return DIExpr;
+ // Create a copy of the original DIDescriptor for user variable, prepending
+ // "deref" operation to a list of address elements, as new llvm.dbg.declare
+ // will take a value storing address of the memory for variable, not
+ // alloca itself.
+ SmallVector<uint64_t, 4> NewDIExpr;
+ if (Deref)
+ DIExprAddDeref(NewDIExpr);
+ DIExprAddOffset(NewDIExpr, Offset);
+ if (DIExpr)
+ NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ return Builder.createExpression(NewDIExpr);
+}
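
For a concrete picture of what BuildReplacementDIExpr assembles: with Deref = true and Offset = 8, the new operations are prepended so they apply to the replacement address before the variable's original expression runs. A standalone sketch (the Dwarf.h include path is an assumption):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Dwarf.h" // assumed include path for the DW_OP_* constants

// Element list for Deref = true, Offset = 8, given the original expression's
// elements Orig; the result is what DIBuilder::createExpression() would get.
llvm::SmallVector<uint64_t, 8> derefPlusEight(llvm::ArrayRef<uint64_t> Orig) {
  llvm::SmallVector<uint64_t, 8> Ops;
  Ops.push_back(llvm::dwarf::DW_OP_deref); // read through the new address first
  Ops.push_back(llvm::dwarf::DW_OP_plus);  // then add the byte offset
  Ops.push_back(8);
  Ops.append(Orig.begin(), Orig.end());    // original expression comes last
  return Ops;
}
```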
+
bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
Instruction *InsertBefore, DIBuilder &Builder,
bool Deref, int Offset) {
@@ -1186,25 +1234,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
- if (Deref || Offset) {
- // Create a copy of the original DIDescriptor for user variable, prepending
- // "deref" operation to a list of address elements, as new llvm.dbg.declare
- // will take a value storing address of the memory for variable, not
- // alloca itself.
- SmallVector<uint64_t, 4> NewDIExpr;
- if (Deref)
- NewDIExpr.push_back(dwarf::DW_OP_deref);
- if (Offset > 0) {
- NewDIExpr.push_back(dwarf::DW_OP_plus);
- NewDIExpr.push_back(Offset);
- } else if (Offset < 0) {
- NewDIExpr.push_back(dwarf::DW_OP_minus);
- NewDIExpr.push_back(-Offset);
- }
- if (DIExpr)
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
- DIExpr = Builder.createExpression(NewDIExpr);
- }
+ DIExpr = BuildReplacementDIExpr(Builder, DIExpr, Deref, Offset);
// Insert llvm.dbg.declare immediately after the original alloca, and remove
// old llvm.dbg.declare.
@@ -1219,12 +1249,73 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
Deref, Offset);
}
-void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
+ DIBuilder &Builder, int Offset) {
+ DebugLoc Loc = DVI->getDebugLoc();
+ auto *DIVar = DVI->getVariable();
+ auto *DIExpr = DVI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ // This is an alloca-based llvm.dbg.value. The first thing it should do with
+ // the alloca pointer is dereference it. Otherwise we don't know how to handle
+ // it and give up.
+ if (!DIExpr || DIExpr->getNumElements() < 1 ||
+ DIExpr->getElement(0) != dwarf::DW_OP_deref)
+ return;
+
+ // Insert the offset immediately after the first deref.
+ // We could just change the offset argument of dbg.value, but it's unsigned...
+ if (Offset) {
+ SmallVector<uint64_t, 4> NewDIExpr;
+ DIExprAddDeref(NewDIExpr);
+ DIExprAddOffset(NewDIExpr, Offset);
+ NewDIExpr.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
+ DIExpr = Builder.createExpression(NewDIExpr);
+ }
+
+ Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr,
+ Loc, DVI);
+ DVI->eraseFromParent();
+}
+
+void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder, int Offset) {
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
+ replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
+ }
+}
+
+unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
+ unsigned NumDeadInst = 0;
+ // Delete the instructions backwards; this reduces the number of def-use
+ // and use-def chains that have to be updated.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != &BB->front()) {
+ // Delete the next to last instruction.
+ Instruction *Inst = &*--EndInst->getIterator();
+ if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
+ EndInst = Inst;
+ continue;
+ }
+ if (!isa<DbgInfoIntrinsic>(Inst))
+ ++NumDeadInst;
+ Inst->eraseFromParent();
+ }
+ return NumDeadInst;
+}
+
+unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
BasicBlock *BB = I->getParent();
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- (*SI)->removePredecessor(BB);
+ for (BasicBlock *Successor : successors(BB))
+ Successor->removePredecessor(BB);
// Insert a call to llvm.trap right before this. This turns the undefined
// behavior into a hard fail instead of falling through into random code.
@@ -1237,12 +1328,15 @@ void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
new UnreachableInst(I->getContext(), I);
// All instructions after this are dead.
+ unsigned NumInstrsRemoved = 0;
BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
while (BBI != BBE) {
if (!BBI->use_empty())
BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
BB->getInstList().erase(BBI++);
+ ++NumInstrsRemoved;
}
+ return NumInstrsRemoved;
}
/// changeToCall - Convert the specified invoke into a normal call.
@@ -1280,36 +1374,52 @@ static bool markAliveBlocks(Function &F,
// Do a quick scan of the basic block, turning any obviously unreachable
// instructions into LLVM unreachable insts. The instruction combining pass
// canonicalizes unreachable insts into stores to null or undef.
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+ for (Instruction &I : *BB) {
// Assumptions that are known to be false are equivalent to unreachable.
// Also, if the condition is undefined, then we make the choice most
// beneficial to the optimizer, and choose that to also be unreachable.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
if (II->getIntrinsicID() == Intrinsic::assume) {
- bool MakeUnreachable = false;
- if (isa<UndefValue>(II->getArgOperand(0)))
- MakeUnreachable = true;
- else if (ConstantInt *Cond =
- dyn_cast<ConstantInt>(II->getArgOperand(0)))
- MakeUnreachable = Cond->isZero();
-
- if (MakeUnreachable) {
+ if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
// Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(&*BBI, false);
+ changeToUnreachable(II, false);
Changed = true;
break;
}
}
- if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current compilation
+ // unit if the predicate passed to it is false. If the predicate is a
+ // constant false, then we know the guard will bail out of the current
+ // compile unconditionally, so all code following it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(II->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(II->getNextNode())) {
+ changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledValue();
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false);
+ Changed = true;
+ break;
+ }
if (CI->doesNotReturn()) {
// If we found a call to a no-return function, insert an unreachable
// instruction after it. Make sure there isn't *already* one there
// though.
- ++BBI;
- if (!isa<UnreachableInst>(BBI)) {
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
// Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(&*BBI, false);
+ changeToUnreachable(CI->getNextNode(), false);
Changed = true;
}
break;
@@ -1319,7 +1429,7 @@ static bool markAliveBlocks(Function &F,
// Store to undef and store to null are undefined and used to signal that
// they should be changed to unreachable by passes that can't modify the
// CFG.
- if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
// Don't touch volatile stores.
if (SI->isVolatile()) continue;
@@ -1393,9 +1503,9 @@ static bool markAliveBlocks(Function &F,
}
Changed |= ConstantFoldTerminator(BB, true);
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- if (Reachable.insert(*SI).second)
- Worklist.push_back(*SI);
+ for (BasicBlock *Successor : successors(BB))
+ if (Reachable.insert(Successor).second)
+ Worklist.push_back(Successor);
} while (!Worklist.empty());
return Changed;
}
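
The two intrinsic checks above share a shape but deliberately differ on undef. A small sketch of just that predicate, using the same PatternMatch helpers the hunk introduces:

```cpp
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"

// Returns true when the intrinsic's condition proves the following code dead
// (from the assume itself for assume, after the call for guards).
static bool provesFollowingCodeDead(llvm::IntrinsicInst *II) {
  using namespace llvm::PatternMatch;
  llvm::Value *Cond = II->getArgOperand(0);
  if (II->getIntrinsicID() == llvm::Intrinsic::assume)
    // undef is folded to false here: the choice most useful to the optimizer.
    return match(Cond, m_CombineOr(m_Zero(), m_Undef()));
  if (II->getIntrinsicID() == llvm::Intrinsic::experimental_guard)
    // undef is *not* folded: a guard on undef may still be widened later.
    return match(Cond, m_Zero());
  return false;
}
```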
@@ -1438,7 +1548,7 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {
/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise.
bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
- SmallPtrSet<BasicBlock*, 128> Reachable;
+ SmallPtrSet<BasicBlock*, 16> Reachable;
bool Changed = markAliveBlocks(F, Reachable);
// If there are unreachable blocks in the CFG...
@@ -1454,10 +1564,9 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
if (Reachable.count(&*BB))
continue;
- for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE;
- ++SI)
- if (Reachable.count(*SI))
- (*SI)->removePredecessor(&*BB);
+ for (BasicBlock *Successor : successors(&*BB))
+ if (Reachable.count(Successor))
+ Successor->removePredecessor(&*BB);
if (LVI)
LVI->eraseBlock(&*BB);
BB->dropAllReferences();
@@ -1495,6 +1604,7 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
break;
case LLVMContext::MD_noalias:
+ case LLVMContext::MD_mem_parallel_loop_access:
K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
break;
case LLVMContext::MD_range:
@@ -1566,7 +1676,7 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
UI != UE;) {
Use &U = *UI++;
auto *I = cast<Instruction>(U.getUser());
- if (DT.dominates(BB, I->getParent())) {
+ if (DT.properlyDominates(BB, I->getParent())) {
U.set(To);
DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
<< *To << " in " << *U << "\n");
@@ -1577,18 +1687,18 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
}
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
- if (isa<IntrinsicInst>(CS.getInstruction()))
- // Most LLVM intrinsics are things which can never take a safepoint.
- // As a result, we don't need to have the stack parsable at the
- // callsite. This is a highly useful optimization since intrinsic
- // calls are fairly prevalent, particularly in debug builds.
- return true;
-
// Check if the function is specifically marked as a gc leaf function.
if (CS.hasFnAttr("gc-leaf-function"))
return true;
- if (const Function *F = CS.getCalledFunction())
- return F->hasFnAttribute("gc-leaf-function");
+ if (const Function *F = CS.getCalledFunction()) {
+ if (F->hasFnAttribute("gc-leaf-function"))
+ return true;
+
+ if (auto IID = F->getIntrinsicID())
+ // Most LLVM intrinsics do not take safepoints.
+ return IID != Intrinsic::experimental_gc_statepoint &&
+ IID != Intrinsic::experimental_deoptimize;
+ }
return false;
}
@@ -1723,7 +1833,23 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
// If the AndMask is zero for this bit, clear the bit.
if ((AndMask & Bit) == 0)
Result->Provenance[i] = BitPart::Unset;
+ return Result;
+ }
+ // If this is a zext instruction zero extend the result.
+ if (I->getOpcode() == Instruction::ZExt) {
+ auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+ MatchBitReversals, BPS);
+ if (!Res)
+ return Result;
+
+ Result = BitPart(Res->Provider, BitWidth);
+ auto NarrowBitWidth =
+ cast<IntegerType>(cast<ZExtInst>(I)->getSrcTy())->getBitWidth();
+ for (unsigned i = 0; i < NarrowBitWidth; ++i)
+ Result->Provenance[i] = Res->Provenance[i];
+ for (unsigned i = NarrowBitWidth; i < BitWidth; ++i)
+ Result->Provenance[i] = BitPart::Unset;
return Result;
}
}
@@ -1754,7 +1880,7 @@ static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
/// Given an OR instruction, check to see if this is a bitreverse
/// idiom. If so, insert the new intrinsic and return true.
-bool llvm::recognizeBitReverseOrBSwapIdiom(
+bool llvm::recognizeBSwapOrBitReverseIdiom(
Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
SmallVectorImpl<Instruction *> &InsertedInsts) {
if (Operator::getOpcode(I) != Instruction::Or)
@@ -1766,6 +1892,15 @@ bool llvm::recognizeBitReverseOrBSwapIdiom(
return false; // Can't do vectors or integers > 128 bits.
unsigned BW = ITy->getBitWidth();
+ unsigned DemandedBW = BW;
+ IntegerType *DemandedTy = ITy;
+ if (I->hasOneUse()) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(I->user_back())) {
+ DemandedTy = cast<IntegerType>(Trunc->getType());
+ DemandedBW = DemandedTy->getBitWidth();
+ }
+ }
+
// Try to find all the pieces corresponding to the bswap.
std::map<Value *, Optional<BitPart>> BPS;
auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
@@ -1775,11 +1910,12 @@ bool llvm::recognizeBitReverseOrBSwapIdiom(
// Now, is the bit permutation correct for a bswap or a bitreverse? We can
// only byteswap values with an even number of bytes.
- bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
- for (unsigned i = 0; i < BW; ++i) {
- OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
+ bool OKForBSwap = DemandedBW % 16 == 0, OKForBitReverse = true;
+ for (unsigned i = 0; i < DemandedBW; ++i) {
+ OKForBSwap &=
+ bitTransformIsCorrectForBSwap(BitProvenance[i], i, DemandedBW);
OKForBitReverse &=
- bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
+ bitTransformIsCorrectForBitReverse(BitProvenance[i], i, DemandedBW);
}
Intrinsic::ID Intrin;
@@ -1790,7 +1926,51 @@ bool llvm::recognizeBitReverseOrBSwapIdiom(
else
return false;
+ if (ITy != DemandedTy) {
+ Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy);
+ Value *Provider = Res->Provider;
+ IntegerType *ProviderTy = cast<IntegerType>(Provider->getType());
+ // We may need to truncate the provider.
+ if (DemandedTy != ProviderTy) {
+ auto *Trunc = CastInst::Create(Instruction::Trunc, Provider, DemandedTy,
+ "trunc", I);
+ InsertedInsts.push_back(Trunc);
+ Provider = Trunc;
+ }
+ auto *CI = CallInst::Create(F, Provider, "rev", I);
+ InsertedInsts.push_back(CI);
+ auto *ExtInst = CastInst::Create(Instruction::ZExt, CI, ITy, "zext", I);
+ InsertedInsts.push_back(ExtInst);
+ return true;
+ }
+
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
return true;
}
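
The demanded-width handling above lets the matcher see through a final truncation. A hedged source-level example of the kind of code it targets; whether a given pipeline shapes the IR exactly this way is not guaranteed, so this is illustrative only:

```cpp
#include <cstdint>

// The OR is computed in 32 bits, but only the low 16 bits survive the final
// truncation, so the idiom can be recognized as a 16-bit byte swap and
// emitted as trunc + llvm.bswap.i16 + zext.
uint16_t swapLow16(uint32_t X) {
  uint32_t Wide = ((X & 0xFFu) << 8) | ((X >> 8) & 0xFFu); // single-use 32-bit 'or'
  return static_cast<uint16_t>(Wide);                      // only 16 bits demanded
}
```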
+
+// CodeGen has special handling for some string functions that may replace
+// them with target-specific intrinsics. Since that'd skip our interceptors
+// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
+// we mark affected calls as NoBuiltin, which will disable optimization
+// in CodeGen.
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ Function *F = CI->getCalledFunction();
+ LibFunc::Func Func;
+ if (!F || F->hasLocalLinkage() || !F->hasName() ||
+ !TLI->getLibFunc(F->getName(), Func))
+ return;
+ switch (Func) {
+ default: break;
+ case LibFunc::memcmp:
+ case LibFunc::memchr:
+ case LibFunc::strcpy:
+ case LibFunc::stpcpy:
+ case LibFunc::strcmp:
+ case LibFunc::strlen:
+ case LibFunc::strnlen:
+ CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
+ break;
+ }
+}
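
A hedged sketch of how an instrumentation pass might drive the new helper; the declaration is assumed to live in llvm/Transforms/Utils/Local.h next to the definition above, and TLI to come from the usual TargetLibraryInfo analysis:

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h" // assumed declaration site

// Keep recognized string/memory libcalls as real calls so the sanitizer
// runtime can intercept them instead of CodeGen expanding them inline.
static void keepLibcallsInterceptable(llvm::Function &F,
                                      const llvm::TargetLibraryInfo *TLI) {
  for (llvm::BasicBlock &BB : F)
    for (llvm::Instruction &I : BB)
      if (auto *CI = llvm::dyn_cast<llvm::CallInst>(&I))
        llvm::maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
}
```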
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 1fa469595d16..b3a928bf7753 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -37,6 +37,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetOperations.h"
@@ -489,14 +490,9 @@ ReprocessLoop:
DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
<< P->getName() << "\n");
- // Inform each successor of each dead pred.
- for (succ_iterator SI = succ_begin(P), SE = succ_end(P); SI != SE; ++SI)
- (*SI)->removePredecessor(P);
// Zap the dead pred's terminator and replace it with unreachable.
TerminatorInst *TI = P->getTerminator();
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- P->getTerminator()->eraseFromParent();
- new UnreachableInst(P->getContext(), P);
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false);
Changed = true;
}
}
@@ -506,14 +502,13 @@ ReprocessLoop:
// trip count computations.
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
- E = ExitingBlocks.end(); I != E; ++I)
- if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator()))
+ for (BasicBlock *ExitingBlock : ExitingBlocks)
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
if (BI->isConditional()) {
if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << (*I)->getName() << "\n");
+ << ExitingBlock->getName() << "\n");
BI->setCondition(ConstantInt::get(Cond->getType(),
!L->contains(BI->getSuccessor(0))));
@@ -545,9 +540,7 @@ ReprocessLoop:
SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
ExitBlocks.end());
- for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(),
- E = ExitBlockSet.end(); I != E; ++I) {
- BasicBlock *ExitBlock = *I;
+ for (BasicBlock *ExitBlock : ExitBlockSet) {
for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
PI != PE; ++PI)
// Must be exactly this loop: no subloops, parent loops, or non-loop preds
@@ -691,8 +684,10 @@ ReprocessLoop:
}
DT->eraseNode(ExitingBlock);
- BI->getSuccessor(0)->removePredecessor(ExitingBlock);
- BI->getSuccessor(1)->removePredecessor(ExitingBlock);
+ BI->getSuccessor(0)->removePredecessor(
+ ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA);
+ BI->getSuccessor(1)->removePredecessor(
+ ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA);
ExitingBlock->eraseFromParent();
}
}
@@ -731,11 +726,6 @@ namespace {
initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
}
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
- AssumptionCache *AC;
-
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -753,7 +743,8 @@ namespace {
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreserved<DependenceAnalysis>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
}
@@ -768,9 +759,6 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
"Canonicalize natural loops", false, false)
@@ -783,20 +771,64 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
///
bool LoopSimplify::runOnFunction(Function &F) {
bool Changed = false;
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- SE = SEWP ? &SEWP->getSE() : nullptr;
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+#ifndef NDEBUG
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ bool InLCSSA =
+ all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); });
+ assert(InLCSSA && "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
// Simplify each loop nest in the function.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA);
+#ifndef NDEBUG
+ if (PreserveLCSSA) {
+ bool InLCSSA =
+ all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); });
+ assert(InLCSSA && "LCSSA is broken after loop-simplify.");
+ }
+#endif
return Changed;
}
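
The NDEBUG-only checks above boil down to one reusable predicate; a sketch using the same all_of / isRecursivelyLCSSAForm calls the pass relies on:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"

// True iff every loop nest in the function is (recursively) in LCSSA form.
static bool functionIsInLCSSA(llvm::LoopInfo &LI, llvm::DominatorTree &DT) {
  return llvm::all_of(LI, [&](llvm::Loop *L) {
    return L->isRecursivelyLCSSAForm(DT);
  });
}
```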
+PreservedAnalyses LoopSimplifyPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ bool Changed = false;
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+
+ // FIXME: This pass should verify that the loops on which it's operating
+ // are in canonical SSA form, and that the pass itself preserves this form.
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ Changed |= simplifyLoop(*I, DT, LI, SE, AC, true /* PreserveLCSSA */);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<DependenceAnalysis>();
+ return PA;
+}
+
// FIXME: Restore this code when we re-enable verification in verifyAnalysis
// below.
#if 0
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index eea9237ba80c..7f1f78fa8b41 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -34,6 +34,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
using namespace llvm;
@@ -44,9 +45,14 @@ using namespace llvm;
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-/// RemapInstruction - Convert the instruction operands from referencing the
-/// current values into those specified by VMap.
-static inline void RemapInstruction(Instruction *I,
+static cl::opt<bool>
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden,
+ cl::desc("Allow runtime unrolled loops to be unrolled "
+ "with epilog instead of prolog."));
+
+/// Convert the instruction operands from referencing the current values into
+/// those specified by VMap.
+static inline void remapInstruction(Instruction *I,
ValueToValueMapTy &VMap) {
for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
Value *Op = I->getOperand(op);
@@ -64,8 +70,8 @@ static inline void RemapInstruction(Instruction *I,
}
}
-/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
-/// only has one predecessor, and that predecessor only has one successor.
+/// Folds a basic block into its predecessor if it only has one predecessor, and
+/// that predecessor only has one successor.
/// The LoopInfo Analysis that is passed will be kept consistent. If folding is
/// successful, references to the containing loop must be removed from
/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
@@ -73,8 +79,9 @@ static inline void RemapInstruction(Instruction *I,
/// of loops that have already been forgotten to prevent redundant, expensive
/// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
static BasicBlock *
-FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,
- SmallPtrSetImpl<Loop *> &ForgottenLoops) {
+foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
+ SmallPtrSetImpl<Loop *> &ForgottenLoops,
+ DominatorTree *DT) {
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@@ -106,7 +113,16 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,
// OldName will be valid until erased.
StringRef OldName = BB->getName();
- // Erase basic block from the function...
+ // Erase the old block and update dominator info.
+ if (DT)
+ if (DomTreeNode *DTN = DT->getNode(BB)) {
+ DomTreeNode *PredDTN = DT->getNode(OnlyPred);
+ SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
+ for (auto *DI : Children)
+ DT->changeImmediateDominator(DI, PredDTN);
+
+ DT->eraseNode(BB);
+ }
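
The dominator-tree part of this fold is worth spelling out: BB's children are re-parented to the single predecessor before BB's node is erased, and the child list is copied first because changeImmediateDominator edits it while we iterate. A sketch of just that step:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Dominators.h"

// Re-parent BB's dominator-tree children to Pred, then drop BB's node.
static void foldDomNodeIntoPred(llvm::DominatorTree &DT, llvm::BasicBlock *BB,
                                llvm::BasicBlock *Pred) {
  if (llvm::DomTreeNode *DTN = DT.getNode(BB)) {
    llvm::DomTreeNode *PredDTN = DT.getNode(Pred);
    // Copy the children first: changeImmediateDominator mutates this list.
    llvm::SmallVector<llvm::DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
    for (llvm::DomTreeNode *Child : Children)
      DT.changeImmediateDominator(Child, PredDTN);
    DT.eraseNode(BB);
  }
}
```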
// ScalarEvolution holds references to loop exit blocks.
if (SE) {
@@ -126,6 +142,35 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,
return OnlyPred;
}
+/// Check if unrolling created a situation where we need to insert phi nodes to
+/// preserve LCSSA form.
+/// \param Blocks is a vector of basic blocks representing unrolled loop.
+/// \param L is the outer loop.
+/// It's possible that some of the blocks are in L, and some are not. In this
+/// case, if there is a use outside L and the definition is inside L, we need to
+/// insert a phi node; otherwise LCSSA will be broken.
+/// The function is just a helper function for llvm::UnrollLoop that returns
+/// true if this situation occurs, indicating that LCSSA needs to be fixed.
+static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
+ LoopInfo *LI) {
+ for (BasicBlock *BB : Blocks) {
+ if (LI->getLoopFor(BB) == L)
+ continue;
+ for (Instruction &I : *BB) {
+ for (Use &U : I.operands()) {
+ if (auto Def = dyn_cast<Instruction>(U)) {
+ Loop *DefLoop = LI->getLoopFor(Def->getParent());
+ if (!DefLoop)
+ continue;
+ if (DefLoop->contains(L))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
@@ -155,7 +200,7 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,
///
/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
/// DominatorTree if they are non-null.
-bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
bool AllowRuntime, bool AllowExpensiveTripCount,
unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
@@ -218,20 +263,48 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
bool CompletelyUnroll = Count == TripCount;
SmallVector<BasicBlock *, 4> ExitBlocks;
L->getExitBlocks(ExitBlocks);
- Loop *ParentL = L->getParentLoop();
- bool AllExitsAreInsideParentLoop = !ParentL ||
- std::all_of(ExitBlocks.begin(), ExitBlocks.end(),
- [&](BasicBlock *BB) { return ParentL->contains(BB); });
+ std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
+
+ // Go through all exits of L and see if there are any phi-nodes there. We just
+ // conservatively assume that they're inserted to preserve LCSSA form, which
+ // means that complete unrolling might break this form. We need to either fix
+ // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
+ // now we just recompute LCSSA for the outer loop, but it should be possible
+ // to fix it in-place.
+ bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
+ std::any_of(ExitBlocks.begin(), ExitBlocks.end(),
+ [&](BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
// We assume a run-time trip count if the compiler cannot
// figure out the loop trip count and the unroll-runtime
// flag is specified.
bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
- if (RuntimeTripCount &&
- !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
- PreserveLCSSA))
- return false;
+ // Loops containing convergent instructions must have a count that divides
+ // their TripMultiple.
+ DEBUG(
+ {
+ bool HasConvergent = false;
+ for (auto &BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto CS = CallSite(&I))
+ HasConvergent |= CS.isConvergent();
+ assert((!HasConvergent || TripMultiple % Count == 0) &&
+ "Unroll count must divide trip multiple if loop contains a "
+ "convergent operation.");
+ });
+ // Don't output the runtime loop remainder if Count evenly divides
+ // TripMultiple. Such a remainder is never needed, and is unsafe if the loop
+ // contains a convergent instruction.
+ if (RuntimeTripCount && TripMultiple % Count != 0 &&
+ !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
+ UnrollRuntimeEpilog, LI, SE, DT,
+ PreserveLCSSA)) {
+ if (Force)
+ RuntimeTripCount = false;
+ else
+ return false;
+ }
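
The convergence rule enforced by the DEBUG assertion and the remainder check above reduces to a divisibility test; for example, TripMultiple = 4 admits unroll counts of 2 or 4 for a convergent loop, but not 3. A sketch of the predicate:

```cpp
// An unroll count is compatible with convergent operations only if it divides
// the known trip multiple evenly, so no conditional (remainder) copies of the
// convergent calls ever need to be emitted.
static bool unrollCountOKForConvergent(unsigned Count, unsigned TripMultiple,
                                       bool HasConvergent) {
  return !HasConvergent || (Count != 0 && TripMultiple % Count == 0);
}
```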
// Notify ScalarEvolution that the loop will be substantially changed,
// if not outright eliminated.
@@ -308,6 +381,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+ std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
for (unsigned It = 1; It != Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -349,13 +423,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (*BB == Header)
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
- for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
- PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
+ for (PHINode *OrigPHI : OrigPHINode) {
+ PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
if (It > 1 && L->contains(InValI))
InVal = LastValueMap[InValI];
- VMap[OrigPHINode[i]] = InVal;
+ VMap[OrigPHI] = InVal;
New->getInstList().erase(NewPHI);
}
@@ -366,11 +440,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
LastValueMap[VI->first] = VI->second;
// Add phi entries for newly created values to all exit blocks.
- for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
- SI != SE; ++SI) {
- if (L->contains(*SI))
+ for (BasicBlock *Succ : successors(*BB)) {
+ if (L->contains(Succ))
continue;
- for (BasicBlock::iterator BBI = (*SI)->begin();
+ for (BasicBlock::iterator BBI = Succ->begin();
PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
Value *Incoming = phi->getIncomingValueForBlock(*BB);
ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
@@ -387,18 +460,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
Latches.push_back(New);
NewBlocks.push_back(New);
+ UnrolledLoopBlocks.push_back(New);
+
+ // Update DomTree: since we just copy the loop body, and each copy has a
+ // dedicated entry block (copy of the header block), this header's copy
+ // dominates all copied blocks. That means the dominance relations in the
+ // copied body are the same as in the original body.
+ if (DT) {
+ if (*BB == Header)
+ DT->addNewBlock(New, Latches[It - 1]);
+ else {
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
}
// Remap all instructions in the most recent iteration
- for (unsigned i = 0; i < NewBlocks.size(); ++i)
- for (BasicBlock::iterator I = NewBlocks[i]->begin(),
- E = NewBlocks[i]->end(); I != E; ++I)
- ::RemapInstruction(&*I, LastValueMap);
+ for (BasicBlock *NewBlock : NewBlocks)
+ for (Instruction &I : *NewBlock)
+ ::remapInstruction(&I, LastValueMap);
}
// Loop over the PHI nodes in the original block, setting incoming values.
- for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
- PHINode *PN = OrigPHINode[i];
+ for (PHINode *PN : OrigPHINode) {
if (CompletelyUnroll) {
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
Header->getInstList().erase(PN);
@@ -453,11 +541,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
// Remove phi operands at this loop exit
if (Dest != LoopExit) {
BasicBlock *BB = Latches[i];
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
- SI != SE; ++SI) {
- if (*SI == Headers[i])
+ for (BasicBlock *Succ: successors(BB)) {
+ if (Succ == Headers[i])
continue;
- for (BasicBlock::iterator BBI = (*SI)->begin();
+ for (BasicBlock::iterator BBI = Succ->begin();
PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
Phi->removeIncomingValue(BB, false);
}
@@ -468,16 +555,43 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
Term->eraseFromParent();
}
}
+ // Update dominators of blocks we might reach through exits.
+ // The immediate dominator of such a block might change, because we add more
+ // routes which can lead to the exit: we can now reach it from the copied
+ // iterations too. Thus, the new idom of the block will be the nearest
+ // common dominator of the previous idom and common dominator of all copies of
+ // the previous idom. This is equivalent to the nearest common dominator of
+ // the previous idom and the first latch, which dominates all copies of the
+ // previous idom.
+ if (DT && Count > 1) {
+ for (auto *BB : OriginalLoopBlocks) {
+ auto *BBDomNode = DT->getNode(BB);
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ for (auto *ChildDomNode : BBDomNode->getChildren()) {
+ auto *ChildBB = ChildDomNode->getBlock();
+ if (!L->contains(ChildBB))
+ ChildrenToUpdate.push_back(ChildBB);
+ }
+ BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, Latches[0]);
+ for (auto *ChildBB : ChildrenToUpdate)
+ DT->changeImmediateDominator(ChildBB, NewIDom);
+ }
+ }
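// A self-contained sketch of the "nearest common dominator" step above, written
// in plain C++ over parent (idom) links and depths rather than LLVM's
// DominatorTree API; the tree shape and node numbering are made up for
// illustration only.
#include <cstdio>
#include <vector>

static int nearestCommonAncestor(int A, int B, const std::vector<int> &Parent,
                                 const std::vector<int> &Depth) {
  // Walk the deeper node up until both are at the same depth, then walk both
  // up in lock-step until they meet.
  while (Depth[A] > Depth[B]) A = Parent[A];
  while (Depth[B] > Depth[A]) B = Parent[B];
  while (A != B) { A = Parent[A]; B = Parent[B]; }
  return A;
}

int main() {
  // Parent[i] plays the role of the immediate dominator of block i; block 0 is
  // the entry and is its own parent.
  std::vector<int> Parent = {0, 0, 1, 1, 2};
  std::vector<int> Depth  = {0, 1, 2, 2, 3};
  // New idom of an exit-reachable block = NCD(previous idom, first latch).
  std::printf("%d\n", nearestCommonAncestor(4, 3, Parent, Depth)); // prints 1
  return 0;
}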
// Merge adjacent basic blocks, if possible.
SmallPtrSet<Loop *, 4> ForgottenLoops;
- for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
- BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+ for (BasicBlock *Latch : Latches) {
+ BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
if (Term->isUnconditional()) {
BasicBlock *Dest = Term->getSuccessor(0);
- if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE,
- ForgottenLoops))
+ if (BasicBlock *Fold =
+ foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
+ // Dest has been folded into Fold. Update our worklists accordingly.
std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+ UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
+ UnrolledLoopBlocks.end(), Dest),
+ UnrolledLoopBlocks.end());
+ }
}
}
@@ -485,10 +599,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
// whole function's cache.
AC->clear();
- // FIXME: Reconstruct dom info, because it is not preserved properly.
- // Incrementally updating domtree after loop unrolling would be easy.
- if (DT)
+ // FIXME: We only preserve DT info for complete unrolling now. Incrementally
+ // updating domtree after partial loop unrolling should also be easy.
+ if (DT && !CompletelyUnroll)
DT->recalculate(*L->getHeader()->getParent());
+ else if (DT)
+ DEBUG(DT->verifyDomTree());
// Simplify any new induction variables in the partially unrolled loop.
if (SE && !CompletelyUnroll) {
@@ -508,19 +624,17 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
// go.
const DataLayout &DL = Header->getModule()->getDataLayout();
const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
- for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
- BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
+ for (BasicBlock *BB : NewLoopBlocks) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
Instruction *Inst = &*I++;
- if (isInstructionTriviallyDead(Inst))
- (*BB)->getInstList().erase(Inst);
- else if (Value *V = SimplifyInstruction(Inst, DL))
- if (LI->replacementPreservesLCSSAForm(Inst, V)) {
+ if (Value *V = SimplifyInstruction(Inst, DL))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
Inst->replaceAllUsesWith(V);
- (*BB)->getInstList().erase(Inst);
- }
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
}
+ }
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
@@ -530,6 +644,17 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (CompletelyUnroll)
LI->markAsRemoved(L);
+ // After complete unrolling most of the blocks should be contained in OuterL.
+ // However, some of them might happen to be out of OuterL (e.g. if they
+ // precede a loop exit). In this case we might need to insert PHI nodes in
+ // order to preserve LCSSA form.
+ // We don't need to check this if we already know that we need to fix LCSSA
+ // form.
+ // TODO: For now we just recompute LCSSA for the outer loop in this case, but
+ // it should be possible to fix it in-place.
+ if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
+ NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
+
// If we have a pass and a DominatorTree we should re-simplify impacted loops
// to ensure subsequent analyses can rely on this form. We want to simplify
// at least one layer outside of the loop that was unrolled so that any
@@ -538,7 +663,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (!OuterL && !CompletelyUnroll)
OuterL = L;
if (OuterL) {
- bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
+ simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
// LCSSA must be performed on the outermost affected loop. The unrolled
// loop's last loop latch is guaranteed to be in the outermost loop after
@@ -548,7 +673,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
while (OuterL->getParentLoop() != LatchLoop)
OuterL = OuterL->getParentLoop();
- if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified))
+ if (NeedToFixLCSSA)
formLCSSARecursively(*OuterL, *DT, LI, SE);
else
assert(OuterL->isLCSSAForm(*DT) &&
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 0d68f18ad0e5..861a50cf354d 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -16,8 +16,8 @@
// case, we need to generate code to execute these 'left over' iterations.
//
// The current strategy generates an if-then-else sequence prior to the
-// unrolled loop to execute the 'left over' iterations. Other strategies
-// include generate a loop before or after the unrolled loop.
+// unrolled loop to execute the 'left over' iterations before or after the
+// unrolled loop.
//
//===----------------------------------------------------------------------===//
@@ -60,91 +60,220 @@ STATISTIC(NumRuntimeUnrolled,
/// than the unroll factor.
///
static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
- BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
- BasicBlock *OrigPH, BasicBlock *NewPH,
- ValueToValueMapTy &VMap, DominatorTree *DT,
- LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *PrologExit, BasicBlock *PreHeader,
+ BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
+ DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
+ BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
// Create a PHI node for each outgoing value from the original loop
// (which means it is an outgoing value from the prolog code too).
// The new PHI node is inserted in the prolog end basic block.
- // The new PHI name is added as an operand of a PHI node in either
+ // The new PHI node value is added as an operand of a PHI node in either
// the loop header or the loop exit block.
- for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
- SBI != SBE; ++SBI) {
- for (BasicBlock::iterator BBI = (*SBI)->begin();
- PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
-
+ for (BasicBlock *Succ : successors(Latch)) {
+ for (Instruction &BBI : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
// Add a new PHI node to the prolog end block and add the
// appropriate incoming values.
- PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
- PrologEnd->getTerminator());
+ PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ PrologExit->getFirstNonPHI());
// Adding a value to the new PHI node from the original loop preheader.
// This is the value that skips all the prolog code.
if (L->contains(PN)) {
- NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
+ NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
+ PreHeader);
} else {
- NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
+ NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
}
Value *V = PN->getIncomingValueForBlock(Latch);
if (Instruction *I = dyn_cast<Instruction>(V)) {
if (L->contains(I)) {
- V = VMap[I];
+ V = VMap.lookup(I);
}
}
// Adding a value to the new PHI node from the last prolog block
// that was created.
- NewPN->addIncoming(V, LastPrologBB);
+ NewPN->addIncoming(V, PrologLatch);
// Update the existing PHI node operand with the value from the
// new PHI node. How this is done depends on if the existing
// PHI node is in the original loop block, or the exit block.
if (L->contains(PN)) {
- PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
+ PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
} else {
- PN->addIncoming(NewPN, PrologEnd);
+ PN->addIncoming(NewPN, PrologExit);
}
}
}
- // Create a branch around the orignal loop, which is taken if there are no
+ // Create a branch around the original loop, which is taken if there are no
// iterations remaining to be executed after running the prologue.
- Instruction *InsertPt = PrologEnd->getTerminator();
+ Instruction *InsertPt = PrologExit->getTerminator();
IRBuilder<> B(InsertPt);
assert(Count != 0 && "nonsensical Count!");
- // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1)
- // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and
- // and all of the iterations of this loop were executed by the prologue. Note
- // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow.
+ // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
+ // This means %xtraiter is (BECount + 1) and all of the iterations of this
+ // loop were executed by the prologue. Note that if BECount <u (Count - 1)
+ // then (BECount + 1) cannot unsigned-overflow.
Value *BrLoopExit =
B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
BasicBlock *Exit = L->getUniqueExitBlock();
assert(Exit && "Loop must have a single exit block only");
// Split the exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch to the exit block (around the unrolled loop)
- B.CreateCondBr(BrLoopExit, Exit, NewPH);
+ B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
+ InsertPt->eraseFromParent();
+}
+
+/// Connect the unrolling epilog code to the original loop.
+/// The unrolling epilog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit to combine
+/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI operands in the epilog loop by the new PHI nodes
+/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+///
+static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
+ BasicBlock *Exit, BasicBlock *PreHeader,
+ BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Loop structure should be the following:
+ //
+ // PreHeader
+ // NewPreHeader
+ // Header
+ // ...
+ // Latch
+ // NewExit (PN)
+ // EpilogPreHeader
+ // EpilogHeader
+ // ...
+ // EpilogLatch
+ // Exit (EpilogPN)
+
+ // Update PHI nodes at NewExit and Exit.
+ for (Instruction &BBI : *NewExit) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
+ // PN should be used in another PHI located in Exit block as
+ // Exit was split by SplitBlockPredecessors into Exit and NewExit
+ // Basically it should look like:
+ // NewExit:
+ // PN = PHI [I, Latch]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, EpilogPreHeader]
+ //
+ // The incoming block is EpilogPreHeader instead of NewExit because
+ // NewExit was split one more time to create EpilogPreHeader.
+ assert(PN->hasOneUse() && "The phi should have 1 use");
+ PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
+ assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+ // Add incoming PreHeader from branch around the Loop
+ PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+
+ Value *V = PN->getIncomingValueForBlock(Latch);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I && L->contains(I))
+ // If value comes from an instruction in the loop add VMap value.
+ V = VMap.lookup(I);
+ // For an instruction outside the loop, a constant, or an undefined value,
+ // insert the value itself.
+ EpilogPN->addIncoming(V, EpilogLatch);
+
+ assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+ "EpilogPN should have EpilogPreHeader incoming block");
+ // Change EpilogPreHeader incoming block to NewExit.
+ EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+ NewExit);
+ // Now PHIs should look like:
+ // NewExit:
+ // PN = PHI [I, Latch], [undef, PreHeader]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+ }
+
+ // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+ // Update corresponding PHI nodes in epilog loop.
+ for (BasicBlock *Succ : successors(Latch)) {
+ // Skip this as we already updated phis in exit blocks.
+ if (!L->contains(Succ))
+ continue;
+ for (Instruction &BBI : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
+ // Add new PHI nodes to the loop exit block and update epilog
+ // PHIs with the new PHI values.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ NewExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the unrolling loop preheader.
+ NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
+ // Adding a value to the new PHI node from the unrolling loop latch.
+ NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
+
+ // Update the existing PHI node operand with the value from the new PHI
+ // node. Corresponding instruction in epilog loop should be PHI.
+ PHINode *VPN = cast<PHINode>(VMap[&BBI]);
+ VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
+ }
+ }
+
+ Instruction *InsertPt = NewExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+ Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
+ assert(Exit && "Loop must have a single exit block only");
+ // Split the exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+ SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
+ PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolling loop)
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
InsertPt->eraseFromParent();
}
/// Create a clone of the blocks in a loop and connect them together.
-/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
-/// loop will be created including all cloned blocks, and the iterator of it
-/// switches to count NewIter down to 0.
+/// If CreateRemainderLoop is false, loop structure will not be cloned,
+/// otherwise a new loop will be created including all cloned blocks, and the
+/// iterator of it switches to count NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If the loop structure is cloned, InsertTop should be the new preheader and
+/// InsertBot the new loop exit.
///
-static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
+static void CloneLoopBlocks(Loop *L, Value *NewIter,
+ const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder,
BasicBlock *InsertTop, BasicBlock *InsertBot,
+ BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
LoopInfo *LI) {
- BasicBlock *Preheader = L->getLoopPreheader();
+ StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
Function *F = Header->getParent();
@@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
Loop *NewLoop = nullptr;
Loop *ParentLoop = L->getParentLoop();
- if (!UnrollProlog) {
+ if (CreateRemainderLoop) {
NewLoop = new Loop();
if (ParentLoop)
ParentLoop->addChildLoop(NewLoop);
@@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
// For each block in the original loop, create a new copy,
// and update the value map with the newly created values.
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
if (NewLoop)
@@ -176,19 +305,20 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
// For the first block, add a CFG connection to this newly
// created block.
InsertTop->getTerminator()->setSuccessor(0, NewBB);
-
}
+
if (Latch == *BB) {
- // For the last block, if UnrollProlog is true, create a direct jump to
- // InsertBot. If not, create a loop back to cloned head.
+ // For the last block, if CreateRemainderLoop is false, create a direct
+ // jump to InsertBot. If not, create a loop back to cloned head.
VMap.erase((*BB)->getTerminator());
BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
IRBuilder<> Builder(LatchBR);
- if (UnrollProlog) {
+ if (!CreateRemainderLoop) {
Builder.CreateBr(InsertBot);
} else {
- PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
+ PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+ suffix + ".iter",
FirstLoopBB->getFirstNonPHI());
Value *IdxSub =
Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
@@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
// cloned loop.
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
- if (UnrollProlog) {
- VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
- cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ if (!CreateRemainderLoop) {
+ if (UseEpilogRemainder) {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ NewPHI->removeIncomingValue(Latch, false);
+ } else {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
} else {
unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
NewPHI->setIncomingBlock(idx, InsertTop);
@@ -217,8 +353,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
idx = NewPHI->getBasicBlockIndex(Latch);
Value *InVal = NewPHI->getIncomingValue(idx);
NewPHI->setIncomingBlock(idx, NewLatch);
- if (VMap[InVal])
- NewPHI->setIncomingValue(idx, VMap[InVal]);
+ if (Value *V = VMap.lookup(InVal))
+ NewPHI->setIncomingValue(idx, V);
}
}
if (NewLoop) {
@@ -254,11 +390,11 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
}
}
-/// Insert code in the prolog code when unrolling a loop with a
+/// Insert code in the prolog/epilog code when unrolling a loop with a
/// run-time trip-count.
///
/// This method assumes that the loop unroll factor is total number
-/// of loop bodes in the loop after unrolling. (Some folks refer
+/// of loop bodies in the loop after unrolling. (Some folks refer
/// to the unroll factor as the number of *extra* copies added).
/// We assume also that the loop unroll factor is a power-of-two. So, after
/// unrolling the loop, the number of loop bodies executed is 2,
@@ -266,37 +402,56 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
/// the switch instruction is generated.
///
+/// ***Prolog case***
/// extraiters = tripcount % loopfactor
/// if (extraiters == 0) jump Loop:
-/// else jump Prol
+/// else jump Prol:
/// Prol: LoopBody;
/// extraiters -= 1 // Omitted if unroll factor is 2.
/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
-/// if (tripcount < loopfactor) jump End
+/// if (tripcount < loopfactor) jump End:
/// Loop:
/// ...
/// End:
///
-bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
- bool AllowExpensiveTripCount, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- bool PreserveLCSSA) {
+/// ***Epilog case***
+/// extraiters = tripcount % loopfactor
+/// if (tripcount < loopfactor) jump LoopExit:
+/// unroll_iters = tripcount - extraiters
+/// Loop: LoopBody; (executes unroll_iter times);
+/// unroll_iter -= 1
+/// if (unroll_iter != 0) jump Loop:
+/// LoopExit:
+/// if (extraiters == 0) jump EpilExit:
+/// Epil: LoopBody; (executes extraiters times)
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
+/// EpilExit:
+
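// A standalone arithmetic sketch (plain C++, not LLVM IR) of the iteration
// split described by the pseudocode above; the trip count and unroll factor
// below are example values chosen for illustration.
#include <cstdio>

static void runtimeUnrollCounts(unsigned TripCount, unsigned Count,
                                bool UseEpilogRemainder) {
  unsigned ExtraIters = TripCount % Count;         // leftover iterations
  unsigned UnrolledIters = TripCount - ExtraIters; // a multiple of Count
  if (UseEpilogRemainder)
    std::printf("epilog: %u iterations in the unrolled loop, then %u in the "
                "epilog loop\n", UnrolledIters, ExtraIters);
  else
    std::printf("prolog: %u iterations in the prolog loop, then %u in the "
                "unrolled loop\n", ExtraIters, UnrolledIters);
}

int main() {
  runtimeUnrollCounts(10, 4, /*UseEpilogRemainder=*/false); // prolog: 2, then 8
  runtimeUnrollCounts(10, 4, /*UseEpilogRemainder=*/true);  // epilog: 8, then 2
  return 0;
}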
+bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
+ bool AllowExpensiveTripCount,
+ bool UseEpilogRemainder,
+ LoopInfo *LI, ScalarEvolution *SE,
+ DominatorTree *DT, bool PreserveLCSSA) {
// for now, only unroll loops that contain a single exit
if (!L->getExitingBlock())
return false;
// Make sure the loop is in canonical form, and there is a single
// exit block only.
- if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
+ if (!L->isLoopSimplifyForm())
+ return false;
+ BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
+ if (!Exit)
return false;
- // Use Scalar Evolution to compute the trip count. This allows more
- // loops to be unrolled than relying on induction var simplification
+ // Use Scalar Evolution to compute the trip count. This allows more loops to
+ // be unrolled than relying on induction var simplification.
if (!SE)
return false;
- // Only unroll loops with a computable trip count and the trip count needs
- // to be an int value (allowing a pointer type is a TODO item)
+ // Only unroll loops with a computable trip count, and the trip count needs
+ // to be an int value (allowing a pointer type is a TODO item).
const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
!BECountSC->getType()->isIntegerTy())
@@ -304,21 +459,19 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
- // Add 1 since the backedge count doesn't include the first loop iteration
+ // Add 1 since the backedge count doesn't include the first loop iteration.
const SCEV *TripCountSC =
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
if (isa<SCEVCouldNotCompute>(TripCountSC))
return false;
BasicBlock *Header = L->getHeader();
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
const DataLayout &DL = Header->getModule()->getDataLayout();
SCEVExpander Expander(*SE, DL, "loop-unroll");
- if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L))
- return false;
-
- // We only handle cases when the unroll factor is a power of 2.
- // Count is the loop unroll factor, the number of extra copies added + 1.
- if (!isPowerOf2_32(Count))
+ if (!AllowExpensiveTripCount &&
+ Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR))
return false;
// This constraint lets us deal with an overflowing trip count easily; see the
@@ -326,51 +479,115 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
if (Log2_32(Count) > BEWidth)
return false;
- // If this loop is nested, then the loop unroller changes the code in
- // parent loop, so the Scalar Evolution pass needs to be run again
+ // If this loop is nested, then the loop unroller changes the code in the
+ // parent loop, so the Scalar Evolution pass needs to be run again.
if (Loop *ParentLoop = L->getParentLoop())
SE->forgetLoop(ParentLoop);
- BasicBlock *PH = L->getLoopPreheader();
BasicBlock *Latch = L->getLoopLatch();
- // It helps to splits the original preheader twice, one for the end of the
- // prolog code and one for a new loop preheader
- BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
- BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
- BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
+ // Loop structure is the following:
+ //
+ // PreHeader
+ // Header
+ // ...
+ // Latch
+ // Exit
+
+ BasicBlock *NewPreHeader;
+ BasicBlock *NewExit = nullptr;
+ BasicBlock *PrologExit = nullptr;
+ BasicBlock *EpilogPreHeader = nullptr;
+ BasicBlock *PrologPreHeader = nullptr;
+
+ if (UseEpilogRemainder) {
+ // If epilog remainder
+ // Split PreHeader to insert a branch around loop for unrolling.
+ NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ // Split Exit to create phi nodes from branch above.
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+ NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
+ DT, LI, PreserveLCSSA);
+ // Split NewExit to insert epilog remainder loop.
+ EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
+ EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+ } else {
+ // If prolog remainder
+ // Split the original preheader twice to insert prolog remainder loop
+ PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+ PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+ PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+ DT, LI);
+ PrologExit->setName(Header->getName() + ".prol.loopexit");
+ // Split PrologExit to get NewPreHeader.
+ NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ }
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // *NewPreHeader *PrologPreHeader
+ // Header *PrologExit
+ // ... *NewPreHeader
+ // Latch Header
+ // *NewExit ...
+ // *EpilogPreHeader Latch
+ // Exit Exit
+
+ // Calculate conditions for branch around loop for unrolling
+ // in epilog case and around prolog remainder loop in prolog case.
// Compute the number of extra iterations required, which is:
- // extra iterations = run-time trip count % (loop unroll factor + 1)
+ // extra iterations = run-time trip count % loop unroll factor
+ PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
PreHeaderBR);
-
IRBuilder<> B(PreHeaderBR);
- Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
-
- // If ModVal is zero, we know that either
- // 1. there are no iteration to be run in the prologue loop
- // OR
- // 2. the addition computing TripCount overflowed
- //
- // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
- // number of iterations that remain to be run in the original loop is a
- // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
- // explicitly check this above).
-
- Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
-
- // Branch to either the extra iterations or the cloned/unrolled loop
- // We will fix up the true branch label when adding loop body copies
- B.CreateCondBr(BranchVal, PEnd, PEnd);
- assert(PreHeaderBR->isUnconditional() &&
- PreHeaderBR->getSuccessor(0) == PEnd &&
- "CFG edges in Preheader are not correct");
+ Value *ModVal;
+ // Calculate ModVal = (BECount + 1) % Count.
+ // Note that TripCount is BECount + 1.
+ if (isPowerOf2_32(Count)) {
+ // When Count is a power of 2 we don't need BECount for the epilog case,
+ // however we'll need it for a branch around the unrolling loop in the
+ // prolog case.
+ ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+ // If ModVal is zero, we know that either:
+ // 1. There are no iterations to be run in the prolog/epilog loop.
+ // OR
+ // 2. The addition computing TripCount overflowed.
+ //
+ // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+ // the number of iterations that remain to be run in the original loop is a
+ // multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
+ // explicitly check this above).
+ } else {
+ // As (BECount + 1) can potentially overflow in unsigned arithmetic, we compute
+ // (BECount % Count) + 1, which is overflow safe since BECount % Count < Count.
+ Value *ModValTmp = B.CreateURem(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count));
+ Value *ModValAdd = B.CreateAdd(ModValTmp,
+ ConstantInt::get(ModValTmp->getType(), 1));
+ // At that point (BECount % Count) + 1 could be equal to Count.
+ // To handle this case we need to take mod by Count one more time.
+ ModVal = B.CreateURem(ModValAdd,
+ ConstantInt::get(BECount->getType(), Count),
+ "xtraiter");
+ }
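// A plain-C++ check of the two xtraiter computations above, using 8-bit values
// so the overflow case is easy to exercise; the numbers are illustrative only.
#include <cstdint>
#include <cstdio>

int main() {
  // General (non-power-of-2) path: (BECount % Count + 1) % Count equals
  // (BECount + 1) % Count when BECount + 1 does not wrap, and stays correct
  // when it does, whereas the naive (BECount + 1) % Count would compute 0.
  const uint8_t Count = 6;
  const uint8_t BECount = 255; // BECount + 1 wraps to 0 in 8 bits
  unsigned ModVal = (BECount % Count + 1) % Count;
  std::printf("xtraiter = %u\n", ModVal); // 4, i.e. 256 mod 6

  // Power-of-2 path: masking with Count - 1 yields the same remainder.
  const uint8_t Pow2Count = 8;
  const uint8_t TripCount = 100;
  std::printf("xtraiter = %u\n", (unsigned)(TripCount & (Pow2Count - 1))); // 4
  return 0;
}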
+ Value *BranchVal =
+ UseEpilogRemainder ? B.CreateICmpULT(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count - 1)) :
+ B.CreateIsNotNull(ModVal, "lcmp.mod");
+ BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+ BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
+ // Branch to either remainder (extra iterations) loop or unrolling loop.
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
PreHeaderBR->eraseFromParent();
Function *F = Header->getParent();
// Get an ordered list of blocks in the loop to help with the ordering of the
- // cloned blocks in the prolog code
+ // cloned blocks in the prolog/epilog code
LoopBlocksDFS LoopBlocks(L);
LoopBlocks.perform(LI);
@@ -382,34 +599,80 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
std::vector<BasicBlock *> NewBlocks;
ValueToValueMapTy VMap;
- bool UnrollPrologue = Count == 2;
+ // For unroll factor 2 the remainder loop will have 1 iteration.
+ // Do not create a 1-iteration loop.
+ bool CreateRemainderLoop = (Count != 2);
// Clone all the basic blocks in the loop. If Count is 2, we don't clone
// the loop, otherwise we create a cloned loop to execute the extra
// iterations. This function adds the appropriate CFG connections.
- CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
- VMap, LI);
-
- // Insert the cloned blocks into function just before the original loop
- F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(),
- NewBlocks[0]->getIterator(), F->end());
-
- // Rewrite the cloned instruction operands to use the values
- // created when the clone is created.
- for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
- for (BasicBlock::iterator I = NewBlocks[i]->begin(),
- E = NewBlocks[i]->end();
- I != E; ++I) {
- RemapInstruction(&*I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
+ BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+ CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
+ InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
+
+ // Insert the cloned blocks into the function.
+ F->getBasicBlockList().splice(InsertBot->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(),
+ F->end());
+
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // NewPreHeader PrologPreHeader
+ // Header PrologHeader
+ // ... ...
+ // Latch PrologLatch
+ // NewExit PrologExit
+ // EpilogPreHeader NewPreHeader
+ // EpilogHeader Header
+ // ... ...
+ // EpilogLatch Latch
+ // Exit Exit
+
+ // Rewrite the cloned instruction operands to use the values created when the
+ // clone is created.
+ for (BasicBlock *BB : NewBlocks) {
+ for (Instruction &I : *BB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
}
}
- // Connect the prolog code to the original loop and update the
- // PHI functions.
- BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
- ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI,
- PreserveLCSSA);
+ if (UseEpilogRemainder) {
+ // Connect the epilog code to the original loop and update the
+ // PHI functions.
+ ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
+ EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+ PreserveLCSSA);
+
+ // Update the counter of the unrolled loop: its new trip count,
+ // unroll_iter = TripCount - ModVal, is a multiple of Count.
+ IRBuilder<> B2(NewPreHeader->getTerminator());
+ Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ B2.SetInsertPoint(LatchBR);
+ PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+ Header->getFirstNonPHI());
+ Value *IdxSub =
+ B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".nsub");
+ Value *IdxCmp;
+ if (LatchBR->getSuccessor(0) == Header)
+ IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+ else
+ IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+ NewIdx->addIncoming(TestVal, NewPreHeader);
+ NewIdx->addIncoming(IdxSub, Latch);
+ LatchBR->setCondition(IdxCmp);
+ } else {
+ // Connect the prolog code to the original loop and update the
+ // PHI functions.
+ ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
+ VMap, DT, LI, PreserveLCSSA);
+ }
NumRuntimeUnrolled++;
return true;
}
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index fa958e913b7b..3902c67c6a01 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -11,13 +11,20 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -423,7 +430,7 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
default:
return InstDesc(false, I);
case Instruction::PHI:
- return InstDesc(I, Prev.getMinMaxKind());
+ return InstDesc(I, Prev.getMinMaxKind(), Prev.getUnsafeAlgebraInst());
case Instruction::Sub:
case Instruction::Add:
return InstDesc(Kind == RK_IntegerAdd, I);
@@ -466,12 +473,10 @@ bool RecurrenceDescriptor::hasMultipleUsesOf(
bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes) {
- bool HasFunNoNaNAttr = false;
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
- if (F.hasFnAttribute("no-nans-fp-math"))
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ bool HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
@@ -514,6 +519,43 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
return false;
}
+bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
+ DominatorTree *DT) {
+
+ // Ensure the phi node is in the loop header and has two incoming values.
+ if (Phi->getParent() != TheLoop->getHeader() ||
+ Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // Ensure the loop has a preheader and a single latch block. The loop
+ // vectorizer will need the latch to set up the next iteration of the loop.
+ auto *Preheader = TheLoop->getLoopPreheader();
+ auto *Latch = TheLoop->getLoopLatch();
+ if (!Preheader || !Latch)
+ return false;
+
+ // Ensure the phi node's incoming blocks are the loop preheader and latch.
+ if (Phi->getBasicBlockIndex(Preheader) < 0 ||
+ Phi->getBasicBlockIndex(Latch) < 0)
+ return false;
+
+ // Get the previous value. The previous value comes from the latch edge while
+ // the initial value comes from the preheader edge.
+ auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+ if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
+ return false;
+
+ // Ensure every user of the phi node is dominated by the previous value. The
+ // dominance requirement ensures the loop vectorizer will not need to
+ // vectorize the initial value prior to the first iteration of the loop.
+ for (User *U : Phi->users())
+ if (auto *I = dyn_cast<Instruction>(U))
+ if (!DT->dominates(Previous, I))
+ return false;
+
+ return true;
+}
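// A plain-C++ example of the pattern isFirstOrderRecurrence() recognizes: a
// value computed in iteration i-1 is used in iteration i. The arrays and
// constants here are made up for illustration.
#include <cstdio>

int main() {
  int In[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int Out[8];
  int Prev = 0;             // becomes the header PHI: [0, preheader], [X, latch]
  for (int I = 0; I < 8; ++I) {
    int X = In[I];          // the value carried into the next iteration
    Out[I] = Prev + X;      // the only use of the PHI, dominated by X's def
    Prev = X;               // fed back along the latch edge
  }
  for (int V : Out)
    std::printf("%d ", V);  // 1 3 5 7 9 11 13 15
  std::printf("\n");
  return 0;
}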
+
/// This function returns the identity element (or neutral element) for
/// the operation K.
Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K,
@@ -612,61 +654,120 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
}
InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
- ConstantInt *Step)
- : StartValue(Start), IK(K), StepValue(Step) {
+ const SCEV *Step)
+ : StartValue(Start), IK(K), Step(Step) {
assert(IK != IK_NoInduction && "Not an induction");
+
+ // Start value type should match the induction kind and the value
+ // itself should not be null.
assert(StartValue && "StartValue is null");
- assert(StepValue && !StepValue->isZero() && "StepValue is zero");
assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
"StartValue is not a pointer for pointer induction");
assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
"StartValue is not an integer for integer induction");
- assert(StepValue->getType()->isIntegerTy() &&
- "StepValue is not an integer");
+
+ // Check the step value. It should be a non-zero integer value.
+ assert((!getConstIntStepValue() || !getConstIntStepValue()->isZero()) &&
+ "Step value is zero");
+
+ assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
+ "Step value should be constant for pointer induction");
+ assert(Step->getType()->isIntegerTy() && "StepValue is not an integer");
}
int InductionDescriptor::getConsecutiveDirection() const {
- if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
- return StepValue->getSExtValue();
+ ConstantInt *ConstStep = getConstIntStepValue();
+ if (ConstStep && (ConstStep->isOne() || ConstStep->isMinusOne()))
+ return ConstStep->getSExtValue();
return 0;
}
-Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const {
+ConstantInt *InductionDescriptor::getConstIntStepValue() const {
+ if (isa<SCEVConstant>(Step))
+ return dyn_cast<ConstantInt>(cast<SCEVConstant>(Step)->getValue());
+ return nullptr;
+}
+
+Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
+ ScalarEvolution *SE,
+ const DataLayout& DL) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
switch (IK) {
- case IK_IntInduction:
+ case IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
"Index type does not match StartValue type");
- if (StepValue->isMinusOne())
- return B.CreateSub(StartValue, Index);
- if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
- return B.CreateAdd(StartValue, Index);
- case IK_PtrInduction:
- assert(Index->getType() == StepValue->getType() &&
+ // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
+ // and calculate (Start + Index * Step) for all cases, without
+ // special handling for "isOne" and "isMinusOne".
+ // But in practice the resulting code gets worse: we mix SCEV
+ // expressions with ADD/SUB operations and end up with redundant
+ // intermediate values computed in different ways, which
+ // InstCombine is unable to reduce.
+
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isOne())
+ return B.CreateAdd(StartValue, Index);
+ const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
+ SE->getMulExpr(Step, SE->getSCEV(Index)));
+ return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
+ }
+ case IK_PtrInduction: {
+ assert(Index->getType() == Step->getType() &&
"Index type does not match StepValue type");
- if (StepValue->isMinusOne())
- Index = B.CreateNeg(Index);
- else if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
+ Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
return B.CreateGEP(nullptr, StartValue, Index);
-
+ }
case IK_NoInduction:
return nullptr;
}
llvm_unreachable("invalid enum");
}
-bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
- InductionDescriptor &D) {
+bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+ PredicatedScalarEvolution &PSE,
+ InductionDescriptor &D,
+ bool Assume) {
+ Type *PhiTy = Phi->getType();
+ // We only handle integer and pointer inductions variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return false;
+
+ const SCEV *PhiScev = PSE.getSCEV(Phi);
+ const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
+ // We need this expression to be an AddRecExpr.
+ if (Assume && !AR)
+ AR = PSE.getAsAddRec(Phi);
+
+ if (!AR) {
+ DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ return false;
+ }
+
+ return isInductionPHI(Phi, PSE.getSE(), D, AR);
+}
+
+bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+ ScalarEvolution *SE,
+ InductionDescriptor &D,
+ const SCEV *Expr) {
Type *PhiTy = Phi->getType();
// We only handle integer and pointer inductions variables.
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
return false;
// Check that the PHI is consecutive.
- const SCEV *PhiScev = SE->getSCEV(Phi);
+ const SCEV *PhiScev = Expr ? Expr : SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
@@ -678,17 +779,22 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
const SCEV *Step = AR->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
- if (!C)
+ // The stride may be a constant or a loop invariant integer value.
+ const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
+ if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop()))
return false;
- ConstantInt *CV = C->getValue();
if (PhiTy->isIntegerTy()) {
- D = InductionDescriptor(StartValue, IK_IntInduction, CV);
+ D = InductionDescriptor(StartValue, IK_IntInduction, Step);
return true;
}
assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ // Pointer induction should be a constant.
+ if (!ConstStep)
+ return false;
+
+ ConstantInt *CV = ConstStep->getValue();
Type *PointerElementType = PhiTy->getPointerElementType();
// The pointer stride cannot be determined if the pointer element type is not
// sized.
@@ -703,8 +809,8 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
int64_t CVSize = CV->getSExtValue();
if (CVSize % Size)
return false;
- auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size);
-
+ auto *StepValue = SE->getConstant(CV->getType(), CVSize / Size,
+ true /* signed */);
D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue);
return true;
}
@@ -727,3 +833,137 @@ SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
return UsedOutside;
}
+
+void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
+ // By definition, all loop passes need the LoopInfo analysis and the
+ // Dominator tree it depends on. Because they all participate in the loop
+ // pass manager, they must also preserve these.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
+ // here because users shouldn't directly get them from this header.
+ extern char &LoopSimplifyID;
+ extern char &LCSSAID;
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+
+ // Loop passes are designed to run inside of a loop pass manager which means
+ // that any function analyses they require must be required by the first loop
+ // pass in the manager (so that it is computed before the loop pass manager
+ // runs) and preserved by all loop passes in the manager. To make this
+ // reasonably robust, the set needed for most loop passes is maintained here.
+ // If your loop pass requires an analysis not listed here, you will need to
+ // carefully audit the loop pass manager nesting structure that results.
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+}
+
+/// Manually defined generic "LoopPass" dependency initialization. This is used
+/// to initialize the exact set of passes from above in \c
+/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
+/// with:
+///
+/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
+///
+/// As-if "LoopPass" were a pass.
+void llvm::initializeLoopPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+}
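// A minimal sketch of how a legacy loop pass would consume the two helpers
// above. "MyLoopPass" and its registration strings are hypothetical, and the
// header locations are assumed as of this revision.
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

namespace llvm {
void initializeMyLoopPassPass(PassRegistry &); // normally in InitializePasses.h
}

namespace {
struct MyLoopPass : public LoopPass {
  static char ID;
  MyLoopPass() : LoopPass(ID) {}

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    return false; // no transformation in this sketch
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Requires and preserves DT, LI, LoopSimplify, LCSSA, AA and SCEV, exactly
    // as listed in getLoopAnalysisUsage() above.
    getLoopAnalysisUsage(AU);
  }
};
}
char MyLoopPass::ID = 0;

INITIALIZE_PASS_BEGIN(MyLoopPass, "my-loop-pass", "My Loop Pass", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass) // expands to initializeLoopPassPass(...)
INITIALIZE_PASS_END(MyLoopPass, "my-loop-pass", "My Loop Pass", false, false)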
+
+/// \brief Find string metadata for a loop.
+///
+/// If it has a value (e.g. {"llvm.distribute", 1}), return the value as an
+/// operand, or null otherwise. If the string metadata is not found, return
+/// Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
+ StringRef Name) {
+ MDNode *LoopID = TheLoop->getLoopID();
+ // Return None if LoopID is null.
+ if (!LoopID)
+ return None;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ // Iterate over LoopID operands and look for MDString Metadata
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+ // If the MDString matches the requested name, return the requested operand.
+ if (Name.equals(S->getString()))
+ switch (MD->getNumOperands()) {
+ case 1:
+ return nullptr;
+ case 2:
+ return &MD->getOperand(1);
+ default:
+ llvm_unreachable("loop metadata has 0 or 1 operand");
+ }
+ }
+ return None;
+}
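// A short usage sketch for the helper above; "llvm.distribute" is only the
// example name from the comment, and the wrapper function is hypothetical.
static bool loopHasStringMD(Loop *L, StringRef Name) {
  Optional<const MDOperand *> Op = findStringMetadataForLoop(L, Name);
  if (!Op)
    return false;              // no {Name, ...} entry in the loop metadata
  if (const MDOperand *Val = *Op) {
    // The entry carried a value, e.g. {"llvm.distribute", 1}.
    if (auto *CI = mdconst::dyn_extract<ConstantInt>(*Val))
      return !CI->isZero();
  }
  return true;                 // present without a value
}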
+
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once.
+bool llvm::isGuaranteedToExecute(const Instruction &Inst,
+ const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ // We have to check to make sure that the instruction dominates all
+ // of the exit blocks. If it doesn't, then there is a path out of the loop
+ // which does not execute this instruction, so we can't hoist it.
+
+ // If the instruction is in the header block for the loop (which is very
+ // common), it is always guaranteed to dominate the exit blocks. Since this
+ // is a common case, and can save some work, check it now.
+ if (Inst.getParent() == CurLoop->getHeader())
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst.
+ return !SafetyInfo->HeaderMayThrow;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (SafetyInfo->MayThrow)
+ return false;
+
+ // Get the exit blocks for the current loop.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ // Verify that the block dominates each of the exit blocks of the loop.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (!DT->dominates(Inst.getParent(), ExitBlock))
+ return false;
+
+ // As a degenerate case, if the loop is statically infinite then we haven't
+ // proven anything since there are no exit blocks.
+ if (ExitBlocks.empty())
+ return false;
+
+ // FIXME: In general, we have to prove that the loop isn't an infinite loop.
+ // See http://llvm.org/PR24078. (The "ExitBlocks.empty()" check above is
+ // just a special case of this.)
+ return true;
+}
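// A plain-C++ illustration of the property being checked: a statement placed
// after a call that may throw is not guaranteed to execute on every entry into
// the loop body, while one placed before it is. The function and values are
// made up for the example.
#include <cstdio>
#include <stdexcept>

static void mayThrow(int X) {
  if (X > 2)
    throw std::runtime_error("bail out of the loop early");
}

int main() {
  int Sum = 0;
  try {
    for (int I = 0; I < 8; ++I) {
      Sum += I;   // guaranteed to execute whenever the body is entered
      mayThrow(I);
      Sum += 100; // NOT guaranteed: the throw above can skip it
    }
  } catch (const std::runtime_error &) {
  }
  std::printf("%d\n", Sum); // (0+100) + (1+100) + (2+100) + 3 = 306
  return 0;
}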
diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp
index 9a2a06cf6891..b3c61691da30 100644
--- a/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/lib/Transforms/Utils/LoopVersioning.cpp
@@ -18,11 +18,18 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
+static cl::opt<bool>
+ AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
+ cl::Hidden,
+ cl::desc("Add no-alias annotation for instructions that "
+ "are disambiguated by memchecks"));
+
LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
DominatorTree *DT, ScalarEvolution *SE,
bool UseLAIChecks)
@@ -32,12 +39,12 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
assert(L->getLoopPreheader() && "No preheader");
if (UseLAIChecks) {
setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
- setSCEVChecks(LAI.PSE.getUnionPredicate());
+ setSCEVChecks(LAI.getPSE().getUnionPredicate());
}
}
void LoopVersioning::setAliasChecks(
- const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
AliasChecks = std::move(Checks);
}
@@ -56,9 +63,8 @@ void LoopVersioning::versionLoop(
BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
std::tie(FirstCheckInst, MemRuntimeCheck) =
LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
- assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
- const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
+ const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
"scev.check");
SCEVRuntimeCheck =
@@ -71,7 +77,7 @@ void LoopVersioning::versionLoop(
if (MemRuntimeCheck && SCEVRuntimeCheck) {
RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "ldist.safe");
+ SCEVRuntimeCheck, "lver.safe");
if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
I->insertBefore(RuntimeCheckBB->getTerminator());
} else
@@ -119,16 +125,14 @@ void LoopVersioning::addPHINodes(
const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
assert(PHIBlock && "No single successor to loop exit block");
+ PHINode *PN;
+ // First add a single-operand PHI for each DefsUsedOutside if one does not
+ // exist yet.
for (auto *Inst : DefsUsedOutside) {
- auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
- PHINode *PN;
-
- // First see if we have a single-operand PHI with the value defined by the
+ // See if we have a single-operand PHI with the value defined by the
// original loop.
for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
if (PN->getIncomingValue(0) == Inst)
break;
}
@@ -141,7 +145,179 @@ void LoopVersioning::addPHINodes(
User->replaceUsesOfWith(Inst, PN);
PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
}
- // Add the new incoming value from the non-versioned loop.
- PN->addIncoming(NonVersionedLoopInst, NonVersionedLoop->getExitingBlock());
}
+
+ // Then for each PHI add the operand for the edge from the cloned loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have on predecessor");
+
+ // If the definition was cloned, use that; otherwise use the same value.
+ Value *ClonedValue = PN->getIncomingValue(0);
+ auto Mapped = VMap.find(ClonedValue);
+ if (Mapped != VMap.end())
+ ClonedValue = Mapped->second;
+
+ PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
+ }
+}
+
+void LoopVersioning::prepareNoAliasMetadata() {
+ // We need to turn the no-alias relation between pointer checking groups into
+ // no-aliasing annotations between instructions.
+ //
+ // We accomplish this by mapping each pointer checking group (a set of
+ // pointers memchecked together) to an alias scope and then also mapping each
+ // group to the list of scopes it can't alias.
+
+ const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+
+ // First allocate an aliasing scope for each pointer checking group.
+ //
+ // While traversing through the checking groups in the loop, also create a
+ // reverse map from pointers to the pointer checking group they were assigned
+ // to.
+ MDBuilder MDB(Context);
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
+
+ for (const auto &Group : RtPtrChecking->CheckingGroups) {
+ GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
+
+ for (unsigned PtrIdx : Group.Members)
+ PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
+ }
+
+ // Go through the checks and for each pointer group, collect the scopes for
+ // each non-aliasing pointer group.
+ DenseMap<const RuntimePointerChecking::CheckingPtrGroup *,
+ SmallVector<Metadata *, 4>>
+ GroupToNonAliasingScopes;
+
+ for (const auto &Check : AliasChecks)
+ GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
+
+ // Finally, transform the above into a mapping to scope lists, which is what
+ // the metadata uses.
+
+ for (auto Pair : GroupToNonAliasingScopes)
+ GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
+}
+
+void LoopVersioning::annotateLoopWithNoAlias() {
+ if (!AnnotateNoAlias)
+ return;
+
+ // First prepare the maps.
+ prepareNoAliasMetadata();
+
+ // Add the scope and no-alias metadata to the instructions.
+ for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
+ annotateInstWithNoAlias(I);
+ }
+}
+
+void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
+ const Instruction *OrigInst) {
+ if (!AnnotateNoAlias)
+ return;
+
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+ const Value *Ptr = isa<LoadInst>(OrigInst)
+ ? cast<LoadInst>(OrigInst)->getPointerOperand()
+ : cast<StoreInst>(OrigInst)->getPointerOperand();
+
+ // Find the group for the pointer and then add the scope metadata.
+ auto Group = PtrToGroup.find(Ptr);
+ if (Group != PtrToGroup.end()) {
+ VersionedInst->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Context, GroupToScope[Group->second])));
+
+ // Add the no-alias metadata.
+ auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
+ if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
+ VersionedInst->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_noalias),
+ NonAliasingScopeList->second));
+ }
+}
+
+namespace {
+/// \brief Also expose this as a pass. Currently this is only used for
+/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
+/// array accesses from the loop.
+class LoopVersioningPass : public FunctionPass {
+public:
+ LoopVersioningPass() : FunctionPass(ID) {
+ initializeLoopVersioningPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ // Build up a worklist of inner-loops to version. This is necessary as the
+ // act of versioning a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ const LoopAccessInfo &LAI = LAA->getInfo(L);
+ if (LAI.getNumRuntimePointerChecks() ||
+ !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ LoopVersioning LVer(LAI, L, LI, DT, SE);
+ LVer.versionLoop();
+ LVer.annotateLoopWithNoAlias();
+ Changed = true;
+ }
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+#define LVER_OPTION "loop-versioning"
+#define DEBUG_TYPE LVER_OPTION
+
+char LoopVersioningPass::ID;
+static const char LVer_name[] = "Loop Versioning";
+
+INITIALIZE_PASS_BEGIN(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopVersioningPass() {
+ return new LoopVersioningPass();
+}
}
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index b0ad4d5e84a1..1b31c5ae580a 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -14,14 +14,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "lowerinvoke"
@@ -53,8 +52,8 @@ FunctionPass *llvm::createLowerInvokePass() {
bool LowerInvoke::runOnFunction(Function &F) {
bool Changed = false;
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ for (BasicBlock &BB : F)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
// Insert a normal call instruction...
CallInst *NewCall = CallInst::Create(II->getCalledValue(),
@@ -69,10 +68,10 @@ bool LowerInvoke::runOnFunction(Function &F) {
BranchInst::Create(II->getNormalDest(), II);
// Remove any PHI node entries from the exception destination.
- II->getUnwindDest()->removePredecessor(&*BB);
+ II->getUnwindDest()->removePredecessor(&BB);
// Remove the invoke instruction now.
- BB->getInstList().erase(II);
+ BB.getInstList().erase(II);
++NumInvokes; Changed = true;
}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 52beb1542497..5c07469869ff 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -59,12 +59,6 @@ namespace {
bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // This is a cluster of orthogonal Transforms
- AU.addPreserved<UnifyFunctionExitNodes>();
- AU.addPreservedID(LowerInvokePassID);
- }
-
struct CaseRange {
ConstantInt* Low;
ConstantInt* High;
@@ -192,8 +186,8 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
}
// Remove incoming values in the reverse order to prevent invalidating
// *successive* index.
- for (auto III = Indices.rbegin(), IIE = Indices.rend(); III != IIE; ++III)
- PN->removeIncomingValue(*III);
+ for (unsigned III : reverse(Indices))
+ PN->removeIncomingValue(III);
}
}
diff --git a/lib/Transforms/Utils/Makefile b/lib/Transforms/Utils/Makefile
deleted file mode 100644
index d1e9336d67f0..000000000000
--- a/lib/Transforms/Utils/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/Utils/Makefile -----------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMTransformUtils
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index aa1e35ddba02..1419254bcb4f 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -12,12 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Mem2Reg.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
using namespace llvm;
@@ -26,51 +27,11 @@ using namespace llvm;
STATISTIC(NumPromoted, "Number of alloca's promoted");
-namespace {
- struct PromotePass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- PromotePass() : FunctionPass(ID) {
- initializePromotePassPass(*PassRegistry::getPassRegistry());
- }
-
- // runOnFunction - To run this pass, first we calculate the alloca
- // instructions that are safe for promotion, then we promote each one.
- //
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
- // This is a cluster of orthogonal Transforms
- AU.addPreserved<UnifyFunctionExitNodes>();
- AU.addPreservedID(LowerSwitchID);
- AU.addPreservedID(LowerInvokePassID);
- }
- };
-} // end of anonymous namespace
-
-char PromotePass::ID = 0;
-INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register",
- false, false)
-
-bool PromotePass::runOnFunction(Function &F) {
- std::vector<AllocaInst*> Allocas;
-
- BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
-
- if (F.hasFnAttribute(Attribute::OptimizeNone))
- return false;
-
- bool Changed = false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
+ AssumptionCache &AC) {
+ std::vector<AllocaInst *> Allocas;
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+ bool Changed = false;
while (1) {
Allocas.clear();
@@ -78,22 +39,69 @@ bool PromotePass::runOnFunction(Function &F) {
// Find allocas that are safe to promote, by looking at all instructions in
// the entry node
for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
if (isAllocaPromotable(AI))
Allocas.push_back(AI);
- if (Allocas.empty()) break;
+ if (Allocas.empty())
+ break;
PromoteMemToReg(Allocas, DT, nullptr, &AC);
NumPromoted += Allocas.size();
Changed = true;
}
-
return Changed;
}
+PreservedAnalyses PromotePass::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ if (!promoteMemoryToRegister(F, DT, AC))
+ return PreservedAnalyses::all();
+
+ // FIXME: This should also 'preserve the CFG'.
+ return PreservedAnalyses::none();
+}
+
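+// A minimal usage sketch with the new pass manager (the analysis registration
+// is assumed and not shown in this file):
+//
+//   FunctionAnalysisManager FAM;   // must provide DominatorTreeAnalysis
+//                                  // and AssumptionAnalysis
+//   FunctionPassManager FPM;
+//   FPM.addPass(PromotePass());
+//   FPM.run(F, FAM);
+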
+namespace {
+struct PromoteLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PromoteLegacyPass() : FunctionPass(ID) {
+ initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - To run this pass, first we calculate the alloca
+ // instructions that are safe for promotion, then we promote each one.
+ //
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return promoteMemoryToRegister(F, DT, AC);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+ };
+} // end of anonymous namespace
+
+char PromoteLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
+ "Register",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
+ false, false)
+
// createPromoteMemoryToRegister - Provide an entry point to create this pass.
//
FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
- return new PromotePass();
+ return new PromoteLegacyPass();
}
diff --git a/lib/Transforms/Utils/MemorySSA.cpp b/lib/Transforms/Utils/MemorySSA.cpp
new file mode 100644
index 000000000000..8ba3cae43b18
--- /dev/null
+++ b/lib/Transforms/Utils/MemorySSA.cpp
@@ -0,0 +1,1361 @@
+//===-- MemorySSA.cpp - Memory SSA Builder---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the MemorySSA class.
+//
+//===----------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "memoryssa"
+using namespace llvm;
+STATISTIC(NumClobberCacheLookups, "Number of Memory SSA version cache lookups");
+STATISTIC(NumClobberCacheHits, "Number of Memory SSA version cache hits");
+STATISTIC(NumClobberCacheInserts, "Number of MemorySSA version cache inserts");
+
+INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false,
+ true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false,
+ true)
+
+INITIALIZE_PASS_BEGIN(MemorySSAPrinterLegacyPass, "print-memoryssa",
+ "Memory SSA Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(MemorySSAPrinterLegacyPass, "print-memoryssa",
+ "Memory SSA Printer", false, false)
+
+static cl::opt<bool>
+ VerifyMemorySSA("verify-memoryssa", cl::init(false), cl::Hidden,
+ cl::desc("Verify MemorySSA in legacy printer pass."));
+
+namespace llvm {
+/// \brief An assembly annotator class to print Memory SSA information in
+/// comments.
+class MemorySSAAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class MemorySSA;
+ const MemorySSA *MSSA;
+
+public:
+ MemorySSAAnnotatedWriter(const MemorySSA *M) : MSSA(M) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) {
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(BB))
+ OS << "; " << *MA << "\n";
+ }
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
+ OS << "; " << *MA << "\n";
+ }
+};
+
+/// \brief A MemorySSAWalker that does AA walks and caching of lookups to
+/// disambiguate accesses.
+///
+/// FIXME: The current implementation of this can take quadratic space in rare
+/// cases. This can be fixed, but it is something to note until it is fixed.
+///
+/// In order to trigger this behavior, you need to store to N distinct locations
+/// (that AA can prove don't alias), perform M stores to other memory
+/// locations that AA can prove don't alias any of the initial N locations, and
+/// then load from all of the N locations. In this case, we insert M cache
+/// entries for each of the N loads.
+///
+/// For example:
+/// define i32 @foo() {
+/// %a = alloca i32, align 4
+/// %b = alloca i32, align 4
+/// store i32 0, i32* %a, align 4
+/// store i32 0, i32* %b, align 4
+///
+/// ; Insert M stores to other memory that doesn't alias %a or %b here
+///
+/// %c = load i32, i32* %a, align 4 ; Caches M entries in
+/// ; CachedUpwardsClobberingAccess for the
+/// ; MemoryLocation %a
+/// %d = load i32, i32* %b, align 4 ; Caches M entries in
+/// ; CachedUpwardsClobberingAccess for the
+/// ; MemoryLocation %b
+///
+/// ; For completeness' sake, loading %a or %b again would not cache *another*
+/// ; M entries.
+/// %r = add i32 %c, %d
+/// ret i32 %r
+/// }
+class MemorySSA::CachingWalker final : public MemorySSAWalker {
+public:
+ CachingWalker(MemorySSA *, AliasAnalysis *, DominatorTree *);
+ ~CachingWalker() override;
+
+ MemoryAccess *getClobberingMemoryAccess(const Instruction *) override;
+ MemoryAccess *getClobberingMemoryAccess(MemoryAccess *,
+ MemoryLocation &) override;
+ void invalidateInfo(MemoryAccess *) override;
+
+protected:
+ struct UpwardsMemoryQuery;
+ MemoryAccess *doCacheLookup(const MemoryAccess *, const UpwardsMemoryQuery &,
+ const MemoryLocation &);
+
+ void doCacheInsert(const MemoryAccess *, MemoryAccess *,
+ const UpwardsMemoryQuery &, const MemoryLocation &);
+
+ void doCacheRemove(const MemoryAccess *, const UpwardsMemoryQuery &,
+ const MemoryLocation &);
+
+private:
+ MemoryAccessPair UpwardsDFSWalk(MemoryAccess *, const MemoryLocation &,
+ UpwardsMemoryQuery &, bool);
+ MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &);
+ bool instructionClobbersQuery(const MemoryDef *, UpwardsMemoryQuery &,
+ const MemoryLocation &Loc) const;
+ void verifyRemoved(MemoryAccess *);
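+  // Non-call queries are cached per (access, location) pair; call queries
+  // have no location and are cached per access.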
+ SmallDenseMap<ConstMemoryAccessPair, MemoryAccess *>
+ CachedUpwardsClobberingAccess;
+ DenseMap<const MemoryAccess *, MemoryAccess *> CachedUpwardsClobberingCall;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+};
+}
+
+namespace {
+struct RenamePassData {
+ DomTreeNode *DTN;
+ DomTreeNode::const_iterator ChildIt;
+ MemoryAccess *IncomingVal;
+
+ RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It,
+ MemoryAccess *M)
+ : DTN(D), ChildIt(It), IncomingVal(M) {}
+ void swap(RenamePassData &RHS) {
+ std::swap(DTN, RHS.DTN);
+ std::swap(ChildIt, RHS.ChildIt);
+ std::swap(IncomingVal, RHS.IncomingVal);
+ }
+};
+}
+
+namespace llvm {
+/// \brief Rename a single basic block into MemorySSA form.
+/// Uses the standard SSA renaming algorithm.
+/// \returns The new incoming value.
+MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
+ MemoryAccess *IncomingVal) {
+ auto It = PerBlockAccesses.find(BB);
+ // Skip most processing if the list is empty.
+ if (It != PerBlockAccesses.end()) {
+ AccessList *Accesses = It->second.get();
+ for (MemoryAccess &L : *Accesses) {
+ switch (L.getValueID()) {
+ case Value::MemoryUseVal:
+ cast<MemoryUse>(&L)->setDefiningAccess(IncomingVal);
+ break;
+ case Value::MemoryDefVal:
+ // We can't legally optimize defs, because we only allow single
+ // memory phis/uses on operations, and if we optimize these, we can
+ // end up with multiple reaching defs. Uses do not have this
+ // problem, since they do not produce a value
+ cast<MemoryDef>(&L)->setDefiningAccess(IncomingVal);
+ IncomingVal = &L;
+ break;
+ case Value::MemoryPhiVal:
+ IncomingVal = &L;
+ break;
+ }
+ }
+ }
+
+ // Pass through values to our successors
+ for (const BasicBlock *S : successors(BB)) {
+ auto It = PerBlockAccesses.find(S);
+ // Rename the phi nodes in our successor block
+ if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
+ continue;
+ AccessList *Accesses = It->second.get();
+ auto *Phi = cast<MemoryPhi>(&Accesses->front());
+ Phi->addIncoming(IncomingVal, BB);
+ }
+
+ return IncomingVal;
+}
+
+/// \brief This is the standard SSA renaming algorithm.
+///
+/// We walk the dominator tree in preorder, renaming accesses, and then filling
+/// in phi nodes in our successors.
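+///
+/// For example, in a hypothetical diamond CFG (entry -> {then, else} ->
+/// merge), the MemoryPhi in merge receives one incoming value per
+/// predecessor, added when renameBlock visits that predecessor and pushes the
+/// current incoming value through to its successors.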
+void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
+ SmallPtrSet<BasicBlock *, 16> &Visited) {
+ SmallVector<RenamePassData, 32> WorkStack;
+ IncomingVal = renameBlock(Root->getBlock(), IncomingVal);
+ WorkStack.push_back({Root, Root->begin(), IncomingVal});
+ Visited.insert(Root->getBlock());
+
+ while (!WorkStack.empty()) {
+ DomTreeNode *Node = WorkStack.back().DTN;
+ DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt;
+ IncomingVal = WorkStack.back().IncomingVal;
+
+ if (ChildIt == Node->end()) {
+ WorkStack.pop_back();
+ } else {
+ DomTreeNode *Child = *ChildIt;
+ ++WorkStack.back().ChildIt;
+ BasicBlock *BB = Child->getBlock();
+ Visited.insert(BB);
+ IncomingVal = renameBlock(BB, IncomingVal);
+ WorkStack.push_back({Child, Child->begin(), IncomingVal});
+ }
+ }
+}
+
+/// \brief Compute dominator levels, used by the phi insertion algorithm above.
+void MemorySSA::computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels) {
+ for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode());
+ DFI != DFE; ++DFI)
+ DomLevels[*DFI] = DFI.getPathLength() - 1;
+}
+
+/// \brief This handles unreachable block accesses by deleting phi nodes in
+/// unreachable blocks, and marking all other unreachable MemoryAccess's as
+/// being uses of the live on entry definition.
+void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
+ assert(!DT->isReachableFromEntry(BB) &&
+ "Reachable block found while handling unreachable blocks");
+
+ // Make sure phi nodes in our reachable successors end up with a
+ // LiveOnEntryDef for our incoming edge, even though our block is forward
+ // unreachable. We could just disconnect these blocks from the CFG fully,
+ // but we do not right now.
+ for (const BasicBlock *S : successors(BB)) {
+ if (!DT->isReachableFromEntry(S))
+ continue;
+ auto It = PerBlockAccesses.find(S);
+ // Rename the phi nodes in our successor block
+ if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
+ continue;
+ AccessList *Accesses = It->second.get();
+ auto *Phi = cast<MemoryPhi>(&Accesses->front());
+ Phi->addIncoming(LiveOnEntryDef.get(), BB);
+ }
+
+ auto It = PerBlockAccesses.find(BB);
+ if (It == PerBlockAccesses.end())
+ return;
+
+ auto &Accesses = It->second;
+ for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) {
+ auto Next = std::next(AI);
+ // If we have a phi, just remove it. We are going to replace all
+ // users with live on entry.
+ if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(AI))
+ UseOrDef->setDefiningAccess(LiveOnEntryDef.get());
+ else
+ Accesses->erase(AI);
+ AI = Next;
+ }
+}
+
+MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
+ : AA(AA), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
+ NextID(0) {
+ buildMemorySSA();
+}
+
+MemorySSA::MemorySSA(MemorySSA &&MSSA)
+ : AA(MSSA.AA), DT(MSSA.DT), F(MSSA.F),
+ ValueToMemoryAccess(std::move(MSSA.ValueToMemoryAccess)),
+ PerBlockAccesses(std::move(MSSA.PerBlockAccesses)),
+ LiveOnEntryDef(std::move(MSSA.LiveOnEntryDef)),
+ Walker(std::move(MSSA.Walker)), NextID(MSSA.NextID) {
+ // Update the Walker MSSA pointer so it doesn't point to the moved-from MSSA
+ // object any more.
+ Walker->MSSA = this;
+}
+
+MemorySSA::~MemorySSA() {
+ // Drop all our references
+ for (const auto &Pair : PerBlockAccesses)
+ for (MemoryAccess &MA : *Pair.second)
+ MA.dropAllReferences();
+}
+
+MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
+ auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr));
+
+ if (Res.second)
+ Res.first->second = make_unique<AccessList>();
+ return Res.first->second.get();
+}
+
+void MemorySSA::buildMemorySSA() {
+ // We create an access to represent "live on entry", for things like
+ // arguments or users of globals, where the memory they use is defined before
+ // the beginning of the function. We do not actually insert it into the IR.
+ // We do not define a live on exit for the immediate uses, and thus our
+ // semantics do *not* imply that something with no immediate uses can simply
+ // be removed.
+ BasicBlock &StartingPoint = F.getEntryBlock();
+ LiveOnEntryDef = make_unique<MemoryDef>(F.getContext(), nullptr, nullptr,
+ &StartingPoint, NextID++);
+
+ // We maintain lists of memory accesses per-block, trading memory for time. We
+ // could just look up the memory access for every possible instruction in the
+ // stream.
+ SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
+ SmallPtrSet<BasicBlock *, 32> DefUseBlocks;
+ // Go through each block, figure out where defs occur, and chain together all
+ // the accesses.
+ for (BasicBlock &B : F) {
+ bool InsertIntoDef = false;
+ AccessList *Accesses = nullptr;
+ for (Instruction &I : B) {
+ MemoryUseOrDef *MUD = createNewAccess(&I);
+ if (!MUD)
+ continue;
+ InsertIntoDef |= isa<MemoryDef>(MUD);
+
+ if (!Accesses)
+ Accesses = getOrCreateAccessList(&B);
+ Accesses->push_back(MUD);
+ }
+ if (InsertIntoDef)
+ DefiningBlocks.insert(&B);
+ if (Accesses)
+ DefUseBlocks.insert(&B);
+ }
+
+ // Compute live-in.
+ // Live in is normally defined as "all the blocks on the path from each def to
+  // each of its uses".
+ // MemoryDef's are implicit uses of previous state, so they are also uses.
+ // This means we don't really have def-only instructions. The only
+ // MemoryDef's that are not really uses are those that are of the LiveOnEntry
+ // variable (because LiveOnEntry can reach anywhere, and every def is a
+ // must-kill of LiveOnEntry).
+ // In theory, you could precisely compute live-in by using alias-analysis to
+ // disambiguate defs and uses to see which really pair up with which.
+ // In practice, this would be really expensive and difficult. So we simply
+ // assume all defs are also uses that need to be kept live.
+ // Because of this, the end result of this live-in computation will be "the
+ // entire set of basic blocks that reach any use".
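+  //
+  // For example (hypothetical CFG): with a store in %entry and a load in
+  // %exit, every block on any path from %entry to %exit ends up in
+  // LiveInBlocks, even if it contains no memory instructions itself.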
+
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(DefUseBlocks.begin(),
+ DefUseBlocks.end());
+ // Now that we have a set of blocks where a value is live-in, recursively add
+ // predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+    // The block really is live in here; insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+    // live into it too.
+ LiveInBlockWorklist.append(pred_begin(BB), pred_end(BB));
+ }
+
+ // Determine where our MemoryPhi's should go
+ ForwardIDFCalculator IDFs(*DT);
+ IDFs.setDefiningBlocks(DefiningBlocks);
+ IDFs.setLiveInBlocks(LiveInBlocks);
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ IDFs.calculate(IDFBlocks);
+
+ // Now place MemoryPhi nodes.
+ for (auto &BB : IDFBlocks) {
+ // Insert phi node
+ AccessList *Accesses = getOrCreateAccessList(BB);
+ MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
+ ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
+    // Phis are always placed at the front of the block.
+ Accesses->push_front(Phi);
+ }
+
+ // Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
+ // filled in with all blocks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+
+ MemorySSAWalker *Walker = getWalker();
+
+ // Now optimize the MemoryUse's defining access to point to the nearest
+ // dominating clobbering def.
+ // This ensures that MemoryUse's that are killed by the same store are
+ // immediate users of that store, one of the invariants we guarantee.
+ for (auto DomNode : depth_first(DT)) {
+ BasicBlock *BB = DomNode->getBlock();
+ auto AI = PerBlockAccesses.find(BB);
+ if (AI == PerBlockAccesses.end())
+ continue;
+ AccessList *Accesses = AI->second.get();
+ for (auto &MA : *Accesses) {
+ if (auto *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Inst = MU->getMemoryInst();
+ MU->setDefiningAccess(Walker->getClobberingMemoryAccess(Inst));
+ }
+ }
+ }
+
+ // Mark the uses in unreachable blocks as live on entry, so that they go
+ // somewhere.
+ for (auto &BB : F)
+ if (!Visited.count(&BB))
+ markUnreachableAsLiveOnEntry(&BB);
+}
+
+MemorySSAWalker *MemorySSA::getWalker() {
+ if (Walker)
+ return Walker.get();
+
+ Walker = make_unique<CachingWalker>(this, AA, DT);
+ return Walker.get();
+}
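+
+// A minimal client sketch (hypothetical; LI stands for some load in F):
+//
+//   MemorySSA MSSA(F, &AA, &DT);
+//   MemoryAccess *Clobber =
+//       MSSA.getWalker()->getClobberingMemoryAccess(LI);
+//   if (MSSA.isLiveOnEntryDef(Clobber))
+//     ...; // LI reads memory that no access in F clobbers before it.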
+
+MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) {
+ assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB");
+ AccessList *Accesses = getOrCreateAccessList(BB);
+ MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
+ ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
+  // Phis are always placed at the front of the block.
+ Accesses->push_front(Phi);
+ return Phi;
+}
+
+MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
+ MemoryAccess *Definition) {
+ assert(!isa<PHINode>(I) && "Cannot create a defined access for a PHI");
+ MemoryUseOrDef *NewAccess = createNewAccess(I);
+ assert(
+ NewAccess != nullptr &&
+ "Tried to create a memory access for a non-memory touching instruction");
+ NewAccess->setDefiningAccess(Definition);
+ return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
+ MemoryAccess *Definition,
+ const BasicBlock *BB,
+ InsertionPlace Point) {
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(BB);
+ if (Point == Beginning) {
+ // It goes after any phi nodes
+ auto AI = std::find_if(
+ Accesses->begin(), Accesses->end(),
+ [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
+
+ Accesses->insert(AI, NewAccess);
+ } else {
+ Accesses->push_back(NewAccess);
+ }
+
+ return NewAccess;
+}
+MemoryAccess *MemorySSA::createMemoryAccessBefore(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+ Accesses->insert(AccessList::iterator(InsertPt), NewAccess);
+ return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessAfter(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+ Accesses->insertAfter(AccessList::iterator(InsertPt), NewAccess);
+ return NewAccess;
+}
+
+/// \brief Helper function to create new memory accesses
+MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
+ // The assume intrinsic has a control dependency which we model by claiming
+ // that it writes arbitrarily. Ignore that fake memory dependency here.
+ // FIXME: Replace this special casing with a more accurate modelling of
+ // assume's control dependency.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ return nullptr;
+
+  // Find out what effect this instruction has on memory.
+ ModRefInfo ModRef = AA->getModRefInfo(I);
+ bool Def = bool(ModRef & MRI_Mod);
+ bool Use = bool(ModRef & MRI_Ref);
+
+ // It's possible for an instruction to not modify memory at all. During
+ // construction, we ignore them.
+ if (!Def && !Use)
+ return nullptr;
+
+ assert((Def || Use) &&
+ "Trying to create a memory access with a non-memory instruction");
+
+ MemoryUseOrDef *MUD;
+ if (Def)
+ MUD = new MemoryDef(I->getContext(), nullptr, I, I->getParent(), NextID++);
+ else
+ MUD = new MemoryUse(I->getContext(), nullptr, I, I->getParent());
+ ValueToMemoryAccess.insert(std::make_pair(I, MUD));
+ return MUD;
+}
+
+MemoryAccess *MemorySSA::findDominatingDef(BasicBlock *UseBlock,
+ enum InsertionPlace Where) {
+ // Handle the initial case
+ if (Where == Beginning)
+ // The only thing that could define us at the beginning is a phi node
+ if (MemoryPhi *Phi = getMemoryAccess(UseBlock))
+ return Phi;
+
+ DomTreeNode *CurrNode = DT->getNode(UseBlock);
+ // Need to be defined by our dominator
+ if (Where == Beginning)
+ CurrNode = CurrNode->getIDom();
+ Where = End;
+ while (CurrNode) {
+ auto It = PerBlockAccesses.find(CurrNode->getBlock());
+ if (It != PerBlockAccesses.end()) {
+ auto &Accesses = It->second;
+ for (MemoryAccess &RA : reverse(*Accesses)) {
+ if (isa<MemoryDef>(RA) || isa<MemoryPhi>(RA))
+ return &RA;
+ }
+ }
+ CurrNode = CurrNode->getIDom();
+ }
+ return LiveOnEntryDef.get();
+}
+
+/// \brief Returns true if \p Replacer dominates \p Replacee .
+bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
+ const MemoryAccess *Replacee) const {
+ if (isa<MemoryUseOrDef>(Replacee))
+ return DT->dominates(Replacer->getBlock(), Replacee->getBlock());
+ const auto *MP = cast<MemoryPhi>(Replacee);
+ // For a phi node, the use occurs in the predecessor block of the phi node.
+ // Since we may occur multiple times in the phi node, we have to check each
+ // operand to ensure Replacer dominates each operand where Replacee occurs.
+ for (const Use &Arg : MP->operands()) {
+ if (Arg.get() != Replacee &&
+ !DT->dominates(Replacer->getBlock(), MP->getIncomingBlock(Arg)))
+ return false;
+ }
+ return true;
+}
+
+/// \brief If all arguments of a MemoryPHI are defined by the same incoming
+/// argument, return that argument.
+static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
+ MemoryAccess *MA = nullptr;
+
+ for (auto &Arg : MP->operands()) {
+ if (!MA)
+ MA = cast<MemoryAccess>(Arg);
+ else if (MA != Arg)
+ return nullptr;
+ }
+ return MA;
+}
+
+/// \brief Properly remove \p MA from all of MemorySSA's lookup tables.
+///
+/// Because of the way the intrusive list and use lists work, it is important to
+/// do removal in the right order.
+void MemorySSA::removeFromLookups(MemoryAccess *MA) {
+ assert(MA->use_empty() &&
+ "Trying to remove memory access that still has uses");
+ if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA))
+ MUD->setDefiningAccess(nullptr);
+ // Invalidate our walker's cache if necessary
+ if (!isa<MemoryUse>(MA))
+ Walker->invalidateInfo(MA);
+ // The call below to erase will destroy MA, so we can't change the order we
+ // are doing things here
+ Value *MemoryInst;
+ if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA)) {
+ MemoryInst = MUD->getMemoryInst();
+ } else {
+ MemoryInst = MA->getBlock();
+ }
+ ValueToMemoryAccess.erase(MemoryInst);
+
+ auto AccessIt = PerBlockAccesses.find(MA->getBlock());
+ std::unique_ptr<AccessList> &Accesses = AccessIt->second;
+ Accesses->erase(MA);
+ if (Accesses->empty())
+ PerBlockAccesses.erase(AccessIt);
+}
+
+void MemorySSA::removeMemoryAccess(MemoryAccess *MA) {
+ assert(!isLiveOnEntryDef(MA) && "Trying to remove the live on entry def");
+ // We can only delete phi nodes if they have no uses, or we can replace all
+ // uses with a single definition.
+ MemoryAccess *NewDefTarget = nullptr;
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
+ // Note that it is sufficient to know that all edges of the phi node have
+ // the same argument. If they do, by the definition of dominance frontiers
+ // (which we used to place this phi), that argument must dominate this phi,
+ // and thus, must dominate the phi's uses, and so we will not hit the assert
+ // below.
+ NewDefTarget = onlySingleValue(MP);
+ assert((NewDefTarget || MP->use_empty()) &&
+ "We can't delete this memory phi");
+ } else {
+ NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
+ }
+
+ // Re-point the uses at our defining access
+ if (!MA->use_empty())
+ MA->replaceAllUsesWith(NewDefTarget);
+
+ // The call below to erase will destroy MA, so we can't change the order we
+ // are doing things here
+ removeFromLookups(MA);
+}
+
+void MemorySSA::print(raw_ostream &OS) const {
+ MemorySSAAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void MemorySSA::dump() const {
+ MemorySSAAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+void MemorySSA::verifyMemorySSA() const {
+ verifyDefUses(F);
+ verifyDomination(F);
+ verifyOrdering(F);
+}
+
+/// \brief Verify that the order and existence of MemoryAccesses matches the
+/// order and existence of memory affecting instructions.
+void MemorySSA::verifyOrdering(Function &F) const {
+ // Walk all the blocks, comparing what the lookups think and what the access
+ // lists think, as well as the order in the blocks vs the order in the access
+ // lists.
+ SmallVector<MemoryAccess *, 32> ActualAccesses;
+ for (BasicBlock &B : F) {
+ const AccessList *AL = getBlockAccesses(&B);
+ MemoryAccess *Phi = getMemoryAccess(&B);
+ if (Phi)
+ ActualAccesses.push_back(Phi);
+ for (Instruction &I : B) {
+ MemoryAccess *MA = getMemoryAccess(&I);
+ assert((!MA || AL) && "We have memory affecting instructions "
+ "in this block but they are not in the "
+ "access list");
+ if (MA)
+ ActualAccesses.push_back(MA);
+ }
+ // Either we hit the assert, really have no accesses, or we have both
+ // accesses and an access list
+ if (!AL)
+ continue;
+ assert(AL->size() == ActualAccesses.size() &&
+ "We don't have the same number of accesses in the block as on the "
+ "access list");
+ auto ALI = AL->begin();
+ auto AAI = ActualAccesses.begin();
+ while (ALI != AL->end() && AAI != ActualAccesses.end()) {
+ assert(&*ALI == *AAI && "Not the same accesses in the same order");
+ ++ALI;
+ ++AAI;
+ }
+ ActualAccesses.clear();
+ }
+}
+
+/// \brief Verify the domination properties of MemorySSA by checking that each
+/// definition dominates all of its uses.
+void MemorySSA::verifyDomination(Function &F) const {
+ for (BasicBlock &B : F) {
+ // Phi nodes are attached to basic blocks
+ if (MemoryPhi *MP = getMemoryAccess(&B)) {
+ for (User *U : MP->users()) {
+ BasicBlock *UseBlock;
+ // Phi operands are used on edges, we simulate the right domination by
+ // acting as if the use occurred at the end of the predecessor block.
+ if (MemoryPhi *P = dyn_cast<MemoryPhi>(U)) {
+ for (const auto &Arg : P->operands()) {
+ if (Arg == MP) {
+ UseBlock = P->getIncomingBlock(Arg);
+ break;
+ }
+ }
+ } else {
+ UseBlock = cast<MemoryAccess>(U)->getBlock();
+ }
+ (void)UseBlock;
+ assert(DT->dominates(MP->getBlock(), UseBlock) &&
+               "Memory PHI does not dominate its uses");
+ }
+ }
+
+ for (Instruction &I : B) {
+ MemoryAccess *MD = dyn_cast_or_null<MemoryDef>(getMemoryAccess(&I));
+ if (!MD)
+ continue;
+
+ for (User *U : MD->users()) {
+ BasicBlock *UseBlock;
+ (void)UseBlock;
+ // Things are allowed to flow to phi nodes over their predecessor edge.
+ if (auto *P = dyn_cast<MemoryPhi>(U)) {
+ for (const auto &Arg : P->operands()) {
+ if (Arg == MD) {
+ UseBlock = P->getIncomingBlock(Arg);
+ break;
+ }
+ }
+ } else {
+ UseBlock = cast<MemoryAccess>(U)->getBlock();
+ }
+ assert(DT->dominates(MD->getBlock(), UseBlock) &&
+               "Memory Def does not dominate its uses");
+ }
+ }
+ }
+}
+
+/// \brief Verify the def-use lists in MemorySSA, by verifying that \p Use
+/// appears in the use list of \p Def.
+///
+/// llvm_unreachable is used instead of asserts because this may be called in
+/// a build without asserts. In that case, we don't want this to turn into a
+/// nop.
+void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
+ // The live on entry use may cause us to get a NULL def here
+ if (!Def) {
+ if (!isLiveOnEntryDef(Use))
+      llvm_unreachable("Null def but use does not point to live on entry def");
+ } else if (std::find(Def->user_begin(), Def->user_end(), Use) ==
+ Def->user_end()) {
+ llvm_unreachable("Did not find use in def's use list");
+ }
+}
+
+/// \brief Verify the immediate use information, by walking all the memory
+/// accesses and verifying that, for each use, it appears in the
+/// appropriate def's use list
+void MemorySSA::verifyDefUses(Function &F) const {
+ for (BasicBlock &B : F) {
+ // Phi nodes are attached to basic blocks
+ if (MemoryPhi *Phi = getMemoryAccess(&B)) {
+ assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
+ pred_begin(&B), pred_end(&B))) &&
+ "Incomplete MemoryPhi Node");
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+ verifyUseInDefs(Phi->getIncomingValue(I), Phi);
+ }
+
+ for (Instruction &I : B) {
+ if (MemoryAccess *MA = getMemoryAccess(&I)) {
+ assert(isa<MemoryUseOrDef>(MA) &&
+ "Found a phi node not attached to a bb");
+ verifyUseInDefs(cast<MemoryUseOrDef>(MA)->getDefiningAccess(), MA);
+ }
+ }
+ }
+}
+
+MemoryAccess *MemorySSA::getMemoryAccess(const Value *I) const {
+ return ValueToMemoryAccess.lookup(I);
+}
+
+MemoryPhi *MemorySSA::getMemoryAccess(const BasicBlock *BB) const {
+ return cast_or_null<MemoryPhi>(getMemoryAccess((const Value *)BB));
+}
+
+/// \brief Determine, for two memory accesses in the same block,
+/// whether \p Dominator dominates \p Dominatee.
+/// \returns True if \p Dominator dominates \p Dominatee.
+bool MemorySSA::locallyDominates(const MemoryAccess *Dominator,
+ const MemoryAccess *Dominatee) const {
+
+ assert((Dominator->getBlock() == Dominatee->getBlock()) &&
+ "Asking for local domination when accesses are in different blocks!");
+
+ // A node dominates itself.
+ if (Dominatee == Dominator)
+ return true;
+
+ // When Dominatee is defined on function entry, it is not dominated by another
+ // memory access.
+ if (isLiveOnEntryDef(Dominatee))
+ return false;
+
+ // When Dominator is defined on function entry, it dominates the other memory
+ // access.
+ if (isLiveOnEntryDef(Dominator))
+ return true;
+
+ // Get the access list for the block
+ const AccessList *AccessList = getBlockAccesses(Dominator->getBlock());
+ AccessList::const_reverse_iterator It(Dominator->getIterator());
+
+ // If we hit the beginning of the access list before we hit dominatee, we must
+ // dominate it
+ return std::none_of(It, AccessList->rend(),
+ [&](const MemoryAccess &MA) { return &MA == Dominatee; });
+}
+
+const static char LiveOnEntryStr[] = "liveOnEntry";
+
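+// Printed forms (illustrative): "1 = MemoryDef(liveOnEntry)", "MemoryUse(1)",
+// and "2 = MemoryPhi({entry,1},{if.then,liveOnEntry})".
+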
+void MemoryDef::print(raw_ostream &OS) const {
+ MemoryAccess *UO = getDefiningAccess();
+
+ OS << getID() << " = MemoryDef(";
+ if (UO && UO->getID())
+ OS << UO->getID();
+ else
+ OS << LiveOnEntryStr;
+ OS << ')';
+}
+
+void MemoryPhi::print(raw_ostream &OS) const {
+ bool First = true;
+ OS << getID() << " = MemoryPhi(";
+ for (const auto &Op : operands()) {
+ BasicBlock *BB = getIncomingBlock(Op);
+ MemoryAccess *MA = cast<MemoryAccess>(Op);
+ if (!First)
+ OS << ',';
+ else
+ First = false;
+
+ OS << '{';
+ if (BB->hasName())
+ OS << BB->getName();
+ else
+ BB->printAsOperand(OS, false);
+ OS << ',';
+ if (unsigned ID = MA->getID())
+ OS << ID;
+ else
+ OS << LiveOnEntryStr;
+ OS << '}';
+ }
+ OS << ')';
+}
+
+MemoryAccess::~MemoryAccess() {}
+
+void MemoryUse::print(raw_ostream &OS) const {
+ MemoryAccess *UO = getDefiningAccess();
+ OS << "MemoryUse(";
+ if (UO && UO->getID())
+ OS << UO->getID();
+ else
+ OS << LiveOnEntryStr;
+ OS << ')';
+}
+
+void MemoryAccess::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+char MemorySSAPrinterLegacyPass::ID = 0;
+
+MemorySSAPrinterLegacyPass::MemorySSAPrinterLegacyPass() : FunctionPass(ID) {
+ initializeMemorySSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
+}
+
+void MemorySSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+}
+
+bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSA.print(dbgs());
+ if (VerifyMemorySSA)
+ MSSA.verifyMemorySSA();
+ return false;
+}
+
+char MemorySSAAnalysis::PassID;
+
+MemorySSA MemorySSAAnalysis::run(Function &F, AnalysisManager<Function> &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ return MemorySSA(F, &AA, &DT);
+}
+
+PreservedAnalyses MemorySSAPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ OS << "MemorySSA for function: " << F.getName() << "\n";
+ AM.getResult<MemorySSAAnalysis>(F).print(OS);
+
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses MemorySSAVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AM.getResult<MemorySSAAnalysis>(F).verifyMemorySSA();
+
+ return PreservedAnalyses::all();
+}
+
+char MemorySSAWrapperPass::ID = 0;
+
+MemorySSAWrapperPass::MemorySSAWrapperPass() : FunctionPass(ID) {
+ initializeMemorySSAWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void MemorySSAWrapperPass::releaseMemory() { MSSA.reset(); }
+
+void MemorySSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequiredTransitive<AAResultsWrapperPass>();
+}
+
+bool MemorySSAWrapperPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MSSA.reset(new MemorySSA(F, &AA, &DT));
+ return false;
+}
+
+void MemorySSAWrapperPass::verifyAnalysis() const { MSSA->verifyMemorySSA(); }
+
+void MemorySSAWrapperPass::print(raw_ostream &OS, const Module *M) const {
+ MSSA->print(OS);
+}
+
+MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
+
+MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
+ DominatorTree *D)
+ : MemorySSAWalker(M), AA(A), DT(D) {}
+
+MemorySSA::CachingWalker::~CachingWalker() {}
+
+struct MemorySSA::CachingWalker::UpwardsMemoryQuery {
+ // True if we saw a phi whose predecessor was a backedge
+ bool SawBackedgePhi;
+ // True if our original query started off as a call
+ bool IsCall;
+ // The pointer location we started the query with. This will be empty if
+ // IsCall is true.
+ MemoryLocation StartingLoc;
+ // This is the instruction we were querying about.
+ const Instruction *Inst;
+ // Set of visited Instructions for this query.
+ DenseSet<MemoryAccessPair> Visited;
+ // Vector of visited call accesses for this query. This is separated out
+  // because you can always cache and look up the result of call queries (i.e.
+  // when IsCall == true) for every call in the chain. The calls have no AA
+  // location associated with them, and thus no context dependence.
+ SmallVector<const MemoryAccess *, 32> VisitedCalls;
+ // The MemoryAccess we actually got called with, used to test local domination
+ const MemoryAccess *OriginalAccess;
+
+ UpwardsMemoryQuery()
+ : SawBackedgePhi(false), IsCall(false), Inst(nullptr),
+ OriginalAccess(nullptr) {}
+
+ UpwardsMemoryQuery(const Instruction *Inst, const MemoryAccess *Access)
+ : SawBackedgePhi(false), IsCall(ImmutableCallSite(Inst)), Inst(Inst),
+ OriginalAccess(Access) {}
+};
+
+void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
+
+ // TODO: We can do much better cache invalidation with differently stored
+ // caches. For now, for MemoryUses, we simply remove them
+ // from the cache, and kill the entire call/non-call cache for everything
+ // else. The problem is for phis or defs, currently we'd need to follow use
+ // chains down and invalidate anything below us in the chain that currently
+ // terminates at this access.
+
+ // See if this is a MemoryUse, if so, just remove the cached info. MemoryUse
+ // is by definition never a barrier, so nothing in the cache could point to
+ // this use. In that case, we only need invalidate the info for the use
+ // itself.
+
+ if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
+ UpwardsMemoryQuery Q;
+ Instruction *I = MU->getMemoryInst();
+ Q.IsCall = bool(ImmutableCallSite(I));
+ Q.Inst = I;
+ if (!Q.IsCall)
+ Q.StartingLoc = MemoryLocation::get(I);
+ doCacheRemove(MA, Q, Q.StartingLoc);
+ } else {
+ // If it is not a use, the best we can do right now is destroy the cache.
+ CachedUpwardsClobberingCall.clear();
+ CachedUpwardsClobberingAccess.clear();
+ }
+
+#ifdef EXPENSIVE_CHECKS
+ // Run this only when expensive checks are enabled.
+ verifyRemoved(MA);
+#endif
+}
+
+void MemorySSA::CachingWalker::doCacheRemove(const MemoryAccess *M,
+ const UpwardsMemoryQuery &Q,
+ const MemoryLocation &Loc) {
+ if (Q.IsCall)
+ CachedUpwardsClobberingCall.erase(M);
+ else
+ CachedUpwardsClobberingAccess.erase({M, Loc});
+}
+
+void MemorySSA::CachingWalker::doCacheInsert(const MemoryAccess *M,
+ MemoryAccess *Result,
+ const UpwardsMemoryQuery &Q,
+ const MemoryLocation &Loc) {
+ // This is fine for Phis, since there are times where we can't optimize them.
+ // Making a def its own clobber is never correct, though.
+ assert((Result != M || isa<MemoryPhi>(M)) &&
+ "Something can't clobber itself!");
+ ++NumClobberCacheInserts;
+ if (Q.IsCall)
+ CachedUpwardsClobberingCall[M] = Result;
+ else
+ CachedUpwardsClobberingAccess[{M, Loc}] = Result;
+}
+
+MemoryAccess *
+MemorySSA::CachingWalker::doCacheLookup(const MemoryAccess *M,
+ const UpwardsMemoryQuery &Q,
+ const MemoryLocation &Loc) {
+ ++NumClobberCacheLookups;
+ MemoryAccess *Result;
+
+ if (Q.IsCall)
+ Result = CachedUpwardsClobberingCall.lookup(M);
+ else
+ Result = CachedUpwardsClobberingAccess.lookup({M, Loc});
+
+ if (Result)
+ ++NumClobberCacheHits;
+ return Result;
+}
+
+bool MemorySSA::CachingWalker::instructionClobbersQuery(
+ const MemoryDef *MD, UpwardsMemoryQuery &Q,
+ const MemoryLocation &Loc) const {
+ Instruction *DefMemoryInst = MD->getMemoryInst();
+ assert(DefMemoryInst && "Defining instruction not actually an instruction");
+
+ if (!Q.IsCall)
+ return AA->getModRefInfo(DefMemoryInst, Loc) & MRI_Mod;
+
+ // If this is a call, mark it for caching
+ if (ImmutableCallSite(DefMemoryInst))
+ Q.VisitedCalls.push_back(MD);
+ ModRefInfo I = AA->getModRefInfo(DefMemoryInst, ImmutableCallSite(Q.Inst));
+ return I != MRI_NoModRef;
+}
+
+MemoryAccessPair MemorySSA::CachingWalker::UpwardsDFSWalk(
+ MemoryAccess *StartingAccess, const MemoryLocation &Loc,
+ UpwardsMemoryQuery &Q, bool FollowingBackedge) {
+ MemoryAccess *ModifyingAccess = nullptr;
+
+ auto DFI = df_begin(StartingAccess);
+ for (auto DFE = df_end(StartingAccess); DFI != DFE;) {
+ MemoryAccess *CurrAccess = *DFI;
+ if (MSSA->isLiveOnEntryDef(CurrAccess))
+ return {CurrAccess, Loc};
+ // If this is a MemoryDef, check whether it clobbers our current query. This
+ // needs to be done before consulting the cache, because the cache reports
+ // the clobber for CurrAccess. If CurrAccess is a clobber for this query,
+ // and we ask the cache for information first, then we might skip this
+ // clobber, which is bad.
+ if (auto *MD = dyn_cast<MemoryDef>(CurrAccess)) {
+ // If we hit the top, stop following this path.
+ // While we can do lookups, we can't sanely do inserts here unless we were
+ // to track everything we saw along the way, since we don't know where we
+ // will stop.
+ if (instructionClobbersQuery(MD, Q, Loc)) {
+ ModifyingAccess = CurrAccess;
+ break;
+ }
+ }
+ if (auto CacheResult = doCacheLookup(CurrAccess, Q, Loc))
+ return {CacheResult, Loc};
+
+ // We need to know whether it is a phi so we can track backedges.
+ // Otherwise, walk all upward defs.
+ if (!isa<MemoryPhi>(CurrAccess)) {
+ ++DFI;
+ continue;
+ }
+
+#ifndef NDEBUG
+ // The loop below visits the phi's children for us. Because phis are the
+ // only things with multiple edges, skipping the children should always lead
+ // us to the end of the loop.
+ //
+ // Use a copy of DFI because skipChildren would kill our search stack, which
+ // would make caching anything on the way back impossible.
+ auto DFICopy = DFI;
+ assert(DFICopy.skipChildren() == DFE &&
+ "Skipping phi's children doesn't end the DFS?");
+#endif
+
+ const MemoryAccessPair PHIPair(CurrAccess, Loc);
+
+ // Don't try to optimize this phi again if we've already tried to do so.
+ if (!Q.Visited.insert(PHIPair).second) {
+ ModifyingAccess = CurrAccess;
+ break;
+ }
+
+ std::size_t InitialVisitedCallSize = Q.VisitedCalls.size();
+
+ // Recurse on PHI nodes, since we need to change locations.
+ // TODO: Allow graphtraits on pairs, which would turn this whole function
+ // into a normal single depth first walk.
+ MemoryAccess *FirstDef = nullptr;
+ for (auto MPI = upward_defs_begin(PHIPair), MPE = upward_defs_end();
+ MPI != MPE; ++MPI) {
+ bool Backedge =
+ !FollowingBackedge &&
+ DT->dominates(CurrAccess->getBlock(), MPI.getPhiArgBlock());
+
+ MemoryAccessPair CurrentPair =
+ UpwardsDFSWalk(MPI->first, MPI->second, Q, Backedge);
+ // All the phi arguments should reach the same point if we can bypass
+ // this phi. The alternative is that they hit this phi node, which
+ // means we can skip this argument.
+ if (FirstDef && CurrentPair.first != PHIPair.first &&
+ CurrentPair.first != FirstDef) {
+ ModifyingAccess = CurrAccess;
+ break;
+ }
+
+ if (!FirstDef)
+ FirstDef = CurrentPair.first;
+ }
+
+ // If we exited the loop early, go with the result it gave us.
+ if (!ModifyingAccess) {
+ assert(FirstDef && "Found a Phi with no upward defs?");
+ ModifyingAccess = FirstDef;
+ } else {
+ // If we can't optimize this Phi, then we can't safely cache any of the
+ // calls we visited when trying to optimize it. Wipe them out now.
+ Q.VisitedCalls.resize(InitialVisitedCallSize);
+ }
+ break;
+ }
+
+ if (!ModifyingAccess)
+ return {MSSA->getLiveOnEntryDef(), Q.StartingLoc};
+
+ const BasicBlock *OriginalBlock = StartingAccess->getBlock();
+ assert(DFI.getPathLength() > 0 && "We dropped our path?");
+ unsigned N = DFI.getPathLength();
+ // If we found a clobbering def, the last element in the path will be our
+ // clobber, so we don't want to cache that to itself. OTOH, if we optimized a
+ // phi, we can add the last thing in the path to the cache, since that won't
+ // be the result.
+ if (DFI.getPath(N - 1) == ModifyingAccess)
+ --N;
+ for (; N > 1; --N) {
+ MemoryAccess *CacheAccess = DFI.getPath(N - 1);
+ BasicBlock *CurrBlock = CacheAccess->getBlock();
+ if (!FollowingBackedge)
+ doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
+ if (DT->dominates(CurrBlock, OriginalBlock) &&
+ (CurrBlock != OriginalBlock || !FollowingBackedge ||
+ MSSA->locallyDominates(CacheAccess, StartingAccess)))
+ break;
+ }
+
+ // Cache everything else on the way back. The caller should cache
+ // StartingAccess for us.
+ for (; N > 1; --N) {
+ MemoryAccess *CacheAccess = DFI.getPath(N - 1);
+ doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
+ }
+
+ return {ModifyingAccess, Loc};
+}
+
+/// \brief Walk the use-def chains starting at \p MA and find
+/// the MemoryAccess that actually clobbers Loc.
+///
+/// \returns our clobbering memory access
+MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
+ MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
+ return UpwardsDFSWalk(StartingAccess, Q.StartingLoc, Q, false).first;
+}
+
+MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
+ MemoryAccess *StartingAccess, MemoryLocation &Loc) {
+ if (isa<MemoryPhi>(StartingAccess))
+ return StartingAccess;
+
+ auto *StartingUseOrDef = cast<MemoryUseOrDef>(StartingAccess);
+ if (MSSA->isLiveOnEntryDef(StartingUseOrDef))
+ return StartingUseOrDef;
+
+ Instruction *I = StartingUseOrDef->getMemoryInst();
+
+ // Conservatively, fences are always clobbers, so don't perform the walk if we
+ // hit a fence.
+ if (!ImmutableCallSite(I) && I->isFenceLike())
+ return StartingUseOrDef;
+
+ UpwardsMemoryQuery Q;
+ Q.OriginalAccess = StartingUseOrDef;
+ Q.StartingLoc = Loc;
+ Q.Inst = StartingUseOrDef->getMemoryInst();
+ Q.IsCall = false;
+
+ if (auto CacheResult = doCacheLookup(StartingUseOrDef, Q, Q.StartingLoc))
+ return CacheResult;
+
+ // Unlike the other function, do not walk to the def of a def, because we are
+ // handed something we already believe is the clobbering access.
+ MemoryAccess *DefiningAccess = isa<MemoryUse>(StartingUseOrDef)
+ ? StartingUseOrDef->getDefiningAccess()
+ : StartingUseOrDef;
+
+ MemoryAccess *Clobber = getClobberingMemoryAccess(DefiningAccess, Q);
+ // Only cache this if it wouldn't make Clobber point to itself.
+ if (Clobber != StartingAccess)
+ doCacheInsert(Q.OriginalAccess, Clobber, Q, Q.StartingLoc);
+ DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+ DEBUG(dbgs() << *StartingUseOrDef << "\n");
+ DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+ DEBUG(dbgs() << *Clobber << "\n");
+ return Clobber;
+}
+
+MemoryAccess *
+MemorySSA::CachingWalker::getClobberingMemoryAccess(const Instruction *I) {
+ // There should be no way to lookup an instruction and get a phi as the
+ // access, since we only map BB's to PHI's. So, this must be a use or def.
+ auto *StartingAccess = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(I));
+
+ bool IsCall = bool(ImmutableCallSite(I));
+
+  // We can't sanely do anything with fences; they conservatively
+ // clobber all memory, and have no locations to get pointers from to
+ // try to disambiguate.
+ if (!IsCall && I->isFenceLike())
+ return StartingAccess;
+
+ UpwardsMemoryQuery Q;
+ Q.OriginalAccess = StartingAccess;
+ Q.IsCall = IsCall;
+ if (!Q.IsCall)
+ Q.StartingLoc = MemoryLocation::get(I);
+ Q.Inst = I;
+ if (auto CacheResult = doCacheLookup(StartingAccess, Q, Q.StartingLoc))
+ return CacheResult;
+
+ // Start with the thing we already think clobbers this location
+ MemoryAccess *DefiningAccess = StartingAccess->getDefiningAccess();
+
+ // At this point, DefiningAccess may be the live on entry def.
+ // If it is, we will not get a better result.
+ if (MSSA->isLiveOnEntryDef(DefiningAccess))
+ return DefiningAccess;
+
+ MemoryAccess *Result = getClobberingMemoryAccess(DefiningAccess, Q);
+ // DFS won't cache a result for DefiningAccess. So, if DefiningAccess isn't
+ // our clobber, be sure that it gets a cache entry, too.
+ if (Result != DefiningAccess)
+ doCacheInsert(DefiningAccess, Result, Q, Q.StartingLoc);
+ doCacheInsert(Q.OriginalAccess, Result, Q, Q.StartingLoc);
+ // TODO: When this implementation is more mature, we may want to figure out
+ // what this additional caching buys us. It's most likely A Good Thing.
+ if (Q.IsCall)
+ for (const MemoryAccess *MA : Q.VisitedCalls)
+ if (MA != Result)
+ doCacheInsert(MA, Result, Q, Q.StartingLoc);
+
+ DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+ DEBUG(dbgs() << *DefiningAccess << "\n");
+ DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+ DEBUG(dbgs() << *Result << "\n");
+
+ return Result;
+}
+
+// Verify that MA doesn't exist in any of the caches.
+void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) {
+#ifndef NDEBUG
+ for (auto &P : CachedUpwardsClobberingAccess)
+ assert(P.first.first != MA && P.second != MA &&
+ "Found removed MemoryAccess in cache.");
+ for (auto &P : CachedUpwardsClobberingCall)
+ assert(P.first != MA && P.second != MA &&
+ "Found removed MemoryAccess in cache.");
+#endif // !NDEBUG
+}
+
+MemoryAccess *
+DoNothingMemorySSAWalker::getClobberingMemoryAccess(const Instruction *I) {
+ MemoryAccess *MA = MSSA->getMemoryAccess(I);
+ if (auto *Use = dyn_cast<MemoryUseOrDef>(MA))
+ return Use->getDefiningAccess();
+ return MA;
+}
+
+MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess(
+ MemoryAccess *StartingAccess, MemoryLocation &) {
+ if (auto *Use = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ return Use->getDefiningAccess();
+ return StartingAccess;
+}
+}
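A minimal sketch, not part of the patch, of how a client might use the caching walker added above; it assumes a MemorySSA instance has already been built for the function and that the header lives at llvm/Transforms/Utils/MemorySSA.h as in this revision.

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;

// Returns true if some access inside the function may clobber the load,
// i.e. the upwards walk does not reach the live-on-entry definition.
static bool loadMayBeClobberedLocally(MemorySSA &MSSA, LoadInst *LI) {
  MemorySSAWalker *Walker = MSSA.getWalker();
  MemoryAccess *Clobber = Walker->getClobberingMemoryAccess(LI);
  return !MSSA.isLiveOnEntryDef(Clobber);
}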
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 9ec28a3f3d47..eb9188518624 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -21,8 +20,8 @@
using namespace llvm;
-static void appendToGlobalArray(const char *Array,
- Module &M, Function *F, int Priority) {
+static void appendToGlobalArray(const char *Array, Module &M, Function *F,
+ int Priority, Constant *Data) {
IRBuilder<> IRB(M.getContext());
FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
@@ -31,15 +30,26 @@ static void appendToGlobalArray(const char *Array,
SmallVector<Constant *, 16> CurrentCtors;
StructType *EltTy;
if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
- // If there is a global_ctors array, use the existing struct type, which can
- // have 2 or 3 fields.
- ArrayType *ATy = cast<ArrayType>(GVCtor->getType()->getElementType());
- EltTy = cast<StructType>(ATy->getElementType());
+ ArrayType *ATy = cast<ArrayType>(GVCtor->getValueType());
+ StructType *OldEltTy = cast<StructType>(ATy->getElementType());
+ // Upgrade a 2-field global array type to the new 3-field format if needed.
+ if (Data && OldEltTy->getNumElements() < 3)
+ EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
+ IRB.getInt8PtrTy(), nullptr);
+ else
+ EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
unsigned n = Init->getNumOperands();
CurrentCtors.reserve(n + 1);
- for (unsigned i = 0; i != n; ++i)
- CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
+ for (unsigned i = 0; i != n; ++i) {
+ auto Ctor = cast<Constant>(Init->getOperand(i));
+ if (EltTy != OldEltTy)
+ Ctor = ConstantStruct::get(
+ EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ CurrentCtors.push_back(Ctor);
+ }
}
GVCtor->eraseFromParent();
} else {
@@ -54,7 +64,8 @@ static void appendToGlobalArray(const char *Array,
CSVals[1] = F;
// FIXME: Drop support for the two element form in LLVM 4.0.
if (EltTy->getNumElements() >= 3)
- CSVals[2] = llvm::Constant::getNullValue(IRB.getInt8PtrTy());
+ CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
+ : Constant::getNullValue(IRB.getInt8PtrTy());
Constant *RuntimeCtorInit =
ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
@@ -70,29 +81,12 @@ static void appendToGlobalArray(const char *Array,
GlobalValue::AppendingLinkage, NewInit, Array);
}
-void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) {
- appendToGlobalArray("llvm.global_ctors", M, F, Priority);
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
}
-void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) {
- appendToGlobalArray("llvm.global_dtors", M, F, Priority);
-}
-
-GlobalVariable *
-llvm::collectUsedGlobalVariables(Module &M, SmallPtrSetImpl<GlobalValue *> &Set,
- bool CompilerUsed) {
- const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
- GlobalVariable *GV = M.getGlobalVariable(Name);
- if (!GV || !GV->hasInitializer())
- return GV;
-
- const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) {
- Value *Op = Init->getOperand(I);
- GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
- Set.insert(G);
- }
- return GV;
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
}
Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
@@ -132,4 +126,3 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
}
return std::make_pair(Ctor, InitFunction);
}
-
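A usage sketch for the extended interface above, not part of the patch and assuming the matching declarations in llvm/Transforms/Utils/ModuleUtils.h. The new Data argument becomes the third, i8*-typed field of the created llvm.global_ctors entry, and passing nullptr keeps the previous behaviour.

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;

// Register Ctor at priority 0; CtorData is stored in the third field of the
// llvm.global_ctors entry that appendToGlobalCtors creates.
static void registerCtorWithData(Module &M, Function *Ctor,
                                 GlobalVariable *CtorData) {
  appendToGlobalCtors(M, Ctor, /*Priority=*/0, CtorData);
}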
diff --git a/lib/Transforms/Utils/NameAnonFunctions.cpp b/lib/Transforms/Utils/NameAnonFunctions.cpp
new file mode 100644
index 000000000000..c4f3839d8482
--- /dev/null
+++ b/lib/Transforms/Utils/NameAnonFunctions.cpp
@@ -0,0 +1,102 @@
+//===- NameAnonFunctions.cpp - Name anonymous functions ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements naming anonymous functions so that they can be
+// referred to by ThinLTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+// Compute a "unique" hash for the module based on the names of its public
+// functions and global variables.
+class ModuleHasher {
+ Module &TheModule;
+ std::string TheHash;
+
+public:
+ ModuleHasher(Module &M) : TheModule(M) {}
+
+ /// Return the lazily computed hash.
+ std::string &get() {
+ if (!TheHash.empty())
+ // Cache hit :)
+ return TheHash;
+
+ MD5 Hasher;
+ for (auto &F : TheModule) {
+ if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
+ continue;
+ auto Name = F.getName();
+ Hasher.update(Name);
+ }
+ for (auto &GV : TheModule.globals()) {
+ if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
+ continue;
+ auto Name = GV.getName();
+ Hasher.update(Name);
+ }
+
+ // Now return the result.
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ SmallString<32> Result;
+ MD5::stringifyResult(Hash, Result);
+ TheHash = Result.str();
+ return TheHash;
+ }
+};
+
+// Rename all the anon functions in the module
+bool llvm::nameUnamedFunctions(Module &M) {
+ bool Changed = false;
+ ModuleHasher ModuleHash(M);
+ int count = 0;
+ for (auto &F : M) {
+ if (F.hasName())
+ continue;
+ F.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
+ Changed = true;
+ }
+ return Changed;
+}
+
+namespace {
+
+// Simple pass that provides a name to every anon function.
+class NameAnonFunction : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ const char *getPassName() const override { return "Name Anon Functions"; }
+
+ explicit NameAnonFunction() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return nameUnamedFunctions(M); }
+};
+char NameAnonFunction::ID = 0;
+
+} // anonymous namespace
+
+INITIALIZE_PASS_BEGIN(NameAnonFunction, "name-anon-functions",
+ "Provide a name to nameless functions", false, false)
+INITIALIZE_PASS_END(NameAnonFunction, "name-anon-functions",
+ "Provide a name to nameless functions", false, false)
+
+namespace llvm {
+ModulePass *createNameAnonFunctionPass() { return new NameAnonFunction(); }
+}
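A hypothetical driver, not part of the patch, showing how the pass might be scheduled with the legacy pass manager; anonymous functions end up named anon.<md5-of-public-names>.<counter>, which is stable per module and therefore usable by ThinLTO.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"

namespace llvm {
ModulePass *createNameAnonFunctionPass(); // defined in the file above
}
using namespace llvm;

// Give every anonymous function in M a stable name before summary emission.
static void nameAnonymousFunctions(Module &M) {
  legacy::PassManager PM;
  PM.add(createNameAnonFunctionPass());
  PM.run(M);
}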
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index c4f9b9f61407..cbf385d56339 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -523,7 +523,7 @@ void PromoteMem2Reg::run() {
AllocaInfo Info;
LargeBlockInfo LBI;
- IDFCalculator IDF(DT);
+ ForwardIDFCalculator IDF(DT);
for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
AllocaInst *AI = Allocas[AllocaNum];
@@ -802,7 +802,8 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
// actually live-in here.
LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
LiveInBlockWorklist.pop_back();
- --i, --e;
+ --i;
+ --e;
break;
}
diff --git a/lib/Transforms/Utils/SanitizerStats.cpp b/lib/Transforms/Utils/SanitizerStats.cpp
new file mode 100644
index 000000000000..9afd175c10ed
--- /dev/null
+++ b/lib/Transforms/Utils/SanitizerStats.cpp
@@ -0,0 +1,108 @@
+//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements code generation for sanitizer statistics gathering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SanitizerStats.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
+ StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
+ EmptyModuleStatsTy = makeModuleStatsTy();
+
+ ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
+ GlobalValue::InternalLinkage, nullptr);
+}
+
+ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
+ return ArrayType::get(StatTy, Inits.size());
+}
+
+StructType *SanitizerStatReport::makeModuleStatsTy() {
+ return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
+ Type::getInt32Ty(M->getContext()),
+ makeModuleStatsArrayTy()});
+}
+
+void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
+ Function *F = B.GetInsertBlock()->getParent();
+ Module *M = F->getParent();
+ PointerType *Int8PtrTy = B.getInt8PtrTy();
+ IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
+ ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
+
+ Inits.push_back(ConstantArray::get(
+ StatTy,
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
+ kSanitizerStatKindBits)),
+ Int8PtrTy)}));
+
+ FunctionType *StatReportTy =
+ FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
+ Constant *StatReport = M->getOrInsertFunction(
+ "__sanitizer_stat_report", StatReportTy);
+
+ auto InitAddr = ConstantExpr::getGetElementPtr(
+ EmptyModuleStatsTy, ModuleStatsGV,
+ ArrayRef<Constant *>{
+ ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
+ ConstantInt::get(IntPtrTy, Inits.size() - 1),
+ });
+ B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
+}
+
+void SanitizerStatReport::finish() {
+ if (Inits.empty()) {
+ ModuleStatsGV->eraseFromParent();
+ return;
+ }
+
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
+ Type *VoidTy = Type::getVoidTy(M->getContext());
+
+ // Create a new ModuleStatsGV to replace the old one. We can't just set the
+ // old one's initializer because its type is different.
+ auto NewModuleStatsGV = new GlobalVariable(
+ *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
+ ConstantStruct::getAnon(
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantInt::get(Int32Ty, Inits.size()),
+ ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
+ ModuleStatsGV->replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
+ ModuleStatsGV->eraseFromParent();
+
+ // Create a global constructor to register NewModuleStatsGV.
+ auto F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage, "", M);
+ auto BB = BasicBlock::Create(M->getContext(), "", F);
+ IRBuilder<> B(BB);
+
+ FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
+ Constant *StatInit = M->getOrInsertFunction(
+ "__sanitizer_stat_init", StatInitTy);
+
+ B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
+ B.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
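A minimal usage sketch for SanitizerStatReport, not part of the patch: a pass records one statistic per instrumented site with create() and calls finish() once per module. SK is assumed to be one of the SanitizerStatKind enumerators declared in llvm/Transforms/Utils/SanitizerStats.h.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/SanitizerStats.h"
using namespace llvm;

// Record one statistic right before InsertPt, then emit the module ctor that
// registers the stats array with the runtime.
static void emitOneStat(Module &M, Instruction *InsertPt, SanitizerStatKind SK) {
  SanitizerStatReport Stats(&M);
  IRBuilder<> B(InsertPt);
  Stats.create(B, SK); // emits a call to __sanitizer_stat_report
  Stats.finish();      // emits the ctor that calls __sanitizer_stat_init
}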
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index e484b690597e..0504646c304e 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
@@ -45,6 +44,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <map>
@@ -58,17 +58,18 @@ using namespace PatternMatch;
// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
// To catch this, we need to fold a compare and a select, hence '2' being the
// minimum reasonable default.
-static cl::opt<unsigned>
-PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(2),
- cl::desc("Control the amount of phi node folding to perform (default = 2)"));
+static cl::opt<unsigned> PHINodeFoldingThreshold(
+ "phi-node-folding-threshold", cl::Hidden, cl::init(2),
+ cl::desc(
+ "Control the amount of phi node folding to perform (default = 2)"));
-static cl::opt<bool>
-DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
- cl::desc("Duplicate return instructions into unconditional branches"));
+static cl::opt<bool> DupRet(
+ "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
+ cl::desc("Duplicate return instructions into unconditional branches"));
static cl::opt<bool>
-SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
- cl::desc("Sink common instructions down to the end block"));
+ SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
static cl::opt<bool> HoistCondStores(
"simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
@@ -96,48 +97,54 @@ static cl::opt<unsigned> MaxSpeculationDepth(
"speculatively executed instructions"));
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
-STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
-STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
-STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)");
+STATISTIC(NumLinearMaps,
+ "Number of switch instructions turned into linear mapping");
+STATISTIC(NumLookupTables,
+ "Number of switch instructions turned into lookup tables");
+STATISTIC(
+ NumLookupTablesHoles,
+ "Number of switch instructions turned into lookup tables (holes checked)");
STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
-STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
+STATISTIC(NumSinkCommons,
+ "Number of common instructions sunk down to the end block");
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
namespace {
- // The first field contains the value that the switch produces when a certain
- // case group is selected, and the second field is a vector containing the
- // cases composing the case group.
- typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>
+// The first field contains the value that the switch produces when a certain
+// case group is selected, and the second field is a vector containing the
+// cases composing the case group.
+typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>
SwitchCaseResultVectorTy;
- // The first field contains the phi node that generates a result of the switch
- // and the second field contains the value generated for a certain case in the
- // switch for that PHI.
- typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy;
+// The first field contains the phi node that generates a result of the switch
+// and the second field contains the value generated for a certain case in the
+// switch for that PHI.
+typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy;
- /// ValueEqualityComparisonCase - Represents a case of a switch.
- struct ValueEqualityComparisonCase {
- ConstantInt *Value;
- BasicBlock *Dest;
+/// ValueEqualityComparisonCase - Represents a case of a switch.
+struct ValueEqualityComparisonCase {
+ ConstantInt *Value;
+ BasicBlock *Dest;
- ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+ ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
: Value(Value), Dest(Dest) {}
- bool operator<(ValueEqualityComparisonCase RHS) const {
- // Comparing pointers is ok as we only rely on the order for uniquing.
- return Value < RHS.Value;
- }
+ bool operator<(ValueEqualityComparisonCase RHS) const {
+ // Comparing pointers is ok as we only rely on the order for uniquing.
+ return Value < RHS.Value;
+ }
- bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
- };
+ bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
+};
class SimplifyCFGOpt {
const TargetTransformInfo &TTI;
const DataLayout &DL;
unsigned BonusInstThreshold;
AssumptionCache *AC;
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders;
Value *isValueEqualityComparison(TerminatorInst *TI);
- BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
- std::vector<ValueEqualityComparisonCase> &Cases);
+ BasicBlock *GetValueEqualityComparisonCases(
+ TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
BasicBlock *Pred,
IRBuilder<> &Builder);
@@ -152,13 +159,15 @@ class SimplifyCFGOpt {
bool SimplifyUnreachable(UnreachableInst *UI);
bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
bool SimplifyIndirectBr(IndirectBrInst *IBI);
- bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder);
- bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
+ bool SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
- unsigned BonusInstThreshold, AssumptionCache *AC)
- : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC) {}
+ unsigned BonusInstThreshold, AssumptionCache *AC,
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders)
+ : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
+ LoopHeaders(LoopHeaders) {}
bool run(BasicBlock *BB);
};
}
@@ -166,19 +175,19 @@ public:
/// Return true if it is safe to merge these two
/// terminator instructions together.
static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
- if (SI1 == SI2) return false; // Can't merge with self!
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
// It is not safe to merge these two switch instructions if they have a common
// successor, and if that successor has a PHI node, and if *that* PHI node has
// conflicting incoming values from the two switch blocks.
BasicBlock *SI1BB = SI1->getParent();
BasicBlock *SI2BB = SI2->getParent();
- SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
- for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
- if (SI1Succs.count(*I))
- for (BasicBlock::iterator BBI = (*I)->begin();
- isa<PHINode>(BBI); ++BBI) {
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
PHINode *PN = cast<PHINode>(BBI);
if (PN->getIncomingValueForBlock(SI1BB) !=
PN->getIncomingValueForBlock(SI2BB))
@@ -191,11 +200,12 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
/// Return true if it is safe and profitable to merge these two terminator
/// instructions together, where SI1 is an unconditional branch. PhiNodes will
/// store all PHI nodes in common successors.
-static bool isProfitableToFoldUnconditional(BranchInst *SI1,
- BranchInst *SI2,
- Instruction *Cond,
- SmallVectorImpl<PHINode*> &PhiNodes) {
- if (SI1 == SI2) return false; // Can't merge with self!
+static bool
+isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2,
+ Instruction *Cond,
+ SmallVectorImpl<PHINode *> &PhiNodes) {
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
assert(SI1->isUnconditional() && SI2->isConditional());
// We fold the unconditional branch if we can easily update all PHI nodes in
@@ -204,7 +214,8 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1,
// 2> We have "Cond" as the incoming value for the unconditional branch;
// 3> SI2->getCondition() and Cond have same operands.
CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition());
- if (!Ci2) return false;
+ if (!Ci2)
+ return false;
if (!(Cond->getOperand(0) == Ci2->getOperand(0) &&
Cond->getOperand(1) == Ci2->getOperand(1)) &&
!(Cond->getOperand(0) == Ci2->getOperand(1) &&
@@ -213,11 +224,10 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1,
BasicBlock *SI1BB = SI1->getParent();
BasicBlock *SI2BB = SI2->getParent();
- SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
- for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
- if (SI1Succs.count(*I))
- for (BasicBlock::iterator BBI = (*I)->begin();
- isa<PHINode>(BBI); ++BBI) {
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
PHINode *PN = cast<PHINode>(BBI);
if (PN->getIncomingValueForBlock(SI1BB) != Cond ||
!isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB)))
@@ -233,11 +243,11 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1,
/// of Succ.
static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
BasicBlock *ExistPred) {
- if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
+ if (!isa<PHINode>(Succ->begin()))
+ return; // Quick exit if nothing to do
PHINode *PN;
- for (BasicBlock::iterator I = Succ->begin();
- (PN = dyn_cast<PHINode>(I)); ++I)
+ for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I)
PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
}
@@ -270,7 +280,7 @@ static unsigned ComputeSpeculationCost(const User *I,
/// V plus its non-dominating operands. If that cost is greater than
/// CostRemaining, false is returned and CostRemaining is undefined.
static bool DominatesMergePoint(Value *V, BasicBlock *BB,
- SmallPtrSetImpl<Instruction*> *AggressiveInsts,
+ SmallPtrSetImpl<Instruction *> *AggressiveInsts,
unsigned &CostRemaining,
const TargetTransformInfo &TTI,
unsigned Depth = 0) {
@@ -294,7 +304,8 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// We don't want to allow weird loops that might have the "if condition" in
// the bottom of this block.
- if (PBB == BB) return false;
+ if (PBB == BB)
+ return false;
// If this instruction is defined in a block that contains an unconditional
// branch to BB, then it must be in the 'conditional' part of the "if
@@ -305,10 +316,12 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// If we aren't allowing aggressive promotion anymore, then don't consider
// instructions in the 'if region'.
- if (!AggressiveInsts) return false;
+ if (!AggressiveInsts)
+ return false;
// If we have seen this instruction before, don't count it again.
- if (AggressiveInsts->count(I)) return true;
+ if (AggressiveInsts->count(I))
+ return true;
// Okay, it looks like the instruction IS in the "condition". Check to
// see if it's a cheap instruction to unconditionally compute, and if it
@@ -366,8 +379,8 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
if (CI->getType() == PtrTy)
return CI;
else
- return cast<ConstantInt>
- (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+ return cast<ConstantInt>(
+ ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
}
return nullptr;
}
@@ -403,11 +416,11 @@ struct ConstantComparesGatherer {
operator=(const ConstantComparesGatherer &) = delete;
private:
-
/// Try to set the current value used for the comparison; it succeeds only if
/// it wasn't set before or if the new value is the same as the old one.
bool setValueOnce(Value *NewVal) {
- if(CompValue && CompValue != NewVal) return false;
+ if (CompValue && CompValue != NewVal)
+ return false;
CompValue = NewVal;
return (CompValue != nullptr);
}
@@ -424,35 +437,99 @@ private:
ICmpInst *ICI;
ConstantInt *C;
if (!((ICI = dyn_cast<ICmpInst>(I)) &&
- (C = GetConstantInt(I->getOperand(1), DL)))) {
+ (C = GetConstantInt(I->getOperand(1), DL)))) {
return false;
}
Value *RHSVal;
- ConstantInt *RHSC;
+ const APInt *RHSC;
// Pattern match a special case
- // (x & ~2^x) == y --> x == y || x == y|2^x
+ // (x & ~2^z) == y --> x == y || x == y|2^z
// This undoes a transformation done by instcombine to fuse 2 compares.
- if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
+ if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+
+ // It's a little bit hard to see why the following transformations are
+ // correct. Here is a CVC3 program to verify them for 64-bit values:
+
+ /*
+ ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
+ x : BITVECTOR(64);
+ y : BITVECTOR(64);
+ z : BITVECTOR(64);
+ mask : BITVECTOR(64) = BVSHL(ONE, z);
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+
+ // Please note that each pattern must be a dual implication (<--> or
+ // iff). A one-directional implication can create spurious matches. If the
+ // implication is only one-way, an unsatisfiable condition on the left
+ // side can imply a satisfiable condition on the right side. Dual
+ // implication ensures that satisfiable conditions are transformed to
+ // other satisfiable conditions and unsatisfiable conditions are
+ // transformed to other unsatisfiable conditions.
+
+ // Here is a concrete example of an unsatisfiable condition on the left
+ // implying a satisfiable condition on the right:
+ //
+ // mask = (1 << z)
+ // (x & ~mask) == y --> (x == y || x == (y | mask))
+ //
+ // Substituting y = 3, z = 0 yields:
+ // (x & -2) == 3 --> (x == 3 || x == 2)
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ */
if (match(ICI->getOperand(0),
- m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
- APInt Not = ~RHSC->getValue();
- if (Not.isPowerOf2()) {
+ m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = ~*RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
// If we already have a value for the switch, it has to match!
- if(!setValueOnce(RHSVal))
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(
+ ConstantInt::get(C->getContext(),
+ C->getValue() | Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
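+ // For example, with mask = 1 and y = 3 (so y | mask == y), the pattern
+ // (x | 1) == 3 expands to (x == 3 || x == 2).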
+ if (match(ICI->getOperand(0),
+ m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = *RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
return false;
Vals.push_back(C);
Vals.push_back(ConstantInt::get(C->getContext(),
- C->getValue() | Not));
+ C->getValue() & ~Mask));
UsedICmps++;
return true;
}
}
// If we already have a value for the switch, it has to match!
- if(!setValueOnce(ICI->getOperand(0)))
+ if (!setValueOnce(ICI->getOperand(0)))
return false;
UsedICmps++;
@@ -467,8 +544,8 @@ private:
// Shift the range if the compare is fed by an add. This is the range
// compare idiom as emitted by instcombine.
Value *CandidateVal = I->getOperand(0);
- if(match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
- Span = Span.subtract(RHSC->getValue());
+ if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
+ Span = Span.subtract(*RHSC);
CandidateVal = RHSVal;
}
@@ -484,7 +561,7 @@ private:
}
// If we already have a value for the switch, it has to match!
- if(!setValueOnce(CandidateVal))
+ if (!setValueOnce(CandidateVal))
return false;
// Add all values from the range to the set
@@ -493,7 +570,6 @@ private:
UsedICmps++;
return true;
-
}
/// Given a potentially 'or'd or 'and'd together collection of icmp
@@ -507,18 +583,22 @@ private:
// Keep a stack (SmallVector for efficiency) for depth-first traversal
SmallVector<Value *, 8> DFT;
+ SmallPtrSet<Value *, 8> Visited;
// Initialize
+ Visited.insert(V);
DFT.push_back(V);
- while(!DFT.empty()) {
+ while (!DFT.empty()) {
V = DFT.pop_back_val();
if (Instruction *I = dyn_cast<Instruction>(V)) {
// If it is a || (or && depending on isEQ), process the operands.
if (I->getOpcode() == (isEQ ? Instruction::Or : Instruction::And)) {
- DFT.push_back(I->getOperand(1));
- DFT.push_back(I->getOperand(0));
+ if (Visited.insert(I->getOperand(1)).second)
+ DFT.push_back(I->getOperand(1));
+ if (Visited.insert(I->getOperand(0)).second)
+ DFT.push_back(I->getOperand(0));
continue;
}
@@ -541,7 +621,6 @@ private:
}
}
};
-
}
static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
@@ -556,7 +635,8 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
}
TI->eraseFromParent();
- if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond);
+ if (Cond)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond);
}
/// Return true if the specified terminator checks
@@ -566,8 +646,9 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
// Do not permit merging of large switch instructions into their
// predecessors unless there is only one predecessor.
- if (SI->getNumSuccessors()*std::distance(pred_begin(SI->getParent()),
- pred_end(SI->getParent())) <= 128)
+ if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
+ pred_end(SI->getParent())) <=
+ 128)
CV = SI->getCondition();
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
if (BI->isConditional() && BI->getCondition()->hasOneUse())
@@ -589,46 +670,44 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
/// Given a value comparison instruction,
/// decode all of the 'cases' that it represents and return the 'default' block.
-BasicBlock *SimplifyCFGOpt::
-GetValueEqualityComparisonCases(TerminatorInst *TI,
- std::vector<ValueEqualityComparisonCase>
- &Cases) {
+BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
+ TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Cases.reserve(SI->getNumCases());
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
- Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(),
- i.getCaseSuccessor()));
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
+ ++i)
+ Cases.push_back(
+ ValueEqualityComparisonCase(i.getCaseValue(), i.getCaseSuccessor()));
return SI->getDefaultDest();
}
BranchInst *BI = cast<BranchInst>(TI);
ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
- Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1),
- DL),
- Succ));
+ Cases.push_back(ValueEqualityComparisonCase(
+ GetConstantInt(ICI->getOperand(1), DL), Succ));
return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
}
-
/// Given a vector of bb/value pairs, remove any entries
/// in the list that match the specified block.
-static void EliminateBlockCases(BasicBlock *BB,
- std::vector<ValueEqualityComparisonCase> &Cases) {
+static void
+EliminateBlockCases(BasicBlock *BB,
+ std::vector<ValueEqualityComparisonCase> &Cases) {
Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end());
}
/// Return true if there are any keys in C1 that exist in C2 as well.
-static bool
-ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
- std::vector<ValueEqualityComparisonCase > &C2) {
+static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+ std::vector<ValueEqualityComparisonCase> &C2) {
std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
// Make V1 be smaller than V2.
if (V1->size() > V2->size())
std::swap(V1, V2);
- if (V1->size() == 0) return false;
+ if (V1->size() == 0)
+ return false;
if (V1->size() == 1) {
// Just scan V2.
ConstantInt *TheVal = (*V1)[0].Value;
@@ -657,30 +736,30 @@ ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
/// also a value comparison with the same value, and if that comparison
/// determines the outcome of this comparison. If so, simplify TI. This does a
/// very limited form of jump threading.
-bool SimplifyCFGOpt::
-SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
- BasicBlock *Pred,
- IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
+ TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
- if (!PredVal) return false; // Not a value comparison in predecessor.
+ if (!PredVal)
+ return false; // Not a value comparison in predecessor.
Value *ThisVal = isValueEqualityComparison(TI);
assert(ThisVal && "This isn't a value comparison!!");
- if (ThisVal != PredVal) return false; // Different predicates.
+ if (ThisVal != PredVal)
+ return false; // Different predicates.
// TODO: Preserve branch weight metadata, similarly to how
// FoldValueComparisonIntoPredecessors preserves it.
// Find out information about when control will move from Pred to TI's block.
std::vector<ValueEqualityComparisonCase> PredCases;
- BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
- PredCases);
- EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
+ BasicBlock *PredDef =
+ GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
+ EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
// Find information about how control leaves this block.
std::vector<ValueEqualityComparisonCase> ThisCases;
BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
- EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
+ EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
// If TI's block is the default block from Pred's comparison, potentially
// simplify TI based on this knowledge.
@@ -697,13 +776,14 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
assert(ThisCases.size() == 1 && "Branch can only have one case!");
// Insert the new branch.
Instruction *NI = Builder.CreateBr(ThisDef);
- (void) NI;
+ (void)NI;
// Remove PHI node entries for the dead edge.
ThisCases[0].Dest->removePredecessor(TI->getParent());
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
@@ -711,7 +791,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
SwitchInst *SI = cast<SwitchInst>(TI);
// Okay, TI has cases that are statically dead, prune them away.
- SmallPtrSet<Constant*, 16> DeadCases;
+ SmallPtrSet<Constant *, 16> DeadCases;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
DeadCases.insert(PredCases[i].Value);
@@ -732,7 +812,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
--i;
if (DeadCases.count(i.getCaseValue())) {
if (HasWeight) {
- std::swap(Weights[i.getCaseIndex()+1], Weights.back());
+ std::swap(Weights[i.getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
i.getCaseSuccessor()->removePredecessor(TI->getParent());
@@ -741,8 +821,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
}
if (HasWeight && Weights.size() >= 2)
SI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(SI->getParent()->getContext()).
- createBranchWeights(Weights));
+ MDBuilder(SI->getParent()->getContext())
+ .createBranchWeights(Weights));
DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
@@ -755,7 +835,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == TIBB) {
if (TIV)
- return false; // Cannot handle multiple values coming to this block.
+ return false; // Cannot handle multiple values coming to this block.
TIV = PredCases[i].Value;
}
assert(TIV && "No edge from pred to succ?");
@@ -770,53 +850,53 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
}
// If not handled by any explicit cases, it is handled by the default case.
- if (!TheRealDest) TheRealDest = ThisDef;
+ if (!TheRealDest)
+ TheRealDest = ThisDef;
// Remove PHI node entries for dead edges.
BasicBlock *CheckEdge = TheRealDest;
- for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI)
- if (*SI != CheckEdge)
- (*SI)->removePredecessor(TIBB);
+ for (BasicBlock *Succ : successors(TIBB))
+ if (Succ != CheckEdge)
+ Succ->removePredecessor(TIBB);
else
CheckEdge = nullptr;
// Insert the new branch.
Instruction *NI = Builder.CreateBr(TheRealDest);
- (void) NI;
+ (void)NI;
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
EraseTerminatorInstAndDCECond(TI);
return true;
}
namespace {
- /// This class implements a stable ordering of constant
- /// integers that does not depend on their address. This is important for
- /// applications that sort ConstantInt's to ensure uniqueness.
- struct ConstantIntOrdering {
- bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
- return LHS->getValue().ult(RHS->getValue());
- }
- };
+/// This class implements a stable ordering of constant
+/// integers that does not depend on their address. This is important for
+/// applications that sort ConstantInt's to ensure uniqueness.
+struct ConstantIntOrdering {
+ bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+ return LHS->getValue().ult(RHS->getValue());
+ }
+};
}
static int ConstantIntSortPredicate(ConstantInt *const *P1,
ConstantInt *const *P2) {
const ConstantInt *LHS = *P1;
const ConstantInt *RHS = *P2;
- if (LHS->getValue().ult(RHS->getValue()))
- return 1;
- if (LHS->getValue() == RHS->getValue())
+ if (LHS == RHS)
return 0;
- return -1;
+ return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
}
-static inline bool HasBranchWeights(const Instruction* I) {
+static inline bool HasBranchWeights(const Instruction *I) {
MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
if (ProfMD && ProfMD->getOperand(0))
- if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
return MDS->getString().equals("branch_weights");
return false;
@@ -837,7 +917,7 @@ static void GetBranchWeights(TerminatorInst *TI,
// If TI is a conditional eq, the default case is the false case,
// and the corresponding branch-weight data is at index 2. We swap the
// default weight to be the first entry.
- if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
assert(Weights.size() == 2);
ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
@@ -862,17 +942,17 @@ static void FitWeights(MutableArrayRef<uint64_t> Weights) {
bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
IRBuilder<> &Builder) {
BasicBlock *BB = TI->getParent();
- Value *CV = isValueEqualityComparison(TI); // CondVal
+ Value *CV = isValueEqualityComparison(TI); // CondVal
assert(CV && "Not a comparison?");
bool Changed = false;
- SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+ SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
while (!Preds.empty()) {
BasicBlock *Pred = Preds.pop_back_val();
// See if the predecessor is a comparison with the same value.
TerminatorInst *PTI = Pred->getTerminator();
- Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
+ Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
// Figure out which 'cases' to copy from SI to PSI.
@@ -885,7 +965,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// Based on whether the default edge from PTI goes to BB or not, fill in
// PredCases and PredDefault with the new switch cases we would like to
// build.
- SmallVector<BasicBlock*, 8> NewSuccessors;
+ SmallVector<BasicBlock *, 8> NewSuccessors;
// Update the branch weight metadata along the way
SmallVector<uint64_t, 8> Weights;
@@ -915,7 +995,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
if (PredDefault == BB) {
// If this is the default destination from PTI, only the edges in TI
// that don't occur in PTI, or that branch to BB will be activated.
- std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+ std::set<ConstantInt *, ConstantIntOrdering> PTIHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest != BB)
PTIHandled.insert(PredCases[i].Value);
@@ -925,13 +1005,14 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
if (PredHasWeights || SuccHasWeights) {
// Increase weight for the default case.
- Weights[0] += Weights[i+1];
- std::swap(Weights[i+1], Weights.back());
+ Weights[0] += Weights[i + 1];
+ std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
}
PredCases.pop_back();
- --i; --e;
+ --i;
+ --e;
}
// Reconstruct the new switch statement we will be building.
@@ -952,8 +1033,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// The default weight is at index 0, so weight for the ith case
// should be at index i+1. Scale the cases from successor by
// PredDefaultWeight (Weights[0]).
- Weights.push_back(Weights[0] * SuccWeights[i+1]);
- ValidTotalSuccWeight += SuccWeights[i+1];
+ Weights.push_back(Weights[0] * SuccWeights[i + 1]);
+ ValidTotalSuccWeight += SuccWeights[i + 1];
}
}
@@ -969,21 +1050,22 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// If this is not the default destination from PSI, only the edges
// in SI that occur in PSI with a destination of BB will be
// activated.
- std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
- std::map<ConstantInt*, uint64_t> WeightsForHandled;
+ std::set<ConstantInt *, ConstantIntOrdering> PTIHandled;
+ std::map<ConstantInt *, uint64_t> WeightsForHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == BB) {
PTIHandled.insert(PredCases[i].Value);
if (PredHasWeights || SuccHasWeights) {
- WeightsForHandled[PredCases[i].Value] = Weights[i+1];
- std::swap(Weights[i+1], Weights.back());
+ WeightsForHandled[PredCases[i].Value] = Weights[i + 1];
+ std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
}
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
- --i; --e;
+ --i;
+ --e;
}
// Okay, now we know which constants were sent to BB from the
@@ -995,17 +1077,16 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
Weights.push_back(WeightsForHandled[BBCases[i].Value]);
PredCases.push_back(BBCases[i]);
NewSuccessors.push_back(BBCases[i].Dest);
- PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
+ PTIHandled.erase(
+ BBCases[i].Value); // This constant is taken care of
}
// If there are any constants vectored to BB that TI doesn't handle,
// they must go to the default destination of TI.
- for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
- PTIHandled.begin(),
- E = PTIHandled.end(); I != E; ++I) {
+ for (ConstantInt *I : PTIHandled) {
if (PredHasWeights || SuccHasWeights)
- Weights.push_back(WeightsForHandled[*I]);
- PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
+ Weights.push_back(WeightsForHandled[I]);
+ PredCases.push_back(ValueEqualityComparisonCase(I, BBDefault));
NewSuccessors.push_back(BBDefault);
}
}
@@ -1024,8 +1105,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
}
// Now that the successors are updated, create the new Switch instruction.
- SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault,
- PredCases.size());
+ SwitchInst *NewSI =
+ Builder.CreateSwitch(CV, PredDefault, PredCases.size());
NewSI->setDebugLoc(PTI->getDebugLoc());
for (ValueEqualityComparisonCase &V : PredCases)
NewSI->addCase(V.Value, V.Dest);
@@ -1036,9 +1117,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
- NewSI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(MDWeights));
+ NewSI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).createBranchWeights(MDWeights));
}
EraseTerminatorInstAndDCECond(PTI);
@@ -1052,8 +1133,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
if (!InfLoopBlock) {
// Insert it at the end of the function, because it's either code,
// or it won't matter if it's hot. :)
- InfLoopBlock = BasicBlock::Create(BB->getContext(),
- "infloop", BB->getParent());
+ InfLoopBlock = BasicBlock::Create(BB->getContext(), "infloop",
+ BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
}
NewSI->setSuccessor(i, InfLoopBlock);
@@ -1070,13 +1151,13 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// can't hoist the invoke, as there is nowhere to put the select in this case.
static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
Instruction *I1, Instruction *I2) {
- for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+ for (BasicBlock *Succ : successors(BB1)) {
PHINode *PN;
- for (BasicBlock::iterator BBI = SI->begin();
+ for (BasicBlock::iterator BBI = Succ->begin();
(PN = dyn_cast<PHINode>(BBI)); ++BBI) {
Value *BB1V = PN->getIncomingValueForBlock(BB1);
Value *BB2V = PN->getIncomingValueForBlock(BB2);
- if (BB1V != BB2V && (BB1V==I1 || BB2V==I2)) {
+ if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
return false;
}
}
@@ -1096,8 +1177,8 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
// O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
// such, we currently just scan for obviously identical instructions in an
// identical order.
- BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
- BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
+ BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
+ BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
BasicBlock::iterator BB1_Itr = BB1->begin();
BasicBlock::iterator BB2_Itr = BB2->begin();
@@ -1135,12 +1216,16 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (!I2->use_empty())
I2->replaceAllUsesWith(I1);
I1->intersectOptionalDataWith(I2);
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group,
- LLVMContext::MD_align, LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null};
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access};
combineMetadata(I1, I2, KnownIDs);
I2->eraseFromParent();
Changed = true;
@@ -1165,9 +1250,9 @@ HoistTerminator:
if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
return Changed;
- for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+ for (BasicBlock *Succ : successors(BB1)) {
PHINode *PN;
- for (BasicBlock::iterator BBI = SI->begin();
+ for (BasicBlock::iterator BBI = Succ->begin();
(PN = dyn_cast<PHINode>(BBI)); ++BBI) {
Value *BB1V = PN->getIncomingValueForBlock(BB1);
Value *BB2V = PN->getIncomingValueForBlock(BB2);
@@ -1178,7 +1263,7 @@ HoistTerminator:
// eliminate undefined control flow then converting it to a select.
if (passingValueIsAlwaysUndefined(BB1V, PN) ||
passingValueIsAlwaysUndefined(BB2V, PN))
- return Changed;
+ return Changed;
if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
return Changed;
@@ -1196,27 +1281,28 @@ HoistTerminator:
NT->takeName(I1);
}
- IRBuilder<true, NoFolder> Builder(NT);
+ IRBuilder<NoFolder> Builder(NT);
// Hoisting one of the terminators from our successor is a great thing.
// Unfortunately, the successors of the if/else blocks may have PHI nodes in
// them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
// nodes, so we insert select instruction to compute the final result.
- std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects;
- for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+ std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
+ for (BasicBlock *Succ : successors(BB1)) {
PHINode *PN;
- for (BasicBlock::iterator BBI = SI->begin();
+ for (BasicBlock::iterator BBI = Succ->begin();
(PN = dyn_cast<PHINode>(BBI)); ++BBI) {
Value *BB1V = PN->getIncomingValueForBlock(BB1);
Value *BB2V = PN->getIncomingValueForBlock(BB2);
- if (BB1V == BB2V) continue;
+ if (BB1V == BB2V)
+ continue;
// These values do not agree. Insert a select instruction before NT
// that determines the right value.
SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
if (!SI)
- SI = cast<SelectInst>
- (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
- BB1V->getName()+"."+BB2V->getName()));
+ SI = cast<SelectInst>(
+ Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+ BB1V->getName() + "." + BB2V->getName(), BI));
// Make the PHI node use the select for all incoming values for BB1/BB2
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
@@ -1226,8 +1312,8 @@ HoistTerminator:
}
// Update any PHI nodes in our new successors.
- for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI)
- AddPredecessorToBlock(*SI, BIParent, BB1);
+ for (BasicBlock *Succ : successors(BB1))
+ AddPredecessorToBlock(Succ, BIParent, BB1);
EraseTerminatorInstAndDCECond(BI);
return true;
@@ -1280,10 +1366,12 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
RI2 = BB2->getInstList().rbegin(),
RE2 = BB2->getInstList().rend();
// Skip debug info.
- while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1))
+ ++RI1;
if (RI1 == RE1)
return false;
- while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2))
+ ++RI2;
if (RI2 == RE2)
return false;
// Skip the unconditional branches.
@@ -1293,10 +1381,12 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
bool Changed = false;
while (RI1 != RE1 && RI2 != RE2) {
// Skip debug info.
- while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1))
+ ++RI1;
if (RI1 == RE1)
return Changed;
- while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2))
+ ++RI2;
if (RI2 == RE2)
return Changed;
@@ -1305,22 +1395,19 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
// I1 and I2 should have a single use in the same PHI node, and they
// perform the same operation.
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
- if (isa<PHINode>(I1) || isa<PHINode>(I2) ||
- isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) ||
- I1->isEHPad() || I2->isEHPad() ||
+ if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) ||
+ isa<TerminatorInst>(I2) || I1->isEHPad() || I2->isEHPad() ||
isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
- !I1->hasOneUse() || !I2->hasOneUse() ||
- !JointValueMap.count(InstPair))
+ !I1->hasOneUse() || !I2->hasOneUse() || !JointValueMap.count(InstPair))
return Changed;
// Check whether we should swap the operands of ICmpInst.
// TODO: Add support for commutativity.
ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
bool SwapOpnds = false;
- if (ICmp1 && ICmp2 &&
- ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
+ if (ICmp1 && ICmp2 && ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
(ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
@@ -1343,8 +1430,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
continue;
// Early exit if we have more-than one pair of different operands or if
// we need a PHI node to replace a constant.
- if (Op1Idx != ~0U ||
- isa<Constant>(I1->getOperand(I)) ||
+ if (Op1Idx != ~0U || isa<Constant>(I1->getOperand(I)) ||
isa<Constant>(I2->getOperand(I))) {
// If we can't sink the instructions, undo the swapping.
if (SwapOpnds)
@@ -1379,7 +1465,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
// We need to update RE1 and RE2 if we are going to sink the first
// instruction in the basic block down.
- bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
+ bool UpdateRE1 = (I1 == &BB1->front()), UpdateRE2 = (I2 == &BB2->front());
// Sink the instruction.
BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(),
BB1->getInstList(), I1);
@@ -1444,22 +1530,26 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
Value *StorePtr = StoreToHoist->getPointerOperand();
// Look for a store to the same pointer in BrBB.
- unsigned MaxNumInstToLookAt = 10;
- for (BasicBlock::reverse_iterator RI = BrBB->rbegin(),
- RE = BrBB->rend(); RI != RE && (--MaxNumInstToLookAt); ++RI) {
- Instruction *CurI = &*RI;
+ unsigned MaxNumInstToLookAt = 9;
+ for (Instruction &CurI : reverse(*BrBB)) {
+ if (!MaxNumInstToLookAt)
+ break;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(CurI))
+ continue;
+ --MaxNumInstToLookAt;
// Could be calling an instruction that affects memory, like free().
- if (CurI->mayHaveSideEffects() && !isa<StoreInst>(CurI))
+ if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
return nullptr;
- StoreInst *SI = dyn_cast<StoreInst>(CurI);
- // Found the previous store make sure it stores to the same location.
- if (SI && SI->getPointerOperand() == StorePtr)
- // Found the previous store, return its value operand.
- return SI->getValueOperand();
- else if (SI)
+ if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
+ // Found the previous store; make sure it stores to the same location.
+ if (SI->getPointerOperand() == StorePtr)
+ // Found the previous store, return its value operand.
+ return SI->getValueOperand();
return nullptr; // Unknown store.
+ }
}
return nullptr;
@@ -1562,11 +1652,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
// Do not hoist the instruction if any of its operands are defined but not
// used in BB. The transformation will prevent the operand from
// being sunk into the use block.
- for (User::op_iterator i = I->op_begin(), e = I->op_end();
- i != e; ++i) {
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
Instruction *OpI = dyn_cast<Instruction>(*i);
- if (!OpI || OpI->getParent() != BB ||
- OpI->mayHaveSideEffects())
+ if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
continue; // Not a candidate for sinking.
++SinkCandidateUseCounts[OpI];
@@ -1576,8 +1664,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
// Consider any sink candidates which are only used in CondBB as costs for
// speculation. Note, while we iterate over a DenseMap here, we are summing
// and so iteration order isn't significant.
- for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I =
- SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end();
+ for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
+ I = SinkCandidateUseCounts.begin(),
+ E = SinkCandidateUseCounts.end();
I != E; ++I)
if (I->first->getNumUses() == I->second) {
++SpeculationCost;
@@ -1613,8 +1702,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
return false;
unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, TTI) : 0;
unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, TTI) : 0;
- unsigned MaxCost = 2 * PHINodeFoldingThreshold *
- TargetTransformInfo::TCC_Basic;
+ unsigned MaxCost =
+ 2 * PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
if (OrigCost + ThenCost > MaxCost)
return false;
@@ -1637,19 +1726,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
// Insert a select of the value of the speculated store.
if (SpeculatedStoreValue) {
- IRBuilder<true, NoFolder> Builder(BI);
+ IRBuilder<NoFolder> Builder(BI);
Value *TrueV = SpeculatedStore->getValueOperand();
Value *FalseV = SpeculatedStoreValue;
if (Invert)
std::swap(TrueV, FalseV);
- Value *S = Builder.CreateSelect(BrCond, TrueV, FalseV, TrueV->getName() +
- "." + FalseV->getName());
+ Value *S = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
SpeculatedStore->setOperand(0, S);
}
// Metadata can be dependent on the condition we are hoisting above.
// Conservatively strip all metadata on the instruction.
- for (auto &I: *ThenBB)
+ for (auto &I : *ThenBB)
I.dropUnknownNonDebugMetadata();
// Hoist the instructions.
@@ -1657,7 +1746,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
ThenBB->begin(), std::prev(ThenBB->end()));
// Insert selects and rewrite the PHI operands.
- IRBuilder<true, NoFolder> Builder(BI);
+ IRBuilder<NoFolder> Builder(BI);
for (BasicBlock::iterator I = EndBB->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I) {
unsigned OrigI = PN->getBasicBlockIndex(BB);
@@ -1675,8 +1764,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
Value *TrueV = ThenV, *FalseV = OrigV;
if (Invert)
std::swap(TrueV, FalseV);
- Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV,
- TrueV->getName() + "." + FalseV->getName());
+ Value *V = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
PN->setIncomingValue(OrigI, V);
PN->setIncomingValue(ThenI, V);
}
@@ -1685,19 +1774,6 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
return true;
}
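At the source level, the store speculation wired up by the selects above has roughly the following effect (a sketch of my own, not code from the patch): the conditional store becomes unconditional, and a select on the branch condition picks between the new value and the value of the dominating store found by isSafeToSpeculateStore.

```cpp
#include <iostream>

int g = 0;

// Before the transform: a store that dominates the branch plus a conditional
// store to the same location in ThenBB (a toy of my own, not the patch's IR).
void beforeTransform(bool c, int x) {
  g = 42; // prior store found by isSafeToSpeculateStore
  if (c)
    g = x; // store we want to speculate
}

// After the transform: the store is unconditional and a select picks its
// value, which is what the CreateSelect call above builds at the IR level.
void afterTransform(bool c, int x) {
  g = 42;
  g = c ? x : 42;
}

int main() {
  beforeTransform(true, 7);
  std::cout << g << " ";
  afterTransform(false, 7);
  std::cout << g << "\n"; // 7 42
  return 0;
}
```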
-/// \returns True if this block contains a CallInst with the NoDuplicate
-/// attribute.
-static bool HasNoDuplicateCall(const BasicBlock *BB) {
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- const CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI)
- continue;
- if (CI->cannotDuplicate())
- return true;
- }
- return false;
-}
-
/// Return true if we can thread a branch across this block.
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
BranchInst *BI = cast<BranchInst>(BB->getTerminator());
@@ -1706,14 +1782,16 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
if (isa<DbgInfoIntrinsic>(BBI))
continue;
- if (Size > 10) return false; // Don't clone large BB's.
+ if (Size > 10)
+ return false; // Don't clone large BB's.
++Size;
// We can only support instructions that do not define values that are
// live outside of the current basic block.
for (User *U : BBI->users()) {
Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI)) return false;
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return false;
}
// Looks ok, continue checking.
@@ -1740,32 +1818,41 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
}
// Now we know that this block has multiple preds and two succs.
- if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+ if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ return false;
- if (HasNoDuplicateCall(BB)) return false;
+ // Can't fold blocks that contain noduplicate or convergent calls.
+ if (llvm::any_of(*BB, [](const Instruction &I) {
+ const CallInst *CI = dyn_cast<CallInst>(&I);
+ return CI && (CI->cannotDuplicate() || CI->isConvergent());
+ }))
+ return false;
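The lambda above replaces the HasNoDuplicateCall helper deleted earlier in this patch and additionally rejects convergent calls. The same shape of check, written against a toy call type with the standard library (an illustration, not the pass's types):

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

// Toy model (not LLVM's CallInst) of the predicate above: the block cannot be
// folded if any call in it is noduplicate or convergent.
struct ToyCall {
  bool CannotDuplicate;
  bool Convergent;
};

static bool blockIsFoldable(const std::vector<ToyCall> &Calls) {
  return !std::any_of(Calls.begin(), Calls.end(), [](const ToyCall &C) {
    return C.CannotDuplicate || C.Convergent;
  });
}

int main() {
  std::cout << blockIsFoldable({{false, false}}) << "\n"; // 1
  std::cout << blockIsFoldable({{false, true}}) << "\n";  // 0
  return 0;
}
```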
// Okay, this is a simple enough basic block. See if any phi values are
// constants.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
- if (!CB || !CB->getType()->isIntegerTy(1)) continue;
+ if (!CB || !CB->getType()->isIntegerTy(1))
+ continue;
// Okay, we now know that all edges from PredBB should be revectored to
// branch to RealDest.
BasicBlock *PredBB = PN->getIncomingBlock(i);
BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
- if (RealDest == BB) continue; // Skip self loops.
+ if (RealDest == BB)
+ continue; // Skip self loops.
// Skip if the predecessor's terminator is an indirect branch.
- if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ continue;
// The dest block might have PHI nodes, other predecessors and other
// difficult cases. Instead of being smart about this, just insert a new
// block that jumps to the destination block, effectively splitting
// the edge we are about to create.
- BasicBlock *EdgeBB = BasicBlock::Create(BB->getContext(),
- RealDest->getName()+".critedge",
- RealDest->getParent(), RealDest);
+ BasicBlock *EdgeBB =
+ BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
+ RealDest->getParent(), RealDest);
BranchInst::Create(RealDest, EdgeBB);
// Update PHI nodes.
@@ -1775,7 +1862,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
// instructions into EdgeBB. We know that there will be no uses of the
// cloned instructions outside of EdgeBB.
BasicBlock::iterator InsertPt = EdgeBB->begin();
- DenseMap<Value*, Value*> TranslateMap; // Track translated values.
+ DenseMap<Value *, Value *> TranslateMap; // Track translated values.
for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
@@ -1783,26 +1870,31 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
}
// Clone the instruction.
Instruction *N = BBI->clone();
- if (BBI->hasName()) N->setName(BBI->getName()+".c");
+ if (BBI->hasName())
+ N->setName(BBI->getName() + ".c");
// Update operands due to translation.
- for (User::op_iterator i = N->op_begin(), e = N->op_end();
- i != e; ++i) {
- DenseMap<Value*, Value*>::iterator PI = TranslateMap.find(*i);
+ for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
+ DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
if (PI != TranslateMap.end())
*i = PI->second;
}
// Check for trivial simplification.
if (Value *V = SimplifyInstruction(N, DL)) {
- TranslateMap[&*BBI] = V;
- delete N; // Instruction folded away, don't need actual inst
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = V;
+ if (!N->mayHaveSideEffects()) {
+ delete N; // Instruction folded away, don't need actual inst
+ N = nullptr;
+ }
} else {
- // Insert the new instruction into its new home.
- EdgeBB->getInstList().insert(InsertPt, N);
if (!BBI->use_empty())
TranslateMap[&*BBI] = N;
}
+ // Insert the new instruction into its new home.
+ if (N)
+ EdgeBB->getInstList().insert(InsertPt, N);
}
// Loop over all of the edges from PredBB to BB, changing them to branch
@@ -1852,7 +1944,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// Loop over the PHI's seeing if we can promote them all to select
// instructions. While we are at it, keep track of the instructions
// that need to be moved to the dominating block.
- SmallPtrSet<Instruction*, 4> AggressiveInsts;
+ SmallPtrSet<Instruction *, 4> AggressiveInsts;
unsigned MaxCostVal0 = PHINodeFoldingThreshold,
MaxCostVal1 = PHINodeFoldingThreshold;
MaxCostVal0 *= TargetTransformInfo::TCC_Basic;
@@ -1876,7 +1968,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// If we folded the first phi, PN dangles at this point. Refresh it. If
// we ran out of PHIs then we simplified them all.
PN = dyn_cast<PHINode>(BB->begin());
- if (!PN) return true;
+ if (!PN)
+ return true;
// Don't fold i1 branches on PHIs which contain binary operators. These can
// often be turned into switches and other things.
@@ -1886,10 +1979,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
isa<BinaryOperator>(IfCond)))
return false;
- // If we all PHI nodes are promotable, check to make sure that all
- // instructions in the predecessor blocks can be promoted as well. If
- // not, we won't be able to get rid of the control flow, so it's not
- // worth promoting to select instructions.
+ // If all PHI nodes are promotable, check to make sure that all instructions
+ // in the predecessor blocks can be promoted as well. If not, we won't be able
+ // to get rid of the control flow, so it's not worth promoting to select
+ // instructions.
BasicBlock *DomBlock = nullptr;
BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
@@ -1897,11 +1990,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
IfBlock1 = nullptr;
} else {
DomBlock = *pred_begin(IfBlock1);
- for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I)
+ for (BasicBlock::iterator I = IfBlock1->begin(); !isa<TerminatorInst>(I);
+ ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {
// This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control
- // flow, so the xform is not worth it.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return false;
}
}
@@ -1910,11 +2004,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
IfBlock2 = nullptr;
} else {
DomBlock = *pred_begin(IfBlock2);
- for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I)
+ for (BasicBlock::iterator I = IfBlock2->begin(); !isa<TerminatorInst>(I);
+ ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {
// This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control
- // flow, so the xform is not worth it.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return false;
}
}
@@ -1925,7 +2020,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.
Instruction *InsertPt = DomBlock->getTerminator();
- IRBuilder<true, NoFolder> Builder(InsertPt);
+ IRBuilder<NoFolder> Builder(InsertPt);
// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.
@@ -1940,13 +2035,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
// Change the PHI node into a select instruction.
- Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
- SelectInst *NV =
- cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
- PN->replaceAllUsesWith(NV);
- NV->takeName(PN);
+ Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
+ PN->replaceAllUsesWith(Sel);
+ Sel->takeName(PN);
PN->eraseFromParent();
}
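Seen at the source level, the PHI-to-select rewrite in this loop flattens a two-armed diamond into straight-line code; the sketch below is my own illustration of the shape, not code from the pass:

```cpp
#include <iostream>

// Toy source-level picture (mine, not the pass's IR) of FoldTwoEntryPHINode:
// the diamond on the left becomes the flat select on the right.
int withBranch(bool c, int a, int b) {
  int x;
  if (c)
    x = a + 1; // value flowing in from the "then" block
  else
    x = b + 2; // value flowing in from the "else" block
  return x;    // the two-entry PHI
}

int withSelect(bool c, int a, int b) {
  // Cheap side computations are hoisted and the PHI becomes a select on the
  // branch condition.
  int t = a + 1, f = b + 2;
  return c ? t : f;
}

int main() {
  std::cout << withBranch(true, 1, 2) << withSelect(true, 1, 2) << "\n"; // 22
  return 0;
}
```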
@@ -2029,51 +2123,32 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
} else if (isa<UndefValue>(TrueValue)) {
TrueValue = FalseValue;
} else {
- TrueValue = Builder.CreateSelect(BrCond, TrueValue,
- FalseValue, "retval");
+ TrueValue =
+ Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
}
}
- Value *RI = !TrueValue ?
- Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+ Value *RI =
+ !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
- (void) RI;
+ (void)RI;
DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
<< "\n " << *BI << "NewRet = " << *RI
- << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc);
+ << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
EraseTerminatorInstAndDCECond(BI);
return true;
}
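The equivalent source-level picture for the branch-to-two-returns case (again an illustration of my own): a conditional branch whose successors only return collapses into a single return fed by the "retval" select created above.

```cpp
#include <iostream>

// Source-level sketch (mine) of SimplifyCondBranchToTwoReturns: two returning
// successors collapse into one return fed by the "retval" select built above.
int twoReturns(bool c, int a, int b) {
  if (c)
    return a;
  return b;
}

int selectReturn(bool c, int a, int b) {
  int retval = c ? a : b;
  return retval;
}

int main() {
  std::cout << twoReturns(false, 1, 2) << selectReturn(false, 1, 2) << "\n"; // 22
  return 0;
}
```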
-/// Given a conditional BranchInstruction, retrieve the probabilities of the
-/// branch taking each edge. Fills in the two APInt parameters and returns true,
-/// or returns false if no or invalid metadata was found.
-static bool ExtractBranchMetadata(BranchInst *BI,
- uint64_t &ProbTrue, uint64_t &ProbFalse) {
- assert(BI->isConditional() &&
- "Looking for probabilities on unconditional branch?");
- MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
- if (!ProfileData || ProfileData->getNumOperands() != 3) return false;
- ConstantInt *CITrue =
- mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
- ConstantInt *CIFalse =
- mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
- if (!CITrue || !CIFalse) return false;
- ProbTrue = CITrue->getValue().getZExtValue();
- ProbFalse = CIFalse->getValue().getZExtValue();
- return true;
-}
-
/// Return true if the given instruction is available
/// in its predecessor block. If yes, the instruction will be removed.
static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
return false;
- for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) {
- Instruction *PBI = &*I;
+ for (Instruction &I : *PB) {
+ Instruction *PBI = &I;
// Check whether Inst and PBI generate the same value.
if (Inst->isIdenticalTo(PBI)) {
Inst->replaceAllUsesWith(PBI);
@@ -2084,6 +2159,29 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
return false;
}
+/// Return true if either PBI or BI has branch weights available, and store
+/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
+/// not have branch weights, use 1:1 as its weights.
+static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
+ uint64_t &PredTrueWeight,
+ uint64_t &PredFalseWeight,
+ uint64_t &SuccTrueWeight,
+ uint64_t &SuccFalseWeight) {
+ bool PredHasWeights =
+ PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
+ bool SuccHasWeights =
+ BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
+ if (PredHasWeights || SuccHasWeights) {
+ if (!PredHasWeights)
+ PredTrueWeight = PredFalseWeight = 1;
+ if (!SuccHasWeights)
+ SuccTrueWeight = SuccFalseWeight = 1;
+ return true;
+ } else {
+ return false;
+ }
+}
+
/// If this basic block is simple enough, and if a predecessor branches to us
/// and one of our successors, fold the block into the predecessor and use
/// logical operations to pick the right destination.
@@ -2103,8 +2201,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
if (PBI->isConditional() &&
(BI->getSuccessor(0) == PBI->getSuccessor(0) ||
BI->getSuccessor(0) == PBI->getSuccessor(1))) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end();
- I != E; ) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Curr = &*I++;
if (isa<CmpInst>(Curr)) {
Cond = Curr;
@@ -2122,13 +2219,14 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
- return false;
+ return false;
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = ++Cond->getIterator();
// Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
+ while (isa<DbgInfoIntrinsic>(CondIt))
+ ++CondIt;
if (&*CondIt != BI)
return false;
@@ -2139,7 +2237,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// as "bonus instructions", and only allow this transformation when the
// number of the bonus instructions does not exceed a certain threshold.
unsigned NumBonusInsts = 0;
- for (auto I = BB->begin(); Cond != I; ++I) {
+ for (auto I = BB->begin(); Cond != &*I; ++I) {
// Ignore dbg intrinsics.
if (isa<DbgInfoIntrinsic>(I))
continue;
@@ -2168,7 +2266,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
return false;
// Finally, don't infinitely unroll conditional loops.
- BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr;
if (TrueDest == BB || FalseDest == BB)
return false;
@@ -2180,10 +2278,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// Check that we have two conditional branches. If there is a PHI node in
// the common successor, verify that the same value flows in from both
// blocks.
- SmallVector<PHINode*, 4> PHIs;
+ SmallVector<PHINode *, 4> PHIs;
if (!PBI || PBI->isUnconditional() ||
- (BI->isConditional() &&
- !SafeToMergeTerminators(BI, PBI)) ||
+ (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) ||
(!BI->isConditional() &&
!isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
continue;
@@ -2193,16 +2290,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
bool InvertPredCond = false;
if (BI->isConditional()) {
- if (PBI->getSuccessor(0) == TrueDest)
+ if (PBI->getSuccessor(0) == TrueDest) {
Opc = Instruction::Or;
- else if (PBI->getSuccessor(1) == FalseDest)
+ } else if (PBI->getSuccessor(1) == FalseDest) {
Opc = Instruction::And;
- else if (PBI->getSuccessor(0) == FalseDest)
- Opc = Instruction::And, InvertPredCond = true;
- else if (PBI->getSuccessor(1) == TrueDest)
- Opc = Instruction::Or, InvertPredCond = true;
- else
+ } else if (PBI->getSuccessor(0) == FalseDest) {
+ Opc = Instruction::And;
+ InvertPredCond = true;
+ } else if (PBI->getSuccessor(1) == TrueDest) {
+ Opc = Instruction::Or;
+ InvertPredCond = true;
+ } else {
continue;
+ }
} else {
if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest)
continue;
@@ -2219,8 +2319,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
CmpInst *CI = cast<CmpInst>(NewCond);
CI->setPredicate(CI->getInversePredicate());
} else {
- NewCond = Builder.CreateNot(NewCond,
- PBI->getCondition()->getName()+".not");
+ NewCond =
+ Builder.CreateNot(NewCond, PBI->getCondition()->getName() + ".not");
}
PBI->setCondition(NewCond);
@@ -2234,12 +2334,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// We already make sure Cond is the last instruction before BI. Therefore,
// all instructions before Cond other than DbgInfoIntrinsic are bonus
// instructions.
- for (auto BonusInst = BB->begin(); Cond != BonusInst; ++BonusInst) {
+ for (auto BonusInst = BB->begin(); Cond != &*BonusInst; ++BonusInst) {
if (isa<DbgInfoIntrinsic>(BonusInst))
continue;
Instruction *NewBonusInst = BonusInst->clone();
RemapInstruction(NewBonusInst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
VMap[&*BonusInst] = NewBonusInst;
// If we moved a load, we cannot any longer claim any knowledge about
@@ -2258,49 +2358,49 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// two conditions together.
Instruction *New = Cond->clone();
RemapInstruction(New, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
PredBlock->getInstList().insert(PBI->getIterator(), New);
New->takeName(Cond);
Cond->setName(New->getName() + ".old");
if (BI->isConditional()) {
- Instruction *NewCond =
- cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
- New, "or.cond"));
+ Instruction *NewCond = cast<Instruction>(
+ Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond"));
PBI->setCondition(NewCond);
uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
- bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
- PredFalseWeight);
- bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
- SuccFalseWeight);
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
SmallVector<uint64_t, 8> NewWeights;
if (PBI->getSuccessor(0) == BB) {
- if (PredHasWeights && SuccHasWeights) {
+ if (HasWeights) {
// PBI: br i1 %x, BB, FalseDest
// BI: br i1 %y, TrueDest, FalseDest
- //TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+ // TrueWeight is TrueWeight for PBI * TrueWeight for BI.
NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
- //FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+ // FalseWeight is FalseWeight for PBI * TotalWeight for BI +
// TrueWeight for PBI * FalseWeight for BI.
// We assume that total weights of a BranchInst can fit into 32 bits.
// Therefore, we will not have overflow using 64-bit arithmetic.
- NewWeights.push_back(PredFalseWeight * (SuccFalseWeight +
- SuccTrueWeight) + PredTrueWeight * SuccFalseWeight);
+ NewWeights.push_back(PredFalseWeight *
+ (SuccFalseWeight + SuccTrueWeight) +
+ PredTrueWeight * SuccFalseWeight);
}
AddPredecessorToBlock(TrueDest, PredBlock, BB);
PBI->setSuccessor(0, TrueDest);
}
if (PBI->getSuccessor(1) == BB) {
- if (PredHasWeights && SuccHasWeights) {
+ if (HasWeights) {
// PBI: br i1 %x, TrueDest, BB
// BI: br i1 %y, TrueDest, FalseDest
- //TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+ // TrueWeight is TrueWeight for PBI * TotalWeight for BI +
// FalseWeight for PBI * TrueWeight for BI.
- NewWeights.push_back(PredTrueWeight * (SuccFalseWeight +
- SuccTrueWeight) + PredFalseWeight * SuccTrueWeight);
- //FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+ NewWeights.push_back(PredTrueWeight *
+ (SuccFalseWeight + SuccTrueWeight) +
+ PredFalseWeight * SuccTrueWeight);
+ // FalseWeight is FalseWeight for PBI * FalseWeight for BI.
NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
}
AddPredecessorToBlock(FalseDest, PredBlock, BB);
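Plugging assumed sample weights into the formulas spelled out in the comments above (the numbers are mine, not from the patch) makes the first of the two cases, where PBI branches to BB on true, concrete:

```cpp
#include <cstdint>
#include <iostream>

// Worked example (assumed sample weights) of the first case above:
//   PBI: br i1 %x, BB, FalseDest
//   BI:  br i1 %y, TrueDest, FalseDest
int main() {
  uint64_t PredTrueWeight = 8, PredFalseWeight = 2; // weights on PBI
  uint64_t SuccTrueWeight = 3, SuccFalseWeight = 1; // weights on BI

  // TrueWeight is TrueWeight for PBI * TrueWeight for BI.
  uint64_t NewTrue = PredTrueWeight * SuccTrueWeight;
  // FalseWeight is FalseWeight for PBI * TotalWeight for BI +
  // TrueWeight for PBI * FalseWeight for BI.
  uint64_t NewFalse = PredFalseWeight * (SuccFalseWeight + SuccTrueWeight) +
                      PredTrueWeight * SuccFalseWeight;

  std::cout << NewTrue << ":" << NewFalse << "\n"; // 24:16
  return 0;
}
```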
@@ -2310,51 +2410,42 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// Halve the weights if any of them cannot fit in an uint32_t
FitWeights(NewWeights);
- SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end());
- PBI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BI->getContext()).
- createBranchWeights(MDWeights));
+ SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),
+ NewWeights.end());
+ PBI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).createBranchWeights(MDWeights));
} else
PBI->setMetadata(LLVMContext::MD_prof, nullptr);
} else {
// Update PHI nodes in the common successors.
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
ConstantInt *PBI_C = cast<ConstantInt>(
- PHIs[i]->getIncomingValueForBlock(PBI->getParent()));
+ PHIs[i]->getIncomingValueForBlock(PBI->getParent()));
assert(PBI_C->getType()->isIntegerTy(1));
Instruction *MergedCond = nullptr;
if (PBI->getSuccessor(0) == TrueDest) {
// Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value)
// PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value)
// is false: !PBI_Cond and BI_Value
- Instruction *NotCond =
- cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
- "not.cond"));
- MergedCond =
- cast<Instruction>(Builder.CreateBinOp(Instruction::And,
- NotCond, New,
- "and.cond"));
+ Instruction *NotCond = cast<Instruction>(
+ Builder.CreateNot(PBI->getCondition(), "not.cond"));
+ MergedCond = cast<Instruction>(
+ Builder.CreateBinOp(Instruction::And, NotCond, New, "and.cond"));
if (PBI_C->isOne())
- MergedCond =
- cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
- PBI->getCondition(), MergedCond,
- "or.cond"));
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::Or, PBI->getCondition(), MergedCond, "or.cond"));
} else {
// Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
// PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
// is false: PBI_Cond and BI_Value
- MergedCond =
- cast<Instruction>(Builder.CreateBinOp(Instruction::And,
- PBI->getCondition(), New,
- "and.cond"));
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::And, PBI->getCondition(), New, "and.cond"));
if (PBI_C->isOne()) {
- Instruction *NotCond =
- cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
- "not.cond"));
- MergedCond =
- cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
- NotCond, MergedCond,
- "or.cond"));
+ Instruction *NotCond = cast<Instruction>(
+ Builder.CreateNot(PBI->getCondition(), "not.cond"));
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::Or, NotCond, MergedCond, "or.cond"));
}
}
// Update PHI Node.
@@ -2371,9 +2462,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
// could replace PBI's branch probabilities with BI's.
// Copy any debug value intrinsics into the end of PredBlock.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (isa<DbgInfoIntrinsic>(*I))
- I->clone()->insertBefore(PBI);
+ for (Instruction &I : *BB)
+ if (isa<DbgInfoIntrinsic>(I))
+ I.clone()->insertBefore(PBI);
return true;
}
@@ -2417,7 +2508,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
// where OtherBB is the single other predecessor of BB's only successor.
PHINode *PHI = nullptr;
BasicBlock *Succ = BB->getSingleSuccessor();
-
+
for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
PHI = cast<PHINode>(I);
@@ -2443,8 +2534,8 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
PHI->addIncoming(V, BB);
for (BasicBlock *PredBB : predecessors(Succ))
if (PredBB != BB)
- PHI->addIncoming(AlternativeV ? AlternativeV : UndefValue::get(V->getType()),
- PredBB);
+ PHI->addIncoming(
+ AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
return PHI;
}
@@ -2481,10 +2572,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
return N <= PHINodeFoldingThreshold;
};
- if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) ||
- !IsWorthwhile(PFB) ||
- !IsWorthwhile(QTB) ||
- !IsWorthwhile(QFB)))
+ if (!MergeCondStoresAggressively &&
+ (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) ||
+ !IsWorthwhile(QFB)))
return false;
// For every pointer, there must be exactly two stores, one coming from
@@ -2561,7 +2651,7 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
QStore->eraseFromParent();
PStore->eraseFromParent();
-
+
return true;
}
@@ -2593,7 +2683,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
// We model triangles as a type of diamond with a nullptr "true" block.
// Triangles are canonicalized so that the fallthrough edge is represented by
// a true condition, as in the diagram above.
- //
+ //
BasicBlock *PTB = PBI->getSuccessor(0);
BasicBlock *PFB = PBI->getSuccessor(1);
BasicBlock *QTB = QBI->getSuccessor(0);
@@ -2622,8 +2712,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
// the post-dominating block, and the non-fallthroughs must only have one
// predecessor.
auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
- return BB->getSinglePredecessor() == P &&
- BB->getSingleSuccessor() == S;
+ return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
};
if (!PostBB ||
!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
@@ -2637,7 +2726,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
// OK, this is a sequence of two diamonds or triangles.
// Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
- SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses;
+ SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
for (auto *BB : {PTB, PFB}) {
if (!BB)
continue;
@@ -2652,7 +2741,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
if (StoreInst *SI = dyn_cast<StoreInst>(&I))
QStoreAddresses.insert(SI->getPointerOperand());
}
-
+
set_intersect(PStoreAddresses, QStoreAddresses);
// set_intersect mutates PStoreAddresses in place. Rename it here to make it
// clear what it contains.
@@ -2684,9 +2773,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
if (BB->getSinglePredecessor()) {
// Turn this into a branch on constant.
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
- CondIsTrue));
- return true; // Nuke the branch on constant.
+ BI->setCondition(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
+ return true; // Nuke the branch on constant.
}
// Otherwise, if there are multiple predecessors, insert a PHI that merges
@@ -2702,13 +2791,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Any predecessor where the condition is not computable we keep symbolic.
for (pred_iterator PI = PB; PI != PE; ++PI) {
BasicBlock *P = *PI;
- if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) &&
- PBI != BI && PBI->isConditional() &&
- PBI->getCondition() == BI->getCondition() &&
+ if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
+ PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
- CondIsTrue), P);
+ NewPN->addIncoming(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
+ P);
} else {
NewPN->addIncoming(BI->getCondition(), P);
}
@@ -2723,19 +2812,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
if (CE->canTrap())
return false;
- // If BI is reached from the true path of PBI and PBI's condition implies
- // BI's condition, we know the direction of the BI branch.
- if (PBI->getSuccessor(0) == BI->getParent() &&
- isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
- BB->getSinglePredecessor()) {
- // Turn this into a branch on constant.
- auto *OldCond = BI->getCondition();
- BI->setCondition(ConstantInt::getTrue(BB->getContext()));
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- return true; // Nuke the branch on constant.
- }
-
// If both branches are conditional and both contain stores to the same
// address, remove the stores from the conditionals and create a conditional
// merged store at the end.
@@ -2753,16 +2829,21 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
return false;
int PBIOp, BIOp;
- if (PBI->getSuccessor(0) == BI->getSuccessor(0))
- PBIOp = BIOp = 0;
- else if (PBI->getSuccessor(0) == BI->getSuccessor(1))
- PBIOp = 0, BIOp = 1;
- else if (PBI->getSuccessor(1) == BI->getSuccessor(0))
- PBIOp = 1, BIOp = 0;
- else if (PBI->getSuccessor(1) == BI->getSuccessor(1))
- PBIOp = BIOp = 1;
- else
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
+ PBIOp = 0;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
+ PBIOp = 0;
+ BIOp = 1;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
+ PBIOp = 1;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
+ PBIOp = 1;
+ BIOp = 1;
+ } else {
return false;
+ }
// Check to make sure that the other destination of this branch
// isn't BB itself. If so, this is an infinite loop that will
@@ -2780,8 +2861,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
unsigned NumPhis = 0;
- for (BasicBlock::iterator II = CommonDest->begin();
- isa<PHINode>(II); ++II, ++NumPhis) {
+ for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
+ ++II, ++NumPhis) {
if (NumPhis > 2) // Disable this xform.
return false;
@@ -2804,7 +2885,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
<< "AND: " << *BI->getParent());
-
// If OtherDest *is* BB, then BB is a basic block with a single conditional
// branch in it, where one edge (OtherDest) goes back to itself but the other
// exits. We don't *know* that the program avoids the infinite loop
@@ -2815,8 +2895,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
if (OtherDest == BB) {
// Insert it at the end of the function, because it's either code,
// or it won't matter if it's hot. :)
- BasicBlock *InfLoopBlock = BasicBlock::Create(BB->getContext(),
- "infloop", BB->getParent());
+ BasicBlock *InfLoopBlock =
+ BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
OtherDest = InfLoopBlock;
}
@@ -2828,13 +2908,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Make sure we get to CommonDest on True&True directions.
Value *PBICond = PBI->getCondition();
- IRBuilder<true, NoFolder> Builder(PBI);
+ IRBuilder<NoFolder> Builder(PBI);
if (PBIOp)
- PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not");
+ PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
Value *BICond = BI->getCondition();
if (BIOp)
- BICond = Builder.CreateNot(BICond, BICond->getName()+".not");
+ BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
// Merge the conditions.
Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
@@ -2846,15 +2926,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Update branch weight for PBI.
uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
- bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
- PredFalseWeight);
- bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
- SuccFalseWeight);
- if (PredHasWeights && SuccHasWeights) {
- uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- uint64_t PredOther = PBIOp ?PredTrueWeight : PredFalseWeight;
- uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
+ if (HasWeights) {
+ PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
// The weight to CommonDest should be PredCommon * SuccTotal +
// PredOther * SuccCommon.
// The weight to OtherDest should be PredOther * SuccOther.
@@ -2885,9 +2965,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
Value *PBIV = PN->getIncomingValue(PBBIdx);
if (BIV != PBIV) {
// Insert a select in PBI to pick the right value.
- Value *NV = cast<SelectInst>
- (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux"));
+ SelectInst *NV = cast<SelectInst>(
+ Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
PN->setIncomingValue(PBBIdx, NV);
+ // Although the select has the same condition as PBI, the original branch
+ // weights for PBI do not apply to the new select because the select's
+ // 'logical' edges are incoming edges of the phi that is eliminated, not
+ // the outgoing edges of PBI.
+ if (HasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to PredCommonDest should be PredCommon * SuccTotal.
+ // The weight to PredOtherDest should be PredOther * SuccCommon.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
+ PredOther * SuccCommon};
+
+ FitWeights(NewWeights);
+
+ NV->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext())
+ .createBranchWeights(NewWeights[0], NewWeights[1]));
+ }
}
}
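With assumed sample weights (mine, not from the patch), the select metadata computed just above works out as follows:

```cpp
#include <cstdint>
#include <iostream>

// Worked example of the select weights described above:
// {PredCommon * SuccTotal, PredOther * SuccCommon}.
int main() {
  uint64_t PredCommon = 6, PredOther = 2; // PBI edge weights
  uint64_t SuccCommon = 5, SuccOther = 3; // BI edge weights

  uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther), // 6*8 = 48
                            PredOther * SuccCommon};               // 2*5 = 10

  std::cout << NewWeights[0] << ":" << NewWeights[1] << "\n"; // 48:10
  return 0;
}
```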
@@ -2907,7 +3007,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
uint32_t TrueWeight,
- uint32_t FalseWeight){
+ uint32_t FalseWeight) {
// Remove any superfluous successor edges from the CFG.
// First, figure out which successors to preserve.
// If TrueBB and FalseBB are equal, only try to preserve one copy of that
@@ -2942,8 +3042,8 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
if (TrueWeight != FalseWeight)
NewBI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(OldTerm->getContext()).
- createBranchWeights(TrueWeight, FalseWeight));
+ MDBuilder(OldTerm->getContext())
+ .createBranchWeights(TrueWeight, FalseWeight));
}
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
@@ -2988,16 +3088,16 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
if (HasWeights) {
GetBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
- TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal).
- getSuccessorIndex()];
- FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal).
- getSuccessorIndex()];
+ TrueWeight =
+ (uint32_t)Weights[SI->findCaseValue(TrueVal).getSuccessorIndex()];
+ FalseWeight =
+ (uint32_t)Weights[SI->findCaseValue(FalseVal).getSuccessorIndex()];
}
}
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB,
- TrueWeight, FalseWeight);
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
+ FalseWeight);
}
// Replaces
@@ -3017,8 +3117,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
BasicBlock *FalseBB = FBA->getBasicBlock();
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
- 0, 0);
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
+ 0);
}
/// This is called when we find an icmp instruction
@@ -3046,7 +3146,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
// If the block has any PHIs in it or the icmp has multiple uses, it is too
// complex.
- if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false;
+ if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
+ return false;
Value *V = ICI->getOperand(0);
ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
@@ -3055,7 +3156,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
// 'V' and this block is the default case for the switch. In this case we can
// fold the compared value into the switch to simplify things.
BasicBlock *Pred = BB->getSinglePredecessor();
- if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) return false;
+ if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
+ return false;
SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
if (SI->getCondition() != V)
@@ -3104,7 +3206,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
// If the icmp is a SETEQ, then the default dest gets false, the new edge gets
// true in the PHI.
Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
- Constant *NewCst = ConstantInt::getFalse(BB->getContext());
+ Constant *NewCst = ConstantInt::getFalse(BB->getContext());
if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
std::swap(DefaultCst, NewCst);
@@ -3116,21 +3218,21 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
// Okay, the switch goes to this block on a default value. Add an edge from
// the switch to the merge point on the compared value.
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
- BB->getParent(), BB);
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
SmallVector<uint64_t, 8> Weights;
bool HasWeights = HasBranchWeights(SI);
if (HasWeights) {
GetBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
// Split weight for default case to case for "Cst".
- Weights[0] = (Weights[0]+1) >> 1;
+ Weights[0] = (Weights[0] + 1) >> 1;
Weights.push_back(Weights[0]);
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
- SI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(SI->getContext()).
- createBranchWeights(MDWeights));
+ SI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).createBranchWeights(MDWeights));
}
}
SI->addCase(Cst, NewBB);
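A small worked example (sample weights of my own) of the default-weight split applied just above:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Half of the default weight, rounded up, moves to the new case for "Cst".
int main() {
  std::vector<uint64_t> Weights = {9, 20, 30}; // default, case0, case1
  Weights[0] = (Weights[0] + 1) >> 1;          // 9 -> 5
  Weights.push_back(Weights[0]);               // the new case also gets 5

  for (uint64_t W : Weights)
    std::cout << W << " ";                     // 5 20 30 5
  std::cout << "\n";
  return 0;
}
```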
@@ -3149,7 +3251,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
const DataLayout &DL) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
- if (!Cond) return false;
+ if (!Cond)
+ return false;
// Change br (X == 0 | X == 1), T, F into a switch instruction.
// If this is a bunch of seteq's or'd together, or if it's a bunch of
@@ -3158,13 +3261,14 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// Try to gather values from a chain of and/or to be turned into a switch
ConstantComparesGatherer ConstantCompare(Cond, DL);
// Unpack the result
- SmallVectorImpl<ConstantInt*> &Values = ConstantCompare.Vals;
+ SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
Value *CompVal = ConstantCompare.CompValue;
unsigned UsedICmps = ConstantCompare.UsedICmps;
Value *ExtraCase = ConstantCompare.Extra;
// If we didn't have a multiply compared value, fail.
- if (!CompVal) return false;
+ if (!CompVal)
+ return false;
// Avoid turning single icmps into a switch.
if (UsedICmps <= 1)
@@ -3179,20 +3283,23 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// If Extra was used, we require at least two switch values to do the
// transformation. A switch with one value is just a conditional branch.
- if (ExtraCase && Values.size() < 2) return false;
+ if (ExtraCase && Values.size() < 2)
+ return false;
// TODO: Preserve branch weight metadata, similarly to how
// FoldValueComparisonIntoPredecessors preserves it.
// Figure out which block is which destination.
BasicBlock *DefaultBB = BI->getSuccessor(1);
- BasicBlock *EdgeBB = BI->getSuccessor(0);
- if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
+ BasicBlock *EdgeBB = BI->getSuccessor(0);
+ if (!TrueWhenEqual)
+ std::swap(DefaultBB, EdgeBB);
BasicBlock *BB = BI->getParent();
DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
- << " cases into SWITCH. BB is:\n" << *BB);
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
// If there are any extra values that couldn't be folded into the switch
// then we evaluate them with an explicit branch first. Split the block
@@ -3216,7 +3323,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
AddPredecessorToBlock(EdgeBB, BB, NewBB);
DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
- << "\nEXTRABB = " << *BB);
+ << "\nEXTRABB = " << *BB);
BB = NewBB;
}
@@ -3237,11 +3344,10 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
// We added edges from PI to the EdgeBB. As such, if there were any
// PHI nodes in EdgeBB, they need entries to be added corresponding to
// the number of edges added.
- for (BasicBlock::iterator BBI = EdgeBB->begin();
- isa<PHINode>(BBI); ++BBI) {
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
PHINode *PN = cast<PHINode>(BBI);
Value *InVal = PN->getIncomingValueForBlock(BB);
- for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
PN->addIncoming(InVal, BB);
}
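As a source-level sketch of my own, the 'icmp' chain conversion described at the top of SimplifyBranchOnICmpChain turns an or'd equality chain into the switch form:

```cpp
#include <iostream>

// Sketch of converting "br (X == 0 | X == 1), T, F" into a switch.
static const char *withOrChain(int x) {
  if (x == 0 || x == 1)
    return "T";
  return "F";
}

static const char *withSwitch(int x) {
  switch (x) {
  case 0:
  case 1:
    return "T";
  default:
    return "F";
  }
}

int main() {
  for (int x : {0, 1, 2})
    std::cout << withOrChain(x) << withSwitch(x) << " "; // TT TT FF
  std::cout << "\n";
  return 0;
}
```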
@@ -3270,7 +3376,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
// Check that there are no other instructions except for debug intrinsics
// between the phi of landing pads (RI->getValue()) and resume instruction.
BasicBlock::iterator I = cast<Instruction>(RI->getValue())->getIterator(),
- E = RI->getIterator();
+ E = RI->getIterator();
while (++I != E)
if (!isa<DbgInfoIntrinsic>(I))
return false;
@@ -3279,8 +3385,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
auto *PhiLPInst = cast<PHINode>(RI->getValue());
// Check incoming blocks to see if any of them are trivial.
- for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues();
- Idx != End; Idx++) {
+ for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
+ Idx++) {
auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
@@ -3289,8 +3395,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
if (IncomingBB->getUniqueSuccessor() != BB)
continue;
- auto *LandingPad =
- dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
+ auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
// Not the landing pad that caused the control to branch here.
if (IncomingValue != LandingPad)
continue;
@@ -3310,7 +3415,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
}
// If no trivial unwind blocks, don't do any simplifications.
- if (TrivialUnwindBlocks.empty()) return false;
+ if (TrivialUnwindBlocks.empty())
+ return false;
// Turn all invokes that unwind here into calls.
for (auto *TrivialBB : TrivialUnwindBlocks) {
@@ -3346,8 +3452,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) {
BasicBlock *BB = RI->getParent();
LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI());
- assert (RI->getValue() == LPInst &&
- "Resume must unwind the exception that caused control to here");
+ assert(RI->getValue() == LPInst &&
+ "Resume must unwind the exception that caused control to here");
// Check that there are no other instructions except for debug intrinsics.
BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator();
@@ -3363,10 +3469,12 @@ bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) {
// The landingpad is now unreachable. Zap it.
BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
return true;
}
-bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
+static bool removeEmptyCleanup(CleanupReturnInst *RI) {
// If this is a trivial cleanup pad that executes no instructions, it can be
// eliminated. If the cleanup pad continues to the caller, any predecessor
// that is an EH pad will be updated to continue to the caller and any
@@ -3381,12 +3489,29 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
// This isn't an empty cleanup.
return false;
- // Check that there are no other instructions except for debug intrinsics.
+ // We cannot kill the pad if it has multiple uses. This typically arises
+ // from unreachable basic blocks.
+ if (!CPInst->hasOneUse())
+ return false;
+
+ // Check that there are no other instructions except for benign intrinsics.
BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator();
- while (++I != E)
- if (!isa<DbgInfoIntrinsic>(I))
+ while (++I != E) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
return false;
+ Intrinsic::ID IntrinsicID = II->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ case Intrinsic::lifetime_end:
+ break;
+ default:
+ return false;
+ }
+ }
+
// If the cleanup return we are simplifying unwinds to the caller, this will
// set UnwindDest to nullptr.
BasicBlock *UnwindDest = RI->getUnwindDest();
@@ -3430,7 +3555,7 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
// removing, we need to merge that PHI node's incoming values into
// DestPN.
for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
- SrcIdx != SrcE; ++SrcIdx) {
+ SrcIdx != SrcE; ++SrcIdx) {
DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
SrcPN->getIncomingBlock(SrcIdx));
}
@@ -3484,13 +3609,63 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
return true;
}
+// Try to merge two cleanuppads together.
+static bool mergeCleanupPad(CleanupReturnInst *RI) {
+ // Skip any cleanuprets which unwind to caller, there is nothing to merge
+ // with.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ if (!UnwindDest)
+ return false;
+
+  // If this cleanupret isn't the only predecessor of its unwind destination,
+  // it wouldn't be safe to merge without code duplication.
+ if (UnwindDest->getSinglePredecessor() != RI->getParent())
+ return false;
+
+ // Verify that our cleanuppad's unwind destination is another cleanuppad.
+ auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
+ if (!SuccessorCleanupPad)
+ return false;
+
+ CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
+  // Replace any uses of the successor cleanuppad with the predecessor pad.
+  // The only cleanuppad uses should be this cleanupret, its cleanupret, and
+  // funclet bundle operands.
+ SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
+ // Remove the old cleanuppad.
+ SuccessorCleanupPad->eraseFromParent();
+ // Now, we simply replace the cleanupret with a branch to the unwind
+ // destination.
+ BranchInst::Create(UnwindDest, RI->getParent());
+ RI->eraseFromParent();
+
+ return true;
+}
+
+bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
+  // It is possible to transiently have an undef cleanuppad operand because we
+ // have deleted some, but not all, dead blocks.
+ // Eventually, this block will be deleted.
+ if (isa<UndefValue>(RI->getOperand(0)))
+ return false;
+
+ if (mergeCleanupPad(RI))
+ return true;
+
+ if (removeEmptyCleanup(RI))
+ return true;
+
+ return false;
+}
+
bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
BasicBlock *BB = RI->getParent();
- if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
+ if (!BB->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
// Find predecessors that end with branches.
- SmallVector<BasicBlock*, 8> UncondBranchPreds;
- SmallVector<BranchInst*, 8> CondBranchPreds;
+ SmallVector<BasicBlock *, 8> UncondBranchPreds;
+ SmallVector<BranchInst *, 8> CondBranchPreds;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *P = *PI;
TerminatorInst *PTI = P->getTerminator();
@@ -3507,14 +3682,17 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
while (!UncondBranchPreds.empty()) {
BasicBlock *Pred = UncondBranchPreds.pop_back_val();
DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred);
}
// If we eliminated all predecessors of the block, delete the block now.
- if (pred_empty(BB))
+ if (pred_empty(BB)) {
// We know there are no successors, so just nuke the block.
BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
+ }
return true;
}
@@ -3547,7 +3725,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
// Do not delete instructions that can have side effects which might cause
// the unreachable to not be reachable; specifically, calls and volatile
// operations may have this effect.
- if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+ break;
if (BBI->mayHaveSideEffects()) {
if (auto *SI = dyn_cast<StoreInst>(BBI)) {
@@ -3589,9 +3768,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
// If the unreachable instruction is the first in the block, take a gander
// at all of the predecessors of this instruction, and simplify them.
- if (&BB->front() != UI) return Changed;
+ if (&BB->front() != UI)
+ return Changed;
- SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+ SmallVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
TerminatorInst *TI = Preds[i]->getTerminator();
IRBuilder<> Builder(TI);
@@ -3613,12 +3793,13 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
}
} else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i)
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
+ ++i)
if (i.getCaseSuccessor() == BB) {
BB->removePredecessor(SI->getParent());
SI->removeCase(i);
- --i; --e;
+ --i;
+ --e;
Changed = true;
}
} else if (auto *II = dyn_cast<InvokeInst>(TI)) {
@@ -3667,10 +3848,11 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
// If this block is now dead, remove it.
- if (pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock()) {
+ if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
// We know there are no successors, so just nuke the block.
BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
return true;
}
@@ -3699,25 +3881,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
// Partition the cases into two sets with different destinations.
BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
BasicBlock *DestB = nullptr;
- SmallVector <ConstantInt *, 16> CasesA;
- SmallVector <ConstantInt *, 16> CasesB;
+ SmallVector<ConstantInt *, 16> CasesA;
+ SmallVector<ConstantInt *, 16> CasesB;
for (SwitchInst::CaseIt I : SI->cases()) {
BasicBlock *Dest = I.getCaseSuccessor();
- if (!DestA) DestA = Dest;
+ if (!DestA)
+ DestA = Dest;
if (Dest == DestA) {
CasesA.push_back(I.getCaseValue());
continue;
}
- if (!DestB) DestB = Dest;
+ if (!DestB)
+ DestB = Dest;
if (Dest == DestB) {
CasesB.push_back(I.getCaseValue());
continue;
}
- return false; // More than two destinations.
+ return false; // More than two destinations.
}
- assert(DestA && DestB && "Single-destination switch should have been folded.");
+ assert(DestA && DestB &&
+ "Single-destination switch should have been folded.");
assert(DestA != DestB);
assert(DestB != SI->getDefaultDest());
assert(!CasesB.empty() && "There must be non-default cases.");
@@ -3741,7 +3926,8 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
// Start building the compare and branch.
Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
- Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCases->size());
+ Constant *NumCases =
+ ConstantInt::get(Offset->getType(), ContiguousCases->size());
Value *Sub = SI->getCondition();
if (!Offset->isNullValue())
@@ -3773,21 +3959,24 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
FalseWeight /= 2;
}
NewBI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(SI->getContext()).createBranchWeights(
- (uint32_t)TrueWeight, (uint32_t)FalseWeight));
+ MDBuilder(SI->getContext())
+ .createBranchWeights((uint32_t)TrueWeight,
+ (uint32_t)FalseWeight));
}
}
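TurnSwitchRangeIntoICmp's rewrite can be pictured at the source level like this (my illustration, assuming a contiguous case range of 3..5): subtract the range base, then a single unsigned compare against the number of cases replaces the switch.

```cpp
#include <iostream>

static const char *rangeSwitch(int x) {
  switch (x) {
  case 3:
  case 4:
  case 5:
    return "contiguous";
  default:
    return "other";
  }
}

static const char *rangeICmp(int x) {
  // (x - 3) ult 3 holds exactly for x in {3, 4, 5}.
  return (unsigned)(x - 3) < 3u ? "contiguous" : "other";
}

int main() {
  for (int x : {2, 3, 5, 6})
    std::cout << rangeSwitch(x) << "/" << rangeICmp(x) << " ";
  std::cout << "\n"; // other/other contiguous/contiguous contiguous/contiguous other/other
  return 0;
}
```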
// Prune obsolete incoming values off the successors' PHI nodes.
for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
unsigned PreviousEdges = ContiguousCases->size();
- if (ContiguousDest == SI->getDefaultDest()) ++PreviousEdges;
+ if (ContiguousDest == SI->getDefaultDest())
+ ++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
- if (OtherDest == SI->getDefaultDest()) ++PreviousEdges;
+ if (OtherDest == SI->getDefaultDest())
+ ++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
@@ -3807,32 +3996,38 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI);
+ // We can also eliminate cases by determining that their values are outside of
+ // the limited range of the condition based on how many significant (non-sign)
+ // bits are in the condition value.
+ unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
+ unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+
// Gather dead cases.
- SmallVector<ConstantInt*, 8> DeadCases;
- for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
- if ((I.getCaseValue()->getValue() & KnownZero) != 0 ||
- (I.getCaseValue()->getValue() & KnownOne) != KnownOne) {
- DeadCases.push_back(I.getCaseValue());
- DEBUG(dbgs() << "SimplifyCFG: switch case '"
- << I.getCaseValue() << "' is dead.\n");
+ SmallVector<ConstantInt *, 8> DeadCases;
+ for (auto &Case : SI->cases()) {
+ APInt CaseVal = Case.getCaseValue()->getValue();
+ if ((CaseVal & KnownZero) != 0 || (CaseVal & KnownOne) != KnownOne ||
+ (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
+ DeadCases.push_back(Case.getCaseValue());
+ DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n");
}
}
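// ---- Editor's illustrative sketch (not part of this patch) ----
// The new MaxSignificantBitsInCond check above prunes cases that need more
// signed bits than the condition can carry. For example, if the condition is
// a sign-extended i8, any i32 case value outside [-128, 127] is dead. A
// hypothetical helper using the same APInt query:
#include "llvm/ADT/APInt.h"

static bool deadBySignBits(const llvm::APInt &CaseVal,
                           unsigned MaxSignificantBitsInCond) {
  // getMinSignedBits() is the smallest signed width that holds the value;
  // 300 needs 10 bits, so it is dead when only 8 bits are significant.
  return CaseVal.getMinSignedBits() > MaxSignificantBitsInCond;
}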
- // If we can prove that the cases must cover all possible values, the
- // default destination becomes dead and we can remove it. If we know some
+ // If we can prove that the cases must cover all possible values, the
+ // default destination becomes dead and we can remove it. If we know some
// of the bits in the value, we can use that to more precisely compute the
// number of possible unique case values.
bool HasDefault =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
- const unsigned NumUnknownBits = Bits -
- (KnownZero.Or(KnownOne)).countPopulation();
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const unsigned NumUnknownBits =
+ Bits - (KnownZero.Or(KnownOne)).countPopulation();
assert(NumUnknownBits <= Bits);
if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
+ NumUnknownBits < 64 /* avoid overflow */ &&
SI->getNumCases() == (1ULL << NumUnknownBits)) {
DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
- BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(),
- SI->getParent(), "");
+ BasicBlock *NewDefault =
+ SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), "");
SI->setDefaultDest(&*NewDefault);
SplitBlock(&*NewDefault, &NewDefault->front());
auto *OldTI = NewDefault->getTerminator();
@@ -3849,12 +4044,12 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
}
// Remove dead cases from the switch.
- for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
- SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
+ for (ConstantInt *DeadCase : DeadCases) {
+ SwitchInst::CaseIt Case = SI->findCaseValue(DeadCase);
assert(Case != SI->case_default() &&
"Case was not found. Probably mistake in DeadCases forming.");
if (HasWeight) {
- std::swap(Weights[Case.getCaseIndex()+1], Weights.back());
+ std::swap(Weights[Case.getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
@@ -3865,8 +4060,8 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
if (HasWeight && Weights.size() >= 2) {
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
SI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(SI->getParent()->getContext()).
- createBranchWeights(MDWeights));
+ MDBuilder(SI->getParent()->getContext())
+ .createBranchWeights(MDWeights));
}
return !DeadCases.empty();
@@ -3878,8 +4073,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
/// block and see if the incoming value is equal to CaseValue. If so, return
/// the phi node, and set PhiIndex to BB's index in the phi node.
static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
- BasicBlock *BB,
- int *PhiIndex) {
+ BasicBlock *BB, int *PhiIndex) {
if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
return nullptr; // BB must be empty to be a candidate for simplification.
if (!BB->getSinglePredecessor())
@@ -3897,7 +4091,8 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
assert(Idx >= 0 && "PHI has no entry for predecessor?");
Value *InValue = PHI->getIncomingValue(Idx);
- if (InValue != CaseValue) continue;
+ if (InValue != CaseValue)
+ continue;
*PhiIndex = Idx;
return PHI;
@@ -3911,17 +4106,19 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
/// blocks of the switch can be folded away.
/// Returns true if a change is made.
static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
- typedef DenseMap<PHINode*, SmallVector<int,4> > ForwardingNodesMap;
+ typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap;
ForwardingNodesMap ForwardingNodes;
- for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
+ for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E;
+ ++I) {
ConstantInt *CaseValue = I.getCaseValue();
BasicBlock *CaseDest = I.getCaseSuccessor();
int PhiIndex;
- PHINode *PHI = FindPHIForConditionForwarding(CaseValue, CaseDest,
- &PhiIndex);
- if (!PHI) continue;
+ PHINode *PHI =
+ FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIndex);
+ if (!PHI)
+ continue;
ForwardingNodes[PHI].push_back(PhiIndex);
}
@@ -3929,11 +4126,13 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
bool Changed = false;
for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(),
- E = ForwardingNodes.end(); I != E; ++I) {
+ E = ForwardingNodes.end();
+ I != E; ++I) {
PHINode *Phi = I->first;
SmallVectorImpl<int> &Indexes = I->second;
- if (Indexes.size() < 2) continue;
+ if (Indexes.size() < 2)
+ continue;
for (size_t I = 0, E = Indexes.size(); I != E; ++I)
Phi->setIncomingValue(Indexes[I], SI->getCondition());
@@ -3954,17 +4153,16 @@ static bool ValidLookupTableConstant(Constant *C) {
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
return CE->isGEPWithNoNotionalOverIndexing();
- return isa<ConstantFP>(C) ||
- isa<ConstantInt>(C) ||
- isa<ConstantPointerNull>(C) ||
- isa<GlobalValue>(C) ||
- isa<UndefValue>(C);
+ return isa<ConstantFP>(C) || isa<ConstantInt>(C) ||
+ isa<ConstantPointerNull>(C) || isa<GlobalValue>(C) ||
+ isa<UndefValue>(C);
}
/// If V is a Constant, return it. Otherwise, try to look up
/// its constant value in ConstantPool, returning 0 if it's not there.
-static Constant *LookupConstant(Value *V,
- const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+static Constant *
+LookupConstant(Value *V,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
if (Constant *C = dyn_cast<Constant>(V))
return C;
return ConstantPool.lookup(V);
@@ -4001,7 +4199,7 @@ ConstantFold(Instruction *I, const DataLayout &DL,
COps[1], DL);
}
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
+ return ConstantFoldInstOperands(I, COps, DL);
}
/// Try to determine the resulting constant values in phi nodes
@@ -4018,7 +4216,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
// If CaseDest is empty except for some side-effect free instructions through
// which we can constant-propagate the CaseVal, continue to its successor.
- SmallDenseMap<Value*, Constant*> ConstantPool;
+ SmallDenseMap<Value *, Constant *> ConstantPool;
ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
++I) {
@@ -4068,8 +4266,8 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
if (Idx == -1)
continue;
- Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx),
- ConstantPool);
+ Constant *ConstVal =
+ LookupConstant(PHI->getIncomingValue(Idx), ConstantPool);
if (!ConstVal)
return false;
@@ -4086,16 +4284,16 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
// Helper function used to add CaseVal to the list of cases that generate
// Result.
static void MapCaseToResult(ConstantInt *CaseVal,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *Result) {
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
for (auto &I : UniqueResults) {
if (I.first == Result) {
I.second.push_back(CaseVal);
return;
}
}
- UniqueResults.push_back(std::make_pair(Result,
- SmallVector<ConstantInt*, 4>(1, CaseVal)));
+ UniqueResults.push_back(
+ std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
}
// Helper function that initializes a map containing
@@ -4137,7 +4335,7 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
DefaultResult =
DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
if ((!DefaultResult &&
- !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
+ !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
return false;
return true;
@@ -4154,12 +4352,11 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
// default:
// return 4;
// }
-static Value *
-ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
- Constant *DefaultResult, Value *Condition,
- IRBuilder<> &Builder) {
+static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
+ Constant *DefaultResult, Value *Condition,
+ IRBuilder<> &Builder) {
assert(ResultVector.size() == 2 &&
- "We should have exactly two unique results at this point");
+ "We should have exactly two unique results at this point");
// If we are selecting between only two cases transform into a simple
// select or a two-way select if default is possible.
if (ResultVector[0].second.size() == 1 &&
@@ -4177,8 +4374,8 @@ ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
}
Value *const ValueCompare =
Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
- return Builder.CreateSelect(ValueCompare, ResultVector[0].first, SelectValue,
- "switch.select");
+ return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
}
return nullptr;
@@ -4227,9 +4424,8 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
assert(PHI != nullptr && "PHI for value select not found");
Builder.SetInsertPoint(SI);
- Value *SelectValue = ConvertTwoCaseSwitch(
- UniqueResults,
- DefaultResult, Cond, Builder);
+ Value *SelectValue =
+ ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
if (SelectValue) {
RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder);
return true;
@@ -4239,62 +4435,62 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
}
namespace {
- /// This class represents a lookup table that can be used to replace a switch.
- class SwitchLookupTable {
- public:
- /// Create a lookup table to use as a switch replacement with the contents
- /// of Values, using DefaultValue to fill any holes in the table.
- SwitchLookupTable(
- Module &M, uint64_t TableSize, ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL);
-
- /// Build instructions with Builder to retrieve the value at
- /// the position given by Index in the lookup table.
- Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
-
- /// Return true if a table with TableSize elements of
- /// type ElementType would fit in a target-legal register.
- static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
- Type *ElementType);
-
- private:
- // Depending on the contents of the table, it can be represented in
- // different ways.
- enum {
- // For tables where each element contains the same value, we just have to
- // store that single value and return it for each lookup.
- SingleValueKind,
-
- // For tables where there is a linear relationship between table index
- // and values. We calculate the result with a simple multiplication
- // and addition instead of a table lookup.
- LinearMapKind,
-
- // For small tables with integer elements, we can pack them into a bitmap
- // that fits into a target-legal register. Values are retrieved by
- // shift and mask operations.
- BitMapKind,
-
- // The table is stored as an array of values. Values are retrieved by load
- // instructions from the table.
- ArrayKind
- } Kind;
-
- // For SingleValueKind, this is the single value.
- Constant *SingleValue;
-
- // For BitMapKind, this is the bitmap.
- ConstantInt *BitMap;
- IntegerType *BitMapElementTy;
-
- // For LinearMapKind, these are the constants used to derive the value.
- ConstantInt *LinearOffset;
- ConstantInt *LinearMultiplier;
-
- // For ArrayKind, this is the array.
- GlobalVariable *Array;
- };
+/// This class represents a lookup table that can be used to replace a switch.
+class SwitchLookupTable {
+public:
+ /// Create a lookup table to use as a switch replacement with the contents
+ /// of Values, using DefaultValue to fill any holes in the table.
+ SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL);
+
+ /// Build instructions with Builder to retrieve the value at
+ /// the position given by Index in the lookup table.
+ Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+ /// Return true if a table with TableSize elements of
+ /// type ElementType would fit in a target-legal register.
+ static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
+ Type *ElementType);
+
+private:
+ // Depending on the contents of the table, it can be represented in
+ // different ways.
+ enum {
+ // For tables where each element contains the same value, we just have to
+ // store that single value and return it for each lookup.
+ SingleValueKind,
+
+ // For tables where there is a linear relationship between table index
+ // and values. We calculate the result with a simple multiplication
+ // and addition instead of a table lookup.
+ LinearMapKind,
+
+ // For small tables with integer elements, we can pack them into a bitmap
+ // that fits into a target-legal register. Values are retrieved by
+ // shift and mask operations.
+ BitMapKind,
+
+ // The table is stored as an array of values. Values are retrieved by load
+ // instructions from the table.
+ ArrayKind
+ } Kind;
+
+ // For SingleValueKind, this is the single value.
+ Constant *SingleValue;
+
+ // For BitMapKind, this is the bitmap.
+ ConstantInt *BitMap;
+ IntegerType *BitMapElementTy;
+
+ // For LinearMapKind, these are the constants used to derive the value.
+ ConstantInt *LinearOffset;
+ ConstantInt *LinearMultiplier;
+
+ // For ArrayKind, this is the array.
+ GlobalVariable *Array;
+};
}
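// ---- Editor's illustrative sketch (not part of this patch) ----
// The LinearMapKind representation above avoids materializing a table at all
// when the results are an affine function of the case index. For example,
// indices 0/1/2 mapping to 10/13/16 reduce to a multiply-add (hypothetical
// constants, chosen only for illustration):
static int linearMapLookup(unsigned Index) {
  const int LinearOffset = 10;    // value for index 0
  const int LinearMultiplier = 3; // common stride between consecutive values
  return LinearOffset + LinearMultiplier * static_cast<int>(Index);
}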
SwitchLookupTable::SwitchLookupTable(
@@ -4312,14 +4508,13 @@ SwitchLookupTable::SwitchLookupTable(
Type *ValueType = Values.begin()->second->getType();
// Build up the table contents.
- SmallVector<Constant*, 64> TableContents(TableSize);
+ SmallVector<Constant *, 64> TableContents(TableSize);
for (size_t I = 0, E = Values.size(); I != E; ++I) {
ConstantInt *CaseVal = Values[I].first;
Constant *CaseRes = Values[I].second;
assert(CaseRes->getType() == ValueType);
- uint64_t Idx = (CaseVal->getValue() - Offset->getValue())
- .getLimitedValue();
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
TableContents[Idx] = CaseRes;
if (CaseRes != SingleValue)
@@ -4407,65 +4602,62 @@ SwitchLookupTable::SwitchLookupTable(
ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
- Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
- GlobalVariable::PrivateLinkage,
- Initializer,
+ Array = new GlobalVariable(M, ArrayTy, /*constant=*/true,
+ GlobalVariable::PrivateLinkage, Initializer,
"switch.table");
- Array->setUnnamedAddr(true);
+ Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
Kind = ArrayKind;
}
Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
switch (Kind) {
- case SingleValueKind:
- return SingleValue;
- case LinearMapKind: {
- // Derive the result value from the input value.
- Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
- false, "switch.idx.cast");
- if (!LinearMultiplier->isOne())
- Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
- if (!LinearOffset->isZero())
- Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
- return Result;
- }
- case BitMapKind: {
- // Type of the bitmap (e.g. i59).
- IntegerType *MapTy = BitMap->getType();
-
- // Cast Index to the same type as the bitmap.
- // Note: The Index is <= the number of elements in the table, so
- // truncating it to the width of the bitmask is safe.
- Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
-
- // Multiply the shift amount by the element width.
- ShiftAmt = Builder.CreateMul(ShiftAmt,
- ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
- "switch.shiftamt");
-
- // Shift down.
- Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt,
- "switch.downshift");
- // Mask off.
- return Builder.CreateTrunc(DownShifted, BitMapElementTy,
- "switch.masked");
- }
- case ArrayKind: {
- // Make sure the table index will not overflow when treated as signed.
- IntegerType *IT = cast<IntegerType>(Index->getType());
- uint64_t TableSize = Array->getInitializer()->getType()
- ->getArrayNumElements();
- if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
- Index = Builder.CreateZExt(Index,
- IntegerType::get(IT->getContext(),
- IT->getBitWidth() + 1),
- "switch.tableidx.zext");
-
- Value *GEPIndices[] = { Builder.getInt32(0), Index };
- Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
- GEPIndices, "switch.gep");
- return Builder.CreateLoad(GEP, "switch.load");
- }
+ case SingleValueKind:
+ return SingleValue;
+ case LinearMapKind: {
+ // Derive the result value from the input value.
+ Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
+ false, "switch.idx.cast");
+ if (!LinearMultiplier->isOne())
+ Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
+ if (!LinearOffset->isZero())
+ Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
+ return Result;
+ }
+ case BitMapKind: {
+ // Type of the bitmap (e.g. i59).
+ IntegerType *MapTy = BitMap->getType();
+
+ // Cast Index to the same type as the bitmap.
+ // Note: The Index is <= the number of elements in the table, so
+ // truncating it to the width of the bitmask is safe.
+ Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+ // Multiply the shift amount by the element width.
+ ShiftAmt = Builder.CreateMul(
+ ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+ "switch.shiftamt");
+
+ // Shift down.
+ Value *DownShifted =
+ Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
+ // Mask off.
+ return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
+ }
+ case ArrayKind: {
+ // Make sure the table index will not overflow when treated as signed.
+ IntegerType *IT = cast<IntegerType>(Index->getType());
+ uint64_t TableSize =
+ Array->getInitializer()->getType()->getArrayNumElements();
+ if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
+ Index = Builder.CreateZExt(
+ Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
+ "switch.tableidx.zext");
+
+ Value *GEPIndices[] = {Builder.getInt32(0), Index};
+ Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
+ GEPIndices, "switch.gep");
+ return Builder.CreateLoad(GEP, "switch.load");
+ }
}
llvm_unreachable("Unknown lookup table kind!");
}
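// ---- Editor's illustrative sketch (not part of this patch) ----
// The BitMapKind path above packs small tables into one register-sized
// integer and extracts entries with a shift and a truncate. The equivalent
// plain C++ lookup for four i8 entries packed into a 32-bit map (assumed
// layout: entry I lives in bits [8*I, 8*I+8)):
#include <cstdint>

static uint8_t bitMapLookup(uint32_t BitMap, unsigned Index) {
  const unsigned ElementBits = 8;  // width of BitMapElementTy
  // Shift the wanted element down; truncating to uint8_t masks it off.
  return static_cast<uint8_t>(BitMap >> (Index * ElementBits));
}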
@@ -4480,7 +4672,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
// are <= 15, we could try to narrow the type.
// Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
- if (TableSize >= UINT_MAX/IT->getBitWidth())
+ if (TableSize >= UINT_MAX / IT->getBitWidth())
return false;
return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
}
@@ -4503,8 +4695,9 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
// Saturate this flag to false.
- AllTablesFitInRegister = AllTablesFitInRegister &&
- SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
+ AllTablesFitInRegister =
+ AllTablesFitInRegister &&
+ SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
// If both flags saturate, we're done. NOTE: This *only* works with
// saturating flags, and all flags have to saturate first due to the
@@ -4547,9 +4740,10 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
/// ...
/// \endcode
/// Jump threading will then eliminate the second if(cond).
-static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,
- BranchInst *RangeCheckBranch, Constant *DefaultValue,
- const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values) {
+static void reuseTableCompare(
+ User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
+ Constant *DefaultValue,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
if (!CmpInst)
@@ -4578,13 +4772,13 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,
// compare result.
for (auto ValuePair : Values) {
Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
- ValuePair.second, CmpOp1, true);
+ ValuePair.second, CmpOp1, true);
if (!CaseConst || CaseConst == DefaultConst)
return;
assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
"Expect true or false as compare result.");
}
-
+
// Check if the branch instruction dominates the phi node. It's a simple
// dominance check, but sufficient for our needs.
// Although this check is invariant in the calling loops, it's better to do it
@@ -4602,9 +4796,9 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,
++NumTableCmpReuses;
} else {
// The compare yields the same result, just inverted. We can replace it.
- Value *InvertedTableCmp = BinaryOperator::CreateXor(RangeCmp,
- ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
- RangeCheckBranch);
+ Value *InvertedTableCmp = BinaryOperator::CreateXor(
+ RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
+ RangeCheckBranch);
CmpInst->replaceAllUsesWith(InvertedTableCmp);
++NumTableCmpReuses;
}
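// ---- Editor's illustrative sketch (not part of this patch) ----
// reuseTableCompare targets code where the looked-up value is only compared
// against the default result; the comparison then collapses into the range
// check that already guards the table (possibly inverted). A hypothetical
// source-level shape of that pattern:
static const int Table[4] = {1, 1, 1, 1};

static int classify(unsigned X) {
  int V = (X < 4) ? Table[X] : 0;  // lookup-table form of the switch
  return (V != 0) ? 7 : 9;         // folds to: (X < 4) ? 7 : 9
}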
@@ -4629,7 +4823,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// GEP needs a runtime relocation in PIC code. We should just build one big
// string and lookup indices into that.
- // Ignore switches with less than three cases. Lookup tables will not make them
+  // Ignore switches with less than three cases. Lookup tables will not make them
// faster, so we don't analyze them.
if (SI->getNumCases() < 3)
return false;
@@ -4642,11 +4837,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
ConstantInt *MaxCaseVal = CI.getCaseValue();
BasicBlock *CommonDest = nullptr;
- typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy;
- SmallDenseMap<PHINode*, ResultListTy> ResultLists;
- SmallDenseMap<PHINode*, Constant*> DefaultResults;
- SmallDenseMap<PHINode*, Type*> ResultTypes;
- SmallVector<PHINode*, 4> PHIs;
+ typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy;
+ SmallDenseMap<PHINode *, ResultListTy> ResultLists;
+ SmallDenseMap<PHINode *, Constant *> DefaultResults;
+ SmallDenseMap<PHINode *, Type *> ResultTypes;
+ SmallVector<PHINode *, 4> PHIs;
for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
ConstantInt *CaseVal = CI.getCaseValue();
@@ -4656,7 +4851,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
MaxCaseVal = CaseVal;
// Resulting value at phi nodes for this case value.
- typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
+ typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
ResultsTy Results;
if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
Results, DL))
@@ -4684,14 +4879,14 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If the table has holes, we need a constant result for the default case
// or a bitmask that fits in a register.
- SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
+ SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
&CommonDest, DefaultResultsList, DL);
bool NeedMask = (TableHasHoles && !HasDefaultResults);
if (NeedMask) {
// As an extra penalty for the validity test we require more cases.
- if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
+ if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
return false;
if (!DL.fitsInLegalInteger(TableSize))
return false;
@@ -4708,15 +4903,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// Create the BB that does the lookups.
Module &Mod = *CommonDest->getParent()->getParent();
- BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(),
- "switch.lookup",
- CommonDest->getParent(),
- CommonDest);
+ BasicBlock *LookupBB = BasicBlock::Create(
+ Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
// Compute the table index value.
Builder.SetInsertPoint(SI);
- Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
- "switch.tableidx");
+ Value *TableIndex =
+ Builder.CreateSub(SI->getCondition(), MinCaseVal, "switch.tableidx");
// Compute the maximum table size representable by the integer type we are
// switching upon.
@@ -4739,9 +4932,10 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
  // Note: We call removePredecessor later since we need to be able to get the
// PHI value for the default case in case we're using a bit mask.
} else {
- Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
- MinCaseVal->getType(), TableSize));
- RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ Value *Cmp = Builder.CreateICmpULT(
+ TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
+ RangeCheckBranch =
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
}
// Populate the BB that does the lookups.
@@ -4753,10 +4947,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// and we create a new LookupBB.
BasicBlock *MaskBB = LookupBB;
MaskBB->setName("switch.hole_check");
- LookupBB = BasicBlock::Create(Mod.getContext(),
- "switch.lookup",
- CommonDest->getParent(),
- CommonDest);
+ LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
+ CommonDest->getParent(), CommonDest);
// Make the mask's bitwidth at least 8bit and a power-of-2 to avoid
// unnecessary illegal types.
@@ -4766,8 +4958,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// Build bitmask; fill in a 1 bit for every case.
const ResultListTy &ResultList = ResultLists[PHIs[0]];
for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
- uint64_t Idx = (ResultList[I].first->getValue() -
- MinCaseVal->getValue()).getLimitedValue();
+ uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
+ .getLimitedValue();
MaskInt |= One << Idx;
}
ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
@@ -4776,13 +4968,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If this bit is 0 (meaning hole) jump to the default destination,
// else continue with table lookup.
IntegerType *MapTy = TableMask->getType();
- Value *MaskIndex = Builder.CreateZExtOrTrunc(TableIndex, MapTy,
- "switch.maskindex");
- Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex,
- "switch.shifted");
- Value *LoBit = Builder.CreateTrunc(Shifted,
- Type::getInt1Ty(Mod.getContext()),
- "switch.lobit");
+ Value *MaskIndex =
+ Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
+ Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
+ Value *LoBit = Builder.CreateTrunc(
+ Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
Builder.SetInsertPoint(LookupBB);
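// ---- Editor's illustrative sketch (not part of this patch) ----
// The hole check above builds a bitmask with one bit per covered table slot
// and branches to the default destination when the indexed bit is zero. In
// plain C++ terms, for a hypothetical 4-entry table that only covers
// indices {0, 1, 3}:
#include <cstdint>

static bool indexHitsTable(unsigned Index) {
  const uint64_t TableMask = 0b1011; // bit I set <=> case (MinCaseVal + I) exists
  // LoBit: 0 means "hole", so control goes to the default destination.
  return ((TableMask >> Index) & 1) != 0;
}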
@@ -4905,7 +5095,8 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
Dest->removePredecessor(BB);
IBI->removeDestination(i);
- --i; --e;
+ --i;
+ --e;
Changed = true;
}
}
@@ -4968,27 +5159,28 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
if (!LPad2 || !LPad2->isIdenticalTo(LPad))
continue;
- for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {
+ }
BranchInst *BI2 = dyn_cast<BranchInst>(I);
if (!BI2 || !BI2->isIdenticalTo(BI))
continue;
- // We've found an identical block. Update our predeccessors to take that
+ // We've found an identical block. Update our predecessors to take that
// path instead and make ourselves dead.
SmallSet<BasicBlock *, 16> Preds;
Preds.insert(pred_begin(BB), pred_end(BB));
for (BasicBlock *Pred : Preds) {
InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
- assert(II->getNormalDest() != BB &&
- II->getUnwindDest() == BB && "unexpected successor");
+ assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
+ "unexpected successor");
II->setUnwindDest(OtherPred);
}
// The debug info in OtherPred doesn't cover the merged control flow that
// used to go through BB. We need to delete it or update it.
- for (auto I = OtherPred->begin(), E = OtherPred->end();
- I != E;) {
- Instruction &Inst = *I; I++;
+ for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
+ Instruction &Inst = *I;
+ I++;
if (isa<DbgInfoIntrinsic>(Inst))
Inst.eraseFromParent();
}
@@ -5007,15 +5199,22 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
return false;
}
-bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
+bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
+ IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
if (SinkCommon && SinkThenElseCodeToEnd(BI))
return true;
// If the Terminator is the only non-phi instruction, simplify the block.
+  // If LoopHeaders is provided, check whether the block is a loop header
+  // (this matters for early invocations, before loop simplify and
+  // vectorization, where keeping such blocks preserves canonical loop forms
+  // for nested loops; they can be eliminated when the pass is invoked later
+  // in the back-end).
BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
+ (!LoopHeaders || !LoopHeaders->count(BB)) &&
TryToSimplifyUncondBranchFromEmptyBlock(BB))
return true;
@@ -5034,9 +5233,9 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
// See if we can merge an empty landing pad block with another which is
// equivalent.
if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
- for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
- if (I->isTerminator() &&
- TryToMergeLandingPad(LPad, BI, BB))
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {
+ }
+ if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB))
return true;
}
@@ -5081,7 +5280,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (&*I == BI) {
if (FoldValueComparisonIntoPredecessors(BI, Builder))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- } else if (&*I == cast<Instruction>(BI->getCondition())){
+ } else if (&*I == cast<Instruction>(BI->getCondition())) {
++I;
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(I))
@@ -5095,6 +5294,30 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (SimplifyBranchOnICmpChain(BI, Builder, DL))
return true;
+ // If this basic block has a single dominating predecessor block and the
+ // dominating block's condition implies BI's condition, we know the direction
+ // of the BI branch.
+ if (BasicBlock *Dom = BB->getSinglePredecessor()) {
+ auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
+ if (PBI && PBI->isConditional() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
+ (PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB)) {
+ bool CondIsFalse = PBI->getSuccessor(1) == BB;
+ Optional<bool> Implication = isImpliedCondition(
+ PBI->getCondition(), BI->getCondition(), DL, CondIsFalse);
+ if (Implication) {
+ // Turn this into a branch on constant.
+ auto *OldCond = BI->getCondition();
+ ConstantInt *CI = *Implication
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ BI->setCondition(CI);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+ }
+ }
+
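// ---- Editor's illustrative sketch (not part of this patch) ----
// The new dominance-based fold above is the IR-level analogue of this source
// pattern, where the dominating test decides the inner one:
static int clamped(int X) {
  if (X > 10) {   // dominating conditional branch
    if (X > 5)    // implied true; becomes a branch on the constant 'true'
      return 1;
    return 2;     // the false arm turns into dead code
  }
  return 0;
}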
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
@@ -5149,7 +5372,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (PBI != BI && PBI->isConditional())
if (mergeConditionalStores(PBI, BI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
-
+
return false;
}
@@ -5162,7 +5385,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
if (I->use_empty())
return false;
- if (C->isNullValue()) {
+ if (C->isNullValue() || isa<UndefValue>(C)) {
// Only look at the first use, avoid hurting compile time with long uselists
User *Use = *I->user_begin();
@@ -5189,7 +5412,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
// Store to null is undefined.
if (StoreInst *SI = dyn_cast<StoreInst>(Use))
if (!SI->isVolatile())
- return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I;
+ return SI->getPointerAddressSpace() == 0 &&
+ SI->getPointerOperand() == I;
+
+ // A call to null is undefined.
+ if (auto CS = CallSite(Use))
+ return CS.getCalledValue() == I;
}
return false;
}
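// ---- Editor's illustrative sketch (not part of this patch) ----
// The new CallSite check above treats a null (or undef) value reaching a
// call's callee operand as undefined behaviour, so the predecessor edge that
// introduces it can be deleted. A hypothetical source-level shape:
static void dispatch(void (*Fn)()) {
  void (*Target)() = Fn ? Fn : nullptr; // phi/select feeding the call
  Target();  // reaching here with Target == nullptr is UB, so the null arm
             // can be assumed never taken
}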
@@ -5210,8 +5438,8 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
if (BI->isUnconditional())
Builder.CreateUnreachable();
else
- Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) :
- BI->getSuccessor(0));
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+ : BI->getSuccessor(0));
BI->eraseFromParent();
return true;
}
@@ -5229,8 +5457,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
// Remove basic blocks that have no predecessors (except the entry block)...
// or that just have themself as a predecessor. These are unreachable.
- if ((pred_empty(BB) &&
- BB != &BB->getParent()->getEntryBlock()) ||
+ if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
BB->getSinglePredecessor() == BB) {
DEBUG(dbgs() << "Removing BB: \n" << *BB);
DeleteDeadBlock(BB);
@@ -5265,25 +5492,33 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
Builder.SetInsertPoint(BB->getTerminator());
if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
if (BI->isUnconditional()) {
- if (SimplifyUncondBranch(BI, Builder)) return true;
+ if (SimplifyUncondBranch(BI, Builder))
+ return true;
} else {
- if (SimplifyCondBranch(BI, Builder)) return true;
+ if (SimplifyCondBranch(BI, Builder))
+ return true;
}
} else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
- if (SimplifyReturn(RI, Builder)) return true;
+ if (SimplifyReturn(RI, Builder))
+ return true;
} else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
- if (SimplifyResume(RI, Builder)) return true;
+ if (SimplifyResume(RI, Builder))
+ return true;
} else if (CleanupReturnInst *RI =
- dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
- if (SimplifyCleanupReturn(RI)) return true;
+ dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
+ if (SimplifyCleanupReturn(RI))
+ return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- if (SimplifySwitch(SI, Builder)) return true;
+ if (SimplifySwitch(SI, Builder))
+ return true;
} else if (UnreachableInst *UI =
- dyn_cast<UnreachableInst>(BB->getTerminator())) {
- if (SimplifyUnreachable(UI)) return true;
+ dyn_cast<UnreachableInst>(BB->getTerminator())) {
+ if (SimplifyUnreachable(UI))
+ return true;
} else if (IndirectBrInst *IBI =
- dyn_cast<IndirectBrInst>(BB->getTerminator())) {
- if (SimplifyIndirectBr(IBI)) return true;
+ dyn_cast<IndirectBrInst>(BB->getTerminator())) {
+ if (SimplifyIndirectBr(IBI))
+ return true;
}
return Changed;
@@ -5295,7 +5530,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
/// of the CFG. It returns true if a modification was made.
///
bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
- unsigned BonusInstThreshold, AssumptionCache *AC) {
+ unsigned BonusInstThreshold, AssumptionCache *AC,
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders) {
return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
- BonusInstThreshold, AC).run(BB);
+ BonusInstThreshold, AC, LoopHeaders)
+ .run(BB);
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index ddd8775a8431..6b1d3dc41330 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -25,7 +25,6 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -71,14 +70,12 @@ namespace {
bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
+ bool eliminateOverflowIntrinsic(CallInst *CI);
bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
bool IsSigned);
bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
-
- Instruction *splitOverflowIntrinsic(Instruction *IVUser,
- const DominatorTree *DT);
};
}
@@ -183,9 +180,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
DeadInsts.emplace_back(ICmp);
DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
} else if (isa<PHINode>(IVOperand) &&
- SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop,
- InvariantPredicate, InvariantLHS,
- InvariantRHS)) {
+ SE->isLoopInvariantPredicate(Pred, S, X, L, InvariantPredicate,
+ InvariantLHS, InvariantRHS)) {
// Rewrite the comparison to a loop invariant comparison if it can be done
// cheaply, where cheaply means "we don't need to emit any new
@@ -201,9 +197,48 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
NewRHS =
ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx));
- for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) {
- if (NewLHS && NewRHS)
- break;
+ auto *PN = cast<PHINode>(IVOperand);
+ for (unsigned i = 0, e = PN->getNumIncomingValues();
+ i != e && (!NewLHS || !NewRHS);
+ ++i) {
+
+ // If this is a value incoming from the backedge, then it cannot be a loop
+ // invariant value (since we know that IVOperand is an induction variable).
+ if (L->contains(PN->getIncomingBlock(i)))
+ continue;
+
+      // NB! The following assert does not fundamentally have to be true, but
+ // it is true today given how SCEV analyzes induction variables.
+ // Specifically, today SCEV will *not* recognize %iv as an induction
+ // variable in the following case:
+ //
+ // define void @f(i32 %k) {
+ // entry:
+ // br i1 undef, label %r, label %l
+ //
+ // l:
+ // %k.inc.l = add i32 %k, 1
+ // br label %loop
+ //
+ // r:
+ // %k.inc.r = add i32 %k, 1
+ // br label %loop
+ //
+ // loop:
+ // %iv = phi i32 [ %k.inc.l, %l ], [ %k.inc.r, %r ], [ %iv.inc, %loop ]
+ // %iv.inc = add i32 %iv, 1
+ // br label %loop
+ // }
+ //
+ // but if it starts to, at some point, then the assertion below will have
+ // to be changed to a runtime check.
+
+ Value *Incoming = PN->getIncomingValue(i);
+
+#ifndef NDEBUG
+ if (auto *I = dyn_cast<Instruction>(Incoming))
+ assert(DT->dominates(I, ICmp) && "Should be a unique loop dominating value!");
+#endif
const SCEV *IncomingS = SE->getSCEV(Incoming);
@@ -280,6 +315,108 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
DeadInsts.emplace_back(Rem);
}
+bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
+ auto *F = CI->getCalledFunction();
+ if (!F)
+ return false;
+
+ typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)(
+ const SCEV *, const SCEV *, SCEV::NoWrapFlags);
+ typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)(
+ const SCEV *, Type *);
+
+ OperationFunctionTy Operation;
+ ExtensionFunctionTy Extension;
+
+ Instruction::BinaryOps RawOp;
+
+ // We always have exactly one of nsw or nuw. If NoSignedOverflow is false, we
+ // have nuw.
+ bool NoSignedOverflow;
+
+ switch (F->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::sadd_with_overflow:
+ Operation = &ScalarEvolution::getAddExpr;
+ Extension = &ScalarEvolution::getSignExtendExpr;
+ RawOp = Instruction::Add;
+ NoSignedOverflow = true;
+ break;
+
+ case Intrinsic::uadd_with_overflow:
+ Operation = &ScalarEvolution::getAddExpr;
+ Extension = &ScalarEvolution::getZeroExtendExpr;
+ RawOp = Instruction::Add;
+ NoSignedOverflow = false;
+ break;
+
+ case Intrinsic::ssub_with_overflow:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ Extension = &ScalarEvolution::getSignExtendExpr;
+ RawOp = Instruction::Sub;
+ NoSignedOverflow = true;
+ break;
+
+ case Intrinsic::usub_with_overflow:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ Extension = &ScalarEvolution::getZeroExtendExpr;
+ RawOp = Instruction::Sub;
+ NoSignedOverflow = false;
+ break;
+ }
+
+ const SCEV *LHS = SE->getSCEV(CI->getArgOperand(0));
+ const SCEV *RHS = SE->getSCEV(CI->getArgOperand(1));
+
+ auto *NarrowTy = cast<IntegerType>(LHS->getType());
+ auto *WideTy =
+ IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
+
+ const SCEV *A =
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy);
+ const SCEV *B =
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy),
+ (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap);
+
+ if (A != B)
+ return false;
+
+ // Proved no overflow, nuke the overflow check and, if possible, the overflow
+ // intrinsic as well.
+
+ BinaryOperator *NewResult = BinaryOperator::Create(
+ RawOp, CI->getArgOperand(0), CI->getArgOperand(1), "", CI);
+
+ if (NoSignedOverflow)
+ NewResult->setHasNoSignedWrap(true);
+ else
+ NewResult->setHasNoUnsignedWrap(true);
+
+ SmallVector<ExtractValueInst *, 4> ToDelete;
+
+ for (auto *U : CI->users()) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
+ if (EVI->getIndices()[0] == 1)
+ EVI->replaceAllUsesWith(ConstantInt::getFalse(CI->getContext()));
+ else {
+ assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
+ EVI->replaceAllUsesWith(NewResult);
+ }
+ ToDelete.push_back(EVI);
+ }
+ }
+
+ for (auto *EVI : ToDelete)
+ EVI->eraseFromParent();
+
+ if (CI->use_empty())
+ CI->eraseFromParent();
+
+ return true;
+}
+
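// ---- Editor's illustrative sketch (not part of this patch) ----
// The SCEV comparison above (extend-then-add vs. add-then-extend) is the
// widening argument in miniature: an i32 signed add cannot overflow exactly
// when the i64 sum of the sign-extended operands stays within i32 range.
// A hypothetical scalar version of that test:
#include <cstdint>

static bool signedAddOverflowsI32(int32_t A, int32_t B) {
  int64_t Wide = static_cast<int64_t>(A) + static_cast<int64_t>(B);
  return Wide < INT32_MIN || Wide > INT32_MAX; // out of range => would wrap
}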
/// Eliminate an operation that consumes a simple IV and has no observable
/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
/// but UseInst may not be.
@@ -297,6 +434,10 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
}
}
+ if (auto *CI = dyn_cast<CallInst>(UseInst))
+ if (eliminateOverflowIntrinsic(CI))
+ return true;
+
if (eliminateIdentitySCEV(UseInst, IVOperand))
return true;
@@ -408,69 +549,6 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
return Changed;
}
-/// \brief Split sadd.with.overflow into add + sadd.with.overflow to allow
-/// analysis and optimization.
-///
-/// \return A new value representing the non-overflowing add if possible,
-/// otherwise return the original value.
-Instruction *SimplifyIndvar::splitOverflowIntrinsic(Instruction *IVUser,
- const DominatorTree *DT) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(IVUser);
- if (!II || II->getIntrinsicID() != Intrinsic::sadd_with_overflow)
- return IVUser;
-
- // Find a branch guarded by the overflow check.
- BranchInst *Branch = nullptr;
- Instruction *AddVal = nullptr;
- for (User *U : II->users()) {
- if (ExtractValueInst *ExtractInst = dyn_cast<ExtractValueInst>(U)) {
- if (ExtractInst->getNumIndices() != 1)
- continue;
- if (ExtractInst->getIndices()[0] == 0)
- AddVal = ExtractInst;
- else if (ExtractInst->getIndices()[0] == 1 && ExtractInst->hasOneUse())
- Branch = dyn_cast<BranchInst>(ExtractInst->user_back());
- }
- }
- if (!AddVal || !Branch)
- return IVUser;
-
- BasicBlock *ContinueBB = Branch->getSuccessor(1);
- if (std::next(pred_begin(ContinueBB)) != pred_end(ContinueBB))
- return IVUser;
-
- // Check if all users of the add are provably NSW.
- bool AllNSW = true;
- for (Use &U : AddVal->uses()) {
- if (Instruction *UseInst = dyn_cast<Instruction>(U.getUser())) {
- BasicBlock *UseBB = UseInst->getParent();
- if (PHINode *PHI = dyn_cast<PHINode>(UseInst))
- UseBB = PHI->getIncomingBlock(U);
- if (!DT->dominates(ContinueBB, UseBB)) {
- AllNSW = false;
- break;
- }
- }
- }
- if (!AllNSW)
- return IVUser;
-
- // Go for it...
- IRBuilder<> Builder(IVUser);
- Instruction *AddInst = dyn_cast<Instruction>(
- Builder.CreateNSWAdd(II->getOperand(0), II->getOperand(1)));
-
- // The caller expects the new add to have the same form as the intrinsic. The
- // IV operand position must be the same.
- assert((AddInst->getOpcode() == Instruction::Add &&
- AddInst->getOperand(0) == II->getOperand(0)) &&
- "Bad add instruction created from overflow intrinsic.");
-
- AddVal->replaceAllUsesWith(AddInst);
- DeadInsts.emplace_back(AddVal);
- return AddInst;
-}
-
/// Add all uses of Def to the current IV's worklist.
static void pushIVUsers(
Instruction *Def,
@@ -545,12 +623,6 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
// Bypass back edges to avoid extra work.
if (UseInst == CurrIV) continue;
- if (V && V->shouldSplitOverflowInstrinsics()) {
- UseInst = splitOverflowIntrinsic(UseInst, V->getDomTree());
- if (!UseInst)
- continue;
- }
-
Instruction *IVOperand = UseOper.second;
for (unsigned N = 0; IVOperand; ++N) {
assert(N <= Simplified.size() && "runaway iteration");
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index d5377f9a4c1f..df299067094f 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -14,7 +14,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SimplifyInstructions.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -27,12 +27,60 @@
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "instsimplify"
STATISTIC(NumSimplified, "Number of redundant instructions removed");
+static bool runImpl(Function &F, const DominatorTree *DT, const TargetLibraryInfo *TLI,
+ AssumptionCache *AC) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ bool Changed = false;
+
+ do {
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ // Here be subtlety: the iterator must be incremented before the loop
+ // body (not sure why), so a range-for loop won't work here.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+ // The first time through the loop ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(I))
+ continue;
+ // Don't waste time simplifying unused instructions.
+ if (!I->use_empty())
+ if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I->users())
+ Next->insert(cast<Instruction>(U));
+ I->replaceAllUsesWith(V);
+ ++NumSimplified;
+ Changed = true;
+ }
+ bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ if (res) {
+ // RecursivelyDeleteTriviallyDeadInstruction can remove
+ // more than one instruction, so simply incrementing the
+ // iterator does not work. When instructions get deleted
+ // re-iterate instead.
+ BI = BB->begin(); BE = BB->end();
+ Changed |= res;
+ }
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+ } while (!ToSimplify->empty());
+
+ return Changed;
+}
+
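// ---- Editor's illustrative sketch (not part of this patch) ----
// runImpl above is a two-set worklist fixpoint: simplify everything once,
// remember the users of anything that changed, and only revisit those on the
// next round. The same pattern, stripped of the LLVM specifics (hypothetical
// generic helper):
#include <set>
#include <utility>

template <typename T, typename StepFn>
void runToFixpoint(std::set<T> Work, StepFn Step) {
  std::set<T> Next;
  while (!Work.empty()) {
    for (const T &Item : Work)
      Step(Item, Next);     // Step inserts anything that needs a re-visit
    std::swap(Work, Next);  // swap the roles of the two sets
    Next.clear();
  }
}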
namespace {
struct InstSimplifier : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
@@ -48,56 +96,17 @@ namespace {
/// runOnFunction - Remove instructions that simplify.
bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
const DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
AssumptionCache *AC =
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- bool Changed = false;
-
- do {
- for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
- // Here be subtlety: the iterator must be incremented before the loop
- // body (not sure why), so a range-for loop won't work here.
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *I = &*BI++;
- // The first time through the loop ToSimplify is empty and we try to
- // simplify all instructions. On later iterations ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(I))
- continue;
- // Don't waste time simplifying unused instructions.
- if (!I->use_empty())
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I->users())
- Next->insert(cast<Instruction>(U));
- I->replaceAllUsesWith(V);
- ++NumSimplified;
- Changed = true;
- }
- bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
- if (res) {
- // RecursivelyDeleteTriviallyDeadInstruction can remove
- // more than one instruction, so simply incrementing the
- // iterator does not work. When instructions get deleted
- // re-iterate instead.
- BI = BB->begin(); BE = BB->end();
- Changed |= res;
- }
- }
-
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
- } while (!ToSimplify->empty());
-
- return Changed;
+ return runImpl(F, DT, TLI, AC);
}
};
}
@@ -115,3 +124,15 @@ char &llvm::InstructionSimplifierID = InstSimplifier::ID;
FunctionPass *llvm::createInstructionSimplifierPass() {
return new InstSimplifier();
}
+
+PreservedAnalyses InstSimplifierPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ bool Changed = runImpl(F, DT, &TLI, &AC);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ // FIXME: This should also 'preserve the CFG'.
+ return PreservedAnalyses::none();
+}
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 2f3c31128cf0..c2986951e48f 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -29,7 +29,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -104,101 +103,11 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
}
}
-/// \brief Check whether we can use unsafe floating point math for
-/// the function passed as input.
-static bool canUseUnsafeFPMath(Function *F) {
-
- // FIXME: For finer-grain optimization, we need intrinsics to have the same
- // fast-math flag decorations that are applied to FP instructions. For now,
- // we have to rely on the function-level unsafe-fp-math attribute to do this
- // optimization because there's no other way to express that the call can be
- // relaxed.
- if (F->hasFnAttribute("unsafe-fp-math")) {
- Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- if (Attr.getValueAsString() == "true")
- return true;
- }
- return false;
-}
-
-/// \brief Returns whether \p F matches the signature expected for the
-/// string/memory copying library function \p Func.
-/// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset.
-/// Their fortified (_chk) counterparts are also accepted.
-static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func) {
- const DataLayout &DL = F->getParent()->getDataLayout();
- FunctionType *FT = F->getFunctionType();
- LLVMContext &Context = F->getContext();
- Type *PCharTy = Type::getInt8PtrTy(Context);
- Type *SizeTTy = DL.getIntPtrType(Context);
- unsigned NumParams = FT->getNumParams();
-
- // All string libfuncs return the same type as the first parameter.
- if (FT->getReturnType() != FT->getParamType(0))
- return false;
-
- switch (Func) {
- default:
- llvm_unreachable("Can't check signature for non-string-copy libfunc.");
- case LibFunc::stpncpy_chk:
- case LibFunc::strncpy_chk:
- --NumParams; // fallthrough
- case LibFunc::stpncpy:
- case LibFunc::strncpy: {
- if (NumParams != 3 || FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != PCharTy || !FT->getParamType(2)->isIntegerTy())
- return false;
- break;
- }
- case LibFunc::strcpy_chk:
- case LibFunc::stpcpy_chk:
- --NumParams; // fallthrough
- case LibFunc::stpcpy:
- case LibFunc::strcpy: {
- if (NumParams != 2 || FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != PCharTy)
- return false;
- break;
- }
- case LibFunc::memmove_chk:
- case LibFunc::memcpy_chk:
- --NumParams; // fallthrough
- case LibFunc::memmove:
- case LibFunc::memcpy: {
- if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != SizeTTy)
- return false;
- break;
- }
- case LibFunc::memset_chk:
- --NumParams; // fallthrough
- case LibFunc::memset: {
- if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != SizeTTy)
- return false;
- break;
- }
- }
- // If this is a fortified libcall, the last parameter is a size_t.
- if (NumParams == FT->getNumParams() - 1)
- return FT->getParamType(FT->getNumParams() - 1) == SizeTTy;
- return true;
-}
-
//===----------------------------------------------------------------------===//
// String and Memory Library Call Optimizations
//===----------------------------------------------------------------------===//
Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Verify the "strcat" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2||
- FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- FT->getParamType(1) != FT->getReturnType())
- return nullptr;
-
// Extract some information from the instruction
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
@@ -220,7 +129,7 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
IRBuilder<> &B) {
// We need to find the end of the destination string. That's where the
// memory is to be moved to. We just generate a call to strlen.
- Value *DstLen = EmitStrLen(Dst, B, DL, TLI);
+ Value *DstLen = emitStrLen(Dst, B, DL, TLI);
if (!DstLen)
return nullptr;
@@ -238,15 +147,6 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
}
Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Verify the "strncat" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- FT->getParamType(1) != FT->getReturnType() ||
- !FT->getParamType(2)->isIntegerTy())
- return nullptr;
-
// Extract some information from the instruction.
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
@@ -281,13 +181,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // Verify the "strchr" function prototype.
FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- !FT->getParamType(1)->isIntegerTy(32))
- return nullptr;
-
Value *SrcStr = CI->getArgOperand(0);
// If the second operand is non-constant, see if we can compute the length
@@ -298,7 +192,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
return nullptr;
- return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
B, DL, TLI);
}
@@ -308,7 +202,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
StringRef Str;
if (!getConstantStringInfo(SrcStr, Str)) {
if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI),
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI),
"strchr");
return nullptr;
}
@@ -326,14 +220,6 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
}
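
The two strchr rewrites above lean on plain C library identities. A small standalone check of them (libc calls only, nothing LLVM-specific; the fold itself additionally requires an i32 character argument):

#include <cassert>
#include <cstring>

int main() {
  const char *S = "hello";

  // strchr(p, 0) -> p + strlen(p): searching for the terminator lands
  // exactly on the nul byte.
  assert(std::strchr(S, '\0') == S + std::strlen(S));

  // strchr(s, c) -> memchr(s, c, len): with the string length known,
  // searching strlen(s) + 1 bytes (i.e. including the nul) gives the same
  // answer, found or not.
  for (int C : {'h', 'l', 'x', '\0'})
    assert(std::strchr(S, C) == std::memchr(S, C, std::strlen(S) + 1));
  return 0;
}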
Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Verify the "strrchr" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- !FT->getParamType(1)->isIntegerTy(32))
- return nullptr;
-
Value *SrcStr = CI->getArgOperand(0);
ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
@@ -345,7 +231,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
if (!getConstantStringInfo(SrcStr, Str)) {
// strrchr(s, 0) -> strchr(s, 0)
if (CharC->isZero())
- return EmitStrChr(SrcStr, '\0', B, TLI);
+ return emitStrChr(SrcStr, '\0', B, TLI);
return nullptr;
}
@@ -361,14 +247,6 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Verify the "strcmp" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy())
- return nullptr;
-
Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
if (Str1P == Str2P) // strcmp(x,x) -> 0
return ConstantInt::get(CI->getType(), 0);
@@ -392,7 +270,7 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
uint64_t Len1 = GetStringLength(Str1P);
uint64_t Len2 = GetStringLength(Str2P);
if (Len1 && Len2) {
- return EmitMemCmp(Str1P, Str2P,
+ return emitMemCmp(Str1P, Str2P,
ConstantInt::get(DL.getIntPtrType(CI->getContext()),
std::min(Len1, Len2)),
B, DL, TLI);
@@ -402,15 +280,6 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
}
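
When both string lengths are known (the lengths returned by GetStringLength include the terminating nul), strcmp can be answered by memcmp over the shorter length plus the terminator. A standalone sketch of that identity, using plain libc calls rather than the IR emission above:

#include <algorithm>
#include <cassert>
#include <cstring>

// Reduce a three-way result to its sign: -1, 0 or +1.
static int sign(int V) { return (V > 0) - (V < 0); }

int main() {
  const char *Tests[][2] = {
      {"abc", "abc"}, {"ab", "abc"}, {"abd", "abc"}, {"", "x"}, {"x", ""}};
  for (auto &T : Tests) {
    const char *A = T[0], *B = T[1];
    // min(Len1, Len2) in the fold is min(strlen(a), strlen(b)) + 1, i.e. the
    // shorter string including its nul; comparing that many bytes is enough
    // to reproduce strcmp's sign.
    std::size_t N = std::min(std::strlen(A), std::strlen(B)) + 1;
    assert(sign(std::strcmp(A, B)) == sign(std::memcmp(A, B, N)));
  }
  return 0;
}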
Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Verify the "strncmp" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- !FT->getParamType(2)->isIntegerTy())
- return nullptr;
-
Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
if (Str1P == Str2P) // strncmp(x,x,n) -> 0
return ConstantInt::get(CI->getType(), 0);
@@ -426,7 +295,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
return ConstantInt::get(CI->getType(), 0);
if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
+ return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
StringRef Str1, Str2;
bool HasStr1 = getConstantStringInfo(Str1P, Str1);
@@ -450,11 +319,6 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy))
- return nullptr;
-
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) // strcpy(x,x) -> x
return Src;
@@ -473,12 +337,9 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy))
- return nullptr;
-
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
- Value *StrLen = EmitStrLen(Src, B, DL, TLI);
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
}
@@ -500,9 +361,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy))
- return nullptr;
-
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
Value *LenOp = CI->getArgOperand(2);
@@ -540,18 +398,63 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
Value *Src = CI->getArgOperand(0);
// Constant folding: strlen("xyz") -> 3
if (uint64_t Len = GetStringLength(Src))
return ConstantInt::get(CI->getType(), Len - 1);
+ // If s is a constant pointer pointing to a string literal, we can fold
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
+ // We only try to simplify strlen when the pointer s points to an array
+ // of i8. Otherwise, we would need to scale the offset x before doing the
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
+ // very uncommon.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
+ if (!isGEPBasedOnPointerToString(GEP))
+ return nullptr;
+
+ StringRef Str;
+ if (getConstantStringInfo(GEP->getOperand(0), Str, 0, false)) {
+ size_t NullTermIdx = Str.find('\0');
+
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == StringRef::npos)
+ return nullptr;
+
+ Value *Offset = GEP->getOperand(2);
+ unsigned BitWidth = Offset->getType()->getIntegerBitWidth();
+ APInt KnownZero(BitWidth, 0);
+ APInt KnownOne(BitWidth, 0);
+ computeKnownBits(Offset, KnownZero, KnownOne, DL, 0, nullptr, CI,
+ nullptr);
+ KnownZero.flipAllBits();
+ size_t ArrSize =
+ cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
+
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+ // bits known to be zeros in Offset, and ones in KnownZero represent
+ // bits unknown in Offset. Therefore, Offset is known to be in range
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // unsigned-less-than NullTermIdx.
+ //
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // is a pointer to an object whose memory extent is NullTermIdx+1.
+ if ((KnownZero.isNonNegative() && KnownZero.ule(NullTermIdx)) ||
+ (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
+ NullTermIdx == ArrSize - 1))
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ Offset);
+ }
+
+ return nullptr;
+ }
+
// strlen(x?"foo":"bars") --> x ? 3 : 4
if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
uint64_t LenTrue = GetStringLength(SI->getTrueValue());
@@ -576,13 +479,6 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
}
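
To make the new strlen(s + x) fold concrete, here is a standalone check (plain libc, no LLVM) of the identity it computes when the offset is provably in [0, NullTermIdx]; the bytes after the first nul mirror the "first null terminator" wording in the comments above:

#include <cassert>
#include <cstring>

int main() {
  // "abc\0def": the first nul is at index 3, so NullTermIdx == 3.
  const char S[] = "abc\0def";
  const std::size_t NullTermIdx = 3;

  // For any offset X in [0, NullTermIdx], strlen(S + X) == NullTermIdx - X,
  // which is exactly the subtraction the fold emits.
  for (std::size_t X = 0; X <= NullTermIdx; ++X)
    assert(std::strlen(S + X) == NullTermIdx - X);
  return 0;
}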
Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- FT->getReturnType() != FT->getParamType(0))
- return nullptr;
-
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -604,19 +500,12 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
// strpbrk(s, "a") -> strchr(s, 'a')
if (HasS2 && S2.size() == 1)
- return EmitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
+ return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
return nullptr;
}
Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy())
- return nullptr;
-
Value *EndPtr = CI->getArgOperand(1);
if (isa<ConstantPointerNull>(EndPtr)) {
// With a null EndPtr, this function won't capture the main argument.
@@ -628,13 +517,6 @@ Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -656,13 +538,6 @@ Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -681,29 +556,22 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
// strcspn(s, "") -> strlen(s)
if (HasS2 && S2.empty())
- return EmitStrLen(CI->getArgOperand(0), B, DL, TLI);
+ return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
return nullptr;
}
Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isPointerTy())
- return nullptr;
-
// fold strstr(x, x) -> x.
if (CI->getArgOperand(0) == CI->getArgOperand(1))
return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
// fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI);
+ Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
if (!StrLen)
return nullptr;
- Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
StrLen, B, DL, TLI);
if (!StrNCmp)
return nullptr;
@@ -734,28 +602,20 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
return Constant::getNullValue(CI->getType());
// strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = CastToCStr(CI->getArgOperand(0), B);
+ Value *Result = castToCStr(CI->getArgOperand(0), B);
Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
return B.CreateBitCast(Result, CI->getType());
}
// fold strstr(x, "y") -> strchr(x, 'y').
if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
+ Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
}
return nullptr;
}
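
The strstr folds above rewrite prefix checks and single-character searches; both rewrites hold at the C level, as this standalone sketch (not part of the patch) shows:

#include <cassert>
#include <cstring>

int main() {
  const char *Hay = "abcd";

  // strstr(a, b) == a  <=>  strncmp(a, b, strlen(b)) == 0 (b is a prefix).
  for (const char *Needle : {"ab", "abcd", "bc", "xyz", ""})
    assert((std::strstr(Hay, Needle) == Hay) ==
           (std::strncmp(Hay, Needle, std::strlen(Needle)) == 0));

  // strstr(x, "y") -> strchr(x, 'y') for a one-character needle.
  assert(std::strstr(Hay, "c") == std::strchr(Hay, 'c'));
  assert(std::strstr(Hay, "z") == std::strchr(Hay, 'z'));
  return 0;
}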
Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy(32) ||
- !FT->getParamType(2)->isIntegerTy() ||
- !FT->getReturnType()->isPointerTy())
- return nullptr;
-
Value *SrcStr = CI->getArgOperand(0);
ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
@@ -834,13 +694,6 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy(32))
- return nullptr;
-
Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
if (LHS == RHS) // memcmp(s,s,x) -> 0
@@ -857,9 +710,9 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
// memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
if (Len == 1) {
- Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+ Value *LHSV = B.CreateZExt(B.CreateLoad(castToCStr(LHS, B), "lhsc"),
CI->getType(), "lhsv");
- Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+ Value *RHSV = B.CreateZExt(B.CreateLoad(castToCStr(RHS, B), "rhsc"),
CI->getType(), "rhsv");
return B.CreateSub(LHSV, RHSV, "chardiff");
}
@@ -909,11 +762,6 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
}
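
memcmp is only specified up to the sign of its result and compares bytes as unsigned char, so the single-byte fold above (zero-extend both bytes and subtract) preserves everything a caller may rely on. A standalone check over a few boundary byte values:

#include <cassert>
#include <cstring>

// Reduce a three-way result to its sign: -1, 0 or +1.
static int sign(int V) { return (V > 0) - (V < 0); }

int main() {
  const unsigned char Bytes[] = {0x00, 0x01, 0x7f, 0x80, 0xff};
  for (unsigned char L : Bytes)
    for (unsigned char R : Bytes) {
      int Diff = int(L) - int(R);  // zext both bytes, then subtract, as in the IR
      assert(sign(std::memcmp(&L, &R, 1)) == sign(Diff));
    }
  return 0;
}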
Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy))
- return nullptr;
-
// memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), 1);
@@ -921,23 +769,81 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove))
- return nullptr;
-
// memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), 1);
return CI->getArgOperand(0);
}
-Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
+// TODO: Does this belong in BuildLibCalls or should all of those similar
+// functions be moved here?
+static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
+ IRBuilder<> &B, const TargetLibraryInfo &TLI) {
+ LibFunc::Func Func;
+ if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+ PtrType, PtrType, nullptr);
+ CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
+
+ if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset))
+/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
+static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
+ const TargetLibraryInfo &TLI) {
+ // This has to be a memset of zeros (bzero).
+ auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
+ if (!FillValue || FillValue->getZExtValue() != 0)
return nullptr;
+ // TODO: We should handle the case where the malloc has more than one use.
+ // This is necessary to optimize common patterns such as when the result of
+ // the malloc is checked against null or when a memset intrinsic is used in
+ // place of a memset library call.
+ auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
+ if (!Malloc || !Malloc->hasOneUse())
+ return nullptr;
+
+ // Is the inner call really malloc()?
+ Function *InnerCallee = Malloc->getCalledFunction();
+ LibFunc::Func Func;
+ if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+ Func != LibFunc::malloc)
+ return nullptr;
+
+ // The memset must cover the same number of bytes that are malloc'd.
+ if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
+ return nullptr;
+
+ // Replace the malloc with a calloc. We need the data layout to know what the
+ // actual size of a 'size_t' parameter is.
+ B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
+ const DataLayout &DL = Malloc->getModule()->getDataLayout();
+ IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
+ Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
+ Malloc->getArgOperand(0), Malloc->getAttributes(),
+ B, TLI);
+ if (!Calloc)
+ return nullptr;
+
+ Malloc->replaceAllUsesWith(Calloc);
+ Malloc->eraseFromParent();
+
+ return Calloc;
+}
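
The fold above relies on memset(malloc(n), 0, n) and calloc(1, n) being observably equivalent when the allocation succeeds (plus the single-use and matching-size checks enforced in foldMallocMemset). A minimal standalone illustration with the C allocation functions:

#include <cassert>
#include <cstdlib>
#include <cstring>

int main() {
  const std::size_t N = 64;

  // memset(malloc(n), 0, n) ...
  unsigned char *A = static_cast<unsigned char *>(std::malloc(N));
  assert(A && "allocation failed");
  std::memset(A, 0, N);

  // ... is observably the same as calloc(1, n): n zero bytes.
  unsigned char *B = static_cast<unsigned char *>(std::calloc(1, N));
  assert(B && "allocation failed");

  assert(std::memcmp(A, B, N) == 0);

  std::free(A);
  std::free(B);
  return 0;
}

The practical win is that calloc can often hand back pages the OS has already zeroed, so the memory is not touched twice.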
+
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
+ if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
+ return Calloc;
+
// memset(p, v, n) -> llvm.memset(p, v, n, 1)
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
@@ -970,34 +876,12 @@ static Value *valueHasFloatPrecision(Value *Val) {
return nullptr;
}
-/// Any floating-point library function that we're trying to simplify will have
-/// a signature of the form: fptype foo(fptype param1, fptype param2, ...).
-/// CheckDoubleTy indicates that 'fptype' must be 'double'.
-static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams,
- bool CheckDoubleTy) {
- FunctionType *FT = F->getFunctionType();
- if (FT->getNumParams() != NumParams)
- return false;
-
- // The return type must match what we're looking for.
- Type *RetTy = FT->getReturnType();
- if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy())
- return false;
-
- // Each parameter must match the return type, and therefore, match every other
- // parameter too.
- for (const Type *ParamTy : FT->params())
- if (ParamTy != RetTy)
- return false;
-
- return true;
-}
-
/// Shrink double -> float for unary functions like 'floor'.
static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
bool CheckRetType) {
Function *Callee = CI->getCalledFunction();
- if (!matchesFPLibFunctionSignature(Callee, 1, true))
+ // We know this libcall has a valid prototype, but we don't know which.
+ if (!CI->getType()->isDoubleTy())
return nullptr;
if (CheckRetType) {
@@ -1026,7 +910,7 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
V = B.CreateCall(F, V);
} else {
// The call is a library call rather than an intrinsic.
- V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ V = emitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
}
return B.CreateFPExt(V, B.getDoubleTy());
@@ -1035,7 +919,8 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
/// Shrink double -> float for binary functions like 'fmin/fmax'.
static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!matchesFPLibFunctionSignature(Callee, 2, true))
+ // We know this libcall has a valid prototype, but we don't know which.
+ if (!CI->getType()->isDoubleTy())
return nullptr;
// If this is something like 'fmin((double)floatval1, (double)floatval2)',
@@ -1054,7 +939,7 @@ static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
// fmin((double)floatval1, (double)floatval2)
// -> (double)fminf(floatval1, floatval2)
// TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP().
- Value *V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
+ Value *V = emitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
Callee->getAttributes());
return B.CreateFPExt(V, B.getDoubleTy());
}
@@ -1066,13 +951,6 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {
if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
// cos(-x) -> cos(x)
Value *Op1 = CI->getArgOperand(0);
if (BinaryOperator::isFNeg(Op1)) {
@@ -1114,14 +992,6 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 2 arguments of the same FP type, which match the
- // result type.
- if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
// pow(1.0, x) -> 1.0
@@ -1131,19 +1001,16 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (Op1C->isExactlyValue(2.0) &&
hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
LibFunc::exp2l))
- return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B,
+ return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B,
Callee->getAttributes());
// pow(10.0, x) -> exp10(x)
if (Op1C->isExactlyValue(10.0) &&
hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
LibFunc::exp10l))
- return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
+ return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
Callee->getAttributes());
}
- // FIXME: Use instruction-level FMF.
- bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent());
-
// pow(exp(x), y) -> exp(x * y)
// pow(exp2(x), y) -> exp2(x * y)
// We enable these only with fast-math. Besides rounding differences, the
@@ -1159,7 +1026,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul");
- return EmitUnaryFloatFnCall(FMul, OpCCallee->getName(), B,
+ return emitUnaryFloatFnCall(FMul, OpCCallee->getName(), B,
OpCCallee->getAttributes());
}
}
@@ -1181,7 +1048,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
- return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
Callee->getAttributes());
}
@@ -1191,9 +1058,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// TODO: In finite-only mode, this could be just fabs(sqrt(x)).
Value *Inf = ConstantFP::getInfinity(CI->getType());
Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
- Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
Value *FAbs =
- EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes());
+ emitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes());
Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
return Sel;
@@ -1207,7 +1074,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
// In -ffast-math, generate repeated fmul instead of generating pow(x, n).
- if (UnsafeFPMath) {
+ if (CI->hasUnsafeAlgebra()) {
APFloat V = abs(Op2C->getValueAPF());
// We limit to a max of 7 fmul(s). Thus max exponent is 32.
// This transformation applies to integer exponents only.
@@ -1224,6 +1091,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// So we first convert V to something which could be converted to double.
bool ignored;
V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+
+ // TODO: Should the new instructions propagate the 'fast' flag of the pow()?
Value *FMul = getPow(InnerChain, V.convertToDouble(), B);
// For negative exponents simply compute the reciprocal.
if (Op2C->isNegative())
@@ -1236,19 +1105,11 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- Function *Caller = CI->getParent()->getParent();
Value *Ret = nullptr;
StringRef Name = Callee->getName();
if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
Value *Op = CI->getArgOperand(0);
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
// Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
@@ -1273,11 +1134,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
if (!Op->getType()->isFloatTy())
One = ConstantExpr::getFPExtend(One, Op->getType());
- Module *M = Caller->getParent();
- Value *Callee =
+ Module *M = CI->getModule();
+ Value *NewCallee =
M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
Op->getType(), B.getInt32Ty(), nullptr);
- CallInst *CI = B.CreateCall(Callee, {One, LdExpArg});
+ CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg});
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
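
The exp2(sitofp(x)) -> ldexp(1.0, sext(x)) rewrite in the hunk above is value-preserving as long as exp2 is exact at integral arguments, which common libm implementations provide in practice; ldexp itself is an exact scale by a power of two. A small standalone check under that assumption:

#include <cassert>
#include <cmath>

int main() {
  // ldexp(1.0, n) is an exact 2^n.
  assert(std::ldexp(1.0, 10) == 1024.0);
  assert(std::ldexp(1.0, -3) == 0.125);

  // Assuming a libm where exp2 is exact at integer arguments (true of the
  // usual implementations), the two sides of the rewrite agree bit for bit.
  for (int N = -64; N <= 64; ++N)
    assert(std::exp2(double(N)) == std::ldexp(1.0, N));
  return 0;
}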
@@ -1294,12 +1155,6 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
if (Name == "fabs" && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, false);
- FunctionType *FT = Callee->getFunctionType();
- // Make sure this has 1 argument of FP type which matches the result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
Value *Op = CI->getArgOperand(0);
if (Instruction *I = dyn_cast<Instruction>(Op)) {
// Fold fabs(x * x) -> x * x; any squared FP value must already be positive.
@@ -1311,21 +1166,14 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
// If we can shrink the call to a float function rather than a double
// function, do that first.
- Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
return Ret;
- // Make sure this has 2 arguments of FP type which match the result type.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return nullptr;
-
IRBuilder<>::FastMathFlagGuard Guard(B);
FastMathFlags FMF;
if (CI->hasUnsafeAlgebra()) {
@@ -1360,13 +1208,6 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
StringRef Name = Callee->getName();
if (UnsafeFPShrink && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- FunctionType *FT = Callee->getFunctionType();
-
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
if (!CI->hasUnsafeAlgebra())
return Ret;
@@ -1392,7 +1233,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow))
return B.CreateFMul(OpC->getArgOperand(1),
- EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
+ emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
Callee->getAttributes()), "mul");
// log(exp2(y)) -> y*log(2)
@@ -1400,7 +1241,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
TLI->has(Func) && Func == LibFunc::exp2)
return B.CreateFMul(
OpC->getArgOperand(0),
- EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
+ emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
Callee->getName(), B, Callee->getAttributes()),
"logmul");
return Ret;
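
The log(pow(x, y)) and log(exp2(y)) rewrites only hold up to rounding, which is why they are gated on hasUnsafeAlgebra(). A standalone numeric check with a generous tolerance:

#include <cassert>
#include <cmath>

int main() {
  const double Tol = 1e-12;
  double X = 3.5, Y = 2.25;

  // log(pow(x, y)) ~= y * log(x), exact only in real arithmetic.
  assert(std::fabs(std::log(std::pow(X, Y)) - Y * std::log(X)) < Tol);

  // log(exp2(y)) ~= y * log(2).
  assert(std::fabs(std::log(std::exp2(Y)) - Y * std::log(2.0)) < Tol);
  return 0;
}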
@@ -1408,21 +1249,11 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
-
Value *Ret = nullptr;
if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||
Callee->getIntrinsicID() == Intrinsic::sqrt))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- // FIXME: Refactor - this check is repeated all over this file and even in the
- // preceding call to shrink double -> float.
-
- // Make sure this has 1 argument of FP type, which matches the result type.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
if (!CI->hasUnsafeAlgebra())
return Ret;
@@ -1489,13 +1320,6 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
StringRef Name = Callee->getName();
if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- FunctionType *FT = Callee->getFunctionType();
-
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
Value *Op1 = CI->getArgOperand(0);
auto *OpC = dyn_cast<CallInst>(Op1);
@@ -1519,13 +1343,65 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
return Ret;
}
-static bool isTrigLibCall(CallInst *CI);
+static bool isTrigLibCall(CallInst *CI) {
+ // We can only hope to do anything useful if we can ignore things like errno
+ // and floating-point exceptions.
+ // We already checked the prototype.
+ return CI->hasFnAttr(Attribute::NoUnwind) &&
+ CI->hasFnAttr(Attribute::ReadNone);
+}
+
static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
bool UseFloat, Value *&Sin, Value *&Cos,
- Value *&SinCos);
+ Value *&SinCos) {
+ Type *ArgTy = Arg->getType();
+ Type *ResTy;
+ StringRef Name;
-Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
+ Triple T(OrigCallee->getParent()->getTargetTriple());
+ if (UseFloat) {
+ Name = "__sincospif_stret";
+
+ assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+ // x86_64 can't use {float, float} since that would be returned in both
+ // xmm0 and xmm1, which isn't what a real struct would do.
+ ResTy = T.getArch() == Triple::x86_64
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ } else {
+ Name = "__sincospi_stret";
+ ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ }
+
+ Module *M = OrigCallee->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+ ResTy, ArgTy, nullptr);
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there.
+ B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+ } else {
+ // Otherwise (e.g. for a constant) the beginning of the function is as
+ // good a place as any.
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+ B.SetInsertPoint(&EntryBB, EntryBB.begin());
+ }
+
+ SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+ if (SinCos->getType()->isStructTy()) {
+ Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+ Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+ } else {
+ Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+ "sinpi");
+ Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+ "cospi");
+ }
+}
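
__sincospi_stret is a Darwin runtime routine that returns sin(pi*x) and cos(pi*x) together (as a struct, or a two-element vector on x86_64, as handled above). The following portable sketch uses a made-up SinCosPi helper, not the real runtime symbol, purely to show the caller-side shape of the fusion:

#include <cassert>
#include <cmath>
#include <utility>

static const double Pi = 3.141592653589793238462643383279502884;

// Hypothetical stand-in for the fused runtime call: compute sin(pi*x) and
// cos(pi*x) together and return both values.
static std::pair<double, double> SinCosPi(double X) {
  return {std::sin(Pi * X), std::cos(Pi * X)};
}

int main() {
  // A caller that previously issued separate sinpi(x) and cospi(x) calls
  // unpacks both results of one fused call, which is what insertSinCosCall
  // arranges at the IR level with extractvalue / extractelement.
  double X = 0.25;
  std::pair<double, double> SC = SinCosPi(X);
  assert(SC.first == std::sin(Pi * X));   // the "sinpi" result
  assert(SC.second == std::cos(Pi * X));  // the "cospi" result
  return 0;
}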
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
// Make sure the prototype is as expected, otherwise the rest of the
// function is probably invalid and likely to abort.
if (!isTrigLibCall(CI))
@@ -1541,9 +1417,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
// Look for all compatible sinpi, cospi and sincospi calls with the same
// argument. If there are enough (in some sense) we can make the
// substitution.
+ Function *F = CI->getFunction();
for (User *U : Arg->users())
- classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls,
- SinCosCalls);
+ classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
// It's only worthwhile if both sinpi and cospi are actually used.
if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
@@ -1559,35 +1435,23 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-static bool isTrigLibCall(CallInst *CI) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
-
- // We can only hope to do anything useful if we can ignore things like errno
- // and floating-point exceptions.
- bool AttributesSafe =
- CI->hasFnAttr(Attribute::NoUnwind) && CI->hasFnAttr(Attribute::ReadNone);
-
- // Other than that we need float(float) or double(double)
- return AttributesSafe && FT->getNumParams() == 1 &&
- FT->getReturnType() == FT->getParamType(0) &&
- (FT->getParamType(0)->isFloatTy() ||
- FT->getParamType(0)->isDoubleTy());
-}
-
-void
-LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
- SmallVectorImpl<CallInst *> &SinCalls,
- SmallVectorImpl<CallInst *> &CosCalls,
- SmallVectorImpl<CallInst *> &SinCosCalls) {
+void LibCallSimplifier::classifyArgUse(
+ Value *Val, Function *F, bool IsFloat,
+ SmallVectorImpl<CallInst *> &SinCalls,
+ SmallVectorImpl<CallInst *> &CosCalls,
+ SmallVectorImpl<CallInst *> &SinCosCalls) {
CallInst *CI = dyn_cast<CallInst>(Val);
if (!CI)
return;
+ // Don't consider calls in other functions.
+ if (CI->getFunction() != F)
+ return;
+
Function *Callee = CI->getCalledFunction();
LibFunc::Func Func;
- if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) ||
+ if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
!isTrigLibCall(CI))
return;
@@ -1614,69 +1478,12 @@ void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls,
replaceAllUsesWith(C, Res);
}
-void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
- bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos) {
- Type *ArgTy = Arg->getType();
- Type *ResTy;
- StringRef Name;
-
- Triple T(OrigCallee->getParent()->getTargetTriple());
- if (UseFloat) {
- Name = "__sincospif_stret";
-
- assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
- // x86_64 can't use {float, float} since that would be returned in both
- // xmm0 and xmm1, which isn't what a real struct would do.
- ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
- } else {
- Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
- }
-
- Module *M = OrigCallee->getParent();
- Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
- ResTy, ArgTy, nullptr);
-
- if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
- // If the argument is an instruction, it must dominate all uses so put our
- // sincos call there.
- B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
- } else {
- // Otherwise (e.g. for a constant) the beginning of the function is as
- // good a place as any.
- BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
- B.SetInsertPoint(&EntryBB, EntryBB.begin());
- }
-
- SinCos = B.CreateCall(Callee, Arg, "sincospi");
-
- if (SinCos->getType()->isStructTy()) {
- Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
- Cos = B.CreateExtractValue(SinCos, 1, "cospi");
- } else {
- Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
- "sinpi");
- Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
- "cospi");
- }
-}
-
//===----------------------------------------------------------------------===//
// Integer Library Call Optimizations
//===----------------------------------------------------------------------===//
-static bool checkIntUnaryReturnAndParam(Function *Callee) {
- FunctionType *FT = Callee->getFunctionType();
- return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) &&
- FT->getParamType(0)->isIntegerTy();
-}
-
Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkIntUnaryReturnAndParam(Callee))
- return nullptr;
Value *Op = CI->getArgOperand(0);
// Constant fold.
@@ -1700,13 +1507,6 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- // We require integer(integer) where the types agree.
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
- FT->getParamType(0) != FT->getReturnType())
- return nullptr;
-
// abs(x) -> x >s -1 ? x : -x
Value *Op = CI->getArgOperand(0);
Value *Pos =
@@ -1716,9 +1516,6 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
- if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))
- return nullptr;
-
// isdigit(c) -> (c-'0') <u 10
Value *Op = CI->getArgOperand(0);
Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
@@ -1727,9 +1524,6 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {
- if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))
- return nullptr;
-
// isascii(c) -> c <u 128
Value *Op = CI->getArgOperand(0);
Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
@@ -1737,9 +1531,6 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
- if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))
- return nullptr;
-
// toascii(c) -> c & 0x7f
return B.CreateAnd(CI->getArgOperand(0),
ConstantInt::get(CI->getType(), 0x7F));
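
The integer folds above (abs, isdigit, isascii, toascii) are classic compare-and-mask tricks. A standalone check in the "C" locale; isascii and toascii are POSIX rather than ISO C, so they are spelled out as plain expressions here:

#include <cassert>
#include <cctype>
#include <cstdlib>

int main() {
  // abs(x) -> x > -1 ? x : -x (the compare-and-select emitted above).
  for (int X : {-7, -1, 0, 1, 42})
    assert(std::abs(X) == (X > -1 ? X : -X));

  // isdigit(c) -> (unsigned)(c - '0') < 10: one unsigned compare replaces
  // the two-sided range check, verified for all unsigned-char values.
  for (int C = 0; C <= 255; ++C)
    assert((std::isdigit(C) != 0) == ((unsigned)(C - '0') < 10u));

  // isascii(c) -> c <u 128 and toascii(c) -> c & 0x7f follow the same idea.
  for (int C = 0; C <= 255; ++C) {
    assert(((unsigned)C < 128u) == (C >= 0 && C <= 127));
    assert((C & 0x7f) >= 0 && (C & 0x7f) <= 127);
  }
  return 0;
}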
@@ -1753,6 +1544,7 @@ static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
int StreamArg) {
+ Function *Callee = CI->getCalledFunction();
// Error reporting calls should be cold, mark them as such.
// This applies even to non-builtin calls: it is only a hint and applies to
// functions that the frontend might not understand as builtins.
@@ -1761,8 +1553,6 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
// Improving Static Branch Prediction in a Compiler
// Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
// Proceedings of PACT'98, Oct. 1998, IEEE
- Function *Callee = CI->getCalledFunction();
-
if (!CI->hasFnAttr(Attribute::Cold) &&
isReportingError(Callee, CI, StreamArg)) {
CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
@@ -1808,12 +1598,18 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
if (!CI->use_empty())
return nullptr;
- // printf("x") -> putchar('x'), even for '%'.
- if (FormatStr.size() == 1) {
- Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TLI);
- if (CI->use_empty() || !Res)
- return Res;
- return B.CreateIntCast(Res, CI->getType(), true);
+ // printf("x") -> putchar('x'), even for "%" and "%%".
+ if (FormatStr.size() == 1 || FormatStr == "%%")
+ return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
+
+ // printf("%s", "a") --> putchar('a')
+ if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
+ StringRef ChrStr;
+ if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
+ return nullptr;
+ if (ChrStr.size() != 1)
+ return nullptr;
+ return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
}
// printf("foo\n") --> puts("foo")
@@ -1823,40 +1619,26 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
Value *GV = B.CreateGlobalString(FormatStr, "str");
- Value *NewCI = EmitPutS(GV, B, TLI);
- return (CI->use_empty() || !NewCI)
- ? NewCI
- : ConstantInt::get(CI->getType(), FormatStr.size() + 1);
+ return emitPutS(GV, B, TLI);
}
// Optimize specific format strings.
// printf("%c", chr) --> putchar(chr)
if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isIntegerTy()) {
- Value *Res = EmitPutChar(CI->getArgOperand(1), B, TLI);
-
- if (CI->use_empty() || !Res)
- return Res;
- return B.CreateIntCast(Res, CI->getType(), true);
- }
+ CI->getArgOperand(1)->getType()->isIntegerTy())
+ return emitPutChar(CI->getArgOperand(1), B, TLI);
// printf("%s\n", str) --> puts(str)
if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isPointerTy()) {
- return EmitPutS(CI->getArgOperand(1), B, TLI);
- }
+ CI->getArgOperand(1)->getType()->isPointerTy())
+ return emitPutS(CI->getArgOperand(1), B, TLI);
return nullptr;
}
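
The printf rewrites above fire only when the printf result is unused, and they all reduce to "this format string produces exactly these bytes". A standalone check of the formatted output using snprintf (not part of the patch):

#include <cassert>
#include <cstdio>
#include <cstring>

int main() {
  char Buf[32];

  // printf("%s\n", str) prints str followed by a newline, which is exactly
  // what puts(str) writes.
  std::snprintf(Buf, sizeof(Buf), "%s\n", "hello");
  assert(std::strcmp(Buf, "hello\n") == 0);

  // printf("%c", chr) and printf("%s", "a") each emit a single character,
  // matching putchar(chr) / putchar('a').
  std::snprintf(Buf, sizeof(Buf), "%c", 'x');
  assert(std::strcmp(Buf, "x") == 0);
  std::snprintf(Buf, sizeof(Buf), "%s", "a");
  assert(std::strcmp(Buf, "a") == 0);

  // printf("%%") emits a single '%', so it also becomes putchar('%').
  std::snprintf(Buf, sizeof(Buf), "%%");
  assert(std::strcmp(Buf, "%") == 0);
  return 0;
}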
Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // Require one fixed pointer argument and an integer/void result.
FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
- !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy()))
- return nullptr;
-
if (Value *V = optimizePrintFString(CI, B)) {
return V;
}
@@ -1909,7 +1691,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
if (!CI->getArgOperand(2)->getType()->isIntegerTy())
return nullptr;
Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
- Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
B.CreateStore(V, Ptr);
Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
B.CreateStore(B.getInt8(0), Ptr);
@@ -1922,7 +1704,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
if (!CI->getArgOperand(2)->getType()->isPointerTy())
return nullptr;
- Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI);
+ Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
if (!Len)
return nullptr;
Value *IncLen =
@@ -1937,13 +1719,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // Require two fixed pointer arguments and an integer result.
FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
if (Value *V = optimizeSPrintFString(CI, B)) {
return V;
}
@@ -1982,7 +1758,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
return nullptr; // We found a format specifier.
- return EmitFWrite(
+ return emitFWrite(
CI->getArgOperand(1),
ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
CI->getArgOperand(0), B, DL, TLI);
@@ -1999,27 +1775,21 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
// fprintf(F, "%c", chr) --> fputc(chr, F)
if (!CI->getArgOperand(2)->getType()->isIntegerTy())
return nullptr;
- return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
}
if (FormatStr[1] == 's') {
// fprintf(F, "%s", str) --> fputs(str, F)
if (!CI->getArgOperand(2)->getType()->isPointerTy())
return nullptr;
- return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
}
return nullptr;
}
Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // Require two fixed paramters as pointers and integer result.
FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
if (Value *V = optimizeFPrintFString(CI, B)) {
return V;
}
@@ -2041,16 +1811,6 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
optimizeErrorReporting(CI, B, 3);
- Function *Callee = CI->getCalledFunction();
- // Require a pointer, an integer, an integer, a pointer, returning integer.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() ||
- !FT->getParamType(2)->isIntegerTy() ||
- !FT->getParamType(3)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return nullptr;
-
// Get the element size and count.
ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
@@ -2065,8 +1825,8 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
// If this is writing one byte, turn it into fputc.
// This optimisation is only valid, if the return value is unused.
if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
}
@@ -2076,12 +1836,13 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
optimizeErrorReporting(CI, B, 1);
- Function *Callee = CI->getCalledFunction();
+ // Don't rewrite fputs to fwrite when optimising for size because fwrite
+ // requires more arguments and thus extra MOVs are required.
+ if (CI->getParent()->getParent()->optForSize())
+ return nullptr;
- // Require two pointers. Also, we can't optimize if return value is used.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() || !CI->use_empty())
+ // We can't optimize if return value is used.
+ if (!CI->use_empty())
return nullptr;
// fputs(s,F) --> fwrite(s,1,strlen(s),F)
@@ -2090,20 +1851,13 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
return nullptr;
// Known to have no uses (see above).
- return EmitFWrite(
+ return emitFWrite(
CI->getArgOperand(0),
ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
CI->getArgOperand(1), B, DL, TLI);
}
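
fputs(s, F) writes exactly strlen(s) bytes and fwrite(s, 1, 1, F) writes one byte, matching the rewrites above (the fputs rewrite is additionally skipped when optimizing for size, and both require the call result to be unused). A standalone check using temporary files and stream positions:

#include <cassert>
#include <cstdio>
#include <cstring>

int main() {
  const char *S = "hello";
  std::FILE *F1 = std::tmpfile();
  std::FILE *F2 = std::tmpfile();
  assert(F1 && F2);

  // fputs(s, F) -> fwrite(s, 1, strlen(s), F): both write the bytes of s,
  // no terminating nul and no newline.
  std::fputs(S, F1);
  std::fwrite(S, 1, std::strlen(S), F2);
  assert(std::ftell(F1) == (long)std::strlen(S));
  assert(std::ftell(F1) == std::ftell(F2));

  // fwrite(s, 1, 1, F) -> fputc(s[0], F): a one-byte write is a putc.
  std::fwrite(S, 1, 1, F1);
  std::fputc(S[0], F2);
  assert(std::ftell(F1) == std::ftell(F2));

  std::fclose(F1);
  std::fclose(F2);
  return 0;
}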
Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- // Require one fixed pointer argument and an integer/void result.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
- !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy()))
- return nullptr;
-
// Check for a constant string.
StringRef Str;
if (!getConstantStringInfo(CI->getArgOperand(0), Str))
@@ -2111,7 +1865,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
if (Str.empty() && CI->use_empty()) {
// puts("") -> putchar('\n')
- Value *Res = EmitPutChar(B.getInt32('\n'), B, TLI);
+ Value *Res = emitPutChar(B.getInt32('\n'), B, TLI);
if (CI->use_empty() || !Res)
return Res;
return B.CreateIntCast(Res, CI->getType(), true);
@@ -2133,10 +1887,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
IRBuilder<> &Builder) {
LibFunc::Func Func;
Function *Callee = CI->getCalledFunction();
- StringRef FuncName = Callee->getName();
-
// Check for string/memory library functions.
- if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
// Make sure we never change the calling convention.
assert((ignoreCallingConv(Func) ||
CI->getCallingConv() == llvm::CallingConv::C) &&
@@ -2208,10 +1960,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
- // Command-line parameter overrides function attribute.
+ // Command-line parameter overrides instruction attribute.
if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
UnsafeFPShrink = EnableUnsafeFPShrink;
- else if (canUseUnsafeFPMath(Callee))
+ else if (isa<FPMathOperator>(CI) && CI->hasUnsafeAlgebra())
UnsafeFPShrink = true;
// First, check for intrinsics.
@@ -2229,6 +1981,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizeLog(CI, Builder);
case Intrinsic::sqrt:
return optimizeSqrt(CI, Builder);
+ // TODO: Use foldMallocMemset() with memset intrinsic.
default:
return nullptr;
}
@@ -2253,7 +2006,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
}
// Then check for known library functions.
- if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
// We never change the calling convention.
if (!ignoreCallingConv(Func) && !isCallingConvC)
return nullptr;
@@ -2457,11 +2210,6 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk))
- return nullptr;
-
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), 1);
@@ -2472,11 +2220,6 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk))
- return nullptr;
-
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), 1);
@@ -2487,10 +2230,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
-
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk))
- return nullptr;
+ // TODO: Try foldMallocMemset() here.
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
@@ -2506,16 +2246,12 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
const DataLayout &DL = CI->getModule()->getDataLayout();
-
- if (!checkStringCopyLibFuncSignature(Callee, Func))
- return nullptr;
-
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
*ObjSize = CI->getArgOperand(2);
// __stpcpy_chk(x,x,...) -> x+strlen(x)
if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
- Value *StrLen = EmitStrLen(Src, B, DL, TLI);
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
}
@@ -2525,7 +2261,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
// TODO: It might be nice to get a maximum length out of the possible
// string lengths for varying.
if (isFortifiedCallFoldable(CI, 2, 1, true))
- return EmitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
+ return emitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
if (OnlyLowerUnknownSize)
return nullptr;
@@ -2537,7 +2273,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
Type *SizeTTy = DL.getIntPtrType(CI->getContext());
Value *LenV = ConstantInt::get(SizeTTy, Len);
- Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+ Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
// If the function was an __stpcpy_chk, and we were able to fold it into
// a __memcpy_chk, we still need to return the correct end pointer.
if (Ret && Func == LibFunc::stpcpy_chk)
@@ -2550,11 +2286,8 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
LibFunc::Func Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
-
- if (!checkStringCopyLibFuncSignature(Callee, Func))
- return nullptr;
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ Value *Ret = emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), B, TLI, Name.substr(2, 7));
return Ret;
}
@@ -2577,15 +2310,15 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
LibFunc::Func Func;
Function *Callee = CI->getCalledFunction();
- StringRef FuncName = Callee->getName();
SmallVector<OperandBundleDef, 2> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
- // First, check that this is a known library functions.
- if (!TLI->getLibFunc(FuncName, Func))
+ // First, check that this is a known library function and that the prototype
+ // is correct.
+ if (!TLI->getLibFunc(*Callee, Func))
return nullptr;
// We never change the calling convention.
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index ad6b782caf8b..e9a368f4faa4 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -13,19 +13,184 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "split-module"
+
#include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>
using namespace llvm;
+namespace {
+typedef EquivalenceClasses<const GlobalValue *> ClusterMapType;
+typedef DenseMap<const Comdat *, const GlobalValue *> ComdatMembersType;
+typedef DenseMap<const GlobalValue *, unsigned> ClusterIDMapType;
+}
+
+static void addNonConstUser(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const User *U) {
+ assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
+
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ const GlobalValue *F = I->getParent()->getParent();
+ GVtoClusterMap.unionSets(GV, F);
+ } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
+ isa<GlobalVariable>(U)) {
+ GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
+ } else {
+ llvm_unreachable("Underimplemented use case");
+ }
+}
+
+// Adds all GlobalValue users of V to the same cluster as GV.
+static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const Value *V) {
+ for (auto *U : V->users()) {
+ SmallVector<const User *, 4> Worklist;
+ Worklist.push_back(U);
+ while (!Worklist.empty()) {
+ const User *UU = Worklist.pop_back_val();
+ // For each constant that is not a GV (a pure const) recurse.
+ if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
+ Worklist.append(UU->user_begin(), UU->user_end());
+ continue;
+ }
+ addNonConstUser(GVtoClusterMap, GV, UU);
+ }
+ }
+}
+
+// Find partitions for the module such that no locals need to be
+// globalized.
+// Try to pack those partitions into N files in a balanced way, since this
+// roughly equals thread balancing for the backend codegen step.
+static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
+ unsigned N) {
+ // At this point module should have the proper mix of globals and locals.
+ // As we attempt to partition this module, we must not change any
+ // locals to globals.
+ DEBUG(dbgs() << "Partition module with (" << M->size() << ")functions\n");
+ ClusterMapType GVtoClusterMap;
+ ComdatMembersType ComdatMembers;
+
+ auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
+ if (GV.isDeclaration())
+ return;
+
+ if (!GV.hasName())
+ GV.setName("__llvmsplit_unnamed");
+
+ // Comdat groups must not be partitioned. For comdat groups that contain
+ // locals, record all their members here so we can keep them together.
+ // Comdat groups that only contain external globals are already handled by
+ // the MD5-based partitioning.
+ if (const Comdat *C = GV.getComdat()) {
+ auto &Member = ComdatMembers[C];
+ if (Member)
+ GVtoClusterMap.unionSets(Member, &GV);
+ else
+ Member = &GV;
+ }
+
+ // For aliases we should not separate them from their aliasees regardless
+ // of linkage.
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GVtoClusterMap.unionSets(&GV, Base);
+ }
+
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ for (const BasicBlock &BB : *F) {
+ BlockAddress *BA = BlockAddress::lookup(&BB);
+ if (!BA || !BA->isConstantUsed())
+ continue;
+ addAllGlobalValueUsers(GVtoClusterMap, F, BA);
+ }
+ }
+
+ if (GV.hasLocalLinkage())
+ addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+ };
+
+ std::for_each(M->begin(), M->end(), recordGVSet);
+ std::for_each(M->global_begin(), M->global_end(), recordGVSet);
+ std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
+
+ // Assign all GVs to merged clusters while balancing the number of objects
+ // in each.
+ auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
+ const std::pair<unsigned, unsigned> &b) {
+ if (a.second || b.second)
+ return a.second > b.second;
+ else
+ return a.first > b.first;
+ };
+
+ std::priority_queue<std::pair<unsigned, unsigned>,
+ std::vector<std::pair<unsigned, unsigned>>,
+ decltype(CompareClusters)>
+ BalancinQueue(CompareClusters);
+ // Pre-populate the priority queue with N blank slots.
+ for (unsigned i = 0; i < N; ++i)
+ BalancinQueue.push(std::make_pair(i, 0));
+
+ typedef std::pair<unsigned, ClusterMapType::iterator> SortType;
+ SmallVector<SortType, 64> Sets;
+ SmallPtrSet<const GlobalValue *, 32> Visited;
+
+ // To guarantee determinism, we have to sort the SCCs according to size.
+ // When sizes are equal, use the leader's name.
+ for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
+ E = GVtoClusterMap.end(); I != E; ++I)
+ if (I->isLeader())
+ Sets.push_back(
+ std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
+ GVtoClusterMap.member_end()), I));
+
+ std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() > b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
+
+ for (auto &I : Sets) {
+ unsigned CurrentClusterID = BalancinQueue.top().first;
+ unsigned CurrentClusterSize = BalancinQueue.top().second;
+ BalancinQueue.pop();
+
+ DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first
+ << ") ----> " << I.second->getData()->getName() << "\n");
+
+ for (ClusterMapType::member_iterator MI =
+ GVtoClusterMap.findLeader(I.second);
+ MI != GVtoClusterMap.member_end(); ++MI) {
+ if (!Visited.insert(*MI).second)
+ continue;
+ DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ Visited.insert(*MI);
+ ClusterIDMap[*MI] = CurrentClusterID;
+ CurrentClusterSize++;
+ }
+ // Add this set size to the number of entries in this cluster.
+ BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
+ }
+}
+
static void externalize(GlobalValue *GV) {
if (GV->hasLocalLinkage()) {
GV->setLinkage(GlobalValue::ExternalLinkage);
@@ -40,8 +205,8 @@ static void externalize(GlobalValue *GV) {
// Returns whether GV should be in partition (0-based) I of N.
static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
- if (auto GA = dyn_cast<GlobalAlias>(GV))
- if (const GlobalObject *Base = GA->getBaseObject())
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
+ if (const GlobalObject *Base = GIS->getBaseObject())
GV = Base;
StringRef Name;
@@ -62,21 +227,34 @@ static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
void llvm::SplitModule(
std::unique_ptr<Module> M, unsigned N,
- std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
- for (Function &F : *M)
- externalize(&F);
- for (GlobalVariable &GV : M->globals())
- externalize(&GV);
- for (GlobalAlias &GA : M->aliases())
- externalize(&GA);
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
+ bool PreserveLocals) {
+ if (!PreserveLocals) {
+ for (Function &F : *M)
+ externalize(&F);
+ for (GlobalVariable &GV : M->globals())
+ externalize(&GV);
+ for (GlobalAlias &GA : M->aliases())
+ externalize(&GA);
+ for (GlobalIFunc &GIF : M->ifuncs())
+ externalize(&GIF);
+ }
+
+ // This performs splitting without a need for externalization, which might not
+ // always be possible.
+ ClusterIDMapType ClusterIDMap;
+ findPartitions(M.get(), ClusterIDMap, N);
// FIXME: We should be able to reuse M as the last partition instead of
// cloning it.
- for (unsigned I = 0; I != N; ++I) {
+ for (unsigned I = 0; I < N; ++I) {
ValueToValueMapTy VMap;
std::unique_ptr<Module> MPart(
- CloneModule(M.get(), VMap, [=](const GlobalValue *GV) {
- return isInPartition(GV, I, N);
+ CloneModule(M.get(), VMap, [&](const GlobalValue *GV) {
+ if (ClusterIDMap.count(GV))
+ return (ClusterIDMap[GV] == I);
+ else
+ return isInPartition(GV, I, N);
}));
if (I != 0)
MPart->setModuleInlineAsm("");
diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp
index 1d1f602b041d..7523ca527b68 100644
--- a/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -58,7 +58,6 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "symbol-rewriter"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/Pass.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/LegacyPassManager.h"
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 6b1d1dae5f01..9385f825523c 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -66,9 +66,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
"UnifiedUnreachableBlock", &F);
new UnreachableInst(F.getContext(), UnreachableBlock);
- for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(),
- E = UnreachableBlocks.end(); I != E; ++I) {
- BasicBlock *BB = *I;
+ for (BasicBlock *BB : UnreachableBlocks) {
BB->getInstList().pop_back(); // Remove the unreachable inst.
BranchInst::Create(UnreachableBlock, BB);
}
@@ -104,10 +102,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
// Loop over all of the blocks, replacing the return instruction with an
// unconditional branch.
//
- for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(),
- E = ReturningBlocks.end(); I != E; ++I) {
- BasicBlock *BB = *I;
-
+ for (BasicBlock *BB : ReturningBlocks) {
// Add an incoming element to the PHI node for every return instruction that
// is merging into this new block...
if (PN)
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index ed4f45c6a615..8f85f19efe38 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -21,17 +21,20 @@ using namespace llvm;
/// initializeTransformUtils - Initialize all passes in the TransformUtils
/// library.
void llvm::initializeTransformUtils(PassRegistry &Registry) {
- initializeAddDiscriminatorsPass(Registry);
+ initializeAddDiscriminatorsLegacyPassPass(Registry);
initializeBreakCriticalEdgesPass(Registry);
initializeInstNamerPass(Registry);
- initializeLCSSAPass(Registry);
+ initializeLCSSAWrapperPassPass(Registry);
initializeLoopSimplifyPass(Registry);
initializeLowerInvokePass(Registry);
initializeLowerSwitchPass(Registry);
- initializePromotePassPass(Registry);
+ initializeNameAnonFunctionPass(Registry);
+ initializePromoteLegacyPassPass(Registry);
initializeUnifyFunctionExitNodesPass(Registry);
initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
+ initializeMemorySSAWrapperPassPass(Registry);
+ initializeMemorySSAPrinterLegacyPassPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index f47ddb9f064f..2eade8cbe8ef 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -13,9 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
@@ -25,25 +29,326 @@ using namespace llvm;
// Out of line method to get vtable etc for class.
void ValueMapTypeRemapper::anchor() {}
void ValueMaterializer::anchor() {}
-void ValueMaterializer::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
-}
-Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- ValueToValueMapTy::iterator I = VM.find(V);
-
+namespace {
+
+/// A basic block used in a BlockAddress whose function body is not yet
+/// materialized.
+struct DelayedBasicBlock {
+ BasicBlock *OldBB;
+ std::unique_ptr<BasicBlock> TempBB;
+
+ // Explicit move for MSVC.
+ DelayedBasicBlock(DelayedBasicBlock &&X)
+ : OldBB(std::move(X.OldBB)), TempBB(std::move(X.TempBB)) {}
+ DelayedBasicBlock &operator=(DelayedBasicBlock &&X) {
+ OldBB = std::move(X.OldBB);
+ TempBB = std::move(X.TempBB);
+ return *this;
+ }
+
+ DelayedBasicBlock(const BlockAddress &Old)
+ : OldBB(Old.getBasicBlock()),
+ TempBB(BasicBlock::Create(Old.getContext())) {}
+};
+
+struct WorklistEntry {
+ enum EntryKind {
+ MapGlobalInit,
+ MapAppendingVar,
+ MapGlobalAliasee,
+ RemapFunction
+ };
+ struct GVInitTy {
+ GlobalVariable *GV;
+ Constant *Init;
+ };
+ struct AppendingGVTy {
+ GlobalVariable *GV;
+ Constant *InitPrefix;
+ };
+ struct GlobalAliaseeTy {
+ GlobalAlias *GA;
+ Constant *Aliasee;
+ };
+
+ unsigned Kind : 2;
+ unsigned MCID : 29;
+ unsigned AppendingGVIsOldCtorDtor : 1;
+ unsigned AppendingGVNumNewMembers;
+ union {
+ GVInitTy GVInit;
+ AppendingGVTy AppendingGV;
+ GlobalAliaseeTy GlobalAliasee;
+ Function *RemapF;
+ } Data;
+};
+
+struct MappingContext {
+ ValueToValueMapTy *VM;
+ ValueMaterializer *Materializer = nullptr;
+
+ /// Construct a MappingContext with a value map and materializer.
+ explicit MappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr)
+ : VM(&VM), Materializer(Materializer) {}
+};
+
+class MDNodeMapper;
+class Mapper {
+ friend class MDNodeMapper;
+
+#ifndef NDEBUG
+ DenseSet<GlobalValue *> AlreadyScheduled;
+#endif
+
+ RemapFlags Flags;
+ ValueMapTypeRemapper *TypeMapper;
+ unsigned CurrentMCID = 0;
+ SmallVector<MappingContext, 2> MCs;
+ SmallVector<WorklistEntry, 4> Worklist;
+ SmallVector<DelayedBasicBlock, 1> DelayedBBs;
+ SmallVector<Constant *, 16> AppendingInits;
+
+public:
+ Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
+ : Flags(Flags), TypeMapper(TypeMapper),
+ MCs(1, MappingContext(VM, Materializer)) {}
+
+ /// ValueMapper should explicitly call \a flush() before destruction.
+ ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
+
+ bool hasWorkToDo() const { return !Worklist.empty(); }
+
+ unsigned
+ registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr) {
+ MCs.push_back(MappingContext(VM, Materializer));
+ return MCs.size() - 1;
+ }
+
+ void addFlags(RemapFlags Flags);
+
+ Value *mapValue(const Value *V);
+ void remapInstruction(Instruction *I);
+ void remapFunction(Function &F);
+
+ Constant *mapConstant(const Constant *C) {
+ return cast_or_null<Constant>(mapValue(C));
+ }
+
+ /// Map metadata.
+ ///
+ /// Find the mapping for MD. Guarantees that the return will be resolved
+ /// (not an MDNode, or MDNode::isResolved() returns true).
+ Metadata *mapMetadata(const Metadata *MD);
+
+ void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID);
+ void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID);
+ void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID);
+ void scheduleRemapFunction(Function &F, unsigned MCID);
+
+ void flush();
+
+private:
+ void mapGlobalInitializer(GlobalVariable &GV, Constant &Init);
+ void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers);
+ void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee);
+ void remapFunction(Function &F, ValueToValueMapTy &VM);
+
+ ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
+ ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
+
+ Value *mapBlockAddress(const BlockAddress &BA);
+
+ /// Map metadata that doesn't require visiting operands.
+ Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
+
+ Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
+ Metadata *mapToSelf(const Metadata *MD);
+};
+
+class MDNodeMapper {
+ Mapper &M;
+
+ /// Data about a node in \a UniquedGraph.
+ struct Data {
+ bool HasChanged = false;
+ unsigned ID = ~0u;
+ TempMDNode Placeholder;
+
+ Data() {}
+ Data(Data &&X)
+ : HasChanged(std::move(X.HasChanged)), ID(std::move(X.ID)),
+ Placeholder(std::move(X.Placeholder)) {}
+ Data &operator=(Data &&X) {
+ HasChanged = std::move(X.HasChanged);
+ ID = std::move(X.ID);
+ Placeholder = std::move(X.Placeholder);
+ return *this;
+ }
+ };
+
+ /// A graph of uniqued nodes.
+ struct UniquedGraph {
+ SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
+ SmallVector<MDNode *, 16> POT; // Post-order traversal.
+
+ /// Propagate changed operands through the post-order traversal.
+ ///
+ /// Iteratively update \a Data::HasChanged for each node based on \a
+ /// Data::HasChanged of its operands, until fixed point.
+ void propagateChanges();
+
+ /// Get a forward reference to a node to use as an operand.
+ Metadata &getFwdReference(MDNode &Op);
+ };
+
+ /// Worklist of distinct nodes whose operands need to be remapped.
+ SmallVector<MDNode *, 16> DistinctWorklist;
+
+ // Storage for a UniquedGraph.
+ SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
+ SmallVector<MDNode *, 16> POTStorage;
+
+public:
+ MDNodeMapper(Mapper &M) : M(M) {}
+
+ /// Map a metadata node (and its transitive operands).
+ ///
+ /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
+ /// algorithm handles distinct nodes and uniqued node subgraphs using
+ /// different strategies.
+ ///
+ /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
+ /// using \a mapDistinctNode(). Their mapping can always be computed
+ /// immediately without visiting operands, even if their operands change.
+ ///
+ /// The mapping for uniqued nodes depends on whether their operands change.
+ /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
+ /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
+ /// added to \a DistinctWorklist with \a mapDistinctNode().
+ ///
+ /// After mapping \c N itself, this function remaps the operands of the
+ /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
+ /// N has been mapped.
+ Metadata *map(const MDNode &N);
+
+private:
+ /// Map a top-level uniqued node and the uniqued subgraph underneath it.
+ ///
+ /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
+ /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
+ /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
+ /// operands use the identity mapping.
+ ///
+ /// The algorithm works as follows:
+ ///
+ /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
+ /// save the post-order traversal in the given \a UniquedGraph, tracking
+ /// whether nodes' operands change.
+ ///
+ /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
+ /// through the \a UniquedGraph until fixed point, following the rule
+ /// that if a node changes, any node that references it must also change.
+ ///
+ /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
+ /// (referencing new operands) where necessary.
+ Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
+
+ /// Try to map the operand of an \a MDNode.
+ ///
+ /// If \c Op is already mapped, return the mapping. If it's not an \a
+ /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
+ /// return the result of \a mapDistinctNode().
+ ///
+ /// \return None if \c Op is an unmapped uniqued \a MDNode.
+ /// \post getMappedOp(Op) only returns None if this returns None.
+ Optional<Metadata *> tryToMapOperand(const Metadata *Op);
+
+ /// Map a distinct node.
+ ///
+ /// Return the mapping for the distinct node \c N, saving the result in \a
+ /// DistinctWorklist for later remapping.
+ ///
+ /// \pre \c N is not yet mapped.
+ /// \pre \c N.isDistinct().
+ MDNode *mapDistinctNode(const MDNode &N);
+
+ /// Get a previously mapped node.
+ Optional<Metadata *> getMappedOp(const Metadata *Op) const;
+
+ /// Create a post-order traversal of an unmapped uniqued node subgraph.
+ ///
+ /// This traverses the metadata graph deeply enough to map \c FirstN. It
+ /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
+ /// metadata that has already been mapped will not be part of the POT.
+ ///
+ /// Each node that has a changed operand from outside the graph (e.g., a
+ /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
+ /// is marked with \a Data::HasChanged.
+ ///
+ /// \return \c true if any nodes in \c G have \a Data::HasChanged.
+ /// \post \c G.POT is a post-order traversal ending with \c FirstN.
+ /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
+ /// to change because of operands outside the graph.
+ bool createPOT(UniquedGraph &G, const MDNode &FirstN);
+
+ /// Visit the operands of a uniqued node in the POT.
+ ///
+ /// Visit the operands in the range from \c I to \c E, returning the first
+ /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
+ /// where to continue the loop through the operands.
+ ///
+ /// This sets \c HasChanged if any of the visited operands change.
+ MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged);
+
+ /// Map all the nodes in the given uniqued graph.
+ ///
+ /// This visits all the nodes in \c G in post-order, using the identity
+ /// mapping or creating a new node depending on \a Data::HasChanged.
+ ///
+ /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
+ /// their operands outside of \c G.
+ /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
+ /// operands have changed.
+ /// \post \a getMappedOp() returns the mapped node for every node in \c G.
+ void mapNodesInPOT(UniquedGraph &G);
+
+ /// Remap a node's operands using the given functor.
+ ///
+ /// Iterate through the operands of \c N and update them in place using \c
+ /// mapOperand.
+ ///
+ /// \pre N.isDistinct() or N.isTemporary().
+ template <class OperandMapper>
+ void remapOperands(MDNode &N, OperandMapper mapOperand);
+};
+
+} // end namespace
+
+Value *Mapper::mapValue(const Value *V) {
+ ValueToValueMapTy::iterator I = getVM().find(V);
+
// If the value already exists in the map, use it.
- if (I != VM.end() && I->second) return I->second;
-
+ if (I != getVM().end()) {
+ assert(I->second && "Unexpected null mapping");
+ return I->second;
+ }
+
// If we have a materializer and it can materialize a value, use that.
- if (Materializer) {
- if (Value *NewV =
- Materializer->materializeDeclFor(const_cast<Value *>(V))) {
- VM[V] = NewV;
- if (auto *NewGV = dyn_cast<GlobalValue>(NewV))
- Materializer->materializeInitFor(
- NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V)));
+ if (auto *Materializer = getMaterializer()) {
+ if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
+ getVM()[V] = NewV;
return NewV;
}
}
@@ -51,13 +356,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
// Global values do not need to be seeded into the VM if they
// are using the identity mapping.
if (isa<GlobalValue>(V)) {
- if (Flags & RF_NullMapMissingGlobalValues) {
- assert(!(Flags & RF_IgnoreMissingEntries) &&
- "Illegal to specify both RF_NullMapMissingGlobalValues and "
- "RF_IgnoreMissingEntries");
+ if (Flags & RF_NullMapMissingGlobalValues)
return nullptr;
- }
- return VM[V] = const_cast<Value*>(V);
+ return getVM()[V] = const_cast<Value *>(V);
}
if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
@@ -70,28 +371,39 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
IA->hasSideEffects(), IA->isAlignStack());
}
-
- return VM[V] = const_cast<Value*>(V);
+
+ return getVM()[V] = const_cast<Value *>(V);
}
if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
const Metadata *MD = MDV->getMetadata();
+
+ if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
+ // Look through to grab the local value.
+ if (Value *LV = mapValue(LAM->getValue())) {
+ if (V == LAM->getValue())
+ return const_cast<Value *>(V);
+ return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
+ }
+
+ // FIXME: always return nullptr once Verifier::verifyDominatesUse()
+ // ensures metadata operands only reference defined SSA values.
+ return (Flags & RF_IgnoreMissingLocals)
+ ? nullptr
+ : MetadataAsValue::get(V->getContext(),
+ MDTuple::get(V->getContext(), None));
+ }
+
// If this is a module-level metadata and we know that nothing at the module
// level is changing, then use an identity mapping.
- if (!isa<LocalAsMetadata>(MD) && (Flags & RF_NoModuleLevelChanges))
- return VM[V] = const_cast<Value *>(V);
-
- auto *MappedMD = MapMetadata(MD, VM, Flags, TypeMapper, Materializer);
- if (MD == MappedMD || (!MappedMD && (Flags & RF_IgnoreMissingEntries)))
- return VM[V] = const_cast<Value *>(V);
-
- // FIXME: This assert crashes during bootstrap, but I think it should be
- // correct. For now, just match behaviour from before the metadata/value
- // split.
- //
- // assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) &&
- // "Referenced metadata value not in value map");
- return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD);
+ if (Flags & RF_NoModuleLevelChanges)
+ return getVM()[V] = const_cast<Value *>(V);
+
+ // Map the metadata and turn it into a value.
+ auto *MappedMD = mapMetadata(MD);
+ if (MD == MappedMD)
+ return getVM()[V] = const_cast<Value *>(V);
+ return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
}
// Okay, this either must be a constant (which may or may not be mappable) or
@@ -99,25 +411,31 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
if (!C)
return nullptr;
-
- if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
- Function *F =
- cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer));
- BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM,
- Flags, TypeMapper, Materializer));
- return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock());
- }
-
+
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return mapBlockAddress(*BA);
+
+ auto mapValueOrNull = [this](Value *V) {
+ auto Mapped = mapValue(V);
+ assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
+ "Unexpected null mapping for constant operand without "
+ "NullMapMissingGlobalValues flag");
+ return Mapped;
+ };
+
// Otherwise, we have some other constant to remap. Start by checking to see
// if all operands have an identity remapping.
unsigned OpNo = 0, NumOperands = C->getNumOperands();
Value *Mapped = nullptr;
for (; OpNo != NumOperands; ++OpNo) {
Value *Op = C->getOperand(OpNo);
- Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer);
- if (Mapped != C) break;
+ Mapped = mapValueOrNull(Op);
+ if (!Mapped)
+ return nullptr;
+ if (Mapped != Op)
+ break;
}
-
+
// See if the type mapper wants to remap the type as well.
Type *NewTy = C->getType();
if (TypeMapper)
@@ -126,23 +444,26 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
// If the result type and all operands match up, then just insert an identity
// mapping.
if (OpNo == NumOperands && NewTy == C->getType())
- return VM[V] = C;
-
+ return getVM()[V] = C;
+
// Okay, we need to create a new constant. We've already processed some or
// all of the operands, set them all up now.
SmallVector<Constant*, 8> Ops;
Ops.reserve(NumOperands);
for (unsigned j = 0; j != OpNo; ++j)
Ops.push_back(cast<Constant>(C->getOperand(j)));
-
+
// If one of the operands mismatch, push it and the other mapped operands.
if (OpNo != NumOperands) {
Ops.push_back(cast<Constant>(Mapped));
-
+
// Map the rest of the operands that aren't processed yet.
- for (++OpNo; OpNo != NumOperands; ++OpNo)
- Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM,
- Flags, TypeMapper, Materializer));
+ for (++OpNo; OpNo != NumOperands; ++OpNo) {
+ Mapped = mapValueOrNull(C->getOperand(OpNo));
+ if (!Mapped)
+ return nullptr;
+ Ops.push_back(cast<Constant>(Mapped));
+ }
}
Type *NewSrcTy = nullptr;
if (TypeMapper)
@@ -150,309 +471,407 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
+ return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
if (isa<ConstantArray>(C))
- return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
+ return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
if (isa<ConstantStruct>(C))
- return VM[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
+ return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
if (isa<ConstantVector>(C))
- return VM[V] = ConstantVector::get(Ops);
+ return getVM()[V] = ConstantVector::get(Ops);
// If this is a no-operand constant, it must be because the type was remapped.
if (isa<UndefValue>(C))
- return VM[V] = UndefValue::get(NewTy);
+ return getVM()[V] = UndefValue::get(NewTy);
if (isa<ConstantAggregateZero>(C))
- return VM[V] = ConstantAggregateZero::get(NewTy);
+ return getVM()[V] = ConstantAggregateZero::get(NewTy);
assert(isa<ConstantPointerNull>(C));
- return VM[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
-}
-
-static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key,
- Metadata *Val, ValueMaterializer *Materializer,
- RemapFlags Flags) {
- VM.MD()[Key].reset(Val);
- if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) {
- auto *N = dyn_cast_or_null<MDNode>(Val);
- // Need to invoke this once we have non-temporary MD.
- if (!N || !N->isTemporary())
- Materializer->replaceTemporaryMetadata(Key, Val);
+ return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
+}
+
+Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
+ Function *F = cast<Function>(mapValue(BA.getFunction()));
+
+ // F may not have materialized its initializer. In that case, create a
+ // dummy basic block for now, and replace it once we've materialized all
+ // the initializers.
+ BasicBlock *BB;
+ if (F->empty()) {
+ DelayedBBs.push_back(DelayedBasicBlock(BA));
+ BB = DelayedBBs.back().TempBB.get();
+ } else {
+ BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
}
- return Val;
+
+ return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
}
-static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD,
- ValueMaterializer *Materializer, RemapFlags Flags) {
- return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags);
+Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
+ getVM().MD()[Key].reset(Val);
+ return Val;
}
-static Metadata *MapMetadataImpl(const Metadata *MD,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer);
+Metadata *Mapper::mapToSelf(const Metadata *MD) {
+ return mapToMetadata(MD, const_cast<Metadata *>(MD));
+}
-static Metadata *mapMetadataOp(Metadata *Op,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
+Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
if (!Op)
return nullptr;
- if (Materializer && !Materializer->isMetadataNeeded(Op))
+ if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
+#ifndef NDEBUG
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
+ M.getVM().getMappedMD(Op)) &&
+ "Expected Value to be memoized");
+ else
+ assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
+ "Expected result to be memoized");
+#endif
+ return *MappedOp;
+ }
+
+ const MDNode &N = *cast<MDNode>(Op);
+ if (N.isDistinct())
+ return mapDistinctNode(N);
+ return None;
+}
+
+MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
+ assert(N.isDistinct() && "Expected a distinct node");
+ assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
+ DistinctWorklist.push_back(cast<MDNode>(
+ (M.Flags & RF_MoveDistinctMDs)
+ ? M.mapToSelf(&N)
+ : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone()))));
+ return DistinctWorklist.back();
+}
+
+static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
+ Value *MappedV) {
+ if (CMD.getValue() == MappedV)
+ return const_cast<ConstantAsMetadata *>(&CMD);
+ return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
+}
+
+Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
+ if (!Op)
return nullptr;
- if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags,
- TypeMapper, Materializer))
- return MappedOp;
- // Use identity map if MappedOp is null and we can ignore missing entries.
- if (Flags & RF_IgnoreMissingEntries)
+ if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
+ return *MappedOp;
+
+ if (isa<MDString>(Op))
+ return const_cast<Metadata *>(Op);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
+
+ return None;
+}
+
+Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
+ auto Where = Info.find(&Op);
+ assert(Where != Info.end() && "Expected a valid reference");
+
+ auto &OpD = Where->second;
+ if (!OpD.HasChanged)
return Op;
- // FIXME: This assert crashes during bootstrap, but I think it should be
- // correct. For now, just match behaviour from before the metadata/value
- // split.
- //
- // assert((Flags & RF_NullMapMissingGlobalValues) &&
- // "Referenced metadata not in value map!");
- return nullptr;
+ // Lazily construct a temporary node.
+ if (!OpD.Placeholder)
+ OpD.Placeholder = Op.clone();
+
+ return *OpD.Placeholder;
}
-/// Resolve uniquing cycles involving the given metadata.
-static void resolveCycles(Metadata *MD, bool AllowTemps) {
- if (auto *N = dyn_cast_or_null<MDNode>(MD)) {
- if (AllowTemps && N->isTemporary())
- return;
- if (!N->isResolved()) {
- if (AllowTemps)
- // Note that this will drop RAUW support on any temporaries, which
- // blocks uniquing. If this ends up being an issue, in the future
- // we can experiment with delaying resolving these nodes until
- // after metadata is fully materialized (i.e. when linking metadata
- // as a postpass after function importing).
- N->resolveNonTemporaries();
- else
- N->resolveCycles();
- }
+template <class OperandMapper>
+void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
+ assert(!N.isUniqued() && "Expected distinct or temporary nodes");
+ for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
+ Metadata *Old = N.getOperand(I);
+ Metadata *New = mapOperand(Old);
+
+ if (Old != New)
+ N.replaceOperandWith(I, New);
}
}
-/// Remap the operands of an MDNode.
-///
-/// If \c Node is temporary, uniquing cycles are ignored. If \c Node is
-/// distinct, uniquing cycles are resolved as they're found.
-///
-/// \pre \c Node.isDistinct() or \c Node.isTemporary().
-static bool remapOperands(MDNode &Node,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- assert(!Node.isUniqued() && "Expected temporary or distinct node");
- const bool IsDistinct = Node.isDistinct();
-
- bool AnyChanged = false;
- for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) {
- Metadata *Old = Node.getOperand(I);
- Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper,
- Materializer);
- if (Old != New) {
- AnyChanged = true;
- Node.replaceOperandWith(I, New);
-
- // Resolve uniquing cycles underneath distinct nodes on the fly so they
- // don't infect later operands.
- if (IsDistinct)
- resolveCycles(New, Flags & RF_HaveUnmaterializedMetadata);
+namespace {
+/// An entry in the worklist for the post-order traversal.
+struct POTWorklistEntry {
+ MDNode *N; ///< Current node.
+ MDNode::op_iterator Op; ///< Current operand of \c N.
+
+ /// Keep a flag of whether operands have changed in the worklist to avoid
+ /// hitting the map in \a UniquedGraph.
+ bool HasChanged = false;
+
+ POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
+};
+} // end namespace
+
+bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
+ assert(G.Info.empty() && "Expected a fresh traversal");
+ assert(FirstN.isUniqued() && "Expected uniqued node in POT");
+
+ // Construct a post-order traversal of the uniqued subgraph under FirstN.
+ bool AnyChanges = false;
+ SmallVector<POTWorklistEntry, 16> Worklist;
+ Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
+ (void)G.Info[&FirstN];
+ while (!Worklist.empty()) {
+ // Start or continue the traversal through this node's operands.
+ auto &WE = Worklist.back();
+ if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
+ // Push a new node to traverse first.
+ Worklist.push_back(POTWorklistEntry(*N));
+ continue;
}
+
+ // Push the node onto the POT.
+ assert(WE.N->isUniqued() && "Expected only uniqued nodes");
+ assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
+ auto &D = G.Info[WE.N];
+ AnyChanges |= D.HasChanged = WE.HasChanged;
+ D.ID = G.POT.size();
+ G.POT.push_back(WE.N);
+
+ // Pop the node off the worklist.
+ Worklist.pop_back();
}
+ return AnyChanges;
+}
- return AnyChanged;
-}
-
-/// Map a distinct MDNode.
-///
-/// Whether distinct nodes change is independent of their operands. If \a
-/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in
-/// place; effectively, they're moved from one graph to another. Otherwise,
-/// they're cloned/duplicated, and the new copy's operands are remapped.
-static Metadata *mapDistinctNode(const MDNode *Node,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- assert(Node->isDistinct() && "Expected distinct node");
-
- MDNode *NewMD;
- if (Flags & RF_MoveDistinctMDs)
- NewMD = const_cast<MDNode *>(Node);
- else
- NewMD = MDNode::replaceWithDistinct(Node->clone());
-
- // Remap operands later.
- DistinctWorklist.push_back(NewMD);
- return mapToMetadata(VM, Node, NewMD, Materializer, Flags);
-}
-
-/// \brief Map a uniqued MDNode.
-///
-/// Uniqued nodes may not need to be recreated (they may map to themselves).
-static Metadata *mapUniquedNode(const MDNode *Node,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) &&
- "Expected uniqued node");
-
- // Create a temporary node and map it upfront in case we have a uniquing
- // cycle. If necessary, this mapping will get updated by RAUW logic before
- // returning.
- auto ClonedMD = Node->clone();
- mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags);
- if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper,
- Materializer)) {
- // No operands changed, so use the original.
- ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node));
- // Even though replaceAllUsesWith would have replaced the value map
- // entry, we need to explictly map with the final non-temporary node
- // to replace any temporary metadata via the callback.
- return mapToSelf(VM, Node, Materializer, Flags);
+MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged) {
+ while (I != E) {
+ Metadata *Op = *I++; // Increment even on early return.
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
+ // Check if the operand changes.
+ HasChanged |= Op != *MappedOp;
+ continue;
+ }
+
+ // A uniqued metadata node.
+ MDNode &OpN = *cast<MDNode>(Op);
+ assert(OpN.isUniqued() &&
+ "Only uniqued operands cannot be mapped immediately");
+ if (G.Info.insert(std::make_pair(&OpN, Data())).second)
+ return &OpN; // This is a new one. Return it.
}
+ return nullptr;
+}
- // Uniquify the cloned node. Explicitly map it with the final non-temporary
- // node so that replacement of temporary metadata via the callback occurs.
- return mapToMetadata(VM, Node,
- MDNode::replaceWithUniqued(std::move(ClonedMD)),
- Materializer, Flags);
+void MDNodeMapper::UniquedGraph::propagateChanges() {
+ bool AnyChanges;
+ do {
+ AnyChanges = false;
+ for (MDNode *N : POT) {
+ auto &D = Info[N];
+ if (D.HasChanged)
+ continue;
+
+ if (!llvm::any_of(N->operands(), [&](const Metadata *Op) {
+ auto Where = Info.find(Op);
+ return Where != Info.end() && Where->second.HasChanged;
+ }))
+ continue;
+
+ AnyChanges = D.HasChanged = true;
+ }
+ } while (AnyChanges);
}
-static Metadata *MapMetadataImpl(const Metadata *MD,
- SmallVectorImpl<MDNode *> &DistinctWorklist,
- ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- // If the value already exists in the map, use it.
- if (Metadata *NewMD = VM.MD().lookup(MD).get())
- return NewMD;
+void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
+ // Construct uniqued nodes, building forward references as necessary.
+ SmallVector<MDNode *, 16> CyclicNodes;
+ for (auto *N : G.POT) {
+ auto &D = G.Info[N];
+ if (!D.HasChanged) {
+ // The node hasn't changed.
+ M.mapToSelf(N);
+ continue;
+ }
- if (isa<MDString>(MD))
- return mapToSelf(VM, MD, Materializer, Flags);
-
- if (isa<ConstantAsMetadata>(MD))
- if ((Flags & RF_NoModuleLevelChanges))
- return mapToSelf(VM, MD, Materializer, Flags);
-
- if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {
- Value *MappedV =
- MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);
- if (VMD->getValue() == MappedV ||
- (!MappedV && (Flags & RF_IgnoreMissingEntries)))
- return mapToSelf(VM, MD, Materializer, Flags);
-
- // FIXME: This assert crashes during bootstrap, but I think it should be
- // correct. For now, just match behaviour from before the metadata/value
- // split.
- //
- // assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) &&
- // "Referenced metadata not in value map!");
- if (MappedV)
- return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer,
- Flags);
- return nullptr;
+ // Remember whether this node had a placeholder.
+ bool HadPlaceholder(D.Placeholder);
+
+ // Clone the uniqued node and remap the operands.
+ TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
+ remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = getMappedOp(Old))
+ return *MappedOp;
+ assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
+ return &G.getFwdReference(*cast<MDNode>(Old));
+ });
+
+ auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
+ M.mapToMetadata(N, NewN);
+
+ // Nodes that were referenced out of order in the POT are involved in a
+ // uniquing cycle.
+ if (HadPlaceholder)
+ CyclicNodes.push_back(NewN);
}
- // Note: this cast precedes the Flags check so we always get its associated
- // assertion.
- const MDNode *Node = cast<MDNode>(MD);
+ // Resolve cycles.
+ for (auto *N : CyclicNodes)
+ if (!N->isResolved())
+ N->resolveCycles();
+}
- // If this is a module-level metadata and we know that nothing at the
- // module level is changing, then use an identity mapping.
- if (Flags & RF_NoModuleLevelChanges)
- return mapToSelf(VM, MD, Materializer, Flags);
+Metadata *MDNodeMapper::map(const MDNode &N) {
+ assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
+ assert(!(M.Flags & RF_NoModuleLevelChanges) &&
+ "MDNodeMapper::map assumes module-level changes");
// Require resolved nodes whenever metadata might be remapped.
- assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) &&
- "Unexpected unresolved node");
-
- if (Materializer && Node->isTemporary()) {
- assert(Flags & RF_HaveUnmaterializedMetadata);
- Metadata *TempMD =
- Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD));
- // If the above callback returned an existing temporary node, use it
- // instead of the current temporary node. This happens when earlier
- // function importing passes already created and saved a temporary
- // metadata node for the same value id.
- if (TempMD) {
- mapToMetadata(VM, MD, TempMD, Materializer, Flags);
- return TempMD;
- }
+ assert(N.isResolved() && "Unexpected unresolved node");
+
+ Metadata *MappedN =
+ N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
+ while (!DistinctWorklist.empty())
+ remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
+ return *MappedOp;
+ return mapTopLevelUniquedNode(*cast<MDNode>(Old));
+ });
+ return MappedN;
+}
+
+Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
+ assert(FirstN.isUniqued() && "Expected uniqued node");
+
+ // Create a post-order traversal of uniqued nodes under FirstN.
+ UniquedGraph G;
+ if (!createPOT(G, FirstN)) {
+ // Return early if no nodes have changed.
+ for (const MDNode *N : G.POT)
+ M.mapToSelf(N);
+ return &const_cast<MDNode &>(FirstN);
}
- if (Node->isDistinct())
- return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper,
- Materializer);
+ // Update graph with all nodes that have changed.
+ G.propagateChanges();
- return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper,
- Materializer);
+ // Map all the nodes in the graph.
+ mapNodesInPOT(G);
+
+ // Return the original node, remapped.
+ return *getMappedOp(&FirstN);
}
-Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,
- RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- SmallVector<MDNode *, 8> DistinctWorklist;
- Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper,
- Materializer);
+namespace {
- // When there are no module-level changes, it's possible that the metadata
- // graph has temporaries. Skip the logic to resolve cycles, since it's
- // unnecessary (and invalid) in that case.
- if (Flags & RF_NoModuleLevelChanges)
- return NewMD;
+struct MapMetadataDisabler {
+ ValueToValueMapTy &VM;
- // Resolve cycles involving the entry metadata.
- resolveCycles(NewMD, Flags & RF_HaveUnmaterializedMetadata);
+ MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) {
+ VM.disableMapMetadata();
+ }
+ ~MapMetadataDisabler() { VM.enableMapMetadata(); }
+};
- // Remap the operands of distinct MDNodes.
- while (!DistinctWorklist.empty())
- remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags,
- TypeMapper, Materializer);
+} // end namespace
- return NewMD;
+Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
+ // If the value already exists in the map, use it.
+ if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
+ return *NewMD;
+
+ if (isa<MDString>(MD))
+ return const_cast<Metadata *>(MD);
+
+ // This is module-level metadata. If nothing at the module level is
+ // changing, use an identity mapping.
+ if ((Flags & RF_NoModuleLevelChanges))
+ return const_cast<Metadata *>(MD);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
+ // Disallow recursion into metadata mapping through mapValue.
+ MapMetadataDisabler MMD(getVM());
+
+ // Don't memoize ConstantAsMetadata. Instead of lasting until the
+ // LLVMContext is destroyed, they can be deleted when the GlobalValue they
+ // reference is destructed. These aren't super common, so the extra
+ // indirection isn't that expensive.
+ return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
+ }
+
+ assert(isa<MDNode>(MD) && "Expected a metadata node");
+
+ return None;
}
-MDNode *llvm::MapMetadata(const MDNode *MD, ValueToValueMapTy &VM,
- RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- return cast<MDNode>(MapMetadata(static_cast<const Metadata *>(MD), VM, Flags,
- TypeMapper, Materializer));
+Metadata *Mapper::mapMetadata(const Metadata *MD) {
+ assert(MD && "Expected valid metadata");
+ assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
+
+ if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
+ return *NewMD;
+
+ return MDNodeMapper(*this).map(*cast<MDNode>(MD));
+}
+
+void Mapper::flush() {
+ // Flush out the worklist of global values.
+ while (!Worklist.empty()) {
+ WorklistEntry E = Worklist.pop_back_val();
+ CurrentMCID = E.MCID;
+ switch (E.Kind) {
+ case WorklistEntry::MapGlobalInit:
+ E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
+ break;
+ case WorklistEntry::MapAppendingVar: {
+ unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
+ mapAppendingVariable(*E.Data.AppendingGV.GV,
+ E.Data.AppendingGV.InitPrefix,
+ E.AppendingGVIsOldCtorDtor,
+ makeArrayRef(AppendingInits).slice(PrefixSize));
+ AppendingInits.resize(PrefixSize);
+ break;
+ }
+ case WorklistEntry::MapGlobalAliasee:
+ E.Data.GlobalAliasee.GA->setAliasee(
+ mapConstant(E.Data.GlobalAliasee.Aliasee));
+ break;
+ case WorklistEntry::RemapFunction:
+ remapFunction(*E.Data.RemapF);
+ break;
+ }
+ }
+ CurrentMCID = 0;
+
+ // Finish logic for block addresses now that all global values have been
+ // handled.
+ while (!DelayedBBs.empty()) {
+ DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
+ BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
+ DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
+ }
}
-/// RemapInstruction - Convert the instruction operands from referencing the
-/// current values into those specified by VMap.
-///
-void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
- RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer){
+void Mapper::remapInstruction(Instruction *I) {
// Remap operands.
- for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
- Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer);
+ for (Use &Op : I->operands()) {
+ Value *V = mapValue(Op);
// If we aren't ignoring missing entries, assert that something happened.
if (V)
- *op = V;
+ Op = V;
else
- assert((Flags & RF_IgnoreMissingEntries) &&
+ assert((Flags & RF_IgnoreMissingLocals) &&
"Referenced value not in value map!");
}
// Remap phi nodes' incoming blocks.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = MapValue(PN->getIncomingBlock(i), VMap, Flags);
+ Value *V = mapValue(PN->getIncomingBlock(i));
// If we aren't ignoring missing entries, assert that something happened.
if (V)
PN->setIncomingBlock(i, cast<BasicBlock>(V));
else
- assert((Flags & RF_IgnoreMissingEntries) &&
+ assert((Flags & RF_IgnoreMissingLocals) &&
"Referenced block not in value map!");
}
}
@@ -462,11 +881,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
I->getAllMetadata(MDs);
for (const auto &MI : MDs) {
MDNode *Old = MI.second;
- MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer);
+ MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
if (New != Old)
I->setMetadata(MI.first, New);
}
-
+
if (!TypeMapper)
return;
@@ -491,3 +910,213 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
}
I->mutateType(TypeMapper->remapType(I->getType()));
}
+
+void Mapper::remapFunction(Function &F) {
+ // Remap the operands.
+ for (Use &Op : F.operands())
+ if (Op)
+ Op = mapValue(Op);
+
+ // Remap the metadata attachments.
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ F.getAllMetadata(MDs);
+ F.clearMetadata();
+ for (const auto &I : MDs)
+ F.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+
+ // Remap the argument types.
+ if (TypeMapper)
+ for (Argument &A : F.args())
+ A.mutateType(TypeMapper->remapType(A.getType()));
+
+ // Remap the instructions.
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ remapInstruction(&I);
+}
+
+void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers) {
+ SmallVector<Constant *, 16> Elements;
+ if (InitPrefix) {
+ unsigned NumElements =
+ cast<ArrayType>(InitPrefix->getType())->getNumElements();
+ for (unsigned I = 0; I != NumElements; ++I)
+ Elements.push_back(InitPrefix->getAggregateElement(I));
+ }
+
+ PointerType *VoidPtrTy;
+ Type *EltTy;
+ if (IsOldCtorDtor) {
+ // FIXME: This upgrade is done during linking to support the C API. See
+ // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
+ VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
+ auto &ST = *cast<StructType>(NewMembers.front()->getType());
+ Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+ EltTy = StructType::get(GV.getContext(), Tys, false);
+ }
+
+ for (auto *V : NewMembers) {
+ Constant *NewV;
+ if (IsOldCtorDtor) {
+ auto *S = cast<ConstantStruct>(V);
+ auto *E1 = mapValue(S->getOperand(0));
+ auto *E2 = mapValue(S->getOperand(1));
+ Value *Null = Constant::getNullValue(VoidPtrTy);
+ NewV =
+ ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ } else {
+ NewV = cast_or_null<Constant>(mapValue(V));
+ }
+ Elements.push_back(NewV);
+ }
+
+ GV.setInitializer(ConstantArray::get(
+ cast<ArrayType>(GV.getType()->getElementType()), Elements));
+}
+
+void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalInit;
+ WE.MCID = MCID;
+ WE.Data.GVInit.GV = &GV;
+ WE.Data.GVInit.Init = &Init;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapAppendingVar;
+ WE.MCID = MCID;
+ WE.Data.AppendingGV.GV = &GV;
+ WE.Data.AppendingGV.InitPrefix = InitPrefix;
+ WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
+ WE.AppendingGVNumNewMembers = NewMembers.size();
+ Worklist.push_back(WE);
+ AppendingInits.append(NewMembers.begin(), NewMembers.end());
+}
+
+void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalAliasee;
+ WE.MCID = MCID;
+ WE.Data.GlobalAliasee.GA = &GA;
+ WE.Data.GlobalAliasee.Aliasee = &Aliasee;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::RemapFunction;
+ WE.MCID = MCID;
+ WE.Data.RemapF = &F;
+ Worklist.push_back(WE);
+}
+
+void Mapper::addFlags(RemapFlags Flags) {
+ assert(!hasWorkToDo() && "Expected to have flushed the worklist");
+ this->Flags = this->Flags | Flags;
+}
+
+static Mapper *getAsMapper(void *pImpl) {
+ return reinterpret_cast<Mapper *>(pImpl);
+}
+
+namespace {
+
+class FlushingMapper {
+ Mapper &M;
+
+public:
+ explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
+ assert(!M.hasWorkToDo() && "Expected to be flushed");
+ }
+ ~FlushingMapper() { M.flush(); }
+ Mapper *operator->() const { return &M; }
+};
+
+} // end namespace
+
+ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer)
+ : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
+
+ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
+
+unsigned
+ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer) {
+ return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
+}
+
+void ValueMapper::addFlags(RemapFlags Flags) {
+ FlushingMapper(pImpl)->addFlags(Flags);
+}
+
+Value *ValueMapper::mapValue(const Value &V) {
+ return FlushingMapper(pImpl)->mapValue(&V);
+}
+
+Constant *ValueMapper::mapConstant(const Constant &C) {
+ return cast_or_null<Constant>(mapValue(C));
+}
+
+Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
+ return FlushingMapper(pImpl)->mapMetadata(&MD);
+}
+
+MDNode *ValueMapper::mapMDNode(const MDNode &N) {
+ return cast_or_null<MDNode>(mapMetadata(N));
+}
+
+void ValueMapper::remapInstruction(Instruction &I) {
+ FlushingMapper(pImpl)->remapInstruction(&I);
+}
+
+void ValueMapper::remapFunction(Function &F) {
+ FlushingMapper(pImpl)->remapFunction(F);
+}
+
+void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
+ Constant &Init,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
+}
+
+void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAppendingVariable(
+ GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID);
+}
+
+void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
+}
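
The scheduling entry points above only queue work; any flushing entry point (mapValue, mapMetadata, remapInstruction, remapFunction, addFlags) drains the worklist before doing its own job. A minimal usage sketch, assuming the header's defaults for the optional type-remapper/materializer arguments, and with NewGV, OldGV and V as hypothetical placeholders not taken from this patch:

    // Sketch only: NewGV, OldGV and V are hypothetical.
    ValueToValueMapTy VM;              // populated by the caller (e.g. a cloner)
    ValueMapper Mapper(VM, RF_None);
    // Defer mapping of the cloned global's initializer.
    Mapper.scheduleMapGlobalInitializer(*NewGV, *OldGV->getInitializer(),
                                        /*MCID=*/0);
    // Any flushing call processes the queued work before returning.
    Value *Mapped = Mapper.mapValue(*V);
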
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 8844d574a79d..af594cb751aa 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -397,7 +397,7 @@ namespace {
Instruction *I, Instruction *J);
bool vectorizeBB(BasicBlock &BB) {
- if (skipOptnoneFunction(BB))
+ if (skipBasicBlock(BB))
return false;
if (!DT->isReachableFromEntry(&BB)) {
DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
@@ -886,9 +886,16 @@ namespace {
Type *DestTy = C->getDestTy();
if (!DestTy->isSingleValueType())
return false;
- } else if (isa<SelectInst>(I)) {
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
if (!Config.VectorizeSelect)
return false;
+ // We can vectorize a select if either all operands are scalars,
+ // or all operands are vectors. Trying to "widen" a select between
+ // vectors that has a scalar condition results in a malformed select.
+ // FIXME: We could probably be smarter about this by rewriting the select
+ // with different types instead.
+ return (SI->getCondition()->getType()->isVectorTy() ==
+ SI->getTrueValue()->getType()->isVectorTy());
} else if (isa<CmpInst>(I)) {
if (!Config.VectorizeCmp)
return false;
@@ -1117,16 +1124,25 @@ namespace {
}
if (IID && TTI) {
+ FastMathFlags FMFCI;
+ if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
+ FMFCI = FPMOCI->getFastMathFlags();
+
SmallVector<Type*, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(CI->getArgOperand(i)->getType());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
Tys.clear();
CallInst *CJ = cast<CallInst>(J);
+
+ FastMathFlags FMFCJ;
+ if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
+ FMFCJ = FPMOCJ->getFastMathFlags();
+
for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
Tys.push_back(CJ->getArgOperand(i)->getType());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
Tys.clear();
assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
@@ -1140,8 +1156,10 @@ namespace {
CJ->getArgOperand(i)->getType()));
}
+ FastMathFlags FMFV = FMFCI;
+ FMFV &= FMFCJ;
Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
if (VCost > ICost + JCost)
return false;
@@ -1259,7 +1277,7 @@ namespace {
bool JAfterStart = IAfterStart;
BasicBlock::iterator J = std::next(I);
for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
- if (&*J == Start)
+ if (J == Start)
JAfterStart = true;
// Determine if J uses I, if so, exit the loop.
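
The cost-model hunk above now feeds fast-math flags into getIntrinsicInstrCost, and the flags used for the fused call are the conjunction of the two scalar calls' flags (FMFV &= FMFCJ). A standalone sketch of that intersection, with an illustrative helper name:

    // Sketch: keep only the fast-math flags that both calls agree on.
    static FastMathFlags intersectCallFMF(const CallInst *CI, const CallInst *CJ) {
      FastMathFlags A, B;
      if (const auto *FP = dyn_cast<FPMathOperator>(CI))
        A = FP->getFastMathFlags();
      if (const auto *FP = dyn_cast<FPMathOperator>(CJ))
        B = FP->getFastMathFlags();
      A &= B;
      return A;
    }
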
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 905c069cf851..23c2ab025f37 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,8 +1,9 @@
add_llvm_library(LLVMVectorize
BBVectorize.cpp
- Vectorize.cpp
+ LoadStoreVectorizer.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
+ Vectorize.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
new file mode 100644
index 000000000000..c8906bde15e0
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -0,0 +1,999 @@
+//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-store-vectorizer"
+STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
+STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+
+namespace {
+
+// TODO: Remove this
+static const unsigned TargetBaseAlign = 4;
+
+class Vectorizer {
+ typedef SmallVector<Value *, 8> ValueList;
+ typedef MapVector<Value *, ValueList> ValueListMap;
+
+ Function &F;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+ ValueListMap StoreRefs;
+ ValueListMap LoadRefs;
+
+public:
+ Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
+ ScalarEvolution &SE, TargetTransformInfo &TTI)
+ : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+ DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
+
+ bool run();
+
+private:
+ Value *getPointerOperand(Value *I);
+
+ unsigned getPointerAddressSpace(Value *I);
+
+ unsigned getAlignment(LoadInst *LI) const {
+ unsigned Align = LI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(LI->getType());
+ }
+
+ unsigned getAlignment(StoreInst *SI) const {
+ unsigned Align = SI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
+
+ bool isConsecutiveAccess(Value *A, Value *B);
+
+ /// After vectorization, reorder the instructions that I depends on
+ /// (the instructions defining its operands), to ensure they dominate I.
+ void reorder(Instruction *I);
+
+ /// Returns the first and the last instructions in Chain.
+ std::pair<BasicBlock::iterator, BasicBlock::iterator>
+ getBoundaryInstrs(ArrayRef<Value *> Chain);
+
+ /// Erases the original instructions after vectorizing.
+ void eraseInstructions(ArrayRef<Value *> Chain);
+
+ /// "Legalize" the vector type that would be produced by combining \p
+ /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
+ /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
+ /// expected to have more than 4 elements.
+ std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
+ splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits);
+
+ /// Checks for instructions which may affect the memory accessed
+ /// in the chain between \p From and \p To. Returns Index, where
+ /// \p Chain[0, Index) is the largest vectorizable chain prefix.
+ /// The elements of \p Chain should be all loads or all stores.
+ unsigned getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,
+ BasicBlock::iterator From,
+ BasicBlock::iterator To);
+
+ /// Collects load and store instructions to vectorize.
+ void collectInstructions(BasicBlock *BB);
+
+ /// Processes the collected instructions in \p Map. The elements of \p Map
+ /// should be all loads or all stores.
+ bool vectorizeChains(ValueListMap &Map);
+
+ /// Finds the load/stores to consecutive memory addresses and vectorizes them.
+ bool vectorizeInstructions(ArrayRef<Value *> Instrs);
+
+ /// Vectorizes the load instructions in Chain.
+ bool vectorizeLoadChain(ArrayRef<Value *> Chain,
+ SmallPtrSet<Value *, 16> *InstructionsProcessed);
+
+ /// Vectorizes the store instructions in Chain.
+ bool vectorizeStoreChain(ArrayRef<Value *> Chain,
+ SmallPtrSet<Value *, 16> *InstructionsProcessed);
+
+ /// Checks whether this load/store access is misaligned.
+ bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment);
+};
+
+class LoadStoreVectorizer : public FunctionPass {
+public:
+ static char ID;
+
+ LoadStoreVectorizer() : FunctionPass(ID) {
+ initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ const char *getPassName() const override {
+ return "GPU Load and Store Vectorizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+
+char LoadStoreVectorizer::ID = 0;
+
+Pass *llvm::createLoadStoreVectorizerPass() {
+ return new LoadStoreVectorizer();
+}
+
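
As a usage note, createLoadStoreVectorizerPass() is the factory a backend would call to put this pass into its IR pipeline. A hedged sketch, where MyTargetPassConfig is a placeholder and not part of this patch:

    // Sketch only: illustrative TargetPassConfig override.
    void MyTargetPassConfig::addIRPasses() {
      addPass(createLoadStoreVectorizerPass());
      TargetPassConfig::addIRPasses();
    }
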
+bool LoadStoreVectorizer::runOnFunction(Function &F) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ return V.run();
+}
+
+// Vectorizer Implementation
+bool Vectorizer::run() {
+ bool Changed = false;
+
+ // Scan the blocks in the function in post order.
+ for (BasicBlock *BB : post_order(&F)) {
+ collectInstructions(BB);
+ Changed |= vectorizeChains(LoadRefs);
+ Changed |= vectorizeChains(StoreRefs);
+ }
+
+ return Changed;
+}
+
+Value *Vectorizer::getPointerOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+unsigned Vectorizer::getPointerAddressSpace(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+// FIXME: Merge with llvm::isConsecutiveAccess
+bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getPointerAddressSpace(A);
+ unsigned ASB = getPointerAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same size type.
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ Type *PtrATy = PtrA->getType()->getPointerElementType();
+ Type *PtrBTy = PtrB->getType()->getPointerElementType();
+ if (PtrA == PtrB ||
+ DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+ DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+ DL.getTypeStoreSize(PtrBTy->getScalarType()))
+ return false;
+
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == Size;
+
+ // Compute the necessary base pointer delta to have the necessary final delta
+ // equal to the size.
+ APInt BaseDelta = Size - OffsetDelta;
+
+ // Compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *C = SE.getConstant(BaseDelta);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
+ if (X == PtrSCEVB)
+ return true;
+
+ // Sometimes even this doesn't work, because SCEV can't always see through
+ // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+ // things the hard way.
+
+ // Look through GEPs after checking they're the same except for the last
+ // index.
+ GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
+ GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
+ if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+ return false;
+ unsigned FinalIndex = GEPA->getNumOperands() - 1;
+ for (unsigned i = 0; i < FinalIndex; i++)
+ if (GEPA->getOperand(i) != GEPB->getOperand(i))
+ return false;
+
+ Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+ Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+ if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+ OpA->getType() != OpB->getType())
+ return false;
+
+ // Only look through a ZExt/SExt.
+ if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+ return false;
+
+ bool Signed = isa<SExtInst>(OpA);
+
+ OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+ OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+ if (!OpA || !OpB || OpA->getType() != OpB->getType())
+ return false;
+
+ // Now we need to prove that adding 1 to OpA won't overflow.
+ bool Safe = false;
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+ // we're okay.
+ if (OpB->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(OpB->getOperand(1)) &&
+ cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+ if (Signed)
+ Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
+ else
+ Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+ }
+
+ unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+
+ // Second attempt:
+ // If any bits are known to be zero other than the sign bit in OpA, we can
+ // add 1 to it while guaranteeing no overflow of any sort.
+ if (!Safe) {
+ APInt KnownZero(BitWidth, 0);
+ APInt KnownOne(BitWidth, 0);
+ computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
+ KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
+ if (KnownZero != 0)
+ Safe = true;
+ }
+
+ if (!Safe)
+ return false;
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+ const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+ return X2 == OffsetSCEVB;
+}
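
The known-bits fallback above rests on a small arithmetic fact: if any bit of OpA below the sign bit is known zero, OpA can be neither the all-ones value nor the signed maximum, so OpA + 1 overflows in neither the signed nor the unsigned sense. A standalone restatement of that predicate, with an illustrative helper name:

    // Sketch: true when adding 1 to a value with these known-zero bits cannot
    // wrap in either sense.
    static bool incrementCannotWrap(const APInt &KnownZero, unsigned BitWidth) {
      APInt BelowSign = KnownZero & ~APInt::getHighBitsSet(BitWidth, 1);
      return BelowSign != 0; // some non-sign bit is known zero
    }
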
+
+void Vectorizer::reorder(Instruction *I) {
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ if (!DT.dominates(IM, I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ assert(IM->getParent() == IW->getParent() &&
+ "Instructions to move should be in the same basic block");
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+ ++BBI) {
+ if (!is_contained(InstructionsToMove, &*BBI))
+ continue;
+ Instruction *IM = &*BBI;
+ --BBI;
+ IM->removeFromParent();
+ IM->insertBefore(I);
+ }
+}
+
+std::pair<BasicBlock::iterator, BasicBlock::iterator>
+Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
+ Instruction *C0 = cast<Instruction>(Chain[0]);
+ BasicBlock::iterator FirstInstr = C0->getIterator();
+ BasicBlock::iterator LastInstr = C0->getIterator();
+
+ BasicBlock *BB = C0->getParent();
+ unsigned NumFound = 0;
+ for (Instruction &I : *BB) {
+ if (!is_contained(Chain, &I))
+ continue;
+
+ ++NumFound;
+ if (NumFound == 1) {
+ FirstInstr = I.getIterator();
+ }
+ if (NumFound == Chain.size()) {
+ LastInstr = I.getIterator();
+ break;
+ }
+ }
+
+ // Range is [first, last).
+ return std::make_pair(FirstInstr, ++LastInstr);
+}
+
+void Vectorizer::eraseInstructions(ArrayRef<Value *> Chain) {
+ SmallVector<Instruction *, 16> Instrs;
+ for (Value *V : Chain) {
+ Value *PtrOperand = getPointerOperand(V);
+ assert(PtrOperand && "Instruction must have a pointer operand.");
+ Instrs.push_back(cast<Instruction>(V));
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
+ Instrs.push_back(GEP);
+ }
+
+ // Erase instructions.
+ for (Value *V : Instrs) {
+ Instruction *Instr = cast<Instruction>(V);
+ if (Instr->use_empty())
+ Instr->eraseFromParent();
+ }
+}
+
+std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain,
+ unsigned ElementSizeBits) {
+ unsigned ElemSizeInBytes = ElementSizeBits / 8;
+ unsigned SizeInBytes = ElemSizeInBytes * Chain.size();
+ unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;
+ unsigned NumLeft = Chain.size() - NumRight;
+ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
+}
+
+unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,
+ BasicBlock::iterator From,
+ BasicBlock::iterator To) {
+ SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;
+ SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;
+
+ unsigned InstrIdx = 0;
+ for (auto I = From; I != To; ++I, ++InstrIdx) {
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ if (!is_contained(Chain, &*I))
+ MemoryInstrs.push_back({&*I, InstrIdx});
+ else
+ ChainInstrs.push_back({&*I, InstrIdx});
+ } else if (I->mayHaveSideEffects()) {
+ DEBUG(dbgs() << "LSV: Found side-effecting operation: " << *I << '\n');
+ return 0;
+ }
+ }
+
+ assert(Chain.size() == ChainInstrs.size() &&
+ "All instructions in the Chain must exist in [From, To).");
+
+ unsigned ChainIdx = 0;
+ for (auto EntryChain : ChainInstrs) {
+ Value *ChainInstrValue = EntryChain.first;
+ unsigned ChainInstrIdx = EntryChain.second;
+ for (auto EntryMem : MemoryInstrs) {
+ Value *MemInstrValue = EntryMem.first;
+ unsigned MemInstrIdx = EntryMem.second;
+ if (isa<LoadInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue))
+ continue;
+
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (isa<StoreInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue) &&
+ ChainInstrIdx < MemInstrIdx)
+ continue;
+
+ // Same case, but in reverse.
+ if (isa<LoadInst>(MemInstrValue) && isa<StoreInst>(ChainInstrValue) &&
+ ChainInstrIdx > MemInstrIdx)
+ continue;
+
+ Instruction *M0 = cast<Instruction>(MemInstrValue);
+ Instruction *M1 = cast<Instruction>(ChainInstrValue);
+
+ if (!AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1))) {
+ DEBUG({
+ Value *Ptr0 = getPointerOperand(M0);
+ Value *Ptr1 = getPointerOperand(M1);
+
+ dbgs() << "LSV: Found alias.\n"
+ " Aliasing instruction and pointer:\n"
+ << *MemInstrValue << " aliases " << *Ptr0 << '\n'
+ << " Aliased instruction and pointer:\n"
+ << *ChainInstrValue << " aliases " << *Ptr1 << '\n';
+ });
+
+ return ChainIdx;
+ }
+ }
+ ChainIdx++;
+ }
+ return Chain.size();
+}
+
+void Vectorizer::collectInstructions(BasicBlock *BB) {
+ LoadRefs.clear();
+ StoreRefs.clear();
+
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
+ continue;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isSimple())
+ continue;
+
+ Type *Ty = LI->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if (TySize < 8)
+ continue;
+
+ Value *Ptr = LI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2)
+ continue;
+
+ // Make sure all the users of a vector are constant-index extracts.
+ if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
+ const Instruction *UI = cast<Instruction>(U);
+ return isa<ExtractElementInst>(UI) &&
+ isa<ConstantInt>(UI->getOperand(1));
+ }))
+ continue;
+
+ // TODO: Target hook to filter types.
+
+ // Save the load locations.
+ Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ LoadRefs[ObjPtr].push_back(LI);
+
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if (TySize < 8)
+ continue;
+
+ Value *Ptr = SI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ if (TySize > VecRegSize / 2)
+ continue;
+
+ if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
+ const Instruction *UI = cast<Instruction>(U);
+ return isa<ExtractElementInst>(UI) &&
+ isa<ConstantInt>(UI->getOperand(1));
+ }))
+ continue;
+
+ // Save store location.
+ Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ StoreRefs[ObjPtr].push_back(SI);
+ }
+ }
+}
+
+bool Vectorizer::vectorizeChains(ValueListMap &Map) {
+ bool Changed = false;
+
+ for (const std::pair<Value *, ValueList> &Chain : Map) {
+ unsigned Size = Chain.second.size();
+ if (Size < 2)
+ continue;
+
+ DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+
+ // Process the chain elements in chunks of 64.
+ for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
+ unsigned Len = std::min<unsigned>(CE - CI, 64);
+ ArrayRef<Value *> Chunk(&Chain.second[CI], Len);
+ Changed |= vectorizeInstructions(Chunk);
+ }
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) {
+ DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
+ SmallSetVector<int, 16> Heads, Tails;
+ int ConsecutiveChain[64];
+
+ // Do a quadratic search over all of the given accesses and find all of the
+ // pairs of accesses that follow each other.
+ for (int i = 0, e = Instrs.size(); i < e; ++i) {
+ ConsecutiveChain[i] = -1;
+ for (int j = e - 1; j >= 0; --j) {
+ if (i == j)
+ continue;
+
+ if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+ if (ConsecutiveChain[i] != -1) {
+ int CurDistance = std::abs(ConsecutiveChain[i] - i);
+ int NewDistance = std::abs(ConsecutiveChain[i] - j);
+ if (j < i || NewDistance > CurDistance)
+ continue; // Should not insert.
+ }
+
+ Tails.insert(j);
+ Heads.insert(i);
+ ConsecutiveChain[i] = j;
+ }
+ }
+ }
+
+ bool Changed = false;
+ SmallPtrSet<Value *, 16> InstructionsProcessed;
+
+ for (int Head : Heads) {
+ if (InstructionsProcessed.count(Instrs[Head]))
+ continue;
+ bool longerChainExists = false;
+ for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
+ if (Head == Tails[TIt] &&
+ !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
+ longerChainExists = true;
+ break;
+ }
+ if (longerChainExists)
+ continue;
+
+ // We found an instr that starts a chain. Now follow the chain and try to
+ // vectorize it.
+ SmallVector<Value *, 16> Operands;
+ int I = Head;
+ while (I != -1 && (Tails.count(I) || Heads.count(I))) {
+ if (InstructionsProcessed.count(Instrs[I]))
+ break;
+
+ Operands.push_back(Instrs[I]);
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = false;
+ if (isa<LoadInst>(*Operands.begin()))
+ Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
+ else
+ Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeStoreChain(
+ ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
+ StoreInst *S0 = cast<StoreInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole store.
+ Type *StoreTy;
+ for (const auto &V : Chain) {
+ StoreTy = cast<StoreInst>(V)->getValueOperand()->getType();
+ if (StoreTy->isIntOrIntVectorTy())
+ break;
+
+ if (StoreTy->isPtrOrPtrVectorTy()) {
+ StoreTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(StoreTy));
+ break;
+ }
+ }
+
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned AS = S0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
+ if (StopChain == 0) {
+ // There exists a side effect instruction, no vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (StopChain == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(Chain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = Chain.slice(0, StopChain);
+ ChainSize = Chain.size();
+
+ // Store size should be 1B, 2B or multiple of 4B.
+ // TODO: Target hook for size constraint?
+ unsigned SzInBytes = (Sz / 8) * ChainSize;
+ if (SzInBytes > 2 && SzInBytes % 4 != 0) {
+ DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
+ "or multiple of 4B. Splitting.\n");
+ if (SzInBytes == 3)
+ return vectorizeStoreChain(Chain.slice(0, ChainSize - 1),
+ InstructionsProcessed);
+
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ VectorType *VecTy;
+ VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
+ if (VecStoreTy)
+ VecTy = VectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
+ else
+ VecTy = VectorType::get(StoreTy, Chain.size());
+
+ // If it's more than the max vector size, break it into two pieces.
+ // TODO: Target hook to control types to split to.
+ if (ChainSize > VF) {
+ DEBUG(dbgs() << "LSV: Vector factor is too big."
+ " Creating two separate arrays.\n");
+ return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed);
+ }
+
+ DEBUG({
+ dbgs() << "LSV: Stores to vectorize:\n";
+ for (Value *V : Chain)
+ V->dump();
+ });
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // Check alignment restrictions.
+ unsigned Alignment = getAlignment(S0);
+
+ // If the store is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (S0->getPointerAddressSpace() != 0)
+ return false;
+
+ // If we're storing to an object on the stack, we control its alignment,
+ // so we can cheat and change it!
+ Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
+ if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
+ AI->setAlignment(TargetBaseAlign);
+ Alignment = TargetBaseAlign;
+ } else {
+ return false;
+ }
+ }
+
+ // Set insert point.
+ Builder.SetInsertPoint(&*Last);
+
+ Value *Vec = UndefValue::get(VecTy);
+
+ if (VecStoreTy) {
+ unsigned VecWidth = VecStoreTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
+ unsigned NewIdx = J + I * VecWidth;
+ Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
+ Builder.getInt32(J));
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
+ Vec = Insert;
+ }
+ }
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ Value *Extract = Store->getValueOperand();
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract =
+ Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
+ Vec = Insert;
+ }
+ }
+
+ Value *Bitcast =
+ Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS));
+ StoreInst *SI = cast<StoreInst>(Builder.CreateStore(Vec, Bitcast));
+ propagateMetadata(SI, Chain);
+ SI->setAlignment(Alignment);
+
+ eraseInstructions(Chain);
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::vectorizeLoadChain(
+ ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
+ LoadInst *L0 = cast<LoadInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole load.
+ Type *LoadTy;
+ for (const auto &V : Chain) {
+ LoadTy = cast<LoadInst>(V)->getType();
+ if (LoadTy->isIntOrIntVectorTy())
+ break;
+
+ if (LoadTy->isPtrOrPtrVectorTy()) {
+ LoadTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(LoadTy));
+ break;
+ }
+ }
+
+ unsigned Sz = DL.getTypeSizeInBits(LoadTy);
+ unsigned AS = L0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
+ if (StopChain == 0) {
+ // There exists a side effect instruction, no vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (StopChain == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(Chain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = Chain.slice(0, StopChain);
+ ChainSize = Chain.size();
+
+ // Load size should be 1B, 2B or multiple of 4B.
+ // TODO: Should size constraint be a target hook?
+ unsigned SzInBytes = (Sz / 8) * ChainSize;
+ if (SzInBytes > 2 && SzInBytes % 4 != 0) {
+ DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
+ "or multiple of 4B. Splitting.\n");
+ if (SzInBytes == 3)
+ return vectorizeLoadChain(Chain.slice(0, ChainSize - 1),
+ InstructionsProcessed);
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ VectorType *VecTy;
+ VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
+ if (VecLoadTy)
+ VecTy = VectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
+ else
+ VecTy = VectorType::get(LoadTy, Chain.size());
+
+ // If it's more than the max vector size, break it into two pieces.
+ // TODO: Target hook to control types to split to.
+ if (ChainSize > VF) {
+ DEBUG(dbgs() << "LSV: Vector factor is too big. "
+ "Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed);
+ }
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // Check alignment restrictions.
+ unsigned Alignment = getAlignment(L0);
+
+ // If the load is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (L0->getPointerAddressSpace() != 0)
+ return false;
+
+ // If we're loading from an object on the stack, we control its alignment,
+ // so we can cheat and change it!
+ Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
+ if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
+ AI->setAlignment(TargetBaseAlign);
+ Alignment = TargetBaseAlign;
+ } else {
+ return false;
+ }
+ }
+
+ DEBUG({
+ dbgs() << "LSV: Loads to vectorize:\n";
+ for (Value *V : Chain)
+ V->dump();
+ });
+
+ // Set insert point.
+ Builder.SetInsertPoint(&*First);
+
+ Value *Bitcast =
+ Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
+
+ LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
+ propagateMetadata(LI, Chain);
+ LI->setAlignment(Alignment);
+
+ if (VecLoadTy) {
+ SmallVector<Instruction *, 16> InstrsToErase;
+ SmallVector<Instruction *, 16> InstrsToReorder;
+ InstrsToReorder.push_back(cast<Instruction>(Bitcast));
+
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ for (auto Use : Chain[I]->users()) {
+ Instruction *UI = cast<Instruction>(Use);
+ unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
+ unsigned NewIdx = Idx + I * VecWidth;
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx));
+ Instruction *Extracted = cast<Instruction>(V);
+ if (Extracted->getType() != UI->getType())
+ Extracted = cast<Instruction>(
+ Builder.CreateBitCast(Extracted, UI->getType()));
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(Extracted);
+ InstrsToErase.push_back(UI);
+ }
+ }
+
+ for (Instruction *ModUser : InstrsToReorder)
+ reorder(ModUser);
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+ } else {
+ SmallVector<Instruction *, 16> InstrsToReorder;
+ InstrsToReorder.push_back(cast<Instruction>(Bitcast));
+
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
+ Instruction *Extracted = cast<Instruction>(V);
+ Instruction *UI = cast<Instruction>(Chain[I]);
+ if (Extracted->getType() != UI->getType()) {
+ Extracted = cast<Instruction>(
+ Builder.CreateBitOrPointerCast(Extracted, UI->getType()));
+ }
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(Extracted);
+ }
+
+ for (Instruction *ModUser : InstrsToReorder)
+ reorder(ModUser);
+ }
+
+ eraseInstructions(Chain);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment) {
+ bool Fast = false;
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
+ Alignment, &Fast);
+ // TODO: Remove TargetBaseAlign
+ return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
+ (Alignment % TargetBaseAlign) != 0;
+}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 17c25dfffc10..8b85e320d3b2 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -46,7 +46,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
@@ -56,23 +56,15 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -98,10 +90,10 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <functional>
#include <map>
#include <tuple>
@@ -115,37 +107,21 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
/// We don't vectorize loops with a known constant trip count below this number.
-static cl::opt<unsigned>
-TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
- cl::Hidden,
- cl::desc("Don't vectorize loops with a constant "
- "trip count that is smaller than this "
- "value."));
+static cl::opt<unsigned> TinyTripCountVectorThreshold(
+ "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
+ cl::desc("Don't vectorize loops with a constant "
+ "trip count that is smaller than this "
+ "value."));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
-/// This enables versioning on the strides of symbolically striding memory
-/// accesses in code like the following.
-/// for (i = 0; i < N; ++i)
-/// A[i * Stride1] += B[i * Stride2] ...
-///
-/// Will be roughly translated to
-/// if (Stride1 == 1 && Stride2 == 1) {
-/// for (i = 0; i < N; i+=4)
-/// A[i:i+3] += ...
-/// } else
-/// ...
-static cl::opt<bool> EnableMemAccessVersioning(
- "enable-mem-access-versioning", cl::init(true), cl::Hidden,
- cl::desc("Enable symbolic stride memory access versioning"));
-
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
@@ -262,7 +238,7 @@ public:
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
-static Type* ToVectorTy(Type *Scalar, unsigned VF) {
+static Type *ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
return Scalar;
return VectorType::get(Scalar, VF);
@@ -313,21 +289,25 @@ public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, unsigned VecWidth,
- unsigned UnrollFactor)
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ unsigned VecWidth, unsigned UnrollFactor)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
- Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
- TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
- AddedSafetyChecks(false) {}
+ AC(AC), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()), Induction(nullptr),
+ OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr),
+ VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {}
// Perform the actual loop widening (vectorization).
// MinimumBitWidths maps scalar integer values to the smallest bitwidth they
// can be validly truncated to. The cost model has assumed this truncation
- // will happen when vectorizing.
+ // will happen when vectorizing. VecValuesToIgnore contains scalar values
+ // that the cost model has chosen to ignore because they will not be
+ // vectorized.
void vectorize(LoopVectorizationLegality *L,
- MapVector<Instruction*,uint64_t> MinimumBitWidths) {
- MinBWs = MinimumBitWidths;
+ const MapVector<Instruction *, uint64_t> &MinimumBitWidths,
+ SmallPtrSetImpl<const Value *> &VecValuesToIgnore) {
+ MinBWs = &MinimumBitWidths;
+ ValuesNotWidened = &VecValuesToIgnore;
Legal = L;
// Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop();
@@ -337,33 +317,41 @@ public:
}
// Return true if any runtime check is added.
- bool IsSafetyChecksAdded() {
- return AddedSafetyChecks;
- }
+ bool areSafetyChecksAdded() { return AddedSafetyChecks; }
virtual ~InnerLoopVectorizer() {}
protected:
/// A small list of PHINodes.
- typedef SmallVector<PHINode*, 4> PhiVector;
+ typedef SmallVector<PHINode *, 4> PhiVector;
/// When we unroll loops we have multiple vector values for each scalar.
/// This data structure holds the unrolled and vectorized values that
/// originated from one scalar instruction.
- typedef SmallVector<Value*, 2> VectorParts;
+ typedef SmallVector<Value *, 2> VectorParts;
// When we if-convert we need to create edge masks. We have to cache values
// so that we don't end up with exponential recursion/IR.
- typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
- VectorParts> EdgeMaskCache;
+ typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
+ EdgeMaskCache;
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop();
+
+ /// Set up the values of the IVs correctly when exiting the vector loop.
+ void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock);
+
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
/// Copy and widen the instructions from the old loop.
virtual void vectorizeLoop();
+ /// Fix a first-order recurrence. This is the second phase of vectorizing
+ /// this phi node.
+ void fixFirstOrderRecurrence(PHINode *Phi);
+
/// \brief The Loop exit block may have single value PHI nodes where the
/// incoming value is 'Undef'. While vectorizing we only handled real values
/// that were defined inside the loop. Here we fix the 'undef case'.
@@ -372,7 +360,7 @@ protected:
/// Shrinks vector element sizes based on information in "MinBWs".
void truncateToMinimalBitwidths();
-
+
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
@@ -383,12 +371,12 @@ protected:
/// A helper function to vectorize a single BB within the innermost loop.
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
-
+
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
- unsigned UF, unsigned VF, PhiVector *PV);
+ void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF,
+ unsigned VF, PhiVector *PV);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
@@ -399,7 +387,7 @@ protected:
/// scalarized instruction behind an if block predicated on the control
/// dependence of the instruction.
virtual void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore=false);
+ bool IfPredicateStore = false);
/// Vectorize Load and Store instructions,
virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -415,6 +403,26 @@ protected:
/// to each vector element of Val. The sequence starts at StartIndex.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step, and
+ /// \p EntryVal is the value from the original loop that maps to the steps.
+ /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
+ /// can be a truncate instruction).
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
+
+ /// Create a vector induction phi node based on an existing scalar one. This
+ /// currently only works for integer induction variables with a constant
+ /// step. If \p TruncType is non-null, instead of widening the original IV,
+ /// we widen a version of the IV truncated to \p TruncType.
+ void createVectorIntInductionPHI(const InductionDescriptor &II,
+ VectorParts &Entry, IntegerType *TruncType);
+
+ /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
+ /// induction variable will first be truncated to the corresponding type. The
+ /// widened values are placed in \p Entry.
+ void widenIntInduction(PHINode *IV, VectorParts &Entry,
+ TruncInst *Trunc = nullptr);
+
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values
@@ -445,6 +453,24 @@ protected:
/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// \brief Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
/// This is a helper class that holds the vectorizer state. It maps scalar
/// instructions to vector instructions. When the code is 'unrolled' then
/// then a single scalar value is mapped to multiple vector parts. The parts
@@ -501,6 +527,15 @@ protected:
const TargetLibraryInfo *TLI;
/// Target Transform Info.
const TargetTransformInfo *TTI;
+ /// Assumption Cache.
+ AssumptionCache *AC;
+
+ /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+ /// memchecks. The actual versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
@@ -522,11 +557,11 @@ protected:
BasicBlock *LoopScalarPreHeader;
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
- ///The ExitBlock of the scalar loop.
+ /// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
- ///The vector loop body.
- SmallVector<BasicBlock *, 4> LoopVectorBody;
- ///The scalar loop body.
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+ /// The scalar loop body.
BasicBlock *LoopScalarBody;
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
@@ -537,9 +572,20 @@ protected:
PHINode *OldInduction;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
+
+ /// A map of induction variables from the original loop to their
+ /// corresponding VF * UF scalarized values in the vectorized loop. The
+ /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
+ /// maps original loop values to their vector versions in the new loop,
+ /// ScalarIVMap maps induction variables from the original loop that are not
+ /// vectorized to their scalar equivalents in the vector loop. Maintaining a
+ /// separate map for scalarized induction variables allows us to avoid
+ /// unnecessary scalar-to-vector-to-scalar conversions.
+ DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;
+
/// Store instructions that should be predicated, as a pair
/// <StoreInst, Predicate>
- SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
+ SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
EdgeMaskCache MaskCache;
/// Trip count of the original loop.
Value *TripCount;
@@ -549,10 +595,15 @@ protected:
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
- MapVector<Instruction*,uint64_t> MinBWs;
+ const MapVector<Instruction *, uint64_t> *MinBWs;
+
+ /// A set of values that should not be widened. This is taken from
+ /// VecValuesToIgnore in the cost model.
+ SmallPtrSetImpl<const Value *> *ValuesNotWidened;
+
LoopVectorizationLegality *Legal;
- // Record whether runtime check is added.
+ // Record whether runtime checks are added.
bool AddedSafetyChecks;
};
@@ -561,8 +612,10 @@ public:
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, unsigned UnrollFactor)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, 1,
+ UnrollFactor) {}
private:
void scalarizeInstruction(Instruction *Instr,
@@ -618,36 +671,26 @@ static std::string getDebugLocString(const Loop *L) {
}
#endif
-/// \brief Propagate known metadata from one instruction to another.
-static void propagateMetadata(Instruction *To, const Instruction *From) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
- From->getAllMetadataOtherThanDebugLoc(Metadata);
-
- for (auto M : Metadata) {
- unsigned Kind = M.first;
-
- // These are safe to transfer (this is safe for TBAA, even when we
- // if-convert, because should that metadata have had a control dependency
- // on the condition, and thus actually aliased with some other
- // non-speculated memory access when the condition was false, this would be
- // caught by the runtime overlap checks).
- if (Kind != LLVMContext::MD_tbaa &&
- Kind != LLVMContext::MD_alias_scope &&
- Kind != LLVMContext::MD_noalias &&
- Kind != LLVMContext::MD_fpmath &&
- Kind != LLVMContext::MD_nontemporal)
- continue;
+void InnerLoopVectorizer::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
- To->setMetadata(Kind, M.second);
- }
+void InnerLoopVectorizer::addMetadata(Instruction *To,
+ Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
}
-/// \brief Propagate known metadata from one instruction to a vector of others.
-static void propagateMetadata(SmallVectorImpl<Value *> &To,
- const Instruction *From) {
- for (Value *V : To)
+void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
+ Instruction *From) {
+ for (Value *V : To) {
if (Instruction *I = dyn_cast<Instruction>(V))
- propagateMetadata(I, From);
+ addMetadata(I, From);
+ }
}
/// \brief The group of interleaved loads/stores sharing the same stride and
@@ -785,8 +828,9 @@ private:
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT)
- : PSE(PSE), TheLoop(L), DT(DT) {}
+ DominatorTree *DT, LoopInfo *LI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+ RequiresScalarEpilogue(false) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
@@ -806,6 +850,14 @@ public:
return InterleaveGroupMap.count(Instr);
}
+ /// \brief Return the maximum interleave factor of all interleaved groups.
+ unsigned getMaxInterleaveFactor() const {
+ unsigned MaxFactor = 1;
+ for (auto &Entry : InterleaveGroupMap)
+ MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
+ return MaxFactor;
+ }
+
/// \brief Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
@@ -815,6 +867,13 @@ public:
return nullptr;
}
+ /// \brief Returns true if an interleaved group that may access memory
+ /// out-of-bounds requires a scalar epilogue iteration for correctness.
+ bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+
+ /// \brief Initialize the LoopAccessInfo used for dependence checking.
+ void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -823,24 +882,39 @@ private:
PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI;
+
+ /// True if the loop may contain non-reversed interleaved groups with
+ /// out-of-bounds accesses. We ensure we don't speculatively access memory
+ /// out-of-bounds by executing at least one scalar epilogue iteration.
+ bool RequiresScalarEpilogue;
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+ /// Holds dependences among the memory accesses in the loop. It maps a source
+ /// access to a set of dependent sink accesses.
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
- StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
+ StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
unsigned Align)
: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
- StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
+ StrideDescriptor() = default;
- int Stride; // The access's stride. It is negative for a reverse access.
- const SCEV *Scev; // The scalar expression of this access
- unsigned Size; // The size of the memory object.
- unsigned Align; // The alignment of this access.
+ // The access's stride. It is negative for a reverse access.
+ int64_t Stride = 0;
+ const SCEV *Scev = nullptr; // The scalar expression of this access
+ uint64_t Size = 0; // The size of the memory object.
+ unsigned Align = 0; // The alignment of this access.
};
+ /// \brief A type for holding instructions and their stride descriptors.
+ typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
@@ -863,9 +937,86 @@ private:
}
/// \brief Collect all the accesses with a constant stride in program order.
- void collectConstStridedAccesses(
- MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
+ void collectConstStrideAccesses(
+ MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides);
+
+ /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ static bool isStrided(int Stride) {
+ unsigned Factor = std::abs(Stride);
+ return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+ }
+
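For context, a standalone C++ sketch (not part of this patch) of the kind of source loop whose accesses form an interleaved group: both loads share a base, have stride 2, and differ only in their offsets, so isStrided() accepts them as long as the factor stays within MaxInterleaveGroupFactor.

struct Pixel {
  float R;
  float G;
};

// P[I].R and P[I].G are strided loads of the same base with stride 2 and
// adjacent offsets, i.e. a candidate factor-2 interleaved load group.
float sumChannels(const Pixel *P, int N) {
  float Sum = 0.0f;
  for (int I = 0; I < N; ++I)
    Sum += P[I].R + P[I].G;
  return Sum;
}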
+ /// \brief Returns true if \p BB is a predicated block.
+ bool isPredicated(BasicBlock *BB) const {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+ }
+
+ /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ bool areDependencesValid() const {
+ return LAI && LAI->getDepChecker().getDependences();
+ }
+
+ /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
+ /// necessary, when constructing interleaved groups.
+ ///
+ /// \p A must precede \p B in program order. We return false if reordering is
+ /// not necessary or is prevented because \p A and \p B may be dependent.
+ bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
+ StrideEntry *B) const {
+
+ // Code motion for interleaved accesses can potentially hoist strided loads
+ // and sink strided stores. The code below checks the legality of the
+ // following two conditions:
+ //
+ // 1. Potentially moving a strided load (B) before any store (A) that
+ // precedes B, or
+ //
+ // 2. Potentially moving a strided store (A) after any load or store (B)
+ // that A precedes.
+ //
+ // It's legal to reorder A and B if we know there isn't a dependence from A
+ // to B. Note that this determination is conservative since some
+ // dependences could potentially be reordered safely.
+
+ // A is potentially the source of a dependence.
+ auto *Src = A->first;
+ auto SrcDes = A->second;
+
+ // B is potentially the sink of a dependence.
+ auto *Sink = B->first;
+ auto SinkDes = B->second;
+
+ // Code motion for interleaved accesses can't violate WAR dependences.
+ // Thus, reordering is legal if the source isn't a write.
+ if (!Src->mayWriteToMemory())
+ return true;
+
+ // At least one of the accesses must be strided.
+ if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+ return true;
+
+ // If dependence information is not available from LoopAccessInfo,
+ // conservatively assume the instructions can't be reordered.
+ if (!areDependencesValid())
+ return false;
+
+ // If we know there is a dependence from source to sink, assume the
+ // instructions can't be reordered. Otherwise, reordering is legal.
+ return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+ }
+
+ /// \brief Collect the dependences from LoopAccessInfo.
+ ///
+ /// We process the dependences once during the interleaved access analysis to
+ /// enable constant-time dependence queries.
+ void collectDependences() {
+ if (!areDependencesValid())
+ return;
+ auto *Deps = LAI->getDepChecker().getDependences();
+ for (auto Dep : *Deps)
+ Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+ }
};
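A minimal standalone sketch (plain STL, with ints standing in for instructions) of the lookup structure collectDependences() builds: once the source-to-sinks map exists, the check at the end of canReorderMemAccessesForInterleavedGroups becomes a constant-time query.

#include <unordered_map>
#include <unordered_set>

using Access = int; // Stand-in for an Instruction pointer.

struct DependenceIndex {
  // Maps each dependence source to the set of its sinks, built once up front.
  std::unordered_map<Access, std::unordered_set<Access>> Dependences;

  void addDependence(Access Src, Access Sink) {
    Dependences[Src].insert(Sink);
  }

  // Reordering Src past Sink is legal unless a recorded dependence runs from
  // Src to Sink.
  bool canReorder(Access Src, Access Sink) const {
    auto It = Dependences.find(Src);
    return It == Dependences.end() || !It->second.count(Sink);
  }
};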
/// Utility class for getting and setting loop vectorizer hints in the form
@@ -878,20 +1029,16 @@ private:
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
- enum HintKind {
- HK_WIDTH,
- HK_UNROLL,
- HK_FORCE
- };
+ enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
/// Hint - associates name and validation with the hint value.
struct Hint {
- const char * Name;
+ const char *Name;
unsigned Value; // This may have to change for non-numeric values.
HintKind Kind;
- Hint(const char * Name, unsigned Value, HintKind Kind)
- : Name(Name), Value(Value), Kind(Kind) { }
+ Hint(const char *Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) {}
bool validate(unsigned Val) {
switch (Kind) {
@@ -916,6 +1063,9 @@ class LoopVectorizeHints {
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
+ /// True if there is any unsafe math in the loop.
+ bool PotentiallyUnsafe;
+
public:
enum ForceKind {
FK_Undefined = -1, ///< Not selected.
@@ -928,7 +1078,7 @@ public:
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- TheLoop(L) {
+ PotentiallyUnsafe(false), TheLoop(L) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -1005,16 +1155,17 @@ public:
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+
+ /// \brief If hints are provided that force vectorization, use the AlwaysPrint
+ /// pass name to force the frontend to print the diagnostic.
const char *vectorizeAnalysisPassName() const {
- // If hints are provided that don't disable vectorization use the
- // AlwaysPrint pass name to force the frontend to print the diagnostic.
if (getWidth() == 1)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Disabled)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
return LV_NAME;
- return DiagnosticInfo::AlwaysPrint;
+ return DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint;
}
bool allowReordering() const {
@@ -1026,6 +1177,17 @@ public:
return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
}
+ bool isPotentiallyUnsafe() const {
+ // Avoid FP vectorization if the target is unsure about proper support.
+ // This may be related to the SIMD unit in the target not handling
+ // IEEE 754 FP ops properly, or bad single-to-double promotions.
+ // Otherwise, a sequence of vectorized loops, even without reduction,
+ // could lead to different end results on the destination vectors.
+ return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+ }
+
+ void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
+
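The width/interleave/force values these accessors expose usually originate from source-level loop pragmas; a hedged example (assuming a clang frontend) of a loop that emits the llvm.loop.vectorize.* and llvm.loop.interleave.count metadata that getHintsFromMetadata() below parses back into Width, Interleave, and Force:

void scale(float *A, const float *B, int N) {
  // Lowers to llvm.loop.vectorize.enable, llvm.loop.vectorize.width, and
  // llvm.loop.interleave.count entries on the loop's !llvm.loop metadata.
#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  for (int I = 0; I < N; ++I)
    A[I] = 2.0f * B[I];
}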
private:
/// Find hints specified in the loop metadata and update local values.
void getHintsFromMetadata() {
@@ -1071,7 +1233,8 @@ private:
Name = Name.substr(Prefix().size(), StringRef::npos);
const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C) return;
+ if (!C)
+ return;
unsigned Val = C->getZExtValue();
Hint *Hints[] = {&Width, &Interleave, &Force};
@@ -1097,7 +1260,7 @@ private:
/// Matches metadata with hint name.
bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
- MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+ MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
if (!Name)
return false;
@@ -1181,17 +1344,17 @@ static void emitMissedWarning(Function *F, Loop *L,
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
- LoopAccessAnalysis *LAA,
- LoopVectorizationRequirements *R,
- const LoopVectorizeHints *H)
+ LoopVectorizationLegality(
+ Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+ const TargetTransformInfo *TTI,
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
+ LoopVectorizationRequirements *R, LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
- TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
- Requirements(R), Hints(H) {}
+ TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr),
+ InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+ Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -1199,7 +1362,11 @@ public:
/// InductionList saves induction variables and maps them to the
/// induction descriptor.
- typedef MapVector<PHINode*, InductionDescriptor> InductionList;
+ typedef MapVector<PHINode *, InductionDescriptor> InductionList;
+
+ /// RecurrenceSet contains the phi nodes that are recurrences other than
+ /// inductions and reductions.
+ typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
@@ -1215,6 +1382,9 @@ public:
/// Returns the induction variables found in the loop.
InductionList *getInductionVars() { return &Inductions; }
+ /// Return the first-order recurrences found in the loop.
+ RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
+
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
@@ -1224,11 +1394,14 @@ public:
/// Returns True if PN is a reduction variable in this loop.
bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+ /// Returns True if Phi is a first-order recurrence in this loop.
+ bool isFirstOrderRecurrence(const PHINode *Phi);
+
/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB);
- /// Check if this pointer is consecutive when vectorizing. This happens
+ /// Check if this pointer is consecutive when vectorizing. This happens
/// when the last index of the GEP is the induction variable, or that the
/// pointer itself is an induction variable.
/// This check allows us to vectorize A[idx] into a wide load/store.
@@ -1242,35 +1415,39 @@ public:
bool isUniform(Value *V);
/// Returns true if this instruction will remain scalar after vectorization.
- bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
+ bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
}
- const LoopAccessInfo *getLAI() const {
- return LAI;
- }
+ const LoopAccessInfo *getLAI() const { return LAI; }
/// \brief Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
return InterleaveInfo.isInterleaved(Instr);
}
+ /// \brief Return the maximum interleave factor of all interleaved groups.
+ unsigned getMaxInterleaveFactor() const {
+ return InterleaveInfo.getMaxInterleaveFactor();
+ }
+
/// \brief Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
}
+ /// \brief Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
- bool hasStride(Value *V) { return StrideSet.count(V); }
- bool mustCheckStrides() { return !StrideSet.empty(); }
- SmallPtrSet<Value *, 8>::iterator strides_begin() {
- return StrideSet.begin();
- }
- SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
+ bool hasStride(Value *V) { return LAI->hasStride(V); }
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
@@ -1282,20 +1459,24 @@ public:
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
- /// Returns true if vector representation of the instruction \p I
- /// requires mask.
- bool isMaskRequired(const Instruction* I) {
- return (MaskedOp.count(I) != 0);
- }
- unsigned getNumStores() const {
- return LAI->getNumStores();
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI->isLegalMaskedScatter(DataType);
}
- unsigned getNumLoads() const {
- return LAI->getNumLoads();
- }
- unsigned getNumPredStores() const {
- return NumPredStores;
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI->isLegalMaskedGather(DataType);
}
+
+ /// Returns true if vector representation of the instruction \p I
+ /// requires mask.
+ bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
+ unsigned getNumStores() const { return LAI->getNumStores(); }
+ unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ unsigned getNumPredStores() const { return NumPredStores; }
+
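For illustration, a standalone sketch (not from this patch) of a non-consecutive access pattern where the two new queries matter: the indexed load below has no constant stride, so it can only be widened if the target reports isLegalMaskedGather for the element type, and is scalarized otherwise.

// In[Idx[I]] is a gather: its addresses are arbitrary, so there is no wide
// consecutive load to emit. Out[I] remains a unit-stride (consecutive) access.
void gatherAdd(float *Out, const float *In, const int *Idx, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] += In[Idx[I]];
}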
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -1320,11 +1501,11 @@ private:
/// and we know that we can read from them without segfault.
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
- /// \brief Collect memory access with loop invariant strides.
- ///
- /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
- /// invariant.
- void collectStridedAccess(Value *LoadOrStoreInst);
+ /// Updates the vectorization state by adding \p Phi to the inductions list.
+ /// This can set \p Phi as the main induction of the loop if \p Phi is a
+ /// better choice for the main induction than the existing one.
+ void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit);
/// Report an analysis message to assist the user in diagnosing loops that are
/// not vectorized. These are handled as LoopAccessReport rather than
@@ -1334,6 +1515,16 @@ private:
emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}
+ /// \brief If an access has a symbolic stride, this maps the pointer value to
+ /// the stride symbol.
+ const ValueToValueMap *getSymbolicStrides() {
+ // FIXME: Currently, the set of symbolic strides is sometimes queried before
+ // it's collected. This happens from canVectorizeWithIfConvert, when the
+ // pointer is checked to reference consecutive elements suitable for a
+ // masked access.
+ return LAI ? &LAI->getSymbolicStrides() : nullptr;
+ }
+
unsigned NumPredStores;
/// The loop that we evaluate.
@@ -1353,7 +1544,7 @@ private:
/// Dominator Tree.
DominatorTree *DT;
// LoopAccess analysis.
- LoopAccessAnalysis *LAA;
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
// And the loop-accesses info corresponding to this loop. This pointer is
// null until canVectorizeMemory sets it up.
const LoopAccessInfo *LAI;
@@ -1373,15 +1564,17 @@ private:
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
+ /// Holds the phi nodes that are first-order recurrences.
+ RecurrenceSet FirstOrderRecurrences;
/// Holds the widest induction type encountered.
Type *WidestIndTy;
- /// Allowed outside users. This holds the reduction
+ /// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
- SmallPtrSet<Value*, 4> AllowedExit;
+ SmallPtrSet<Value *, 4> AllowedExit;
/// This set holds the variables which are known to be uniform after
/// vectorization.
- SmallPtrSet<Instruction*, 4> Uniforms;
+ SmallPtrSet<Instruction *, 4> Uniforms;
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
@@ -1390,10 +1583,7 @@ private:
LoopVectorizationRequirements *Requirements;
/// Used to emit an analysis of any legality issues.
- const LoopVectorizeHints *Hints;
-
- ValueToValueMap Strides;
- SmallPtrSet<Value *, 8> StrideSet;
+ LoopVectorizeHints *Hints;
/// While vectorizing these instructions we have to generate a
/// call to the appropriate masked intrinsic
@@ -1409,20 +1599,19 @@ private:
/// different operations.
class LoopVectorizationCostModel {
public:
- LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC, const Function *F,
- const LoopVectorizeHints *Hints,
- SmallPtrSetImpl<const Value *> &ValuesToIgnore)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
+ const LoopVectorizeHints *Hints)
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), TheFunction(F), Hints(Hints) {}
/// Information about vectorization costs
struct VectorizationFactor {
unsigned Width; // Vector width with best cost
- unsigned Cost; // Cost of the loop with that width
+ unsigned Cost; // Cost of the loop with that width
};
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to VF. If UserVF is not ZERO
@@ -1462,19 +1651,34 @@ public:
/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
- SmallVector<RegisterUsage, 8>
- calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
+ SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
private:
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
+ typedef std::pair<unsigned, bool> VectorizationCostTy;
+
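A minimal standalone sketch, under the assumption that per-instruction (cost, is-vector) pairs are folded by summing the costs and OR-ing the booleans, of how a VectorizationCostTy for a whole candidate VF could be accumulated; the patch itself only introduces the pair type and the new signatures.

#include <utility>
#include <vector>

// (Cost, AnyVectorWork): the bool records whether at least one contributing
// operation is expected to remain a genuine vector op after legalization.
using VectorizationCostTy = std::pair<unsigned, bool>;

VectorizationCostTy accumulate(const std::vector<VectorizationCostTy> &Costs) {
  VectorizationCostTy Total(0, false);
  for (const auto &C : Costs) {
    Total.first += C.first;
    Total.second = Total.second || C.second;
  }
  return Total;
}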
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
- unsigned expectedCost(unsigned VF);
+ VectorizationCostTy expectedCost(unsigned VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- unsigned getInstructionCost(Instruction *I, unsigned VF);
+ VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter.
+ unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
@@ -1492,12 +1696,12 @@ public:
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
- MapVector<Instruction*,uint64_t> MinBWs;
+ MapVector<Instruction *, uint64_t> MinBWs;
/// The loop that we evaluate.
Loop *TheLoop;
- /// Scev analysis.
- ScalarEvolution *SE;
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
/// Loop Info analysis.
LoopInfo *LI;
/// Vectorization legality.
@@ -1506,13 +1710,17 @@ public:
const TargetTransformInfo &TTI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
- /// Demanded bits analysis
+ /// Demanded bits analysis.
DemandedBits *DB;
+ /// Assumption cache.
+ AssumptionCache *AC;
const Function *TheFunction;
- // Loop Vectorize Hint.
+ /// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
- // Values to ignore in the cost model.
- const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
/// \brief This holds vectorization requirements that must be verified late in
@@ -1588,328 +1796,35 @@ struct LoopVectorize : public FunctionPass {
static char ID;
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
- : FunctionPass(ID),
- DisableUnrolling(NoUnrolling),
- AlwaysVectorize(AlwaysVectorize) {
+ : FunctionPass(ID) {
+ Impl.DisableUnrolling = NoUnrolling;
+ Impl.AlwaysVectorize = AlwaysVectorize;
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
- ScalarEvolution *SE;
- LoopInfo *LI;
- TargetTransformInfo *TTI;
- DominatorTree *DT;
- BlockFrequencyInfo *BFI;
- TargetLibraryInfo *TLI;
- DemandedBits *DB;
- AliasAnalysis *AA;
- AssumptionCache *AC;
- LoopAccessAnalysis *LAA;
- bool DisableUnrolling;
- bool AlwaysVectorize;
-
- BlockFrequency ColdEntryFreq;
+ LoopVectorizePass Impl;
bool runOnFunction(Function &F) override {
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- LAA = &getAnalysis<LoopAccessAnalysis>();
- DB = &getAnalysis<DemandedBits>();
-
- // Compute some weights outside of the loop over the loops. Compute this
- // using a BranchProbability to re-use its scaling math.
- const BranchProbability ColdProb(1, 5); // 20%
- ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
-
- // Don't attempt if
- // 1. the target claims to have no vector registers, and
- // 2. interleaving won't help ILP.
- //
- // The second condition is necessary because, even if the target has no
- // vector registers, loop vectorization may still enable scalar
- // interleaving.
- if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ if (skipFunction(F))
return false;
- // Build up a worklist of inner-loops to vectorize. This is necessary as
- // the act of vectorizing or partially unrolling a loop creates new loops
- // and can invalidate iterators across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *L : *LI)
- addInnerLoop(*L, Worklist);
-
- LoopsAnalyzed += Worklist.size();
-
- // Now walk the identified inner loops.
- bool Changed = false;
- while (!Worklist.empty())
- Changed |= processLoop(Worklist.pop_back_val());
-
- // Process each loop nest in the function.
- return Changed;
- }
-
- static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
- }
-
- bool processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
-
-#ifndef NDEBUG
- const std::string DebugLocStr = getDebugLocString(L);
-#endif /* NDEBUG */
-
- DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
-
- LoopVectorizeHints Hints(L, DisableUnrolling);
-
- DEBUG(dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?")) << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
-
- // Function containing loop
- Function *F = L->getHeader()->getParent();
-
- // Looking at the diagnostic output is the only way to determine if a loop
- // was vectorized (other than looking at the IR or machine code), so it
- // is important to generate an optimization remark for each loop. Most of
- // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
- // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
- // less verbose reporting vectorized loops and unvectorized loops that may
- // benefit from vectorization, respectively.
-
- if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
- DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
- return false;
- }
-
- // Check the loop for a trip count threshold:
- // do not vectorize loops with a tiny trip count.
- const unsigned TC = SE->getSmallConstantTripCount(L);
- if (TC > 0u && TC < TinyTripCountVectorThreshold) {
- DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is not worth vectorizing.");
- if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
- else {
- DEBUG(dbgs() << "\n");
- emitAnalysisDiag(F, L, Hints, VectorizationReport()
- << "vectorization is not beneficial "
- "and is not explicitly forced");
- return false;
- }
- }
-
- PredicatedScalarEvolution PSE(*SE);
-
- // Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
- &Requirements, &Hints);
- if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- emitMissedWarning(F, L, Hints);
- return false;
- }
-
- // Collect values we want to ignore in the cost model. This includes
- // type-promoting instructions we identified during reduction detection.
- SmallPtrSet<const Value *, 32> ValuesToIgnore;
- CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
- for (auto &Reduction : *LVL.getReductionVars()) {
- RecurrenceDescriptor &RedDes = Reduction.second;
- SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
- ValuesToIgnore.insert(Casts.begin(), Casts.end());
- }
-
- // Use the cost model.
- LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
- F, &Hints, ValuesToIgnore);
-
- // Check the function attributes to find out if this function should be
- // optimized for size.
- bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- F->optForSize();
-
- // Compute the weighted frequency of this loop being executed and see if it
- // is less than 20% of the function entry baseline frequency. Note that we
- // always have a canonical loop here because we think we *can* vectorize.
- // FIXME: This is hidden behind a flag due to pervasive problems with
- // exactly what block frequency models.
- if (LoopVectorizeWithBlockFrequency) {
- BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- LoopEntryFreq < ColdEntryFreq)
- OptForSize = true;
- }
-
- // Check the function attributes to see if implicit floats are allowed.
- // FIXME: This check doesn't seem possibly correct -- what if the loop is
- // an integer loop and the vector instructions selected are purely integer
- // vector instructions?
- if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- emitAnalysisDiag(
- F, L, Hints,
- VectorizationReport()
- << "loop not vectorized due to NoImplicitFloat attribute");
- emitMissedWarning(F, L, Hints);
- return false;
- }
-
- // Select the optimal vectorization factor.
- const LoopVectorizationCostModel::VectorizationFactor VF =
- CM.selectVectorizationFactor(OptForSize);
-
- // Select the interleave count.
- unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
-
- // Get user interleave count.
- unsigned UserIC = Hints.getInterleave();
-
- // Identify the diagnostic messages that should be produced.
- std::string VecDiagMsg, IntDiagMsg;
- bool VectorizeLoop = true, InterleaveLoop = true;
-
- if (Requirements.doesNotMeet(F, L, Hints)) {
- DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
- emitMissedWarning(F, L, Hints);
- return false;
- }
-
- if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- VecDiagMsg =
- "the cost-model indicates that vectorization is not beneficial";
- VectorizeLoop = false;
- }
-
- if (IC == 1 && UserIC <= 1) {
- // Tell the user interleaving is not beneficial.
- DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
- IntDiagMsg =
- "the cost-model indicates that interleaving is not beneficial";
- InterleaveLoop = false;
- if (UserIC == 1)
- IntDiagMsg +=
- " and is explicitly disabled or interleave count is set to 1";
- } else if (IC > 1 && UserIC == 1) {
- // Tell the user interleaving is beneficial, but it explicitly disabled.
- DEBUG(dbgs()
- << "LV: Interleaving is beneficial but is explicitly disabled.");
- IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
- "but is explicitly disabled or interleave count is set to 1";
- InterleaveLoop = false;
- }
-
- // Override IC if user provided an interleave count.
- IC = UserIC > 0 ? UserIC : IC;
-
- // Emit diagnostic messages, if any.
- const char *VAPassName = Hints.vectorizeAnalysisPassName();
- if (!VectorizeLoop && !InterleaveLoop) {
- // Do not vectorize or interleaving the loop.
- emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
- L->getStartLoc(), VecDiagMsg);
- emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
- L->getStartLoc(), IntDiagMsg);
- return false;
- } else if (!VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
- L->getStartLoc(), VecDiagMsg);
- } else if (VectorizeLoop && !InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
- L->getStartLoc(), IntDiagMsg);
- } else if (VectorizeLoop && InterleaveLoop) {
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- }
-
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop then
- // interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
- Unroller.vectorize(&LVL, CM.MinBWs);
-
- emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
- Twine("interleaved loop (interleaved count: ") +
- Twine(IC) + ")");
- } else {
- // If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
- LB.vectorize(&LVL, CM.MinBWs);
- ++LoopsVectorized;
-
- // Add metadata to disable runtime unrolling scalar loop when there's no
- // runtime check about strides and memory. Because at this situation,
- // scalar loop is rarely used not worthy to be unrolled.
- if (!LB.IsSafetyChecksAdded())
- AddRuntimeUnrollDisableMetaData(L);
-
- // Report the vectorization decision.
- emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
- Twine("vectorized loop (vectorization width: ") +
- Twine(VF.Width) + ", interleaved count: " +
- Twine(IC) + ")");
- }
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
- DEBUG(verifyFunction(*L->getHeader()->getParent()));
- return true;
+ return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
+ GetLAA);
}
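A small standalone sketch (hypothetical Loop/LoopAccessInfo stand-ins) of the callback shape introduced here: instead of handing legality a pass pointer, the legacy pass captures its analysis wrapper in a std::function so the per-loop info can be fetched lazily.

#include <functional>

struct Loop {};           // Stand-in for llvm::Loop.
struct LoopAccessInfo {}; // Stand-in for llvm::LoopAccessInfo.

struct Provider {
  LoopAccessInfo Info;
  const LoopAccessInfo &getInfo(Loop &) { return Info; }
};

// Mirrors the GetLAA lambda above: capture the provider by reference and
// defer the actual query until a specific loop is analyzed.
std::function<const LoopAccessInfo &(Loop &)> makeGetLAA(Provider &P) {
  return [&P](Loop &L) -> const LoopAccessInfo & { return P.getInfo(L); };
}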
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1922,15 +1837,13 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LoopAccessAnalysis>();
- AU.addRequired<DemandedBits>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
-
};
} // end anonymous namespace
@@ -1943,9 +1856,7 @@ struct LoopVectorize : public FunctionPass {
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr =
- (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
- Instr->getParent()) != LoopVectorBody.end());
+ bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
@@ -1959,6 +1870,111 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
+void InnerLoopVectorizer::createVectorIntInductionPHI(
+ const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType) {
+ Value *Start = II.getStartValue();
+ ConstantInt *Step = II.getConstIntStepValue();
+ assert(Step && "Can not widen an IV with a non-constant step");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (TruncType) {
+ Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+ Builder.restoreIP(CurrIP);
+
+ Value *SplatVF =
+ ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
+ VF * Step->getSExtValue()));
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ Value *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = LastInduction;
+ LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add");
+ }
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorBody);
+}
+
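A standalone numeric sketch (plain C++, no IR) of the values the widened induction produces for VF = 4, UF = 2, Start = 0, Step = 1: the PHI starts at the splatted start plus <0,1,2,3>*Step, and every unroll part advances it by the splatted VF*Step.

#include <array>
#include <cstdio>

int main() {
  const int VF = 4, UF = 2, Start = 0, Step = 1;
  // SteppedStart: Start + <0,1,2,3> * Step.
  std::array<int, VF> Lanes = {Start, Start + Step, Start + 2 * Step,
                               Start + 3 * Step};
  for (int Part = 0; Part < UF; ++Part) {
    std::printf("part %d:", Part);
    for (int L : Lanes)
      std::printf(" %d", L);
    std::printf("\n");
    for (int &L : Lanes)
      L += VF * Step; // "step.add": add the splatted VF * Step.
  }
  return 0; // Prints "part 0: 0 1 2 3" and "part 1: 4 5 6 7".
}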
+void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
+ TruncInst *Trunc) {
+
+ auto II = Legal->getInductionVars()->find(IV);
+ assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
+
+ auto ID = II->second;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+
+ // If a truncate instruction was provided, get the smaller type.
+ auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;
+
+ // The step of the induction.
+ Value *Step = nullptr;
+
+ // If the induction variable has a constant integer step value, go ahead and
+ // get it now.
+ if (ID.getConstIntStepValue())
+ Step = ID.getConstIntStepValue();
+
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (VF > 1 && IV->getType() == Induction->getType() && Step &&
+ !ValuesNotWidened->count(IV))
+ return createVectorIntInductionPHI(ID, Entry, TruncType);
+
+ // The scalar value to broadcast. This will be derived from the canonical
+ // induction variable.
+ Value *ScalarIV = nullptr;
+
+ // Define the scalar induction variable and step values. If we were given a
+ // truncation type, truncate the canonical induction variable and constant
+ // step. Otherwise, derive these values from the induction descriptor.
+ if (TruncType) {
+ assert(Step && "Truncation requires constant integer step");
+ auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
+ ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
+ Step = ConstantInt::getSigned(TruncType, StepInt);
+ } else {
+ ScalarIV = Induction;
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ if (IV != OldInduction) {
+ ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
+ if (!Step) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ &*Builder.GetInsertPoint());
+ }
+ }
+
+ // Splat the scalar induction variable, and build the necessary step vectors.
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+
+ // If an induction variable is only used for counting loop iterations or
+ // calculating addresses, it doesn't need to be widened. Create scalar steps
+ // that can be used by instructions we will later scalarize. Note that the
+ // addition of the scalar steps will not increase the number of instructions
+ // in the loop in the common case prior to InstCombine. We will be trading
+ // one vector extract for each scalar step.
+ if (VF > 1 && ValuesNotWidened->count(IV)) {
+ auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
+ buildScalarSteps(ScalarIV, Step, EntryVal);
+ }
+}
+
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
@@ -1970,7 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
int VLen = Ty->getNumElements();
- SmallVector<Constant*, 8> Indices;
+ SmallVector<Constant *, 8> Indices;
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
@@ -1987,6 +2003,27 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return Builder.CreateAdd(Val, Step, "induction");
}
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+ Value *EntryVal) {
+
+ // We shouldn't have to build scalar steps if we aren't vectorizing.
+ assert(VF > 1 && "VF should be greater than one");
+
+ // Get the value type and ensure it and the step have the same integer type.
+ Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+ assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same integer type");
+
+ // Compute the scalar steps and save the results in ScalarIVMap.
+ for (unsigned Part = 0; Part < UF; ++Part)
+ for (unsigned I = 0; I < VF; ++I) {
+ auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
+ auto *Mul = Builder.CreateMul(StartIdx, Step);
+ auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+ ScalarIVMap[EntryVal].push_back(Add);
+ }
+}
+
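The arithmetic in buildScalarSteps, restated as a standalone sketch: for unroll part Part and lane Lane, the scalar step is ScalarIV + (Part * VF + Lane) * Step.

#include <vector>

std::vector<int> scalarSteps(int ScalarIV, int Step, unsigned VF, unsigned UF) {
  std::vector<int> Steps;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Steps.push_back(ScalarIV + int(Part * VF + Lane) * Step);
  return Steps; // e.g. VF = 4, UF = 2, ScalarIV = 0, Step = 1 -> 0, 1, ..., 7.
}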
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
auto *SE = PSE.getSE();
@@ -1994,7 +2031,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
- // If this value is a pointer induction variable we know it is consecutive.
+ // If this value is a pointer induction variable, we know it is consecutive.
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
InductionDescriptor II = Inductions[Phi];
@@ -2008,7 +2045,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
unsigned NumOperands = Gep->getNumOperands();
Value *GpPtr = Gep->getPointerOperand();
// If this GEP value is a consecutive pointer induction variable and all of
- // the indices are constant then we know it is consecutive. We can
+ // the indices are constant, then we know it is consecutive.
Phi = dyn_cast<PHINode>(GpPtr);
if (Phi && Inductions.count(Phi)) {
@@ -2038,7 +2075,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// We can emit wide load/stores only if the last non-zero index is the
// induction variable.
const SCEV *Last = nullptr;
- if (!Strides.count(Gep))
+ if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep))
Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
else {
// Because of the multiplication by a stride we can have a s/zext cast.
@@ -2050,7 +2087,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// %idxprom = zext i32 %mul to i64 << Safe cast.
// %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
//
- Last = replaceSymbolicStrideSCEV(PSE, Strides,
+ Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(),
Gep->getOperand(InductionOperand), Gep);
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
Last =
@@ -2076,7 +2113,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
-InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::VectorParts &
InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
@@ -2097,7 +2134,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
- SmallVector<Constant*, 8> ShuffleMask;
+ SmallVector<Constant *, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
@@ -2308,7 +2345,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
}
- propagateMetadata(NewLoadInstr, Instr);
+ addMetadata(NewLoadInstr, Instr);
}
return;
}
@@ -2326,7 +2363,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
assert(Member && "Fail to get a member from an interleaved store group");
Value *StoredVec =
- getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
+ getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
@@ -2347,7 +2384,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Instruction *NewStoreInstr =
Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
- propagateMetadata(NewStoreInstr, Instr);
+ addMetadata(NewStoreInstr, Instr);
}
}
@@ -2372,8 +2409,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
- unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
+ uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
+ uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
!Legal->isMaskRequired(SI))
@@ -2382,69 +2419,115 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
- // If the pointer is loop invariant or if it is non-consecutive,
- // scalarize the load.
+ // If the pointer is loop invariant, scalarize the load.
+ if (LI && Legal->isUniform(Ptr))
+ return scalarizeInstruction(Instr);
+
+ // If the pointer is non-consecutive and gather/scatter is not supported,
+ // scalarize the instruction.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
- bool UniformLoad = LI && Legal->isUniform(Ptr);
- if (!ConsecutiveStride || UniformLoad)
+ bool CreateGatherScatter =
+ !ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) ||
+ (SI && Legal->isLegalMaskedScatter(ScalarDataTy)));
+
+ if (!ConsecutiveStride && !CreateGatherScatter)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
+ VectorParts VectorGep;
// Handle consecutive loads/stores.
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
- if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
- setDebugLocFromInst(Builder, Gep);
- Value *PtrOperand = Gep->getPointerOperand();
- Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
- FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(0, FirstBasePtr);
- Gep2->setName("gep.indvar.base");
- Ptr = Builder.Insert(Gep2);
- } else if (Gep) {
- setDebugLocFromInst(Builder, Gep);
- assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
- OrigLoop) &&
- "Base ptr must be invariant");
-
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- unsigned InductionOperand = getGEPInductionOperand(Gep);
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-
- for (unsigned i = 0; i < NumOperands; ++i) {
- Value *GepOperand = Gep->getOperand(i);
- Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
-
- // Update last index or loop invariant instruction anchored in loop.
- if (i == InductionOperand ||
- (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
- assert((i == InductionOperand ||
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
- OrigLoop)) &&
- "Must be last index or loop invariant");
-
- VectorParts &GEPParts = getVectorValue(GepOperand);
- Value *Index = GEPParts[0];
- Index = Builder.CreateExtractElement(Index, Zero);
- Gep2->setOperand(i, Index);
- Gep2->setName("gep.indvar.idx");
+ if (ConsecutiveStride) {
+ if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+ setDebugLocFromInst(Builder, Gep);
+ Value *PtrOperand = Gep->getPointerOperand();
+ Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+ FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(0, FirstBasePtr);
+ Gep2->setName("gep.indvar.base");
+ Ptr = Builder.Insert(Gep2);
+ } else if (Gep) {
+ setDebugLocFromInst(Builder, Gep);
+ assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
+ OrigLoop) &&
+ "Base ptr must be invariant");
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *GepOperand = Gep->getOperand(i);
+ Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
+
+ // Update last index or loop invariant instruction anchored in loop.
+ if (i == InductionOperand ||
+ (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
+ assert((i == InductionOperand ||
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
+ OrigLoop)) &&
+ "Must be last index or loop invariant");
+
+ VectorParts &GEPParts = getVectorValue(GepOperand);
+
+ // If GepOperand is an induction variable, and there's a scalarized
+ // version of it available, use it. Otherwise, we will need to create
+ // an extractelement instruction.
+ Value *Index = ScalarIVMap.count(GepOperand)
+ ? ScalarIVMap[GepOperand][0]
+ : Builder.CreateExtractElement(GEPParts[0], Zero);
+
+ Gep2->setOperand(i, Index);
+ Gep2->setName("gep.indvar.idx");
+ }
}
+ Ptr = Builder.Insert(Gep2);
+ } else { // No GEP
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ setDebugLocFromInst(Builder, Ptr);
+ VectorParts &PtrVal = getVectorValue(Ptr);
+ Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
- Ptr = Builder.Insert(Gep2);
} else {
- // Use the induction element ptr.
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
- setDebugLocFromInst(Builder, Ptr);
- VectorParts &PtrVal = getVectorValue(Ptr);
- Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ // At this point we need a vector version of the GEP for the gather or scatter.
+ assert(CreateGatherScatter && "The instruction should be scalarized");
+ if (Gep) {
+ // Vectorize the GEP across all UF parts. We want a vector value for the
+ // base and for each index that is defined inside the loop, even if it is
+ // loop-invariant but wasn't hoisted out. Otherwise we keep them scalar.
+ SmallVector<VectorParts, 4> OpsV;
+ for (Value *Op : Gep->operands()) {
+ Instruction *SrcInst = dyn_cast<Instruction>(Op);
+ if (SrcInst && OrigLoop->contains(SrcInst))
+ OpsV.push_back(getVectorValue(Op));
+ else
+ OpsV.push_back(VectorParts(UF, Op));
+ }
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Ops;
+ Value *GEPBasePtr = OpsV[0][Part];
+ for (unsigned i = 1; i < Gep->getNumOperands(); i++)
+ Ops.push_back(OpsV[i][Part]);
+ Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
+ cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
+ assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
+
+ NewGep =
+ Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
+ VectorGep.push_back(NewGep);
+ }
+ } else
+ VectorGep = getVectorValue(Ptr);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -2458,62 +2541,78 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ if (CreateGatherScatter) {
+ Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
+ NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
+ Alignment, MaskPart);
+ } else {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal[Part] = reverseVector(StoredVal[Part]);
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+ PartPtr =
+ Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
+ Mask[Part] = reverseVector(Mask[Part]);
+ }
+
+ Value *VecPtr =
+ Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+
+ if (Legal->isMaskRequired(SI))
+ NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+ Mask[Part]);
+ else
+ NewSI =
+ Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+ }
+ addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
+ NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
+ 0, "wide.masked.gather");
+ Entry[Part] = NewLI;
+ } else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal[Part] = reverseVector(StoredVal[Part]);
// If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
+ // wide load needs to start at the last vector element.
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
- Value *VecPtr = Builder.CreateBitCast(PartPtr,
- DataTy->getPointerTo(AddressSpace));
-
- Instruction *NewSI;
- if (Legal->isMaskRequired(SI))
- NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
- Mask[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
- propagateMetadata(NewSI, SI);
- }
- return;
- }
-
- // Handle loads.
- assert(LI && "Must have a load instruction");
- setDebugLocFromInst(Builder, LI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr =
- Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
-
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide load needs to start at the last vector element.
- PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
- PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
- Mask[Part] = reverseVector(Mask[Part]);
+ Value *VecPtr =
+ Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ if (Legal->isMaskRequired(LI))
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+ UndefValue::get(DataTy),
+ "wide.masked.load");
+ else
+ NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+ Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
-
- Instruction* NewLI;
- Value *VecPtr = Builder.CreateBitCast(PartPtr,
- DataTy->getPointerTo(AddressSpace));
- if (Legal->isMaskRequired(LI))
- NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
- UndefValue::get(DataTy),
- "wide.masked.load");
- else
- NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
- propagateMetadata(NewLI, LI);
- Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
+ addMetadata(NewLI, LI);
}
}
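The reversed-access pointer arithmetic above, restated as a standalone sketch: for unroll part Part, the wide load (or store) begins at element offset -Part * VF + (1 - VF) from Ptr, and the loaded vector is then reversed so the lanes come out in original program order.

// Element offset, relative to Ptr, at which the wide access of a reversed
// consecutive pointer starts for a given unroll part.
int reversePartStart(int Part, int VF) { return -Part * VF + (1 - VF); }
// Part 0 with VF = 4 covers elements [-3, 0]; part 1 covers [-7, -4].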
@@ -2526,9 +2625,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
setDebugLocFromInst(Builder, Instr);
// Find all of the vectorized parameters.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *SrcOp = Instr->getOperand(op);
-
+ for (Value *SrcOp : Instr->operands()) {
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
@@ -2536,7 +2633,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
}
// Try using previously calculated values.
- Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+ auto *SrcInst = dyn_cast<Instruction>(SrcOp);
// If the src is an instruction that appeared earlier in the basic block,
// then it should already be vectorized.
@@ -2558,8 +2655,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec = IsVoidRetTy ? nullptr :
- UndefValue::get(VectorType::get(Instr->getType(), VF));
+ Value *UndefVec =
+ IsVoidRetTy ? nullptr
+ : UndefValue::get(VectorType::get(Instr->getType(), VF));
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
@@ -2589,16 +2687,28 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Cloned->setName(Instr->getName() + ".cloned");
// Replace the operands of the cloned instructions with extracted scalars.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *Op = Params[op][Part];
- // Param is a vector. Need to extract the right lane.
- if (Op->getType()->isVectorTy())
- Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
- Cloned->setOperand(op, Op);
+
+ // If the operand is an induction variable, and there's a scalarized
+ // version of it available, use it. Otherwise, we will need to create
+ // an extractelement instruction if vectorizing.
+ auto *NewOp = Params[op][Part];
+ auto *ScalarOp = Instr->getOperand(op);
+ if (ScalarIVMap.count(ScalarOp))
+ NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
+ else if (NewOp->getType()->isVectorTy())
+ NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
+ Cloned->setOperand(op, NewOp);
}
+ addNewMetadata(Cloned, Instr);
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
+ // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+
// If the original scalar returns a value we need to place it in a vector
// so that future users will be able to use it.
if (!IsVoidRetTy)
@@ -2606,8 +2716,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Builder.getInt32(Width));
// End if-block.
if (IfPredicateStore)
- PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
- Cmp));
+ PredicatedStores.push_back(
+ std::make_pair(cast<StoreInst>(Cloned), Cmp));
}
}
}
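
The per-lane cloning above walks each unroll part and lane, and for each clone picks its operand either from ScalarIVMap (index VF * Part + Width) or by extracting lane Width from the widened operand. A toy model of just that indexing, with illustrative values and plain containers standing in for VectorParts and ScalarIVMap:

    #include <cstdio>
    #include <vector>

    int main() {
      const int VF = 4, UF = 2;
      // VecOp[Part][Lane] stands in for a widened operand (VectorParts),
      // ScalarIV[VF * Part + Lane] for the scalarized induction steps.
      std::vector<std::vector<int>> VecOp(UF, std::vector<int>(VF));
      std::vector<int> ScalarIV(UF * VF);
      for (int P = 0; P < UF; ++P)
        for (int W = 0; W < VF; ++W) {
          VecOp[P][W] = 100 + P * VF + W;
          ScalarIV[P * VF + W] = P * VF + W;
        }
      for (int Part = 0; Part < UF; ++Part)
        for (int Width = 0; Width < VF; ++Width) {
          int FromIVMap = ScalarIV[VF * Part + Width]; // ScalarIVMap path
          int FromExtract = VecOp[Part][Width];        // extractelement path
          std::printf("clone(%d,%d): iv=%d vec-lane=%d\n", Part, Width,
                      FromIVMap, FromExtract);
        }
      return 0;
    }
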
@@ -2627,7 +2737,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
Builder.SetInsertPoint(Latch->getTerminator());
-
+
// Create i+1 and fill the PHINode.
Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(Start, L->getLoopPreheader());
@@ -2635,7 +2745,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
// Create the compare.
Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
-
+
// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();
@@ -2649,12 +2759,12 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
"Invalid loop count");
Type *IdxTy = Legal->getWidestInductionType();
-
+
// The exit count might have the type of i64 while the phi is i32. This can
// happen if we have an induction variable that is sign extended before the
// compare. The only way that we get a backedge taken count is that the
@@ -2664,7 +2774,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
IdxTy->getPrimitiveSizeInBits())
BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
+
// Get the total trip count from the count by adding 1.
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
@@ -2681,9 +2791,8 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
if (TripCount->getType()->isPointerTy())
TripCount =
- CastInst::CreatePointerCast(TripCount, IdxTy,
- "exitcount.ptrcnt.to.int",
- L->getLoopPreheader()->getTerminator());
+ CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
return TripCount;
}
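
One reason the hunk above converts the backedge-taken count to the widest induction type before adding one: the +1 would wrap if it were done in a type the loop can exhaust. A tiny illustration with stand-in types (the i8/i64 choice is hypothetical, not taken from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t BTCNarrow = 255;                   // backedge taken 255 times
      uint64_t BTCWide = BTCNarrow;              // zext to the induction type first
      unsigned Wrong = (uint8_t)(BTCNarrow + 1); // wraps to 0
      unsigned long long Right = BTCWide + 1;    // 256 iterations
      std::printf("narrow +1 = %u, widened +1 = %llu\n", Wrong, Right);
      return 0;
    }
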
@@ -2691,16 +2800,30 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (VectorTripCount)
return VectorTripCount;
-
+
Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
-
- // Now we need to generate the expression for N - (N % VF), which is
- // the part that the vectorized body will execute.
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+ // If there is a non-reversed interleaved group that may speculatively access
+ // memory out-of-bounds, we need to ensure that there will be at least one
+ // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
+ // check ensures that N >= Step.
+ if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
return VectorTripCount;
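
The n.vec computation above, restated as a plain function so the scalar-epilogue adjustment is easy to check by hand (VF, UF and the trip counts below are arbitrary examples):

    #include <cstdio>

    unsigned vectorTripCount(unsigned N, unsigned VF, unsigned UF,
                             bool RequiresScalarEpilogue) {
      unsigned Step = VF * UF;
      unsigned R = N % Step;                          // n.mod.vf
      if (VF > 1 && RequiresScalarEpilogue && R == 0)
        R = Step;                                     // keep >= 1 scalar iteration
      return N - R;                                   // n.vec
    }

    int main() {
      std::printf("%u\n", vectorTripCount(64, 4, 2, false)); // 64
      std::printf("%u\n", vectorTripCount(64, 4, 2, true));  // 56
      std::printf("%u\n", vectorTripCount(70, 4, 2, true));  // 64 (remainder is 6)
      return 0;
    }
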
@@ -2714,13 +2837,15 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// Generate code to check that the loop's trip count that we computed by
// adding one to the backedge-taken count will not overflow.
- Value *CheckMinIters =
- Builder.CreateICmpULT(Count,
- ConstantInt::get(Count->getType(), VF * UF),
- "min.iters.check");
-
- BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
- "min.iters.checked");
+ Value *CheckMinIters = Builder.CreateICmpULT(
+ Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+ BasicBlock *NewBB =
+ BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
@@ -2733,7 +2858,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
Value *TC = getOrCreateVectorTripCount(L);
BasicBlock *BB = L->getLoopPreheader();
IRBuilder<> Builder(BB->getTerminator());
-
+
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop.
Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
@@ -2741,8 +2866,11 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
// Generate code to check that the loop's trip count that we computed by
// adding one to the backedge-taken count will not overflow.
- BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
- "vector.ph");
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
@@ -2768,6 +2896,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
@@ -2776,8 +2908,7 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
AddedSafetyChecks = true;
}
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
- BasicBlock *Bypass) {
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
BasicBlock *BB = L->getLoopPreheader();
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -2793,14 +2924,23 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
// Create a new block containing the memory check.
BB->setName("vector.memcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
LoopBypassBlocks.push_back(BB);
AddedSafetyChecks = true;
-}
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
+ LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+ PSE.getSE());
+ LVer->prepareNoAliasMetadata();
+}
void InnerLoopVectorizer::createEmptyLoop() {
/*
@@ -2859,12 +2999,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
BasicBlock *VecBody =
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
- VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
- MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
// Create and register the new vector loop.
- Loop* Lp = new Loop();
+ Loop *Lp = new Loop();
Loop *ParentLoop = OrigLoop->getParentLoop();
// Insert the new loop into the loop nest and register the new basic blocks
@@ -2899,15 +3039,15 @@ void InnerLoopVectorizer::createEmptyLoop() {
// checks into a separate block to make the more common case of few elements
// faster.
emitMemRuntimeChecks(Lp, ScalarPH);
-
+
// Generate the induction variable.
// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
@@ -2920,16 +3060,14 @@ void InnerLoopVectorizer::createEmptyLoop() {
// This variable saves the new starting index for the scalar loop. It is used
// to test if there are any tail iterations left once the vector loop has
// completed.
- LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
- for (I = List->begin(), E = List->end(); I != E; ++I) {
- PHINode *OrigPhi = I->first;
- InductionDescriptor II = I->second;
+ for (auto &InductionEntry : *List) {
+ PHINode *OrigPhi = InductionEntry.first;
+ InductionDescriptor II = InductionEntry.second;
// Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
- "bc.resume.val",
- ScalarPH->getTerminator());
+ PHINode *BCResumeVal = PHINode::Create(
+ OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
Value *EndValue;
if (OrigPhi == OldInduction) {
// We know what the end value is.
@@ -2937,9 +3075,9 @@ void InnerLoopVectorizer::createEmptyLoop() {
} else {
IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
- II.getStepValue()->getType(),
- "cast.crd");
- EndValue = II.transform(B, CRD);
+ II.getStep()->getType(), "cast.crd");
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ EndValue = II.transform(B, CRD, PSE.getSE(), DL);
EndValue->setName("ind.end");
}
@@ -2947,22 +3085,25 @@ void InnerLoopVectorizer::createEmptyLoop() {
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, MiddleBlock);
+ // Fix up external users of the induction variable.
+ fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock);
+
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
// The old induction's phi node in the scalar body needs the truncated
// value.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
+ for (BasicBlock *BB : LoopBypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
}
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
- Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n",
- MiddleBlock->getTerminator());
+ Value *CmpN =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
ReplaceInstWithInst(MiddleBlock->getTerminator(),
BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -2974,13 +3115,79 @@ void InnerLoopVectorizer::createEmptyLoop() {
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
- LoopVectorBody.push_back(VecBody);
+ LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ Lp->setLoopID(LID);
+
LoopVectorizeHints Hints(Lp, true);
Hints.setAlreadyVectorized();
}
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they obviously have different values.
+
+ assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+
+ DenseMap<Value *, Value *> MissingVals;
+
+ // An external user of the last iteration's value should see the value that
+ // the remainder loop uses to initialize its own IV.
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ for (User *U : PostInc->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ MissingVals[UI] = EndValue;
+ }
+ }
+
+ // An external user of the penultimate value needs to see EndValue - Step.
+ // The simplest way to get this is to recompute it from the constituent SCEVs,
+ // that is Start + (Step * (CRD - 1)).
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ const DataLayout &DL =
+ OrigLoop->getHeader()->getModule()->getDataLayout();
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ IRBuilder<> B(MiddleBlock->getTerminator());
+ Value *CountMinusOne = B.CreateSub(
+ CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(),
+ "cast.cmo");
+ Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
+ Escape->setName("ind.escape");
+ MissingVals[UI] = Escape;
+ }
+ }
+
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+ // One corner case we have to handle is two IVs "chasing" each other,
+ // that is %IV2 = phi [...], [ %IV1, %latch ]
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the middle block.
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ PHI->addIncoming(I.second, MiddleBlock);
+ }
+}
+
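
In integer terms, the two escape values produced by fixupIVUsers are: users of the post-increment value from the loop latch see Start + Step * CountRoundDown (EndValue), while users of the PHI itself see the penultimate value Start + Step * (CountRoundDown - 1) (ind.escape). A plain-integer sketch with made-up Start, Step and CRD values:

    #include <cstdio>

    int main() {
      long Start = 10, Step = 3;
      long CountRoundDown = 8; // vector trip count (CRD)
      // Users of the latch (post-increment) value: the value the remainder
      // loop starts from.
      long EndValue = Start + Step * CountRoundDown;
      // Users of the PHI itself: one step earlier (EndValue - Step).
      long Escape = Start + Step * (CountRoundDown - 1);
      std::printf("ind.end = %ld, ind.escape = %ld\n", EndValue, Escape);
      return 0;
    }
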
namespace {
struct CSEDenseMapInfo {
static bool canHandle(Instruction *I) {
@@ -3007,48 +3214,31 @@ struct CSEDenseMapInfo {
};
}
-/// \brief Check whether this block is a predicated block.
-/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
-/// = ...; " blocks. We start with one vectorized basic block. For every
-/// conditional block we split this vectorized block. Therefore, every second
-/// block will be a predicated one.
-static bool isPredicatedBlock(unsigned BlockNum) {
- return BlockNum % 2;
-}
-
///\brief Perform cse of induction variable instructions.
-static void cse(SmallVector<BasicBlock *, 4> &BBs) {
+static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
- BasicBlock *BB = BBs[i];
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = &*I++;
-
- if (!CSEDenseMapInfo::canHandle(In))
- continue;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
- // Check if we can replace this instruction with any of the
- // visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
- continue;
- }
- // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
- // ...;" blocks for predicated stores. Every second block is a predicated
- // block.
- if (isPredicatedBlock(i))
- continue;
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
- CSEMap[In] = In;
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
}
+
+ CSEMap[In] = In;
}
}
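
The cse() above keys candidate instructions by opcode and operands (via CSEDenseMapInfo) and folds any repeat into its first occurrence within the single vector body block. A toy, non-LLVM version showing just that shape, with strings and integers standing in for instructions and operands:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    struct Inst {
      std::string Opcode;
      std::vector<int> Operands;
      int Id;
    };

    int main() {
      // A straight-line "block" with one repeated computation (%2 == %0).
      std::vector<Inst> Block = {
          {"gep", {1, 4}, 0}, {"gep", {1, 8}, 1}, {"gep", {1, 4}, 2}};
      std::map<std::pair<std::string, std::vector<int>>, int> CSEMap;
      for (const Inst &In : Block) {
        auto Key = std::make_pair(In.Opcode, In.Operands);
        auto It = CSEMap.find(Key);
        if (It != CSEMap.end()) {
          std::printf("replace %%%d with %%%d and erase it\n", In.Id, It->second);
          continue;
        }
        CSEMap[Key] = In.Id; // first occurrence becomes the canonical value
      }
      return 0;
    }
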
/// \brief Adds a 'fast' flag to floating point operations.
static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V)){
+ if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
cast<Instruction>(V)->setFastMathFlags(Flags);
@@ -3066,11 +3256,11 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
assert(Ty->isVectorTy() && "Can only scalarize vectors");
unsigned Cost = 0;
- for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
if (Extract)
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
}
return Cost;
@@ -3101,15 +3291,15 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute corresponding vector type for return value and arguments.
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
- Tys.push_back(ToVectorTy(ScalarTys[i], VF));
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
unsigned ScalarizationCost =
getScalarizationOverhead(RetTy, true, false, TTI);
- for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
- ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
+ for (Type *Ty : Tys)
+ ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3134,25 +3324,29 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI) {
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
}
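
How these cost helpers are consumed later in the pass: the call cost is the scalarized cost (ScalarCallCost * VF plus packing/unpacking overhead), capped by a vectorized library routine when the TLI provides one, and the intrinsic form is used when its cost is no greater. A rough sketch with stand-in numbers; the min against a library call only applies if such a variant actually exists:

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned VF = 4;
      unsigned ScalarCallCost = 10, ScalarizationCost = 6;
      unsigned VectorLibCallCost = 30; // only if TLI has a vectorized variant
      unsigned IntrinsicCost = 24;     // only if the call maps to an intrinsic
      // Scalarize VF copies plus pack/unpack overhead, or call a vector
      // library routine if that is cheaper.
      unsigned CallCost =
          std::min(ScalarCallCost * VF + ScalarizationCost, VectorLibCallCost);
      bool UseVectorIntrinsic = IntrinsicCost <= CallCost;
      std::printf("call cost %u, use intrinsic: %s\n", CallCost,
                  UseVectorIntrinsic ? "yes" : "no");
      return 0;
    }
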
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
- IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
- IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
@@ -3161,21 +3355,22 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
//
- for (auto &KV : MinBWs) {
+ SmallPtrSet<Value *, 4> Erased;
+ for (const auto &KV : *MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
- if (I->use_empty())
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
- Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
- KV.second);
+ Type *ScalarTruncatedTy =
+ IntegerType::get(OriginalTy->getContext(), KV.second);
Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
OriginalTy->getVectorNumElements());
if (TruncatedTy == OriginalTy)
continue;
IRBuilder<> B(cast<Instruction>(I));
- auto ShrinkOperand = [&](Value *V) -> Value* {
+ auto ShrinkOperand = [&](Value *V) -> Value * {
if (auto *ZI = dyn_cast<ZExtInst>(V))
if (ZI->getSrcTy() == TruncatedTy)
return ZI->getOperand(0);
@@ -3185,50 +3380,59 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// The actual instruction modification depends on the instruction type,
// unfortunately.
Value *NewI = nullptr;
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- NewI = B.CreateBinOp(BO->getOpcode(),
- ShrinkOperand(BO->getOperand(0)),
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
ShrinkOperand(BO->getOperand(1)));
cast<BinaryOperator>(NewI)->copyIRFlags(I);
- } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
- NewI = B.CreateICmp(CI->getPredicate(),
- ShrinkOperand(CI->getOperand(0)),
- ShrinkOperand(CI->getOperand(1)));
- } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
+ NewI =
+ B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (auto *SI = dyn_cast<SelectInst>(I)) {
NewI = B.CreateSelect(SI->getCondition(),
ShrinkOperand(SI->getTrueValue()),
ShrinkOperand(SI->getFalseValue()));
- } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
switch (CI->getOpcode()) {
- default: llvm_unreachable("Unhandled cast!");
+ default:
+ llvm_unreachable("Unhandled cast!");
case Instruction::Trunc:
NewI = ShrinkOperand(CI->getOperand(0));
break;
case Instruction::SExt:
- NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy,
- TruncatedTy));
+ NewI = B.CreateSExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
break;
case Instruction::ZExt:
- NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy,
- TruncatedTy));
+ NewI = B.CreateZExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
break;
}
- } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
- auto *O0 =
- B.CreateZExtOrTrunc(SI->getOperand(0),
- VectorType::get(ScalarTruncatedTy, Elements0));
+ auto *O0 = B.CreateZExtOrTrunc(
+ SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
- auto *O1 =
- B.CreateZExtOrTrunc(SI->getOperand(1),
- VectorType::get(ScalarTruncatedTy, Elements1));
+ auto *O1 = B.CreateZExtOrTrunc(
+ SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
} else if (isa<LoadInst>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
+ NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
llvm_unreachable("Unhandled instruction type!");
}
@@ -3238,12 +3442,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
+ Erased.insert(I);
I = Res;
}
}
// We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (auto &KV : MinBWs) {
+ for (const auto &KV : *MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
@@ -3266,15 +3471,14 @@ void InnerLoopVectorizer::vectorizeLoop() {
//===------------------------------------------------===//
Constant *Zero = Builder.getInt32(0);
- // In order to support reduction variables we need to be able to vectorize
- // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
- // stages. First, we create a new vector PHI node with no incoming edges.
- // We use this value when we vectorize all of the instructions that use the
- // PHI. Next, after all of the instructions in the block are complete we
- // add the new incoming edges to the PHI. At this point all of the
- // instructions in the basic block are vectorized, so we can use them to
- // construct the PHI.
- PhiVector RdxPHIsToFix;
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. First,
+ // we create a new vector PHI node with no incoming edges. We use this value
+ // when we vectorize all of the instructions that use the PHI. Next, after
+ // all of the instructions in the block are complete we add the new incoming
+ // edges to the PHI. At this point all of the instructions in the basic block
+ // are vectorized, so we can use them to construct the PHI.
+ PhiVector PHIsToFix;
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
@@ -3282,33 +3486,32 @@ void InnerLoopVectorizer::vectorizeLoop() {
DFS.perform(LI);
// Vectorize all of the blocks in the original loop.
- for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
- be = DFS.endRPO(); bb != be; ++bb)
- vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ vectorizeBlockInLoop(BB, &PHIsToFix);
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
truncateToMinimalBitwidths();
-
- // At this point every instruction in the original loop is widened to
- // a vector form. We are almost done. Now, we need to fix the PHI nodes
- // that we vectorized. The PHI nodes are currently empty because we did
- // not want to introduce cycles. Notice that the remaining PHI nodes
- // that we need to fix are reduction variables.
-
- // Create the 'reduced' values for each of the induction vars.
- // The reduced values are the vector values that we scalarize and combine
- // after the loop is finished.
- for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
- it != e; ++it) {
- PHINode *RdxPhi = *it;
- assert(RdxPhi && "Unable to recover vectorized PHI");
-
- // Find the reduction variable descriptor.
- assert(Legal->isReductionVariable(RdxPhi) &&
+
+ // At this point every instruction in the original loop is widened to a
+ // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
+ // nodes are currently empty because we did not want to introduce cycles.
+ // This is the second stage of vectorizing recurrences.
+ for (PHINode *Phi : PHIsToFix) {
+ assert(Phi && "Unable to recover vectorized PHI");
+
+ // Handle first-order recurrences that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(Phi)) {
+ fixFirstOrderRecurrence(Phi);
+ continue;
+ }
+
+ // If the phi node is not a first-order recurrence, it must be a reduction.
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
@@ -3363,18 +3566,18 @@ void InnerLoopVectorizer::vectorizeLoop() {
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
- VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+ VectorParts &VecRdxPhi = WidenMap.get(Phi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
// Make sure to add the reduction start value only to the
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
- LoopVectorPreHeader);
- cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
- LoopVectorBody.back());
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(Val[part], LoopVectorBody);
}
// Before each round, move the insertion point right between
@@ -3389,9 +3592,9 @@ void InnerLoopVectorizer::vectorizeLoop() {
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
- if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) {
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
+ Builder.SetInsertPoint(LoopVectorBody->getTerminator());
for (unsigned part = 0; part < UF; ++part) {
Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
@@ -3432,21 +3635,19 @@ void InnerLoopVectorizer::vectorizeLoop() {
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
for (unsigned i = VF; i != 1; i >>= 1) {
// Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i/2; ++j)
- ShuffleMask[j] = Builder.getInt32(i/2 + j);
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
// Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
UndefValue::get(Builder.getInt32Ty()));
- Value *Shuf =
- Builder.CreateShuffleVector(TmpVec,
- UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask),
- "rdx.shuf");
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
// Floating point operations had to be 'fast' to enable the reduction.
@@ -3458,21 +3659,21 @@ void InnerLoopVectorizer::vectorizeLoop() {
}
// The result is in the first element of the vector.
- ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
- Builder.getInt32(0));
+ ReducedPartRdx =
+ Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
- if (RdxPhi->getType() != RdxDesc.getRecurrenceType())
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
ReducedPartRdx =
RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType())
- : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
}
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
@@ -3483,9 +3684,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+ LEE = LoopExitBlock->end();
+ LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi) break;
+ if (!LCSSAPhi)
+ break;
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
@@ -3498,30 +3701,30 @@ void InnerLoopVectorizer::vectorizeLoop() {
LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
break;
}
- }// end of the LCSSA phi scan.
+ } // end of the LCSSA phi scan.
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
- (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
- }// end of for each redux variable.
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+ } // end of for each Phi in PHIsToFix.
fixLCSSAPHIs();
// Make sure DomTree is updated.
updateAnalysis();
-
+
// Predicate any stores.
for (auto KV : PredicatedStores) {
BasicBlock::iterator I(KV.first);
auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
- /*BranchWeights=*/nullptr, DT);
+ /*BranchWeights=*/nullptr, DT, LI);
I->moveBefore(T);
I->getParent()->setName("pred.store.if");
BB->setName("pred.store.continue");
@@ -3531,11 +3734,162 @@ void InnerLoopVectorizer::vectorizeLoop() {
cse(LoopVectorBody);
}
+void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+ // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // temporary value for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+ // After the vector loop completes execution, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop.
+
+ // Get the original loop preheader and single loop latch.
+ auto *Preheader = OrigLoop->getLoopPreheader();
+ auto *Latch = OrigLoop->getLoopLatch();
+
+ // Get the initial and previous values of the scalar recurrence.
+ auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
+ auto *Previous = Phi->getIncomingValueForBlock(Latch);
+
+ // Create a vector from the initial value.
+ auto *VectorInit = ScalarInit;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ VectorInit = Builder.CreateInsertElement(
+ UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
+ Builder.getInt32(VF - 1), "vector.recur.init");
+ }
+
+ // We constructed a temporary phi node in the first phase of vectorization.
+ // This phi node will eventually be deleted.
+ auto &PhiParts = getVectorValue(Phi);
+ Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
+
+ // Create a phi node for the new recurrence. The current value will either be
+ // the initial value inserted into a vector or loop-varying vector value.
+ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
+ VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
+
+ // Get the vectorized previous value. We ensured the previous value was an
+ // instruction when detecting the recurrence.
+ auto &PreviousParts = getVectorValue(Previous);
+
+ // Set the insertion point to be after this instruction. We ensured the
+ // previous value dominated all uses of the phi when detecting the
+ // recurrence.
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+
+ // We will construct a vector for the recurrence by combining the values for
+ // the current and previous iterations. This is the required shuffle mask.
+ SmallVector<Constant *, 8> ShuffleMask(VF);
+ ShuffleMask[0] = Builder.getInt32(VF - 1);
+ for (unsigned I = 1; I < VF; ++I)
+ ShuffleMask[I] = Builder.getInt32(I + VF - 1);
+
+ // The vector from which to take the initial value for the current iteration
+ // (actual or unrolled). Initially, this is the vector phi node.
+ Value *Incoming = VecPhi;
+
+ // Shuffle the current and previous vector and update the vector parts.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ auto *Shuffle =
+ VF > 1
+ ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],
+ ConstantVector::get(ShuffleMask))
+ : Incoming;
+ PhiParts[Part]->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiParts[Part])->eraseFromParent();
+ PhiParts[Part] = Shuffle;
+ Incoming = PreviousParts[Part];
+ }
+
+ // Fix the latch value of the new recurrence in the vector loop.
+ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+
+ // Extract the last vector element in the middle block. This will be the
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *Extract = Incoming;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
+ "vector.recur.extract");
+ }
+
+ // Fix the initial value of the original recurrence in the scalar loop.
+ Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ for (auto *BB : predecessors(LoopScalarPreHeader)) {
+ auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
+ Start->addIncoming(Incoming, BB);
+ }
+
+ Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
+ Phi->setName("scalar.recur");
+
+ // Finally, fix users of the recurrence outside the loop. The users will need
+ // either the last value of the scalar recurrence or the last value of the
+ // vector recurrence we extracted in the middle block. Since the loop is in
+ // LCSSA form, we just need to find the phi node for the original scalar
+ // recurrence in the exit block, and then add an edge for the middle block.
+ for (auto &I : *LoopExitBlock) {
+ auto *LCSSAPhi = dyn_cast<PHINode>(&I);
+ if (!LCSSAPhi)
+ break;
+ if (LCSSAPhi->getIncomingValue(0) == Phi) {
+ LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
+ break;
+ }
+ }
+}
+
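
The shuffle built in fixFirstOrderRecurrence selects the last lane of the incoming (previous) vector followed by the first VF - 1 lanes of the current vector, i.e. mask [VF-1, VF, ..., 2*VF-2] over their concatenation. A small array-based sketch of that mask for VF = 4 (values are illustrative):

    #include <cstdio>

    int main() {
      const int VF = 4;
      int Incoming[VF] = {0, 1, 2, 3}; // previous values (v_init on entry)
      int Current[VF] = {4, 5, 6, 7};  // this iteration's widened values
      int Concat[2 * VF], Result[VF];
      for (int I = 0; I < VF; ++I) {
        Concat[I] = Incoming[I];
        Concat[VF + I] = Current[I];
      }
      for (int I = 0; I < VF; ++I)
        Result[I] = Concat[VF - 1 + I]; // mask [VF-1, VF, ..., 2*VF-2]
      for (int I = 0; I < VF; ++I)
        std::printf("%d ", Result[I]);  // 3 4 5 6
      std::printf("\n");
      return 0;
    }
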
void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi) break;
+ for (Instruction &LEI : *LoopExitBlock) {
+ auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
+ if (!LCSSAPhi)
+ break;
if (LCSSAPhi->getNumIncomingValues() == 1)
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
@@ -3548,7 +3902,7 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
"Invalid edge");
// Look for cached value.
- std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
if (ECEntryIt != MaskCache.end())
return ECEntryIt->second;
@@ -3604,15 +3958,15 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
void InnerLoopVectorizer::widenPHIInstruction(
Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
unsigned VF, PhiVector *PV) {
- PHINode* P = cast<PHINode>(PN);
- // Handle reduction variables:
- if (Legal->isReductionVariable(P)) {
+ PHINode *P = cast<PHINode>(PN);
+ // Handle recurrences.
+ if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
- Type *VecTy = (VF == 1) ? PN->getType() :
- VectorType::get(PN->getType(), VF);
+ Type *VecTy =
+ (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(
- VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
+ VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
}
PV->push_back(P);
return;
@@ -3635,21 +3989,20 @@ void InnerLoopVectorizer::widenPHIInstruction(
// SELECT(Mask2, In2,
// ( ...)))
for (unsigned In = 0; In < NumIncoming; In++) {
- VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
- P->getParent());
+ VectorParts Cond =
+ createEdgeMask(P->getIncomingBlock(In), P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
if (In == 0)
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
- In0[part]);
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
else
// Select between the current value and the previous incoming edge
// based on the incoming mask.
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
- Entry[part], "predphi");
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
+ "predphi");
}
}
return;
@@ -3657,85 +4010,68 @@ void InnerLoopVectorizer::widenPHIInstruction(
// This PHINode must be an induction variable.
// Make sure that we know about it.
- assert(Legal->getInductionVars()->count(P) &&
- "Not an induction variable");
+ assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
- case InductionDescriptor::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case InductionDescriptor::IK_IntInduction: {
- assert(P->getType() == II.getStartValue()->getType() &&
- "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- Value *V = Induction;
- if (P != OldInduction) {
- V = Builder.CreateSExtOrTrunc(Induction, P->getType());
- V = II.transform(Builder, V);
- V->setName("offset.idx");
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction:
+ return widenIntInduction(P, Entry);
+ case InductionDescriptor::IK_PtrInduction:
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = Induction;
+ PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
+ // This is the vector of results. Notice that we don't generate
+ // vector geps because scalar geps result in better code.
+ for (unsigned part = 0; part < UF; ++part) {
+ if (VF == 1) {
+ int EltIndex = part;
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
+ SclrGep->setName("next.gep");
+ Entry[part] = SclrGep;
+ continue;
}
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding 0, 1, 2, etc.
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());
- return;
- }
- case InductionDescriptor::IK_PtrInduction:
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = Induction;
- PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
- // This is the vector of results. Notice that we don't generate
- // vector geps because scalar geps result in better code.
- for (unsigned part = 0; part < UF; ++part) {
- if (VF == 1) {
- int EltIndex = part;
- Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = II.transform(Builder, GlobalIdx);
- SclrGep->setName("next.gep");
- Entry[part] = SclrGep;
- continue;
- }
- Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
- for (unsigned int i = 0; i < VF; ++i) {
- int EltIndex = i + part * VF;
- Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = II.transform(Builder, GlobalIdx);
- SclrGep->setName("next.gep");
- VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
- Builder.getInt32(i),
- "insert.gep");
- }
- Entry[part] = VecVal;
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ int EltIndex = i + part * VF;
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
+ SclrGep->setName("next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i), "insert.gep");
}
- return;
+ Entry[part] = VecVal;
+ }
+ return;
}
}
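
For the pointer-induction case above, each lane gets its own scalar GEP addressing element PtrInd + Part * VF + I of the underlying object, rather than one vector GEP. Modeled with plain array indices (the object size and PtrInd are arbitrary):

    #include <cstdio>

    int main() {
      const int VF = 4, UF = 2;
      int Object[64];
      for (int I = 0; I < 64; ++I)
        Object[I] = I;
      long PtrInd = 16; // normalized induction, in elements from the base
      for (int Part = 0; Part < UF; ++Part)
        for (int I = 0; I < VF; ++I) {
          long GlobalIdx = PtrInd + Part * VF + I; // EltIndex = i + part * VF
          int *SclrGep = &Object[GlobalIdx];       // next.gep for this lane
          std::printf("part %d lane %d -> %d\n", Part, I, *SclrGep);
        }
      return 0;
    }
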
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- VectorParts &Entry = WidenMap.get(&*it);
+ for (Instruction &I : *BB) {
+ VectorParts &Entry = WidenMap.get(&I);
- switch (it->getOpcode()) {
+ switch (I.getOpcode()) {
case Instruction::Br:
// Nothing to do for PHIs and BR, since we already took care of the
// loop control flow instructions.
continue;
case Instruction::PHI: {
// Vectorize PHINodes.
- widenPHIInstruction(&*it, Entry, UF, VF, PV);
+ widenPHIInstruction(&I, Entry, UF, VF, PV);
continue;
- }// End of PHI.
+ } // End of PHI.
case Instruction::Add:
case Instruction::FAdd:
@@ -3756,10 +4092,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
case Instruction::Or:
case Instruction::Xor: {
// Just widen binops.
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+ auto *BinOp = cast<BinaryOperator>(&I);
setDebugLocFromInst(Builder, BinOp);
- VectorParts &A = getVectorValue(it->getOperand(0));
- VectorParts &B = getVectorValue(it->getOperand(1));
+ VectorParts &A = getVectorValue(BinOp->getOperand(0));
+ VectorParts &B = getVectorValue(BinOp->getOperand(1));
// Use this vector value for all users of the original instruction.
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3771,7 +4107,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = V;
}
- propagateMetadata(Entry, &*it);
+ addMetadata(Entry, BinOp);
break;
}
case Instruction::Select: {
@@ -3780,58 +4116,58 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// instruction with a scalar condition. Otherwise, use vector-select.
auto *SE = PSE.getSE();
bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &*it);
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- VectorParts &Cond = getVectorValue(it->getOperand(0));
- VectorParts &Op0 = getVectorValue(it->getOperand(1));
- VectorParts &Op1 = getVectorValue(it->getOperand(2));
-
- Value *ScalarCond = (VF == 1) ? Cond[0] :
- Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+ VectorParts &Cond = getVectorValue(I.getOperand(0));
+ VectorParts &Op0 = getVectorValue(I.getOperand(1));
+ VectorParts &Op1 = getVectorValue(I.getOperand(2));
+
+ Value *ScalarCond =
+ (VF == 1)
+ ? Cond[0]
+ : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
- InvariantCond ? ScalarCond : Cond[Part],
- Op0[Part],
- Op1[Part]);
+ InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
}
- propagateMetadata(Entry, &*it);
+ addMetadata(Entry, &I);
break;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
- bool FCmp = (it->getOpcode() == Instruction::FCmp);
- CmpInst *Cmp = dyn_cast<CmpInst>(it);
- setDebugLocFromInst(Builder, &*it);
- VectorParts &A = getVectorValue(it->getOperand(0));
- VectorParts &B = getVectorValue(it->getOperand(1));
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ VectorParts &A = getVectorValue(Cmp->getOperand(0));
+ VectorParts &B = getVectorValue(Cmp->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = nullptr;
if (FCmp) {
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
- cast<FCmpInst>(C)->copyFastMathFlags(&*it);
+ cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
}
Entry[Part] = C;
}
- propagateMetadata(Entry, &*it);
+ addMetadata(Entry, &I);
break;
}
case Instruction::Store:
case Instruction::Load:
- vectorizeMemoryInstruction(&*it);
- break;
+ vectorizeMemoryInstruction(&I);
+ break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3844,58 +4180,52 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- CastInst *CI = dyn_cast<CastInst>(it);
- setDebugLocFromInst(Builder, &*it);
- /// Optimize the special case where the source is the induction
- /// variable. Notice that we can only optimize the 'trunc' case
- /// because: a. FP conversions lose precision, b. sext/zext may wrap,
- /// c. other casts depend on pointer size.
- if (CI->getOperand(0) == OldInduction &&
- it->getOpcode() == Instruction::Trunc) {
- Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
- CI->getType());
- Value *Broadcasted = getBroadcastInstrs(ScalarCast);
- InductionDescriptor II =
- Legal->getInductionVars()->lookup(OldInduction);
- Constant *Step = ConstantInt::getSigned(
- CI->getType(), II.getStepValue()->getSExtValue());
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
- propagateMetadata(Entry, &*it);
+ auto *CI = dyn_cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+ auto ID = Legal->getInductionVars()->lookup(OldInduction);
+ if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
+ ID.getConstIntStepValue()) {
+ widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI));
+ addMetadata(Entry, &I);
break;
}
+
/// Vectorize casts.
- Type *DestTy = (VF == 1) ? CI->getType() :
- VectorType::get(CI->getType(), VF);
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
- VectorParts &A = getVectorValue(it->getOperand(0));
+ VectorParts &A = getVectorValue(CI->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- propagateMetadata(Entry, &*it);
+ addMetadata(Entry, &I);
break;
}
case Instruction::Call: {
// Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(it))
+ if (isa<DbgInfoIntrinsic>(I))
break;
- setDebugLocFromInst(Builder, &*it);
+ setDebugLocFromInst(Builder, &I);
Module *M = BB->getParent()->getParent();
- CallInst *CI = cast<CallInst>(it);
+ auto *CI = cast<CallInst>(&I);
StringRef FnName = CI->getCalledFunction()->getName();
Function *F = CI->getCalledFunction();
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
-
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
- if (ID &&
- (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&*it);
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&I);
break;
}
// The flag shows whether we use Intrinsic or a usual Call for vectorized
@@ -3906,7 +4236,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&*it);
+ scalarizeInstruction(&I);
break;
}
@@ -3944,19 +4274,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
}
}
assert(VectorF && "Can't create vector function.");
- Entry[Part] = Builder.CreateCall(VectorF, Args);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
+
+ Entry[Part] = V;
}
- propagateMetadata(Entry, &*it);
+ addMetadata(Entry, &I);
break;
}
default:
// All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&*it);
+ scalarizeInstruction(&I);
break;
- }// end of switch.
- }// end of for_each instr.
+ } // end of switch.
+ } // end of for_each instr.
}
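
The trunc-of-induction and cast cases above lay out, for each unroll part, a vector whose lanes follow the induction: lane i of part Part holds Start + (VF * Part + i) * Step. Below is a minimal standalone sketch of that layout in plain C++ with no LLVM types; getStepVectorSketch and all values are invented for illustration and are not the vectorizer's API.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the widened induction: lane i of unroll part P
// holds Start + (VF * P + i) * Step.
static std::vector<int64_t> getStepVectorSketch(int64_t Start, int64_t Step,
                                                unsigned VF, unsigned Part) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned I = 0; I < VF; ++I)
    Lanes[I] = Start + static_cast<int64_t>(VF * Part + I) * Step;
  return Lanes;
}

int main() {
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("Part %u:", Part);
    for (int64_t Lane : getStepVectorSketch(/*Start=*/0, /*Step=*/1, VF, Part))
      std::printf(" %lld", static_cast<long long>(Lane));
    std::printf("\n");
  }
  return 0;
}
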
void InnerLoopVectorizer::updateAnalysis() {
@@ -3967,16 +4305,11 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
- DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
- DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
-
// We don't predicate stores by this point, so the vector body should be a
// single loop.
- assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
- DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+ DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
- DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
+ DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -3989,12 +4322,12 @@ void InnerLoopVectorizer::updateAnalysis() {
/// Phi nodes with constant expressions that can trap are not safe to if
/// convert.
static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- PHINode *Phi = dyn_cast<PHINode>(I);
+ for (Instruction &I : *BB) {
+ auto *Phi = dyn_cast<PHINode>(&I);
if (!Phi)
return true;
- for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
- if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+ for (Value *V : Phi->incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
if (C->canTrap())
return false;
}
@@ -4013,27 +4346,21 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
SmallPtrSet<Value *, 8> SafePointes;
// Collect safe addresses.
- for (Loop::block_iterator BI = TheLoop->block_begin(),
- BE = TheLoop->block_end(); BI != BE; ++BI) {
- BasicBlock *BB = *BI;
-
+ for (BasicBlock *BB : TheLoop->blocks()) {
if (blockNeedsPredication(BB))
continue;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ for (Instruction &I : *BB) {
+ if (auto *LI = dyn_cast<LoadInst>(&I))
SafePointes.insert(LI->getPointerOperand());
- else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
SafePointes.insert(SI->getPointerOperand());
}
}
// Collect the blocks that need predication.
BasicBlock *Header = TheLoop->getHeader();
- for (Loop::block_iterator BI = TheLoop->block_begin(),
- BE = TheLoop->block_end(); BI != BE; ++BI) {
- BasicBlock *BB = *BI;
-
+ for (BasicBlock *BB : TheLoop->blocks()) {
// We don't support switch statements inside loops.
if (!isa<BranchInst>(BB->getTerminator())) {
emitAnalysis(VectorizationReport(BB->getTerminator())
@@ -4063,9 +4390,8 @@ bool LoopVectorizationLegality::canVectorize() {
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
- emitAnalysis(
- VectorizationReport() <<
- "loop control flow is not understood by vectorizer");
+ emitAnalysis(VectorizationReport()
+ << "loop control flow is not understood by vectorizer");
return false;
}
@@ -4077,17 +4403,15 @@ bool LoopVectorizationLegality::canVectorize() {
// We must have a single backedge.
if (TheLoop->getNumBackEdges() != 1) {
- emitAnalysis(
- VectorizationReport() <<
- "loop control flow is not understood by vectorizer");
+ emitAnalysis(VectorizationReport()
+ << "loop control flow is not understood by vectorizer");
return false;
}
// We must have a single exiting block.
if (!TheLoop->getExitingBlock()) {
- emitAnalysis(
- VectorizationReport() <<
- "loop control flow is not understood by vectorizer");
+ emitAnalysis(VectorizationReport()
+ << "loop control flow is not understood by vectorizer");
return false;
}
@@ -4095,15 +4419,14 @@ bool LoopVectorizationLegality::canVectorize() {
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- emitAnalysis(
- VectorizationReport() <<
- "loop control flow is not understood by vectorizer");
+ emitAnalysis(VectorizationReport()
+ << "loop control flow is not understood by vectorizer");
return false;
}
// We need to have a loop header.
- DEBUG(dbgs() << "LV: Found a loop: " <<
- TheLoop->getHeader()->getName() << '\n');
+ DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
// Check if we can if-convert non-single-bb loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
@@ -4113,7 +4436,7 @@ bool LoopVectorizationLegality::canVectorize() {
}
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop);
+ const SCEV *ExitCount = PSE.getBackedgeTakenCount();
if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
emitAnalysis(VectorizationReport()
<< "could not determine number of loop iterations");
@@ -4150,7 +4473,7 @@ bool LoopVectorizationLegality::canVectorize() {
// Analyze interleaved memory accesses.
if (UseInterleaved)
- InterleaveInfo.analyzeInterleaving(Strides);
+ InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
@@ -4182,7 +4505,7 @@ static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
return Ty;
}
-static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
Ty0 = convertPointerToIntegerType(DL, Ty0);
Ty1 = convertPointerToIntegerType(DL, Ty1);
if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
@@ -4193,11 +4516,11 @@ static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &Reductions) {
- // Reduction instructions are allowed to have exit users. All other
- // instructions must not have external users.
- if (!Reductions.count(Inst))
- //Check that all of the users of the loop are inside the BB.
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reduction and Induction instructions are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
for (User *U : Inst->users()) {
Instruction *UI = cast<Instruction>(U);
// This user may be a reduction exit value.
@@ -4209,31 +4532,61 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
return false;
}
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() &&
+ ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!Induction || PhiTy == WidestIndTy)
+ Induction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+
+ DEBUG(dbgs() << "LV: Found an induction variable.\n");
+ return;
+}
+
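
addInductionPhi above treats an integer induction that starts at zero and steps by one as a candidate for the canonical induction variable, preferring the PHI whose type matches the widest induction type seen so far and keeping the last such PHI when several qualify. A small standalone sketch of that selection, with invented candidates and without any LLVM types:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct IVCandidate {
  std::string Name;  // invented label
  unsigned BitWidth; // width of the PHI's integer type
  long long Start, Step;
};

int main() {
  std::vector<IVCandidate> IVs = {
      {"i8.iv", 8, 0, 1}, {"i64.iv", 64, 0, 1}, {"i32.iv", 32, 5, 1}};

  unsigned WidestBits = 0;
  const IVCandidate *Canonical = nullptr;
  for (const IVCandidate &IV : IVs) {
    WidestBits = std::max(WidestBits, IV.BitWidth); // widest induction type so far
    if (IV.Start != 0 || IV.Step != 1)
      continue;                                     // not zero-based, unit-stride
    if (!Canonical || IV.BitWidth == WidestBits)
      Canonical = &IV;                              // last widest-typed one wins
  }
  if (Canonical)
    std::printf("canonical IV: %s\n", Canonical->Name.c_str()); // i64.iv
  return 0;
}
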
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *Header = TheLoop->getHeader();
// Look for the attribute signaling the absence of NaNs.
Function &F = *Header->getParent();
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (F.hasFnAttribute("no-nans-fp-math"))
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
// For each block in the loop.
- for (Loop::block_iterator bb = TheLoop->block_begin(),
- be = TheLoop->block_end(); bb != be; ++bb) {
-
+ for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the instructions in the block and look for hazards.
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
- ++it) {
-
- if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
Type *PhiTy = Phi->getType();
// Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() &&
- !PhiTy->isFloatingPointTy() &&
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
- emitAnalysis(VectorizationReport(&*it)
+ emitAnalysis(VectorizationReport(Phi)
<< "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
return false;
@@ -4242,61 +4595,25 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// If this PHINode is not in the header block, then we know that we
// can convert it to select during if-conversion. No need to check if
// the PHIs in this block are induction or reduction variables.
- if (*bb != Header) {
+ if (BB != Header) {
// Check that this instruction has no outside users or is an
// identified reduction value with an outside user.
- if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
+ if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
continue;
- emitAnalysis(VectorizationReport(&*it) <<
- "value could not be identified as "
- "an induction or reduction variable");
+ emitAnalysis(VectorizationReport(Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
return false;
}
// We only allow if-converted PHIs with exactly two incoming values.
if (Phi->getNumIncomingValues() != 2) {
- emitAnalysis(VectorizationReport(&*it)
+ emitAnalysis(VectorizationReport(Phi)
<< "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) {
- Inductions[Phi] = ID;
- // Get the widest type.
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!Induction || PhiTy == WidestIndTy)
- Induction = Phi;
- }
-
- DEBUG(dbgs() << "LV: Found an induction variable.\n");
-
- // Until we explicitly handle the case of an induction variable with
- // an outside loop user we have to give up vectorizing this loop.
- if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
- emitAnalysis(VectorizationReport(&*it) <<
- "use of induction value outside of the "
- "loop is not handled by vectorizer");
- return false;
- }
-
- continue;
- }
-
RecurrenceDescriptor RedDes;
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
if (RedDes.hasUnsafeAlgebra())
@@ -4306,22 +4623,41 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- emitAnalysis(VectorizationReport(&*it) <<
- "value that could not be identified as "
- "reduction is used outside the loop");
- DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ emitAnalysis(VectorizationReport(Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
+ DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
return false;
- }// end of PHI handling
+ } // end of PHI handling
// We handle calls that:
// * Are debug info intrinsics.
// * Have a mapping to an IR intrinsic.
// * Have a vector version available.
- CallInst *CI = dyn_cast<CallInst>(it);
- if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- emitAnalysis(VectorizationReport(&*it)
+ emitAnalysis(VectorizationReport(CI)
<< "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
@@ -4329,11 +4665,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
// second argument is the same (i.e. loop invariant)
- if (CI &&
- hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+ if (CI && hasVectorInstrinsicScalarOpd(
+ getVectorIntrinsicIDForCall(CI, TLI), 1)) {
auto *SE = PSE.getSE();
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
- emitAnalysis(VectorizationReport(&*it)
+ emitAnalysis(VectorizationReport(CI)
<< "intrinsic instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
@@ -4342,40 +4678,44 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(it->getType()) &&
- !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
- emitAnalysis(VectorizationReport(&*it)
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ emitAnalysis(VectorizationReport(&I)
<< "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
// Check that the stored type is vectorizable.
- if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
- emitAnalysis(VectorizationReport(ST) <<
- "store instruction cannot be vectorized");
+ emitAnalysis(VectorizationReport(ST)
+ << "store instruction cannot be vectorized");
return false;
}
- if (EnableMemAccessVersioning)
- collectStridedAccess(ST);
- }
- if (EnableMemAccessVersioning)
- if (LoadInst *LI = dyn_cast<LoadInst>(it))
- collectStridedAccess(LI);
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.hasUnsafeAlgebra()) {
+ DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
- emitAnalysis(VectorizationReport(&*it) <<
- "value cannot be used outside the loop");
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ emitAnalysis(VectorizationReport(&I)
+ << "value cannot be used outside the loop");
return false;
}
} // next instr.
-
}
if (!Induction) {
@@ -4396,64 +4736,90 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}
-void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
- Value *Ptr = nullptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
- Ptr = LI->getPointerOperand();
- else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
- Ptr = SI->getPointerOperand();
- else
- return;
-
- Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);
- if (!Stride)
- return;
-
- DEBUG(dbgs() << "LV: Found a strided access that we can version");
- DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
- Strides[Ptr] = Stride;
- StrideSet.insert(Stride);
-}
-
void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!
// Collect variables that will remain uniform after vectorization.
- std::vector<Value*> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
- // Start with the conditional branch and walk up the block.
- Worklist.push_back(Latch->getTerminator()->getOperand(0));
+ // If V is not an instruction inside the current loop, it is a Value
+ // outside of the scope that we are interested in.
+ auto isOutOfScope = [&](Value *V) -> bool {
+ Instruction *I = dyn_cast<Instruction>(V);
+ return (!I || !TheLoop->contains(I));
+ };
+
+ SetVector<Instruction *> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ // Start with the conditional branch.
+ if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {
+ Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ Worklist.insert(Cmp);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ }
// Also add all consecutive pointer values; these values will be uniform
- // after vectorization (and subsequent cleanup) and, until revectorization is
- // supported, all dependencies must also be uniform.
- for (Loop::block_iterator B = TheLoop->block_begin(),
- BE = TheLoop->block_end(); B != BE; ++B)
- for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
- I != IE; ++I)
- if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))
- Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
-
- while (!Worklist.empty()) {
- Instruction *I = dyn_cast<Instruction>(Worklist.back());
- Worklist.pop_back();
-
- // Look at instructions inside this loop.
- // Stop when reaching PHI nodes.
- // TODO: we need to follow values all over the loop, not only in this block.
- if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
- continue;
+ // after vectorization (and subsequent cleanup).
+ for (auto *BB : TheLoop->blocks()) {
+ for (auto &I : *BB) {
+ if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) {
+ Worklist.insert(&I);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n");
+ }
+ }
+ }
- // This is a known uniform.
- Uniforms.insert(I);
+ // Expand Worklist in topological order: whenever a new instruction
+ // is added, its users should be either already inside Worklist, or
+ // out of scope. This ensures a uniform instruction will only be used
+ // by uniform instructions or out-of-scope instructions.
+ unsigned idx = 0;
+ do {
+ Instruction *I = Worklist[idx++];
- // Insert all operands.
- Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+ for (auto OV : I->operand_values()) {
+ if (isOutOfScope(OV))
+ continue;
+ auto *OI = cast<Instruction>(OV);
+ if (all_of(OI->users(), [&](User *U) -> bool {
+ return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
+ })) {
+ Worklist.insert(OI);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ }
+ }
+ } while (idx != Worklist.size());
+
+ // For an instruction to be added into Worklist above, all its users inside
+ // the current loop should already have been added into Worklist. This
+ // condition cannot hold for PHI instructions, which are always part of a
+ // dependence cycle. Because every instruction in such a cycle depends on the
+ // others being added to Worklist first, none of them would ever be added.
+ // That is why we process PHIs separately.
+ for (auto &Induction : *getInductionVars()) {
+ auto *PN = Induction.first;
+ auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
+ if (all_of(PN->users(),
+ [&](User *U) -> bool {
+ return U == UpdateV || isOutOfScope(U) ||
+ Worklist.count(cast<Instruction>(U));
+ }) &&
+ all_of(UpdateV->users(), [&](User *U) -> bool {
+ return U == PN || isOutOfScope(U) ||
+ Worklist.count(cast<Instruction>(U));
+ })) {
+ Worklist.insert(cast<Instruction>(PN));
+ Worklist.insert(cast<Instruction>(UpdateV));
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n");
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n");
+ }
}
+
+ Uniforms.insert(Worklist.begin(), Worklist.end());
}
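
The worklist expansion above only pulls an operand in once every in-loop user of it is already known uniform or out of scope; PHIs get the separate treatment described in the comment. A toy rendering of that closure in plain C++, with an invented operand graph, no LLVM types, and the PHI special-casing omitted:

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy "IR": cmp uses add; add uses phi and step; gep uses phi; the store's
  // pointer computation uses gep. All names are invented.
  std::map<std::string, std::vector<std::string>> Operands = {
      {"cmp", {"add"}}, {"add", {"phi", "step"}}, {"gep", {"phi"}},
      {"phi", {}},      {"step", {}},             {"store.ptr", {"gep"}}};

  // Derive user lists from the operand lists.
  std::map<std::string, std::vector<std::string>> Users;
  for (auto &P : Operands)
    for (auto &Op : P.second)
      Users[Op].push_back(P.first);

  // Seed with the latch compare and the consecutive pointer computations.
  std::vector<std::string> Worklist = {"cmp", "gep", "store.ptr"};
  std::set<std::string> InList(Worklist.begin(), Worklist.end());

  for (size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    for (const std::string &Op : Operands[Worklist[Idx]]) {
      if (InList.count(Op))
        continue;
      bool AllUsersUniform = true;
      for (const std::string &U : Users[Op])
        if (!InList.count(U)) {
          AllUsersUniform = false; // a non-uniform user remains, do not add
          break;
        }
      if (AllUsersUniform) {
        Worklist.push_back(Op);
        InList.insert(Op);
      }
    }
  }
  for (const std::string &N : Worklist)
    std::printf("uniform: %s\n", N.c_str());
  return 0;
}
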
bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &LAA->getInfo(TheLoop, Strides);
+ LAI = &(*GetLAA)(*TheLoop);
+ InterleaveInfo.setLAI(LAI);
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
@@ -4469,13 +4835,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->PSE.getUnionPredicate());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
return true;
}
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- Value *In0 = const_cast<Value*>(V);
+ Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
return false;
@@ -4483,67 +4849,73 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
return Inductions.count(PN);
}
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
-bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
- SmallPtrSetImpl<Value *> &SafePtrs) {
-
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
// Check that we don't have a constant expression that can trap as operand.
- for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
- OI != OE; ++OI) {
- if (Constant *C = dyn_cast<Constant>(*OI))
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
if (C->canTrap())
return false;
}
// We might be able to hoist the load.
- if (it->mayReadFromMemory()) {
- LoadInst *LI = dyn_cast<LoadInst>(it);
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
- if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
+ if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
+ isLegalMaskedGather(LI->getType())) {
MaskedOp.insert(LI);
continue;
}
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ if (IsAnnotatedParallel)
+ continue;
return false;
}
}
// We don't predicate stores at the moment.
- if (it->mayWriteToMemory()) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
// We only support predication of stores in basic blocks with one
// predecessor.
if (!SI)
return false;
+ // Build a masked store if it is legal for the target.
+ if (isLegalMaskedStore(SI->getValueOperand()->getType(),
+ SI->getPointerOperand()) ||
+ isLegalMaskedScatter(SI->getValueOperand()->getType())) {
+ MaskedOp.insert(SI);
+ continue;
+ }
+
bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
+
if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
- !isSinglePredecessor) {
- // Build a masked store if it is legal for the target, otherwise
- // scalarize the block.
- bool isLegalMaskedOp =
- isLegalMaskedStore(SI->getValueOperand()->getType(),
- SI->getPointerOperand());
- if (isLegalMaskedOp) {
- --NumPredStores;
- MaskedOp.insert(SI);
- continue;
- }
+ !isSinglePredecessor)
return false;
- }
}
- if (it->mayThrow())
+ if (I.mayThrow())
return false;
// The instructions below can trap.
- switch (it->getOpcode()) {
- default: continue;
+ switch (I.getOpcode()) {
+ default:
+ continue;
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
@@ -4555,199 +4927,273 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
return true;
}
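
A condensed sketch of the store-predication decision above: a store in a predicated block becomes a masked store (or scatter) when the target supports it; otherwise it is only tolerated if its pointer is known safe, the block has a single predecessor, and the predicated-store budget is not exceeded. The booleans below are stand-ins for the target and legality queries, not the real interfaces.

#include <cstdio>

struct StoreFacts {
  bool MaskedStoreLegal;  // stand-in for isLegalMaskedStore / isLegalMaskedScatter
  bool SafePtr;           // pointer proven dereferenceable on all paths
  bool SinglePredecessor; // block has exactly one predecessor
};

static bool canPredicateStore(const StoreFacts &S, unsigned &NumPredStores,
                              unsigned Budget) {
  if (S.MaskedStoreLegal)
    return true; // emit a masked store, no scalarization needed
  ++NumPredStores;
  return NumPredStores <= Budget && S.SafePtr && S.SinglePredecessor;
}

int main() {
  unsigned NumPredStores = 0;
  StoreFacts A = {true, false, false}; // masked store available
  StoreFacts B = {false, true, true};  // plain store, but safe and simple
  StoreFacts C = {false, false, true}; // unsafe pointer
  std::printf("A=%d B=%d C=%d\n", canPredicateStore(A, NumPredStores, 8),
              canPredicateStore(B, NumPredStores, 8),
              canPredicateStore(C, NumPredStores, 8)); // A=1 B=1 C=0
  return 0;
}
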
-void InterleavedAccessInfo::collectConstStridedAccesses(
- MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
+void InterleavedAccessInfo::collectConstStrideAccesses(
+ MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
- // Holds load/store instructions in program order.
- SmallVector<Instruction *, 16> AccessList;
- for (auto *BB : TheLoop->getBlocks()) {
- bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+ auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ // Since it's desired that the load/store instructions be maintained in
+ // "program order" for the interleaved access analysis, we have to visit the
+ // blocks in the loop in reverse postorder (i.e., in a topological order).
+ // Such an ordering will ensure that any load/store that may be executed
+ // before a second load/store will precede the second load/store in
+ // AccessStrideInfo.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
for (auto &I : *BB) {
- if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!LI && !SI)
continue;
- // FIXME: Currently we can't handle mixed accesses and predicated accesses
- if (IsPred)
- return;
-
- AccessList.push_back(&I);
- }
- }
-
- if (AccessList.empty())
- return;
-
- auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- for (auto I : AccessList) {
- LoadInst *LI = dyn_cast<LoadInst>(I);
- StoreInst *SI = dyn_cast<StoreInst>(I);
-
- Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
- int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);
-
- // The factor of the corresponding interleave group.
- unsigned Factor = std::abs(Stride);
- // Ignore the access if the factor is too small or too large.
- if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
- continue;
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
- const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
- PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
- unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
+ const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
- // An alignment of 0 means target ABI alignment.
- unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(PtrTy->getElementType());
+ // An alignment of 0 means target ABI alignment.
+ unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(PtrTy->getElementType());
- StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
- }
+ AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
+ }
}
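
collectConstStrideAccesses above depends on visiting blocks in reverse postorder so that the recorded accesses follow program order. A self-contained sketch of computing RPO for a tiny diamond-shaped loop body, using invented block names and plain C++ rather than LoopBlocksDFS:

#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

static void postOrder(const std::string &BB,
                      const std::map<std::string, std::vector<std::string>> &Succs,
                      std::set<std::string> &Visited,
                      std::vector<std::string> &Out) {
  if (!Visited.insert(BB).second)
    return;
  auto It = Succs.find(BB);
  if (It != Succs.end())
    for (const std::string &S : It->second)
      postOrder(S, Succs, Visited, Out);
  Out.push_back(BB); // finished: append in postorder
}

int main() {
  // header -> {then, else}; then -> {latch}; else -> {latch}
  std::map<std::string, std::vector<std::string>> Succs = {
      {"header", {"then", "else"}}, {"then", {"latch"}}, {"else", {"latch"}}};

  std::set<std::string> Visited;
  std::vector<std::string> PO;
  postOrder("header", Succs, Visited, PO);
  std::reverse(PO.begin(), PO.end()); // reverse postorder = a topological order

  for (const std::string &BB : PO)
    std::printf("%s\n", BB.c_str()); // header before then/else, latch last
  return 0;
}
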
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
+//
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
//
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// E.g., for the WAR dependence: a = A[i]; // (1)
+// A[i] = b; // (2)
//
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
+//
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
+//
+// We visit the memory accesses in bottom-up order because it can simplify the
+// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
-// E.g. The RAW dependence: A[i] = a;
-// b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAW dependence: A[i] = a; // (1)
+// A[i] = b; // (2)
+// A[i + 1] = c; // (3)
//
-// E.g. The WAR dependence: a = A[i]; // (1)
-// A[i] = b; // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// We will first create a store group with (3) and (2). (1) can't be added to
+// this group because it and (2) are dependent. However, (1) can be grouped
+// with other accesses that may precede it in program order. Note that a
+// bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
const ValueToValueMap &Strides) {
DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
- // Holds all the stride accesses.
- MapVector<Instruction *, StrideDescriptor> StrideAccesses;
- collectConstStridedAccesses(StrideAccesses, Strides);
+ // Holds all accesses with a constant stride.
+ MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
+ collectConstStrideAccesses(AccessStrideInfo, Strides);
- if (StrideAccesses.empty())
+ if (AccessStrideInfo.empty())
return;
+ // Collect the dependences in the loop.
+ collectDependences();
+
// Holds all interleaved store groups temporarily.
SmallSetVector<InterleaveGroup *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
SmallSetVector<InterleaveGroup *, 4> LoadGroups;
- // Search the load-load/write-write pair B-A in bottom-up order and try to
- // insert B into the interleave group of A according to 3 rules:
- // 1. A and B have the same stride.
- // 2. A and B have the same memory object size.
- // 3. B belongs to the group according to the distance.
+ // Search in bottom-up program order for pairs of accesses (A and B) that can
+ // form interleaved load or store groups. In the algorithm below, access A
+ // precedes access B in program order. We initialize a group for B in the
+ // outer loop of the algorithm, and then in the inner loop, we attempt to
+ // insert each A into B's group if:
+ //
+ // 1. A and B have the same stride,
+ // 2. A and B have the same memory object size, and
+ // 3. A belongs in B's group according to its distance from B.
//
- // The bottom-up order can avoid breaking the Write-After-Write dependences
- // between two pointers of the same base.
- // E.g. A[i] = a; (1)
- // A[i] = b; (2)
- // A[i+1] = c (3)
- // We form the group (2)+(3) in front, so (1) has to form groups with accesses
- // above (1), which guarantees that (1) is always above (2).
- for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
- ++I) {
- Instruction *A = I->first;
- StrideDescriptor DesA = I->second;
-
- InterleaveGroup *Group = getInterleaveGroup(A);
- if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
- Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ // Special care is taken to ensure group formation will not break any
+ // dependences.
+ for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
+ BI != E; ++BI) {
+ Instruction *B = BI->first;
+ StrideDescriptor DesB = BI->second;
+
+ // Initialize a group for B if it has an allowable stride. Even if we don't
+ // create a group for B, we continue with the bottom-up algorithm to ensure
+ // we don't break any of B's dependences.
+ InterleaveGroup *Group = nullptr;
+ if (isStrided(DesB.Stride)) {
+ Group = getInterleaveGroup(B);
+ if (!Group) {
+ DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
+ Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
+ }
+ if (B->mayWriteToMemory())
+ StoreGroups.insert(Group);
+ else
+ LoadGroups.insert(Group);
}
- if (A->mayWriteToMemory())
- StoreGroups.insert(Group);
- else
- LoadGroups.insert(Group);
+ for (auto AI = std::next(BI); AI != E; ++AI) {
+ Instruction *A = AI->first;
+ StrideDescriptor DesA = AI->second;
+
+ // Our code motion strategy implies that we can't have dependences
+ // between accesses in an interleaved group and other accesses located
+ // between the first and last member of the group. Note that this also
+ // means that a group can't have more than one member at a given offset.
+ // The accesses in a group can have dependences with other accesses, but
+ // we must ensure we don't extend the boundaries of the group such that
+ // we encompass those dependent accesses.
+ //
+ // For example, assume we have the sequence of accesses shown below in a
+ // stride-2 loop:
+ //
+ // (1, 2) is a group | A[i] = a; // (1)
+ // | A[i-1] = b; // (2) |
+ // A[i-3] = c; // (3)
+ // A[i] = d; // (4) | (2, 4) is not a group
+ //
+ // Because accesses (2) and (3) are dependent, we can group (2) with (1)
+ // but not with (4). If we did, the dependent access (3) would be within
+ // the boundaries of the (2, 4) group.
+ if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
+
+ // If a dependence exists and A is already in a group, we know that A
+ // must be a store since A precedes B and WAR dependences are allowed.
+ // Thus, A would be sunk below B. We release A's group to prevent this
+ // illegal code motion. A will then be free to form another group with
+ // instructions that precede it.
+ if (isInterleaved(A)) {
+ InterleaveGroup *StoreGroup = getInterleaveGroup(A);
+ StoreGroups.remove(StoreGroup);
+ releaseGroup(StoreGroup);
+ }
- for (auto II = std::next(I); II != E; ++II) {
- Instruction *B = II->first;
- StrideDescriptor DesB = II->second;
+ // If a dependence exists and A is not already in a group (or it was
+ // and we just released it), B might be hoisted above A (if B is a
+ // load) or another store might be sunk below A (if B is a store). In
+ // either case, we can't add additional instructions to B's group. B
+ // will only form a group with instructions that it precedes.
+ break;
+ }
- // Ignore if B is already in a group or B is a different memory operation.
- if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ // At this point, we've checked for illegal code motion. If either A or B
+ // isn't strided, there's nothing left to do.
+ if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
continue;
- // Check the rule 1 and 2.
- if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
+ // Ignore A if it's already in a group or isn't the same kind of memory
+ // operation as B.
+ if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
continue;
- // Calculate the distance and prepare for the rule 3.
- const SCEVConstant *DistToA = dyn_cast<SCEVConstant>(
- PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));
- if (!DistToA)
+ // Check rules 1 and 2. Ignore A if its stride or size is different from
+ // that of B.
+ if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
continue;
- int DistanceToA = DistToA->getAPInt().getSExtValue();
+ // Calculate the distance from A to B.
+ const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
+ if (!DistToB)
+ continue;
+ int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
+
+ // Check rule 3. Ignore A if its distance to B is not a multiple of the
+ // size.
+ if (DistanceToB % static_cast<int64_t>(DesB.Size))
+ continue;
- // Skip if the distance is not multiple of size as they are not in the
- // same group.
- if (DistanceToA % static_cast<int>(DesA.Size))
+ // Ignore A if either A or B is in a predicated block. Although we
+ // currently prevent group formation for predicated accesses, we may be
+ // able to relax this limitation in the future once we handle more
+ // complicated blocks.
+ if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
continue;
- // The index of B is the index of A plus the related index to A.
- int IndexB =
- Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
+ // The index of A is the index of B plus A's distance to B in multiples
+ // of the size.
+ int IndexA =
+ Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
- // Try to insert B into the group.
- if (Group->insertMember(B, IndexB, DesB.Align)) {
- DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
- << " into the interleave group with" << *A << '\n');
- InterleaveGroupMap[B] = Group;
+ // Try to insert A into B's group.
+ if (Group->insertMember(A, IndexA, DesA.Align)) {
+ DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
+ << " into the interleave group with" << *B << '\n');
+ InterleaveGroupMap[A] = Group;
// Set the first load in program order as the insert position.
- if (B->mayReadFromMemory())
- Group->setInsertPos(B);
+ if (A->mayReadFromMemory())
+ Group->setInsertPos(A);
}
- } // Iteration on instruction B
- } // Iteration on instruction A
+ } // Iteration over A accesses.
+ } // Iteration over B accesses.
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
- // Remove interleaved load groups that don't have the first and last member.
- // This guarantees that we won't do speculative out of bounds loads.
+ // If there is a non-reversed interleaved load group with gaps, we will need
+ // to execute at least one scalar epilogue iteration. This will ensure that
+ // we don't speculatively access memory out-of-bounds. Note that we only need
+ // to look for a member at index factor - 1, since every group must have a
+ // member at index zero.
for (InterleaveGroup *Group : LoadGroups)
- if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1))
- releaseGroup(Group);
+ if (!Group->getMember(Group->getFactor() - 1)) {
+ if (Group->isReverse()) {
+ releaseGroup(Group);
+ } else {
+ DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ RequiresScalarEpilogue = true;
+ }
+ }
}
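
To make rules (1)-(3) above concrete, here is a simplified standalone sketch: starting from the bottom access B, an earlier access A joins B's group when stride and element size match and A's byte distance to B is a multiple of the element size, with the member index given by that quotient. The dependence, predication, and gap checks of the real analysis are intentionally left out, and all names and offsets are invented.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Access {
  std::string Name; // invented label
  long long Offset; // constant byte offset of the address within one iteration
  int Stride;       // interleave factor (stride in elements)
  unsigned Size;    // element size in bytes
};

int main() {
  // A stride-3 loop reading A[3*i], A[3*i+1], A[3*i+2] as 4-byte elements.
  std::vector<Access> InProgramOrder = {
      {"load.0", 0, 3, 4}, {"load.1", 4, 3, 4}, {"load.2", 8, 3, 4}};

  // Bottom-up: the last access in program order seeds the group at index 0.
  const Access &B = InProgramOrder.back();
  std::map<int, std::string> Group = {{0, B.Name}};

  for (auto It = InProgramOrder.rbegin() + 1; It != InProgramOrder.rend(); ++It) {
    const Access &A = *It;
    if (A.Stride != B.Stride || A.Size != B.Size)
      continue;                                      // rules (1) and (2)
    long long DistanceToB = A.Offset - B.Offset;
    if (DistanceToB % static_cast<long long>(B.Size))
      continue;                                      // rule (3)
    int IndexA = static_cast<int>(DistanceToB / static_cast<long long>(B.Size));
    if (!Group.count(IndexA))                        // one member per offset
      Group[IndexA] = A.Name;
  }

  // Indices are relative to the seed access; negative means an earlier element.
  for (auto &Member : Group)
    std::printf("index %d: %s\n", Member.first, Member.second.c_str());
  return 0;
}
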
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Width 1 means no vectorize
- VectorizationFactor Factor = { 1U, 0U };
+ VectorizationFactor Factor = {1U, 0U};
if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
- emitAnalysis(VectorizationReport() <<
- "runtime pointer checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz");
- DEBUG(dbgs() <<
- "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ emitAnalysis(
+ VectorizationReport()
+ << "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ DEBUG(dbgs()
+ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return Factor;
}
if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- emitAnalysis(VectorizationReport() <<
- "store that is conditionally executed prevents vectorization");
+ emitAnalysis(
+ VectorizationReport()
+ << "store that is conditionally executed prevents vectorization");
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
return Factor;
}
// Find the trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@@ -4755,16 +5201,25 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
+
+ // Get the maximum safe dependence distance in bits computed by LAA. If the
+ // loop contains any interleaved accesses, we divide the dependence distance
+ // by the maximum interleave factor of all interleaved groups. Note that
+ // although the division ensures correctness, this is a fairly conservative
+ // computation because the maximum distance computed by LAA may not involve
+ // any of the interleaved accesses.
if (Legal->getMaxSafeDepDistBytes() != -1U)
- MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
- WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
- WidestRegister : MaxSafeDepDist);
+ MaxSafeDepDist =
+ Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
+
+ WidestRegister =
+ ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
<< WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register is: "
- << WidestRegister << " bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
+ << " bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -4772,7 +5227,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
}
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
- " into one vector!");
+ " into one vector!");
unsigned VF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
@@ -4800,9 +5255,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
if (OptForSize) {
// If we are unable to calculate the trip count then don't try to vectorize.
if (TC < 2) {
- emitAnalysis
- (VectorizationReport() <<
- "unable to calculate the loop count due to complex control flow");
+ emitAnalysis(
+ VectorizationReport()
+ << "unable to calculate the loop count due to complex control flow");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
@@ -4815,11 +5270,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
else {
// If the trip count that we found modulo the vectorization factor is not
// zero then we require a tail.
- emitAnalysis(VectorizationReport() <<
- "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
+ emitAnalysis(VectorizationReport()
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
@@ -4834,7 +5289,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
return Factor;
}
- float Cost = expectedCost(1);
+ float Cost = expectedCost(1).first;
#ifndef NDEBUG
const float ScalarCost = Cost;
#endif /* NDEBUG */
@@ -4845,16 +5300,23 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && VF > 1) {
Width = 2;
- Cost = expectedCost(Width) / (float)Width;
+ Cost = expectedCost(Width).first / (float)Width;
}
- for (unsigned i=2; i <= VF; i*=2) {
+ for (unsigned i = 2; i <= VF; i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
- float VectorCost = expectedCost(i) / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
- (int)VectorCost << ".\n");
+ VectorizationCostTy C = expectedCost(i);
+ float VectorCost = C.first / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
+ if (!C.second && !ForceVectorization) {
+ DEBUG(
+ dbgs() << "LV: Not considering vector loop of width " << i
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
@@ -4864,7 +5326,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
- DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
+ DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
Factor.Width = Width;
Factor.Cost = Width * Cost;
return Factor;
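
The width selection above boils down to comparing per-lane costs: the scalar cost is expectedCost(1), each candidate width i is charged expectedCost(i)/i, and the cheapest width wins. A toy version of that arithmetic follows; the cost numbers are made up purely to show the calculation and are not produced by the real cost model.

#include <cstdio>
#include <map>

int main() {
  // Hypothetical expectedCost(VF) results for one loop.
  std::map<unsigned, float> ExpectedCost = {{1, 40}, {2, 28}, {4, 20}, {8, 22}};
  const unsigned MaxVF = 8;

  float Cost = ExpectedCost[1]; // scalar cost per "lane"
  unsigned Width = 1;
  for (unsigned I = 2; I <= MaxVF; I *= 2) {
    float VectorCost = ExpectedCost[I] / static_cast<float>(I);
    std::printf("width %u costs %.2f per lane\n", I, VectorCost);
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = I;
    }
  }
  std::printf("selected VF %u, total cost %.2f\n", Width, Width * Cost);
  return 0;
}
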
@@ -4877,25 +5339,22 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
// For each block.
- for (Loop::block_iterator bb = TheLoop->block_begin(),
- be = TheLoop->block_end(); bb != be; ++bb) {
- BasicBlock *BB = *bb;
-
+ for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Type *T = it->getType();
+ for (Instruction &I : *BB) {
+ Type *T = I.getType();
// Skip ignored values.
- if (ValuesToIgnore.count(&*it))
+ if (ValuesToIgnore.count(&I))
continue;
// Only examine Loads, Stores and PHINodes.
- if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
continue;
// Examine PHI nodes that are reduction variables. Update the type to
// account for the recurrence type.
- if (PHINode *PN = dyn_cast<PHINode>(it)) {
+ if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
@@ -4903,13 +5362,13 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
}
// Examine the stored values.
- if (StoreInst *ST = dyn_cast<StoreInst>(it))
+ if (auto *ST = dyn_cast<StoreInst>(&I))
T = ST->getValueOperand()->getType();
// Ignore loaded pointer types and stored pointer types that are not
// consecutive. However, we do want to take consecutive stores/loads of
// pointer vectors into account.
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -4949,13 +5408,13 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
// Do not interleave loops with a relatively small trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
- " registers\n");
+ DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
@@ -5002,7 +5461,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
- LoopCost = expectedCost(VF);
+ LoopCost = expectedCost(VF).first;
// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target allows.
@@ -5044,8 +5503,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (Legal->getReductionVars()->size() &&
- TheLoop->getLoopDepth() > 1) {
+ if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -5075,8 +5533,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(
- const SmallVector<unsigned, 8> &VFs) {
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
@@ -5103,31 +5560,30 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
- typedef DenseMap<Instruction*, unsigned> IntervalMap;
+ typedef DenseMap<Instruction *, unsigned> IntervalMap;
// Maps instruction to its index.
- DenseMap<unsigned, Instruction*> IdxToInstr;
+ DenseMap<unsigned, Instruction *> IdxToInstr;
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
- SmallSet<Instruction*, 8> Ends;
+ SmallSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
- SmallPtrSet<Value*, 8> LoopInvariants;
+ SmallPtrSet<Value *, 8> LoopInvariants;
unsigned Index = 0;
- for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
- be = DFS.endRPO(); bb != be; ++bb) {
- RU.NumInstructions += (*bb)->size();
- for (Instruction &I : **bb) {
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ RU.NumInstructions += BB->size();
+ for (Instruction &I : *BB) {
IdxToInstr[Index++] = &I;
// Save the end location of each USE.
- for (unsigned i = 0; i < I.getNumOperands(); ++i) {
- Value *U = I.getOperand(i);
- Instruction *Instr = dyn_cast<Instruction>(U);
+ for (Value *U : I.operands()) {
+ auto *Instr = dyn_cast<Instruction>(U);
// Ignore non-instruction values such as arguments, constants, etc.
- if (!Instr) continue;
+ if (!Instr)
+ continue;
// If this instruction is outside the loop then record it and continue.
if (!TheLoop->contains(Instr)) {
@@ -5143,15 +5599,14 @@ LoopVectorizationCostModel::calculateRegisterUsage(
}
// Saves the list of intervals that end with the index in 'key'.
- typedef SmallVector<Instruction*, 2> InstrList;
+ typedef SmallVector<Instruction *, 2> InstrList;
DenseMap<unsigned, InstrList> TransposeEnds;
// Transpose the EndPoints to a list of values that end at each index.
- for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
- it != e; ++it)
- TransposeEnds[it->second].push_back(it->first);
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
- SmallSet<Instruction*, 8> OpenIntervals;
+ SmallSet<Instruction *, 8> OpenIntervals;
// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
@@ -5168,6 +5623,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+ if (Ty->isTokenTy())
+ return 0U;
unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
};
@@ -5175,16 +5632,17 @@ LoopVectorizationCostModel::calculateRegisterUsage(
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
// Ignore instructions that are never used within the loop.
- if (!Ends.count(I)) continue;
-
- // Skip ignored values.
- if (ValuesToIgnore.count(I))
+ if (!Ends.count(I))
continue;
// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
- for (unsigned int j = 0, e = List.size(); j < e; ++j)
- OpenIntervals.erase(List[j]);
+ for (Instruction *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
@@ -5195,8 +5653,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// Count the number of live intervals.
unsigned RegUsage = 0;
- for (auto Inst : OpenIntervals)
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst))
+ continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+ }
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
@@ -5216,7 +5678,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(
Invariant += GetRegUsage(Inst->getType(), VFs[i]);
}
- DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
@@ -5229,48 +5691,62 @@ LoopVectorizationCostModel::calculateRegisterUsage(
return RUs;
}
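
The interval bookkeeping above is hard to follow from the diff alone, so here is the same idea as a standalone sketch (plain C++ containers, hypothetical names, no LLVM types): instructions are numbered in reverse post-order, each use extends its operand's interval, and the register pressure at index I is the number of intervals still open there.

#include <map>
#include <set>
#include <vector>

// LastUse[Def] is the index of the last use of the value defined at Def
// (a value that is never used again simply has LastUse[Def] == Def).
// Walking indices in order: close intervals that end here, record how many
// remain open, then open the interval for the value defined here.
static std::vector<unsigned>
registerPressure(const std::vector<unsigned> &LastUse) {
  std::map<unsigned, std::vector<unsigned>> EndsAt;
  for (unsigned Def = 0; Def < LastUse.size(); ++Def)
    EndsAt[LastUse[Def]].push_back(Def);

  std::set<unsigned> Open;
  std::vector<unsigned> Pressure(LastUse.size(), 0);
  for (unsigned I = 0; I < LastUse.size(); ++I) {
    for (unsigned Def : EndsAt[I])
      Open.erase(Def);
    Pressure[I] = Open.size();
    Open.insert(I);
  }
  return Pressure;
}
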
-unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
- unsigned Cost = 0;
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ VectorizationCostTy Cost;
// For each block.
- for (Loop::block_iterator bb = TheLoop->block_begin(),
- be = TheLoop->block_end(); bb != be; ++bb) {
- unsigned BlockCost = 0;
- BasicBlock *BB = *bb;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ VectorizationCostTy BlockCost;
// For each instruction in the old loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ for (Instruction &I : *BB) {
// Skip dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(it))
+ if (isa<DbgInfoIntrinsic>(I))
continue;
// Skip ignored values.
- if (ValuesToIgnore.count(&*it))
+ if (ValuesToIgnore.count(&I))
continue;
- unsigned C = getInstructionCost(&*it, VF);
+ VectorizationCostTy C = getInstructionCost(&I, VF);
// Check if we should override the cost.
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
- C = ForceTargetInstructionCost;
+ C.first = ForceTargetInstructionCost;
- BlockCost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
- VF << " For instruction: " << *it << '\n');
+ BlockCost.first += C.first;
+ BlockCost.second |= C.second;
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
+ << VF << " For instruction: " << I << '\n');
}
// We assume that if-converted blocks have a 50% chance of being executed.
// When the code is scalar then some of the blocks are avoided due to CF.
// When the code is vectorized we execute all code paths.
- if (VF == 1 && Legal->blockNeedsPredication(*bb))
- BlockCost /= 2;
+ if (VF == 1 && Legal->blockNeedsPredication(BB))
+ BlockCost.first /= 2;
- Cost += BlockCost;
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
}
return Cost;
}
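
The signature change above threads a second piece of information through the cost computation: alongside the numeric cost, a flag recording whether any instruction in the block keeps a genuinely vector (non-scalarized) type. A minimal standalone model of how those pairs are combined per block (hypothetical names, plain C++):

#include <utility>
#include <vector>

using CostTy = std::pair<unsigned, bool>; // (cost, saw a non-scalarized type)

static CostTy blockCost(const std::vector<CostTy> &PerInstruction,
                        bool BlockIsPredicated, bool Scalar) {
  CostTy Block{0, false};
  for (const CostTy &C : PerInstruction) {
    Block.first += C.first;   // costs add up
    Block.second |= C.second; // the flag is sticky
  }
  // If-converted blocks are assumed to execute half the time in scalar code.
  if (Scalar && BlockIsPredicated)
    Block.first /= 2;
  return Block;
}
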
+/// \brief Check if the load/store instruction \p I may be translated into
+/// gather/scatter during vectorization.
+///
+/// Pointer \p Ptr specifies the address in memory for the given scalar memory
+/// instruction; we need it to retrieve the data type.
+/// Using gather/scatter is possible only when it is supported by the target.
+static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr,
+ LoopVectorizationLegality *Legal) {
+ auto *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
+ return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) ||
+ (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy));
+}
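
For context, this is the shape of access the new helper is asked about: a per-lane, non-consecutive address. When the target reports masked-gather support (or masked-scatter support, for stores) for the element type, the cost model can price the access as one wide operation instead of a scalarized sequence. A generic, hedged example:

// Indexed load: the addresses a[idx[i]] are not consecutive, so vectorizing
// this loop needs either scalarization or a hardware gather.
void gatherCandidate(float *out, const float *a, const int *idx, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = a[idx[i]];
}
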
+
/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
@@ -5284,7 +5760,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
LoopVectorizationLegality *Legal,
ScalarEvolution *SE,
const Loop *TheLoop) {
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return true;
@@ -5309,7 +5785,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
// Check the step is constant.
const SCEV *Step = AddRec->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ const auto *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return true;
@@ -5329,17 +5805,29 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
-unsigned
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (Legal->isUniformAfterVectorization(I))
VF = 1;
+ Type *VectorTy;
+ unsigned C = getInstructionCost(I, VF, VectorTy);
+
+ bool TypeNotScalarized =
+ VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
+unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ unsigned VF,
+ Type *&VectorTy) {
Type *RetTy = I->getType();
if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- Type *VectorTy = ToVectorTy(RetTy, VF);
+ VectorTy = ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
@@ -5352,9 +5840,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
}
- case Instruction::PHI:
- //TODO: IF-converted IFs become selects.
+ case Instruction::PHI: {
+ auto *Phi = cast<PHINode>(I);
+
+ // First-order recurrences are replaced by vector shuffles inside the loop.
+ if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
+ return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ VectorTy, VF - 1, VectorTy);
+
+ // TODO: IF-converted IFs become selects.
return 0;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -5379,9 +5875,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
@@ -5432,20 +5928,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
case Instruction::Load: {
StoreInst *SI = dyn_cast<StoreInst>(I);
LoadInst *LI = dyn_cast<LoadInst>(I);
- Type *ValTy = (SI ? SI->getValueOperand()->getType() :
- LI->getType());
+ Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
- unsigned AS = SI ? SI->getPointerAddressSpace() :
- LI->getPointerAddressSpace();
+ unsigned AS =
+ SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
// We add the cost of address computation here instead of with the gep
// instruction because only here we know whether the operation is
// scalarized.
if (VF == 1)
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+ if (LI && Legal->isUniform(Ptr)) {
+ // Scalar load + broadcast
+ unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment, AS);
+ return Cost +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
+ }
// For an interleaved access, calculate the total cost of the whole
// interleave group.
@@ -5463,7 +5967,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
VectorTy->getVectorNumElements() * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it dones't allow gaps.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
SmallVector<unsigned, 4> Indices;
if (LI) {
for (unsigned i = 0; i < InterleaveFactor; i++)
@@ -5489,13 +5993,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Scalarized loads/stores.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool UseGatherOrScatter =
+ (ConsecutiveStride == 0) && isGatherOrScatterLegal(I, Ptr, Legal);
+
bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
- unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
- unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
- if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
+ uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
+ uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
+ if ((!ConsecutiveStride && !UseGatherOrScatter) ||
+ ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
- isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+ isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
// The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -5505,29 +6013,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// In case of STORE, the cost of ExtractElement from the vector.
// In case of LOAD, the cost of InsertElement into the returned
// vector.
- Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
- Instruction::InsertElement,
- VectorTy, i);
+ Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement
+ : Instruction::InsertElement,
+ VectorTy, i);
}
// The cost of the scalar loads/stores.
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
- Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
+ Cost += VF *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment, AS);
return Cost;
}
- // Wide load/stores.
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+ if (UseGatherOrScatter) {
+ assert(ConsecutiveStride == 0 &&
+ "Gather/Scatter are not used for consecutive stride");
+ return Cost +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment);
+ }
+ // Wide load/stores.
if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
- AS);
+ Cost +=
+ TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
else
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- VectorTy, 0);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
return Cost;
}
case Instruction::ZExt:
@@ -5548,7 +6063,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
-
+
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
if (VF > 1 && MinBWs.count(I)) {
@@ -5560,23 +6075,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Type *MinVecTy = VectorTy;
if (I->getOpcode() == Instruction::Trunc) {
SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
- MinVecTy);
+ VectorTy =
+ largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
} else if (I->getOpcode() == Instruction::ZExt ||
I->getOpcode() == Instruction::SExt) {
SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
- MinVecTy);
+ VectorTy =
+ smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}
}
-
+
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
- if (getIntrinsicIDForCall(CI, TLI))
+ if (getVectorIntrinsicIDForCall(CI, TLI))
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
return CallCost;
}
@@ -5587,10 +6102,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
unsigned Cost = 0;
if (!RetTy->isVoidTy() && VF != 1) {
- unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
- VectorTy);
- unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
- VectorTy);
+ unsigned InsCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
+ unsigned ExtCost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
// The cost of inserting the results plus extracting each one of the
// operands.
@@ -5602,7 +6117,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
return Cost;
}
- }// end of switch.
+ } // end of switch.
}
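
One of the new cases in the memory-cost logic above is the uniform-pointer load, which is priced as a scalar load plus a broadcast shuffle rather than as a wide load. A small, generic example of a loop that exercises that path (in practice LICM will often hoist such a load first; the example is only about the costing):

// *scale is loop-invariant, so the vectorized loop can load it once, splat it
// across a vector, and reuse the splat in every vector iteration.
void uniformLoad(float *out, const float *in, const float *scale, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = in[i] * *scale;
}
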
char LoopVectorize::ID = 0;
@@ -5616,31 +6131,101 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DemandedBits)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
- Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
- return new LoopVectorize(NoUnrolling, AlwaysVectorize);
- }
+Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
+ return new LoopVectorize(NoUnrolling, AlwaysVectorize);
+}
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check for a store.
- if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
+ if (auto *ST = dyn_cast<StoreInst>(Inst))
return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
// Check for a load.
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
return false;
}
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
+ SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+
+ // Ignore induction phis that are only used in either GetElementPtr or ICmp
+ // instruction to exit loop. Induction variables usually have large types and
+ // can have big impact when estimating register usage.
+ // This is for when VF > 1.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *PN = Induction.first;
+ auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
+
+ // Check that the PHI is only used by the induction increment (UpdateV) or
+ // by GEPs. Then check that UpdateV is only used by a compare instruction,
+ // the loop header PHI, or by GEPs.
+    // FIXME: Need precise def-use analysis to determine if this induction
+    // variable will be vectorized.
+ if (all_of(PN->users(),
+ [&](const User *U) -> bool {
+ return U == UpdateV || isa<GetElementPtrInst>(U);
+ }) &&
+ all_of(UpdateV->users(), [&](const User *U) -> bool {
+ return U == PN || isa<ICmpInst>(U) || isa<GetElementPtrInst>(U);
+ })) {
+ VecValuesToIgnore.insert(PN);
+ VecValuesToIgnore.insert(UpdateV);
+ }
+ }
+
+ // Ignore instructions that will not be vectorized.
+ // This is for when VF > 1.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ for (auto &Inst : *BB) {
+ switch (Inst.getOpcode())
+ case Instruction::GetElementPtr: {
+      // Ignore a GEP whose only loop-varying index is the induction
+      // variable; such a GEP produces a consecutive load/store address and
+      // won't be vectorized as a scatter/gather pattern.
+
+ GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
+ unsigned NumOperands = Gep->getNumOperands();
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+ bool GepToIgnore = true;
+
+ // Check that all of the gep indices are uniform except for the
+ // induction operand.
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if (i != InductionOperand &&
+ !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
+ TheLoop)) {
+ GepToIgnore = false;
+ break;
+ }
+ }
+
+ if (GepToIgnore)
+ VecValuesToIgnore.insert(&Inst);
+ break;
+ }
+ }
+ }
+}
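
The induction-variable filtering above targets loops like the following, where the induction PHI and its increment feed only address computation and the loop-exit compare. After vectorization they do not occupy vector registers, so counting them would overstate register pressure for VF > 1. A generic example:

// i is used only by the two address computations (a[i], b[i]) and by the exit
// compare, so the phi and its increment land in VecValuesToIgnore.
void scale(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = 2.0f * b[i];
}
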
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) {
@@ -5651,9 +6236,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
setDebugLocFromInst(Builder, Instr);
// Find all of the vectorized parameters.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *SrcOp = Instr->getOperand(op);
-
+ for (Value *SrcOp : Instr->operands()) {
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
@@ -5683,8 +6266,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec = IsVoidRetTy ? nullptr :
- UndefValue::get(Instr->getType());
+ Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType());
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
@@ -5711,43 +6293,43 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
}
Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instructions with extracted scalars.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *Op = Params[op][Part];
- Cloned->setOperand(op, Op);
- }
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ Cloned->setOperand(op, Op);
+ }
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
- // If the original scalar returns a value we need to place it in a vector
- // so that future users will be able to use it.
- if (!IsVoidRetTy)
- VecResults[Part] = Cloned;
+ // If we just cloned a new assumption, add it the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
- // End if-block.
- if (IfPredicateStore)
- PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
- Cmp));
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults[Part] = Cloned;
+
+ // End if-block.
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
+ auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
return scalarizeInstruction(Instr, IfPredicateStore);
}
-Value *InnerLoopUnroller::reverseVector(Value *Vec) {
- return Vec;
-}
+Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
- return V;
-}
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
@@ -5756,3 +6338,346 @@ Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
Constant *C = ConstantInt::get(ITy, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
+
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
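
A small companion sketch of the metadata this helper attaches, written with the same APIs the function itself uses (it assumes the surrounding LoopVectorize.cpp includes; hasRuntimeUnrollDisable is a hypothetical name, not part of the patch):

static bool hasRuntimeUnrollDisable(Loop *L) {
  MDNode *LoopID = L->getLoopID();
  if (!LoopID)
    return false;
  // Operand 0 is the self-reference; the disable marker is an MDNode whose
  // first operand is the MDString "llvm.loop.unroll.runtime.disable".
  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i)
    if (auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)))
      if (MD->getNumOperands() > 0)
        if (auto *S = dyn_cast<MDString>(MD->getOperand(0)))
          if (S->getString().startswith("llvm.loop.unroll.runtime.disable"))
            return true;
  return false;
}
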
+
+bool LoopVectorizePass::processLoop(Loop *L) {
+ assert(L->empty() && "Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, DisableUnrolling);
+
+ DEBUG(dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+ // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+ // less verbose reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ // Check the loop for a trip count threshold:
+ // do not vectorize loops with a tiny trip count.
+ const unsigned TC = SE->getSmallConstantTripCount(L);
+ if (TC > 0u && TC < TinyTripCountVectorThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is not worth vectorizing.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ DEBUG(dbgs() << "\n");
+ emitAnalysisDiag(F, L, Hints, VectorizationReport()
+ << "vectorization is not beneficial "
+ "and is not explicitly forced");
+ return false;
+ }
+ }
+
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements;
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI,
+ &Requirements, &Hints);
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Compute the weighted frequency of this loop being executed and see if it
+ // is less than 20% of the function entry baseline frequency. Note that we
+ // always have a canonical loop here because we think we *can* vectorize.
+ // FIXME: This is hidden behind a flag due to pervasive problems with
+ // exactly what block frequency models.
+ if (LoopVectorizeWithBlockFrequency) {
+ BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ LoopEntryFreq < ColdEntryFreq)
+ OptForSize = true;
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.
+  // FIXME: This check doesn't seem like it can be correct -- what if the loop
+  // is an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ emitAnalysisDiag(
+ F, L, Hints,
+ VectorizationReport()
+ << "loop not vectorized due to NoImplicitFloat attribute");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ // Check if the target supports potentially unsafe FP vectorization.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ emitAnalysisDiag(F, L, Hints,
+ VectorizationReport()
+ << "loop not vectorized due to unsafe FP support.");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ // Select the optimal vectorization factor.
+ const LoopVectorizationCostModel::VectorizationFactor VF =
+ CM.selectVectorizationFactor(OptForSize);
+
+ // Select the interleave count.
+ unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+
+ // Get user interleave count.
+ unsigned UserIC = Hints.getInterleave();
+
+ // Identify the diagnostic messages that should be produced.
+ std::string VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ if (VF.Width == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg =
+ "the cost-model indicates that vectorization is not beneficial";
+ VectorizeLoop = false;
+ }
+
+ if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg =
+ "the cost-model indicates that interleaving is not beneficial";
+ InterleaveLoop = false;
+ if (UserIC == 1)
+ IntDiagMsg +=
+ " and is explicitly disabled or interleave count is set to 1";
+ } else if (IC > 1 && UserIC == 1) {
+    // Tell the user interleaving is beneficial but is explicitly disabled.
+    DEBUG(dbgs()
+          << "LV: Interleaving is beneficial but is explicitly disabled.\n");
+ IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1";
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+    // Do not vectorize or interleave the loop.
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ } else if (VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop, then
+ // interleave it.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC);
+ Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
+
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ Twine("interleaved loop (interleaved count: ") +
+ Twine(IC) + ")");
+ } else {
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC);
+ LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling a scalar loop when there are
+ // no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Report the vectorization decision.
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ Twine("vectorized loop (vectorization width: ") +
+ Twine(VF.Width) + ", interleaved count: " +
+ Twine(IC) + ")");
+ }
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+}
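
The vectorize/interleave dispatch above boils down to two flags plus the user override. A compact standalone model of just that control flow (decide is a hypothetical helper; the diagnostics and the transforms themselves are omitted):

struct LVDecision {
  bool VectorizeLoop;  // run InnerLoopVectorizer with VF.Width > 1
  bool InterleaveLoop; // interleaving is worthwhile and not disabled
};

static LVDecision decide(unsigned Width, unsigned IC, unsigned UserIC) {
  LVDecision D{true, true};
  if (Width == 1)
    D.VectorizeLoop = false;  // cost model: widening is not beneficial
  if (IC == 1 && UserIC <= 1)
    D.InterleaveLoop = false; // not beneficial, or interleave count set to 1
  else if (IC > 1 && UserIC == 1)
    D.InterleaveLoop = false; // beneficial but explicitly disabled
  return D;                   // a UserIC > 0 overrides IC afterwards
}
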
+
+bool LoopVectorizePass::runImpl(
+ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_) {
+
+ SE = &SE_;
+ LI = &LI_;
+ TTI = &TTI_;
+ DT = &DT_;
+ BFI = &BFI_;
+ TLI = TLI_;
+ AA = &AA_;
+ AC = &AC_;
+ GetLAA = &GetLAA_;
+ DB = &DB_;
+
+ // Compute some weights outside of the loop over the loops. Compute this
+ // using a BranchProbability to re-use its scaling math.
+ const BranchProbability ColdProb(1, 5); // 20%
+ ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ return false;
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ addInnerLoop(*L, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ while (!Worklist.empty())
+ Changed |= processLoop(Worklist.pop_back_val());
+
+  return Changed;
+}
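
For the cold-entry heuristic set up here and consumed in processLoop above: a BranchProbability of 1/5 scales the function entry frequency to 20%, and a loop whose preheader frequency falls below that threshold is treated as cold and optimized for size unless vectorization was forced. A back-of-the-envelope version (plain integers, not the BlockFrequency API):

#include <cstdint>

// With an entry frequency of 1000 the cold threshold is 200, so a preheader
// frequency of 150 marks the loop cold while 500 does not.
static bool isColdLoop(uint64_t PreheaderFreq, uint64_t EntryFreq) {
  const uint64_t ColdEntryFreq = EntryFreq / 5; // BranchProbability(1, 5)
  return PreheaderFreq < ColdEntryFreq;
}
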
+
+PreservedAnalyses LoopVectorizePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ return LAM.getResult<LoopAccessAnalysis>(L);
+ };
+ bool Changed = runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/lib/Transforms/Vectorize/Makefile b/lib/Transforms/Vectorize/Makefile
deleted file mode 100644
index 86c36585f23f..000000000000
--- a/lib/Transforms/Vectorize/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMVectorize
-BUILD_ARCHIVE = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f69a4e52c7e1..8a3c4d14fecb 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15,21 +15,17 @@
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -44,12 +40,12 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <map>
#include <memory>
using namespace llvm;
+using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
@@ -82,11 +78,11 @@ static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
-namespace {
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
// FIXME: Set this via cl::opt to allow overriding.
-static const unsigned MinVecRegSize = 128;
-
static const unsigned RecursionMaxDepth = 12;
// Limit the number of alias checks. The limit is chosen so that
@@ -134,8 +130,8 @@ static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- if (!isa<Constant>(VL[i]))
+ for (Value *i : VL)
+ if (!isa<Constant>(i))
return false;
return true;
}
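
As an aside, the cleaned-up loop above could equally be written with the llvm::all_of range helper already used elsewhere in this patch; a sketch, not part of the change:

static bool allConstant(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) { return isa<Constant>(V); });
}
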
@@ -223,46 +219,6 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
}
}
-/// \returns \p I after propagating metadata from \p VL.
-static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
- Instruction *I0 = cast<Instruction>(VL[0]);
- SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
- I0->getAllMetadataOtherThanDebugLoc(Metadata);
-
- for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
- unsigned Kind = Metadata[i].first;
- MDNode *MD = Metadata[i].second;
-
- for (int i = 1, e = VL.size(); MD && i != e; i++) {
- Instruction *I = cast<Instruction>(VL[i]);
- MDNode *IMD = I->getMetadata(Kind);
-
- switch (Kind) {
- default:
- MD = nullptr; // Remove unknown metadata
- break;
- case LLVMContext::MD_tbaa:
- MD = MDNode::getMostGenericTBAA(MD, IMD);
- break;
- case LLVMContext::MD_alias_scope:
- MD = MDNode::getMostGenericAliasScope(MD, IMD);
- break;
- case LLVMContext::MD_noalias:
- MD = MDNode::intersect(MD, IMD);
- break;
- case LLVMContext::MD_fpmath:
- MD = MDNode::getMostGenericFPMath(MD, IMD);
- break;
- case LLVMContext::MD_nontemporal:
- MD = MDNode::intersect(MD, IMD);
- break;
- }
- }
- I->setMetadata(Kind, MD);
- }
- return I;
-}
-
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
@@ -274,36 +230,17 @@ static Type* getSameType(ArrayRef<Value *> VL) {
return Ty;
}
-/// \returns True if the ExtractElement instructions in VL can be vectorized
-/// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
- assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *VL0 = VL[0];
- ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
- Value *Vec = E0->getOperand(0);
-
- // We have to extract from the same vector type.
- unsigned NElts = Vec->getType()->getVectorNumElements();
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
- if (!CI || CI->getZExtValue())
- return false;
-
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
- if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
- return false;
+ return CI && CI->getZExtValue() == Idx;
+ } else {
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
-
- return true;
}
/// \returns True if in-tree use also needs extract. This refers to
@@ -323,7 +260,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
@@ -353,6 +290,8 @@ static bool isSimple(Instruction *I) {
return true;
}
+namespace llvm {
+namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
@@ -363,11 +302,24 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
- DominatorTree *Dt, AssumptionCache *AC)
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
- Builder(Se->getContext()) {
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
+ DL(DL), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ MinVecRegSize = MinVectorRegSizeOption;
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -399,11 +351,9 @@ public:
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
+ MinBWs.clear();
}
- /// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
-
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -412,6 +362,32 @@ public:
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V);
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+  /// \returns the maximum vector register size as set by TTI or overridden
+  /// by cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+  /// \returns the minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
+ /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
private:
struct TreeEntry;
@@ -421,6 +397,10 @@ private:
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+ bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
@@ -431,14 +411,6 @@ private:
/// vectorized, or NULL. They may happen in cycles.
Value *alreadyVectorized(ArrayRef<Value *> VL) const;
- /// \brief Take the pointer operand from the Load/Store instruction.
- /// \returns NULL if this is not a valid Load/Store instruction.
- static Value *getPointerOperand(Value *I);
-
- /// \brief Take the address space operand from the Load/Store instruction.
- /// \returns -1 if this is not a valid Load/Store instruction.
- static unsigned getAddressSpaceOperand(Value *I);
-
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty);
@@ -719,8 +691,11 @@ private:
};
#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &os,
- const BoUpSLP::ScheduleData &SD);
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
#endif
/// Contains all scheduling data for a basic block.
@@ -917,16 +892,21 @@ private:
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented.
+ MapVector<Value *, uint64_t> MinBWs;
};
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
- SD.dump(os);
- return os;
-}
-#endif
+} // end namespace llvm
+} // end namespace slpvectorizer
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
@@ -937,8 +917,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
buildTree_rec(Roots, 0);
// Collect the values that we need to extract from the tree.
- for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
- TreeEntry *Entry = &VectorizableTree[EIdx];
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
@@ -987,7 +967,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
- bool SameTy = getSameType(VL); (void)SameTy;
+ bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy;
bool isAltShuffle = false;
assert(SameTy && "Invalid types!");
@@ -1138,16 +1118,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
+ for (Value *j : VL)
+ Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
buildTree_rec(Operands, Depth + 1);
}
return;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = CanReuseExtract(VL);
+ bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
@@ -1164,11 +1145,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// loading/storing it as an i8 struct. If we vectorize loads/stores from
// such a struct we read/write packed bits disagreeing with the
// unvectorized version.
- const DataLayout &DL = F->getParent()->getDataLayout();
Type *ScalarTy = VL[0]->getType();
- if (DL.getTypeSizeInBits(ScalarTy) !=
- DL.getTypeAllocSizeInBits(ScalarTy)) {
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
@@ -1184,8 +1164,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
@@ -1227,8 +1207,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1256,8 +1236,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1298,8 +1278,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
@@ -1346,18 +1326,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::Store: {
- const DataLayout &DL = F->getParent()->getDataLayout();
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
@@ -1368,8 +1347,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(0));
buildTree_rec(Operands, Depth + 1);
return;
@@ -1379,7 +1358,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI = cast<CallInst>(VL[0]);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
@@ -1393,7 +1372,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
- getIntrinsicIDForCall(CI2, TLI) != ID) {
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
@@ -1413,14 +1393,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
}
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
+ << *VL[i] << '\n');
+ return;
+ }
}
newTreeEntry(VL, true);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j) {
- CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
+ for (Value *j : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1);
@@ -1451,8 +1442,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
@@ -1466,6 +1457,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
}
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
+      VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
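
To give a concrete feel for what canMapToVector accepts: a homogeneous aggregate whose store size matches the corresponding vector type. The snippet below is only an illustration in plain C++ (the real check works on llvm::Type):

struct Quad {        // isomorphic to <4 x float>: four identical elements,
  float X, Y, Z, W;  // 128 bits total, which meets the default 128-bit
};                   // minimum vector register size.

struct Mixed {       // rejected: the elements differ, so the aggregate is not
  float F;           // homogeneous (and its store size no longer matches
  long long L;       // N elements of the first element type).
};

static_assert(sizeof(Quad) == 4 * sizeof(float),
              "store size must equal N * element size for the mapping");
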
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ Instruction *E0 = cast<Instruction>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (Opcode == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ if (!matchExtractIndex(E0, 0, Opcode))
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ if (!matchExtractIndex(E, i, Opcode))
+ return false;
+ if (E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -1474,6 +1533,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
+ VL.size());
+
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
@@ -1489,11 +1554,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::PHI: {
return 0;
}
+ case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- if (CanReuseExtract(VL)) {
+ if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
@@ -1527,7 +1593,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::FCmp:
case Instruction::ICmp:
- case Instruction::Select:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ return VecCost - ScalarCost;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -1546,59 +1619,48 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- // Calculate the cost of this instruction.
- int ScalarCost = 0;
- int VecCost = 0;
- if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
- Opcode == Instruction::Select) {
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
- } else {
- // Certain instructions can be cheaper to vectorize if they have a
- // constant second vector operand.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_None;
-
- // If all operands are exactly the same ConstantInt then set the
- // operand kind to OK_UniformConstantValue.
- // If instead not all operands are constants, then set the operand kind
- // to OK_AnyValue. If all operands are constants but not the same,
- // then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt = nullptr;
- for (unsigned i = 0; i < VL.size(); ++i) {
- const Instruction *I = cast<Instruction>(VL[i]);
- if (!isa<ConstantInt>(I->getOperand(1))) {
- Op2VK = TargetTransformInfo::OK_AnyValue;
- break;
- }
- if (i == 0) {
- CInt = cast<ConstantInt>(I->getOperand(1));
- continue;
- }
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
- CInt != cast<ConstantInt>(I->getOperand(1)))
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = nullptr;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
}
- // FIXME: Currently cost of model modification for division by
- // power of 2 is handled only for X86. Add support for other targets.
- if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
- CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_PowerOf2;
-
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
+ // FIXME: Currently the cost model modification for division by a power of
+ // 2 is handled only for X86 and AArch64. Add support for other targets.
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+ CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
+ Op2VK, Op1VP, Op2VP);
+ int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
return VecCost - ScalarCost;
}
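// Self-contained toy model (invented names, plain integers instead of LLVM
// ConstantInt) of the second-operand scan above: any non-constant lane gives
// "any value", all-equal constants stay "uniform", and differing constants
// become "non-uniform".
enum ToyOperandKind { ToyAnyValue, ToyUniformConstant, ToyNonUniformConstant };

static ToyOperandKind classifySecondOperands(const long *const *Ops,
                                             unsigned NumLanes) {
  ToyOperandKind Kind = ToyUniformConstant;
  const long *First = nullptr;
  for (unsigned i = 0; i < NumLanes; ++i) {
    if (!Ops[i])
      return ToyAnyValue;               // this lane's operand is not a constant
    if (!First)
      First = Ops[i];                   // remember lane 0's constant
    else if (*Ops[i] != *First)
      Kind = ToyNonUniformConstant;     // constants differ across lanes
  }
  return Kind;
}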
case Instruction::GetElementPtr: {
@@ -1617,21 +1679,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
+ VecTy, alignment, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
+ VecTy, alignment, 0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type*, 4> ScalarTys, VecTys;
@@ -1641,10 +1707,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VecTy->getNumElements()));
}
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
int ScalarCallCost = VecTy->getNumElements() *
- TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1659,8 +1729,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0;
int VecCost = 0;
- for (unsigned i = 0; i < VL.size(); ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
+ for (Value *i : VL) {
+ Instruction *I = cast<Instruction>(i);
if (!I)
break;
ScalarCost +=
@@ -1715,8 +1785,8 @@ int BoUpSLP::getSpillCost() {
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
- for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
- Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
+ for (const auto &N : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
if (!Inst)
continue;
@@ -1725,6 +1795,13 @@ int BoUpSLP::getSpillCost() {
continue;
}
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
DEBUG(
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
@@ -1733,13 +1810,6 @@ int BoUpSLP::getSpillCost() {
Inst->dump();
);
- // Update LiveValues.
- LiveValues.erase(PrevInst);
- for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
- LiveValues.insert(cast<Instruction>(&*J));
- }
-
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
PrevInstIt(PrevInst->getIterator());
@@ -1763,7 +1833,6 @@ int BoUpSLP::getSpillCost() {
PrevInst = Inst;
}
- DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
return Cost;
}
@@ -1785,7 +1854,7 @@ int BoUpSLP::getTreeCost() {
for (TreeEntry &TE : VectorizableTree) {
int C = getEntryCost(&TE);
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
- << TE.Scalars[0] << " .\n");
+ << *TE.Scalars[0] << ".\n");
Cost += C;
}
@@ -1802,15 +1871,29 @@ int BoUpSLP::getTreeCost() {
if (EphValues.count(EU.User))
continue;
- VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
- ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- EU.Lane);
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ VecTy = VectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(
+ Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
}
- Cost += getSpillCost();
+ int SpillCost = getSpillCost();
+ Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
- return Cost + ExtractCost;
+ DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n");
+ return Cost;
}
int BoUpSLP::getGatherCost(Type *Ty) {
@@ -1830,63 +1913,6 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
return getGatherCost(VecTy);
}
-Value *BoUpSLP::getPointerOperand(Value *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperand();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperand();
- return nullptr;
-}
-
-unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
-}
-
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
- Value *PtrA = getPointerOperand(A);
- Value *PtrB = getPointerOperand(B);
- unsigned ASA = getAddressSpaceOperand(A);
- unsigned ASB = getAddressSpaceOperand(B);
-
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
-
- // Make sure that A and B are different pointers of the same type.
- if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
- return false;
-
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
-
- APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
- APInt OffsetDelta = OffsetB - OffsetA;
-
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == Size;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the size.
- APInt BaseDelta = Size - OffsetDelta;
-
- // Otherwise compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
- const SCEV *C = SE->getConstant(BaseDelta);
- const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
- return X == PtrSCEVB;
-}
-
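// The removed helper above is superseded by the shared llvm::isConsecutiveAccess
// utility (assumed here to be declared in llvm/Analysis/LoopAccessAnalysis.h at
// this revision). A minimal usage sketch for two stores:
#include "llvm/Analysis/LoopAccessAnalysis.h"

static bool areAdjacentStores(StoreInst *A, StoreInst *B,
                              const DataLayout &DL, ScalarEvolution &SE) {
  // True when B's address is exactly one element past A's.
  return isConsecutiveAccess(A, B, DL, SE);
}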
// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like-
@@ -1899,12 +1925,10 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
- const DataLayout &DL = F->getParent()->getDataLayout();
-
// Push left and right operands of binary operation into Left and Right
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
- Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ for (Value *i : VL) {
+ Left.push_back(cast<Instruction>(i)->getOperand(0));
+ Right.push_back(cast<Instruction>(i)->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive access
@@ -1914,10 +1938,11 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1928,10 +1953,11 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2061,8 +2087,6 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
if (SplatRight || SplatLeft)
return;
- const DataLayout &DL = F->getParent()->getDataLayout();
-
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
@@ -2081,7 +2105,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1, DL)) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2089,7 +2113,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1, DL)) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2185,7 +2209,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Gather(E->Scalars, VecTy);
}
- const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
@@ -2225,13 +2248,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (CanReuseExtract(E->Scalars)) {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
return Gather(E->Scalars, VecTy);
}
+ case Instruction::ExtractValue: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ E->VectorizedValue = V;
+ return propagateMetadata(V, E->Scalars);
+ }
+ return Gather(E->Scalars, VecTy);
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -2382,7 +2417,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment) {
- Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
@@ -2413,7 +2448,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
if (!Alignment) {
- Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
}
S->setAlignment(Alignment);
E->VectorizedValue = S;
@@ -2481,10 +2516,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Module *M = F->getParent();
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
- Value *V = Builder.CreateCall(CF, OpVecs);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
@@ -2559,15 +2596,28 @@ Value *BoUpSLP::vectorizeTree() {
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
- vectorizeTree(&VectorizableTree[0]);
+ auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0].Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+ auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0].VectorizedValue = Trunc;
+ }
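// Concrete example of the narrowing above (illustrative): with a bundle width
// of 4 and MinBWs[ScalarRoot] == 8, the <4 x i32> root is truncated to
// <4 x i8> here, and every extractelement emitted for an external user below
// is sign-extended back to i32 so out-of-tree users see the original type.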
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
// Extract all of the elements with the external uses.
- for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
- it != e; ++it) {
- Value *Scalar = it->Scalar;
- llvm::User *User = it->User;
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
@@ -2583,15 +2633,24 @@ Value *BoUpSLP::vectorizeTree() {
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
- Value *Lane = Builder.getInt32(it->Lane);
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
- if (isa<Instruction>(Vec)){
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ TerminatorInst *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
@@ -2599,12 +2658,16 @@ Value *BoUpSLP::vectorizeTree() {
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ if (MinBWs.count(ScalarRoot))
+ Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
@@ -2613,8 +2676,8 @@ Value *BoUpSLP::vectorizeTree() {
}
// For each vectorized value:
- for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
- TreeEntry *Entry = &VectorizableTree[EIdx];
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
@@ -2655,9 +2718,8 @@ void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
- e = GatherSeq.end(); it != e; ++it) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+ for (Instruction *it : GatherSeq) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
if (!Insert)
continue;
@@ -2718,12 +2780,10 @@ void BoUpSLP::optimizeGatherSequence() {
// Check if we can replace this instruction with any of the
// visited instructions.
- for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
- ve = Visited.end();
- v != ve; ++v) {
- if (In->isIdenticalTo(*v) &&
- DT->dominates((*v)->getParent(), In->getParent())) {
- In->replaceAllUsesWith(*v);
+ for (Instruction *v : Visited) {
+ if (In->isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(v);
eraseInstruction(In);
In = nullptr;
break;
@@ -3139,90 +3199,265 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- typedef SmallVector<StoreInst *, 8> StoreList;
- typedef MapVector<Value *, StoreList> StoreListMap;
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+ // If V is a store, just return the width of the stored value without
+ // traversing the expression tree. This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+ Visited.insert(I);
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (!Visited.count(J))
+ Worklist.push_back(J);
+ }
- /// Pass identification, replacement for typeid
- static char ID;
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V.
+ if (!MaxWidth || FoundUnknownInst)
+ return DL->getTypeSizeInBits(V->getType());
+
+ // Otherwise, return the maximum width we found.
+ return MaxWidth;
+}
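// Example of the width search above (illustrative):
//   %a = load i16, i16* %p
//   %b = sext i16 %a to i32
//   %c = add i32 %b, %d
// getVectorElementSize(%c) walks back through the add and the sext, finds the
// i16 load, and returns 16 rather than the 32 bits of %c's own type.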
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
}
- ScalarEvolution *SE;
- TargetTransformInfo *TTI;
- TargetLibraryInfo *TLI;
- AliasAnalysis *AA;
- LoopInfo *LI;
- DominatorTree *DT;
- AssumptionCache *AC;
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
- bool runOnFunction(Function &F) override {
- if (skipOptnoneFunction(F))
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
+ break;
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-
- StoreRefs.clear();
- bool Changed = false;
-
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(true))
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
+ break;
+ }
- // Use the vector register size specified by the target unless overridden
- // by a command-line option.
- // TODO: It would be better to limit the vectorization factor based on
- // data type rather than just register size. For example, x86 AVX has
- // 256-bit registers, but it does not support integer operations
- // at that width (that requires AVX2).
- if (MaxVectorRegSizeOption.getNumOccurrences())
- MaxVecRegSize = MaxVectorRegSizeOption;
- else
- MaxVecRegSize = TTI->getRegisterBitWidth(true);
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
- DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
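// Example of the collection above (illustrative): for a root
//   %r = add i32 %x, %y   ; %x = sext i8 %a to i32, %y = sext i8 %b to i32
// the Add recurses into both operands, each SExt is accepted directly, and
// ToDemote ends up as {%x, %y, %r}; a Trunc operand would instead be pushed
// onto Roots to seed a further round of demotion.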
- // Use the bottom up slp vectorizer to construct chains that start with
- // store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
- // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
- // delete instructions.
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0].Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
- // Scan the blocks in the function in post order.
- for (auto BB : post_order(&F.getEntryBlock())) {
- // Vectorize trees that end at stores.
- if (unsigned count = collectStores(BB, R)) {
- (void)count;
- DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
- Changed |= vectorizeStoreChains(R);
- }
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
- // Vectorize trees that end at reductions.
- Changed |= vectorizeChainsInBlock(BB, R);
- }
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &Entry : VectorizableTree)
+ Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
- if (Changed) {
- R.optimizeGatherSequence();
- DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- DEBUG(verifyFunction(F));
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ MaxBitWidth = 8u;
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
- return Changed;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = MaxBitWidth;
+}
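// Worked example (illustrative): if the root is an i32 expression whose
// demanded bits show only the low 13 bits are used, MaxBitWidth starts at
// max(8, 13) = 13, is rounded up to 16, and since 16 < 32 every value in
// ToDemote is recorded in MinBWs with a width of 16; getEntryCost and
// vectorizeTree then cost and emit the bundle as <N x i16> instead of
// <N x i32>.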
+
+namespace {
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3233,51 +3468,105 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
+};
+} // end anonymous namespace
-private:
+PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
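// Minimal sketch (assumed new-pass-manager boilerplate, not part of this
// patch) of driving the run() entry point above; PassBuilder is used to
// register the analyses the pass requests from the manager:
#include "llvm/Passes/PassBuilder.h"

static void runSLPOn(Function &F) {
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);   // ScalarEvolution, AA, DemandedBits, ...
  SLPVectorizerPass SLP;
  PreservedAnalyses PA = SLP.run(F, FAM);
  (void)PA;
}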
- /// \brief Collect memory references and sort them according to their base
- /// object. We sort the stores to their base objects to reduce the cost of the
- /// quadratic search on the stores. TODO: We can further reduce this cost
- /// if we flush the chain creation every time we run into a memory barrier.
- unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
+ TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_) {
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
- /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
- bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
- /// \brief Try to vectorize a list of operands.
- /// \@param BuildVector A list of users to ignore for the purpose of
- /// scheduling and that don't need extracting.
- /// \returns true if a value was vectorized.
- bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector = None,
- bool allowReorder = false);
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
- /// \brief Try to vectorize a chain that may start at the operands of \V;
- bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+ DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
- /// \brief Vectorize the stores that were collected in StoreRefs.
- bool vectorizeStoreChains(BoUpSLP &R);
+ // Use the bottom up slp vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
- /// \brief Scan the basic block and look for patterns that are likely to start
- /// a vectorization chain.
- bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
- bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R, unsigned VecRegSize);
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
- bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
- BoUpSLP &R);
-private:
- StoreListMap StoreRefs;
- unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
-};
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ DEBUG(verifyFunction(F));
+ }
+ return Changed;
+}
/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakVH array.
@@ -3290,15 +3579,13 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
-bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R,
- unsigned VecRegSize) {
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
- Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3322,6 +3609,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
+ R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
@@ -3339,8 +3627,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
return Changed;
}
-bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
- int costThreshold, BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ int costThreshold, BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
@@ -3353,7 +3641,6 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
IndexQueue.clear();
// If a store has multiple consecutive store candidates, search Stores
// array according to the sequence: from i+1 to e, then from i-1 to 0.
@@ -3366,7 +3653,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
IndexQueue.push_back(j - 1);
for (auto &k : IndexQueue) {
- if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[k];
@@ -3396,7 +3683,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
@@ -3409,45 +3696,53 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
return Changed;
}
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
- unsigned count = 0;
- StoreRefs.clear();
- const DataLayout &DL = BB->getModule()->getDataLayout();
- for (Instruction &I : *BB) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- continue;
-
- // Don't touch volatile stores.
- if (!SI->isSimple())
- continue;
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
- // Check that the pointer points to scalars.
- Type *Ty = SI->getValueOperand()->getType();
- if (!isValidElementType(Ty))
- continue;
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
- // Find the base pointer.
- Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
+ // Ignore store instructions that are volatile or have a pointer operand
+ // that doesn't point to a scalar type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ }
- // Save the store locations.
- StoreRefs[Ptr].push_back(SI);
- count++;
+ // Ignore getelementptr instructions that have more than one index, a
+ // constant index, an index whose type is not a valid vector element type,
+ // or that produce a vector of pointers.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
+ }
}
- return count;
}
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
return tryToVectorizeList(VL, R, None, true);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool allowReorder) {
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector,
+ bool allowReorder) {
if (VL.size() < 2)
return false;
@@ -3459,13 +3754,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return false;
unsigned Opcode0 = I0->getOpcode();
- const DataLayout &DL = I0->getModule()->getDataLayout();
- Type *Ty0 = I0->getType();
- unsigned Sz = DL.getTypeSizeInBits(Ty0);
// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.
- unsigned VF = MinVecRegSize / Sz;
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned VF = R.getMinVecRegSize() / Sz;
for (Value *V : VL) {
Type *Ty = V->getType();
@@ -3513,6 +3806,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Value *ReorderedOps[] = { Ops[1], Ops[0] };
R.buildTree(ReorderedOps, None);
}
+ R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
@@ -3529,15 +3823,16 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
- IRBuilder<true, NoFolder> Builder(
- InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
- InsertElementInst *IE = cast<InsertElementInst>(V);
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
- IE->setOperand(1, Extract);
- IE->removeFromParent();
- IE->insertAfter(Extract);
- InsertAfter = IE;
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
}
}
// Move to the next bundle.
@@ -3549,7 +3844,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return Changed;
}
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
@@ -3662,9 +3957,14 @@ public:
/// The width of one full horizontal reduction operation.
unsigned ReduxWidth;
- HorizontalReduction()
- : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
+ /// Minimal width of available vector registers. It's used to determine
+ /// ReduxWidth.
+ unsigned MinVecRegSize;
+
+ HorizontalReduction(unsigned MinVecRegSize)
+ : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
+ ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
+ MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3779,6 +4079,7 @@ public:
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+ V.computeMinimumValueSizes();
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -3928,6 +4229,25 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
return false;
}
+/// \brief Like findBuildVector, but looks backwards for the construction of
+/// an aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!IV->hasOneUse())
+ return false;
+ Value *V = IV->getAggregateOperand();
+ if (!isa<UndefValue>(V)) {
+ InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+ if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ return false;
+ }
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ return true;
+}
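// Pattern matched by findBuildAggregate (illustrative): an insertvalue chain
// that builds an aggregate lane by lane before being stored, e.g.
//   %i0 = insertvalue [4 x float] undef, float %a, 0
//   %i1 = insertvalue [4 x float] %i0,   float %b, 1
//   %i2 = insertvalue [4 x float] %i1,   float %c, 2
//   %i3 = insertvalue [4 x float] %i2,   float %d, 3
// BuildVector collects %i0..%i3 and BuildVectorOpds collects %a..%d, which are
// then handed to tryToVectorizeList from the store-feeding case below.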
+
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
@@ -3991,11 +4311,12 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI) {
+ BoUpSLP &R, TargetTransformInfo *TTI,
+ unsigned MinRegSize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx;
+ HorizontalReduction HorRdx(MinRegSize);
if (!HorRdx.matchAssociativeReduction(P, BI))
return false;
@@ -4008,7 +4329,7 @@ static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
return HorRdx.tryToReduce(R, TTI);
}
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallSet<Value *, 16> VisitedInstrs;
@@ -4083,7 +4404,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -4110,7 +4431,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (StoreInst *SI = dyn_cast<StoreInst>(it))
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
tryToVectorize(BinOp, R)) {
Changed = true;
it = BB->begin();
@@ -4178,16 +4500,121 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
+
+ // Try to vectorize trees that start at insertvalue instructions feeding into
+ // a store.
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+ continue;
+
+ DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+ }
+ }
}
return Changed;
}
-bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // We process the getelementptr list in chunks of 16 (like we do for
+ // stores) to minimize compile-time.
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
+ auto Len = std::min<unsigned>(BE - BI, 16);
+ auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+
+ // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, the WeakVHs will have nullified the
+ // values, so remove them from the set of candidates.
+ Candidates.remove(nullptr);
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPList[I]);
+ Candidates.remove(GEPList[J]);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPList[J]);
+ }
+ }
+ }
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+ assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+ // Try and vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
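// The gather-like idiom targeted above, written as the C source it typically
// comes from (illustrative):
static int gather4(const int *g, const int *a, const int *b) {
  // The a[i]/b[i] loads and the four subtractions vectorize even though the
  // g[...] loads themselves are non-consecutive.
  return g[a[0] - b[0]] + g[a[1] - b[1]] +
         g[a[2] - b[2]] + g[a[3] - b[3]];
}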
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
- for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
- it != e; ++it) {
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
if (it->second.size() < 2)
continue;
@@ -4207,8 +4634,6 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
return Changed;
}
-} // end anonymous namespace
-
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
@@ -4217,6 +4642,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 6e002fd5d5db..28e0b2eb9866 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -29,6 +29,7 @@ void llvm::initializeVectorization(PassRegistry &Registry) {
initializeBBVectorizePass(Registry);
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
+ initializeLoadStoreVectorizerPass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {